merge.

checkpoint. does not compile.
author: Sergei Golubchik <sergii@pisem.net> 2010-11-25 18:17:28 +0100
committer: Sergei Golubchik <sergii@pisem.net> 2010-11-25 18:17:28 +0100
commit: 65ca700def99289cc31a7040537f5aa6e12bf485 (patch)
tree: 97b3a07299b626c519da0e80c122b5b79b933914 /storage
parent: 2ab57de38d13d927ddff2d51aed4af34e13998f5 (diff)
parent: 6e5bcca7935d3c62f84bb640e5357664a210ee12 (diff)
download: mariadb-git-65ca700def99289cc31a7040537f5aa6e12bf485.tar.gz
783 files changed, 431216 insertions, 1256 deletions
diff --git a/storage/archive/Makefile.am b/storage/archive/Makefile.am
index 8b08105cef3..14fcde4108b 100644
--- a/storage/archive/Makefile.am
+++ b/storage/archive/Makefile.am
@@ -37,8 +37,8 @@ noinst_PROGRAMS	=	archive_test archive_reader
 EXTRA_LTLIBRARIES =	ha_archive.la
 pkgplugin_LTLIBRARIES =	@plugin_archive_shared_target@
 ha_archive_la_LDFLAGS =	-module -rpath $(pkgplugindir)
-ha_archive_la_CXXFLAGS=	$(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
-ha_archive_la_CFLAGS =	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_archive_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_archive_la_CFLAGS =	-shared $(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
 ha_archive_la_SOURCES =	ha_archive.cc azio.c
 
 
diff --git a/storage/archive/azio.c b/storage/archive/azio.c
index 1e2753027dc..936a179b5b4 100644
--- a/storage/archive/azio.c
+++ b/storage/archive/azio.c
@@ -16,6 +16,8 @@
 #include <stdio.h>
 #include <string.h>
 
+#include "my_sys.h"
+
 static int const gz_magic[2] = {0x1f, 0x8b}; /* gzip magic header */
 static int const az_magic[3] = {0xfe, 0x03, 0x01}; /* az magic header */
 
@@ -52,8 +54,8 @@ int az_open (azio_stream *s, const char *path, int Flags, File fd)
   int level = Z_DEFAULT_COMPRESSION; /* compression level */
   int strategy = Z_DEFAULT_STRATEGY; /* compression strategy */
 
-  s->stream.zalloc = (alloc_func)0;
-  s->stream.zfree = (free_func)0;
+  s->stream.zalloc = my_az_allocator;
+  s->stream.zfree = my_az_free;
   s->stream.opaque = (voidpf)0;
   memset(s->inbuf, 0, AZ_BUFSIZE_READ);
   memset(s->outbuf, 0, AZ_BUFSIZE_WRITE);
@@ -148,6 +150,17 @@ int az_open (azio_stream *s, const char *path, int Flags, File fd)
   }
   else
   {
+    /* Reset values in case of old version of archive file */
+    s->rows= 0;
+    s->forced_flushes= 0;
+    s->shortest_row= 0;
+    s->longest_row= 0;
+    s->auto_increment= 0;
+    s->check_point= 0;
+    s->comment_start_pos= 0;
+    s->comment_length= 0;
+    s->frm_start_pos= 0;
+    s->frm_length= 0;
     check_header(s); /* skip the .az header */
   }
 
diff --git a/storage/archive/ha_archive.cc b/storage/archive/ha_archive.cc
index ef907b035b5..8144ea5f319 100644
--- a/storage/archive/ha_archive.cc
+++ b/storage/archive/ha_archive.cc
@@ -322,7 +322,7 @@ int ha_archive::read_data_header(azio_stream *file_to_read)
   DBUG_PRINT("ha_archive", ("Version %u", data_buffer[1]));
 
   if ((data_buffer[0] != (uchar)ARCHIVE_CHECK_HEADER) &&  
-      (data_buffer[1] != (uchar)ARCHIVE_VERSION))
+      (data_buffer[1] == 1 || data_buffer[1] == 2))
     DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
 
   DBUG_RETURN(0);
@@ -390,9 +390,19 @@ ARCHIVE_SHARE *ha_archive::get_share(const char *table_name, int *rc)
       my_free(share);
       DBUG_RETURN(NULL);
     }
-    stats.auto_increment_value= archive_tmp.auto_increment + 1;
-    share->rows_recorded= (ha_rows)archive_tmp.rows;
-    share->crashed= archive_tmp.dirty;
+    share->version= archive_tmp.version;
+    if (archive_tmp.version == ARCHIVE_VERSION)
+    {
+      stats.auto_increment_value= archive_tmp.auto_increment + 1;
+      share->rows_recorded= (ha_rows)archive_tmp.rows;
+      share->crashed= archive_tmp.dirty;
+    }
+    else
+    {
+      /* Used by repair */
+      share->rows_recorded= ~(ha_rows) 0;
+      stats.auto_increment_value= 0;
+    }
     /*
       If archive version is less than 3, It should be upgraded before
       use.
@@ -542,10 +552,19 @@ int ha_archive::open(const char *name, int mode, uint open_options)
   case 0:
     break;
   case HA_ERR_CRASHED_ON_USAGE:
+    DBUG_PRINT("ha_archive", ("archive table was crashed"));
     if (open_options & HA_OPEN_FOR_REPAIR)
+    {
+      rc= 0;
       break;
+    }
     /* fall through */
   case HA_ERR_TABLE_NEEDS_UPGRADE:
+    if (open_options & HA_OPEN_FOR_REPAIR)
+    {
+      rc= 0;
+      break;
+    }
     free_share();
     /* fall through */
   default:
@@ -565,13 +584,6 @@ int ha_archive::open(const char *name, int mode, uint open_options)
 
   thr_lock_data_init(&share->lock, &lock, NULL);
 
-  DBUG_PRINT("ha_archive", ("archive table was crashed %s", 
-                      rc == HA_ERR_CRASHED_ON_USAGE ? "yes" : "no"));
-  if (rc == HA_ERR_CRASHED_ON_USAGE && open_options & HA_OPEN_FOR_REPAIR)
-  {
-    DBUG_RETURN(0);
-  }
-
   DBUG_RETURN(rc);
 }
 
@@ -1350,6 +1362,14 @@ end:
   DBUG_RETURN(rc);
 }
 
+int ha_archive::check_for_upgrade(HA_CHECK_OPT *check_opt)
+{
+  if (share->version < ARCHIVE_VERSION)
+    return HA_ADMIN_NEEDS_ALTER;
+  return 0;
+}
+
+
 /*
   This method repairs the meta file. It does this by walking the datafile and 
   rewriting the meta file. If EXTENDED repair is requested, we attempt to
@@ -1764,4 +1784,21 @@ mysql_declare_plugin(archive)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(archive)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &archive_storage_engine,
+  "ARCHIVE",
+  "Brian Aker, MySQL AB",
+  "Archive storage engine",
+  PLUGIN_LICENSE_GPL,
+  archive_db_init, /* Plugin Init */
+  archive_db_done, /* Plugin Deinit */
+  0x0300 /* 3.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
 
diff --git a/storage/archive/ha_archive.h b/storage/archive/ha_archive.h
index b258b403c3c..712c1e1358d 100644
--- a/storage/archive/ha_archive.h
+++ b/storage/archive/ha_archive.h
@@ -35,7 +35,7 @@ typedef struct st_archive_record_buffer {
 typedef struct st_archive_share {
   char *table_name;
   char data_file_name[FN_REFLEN];
-  uint table_name_length,use_count;
+  uint table_name_length,use_count, version;
   mysql_mutex_t mutex;
   THR_LOCK lock;
   azio_stream archive_write;     /* Archive file we are working with */
@@ -134,6 +134,7 @@ public:
   int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
   int optimize(THD* thd, HA_CHECK_OPT* check_opt);
   int repair(THD* thd, HA_CHECK_OPT* check_opt);
+  int check_for_upgrade(HA_CHECK_OPT *check_opt);
   void start_bulk_insert(ha_rows rows);
   int end_bulk_insert();
   enum row_type get_row_type() const 
diff --git a/storage/blackhole/Makefile.am b/storage/blackhole/Makefile.am
index 38c2f354844..7016f265365 100644
--- a/storage/blackhole/Makefile.am
+++ b/storage/blackhole/Makefile.am
@@ -35,15 +35,13 @@ noinst_HEADERS =	ha_blackhole.h
 EXTRA_LTLIBRARIES =	ha_blackhole.la
 pkgplugin_LTLIBRARIES =	@plugin_blackhole_shared_target@
 ha_blackhole_la_LDFLAGS=-module -rpath $(pkgplugindir)
-ha_blackhole_la_CXXFLAGS=$(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
-ha_blackhole_la_CFLAGS=	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_blackhole_la_CXXFLAGS=-shared $(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
 ha_blackhole_la_SOURCES=ha_blackhole.cc
 
 
 EXTRA_LIBRARIES =	libblackhole.a
 noinst_LIBRARIES =	@plugin_blackhole_static_target@
 libblackhole_a_CXXFLAGS=$(AM_CXXFLAGS)
-libblackhole_a_CFLAGS =	$(AM_CFLAGS)
 libblackhole_a_SOURCES=	ha_blackhole.cc
 
 
diff --git a/storage/blackhole/ha_blackhole.cc b/storage/blackhole/ha_blackhole.cc
index 6591c3a2c78..993ba9c7cd4 100644
--- a/storage/blackhole/ha_blackhole.cc
+++ b/storage/blackhole/ha_blackhole.cc
@@ -441,3 +441,20 @@ mysql_declare_plugin(blackhole)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(blackhole)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &blackhole_storage_engine,
+  "BLACKHOLE",
+  "MySQL AB",
+  "/dev/null storage engine (anything you write to it disappears)",
+  PLUGIN_LICENSE_GPL,
+  blackhole_init, /* Plugin Init */
+  blackhole_fini, /* Plugin Deinit */
+  0x0100 /* 1.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/csv/Makefile.am b/storage/csv/Makefile.am
index 5e3587c893f..76e683ec282 100644
--- a/storage/csv/Makefile.am
+++ b/storage/csv/Makefile.am
@@ -32,7 +32,7 @@ noinst_HEADERS	  =	ha_tina.h transparent_file.h
 EXTRA_LTLIBRARIES =	ha_csv.la
 pkglib_LTLIBRARIES =	@plugin_csv_shared_target@
 ha_csv_la_LDFLAGS =	-module -rpath $(MYSQLLIBdir)
-ha_csv_la_CXXFLAGS =	$(AM_CXXFLAGS) -DMYSQL_PLUGIN
+ha_csv_la_CXXFLAGS =	-shared $(AM_CXXFLAGS) -DMYSQL_PLUGIN
 ha_csv_la_SOURCES =	transparent_file.cc ha_tina.cc 
 
 EXTRA_LIBRARIES =	libcsv.a
diff --git a/storage/csv/ha_tina.cc b/storage/csv/ha_tina.cc
index e8012a86ae6..d53f33945fa 100644
--- a/storage/csv/ha_tina.cc
+++ b/storage/csv/ha_tina.cc
@@ -491,7 +491,7 @@ ha_tina::ha_tina(handlerton *hton, TABLE_SHARE *table_arg)
   */
   current_position(0), next_position(0), local_saved_data_file_length(0),
   file_buff(0), chain_alloced(0), chain_size(DEFAULT_CHAIN_LENGTH),
-  local_data_file_version(0), records_is_known(0)
+  local_data_file_version(0), records_is_known(0), curr_lock_type(F_UNLCK)
 {
   /* Set our original buffers from pre-allocated memory */
   buffer.set((char*)byte_buffer, IO_SIZE, &my_charset_bin);
@@ -540,6 +540,13 @@ int ha_tina::encode_quote(uchar *buf)
       ptr= attribute.ptr();
       end_ptr= attribute.length() + ptr;
 
+      /*
+        Ensure that buffer is big enough. This will also speed things up
+        as we don't have to do any new allocation in the loop below
+      */
+      if (buffer.realloc(buffer.length() + attribute.length()*2+2))
+        return 0;                              // Failure
+
       buffer.append('"');
 
       for (; ptr < end_ptr; ptr++)
@@ -845,7 +852,7 @@ const char **ha_tina::bas_ext() const
   for CSV engine. For more details see mysys/thr_lock.c
 */
 
-void tina_get_status(void* param, int concurrent_insert)
+void tina_get_status(void* param, my_bool concurrent_insert)
 {
   ha_tina *tina= (ha_tina*) param;
   tina->get_status();
@@ -1637,6 +1644,14 @@ int ha_tina::delete_all_rows()
   DBUG_RETURN(rc);
 }
 
+int ha_tina::external_lock(THD *thd __attribute__((unused)), int lock_type)
+{
+  if (lock_type==F_UNLCK && curr_lock_type == F_WRLCK)
+    update_status();
+  curr_lock_type= lock_type;
+  return 0;
+}
+
 /*
   Called by the database to lock the table. Keep in mind that this
   is an internal lock.
@@ -1651,7 +1666,7 @@ THR_LOCK_DATA **ha_tina::store_lock(THD *thd,
   return to;
 }
 
-/* 
+/*
   Create a table. You do not want to leave the table open after a call to
   this (the database will call ::open() if it needs to).
 */
@@ -1769,4 +1784,20 @@ mysql_declare_plugin(csv)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
-
+maria_declare_plugin(csv)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &csv_storage_engine,
+  "CSV",
+  "Brian Aker, MySQL AB",
+  "CSV storage engine",
+  PLUGIN_LICENSE_GPL,
+  tina_init_func, /* Plugin Init */
+  tina_done_func, /* Plugin Deinit */
+  0x0100 /* 1.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/csv/ha_tina.h b/storage/csv/ha_tina.h
index 845b50e3869..dc2fc743117 100644
--- a/storage/csv/ha_tina.h
+++ b/storage/csv/ha_tina.h
@@ -85,6 +85,8 @@ class ha_tina: public handler
   MEM_ROOT blobroot;
 
 private:
+  int curr_lock_type;
+
   bool get_write_pos(my_off_t *end_pos, tina_set *closest_hole);
   int open_update_temp_file_if_needed();
   int init_tina_writer();
@@ -156,6 +158,8 @@ public:
   bool check_if_incompatible_data(HA_CREATE_INFO *info,
                                   uint table_changes);
 
+  int external_lock(THD *thd, int lock_type);
+
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
       enum thr_lock_type lock_type);
 
diff --git a/storage/example/Makefile.am b/storage/example/Makefile.am
index c79fbe97cc3..a8b0506e8dd 100644
--- a/storage/example/Makefile.am
+++ b/storage/example/Makefile.am
@@ -35,15 +35,12 @@ noinst_HEADERS =	ha_example.h
 EXTRA_LTLIBRARIES =	ha_example.la
 pkgplugin_LTLIBRARIES =	@plugin_example_shared_target@
 ha_example_la_LDFLAGS =	-module -rpath $(pkgplugindir) -L$(top_builddir)/libservices -lmysqlservices
-ha_example_la_CXXFLAGS=	$(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
-ha_example_la_CFLAGS =	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_example_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
 ha_example_la_SOURCES =	ha_example.cc
 
-
 EXTRA_LIBRARIES =	libexample.a
 noinst_LIBRARIES =	@plugin_example_static_target@
 libexample_a_CXXFLAGS =	$(AM_CXXFLAGS)
-libexample_a_CFLAGS =	$(AM_CFLAGS)
 libexample_a_SOURCES=	ha_example.cc
 
 
diff --git a/storage/example/ha_example.cc b/storage/example/ha_example.cc
index 306f8eaeccd..2a076f0dc14 100644
--- a/storage/example/ha_example.cc
+++ b/storage/example/ha_example.cc
@@ -114,6 +114,76 @@ static HASH example_open_tables;
 /* The mutex used to init the hash; variable for example share methods */
 mysql_mutex_t example_mutex;
 
+
+/**
+  structure for CREATE TABLE options (table options)
+
+  These can be specified in the CREATE TABLE:
+  CREATE TABLE ( ... ) {...here...}
+*/
+
+struct example_table_options_struct
+{
+  const char *strparam;
+  ulonglong ullparam;
+  uint enumparam;
+  bool boolparam;
+};
+
+
+/**
+  structure for CREATE TABLE options (field options)
+
+  These can be specified in the CREATE TABLE per field:
+  CREATE TABLE ( field ... {...here...}, ... )
+*/
+
+struct example_field_options_struct
+{
+  const char *compex_param_to_parse_it_in_engine;
+};
+
+/* HA_TOPTION_* macros expect the structure called ha_table_option_struct */
+#define ha_table_option_struct example_table_options_struct
+ha_create_table_option example_table_option_list[]=
+{
+  /*
+    one numeric option, with the default of UINT_MAX32, valid
+    range of values 0..UINT_MAX32, and a "block size" of 10
+    (any value must be divisible by 10).
+  */
+  HA_TOPTION_NUMBER("ULL", ullparam, UINT_MAX32, 0, UINT_MAX32, 10),
+  /*
+    one option that takes an arbitrary string
+  */
+  HA_TOPTION_STRING("STR", strparam),
+  /*
+    one enum option. a valid values are strings ONE and TWO.
+    A default value is 0, that is "one".
+  */
+  HA_TOPTION_ENUM("one_or_two", enumparam, "one,two", 0),
+  /*
+    one boolean option, the valid values are YES/NO, ON/OFF, 1/0.
+    The default is 1, that is true, yes, on.
+  */
+  HA_TOPTION_BOOL("YESNO", boolparam, 1),
+  HA_TOPTION_END
+};
+
+/* HA_FOPTION_* macros expect the structure called ha_field_option_struct */
+#define ha_field_option_struct example_field_options_struct
+ha_create_table_option example_field_option_list[]=
+{
+  /*
+    If the engine wants something more complex than a string, number, enum,
+    or boolean - for example a list - it needs to specify the option
+    as a string and parse it internally.
+  */
+  HA_FOPTION_STRING("COMPLEX", compex_param_to_parse_it_in_engine),
+  HA_FOPTION_END
+};
+
+
 /**
   @brief
   Function we use in the creation of our hash to get key.
@@ -165,6 +235,8 @@ static int example_init_func(void *p)
   example_hton->state=   SHOW_OPTION_YES;
   example_hton->create=  example_create_handler;
   example_hton->flags=   HTON_CAN_RECREATE;
+  example_hton->table_options= example_table_option_list;
+  example_hton->field_options= example_field_option_list;
 
   DBUG_RETURN(0);
 }
@@ -323,6 +395,17 @@ int ha_example::open(const char *name, int mode, uint test_if_locked)
     DBUG_RETURN(1);
   thr_lock_data_init(&share->lock,&lock,NULL);
 
+#ifndef DBUG_OFF
+  example_table_options_struct *options=
+    (example_table_options_struct *)table->s->option_struct;
+
+  DBUG_ASSERT(options);
+  DBUG_PRINT("info", ("strparam: '%-.64s'  ullparam: %llu  enumparam: %u  "\
+                      "boolparam: %u",
+                      (options->strparam ? options->strparam : "<NULL>"),
+                      options->ullparam, options->enumparam, options->boolparam));
+#endif
+
   DBUG_RETURN(0);
 }
 
@@ -561,7 +644,7 @@ int ha_example::index_last(uchar *buf)
 int ha_example::rnd_init(bool scan)
 {
   DBUG_ENTER("ha_example::rnd_init");
-  DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+  DBUG_RETURN(0);
 }
 
 int ha_example::rnd_end()
@@ -838,27 +921,6 @@ int ha_example::delete_table(const char *name)
 
 /**
   @brief
-  Renames a table from one name to another via an alter table call.
-
-  @details
-  If you do not implement this, the default rename_table() is called from
-  handler.cc and it will delete all files with the file extensions returned
-  by bas_ext().
-
-  Called from sql_table.cc by mysql_rename_table().
-
-  @see
-  mysql_rename_table() in sql_table.cc
-*/
-int ha_example::rename_table(const char * from, const char * to)
-{
-  DBUG_ENTER("ha_example::rename_table ");
-  DBUG_RETURN(HA_ERR_WRONG_COMMAND);
-}
-
-
-/**
-  @brief
   Given a starting key and an ending key, estimate the number of rows that
   will exist between the two keys.
 
@@ -900,15 +962,105 @@ ha_rows ha_example::records_in_range(uint inx, key_range *min_key,
 int ha_example::create(const char *name, TABLE *table_arg,
                        HA_CREATE_INFO *create_info)
 {
+#ifndef DBUG_OFF
+  example_table_options_struct *options=
+    (example_table_options_struct *)table_arg->s->option_struct;
   DBUG_ENTER("ha_example::create");
   /*
-    This is not implemented but we want someone to be able to see that it
-    works.
+    This example shows how to support custom engine specific table and field
+    options.
   */
+  DBUG_ASSERT(options);
+  DBUG_PRINT("info", ("strparam: '%-.64s'  ullparam: %llu  enumparam: %u  "\
+                      "boolparam: %u",
+                      (options->strparam ? options->strparam : "<NULL>"),
+                      options->ullparam, options->enumparam, options->boolparam));
+  for (Field **field= table_arg->s->field; *field; field++)
+  {
+    example_field_options_struct *field_options=
+      (example_field_options_struct *)(*field)->option_struct;
+    DBUG_ASSERT(field_options);
+    DBUG_PRINT("info", ("field: %s  complex: '%-.64s'",
+                         (*field)->field_name,
+                         (field_options->compex_param_to_parse_it_in_engine ?
+                          field_options->compex_param_to_parse_it_in_engine :
+                          "<NULL>")));
+  }
+#endif
   DBUG_RETURN(0);
 }
 
 
+/**
+  check_if_incompatible_data() is called when ALTER TABLE cannot otherwise
+  detect whether the new and old definitions are compatible.
+
+  @details If there are no other explicit signs, like a changed number of
+  fields, this function will be called by compare_tables()
+  (sql/sql_table.cc) to decide whether to rewrite the whole table or only
+  the .frm file.
+
+*/
+
+bool ha_example::check_if_incompatible_data(HA_CREATE_INFO *info,
+                                            uint table_changes)
+{
+  example_table_options_struct *param_old, *param_new;
+  uint i;
+  DBUG_ENTER("ha_example::check_if_incompatible_data");
+  /*
+    This example shows how custom engine specific table and field
+    options can be accessed from this function to be compared.
+  */
+  param_new= (example_table_options_struct *)info->option_struct;
+  DBUG_PRINT("info", ("new strparam: '%-.64s'  ullparam: %llu  enumparam: %u  "
+                      "boolparam: %u",
+                      (param_new->strparam ? param_new->strparam : "<NULL>"),
+                      param_new->ullparam, param_new->enumparam,
+                      param_new->boolparam));
+
+  param_old= (example_table_options_struct *)table->s->option_struct;
+  DBUG_PRINT("info", ("old strparam: '%-.64s'  ullparam: %llu  enumparam: %u  "
+                      "boolparam: %u",
+                      (param_old->strparam ? param_old->strparam : "<NULL>"),
+                      param_old->ullparam, param_old->enumparam,
+                      param_old->boolparam));
+
+  /*
+    check important parameters:
+    for this example engine, we'll assume that changing ullparam or
+    boolparam requires a table to be rebuilt, while changing strparam
+    or enumparam - does not.
+  */
+  if (param_new->ullparam != param_old->ullparam ||
+      param_new->boolparam != param_old->boolparam)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+  for (i= 0; i < table->s->fields; i++)
+  {
+    example_field_options_struct *f_old, *f_new;
+    f_old= (example_field_options_struct *)table->s->field[i]->option_struct;
+    DBUG_ASSERT(f_old);
+    DBUG_PRINT("info", ("old field: %u old complex: '%-.64s'", i,
+                         (f_old->compex_param_to_parse_it_in_engine ?
+                          f_old->compex_param_to_parse_it_in_engine :
+                          "<NULL>")));
+    if (info->fileds_option_struct[i])
+    {
+      f_new= (example_field_options_struct *)info->fileds_option_struct[i];
+      DBUG_PRINT("info", ("old field: %u  new complex: '%-.64s'", i,
+                          (f_new->compex_param_to_parse_it_in_engine ?
+                           f_new->compex_param_to_parse_it_in_engine :
+                           "<NULL>")));
+    }
+    else
+      DBUG_PRINT("info", ("old field %i did not changed", i));
+  }
+
+  DBUG_RETURN(COMPATIBLE_DATA_YES);
+}
+
+
 struct st_mysql_storage_engine example_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
@@ -988,3 +1140,20 @@ mysql_declare_plugin(example)
   NULL                                          /* config options */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(example)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &example_storage_engine,
+  "EXAMPLE",
+  "Brian Aker, MySQL AB",
+  "Example storage engine",
+  PLUGIN_LICENSE_GPL,
+  example_init_func,                            /* Plugin Init */
+  example_done_func,                            /* Plugin Deinit */
+  0x0001 /* 0.1 */,
+  func_status,                                  /* status variables */
+  example_system_variables,                     /* system variables */
+  "0.1",                                        /* string version */
+  MariaDB_PLUGIN_MATURITY_EXPERIMENTAL          /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/example/ha_example.h b/storage/example/ha_example.h
index 12e088f5f05..27b97395544 100644
--- a/storage/example/ha_example.h
+++ b/storage/example/ha_example.h
@@ -248,9 +248,10 @@ public:
   ha_rows records_in_range(uint inx, key_range *min_key,
                            key_range *max_key);
   int delete_table(const char *from);
-  int rename_table(const char * from, const char * to);
   int create(const char *name, TABLE *form,
              HA_CREATE_INFO *create_info);                      ///< required
+  bool check_if_incompatible_data(HA_CREATE_INFO *info,
+                                  uint table_changes);
 
   THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
                              enum thr_lock_type lock_type);     ///< required
diff --git a/storage/federated/Makefile.am b/storage/federated/Makefile.am
index e07b0d95b97..28b71900af2 100644
--- a/storage/federated/Makefile.am
+++ b/storage/federated/Makefile.am
@@ -27,24 +27,22 @@ INCLUDES =              -I$(top_srcdir)/include -I$(top_builddir)/include \
 
 WRAPLIBS=
 
-LDADD =
-
 DEFS =                  @DEFS@
 
 noinst_HEADERS =	ha_federated.h
 
 EXTRA_LTLIBRARIES =	ha_federated.la
 pkgplugin_LTLIBRARIES =	@plugin_federated_shared_target@
-ha_federated_la_LDFLAGS =	-module -rpath $(pkgplugindir)
-ha_federated_la_CXXFLAGS=	$(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
-ha_federated_la_CFLAGS =	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
-ha_federated_la_SOURCES =	ha_federated.cc
+ha_federated_la_LDFLAGS =	-module -rpath $(pkgplugindir) \
+				-L$(top_builddir)/libservices -lmysqlservices
+ha_federated_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_federated_la_CFLAGS =	-shared $(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_federated_la_SOURCES =	ha_federated.cc $(top_srcdir)/mysys/string.c
 
 
 EXTRA_LIBRARIES =	libfederated.a
 noinst_LIBRARIES =	@plugin_federated_static_target@
 libfederated_a_CXXFLAGS =	$(AM_CXXFLAGS)
-libfederated_a_CFLAGS =	$(AM_CFLAGS)
 libfederated_a_SOURCES=	ha_federated.cc
 
 
diff --git a/storage/federated/README b/storage/federated/README
new file mode 100644
index 00000000000..1b521cb7859
--- /dev/null
+++ b/storage/federated/README
@@ -0,0 +1,7 @@
+The files in this directory are not used by MariaDB.
+
+MariaDB uses the new federated storage engine that can be found in the
+federatedx directory.
+
+This directory is only kept around to make it easy to merge code from the
+MySQL source repositories that use the old and disabled federated code.
diff --git a/storage/federated/ha_federated.cc b/storage/federated/ha_federated.cc
index b1ae276dce8..f38f71b15ee 100644
--- a/storage/federated/ha_federated.cc
+++ b/storage/federated/ha_federated.cc
@@ -2019,7 +2019,7 @@ int ha_federated::end_bulk_insert()
   int error= 0;
   DBUG_ENTER("ha_federated::end_bulk_insert");
   
-  if (bulk_insert.str && bulk_insert.length)
+  if (!table_will_be_deleted && bulk_insert.str && bulk_insert.length)
   {
     if (real_query(bulk_insert.str, bulk_insert.length))
       error= stash_remote_error();
@@ -2966,6 +2966,8 @@ int ha_federated::extra(ha_extra_function operation)
   case HA_EXTRA_INSERT_WITH_UPDATE:
     insert_dup_update= TRUE;
     break;
+  case HA_EXTRA_PREPARE_FOR_DROP:
+    table_will_be_deleted = TRUE;
   default:
     /* do nothing */
     DBUG_PRINT("info",("unhandled operation: %d", (uint) operation));
@@ -3366,6 +3368,7 @@ int ha_federated::external_lock(THD *thd, int lock_type)
     }
   }
 #endif /* XXX_SUPERCEDED_BY_WL2952 */
+  table_will_be_deleted = FALSE;
   DBUG_RETURN(error);
 }
 
@@ -3474,3 +3477,20 @@ mysql_declare_plugin(federated)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(federated)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &federated_storage_engine,
+  "FEDERATED",
+  "Patrick Galbraith and Brian Aker, MySQL AB",
+  "Federated MySQL storage engine",
+  PLUGIN_LICENSE_GPL,
+  federated_db_init, /* Plugin Init */
+  federated_done, /* Plugin Deinit */
+  0x0100 /* 1.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_BETA /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/federated/ha_federated.h b/storage/federated/ha_federated.h
index 0f4c0201bd7..be52a54be50 100644
--- a/storage/federated/ha_federated.h
+++ b/storage/federated/ha_federated.h
@@ -88,7 +88,7 @@ class ha_federated: public handler
     Array of all stored results we get during a query execution.
   */
   DYNAMIC_ARRAY results;
-  bool position_called;
+  bool position_called, table_will_be_deleted;
   uint fetch_num; // stores the fetch num
   MYSQL_ROW_OFFSET current_position;  // Current position used by ::position()
   int remote_error_number;
diff --git a/storage/federated/plug.in b/storage/federated/plug.in
index 23b607d699b..714888b2ebf 100644
--- a/storage/federated/plug.in
+++ b/storage/federated/plug.in
@@ -1,5 +1,5 @@
 MYSQL_STORAGE_ENGINE(federated,,[Federated Storage Engine],
-        [Connects to tables on remote MySQL servers], [max,max-no-ndb])
+        [Connects to tables on remote MySQL servers], [])
 MYSQL_PLUGIN_STATIC(federated,    [libfederated.a])
 MYSQL_PLUGIN_DYNAMIC(federated,   [ha_federated.la])
 MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(federated, [ha_federated.cc])
diff --git a/storage/federatedx/AUTHORS b/storage/federatedx/AUTHORS
new file mode 100644
index 00000000000..6314d2e4a3d
--- /dev/null
+++ b/storage/federatedx/AUTHORS
@@ -0,0 +1,11 @@
+FederatedX
+
+Patrick Galbraith <patg@patg.net> - Federated
+
+Pluggable Storage Engine Skeleton setup
+
+Brian Aker  <brian@mysql.com> | <brian@tangent.org> - Original Design
+Calvin Sun - Windows Support
+Brian Miezejewski - Bug fixes
+Antony T Curtis   - Help in initial development, transactions and various help
+Michael Widenius  - Bug fixes and some simple early optimizations
diff --git a/storage/federatedx/CMakeLists.txt b/storage/federatedx/CMakeLists.txt
new file mode 100644
index 00000000000..24d64585ddb
--- /dev/null
+++ b/storage/federatedx/CMakeLists.txt
@@ -0,0 +1,4 @@
+SET(FEDERATEDX_PLUGIN_STATIC  "federatedx")
+SET(FEDERATEDX_PLUGIN_DYNAMIC "ha_federatedx")
+SET(FEDERATEDX_SOURCES  ha_federatedx.cc federatedx_txn.cc federatedx_io.cc federatedx_io_null.cc federatedx_io_mysql.cc)
+MYSQL_ADD_PLUGIN(federatedx ${FEDERATEDX_SOURCES} STORAGE_ENGINE)
diff --git a/storage/federatedx/ChangeLog b/storage/federatedx/ChangeLog
new file mode 100644
index 00000000000..170321cc0b0
--- /dev/null
+++ b/storage/federatedx/ChangeLog
@@ -0,0 +1,18 @@
+0.2 -  Thu March 8 00:00:00 EST 2008
+
+  - Fixed bug #30051 "CREATE TABLE does not connect and check existence of remote table"
+    Modified "real_connect" to take a share and a create flag in order to not rely
+    on any settings that are later instantiated and/or set by get_share.
+    Also, put logic in the code to not attempt this if the host is localhost. There's an
+    annoying behavior where, if federated tries to connect to itself during create table, you
+    get a 1159 error (timeout) - only when local. This prevents having this functionality
+    and is probably part of the reason it was removed.
+
+0.1 -  Thu Feb 1 00:00:00 EST 2008
+
+  - This is the FederatedX Storage Engine, 
+    first release.
+  - Added documentation
+  - Added simple test and README file to explain
+    how to run the test
+  - Added FAQ
diff --git a/storage/federatedx/FAQ b/storage/federatedx/FAQ
new file mode 100644
index 00000000000..50def432009
--- /dev/null
+++ b/storage/federatedx/FAQ
@@ -0,0 +1,40 @@
+Q. What is the FederatedX pluggable storage engine?
+
+A. It is a fork of the Federated Storage Engine that Brian Aker and I
+(Patrick Galbraith) originally developed. It is a storage engine that
+uses a client connection to a remote MySQL data source as its data
+source instead of a local file on disk.
+
+Q. Why did you fork from Federated?
+
+A. To enhance the storage engine independently of the
+MySQL Server release schedule. Many people have been 
+mentioning their dissatisfaction with the limitations
+of Federated. I think the engine is a great concept and 
+have a sense of obligation to continue to improve it.
+There are some patches already that are in dire need
+of being applied and tested.
+
+Q. What do you plan to do with FederatedX?
+
+A. Many things need addressing:
+
+- Outstanding bugs
+- How to deal with huge result sets
+- Pushdown conditions (being able to pass things like LIMIT
+  to the remote connection to keep from returning huge
+  result sets).
+- Better transactional support
+- Other connection mechanisms (ODBC, JDBC, native drivers
+  of other RDBMSs)
+
+Q. What is FederatedX, and what is it not?
+
+A. FederatedX is not yet a complete "federated" solution in 
+   the sense that other vendors have developed (IBM, etc). It
+   is essentially a networked storage engine. It is my hope
+   to make it a real federated solution.
+
+Q. In which MySQL distributions/forks/branches can I find FederatedX?
+
+A. MariaDB (http://www.mariadb.com)
diff --git a/storage/federatedx/Makefile.am b/storage/federatedx/Makefile.am
new file mode 100644
index 00000000000..0e3249866ea
--- /dev/null
+++ b/storage/federatedx/Makefile.am
@@ -0,0 +1,63 @@
+# Used to build Makefile.in
+
+MYSQLDATAdir =          $(localstatedir)
+MYSQLSHAREdir =         $(pkgdatadir)
+MYSQLBASEdir=           $(prefix)
+MYSQLLIBdir=            $(pkglibdir)
+pkgplugindir =		$(pkglibdir)/plugin
+INCLUDES =              -I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(top_srcdir)/sql \
+                        -I$(srcdir)
+WRAPLIBS=
+
+LDADD =
+
+DEFS =                  @DEFS@
+
+noinst_HEADERS =	ha_federatedx.h federatedx_probes.h
+
+EXTRA_LTLIBRARIES =	ha_federatedx.la
+pkgplugin_LTLIBRARIES =	@plugin_federatedx_shared_target@
+ha_federatedx_la_LDFLAGS =	-module -rpath $(pkgplugindir) \
+				-L$(top_builddir)/libservices -lmysqlservices
+ha_federatedx_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_federatedx_la_CFLAGS =	-shared $(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+
+
+EXTRA_LIBRARIES =	libfederatedx.a
+noinst_LIBRARIES =	@plugin_federatedx_static_target@
+libfederatedx_a_CXXFLAGS =	$(AM_CXXFLAGS)
+libfederatedx_a_SOURCES=	ha_federatedx.cc federatedx_txn.cc \
+			        federatedx_io.cc federatedx_io_null.cc \
+			        federatedx_io_mysql.cc
+
+EXTRA_DIST =		CMakeLists.txt plug.in ha_federatedx.h \
+			federatedx_probes.h
+
+ha_federatedx_la_SOURCES = ha_federatedx.cc federatedx_txn.cc \
+			   federatedx_io.cc federatedx_io_null.cc \
+			   federatedx_io_mysql.cc $(top_srcdir)/mysys/string.c
+ha_federatedx_la_LIBADD =
+
+#DTRACE =                @DTRACE@
+#DTRACEFLAGS =           @DTRACEFLAGS@
+#DTRACEFILES =           .libs/libfederatedx_engine_la-ha_federatedx.o
+
+# #if HAVE_DTRACE
+# #  libfederatedx_engine_la_LIBADD += federatedx_probes.o
+# #endif
+
+# federatedx_probes.h: federatedx_probes.d
+#	$(DTRACE) $(DTRACEFLAGS) -h -s federatedx_probes.d
+#	mv federatedx_probes.h federatedx_probes.h.bak
+#	sed "s/#include <unistd.h>//g" federatedx_probes.h.bak > federatedx_probes.h
+#	rm federatedx_probes.h.bak
+
+#federatedx_probes.o:
+#	$(DTRACE) $(DTRACEFLAGS) -G -s federatedx_probes.d $(DTRACEFILES)
+
+# End
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/federatedx/README b/storage/federatedx/README
new file mode 100644
index 00000000000..6618527c08a
--- /dev/null
+++ b/storage/federatedx/README
@@ -0,0 +1,33 @@
+This is the FederatedX Storage Engine, developed as an external storage engine.
+
+NOTE:
+
+The following is only relevant if you use it for MySQL.  MariaDB already comes
+with the latest version of FederatedX.
+
+To install, grab a copy of the mysql source code and run this:
+
+./configure --with-mysql=/path/to/src/mysql-5.x --libdir=/usr/local/lib/mysql/
+
+make install
+
+And then inside of MySQL:
+
+mysql> INSTALL PLUGIN federatedx SONAME 'libfederatedx_engine.so';
+
+mysql> CREATE TABLE `d` (`a` varchar(125), b text, primary key(a)) ENGINE=FEDERATEDX CONNECTION="mysql://root@host/schema/table"
+
+or 
+
+mysql> CREATE TABLE `d` (`a` varchar(125), b text, primary key(a)) ENGINE=FEDERATEDX CONNECTION="server" CHARSET=latin1;
+
+You will probably need to edit the Makefile.am in the src/ tree if you want
+to build on anything other than Linux (and the Makefile assumes that the
+server was not compiled for debug). The reason for the two possible
+configure lines is that libdir is dependent on where MySQL was installed. If
+you run the "INSTALL PLUGIN ..." and you get a file not found, check that
+you configured this directory correctly.
+
+For Solaris you can enable DTrace probes by adding to configure
+--enable-dtrace
+
diff --git a/storage/federatedx/README.windows b/storage/federatedx/README.windows
new file mode 100644
index 00000000000..3f1f2a3c79a
--- /dev/null
+++ b/storage/federatedx/README.windows
@@ -0,0 +1,23 @@
+The following files are changed in order to build a new engine on Windows:
+
+- Update win\configure.js with
+case "WITH_FEDERATEDX_STORAGE_ENGINE":
+to make sure it will pass WITH_FEDERATEDX_STORAGE_ENGINE in.
+
+- Update CMakeLists.txt under mysql root:
+  IF(WITH_FEDERATEDX_STORAGE_ENGINE)
+      ADD_DEFINITIONS(-D WITH_FEDERATEDX_STORAGE_ENGINE)
+  SET (mysql_plugin_defs
+      "${mysql_plugin_defs},builtin_skeleton_plugin")
+  ENDIF(WITH_FEDERATEDX_STORAGE_ENGINE)
+
+  and,
+
+  IF(WITH_FEDERATEDX_STORAGE_ENGINE)
+    ADD_SUBDIRECTORY(storage/skeleton/src)
+  ENDIF(WITH_FEDERATEDX_STORAGE_ENGINE)
+
+  - Update CMakeLists.txt under sql:
+  IF(WITH_FEDERATEDX_STORAGE_ENGINE)
+    TARGET_LINK_LIBRARIES(mysqld skeleton)
+  ENDIF(WITH_FEDERATEDX_STORAGE_ENGINE)
diff --git a/storage/federatedx/TODO b/storage/federatedx/TODO
new file mode 100644
index 00000000000..71330742f4e
--- /dev/null
+++ b/storage/federatedx/TODO
@@ -0,0 +1,30 @@
+Features
+
+* Add Pushdown conditions
+* Add other network driver interfaces
+* Handle large result sets
+* Auto-discovery of tables on foreign data sources
+
+Bugs (http://bugs.mysql.com)
+
+20026 2006-05-23 FEDERATED lacks support for auto_increment_increment and auto_increment_offset   
+20724 2006-06-27 FEDERATED does not honour SET INSERT_ID    
+28269 2007-05-06 Any FEDERATED engine fails to quote reserved words for field names
+25509 2007-01-10 Federated: Failure with non-ASCII characters   
+26697 2007-02-27 Every query to a federated table results in a full scan of MyISAM table.
+21360 2006-07-31 Microsoft Windows (Windows/Linux) mysqldump error on federated tables    
+34189 2008-01-31 Any ALTER TABLE t1 ENGINE=FEDERATED CONNECTION='connectionString' on MyISAM fails    
+31757 2007-10-22 Any Federated tables break replication  Antony Curtis
+33953 2008-01-21 Any mysqld dies on search federated table using nullable index with < or <= operator
+34015 2008-01-23 Linux Problems with float fields using federated tables
+21583 2006-08-11 Linux (Linux) Federated table returns broken strings.    
+33702 2008-01-05 Accessing a federated table with a non existing server returns random error code   
+25512 2007-01-10 Federated: CREATE failures   
+32426 2007-11-16 Any FEDERATED query returns corrupt results for ORDER BY on a TEXT field 
+25510 2007-01-10 Federated: double trigger activation   
+33250 2007-12-14 SELECT * FROM really_big_federated_table eats lots of virtual memory (OOM)   
+14874 2005-11-11 Error 2013: Lost connection to MySQL server with Federated table   
+25508 2007-01-10 Federated: Failure to Remove Partitioning    
+27180 2007-03-15 #1030 - Got error 1 from storage engine with big tables
+33947 2008-01-20 Any Join on Federated tables with Unique index and IS NOT NULL crashes server
+30051 (fixed) CREATE TABLE does not connect and check existence of remote table
diff --git a/storage/federatedx/federatedx_io.cc b/storage/federatedx/federatedx_io.cc
new file mode 100644
index 00000000000..10023bec35b
--- /dev/null
+++ b/storage/federatedx/federatedx_io.cc
@@ -0,0 +1,103 @@
+/* 
+Copyright (c) 2007, Antony T Curtis
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Neither the name of FederatedX nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+/*#define MYSQL_SERVER 1*/
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+
+#include "ha_federatedx.h"
+
+#include "m_string.h"
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+typedef federatedx_io *(*instantiate_io_type)(MEM_ROOT *server_root,
+                                              FEDERATEDX_SERVER *server);
+struct io_schemes_st
+{
+  const char *scheme;               /* scheme name, compared ignoring case */
+  instantiate_io_type instantiate;  /* factory called for that scheme */
+};
+
+/* Lookup table used by handles_scheme() and construct() below. */
+static const io_schemes_st federated_io_schemes[] =
+{
+  { "mysql", &instantiate_io_mysql },
+  { "null", instantiate_io_null } /* must be last element */
+};
+
+const uint federated_io_schemes_count= array_elements(federated_io_schemes);
+
+federatedx_io::federatedx_io(FEDERATEDX_SERVER *aserver)
+  : server(aserver), owner_ptr(0), txn_next(0), idle_next(0),
+    active(FALSE), busy(FALSE), readonly(TRUE)
+{
+  DBUG_ENTER("federatedx_io::federatedx_io");
+  DBUG_ASSERT(server);
+  /* server->mutex must be owned here; the destructor undoes io_count++. */
+  safe_mutex_assert_owner(&server->mutex);
+  server->io_count++;
+
+  DBUG_VOID_RETURN;
+}
+
+
+federatedx_io::~federatedx_io()
+{
+  DBUG_ENTER("federatedx_io::~federatedx_io");
+  /* Undo the io_count increment made by the constructor. */
+  server->io_count--;
+
+  DBUG_VOID_RETURN;
+}
+
+
+bool federatedx_io::handles_scheme(const char *scheme)
+{
+  /* TRUE when the name matches any table entry, ignoring case. */
+  for (uint i= 0; i < array_elements(federated_io_schemes); i++)
+    if (!strcasecmp(scheme, federated_io_schemes[i].scheme))
+      return TRUE;
+  return FALSE;
+}
+
+
+federatedx_io *federatedx_io::construct(MEM_ROOT *server_root,
+                                        FEDERATEDX_SERVER *server)
+{
+  uint idx= 0, fallback= array_elements(federated_io_schemes) - 1;
+  while (idx < fallback &&
+         strcasecmp(server->scheme, federated_io_schemes[idx].scheme))
+    idx++;       /* an unknown scheme ends on the final ("null") entry */
+  return federated_io_schemes[idx].instantiate(server_root, server);
+}
+
+
diff --git a/storage/federatedx/federatedx_io_mysql.cc b/storage/federatedx/federatedx_io_mysql.cc
new file mode 100644
index 00000000000..d6844fab2c6
--- /dev/null
+++ b/storage/federatedx/federatedx_io_mysql.cc
@@ -0,0 +1,645 @@
+/*
+Copyright (c) 2007, Antony T Curtis
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Neither the name of FederatedX nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+/*#define MYSQL_SERVER 1*/
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+
+#include "ha_federatedx.h"
+
+#include "m_string.h"
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+
+#define SAVEPOINT_REALIZED  1
+#define SAVEPOINT_RESTRICT  2
+#define SAVEPOINT_EMITTED 4
+
+
+typedef struct federatedx_savepoint
+{
+  ulong level;
+  uint  flags;
+} SAVEPT;
+
+struct mysql_position
+{
+  MYSQL_RES* result;
+  MYSQL_ROW_OFFSET offset;
+};
+
+
+class federatedx_io_mysql :public federatedx_io
+{
+  MYSQL mysql; /* MySQL connection */
+  DYNAMIC_ARRAY savepoints;
+  bool requested_autocommit;
+  bool actual_autocommit;
+
+  int actual_query(const char *buffer, uint length);
+  bool test_all_restrict() const;
+public:
+  federatedx_io_mysql(FEDERATEDX_SERVER *);
+  ~federatedx_io_mysql();
+
+  int simple_query(const char *fmt, ...);
+  int query(const char *buffer, uint length);
+  virtual FEDERATEDX_IO_RESULT *store_result();
+
+  virtual size_t max_query_size() const;
+
+  virtual my_ulonglong affected_rows() const;
+  virtual my_ulonglong last_insert_id() const;
+
+  virtual int error_code();
+  virtual const char *error_str();
+
+  void reset();
+  int commit();
+  int rollback();
+
+  int savepoint_set(ulong sp);
+  ulong savepoint_release(ulong sp);
+  ulong savepoint_rollback(ulong sp);
+  void savepoint_restrict(ulong sp);
+
+  ulong last_savepoint() const;
+  ulong actual_savepoint() const;
+  bool is_autocommit() const;
+
+  bool table_metadata(ha_statistics *stats, const char *table_name,
+                      uint table_name_length, uint flag);
+
+  /* resultset operations */
+
+  virtual void free_result(FEDERATEDX_IO_RESULT *io_result);
+  virtual unsigned int get_num_fields(FEDERATEDX_IO_RESULT *io_result);
+  virtual my_ulonglong get_num_rows(FEDERATEDX_IO_RESULT *io_result);
+  virtual FEDERATEDX_IO_ROW *fetch_row(FEDERATEDX_IO_RESULT *io_result);
+  virtual ulong *fetch_lengths(FEDERATEDX_IO_RESULT *io_result);
+  virtual const char *get_column_data(FEDERATEDX_IO_ROW *row,
+                                      unsigned int column);
+  virtual bool is_column_null(const FEDERATEDX_IO_ROW *row,
+                              unsigned int column) const;
+
+  virtual size_t get_ref_length() const;
+  virtual void mark_position(FEDERATEDX_IO_RESULT *io_result,
+                             void *ref);
+  virtual int seek_position(FEDERATEDX_IO_RESULT **io_result,
+                            const void *ref);
+};
+
+
+federatedx_io *instantiate_io_mysql(MEM_ROOT *server_root,
+                                    FEDERATEDX_SERVER *server)
+{
+  return new (server_root) federatedx_io_mysql(server);
+}
+
+
+federatedx_io_mysql::federatedx_io_mysql(FEDERATEDX_SERVER *aserver)
+  : federatedx_io(aserver),
+    requested_autocommit(TRUE), actual_autocommit(TRUE)
+{
+  DBUG_ENTER("federatedx_io_mysql::federatedx_io_mysql");
+  /* No connection is opened here; actual_query() connects on demand. */
+  bzero(&mysql, sizeof(MYSQL));
+  bzero(&savepoints, sizeof(DYNAMIC_ARRAY));
+  /* Savepoint stack holding SAVEPT entries. */
+  my_init_dynamic_array(&savepoints, sizeof(SAVEPT), 16, 16);  
+  
+  DBUG_VOID_RETURN;
+}
+
+
+federatedx_io_mysql::~federatedx_io_mysql()
+{
+  DBUG_ENTER("federatedx_io_mysql::~federatedx_io_mysql");
+  /* Close the connection handle, then free the savepoint array. */
+  mysql_close(&mysql);
+  delete_dynamic(&savepoints);
+
+  DBUG_VOID_RETURN;
+}
+
+
+void federatedx_io_mysql::reset()
+{
+  reset_dynamic(&savepoints);
+  set_active(FALSE);
+  /* Back to idle defaults: autocommit requested, reconnect allowed. */
+  requested_autocommit= TRUE;
+  mysql.reconnect= 1;
+}
+
+
+int federatedx_io_mysql::commit()
+{
+  int rc= 0;
+  DBUG_ENTER("federatedx_io_mysql::commit");
+
+  /* Nothing is sent in autocommit mode; a failed COMMIT is rolled back. */
+  if (!actual_autocommit)
+    rc= actual_query("COMMIT", 6);
+  if (rc)
+    rollback();
+  reset();
+  DBUG_RETURN(rc);
+}
+
+int federatedx_io_mysql::rollback()
+{
+  int rc;
+  DBUG_ENTER("federatedx_io_mysql::rollback");
+
+  /*
+    With actual_autocommit set no ROLLBACK is sent; the caller gets
+    ER_WARNING_NOT_COMPLETE_ROLLBACK instead.  State is reset either way.
+  */
+  rc= actual_autocommit ? ER_WARNING_NOT_COMPLETE_ROLLBACK
+                        : actual_query("ROLLBACK", 8);
+  reset();
+  DBUG_RETURN(rc);
+}
+
+
+ulong federatedx_io_mysql::last_savepoint() const
+{
+  DBUG_ENTER("federatedx_io_mysql::last_savepoint");
+
+  /* Level of the newest savepoint on the stack; 0 when the stack is empty. */
+  if (!savepoints.elements)
+    DBUG_RETURN(0);
+  DBUG_RETURN(dynamic_element(&savepoints, savepoints.elements - 1,
+                              SAVEPT *)->level);
+}
+
+
+ulong federatedx_io_mysql::actual_savepoint() const
+{
+  SAVEPT *savept= NULL;
+  uint index= savepoints.elements;
+  DBUG_ENTER("federatedx_io_mysql::actual_savepoint");
+  /* Level of the newest REALIZED savepoint, or 0; scans newest to oldest. */
+  while (index)
+  {
+    savept= dynamic_element(&savepoints, --index, SAVEPT *);
+    if (savept->flags & SAVEPOINT_REALIZED)
+      break;
+    savept= NULL;               /* not realized: keep looking further down */
+  }
+
+  DBUG_RETURN(savept ? savept->level : 0);
+}
+
+bool federatedx_io_mysql::is_autocommit() const
+{
+  return actual_autocommit;
+}
+
+
+int federatedx_io_mysql::savepoint_set(ulong sp)
+{
+  int error;
+  SAVEPT savept;
+  DBUG_ENTER("federatedx_io_mysql::savepoint_set");
+  DBUG_PRINT("info",("savepoint=%lu", sp));
+  DBUG_ASSERT(sp > last_savepoint());
+  /* Push level sp with no flags; no statement is sent from this function. */
+  savept.level= sp;
+  savept.flags= 0;
+  /* A non-zero insert_dynamic() result is reported to the caller as -1. */
+  if ((error= insert_dynamic(&savepoints, (uchar*) &savept) ? -1 : 0))
+    goto err;
+  /* Savepoint pending: mark active, forbid reconnect, drop autocommit. */
+  set_active(TRUE);
+  mysql.reconnect= 0;
+  requested_autocommit= FALSE;
+
+err:
+  DBUG_RETURN(error);
+}
+
+
+ulong federatedx_io_mysql::savepoint_release(ulong sp)
+{
+  SAVEPT *release_pt= NULL;
+  DBUG_ENTER("federatedx_io_mysql::savepoint_release");
+  DBUG_PRINT("info",("savepoint=%lu", sp));
+  /* Pop levels >= sp; keep the last REALIZED, non-RESTRICT one popped. */
+  for (; savepoints.elements; savepoints.elements--)
+  {
+    SAVEPT *top= dynamic_element(&savepoints, savepoints.elements - 1,
+                                 SAVEPT *);
+    if (top->level < sp)
+      break;
+    if ((top->flags & SAVEPOINT_REALIZED) &&
+        !(top->flags & SAVEPOINT_RESTRICT))
+      release_pt= top;
+  }
+
+  if (release_pt)
+  {
+    char stmt[STRING_BUFFER_USUAL_SIZE];
+    int stmt_len= my_snprintf(stmt, sizeof(stmt),
+                              "RELEASE SAVEPOINT save%lu", release_pt->level);
+    actual_query(stmt, stmt_len);
+  }
+
+  DBUG_RETURN(last_savepoint()); 
+}
+
+
+ulong federatedx_io_mysql::savepoint_rollback(ulong sp)
+{
+  SAVEPT *savept;
+  uint index;
+  DBUG_ENTER("federatedx_io_mysql::savepoint_rollback");
+  DBUG_PRINT("info",("savepoint=%lu", sp));
+  /* Discard savepoints above sp; an entry at level sp stays on the stack. */
+  while (savepoints.elements)
+  {
+    savept= dynamic_element(&savepoints, savepoints.elements - 1, SAVEPT *);
+    if (savept->level <= sp)
+      break;
+    savepoints.elements--;
+  }
+  /* Find the newest remaining savepoint that is REALIZED, if any. */
+  for (index= savepoints.elements, savept= NULL; index;)
+  {
+    savept= dynamic_element(&savepoints, --index, SAVEPT *);
+    if (savept->flags & SAVEPOINT_REALIZED)
+      break;
+    savept= NULL;
+  }
+  /* No statement is sent when that savepoint is flagged RESTRICT. */
+  if (savept && !(savept->flags & SAVEPOINT_RESTRICT))
+  {
+    char buffer[STRING_BUFFER_USUAL_SIZE];
+    int length= my_snprintf(buffer, sizeof(buffer),
+                            "ROLLBACK TO SAVEPOINT save%lu", savept->level);
+    actual_query(buffer, length);
+  }
+
+  DBUG_RETURN(last_savepoint());
+}
+
+
+void federatedx_io_mysql::savepoint_restrict(ulong sp)
+{
+  DBUG_ENTER("federatedx_io_mysql::savepoint_restrict");
+
+  /* Scan from the newest savepoint towards the oldest. */
+  for (uint pos= savepoints.elements; pos > 0; pos--)
+  {
+    SAVEPT *entry= dynamic_element(&savepoints, pos - 1, SAVEPT *);
+    if (entry->level > sp)
+      continue;
+    /* First entry at or below sp: flag it only on an exact match. */
+    if (entry->level == sp)
+      entry->flags|= SAVEPOINT_RESTRICT;
+    break;
+  }
+
+
+  DBUG_VOID_RETURN;
+}
+
+
+int federatedx_io_mysql::simple_query(const char *fmt, ...)
+{
+  char buffer[STRING_BUFFER_USUAL_SIZE];
+  int length, error;
+  va_list arg;
+  DBUG_ENTER("federatedx_io_mysql::simple_query");
+  /* Format into the fixed-size stack buffer, then run it through query(). */
+  va_start(arg, fmt);  
+  length= my_vsnprintf(buffer, sizeof(buffer), fmt, arg);
+  va_end(arg);
+  
+  error= query(buffer, length);
+  
+  DBUG_RETURN(error);
+}
+
+
+bool federatedx_io_mysql::test_all_restrict() const
+{
+  bool saw_restrict= FALSE;
+  DBUG_ENTER("federatedx_io_mysql::test_all_restrict");
+  /*
+    FALSE as soon as any savepoint is EMITTED, or REALIZED without being
+    RESTRICTed; otherwise TRUE if at least one savepoint is RESTRICTed.
+  */
+  for (uint pos= savepoints.elements; pos > 0; pos--)
+  {
+    const uint flags= dynamic_element(&savepoints, pos - 1, SAVEPT *)->flags;
+    const bool restricted= (flags & SAVEPOINT_RESTRICT) != 0;
+    if ((flags & SAVEPOINT_EMITTED) ||
+        ((flags & SAVEPOINT_REALIZED) && !restricted))
+      DBUG_RETURN(FALSE);
+    saw_restrict= saw_restrict || restricted;
+  }
+
+  DBUG_RETURN(saw_restrict);
+}
+
+
+int federatedx_io_mysql::query(const char *buffer, uint length)
+{
+  int error;
+  bool wants_autocommit= requested_autocommit | is_readonly();
+  DBUG_ENTER("federatedx_io_mysql::query");
+  /* Read-only use, or test_all_restrict() being TRUE, selects autocommit. */
+  if (!wants_autocommit && test_all_restrict())
+    wants_autocommit= TRUE;
+  /* Send SET AUTOCOMMIT only when the wanted mode differs from the actual. */
+  if (wants_autocommit != actual_autocommit)
+  {
+    if ((error= actual_query(wants_autocommit ? "SET AUTOCOMMIT=1"
+                                            : "SET AUTOCOMMIT=0", 16)))
+    DBUG_RETURN(error);                         
+    mysql.reconnect= wants_autocommit ? 1 : 0;
+    actual_autocommit= wants_autocommit;
+  }
+  /* Newest savepoint not yet REALIZED: send SAVEPOINT unless RESTRICTed. */
+  if (!actual_autocommit && last_savepoint() != actual_savepoint())
+  {
+    SAVEPT *savept= dynamic_element(&savepoints, savepoints.elements - 1, 
+                                SAVEPT *);
+    if (!(savept->flags & SAVEPOINT_RESTRICT))
+  {
+      char buf[STRING_BUFFER_USUAL_SIZE];
+    int len= my_snprintf(buf, sizeof(buf),
+                  "SAVEPOINT save%lu", savept->level);
+      if ((error= actual_query(buf, len)))
+    DBUG_RETURN(error);                         
+    set_active(TRUE);
+    savept->flags|= SAVEPOINT_EMITTED;
+    }
+    savept->flags|= SAVEPOINT_REALIZED;
+  }
+  /* On success outside autocommit the connection is marked active. */
+  if (!(error= actual_query(buffer, length)))
+    set_active(is_active() || !actual_autocommit);
+
+  DBUG_RETURN(error);
+}
+
+
+int federatedx_io_mysql::actual_query(const char *buffer, uint length)
+{
+  int error;
+  DBUG_ENTER("federatedx_io_mysql::actual_query");
+  /* Connect on demand. NOTE(review): assumes mysql_init() sets mysql.master. */
+  if (!mysql.master)
+  {
+    if (!(mysql_init(&mysql)))
+    DBUG_RETURN(-1);
+  
+    /*
+	BUG# 17044 Federated Storage Engine is not UTF8 clean
+	Add set names to whatever charset the table is at open
+	of table
+    */
+    /* this sets the csname like 'set names utf8' */
+    mysql_options(&mysql, MYSQL_SET_CHARSET_NAME, get_charsetname());
+
+    if (!mysql_real_connect(&mysql,
+                            get_hostname(),
+                            get_username(),
+                            get_password(),
+                            get_database(),
+                            get_port(),
+                            get_socket(), 0))
+      DBUG_RETURN(ER_CONNECT_TO_FOREIGN_DATA_SOURCE);
+    mysql.reconnect= 1;
+  }
+  /* Send the statement; mysql_real_query()'s status goes to the caller. */
+  error= mysql_real_query(&mysql, buffer, length);
+  
+  DBUG_RETURN(error);
+}
+
+size_t federatedx_io_mysql::max_query_size() const
+{
+  return mysql.net.max_packet_size;
+}
+
+
+my_ulonglong federatedx_io_mysql::affected_rows() const
+{
+  return mysql.affected_rows;
+}
+
+
+my_ulonglong federatedx_io_mysql::last_insert_id() const
+{
+  return mysql.last_used_con->insert_id;
+}
+
+
+int federatedx_io_mysql::error_code()
+{
+  return mysql_errno(&mysql);
+}
+
+
+const char *federatedx_io_mysql::error_str()
+{
+  return mysql_error(&mysql);
+}
+
+FEDERATEDX_IO_RESULT *federatedx_io_mysql::store_result()
+{
+  FEDERATEDX_IO_RESULT *result;
+  DBUG_ENTER("federatedx_io_mysql::store_result");
+
+  result= (FEDERATEDX_IO_RESULT *) mysql_store_result(&mysql);
+
+  DBUG_RETURN(result);
+}
+
+
+void federatedx_io_mysql::free_result(FEDERATEDX_IO_RESULT *io_result)
+{
+  mysql_free_result((MYSQL_RES *) io_result);
+}
+
+
+unsigned int federatedx_io_mysql::get_num_fields(FEDERATEDX_IO_RESULT *io_result)
+{
+  return mysql_num_fields((MYSQL_RES *) io_result);
+}
+
+
+my_ulonglong federatedx_io_mysql::get_num_rows(FEDERATEDX_IO_RESULT *io_result)
+{
+  return mysql_num_rows((MYSQL_RES *) io_result);
+}
+
+
+FEDERATEDX_IO_ROW *federatedx_io_mysql::fetch_row(FEDERATEDX_IO_RESULT *io_result)
+{
+  return (FEDERATEDX_IO_ROW *) mysql_fetch_row((MYSQL_RES *) io_result);
+}
+
+
+ulong *federatedx_io_mysql::fetch_lengths(FEDERATEDX_IO_RESULT *io_result)
+{
+  return mysql_fetch_lengths((MYSQL_RES *) io_result);
+}
+
+
+const char *federatedx_io_mysql::get_column_data(FEDERATEDX_IO_ROW *row,
+                                                 unsigned int column)
+{
+  return ((MYSQL_ROW)row)[column];
+}
+
+
+bool federatedx_io_mysql::is_column_null(const FEDERATEDX_IO_ROW *row,
+                                         unsigned int column) const
+{
+  return !((MYSQL_ROW)row)[column];
+}
+
+bool federatedx_io_mysql::table_metadata(ha_statistics *stats,
+                                         const char *table_name,
+                                         uint table_name_length, uint flag)
+{
+  char status_buf[FEDERATEDX_QUERY_BUFFER_SIZE];
+  FEDERATEDX_IO_RESULT *result= 0;
+  FEDERATEDX_IO_ROW *row;
+  String status_query_string(status_buf, sizeof(status_buf), &my_charset_bin);
+  int error;
+  /* Fills *stats from SHOW TABLE STATUS; returns 0 on success, 1 on error. */
+  status_query_string.length(0);
+  status_query_string.append(STRING_WITH_LEN("SHOW TABLE STATUS LIKE "));
+  append_ident(&status_query_string, table_name,
+               table_name_length, value_quote_char);
+
+  if (query(status_query_string.ptr(), status_query_string.length()))
+    goto error;
+
+  status_query_string.length(0);
+
+  result= store_result();
+
+  /*
+    We're going to use fields num. 4, 5, 12 and 13 of the resultset,
+    so make sure we have these fields.
+  */
+  if (!result || (get_num_fields(result) < 14))
+    goto error;
+
+  if (!get_num_rows(result))
+    goto error;
+
+  if (!(row= fetch_row(result)))
+    goto error;
+
+  /*
+    deleted is set in ha_federatedx::info
+  */
+  /*
+    need to figure out what this means as far as federatedx is concerned,
+    since we don't have a "file"
+
+    data_file_length = ?
+    index_file_length = ?
+    delete_length = ?
+  */
+  if (!is_column_null(row, 4))
+    stats->records= (ha_rows) my_strtoll10(get_column_data(row, 4),
+	                                   (char**) 0, &error);
+  if (!is_column_null(row, 5))
+    stats->mean_rec_length= (ulong) my_strtoll10(get_column_data(row, 5),
+	                                         (char**) 0, &error);
+
+  stats->data_file_length= stats->records * stats->mean_rec_length;
+  /* NOTE(review): 12/13 look like datetime text (Update/Check_time) - confirm. */
+  if (!is_column_null(row, 12))
+    stats->update_time= (time_t) my_strtoll10(get_column_data(row, 12),
+	                                      (char**) 0, &error);
+  if (!is_column_null(row, 13))
+    stats->check_time= (time_t) my_strtoll10(get_column_data(row, 13),
+	                                     (char**) 0, &error);
+
+  free_result(result);
+  return 0;
+  /* result can still be 0 here (query() failed or store_result() gave 0). */
+error:
+  free_result(result);
+  return 1;
+}
+
+
+
+size_t federatedx_io_mysql::get_ref_length() const
+{
+  return sizeof(mysql_position);
+}
+
+
+void federatedx_io_mysql::mark_position(FEDERATEDX_IO_RESULT *io_result,
+                                        void *ref)
+{
+  MYSQL_ROWS *tmp= 0;
+  mysql_position& pos= *reinterpret_cast<mysql_position*>(ref);
+  pos.result= (MYSQL_RES *) io_result;
+  /* Find the row whose successor is data_cursor; 0 when there is none. */
+  if (pos.result && pos.result->data)
+  {
+    for (tmp= pos.result->data->data;
+         tmp && (tmp->next != pos.result->data_cursor);
+         tmp= tmp->next)
+    {}
+  }
+  /* ref now holds the result set pointer plus that row (or 0). */
+  pos.offset= tmp;
+}
+
+int federatedx_io_mysql::seek_position(FEDERATEDX_IO_RESULT **io_result,
+                                       const void *ref)
+{
+  const mysql_position& pos= *reinterpret_cast<const mysql_position*>(ref);
+  /* ref is read as a mysql_position, the layout mark_position() writes. */
+  if (!pos.result || !pos.offset)
+    return HA_ERR_END_OF_FILE;
+  /* Point the cursor at the saved row and hand the result set back. */
+  pos.result->current_row= 0;
+  pos.result->data_cursor= pos.offset;
+  *io_result= (FEDERATEDX_IO_RESULT*) pos.result;
+
+  return 0;
+}
+
diff --git a/storage/federatedx/federatedx_io_null.cc b/storage/federatedx/federatedx_io_null.cc
new file mode 100644
index 00000000000..49f93ab6546
--- /dev/null
+++ b/storage/federatedx/federatedx_io_null.cc
@@ -0,0 +1,299 @@
+/* 
+Copyright (c) 2007, Antony T Curtis
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Neither the name of FederatedX nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+/*#define MYSQL_SERVER 1*/
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+
+#include "ha_federatedx.h"
+
+#include "m_string.h"
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+
+#define SAVEPOINT_REALIZED  1
+#define SAVEPOINT_RESTRICT  2
+#define SAVEPOINT_EMITTED 4
+
+
+typedef struct federatedx_savepoint
+{
+  ulong level;
+  uint  flags;
+} SAVEPT;
+
+
+class federatedx_io_null :public federatedx_io
+{
+public:
+  federatedx_io_null(FEDERATEDX_SERVER *);
+  ~federatedx_io_null();
+
+  int query(const char *buffer, uint length);
+  virtual FEDERATEDX_IO_RESULT *store_result();
+
+  virtual size_t max_query_size() const;
+
+  virtual my_ulonglong affected_rows() const;
+  virtual my_ulonglong last_insert_id() const;
+
+  virtual int error_code();
+  virtual const char *error_str();
+  
+  void reset();
+  int commit();
+  int rollback();
+  
+  int savepoint_set(ulong sp);
+  ulong savepoint_release(ulong sp);
+  ulong savepoint_rollback(ulong sp);
+  void savepoint_restrict(ulong sp);
+  
+  ulong last_savepoint() const;
+  ulong actual_savepoint() const;
+  bool is_autocommit() const;
+
+  bool table_metadata(ha_statistics *stats, const char *table_name,
+                      uint table_name_length, uint flag);
+  
+  /* resultset operations */
+  
+  virtual void free_result(FEDERATEDX_IO_RESULT *io_result);
+  virtual unsigned int get_num_fields(FEDERATEDX_IO_RESULT *io_result);
+  virtual my_ulonglong get_num_rows(FEDERATEDX_IO_RESULT *io_result);
+  virtual FEDERATEDX_IO_ROW *fetch_row(FEDERATEDX_IO_RESULT *io_result);
+  virtual ulong *fetch_lengths(FEDERATEDX_IO_RESULT *io_result);
+  virtual const char *get_column_data(FEDERATEDX_IO_ROW *row,
+                                      unsigned int column);
+  virtual bool is_column_null(const FEDERATEDX_IO_ROW *row,
+                              unsigned int column) const;
+  virtual size_t get_ref_length() const;
+  virtual void mark_position(FEDERATEDX_IO_RESULT *io_result,
+                             void *ref);
+  virtual int seek_position(FEDERATEDX_IO_RESULT **io_result,
+                            const void *ref);
+};
+
+
+federatedx_io *instantiate_io_null(MEM_ROOT *server_root,
+                                   FEDERATEDX_SERVER *server)
+{
+  return new (server_root) federatedx_io_null(server);
+}
+
+
+federatedx_io_null::federatedx_io_null(FEDERATEDX_SERVER *aserver)
+  : federatedx_io(aserver)
+{
+}
+
+
+federatedx_io_null::~federatedx_io_null()
+{
+}
+
+
+void federatedx_io_null::reset()
+{
+}
+
+
+int federatedx_io_null::commit()
+{
+  return 0;
+}
+
+int federatedx_io_null::rollback()
+{
+  return 0;
+}
+
+
+ulong federatedx_io_null::last_savepoint() const
+{
+  return 0;
+}
+
+
+ulong federatedx_io_null::actual_savepoint() const
+{
+  return 0;
+}
+
+bool federatedx_io_null::is_autocommit() const
+{
+  return FALSE;   /* the null driver always reports autocommit as off */
+}
+
+
+int federatedx_io_null::savepoint_set(ulong sp)
+{
+  return 0;
+}
+
+
+ulong federatedx_io_null::savepoint_release(ulong sp)
+{
+  return 0;
+}
+
+
+ulong federatedx_io_null::savepoint_rollback(ulong sp)
+{
+  return 0;
+}
+
+
+void federatedx_io_null::savepoint_restrict(ulong sp)
+{
+}
+
+
+int federatedx_io_null::query(const char *buffer, uint length)
+{
+  return 0;
+}
+
+
+size_t federatedx_io_null::max_query_size() const
+{
+  return INT_MAX;
+}
+
+
+my_ulonglong federatedx_io_null::affected_rows() const
+{
+  return 0;
+}
+
+
+my_ulonglong federatedx_io_null::last_insert_id() const
+{
+  return 0;
+}
+
+
+int federatedx_io_null::error_code()
+{
+  return 0;
+}
+
+
+const char *federatedx_io_null::error_str()
+{
+  return "";
+}
+
+
+FEDERATEDX_IO_RESULT *federatedx_io_null::store_result()
+{
+  DBUG_ENTER("federatedx_io_null::store_result");
+
+  /* The null driver has no result set to hand out. */
+  FEDERATEDX_IO_RESULT *none= NULL;
+
+  DBUG_RETURN(none);
+}
+
+
+void federatedx_io_null::free_result(FEDERATEDX_IO_RESULT *)
+{
+}
+
+
+unsigned int federatedx_io_null::get_num_fields(FEDERATEDX_IO_RESULT *)
+{
+  return 0;
+}
+
+
+my_ulonglong federatedx_io_null::get_num_rows(FEDERATEDX_IO_RESULT *)
+{
+  return 0;
+}
+
+
+FEDERATEDX_IO_ROW *federatedx_io_null::fetch_row(FEDERATEDX_IO_RESULT *)
+{
+  return NULL;
+}
+
+
+ulong *federatedx_io_null::fetch_lengths(FEDERATEDX_IO_RESULT *)
+{
+  return NULL;
+}
+
+
+const char *federatedx_io_null::get_column_data(FEDERATEDX_IO_ROW *,
+                                                 unsigned int)
+{
+  return "";
+}
+
+
+bool federatedx_io_null::is_column_null(const FEDERATEDX_IO_ROW *,
+                                         unsigned int) const
+{
+  return true;
+}
+
+bool federatedx_io_null::table_metadata(ha_statistics *stats,
+                                        const char *table_name,
+                                        uint table_name_length, uint flag)
+{
+  stats->records= (ha_rows) 0;
+  stats->mean_rec_length= (ulong) 0;
+  stats->data_file_length= 0;
+  /* Every statistic is zeroed; the table name and flag are not used. */
+  stats->update_time= (time_t) 0;
+  stats->check_time= (time_t) 0;
+
+  return 0;
+}
+
+size_t federatedx_io_null::get_ref_length() const
+{
+  return sizeof(int);
+}
+
+
+void federatedx_io_null::mark_position(FEDERATEDX_IO_RESULT *io_result,
+                                       void *ref)
+{
+}
+
+int federatedx_io_null::seek_position(FEDERATEDX_IO_RESULT **io_result,
+                                      const void *ref)
+{
+  return 0;
+}
diff --git a/storage/federatedx/federatedx_probes.h b/storage/federatedx/federatedx_probes.h
new file mode 100644
index 00000000000..620419512ce
--- /dev/null
+++ b/storage/federatedx/federatedx_probes.h
@@ -0,0 +1,45 @@
+/*
+ * Generated by dtrace(1M).
+ */
+
+#ifndef	_FEDERATED_PROBES_H
+#define	_FEDERATED_PROBES_H
+
+/* Probe macros for the "open" and "close" probes of provider "federated". */
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#if _DTRACE_VERSION
+/* DTrace build: each macro calls the matching __dtrace stub declared below. */
+#define	FEDERATED_CLOSE() \
+	__dtrace_federated___close()
+#define	FEDERATED_CLOSE_ENABLED() \
+	__dtraceenabled_federated___close()
+#define	FEDERATED_OPEN() \
+	__dtrace_federated___open()
+#define	FEDERATED_OPEN_ENABLED() \
+	__dtraceenabled_federated___open()
+
+
+extern void __dtrace_federated___close(void);
+extern int __dtraceenabled_federated___close(void);
+extern void __dtrace_federated___open(void);
+extern int __dtraceenabled_federated___open(void);
+
+#else	/* !_DTRACE_VERSION */
+/* No DTrace: probes expand to nothing, the _ENABLED() checks to (0). */
+#define	FEDERATED_CLOSE()
+#define	FEDERATED_CLOSE_ENABLED() (0)
+#define	FEDERATED_OPEN()
+#define	FEDERATED_OPEN_ENABLED() (0)
+
+#endif	/* _DTRACE_VERSION */
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _FEDERATED_PROBES_H */
diff --git a/storage/federatedx/federatedx_txn.cc b/storage/federatedx/federatedx_txn.cc
new file mode 100644
index 00000000000..a6ca3acc744
--- /dev/null
+++ b/storage/federatedx/federatedx_txn.cc
@@ -0,0 +1,424 @@
+/* 
+Copyright (c) 2007, Antony T Curtis
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Neither the name of FederatedX nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+/*#define MYSQL_SERVER 1*/
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+
+#include "ha_federatedx.h"
+
+#include "m_string.h"
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+
+/* Start with no connections bound and all savepoint counters at zero. */
+federatedx_txn::federatedx_txn()
+  : txn_list(NULL),
+    savepoint_level(0),
+    savepoint_stmt(0),
+    savepoint_next(0)
+{
+  DBUG_ENTER("federatedx_txn::federatedx_txn");
+  DBUG_VOID_RETURN;
+}
+
+/* Debug builds assert that no connection is still linked into txn_list. */
+federatedx_txn::~federatedx_txn()
+{
+  DBUG_ENTER("federatedx_txn::~federatedx_txn");
+  DBUG_ASSERT(txn_list == NULL);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Destroy every IO connection that belongs to the given server.
+
+  Connections of that server which are still linked into this
+  transaction's txn_list are unlinked and pushed onto the server's idle
+  list first; then the whole idle list is deleted.
+
+  Debug builds assert that server->use_count is zero on entry.
+*/
+void federatedx_txn::close(FEDERATEDX_SERVER *server)
+{
+  uint closed= 0;
+  federatedx_io *conn, **link= &txn_list;
+  DBUG_ENTER("federatedx_txn::close");
+  
+  DBUG_ASSERT(!server->use_count);
+  DBUG_PRINT("info",("use count: %u  connections: %u", 
+                     server->use_count, server->io_count));
+
+  /* step 1: move this server's connections from txn_list to idle_list */
+  while ((conn= *link))
+  {
+    if (conn->server != server)
+    {
+      /* belongs to another server: keep it and advance */
+      link= &conn->txn_next;
+      continue;
+    }
+
+    *link= conn->txn_next;
+    conn->txn_next= NULL;
+    conn->busy= FALSE;
+
+    conn->idle_next= server->idle_list;
+    server->idle_list= conn;
+  }
+
+  /* step 2: pop and delete everything on the idle list */
+  for (conn= server->idle_list; conn; conn= server->idle_list)
+  {
+    server->idle_list= conn->idle_next;
+    delete conn;
+    closed++;
+  }
+  
+  DBUG_PRINT("info",("closed %u connections,  txn_list: %s", closed,
+                     txn_list ? "active":  "empty"));
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Obtain an IO connection to the share's server for the caller.
+
+  SYNOPSIS
+    acquire()
+    share       share whose server (share->s) the connection must target
+    readonly    FALSE clears the connection's readonly flag
+    ioptr       in/out: caller's connection slot
+
+  DESCRIPTION
+    If *ioptr is already set it is reused as is. Otherwise a connection
+    for the same server is looked up in this transaction's txn_list; if
+    none is found, one is taken from the server's idle list or newly
+    constructed (under server->mutex) and linked into txn_list.
+
+    A connection that was busy is handed over to the new caller: the
+    previous owner's slot is reset to NULL through owner_ptr.
+
+  RETURN
+    0           success, *ioptr holds the connection
+    -1          no connection could be constructed, *ioptr is left NULL
+*/
+int federatedx_txn::acquire(FEDERATEDX_SHARE *share, bool readonly,
+                            federatedx_io **ioptr)
+{
+  federatedx_io *io;
+  FEDERATEDX_SERVER *server= share->s;
+  DBUG_ENTER("federatedx_txn::acquire");
+  DBUG_ASSERT(ioptr && server);
+
+  if (!(io= *ioptr))
+  {
+    /* check to see if we have an available IO connection */
+    for (io= txn_list; io; io= io->txn_next)
+      if (io->server == server)
+        break;
+
+    if (!io)
+    {
+      /* check to see if there are any unowned IO connections */
+      pthread_mutex_lock(&server->mutex);
+      if ((io= server->idle_list))
+      {
+        server->idle_list= io->idle_next;
+        io->idle_next= NULL;
+      }
+      else
+        io= federatedx_io::construct(&server->mem_root, server);
+
+      if (!io)
+      {
+        /*
+          construct() failed: bail out before the pointer is dereferenced
+          and do not leave the server mutex locked.
+        */
+        pthread_mutex_unlock(&server->mutex);
+        DBUG_RETURN(-1);
+      }
+
+      io->txn_next= txn_list;
+      txn_list= io;
+
+      pthread_mutex_unlock(&server->mutex);
+    }
+
+    /* take the connection over from its previous owner, if any */
+    if (io->busy)
+      *io->owner_ptr= NULL;
+    
+    io->busy= TRUE;
+    io->owner_ptr= ioptr;
+  }
+  
+  DBUG_ASSERT(io->busy && io->server == server);
+  
+  /* once any user asks for write access the connection stays writable */
+  io->readonly&= readonly;
+
+  DBUG_RETURN((*ioptr= io) ? 0 : -1);
+}
+
+
+/*
+  Give up the caller's hold on a connection.
+
+  If *ioptr is set, the connection is marked not busy, the caller's slot
+  is cleared, and an autocommit connection is also marked inactive.
+  release_scan() is run in every case.
+*/
+void federatedx_txn::release(federatedx_io **ioptr)
+{
+  DBUG_ENTER("federatedx_txn::release");
+  DBUG_ASSERT(ioptr);
+
+  federatedx_io *conn= *ioptr;
+
+  if (conn != NULL)
+  {
+    /* the connection may now be reused within this transaction */
+    conn->busy= FALSE;
+    *ioptr= NULL;
+  
+    DBUG_PRINT("info", ("active: %d autocommit: %d", 
+                        conn->active, conn->is_autocommit()));
+
+    if (conn->is_autocommit())
+      conn->active= FALSE;
+  }
+
+  release_scan();
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Walk txn_list and hand every connection that is neither active nor
+  busy back to its server's idle list (under that server's mutex).
+  Returned connections get their readonly flag reset to TRUE.
+*/
+void federatedx_txn::release_scan()
+{
+  uint seen= 0, returned= 0;
+  federatedx_io *conn, **link= &txn_list;
+  DBUG_ENTER("federatedx_txn::release_scan");
+
+  while ((conn= *link))
+  {
+    seen++;
+
+    if (conn->active || conn->busy)
+    {
+      /* still in use: keep it bound to the transaction */
+      link= &conn->txn_next;
+      continue;
+    }
+
+    FEDERATEDX_SERVER *owner= conn->server;
+
+    /* detach from the transaction's list */
+    *link= conn->txn_next;
+    conn->txn_next= NULL;
+
+    /* reset some values */
+    conn->readonly= TRUE;
+
+    /* push onto the server's idle list */
+    pthread_mutex_lock(&owner->mutex);
+    conn->idle_next= owner->idle_list;
+    owner->idle_list= conn;
+    pthread_mutex_unlock(&owner->mutex);
+    returned++;
+  }
+  DBUG_PRINT("info",("returned %u of %u connections(s)", returned, seen));
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Begin a transaction unless one is already open (savepoint_next != 0).
+
+  RETURN
+    TRUE        a transaction was started here and sp_acquire() handed
+                out savepoint level 1
+    FALSE       otherwise
+*/
+bool federatedx_txn::txn_begin()
+{
+  ulong level= 0;
+  DBUG_ENTER("federatedx_txn::txn_begin");
+
+  if (savepoint_next != 0)
+  {
+    DBUG_RETURN(FALSE);
+  }
+
+  savepoint_next= 1;
+  savepoint_stmt= 0;
+  savepoint_level= 0;
+  sp_acquire(&level);
+
+  DBUG_RETURN(level == 1);
+}
+
+
+/*
+  Finish the open transaction, if any.
+
+  Active connections are committed, inactive ones rolled back; every
+  connection is reset afterwards. Then unused connections are returned
+  through release_scan() and the savepoint counters are cleared.
+
+  RETURN
+    0           no transaction open, or every commit succeeded
+    -1          commit() failed on a connection that is still active
+*/
+int federatedx_txn::txn_commit()
+{
+  int error= 0;
+  DBUG_ENTER("federatedx_txn::txn_commit");
+
+  if (!savepoint_next)
+  {
+    DBUG_RETURN(error);
+  }
+
+  DBUG_ASSERT(savepoint_stmt != 1);
+
+  for (federatedx_io *conn= txn_list; conn; conn= conn->txn_next)
+  {
+    int rc= 0;
+
+    if (!conn->active)
+      conn->rollback();
+    else
+      rc= conn->commit();
+
+    if (rc && conn->active)
+      error= -1;
+
+    conn->reset();
+  }
+
+  release_scan();
+
+  savepoint_level= 0;
+  savepoint_stmt= 0;
+  savepoint_next= 0;
+
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Roll back the open transaction, if any.
+
+  Every connection in txn_list is rolled back and reset; then unused
+  connections are returned through release_scan() and the savepoint
+  counters are cleared.
+
+  RETURN
+    0           no transaction open, or every rollback succeeded
+    -1          rollback() failed on a connection that is marked active
+*/
+int federatedx_txn::txn_rollback()
+{
+  int error= 0;
+  federatedx_io *io;
+  /* the trace tag must name this function, not txn_commit */
+  DBUG_ENTER("federatedx_txn::txn_rollback");
+
+  if (savepoint_next)
+  {
+    DBUG_ASSERT(savepoint_stmt != 1);
+
+    for (io= txn_list; io; io= io->txn_next)
+    {
+      int rc= io->rollback();
+
+      if (io->active && rc)
+        error= -1;
+
+      io->reset();
+    }
+
+    release_scan();
+
+    savepoint_next= savepoint_stmt= savepoint_level= 0;
+  }
+    
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Allocate the next savepoint level and set it on every connection
+  that is not readonly.
+
+  The new level is stored in *sp and becomes savepoint_level.
+
+  RETURN
+    TRUE        the savepoint was set on at least one connection
+    FALSE       no non-readonly connection is bound to the transaction
+*/
+bool federatedx_txn::sp_acquire(ulong *sp)
+{
+  bool was_set= FALSE;
+  DBUG_ENTER("federatedx_txn::sp_acquire");
+  DBUG_ASSERT(sp && savepoint_next);
+  
+  savepoint_level= savepoint_next++;
+  *sp= savepoint_level;
+    
+  for (federatedx_io *conn= txn_list; conn; conn= conn->txn_next)
+  {
+    if (!conn->readonly)
+    {
+      conn->savepoint_set(savepoint_level);
+      was_set= TRUE;
+    }
+  }
+
+  DBUG_RETURN(was_set);
+}
+
+
+/*
+  Roll every non-readonly connection back to savepoint *sp.
+
+  savepoint_level is lowered to the smallest level returned by any
+  connection's savepoint_rollback(); it is never raised.
+
+  RETURN
+    0           always
+*/
+int federatedx_txn::sp_rollback(ulong *sp)
+{
+  ulong lowest= savepoint_level;
+  DBUG_ENTER("federatedx_txn::sp_rollback");
+  DBUG_ASSERT(sp && savepoint_next && *sp && *sp <= savepoint_level);
+  
+  for (federatedx_io *conn= txn_list; conn; conn= conn->txn_next)
+  {
+    if (conn->readonly)
+      continue;
+
+    ulong level= conn->savepoint_rollback(*sp);
+    if (level < lowest)
+      lowest= level;
+  } 
+  
+  savepoint_level= lowest;
+  
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Release savepoint *sp on every non-readonly connection.
+
+  savepoint_level is lowered to the smallest level returned by any
+  connection's savepoint_release(), and *sp is reset to 0.
+
+  RETURN
+    0           always
+*/
+int federatedx_txn::sp_release(ulong *sp)
+{
+  ulong lowest= savepoint_level;
+  DBUG_ENTER("federatedx_txn::sp_release");
+  DBUG_ASSERT(sp && savepoint_next && *sp && *sp <= savepoint_level);
+  
+  for (federatedx_io *conn= txn_list; conn; conn= conn->txn_next)
+  {
+    if (conn->readonly)
+      continue;
+
+    ulong level= conn->savepoint_release(*sp);
+    if (level < lowest)
+      lowest= level;
+  }
+
+  savepoint_level= lowest;
+  *sp= 0;
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Open a statement-level savepoint unless one already exists.
+
+  If no transaction is open yet the savepoint counters are initialised
+  first, so the statement savepoint becomes level 1.
+
+  RETURN
+    result of sp_acquire(), or FALSE if a statement savepoint existed
+*/
+bool federatedx_txn::stmt_begin()
+{
+  bool result= FALSE;
+  DBUG_ENTER("federatedx_txn::stmt_begin");
+
+  if (savepoint_stmt)
+  {
+    DBUG_RETURN(result);
+  }
+
+  if (!savepoint_next)
+  {
+    savepoint_next= 1;
+    savepoint_stmt= 0;
+    savepoint_level= 0;
+  }
+  result= sp_acquire(&savepoint_stmt);
+
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Commit the current statement.
+
+  A statement savepoint at level 1 means the statement is the whole
+  transaction, so the transaction is committed; a higher level is
+  merely released. Without a statement savepoint nothing happens.
+
+  RETURN
+    result of txn_commit() / sp_release(), or 0 if nothing was done
+*/
+int federatedx_txn::stmt_commit()
+{ 
+  int status= 0;
+  DBUG_ENTER("federatedx_txn::stmt_commit");
+  
+  if (savepoint_stmt > 1)
+    status= sp_release(&savepoint_stmt);
+  else if (savepoint_stmt == 1)
+  {
+    savepoint_stmt= 0;
+    status= txn_commit();
+  }
+
+  DBUG_RETURN(status);
+}
+
+
+/*
+  Roll back the current statement.
+
+  A statement savepoint at level 1 means the statement is the whole
+  transaction, so the transaction is rolled back; a higher level is
+  rolled back to and then released. Without a statement savepoint
+  nothing happens.
+
+  RETURN
+    result of txn_rollback() / sp_rollback(), or 0 if nothing was done
+*/
+int federatedx_txn::stmt_rollback()
+{
+  int result= 0;
+  /* tag corrected from "federated:txn::..." so trace filters match */
+  DBUG_ENTER("federatedx_txn::stmt_rollback");
+
+  if (savepoint_stmt == 1)
+  {
+    savepoint_stmt= 0;
+    result= txn_rollback();
+  }
+  else
+  if (savepoint_stmt)
+  {
+    result= sp_rollback(&savepoint_stmt);
+    sp_release(&savepoint_stmt);
+  }
+  
+  DBUG_RETURN(result);
+}
+
+
+/*
+  While a statement savepoint exists, call savepoint_restrict() with it
+  on every connection that is not readonly.
+*/
+void federatedx_txn::stmt_autocommit()
+{
+  federatedx_io *conn= txn_list;
+  DBUG_ENTER("federatedx_txn::stmt_autocommit");
+
+  while (savepoint_stmt && conn)
+  {
+    if (!conn->readonly)
+      conn->savepoint_restrict(savepoint_stmt);
+
+    conn= conn->txn_next;
+  }
+
+  DBUG_VOID_RETURN;  
+}
+
+
diff --git a/storage/federatedx/ha_federatedx.cc b/storage/federatedx/ha_federatedx.cc
new file mode 100644
index 00000000000..2749034cba2
--- /dev/null
+++ b/storage/federatedx/ha_federatedx.cc
@@ -0,0 +1,3604 @@
+/*
+Copyright (c) 2008-2009, Patrick Galbraith & Antony Curtis
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Neither the name of Patrick Galbraith nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+/*
+
+  FederatedX Pluggable Storage Engine
+
+  ha_federatedx.cc - FederatedX Pluggable Storage Engine
+  Patrick Galbraith, 2008
+
+  This is a handler which uses a foreign database as the data file, as
+  opposed to a handler like MyISAM, which uses .MYD files locally.
+
+  How this handler works
+  ----------------------------------
+  Normal database files are local and as such: You create a table called
+  'users', a file such as 'users.MYD' is created. A handler reads, inserts,
+  deletes, updates data in this file. The data is stored in particular format,
+  so to read, that data has to be parsed into fields, to write, fields have to
+  be stored in this format to write to this data file.
+
+  With FederatedX storage engine, there will be no local files
+  for each table's data (such as .MYD). A foreign database will store
+  the data that would normally be in this file. This will necessitate
+  the use of MySQL client API to read, delete, update, insert this
+  data. The data will have to be retrieved via an SQL call "SELECT *
+  FROM users". Then, to read this data, it will have to be retrieved
+  via mysql_fetch_row one row at a time, then converted from the
+  column in this select into the format that the handler expects.
+
+  The create table will simply create the .frm file, and within the
+  "CREATE TABLE" SQL, there SHALL be any of the following :
+
+  connection=scheme://username:password@hostname:port/database/tablename
+  connection=scheme://username@hostname/database/tablename
+  connection=scheme://username:password@hostname/database/tablename
+  connection=scheme://username@hostname:port/database/tablename
+
+  - OR -
+
+  As of 5.1 federatedx now allows you to use a non-url
+  format, taking advantage of mysql.servers:
+
+  connection="connection_one"
+  connection="connection_one/table_foo"
+
+  An example would be:
+
+  connection=mysql://username:password@hostname:port/database/tablename
+
+  or, if we had:
+
+  create server 'server_one' foreign data wrapper 'mysql' options
+  (HOST '127.0.0.1',
+  DATABASE 'db1',
+  USER 'root',
+  PASSWORD '',
+  PORT 3306,
+  SOCKET '',
+  OWNER 'root');
+
+  CREATE TABLE federatedx.t1 (
+    `id` int(20) NOT NULL,
+    `name` varchar(64) NOT NULL default ''
+    )
+  ENGINE="FEDERATEDX" DEFAULT CHARSET=latin1
+  CONNECTION='server_one';
+
+  So, this will have been the equivalent of
+
+  CONNECTION="mysql://root@127.0.0.1:3306/db1/t1"
+
+  Then, we can also change the server to point to a new schema:
+
+  ALTER SERVER 'server_one' options(DATABASE 'db2');
+
+  All subsequent calls will now be against db2.t1! Guess what? You don't
+  have to perform an alter table!
+
+  This connection="connection string" is necessary for the handler to be
+  able to connect to the foreign server, either by URL, or by server
+  name. 
+
+
+  The basic flow is this:
+
+  SQL calls issued locally ->
+  mysql handler API (data in handler format) ->
+  mysql client API (data converted to SQL calls) ->
+  foreign database -> mysql client API ->
+  convert result sets (if any) to handler format ->
+  handler API -> results or rows affected to local
+
+  What this handler does and doesn't support
+  ------------------------------------------
+  * Tables MUST be created on the foreign server prior to any action on those
+    tables via the handler, first version. IMPORTANT: IF you MUST use the
+    federatedx storage engine type on the REMOTE end, MAKE SURE [ :) ] That
+    the table you connect to IS NOT a table pointing BACK to your ORIGINAL
+    table! You know and have heard the screeching of audio feedback? You
+    know how, when putting two mirrors in front of each other, the
+    reflection continues for eternity? Well, need I say more?!
+  * There will not be support for transactions.
+  * There is no way for the handler to know if the foreign database or table
+    has changed. The reason for this is that this database has to work like a
+    data file that would never be written to by anything other than the
+    database. The integrity of the data in the local table could be breached
+    if there was any change to the foreign database.
+  * Support for SELECT, INSERT, UPDATE , DELETE, indexes.
+  * No ALTER TABLE, DROP TABLE or any other Data Definition Language calls.
+  * Prepared statements will not be used in the first implementation; it
+    remains to be seen whether the limited subset of the client API for the
+    server supports this.
+  * This uses SELECT, INSERT, UPDATE, DELETE and not HANDLER for its
+    implementation.
+  * This will not work with the query cache.
+
+   Method calls
+
+   A two column table, with one record:
+
+   (SELECT)
+
+   "SELECT * FROM foo"
+    ha_federatedx::info
+    ha_federatedx::scan_time:
+    ha_federatedx::rnd_init: share->select_query SELECT * FROM foo
+    ha_federatedx::extra
+
+    <for every row of data retrieved>
+    ha_federatedx::rnd_next
+    ha_federatedx::convert_row_to_internal_format
+    ha_federatedx::rnd_next
+    </for every row of data retrieved>
+
+    ha_federatedx::rnd_end
+    ha_federatedx::extra
+    ha_federatedx::reset
+
+    (INSERT)
+
+    "INSERT INTO foo (id, ts) VALUES (2, now());"
+
+    ha_federatedx::write_row
+
+    ha_federatedx::reset
+
+    (UPDATE)
+
+    "UPDATE foo SET ts = now() WHERE id = 1;"
+
+    ha_federatedx::index_init
+    ha_federatedx::index_read
+    ha_federatedx::index_read_idx
+    ha_federatedx::rnd_next
+    ha_federatedx::convert_row_to_internal_format
+    ha_federatedx::update_row
+
+    ha_federatedx::extra
+    ha_federatedx::extra
+    ha_federatedx::extra
+    ha_federatedx::external_lock
+    ha_federatedx::reset
+
+
+    How do I use this handler?
+    --------------------------
+
+    <insert text about plugin storage engine>
+
+    Next, to use this handler, it's very simple. You must
+    have two databases running, either both on the same host, or
+    on different hosts.
+
+    On the server that will be connecting to the foreign
+    host (client), you create your table as such:
+
+    CREATE TABLE test_table (
+      id     int(20) NOT NULL auto_increment,
+      name   varchar(32) NOT NULL default '',
+      other  int(20) NOT NULL default '0',
+      PRIMARY KEY  (id),
+      KEY name (name),
+      KEY other_key (other))
+       ENGINE="FEDERATEDX"
+       DEFAULT CHARSET=latin1
+       CONNECTION='mysql://root@127.0.0.1:9306/federatedx/test_federatedx';
+
+   Notice the "ENGINE" and "CONNECTION" fields? This is where you
+   respectively set the engine type, "FEDERATEDX", and the foreign
+   host information, this being the database your 'client' database
+   will connect to and use as the "data file". Obviously, the foreign
+   database is running on port 9306, so you want to start up your other
+   database so that it is indeed on port 9306, and your federatedx
+   database on a port other than that. In my setup, I use port 5554
+   for federatedx, and port 5555 for the foreign database.
+
+   Then, on the foreign database:
+
+   CREATE TABLE test_table (
+     id     int(20) NOT NULL auto_increment,
+     name   varchar(32) NOT NULL default '',
+     other  int(20) NOT NULL default '0',
+     PRIMARY KEY  (id),
+     KEY name (name),
+     KEY other_key (other))
+     ENGINE="<NAME>" <-- whatever you want, or not specify
+     DEFAULT CHARSET=latin1 ;
+
+    This table is exactly the same (and must be exactly the same),
+    except that it is not using the federatedx handler and does
+    not need the URL.
+
+
+    How to see the handler in action
+    --------------------------------
+
+    When developing this handler, I compiled the federatedx database with
+    debugging:
+
+    ./configure --with-federatedx-storage-engine
+    --prefix=/home/mysql/mysql-build/federatedx/ --with-debug
+
+    Once compiled, I did a 'make install' (not for the purpose of installing
+    the binary, but to install all the files the binary expects to see in the
+    directory I specified in the build with --prefix,
+    "/home/mysql/mysql-build/federatedx").
+
+    Then, I started the foreign server:
+
+    /usr/local/mysql/bin/mysqld_safe
+    --user=mysql --log=/tmp/mysqld.5555.log -P 5555
+
+    Then, I went back to the directory containing the newly compiled mysqld,
+    <builddir>/sql/, started up gdb:
+
+    gdb ./mysqld
+
+    Then, within the (gdb) prompt:
+    (gdb) run --gdb --port=5554 --socket=/tmp/mysqld.5554 --skip-innodb --debug
+
+    Next, I open several windows for each:
+
+    1. Tail the debug trace: tail -f /tmp/mysqld.trace|grep ha_fed
+    2. Tail the SQL calls to the foreign database: tail -f /tmp/mysqld.5555.log
+    3. A window with a client open to the federatedx server on port 5554
+    4. A window with a client open to the foreign server on port 5555
+
+    I would create a table on the client to the foreign server on port
+    5555, and then to the federatedx server on port 5554. At this point,
+    I would run whatever queries I wanted to on the federatedx server,
+    just always remembering that whatever changes I wanted to make on
+    the table, or if I created new tables, that I would have to do that
+    on the foreign server.
+
+    Another thing to look for is 'show variables' to show you that you have
+    support for federatedx handler support:
+
+    show variables like '%federat%'
+
+    and:
+
+    show storage engines;
+
+    Both should display the federatedx storage handler.
+
+
+    Testing
+    -------
+
+    Testing for FederatedX as a pluggable storage engine is, for
+    now, a manual process; I intend to build a test
+    suite that works for all pluggable storage engines.
+
+    How to test
+
+    1. cp fed.dat /tmp
+    (make sure you have access to "test". Use a user that has
+    super privileges for now)
+    2. mysql -f -u root test < federated.test > federated.myresult 2>&1
+    3. diff federated.result federated.myresult (there _should_ be no differences)
+
+
+*/
+
+
+#define MYSQL_SERVER 1
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+#include "ha_federatedx.h"
+
+#include "m_string.h"
+
+#include <mysql/plugin.h>
+
+/* Variables for federatedx share methods */
+static HASH federatedx_open_tables;              // To track open tables
+static HASH federatedx_open_servers;             // To track open servers
+pthread_mutex_t federatedx_mutex;                // To init the hash
+const char ident_quote_char= '`';               // Character for quoting
+                                                // identifiers
+const char value_quote_char= '\'';              // Character for quoting
+                                                // literals
+static const int bulk_padding= 64;              // bytes "overhead" in packet
+/* Each value below is sizeof(literal) - 1: the literal without its NUL. */
+/* Variables used when chopping off trailing characters */
+static const uint sizeof_trailing_comma= sizeof(", ") - 1;        // 2
+static const uint sizeof_trailing_closeparen= sizeof(") ") - 1;   // 2
+static const uint sizeof_trailing_and= sizeof(" AND ") - 1;       // 5
+static const uint sizeof_trailing_where= sizeof(" WHERE ") - 1;   // 7
+
+/*
+  FederatedX storage engine handlerton "create" hook.
+
+  Allocates a new ha_federatedx handler for the given table share on
+  the supplied MEM_ROOT.
+*/
+
+static handler *federatedx_create_handler(handlerton *hton,
+                                          TABLE_SHARE *table,
+                                          MEM_ROOT *mem_root)
+{
+  handler *new_handler= new (mem_root) ha_federatedx(hton, table);
+  return new_handler;
+}
+
+
+/*
+  hash_get_key callback for federatedx_open_tables: a share is keyed by
+  share_key / share_key_length.
+*/
+
+static uchar *
+federatedx_share_get_key(FEDERATEDX_SHARE *share, size_t *length,
+                         my_bool not_used __attribute__ ((unused)))
+{
+  uchar *key= (uchar*) share->share_key;
+
+  *length= share->share_key_length;
+  return key;
+}
+
+
+/*
+  hash_get_key callback for federatedx_open_servers: a server is keyed
+  by key / key_length.
+*/
+static uchar *
+federatedx_server_get_key(FEDERATEDX_SERVER *server, size_t *length,
+                          my_bool not_used __attribute__ ((unused)))
+{
+  uchar *key= server->key;
+
+  *length= server->key_length;
+  return key;
+}
+
+
+/*
+  Initialize the federatedx handler.
+
+  SYNOPSIS
+    federatedx_db_init()
+    p		Handlerton
+
+  DESCRIPTION
+    Fills in the handlerton callbacks and flags, then creates the global
+    mutex and the two hashes tracking open tables and open servers.
+    On failure everything created so far is torn down again, so a
+    failed init leaves no hash or mutex behind.
+
+  RETURN
+    FALSE       OK
+    TRUE        Error
+*/
+
+int federatedx_db_init(void *p)
+{
+  DBUG_ENTER("federatedx_db_init");
+  handlerton *federatedx_hton= (handlerton *)p;
+  federatedx_hton->state= SHOW_OPTION_YES;
+  /* Needed to work with old .frm files */
+  federatedx_hton->db_type= DB_TYPE_FEDERATED_DB;
+  federatedx_hton->savepoint_offset= sizeof(ulong);
+  federatedx_hton->close_connection= ha_federatedx::disconnect;
+  federatedx_hton->savepoint_set= ha_federatedx::savepoint_set;
+  federatedx_hton->savepoint_rollback= ha_federatedx::savepoint_rollback;
+  federatedx_hton->savepoint_release= ha_federatedx::savepoint_release;
+  federatedx_hton->commit= ha_federatedx::commit;
+  federatedx_hton->rollback= ha_federatedx::rollback;
+  federatedx_hton->create= federatedx_create_handler;
+  federatedx_hton->flags= HTON_ALTER_NOT_SUPPORTED | HTON_NO_PARTITION;
+
+  if (pthread_mutex_init(&federatedx_mutex, MY_MUTEX_INIT_FAST))
+    goto error;
+  if (hash_init(&federatedx_open_tables, &my_charset_bin, 32, 0, 0,
+                (hash_get_key) federatedx_share_get_key, 0, 0))
+    goto error_mutex;
+  if (hash_init(&federatedx_open_servers, &my_charset_bin, 32, 0, 0,
+                (hash_get_key) federatedx_server_get_key, 0, 0))
+    goto error_tables;
+
+  DBUG_RETURN(FALSE);
+
+error_tables:
+  /* the table hash was created successfully: release it again */
+  hash_free(&federatedx_open_tables);
+error_mutex:
+  VOID(pthread_mutex_destroy(&federatedx_mutex));
+error:
+  DBUG_RETURN(TRUE);
+}
+
+
+/*
+  Release the federatedx handler.
+
+  SYNOPSIS
+    federatedx_done()
+    p		not used
+
+  DESCRIPTION
+    Frees the open-tables and open-servers hashes and destroys the
+    global mutex.
+
+  RETURN
+    0           always
+*/
+
+int federatedx_done(void *p __attribute__ ((unused)))
+{
+  const int status= 0;
+
+  hash_free(&federatedx_open_tables);
+  hash_free(&federatedx_open_servers);
+  VOID(pthread_mutex_destroy(&federatedx_mutex));
+
+  return status;
+}
+
+/**
+  @brief Append identifiers to the string.
+
+  @param[in,out] string	The target string.
+  @param[in] name 		Identifier name
+  @param[in] length 	Length of identifier name in bytes
+  @param[in] quote_char Quote char to use for quoting identifier.
+                        If 0, the name is appended unquoted.
+
+  @return Operation Status
+  @retval FALSE OK
+  @retval TRUE  There was an error appending to the string.
+
+  @note This function is based upon the append_identifier() function
+        in sql_show.cc except that quoting always occurs.
+
+  @note The name is copied one character at a time so that a quote
+        character inside the name can be doubled. A character whose
+        lead byte announces more bytes than remain in the name is cut
+        at the end of the name, so nothing past name + length is read.
+*/
+
+bool append_ident(String *string, const char *name, uint length,
+                  const char quote_char)
+{
+  bool result;
+  uint clen;
+  const char *name_end;
+  DBUG_ENTER("append_ident");
+
+  if (quote_char)
+  {
+    /* worst case: every byte is a quote that gets doubled, plus 2 quotes */
+    string->reserve(length * 2 + 2);
+    if ((result= string->append(&quote_char, 1, system_charset_info)))
+      goto err;
+
+    for (name_end= name+length; name < name_end; name+= clen)
+    {
+      uchar c= *(uchar *) name;
+      if (!(clen= my_mbcharlen(system_charset_info, c)))
+        clen= 1;
+      /* a truncated multi-byte character must not run past the name */
+      if (clen > (uint) (name_end - name))
+        clen= (uint) (name_end - name);
+      /* escape an embedded quote character by doubling it */
+      if (clen == 1 && c == (uchar) quote_char &&
+          (result= string->append(&quote_char, 1, system_charset_info)))
+        goto err;
+      if ((result= string->append(name, clen, string->charset())))
+        goto err;
+    }
+    result= string->append(&quote_char, 1, system_charset_info);
+  }
+  else
+    result= string->append(name, length, system_charset_info);
+
+err:
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Report a connection-string parse error.
+
+  Copies the table's connect string, cut to the size of a local buffer,
+  and raises error_num with that copy as the message argument.
+  The share argument is not used.
+
+  RETURN
+    error_num, unchanged
+*/
+static int parse_url_error(FEDERATEDX_SHARE *share, TABLE *table, int error_num)
+{
+  char connect_copy[FEDERATEDX_QUERY_BUFFER_SIZE];
+  int copy_len;
+  DBUG_ENTER("ha_federatedx parse_url_error");
+
+  /* leave room for the terminating NUL written by strmake() */
+  copy_len= min(table->s->connect_string.length,
+                FEDERATEDX_QUERY_BUFFER_SIZE-1);
+  strmake(connect_copy, table->s->connect_string.str, copy_len);
+
+  my_error(error_num, MYF(0), connect_copy);
+
+  DBUG_RETURN(error_num);
+}
+
+/*
+  Retrieve the server object, which contains the server meta-data from
+  the system table, by the server name held in share->connection_string,
+  and set the share's connection parameter members from it.
+
+  The strings assigned to the share are the copies that
+  get_server_by_name() allocates in mem_root.
+
+  RETURN
+    0           success
+    error_num   the named server does not exist; the error has been
+                reported to the client
+*/
+int get_connection(MEM_ROOT *mem_root, FEDERATEDX_SHARE *share)
+{
+  int error_num= ER_FOREIGN_SERVER_DOESNT_EXIST;
+  FOREIGN_SERVER *server, server_buffer;
+  DBUG_ENTER("ha_federatedx::get_connection");
+
+  /*
+    get_server_by_name() clones the server if exists and allocates
+	copies of strings in the supplied mem_root
+  */
+  if (!(server=
+       get_server_by_name(mem_root, share->connection_string, &server_buffer)))
+  {
+    DBUG_PRINT("info", ("get_server_by_name returned > 0 error condition!"));
+    /* need to come up with error handling */
+    error_num=1;
+    goto error;
+  }
+  DBUG_PRINT("info", ("get_server_by_name returned server at %lx",
+                      (long unsigned int) server));
+
+  /*
+    Most of these should never be empty strings, error handling will
+    need to be implemented. Also, is this the best way to set the share
+    members? Is there some allocation needed? In running this code, it works
+    except there are errors in the trace file of the share being overrun 
+    at the address of the share.
+  */
+  share->server_name_length= server->server_name_length;
+  share->server_name= server->server_name;
+  share->username= server->username;
+  share->password= server->password;
+  share->database= server->db;
+#ifndef I_AM_PARANOID
+  share->port= server->port > 0 && server->port < 65536 ? 
+#else
+  share->port= server->port > 1023 && server->port < 65536 ? 
+#endif
+               (ushort) server->port : MYSQL_PORT;
+  share->hostname= server->host;
+  /* no socket configured and the host is local: use the default socket */
+  if (!(share->socket= server->socket) &&
+      !strcmp(share->hostname, my_localhost))
+    share->socket= (char *) MYSQL_UNIX_ADDR;
+  share->scheme= server->scheme;
+
+  DBUG_PRINT("info", ("share->username: %s", share->username));
+  DBUG_PRINT("info", ("share->password: %s", share->password));
+  DBUG_PRINT("info", ("share->hostname: %s", share->hostname));
+  DBUG_PRINT("info", ("share->database: %s", share->database));
+  DBUG_PRINT("info", ("share->port:     %d", share->port));
+  DBUG_PRINT("info", ("share->socket:   %s", share->socket));
+  DBUG_RETURN(0);
+
+error:
+  /*
+    Let my_printf_error() format the message: the connection string is
+    user supplied and of arbitrary length, so it must not be sprintf'ed
+    into a fixed-size stack buffer.
+  */
+  my_printf_error(error_num, "server name: '%s' doesn't exist!",
+                  MYF(0), share->connection_string);
+  DBUG_RETURN(error_num);
+}
+
+/*
+  Parse connection info from table->s->connect_string
+
+  SYNOPSIS
+    parse_url()
+    mem_root            MEM_ROOT pointer for memory allocation
+    share               pointer to FEDERATEDX share
+    table               pointer to current TABLE class
+    table_create_flag   determines what error to throw
+
+  DESCRIPTION
+    Populates the share with information about the connection
+    to the foreign database that will serve as the data source.
+    This string must be specified (currently) in the "CONNECTION" field,
+    listed in the CREATE TABLE statement.
+
+    This string MUST be in the format of any of these:
+
+    CONNECTION="scheme://username:password@hostname:port/database/table"
+    CONNECTION="scheme://username@hostname/database/table"
+    CONNECTION="scheme://username@hostname:port/database/table"
+    CONNECTION="scheme://username:password@hostname/database/table"
+
+    _OR_
+
+    CONNECTION="connection name"
+    CONNECTION="connection name/table name"
+
+    The connect string is copied into mem_root and split in place: the
+    separators are overwritten with '\0' and the share members point into
+    that copy. A string containing neither "://" nor '@' is treated as a
+    connection name and resolved through get_connection(); in that case
+    share->parsed is FALSE, otherwise it is TRUE.
+
+    An empty password or an empty hostname is stored as NULL. When no port
+    was given, the socket MYSQL_UNIX_ADDR is used if the hostname is NULL or
+    equal to my_localhost; otherwise the port defaults to MYSQL_PORT.
+
+  An Example:
+
+  CREATE TABLE t1 (id int(32))
+    ENGINE="FEDERATEDX"
+    CONNECTION="mysql://joe:joespass@192.168.1.111:9308/federatedx/testtable";
+
+  CREATE TABLE t2 (
+    id int(4) NOT NULL auto_increment,
+    name varchar(32) NOT NULL,
+    PRIMARY KEY(id)
+    ) ENGINE="FEDERATEDX" CONNECTION="my_conn";
+
+  ***IMPORTANT***
+  Currently, the FederatedX Storage Engine only supports connecting to another
+  Database ("scheme" of "mysql"). Connections using JDBC as well as
+  other connectors are in the planning stage.
+
+  'password' and 'port' are both optional.
+
+  RETURN VALUE
+    0                   success
+    HA_ERR_OUT_OF_MEM   a copy of the connect string or of the table name
+                        could not be allocated
+    error_num           particular error code, as returned by
+                        parse_url_error()
+
+*/
+
+static int parse_url(MEM_ROOT *mem_root, FEDERATEDX_SHARE *share, TABLE *table,
+                     uint table_create_flag)
+{
+  uint error_num= (table_create_flag ?
+                   ER_FOREIGN_DATA_STRING_INVALID_CANT_CREATE :
+                   ER_FOREIGN_DATA_STRING_INVALID);
+  DBUG_ENTER("ha_federatedx::parse_url");
+
+  share->port= 0;
+  share->socket= 0;
+  DBUG_PRINT("info", ("share at %lx", (long unsigned int) share));
+  DBUG_PRINT("info", ("Length: %u", (uint) table->s->connect_string.length));
+  DBUG_PRINT("info", ("String: '%.*s'", (int) table->s->connect_string.length,
+                      table->s->connect_string.str));
+  share->connection_string= strmake_root(mem_root, table->s->connect_string.str,
+                                       table->s->connect_string.length);
+  /*
+    Everything below dereferences the copy, so an allocation failure has
+    to be reported here instead of being passed on to strstr().
+  */
+  if (!share->connection_string)
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+  DBUG_PRINT("info",("parse_url alloced share->connection_string %lx",
+                     (long unsigned int) share->connection_string));
+
+  DBUG_PRINT("info",("share->connection_string: %s",share->connection_string));
+  /*
+    No :// or @ in connection string. Must be a straight connection name of
+    either "servername" or "servername/tablename"
+  */
+  if ((!strstr(share->connection_string, "://") &&
+       (!strchr(share->connection_string, '@'))))
+  {
+
+    DBUG_PRINT("info",
+               ("share->connection_string: %s  internal format "
+                "share->connection_string: %lx",
+                share->connection_string,
+                (ulong) share->connection_string));
+
+    /* ok, so we do a little parsing, but not completely! */
+    share->parsed= FALSE;
+    /*
+      If there is a single '/' in the connection string, this means the user is
+      specifying a table name
+    */
+
+    if ((share->table_name= strchr(share->connection_string, '/')))
+    {
+      /* Cut the string at the '/' so that only the server name remains */
+      *share->table_name++= '\0';
+      share->table_name_length= strlen(share->table_name);
+
+      DBUG_PRINT("info",
+                 ("internal format, parsed table_name "
+                  "share->connection_string: %s  share->table_name: %s",
+                  share->connection_string, share->table_name));
+
+      /*
+        there better not be any more '/'s !
+      */
+      if (strchr(share->table_name, '/'))
+        goto error;
+    }
+    /*
+      Otherwise, straight server name, use tablename of federatedx table
+      as remote table name
+    */
+    else
+    {
+      /*
+        Connection specifies everything but, resort to
+        expecting remote and foreign table names to match
+      */
+      share->table_name= strmake_root(mem_root, table->s->table_name.str,
+                                      (share->table_name_length=
+                                       table->s->table_name.length));
+      if (!share->table_name)
+        DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+      DBUG_PRINT("info",
+                 ("internal format, default table_name "
+                  "share->connection_string: %s  share->table_name: %s",
+                  share->connection_string, share->table_name));
+    }
+
+    /* Resolve the connection name; its error code replaces the default one */
+    if ((error_num= get_connection(mem_root, share)))
+      goto error;
+  }
+  else
+  {
+    share->parsed= TRUE;
+    // Add a null for later termination of table name
+    share->connection_string[table->s->connect_string.length]= 0;
+    share->scheme= share->connection_string;
+    DBUG_PRINT("info",("parse_url alloced share->scheme: %lx",
+                       (ulong) share->scheme));
+
+    /*
+      Remove addition of null terminator and store length
+      for each string  in share
+    */
+    if (!(share->username= strstr(share->scheme, "://")))
+      goto error;
+    /* Terminate the scheme at the ':' of "://" */
+    share->scheme[share->username - share->scheme]= '\0';
+
+    if (!federatedx_io::handles_scheme(share->scheme))
+      goto error;
+
+    /* Step over "://" */
+    share->username+= 3;
+
+    if (!(share->hostname= strchr(share->username, '@')))
+      goto error;
+    *share->hostname++= '\0';                   // End username
+
+    if ((share->password= strchr(share->username, ':')))
+    {
+      *share->password++= '\0';                 // End username
+
+      /* make sure there isn't an extra / or @ */
+      if ((strchr(share->password, '/') || strchr(share->hostname, '@')))
+        goto error;
+      /*
+        Found that if the string is:
+        user:@hostname:port/db/table
+        Then password is a null string, so set to NULL
+      */
+      if ((share->password[0] == '\0'))
+        share->password= NULL;
+    }
+
+    /* make sure there isn't an extra / or @ */
+    if ((strchr(share->username, '/')) || (strchr(share->hostname, '@')))
+      goto error;
+
+    if (!(share->database= strchr(share->hostname, '/')))
+      goto error;
+    *share->database++= '\0';
+
+    if ((share->sport= strchr(share->hostname, ':')))
+    {
+      *share->sport++= '\0';
+      /* "host:/db/table": an empty port is treated as not given */
+      if (share->sport[0] == '\0')
+        share->sport= NULL;
+      else
+        share->port= atoi(share->sport);
+    }
+
+    if (!(share->table_name= strchr(share->database, '/')))
+      goto error;
+    *share->table_name++= '\0';
+
+    share->table_name_length= strlen(share->table_name);
+
+    /* make sure there's not an extra / */
+    if ((strchr(share->table_name, '/')))
+      goto error;
+
+    if (share->hostname[0] == '\0')
+      share->hostname= NULL;
+
+  }
+  /* No port given: choose between the unix socket and the default port */
+  if (!share->port)
+  {
+    if (!share->hostname || strcmp(share->hostname, my_localhost) == 0)
+      share->socket= (char *) MYSQL_UNIX_ADDR;
+    else
+      share->port= MYSQL_PORT;
+  }
+
+  DBUG_PRINT("info",
+             ("scheme: %s  username: %s  password: %s  hostname: %s  "
+              "port: %d  db: %s  tablename: %s",
+              share->scheme, share->username, share->password,
+              share->hostname, share->port, share->database,
+              share->table_name));
+
+  DBUG_RETURN(0);
+
+error:
+  DBUG_RETURN(parse_url_error(share, table, error_num));
+}
+
+/*****************************************************************************
+** FEDERATEDX tables
+*****************************************************************************/
+
+/*
+  Construct a handler with no transaction, no I/O object and no stored
+  result attached, and with the bulk-insert state zero-filled.
+*/
+ha_federatedx::ha_federatedx(handlerton *hton, TABLE_SHARE *table_arg)
+  : handler(hton, table_arg),
+    txn(0),
+    io(0),
+    stored_result(0)
+{
+  bzero(&bulk_insert, sizeof bulk_insert);
+}
+
+
+/*
+  Convert a remote result set row to the handler's internal record format
+
+  SYNOPSIS
+    convert_row_to_internal_format()
+      record    Byte pointer to the record buffer to fill
+      row       Result set row to read the column values from
+      result    Result set the row belongs to (used for column lengths)
+
+  DESCRIPTION
+    Walks the table's NULL-terminated array of field pointers in step with
+    the columns of the row. Each field is temporarily moved so that it
+    addresses 'record' instead of table->record[0], gets its value, and is
+    moved back. A NULL column always sets the field to NULL; a non-NULL
+    column is stored only when the field is part of table->read_set.
+
+  RETURN VALUE
+    0   Always
+*/
+
+uint ha_federatedx::convert_row_to_internal_format(uchar *record,
+                                                  FEDERATEDX_IO_ROW *row,
+                                                  FEDERATEDX_IO_RESULT *result)
+{
+  ulong *col_lengths;
+  Field **field_ptr;
+  int col_no= 0;
+  my_ptrdiff_t record_offset;
+  my_bitmap_map *saved_map= dbug_tmp_use_all_columns(table, table->write_set);
+  DBUG_ENTER("ha_federatedx::convert_row_to_internal_format");
+
+  col_lengths= io->fetch_lengths(result);
+
+  /* Distance between the caller's buffer and the table's default record */
+  record_offset= (my_ptrdiff_t) (record - table->record[0]);
+
+  for (field_ptr= table->field; *field_ptr; field_ptr++, col_no++)
+  {
+    Field *cur= *field_ptr;
+
+    cur->move_field_offset(record_offset);
+    if (io->is_column_null(row, col_no))
+      cur->set_null();
+    else if (bitmap_is_set(table->read_set, cur->field_index))
+    {
+      cur->set_notnull();
+      cur->store(io->get_column_data(row, col_no), col_lengths[col_no],
+                 &my_charset_bin);
+    }
+    cur->move_field_offset(-record_offset);
+  }
+
+  dbug_tmp_restore_column_map(table->write_set, saved_map);
+  DBUG_RETURN(0);
+}
+
+/*
+  Append the quoted name of the key part's field to 'to'.
+
+  RETURN VALUE
+    0   name appended
+    1   append_ident() failed (out of memory)
+*/
+static bool emit_key_part_name(String *to, KEY_PART_INFO *part)
+{
+  const char *name= part->field->field_name;
+  DBUG_ENTER("emit_key_part_name");
+  DBUG_RETURN(append_ident(to, name, strlen(name), ident_quote_char) ? 1 : 0);
+}
+
+/*
+  Append the value of one key part to 'to' as SQL text.
+
+  SYNOPSIS
+    emit_key_part_element()
+      to            String to append to
+      part          key part the value belongs to
+      needs_quotes  wrap the value in single quotes
+      is_like       append a trailing '%' (inside the quotes, if any)
+      ptr           key image of this part
+      len           number of key bytes, used for the BIT type only
+
+  DESCRIPTION
+    BIT keys are written as a 0x... hex literal. BLOB and variable-length
+    parts carry a two-byte length prefix followed by the data, which is
+    appended escaped. Any other part is converted through Field::val_str()
+    and escaped only when the field's result type is STRING_RESULT.
+
+  RETURN VALUE
+    0   value appended
+    1   one of the appends failed
+*/
+static bool emit_key_part_element(String *to, KEY_PART_INFO *part,
+                                  bool needs_quotes, bool is_like,
+                                  const uchar *ptr, uint len)
+{
+  Field *field= part->field;
+  DBUG_ENTER("emit_key_part_element");
+
+  if (needs_quotes && to->append(STRING_WITH_LEN("'")))
+    DBUG_RETURN(1);
+
+  if (part->type == HA_KEYTYPE_BIT)
+  {
+    char hex[STRING_BUFFER_USUAL_SIZE];
+    char *hex_end;
+
+    hex[0]= '0';
+    hex[1]= 'x';
+    hex_end= octet2hex(hex + 2, (char*) ptr, len);
+    if (to->append((char*) hex, (uint)(hex_end - hex)))
+      DBUG_RETURN(1);
+  }
+  else if (part->key_part_flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+  {
+    /* Both layouts are: 2-byte length, then the data */
+    String packed;
+    uint data_length= uint2korr(ptr);
+    packed.set_quick((char*) ptr+HA_KEY_BLOB_LENGTH,
+                     data_length, &my_charset_bin);
+    if (append_escaped(to, &packed))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    char strbuff[MAX_FIELD_WIDTH];
+    String str(strbuff, sizeof(strbuff), field->charset());
+    String *res= field->val_str(&str, ptr);
+    bool failed;
+
+    if (field->result_type() == STRING_RESULT)
+      failed= append_escaped(to, res);
+    else
+      failed= to->append(res->ptr(), res->length());
+
+    if (failed)
+      DBUG_RETURN(1);
+  }
+
+  if (is_like && to->append(STRING_WITH_LEN("%")))
+    DBUG_RETURN(1);
+
+  if (needs_quotes && to->append(STRING_WITH_LEN("'")))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(0);
+}
+
+/*
+  Create a WHERE clause based off of values in keys
+  Note: This code was inspired by key_copy from key.cc
+
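+  SYNOPSIS
+    create_where_from_key ()
+      to                     String object to store WHERE clause
+      key_info               KEY struct pointer
+      start_key              start of the range, or NULL if there is none
+      end_key                end of the range, or NULL if there is none
+      from_records_in_range  if set, a HA_READ_KEY_EXACT key part is emitted
+                             with ">=" instead of "="
+      eq_range               if set, a HA_READ_AFTER_KEY key part is emitted
+                             as the dummy condition "1=1"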
+
+  DESCRIPTION
+    Using iteration through all the keys via a KEY_PART_INFO pointer,
+    This method 'extracts' the value of each key in the byte pointer
+    *key, and for each key found, constructs an appropriate WHERE clause
+
+  RETURN VALUE
+    0   After all keys have been accounted for to create the WHERE clause
+    1   No keys found (both start_key and end_key are NULL), or the WHERE
+        clause could not be built
+
+    Range flags Table per Timour:
+
+   -----------------
+   - start_key:
+     * ">"  -> HA_READ_AFTER_KEY
+     * ">=" -> HA_READ_KEY_OR_NEXT
+     * "="  -> HA_READ_KEY_EXACT
+
+   - end_key:
+     * "<"  -> HA_READ_BEFORE_KEY
+     * "<=" -> HA_READ_AFTER_KEY
+
+   records_in_range:
+   -----------------
+   - start_key:
+     * ">"  -> HA_READ_AFTER_KEY
+     * ">=" -> HA_READ_KEY_EXACT
+     * "="  -> HA_READ_KEY_EXACT
+
+   - end_key:
+     * "<"  -> HA_READ_BEFORE_KEY
+     * "<=" -> HA_READ_AFTER_KEY
+     * "="  -> HA_READ_AFTER_KEY
+
+0 HA_READ_KEY_EXACT,              Find first record else error
+1 HA_READ_KEY_OR_NEXT,            Record or next record
+2 HA_READ_KEY_OR_PREV,            Record or previous
+3 HA_READ_AFTER_KEY,              Find next rec. after key-record
+4 HA_READ_BEFORE_KEY,             Find next rec. before key-record
+5 HA_READ_PREFIX,                 Key which as same prefix
+6 HA_READ_PREFIX_LAST,            Last key with the same prefix
+7 HA_READ_PREFIX_LAST_OR_PREV,    Last or prev key with the same prefix
+
+Flags that I've found:
+
+id, primary key, varchar
+
+id = 'ccccc'
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 0 end_key NULL
+
+id > 'ccccc'
+records_in_range: start_key 3 end_key NULL
+read_range_first: start_key 3 end_key NULL
+
+id < 'ccccc'
+records_in_range: start_key NULL end_key 4
+read_range_first: start_key NULL end_key 4
+
+id <= 'ccccc'
+records_in_range: start_key NULL end_key 3
+read_range_first: start_key NULL end_key 3
+
+id >= 'ccccc'
+records_in_range: start_key 0 end_key NULL
+read_range_first: start_key 1 end_key NULL
+
+id like 'cc%cc'
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 1 end_key 3
+
+id > 'aaaaa' and id < 'ccccc'
+records_in_range: start_key 3 end_key 4
+read_range_first: start_key 3 end_key 4
+
+id >= 'aaaaa' and id < 'ccccc';
+records_in_range: start_key 0 end_key 4
+read_range_first: start_key 1 end_key 4
+
+id >= 'aaaaa' and id <= 'ccccc';
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 1 end_key 3
+
+id > 'aaaaa' and id <= 'ccccc';
+records_in_range: start_key 3 end_key 3
+read_range_first: start_key 3 end_key 3
+
+numeric keys:
+
+id = 4
+index_read_idx: start_key 0 end_key NULL 
+
+id > 4
+records_in_range: start_key 3 end_key NULL
+read_range_first: start_key 3 end_key NULL
+
+id >= 4
+records_in_range: start_key 0 end_key NULL
+read_range_first: start_key 1 end_key NULL
+
+id < 4
+records_in_range: start_key NULL end_key 4
+read_range_first: start_key NULL end_key 4
+
+id <= 4
+records_in_range: start_key NULL end_key 3
+read_range_first: start_key NULL end_key 3
+
+id like 4
+full table scan, select * from
+
+id > 2 and id < 8
+records_in_range: start_key 3 end_key 4
+read_range_first: start_key 3 end_key 4
+
+id >= 2 and id < 8
+records_in_range: start_key 0 end_key 4
+read_range_first: start_key 1 end_key 4
+
+id >= 2 and id <= 8
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 1 end_key 3
+
+id > 2 and id <= 8
+records_in_range: start_key 3 end_key 3
+read_range_first: start_key 3 end_key 3
+
+multi keys (id int, name varchar, other varchar)
+
+id = 1;
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 0 end_key NULL
+
+id > 4;
+id > 2 and name = '333'; remote: id > 2
+id > 2 and name > '333'; remote: id > 2
+id > 2 and name > '333' and other < 'ddd'; remote: id > 2 no results
+id > 2 and name >= '333' and other < 'ddd'; remote: id > 2 1 result
+id >= 4 and name = 'eric was here' and other > 'eeee';
+records_in_range: start_key 3 end_key NULL
+read_range_first: start_key 3 end_key NULL
+
+id >= 4;
+id >= 2 and name = '333' and other < 'ddd';
+remote: `id`  >= 2 AND `name`  >= '333';
+records_in_range: start_key 0 end_key NULL
+read_range_first: start_key 1 end_key NULL
+
+id < 4;
+id < 3 and name = '222' and other <= 'ccc'; remote: id < 3
+records_in_range: start_key NULL end_key 4
+read_range_first: start_key NULL end_key 4
+
+id <= 4;
+records_in_range: start_key NULL end_key 3
+read_range_first: start_key NULL end_key 3
+
+id like 4;
+full table scan
+
+id  > 2 and id < 4;
+records_in_range: start_key 3 end_key 4
+read_range_first: start_key 3 end_key 4
+
+id >= 2 and id < 4;
+records_in_range: start_key 0 end_key 4
+read_range_first: start_key 1 end_key 4
+
+id >= 2 and id <= 4;
+records_in_range: start_key 0 end_key 3
+read_range_first: start_key 1 end_key 3
+
+id > 2 and id <= 4;
+id = 6 and name = 'eric was here' and other > 'eeee';
+remote: (`id`  > 6 AND `name`  > 'eric was here' AND `other`  > 'eeee')
+AND (`id`  <= 6) AND ( AND `name`  <= 'eric was here')
+no results
+records_in_range: start_key 3 end_key 3
+read_range_first: start_key 3 end_key 3
+
+Summary:
+
+* If the start key flag is 0 the max key flag shouldn't even be set, 
+  and if it is, the query produced would be invalid.
+* Multipart keys, even if containing some or all numeric columns,
+  are treated the same as non-numeric keys
+
+  If the query is " = " (quotes or not):
+  - records in range start key flag HA_READ_KEY_EXACT,
+    end key flag HA_READ_AFTER_KEY (incorrect)
+  - any other: start key flag HA_READ_KEY_OR_NEXT,
+    end key flag HA_READ_AFTER_KEY (correct)
+
+* 'like' queries (of key)
+  - Numeric, full table scan
+  - Non-numeric
+      records_in_range: start_key 0 end_key 3
+      other : start_key 1 end_key 3
+
+* If the key flag is HA_READ_AFTER_KEY:
+   if start_key, append >
+   if end_key, append <=
+
+* If create_where_key was called by records_in_range:
+
+ - if the key is numeric:
+    start key flag is 0 when end key is NULL, end key flag is 3 or 4
+ - if create_where_key was called by any other function:
+    start key flag is 1 when end key is NULL, end key flag is 3 or 4
+ - if the key is non-numeric, or multipart
+    When the query is an exact match, the start key flag is 0,
+    end key flag is 3 for what should be a no-range condition where
+    you should have 0 and max key NULL, which it is if called by
+    read_range_first
+
+Conclusion:
+
+1. Need logic to determine if a key is min or max when the flag is
+HA_READ_AFTER_KEY, and handle appending the correct operator accordingly
+
+2. Need a boolean flag to pass to create_where_from_key, used in the
+switch statement. Add 1 to the flag if:
+  - start key flag is HA_READ_KEY_EXACT and the end key is NULL
+
+*/
+
+/*
+  Build " WHERE ..." from one or two key ranges and append it to 'to'.
+
+    to                     String the finished clause is appended to
+    key_info               key whose parts describe the layout of the key
+                           images in start_key / end_key
+    start_key, end_key     range endpoints; either one may be NULL
+    from_records_in_range  emit ">=" instead of "=" for HA_READ_KEY_EXACT
+    eq_range               emit the dummy "1=1" for HA_READ_AFTER_KEY
+
+  The clause is assembled in a local buffer and copied to 'to' only once it
+  is complete. When both endpoints are given, each gets its own
+  parenthesised group, joined with AND.
+
+  Returns 0 on success. Returns 1 when both endpoints are NULL, when a range
+  flag is not handled, or when an append fails.
+*/
+bool ha_federatedx::create_where_from_key(String *to,
+                                         KEY *key_info,
+                                         const key_range *start_key,
+                                         const key_range *end_key,
+                                         bool from_records_in_range,
+                                         bool eq_range)
+{
+  bool both_not_null=
+    (start_key != NULL && end_key != NULL) ? TRUE : FALSE;
+  const uchar *ptr;
+  uint remainder, length;
+  char tmpbuff[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String tmp(tmpbuff, sizeof(tmpbuff), system_charset_info);
+  const key_range *ranges[2]= { start_key, end_key };
+  my_bitmap_map *old_map;
+  DBUG_ENTER("ha_federatedx::create_where_from_key");
+
+  tmp.length(0);
+  if (start_key == NULL && end_key == NULL)
+    DBUG_RETURN(1);
+
+  old_map= dbug_tmp_use_all_columns(table, table->write_set);
+  /* i == 0 is the start key, i == 1 is the end key */
+  for (uint i= 0; i <= 1; i++)
+  {
+    bool needs_quotes;
+    KEY_PART_INFO *key_part;
+    if (ranges[i] == NULL)
+      continue;
+
+    if (both_not_null)
+    {
+      /*
+        Open the group for this endpoint. A failed append must not be
+        ignored here, or the parentheses of the clause end up unbalanced.
+      */
+      if (i > 0)
+      {
+        if (tmp.append(STRING_WITH_LEN(") AND (")))
+          goto err;
+      }
+      else
+      {
+        if (tmp.append(STRING_WITH_LEN(" (")))
+          goto err;
+      }
+    }
+
+    /* No loop condition: the loop is left when the key image is used up */
+    for (key_part= key_info->key_part,
+         remainder= key_info->key_parts,
+         length= ranges[i]->length,
+         ptr= ranges[i]->key; ;
+         remainder--,
+         key_part++)
+    {
+      Field *field= key_part->field;
+      uint store_length= key_part->store_length;
+      uint part_length= min(store_length, length);
+      needs_quotes= field->str_needs_quotes();
+      DBUG_DUMP("key, start of loop", ptr, length);
+
+      if (key_part->null_bit)
+      {
+        /* The first byte of a nullable key part is its NULL indicator */
+        if (*ptr++)
+        {
+          /*
+            We got "IS [NOT] NULL" condition against nullable column. We
+            distinguish between "IS NOT NULL" and "IS NULL" by flag. For
+            "IS NULL", flag is set to HA_READ_KEY_EXACT.
+          */
+          if (emit_key_part_name(&tmp, key_part) ||
+              tmp.append(ranges[i]->flag == HA_READ_KEY_EXACT ?
+                         " IS NULL " : " IS NOT NULL "))
+            goto err;
+          /*
+            We need to adjust pointer and length to be prepared for next
+            key part. As well as check if this was last key part.
+          */
+          goto prepare_for_next_key_part;
+        }
+      }
+
+      if (tmp.append(STRING_WITH_LEN(" (")))
+        goto err;
+
+      switch (ranges[i]->flag) {
+      case HA_READ_KEY_EXACT:
+        DBUG_PRINT("info", ("federatedx HA_READ_KEY_EXACT %d", i));
+        if (store_length >= length ||
+            !needs_quotes ||
+            key_part->type == HA_KEYTYPE_BIT ||
+            field->result_type() != STRING_RESULT)
+        {
+          if (emit_key_part_name(&tmp, key_part))
+            goto err;
+
+          if (from_records_in_range)
+          {
+            if (tmp.append(STRING_WITH_LEN(" >= ")))
+              goto err;
+          }
+          else
+          {
+            if (tmp.append(STRING_WITH_LEN(" = ")))
+              goto err;
+          }
+
+          if (emit_key_part_element(&tmp, key_part, needs_quotes, 0, ptr,
+                                    part_length))
+            goto err;
+        }
+        else
+        {
+          /* LIKE */
+          if (emit_key_part_name(&tmp, key_part) ||
+              tmp.append(STRING_WITH_LEN(" LIKE ")) ||
+              emit_key_part_element(&tmp, key_part, needs_quotes, 1, ptr,
+                                    part_length))
+            goto err;
+        }
+        break;
+      case HA_READ_AFTER_KEY:
+        if (eq_range)
+        {
+          if (tmp.append("1=1"))                // Dummy
+            goto err;
+          break;
+        }
+        DBUG_PRINT("info", ("federatedx HA_READ_AFTER_KEY %d", i));
+        if (store_length >= length) /* end key */
+        {
+          if (emit_key_part_name(&tmp, key_part))
+            goto err;
+
+          if (i > 0) /* end key */
+          {
+            if (tmp.append(STRING_WITH_LEN(" <= ")))
+              goto err;
+          }
+          else /* start key */
+          {
+            if (tmp.append(STRING_WITH_LEN(" > ")))
+              goto err;
+          }
+
+          if (emit_key_part_element(&tmp, key_part, needs_quotes, 0, ptr,
+                                    part_length))
+          {
+            goto err;
+          }
+          break;
+        }
+        /* fall through: a key part that is not the last one is emitted as ">=" */
+      case HA_READ_KEY_OR_NEXT:
+        DBUG_PRINT("info", ("federatedx HA_READ_KEY_OR_NEXT %d", i));
+        if (emit_key_part_name(&tmp, key_part) ||
+            tmp.append(STRING_WITH_LEN(" >= ")) ||
+            emit_key_part_element(&tmp, key_part, needs_quotes, 0, ptr,
+              part_length))
+          goto err;
+        break;
+      case HA_READ_BEFORE_KEY:
+        DBUG_PRINT("info", ("federatedx HA_READ_BEFORE_KEY %d", i));
+        if (store_length >= length)
+        {
+          if (emit_key_part_name(&tmp, key_part) ||
+              tmp.append(STRING_WITH_LEN(" < ")) ||
+              emit_key_part_element(&tmp, key_part, needs_quotes, 0, ptr,
+                                    part_length))
+            goto err;
+          break;
+        }
+        /* fall through: a key part that is not the last one is emitted as "<=" */
+      case HA_READ_KEY_OR_PREV:
+        DBUG_PRINT("info", ("federatedx HA_READ_KEY_OR_PREV %d", i));
+        if (emit_key_part_name(&tmp, key_part) ||
+            tmp.append(STRING_WITH_LEN(" <= ")) ||
+            emit_key_part_element(&tmp, key_part, needs_quotes, 0, ptr,
+                                  part_length))
+          goto err;
+        break;
+      default:
+        DBUG_PRINT("info",("cannot handle flag %d", ranges[i]->flag));
+        goto err;
+      }
+      if (tmp.append(STRING_WITH_LEN(") ")))
+        goto err;
+
+prepare_for_next_key_part:
+      /* This key part consumed the rest of the key image: range is done */
+      if (store_length >= length)
+        break;
+      DBUG_PRINT("info", ("remainder %d", remainder));
+      DBUG_ASSERT(remainder > 1);
+      length-= store_length;
+      /*
+        For nullable columns, null-byte is already skipped before, that is
+        ptr was incremented by 1. Since store_length still counts null-byte,
+        we need to subtract 1 from store_length.
+      */
+      ptr+= store_length - test(key_part->null_bit);
+      if (tmp.append(STRING_WITH_LEN(" AND ")))
+        goto err;
+
+      DBUG_PRINT("info",
+                 ("create_where_from_key WHERE clause: %s",
+                  tmp.c_ptr_quick()));
+    }
+  }
+  dbug_tmp_restore_column_map(table->write_set, old_map);
+
+  /* Close the group opened for the end key */
+  if (both_not_null)
+    if (tmp.append(STRING_WITH_LEN(") ")))
+      DBUG_RETURN(1);
+
+  if (to->append(STRING_WITH_LEN(" WHERE ")))
+    DBUG_RETURN(1);
+
+  if (to->append(tmp))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(0);
+
+err:
+  dbug_tmp_restore_column_map(table->write_set, old_map);
+  DBUG_RETURN(1);
+}
+
+/*
+  Fill a FEDERATEDX_SERVER from the connection data of a share.
+
+    mem_root       root the key (and the charset name) is allocated from
+    server         structure to fill; it is zeroed first
+    share          parsed connection data
+    table_charset  charset whose name is copied to server->csname;
+                   may be NULL, in which case csname stays NULL
+
+  The lookup key is the concatenation
+
+    scheme \0 hostname \0 database \0 port socket \0 username \0 password
+
+  with scheme and hostname lower-cased, the database lower-cased when
+  lower_case_table_names is set, and (except on Windows) the socket
+  lower-cased when lower_case_file_system is set. The string members of
+  'server' point into the copy of that key. A hostname, socket or password
+  that is NULL in the share contributes an empty string to the key and is
+  reported as NULL in 'server'.
+*/
+static void fill_server(MEM_ROOT *mem_root, FEDERATEDX_SERVER *server,
+                        FEDERATEDX_SHARE *share, CHARSET_INFO *table_charset)
+{
+  char buffer[STRING_BUFFER_USUAL_SIZE];
+  String key(buffer, sizeof(buffer), &my_charset_bin);
+  String scheme(share->scheme, &my_charset_latin1);
+  /*
+    parse_url() stores an empty hostname as NULL, so it needs the same
+    guard as the socket and the password.
+  */
+  String hostname(share->hostname ? share->hostname : "", &my_charset_latin1);
+  String database(share->database, system_charset_info);
+  String username(share->username, system_charset_info);
+  String socket(share->socket ? share->socket : "", files_charset_info);
+  String password(share->password ? share->password : "", &my_charset_bin);
+  DBUG_ENTER("fill_server");
+
+  /* Do some case conversions */
+  scheme.reserve(scheme.length());
+  scheme.length(my_casedn_str(&my_charset_latin1, scheme.c_ptr_safe()));
+
+  if (hostname.length())
+  {
+    hostname.reserve(hostname.length());
+    hostname.length(my_casedn_str(&my_charset_latin1, hostname.c_ptr_safe()));
+  }
+
+  if (lower_case_table_names)
+  {
+    database.reserve(database.length());
+    database.length(my_casedn_str(system_charset_info, database.c_ptr_safe()));
+  }
+
+#ifndef __WIN__
+  /*
+    TODO: there is no unix sockets under windows so the engine should be
+    revised about using sockets in such environment.
+  */
+  if (lower_case_file_system && socket.length())
+  {
+    socket.reserve(socket.length());
+    socket.length(my_casedn_str(files_charset_info, socket.c_ptr_safe()));
+  }
+#endif
+
+  /* start with all bytes zeroed */
+  bzero(server, sizeof(*server));
+
+  /*
+    While the key is being assembled, the string members of 'server' hold
+    the offset of their value inside the key, not a real address.
+    server->scheme stays 0 because the scheme is the first component.
+  */
+  key.length(0);
+  key.reserve(scheme.length() + hostname.length() + database.length() +
+              socket.length() + username.length() + password.length() +
+       sizeof(int) + 8);
+  key.append(scheme);
+  key.q_append('\0');
+  server->hostname= (const char *) (intptr) key.length();
+  key.append(hostname);
+  key.q_append('\0');
+  server->database= (const char *) (intptr) key.length();
+  key.append(database);
+  key.q_append('\0');
+  key.q_append((uint32) share->port);
+  server->socket= (const char *) (intptr) key.length();
+  key.append(socket);
+  key.q_append('\0');
+  server->username= (const char *) (intptr) key.length();
+  key.append(username);
+  key.q_append('\0');
+  server->password= (const char *) (intptr) key.length();
+  key.append(password);
+
+  server->key_length= key.length();
+  /* NOTE(review): the result of memdup_root() is not checked for NULL */
+  server->key= (uchar *)  memdup_root(mem_root, key.ptr(), key.length()+1);
+
+  /* pointer magic: turn the offsets into addresses inside the copied key */
+  server->scheme+= (intptr) server->key;
+  server->hostname+= (intptr) server->key;
+  server->database+= (intptr) server->key;
+  server->username+= (intptr) server->key;
+  server->password+= (intptr) server->key;
+  server->socket+= (intptr) server->key;
+  server->port= share->port;
+
+  /* Values that were absent in the share are reported as NULL */
+  if (!share->hostname)
+    server->hostname= NULL;
+  if (!share->socket)
+    server->socket= NULL;
+  if (!share->password)
+    server->password= NULL;
+
+  if (table_charset)
+    server->csname= strdup_root(mem_root, table_charset->csname);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Find the FEDERATEDX_SERVER matching the connection data of 'share' in
+  federatedx_open_servers, or create and register it, and take a reference.
+
+    share   share holding the parsed connection data
+    table   table the share is opened for; may be NULL, in which case only
+            an already registered server can be returned
+
+  The caller must own federatedx_mutex.
+
+  A new server is allocated from its own MEM_ROOT, which is stored inside
+  the server itself. When an existing server is found, the temporary root
+  used to build the lookup key is released again.
+
+  Returns the server with use_count incremented, or NULL on failure.
+*/
+static FEDERATEDX_SERVER *get_server(FEDERATEDX_SHARE *share, TABLE *table)
+{
+  FEDERATEDX_SERVER *server= NULL, tmp_server;
+  MEM_ROOT mem_root;
+  DBUG_ENTER("ha_federatedx.cc::get_server");
+
+  safe_mutex_assert_owner(&federatedx_mutex);
+
+  init_alloc_root(&mem_root, 4096, 4096);
+
+  /* Builds the lookup key in mem_root and points tmp_server at it */
+  fill_server(&mem_root, &tmp_server, share, table ? table->s->table_charset : 0);
+
+  if (!(server= (FEDERATEDX_SERVER *) hash_search(&federatedx_open_servers,
+                                                 tmp_server.key,
+                                                 tmp_server.key_length)))
+  {
+    /* A server cannot be created without a table and a charset name */
+    if (!table || !tmp_server.csname)
+      goto error;
+
+    if (!(server= (FEDERATEDX_SERVER *) memdup_root(&mem_root,
+                          (char *) &tmp_server,
+                          sizeof(*server))))
+      goto error;
+
+    /* From here on the server carries the root it was allocated from */
+    server->mem_root= mem_root;
+
+    if (my_hash_insert(&federatedx_open_servers, (uchar*) server))
+      goto error;
+
+    pthread_mutex_init(&server->mutex, MY_MUTEX_INIT_FAST);
+  }
+  else
+    free_root(&mem_root, MYF(0)); /* prevents memory leak */
+
+  server->use_count++;
+
+  DBUG_RETURN(server);
+error:
+  free_root(&mem_root, MYF(0));
+  DBUG_RETURN(NULL);
+}
+
+
+/*
+  Find the FEDERATEDX_SHARE for a table in federatedx_open_tables, or create
+  and register it, and take a reference.
+
+    table_name  name used as the hash key of the share
+    table       table whose connect string and fields describe the share
+
+  The whole lookup runs under federatedx_mutex. The connect string is
+  parsed into a temporary share first. If no share is registered under
+  table_name, a new one is built from the temporary share: the
+  "SELECT <fields> FROM <remote table>" text is stored in select_query, a
+  server is attached with get_server(), and the share is inserted into the
+  hash. A new share lives in its own MEM_ROOT, stored inside the share.
+
+  Returns the share with use_count incremented, or NULL on failure.
+*/
+
+static FEDERATEDX_SHARE *get_share(const char *table_name, TABLE *table)
+{
+  char query_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  Field **field;
+  String query(query_buffer, sizeof(query_buffer), &my_charset_bin);
+  FEDERATEDX_SHARE *share= NULL, tmp_share;
+  MEM_ROOT mem_root;
+  DBUG_ENTER("ha_federatedx.cc::get_share");
+
+  /*
+    In order to use this string, we must first zero it's length,
+    or it will contain garbage
+  */
+  query.length(0);
+
+  bzero(&tmp_share, sizeof(tmp_share));
+  init_alloc_root(&mem_root, 256, 0);
+
+  pthread_mutex_lock(&federatedx_mutex);
+
+  tmp_share.share_key= table_name;
+  tmp_share.share_key_length= strlen(table_name);
+  if (parse_url(&mem_root, &tmp_share, table, 0))
+    goto error;
+
+  /* TODO: change tmp_share.scheme to LEX_STRING object */
+  if (!(share= (FEDERATEDX_SHARE *) hash_search(&federatedx_open_tables,
+                                               (uchar*) tmp_share.share_key,
+                                               tmp_share.
+                                               share_key_length)))
+  {
+    query.set_charset(system_charset_info);
+    query.append(STRING_WITH_LEN("SELECT "));
+    for (field= table->field; *field; field++)
+    {
+      append_ident(&query, (*field)->field_name,
+                   strlen((*field)->field_name), ident_quote_char);
+      query.append(STRING_WITH_LEN(", "));
+    }
+    /* chops off trailing comma */
+    query.length(query.length() - sizeof_trailing_comma);
+
+    query.append(STRING_WITH_LEN(" FROM "));
+
+    append_ident(&query, tmp_share.table_name,
+                 tmp_share.table_name_length, ident_quote_char);
+
+    /*
+      Copy exactly query.length() bytes: strmake_root() adds the terminator
+      itself, and the byte after the query text is not part of the query.
+    */
+    if (!(share= (FEDERATEDX_SHARE *) memdup_root(&mem_root, (char*)&tmp_share, sizeof(*share))) ||
+        !(share->select_query= (char*) strmake_root(&mem_root, query.ptr(),
+                                                    query.length())))
+      goto error;
+
+    /* From here on the share carries the root it was allocated from */
+    share->mem_root= mem_root;
+
+    DBUG_PRINT("info",
+               ("share->select_query %s", share->select_query));
+
+    if (!(share->s= get_server(share, table)))
+      goto error;
+
+    /*
+      NOTE(review): if the insert fails, the reference taken by
+      get_server() above does not appear to be given back.
+    */
+    if (my_hash_insert(&federatedx_open_tables, (uchar*) share))
+      goto error;
+    thr_lock_init(&share->lock);
+  }
+  else
+    free_root(&mem_root, MYF(0)); /* prevents memory leak */
+
+  share->use_count++;
+  pthread_mutex_unlock(&federatedx_mutex);
+
+  DBUG_RETURN(share);
+
+error:
+  pthread_mutex_unlock(&federatedx_mutex);
+  free_root(&mem_root, MYF(0));
+  DBUG_RETURN(NULL);
+}
+
+
+/*
+  Drop one reference to a server.
+
+  The counter is decremented under federatedx_mutex, and the server is
+  removed from federatedx_open_servers when the last reference goes away.
+  In that case its connections are closed through 'txn' (or through a
+  temporary federatedx_txn when 'txn' is NULL), its mutex is destroyed and
+  the MEM_ROOT it lives in is freed.
+
+  Always returns 0.
+*/
+static int free_server(federatedx_txn *txn, FEDERATEDX_SERVER *server)
+{
+  bool last_reference;
+  DBUG_ENTER("free_server");
+
+  pthread_mutex_lock(&federatedx_mutex);
+  last_reference= (--server->use_count == 0);
+  if (last_reference)
+    hash_delete(&federatedx_open_servers, (uchar*) server);
+  pthread_mutex_unlock(&federatedx_mutex);
+
+  if (!last_reference)
+    DBUG_RETURN(0);
+
+  if (txn)
+    txn->close(server);
+  else
+  {
+    federatedx_txn tmp_txn;
+    tmp_txn.close(server);
+  }
+
+  DBUG_ASSERT(server->io_count == 0);
+
+  pthread_mutex_destroy(&server->mutex);
+
+  {
+    /* The server is stored inside its own root, so free a copy of the root */
+    MEM_ROOT root_copy;
+    root_copy= server->mem_root;
+    free_root(&root_copy, MYF(0));
+  }
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Free lock controls. We call this whenever we close a table.
+  If the table had the last reference to the share then we
+  free memory associated with it.
+*/
+
+static int free_share(federatedx_txn *txn, FEDERATEDX_SHARE *share)
+{
+  bool last_reference;
+  DBUG_ENTER("free_share");
+
+  /* Drop our reference; the last user also unregisters the share. */
+  pthread_mutex_lock(&federatedx_mutex);
+  last_reference= (--share->use_count == 0);
+  if (last_reference)
+    hash_delete(&federatedx_open_tables, (uchar*) share);
+  pthread_mutex_unlock(&federatedx_mutex);
+
+  if (!last_reference)
+    DBUG_RETURN(0);
+
+  /* Fetch the server pointer first: it is read from the share itself. */
+  FEDERATEDX_SERVER *owning_server= share->s;
+
+  thr_lock_delete(&share->lock);
+
+  /*
+    get_share() places the share inside its own memory root, so the root
+    is copied to the stack before being freed.
+  */
+  MEM_ROOT root_copy= share->mem_root;
+  free_root(&root_copy, MYF(0));
+
+  /* The share held one reference on its server; give it back. */
+  free_server(txn, owning_server);
+
+  DBUG_RETURN(0);
+}
+
+
+ha_rows ha_federatedx::records_in_range(uint inx, key_range *start_key,
+                                       key_range *end_key)
+{
+  /*
+    The arguments are ignored. A fixed, deliberately small estimate is
+    reported for every range so that index access is chosen as often as
+    possible.
+  */
+  DBUG_ENTER("ha_federatedx::records_in_range");
+  const ha_rows fixed_estimate= FEDERATEDX_RECORDS_IN_RANGE;
+  DBUG_RETURN(fixed_estimate);
+}
+/*
+  If frm_error() is called then we will use this to find out
+  what file extensions exist for the storage engine. This is
+  also used by the default rename_table and delete_table methods
+  in handler.cc.
+*/
+
+const char **ha_federatedx::bas_ext() const
+{
+  /* The list holds only its NullS terminator: no extensions are reported. */
+  static const char *no_extensions[]= { NullS };
+  return no_extensions;
+}
+
+
+/*
+  Return the federatedx_txn object stored in the slot that ha_data(thd)
+  yields. When the slot is empty a new object is created and stored,
+  unless no_create is set, in which case NULL is returned.
+*/
+federatedx_txn *ha_federatedx::get_txn(THD *thd, bool no_create)
+{
+  federatedx_txn **slot= (federatedx_txn **) ha_data(thd);
+
+  if (*slot || no_create)
+    return *slot;
+
+  *slot= new federatedx_txn();
+  return *slot;
+}
+
+
+/*
+  Delete the federatedx_txn object that thd_get_ha_data() returns for this
+  connection and handlerton (deleting a NULL pointer is a no-op).
+  Always returns 0.
+*/
+int ha_federatedx::disconnect(handlerton *hton, MYSQL_THD thd)
+{
+  delete (federatedx_txn *) thd_get_ha_data(thd, hton);
+  return 0;
+}
+
+
+/*
+  Used for opening tables. The name will be the name of the file.
+  A table is opened when it needs to be opened. For instance
+  when a request comes in for a select on the table (tables are not
+  open and closed for each request, they are cached).
+
+  Called from handler.cc by handler::ha_open(). The server opens
+  all tables by calling ha_open() which then calls the handler
+  specific open().
+*/
+
+int ha_federatedx::open(const char *name, int mode, uint test_if_locked)
+{
+  THD *thd= current_thd;
+  int acquire_error;
+  DBUG_ENTER("ha_federatedx::open");
+
+  /* Look up (or create) the share for this table. */
+  share= get_share(name, table);
+  if (!share)
+    DBUG_RETURN(1);
+  thr_lock_data_init(&share->lock, &lock, NULL);
+
+  DBUG_ASSERT(io == NULL);
+
+  txn= get_txn(thd);
+
+  /* Hold an io object only long enough to read the reference length. */
+  acquire_error= txn->acquire(share, TRUE, &io);
+  if (acquire_error)
+  {
+    free_share(txn, share);
+    DBUG_RETURN(acquire_error);
+  }
+  ref_length= io->get_ref_length();
+  txn->release(&io);
+
+  DBUG_PRINT("info", ("ref_length: %u", ref_length));
+
+  /* Array of result-set pointers; close() releases it again. */
+  my_init_dynamic_array(&results, sizeof(FEDERATEDX_IO_RESULT*), 4, 4);
+
+  reset();
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Closes a table. We call the free_share() function to free any resources
+  that we have allocated in the "shared" structure.
+
+  Called from sql_base.cc, sql_select.cc, and table.cc.
+  In sql_select.cc it is only used to close up temporary tables or during
+  the process where a temporary table is converted over to being a
+  myisam table.
+  For sql_base.cc look at close_data_tables().
+*/
+
+int ha_federatedx::close(void)
+{
+  int retval;
+  THD *thd= current_thd;
+  DBUG_ENTER("ha_federatedx::close");
+
+  /* free the result set */
+  reset();
+  delete_dynamic(&results);
+
+  /* Disconnect from mysql */
+  if (thd && (txn= get_txn(thd, true)))
+  {
+    /* The connection already has a transaction object: go through it. */
+    txn->release(&io);
+    DBUG_ASSERT(io == NULL);
+
+    retval= free_share(txn, share);
+  }
+  else
+  {
+    /* No THD or no transaction object: use a temporary one. */
+    federatedx_txn scratch_txn;
+
+    scratch_txn.release(&io);
+    DBUG_ASSERT(io == NULL);
+
+    retval= free_share(&scratch_txn, share);
+  }
+  DBUG_RETURN(retval);
+}
+
+/*
+
+  Checks if a field in a record is SQL NULL.
+
+  SYNOPSIS
+    field_in_record_is_null()
+      table     TABLE pointer, MySQL table object
+      field     Field pointer, MySQL field object
+      record    char pointer, contains record
+
+    DESCRIPTION
+      This method uses the record format information in table to track
+      the null bit in record.
+
+    RETURN VALUE
+      1    if NULL
+      0    otherwise
+*/
+
+static inline uint field_in_record_is_null(TABLE *table,
+                                    Field *field,
+                                    char *record)
+{
+  DBUG_ENTER("ha_federatedx::field_in_record_is_null");
+
+  /* A field without a null pointer is reported as not NULL. */
+  if (field->null_ptr == NULL)
+    DBUG_RETURN(0);
+
+  /*
+    Distance of the field's null byte from the start of record[0]; the
+    same offset is then applied to the record passed in.
+  */
+  int null_byte_offset= (uint) ((char*)field->null_ptr -
+                                (char*)table->record[0]);
+
+  DBUG_RETURN((record[null_byte_offset] & field->null_bit) ? 1 : 0);
+}
+
+
+/**
+  @brief Construct the INSERT statement.
+  
+  @details This method will construct the INSERT statement and appends it to
+  the supplied query string buffer.
+  
+  @return
+    @retval FALSE       No error
+    @retval TRUE        Failure
+*/
+
+bool ha_federatedx::append_stmt_insert(String *query)
+{
+  char stmt_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  uint length_before_columns;
+  bool any_column= FALSE;
+
+  /* Statement head is assembled here, then appended to *query at the end */
+  String stmt(stmt_buffer, sizeof(stmt_buffer), &my_charset_bin);
+  DBUG_ENTER("ha_federatedx::append_stmt_insert");
+
+  stmt.length(0);
+
+  /* Pick the verb from the duplicate-handling flags. */
+  if (replace_duplicates)
+    stmt.append(STRING_WITH_LEN("REPLACE INTO "));
+  else if (!ignore_duplicates || insert_dup_update)
+    stmt.append(STRING_WITH_LEN("INSERT INTO "));
+  else
+    stmt.append(STRING_WITH_LEN("INSERT IGNORE INTO "));
+
+  append_ident(&stmt, share->table_name, share->table_name_length,
+               ident_quote_char);
+
+  /* Remember where the column list starts so it can be rolled back. */
+  length_before_columns= stmt.length();
+  stmt.append(STRING_WITH_LEN(" ("));
+
+  /* List every column that is part of the write set. */
+  for (Field **column= table->field; *column; column++)
+  {
+    if (!bitmap_is_set(table->write_set, (*column)->field_index))
+      continue;
+
+    append_ident(&stmt, (*column)->field_name,
+                 strlen((*column)->field_name), ident_quote_char);
+
+    /*
+      The separator is appended unconditionally because we cannot tell
+      here whether a later column is in the write set; the last one is
+      removed below.
+    */
+    stmt.append(STRING_WITH_LEN(", "));
+    any_column= TRUE;
+  }
+
+  if (!any_column)
+  {
+    /* No columns: drop the " (" again so no column list is emitted. */
+    stmt.length(length_before_columns);
+  }
+  else
+  {
+    /* Replace the trailing separator with the closing parenthesis. */
+    stmt.length(stmt.length() - sizeof_trailing_comma);
+    stmt.append(STRING_WITH_LEN(") "));
+  }
+
+  stmt.append(STRING_WITH_LEN(" VALUES "));
+
+  DBUG_RETURN(query->append(stmt));
+}
+
+
+/*
+  write_row() inserts a row. No extra() hint is currently given if a bulk
+  load is happening. buf is a byte array of data. You can use the field
+  information to extract the data from the native byte array type.
+  Example of this would be:
+  for (Field **field=table->field ; *field ; field++)
+  {
+    ...
+  }
+
+  Called from item_sum.cc, item_sum.cc, sql_acl.cc, sql_insert.cc,
+  sql_insert.cc, sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc.
+*/
+
+int ha_federatedx::write_row(uchar *buf)
+{
+  /*
+    Builds the "(v1, v2, ...)" value list for the current row from
+    table->field and either sends a complete INSERT/REPLACE statement
+    right away or, in bulk mode, appends the value list to the pending
+    bulk_insert statement.
+
+    Returns 0 on success, the error from txn->acquire(), or the result of
+    stash_remote_error() when a remote query failed.
+  */
+  char values_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  char insert_field_value_buffer[STRING_BUFFER_USUAL_SIZE];
+  Field **field;
+  uint tmp_length;
+  int error= 0;
+  bool use_bulk_insert;
+  /* Cleared below when this call does not execute any remote statement */
+  bool auto_increment_update_required= (table->next_number_field != NULL);
+
+  /* The string containing the values to be added to the insert */
+  String values_string(values_buffer, sizeof(values_buffer), &my_charset_bin);
+  /* The actual value of the field, to be added to the values_string */
+  String insert_field_value_string(insert_field_value_buffer,
+                                   sizeof(insert_field_value_buffer),
+                                   &my_charset_bin);
+  /* Restored with dbug_tmp_restore_column_map() after the field loop */
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
+  DBUG_ENTER("ha_federatedx::write_row");
+
+  values_string.length(0);
+  insert_field_value_string.length(0);
+  ha_statistic_increment(&SSV::ha_write_count);
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+    table->timestamp_field->set_time();
+
+  /*
+    start both our field and field values strings
+    We must disable multi-row insert for "INSERT...ON DUPLICATE KEY UPDATE"
+    Ignore duplicates is always true when insert_dup_update is true.
+    When replace_duplicates == TRUE, we can safely enable multi-row insert.
+    When performing multi-row insert, we only collect the columns values for
+    the row. The start of the statement is only created when the first
+    row is copied in to the bulk_insert string.
+  */
+  if (!(use_bulk_insert= bulk_insert.str && 
+        (!insert_dup_update || replace_duplicates)))
+    append_stmt_insert(&values_string);
+
+  values_string.append(STRING_WITH_LEN(" ("));
+  /* Length right after the opening paren; used to detect an empty list */
+  tmp_length= values_string.length();
+
+  /*
+    loop through the field pointer array, add any fields to both the values
+    list and the fields list that is part of the write set
+  */
+  for (field= table->field; *field; field++)
+  {
+    if (bitmap_is_set(table->write_set, (*field)->field_index))
+    {
+      if ((*field)->is_null())
+        values_string.append(STRING_WITH_LEN(" NULL "));
+      else
+      {
+        /* Wrap the value in value_quote_char only if the field asks for it */
+        bool needs_quote= (*field)->str_needs_quotes();
+        (*field)->val_str(&insert_field_value_string);
+        if (needs_quote)
+          values_string.append(value_quote_char);
+        insert_field_value_string.print(&values_string);
+        if (needs_quote)
+          values_string.append(value_quote_char);
+
+        /* Reset the scratch string for the next field */
+        insert_field_value_string.length(0);
+      }
+
+      /* append commas between both fields and fieldnames */
+      /*
+        unfortunately, we can't use the logic if *(fields + 1) to
+        make the following appends conditional as we don't know if the
+        next field is in the write set
+      */
+      values_string.append(STRING_WITH_LEN(", "));
+    }
+  }
+  dbug_tmp_restore_column_map(table->read_set, old_map);
+
+  /*
+    if there were no fields, we don't want to add a closing paren
+    AND, we don't want to chop off the last char '('
+    insert will be "INSERT INTO t1 VALUES ();"
+  */
+  if (values_string.length() > tmp_length)
+  {
+    /* chops off trailing comma */
+    values_string.length(values_string.length() - sizeof_trailing_comma);
+  }
+  /* we always want to append this, even if there aren't any fields */
+  values_string.append(STRING_WITH_LEN(") "));
+
+  if ((error= txn->acquire(share, FALSE, &io)))
+    DBUG_RETURN(error);
+
+  if (use_bulk_insert)
+  {
+    /*
+      Send the current bulk insert out if appending the current row would
+      cause the statement to overflow the packet size, otherwise set
+      auto_increment_update_required to FALSE as no query was executed.
+    */
+    if (bulk_insert.length + values_string.length() + bulk_padding >
+        io->max_query_size() && bulk_insert.length)
+    {
+      error= io->query(bulk_insert.str, bulk_insert.length);
+      bulk_insert.length= 0;
+    }
+    else
+      auto_increment_update_required= FALSE;
+      
+    /*
+      An empty bulk buffer (first row, or just flushed) needs the statement
+      head first; otherwise a comma separates this row from the previous one.
+    */
+    if (bulk_insert.length == 0)
+    {
+      char insert_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+      String insert_string(insert_buffer, sizeof(insert_buffer), 
+                           &my_charset_bin);
+      insert_string.length(0);
+      append_stmt_insert(&insert_string);
+      dynstr_append_mem(&bulk_insert, insert_string.ptr(), 
+                        insert_string.length());
+    }
+    else
+      dynstr_append_mem(&bulk_insert, ",", 1);
+
+    /*
+      The current row is queued even when the flush above failed; the
+      error itself is reported by the check further down.
+    */
+    dynstr_append_mem(&bulk_insert, values_string.ptr(), 
+                      values_string.length());
+  }  
+  else
+  {
+    /* Single-row mode: values_string already holds the whole statement */
+    error= io->query(values_string.ptr(), values_string.length());
+  }
+  
+  if (error)
+  {
+    DBUG_RETURN(stash_remote_error());
+  }
+  /*
+    If the table we've just written a record to contains an auto_increment
+    field, then store the last_insert_id() value from the foreign server
+  */
+  if (auto_increment_update_required)
+  {
+    update_auto_increment();
+
+    /* mysql_insert() uses this for protocol return value */
+    table->next_number_field->store(stats.auto_increment_value, 1);
+  }
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Prepares the storage engine for bulk inserts.
+  
+  @param[in] rows       estimated number of rows in bulk insert 
+                        or 0 if unknown.
+  
+  @details Initializes memory structures required for bulk insert.
+*/
+
+void ha_federatedx::start_bulk_insert(ha_rows rows)
+{
+  uint chunk_size;
+  DBUG_ENTER("ha_federatedx::start_bulk_insert");
+
+  /* Discard whatever a previous bulk insert left behind. */
+  dynstr_free(&bulk_insert);
+
+  /*
+    A single-row insert is not worth batching. A value of 0 means the
+    server does not know the row count (for example INSERT...SELECT), and
+    that case is batched.
+  */
+  if (rows == 1)
+    DBUG_VOID_RETURN;
+
+  /*
+    Make sure we have an open connection so that we know the
+    maximum packet size.
+  */
+  if (txn->acquire(share, FALSE, &io))
+    DBUG_VOID_RETURN;
+
+  /* One memory page is used both as initial size and as growth step. */
+  chunk_size= (uint) my_getpagesize();
+
+  if (!init_dynamic_string(&bulk_insert, NULL, chunk_size, chunk_size))
+    bulk_insert.length= 0;
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief End bulk insert.
+  
+  @details This method will send any remaining rows to the remote server.
+  Finally, it will deinitialize the bulk insert data structure.
+  
+  @return Operation status
+  @retval       0       No error
+  @retval       != 0    Error occurred at remote server. Also sets my_errno.
+*/
+
+int ha_federatedx::end_bulk_insert()
+{
+  int rc= 0;
+  DBUG_ENTER("ha_federatedx::end_bulk_insert");
+
+  /* Rows are pending only if a buffer exists, is non-empty and still wanted */
+  const bool have_pending_rows= bulk_insert.str && bulk_insert.length &&
+                                !table_will_be_deleted;
+
+  if (have_pending_rows)
+  {
+    rc= txn->acquire(share, FALSE, &io);
+    if (rc)
+      DBUG_RETURN(rc);
+
+    if (io->query(bulk_insert.str, bulk_insert.length))
+      rc= stash_remote_error();
+    else if (table->next_number_field)
+      update_auto_increment();
+  }
+
+  dynstr_free(&bulk_insert);
+
+  DBUG_RETURN(my_errno= rc);
+}
+
+
+/*
+  ha_federatedx::update_auto_increment
+
+  This method ensures that last_insert_id() works properly. It simply
+  calls last_insert_id() on the foreign database immediately after an insert
+  (if the table has an auto_increment field) and sets the insert id via
+  thd->insert_id(ID).
+*/
+void ha_federatedx::update_auto_increment(void)
+{
+  DBUG_ENTER("ha_federatedx::update_auto_increment");
+
+  /* Refresh stats with HA_STATUS_AUTO, then publish the value to the THD. */
+  ha_federatedx::info(HA_STATUS_AUTO);
+  current_thd->first_successful_insert_id_in_cur_stmt=
+    stats.auto_increment_value;
+
+  DBUG_PRINT("info",("last_insert_id: %ld", (long) stats.auto_increment_value));
+
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Send "OPTIMIZE TABLE <table>" for this share's table through the io
+  object. check_opt is not consulted.
+
+  Returns 0 on success, the error from txn->acquire(), or the result of
+  stash_remote_error() when the query fails.
+*/
+int ha_federatedx::optimize(THD* thd, HA_CHECK_OPT* check_opt)
+{
+  char stmt_buffer[STRING_BUFFER_USUAL_SIZE];
+  String stmt(stmt_buffer, sizeof(stmt_buffer), &my_charset_bin);
+  int rc;
+  DBUG_ENTER("ha_federatedx::optimize");
+
+  stmt.length(0);
+  stmt.set_charset(system_charset_info);
+  stmt.append(STRING_WITH_LEN("OPTIMIZE TABLE "));
+  append_ident(&stmt, share->table_name, share->table_name_length,
+               ident_quote_char);
+
+  DBUG_ASSERT(txn == get_txn(thd));
+
+  rc= txn->acquire(share, FALSE, &io);
+  if (rc)
+    DBUG_RETURN(rc);
+
+  if (io->query(stmt.ptr(), stmt.length()))
+    rc= stash_remote_error();
+
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Send "REPAIR TABLE <table>" for this share's table through the io
+  object, adding QUICK / EXTENDED / USE_FRM according to the flags set in
+  check_opt.
+
+  Returns 0 on success, the error from txn->acquire(), or the result of
+  stash_remote_error() when the query fails.
+*/
+int ha_federatedx::repair(THD* thd, HA_CHECK_OPT* check_opt)
+{
+  char stmt_buffer[STRING_BUFFER_USUAL_SIZE];
+  String stmt(stmt_buffer, sizeof(stmt_buffer), &my_charset_bin);
+  int rc;
+  DBUG_ENTER("ha_federatedx::repair");
+
+  stmt.length(0);
+  stmt.set_charset(system_charset_info);
+  stmt.append(STRING_WITH_LEN("REPAIR TABLE "));
+  append_ident(&stmt, share->table_name, share->table_name_length,
+               ident_quote_char);
+
+  /* Translate the requested options into statement keywords. */
+  if (check_opt->flags & T_QUICK)
+    stmt.append(STRING_WITH_LEN(" QUICK"));
+  if (check_opt->flags & T_EXTEND)
+    stmt.append(STRING_WITH_LEN(" EXTENDED"));
+  if (check_opt->sql_flags & TT_USEFRM)
+    stmt.append(STRING_WITH_LEN(" USE_FRM"));
+
+  DBUG_ASSERT(txn == get_txn(thd));
+
+  rc= txn->acquire(share, FALSE, &io);
+  if (rc)
+    DBUG_RETURN(rc);
+
+  if (io->query(stmt.ptr(), stmt.length()))
+    rc= stash_remote_error();
+
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Yes, update_row() does what you expect, it updates a row. old_data will have
+  the previous row record in it, while new_data will have the newest data in
+  it.
+
+  Keep in mind that the server can do updates based on ordering if an ORDER BY
+  clause was used. Consecutive ordering is not guaranteed.
+  Currently new_data will not have an updated auto_increment record, or
+  an updated timestamp field. You can do these for federatedx by doing these:
+  if (table->timestamp_on_update_now)
+    update_timestamp(new_row+table->timestamp_on_update_now-1);
+  if (table->next_number_field && record == table->record[0])
+    update_auto_increment();
+
+  Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc.
+*/
+
+int ha_federatedx::update_row(const uchar *old_data, uchar *new_data)
+{
+  /*
+    Builds and sends one statement of the form
+      UPDATE [IGNORE] <table> SET <col> = <new>, ... WHERE <col> = <old> AND ...
+    The SET list covers the columns in table->write_set (current field
+    values), the WHERE clause covers the columns in table->read_set
+    (values taken from old_data). new_data is not referenced.
+
+    Returns 0 on success, the error from txn->acquire(), or the result of
+    stash_remote_error() when the remote query fails.
+  */
+  /*
+    This used to control how the query was built. If there was a
+    primary key, the query would be built such that there was a where
+    clause with only that column as the condition. This is flawed,
+    because if we have a multi-part primary key, it would only use the
+    first part! We don't need to do this anyway, because
+    read_range_first will retrieve the correct record, which is what
+    is used to build the WHERE clause. We can however use this to
+    append a LIMIT to the end if there is NOT a primary key. Why do
+    this? Because we only are updating one record, and LIMIT enforces
+    this.
+  */
+  bool has_a_primary_key= test(table->s->primary_key != MAX_KEY);
+  
+  /*
+    buffers for following strings
+  */
+  char field_value_buffer[STRING_BUFFER_USUAL_SIZE];
+  char update_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  char where_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+
+  /* Work area for field values */
+  String field_value(field_value_buffer, sizeof(field_value_buffer),
+                     &my_charset_bin);
+  /* stores the update query */
+  String update_string(update_buffer,
+                       sizeof(update_buffer),
+                       &my_charset_bin);
+  /* stores the WHERE clause */
+  String where_string(where_buffer,
+                      sizeof(where_buffer),
+                      &my_charset_bin);
+  /* Base address used to compute each field's offset inside a record */
+  uchar *record= table->record[0];
+  int error;
+  DBUG_ENTER("ha_federatedx::update_row");
+  /*
+    set string lengths to 0 to avoid misc chars in string
+  */
+  field_value.length(0);
+  update_string.length(0);
+  where_string.length(0);
+
+  if (ignore_duplicates)
+    update_string.append(STRING_WITH_LEN("UPDATE IGNORE "));
+  else
+    update_string.append(STRING_WITH_LEN("UPDATE "));
+  append_ident(&update_string, share->table_name,
+               share->table_name_length, ident_quote_char);
+  update_string.append(STRING_WITH_LEN(" SET "));
+
+  /*
+    In this loop, we match column names to values while building the
+    UPDATE statement.
+
+    A single pass over table->field feeds both parts of the statement:
+    columns in the write set contribute "field = value" to the SET list
+    using the field's current value, and columns in the read set
+    contribute "field = oldvalue" (or "field IS NULL") to the WHERE
+    clause using the bytes in old_data.
+  */
+
+  for (Field **field= table->field; *field; field++)
+  {
+    if (bitmap_is_set(table->write_set, (*field)->field_index))
+    {
+      uint field_name_length= strlen((*field)->field_name);
+      append_ident(&update_string, (*field)->field_name, field_name_length,
+                   ident_quote_char);
+      update_string.append(STRING_WITH_LEN(" = "));
+
+      if ((*field)->is_null())
+        update_string.append(STRING_WITH_LEN(" NULL "));
+      else
+      {
+        /* otherwise = */
+        /* Column map is switched only while the value is read, then restored */
+        my_bitmap_map *old_map= tmp_use_all_columns(table, table->read_set);
+        bool needs_quote= (*field)->str_needs_quotes();
+	(*field)->val_str(&field_value);
+        if (needs_quote)
+          update_string.append(value_quote_char);
+        field_value.print(&update_string);
+        if (needs_quote)
+          update_string.append(value_quote_char);
+        field_value.length(0);
+        tmp_restore_column_map(table->read_set, old_map);
+      }
+      update_string.append(STRING_WITH_LEN(", "));
+    }
+
+    if (bitmap_is_set(table->read_set, (*field)->field_index))
+    {
+      uint field_name_length= strlen((*field)->field_name);
+      append_ident(&where_string, (*field)->field_name, field_name_length,
+                   ident_quote_char);
+      /* NULL-ness is tested against the old row image, not record[0] */
+      if (field_in_record_is_null(table, *field, (char*) old_data))
+        where_string.append(STRING_WITH_LEN(" IS NULL "));
+      else
+      {
+        bool needs_quote= (*field)->str_needs_quotes();
+        where_string.append(STRING_WITH_LEN(" = "));
+        /* Read the value from old_data at this field's offset in the record */
+        (*field)->val_str(&field_value,
+                          (old_data + (*field)->offset(record)));
+        if (needs_quote)
+          where_string.append(value_quote_char);
+        field_value.print(&where_string);
+        if (needs_quote)
+          where_string.append(value_quote_char);
+        field_value.length(0);
+      }
+      where_string.append(STRING_WITH_LEN(" AND "));
+    }
+  }
+
+  /* Remove last ', '. This works as there must be at least one updated field */
+  update_string.length(update_string.length() - sizeof_trailing_comma);
+
+  /* An empty where_string means no column was in the read set: no WHERE */
+  if (where_string.length())
+  {
+    /* chop off trailing AND */
+    where_string.length(where_string.length() - sizeof_trailing_and);
+    update_string.append(STRING_WITH_LEN(" WHERE "));
+    update_string.append(where_string);
+  }
+
+  /*
+    If this table has not a primary key, then we could possibly
+    update multiple rows. We want to make sure to only update one!
+  */
+  if (!has_a_primary_key)
+    update_string.append(STRING_WITH_LEN(" LIMIT 1"));
+
+  if ((error= txn->acquire(share, FALSE, &io)))
+    DBUG_RETURN(error);
+
+  if (io->query(update_string.ptr(), update_string.length()))
+  {
+    DBUG_RETURN(stash_remote_error());
+  }
+  DBUG_RETURN(0);
+}
+
+/*
+  This will delete a row. 'buf' will contain a copy of the row to be deleted.
+  The server will call this right after the current row has been read (from
+  either a previous rnd_next() or index call).
+  If you keep a pointer to the last row or can access a primary key it will
+  make doing the deletion quite a bit easier.
+  Keep in mind that the server does not guarantee consecutive deletions.
+  ORDER BY clauses can be used.
+
+  Called in sql_acl.cc and sql_udf.cc to manage internal table information.
+  Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select
+  it is used for removing duplicates while in insert it is used for REPLACE
+  calls.
+*/
+
+int ha_federatedx::delete_row(const uchar *buf)
+{
+  /*
+    Builds and sends
+      DELETE FROM <table> WHERE <col> = <value> AND ... LIMIT 1
+    using one predicate per column in table->read_set. Values and
+    NULL-ness are read through the Field objects; buf is not referenced.
+
+    Returns 0 on success, the error from txn->acquire(), or the result of
+    stash_remote_error() when the remote query fails. On success
+    stats.deleted and stats.records are adjusted by the affected row count.
+  */
+  char delete_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  char data_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String delete_string(delete_buffer, sizeof(delete_buffer), &my_charset_bin);
+  String data_string(data_buffer, sizeof(data_buffer), &my_charset_bin);
+  /* Number of predicates actually appended to the WHERE clause */
+  uint found= 0;
+  int error;
+  DBUG_ENTER("ha_federatedx::delete_row");
+
+  delete_string.length(0);
+  delete_string.append(STRING_WITH_LEN("DELETE FROM "));
+  append_ident(&delete_string, share->table_name,
+               share->table_name_length, ident_quote_char);
+  delete_string.append(STRING_WITH_LEN(" WHERE "));
+
+  for (Field **field= table->field; *field; field++)
+  {
+    Field *cur_field= *field;
+
+    if (!bitmap_is_set(table->read_set, cur_field->field_index))
+      continue;
+
+    /*
+      Count only columns that contribute a predicate, so the clean-up
+      below knows whether there is a trailing " AND " to remove.
+    */
+    found++;
+
+    append_ident(&delete_string, cur_field->field_name,
+                 strlen(cur_field->field_name), ident_quote_char);
+    data_string.length(0);
+    if (cur_field->is_null())
+    {
+      delete_string.append(STRING_WITH_LEN(" IS NULL "));
+    }
+    else
+    {
+      bool needs_quote= cur_field->str_needs_quotes();
+      delete_string.append(STRING_WITH_LEN(" = "));
+      cur_field->val_str(&data_string);
+      if (needs_quote)
+        delete_string.append(value_quote_char);
+      data_string.print(&delete_string);
+      if (needs_quote)
+        delete_string.append(value_quote_char);
+    }
+    delete_string.append(STRING_WITH_LEN(" AND "));
+  }
+
+  /*
+    With at least one predicate, drop the trailing AND. With none, drop
+    the WHERE keyword instead; chopping an AND that was never appended
+    would cut into " WHERE " and produce a malformed statement.
+  */
+  if (found)
+    delete_string.length(delete_string.length() - sizeof_trailing_and);
+  else
+    delete_string.length(delete_string.length() - sizeof_trailing_where);
+
+  delete_string.append(STRING_WITH_LEN(" LIMIT 1"));
+  DBUG_PRINT("info",
+             ("Delete sql: %s", delete_string.c_ptr_quick()));
+
+  if ((error= txn->acquire(share, FALSE, &io)))
+    DBUG_RETURN(error);
+
+  if (io->query(delete_string.ptr(), delete_string.length()))
+  {
+    DBUG_RETURN(stash_remote_error());
+  }
+  stats.deleted+= (ha_rows) io->affected_rows();
+  stats.records-= (ha_rows) io->affected_rows();
+  DBUG_PRINT("info",
+             ("rows deleted %ld  rows deleted for all time %ld",
+              (long) io->affected_rows(), (long) stats.deleted));
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Positions an index cursor to the index specified in the handle. Fetches the
+  row if available. If the key value is null, begin at the first key of the
+  index. This method, which is called in the case of an SQL statement having
+  a WHERE clause on a non-primary key index, simply calls index_read_idx.
+*/
+
+int ha_federatedx::index_read(uchar *buf, const uchar *key,
+                             uint key_len, ha_rkey_function find_flag)
+{
+  DBUG_ENTER("ha_federatedx::index_read");
+
+  /* Release any result set still stored from an earlier read. */
+  if (stored_result)
+  {
+    (void) free_result();
+  }
+
+  /* Run the lookup on the active index; the result set is kept in
+     stored_result. */
+  DBUG_RETURN(index_read_idx_with_result_set(buf, active_index,
+                                             key, key_len,
+                                             find_flag, &stored_result));
+}
+
+
+/*
+  Positions an index cursor to the index specified in key. Fetches the
+  row if any.  This is only used to read whole keys.
+
+  This method is called via index_read in the case of a WHERE clause using
+  a primary key index OR is called DIRECTLY when the WHERE clause
+  uses a PRIMARY KEY index.
+
+  NOTES
+    This uses an internal result set that is deleted before function
+    returns.  We need to be able to be callable from ha_rnd_pos()
+*/
+
+int ha_federatedx::index_read_idx(uchar *buf, uint index, const uchar *key,
+                                 uint key_len, enum ha_rkey_function find_flag)
+{
+  FEDERATEDX_IO_RESULT *scratch_result= 0;
+  int retval;
+  DBUG_ENTER("ha_federatedx::index_read_idx");
+
+  retval= index_read_idx_with_result_set(buf, index, key,
+                                         key_len, find_flag,
+                                         &scratch_result);
+
+  /*
+    The result set is private to this call. It is freed here only on
+    success; io is valid then, because the lookup succeeded.
+  */
+  if (!retval)
+    io->free_result(scratch_result);
+
+  DBUG_RETURN(retval);
+}
+
+
+/*
+  Create result set for rows matching query and return first row
+
+  RESULT
+    0	ok     In this case *result will contain the result set
+	       table->status == 0 
+    #   error  In this case *result will contain 0
+               table->status == STATUS_NOT_FOUND
+*/
+
+int ha_federatedx::index_read_idx_with_result_set(uchar *buf, uint index,
+                                                 const uchar *key,
+                                                 uint key_len,
+                                                 ha_rkey_function find_flag,
+                                                 FEDERATEDX_IO_RESULT **result)
+{
+  /*
+    Appends a WHERE clause built from the key to share->select_query, runs
+    the statement and reads the first row into buf.
+
+    On success *result holds the result set and 0 is returned. On any
+    failure *result is 0, table->status is STATUS_NOT_FOUND and a non-zero
+    code is returned.
+  */
+  int retval;
+  char error_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  char index_value[STRING_BUFFER_USUAL_SIZE];
+  char sql_query_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String index_string(index_value,
+                      sizeof(index_value),
+                      &my_charset_bin);
+  String sql_query(sql_query_buffer,
+                   sizeof(sql_query_buffer),
+                   &my_charset_bin);
+  key_range range;
+  DBUG_ENTER("ha_federatedx::index_read_idx_with_result_set");
+
+  *result= 0;                                   // In case of errors
+  /*
+    The error label hands error_buffer to my_error() on every path, but
+    only the query-failure path fills it. Start with an empty string so
+    the store_result() failure path never passes uninitialized bytes.
+  */
+  error_buffer[0]= '\0';
+  index_string.length(0);
+  sql_query.length(0);
+  ha_statistic_increment(&SSV::ha_read_key_count);
+
+  sql_query.append(share->select_query);
+
+  /* Describe the key as a one-sided range and turn it into a WHERE clause */
+  range.key= key;
+  range.length= key_len;
+  range.flag= find_flag;
+  create_where_from_key(&index_string,
+                        &table->key_info[index],
+                        &range,
+                        NULL, 0, 0);
+  sql_query.append(index_string);
+
+  if ((retval= txn->acquire(share, TRUE, &io)))
+    DBUG_RETURN(retval);
+
+  if (io->query(sql_query.ptr(), sql_query.length()))
+  {
+    /*
+      NOTE(review): this formatting is not length-bounded; confirm that
+      io->error_str() always fits in error_buffer.
+    */
+    my_sprintf(error_buffer, (error_buffer, "error: %d '%s'",
+                              io->error_code(), io->error_str()));
+    retval= ER_QUERY_ON_FOREIGN_DATA_SOURCE;
+    goto error;
+  }
+  if (!(*result= io->store_result()))
+  {
+    retval= HA_ERR_END_OF_FILE;
+    goto error;
+  }
+  if (!(retval= read_next(buf, *result)))
+    DBUG_RETURN(retval);
+
+  /*
+    Reading the first row failed: the result-set pointer is recorded in
+    the results array and the caller gets no result set back.
+  */
+  insert_dynamic(&results, (uchar*) result);
+  *result= 0;
+  table->status= STATUS_NOT_FOUND;
+  DBUG_RETURN(retval);
+
+error:
+  table->status= STATUS_NOT_FOUND;
+  my_error(retval, MYF(0), error_buffer);
+  DBUG_RETURN(retval);
+}
+
+
+/*
+  This method is used exclusively by filesort() to check if we
+  can create sorting buffers of the necessary size.
+  If the handler returns more records than it declares
+  here, the server can just crash in filesort().
+  We cannot guarantee that's not going to happen with
+  the FEDERATEDX engine, as we have records==0 always if the
+  client is a VIEW, and for the table the number of
+  records can unpredictably change during execution.
+  So we return the maximum possible value here.
+*/
+
+ha_rows ha_federatedx::estimate_rows_upper_bound()
+{
+  /* No usable bound is known, so HA_POS_ERROR is reported. */
+  const ha_rows no_upper_bound= HA_POS_ERROR;
+  return no_upper_bound;
+}
+
+
+/* Initialized at each key walk (called multiple times unlike rnd_init()) */
+
+int ha_federatedx::index_init(uint keynr, bool sorted)
+{
+  DBUG_ENTER("ha_federatedx::index_init");
+
+  /* Only the index number is recorded; 'sorted' is not used. */
+  active_index= keynr;
+  DBUG_PRINT("info", ("table: '%s'  key: %u", table->s->table_name.str, keynr));
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read first range
+*/
+
+int ha_federatedx::read_range_first(const key_range *start_key,
+                                   const key_range *end_key,
+                                   bool eq_range_arg, bool sorted)
+{
+  char query_buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String select_stmt(query_buffer, sizeof(query_buffer), &my_charset_bin);
+  int retval;
+  DBUG_ENTER("ha_federatedx::read_range_first");
+
+  DBUG_ASSERT(!(start_key == NULL && end_key == NULL));
+
+  /* share->select_query followed by a WHERE clause for the key range */
+  select_stmt.length(0);
+  select_stmt.append(share->select_query);
+  create_where_from_key(&select_stmt,
+                        &table->key_info[active_index],
+                        start_key, end_key, 0, eq_range_arg);
+
+  retval= txn->acquire(share, TRUE, &io);
+  if (retval)
+    DBUG_RETURN(retval);
+
+  /* Drop the result set of a previous read before running a new query. */
+  if (stored_result)
+    (void) free_result();
+
+  if (io->query(select_stmt.ptr(), select_stmt.length()))
+  {
+    table->status= STATUS_NOT_FOUND;
+    DBUG_RETURN(ER_QUERY_ON_FOREIGN_DATA_SOURCE);
+  }
+  select_stmt.length(0);
+
+  stored_result= io->store_result();
+  if (!stored_result)
+  {
+    table->status= STATUS_NOT_FOUND;
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+
+  /* Fetch the first row of the range into record[0]. */
+  retval= read_next(table->record[0], stored_result);
+  DBUG_RETURN(retval);
+}
+
+
+int ha_federatedx::read_range_next()
+{
+  DBUG_ENTER("ha_federatedx::read_range_next");
+  /* Continue through the current result set via rnd_next(), into record[0]. */
+  const int rc= rnd_next(table->record[0]);
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Used to read forward through the index.
+
+  Bumps the ha_read_next_count status counter and returns, through
+  read_next(), the next row of stored_result converted into buf.
+  The return value is whatever read_next() returns.
+*/
+int ha_federatedx::index_next(uchar *buf)
+{
+  DBUG_ENTER("ha_federatedx::index_next");
+  ha_statistic_increment(&SSV::ha_read_next_count);
+  DBUG_RETURN(read_next(buf, stored_result));
+}
+
+
+/*
+  rnd_init() is called when the system wants the storage engine to do a table
+  scan.
+
+  This is the method that gets data for the SELECT calls.
+
+  See the federatedx in the introduction at the top of this file to see when
+  rnd_init() is called.
+
+  Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
+  sql_table.cc, and sql_update.cc.
+*/
+
+int ha_federatedx::rnd_init(bool scan)
+{
+  int acquire_error;
+  DBUG_ENTER("ha_federatedx::rnd_init");
+  /*
+    The use of the 'scan' flag is incredibly important for this handler
+    to work properly, especially with updates containing WHERE clauses
+    using indexed columns.
+
+    When the initial query contains a WHERE clause of the query using an
+    indexed column, it's index_read_idx that selects the exact record from
+    the foreign database.
+
+    When there is NO index in the query, either due to not having a WHERE
+    clause, or the WHERE clause is using columns that are not indexed, a
+    'full table scan' done by rnd_init, which in this situation simply means
+    a 'select * from ...' on the foreign table.
+
+    In other words, this 'scan' flag gives us the means to ensure that if
+    there is an index involved in the query, we want index_read_idx to
+    retrieve the exact record (scan flag is 0), and do not want rnd_init
+    to do a 'full table scan' and wipe out that result set.
+
+    Prior to using this flag, the problem was most apparent with updates.
+
+    An initial query like 'UPDATE tablename SET anything = whatever WHERE
+    indexedcol = someval', index_read_idx would get called, using a query
+    constructed with a WHERE clause built from the values of index ('indexcol'
+    in this case, having a value of 'someval').  mysql_store_result would
+    then get called (this would be the result set we want to use).
+
+    After this rnd_init (from sql_update.cc) would be called, it would then
+    unnecessarily call "select * from table" on the foreign table, then call
+    mysql_store_result, which would wipe out the correct previous result set
+    from the previous call of index_read_idx's that had the result set
+    containing the correct record, hence update the wrong row!
+
+  */
+
+  /* Not a scan: keep whatever result set is already stored. */
+  if (!scan)
+    DBUG_RETURN(0);
+
+  if ((acquire_error= txn->acquire(share, TRUE, &io)))
+    DBUG_RETURN(acquire_error);
+
+  if (stored_result)
+    (void) free_result();
+
+  /* Run the share's full SELECT and keep its result set for rnd_next(). */
+  if (io->query(share->select_query,
+                strlen(share->select_query)))
+    DBUG_RETURN(stash_remote_error());
+
+  if (!(stored_result= io->store_result()))
+    DBUG_RETURN(stash_remote_error());
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  End of a table scan. All the work is delegated to index_end(), whose
+  return value is passed through unchanged.
+*/
+int ha_federatedx::rnd_end()
+{
+  DBUG_ENTER("ha_federatedx::rnd_end");
+  DBUG_RETURN(index_end());
+}
+
+
+/*
+  Let go of the current stored_result.
+
+  Depending on state the result set is either left for later (it is, or is
+  put, in the 'results' array) or freed right away through an io object.
+  In every case stored_result and position_called are cleared afterwards.
+
+  Must only be called while stored_result is set (asserted below).
+
+  RETURN VALUE
+    Always 0.
+*/
+int ha_federatedx::free_result()
+{
+  int error;
+  DBUG_ENTER("ha_federatedx::free_result");
+  DBUG_ASSERT(stored_result);
+  /* Already tracked in 'results': nothing to free here. */
+  for (uint i= 0; i < results.elements; ++i)
+  {
+    FEDERATEDX_IO_RESULT *result= 0;
+    get_dynamic(&results, (uchar*) &result, i);
+    if (result == stored_result)
+      goto end;
+  }
+  if (position_called)
+  {
+    /*
+      position() marked a row in this result set, so keep it in 'results'
+      instead of freeing it (presumably so rnd_pos() can still seek into
+      it -- confirm against the io implementation).
+    */
+    insert_dynamic(&results, (uchar*) &stored_result);
+  }
+  else
+  {
+    /*
+      Free immediately. Use the handler's io if there is one; otherwise
+      acquire a temporary io (iop then points at tmp_io).
+    */
+    federatedx_io *tmp_io= 0, **iop;
+    if (!*(iop= &io) && (error= txn->acquire(share, TRUE, (iop= &tmp_io))))
+    {
+      DBUG_ASSERT(0);                             // Fail when testing
+      /* No io available: park the result set in 'results' instead. */
+      insert_dynamic(&results, (uchar*) &stored_result);
+      goto end;
+    }
+    (*iop)->free_result(stored_result);
+    /* tmp_io is still 0 here when the handler's own io was used. */
+    txn->release(&tmp_io);
+  }
+end:
+  stored_result= 0;
+  position_called= FALSE;
+  DBUG_RETURN(0);
+}
+
+int ha_federatedx::index_end(void)
+{
+  DBUG_ENTER("ha_federatedx::index_end");
+  /* Release the current result set, if any, and forget the active key. */
+  const int error= stored_result ? free_result() : 0;
+  active_index= MAX_KEY;
+  DBUG_RETURN(error);
+}
+
+
+/*
+  This is called for each row of the table scan. When you run out of records
+  you should return HA_ERR_END_OF_FILE. Fill buff up with the row information.
+  The Field structure for the table is the key to getting data into buf
+  in a manner that will allow the server to understand it.
+
+  Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
+  sql_table.cc, and sql_update.cc.
+*/
+
+int ha_federatedx::rnd_next(uchar *buf)
+{
+  DBUG_ENTER("ha_federatedx::rnd_next");
+
+  /* Normal case: hand out the next row of the pre-fetched result set. */
+  if (stored_result != 0)
+    DBUG_RETURN(read_next(buf, stored_result));
+
+  /*
+    Return value of rnd_init is not always checked (see records.cc),
+    so we can get here _even_ if there is _no_ pre-fetched result-set!
+    TODO: fix it. We can delete this in 5.1 when rnd_init() is checked.
+  */
+  DBUG_RETURN(1);
+}
+
+
+/*
+  ha_federatedx::read_next
+
+  reads from a result set and converts to mysql internal
+  format
+
+  SYNOPSIS
+    read_next()
+      buf       byte pointer to record
+      result    mysql result set
+
+    DESCRIPTION
+     This method is a wrapper method that reads one record from a result
+     set and converts it to the internal table format
+
+    RETURN VALUE
+      0                    no error
+      HA_ERR_END_OF_FILE   no row could be fetched from the result set
+      other non-zero       error
+*/
+
+int ha_federatedx::read_next(uchar *buf, FEDERATEDX_IO_RESULT *result)
+{
+  int rc;
+  FEDERATEDX_IO_ROW *fetched;
+  DBUG_ENTER("ha_federatedx::read_next");
+
+  /* Assume "not found"; cleared only after a successful conversion. */
+  table->status= STATUS_NOT_FOUND;
+
+  rc= txn->acquire(share, TRUE, &io);
+  if (rc)
+    DBUG_RETURN(rc);
+
+  /* Pull one row from the result set. */
+  fetched= io->fetch_row(result);
+  if (!fetched)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  /* Convert it into the table's internal row format in buf. */
+  rc= convert_row_to_internal_format(buf, fetched, result);
+  if (!rc)
+    table->status= 0;
+
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief      Store a reference to current row.
+
+  @details    During a query execution we may have different result sets (RS),
+              e.g. for different ranges. All the RS's used are stored in
+              memory and placed in @c results dynamic array. At the end of
+              execution all stored RS's are freed at once in the
+              @c ha_federated::reset().
+              So, in case of federated, a reference to current row is a
+              stored result address and current data cursor position.
+              As we keep all RS in memory during a query execution,
+              we can get any record using the reference any time until
+              @c ha_federated::reset() is called.
+              TODO: we don't have to store all RS's rows but only those
+              we call @c ha_federated::position() for, so we can free memory
+              where we store other rows in the @c ha_federated::index_end().
+
+  @param[in]  record  record data (unused)
+
+*/
+
+void ha_federatedx::position(const uchar *record __attribute__ ((unused)))
+{
+  DBUG_ENTER("ha_federatedx::position");
+
+  /* ref stays all-zero when there is no result set or no io. */
+  bzero(ref, ref_length);
+
+  if (stored_result && !txn->acquire(share, TRUE, &io))
+  {
+    /* Let the io layer write the current position of stored_result to ref. */
+    io->mark_position(stored_result, ref);
+    position_called= TRUE;
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  This is like rnd_next, but you are given a position to use to determine the
+  row. The position will be of the type that you stored in ref.
+
+  This method is required for an ORDER BY
+
+  Called from filesort.cc records.cc sql_insert.cc sql_select.cc sql_update.cc.
+*/
+
+int ha_federatedx::rnd_pos(uchar *buf, uchar *pos)
+{
+  int rc;
+  FEDERATEDX_IO_RESULT *target= stored_result;
+  DBUG_ENTER("ha_federatedx::rnd_pos");
+  ha_statistic_increment(&SSV::ha_read_rnd_count);
+
+  /* We have to move this to 'ref' to get things aligned */
+  bmove(ref, pos, ref_length);
+
+  /* Get an io, then let it reposition 'target' from the saved reference. */
+  rc= txn->acquire(share, TRUE, &io);
+  if (!rc)
+    rc= io->seek_position(&target, ref);
+
+  if (rc)
+  {
+    table->status= STATUS_NOT_FOUND;
+    DBUG_RETURN(rc);
+  }
+
+  rc= read_next(buf, target);
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  ::info() is used to return information to the optimizer.
+  Currently this table handler doesn't implement most of the fields
+  really needed. SHOW also makes use of this data
+  Another note, you will probably want to have the following in your
+  code:
+  if (records < 2)
+    records = 2;
+  The reason is that the server will optimize for cases of only a single
+  record. If in a table scan you don't know the number of records
+  it will probably be better to set records to two so you can return
+  as many records as you need.
+  Along with records a few more variables you may wish to set are:
+    records
+    deleted
+    data_file_length
+    index_file_length
+    delete_length
+    check_time
+  Take a look at the public variables in handler.h for more information.
+
+  Called in:
+    filesort.cc
+    ha_heap.cc
+    item_sum.cc
+    opt_sum.cc
+    sql_delete.cc
+    sql_delete.cc
+    sql_derived.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_show.cc
+    sql_show.cc
+    sql_show.cc
+    sql_show.cc
+    sql_table.cc
+    sql_union.cc
+    sql_update.cc
+
+*/
+
+/*
+  Fill in handler statistics from the remote table.
+
+  flag selects what is refreshed:
+    HA_STATUS_CONST     stats.block_size, plus table_metadata()
+    HA_STATUS_VARIABLE  table_metadata()
+    HA_STATUS_AUTO      stats.auto_increment_value
+  For any other flag no io is needed and nothing is fetched.
+
+  RETURN VALUE
+    0          OK
+    non-zero   the acquire() error, ER_QUERY_ON_FOREIGN_DATA_SOURCE, or
+               remote_error_number
+*/
+int ha_federatedx::info(uint flag)
+{
+  uint error_code;
+  THD *thd= current_thd;
+  federatedx_txn *tmp_txn;
+  federatedx_io *tmp_io= 0, **iop= 0;
+  DBUG_ENTER("ha_federatedx::info");
+
+  error_code= ER_QUERY_ON_FOREIGN_DATA_SOURCE;
+
+  // external_lock may not have been called so txn may not be set
+  tmp_txn= get_txn(thd);
+
+  /* we want not to show table status if not needed to do so */
+  if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST | HA_STATUS_AUTO))
+  {
+    /* Use the handler's io when there is one, else a temporary io. */
+    iop= &io;
+    if (!*iop)
+    {
+      iop= &tmp_io;
+      /*
+        Keep the acquire() status in its own variable: a successful
+        acquire() returns 0 and must not overwrite the default error code,
+        otherwise a later table_metadata() failure would be reported to
+        the user but returned to the caller as 0 (success).
+      */
+      const uint acquire_error= tmp_txn->acquire(share, TRUE, iop);
+      if (acquire_error)
+      {
+        error_code= acquire_error;
+        goto fail;
+      }
+    }
+  }
+
+  if (flag & (HA_STATUS_VARIABLE | HA_STATUS_CONST))
+  {
+    /*
+      size of IO operations (This is based on a good guess, no high science
+      involved)
+    */
+    if (flag & HA_STATUS_CONST)
+      stats.block_size= 4096;
+
+    if ((*iop)->table_metadata(&stats, share->table_name,
+                               share->table_name_length, flag))
+      goto error;
+  }
+
+  if (flag & HA_STATUS_AUTO)
+    stats.auto_increment_value= (*iop)->last_insert_id();
+
+  /*
+    If ::info created it's own transaction, close it. This happens in case
+    of show table status;
+  */
+  tmp_txn->release(&tmp_io);
+
+  DBUG_RETURN(0);
+
+error:
+  if (iop && *iop)
+  {
+    my_printf_error((*iop)->error_code(), "Received error: %d : %s", MYF(0),
+                    (*iop)->error_code(), (*iop)->error_str());
+  }
+  else if (remote_error_number != -1 /* error already reported */)
+  {
+    error_code= remote_error_number;
+    my_error(error_code, MYF(0), ER(error_code));
+  }
+fail:
+  /* tmp_io is 0 unless a temporary io was acquired above. */
+  tmp_txn->release(&tmp_io);
+  DBUG_RETURN(error_code);
+}
+
+
+/**
+  @brief Handles extra signals from MySQL server
+
+  @param[in] operation  Hint for storage engine
+
+  @return Operation Status
+  @retval 0     OK
+ */
+int ha_federatedx::extra(ha_extra_function operation)
+{
+  DBUG_ENTER("ha_federatedx::extra");
+  switch (operation) {
+  /* Duplicate-key hints */
+  case HA_EXTRA_IGNORE_DUP_KEY:
+    ignore_duplicates= TRUE;
+    break;
+  case HA_EXTRA_NO_IGNORE_DUP_KEY:
+    ignore_duplicates= FALSE;
+    insert_dup_update= FALSE;
+    break;
+  case HA_EXTRA_INSERT_WITH_UPDATE:
+    insert_dup_update= TRUE;
+    break;
+
+  /* Replace hints */
+  case HA_EXTRA_WRITE_CAN_REPLACE:
+    replace_duplicates= TRUE;
+    break;
+  case HA_EXTRA_WRITE_CANNOT_REPLACE:
+    /*
+      We use this flag to ensure that we do not create an "INSERT IGNORE"
+      statement when inserting new rows into the remote table.
+    */
+    replace_duplicates= FALSE;
+    break;
+
+  /* The table is about to be dropped */
+  case HA_EXTRA_PREPARE_FOR_DROP:
+    table_will_be_deleted= TRUE;
+    break;
+
+  default:
+    /* Every other hint is ignored; it is only traced. */
+    DBUG_PRINT("info",("unhandled operation: %d", (uint) operation));
+    break;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Reset state of file to after 'open'.
+
+  @detail This function is called after every statement for all tables
+    used by that statement.
+
+  @return Operation status
+    @retval     0       OK
+*/
+
+int ha_federatedx::reset(void)
+{
+  int error= 0;
+  federatedx_txn *tmp_txn;
+  federatedx_io *tmp_io= 0;
+  federatedx_io **iop= &io;
+
+  /* Forget all per-statement hints. */
+  position_called= FALSE;
+  replace_duplicates= FALSE;
+  ignore_duplicates= FALSE;
+  insert_dup_update= FALSE;
+
+  /* Queue the current result set with the others so it is freed below. */
+  if (stored_result)
+  {
+    insert_dynamic(&results, (uchar*) &stored_result);
+    stored_result= 0;
+  }
+
+  if (!results.elements)
+    return 0;
+
+  // external_lock may not have been called so txn may not be set
+  tmp_txn= get_txn(current_thd);
+
+  /* Without an io of our own, borrow a temporary one for the cleanup. */
+  if (!*iop)
+  {
+    iop= &tmp_io;
+    if ((error= tmp_txn->acquire(share, TRUE, iop)))
+    {
+      DBUG_ASSERT(0);                             // Fail when testing
+      return error;
+    }
+  }
+
+  /* Free every result set collected during the statement. */
+  for (uint idx= 0; idx < results.elements; idx++)
+  {
+    FEDERATEDX_IO_RESULT *collected= 0;
+    get_dynamic(&results, (uchar*) &collected, idx);
+    (*iop)->free_result(collected);
+  }
+  tmp_txn->release(&tmp_io);
+  reset_dynamic(&results);
+
+  return error;
+}
+
+/*
+  Used to delete all rows in a table. Both for cases of truncate and
+  for cases where the optimizer realizes that all rows will be
+  removed as a result of a SQL statement.
+
+  Called from item_sum.cc by Item_func_group_concat::clear(),
+  Item_sum_count_distinct::clear(), and Item_func_group_concat::clear().
+  Called from sql_delete.cc by mysql_delete().
+  Called from sql_select.cc by JOIN::reinit().
+  Called from sql_union.cc by st_select_lex_unit::exec().
+*/
+
+int ha_federatedx::delete_all_rows()
+{
+  char truncate_storage[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String truncate_sql(truncate_storage, sizeof(truncate_storage),
+                      &my_charset_bin);
+  int error;
+  DBUG_ENTER("ha_federatedx::delete_all_rows");
+
+  /* Build "TRUNCATE <quoted remote table name>". */
+  truncate_sql.length(0);
+  truncate_sql.set_charset(system_charset_info);
+  truncate_sql.append(STRING_WITH_LEN("TRUNCATE "));
+  append_ident(&truncate_sql, share->table_name, share->table_name_length,
+               ident_quote_char);
+
+  /* no need for savepoint in autocommit mode */
+  if (!(ha_thd()->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+    txn->stmt_autocommit();
+
+  /*
+    TRUNCATE won't return anything in mysql_affected_rows
+  */
+
+  error= txn->acquire(share, FALSE, &io);
+  if (error)
+    DBUG_RETURN(error);
+
+  if (io->query(truncate_sql.ptr(), truncate_sql.length()))
+    DBUG_RETURN(stash_remote_error());
+
+  /* Count every row known so far as deleted. */
+  stats.deleted+= stats.records;
+  stats.records= 0;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  The idea with handler::store_lock() is the following:
+
+  The statement decided which locks we should need for the table
+  for updates/deletes/inserts we get WRITE locks, for SELECT... we get
+  read locks.
+
+  Before adding the lock into the table lock handler (see thr_lock.c)
+  mysqld calls store lock with the requested locks.  Store lock can now
+  modify a write lock to a read lock (or some other lock), ignore the
+  lock (if we don't want to use MySQL table locks at all) or add locks
+  for many tables (like we do when we are using a MERGE handler).
+
+  FederatedX changes all WRITE locks to TL_WRITE_ALLOW_WRITE
+  (which signals that we are doing WRITES, but we are still allowing other
+  readers and writers).
+
+  When releasing locks, store_lock() are also called. In this case one
+  usually doesn't have to do anything.
+
+  In some exceptional cases MySQL may send a request for a TL_IGNORE;
+  This means that we are requesting the same lock as last time and this
+  should also be ignored. (This may happen when someone does a flush
+  table when we have opened a part of the tables, in which case mysqld
+  closes and reopens the tables and tries to get the same locks at last
+  time).  In the future we will probably try to remove this.
+
+  Called from lock.cc by get_lock_data().
+*/
+
+THR_LOCK_DATA **ha_federatedx::store_lock(THD *thd,
+                                         THR_LOCK_DATA **to,
+                                         enum thr_lock_type lock_type)
+{
+  DBUG_ENTER("ha_federatedx::store_lock");
+
+  /* Only pick a lock type when one is requested and none is stored yet. */
+  const bool choose_type= (lock_type != TL_IGNORE && lock.type == TL_UNLOCK);
+  if (choose_type)
+  {
+    const bool under_lock_tables= thd->in_lock_tables;
+
+    /*
+      Here is where we get into the guts of a row level lock.
+      If we are not doing a LOCK TABLE or DISCARD/IMPORT
+      TABLESPACE, then allow multiple writers
+    */
+    if (!under_lock_tables &&
+        lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE)
+      lock_type= TL_WRITE_ALLOW_WRITE;
+
+    /*
+      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
+      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
+      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
+      to t2. Convert the lock to a normal read lock to allow
+      concurrent inserts to t2.
+    */
+    if (!under_lock_tables && lock_type == TL_READ_NO_INSERT)
+      lock_type= TL_READ;
+
+    lock.type= lock_type;
+  }
+
+  /* Append our lock to the caller's array and return the next free slot. */
+  *to= &lock;
+  DBUG_RETURN(to + 1);
+}
+
+
+/*
+  Check that the remote table described by share can be queried over io.
+
+  Sends "SELECT * FROM <table> WHERE 1=0". If the query fails,
+  ER_CANT_CREATE_FEDERATED_TABLE is reported with the database, username
+  and hostname of the share. Otherwise the (empty) result set is stored
+  and freed again.
+
+  RETURN VALUE
+    The return value of io->query(): 0 on success, non-zero on failure.
+*/
+static int test_connection(MYSQL_THD thd, federatedx_io *io,
+                           FEDERATEDX_SHARE *share)
+{
+  char buffer[FEDERATEDX_QUERY_BUFFER_SIZE];
+  String str(buffer, sizeof(buffer), &my_charset_bin);
+  FEDERATEDX_IO_RESULT *resultset= NULL;
+  int retval;
+
+  str.length(0);
+  str.append(STRING_WITH_LEN("SELECT * FROM "));
+  append_identifier(thd, &str, share->table_name,
+                    share->table_name_length);
+  str.append(STRING_WITH_LEN(" WHERE 1=0"));
+
+  if ((retval= io->query(str.ptr(), str.length())))
+  {
+    /*
+      buffer is reused for the error text. The three strings come from the
+      connection URL and have no length limit tied to this buffer, so the
+      formatting must be bounded by sizeof(buffer).
+    */
+    my_snprintf(buffer, sizeof(buffer),
+                "database: '%s'  username: '%s'  hostname: '%s'",
+                share->database, share->username, share->hostname);
+    DBUG_PRINT("info", ("error-code: %d", io->error_code()));
+    my_error(ER_CANT_CREATE_FEDERATED_TABLE, MYF(0), buffer);
+  }
+  else
+    resultset= io->store_result();
+
+  /* resultset is still NULL here when the query failed. */
+  io->free_result(resultset);
+
+  return retval;
+}
+
+/*
+  create() has no local setup of its own to do. It parses the connection
+  URL and, where possible, tests the connection to the foreign table.
+*/
+
+int ha_federatedx::create(const char *name, TABLE *table_arg,
+                         HA_CREATE_INFO *create_info)
+{
+  int retval;
+  THD *thd= current_thd;
+  FEDERATEDX_SHARE tmp_share; // Only a temporary share, to test the url
+  federatedx_txn *tmp_txn;
+  federatedx_io *tmp_io= NULL;
+  DBUG_ENTER("ha_federatedx::create");
+
+  /* Parse the connection URL of table_arg into the temporary share. */
+  if ((retval= parse_url(thd->mem_root, &tmp_share, table_arg, 1)))
+    goto error;
+
+  /*
+    loopback socket connections hang due to LOCK_open mutex.
+    retval is 0 at this point, so such a table is created successfully
+    without its connection being tested.
+  */
+  if ((!tmp_share.hostname || !strcmp(tmp_share.hostname,my_localhost)) &&
+      !tmp_share.port)
+    goto error;
+
+  /*
+    If possible, we try to use an existing network connection to
+    the remote server. To ensure that no new FEDERATEDX_SERVER
+    instance is created, we pass NULL in get_server() TABLE arg.
+  */
+  pthread_mutex_lock(&federatedx_mutex);
+  tmp_share.s= get_server(&tmp_share, NULL);
+  pthread_mutex_unlock(&federatedx_mutex);
+
+  if (tmp_share.s)
+  {
+    /*
+      A matching server exists: test the connection through an io
+      acquired from this thread's txn, then drop the server reference.
+    */
+    tmp_txn= get_txn(thd);
+    if (!(retval= tmp_txn->acquire(&tmp_share, TRUE, &tmp_io)))
+    {
+      retval= test_connection(thd, tmp_io, &tmp_share);
+      tmp_txn->release(&tmp_io);
+    }
+    free_server(tmp_txn, tmp_share.s);
+  }
+  else
+  {
+    /*
+      No existing server: describe one in a local FEDERATEDX_SERVER and
+      test the connection through an io constructed just for this call.
+    */
+    FEDERATEDX_SERVER server;
+
+#ifdef NOT_YET
+    /*
+      Bug#25679
+      Ensure that we do not hold the LOCK_open mutex while attempting
+      to establish FederatedX connection to guard against a trivial
+      Denial of Service scenerio.
+    */
+    safe_mutex_assert_not_owner(&LOCK_open);
+#endif
+
+    fill_server(thd->mem_root, &server, &tmp_share, create_info->table_charset);
+
+    /*
+      Debug builds only: hold server.mutex around the test.
+      NOTE(review): presumably this satisfies mutex-ownership assertions
+      in the io layer -- confirm.
+    */
+#ifndef DBUG_OFF
+    pthread_mutex_init(&server.mutex, MY_MUTEX_INIT_FAST);
+    pthread_mutex_lock(&server.mutex);
+#endif
+
+    tmp_io= federatedx_io::construct(thd->mem_root, &server);
+
+    retval= test_connection(thd, tmp_io, &tmp_share);
+
+#ifndef DBUG_OFF
+    pthread_mutex_unlock(&server.mutex);
+    pthread_mutex_destroy(&server.mutex);
+#endif
+
+    delete tmp_io;
+  }
+
+  /* Common exit: also reached on success, retval is 0 then. */
+error:
+  DBUG_RETURN(retval);
+
+}
+
+
+int ha_federatedx::stash_remote_error()
+{
+  DBUG_ENTER("ha_federatedx::stash_remote_error()");
+
+  /* Without an io there is nothing new to record; reuse the last number. */
+  if (!io)
+    DBUG_RETURN(remote_error_number);
+
+  /* Keep the remote error number and text for get_error_message(). */
+  remote_error_number= io->error_code();
+  strmake(remote_error_buf, io->error_str(), sizeof(remote_error_buf)-1);
+
+  /* Remote duplicate-key errors map to the handler's duplicate-key code. */
+  const bool is_duplicate= (remote_error_number == ER_DUP_ENTRY ||
+                            remote_error_number == ER_DUP_KEY);
+  if (is_duplicate)
+    DBUG_RETURN(HA_ERR_FOUND_DUPP_KEY);
+
+  DBUG_RETURN(HA_FEDERATEDX_ERROR_WITH_REMOTE_SYSTEM);
+}
+
+
+/*
+  Produce the message text for a handler error.
+
+  Only HA_FEDERATEDX_ERROR_WITH_REMOTE_SYSTEM is handled: the remote error
+  number and text saved by stash_remote_error() are appended to buf and
+  then cleared. For any other error buf is left unchanged.
+
+  RETURN VALUE
+    Always FALSE.
+*/
+bool ha_federatedx::get_error_message(int error, String* buf)
+{
+  DBUG_ENTER("ha_federatedx::get_error_message");
+  DBUG_PRINT("enter", ("error: %d", error));
+  if (error == HA_FEDERATEDX_ERROR_WITH_REMOTE_SYSTEM)
+  {
+    /* "Error on remote system: <number>: <text>" */
+    buf->append(STRING_WITH_LEN("Error on remote system: "));
+    buf->qs_append(remote_error_number);
+    buf->append(STRING_WITH_LEN(": "));
+    buf->append(remote_error_buf);
+
+    /* The saved error has been consumed. */
+    remote_error_number= 0;
+    remote_error_buf[0]= '\0';
+  }
+  DBUG_PRINT("exit", ("message: %s", buf->ptr()));
+  DBUG_RETURN(FALSE);
+}
+
+
+int ha_federatedx::start_stmt(MYSQL_THD thd, thr_lock_type lock_type)
+{
+  DBUG_ENTER("ha_federatedx::start_stmt");
+  DBUG_ASSERT(txn == get_txn(thd));
+
+  /* Nothing to do when a transaction is already in progress. */
+  if (txn->in_transaction())
+    DBUG_RETURN(0);
+
+  /* Begin a statement and register the engine at statement level. */
+  txn->stmt_begin();
+  trans_register_ha(thd, FALSE, ht);
+  DBUG_RETURN(0);
+}
+
+
+int ha_federatedx::external_lock(MYSQL_THD thd, int lock_type)
+{
+  int error;
+  DBUG_ENTER("ha_federatedx::external_lock");
+
+  /* Unlock: hand the io back to the txn. */
+  if (lock_type == F_UNLCK)
+  {
+    txn->release(&io);
+    DBUG_RETURN(0);
+  }
+
+  /* Lock: bind to this thread's txn and get an io from it. */
+  table_will_be_deleted= FALSE;
+  txn= get_txn(thd);
+  error= txn->acquire(share, lock_type == F_RDLCK, &io);
+  if (error)
+    DBUG_RETURN(error);
+
+  /* Nothing to begin or register for a non-write lock on an autocommit io. */
+  if (lock_type != F_WRLCK && io->is_autocommit())
+    DBUG_RETURN(0);
+
+  if (thd_test_options(thd, (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+  {
+    /* One of the two options is set: register for the whole transaction. */
+    txn->txn_begin();
+    trans_register_ha(thd, TRUE, ht);
+  }
+  else
+  {
+    /* Neither option is set: register for this statement only. */
+    txn->stmt_begin();
+    trans_register_ha(thd, FALSE, ht);
+  }
+
+  DBUG_RETURN(0);
+}
+
+
+int ha_federatedx::savepoint_set(handlerton *hton, MYSQL_THD thd, void *sv)
+{
+  federatedx_txn *sp_txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
+  DBUG_ENTER("ha_federatedx::savepoint_set");
+
+  /* No txn for this thread, or no connections in it: nothing to do. */
+  if (!sp_txn || !sp_txn->has_connections())
+    DBUG_RETURN(0);
+
+  /* Register at transaction level when txn_begin() returns non-zero. */
+  if (sp_txn->txn_begin())
+    trans_register_ha(thd, TRUE, hton);
+
+  /* The savepoint value is written into the caller's sv storage. */
+  sp_txn->sp_acquire((ulong *) sv);
+
+  DBUG_ASSERT(1 < *(ulong *) sv);
+
+  DBUG_RETURN(0);
+}
+
+
+int ha_federatedx::savepoint_rollback(handlerton *hton, MYSQL_THD thd, void *sv)
+{
+  federatedx_txn *sp_txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
+  int error;
+  DBUG_ENTER("ha_federatedx::savepoint_rollback");
+
+  /* Without a txn for this thread there is nothing to roll back. */
+  error= sp_txn ? sp_txn->sp_rollback((ulong *) sv) : 0;
+
+  DBUG_RETURN(error);
+}
+
+
+int ha_federatedx::savepoint_release(handlerton *hton, MYSQL_THD thd, void *sv)
+{
+  federatedx_txn *sp_txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
+  int error;
+  DBUG_ENTER("ha_federatedx::savepoint_release");
+
+  /* Without a txn for this thread there is nothing to release. */
+  error= sp_txn ? sp_txn->sp_release((ulong *) sv) : 0;
+
+  DBUG_RETURN(error);
+}
+
+
+int ha_federatedx::commit(handlerton *hton, MYSQL_THD thd, bool all)
+{
+  federatedx_txn *cur_txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
+  int return_val;
+  DBUG_ENTER("ha_federatedx::commit");
+
+  /* 'all' selects the transaction commit, otherwise the statement commit. */
+  return_val= all ? cur_txn->txn_commit() : cur_txn->stmt_commit();
+
+  DBUG_PRINT("info", ("error val: %d", return_val));
+  DBUG_RETURN(return_val);
+}
+
+
+int ha_federatedx::rollback(handlerton *hton, MYSQL_THD thd, bool all)
+{
+  federatedx_txn *cur_txn= (federatedx_txn *) thd_get_ha_data(thd, hton);
+  int return_val;
+  DBUG_ENTER("ha_federatedx::rollback");
+
+  /* 'all' selects the transaction rollback, otherwise the statement one. */
+  return_val= all ? cur_txn->txn_rollback() : cur_txn->stmt_rollback();
+
+  DBUG_PRINT("info", ("error val: %d", return_val));
+  DBUG_RETURN(return_val);
+}
+
+/* Storage engine descriptor referenced by both plugin declarations below. */
+struct st_mysql_storage_engine federatedx_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/*
+  The engine is declared twice with the same name, author, description,
+  license, init/deinit functions and version: once with
+  mysql_declare_plugin and once with maria_declare_plugin.
+*/
+mysql_declare_plugin(federatedx)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &federatedx_storage_engine,
+  "FEDERATED",
+  "Patrick Galbraith",
+  "FederatedX pluggable storage engine",
+  PLUGIN_LICENSE_GPL,
+  federatedx_db_init, /* Plugin Init */
+  federatedx_done, /* Plugin Deinit */
+  0x0200 /* 2.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  NULL                        /* config options                  */
+}
+mysql_declare_plugin_end;
+/*
+  Same plugin in the maria_declare_plugin form, which ends with a string
+  version and a maturity level instead of the config options slot.
+*/
+maria_declare_plugin(federatedx)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &federatedx_storage_engine,
+  "FEDERATED",
+  "Patrick Galbraith",
+  "FederatedX pluggable storage engine",
+  PLUGIN_LICENSE_GPL,
+  federatedx_db_init, /* Plugin Init */
+  federatedx_done, /* Plugin Deinit */
+  0x0200 /* 2.0 */,
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "2.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_BETA /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/federatedx/ha_federatedx.h b/storage/federatedx/ha_federatedx.h
new file mode 100644
index 00000000000..2820f8a6c29
--- /dev/null
+++ b/storage/federatedx/ha_federatedx.h
@@ -0,0 +1,457 @@
+/*
+Copyright (c) 2008, Patrick Galbraith
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+
+    * Neither the name of Patrick Galbraith nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+
+class federatedx_io;
+
+/*
+  FEDERATEDX_SERVER will eventually be a structure that will be shared among
+  all FEDERATEDX_SHARE instances so that the federated server can minimise
+  the number of open connections. This will eventually lead to the support
+  of reliable XA federated tables.
+*/
+typedef struct st_fedrated_server { /* NOTE(review): tag is spelled "fedrated" (sic) */
+  MEM_ROOT mem_root;
+  uint use_count, io_count;
+
+  uchar *key;
+  uint key_length;      /* presumably the length of key -- confirm */
+
+  const char *scheme;
+  const char *hostname; /* returned by federatedx_io::get_hostname() */
+  const char *username; /* returned by federatedx_io::get_username() */
+  const char *password; /* returned by federatedx_io::get_password() */
+  const char *database; /* returned by federatedx_io::get_database() */
+  const char *socket;   /* returned by federatedx_io::get_socket() */
+  ushort port;          /* returned by federatedx_io::get_port() */
+
+  const char *csname;   /* if NULL, get_charsetname() returns "latin1" */
+
+  pthread_mutex_t mutex;
+  federatedx_io *idle_list; /* presumably chained via idle_next -- confirm */
+} FEDERATEDX_SERVER;
+
+/*
+  Please read ha_example.cc before reading this file.
+  Please keep in mind that the federatedx storage engine implements all methods
+  that are required to be implemented. handler.h has a full list of methods
+  that you can implement.
+*/
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+#include <mysql.h>
+
+/*
+  handler::print_error has a case statement for error numbers.
+  This value (10000) is far out of range and will invoke the
+  default: case.
+  (Current error range is 120-159 from include/my_base.h)
+*/
+#define HA_FEDERATEDX_ERROR_WITH_REMOTE_SYSTEM 10000
+/* Parenthesized so the expansion stays one operand inside larger expressions */
+#define FEDERATEDX_QUERY_BUFFER_SIZE (STRING_BUFFER_USUAL_SIZE * 5)
+#define FEDERATEDX_RECORDS_IN_RANGE 2
+#define FEDERATEDX_MAX_KEY_LENGTH 3500 // Same as innodb
+
+/*
+  FEDERATEDX_SHARE is a structure that will be shared among all open handlers.
+  The example implements the minimum of what you will probably need.
+*/
+typedef struct st_federatedx_share {
+  MEM_ROOT mem_root;
+
+  bool parsed;
+  /* this key is unique db/tablename */
+  const char *share_key;
+  /*
+    the primary select query to be used in rnd_init
+  */
+  char *select_query;
+  /*
+    remote host info, parse_url supplies
+  */
+  char *server_name;
+  char *connection_string;
+  char *scheme;
+  char *hostname;
+  char *username;
+  char *password;
+  char *database;
+  char *table_name;
+  char *table;
+  char *socket;
+  char *sport;           /* presumably the port in string form -- confirm */
+  int share_key_length;  /* presumably the length of share_key -- confirm */
+  ushort port;
+
+  uint table_name_length, server_name_length, connect_string_length;
+  uint use_count;
+  THR_LOCK lock;
+  FEDERATEDX_SERVER *s;  /* server record associated with this share */
+} FEDERATEDX_SHARE;
+
+
+typedef struct st_federatedx_result FEDERATEDX_IO_RESULT; /* only forward-declared here */
+typedef struct st_federatedx_row FEDERATEDX_IO_ROW;       /* only forward-declared here */
+typedef ptrdiff_t FEDERATEDX_IO_OFFSET; /* not referenced elsewhere in this header */
+/* Abstract interface through which the engine talks to one FEDERATEDX_SERVER. */
+class federatedx_io
+{
+  friend class federatedx_txn;
+  FEDERATEDX_SERVER * const server; /* source of the get_*() accessor values */
+  federatedx_io **owner_ptr;
+  federatedx_io *txn_next;  /* presumably links federatedx_txn::txn_list -- confirm */
+  federatedx_io *idle_next; /* presumably links FEDERATEDX_SERVER::idle_list -- confirm */
+  bool active;  /* currently participating in a transaction */
+  bool busy;    /* in use by a ha_federated instance */
+  bool readonly;/* indicates that no updates have occurred */
+
+protected:
+  void set_active(bool new_active)
+  { active= new_active; }
+public:
+  federatedx_io(FEDERATEDX_SERVER *);
+  virtual ~federatedx_io();
+
+  bool is_readonly() const { return readonly; }
+  bool is_active() const { return active; }
+
+  /* Falls back to "latin1" when the server record has no charset name. */
+  const char * get_charsetname() const
+  { return server->csname ? server->csname : "latin1"; }
+
+  const char * get_hostname() const { return server->hostname; }
+  const char * get_username() const { return server->username; }
+  const char * get_password() const { return server->password; }
+  const char * get_database() const { return server->database; }
+  ushort       get_port() const     { return server->port; }
+  const char * get_socket() const   { return server->socket; }
+
+  static bool handles_scheme(const char *scheme);
+  static federatedx_io *construct(MEM_ROOT *server_root,
+                                  FEDERATEDX_SERVER *server);
+
+  static void *operator new(size_t size, MEM_ROOT *mem_root) throw ()
+  { return alloc_root(mem_root, size); } /* storage is taken from the MEM_ROOT */
+  static void operator delete(void *ptr, size_t size)
+  { TRASH(ptr, size); }                  /* only calls TRASH(); no free here */
+
+  virtual int query(const char *buffer, uint length)=0;
+  virtual FEDERATEDX_IO_RESULT *store_result()=0;
+
+  virtual size_t max_query_size() const=0;
+
+  virtual my_ulonglong affected_rows() const=0;
+  virtual my_ulonglong last_insert_id() const=0;
+
+  virtual int error_code()=0;
+  virtual const char *error_str()=0;
+
+  virtual void reset()=0;
+  virtual int commit()=0;
+  virtual int rollback()=0;
+
+  /* savepoint operations */
+  virtual int savepoint_set(ulong sp)=0;
+  virtual ulong savepoint_release(ulong sp)=0;
+  virtual ulong savepoint_rollback(ulong sp)=0;
+  virtual void savepoint_restrict(ulong sp)=0;
+
+  virtual ulong last_savepoint() const=0;
+  virtual ulong actual_savepoint() const=0;
+  virtual bool is_autocommit() const=0;
+
+  virtual bool table_metadata(ha_statistics *stats, const char *table_name,
+                              uint table_name_length, uint flag) = 0;
+
+  /* resultset operations */
+
+  virtual void free_result(FEDERATEDX_IO_RESULT *io_result)=0;
+  virtual unsigned int get_num_fields(FEDERATEDX_IO_RESULT *io_result)=0;
+  virtual my_ulonglong get_num_rows(FEDERATEDX_IO_RESULT *io_result)=0;
+  virtual FEDERATEDX_IO_ROW *fetch_row(FEDERATEDX_IO_RESULT *io_result)=0;
+  virtual ulong *fetch_lengths(FEDERATEDX_IO_RESULT *io_result)=0;
+  virtual const char *get_column_data(FEDERATEDX_IO_ROW *row,
+                                      unsigned int column)=0;
+  virtual bool is_column_null(const FEDERATEDX_IO_ROW *row,
+                              unsigned int column) const=0;
+
+  /* row position bookmarks */
+  virtual size_t get_ref_length() const=0;
+  virtual void mark_position(FEDERATEDX_IO_RESULT *io_result,
+                             void *ref)=0;
+  virtual int seek_position(FEDERATEDX_IO_RESULT **io_result,
+                            const void *ref)=0;
+
+};
+
+/* Tracks a list of federatedx_io objects (txn_list) plus savepoint counters. */
+class federatedx_txn
+{
+  federatedx_io *txn_list; /* presumably chained via federatedx_io::txn_next -- confirm */
+  ulong savepoint_level;
+  ulong savepoint_stmt;
+  ulong savepoint_next;    /* non-zero means in_transaction() is true */
+
+  void release_scan();
+public:
+  federatedx_txn();
+  ~federatedx_txn();
+
+  bool has_connections() const { return txn_list != NULL; }
+  bool in_transaction() const { return savepoint_next != 0; }
+  int acquire(FEDERATEDX_SHARE *share, bool readonly, federatedx_io **io);
+  void release(federatedx_io **io);
+  void close(FEDERATEDX_SERVER *);
+
+  /* whole-transaction operations */
+  bool txn_begin();
+  int txn_commit();
+  int txn_rollback();
+
+  /* savepoint operations */
+  bool sp_acquire(ulong *save);
+  int sp_rollback(ulong *save);
+  int sp_release(ulong *save);
+
+  /* statement-level operations */
+  bool stmt_begin();
+  int stmt_commit();
+  int stmt_rollback();
+  void stmt_autocommit();
+};
+
+
+/*
+  Class definition for the storage engine
+*/
+class ha_federatedx: public handler
+{
+  friend int federatedx_db_init(void *p);
+
+  THR_LOCK_DATA lock;      /* MySQL lock */
+  FEDERATEDX_SHARE *share;    /* Shared lock info */
+  federatedx_txn *txn;
+  federatedx_io *io;
+  FEDERATEDX_IO_RESULT *stored_result;
+  /**
+      Array of all stored results we get during a query execution.
+  */
+  DYNAMIC_ARRAY results;
+  bool position_called;
+  uint fetch_num; // stores the fetch num
+  int remote_error_number;
+  char remote_error_buf[FEDERATEDX_QUERY_BUFFER_SIZE];
+  bool ignore_duplicates, replace_duplicates;
+  bool insert_dup_update, table_will_be_deleted;
+  DYNAMIC_STRING bulk_insert;
+
+private:
+  /*
+      return 0 on success
+      return errorcode otherwise
+  */
+  uint convert_row_to_internal_format(uchar *buf, FEDERATEDX_IO_ROW *row,
+                                      FEDERATEDX_IO_RESULT *result);
+  bool create_where_from_key(String *to, KEY *key_info,
+                             const key_range *start_key,
+                             const key_range *end_key,
+                             bool records_in_range, bool eq_range);
+  int stash_remote_error();
+
+  federatedx_txn *get_txn(THD *thd, bool no_create= FALSE);
+
+  /* static callbacks taking (handlerton, thd) instead of a handler instance */
+  static int disconnect(handlerton *hton, MYSQL_THD thd);
+  static int savepoint_set(handlerton *hton, MYSQL_THD thd, void *sv);
+  static int savepoint_rollback(handlerton *hton, MYSQL_THD thd, void *sv);
+  static int savepoint_release(handlerton *hton, MYSQL_THD thd, void *sv);
+  static int commit(handlerton *hton, MYSQL_THD thd, bool all);
+  static int rollback(handlerton *hton, MYSQL_THD thd, bool all);
+
+  bool append_stmt_insert(String *query);
+
+  int read_next(uchar *buf, FEDERATEDX_IO_RESULT *result);
+  int index_read_idx_with_result_set(uchar *buf, uint index,
+                                     const uchar *key,
+                                     uint key_len,
+                                     ha_rkey_function find_flag,
+                                     FEDERATEDX_IO_RESULT **result);
+  int real_query(const char *query, uint length);
+  int real_connect(FEDERATEDX_SHARE *my_share, uint create_flag);
+public:
+  ha_federatedx(handlerton *hton, TABLE_SHARE *table_arg);
+  ~ha_federatedx() {}
+  /* The name that will be used for display purposes */
+  const char *table_type() const { return "FEDERATED"; }
+  /*
+    The name of the index type that will be used for display.
+    Don't implement this method unless you really have indexes.
+   */
+  // perhaps get index type
+  const char *index_type(uint inx) { return "REMOTE"; }
+  const char **bas_ext() const;
+  /*
+    This is a list of flags that says what the storage engine
+    implements. The current table flags are documented in
+    handler.h
+  */
+  ulonglong table_flags() const
+  {
+    /* fix server to be able to get remote server table flags */
+    return (HA_PRIMARY_KEY_IN_READ_INDEX | HA_FILE_BASED
+            | HA_REC_NOT_IN_SEQ | HA_AUTO_PART_KEY | HA_CAN_INDEX_BLOBS |
+            HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
+            HA_NO_PREFIX_CHAR_KEYS | HA_PRIMARY_KEY_REQUIRED_FOR_DELETE |
+            HA_PARTIAL_COLUMN_READ | HA_NULL_IN_KEY);
+  }
+  /*
+    This is a bitmap of flags that says how the storage engine
+    implements indexes. The current index flags are documented in
+    handler.h. If you do not implement indexes, just return zero
+    here.
+
+    part is the key part to check. First key part is 0.
+    If all_parts is set, MySQL wants to know the flags for the combined
+    index up to and including 'part'.
+  */
+    /* fix server to be able to get remote server index flags */
+  ulong index_flags(uint inx, uint part, bool all_parts) const
+  {
+    return (HA_READ_NEXT | HA_READ_RANGE | HA_READ_AFTER_KEY);
+  }
+  uint max_supported_record_length() const { return HA_MAX_REC_LENGTH; }
+  uint max_supported_keys()          const { return MAX_KEY; }
+  uint max_supported_key_parts()     const { return MAX_REF_PARTS; }
+  uint max_supported_key_length()    const { return FEDERATEDX_MAX_KEY_LENGTH; }
+  uint max_supported_key_part_length() const { return FEDERATEDX_MAX_KEY_LENGTH; }
+  /*
+    Called in test_quick_select to determine if indexes should be used.
+    Normally, we need to know number of blocks. For federatedx we need to
+    know number of blocks on remote side, and number of packets and blocks
+    on the network side (?)
+    Talk to Kostja about this - how to get the
+    number of rows * ...
+    disk scan time on other side (block size, size of the row) + network time ...
+    The reason for "records * 1000" is that such a large number forces
+    this to use indexes.
+  */
+  double scan_time()
+  {
+    DBUG_PRINT("info", ("records %lu", (ulong) stats.records));
+    return (double)(stats.records*1000);
+  }
+  /*
+    The next method will never be called if you do not implement indexes.
+  */
+  double read_time(uint index, uint ranges, ha_rows rows)
+  {
+    /*
+      Per Brian, this number is bogus, but this method must be implemented,
+      and at a later date, he intends to document this issue for handler code
+    */
+    return (double) rows /  20.0+1;
+  }
+
+  const key_map *keys_to_use_for_scanning() { return &key_map_full; }
+  /*
+    Everything below are methods that we implement in ha_federatedx.cc.
+
+    Most of these methods are not obligatory, skip them and
+    MySQL will treat them as not implemented
+  */
+  int open(const char *name, int mode, uint test_if_locked);    // required
+  int close(void);                                              // required
+
+  void start_bulk_insert(ha_rows rows);
+  int end_bulk_insert();
+  int write_row(uchar *buf);
+  int update_row(const uchar *old_data, uchar *new_data);
+  int delete_row(const uchar *buf);
+  int index_init(uint keynr, bool sorted);
+  ha_rows estimate_rows_upper_bound();
+  int index_read(uchar *buf, const uchar *key,
+                 uint key_len, enum ha_rkey_function find_flag);
+  int index_read_idx(uchar *buf, uint idx, const uchar *key,
+                     uint key_len, enum ha_rkey_function find_flag);
+  int index_next(uchar *buf);
+  int index_end();
+  int read_range_first(const key_range *start_key,
+                               const key_range *end_key,
+                               bool eq_range, bool sorted);
+  int read_range_next();
+  /*
+    unlike index_init(), rnd_init() can be called two times
+    without rnd_end() in between (it only makes sense if scan=1).
+    then the second call should prepare for the new table scan
+    (e.g. if rnd_init allocates the cursor, second call should
+    position it to the start of the table, no need to deallocate
+    and allocate it again)
+  */
+  int rnd_init(bool scan);                                      //required
+  int rnd_end();
+  int rnd_next(uchar *buf);                                      //required
+  int rnd_pos(uchar *buf, uchar *pos);                            //required
+  void position(const uchar *record);                            //required
+  int info(uint);                                              //required
+  int extra(ha_extra_function operation);
+
+  void update_auto_increment(void);
+  int repair(THD* thd, HA_CHECK_OPT* check_opt);
+  int optimize(THD* thd, HA_CHECK_OPT* check_opt);
+
+  int delete_all_rows(void);
+  int create(const char *name, TABLE *form,
+             HA_CREATE_INFO *create_info);                      //required
+  ha_rows records_in_range(uint inx, key_range *start_key,
+                                   key_range *end_key);
+  uint8 table_cache_type() { return HA_CACHE_TBL_NOCACHE; }
+
+  THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
+                             enum thr_lock_type lock_type);     //required
+  bool get_error_message(int error, String *buf);
+  int start_stmt(THD *thd, thr_lock_type lock_type);
+  int external_lock(THD *thd, int lock_type);
+  int reset(void);
+  int free_result(void);
+};
+
+extern const char ident_quote_char;              // Character for quoting
+                                                 // identifiers
+extern const char value_quote_char;              // Character for quoting
+                                                 // literals
+
+extern bool append_ident(String *string, const char *name, uint length,
+                         const char quote_char);
+
+/* Same parameters as federatedx_io::construct(); presumably per-scheme factories -- confirm */
+extern federatedx_io *instantiate_io_mysql(MEM_ROOT *server_root,
+                                           FEDERATEDX_SERVER *server);
+extern federatedx_io *instantiate_io_null(MEM_ROOT *server_root,
+                                          FEDERATEDX_SERVER *server);
diff --git a/storage/federatedx/plug.in b/storage/federatedx/plug.in
new file mode 100644
index 00000000000..95afe270f4c
--- /dev/null
+++ b/storage/federatedx/plug.in
@@ -0,0 +1,5 @@
+MYSQL_STORAGE_ENGINE(federatedx,,[FederatedX Storage Engine],
+        [FederatedX Storage Engine], [max,max-no-ndb])
+MYSQL_PLUGIN_DYNAMIC(federatedx,   [ha_federatedx.la])
+MYSQL_PLUGIN_STATIC(federatedx,    [libfederatedx.a])
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(federatedx, [ha_federatedx.cc])
diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc
index 481257def1d..93bee5ad7f1 100644
--- a/storage/heap/ha_heap.cc
+++ b/storage/heap/ha_heap.cc
@@ -820,3 +820,20 @@ mysql_declare_plugin(heap)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(heap)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN, /* plugin type */
+  &heap_storage_engine,        /* storage engine descriptor */
+  "MEMORY",                    /* plugin name */
+  "MySQL AB",                  /* author */
+  "Hash based, stored in memory, useful for temporary tables", /* description */
+  PLUGIN_LICENSE_GPL,          /* license */
+  heap_init,                   /* Plugin Init */
+  NULL,                        /* Plugin Deinit (none) */
+  0x0100, /* 1.0 */
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/heap/hp_hash.c b/storage/heap/hp_hash.c
index f56df42aab3..aaaa0fe833f 100644
--- a/storage/heap/hp_hash.c
+++ b/storage/heap/hp_hash.c
@@ -577,7 +577,7 @@ int hp_rec_key_cmp(HP_KEYDEF *keydef, const uchar *rec1, const uchar *rec2,
     }
     else
     {
-      if (memcmp(rec1+seg->start,rec2+seg->start,seg->length))
+      if (bcmp(rec1+seg->start,rec2+seg->start,seg->length))
 	return 1;
     }
   }
@@ -660,7 +660,7 @@ int hp_key_cmp(HP_KEYDEF *keydef, const uchar *rec, const uchar *key)
     }
     else
     {
-      if (memcmp(rec+seg->start,key,seg->length))
+      if (bcmp(rec+seg->start,key,seg->length))
 	return 1;
     }
   }
diff --git a/storage/heap/hp_write.c b/storage/heap/hp_write.c
index fe83fb1e8e7..4e8fa7e3580 100644
--- a/storage/heap/hp_write.c
+++ b/storage/heap/hp_write.c
@@ -109,7 +109,7 @@ int hp_rb_write_key(HP_INFO *info, HP_KEYDEF *keyinfo, const uchar *record,
   custom_arg.key_length= hp_rb_make_key(keyinfo, info->recbuf, record, recpos);
   if (keyinfo->flag & HA_NOSAME)
   {
-    custom_arg.search_flag= SEARCH_FIND | SEARCH_UPDATE;
+    custom_arg.search_flag= SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT;
     keyinfo->rb_tree.flag= TREE_NO_DUPS;
   }
   else
diff --git a/storage/ibmdb2i/ha_ibmdb2i.cc b/storage/ibmdb2i/ha_ibmdb2i.cc
index 947df8ad2fe..f6f1e2bc568 100644
--- a/storage/ibmdb2i/ha_ibmdb2i.cc
+++ b/storage/ibmdb2i/ha_ibmdb2i.cc
@@ -1158,9 +1158,7 @@ int ha_ibmdb2i::rnd_init(bool scan)
   
   rrnAssocHandle= 0;
 
-  DBUG_RETURN(0); // MySQL sometimes does not check the return code, causing 
-                  // an assert in ha_rnd_end later on if we return a non-zero
-                  // value here. 
+  DBUG_RETURN(0);
 }
 
 int ha_ibmdb2i::rnd_end()
@@ -3357,3 +3355,20 @@ mysql_declare_plugin(ibmdb2i)
   NULL                                          /* config options */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(ibmdb2i)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN, /* plugin type */
+  &ibmdb2i_storage_engine,     /* storage engine descriptor */
+  "IBMDB2I",                   /* plugin name */
+  "The IBM development team in Rochester, Minnesota", /* author */
+  "IBM DB2 for i Storage Engine", /* description */
+  PLUGIN_LICENSE_GPL,          /* license */
+  ibmdb2i_init_func,          /* Plugin Init */
+  ibmdb2i_done_func,          /* Plugin Deinit */
+  0x0100 /* version 1.0 */,
+  NULL,                       /* status variables */
+  ibmdb2i_system_variables,   /* system variables */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_UNKNOWN /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/innobase/Makefile.am b/storage/innobase/Makefile.am
index 7a6103d9e79..674b3754e8c 100644
--- a/storage/innobase/Makefile.am
+++ b/storage/innobase/Makefile.am
@@ -329,9 +329,9 @@ libinnobase_a_CFLAGS=	$(AM_CFLAGS)
 EXTRA_LTLIBRARIES=	ha_innodb.la
 pkgplugin_LTLIBRARIES=	@plugin_innobase_shared_target@
 
-ha_innodb_la_LDFLAGS=	-module -rpath $(pkgplugindir)
-ha_innodb_la_CXXFLAGS=	$(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS)
-ha_innodb_la_CFLAGS=	$(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_innodb_la_LDFLAGS=	-module -rpath $(pkgplugindir) -L$(top_builddir)/libservices -lmysqlservices
+ha_innodb_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_innodb_la_CFLAGS=	-shared $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS)
 ha_innodb_la_SOURCES=	$(libinnobase_a_SOURCES)
 
 EXTRA_DIST=		CMakeLists.txt plug.in \
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index c6d7d914c18..067406b896b 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -2870,7 +2870,7 @@ innobase_rollback_to_savepoint(
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint)savepoint, name, 36, 1);
 
 	error = (int) trx_rollback_to_savepoint_for_mysql(trx, name,
 						&mysql_binlog_cache_pos);
@@ -2901,7 +2901,7 @@ innobase_release_savepoint(
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 
-	longlong2str((ulint)savepoint, name, 36);
+	longlong2str((ulint)savepoint, name, 36, 1);
 
 	error = (int) trx_release_savepoint_for_mysql(trx, name);
 
@@ -2948,7 +2948,7 @@ innobase_savepoint(
 
 	/* TODO: use provided savepoint data area to store savepoint data */
 	char name[64];
-	longlong2str((ulint)savepoint,name,36);
+	longlong2str((ulint)savepoint,name,36,1);
 
 	error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
 
@@ -5790,7 +5790,8 @@ ha_innobase::change_active_index(
 				    keynr);
 		/* The caller seems to ignore this.  Thus, we must check
 		this again in row_search_for_mysql(). */
-		DBUG_RETURN(2);
+		DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
+                                                        0, NULL));
 	}
 
 	ut_a(prebuilt->search_tuple != 0);
diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i
index 5a5af76e175..548eb446fab 100644
--- a/storage/innobase/include/univ.i
+++ b/storage/innobase/include/univ.i
@@ -182,7 +182,7 @@ command. Not tested on Windows. */
 #define UNIV_COMPILE_TEST_FUNCS
 */
 
-#if defined HAVE_VALGRIND
+#if defined(HAVE_valgrind)&& defined(HAVE_VALGRIND_MEMCHECK_H)
 # define UNIV_DEBUG_VALGRIND
 #endif /* HAVE_VALGRIND */
 #if 0
diff --git a/storage/innobase/plug.in b/storage/innobase/plug.in
index 9367b70c52e..76d18cd8dd0 100644
--- a/storage/innobase/plug.in
+++ b/storage/innobase/plug.in
@@ -15,7 +15,7 @@
 #
 
 MYSQL_STORAGE_ENGINE(innobase, innodb, [InnoDB Storage Engine],
-        [Transactional Tables using InnoDB], [default,max,max-no-ndb])
+        [Transactional Tables using InnoDB], [])
 MYSQL_PLUGIN_DIRECTORY(innobase, [storage/innobase])
 MYSQL_PLUGIN_STATIC(innobase,   [libinnobase.a])
 MYSQL_PLUGIN_DYNAMIC(innobase,  [ha_innodb.la])
diff --git a/storage/maria/CMakeLists.txt b/storage/maria/CMakeLists.txt
new file mode 100644
index 00000000000..7b5b190bd57
--- /dev/null
+++ b/storage/maria/CMakeLists.txt
@@ -0,0 +1,84 @@
+# Copyright (C) 2007 MySQL AB
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+SET(ARIA_SOURCES ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c 
+            ma_rnext.c ma_rnext_same.c 
+            ma_search.c ma_page.c ma_key_recover.c ma_key.c 
+            ma_locking.c ma_state.c
+            ma_rrnd.c ma_scan.c ma_cache.c 
+            ma_statrec.c ma_packrec.c ma_dynrec.c 
+            ma_blockrec.c ma_bitmap.c 
+            ma_update.c ma_write.c ma_unique.c 
+            ma_delete.c 
+            ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c 
+            ma_rsamepos.c ma_panic.c ma_close.c ma_create.c
+            ma_range.c ma_dbug.c ma_checksum.c 
+            ma_changed.c ma_static.c ma_delete_all.c 
+            ma_delete_table.c ma_rename.c  ma_check.c 
+            ma_keycache.c ma_preload.c ma_ft_parser.c 
+            ma_ft_update.c ma_ft_boolean_search.c 
+            ma_ft_nlq_search.c ft_maria.c ma_sort.c 
+            ha_maria.cc trnman.c lockman.c tablockman.c 
+            ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c 
+            ma_sp_key.c ma_control_file.c ma_loghandler.c 
+            ma_pagecache.c ma_pagecaches.c compat_aliases.cc compat_aliases.h
+            ma_checkpoint.c ma_recovery.c ma_commit.c ma_pagecrc.c
+            ha_maria.h maria_def.h ma_recovery_util.c ma_servicethread.c
+)
+
+MYSQL_ADD_PLUGIN(aria ${ARIA_SOURCES} 
+  STORAGE_ENGINE 
+  MANDATORY 
+  RECOMPILE_FOR_EMBEDDED)
+
+TARGET_LINK_LIBRARIES(aria myisam)
+
+MYSQL_ADD_EXECUTABLE(aria_ftdump maria_ftdump.c)
+TARGET_LINK_LIBRARIES(aria_ftdump aria)
+
+MYSQL_ADD_EXECUTABLE(aria_chk maria_chk.c)
+TARGET_LINK_LIBRARIES(aria_chk aria)
+
+MYSQL_ADD_EXECUTABLE(aria_read_log maria_read_log.c)
+TARGET_LINK_LIBRARIES(aria_read_log aria)
+
+MYSQL_ADD_EXECUTABLE(aria_dump_log ma_loghandler.c unittest/ma_loghandler_examples.c)
+TARGET_LINK_LIBRARIES(aria_dump_log aria)
+SET_TARGET_PROPERTIES(aria_dump_log PROPERTIES COMPILE_FLAGS "-DMARIA_DUMP_LOG")
+
+MYSQL_ADD_EXECUTABLE(aria_pack maria_pack.c)
+TARGET_LINK_LIBRARIES(aria_pack aria)
+
+IF(WITH_UNIT_TESTS AND FALSE)  # "AND FALSE": always false, so these tests are never built
+  ADD_EXECUTABLE(ma_test1 ma_test1.c)
+  TARGET_LINK_LIBRARIES(ma_test1 aria)
+
+  ADD_EXECUTABLE(ma_test2 ma_test2.c)
+  TARGET_LINK_LIBRARIES(ma_test2 aria)
+
+  ADD_EXECUTABLE(ma_test3 ma_test3.c)
+  TARGET_LINK_LIBRARIES(ma_test3 aria)
+
+  ADD_EXECUTABLE(ma_rt_test ma_rt_test.c)
+  TARGET_LINK_LIBRARIES(ma_rt_test aria)
+
+  ADD_EXECUTABLE(ma_sp_test ma_sp_test.c)
+  TARGET_LINK_LIBRARIES(ma_sp_test aria)
+ENDIF()
+  
+IF (MSVC)
+  SET_TARGET_PROPERTIES(aria_chk aria_pack PROPERTIES LINK_FLAGS "setargv.obj")
+ENDIF()
+
diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am
new file mode 100644
index 00000000000..a83063a0226
--- /dev/null
+++ b/storage/maria/Makefile.am
@@ -0,0 +1,202 @@
+# Copyright (C) 2000-2008 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+MYSQLDATAdir =          $(localstatedir)
+MYSQLSHAREdir =         $(pkgdatadir)
+MYSQLBASEdir=           $(prefix)
+MYSQLLIBdir=            $(pkglibdir)
+INCLUDES =              -I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(top_srcdir)/sql \
+                        -I$(srcdir)
+WRAPLIBS=
+
+LDADD =
+
+DEFS =                  @DEFS@
+
+# "." is needed first because tests in unittest need libaria
+SUBDIRS =		. unittest
+
+EXTRA_DIST =		ma_test_all.sh ma_test_all.res ma_test_big.sh \
+			ma_ft_stem.c CMakeLists.txt plug.in ma_test_recovery
+pkgdata_DATA =		
+pkglib_LIBRARIES =	libaria.a
+bin_PROGRAMS =		aria_chk aria_pack aria_ftdump aria_read_log \
+			aria_dump_log
+aria_chk_DEPENDENCIES=	$(LIBRARIES)
+# The only reason to link with libmyisam.a here is that some fulltext pieces
+# live there (we plan to remove the fulltext dependencies from Aria soon).
+# For now, this requires storage/myisam to be built before storage/maria.
+aria_chk_SOURCES=	maria_chk.c
+aria_chk_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_pack_SOURCES=	maria_pack.c
+aria_pack_DEPENDENCIES=$(LIBRARIES)
+aria_pack_LDADD=	@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_read_log_SOURCES=	maria_read_log.c
+aria_read_log_DEPENDENCIES=$(LIBRARIES)
+aria_read_log_LDADD=	@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_dump_log_DEPENDENCIES=$(LIBRARIES) ma_loghandler.c
+aria_dump_log_LDADD=	@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+                        $(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+aria_dump_log_SOURCES= 	ma_loghandler.c unittest/ma_loghandler_examples.c
+aria_dump_log_CPPFLAGS= -DMARIA_DUMP_LOG
+noinst_PROGRAMS =	ma_test1 ma_test2 ma_test3 ma_rt_test ma_sp_test
+noinst_HEADERS =	maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \
+			ma_sp_defs.h ma_fulltext.h ma_ftdefs.h ma_ft_test1.h \
+			ma_ft_eval.h trnman.h lockman.h tablockman.h \
+			ma_control_file.h ha_maria.h ma_blockrec.h \
+			ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \
+			ma_checkpoint.h ma_recovery.h ma_commit.h ma_state.h \
+			trnman_public.h ma_check_standalone.h \
+			ma_key_recover.h ma_recovery_util.h \
+			ma_servicethread.h compat_aliases.h
+ma_test1_DEPENDENCIES=	$(LIBRARIES)
+ma_test1_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test2_DEPENDENCIES=	$(LIBRARIES)
+ma_test2_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_test3_DEPENDENCIES=	$(LIBRARIES)
+ma_test3_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+#ma_ft_test1_DEPENDENCIES=	$(LIBRARIES)
+#ma_ft_eval_DEPENDENCIES=	$(LIBRARIES)
+aria_ftdump_SOURCES=	maria_ftdump.c
+aria_ftdump_DEPENDENCIES= $(LIBRARIES)
+aria_ftdump_LDADD=	@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_rt_test_DEPENDENCIES=	$(LIBRARIES)
+ma_rt_test_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+ma_sp_test_DEPENDENCIES=	$(LIBRARIES)
+ma_sp_test_LDADD=		@CLIENT_EXTRA_LDFLAGS@ libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+libaria_a_SOURCES =	ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \
+			ma_rnext.c ma_rnext_same.c \
+			ma_search.c ma_page.c ma_key_recover.c ma_key.c \
+			ma_locking.c ma_state.c \
+			ma_rrnd.c ma_scan.c ma_cache.c \
+			ma_statrec.c ma_packrec.c ma_dynrec.c \
+			ma_blockrec.c ma_bitmap.c \
+			ma_update.c ma_write.c ma_unique.c \
+			ma_delete.c \
+			ma_rprev.c ma_rfirst.c ma_rlast.c ma_rsame.c \
+			ma_rsamepos.c ma_panic.c ma_close.c ma_create.c\
+			ma_range.c ma_dbug.c ma_checksum.c \
+			ma_changed.c ma_static.c ma_delete_all.c \
+			ma_delete_table.c ma_rename.c  ma_check.c \
+			ma_keycache.c ma_preload.c ma_ft_parser.c \
+			ma_ft_update.c ma_ft_boolean_search.c \
+			ma_ft_nlq_search.c ft_maria.c ma_sort.c \
+			trnman.c lockman.c tablockman.c \
+			ma_rt_index.c ma_rt_key.c ma_rt_mbr.c ma_rt_split.c \
+			ma_sp_key.c ma_control_file.c ma_loghandler.c \
+			ma_pagecache.c ma_pagecaches.c \
+			ma_checkpoint.c ma_recovery.c ma_commit.c \
+			ma_pagecrc.c ma_recovery_util.c \
+			ha_maria.cc compat_aliases.cc ma_servicethread.c
+CLEANFILES =		test?.MA? FT?.MA? isam.log ma_test_all ma_rt_test.MA? sp_test.MA? aria_log_control aria_log.0000*
+
+SUFFIXES = .sh
+
+.sh:
+	@RM@ -f $@ $@-t
+	@SED@ \
+	  -e 's!@''bindir''@!$(bindir)!g' \
+	  -e 's!@''scriptdir''@!$(bindir)!g' \
+	  -e 's!@''prefix''@!$(prefix)!g' \
+	  -e 's!@''datadir''@!$(datadir)!g' \
+	  -e 's!@''localstatedir''@!$(localstatedir)!g' \
+	  -e 's!@''libexecdir''@!$(libexecdir)!g' \
+	  -e 's!@''CC''@!@CC@!'\
+	  -e 's!@''CXX''@!@CXX@!'\
+	  -e 's!@''GXX''@!@GXX@!'\
+	  -e 's!@''PERL''@!@PERL@!' \
+	  -e 's!@''CFLAGS''@!@SAVE_CFLAGS@!'\
+	  -e 's!@''CXXFLAGS''@!@SAVE_CXXFLAGS@!'\
+	  -e 's!@''LDFLAGS''@!@SAVE_LDFLAGS@!'\
+	  -e 's!@''VERSION''@!@VERSION@!' \
+	  -e 's!@''MYSQL_SERVER_SUFFIX''@!@MYSQL_SERVER_SUFFIX@!' \
+	  -e 's!@''COMPILATION_COMMENT''@!@COMPILATION_COMMENT@!' \
+	  -e 's!@''MACHINE_TYPE''@!@MACHINE_TYPE@!' \
+	  -e 's!@''HOSTNAME''@!@HOSTNAME@!' \
+	  -e 's!@''SYSTEM_TYPE''@!@SYSTEM_TYPE@!' \
+	  -e 's!@''CHECK_PID''@!@CHECK_PID@!' \
+	  -e 's!@''FIND_PROC''@!@FIND_PROC@!' \
+	  -e 's!@''MYSQLD_DEFAULT_SWITCHES''@!@MYSQLD_DEFAULT_SWITCHES@!' \
+	  -e 's!@''MYSQL_UNIX_ADDR''@!@MYSQL_UNIX_ADDR@!' \
+	  -e 's!@''TARGET_LINUX''@!@TARGET_LINUX@!' \
+	  -e "s!@""CONF_COMMAND""@!@CONF_COMMAND@!" \
+	  -e 's!@''MYSQLD_USER''@!@MYSQLD_USER@!' \
+	  -e 's!@''sysconfdir''@!@sysconfdir@!' \
+	  -e 's!@''SHORT_MYSQL_INTRO''@!@SHORT_MYSQL_INTRO@!' \
+	  -e 's!@''SHARED_LIB_VERSION''@!@SHARED_LIB_VERSION@!' \
+	  -e 's!@''MYSQL_BASE_VERSION''@!@MYSQL_BASE_VERSION@!' \
+	  -e 's!@''MYSQL_NO_DASH_VERSION''@!@MYSQL_NO_DASH_VERSION@!' \
+	  -e 's!@''MYSQL_TCP_PORT''@!@MYSQL_TCP_PORT@!' \
+	  -e 's!@''PERL_DBI_VERSION''@!@PERL_DBI_VERSION@!' \
+	  -e 's!@''PERL_DBD_VERSION''@!@PERL_DBD_VERSION@!' \
+	  -e 's!@''PERL_DATA_DUMPER''@!@PERL_DATA_DUMPER@!' \
+	$< > $@-t
+	@CHMOD@ +x $@-t
+	@MV@ $@-t $@
+
+tags:
+	etags *.h *.c *.cc
+
+unittests = unittest
+
+test:
+	perl $(top_srcdir)/unittest/unit.pl run $(unittests)
+
+test-verbose:
+	HARNESS_VERBOSE=1 perl $(top_srcdir)/unittest/unit.pl run $(unittests)
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/maria/compat_aliases.cc b/storage/maria/compat_aliases.cc
new file mode 100644
index 00000000000..2d3c67d69a7
--- /dev/null
+++ b/storage/maria/compat_aliases.cc
@@ -0,0 +1,245 @@
+/* Copyright (C) 2010 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  compatibility aliases for system and static variables
+*/
+#include <my_global.h>
+#include <maria.h>
+#include <mysql/plugin.h>
+#include "ma_loghandler.h"
+#include "compat_aliases.h"
+
+ulong block_size_alias;  /* each *_alias bound to a SYSVAR below receives the deprecated option's value */
+static MYSQL_SYSVAR_ULONG(block_size, block_size_alias,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Deprecated, use --aria-block-size instead", 0, 0,
+       MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH,
+       MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH);
+
+ulong checkpoint_interval_alias;
+static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-checkpoint-interval instead",
+       NULL, NULL, 30, 0, UINT_MAX, 1);
+
+ulong force_start_after_recovery_failures_alias;
+static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures, force_start_after_recovery_failures_alias,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Deprecated, use --aria-force-start-after-recovery-failures instead",
+       NULL, NULL, 0, 0, UINT_MAX8, 1);
+
+my_bool page_checksum_alias;
+static MYSQL_SYSVAR_BOOL(page_checksum, page_checksum_alias, 0,
+       "Deprecated, use --aria-page-checksum instead", 0, 0, 1);
+
+char *log_dir_path_alias;
+static MYSQL_SYSVAR_STR(log_dir_path, log_dir_path_alias,
+       PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Deprecated, use --aria-log-dir-path instead",
+       NULL, NULL, mysql_real_data_home);
+
+ulong log_file_size_alias;
+static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-log-file-size instead",
+       NULL, NULL, TRANSLOG_FILE_SIZE,
+       TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);
+
+ulong group_commit_alias;
+static MYSQL_SYSVAR_ENUM(group_commit, group_commit_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-group-commit instead",
+       NULL, NULL,
+       TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);
+
+ulong group_commit_interval_alias;
+static MYSQL_SYSVAR_ULONG(group_commit_interval, group_commit_interval_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-group-commit-interval instead",
+       NULL, NULL, 0, 0, UINT_MAX, 1);
+
+ulong log_purge_type_alias;
+static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-log-purge-type instead",
+       NULL, NULL, TRANSLOG_PURGE_IMMIDIATE,
+       &maria_translog_purge_type_typelib);
+
+ulonglong max_sort_file_size_alias;
+static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size, max_sort_file_size_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-max-temp-length instead",
+       0, 0, MAX_FILE_SIZE, 0, MAX_FILE_SIZE, 1024*1024);
+
+ulong pagecache_age_threshold_alias;
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold, pagecache_age_threshold_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-pagecache-age-threshold instead",
+       0, 0, 300, 100, ~0L, 100);
+
+ulonglong pagecache_buffer_size_alias;
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size_alias,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Deprecated, use --aria-pagecache-buffer-size instead",
+       0, 0, KEY_CACHE_SIZE, MALLOC_OVERHEAD, ~0UL, IO_SIZE);
+
+ulong pagecache_division_limit_alias;
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-pagecache-division-limit instead",
+       0, 0, 100,  1, 100, 1);
+
+ulong recover_alias;
+static MYSQL_SYSVAR_ENUM(recover, recover_alias, PLUGIN_VAR_OPCMDARG,
+       "Deprecated, use --aria-recover instead",
+       NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib);
+
+ulong repair_threads_alias;  /* not named in the THDVAR below; assigned by COPY_THDVAR */
+static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-repair-threads instead",
+       0, 0, 1, 1, ~0L, 1);
+
+ulong sort_buffer_size_alias;  /* not named in the THDVAR below; assigned by COPY_THDVAR */
+static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-sort-buffer-size instead",
+       0, 0, 128L*1024L*1024L, 4, ~0L, 1);
+
+ulong stats_method_alias;  /* not named in the THDVAR below; assigned by COPY_THDVAR */
+static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-stats-method instead",
+       0, 0, 0, &maria_stats_method_typelib);
+
+ulong sync_log_dir_alias;
+static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir_alias,
+       PLUGIN_VAR_RQCMDARG,
+       "Deprecated, use --aria-sync-log-dir instead",
+       NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE,
+       &maria_sync_log_dir_typelib);
+
+my_bool used_for_temp_tables_alias= 1;
+static MYSQL_SYSVAR_BOOL(used_for_temp_tables, 
+       used_for_temp_tables_alias, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT,
+       NULL, 0, 0, 1);  /* NULL: no help text is supplied for this variable */
+
+static struct st_mysql_show_var status_variables_aliases[]= {
+  {"Maria", (char*) &status_variables, SHOW_ARRAY},  /* nests status_variables[] under the name "Maria" */
+  {NullS, NullS, SHOW_LONG}                          /* NullS-named entry closes the list */
+};
+
+/*
+  There is one problem with aliases for command-line options.
+  Plugin initialization works like this
+
+     for all plugins:
+       prepare command-line options
+       initialize command-line option variables to the default values
+       parse command line, assign values as necessary
+
+     for all plugins:
+       call the plugin initialization function
+
+  This means that the maria* and aria* command-line options cannot share
+  the same underlying variables: after the maria* values are assigned,
+  MySQL would reset those variables to their defaults while preparing to
+  parse the aria* options, and the maria* values would be lost.
+
+  So, we create separate set of variables for maria* options,
+  and take both values into account in ha_maria_init().
+
+  Once the command line has been parsed, we patch the maria* options
+  to use the same variables as the aria* options, so that
+  @@maria_some_var and @@aria_some_var always have the same value
+  without forcing us to copy the values around all the time.
+*/
+
+static struct st_mysql_sys_var* system_variables_aliases[]= {  /* NULL-terminated; same order as the COPY_* calls in copy_variable_aliases() */
+  MYSQL_SYSVAR(block_size),
+  MYSQL_SYSVAR(checkpoint_interval),
+  MYSQL_SYSVAR(force_start_after_recovery_failures),
+  MYSQL_SYSVAR(group_commit),
+  MYSQL_SYSVAR(group_commit_interval),
+  MYSQL_SYSVAR(log_dir_path),
+  MYSQL_SYSVAR(log_file_size),
+  MYSQL_SYSVAR(log_purge_type),
+  MYSQL_SYSVAR(max_sort_file_size),
+  MYSQL_SYSVAR(page_checksum),
+  MYSQL_SYSVAR(pagecache_age_threshold),
+  MYSQL_SYSVAR(pagecache_buffer_size),
+  MYSQL_SYSVAR(pagecache_division_limit),
+  MYSQL_SYSVAR(recover),
+  MYSQL_SYSVAR(repair_threads),
+  MYSQL_SYSVAR(sort_buffer_size),
+  MYSQL_SYSVAR(stats_method),
+  MYSQL_SYSVAR(sync_log_dir),
+  MYSQL_SYSVAR(used_for_temp_tables),
+  NULL
+};
+
+#define COPY_SYSVAR(name) \
+  memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++],                 \
+                                        sizeof(MYSQL_SYSVAR_NAME(name))); \
+  if (name ## _alias  != MYSQL_SYSVAR_NAME(name).def_val &&               \
+      *MYSQL_SYSVAR_NAME(name).value == MYSQL_SYSVAR_NAME(name).def_val)  \
+    *MYSQL_SYSVAR_NAME(name).value= name ## _alias;
+
+#define COPY_THDVAR(name) \
+  name ## _alias= THDVAR(0, name);                                        \
+  memcpy(&MYSQL_SYSVAR_NAME(name), system_variables[i++],                 \
+                                        sizeof(MYSQL_SYSVAR_NAME(name))); \
+  if (name ## _alias  != MYSQL_SYSVAR_NAME(name).def_val &&               \
+      THDVAR(0, name) == MYSQL_SYSVAR_NAME(name).def_val)                 \
+    THDVAR(0, name)= name ## _alias;
+
+void copy_variable_aliases()  /* replace each alias descriptor with its system_variables[] entry, keeping explicitly set alias values */
+{  /* NOTE(review): call order presumably must mirror system_variables[] (defined outside this file) - confirm when adding entries */
+  int i= 0;  /* index into system_variables[]; advanced by every COPY_* macro */
+  COPY_SYSVAR(block_size);
+  COPY_SYSVAR(checkpoint_interval);
+  COPY_SYSVAR(force_start_after_recovery_failures);
+  COPY_SYSVAR(group_commit);
+  COPY_SYSVAR(group_commit_interval);
+  COPY_SYSVAR(log_dir_path);
+  COPY_SYSVAR(log_file_size);
+  COPY_SYSVAR(log_purge_type);
+  COPY_SYSVAR(max_sort_file_size);
+  COPY_SYSVAR(page_checksum);
+  COPY_SYSVAR(pagecache_age_threshold);
+  COPY_SYSVAR(pagecache_buffer_size);
+  COPY_SYSVAR(pagecache_division_limit);
+  COPY_SYSVAR(recover);
+  COPY_THDVAR(repair_threads);
+  COPY_THDVAR(sort_buffer_size);
+  COPY_THDVAR(stats_method);
+  COPY_SYSVAR(sync_log_dir);
+  COPY_SYSVAR(used_for_temp_tables);
+}
+
+struct st_maria_plugin compat_aliases= {  /* plugin descriptor that carries the deprecated maria* aliases */
+  MYSQL_DAEMON_PLUGIN,
+  &maria_storage_engine,
+  "Maria",                   /* plugin name */
+  "Monty Program Ab",        /* author */
+  "Compatibility aliases for the Aria engine",
+  PLUGIN_LICENSE_GPL,
+  NULL,                      /* presumably the init hook (none) - confirm against st_maria_plugin */
+  NULL,                      /* presumably the deinit hook (none) - confirm against st_maria_plugin */
+  0x0105,                    /* numeric version; agrees with the "1.5" string below */
+  status_variables_aliases,
+  system_variables_aliases,
+  "1.5",
+  MariaDB_PLUGIN_MATURITY_GAMMA
+};
+
diff --git a/storage/maria/compat_aliases.h b/storage/maria/compat_aliases.h
new file mode 100644
index 00000000000..46a4da74eec
--- /dev/null
+++ b/storage/maria/compat_aliases.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2010 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+extern struct st_maria_plugin compat_aliases;        /* defined in compat_aliases.cc */
+extern char mysql_real_data_home[FN_REFLEN];
+extern TYPELIB maria_recover_typelib;                /* the typelibs are defined in ha_maria.cc */
+extern TYPELIB maria_stats_method_typelib;
+extern TYPELIB maria_translog_purge_type_typelib;
+extern TYPELIB maria_sync_log_dir_typelib;
+extern TYPELIB maria_group_commit_typelib;
+extern struct st_mysql_storage_engine maria_storage_engine;
+extern my_bool use_maria_for_temp_tables;
+extern struct st_mysql_sys_var* system_variables[];  /* source entries copied over the alias descriptors */
+extern struct st_mysql_show_var status_variables[];  /* exposed under "Maria" by status_variables_aliases[] */
+void copy_variable_aliases();                        /* defined in compat_aliases.cc */
diff --git a/storage/maria/file_formats.txt b/storage/maria/file_formats.txt
new file mode 100644
index 00000000000..927e8ad985e
--- /dev/null
+++ b/storage/maria/file_formats.txt
@@ -0,0 +1,71 @@
+#
+# This should contain a description of the file format for most Maria files
+#
+
+# Description of the header in the index file
+
+Header, 24 bytes
+
+Pos  Length
+
+0    4  file_version
+4    2  options
+6    2  header_length
+8    2  state_info_length
+10   2  base_info_length
+12   2  base_pos
+14   2  key_parts
+16   2  unique_key_parts
+18   1  keys
+19   1  uniques
+20   1  language
+21   1  fulltext_keys
+22   1  data_file_type
+23   1  org_data_file_type
+
+
+Status part
+
+24   2  open_count
+26   2  state_changed
+28   7  create_rename_lsn
+     7  is_of_horizon
+     7  skip_redo_lsn
+     8  state->state.records
+     8  state->state.del
+     8  state->split
+     8  state->dellink
+     8  state->first_bitmap_with_space
+     8  state->state.key_file_length
+     8  state->state.data_file_length
+     8  state->state.empty
+     8  state->state.key_empty
+     8  state->auto_increment
+     8  state->state.checksum
+     4  state->process
+     4  state->unique
+     4  state->status
+     4  state->update_count
+
+     1  state->sortkey
+     1  reserved
+
+for each key
+     8  state->key_root[i]
+
+     8  state->key_del
+     4  state->sec_index_changed
+     4  state->sec_index_used
+     4  state->version
+     8  state->key_map
+     8  state->create_time
+     8  state->recover_time
+     8  state->check_time
+     8  state->records_at_analyze
+
+for each key
+    4   reserved
+
+for each key part
+    8   state->rec_per_key_part[i]
+    4   state->nulls_per_key_part[i]
diff --git a/storage/maria/ft_maria.c b/storage/maria/ft_maria.c
new file mode 100644
index 00000000000..b1b24592593
--- /dev/null
+++ b/storage/maria/ft_maria.c
@@ -0,0 +1,48 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+  This file contains the interface functions between fulltext and maria
+*/
+
+#include "ma_ftdefs.h"
+
+/* Start a fulltext search: boolean mode when FT_BOOL is set, NLQ otherwise */
+FT_INFO *maria_ft_init_search(uint flags, void *info, uint keynr,
+                              uchar *query, size_t query_len,
+                              CHARSET_INFO *cs, uchar *record)
+{
+  MARIA_HA *handler= (MARIA_HA *) info;
+
+  if (flags & FT_BOOL)
+    return maria_ft_init_boolean_search(handler, keynr, query, query_len, cs);
+  /* 'record' and the raw 'flags' are only forwarded to the NLQ variant */
+  return maria_ft_init_nlq_search(handler, keynr, query, query_len, flags,
+                                  record);
+}
+
+const struct _ft_vft _ma_ft_vft_nlq = {  /* NLQ entries: read_next, find_relevance, close_search, get_relevance, reinit_search */
+  maria_ft_nlq_read_next, maria_ft_nlq_find_relevance,
+  maria_ft_nlq_close_search, maria_ft_nlq_get_relevance,
+  maria_ft_nlq_reinit_search
+};
+const struct _ft_vft _ma_ft_vft_boolean = {  /* boolean-search entries, in the same order as above */
+  maria_ft_boolean_read_next, maria_ft_boolean_find_relevance,
+  maria_ft_boolean_close_search, maria_ft_boolean_get_relevance,
+  maria_ft_boolean_reinit_search
+};
+
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
new file mode 100644
index 00000000000..27958285a2e
--- /dev/null
+++ b/storage/maria/ha_maria.cc
@@ -0,0 +1,3686 @@
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation                          // gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER 1
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#include <m_ctype.h>
+#include <my_dir.h>
+#include <myisampack.h>
+#include <my_bit.h>
+#include "ha_maria.h"
+#include "trnman_public.h"
+#include "trnman.h"
+#include "compat_aliases.h"
+
+C_MODE_START
+#include "maria_def.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_recovery.h"
+C_MODE_END
+
+/*
+  Note that in future versions, only *transactional* Maria tables can
+  rollback, so this flag should be up or down conditionally.
+*/
+#ifdef MARIA_CANNOT_ROLLBACK
+#define CANNOT_ROLLBACK_FLAG HA_NO_TRANSACTIONS
+#define trans_register_ha(A, B, C)  do { /* nothing */ } while(0)  /* calls in this file expand to a no-op */
+#else  /* !MARIA_CANNOT_ROLLBACK */
+#define CANNOT_ROLLBACK_FLAG 0
+#endif /* MARIA_CANNOT_ROLLBACK */
+#define THD_TRN (*(TRN **)thd_ha_data(thd, maria_hton))  /* lvalue; expects a variable named 'thd' in scope */
+
+ulong pagecache_division_limit, pagecache_age_threshold;  /* bound to the pagecache_* system variables declared later in this file */
+ulonglong pagecache_buffer_size;                          /* bound to the pagecache_buffer_size system variable */
+
+/**
+   As the auto-repair is initiated when opened from the SQL layer
+   (open_unireg_entry(), check_and_repair()), it does not happen when Maria's
+   Recovery internally opens the table to apply log records to it, which is
+   good. It would happen only after Recovery, if the table is still
+   corrupted.
+*/
+ulong maria_recover_options= HA_RECOVER_NONE;
+handlerton *maria_hton;
+
+/* bits in maria_recover_options */
+const char *maria_recover_names[]=
+{
+  /*
+    Compared to MyISAM, "default" was renamed to "normal" as it collided with
+    SET var=default which sets to the var's default i.e. what happens when the
+    var is not set i.e. HA_RECOVER_NONE.
+    Another change is that OFF is used to disable, not ""; this is to have OFF
+    display in SHOW VARIABLES which is better than "".
+  */
+  "OFF", "NORMAL", "BACKUP", "FORCE", "QUICK", NullS
+};
+TYPELIB maria_recover_typelib=
+{
+  array_elements(maria_recover_names) - 1, "",  /* count excludes the NullS terminator */
+  maria_recover_names, NULL
+};
+
+const char *maria_stats_method_names[]=
+{
+  "nulls_unequal", "nulls_equal",
+  "nulls_ignored", NullS
+};
+TYPELIB maria_stats_method_typelib=
+{
+  array_elements(maria_stats_method_names) - 1, "",  /* count excludes the NullS terminator */
+  maria_stats_method_names, NULL
+};
+
+/* transactions log purge mode */
+const char *maria_translog_purge_type_names[]=
+{
+  "immediate", "external", "at_flush", NullS
+};
+TYPELIB maria_translog_purge_type_typelib=
+{
+  array_elements(maria_translog_purge_type_names) - 1, "",  /* count excludes the NullS terminator */
+  maria_translog_purge_type_names, NULL
+};
+
+/* transactional log directory sync */
+const char *maria_sync_log_dir_names[]=
+{
+  "NEVER", "NEWFILE", "ALWAYS", NullS
+};
+TYPELIB maria_sync_log_dir_typelib=
+{
+  array_elements(maria_sync_log_dir_names) - 1, "",  /* count excludes the NullS terminator */
+  maria_sync_log_dir_names, NULL
+};
+
+/* transactional log group commit */
+const char *maria_group_commit_names[]=
+{
+  "none", "hard", "soft", NullS
+};
+TYPELIB maria_group_commit_typelib=
+{
+  array_elements(maria_group_commit_names) - 1, "",  /* count excludes the NullS terminator */
+  maria_group_commit_names, NULL
+};
+
+/** Interval between background checkpoints in seconds */
+static ulong checkpoint_interval;
+static void update_checkpoint_interval(MYSQL_THD thd,  /* passed to the checkpoint_interval sysvar below */
+                                       struct st_mysql_sys_var *var,
+                                       void *var_ptr, const void *save);
+static void update_maria_group_commit(MYSQL_THD thd,   /* passed to the group_commit sysvar below */
+                                      struct st_mysql_sys_var *var,
+                                      void *var_ptr, const void *save);
+static void update_maria_group_commit_interval(MYSQL_THD thd,  /* passed to the group_commit_interval sysvar below */
+                                           struct st_mysql_sys_var *var,
+                                           void *var_ptr, const void *save);
+/** After that many consecutive recovery failures, remove logs */
+static ulong force_start_after_recovery_failures;
+static void update_log_file_size(MYSQL_THD thd,        /* passed to the log_file_size sysvar below */
+                                 struct st_mysql_sys_var *var,
+                                 void *var_ptr, const void *save);
+
+static MYSQL_SYSVAR_ULONG(block_size, maria_block_size,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Block size to be used for Aria index pages.", 0, 0,
+       MARIA_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH,   /* default, min */
+       MARIA_MAX_KEY_BLOCK_LENGTH, MARIA_MIN_KEY_BLOCK_LENGTH);  /* max, block size */
+
+static MYSQL_SYSVAR_ULONG(checkpoint_interval, checkpoint_interval,
+       PLUGIN_VAR_RQCMDARG,
+       "Interval between automatic checkpoints, in seconds; 0 means"
+       " 'no automatic checkpoints' which makes sense only for testing.",
+       NULL, update_checkpoint_interval, 30, 0, UINT_MAX, 1);  /* check, update, default, min, max, block size */
+
+static MYSQL_SYSVAR_ULONG(force_start_after_recovery_failures,
+       force_start_after_recovery_failures,
+       /*
+         Read-only because setting it on the fly has no useful effect,
+         should be set on command-line.
+       */
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Number of consecutive log recovery failures after which logs will be"
+       " automatically deleted to cure the problem; 0 (the default) disables"
+       " the feature.", NULL, NULL, 0, 0, UINT_MAX8, 1);
+
+static MYSQL_SYSVAR_BOOL(page_checksum, maria_page_checksums, 0,
+       "Maintain page checksums (can be overridden per table "
+       "with PAGE_CHECKSUM clause in CREATE TABLE)", 0, 0, 1);  /* check, update, default */
+
+/* This is a command-line-only option (note PLUGIN_VAR_NOSYSVAR) */
+static MYSQL_SYSVAR_STR(log_dir_path, maria_data_root,
+       PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "Path to the directory where to store transactional log",
+       NULL, NULL, mysql_real_data_home);  /* default: mysql_real_data_home */
+
+
+static MYSQL_SYSVAR_ULONG(log_file_size, log_file_size,
+       PLUGIN_VAR_RQCMDARG,
+       "Limit for transaction log size",
+       NULL, update_log_file_size, TRANSLOG_FILE_SIZE,              /* check, update, default */
+       TRANSLOG_MIN_FILE_SIZE, 0xffffffffL, TRANSLOG_PAGE_SIZE);    /* min, max, block size */
+
+static MYSQL_SYSVAR_ENUM(group_commit, maria_group_commit,
+       PLUGIN_VAR_RQCMDARG,
+       "Specifies Aria group commit mode. "
+       "Possible values are \"none\" (no group commit), "
+       "\"hard\" (with waiting to actual commit), "
+       "\"soft\" (no wait for commit (DANGEROUS!!!))",
+       NULL, update_maria_group_commit,
+       TRANSLOG_GCOMMIT_NONE, &maria_group_commit_typelib);  /* default, table of accepted names */
+
+static MYSQL_SYSVAR_ULONG(group_commit_interval, maria_group_commit_interval,
+       PLUGIN_VAR_RQCMDARG,
+       "Interval between commits in microseconds (1/1000000 sec)."
+       " 0 stands for no waiting"
+       " for other threads to come and do a commit in \"hard\" mode and no"
+       " sync()/commit at all in \"soft\" mode.  Option has only an effect"
+       " if aria_group_commit is used",
+       NULL, update_maria_group_commit_interval, 0, 0, UINT_MAX, 1);  /* check, update, default, min, max, block size */
+
+static MYSQL_SYSVAR_ENUM(log_purge_type, log_purge_type,
+       PLUGIN_VAR_RQCMDARG,
+       "Specifies how Aria transactional log will be purged. "
+       "Possible values of name are \"immediate\", \"external\" "
+       "and \"at_flush\"",
+       NULL, NULL, TRANSLOG_PURGE_IMMIDIATE,  /* default; "IMMIDIATE" is the identifier's actual spelling */
+       &maria_translog_purge_type_typelib);
+
+static MYSQL_SYSVAR_ULONGLONG(max_sort_file_size,
+       maria_max_temp_length, PLUGIN_VAR_RQCMDARG,
+       "Don't use the fast sort index method to create an index if the "
+       "temporary file would get bigger than this.",
+       0, 0, MAX_FILE_SIZE & ~(1*MB-1), 0, MAX_FILE_SIZE, 1*MB);  /* default is MAX_FILE_SIZE with the bits below 1*MB cleared */
+
+static MYSQL_SYSVAR_ULONG(pagecache_age_threshold,
+       pagecache_age_threshold, PLUGIN_VAR_RQCMDARG,
+       "This characterizes the number of hits a hot block has to be untouched "
+       "until it is considered aged enough to be downgraded to a warm block. "
+       "This specifies the percentage ratio of that number of hits to the "
+       "total number of blocks in the page cache.", 0, 0,
+        300, 100, ~0L, 100);  /* default, min, max, block size */
+
+static MYSQL_SYSVAR_ULONGLONG(pagecache_buffer_size, pagecache_buffer_size,
+       PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+       "The size of the buffer used for index blocks for Aria tables. "
+       "Increase this to get better index handling (for all reads and "
+       "multiple writes) to as much as you can afford.", 0, 0,
+       KEY_CACHE_SIZE, 0, ~(ulong) 0, 1);  /* max is the largest ulong although the variable is ulonglong */
+
+static MYSQL_SYSVAR_ULONG(pagecache_division_limit, pagecache_division_limit,
+       PLUGIN_VAR_RQCMDARG,
+       "The minimum percentage of warm blocks in key cache", 0, 0,
+       100,  1, 100, 1);  /* default, min, max, block size */
+
+static MYSQL_SYSVAR_ENUM(recover, maria_recover_options, PLUGIN_VAR_OPCMDARG,
+       "Specifies how corrupted tables should be automatically repaired."
+       " Possible values are \"NORMAL\" (the default), \"BACKUP\", \"FORCE\","
+       " \"QUICK\", or \"OFF\" which is like not using the option.",
+       NULL, NULL, HA_RECOVER_DEFAULT, &maria_recover_typelib);
+
+static MYSQL_THDVAR_ULONG(repair_threads, PLUGIN_VAR_RQCMDARG,  /* THDVAR: names no global variable; presumably per-session - confirm */
+       "Number of threads to use when repairing Aria tables. The value of 1 "
+       "disables parallel repair.",
+       0, 0, 1, 1, ~0L, 1);
+
+static MYSQL_THDVAR_ULONG(sort_buffer_size, PLUGIN_VAR_RQCMDARG,
+       "The buffer that is allocated when sorting the index when doing a "
+       "REPAIR or when creating indexes with CREATE INDEX or ALTER TABLE.",
+       0, 0, 128L*1024L*1024L, 4, ~0L, 1);
+
+static MYSQL_THDVAR_ENUM(stats_method, PLUGIN_VAR_RQCMDARG,
+       "Specifies how Aria index statistics collection code should treat "
+       "NULLs. Possible values are \"nulls_unequal\", \"nulls_equal\", "
+       "and \"nulls_ignored\".", 0, 0, 0, &maria_stats_method_typelib);  /* default 0: presumably the first typelib name - confirm */
+
+/*
+  Policy for syncing the log directory, selected from
+  maria_sync_log_dir_typelib. No check or update callback is set.
+*/
+static MYSQL_SYSVAR_ENUM(sync_log_dir, sync_log_dir, PLUGIN_VAR_RQCMDARG,
+       "Controls syncing directory after log file growth and new file "
+       "creation. Possible values are \"never\", \"newfile\" and "
+       "\"always\".", NULL, NULL, TRANSLOG_SYNC_DIR_NEWFILE,
+       &maria_sync_log_dir_typelib);
+
+/*
+  Initial value of use_maria_for_temp_tables: 1 when the build defines
+  USE_MARIA_FOR_TMP_TABLES, 0 otherwise.
+*/
+#ifdef USE_MARIA_FOR_TMP_TABLES
+#define USE_MARIA_FOR_TMP_TABLES_VAL 1
+#else
+#define USE_MARIA_FOR_TMP_TABLES_VAL 0
+#endif
+my_bool use_maria_for_temp_tables= USE_MARIA_FOR_TMP_TABLES_VAL;
+
+/*
+  Exposes use_maria_for_temp_tables as a read-only variable without a
+  command line option (PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT).
+*/
+static MYSQL_SYSVAR_BOOL(used_for_temp_tables, 
+       use_maria_for_temp_tables, PLUGIN_VAR_READONLY | PLUGIN_VAR_NOCMDOPT,
+       "Whether temporary tables should be MyISAM or Aria", 0, 0,
+       1);
+
+/*****************************************************************************
+** MARIA tables
+*****************************************************************************/
+
+/*
+  Allocate a new ha_maria handler for the given table share on mem_root.
+*/
+static handler *maria_create_handler(handlerton *hton,
+                                     TABLE_SHARE * table,
+                                     MEM_ROOT *mem_root)
+{
+  handler *new_handler= new (mem_root) ha_maria(hton, table);
+  return new_handler;
+}
+
+
+/*
+  Collect messages printed by the maria_check routines
+
+  SYNOPSIS
+    _ma_check_print_msg()
+      param     Check parameters; thd, testflag, db_name, table_name and
+                op_name are read from it
+      msg_type  Message class sent to the client ("error", "info", ...)
+      fmt       printf-style format string
+      args      Arguments for fmt
+
+  DESCRIPTION
+    The message is formatted once into a local buffer. It then goes to
+    one of three places:
+    - the server error log, when the connection is not usable
+      (!thd->vio_ok());
+    - my_message(ER_NOT_KEYFILE), when one of T_CREATE_MISSING_KEYS,
+      T_SAFE_REPAIR or T_AUTO_REPAIR is set in param->testflag;
+    - otherwise a result row (db.table, operation, type, text) written
+      through the protocol.
+*/
+
+static void _ma_check_print_msg(HA_CHECK *param, const char *msg_type,
+                                const char *fmt, va_list args)
+{
+  THD *thd= (THD *) param->thd;
+  Protocol *protocol= thd->protocol;
+  uint length, msg_length;
+  char msgbuf[HA_MAX_MSG_BUF];
+  char name[NAME_LEN * 2 + 2];
+
+  msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args);
+  msgbuf[sizeof(msgbuf) - 1]= 0;                // healthy paranoia
+
+  DBUG_PRINT(msg_type, ("message: %s", msgbuf));
+
+  if (!thd->vio_ok())
+  {
+    /*
+      sql_print_error() is variadic and cannot take a va_list, and 'args'
+      has already been consumed above: log the formatted text instead.
+    */
+    sql_print_error("%s", msgbuf);
+    return;
+  }
+
+  if (param->testflag &
+      (T_CREATE_MISSING_KEYS | T_SAFE_REPAIR | T_AUTO_REPAIR))
+  {
+    my_message(ER_NOT_KEYFILE, msgbuf, MYF(MY_WME));
+    return;
+  }
+  length= (uint) (strxmov(name, param->db_name, ".", param->table_name,
+                          NullS) - name);
+  /*
+    TODO: switch from protocol to push_warning here. The main reason we didn't
+    it yet is parallel repair. Due to following trace:
+    ma_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.
+
+    Also we likely need to lock mutex here (in both cases with protocol and
+    push_warning).
+  */
+  protocol->prepare_for_resend();
+  protocol->store(name, length, system_charset_info);
+  protocol->store(param->op_name, system_charset_info);
+  protocol->store(msg_type, system_charset_info);
+  protocol->store(msgbuf, msg_length, system_charset_info);
+  if (protocol->write())
+    sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
+                    msgbuf);
+}
+
+
+/*
+  Convert TABLE object to Maria key and column definition
+
+  SYNOPSIS
+    table2maria()
+      table_arg   in     TABLE object.
+      row_type    in     Row format. BLOCK_RECORD turns on
+                         HA_OPTION_PACK_RECORD for the column type choice.
+      keydef_out  out    Maria key definition.
+      recinfo_out out    Maria column definition.
+      records_out out    Number of column definitions stored in *recinfo_out.
+      create_info in/out Only null_bytes is set here, from the table share.
+
+  DESCRIPTION
+    This function will allocate and initialize Maria key and column
+    definition for further use in ma_create or for a check for underlying
+    table conformance in merge engine.
+
+    Keys are converted first, one MARIA_KEYDEF per key of the share with
+    one HA_KEYSEG per key part. If the table has an auto_increment field
+    (found_next_number_field), the key at share->next_number_index gets
+    HA_AUTO_KEY.
+
+    Columns are then converted by walking the record from offset 0 up to
+    share->stored_rec_length. In each round the field with the lowest
+    offset at or after the current position is picked (the shorter one
+    wins when two start at the same offset); fields whose
+    pack_length_in_rec() is 0 are skipped.
+
+    The caller needs to free *recinfo_out after use. Since *recinfo_out
+    and *keydef_out are allocated with a my_multi_malloc, *keydef_out
+    is freed automatically when *recinfo_out is freed.
+
+  RETURN VALUE
+    0  OK
+    # error code (HA_ERR_OUT_OF_MEM if the allocation fails)
+*/
+
+static int table2maria(TABLE *table_arg, data_file_type row_type,
+                       MARIA_KEYDEF **keydef_out,
+                       MARIA_COLUMNDEF **recinfo_out, uint *records_out,
+                       MARIA_CREATE_INFO *create_info)
+{
+  uint i, j, recpos, minpos, fieldpos, temp_length, length;
+  enum ha_base_keytype type= HA_KEYTYPE_BINARY;
+  uchar *record;
+  KEY *pos;
+  MARIA_KEYDEF *keydef;
+  MARIA_COLUMNDEF *recinfo, *recinfo_pos;
+  HA_KEYSEG *keyseg;
+  TABLE_SHARE *share= table_arg->s;
+  uint options= share->db_options_in_use;
+  DBUG_ENTER("table2maria");
+
+  if (row_type == BLOCK_RECORD)
+    options|= HA_OPTION_PACK_RECORD;
+
+  if (!(my_multi_malloc(MYF(MY_WME),
+          recinfo_out, (share->fields * 2 + 2) * sizeof(MARIA_COLUMNDEF),
+          keydef_out, share->keys * sizeof(MARIA_KEYDEF),
+          &keyseg,
+          (share->key_parts + share->keys) * sizeof(HA_KEYSEG),
+          NullS)))
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM); /* purecov: inspected */
+  keydef= *keydef_out;
+  recinfo= *recinfo_out;
+  pos= table_arg->key_info;
+  for (i= 0; i < share->keys; i++, pos++)
+  {
+    keydef[i].flag= (uint16) (pos->flags & (HA_NOSAME | HA_FULLTEXT |
+                                            HA_SPATIAL));
+    keydef[i].key_alg= pos->algorithm == HA_KEY_ALG_UNDEF ?
+      (pos->flags & HA_SPATIAL ? HA_KEY_ALG_RTREE : HA_KEY_ALG_BTREE) :
+      pos->algorithm;
+    keydef[i].block_length= pos->block_size;
+    keydef[i].seg= keyseg;
+    keydef[i].keysegs= pos->key_parts;
+    for (j= 0; j < pos->key_parts; j++)
+    {
+      Field *field= pos->key_part[j].field;
+      type= field->key_type();
+      keydef[i].seg[j].flag= pos->key_part[j].key_part_flag;
+
+      if (options & HA_OPTION_PACK_KEYS ||
+          (pos->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY |
+                         HA_SPACE_PACK_USED)))
+      {
+        if (pos->key_part[j].length > 8 &&
+            (type == HA_KEYTYPE_TEXT ||
+             type == HA_KEYTYPE_NUM ||
+             (type == HA_KEYTYPE_BINARY && !field->zero_pack())))
+        {
+          /* No blobs here */
+          if (j == 0)
+            keydef[i].flag|= HA_PACK_KEY;
+          if (!(field->flags & ZEROFILL_FLAG) &&
+              (field->type() == MYSQL_TYPE_STRING ||
+               field->type() == MYSQL_TYPE_VAR_STRING ||
+               ((int) (pos->key_part[j].length - field->decimals())) >= 4))
+            keydef[i].seg[j].flag|= HA_SPACE_PACK;
+        }
+        else if (j == 0 && (!(pos->flags & HA_NOSAME) || pos->key_length > 16))
+          keydef[i].flag|= HA_BINARY_PACK_KEY;
+      }
+      keydef[i].seg[j].type= (int) type;
+      keydef[i].seg[j].start= pos->key_part[j].offset;
+      keydef[i].seg[j].length= pos->key_part[j].length;
+      keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end=
+        keydef[i].seg[j].bit_length= 0;
+      keydef[i].seg[j].bit_pos= 0;
+      keydef[i].seg[j].language= field->charset()->number;
+
+      if (field->null_ptr)
+      {
+        keydef[i].seg[j].null_bit= field->null_bit;
+        keydef[i].seg[j].null_pos= (uint) (field->null_ptr-
+                                           (uchar*) table_arg->record[0]);
+      }
+      else
+      {
+        keydef[i].seg[j].null_bit= 0;
+        keydef[i].seg[j].null_pos= 0;
+      }
+      if (field->type() == MYSQL_TYPE_BLOB ||
+          field->type() == MYSQL_TYPE_GEOMETRY)
+      {
+        keydef[i].seg[j].flag|= HA_BLOB_PART;
+        /* save number of bytes used to pack length */
+        keydef[i].seg[j].bit_start= (uint) (field->pack_length() -
+                                            share->blob_ptr_size);
+      }
+      else if (field->type() == MYSQL_TYPE_BIT)
+      {
+        keydef[i].seg[j].bit_length= ((Field_bit *) field)->bit_len;
+        keydef[i].seg[j].bit_start= ((Field_bit *) field)->bit_ofs;
+        keydef[i].seg[j].bit_pos= (uint) (((Field_bit *) field)->bit_ptr -
+                                          (uchar*) table_arg->record[0]);
+      }
+    }
+    keyseg+= pos->key_parts;
+  }
+  if (table_arg->found_next_number_field)
+    keydef[share->next_number_index].flag|= HA_AUTO_KEY;
+  record= table_arg->record[0];
+  recpos= 0;
+  recinfo_pos= recinfo;
+  create_info->null_bytes= table_arg->s->null_bytes;
+
+  while (recpos < (uint) share->stored_rec_length)
+  {
+    Field **field, *found= 0;
+    minpos= share->reclength;
+    length= 0;
+
+    for (field= table_arg->field; *field; field++)
+    {
+      if ((fieldpos= (*field)->offset(record)) >= recpos &&
+          fieldpos <= minpos)
+      {
+        /* Fields that occupy no bytes in the record are not candidates */
+        if (!(temp_length= (*field)->pack_length_in_rec()))
+          continue; /* Skip null-fields */
+        if (! found || fieldpos < minpos ||
+            (fieldpos == minpos && temp_length < length))
+        {
+          minpos= fieldpos;
+          found= *field;
+          length= temp_length;
+        }
+      }
+    }
+    DBUG_PRINT("loop", ("found: 0x%lx  recpos: %d  minpos: %d  length: %d",
+                        (long) found, recpos, minpos, length));
+    if (!found)
+      break;
+
+    if (found->flags & BLOB_FLAG)
+      recinfo_pos->type= FIELD_BLOB;
+    else if (found->type() == MYSQL_TYPE_VARCHAR)
+      recinfo_pos->type= FIELD_VARCHAR;
+    else if (!(options & HA_OPTION_PACK_RECORD) ||
+             (found->zero_pack() && (found->flags & PRI_KEY_FLAG)))
+      recinfo_pos->type= FIELD_NORMAL;
+    else if (found->zero_pack())
+      recinfo_pos->type= FIELD_SKIP_ZERO;
+    else
+      recinfo_pos->type= ((length <= 3 ||
+                           (found->flags & ZEROFILL_FLAG)) ?
+                          FIELD_NORMAL :
+                          found->type() == MYSQL_TYPE_STRING ||
+                          found->type() == MYSQL_TYPE_VAR_STRING ?
+                          FIELD_SKIP_ENDSPACE :
+                          FIELD_SKIP_PRESPACE);
+    if (found->null_ptr)
+    {
+      recinfo_pos->null_bit= found->null_bit;
+      recinfo_pos->null_pos= (uint) (found->null_ptr -
+                                     (uchar*) table_arg->record[0]);
+    }
+    else
+    {
+      recinfo_pos->null_bit= 0;
+      recinfo_pos->null_pos= 0;
+    }
+    (recinfo_pos++)->length= (uint16) length;
+    recpos= minpos + length;
+    DBUG_PRINT("loop", ("length: %d  type: %d",
+                        recinfo_pos[-1].length,recinfo_pos[-1].type));
+  }
+  *records_out= (uint) (recinfo_pos - recinfo);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check for underlying table conformance
+
+  SYNOPSIS
+    maria_check_definition()
+      t1_keyinfo       in    First table key definition
+      t1_recinfo       in    First table column (record) definition
+      t1_keys          in    Number of keys in first table
+      t1_recs          in    Number of column definitions in first table
+      t2_keyinfo       in    Second table key definition
+      t2_recinfo       in    Second table column (record) definition
+      t2_keys          in    Number of keys in second table
+      t2_recs          in    Number of column definitions in second table
+      strict           in    Strict check switch
+
+  DESCRIPTION
+    This function compares two Maria definitions. By intention it was done
+    to compare merge table definition against underlying table definition.
+    It may also be used to compare dot-frm and MAI definitions of Maria
+    table as well as to compare different Maria table definitions.
+
+    For merge table it is not required that number of keys in merge table
+    must exactly match number of keys in underlying table. When calling this
+    function for underlying table conformance check, 'strict' flag must be
+    set to false, and converted merge definition must be passed as t1_*.
+
+    Otherwise 'strict' flag must be set to 1 and it is not required to pass
+    converted dot-frm definition as t1_*.
+
+    What is compared:
+    - the number of keys (equal when strict, otherwise t1_keys must not
+      exceed t2_keys) and the number of column definitions (always equal);
+    - for each of the first t1_keys keys: the number of key segments and
+      the key algorithm, then for every segment its type, language,
+      null_bit and length. A key that is FULLTEXT in both tables, or
+      SPATIAL in both tables, is not compared any further;
+    - for each column: type, length and null_bit. FIELD_SKIP_ZERO of
+      length 1 in t1 is accepted against FIELD_NORMAL in t2.
+
+  RETURN VALUE
+    0 - Equal definitions.
+    1 - Different definitions.
+
+  TODO
+    - compare FULLTEXT keys;
+    - compare SPATIAL keys;
+    - compare FIELD_SKIP_ZERO which is converted to FIELD_NORMAL correctly
+      (should be correctly detected in table2maria).
+*/
+
+int maria_check_definition(MARIA_KEYDEF *t1_keyinfo,
+                           MARIA_COLUMNDEF *t1_recinfo,
+                           uint t1_keys, uint t1_recs,
+                           MARIA_KEYDEF *t2_keyinfo,
+                           MARIA_COLUMNDEF *t2_recinfo,
+                           uint t2_keys, uint t2_recs, bool strict)
+{
+  uint i, j;
+  DBUG_ENTER("maria_check_definition");
+  if ((strict ? t1_keys != t2_keys : t1_keys > t2_keys))
+  {
+    DBUG_PRINT("error", ("Number of keys differs: t1_keys=%u, t2_keys=%u",
+                         t1_keys, t2_keys));
+    DBUG_RETURN(1);
+  }
+  if (t1_recs != t2_recs)
+  {
+    DBUG_PRINT("error", ("Number of recs differs: t1_recs=%u, t2_recs=%u",
+                         t1_recs, t2_recs));
+    DBUG_RETURN(1);
+  }
+  for (i= 0; i < t1_keys; i++)
+  {
+    HA_KEYSEG *t1_keysegs= t1_keyinfo[i].seg;
+    HA_KEYSEG *t2_keysegs= t2_keyinfo[i].seg;
+    if (t1_keyinfo[i].flag & HA_FULLTEXT && t2_keyinfo[i].flag & HA_FULLTEXT)
+      continue;
+    else if (t1_keyinfo[i].flag & HA_FULLTEXT ||
+             t2_keyinfo[i].flag & HA_FULLTEXT)
+    {
+       DBUG_PRINT("error", ("Key %d has different definition", i));
+       DBUG_PRINT("error", ("t1_fulltext= %d, t2_fulltext=%d",
+                            test(t1_keyinfo[i].flag & HA_FULLTEXT),
+                            test(t2_keyinfo[i].flag & HA_FULLTEXT)));
+       DBUG_RETURN(1);
+    }
+    if (t1_keyinfo[i].flag & HA_SPATIAL && t2_keyinfo[i].flag & HA_SPATIAL)
+      continue;
+    else if (t1_keyinfo[i].flag & HA_SPATIAL ||
+             t2_keyinfo[i].flag & HA_SPATIAL)
+    {
+       DBUG_PRINT("error", ("Key %d has different definition", i));
+       DBUG_PRINT("error", ("t1_spatial= %d, t2_spatial=%d",
+                            test(t1_keyinfo[i].flag & HA_SPATIAL),
+                            test(t2_keyinfo[i].flag & HA_SPATIAL)));
+       DBUG_RETURN(1);
+    }
+    if (t1_keyinfo[i].keysegs != t2_keyinfo[i].keysegs ||
+        t1_keyinfo[i].key_alg != t2_keyinfo[i].key_alg)
+    {
+      DBUG_PRINT("error", ("Key %d has different definition", i));
+      DBUG_PRINT("error", ("t1_keysegs=%d, t1_key_alg=%d",
+                           t1_keyinfo[i].keysegs, t1_keyinfo[i].key_alg));
+      DBUG_PRINT("error", ("t2_keysegs=%d, t2_key_alg=%d",
+                           t2_keyinfo[i].keysegs, t2_keyinfo[i].key_alg));
+      DBUG_RETURN(1);
+    }
+    for (j=  t1_keyinfo[i].keysegs; j--;)
+    {
+      uint8 t1_keysegs_j__type= t1_keysegs[j].type;
+      /*
+        Table migration from 4.1 to 5.1. In 5.1 a *TEXT key part is
+        always HA_KEYTYPE_VARTEXT2. In 4.1 we had only the equivalent of
+        HA_KEYTYPE_VARTEXT1. Since we treat both the same on MyISAM
+        level, we can ignore a mismatch between these types.
+      */
+      if ((t1_keysegs[j].flag & HA_BLOB_PART) &&
+          (t2_keysegs[j].flag & HA_BLOB_PART))
+      {
+        if ((t1_keysegs_j__type == HA_KEYTYPE_VARTEXT2) &&
+            (t2_keysegs[j].type == HA_KEYTYPE_VARTEXT1))
+          t1_keysegs_j__type= HA_KEYTYPE_VARTEXT1; /* purecov: tested */
+        else if ((t1_keysegs_j__type == HA_KEYTYPE_VARBINARY2) &&
+                 (t2_keysegs[j].type == HA_KEYTYPE_VARBINARY1))
+          t1_keysegs_j__type= HA_KEYTYPE_VARBINARY1; /* purecov: inspected */
+      }
+
+      if (t1_keysegs_j__type != t2_keysegs[j].type ||
+          t1_keysegs[j].language != t2_keysegs[j].language ||
+          t1_keysegs[j].null_bit != t2_keysegs[j].null_bit ||
+          t1_keysegs[j].length != t2_keysegs[j].length)
+      {
+        DBUG_PRINT("error", ("Key segment %d (key %d) has different "
+                             "definition", j, i));
+        DBUG_PRINT("error", ("t1_type=%d, t1_language=%d, t1_null_bit=%d, "
+                             "t1_length=%d",
+                             t1_keysegs[j].type, t1_keysegs[j].language,
+                             t1_keysegs[j].null_bit, t1_keysegs[j].length));
+        DBUG_PRINT("error", ("t2_type=%d, t2_language=%d, t2_null_bit=%d, "
+                             "t2_length=%d",
+                             t2_keysegs[j].type, t2_keysegs[j].language,
+                             t2_keysegs[j].null_bit, t2_keysegs[j].length));
+
+        DBUG_RETURN(1);
+      }
+    }
+  }
+
+  for (i= 0; i < t1_recs; i++)
+  {
+    MARIA_COLUMNDEF *t1_rec= &t1_recinfo[i];
+    MARIA_COLUMNDEF *t2_rec= &t2_recinfo[i];
+    /*
+      FIELD_SKIP_ZERO can be changed to FIELD_NORMAL in maria_create,
+      see NOTE1 in ma_create.c
+    */
+    if ((t1_rec->type != t2_rec->type &&
+         !(t1_rec->type == (int) FIELD_SKIP_ZERO &&
+           t1_rec->length == 1 &&
+           t2_rec->type == (int) FIELD_NORMAL)) ||
+        t1_rec->length != t2_rec->length ||
+        t1_rec->null_bit != t2_rec->null_bit)
+    {
+      DBUG_PRINT("error", ("Field %d has different definition", i));
+      DBUG_PRINT("error", ("t1_type=%d, t1_length=%d, t1_null_bit=%d",
+                           t1_rec->type, t1_rec->length, t1_rec->null_bit));
+      DBUG_PRINT("error", ("t2_type=%d, t2_length=%d, t2_null_bit=%d",
+                           t2_rec->type, t2_rec->length, t2_rec->null_bit));
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+extern "C" {
+
+/* Return thd_killed() for the THD attached to the check parameters */
+int _ma_killed_ptr(HA_CHECK *param)
+{
+  THD *thd= (THD*) param->thd;
+  return thd_killed(thd);
+}
+
+
+/*
+  Report an "error" message from a check/repair routine.
+  Sets bit 0 of param->error_printed and O_DATA_LOST in param->out_flag
+  before forwarding the message to _ma_check_print_msg().
+*/
+void _ma_check_print_error(HA_CHECK *param, const char *fmt, ...)
+{
+  va_list ap;
+  DBUG_ENTER("_ma_check_print_error");
+
+  param->error_printed= param->error_printed | 1;
+  param->out_flag= param->out_flag | O_DATA_LOST;
+
+  va_start(ap, fmt);
+  _ma_check_print_msg(param, "error", fmt, ap);
+  va_end(ap);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Report an "info" message from a check/repair routine.
+  No flag in param is changed; the message is only forwarded to
+  _ma_check_print_msg().
+*/
+void _ma_check_print_info(HA_CHECK *param, const char *fmt, ...)
+{
+  va_list ap;
+  DBUG_ENTER("_ma_check_print_info");
+
+  va_start(ap, fmt);
+  _ma_check_print_msg(param, "info", fmt, ap);
+  va_end(ap);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Report a "warning" message from a check/repair routine.
+  Sets param->warning_printed and O_DATA_LOST in param->out_flag before
+  forwarding the message to _ma_check_print_msg().
+*/
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt, ...)
+{
+  va_list ap;
+  DBUG_ENTER("_ma_check_print_warning");
+
+  param->warning_printed= 1;
+  param->out_flag= param->out_flag | O_DATA_LOST;
+
+  va_start(ap, fmt);
+  _ma_check_print_msg(param, "warning", fmt, ap);
+  va_end(ap);
+
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Create a transaction object
+
+  SYNOPSIS
+    maria_create_trn_for_mysql()
+      info	Maria handler; info->external_ptr is used as the THD
+
+  DESCRIPTION
+    If the THD has no Maria transaction yet (THD_TRN is NULL), a new one
+    is created with trnman_new_trn() and stored in THD_TRN. In that case
+    the engine is also registered with trans_register_ha(thd, TRUE, ...)
+    when OPTION_NOT_AUTOCOMMIT or OPTION_BEGIN is set in thd->options.
+
+    The transaction is then attached to the table. When
+    trnman_increment_locked_tables() returns 0, the engine is registered
+    with trans_register_ha(thd, FALSE, ...) and trnman_new_statement() is
+    called (presumably this is the first table locked in the statement;
+    confirm against trnman).
+
+    With EXTRA_DEBUG, the query text is logged once per transaction the
+    first time a table is write locked.
+
+  RETURN
+    0 		ok
+    #		Error number (HA_ERR_OUT_OF_MEM)
+*/
+
+static int maria_create_trn_for_mysql(MARIA_HA *info)
+{
+  THD *thd= (THD*) info->external_ptr;
+  TRN *trn= THD_TRN;
+  DBUG_ENTER("maria_create_trn_for_mysql");
+
+  if (!trn)  /* no transaction yet - open it now */
+  {
+    trn= trnman_new_trn(& thd->transaction.wt);
+    if (unlikely(!trn))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    THD_TRN= trn;
+    if (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))
+      trans_register_ha(thd, TRUE, maria_hton);
+  }
+  _ma_set_trn_for_table(info, trn);
+  if (!trnman_increment_locked_tables(trn))
+  {
+    trans_register_ha(thd, FALSE, maria_hton);
+    trnman_new_statement(trn);
+  }
+#ifdef EXTRA_DEBUG
+  if (info->lock_type == F_WRLCK &&
+      ! (trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED))
+  {
+    trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED |
+                     TRN_STATE_TABLES_CAN_CHANGE);
+    (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                   (uchar*) thd->query(),
+                                   thd->query_length());
+  }
+  else
+  {
+    DBUG_PRINT("info", ("lock_type: %d  trnman_flags: %u",
+                        info->lock_type, trnman_get_flags(trn)));
+  }
+  
+#endif
+  DBUG_RETURN(0);
+}
+
+} /* extern "C" */
+
+/**
+  Value for ha_maria::bulk_insert_single_undo:
+  transactional table doing bulk insert with one single UNDO
+  (UNDO_BULK_INSERT) and with repair.
+*/
+#define BULK_INSERT_SINGLE_UNDO_AND_REPAIR    1
+/**
+  Value for ha_maria::bulk_insert_single_undo:
+  transactional table doing bulk insert with one single UNDO
+  (UNDO_BULK_INSERT) and without repair.
+*/
+#define BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR 2
+/**
+  Value for ha_maria::bulk_insert_single_undo (the constructor's initial
+  value): none of BULK_INSERT_SINGLE_UNDO_AND_REPAIR and
+  BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR.
+*/
+#define BULK_INSERT_NONE      0
+
+/*
+  Construct a handler that has no open table yet (file is 0).
+  int_table_flags starts with the flags listed here.
+*/
+ha_maria::ha_maria(handlerton *hton, TABLE_SHARE *table_arg)
+  :handler(hton, table_arg),
+   file(0),
+   int_table_flags(HA_NULL_IN_KEY |
+                   HA_CAN_FULLTEXT |
+                   HA_CAN_SQL_HANDLER |
+                   HA_BINLOG_ROW_CAPABLE |
+                   HA_BINLOG_STMT_CAPABLE |
+                   HA_DUPLICATE_POS |
+                   HA_CAN_INDEX_BLOBS |
+                   HA_AUTO_PART_KEY |
+                   HA_FILE_BASED |
+                   HA_CAN_GEOMETRY |
+                   CANNOT_ROLLBACK_FLAG |
+                   HA_CAN_BIT_FIELD |
+                   HA_CAN_RTREEKEYS |
+                   HA_HAS_RECORDS |
+                   HA_STATS_RECORDS_IS_EXACT),
+   can_enable_indexes(1),
+   bulk_insert_single_undo(BULK_INSERT_NONE)
+{
+}
+
+
+/*
+  Clone this handler through handler::clone() and let the copy share this
+  handler's state and transaction.
+
+  Returns the new handler, or 0 if handler::clone() failed.
+*/
+handler *ha_maria::clone(MEM_ROOT *mem_root)
+{
+  ha_maria *copy= static_cast <ha_maria *>(handler::clone(mem_root));
+  if (!copy)
+    return 0;
+
+  MARIA_HA *copy_file= copy->file;
+  copy_file->state= file->state;
+  /* maria_create_trn_for_mysql() is never called for clone() tables */
+  copy_file->trn= file->trn;
+  return copy;
+}
+
+
+/* NullS-terminated list of the file extensions returned by bas_ext() */
+static const char *ha_maria_exts[]= { MARIA_NAME_IEXT, MARIA_NAME_DEXT, NullS };
+
+
+/* Return the NullS-terminated extension list defined above */
+const char **ha_maria::bas_ext() const
+{
+  const char **extensions= ha_maria_exts;
+  return extensions;
+}
+
+
+/*
+  Return the index type name of key 'key_number'.
+  The flags are tested first (FULLTEXT, then SPATIAL), then the algorithm
+  (RTREE); everything else is reported as BTREE.
+*/
+const char *ha_maria::index_type(uint key_number)
+{
+  const KEY *key= &table->key_info[key_number];
+
+  if (key->flags & HA_FULLTEXT)
+    return "FULLTEXT";
+  if (key->flags & HA_SPATIAL)
+    return "SPATIAL";
+  if (key->algorithm == HA_KEY_ALG_RTREE)
+    return "RTREE";
+  return "BTREE";
+}
+
+
+/*
+  Estimate the cost of a table scan.
+  Tables that are not BLOCK_RECORD use the generic handler estimate.
+  For BLOCK_RECORD the data file length minus one block is divided by
+  max(block_size / 2, IO_SIZE), and 2 is added.
+*/
+double ha_maria::scan_time()
+{
+  if (file->s->data_file_type != BLOCK_RECORD)
+    return handler::scan_time();
+
+  return ulonglong2double(stats.data_file_length - file->s->block_size) /
+         max(file->s->block_size / 2, IO_SIZE) + 2;
+}
+
+/*
+  An index page must be able to hold at least two keys, because the page
+  splitting algorithms depend on it. (With only one key on a page no
+  compression could be used either, which may make the index file much
+  larger.)
+  The result is capped at HA_MAX_KEY_BUFF, as this is a stack restriction
+  imposed by the handler interface.
+
+  Room is also reserved for a record pointer (8) and 3 bytes per key
+  segment to store the length of the segment + possible null bytes.
+  These extra bytes are required here so that maria_create() will surely
+  accept any key created with the returned key data storage length.
+*/
+
+uint ha_maria::max_supported_key_length() const
+{
+  uint engine_limit= (maria_max_key_length() - 8 - HA_MAX_KEY_SEG*3);
+
+  if (HA_MAX_KEY_BUFF < engine_limit)
+    return HA_MAX_KEY_BUFF;
+  return engine_limit;
+}
+
+
+#ifdef HAVE_REPLICATION
+/*
+  Read packets from 'net' and write them to the data file, starting at
+  offset 0, until an empty packet is received.
+
+  Returns 0 on success, -1 on a network read error, or errno if writing
+  to the data file fails.
+*/
+int ha_maria::net_read_dump(NET * net)
+{
+  int data_fd= file->dfile.file;
+
+  my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+  while (1)
+  {
+    ulong packet_len= my_net_read(net);
+
+    if (!packet_len)
+      return 0;                                 // end of file
+
+    if (packet_len == packet_error)
+    {
+      sql_print_error("ha_maria::net_read_dump - read error ");
+      return -1;
+    }
+
+    if (my_write(data_fd, (uchar *) net->read_pos, (uint) packet_len,
+                 MYF(MY_WME | MY_FNABP)))
+      return errno;
+  }
+}
+
+
+/*
+  Copy the data file to a file descriptor or to the client connection
+
+  SYNOPSIS
+    dump()
+      thd   Thread; thd->net is used when fd < 0
+      fd    Target file descriptor, or < 0 to send the data over thd->net
+
+  DESCRIPTION
+    share->state.state.data_file_length bytes are read from the start of
+    the data file, at most one block at a time. When sending over the
+    network, an empty packet is written and the net is flushed after the
+    last block.
+
+    A read never asks for more than what is left to copy, and a read that
+    returns 0 bytes (data file shorter than the recorded length) ends the
+    copy with EIO instead of looping forever.
+
+  RETURN
+    0       ok
+    ENOMEM  the block buffer could not be allocated
+    EIO     the data file ended before data_file_length bytes were read
+    #       errno of the failing read or write (EPIPE if errno is 0 after
+            a failed write)
+*/
+int ha_maria::dump(THD * thd, int fd)
+{
+  MARIA_SHARE *share= file->s;
+  NET *net= &thd->net;
+  uint block_size= share->block_size;
+  my_off_t bytes_to_read= share->state.state.data_file_length;
+  int data_fd= file->dfile.file;
+  uchar *buf= (uchar *) my_malloc(block_size, MYF(MY_WME));
+  if (!buf)
+    return ENOMEM;
+
+  int error= 0;
+  my_seek(data_fd, 0L, MY_SEEK_SET, MYF(MY_WME));
+  while (bytes_to_read > 0)
+  {
+    /* Clamp the request so that the unsigned counter cannot underflow */
+    size_t chunk= (bytes_to_read < (my_off_t) block_size ?
+                   (size_t) bytes_to_read : (size_t) block_size);
+    size_t bytes= my_read(data_fd, buf, chunk, MYF(MY_WME));
+    if (bytes == MY_FILE_ERROR)
+    {
+      error= errno;
+      goto err;
+    }
+    if (bytes == 0)
+    {
+      /* Premature end of file: no progress is possible any more */
+      error= EIO;
+      goto err;
+    }
+
+    if (fd >= 0)
+    {
+      if (my_write(fd, buf, bytes, MYF(MY_WME | MY_FNABP)))
+      {
+        error= errno ? errno : EPIPE;
+        goto err;
+      }
+    }
+    else
+    {
+      if (my_net_write(net, buf, bytes))
+      {
+        error= errno ? errno : EPIPE;
+        goto err;
+      }
+    }
+    bytes_to_read -= bytes;
+  }
+
+  if (fd < 0)
+  {
+    /* An empty packet tells the receiver that the dump is complete */
+    if (my_net_write(net, (uchar*) "", 0))
+      error= errno ? errno : EPIPE;
+    net_flush(net);
+  }
+
+err:
+  my_free((uchar*) buf, MYF(0));
+  return error;
+}
+#endif                                          /* HAVE_REPLICATION */
+
+/*
+  Open a Maria table and initialize the handler from it.
+
+  'name' is here without an extension. 'mode' and 'test_if_locked' are
+  passed on to maria_open(); HA_OPEN_FROM_SQL_LAYER is always added, and
+  HA_OPEN_ABORT_IF_CRASHED is added unless maria_recover_options is
+  HA_RECOVER_NONE.
+
+  After a successful open, int_table_flags is extended from the share
+  (row format, born_transactional, checksum options) and each key's
+  block size, and fulltext parser where one is used, is set up.
+
+  Returns 0 on success; my_errno, or -1 if my_errno is 0, when
+  maria_open() fails.
+*/
+
+int ha_maria::open(const char *name, int mode, uint test_if_locked)
+{
+  uint i;
+
+#ifdef NOT_USED
+  /*
+    If the user wants to have memory mapped data files, add an
+    open_flag. Do not memory map temporary tables because they are
+    expected to be inserted and thus extended a lot. Memory mapping is
+    efficient for files that keep their size, but very inefficient for
+    growing files. Using an open_flag instead of calling ma_extra(...
+    HA_EXTRA_MMAP ...) after maria_open() has the advantage that the
+    mapping is not repeated for every open, but just done on the initial
+    open, when the Maria share is created. Every time the server
+    requires to open a new instance of a table it calls this method. We
+    will always supply HA_OPEN_MMAP for a permanent table. However, the
+    Maria storage engine will ignore this flag if this is a secondary
+    open of a table that is in use by other threads already (if the
+    Maria share exists already).
+  */
+  if (!(test_if_locked & HA_OPEN_TMP_TABLE) && opt_maria_use_mmap)
+    test_if_locked|= HA_OPEN_MMAP;
+#endif
+
+  if (unlikely(maria_recover_options != HA_RECOVER_NONE))
+  {
+    /* user asked to trigger a repair if table was not properly closed */
+    test_if_locked|= HA_OPEN_ABORT_IF_CRASHED;
+  }
+
+  if (!(file= maria_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER)))
+    return (my_errno ? my_errno : -1);
+
+  file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref;
+
+  if (test_if_locked & (HA_OPEN_IGNORE_IF_LOCKED | HA_OPEN_TMP_TABLE))
+    VOID(maria_extra(file, HA_EXTRA_NO_WAIT_LOCK, 0));
+
+  info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+  if (!(test_if_locked & HA_OPEN_WAIT_IF_LOCKED))
+    VOID(maria_extra(file, HA_EXTRA_WAIT_LOCK, 0));
+  if ((data_file_type= file->s->data_file_type) != STATIC_RECORD)
+    int_table_flags |= HA_REC_NOT_IN_SEQ;
+  if (!file->s->base.born_transactional)
+  {
+    /*
+      INSERT DELAYED cannot work with transactional tables (because it cannot
+      stand up to "when client gets ok the data is safe on disk": the record
+      may not even be inserted). In the future, we could enable it back (as a
+      client doing INSERT DELAYED knows the specificities; but we then should
+      make sure to regularly commit in the delayed_insert thread).
+    */
+    int_table_flags|= HA_CAN_INSERT_DELAYED;
+  }
+  if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+    int_table_flags |= HA_HAS_NEW_CHECKSUM;
+
+  for (i= 0; i < table->s->keys; i++)
+  {
+    plugin_ref parser= table->key_info[i].parser;
+    if (table->key_info[i].flags & HA_USES_PARSER)
+      file->s->keyinfo[i].parser=
+        (struct st_mysql_ftparser *)plugin_decl(parser)->info;
+    table->key_info[i].block_size= file->s->keyinfo[i].block_length;
+  }
+  my_errno= 0;
+  return my_errno;
+}
+
+
+/*
+  Close the table if it is open.
+  'file' is reset before maria_close() is called. Returns 0 when nothing
+  was open, otherwise the result of maria_close().
+*/
+int ha_maria::close(void)
+{
+  if (!file)
+    return 0;
+
+  MARIA_HA *handle= file;
+  file= 0;
+  return maria_close(handle);
+}
+
+
+/*
+  Write the row in 'buf' to the table.
+
+  Returns the error from update_auto_increment() if it fails, otherwise
+  the result of maria_write().
+*/
+int ha_maria::write_row(uchar * buf)
+{
+  ha_statistic_increment(&SSV::ha_write_count);
+
+  /* A timestamp column that is auto-set on insert gets the current time */
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+    table->timestamp_field->set_time();
+
+  /*
+     With an auto_increment column, and when the row being written is
+     record[0], update the auto_increment value in the record first.
+  */
+  if (table->next_number_field && buf == table->record[0])
+  {
+    int error= update_auto_increment();
+    if (error)
+      return error;
+  }
+
+  return maria_write(file, buf);
+}
+
+
+/*
+  Check the table for errors
+
+  SYNOPSIS
+    check()
+      thd         Thread doing the check; the HA_CHECK structure is
+                  allocated with thd->alloc()
+      check_opt   check_opt->flags is OR-ed into the check flags
+
+  DESCRIPTION
+    Runs maria_chk_size(), maria_chk_del() and maria_chk_key(), stopping
+    at the first one that fails. maria_chk_data_link() is run as well when
+    the table is marked crashed, or when T_QUICK is not set and either the
+    table uses packed/compressed records or T_EXTEND / T_MEDIUM was
+    requested.
+
+    On success the crashed/changed state bits are cleared if needed and,
+    unless the table is read only, the state is written back. On failure
+    the table is marked crashed unless it already is or the thread was
+    killed.
+
+    The table handle and the allocation are validated before 'file' is
+    dereferenced, and the thread's proc info is only changed once it is
+    known that the check will run, so no early return leaves it set.
+
+  RETURN
+    HA_ADMIN_OK              No error found
+    HA_ADMIN_ALREADY_DONE    Nothing to do for T_CHECK_ONLY_CHANGED / T_FAST
+    HA_ADMIN_CORRUPT         One of the checks failed
+    HA_ADMIN_INTERNAL_ERROR  No open table, or out of memory
+*/
+int ha_maria::check(THD * thd, HA_CHECK_OPT * check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr;
+  MARIA_SHARE *share;
+  const char *old_proc_info;
+  TRN *old_trn;
+
+  if (!file)
+    return HA_ADMIN_INTERNAL_ERROR;
+  if (!(param_ptr= (HA_CHECK*) thd->alloc(sizeof(HA_CHECK))))
+    return HA_ADMIN_INTERNAL_ERROR;
+
+  HA_CHECK &param= *param_ptr;
+  share= file->s;
+  old_trn= file->trn;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "check";
+  param.db_name= table->s->db.str;
+  param.table_name= table->alias;
+  param.testflag= check_opt->flags | T_CHECK | T_SILENT;
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+
+  if (!(table->db_stat & HA_READ_ONLY))
+    param.testflag |= T_STATISTICS;
+  param.using_global_keycache= 1;
+
+  if (!maria_is_crashed(file) &&
+      (((param.testflag & T_CHECK_ONLY_CHANGED) &&
+        !(share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+                                  STATE_CRASHED_ON_REPAIR |
+                                  STATE_IN_REPAIR)) &&
+        share->state.open_count == 0) ||
+       ((param.testflag & T_FAST) && (share->state.open_count ==
+                                      (uint) (share->global_changed ? 1 :
+                                              0)))))
+    return HA_ADMIN_ALREADY_DONE;
+
+  /* From here on every path restores the proc info before returning */
+  old_proc_info= thd_proc_info(thd, "Checking table");
+
+  maria_chk_init_for_check(&param, file);
+  (void) maria_chk_status(&param, file);                // Not fatal
+  error= maria_chk_size(&param, file);
+  if (!error)
+    error|= maria_chk_del(&param, file, param.testflag);
+  if (!error)
+    error= maria_chk_key(&param, file);
+  if (!error)
+  {
+    if ((!(param.testflag & T_QUICK) &&
+         ((share->options &
+           (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ||
+          (param.testflag & (T_EXTEND | T_MEDIUM)))) || maria_is_crashed(file))
+    {
+      ulonglong old_testflag= param.testflag;
+      param.testflag |= T_MEDIUM;
+      if (!(error= init_io_cache(&param.read_cache, file->dfile.file,
+                                 my_default_record_cache_size, READ_CACHE,
+                                 share->pack.header_length, 1, MYF(MY_WME))))
+      {
+        error= maria_chk_data_link(&param, file,
+                                   test(param.testflag & T_EXTEND));
+        end_io_cache(&(param.read_cache));
+      }
+      param.testflag= old_testflag;
+    }
+  }
+  if (!error)
+  {
+    if ((share->state.changed & (STATE_CHANGED |
+                                 STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR |
+                                 STATE_CRASHED | STATE_NOT_ANALYZED)) ||
+        (param.testflag & T_STATISTICS) || maria_is_crashed(file))
+    {
+      file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+      pthread_mutex_lock(&share->intern_lock);
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+      if (!(table->db_stat & HA_READ_ONLY))
+        error= maria_update_state_info(&param, file,
+                                       UPDATE_TIME | UPDATE_OPEN_COUNT |
+                                       UPDATE_STAT);
+      pthread_mutex_unlock(&share->intern_lock);
+      info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+           HA_STATUS_CONST);
+    }
+  }
+  else if (!maria_is_crashed(file) && !thd->killed)
+  {
+    maria_mark_crashed(file);
+    file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+  }
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+  thd_proc_info(thd, old_proc_info);
+  return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+/*
+  Analyze the key distribution in the table.
+
+  As the table may be only locked for read, we have to take into account that
+  two threads may do an analyze at the same time!
+
+  RETURN
+    HA_ADMIN_OK              key statistics were recalculated and saved
+    HA_ADMIN_ALREADY_DONE    STATE_NOT_ANALYZED is not set; nothing to do
+    HA_ADMIN_CORRUPT         maria_chk_key() or the state update failed
+    HA_ADMIN_INTERNAL_ERROR  the HA_CHECK descriptor could not be allocated
+*/
+
+int ha_maria::analyze(THD *thd, HA_CHECK_OPT * check_opt)
+{
+  int error= 0;
+  HA_CHECK *param_ptr= (HA_CHECK*) thd->alloc(sizeof(*param_ptr));
+  MARIA_SHARE *share= file->s;
+
+  /*
+    Test the pointer returned by the allocator. A "!&param" test on a
+    reference is not a valid NULL check: a reference may never legally be
+    NULL, so the compiler is free to drop the test.
+  */
+  if (!param_ptr)
+    return HA_ADMIN_INTERNAL_ERROR;
+  HA_CHECK &param= *param_ptr;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "analyze";
+  param.db_name= table->s->db.str;
+  param.table_name= table->alias;
+  param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS |
+                   T_DONT_CHECK_CHECKSUM);
+  param.using_global_keycache= 1;
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+
+  if (!(share->state.changed & STATE_NOT_ANALYZED))
+    return HA_ADMIN_ALREADY_DONE;
+
+  error= maria_chk_key(&param, file);
+  if (!error)
+  {
+    /* Save the new statistics under the share's internal lock */
+    pthread_mutex_lock(&share->intern_lock);
+    error= maria_update_state_info(&param, file, UPDATE_STAT);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  else if (!maria_is_crashed(file) && !thd->killed)
+    maria_mark_crashed(file);          /* A real failure, not a user kill */
+  return error ? HA_ADMIN_CORRUPT : HA_ADMIN_OK;
+}
+
+
+/*
+  Restore the data file of the table from thd->lex->backup_dir and rebuild
+  the table with a quick repair.
+
+  RETURN
+    HA_ADMIN_INVALID   the source path could not be built
+    HA_ADMIN_FAILED    copying the data file failed (an error is printed)
+    otherwise          the result of repair()
+*/
+
+int ha_maria::restore(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  char source[FN_REFLEN], target[FN_REFLEN];
+  char fs_name[FN_REFLEN];
+  DBUG_ENTER("restore");
+
+  VOID(tablename_to_filename(table->s->table_name.str, fs_name,
+                             sizeof(fs_name)));
+
+  if (fn_format_relative_to_data_home(source, fs_name, thd->lex->backup_dir,
+                                      MARIA_NAME_DEXT))
+    DBUG_RETURN(HA_ADMIN_INVALID);
+
+  strxmov(target, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+
+  if (!my_copy(source, target, MYF(MY_WME)))
+  {
+    /* Data file is in place; let repair() rebuild the rest */
+    HA_CHECK_OPT repair_opt;
+    repair_opt.init();
+    repair_opt.flags |= T_VERY_SILENT | T_CALC_CHECKSUM | T_QUICK;
+    DBUG_RETURN(repair(thd, &repair_opt));
+  }
+
+  /*
+    Copy failed. The HA_CHECK used for reporting is taken from the heap and
+    not from the stack, as it may be huge and repair() allocates one too.
+  */
+  const char *errmsg= "Failed in my_copy (Error %d)";
+  HA_CHECK *report= (HA_CHECK*) my_malloc(sizeof(*report),
+                                          MYF(MY_WME | MY_FAE));
+  if (report)
+  {
+    maria_chk_init(report);
+    report->thd= thd;
+    report->op_name= "restore";
+    report->db_name= table->s->db.str;
+    report->table_name= table->s->table_name.str;
+    report->testflag= 0;
+    _ma_check_print_error(report, errmsg, my_errno);
+    my_free(report, MYF(0));
+  }
+  DBUG_RETURN(HA_ADMIN_FAILED);
+}
+
+
+/*
+  Copy the table definition file (reg_ext) and the data file
+  (MARIA_NAME_DEXT) of the table into thd->lex->backup_dir.
+
+  The data file is flushed with _ma_flush_table_files() before it is copied.
+  Existing files in the backup directory are not overwritten
+  (MY_DONT_OVERWRITE_FILE).
+
+  RETURN
+    HA_ADMIN_OK              both files were copied
+    HA_ADMIN_INVALID         a destination path could not be built
+    HA_ADMIN_FAILED          flush or copy failed
+    HA_ADMIN_INTERNAL_ERROR  the error could not be reported (out of memory)
+*/
+
+int ha_maria::backup(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  char *backup_dir= thd->lex->backup_dir;
+  char src_path[FN_REFLEN], dst_path[FN_REFLEN];
+  char table_name[FN_REFLEN];
+  int error;
+  const char *errmsg;
+  DBUG_ENTER("ha_maria::backup");
+
+  VOID(tablename_to_filename(table->s->table_name.str, table_name,
+                             sizeof(table_name)));
+
+  if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+                                      reg_ext))
+  {
+    errmsg= "Failed in fn_format() for .frm file (errno: %d)";
+    error= HA_ADMIN_INVALID;
+    goto err;
+  }
+
+  strxmov(src_path, table->s->normalized_path.str, reg_ext, NullS);
+  if (my_copy(src_path, dst_path,
+              MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+  {
+    error= HA_ADMIN_FAILED;
+    errmsg= "Failed copying .frm file (errno: %d)";
+    goto err;
+  }
+
+  /* Change extension */
+  if (fn_format_relative_to_data_home(dst_path, table_name, backup_dir,
+                                      MARIA_NAME_DEXT))
+  {
+    errmsg= "Failed in fn_format() for .MYD file (errno: %d)";
+    error= HA_ADMIN_INVALID;
+    goto err;
+  }
+
+  strxmov(src_path, table->s->normalized_path.str, MARIA_NAME_DEXT, NullS);
+  /* Make sure the on-disk data file is complete before copying it */
+  if (_ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_FORCE_WRITE,
+                            FLUSH_KEEP))
+  {
+    error= HA_ADMIN_FAILED;
+    errmsg= "Failed in flush (Error %d)";
+    goto err;
+  }
+  if (my_copy(src_path, dst_path,
+              MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_DONT_OVERWRITE_FILE)))
+  {
+    errmsg= "Failed copying .MYD file (errno: %d)";
+    error= HA_ADMIN_FAILED;
+    goto err;
+  }
+  DBUG_RETURN(HA_ADMIN_OK);
+
+err:
+  {
+    /*
+      Test the allocated pointer itself (the address of a reference is
+      never a valid NULL test) and leave through DBUG_RETURN so that the
+      DBUG_ENTER above is always balanced.
+    */
+    HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof(*param));
+    if (!param)
+      DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+
+    maria_chk_init(param);
+    param->thd= thd;
+    param->op_name= "backup";
+    param->db_name= table->s->db.str;
+    param->table_name= table->s->table_name.str;
+    param->testflag= 0;
+    _ma_check_print_error(param, errmsg, my_errno);
+    DBUG_RETURN(error);
+  }
+}
+
+
+/*
+  Repair the table, retrying with less aggressive methods on failure.
+
+  The first attempt uses T_REP (if T_EXTEND was requested) or T_REP_BY_SORT.
+  While the worker repair() fails and sets param.retry_repair, the repair is
+  retried: first without T_QUICK (adding T_SAFE_REPAIR), then with T_REP
+  instead of T_REP_BY_SORT.
+
+  RETURN
+    Result of the last repair(thd, &param, 0) call, or
+    HA_ADMIN_INTERNAL_ERROR if there is no open file or no memory.
+*/
+
+int ha_maria::repair(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr= (HA_CHECK*) thd->alloc(sizeof(*param_ptr));
+  ha_rows start_records;
+
+  /*
+    Test the pointer returned by the allocator; "!&param" on a reference
+    is not a valid NULL check and may be removed by the compiler.
+  */
+  if (!file || !param_ptr)
+    return HA_ADMIN_INTERNAL_ERROR;
+  HA_CHECK &param= *param_ptr;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "repair";
+  param.testflag= ((check_opt->flags & ~(T_EXTEND)) |
+                   T_SILENT | T_FORCE_CREATE | T_CALC_CHECKSUM |
+                   (check_opt->flags & T_EXTEND ? T_REP : T_REP_BY_SORT));
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  start_records= file->state->records;
+  while ((error= repair(thd, &param, 0)) && param.retry_repair)
+  {
+    param.retry_repair= 0;
+    if (test_all_bits(param.testflag,
+                      (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK)))
+    {
+      param.testflag&= ~(T_RETRY_WITHOUT_QUICK | T_QUICK);
+      /* Ensure we don't lose any rows when retrying without quick */
+      param.testflag|= T_SAFE_REPAIR;
+      if (thd->vio_ok())
+        _ma_check_print_info(&param, "Retrying repair without quick");
+      else
+        sql_print_information("Retrying repair of: '%s' without quick",
+                              table->s->path.str);
+      continue;
+    }
+    param.testflag &= ~T_QUICK;
+    if ((param.testflag & T_REP_BY_SORT))
+    {
+      param.testflag= (param.testflag & ~T_REP_BY_SORT) | T_REP;
+      sql_print_information("Retrying repair of: '%s' with keycache",
+                            table->s->path.str);
+      continue;
+    }
+    /* No further fallback left */
+    break;
+  }
+  if (!error && start_records != file->state->records &&
+      !(check_opt->flags & T_VERY_SILENT))
+  {
+    char llbuff[22], llbuff2[22];
+    sql_print_information("Found %s of %s rows when repairing '%s'",
+                          llstr(file->state->records, llbuff),
+                          llstr(start_records, llbuff2),
+                          table->s->path.str);
+  }
+  return error;
+}
+
+/*
+  Run maria_zerofill() on the table and, on success, save the updated
+  state (UPDATE_TIME | UPDATE_OPEN_COUNT).
+
+  RETURN
+    0                        ok
+    HA_ADMIN_INTERNAL_ERROR  no open file or out of memory
+    otherwise                error from maria_zerofill()
+*/
+
+int ha_maria::zerofill(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr= (HA_CHECK*) thd->alloc(sizeof(*param_ptr));
+  TRN *old_trn;
+  MARIA_SHARE *share;
+
+  /*
+    Validate before touching anything: 'file' must not be dereferenced
+    ahead of this test, and the allocation has to be checked through the
+    pointer ("!&param" on a reference is not a valid NULL check).
+  */
+  if (!file || !param_ptr)
+    return HA_ADMIN_INTERNAL_ERROR;
+  HA_CHECK &param= *param_ptr;
+
+  share= file->s;
+  old_trn= file->trn;
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "zerofill";
+  param.testflag= check_opt->flags | T_SILENT | T_ZEROFILL;
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  error=maria_zerofill(&param, file, share->open_file_name.str);
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+
+  if (!error)
+  {
+    pthread_mutex_lock(&share->intern_lock);
+    maria_update_state_info(&param, file, UPDATE_TIME | UPDATE_OPEN_COUNT);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  return error;
+}
+
+/*
+  Optimize the table: repair by sort with statistics and index sorting.
+
+  If the first attempt fails and param.retry_repair is set, the repair is
+  tried once more without T_REP_BY_SORT.
+
+  RETURN
+    Result of the worker repair(thd, &param, 1), or
+    HA_ADMIN_INTERNAL_ERROR if there is no open file or no memory.
+*/
+
+int ha_maria::optimize(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  int error;
+  HA_CHECK *param_ptr= (HA_CHECK*) thd->alloc(sizeof(*param_ptr));
+
+  /*
+    Test the pointer returned by the allocator; "!&param" on a reference
+    is not a valid NULL check and may be removed by the compiler.
+  */
+  if (!file || !param_ptr)
+    return HA_ADMIN_INTERNAL_ERROR;
+  HA_CHECK &param= *param_ptr;
+
+  maria_chk_init(&param);
+  param.thd= thd;
+  param.op_name= "optimize";
+  param.testflag= (check_opt->flags | T_SILENT | T_FORCE_CREATE |
+                   T_REP_BY_SORT | T_STATISTICS | T_SORT_INDEX);
+  param.sort_buffer_length= THDVAR(thd, sort_buffer_size);
+  if ((error= repair(thd, &param, 1)) && param.retry_repair)
+  {
+    sql_print_warning("Warning: Optimize table got errno %d on %s.%s, retrying",
+                      my_errno, param.db_name, param.table_name);
+    param.testflag &= ~T_REP_BY_SORT;
+    error= repair(thd, &param, 1);
+  }
+
+  return error;
+}
+
+
+/*
+  Worker for repair, optimize and index re-creation.
+
+  SYNOPSIS
+    repair()
+    thd          Thread handle
+    param        Check/repair descriptor; the caller has set op_name and
+                 testflag. db_name, table_name, thd, tmpdir and some other
+                 fields are filled in here.
+    do_optimize  If set, the rebuild is only done when the table state
+                 says there is something to optimize.
+
+  RETURN
+    HA_ADMIN_OK            something was done and it succeeded
+    HA_ADMIN_ALREADY_DONE  nothing had to be done
+    HA_ADMIN_FAILED        lock, repair or log record writing failed
+*/
+
+int ha_maria::repair(THD *thd, HA_CHECK *param, bool do_optimize)
+{
+  int error= 0;
+  ulonglong local_testflag= param->testflag;
+  /* A plain repair always counts as "done"; optimize must prove it */
+  bool optimize_done= !do_optimize, statistics_done= 0;
+  const char *old_proc_info= thd->proc_info;
+  char fixed_name[FN_REFLEN];
+  MARIA_SHARE *share= file->s;
+  ha_rows rows= file->state->records;
+  TRN *old_trn= file->trn;
+  DBUG_ENTER("ha_maria::repair");
+
+  /*
+    Normally this method is entered with a properly opened table. If the
+    repair fails, it can be repeated with more elaborate options. Under
+    special circumstances it can happen that a repair fails so that it
+    closed the data file and cannot re-open it. In this case file->dfile
+    is set to -1. We must not try another repair without an open data
+    file. (Bug #25289)
+  */
+  if (file->dfile.file == -1)
+  {
+    sql_print_information("Retrying repair of: '%s' failed. "
+                          "Please try REPAIR EXTENDED or aria_chk",
+                          table->s->path.str);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+
+  /*
+    If transactions was not enabled for a transactional table then
+    file->s->status is not up to date. This is needed for repair_by_sort
+    to work
+  */
+  if (share->base.born_transactional && !share->now_transactional)
+    _ma_copy_nontrans_state_information(file);
+
+  param->db_name= table->s->db.str;
+  param->table_name= table->alias;
+  param->tmpfile_createflag= O_RDWR | O_TRUNC;
+  param->using_global_keycache= 1;
+  param->thd= thd;
+  param->tmpdir= &mysql_tmpdir_list;
+  param->out_flag= 0;
+  strmov(fixed_name, share->open_file_name.str);
+
+  // Don't lock tables if we have used LOCK TABLE
+  if (!thd->locked_tables &&
+      maria_lock_database(file, table->s->tmp_table ? F_EXTRA_LCK : F_WRLCK))
+  {
+    _ma_check_print_error(param, ER(ER_CANT_LOCK), my_errno);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+
+  /*
+    Rebuild the table. Always done for a repair; for an optimize only if
+    the state shows unoptimized rows (BLOCK_RECORD) or deleted/split rows
+    (other formats), and either T_QUICK is off or keys/rows are flagged as
+    not optimized.
+  */
+  if (!do_optimize ||
+      (((share->data_file_type == BLOCK_RECORD) ?
+        (share->state.changed & STATE_NOT_OPTIMIZED_ROWS) :
+        (file->state->del ||
+         share->state.split != file->state->records)) &&
+       (!(param->testflag & T_QUICK) ||
+        (share->state.changed & (STATE_NOT_OPTIMIZED_KEYS |
+                                 STATE_NOT_OPTIMIZED_ROWS)))))
+  {
+    ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ?
+                        maria_get_mask_all_keys_active(share->base.keys) :
+                        share->state.key_map);
+    ulonglong save_testflag= param->testflag;
+    if (maria_test_if_sort_rep(file, file->state->records, key_map, 0) &&
+        (local_testflag & T_REP_BY_SORT))
+    {
+      local_testflag |= T_STATISTICS;
+      param->testflag |= T_STATISTICS;           // We get this for free
+      statistics_done= 1;
+      /* TODO: Remove BLOCK_RECORD test when parallel works with blocks */
+      if (THDVAR(thd,repair_threads) > 1 &&
+          share->data_file_type != BLOCK_RECORD)
+      {
+        char buf[40];
+        /* TODO: respect maria_repair_threads variable */
+        my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map));
+        thd_proc_info(thd, buf);
+        param->testflag|= T_REP_PARALLEL;
+        error= maria_repair_parallel(param, file, fixed_name,
+                                     test(param->testflag & T_QUICK));
+        /* to reset proc_info, as it was pointing to local buffer */
+        thd_proc_info(thd, "Repair done");
+      }
+      else
+      {
+        thd_proc_info(thd, "Repair by sorting");
+        param->testflag|= T_REP_BY_SORT;
+        error= maria_repair_by_sort(param, file, fixed_name,
+                                    test(param->testflag & T_QUICK));
+      }
+    }
+    else
+    {
+      thd_proc_info(thd, "Repair with keycache");
+      param->testflag &= ~(T_REP_BY_SORT | T_REP_PARALLEL);
+      error= maria_repair(param, file, fixed_name,
+                          test(param->testflag & T_QUICK));
+    }
+    /*
+      Restore the caller's flags, but keep T_RETRY_WITHOUT_QUICK if it is
+      set in param->testflag after the repair call.
+    */
+    param->testflag= save_testflag | (param->testflag & T_RETRY_WITHOUT_QUICK);
+    optimize_done= 1;
+  }
+  if (!error)
+  {
+    if ((local_testflag & T_SORT_INDEX) &&
+        (share->state.changed & STATE_NOT_SORTED_PAGES))
+    {
+      optimize_done= 1;
+      thd_proc_info(thd, "Sorting index");
+      error= maria_sort_index(param, file, fixed_name);
+    }
+    if (!statistics_done && (local_testflag & T_STATISTICS))
+    {
+      if (share->state.changed & STATE_NOT_ANALYZED)
+      {
+        optimize_done= 1;
+        thd_proc_info(thd, "Analyzing");
+        error= maria_chk_key(param, file);
+      }
+      else
+        local_testflag &= ~T_STATISTICS;        // Don't update statistics
+    }
+  }
+  thd_proc_info(thd, "Saving state");
+  /* The state is updated under the share's internal lock in both branches */
+  pthread_mutex_lock(&share->intern_lock);
+  if (!error)
+  {
+    if ((share->state.changed & STATE_CHANGED) || maria_is_crashed(file))
+    {
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+      file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+    }
+    /*
+      repair updates share->state.state. Ensure that file->state is up to date
+    */
+    if (file->state != &share->state.state)
+      *file->state= share->state.state;
+    if (share->base.auto_key)
+      _ma_update_auto_increment_key(param, file, 1);
+    if (optimize_done)
+      error= maria_update_state_info(param, file,
+                                     UPDATE_TIME | UPDATE_OPEN_COUNT |
+                                     (local_testflag &
+                                      T_STATISTICS ? UPDATE_STAT : 0));
+    info(HA_STATUS_NO_LOCK | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+         HA_STATUS_CONST, 0);
+    /* 'rows' is the record count taken on entry to this function */
+    if (rows != file->state->records && !(param->testflag & T_VERY_SILENT))
+    {
+      char llbuff[22], llbuff2[22];
+      _ma_check_print_warning(param, "Number of rows changed from %s to %s",
+                              llstr(rows, llbuff),
+                              llstr(file->state->records, llbuff2));
+      /* Abort if warning was converted to error */
+      if (current_thd->is_error())
+        error= 1;
+    }
+  }
+  else
+  {
+    maria_mark_crashed_on_repair(file);
+    file->update |= HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+    maria_update_state_info(param, file, 0);
+  }
+  pthread_mutex_unlock(&share->intern_lock);
+  thd_proc_info(thd, old_proc_info);
+  if (!thd->locked_tables)
+    maria_lock_database(file, F_UNLCK);
+
+  /* Reset trn, that may have been set by repair */
+  _ma_set_trn_for_table(file, old_trn);
+  /*
+    Map the result to an HA_ADMIN_* code. The repair log record is only
+    written when something was actually done and no error occurred.
+  */
+  error= error ? HA_ADMIN_FAILED :
+    (optimize_done ?
+     (write_log_record_for_repair(param, file) ? HA_ADMIN_FAILED :
+      HA_ADMIN_OK) : HA_ADMIN_ALREADY_DONE);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Assign table indexes to a specific key cache.
+
+  The implementation below is compiled out ("#if 0"), so this method
+  currently always returns HA_ADMIN_NOT_IMPLEMENTED.
+*/
+
+int ha_maria::assign_to_keycache(THD * thd, HA_CHECK_OPT *check_opt)
+{
+#if 0 && NOT_IMPLEMENTED
+  PAGECACHE *new_pagecache= check_opt->pagecache;
+  const char *errmsg= 0;
+  int error= HA_ADMIN_OK;
+  ulonglong map;
+  TABLE_LIST *table_list= table->pos_in_table_list;
+  DBUG_ENTER("ha_maria::assign_to_keycache");
+
+
+  table->keys_in_use_for_query.clear_all();
+
+  if (table_list->process_index_hints(table))
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  map= ~(ulonglong) 0;
+  if (!table->keys_in_use_for_query.is_clear_all())
+    /* use all keys if there's no list specified by the user through hints */
+    map= table->keys_in_use_for_query.to_ulonglong();
+
+  if ((error= maria_assign_to_pagecache(file, map, new_pagecache)))
+  {
+    /*
+      NOTE(review): errmsg is pointed at 'buf', which goes out of scope at
+      the end of this block but is used further down. Fix before enabling.
+    */
+    char buf[STRING_BUFFER_USUAL_SIZE];
+    my_snprintf(buf, sizeof(buf),
+                "Failed to flush to index file (errno: %d)", error);
+    errmsg= buf;
+    error= HA_ADMIN_CORRUPT;
+  }
+
+  if (error != HA_ADMIN_OK)
+  {
+    /* Send error to user */
+    /*
+      NOTE(review): "!&param" is not a valid NULL check for the allocation,
+      and the plain 'return' below bypasses DBUG_RETURN.
+    */
+    HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
+    if (!&param)
+      return HA_ADMIN_INTERNAL_ERROR;
+
+    maria_chk_init(&param);
+    param.thd= thd;
+    param.op_name= "assign_to_keycache";
+    param.db_name= table->s->db.str;
+    param.table_name= table->s->table_name.str;
+    param.testflag= 0;
+    _ma_check_print_error(&param, errmsg);
+  }
+  DBUG_RETURN(error);
+#else
+  return  HA_ADMIN_NOT_IMPLEMENTED;
+#endif
+}
+
+
+/*
+  Preload pages of the index file for a table into the key cache.
+
+  The keys to preload are taken from the index hints of the statement; if
+  no hints were given, all keys are preloaded.
+
+  RETURN
+    HA_ADMIN_OK              ok
+    HA_ADMIN_FAILED          bad index hints or maria_preload() failed
+    HA_ADMIN_INTERNAL_ERROR  the error could not be reported (out of memory)
+*/
+
+int ha_maria::preload_keys(THD * thd, HA_CHECK_OPT *check_opt)
+{
+  ulonglong map;
+  TABLE_LIST *table_list= table->pos_in_table_list;
+
+  DBUG_ENTER("ha_maria::preload_keys");
+
+  table->keys_in_use_for_query.clear_all();
+
+  /* Check validity of the index references */
+  if (table_list->process_index_hints(table))
+    DBUG_RETURN(HA_ADMIN_FAILED);
+
+  /* Use all keys if there's no list specified by the user through hints */
+  map= ~(ulonglong) 0;
+  if (!table->keys_in_use_for_query.is_clear_all())
+    map= table->keys_in_use_for_query.to_ulonglong();
+
+  maria_extra(file, HA_EXTRA_PRELOAD_BUFFER_SIZE,
+              (void*) &thd->variables.preload_buff_size);
+
+  int error;
+
+  if ((error= maria_preload(file, map, table_list->ignore_leaves)))
+  {
+    char buf[MYSQL_ERRMSG_SIZE+20];
+    const char *errmsg;
+
+    switch (error) {
+    case HA_ERR_NON_UNIQUE_BLOCK_SIZE:
+      errmsg= "Indexes use different block sizes";
+      break;
+    case HA_ERR_OUT_OF_MEM:
+      errmsg= "Failed to allocate buffer";
+      break;
+    default:
+      my_snprintf(buf, sizeof(buf),
+                  "Failed to read from index file (errno: %d)", my_errno);
+      errmsg= buf;
+    }
+
+    /*
+      Test the allocated pointer itself (the address of a reference is
+      never a valid NULL test) and leave through DBUG_RETURN so that the
+      DBUG_ENTER above is always balanced.
+    */
+    HA_CHECK *param= (HA_CHECK*) thd->alloc(sizeof(*param));
+    if (!param)
+      DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+
+    maria_chk_init(param);
+    param->thd= thd;
+    param->op_name= "preload_keys";
+    param->db_name= table->s->db.str;
+    param->table_name= table->s->table_name.str;
+    param->testflag= 0;
+    _ma_check_print_error(param, "%s", errmsg);
+    DBUG_RETURN(HA_ADMIN_FAILED);
+  }
+  DBUG_RETURN(HA_ADMIN_OK);
+}
+
+
+/*
+  Disable indexes, making it persistent if requested.
+
+  SYNOPSIS
+    disable_indexes()
+    mode        mode of operation:
+                HA_KEY_SWITCH_NONUNIQ      disable all non-unique keys
+                HA_KEY_SWITCH_ALL          disable all keys
+                HA_KEY_SWITCH_NONUNIQ_SAVE dis. non-uni. and make persistent
+                HA_KEY_SWITCH_ALL_SAVE     dis. all keys and make persistent
+
+  IMPLEMENTATION
+    HA_KEY_SWITCH_NONUNIQ       is not implemented.
+    HA_KEY_SWITCH_ALL_SAVE      is not implemented.
+
+  RETURN
+    0  ok
+    HA_ERR_WRONG_COMMAND  mode not implemented.
+*/
+
+int ha_maria::disable_indexes(uint mode)
+{
+  /* Let the storage engine switch the key map */
+  if (mode == HA_KEY_SWITCH_ALL)
+    return maria_disable_indexes(file);
+
+  /* Any other mode than HA_KEY_SWITCH_NONUNIQ_SAVE is not implemented */
+  if (mode != HA_KEY_SWITCH_NONUNIQ_SAVE)
+    return HA_ERR_WRONG_COMMAND;
+
+  maria_extra(file, HA_EXTRA_NO_KEYS, 0);
+  info(HA_STATUS_CONST);                      // Read new key info
+  return 0;
+}
+
+
+/*
+  Enable indexes, making it persistent if requested.
+
+  SYNOPSIS
+    enable_indexes()
+    mode        mode of operation:
+                HA_KEY_SWITCH_NONUNIQ      enable all non-unique keys
+                HA_KEY_SWITCH_ALL          enable all keys
+                HA_KEY_SWITCH_NONUNIQ_SAVE en. non-uni. and make persistent
+                HA_KEY_SWITCH_ALL_SAVE     en. all keys and make persistent
+
+  DESCRIPTION
+    Enable indexes, which might have been disabled by disable_index() before.
+    The modes without _SAVE work only if both data and indexes are empty,
+    since the MARIA repair would enable them persistently.
+    To be sure in these cases, call handler::delete_all_rows() before.
+
+  IMPLEMENTATION
+    HA_KEY_SWITCH_NONUNIQ       is not implemented.
+    HA_KEY_SWITCH_ALL_SAVE      is not implemented.
+
+  RETURN
+    0  ok
+    !=0  Error, among others:
+    HA_ERR_CRASHED  data or index is non-empty. Delete all rows and retry.
+    HA_ERR_WRONG_COMMAND  mode not implemented.
+*/
+
+int ha_maria::enable_indexes(uint mode)
+{
+  int error;
+  DBUG_PRINT("info", ("ha_maria::enable_indexes mode: %d", mode));
+  if (maria_is_all_keys_active(file->s->state.key_map, file->s->base.keys))
+  {
+    /* All indexes are enabled already. */
+    return 0;
+  }
+
+  if (mode == HA_KEY_SWITCH_ALL)
+  {
+    error= maria_enable_indexes(file);
+    /*
+       Do not try to repair on error,
+       as this could make the enabled state persistent,
+       but mode==HA_KEY_SWITCH_ALL forbids it.
+    */
+  }
+  else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
+  {
+    THD *thd= current_thd;
+    /*
+      Test the pointer returned by the allocator; "!&param" on a reference
+      is not a valid NULL check and may be removed by the compiler.
+    */
+    HA_CHECK *param_ptr= (HA_CHECK*) thd->alloc(sizeof(*param_ptr));
+    if (!param_ptr)
+      return HA_ADMIN_INTERNAL_ERROR;
+    HA_CHECK &param= *param_ptr;
+
+    const char *save_proc_info= thd_proc_info(thd, "Creating index");
+
+    /* Re-create the missing keys with a quick repair by sort */
+    maria_chk_init(&param);
+    param.op_name= "recreating_index";
+    param.testflag= (T_SILENT | T_REP_BY_SORT | T_QUICK |
+                     T_CREATE_MISSING_KEYS | T_SAFE_REPAIR);
+    if (bulk_insert_single_undo == BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR)
+    {
+      bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_REPAIR;
+      /*
+        Don't bump create_rename_lsn, because UNDO_BULK_INSERT
+        should not be skipped in case of crash during repair.
+      */
+      param.testflag|= T_NO_CREATE_RENAME_LSN;
+    }
+    param.myf_rw &= ~MY_WAIT_IF_FULL;
+    param.sort_buffer_length= THDVAR(thd,sort_buffer_size);
+    param.stats_method= (enum_handler_stats_method)THDVAR(thd,stats_method);
+    param.tmpdir= &mysql_tmpdir_list;
+    if ((error= (repair(thd, &param, 0) != HA_ADMIN_OK)) && param.retry_repair)
+    {
+      sql_print_warning("Warning: Enabling keys got errno %d on %s.%s, "
+                        "retrying",
+                        my_errno, param.db_name, param.table_name);
+      /* This should never fail normally */
+      DBUG_ASSERT(thd->killed != 0);
+      /* Repairing by sort failed. Now try standard repair method. */
+      param.testflag &= ~T_REP_BY_SORT;
+      error= (repair(thd, &param, 0) != HA_ADMIN_OK);
+      /*
+        If the standard repair succeeded, clear all error messages which
+        might have been set by the first repair. They can still be seen
+        with SHOW WARNINGS then.
+      */
+      if (!error)
+        thd->clear_error();
+    }
+    info(HA_STATUS_CONST);
+    thd_proc_info(thd, save_proc_info);
+  }
+  else
+  {
+    /* mode not implemented */
+    error= HA_ERR_WRONG_COMMAND;
+  }
+  /* Debug-only hooks used to test log flushing and crash recovery */
+  DBUG_EXECUTE_IF("maria_flush_whole_log",
+                  {
+                    DBUG_PRINT("maria_flush_whole_log", ("now"));
+                    translog_flush(translog_get_horizon());
+                  });
+  DBUG_EXECUTE_IF("maria_crash_enable_index",
+                  {
+                    DBUG_PRINT("maria_crash_enable_index", ("now"));
+                    DBUG_ABORT();
+                  });
+  return error;
+}
+
+
+/*
+  Test if indexes are disabled.
+
+
+  SYNOPSIS
+    indexes_are_disabled()
+      no parameters
+
+
+  RETURN
+    0  indexes are not disabled
+    1  all indexes are disabled
+   [2  non-unique indexes are disabled - NOT YET IMPLEMENTED]
+*/
+
+int ha_maria::indexes_are_disabled(void)
+{
+  /* The answer comes straight from the storage engine */
+  const int disabled= maria_indexes_are_disabled(file);
+  return disabled;
+}
+
+
+/*
+  prepare for a many-rows insert operation
+  e.g. - disable indexes (if they can be recreated fast) or
+  activate special bulk-insert optimizations
+
+  SYNOPSIS
+    start_bulk_insert(rows)
+    rows        Rows to be inserted
+                0 if we don't know
+
+  NOTICE
+    Do not forget to call end_bulk_insert() later!
+*/
+
+void ha_maria::start_bulk_insert(ha_rows rows)
+{
+  DBUG_ENTER("ha_maria::start_bulk_insert");
+  THD *thd= current_thd;
+  /*
+    Write cache size: the session's read_buff_size, capped by the estimated
+    amount of row data (avg_row_length * rows).
+  */
+  ulong size= min(thd->variables.read_buff_size,
+                  (ulong) (table->s->avg_row_length * rows));
+  MARIA_SHARE *share= file->s;
+  DBUG_PRINT("info", ("start_bulk_insert: rows %lu size %lu",
+                      (ulong) rows, size));
+
+  /* don't enable row cache if too few rows */
+  if (!rows || (rows > MARIA_MIN_ROWS_TO_USE_WRITE_CACHE))
+    maria_extra(file, HA_EXTRA_WRITE_CACHE, (void*) &size);
+
+  /* Both members set here are read again by end_bulk_insert() */
+  can_enable_indexes= (maria_is_all_keys_active(share->state.key_map,
+                                                share->base.keys));
+  bulk_insert_single_undo= BULK_INSERT_NONE;
+
+  if (!(specialflag & SPECIAL_SAFE_MODE))
+  {
+    /*
+       Only disable old index if the table was empty and we are inserting
+       a lot of rows.
+       We should not do this for only a few rows as this is slower and
+       we don't want to update the key statistics based of only a few rows.
+       Index file rebuild requires an exclusive lock, so if versioning is on
+       don't do it (see how ha_maria::store_lock() tries to predict repair).
+       We can repair index only if we have an exclusive (TL_WRITE) lock. To
+       see if table is empty, we shouldn't rely on the old records' count from
+       our transaction's start (if that old count is 0 but now there are
+       records in the table, we would wrongly destroy them).
+       So we need to look at share->state.state.records.
+       As a safety net for now, we don't remove the test of
+       file->state->records, because there is uncertainty on what will happen
+       during repair if the two states disagree.
+    */
+    if ((file->state->records == 0) &&
+        (share->state.state.records == 0) && can_enable_indexes &&
+        (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES) &&
+        (file->lock.type == TL_WRITE))
+    {
+      /**
+         @todo for a single-row INSERT SELECT, we will go into repair, which
+         is more costly (flushes, syncs) than a row write.
+      */
+      maria_disable_non_unique_index(file, rows);
+      if (share->now_transactional)
+      {
+        /*
+          Order matters: mark the mode, write the bulk insert log record,
+          and only then turn logging off for the table.
+        */
+        bulk_insert_single_undo= BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR;
+        write_log_record_for_bulk_insert(file);
+        _ma_tmp_disable_logging_for_table(file, TRUE);
+        /*
+          Pages currently in the page cache have type PAGECACHE_LSN_PAGE, we
+          are not allowed to overwrite them with PAGECACHE_PLAIN_PAGE, so
+          throw them away. It is not losing data, because we just wrote and
+          forced an UNDO which will for sure empty the table if we crash. The
+          upcoming unique-key insertions however need a proper index, so we
+          cannot leave the corrupted on-disk index file, thus we truncate it.
+        */
+        maria_delete_all_rows(file);
+      }
+    }
+    else if (!file->bulk_insert &&
+             (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT))
+    {
+      /* Indexes stay enabled: use the bulk insert buffer instead */
+      maria_init_bulk_insert(file, thd->variables.bulk_insert_buff_size, rows);
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  end special bulk-insert optimizations,
+  which have been activated by start_bulk_insert().
+
+  SYNOPSIS
+    end_bulk_insert()
+    no arguments
+
+  RETURN
+    0     OK
+    != 0  Error
+*/
+
+int ha_maria::end_bulk_insert()
+{
+  int rc;
+  DBUG_ENTER("ha_maria::end_bulk_insert");
+
+  maria_end_bulk_insert(file);
+
+  /* Indexes are only re-enabled if the write cache was shut down cleanly */
+  rc= maria_extra(file, HA_EXTRA_NO_CACHE, 0);
+  if (!rc && can_enable_indexes && !file->s->deleting)
+    rc= enable_indexes(HA_KEY_SWITCH_NONUNIQ_SAVE);
+
+  if (bulk_insert_single_undo != BULK_INSERT_NONE)
+  {
+    DBUG_ASSERT(can_enable_indexes);
+    /*
+      Table was transactional just before start_bulk_insert().
+      Pages only have to be flushed if no repair was done, as a repair
+      has flushed them already.
+    */
+    const bool no_repair_done= (bulk_insert_single_undo ==
+                                BULK_INSERT_SINGLE_UNDO_AND_NO_REPAIR);
+    rc|= _ma_reenable_logging_for_table(file, no_repair_done);
+  }
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Check the table and repair it automatically if needed.
+
+  A table that is only flagged as moved is zerofilled. Otherwise, if
+  maria_recover_options is set, the table is checked (unless it is already
+  marked as crashed) and repaired if the check reports a problem.
+
+  RETURN
+    0  table is usable
+    1  zerofill/repair failed or auto-repair is not enabled
+*/
+
+bool ha_maria::check_and_repair(THD *thd)
+{
+  int result, needs_repair;
+  LEX_STRING saved_query;
+  HA_CHECK_OPT opt;
+  DBUG_ENTER("ha_maria::check_and_repair");
+
+  opt.init();
+
+  result= 1;
+  const bool only_moved=
+    ((file->s->state.changed &
+      (STATE_CRASHED | STATE_CRASHED_ON_REPAIR | STATE_MOVED)) ==
+     STATE_MOVED);
+  if (only_moved)
+  {
+    sql_print_information("Zerofilling moved table:  '%s'",
+                          table->s->path.str);
+    result= zerofill(thd, &opt);
+    if (!result)
+      DBUG_RETURN(0);
+  }
+
+  /*
+    Getting this far means the table is crashed, but there is no
+    auto-repair unless maria_recover_options is set.
+  */
+  if (!maria_recover_options)
+    DBUG_RETURN(result);
+
+  result= 0;
+  opt.flags= T_MEDIUM | T_AUTO_REPAIR;
+  // Quick mode is not used if there are deleted rows
+  if (!file->state->del && (maria_recover_options & HA_RECOVER_QUICK))
+    opt.flags |= T_QUICK;
+
+  /* Show the table name as the running query while checking/repairing */
+  saved_query= thd->query_string;
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query_string= table->s->table_name;
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  needs_repair= maria_is_crashed(file);
+  if (!needs_repair)
+  {
+    sql_print_warning("Checking table:   '%s'", table->s->path.str);
+    needs_repair= check(thd, &opt);
+  }
+
+  if (needs_repair)
+  {
+    sql_print_warning("Recovering table: '%s'", table->s->path.str);
+    opt.flags=
+      ((maria_recover_options & HA_RECOVER_BACKUP ? T_BACKUP_DATA : 0) |
+       (maria_recover_options & HA_RECOVER_FORCE ? 0 : T_SAFE_REPAIR) |
+       T_AUTO_REPAIR);
+    if (repair(thd, &opt))
+      result= 1;
+  }
+
+  pthread_mutex_lock(&LOCK_thread_count);
+  thd->query_string= saved_query;
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_RETURN(result);
+}
+
+
+bool ha_maria::is_crashed() const
+{
+  /* Explicitly flagged as crashed or moved */
+  if (file->s->state.changed & (STATE_CRASHED | STATE_MOVED))
+    return true;
+  /* With locking disabled, a non-zero open count also counts as crashed */
+  return my_disable_locking && file->s->state.open_count;
+}
+
+/*
+  Guard for operations that are rejected under a
+  TL_WRITE_CONCURRENT_INSERT lock.
+
+  If file->lock.type is TL_WRITE_CONCURRENT_INSERT, ER_CHECK_NOT_IMPLEMENTED
+  is raised with 'msg' as argument and the ENCLOSING function returns 1
+  (the 'return' is part of the macro expansion). The macro relies on a
+  'file' identifier being in scope at the place of use.
+*/
+#define CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING(msg) \
+  do { \
+    if (file->lock.type == TL_WRITE_CONCURRENT_INSERT) \
+    { \
+      my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), msg); \
+      return 1; \
+    } \
+  } while(0)
+
+/**
+  Update one row.
+
+  Refused (returns 1) under TL_WRITE_CONCURRENT_INSERT. Sets the automatic
+  timestamp field first when the table asks for it on update.
+
+  @param old_data  Row image before the update
+  @param new_data  Row image after the update
+
+  @return result of maria_update(), or 1 if refused
+*/
+
+int ha_maria::update_row(const uchar * old_data, uchar * new_data)
+{
+  int res;
+  CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("UPDATE in WRITE CONCURRENT");
+  ha_statistic_increment(&SSV::ha_update_count);
+  if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+  {
+    table->timestamp_field->set_time();
+  }
+  res= maria_update(file, old_data, new_data);
+  return res;
+}
+
+
+/**
+  Delete one row.
+
+  Refused (returns 1) under TL_WRITE_CONCURRENT_INSERT.
+
+  @param buf  Image of the row to delete
+
+  @return result of maria_delete(), or 1 if refused
+*/
+
+int ha_maria::delete_row(const uchar * buf)
+{
+  int res;
+  CHECK_UNTIL_WE_FULLY_IMPLEMENTED_VERSIONING("DELETE in WRITE CONCURRENT");
+  ha_statistic_increment(&SSV::ha_delete_count);
+  res= maria_delete(file, buf);
+  return res;
+}
+
+C_MODE_START
+
+/*
+  Index condition callback installed with ma_set_index_cond_func().
+
+  'arg' is the ha_maria handler that installed the callback. Returns
+  ICP_OUT_OF_RANGE when the handler has an end_range and compare_key2()
+  reports the current key is past it; otherwise ICP_MATCH or ICP_NO_MATCH
+  depending on the value of the pushed index condition.
+*/
+ICP_RESULT index_cond_func_maria(void *arg)
+{
+  ha_maria *handler= (ha_maria*) arg;
+
+  /* caller should return HA_ERR_END_OF_FILE already */
+  if (handler->end_range && handler->compare_key2(handler->end_range) > 0)
+    return ICP_OUT_OF_RANGE;
+
+  if (handler->pushed_idx_cond->val_int())
+    return ICP_MATCH;
+  return ICP_NO_MATCH;
+}
+
+C_MODE_END
+
+/**
+  Read a row through the active index using the given key.
+
+  @param buf          Buffer receiving the row
+  @param key          Key value to search for
+  @param keypart_map  Which key parts of 'key' are used
+  @param find_flag    Search mode
+
+  @return result of maria_rkey(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_read_map(uchar * buf, const uchar * key,
+                             key_part_map keypart_map,
+                             enum ha_rkey_function find_flag)
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  res= maria_rkey(file, buf, active_index, key, keypart_map, find_flag);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read a row through an explicitly given index (not the active one).
+
+  @param buf          Buffer receiving the row
+  @param index        Index number to search in
+  @param key          Key value to search for
+  @param keypart_map  Which key parts of 'key' are used
+  @param find_flag    Search mode
+
+  @return result of maria_rkey(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_read_idx_map(uchar * buf, uint index, const uchar * key,
+                                 key_part_map keypart_map,
+                                 enum ha_rkey_function find_flag)
+{
+  int res;
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  res= maria_rkey(file, buf, index, key, keypart_map, find_flag);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read through the active index with search mode HA_READ_PREFIX_LAST.
+
+  @param buf          Buffer receiving the row
+  @param key          Key prefix to search for
+  @param keypart_map  Which key parts of 'key' are used
+
+  @return result of maria_rkey(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_read_last_map(uchar * buf, const uchar * key,
+                                  key_part_map keypart_map)
+{
+  int res;
+  DBUG_ENTER("ha_maria::index_read_last_map");
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_key_count);
+  res= maria_rkey(file, buf, active_index, key, keypart_map,
+                  HA_READ_PREFIX_LAST);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Read the next row in the active index.
+
+  @param buf  Buffer receiving the row
+
+  @return result of maria_rnext(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_next(uchar * buf)
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_next_count);
+  res= maria_rnext(file, buf, active_index);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read the previous row in the active index.
+
+  @param buf  Buffer receiving the row
+
+  @return result of maria_rprev(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_prev(uchar * buf)
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_prev_count);
+  res= maria_rprev(file, buf, active_index);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read the first row in the active index.
+
+  @param buf  Buffer receiving the row
+
+  @return result of maria_rfirst(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_first(uchar * buf)
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_first_count);
+  res= maria_rfirst(file, buf, active_index);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read the last row in the active index.
+
+  @param buf  Buffer receiving the row
+
+  @return result of maria_rlast(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_last(uchar * buf)
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_last_count);
+  res= maria_rlast(file, buf, active_index);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Read the next row with the same key as the previous index read.
+
+  Rows for which maria_rnext_same() reports HA_ERR_RECORD_DELETED are
+  skipped. The 'key' and 'length' arguments are not used.
+
+  @param buf  Buffer receiving the row
+
+  @return last result of maria_rnext_same(); table->status is
+          STATUS_NOT_FOUND when it is non-zero, 0 otherwise
+*/
+
+int ha_maria::index_next_same(uchar * buf,
+                              const uchar *key __attribute__ ((unused)),
+                              uint length __attribute__ ((unused)))
+{
+  int res;
+  DBUG_ASSERT(inited == INDEX);
+  ha_statistic_increment(&SSV::ha_read_next_count);
+  /*
+    TODO: Delete this loop in Maria 1.5 as versioning will ensure this never
+    happens
+  */
+  for (;;)
+  {
+    res= maria_rnext_same(file, buf);
+    if (res != HA_ERR_RECORD_DELETED)
+      break;
+  }
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Start index access on index 'idx'.
+
+  Makes 'idx' the active index and, if an index condition was pushed for
+  exactly this index, installs index_cond_func_maria() as the condition
+  callback. The 'sorted' argument is not used.
+
+  @return always 0
+*/
+
+int ha_maria::index_init(uint idx, bool sorted)
+{
+  active_index= idx;
+  if (idx == pushed_idx_cond_keyno)
+  {
+    ma_set_index_cond_func(file, index_cond_func_maria, this);
+  }
+  return 0;
+}
+
+
+/**
+  End index access.
+
+  Resets the active index to MAX_KEY, removes the index condition callback,
+  clears in_range_check_pushed_down and closes the DS-MRR state.
+
+  @return always 0
+*/
+
+int ha_maria::index_end()
+{
+  active_index= MAX_KEY;
+  /* No index condition callback outside of an index scan */
+  ma_set_index_cond_func(file, NULL, 0);
+  in_range_check_pushed_down= FALSE;
+  ds_mrr.dsmrr_close();
+  return 0;
+}
+
+
+/**
+  Prepare for reading rows by scan or by position.
+
+  @param scan  TRUE to initialise a table scan; FALSE to only reset the
+               handler (which frees buffers)
+
+  @return result of maria_scan_init() or maria_reset()
+*/
+
+int ha_maria::rnd_init(bool scan)
+{
+  if (!scan)
+    return maria_reset(file);                      // Free buffers
+  return maria_scan_init(file);
+}
+
+
+/**
+  End a random/scan read: closes the DS-MRR state and ends the scan.
+
+  @return always 0
+*/
+
+int ha_maria::rnd_end()
+{
+  ds_mrr.dsmrr_close();
+
+  /* This is safe to call even when no scan was ever started */
+  maria_scan_end(file);
+
+  return 0;
+}
+
+
+/**
+  Read the next row of a table scan.
+
+  @param buf  Buffer receiving the row
+
+  @return result of maria_scan(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::rnd_next(uchar *buf)
+{
+  int res;
+  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+  res= maria_scan(file, buf);
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Save the current scan position in remember_pos, using the share's
+  scan_remember_pos hook.
+
+  @return whatever the hook returns
+*/
+
+int ha_maria::remember_rnd_pos()
+{
+  int res= file->s->scan_remember_pos(file, &remember_pos);
+  return res;
+}
+
+
+/**
+  Go back to the scan position saved in remember_pos (through the share's
+  scan_restore_pos hook) and read the next row from there.
+
+  @param buf  Buffer receiving the row
+
+  @return result of rnd_next()
+*/
+
+int ha_maria::restart_rnd_next(uchar *buf)
+{
+  file->s->scan_restore_pos(file, remember_pos);
+  int res= rnd_next(buf);
+  return res;
+}
+
+
+/**
+  Read the row at a position previously stored with position().
+
+  @param buf  Buffer receiving the row
+  @param pos  Packed row position, ref_length bytes long
+
+  @return result of maria_rrnd(); table->status is STATUS_NOT_FOUND when
+          it is non-zero, 0 otherwise
+*/
+
+int ha_maria::rnd_pos(uchar *buf, uchar *pos)
+{
+  int res;
+  ha_statistic_increment(&SSV::ha_read_rnd_count);
+  res= maria_rrnd(file, buf, my_get_ptr(pos, ref_length));
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  Store the position of the current row into 'ref' (ref_length bytes).
+
+  The 'record' argument is not used; the position is taken from
+  maria_position().
+*/
+
+void ha_maria::position(const uchar *record)
+{
+  my_off_t current_pos;
+
+  current_pos= maria_position(file);
+  my_store_ptr(ref, ref_length, current_pos);
+}
+
+
+/**
+  Refresh handler statistics.
+
+  Forwards to the two-argument info(); the table share is locked during
+  the update unless the table is a temporary table.
+
+  @param flag  HA_STATUS_* bits selecting what to refresh
+
+  @return result of info(flag, lock_table_share)
+*/
+
+int ha_maria::info(uint flag)
+{
+  my_bool lock_share= (table->s->tmp_table == NO_TMP_TABLE);
+  return info(flag, lock_share);
+}
+
+/**
+  Refresh handler statistics from maria_status().
+
+  Depending on the bits in 'flag':
+   - HA_STATUS_VARIABLE: row counts, file lengths, check time, mean row
+     length are copied into 'stats'.
+   - HA_STATUS_CONST: maximum file lengths, create time, ref_length, block
+     size and MRR record length are set; usable keys, record offset and
+     rec_per_key in the table share are updated; data_file_name and
+     index_file_name are set only if they differ from the generated names.
+   - HA_STATUS_ERRKEY: errkey and dup_ref are set.
+  update_time and auto_increment_value are always refreshed.
+
+  @param flag              HA_STATUS_* bits
+  @param lock_table_share  If set, share->mutex is held while the share is
+                           updated
+
+  @return always 0 (the result of maria_status() is ignored)
+*/
+
+int ha_maria::info(uint flag, my_bool lock_table_share)
+{
+  MARIA_INFO maria_info;
+  char name_buff[FN_REFLEN];
+
+  (void) maria_status(file, &maria_info, flag);
+  if (flag & HA_STATUS_VARIABLE)
+  {
+    stats.records=           maria_info.records;
+    stats.deleted=           maria_info.deleted;
+    stats.data_file_length=  maria_info.data_file_length;
+    stats.index_file_length= maria_info.index_file_length;
+    stats.delete_length=     maria_info.delete_length;
+    stats.check_time=        maria_info.check_time;
+    stats.mean_rec_length=   maria_info.mean_reclength;
+  }
+  if (flag & HA_STATUS_CONST)
+  {
+    TABLE_SHARE *share= table->s;
+    stats.max_data_file_length=  maria_info.max_data_file_length;
+    stats.max_index_file_length= maria_info.max_index_file_length;
+    stats.create_time= maria_info.create_time;
+    ref_length= maria_info.reflength;
+    share->db_options_in_use= maria_info.options;
+    stats.block_size= maria_block_size;
+    stats.mrr_length_per_rec= maria_info.reflength + 8; // 8 = max(sizeof(void *))
+
+    /* Update share */
+    if (lock_table_share)
+      pthread_mutex_lock(&share->mutex);
+    share->keys_in_use.set_prefix(share->keys);
+    share->keys_in_use.intersect_extended(maria_info.key_map);
+    share->keys_for_keyread.intersect(share->keys_in_use);
+    share->db_record_offset= maria_info.record_offset;
+    if (share->key_parts)
+    {
+      /*
+        Copy the per-key-part estimates, rounding each double to the
+        nearest integer.
+      */
+      ulong *to= table->key_info[0].rec_per_key, *end;
+      double *from= maria_info.rec_per_key;
+      for (end= to+ share->key_parts ; to < end ; to++, from++)
+        *to= (ulong) (*from + 0.5);
+    }
+    if (lock_table_share)
+      pthread_mutex_unlock(&share->mutex);
+
+    /*
+       Set data_file_name and index_file_name to point at the symlink value
+       if the table is symlinked (i.e. the real name is not the same as the
+       generated name)
+    */
+    data_file_name= index_file_name= 0;
+    fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_DEXT,
+              MY_APPEND_EXT | MY_UNPACK_FILENAME);
+    if (strcmp(name_buff, maria_info.data_file_name))
+      data_file_name =maria_info.data_file_name;
+    fn_format(name_buff, file->s->open_file_name.str, "", MARIA_NAME_IEXT,
+              MY_APPEND_EXT | MY_UNPACK_FILENAME);
+    if (strcmp(name_buff, maria_info.index_file_name))
+      index_file_name=maria_info.index_file_name;
+  }
+  if (flag & HA_STATUS_ERRKEY)
+  {
+    errkey= maria_info.errkey;
+    my_store_ptr(dup_ref, ref_length, maria_info.dup_key_pos);
+  }
+  /* Faster to always update, than to do it based on flag */
+  stats.update_time= maria_info.update_time;
+  stats.auto_increment_value= maria_info.auto_increment;
+
+  return 0;
+}
+
+
+/**
+  Pass an extra hint/operation down to maria_extra().
+
+  HA_EXTRA_KEYREAD is ignored in SPECIAL_SAFE_MODE. file->trn is restored
+  to the value it had on entry before returning.
+
+  @param operation  The operation to perform
+
+  @return 0 if the operation was ignored, else result of maria_extra()
+*/
+
+int ha_maria::extra(enum ha_extra_function operation)
+{
+  int res;
+  TRN *saved_trn= file->trn;
+
+  if (operation == HA_EXTRA_KEYREAD && (specialflag & SPECIAL_SAFE_MODE))
+    return 0;
+#ifdef NOT_USED
+  if (operation == HA_EXTRA_MMAP && !opt_maria_use_mmap)
+    return 0;
+#endif
+
+  /*
+    We have to set file->trn here because in some cases we call
+    extern_lock(F_UNLOCK) (which resets file->trn) followed by maria_close()
+    without calling commit/rollback in between.  If file->trn is not set
+    we can't remove file->share from the transaction list in the extra() call.
+
+    table->in_use is not set when this is done as part of closefrm() during
+    drop table.
+  */
+  if (operation == HA_EXTRA_PREPARE_FOR_DROP ||
+      operation == HA_EXTRA_PREPARE_FOR_RENAME)
+  {
+    if (file->s->now_transactional && !file->trn && table->in_use)
+    {
+      /* THD_TRN refers to a variable named 'thd' */
+      THD *thd= table->in_use;
+      TRN *trn= THD_TRN;
+      _ma_set_trn_for_table(file, trn);
+    }
+  }
+
+  res= maria_extra(file, operation, 0);
+  file->trn= saved_trn;                         // Reset trn if was used
+  return res;
+}
+
+/**
+  Reset the handler between statements.
+
+  Forgets any pushed index condition, closes the DS-MRR state and, if the
+  table has a transaction, clears its TRN_STATE_INFO_LOGGED flag.
+
+  @return result of maria_reset()
+*/
+
+int ha_maria::reset(void)
+{
+  pushed_idx_cond= NULL;
+  pushed_idx_cond_keyno= MAX_KEY;
+  ma_set_index_cond_func(file, NULL, 0);
+  ds_mrr.dsmrr_close();
+
+  TRN *trn= file->trn;
+  if (trn)
+  {
+    /* Next statement is a new statement. Ensure it's logged */
+    trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+  }
+
+  return maria_reset(file);
+}
+
+/**
+  Variant of extra() that carries a cache size.
+  To be used with WRITE_CACHE and EXTRA_CACHE.
+
+  HA_EXTRA_WRITE_CACHE is ignored in SPECIAL_SAFE_MODE.
+
+  @param operation   The operation to perform
+  @param cache_size  Cache size, passed by address to maria_extra()
+
+  @return 0 if the operation was ignored, else result of maria_extra()
+*/
+
+int ha_maria::extra_opt(enum ha_extra_function operation, ulong cache_size)
+{
+  if (operation == HA_EXTRA_WRITE_CACHE && (specialflag & SPECIAL_SAFE_MODE))
+  {
+    return 0;
+  }
+  int res= maria_extra(file, operation, (void*) &cache_size);
+  return res;
+}
+
+
+/**
+  Delete all rows of the table in one operation, when that is allowed.
+
+  For a table that is currently transactional the fast path is refused
+  when the session is not in autocommit mode or has locked tables: the
+  rows must then be deleted one by one so the command can be rolled back.
+
+  @return HA_ERR_WRONG_COMMAND if refused, else result of
+          maria_delete_all_rows()
+*/
+
+int ha_maria::delete_all_rows()
+{
+  THD *thd= current_thd;
+  (void) translog_log_debug_info(file->trn, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query(), thd->query_length());
+
+  if (!file->s->now_transactional)
+    return maria_delete_all_rows(file);
+
+  /*
+    We are not in autocommit mode or user have done LOCK TABLES.
+    We must do the delete row by row to be able to rollback the command
+  */
+  if ((table->in_use->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) ||
+      table->in_use->locked_tables)
+    return HA_ERR_WRONG_COMMAND;
+
+  return maria_delete_all_rows(file);
+}
+
+
+/**
+  Delete the table 'name', after writing the current query text to the
+  transaction log as debug info.
+
+  @param name  Table name as given by the caller
+
+  @return result of maria_delete_table()
+*/
+
+int ha_maria::delete_table(const char *name)
+{
+  int res;
+  THD *thd= current_thd;
+
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query(), thd->query_length());
+  res= maria_delete_table(name);
+  return res;
+}
+
+
+/**
+  Close this handler and delete the table 'name'.
+
+  This is mainly for temporary tables, so no logging is necessary. Errors
+  from close() and maria_delete_table() are ignored.
+*/
+
+void ha_maria::drop_table(const char *name)
+{
+  (void) close();
+
+  (void) maria_delete_table(name);
+}
+
+
+/**
+  Lock or unlock the table on behalf of a statement.
+
+  For tables that were created transactional (born_transactional):
+   - On lock (lock_type != F_UNLCK): remembers 'thd' in file->external_ptr,
+     refreshes file->state for non-versioned tables, increments the
+     transaction's locked-tables counter if file->trn is already set, and
+     disables logging for the table when thd->transaction.on is false.
+   - On unlock (F_UNLCK): re-enables logging, clears file->trn, points
+     file->state back at the share's state and, when the transaction's
+     locked-tables counter drops to zero, either commits
+     (MARIA_CANNOT_ROLLBACK) or, in autocommit mode, rolls the transaction
+     back and clears THD_TRN.
+
+  Finally calls maria_lock_database(); for temporary tables any lock other
+  than F_UNLCK is turned into F_EXTRA_LCK.
+
+  @param thd        Thread handle
+  @param lock_type  F_UNLCK to unlock, otherwise the lock to take
+
+  @return 1 if re-enabling logging or the commit fails, else the result of
+          maria_lock_database()
+*/
+
+int ha_maria::external_lock(THD *thd, int lock_type)
+{
+  DBUG_ENTER("ha_maria::external_lock");
+  /*
+    We don't test now_transactional because it may vary between lock/unlock
+    and thus confuse our reference counting.
+    It is critical to skip non-transactional tables: user-visible temporary
+    tables get an external_lock() when read/written for the first time, but no
+    corresponding unlock (they just stay locked and are later dropped while
+    locked); if a tmp table was transactional, "SELECT FROM non_tmp, tmp"
+    would never commit as its "locked_tables" count would stay 1.
+    When Maria has has_transactions()==TRUE, open_temporary_table()
+    (sql_base.cc) will use TRANSACTIONAL_TMP_TABLE and thus the
+    external_lock(F_UNLCK) will happen and we can then allow the user to
+    create transactional temporary tables.
+  */
+  if (file->s->base.born_transactional)
+  {
+    /* Transactional table */
+    if (lock_type != F_UNLCK)
+    {
+      file->external_ptr= thd;                  // For maria_register_trn()
+
+      if (!file->s->lock_key_trees)             // If we don't use versioning
+      {
+        /*
+          We come here in the following cases:
+           - The table is a temporary table
+           - It's a table which is crash safe but not yet versioned, for
+             example a table with fulltext or rtree keys
+
+          Set the current state to point to save_state so that the
+          block_format code don't count the same record twice.
+          Copy also the current state. This may have been wrong if the
+          same file was used several times in the last statement
+        */
+        file->state=  file->state_start;
+        *file->state= file->s->state.state;
+      }
+
+      if (file->trn)
+      {
+        /* This can only happen with tables created with clone() */
+        DBUG_ASSERT(cloned);
+        trnman_increment_locked_tables(file->trn);
+      }
+
+      if (!thd->transaction.on)
+      {
+        /*
+          No need to log REDOs/UNDOs. If this is an internal temporary table
+          which will be renamed to a permanent table (like in ALTER TABLE),
+          the rename happens after unlocking so will be durable (and the table
+          will get its create_rename_lsn).
+          Note: if we wanted to enable users to have an old backup and apply
+          tons of archived logs to roll-forward, we could then not disable
+          REDOs/UNDOs in this case.
+        */
+        DBUG_PRINT("info", ("Disabling logging for table"));
+        _ma_tmp_disable_logging_for_table(file, TRUE);
+      }
+    }
+    else
+    {
+      /* THD_TRN refers to the 'thd' parameter */
+      TRN *trn= THD_TRN;
+      /* End of transaction */
+
+      /*
+        We always re-enable, don't rely on thd->transaction.on as it is
+        sometimes reset to true after unlocking (see mysql_truncate() for a
+        partitioned table based on Maria).
+        Note that we can come here without having an exclusive lock on the
+        table, for example in this case:
+        external_lock(F_(WR|RD)LCK); thr_lock() which fails due to lock
+        abortion; external_lock(F_UNLCK). Fortunately, the re-enabling happens
+        only if we were the thread which disabled logging.
+      */
+      if (_ma_reenable_logging_for_table(file, TRUE))
+        DBUG_RETURN(1);
+      /** @todo zero file->trn also in commit and rollback */
+      _ma_set_trn_for_table(file, NULL);        // Safety
+      /*
+        Ensure that file->state points to the current number of rows. This
+        is needed if someone calls maria_info() without first doing an
+        external lock of the table
+      */
+      file->state= &file->s->state.state;
+      if (trn)
+      {
+        DBUG_PRINT("info",
+                   ("locked_tables: %u", trnman_has_locked_tables(trn)));
+        /* Act only when this unlock releases the last locked table */
+        if (trnman_has_locked_tables(trn) &&
+            !trnman_decrement_locked_tables(trn))
+        {
+          /*
+            OK should not have been sent to client yet (ACID).
+            This is a bit excessive, ACID requires this only if there are some
+            changes to commit (rollback shouldn't be tested).
+          */
+          DBUG_ASSERT(!thd->main_da.is_sent ||
+                      thd->killed == THD::KILL_CONNECTION);
+          /* autocommit ? rollback a transaction */
+#ifdef MARIA_CANNOT_ROLLBACK
+          if (ma_commit(trn))
+            DBUG_RETURN(1);
+          THD_TRN= 0;
+#else
+          if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+          {
+            trnman_rollback_trn(trn);
+            DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+            THD_TRN= 0;
+          }
+#endif
+        }
+        trnman_set_flags(trn, trnman_get_flags(trn) & ~ TRN_STATE_INFO_LOGGED);
+      }
+    }
+  } /* if transactional table */
+  DBUG_RETURN(maria_lock_database(file, !table->s->tmp_table ?
+                                  lock_type : ((lock_type == F_UNLCK) ?
+                                               F_UNLCK : F_EXTRA_LCK)));
+}
+
+/**
+  Called at the start of a statement for a table that is already locked.
+
+  Nothing is done for tables that were not created transactional. For the
+  others the table is attached to the thread's transaction and a new
+  statement is started in it.
+
+  @param thd        Thread handle
+  @param lock_type  Lock type for the statement; must not be TL_UNLOCK
+
+  @return always 0
+*/
+
+int ha_maria::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+  TRN *trn;
+
+  if (!file->s->base.born_transactional)
+    return 0;
+
+  trn= THD_TRN;
+  DBUG_ASSERT(trn); // this may be called only after external_lock()
+  DBUG_ASSERT(trnman_has_locked_tables(trn));
+  DBUG_ASSERT(lock_type != TL_UNLOCK);
+  DBUG_ASSERT(file->trn == trn);
+
+  /*
+    If there was an implicit commit under this LOCK TABLES by a previous
+    statement (like a DDL), at least if that previous statement was about a
+    different ha_maria than 'this' then this->file->trn is a stale
+    pointer. We fix it:
+  */
+  _ma_set_trn_for_table(file, trn);
+
+  /*
+    As external_lock() was already called, don't increment locked_tables.
+    Note that we call the function below possibly several times when
+    statement starts (once per table). This is ok as long as that function
+    does cheap operations. Otherwise, we will need to do it only on first
+    call to start_stmt().
+  */
+  trnman_new_statement(trn);
+
+#ifdef EXTRA_DEBUG
+  if (!(trnman_get_flags(trn) & TRN_STATE_INFO_LOGGED) &&
+      trnman_get_flags(trn) & TRN_STATE_TABLES_CAN_CHANGE)
+  {
+    trnman_set_flags(trn, trnman_get_flags(trn) | TRN_STATE_INFO_LOGGED);
+    (void) translog_log_debug_info(trn, LOGREC_DEBUG_INFO_QUERY,
+                                   (uchar*) thd->query(),
+                                   thd->query_length());
+  }
+#endif
+  return 0;
+}
+
+
+/**
+  Performs an implicit commit of the Maria transaction and creates a new
+  one.
+
+  This can be considered a hack. When Maria loses HA_NO_TRANSACTIONS it will
+  be participant in the connection's transaction and so the implicit commits
+  (ha_commit()) (like in end_active_trans()) will do the implicit commit
+  without need to call this function which can then be removed.
+
+  @param  thd              THD object
+  @param  new_trn          if a new transaction should be created; a new
+                           transaction is not needed when we know that the
+                           tables will be unlocked very soon.
+
+  @return 0 on success (or when nothing had to be done), 1 if the commit
+          failed, HA_ERR_OUT_OF_MEM if the new transaction or a table's live
+          state could not be set up
+*/
+
+int ha_maria::implicit_commit(THD *thd, bool new_trn)
+{
+#ifndef MARIA_CANNOT_ROLLBACK
+#error this method should be removed
+#endif
+  TRN *trn;
+  int error= 0;
+  TABLE *table;
+  DBUG_ENTER("ha_maria::implicit_commit");
+  if (!new_trn && thd->locked_tables)
+  {
+    /*
+      "we are under LOCK TABLES" <=> "we shouldn't commit".
+      As thd->locked_tables is true, we are either under LOCK TABLES, or in
+      prelocking; prelocking can be under LOCK TABLES, or not (and in this
+      latter case only we should commit).
+      Note that we come here only at the end of the top statement
+      (dispatch_command()), we are never committing inside a sub-statement.
+    */
+    enum prelocked_mode_type prelocked_mode= thd->prelocked_mode;
+    if ((prelocked_mode == NON_PRELOCKED) ||
+        (prelocked_mode == PRELOCKED_UNDER_LOCK_TABLES))
+    {
+      DBUG_PRINT("info", ("locked_tables, skipping"));
+      DBUG_RETURN(0);
+    }
+  }
+  if ((trn= THD_TRN) != NULL)
+  {
+    /* Remember the counter: the committed trn must not be read afterwards */
+    uint locked_tables= trnman_has_locked_tables(trn);
+    if (unlikely(ma_commit(trn)))
+      error= 1;
+    if (!new_trn)
+    {
+      THD_TRN= NULL;
+      goto end;
+    }
+    /*
+      We need to create a new transaction and put it in THD_TRN. Indeed,
+      tables may be under LOCK TABLES, and so they will start the next
+      statement assuming they have a trn (see ha_maria::start_stmt()).
+    */
+    trn= trnman_new_trn(& thd->transaction.wt);
+    THD_TRN= trn;
+    if (unlikely(trn == NULL))
+    {
+      /*
+        Check for failure BEFORE handing trn to any trnman function: the
+        new transaction must never be used when it could not be created.
+      */
+      error= HA_ERR_OUT_OF_MEM;
+    }
+    else
+    {
+      /* This is just a commit, tables stay locked if they were: */
+      trnman_reset_locked_tables(trn, locked_tables);
+    }
+
+    /*
+      Move all locked tables to the new transaction
+      We must do it here as otherwise file->thd and file->state may be
+      stale pointers. We can't do this in start_stmt() as we don't know
+      when we should call _ma_setup_live_state() and in some cases, like
+      in check table, we use the table without calling start_stmt().
+
+      If no new transaction could be created, the tables' trn pointers are
+      still reset (to NULL) so that none of them keeps pointing at the
+      committed transaction.
+     */
+    for (table=thd->open_tables; table ; table=table->next)
+    {
+      if (table->db_stat && table->file->ht == maria_hton)
+      {
+        MARIA_HA *handler= ((ha_maria*) table->file)->file;
+        if (handler->s->base.born_transactional)
+        {
+          _ma_set_trn_for_table(handler, trn);
+          /*
+            If handler uses versioning. The live state is only set up when
+            there is a transaction to attach it to (presumably
+            _ma_setup_live_state() needs handler->trn to be valid).
+          */
+          if (trn && handler->s->lock_key_trees)
+          {
+            if (_ma_setup_live_state(handler))
+              error= HA_ERR_OUT_OF_MEM;
+          }
+        }
+      }
+    }
+  }
+end:
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Put this table's lock structure into the caller's lock array, deciding
+  which lock type to use.
+
+  The lock type is only changed when 'lock_type' is not TL_IGNORE and the
+  table is currently unlocked (file->lock.type == TL_UNLOCK):
+   - plain read locks are turned into TL_READ_NO_INSERT for non-SELECT
+     statements when statement-based binary logging is in effect;
+   - TL_WRITE_CONCURRENT_INSERT is turned into TL_WRITE when the table is
+     empty, for INSERT ... SELECT ... ON DUPLICATE KEY UPDATE, and for
+     LOAD DATA ... REPLACE.
+
+  @param thd        Thread handle
+  @param to         Where to store the pointer to this table's lock
+  @param lock_type  Requested lock type
+
+  @return 'to' advanced past the stored element
+*/
+
+THR_LOCK_DATA **ha_maria::store_lock(THD *thd,
+                                     THR_LOCK_DATA **to,
+                                     enum thr_lock_type lock_type)
+{
+  /* Test if we can fix test below */
+  DBUG_ASSERT(lock_type != TL_UNLOCK &&
+              (lock_type == TL_IGNORE || file->lock.type == TL_UNLOCK));
+  if (lock_type != TL_IGNORE && file->lock.type == TL_UNLOCK)
+  {
+    const enum enum_sql_command sql_command= thd->lex->sql_command;
+    /*
+      We have to disable concurrent inserts for INSERT ... SELECT or
+      INSERT/UPDATE/DELETE with sub queries if we are using statement based
+      logging.  We take the safe route here and disable this for all commands
+      that only do reading and are not SELECT.
+    */
+    if (lock_type <= TL_READ_HIGH_PRIORITY &&
+        !thd->current_stmt_binlog_row_based &&
+        (sql_command != SQLCOM_SELECT &&
+         sql_command != SQLCOM_LOCK_TABLES) &&
+        (thd->options & OPTION_BIN_LOG) &&
+        mysql_bin_log.is_open())
+      lock_type= TL_READ_NO_INSERT;
+    else if (lock_type == TL_WRITE_CONCURRENT_INSERT)
+    {
+      const enum enum_duplicates duplicates= thd->lex->duplicates;
+      /*
+        Explanation for the 3 conditions below, in order:
+
+        - Bulk insert may use repair, which will cause problems if other
+        threads try to read/insert to the table: disable versioning.
+        Note that our read of file->state->records is incorrect, as such
+        variable may have changed when we come to start_bulk_insert() (worse
+        case: we see != 0 so allow versioning, start_bulk_insert() sees 0 and
+        uses repair). This is prevented because start_bulk_insert() will not
+        try repair if we enabled versioning.
+        - INSERT SELECT ON DUPLICATE KEY UPDATE comes here with
+        TL_WRITE_CONCURRENT_INSERT but shouldn't because it can do
+        update/delete of a row and versioning doesn't support that
+        - same for LOAD DATA CONCURRENT REPLACE.
+      */
+      if ((file->state->records == 0) ||
+          (sql_command == SQLCOM_INSERT_SELECT && duplicates == DUP_UPDATE) ||
+          (sql_command == SQLCOM_LOAD && duplicates == DUP_REPLACE))
+        lock_type= TL_WRITE;
+    }
+    file->lock.type= lock_type;
+  }
+  /* The lock is always handed out, whether or not its type was changed */
+  *to++= &file->lock;
+  return to;
+}
+
+
+/**
+  Fill in the parts of 'create_info' that only the engine knows.
+
+  Refreshes the handler statistics, then sets:
+   - auto_increment_value, unless the user gave one;
+   - data_file_name and index_file_name;
+   - row_type (Maria may have changed it), unless the user gave a row
+     format or the row type is ROW_TYPE_DEFAULT;
+   - page_checksum, if still undefined, from the table's actual option.
+*/
+
+void ha_maria::update_create_info(HA_CREATE_INFO *create_info)
+{
+  ha_maria::info(HA_STATUS_AUTO | HA_STATUS_CONST);
+
+  if ((create_info->used_fields & HA_CREATE_USED_AUTO) == 0)
+    create_info->auto_increment_value= stats.auto_increment_value;
+
+  create_info->data_file_name= data_file_name;
+  create_info->index_file_name= index_file_name;
+
+  /* We need to restore the row type as Maria can change it */
+  if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) &&
+      create_info->row_type != ROW_TYPE_DEFAULT)
+    create_info->row_type= get_row_type();
+
+  /*
+    Show always page checksums, as this can be forced with
+    maria_page_checksums variable
+  */
+  if (create_info->page_checksum == HA_CHOICE_UNDEF)
+  {
+    if (file->s->options & HA_OPTION_PAGE_CHECKSUM)
+      create_info->page_checksum= HA_CHOICE_YES;
+    else
+      create_info->page_checksum= HA_CHOICE_NO;
+  }
+}
+
+
+/**
+  Map the table's on-disk data file type to the SQL-layer row type.
+
+  @return ROW_TYPE_FIXED, ROW_TYPE_DYNAMIC, ROW_TYPE_PAGE or
+          ROW_TYPE_COMPRESSED; ROW_TYPE_NOT_USED for any other file type
+*/
+
+enum row_type ha_maria::get_row_type() const
+{
+  enum row_type result;
+
+  switch (file->s->data_file_type) {
+  case STATIC_RECORD:
+    result= ROW_TYPE_FIXED;
+    break;
+  case DYNAMIC_RECORD:
+    result= ROW_TYPE_DYNAMIC;
+    break;
+  case BLOCK_RECORD:
+    result= ROW_TYPE_PAGE;
+    break;
+  case COMPRESSED_RECORD:
+    result= ROW_TYPE_COMPRESSED;
+    break;
+  default:
+    result= ROW_TYPE_NOT_USED;
+    break;
+  }
+  return result;
+}
+
+
+/*
+  Choose the data file type for a table from its create options.
+
+  TRANSACTIONAL=1 always gives BLOCK_RECORD. Otherwise ROW_TYPE_FIXED gives
+  STATIC_RECORD, ROW_TYPE_DYNAMIC gives DYNAMIC_RECORD and every other row
+  type gives BLOCK_RECORD.
+*/
+
+static enum data_file_type maria_row_type(HA_CREATE_INFO *info)
+{
+  if (info->transactional == HA_CHOICE_YES)
+    return BLOCK_RECORD;
+  if (info->row_type == ROW_TYPE_FIXED)
+    return STATIC_RECORD;
+  if (info->row_type == ROW_TYPE_DYNAMIC)
+    return DYNAMIC_RECORD;
+  return BLOCK_RECORD;
+}
+
+
+/**
+  Create a new Maria table from the SQL-layer table definition.
+
+  Chooses the row format with maria_row_type(), converts the table
+  definition with table2maria(), translates table and create options into
+  MARIA_CREATE_INFO fields and HA_CREATE_* flags, logs the query text as
+  debug info and calls maria_create().
+
+  @param name            Table name (run through fn_format() before use)
+  @param table_arg       Table definition
+  @param ha_create_info  Create options from the SQL layer
+
+  @return the error from table2maria() if it fails, else the result of
+          maria_create()
+*/
+
+int ha_maria::create(const char *name, register TABLE *table_arg,
+                     HA_CREATE_INFO *ha_create_info)
+{
+  int error;
+  uint create_flags= 0, record_count, i;
+  char buff[FN_REFLEN];
+  MARIA_KEYDEF *keydef;
+  MARIA_COLUMNDEF *recinfo;
+  MARIA_CREATE_INFO create_info;
+  TABLE_SHARE *share= table_arg->s;
+  uint options= share->db_options_in_use;
+  enum data_file_type row_type;
+  THD *thd= current_thd;
+  DBUG_ENTER("ha_maria::create");
+
+  /* One key using a parser is enough to set the flag */
+  for (i= 0; i < share->keys; i++)
+  {
+    if (table_arg->key_info[i].flags & HA_USES_PARSER)
+    {
+      create_flags|= HA_CREATE_RELIES_ON_SQL_LAYER;
+      break;
+    }
+  }
+  /* Note: BLOCK_RECORD is used if table is transactional */
+  row_type= maria_row_type(ha_create_info);
+  /* Tell the user when TRANSACTIONAL=1 overrides an explicit row format */
+  if (ha_create_info->transactional == HA_CHOICE_YES &&
+      ha_create_info->row_type != ROW_TYPE_PAGE &&
+      ha_create_info->row_type != ROW_TYPE_NOT_USED &&
+      ha_create_info->row_type != ROW_TYPE_DEFAULT)
+    push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+                 ER_ILLEGAL_HA_CREATE_OPTION,
+                 "Row format set to PAGE because of TRANSACTIONAL=1 option");
+
+  bzero((char*) &create_info, sizeof(create_info));
+  if ((error= table2maria(table_arg, row_type, &keydef, &recinfo,
+                          &record_count, &create_info)))
+    DBUG_RETURN(error); /* purecov: inspected */
+  create_info.max_rows= share->max_rows;
+  create_info.reloc_rows= share->min_rows;
+  create_info.with_auto_increment= share->next_number_key_offset == 0;
+  /* Stored value is one less than the requested one; 0 if none was given */
+  create_info.auto_increment= (ha_create_info->auto_increment_value ?
+                               ha_create_info->auto_increment_value -1 :
+                               (ulonglong) 0);
+  create_info.data_file_length= ((ulonglong) share->max_rows *
+                                 share->avg_row_length);
+  create_info.data_file_name= ha_create_info->data_file_name;
+  create_info.index_file_name= ha_create_info->index_file_name;
+  create_info.language= share->table_charset->number;
+
+  /*
+    Table is transactional:
+    - If the user specify that table is transactional (in this case
+      row type is forced to BLOCK_RECORD)
+    - If they specify BLOCK_RECORD without specifying transactional behaviour
+
+    Shouldn't this test be pushed down to maria_create()? Because currently,
+    ma_test1 -T crashes: it creates a table with DYNAMIC_RECORD but has
+    born_transactional==1, which confuses some recovery-related code.
+  */
+  create_info.transactional= (row_type == BLOCK_RECORD &&
+                              ha_create_info->transactional != HA_CHOICE_NO);
+
+  /* Translate create options and table options into HA_CREATE_* flags */
+  if (ha_create_info->options & HA_LEX_CREATE_TMP_TABLE)
+    create_flags|= HA_CREATE_TMP_TABLE;
+  if (ha_create_info->options & HA_CREATE_KEEP_FILES)
+    create_flags|= HA_CREATE_KEEP_FILES;
+  if (options & HA_OPTION_PACK_RECORD)
+    create_flags|= HA_PACK_RECORD;
+  if (options & HA_OPTION_CHECKSUM)
+    create_flags|= HA_CREATE_CHECKSUM;
+  if (options & HA_OPTION_DELAY_KEY_WRITE)
+    create_flags|= HA_CREATE_DELAY_KEY_WRITE;
+  /* Page checksums: explicit YES, or undefined with the global default on */
+  if ((ha_create_info->page_checksum == HA_CHOICE_UNDEF &&
+       maria_page_checksums) ||
+       ha_create_info->page_checksum ==  HA_CHOICE_YES)
+    create_flags|= HA_CREATE_PAGE_CHECKSUM;
+
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query(), thd->query_length());
+
+  /* TODO: Check that the following fn_format is really needed */
+  error=
+    maria_create(fn_format(buff, name, "", "",
+                           MY_UNPACK_FILENAME | MY_APPEND_EXT),
+                 row_type, share->keys, keydef,
+                 record_count,  recinfo,
+                 0, (MARIA_UNIQUEDEF *) 0,
+                 &create_info, create_flags);
+
+  /*
+    Only recinfo is freed here. NOTE(review): presumably table2maria()
+    allocates keydef in the same memory block - confirm.
+  */
+  my_free((uchar*) recinfo, MYF(0));
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Rename a table, after writing the current query text to the transaction
+  log as debug info.
+
+  @param from  Current table name
+  @param to    New table name
+
+  @return result of maria_rename()
+*/
+
+int ha_maria::rename_table(const char *from, const char *to)
+{
+  int res;
+  THD *thd= current_thd;
+
+  (void) translog_log_debug_info(0, LOGREC_DEBUG_INFO_QUERY,
+                                 (uchar*) thd->query(), thd->query_length());
+  res= maria_rename(from, to);
+  return res;
+}
+
+
+/**
+  Reserve auto-increment values.
+
+  When the auto-increment column starts its key (next_number_key_offset is
+  0) the value comes from the table statistics and an unlimited interval is
+  reserved. Otherwise the last row with the same key prefix is looked up
+  and its value plus one is returned, reserving a single value. The
+  'offset', 'increment' and 'nb_desired_values' arguments are not used.
+
+  @param[out] first_value         First reserved value
+  @param[out] nb_reserved_values  Number of reserved values
+*/
+
+void ha_maria::get_auto_increment(ulonglong offset, ulonglong increment,
+                                  ulonglong nb_desired_values,
+                                  ulonglong *first_value,
+                                  ulonglong *nb_reserved_values)
+{
+  uchar key[HA_MAX_KEY_LENGTH];
+  ulonglong next_value;
+  int error;
+  TABLE_SHARE *share= table->s;
+
+  if (share->next_number_key_offset == 0)
+  {
+    /* Autoincrement at key-start */
+    ha_maria::info(HA_STATUS_AUTO);
+    *first_value= stats.auto_increment_value;
+    /* Maria has only table-level lock for now, so reserves to +inf */
+    *nb_reserved_values= ULONGLONG_MAX;
+    return;
+  }
+
+  /* it's safe to call the following if bulk_insert isn't on */
+  maria_flush_bulk_insert(file, share->next_number_index);
+
+  (void) extra(HA_EXTRA_KEYREAD);
+  key_copy(key, table->record[0],
+           table->key_info + share->next_number_index,
+           share->next_number_key_offset);
+  error= maria_rkey(file, table->record[1], (int) share->next_number_index,
+                    key, make_prev_keypart_map(share->next_number_keypart),
+                    HA_READ_PREFIX_LAST);
+  if (!error)
+  {
+    /* Get data from record[1] */
+    next_value= ((ulonglong) table->next_number_field->
+                 val_int_offset(share->rec_buff_length) + 1);
+  }
+  else
+    next_value= 1;
+  extra(HA_EXTRA_NO_KEYREAD);
+
+  *first_value= next_value;
+  /*
+    MySQL needs to call us for next row: assume we are inserting ("a",null)
+    here, we return 3, and next this statement will want to insert ("b",null):
+    there is no reason why ("b",3+1) would be the good row to insert: maybe it
+    already exists, maybe 3+1 is too large...
+  */
+  *nb_reserved_values= 1;
+}
+
+
+/*
+  Estimate how many rows there are in the given key range
+
+  SYNOPSIS
+    records_in_range()
+    inx                 Index to use
+    min_key             Start of range.  Null pointer if from first key
+    max_key             End of range. Null pointer if to last key
+
+  NOTES
+    min_key.flag can have one of the following values:
+      HA_READ_KEY_EXACT         Include the key in the range
+      HA_READ_AFTER_KEY         Don't include key in range
+
+    max_key.flag can have one of the following values:
+      HA_READ_BEFORE_KEY        Don't include key in range
+      HA_READ_AFTER_KEY         Include all 'end_key' values in the range
+
+  RETURN
+   HA_POS_ERROR         Something is wrong with the index tree.
+   0                    There are no matching keys in the given range
+   number > 0           There are approximately 'number' matching rows in
+                        the range.
+*/
+
+ha_rows ha_maria::records_in_range(uint inx, key_range *min_key,
+                                   key_range *max_key)
+{
+  ha_rows estimate;
+
+  estimate= (ha_rows) maria_records_in_range(file, (int) inx, min_key,
+                                             max_key);
+  return estimate;
+}
+
+
+/**
+  Read the next row of a fulltext search.
+
+  @param buf  Buffer receiving the row
+
+  @return -1 if no fulltext search is active (ft_handler is not set), else
+          the result of the fulltext handler's read_next(); table->status
+          is STATUS_NOT_FOUND when that is non-zero, 0 otherwise
+*/
+
+int ha_maria::ft_read(uchar * buf)
+{
+  int res;
+
+  if (!ft_handler)
+  {
+    return -1;
+  }
+
+  thread_safe_increment(table->in_use->status_var.ha_read_next_count,
+                        &LOCK_status);  // why ?
+
+  res= ft_handler->please->read_next(ft_handler, (char*) buf);
+
+  if (res)
+    table->status= STATUS_NOT_FOUND;
+  else
+    table->status= 0;
+  return res;
+}
+
+
+/**
+  @return the checksum kept in the table's current state, cast to uint
+*/
+
+uint ha_maria::checksum() const
+{
+  uint table_checksum= (uint) file->state->checksum;
+  return table_checksum;
+}
+
+
+bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *create_info,
+                                          uint table_changes)
+{
+  DBUG_ENTER("check_if_incompatible_data");
+  /* Only these two table options are compared between old and new. */
+  const uint compared_options= (HA_OPTION_CHECKSUM |
+                                HA_OPTION_DELAY_KEY_WRITE);
+  const uint options_in_use= table->s->db_options_in_use;
+
+  /* Auto-increment start value or file locations differ. */
+  if (create_info->auto_increment_value != stats.auto_increment_value)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+  if (create_info->data_file_name != data_file_name)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+  if (create_info->index_file_name != index_file_name)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+  /* A different row format counts only if one was explicitly requested. */
+  if (maria_row_type(create_info) != data_file_type &&
+      create_info->row_type != ROW_TYPE_DEFAULT)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+  /* Column changes the engine cannot take over in place. */
+  if (table_changes == IS_EQUAL_NO)
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+  if (table_changes & IS_EQUAL_PACK_LENGTH) // Not implemented yet
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+  if ((options_in_use & compared_options) !=
+      (create_info->table_options & compared_options))
+    DBUG_RETURN(COMPATIBLE_DATA_NO);
+
+  DBUG_RETURN(COMPATIBLE_DATA_YES);
+}
+
+
+static int maria_hton_panic(handlerton *hton, ha_panic_function flag)
+{
+  int checkpoint_res= 0;
+
+  /* If no background checkpoints, we need to do one now */
+  if (checkpoint_interval == 0)
+    checkpoint_res= ma_checkpoint_execute(CHECKPOINT_FULL, FALSE);
+
+  /* maria_panic() always runs; the two results are OR-ed together. */
+  return checkpoint_res | maria_panic(flag);
+}
+
+
+static int maria_commit(handlerton *hton __attribute__ ((unused)),
+                        THD *thd, bool all)
+{
+  TRN *trn= THD_TRN;
+  DBUG_ENTER("maria_commit");
+
+  trnman_reset_locked_tables(trn, 0);
+  /* Clear the "state info logged" flag, keep every other flag as is. */
+  trnman_set_flags(trn, trnman_get_flags(trn) & ~TRN_STATE_INFO_LOGGED);
+
+  /*
+    Inside a multi-statement transaction a call with all == false only
+    marks the end of a statement: the transaction stays open.
+  */
+  const bool in_multi_stmt_trn=
+    (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) != 0;
+  if (in_multi_stmt_trn && !all)
+    DBUG_RETURN(0); // end of statement
+
+  /* End of transaction: detach it from the thread, then commit it. */
+  DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+  THD_TRN= 0;
+  DBUG_RETURN(ma_commit(trn)); // end of transaction
+}
+
+
+static int maria_rollback(handlerton *hton __attribute__ ((unused)),
+                          THD *thd, bool all)
+{
+  TRN *trn= THD_TRN;
+  DBUG_ENTER("maria_rollback");
+
+  trnman_reset_locked_tables(trn, 0);
+
+  /* statement or transaction ? */
+  const bool in_multi_stmt_trn=
+    (thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) != 0;
+  if (in_multi_stmt_trn && !all)
+  {
+    /* Only the current statement is undone; the transaction stays open. */
+    trnman_rollback_statement(trn);
+    DBUG_RETURN(0); // end of statement
+  }
+
+  /* End of transaction: detach it from the thread, then roll it back. */
+  DBUG_PRINT("info", ("THD_TRN set to 0x0"));
+  THD_TRN= 0;
+  DBUG_RETURN(trnman_rollback_trn(trn) ?
+              HA_ERR_OUT_OF_MEM : 0); // end of transaction
+}
+
+
+
+/**
+  @brief flush log handler
+
+  @param hton            maria handlerton (unused)
+
+  @retval FALSE OK
+  @retval TRUE  Error
+*/
+
+bool maria_flush_logs(handlerton *hton)
+{
+  /* A non-zero result from the purge is reported as an error (TRUE). */
+  if (translog_purge_at_flush())
+    return TRUE;
+  return FALSE;
+}
+
+
+#define SHOW_MSG_LEN (FN_REFLEN + 20)
+/**
+  @brief show status handler
+
+  @param hton            maria handlerton
+  @param thd             thread handler
+  @param print           print function
+  @param stat            type of status
+*/
+
+bool maria_show_status(handlerton *hton,
+                       THD *thd,
+                       stat_print_fn *print,
+                       enum ha_stat_type stat)
+{
+  const LEX_STRING *engine_name= hton_name(hton);
+
+  /* Only the log listing prints anything; other requests are no-ops. */
+  if (stat != HA_ENGINE_LOGS)
+    return 0;
+
+  const char unknown[]= "unknown";
+  const char needed[]= "in use";
+  const char unneeded[]= "free";
+  const char cant_stat[]= "can't stat";
+  char path[FN_REFLEN];
+
+  TRANSLOG_ADDRESS horizon= translog_get_horizon();
+  uint32 last_file= LSN_FILE_NO(horizon);
+  uint32 first_needed= translog_get_first_needed_file();
+  uint32 first_file= translog_get_first_file(horizon);
+
+  if (first_file == 0)
+  {
+    /* The first log file could not be determined: print one error row. */
+    const char error[]= "error";
+    print(thd, engine_name->str, engine_name->length,
+          STRING_WITH_LEN(""), error, sizeof(error) - 1);
+    return 0;
+  }
+
+  /* One output row per log file, from the first file up to the horizon. */
+  for (uint32 log_no= first_file; log_no <= last_file; log_no++)
+  {
+    MY_STAT stat_buff;
+    char object[SHOW_MSG_LEN];
+    const char *status;
+    uint length, status_len;
+    char *log_name= translog_filename_by_fileno(log_no, path);
+    MY_STAT *log_stat= my_stat(log_name, &stat_buff, MYF(0));
+
+    if (log_stat == NULL)
+    {
+      status= cant_stat;
+      status_len= sizeof(cant_stat) - 1;
+      length= my_snprintf(object, SHOW_MSG_LEN, "Size unknown ; %s",
+                          log_name);
+    }
+    else
+    {
+      /* Files below the first needed one are reported as free. */
+      if (first_needed == 0)
+      {
+        status= unknown;
+        status_len= sizeof(unknown) - 1;
+      }
+      else if (log_no < first_needed)
+      {
+        status= unneeded;
+        status_len= sizeof(unneeded) - 1;
+      }
+      else
+      {
+        status= needed;
+        status_len= sizeof(needed) - 1;
+      }
+      length= my_snprintf(object, SHOW_MSG_LEN, "Size %12lu ; %s",
+                          (ulong) log_stat->st_size, log_name);
+    }
+
+    print(thd, engine_name->str, engine_name->length,
+          object, length, status, status_len);
+  }
+  return 0;
+}
+
+
+/**
+  Callback to delete all logs in directory. This is lower-level than other
+  functions in ma_loghandler.c which delete logs, as it does not rely on
+  translog_init() having been called first.
+
+  @param  directory        directory where file is
+  @param  filename         base name of the file to delete
+*/
+
+static my_bool translog_callback_delete_all(const char *directory,
+                                            const char *filename)
+{
+  char full_path[FN_REFLEN];
+
+  /* Combine directory and file name into one path, then delete it. */
+  fn_format(full_path, filename, directory, "", MYF(MY_UNPACK_FILENAME));
+  const int delete_res= my_delete(full_path, MYF(MY_WME));
+  return delete_res;
+}
+
+
+/**
+  Helper function for option aria-force-start-after-recovery-failures.
+  Deletes logs if too many failures. Otherwise, increments the counter of
+  failures in the control file.
+  Notice how this has to be called _before_ translog_init() (if log is
+  corrupted, translog_init() might crash the server, so we need to remove logs
+  before).
+
+  @param  log_dir          directory where logs to be deleted are
+*/
+
+static int mark_recovery_start(const char* log_dir)
+{
+  int res;
+  DBUG_ENTER("mark_recovery_start");
+
+  if (unlikely(maria_recover_options == HA_RECOVER_NONE))
+  {
+    ma_message_no_user(ME_JUST_WARNING, "Please consider using option"
+                       " --aria-recover[=...] to automatically check and"
+                       " repair tables when logs are removed by option"
+                       " --aria-force-start-after-recovery-failures=#");
+  }
+
+  if (recovery_failures < force_start_after_recovery_failures)
+  {
+    /* Below the limit: store the incremented counter in the control file. */
+    res= ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+                                         max_trid_in_control_file,
+                                         recovery_failures + 1);
+    DBUG_RETURN(res);
+  }
+
+  /*
+    Limit reached. Remove logs which cause the problem; keep control file
+    which has critical info like uuid, max_trid (removing control file may
+    make correct tables look corrupted!).
+  */
+  char msg[100];
+  res= translog_walk_filenames(log_dir, &translog_callback_delete_all);
+  const char *outcome= res ? "failed to remove some" : "removed all";
+  my_snprintf(msg, sizeof(msg),
+              "%s logs after %u consecutive failures of"
+              " recovery from logs",
+              outcome,
+              recovery_failures);
+  /* A failed removal is reported with flags 0, a success as a warning. */
+  ma_message_no_user((res ? 0 : ME_JUST_WARNING), msg);
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Helper function for option aria-force-start-after-recovery-failures.
+  Records in the control file that recovery was a success, so that it's not
+  counted for aria-force-start-after-recovery-failures.
+*/
+
+static int mark_recovery_success(void)
+{
+  DBUG_ENTER("mark_recovery_success");
+  /* Rewrite the control file with the failure counter back at 0. */
+  const int write_res=
+    ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+                                    max_trid_in_control_file, 0);
+  DBUG_RETURN(write_res);
+}
+
+
+/*
+  Return 1 if table has changed during the current transaction
+*/
+
+bool ha_maria::is_changed() const
+{
+  /* The 'changed' flag of the handle's state, converted to bool. */
+  const bool table_changed= file->state->changed;
+  return table_changed;
+}
+
+
+/*
+  Plugin init function of the engine.
+
+  Fills in the handlerton received in 'p' (callbacks and flags) and then
+  runs the startup steps listed in the big || expression below.
+
+  Returns 0 on success. If any startup step fails, maria_hton is reset to 0
+  and HA_ERR_INITIALIZATION is returned.
+*/
+static int ha_maria_init(void *p)
+{
+  int res;
+  copy_variable_aliases();
+  const char *log_dir= maria_data_root;
+  maria_hton= (handlerton *)p;
+  /* Engine callbacks and properties published through the handlerton */
+  maria_hton->state= SHOW_OPTION_YES;
+  maria_hton->db_type= DB_TYPE_UNKNOWN;
+  maria_hton->create= maria_create_handler;
+  maria_hton->panic= maria_hton_panic;
+  maria_hton->commit= maria_commit;
+  maria_hton->rollback= maria_rollback;
+  maria_hton->flush_logs= maria_flush_logs;
+  maria_hton->show_status= maria_show_status;
+  /* TODO: decide if we support Maria being used for log tables */
+  maria_hton->flags= HTON_CAN_RECREATE | HTON_SUPPORT_LOG_TABLES;
+  bzero(maria_log_pagecache, sizeof(*maria_log_pagecache));
+  maria_tmpdir= &mysql_tmpdir_list;             /* For REDO */
+  /*
+    Startup steps, in this order. A true operand means failure (the result
+    of init_pagecache() is negated before it is tested), and || stops at the
+    first failing step, so later steps do not run after a failure.
+    mark_recovery_start() is called only when
+    force_start_after_recovery_failures is non-zero, and
+    mark_recovery_success() only when that option is non-zero or
+    maria_recovery_changed_data is set.
+  */
+  res= maria_upgrade() || maria_init() || ma_control_file_open(TRUE, TRUE) ||
+    ((force_start_after_recovery_failures != 0) &&
+     mark_recovery_start(log_dir)) ||
+    !init_pagecache(maria_pagecache,
+                    (size_t) pagecache_buffer_size, pagecache_division_limit,
+                    pagecache_age_threshold, maria_block_size, 0) ||
+    !init_pagecache(maria_log_pagecache,
+                    TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                    TRANSLOG_PAGE_SIZE, 0) ||
+    translog_init(maria_data_root, log_file_size,
+                  MYSQL_VERSION_ID, server_id, maria_log_pagecache,
+                  TRANSLOG_DEFAULT_FLAGS, 0) ||
+    maria_recovery_from_log() ||
+    ((force_start_after_recovery_failures != 0 ||
+      maria_recovery_changed_data) && mark_recovery_success()) ||
+    ma_checkpoint_init(checkpoint_interval);
+  /* These globals are set whether or not the steps above succeeded */
+  maria_multi_threaded= maria_in_ha_maria= TRUE;
+  maria_create_trn_hook= maria_create_trn_for_mysql;
+
+#if defined(HAVE_REALPATH) && !defined(HAVE_valgrind) && !defined(HAVE_BROKEN_REALPATH)
+  /*  We can only test for sub paths if my_symlink.c is using realpath */
+  maria_test_invalid_symlink= test_if_data_home_dir;
+#endif
+  /* On failure the global handlerton pointer is cleared again */
+  if (res)
+    maria_hton= 0;
+  return res ? HA_ERR_INITIALIZATION : 0;
+}
+
+
+#ifdef HAVE_QUERY_CACHE
+/**
+  @brief Register a named table with a call back function to the query cache.
+
+  @param thd The thread handle
+  @param table_key A pointer to the table name in the table cache
+  @param key_length The length of the table name
+  @param[out] engine_callback The pointer to the storage engine call back
+    function, currently 0
+  @param[out] engine_data Engine data will be set to 0.
+
+  @note Despite the name of this function, it is used to check each statement
+    before it is cached and not to register a table or callback function.
+
+  @see handler::register_query_cache_table
+
+  @return Whether the current statement may be cached. The engine_data and
+    engine_callback will be set to 0.
+    @retval TRUE  It is ok to try to cache the statement
+    @retval FALSE The statement must not be cached
+*/
+
+my_bool ha_maria::register_query_cache_table(THD *thd, char *table_name,
+                                             uint table_name_len,
+                                             qc_engine_callback
+                                             *engine_callback,
+                                             ulonglong *engine_data)
+{
+  ulonglong actual_data_file_length;
+  ulonglong current_data_file_length;
+  DBUG_ENTER("ha_maria::register_query_cache_table");
+
+  /*
+    No call back function is needed to determine if a cached statement
+    is valid or not.
+  */
+  *engine_callback= 0;
+
+  /*
+    No engine data is needed.
+  */
+  *engine_data= 0;
+
+  /*
+    Transactional table with versioning: the statement may be cached only
+    if our transaction id is not older than the last transaction that
+    changed the table.
+    Every exit must go through DBUG_RETURN so that the DBUG_ENTER above is
+    balanced in debug builds.
+  */
+  if (file->s->now_transactional && file->s->have_versioning)
+    DBUG_RETURN(file->trn->trid >= file->s->state.last_change_trn);
+
+  /*
+    If a concurrent INSERT has happened just before the currently processed
+    SELECT statement, the total size of the table is unknown.
+
+    To determine if the table size is known, the current thread's snapshot
+    of the table size is compared with the actual table size.
+
+    If the table size is unknown the SELECT statement can't be cached.
+
+    We may see a later value of the shared length (modified by some other
+    thread), but that is ok: we only want to know whether the value has
+    changed, the actual new value doesn't matter.
+  */
+  /* Length recorded in the share (file->s) */
+  actual_data_file_length= file->s->state.state.data_file_length;
+  /* Length in this handle's own state */
+  current_data_file_length= file->state->data_file_length;
+
+  /* Return whether is ok to try to cache current statement. */
+  DBUG_RETURN(!(file->s->non_transactional_concurrent_insert &&
+                current_data_file_length != actual_data_file_length));
+}
+#endif
+
+/*
+  NULL-terminated list of the engine's system variables. It is handed to
+  the server through the "system variables" slot of the plugin declaration
+  further down in this file.
+*/
+struct st_mysql_sys_var* system_variables[]= {
+  MYSQL_SYSVAR(block_size),
+  MYSQL_SYSVAR(checkpoint_interval),
+  MYSQL_SYSVAR(force_start_after_recovery_failures),
+  MYSQL_SYSVAR(group_commit),
+  MYSQL_SYSVAR(group_commit_interval),
+  MYSQL_SYSVAR(log_dir_path),
+  MYSQL_SYSVAR(log_file_size),
+  MYSQL_SYSVAR(log_purge_type),
+  MYSQL_SYSVAR(max_sort_file_size),
+  MYSQL_SYSVAR(page_checksum),
+  MYSQL_SYSVAR(pagecache_age_threshold),
+  MYSQL_SYSVAR(pagecache_buffer_size),
+  MYSQL_SYSVAR(pagecache_division_limit),
+  MYSQL_SYSVAR(recover),
+  MYSQL_SYSVAR(repair_threads),
+  MYSQL_SYSVAR(sort_buffer_size),
+  MYSQL_SYSVAR(stats_method),
+  MYSQL_SYSVAR(sync_log_dir),
+  MYSQL_SYSVAR(used_for_temp_tables),
+  NULL
+};
+
+
+/**
+   @brief Updates the checkpoint interval and restarts the background thread.
+*/
+
+static void update_checkpoint_interval(MYSQL_THD thd,
+                                        struct st_mysql_sys_var *var,
+                                        void *var_ptr, const void *save)
+{
+  const ulong new_interval= (ulong)(*(long *)save);
+
+  /* Shut checkpointing down, store the new value, then start it again. */
+  ma_checkpoint_end();
+  *(ulong *)var_ptr= new_interval;
+  ma_checkpoint_init(new_interval);
+}
+
+/**
+   @brief Updates group commit mode
+*/
+
+static void update_maria_group_commit(MYSQL_THD thd,
+                                      struct st_mysql_sys_var *var,
+                                      void *var_ptr, const void *save)
+{
+  const ulong old_mode= (ulong)*((long *)var_ptr);
+  const ulong new_mode= (ulong)(*(long *)save);
+  DBUG_ENTER("update_maria_group_commit");
+  DBUG_PRINT("enter", ("old value: %lu  new value %lu  rate %lu",
+                       old_mode, new_mode,
+                       maria_group_commit_interval));
+
+  /* Step 1: switch off whatever the old mode had enabled. */
+  if (old_mode == TRANSLOG_GCOMMIT_HARD)
+  {
+    translog_hard_group_commit(FALSE);
+  }
+  else if (old_mode == TRANSLOG_GCOMMIT_SOFT)
+  {
+    translog_soft_sync(FALSE);
+    if (maria_group_commit_interval)
+      translog_soft_sync_end();
+  }
+  else if (old_mode != TRANSLOG_GCOMMIT_NONE)
+  {
+    DBUG_ASSERT(0); /* impossible */
+  }
+
+  /* Step 2: store the new mode and sync the log between the two modes. */
+  *(ulong *)var_ptr= new_mode;
+  translog_sync();
+
+  /* Step 3: switch on what the new mode needs. */
+  if (new_mode == TRANSLOG_GCOMMIT_HARD)
+  {
+    translog_hard_group_commit(TRUE);
+  }
+  else if (new_mode == TRANSLOG_GCOMMIT_SOFT)
+  {
+    translog_soft_sync(TRUE);
+    /* variable change made under global lock so we can just read it */
+    if (maria_group_commit_interval)
+      translog_soft_sync_start();
+  }
+  else if (new_mode != TRANSLOG_GCOMMIT_NONE)
+  {
+    DBUG_ASSERT(0); /* impossible */
+  }
+  DBUG_VOID_RETURN;
+}
+
+/**
+   @brief Updates group commit interval
+*/
+
+static void update_maria_group_commit_interval(MYSQL_THD thd,
+                                               struct st_mysql_sys_var *var,
+                                               void *var_ptr, const void *save)
+{
+  ulong *interval_ptr= (ulong*) var_ptr;
+  const ulong requested= (ulong)*((long *)save);
+  DBUG_ENTER("update_maria_group_commit_interval");
+  DBUG_PRINT("enter", ("old value: %lu  new value %lu  group commit %lu",
+                        *interval_ptr, requested, maria_group_commit));
+
+  /* variable change made under global lock so we can just read it */
+  if (maria_group_commit == TRANSLOG_GCOMMIT_SOFT)
+  {
+    /*
+      Soft sync was started only for a non-zero interval: end it if it was
+      started, and start it again if the new interval is non-zero.
+    */
+    if (*interval_ptr)
+      translog_soft_sync_end();
+    translog_set_group_commit_interval(requested);
+    *interval_ptr= requested;
+    if (requested)
+      translog_soft_sync_start();
+  }
+  else if (maria_group_commit == TRANSLOG_GCOMMIT_NONE ||
+           maria_group_commit == TRANSLOG_GCOMMIT_HARD)
+  {
+    /* Both modes just store the new value and pass it to the log. */
+    *interval_ptr= requested;
+    translog_set_group_commit_interval(requested);
+  }
+  else
+  {
+    DBUG_ASSERT(0); /* impossible */
+  }
+  DBUG_VOID_RETURN;
+}
+
+/**
+   @brief Updates the transaction log file limit.
+*/
+
+static void update_log_file_size(MYSQL_THD thd,
+                                 struct st_mysql_sys_var *var,
+                                 void *var_ptr, const void *save)
+{
+  /* The requested value is narrowed to 32 bits before it is used. */
+  const ulong requested= (ulong)(*(long *)save);
+  const uint32 new_size= (uint32) requested;
+
+  translog_set_file_size(new_size);
+  *(ulong *)var_ptr= new_size;
+}
+
+
+/*
+  Status counters of the engine: page cache counters taken from
+  maria_pagecache_var plus the number of transaction log syncs.
+  The list ends with a NullS entry. It is referenced by the "Aria" array
+  entry in aria_status_variables.
+*/
+SHOW_VAR status_variables[]= {
+  {"pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG_NOFLUSH},
+  {"pagecache_blocks_unused",      (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG_NOFLUSH},
+  {"pagecache_blocks_used",        (char*) &maria_pagecache_var.blocks_used, SHOW_LONG_NOFLUSH},
+  {"pagecache_read_requests",      (char*) &maria_pagecache_var.global_cache_r_requests, SHOW_LONGLONG},
+  {"pagecache_reads",              (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG},
+  {"pagecache_write_requests",     (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG},
+  {"pagecache_writes",             (char*) &maria_pagecache_var.global_cache_write, SHOW_LONGLONG},
+  {"transaction_log_syncs",        (char*) &translog_syncs, SHOW_LONGLONG},
+  {NullS, NullS, SHOW_LONG}
+};
+
+/*
+  Top-level status variable list given to the plugin declaration: a single
+  SHOW_ARRAY entry named "Aria" that points at status_variables, followed
+  by the NullS terminator.
+*/
+static struct st_mysql_show_var aria_status_variables[]= {
+  {"Aria", (char*) &status_variables, SHOW_ARRAY},
+  {NullS, NullS, SHOW_LONG}
+};
+
+/****************************************************************************
+ * Maria MRR implementation: use DS-MRR
+ ***************************************************************************/
+
+int ha_maria::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                                    uint n_ranges, uint mode,
+                                    HANDLER_BUFFER *buf)
+{
+  /* All the work is done by the DS-MRR helper object. */
+  const int rc= ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode,
+                                  buf);
+  return rc;
+}
+
+int ha_maria::multi_range_read_next(char **range_info)
+{
+  /* Fetching the next row is left to the DS-MRR helper object. */
+  const int rc= ds_mrr.dsmrr_next(range_info);
+  return rc;
+}
+
+ha_rows ha_maria::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                              void *seq_init_param,
+                                              uint n_ranges, uint *bufsz,
+                                              uint *flags, COST_VECT *cost)
+{
+  /*
+    The helper is (re)bound to this handler and its table here, because
+    there is no earlier location where this->table would already be known.
+    TODO: consider moving it into some per-query initialization call.
+  */
+  ds_mrr.init(this, table);
+
+  const ha_rows rows= ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param,
+                                              n_ranges, bufsz, flags, cost);
+  return rows;
+}
+
+ha_rows ha_maria::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                        uint *bufsz, uint *flags,
+                                        COST_VECT *cost)
+{
+  /* Bind the helper to this handler and table before asking it. */
+  ds_mrr.init(this, table);
+
+  const ha_rows rows= ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags,
+                                        cost);
+  return rows;
+}
+
+/* Maria MRR implementation ends */
+
+
+/* Index condition pushdown implementation*/
+
+
+Item *ha_maria::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
+{
+  /* Remember the pushed condition and the index it belongs to. */
+  pushed_idx_cond= idx_cond_arg;
+  pushed_idx_cond_keyno= keyno_arg;
+  in_range_check_pushed_down= TRUE;
+
+  /*
+    The check function is installed in the Aria handle right away only
+    when the condition is for the index that is active now.
+  */
+  const bool for_active_index= (active_index == pushed_idx_cond_keyno);
+  if (for_active_index)
+    ma_set_index_cond_func(file, index_cond_func_maria, this);
+
+  /* This implementation always returns NULL. */
+  return NULL;
+}
+
+
+
+
+/*
+  Storage engine descriptor referenced by the plugin declaration below;
+  it carries only the handlerton interface version.
+*/
+struct st_mysql_storage_engine maria_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/*
+  Plugin declaration for the "Aria" storage engine: ha_maria_init is the
+  init function, there is no deinit function, and the status and system
+  variable lists defined above are attached.
+  NOTE(review): compat_aliases is listed ahead of the Aria descriptor and is
+  defined outside this part of the file - presumably descriptor(s) for
+  compatibility names; confirm at its definition.
+*/
+maria_declare_plugin(aria)
+compat_aliases,
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &maria_storage_engine,
+  "Aria",
+  "Monty Program Ab",
+  "Crash-safe tables with MyISAM heritage",
+  PLUGIN_LICENSE_GPL,
+  ha_maria_init,                /* Plugin Init      */
+  NULL,                         /* Plugin Deinit    */
+  0x0105,                       /* 1.5              */
+  aria_status_variables,        /* status variables */
+  system_variables,             /* system variables */
+  "1.5",                        /* string version   */
+  MariaDB_PLUGIN_MATURITY_GAMMA /* maturity         */
+}
+maria_declare_plugin_end;
diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h
new file mode 100644
index 00000000000..605ad1d3a20
--- /dev/null
+++ b/storage/maria/ha_maria.h
@@ -0,0 +1,197 @@
+/* Copyright (C) 2006,2004 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef HA_MARIA_INCLUDED
+#define HA_MARIA_INCLUDED
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface                               /* gcc class implementation */
+#endif
+
+/* class for the maria handler */
+
+#include <maria.h>
+
+#define HA_RECOVER_NONE         0       /* No automatic recover */
+#define HA_RECOVER_DEFAULT      1       /* Automatic recover active */
+#define HA_RECOVER_BACKUP       2       /* Make a backupfile on recover */
+#define HA_RECOVER_FORCE        4       /* Recover even if we loose rows */
+#define HA_RECOVER_QUICK        8       /* Don't check rows in data file */
+
+C_MODE_START
+ICP_RESULT index_cond_func_maria(void *arg);
+C_MODE_END
+
+extern ulong maria_sort_buffer_size;
+extern TYPELIB maria_recover_typelib;
+extern ulong maria_recover_options;
+
+/*
+  Handler implementation for the Aria engine (table_type() reports "Aria").
+  The object holds a MARIA_HA handle ('file'), reachable through
+  file_ptr(), and a DsMrr_impl helper for the multi range read methods.
+  Only the small inline methods are defined here; the rest is declared here
+  and defined elsewhere.
+*/
+class ha_maria :public handler
+{
+  /* Handle of the underlying Aria table; returned by file_ptr() */
+  MARIA_HA *file;
+  /* Value returned by table_flags() */
+  ulonglong int_table_flags;
+  MARIA_RECORD_POS remember_pos;
+  char *data_file_name, *index_file_name;
+  enum data_file_type data_file_type;
+  bool can_enable_indexes;
+  /**
+    If a transactional table is doing bulk insert with a single
+    UNDO_BULK_INSERT with/without repair. 
+  */
+  uint8 bulk_insert_single_undo;
+  /* Private overload taking HA_CHECK; the public one takes HA_CHECK_OPT */
+  int repair(THD * thd, HA_CHECK *param, bool optimize);
+  int zerofill(THD * thd, HA_CHECK_OPT *check_opt);
+
+public:
+  ha_maria(handlerton *hton, TABLE_SHARE * table_arg);
+  ~ha_maria() {}
+  handler *clone(MEM_ROOT *mem_root);
+  const char *table_type() const
+  { return "Aria"; }
+  const char *index_type(uint key_number);
+  const char **bas_ext() const;
+  ulonglong table_flags() const
+  { return int_table_flags; }
+  /*
+    Full-text keys report no index capabilities (0). Every other key
+    reports next/prev/range/ordered reads, key-only reads and index
+    condition pushdown. 'part' and 'all_parts' are not looked at.
+  */
+  ulong index_flags(uint inx, uint part, bool all_parts) const
+  {
+    return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ?
+            0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
+            HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
+  }
+  uint max_supported_keys() const
+  { return MARIA_MAX_KEY; }
+  uint max_supported_key_length() const;
+  /* A single key part may be as long as a whole key */
+  uint max_supported_key_part_length() const
+  { return max_supported_key_length(); }
+  enum row_type get_row_type() const;
+  uint checksum() const;
+  virtual double scan_time();
+
+  int open(const char *name, int mode, uint test_if_locked);
+  int close(void);
+  int write_row(uchar * buf);
+  int update_row(const uchar * old_data, uchar * new_data);
+  int delete_row(const uchar * buf);
+  int index_read_map(uchar * buf, const uchar * key, key_part_map keypart_map,
+		     enum ha_rkey_function find_flag);
+  int index_read_idx_map(uchar * buf, uint idx, const uchar * key,
+			 key_part_map keypart_map,
+			 enum ha_rkey_function find_flag);
+  int index_read_last_map(uchar * buf, const uchar * key,
+			  key_part_map keypart_map);
+  int index_next(uchar * buf);
+  int index_prev(uchar * buf);
+  int index_first(uchar * buf);
+  int index_last(uchar * buf);
+  int index_next_same(uchar * buf, const uchar * key, uint keylen);
+  /*
+    Returns 1 if no full-text handler is set; otherwise re-initializes the
+    search of the existing handler and returns 0.
+  */
+  int ft_init()
+  {
+    if (!ft_handler)
+      return 1;
+    ft_handler->please->reinit_search(ft_handler);
+    return 0;
+  }
+  /*
+    Starts a full-text search on index 'inx' through maria_ft_init_search(),
+    passing the bytes, length and character set of 'key' and record[0] of
+    the table.
+  */
+  FT_INFO *ft_init_ext(uint flags, uint inx, String * key)
+  {
+    return maria_ft_init_search(flags, file, inx,
+                                (uchar *) key->ptr(), key->length(),
+                                key->charset(), table->record[0]);
+  }
+  int ft_read(uchar * buf);
+  int index_init(uint idx, bool sorted);
+  int index_end();
+  int rnd_init(bool scan);
+  int rnd_end(void);
+  int rnd_next(uchar * buf);
+  int rnd_pos(uchar * buf, uchar * pos);
+  int remember_rnd_pos();
+  int restart_rnd_next(uchar * buf);
+  void position(const uchar * record);
+  int info(uint);
+  int info(uint, my_bool);
+  int extra(enum ha_extra_function operation);
+  int extra_opt(enum ha_extra_function operation, ulong cache_size);
+  int reset(void);
+  int external_lock(THD * thd, int lock_type);
+  int start_stmt(THD *thd, thr_lock_type lock_type);
+  int delete_all_rows(void);
+  int disable_indexes(uint mode);
+  int enable_indexes(uint mode);
+  int indexes_are_disabled(void);
+  void start_bulk_insert(ha_rows rows);
+  int end_bulk_insert();
+  ha_rows records_in_range(uint inx, key_range * min_key, key_range * max_key);
+  void update_create_info(HA_CREATE_INFO * create_info);
+  int create(const char *name, TABLE * form, HA_CREATE_INFO * create_info);
+  THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to,
+                             enum thr_lock_type lock_type);
+  virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+                                  ulonglong nb_desired_values,
+                                  ulonglong *first_value,
+                                  ulonglong *nb_reserved_values);
+  int rename_table(const char *from, const char *to);
+  int delete_table(const char *name);
+  void drop_table(const char *name);
+  int check(THD * thd, HA_CHECK_OPT * check_opt);
+  int analyze(THD * thd, HA_CHECK_OPT * check_opt);
+  int repair(THD * thd, HA_CHECK_OPT * check_opt);
+  bool check_and_repair(THD * thd);
+  bool is_crashed() const;
+  bool is_changed() const;
+  /* Automatic repair is on unless maria_recover_options is HA_RECOVER_NONE */
+  bool auto_repair() const { return maria_recover_options != HA_RECOVER_NONE; }
+  int optimize(THD * thd, HA_CHECK_OPT * check_opt);
+  int restore(THD * thd, HA_CHECK_OPT * check_opt);
+  int backup(THD * thd, HA_CHECK_OPT * check_opt);
+  int assign_to_keycache(THD * thd, HA_CHECK_OPT * check_opt);
+  int preload_keys(THD * thd, HA_CHECK_OPT * check_opt);
+  bool check_if_incompatible_data(HA_CREATE_INFO * info, uint table_changes);
+  /* Always answers TRUE */
+  bool check_if_supported_virtual_columns(void) { return TRUE;}
+#ifdef HAVE_REPLICATION
+  int dump(THD * thd, int fd);
+  int net_read_dump(NET * net);
+#endif
+#ifdef HAVE_QUERY_CACHE
+  my_bool register_query_cache_table(THD *thd, char *table_key,
+                                     uint key_length,
+                                     qc_engine_callback
+                                     *engine_callback,
+                                     ulonglong *engine_data);
+#endif
+  /* Accessor for the underlying MARIA_HA handle */
+  MARIA_HA *file_ptr(void)
+  {
+    return file;
+  }
+  static int implicit_commit(THD *thd, bool new_trn);
+  /**
+   * Multi Range Read interface
+   */
+  int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  int multi_range_read_next(char **range_info);
+  ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                      void *seq_init_param, 
+                                      uint n_ranges, uint *bufsz,
+                                      uint *flags, COST_VECT *cost);
+  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                uint *bufsz, uint *flags, COST_VECT *cost);
+  
+  /* Index condition pushdown implementation */
+  Item *idx_cond_push(uint keyno, Item* idx_cond);
+private:
+  /* Helper object the multi_range_read_* methods are built on */
+  DsMrr_impl ds_mrr;
+  /* NOTE(review): presumably a friend so it can reach private members
+     through its 'arg' pointer - confirm in the definition */
+  friend ICP_RESULT index_cond_func_maria(void *arg);
+};
+
+#endif /* HA_MARIA_INCLUDED */
diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c
new file mode 100644
index 00000000000..d6d4dcd44e6
--- /dev/null
+++ b/storage/maria/lockman.c
@@ -0,0 +1,786 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: TODO instant duration locks */
+/* QQ: #warning automatically place S instead of LS if possible */
+
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Generic Lock Manager
+
+  Lock manager handles locks on "resources", a resource must be uniquely
+  identified by a 64-bit number. Lock manager itself does not imply
+  anything about the nature of a resource - it can be a row, a table, a
+  database, or just anything.
+
+  Locks belong to "lock owners". A Lock owner is uniquely identified by a
+  16-bit number. A function loid2lo must be provided by the application
+  that takes such a number as an argument and returns a LOCK_OWNER
+  structure.
+
+  Lock levels are completely defined by three tables. Lock compatibility
+  matrix specifies which locks can be held at the same time on a resource.
+  Lock combining matrix specifies what lock level has the same behaviour as
+  a pair of two locks of given levels. getlock_result matrix simplifies
+  intention locking and lock escalation for an application, basically it
+  defines which locks are intention locks and which locks are "loose"
+  locks.  It is only used to provide better diagnostics for the
+  application, lock manager itself does not differentiate between normal,
+  intention, and loose locks.
+
+  Internally lock manager is based on a lock-free hash, see lf_hash.c for
+  details.  All locks are stored in a hash, with a resource id as a search
+  key, so all locks for the same resource will be considered collisions and
+  will be put in one (lock-free) linked list.  The main lock-handling
+  logic is in the inner loop that searches for a lock in such a linked
+  list - lockfind().
+
+  This works as follows. Locks generally are added to the end of the list
+  (with one exception, see below). When scanning the list it is always
+  possible to determine what locks are granted (active) and what locks are
+  waiting - first lock is obviously active, the second is active if it's
+  compatible with the first, and so on, a lock is active if it's compatible
+  with all previous locks and all locks before it are also active.
+  To calculate the "compatible with all previous locks" all locks are
+  accumulated in prev_lock variable using lock_combining_matrix.
+
+  Lock upgrades: when a thread that has a lock on a given resource,
+  requests a new lock on the same resource and the old lock is not enough
+  to satisfy new lock requirements (which is defined by
+  lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock is
+  placed in the list. Depending on other locks it is immediately active or
+  it will wait for other locks. Here's an exception to "locks are added
+  to the end" rule - upgraded locks are added after the last active lock
+  but before all waiting locks. Old lock (the one we upgraded from) is
+  not removed from the list, indeed it may be needed if the new lock was
+  in a savepoint that gets rolled back. So old lock is marked as "ignored"
+  (IGNORE_ME flag). New lock gets an UPGRADED flag.
+
+  Loose locks add an important exception to the above. Loose locks do not
+  always commute with other locks. In the list IX-LS both locks are active,
+  while in the LS-IX list only the first lock is active. This creates a
+  problem in lock upgrades. If the list was IX-LS and the owner of the
+  first lock wants to place LS lock (which can be immediately granted), the
+  IX lock is upgraded to LSIX and the list becomes IX-LS-LSIX, which,
+  according to the lock compatibility matrix means that the last lock is
+  waiting - of course it all happened because IX and LS were swapped and
+  they don't commute. To work around this there's ACTIVE flag which is set
+  in every lock that never waited (was placed active), and this flag
+  overrides "compatible with all previous locks" rule.
+
+  When a lock is placed to the end of the list it's either compatible with
+  all locks and all locks are active - new lock becomes active at once, or
+  it conflicts with some of the locks, in this case in the 'blocker'
+  variable a conflicting lock is returned and the calling thread waits on a
+  pthread condition in the LOCK_OWNER structure of the owner of the
+  conflicting lock. Or a new lock is compatible with all locks, but some
+  existing locks are not compatible with each other (example: request IS,
+  when the list is S-IX) - that is, not all locks are active. In this case a
+  first waiting lock is returned in the 'blocker' variable, lockman_getlock()
+  notices that a "blocker" does not conflict with the requested lock, and
+  "dereferences" it, to find the lock that it's waiting on.  The calling
+  thread then begins to wait on the same lock.
+
+  To better support table-row relations where one needs to lock the table
+  with an intention lock before locking the row, extended diagnostics is
+  provided.  When an intention lock (presumably on a table) is granted,
+  lockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+  perhaps the thread already has a normal lock on this table),
+  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+  whether it's possible to lock the row, but no need to lock it - perhaps
+  the thread has a loose lock on this table). This is defined by
+  getlock_result[] table.
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_bit.h>
+#include <lf.h>
+#include "lockman.h"
+
+/*
+  Lock compatibility matrix.
+
+  It's asymmetric. Read it as "Somebody has the lock <value in the row
+  label>, can I set the lock <value in the column label> ?"
+  Both indexes are values of enum lockman_lock_type.
+
+  Note on row S, column LS: though you can take LS lock while somebody
+  has S lock, it makes no sense - it's simpler to take S lock too.
+
+  1  - compatible
+  0  - incompatible
+  -1 - "impossible", so that we can assert the impossibility.
+       (only the N column holds -1)
+*/
+static int lock_compatibility_matrix[10][10]=
+{ /* N    S   X  IS  IX  SIX LS  LX  SLX LSIX          */
+  {  -1,  1,  1,  1,  1,  1,  1,  1,  1,  1 }, /* N    */
+  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* S    */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* X    */
+  {  -1,  1,  0,  1,  1,  1,  1,  1,  1,  1 }, /* IS   */
+  {  -1,  0,  0,  1,  1,  0,  1,  1,  0,  1 }, /* IX   */
+  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }, /* SIX  */
+  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* LS   */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* LX   */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* SLX  */
+  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }  /* LSIX */
+};
+
+/*
+  Lock combining matrix.
+
+  Read it as "what lock level L is identical to the set of two locks A
+  and B" (A is the row, B is the column).
+
+  It's symmetric, with the exception of the N row: there a loose lock in
+  the column is not returned unchanged ([N][LS] is S, [N][LX] is SLX,
+  [N][LSIX] is SIX), while the N column always returns the row's lock.
+  lockfind() starts accumulating from N, so it uses the N row for the
+  first lock it meets in a list.
+
+  One should never get N from it, we assert the impossibility
+  (the only cell that holds N is [N][N])
+*/
+static enum lockman_lock_type lock_combining_matrix[10][10]=
+{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
+  {    N,   S,  X,   IS,   IX, SIX,    S,  SLX, SLX,  SIX}, /* N    */
+  {    S,   S,  X,    S,  SIX, SIX,    S,  SLX, SLX,  SIX}, /* S    */
+  {    X,   X,  X,    X,    X,   X,    X,    X,   X,    X}, /* X    */
+  {   IS,   S,  X,   IS,   IX, SIX,   LS,   LX, SLX, LSIX}, /* IS   */
+  {   IX, SIX,  X,   IX,   IX, SIX, LSIX,   LX, SLX, LSIX}, /* IX   */
+  {  SIX, SIX,  X,  SIX,  SIX, SIX,  SIX,  SLX, SLX,  SIX}, /* SIX  */
+  {   LS,   S,  X,   LS, LSIX, SIX,   LS,   LX, SLX, LSIX}, /* LS   */
+  {   LX, SLX,  X,   LX,   LX, SLX,   LX,   LX, SLX,   LX}, /* LX   */
+  {  SLX, SLX,  X,  SLX,  SLX, SLX,  SLX,  SLX, SLX,  SLX}, /* SLX  */
+  { LSIX, SIX,  X, LSIX, LSIX, SIX, LSIX,   LX, SLX, LSIX}  /* LSIX */
+};
+
+#define REPEAT_ONCE_MORE                0 /* a CAS failed, search again      */
+#define OK_TO_PLACE_THE_LOCK            1 /* locks exist, none is in the way */
+#define OK_TO_PLACE_THE_REQUEST         2 /* must be queued and wait         */
+#define ALREADY_HAVE_THE_LOCK           4 /* owner's active lock covers it   */
+#define ALREADY_HAVE_THE_REQUEST        8 /* owner's waiting lock covers it  */
+#define PLACE_NEW_DISABLE_OLD          16 /* upgrade, no conflicts           */
+#define REQUEST_NEW_DISABLE_OLD        32 /* upgrade, must wait              */
+#define RESOURCE_WAS_UNLOCKED          64 /* no locks seen on the resource   */
+
+#define NEED_TO_WAIT (OK_TO_PLACE_THE_REQUEST | ALREADY_HAVE_THE_REQUEST |\
+                      REQUEST_NEW_DISABLE_OLD) /* results that imply waiting */
+#define ALREADY_HAVE (ALREADY_HAVE_THE_LOCK   | ALREADY_HAVE_THE_REQUEST)
+#define LOCK_UPGRADE (PLACE_NEW_DISABLE_OLD   | REQUEST_NEW_DISABLE_OLD)
+
+
+/*
+  the return codes for lockman_getlock
+
+  It's asymmetric. Read it as "I have the lock <value in the row label>,
+  what value should be returned for <value in the column label> ?"
+
+  lockman_getlock() reads it as getlock_result[old_lock][lock] when the
+  owner already had a sufficient lock, and as getlock_result[lock][lock]
+  when a new lock was placed.
+
+  0 means impossible combination (assert!)
+
+  Defines below help to preserve the table structure.
+  I/L/A values are self explanatory
+    I - GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+    L - GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+    A - GOT_THE_LOCK
+  x means the combination is possible (assert should not crash)
+    but it cannot happen in row locks, only in table locks (S,X),
+    or lock escalations (LS,LX)
+    (x expands to GOT_THE_LOCK, the same value as A)
+
+  NOTE(review): the diagonal cells for N, SIX, SLX and LSIX are 0, so a
+  newly placed lock of one of these levels makes lockman_getlock() return
+  0 - confirm that such requests are never issued directly.
+*/
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static enum lockman_getlock_result getlock_result[10][10]=
+{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
+  {    0,   0,  0,    0,    0,   0,    0,    0,   0,    0}, /* N    */
+  {    0,   x,  0,    A,    0,   0,    x,    0,   0,    0}, /* S    */
+  {    0,   x,  x,    A,    A,   0,    x,    x,   0,    0}, /* X    */
+  {    0,   0,  0,    I,    0,   0,    0,    0,   0,    0}, /* IS   */
+  {    0,   0,  0,    I,    I,   0,    0,    0,   0,    0}, /* IX   */
+  {    0,   x,  0,    A,    I,   0,    x,    0,   0,    0}, /* SIX  */
+  {    0,   0,  0,    L,    0,   0,    x,    0,   0,    0}, /* LS   */
+  {    0,   0,  0,    L,    L,   0,    x,    x,   0,    0}, /* LX   */
+  {    0,   x,  0,    A,    L,   0,    x,    x,   0,    0}, /* SLX  */
+  {    0,   0,  0,    L,    I,   0,    x,    0,   0,    0}  /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+LF_REQUIRE_PINS(4)
+
+typedef struct lockman_lock {
+  uint64 resource;              /* id of the locked resource (0 in dummies) */
+  struct lockman_lock  *lonext; /* next lock in the owner's all_locks list  */
+  intptr volatile link;         /* next node in the hash list; the lowest
+                                   bit marks this node as deleted           */
+  uint32 hashnr;                /* bit-reversed hash; lowest bit is 1 in
+                                   lock nodes and 0 in dummy bucket nodes   */
+  /* QQ: TODO - remove hashnr from LOCK */
+  uint16 loid;                  /* id of the lock owner (0 in dummies)      */
+  uchar lock;              /* sizeof(uchar) <= sizeof(enum);
+                              holds an enum lockman_lock_type value */
+  uchar flags;                  /* IGNORE_ME, UPGRADED, ACTIVE bits         */
+} LOCK;
+
+#define IGNORE_ME               1 /* superseded by an upgraded lock         */
+#define UPGRADED                2 /* placed as an upgrade of an older lock  */
+#define ACTIVE                  4 /* was granted at once, never waited      */
+
+typedef struct {                /* position in a lock list, see lockfind() */
+  intptr volatile *prev;        /* the link (or list head) pointing to curr */
+  LOCK *curr, *next;            /* node at the position and its successor   */
+  LOCK *blocker, *upgrade_from; /* lock in the way (or the owner's own lock
+                                   on ALREADY_HAVE_THE_LOCK), and the
+                                   owner's older lock on the resource       */
+} CURSOR;
+
+#define PTR(V)      ((LOCK *)((V) & (~(intptr)1))) /* link without the mark */
+#define DELETED(V)  ((V) & 1)          /* true if the link carries the mark */
+
+/*
+  Scans the lock list that starts at 'head' and decides how the lock
+  described by 'node' relates to the locks already present on
+  node->resource.
+
+  The list is ordered by (hashnr, resource); the scan stops at the end of
+  the list, at the first live node whose key is greater than node's or,
+  when upgrading, at the first lock that is not active. Nodes whose link
+  has the DELETED bit set are unlinked with a CAS and freed on the way; if
+  that CAS fails, or if *cursor->prev no longer points to the current
+  node, the scan restarts from 'head'.
+
+  RETURN
+    ALREADY_HAVE_THE_LOCK, ALREADY_HAVE_THE_REQUEST - a lock of the same
+      owner already covers the requested one (it is active / it waits)
+    PLACE_NEW_DISABLE_OLD, REQUEST_NEW_DISABLE_OLD - lock upgrade; no lock
+      of another owner conflicts with the requested one / some does
+    RESOURCE_WAS_UNLOCKED, OK_TO_PLACE_THE_LOCK - nothing is in the way;
+      no (non-ignored) lock was seen on the resource / some were
+    OK_TO_PLACE_THE_REQUEST - a conflicting or a waiting lock was found
+
+  NOTE
+    cursor is positioned in either case
+    pins[0..3] are used, they are NOT removed on return
+    pin 0 - cursor->next, pin 1 - cursor->curr, pin 2 - the node that
+    contains cursor->prev, pin 3 - cursor->blocker (pin 3 is removed when
+    ALREADY_HAVE_THE_LOCK is returned)
+*/
+static int lockfind(LOCK * volatile *head, LOCK *node,
+                    CURSOR *cursor, LF_PINS *pins)
+{
+  uint32        hashnr, cur_hashnr;
+  uint64        resource, cur_resource;
+  intptr        cur_link;
+  my_bool       cur_active, compatible, upgrading, prev_active;
+  enum lockman_lock_type lock, prev_lock, cur_lock;
+  uint16        loid, cur_loid;
+  int           cur_flags, flags;
+
+  hashnr= node->hashnr;
+  resource= node->resource;
+  lock= node->lock;
+  loid= node->loid;
+  flags= node->flags;
+
+retry:
+  cursor->prev= (intptr *)head;
+  prev_lock= N;
+  cur_active= TRUE;
+  compatible= TRUE;
+  upgrading= FALSE;
+  cursor->blocker= cursor->upgrade_from= 0;
+  _lf_unpin(pins, 3);
+  do {
+    cursor->curr= PTR(*cursor->prev);
+    _lf_pin(pins, 1, cursor->curr);
+  } while(*cursor->prev != (intptr)cursor->curr && LF_BACKOFF);
+  for (;;)
+  {
+    if (!cursor->curr)
+      break;
+    do {
+      cur_link= cursor->curr->link;
+      cursor->next= PTR(cur_link);
+      _lf_pin(pins, 0, cursor->next);
+    } while (cur_link != cursor->curr->link && LF_BACKOFF);
+    cur_hashnr= cursor->curr->hashnr;
+    cur_resource= cursor->curr->resource;
+    cur_lock= cursor->curr->lock;
+    cur_loid= cursor->curr->loid;
+    cur_flags= cursor->curr->flags;
+    if (*cursor->prev != (intptr)cursor->curr)
+    {
+      (void)LF_BACKOFF;
+      goto retry;
+    }
+    if (!DELETED(cur_link))
+    {
+      if (cur_hashnr > hashnr ||
+          (cur_hashnr == hashnr && cur_resource >= resource))
+      {
+        if (cur_hashnr > hashnr || cur_resource > resource)
+          break;
+        /* ok, we have a lock for this resource */
+        DBUG_ASSERT(lock_compatibility_matrix[prev_lock][cur_lock] >= 0);
+        DBUG_ASSERT(lock_compatibility_matrix[cur_lock][lock] >= 0);
+        if ((cur_flags & IGNORE_ME) && ! (flags & IGNORE_ME))
+        {
+          DBUG_ASSERT(cur_active);
+          if (cur_loid == loid)
+            cursor->upgrade_from= cursor->curr;
+        }
+        else
+        {
+          prev_active= cur_active;
+          if (cur_flags & ACTIVE)
+            DBUG_ASSERT(prev_active == TRUE);
+          else
+            cur_active&= lock_compatibility_matrix[prev_lock][cur_lock];
+          if (upgrading && !cur_active /*&& !(cur_flags & UPGRADED)*/)
+            break;
+          if (prev_active && !cur_active)
+          {
+            cursor->blocker= cursor->curr;
+            _lf_pin(pins, 3, cursor->curr);
+          }
+          if (cur_loid == loid)
+          {
+            /* we already have a lock on this resource */
+            DBUG_ASSERT(lock_combining_matrix[cur_lock][lock] != N);
+            DBUG_ASSERT(!upgrading || (flags & IGNORE_ME));
+            if (lock_combining_matrix[cur_lock][lock] == cur_lock)
+            {
+              /* new lock is compatible */
+              if (cur_active)
+              {
+                cursor->blocker= cursor->curr;  /* loose-locks! */
+                _lf_unpin(pins, 3);             /* loose-locks! */
+                return ALREADY_HAVE_THE_LOCK;
+              }
+              else
+                return ALREADY_HAVE_THE_REQUEST;
+            }
+            /* not compatible, upgrading (remember the lock we upgrade from) */
+            upgrading= TRUE;
+            cursor->upgrade_from= cursor->curr;
+          }
+          else
+          {
+            if (!lock_compatibility_matrix[cur_lock][lock])
+            {
+              compatible= FALSE;
+              cursor->blocker= cursor->curr;
+              _lf_pin(pins, 3, cursor->curr);
+            }
+          }
+          prev_lock= lock_combining_matrix[prev_lock][cur_lock];
+          DBUG_ASSERT(prev_lock != N);
+        }
+      }
+      cursor->prev= &(cursor->curr->link);
+      _lf_pin(pins, 2, cursor->curr);
+    }
+    else
+    {
+      if (my_atomic_casptr((void **)cursor->prev,
+                           (void **)(char*) &cursor->curr, cursor->next))
+        _lf_alloc_free(pins, cursor->curr);
+      else
+      {
+        (void)LF_BACKOFF;
+        goto retry;
+      }
+    }
+    cursor->curr= cursor->next;
+    _lf_pin(pins, 1, cursor->curr);
+  }
+  /*
+    either the end of lock list - no more locks for this resource,
+    or upgrading and the end of active lock list
+  */
+  if (upgrading)
+  {
+    if (compatible /*&& prev_active*/)
+      return PLACE_NEW_DISABLE_OLD;
+    else
+      return REQUEST_NEW_DISABLE_OLD;
+  }
+  if (cur_active && compatible)
+  {
+    /*
+      either no locks for this resource or all are compatible.
+      ok to place the lock in any case.
+    */
+    return prev_lock == N ? RESOURCE_WAS_UNLOCKED
+                          : OK_TO_PLACE_THE_LOCK;
+  }
+  /* we have a lock conflict. ok to place a lock request. And wait */
+  return OK_TO_PLACE_THE_REQUEST;
+}
+
+/*
+  Links 'node' into the list at the position found by lockfind(), unless
+  lockfind() reports that the owner already has a sufficient lock
+  (ALREADY_HAVE), in which case nothing is inserted.
+
+  For an upgrade the node gets the UPGRADED flag and the combined lock
+  level, and after a successful insert the old lock is marked IGNORE_ME.
+  A lock that does not need to wait gets the ACTIVE flag. If the CAS that
+  links the node in fails, ACTIVE is cleared and the search is repeated.
+
+  RETURN
+    the lockfind() result of the last attempt;
+    *blocker is set to cursor.blocker
+
+  NOTE
+    it uses pins[0..3], on return pins 0..2 are removed, pin 3 (blocker) stays
+*/
+static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
+                      LOCK **blocker)
+{
+  CURSOR         cursor;
+  int            res;
+
+  do
+  {
+    res= lockfind(head, node, &cursor, pins);
+    DBUG_ASSERT(res != ALREADY_HAVE_THE_REQUEST);
+    if (!(res & ALREADY_HAVE))
+    {
+      if (res & LOCK_UPGRADE)
+      {
+        node->flags|= UPGRADED;
+        node->lock= lock_combining_matrix[cursor.upgrade_from->lock][node->lock];
+      }
+      if (!(res & NEED_TO_WAIT))
+        node->flags|= ACTIVE;
+      node->link= (intptr)cursor.curr;
+      DBUG_ASSERT(node->link != (intptr)node);
+      DBUG_ASSERT(cursor.prev != &node->link);
+      if (!my_atomic_casptr((void **)cursor.prev,
+                            (void **)(char*) &cursor.curr, node))
+      {
+        res= REPEAT_ONCE_MORE;
+        node->flags&= ~ACTIVE;
+      }
+      if (res & LOCK_UPGRADE)
+        cursor.upgrade_from->flags|= IGNORE_ME;
+      /*
+        QQ: is this OK ? if a reader has already read upgrade_from,
+        it may find it conflicting with node :(
+        - see the last test from test_lockman_simple()
+      */
+    }
+
+  } while (res == REPEAT_ONCE_MORE);
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  _lf_unpin(pins, 2);
+  /*
+    note that blocker is not necessarily pinned here (when it's == curr).
+    this is ok as in such a case it's either a dummy node for
+    initialize_bucket() and dummy nodes don't need pinning,
+    or it's a lock of the same transaction for lockman_getlock,
+    and it cannot be removed by another thread
+  */
+  *blocker= cursor.blocker;
+  return res;
+}
+
+/*
+  Runs lockfind() for 'node' without changing the list and reports what
+  it found.
+
+  RETURN
+    the lockfind() result; if 'blocker' is not NULL, *blocker receives
+    the blocker that lockfind() left in its cursor
+
+  NOTE
+    pins[0..3] are used; pins 0..2 are released before returning,
+    pin 3 (the blocker) is left as lockfind() set it
+*/
+static int lockpeek(LOCK * volatile *head, LOCK *node, LF_PINS *pins,
+                    LOCK **blocker)
+{
+  CURSOR pos;
+  int    found= lockfind(head, node, &pos, pins);
+
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  _lf_unpin(pins, 2);
+
+  if (blocker != NULL)
+    *blocker= pos.blocker;
+
+  return found;
+}
+
+/*
+  Removes 'node' from the list. The node found by lockfind() is first
+  marked as deleted by setting the lowest bit of its link with a CAS, and
+  then unlinked from its predecessor with a second CAS. If unlinking
+  fails, lockfind() is called once more (it unlinks the deleted nodes it
+  meets). If marking fails, everything is repeated.
+
+  If lockfind() reported an older lock of the same owner
+  (cursor.upgrade_from), its IGNORE_ME flag is cleared; the flag is set
+  back when the attempt has to be repeated.
+
+  RETURN
+    the lockfind() result of the last attempt (asserted to be one of
+    ALREADY_HAVE_THE_LOCK, ALREADY_HAVE_THE_REQUEST)
+
+  NOTE
+    it uses pins[0..3], on return all pins are removed.
+
+    One _must_ have the lock (or request) to call this
+*/
+static int lockdelete(LOCK * volatile *head, LOCK *node, LF_PINS *pins)
+{
+  CURSOR cursor;
+  int res;
+
+  do
+  {
+    res= lockfind(head, node, &cursor, pins);
+    DBUG_ASSERT(res & ALREADY_HAVE);
+
+    if (cursor.upgrade_from)
+      cursor.upgrade_from->flags&= ~IGNORE_ME;
+
+    /*
+      XXX this does not work with savepoints, as old lock is left ignored.
+      It cannot be unignored, as would basically mean moving the lock back
+      in the lock chain (from upgraded). And the latter is not allowed -
+      because it breaks list scanning. So old ignored lock must be deleted,
+      new - same - lock must be installed right after the lock we're deleting,
+      then we can delete. Good news is - this is only required when rolling
+      back a savepoint.
+    */
+    if (my_atomic_casptr((void **)(char*)&(cursor.curr->link),
+                         (void **)(char*)&cursor.next, 1+(char *)cursor.next))
+    {
+      if (my_atomic_casptr((void **)cursor.prev,
+                           (void **)(char*)&cursor.curr, cursor.next))
+        _lf_alloc_free(pins, cursor.curr);
+      else
+        lockfind(head, node, &cursor, pins);
+    }
+    else
+    {
+      res= REPEAT_ONCE_MORE;
+      if (cursor.upgrade_from)
+        cursor.upgrade_from->flags|= IGNORE_ME;
+    }
+  } while (res == REPEAT_ONCE_MORE);
+  _lf_unpin(pins, 0);
+  _lf_unpin(pins, 1);
+  _lf_unpin(pins, 2);
+  _lf_unpin(pins, 3);
+  return res;
+}
+
+/*
+  Prepares a lock manager for use: an empty hash of size 1, an allocator
+  for LOCK nodes, the loid -> LOCK_OWNER callback and the lock wait
+  timeout.
+*/
+void lockman_init(LOCKMAN *lm, loid_to_lo_func *func, uint timeout)
+{
+  /* plain settings first */
+  lm->loid_to_lo= func;
+  lm->lock_timeout= timeout;
+
+  /* the hash starts with one bucket and no elements */
+  lm->count= 0;
+  lm->size= 1;
+  lf_dynarray_init(&lm->array, sizeof(LOCK **));
+
+  /* LOCK nodes come from a lock-free allocator */
+  lf_alloc_init(&lm->alloc, sizeof(LOCK), offsetof(LOCK, lonext));
+}
+
+/*
+  Frees every node that is still linked into the hash list, then the
+  allocator and the bucket array.
+
+  Lock nodes (lowest bit of hashnr is 1) were allocated by the lock-free
+  allocator, dummy bucket nodes (lowest bit is 0) by my_malloc(), so they
+  are released differently.
+
+  NOTE
+    must not be called while other threads may still use 'lm'
+*/
+void lockman_destroy(LOCKMAN *lm)
+{
+  /* the lookup may fail to allocate the array level and return NULL */
+  LOCK **head= (LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+  LOCK *el= head ? *head : NULL;
+  while (el)
+  {
+    intptr next= el->link;
+    if (el->hashnr & 1)
+      lf_alloc_direct_free(&lm->alloc, el);
+    else
+      my_free((void *)el, MYF(0));
+    /* a link may carry the "deleted" mark in its lowest bit, strip it */
+    el= PTR(next);
+  }
+  lf_alloc_destroy(&lm->alloc);
+  lf_dynarray_destroy(&lm->array);
+}
+
+/* TODO: optimize it */
+#define MAX_LOAD 1
+
+static void initialize_bucket(LOCKMAN *lm, LOCK * volatile *node,
+                              uint bucket, LF_PINS *pins)
+{ /*
+    Makes sure that the hash bucket 'bucket' has a dummy head node and
+    stores its address in *node (with a CAS that only succeeds if *node
+    is still NULL).
+
+    The parent bucket ('bucket' with its highest bit cleared) is
+    initialized first, recursively, if it is not initialized yet. The
+    dummy is then inserted with lockinsert(), searching from the parent's
+    slot. If lockinsert() finds an equal dummy already in the list
+    (ALREADY_HAVE_THE_LOCK), the new one is freed and the existing one
+    is used.
+
+    NOTE(review): the results of my_malloc() and _lf_dynarray_lvalue()
+    are used without a NULL check.
+  */
+  int res;
+  uint parent= my_clear_highest_bit(bucket);
+  LOCK *dummy= (LOCK *)my_malloc(sizeof(LOCK), MYF(MY_WME));
+  LOCK **tmp= 0, *cur;
+  LOCK * volatile *el= _lf_dynarray_lvalue(&lm->array, parent);
+
+  if (*el == NULL && bucket)
+    initialize_bucket(lm, el, parent, pins);
+  dummy->hashnr= my_reverse_bits(bucket);
+  dummy->loid= 0;
+  dummy->lock= X; /* doesn't matter, in fact */
+  dummy->resource= 0;
+  dummy->flags= 0;
+  res= lockinsert(el, dummy, pins, &cur);
+  DBUG_ASSERT(res & (ALREADY_HAVE_THE_LOCK | RESOURCE_WAS_UNLOCKED));
+  if (res & ALREADY_HAVE_THE_LOCK)
+  {
+    my_free((void *)dummy, MYF(0));
+    dummy= cur;
+  }
+  my_atomic_casptr((void **)node, (void **)(char*) &tmp, dummy);
+}
+
+/*
+  Hashes the bytes of a resource id, in the order they are stored in
+  memory, into a non-negative 31-bit value.
+*/
+static inline uint calc_hash(uint64 resource)
+{
+  const uchar *byte= (uchar *)&resource;
+  const uchar *end= byte + sizeof(resource);
+  ulong hash= 1, step= 4;
+
+  while (byte < end)
+  {
+    ulong mixed= (ulong) ((((uint) hash & 63)+step) * ((uint)*byte));
+    hash^= mixed + (hash << 8);
+    step+= 3;
+    byte++;
+  }
+  return hash & INT_MAX32;
+}
+
+/*
+  Acquires a lock of the level 'lock' on 'resource' for the owner 'lo',
+  waiting for conflicting locks if necessary.
+
+  The lock node is inserted into the hash with lockinsert(). While the
+  result says that the lock has to wait, the function waits on the
+  condition of the owner of the blocking lock (at most lock_timeout) and
+  then re-examines the list with lockpeek().
+
+  RETURN
+    see enum lockman_getlock_result
+    NO_MEMORY_FOR_LOCK - a lock node could not be allocated, nothing
+                         was changed
+    DIDNT_GET_THE_LOCK - the wait timed out; the lock request is left in
+                         the list, see lockman_release_locks()
+  NOTE
+    uses pins[0..3], they're removed on return
+*/
+enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
+                                            uint64 resource,
+                                            enum lockman_lock_type lock)
+{
+  int res;
+  uint csize, bucket, hashnr;
+  LOCK *node, * volatile *el, *blocker;
+  LF_PINS *pins= lo->pins;
+  enum lockman_lock_type old_lock;
+
+  DBUG_ASSERT(lo->loid);
+  lf_rwlock_by_pins(pins);
+  node= (LOCK *)_lf_alloc_new(pins);
+  if (!node)
+  {
+    /* nothing was inserted yet, so there is nothing to undo */
+    lf_rwunlock_by_pins(pins);
+    return NO_MEMORY_FOR_LOCK;
+  }
+  node->flags= 0;
+  node->lock= lock;
+  node->loid= lo->loid;
+  node->resource= resource;
+  hashnr= calc_hash(resource);
+  bucket= hashnr % lm->size;
+  el= _lf_dynarray_lvalue(&lm->array, bucket);
+  if (*el == NULL)
+    initialize_bucket(lm, el, bucket, pins);
+  node->hashnr= my_reverse_bits(hashnr) | 1;
+  res= lockinsert(el, node, pins, &blocker);
+  if (res & ALREADY_HAVE)
+  {
+    /* blocker is the owner's own lock here, the new node is not needed */
+    int r;
+    old_lock= blocker->lock;
+    _lf_alloc_free(pins, node);
+    lf_rwunlock_by_pins(pins);
+    r= getlock_result[old_lock][lock];
+    DBUG_ASSERT(r);
+    return r;
+  }
+  /* a new value was added to the hash */
+  csize= lm->size;
+  if ((my_atomic_add32(&lm->count, 1)+1.0) / csize > MAX_LOAD)
+    my_atomic_cas32(&lm->size, (int*) &csize, csize*2);
+  node->lonext= lo->all_locks;
+  lo->all_locks= node;
+  for ( ; res & NEED_TO_WAIT; res= lockpeek(el, node, pins, &blocker))
+  {
+    LOCK_OWNER *wait_for_lo;
+    ulonglong deadline;
+    struct timespec timeout;
+
+    _lf_assert_pin(pins, 3); /* blocker must be pinned here */
+    wait_for_lo= lm->loid_to_lo(blocker->loid);
+
+    /*
+      now, this is tricky. blocker is not necessarily a LOCK
+      we're waiting for. If it's compatible with what we want,
+      then we're waiting for a lock that blocker is waiting for
+      (see two places where blocker is set in lockfind)
+      In the latter case, let's "dereference" it
+    */
+    if (lock_compatibility_matrix[blocker->lock][lock])
+    {
+      blocker= wait_for_lo->all_locks;
+      _lf_pin(pins, 3, blocker);
+      if (blocker != wait_for_lo->all_locks)
+        continue;
+      wait_for_lo= wait_for_lo->waiting_for;
+    }
+
+    /*
+      note that the blocker transaction may have ended by now,
+      its LOCK_OWNER and short id were reused, so 'wait_for_lo' may point
+      to an unrelated - albeit valid - LOCK_OWNER
+    */
+    if (!wait_for_lo)
+      continue;
+
+    lo->waiting_for= wait_for_lo;
+    lf_rwunlock_by_pins(pins);
+
+    /*
+      We lock a mutex - it may belong to a wrong LOCK_OWNER, but it must
+      belong to _some_ LOCK_OWNER. It means, we can never free() a LOCK_OWNER,
+      if there're other active LOCK_OWNERs.
+    */
+    /* QQ: race condition here */
+    pthread_mutex_lock(wait_for_lo->mutex);
+    if (DELETED(blocker->link))
+    {
+      /*
+        blocker transaction was ended, or a savepoint that owned
+        the lock was rolled back. Either way - the lock was removed
+      */
+      pthread_mutex_unlock(wait_for_lo->mutex);
+      lf_rwlock_by_pins(pins);
+      continue;
+    }
+
+    /*
+      yuck. waiting
+      The timeout is converted in 64-bit arithmetic: lock_timeout is a
+      32-bit uint and lock_timeout * 1000000 would wrap around for
+      timeouts above ~4.29 seconds.
+    */
+    deadline= my_getsystime() + (ulonglong) lm->lock_timeout * 10000;
+    set_timespec_nsec(timeout, ((ulonglong) lm->lock_timeout * 1000000));
+    do
+    {
+      pthread_cond_timedwait(wait_for_lo->cond, wait_for_lo->mutex, &timeout);
+    } while (!DELETED(blocker->link) && my_getsystime() < deadline);
+    pthread_mutex_unlock(wait_for_lo->mutex);
+    lf_rwlock_by_pins(pins);
+    if (!DELETED(blocker->link))
+    {
+      /*
+        timeout.
+        note that we _don't_ release the lock request here.
+        Instead we're relying on the caller to abort the transaction,
+        and release all locks at once - see lockman_release_locks()
+      */
+      _lf_unpin(pins, 3);
+      lf_rwunlock_by_pins(pins);
+      return DIDNT_GET_THE_LOCK;
+    }
+  }
+  lo->waiting_for= 0;
+  _lf_assert_unpin(pins, 3); /* unpin should not be needed */
+  lf_rwunlock_by_pins(pins);
+  return getlock_result[lock][lock];
+}
+
+/*
+  Releases every lock and lock request of the owner 'lo' and wakes up
+  all threads that wait on the owner's condition.
+
+  RETURN
+    0 - always
+  NOTE
+    see lockdelete() for pin usage notes
+    the owner's mutex is held for the whole operation
+*/
+int lockman_release_locks(LOCKMAN *lm, LOCK_OWNER *lo)
+{
+  LF_PINS *pins= lo->pins;
+  LOCK *cur, *following;
+  LOCK * volatile *slot;
+
+  pthread_mutex_lock(lo->mutex);
+  lf_rwlock_by_pins(pins);
+  cur= lo->all_locks;
+  while (cur)
+  {
+    uint bucket;
+    /* remember the successor first: lockdelete() gives the node away */
+    following= cur->lonext;
+    bucket= calc_hash(cur->resource) % lm->size;
+    slot= _lf_dynarray_lvalue(&lm->array, bucket);
+    if (*slot == NULL)
+      initialize_bucket(lm, slot, bucket, pins);
+    lockdelete(slot, cur, pins);
+    my_atomic_add32(&lm->count, -1);
+    cur= following;
+  }
+  lf_rwunlock_by_pins(pins);
+  lo->all_locks= 0;
+  /* everybody who waited for one of these locks may try again now */
+  pthread_cond_broadcast(lo->cond);
+  pthread_mutex_unlock(lo->mutex);
+  return 0;
+}
+
+#ifdef MY_LF_EXTRA_DEBUG
+/* names of lock levels, indexed by enum lockman_lock_type */
+static const char *lock2str[]=
+{ "N", "S", "X", "IS", "IX", "SIX", "LS", "LX", "SLX", "LSIX" };
+/*
+  Prints the size and the element count of the hash, and then one line
+  for every lock node in the list that starts at bucket 0. Dummy bucket
+  nodes are not printed, only sanity-checked.
+
+  NOTE
+    the function below is NOT thread-safe !!!
+*/
+void print_lockhash(LOCKMAN *lm)
+{
+  /* the lookup may fail to allocate the array level and return NULL */
+  LOCK **head= (LOCK **)_lf_dynarray_lvalue(&lm->array, 0);
+  LOCK *el= head ? *head : NULL;
+  printf("hash: size %u count %u\n", (uint) lm->size, (uint) lm->count);
+  while (el)
+  {
+    intptr next= el->link;
+    if (el->hashnr & 1)
+    {
+      /* resource is 64-bit, print it in full even where long is 32-bit */
+      printf("0x%08lx { resource %llu, loid %u, lock %s",
+             (ulong) el->hashnr, (unsigned long long) el->resource, el->loid,
+             lock2str[el->lock]);
+      if (el->flags & IGNORE_ME) printf(" IGNORE_ME");
+      if (el->flags & UPGRADED) printf(" UPGRADED");
+      if (el->flags & ACTIVE) printf(" ACTIVE");
+      if (DELETED(next)) printf(" ***DELETED***");
+      printf("}\n");
+    }
+    else
+    {
+      /*printf("0x%08x { dummy }\n", el->hashnr);*/
+      DBUG_ASSERT(el->resource == 0 && el->loid == 0 && el->lock == X);
+    }
+    el= PTR(next);
+  }
+}
+#endif
diff --git a/storage/maria/lockman.h b/storage/maria/lockman.h
new file mode 100644
index 00000000000..82ab483896f
--- /dev/null
+++ b/storage/maria/lockman.h
@@ -0,0 +1,76 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _lockman_h
+#define _lockman_h
+
+/*
+  Lock levels:
+  ^^^^^^^^^^^
+
+  N    - "no lock", not a lock, used sometimes internally to simplify the code
+  S    - Shared
+  X    - eXclusive
+  IS   - Intention Shared
+  IX   - Intention eXclusive
+  SIX  - Shared + Intention eXclusive
+  LS   - Loose Shared
+  LX   - Loose eXclusive
+  SLX  - Shared + Loose eXclusive
+  LSIX - Loose Shared + Intention eXclusive
+
+  The order of the values matters: they are used as indexes into the
+  10x10 lock matrices in lockman.c, whose rows and columns are listed
+  in the same order.
+*/
+enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };
+
+struct lockman_lock;
+
+typedef struct st_lock_owner LOCK_OWNER;
+struct st_lock_owner {
+  LF_PINS  *pins;           /* must be allocated from lockman's pinbox       */
+  struct lockman_lock *all_locks; /* a LIFO                                  */
+  LOCK_OWNER  *waiting_for; /* set by lockman_getlock() while it waits       */
+  pthread_cond_t  *cond;    /* transactions waiting for this, wait on 'cond' */
+  pthread_mutex_t *mutex;   /* mutex is required to use 'cond'               */
+  uint16    loid;           /* id of this owner, lockman_getlock() asserts
+                               that it is not 0                              */
+};
+
+typedef LOCK_OWNER *loid_to_lo_func(uint16); /* maps an owner id to its owner */
+typedef struct {
+  LF_DYNARRAY array;                    /* hash itself */
+  LF_ALLOCATOR alloc;                   /* allocator for elements */
+  int32 volatile size;                  /* size of array */
+  int32 volatile count;                 /* number of elements in the hash */
+  uint lock_timeout;                    /* max lock wait; lockman_getlock()
+                                           multiplies it by 1000000 to get
+                                           nanoseconds */
+  loid_to_lo_func *loid_to_lo;          /* provided by the application */
+} LOCKMAN;
+#define DIDNT_GET_THE_LOCK 0 /* lockman_getlock(): the wait has timed out */
+enum lockman_getlock_result {
+  NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT,
+  GOT_THE_LOCK,                 /* no need to lock a subresource */
+  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
+  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE /* only need to check that
+                                   the subresource could be locked */
+};
+
+/* lock manager API, see lockman.c for the description */
+void lockman_init(LOCKMAN *, loid_to_lo_func *, uint);
+void lockman_destroy(LOCKMAN *);
+enum lockman_getlock_result lockman_getlock(LOCKMAN *lm, LOCK_OWNER *lo,
+                                            uint64 resource,
+                                            enum lockman_lock_type lock);
+int lockman_release_locks(LOCKMAN *, LOCK_OWNER *);
+
+/* the guard must match the one around the definition in lockman.c */
+#ifdef MY_LF_EXTRA_DEBUG
+void print_lockhash(LOCKMAN *lm);
+#endif
+
+#endif
diff --git a/storage/maria/ma_bitmap.c b/storage/maria/ma_bitmap.c
new file mode 100644
index 00000000000..c0763b0612d
--- /dev/null
+++ b/storage/maria/ma_bitmap.c
@@ -0,0 +1,2910 @@
+/* Copyright (C) 2007 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Bitmap handling (for records in block)
+
+  The data file starts with a bitmap page, followed by as many data
+  pages as the bitmap can cover. After this there is a new bitmap page
+  and more data pages etc.
+
+  The bitmap code assumes there is always an active bitmap page and thus
+  that there is at least one bitmap page in the file
+
+  Structure of bitmap page:
+
+  Fixed size records (to be implemented later):
+
+  2 bits are used to indicate:
+
+  0      Empty
+  1      0-75 % full  (at least room for 2 records)
+  2      75-100 % full (at least room for one record)
+  3      100 % full    (no more room for records)
+
+  Assuming 8K pages, this will allow us to map:
+  8192 (bytes per page) * 4 (pages mapped per byte) * 8192 (page size)= 256M
+
+  (For Maria this will be 7*4 * 8192 = 224K smaller because of LSN)
+
+  Note that for fixed size rows, we can't add more columns without doing
+  a full reorganization of the table. The user can always force a dynamic
+  size row format by specifying ROW_FORMAT=dynamic.
+
+
+  Dynamic size records:
+
+  3 bits are used to indicate				Bytes free in 8K page
+
+  0      Empty page					8176 (head or tail)
+  1      0-30 % full  (at least room for 3 records)	5724
+  2      30-60 % full (at least room for 2 records)	3271
+  3      60-90 % full (at least room for one record)	818
+  4      100 % full   (no more room for records)	0
+  5      Tail page,  0-40 % full			4906
+  6      Tail page,  40-80 % full			1636
+  7      Full tail page or full blob page		0
+
+  Assuming 8K pages, this will allow us to map:
+  8192 (bytes per page) * 8 bits/byte / 3 bits/page * 8192 (page size)= 170.7M
+
+  Note that values 1-3 may be adjusted for each individual table based on
+  'min record length'.  Tail pages are for overflow data which can be of
+  any size and thus don't have to be adjusted for different tables.
+  If we add more columns to the table, some of the originally calculated
+  'cut off' points may not be optimal, but they shouldn't be 'drastically
+  wrong'.
+
+  When allocating data from the bitmap, we are trying to do it in a
+  'best fit' manner. Blobs and varchar blocks are given out in large
+  continuous extents to allow fast access to these. Before allowing a
+  row to 'flow over' to other blocks, we will compact the page and use
+  all space on it. If there are many rows in the page, we will ensure
+  there is *LEFT_TO_GROW_ON_SPLIT* bytes left on the page to allow other
+  rows to grow.
+
+  The bitmap format allows us to extend the row file in big chunks, if needed.
+
+  When calculating the size for a packed row, we will calculate the following
+  things separately:
+  - Row header + null_bits + empty_bits fixed size segments etc.
+  - Size of all char/varchar fields
+  - Size of each blob field
+
+  The bitmap handler will get all the above information and return
+  either one page or a set of pages to put the different parts.
+
+  Bitmaps are read on demand in response to insert/delete/update operations.
+  The following bitmap pointers will be cached and stored on disk on close:
+  - Current insert_bitmap;  When inserting new data we will first try to
+    fill this one.
+  - First bitmap which is not completely full.  This is updated when we
+    free data with an update or delete.
+
+  While flushing out bitmaps, we will cache the status of the bitmap in memory
+  to avoid having to read a bitmap for insert of new data that will not
+  be of any use
+  - Total empty space
+  - Largest number of continuous pages
+
+  Bitmap ONLY goes to disk in the following scenarios
+  - The file is closed (and we flush all changes to disk)
+  - On checkpoint
+  (Ie: When we do a checkpoint, we have to ensure that all bitmaps are
+  put on disk even if they are not in the page cache).
+  - When explicitly requested (for example on backup or after recovery,
+  to simplify things)
+
+ The flow of writing a row is that:
+ - Lock the bitmap
+ - Decide which data pages we will write to
+ - Mark them full in the bitmap page so that other threads do not try to
+    use the same data pages as us
+ - We unlock the bitmap
+ - Write the data pages
+ - Lock the bitmap
+ - Correct the bitmap page with the true final occupation of the data
+   pages (that is, we marked pages full but when we are done we realize
+   we didn't fill them)
+ - Unlock the bitmap.
+*/
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+
+#define FULL_HEAD_PAGE 4
+#define FULL_TAIL_PAGE 7
+
+/*#define WRONG_BITMAP_FLUSH 1*/ /*define only for provoking bugs*/
+#undef WRONG_BITMAP_FLUSH
+
+static my_bool _ma_read_bitmap_page(MARIA_HA *info,
+                                    MARIA_FILE_BITMAP *bitmap,
+                                    pgcache_page_no_t page);
+static my_bool _ma_bitmap_create_missing(MARIA_HA *info,
+                                         MARIA_FILE_BITMAP *bitmap,
+                                         pgcache_page_no_t page);
+
+/*
+  Write the in-memory bitmap page to the page cache
+
+  SYNOPSIS
+    write_changed_bitmap()
+    share		Share handler (supplies the page cache)
+    bitmap		Bitmap whose 'map' buffer is written as page 'page'
+
+  NOTES
+    Always sets bitmap->changed_not_flushed.
+    When bitmap->non_flushable is 0 the page is written unpinned.
+    Otherwise the page is written with PAGECACHE_PIN and the link returned
+    by the page cache is appended to bitmap->pinned_pages.
+    bitmap->changed is not modified here.
+
+  RETURN
+    Result of pagecache_write()
+*/
+
+static inline my_bool write_changed_bitmap(MARIA_SHARE *share,
+                                           MARIA_FILE_BITMAP *bitmap)
+{
+  DBUG_ENTER("write_changed_bitmap");
+  DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
+  DBUG_ASSERT(bitmap->file.write_callback != 0);
+  DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+
+  /*
+    Mark that a bitmap page has been written to page cache and we have
+    to flush it during checkpoint.
+  */
+  bitmap->changed_not_flushed= 1;
+
+  if ((bitmap->non_flushable == 0)
+#ifdef WRONG_BITMAP_FLUSH
+      || 1
+#endif
+      )
+  {
+    my_bool res= pagecache_write(share->pagecache,
+                                 &bitmap->file, bitmap->page, 0,
+                                 bitmap->map, PAGECACHE_PLAIN_PAGE,
+                                 PAGECACHE_LOCK_LEFT_UNLOCKED,
+                                 PAGECACHE_PIN_LEFT_UNPINNED,
+                                 PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE);
+    DBUG_RETURN(res);
+  }
+  else
+  {
+    MARIA_PINNED_PAGE page_link;
+    int res= pagecache_write(share->pagecache,
+                             &bitmap->file, bitmap->page, 0,
+                             bitmap->map, PAGECACHE_PLAIN_PAGE,
+                             PAGECACHE_LOCK_LEFT_UNLOCKED, PAGECACHE_PIN,
+                             PAGECACHE_WRITE_DELAY, &page_link.link,
+                             LSN_IMPOSSIBLE);
+    page_link.unlock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+    page_link.changed= 1;
+    push_dynamic(&bitmap->pinned_pages, (void*) &page_link); /* result not checked */
+    DBUG_RETURN(res);
+  }
+}
+
+/*
+  Initialize bitmap variables in share
+
+  SYNOPSIS
+    _ma_bitmap_init()
+    share		Share handler
+    file		data file handler
+
+  NOTES
+   This is called the first time a file is opened.
+
+   Allocates the bitmap page buffer and the pinned-pages array, derives
+   total_size and pages_covered from the block size, fills in the free-space
+   limit for each of the 8 bit patterns and initializes the bitmap mutex
+   and condition.  No bitmap page is read here.
+
+   If my_init_dynamic_array() fails, the buffer already stored in
+   bitmap->map is not freed by this function.
+   NOTE(review): confirm that the caller's error path releases it.
+
+  RETURN
+    0   ok
+    1   error
+*/
+
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file)
+{
+  uint aligned_bit_blocks;
+  uint max_page_size;
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  uint size= share->block_size;
+#ifndef DBUG_OFF
+  /* We want to have a copy of the bitmap to be able to print differences */
+  size*= 2;
+#endif
+
+  if (((bitmap->map= (uchar*) my_malloc(size, MYF(MY_WME))) == NULL) ||
+      my_init_dynamic_array(&bitmap->pinned_pages,
+                            sizeof(MARIA_PINNED_PAGE), 1, 1))
+    return 1;
+
+  bitmap->block_size= share->block_size;
+  bitmap->file.file= file;
+  _ma_bitmap_set_pagecache_callbacks(&bitmap->file, share);
+
+  /* Size needs to be aligned on 6 */
+  aligned_bit_blocks= (share->block_size - PAGE_SUFFIX_SIZE) / 6;
+  bitmap->total_size= aligned_bit_blocks * 6;
+  /*
+    In each 6 bytes, we have 6*8/3 = 16 pages covered
+    The +1 is to add the bitmap page, as this doesn't have to be covered
+  */
+  bitmap->pages_covered= aligned_bit_blocks * 16 + 1;
+  bitmap->flush_all_requested= 0;
+  bitmap->non_flushable= 0;
+
+  /* Update size for bits */
+  /* TODO; Make this dependent of the row size */
+  max_page_size= share->block_size - PAGE_OVERHEAD_SIZE + DIR_ENTRY_SIZE;
+  bitmap->sizes[0]= max_page_size;              /* Empty page */
+  bitmap->sizes[1]= max_page_size - max_page_size * 30 / 100; /* 30 % used */
+  bitmap->sizes[2]= max_page_size - max_page_size * 60 / 100; /* 60 % used */
+  bitmap->sizes[3]= max_page_size - max_page_size * 90 / 100; /* 90 % used */
+  bitmap->sizes[4]= 0;                          /* Full page */
+  bitmap->sizes[5]= max_page_size - max_page_size * 40 / 100; /* Tail, 40 % used */
+  bitmap->sizes[6]= max_page_size - max_page_size * 80 / 100; /* Tail, 80 % used */
+  bitmap->sizes[7]= 0;                          /* Full tail or blob page */
+
+  pthread_mutex_init(&share->bitmap.bitmap_lock, MY_MUTEX_INIT_SLOW);
+  pthread_cond_init(&share->bitmap.bitmap_cond, 0);
+
+  _ma_bitmap_reset_cache(share);
+
+  if (share->state.first_bitmap_with_space == ~(pgcache_page_no_t) 0)
+  {
+    /* Start scanning for free space from start of file */
+    share->state.first_bitmap_with_space = 0;
+  }
+  return 0;
+}
+
+
+/*
+  Release everything set up by _ma_bitmap_init
+
+  SYNOPSIS
+    _ma_bitmap_end()
+    share		Share handler
+
+  NOTES
+    _ma_bitmap_flush() is called first; after that the mutex, the
+    condition, the pinned-pages array and the map buffer are released and
+    the map pointer is cleared.
+
+  RETURN
+    Result of _ma_bitmap_flush()
+*/
+
+my_bool _ma_bitmap_end(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  my_bool flush_result= _ma_bitmap_flush(share);
+
+  safe_mutex_assert_owner(&share->close_lock);
+  pthread_mutex_destroy(&bitmap->bitmap_lock);
+  pthread_cond_destroy(&bitmap->bitmap_cond);
+  delete_dynamic(&bitmap->pinned_pages);
+  my_free(bitmap->map, MYF(MY_ALLOW_ZERO_PTR));
+  bitmap->map= 0;
+  return flush_result;
+}
+
+
+/*
+  Send updated bitmap to the page cache
+
+  SYNOPSIS
+    _ma_bitmap_flush()
+    share		Share handler
+
+  NOTES
+    In the future, _ma_bitmap_flush() will be called to flush changes done
+    by this thread (ie, checking the changed flag is ok). The reason we
+    check it again in the mutex is that if someone else did a flush at the
+    same time, we don't have to do the write.
+    This is also ok for _ma_scan_init_block_record() which does not want to
+    miss rows: it cares only for committed rows, that is, rows for which there
+    was a commit before our transaction started; as commit and transaction's
+    start are protected by the same LOCK_trn_list mutex, we see memory at
+    least as new as at other transaction's commit time, so if the committed
+    rows caused bitmap->changed to be true, we see it; if we see 0 it really
+    means a flush happened since then. So, it's ok to read without bitmap's
+    mutex.
+
+  RETURN
+    0    ok
+    1    error
+*/
+
+my_bool _ma_bitmap_flush(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  my_bool error= 0;
+  DBUG_ENTER("_ma_bitmap_flush");
+
+  /* Unlocked test first; nothing to do if the page is not changed */
+  if (!bitmap->changed)
+    DBUG_RETURN(error);
+
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  /* Test again under the mutex: someone else may have flushed meanwhile */
+  if (bitmap->changed)
+  {
+    error= write_changed_bitmap(share, bitmap);
+    bitmap->changed= 0;
+  }
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Dirty-page filter that selects bitmap pages
+
+   @param  type                Page's type (not used)
+   @param  pageno              Page's number
+   @param  rec_lsn             Page's rec_lsn (not used)
+   @param  arg                 Pointer to pages_covered of bitmap; it is
+                               read as an ulong
+
+   @return Value of (pageno % pages_covered) == 0, that is 1 for a page
+           number that is a multiple of pages_covered and 0 otherwise.
+           NOTE(review): this relies on the numeric values of
+           enum pagecache_flush_filter_result; confirm against its
+           declaration.
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_bitmap_pages(enum pagecache_page_type type
+                          __attribute__ ((unused)),
+                          pgcache_page_no_t pageno,
+                          LSN rec_lsn __attribute__ ((unused)),
+                          void *arg)
+{
+  const ulong *pages_covered= (const ulong*) arg;
+  return ((pageno % *pages_covered) == 0);
+}
+
+
+/**
+   Flushes current bitmap page to the pagecache, and then all bitmap pages
+   from pagecache to the file. Used by Checkpoint.
+
+   @param  share               Table's share
+
+   @note Nothing is done (and 0 is returned) when neither bitmap->changed
+   nor bitmap->changed_not_flushed is set.  Otherwise bitmap_lock is kept
+   from entry to exit, except while sleeping in pthread_cond_wait() until
+   non_flushable becomes 0.
+
+   @return Operation status
+     @retval 0      ok
+     @retval != 0   writing the current page to the page cache failed, or
+                    the page cache flush reported a
+                    PCFLUSH_PINNED_AND_ERROR bit
+*/
+
+my_bool _ma_bitmap_flush_all(MARIA_SHARE *share)
+{
+  my_bool res= 0;
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  DBUG_ENTER("_ma_bitmap_flush_all");
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  if (bitmap->changed || bitmap->changed_not_flushed)
+  {
+    bitmap->flush_all_requested++;
+#ifndef WRONG_BITMAP_FLUSH
+    while (bitmap->non_flushable > 0)
+    {
+      DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
+      pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+    }
+#endif
+    DBUG_ASSERT(bitmap->flush_all_requested == 1);
+    /*
+      Bitmap is in a flushable state: its contents in memory are reflected by
+      log records (complete REDO-UNDO groups) and all bitmap pages are
+      unpinned. We keep the mutex to preserve this situation, and flush to the
+      file.
+    */
+    if (bitmap->changed)
+    {
+      bitmap->changed= FALSE;
+      res= write_changed_bitmap(share, bitmap);
+    }
+    /*
+      We do NOT use FLUSH_KEEP_LAZY because we must be sure that bitmap
+      pages have been flushed. That's a condition of correctness of
+      Recovery: data pages may have been all flushed, if we write the
+      checkpoint record Recovery will start from after their REDOs. If
+      bitmap page was not flushed, as the REDOs about it will be skipped, it
+      will wrongly not be recovered. If bitmap pages had a rec_lsn it would
+      be different.
+      There should be no pinned pages as bitmap->non_flushable==0.
+    */
+    if (flush_pagecache_blocks_with_filter(share->pagecache,
+                                           &bitmap->file, FLUSH_KEEP,
+                                           filter_flush_bitmap_pages,
+                                           &bitmap->pages_covered) &
+        PCFLUSH_PINNED_AND_ERROR)
+      res= TRUE;
+    bitmap->changed_not_flushed= FALSE;
+    bitmap->flush_all_requested--;
+    /*
+      Some well-behaved threads may be waiting for flush_all_requested to
+      become false, wake them up.
+    */
+    DBUG_PRINT("info", ("bitmap flusher waking up others"));
+    pthread_cond_broadcast(&bitmap->bitmap_cond);
+  }
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Lock bitmap from being used by another thread
+
+   @fn _ma_bitmap_lock()
+   @param  share               Table's share
+
+   @notes
+   This is a temporary solution for allowing someone to delete an inserted
+   duplicate-key row while someone else is doing concurrent inserts.
+   This is ok for now as duplicate key errors are not that common.
+
+   In the future we will add locks for row-pages to ensure two threads don't
+   work at the same time on the same page.
+
+   Does nothing when share->now_transactional is not set.
+   On return bitmap_lock is NOT held: the reservation consists of
+   non_flushable being set to 1 and flush_all_requested staying
+   incremented, both of which _ma_bitmap_unlock() reverts.
+*/
+
+void _ma_bitmap_lock(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  DBUG_ENTER("_ma_bitmap_lock");
+
+  if (!share->now_transactional)
+    DBUG_VOID_RETURN;
+
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  bitmap->flush_all_requested++;
+  while (bitmap->non_flushable)
+  {
+    DBUG_PRINT("info", ("waiting for bitmap to be flushable"));
+    pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+  }
+  /*
+    Ensure that _ma_bitmap_flush_all() and _ma_bitmap_lock() are blocked.
+    ma_bitmap_flushable() is blocked thanks to 'flush_all_requested'.
+  */
+  bitmap->non_flushable= 1;
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_VOID_RETURN;
+}
+  
+/**
+   @brief Unlock bitmap after _ma_bitmap_lock()
+
+   @fn _ma_bitmap_unlock()
+   @param  share               Table's share
+
+   @notes
+   Does nothing when share->now_transactional is not set.
+   Decrements flush_all_requested and resets non_flushable to 0 under
+   bitmap_lock, then broadcasts on bitmap_cond after the mutex has been
+   released.  The DBUG_ASSERT reads both fields before the mutex is taken.
+*/
+
+void _ma_bitmap_unlock(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  DBUG_ENTER("_ma_bitmap_unlock");
+
+  if (!share->now_transactional)
+    DBUG_VOID_RETURN;
+  DBUG_ASSERT(bitmap->flush_all_requested > 0 && bitmap->non_flushable == 1);
+
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  bitmap->flush_all_requested--;
+  bitmap->non_flushable= 0;
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  pthread_cond_broadcast(&bitmap->bitmap_cond);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Unpin all pinned bitmap pages
+
+  @param  share            Table's share
+
+  @note Nothing is returned. Every entry stored in bitmap->pinned_pages is
+  handed to pagecache_unlock_by_link() with PAGECACHE_UNPIN, newest entry
+  first, and the array is then marked empty.
+
+  @note This unpins pages pinned by other threads.
+*/
+
+static void _ma_bitmap_unpin_all(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  MARIA_PINNED_PAGE *first= ((MARIA_PINNED_PAGE*)
+                             dynamic_array_ptr(&bitmap->pinned_pages, 0));
+  uint left= bitmap->pinned_pages.elements;
+  DBUG_ENTER("_ma_bitmap_unpin_all");
+  DBUG_PRINT("info", ("pinned: %u", bitmap->pinned_pages.elements));
+
+  /* Walk the array backwards, from the last element down to the first */
+  while (left-- != 0)
+  {
+    MARIA_PINNED_PAGE *entry= first + left;
+    pagecache_unlock_by_link(share->pagecache, entry->link,
+                             entry->unlock, PAGECACHE_UNPIN,
+                             LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, TRUE, TRUE);
+  }
+  bitmap->pinned_pages.elements= 0;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Reset the bitmap in memory to an all-zero bitmap
+
+  SYNOPSIS
+    _ma_bitmap_delete_all()
+    share		Share handler
+
+  NOTES
+    This is called on maria_delete_all_rows (truncate data file).
+    Nothing is done when no bitmap buffer is allocated (during create).
+    Otherwise the buffer is zeroed and becomes bitmap page 0, marked as
+    changed, with used_size set to total_size.
+*/
+
+void _ma_bitmap_delete_all(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  DBUG_ENTER("_ma_bitmap_delete_all");
+
+  if (!bitmap->map)                             /* In create */
+    DBUG_VOID_RETURN;
+
+  bzero(bitmap->map, bitmap->block_size);
+  bitmap->changed= 1;
+  bitmap->page= 0;
+  bitmap->used_size= bitmap->total_size;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Make the bitmap forget everything it has cached
+
+   @fn    _ma_bitmap_reset_cache()
+   @param share		Maria share
+
+   @notes
+   This is called after we have swapped file descriptors and we want
+   bitmap to forget all cached information.
+   Nothing is done when no bitmap buffer is allocated.
+*/
+
+void _ma_bitmap_reset_cache(MARIA_SHARE *share)
+{
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+
+  if (!bitmap->map)                             /* Not using bitmap */
+    return;
+
+  /* Forget changes in current bitmap page */
+  bitmap->changed= 0;
+
+  /*
+    No page can be read at this point, as there may be no active page
+    cache yet.  Instead leave a dummy page in memory: page number ~0,
+    every byte 255 (everything full) and not changed.
+  */
+  bitmap->page= ~(ulonglong) 0;
+  bitmap->used_size= bitmap->total_size;
+  bfill(bitmap->map, share->block_size, 255);
+#ifndef DBUG_OFF
+  /* Keep the copy used for printing differences in sync */
+  memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+}
+
+
+/*
+  Find the bitmap pattern of the fullest head page that still has room
+  for 'size' bytes
+
+  SYNOPSIS
+    size_to_head_pattern()
+    bitmap      Bitmap
+    size        Requested size
+
+  RETURN
+    0-3         For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+  uint pattern;
+
+  /* Test pattern 3 first, then 2, then 1 */
+  for (pattern= 3; pattern != 0; pattern--)
+  {
+    if (size <= bitmap->sizes[pattern])
+      return pattern;
+  }
+  DBUG_ASSERT(size <= bitmap->sizes[0]);
+  return 0;
+}
+
+
+/*
+  Map the number of free bytes on a head page to its bitmap pattern
+
+  SYNOPSIS
+    _ma_free_size_to_head_pattern()
+    bitmap      Bitmap
+    size        Free bytes on the page
+
+  RETURN
+    0-4  (Possible bitmap patterns for head block)
+*/
+
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+  uint pattern;
+
+  /*
+    Pattern N (4..1) applies when fewer than sizes[N-1] bytes are free;
+    the limits are tested starting with sizes[3]
+  */
+  for (pattern= 4; pattern != 0; pattern--)
+  {
+    if (size < bitmap->sizes[pattern - 1])
+      return pattern;
+  }
+  return 0;
+}
+
+
+/*
+  Find the bitmap pattern of the fullest tail page that still has room
+  for 'size' bytes
+
+  SYNOPSIS
+    size_to_tail_pattern()
+    bitmap      Bitmap
+    size        Requested size
+
+  RETURN
+    0, 5 or 6   For a description of the bitmap sizes, see the header
+*/
+
+static uint size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+  uint pattern= 0;
+
+  if (size <= bitmap->sizes[6])
+    pattern= 6;
+  else if (size <= bitmap->sizes[5])
+    pattern= 5;
+  else
+  {
+    DBUG_ASSERT(size <= bitmap->sizes[0]);
+  }
+  return pattern;
+}
+
+
+/*
+  Map the number of free bytes on a tail page to its bitmap pattern
+
+  SYNOPSIS
+    free_size_to_tail_pattern()
+    bitmap      Bitmap
+    size        Free bytes on the page
+
+  RETURN
+    0, 5, 6, 7   For a description of the bitmap sizes, see the header
+*/
+
+static uint free_size_to_tail_pattern(MARIA_FILE_BITMAP *bitmap, uint size)
+{
+  uint pattern;
+
+  if (size >= bitmap->sizes[0])
+    pattern= 0;                                 /* Revert to empty page */
+  else if (size < bitmap->sizes[6])
+    pattern= 7;
+  else if (size < bitmap->sizes[5])
+    pattern= 6;
+  else
+    pattern= 5;
+  return pattern;
+}
+
+
+/*
+  Return size guaranteed to be available on a page with a given pattern
+
+  SYNOPSIS
+    pattern_to_size()
+    bitmap      Bitmap
+    pattern     Pattern (0-7)
+
+  RETURN
+    bitmap->sizes[pattern]
+*/
+
+static inline uint pattern_to_size(MARIA_FILE_BITMAP *bitmap, uint pattern)
+{
+  uint guaranteed_size;
+  DBUG_ASSERT(pattern <= 7);
+  guaranteed_size= bitmap->sizes[pattern];
+  return guaranteed_size;
+}
+
+
+/*
+  Print bitmap changes for debugging
+
+  SYNOPSIS
+  _ma_print_bitmap_changes()
+  bitmap	Bitmap to print
+
+  IMPLEMENTATION
+    Prints every page whose pattern differs between bitmap->map and the
+    copy kept in bitmap->map+bitmap->block_size, then refreshes that copy
+    from bitmap->map.
+    Only the first bitmap->used_size bytes are compared.
+*/
+
+#ifndef DBUG_OFF
+
+const char *bits_to_txt[]=
+{
+  "empty", "00-30% full", "30-60% full", "60-90% full", "full",
+  "tail 00-40 % full", "tail 40-80 % full", "tail/blob full"
+};
+
+static void _ma_print_bitmap_changes(MARIA_FILE_BITMAP *bitmap)
+{
+  uchar *pos, *end, *org_pos;
+  ulong page;
+  DBUG_ENTER("_ma_print_bitmap_changes");
+
+  end= bitmap->map + bitmap->used_size;
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE,"\nBitmap page changes at page: %lu  bitmap: 0x%lx\n",
+          (ulong) bitmap->page, (long) bitmap->map);
+
+  page= (ulong) bitmap->page+1;         /* First page after the bitmap page */
+  for (pos= bitmap->map, org_pos= bitmap->map + bitmap->block_size ;
+       pos < end ;
+       pos+= 6, org_pos+= 6)
+  {
+    ulonglong bits= uint6korr(pos);    /* 6 bytes = 6*8/3= 16 patterns */
+    ulonglong org_bits= uint6korr(org_pos);
+    uint i;
+
+    /*
+      Test if there is any changes in the next 16 bitmaps (to not have to
+      loop through all bits if we know they are the same)
+    */
+    if (bits != org_bits)
+    {
+      for (i= 0; i < 16 ; i++, bits>>= 3, org_bits>>= 3)
+      {
+        if ((bits & 7) != (org_bits & 7))
+          fprintf(DBUG_FILE, "Page: %8lu  %s -> %s\n", page+i,
+                  bits_to_txt[org_bits & 7], bits_to_txt[bits & 7]);
+      }
+    }
+    page+= 16;
+  }
+  fputc('\n', DBUG_FILE);
+  DBUG_UNLOCK_FILE;
+  memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Print content of a bitmap page for debugging
+
+  SYNOPSIS
+    _ma_print_bitmap()
+    bitmap	Bitmap handler (only total_size is read)
+    data		Page image to dump
+    page		Page number printed for the bitmap page; the
+			patterns are numbered from page+1
+
+  NOTES
+    bitmap->total_size bytes of 'data' are scanned.
+    Only pages with a non-zero pattern are listed.
+*/
+
+void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
+                      pgcache_page_no_t page)
+{
+  uchar *pos, *end;
+  char llbuff[22];
+
+  DBUG_LOCK_FILE;
+  fprintf(DBUG_FILE,"\nDump of bitmap page at %s\n", llstr(page, llbuff));
+
+  page++;                                       /* Skip bitmap page */
+  for (pos= data, end= pos + bitmap->total_size;
+       pos < end ;
+       pos+= 6)
+  {
+    ulonglong bits= uint6korr(pos);    /* 6 bytes = 6*8/3= 16 patterns */
+
+    /*
+      Skip a group of 16 pages at once when all of them have pattern 0
+      (to not have to loop through all bits if nothing would be printed)
+    */
+    if (bits)
+    {
+      uint i;
+      for (i= 0; i < 16 ; i++, bits>>= 3)
+      {
+        if (bits & 7)
+          fprintf(DBUG_FILE, "Page: %8s  %s\n", llstr(page+i, llbuff),
+                  bits_to_txt[bits & 7]);
+      }
+    }
+    page+= 16;
+  }
+  fputc('\n', DBUG_FILE);
+  DBUG_UNLOCK_FILE;
+}
+
+#endif /* DBUG_OFF */
+
+
+/***************************************************************************
+  Reading & writing bitmap pages
+***************************************************************************/
+
+/*
+  Read a given bitmap page
+
+  SYNOPSIS
+    _ma_read_bitmap_page()
+    info                Maria handler
+    bitmap              Bitmap handler
+    page                Page to read
+
+  TODO
+    Update 'bitmap->used_size' to real size of used bitmap
+
+  NOTE
+    We don't always have share->bitmap.bitmap_lock here
+    (when called from _ma_check_bitmap_data() for example).
+
+    bitmap->page is set to 'page' before anything is read, so it is
+    changed also when the function fails.
+    If the page does not lie completely inside the current data file
+    length, nothing is read and the work is handed over to
+    _ma_bitmap_create_missing().
+    The caller must have written out any changed bitmap first (see the
+    DBUG_ASSERT on bitmap->changed).
+
+  RETURN
+    0  ok
+    1  error  (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_read_bitmap_page(MARIA_HA *info,
+                                    MARIA_FILE_BITMAP *bitmap,
+                                    pgcache_page_no_t page)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool res;
+  DBUG_ENTER("_ma_read_bitmap_page");
+  DBUG_ASSERT(page % bitmap->pages_covered == 0);
+  DBUG_ASSERT(!bitmap->changed);
+
+  bitmap->page= page;
+  if (((page + 1) * bitmap->block_size) > share->state.state.data_file_length)
+  {
+    /* Inexistent or half-created page */
+    res= _ma_bitmap_create_missing(info, bitmap, page);
+    DBUG_RETURN(res);
+  }
+  bitmap->used_size= bitmap->total_size;
+  DBUG_ASSERT(share->pagecache->block_size == bitmap->block_size);
+  res= pagecache_read(share->pagecache,
+                      &bitmap->file, page, 0,
+                      bitmap->map, PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == NULL;
+
+  /*
+    We can't check maria_bitmap_marker here as if the bitmap page
+    previously had a true checksum and the user switched mode to not checksum
+    this may have any value, except maria_normal_page_marker.
+
+    Using maria_normal_page_marker gives us a protection against bugs
+    when running without any checksums.
+  */
+
+#ifndef DBUG_OFF
+  if (!res)
+    memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Switch the in-memory bitmap to another bitmap page
+
+  SYNOPSIS
+  _ma_change_bitmap_page()
+    info                Maria handler
+    bitmap              Bitmap handler
+    page                Bitmap page to read
+
+  NOTES
+   If the current bitmap page is changed it is written out first; when
+   that write fails the new page is not read.
+   Pages outside of the data file are handled by _ma_read_bitmap_page().
+
+  RETURN
+    0  ok
+    1  error  (Error writing old bitmap or reading bitmap page)
+*/
+
+static my_bool _ma_change_bitmap_page(MARIA_HA *info,
+                                      MARIA_FILE_BITMAP *bitmap,
+                                      pgcache_page_no_t page)
+{
+  DBUG_ENTER("_ma_change_bitmap_page");
+
+  if (bitmap->changed)
+  {
+    my_bool write_failed= write_changed_bitmap(info->s, bitmap);
+    if (write_failed)
+      DBUG_RETURN(1);
+    bitmap->changed= 0;
+  }
+  DBUG_RETURN(_ma_read_bitmap_page(info, bitmap, page));
+}
+
+
+/*
+  Switch to the next bitmap page worth looking at
+
+  SYNOPSIS
+    move_to_next_bitmap()
+    info                Maria handler
+    bitmap              Bitmap handle
+
+  NOTES
+    If state->first_bitmap_with_space is set (not ~0) and differs from the
+    current bitmap page, that page is used and the hint is reset to ~0.
+    Otherwise we go to the bitmap page that follows the current one.
+
+    The found bitmap may be full, so calling function may need to call this
+    repeatedly until it finds enough space.
+
+  TODO
+    Add cache of bitmaps to not read something that is not usable
+
+  RETURN
+    0  ok
+    1  error (either couldn't save old bitmap or read new one)
+*/
+
+static my_bool move_to_next_bitmap(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap)
+{
+  MARIA_STATE_INFO *state= &info->s->state;
+  pgcache_page_no_t current_page= bitmap->page;
+  pgcache_page_no_t next_page;
+  DBUG_ENTER("move_to_next_bitmap");
+
+  if (state->first_bitmap_with_space == ~(ulonglong) 0 ||
+      state->first_bitmap_with_space == current_page)
+  {
+    /* No usable hint, continue with the following bitmap page */
+    next_page= current_page + bitmap->pages_covered;
+  }
+  else
+  {
+    /* Jump to the hinted page; the hint is consumed */
+    next_page= state->first_bitmap_with_space;
+    state->first_bitmap_with_space= ~(ulonglong) 0;
+  }
+  DBUG_RETURN(_ma_change_bitmap_page(info, bitmap, next_page));
+}
+
+
+/****************************************************************************
+ Allocate data in bitmaps
+****************************************************************************/
+
+/*
+  Store data in 'block' and mark the place used in the bitmap
+
+  SYNOPSIS
+    fill_block()
+    bitmap		Bitmap handle
+    block		Store data about what we found
+    best_data		Pointer to best 6 uchar aligned area in bitmap->map
+    best_pos		Which bit in *best_data the area starts
+                        0 = first bit pattern, 1 second bit pattern etc
+    best_bits		The original value of the bits at best_pos
+    fill_pattern	Bitmap pattern to store in best_data[best_pos]
+
+   NOTES
+    We mark all pages to be 'TAIL's, which means that
+    block->page_count is really a row position inside the page.
+
+    block->page is bitmap->page + 1 + the index of the pattern inside the
+    bitmap page; block->empty_space is the size guaranteed by best_bits.
+
+    A 3 bit pattern can cross a byte boundary, which is why 2 bytes are
+    read and written.  For the last pattern of a 6 byte group (best_pos 15)
+    these 2 bytes are byte 5 of the group and the byte following the
+    group; the bits outside the pattern are written back unchanged.
+
+    Sets bitmap->changed.
+*/
+
+static void fill_block(MARIA_FILE_BITMAP *bitmap,
+                       MARIA_BITMAP_BLOCK *block,
+                       uchar *best_data, uint best_pos, uint best_bits,
+                       uint fill_pattern)
+{
+  uint page, offset, tmp;
+  uchar *data;
+  DBUG_ENTER("fill_block");
+
+  /* For each 6 bytes we have 6*8/3= 16 patterns */
+  page= ((uint) (best_data - bitmap->map)) / 6 * 16 + best_pos;
+  DBUG_ASSERT(page + 1 < bitmap->pages_covered);
+  block->page= bitmap->page + 1 + page;
+  block->page_count= TAIL_PAGE_COUNT_MARKER;
+  block->empty_space= pattern_to_size(bitmap, best_bits);
+  block->sub_blocks= 0;
+  block->org_bitmap_value= best_bits;
+  block->used= BLOCKUSED_TAIL; /* See _ma_bitmap_release_unused() */
+
+  /*
+    Mark place used by reading/writing 2 bytes at a time to handle
+    bitmaps in overlapping bytes
+  */
+  best_pos*= 3;                         /* Pattern number -> bit number */
+  data= best_data+ best_pos / 8;        /* Byte holding the first bit */
+  offset= best_pos & 7;                 /* Bit position inside that byte */
+  tmp= uint2korr(data);
+
+  /* we turn off the 3 bits and replace them with fill_pattern */
+  tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+  int2store(data, tmp);
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Allocate data for head block
+
+  SYNOPSIS
+   allocate_head()
+   bitmap       bitmap
+   size         Size of data region we need to store
+   block        Store found information here
+
+   IMPLEMENTATION
+     Find the best-fit page to put a region of 'size'
+     This is defined as the first page of the set of pages
+     with the smallest free space that can hold 'size'.
+
+     Only the first bitmap->used_size bytes of the map are scanned.  If
+     no page fits and used_size is still below total_size, used_size is
+     extended by one 6 byte group (capped at total_size) and the first
+     page of that group is taken as an empty page.
+
+     The chosen page is marked FULL_HEAD_PAGE in the bitmap by
+     fill_block().
+
+   RETURN
+    0   ok    (block is updated)
+    1   error (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_head(MARIA_FILE_BITMAP *bitmap, uint size,
+                             MARIA_BITMAP_BLOCK *block)
+{
+  uint min_bits= size_to_head_pattern(bitmap, size);
+  uchar *data= bitmap->map, *end= data + bitmap->used_size;
+  uchar *best_data= 0;
+  uint best_bits= (uint) -1, best_pos;  /* -1 when compared as int below */
+  DBUG_ENTER("allocate_head");
+
+  LINT_INIT(best_pos);
+  DBUG_ASSERT(size <= FULL_PAGE_SIZE(bitmap->block_size));
+
+  for (; data < end; data+= 6)
+  {
+    ulonglong bits= uint6korr(data);    /* 6 bytes = 6*8/3= 16 patterns */
+    uint i;
+
+    /*
+      Skip common patterns
+      We can skip empty pages (if we already found a match) or
+      anything matching the following pattern as this will be either
+      a full page or a tail page
+      (the octal mask has the highest bit of each of the 16 patterns set,
+      so it matches when all 16 patterns are >= 4)
+    */
+    if ((!bits && best_data) ||
+        ((bits & LL(04444444444444444)) == LL(04444444444444444)))
+      continue;
+    for (i= 0; i < 16 ; i++, bits >>= 3)
+    {
+      uint pattern= (uint) (bits & 7);
+      if (pattern <= min_bits)
+      {
+        /* There is enough space here */
+        if ((int) pattern > (int) best_bits)
+        {
+          /*
+            There is more than enough space here and it's better than what
+            we have found so far. Remember it, as we will choose it if we
+            don't find anything in this bitmap page.
+          */
+          best_bits= pattern;
+          best_data= data;
+          best_pos= i;
+          if (pattern == min_bits)
+            goto found;                         /* Best possible match */
+        }
+      }
+    }
+  }
+  if (!best_data)                               /* Found no place */
+  {
+    if (data >= bitmap->map + bitmap->total_size)
+      DBUG_RETURN(1);                           /* No space in bitmap */
+    /* Allocate data at end of bitmap */
+    bitmap->used_size+= 6;
+    set_if_smaller(bitmap->used_size, bitmap->total_size);
+    best_data= data;
+    best_pos= best_bits= 0;
+  }
+
+found:
+  fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_HEAD_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Find a page in the active bitmap for a tail (best fit, as allocate_head)
+
+  SYNOPSIS
+   allocate_tail()
+   bitmap       Bitmap to search (bitmap->map up to bitmap->used_size)
+   size         Size of block we need to find, including DIR_ENTRY_SIZE
+   block        Store found information here (done by fill_block())
+
+  RETURN
+   0    ok      (block is updated)
+   1    error   (no space in bitmap; block is not touched)
+*/
+
+
+static my_bool allocate_tail(MARIA_FILE_BITMAP *bitmap, uint size,
+                             MARIA_BITMAP_BLOCK *block)
+{
+  uint min_bits= size_to_tail_pattern(bitmap, size);
+  uchar *data= bitmap->map, *end= data + bitmap->used_size;
+  uchar *best_data= 0;
+  uint best_bits= (uint) -1, best_pos;          /* -1: nothing found yet */
+  DBUG_ENTER("allocate_tail");
+  DBUG_PRINT("enter", ("size: %u", size));
+
+  LINT_INIT(best_pos);
+  /*
+    The limit includes DIR_ENTRY_SIZE as the caller has already added the
+    directory entry to 'size'. See call to allocate_tail() in find_tail().
+  */
+  DBUG_ASSERT(size <= MAX_TAIL_SIZE(bitmap->block_size) + DIR_ENTRY_SIZE);
+
+  for (; data < end; data += 6)
+  {
+    ulonglong bits= uint6korr(data);    /* 6 bytes = 6*8/3= 16 patterns */
+    uint i;
+
+    /*
+      Skip common patterns
+      We can skip empty pages (if we already found a match) or
+      the following patterns: 1-4 (head pages, not suitable for tail) or
+      7 (full tail page). See 'Dynamic size records' comment at start of file.
+
+      For now we only skip groups where all 16 pages are full tail pages
+      (all bits set) or full head pages (all patterns 4); this is cheap
+      to test and is a quite common case if we have blobs.
+    */
+
+    if ((!bits && best_data) || bits == LL(0xffffffffffff) ||
+        bits == LL(04444444444444444))
+      continue;
+    for (i= 0; i < 16; i++, bits >>= 3)
+    {
+      uint pattern= (uint) (bits & 7);
+      if (pattern <= min_bits && (!pattern || pattern >= 5))
+      {
+        if ((int) pattern > (int) best_bits)    /* Fuller than best so far */
+        {
+          best_bits= pattern;
+          best_data= data;
+          best_pos= i;
+          if (pattern == min_bits)
+            goto found;                         /* Can't be better */
+        }
+      }
+    }
+  }
+  if (!best_data)                               /* Found no place */
+  {
+    if (data >= bitmap->map + bitmap->total_size)
+      DBUG_RETURN(1);                           /* No space in bitmap */
+    /* Take the first page of the 6 bytes that follow the used part */
+    best_data= data;
+    bitmap->used_size+= 6;
+    set_if_smaller(bitmap->used_size, bitmap->total_size);
+    best_pos= best_bits= 0;                     /* Page 0 in group, empty */
+  }
+
+found:
+  fill_block(bitmap, block, best_data, best_pos, best_bits, FULL_TAIL_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Allocate an extent of full pages in the active bitmap
+
+  SYNOPSIS
+   allocate_full_pages()
+   bitmap       Bitmap to search and update
+   pages_needed Number of pages we would like to have in one extent
+   block        Store found information here
+   full_page    1 if we are not allowed to split extent
+
+  IMPLEMENTATION
+    We will return the smallest area >= pages_needed.  If there is no
+    such area and full_page is 0, we will return the biggest area that
+    satisfies area_size >= min(BLOB_SEGMENT_MIN_SIZE, pages_needed)
+
+    To speed up searches, we will only consider areas that have at least
+    16 free pages starting on a 16 page boundary.  Such an area is then
+    extended with all directly preceding and following free pages, so
+    that we don't get holes between areas.
+
+  RETURN
+   #            Number of pages allocated (block->page_count)
+   0            error   (no space in bitmap; block is not touched)
+*/
+
+static ulong allocate_full_pages(MARIA_FILE_BITMAP *bitmap,
+                                 ulong pages_needed,
+                                 MARIA_BITMAP_BLOCK *block, my_bool full_page)
+{
+  uchar *data= bitmap->map, *data_end= data + bitmap->used_size;
+  uchar *page_end= data + bitmap->total_size;
+  uchar *best_data= 0;
+  uint min_size;
+  uint best_area_size, best_prefix_area_size, best_suffix_area_size;
+  uint page, size;
+  ulonglong best_prefix_bits;
+  DBUG_ENTER("allocate_full_pages");
+  DBUG_PRINT("enter", ("pages_needed: %lu", pages_needed));
+
+  /* Following variables are only used if best_data is set */
+  LINT_INIT(best_prefix_bits);
+  LINT_INIT(best_prefix_area_size);
+  LINT_INIT(best_suffix_area_size);
+
+  min_size= pages_needed;
+  if (!full_page && min_size > BLOB_SEGMENT_MIN_SIZE)
+    min_size= BLOB_SEGMENT_MIN_SIZE;
+  best_area_size= ~(uint) 0;
+
+  for (; data < page_end; data+= 6)
+  {
+    ulonglong bits= uint6korr(data);    /* 6 bytes = 6*8/3= 16 patterns */
+    uchar *data_start;
+    ulonglong prefix_bits= 0;
+    uint area_size, prefix_area_size, suffix_area_size;
+
+    /* Find area with at least 16 free pages */
+    if (bits)
+      continue;
+    data_start= data;
+    /* Find size of area: count following groups that are also all free */
+    for (data+=6 ; data < data_end ; data+= 6)
+    {
+      if ((bits= uint6korr(data)))
+        break;
+    }
+    area_size= (uint) (data - data_start) / 6 * 16;
+    if (area_size >= best_area_size)
+      continue;
+    prefix_area_size= suffix_area_size= 0;
+    if (!bits)
+    {
+      /*
+        End of page; All the rest of the bits on page are part of area
+        This is needed because bitmap->used_size only covers the set bits
+        in the bitmap.
+      */
+      area_size+= (uint) (page_end - data) / 6 * 16;
+      if (area_size >= best_area_size)
+        break;
+      data= page_end;
+    }
+    else
+    {
+      /* Add the free pages at the start of the group that ended the area */
+      for (; !(bits & 7); bits >>= 3)
+        suffix_area_size++;
+      area_size+= suffix_area_size;
+    }
+    if (data_start != bitmap->map)
+    {
+      /* Add the free pages at the end of the group before the area */
+      bits= prefix_bits= uint6korr(data_start - 6);
+      DBUG_ASSERT(bits != 0);
+      /* 111 000 000 000 000 000 000 000 000 000 000 000 000 000 000 000 */
+      if (!(bits & LL(07000000000000000)))
+      {
+        data_start-= 6;
+        do
+        {
+          prefix_area_size++;
+          bits<<= 3;
+        } while (!(bits & LL(07000000000000000)));
+        area_size+= prefix_area_size;
+        /* Calculate offset to page from data_start */
+        prefix_area_size= 16 - prefix_area_size;
+      }
+    }
+    if (area_size >= min_size && area_size <= best_area_size)
+    {
+      best_data= data_start;
+      best_area_size= area_size;
+      best_prefix_bits= prefix_bits;
+      best_prefix_area_size= prefix_area_size;
+      best_suffix_area_size= suffix_area_size;
+
+      /* Prefer to put data in biggest possible area */
+      if (area_size <= pages_needed)
+        min_size= area_size;
+      else
+        min_size= pages_needed;
+    }
+  }
+  if (!best_data)
+    DBUG_RETURN(0);                             /* No room on page */
+
+  /*
+    Now allocate min(pages_needed, area_size), starting from
+    best_data + best_prefix_area_size
+  */
+  if (best_area_size > pages_needed)
+    best_area_size= pages_needed;
+
+  /* For each 6 bytes we have 6*8/3= 16 patterns */
+  page= ((uint) (best_data - bitmap->map) * 8) / 3 + best_prefix_area_size;
+  block->page= bitmap->page + 1 + page;
+  block->page_count= best_area_size;
+  block->empty_space= 0;
+  block->sub_blocks= 0;
+  block->org_bitmap_value= 0;
+  block->used= 0;
+  DBUG_ASSERT(page + best_area_size < bitmap->pages_covered);
+  DBUG_PRINT("info", ("page: %lu  page_count: %u",
+                      (ulong) block->page, block->page_count));
+
+  if (best_prefix_area_size)
+  {
+    ulonglong tmp;
+    /* Convert offset back to bits */
+    best_prefix_area_size= 16 - best_prefix_area_size;
+    if (best_area_size < best_prefix_area_size)
+    {
+      tmp= (LL(1) << best_area_size*3) - 1;
+      best_area_size= best_prefix_area_size;    /* for easy end test */
+    }
+    else
+      tmp= (LL(1) << best_prefix_area_size*3) - 1;
+    tmp<<= (16 - best_prefix_area_size) * 3;
+    DBUG_ASSERT((best_prefix_bits & tmp) == 0);
+    best_prefix_bits|= tmp;
+    int6store(best_data, best_prefix_bits);
+    if (!(best_area_size-= best_prefix_area_size))
+      goto end;                                 /* All fitted in prefix */
+    best_data+= 6;
+  }
+  best_area_size*= 3;                       /* Bits to set */
+  size= best_area_size/8;                   /* Bytes to set */
+  bfill(best_data, size, 255);
+  best_data+= size;
+  if ((best_area_size-= size * 8))
+  {
+    /* fill last uchar */
+    *best_data|= (uchar) ((1 << best_area_size) -1);
+    best_data++;
+  }
+  if (data_end < best_data)
+  {
+    bitmap->used_size= (uint) (best_data - bitmap->map);
+    DBUG_ASSERT(bitmap->used_size <= bitmap->total_size);
+  }
+
+end:
+  /* Reached by every path that stored bits in the map, prefix-only too */
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  DBUG_RETURN(block->page_count);
+}
+
+
+/****************************************************************************
+  Find right bitmaps where to store data
+****************************************************************************/
+
+/*
+  Locate a bitmap with room for the head block and reserve a page in it
+
+  SYNOPSIS
+    find_head()
+    info                Maria handler
+    length              Size of data region we need to store;
+                        DIR_ENTRY_SIZE is added to it here
+    position            Index in info->bitmap_blocks of the head block
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool find_head(MARIA_HA *info, uint length, uint position)
+{
+  MARIA_FILE_BITMAP *active_bitmap= &info->s->bitmap;
+  /* Entry is preallocated at _ma_init_block_record(); no need to extend */
+  MARIA_BITMAP_BLOCK *head_block=
+    dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+  /* The row may need an extra directory entry on the page */
+  uint wanted_size= length + DIR_ENTRY_SIZE;
+
+  for (;;)
+  {
+    if (!allocate_head(active_bitmap, wanted_size, head_block))
+      break;
+    /* Active bitmap has no room; continue with the next one */
+    if (move_to_next_bitmap(info, active_bitmap))
+      return 1;
+  }
+  return 0;
+}
+
+
+/*
+  Locate a bitmap that has room for a tail and reserve space in it
+
+  SYNOPSIS
+    find_tail()
+    info                Maria handler
+    length              Size of the tail data; DIR_ENTRY_SIZE is added here
+    position            Index in info->bitmap_blocks where the tail block
+                        information is stored (allocated here if needed)
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool find_tail(MARIA_HA *info, uint length, uint position)
+{
+  MARIA_FILE_BITMAP *active_bitmap= &info->s->bitmap;
+  MARIA_BITMAP_BLOCK *tail_block;
+  uint wanted_size= length + DIR_ENTRY_SIZE;    /* Tail + directory entry */
+  DBUG_ENTER("find_tail");
+  DBUG_ASSERT(length <= info->s->block_size - PAGE_OVERHEAD_SIZE);
+
+  /* dynamic_element() does no bounds check; make sure the entry exists */
+  if (allocate_dynamic(&info->bitmap_blocks, position))
+    DBUG_RETURN(1);
+  tail_block= dynamic_element(&info->bitmap_blocks, position,
+                              MARIA_BITMAP_BLOCK *);
+  /* Try the active bitmap, then move on until some bitmap has room */
+  while (allocate_tail(active_bitmap, wanted_size, tail_block))
+  {
+    if (move_to_next_bitmap(info, active_bitmap))
+      DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Reserve one unsplit extent of full pages in a bitmap that has room
+
+  SYNOPSIS
+    find_mid()
+    info                Maria handler.
+    pages               How many pages to allocate.
+    position            Index in info->bitmap_blocks where the extent
+                        information is stored.
+  NOTES
+    Allocates the main extent that follows the 'head' block, ie the
+    middle part of a head-middle-tail row.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool find_mid(MARIA_HA *info, ulong pages, uint position)
+{
+  MARIA_FILE_BITMAP *active_bitmap= &info->s->bitmap;
+  MARIA_BITMAP_BLOCK *extent=
+    dynamic_element(&info->bitmap_blocks, position, MARIA_BITMAP_BLOCK *);
+  /* 0 pages allocated means that the active bitmap has no fitting area */
+  while (allocate_full_pages(active_bitmap, pages, extent, 1) == 0)
+  {
+    if (move_to_next_bitmap(info, active_bitmap))
+      return 1;
+  }
+  return 0;
+}
+
+
+/*
+  Find right bitmap and position for putting a blob
+
+  SYNOPSIS
+    find_blob()
+    info                Maria handler.
+    length              Length of the blob in bytes
+
+  NOTES
+    The extents are appended last in info->bitmap_blocks; the first one
+    gets sub_blocks set to the number of entries added for this blob.
+  IMPLEMENTATION
+    Allocate full pages, in one or more extents, for the whole pages of
+    the blob + optionally one tail for what is left.
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool find_blob(MARIA_HA *info, ulong length)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  uint full_page_size= FULL_PAGE_SIZE(info->s->block_size);
+  ulong pages;
+  uint rest_length, used;
+  uint first_block_pos;
+  MARIA_BITMAP_BLOCK *first_block= 0;
+  DBUG_ENTER("find_blob");
+  DBUG_PRINT("enter", ("length: %lu", length));
+  LINT_INIT(first_block_pos);
+
+  pages= length / full_page_size;
+  rest_length= (uint) (length - pages * full_page_size);
+  if (rest_length >= MAX_TAIL_SIZE(info->s->block_size))
+  {
+    pages++;                                    /* Too big for a tail page */
+    rest_length= 0;
+  }
+
+  first_block_pos= info->bitmap_blocks.elements;        /* First new entry */
+  if (pages)
+  {
+    MARIA_BITMAP_BLOCK *block;
+    if (allocate_dynamic(&info->bitmap_blocks,
+                         info->bitmap_blocks.elements +
+                         pages / BLOB_SEGMENT_MIN_SIZE + 2))
+      DBUG_RETURN(1);
+    block= dynamic_element(&info->bitmap_blocks, info->bitmap_blocks.elements,
+                           MARIA_BITMAP_BLOCK*);
+    do
+    {
+      /*
+        We use 0x3fff here as the two upmost bits are reserved for
+        TAIL_BIT and START_EXTENT_BIT
+      */
+      used= allocate_full_pages(bitmap,
+                                (pages >= 0x3fff ? 0x3fff : (uint) pages),
+                                block, 0);
+      if (!used)                                /* No room in this bitmap */
+      {
+        if (move_to_next_bitmap(info, bitmap))
+          DBUG_RETURN(1);
+      }
+      else
+      {
+        pages-= used;                           /* One more extent is used */
+        info->bitmap_blocks.elements++;
+        block++;
+      }
+    } while (pages != 0);
+  }
+  if (rest_length && find_tail(info, rest_length,
+                               info->bitmap_blocks.elements++))
+    DBUG_RETURN(1);
+  first_block= dynamic_element(&info->bitmap_blocks, first_block_pos,
+                               MARIA_BITMAP_BLOCK*);
+  first_block->sub_blocks= info->bitmap_blocks.elements - first_block_pos;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Reserve pages for every non-empty blob of a row
+
+  SYNOPSIS
+  allocate_blobs()
+  info          Maria handler
+  row           Row information; blob_lengths is read, extents_count is set
+
+  RETURN
+   0    ok     (row->extents_count= number of blocks added by find_blob())
+   1    error  (row->extents_count is not updated)
+*/
+
+static my_bool allocate_blobs(MARIA_HA *info, MARIA_ROW *row)
+{
+  ulong *blob_length= row->blob_lengths;
+  ulong *blob_end= blob_length + info->s->base.blobs;
+  /*
+    Blob extents are appended after whatever is already in bitmap_blocks;
+    remember the starting count so we can tell how many were added.
+  */
+  uint elements_at_start= info->bitmap_blocks.elements;
+
+  while (blob_length < blob_end)
+  {
+    ulong wanted= *blob_length++;
+    if (wanted && find_blob(info, wanted))
+      return 1;
+  }
+  /* Everything find_blob() added is counted as an extent of this row */
+  row->extents_count= (info->bitmap_blocks.elements - elements_at_start);
+  return 0;
+}
+
+
+/*
+  Mark an existing head page as full in the bitmap and describe it in a block
+
+  SYNOPSIS
+    use_head()
+    info                Maria handler
+    page                Page number to update (the caller guarantees that
+                        it is covered by the active bitmap)
+    size                Stored as block->empty_space
+    block_position      Index in info->bitmap_blocks of the entry to fill in
+
+  NOTES
+    Used on update, where the row keeps its existing head page. The old
+    bitmap pattern of the page is saved in block->org_bitmap_value and
+    the page is marked FULL_HEAD_PAGE.
+*/
+
+static void use_head(MARIA_HA *info, pgcache_page_no_t page, uint size,
+                     uint block_position)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  MARIA_BITMAP_BLOCK *head;
+  uchar *map_pos;
+  uint first_bit, shift, two_bytes, old_pattern;
+  DBUG_ENTER("use_head");
+
+  /* A bitmap page itself can never be a head page */
+  DBUG_ASSERT(page % bitmap->pages_covered);
+
+  /* Work on 2 bytes at a time as a 3 bit pattern can span two bytes */
+  first_bit= (uint) (page - bitmap->page - 1) * 3;
+  shift= first_bit & 7;
+  map_pos= bitmap->map + first_bit / 8;
+  two_bytes= uint2korr(map_pos);
+  old_pattern= (two_bytes >> shift) & 7;
+  two_bytes= (two_bytes & ~(7 << shift)) | (FULL_HEAD_PAGE << shift);
+  int2store(map_pos, two_bytes);
+  bitmap->changed= 1;
+
+  /* Describe the head page in the caller's block entry */
+  head= dynamic_element(&info->bitmap_blocks, block_position,
+                        MARIA_BITMAP_BLOCK*);
+  head->page= page;
+  head->page_count= 1 + TAIL_BIT;
+  head->empty_space= size;
+  head->used= BLOCKUSED_TAIL;
+  head->org_bitmap_value= old_pattern;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Decide how much of the row is stored on the head page
+
+  SYNOPSIS
+    find_where_to_split_row()
+    share           Maria share
+    row             Information of what is in the row (from calc_record_size())
+                    row->null_field_lengths[-3..-1] are filled in here.
+    extents_length  Number of bytes needed to store all extents
+    split_size      Free size on the page (the head must fit within this)
+
+  RETURN
+    Length of the part of the row to store in the head block.
+*/
+
+static uint find_where_to_split_row(MARIA_SHARE *share, MARIA_ROW *row,
+                                    uint extents_length, uint split_size)
+{
+  uint *part;
+  uint *parts_end;
+  /*
+    The head page must at least have room for:
+    - Header + length of field lengths (row->min_length)
+    - The stored number of extents
+    - One extent
+  */
+  uint head_size= row->min_length;
+  head_size+= size_to_store_key_length(extents_length);
+  head_size+= ROW_EXTENT_SIZE;
+  DBUG_ASSERT(head_size < split_size);
+  /*
+    The extra parts that are written before the field data are stored in
+    front of null_field_lengths, in the same order as they are written by
+    ma_block_rec.c::write_block_record()
+  */
+  row->null_field_lengths[-3]= extents_length;
+  row->null_field_lengths[-2]= share->base.fixed_not_null_fields_length;
+  row->null_field_lengths[-1]= row->field_lengths_length;
+
+  part= row->null_field_lengths - EXTRA_LENGTH_FIELDS;
+  parts_end= (part + share->base.pack_fields - share->base.blobs +
+              EXTRA_LENGTH_FIELDS);
+  /* Add whole parts for as long as the head still fits in split_size */
+  while (part < parts_end && head_size + *part <= split_size)
+    head_size+= *part++;
+  return head_size;
+}
+
+
+/*
+  Reserve the pages for the part of the main row that follows the head
+
+  SYNOPSIS
+    write_rest_of_head()
+    info        Maria handler
+    position    Position in bitmap_blocks. Is 0 for rows that need
+                full blocks (ie, has a head, middle part and optional tail)
+    rest_length How much is left to write after the head block.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool write_rest_of_head(MARIA_HA *info, uint position,
+                                  ulong rest_length)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_BITMAP_BLOCK *entry;
+  DBUG_ENTER("write_rest_of_head");
+  DBUG_PRINT("enter", ("position: %u  rest_length: %lu", position,
+                       rest_length));
+
+  if (position == 0)
+  {
+    /* The row has a middle part; it goes into one extent of full pages */
+    uint full_page_size= FULL_PAGE_SIZE(share->block_size);
+    uint full_pages= rest_length / full_page_size;
+
+    rest_length%= full_page_size;
+    if (rest_length >= MAX_TAIL_SIZE(share->block_size))
+    {
+      /* Remainder is too big for a tail page; give it a full page */
+      full_pages++;
+      rest_length= 0;
+    }
+    if (find_mid(info, full_pages, 1))
+      DBUG_RETURN(1);
+    /*
+      Entry 2 is left empty after the extent so that write_block_record()
+      can split the extent into a used and a free part
+    */
+    entry= dynamic_element(&info->bitmap_blocks, 2, MARIA_BITMAP_BLOCK*);
+    entry->used= 0;
+    entry->page_count= 0;
+  }
+
+  if (!rest_length)
+  {
+    /* Nothing left for a tail: mark the tail entry as empty */
+    entry= dynamic_element(&info->bitmap_blocks,
+                           ELEMENTS_RESERVED_FOR_MAIN_PART - 1,
+                           MARIA_BITMAP_BLOCK *);
+    entry->used= 0;
+    entry->page_count= 0;
+    DBUG_RETURN(0);
+  }
+  /* What is left goes to a tail page */
+  if (find_tail(info, rest_length, ELEMENTS_RESERVED_FOR_MAIN_PART - 1))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Find where to store one row (head, full pages, tail and blobs)
+
+  SYNOPSIS
+    _ma_bitmap_find_place()
+    info                  Maria handler
+    row                   Information about row to write
+    blocks                Store data about allocated places here
+
+  RETURN
+    0  ok
+       row->space_on_head_page contains minimum number of bytes we
+       expect to put on the head page.
+    1  error
+       my_errno is set to error
+*/
+
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+                              MARIA_BITMAP_BLOCKS *blocks)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool res= 1;                               /* Error until proven ok */
+  uint full_page_size, position, max_page_size;
+  uint head_length, row_length, rest_length, extents_length;
+  DBUG_ENTER("_ma_bitmap_find_place");
+
+  blocks->count= 0;
+  blocks->tail_page_skipped= blocks->page_skipped= 0;
+  row->extents_count= 0;
+
+  /*
+    Reserve place for the following blocks:
+     - Head block
+     - Full page block
+     - Marker block to allow write_block_record() to split full page blocks
+       into full and free part
+     - Tail block
+  */
+
+  info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+  max_page_size= (share->block_size - PAGE_OVERHEAD_SIZE);
+
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);       /* Held to return */
+
+  if (row->total_length <= max_page_size)
+  {
+    /* Row fits in one page; only the last reserved entry is used */
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+    if (find_head(info, (uint) row->total_length, position))
+      goto abort;
+    row->space_on_head_page= row->total_length;
+    goto end;
+  }
+
+  /*
+    First allocate all blobs so that we can find out the needed size for
+    the main block (allocate_blobs() sets row->extents_count).
+  */
+  if (row->blob_length && allocate_blobs(info, row))
+    goto abort;
+
+  extents_length= row->extents_count * ROW_EXTENT_SIZE;
+  /*
+    The + 3 is reserved for storing the number of segments in the row header.
+  */
+  if ((head_length= (row->head_length + extents_length + 3)) <=
+      max_page_size)
+  {
+    /* Main row part fits into one page */
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+    if (find_head(info, head_length, position))
+      goto abort;
+    row->space_on_head_page= head_length;
+    goto end;
+  }
+
+  /* Add room for the extents of the main part of the row */
+  head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+  /* The first segment size is stored in 'row_length' */
+  row_length= find_where_to_split_row(share, row, extents_length,
+                                      max_page_size);
+
+  full_page_size= MAX_TAIL_SIZE(share->block_size);     /* Max tail size */
+  position= 0;                                  /* Head, full pages, tail */
+  if (head_length - row_length <= full_page_size)
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART -2;    /* Only head and tail */
+  if (find_head(info, row_length, position))
+    goto abort;
+  row->space_on_head_page= row_length;
+
+  rest_length= head_length - row_length;
+  if (write_rest_of_head(info, position, rest_length))
+    goto abort;
+
+end:
+  blocks->block= dynamic_element(&info->bitmap_blocks, position,
+                                 MARIA_BITMAP_BLOCK*);
+  blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+  /* Number of entries in use, counted from the first (head) block */
+  blocks->count= info->bitmap_blocks.elements - position;
+  res= 0;
+
+abort:
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Find where to put row on update (when head page is already defined)
+
+  SYNOPSIS
+    _ma_bitmap_find_new_place()
+    info                  Maria handler
+    row                   Information about row to write
+    page                  On which page original row was stored
+    free_size             Free size on head page
+    blocks                Store data about allocated places here
+
+  NOTES
+   This function is only called when the new row can't fit in the space of
+   the old row in the head page.
+
+   This is essentially the same as _ma_bitmap_find_place() except that we
+   call use_head() on 'page' instead of searching with find_head().
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *row,
+                                  pgcache_page_no_t page, uint free_size,
+                                  MARIA_BITMAP_BLOCKS *blocks)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool res= 1;                               /* Error until proven ok */
+  uint position;
+  uint head_length, row_length, rest_length, extents_length;
+  ulonglong bitmap_page;
+  DBUG_ENTER("_ma_bitmap_find_new_place");
+
+  blocks->count= 0;
+  blocks->tail_page_skipped= blocks->page_skipped= 0;
+  row->extents_count= 0;
+  info->bitmap_blocks.elements= ELEMENTS_RESERVED_FOR_MAIN_PART;
+
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);       /* Held to return */
+
+  /*
+    First allocate all blobs so that we can find out the needed size for
+    the main block (allocate_blobs() sets row->extents_count).
+  */
+  if (row->blob_length && allocate_blobs(info, row))
+    goto abort;
+
+  /* Switch to the bitmap that covers the current head page */
+  bitmap_page= page / share->bitmap.pages_covered;
+  bitmap_page*= share->bitmap.pages_covered;
+
+  if (share->bitmap.page != bitmap_page &&
+      _ma_change_bitmap_page(info, &share->bitmap, bitmap_page))
+    goto abort;
+
+  extents_length= row->extents_count * ROW_EXTENT_SIZE;
+  if ((head_length= (row->head_length + extents_length + 3)) <= free_size)
+  {
+    /* Main row part fits into one page */
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART - 1;
+    use_head(info, page, head_length, position);
+    row->space_on_head_page= head_length;
+    goto end;
+  }
+
+  /* Add room for the extents of the main part of the row */
+  head_length+= ELEMENTS_RESERVED_FOR_MAIN_PART * ROW_EXTENT_SIZE;
+
+  /* The first segment size is stored in 'row_length' */
+  row_length= find_where_to_split_row(share, row, extents_length, free_size);
+
+  position= 0;                  /* NOTE(review): find_place tests <= below */
+  if (head_length - row_length < MAX_TAIL_SIZE(share->block_size))
+    position= ELEMENTS_RESERVED_FOR_MAIN_PART -2;    /* Only head and tail */
+  use_head(info, page, row_length, position);
+  row->space_on_head_page= row_length;
+
+  rest_length= head_length - row_length;
+  if (write_rest_of_head(info, position, rest_length))
+    goto abort;
+
+end:
+  blocks->block= dynamic_element(&info->bitmap_blocks, position,
+                                 MARIA_BITMAP_BLOCK*);
+  blocks->block->sub_blocks= ELEMENTS_RESERVED_FOR_MAIN_PART - position;
+  /* Number of entries in use, counted from the first (head) block */
+  blocks->count= info->bitmap_blocks.elements - position;
+  res= 0;
+
+abort:
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/****************************************************************************
+  Clear and reset bits
+****************************************************************************/
+
+/*
+  Set fill pattern for a page
+
+  set_page_bits()
+  info          Maria handler
+  bitmap        Bitmap handler
+  page          Page number
+  fill_pattern  Pattern (not size) for page, 0-7
+
+  NOTES
+    Page need not be covered by the active bitmap; the bitmap that covers
+    it is made active first. Nothing is marked changed if the bits are same.
+  RETURN
+    0  ok
+    1  error (could not change to the bitmap that covers the page)
+*/
+
+static my_bool set_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+                             pgcache_page_no_t page, uint fill_pattern)
+{
+  pgcache_page_no_t bitmap_page;
+  uint offset_page, offset, tmp, org_tmp;
+  uchar *data;
+  DBUG_ENTER("set_page_bits");
+  DBUG_ASSERT(fill_pattern <= 7);
+
+  bitmap_page= page - page % bitmap->pages_covered;
+  if (bitmap_page != bitmap->page &&
+      _ma_change_bitmap_page(info, bitmap, bitmap_page))
+    DBUG_RETURN(1);
+
+  /* Find page number from start of bitmap */
+  offset_page= (uint) (page - bitmap->page - 1);
+  /*
+    Mark place used by reading/writing 2 bytes at a time to handle
+    bitmaps in overlapping bytes
+  */
+  offset_page*= 3;
+  offset= offset_page & 7;
+  data= bitmap->map + offset_page / 8;
+  org_tmp= tmp= uint2korr(data);
+  tmp= (tmp & ~(7 << offset)) | (fill_pattern << offset);
+  if (tmp == org_tmp)
+    DBUG_RETURN(0);                             /* No changes */
+  int2store(data, tmp);
+
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  if (fill_pattern != FULL_HEAD_PAGE && fill_pattern != FULL_TAIL_PAGE)
+    set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+  /*
+    The test is against the two 'full' patterns, so that a head page that
+    still has some free space also counts as space in this bitmap.
+    If the condition above is false (page is full) and this bitmap was
+    first_bitmap_with_space, we don't modify first_bitmap_with_space even
+    if all pages of the bitmap are now full; its value still tells us
+    where to start the search for a bitmap with space.
+    That does mean that first_bitmap_with_space is only a lower bound.
+  */
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Get bitmap pattern for a given page
+
+  SYNOPSIS
+    _ma_bitmap_get_page_bits()
+    info        Maria handler
+    bitmap      Bitmap handler
+    page        Page number
+
+  RETURN
+    0-7         Bitmap pattern
+    ~0          Error (couldn't change to the bitmap that covers the page)
+*/
+
+uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+                              pgcache_page_no_t page)
+{
+  pgcache_page_no_t covering_bitmap= page - page % bitmap->pages_covered;
+  uint first_bit, two_bytes;
+  uchar *map_pos;
+  DBUG_ENTER("_ma_bitmap_get_page_bits");
+
+  /* Make the bitmap that covers 'page' the active one, if it isn't */
+  if (covering_bitmap != bitmap->page)
+  {
+    if (_ma_change_bitmap_page(info, bitmap, covering_bitmap))
+      DBUG_RETURN(~ (uint) 0);
+  }
+
+  /* Each page after the bitmap page has 3 bits in the map */
+  first_bit= (uint) (page - bitmap->page - 1);
+  first_bit*= 3;
+  /*
+    Read 2 bytes, as the pattern may continue into the following byte
+  */
+  map_pos= bitmap->map + first_bit / 8;
+  two_bytes= uint2korr(map_pos);
+  DBUG_RETURN((two_bytes >> (first_bit & 7)) & 7);
+}
+
+
+/*
+  Mark all pages in a region as free
+
+  SYNOPSIS
+    _ma_bitmap_reset_full_page_bits()
+    info                Maria handler
+    bitmap              Bitmap handler
+    page                Start page
+    page_count          Number of pages
+
+  NOTES
+    We assume that all pages in the region are covered by the same bitmap
+    One must have a lock on info->s->bitmap.bitmap_lock
+
+  RETURN
+    0  ok
+    1  Error (when reading bitmap)
+*/
+
+my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info,
+                                        MARIA_FILE_BITMAP *bitmap,
+                                        pgcache_page_no_t page,
+                                        uint page_count)
+{
+  ulonglong bitmap_page;
+  uint offset, bit_start, bit_count, tmp;
+  uchar *data;
+  DBUG_ENTER("_ma_bitmap_reset_full_page_bits");
+  DBUG_PRINT("enter", ("page: %lu  page_count: %u", (ulong) page, page_count));
+  safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock);
+
+  bitmap_page= page - page % bitmap->pages_covered;
+  DBUG_ASSERT(page != bitmap_page);             /* Not the bitmap itself */
+
+  if (bitmap_page != bitmap->page &&
+      _ma_change_bitmap_page(info, bitmap, bitmap_page))
+    DBUG_RETURN(1);
+
+  /* Find page number from start of bitmap */
+  offset= (uint) (page - bitmap->page - 1);
+
+  /* Clear 'page_count * 3' bits, starting from bit 'offset * 3' in the map */
+  bit_start= offset * 3;
+  bit_count= page_count * 3;
+
+  data= bitmap->map + bit_start / 8;
+  offset= bit_start & 7;
+
+  tmp= (255 << offset);                         /* Bits to clear in byte */
+  if (bit_count + offset < 8)
+  {
+    /* Only clear bits between 'offset' and 'offset+bit_count-1' */
+    tmp^= (255 << (offset + bit_count));
+  }
+  *data&= ~tmp;
+
+  if ((int) (bit_count-= (8 - offset)) > 0)     /* More than first byte */
+  {
+    uint fill;
+    data++;
+    /*
+      -1 is here so that 1 to 8 bits are always left for the last byte;
+      this avoids one 'if' statement in the code for the last byte below
+    */
+    if ((fill= (bit_count - 1) / 8))
+    {
+      bzero(data, fill);
+      data+= fill;
+    }
+    bit_count-= fill * 8;                       /* Bits left to clear */
+    tmp= (1 << bit_count) - 1;
+    *data&= ~tmp;
+  }
+  set_if_smaller(info->s->state.first_bitmap_with_space, bitmap_page);
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  DBUG_RETURN(0);
+}
+
+/*
+  Set all pages in a region as used
+
+  SYNOPSIS
+    _ma_bitmap_set_full_page_bits()
+    info                Maria handler
+    bitmap              Bitmap handler
+    page                Start page
+    page_count          Number of pages
+
+  NOTES
+    All pages in the region must be covered by the same bitmap
+    One must have a lock on info->s->bitmap.bitmap_lock
+
+  RETURN
+    0  ok
+    1  Error (invalid page range or error when reading bitmap)
+*/
+
+my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
+                                      MARIA_FILE_BITMAP *bitmap,
+                                      pgcache_page_no_t page, uint page_count)
+{
+  ulonglong bitmap_page;
+  uint offset, bit_start, bit_count, tmp;
+  uchar *data;
+  DBUG_ENTER("_ma_bitmap_set_full_page_bits");
+  DBUG_PRINT("enter", ("page: %lu  page_count: %u", (ulong) page, page_count));
+  safe_mutex_assert_owner(&info->s->bitmap.bitmap_lock);
+
+  bitmap_page= page - page % bitmap->pages_covered;
+  if (page == bitmap_page ||
+      page + page_count >= bitmap_page + bitmap->pages_covered)
+  {
+    DBUG_ASSERT(0);                             /* Invalid page range */
+    DBUG_RETURN(1);
+  }
+
+  if (bitmap_page != bitmap->page &&
+      _ma_change_bitmap_page(info, bitmap, bitmap_page))
+    DBUG_RETURN(1);
+
+  /* 0-based index of 'page' among the pages following the bitmap page */
+  offset= (uint) (page - bitmap->page - 1);
+
+  /* 3 bits per page: set 'page_count * 3' bits from bit 'offset * 3' */
+  bit_start= offset * 3;
+  bit_count= page_count * 3;
+
+  data= bitmap->map + bit_start / 8;
+  offset= bit_start & 7;
+
+  tmp= (255 << offset);                         /* Bits to set */
+  if (bit_count + offset < 8)
+  {
+    /* Only set bits between 'offset' and 'offset+bit_count-1' */
+    tmp^= (255 << (offset + bit_count));
+  }
+  *data|= tmp;
+
+  if ((int) (bit_count-= (8 - offset)) > 0)     /* Bits after first byte */
+  {
+    uint fill;
+    data++;
+    /*
+      -1 is here to avoid one 'if' statement and to let the following code
+      handle the last byte
+    */
+    if ((fill= (bit_count - 1) / 8))
+    {
+      bfill(data, fill, 255);
+      data+= fill;
+    }
+    bit_count-= fill * 8;                       /* Bits left to set */
+    tmp= (1 << bit_count) - 1;                  /* Low bits of last byte */
+    *data|= tmp;
+  }
+  bitmap->changed= 1;
+  DBUG_EXECUTE("bitmap", _ma_print_bitmap_changes(bitmap););
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief
+   Make a transition of MARIA_FILE_BITMAP::non_flushable.
+   If the bitmap becomes flushable, which requires that REDO-UNDO has been
+   logged and all bitmap pages touched by the thread have a correct
+   allocation, it unpins all bitmap pages, and if _ma_bitmap_flush_all() is
+   waiting (in practice it is a checkpoint), it wakes it up.
+   If the bitmap becomes or stays unflushable, the function merely records it
+   unless a concurrent _ma_bitmap_flush_all() is happening, in which case the
+   function first waits for the flush to be done.
+
+   @note
+   this sets info->non_flushable_state to 1 if we have incremented
+   bitmap->non_flushable and not yet decremented it.
+
+   @param  info                Maria handler
+   @param  non_flushable_inc   Increment of MARIA_FILE_BITMAP::non_flushable
+                               (-1 or +1).
+*/
+
+void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_FILE_BITMAP *bitmap;
+  DBUG_ENTER("_ma_bitmap_flushable");
+
+  /*
+    Non-transactional tables are never automatically flushed and need no
+    protection
+  */
+  if (!share->now_transactional)
+    DBUG_VOID_RETURN;
+
+  bitmap= &share->bitmap;
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+
+  if (non_flushable_inc == -1)
+  {
+    DBUG_ASSERT((int) bitmap->non_flushable > 0);
+    DBUG_ASSERT(info->non_flushable_state == 1);
+    if (--bitmap->non_flushable == 0)
+    {
+      /*
+        We unlock and unpin pages locked and pinned by other threads. It does
+        not seem to be an issue as all bitmap changes are serialized with
+        the bitmap's mutex.
+      */
+      _ma_bitmap_unpin_all(share);
+      if (unlikely(bitmap->flush_all_requested))
+      {
+        DBUG_PRINT("info", ("bitmap flushable waking up flusher"));
+        pthread_cond_broadcast(&bitmap->bitmap_cond);
+      }
+    }
+    DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+    pthread_mutex_unlock(&bitmap->bitmap_lock);
+    info->non_flushable_state= 0;
+    DBUG_VOID_RETURN;
+  }
+  DBUG_ASSERT(non_flushable_inc == 1);
+  DBUG_ASSERT(info->non_flushable_state == 0);
+  while (unlikely(bitmap->flush_all_requested))
+  {
+    /*
+      Some other thread is waiting for the bitmap to become
+      flushable. Not the moment to make the bitmap unflushable or more
+      unflushable; let's rather back off and wait. If we didn't do this, with
+      multiple writers, there may always be one thread causing the bitmap to
+      be unflushable and _ma_bitmap_flush_all() would wait for long.
+      There should not be a deadlock because if our thread increased
+      non_flushable (and thus _ma_bitmap_flush_all() is waiting for at least
+      our thread), it is not going to increase it more so is not going to come
+      here.
+    */
+    DBUG_PRINT("info", ("waiting for bitmap flusher"));
+    pthread_cond_wait(&bitmap->bitmap_cond, &bitmap->bitmap_lock);
+  }
+  bitmap->non_flushable++;
+  DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  info->non_flushable_state= 1;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Correct bitmap pages to reflect the true allocation
+
+  SYNOPSIS
+    _ma_bitmap_release_unused()
+    info                Maria handle
+    blocks              Bitmap blocks
+
+  IMPLEMENTATION
+    If block->used & BLOCKUSED_TAIL is set:
+       If block->used & BLOCKUSED_USED is set, then the bits for the
+       corresponding page are set according to block->empty_space
+       If block->used & BLOCKUSED_USED is not set, then the bits for
+       the corresponding page are set to org_bitmap_value;
+
+    If block->used & BLOCKUSED_TAIL is not set:
+       if block->used & BLOCKUSED_USED is not set, the bits for the
+       corresponding pages are cleared
+
+  For the first block (head block) the logic is same as for a tail block
+
+  Note that we may have 'filler blocks' that are used to split a block
+  in half; These can be recognized by that they have page_count == 0.
+
+  On success this also reverses the effect of _ma_bitmap_flushable(.., 1);
+
+  RETURN
+    0  ok
+    1  error (Couldn't write or read bitmap page)
+*/
+
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks)
+{
+  MARIA_BITMAP_BLOCK *block= blocks->block, *end= block + blocks->count;
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  uint bits, current_bitmap_value;
+  DBUG_ENTER("_ma_bitmap_release_unused");
+
+  /*
+    We can skip FULL_HEAD_PAGE (4) as the page was marked as 'full'
+    when we allocated space in the page
+  */
+  current_bitmap_value= FULL_HEAD_PAGE;
+
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+
+  /* First handle head block */
+  if (block->used & BLOCKUSED_USED)
+  {
+    DBUG_PRINT("info", ("head page: %lu  empty_space: %u",
+                        (ulong) block->page, block->empty_space));
+    bits= _ma_free_size_to_head_pattern(bitmap, block->empty_space);
+    if (block->used & BLOCKUSED_USE_ORG_BITMAP)
+      current_bitmap_value= block->org_bitmap_value;
+  }
+  else
+    bits= block->org_bitmap_value;
+  if (bits != current_bitmap_value)
+  {
+    if (set_page_bits(info, bitmap, block->page, bits))
+      goto err;
+  }
+  else
+  {
+    DBUG_ASSERT(current_bitmap_value ==
+                _ma_bitmap_get_page_bits(info, bitmap, block->page));
+  }
+
+  /* Handle all full pages and tail pages (for head page and blob) */
+  for (block++; block < end; block++)
+  {
+    uint page_count;
+    if (!block->page_count)
+      continue;                               /* Skip 'filler blocks' */
+
+    page_count= block->page_count;
+    if (block->used & BLOCKUSED_TAIL)
+    {
+      current_bitmap_value= FULL_TAIL_PAGE;
+      /* A tail block is handled as a single page */
+      page_count= 1;
+      if (block->used & BLOCKUSED_USED)
+      {
+        DBUG_PRINT("info", ("tail page: %lu  empty_space: %u",
+                            (ulong) block->page, block->empty_space));
+        bits= free_size_to_tail_pattern(bitmap, block->empty_space);
+        if (block->used & BLOCKUSED_USE_ORG_BITMAP)
+          current_bitmap_value= block->org_bitmap_value;
+      }
+      else
+        bits= block->org_bitmap_value;
+
+      /*
+        The page has all bits set; The following test is an optimization
+        to not set the bits to the same value as before.
+      */
+      if (bits != current_bitmap_value)
+      {
+        if (set_page_bits(info, bitmap, block->page, bits))
+          goto err;
+      }
+      else
+      {
+        DBUG_ASSERT(current_bitmap_value ==
+                    _ma_bitmap_get_page_bits(info, bitmap, block->page));
+      }
+    }
+    else if (!(block->used & BLOCKUSED_USED) &&
+             _ma_bitmap_reset_full_page_bits(info, bitmap,
+                                             block->page, page_count))
+      goto err;
+  }
+
+  /* Same as _ma_bitmap_flushable(info, -1), but we already hold the mutex */
+  if (info->non_flushable_state)
+  {
+    DBUG_ASSERT(((int) (bitmap->non_flushable)) > 0);
+    info->non_flushable_state= 0;
+    if (--bitmap->non_flushable == 0)
+    {
+      _ma_bitmap_unpin_all(info->s);
+      if (unlikely(bitmap->flush_all_requested))
+      {
+        DBUG_PRINT("info", ("bitmap flushable waking up flusher"));
+        pthread_cond_broadcast(&bitmap->bitmap_cond);
+      }
+    }
+  }
+  DBUG_PRINT("info", ("bitmap->non_flushable: %u", bitmap->non_flushable));
+
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(0);
+
+err:                            /* non_flushable is left untouched here */
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Free full pages from bitmap and pagecache
+
+  SYNOPSIS
+    _ma_bitmap_free_full_pages()
+    info                Maria handle
+    extents             Extents (as stored on disk)
+    count               Number of extents
+
+  IMPLEMENTATION
+    Mark all full pages (not tails) from extents as free, both in bitmap
+    and page cache.
+
+  RETURN
+    0  ok
+    1  error (page cache delete failed, or couldn't read bitmap page)
+*/
+
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+                                   uint count)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  DBUG_ENTER("_ma_bitmap_free_full_pages");
+
+  pthread_mutex_lock(&bitmap->bitmap_lock);
+  for (; count--; extents+= ROW_EXTENT_SIZE)
+  {
+    pgcache_page_no_t page=  uint5korr(extents);
+    uint page_count= (uint2korr(extents + ROW_EXTENT_PAGE_SIZE) &
+                      ~START_EXTENT_BIT);
+    if (!(page_count & TAIL_BIT))               /* Tail extents are skipped */
+    {
+      if (page == 0 && page_count == 0)
+        continue;                               /* Not used extent */
+      if (pagecache_delete_pages(info->s->pagecache, &info->dfile, page,
+                                 page_count, PAGECACHE_LOCK_WRITE, 1) ||
+          _ma_bitmap_reset_full_page_bits(info, bitmap, page, page_count))
+      {
+        pthread_mutex_unlock(&bitmap->bitmap_lock);
+        DBUG_RETURN(1);
+      }
+    }
+  }
+  pthread_mutex_unlock(&bitmap->bitmap_lock);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Mark in the bitmap how much free space there is on a page
+
+  SYNOPSIS
+   _ma_bitmap_set()
+   info         Maria handler
+   page         Page number of the page
+   head         1 if page is a head page, 0 if tail page
+   empty_space  How much empty space there is on page
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t page, my_bool head,
+                       uint empty_space)
+{
+  MARIA_FILE_BITMAP *bitmap= &info->s->bitmap;
+  uint bits;
+  my_bool res;
+  DBUG_ENTER("_ma_bitmap_set");
+  DBUG_PRINT("enter", ("page: %lu  head: %d  empty_space: %u",
+                       (ulong) page, head, empty_space));
+
+  pthread_mutex_lock(&info->s->bitmap.bitmap_lock);
+  bits= (head ?                                 /* Free space -> bit pattern */
+         _ma_free_size_to_head_pattern(bitmap, empty_space) :
+         free_size_to_tail_pattern(bitmap, empty_space));
+  res= set_page_bits(info, bitmap, page, bits);
+  pthread_mutex_unlock(&info->s->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Check that bitmap pattern is correct for a page
+
+  NOTES
+    Used in maria_chk
+
+  SYNOPSIS
+    _ma_check_bitmap_data()
+    info            Maria handler
+    page_type       What kind of page this is
+    page            Page number of the page
+    empty_space     Empty space on page
+    bitmap_pattern  Store here the pattern that was in the bitmap for the
+                    page. This is always updated.
+
+  RETURN
+    0  ok (pattern in bitmap is the one expected for the page)
+    1  error (pattern in bitmap differs from the expected one)
+*/
+
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+                              enum en_page_type page_type, pgcache_page_no_t page,
+                              uint empty_space, uint *bitmap_pattern)
+{
+  uint bits;
+  switch (page_type) {
+  case UNALLOCATED_PAGE:
+  case MAX_PAGE_TYPE:
+    bits= 0;
+    break;
+  case HEAD_PAGE:
+    bits= _ma_free_size_to_head_pattern(&info->s->bitmap, empty_space);
+    break;
+  case TAIL_PAGE:
+    bits= free_size_to_tail_pattern(&info->s->bitmap, empty_space);
+    break;
+  case BLOB_PAGE:
+    bits= FULL_TAIL_PAGE;
+    break;
+  default:
+    bits= 0; /* to satisfy compiler */
+    DBUG_ASSERT(0);
+  }
+  return ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap,
+                                                     page)) != bits);
+}
+
+
+/*
+  Check if the page type matches the one that we have in the bitmap
+
+  SYNOPSIS
+    _ma_check_if_right_bitmap_type()
+    info            Maria handler
+    page_type       What kind of page this is
+    page            Page number of the page
+    bitmap_pattern  Store here the pattern that was in the bitmap for the
+                    page. This is always updated.
+
+  NOTES
+    Used in maria_chk
+
+  RETURN
+    0  ok
+    1  error (pattern does not fit page_type, or bitmap could not be read)
+*/
+
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+                                       enum en_page_type page_type,
+                                       pgcache_page_no_t page,
+                                       uint *bitmap_pattern)
+{
+  if ((*bitmap_pattern= _ma_bitmap_get_page_bits(info, &info->s->bitmap,
+                                                 page)) > 7)
+    return 1;                                   /* Couldn't read page */
+  switch (page_type) {
+  case HEAD_PAGE:
+    return *bitmap_pattern < 1 || *bitmap_pattern > 4;  /* Head: 1..4 */
+  case TAIL_PAGE:
+    return *bitmap_pattern < 5;                         /* Tail: 5..7 */
+  case BLOB_PAGE:
+    return *bitmap_pattern != 7;                        /* Blob: 7 */
+  default:
+    break;
+  }
+  DBUG_ASSERT(0);                               /* Unexpected page_type */
+  return 1;
+}
+
+
+/**
+   @brief Create the first bitmap page of a freshly created data file
+
+   @param  share           table's share
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error (could not resize the file or write the marker)
+*/
+
+int _ma_bitmap_create_first(MARIA_SHARE *share)
+{
+  uint block_size= share->bitmap.block_size;
+  File file= share->bitmap.file.file;
+  uchar marker[CRC_SIZE];
+
+  /*
+    The marker is stored in the last CRC_SIZE bytes of the page. The next
+    write operation of the page will write correct CRC if it is needed
+  */
+  int4store(marker, MARIA_NO_CRC_BITMAP_PAGE);
+
+  if (my_chsize(file, block_size - sizeof(marker),
+                0, MYF(MY_WME)) ||
+      my_pwrite(file, marker, sizeof(marker),
+                block_size - sizeof(marker),
+                MYF(MY_NABP | MY_WME)))
+    return 1;
+  share->state.state.data_file_length= block_size;   /* One page so far */
+  _ma_bitmap_delete_all(share);
+  return 0;
+}
+
+
+/**
+  @brief Pagecache callback that flushes the transaction log when a
+  bitmap page needs to be flushed.
+
+  @param page            Page's content
+  @param page_no         Page's number (<offset>/<page length>)
+  @param data_ptr        Callback data pointer (pointer to MARIA_SHARE)
+
+  @return Result of translog_flush() up to the current log horizon
+*/
+
+static my_bool
+flush_log_for_bitmap(uchar *page __attribute__((unused)),
+                     pgcache_page_no_t page_no __attribute__((unused)),
+                     uchar *data_ptr __attribute__((unused)))
+{
+#ifndef DBUG_OFF
+  const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
+#endif
+  DBUG_ENTER("flush_log_for_bitmap");
+  DBUG_ASSERT(share->now_transactional);
+  /*
+    WAL imposes that UNDOs reach disk before bitmap is flushed. We don't know
+    the LSN of the last UNDO about this bitmap page, so we flush whole log.
+  */
+  DBUG_RETURN(translog_flush(translog_get_horizon()));
+}
+
+
+/**
+   @brief Set the pagecache callbacks used for the bitmap pages of a table
+
+   @note
+   We don't use pagecache_file_init here, as we want to keep the
+   code readable
+*/
+
+void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file,
+                                        MARIA_SHARE *share)
+{
+  file->callback_data= (uchar*) share;
+  file->flush_log_callback= maria_flush_log_for_page_none; /* Default */
+  file->write_fail= maria_page_write_failure;
+
+  if (share->temporary)
+  {
+    file->read_callback=  &maria_page_crc_check_none;
+    file->write_callback= &maria_page_filler_set_none;
+  }
+  else
+  {
+    file->read_callback=  &maria_page_crc_check_bitmap;
+    if (share->options & HA_OPTION_PAGE_CHECKSUM)
+      file->write_callback= &maria_page_crc_set_normal;
+    else
+      file->write_callback= &maria_page_filler_set_bitmap;
+    if (share->now_transactional)               /* Log flushed before page */
+      file->flush_log_callback= flush_log_for_bitmap;
+  }
+}
+
+
+/**
+  Extends data file with zeroes and creates new bitmap pages into page cache:
+  writes 'zeroes' as the content of every bitmap page in [from, to].
+
+  Non-bitmap pages of zeroes are correct as they are marked empty in
+  bitmaps. Bitmap pages will not be zeroes: they will get their CRC fixed when
+  flushed. And if there is a crash before flush (so they are zeroes at
+  restart), a REDO will re-create them in page cache.
+  @return FALSE on success, TRUE if a pagecache_write() call failed
+*/
+
+static my_bool
+_ma_bitmap_create_missing_into_pagecache(MARIA_SHARE *share,
+                                         MARIA_FILE_BITMAP *bitmap,
+                                         pgcache_page_no_t from,
+                                         pgcache_page_no_t to,
+                                         uchar *zeroes)
+{
+  pgcache_page_no_t i;
+  /*
+    We do not use my_chsize() because there can be a race between when it
+    reads the physical size and when it writes (assume data_file_length is 10,
+    physical length is 8 and two data pages are in cache, and here we do a
+    my_chsize: my_chsize sees physical length is 8, then the two data pages go
+    to disk then my_chsize writes from page 8 and so overwrites the two data
+    pages, wrongly).
+    We instead rely on the filesystem filling gaps with zeroes.
+  */
+  for (i= from; i <= to; i+= bitmap->pages_covered)
+  {
+    /**
+      No need to keep them pinned, they are new so flushable.
+      @todo but we may want to keep them pinned, as an optimization: if they
+      are not pinned they may go to disk before the data pages go (so, the
+      physical pages would be in non-ascending "sparse" order on disk), or the
+      filesystem may fill gaps with zeroes physically which is a waste of
+      time.
+    */
+    if (pagecache_write(share->pagecache,
+                        &bitmap->file, i, 0,
+                        zeroes, PAGECACHE_PLAIN_PAGE,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        PAGECACHE_PIN_LEFT_UNPINNED,
+                        PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
+      goto err;
+  }
+  /*
+    Data pages after data_file_length are full of zeroes but that is allowed
+    as they are marked empty in the bitmap.
+  */
+  return FALSE;
+err:
+  return TRUE;
+}
+
+
+/**
+ Creates missing bitmaps when we extend the data file.
+
+ At run-time, when we need a new bitmap page we come here; and only one bitmap
+ page at a time is created.
+
+ In some recovery cases we insert at a large offset in the data file, way
+ beyond state.data_file_length, so can need to create more than one bitmap
+ page in one go. Known case is:
+ Start a transaction in Maria;
+ delete last row of very large table (with delete_row)
+ do a bulk insert
+ crash
+ Then UNDO_BULK_INSERT will truncate table files, and
+ UNDO_ROW_DELETE will want to put the row back to its original position,
+ extending the data file a lot: bitmap page*s* in the hole must be created,
+ or the table would look corrupted.
+
+ We need to log REDOs for bitmap creation, consider: we apply a REDO for a
+ data page, which creates the first data page covered by a new bitmap
+ not yet created. If the data page is flushed but the bitmap page is not and
+ there is a crash, re-execution of the REDO will complain about the zeroed
+ bitmap page (see it as corruption). Thus a REDO is needed to re-create the
+ bitmap.
+
+ @param  info              Maria handler
+ @param  bitmap            Bitmap handler
+ @param  page              Last bitmap page to create
+
+ @note When this function is called this must be true:
+ ((page + 1) * bitmap->block_size > info->s->state.state.data_file_length)
+ @return FALSE on success, TRUE on error
+*/
+
+static my_bool _ma_bitmap_create_missing(MARIA_HA *info,
+                                         MARIA_FILE_BITMAP *bitmap,
+                                         pgcache_page_no_t page)
+{
+  MARIA_SHARE *share= info->s;
+  uint block_size= bitmap->block_size;
+  pgcache_page_no_t from, to;
+  my_off_t data_file_length= share->state.state.data_file_length;
+  DBUG_ENTER("_ma_bitmap_create_missing");
+
+  /* First (in offset order) bitmap page to create */
+  if (data_file_length < block_size)
+    goto err; /* corrupted, should have first bitmap page */
+
+  from= (data_file_length / block_size - 1) / bitmap->pages_covered + 1;
+  from*= bitmap->pages_covered;
+  /*
+    page>=from because:
+    (page + 1) * bs > dfl, and page == k * pc so:
+    (k * pc + 1) * bs > dfl; k * pc + 1 > dfl / bs; k * pc > dfl / bs - 1
+    k > (dfl / bs - 1) / pc; k >= (dfl / bs - 1) / pc + 1
+    k * pc >= ((dfl / bs - 1) / pc + 1) * pc == from.
+  */
+  DBUG_ASSERT(page >= from);
+
+  if (share->now_transactional)
+  {
+    LSN lsn;
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    page_store(log_data + FILEID_STORE_SIZE, from);
+    page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    /*
+      We don't use info->trn so that this REDO is always executed even though
+      the UNDO does not reach disk due to crash. This is also consistent with
+      the fact that the new bitmap pages are not pinned.
+    */
+    if (translog_write_record(&lsn, LOGREC_REDO_BITMAP_NEW_PAGE,
+                              &dummy_transaction_object, info,
+                              (translog_size_t)sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data, NULL))
+      goto err;
+    /*
+      No need to flush the log: the bitmap pages we are going to create will
+      flush it when they go to disk.
+    */
+  }
+
+  /*
+    Last bitmap page. It has special creation: will go to the page cache
+    only later as we are going to modify it very soon.
+  */
+  bzero(bitmap->map, bitmap->block_size);
+  bitmap->used_size= 0;
+#ifndef DBUG_OFF
+  memcpy(bitmap->map + bitmap->block_size, bitmap->map, bitmap->block_size);
+#endif
+
+  /* Last bitmap page to create before 'page' */
+  DBUG_ASSERT(page >= bitmap->pages_covered);
+  to= page - bitmap->pages_covered;
+  /*
+    In run-time situations, from<=to is always false, i.e. we always create
+    one bitmap at a time ('page').
+  */
+  if ((from <= to) &&
+      _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to,
+                                               bitmap->map))
+    goto err;
+
+  share->state.state.data_file_length= (page + 1) * bitmap->block_size;
+
+ DBUG_RETURN(FALSE);
+err:
+ DBUG_RETURN(TRUE);
+}
+
+/* Apply a REDO of bitmap page creation: re-create bitmap pages from..to */
+my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info,
+                                       LSN lsn __attribute__ ((unused)),
+                                       const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  my_bool error;
+  pgcache_page_no_t from, to, min_from;
+  DBUG_ENTER("_ma_apply_redo_bitmap_new_page");
+
+  from= page_korr(header);
+  to=   page_korr(header + PAGE_STORE_SIZE);
+  DBUG_PRINT("info", ("from: %lu to: %lu", (ulong)from, (ulong)to));
+  if ((from > to) ||
+      (from % bitmap->pages_covered) != 0 ||
+      (to % bitmap->pages_covered) != 0)
+  {
+    error= TRUE; /* corrupted log record */
+    goto err;
+  }
+
+  min_from= (share->state.state.data_file_length / bitmap->block_size - 1) /
+    bitmap->pages_covered + 1;
+  min_from*= bitmap->pages_covered;
+  if (from < min_from)
+  {
+    DBUG_PRINT("info", ("overwrite bitmap pages from %lu", (ulong)min_from));
+    /*
+      We have to overwrite. It could be that there was a bitmap page in
+      memory, covering a data page which went to disk, then crash: the
+      bitmap page is now full of zeros and is ==min_from, we have to overwrite
+      it with correct checksum.
+    */
+  }
+  share->state.changed|= STATE_CHANGED;
+  bzero(info->buff, bitmap->block_size);
+  if (!(error=
+        _ma_bitmap_create_missing_into_pagecache(share, bitmap, from, to,
+                                                 info->buff)))
+    share->state.state.data_file_length= (to + 1) * bitmap->block_size;
+
+err:
+  DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c
new file mode 100644
index 00000000000..fd02e2ac0ec
--- /dev/null
+++ b/storage/maria/ma_blockrec.c
@@ -0,0 +1,7404 @@
+/* Copyright (C) 2007-2008 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Storage of records in block
+
+  Some clarifications about the abbrev used:
+
+  NULL fields      -> Fields that may contain a NULL value.
+  Not null fields  -> Fields that may not contain a NULL value.
+  Critical fields  -> Fields that can't be null and can't be dropped without
+		      causing a table reorganization.
+
+
+  Maria will have a LSN at start of each page (excluding the bitmap pages)
+
+  The different page types that are in a data file are:
+
+  Bitmap pages     Map of free pages in the next extent (8192 page size
+                   gives us 256M of mapped pages / bitmap)
+  Head page        Start of rows are stored on this page.
+                   A rowid always points to a head page
+  Blob page        This page is totally filled with data from one blob or by
+                   a set of long VARCHAR/CHAR fields
+  Tail page        This contains the last part from different rows, blobs
+                   or varchar fields.
+
+  The data file starts with a bitmap page, followed by as many data
+  pages as the bitmap can cover. After this there is a new bitmap page
+  and more data pages etc.
+
+  For information about the bitmap page, see ma_bitmap.c
+
+  Structure of data and tail page:
+
+  The page has a row directory at end of page to allow us to do deletes
+  without having to reorganize the page.  It also allows us to later store
+  some more bytes after each row to allow them to grow without having to move
+  around other rows.
+
+  Page header:
+
+  LSN            7 bytes   Log position for last page change
+  PAGE_TYPE      1 uchar   1 for head / 2 for tail / 3 for blob
+  DIR_COUNT      1 uchar   Number of row/tail entries on page
+  FREE_DIR_LINK  1 uchar   Pointer to first free directory entry or 255 if none
+  empty space    2 bytes  Empty space on page
+
+  The most significant bit in PAGE_TYPE is set to 1 if the data on the page
+  can be compacted to get more space. (PAGE_CAN_BE_COMPACTED)
+
+  Row data
+
+  Row directory of NO entries, that consist of the following for each row
+  (in reverse order; i.e., first record is stored last):
+
+  Position     2 bytes Position of row on page
+  Length       2 bytes Length of entry
+
+  For Position and Length, the 1 most significant bit of the position and
+  the 1 most significant bit of the length could be used for some states of
+  the row (in other words, we should try to keep these reserved)
+
+  Position is 0 if the entry is not used.  In this case length[0] points
+  to a previous free entry (255 if no previous entry) and length[1]
+  to the next free entry (or 255 if last free entry). This works because
+  the directory entry 255 can never be marked free (if the first directory
+  entry is freed, the directory is shrunk).
+
+  checksum     4 bytes  Reserved for full page read testing and live backup.
+
+  ----------------
+
+  Structure of blob pages:
+
+  LSN          7 bytes  Log position for last page change
+  PAGE_TYPE    1 uchar   3
+
+  data
+
+  -----------------
+
+  Row data structure:
+
+  Flag                          1 uchar   Marker of which header field exists
+  TRANSID                       6 bytes  TRANSID of changing transaction
+                                         (optional, added on insert and first
+                                         update/delete)
+  VER_PTR                       7 bytes  Pointer to older version in log
+                                         (undo record)
+                                         (optional, added after first
+                                         update/delete)
+  DELETE_TRANSID                6 bytes  (optional). TRANSID of original row.
+                                         Added on delete.
+  Nulls_extended                1 uchar   To allow us to add new DEFAULT NULL
+                                         fields (optional, added after first
+                                         change of row after alter table)
+  Number of ROW_EXTENT's        1-3 uchar Length encoded, optional
+                                         This is the number of extents the
+                                         row is split into
+  First row_extent              7 uchar  Pointer to first row extent (optional)
+
+  Total length of length array  1-3 uchar Only used if we have
+                                         char/varchar/blob fields.
+  Row checksum		        1 uchar   Only if table created with checksums
+  Null_bits             ..      One bit for each NULL field (a field that may
+				have the value NULL)
+  Empty_bits            ..      One bit for each field that may be 'empty'.
+				(Both for null and not null fields).
+                                This bit is 1 if the value for the field is
+                                0 or empty string.
+
+  field_offsets                 2 byte/offset
+                                  For each 32'th field, there is one offset
+                                  that points to where the field information
+                                  starts in the block. This is to provide
+                                  fast access to later field in the row
+                                  when we only need to return a small
+                                  set of fields.
+                                  TODO: Implement this.
+
+  Things marked above as 'optional' will only be present if the
+  corresponding bit is set in 'Flag' field.  Flag gives us a way to
+  get more space on a page when doing page compaction as we don't need
+  to store TRANSID that have committed before the smallest running
+  transaction we have in memory.
+
+  Data in the following order:
+  (Field order is precalculated when table is created)
+
+  Critical fixed length, not null, fields. (Note, these can't be dropped)
+  Fixed length, null fields
+
+  Length array, 1-4 uchar per field for all CHAR/VARCHAR/BLOB fields.
+  Number of bytes used in length array per entry is depending on max length
+  for field.
+
+  ROW_EXTENT's
+  CHAR data (space stripped)
+  VARCHAR data
+  BLOB data
+
+  Fields marked in null_bits or empty_bits are not stored in data part or
+  length array.
+
+  If row doesn't fit into the given block, then the first EXTENT will be
+  stored last on the row. This is done so that we don't break any field
+  data in the middle.
+
+  We first try to store the full row into one block. If that's not possible
+  we move out each big blob into their own extents. If this is not enough we
+  move out a concatenation of all varchars to their own extent.
+
+  Each blob and the concatenated char/varchar fields are stored the following
+  way:
+  - Store the parts in as many full-contiguous pages as possible.
+  - The last part, that doesn't fill a full page, is stored in tail page.
+
+  When doing an insert of a new row, we don't have to have
+  VER_PTR in the row. This will make rows that are not changed stored
+  efficiently. On update and delete we would add TRANSID (if it was an old
+  committed row) and VER_PTR to
+  the row. On row page compaction we can easily detect rows where
+  TRANSID was committed before the longest running transaction
+  started and we can then delete TRANSID and VER_PTR from the row to
+  gain more space.
+
+  If a row is deleted in Maria, we change TRANSID to the deleting
+  transaction's id, change VER_PTR to point to the undo record for the delete,
+  and add DELETE_TRANSID (the id of the transaction which last
+  inserted/updated the row before its deletion). DELETE_TRANSID allows an old
+  transaction to avoid reading the log to know if it can see the last version
+  before delete (in other words it reduces the probability of having to follow
+  VER_PTR). TODO: depending on a compilation option, evaluate the performance
+  impact of not storing DELETE_TRANSID (which would make the row smaller).
+
+  Description of the different parts:
+
+  Flag is coded as:
+
+  Description           bit
+  TRANS_ID_exists       0
+  VER_PTR_exists        1
+  Row is deleted        2       (Means that DELETE_TRANSID exists)
+  Nulls_extended_exists 3
+  Row is split          7       This means that 'Number_of_row_extents' exists
+
+  Nulls_extended is the number of new DEFAULT NULL fields in the row
+  compared to the number of DEFAULT NULL fields when the first version
+  of the table was created.  If Nulls_extended doesn't exist in the row,
+  we know it's 0 as this must be one of the original rows from when the
+  table was created first time.  This coding allows us to add 255*8 =
+  2040 new fields without requiring a full alter table.
+
+  Empty_bits is used to allow us to store 0, 0.0, empty string, empty
+  varstring and empty blob efficiently. (This is very good for data
+  warehousing where NULL's are often regarded as evil). Having this
+  bitmap also allows us to drop information of a field during a future
+  delete if field was deleted with ALTER TABLE DROP COLUMN.  To be able
+  to handle DROP COLUMN, we must store in the index header the fields
+  that have been dropped. When unpacking a row we will ignore dropped
+  fields. When storing a row, we will mark a dropped field either with a
+  null in the null bit map or in the empty_bits and not store any data
+  for it.
+  TODO: Add code for handling dropped fields.
+
+
+  A ROW EXTENT is range of pages. One ROW_EXTENT is coded as:
+
+  START_PAGE            5 bytes
+  PAGE_COUNT            2 bytes.  Bit 16 is set if this is a tail page.
+                                  Bit 15 is set if this is the start of a new
+                                  blob extent.
+
+  With 8K pages, we can cover 256M in one extent. This coding gives us a
+  maximum file size of 2^40*8192 = 8192 tera
+
+  As an example of ROW_EXTENT handling, assume a row with one integer
+  field (value 5), two big VARCHAR fields (size 250 and 8192*3), and 2
+  big BLOB fields that we have updated.
+
+  The record format for storing this into an empty file would be:
+
+  Page 1:
+
+  00 00 00 00 00 00 00          LSN
+  01                            Only one row in page
+  FF                            No free dir entry
+  xx xx                         Empty space on page
+
+  10                            Flag: row split, VER_PTR exists
+  01 00 00 00 00 00             TRANSID 1
+  00 00 00 00 00 01 00          VER_PTR to first block in LOG file 1
+  5                             Number of row extents
+  02 00 00 00 00 03 00          VARCHAR's are stored in full pages 2,3,4
+  0                             No null fields
+  0                             No empty fields
+  05 00 00 00 00 00 80          Tail page for VARCHAR, rowid 0
+  06 00 00 00 00 80 00          First blob, stored at page 6-133
+  05 00 00 00 00 01 80          Tail of first blob (896 bytes) at page 5
+  86 00 00 00 00 80 00          Second blob, stored at page 134-262
+  05 00 00 00 00 02 80          Tail of second blob (896 bytes) at page 5
+  05 00                         5 integer
+  FA                            Length of first varchar field (size 250)
+  00 60                         Length of second varchar field (size 8192*3)
+  00 60 10                      First medium BLOB, 1M
+  01 00 10 00                   Second BLOB, 1M
+  xx xx xx xx xx xx             Varchars are stored here until end of page
+
+  ..... until end of page
+
+  09 00 F4 1F                   Start position 9, length 8180
+  xx xx xx xx			Checksum
+
+  A data page is allowed to have a wrong CRC and header as long as it is
+  marked empty in the bitmap and its directory's count is 0.
+*/
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_recovery_util.h"
+#include <lf.h>
+
+/*
+  Struct for having a cursor over a set of extents.
+  This is used to loop over all extents for a row when reading
+  the row data. It's also used to store the tail positions for
+  a read row to be used by a later update/delete command.
+*/
+
+typedef struct st_maria_extent_cursor
+{
+  /*
+    Pointer to packed uchar array of extents for the row.
+    The format of one extent (ROW_EXTENT) is described in the comment
+    at the start of this file.
+  */
+  uchar *extent;
+  /* Where data starts on page; Only for debugging */
+  uchar *data_start;
+  /* Positions of all tails in the row. Updated when reading a row */
+  MARIA_RECORD_POS *tail_positions;
+  /* Current page */
+  pgcache_page_no_t page;
+  /* How many pages in the page region */
+  uint page_count;
+  /* What kind of lock to use for tail pages */
+  enum pagecache_page_lock lock_for_tail_pages;
+  /* Total number of extents (i.e., entries in the 'extent' slot) */
+  uint extent_count;
+  /* Non-zero if current extent is a tail page; Set while using cursor */
+  uint tail;
+  /* Position for tail on tail page */
+  uint tail_row_nr;
+  /*
+    == 1 if we are working on the first extent (i.e., the one that is stored in
+    the row header, not an extent that is stored as part of the row data).
+  */
+  my_bool first_extent;
+} MARIA_EXTENT_CURSOR;
+
+
+/**
+   @brief Structure for passing down info to write_hook_for_clr_end().
+   This hook needs to know the variation of the live checksum caused by the
+   current operation to update state.checksum under log's mutex,
+   needs to know the transaction's previous undo_lsn to set
+   trn->undo_lsn under log mutex, and needs to know the type of UNDO being
+   undone now to modify state.records under log mutex.
+*/
+
+/**
+  Compute a checksum and store it in the record, but only if the share
+  has a checksum function.
+
+  S  share; nothing is stored unless (S)->calc_checksum is not NULL
+  D  checksum_delta; lvalue that is first set to 0 and then, if
+     (S)->calc_checksum is set, to the value of E
+  E  expression giving the checksum; only evaluated if
+     (S)->calc_checksum is set
+  P  pointer_into_record; where ha_checksum_store() writes D
+  L  length; lvalue that is increased by HA_CHECKSUM_STORE_SIZE when a
+     checksum is stored
+
+  Note that D, P and L are used without parentheses in the expansion and
+  that D is evaluated more than once.
+*/
+#define store_checksum_in_rec(S,D,E,P,L)        do      \
+  {                                                     \
+    D= 0;                                               \
+    if ((S)->calc_checksum != NULL)                     \
+    {                                                   \
+      D= (E);                                           \
+      ha_checksum_store(P, D);                          \
+      L+= HA_CHECKSUM_STORE_SIZE;                       \
+    }                                                   \
+  } while (0)
+
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails);
+static my_bool delete_head_or_tail(MARIA_HA *info,
+                                   pgcache_page_no_t page, uint record_number,
+                                   my_bool head, my_bool from_update);
+#ifndef DBUG_OFF
+static void _ma_print_directory(FILE *file, uchar *buff, uint block_size);
+#endif
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+                               uint block_size, ulong length,
+                               uint *tot_ranges);
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count);
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+                                     const uchar *newrec,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count);
+
+/****************************************************************************
+  Initialization
+****************************************************************************/
+
+/*
+  Initialize data needed for block structures
+*/
+
+
+/*
+  Size of the different header elements for a row.
+
+  Element N is the number of bytes added to the row header when bit N is
+  set in the row flag; _ma_init_block_record_data() sums these up for
+  every combination of bits.
+*/
+
+static uchar header_sizes[]=
+{
+  TRANSID_SIZE,                                 /* Bit 0: Transid */
+  VERPTR_SIZE,                                  /* Bit 1: Version pointer */
+  TRANSID_SIZE,                                 /* Bit 2: Delete transid */
+  1                                             /* Bit 3: Null extends */
+};
+
+/*
+  Calculate array of all used headers
+
+  total_header_size[flag] holds the size of the flag byte plus the sizes
+  of all header elements whose bit is set in 'flag'. There is one array
+  slot for each combination of the bits described by header_sizes[], and
+  PRECALC_HEADER_BITMASK is the highest valid index into the array.
+
+  Used to speed up:
+
+  size= 1;
+  if (flag & 1)
+    size+= TRANSID_SIZE;
+  if (flag & 2)
+    size+= VERPTR_SIZE;
+  if (flag & 4)
+    size+= TRANSID_SIZE;
+  if (flag & 8)
+    size+= 1;
+
+   NOTES
+     The array is filled by _ma_init_block_record_data(), which is
+     called only once at startup of Maria
+*/
+
+static uchar total_header_size[1 << array_elements(header_sizes)];
+#define PRECALC_HEADER_BITMASK (array_elements(total_header_size) -1)
+
+/*
+  Fill total_header_size[] with the row header size for each possible
+  combination of the header bits described by header_sizes[].
+*/
+
+void _ma_init_block_record_data(void)
+{
+  uint flags;
+
+  bzero(total_header_size, sizeof(total_header_size));
+  total_header_size[0]= FLAG_SIZE;              /* Only the flag uchar */
+
+  for (flags= 1; flags < array_elements(total_header_size); flags++)
+  {
+    uint bit_nr, length= FLAG_SIZE;
+    /* Add the size of each header element that is selected in 'flags' */
+    for (bit_nr= 0; bit_nr < array_elements(header_sizes); bit_nr++)
+    {
+      if (flags & (1 << bit_nr))
+        length+= header_sizes[bit_nr];
+    }
+    total_header_size[flags]= length;
+  }
+}
+
+
+/*
+  Set the maximum data file length for a share and initialize its bitmap
+
+  SYNOPSIS
+    _ma_once_init_block_record()
+    share       Share to initialize
+    data_file   File passed on to _ma_bitmap_init()
+
+  RETURN
+    Return value of _ma_bitmap_init()
+*/
+
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File data_file)
+{
+  /* Largest value that can be stored in (rec_reflength - 1) bytes */
+  ulonglong max_pages=
+    ((ulonglong) 1 << ((share->base.rec_reflength-1)*8)) - 1;
+
+  share->base.max_data_file_length= max_pages * share->block_size;
+#if SIZEOF_OFF_T == 4
+  set_if_smaller(share->base.max_data_file_length, INT_MAX32);
+#endif
+  return _ma_bitmap_init(share, data_file);
+}
+
+
+/*
+  End bitmap handling for a share, flush and close its bitmap file and
+  de-assign the id of the share
+
+  SYNOPSIS
+    _ma_once_end_block_record()
+    share       Share to end
+
+  NOTES
+    Can be called several times for the same share; the bitmap file
+    descriptor is set to -1 after the first close.
+
+  RETURN
+    0      ok
+    <> 0   _ma_bitmap_end() failed, or flush, sync or close of the bitmap
+           file failed
+*/
+
+my_bool _ma_once_end_block_record(MARIA_SHARE *share)
+{
+  int error= _ma_bitmap_end(share);
+
+  if (share->bitmap.file.file >= 0)
+  {
+    /* Temporary tables and tables being deleted use FLUSH_IGNORE_CHANGED */
+    my_bool ignore_changed= (share->temporary || share->deleting);
+
+    if (flush_pagecache_blocks(share->pagecache, &share->bitmap.file,
+                               (ignore_changed ?
+                                FLUSH_IGNORE_CHANGED :
+                                FLUSH_RELEASE)))
+      error= 1;
+
+    /*
+      The file leaves the maria_open_list here and thus becomes unknown
+      to Checkpoint, so it has to be synced now.
+    */
+    if (share->now_transactional &&
+        my_sync(share->bitmap.file.file, MYF(MY_WME)))
+      error= 1;
+
+    if (my_close(share->bitmap.file.file, MYF(MY_WME)))
+      error= 1;
+
+    /*
+      Mark the file as closed so that a repeated call does nothing here
+      (this may happen if files are closed but the maria object is kept
+      around a bit longer)
+    */
+    share->bitmap.file.file= -1;
+  }
+
+  /*
+    The id is de-assigned even though the index has not been flushed; this
+    is ok as close_lock serializes us with a Checkpoint looking at our share.
+  */
+  if (share->id != 0)
+    translog_deassign_id_from_share(share);
+
+  return error;
+}
+
+
+/*
+  Init info->cur_row structure
+
+  SYNOPSIS
+    _ma_init_block_record()
+    info        Maria handler
+
+  NOTES
+    One my_multi_malloc() call allocates the buffers of info->cur_row and
+    info->new_row together with info->log_row_parts and
+    info->update_field_data. After that info->bitmap_blocks and
+    info->cur_row.extents are allocated with room for 'default_extents'
+    entries.
+    If one of the later allocations fails, _ma_end_block_record() is
+    called to free what was allocated.
+
+  RETURN
+    0  ok
+    1  error (out of memory)
+*/
+
+my_bool _ma_init_block_record(MARIA_HA *info)
+{
+  MARIA_ROW *row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_SHARE *share= info->s;
+  uint default_extents;
+  DBUG_ENTER("_ma_init_block_record");
+
+  if (!my_multi_malloc(MY_WME,
+                       &row->empty_bits, share->base.pack_bytes,
+                       &row->field_lengths,
+                       share->base.max_field_lengths + 2,
+                       &row->blob_lengths, sizeof(ulong) * share->base.blobs,
+                       &row->null_field_lengths, (sizeof(uint) *
+                                                  (share->base.fields -
+                                                   share->base.blobs +
+                                                   EXTRA_LENGTH_FIELDS)),
+                       &row->tail_positions, (sizeof(MARIA_RECORD_POS) *
+                                              (share->base.blobs + 2)),
+                       &new_row->empty_bits, share->base.pack_bytes,
+                       &new_row->field_lengths,
+                       share->base.max_field_lengths + 2,
+                       &new_row->blob_lengths,
+                       sizeof(ulong) * share->base.blobs,
+                       &new_row->null_field_lengths, (sizeof(uint) *
+                                                      (share->base.fields -
+                                                       share->base.blobs +
+                                                       EXTRA_LENGTH_FIELDS)),
+                       &info->log_row_parts,
+                       sizeof(*info->log_row_parts) *
+                       (TRANSLOG_INTERNAL_PARTS + 3 +
+                        share->base.fields + 3),
+                       &info->update_field_data,
+                       (share->base.fields * 4 +
+                        share->base.max_field_lengths + 1 + 4),
+                       NullS, 0))
+    DBUG_RETURN(1);
+  /*
+    Skip over bytes used to store length of field length for logging
+    (these are the 2 extra bytes allocated for each field_lengths above)
+  */
+  row->field_lengths+= 2;
+  new_row->field_lengths+= 2;
+
+  /* Reserve some initial space to avoid mallocs during execution */
+  default_extents= (ELEMENTS_RESERVED_FOR_MAIN_PART + 1 +
+                    (AVERAGE_BLOB_SIZE /
+                     FULL_PAGE_SIZE(share->block_size) /
+                     BLOB_SEGMENT_MIN_SIZE));
+
+  if (my_init_dynamic_array(&info->bitmap_blocks,
+                            sizeof(MARIA_BITMAP_BLOCK), default_extents,
+                            64))
+    goto err;
+  info->cur_row.extents_buffer_length= default_extents * ROW_EXTENT_SIZE;
+  if (!(info->cur_row.extents= my_malloc(info->cur_row.extents_buffer_length,
+                                         MYF(MY_WME))))
+    goto err;
+
+  info->row_base_length= share->base_length;
+  info->row_flag= share->base.default_row_flag;
+
+  /*
+    We need to reserve 'EXTRA_LENGTH_FIELDS' number of parts in
+    null_field_lengths to allow splitting of rows in 'find_where_to_split_row'
+    (the space for these parts was included in the allocation above)
+  */
+  row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+  new_row->null_field_lengths+= EXTRA_LENGTH_FIELDS;
+
+  DBUG_RETURN(0);
+
+err:
+  _ma_end_block_record(info);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Free the memory allocated by _ma_init_block_record() and info->blob_buff
+
+  NOTES
+    cur_row.empty_bits is the first pointer given to my_multi_malloc() in
+    _ma_init_block_record(); the other buffers allocated by that call are
+    not freed one by one here.
+    Also used by _ma_init_block_record() to clean up after a failed
+    allocation, which is why all pointers are allowed to be 0.
+*/
+
+void _ma_end_block_record(MARIA_HA *info)
+{
+  MARIA_ROW *row= &info->cur_row;
+  DBUG_ENTER("_ma_end_block_record");
+
+  my_free(info->blob_buff, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(row->extents, MYF(MY_ALLOW_ZERO_PTR));
+  delete_dynamic(&info->bitmap_blocks);
+  my_free(row->empty_bits, MYF(MY_ALLOW_ZERO_PTR));
+
+  /*
+    Closing the data file, when needed, is the job of
+    ma_once_end_block_record(). Resetting the descriptor stops
+    maria_close() from doing an extra, not allowed, close.
+  */
+  info->dfile.file= -1;
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  Helper functions
+****************************************************************************/
+
+/*
+  Return the next unused position on the page after a directory entry.
+
+  SYNOPSIS
+    start_of_next_entry()
+    dir		Directory entry to be used. This can not be the
+                last entry on the page!
+
+  NOTES
+    Directory entries are stored in reverse order, so the entries for
+    higher row numbers are found at lower addresses than 'dir'.
+
+  RETURN
+    #   Position in page where next entry starts.
+        Everything between the '*dir' and this are free to be used.
+*/
+
+static inline uint start_of_next_entry(uchar *dir)
+{
+  uchar *entry= dir - DIR_ENTRY_SIZE;
+  /*
+     Skip entries that are not in use (position == 0). The loop always
+     stops, as the directory never starts with a deleted entry.
+  */
+  while (entry[0] == 0 && entry[1] == 0)
+    entry-= DIR_ENTRY_SIZE;
+  return (uint) uint2korr(entry);
+}
+
+
+/*
+  Return the offset where the previous entry ends (before on page)
+
+  SYNOPSIS
+    end_of_previous_entry()
+    dir		Address for current directory entry
+    end         Address where the directory ends; entries at this address
+                or above are not examined
+
+  RETURN
+    #   Position where previous entry ends (smallest address on page)
+        Everything between # and current entry are free to be used.
+        PAGE_HEADER_SIZE is returned if there is no used entry before
+        this one.
+*/
+
+
+static inline uint end_of_previous_entry(uchar *dir, uchar *end)
+{
+  uchar *entry= dir + DIR_ENTRY_SIZE;
+
+  while (entry < end)
+  {
+    uint offset= uint2korr(entry);
+    if (offset != 0)
+      return offset + uint2korr(entry+2);       /* First entry in use */
+    entry+= DIR_ENTRY_SIZE;
+  }
+  return PAGE_HEADER_SIZE;
+}
+
+
+#ifndef DBUG_OFF
+
+/*
+  Print the row directory of a page as 'position:length' pairs
+
+  SYNOPSIS
+    _ma_print_directory()
+    file         Where to print
+    buff         Page buffer
+    block_size   Block size of buffer
+
+  NOTES
+    Entries are printed starting from entry 0. For entries that are not
+    in use (position == 0) the length is printed as 0.
+*/
+
+static void _ma_print_directory(FILE *file, uchar *buff, uint block_size)
+{
+  uint entries= (uint) ((uchar *) buff)[DIR_COUNT_OFFSET];
+  uint printed, prev_row_end= PAGE_HEADER_SIZE;
+  uchar *last_entry= dir_entry_pos(buff, block_size, entries-1);
+  uchar *entry= dir_entry_pos(buff, block_size, 0);
+
+  DBUG_LOCK_FILE;                               /* If using DBUG_FILE */
+  fprintf(file,"Directory dump (pos:length):\n");
+
+  for (printed= 1; entry >= last_entry ; entry-= DIR_ENTRY_SIZE, printed++)
+  {
+    uint offset= uint2korr(entry);
+    uint length= uint2korr(entry+2);
+    fprintf(file, "   %4u:%4u", offset, offset ? length : 0);
+    /* New line when no more pairs fit in an 80 character line */
+    if ((printed % (80/12)) == 0)
+      fputc('\n', file);
+    if (offset)
+    {
+      /* Rows must be stored in increasing order without overlaps */
+      DBUG_ASSERT(offset >= prev_row_end);
+      prev_row_end= offset + length;
+    }
+  }
+  fputc('\n', file);
+  fflush(file);
+  DBUG_UNLOCK_FILE;
+}
+
+
+/*
+  Check (with asserts) that the row directory of a page is consistent
+
+  SYNOPSIS
+    check_directory()
+    buff              Page buffer
+    block_size        Block size of buffer
+    min_row_length    Smallest allowed length for an entry with length <> 0
+    real_empty_size   Expected empty space on page, or (uint) -1 to use
+                      the value stored in the page header
+
+  NOTES
+    The following is checked:
+    - Rows are stored in increasing order and don't overlap
+    - The space between the rows and before the directory sums up to the
+      expected empty space
+    - The list of free directory entries is correctly linked and contains
+      exactly the entries that are not in use
+*/
+
+static void check_directory(uchar *buff, uint block_size, uint min_row_length,
+                            uint real_empty_size)
+{
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  uchar *last_entry= dir_entry_pos(buff, block_size, max_entry-1);
+  uchar *entry= dir_entry_pos(buff, block_size, 0);
+  uint dir_start= (uint) (last_entry - buff);
+  uint prev_row_end= PAGE_HEADER_SIZE;
+  uint free_space= 0, unused_entries= 0;
+  uint expected_free_space;
+  uchar link, prev_link;
+
+  if (real_empty_size != (uint) -1)
+    expected_free_space= real_empty_size;
+  else
+    expected_free_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+  /* Walk the directory from entry 0 and sum up the gaps between rows */
+  for (; entry >= last_entry ; entry-= DIR_ENTRY_SIZE)
+  {
+    uint offset= uint2korr(entry);
+    uint length= uint2korr(entry+2);
+    if (!offset)
+    {
+      unused_entries++;
+      continue;
+    }
+    DBUG_ASSERT(offset >= prev_row_end);
+    DBUG_ASSERT(!length || length >= min_row_length);
+    free_space+= offset - prev_row_end;
+    prev_row_end= offset + length;
+  }
+  /* Add the space between the last row and the start of the directory */
+  free_space+= dir_start - prev_row_end;
+  DBUG_ASSERT(prev_row_end <= dir_start);
+  DBUG_ASSERT(free_space == expected_free_space);
+
+  /* Follow the free list; each unused entry must be found exactly once */
+  prev_link= END_OF_DIR_FREE_LIST;
+  for (link= buff[DIR_FREE_OFFSET]; link != END_OF_DIR_FREE_LIST; )
+  {
+    uchar *free_dir= dir_entry_pos(buff, block_size, link);
+    DBUG_ASSERT(free_dir[0] == 0 && free_dir[1] == 0);
+    DBUG_ASSERT(free_dir[2] == prev_link);
+    prev_link= link;
+    link= free_dir[3];
+    unused_entries--;
+  }
+  DBUG_ASSERT(unused_entries == 0);
+}
+#else
+#define check_directory(A,B,C,D)
+#endif /* DBUG_OFF */
+
+
+/**
+   @brief Check if a page directory has room for a number of new entries
+
+   @param buff             Page buffer
+   @param block_size       Block size of buffer
+   @param wanted_entries   Number of directory entries needed
+
+   @return 0    Not enough entries
+   @return 1    The directory can be extended with the wanted entries, or
+                the free list holds enough entries to cover the part that
+                would go beyond MAX_ROWS_PER_PAGE
+*/
+
+static my_bool enough_free_entries(uchar *buff, uint block_size,
+                                   uint wanted_entries)
+{
+  uint used_entries= (uint) buff[DIR_COUNT_OFFSET];
+  uint missing, link;
+  uchar *free_dir;
+
+  if (used_entries + wanted_entries <= MAX_ROWS_PER_PAGE)
+    return 1;
+
+  /* This many entries have to be taken from the free list */
+  missing= used_entries + wanted_entries - MAX_ROWS_PER_PAGE;
+
+  for (link= (uint) buff[DIR_FREE_OFFSET];
+       link != END_OF_DIR_FREE_LIST;
+       link= free_dir[3])
+  {
+    if (--missing == 0)
+      return 1;
+    free_dir= dir_entry_pos(buff, block_size, link);
+  }
+  return 0;                                     /* Free list too short */
+}
+
+
+/**
+   @brief Check if there is room for more rows on page
+
+   @fn enough_free_entries_on_page
+   @param share       Share; gives block size and number of blobs
+   @param page_buff   Page buffer
+
+   @note
+   For a head page one directory entry is required. For other pages
+   1 + share->base.blobs entries are required.
+
+   @return 0    Directory is full
+   @return 1	There is room for more entries on the page
+*/
+
+my_bool enough_free_entries_on_page(MARIA_SHARE *share,
+                                    uchar *page_buff)
+{
+  /* Page type without the PAGE_CAN_BE_COMPACTED flag */
+  enum en_page_type page_type=
+    (enum en_page_type) (page_buff[PAGE_TYPE_OFFSET] &
+                         ~(uchar) PAGE_CAN_BE_COMPACTED);
+
+  if (page_type != HEAD_PAGE)
+    return enough_free_entries(page_buff, share->block_size,
+                               1 + share->base.blobs);
+
+  /* Head page: ok if directory can grow or if there is a free entry */
+  return ((uint) page_buff[DIR_COUNT_OFFSET] != MAX_ROWS_PER_PAGE ||
+          page_buff[DIR_FREE_OFFSET] != END_OF_DIR_FREE_LIST);
+}
+
+
+/**
+   @brief Extend a record area to fit a given size block
+
+   @fn extend_area_on_page()
+   @param info                  Handler if head page and 0 if tail page
+   @param buff                  Page buffer
+   @param dir                   Pointer to dir entry in buffer
+   @param rownr                 Row number we are working on
+   @param block_size            Block size of buffer
+   @param request_length        How much data we want to put at [dir]
+   @param empty_space           Total empty space in buffer
+                                This is updated with length after dir
+                                is allocated and current block freed
+   @param[out] ret_offset       Offset to found area is stored here
+   @param[out] ret_length       Length of found area is stored here
+
+  @implementation
+    The logic is as follows (same as in _ma_update_block_record())
+    - If new data fits in old block, use old block.
+    - Extend block with empty space before block. If enough, use it.
+    - Extend block with empty space after block. If enough, use it.
+    - Use _ma_compact_block_page() to get all empty space at dir.
+
+  @note
+    The given directory entry is set to the offset and length of the found
+    area (the same values that are stored in ret_offset and ret_length).
+    If [dir] was a free directory entry, it is unlinked from the list of
+    free entries.
+    empty_space doesn't include the new directory entry
+
+
+  @return
+  @retval 0   ok
+  @retval 1   error (wrong info in block). my_errno is set to
+              HA_ERR_WRONG_IN_RECORD
+*/
+
+static my_bool extend_area_on_page(MARIA_HA *info,
+                                   uchar *buff, uchar *dir,
+                                   uint rownr, uint block_size,
+                                   uint request_length,
+                                   uint *empty_space, uint *ret_offset,
+                                   uint *ret_length)
+{
+  uint rec_offset, length, org_rec_length;
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  DBUG_ENTER("extend_area_on_page");
+
+  /*
+    We can't check for min length here as we may have called
+    extend_directory() to create a new (empty) entry just before
+  */
+  check_directory(buff, block_size, 0, *empty_space);
+
+  rec_offset= uint2korr(dir);
+  if (rec_offset)
+  {
+    /* Extending old row;  Mark current space as 'free' */
+    length= org_rec_length= uint2korr(dir + 2);
+    DBUG_PRINT("info", ("rec_offset: %u  length: %u  request_length: %u  "
+                        "empty_space: %u",
+                        rec_offset, org_rec_length, request_length,
+                        *empty_space));
+
+    *empty_space+= org_rec_length;
+  }
+  else
+  {
+    /*
+      Reusing free directory entry; Free it from the directory list.
+      For a free entry, dir[2] is the previous and dir[3] the next entry
+      in the free list.
+    */
+    if (dir[2] == END_OF_DIR_FREE_LIST)
+      buff[DIR_FREE_OFFSET]= dir[3];
+    else
+    {
+      uchar *prev_dir= dir_entry_pos(buff, block_size, (uint) dir[2]);
+      DBUG_ASSERT(uint2korr(prev_dir) == 0 && prev_dir[3] == (uchar) rownr);
+      prev_dir[3]= dir[3];
+    }
+    if (dir[3] != END_OF_DIR_FREE_LIST)
+    {
+      uchar *next_dir= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_dir) == 0 && next_dir[2] == (uchar) rownr);
+      next_dir[2]= dir[2];
+    }
+    rec_offset= start_of_next_entry(dir);
+    length= 0;
+  }
+  if (length < request_length)
+  {
+    uint old_rec_offset;
+    /*
+      New data did not fit in old position.
+      Find first possible position where to put new data.
+    */
+    old_rec_offset= rec_offset;
+    rec_offset= end_of_previous_entry(dir, buff + block_size -
+                                      PAGE_SUFFIX_SIZE);
+    length+= (uint) (old_rec_offset - rec_offset);
+    DBUG_ASSERT(old_rec_offset);
+    /*
+      'length' is 0 if we are doing an insert into a not allocated block.
+      This can only happen during "REDO of INSERT" or "UNDO of DELETE."
+    */
+    if (length < request_length)
+    {
+      /*
+        Did not fit in current block + empty space. Extend with
+        empty space after block.
+      */
+      if (rownr == max_entry - 1)
+      {
+        /* Last entry; Everything is free between this and directory */
+        length= ((block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE * max_entry) -
+                 rec_offset);
+      }
+      else
+        length= start_of_next_entry(dir) - rec_offset;
+      DBUG_ASSERT((int) length >= 0);
+      if (length < request_length)
+      {
+        /* Not enough continuous space, compact page to get more */
+        int2store(dir, rec_offset);
+        /* Reset length, as this may be a deleted block */
+        int2store(dir+2, 0);
+        _ma_compact_block_page(buff, block_size, rownr, 1,
+                               info ? info->trn->min_read_from: 0,
+                               info ? info->s->base.min_block_length : 0);
+        /* Read back position and length of the entry after compacting */
+        rec_offset= uint2korr(dir);
+        length=     uint2korr(dir+2);
+        if (length < request_length)
+        {
+          DBUG_PRINT("error", ("Not enough space: "
+                               "length: %u  request_length: %u",
+                               length, request_length));
+          my_errno= HA_ERR_WRONG_IN_RECORD;     /* File crashed */
+          DBUG_ASSERT(0);                       /* For debugging */
+          DBUG_RETURN(1);                       /* Error in block */
+        }
+        *empty_space= length;                   /* All space is here */
+      }
+    }
+  }
+  int2store(dir, rec_offset);
+  int2store(dir + 2, length);
+  *ret_offset= rec_offset;
+  *ret_length= length;
+
+  check_directory(buff, block_size, info ? info->s->base.min_block_length : 0,
+                  *empty_space - length);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Copy the fields that were not changed from 'from' to 'to'
+
+   @param info            Maria handler; its share supplies the column
+                          definitions and the number of fields
+   @param changed_fields  Bitmap with one bit per column, in column order.
+                          Columns whose bit is set are left untouched in 'to'
+   @param to              Record that receives the unchanged fields
+   @param from            Record the unchanged fields are copied from
+
+   @notes
+   Assumption is that most fields are not changed!
+   (Which is why the bitmap is walked bit by bit, without first testing
+   whether all bits of a bitmap byte are set)
+
+   For FIELD_VARCHAR columns only the length prefix (1 or 2 bytes, as given
+   by fill_length) plus the stored data length is copied.
+*/
+
+void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields,
+                             uchar *to, uchar *from)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column= share->columndef;
+  MARIA_COLUMNDEF *end_column= column + share->base.fields;
+  uchar *bitmap_pos= (uchar*) changed_fields->bitmap;
+  uint mask= 1;
+
+  while (column < end_column)
+  {
+    if ((*bitmap_pos & mask) == 0)
+    {
+      /* Column is unchanged; find out how many bytes it occupies */
+      uint copy_length;
+
+      if (column->type != FIELD_VARCHAR)
+        copy_length= column->length;
+      else if (column->fill_length == 1)
+        copy_length= (uint) from[column->offset] + 1;
+      else
+        copy_length= uint2korr(from + column->offset) + 2;
+
+      memcpy(to + column->offset, from + column->offset, copy_length);
+    }
+
+    /* Step to the bit of the next column; 8 columns per bitmap byte */
+    mask<<= 1;
+    if (mask == 256)
+    {
+      bitmap_pos++;
+      mask= 1;
+    }
+    column++;
+  }
+}
+
+#ifdef NOT_YET_NEEDED
+/*
+  Calculate empty space on a page
+
+  SYNOPSIS
+    empty_space_on_page()
+    buff                Page buffer
+    block_size          Size of page
+
+  NOTES
+    The page type is read from buff[PAGE_TYPE_OFFSET] with the
+    PAGE_CAN_BE_COMPACTED flag masked away.
+
+  RETURN
+    block_size          Page is an UNALLOCATED_PAGE
+    #                   Value stored at EMPTY_SPACE_OFFSET, for page types
+                        up to and including TAIL_PAGE
+    0                   Any other page type (blob page)
+*/
+
+static uint empty_space_on_page(uchar *buff, uint block_size)
+{
+  /* The declaration needs a variable name; 'page_type' is used below */
+  enum en_page_type page_type;
+  page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] &
+                                  ~(uchar) PAGE_CAN_BE_COMPACTED);
+  if (page_type == UNALLOCATED_PAGE)
+    return block_size;
+  if ((uint) page_type <= TAIL_PAGE)
+    return uint2korr(buff+EMPTY_SPACE_OFFSET);
+  return 0;                                     /* Blob page */
+}
+#endif
+
+
+/*
+  @brief Ensure we have space for new directory entries
+
+  @fn make_space_for_directory()
+  @param info           Handler, or 0. When given, its transaction's
+                        min_read_from and the share's min_block_length are
+                        passed on to _ma_compact_block_page()
+  @param buff           Page buffer
+  @param block_size     Block size for pages
+  @param max_entry      Number of current entries in directory
+  @param count          Number of new entries to be added to directory
+  @param first_dir      First directory entry on page
+  @param empty_space    Total empty space in buffer. It's updated
+                        to reflect the new empty space
+  @param first_pos      Store position to last data byte on page here
+
+  @note
+  This function is inline as the argument passing is the biggest
+  part of the function
+
+  On success buff[DIR_COUNT_OFFSET] is set to max_entry + count and
+  *empty_space is reduced by the size of the new directory entries.
+
+  @return
+  @retval 0  ok
+  @retval 1  error (not room for the new entries even after the page
+             was compacted; fatal error)
+*/
+
+static inline my_bool
+make_space_for_directory(MARIA_HA *info,
+                         uchar *buff, uint block_size, uint max_entry,
+                         uint count, uchar *first_dir, uint *empty_space,
+                         uint *first_pos)
+{
+  uint dir_bytes= DIR_ENTRY_SIZE * count;
+
+  /*
+    max_entry is 0 only in the case when UNDO is used to reinsert
+    a row on a previously not used page
+  */
+  if (likely(max_entry))
+  {
+    uint dir_start= (uint) (first_dir - buff);
+
+    /* Data on the page ends where the block of 'first_dir' ends */
+    *first_pos= uint2korr(first_dir) + uint2korr(first_dir + 2);
+
+    if (dir_start < *first_pos + dir_bytes)
+    {
+      /* New entries would overlap row data; compact to create place */
+      _ma_compact_block_page(buff, block_size, max_entry - 1, 0,
+                             info ? info->trn->min_read_from : 0,
+                             info ? info->s->base.min_block_length : 0);
+      *first_pos= (uint2korr(first_dir) + uint2korr(first_dir + 2));
+      *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+      if (*empty_space < dir_bytes)
+      {
+        /*
+          We should always have space, as we only come here for
+          UNDO of DELETE (in which case we know the row was on the
+          page before) or if the bitmap told us there was space on page
+        */
+        DBUG_ASSERT(0);
+        return(1);
+      }
+    }
+  }
+  else
+    *first_pos= PAGE_HEADER_SIZE;
+
+  /* The new directory entries are taken from the free space */
+  *empty_space-= dir_bytes;
+  buff[DIR_COUNT_OFFSET]= (uchar) (max_entry + count);
+  return(0);
+}
+
+
+/*
+  Find free position in directory
+
+  SYNOPSIS
+  find_free_position()
+    info                Handler if head page and 0 otherwise
+    buff                Page
+    block_size          Size of page
+    res_rownr           Store index to free position here
+    res_length          Store length of found segment here
+    empty_space         Store length of empty space on disk here. This is
+                        all empty space, including the found block.
+
+  NOTES
+    If there is a free directory entry (entry with position == 0),
+    then use it and change it to be the size of the empty block
+    after the previous entry. This guarantees that all row entries
+    are stored on disk in inverse directory order, which makes life easier for
+    '_ma_compact_block_page()' and to know if there is free space after any
+    block.
+
+    If there is no free entry (entry with position == 0), then we create
+    a new one. If there is not space for the directory entry (because
+    the last block overlaps with the directory), we compact the page.
+
+    The offset of the found dir entry is updated to the found position and
+    its stored length is set to 0; the length of the free region is returned
+    through res_length.
+
+    buff[EMPTY_SPACE_OFFSET] is NOT updated but left up to the caller
+
+    See start of file for description of how free directory entries are linked
+
+  RETURN
+    0      Error (directory full or last block goes over directory)
+    #      Pointer to directory entry on page
+*/
+
+static uchar *find_free_position(MARIA_HA *info,
+                                 uchar *buff, uint block_size, uint *res_rownr,
+                                 uint *res_length, uint *empty_space)
+{
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  uint free_entry= (uint) buff[DIR_FREE_OFFSET];
+  uint data_start, region_length;
+  uchar *entry, *first_dir;
+  DBUG_ENTER("find_free_position");
+
+  *empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+  DBUG_PRINT("info", ("max_entry: %u  free_entry: %u", max_entry, free_entry));
+
+  first_dir= dir_entry_pos(buff, block_size, max_entry - 1);
+
+  if (free_entry == END_OF_DIR_FREE_LIST)
+  {
+    /* No free places in dir; create a new one, if the directory allows it */
+    if (max_entry == MAX_ROWS_PER_PAGE)
+      DBUG_RETURN(0);
+
+    if (make_space_for_directory(info, buff, block_size, max_entry, 1,
+                                 first_dir, empty_space, &data_start))
+      DBUG_RETURN(0);
+
+    /* The new entry is placed directly before the old first entry */
+    entry= first_dir - DIR_ENTRY_SIZE;
+    region_length= (uint) (entry - buff - data_start);
+    DBUG_ASSERT(region_length <= *empty_space);
+    int2store(entry, data_start);
+    int2store(entry + 2, 0);                  /* Max length of region */
+    *res_rownr= max_entry;
+    *res_length= region_length;
+
+    check_directory(buff, block_size,
+                    info ? info->s->base.min_block_length : 0,
+                    *empty_space);
+    DBUG_RETURN(entry);
+  }
+
+  /* Reuse the first entry in the free list */
+  if (free_entry >= max_entry)
+    DBUG_RETURN(0);                           /* Consistency error */
+  entry= dir_entry_pos(buff, block_size, free_entry);
+  DBUG_ASSERT(uint2korr(entry) == 0 && entry[2] == END_OF_DIR_FREE_LIST);
+
+  /* Unlink the entry; the next free entry becomes the head of the list */
+  buff[DIR_FREE_OFFSET]= entry[3];
+  if (entry[3] != END_OF_DIR_FREE_LIST)
+  {
+    uchar *next_entry= dir_entry_pos(buff, block_size, (uint) entry[3]);
+    DBUG_ASSERT((uint) next_entry[2] == free_entry &&
+                uint2korr(next_entry) == 0);
+    next_entry[2]= END_OF_DIR_FREE_LIST;      /* Backlink */
+  }
+
+  /* The region is the hole between the neighbouring used entries */
+  data_start= end_of_previous_entry(entry, buff + block_size -
+                                    PAGE_SUFFIX_SIZE);
+  region_length= start_of_next_entry(entry) - data_start;
+  int2store(entry, data_start);               /* Update dir entry */
+  int2store(entry + 2, 0);
+  *res_rownr= free_entry;
+  *res_length= region_length;
+
+  check_directory(buff, block_size,
+                  info ? info->s->base.min_block_length : 0, (uint) -1);
+  DBUG_RETURN(entry);
+}
+
+
+/**
+   @brief Enlarge page directory to hold more entries
+
+   @fn extend_directory()
+   @param info          Handler if head page and 0 otherwise
+   @param buff          Page buffer
+   @param block_size    Block size
+   @param max_entry     Number of directory entries on page
+   @param new_entry     Position for new entry
+   @param empty_space   Total empty space in buffer. It's updated
+                        to reflect the new empty space
+
+   @note
+   This is only called on UNDO when we want to expand the directory
+   to be able to re-insert row in a given position
+
+   The new directory entry will be set to cover the maximum possible space
+
+   The directory grows by (new_entry - max_entry + 1) entries. The entries
+   from max_entry up to new_entry - 1 (if any) get offset 0 and are linked
+   in front of the page's existing free directory list.
+
+   @return
+   @retval 0  ok
+   @retval 1  error (No data on page, fatal error); this is the case when
+              make_space_for_directory() fails
+*/
+
+static my_bool extend_directory(MARIA_HA *info, uchar *buff, uint block_size,
+                                uint max_entry, uint new_entry,
+                                uint *empty_space)
+{
+  uint length, first_pos;
+  uchar *dir, *first_dir;
+  DBUG_ENTER("extend_directory");
+
+  /*
+    Note that if max_entry is 0, then first_dir will point to
+    an illegal directory entry. This is ok, as in this case we will
+    not access anything through first_dir.
+  */
+  first_dir= dir_entry_pos(buff, block_size, max_entry) + DIR_ENTRY_SIZE;
+
+  if (make_space_for_directory(info, buff, block_size, max_entry,
+                               new_entry - max_entry + 1,
+                               first_dir, empty_space, &first_pos))
+    DBUG_RETURN(1);
+
+  /*
+    Set the new directory entry to cover the max possible length:
+    everything from first_pos (as returned by make_space_for_directory())
+    up to the start of the new entry itself. That space is then no longer
+    counted as empty.
+  */
+  dir= first_dir - DIR_ENTRY_SIZE * (new_entry - max_entry + 1);
+  length= (uint) (dir - buff - first_pos);
+  int2store(dir, first_pos);
+  int2store(dir+2, length);
+  *empty_space-= length;
+
+  if (new_entry-- > max_entry)
+  {
+    /*
+      Link all row entries between new_entry and max_entry into free list.
+      After the post-decrement above, new_entry is the highest entry to
+      be freed; it becomes the new head of the free list. Each pass of
+      the loop steps 'dir' one entry forward in memory, marks that entry
+      as unused (offset 0) and links it to the entry handled before it
+      (dir[2]) and to the next lower entry number (dir[3]).
+    */
+    uint free_entry= (uint) buff[DIR_FREE_OFFSET];
+    uint prev_entry= END_OF_DIR_FREE_LIST;
+    buff[DIR_FREE_OFFSET]= new_entry;
+    do
+    {
+      dir+= DIR_ENTRY_SIZE;
+      dir[0]= dir[1]= 0;
+      dir[2]= (uchar) prev_entry;
+      dir[3]= (uchar) new_entry-1;
+      prev_entry= new_entry;
+    } while (new_entry-- > max_entry);
+    /* The last freed entry continues with the old head of the free list */
+    if ((dir[3]= free_entry) != END_OF_DIR_FREE_LIST)
+    {
+      /*
+        Relink next entry to point to newly freed entry.
+        The old head must point back to entry 'max_entry', which is the
+        last one linked in by the loop above.
+      */
+      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
+                  next_entry[2] == END_OF_DIR_FREE_LIST);
+      next_entry[2]= max_entry;
+    }
+  }
+
+  check_directory(buff, block_size,
+                  info ? min(info->s->base.min_block_length, length) : 0,
+                  *empty_space);
+  DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+  Updating records
+****************************************************************************/
+
+/*
+  Calculate length of all the different field parts
+
+  SYNOPSIS
+    calc_record_size()
+    info        Maria handler
+    record      Row to store
+    row         Store statistics about row here
+
+  NOTES
+    The statistics is used to find out how much space a row will need
+    and also where we can split a row when we need to split it into several
+    extents.
+
+    Only the columns after the first share->base.fixed_not_null_fields
+    columns are examined one by one; the fixed not null fields are accounted
+    for through share->base.fixed_not_null_fields_length.
+
+    The function fills in row->empty_bits, row->field_lengths,
+    row->null_field_lengths and row->blob_lengths together with the
+    summed up length members of 'row'.
+*/
+
+static void calc_record_size(MARIA_HA *info, const uchar *record,
+                             MARIA_ROW *row)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  uchar *length_pos;
+  uint *field_len= row->null_field_lengths;
+  ulong *blob_len= row->blob_lengths;
+  DBUG_ENTER("calc_record_size");
+
+  row->extents_count= 0;
+  row->blob_length= 0;
+  row->varchar_length= 0;
+  row->char_length= 0;
+  row->normal_length= 0;
+
+  /* Create empty bitmap and calculate length of each varlength/char field */
+  bzero(row->empty_bits, share->base.pack_bytes);
+  length_pos= row->field_lengths;
+  column= share->columndef + share->base.fixed_not_null_fields;
+  end_column= share->columndef + share->base.fields;
+
+  for (; column < end_column; column++, field_len++)
+  {
+    if (record[column->null_pos] & column->null_bit)
+    {
+      /* NULL value; takes no space */
+      if (column->type == FIELD_BLOB)
+        *blob_len++= 0;
+      else
+        *field_len= 0;
+      continue;
+    }
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+      DBUG_ASSERT(column->empty_bit == 0);
+      /* fall through */
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+      *field_len= column->length;
+      row->normal_length+= column->length;
+      break;
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      if (memcmp(record+ column->offset, maria_zero_string,
+                 column->length) != 0)
+      {
+        row->normal_length+= column->length;
+        *field_len= column->length;
+      }
+      else
+      {
+        /* All zero; only the empty bit is stored */
+        row->empty_bits[column->empty_pos] |= column->empty_bit;
+        *field_len= 0;
+      }
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      const uchar *start= record + column->offset;
+      const uchar *end= start + column->length;
+
+      /* Strip trailing spaces */
+      while (end > start && end[-1] == ' ')
+        end--;
+
+      if (start != end)
+      {
+        uint used_length= (uint) (end - start);
+        if (column->length > 255)
+        {
+          int2store(length_pos, used_length);
+          length_pos+= 2;
+        }
+        else
+          *length_pos++= (uchar) used_length;
+        row->char_length+= used_length;
+        *field_len= used_length;
+      }
+      else                                      /* If empty string */
+      {
+        row->empty_bits[column->empty_pos]|= column->empty_bit;
+        *field_len= 0;
+      }
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      const uchar *src= record + column->offset;
+      uint value_length, prefix_length;
+
+      /*
+        The length prefix is copied before we know if the value is empty;
+        the position is only advanced for non-empty values.
+      */
+      length_pos[0]= src[0];
+      /* 256 is correct as this includes the length uchar */
+      if (column->length > 256)
+      {
+        value_length= uint2korr(src);
+        length_pos[1]= src[1];
+        prefix_length= 2;
+      }
+      else
+      {
+        value_length= (uint) (uchar) *src;
+        prefix_length= 1;
+      }
+      *field_len= value_length;
+      if (value_length)
+      {
+        row->varchar_length+= value_length;
+        length_pos+= prefix_length;
+      }
+      else
+        row->empty_bits[column->empty_pos]|= column->empty_bit;
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      const uchar *src= record + column->offset;
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      ulong data_length= _ma_calc_blob_length(size_length, src);
+
+      *blob_len++= data_length;
+      if (data_length)
+      {
+        row->blob_length+= data_length;
+        memcpy(length_pos, src, size_length);
+        length_pos+= size_length;
+      }
+      else
+        row->empty_bits[column->empty_pos]|= column->empty_bit;
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+  row->field_lengths_length= (uint) (length_pos - row->field_lengths);
+  /*
+    - info->row_base_length is base information we must have on a page in first
+      extent:
+      - flag byte (1) + is_nulls_extended (0 | 1) + null_bytes + pack_bytes +
+        table_checksum (0 | 1)
+    - row->min_length is minimum amount of data we must store on
+      a page. bitmap code will ensure we get at least this much +
+      total number of extents and one extent information
+    - fixed_not_null_fields_length is length of fixed length fields that can't
+      be compacted
+    - head_length is the amount of data for the head page
+     (ie, all fields except blobs)
+  */
+  row->min_length=   (info->row_base_length +
+                      (share->base.max_field_lengths ?
+                       size_to_store_key_length(row->field_lengths_length) :
+                       0));
+  row->head_length= (row->min_length +
+                     share->base.fixed_not_null_fields_length +
+                     row->field_lengths_length +
+                     row->normal_length +
+                     row->char_length + row->varchar_length);
+  row->total_length= (row->head_length + row->blob_length);
+  /* A row never takes less than the minimum block length */
+  if (row->total_length < share->base.min_block_length)
+    row->total_length= share->base.min_block_length;
+  DBUG_PRINT("exit", ("head_length: %lu  total_length: %lu",
+                      (ulong) row->head_length, (ulong) row->total_length));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  Compact page by removing all space between rows
+
+  Moves up all rows to start of page. Moves blocks that are directly after
+  each other with one memmove.
+
+  If 'rownr' is not the last directory entry, the rows after 'rownr' are
+  instead moved towards the end of the page, so that all free space ends up
+  directly after the block of 'rownr'.
+
+  @note if rownr is the last row in the page, and extend_block is false,
+  caller has to make sure to update bitmap page afterwards to reflect freed
+  space.
+
+  @param  buff           Page to compact
+  @param  block_size     Size of page
+  @param  rownr          Put empty data after this row
+  @param  extend_block   If 1, extend the block at 'rownr' to cover the
+                         whole block.
+  @param  min_read_from  If <> 0, remove all trid's that are less than this
+  @param  min_row_length Minimum length of a row block. A row that gets
+                         shorter than this when its transid is removed is
+                         padded with zero bytes up to this length
+*/
+
+void _ma_compact_block_page(uchar *buff, uint block_size, uint rownr,
+                            my_bool extend_block, TrID min_read_from,
+                            uint min_row_length)
+{
+  uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  uint page_pos, next_free_pos, start_of_found_block, diff, end_of_found_block;
+  uint freed_size= 0;
+  uchar *dir, *end;
+  DBUG_ENTER("_ma_compact_block_page");
+  DBUG_PRINT("enter", ("rownr: %u  min_read_from: %lu", rownr,
+                       (ulong) min_read_from));
+  DBUG_ASSERT(max_entry > 0 &&
+              max_entry < (block_size - PAGE_HEADER_SIZE -
+                           PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE);
+
+  /*
+    Move all entries before and including rownr up to start of page.
+    The loop walks the directory from entry 0 ('end') to entry 'rownr'
+    ('dir'). 'page_pos' is where the next run of rows will be moved to,
+    'start_of_found_block' .. 'next_free_pos' is the current run of
+    adjacent rows that has not yet been moved and 'diff' is how far that
+    run will be moved.
+  */
+  dir= dir_entry_pos(buff, block_size, rownr);
+  end= dir_entry_pos(buff, block_size, 0);
+  page_pos= next_free_pos= start_of_found_block= PAGE_HEADER_SIZE;
+  diff= 0;
+  for (; dir <= end ; end-= DIR_ENTRY_SIZE)
+  {
+    uint offset= uint2korr(end);
+
+    if (offset)                                 /* Skip unused entries */
+    {
+      uint row_length= uint2korr(end + 2);
+      DBUG_ASSERT(offset >= page_pos);
+      DBUG_ASSERT(buff + offset + row_length <= dir);
+      DBUG_ASSERT(row_length >= min_row_length || row_length == 0);
+
+      /* Row length can be zero if row is to be deleted */
+      if (min_read_from && row_length && (buff[offset] & ROW_FLAG_TRANSID))
+      {
+        TrID transid= transid_korr(buff+offset+1);
+        if (transid < min_read_from)
+        {
+          /*
+            Remove transid from row by moving the start point of the row up.
+            The flag byte, with ROW_FLAG_TRANSID cleared, is stored at the
+            new start of the row.
+          */
+          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
+          offset+= TRANSID_SIZE;
+          freed_size+= TRANSID_SIZE;
+          row_length-= TRANSID_SIZE;
+          int2store(end+2, row_length);
+        }
+      }
+
+      if (offset != next_free_pos)
+      {
+        uint length= (next_free_pos - start_of_found_block);
+        /*
+          There was empty space before this and prev block
+          Check if we have to move previous block up to page start
+        */
+        if (page_pos != start_of_found_block)
+        {
+          /* move up previous block */
+          memmove(buff + page_pos, buff + start_of_found_block, length);
+        }
+        page_pos+= length;
+        /* next continuous block starts here */
+        start_of_found_block= offset;
+        diff= offset - page_pos;
+      }
+      int2store(end, offset - diff);            /* correct current pos */
+      next_free_pos= offset + row_length;
+
+      if (unlikely(row_length < min_row_length) && row_length)
+      {
+        /*
+          This can only happen in the case we compacted transid and
+          the row become 'too short'
+
+          Move the current row down to it's right place and extend it
+          with 0.
+
+          The pending run is flushed here, so a new run starts directly
+          after the padded row.
+        */
+        uint row_diff= min_row_length - row_length;
+        uint length= (next_free_pos - start_of_found_block);
+
+        DBUG_ASSERT(page_pos != start_of_found_block);
+        bmove(buff + page_pos, buff + start_of_found_block, length);
+        bzero(buff+ page_pos + length, row_diff);
+        page_pos+= min_row_length;
+        int2store(end+2, min_row_length);
+        freed_size-= row_diff;
+        next_free_pos= start_of_found_block= page_pos;
+        diff= 0;
+      }
+    }
+  }
+  if (page_pos != start_of_found_block)         /* Move the last run */
+  {
+    uint length= (next_free_pos - start_of_found_block);
+    memmove(buff + page_pos, buff + start_of_found_block, length);
+  }
+  start_of_found_block= uint2korr(dir);         /* New start of 'rownr' */
+
+  if (rownr != max_entry - 1)
+  {
+    /*
+      Move all entries after rownr to end of page.
+      This is the mirror of the loop above: the directory is walked from
+      the last entry towards 'rownr' and runs of adjacent rows are moved
+      towards the start of the directory.
+    */
+    uint rownr_length;
+
+    DBUG_ASSERT(extend_block);                  /* Should always be true */
+    next_free_pos= end_of_found_block= page_pos=
+      block_size - DIR_ENTRY_SIZE * max_entry - PAGE_SUFFIX_SIZE;
+    diff= 0;
+    /* End points to entry before 'rownr' */
+    for (dir= buff + end_of_found_block ; dir <= end ; dir+= DIR_ENTRY_SIZE)
+    {
+      uint offset= uint2korr(dir);
+      uint row_length;
+      uint row_end;
+      if (!offset)
+        continue;                               /* Unused entry */
+      row_length= uint2korr(dir + 2);
+      row_end= offset + row_length;
+      DBUG_ASSERT(offset >= start_of_found_block &&
+                  row_end <= next_free_pos && row_length >= min_row_length);
+
+      if (min_read_from && (buff[offset] & ROW_FLAG_TRANSID))
+      {
+        TrID transid= transid_korr(buff + offset+1);
+        if (transid < min_read_from)
+        {
+          /* Remove transid from row */
+          buff[offset + TRANSID_SIZE]= buff[offset] & ~ROW_FLAG_TRANSID;
+          offset+= TRANSID_SIZE;
+          row_length-= TRANSID_SIZE;
+          int2store(dir+2, row_length);
+        }
+        if (unlikely(row_length < min_row_length))
+        {
+          /*
+            This can only happen in the case we compacted transid and
+            the row become 'too short'
+          */
+          uint row_diff= min_row_length - row_length;
+          if (next_free_pos < row_end + row_diff)
+          {
+            /*
+              Not enough space for extending next block with enough
+              end 0's. Move current data down to get place for them
+            */
+            uint move_down= row_diff - (next_free_pos - row_end);
+            bmove(buff + offset - move_down, buff + offset, row_length);
+            offset-= move_down;
+          }
+          /*
+            Extend the next block with 0, which will be part of current
+            row when the blocks are joined together later
+          */
+          bzero(buff + next_free_pos - row_diff, row_diff);
+          next_free_pos-= row_diff;
+          int2store(dir+2, min_row_length);
+        }
+        row_end= offset + row_length;
+      }
+
+      if (row_end != next_free_pos)
+      {
+        uint length= (end_of_found_block - next_free_pos);
+        if (page_pos != end_of_found_block)
+        {
+          /* move next block down */
+          memmove(buff + page_pos - length, buff + next_free_pos, length);
+        }
+        page_pos-= length;
+        /* next continuous block starts here */
+        end_of_found_block= row_end;
+        diff= page_pos - row_end;
+      }
+      int2store(dir, offset + diff);            /* correct current pos */
+      next_free_pos= offset;
+    }
+    if (page_pos != end_of_found_block)         /* Move the last run */
+    {
+      uint length= (end_of_found_block - next_free_pos);
+      memmove(buff + page_pos - length, buff + next_free_pos, length);
+      next_free_pos= page_pos- length;
+    }
+
+    /*
+      Extend rownr block to cover hole.
+      The loop above ends with 'dir' pointing at the entry for 'rownr'.
+    */
+    rownr_length= next_free_pos - start_of_found_block;
+    int2store(dir+2, rownr_length);
+    DBUG_ASSERT(rownr_length >= min_row_length);
+  }
+  else
+  {
+    if (extend_block)
+    {
+      /*
+        Extend last block to cover whole page, that is everything up to
+        the start of its own directory entry.
+      */
+      uint length= ((uint) (dir - buff) - start_of_found_block);
+      int2store(dir+2, length);
+      DBUG_ASSERT(length >= min_row_length);
+    }
+    else
+    {
+      /* Add length gained from freed transaction id's to this page */
+      uint length= uint2korr(buff+ EMPTY_SPACE_OFFSET) + freed_size;
+      int2store(buff + EMPTY_SPACE_OFFSET, length);
+    }
+    buff[PAGE_TYPE_OFFSET]&= ~(uchar) PAGE_CAN_BE_COMPACTED;
+  }
+  check_directory(buff, block_size, min_row_length,
+                  extend_block ? 0 : (uint) -1);
+  DBUG_EXECUTE("directory", _ma_print_directory(DBUG_FILE, buff, block_size););
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Create an empty tail or head page
+
+  SYNOPSIS
+    make_empty_page()
+    info                Maria handler; the block size is taken from
+                        info->s->block_size
+    buff                Page buffer
+    page_type           Page type to store in the page header
+                        (HEAD_PAGE or TAIL_PAGE; for BLOB_PAGE the area
+                        after the page header is not zeroed)
+    create_dir_entry    TRUE if we should create a directory entry
+
+  NOTES
+    EMPTY_SPACE is not updated
+
+    The directory count is set to 1 if a directory entry is created and
+    to 0 otherwise. The free directory list is always set to be empty.
+*/
+
+static void make_empty_page(MARIA_HA *info, uchar *buff, uint page_type,
+                            my_bool create_dir_entry)
+{
+  uint block_size= info->s->block_size;
+  uchar *dir_entry;
+  DBUG_ENTER("make_empty_page");
+
+  bzero(buff, PAGE_HEADER_SIZE);
+
+#if !defined(DONT_ZERO_PAGE_BLOCKS) || defined(HAVE_valgrind)
+  /*
+    We zero the rest of the block to avoid getting old memory information
+    to disk and to allow the file to be compressed better if archived.
+    The code does not assume the block is zeroed.
+  */
+  if (page_type != BLOB_PAGE)
+    bzero(buff+ PAGE_HEADER_SIZE, block_size - PAGE_HEADER_SIZE);
+#endif
+
+  /* Fill in the page header */
+  buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
+  buff[DIR_COUNT_OFFSET]= (int) create_dir_entry;
+  buff[PAGE_TYPE_OFFSET]= (uchar) page_type;
+
+  if (create_dir_entry)
+  {
+    /* Create directory entry to point to start of page with size 0 */
+    dir_entry= buff + (block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+    int2store(dir_entry, PAGE_HEADER_SIZE);
+    int2store(dir_entry + 2, 0);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Read or initialize new head or tail page
+
+  SYNOPSIS
+    get_head_or_tail_page()
+    info                        Maria handler
+    block                       Block to read
+    buff                        Suggest this buffer to key cache
+    length                      Minimum space needed
+    page_type			HEAD_PAGE || TAIL_PAGE
+    res                         Store result position here
+
+  NOTES
+    We don't decrement buff[EMPTY_SPACE_OFFSET] with the allocated data
+    as we don't know how much data the caller will actually use.
+
+    res->empty_space is set to length of empty space
+
+  RETURN
+    0  ok     All slots in 'res' are updated
+    1  error  my_errno is set
+*/
+
+struct st_row_pos_info          /* Position found for a row part on a page */
+{
+  uchar *buff;                                  /* page buffer */
+  uchar *data;                                  /* Place for data */
+  uchar *dir;                                   /* Directory entry for row */
+  uint length;                                  /* Length for data */
+  uint rownr;                                   /* Offset in directory */
+  uint empty_space;                             /* Space left on page */
+};
+
+
+static my_bool get_head_or_tail_page(MARIA_HA *info,
+                                     MARIA_BITMAP_BLOCK *block,
+                                     uchar *buff, uint length, uint page_type,
+                                     enum pagecache_page_lock lock,
+                                     struct st_row_pos_info *res)
+{
+  uint block_size;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("get_head_or_tail_page");
+  DBUG_PRINT("enter", ("page_type: %u  length: %u", page_type, length));
+
+  block_size= share->block_size;
+  if (block->org_bitmap_value == 0)             /* Empty block */
+  {
+    /*
+      New page. It is built in the caller's buffer, with one directory
+      entry (row number 0) that gets all space between the page header
+      and the directory.
+    */
+    make_empty_page(info, buff, page_type, 1);
+    res->buff= buff;
+    res->empty_space= res->length= (block_size - PAGE_OVERHEAD_SIZE);
+    res->data= (buff + PAGE_HEADER_SIZE);
+    res->dir= res->data + res->length;
+    res->rownr= 0;
+    DBUG_ASSERT(length <= res->length);
+  }
+  else
+  {
+    uchar *dir;
+    /*
+      Read old page. The page link is always pushed to info->pinned_pages,
+      also when the read failed; 'changed' is then 0.
+    */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    res->buff= pagecache_read(share->pagecache, &info->dfile,
+                              block->page, 0, 0, share->page_type,
+                              lock, &page_link.link);
+    page_link.changed= res->buff != 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+    if (!page_link.changed)
+      goto crashed;                             /* Could not read page */
+
+    DBUG_ASSERT((uint) (res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+                page_type);
+    if (!(dir= find_free_position(page_type == HEAD_PAGE ? info : 0,
+                                  res->buff, block_size, &res->rownr,
+                                  &res->length, &res->empty_space)))
+      goto crashed;                             /* No usable dir entry */
+
+    if (res->length < length)                   /* Found region too small */
+    {
+      if (res->empty_space + res->length >= length)
+      {
+        _ma_compact_block_page(res->buff, block_size, res->rownr, 1,
+                               (page_type == HEAD_PAGE ?
+                                info->trn->min_read_from : 0),
+                               (page_type == HEAD_PAGE ?
+                                share->base.min_block_length :
+                                0));
+        /*
+          All empty space are now after current position.
+          The length is re-read from the directory entry, as it was
+          updated by the compaction.
+        */
+        dir= dir_entry_pos(res->buff, block_size, res->rownr);
+        res->length= res->empty_space= uint2korr(dir+2);
+      }
+      if (res->length < length)
+      {
+        DBUG_PRINT("error", ("length: %u  res->length: %u  empty_space: %u",
+                             length, res->length, res->empty_space));
+        goto crashed;                         /* Wrong bitmap information */
+      }
+    }
+    res->dir= dir;
+    res->data= res->buff + uint2korr(dir);
+  }
+  DBUG_RETURN(0);
+
+crashed:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Create room for a head or tail row on a given page at given position
+
+  @fn get_rowpos_in_head_or_tail_page()
+  @param info        Maria handler
+  @param block       Block to read
+  @param buff        Buffer in which a new page is built when the block is
+                     empty (block->org_bitmap_value == 0). Otherwise the
+                     value is not used; it is replaced by the buffer
+                     returned by pagecache_read()
+  @param length      Minimum space needed
+  @param page_type   HEAD_PAGE || TAIL_PAGE
+  @param lock        Lock type passed on to pagecache_read()
+  @param rownr       Rownr to use
+  @param res         Store result position here
+
+  @note
+    This is essentially the same as get_head_or_tail_page(), with the
+    difference that the caller specifies at what position the row should be
+    put. This is used when restoring a row to its original position as
+    part of UNDO DELETE or UNDO UPDATE.
+
+    When the page is read from the page cache, its page link is pushed onto
+    info->pinned_pages before the result of the read is checked, so the link
+    is registered also when the read failed.
+
+    If 'rownr' is not yet covered by the directory of the page, the
+    directory is extended before the row area is extended to 'length' bytes.
+
+  @return
+  @retval 0  ok     All slots in 'res' are updated
+  @retval 1  error  my_errno is set to HA_ERR_WRONG_IN_RECORD
+*/
+
+static my_bool get_rowpos_in_head_or_tail_page(MARIA_HA *info,
+                                               MARIA_BITMAP_BLOCK *block,
+                                               uchar *buff, uint length,
+                                               uint page_type,
+                                               enum pagecache_page_lock lock,
+                                               uint rownr,
+                                               struct st_row_pos_info *res)
+{
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  uchar *dir;
+  uint block_size= share->block_size;
+  uint max_entry, max_length, rec_offset;
+  DBUG_ENTER("get_rowpos_in_head_or_tail_page");
+
+  if (block->org_bitmap_value == 0)             /* Empty block */
+  {
+    /* New page; it is built in the buffer supplied by the caller */
+    make_empty_page(info, buff, page_type, 0);
+    res->empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
+  }
+  else
+  {
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    buff= pagecache_read(share->pagecache, &info->dfile,
+                         block->page, 0, 0, share->page_type,
+                         lock, &page_link.link);
+    page_link.changed= buff != 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+    if (!page_link.changed)                     /* Read error */
+      goto err;
+    DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+                (uchar) page_type);
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != (uchar) page_type)
+      goto err;
+    res->empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  }
+
+  max_entry= (uint) buff[DIR_COUNT_OFFSET];
+  if (max_entry <= rownr)
+  {
+    if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff, block_size,
+                         max_entry, rownr, &res->empty_space))
+      goto err;
+  }
+
+  /*
+    The following dir entry is unused in case of insert / update but
+    not in case of undo_update / undo_delete
+  */
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff, dir,
+                          rownr, block_size, length,
+                          &res->empty_space, &rec_offset, &max_length))
+    goto err;
+
+  res->buff= buff;
+  res->rownr= rownr;
+  res->dir= dir;
+  res->data= buff + rec_offset;
+  res->length= length;
+  DBUG_RETURN(0);
+
+err:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Write tail for head data or blob
+
+  SYNOPSIS
+    write_tail()
+    info                Maria handler
+    block               Block to tail page
+    row_part            Data to write to page
+    org_length          Length of data
+
+  NOTES
+    The space reserved on the page is org_length rounded up to
+    MIN_TAIL_SIZE, but only org_length bytes are copied from row_part.
+
+    If block->page_count is TAIL_PAGE_COUNT_MARKER, a position for the tail
+    is found with get_head_or_tail_page(); otherwise the tail is put at the
+    row position (block->page_count & ~TAIL_BIT) with
+    get_rowpos_in_head_or_tail_page().
+
+    block->page_count is updated to the directory offset for the tail
+    so that we can store the position in the row extent information
+
+    block->empty_space is set to 0 when enough_free_entries() reports that
+    the directory of the page has too few free entries.
+
+    On success the page is left pinned and registered in info->pinned_pages
+    with unlock mode PAGECACHE_LOCK_READ_UNLOCK.
+
+  RETURN
+    0  ok
+       block->page_count is set to point (dir entry + TAIL_BIT)
+
+    1  error; In this case my_errno is set to the error
+*/
+
+static my_bool write_tail(MARIA_HA *info,
+                          MARIA_BITMAP_BLOCK *block,
+                          uchar *row_part, uint org_length)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE page_link;
+  uint block_size= share->block_size, empty_space, length= org_length;
+  struct st_row_pos_info row_pos;
+  my_off_t position;
+  my_bool res, block_is_read;
+  DBUG_ENTER("write_tail");
+  DBUG_PRINT("enter", ("page: %lu  length: %u",
+                       (ulong) block->page, length));
+
+  info->keyread_buff_used= 1;
+  /*
+    Don't allocate smaller block than MIN_TAIL_SIZE (we want to give rows
+    some place to grow in the future)
+  */
+  if (length < MIN_TAIL_SIZE)
+    length= MIN_TAIL_SIZE;
+
+  if (block->page_count == TAIL_PAGE_COUNT_MARKER)
+  {
+    /*
+      Create new tail
+      page will be pinned & locked by get_head_or_tail_page
+    */
+    if (get_head_or_tail_page(info, block, info->keyread_buff, length,
+                              TAIL_PAGE, PAGECACHE_LOCK_WRITE,
+                              &row_pos))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    /* Write tail on predefined row position */
+    if (get_rowpos_in_head_or_tail_page(info, block, info->keyread_buff,
+                                        length, TAIL_PAGE,
+                                        PAGECACHE_LOCK_WRITE,
+                                        block->page_count & ~TAIL_BIT,
+                                        &row_pos))
+      DBUG_RETURN(1);
+  }
+  DBUG_PRINT("info", ("tailid: %lu (%lu:%u)",
+                      (ulong) ma_recordpos(block->page, row_pos.rownr),
+                      (ulong) block->page, row_pos.rownr));
+
+  block_is_read= block->org_bitmap_value != 0;
+
+  memcpy(row_pos.data, row_part, org_length);
+
+  if (share->now_transactional)
+  {
+    /* Log changes in tail block */
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+    LSN lsn;
+
+    /*
+      Log REDO changes of tail page
+      Note that we have to log length, not org_length, to be sure that
+      REDO, which doesn't use write_tail, also creates a block of at least
+      MIN_TAIL_SIZE
+     */
+    page_store(log_data + FILEID_STORE_SIZE, block->page);
+    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                 row_pos.rownr);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos.data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+    if (translog_write_record(&lsn,
+                              (block_is_read ? LOGREC_REDO_INSERT_ROW_TAIL :
+                               LOGREC_REDO_NEW_ROW_TAIL),
+                              info->trn, info,
+                              (translog_size_t) (sizeof(log_data) + length),
+                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                              log_data, NULL))
+      DBUG_RETURN(1);
+  }
+
+  int2store(row_pos.dir + 2, length);
+  empty_space= row_pos.empty_space - length;
+  int2store(row_pos.buff + EMPTY_SPACE_OFFSET, empty_space);
+  block->page_count= row_pos.rownr + TAIL_BIT;
+  /*
+    If there is less directory entries free than number of possible tails
+    we can write for a row, we mark the page full to ensure that we don't
+    during _ma_bitmap_find_place() allocate more entries on the tail page
+    than it can hold
+  */
+  block->empty_space= (enough_free_entries(row_pos.buff, share->block_size,
+                                           1 + share->base.blobs) ?
+                       empty_space : 0);
+  /* Keep BLOCKUSED_USE_ORG_BITMAP */
+  block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
+
+  if (block_is_read)
+  {
+    /*
+      Current page link is last element in pinned_pages.
+      Note that this pointer shadows the page_link struct declared at
+      function level; here we update the element stored in pinned_pages.
+    */
+    MARIA_PINNED_PAGE *page_link;
+    page_link= dynamic_element(&info->pinned_pages,
+                               info->pinned_pages.elements-1,
+                               MARIA_PINNED_PAGE*);
+    pagecache_unlock_by_link(share->pagecache, page_link->link,
+                             PAGECACHE_LOCK_WRITE_TO_READ,
+                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 1, FALSE);
+    DBUG_ASSERT(page_link->changed);
+    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
+    res= 0;
+  }
+  else
+  {
+    if (!(res= pagecache_write(share->pagecache,
+                               &info->dfile, block->page, 0,
+                               row_pos.buff,share->page_type,
+                               PAGECACHE_LOCK_READ,
+                               PAGECACHE_PIN,
+                               PAGECACHE_WRITE_DELAY, &page_link.link,
+                               LSN_IMPOSSIBLE)))
+    {
+      page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+      page_link.changed= 1;
+      push_dynamic(&info->pinned_pages, (void*) &page_link);
+    }
+
+    /* Increase data file size, if extended (done also if the write failed) */
+    position= (my_off_t) block->page * block_size;
+    if (share->state.state.data_file_length <= position)
+    {
+      /*
+        We are modifying a state member before writing the UNDO; this is a WAL
+        violation. But for data_file_length this is ok, as long as we change
+        data_file_length after writing any log record (FILE_ID/REDO/UNDO) (see
+        collect_tables()).
+      */
+      _ma_set_share_data_file_length(share, position + block_size);
+    }
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Write full pages
+
+  SYNOPSIS
+    write_full_pages()
+    info                Maria handler
+    lsn                 LSN for the undo record; stored at the start of
+                        every page written
+    block               Where to write data
+    data                Data to write
+    length              Length of data
+
+  NOTES
+    Logging of the changes to the full pages are done in the caller
+    write_block_record().
+
+    Every page is built in info->keyread_buff and marked as BLOB_PAGE.
+    When the pages of 'block' are used up we continue with the block that
+    follows it in memory. The number of blocks that may be used is taken
+    from the sub_blocks member of the first block; running out of blocks
+    while there is still data left is treated as a crashed file.
+
+    The part of the last page that is not filled with data is zeroed.
+
+  RETURN
+    0  ok
+    1  error on write, or not enough pages in 'block' for the data
+       (in the latter case my_errno is set to HA_ERR_WRONG_IN_RECORD)
+*/
+
+static my_bool write_full_pages(MARIA_HA *info,
+                                LSN lsn,
+                                MARIA_BITMAP_BLOCK *block,
+                                uchar *data, ulong length)
+{
+  pgcache_page_no_t page;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  uint data_size= FULL_PAGE_SIZE(block_size);
+  uchar *buff= info->keyread_buff;
+  uint page_count, sub_blocks;
+  my_off_t position, max_position;
+  DBUG_ENTER("write_full_pages");
+  DBUG_PRINT("enter", ("length: %lu  page: %lu  page_count: %lu",
+                       (ulong) length, (ulong) block->page,
+                       (ulong) block->page_count));
+  DBUG_ASSERT((block->page_count & TAIL_BIT) == 0);
+
+  info->keyread_buff_used= 1;
+  page=       block->page;
+  page_count= block->page_count;
+  sub_blocks= block->sub_blocks;
+
+  max_position= (my_off_t) (page + page_count) * block_size;
+
+  /*
+    Write the data one page at a time. max_position tracks the highest end
+    position of the ranges used; the data file size is increased after the
+    loop, if the file was extended.
+  */
+
+  for (; length; data+= data_size)
+  {
+    uint copy_length;
+    if (!page_count--)
+    {
+      if (!--sub_blocks)
+      {
+        DBUG_ASSERT(0);                         /* Wrong in bitmap or UNDO */
+        my_errno= HA_ERR_WRONG_IN_RECORD;       /* File crashed */
+        DBUG_RETURN(1);
+      }
+
+      block++;
+      page= block->page;
+      page_count= block->page_count - 1;
+      DBUG_PRINT("info", ("page: %lu  page_count: %lu",
+                          (ulong) block->page, (ulong) block->page_count));
+
+      position= (page + page_count + 1) * block_size;
+      set_if_bigger(max_position, position);
+    }
+    lsn_store(buff, lsn);
+    buff[PAGE_TYPE_OFFSET]= (uchar) BLOB_PAGE;
+    copy_length= min(data_size, length);
+    memcpy(buff + LSN_SIZE + PAGE_TYPE_SIZE, data, copy_length);
+    length-= copy_length;
+
+    /*
+      Zero out old information from the block. This removes possible
+      sensitive information from the block and also makes the file
+      easier to compress and easier to compare after recovery.
+    */
+    if (copy_length != data_size)
+      bzero(buff + block_size - PAGE_SUFFIX_SIZE - (data_size - copy_length),
+            (data_size - copy_length) + PAGE_SUFFIX_SIZE);
+
+    if (pagecache_write(share->pagecache,
+                        &info->dfile, page, 0,
+                        buff, share->page_type,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        PAGECACHE_PIN_LEFT_UNPINNED,
+                        PAGECACHE_WRITE_DELAY,
+                        0, info->trn->rec_lsn))
+      DBUG_RETURN(1);
+    page++;
+    DBUG_ASSERT(block->used & BLOCKUSED_USED);
+  }
+  if (share->state.state.data_file_length < max_position)
+    _ma_set_share_data_file_length(share, max_position);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Pack the page ranges used for a run of full pages into the compact
+  format used for logging
+
+  SYNOPSIS
+    store_page_range()
+    to          Buffer to store the packed entry in
+    block       First of the blocks the data is to be written to
+    block_size  Block size
+    length      Length of data to be written.
+                Normally this is full pages, except for the last
+                page that may be only partly filled.
+    tot_ranges  Incremented with the number of ranges stored
+
+  NOTES
+    The format of one entry is:
+
+     Number of ranges                    SUB_RANGE_SIZE
+     Unused bytes at end of last page    BLOCK_FILLER_SIZE
+     For each range
+       Page number                       PAGE_STORE_SIZE
+       Number of pages                   PAGERANGE_STORE_SIZE
+
+    At least one range is always stored, also when length is 0.
+
+  RETURN
+    #  Position in 'to' just after the stored entry
+*/
+
+static uchar *store_page_range(uchar *to, MARIA_BITMAP_BLOCK *block,
+                               uint block_size, ulong length,
+                               uint *tot_ranges)
+{
+  uint payload_size= FULL_PAGE_SIZE(block_size);
+  ulong remaining_pages= (length + payload_size - 1) / payload_size;
+  uchar *range_count_pos= to;
+  uint unused_bytes, range_count= 0;
+  DBUG_ENTER("store_page_range");
+
+  /* The number of ranges is only known at the end; leave room for it */
+  to+= SUB_RANGE_SIZE;
+
+  /* Number of bytes left unused on the last page */
+  unused_bytes= (uint) (remaining_pages * payload_size - length);
+  int2store(to, unused_bytes);
+  to+= BLOCK_FILLER_SIZE;
+
+  do
+  {
+    pgcache_page_no_t first_page= block->page;
+    uint pages_in_range= block->page_count;
+
+    /* The last range may need fewer pages than the block provides */
+    if (pages_in_range > remaining_pages)
+      pages_in_range= (uint) remaining_pages;
+
+    page_store(to, first_page);
+    pagerange_store(to + PAGE_STORE_SIZE, pages_in_range);
+    to+= PAGE_STORE_SIZE + PAGERANGE_STORE_SIZE;
+
+    block++;
+    range_count++;
+    remaining_pages-= pages_in_range;
+  } while (remaining_pages);
+
+  /* Fill in the slot we skipped at the start */
+  int2store(range_count_pos, range_count);
+  *tot_ranges+= range_count;
+
+  DBUG_RETURN(to);
+}
+
+
+/*
+  Store packed extent data
+
+  SYNOPSIS
+   store_extent_info()
+   to                       Store the first packed extent here
+   row_extents_second_part  Store the rest of the extents here
+   first_block              First block to store
+   count                    Number of blocks
+
+  NOTES
+    We don't have to store the position for the head block
+
+    Blocks without BLOCKUSED_USED (marker blocks) are skipped.
+
+    We have to set the START_EXTENT_BIT for every extent where the
+    blob will be stored on a page of its own. We need this in the
+    UNDO phase to generate MARIA_BITMAP_BLOCK's for undo-delete and
+    undo-update.
+
+    The part of row_extents_second_part (which is assumed to have room for
+    count - 1 extents) that is not filled in is zeroed.
+*/
+
+static void store_extent_info(uchar *to,
+                              uchar *row_extents_second_part,
+                              MARIA_BITMAP_BLOCK *first_block,
+                              uint count)
+{
+  uint i, second_part_length;
+  my_bool first_extent_stored= 0;
+  DBUG_ENTER("store_extent_info");
+  DBUG_PRINT("enter", ("count: %u", count));
+
+  for (i= 0; i < count; i++)
+  {
+    MARIA_BITMAP_BLOCK *block= first_block + i;
+    uint stored_count;
+
+    /* Only marker blocks are without BLOCKUSED_USED; nothing to store */
+    if (unlikely(!(block->used & BLOCKUSED_USED)))
+      continue;
+
+    stored_count= block->page_count;
+    DBUG_ASSERT(stored_count != 0);
+    page_store(to, block->page);
+
+    /* Remember that this extent is the first block for a blob */
+    if (block->sub_blocks)
+      stored_count|= START_EXTENT_BIT;
+
+    pagerange_store(to + PAGE_STORE_SIZE, stored_count);
+    DBUG_DUMP("extent", to, ROW_EXTENT_SIZE);
+
+    if (first_extent_stored)
+      to+= ROW_EXTENT_SIZE;
+    else
+    {
+      /* All extents after the first one go into the second part */
+      first_extent_stored= 1;
+      to= row_extents_second_part;
+    }
+  }
+
+  /*
+    In some unlikely cases we have allocated too many blocks. Clear the
+    extent slots that were not used.
+  */
+  second_part_length= (count - 1) * ROW_EXTENT_SIZE;
+  bzero(to, (size_t) (row_extents_second_part + second_part_length - to));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief
+   Convert extent info read from file to MARIA_BITMAP_BLOCKS suitable
+   for write_block_record
+
+   @param  info          Maria handler; info->bitmap_blocks is used as
+                         storage for the generated blocks
+   @param  blocks        Filled in: blocks->block points at the first
+                         generated block and blocks->count is the number of
+                         blocks (head block included)
+   @param  head_page     Page of the head block, stored as the first block
+   @param  extent_count  Number of extents in extent_info
+   @param  extent_info   Packed extents, ROW_EXTENT_SIZE bytes each
+
+   @note
+   In case of blobs, this function marks all the blob pages in the bitmap
+   as full pages. The bitmap bits for other pages will be marked
+   when write_block_record() calls _ma_bitmap_release_unused().
+
+   An extent with page count 0 ends the list; blocks->count is then
+   adjusted to only cover the blocks before it.
+
+   sub_blocks of the head block, and of every block created from an extent
+   with START_EXTENT_BIT set, is set to the number of blocks up to the next
+   such block (or to the end of the list). It is 0 for all other blocks.
+
+   This function will be removed in Maria 2.0 when we instead of delete rows
+   mark them as deleted and only remove them after commit.
+
+   @return
+   @retval 0  ok
+   @retval 1  Error (out of memory or disk error changing bitmap) or
+              wrong information in extent information
+*/
+
+static my_bool extent_to_bitmap_blocks(MARIA_HA *info,
+                                       MARIA_BITMAP_BLOCKS *blocks,
+                                       pgcache_page_no_t head_page,
+                                       uint extent_count,
+                                       const uchar *extent_info)
+{
+  MARIA_BITMAP_BLOCK *block, *start_block;
+  MARIA_SHARE *share= info->s;
+  uint i, tail_page;
+  DBUG_ENTER("extent_to_bitmap_blocks");
+
+  if (allocate_dynamic(&info->bitmap_blocks, extent_count + 2))
+    DBUG_RETURN(1);
+  block= blocks->block=  dynamic_element(&info->bitmap_blocks, 0,
+                                        MARIA_BITMAP_BLOCK*);
+  blocks->count= extent_count + 1;
+  blocks->tail_page_skipped= blocks->page_skipped= 0;
+  block->page= head_page;
+  block->page_count= 1;
+  block->used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+  /* Impossible value, will force storage of real value */
+  block->org_bitmap_value= 255;
+
+  start_block= block++;
+  for (i=0 ;
+       i++ < extent_count ;
+       block++, extent_info+= ROW_EXTENT_SIZE)
+  {
+    uint page_count= uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE);
+    if (page_count & START_EXTENT_BIT)
+    {
+      page_count&= ~START_EXTENT_BIT;
+      start_block->sub_blocks= (uint) (block - start_block);
+      start_block= block;
+    }
+    block->page= page_korr(extent_info);
+    block->page_count= page_count;
+    block->sub_blocks= 0;
+    if (block->page_count == 0)
+    {
+      /* Extent allocated but not used by write_block_record() */
+      DBUG_ASSERT(block->page == 0);
+      /*
+        This is the last block. As 'i' was incremented in the loop
+        condition, it equals the head block + the extents before this one.
+      */
+      blocks->count= i;
+      break;
+    }
+    if ((tail_page= page_count & TAIL_BIT))
+      page_count= 1;
+
+    /* Check if wrong data */
+    if (block->page == 0 || page_count == 0 ||
+        (block->page + page_count) * share->block_size >
+         share->state.state.data_file_length)
+    {
+      DBUG_PRINT("error", ("page: %lu  page_count: %u  tail: %u  length: %ld  data_length: %ld",
+                           (ulong) block->page,
+                           (block->page_count & ~TAIL_BIT),
+                           (uint) test(block->page_count & TAIL_BIT),
+                           (ulong) ((block->page + (page_count & ~TAIL_BIT)) *
+                                    share->block_size),
+                           (ulong) share->state.state.data_file_length));
+      DBUG_RETURN(1);
+    }
+    if (tail_page)
+    {
+      block->org_bitmap_value= _ma_bitmap_get_page_bits(info, &share->bitmap,
+                                                        block->page);
+      block->used= (BLOCKUSED_TAIL | BLOCKUSED_USED |
+                    BLOCKUSED_USE_ORG_BITMAP);
+    }
+    else
+    {
+      my_bool res;
+      pthread_mutex_lock(&share->bitmap.bitmap_lock);
+      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap,
+                                         block->page, page_count);
+      pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+      if (res)
+        DBUG_RETURN(1);
+      block->used= BLOCKUSED_USED;
+    }
+  }
+  start_block->sub_blocks= (uint) (block - start_block);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Free regions of pages with logging
+
+  SYNOPSIS
+    free_full_pages()
+    info                Maria handler
+    row                 Row whose extents (row->extents, with
+                        row->extents_count entries) should be freed
+
+  NOTES
+    We are removing filler events and tail page events from
+    row->extents to get smaller log.
+
+    The log record (LOGREC_REDO_FREE_BLOCKS) is only written if the table
+    is transactional and at least one full page extent exists. If there are
+    no full page extents in a transactional table we return at once,
+    without calling _ma_bitmap_free_full_pages().
+
+  RETURN
+    0   ok
+    1   error
+*/
+
+static my_bool free_full_pages(MARIA_HA *info, MARIA_ROW *row)
+{
+  uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE];
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  LSN lsn;
+  size_t extents_length;
+  uchar *extents= row->extents;
+  DBUG_ENTER("free_full_pages");
+
+  if (info->s->now_transactional)
+  {
+    /* Compact events by removing filler and tail events */
+    uchar *new_block= 0;
+    uchar *end, *to, *compact_extent_info;
+    my_bool res;
+    uint extents_count;
+
+    if (!(compact_extent_info= my_alloca(row->extents_count *
+                                         ROW_EXTENT_SIZE)))
+      DBUG_RETURN(1);
+
+    /*
+      Copy every run of consecutive full page extents to
+      compact_extent_info. new_block points at the start of the run that
+      has not yet been copied, or is 0 if there is no such run.
+    */
+    to= compact_extent_info;
+    for (end= extents + row->extents_count * ROW_EXTENT_SIZE ;
+         extents < end ;
+         extents+= ROW_EXTENT_SIZE)
+    {
+      uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+      page_count&= ~START_EXTENT_BIT;
+      if (! (page_count & TAIL_BIT) && page_count != 0)
+      {
+        /* Found correct extent */
+        if (!new_block)
+          new_block= extents;                   /* First extent in range */
+        continue;
+      }
+      /* Found extent to remove, copy everything found so far */
+      if (new_block)
+      {
+        size_t length= (size_t) (extents - new_block);
+        memcpy(to, new_block, length);
+        to+= length;
+        new_block= 0;
+      }
+    }
+    /* Copy the last run, if the list ended with full page extents */
+    if (new_block)
+    {
+      size_t length= (size_t) (extents - new_block);
+      memcpy(to, new_block, length);
+      to+= length;
+    }
+
+    /*
+      The branch hint covers the whole "nothing to log" test. The original
+      form, !unlikely(x), told the compiler that this rare branch was the
+      expected one.
+    */
+    if (unlikely(!(extents_length= (uint) (to - compact_extent_info))))
+    {
+      /*
+        No ranges. This happens in the rare case when we have allocated
+        a place for a blob on a tail page but it did fit into the main page.
+      */
+      my_afree(compact_extent_info);
+      DBUG_RETURN(0);
+    }
+    extents_count= (uint) (extents_length / ROW_EXTENT_SIZE);
+    pagerange_store(log_data + FILEID_STORE_SIZE, extents_count);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    compact_extent_info;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extents_length;
+    res= translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS, info->trn,
+                               info,
+                               (translog_size_t) (sizeof(log_data) +
+                                                  extents_length),
+                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                               log_data, NULL);
+    /* The log call is done with the buffer; free it before checking res */
+    my_afree(compact_extent_info);
+    if (res)
+      DBUG_RETURN(1);
+  }
+
+  /* The bitmap code gets the original, not compacted, extent list */
+  DBUG_RETURN(_ma_bitmap_free_full_pages(info, row->extents,
+                                         row->extents_count));
+}
+
+
+/*
+  Free one page range
+
+  SYNOPSIS
+    free_full_page_range()
+    info                Maria handler
+    page                First page in the range
+    count               Number of pages in the range
+
+  NOTES
+    This is very similar to free_full_pages()
+
+    The pages are deleted from the page cache, a LOGREC_REDO_FREE_BLOCKS
+    record holding the single range is written if the table is
+    transactional, and the bitmap bits for all 'count' pages are reset
+    under share->bitmap.bitmap_lock.
+
+    A failure in one step does not stop the following steps; the result
+    is 1 if any of them failed.
+
+    NOTE(review): if count is 0 and the range ends exactly at
+    data_file_length, delete_count-- wraps around. Presumably callers never
+    pass count == 0; confirm against the callers.
+
+  RETURN
+    0   ok
+    1   error
+*/
+
+static my_bool free_full_page_range(MARIA_HA *info, pgcache_page_no_t page,
+                                    uint count)
+{
+  my_bool res= 0;
+  uint delete_count;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("free_full_page_range");
+
+  delete_count= count;
+  if (share->state.state.data_file_length ==
+      (page + count) * share->block_size)
+  {
+    /*
+      Don't delete last page from pagecache as this will make the file
+      shorter than expected if the last operation extended the file
+    */
+    delete_count--;
+  }
+  if (delete_count &&
+      pagecache_delete_pages(share->pagecache, &info->dfile,
+                             page, delete_count, PAGECACHE_LOCK_WRITE, 0))
+    res= 1;
+
+  if (share->now_transactional)
+  {
+    LSN lsn;
+    /** @todo unify log_data's shape with delete_head_or_tail() */
+    uchar log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                   ROW_EXTENT_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    DBUG_ASSERT(info->trn->rec_lsn);
+    pagerange_store(log_data + FILEID_STORE_SIZE, 1);
+    page_store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+              page);
+    int2store(log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+              PAGE_STORE_SIZE, count);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    if (translog_write_record(&lsn, LOGREC_REDO_FREE_BLOCKS,
+                              info->trn, info,
+                              (translog_size_t) sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data, NULL))
+      res= 1;
+  }
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);
+  if (_ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, count))
+    res= 1;
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Write a record to a (set of) pages
+
+   @fn     write_block_record()
+   @param  info            Maria handler
+   @param  old_record      Original record in case of update; NULL in case of
+                           insert
+   @param  record          Record we should write
+   @param  row             Statistics about record (calculated by
+                           calc_record_size())
+   @param  map_blocks      On which pages the record should be stored
+   @param  row_pos         Position on head page where to put head part of
+                           record
+   @param  undo_lsn	   <> LSN_ERROR if we are executing an UNDO
+   @param  old_record_checksum Checksum of old_record: ignored if table does
+                               not have live checksum; otherwise if
+                               old_record==NULL it must be 0.
+
+   @note
+     On return all pinned pages are released.
+
+     [page_buff + EMPTY_SPACE_OFFSET] is set to
+     row_pos->empty_space - head_length
+
+   @return Operation status
+   @retval 0      OK
+   @retval 1      Error
+*/
+
+static my_bool write_block_record(MARIA_HA *info,
+                                  const uchar *old_record,
+                                  const uchar *record,
+                                  MARIA_ROW *row,
+                                  MARIA_BITMAP_BLOCKS *bitmap_blocks,
+                                  my_bool head_block_is_read,
+                                  struct st_row_pos_info *row_pos,
+                                  LSN undo_lsn,
+                                  ha_checksum old_record_checksum)
+{
+  uchar *data, *end_of_data, *tmp_data_used, *tmp_data;
+  uchar *row_extents_first_part, *row_extents_second_part;
+  uchar *field_length_data;
+  uchar *page_buff;
+  MARIA_BITMAP_BLOCK *block, *head_block;
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_PINNED_PAGE page_link;
+  uint block_size, flag, head_length;
+  ulong *blob_lengths;
+  my_bool row_extents_in_use, blob_full_pages_exists;
+  LSN lsn;
+  my_off_t position;
+  uint save_my_errno;
+  DBUG_ENTER("write_block_record");
+
+  LINT_INIT(row_extents_first_part);
+  LINT_INIT(row_extents_second_part);
+
+  head_block= bitmap_blocks->block;
+  block_size= share->block_size;
+
+  page_buff= row_pos->buff;
+  /* Position on head page where we should store the head part */
+  data= row_pos->data;
+  end_of_data= data + row_pos->length;
+
+  /* Write header */
+  flag= info->row_flag;
+  row_extents_in_use= 0;
+  if (unlikely(row->total_length > row_pos->length))
+  {
+    /* Need extent */
+    DBUG_ASSERT(bitmap_blocks->count > 1);
+    if (bitmap_blocks->count <= 1)
+      goto crashed;                             /* Wrong in bitmap */
+    flag|= ROW_FLAG_EXTENTS;
+    row_extents_in_use= 1;
+  }
+  /* For now we have only a minimum header */
+  *data++= (uchar) flag;
+  if (flag & ROW_FLAG_TRANSID)
+  {
+    transid_store(data, info->trn->trid);
+    data+= TRANSID_SIZE;
+  }
+
+  if (unlikely(flag & ROW_FLAG_NULLS_EXTENDED))
+    *data++= (uchar) (share->base.null_bytes -
+                      share->base.original_null_bytes);
+  if (row_extents_in_use)
+  {
+    /* Store first extent in header */
+    store_key_length_inc(data, bitmap_blocks->count - 1);
+    row_extents_first_part= data;
+    data+= ROW_EXTENT_SIZE;
+  }
+  if (share->base.max_field_lengths)
+    store_key_length_inc(data, row->field_lengths_length);
+  if (share->calc_checksum)
+  {
+    *(data++)= (uchar) (row->checksum); /* store least significant byte */
+    DBUG_ASSERT(!((old_record_checksum != 0) && (old_record == NULL)));
+  }
+  memcpy(data, record, share->base.null_bytes);
+  data+= share->base.null_bytes;
+  memcpy(data, row->empty_bits, share->base.pack_bytes);
+  data+= share->base.pack_bytes;
+
+  DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
+              (uint) (data - row_pos->data) == row->min_length);
+
+  /*
+    Allocate a buffer of rest of data (except blobs)
+
+    To avoid double copying of data, we copy as many columns that fits into
+    the page. The rest goes into info->packed_row.
+
+    Using an extra buffer, instead of doing continuous writes to different
+    pages, uses less code and we don't need to have to do a complex call
+    for every data segment we want to store.
+  */
+  if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+                       row->head_length))
+    DBUG_RETURN(1);
+
+  tmp_data_used= 0;                 /* Either 0 or last used uchar in 'data' */
+  tmp_data= data;
+
+  if (row_extents_in_use)
+  {
+    uint copy_length= (bitmap_blocks->count - 2) * ROW_EXTENT_SIZE;
+    if (!tmp_data_used && tmp_data + copy_length > end_of_data)
+    {
+      tmp_data_used= tmp_data;
+      tmp_data= info->rec_buff;
+    }
+    row_extents_second_part= tmp_data;
+    /*
+       We will copy the extents here when we have figured out the tail
+       positions.
+    */
+    tmp_data+= copy_length;
+  }
+
+  /* Copy fields that has fixed lengths (primary key etc) */
+  for (column= share->columndef,
+         end_column= column + share->base.fixed_not_null_fields;
+       column < end_column; column++)
+  {
+    if (!tmp_data_used && tmp_data + column->length > end_of_data)
+    {
+      tmp_data_used= tmp_data;
+      tmp_data= info->rec_buff;
+    }
+    memcpy(tmp_data, record + column->offset, column->length);
+    tmp_data+= column->length;
+  }
+
+  /* Copy length of data for variable length fields */
+  if (!tmp_data_used && tmp_data + row->field_lengths_length > end_of_data)
+  {
+    tmp_data_used= tmp_data;
+    tmp_data= info->rec_buff;
+  }
+  field_length_data= row->field_lengths;
+  memcpy(tmp_data, field_length_data, row->field_lengths_length);
+  tmp_data+= row->field_lengths_length;
+
+  DBUG_ASSERT(row_extents_in_use || undo_lsn != LSN_ERROR ||
+              (uint) (tmp_data - row_pos->data) == row->min_length +
+              share->base.fixed_not_null_fields_length +
+              row->field_lengths_length);
+
+  /* Copy variable length fields and fields with null/zero */
+  for (end_column= share->columndef + share->base.fields - share->base.blobs;
+       column < end_column ;
+       column++)
+  {
+    const uchar *field_pos;
+    ulong length;
+    if ((record[column->null_pos] & column->null_bit) ||
+        (row->empty_bits[column->empty_pos] & column->empty_bit))
+      continue;
+
+    field_pos= record + column->offset;
+    switch (column->type) {
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_SKIP_PRESPACE:
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      length= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+      /* Char that is space filled */
+      if (column->length <= 255)
+        length= (uint) (uchar) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+      break;
+    case FIELD_VARCHAR:
+      if (column->length <= 256)
+      {
+        length= (uint) (uchar) *field_length_data++;
+        field_pos++;                            /* Skip length uchar */
+      }
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+        field_pos+= 2;
+      }
+      DBUG_ASSERT(length <= column->length);
+      break;
+    default:                                    /* Wrong data */
+      DBUG_ASSERT(0);
+      length=0;
+      break;
+    }
+    if (!tmp_data_used && tmp_data + length > end_of_data)
+    {
+      /* Data didn't fit in page; Change to use tmp buffer */
+      tmp_data_used= tmp_data;
+      tmp_data= info->rec_buff;
+    }
+    memcpy((char*) tmp_data, field_pos, length);
+    tmp_data+= length;
+  }
+
+  block= head_block + head_block->sub_blocks;   /* Point to first blob data */
+
+  end_column= column + share->base.blobs;
+  blob_lengths= row->blob_lengths;
+  if (!tmp_data_used)
+  {
+    /* Still room on page; Copy as many blobs we can into this page */
+    data= tmp_data;
+    for (; column < end_column &&
+           *blob_lengths <= (ulong)(end_of_data - data);
+         column++, blob_lengths++)
+    {
+      uchar *tmp_pos;
+      uint length;
+      if (!*blob_lengths)                       /* Null or "" */
+        continue;
+      length= column->length - portable_sizeof_char_ptr;
+      memcpy_fixed((uchar*) &tmp_pos, record + column->offset + length,
+                   sizeof(char*));
+      memcpy(data, tmp_pos, *blob_lengths);
+      data+= *blob_lengths;
+      /*
+        The following is not true when we want to insert data into original
+        place. In this case we don't have any extra blocks allocated
+      */
+      if (likely(undo_lsn == LSN_ERROR))
+      {
+        /* Skip over tail page that was prepared for storing blob */
+        block++;
+        bitmap_blocks->tail_page_skipped= 1;
+      }
+    }
+    if (head_block->sub_blocks > 1)
+    {
+      /* We have allocated pages that where not used */
+      bitmap_blocks->page_skipped= 1;
+    }
+  }
+  else
+    data= tmp_data_used;                        /* Get last used on page */
+
+  /* Update page directory */
+  head_length= (uint) (data - row_pos->data);
+  DBUG_PRINT("info", ("Used head length on page: %u  header_length: %u",
+                      head_length,
+                      (uint) (flag & ROW_FLAG_TRANSID ? TRANSID_SIZE : 0)));
+  DBUG_ASSERT(data <= end_of_data);
+  if (head_length < share->base.min_block_length)
+  {
+    /* Extend row to be of size min_block_length */
+    uint diff_length= share->base.min_block_length - head_length;
+    bzero(data, diff_length);
+    data+= diff_length;
+    head_length= share->base.min_block_length;
+  }
+  /*
+    If this is a redo entry (ie, undo_lsn != LSN_ERROR) then we should have
+    written exactly head_length bytes (same as original record).
+  */
+  DBUG_ASSERT(undo_lsn == LSN_ERROR || head_length == row_pos->length);
+  int2store(row_pos->dir + 2, head_length);
+  /* update empty space at start of block */
+  row_pos->empty_space-= head_length;
+  int2store(page_buff + EMPTY_SPACE_OFFSET, row_pos->empty_space);
+  /* Mark in bitmaps how the current page was actually used */
+  head_block->empty_space= row_pos->empty_space;
+  if (page_buff[DIR_COUNT_OFFSET] == MAX_ROWS_PER_PAGE &&
+      page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST)
+    head_block->empty_space= 0;               /* Page is full */
+  head_block->used|= BLOCKUSED_USED;
+
+  check_directory(page_buff, share->block_size, share->base.min_block_length,
+                  (uint) -1);
+
+  /*
+     Now we have to write tail pages, as we need to store the position
+     to them in the row extent header.
+
+     We first write out all blob tails, to be able to store them in
+     the current page or 'tmp_data'.
+
+     Then we write the tail of the non-blob fields (The position to the
+     tail page is stored either in row header, the extents in the head
+     page or in the first full page of the non-blob data. It's never in
+     the tail page of the non-blob data)
+  */
+
+  blob_full_pages_exists= 0;
+  if (row_extents_in_use)
+  {
+    if (column != end_column)                   /* If blob fields */
+    {
+      MARIA_COLUMNDEF    *save_column=       column;
+      MARIA_BITMAP_BLOCK *save_block=        block;
+      MARIA_BITMAP_BLOCK *end_block;
+      ulong              *save_blob_lengths= blob_lengths;
+
+      for (; column < end_column; column++, blob_lengths++)
+      {
+        uchar *blob_pos;
+        if (!*blob_lengths)                     /* Null or "" */
+          continue;
+        if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+        {
+          uint length;
+          length= column->length - portable_sizeof_char_ptr;
+          memcpy_fixed((uchar *) &blob_pos, record + column->offset + length,
+                       sizeof(char*));
+          length= *blob_lengths % FULL_PAGE_SIZE(block_size);   /* tail size */
+          if (length != *blob_lengths)
+            blob_full_pages_exists= 1;
+          if (write_tail(info, block + block->sub_blocks-1,
+                         blob_pos + *blob_lengths - length,
+                         length))
+            goto disk_err;
+        }
+        else
+          blob_full_pages_exists= 1;
+
+        for (end_block= block + block->sub_blocks; block < end_block; block++)
+        {
+          /*
+            Set only a bit, to not cause bitmap code to believe a block is full
+            when there is still a lot of entries in it.
+          */
+          block->used|= BLOCKUSED_USED;
+        }
+      }
+      DBUG_ASSERT((undo_lsn == LSN_ERROR ||
+                   block == bitmap_blocks->block + bitmap_blocks->count));
+      column= save_column;
+      block= save_block;
+      blob_lengths= save_blob_lengths;
+    }
+
+    if (tmp_data_used)                          /* non blob data overflows */
+    {
+      MARIA_BITMAP_BLOCK *cur_block, *end_block, *last_head_block;
+      MARIA_BITMAP_BLOCK *head_tail_block= 0;
+      ulong length;
+      ulong data_length= (ulong) (tmp_data - info->rec_buff);
+
+#ifdef SANITY_CHECKS
+      DBUG_ASSERT(head_block->sub_blocks != 1);
+      if (head_block->sub_blocks == 1)
+        goto crashed;                           /* no reserved full or tails */
+#endif
+      /*
+        Find out where to write tail for non-blob fields.
+
+        Problem here is that the bitmap code may have allocated more
+        space than we need. We have to handle the following cases:
+
+        - Bitmap code allocated a tail page we don't need.
+        - The last full page allocated needs to be changed to a tail page
+        (Because we where able to put more data on the head page than
+        the bitmap allocation assumed)
+
+        The reserved pages in bitmap_blocks for the main page has one of
+        the following allocations:
+        - Full pages, with following blocks:
+          # * full pages
+          empty page  ; To be used if we change last full to tail page. This
+          has 'count' = 0.
+          tail page  (optional, if last full page was part full)
+        - One tail page
+      */
+
+      cur_block= head_block + 1;
+      end_block= head_block + head_block->sub_blocks;
+      /*
+        Loop until we have find a block bigger than we need or
+        we find the empty page block.
+      */
+      while (data_length >= (length= (cur_block->page_count *
+                                      FULL_PAGE_SIZE(block_size))) &&
+             cur_block->page_count)
+      {
+#ifdef SANITY_CHECKS
+        DBUG_ASSERT(!((cur_block == end_block) ||
+                      (cur_block->used & BLOCKUSED_USED)));
+        if ((cur_block == end_block) || (cur_block->used & BLOCKUSED_USED))
+          goto crashed;
+#endif
+        data_length-= length;
+        (cur_block++)->used|= BLOCKUSED_USED;
+      }
+      last_head_block= cur_block;
+      if (data_length)
+      {
+        if (cur_block->page_count == 0)
+        {
+          /* Skip empty filler block */
+          cur_block++;
+        }
+#ifdef SANITY_CHECKS
+        DBUG_ASSERT(!(cur_block >= end_block));
+        if ((cur_block >= end_block))
+          goto crashed;
+#endif
+        if (cur_block->used & BLOCKUSED_TAIL)
+        {
+          DBUG_ASSERT(data_length < MAX_TAIL_SIZE(block_size));
+          /* tail written to tail page */
+          cur_block->used|= BLOCKUSED_USED;
+          head_tail_block= cur_block;
+        }
+        else if (data_length > length - MAX_TAIL_SIZE(block_size))
+        {
+          /* tail written to full page */
+          cur_block->used|= BLOCKUSED_USED;
+          if ((cur_block != end_block - 1) &&
+              (end_block[-1].used & BLOCKUSED_TAIL))
+            bitmap_blocks->tail_page_skipped= 1;
+        }
+        else
+        {
+          /*
+            cur_block is a full block, followed by an empty and optional
+            tail block. Change cur_block to a tail block or split it
+            into full blocks and tail blocks.
+
+            TODO:
+             If there is enough space on the following tail block, use
+             this instead of creating a new tail block.
+          */
+          DBUG_ASSERT(cur_block[1].page_count == 0);
+          if (cur_block->page_count == 1)
+          {
+            /* convert full block to tail block */
+            cur_block->used|= BLOCKUSED_USED | BLOCKUSED_TAIL;
+            head_tail_block= cur_block;
+          }
+          else
+          {
+            DBUG_ASSERT(data_length < length - FULL_PAGE_SIZE(block_size));
+            DBUG_PRINT("info", ("Splitting blocks into full and tail"));
+            cur_block[1].page= (cur_block->page + cur_block->page_count - 1);
+            cur_block[1].page_count= 1;         /* Avoid DBUG_ASSERT */
+            cur_block[1].used= BLOCKUSED_USED | BLOCKUSED_TAIL;
+            cur_block->page_count--;
+            cur_block->used|= BLOCKUSED_USED;
+            last_head_block= head_tail_block= cur_block+1;
+          }
+          if (end_block[-1].used & BLOCKUSED_TAIL)
+            bitmap_blocks->tail_page_skipped= 1;
+        }
+      }
+      else
+      {
+        /* Must be an empty or tail page */
+        DBUG_ASSERT(cur_block->page_count == 0 ||
+                    cur_block->used & BLOCKUSED_TAIL);
+        if (end_block[-1].used & BLOCKUSED_TAIL)
+          bitmap_blocks->tail_page_skipped= 1;
+      }
+
+      /*
+        Write all extents into page or tmp_data
+
+        Note that we still don't have a correct position for the tail
+        of the non-blob fields.
+      */
+      store_extent_info(row_extents_first_part,
+                        row_extents_second_part,
+                        head_block+1, bitmap_blocks->count - 1);
+      if (head_tail_block)
+      {
+        ulong block_length= (ulong) (tmp_data - info->rec_buff);
+        uchar *extent_data;
+
+        length= (uint) (block_length % FULL_PAGE_SIZE(block_size));
+        if (write_tail(info, head_tail_block,
+                       info->rec_buff + block_length - length,
+                       length))
+          goto disk_err;
+        tmp_data-= length;                      /* Remove the tail */
+        if (tmp_data == info->rec_buff)
+        {
+          /* We have no full blocks to write for the head part */
+          tmp_data_used= 0;
+        }
+
+        /* Store the tail position for the non-blob fields */
+        if (head_tail_block == head_block + 1)
+        {
+          /*
+            We had a head block + tail block, which means that the
+            tail block is the first extent
+          */
+          extent_data= row_extents_first_part;
+        }
+        else
+        {
+          /*
+            We have a head block + some full blocks + tail block
+            last_head_block is pointing after the last used extent
+            for the head block.
+          */
+          extent_data= row_extents_second_part +
+            ((last_head_block - head_block) - 2) * ROW_EXTENT_SIZE;
+        }
+        DBUG_ASSERT(uint2korr(extent_data+5) & TAIL_BIT);
+        page_store(extent_data, head_tail_block->page);
+        int2store(extent_data + PAGE_STORE_SIZE, head_tail_block->page_count);
+      }
+    }
+    else
+      store_extent_info(row_extents_first_part,
+                        row_extents_second_part,
+                        head_block+1, bitmap_blocks->count - 1);
+  }
+
+  if (share->now_transactional)
+  {
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+
+    /* Log REDO changes of head page */
+    page_store(log_data + FILEID_STORE_SIZE, head_block->page);
+    dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                 row_pos->rownr);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    row_pos->data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= head_length;
+    if (translog_write_record(&lsn,
+                              head_block_is_read ?
+                              LOGREC_REDO_INSERT_ROW_HEAD :
+                              LOGREC_REDO_NEW_ROW_HEAD,
+                              info->trn,
+                              info,
+                              (translog_size_t) (sizeof(log_data) +
+                                                 head_length),
+                              TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                              log_data, NULL))
+      goto disk_err;
+  }
+
+#ifdef RECOVERY_EXTRA_DEBUG
+  if (info->trn->undo_lsn != LSN_IMPOSSIBLE)
+  {
+    /* Stop right after the REDO; testing incomplete log record groups */
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash",
+                    { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
+  }
+#endif
+
+  if (head_block_is_read)
+  {
+    MARIA_PINNED_PAGE *page_link;
+    /* Head page is always the first pinned page */
+    page_link= dynamic_element(&info->pinned_pages, 0,
+                               MARIA_PINNED_PAGE*);
+    pagecache_unlock_by_link(share->pagecache, page_link->link,
+                             PAGECACHE_LOCK_WRITE_TO_READ,
+                             PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 1, FALSE);
+    page_link->unlock= PAGECACHE_LOCK_READ_UNLOCK;
+    page_link->changed= 1;
+  }
+  else
+  {
+    if (pagecache_write(share->pagecache,
+                        &info->dfile, head_block->page, 0,
+                        page_buff, share->page_type,
+                        head_block_is_read ? PAGECACHE_LOCK_WRITE_TO_READ :
+                        PAGECACHE_LOCK_READ,
+                        head_block_is_read ? PAGECACHE_PIN_LEFT_PINNED :
+                        PAGECACHE_PIN,
+                        PAGECACHE_WRITE_DELAY, &page_link.link,
+                        LSN_IMPOSSIBLE))
+      goto disk_err;
+    page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+    /* Increase data file size, if extended */
+    position= (my_off_t) head_block->page * block_size;
+    if (share->state.state.data_file_length <= position)
+      _ma_set_share_data_file_length(share, position + block_size);
+  }
+
+  if (share->now_transactional && (tmp_data_used || blob_full_pages_exists))
+  {
+    /*
+      Log REDO writes for all full pages (head part and all blobs)
+      We write all here to be able to generate the UNDO record early
+      so that we can write the LSN for the UNDO record to all full pages.
+    */
+    uchar tmp_log_data[FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                       (ROW_EXTENT_SIZE + BLOCK_FILLER_SIZE + SUB_RANGE_SIZE) *
+                       ROW_EXTENTS_ON_STACK];
+    uchar *log_data, *log_pos;
+    LEX_CUSTRING tmp_log_array[TRANSLOG_INTERNAL_PARTS + 2 +
+                               ROW_EXTENTS_ON_STACK];
+    LEX_CUSTRING *log_array_pos, *log_array;
+    int error;
+    translog_size_t log_entry_length= 0;
+    uint ext_length, extents= 0, sub_extents= 0;
+
+    /* If few extents, then allocate things on stack to avoid a malloc call */
+    if (bitmap_blocks->count < ROW_EXTENTS_ON_STACK)
+    {
+      log_array= tmp_log_array;
+      log_data= tmp_log_data;
+    }
+    else
+    {
+      if (!my_multi_malloc(MY_WME, &log_array,
+                          (uint) ((bitmap_blocks->count +
+                                   TRANSLOG_INTERNAL_PARTS + 2) *
+                                  sizeof(*log_array)),
+                          &log_data, FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE +
+                          bitmap_blocks->count * (ROW_EXTENT_SIZE +
+                                                  BLOCK_FILLER_SIZE +
+                                                  SUB_RANGE_SIZE),
+                          NullS))
+        goto disk_err;
+    }
+    log_pos= log_data + FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE * 2;
+    log_array_pos= log_array+ TRANSLOG_INTERNAL_PARTS+1;
+
+    if (tmp_data_used)
+    {
+      /* Full head page */
+      translog_size_t block_length= (translog_size_t) (tmp_data -
+                                                       info->rec_buff);
+      log_pos= store_page_range(log_pos, head_block+1, block_size,
+                                (ulong) block_length, &extents);
+      log_array_pos->str= info->rec_buff;
+      log_array_pos->length= block_length;
+      log_entry_length+= block_length;
+      log_array_pos++;
+      sub_extents++;
+    }
+    if (blob_full_pages_exists)
+    {
+      MARIA_COLUMNDEF *tmp_column= column;
+      ulong *tmp_blob_lengths= blob_lengths;
+      MARIA_BITMAP_BLOCK *tmp_block= block;
+
+      /* Full blob pages */
+      for (; tmp_column < end_column; tmp_column++, tmp_blob_lengths++)
+      {
+        ulong blob_length;
+        uint length;
+
+        if (!*tmp_blob_lengths)                 /* Null or "" */
+          continue;
+        blob_length= *tmp_blob_lengths;
+        length= tmp_column->length - portable_sizeof_char_ptr;
+        /*
+          If last part of blog was on tail page, change blob_length to
+          reflect this
+        */
+        if (tmp_block[tmp_block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+          blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+        if (blob_length)
+        {
+          memcpy_fixed((uchar*) &log_array_pos->str,
+                       record + tmp_column->offset + length,
+                       sizeof(uchar*));
+          log_array_pos->length= blob_length;
+          log_entry_length+= blob_length;
+          log_array_pos++;
+          sub_extents++;
+
+          log_pos= store_page_range(log_pos, tmp_block, block_size,
+                                    blob_length, &extents);
+        }
+        tmp_block+= tmp_block->sub_blocks;
+      }
+    }
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    ext_length=  (uint) (log_pos - log_data);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= ext_length;
+    pagerange_store(log_data+ FILEID_STORE_SIZE, extents);
+    pagerange_store(log_data+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+                    sub_extents);
+
+    log_entry_length+= ext_length;
+    /* trn->rec_lsn is already set earlier in this function */
+    error= translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_BLOBS,
+                                 info->trn, info, log_entry_length,
+                                 (uint) (log_array_pos - log_array),
+                                 log_array, log_data, NULL);
+    if (log_array != tmp_log_array)
+      my_free(log_array, MYF(0));
+    if (error)
+      goto disk_err;
+  }
+
+  /* Write UNDO or CLR record */
+  lsn= LSN_IMPOSSIBLE;
+  if (share->now_transactional)
+  {
+    LEX_CUSTRING *log_array= info->log_row_parts;
+
+    if (undo_lsn != LSN_ERROR)
+    {
+      /*
+        Store if this CLR is about UNDO_DELETE or UNDO_UPDATE;
+        in the first case, Recovery, when it sees the CLR_END in the
+        REDO phase, may decrement the records' count.
+      */
+      if (_ma_write_clr(info, undo_lsn,
+                        old_record ? LOGREC_UNDO_ROW_UPDATE :
+                        LOGREC_UNDO_ROW_DELETE,
+                        share->calc_checksum != 0,
+                        row->checksum - old_record_checksum,
+                        &lsn, (void*) 0))
+        goto disk_err;
+    }
+    else
+    {
+      uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                     PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
+                     HA_CHECKSUM_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
+                     ROW_EXTENT_SIZE];
+      uchar *log_pos;
+      ha_checksum checksum_delta;
+
+      /* LOGREC_UNDO_ROW_INSERT & LOGREC_UNDO_ROW_UPDATE share same header */
+      lsn_store(log_data, info->trn->undo_lsn);
+      page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+                 head_block->page);
+      dirpos_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                   PAGE_STORE_SIZE,
+                   row_pos->rownr);
+      log_pos= (log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                PAGE_STORE_SIZE + DIRPOS_STORE_SIZE);
+      store_checksum_in_rec(share, checksum_delta,
+                            row->checksum - old_record_checksum,
+                            log_pos, log_pos);
+      compile_time_assert(sizeof(ha_checksum) == HA_CHECKSUM_STORE_SIZE);
+
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                             log_data);
+
+      if (!old_record)
+      {
+        /* Store undo_lsn in case we are aborting the insert */
+        row->orig_undo_lsn= info->trn->undo_lsn;
+        /* Write UNDO log record for the INSERT */
+        if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT,
+                                  info->trn, info,
+                                  (translog_size_t)
+                                  log_array[TRANSLOG_INTERNAL_PARTS +
+                                            0].length,
+                                  TRANSLOG_INTERNAL_PARTS + 1,
+                                  log_array,
+                                  log_data + LSN_STORE_SIZE, &checksum_delta))
+          goto disk_err;
+      }
+      else
+      {
+        /* Write UNDO log record for the UPDATE */
+        size_t row_length, extents_length;
+        uint row_parts_count, cur_head_length;
+
+        /*
+          Write head length and extents of the original row so that we
+          during UNDO can put it back in the original position.
+          We don't store size for TRANSID, as we don't write this during
+          UNDO.
+        */
+        cur_head_length= (info->cur_row.head_length -
+                          info->cur_row.header_length);
+        int2store(log_pos, cur_head_length);
+        pagerange_store(log_pos + 2, info->cur_row.extents_count);
+        log_pos+= 2 + PAGERANGE_STORE_SIZE;
+        log_array[TRANSLOG_INTERNAL_PARTS + 0].length+= (2 +
+                                                         PAGERANGE_STORE_SIZE);
+        info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
+          info->cur_row.extents;
+        info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
+          extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
+
+        row_length= fill_update_undo_parts(info, old_record, record,
+                                           log_array +
+                                           TRANSLOG_INTERNAL_PARTS + 2,
+                                           &row_parts_count);
+        if (translog_write_record(&lsn, LOGREC_UNDO_ROW_UPDATE, info->trn,
+                                  info,
+                                  (translog_size_t)
+                                  (log_array[TRANSLOG_INTERNAL_PARTS +
+                                             0].length + extents_length +
+                                   row_length),
+                                  TRANSLOG_INTERNAL_PARTS + 2 +
+                                  row_parts_count,
+                                  log_array,
+                                  log_data + LSN_STORE_SIZE,
+                                  &checksum_delta))
+          goto disk_err;
+      }
+    }
+  }
+  /* Release not used space in used pages */
+  if (_ma_bitmap_release_unused(info, bitmap_blocks))
+    goto disk_err;
+  _ma_unpin_all_pages(info, lsn);
+
+  if (tmp_data_used)
+  {
+    /*
+      Write data stored in info->rec_buff to pages
+      This is the char/varchar data that didn't fit into the head page.
+    */
+    DBUG_ASSERT(bitmap_blocks->count != 0);
+    if (write_full_pages(info, lsn, head_block + 1,
+                         info->rec_buff, (ulong) (tmp_data - info->rec_buff)))
+      goto disk_err;
+  }
+
+  /* Write rest of blobs (data, but no tails as they are already written) */
+  for (; column < end_column; column++, blob_lengths++)
+  {
+    uchar *blob_pos;
+    uint length;
+    ulong blob_length;
+    if (!*blob_lengths)                         /* Null or "" */
+      continue;
+    length= column->length - portable_sizeof_char_ptr;
+    memcpy_fixed((uchar*) &blob_pos, record + column->offset + length,
+                 sizeof(char*));
+    /* remove tail part */
+    blob_length= *blob_lengths;
+    if (block[block->sub_blocks - 1].used & BLOCKUSED_TAIL)
+      blob_length-= (blob_length % FULL_PAGE_SIZE(block_size));
+
+    if (blob_length && write_full_pages(info, lsn, block,
+                                         blob_pos, blob_length))
+      goto disk_err;
+    block+= block->sub_blocks;
+  }
+
+  _ma_finalize_row(info);
+  DBUG_RETURN(0);
+
+crashed:
+  /* Something was wrong with data on page */
+  my_errno= HA_ERR_WRONG_IN_RECORD;
+
+disk_err:
+  /**
+     @todo RECOVERY we are going to let dirty pages go to disk while we have
+     logged UNDO, this violates WAL. We must mark the table corrupted!
+
+     @todo RECOVERY we have written some REDOs without a closing UNDO,
+     it's possible that a next operation by this transaction succeeds and then
+     Recovery would glue the "orphan REDOs" to the succeeded operation and
+     execute the failed REDOs. We need some mark "abort this group" in the
+     log, or mark the table corrupted (then user will repair it and thus REDOs
+     will be skipped).
+
+     @todo RECOVERY to not let write errors go unnoticed, pagecache_write()
+     should take a MARIA_HA* in argument, and it it
+     fails when flushing a page to disk it should call
+     (*the_maria_ha->write_error_func)(the_maria_ha)
+     and this hook will mark the table corrupted.
+     Maybe hook should be stored in the pagecache's block structure, or in a
+     hash "file->maria_ha*".
+
+     @todo RECOVERY we should distinguish below between log write error and
+     table write error. The former should stop Maria immediately, the latter
+     should mark the table corrupted.
+  */
+  /*
+    Unpin all pinned pages to not cause problems for disk cache. This is
+    safe to call even if we already called _ma_unpin_all_pages() above.
+  */
+  save_my_errno= my_errno;
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  my_errno= save_my_errno;
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Find space for a row and write it there
+
+  @fn    allocate_and_write_block_record()
+  @param info                Maria handler
+  @param record              Record to write
+  @param row                 Information about fields in 'record'.
+                             row->insert_blocks is handed to
+                             _ma_bitmap_find_place(); row->lastpos is set
+                             here, and row->checksum is set here when the
+                             table has a calc_checksum function and
+                             undo_lsn == LSN_ERROR
+  @param undo_lsn            <> LSN_ERROR if we are executing an UNDO
+
+  @note
+    Starts by calling _ma_bitmap_flushable(info, 1).  If any later step
+    fails, this is reverted with _ma_bitmap_flushable(info, -1) (only if
+    info->non_flushable_state is still set), all pinned pages are unpinned
+    and the row is finalized.  my_errno is preserved over that cleanup.
+
+  @return
+  @retval 0	ok
+  @retval 1	Error
+*/
+
+static my_bool allocate_and_write_block_record(MARIA_HA *info,
+                                               const uchar *record,
+                                               MARIA_ROW *row,
+                                               LSN undo_lsn)
+{
+  struct st_row_pos_info row_pos;
+  MARIA_BITMAP_BLOCKS *blocks= &row->insert_blocks;
+  int save_my_errno;
+  DBUG_ENTER("allocate_and_write_block_record");
+
+  _ma_bitmap_flushable(info, 1);      /* Reverted with -1 at 'err' */
+  if (_ma_bitmap_find_place(info, row, blocks))
+    goto err;                         /* Error reading bitmap */
+
+  /*
+    Sleep; a checkpoint will happen and should not send this over-allocated
+    bitmap to disk but rather wait.
+    (Only active when the "maria_over_alloc_bitmap" debug keyword is set.)
+  */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(10););
+
+  /*
+    page will be pinned & locked by get_head_or_tail_page.
+    blocks->block is used as the head page and row_pos receives the
+    position for the row on it; the rowid stored in row->lastpos is then
+    built from the head page number and row_pos.rownr.
+  */
+  if (get_head_or_tail_page(info, blocks->block, info->buff,
+                            row->space_on_head_page, HEAD_PAGE,
+                            PAGECACHE_LOCK_WRITE, &row_pos))
+    goto err;
+  row->lastpos= ma_recordpos(blocks->block->page, row_pos.rownr);
+  if (info->s->calc_checksum)         /* Table has a checksum function */
+  {
+    if (undo_lsn == LSN_ERROR)        /* Not an UNDO; compute checksum */
+      row->checksum= (info->s->calc_checksum)(info, record);
+    else
+    {
+      /* _ma_apply_undo_row_delete() already set row's checksum. Verify it. */
+      DBUG_ASSERT(row->checksum == (info->s->calc_checksum)(info, record));
+    }
+  }
+  DBUG_PRINT("info", ("rowid: %lu (%lu:%u) length: %u", (ulong) row->lastpos,
+                      (ulong) ma_recordpos_to_page(row->lastpos),
+                      ma_recordpos_to_dir_entry(row->lastpos),
+                      row_pos.length));
+  if (write_block_record(info, (uchar*) 0, record, row,
+                         blocks, blocks->block->org_bitmap_value != 0,
+                         &row_pos, undo_lsn, 0))
+    goto err;
+  /* Now let checkpoint happen but don't commit */
+  DBUG_EXECUTE_IF("maria_over_alloc_bitmap", sleep(1000););
+  DBUG_RETURN(0);
+
+err:                                  /* Common exit for all failures */
+  save_my_errno= my_errno;            /* Keep my_errno over the cleanup */
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  my_errno= save_my_errno;
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Write a record and return rowid for it
+
+  SYNOPSIS
+    _ma_write_init_block_record()
+    info                Maria handler
+    record              Record to write
+
+  NOTES
+    This is done BEFORE we write the keys to the row!
+
+    info->cur_row is used as the row description: it is first passed to
+    calc_record_size() and then to allocate_and_write_block_record(),
+    which is called with undo_lsn == LSN_ERROR.
+
+  RETURN
+    HA_OFFSET_ERROR     Something went wrong
+    #                   Rowid for row (info->cur_row.lastpos)
+*/
+
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+                                             const uchar *record)
+{
+  DBUG_ENTER("_ma_write_init_block_record");
+
+  calc_record_size(info, record, &info->cur_row);
+  if (allocate_and_write_block_record(info, record,
+                                      &info->cur_row, LSN_ERROR))
+    DBUG_RETURN(HA_OFFSET_ERROR);               /* Allocation or write failed */
+  DBUG_RETURN(info->cur_row.lastpos);
+}
+
+
+/*
+  Dummy function for (*info->s->write_record)()
+
+  Nothing to do here, as we already wrote the record in
+  _ma_write_init_block_record()
+
+  Both arguments are unused.
+
+  RETURN
+    0                   Always
+*/
+
+my_bool _ma_write_block_record(MARIA_HA *info __attribute__ ((unused)),
+                               const uchar *record __attribute__ ((unused)))
+{
+  return 0;                                     /* Row already written */
+}
+
+
+/**
+   @brief Remove row written by _ma_write_init_block_record() and log undo
+
+   @param  info            Maria handler
+
+   @details
+     The bitmap is locked while the row is removed.  The head part is
+     deleted from the page and directory entry given by
+     info->cur_row.lastpos.  After that all blocks except the first one
+     in info->cur_row.insert_blocks that are marked BLOCKUSED_USED are
+     released: tails with delete_head_or_tail() and full pages with
+     free_full_page_range().  If share->now_transactional is set, a CLR
+     for LOGREC_UNDO_ROW_INSERT is written with _ma_write_clr().
+     A failure in one step doesn't stop the following steps; it only
+     makes the function return 1.
+
+   @note
+     This is called in case we got a duplicate unique key while
+     writing keys.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+my_bool _ma_write_abort_block_record(MARIA_HA *info)
+{
+  my_bool res= 0;
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  MARIA_BITMAP_BLOCK *block, *end;
+  LSN lsn= LSN_IMPOSSIBLE;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_write_abort_block_record");
+
+  _ma_bitmap_lock(share);  /* Lock bitmap from other insert threads */
+  if (delete_head_or_tail(info,
+                          ma_recordpos_to_page(info->cur_row.lastpos),
+                          ma_recordpos_to_dir_entry(info->cur_row.lastpos), 1,
+                          0))
+    res= 1;
+  for (block= blocks->block + 1, end= block + blocks->count - 1; block < end;
+       block++)
+  {
+    if (block->used & BLOCKUSED_USED)
+    {
+      if (block->used & BLOCKUSED_TAIL)
+      {
+        /*
+          block->page_count is set to the tail directory entry number in
+          write_block_record()
+        */
+        if (delete_head_or_tail(info, block->page,
+                                block->page_count & ~TAIL_BIT,
+                                0, 0))
+          res= 1;
+      }
+      else
+      {
+        if (free_full_page_range(info, block->page, block->page_count))
+          res= 1;
+      }
+    }
+  }
+  if (share->now_transactional)
+  {
+    if (_ma_write_clr(info, info->cur_row.orig_undo_lsn,
+                      LOGREC_UNDO_ROW_INSERT,
+                      share->calc_checksum != 0,
+                      (ha_checksum) 0 - info->cur_row.checksum,
+                      &lsn, (void*) 0))
+      res= 1;
+  }
+  _ma_bitmap_unlock(share);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Update a record
+
+  SYNOPSIS
+    _ma_update_block_record2()
+    info                Maria handler
+    record_pos          Rowid of the row to update
+    oldrec              Old record; passed on to write_block_record()
+    record              New record
+    undo_lsn            Passed on to write_block_record()
+
+  DESCRIPTION
+    The head page of the row is read with a write lock and registered in
+    info->pinned_pages.
+
+    If the new row fits into the empty space on the head page plus the
+    space used by the old head part, the old entry is extended on the
+    page, the old tails and full pages are freed and the new row is
+    written with the head page as the only bitmap block.
+
+    Otherwise the old tails and full pages are freed, new blocks are
+    allocated with _ma_bitmap_find_new_place() and the new row is written
+    with its head in the original directory entry.
+
+  NOTES
+    For the moment, we assume that info->cur_row.extents is always updated
+    when a row is read. In the future we may decide to read this on demand
+    for rows split into many extents.
+
+    The checksum of the new row is expected in info->cur_row.checksum and
+    the checksum of the old row in info->new_row.checksum.
+
+  RETURN
+    0  ok
+    1  error.  Pages are unpinned with LSN_IMPOSSIBLE before returning.
+       (In the same-page case the result of write_block_record() is
+       returned as is.)
+*/
+
+static my_bool _ma_update_block_record2(MARIA_HA *info,
+                                        MARIA_RECORD_POS record_pos,
+                                        const uchar *oldrec,
+                                        const uchar *record,
+                                        LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks= &info->cur_row.insert_blocks;
+  uchar *buff;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  uint rownr, org_empty_size, head_length;
+  uint block_size= info->s->block_size;
+  uint errpos= 0;
+  uchar *dir;
+  pgcache_page_no_t page;
+  struct st_row_pos_info row_pos;
+  my_bool res;
+  ha_checksum old_checksum;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_update_block_record2");
+  DBUG_PRINT("enter", ("rowid: %lu", (long) record_pos));
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /*
+    Checksums of new and old rows were computed by callers already; new
+    row's was put into cur_row, old row's was put into new_row.
+  */
+  old_checksum= new_row->checksum;
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+  page= ma_recordpos_to_page(record_pos);
+
+  _ma_bitmap_flushable(info, 1);
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  rownr= ma_recordpos_to_dir_entry(record_pos);
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  /*
+    We can't use cur_row->head_length as the block may have been compacted
+    since we read it.
+  */
+  head_length= uint2korr(dir + 2);
+
+  if ((org_empty_size + head_length) >= new_row->total_length)
+  {
+    uint rec_offset, length;
+    MARIA_BITMAP_BLOCK block;
+
+    DBUG_PRINT("info", ("org_empty_size: %u  org_length: %u  new_length: %lu",
+                        org_empty_size, head_length,
+                        new_row->total_length));
+
+    /*
+      We can fit the new row in the same page as the original head part
+      of the row
+    */
+    block.org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                          org_empty_size);
+    if (extend_area_on_page(info, buff, dir, rownr, block_size,
+                            new_row->total_length, &org_empty_size,
+                            &rec_offset, &length))
+    {
+      errpos= 1;
+      goto err;
+    }
+
+    row_pos.buff= buff;
+    row_pos.rownr= rownr;
+    row_pos.empty_space= org_empty_size;
+    row_pos.dir= dir;
+    row_pos.data= buff + rec_offset;
+    row_pos.length= length;
+    blocks->block= &block;
+    blocks->count= 1;
+    block.page= page;
+    block.sub_blocks= 1;
+    block.used= BLOCKUSED_USED | BLOCKUSED_USE_ORG_BITMAP;
+    block.empty_space= row_pos.empty_space;
+
+    if (*cur_row->tail_positions &&
+        delete_tails(info, cur_row->tail_positions))
+    {
+      errpos= 2;
+      goto err;
+    }
+    if (cur_row->extents_count && free_full_pages(info, cur_row))
+    {
+      errpos= 3;
+      goto err;
+    }
+    res= write_block_record(info, oldrec, record, new_row, blocks,
+                            1, &row_pos, undo_lsn, old_checksum);
+    /* We can't update or delete this without re-reading it again */
+    info->update&= ~HA_STATE_AKTIV;
+    DBUG_RETURN(res);
+  }
+  /* Delete old row */
+  if (*cur_row->tail_positions &&
+      delete_tails(info, cur_row->tail_positions))
+  {
+    errpos= 4;
+    goto err;
+  }
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+  {
+    errpos= 5;
+    goto err;
+  }
+
+  head_length= uint2korr(dir + 2);
+  if (_ma_bitmap_find_new_place(info, new_row, page, head_length +
+                                org_empty_size, blocks))
+  {
+    errpos= 6;
+    goto err;
+  }
+
+  /*
+    Allocate all size in block for record
+    TODO:
+    Need to improve this to do compact if we can fit one more blob into
+    the head page
+  */
+  if ((head_length < new_row->space_on_head_page ||
+       (new_row->total_length <= head_length &&
+        org_empty_size + head_length >= new_row->total_length)))
+  {
+    _ma_compact_block_page(buff, block_size, rownr, 1,
+                           info->trn->min_read_from,
+                           share->base.min_block_length);
+    org_empty_size= 0;
+    head_length= uint2korr(dir + 2);
+  }
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= org_empty_size + head_length;
+  row_pos.dir= dir;
+  row_pos.data= buff + uint2korr(dir);
+  row_pos.length= head_length;
+  if ((res= write_block_record(info, oldrec, record, new_row, blocks, 1,
+                               &row_pos, undo_lsn, old_checksum)))
+  {
+    errpos= 7;
+    goto err;
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("error", ("errpos: %d", errpos));
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Store new row on its original position
+
+  @param  info                 Maria handler
+  @param  page                 Page where the head of the row is stored
+  @param  rownr                Directory entry of the row on 'page'
+  @param  length_on_head_page  Length to use for the row on the head page
+  @param  extent_count         Passed together with extent_info to
+                               extent_to_bitmap_blocks()
+  @param  extent_info          Extent information that is converted to
+                               bitmap blocks for write_block_record()
+  @param  oldrec               Old record
+  @param  record               New record
+  @param  undo_lsn             Passed on to write_block_record()
+
+  @note
+  This is basically a copy of _ma_update_block_record2
+  When we have a purge thread for deleted row, we can remove this function
+  and use _ma_update_block_record2 instead.
+
+  This is the main reason we don't make a lot of subfunctions that are
+  common between _ma_update_block_record2() and this function.
+
+  Note: If something goes wrong we mark the file crashed
+
+  The checksum of the new row is expected in info->cur_row.checksum and
+  the checksum of the old row in info->new_row.checksum.
+
+  If the empty space on the page plus info->cur_row.head_length is less
+  than length_on_head_page, my_errno is set to HA_ERR_WRONG_IN_RECORD.
+
+  @return
+  @retval 0  ok
+  @retval 1  error; the file is marked as crashed and all pages are
+             unpinned with LSN_IMPOSSIBLE
+*/
+
+static my_bool _ma_update_at_original_place(MARIA_HA *info,
+                                            pgcache_page_no_t page,
+                                            uint rownr,
+                                            uint length_on_head_page,
+                                            uint extent_count,
+                                            const uchar *extent_info,
+                                            const uchar *oldrec,
+                                            const uchar *record,
+                                            LSN undo_lsn)
+{
+  MARIA_BITMAP_BLOCKS *blocks;
+  MARIA_BITMAP_BLOCK *block;
+  MARIA_ROW *cur_row= &info->cur_row, *new_row= &info->new_row;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  ha_checksum old_checksum;
+  uint org_empty_size, empty_size;
+  uint block_size= info->s->block_size;
+  uchar *dir, *buff;
+  struct st_row_pos_info row_pos;
+  my_bool res;
+  uint rec_offset, length;
+  DBUG_ENTER("_ma_update_at_original_place");
+
+#ifdef ENABLE_IF_PROBLEM_WITH_UPDATE
+  DBUG_DUMP("oldrec", oldrec, share->base.reclength);
+  DBUG_DUMP("newrec", record, share->base.reclength);
+#endif
+
+  /*
+    Checksums of new and old rows were computed by callers already; new
+    row's was put into cur_row, old row's was put into new_row.
+  */
+  old_checksum= new_row->checksum;
+  new_row->checksum= cur_row->checksum;
+  calc_record_size(info, record, new_row);
+
+  _ma_bitmap_flushable(info, 1);
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, (pgcache_page_no_t) page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, rownr);
+
+  if ((org_empty_size + cur_row->head_length) < length_on_head_page)
+  {
+    DBUG_PRINT("error",
+               ("org_empty_size: %u  head_length: %u  length_on_page: %u",
+                org_empty_size, (uint) cur_row->head_length,
+                length_on_head_page));
+    my_errno= HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+
+  /*
+    We can fit the new row in the same page as the original head part
+    of the row
+  */
+  empty_size= org_empty_size;
+  if (extend_area_on_page(info, buff, dir, rownr, block_size,
+                          length_on_head_page, &empty_size,
+                          &rec_offset, &length))
+    goto err;
+
+  row_pos.buff= buff;
+  row_pos.rownr= rownr;
+  row_pos.empty_space= empty_size;
+  row_pos.dir= dir;
+  row_pos.data= buff + rec_offset;
+
+  /* Delete old row */
+  if (*cur_row->tail_positions &&
+      delete_tails(info, cur_row->tail_positions))
+    goto err;
+  if (cur_row->extents_count && free_full_pages(info, cur_row))
+    goto err;
+
+  /* Change extent information to be usable by write_block_record() */
+  blocks= &cur_row->insert_blocks;
+  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
+    goto err;
+  block= blocks->block;
+  block->empty_space= row_pos.empty_space;
+  block->org_bitmap_value= _ma_free_size_to_head_pattern(&share->bitmap,
+                                                         org_empty_size);
+  DBUG_ASSERT(block->org_bitmap_value ==
+              _ma_bitmap_get_page_bits(info, &info->s->bitmap, page));
+  block->used|= BLOCKUSED_USE_ORG_BITMAP;
+
+  /*
+    We have to use <= below as the new_row may be smaller than the original
+    row as the new row doesn't have transaction id
+  */
+
+  DBUG_ASSERT(blocks->count > 1 ||
+              max(new_row->total_length, share->base.min_block_length) <=
+              length_on_head_page);
+
+  /* Store same amount of data on head page as on original page */
+  row_pos.length= (length_on_head_page - 
+                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
+  set_if_bigger(row_pos.length, share->base.min_block_length);
+  if ((res= write_block_record(info, oldrec, record, new_row, blocks,
+                               1, &row_pos, undo_lsn, old_checksum)))
+    goto err;
+  DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Wrapper for _ma_update_block_record2() used by ma_update()
+
+  Calls _ma_update_block_record2() with LSN_ERROR as undo_lsn and
+  returns its result.
+*/
+
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos,
+                                const uchar *orig_rec, const uchar *new_rec)
+{
+  return _ma_update_block_record2(info, record_pos, orig_rec, new_rec,
+                                  LSN_ERROR);
+}
+
+
+/*
+  Delete a directory entry
+
+  SYNOPSIS
+    delete_dir_entry()
+    buff                Page buffer
+    block_size          Block size
+    record_number       Record number to delete
+    empty_space_res     Empty space on page after delete (out)
+
+  DESCRIPTION
+    If the entry is the last one in the directory, it is removed together
+    with all free entries that directly follow it; these are also
+    unlinked from the directory free list.  Otherwise the entry is
+    cleared and put first in the directory free list.
+
+    If no entries are left, the page is marked as UNALLOCATED_PAGE and
+    *empty_space_res is set to block_size.  Otherwise the empty space
+    stored on the page is increased and the page is flagged with
+    PAGE_CAN_BE_COMPACTED.
+
+  NOTES
+    The check of record_number is only done if SANITY_CHECKS is defined.
+    *empty_space_res is not set when -1 is returned.
+
+  RETURN
+    -1    Error on page
+    0     ok
+    1     Page is now empty
+*/
+
+static int delete_dir_entry(uchar *buff, uint block_size, uint record_number,
+                            uint *empty_space_res)
+{
+  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+  uint length, empty_space;
+  uchar *dir;
+  DBUG_ENTER("delete_dir_entry");
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - LSN_SIZE - PAGE_TYPE_SIZE - 1 -
+                        PAGE_SUFFIX_SIZE) / DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error", ("record_number: %u  number_of_records: %u",
+                         record_number, number_of_records));
+
+    DBUG_RETURN(-1);
+  }
+#endif
+
+  check_directory(buff, block_size, 0, (uint) -1);
+  empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+  dir= dir_entry_pos(buff, block_size, record_number);
+  length= uint2korr(dir + 2);
+
+  if (record_number == number_of_records - 1)
+  {
+    /* Delete this entry and all following free directory entries */
+    uchar *end= buff + block_size - PAGE_SUFFIX_SIZE;
+    number_of_records--;
+    dir+= DIR_ENTRY_SIZE;
+    empty_space+= DIR_ENTRY_SIZE;
+
+    /* Unlink and free the next empty ones */
+    while (dir < end && dir[0] == 0 && dir[1] == 0)
+    {
+      number_of_records--;
+      if (dir[2] == END_OF_DIR_FREE_LIST)
+        buff[DIR_FREE_OFFSET]= dir[3];
+      else
+      {
+        uchar *prev_entry= dir_entry_pos(buff, block_size, (uint) dir[2]);
+        DBUG_ASSERT(uint2korr(prev_entry) == 0 && prev_entry[3] ==
+                    number_of_records);
+        prev_entry[3]= dir[3];
+      }
+      if (dir[3] != END_OF_DIR_FREE_LIST)
+      {
+        uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+        DBUG_ASSERT(uint2korr(next_entry) == 0 && next_entry[2] ==
+                    number_of_records);
+        next_entry[2]= dir[2];
+      }
+      dir+= DIR_ENTRY_SIZE;
+      empty_space+= DIR_ENTRY_SIZE;
+    }
+
+    if (number_of_records == 0)
+    {
+      /* All entries on page deleted */
+      DBUG_PRINT("info", ("Page marked as unallocated"));
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+      {
+        dir= dir_entry_pos(buff, block_size, record_number);
+        bzero(dir, (record_number+1) * DIR_ENTRY_SIZE);
+      }
+#endif
+      *empty_space_res= block_size;
+      DBUG_RETURN(1);
+    }
+    buff[DIR_COUNT_OFFSET]= (uchar) number_of_records;
+  }
+  else
+  {
+    /* Update directory */
+    dir[0]= dir[1]= 0;
+    dir[2]= END_OF_DIR_FREE_LIST;
+    if ((dir[3]= buff[DIR_FREE_OFFSET]) != END_OF_DIR_FREE_LIST)
+    {
+      /* Relink next entry to point to newly freed entry */
+      uchar *next_entry= dir_entry_pos(buff, block_size, (uint) dir[3]);
+      DBUG_ASSERT(uint2korr(next_entry) == 0 &&
+                  next_entry[2] == END_OF_DIR_FREE_LIST);
+      next_entry[2]= record_number;
+    }
+    buff[DIR_FREE_OFFSET]= record_number;
+  }
+  empty_space+= length;
+
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+  buff[PAGE_TYPE_OFFSET]|= (uchar) PAGE_CAN_BE_COMPACTED;
+
+  *empty_space_res= empty_space;
+
+  check_directory(buff, block_size, 0, empty_space);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Delete a head or a tail part
+
+  SYNOPSIS
+    delete_head_or_tail()
+    info                Maria handler
+    page                Page (not file offset!) on which the row is
+    record_number       Directory entry on the page to delete
+    head                1 if this is a head page
+    from_update         1 if we are called from update. In this case we
+                        leave the page as write locked as we may put
+                        the new row into the old position.
+
+  NOTES
+    The page is read with a write lock and registered in
+    info->pinned_pages; it is left pinned when the function returns.
+
+    If share->now_transactional is set, a REDO record is logged:
+    LOGREC_REDO_PURGE_ROW_HEAD or LOGREC_REDO_PURGE_ROW_TAIL if the page
+    still has entries, LOGREC_REDO_FREE_HEAD_OR_TAIL if the page got
+    empty.
+
+    Finally the bitmap is updated with _ma_bitmap_set().
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_head_or_tail(MARIA_HA *info,
+                                   pgcache_page_no_t page, uint record_number,
+                                   my_bool head, my_bool from_update)
+{
+  MARIA_SHARE *share= info->s;
+  uint empty_space;
+  uint block_size= share->block_size;
+  uchar *buff;
+  LSN lsn;
+  MARIA_PINNED_PAGE page_link;
+  int res;
+  enum pagecache_page_lock lock_at_write, lock_at_unpin;
+  DBUG_ENTER("delete_head_or_tail");
+  DBUG_PRINT("enter", ("id: %lu (%lu:%u)",
+                       (ulong) ma_recordpos(page, record_number),
+                       (ulong) page, record_number));
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, page, 0, 0,
+                       share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link.link);
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    DBUG_RETURN(1);
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) ==
+              (head ? HEAD_PAGE : TAIL_PAGE));
+
+  if (from_update)
+  {
+    lock_at_write= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    lock_at_unpin= PAGECACHE_LOCK_WRITE_UNLOCK;
+  }
+  else
+  {
+    lock_at_write= PAGECACHE_LOCK_WRITE_TO_READ;
+    lock_at_unpin= PAGECACHE_LOCK_READ_UNLOCK;
+  }
+
+  res= delete_dir_entry(buff, block_size, record_number, &empty_space);
+  if (res < 0)
+    DBUG_RETURN(1);
+  if (res == 0) /* after our deletion, page is still not empty */
+  {
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    if (share->now_transactional)
+    {
+      /* Log REDO data */
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      dirpos_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+                   record_number);
+
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, (head ? LOGREC_REDO_PURGE_ROW_HEAD :
+                                       LOGREC_REDO_PURGE_ROW_TAIL),
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+  }
+  else /* page is now empty */
+  {
+    if (share->now_transactional)
+    {
+      uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE];
+      LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+      page_store(log_data + FILEID_STORE_SIZE, page);
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+      log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+      if (translog_write_record(&lsn, LOGREC_REDO_FREE_HEAD_OR_TAIL,
+                                info->trn, info,
+                                (translog_size_t) sizeof(log_data),
+                                TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                                log_data, NULL))
+        DBUG_RETURN(1);
+    }
+    DBUG_ASSERT(empty_space >= share->bitmap.sizes[0]);
+  }
+
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           lock_at_write,
+                           PAGECACHE_PIN_LEFT_PINNED, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 1, FALSE);
+  page_link.unlock= lock_at_unpin;
+  set_dynamic(&info->pinned_pages, (void*) &page_link,
+              info->pinned_pages.elements-1);
+
+  DBUG_PRINT("info", ("empty_space: %u", empty_space));
+
+  /*
+    If there is not enough space for all possible tails, mark the
+    page full
+  */
+  if (!head && !enough_free_entries(buff, share->block_size,
+                                    1 + share->base.blobs))
+    empty_space= 0;
+
+  DBUG_RETURN(_ma_bitmap_set(info, page, head, empty_space));
+}
+
+
+/*
+  Delete all tails
+
+  SYNOPSIS
+    delete_tails()
+    info                Handler
+    tails               Pointer to vector of tail positions, ending with 0
+
+  NOTES
+    Each tail is deleted with delete_head_or_tail() with head= 0 and
+    from_update= 1.  A failure for one tail doesn't stop the deletion
+    of the following tails.
+
+  RETURN
+    0  ok
+    1  error (at least one tail could not be deleted)
+*/
+
+static my_bool delete_tails(MARIA_HA *info, MARIA_RECORD_POS *tails)
+{
+  my_bool res= 0;
+  DBUG_ENTER("delete_tails");
+  for (; *tails; tails++)
+  {
+    if (delete_head_or_tail(info,
+                            ma_recordpos_to_page(*tails),
+                            ma_recordpos_to_dir_entry(*tails), 0, 1))
+      res= 1;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Delete a record
+
+  SYNOPSIS
+    _ma_delete_block_record()
+    info                Maria handler.  The row to delete is the one
+                        described by info->cur_row
+    record              Record that is deleted; passed to
+                        fill_insert_undo_parts() for the UNDO log entry
+
+  DESCRIPTION
+    The head part at info->cur_row.lastpos and all tails in
+    info->cur_row.tail_positions are deleted.  Full pages are freed if
+    info->cur_row.extents_count is not 0.
+    If share->now_transactional is set, a LOGREC_UNDO_ROW_DELETE record
+    is written; it holds the row position, the extents and the parts
+    of the row filled in by fill_insert_undo_parts().
+
+  NOTES
+   For the moment, we assume that info->cur_row.extents is always updated
+   when a row is read. In the future we may decide to read this on demand
+   for rows with many splits.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record)
+{
+  pgcache_page_no_t page;
+  uint record_number;
+  MARIA_SHARE *share= info->s;
+  LSN lsn= LSN_IMPOSSIBLE;
+  DBUG_ENTER("_ma_delete_block_record");
+
+  page=          ma_recordpos_to_page(info->cur_row.lastpos);
+  record_number= ma_recordpos_to_dir_entry(info->cur_row.lastpos);
+  DBUG_PRINT("enter", ("rowid: %lu (%lu:%u)", (ulong) info->cur_row.lastpos,
+                       (ulong) page, record_number));
+
+  _ma_bitmap_flushable(info, 1);
+  if (delete_head_or_tail(info, page, record_number, 1, 0) ||
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  if (share->now_transactional)
+  {
+    uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE +
+                   DIRPOS_STORE_SIZE + 2 + PAGERANGE_STORE_SIZE +
+                   HA_CHECKSUM_STORE_SIZE];
+    uchar *log_pos;
+    size_t row_length;
+    uint row_parts_count, extents_length;
+    ha_checksum checksum_delta;
+
+    /* Write UNDO record */
+    lsn_store(log_data, info->trn->undo_lsn);
+    page_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, page);
+    log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+    dirpos_store(log_pos, record_number);
+    log_pos+= DIRPOS_STORE_SIZE;
+    int2store(log_pos, info->cur_row.head_length -
+              info->cur_row.header_length);
+    log_pos+= 2;
+    pagerange_store(log_pos, info->cur_row.extents_count);
+    log_pos+= PAGERANGE_STORE_SIZE;
+
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].str= log_data;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS].length=
+      sizeof(log_data) - HA_CHECKSUM_STORE_SIZE;
+    store_checksum_in_rec(share, checksum_delta,
+                          (ha_checksum) 0 - info->cur_row.checksum, log_pos,
+                          info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                              0].length);
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].str=
+      info->cur_row.extents;
+    info->log_row_parts[TRANSLOG_INTERNAL_PARTS+1].length=
+      extents_length= info->cur_row.extents_count * ROW_EXTENT_SIZE;
+
+    row_length= fill_insert_undo_parts(info, record,
+                                       (info->log_row_parts +
+                                        TRANSLOG_INTERNAL_PARTS + 2),
+                                       &row_parts_count);
+
+    if (translog_write_record(&lsn, LOGREC_UNDO_ROW_DELETE, info->trn,
+                              info,
+                              (translog_size_t)
+                              (info->log_row_parts[TRANSLOG_INTERNAL_PARTS +
+                                                   0].length + row_length +
+                               extents_length),
+                              TRANSLOG_INTERNAL_PARTS + 2 + row_parts_count,
+                              info->log_row_parts,
+                              log_data + LSN_STORE_SIZE,
+                              &checksum_delta))
+      goto err;
+  }
+
+  _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(0);
+
+err:
+  _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+  Reading of records
+****************************************************************************/
+
+/*
+  Read position to record from record directory at end of page
+
+  SYNOPSIS
+   get_record_position()
+   buff                 page buffer
+   block_size           block size for page
+   record_number        Record number in index
+   end_of_data          pointer to end of data for record (out)
+
+  NOTES
+    The checks of record_number and of the offset and length stored in
+    the directory entry are only done if SANITY_CHECKS is defined.
+
+  RETURN
+    0  Error in data
+    #  Pointer to start of record.
+       In this case *end_of_data is set.
+*/
+
+static uchar *get_record_position(uchar *buff, uint block_size,
+                                 uint record_number, uchar **end_of_data)
+{
+  uint number_of_records= (uint) buff[DIR_COUNT_OFFSET];
+  uchar *dir;
+  uchar *data;
+  uint offset, length;
+
+#ifdef SANITY_CHECKS
+  if (record_number >= number_of_records ||
+      record_number > ((block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE) /
+                       DIR_ENTRY_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row number: record_number: %u  number_of_records: %u",
+                record_number, number_of_records));
+    return 0;
+  }
+#endif
+
+  dir= dir_entry_pos(buff, block_size, record_number);
+  offset= uint2korr(dir);
+  length= uint2korr(dir + 2);
+#ifdef SANITY_CHECKS
+  if (offset < PAGE_HEADER_SIZE ||
+      offset + length > (block_size -
+                         number_of_records * DIR_ENTRY_SIZE -
+                         PAGE_SUFFIX_SIZE))
+  {
+    DBUG_PRINT("error",
+               ("Wrong row position:  record_number: %u  offset: %u  "
+                "length: %u  number_of_records: %u",
+                record_number, offset, length, number_of_records));
+    return 0;
+  }
+#endif
+  data= buff + offset;
+  *end_of_data= data + length;
+  return data;
+}
+
+
+/*
+  Init extent
+
+  SYNOPSIS
+    init_extent()
+    extent              Extent cursor to initialize
+    extent_info         Pointer to the first extent
+    extents             Stored in the cursor as extent_count
+    tail_positions      Stored in the cursor; read_next_extent() stores
+                        the position of each tail it reads here
+
+  NOTES
+    extent is a cursor over which pages to read
+
+    The cursor is positioned on the first extent.  If the first extent
+    has TAIL_BIT set, page_count is set to 1 and tail_row_nr to the
+    value without TAIL_BIT.  lock_for_tail_pages is set to
+    PAGECACHE_LOCK_LEFT_UNLOCKED.
+*/
+
+static void init_extent(MARIA_EXTENT_CURSOR *extent, uchar *extent_info,
+                        uint extents, MARIA_RECORD_POS *tail_positions)
+{
+  uint page_count;
+  extent->extent=       extent_info;
+  extent->extent_count= extents;
+  extent->page=         page_korr(extent_info);         /* First extent */
+  page_count=           (uint2korr(extent_info + ROW_EXTENT_PAGE_SIZE) &
+                         ~START_EXTENT_BIT);
+  extent->tail=         page_count & TAIL_BIT;
+  if (extent->tail)
+  {
+    extent->page_count=   1;
+    extent->tail_row_nr=  page_count & ~TAIL_BIT;
+  }
+  else
+    extent->page_count=   page_count;
+  extent->tail_positions= tail_positions;
+  extent->lock_for_tail_pages= PAGECACHE_LOCK_LEFT_UNLOCKED;
+}
+
+
+/*
+  Read next extent
+
+  SYNOPSIS
+    read_next_extent()
+    info                Maria handler
+    extent              Pointer to current extent (this is updated to point
+                        to next)
+    end_of_data         Pointer to end of data in read block (out)
+
+  NOTES
+    New block is read into info->buff
+
+    Tail pages are read with the lock given by
+    extent->lock_for_tail_pages.  If this is not
+    PAGECACHE_LOCK_LEFT_UNLOCKED, the page is registered in
+    info->pinned_pages.
+
+    For each full page read info->cur_row.full_page_count is incremented.
+    For each tail read info->cur_row.tail_count is incremented and the
+    position of the tail is stored through extent->tail_positions.
+
+    If the extent information or the page type is wrong, my_errno is set
+    to HA_ERR_WRONG_IN_RECORD.
+
+  RETURN
+    0   Error;  my_errno is set
+    #   Pointer to start of data in read block
+        In this case end_of_data is updated to point to end of data.
+*/
+
+static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent,
+                              uchar **end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *buff, *data;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock lock;
+  DBUG_ENTER("read_next_extent");
+
+  if (!extent->page_count)
+  {
+    uint page_count;
+    if (!--extent->extent_count)
+      goto crashed;
+    extent->extent+=    ROW_EXTENT_SIZE;
+    extent->page=       page_korr(extent->extent);
+    page_count=         (uint2korr(extent->extent+ROW_EXTENT_PAGE_SIZE) &
+                         ~START_EXTENT_BIT);
+    if (!page_count)
+      goto crashed;
+    extent->tail=       page_count & TAIL_BIT;
+    if (extent->tail)
+      extent->tail_row_nr= page_count & ~TAIL_BIT;
+    else
+      extent->page_count= page_count;
+    DBUG_PRINT("info",("New extent.  Page: %lu  page_count: %u  tail_flag: %d",
+                       (ulong) extent->page, extent->page_count,
+                       extent->tail != 0));
+  }
+  extent->first_extent= 0;
+
+  lock= PAGECACHE_LOCK_LEFT_UNLOCKED;
+  if (extent->tail)
+    lock= extent->lock_for_tail_pages;
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, extent->page, 0,
+                       info->buff, share->page_type,
+                       lock, &page_link.link);
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {
+    /* Read during UNDO */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= buff != 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+  if (!buff)
+  {
+    /* check if we tried to read over end of file (ie: bad data in record) */
+    if ((extent->page + 1) * share->block_size >
+        share->state.state.data_file_length)
+      goto crashed;
+    DBUG_RETURN(0);
+  }
+
+  if (!extent->tail)
+  {
+    /* Full data page */
+    if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != BLOB_PAGE)
+      goto crashed;
+    extent->page++;                             /* point to next page */
+    extent->page_count--;
+    *end_of_data= buff + share->block_size - PAGE_SUFFIX_SIZE;
+    info->cur_row.full_page_count++;            /* For maria_chk */
+    DBUG_RETURN(extent->data_start= buff + LSN_SIZE + PAGE_TYPE_SIZE);
+  }
+
+  /* Found tail */
+  if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != TAIL_PAGE)
+    goto crashed;
+  *(extent->tail_positions++)= ma_recordpos(extent->page,
+                                            extent->tail_row_nr);
+  info->cur_row.tail_count++;                   /* For maria_chk */
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  extent->tail_row_nr,
+                                  end_of_data)))
+    goto crashed;
+  extent->data_start= data;
+  extent->page_count= 0;                        /* No more data in extent */
+  DBUG_RETURN(data);
+
+
+crashed:
+  my_errno= HA_ERR_WRONG_IN_RECORD;             /* File crashed */
+  DBUG_PRINT("error", ("wrong extent information"));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read data that may be split over many blocks
+
+  SYNOPSIS
+    read_long_data2()
+    info                Maria handler
+    to                  Store result string here (this is allocated)
+    length              Number of bytes to store in 'to'
+    extent              Pointer to current extent position
+    data                Current position in buffer (updated)
+    end_of_data         End of data in buffer (updated)
+
+  NOTES
+    When we have to read a new buffer, it's read into info->buff
+
+    This loop is implemented by goto's instead of a for() loop as
+    the code is notably smaller and faster this way (and it's not nice
+    to jump into a for loop() or into a 'then' clause)
+    NOTE(review): the loop in the function body is a for(;;) loop, so
+    the remark about goto's looks outdated.
+
+    If extent->first_extent is set and the requested length is bigger
+    than what is left in the buffer, the rest of the buffer is skipped
+    and reading starts from the next extent.
+
+  RETURN
+    0   ok
+    1   error
+*/
+
+static my_bool read_long_data2(MARIA_HA *info, uchar *to, ulong length,
+                              MARIA_EXTENT_CURSOR *extent,
+                              uchar **data, uchar **end_of_data)
+{
+  uint left_length= (uint) (*end_of_data - *data);
+  DBUG_ENTER("read_long_data2");
+  DBUG_PRINT("enter", ("length: %lu  left_length: %u",
+                       length, left_length));
+  DBUG_ASSERT(*data <= *end_of_data);
+
+  /*
+    Fields are never split in middle. This means that if length > rest-of-data
+    we should start reading from the next extent.  The reason we may have
+    data left on the page is that if the fixed part of the row was less than
+    min_block_length the head block was extended to min_block_length.
+
+    This may change in the future, which is why we have the loop written
+    the way it's written.
+  */
+  if (extent->first_extent && length > left_length)
+  {
+    *end_of_data= *data;
+    left_length= 0;
+  }
+
+  for(;;)
+  {
+    if (unlikely(left_length >= length))
+    {
+      memcpy(to, *data, length);
+      (*data)+= length;
+      DBUG_PRINT("info", ("left_length: %u", left_length - (uint) length));
+      DBUG_RETURN(0);
+    }
+    memcpy(to, *data, left_length);
+    to+= left_length;
+    length-= left_length;
+    if (!(*data= read_next_extent(info, extent, end_of_data)))
+      break;
+    left_length= (uint) (*end_of_data - *data);
+  }
+  DBUG_RETURN(1);
+}
+
+static inline my_bool read_long_data(MARIA_HA *info, uchar *to, ulong length,
+                              MARIA_EXTENT_CURSOR *extent,
+                              uchar **data, uchar **end_of_data)
+{
+  /* Fast path: everything requested is already in the current buffer */
+  uint available= (uint) (*end_of_data - *data);
+  if (unlikely(available < length))
+    return read_long_data2(info, to, length, extent, data, end_of_data);
+
+  memcpy(to, *data, length);
+  (*data)+= length;
+  return 0;
+}
+
+
+/*
+  Read a record from page (helper function for _ma_read_block_record())
+
+  SYNOPSIS
+    _ma_read_block_record2()
+    info                Maria handler
+    record              Store record here
+    data                Start of head data for row
+    end_of_data         End of data for row
+
+  NOTES
+    The head page is already read by caller
+    Following data is updated in info->cur_row:
+
+    cur_row.head_length is set to size of entry in head block
+    cur_row.tail_positions is set to point to all tail blocks
+    cur_row.extents points to extents data
+    cur_row.extents_count contains number of extents
+    cur_row.empty_bits is set to empty bits
+    cur_row.field_lengths contains packed length of all fields
+    cur_row.blob_length contains total length of all blobs
+    cur_row.checksum contains checksum of read record.
+
+   RETURN
+     0  ok
+     #  Error code
+*/
+
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+                           uchar *data, uchar *end_of_data)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *field_length_data, *blob_buffer, *start_of_data;
+  uint flag, null_bytes, cur_null_bytes, row_extents, field_lengths;
+  my_bool found_blob= 0;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *cur_row= &info->cur_row;
+  DBUG_ENTER("_ma_read_block_record2");
+
+  LINT_INIT(field_length_data);
+  LINT_INIT(blob_buffer);
+
+  start_of_data= data;
+  flag= (uint) (uchar) data[0];
+  cur_null_bytes= share->base.original_null_bytes;
+  null_bytes=     share->base.null_bytes;
+  cur_row->head_length= (uint) (end_of_data - data);
+  cur_row->full_page_count= cur_row->tail_count= 0;
+  cur_row->blob_length= 0;
+  /* Number of bytes in header that we don't need to write during undo */
+  cur_row->header_length= total_header_size[(flag & PRECALC_HEADER_BITMASK)]-1;
+
+  if (flag & ROW_FLAG_TRANSID)
+  {
+    cur_row->trid= transid_korr(data+1);
+    if (!info->trn)
+      DBUG_RETURN(my_errno= HA_ERR_WRONG_IN_RECORD);     /* File crashed */
+    if (!trnman_can_read_from(info->trn, cur_row->trid))
+      DBUG_RETURN(my_errno= HA_ERR_ROW_NOT_VISIBLE);
+  }
+
+  /* Skip trans header (for now, until we have MVCC support) */
+  data+= cur_row->header_length + 1 ;
+  if (flag & ROW_FLAG_NULLS_EXTENDED)
+    cur_null_bytes+= data[-1];
+
+  row_extents= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    uint row_extent_size;
+    /*
+      Record is split over many data pages.
+      Get number of extents and first extent
+    */
+    get_key_length(row_extents, data);
+    cur_row->extents_count= row_extents;
+    row_extent_size= row_extents * ROW_EXTENT_SIZE;
+    if (cur_row->extents_buffer_length < row_extent_size &&
+        _ma_alloc_buffer(&cur_row->extents,
+                         &cur_row->extents_buffer_length,
+                         row_extent_size))
+      DBUG_RETURN(my_errno);
+    memcpy(cur_row->extents, data, ROW_EXTENT_SIZE);
+    data+= ROW_EXTENT_SIZE;
+    init_extent(&extent, cur_row->extents, row_extents,
+                cur_row->tail_positions);
+  }
+  else
+  {
+    cur_row->extents_count= 0;
+    (*cur_row->tail_positions)= 0;
+    extent.page_count= 0;
+    extent.extent_count= 1;
+  }
+  extent.first_extent= 1;
+
+  field_lengths= 0;
+  if (share->base.max_field_lengths)
+  {
+    get_key_length(field_lengths, data);
+    cur_row->field_lengths_length= field_lengths;
+#ifdef SANITY_CHECKS
+    if (field_lengths > share->base.max_field_lengths)
+      goto err;
+#endif
+  }
+
+  if (share->calc_checksum)
+    cur_row->checksum= (uint) (uchar) *data++;
+  /* data now points on null bits */
+  memcpy(record, data, cur_null_bytes);
+  if (unlikely(cur_null_bytes != null_bytes))
+  {
+    /*
+      This only happens if we have added more NULL columns with
+      ALTER TABLE and are fetching an old, not yet modified row
+    */
+    bzero(record + cur_null_bytes, (uint) (null_bytes - cur_null_bytes));
+  }
+  data+= null_bytes;
+  /* We copy the empty bits to be able to use them for delete/update */
+  memcpy(cur_row->empty_bits, data, share->base.pack_bytes);
+  data+= share->base.pack_bytes;
+
+  /* TODO: Use field offsets, instead of just skipping them */
+  data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+  /*
+    Read row extents (note that first extent was already read into
+    cur_row->extents above)
+  */
+  if (row_extents > 1)
+  {
+    if (read_long_data(info, cur_row->extents + ROW_EXTENT_SIZE,
+                       (row_extents - 1) * ROW_EXTENT_SIZE,
+                       &extent, &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /*
+    Data now points to start of fixed length field data that can't be null
+    or 'empty'. Note that these fields can't be split over blocks.
+  */
+  for (column= share->columndef,
+         end_column= column + share->base.fixed_not_null_fields;
+       column < end_column; column++)
+  {
+    uint column_length= column->length;
+    if (data + column_length > end_of_data &&
+        !(data= read_next_extent(info, &extent, &end_of_data)))
+      goto err;
+    memcpy(record + column->offset, data, column_length);
+    data+= column_length;
+  }
+
+  /* Read array of field lengths. This may be stored in several extents */
+  if (field_lengths)
+  {
+    field_length_data= cur_row->field_lengths;
+    if (read_long_data(info, field_length_data, field_lengths, &extent,
+                       &data, &end_of_data))
+      DBUG_RETURN(my_errno);
+  }
+
+  /* Read variable length data. Each of these may be split over many extents */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column; column++)
+  {
+    enum en_fieldtype type= column->type;
+    uchar *field_pos= record + column->offset;
+    /* First check if field is present in record */
+    if ((record[column->null_pos] & column->null_bit) ||
+        (cur_row->empty_bits[column->empty_pos] & column->empty_bit))
+    {
+      bfill(record + column->offset, column->fill_length,
+            type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      continue;
+    }
+    switch (type) {
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_SKIP_PRESPACE:
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      if (data + column->length > end_of_data &&
+          !(data= read_next_extent(info, &extent, &end_of_data)))
+        goto err;
+      memcpy(field_pos, data, column->length);
+      data+= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Char that is space filled */
+      uint length;
+      if (column->length <= 255)
+        length= (uint) (uchar) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      bfill(field_pos + length, column->length - length, ' ');
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      ulong length;
+      if (column->length <= 256)
+      {
+        length= (uint) (uchar) (*field_pos++= *field_length_data++);
+      }
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_pos[0]= field_length_data[0];
+        field_pos[1]= field_length_data[1];
+        field_pos+= 2;
+        field_length_data+= 2;
+      }
+#ifdef SANITY_CHECKS
+      if (length > column->length)
+        goto err;
+#endif
+      if (read_long_data(info, field_pos, length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      uint column_size_length= column->length - portable_sizeof_char_ptr;
+      ulong blob_length= _ma_calc_blob_length(column_size_length,
+                                              field_length_data);
+
+      if (!found_blob)
+      {
+        /* Calculate total length for all blobs */
+        ulong blob_lengths= 0;
+        uchar *length_data= field_length_data;
+        MARIA_COLUMNDEF *blob_field= column;
+
+        found_blob= 1;
+        for (; blob_field < end_column; blob_field++)
+        {
+          uint size_length;
+          if ((record[blob_field->null_pos] & blob_field->null_bit) ||
+              (cur_row->empty_bits[blob_field->empty_pos] &
+               blob_field->empty_bit))
+            continue;
+          size_length= blob_field->length - portable_sizeof_char_ptr;
+          blob_lengths+= _ma_calc_blob_length(size_length, length_data);
+          length_data+= size_length;
+        }
+        cur_row->blob_length= blob_lengths;
+        DBUG_PRINT("info", ("Total blob length: %lu", blob_lengths));
+        if (_ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
+                             blob_lengths))
+          DBUG_RETURN(my_errno);
+        blob_buffer= info->blob_buff;
+      }
+
+      memcpy(field_pos, field_length_data, column_size_length);
+      memcpy_fixed(field_pos + column_size_length, (uchar *) &blob_buffer,
+                   sizeof(char*));
+      field_length_data+= column_size_length;
+
+      /*
+        After we have read one extent, then each blob is in its own extent
+      */
+      if (!extent.first_extent || (ulong) (end_of_data - data) < blob_length)
+        end_of_data= data;                      /* Force read of next extent */
+
+      if (read_long_data(info, blob_buffer, blob_length, &extent, &data,
+                         &end_of_data))
+        DBUG_RETURN(my_errno);
+      blob_buffer+= blob_length;
+      break;
+    }
+    default:
+#ifdef EXTRA_DEBUG
+      DBUG_ASSERT(0);                           /* purecov: deadcode */
+#endif
+      goto err;
+    }
+    continue;
+  }
+
+  if (row_extents)
+  {
+    DBUG_PRINT("info", ("Row read:  page_count: %u  extent_count: %u",
+                        extent.page_count, extent.extent_count));
+    *extent.tail_positions= 0;                  /* End marker */
+    if (extent.page_count)
+      goto err;
+    if (extent.extent_count > 1)
+    {
+      if (_ma_check_if_zero(extent.extent + ROW_EXTENT_SIZE,
+                            (extent.extent_count-1) * ROW_EXTENT_SIZE))
+      {
+        DBUG_PRINT("error", ("Data in extent is not zero"));
+        DBUG_DUMP("extent", extent.extent + ROW_EXTENT_SIZE,
+                  (extent.extent_count-1) * ROW_EXTENT_SIZE);
+        goto err;
+      }
+    }
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Row read"));
+    /*
+      data should normally point to end_of_data. The only exception is if
+      the row is very short in which case we allocated 'min_block_length' data
+      for allowing the row to expand.
+    */
+    if (data != end_of_data && (uint) (end_of_data - start_of_data) >
+        share->base.min_block_length)
+      goto err;
+  }
+#ifdef EXTRA_DEBUG
+  if (share->calc_checksum)
+  {
+    /* Ensure that row checksum is correct */
+    DBUG_ASSERT(((share->calc_checksum)(info, record) & 255) ==
+                cur_row->checksum);
+  }
+#endif
+  info->update|= HA_STATE_AKTIV;	/* We have an active record */
+  DBUG_RETURN(0);
+
+err:
+  /* Something was wrong with data on record */
+  DBUG_PRINT("error", ("Found record with wrong data"));
+  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/** @brief Read positions to tail blocks and full blocks
+
+  @fn    read_row_extent_info()
+  @param info           Handler
+  @param buff           Buffer given to get_record_position() to find the row
+  @param record_number  Row number given to get_record_position()
+  @notes
+    This function is a simpler version of _ma_read_block_record2()
+    The data about the used pages is stored in info->cur_row.
+  @return Status
+  @retval 0   ok
+  @retval 1   Error. my_errno contains error number
+*/
+
+static my_bool read_row_extent_info(MARIA_HA *info, uchar *buff,
+                                    uint record_number)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_EXTENT_CURSOR extent;
+  MARIA_RECORD_POS *tail_pos;
+  uchar *data, *end_of_data;
+  uint flag, row_extents, row_extents_size, field_lengths;
+  uchar *extents, *end;
+  DBUG_ENTER("read_row_extent_info");
+
+  if (!(data= get_record_position(buff, share->block_size,
+                                  record_number, &end_of_data)))
+    DBUG_RETURN(1);                             /* Wrong in record */
+
+  flag= (uint) (uchar) data[0];
+  /* Skip trans header */
+  data+= total_header_size[(flag & PRECALC_HEADER_BITMASK)];
+
+  row_extents= 0;
+  row_extents_size= 0;
+  if (flag & ROW_FLAG_EXTENTS)
+  {
+    /*
+      Record is split over many data pages.
+      Get number of extents and first extent
+    */
+    get_key_length(row_extents, data);
+    row_extents_size= row_extents * ROW_EXTENT_SIZE;
+    if (info->cur_row.extents_buffer_length < row_extents_size &&
+        _ma_alloc_buffer(&info->cur_row.extents,
+                         &info->cur_row.extents_buffer_length,
+                         row_extents_size))
+      DBUG_RETURN(1);
+    memcpy(info->cur_row.extents, data, ROW_EXTENT_SIZE);
+    data+= ROW_EXTENT_SIZE;
+    init_extent(&extent, info->cur_row.extents, row_extents,
+                info->cur_row.tail_positions);
+    extent.first_extent= 1;
+  }
+  info->cur_row.extents_count= row_extents;
+
+  if (share->base.max_field_lengths)
+    get_key_length(field_lengths, data);        /* Value not used here */
+
+  if (share->calc_checksum)
+    info->cur_row.checksum= (uint) (uchar) *data++;
+  if (row_extents > 1)
+  {
+    data+= share->base.null_bytes;
+    data+= share->base.pack_bytes;
+    data+= share->base.field_offsets * FIELD_OFFSET_SIZE;
+
+    /*
+      Read row extents (note that first extent was already read into
+      info->cur_row.extents above)
+      Lock tails with write lock as we will delete them later.
+    */
+    extent.lock_for_tail_pages= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    if (read_long_data(info, info->cur_row.extents + ROW_EXTENT_SIZE,
+                       row_extents_size - ROW_EXTENT_SIZE,
+                       &extent, &data, &end_of_data))
+      DBUG_RETURN(1);
+  }
+
+  /* Update tail_positions with pointer to tails */
+  tail_pos= info->cur_row.tail_positions;
+  for (extents= info->cur_row.extents, end= extents + row_extents_size;
+       extents < end;
+       extents+= ROW_EXTENT_SIZE)
+  {
+    pgcache_page_no_t page=  uint5korr(extents);
+    uint page_count= uint2korr(extents + ROW_EXTENT_PAGE_SIZE);
+    if (page_count & TAIL_BIT)
+      *(tail_pos++)= ma_recordpos(page, (page_count & ~ (TAIL_BIT |
+                                                         START_EXTENT_BIT)));
+  }
+  *tail_pos= 0;                               /* End marker */
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read the record stored at a given record position
+
+  @fn     _ma_read_block_record()
+  @param info                Maria handler
+  @param record              Store record here
+  @param record_pos          Record position (page + directory entry)
+
+  @return 0 on success, otherwise an error number that is also in my_errno
+          (HA_ERR_RECORD_DELETED if get_record_position() finds no row)
+*/
+
+int _ma_read_block_record(MARIA_HA *info, uchar *record,
+                          MARIA_RECORD_POS record_pos)
+{
+  MARIA_SHARE *share= info->s;
+  const uint page_size= share->block_size;
+  uint dir_entry;
+  uchar *buff, *row_start, *row_end;
+  DBUG_ENTER("_ma_read_block_record");
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) record_pos,
+                       (ulong) ma_recordpos_to_page(record_pos),
+                       ma_recordpos_to_dir_entry(record_pos)));
+
+  /* Fetch the page; the row itself is then found through its directory */
+  buff= pagecache_read(share->pagecache, &info->dfile,
+                       ma_recordpos_to_page(record_pos), 0, info->buff,
+                       share->page_type, PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
+  if (!buff)
+    DBUG_RETURN(my_errno);
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == HEAD_PAGE);
+
+  dir_entry= ma_recordpos_to_dir_entry(record_pos);
+  row_start= get_record_position(buff, page_size, dir_entry, &row_end);
+  if (row_start)
+    DBUG_RETURN(_ma_read_block_record2(info, record, row_start, row_end));
+
+  DBUG_PRINT("error", ("Wrong directory entry in data block"));
+  my_errno= HA_ERR_RECORD_DELETED;
+  DBUG_RETURN(HA_ERR_RECORD_DELETED);
+}
+
+
+/* Compare unique constraint 'def' between 'record' and the row at 'pos' */
+
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                             const uchar *record, MARIA_RECORD_POS pos)
+{
+  uchar *saved_rec_buff, *old_record;
+  size_t saved_rec_buff_size;
+  int error;
+  DBUG_ENTER("_ma_cmp_block_unique");
+
+  old_record= my_alloca(info->s->base.reclength);
+  if (!old_record)
+    DBUG_RETURN(1);
+  /* Don't let the compare destroy blobs that may be in use */
+  saved_rec_buff=      info->rec_buff;
+  saved_rec_buff_size= info->rec_buff_size;
+  if (info->s->base.blobs)
+  {
+    /* Force the read to allocate a fresh record buffer */
+    info->rec_buff= 0;
+    info->rec_buff_size= 0;
+  }
+  error= _ma_read_block_record(info, old_record, pos);
+  if (error == 0)
+    error= _ma_unique_comp(def, record, old_record, def->null_are_equal);
+  if (info->s->base.blobs)
+  {
+    my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+    info->rec_buff=      saved_rec_buff;
+    info->rec_buff_size= saved_rec_buff_size;
+  }
+  DBUG_PRINT("exit", ("result: %d", error));
+  my_afree(old_record);
+  DBUG_RETURN(error != 0);
+}
+
+
+/****************************************************************************
+  Table scan
+****************************************************************************/
+
+/*
+  Prepare a table scan
+
+  SYNOPSIS
+   _ma_scan_init_block_record(MARIA_HA *info)
+
+  IMPLEMENTATION
+    One allocation of 2 * block_size is used: bitmap buffer + page buffer
+
+  RETURN
+    0  ok
+    1  error (couldn't allocate memory or disk error)
+*/
+
+my_bool _ma_scan_init_block_record(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_scan_init_block_record");
+  /*
+    A second rnd_init() without a rnd_end() in between (see sql/handler.h)
+    finds bitmap_buff already allocated; reuse it in that case.
+  */
+  if (!info->scan.bitmap_buff)
+  {
+    info->scan.bitmap_buff= (uchar *) my_malloc(share->block_size * 2,
+                                                MYF(MY_WME));
+    if (!info->scan.bitmap_buff)
+      DBUG_RETURN(1);
+  }
+  info->scan.page_buff= info->scan.bitmap_buff + share->block_size;
+  info->scan.bitmap_end= info->scan.bitmap_buff + share->bitmap.total_size;
+
+  /* Set scan variables to get _ma_scan_block() to start with reading bitmap */
+  info->scan.max_page= share->state.state.data_file_length / share->block_size;
+  info->scan.bitmap_page= (pgcache_page_no_t) 0 - share->bitmap.pages_covered;
+  info->scan.bitmap_pos= info->scan.bitmap_end;
+  info->scan.number_of_rows= 0;
+  /*
+    Bitmap pages are read from the page cache during the scan, so flush the
+    in-memory bitmap (bitmap.map) there first or we may miss recent inserts.
+  */
+  DBUG_RETURN(_ma_bitmap_flush(share));
+}
+
+
+/* Free the scan buffers and any saved scan position */
+
+void _ma_scan_end_block_record(MARIA_HA *info)
+{
+  DBUG_ENTER("_ma_scan_end_block_record");
+
+  /* MY_ALLOW_ZERO_PTR: either pointer may be unset when we get here */
+  my_free(info->scan.bitmap_buff, MYF(MY_ALLOW_ZERO_PTR));
+  info->scan.bitmap_buff= 0;
+
+  my_free(info->scan_save, MYF(MY_ALLOW_ZERO_PTR));
+  info->scan_save= 0;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Save current scan position
+
+  @param info     Maria handler
+  @param lastpos  Out: position of the last read row; pass it back to
+                  _ma_scan_restore_block_record()
+
+  @note
+    Only one position can be remembered at a time. The caller must not
+    delete or update the current row before the position is restored.
+    The running scan itself is left untouched; only the saved copy is
+    moved back to the last read row.
+
+  @retval 0                   ok
+  @retval HA_ERR_OUT_OF_MEM   Could not allocate memory to hold position
+*/
+
+int _ma_scan_remember_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS *lastpos)
+{
+  uchar *bitmap_buff;
+  DBUG_ENTER("_ma_scan_remember_block_record");
+  if (!(info->scan_save))
+  {
+    if (!(info->scan_save= my_malloc(ALIGN_SIZE(sizeof(*info->scan_save)) +
+                                     info->s->block_size * 2,
+                                     MYF(MY_WME))))
+      DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+    info->scan_save->bitmap_buff= ((uchar*) info->scan_save +
+                                   ALIGN_SIZE(sizeof(*info->scan_save)));
+  }
+  /* Remember used bitmap and used head page */
+  bitmap_buff= info->scan_save->bitmap_buff;
+  memcpy(info->scan_save, &info->scan, sizeof(*info->scan_save));
+  info->scan_save->bitmap_buff= bitmap_buff;
+  memcpy(bitmap_buff, info->scan.bitmap_buff, info->s->block_size * 2);
+
+  /* Step the saved copy (not the live scan) back to the last read row */
+  *lastpos= info->cur_row.nextpos - 1;
+  info->scan_save->dir+= DIR_ENTRY_SIZE;
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Restore the scan state saved by _ma_scan_remember_block_record()
+
+   @note
+   The saved buffers are copied back rather than swapped with the live
+   ones: there are variables pointing inside the buffers and it would be
+   a bit of a hassle to either make them relative or repoint them.
+   cur_row.nextpos is set to 'lastpos'.
+*/
+
+void _ma_scan_restore_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS lastpos)
+{
+  /* The live buffer pointer must survive the struct copy below */
+  uchar *live_buff= info->scan.bitmap_buff;
+  DBUG_ENTER("_ma_scan_restore_block_record");
+
+  memcpy(&info->scan, info->scan_save, sizeof(*info->scan_save));
+  info->scan.bitmap_buff= live_buff;
+  memcpy(live_buff, info->scan_save->bitmap_buff, info->s->block_size * 2);
+  info->cur_row.nextpos= lastpos;
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Read next record while scanning table
+
+  SYNOPSIS
+    _ma_scan_block_record()
+    info                Maria handler
+    record              Store found here
+    record_pos          Value stored in info->cur_row.nextpos after last call
+    skip_deleted        Not used
+
+  NOTES
+    - One must have called mi_scan() before this
+    - In this version, we don't actually need record_pos, we could as easily
+      use a variable in info->scan
+
+  IMPLEMENTATION
+    Current code uses a lot of goto's to separate the different kind of
+    states we may be in. This gives us a minimum of executed if's for
+    the normal cases.  I tried several different ways to code this, but
+    the current one was in the end the most readable and fastest.
+
+  RETURN
+    0   ok
+    #   Error code
+*/
+
+int _ma_scan_block_record(MARIA_HA *info, uchar *record,
+                          MARIA_RECORD_POS record_pos,
+                          my_bool skip_deleted __attribute__ ((unused)))
+{
+  uint block_size;
+  my_off_t filepos;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_scan_block_record");
+
+restart_record_read:
+  /* Find next row in current page */
+  while (likely(record_pos < info->scan.number_of_rows))
+  {
+    uint length, offset;
+    uchar *data, *end_of_data;
+    int error;
+
+    while (!(offset= uint2korr(info->scan.dir)))
+    {
+      info->scan.dir-= DIR_ENTRY_SIZE;
+      record_pos++;
+#ifdef SANITY_CHECKS
+      if (info->scan.dir < info->scan.dir_end)
+      {
+        DBUG_ASSERT(0);
+        goto err;
+      }
+#endif
+    }
+    /* found row */
+    info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+    info->cur_row.nextpos= record_pos + 1;
+    data= info->scan.page_buff + offset;
+    length= uint2korr(info->scan.dir + 2);
+    end_of_data= data + length;
+    info->scan.dir-= DIR_ENTRY_SIZE;          /* Point to previous row */
+#ifdef SANITY_CHECKS
+    if (end_of_data > info->scan.dir_end ||
+        offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+    {
+      DBUG_ASSERT(!(end_of_data > info->scan.dir_end));
+      DBUG_ASSERT(!(offset < PAGE_HEADER_SIZE));
+      DBUG_ASSERT(!(length < share->base.min_block_length));
+      goto err;
+    }
+#endif
+    DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+    error= _ma_read_block_record2(info, record, data, end_of_data);
+    if (error != HA_ERR_ROW_NOT_VISIBLE)
+      DBUG_RETURN(error);
+    record_pos++;
+  }
+
+  /* Find next head page in current bitmap */
+restart_bitmap_scan:
+  block_size= share->block_size;
+  if (likely(info->scan.bitmap_pos < info->scan.bitmap_end))
+  {
+    uchar *data=    info->scan.bitmap_pos;
+    longlong bits= info->scan.bits;
+    uint bit_pos=  info->scan.bit_pos;
+
+    do
+    {
+      while (likely(bits))
+      {
+        uint pattern= (uint) (bits & 7);
+        bits >>= 3;
+        bit_pos++;
+        if (pattern > 0 && pattern <= 4)
+        {
+          /* Found head page; Read it */
+          pgcache_page_no_t page;
+          info->scan.bitmap_pos= data;
+          info->scan.bits= bits;
+          info->scan.bit_pos= bit_pos;
+          page= (info->scan.bitmap_page + 1 +
+                 (data - info->scan.bitmap_buff) / 6 * 16 + bit_pos - 1);
+          info->scan.row_base_page= ma_recordpos(page, 0);
+          if (page >= info->scan.max_page)
+          {
+            DBUG_PRINT("info", ("Found end of file"));
+            DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
+          }
+          if (!(pagecache_read(share->pagecache,
+                               &info->dfile,
+                               page, 0, info->scan.page_buff,
+                               share->page_type,
+                               PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+            DBUG_RETURN(my_errno);
+          if (((info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) !=
+               HEAD_PAGE))
+          {
+            /*
+              This may happen if someone has been deleting all rows
+              from a page since we read the bitmap, so it may be ok.
+              Print warning in debug log and continue.
+            */
+            DBUG_PRINT("warning",
+                       ("Found page of type %d when expecting head page",
+                        (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+                         PAGE_TYPE_MASK)));
+            continue;
+          }
+          if ((info->scan.number_of_rows=
+               (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) == 0)
+          {
+            DBUG_PRINT("error", ("Wrong page header"));
+            DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+          }
+          DBUG_PRINT("info", ("Page %lu has %u rows",
+                              (ulong) page, info->scan.number_of_rows));
+          info->scan.dir= (info->scan.page_buff + block_size -
+                           PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+          info->scan.dir_end= (info->scan.dir -
+                               (info->scan.number_of_rows - 1) *
+                               DIR_ENTRY_SIZE);
+          record_pos= 0;
+          goto restart_record_read;
+        }
+      }
+      for (data+= 6; data < info->scan.bitmap_end; data+= 6)
+      {
+        bits= uint6korr(data);
+        /* Skip not allocated pages and blob / full tail pages */
+        if (bits && bits != LL(07777777777777777))
+          break;
+      }
+      bit_pos= 0;
+    } while (data < info->scan.bitmap_end);
+  }
+
+  /* Read next bitmap */
+  info->scan.bitmap_page+= share->bitmap.pages_covered;
+  filepos= (my_off_t) info->scan.bitmap_page * block_size;
+  if (unlikely(filepos >= share->state.state.data_file_length))
+  {
+    DBUG_PRINT("info", ("Found end of file"));
+    DBUG_RETURN((my_errno= HA_ERR_END_OF_FILE));
+  }
+  DBUG_PRINT("info", ("Reading bitmap at %lu",
+                      (ulong) info->scan.bitmap_page));
+  if (!(pagecache_read(share->pagecache, &info->s->bitmap.file,
+                       info->scan.bitmap_page,
+                       0, info->scan.bitmap_buff, PAGECACHE_PLAIN_PAGE,
+                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+    DBUG_RETURN(my_errno);
+  /* Skip scanning 'bits' in bitmap scan code */
+  info->scan.bitmap_pos= info->scan.bitmap_buff - 6;
+  info->scan.bits= 0;
+  goto restart_bitmap_scan;
+
+err:
+  DBUG_PRINT("error", ("Wrong data on page"));
+  DBUG_RETURN((my_errno= HA_ERR_WRONG_IN_RECORD));
+}
+
+
+/*
+  Compare a row against a stored one
+
+  NOTES
+    Not implemented, as block record is not supposed to be used in a shared
+    global environment. Both arguments are ignored; always returns 0.
+*/
+
+my_bool _ma_compare_block_record(MARIA_HA *info __attribute__ ((unused)),
+                                 const uchar *record __attribute__ ((unused)))
+{
+  return 0;
+}
+
+
+/*
+  Store an integer with simple packing
+
+  SYNOPSIS
+    ma_store_length()
+    to                  Store the packed integer here
+    nr                  Integer to store
+
+  NOTES
+    Mostly used for field numbers and string lengths. Packing used is:
+    nr < 251            1 byte:  nr
+    nr <= 255           2 bytes: 251, nr
+    nr < 65536          3 bytes: 252, nr stored with int2store()
+    nr < 16777216       4 bytes: 253, nr stored with int3store()
+    otherwise           5 bytes: 254, nr stored with int4store()
+    Marker 255 (data as ulonglong) is not yet done.
+
+  RETURN
+    Position in 'to' after the packed length
+*/
+
+uchar *ma_store_length(uchar *to, ulong nr)
+{
+  uchar *payload= to + 1;                       /* Bytes after the marker */
+  if (nr < 251)
+  {
+    to[0]= (uchar) nr;
+    return payload;
+  }
+  if (nr <= 255)
+  {
+    to[0]= (uchar) 251;
+    payload[0]= (uchar) nr;
+    return payload + 1;
+  }
+  if (nr < 65536)
+  {
+    to[0]= (uchar) 252;
+    int2store(payload, nr);
+    return payload + 2;
+  }
+  if (nr < 16777216)
+  {
+    to[0]= (uchar) 253;
+    int3store(payload, nr);
+    return payload + 3;
+  }
+  to[0]= (uchar) 254;
+  int4store(payload, nr);
+  return payload + 4;
+}
+
+
+/*
+  Calculate how many bytes ma_store_length() needs to store a number
+*/
+
+uint ma_calc_length_for_store_length(ulong nr)
+{
+  if (nr >= 16777216)
+    return 5;
+  if (nr >= 65536)
+    return 4;
+  if (nr > 255)
+    return 3;
+  if (nr >= 251)
+    return 2;
+  return 1;
+}
+
+
+/*
+  Retrieve a stored number
+
+  SYNOPSIS
+    ma_get_length()
+    packet              In:  pointer to the first byte of the packed number
+                        Out: pointer to the first byte after it
+
+  NOTES
+    First byte < 251 is the number itself. 251, 252, 253 and 254 are
+    followed by 1, 2, 3 and 4 data bytes. Any other first byte is not
+    expected (asserted in debug builds) and is read as if it was 254.
+
+  RETURN
+    The stored number
+*/
+
+static ulong ma_get_length(const uchar **packet)
+{
+  const uchar *pos=  *packet;
+  const uchar *data= pos + 1;                   /* Data after marker byte */
+
+  switch (*pos) {
+  case 251:
+    *packet= pos + 2;
+    return (ulong) *data;
+  case 252:
+    *packet= pos + 3;
+    return (ulong) uint2korr(data);
+  case 253:
+    *packet= pos + 4;
+    return (ulong) uint3korr(data);
+  default:
+    break;
+  }
+  if (*pos < 251)
+  {
+    /* One byte number */
+    *packet= data;
+    return (ulong) *pos;
+  }
+  DBUG_ASSERT(*pos == 254);
+  *packet= pos + 5;
+  return (ulong) uint4korr(data);
+}
+
+
+/*
+  Fill array with pointers to field parts to be stored in log for insert
+
+  SYNOPSIS
+    fill_insert_undo_parts()
+    info                Maria handler
+    record              Inserted row
+    log_parts           Store pointers to changed memory areas here
+    log_parts_count     See RETURN
+
+  NOTES
+    We have information in info->cur_row about the read row.
+
+    Parts are added in this order:
+    - null bits (share->base.null_bytes bytes, taken from 'record')
+    - bitmap over empty fields (share->base.pack_bytes bytes)
+    - if share->base.max_field_lengths is set: a 2 byte length followed by
+      the field lengths. The 2 bytes just before
+      info->cur_row.field_lengths are overwritten with this length.
+    - if share->base.blobs is set: total blob length, packed with
+      ma_store_length() into info->length_buff
+    - one part for each fixed length not null field
+    - one part for each of the other non blob fields that is neither NULL
+      nor marked as empty
+    - one part for each blob with length > 0
+
+    Only pointers and lengths are stored; no row data is copied.
+    There is no check here that log_parts has room for all parts.
+
+  RETURN
+    length of data in log_parts.
+    log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_insert_undo_parts(MARIA_HA *info, const uchar *record,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  uchar *field_lengths= info->cur_row.field_lengths;
+  size_t row_length;
+  MARIA_ROW *cur_row= &info->cur_row;
+  LEX_CUSTRING *start_log_parts;
+  DBUG_ENTER("fill_insert_undo_parts");
+
+  start_log_parts= log_parts;
+
+  /* Store null bits */
+  log_parts->str=      record;
+  log_parts->length=   share->base.null_bytes;
+  row_length=          log_parts->length;
+  log_parts++;
+
+  /* Stored bitmap over packed (zero length or all-zero fields) */
+  log_parts->str= info->cur_row.empty_bits;
+  log_parts->length= share->base.pack_bytes;
+  row_length+=       log_parts->length;
+  log_parts++;
+
+  if (share->base.max_field_lengths)
+  {
+    /*
+      Store length of all not empty char, varchar and blob fields.
+      The part starts 2 bytes before field_lengths; these 2 bytes get the
+      length of the field length data.
+    */
+    log_parts->str= field_lengths - 2;
+    log_parts->length=   info->cur_row.field_lengths_length+2;
+    int2store(log_parts->str, info->cur_row.field_lengths_length);
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  if (share->base.blobs)
+  {
+    /*
+      Store total blob length to make buffer allocation easier during UNDO
+     */
+    log_parts->str=  info->length_buff;
+    log_parts->length= (uint) (ma_store_length(info->length_buff,
+                                                 info->cur_row.blob_length) -
+                                 (uchar*) log_parts->str);
+    row_length+=          log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle constant length fields that are always present */
+  for (column= share->columndef,
+       end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    log_parts->str= record + column->offset;
+    log_parts->length= column->length;
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  /*
+    Handle NULL fields and CHAR/VARCHAR fields.
+    'column' continues from where the previous loop stopped.
+  */
+  for (end_column= share->columndef + share->base.fields - share->base.blobs;
+       column < end_column;
+       column++)
+  {
+    const uchar *column_pos;
+    size_t column_length;
+    if ((record[column->null_pos] & column->null_bit) ||
+        cur_row->empty_bits[column->empty_pos] & column->empty_bit)
+      continue;                                 /* NULL or empty; no part */
+
+    column_pos=    record+ column->offset;
+    column_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /* Length is taken from field_lengths; 1 or 2 bytes */
+      if (column->length <= 255)
+        column_length= *field_lengths++;
+      else
+      {
+        column_length= uint2korr(field_lengths);
+        field_lengths+= 2;
+      }
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      if (column->fill_length == 1)
+        column_length= *field_lengths;
+      else
+        column_length= uint2korr(field_lengths);
+      field_lengths+= column->fill_length;
+      column_pos+= column->fill_length;         /* Data is after fill_length */
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+    log_parts->str= column_pos;
+    log_parts->length= column_length;
+    row_length+= log_parts->length;
+    log_parts++;
+  }
+
+  /* Add blobs */
+  for (end_column+= share->base.blobs; column < end_column; column++)
+  {
+    const uchar *field_pos= record + column->offset;
+    uint size_length= column->length - portable_sizeof_char_ptr;
+    ulong blob_length= _ma_calc_blob_length(size_length, field_pos);
+
+    /*
+      We don't have to check for null, as blob_length is guaranteed to be 0
+      if the blob is null
+    */
+    if (blob_length)
+    {
+      uchar *blob_pos;
+      /* The pointer to the blob data is stored after the length bytes */
+      memcpy_fixed(&blob_pos, record + column->offset + size_length,
+                   sizeof(blob_pos));
+      log_parts->str= blob_pos;
+      log_parts->length= blob_length;
+      row_length+= log_parts->length;
+      log_parts++;
+    }
+  }
+  *log_parts_count= (uint) (log_parts - start_log_parts);
+  DBUG_RETURN(row_length);
+}
+
+
+/*
+   Fill array with pointers to field parts to be stored in log for update
+
+  SYNOPSIS
+    fill_update_undo_parts()
+    info                Maria handler
+    oldrec		Original row
+    newrec              New row
+    log_parts           Store pointers to changed memory areas here
+    log_parts_count     See RETURN
+
+  IMPLEMENTATION
+    Format of undo record:
+
+    Fields are stored in same order as the field array.
+
+    Offset to changed field data (packed)
+
+    For each changed field
+      Fieldnumber (packed)
+      Length, if variable length field (packed)
+
+    For each changed field
+     Data
+
+   Packing is using ma_store_length()
+
+   The reason we store field numbers & length separated from data (ie, not
+   after each other) is to get better cpu caching when we loop over
+   fields (as we probably don't have to access data for each field when we
+   want to read an old row through the undo log record).
+
+   As a special case, we use '255' for the field number of the null bitmap.
+
+   The field numbers and lengths are built in info->update_field_data and
+   become the first log part. The other log parts point directly into
+   oldrec (or to the old blob data); the old values are not copied.
+   The row information for the old row is taken from info->cur_row and
+   for the new row from info->new_row.
+   There is no check here that log_parts has room for all parts.
+
+  RETURN
+    length of data in log_parts.
+    log_parts_count contains number of used log_parts
+*/
+
+static size_t fill_update_undo_parts(MARIA_HA *info, const uchar *oldrec,
+                                     const uchar *newrec,
+                                     LEX_CUSTRING *log_parts,
+                                     uint *log_parts_count)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_ROW *old_row= &info->cur_row, *new_row= &info->new_row;
+  uchar *field_data, *start_field_data, *length_str;
+  uchar *old_field_lengths= old_row->field_lengths;
+  uchar *new_field_lengths= new_row->field_lengths;
+  size_t row_length= 0;
+  uint field_lengths;
+  LEX_CUSTRING *start_log_parts;
+  my_bool new_column_is_empty;
+  DBUG_ENTER("fill_update_undo_parts");
+
+  start_log_parts= log_parts;
+
+  /*
+    First log part is for number of fields, field numbers and lengths
+    The +4 is to reserve place for the packed length of this data, which is
+    stored just in front of it at the end of this function.
+  */
+  start_field_data= field_data= info->update_field_data + 4;
+  log_parts++;
+
+  if (memcmp(oldrec, newrec, share->base.null_bytes))
+  {
+    /* Store changed null bits */
+    *field_data++=       (uchar) 255;           /* Special case */
+    log_parts->str=      oldrec;
+    log_parts->length=   share->base.null_bytes;
+    row_length=          log_parts->length;
+    log_parts++;
+  }
+
+  /* Handle constant length fields */
+  for (column= share->columndef,
+       end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    if (memcmp(oldrec + column->offset, newrec + column->offset,
+               column->length))
+    {
+      /* Changed; log field number and point at the old value */
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      log_parts->str= oldrec + column->offset;
+      log_parts->length= column->length;
+      row_length+=       column->length;
+      log_parts++;
+    }
+  }
+
+  /* Handle the rest: NULL fields and CHAR/VARCHAR fields and BLOB's */
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column;
+       column++)
+  {
+    const uchar *new_column_pos, *old_column_pos;
+    size_t new_column_length, old_column_length;
+
+    /* First check if old column is null or empty */
+    if (oldrec[column->null_pos] & column->null_bit)
+    {
+      /*
+        It's safe to skip this one as either the new column is also null
+        (no change) or the new_column is not null, in which case the null-bit
+        maps differed and we have already stored the null bitmap.
+      */
+      continue;
+    }
+    if (old_row->empty_bits[column->empty_pos] & column->empty_bit)
+    {
+      if (new_row->empty_bits[column->empty_pos] & column->empty_bit)
+        continue;                               /* Both are empty; skip */
+
+      /* Store null length column; field number + length 0, no data part */
+      field_data= ma_store_length(field_data,
+                                  (uint) (column - share->columndef));
+      field_data= ma_store_length(field_data, 0);
+      continue;
+    }
+    /*
+      Remember if the 'new' value is empty (as in this case we must always
+      log the original value)
+    */
+    new_column_is_empty= ((newrec[column->null_pos] & column->null_bit) ||
+                          (new_row->empty_bits[column->empty_pos] &
+                           column->empty_bit));
+
+    old_column_pos=      oldrec + column->offset;
+    new_column_pos=      newrec + column->offset;
+    old_column_length= new_column_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      break;
+    case FIELD_VARCHAR:
+      new_column_length--;                      /* Skip length prefix */
+      old_column_pos+= column->fill_length;
+      new_column_pos+= column->fill_length;
+      /* Fall through */
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      /*
+        Get the lengths from the field length arrays (1 or 2 bytes each).
+        The new row's array is only advanced if the new value is not empty.
+      */
+      if (new_column_length <= 255)
+      {
+        old_column_length= *old_field_lengths++;
+        if (!new_column_is_empty)
+          new_column_length= *new_field_lengths++;
+      }
+      else
+      {
+        old_column_length= uint2korr(old_field_lengths);
+        old_field_lengths+= 2;
+        if (!new_column_is_empty)
+        {
+          new_column_length= uint2korr(new_field_lengths);
+          new_field_lengths+= 2;
+        }
+      }
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      /* Length is in the record; the data pointer is stored after it */
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      old_column_length= _ma_calc_blob_length(size_length, old_column_pos);
+      memcpy_fixed((uchar*) &old_column_pos,
+                   oldrec + column->offset + size_length,
+                   sizeof(old_column_pos));
+      if (!new_column_is_empty)
+      {
+        new_column_length= _ma_calc_blob_length(size_length, new_column_pos);
+        memcpy_fixed((uchar*) &new_column_pos,
+                     newrec + column->offset + size_length,
+                     sizeof(old_column_pos));   /* Same type as new_column_pos */
+      }
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+
+    if (new_column_is_empty || new_column_length != old_column_length ||
+        memcmp(old_column_pos, new_column_pos, new_column_length))
+    {
+      /* Changed; log field number, old length and point at the old value */
+      field_data= ma_store_length(field_data,
+                                  (ulong) (column - share->columndef));
+      field_data= ma_store_length(field_data, (ulong) old_column_length);
+
+      log_parts->str=     old_column_pos;
+      log_parts->length=  old_column_length;
+      row_length+=        old_column_length;
+      log_parts++;
+    }
+  }
+
+  *log_parts_count= (uint) (log_parts - start_log_parts);
+
+  /* Store length of field length data before the field/field_lengths */
+  field_lengths= (uint) (field_data - start_field_data);
+  length_str= start_field_data - ma_calc_length_for_store_length(field_lengths);
+  start_log_parts->str= length_str;
+  ma_store_length(length_str, field_lengths);
+  start_log_parts->length= (size_t) (field_data - start_log_parts->str);
+  row_length+= start_log_parts->length;
+  DBUG_RETURN(row_length);
+}
+
+/***************************************************************************
+  In-write hooks called under log's lock when log record is written
+***************************************************************************/
+
+/**
+   @brief Sets transaction's rec_lsn if needed
+
+   A REDO can be written before the page it describes is in the pagecache
+   (example: brand new head or tail pages; full pages). If a Checkpoint
+   happens just after such a REDO write, scanning the pagecache cannot tell
+   it that the REDO phase must start before this REDO, as the page is not
+   in the cache. So the transaction sets its rec_lsn to the REDO's LSN or
+   somewhere before, and Checkpoint reads the transaction's rec_lsn.
+
+   @param  type      Type of log record (not used)
+   @param  trn       Transaction that wrote the record; its trid must not
+                     be 0
+   @param  tbl_info  Maria handler (not used)
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Not used
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_redo(enum translog_record_type type
+                            __attribute__ ((unused)),
+                            TRN *trn, MARIA_HA *tbl_info
+                            __attribute__ ((unused)),
+                            LSN *lsn, void *hook_arg
+                            __attribute__ ((unused)))
+{
+  /*
+    dummy_transaction_object is used by many threads (like those
+    manipulating non-transactional tables), so its users must keep this
+    TRN clean. It might be dangerous if one user sets rec_lsn or some other
+    member and another user picks it up (like putting this rec_lsn into a
+    page of a non-transactional table); it's safer if all members stay 0.
+    So non-transactional log records (REPAIR, CREATE, RENAME, DROP) should
+    not call this hook; we trust them but verify ;)
+  */
+  DBUG_ASSERT(trn->trid != 0);
+  /*
+    If the hook stays this simple, it would be faster to not have a hook
+    and instead let translog_write_record() store the LSN itself (in
+    trn->rec_lsn when that is still 0, otherwise in some dummy LSN), like
+    Monty did in his original code. For now we keep it like this.
+  */
+  if (!trn->rec_lsn)
+    trn->rec_lsn= *lsn;
+  return 0;
+}
+
+
+/**
+   @brief Sets transaction's undo_lsn, first_undo_lsn if needed
+
+   undo_lsn is always set to the LSN of the written record. The LSN part
+   of first_undo_lsn is only set if it is still 0; the flags stored in
+   first_undo_lsn are kept.
+
+   When we implement purging, we will specialize this hook: UNDO_PURGE
+   records will additionally set trn->undo_purge_lsn
+
+   @param  type      Type of log record (not used)
+   @param  trn       Transaction that wrote the record; its trid must not
+                     be 0
+   @param  tbl_info  Maria handler (not used)
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Not used
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo(enum translog_record_type type
+                            __attribute__ ((unused)),
+                            TRN *trn, MARIA_HA *tbl_info
+                            __attribute__ ((unused)),
+                            LSN *lsn, void *hook_arg
+                            __attribute__ ((unused)))
+{
+  DBUG_ASSERT(trn->trid != 0);
+
+  trn->undo_lsn= *lsn;
+  if (unlikely(LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn) == 0))
+  {
+    /* First UNDO of the transaction; remember its LSN, keep the flags */
+    trn->first_undo_lsn=
+      trn->undo_lsn | LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+  }
+  return 0;
+}
+
+
+/**
+   @brief Sets the table's records count and checksum and others to 0, then
+   calls the generic REDO hook.
+
+   The reset is done by _ma_reset_status() on tbl_info; everything else is
+   left to write_hook_for_redo().
+
+   @param  type      Type of log record; passed on to the generic hook
+   @param  trn       Transaction that wrote the record
+   @param  tbl_info  Maria handler of the table that is emptied
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Passed on to the generic hook
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_redo_delete_all(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  /* tbl_info is used here, so it must not be marked as unused */
+  _ma_reset_status(tbl_info);
+  return write_hook_for_redo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+   @brief Updates "records" and "checksum" and calls the generic UNDO hook
+
+   @param  type      Type of log record; passed on to the generic hook
+   @param  trn       Transaction that wrote the record
+   @param  tbl_info  Maria handler of the table the row was inserted into
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Pointer to a ha_checksum that is added to the table's
+                     checksum
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_insert(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  /* One more row in the table */
+  tbl_info->s->state.state.records+= 1;
+  tbl_info->s->state.state.checksum+= *(ha_checksum *) hook_arg;
+
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+   @brief Updates "records" and "checksum" and calls the generic UNDO hook
+
+   @param  type      Type of log record; passed on to the generic hook
+   @param  trn       Transaction that wrote the record
+   @param  tbl_info  Maria handler of the table the row was deleted from
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Pointer to a ha_checksum that is added to the table's
+                     checksum
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_delete(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  /* One row less in the table */
+  tbl_info->s->state.state.records-= 1;
+  tbl_info->s->state.state.checksum+= *(ha_checksum *) hook_arg;
+
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+   @brief Updates "checksum" and calls the generic UNDO hook
+
+   The number of records is not changed by an update.
+
+   @param  type      Type of log record; passed on to the generic hook
+   @param  trn       Transaction that wrote the record
+   @param  tbl_info  Maria handler of the table the row was updated in
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Pointer to a ha_checksum that is added to the table's
+                     checksum
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_row_update(enum translog_record_type type
+                                       __attribute__ ((unused)),
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  tbl_info->s->state.state.checksum+= *(ha_checksum *) hook_arg;
+
+  return write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+   @brief Resets the table's status and calls the generic UNDO hook
+
+   @param  type      Type of log record; passed on to the generic hook
+   @param  trn       Transaction that wrote the record
+   @param  tbl_info  Maria handler of the table that is bulk inserted into
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Passed on to the generic hook
+
+   @return Operation status, as returned by write_hook_for_undo()
+*/
+
+my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type
+                                        __attribute__ ((unused)),
+                                        TRN *trn, MARIA_HA *tbl_info,
+                                        LSN *lsn, void *hook_arg)
+{
+  my_bool res;
+  /*
+    maria_delete_all_rows() is going to be called, but as an optimization
+    without logging and syncing (a crash before commit is handled by the
+    UNDO, which will empty; for a crash after commit we have flushed and
+    forced the files).
+    The status reset is still done here, under the log mutex, in case of a
+    concurrent checkpoint.
+  */
+  _ma_reset_status(tbl_info);
+  res= write_hook_for_undo(type, trn, tbl_info, lsn, hook_arg);
+  return res;
+}
+
+
+/**
+   @brief Updates table's lsn_of_file_id.
+
+   In debug builds it is asserted that the new LSN is bigger than the one
+   already stored in the share.
+
+   @param  type      Type of log record (not used)
+   @param  trn       Transaction (not used)
+   @param  tbl_info  Maria handler of the table the file id belongs to
+   @param  lsn       LSN of the written record
+   @param  hook_arg  Not used
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_file_id(enum translog_record_type type
+                               __attribute__ ((unused)),
+                               TRN *trn
+                               __attribute__ ((unused)),
+                               MARIA_HA *tbl_info,
+                               LSN *lsn,
+                               void *hook_arg
+                               __attribute__ ((unused)))
+{
+  const LSN new_lsn= *lsn;
+
+  DBUG_ASSERT(cmp_translog_addr(tbl_info->s->lsn_of_file_id, new_lsn) < 0);
+  tbl_info->s->lsn_of_file_id= new_lsn;
+  return 0;
+}
+
+
+/**
+   Updates transaction's rec_lsn when committing.
+
+   The commit record is written before the transaction is committed in
+   trnman. If a Checkpoint happens just between the COMMIT record log write
+   and the commit in trnman, it will record that the transaction is not
+   committed. Example of what could then go wrong:
+   - trn1 did an INSERT and wrote its COMMIT record
+   - Checkpoint happens
+   - trn2 does a DELETE of what trn1 has inserted
+   - Crash
+   The Checkpoint record says that trn1 was not committed, and the REDO
+   phase starts from the Checkpoint record's LSN. So it will not find the
+   COMMIT record of trn1 and will want to roll back trn1, which will fail
+   because the row/key which it wants to delete does not exist anymore.
+   To avoid this, Checkpoint needs to know that the REDO phase must start
+   before this COMMIT record. So the transaction sets its rec_lsn to the
+   COMMIT record's LSN, and as Checkpoint reads the transaction's rec_lsn,
+   Checkpoint will know.
+
+   @note so after commit trn->rec_lsn is a "commit LSN", which could be of
+   use later.
+
+   @param  type      Type of log record (not used)
+   @param  trn       Transaction that is committing
+   @param  tbl_info  Maria handler (not used)
+   @param  lsn       LSN of the COMMIT record
+   @param  hook_arg  Not used
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_commit(enum translog_record_type type
+                              __attribute__ ((unused)),
+                              TRN *trn,
+                              MARIA_HA *tbl_info
+                              __attribute__ ((unused)),
+                              LSN *lsn,
+                              void *hook_arg
+                              __attribute__ ((unused)))
+{
+  const LSN commit_lsn= *lsn;
+
+  /* Unlike the REDO hook, this overwrites any rec_lsn that was set */
+  trn->rec_lsn= commit_lsn;
+  return 0;
+}
+
+
+/***************************************************************************
+  Applying of REDO log records
+***************************************************************************/
+
+/*
+  Apply changes to head and tail pages
+
+  SYNOPSIS
+    _ma_apply_redo_insert_row_head_or_tail()
+    info		Maria handler
+    lsn			LSN to put on page
+    page_type		HEAD_PAGE or TAIL_PAGE
+    new_page		True if this is first entry on page
+    header		Header (without FILEID)
+    data		Data to be put on page
+    data_length		Length of data
+
+  NOTE
+    Handles LOGREC_REDO_INSERT_ROW_HEAD, LOGREC_REDO_INSERT_ROW_TAIL
+    LOGREC_REDO_NEW_ROW_HEAD and LOGREC_REDO_NEW_ROW_TAIL
+
+    The page number and the row number are read from 'header'.
+
+    If the page is after the current end of the data file, the page is
+    created in info->keyread_buff and written to the page cache. This is
+    only accepted for rownr 0 with new_page set.
+    Otherwise the page is read with a write lock. If its LSN is >= lsn, the
+    change is already applied; then only the bitmap is updated and the page
+    is unlocked.
+
+    When the change is applied, the page is not stamped with 'lsn' here.
+    It is left locked and pinned and is added to info->pinned_pages.
+
+    On all errors the table is marked as crashed.
+
+  RETURN
+    0   ok
+    #   Error number
+*/
+
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                            uint page_type,
+                                            my_bool new_page,
+                                            const uchar *header,
+                                            const uchar *data,
+                                            size_t data_length)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uint      rec_offset;
+  uchar      *buff, *dir;
+  uint      result;
+  MARIA_PINNED_PAGE page_link;
+  enum pagecache_page_lock unlock_method;
+  enum pagecache_page_pin unpin_method;
+  my_off_t end_of_page;
+  uint error;
+  DBUG_ENTER("_ma_apply_redo_insert_row_head_or_tail");
+
+  page=  page_korr(header);
+  rownr= dirpos_korr(header + PAGE_STORE_SIZE);
+
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u  data_length: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr, (uint) data_length));
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  end_of_page= (page + 1) * share->block_size;
+  if (end_of_page > share->state.state.data_file_length)
+  {
+    DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
+                        (ulong) share->state.state.data_file_length,
+                        (ulong) end_of_page));
+    /*
+      New page at end of file. Note that the test above is also positive if
+      data_file_length is not a multiple of block_size (system crashed while
+      writing the last page): in this case we just extend the last page and
+      fill it entirely with zeroes, then the REDO will put correct data on
+      it.
+    */
+    unlock_method= PAGECACHE_LOCK_WRITE;
+    unpin_method=  PAGECACHE_PIN;
+
+    DBUG_ASSERT(rownr == 0 && new_page);
+    if (rownr != 0 || !new_page)
+      goto crashed_file;
+
+    buff= info->keyread_buff;
+    info->keyread_buff_used= 1;
+    make_empty_page(info, buff, page_type, 1);
+    empty_space= (block_size - PAGE_OVERHEAD_SIZE);
+    rec_offset= PAGE_HEADER_SIZE;
+    dir= buff+ block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+  }
+  else
+  {
+    unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    unpin_method=  PAGECACHE_PIN_LEFT_PINNED;
+
+    share->pagecache->readwrite_flags&= ~MY_WME;  /* MY_WME off for read */
+    buff= pagecache_read(share->pagecache, &info->dfile,
+                         page, 0, 0,
+                         PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                         &page_link.link);
+    share->pagecache->readwrite_flags= share->pagecache->org_readwrite_flags;
+    if (!buff)
+    {
+      /* Skip errors when reading outside of file and uninitialized pages */
+      if (!new_page || (my_errno != HA_ERR_FILE_TOO_SHORT &&
+                        my_errno != HA_ERR_WRONG_CRC))
+      {
+        DBUG_PRINT("error", ("Error %d when reading page", (int) my_errno));
+        goto err;
+      }
+      /* Create new page */
+      buff= pagecache_block_link_to_buffer(page_link.link);
+      buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+    }
+    else if (lsn_korr(buff) >= lsn)           /* Test if already applied */
+    {
+      /* Fix bitmap, just in case */
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+      if (!enough_free_entries_on_page(share, buff))
+        empty_space= 0;                         /* Page is full */
+
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+        goto err;
+      pagecache_unlock_by_link(share->pagecache, page_link.link,
+                               PAGECACHE_LOCK_WRITE_UNLOCK,
+                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                               LSN_IMPOSSIBLE, 0, FALSE);
+      DBUG_RETURN(0);
+    }
+
+    if (((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) != page_type))
+    {
+      /*
+        This is a page that has been freed before and now should be
+        changed to new type.
+      */
+      if (!new_page)
+      {
+        DBUG_PRINT("error",
+                   ("Found page of wrong type: %u, should have been %u",
+                    (uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK),
+                    page_type));
+        goto crashed_file;
+      }
+      make_empty_page(info, buff, page_type, 0);
+      empty_space= block_size - PAGE_HEADER_SIZE - PAGE_SUFFIX_SIZE;
+      (void) extend_directory(page_type == HEAD_PAGE ? info: 0, buff,
+                              block_size, 0, rownr, &empty_space);
+      rec_offset= PAGE_HEADER_SIZE;
+      dir= dir_entry_pos(buff, block_size, rownr);
+      empty_space+= uint2korr(dir+2);
+    }
+    else
+    {
+      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+      uint length;
+
+      DBUG_ASSERT(!new_page);
+      dir= dir_entry_pos(buff, block_size, rownr);
+      empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+      if (max_entry <= rownr)
+      {
+        /* Add directory entry first in directory and data last on page */
+        if (extend_directory(page_type == HEAD_PAGE ? info : 0, buff,
+                             block_size, max_entry, rownr, &empty_space))
+          goto crashed_file;
+      }
+      if (extend_area_on_page(page_type == HEAD_PAGE ? info : 0, buff,
+                              dir, rownr, block_size,
+                              (uint) data_length, &empty_space,
+                              &rec_offset, &length))
+        goto crashed_file;
+    }
+  }
+  /* Copy data and update length in directory entry and empty space */
+  int2store(dir+2, data_length);
+  memcpy(buff + rec_offset, data, data_length);
+  empty_space-= (uint) data_length;
+  int2store(buff + EMPTY_SPACE_OFFSET, empty_space);
+
+  /*
+    If page was not read before, write it but keep it pinned.
+    We don't update its LSN. When we have processed all REDOs for this page
+    in the current REDO's group, we will stamp page with UNDO's LSN
+    (if we stamped it now, a next REDO, in
+    this group, for this page, would be skipped) and unpin then.
+    A failing write is remembered in 'result' and returned at the end.
+  */
+  result= 0;
+  if (unlock_method == PAGECACHE_LOCK_WRITE &&
+      pagecache_write(share->pagecache,
+                      &info->dfile, page, 0,
+                      buff, PAGECACHE_PLAIN_PAGE,
+                      unlock_method, unpin_method,
+                      PAGECACHE_WRITE_DELAY, &page_link.link,
+                      LSN_IMPOSSIBLE))
+    result= my_errno;
+
+  /* Fix bitmap */
+  if (!enough_free_entries_on_page(share, buff))
+    empty_space= 0;                         /* Page is full */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    goto err;
+
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  /*
+    Data page and bitmap page are in place, we can update data_file_length in
+    case we extended the file. We could not do it earlier: bitmap code tests
+    data_file_length to know if it has to create a new page or not.
+  */
+  set_if_bigger(share->state.state.data_file_length, end_of_page);
+  DBUG_RETURN(result);
+
+crashed_file:
+  my_errno= HA_ERR_WRONG_IN_RECORD;
+err:
+  error= my_errno;                              /* Save; restored at return */
+  if (unlock_method == PAGECACHE_LOCK_LEFT_WRITELOCKED) /* Page was read */
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0); /* catch recovery errors early */
+  DBUG_RETURN((my_errno= error));
+}
+
+
+/*
+  Apply LOGREC_REDO_PURGE_ROW_HEAD & LOGREC_REDO_PURGE_ROW_TAIL
+
+  SYNOPSIS
+    _ma_apply_redo_purge_row_head_or_tail()
+    info		Maria handler
+    lsn			LSN to put on page
+    page_type		HEAD_PAGE or TAIL_PAGE
+    header		Header (without FILEID)
+
+  NOTES
+    This function is very similar to delete_head_or_tail()
+
+    The page number and the row number are read from 'header'.
+    The page is read with a write lock. If its LSN is >= lsn, the change is
+    already applied; then at most the bitmap is updated and the page is
+    unlocked.
+    Otherwise the directory entry for the row is deleted, the page is added
+    to info->pinned_pages (still locked) and the bitmap is updated.
+    The page is not stamped with 'lsn' here.
+
+    On errors detected before the page is added to info->pinned_pages, the
+    page is unlocked and the table is marked as crashed.
+    NOTE(review): the error path is also taken when pagecache_read() fails
+    and it then uses page_link.link; this assumes pagecache_read() sets the
+    link also on failure - to be confirmed.
+
+  RETURN
+    0   ok
+    #   Error number
+*/
+
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                           uint page_type,
+                                           const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page;
+  uint      rownr, empty_space;
+  uint      block_size= share->block_size;
+  uchar     *buff;
+  int result;
+  uint error;
+  MARIA_PINNED_PAGE page_link;
+  DBUG_ENTER("_ma_apply_redo_purge_row_head_or_tail");
+
+  page=  page_korr(header);
+  rownr= dirpos_korr(header+PAGE_STORE_SIZE);
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr));
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  if (!(buff= pagecache_read(share->pagecache, &info->dfile,
+                             page, 0, 0,
+                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+    goto err;
+
+  if (lsn_korr(buff) >= lsn)
+  {
+    /*
+      Already applied
+      Note that in case the page is not anymore a head or tail page
+      a future redo will fix the bitmap.
+    */
+    if ((uint) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type)
+    {
+      empty_space= uint2korr(buff+EMPTY_SPACE_OFFSET);
+      if (!enough_free_entries_on_page(share, buff))
+        empty_space= 0;                         /* Page is full */
+      if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE,
+                         empty_space))
+        goto err;
+    }
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+    DBUG_RETURN(0);
+  }
+
+  DBUG_ASSERT((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == (uchar) page_type);
+
+  if (delete_dir_entry(buff, block_size, rownr, &empty_space) < 0)
+  {
+    my_errno= HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+
+  result= 0;
+  if (!enough_free_entries_on_page(share, buff))
+    empty_space= 0;                         /* Page is full */
+  /*
+    This will work even if the page was marked as UNALLOCATED_PAGE.
+    A failure here is returned, but the page stays in info->pinned_pages
+    and the table is not marked as crashed.
+  */
+  if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space))
+    result= my_errno;
+
+  DBUG_RETURN(result);
+
+err:
+  error= my_errno;                              /* Save; restored at return */
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
+  DBUG_RETURN((my_errno= error));
+
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_FREE_BLOCKS
+
+   @param  info            Maria handler
+   @param  lsn             Not used
+   @param  header          Header (without FILEID)
+
+   @note The header holds a count followed by that many (page, page count)
+   pairs; each pair is marked free in the bitmap. No data page is read here.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error (the table has been marked as crashed)
+*/
+
+uint _ma_apply_redo_free_blocks(MARIA_HA *info,
+                                LSN lsn __attribute__((unused)),
+                                const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  uint ranges= pagerange_korr(header);
+  DBUG_ENTER("_ma_apply_redo_free_blocks");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  header+= PAGERANGE_STORE_SIZE;
+  DBUG_ASSERT(ranges > 0);
+
+  for ( ; ranges ; ranges--)
+  {
+    my_bool bitmap_error;
+    uint page_range;
+    pgcache_page_no_t page;
+
+    /* Decode one (first page, number of pages) pair */
+    page= page_korr(header);
+    header+= PAGE_STORE_SIZE;
+    /* The count may carry flag bits (for example for tail pages) */
+    page_range= pagerange_korr(header) & ~(TAIL_BIT | START_EXTENT_BIT);
+    DBUG_ASSERT(page_range > 0);
+    header+= PAGERANGE_STORE_SIZE;
+
+    DBUG_PRINT("info", ("page: %lu  pages: %u", (long) page, page_range));
+
+    /** @todo leave bitmap lock to the bitmap code... */
+    pthread_mutex_lock(&share->bitmap.bitmap_lock);
+    bitmap_error= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page,
+                                                  page_range);
+    pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+
+    if (!bitmap_error)
+      continue;                                 /* Next range */
+
+    _ma_mark_file_crashed(share);
+    DBUG_ASSERT(0);
+    DBUG_RETURN(bitmap_error);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_FREE_HEAD_OR_TAIL
+
+   @param  info            Maria handler
+   @param  lsn             LSN of the log record; the page is only changed
+                           if the LSN stored on the page is smaller
+   @param  header          Header (without FILEID)
+
+   @note The page is marked UNALLOCATED_PAGE (unless already applied) and is
+   always marked free in the bitmap.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error (the table has been marked as crashed)
+*/
+
+uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
+                                      const uchar *header)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page= page_korr(header);
+  MARIA_PINNED_PAGE page_link;
+  uchar *page_buff;
+  my_bool bitmap_error;
+  DBUG_ENTER("_ma_apply_redo_free_head_or_tail");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  page_buff= pagecache_read(share->pagecache,
+                            &info->dfile,
+                            page, 0, 0,
+                            PAGECACHE_PLAIN_PAGE,
+                            PAGECACHE_LOCK_WRITE, &page_link.link);
+  if (!page_buff)
+  {
+    /* Read failed; release the page before giving up */
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+    goto err;
+  }
+
+  if (lsn_korr(page_buff) < lsn)
+  {
+    /* Not yet applied: mark the page as free */
+    page_buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE;
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+    {
+      uint number_of_records= (uint) page_buff[DIR_COUNT_OFFSET];
+      uchar *dir= dir_entry_pos(page_buff, share->block_size,
+                                number_of_records-1);
+      page_buff[DIR_FREE_OFFSET]=  END_OF_DIR_FREE_LIST;
+      bzero(dir, number_of_records * DIR_ENTRY_SIZE);
+    }
+#endif
+
+    /* Register the changed page in info->pinned_pages */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+  else
+  {
+    /* Already applied */
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 0, FALSE);
+  }
+
+  /* The bitmap is updated in both cases */
+  /** @todo leave bitmap lock to the bitmap code... */
+  pthread_mutex_lock(&share->bitmap.bitmap_lock);
+  bitmap_error= _ma_bitmap_reset_full_page_bits(info, &share->bitmap, page, 1);
+  pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+  if (!bitmap_error)
+    DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
+  DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_INSERT_ROW_BLOBS
+
+   @param  info            Maria handler
+   @param  lsn             LSN to put on pages
+   @param  header          Header (with FILEID)
+   @param  redo_lsn        REDO record's LSN
+   @param[out] number_of_blobs Number of blobs found in log record
+   @param[out] number_of_ranges Number of ranges found
+   @param[out] first_page  First page touched
+   @param[out] last_page   Last page touched
+
+   @note Write full pages (full head & blob pages)
+
+   @note The blob data follows the extent descriptions in the record and is
+   laid out page after page. The data pointer must therefore be advanced for
+   every page in every range, also for pages that are skipped because redo
+   is not needed or was already applied; otherwise all following pages
+   would be filled with the wrong part of the blob.
+
+   @note first_page and last_page are only set when 0 is returned.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info,
+                                     LSN lsn, const uchar *header,
+                                     LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page)
+{
+  MARIA_SHARE *share= info->s;
+  const uchar *data;
+  uint      data_size= FULL_PAGE_SIZE(share->block_size);
+  uint      blob_count, ranges;
+  uint16    sid;
+  pgcache_page_no_t first_page2= ULONGLONG_MAX, last_page2= 0;
+  DBUG_ENTER("_ma_apply_redo_insert_row_blobs");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  sid= fileid_korr(header);
+  header+= FILEID_STORE_SIZE;
+  *number_of_ranges= ranges= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+  *number_of_blobs= blob_count= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+  DBUG_ASSERT(ranges >= blob_count);
+
+  /* Blob data starts after all extents and all per-blob sub headers */
+  data= (header + ranges * ROW_EXTENT_SIZE +
+         blob_count * (SUB_RANGE_SIZE + BLOCK_FILLER_SIZE));
+
+  while (blob_count--)
+  {
+    uint sub_ranges, empty_space;
+
+    sub_ranges=  uint2korr(header);
+    header+= SUB_RANGE_SIZE;
+    empty_space= uint2korr(header);
+    header+= BLOCK_FILLER_SIZE;
+    DBUG_ASSERT(sub_ranges <= ranges && empty_space < data_size);
+    ranges-= sub_ranges;
+
+    while (sub_ranges--)
+    {
+      uint i;
+      uint      res;
+      uint      page_range;
+      uint      length;               /* Bytes of 'data' used by current page */
+      pgcache_page_no_t page, start_page;
+      uchar     *buff;
+
+      start_page= page= page_korr(header);
+      header+= PAGE_STORE_SIZE;
+      page_range= pagerange_korr(header);
+      header+= PAGERANGE_STORE_SIZE;
+
+      /*
+        'data' is advanced in the loop increment so that it also moves
+        forward when a page is skipped with 'continue'.
+      */
+      for (i= page_range; i-- > 0 ; page++, data+= length)
+      {
+        MARIA_PINNED_PAGE page_link;
+        enum pagecache_page_lock unlock_method;
+        enum pagecache_page_pin unpin_method;
+
+        set_if_smaller(first_page2, page);
+        set_if_bigger(last_page2, page);
+
+        /* Last page of the blob may be only partly filled */
+        length= data_size;
+        if (i == 0 && sub_ranges == 0)
+          length-= empty_space;
+
+        if (_ma_redo_not_needed_for_page(sid, redo_lsn, page, FALSE))
+          continue;
+
+        if (((page + 1) * share->block_size) >
+            share->state.state.data_file_length)
+        {
+          /* New page or half written page at end of file */
+          DBUG_PRINT("info", ("Enlarging data file from %lu to %lu",
+                              (ulong) share->state.state.data_file_length,
+                              (ulong) ((page + 1 ) * share->block_size)));
+          share->state.state.data_file_length= (page + 1) * share->block_size;
+          buff= info->keyread_buff;
+          info->keyread_buff_used= 1;
+          make_empty_page(info, buff, BLOB_PAGE, 0);
+          unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED;
+          unpin_method=  PAGECACHE_PIN_LEFT_UNPINNED;
+        }
+        else
+        {
+          /* Don't print an error for a failed read; it's handled below */
+          share->pagecache->readwrite_flags&= ~MY_WME;
+          buff= pagecache_read(share->pagecache,
+                               &info->dfile,
+                               page, 0, 0,
+                               PAGECACHE_PLAIN_PAGE,
+                               PAGECACHE_LOCK_WRITE, &page_link.link);
+          share->pagecache->readwrite_flags= share->pagecache->
+            org_readwrite_flags;
+          if (!buff)
+          {
+            if (my_errno != HA_ERR_FILE_TOO_SHORT &&
+                my_errno != HA_ERR_WRONG_CRC)
+            {
+              /* If not read outside of file */
+              pagecache_unlock_by_link(share->pagecache, page_link.link,
+                                       PAGECACHE_LOCK_WRITE_UNLOCK,
+                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                                       LSN_IMPOSSIBLE, 0, FALSE);
+              goto err;
+            }
+            /*
+              Physical file was too short, create new page. It can be that
+              recovery started with a file with N pages, wrote page N+2 into
+              pagecache (increased data_file_length but not physical file
+              length), now reads page N+1: the read fails.
+            */
+            buff= pagecache_block_link_to_buffer(page_link.link);
+            make_empty_page(info, buff, BLOB_PAGE, 0);
+          }
+          else
+          {
+#ifndef DBUG_OFF
+            uchar found_page_type= (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+#endif
+            if (lsn_korr(buff) >= lsn)
+            {
+              /* Already applied */
+              DBUG_PRINT("info", ("already applied %llu >= %llu",
+                                  lsn_korr(buff), lsn));
+              pagecache_unlock_by_link(share->pagecache, page_link.link,
+                                       PAGECACHE_LOCK_WRITE_UNLOCK,
+                                       PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                                       LSN_IMPOSSIBLE, 0, FALSE);
+              continue;
+            }
+            DBUG_ASSERT((found_page_type == (uchar) BLOB_PAGE) ||
+                        (found_page_type == (uchar) UNALLOCATED_PAGE));
+          }
+          unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK;
+          unpin_method=  PAGECACHE_UNPIN;
+        }
+
+        /*
+          Blob pages are never updated twice in same redo-undo chain, so
+          it's safe to update lsn for them here
+        */
+        lsn_store(buff, lsn);
+        buff[PAGE_TYPE_OFFSET]= BLOB_PAGE;
+
+        if (length != data_size)
+        {
+          /*
+            Last page may be only partly filled. We zero the rest, like
+            write_full_pages() does.
+          */
+          bzero(buff + share->block_size - PAGE_SUFFIX_SIZE - empty_space,
+                empty_space);
+        }
+        memcpy(buff+ PAGE_TYPE_OFFSET + 1, data, length);
+        if (pagecache_write(share->pagecache,
+                            &info->dfile, page, 0,
+                            buff, PAGECACHE_PLAIN_PAGE,
+                            unlock_method, unpin_method,
+                            PAGECACHE_WRITE_DELAY, 0, LSN_IMPOSSIBLE))
+          goto err;
+      }
+      /* Mark the whole range as used, also if some pages were skipped */
+      /** @todo leave bitmap lock to the bitmap code... */
+      pthread_mutex_lock(&share->bitmap.bitmap_lock);
+      res= _ma_bitmap_set_full_page_bits(info, &share->bitmap, start_page,
+                                         page_range);
+      pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+      if (res)
+        goto err;
+    }
+  }
+  *first_page= first_page2;
+  *last_page=  last_page2;
+  DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);
+  DBUG_ASSERT(0);
+  DBUG_RETURN(1);
+}
+
+
+/****************************************************************************
+ Applying of UNDO entries
+****************************************************************************/
+
+/**
+  Execute undo of a row insert (delete the inserted row)
+
+  @param  info       Maria handler
+  @param  undo_lsn   LSN of the UNDO record; passed on to _ma_write_clr()
+  @param  header     Log record data: page number, row number and, if the
+                     table has a checksum, the row checksum
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error (the table has been marked as crashed)
+*/
+
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header)
+{
+  pgcache_page_no_t page;
+  uint rownr;
+  uchar *buff;
+  my_bool res;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  ha_checksum checksum;
+  LSN lsn;
+  DBUG_ENTER("_ma_apply_undo_row_insert");
+
+  page=  page_korr(header);
+  header+= PAGE_STORE_SIZE;
+  rownr= dirpos_korr(header);
+  header+= DIRPOS_STORE_SIZE;
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr));
+
+  buff= pagecache_read(share->pagecache,
+                       &info->dfile, page, 0,
+                       0, share->page_type,
+                       PAGECACHE_LOCK_WRITE,
+                       &page_link.link);
+  /*
+    The page is registered in pinned_pages also when the read failed, so
+    that the common exit code below releases it.
+  */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= buff != 0;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  if (!buff)
+    goto err;
+
+  if (read_row_extent_info(info, buff, rownr))
+    goto err;
+
+  _ma_bitmap_flushable(info, 1);
+  if (delete_head_or_tail(info, page, rownr, 1, 1) ||
+      delete_tails(info, info->cur_row.tail_positions))
+    goto err;
+
+  if (info->cur_row.extents_count && free_full_pages(info, &info->cur_row))
+    goto err;
+
+  /* The CLR carries the negated row checksum (if checksums are in use) */
+  checksum= 0;
+  if (share->calc_checksum)
+    checksum= (ha_checksum) 0 - ha_checksum_korr(header);
+  info->last_auto_increment= ~ (ulonglong) 0;
+  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT,
+                    share->calc_checksum != 0, checksum, &lsn, (void*) 0))
+    goto err;
+
+  res= 0;
+end:
+  /* Only true if _ma_bitmap_flushable(info, 1) was reached above */
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+
+err:
+  res= 1;
+  _ma_mark_file_crashed(share);
+  /*
+    'lsn' is only set by a successful _ma_write_clr(). Don't pass an
+    uninitialized value to the unpin code; the table is marked crashed
+    anyway, so no new LSN has to be put on the pages.
+  */
+  lsn= LSN_IMPOSSIBLE;
+  goto end;
+}
+
+
+/**
+  Execute undo of a row delete (insert the row back where it was)
+
+  @param  info            Maria handler
+  @param  undo_lsn        LSN of the UNDO record; passed on to
+                          write_block_record()
+  @param  header          Log record data. Read in this order: page number,
+                          row number, length on head page, extent count,
+                          row checksum (only if the table has a checksum),
+                          extent info, null bits, empty bits, field lengths
+                          (only if max_field_lengths), blob length (only if
+                          the table has blobs) and finally the field data
+  @param  header_length   Not used
+
+  @note If the record buffer can't be allocated, 1 is returned at once.
+  Later errors also mark the table as crashed and unpin all pages.
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t header_length
+                                  __attribute__((unused)))
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_ROW row;
+  MARIA_COLUMNDEF *column, *end_column;
+  MARIA_BITMAP_BLOCKS *blocks;
+  struct st_row_pos_info row_pos;
+  uchar *record;
+  const uchar *null_bits, *field_length_data, *extent_info;
+  pgcache_page_no_t page;
+  ulong *blob_lengths;
+  uint *null_field_lengths, extent_count, rownr, length_on_head_page;
+  DBUG_ENTER("_ma_apply_undo_row_delete");
+
+  /*
+    Use cur row as a base;  We need to make a copy as we will change
+    some buffers to point directly to 'header'
+  */
+  memcpy(&row, &info->cur_row, sizeof(row));
+
+  /* Decode the fixed part of the log record */
+  page=  page_korr(header);
+  header+= PAGE_STORE_SIZE;
+  rownr= dirpos_korr(header);
+  header+= DIRPOS_STORE_SIZE;
+  length_on_head_page= uint2korr(header);
+  header+= 2;
+  extent_count= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) ma_recordpos(page, rownr),
+                       (ulong) page, rownr));
+
+  if (share->calc_checksum)
+  {
+    /*
+      We extract the checksum delta here, saving a recomputation in
+      allocate_and_write_block_record(). It's only an optimization.
+    */
+    row.checksum= (ha_checksum) 0 - ha_checksum_korr(header);
+    header+= HA_CHECKSUM_STORE_SIZE;
+  }
+  /* Remember where the extents are; they are converted to blocks below */
+  extent_info= header;
+  header+= extent_count * ROW_EXTENT_SIZE;
+
+  null_field_lengths= row.null_field_lengths;
+  blob_lengths= row.blob_lengths;
+
+  /*
+    Fill in 'row' (our copy of info->cur_row) with information about the
+    row, like in calc_record_size(), to be used by write_block_record()
+  */
+
+  row.normal_length= row.char_length= row.varchar_length=
+    row.blob_length= row.extents_count= row.field_lengths_length= 0;
+
+  null_bits= header;
+  header+= share->base.null_bytes;
+  /* This will not be changed */
+  row.empty_bits= (uchar*) header;
+  header+= share->base.pack_bytes;
+  if (share->base.max_field_lengths)
+  {
+    /* Two bytes of length followed by the packed field lengths */
+    row.field_lengths_length= uint2korr(header);
+    row.field_lengths= (uchar*) header + 2 ;
+    header+= 2 + row.field_lengths_length;
+  }
+  if (share->base.blobs)
+    row.blob_length= ma_get_length(&header);
+
+  /*
+    We need to build up a record in a temporary buffer. Blob data is not
+    copied; the record gets pointers into 'header' instead.
+  */
+  if (!(record= my_malloc(share->base.reclength, MYF(MY_WME))))
+    DBUG_RETURN(1);
+
+  memcpy(record, null_bits, share->base.null_bytes);
+
+  /* Copy field information from header to record */
+
+  /* Handle constant length fields that are always present */
+  for (column= share->columndef,
+         end_column= column+ share->base.fixed_not_null_fields;
+       column < end_column;
+       column++)
+  {
+    memcpy(record + column->offset, header, column->length);
+    header+= column->length;
+  }
+
+  /*
+    Handle NULL fields and CHAR/VARCHAR fields. 'column' continues from
+    where the previous loop stopped.
+  */
+  field_length_data= row.field_lengths;
+  for (end_column= share->columndef + share->base.fields;
+       column < end_column;
+       column++, null_field_lengths++)
+  {
+    if ((record[column->null_pos] & column->null_bit) ||
+        row.empty_bits[column->empty_pos] & column->empty_bit)
+    {
+      /* NULL or empty field: nothing is stored for it in the log record */
+      if (column->type != FIELD_BLOB)
+        *null_field_lengths= 0;
+      else
+        *blob_lengths++= 0;
+      /* The field content only has to be well defined for the checksum */
+      if (share->calc_checksum)
+        bfill(record + column->offset, column->fill_length,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      continue;
+    }
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+    case FIELD_SKIP_ZERO:                       /* Fixed length field */
+      row.normal_length+= column->length;
+      *null_field_lengths= column->length;
+      memcpy(record + column->offset, header, column->length);
+      header+= column->length;
+      break;
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      uint length;
+      /* Stored length takes one byte for short columns, otherwise two */
+      if (column->length <= 255)
+        length= (uint) *field_length_data++;
+      else
+      {
+        length= uint2korr(field_length_data);
+        field_length_data+= 2;
+      }
+      row.char_length+= length;
+      *null_field_lengths= length;
+      memcpy(record + column->offset, header, length);
+      /* Pad with spaces so that the checksum is calculated on full field */
+      if (share->calc_checksum)
+        bfill(record + column->offset + length, (column->length - length),
+              ' ');
+      header+= length;
+      break;
+    }
+    case FIELD_VARCHAR:
+    {
+      uint length;
+      uchar *field_pos= record + column->offset;
+
+      /* fill_length is the number of length bytes in front of the data */
+      if (column->fill_length == 1)
+      {
+        field_pos[0]= *field_length_data;
+        length= (uint) *field_length_data;
+      }
+      else
+      {
+        field_pos[0]= field_length_data[0];
+        field_pos[1]= field_length_data[1];
+        length= uint2korr(field_length_data);
+      }
+      field_length_data+= column->fill_length;
+      field_pos+= column->fill_length;
+      row.varchar_length+= length;
+      *null_field_lengths= length;
+      memcpy(field_pos, header, length);
+      header+= length;
+      break;
+    }
+    case FIELD_BLOB:
+    {
+      /* Copy length of blob and pointer to blob data to record */
+      uchar *field_pos= record + column->offset;
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      ulong blob_length= _ma_calc_blob_length(size_length, field_length_data);
+
+      memcpy(field_pos, field_length_data, size_length);
+      field_length_data+= size_length;
+      /* The blob is not copied; the record points into the log record */
+      memcpy(field_pos + size_length, &header, sizeof(&header));
+      header+= blob_length;
+      *blob_lengths++= blob_length;
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+  row.head_length= (info->row_base_length +
+                    share->base.fixed_not_null_fields_length +
+                    row.field_lengths_length +
+                    size_to_store_key_length(row.field_lengths_length) +
+                    row.normal_length +
+                    row.char_length + row.varchar_length);
+  row.total_length= (row.head_length + row.blob_length);
+  if (row.total_length < share->base.min_block_length)
+    row.total_length= share->base.min_block_length;
+
+  /*
+    Row is now generated. Now we need to insert record on the original
+    pages with original size on each page.
+  */
+
+  _ma_bitmap_flushable(info, 1);
+  /* Change extent information to be usable by write_block_record() */
+  blocks= &row.insert_blocks;
+  if (extent_to_bitmap_blocks(info, blocks, page, extent_count, extent_info))
+    goto err;
+  blocks->block->org_bitmap_value= _ma_bitmap_get_page_bits(info,
+                                                            &share->bitmap,
+                                                            page);
+  blocks->block->used|= BLOCKUSED_USE_ORG_BITMAP;
+
+  /* Read head page and allocate data for rowid */
+  if (get_rowpos_in_head_or_tail_page(info, blocks->block,
+                                      info->buff,
+                                      length_on_head_page,
+                                      HEAD_PAGE, PAGECACHE_LOCK_WRITE,
+                                      rownr, &row_pos))
+    goto err;
+
+  if (share->calc_checksum)
+  {
+    /* The rebuilt record must match the checksum stored in the log */
+    DBUG_ASSERT(row.checksum == (share->calc_checksum)(info, record));
+  }
+  /* Store same amount of data on head page as on original page */
+  row_pos.length= (length_on_head_page - 
+                   (extent_count + 1 - blocks->count) * ROW_EXTENT_SIZE);
+  set_if_bigger(row_pos.length, share->base.min_block_length);
+  if (write_block_record(info, (uchar*) 0, record, &row,
+                         blocks, blocks->block->org_bitmap_value != 0,
+                         &row_pos, undo_lsn, 0))
+    goto err;
+
+  my_free(record, MYF(0));
+  DBUG_RETURN(0);
+
+err:
+  _ma_mark_file_crashed(share);
+  /* Only true if _ma_bitmap_flushable(info, 1) was not yet undone */
+  if (info->non_flushable_state)
+    _ma_bitmap_flushable(info, -1);
+  _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE);
+  my_free(record, MYF(0));
+  DBUG_RETURN(1);
+}
+
+
+/**
+  Execute undo of a row update
+
+  @fn _ma_apply_undo_row_update()
+
+  @param  info            Maria handler
+  @param  undo_lsn        LSN of the UNDO record; passed on to
+                          _ma_update_at_original_place()
+  @param  header          Log record data. Read in this order: page number,
+                          row number, checksum delta (only if the table has
+                          a checksum), length on head page, extent count,
+                          extent info, the list of changed fields with
+                          their lengths and finally the old field values
+  @param  header_length   Not used
+
+  @note The original row is rebuilt in memory from the current row and the
+  old field values in the log record and is then written back at the
+  original place.
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header,
+                                  size_t header_length
+                                  __attribute__((unused)))
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_RECORD_POS record_pos;
+  const uchar *field_length_data, *field_length_data_end, *extent_info;
+  uchar *current_record, *orig_record;
+  pgcache_page_no_t page;
+  ha_checksum checksum_delta;
+  uint rownr, field_length_header, extent_count, length_on_head_page;
+  int error;
+  DBUG_ENTER("_ma_apply_undo_row_update");
+  /* checksum_delta is only set and used if share->calc_checksum is set */
+  LINT_INIT(checksum_delta);
+
+  page=  page_korr(header);
+  header+= PAGE_STORE_SIZE;
+  rownr= dirpos_korr(header);
+  header+= DIRPOS_STORE_SIZE;
+
+  record_pos= ma_recordpos(page, rownr);
+  DBUG_PRINT("enter", ("rowid: %lu  page: %lu  rownr: %u",
+                       (ulong) record_pos, (ulong) page, rownr));
+
+  if (share->calc_checksum)
+  {
+    checksum_delta= ha_checksum_korr(header);
+    header+= HA_CHECKSUM_STORE_SIZE;
+  }
+  length_on_head_page= uint2korr(header);
+  set_if_bigger(length_on_head_page, share->base.min_block_length);
+  header+= 2;
+  extent_count= pagerange_korr(header);
+  header+= PAGERANGE_STORE_SIZE;
+  extent_info= header;
+  header+= extent_count * ROW_EXTENT_SIZE;
+
+  /*
+    Set header to point to old field values, generated by
+    fill_update_undo_parts()
+  */
+  field_length_header= ma_get_length(&header);
+  field_length_data= (uchar*) header;
+  header+= field_length_header;
+  field_length_data_end= header;
+
+  /*
+    Allocate buffer for current row & original row. Both live in the same
+    allocation, so only current_record is freed at the end.
+  */
+  if (!(current_record= my_malloc(share->base.reclength * 2, MYF(MY_WME))))
+    DBUG_RETURN(1);
+  orig_record= current_record+ share->base.reclength;
+
+  /* Read current record */
+  if (_ma_read_block_record(info, current_record, record_pos))
+    goto err;
+
+  if (*field_length_data == 255)
+  {
+    /* Bitmap changed; the old null bits are stored in the log record */
+    field_length_data++;
+    memcpy(orig_record, header, share->base.null_bytes);
+    header+= share->base.null_bytes;
+  }
+  else
+    memcpy(orig_record, current_record, share->base.null_bytes);
+  bitmap_clear_all(&info->changed_fields);
+
+  /*
+    Walk the list of changed fields. field_length_data gives the field
+    number (and length) and 'header' points at the old value of the field.
+  */
+  while (field_length_data < field_length_data_end)
+  {
+    uint field_nr= ma_get_length(&field_length_data), field_length;
+    MARIA_COLUMNDEF *column= share->columndef + field_nr;
+    uchar *orig_field_pos= orig_record + column->offset;
+
+    bitmap_set_bit(&info->changed_fields, field_nr);
+    if (field_nr >= share->base.fixed_not_null_fields)
+    {
+      /* Only these fields have a length stored in the log record */
+      if (!(field_length= ma_get_length(&field_length_data)))
+      {
+        /* Null field or empty field */
+        bfill(orig_field_pos, column->fill_length,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+        continue;
+      }
+    }
+    else
+      field_length= column->length;
+
+    switch (column->type) {
+    case FIELD_CHECK:
+    case FIELD_NORMAL:                          /* Fixed length field */
+    case FIELD_ZERO:
+    case FIELD_SKIP_PRESPACE:                   /* Not packed */
+      memcpy(orig_field_pos, header, column->length);
+      header+= column->length;
+      break;
+    case FIELD_SKIP_ZERO:                       /* Number */
+    case FIELD_SKIP_ENDSPACE:                   /* CHAR */
+    {
+      uint diff;
+      memcpy(orig_field_pos, header, field_length);
+      /* Fill the rest of the field with spaces (CHAR) or zeros */
+      if ((diff= (column->length - field_length)))
+        bfill(orig_field_pos + column->length - diff, diff,
+              column->type == FIELD_SKIP_ENDSPACE ? ' ' : 0);
+      header+= field_length;
+    }
+    break;
+    case FIELD_VARCHAR:
+      /* Store the length in one or two bytes in front of the data */
+      if (column->length <= 256)
+      {
+        *orig_field_pos++= (uchar) field_length;
+      }
+      else
+      {
+        int2store(orig_field_pos, field_length);
+        orig_field_pos+= 2;
+      }
+      memcpy(orig_field_pos, header, field_length);
+      header+= field_length;
+      break;
+    case FIELD_BLOB:
+    {
+      /* The blob is not copied; the record points into the log record */
+      uint size_length= column->length - portable_sizeof_char_ptr;
+      _ma_store_blob_length(orig_field_pos, size_length, field_length);
+      memcpy_fixed(orig_field_pos + size_length, &header, sizeof(header));
+      header+= field_length;
+      break;
+    }
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+  /* Fields not in changed_fields are taken from the current row */
+  copy_not_changed_fields(info, &info->changed_fields,
+                          orig_record, current_record);
+
+  if (share->calc_checksum)
+  {
+    info->new_row.checksum= checksum_delta +
+      (info->cur_row.checksum= (*share->calc_checksum)(info, orig_record));
+    /* verify that record's content is sane */
+    DBUG_ASSERT(info->new_row.checksum ==
+                (*share->calc_checksum)(info, current_record));
+  }
+
+  info->last_auto_increment= ~ (ulonglong) 0;
+  /* Now records are up to date, execute the update to original values */
+  if (_ma_update_at_original_place(info, page, rownr, length_on_head_page,
+                                   extent_count, extent_info,
+                                   current_record, orig_record, undo_lsn))
+    goto err;
+
+  error= 0;
+end:
+  my_free(current_record, MYF(0));
+  DBUG_RETURN(error);
+
+err:
+  error= 1;
+  _ma_mark_file_crashed(share);
+  goto end;
+}
+
+
+/**
+  Execute undo of a bulk insert which used repair
+
+  @param  info       Maria handler
+  @param  undo_lsn   LSN of the UNDO record; passed on to _ma_write_clr()
+
+  @note The steps are done in order and the first one that fails stops
+  the undo.
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn)
+{
+  my_bool failed= 1;
+  LSN clr_lsn;
+  DBUG_ENTER("_ma_apply_undo_bulk_insert");
+
+  /* Remove everything the bulk insert added */
+  if (maria_delete_all_rows(info))
+    goto done;
+
+  /* Bulk insert had disabled the non-unique indices; re-enable them */
+  if (maria_enable_indexes(info))
+    goto done;
+
+  /* Indices were enabled, so the full state info has to be written */
+  if (_ma_state_info_write(info->s,
+                           MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                           MA_STATE_INFO_WRITE_FULL_INFO |
+                           MA_STATE_INFO_WRITE_LOCK))
+    goto done;
+
+  /* Log that the undo has been executed */
+  if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_BULK_INSERT,
+                    FALSE, 0, &clr_lsn, NULL))
+    goto done;
+
+  failed= 0;
+
+done:
+  DBUG_RETURN(failed);
+}
+
+
+/**
+  @brief Get the TRANSLOG_ADDRESS to flush up to
+
+  @param page            Page's content
+  @param page_no         Page's number (<offset>/<page length>); not used
+  @param data_ptr        Callback data pointer (pointer to MARIA_SHARE);
+                         only used for sanity checks in debug builds
+
+  @note
+  Usable for data (non-bitmap) and index pages
+
+  @retval LSN to flush up to (the LSN stored at the start of the page)
+*/
+
+TRANSLOG_ADDRESS
+maria_page_get_lsn(uchar *page,
+                   pgcache_page_no_t page_no __attribute__((unused)),
+                   uchar* data_ptr __attribute__((unused)))
+{
+  TRANSLOG_ADDRESS flush_up_to= lsn_korr(page);
+#ifndef DBUG_OFF
+  {
+    /* The callback should only be used for transactional LSN pages */
+    const MARIA_SHARE *share= (MARIA_SHARE*)data_ptr;
+    DBUG_ASSERT(share->page_type == PAGECACHE_LSN_PAGE &&
+                share->now_transactional);
+  }
+#endif
+  return flush_up_to;
+}
+
+
+/**
+  @brief Enable reading of all rows, ignoring versioning
+
+  @param info            Maria handler
+
+  @note
+    This is mainly useful in single user applications, like maria_pack,
+    where we want to be able to read all rows without having to read the
+    transaction id from the control file
+
+  @note
+    Nothing is done for tables that were not created transactional.
+*/
+
+void maria_ignore_trids(MARIA_HA *info)
+{
+  if (!info->s->base.born_transactional)
+    return;
+
+  /* A handler without transaction gets the dummy transaction object */
+  if (info->trn == NULL)
+    _ma_set_trn_for_table(info, &dummy_transaction_object);
+
+  /* Ignore transaction id when row is read */
+  info->trn->min_read_from= ~(TrID) 0;
+}
+
+
+#ifndef DBUG_OFF
+
+/* The following functions are useful to call from debugger */
+
+/*
+  Print the header fields of a page (LSN, page type, directory entry
+  count, first free directory entry, empty space) to stdout, followed by
+  the position of the directory and the directory itself.
+*/
+
+void _ma_print_block_info(uchar *buff)
+{
+  LSN page_lsn= lsn_korr(buff);
+  uint page_type=   (uint) buff[PAGE_TYPE_OFFSET];
+  uint dir_entries= (uint) buff[DIR_COUNT_OFFSET];
+  uint dir_free=    (uint) buff[DIR_FREE_OFFSET];
+  uint empty_space= (uint) uint2korr(buff + EMPTY_SPACE_OFFSET);
+
+  printf("LSN: %lu,0x%lx  type: %u  dir_entries: %u  dir_free: %u  empty_space: %u\n",
+         LSN_IN_PARTS(page_lsn),
+         page_type, dir_entries, dir_free, empty_space);
+  /* The directory is placed just before the page suffix */
+  printf("Start of directory: %lu\n",
+         maria_block_size - PAGE_SUFFIX_SIZE -
+         dir_entries * DIR_ENTRY_SIZE);
+  _ma_print_directory(stdout, buff, maria_block_size);
+}
+#endif
diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h
new file mode 100644
index 00000000000..a5858880dd0
--- /dev/null
+++ b/storage/maria/ma_blockrec.h
@@ -0,0 +1,290 @@
+/* Copyright (C) 2007 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Storage of records in block
+*/
+
+/*
+  Sizes, in bytes, of the fields that make up a block-record page.
+  DIR_ENTRY_SIZE, which PAGE_OVERHEAD_SIZE refers to, is defined further
+  down in this file; that works because a macro body is only expanded
+  where the macro is used.
+*/
+#define LSN_SIZE		7
+#define DIR_COUNT_SIZE		1	/* Stores number of rows on page */
+#define DIR_FREE_SIZE		1	/* Pointer to first free dir entry */
+#define EMPTY_SPACE_SIZE	2	/* Stores empty space on page */
+#define PAGE_TYPE_SIZE		1
+#define PAGE_SUFFIX_SIZE	4	/* Bytes for checksum */
+/* Sum of the five header fields above: 7 + 1 + 1 + 2 + 1 = 12 bytes */
+#define PAGE_HEADER_SIZE	(LSN_SIZE + DIR_COUNT_SIZE + DIR_FREE_SIZE +\
+                                 EMPTY_SPACE_SIZE + PAGE_TYPE_SIZE)
+/* Page header plus one directory entry plus the page suffix */
+#define PAGE_OVERHEAD_SIZE	(PAGE_HEADER_SIZE + DIR_ENTRY_SIZE + \
+                                 PAGE_SUFFIX_SIZE)
+#define BLOCK_RECORD_POINTER_SIZE	6
+
+/* Block size minus the LSN, the page type byte and the page suffix */
+#define FULL_PAGE_SIZE(block_size) ((block_size) - LSN_SIZE - \
+                                    PAGE_TYPE_SIZE - PAGE_SUFFIX_SIZE)
+
+/* Sizes, in bytes, related to row extents; ROW_EXTENT_SIZE is 5 + 2 = 7 */
+#define ROW_EXTENT_PAGE_SIZE	5
+#define ROW_EXTENT_COUNT_SIZE   2
+#define SUB_RANGE_SIZE		2
+#define BLOCK_FILLER_SIZE	2
+#define ROW_EXTENT_SIZE		(ROW_EXTENT_PAGE_SIZE + ROW_EXTENT_COUNT_SIZE)
+/* The two highest bits of the 16 bit page_count value are used as flags */
+#define TAIL_BIT		0x8000	/* Bit in page_count to signify tail */
+#define START_EXTENT_BIT	0x4000	/* Bit in page_count to signify start*/
+/* page_count set by bitmap code for tail pages */
+#define TAIL_PAGE_COUNT_MARKER  0xffff
+/* Number of extents reserved MARIA_BITMAP_BLOCKS to store head part */
+#define ELEMENTS_RESERVED_FOR_MAIN_PART 4
+/*
+  This is just used to prealloc a dynamic array.
+  The expansion is parenthesized so that it is always evaluated as one
+  value; without the parentheses 'x / AVERAGE_BLOB_SIZE' would expand to
+  '(x / 1024L) * 1024L'.
+*/
+#define AVERAGE_BLOB_SIZE      (1024L*1024L)
+/* Number of pages to store continuous blob parts */
+#define BLOB_SEGMENT_MIN_SIZE 128
+
+/* Fields before 'row->null_field_lengths' used by find_where_to_split_row */
+#define EXTRA_LENGTH_FIELDS		3
+
+/* Size for the different parts in the row header (and head page) */
+#define FLAG_SIZE		1
+#define VERPTR_SIZE		7
+#define DIR_ENTRY_SIZE		4
+#define FIELD_OFFSET_SIZE	2      /* size of pointers to field starts */
+
+/*
+  Minimum header size needed for a new row.
+  TRANSID_SIZE is not defined in this header; it has to be provided by a
+  header included before this one.
+*/
+#define BASE_ROW_HEADER_SIZE FLAG_SIZE
+#define TRANS_ROW_EXTRA_HEADER_SIZE TRANSID_SIZE
+
+/*
+  Page types. PAGE_TYPE_MASK covers the three lowest bits, which is enough
+  for all en_page_type values; presumably it is used to strip flag bits such
+  as PAGE_CAN_BE_COMPACTED from the page type byte (usage is not in this
+  header).
+*/
+#define PAGE_TYPE_MASK 7
+enum en_page_type { UNALLOCATED_PAGE, HEAD_PAGE, TAIL_PAGE, BLOB_PAGE, MAX_PAGE_TYPE };
+#define PAGE_CAN_BE_COMPACTED   128             /* Bit in PAGE_TYPE */
+
+/*
+  Byte offsets of the page header fields. With the sizes defined in this
+  file they evaluate to 7, 8, 9 and 10: the page type byte directly follows
+  the LSN and is followed by the directory count, the directory free
+  pointer and the empty space field.
+*/
+#define PAGE_TYPE_OFFSET        LSN_SIZE
+#define DIR_COUNT_OFFSET        (LSN_SIZE+PAGE_TYPE_SIZE)
+#define DIR_FREE_OFFSET         (DIR_COUNT_OFFSET+DIR_COUNT_SIZE)
+#define EMPTY_SPACE_OFFSET      (DIR_FREE_OFFSET+DIR_FREE_SIZE)
+
+/* Bits used for flag uchar (one byte, first in record) */
+#define ROW_FLAG_TRANSID                1
+#define ROW_FLAG_VER_PTR                2
+#define ROW_FLAG_DELETE_TRANSID         4
+#define ROW_FLAG_NULLS_EXTENDED         8
+#define ROW_FLAG_EXTENTS                128
+/* All of the flag bits above or'ed together */
+#define ROW_FLAG_ALL			(1+2+4+8+128)
+
+/******** Variables that affect how data pages are utilized ********/
+
+/* Minimum size of tail segment */
+#define MIN_TAIL_SIZE           32
+
+/*
+  Fixed length part of Max possible header size; See row data structure
+  table in ma_blockrec.c.
+*/
+#define MAX_FIXED_HEADER_SIZE (FLAG_SIZE + 3 + ROW_EXTENT_SIZE + 3)
+#define TRANS_MAX_FIXED_HEADER_SIZE (MAX_FIXED_HEADER_SIZE + \
+                                     TRANSID_SIZE + VERPTR_SIZE + \
+                                     TRANSID_SIZE)
+
+/* We use 1 uchar in record header to store number of directory entries */
+#define MAX_ROWS_PER_PAGE	255
+#define END_OF_DIR_FREE_LIST	((uchar) 255)
+
+/* Bits for MARIA_BITMAP_BLOCKS->used */
+/* We stored data on disk in the block */
+#define BLOCKUSED_USED		 1
+/* Bitmap on disk is block->org_bitmap_value ; Happens only on update */
+#define BLOCKUSED_USE_ORG_BITMAP 2
+/* We stored tail data on disk for the block */
+#define BLOCKUSED_TAIL		 4
+
+/******* defines that affect allocation (density) of data *******/
+
+/*
+  If the tail part (from the main block or a blob) would use more than 75 % of
+  the size of page, store the tail on a full page instead of a shared
+  tail page.
+*/
+#define MAX_TAIL_SIZE(block_size) ((block_size) *3 / 4)
+
+/* Don't allocate memory for too many row extents on the stack */
+#define ROW_EXTENTS_ON_STACK	32
+
+/* Functions to convert MARIA_RECORD_POS to/from page:offset */
+
+/*
+  Combine a page number and a directory entry number into one record
+  position: the entry goes into the lowest 8 bits, the page number above.
+*/
+static inline MARIA_RECORD_POS ma_recordpos(pgcache_page_no_t page,
+                                            uint dir_entry)
+{
+  ulonglong shifted_page;
+  DBUG_ASSERT(dir_entry <= 255);
+  DBUG_ASSERT(page > 0); /* page 0 is bitmap, not data page */
+  shifted_page= ((ulonglong) page) << 8;
+  return (MARIA_RECORD_POS) (shifted_page | dir_entry);
+}
+
+/* Return the page number part of a record position (bits 8 and up) */
+static inline pgcache_page_no_t ma_recordpos_to_page(MARIA_RECORD_POS record_pos)
+{
+  MARIA_RECORD_POS page_bits= record_pos >> 8;
+  return (pgcache_page_no_t) page_bits;
+}
+
+/* Return the directory entry part of a record position (lowest 8 bits) */
+static inline uint ma_recordpos_to_dir_entry(MARIA_RECORD_POS record_pos)
+{
+  MARIA_RECORD_POS entry_bits= record_pos & 0xff;
+  return (uint) entry_bits;
+}
+
+/*
+  Return the address of directory entry 'pos' in the page 'buff'.
+  Entry 0 is the one closest to the page suffix; each following entry is
+  DIR_ENTRY_SIZE bytes lower in memory.
+*/
+static inline uchar *dir_entry_pos(uchar *buff, uint block_size, uint pos)
+{
+  uchar *entry_zero= buff + block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+  return entry_zero - DIR_ENTRY_SIZE * pos;
+}
+
+/* ma_blockrec.c */
+/*
+  Set-up and tear-down. The "once" variants work on the MARIA_SHARE, the
+  others on one MARIA_HA handler.
+*/
+void _ma_init_block_record_data(void);
+my_bool _ma_once_init_block_record(MARIA_SHARE *share, File dfile);
+my_bool _ma_once_end_block_record(MARIA_SHARE *share);
+my_bool _ma_init_block_record(MARIA_HA *info);
+void _ma_end_block_record(MARIA_HA *info);
+
+/* Row update, delete, read, scan and unique-compare */
+my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+                                const uchar *oldrec, const uchar *newrec);
+my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record);
+int     _ma_read_block_record(MARIA_HA *info, uchar *record,
+                              MARIA_RECORD_POS record_pos);
+int _ma_read_block_record2(MARIA_HA *info, uchar *record,
+                           uchar *data, uchar *end_of_data);
+int     _ma_scan_block_record(MARIA_HA *info, uchar *record,
+                              MARIA_RECORD_POS, my_bool);
+my_bool _ma_cmp_block_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                             const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_scan_init_block_record(MARIA_HA *info);
+void _ma_scan_end_block_record(MARIA_HA *info);
+int _ma_scan_remember_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS *lastpos);
+void _ma_scan_restore_block_record(MARIA_HA *info,
+                                   MARIA_RECORD_POS lastpos);
+
+/* Row write (init / write / abort), compare and page maintenance */
+MARIA_RECORD_POS _ma_write_init_block_record(MARIA_HA *info,
+                                             const uchar *record);
+my_bool _ma_write_block_record(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_block_record(MARIA_HA *info);
+my_bool _ma_compare_block_record(register MARIA_HA *info,
+                                 register const uchar *record);
+void    _ma_compact_block_page(uchar *buff, uint block_size, uint rownr,
+                               my_bool extend_block, TrID min_read_from,
+                               uint min_row_length);
+my_bool enough_free_entries_on_page(MARIA_SHARE *share, uchar *page_buff);
+/* Page cache callback returning the LSN stored at the start of 'page' */
+TRANSLOG_ADDRESS
+maria_page_get_lsn(uchar *page, pgcache_page_no_t page_no, uchar* data_ptr);
+
+/* ma_bitmap.c */
+/* Bitmap life cycle and flushing; all of these work on the MARIA_SHARE */
+my_bool _ma_bitmap_init(MARIA_SHARE *share, File file);
+my_bool _ma_bitmap_end(MARIA_SHARE *share);
+my_bool _ma_bitmap_flush(MARIA_SHARE *share);
+my_bool _ma_bitmap_flush_all(MARIA_SHARE *share);
+void _ma_bitmap_reset_cache(MARIA_SHARE *share);
+/* Finding, reserving and releasing space for rows */
+my_bool _ma_bitmap_find_place(MARIA_HA *info, MARIA_ROW *row,
+                              MARIA_BITMAP_BLOCKS *result_blocks);
+my_bool _ma_bitmap_release_unused(MARIA_HA *info, MARIA_BITMAP_BLOCKS *blocks);
+my_bool _ma_bitmap_free_full_pages(MARIA_HA *info, const uchar *extents,
+                                   uint count);
+my_bool _ma_bitmap_set(MARIA_HA *info, pgcache_page_no_t pos, my_bool head,
+                       uint empty_space);
+my_bool _ma_bitmap_reset_full_page_bits(MARIA_HA *info,
+                                        MARIA_FILE_BITMAP *bitmap,
+                                        pgcache_page_no_t page,
+                                        uint page_count);
+my_bool _ma_bitmap_set_full_page_bits(MARIA_HA *info,
+                                      MARIA_FILE_BITMAP *bitmap,
+                                      pgcache_page_no_t page, uint page_count);
+uint _ma_free_size_to_head_pattern(MARIA_FILE_BITMAP *bitmap, uint size);
+my_bool _ma_bitmap_find_new_place(MARIA_HA *info, MARIA_ROW *new_row,
+                                  pgcache_page_no_t page, uint free_size,
+                                  MARIA_BITMAP_BLOCKS *result_blocks);
+/* Consistency checks of bitmap contents against a page of a given type */
+my_bool _ma_check_bitmap_data(MARIA_HA *info,
+                              enum en_page_type page_type,
+                              pgcache_page_no_t page,
+                              uint empty_space, uint *bitmap_pattern);
+my_bool _ma_check_if_right_bitmap_type(MARIA_HA *info,
+                                       enum en_page_type page_type,
+                                       pgcache_page_no_t page,
+                                       uint *bitmap_pattern);
+uint _ma_bitmap_get_page_bits(MARIA_HA *info, MARIA_FILE_BITMAP *bitmap,
+                              pgcache_page_no_t page);
+void _ma_bitmap_delete_all(MARIA_SHARE *share);
+int  _ma_bitmap_create_first(MARIA_SHARE *share);
+void _ma_bitmap_flushable(MARIA_HA *info, int non_flushable_inc);
+void _ma_bitmap_lock(MARIA_SHARE *share);
+void _ma_bitmap_unlock(MARIA_SHARE *share);
+void _ma_bitmap_set_pagecache_callbacks(PAGECACHE_FILE *file,
+                                        MARIA_SHARE *share);
+/* Debug helper; only declared when DBUG is compiled in */
+#ifndef DBUG_OFF
+void _ma_print_bitmap(MARIA_FILE_BITMAP *bitmap, uchar *data,
+                      pgcache_page_no_t page);
+#endif
+
+/*
+  Functions that apply one REDO or UNDO log record to a table; the part of
+  the name after _ma_apply_redo_ / _ma_apply_undo_ names the record kind.
+  All take the table handler and an LSN, most also the record header.
+*/
+uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                            uint page_type,
+                                            my_bool new_page,
+                                            const uchar *header,
+                                            const uchar *data,
+                                            size_t data_length);
+uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn,
+                                           uint page_type,
+                                           const uchar *header);
+uint _ma_apply_redo_free_blocks(MARIA_HA *info, LSN lsn,
+                                const uchar *header);
+uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn,
+                                      const uchar *header);
+/* The last four arguments are output pointers filled in by the function */
+uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, LSN lsn,
+                                     const uchar *header, LSN redo_lsn,
+                                     uint * const number_of_blobs,
+                                     uint * const number_of_ranges,
+                                     pgcache_page_no_t * const first_page,
+                                     pgcache_page_no_t * const last_page);
+my_bool _ma_apply_redo_bitmap_new_page(MARIA_HA *info, LSN lsn,
+                                       const uchar *header);
+my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header);
+my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t length);
+my_bool _ma_apply_undo_row_update(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, size_t length);
+my_bool _ma_apply_undo_bulk_insert(MARIA_HA *info, LSN undo_lsn);
+
+/*
+  Log write hooks. All hooks share one signature: the record type, the
+  transaction, the table handler, a pointer to the LSN and an opaque
+  hook argument.
+*/
+my_bool write_hook_for_redo(enum translog_record_type type,
+                            TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                            void *hook_arg);
+my_bool write_hook_for_undo(enum translog_record_type type,
+                            TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                            void *hook_arg);
+my_bool write_hook_for_redo_delete_all(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_insert(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_delete(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_row_update(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+my_bool write_hook_for_undo_bulk_insert(enum translog_record_type type,
+                                        TRN *trn, MARIA_HA *tbl_info,
+                                        LSN *lsn, void *hook_arg);
+my_bool write_hook_for_file_id(enum translog_record_type type,
+                               TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                               void *hook_arg);
+my_bool write_hook_for_commit(enum translog_record_type type,
+                              TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                              void *hook_arg);
+/*
+  Status callbacks taking an untyped 'param' pointer.
+  NOTE(review): presumably 'param' is the MARIA_HA handler and these are
+  installed as table lock callbacks - confirm against the definitions.
+*/
+void _ma_block_get_status(void *param, my_bool concurrent_insert);
+my_bool _ma_block_start_trans(void* param);
+my_bool _ma_block_start_trans_no_versioning(void *param);
+void _ma_block_update_status(void *param);
+void _ma_block_restore_status(void *param);
+my_bool _ma_block_check_status(void *param);
diff --git a/storage/maria/ma_cache.c b/storage/maria/ma_cache.c
new file mode 100644
index 00000000000..82b5ddd8047
--- /dev/null
+++ b/storage/maria/ma_cache.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Functions for read record caching with maria
+  Used for reading dynamic/compressed records from datafile.
+
+  Can fetch data directly from file (outside cache),
+  if reading a small chunk straight before the cached part (with possible
+  overlap).
+
+  Can be explicitly asked not to use cache (by not setting READING_NEXT in
+  flag) - useful for occasional out-of-cache reads, when the next read is
+  expected to hit the cache again.
+
+  Allows "partial read" errors in the record header (when READING_HEADER flag
+  is set) - unread part is bzero'ed
+
+  Note: out-of-cache reads are enabled for shared IO_CACHE's too,
+  as these reads will be cached by OS cache (and my_pread is always atomic)
+*/
+
+
+#include "maria_def.h"
+
+/*
+  Read 'length' bytes starting at file position 'pos' into 'buff', taking
+  as much as possible from the IO_CACHE buffer.
+
+  The request is served in up to three steps:
+  - the part that lies before info->pos_in_file is read with my_pread()
+  - the part that overlaps the buffered region is copied from the buffer
+  - what remains is read through info->read_function if READING_NEXT is
+    set in 'flag', otherwise directly from the file with my_pread()
+
+  Returns 0 if the request was satisfied and 1 on error. If READING_HEADER
+  is set in 'flag', a short final read is accepted as long as the buffer
+  copy plus the final read delivered at least 3 bytes; the rest of the
+  header is then zero-filled and 0 is returned.
+*/
+my_bool _ma_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos,
+                       size_t length, uint flag)
+{
+  size_t read_length,in_buff_length;
+  my_off_t offset;
+  uchar *in_buff_pos;
+  DBUG_ENTER("_ma_read_cache");
+
+  /* Step 1: data located before the start of the cached region */
+  if (pos < info->pos_in_file)
+  {
+    read_length=length;
+    /* Don't read past the position where the cached region starts */
+    if ((my_off_t) read_length > (my_off_t) (info->pos_in_file-pos))
+      read_length=(uint) (info->pos_in_file-pos);
+    info->seek_not_done=1;
+    if (my_pread(info->file,buff,read_length,pos,MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    if (!(length-=read_length))
+      DBUG_RETURN(0);
+    pos+=read_length;
+    buff+=read_length;
+  }
+  /* Step 2: copy what is available in the cache buffer */
+  if (pos >= info->pos_in_file &&
+      (offset= (my_off_t) (pos - info->pos_in_file)) <
+      (my_off_t) (info->read_end - info->request_pos))
+  {
+    in_buff_pos=info->request_pos+(uint) offset;
+    in_buff_length= min(length,(size_t) (info->read_end-in_buff_pos));
+    memcpy(buff,info->request_pos+(uint) offset,(size_t) in_buff_length);
+    if (!(length-=in_buff_length))
+      DBUG_RETURN(0);
+    pos+=in_buff_length;
+    buff+=in_buff_length;
+  }
+  else
+    in_buff_length=0;
+  /* Step 3: read the remaining bytes */
+  if (flag & READING_NEXT)
+  {
+    /*
+      Read through the cache. If 'pos' is not the position directly after
+      the buffered data, the cache is emptied and repositioned at 'pos'.
+    */
+    if (pos != (info->pos_in_file +
+		(uint) (info->read_end - info->request_pos)))
+    {
+      info->pos_in_file=pos;				/* Force start here */
+      info->read_pos=info->read_end=info->request_pos;	/* Everything used */
+      info->seek_not_done=1;
+    }
+    else
+      info->read_pos=info->read_end;			/* All block used */
+    if (!(*info->read_function)(info,buff,length))
+      DBUG_RETURN(0);
+    /* The read failed; info->error is used as the read length below */
+    read_length=info->error;
+  }
+  else
+  {
+    /* Read outside of the cache; the cache contents are left untouched */
+    info->seek_not_done=1;
+    if ((read_length=my_pread(info->file,buff,length,pos,MYF(0))) == length)
+      DBUG_RETURN(0);
+  }
+  /*
+    Short read or read error. This is only acceptable when reading a
+    header and at least 3 bytes were obtained.
+  */
+  if (!(flag & READING_HEADER) || (int) read_length == -1 ||
+      read_length+in_buff_length < 3)
+  {
+    DBUG_PRINT("error",
+               ("Error %d reading next-multi-part block (Got %d bytes)",
+                my_errno, (int) read_length));
+    if (!my_errno || my_errno == HA_ERR_FILE_TOO_SHORT)
+      my_errno= HA_ERR_WRONG_IN_RECORD;
+    DBUG_RETURN(1);
+  }
+  /* Zero-fill the part of the header that could not be read */
+  bzero(buff+read_length,MARIA_BLOCK_INFO_HEADER_LENGTH - in_buff_length -
+        read_length);
+  DBUG_RETURN(0);
+} /* _ma_read_cache */
diff --git a/storage/maria/ma_changed.c b/storage/maria/ma_changed.c
new file mode 100644
index 00000000000..4d0964581f6
--- /dev/null
+++ b/storage/maria/ma_changed.c
@@ -0,0 +1,33 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Check if somebody has changed table since last check. */
+
+#include "maria_def.h"
+
+       /* Return 0 if table isn't changed */
+
+int maria_is_changed(MARIA_HA *info)
+{
+  int changed;
+  DBUG_ENTER("maria_is_changed");
+
+  /* -1 tells the caller that the table info could not be read */
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(-1);
+  VOID(_ma_writeinfo(info,0));
+
+  /* Report the flag and clear it, so that only new changes are seen next */
+  changed= (int) info->data_changed;
+  info->data_changed= 0;
+
+  DBUG_PRINT("exit",("result: %d",changed));
+  DBUG_RETURN(changed);
+}
diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c
new file mode 100644
index 00000000000..307befab5c7
--- /dev/null
+++ b/storage/maria/ma_check.c
@@ -0,0 +1,6805 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Describe, check and repair of MARIA tables */
+
+/*
+  About checksum calculation.
+
+  There are two types of checksums. Table checksum and row checksum.
+
+  Row checksum is an additional uchar at the end of dynamic length
+  records. It must be calculated if the table is configured for them.
+  Otherwise they must not be used. The variable
+  MYISAM_SHARE::calc_checksum determines if row checksums are used.
+  MI_INFO::checksum is used as temporary storage during row handling.
+  For parallel repair we must assure that only one thread can use this
+  variable. There is no problem on the write side as this is done by one
+  thread only. But when checking a record after read this could go
+  wrong. But since all threads read through a common read buffer, it is
+  sufficient if only one thread checks it.
+
+  Table checksum is an eight uchar value in the header of the index file.
+  It can be calculated even if row checksums are not used. The variable
+  MI_CHECK::glob_crc is calculated over all records.
+  MI_SORT_PARAM::calc_checksum determines if this should be done. This
+  variable is not part of MI_CHECK because it must be set per thread for
+  parallel repair. The global glob_crc must be changed by one thread
+  only. And it is sufficient to calculate the checksum once only.
+*/
+
+#include "ma_ftdefs.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVISE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+/* Functions defined in this file */
+
+/* Checking of index pages and delete links */
+static int check_k_link(HA_CHECK *param, MARIA_HA *info, my_off_t next_link);
+static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+		     MARIA_PAGE *page, ha_rows *keys,
+		     ha_checksum *key_checksum, uint level);
+static uint isam_key_length(MARIA_HA *info,MARIA_KEYDEF *keyinfo);
+static ha_checksum calc_checksum(ha_rows count);
+/* Helpers working on MARIA_SORT_PARAM, plus sort_one_index */
+static int writekeys(MARIA_SORT_PARAM *sort_param);
+static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
+                          MARIA_KEYDEF *keyinfo,
+			  my_off_t pagepos, File new_file);
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key);
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param);
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+                        const void *b);
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+                                   const uchar *a);
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a);
+static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo, const uchar *key);
+static int sort_insert_key(MARIA_SORT_PARAM  *sort_param,
+                           reg1 SORT_KEY_BLOCKS *key_block,
+			   const uchar *key, my_off_t prev_block);
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param);
+/* The next prototype is disabled in the original source */
+/*static int _ma_flush_pending_blocks(HA_CHECK *param);*/
+static SORT_KEY_BLOCKS	*alloc_key_blocks(HA_CHECK *param, uint blocks,
+					  uint buffer_length);
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length);
+/* Handling of data file type, descriptors and state */
+static void set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share);
+static void restore_data_file_type(MARIA_SHARE *share);
+static void change_data_file_descriptor(MARIA_HA *info, File new_file);
+static void unuse_data_file_descriptor(MARIA_HA *info);
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+                                      MARIA_HA *info, uchar *record);
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+                                 MARIA_STATE_INFO *from);
+static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info,
+                                 my_off_t position);
+static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file);
+static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param,
+                                                 MARIA_HA *info);
+static TrID max_trid_in_system(void);
+static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid);
+/* Note: the only declaration in this list that is not static */
+void retry_if_quick(MARIA_SORT_PARAM *param, int error);
+
+
+/* Initialize check param with default values */
+
+void maria_chk_init(HA_CHECK *param)
+{
+  /* Everything not mentioned below starts out as zero */
+  bzero((uchar*) param,sizeof(*param));
+
+  /* Buffer sizes */
+  param->use_buffers= USE_BUFFER_INIT;
+  param->read_buffer_length= READ_BUFFER_INIT;
+  param->write_buffer_length= READ_BUFFER_INIT;
+  param->sort_buffer_length= SORT_BUFFER_INIT;
+  param->sort_key_blocks= BUFFERS_WHEN_SORTING;
+  param->pagecache_block_size= KEY_CACHE_BLOCK_SIZE;
+
+  /* File handling */
+  param->opt_follow_links= 1;
+  param->tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL;
+  param->myf_rw= MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL);
+
+  /* What to check */
+  param->keys_in_use= ~(ulonglong) 0;
+  param->search_after_block= HA_OFFSET_ERROR;
+  param->start_check_pos= 0;
+  param->max_record_length= LONGLONG_MAX;
+  param->auto_increment_value= 0;
+  param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+}
+
+
+/* Initialize check param and maria handler for check of table */
+
+void maria_chk_init_for_check(HA_CHECK *param, MARIA_HA *info)
+{
+  param->max_found_trid= 0;
+  param->not_visible_rows_found= 0;
+
+  /*
+    Set up the transaction handler so that all rows can be seen. When rows
+    are read, the found transaction id is checked against param->max_trid.
+
+    If no max_trid was given and the control file is not initialized,
+    max_trid stays 0, which gives a warning for the first trid found.
+  */
+  if (param->max_trid == 0 && ma_control_file_inited())
+    param->max_trid= max_trid_in_system();
+
+  maria_ignore_trids(info);
+}
+
+
+	/* Check the status flags for the table */
+
+int maria_chk_status(HA_CHECK *param, MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  uint expected_open_count;
+
+  /* At most one warning about the crashed state; most specific one first */
+  if (maria_is_crashed_on_repair(info))
+  {
+    _ma_check_print_warning(param,
+                            "Table is marked as crashed and last repair failed");
+  }
+  else if (maria_in_repair(info))
+  {
+    _ma_check_print_warning(param,
+                            "Last repair was aborted before finishing");
+  }
+  else if (maria_is_crashed(info))
+  {
+    _ma_check_print_warning(param, "Table is marked as crashed");
+  }
+
+  /* The open count should be 1 if we have changed the table, otherwise 0 */
+  expected_open_count= (uint) (share->global_changed ? 1 : 0);
+  if (share->state.open_count != expected_open_count)
+  {
+    /* Don't count this as a real warning, as check can correct this ! */
+    uint saved_warning_printed= param->warning_printed;
+
+    if (share->state.open_count == 1)
+      _ma_check_print_warning(param,
+                              "%d client is using or hasn't closed the table properly",
+                              share->state.open_count);
+    else
+      _ma_check_print_warning(param,
+                              "%d clients are using or haven't closed the table properly",
+                              share->state.open_count);
+
+    /* If this will be fixed by the check, forget the warning */
+    if (param->testflag & T_UPDATE_STATE)
+      param->warning_printed= saved_warning_printed;
+  }
+  return 0;
+}
+
+/*
+  Check delete links in row data
+
+  Follows the chain of deleted records that starts at share->state.dellink
+  for at most share->state.state.del links and compares the number of links
+  and the summed up free space with the values stored in the table state.
+
+  param      Check parameters. record_checksum is reset to 0 and, for tables
+             without HA_OPTION_PACK_RECORD, the link positions are added to
+             it. T_RETRY_WITHOUT_QUICK is set in testflag when the chain is
+             found to be corrupted.
+  info       Table handler
+  test_flag  T_SILENT and T_VERBOSE decide what is printed on stdout
+
+  Returns 0 if the chain is ok, or if the table uses BLOCK_RECORD, which has
+  no delete links. A mismatch in the amount of deleted space only gives a
+  warning. Returns 1 on read error, on a corrupted chain, or if the check
+  was killed.
+*/
+
+int maria_chk_del(HA_CHECK *param, register MARIA_HA *info,
+                  ulonglong test_flag)
+{
+  MARIA_SHARE *share= info->s;
+  reg2 ha_rows i;
+  uint delete_link_length;
+  my_off_t empty,next_link,old_link;
+  /* buff is used both for the raw delete link and for llstr() output */
+  char buff[22],buff2[22];
+  DBUG_ENTER("maria_chk_del");
+
+  LINT_INIT(old_link);
+
+  param->record_checksum=0;
+
+  if (share->data_file_type == BLOCK_RECORD)
+    DBUG_RETURN(0);                             /* No delete links here */
+
+  /* Number of bytes to read from the start of each deleted record */
+  delete_link_length=((share->options & HA_OPTION_PACK_RECORD) ? 20 :
+		      share->rec_reflength+1);
+
+  if (!(test_flag & T_SILENT))
+    puts("- check record delete-chain");
+
+  next_link=share->state.dellink;
+  if (share->state.state.del == 0)
+  {
+    if (test_flag & T_VERBOSE)
+    {
+      puts("No recordlinks");
+    }
+  }
+  else
+  {
+    if (test_flag & T_VERBOSE)
+      printf("Recordlinks:    ");
+    empty=0;
+    /* i counts down the links that are still expected in the chain */
+    for (i= share->state.state.del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--)
+    {
+      if (_ma_killed_ptr(param))
+        DBUG_RETURN(1);
+      if (test_flag & T_VERBOSE)
+	printf(" %9s",llstr(next_link,buff));
+      /* A link that points outside of the data file is corrupt */
+      if (next_link >= share->state.state.data_file_length)
+	goto wrong;
+      if (my_pread(info->dfile.file, (uchar*) buff, delete_link_length,
+		   next_link,MYF(MY_NABP)))
+      {
+	if (test_flag & T_VERBOSE) puts("");
+	_ma_check_print_error(param,"Can't read delete-link at filepos: %s",
+		    llstr(next_link,buff));
+	DBUG_RETURN(1);
+      }
+      /* The first byte of a deleted record must be 0 */
+      if (*buff != '\0')
+      {
+	if (test_flag & T_VERBOSE) puts("");
+	_ma_check_print_error(param,"Record at pos: %s is not remove-marked",
+		    llstr(next_link,buff));
+	goto wrong;
+      }
+      if (share->options & HA_OPTION_PACK_RECORD)
+      {
+        /*
+          Layout as read here: block length at offset 1 (3 bytes), next
+          link at offset 4 and previous link at offset 12.
+        */
+	my_off_t prev_link=mi_sizekorr(buff+12);
+	/* 'empty' is still 0 for the first block; it has no previous link */
+	if (empty && prev_link != old_link)
+	{
+	  if (test_flag & T_VERBOSE) puts("");
+	  _ma_check_print_error(param,"Deleted block at %s doesn't point back at previous delete link",llstr(next_link,buff2));
+	  goto wrong;
+	}
+	old_link=next_link;
+	next_link=mi_sizekorr(buff+4);
+	empty+=mi_uint3korr(buff+1);
+      }
+      else
+      {
+        /* The next link is stored directly after the delete marker byte */
+	param->record_checksum+=(ha_checksum) next_link;
+	next_link= _ma_rec_pos(share, (uchar *) buff + 1);
+	empty+=share->base.pack_reclength;
+      }
+    }
+    if (share->state.state.del && (test_flag & T_VERBOSE))
+      puts("\n");
+    /* A wrong amount of deleted space is reported as a warning only */
+    if (empty != share->state.state.empty)
+    {
+      _ma_check_print_warning(param,
+			     "Found %s deleted space in delete link chain. Should be %s",
+			     llstr(empty,buff2),
+			     llstr(share->state.state.empty,buff));
+    }
+    /* The chain continues after the expected number of links */
+    if (next_link != HA_OFFSET_ERROR)
+    {
+      _ma_check_print_error(param,
+			   "Found more than the expected %s deleted rows in delete link chain",
+			   llstr(share->state.state.del, buff));
+      goto wrong;
+    }
+    /* The chain ended before the expected number of links was found */
+    if (i != 0)
+    {
+      _ma_check_print_error(param,
+			   "Found %s deleted rows in delete link chain. Should be %s",
+			   llstr(share->state.state.del - i, buff2),
+			   llstr(share->state.state.del, buff));
+      goto wrong;
+    }
+  }
+  DBUG_RETURN(0);
+
+wrong:
+  /* Common exit for a corrupted chain */
+  param->testflag|=T_RETRY_WITHOUT_QUICK;
+  if (test_flag & T_VERBOSE)
+    puts("");
+  _ma_check_print_error(param,"record delete-link-chain corrupted");
+  DBUG_RETURN(1);
+} /* maria_chk_del */
+
+
+/* Check delete links in index file */
+
+static int check_k_link(HA_CHECK *param, register MARIA_HA *info,
+                        my_off_t next_link)
+{
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  ha_rows blocks_left;
+  char llbuff[21], llbuff2[21];
+  uchar *page_buff;
+  DBUG_ENTER("check_k_link");
+
+  if (next_link == HA_OFFSET_ERROR)
+    DBUG_RETURN(0);                             /* Avoid printing empty line */
+
+  /*
+    Follow the chain of deleted key blocks. The number of blocks in the key
+    file is used as an upper limit for the number of links followed.
+  */
+  for (blocks_left= (ha_rows) (share->state.state.key_file_length /
+                               block_size);
+       next_link != HA_OFFSET_ERROR && blocks_left > 0;
+       blocks_left--)
+  {
+    if (_ma_killed_ptr(param))
+      DBUG_RETURN(1);
+    if (param->testflag & T_VERBOSE)
+      printf("%16s",llstr(next_link,llbuff));
+
+    /* The whole key block must lie within the key file */
+    if (next_link + block_size > share->state.state.key_file_length)
+    {
+      /* purecov: begin tested */
+      _ma_check_print_error(param, "Invalid key block position: %s  "
+                            "key block size: %u  file_length: %s",
+                            llstr(next_link, llbuff), block_size,
+                            llstr(share->state.state.key_file_length, llbuff2));
+      DBUG_RETURN(1);
+      /* purecov: end */
+    }
+
+    /* The key block must start at a multiple of block_size */
+    if (next_link & (block_size -1))
+    {
+      /* purecov: begin tested */
+      _ma_check_print_error(param, "Mis-aligned key block: %s  "
+                            "minimum key block length: %u",
+                            llstr(next_link, llbuff),
+                            block_size);
+      DBUG_RETURN(1);
+      /* purecov: end */
+    }
+
+    DBUG_ASSERT(share->pagecache->block_size == block_size);
+    page_buff= pagecache_read(share->pagecache,
+                              &share->kfile,
+                              (pgcache_page_no_t) (next_link / block_size),
+                              DFLT_INIT_HITS,
+                              info->buff, PAGECACHE_READ_UNKNOWN_PAGE,
+                              PAGECACHE_LOCK_LEFT_UNLOCKED, 0);
+    if (!page_buff)
+    {
+      /* purecov: begin tested */
+      _ma_check_print_error(param, "key cache read error for block: %s",
+                            llstr(next_link,llbuff));
+      DBUG_RETURN(1);
+      /* purecov: end */
+    }
+    /* A page that is not delete marked is reported, but the walk goes on */
+    if (_ma_get_keynr(share, page_buff) != MARIA_DELETE_KEY_NR)
+      _ma_check_print_error(param, "Page at %s is not delete marked",
+                            llstr(next_link, llbuff));
+
+    /* The link to the next deleted block follows the key page header */
+    next_link= mi_sizekorr(page_buff + share->keypage_header);
+    param->key_file_blocks+= block_size;
+  }
+
+  if (param->testflag & T_VERBOSE)
+  {
+    if (next_link == HA_OFFSET_ERROR)
+      puts("");
+    else
+      printf("%16s\n",llstr(next_link,llbuff));
+  }
+  /* Error if the chain did not end within the allowed number of links */
+  DBUG_RETURN (next_link != HA_OFFSET_ERROR);
+} /* check_k_link */
+
+
+/*
+  Check that the index and data files have the sizes the table state claims
+
+  SYNOPSIS
+    maria_chk_size()
+    param	Check parameters; testflag controls how much is printed
+    info	Maria handler
+
+  NOTES
+    Both files are flushed first so that the physical sizes can be trusted.
+
+    Index file: it is an error if the state says the file is longer than
+    it physically is while at least one key is active. Any other
+    difference only gives a warning (unless T_VERY_SILENT).
+
+    Data file: for compressed records MEMMAP_EXTRA_MARGIN is added to the
+    expected length. It is an error if the expected length is bigger than
+    the physical length and the difference is not exactly
+    MEMMAP_EXTRA_MARGIN. In that case state.data_file_length is set to the
+    physical length and T_RETRY_WITHOUT_QUICK is set. Any other difference
+    only gives a warning.
+
+    A warning is also given when a file of a non-compressed table has used
+    more than 90 % of its allowed length.
+
+  RETURN
+    0	Sizes are acceptable (warnings may have been printed)
+    #	The flush failed or one of the files is shorter than expected
+*/
+
+int maria_chk_size(HA_CHECK *param, register MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  int error;
+  register my_off_t skr,size;
+  char buff[22],buff2[22];
+  DBUG_ENTER("maria_chk_size");
+
+  if (!(param->testflag & T_SILENT))
+    puts("- check file-size");
+
+  /*
+    The following is needed if called externally (not from maria_chk).
+    To get a correct physical size we need to flush them.
+  */
+  if ((error= _ma_flush_table_files(info,
+                                    MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                    FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE)))
+    _ma_check_print_error(param, "Failed to flush data or index file");
+
+  /* size is the physical length, skr the length stored in the state */
+  size= my_seek(share->kfile.file, 0L, MY_SEEK_END, MYF(MY_THREADSAFE));
+  if ((skr=(my_off_t) share->state.state.key_file_length) != size)
+  {
+    /* Don't give error if file generated by mariapack */
+    if (skr > size && maria_is_any_key_active(share->state.key_map))
+    {
+      error=1;
+      _ma_check_print_error(param,
+                            "Size of indexfile is: %-8s        Should be: %s",
+                            llstr(size,buff), llstr(skr,buff2));
+    }
+    else if (!(param->testflag & T_VERY_SILENT))
+      _ma_check_print_warning(param,
+                              "Size of indexfile is: %-8s      Should be: %s",
+                              llstr(size,buff), llstr(skr,buff2));
+  }
+  if (!(param->testflag & T_VERY_SILENT) &&
+      ! (share->options & HA_OPTION_COMPRESS_RECORD) &&
+      ulonglong2double(share->state.state.key_file_length) >
+      ulonglong2double(share->base.margin_key_file_length)*0.9)
+  {
+    /*
+      Each number needs its own buffer: llstr() writes into the buffer it
+      is given and both strings are read when the message is formatted.
+    */
+    _ma_check_print_warning(param,"Keyfile is almost full, %10s of %10s used",
+                            llstr(share->state.state.key_file_length,buff),
+                            llstr(share->base.max_key_file_length-1,buff2));
+  }
+
+  size= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+  skr=(my_off_t) share->state.state.data_file_length;
+  if (share->options & HA_OPTION_COMPRESS_RECORD)
+    skr+= MEMMAP_EXTRA_MARGIN;
+#ifdef USE_RELOC
+  if (share->data_file_type == STATIC_RECORD &&
+      skr < (my_off_t) share->base.reloc*share->base.min_pack_length)
+    skr=(my_off_t) share->base.reloc*share->base.min_pack_length;
+#endif
+  if (skr != size)
+  {
+    if (skr > size && skr != size + MEMMAP_EXTRA_MARGIN)
+    {
+      share->state.state.data_file_length=size;	/* Skip other errors */
+      error=1;
+      _ma_check_print_error(param,"Size of datafile is: %-9s         Should be: %s",
+                            llstr(size,buff), llstr(skr,buff2));
+      param->testflag|=T_RETRY_WITHOUT_QUICK;
+    }
+    else
+    {
+      _ma_check_print_warning(param,
+                              "Size of datafile is: %-9s       Should be: %s",
+                              llstr(size,buff), llstr(skr,buff2));
+    }
+  }
+  if (!(param->testflag & T_VERY_SILENT) &&
+      !(share->options & HA_OPTION_COMPRESS_RECORD) &&
+      ulonglong2double(share->state.state.data_file_length) >
+      (ulonglong2double(share->base.max_data_file_length)*0.9))
+    _ma_check_print_warning(param, "Datafile is almost full, %10s of %10s used",
+                            llstr(share->state.state.data_file_length,buff),
+                            llstr(share->base.max_data_file_length-1,buff2));
+  DBUG_RETURN(error);
+} /* maria_chk_size */
+
+
+/*
+  Check the key delete chain and every active index of a table
+
+  SYNOPSIS
+    maria_chk_key()
+    param	Check parameters. Updated here: key_file_blocks, key_crc[],
+		record_checksum, new_rec_per_key_part, unique_count,
+		notnull_count and the per key counters used by chk_index()
+    info	Maria handler
+
+  NOTES
+    The function first verifies the chain of deleted key blocks with
+    check_k_link() and then walks each active index with chk_index().
+    For each ordinary (not fulltext/spatial/rtree) key the number of found
+    keys is compared with state.records and the checksum of the row
+    positions is compared between the keys.
+    For the auto_increment key the stored auto_increment value is compared
+    with the value found through the key.
+
+    With T_INFO set most errors do not abort the check; the function goes
+    on with the next key and remembers the failure in 'result'.
+
+  RETURN
+    0	ok
+    -1	Error found. NOTE(review): 'result' is declared as uint, so the -1
+	stored in it relies on the conversion back to int at return.
+*/
+
+int maria_chk_key(HA_CHECK *param, register MARIA_HA *info)
+{
+  uint key,found_keys=0,full_text_keys=0,result=0;
+  ha_rows keys;
+  ha_checksum old_record_checksum,init_checksum;
+  my_off_t all_keydata,all_totaldata,key_totlength,length;
+  double  *rec_per_key_part;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  char buff[22],buff2[22];
+  MARIA_PAGE page;
+  DBUG_ENTER("maria_chk_key");
+
+  if (!(param->testflag & T_SILENT))
+    puts("- check key delete-chain");
+
+  /*
+    key_file_blocks sums the bytes of the index file that are accounted
+    for; it is compared with state.key_file_length at the end.
+  */
+  param->key_file_blocks=share->base.keystart;
+  if (check_k_link(param, info, share->state.key_del))
+  {
+    if (param->testflag & T_VERBOSE) puts("");
+    _ma_check_print_error(param,"key delete-link-chain corrupted");
+    DBUG_RETURN(-1);
+  }
+
+  if (!(param->testflag & T_SILENT))
+    puts("- check index reference");
+
+  all_keydata=all_totaldata=key_totlength=0;
+  init_checksum=param->record_checksum;
+  old_record_checksum=0;
+  /*
+    For static records the expected checksum is computed up front from the
+    number of rows (presumably the sum of all row start positions, as rows
+    have fixed length -- TODO confirm).
+  */
+  if (share->data_file_type == STATIC_RECORD)
+    old_record_checksum= (calc_checksum(share->state.state.records +
+                                        share->state.state.del-1) *
+                          share->base.pack_reclength);
+  rec_per_key_part= param->new_rec_per_key_part;
+  /* rec_per_key_part advances by the number of key segments of each key */
+  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+       rec_per_key_part+=keyinfo->keysegs, key++, keyinfo++)
+  {
+    param->key_crc[key]=0;
+    if (! maria_is_key_active(share->state.key_map, key))
+    {
+      /* Remember old statistics for key */
+      memcpy((char*) rec_per_key_part,
+	     (char*) (share->state.rec_per_key_part +
+		      (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+	     keyinfo->keysegs*sizeof(*rec_per_key_part));
+      continue;
+    }
+    found_keys++;
+
+    /* Every key starts from the same checksum so the sums are comparable */
+    param->record_checksum=init_checksum;
+
+    bzero((char*) &param->unique_count,sizeof(param->unique_count));
+    bzero((char*) &param->notnull_count,sizeof(param->notnull_count));
+
+    if ((!(param->testflag & T_SILENT)))
+      printf ("- check data record references index: %d\n",key+1);
+    if (keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL))
+      full_text_keys++;
+    if (share->state.key_root[key] == HA_OFFSET_ERROR)
+    {
+      /* No root page: only an error if there are rows and not fulltext */
+      if (share->state.state.records != 0 && !(keyinfo->flag & HA_FULLTEXT))
+        _ma_check_print_error(param, "Key tree %u is empty", key + 1);
+      goto do_stat;
+    }
+    if (_ma_fetch_keypage(&page, info, keyinfo, share->state.key_root[key],
+                          PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS,
+                          info->buff, 0))
+    {
+      report_keypage_fault(param, info, share->state.key_root[key]);
+      if (!(param->testflag & T_INFO))
+	DBUG_RETURN(-1);
+      result= -1;
+      continue;
+    }
+    param->key_file_blocks+=keyinfo->block_length;
+    /* Reset the per key counters that chk_index() accumulates into */
+    keys=0;
+    param->keydata=param->totaldata=0;
+    param->key_blocks=0;
+    param->max_level=0;
+    if (chk_index(param, info,keyinfo, &page, &keys, param->key_crc+key,1))
+      DBUG_RETURN(-1);
+    if (!(keyinfo->flag & (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX)))
+    {
+      if (keys != share->state.state.records)
+      {
+	_ma_check_print_error(param,"Found %s keys of %s",llstr(keys,buff),
+		    llstr(share->state.state.records,buff2));
+	if (!(param->testflag & T_INFO))
+	DBUG_RETURN(-1);
+	result= -1;
+	continue;
+      }
+      /*
+        The first ordinary key of a table without static records (or any
+        key when T_DONT_CHECK_CHECKSUM is set) defines the reference
+        checksum; otherwise the checksum must match the reference.
+      */
+      if ((found_keys - full_text_keys == 1 &&
+           !(share->data_file_type == STATIC_RECORD)) ||
+          (param->testflag & T_DONT_CHECK_CHECKSUM))
+	old_record_checksum= param->record_checksum;
+      else if (old_record_checksum != param->record_checksum)
+      {
+	if (key)
+	  _ma_check_print_error(param,
+                                "Key %u doesn't point at same records as "
+                                "key 1",
+		      key+1);
+	else
+	  _ma_check_print_error(param,"Key 1 doesn't point at all records");
+	if (!(param->testflag & T_INFO))
+	  DBUG_RETURN(-1);
+	result= -1;
+	continue;
+      }
+    }
+    if ((uint) share->base.auto_key -1 == key)
+    {
+      /* Check that auto_increment key is bigger than max key value */
+      ulonglong auto_increment;
+      const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+      info->lastinx=key;
+      /*
+        Presumably this reads the row for the last key seen by chk_index(),
+        i.e. the biggest key -- TODO confirm against _ma_read_key_record().
+      */
+      _ma_read_key_record(info, info->rec_buff, 0);
+      auto_increment=
+        ma_retrieve_auto_increment(info->rec_buff + keyseg->start,
+                                   keyseg->type);
+      if (auto_increment > share->state.auto_increment)
+      {
+	_ma_check_print_warning(param, "Auto-increment value: %s is smaller "
+                                "than max used value: %s",
+                                llstr(share->state.auto_increment,buff2),
+                                llstr(auto_increment, buff));
+      }
+      if (param->testflag & T_AUTO_INC)
+      {
+        /* Raise the stored value to the found and the requested value */
+        set_if_bigger(share->state.auto_increment,
+                      auto_increment);
+        set_if_bigger(share->state.auto_increment,
+                      param->auto_increment_value);
+      }
+
+      /* Check that there isn't a row with auto_increment = 0 in the table */
+      maria_extra(info,HA_EXTRA_KEYREAD,0);
+      bzero(info->lastkey_buff, keyinfo->seg->length);
+      if (!maria_rkey(info, info->rec_buff, key,
+                      info->lastkey_buff,
+                      (key_part_map) 1, HA_READ_KEY_EXACT))
+      {
+	/* Don't count this as a real warning, as maria_chk can't correct it */
+	uint save=param->warning_printed;
+	_ma_check_print_warning(param, "Found row where the auto_increment "
+                                "column has the value 0");
+	param->warning_printed=save;
+      }
+      maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+    }
+
+    /*
+      'length' is the space the keys would use with the unpacked key length
+      from isam_key_length(); it is the base for the "Packed" percentage.
+    */
+    length=(my_off_t) isam_key_length(info,keyinfo)*keys + param->key_blocks*2;
+    if (param->testflag & T_INFO && param->totaldata != 0L && keys != 0L)
+      printf("Key: %2d:  Keyblocks used: %3d%%  Packed: %4d%%  Max levels: %2d\n",
+	     key+1,
+	     (int) (my_off_t2double(param->keydata)*100.0/my_off_t2double(param->totaldata)),
+	     (int) ((my_off_t2double(length) - my_off_t2double(param->keydata))*100.0/
+		    my_off_t2double(length)),
+	     param->max_level);
+    all_keydata+=param->keydata; all_totaldata+=param->totaldata; key_totlength+=length;
+
+do_stat:
+    /* Also reached directly for a key without a root page */
+    if (param->testflag & T_STATISTICS)
+      maria_update_key_parts(keyinfo, rec_per_key_part, param->unique_count,
+                       param->stats_method == MI_STATS_METHOD_IGNORE_NULLS?
+                       param->notnull_count: NULL,
+                       (ulonglong)share->state.state.records);
+  }
+  if (param->testflag & T_INFO)
+  {
+    if (all_totaldata != 0L && found_keys > 0)
+      printf("Total:    Keyblocks used: %3d%%  Packed: %4d%%\n\n",
+	     (int) (my_off_t2double(all_keydata)*100.0/
+		    my_off_t2double(all_totaldata)),
+	     (int) ((my_off_t2double(key_totlength) -
+		     my_off_t2double(all_keydata))*100.0/
+		     my_off_t2double(key_totlength)));
+    else if (all_totaldata != 0L && maria_is_any_key_active(share->state.key_map))
+      puts("");
+  }
+  /* Only warn about unreferenced blocks when every key was checked */
+  if (param->key_file_blocks != share->state.state.key_file_length &&
+      share->state.key_map == ~(ulonglong) 0)
+    _ma_check_print_warning(param, "Some data are unreferenced in keyfile");
+  if (found_keys != full_text_keys)
+    param->record_checksum=old_record_checksum-init_checksum;	/* Remove delete links */
+  else
+    param->record_checksum=0;
+  DBUG_RETURN(result);
+} /* maria_chk_key */
+
+
+
+/*
+  Validate the position of one index page, read it and check its content
+
+  SYNOPSIS
+    chk_index_down()
+    param         Check parameters; key_file_blocks is increased by the
+                  block length when the page could be read
+    info          Maria handler
+    keyinfo       Key definition the page belongs to
+    page          File position of the page
+    buff          Buffer to read the page into
+    keys          Passed on to chk_index()
+    key_checksum  Passed on to chk_index()
+    level         Passed on to chk_index()
+
+  NOTES
+    A page that ends after state.key_file_length is always reported. If it
+    still fits in the physical file, state.key_file_length is corrected to
+    the physical length (rounded down to a multiple of the block length)
+    and the check goes on.
+
+  RETURN
+    0  ok
+    1  Page is outside the file, mis-aligned, unreadable or chk_index()
+       failed
+*/
+
+static int chk_index_down(HA_CHECK *param, MARIA_HA *info,
+                          MARIA_KEYDEF *keyinfo,
+                          my_off_t page, uchar *buff, ha_rows *keys,
+                          ha_checksum *key_checksum, uint level)
+{
+  char page_txt[22], length_txt[22];
+  MARIA_SHARE *share= info->s;
+  MARIA_PAGE child_page;
+  DBUG_ENTER("chk_index_down");
+
+  /* The whole block has to be inside the key file */
+  if (page + keyinfo->block_length > share->state.state.key_file_length)
+  {
+    /* purecov: begin tested */
+    /* The physical file may be longer than what the state remembers */
+    my_off_t physical_length= my_seek(share->kfile.file, 0L, MY_SEEK_END,
+                                      MYF(MY_THREADSAFE));
+    _ma_check_print_error(param, "Invalid key block position: %s  "
+                          "key block size: %u  file_length: %s",
+                          llstr(page, page_txt), keyinfo->block_length,
+                          llstr(share->state.state.key_file_length,
+                                length_txt));
+    if (page + keyinfo->block_length > physical_length)
+      DBUG_RETURN(1);
+    /* Remember the real length, cut down to a whole number of blocks */
+    share->state.state.key_file_length=
+      (physical_length & ~ (my_off_t) (keyinfo->block_length - 1));
+    /* purecov: end */
+  }
+
+  /* The block has to start on a block_size boundary */
+  if (page & (share->block_size -1))
+  {
+    /* purecov: begin tested */
+    _ma_check_print_error(param, "Mis-aligned key block: %s  "
+                          "key block length: %u",
+                          llstr(page, page_txt), share->block_size);
+    DBUG_RETURN(1);
+    /* purecov: end */
+  }
+
+  if (_ma_fetch_keypage(&child_page, info, keyinfo, page,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        DFLT_INIT_HITS, buff, 0))
+  {
+    report_keypage_fault(param, info, page);
+    DBUG_RETURN(1);
+  }
+  param->key_file_blocks+= keyinfo->block_length;
+
+  if (chk_index(param, info, keyinfo, &child_page, keys, key_checksum, level))
+    DBUG_RETURN(1);                             /* purecov: tested */
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  "Ignore NULLs" statistics collection: handle the first tuple of an index
+
+  SYNOPSIS
+    maria_collect_stats_nonulls_first()
+      keyseg   IN     Array of key part descriptions
+      notnull  INOUT  Array of counters; notnull[i] is the number of
+                      {keypart1...keypart_i} tuples without NULLs
+      key      IN     Key values tuple
+
+  DESCRIPTION
+    ha_find_null() gives the key part where the first NULL is found. All
+    prefixes that end before that key part are free of NULLs, so their
+    counters are incremented; the remaining counters are left alone.
+*/
+
+static
+void maria_collect_stats_nonulls_first(HA_KEYSEG *keyseg, ulonglong *notnull,
+                                       const uchar *key)
+{
+  /* Number of leading key parts without a NULL value */
+  uint notnull_parts= ha_find_null(keyseg, key) - keyseg;
+  uint part;
+
+  for (part= 0; part != notnull_parts; part++)
+    notnull[part]+= 1;
+}
+
+
+/*
+  "Ignore NULLs" statistics collection: handle a following index tuple
+
+  SYNOPSIS
+    maria_collect_stats_nonulls_next()
+      keyseg   IN     Array of key part descriptions
+      notnull  INOUT  Array of counters; notnull[i] is the number of
+                      {keypart1...keypart_i} tuples without NULLs
+      prev_key IN     Previous key values tuple
+      last_key IN     Next key values tuple
+
+  DESCRIPTION
+    Two things are done for the new tuple:
+    1. The counters in notnull[] are incremented for every prefix of
+       last_key that holds no NULL.
+    2. The first key part where prev_key and last_key differ (A), or where
+       last_key is NULL (B), is found and returned so that the caller can
+       count unique tuples per key prefix. Case (B) should not really be
+       counted; maria_update_key_parts() compensates for it.
+
+  RETURN
+    1 + number of first keypart where values differ or last_key tuple has NULL
+*/
+
+static
+int maria_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull,
+                                     const uchar *prev_key,
+                                     const uchar *last_key)
+{
+  /*
+    Filled in by ha_key_cmp():
+    cmp_result[0]  1 + number of the first key part that differs or is NULL
+    cmp_result[1]  offset in last_key of the value of that key part
+  */
+  uint cmp_result[2];
+  uint notnull_parts, part;
+
+  ha_key_cmp(keyseg, prev_key, last_key, USE_WHOLE_KEY,
+             SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, cmp_result);
+
+  /*
+    Key parts before the found one hold no NULL, so the search for the
+    first NULL in last_key can start at the found key part.
+  */
+  notnull_parts= ha_find_null(keyseg + cmp_result[0] - 1,
+                              last_key + cmp_result[1]) - keyseg;
+  for (part= 0; part != notnull_parts; part++)
+    notnull[part]+= 1;
+
+  /* Returned even if the difference was a NULL; see DESCRIPTION */
+  return cmp_result[0];
+}
+
+
+/*
+  Check one index page and, recursively, all pages below it
+
+  SYNOPSIS
+    chk_index()
+    param         Check parameters. Updated here: keydata, totaldata,
+                  key_blocks, max_level, record_checksum and, with
+                  T_STATISTICS, unique_count[] and notnull_count[]
+    info          Maria handler; info->last_key holds the previous key
+    keyinfo       Key definition of the index
+    anc_page      The page to check (already read by the caller)
+    keys          INOUT  Number of keys found so far
+    key_checksum  INOUT  Sum of maria_byte_checksum() over all keys
+    level         Depth of this page; used to maintain param->max_level
+
+  NOTES
+    Spatial and RTree indexes are not checked; 0 is returned for them.
+    For node pages each child is checked through chk_index_down() before
+    the key that follows the child pointer is read.
+
+  RETURN
+    0   ok
+    1   Error found (already reported) or check was killed
+    -1  Could not allocate the buffer for child pages
+*/
+
+static int chk_index(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+		     MARIA_PAGE *anc_page, ha_rows *keys,
+		     ha_checksum *key_checksum, uint level)
+{
+  int flag;
+  uint comp_flag, page_flag, nod_flag;
+  uchar *temp_buff, *keypos, *old_keypos, *endpos;
+  my_off_t next_page,record;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22];
+  uint diff_pos[2];
+  uchar tmp_key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_KEY tmp_key;
+  DBUG_ENTER("chk_index");
+  DBUG_DUMP("buff", anc_page->buff, anc_page->size);
+
+  /* TODO: implement appropriate check for RTree keys */
+  if (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX))
+    DBUG_RETURN(0);
+
+  /* Buffer that child pages are read into by chk_index_down() */
+  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    _ma_check_print_error(param,"Not enough memory for keyblock");
+    DBUG_RETURN(-1);
+  }
+
+  if (keyinfo->flag & HA_NOSAME)
+  {
+    /* Not real duplicates */
+    comp_flag=SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT;
+  }
+  else
+    comp_flag=SEARCH_SAME;			/* Keys in positionorder */
+
+  /* Keys start after the page header (and the first child pointer) */
+  page_flag=  anc_page->flag;
+  nod_flag=   anc_page->node;
+  old_keypos= anc_page->buff + share->keypage_header;
+  keypos=     old_keypos + nod_flag;
+  endpos=     anc_page->buff + anc_page->size;
+
+  param->keydata+=   anc_page->size;
+  param->totaldata+= keyinfo->block_length;	/* INFO */
+  param->key_blocks++;
+  if (level > param->max_level)
+    param->max_level=level;
+
+  /* These two problems are reported but do not stop the check */
+  if (_ma_get_keynr(share, anc_page->buff) !=
+      (uint) (keyinfo - share->keyinfo))
+    _ma_check_print_error(param, "Page at %s is not marked for index %u",
+                          llstr(anc_page->pos, llbuff),
+                          (uint) (keyinfo - share->keyinfo));
+  if ((page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
+      !share->base.born_transactional)
+  {
+    _ma_check_print_error(param,
+                          "Page at %s is marked with HAS_TRANSID even if "
+                          "table is not transactional",
+                          llstr(anc_page->pos, llbuff));
+  }
+
+  if (anc_page->size > share->max_index_block_size)
+  {
+    _ma_check_print_error(param,
+                          "Page at %s has impossible (too big) pagelength",
+                          llstr(anc_page->pos, llbuff));
+    goto err;
+  }
+
+  info->last_key.keyinfo= tmp_key.keyinfo= keyinfo;
+  tmp_key.data= tmp_key_buff;
+  for ( ;; )
+  {
+    /*
+      On a node page, check the child that the pointer before keypos
+      refers to. This is done before the end-of-page test below, so the
+      child after the last key is also checked.
+    */
+    if (nod_flag)
+    {
+      if (_ma_killed_ptr(param))
+        goto err;
+      next_page= _ma_kpos(nod_flag,keypos);
+      if (chk_index_down(param,info,keyinfo,next_page,
+                         temp_buff,keys,key_checksum,level+1))
+      {
+        DBUG_DUMP("page_data", old_keypos, (uint) (keypos - old_keypos));
+	goto err;
+      }
+    }
+    old_keypos=keypos;
+    /* Stop at the end of the page or when no more keys can be unpacked */
+    if (keypos >= endpos ||
+	!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
+      break;
+    if (keypos > endpos)
+    {
+      _ma_check_print_error(param,
+                            "Page length and length of keys don't match at "
+                            "page: %s",
+                            llstr(anc_page->pos,llbuff));
+      goto err;
+    }
+    if (share->data_file_type == BLOCK_RECORD &&
+        !(page_flag & KEYPAGE_FLAG_HAS_TRANSID) &&
+        key_has_transid(tmp_key.data + tmp_key.data_length +
+                        share->rec_reflength-1))
+    {
+      _ma_check_print_error(param,
+                            "Found key marked for transid on page that is not "
+                            "marked for transid at: %s",
+                            llstr(anc_page->pos,llbuff));
+      goto err;
+    }
+
+    /*
+      Compare with the previous key; skipped for the very first key
+      (*keys was 0). A result >= 0 from ha_key_cmp() is treated as an
+      error: either a duplicate or a key in the wrong position.
+    */
+    if ((*keys)++ &&
+	(flag=ha_key_cmp(keyinfo->seg, info->last_key.data, tmp_key.data,
+                         tmp_key.data_length + tmp_key.ref_length,
+                         (comp_flag | SEARCH_INSERT | (tmp_key.flag >> 1) |
+                          info->last_key.flag), diff_pos)) >=0)
+    {
+      DBUG_DUMP_KEY("old", &info->last_key);
+      DBUG_DUMP_KEY("new", &tmp_key);
+      DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
+
+      if ((comp_flag & SEARCH_FIND) && flag == 0)
+	_ma_check_print_error(param,"Found duplicated key at page %s",
+                              llstr(anc_page->pos,llbuff));
+      else
+	_ma_check_print_error(param,"Key in wrong position at page %s",
+                              llstr(anc_page->pos,llbuff));
+      goto err;
+    }
+
+    if (param->testflag & T_STATISTICS)
+    {
+      if (*keys != 1L)				/* not first_key */
+      {
+        /*
+          diff_pos[0] is 1 + the first key part that differs from the
+          previous key. For the other statistics methods the value from
+          the order check above is used as it is.
+        */
+        if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+          ha_key_cmp(keyinfo->seg, info->last_key.data,
+                     tmp_key.data, tmp_key.data_length,
+                     SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL,
+                     diff_pos);
+        else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+        {
+          diff_pos[0]= maria_collect_stats_nonulls_next(keyinfo->seg,
+                                                        param->notnull_count,
+                                                        info->last_key.data,
+                                                        tmp_key.data);
+        }
+	param->unique_count[diff_pos[0]-1]++;
+      }
+      else
+      {
+        if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+          maria_collect_stats_nonulls_first(keyinfo->seg, param->notnull_count,
+                                            tmp_key.data);
+      }
+    }
+    /* Remember this key as the previous key for the next iteration */
+    _ma_copy_key(&info->last_key, &tmp_key);
+    (*key_checksum)+= maria_byte_checksum(tmp_key.data, tmp_key.data_length);
+    record= _ma_row_pos_from_key(&tmp_key);
+
+    if (keyinfo->flag & HA_FULLTEXT) /* special handling for ft2 */
+    {
+      uint off;
+      int  subkeys;
+      get_key_full_length_rdonly(off, tmp_key.data);
+      subkeys= ft_sintXkorr(tmp_key.data + off);
+      if (subkeys < 0)
+      {
+        /*
+          A negative count means that 'record' is not a row position but
+          the root page of a second level tree with -subkeys keys.
+        */
+        ha_rows tmp_keys=0;
+        if (chk_index_down(param,info,&share->ft2_keyinfo,record,
+                           temp_buff,&tmp_keys,key_checksum,1))
+          goto err;
+        if (tmp_keys + subkeys)
+        {
+          _ma_check_print_error(param,
+                               "Number of words in the 2nd level tree "
+                               "does not match the number in the header. "
+                               "Parent word in on the page %s, offset %u",
+                               llstr(anc_page->pos,llbuff),
+                                (uint) (old_keypos - anc_page->buff));
+          goto err;
+        }
+        /* The parent key was already counted once above */
+        (*keys)+=tmp_keys-1;
+        continue;
+      }
+      /* fall through */
+    }
+    /* The key must point to a position inside the data file */
+    if ((share->data_file_type != BLOCK_RECORD &&
+         record >= share->state.state.data_file_length) ||
+        (share->data_file_type == BLOCK_RECORD &&
+         ma_recordpos_to_page(record) * share->base.min_block_length >=
+         share->state.state.data_file_length))
+    {
+#ifndef DBUG_OFF
+      char llbuff2[22], llbuff3[22];
+#endif
+      _ma_check_print_error(param,
+                            "Found key at page %s that points to record "
+                            "outside datafile",
+                            llstr(anc_page->pos,llbuff));
+      DBUG_PRINT("test",("page: %s  record: %s  filelength: %s",
+			 llstr(anc_page->pos,llbuff),llstr(record,llbuff2),
+			 llstr(share->state.state.data_file_length,llbuff3)));
+      DBUG_DUMP_KEY("key", &tmp_key);
+      DBUG_DUMP("new_in_page", old_keypos, (uint) (keypos-old_keypos));
+      goto err;
+    }
+    param->record_checksum+= (ha_checksum) record;
+  }
+  /* After the last key the position must be exactly at the end of page */
+  if (keypos != endpos)
+  {
+    _ma_check_print_error(param,
+                          "Keyblock size at page %s is not correct. "
+                          "Block length: %u  key length: %u",
+                          llstr(anc_page->pos, llbuff), anc_page->size,
+                          (uint) (keypos - anc_page->buff));
+    goto err;
+  }
+  my_afree(temp_buff);
+  DBUG_RETURN(0);
+ err:
+  my_afree(temp_buff);
+  DBUG_RETURN(1);
+} /* chk_index */
+
+
+/*
+  Calculate the checksum 1+2+3+4...N = N*(N+1)/2
+
+  NOTES
+    The division by two is done first, on whichever of N and N+1 is even,
+    so nothing is lost by it. The product is then built with shift and
+    add; the result is cut down to ha_checksum on return.
+*/
+
+static ha_checksum calc_checksum(ha_rows count)
+{
+  ulonglong total= 0;
+  ulonglong addend= count;
+  ulonglong multiplier= count+1;
+  DBUG_ENTER("calc_checksum");
+
+  /* Halve the even factor */
+  if (addend & 1)
+    multiplier>>= 1;
+  else
+    addend>>= 1;
+
+  /* total= addend * multiplier, one bit of multiplier at a time */
+  for ( ; multiplier ; multiplier>>= 1, addend<<= 1)
+  {
+    if (multiplier & 1)
+      total+= addend;
+  }
+  DBUG_PRINT("exit",("sum: %lx",(ulong) total));
+  DBUG_RETURN((ha_checksum) total);
+} /* calc_checksum */
+
+
+/*
+  Calculate the length of a key in normal isam format
+
+  NOTES
+    The result is the row reference length plus the length of every key
+    segment. The segment array ends with a segment that has type 0.
+*/
+
+static uint isam_key_length(MARIA_HA *info, register MARIA_KEYDEF *keyinfo)
+{
+  uint total_length;
+  HA_KEYSEG *seg;
+  DBUG_ENTER("isam_key_length");
+
+  total_length= info->s->rec_reflength;
+  seg= keyinfo->seg;
+  while (seg->type)
+  {
+    total_length+= seg->length;
+    seg++;
+  }
+
+  DBUG_PRINT("exit",("length: %d",total_length));
+  DBUG_RETURN(total_length);
+} /* isam_key_length */
+
+
+
+/*
+  Convert a row position to text for messages
+
+  NOTES
+    For BLOCK_RECORD tables the text is "page:directory_entry", for all
+    other formats it is the position as a plain number.
+    The caller supplies 'buff'; it has to be big enough for the text.
+*/
+
+static void record_pos_to_txt(MARIA_HA *info, my_off_t recpos,
+                              char *buff)
+{
+  if (info->s->data_file_type == BLOCK_RECORD)
+  {
+    my_off_t page_no= ma_recordpos_to_page(recpos);
+    uint dir_entry= ma_recordpos_to_dir_entry(recpos);
+    char *separator= longlong10_to_str(page_no, buff, 10);
+
+    *separator= ':';
+    longlong10_to_str(dir_entry, separator + 1, 10);
+    return;
+  }
+  llstr(recpos, buff);
+}
+
+
+/*
+  Check that the keys of a row exist in the index trees
+
+  SYNOPSIS
+  check_keys_in_record()
+  param		Check parameter
+  info		Maria handler
+  extend	Type of check (extended or normal)
+  start_recpos	Position of the row
+  record	Record buffer
+
+  NOTES
+    This function also adds the row to the record checksum and to the
+    number of rows.
+    In an extended check every key (except fulltext keys) is searched for
+    in its index. In a normal check only the checksum of the key is added
+    to param->tmp_key_crc[].
+
+  RETURN
+    0   ok
+    -1  A key was not found and either T_VERBOSE is not set or too many
+        errors have been found
+*/
+
+static int check_keys_in_record(HA_CHECK *param, MARIA_HA *info, int extend,
+                                my_off_t start_recpos, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= share->keyinfo;
+  char pos_txt[22+4];
+  uint keynr;
+
+  param->tmp_record_checksum+= (ha_checksum) start_recpos;
+  param->records++;
+  if (param->testflag & T_WRITE_LOOP && param->records % WRITE_COUNT == 0)
+  {
+    printf("%s\r", llstr(param->records, pos_txt));
+    VOID(fflush(stdout));
+  }
+
+  /* Check if keys match the record */
+  for (keynr= 0; keynr < share->base.keys; keynr++, keyinfo++)
+  {
+    MARIA_KEY key;
+    int not_found;
+
+    /* Disabled keys and fulltext keys are not checked here */
+    if (!maria_is_key_active(share->state.key_map, keynr) ||
+        (keyinfo->flag & HA_FULLTEXT))
+      continue;
+
+    (*keyinfo->make_key)(info, &key, keynr, info->lastkey_buff, record,
+                         start_recpos, 0);
+    if (!extend)
+    {
+      param->tmp_key_crc[keynr]+=
+        maria_byte_checksum(key.data, key.data_length);
+      continue;
+    }
+
+    /*
+      We don't need to lock the key tree here as we don't allow
+      concurrent threads when running maria_chk
+    */
+    not_found=
+#ifdef HAVE_RTREE_KEYS
+      (keyinfo->flag & (HA_SPATIAL | HA_RTREE_INDEX)) ?
+      maria_rtree_find_first(info, &key, MBR_EQUAL | MBR_DATA) :
+#endif
+      _ma_search(info, &key, SEARCH_SAME, share->state.key_root[keynr]);
+    if (not_found)
+    {
+      record_pos_to_txt(info, start_recpos, pos_txt);
+      _ma_check_print_error(param,
+                            "Record at: %14s  "
+                            "Can't find key for index: %2d",
+                            pos_txt, keynr+1);
+      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+        return -1;
+    }
+  }
+  return 0;
+}
+
+
+/*
+  Functions to loop through all rows and check if they are ok
+
+  NOTES
+    One function for each record format
+
+  RESULT
+    0  ok
+    -1 Interrupted by user
+    1  Error
+*/
+
+/*
+  Check all rows of a table with fixed length records
+
+  NOTES
+    Rows are read one after another through param->read_cache. A row whose
+    first byte is 0 is counted as deleted. For all other rows the row
+    checksum is added to param->glob_crc and the keys are checked with
+    check_keys_in_record().
+*/
+
+static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend,
+                               uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t row_start, next_pos;
+  char pos_txt[22];
+
+  for (next_pos= 0; next_pos < share->state.state.data_file_length; )
+  {
+    if (_ma_killed_ptr(param))
+      return -1;
+    if (my_b_read(&param->read_cache, record,
+                  share->base.pack_reclength))
+    {
+      _ma_check_print_error(param,
+                            "got error: %d when reading datafile at position: "
+                            "%s",
+                            my_errno, llstr(next_pos, pos_txt));
+      return 1;
+    }
+    row_start= next_pos;
+    next_pos+= share->base.pack_reclength;
+    param->splits++;
+
+    if (*record != '\0')
+    {
+      /* Row in use */
+      param->glob_crc+= _ma_static_checksum(info,record);
+      param->used+= share->base.pack_reclength;
+      if (check_keys_in_record(param, info, extend, row_start, record))
+        return 1;
+    }
+    else
+    {
+      /* Record removed */
+      param->del_blocks++;
+      param->del_length+= share->base.pack_reclength;
+    }
+  }
+  return 0;
+}
+
+
+/*
+  Check all records in a data file that uses the dynamic record format
+
+  NOTES
+    Scans the data file from position 0 up to
+    share->state.state.data_file_length.  For every record the chain of
+    blocks is followed and the stored data is collected in info->rec_buff;
+    the record is then unpacked into 'record' and its keys are checked
+    with check_keys_in_record().  Statistics (splits, used, empty,
+    link_used, del_blocks, del_length, glob_crc) are accumulated in 'param'.
+
+  RETURN
+    0   ok
+    1   error
+    -1  _ma_killed_ptr(param) returned non-zero
+*/
+
+static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend,
+                                uchar *record)
+{
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  my_off_t start_recpos, start_block, pos;
+  uchar *to;
+  ulong left_length;
+  uint	b_type;
+  char llbuff[22],llbuff2[22],llbuff3[22];
+  DBUG_ENTER("check_dynamic_record");
+
+  LINT_INIT(left_length);
+  LINT_INIT(start_recpos);
+  LINT_INIT(to);
+
+  pos= 0;
+  while (pos < share->state.state.data_file_length)
+  {
+    my_bool got_error= 0;
+    int flag;
+    if (_ma_killed_ptr(param))
+      DBUG_RETURN(-1);
+
+    /* flag counts how many blocks of the current record have been accepted */
+    flag= block_info.second_read=0;
+    block_info.next_filepos=pos;
+    do
+    {
+      /* Read the header of the next block in the chain */
+      if (_ma_read_cache(&param->read_cache, block_info.header,
+                         (start_block=block_info.next_filepos),
+                         sizeof(block_info.header),
+                         (flag ? 0 : READING_NEXT) | READING_HEADER))
+      {
+        _ma_check_print_error(param,
+                              "got error: %d when reading datafile at "
+                              "position: %s",
+                              my_errno, llstr(start_block, llbuff));
+        DBUG_RETURN(1);
+      }
+
+      /* Block start must be a multiple of MARIA_DYN_ALIGN_SIZE */
+      if (start_block & (MARIA_DYN_ALIGN_SIZE-1))
+      {
+        _ma_check_print_error(param,"Wrong aligned block at %s",
+                              llstr(start_block,llbuff));
+        DBUG_RETURN(1);
+      }
+      b_type= _ma_get_block_info(&block_info,-1,start_block);
+      if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+                    BLOCK_FATAL_ERROR))
+      {
+        if (b_type & BLOCK_SYNC_ERROR)
+        {
+          /*
+            A sync error inside a chain (flag != 0) is fatal; on the first
+            block we just step over it and continue with the next position.
+          */
+          if (flag)
+          {
+            _ma_check_print_error(param,"Unexpected byte: %d at link: %s",
+                                  (int) block_info.header[0],
+                                  llstr(start_block,llbuff));
+            DBUG_RETURN(1);
+          }
+          pos=block_info.filepos+block_info.block_len;
+          goto next;
+        }
+        if (b_type & BLOCK_DELETED)
+        {
+          /* Validate the deleted block and account for it in the statistics */
+          if (block_info.block_len < share->base.min_block_length)
+          {
+            _ma_check_print_error(param,
+                                  "Deleted block with impossible length %lu "
+                                  "at %s",
+                                  block_info.block_len,llstr(pos,llbuff));
+            DBUG_RETURN(1);
+          }
+          /* Both delete-chain links must be unset or inside the data file */
+          if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+               block_info.next_filepos >= share->state.state.data_file_length) ||
+              (block_info.prev_filepos != HA_OFFSET_ERROR &&
+               block_info.prev_filepos >= share->state.state.data_file_length))
+          {
+            _ma_check_print_error(param,"Delete link points outside datafile "
+                                  "at %s",
+                                  llstr(pos,llbuff));
+            DBUG_RETURN(1);
+          }
+          param->del_blocks++;
+          param->del_length+= block_info.block_len;
+          param->splits++;
+          pos= block_info.filepos+block_info.block_len;
+          goto next;
+        }
+        /* Remaining cases: BLOCK_ERROR or BLOCK_FATAL_ERROR */
+        _ma_check_print_error(param,"Wrong bytesec: %d-%d-%d at linkstart: %s",
+                              block_info.header[0],block_info.header[1],
+                              block_info.header[2],
+                              llstr(start_block,llbuff));
+        DBUG_RETURN(1);
+      }
+      /* The whole block must lie inside the data file */
+      if (share->state.state.data_file_length < block_info.filepos+
+          block_info.block_len)
+      {
+        _ma_check_print_error(param,
+                              "Recordlink that points outside datafile at %s",
+                              llstr(pos,llbuff));
+        got_error=1;
+        break;
+      }
+      param->splits++;
+      if (!flag++)				/* First block */
+      {
+        /* Remember where the record starts and advance the scan position */
+        start_recpos=pos;
+        pos=block_info.filepos+block_info.block_len;
+        if (block_info.rec_len > (uint) share->base.max_pack_length)
+        {
+          _ma_check_print_error(param,"Found too long record (%lu) at %s",
+                                (ulong) block_info.rec_len,
+                                llstr(start_recpos,llbuff));
+          got_error=1;
+          break;
+        }
+        if (share->base.blobs)
+        {
+          /* With blobs, make sure rec_buff can hold the whole record */
+          if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+                               block_info.rec_len +
+                               share->base.extra_rec_buff_size))
+
+          {
+            _ma_check_print_error(param,
+                                  "Not enough memory (%lu) for blob at %s",
+                                  (ulong) block_info.rec_len,
+                                  llstr(start_recpos,llbuff));
+            got_error=1;
+            break;
+          }
+        }
+        to= info->rec_buff;
+        left_length= block_info.rec_len;
+      }
+      /* A block may not carry more data than what is left of the record */
+      if (left_length < block_info.data_len)
+      {
+        _ma_check_print_error(param,"Found too long record (%lu) at %s",
+                              (ulong) block_info.data_len,
+                              llstr(start_recpos,llbuff));
+        got_error=1;
+        break;
+      }
+      /* Append this block's data to the record collected so far */
+      if (_ma_read_cache(&param->read_cache, to, block_info.filepos,
+                         (uint) block_info.data_len,
+                         flag == 1 ? READING_NEXT : 0))
+      {
+        _ma_check_print_error(param,
+                              "got error: %d when reading datafile at "
+                              "position: %s", my_errno,
+                              llstr(block_info.filepos, llbuff));
+
+        DBUG_RETURN(1);
+      }
+      to+=block_info.data_len;
+      /* filepos - start_block is the size of the block header */
+      param->link_used+= block_info.filepos-start_block;
+      param->used+= block_info.filepos - start_block + block_info.data_len;
+      param->empty+= block_info.block_len-block_info.data_len;
+      left_length-= block_info.data_len;
+      if (left_length)
+      {
+        /* More data is expected, so this may not be the last block */
+        if (b_type & BLOCK_LAST)
+        {
+          _ma_check_print_error(param,
+                                "Wrong record length %s of %s at %s",
+                                llstr(block_info.rec_len-left_length,llbuff),
+                                llstr(block_info.rec_len, llbuff2),
+                                llstr(start_recpos,llbuff3));
+          got_error=1;
+          break;
+        }
+        if (share->state.state.data_file_length < block_info.next_filepos)
+        {
+          _ma_check_print_error(param,
+                                "Found next-recordlink that points outside "
+                                "datafile at %s",
+                                llstr(block_info.filepos,llbuff));
+          got_error=1;
+          break;
+        }
+      }
+    } while (left_length);
+
+    if (! got_error)
+    {
+      /* The whole record is in rec_buff; unpack it and verify it */
+      if (_ma_rec_unpack(info,record,info->rec_buff,block_info.rec_len) ==
+          MY_FILE_ERROR)
+      {
+        _ma_check_print_error(param,"Found wrong record at %s",
+                              llstr(start_recpos,llbuff));
+        got_error=1;
+      }
+      else
+      {
+        ha_checksum checksum= 0;
+        if (share->calc_checksum)
+          checksum= (*share->calc_checksum)(info, record);
+
+        /* The extra packed-record check is only done for these test flags */
+        if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE))
+        {
+          if (_ma_rec_check(info,record, info->rec_buff,block_info.rec_len,
+                            test(share->calc_checksum), checksum))
+          {
+            _ma_check_print_error(param,"Found wrong packed record at %s",
+                                  llstr(start_recpos,llbuff));
+            got_error= 1;
+          }
+        }
+        param->glob_crc+= checksum;
+      }
+
+      if (! got_error)
+      {
+        if (check_keys_in_record(param, info, extend, start_recpos, record))
+          DBUG_RETURN(1);
+      }
+      else
+      {
+        /* Continue after a bad record only in verbose mode, up to MAXERR */
+        if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+          DBUG_RETURN(1);
+      }
+    }
+    else if (!flag)
+      /*
+        The error happened before the first block was accepted, so pos has
+        not been advanced yet; step over the offending block.
+      */
+      pos= block_info.filepos+block_info.block_len;
+next:;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check all rows in a data file that uses the compressed record format
+
+  NOTES
+    Starts after the pack header and reads one row at a time until
+    share->state.state.data_file_length is reached.  Each row is unpacked
+    into 'record', added to the statistics in 'param' and handed to
+    check_keys_in_record().
+
+  RETURN
+    0   ok
+    1   error
+    -1  _ma_killed_ptr(param) returned non-zero
+*/
+
+static int check_compressed_record(HA_CHECK *param, MARIA_HA *info, int extend,
+                                   uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_BLOCK_INFO block_info;
+  my_off_t next_pos, row_start;
+  char posbuf[22];
+  DBUG_ENTER("check_compressed_record");
+
+  /* The first row is stored directly after the pack header */
+  for (next_pos= share->pack.header_length;
+       next_pos < share->state.state.data_file_length; )
+  {
+    my_bool row_is_bad= 0;                      /* Tracked per row */
+
+    if (_ma_killed_ptr(param))
+      DBUG_RETURN(-1);
+
+    /* Fetch the row header */
+    if (_ma_read_cache(&param->read_cache, block_info.header, next_pos,
+                       share->pack.ref_length, READING_NEXT))
+    {
+      _ma_check_print_error(param,
+                            "got error: %d when reading datafile at position: "
+                            "%s",
+                            my_errno, llstr(next_pos, posbuf));
+      DBUG_RETURN(1);
+    }
+
+    row_start= next_pos;
+    param->splits++;
+    VOID(_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+                                 &info->rec_buff, &info->rec_buff_size, -1,
+                                 row_start));
+    /* The next row follows directly after this row's data */
+    next_pos= block_info.filepos + block_info.rec_len;
+
+    if (block_info.rec_len < (uint) share->min_pack_length ||
+        block_info.rec_len > (uint) share->max_pack_length)
+    {
+      _ma_check_print_error(param,
+                            "Found block with wrong recordlength: %lu at %s",
+                            block_info.rec_len, llstr(row_start, posbuf));
+      row_is_bad= 1;
+    }
+    else
+    {
+      /* Length looks sane: read the packed data and unpack it */
+      if (_ma_read_cache(&param->read_cache, info->rec_buff,
+                         block_info.filepos, block_info.rec_len, READING_NEXT))
+      {
+        _ma_check_print_error(param,
+                              "got error: %d when reading datafile at position: "
+                              "%s",
+                              my_errno, llstr(block_info.filepos, posbuf));
+        DBUG_RETURN(1);
+      }
+      if (_ma_pack_rec_unpack(info, &info->bit_buff, record,
+                              info->rec_buff, block_info.rec_len))
+      {
+        _ma_check_print_error(param,"Found wrong record at %s",
+                              llstr(row_start, posbuf));
+        row_is_bad= 1;
+      }
+      else
+      {
+        param->glob_crc+= (*share->calc_checksum)(info,record);
+        param->link_used+= (block_info.filepos - row_start);
+        param->used+= (next_pos - row_start);
+      }
+    }
+
+    if (row_is_bad)
+    {
+      /* Go on to the next row only in verbose mode, up to MAXERR errors */
+      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+        DBUG_RETURN(1);
+      continue;
+    }
+    if (check_keys_in_record(param, info, extend, row_start, record))
+      DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check if layout on head or tail page is ok
+
+  SYNOPSIS
+    check_page_layout()
+    param              Check parameters; 'used' and 'err_count' are updated
+    info               Table handler (used for block size)
+    page_pos           Position of page; only used in error messages
+    page               Page buffer
+    row_count          Number of entries in the page directory
+    head_empty         Empty space to compare against the computed value
+                       (reported as "Stored" on mismatch)
+    real_rows_found    OUT  Number of directory entries that are in use
+    free_slots_found   OUT  Number of entries found in the directory free
+                            list
+
+  NOTES
+    This is for rows-in-block format.
+
+    Valid directory entry numbers are 0 .. row_count-1, so a free list
+    entry equal to row_count already points outside the directory.
+
+  RETURN
+    0  ok, or errors were found but the check may continue
+    1  error
+*/
+
+static int check_page_layout(HA_CHECK *param, MARIA_HA *info,
+                             my_off_t page_pos, uchar *page,
+                             uint row_count, uint head_empty,
+                             uint *real_rows_found, uint *free_slots_found)
+{
+  uint empty, last_row_end, row, first_dir_entry, free_entry, block_size;
+  uint free_entries, prev_free_entry;
+  uchar *dir_entry;
+  char llbuff[22];
+  my_bool error_in_free_list= 0;
+  DBUG_ENTER("check_page_layout");
+
+  block_size= info->s->block_size;
+  empty= 0;
+  last_row_end= PAGE_HEADER_SIZE;
+  *real_rows_found= 0;
+
+  /* Check free directory list */
+  free_entry= (uint) page[DIR_FREE_OFFSET];
+  free_entries= 0;
+  prev_free_entry= END_OF_DIR_FREE_LIST;
+  while (free_entry != END_OF_DIR_FREE_LIST)
+  {
+    uchar *dir;
+    if (free_entry >= row_count)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Directory free entry points outside "
+                            "directory",
+                            llstr(page_pos, llbuff));
+      error_in_free_list= 1;
+      break;
+    }
+    dir= dir_entry_pos(page, block_size, free_entry);
+    /* A free entry must have position 0 */
+    if (uint2korr(dir) != 0)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Directory free entry points to "
+                            "not deleted entry",
+                            llstr(page_pos, llbuff));
+      error_in_free_list= 1;
+      break;
+    }
+    /* dir[2] is the back link and dir[3] the forward link of the list */
+    if (dir[2] != prev_free_entry)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Directory free list back pointer "
+                            "points to wrong entry",
+                            llstr(page_pos, llbuff));
+      error_in_free_list= 1;
+      break;
+    }
+    prev_free_entry= free_entry;
+    free_entry= dir[3];
+    free_entries++;
+  }
+  *free_slots_found= free_entries;
+
+  /* Check directory */
+  dir_entry= page+ block_size - PAGE_SUFFIX_SIZE;
+  first_dir_entry= (block_size - row_count * DIR_ENTRY_SIZE -
+                    PAGE_SUFFIX_SIZE);
+  for (row= 0 ; row < row_count ; row++)
+  {
+    uint pos, length;
+    dir_entry-= DIR_ENTRY_SIZE;
+    pos= uint2korr(dir_entry);
+    if (!pos)
+    {
+      /* Should end at 0 if every deleted entry was in the free list */
+      free_entries--;
+      if (row == row_count -1)
+      {
+        _ma_check_print_error(param,
+                              "Page %9s:  First entry in directory is 0",
+                              llstr(page_pos, llbuff));
+        if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+          DBUG_RETURN(1);
+      }
+      continue;                                 /* Deleted row */
+    }
+    (*real_rows_found)++;
+    length= uint2korr(dir_entry+2);
+    param->used+= length;
+    /* Rows must be stored in increasing position order without overlap */
+    if (pos < last_row_end)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Row %3u overlapps with previous row",
+                            llstr(page_pos, llbuff), row);
+      DBUG_RETURN(1);
+    }
+    empty+= (pos - last_row_end);
+    last_row_end= pos + length;
+    if (last_row_end > first_dir_entry)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Row %3u overlapps with directory",
+                            llstr(page_pos, llbuff), row);
+      DBUG_RETURN(1);
+    }
+  }
+  /* Space between the last row and the start of the directory */
+  empty+= (first_dir_entry - last_row_end);
+
+  if (empty != head_empty)
+  {
+    _ma_check_print_error(param,
+                          "Page %9s:  Wrong empty size.  Stored: %5u  "
+                          "Actual: %5u",
+                          llstr(page_pos, llbuff), head_empty, empty);
+    param->err_count++;
+  }
+  if (free_entries != 0 && !error_in_free_list)
+  {
+    _ma_check_print_error(param,
+                          "Page %9s:  Directory free link don't include "
+                          "all free entries",
+                          llstr(page_pos, llbuff));
+    param->err_count++;
+  }
+  DBUG_RETURN(param->err_count &&
+              (param->err_count >= MAXERR || !(param->testflag & T_VERBOSE)));
+}
+
+
+/*
+  Verify every row stored on a head page
+
+  NOTES
+    Only used for the rows-in-block format.
+
+    check_page_layout() has already been called for the page, so the
+    block itself is known to be logically correct (even if individual rows
+    may not be).
+
+  RETURN
+   0  ok
+   1  error
+*/
+
+
+static my_bool check_head_page(HA_CHECK *param, MARIA_HA *info, uchar *record,
+                               int extend, my_off_t page_pos, uchar *page_buff,
+                               uint row_count)
+{
+  MARIA_SHARE *share= info->s;
+  ulonglong page= page_pos / share->block_size;
+  uchar *dir_entry;
+  uint row;
+  char page_str[22], extent_str[22];
+  DBUG_ENTER("check_head_page");
+
+  /* Directory entries are visited from the end of the page and downwards */
+  dir_entry= page_buff + share->block_size - PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE;
+  for (row= 0 ; row < row_count ; row++, dir_entry-= DIR_ENTRY_SIZE)
+  {
+    uint row_start, row_length, row_flag, extent_no;
+    uchar *extent_ptr;
+
+    row_start= uint2korr(dir_entry);
+    if (row_start == 0)
+      continue;                                 /* Deleted row */
+
+    row_length= uint2korr(dir_entry+2);
+    if (row_length < share->base.min_block_length)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Row %3u is too short "
+                            "(%d of min %d bytes)",
+                            llstr(page, page_str), row, row_length,
+                            (uint) share->base.min_block_length);
+      DBUG_RETURN(1);
+    }
+
+    /* An unknown flag bit is reported, but the row is still read */
+    row_flag= (uint) (uchar) page_buff[row_start];
+    if ((row_flag & ~(ROW_FLAG_ALL)) != 0)
+      _ma_check_print_error(param,
+                            "Page %9s: Row %3u has wrong flag: %u",
+                            llstr(page, page_str), row, row_flag);
+
+    DBUG_PRINT("info", ("rowid: %s  page: %lu  row: %u",
+                        llstr(ma_recordpos(page, row), page_str),
+                        (ulong) page, row));
+    info->cur_row.trid= 0;
+    if (_ma_read_block_record2(info, record, page_buff + row_start,
+                               page_buff + row_start + row_length))
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Row %3d is crashed",
+                            llstr(page, page_str), row);
+      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+        DBUG_RETURN(1);
+      continue;
+    }
+
+    /* Keep track of the biggest transaction id seen in any row */
+    set_if_bigger(param->max_found_trid, info->cur_row.trid);
+    if (info->cur_row.trid > param->max_trid)
+      _ma_check_print_not_visible_error(param, info->cur_row.trid);
+
+    if (share->calc_checksum)
+    {
+      /* Only the low 8 bits of the checksum are compared with the row's */
+      ha_checksum row_crc= (*share->calc_checksum)(info, record);
+      if (info->cur_row.checksum != (row_crc & 255))
+        _ma_check_print_error(param, "Page %9s:  Row %3d has wrong checksum",
+                              llstr(page, page_str), row);
+      param->glob_crc+= row_crc;
+    }
+
+    /* Check that bitmap has the right marker for the found extents */
+    extent_ptr= info->cur_row.extents;
+    for (extent_no= 0 ;
+         extent_no < info->cur_row.extents_count ;
+         extent_no++)
+    {
+      pgcache_page_no_t extent_page;
+      uint page_count, page_type;
+
+      extent_page= uint5korr(extent_ptr);
+      page_count=  uint2korr(extent_ptr+5) & ~START_EXTENT_BIT;
+      extent_ptr+= ROW_EXTENT_SIZE;
+      if (page_count & TAIL_BIT)
+      {
+        /* A tail extent always refers to exactly one page */
+        page_count= 1;
+        page_type= TAIL_PAGE;
+      }
+      else
+        page_type= BLOB_PAGE;
+
+      /*
+        TODO OPTIMIZE:
+        Check the whole extent with one test and only do the loop if
+        something is wrong (for exact error reporting)
+      */
+      while (page_count--)
+      {
+        uint bitmap_pattern;
+        if (_ma_check_if_right_bitmap_type(info, page_type, extent_page,
+                                           &bitmap_pattern))
+        {
+          _ma_check_print_error(param,
+                                "Page %9s:  Row: %3d has an extent with "
+                                "wrong information in bitmap:  "
+                                "Page: %9s  Page_type: %d  Bitmap: %d",
+                                llstr(page, page_str), row,
+                                llstr(extent_page, extent_str),
+                                page_type, bitmap_pattern);
+          if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+            DBUG_RETURN(1);
+        }
+        extent_page++;
+      }
+    }
+
+    param->full_page_count+= info->cur_row.full_page_count;
+    param->tail_count+= info->cur_row.tail_count;
+    if (check_keys_in_record(param, info, extend,
+                             ma_recordpos(page, row), record))
+      DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Check if rows-in-block data file is consistent
+
+  NOTES
+    The data file is scanned one page (share->block_size bytes) at a time.
+    A page whose number is a multiple of share->bitmap.pages_covered is
+    read as a bitmap page into the scan's bitmap buffer; every other page
+    is first looked up in that bitmap and skipped if its bits are all 0.
+    Head and tail pages are verified with check_page_layout() and the rows
+    on head pages with check_head_page().
+
+  RETURN
+    0   ok
+    1   error
+    -1  _ma_killed_ptr(param) returned non-zero
+*/
+
+static int check_block_record(HA_CHECK *param, MARIA_HA *info, int extend,
+                              uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t pos;
+  pgcache_page_no_t page;
+  uchar *page_buff, *bitmap_buff, *data;
+  char llbuff[22], llbuff2[22];
+  uint block_size= share->block_size;
+  ha_rows full_page_count, tail_count;
+  my_bool full_dir;
+  uint offset_page, offset, free_count;
+
+  LINT_INIT(full_dir);
+
+  if (_ma_scan_init_block_record(info))
+  {
+    _ma_check_print_error(param, "got error %d when initializing scan",
+                          my_errno);
+    return 1;
+  }
+  bitmap_buff= info->scan.bitmap_buff;
+  page_buff= info->scan.page_buff;
+  /*
+    full_page_count and tail_count are counted while scanning pages;
+    the param-> counters are summed from the rows in check_head_page().
+    The two sets are compared at the end.
+  */
+  full_page_count= tail_count= 0;
+  param->full_page_count= param->tail_count= 0;
+  param->used= param->link_used= 0;
+  param->splits= share->state.state.data_file_length / block_size;
+
+  for (pos= 0, page= 0;
+       pos < share->state.state.data_file_length;
+       pos+= block_size, page++)
+  {
+    uint row_count, real_row_count, empty_space, page_type, bitmap_pattern;
+    LINT_INIT(row_count);
+    LINT_INIT(empty_space);
+
+    if (_ma_killed_ptr(param))
+    {
+      _ma_scan_end_block_record(info);
+      return -1;
+    }
+    if ((page % share->bitmap.pages_covered) == 0)
+    {
+      /* Bitmap page */
+      if (pagecache_read(share->pagecache,
+                         &info->s->bitmap.file,
+                         page, 1,
+                         bitmap_buff,
+                         PAGECACHE_PLAIN_PAGE,
+                         PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+      {
+        _ma_check_print_error(param,
+                              "Page %9s:  Got error: %d when reading datafile",
+                              llstr(page, llbuff), my_errno);
+        goto err;
+      }
+      /* A bitmap page is counted as fully used link data */
+      param->used+= block_size;
+      param->link_used+= block_size;
+      continue;
+    }
+    /* Skip pages marked as empty in bitmap */
+    /* Each page has 3 bits in the bitmap; offset_page is its bit position */
+    offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
+    offset= offset_page & 7;
+    data= bitmap_buff + offset_page / 8;
+    bitmap_pattern= uint2korr(data);
+    if (!((bitmap_pattern >> offset) & 7))
+    {
+      param->empty+= block_size;
+      param->del_blocks++;
+      continue;
+    }
+
+    if (pagecache_read(share->pagecache,
+                       &info->dfile,
+                       page, 1,
+                       page_buff,
+                       share->page_type,
+                       PAGECACHE_LOCK_LEFT_UNLOCKED, 0) == 0)
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Got error: %d when reading datafile",
+                            llstr(page, llbuff), my_errno);
+      goto err;
+    }
+    page_type= page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK;
+    if (page_type == UNALLOCATED_PAGE || page_type >= MAX_PAGE_TYPE)
+    {
+      _ma_check_print_error(param,
+                            "Page: %9s  Found wrong page type %d",
+                            llstr(page, llbuff), page_type);
+      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+        goto err;
+      continue;
+    }
+    /* Collect per-page statistics and decide what the bitmap should say */
+    switch ((enum en_page_type) page_type) {
+    case UNALLOCATED_PAGE:
+    case MAX_PAGE_TYPE:
+    default:
+      DBUG_ASSERT(0);                           /* Impossible */
+      break;
+    case HEAD_PAGE:
+      row_count= page_buff[DIR_COUNT_OFFSET];
+      empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+      param->used+= block_size - empty_space;
+      param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+                          row_count * DIR_ENTRY_SIZE);
+      if (empty_space < share->bitmap.sizes[3])
+        param->lost+= empty_space;
+      if (check_page_layout(param, info, pos, page_buff, row_count,
+                            empty_space, &real_row_count, &free_count))
+        goto err;
+      /* Directory is full: maximum size and no entry in the free list */
+      full_dir= (row_count == MAX_ROWS_PER_PAGE &&
+                 page_buff[DIR_FREE_OFFSET] == END_OF_DIR_FREE_LIST);
+      break;
+    case TAIL_PAGE:
+      row_count= page_buff[DIR_COUNT_OFFSET];
+      empty_space= uint2korr(page_buff + EMPTY_SPACE_OFFSET);
+      param->used+= block_size - empty_space;
+      param->link_used+= (PAGE_HEADER_SIZE + PAGE_SUFFIX_SIZE +
+                          row_count * DIR_ENTRY_SIZE);
+      if (empty_space < share->bitmap.sizes[6])
+        param->lost+= empty_space;
+      if (check_page_layout(param, info, pos, page_buff, row_count,
+                            empty_space, &real_row_count, &free_count))
+        goto err;
+      full_dir= (row_count - free_count >= MAX_ROWS_PER_PAGE -
+                 share->base.blobs);
+      break;
+    case BLOB_PAGE:
+      full_page_count++;
+      full_dir= 0;
+      empty_space= block_size;                  /* for error reporting */
+      param->link_used+= (LSN_SIZE + PAGE_TYPE_SIZE);
+      param->used+= block_size;
+      break;
+    }
+    /* A page with a full directory is checked as having no empty space */
+    if (_ma_check_bitmap_data(info, page_type, page,
+                              full_dir ? 0 : empty_space,
+                              &bitmap_pattern))
+    {
+      if (bitmap_pattern == ~(uint) 0)
+        _ma_check_print_error(param,
+                              "Page %9s: Wrong bitmap for data on page",
+                              llstr(page, llbuff));
+      else
+        _ma_check_print_error(param,
+                              "Page %9s:  Wrong data in bitmap.  Page_type: "
+                              "%d  full: %d  empty_space: %u  Bitmap-bits: %d",
+                              llstr(page, llbuff), page_type, full_dir,
+                              empty_space, bitmap_pattern);
+      if (param->err_count++ > MAXERR || !(param->testflag & T_VERBOSE))
+        goto err;
+    }
+    if ((enum en_page_type) page_type == BLOB_PAGE)
+      continue;
+    param->empty+= empty_space;
+    if ((enum en_page_type) page_type == TAIL_PAGE)
+    {
+      tail_count+= real_row_count;
+      continue;
+    }
+    /* Only head pages are left here; check the rows stored on the page */
+    if (check_head_page(param, info, record, extend, pos, page_buff,
+                        row_count))
+      goto err;
+  }
+
+  /* Verify that rest of bitmap is zero */
+
+  /* 'page' is now the number of the first page after the end of the file */
+  if (page % share->bitmap.pages_covered)
+  {
+    /* Not at end of bitmap */
+    uint bitmap_pattern;
+    offset_page= (uint) ((page % share->bitmap.pages_covered) -1) * 3;
+    offset= offset_page & 7;
+    data= bitmap_buff + offset_page / 8;
+    bitmap_pattern= uint2korr(data);
+    /* Any bit set from this position to the end of the bitmap is an error */
+    if (((bitmap_pattern >> offset)) ||
+        (data + 2 < bitmap_buff + share->bitmap.total_size &&
+         _ma_check_if_zero(data+2, bitmap_buff + share->bitmap.total_size -
+                           data - 2)))
+    {
+      ulonglong bitmap_page;
+      /* Round down to the page number of the covering bitmap page */
+      bitmap_page= page / share->bitmap.pages_covered;
+      bitmap_page*= share->bitmap.pages_covered;
+
+      _ma_check_print_error(param,
+                            "Bitmap at page %s has pages reserved outside of "
+                            "data file length",
+                            llstr(bitmap_page, llbuff));
+      DBUG_EXECUTE("bitmap", _ma_print_bitmap(&share->bitmap, bitmap_buff,
+                                              bitmap_page););
+    }
+  }
+
+  _ma_scan_end_block_record(info);
+
+  /* Compare counts found through rows with counts found through pages */
+  if (full_page_count != param->full_page_count)
+    _ma_check_print_error(param, "Full page count read through records was %s "
+                          "but we found %s pages while scanning table",
+                          llstr(param->full_page_count, llbuff),
+                          llstr(full_page_count, llbuff2));
+  if (tail_count != param->tail_count)
+    _ma_check_print_error(param, "Tail count read through records was %s but "
+                          "we found %s tails while scanning table",
+                          llstr(param->tail_count, llbuff),
+                          llstr(tail_count, llbuff2));
+
+  return param->error_printed != 0;
+
+err:
+  _ma_scan_end_block_record(info);
+  return 1;
+}
+
+
+/*
+  Check that record-link is ok
+
+  SYNOPSIS
+    maria_chk_data_link()
+    param    Check parameters; the statistics counters are reset here and
+             filled in by the format specific check function
+    info     Table handler
+    extend   Passed on to the format specific check function.  If not
+             set, the per key checksums collected during the scan are
+             also compared with param->key_crc[]
+
+  NOTES
+    Dispatches on share->data_file_type to check_block_record(),
+    check_static_record(), check_dynamic_record() or
+    check_compressed_record() and then compares the collected statistics
+    with the values stored in the table state.
+
+  RETURN
+    0   ok
+    1   error.  If the format specific check failed,
+        T_RETRY_WITHOUT_QUICK is also set in param->testflag
+    -1  could not allocate the record buffer
+*/
+
+int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info, my_bool extend)
+{
+  MARIA_SHARE *share= info->s;
+  int error;
+  uchar *record;
+  char llbuff[22],llbuff2[22],llbuff3[22];
+  DBUG_ENTER("maria_chk_data_link");
+
+  if (!(param->testflag & T_SILENT))
+  {
+    if (extend)
+      puts("- check records and index references");
+    else
+      puts("- check record links");
+  }
+
+  if (!(record= (uchar*) my_malloc(share->base.default_rec_buff_size, MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for record");
+    DBUG_RETURN(-1);
+  }
+  /* Reset all statistics that the scan below collects */
+  param->records= param->del_blocks= 0;
+  param->used= param->link_used= param->splits= param->del_length= 0;
+  param->lost= 0;
+  param->tmp_record_checksum= param->glob_crc= 0;
+  param->err_count= 0;
+
+  error= 0;
+  param->empty= share->pack.header_length;
+
+  bzero((char*) param->tmp_key_crc,
+        share->base.keys * sizeof(param->tmp_key_crc[0]));
+
+  switch (share->data_file_type) {
+  case BLOCK_RECORD:
+    error= check_block_record(param, info, extend, record);
+    break;
+  case STATIC_RECORD:
+    error= check_static_record(param, info, extend, record);
+    break;
+  case DYNAMIC_RECORD:
+    error= check_dynamic_record(param, info, extend, record);
+    break;
+  case COMPRESSED_RECORD:
+    error= check_compressed_record(param, info, extend, record);
+    break;
+  } /* switch */
+
+  if (error)
+    goto err;
+
+  if (param->testflag & T_WRITE_LOOP)
+  {
+    VOID(fputs("          \r",stdout)); VOID(fflush(stdout));
+  }
+  /* Only the first mismatch in this chain is reported */
+  if (param->records != share->state.state.records)
+  {
+    _ma_check_print_error(param,
+                          "Record-count is not ok; found %-10s  Should be: %s",
+                          llstr(param->records,llbuff),
+                          llstr(share->state.state.records,llbuff2));
+    error=1;
+  }
+  else if (param->record_checksum &&
+           param->record_checksum != param->tmp_record_checksum)
+  {
+    _ma_check_print_error(param,
+                          "Key pointers and record positions doesn't match");
+    error=1;
+  }
+  else if (param->glob_crc != share->state.state.checksum &&
+           (share->options &
+            (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
+  {
+    _ma_check_print_warning(param,
+                            "Record checksum is not the same as checksum "
+                            "stored in the index file");
+    error=1;
+  }
+  else if (!extend)
+  {
+    uint key;
+    for (key=0 ; key < share->base.keys;  key++)
+    {
+      /* Fulltext, spatial and rtree keys are excluded from this test */
+      if (param->tmp_key_crc[key] != param->key_crc[key] &&
+          !(share->keyinfo[key].flag &
+            (HA_FULLTEXT | HA_SPATIAL | HA_RTREE_INDEX)))
+      {
+        _ma_check_print_error(param,"Checksum for key: %2d doesn't match "
+                              "checksum for records",
+                              key+1);
+        error=1;
+      }
+    }
+  }
+
+  if (param->del_length != share->state.state.empty)
+  {
+    _ma_check_print_warning(param,
+                            "Found %s deleted space.   Should be %s",
+                            llstr(param->del_length,llbuff2),
+                            llstr(share->state.state.empty,llbuff));
+  }
+  /* Skip following checks for BLOCK RECORD as they don't make any sense */
+  if (share->data_file_type != BLOCK_RECORD)
+  {
+    if (param->used + param->empty + param->del_length !=
+        share->state.state.data_file_length)
+    {
+      _ma_check_print_warning(param,
+                              "Found %s record data and %s unused data and %s "
+                              "deleted data",
+                              llstr(param->used, llbuff),
+                              llstr(param->empty,llbuff2),
+                              llstr(param->del_length,llbuff3));
+      _ma_check_print_warning(param,
+                              "Total %s   Should be: %s",
+                              llstr((param->used+param->empty +
+                                     param->del_length), llbuff),
+                              llstr(share->state.state.data_file_length,
+                                    llbuff2));
+    }
+    if (param->del_blocks != share->state.state.del)
+    {
+      _ma_check_print_warning(param,
+                              "Found %10s deleted blocks.  Should be: %s",
+                              llstr(param->del_blocks,llbuff),
+                              llstr(share->state.state.del,llbuff2));
+    }
+    if (param->splits != share->state.split)
+    {
+      _ma_check_print_warning(param,
+                              "Found %10s parts.  Should be: %s",
+                              llstr(param->splits, llbuff),
+                              llstr(share->state.split,llbuff2));
+    }
+  }
+  if (param->testflag & T_INFO)
+  {
+    if (param->warning_printed || param->error_printed)
+      puts("");
+    if (param->used != 0 && ! param->error_printed)
+    {
+      if (param->records)
+      {
+        /* The argument for %9lu must be unsigned long */
+        printf("Records:%18s    M.recordlength:%9lu   Packed:%14.0f%%\n",
+               llstr(param->records,llbuff),
+               (ulong) ((param->used - param->link_used)/param->records),
+               (share->base.blobs ? 0.0 :
+                (ulonglong2double((ulonglong) share->base.reclength *
+                                  param->records)-
+                 my_off_t2double(param->used))/
+                ulonglong2double((ulonglong) share->base.reclength *
+                                 param->records)*100.0));
+        printf("Recordspace used:%9.0f%%   Empty space:%12d%%  "
+               "Blocks/Record: %6.2f\n",
+               (ulonglong2double(param->used - param->link_used)/
+                ulonglong2double(param->used-param->link_used+param->empty) *
+                100.0),
+               (!param->records ? 100 :
+                (int) (ulonglong2double(param->del_length+param->empty)/
+                       my_off_t2double(param->used)*100.0)),
+               ulonglong2double(param->splits - param->del_blocks) /
+               param->records);
+      }
+      else
+        printf("Records:%18s\n", "0");
+    }
+    printf("Record blocks:%12s    Delete blocks:%10s\n",
+           llstr(param->splits - param->del_blocks, llbuff),
+           llstr(param->del_blocks, llbuff2));
+    printf("Record data:  %12s    Deleted data: %10s\n",
+           llstr(param->used - param->link_used,llbuff),
+           llstr(param->del_length, llbuff2));
+    printf("Empty space:  %12s    Linkdata:     %10s\n",
+           llstr(param->empty, llbuff),llstr(param->link_used, llbuff2));
+    /* Terminate the line so that the next output starts on its own line */
+    if (param->lost)
+      printf("Lost space:   %12s\n", llstr(param->lost, llbuff));
+    if (param->max_found_trid)
+    {
+      printf("Max trans. id: %11s\n",
+             llstr(param->max_found_trid, llbuff));
+    }
+  }
+  my_free(record,MYF(0));
+  DBUG_RETURN (error);
+
+err:
+  my_free(record,MYF(0));
+  /* Tell the caller to retry the check without T_QUICK */
+  param->testflag|=T_RETRY_WITHOUT_QUICK;
+  DBUG_RETURN(1);
+} /* maria_chk_data_link */
+
+
+/**
+  Prepares a table for a repair or index sort: flushes pages, records durably
+  in the table that it is undergoing the operation (if that op crashes, that
+  info will serve for Recovery and the user).
+
+  If we start overwriting the index file, and crash then, old REDOs will
+  be tried and fail. To prevent that, we bump skip_redo_lsn, and thus we have
+  to flush and sync pages so that old REDOs can be skipped.
+  If this is not a bulk insert, which Recovery can handle gracefully (by
+  truncating files, see UNDO_BULK_INSERT) we also mark the table
+  crashed-on-repair, so that user knows it has to re-repair. If bulk insert we
+  shouldn't mark it crashed-on-repair, because if we did this, the UNDO phase
+  would skip the table (UNDO_BULK_INSERT would not be applied),
+  and maria_chk would not improve that.
+  If this is an OPTIMIZE which merely sorts index, we need to do the same
+  too: old REDOs should not apply to the new index file.
+  Only the flush is needed when in maria_chk which is not crash-safe.
+
+  @param  info             table
+  @param  param            repair parameters
+  @param  discard_index    if index pages can be thrown away
+*/
+
+static my_bool protect_against_repair_crash(MARIA_HA *info,
+                                            const HA_CHECK *param,
+                                            my_bool discard_index)
+{
+  MARIA_SHARE *share= info->s;
+
+  /*
+    The flush and the state write are not only needed for recovery:
+    - repair sometimes uses the physical size of the data file, which is
+      only exact once everything has been flushed
+    - maria_open(HA_OPEN_COPY) will want to read the state from disk.
+  */
+  if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                            FLUSH_FORCE_WRITE,
+                            discard_index ? FLUSH_IGNORE_CHANGED :
+                            FLUSH_FORCE_WRITE))
+    return TRUE;
+  if (share->changed &&
+      _ma_state_info_write(share,
+                           MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                           MA_STATE_INFO_WRITE_FULL_INFO |
+                           MA_STATE_INFO_WRITE_LOCK))
+    return TRUE;
+
+  /* The rest is not needed in maria_chk nor for non-transactional tables */
+  if (!maria_multi_threaded || !share->base.born_transactional)
+    return FALSE;
+
+  if (!(param->testflag & T_NO_CREATE_RENAME_LSN))
+  {
+    /* We can only get here for a transactional table */
+    maria_mark_in_repair(info);
+    if (_ma_state_info_write(share,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                             MA_STATE_INFO_WRITE_LOCK))
+      return TRUE;
+  }
+
+  /* Move the table's LSNs up to the log horizon, if the log is usable */
+  if (translog_status == TRANSLOG_OK &&
+      _ma_update_state_lsns(share, translog_get_horizon(),
+                            share->state.create_trid, FALSE, FALSE))
+    return TRUE;
+
+  return _ma_sync_table_files(info) ? TRUE : FALSE;
+}
+
+
+/**
+   @brief Initialize variables for repair
+
+   Saves a copy of the share, resets the sort structures, sets the repair
+   related bits in param->testflag, flushes the table through
+   protect_against_repair_crash() and computes an upper bound for the
+   number of rows.
+
+   @param  param        repair parameters; testflag, glob_crc, org_key_map
+                        and possibly max_trid are updated
+   @param  sort_info    sort information; zeroed and then initialized
+   @param  sort_param   sort parameters; zeroed and then initialized
+   @param  info         table
+   @param  rep_quick    if set, T_QUICK is set in param->testflag and
+                        sort_param->fix_datafile is cleared
+   @param  org_share    out: byte copy of the share as it was on entry
+
+   @return
+     @retval 0   ok
+     @retval 1   protect_against_repair_crash() failed
+*/
+
+static int initialize_variables_for_repair(HA_CHECK *param,
+                                           MARIA_SORT_INFO *sort_info,
+                                           MARIA_SORT_PARAM *sort_param,
+                                           MARIA_HA *info,
+                                           my_bool rep_quick,
+                                           MARIA_SHARE *org_share)
+{
+  MARIA_SHARE *share= info->s;
+
+  /* To allow us to restore state and check how state changed */
+  memcpy(org_share, share, sizeof(*share));
+
+  /* Repair code relies on share->state.state so we have to update it here */
+  if (share->lock.update_status)
+    (*share->lock.update_status)(info);
+
+  bzero((char*) sort_info,  sizeof(*sort_info));
+  bzero((char*) sort_param, sizeof(*sort_param));
+
+  param->testflag|= T_REP;                     /* for easy checking */
+  if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+    param->testflag|= T_CALC_CHECKSUM;
+  param->glob_crc= 0;
+  if (rep_quick)
+    param->testflag|= T_QUICK;
+  else
+    param->testflag&= ~T_QUICK;
+  /* Remember which keys were enabled before repair touches the key map */
+  param->org_key_map= share->state.key_map;
+
+  sort_param->sort_info= sort_info;
+  sort_param->fix_datafile= ! rep_quick;
+  sort_param->calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
+  /* Until a separate handler is created, rows are read and written via info */
+  sort_info->info= sort_info->new_info= info;
+  sort_info->param= param;
+  set_data_file_type(sort_info, info->s);
+  sort_info->org_data_file_type= share->data_file_type;
+
+  bzero(&info->rec_cache, sizeof(info->rec_cache));
+  info->rec_cache.file= info->dfile.file;
+  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+  /* Index pages may be discarded unless we only create missing keys */
+  if (protect_against_repair_crash(info, param, !test(param->testflag &
+                                                      T_CREATE_MISSING_KEYS)))
+    return 1;
+
+  /* calculate max_records */
+  sort_info->filelength= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+  if ((param->testflag & T_CREATE_MISSING_KEYS) ||
+      sort_info->org_data_file_type == COMPRESSED_RECORD)
+    sort_info->max_records= share->state.state.records;
+  else
+  {
+    /* Estimate: file length divided by the smallest possible row size */
+    ulong rec_length;
+    rec_length= max(share->base.min_pack_length,
+                    share->base.min_block_length);
+    sort_info->max_records= (ha_rows) (sort_info->filelength / rec_length);
+  }
+
+  /* Set up transaction handler so that we can see all rows */
+  if (param->max_trid == 0)
+  {
+    if (!ma_control_file_inited())
+      param->max_trid= 0;      /* Give warning for first trid found */
+    else
+      param->max_trid= max_trid_in_system();
+  }
+  maria_ignore_trids(info);
+  /* Don't write transid's during repair */
+  maria_versioning(info, 0);
+  return 0;
+}
+
+
+/*
+  initialize_variables_for_repair() and related functions set some
+  variables to values that only make sense during repair.
+  This function puts the original values back so that the handler can be
+  used in MariaDB without having to close and reopen the table.
+*/
+
+static void restore_table_state_after_repair(MARIA_HA *info,
+                                             MARIA_SHARE *org_share)
+{
+  MARIA_SHARE *share= info->s;
+
+  /* Versioning was turned off for the duration of the repair */
+  maria_versioning(info, share->have_versioning);
+  share->lock_key_trees= org_share->lock_key_trees;
+}
+
+
+
+
+/**
+  @brief Drop all indexes
+
+  @param[in]    param           check parameters
+  @param[in]    info            MARIA_HA handle
+  @param[in]    force           if to force drop all indexes
+
+  @return       status
+    @retval     0               OK
+    @retval     != 0            Error
+
+  @note
+    Once allocated, index blocks remain part of the key file forever.
+    When indexes are disabled, no block is freed. When enabling indexes,
+    no block is freed either. The new indexes are created from new
+    blocks. (Bug #4692)
+
+    Before recreating formerly disabled indexes, the unused blocks
+    must be freed. There are two options to do this:
+    - Follow the tree of disabled indexes, add all blocks to the
+      deleted blocks chain. Would require a lot of random I/O.
+    - Drop all blocks by clearing all index root pointers and all
+      delete chain pointers and resetting key_file_length to the end
+      of the index file header. This requires to recreate all indexes,
+      even those that may still be intact.
+    The second method is probably faster in most cases.
+
+    When disabling indexes, MySQL disables either all indexes or all
+    non-unique indexes. When MySQL [re-]enables disabled indexes
+    (T_CREATE_MISSING_KEYS), then we either have "lost" blocks in the
+    index file, or there are no non-unique indexes. In the latter case,
+    maria_repair*() would not be called as there would be no disabled
+    indexes.
+
+    If there would be more unique indexes than disabled (non-unique)
+    indexes, we could do the first method. But this is not implemented
+    yet. By now we drop and recreate all indexes when repair is called.
+
+    However, there is an exception. Sometimes MySQL disables non-unique
+    indexes when the table is empty (e.g. when copying a table in
+    mysql_alter_table()). When enabling the non-unique indexes, they
+    are still empty. So there is no index block that can be lost. This
+    optimization is implemented in this function.
+
+    Note that in normal repair (T_CREATE_MISSING_KEYS not set) we
+    recreate all enabled indexes unconditionally. We do not change the
+    key_map. Otherwise we invert the key map temporarily (outside of
+    this function) and recreate the then "seemingly" enabled indexes.
+    When we cannot use the optimization, and drop all indexes, we
+    pretend that all indexes were disabled. By the inversion, we will
+    then recreate all indexes.
+*/
+
+static int maria_drop_all_indexes(HA_CHECK *param, MARIA_HA *info,
+                                  my_bool force)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_STATE_INFO *state= &share->state;
+  uint key_nr;
+  DBUG_ENTER("maria_drop_all_indexes");
+
+  /*
+    A disabled index that still owns key blocks forces us to drop and
+    recreate every index, as its blocks would otherwise be lost.
+
+    When only the disabled indexes are to be recreated _and_ all of them
+    are empty, the existing indexes can be left alone.
+  */
+  if (!force && (param->testflag & T_CREATE_MISSING_KEYS))
+  {
+    my_bool would_lose_blocks= FALSE;
+
+    DBUG_PRINT("repair", ("creating missing indexes"));
+    for (key_nr= 0;
+         key_nr < share->base.keys && !would_lose_blocks;
+         key_nr++)
+    {
+      DBUG_PRINT("repair", ("index #: %u  key_root: 0x%lx  active: %d",
+                            key_nr, (long) state->key_root[key_nr],
+                            maria_is_key_active(state->key_map, key_nr)));
+      if (!maria_is_key_active(state->key_map, key_nr) &&
+          state->key_root[key_nr] != HA_OFFSET_ERROR)
+      {
+        /*
+          Disabled index with at least one key block: just recreating it
+          would lose the block(s), so everything has to be rebuilt.
+        */
+        DBUG_PRINT("repair", ("nonempty and disabled: recreate all"));
+        would_lose_blocks= TRUE;
+      }
+    }
+    if (!would_lose_blocks)
+      DBUG_RETURN(0);                           /* Nothing to drop */
+
+    /*
+      Drop all indexes and declare them disabled. With the
+      T_CREATE_MISSING_KEYS flag, maria_repair*() will then recreate
+      and enable all of them.
+    */
+    maria_clear_all_keys_active(state->key_map);
+    DBUG_PRINT("repair", ("declared all indexes disabled"));
+  }
+
+  /* Forget every index root block */
+  for (key_nr= 0; key_nr < share->base.keys; key_nr++)
+    state->key_root[key_nr]= HA_OFFSET_ERROR;
+
+  /* Forget the chain of deleted key blocks */
+  state->key_del= HA_OFFSET_ERROR;
+
+  /* The index file now logically ends right after its header */
+  state->state.key_file_length= share->base.keystart;
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Recover old table by reading each record and writing all keys
+
+  SYNOPSIS
+    maria_repair()
+    param       Check/repair parameters and flags (param->testflag)
+    info        Handler of the table to repair
+    name        Table name; only used in the progress message
+    rep_quick   If set, no new data file is created. Otherwise rows are
+                written to a temporary data file that replaces the old
+                data file at the end.
+
+  NOTES
+    Save new datafile-name in temp_filename.
+    We overwrite the index file as we go (writekeys() for example), so if we
+    crash during this the table is unusable and user (or Recovery in the
+    future) must repeat the REPAIR/OPTIMIZE operation. We could use a
+    temporary index file in the future (drawback: more disk space).
+
+  IMPLEMENTATION (for hard repair with block format)
+   - Create new, unrelated MARIA_HA of the table
+   - Create new datafile and associate it with new handler
+   - Reset all statistic information in new handler
+   - Copy all data to new handler with normal write operations
+   - Move state of new handler to old handler
+   - Close new handler
+   - Close data file in old handler
+   - Rename old data file to new data file.
+   - Reopen data file in old handler
+
+  RETURN
+    0   ok
+    1   error; the table has been marked as crashed on repair
+*/
+
+int maria_repair(HA_CHECK *param, register MARIA_HA *info,
+                 char *name, my_bool rep_quick)
+{
+  int error, got_error;
+  ha_rows start_records,new_header_length;
+  my_off_t del;
+  File new_file;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  MARIA_SORT_INFO sort_info;
+  MARIA_SORT_PARAM sort_param;
+  my_bool block_record, scan_inited= 0, reenable_logging= 0;
+  enum data_file_type org_data_file_type= share->data_file_type;
+  myf sync_dir= ((share->now_transactional && !share->temporary) ?
+                 MY_SYNC_DIR : 0);
+  MARIA_SHARE backup_share;
+  DBUG_ENTER("maria_repair");
+
+  /* Assume failure until everything has succeeded (cleared before 'err:') */
+  got_error= 1;
+  new_file= -1;
+  start_records= share->state.state.records;
+  if (!(param->testflag & T_SILENT))
+  {
+    printf("- recovering (with keycache) Aria-table '%s'\n",name);
+    printf("Data records: %s\n", llstr(start_records, llbuff));
+  }
+
+  if (initialize_variables_for_repair(param, &sort_info, &sort_param, info,
+                                      rep_quick, &backup_share))
+    goto err;
+
+  /* Logging is re-enabled at the end only if it was enabled on entry */
+  if ((reenable_logging= share->now_transactional))
+    _ma_tmp_disable_logging_for_table(info, 0);
+
+  /* When unpacking, the new data file gets no pack header */
+  sort_param.current_filepos= sort_param.filepos= new_header_length=
+    ((param->testflag & T_UNPACK) ? 0L : share->pack.header_length);
+
+  if (!rep_quick)
+  {
+    /* Get real path for data file */
+    if ((new_file= my_create(fn_format(param->temp_filename,
+                                       share->data_file_name.str, "",
+                                       DATA_TMP_EXT, 2+4),
+                             0,param->tmpfile_createflag,
+                             MYF(0))) < 0)
+    {
+      _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+			   param->temp_filename);
+      goto err;
+    }
+    /* Copy the existing data file header, if any, to the new file */
+    if (new_header_length &&
+        maria_filecopy(param, new_file, info->dfile.file, 0L,
+                       new_header_length, "datafile-header"))
+      goto err;
+    share->state.dellink= HA_OFFSET_ERROR;
+    info->rec_cache.file= new_file;             /* For sort_delete_record */
+    if (share->data_file_type == BLOCK_RECORD ||
+        (param->testflag & T_UNPACK))
+    {
+      if (create_new_data_handle(&sort_param, new_file))
+        goto err;
+      sort_info.new_info->rec_cache.file= new_file;
+    }
+  }
+
+  block_record= sort_info.new_info->s->data_file_type == BLOCK_RECORD;
+
+  if (org_data_file_type != BLOCK_RECORD)
+  {
+    /* We need a read buffer to read rows in big blocks */
+    if (init_io_cache(&param->read_cache, info->dfile.file,
+                      (uint) param->read_buffer_length,
+                      READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)))
+      goto err;
+  }
+  if (sort_info.new_info->s->data_file_type != BLOCK_RECORD)
+  {
+    /* When writing to not block records, we need a write buffer */
+    if (!rep_quick)
+    {
+      if (init_io_cache(&sort_info.new_info->rec_cache, new_file,
+                        (uint) param->write_buffer_length,
+                        WRITE_CACHE, new_header_length, 1,
+                        MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw))
+        goto err;
+      sort_info.new_info->opt_flag|=WRITE_CACHE_USED;
+    }
+  }
+  else if (block_record)
+  {
+    /* scan_inited is set first so that 'err:' calls maria_scan_end() */
+    scan_inited= 1;
+    if (maria_scan_init(sort_info.info))
+      goto err;
+  }
+
+  if (!(sort_param.record=
+        (uchar *) my_malloc((uint)
+                            share->base.default_rec_buff_size, MYF(0))) ||
+      _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
+                       share->base.default_rec_buff_size))
+  {
+    _ma_check_print_error(param, "Not enough memory for extra record");
+    goto err;
+  }
+
+  sort_param.read_cache=param->read_cache;
+  sort_param.pos=sort_param.max_pos=share->pack.header_length;
+  param->read_cache.end_of_file= sort_info.filelength;
+  sort_param.master=1;
+  sort_info.max_records= ~(ha_rows) 0;
+
+  /* Remember the old delete count (checked for quick repair below) */
+  del= share->state.state.del;
+  share->state.state.records= share->state.state.del= share->state.split= 0;
+  share->state.state.empty= 0;
+
+  if (param->testflag & T_CREATE_MISSING_KEYS)
+    maria_set_all_keys_active(share->state.key_map, share->base.keys);
+  maria_drop_all_indexes(param, info, TRUE);
+
+  maria_lock_memory(param);			/* Everything is alloced */
+
+  /* Re-create all keys, which are set in key_map. */
+  while (!(error=sort_get_next_record(&sort_param)))
+  {
+    /* For block format the row is written before its keys */
+    if (block_record && _ma_sort_write_record(&sort_param))
+      goto err;
+
+    if (writekeys(&sort_param))
+    {
+      /* Only a duplicate key is handled here; anything else is fatal */
+      if (my_errno != HA_ERR_FOUND_DUPP_KEY)
+	goto err;
+      DBUG_DUMP("record", sort_param.record,
+                share->base.default_rec_buff_size);
+      _ma_check_print_warning(param,
+                              "Duplicate key %2d for record at %10s against "
+                              "new record at %10s",
+                              info->errkey+1,
+                              llstr(sort_param.current_filepos, llbuff),
+                              llstr(info->dup_key_pos,llbuff2));
+      if (param->testflag & T_VERBOSE)
+      {
+        MARIA_KEY tmp_key;
+        MARIA_KEYDEF *keyinfo= share->keyinfo + info->errkey;
+	(*keyinfo->make_key)(info, &tmp_key, (uint) info->errkey,
+                             info->lastkey_buff,
+                             sort_param.record, 0L, 0);
+        _ma_print_key(stdout, &tmp_key);
+      }
+      sort_info.dupp++;
+      /* T_QUICK without T_FORCE_UNIQUENESS: give up and ask for a retry */
+      if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
+      {
+        param->testflag|=T_RETRY_WITHOUT_QUICK;
+	param->error_printed=1;
+	goto err;
+      }
+      /* purecov: begin tested */
+      if (block_record)
+      {
+        /* The row was already written above; take it out again */
+        sort_info.new_info->s->state.state.records--;
+        if ((*sort_info.new_info->s->write_record_abort)(sort_info.new_info))
+        {
+          _ma_check_print_error(param,"Couldn't delete duplicate row");
+          goto err;
+        }
+      }
+      /* purecov: end */
+      continue;
+    }
+    if (!block_record)
+    {
+      if (_ma_sort_write_record(&sort_param))
+        goto err;
+      /* Filepos is pointer to where next row will be stored */
+      sort_param.current_filepos= sort_param.filepos;
+    }
+  }
+  if (error > 0 || maria_write_data_suffix(&sort_info, !rep_quick) ||
+      flush_io_cache(&sort_info.new_info->rec_cache) ||
+      param->read_cache.error < 0)
+    goto err;
+
+  if (param->testflag & T_WRITE_LOOP)
+  {
+    VOID(fputs("          \r",stdout)); VOID(fflush(stdout));
+  }
+  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0, MYF(0)))
+  {
+    _ma_check_print_warning(param,
+			   "Can't change size of indexfile, error: %d",
+			   my_errno);
+    goto err;
+  }
+
+  /* Quick repair: old deleted count + removed duplicates must match */
+  if (rep_quick && del+sort_info.dupp != share->state.state.del)
+  {
+    _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
+                          "Found wrong number of deleted records");
+    _ma_check_print_error(param,"Run recovery again without -q");
+    param->retry_repair=1;
+    param->testflag|=T_RETRY_WITHOUT_QUICK;
+    goto err;
+  }
+
+  if (param->testflag & T_SAFE_REPAIR)
+  {
+    /* Don't repair if we lost more than one row */
+    if (sort_info.new_info->s->state.state.records+1 < start_records)
+    {
+      share->state.state.records= start_records;
+      goto err;
+    }
+  }
+
+  VOID(end_io_cache(&sort_info.new_info->rec_cache));
+  info->opt_flag&= ~WRITE_CACHE_USED;
+
+  /*
+    As we have read the data file (sort_get_next_record()) we may have
+    cached, non-changed blocks of it in the page cache. We must throw them
+    away as we are going to close their descriptor ('new_file'). We also want
+    to flush any index block, so that it is ready for the upcoming sync.
+  */
+  if (_ma_flush_table_files_before_swap(param, info))
+    goto err;
+
+  if (!rep_quick)
+  {
+    sort_info.new_info->s->state.state.data_file_length= sort_param.filepos;
+    if (sort_info.new_info != sort_info.info)
+    {
+      /* Save the new handler's state before closing it, then adopt it */
+      MARIA_STATE_INFO save_state= sort_info.new_info->s->state;
+      if (maria_close(sort_info.new_info))
+      {
+        _ma_check_print_error(param, "Got error %d on close", my_errno);
+        goto err;
+      }
+      copy_data_file_state(&share->state, &save_state);
+      new_file= -1;
+      sort_info.new_info= info;
+    }
+    share->state.version=(ulong) time((time_t*) 0);	/* Force reopen */
+
+    /* Replace the actual file with the temporary file */
+    if (new_file >= 0)
+      my_close(new_file, MYF(MY_WME));
+    new_file= -1;
+    change_data_file_descriptor(info, -1);
+    if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
+                                DATA_TMP_EXT,
+                                (param->testflag & T_BACKUP_DATA ?
+                                 MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) |
+                                sync_dir) ||
+        _ma_open_datafile(info, share, NullS, -1))
+    {
+      goto err;
+    }
+  }
+  else
+  {
+    share->state.state.data_file_length= sort_param.max_pos;
+  }
+  if (param->testflag & T_CALC_CHECKSUM)
+    share->state.state.checksum= param->glob_crc;
+
+  if (!(param->testflag & T_SILENT))
+  {
+    if (start_records != share->state.state.records)
+      printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
+  }
+  if (sort_info.dupp)
+    _ma_check_print_warning(param,
+                            "%s records have been removed",
+                            llstr(sort_info.dupp,llbuff));
+
+  got_error= 0;
+  /* If invoked by external program that uses thr_lock */
+  if (&share->state.state != info->state)
+    *info->state= *info->state_start= share->state.state;
+
+  /* Common exit path; reached both on success and on failure */
+err:
+  if (scan_inited)
+    maria_scan_end(sort_info.info);
+  _ma_reset_state(info);
+
+  VOID(end_io_cache(&param->read_cache));
+  VOID(end_io_cache(&sort_info.new_info->rec_cache));
+  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  /* this below could fail, shouldn't we detect error? */
+  if (got_error)
+  {
+    if (! param->error_printed)
+      _ma_check_print_error(param,"%d for record at pos %s",my_errno,
+		  llstr(sort_param.start_recpos,llbuff));
+    (void)_ma_flush_table_files_before_swap(param, info);
+    /* Get rid of the separate handler, if one was created */
+    if (sort_info.new_info && sort_info.new_info != sort_info.info)
+    {
+      unuse_data_file_descriptor(sort_info.new_info);
+      maria_close(sort_info.new_info);
+    }
+    /* Remove the unfinished temporary data file */
+    if (new_file >= 0)
+    {
+      VOID(my_close(new_file,MYF(0)));
+      VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+    }
+    maria_mark_crashed_on_repair(info);
+  }
+  /* If caller had disabled logging it's not up to us to re-enable it */
+  if (reenable_logging)
+    _ma_reenable_logging_for_table(info, FALSE);
+  restore_table_state_after_repair(info, &backup_share);
+
+  my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+  if (!got_error && (param->testflag & T_UNPACK))
+    restore_data_file_type(share);
+  share->state.changed|= (STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES |
+			  STATE_NOT_ANALYZED | STATE_NOT_ZEROFILLED);
+  if (!rep_quick)
+    share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE);
+  DBUG_RETURN(got_error);
+}
+
+
+/*
+  Update keyfile when doing repair
+
+  Inserts the keys of sort_param->record into every active index.
+  If an insert fails with HA_ERR_FOUND_DUPP_KEY, info->errkey is set to the
+  failing index and the keys already inserted for this row are removed
+  again.
+
+  Returns 0 on success and -1 on error (my_errno tells why).
+*/
+
+static int writekeys(MARIA_SORT_PARAM *sort_param)
+{
+  MARIA_HA *info=     sort_param->sort_info->info;
+  MARIA_SHARE *share= info->s;
+  uchar *record=      sort_param->record;
+  my_off_t filepos=   sort_param->current_filepos;
+  uchar *key_buff=    info->lastkey_buff + share->base.max_key_length;
+  MARIA_KEYDEF *keyinfo;
+  MARIA_KEY key;
+  uint i;
+  DBUG_ENTER("writekeys");
+
+  for (i= 0; i < share->base.keys; i++)
+  {
+    if (!maria_is_key_active(share->state.key_map, i))
+      continue;
+
+    keyinfo= share->keyinfo + i;
+    if (keyinfo->flag & HA_FULLTEXT)
+    {
+      if (_ma_ft_add(info, i, key_buff, record, filepos))
+        goto err;
+      continue;
+    }
+    if (!(*keyinfo->make_key)(info, &key, i, key_buff, record, filepos, 0))
+      goto err;
+    if ((*keyinfo->ck_insert)(info, &key))
+      goto err;
+  }
+  DBUG_RETURN(0);
+
+ err:
+  if (my_errno == HA_ERR_FOUND_DUPP_KEY)
+  {
+    info->errkey= (int) i;                      /* This key was found */
+    /* Take out the keys that were inserted before the failing one */
+    while (i > 0)
+    {
+      i--;
+      if (!maria_is_key_active(share->state.key_map, i))
+        continue;
+
+      keyinfo= share->keyinfo + i;
+      if (keyinfo->flag & HA_FULLTEXT)
+      {
+        if (_ma_ft_del(info, i, key_buff, record, filepos))
+          break;
+        continue;
+      }
+      (*keyinfo->make_key)(info, &key, i, key_buff, record, filepos, 0);
+      if (_ma_ck_delete(info, &key))
+        break;
+    }
+  }
+  /* Remove checksum that was added to glob_crc in sort_get_next_record */
+  if (sort_param->calc_checksum)
+    sort_param->sort_info->param->glob_crc-= info->cur_row.checksum;
+  DBUG_PRINT("error",("errno: %d",my_errno));
+  DBUG_RETURN(-1);
+} /* writekeys */
+
+
+	/* Change all key-pointers that points to a records */
+
+int maria_movepoint(register MARIA_HA *info, uchar *record,
+                    MARIA_RECORD_POS oldpos, MARIA_RECORD_POS newpos,
+                    uint prot_key)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *key_buff= info->lastkey_buff + share->base.max_key_length;
+  MARIA_PAGE page;
+  uint i;
+  DBUG_ENTER("maria_movepoint");
+
+  /*
+    For every active key except prot_key, make the key entry of 'record'
+    point to newpos instead of oldpos. Returns 0 on success, -1 on error.
+  */
+  for (i= 0; i < share->base.keys; i++)
+  {
+    MARIA_KEYDEF *keyinfo= share->keyinfo + i;
+    MARIA_KEY key;
+
+    if (i == prot_key || !maria_is_key_active(share->state.key_map, i))
+      continue;
+
+    (*keyinfo->make_key)(info, &key, i, key_buff, record, oldpos, 0);
+
+    if (!(key.keyinfo->flag & HA_NOSAME))
+    {
+      /* Not a unique key: replace the old key with a new one */
+      if (_ma_ck_delete(info, &key))
+        DBUG_RETURN(-1);
+      (*keyinfo->make_key)(info, &key, i, key_buff, record, newpos, 0);
+      if (_ma_ck_write(info, &key))
+        DBUG_RETURN(-1);
+      continue;
+    }
+
+    /* Unique key: find it and overwrite the row pointer in the key page */
+    if (_ma_search(info, &key, (uint32) (SEARCH_SAME | SEARCH_SAVE_BUFF),
+                   share->state.key_root[i]))
+      DBUG_RETURN(-1);
+    _ma_page_setup(&page, info, keyinfo, info->last_keypage,
+                   info->keyread_buff);
+
+    _ma_dpointer(share,
+                 info->int_keypos - page.node - share->rec_reflength,
+                 newpos);
+
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_UNLOCKED,
+                          DFLT_INIT_HITS))
+      DBUG_RETURN(-1);
+  }
+  DBUG_RETURN(0);
+} /* maria_movepoint */
+
+
+	/* Tell system that we want all memory for our cache */
+
+void maria_lock_memory(HA_CHECK *param __attribute__((unused)))
+{
+  /* Does nothing unless compiled with SUN_OS defined */
+#ifdef SUN_OS                           /* Key caching thrashes on sun 4.1 */
+  if (param->opt_maria_lock_memory)
+  {
+    /* or plock(DATLOCK); */
+    int lock_failed= (mlockall(MCL_CURRENT) != 0);
+
+    /*
+      The failure is only reported when running as root.
+      NOTE(review): my_errno is printed, not errno - confirm it is set here.
+    */
+    if (geteuid() == 0 && lock_failed)
+      _ma_check_print_warning(param,
+                              "Failed to lock memory. errno %d",my_errno);
+  }
+#endif
+} /* maria_lock_memory */
+
+
+/**
+   Flush all changed blocks to disk.
+
+   We release blocks as it's unlikely that they would all be needed soon.
+   This function needs to be called before swapping data or index files or
+   syncing them.
+
+   @param  param           description of the repair operation
+   @param  info            table
+*/
+
+static my_bool _ma_flush_table_files_before_swap(HA_CHECK *param,
+                                                 MARIA_HA *info)
+{
+  DBUG_ENTER("_ma_flush_table_files_before_swap");
+
+  /* Flush and release both data and index blocks */
+  if (!_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                             FLUSH_RELEASE, FLUSH_RELEASE))
+    DBUG_RETURN(FALSE);
+
+  _ma_check_print_error(param, "%d when trying to write buffers", my_errno);
+  DBUG_RETURN(TRUE);
+}
+
+
+	/* Sort index for more efficient reads */
+
+/**
+  Rewrite the index file with the blocks of each index stored in sorted order
+
+  The index file header and the trees of all active keys are copied into a
+  temporary index file (sort_one_index() does the per-key work), which then
+  replaces the original index file. Nothing is done if the table has an
+  R-tree index.
+
+  @param  param   check parameters; temp_filename and new_file_pos are used
+  @param  info    table
+  @param  name    table name; used in the progress message and to build
+                  the name of the temporary index file
+
+  @return
+    @retval 0    ok (also when skipped because of an R-tree index)
+    @retval 1    protect_against_repair_crash() failed
+    @retval -1   other error
+*/
+
+int maria_sort_index(HA_CHECK *param, register MARIA_HA *info, char *name)
+{
+  reg2 uint key;
+  reg1 MARIA_KEYDEF *keyinfo;
+  File new_file;
+  my_off_t index_pos[HA_MAX_POSSIBLE_KEY];
+  uint r_locks,w_locks;
+  int old_lock;
+  MARIA_SHARE *share= info->s;
+  MARIA_STATE_INFO old_state;
+  myf sync_dir= ((share->now_transactional && !share->temporary) ?
+                 MY_SYNC_DIR : 0);
+  DBUG_ENTER("maria_sort_index");
+
+  /* cannot sort index files with R-tree indexes */
+  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+       key++,keyinfo++)
+    if (keyinfo->key_alg == HA_KEY_ALG_RTREE)
+      DBUG_RETURN(0);
+
+  if (!(param->testflag & T_SILENT))
+    printf("- Sorting index for Aria-table '%s'\n",name);
+
+  if (protect_against_repair_crash(info, param, FALSE))
+    DBUG_RETURN(1);
+
+  /* Get real path for index file */
+  fn_format(param->temp_filename,name,"", MARIA_NAME_IEXT,2+4+32);
+  /*
+    Only a negative value means failure (same test as in maria_repair());
+    descriptor 0 is a valid file.
+  */
+  if ((new_file=my_create(fn_format(param->temp_filename,param->temp_filename,
+				    "", INDEX_TMP_EXT,2+4),
+			  0,param->tmpfile_createflag,MYF(0))) < 0)
+  {
+    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+			 param->temp_filename);
+    DBUG_RETURN(-1);
+  }
+  if (maria_filecopy(param, new_file, share->kfile.file, 0L,
+                     (ulong) share->base.keystart, "headerblock"))
+    goto err;
+
+  param->new_file_pos=share->base.keystart;
+  for (key= 0,keyinfo= &share->keyinfo[0]; key < share->base.keys ;
+       key++,keyinfo++)
+  {
+    if (! maria_is_key_active(share->state.key_map, key))
+    {
+      /*
+        The blocks of a disabled key are not copied to the new index file,
+        so the key has no root there. The entry must still be set, as all
+        of index_pos[] is copied to state.key_root[] below.
+      */
+      index_pos[key]= HA_OFFSET_ERROR;
+      continue;
+    }
+
+    if (share->state.key_root[key] != HA_OFFSET_ERROR)
+    {
+      index_pos[key]=param->new_file_pos;	/* Write first block here */
+      if (sort_one_index(param,info,keyinfo,share->state.key_root[key],
+			 new_file))
+	goto err;
+    }
+    else
+      index_pos[key]= HA_OFFSET_ERROR;		/* No blocks */
+  }
+
+  /* Flush key cache for this file if we are calling this outside maria_chk */
+  flush_pagecache_blocks(share->pagecache, &share->kfile,
+                         FLUSH_IGNORE_CHANGED);
+
+  share->state.version=(ulong) time((time_t*) 0);
+  old_state= share->state;			/* save state if not stored */
+  r_locks=   share->r_locks;
+  w_locks=   share->w_locks;
+  old_lock=  info->lock_type;
+
+	/* Put same locks as old file */
+  share->r_locks= share->w_locks= share->tot_locks= 0;
+  (void) _ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE);
+  /* Close the old index file and put the new one in its place */
+  pthread_mutex_lock(&share->intern_lock);
+  VOID(my_close(share->kfile.file, MYF(MY_WME)));
+  share->kfile.file = -1;
+  pthread_mutex_unlock(&share->intern_lock);
+  VOID(my_close(new_file,MYF(MY_WME)));
+  if (maria_change_to_newfile(share->index_file_name.str, MARIA_NAME_IEXT,
+                              INDEX_TMP_EXT, sync_dir) ||
+      _ma_open_keyfile(share))
+    goto err2;
+  info->lock_type= F_UNLCK;			/* Force maria_readinfo to lock */
+  _ma_readinfo(info,F_WRLCK,0);			/* Will lock the table */
+  info->lock_type=  old_lock;
+  share->r_locks=   r_locks;
+  share->w_locks=   w_locks;
+  share->tot_locks= r_locks+w_locks;
+  share->state=     old_state;			/* Restore old state */
+
+  /* Make the state describe the new index file */
+  share->state.state.key_file_length=param->new_file_pos;
+  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  for (key=0 ; key < share->base.keys ; key++)
+    share->state.key_root[key]=index_pos[key];
+  share->state.key_del=  HA_OFFSET_ERROR;
+
+  share->state.changed&= ~STATE_NOT_SORTED_PAGES;
+  DBUG_EXECUTE_IF("maria_flush_whole_log",
+                  {
+                    DBUG_PRINT("maria_flush_whole_log", ("now"));
+                    translog_flush(translog_get_horizon());
+                  });
+  DBUG_EXECUTE_IF("maria_crash_sort_index",
+                  {
+                    DBUG_PRINT("maria_crash_sort_index", ("now"));
+                    DBUG_ABORT();
+                  });
+  DBUG_RETURN(0);
+
+err:
+  VOID(my_close(new_file,MYF(MY_WME)));
+err2:
+  /* new_file is already closed when we get here directly */
+  VOID(my_delete(param->temp_filename,MYF(MY_WME)));
+  DBUG_RETURN(-1);
+} /* maria_sort_index */
+
+
+/**
+  @brief Put the index page CRC on a page
+
+  The byte position is turned into a page number by dividing it by the
+  block size of the table; the CRC itself is set by
+  maria_page_crc_set_index().
+
+  @param buff            reference on the page buffer.
+  @param pos             position of the page in the file.
+  @param share           share of the table; provides block_size and is
+                         handed on to maria_page_crc_set_index()
+*/
+
+static void put_crc(uchar *buff, my_off_t pos, MARIA_SHARE *share)
+{
+  const pgcache_page_no_t page_no= (pgcache_page_no_t) (pos /
+                                                        share->block_size);
+
+  maria_page_crc_set_index(buff, page_no, (uchar*) share);
+}
+
+
+/*
+  Sort index blocks recursive using one index
+
+  Copies the key page at 'pagepos' and, recursively, every page it points
+  to into 'new_file'.  The position of this page in the new file is taken
+  from param->new_file_pos on entry, which is then advanced by
+  keyinfo->block_length, so a page always gets a lower position than the
+  pages below it.  Child pointers stored in the page are rewritten to the
+  children's new positions before the page itself is written.
+
+  For fulltext keys, an entry whose 'subkeys' value is negative has its
+  row pointer rewritten the same way, and the page it pointed to is copied
+  using share->ft2_keyinfo.
+
+  param       Check parameters; new_file_pos is read and advanced
+  info        MARIA handler
+  keyinfo     Definition of the key the page belongs to (not an R-tree)
+  pagepos     Position of the page in the old index file
+  new_file    File the copied page is written to
+
+  Returns 0 on success, -1 if the page buffer could not be allocated and
+  1 on any other error.
+*/
+
+static int sort_one_index(HA_CHECK *param, MARIA_HA *info,
+                          MARIA_KEYDEF *keyinfo,
+			  my_off_t pagepos, File new_file)
+{
+  uint length,nod_flag;
+  uchar *buff,*keypos,*endpos;
+  my_off_t new_page_pos,next_page;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  MARIA_PAGE page;
+  DBUG_ENTER("sort_one_index");
+
+  /* cannot walk over R-tree indices */
+  DBUG_ASSERT(keyinfo->key_alg != HA_KEY_ALG_RTREE);
+  new_page_pos=param->new_file_pos;             /* Where this page will go */
+  param->new_file_pos+=keyinfo->block_length;   /* Next free position */
+  key.keyinfo= keyinfo;
+
+  if (!(buff= (uchar*) my_alloca((uint) keyinfo->block_length +
+                                 keyinfo->maxlength)))
+  {
+    _ma_check_print_error(param,"Not enough memory for key block");
+    DBUG_RETURN(-1);
+  }
+  key.data= buff + keyinfo->block_length;       /* Key area follows the page */
+
+  if (_ma_fetch_keypage(&page, info, keyinfo, pagepos,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        DFLT_INIT_HITS, buff, 0))
+  {
+    report_keypage_fault(param, info, pagepos);
+    goto err;
+  }
+
+  if ((nod_flag= page.node) || keyinfo->flag & HA_FULLTEXT)
+  {
+    keypos= page.buff + share->keypage_header + nod_flag;
+    endpos= page.buff + page.size;
+
+    for ( ;; )
+    {
+      if (nod_flag)
+      {
+	next_page= _ma_kpos(nod_flag,keypos);
+        /*
+          Save new pos: the recursive call below takes param->new_file_pos
+          as the position of the child page
+        */
+	_ma_kpointer(info,keypos-nod_flag,param->new_file_pos);
+	if (sort_one_index(param,info,keyinfo,next_page,new_file))
+	{
+	  DBUG_PRINT("error",
+		     ("From page: %ld, keyoffset: %lu  used_length: %d",
+		      (ulong) pagepos, (ulong) (keypos - buff),
+		      (int) page.size));
+	  DBUG_DUMP("buff", page.buff, page.size);
+	  goto err;
+	}
+      }
+      if (keypos >= endpos ||
+	  !(*keyinfo->get_key)(&key, page.flag, nod_flag, &keypos))
+	break;                                  /* No more keys on this page */
+      DBUG_ASSERT(keypos <= endpos);
+      if (keyinfo->flag & HA_FULLTEXT)
+      {
+        uint off;
+        int  subkeys;
+        get_key_full_length_rdonly(off, key.data);
+        subkeys= ft_sintXkorr(key.data + off);
+        if (subkeys < 0)                        /* Row pointer is a page */
+        {
+          next_page= _ma_row_pos_from_key(&key);
+          _ma_dpointer(share, keypos - nod_flag - share->rec_reflength,
+                       param->new_file_pos); /* Save new pos */
+          if (sort_one_index(param,info,&share->ft2_keyinfo,
+                             next_page,new_file))
+            goto err;
+        }
+      }
+    }
+  }
+
+  /* Fill block with zero and write it to the new index file */
+  length= page.size;
+  bzero(buff+length,keyinfo->block_length-length);
+  put_crc(buff, new_page_pos, share);
+  if (my_pwrite(new_file, buff,(uint) keyinfo->block_length,
+		new_page_pos,MYF(MY_NABP | MY_WAIT_IF_FULL)))
+  {
+    _ma_check_print_error(param,"Can't write indexblock, error: %d",my_errno);
+    goto err;
+  }
+  my_afree(buff);
+  DBUG_RETURN(0);
+err:
+  my_afree(buff);
+  DBUG_RETURN(1);
+} /* sort_one_index */
+
+
+/**
+   @brief Fill empty space in index file with zeroes
+
+   Walks every page of the index file, from share->base.keystart up to
+   share->state.state.key_file_length, through the page cache.  For each
+   page the LSN is optionally cleared, pages of tables that were born
+   transactional and belong to a key are compacted, and the bytes after
+   the used part of the page are zeroed.  Finally all pages of the index
+   file are force-written.
+
+   @param param  check parameters; T_SILENT and T_ZEROFILL_KEEP_LSN in
+                 testflag are consulted
+   @param info   handler of the table
+   @param name   table name, only used in the progress message
+
+   @return
+   @retval 0  Ok
+   @retval 1  Error
+*/
+
+static my_bool maria_zerofill_index(HA_CHECK *param, MARIA_HA *info,
+                                    const char *name)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE page_link;
+  char llbuff[21];
+  uchar *buff;
+  pgcache_page_no_t page;
+  my_off_t pos;
+  my_off_t key_file_length= share->state.state.key_file_length;
+  uint block_size= share->block_size;
+  /* LSNs are only cleared for tables that were created transactional */
+  my_bool zero_lsn= (share->base.born_transactional &&
+                     !(param->testflag & T_ZEROFILL_KEEP_LSN));
+  DBUG_ENTER("maria_zerofill_index");
+
+  if (!(param->testflag & T_SILENT))
+    printf("- Zerofilling index for Aria-table '%s'\n",name);
+
+  /* Go through the index file */
+  for (pos= share->base.keystart, page= (ulonglong) (pos / block_size);
+       pos < key_file_length;
+       pos+= block_size, page++)
+  {
+    uint length;
+    if (!(buff= pagecache_read(share->pagecache,
+                               &share->kfile, page,
+                               DFLT_INIT_HITS, 0,
+                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                               &page_link.link)))
+    {
+      pagecache_unlock_by_link(share->pagecache, page_link.link,
+                               PAGECACHE_LOCK_WRITE_UNLOCK,
+                               PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                               LSN_IMPOSSIBLE, 0, FALSE);
+      _ma_check_print_error(param,
+                            "Page %9s: Got error %d when reading index file",
+                            llstr(pos, llbuff), my_errno);
+      DBUG_RETURN(1);
+    }
+    if (zero_lsn)
+      bzero(buff, LSN_SIZE);
+
+    if (share->base.born_transactional)
+    {
+      uint keynr= _ma_get_keynr(share, buff);
+      if (keynr != MARIA_DELETE_KEY_NR)
+      {
+        /* Named key_page so it does not shadow the page number 'page' */
+        MARIA_PAGE key_page;
+        DBUG_ASSERT(keynr < share->base.keys);
+
+        _ma_page_setup(&key_page, info, share->keyinfo + keynr, pos, buff);
+        if (_ma_compact_keypage(&key_page, ~(TrID) 0))
+        {
+          /* Report first, so that my_errno is still the one of the failure */
+          _ma_check_print_error(param,
+                                "Page %9s: Got error %d when reading index "
+                                "file",
+                                llstr(pos, llbuff), my_errno);
+          /*
+            The page is still write locked and pinned from pagecache_read();
+            release it before leaving, in the same way as the other error
+            path of this loop does.
+          */
+          pagecache_unlock_by_link(share->pagecache, page_link.link,
+                                   PAGECACHE_LOCK_WRITE_UNLOCK,
+                                   PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                                   LSN_IMPOSSIBLE, 0, FALSE);
+          DBUG_RETURN(1);
+        }
+      }
+    }
+
+    /* Zero everything after the used part of the page */
+    length= _ma_get_page_used(share, buff);
+    DBUG_ASSERT(length <= block_size);
+    if (length < block_size)
+      bzero(buff + length, block_size - length);
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 1, FALSE);
+  }
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+                             FLUSH_FORCE_WRITE))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Fill empty space in data file with zeroes
+
+   Only BLOCK_RECORD tables are processed; for any other data file type
+   the function returns 0 without doing anything.  Every non-bitmap page
+   from page 1 up to share->state.state.data_file_length is read through
+   the page cache with a write lock, cleaned according to its page type
+   and then unlocked again.  At the end the bitmap and the data file pages
+   are flushed.
+
+   @todo
+   Zerofill all pages marked in bitmap as empty and change them to
+   be of type UNALLOCATED_PAGE
+
+   @param param  check parameters; T_SILENT and T_ZEROFILL_KEEP_LSN in
+                 testflag are consulted
+   @param info   handler of the table
+   @param name   table name, only used in the progress message
+
+   @return
+   @retval 0  Ok
+   @retval 1  Error
+*/
+
+static my_bool maria_zerofill_data(HA_CHECK *param, MARIA_HA *info,
+                                   const char *name)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE page_link;
+  char llbuff[21];
+  my_off_t pos;
+  pgcache_page_no_t page;
+  uint block_size= share->block_size;
+  MARIA_FILE_BITMAP *bitmap= &share->bitmap;
+  my_bool zero_lsn= !(param->testflag & T_ZEROFILL_KEEP_LSN), error;
+  DBUG_ENTER("maria_zerofill_data");
+
+  /* This works only with BLOCK_RECORD files */
+  if (share->data_file_type != BLOCK_RECORD)
+    DBUG_RETURN(0);
+
+  if (!(param->testflag & T_SILENT))
+    printf("- Zerofilling data  for Aria-table '%s'\n",name);
+
+  /* Go through the record file */
+  for (page= 1, pos= block_size;
+       pos < share->state.state.data_file_length;
+       pos+= block_size, page++)
+  {
+    uchar *buff;
+    enum en_page_type page_type;
+
+    /* Ignore bitmap pages (every pages_covered'th page) */
+    if ((page % share->bitmap.pages_covered) == 0)
+      continue;
+    if (!(buff= pagecache_read(share->pagecache,
+                               &info->dfile,
+                               page, 1, 0,
+                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                               &page_link.link)))
+    {
+      _ma_check_print_error(param,
+                            "Page %9s:  Got error: %d when reading datafile",
+                            llstr(pos, llbuff), my_errno);
+      goto err;                                 /* err: unlocks page_link */
+    }
+    page_type= (enum en_page_type) (buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK);
+    switch (page_type) {
+    case UNALLOCATED_PAGE:
+      if (zero_lsn)
+        bzero(buff, block_size);                /* Clear the whole page */
+      else
+        bzero(buff + LSN_SIZE, block_size - LSN_SIZE); /* Keep the LSN */
+      break;
+    case BLOB_PAGE:
+      if (_ma_bitmap_get_page_bits(info, bitmap, page) == 0)
+      {
+        /* Unallocated page; handled like UNALLOCATED_PAGE above */
+        if (zero_lsn)
+          bzero(buff, block_size);
+        else
+          bzero(buff + LSN_SIZE, block_size - LSN_SIZE);
+      }
+      else
+        if (zero_lsn)
+          bzero(buff, LSN_SIZE);                /* Page in use: only the LSN */
+      break;
+    case HEAD_PAGE:
+    case TAIL_PAGE:
+    {
+      uint max_entry= (uint) buff[DIR_COUNT_OFFSET];
+      uint offset, dir_start, empty_space;
+      uchar *dir;
+
+      if (zero_lsn)
+        bzero(buff, LSN_SIZE);
+      if (max_entry != 0)
+      {
+        my_bool is_head_page= (page_type == HEAD_PAGE);
+        dir= dir_entry_pos(buff, block_size, max_entry - 1);
+        _ma_compact_block_page(buff, block_size, max_entry -1, 0,
+                               is_head_page ? ~(TrID) 0 : 0,
+                               is_head_page ?
+                               share->base.min_block_length : 0);
+
+        /*
+          compactation may have increased free space; store the new value
+          in the bitmap
+        */
+        empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET);
+        if (!enough_free_entries_on_page(share, buff))
+          empty_space= 0;                         /* Page is full */
+        if (_ma_bitmap_set(info, page, is_head_page,
+                           empty_space))
+          goto err;
+
+        /*
+          Zerofill the not used part: the bytes between 'offset' and the
+          start of the directory.
+          NOTE(review): 'offset' is the sum of the two 2-byte values at
+          'dir', presumably the position and length of the last entry -
+          confirm against the directory entry layout.
+        */
+        offset= uint2korr(dir) + uint2korr(dir+2);
+        dir_start= (uint) (dir - buff);
+        DBUG_ASSERT(dir_start >= offset);
+        if (dir_start > offset)
+          bzero(buff + offset, dir_start - offset);
+      }
+      break;
+    }
+    default:
+      _ma_check_print_error(param,
+                            "Page %9s:  Found unrecognizable block of type %d",
+                            llstr(pos, llbuff), page_type);
+      goto err;
+    }
+    pagecache_unlock_by_link(share->pagecache, page_link.link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                             LSN_IMPOSSIBLE, 1, FALSE);
+  }
+  error= _ma_bitmap_flush(share);
+  if (flush_pagecache_blocks(share->pagecache, &info->dfile,
+                             FLUSH_FORCE_WRITE))
+    error= 1;
+  DBUG_RETURN(error);
+
+err:
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  /*
+    flush what was changed so far; errors from the flushes are ignored as
+    the function fails anyway
+  */
+  (void) _ma_bitmap_flush(share);
+  (void) flush_pagecache_blocks(share->pagecache, &info->dfile,
+                                FLUSH_FORCE_WRITE);
+
+  DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Zerofill the unused parts of both the index and the data file
+
+   If the table is transactional right now, logging is disabled for it
+   while the work is done and re-enabled before returning.  On success the
+   state of the table is updated to tell that it has been zerofilled.
+
+   @param param  check parameters; T_ZEROFILL_KEEP_LSN in testflag decides
+                 whether the LSN related state is reset
+   @param info   handler of the table
+   @param name   table name, handed on to the index and data passes
+
+   @return
+   @retval 0  Ok
+   @retval 1  Error
+*/
+
+int maria_zerofill(HA_CHECK *param, MARIA_HA *info, const char *name)
+{
+  my_bool failed, was_transactional;
+  my_bool lsn_zeroed= !(param->testflag & T_ZEROFILL_KEEP_LSN);
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_zerofill");
+
+  was_transactional= share->now_transactional;
+  if (was_transactional)
+    _ma_tmp_disable_logging_for_table(info, 0);
+
+  /* Index first, then data, then the uuid; stop at the first failure */
+  failed= (maria_zerofill_index(param, info, name) ||
+           maria_zerofill_data(param, info, name) ||
+           _ma_set_uuid(info, 0));
+
+  if (!failed)
+  {
+    /* Remember that both data and index have been zerofilled */
+    share->state.changed&= ~STATE_NOT_ZEROFILLED;
+
+    if (lsn_zeroed)
+    {
+      /* With the LSNs of the pages zeroed, the table is movable */
+      share->state.changed&= ~(STATE_NOT_MOVABLE | STATE_MOVED);
+      /* Table should get new LSNs */
+      share->state.create_rename_lsn= share->state.is_of_horizon=
+        share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+    }
+
+    /* Make sure the state is flushed to disk later, if within maria_chk */
+    info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+    /* A zero create_trid makes the file comparable */
+    share->state.create_trid= 0;
+  }
+
+  if (was_transactional)
+    _ma_reenable_logging_for_table(info, FALSE);
+  DBUG_RETURN(failed);
+}
+
+
+/*
+  Let temporary file replace old file.
+  This assumes that the new file was created in the same
+  directory as given by realpath(filename).
+  This will ensure that any symlinks that are used will still work.
+  Copy stats from old file to new file, deletes original and
+  changes new file name to old file name
+
+  SYNOPSIS
+    maria_change_to_newfile()
+    filename		Name of the file, with or without extension
+    old_ext		Extension of the file that is replaced
+    new_ext		Extension of the temporary file that replaces it
+    MyFlags		Extra flags for my_redel(); MY_WME and
+			MY_LINK_WARNING are always added
+
+  RESULT
+    The return value of my_redel()
+*/
+
+int maria_change_to_newfile(const char * filename, const char * old_ext,
+                            const char * new_ext, myf MyFlags)
+{
+  char old_filename[FN_REFLEN],new_filename[FN_REFLEN];
+
+  /*
+    Get real path to filename.
+    NOTE(review): the extra 32 in the flags is presumably what asks
+    fn_format() for the real path - confirm against the fn_format() flags.
+    The temporary file name is derived from that resolved name.
+  */
+  (void) fn_format(old_filename,filename,"",old_ext,2+4+32);
+  return my_redel(old_filename,
+		  fn_format(new_filename,old_filename,"",new_ext,2+4),
+		  MYF(MY_WME | MY_LINK_WARNING | MyFlags));
+} /* maria_change_to_newfile */
+
+
+/*
+  Copy a block between two files
+
+  SYNOPSIS
+    maria_filecopy()
+    param		Check parameters; write_buffer_length limits the size
+			of the copy buffer and myf_rw is given to my_write()
+    to			File to write to
+    from		File to read from
+    start		Position in 'from' where the block starts
+    length		Number of bytes to copy
+    type		Description of what is copied (for the error message)
+
+  NOTES
+    If the copy buffer cannot be allocated, a buffer of IO_SIZE bytes on
+    the stack is used instead.
+
+  RESULT
+    0	ok
+    1	Error; it has been reported with _ma_check_print_error()
+*/
+
+int maria_filecopy(HA_CHECK *param, File to,File from,my_off_t start,
+                   my_off_t length, const char *type)
+{
+  uchar tmp_buff[IO_SIZE], *buff;
+  ulong buff_length;
+  DBUG_ENTER("maria_filecopy");
+
+  buff_length=(ulong) min(param->write_buffer_length,length);
+  if (!(buff=my_malloc(buff_length,MYF(0))))
+  {
+    buff=tmp_buff; buff_length=IO_SIZE;
+  }
+
+  /*
+    A failed seek must not be ignored: the reads below would otherwise
+    copy data from whatever position 'from' happens to be at.
+  */
+  if (my_seek(from,start,MY_SEEK_SET,MYF(0)) == MY_FILEPOS_ERROR)
+    goto err;
+  /* Copy full buffers as long as more than one buffer is left */
+  while (length > buff_length)
+  {
+    if (my_read(from, buff, buff_length, MYF(MY_NABP)) ||
+	my_write(to,  buff, buff_length, param->myf_rw))
+      goto err;
+    length-= buff_length;
+  }
+  /* Copy the last (possibly partial) buffer */
+  if (my_read(from, buff, (size_t) length,MYF(MY_NABP)) ||
+      my_write(to,  buff, (size_t) length,param->myf_rw))
+    goto err;
+  if (buff != tmp_buff)
+    my_free(buff,MYF(0));
+  DBUG_RETURN(0);
+err:
+  if (buff != tmp_buff)
+    my_free(buff,MYF(0));
+  _ma_check_print_error(param,"Can't copy %s to tempfile, error %d",
+		       type,my_errno);
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Repair table or given index using sorting
+
+  SYNOPSIS
+    maria_repair_by_sort()
+    param		Repair parameters
+    info		MARIA handler to repair
+    name		Name of table (for warnings)
+    rep_quick		set to <> 0 if we should not change data file
+
+  DESCRIPTION
+    The indexes are rebuilt one at a time with _ma_create_index_by_sort().
+    Unless rep_quick is set, a temporary data file is created first; as
+    long as sort_param.fix_datafile is set when an index has been built,
+    the temporary file replaces the data file and the flag is cleared.
+
+    If the table is transactional right now, logging is disabled for it
+    during the repair and re-enabled before returning.  On failure the
+    temporary file is removed and the table is marked as crashed with
+    maria_mark_crashed_on_repair().
+
+  RESULT
+    0	ok
+    <>0	Error
+*/
+
+int maria_repair_by_sort(HA_CHECK *param, register MARIA_HA *info,
+                         const char * name, my_bool rep_quick)
+{
+  int got_error;
+  uint i;
+  ha_rows start_records;
+  my_off_t new_header_length, org_header_length, del;
+  File new_file;
+  MARIA_SORT_PARAM sort_param;
+  MARIA_SHARE *share= info->s;
+  HA_KEYSEG *keyseg;
+  double  *rec_per_key_part;
+  char llbuff[22];
+  MARIA_SORT_INFO sort_info;
+  ulonglong key_map;
+  myf sync_dir= ((share->now_transactional && !share->temporary) ?
+                 MY_SYNC_DIR : 0);
+  my_bool scan_inited= 0, reenable_logging= 0;
+  MARIA_SHARE backup_share;
+  DBUG_ENTER("maria_repair_by_sort");
+  LINT_INIT(key_map);
+
+  got_error= 1;
+  new_file= -1;
+  start_records= share->state.state.records;
+  if (!(param->testflag & T_SILENT))
+  {
+    printf("- recovering (with sort) Aria-table '%s'\n",name);
+    printf("Data records: %s\n", llstr(start_records,llbuff));
+  }
+
+  if (initialize_variables_for_repair(param, &sort_info, &sort_param, info,
+                                      rep_quick, &backup_share))
+    goto err;
+
+  if ((reenable_logging= share->now_transactional))
+    _ma_tmp_disable_logging_for_table(info, 0);
+
+  /* With T_UNPACK the new data file gets no pack header */
+  org_header_length= share->pack.header_length;
+  new_header_length= (param->testflag & T_UNPACK) ? 0 : org_header_length;
+  sort_param.filepos= new_header_length;
+
+  if (!rep_quick)
+  {
+    /* Get real path for data file */
+    if ((new_file=my_create(fn_format(param->temp_filename,
+                                      share->data_file_name.str, "",
+                                      DATA_TMP_EXT, 2+4),
+                            0,param->tmpfile_createflag,
+                            MYF(0))) < 0)
+    {
+      _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+			   param->temp_filename);
+      goto err;
+    }
+    if (new_header_length &&
+        maria_filecopy(param, new_file, info->dfile.file, 0L,
+                       new_header_length, "datafile-header"))
+      goto err;
+
+    share->state.dellink= HA_OFFSET_ERROR;
+    info->rec_cache.file= new_file;             /* For sort_delete_record */
+    if (share->data_file_type == BLOCK_RECORD ||
+        (param->testflag & T_UNPACK))
+    {
+      if (create_new_data_handle(&sort_param, new_file))
+        goto err;
+      sort_info.new_info->rec_cache.file= new_file;
+    }
+  }
+
+  if (!(sort_info.key_block=
+	alloc_key_blocks(param,
+			 (uint) param->sort_key_blocks,
+			 share->base.max_key_block_length)))
+    goto err;
+  sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+
+  if (share->data_file_type != BLOCK_RECORD)
+  {
+    /* We need a read buffer to read rows in big blocks */
+    if (init_io_cache(&param->read_cache, info->dfile.file,
+                      (uint) param->read_buffer_length,
+                      READ_CACHE, org_header_length, 1, MYF(MY_WME)))
+      goto err;
+  }
+  if (sort_info.new_info->s->data_file_type != BLOCK_RECORD)
+  {
+    /* When writing to not block records, we need a write buffer */
+    if (!rep_quick)
+    {
+      if (init_io_cache(&sort_info.new_info->rec_cache, new_file,
+                        (uint) param->write_buffer_length,
+                        WRITE_CACHE, new_header_length, 1,
+                        MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw))
+        goto err;
+      sort_info.new_info->opt_flag|= WRITE_CACHE_USED;
+    }
+  }
+
+  if (!(sort_param.record=
+        (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+                           MYF(0))) ||
+      _ma_alloc_buffer(&sort_param.rec_buff, &sort_param.rec_buff_size,
+                       share->base.default_rec_buff_size))
+  {
+    _ma_check_print_error(param, "Not enough memory for extra record");
+    goto err;
+  }
+
+  /* Optionally drop indexes and optionally modify the key_map */
+  maria_drop_all_indexes(param, info, FALSE);
+  key_map= share->state.key_map;
+  if (param->testflag & T_CREATE_MISSING_KEYS)
+  {
+    /* Invert the copied key_map to recreate all disabled indexes. */
+    key_map= ~key_map;
+  }
+
+  param->read_cache.end_of_file= sort_info.filelength;
+  sort_param.wordlist=NULL;
+  init_alloc_root(&sort_param.wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+
+  sort_param.key_cmp=sort_key_cmp;
+  sort_param.lock_in_memory=maria_lock_memory;
+  sort_param.tmpdir=param->tmpdir;
+  sort_param.master =1;
+
+  /* Number of deleted rows before the repair; checked after a quick repair */
+  del=share->state.state.del;
+
+  rec_per_key_part= param->new_rec_per_key_part;
+  for (sort_param.key=0 ; sort_param.key < share->base.keys ;
+       rec_per_key_part+=sort_param.keyinfo->keysegs, sort_param.key++)
+  {
+    sort_param.keyinfo=share->keyinfo+sort_param.key;
+    /*
+      Skip this index if it is marked disabled in the copied
+      (and possibly inverted) key_map.
+    */
+    if (! maria_is_key_active(key_map, sort_param.key))
+    {
+      /* Remember old statistics for key */
+      memcpy((char*) rec_per_key_part,
+	     (char*) (share->state.rec_per_key_part +
+		      (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+	     sort_param.keyinfo->keysegs*sizeof(*rec_per_key_part));
+      DBUG_PRINT("repair", ("skipping seemingly disabled index #: %u",
+                            sort_param.key));
+      continue;
+    }
+
+    if ((!(param->testflag & T_SILENT)))
+      printf ("- Fixing index %d\n",sort_param.key+1);
+
+    sort_param.read_cache=param->read_cache;
+    sort_param.seg=sort_param.keyinfo->seg;
+    sort_param.max_pos= sort_param.pos= org_header_length;
+    keyseg=sort_param.seg;
+    bzero((char*) sort_param.unique,sizeof(sort_param.unique));
+    /* Sum up the key length over all key segments */
+    sort_param.key_length=share->rec_reflength;
+    for (i=0 ; keyseg[i].type != HA_KEYTYPE_END; i++)
+    {
+      sort_param.key_length+=keyseg[i].length;
+      if (keyseg[i].flag & HA_SPACE_PACK)
+	sort_param.key_length+=get_pack_length(keyseg[i].length);
+      if (keyseg[i].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+	sort_param.key_length+=2 + test(keyseg[i].length >= 127);
+      if (keyseg[i].flag & HA_NULL_PART)
+	sort_param.key_length++;
+    }
+    share->state.state.records=share->state.state.del=share->state.split=0;
+    share->state.state.empty=0;
+
+    if (sort_param.keyinfo->flag & HA_FULLTEXT)
+    {
+      uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+                                    sort_param.keyinfo->seg->charset->mbmaxlen;
+      sort_param.key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+      /*
+        fulltext indexes may have much more entries than the
+        number of rows in the table. We estimate the number here.
+
+        Note, built-in parser is always nr. 0 - see ftparser_call_initializer()
+      */
+      if (sort_param.keyinfo->ftkey_nr == 0)
+      {
+        /*
+          for built-in parser the number of generated index entries
+          cannot be larger than the size of the data file divided
+          by the minimal word's length
+        */
+        sort_info.max_records=
+          (ha_rows) (sort_info.filelength/ft_min_word_len+1);
+      }
+      else
+      {
+        /*
+          for external plugin parser we cannot tell anything at all :(
+          so, we'll use all the sort memory and start from ~10 buffpeks.
+          (see _ma_create_index_by_sort)
+        */
+        sort_info.max_records=
+          10*param->sort_buffer_length/sort_param.key_length;
+      }
+
+      sort_param.key_read=  sort_maria_ft_key_read;
+      sort_param.key_write= sort_maria_ft_key_write;
+    }
+    else
+    {
+      sort_param.key_read=  sort_key_read;
+      sort_param.key_write= sort_key_write;
+    }
+
+    if (sort_info.new_info->s->data_file_type == BLOCK_RECORD)
+    {
+      scan_inited= 1;
+      if (maria_scan_init(sort_info.info))
+        goto err;
+    }
+    if (_ma_create_index_by_sort(&sort_param,
+                                 (my_bool) (!(param->testflag & T_VERBOSE)),
+                                 (size_t) param->sort_buffer_length))
+    {
+      param->retry_repair=1;
+      _ma_check_print_error(param, "Create index by sort failed");
+      goto err;
+    }
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash_create_index_by_sort",
+                    {
+                      DBUG_PRINT("maria_crash_create_index_by_sort", ("now"));
+                      DBUG_ABORT();
+                    });
+    if (scan_inited)
+    {
+      scan_inited= 0;
+      maria_scan_end(sort_info.info);
+    }
+
+    /* No need to calculate checksum again. */
+    sort_param.calc_checksum= 0;
+    free_root(&sort_param.wordroot, MYF(0));
+
+    /* Set for next loop */
+    sort_info.max_records= (ha_rows) sort_info.new_info->s->state.state.records;
+    if (param->testflag & T_STATISTICS)
+      maria_update_key_parts(sort_param.keyinfo, rec_per_key_part,
+                             sort_param.unique,
+                             (param->stats_method ==
+                              MI_STATS_METHOD_IGNORE_NULLS ?
+                              sort_param.notnull : NULL),
+                             (ulonglong) share->state.state.records);
+    maria_set_key_active(share->state.key_map, sort_param.key);
+    DBUG_PRINT("repair", ("set enabled index #: %u", sort_param.key));
+
+    if (_ma_flush_table_files_before_swap(param, info))
+      goto err;
+
+    if (sort_param.fix_datafile)
+    {
+      param->read_cache.end_of_file=sort_param.filepos;
+      if (maria_write_data_suffix(&sort_info,1) ||
+          end_io_cache(&sort_info.new_info->rec_cache))
+      {
+        _ma_check_print_error(param, "Got error when flushing row cache");
+	goto err;
+      }
+      sort_info.new_info->opt_flag&= ~WRITE_CACHE_USED;
+
+      if (param->testflag & T_SAFE_REPAIR)
+      {
+	/* Don't repair if we lost more than one row */
+	if (share->state.state.records+1 < start_records)
+	{
+          _ma_check_print_error(param,
+                                "Rows lost; Aborting because safe repair was "
+                                "requested");
+          share->state.state.records=start_records;
+	  goto err;
+	}
+      }
+
+      sort_info.new_info->s->state.state.data_file_length= sort_param.filepos;
+      if (sort_info.new_info != sort_info.info)
+      {
+        /* Keep the state of the new handler; it is copied after the close */
+        MARIA_STATE_INFO save_state= sort_info.new_info->s->state;
+        if (maria_close(sort_info.new_info))
+        {
+          _ma_check_print_error(param, "Got error %d on close", my_errno);
+          goto err;
+        }
+        copy_data_file_state(&share->state, &save_state);
+        new_file= -1;
+        sort_info.new_info= info;
+        info->rec_cache.file= info->dfile.file;
+      }
+
+      share->state.version=(ulong) time((time_t*) 0);	/* Force reopen */
+
+      /* Replace the actual file with the temporary file */
+      if (new_file >= 0)
+      {
+        my_close(new_file, MYF(MY_WME));
+        new_file= -1;
+      }
+      change_data_file_descriptor(info, -1);
+      if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
+                                  DATA_TMP_EXT,
+                                  (param->testflag & T_BACKUP_DATA ?
+                                   MYF(MY_REDEL_MAKE_BACKUP): MYF(0)) |
+                                  sync_dir) ||
+          _ma_open_datafile(info, share, NullS, -1))
+      {
+        _ma_check_print_error(param, "Couldn't change to new data file");
+        goto err;
+      }
+      if (param->testflag & T_UNPACK)
+        restore_data_file_type(share);
+
+      /* The remaining indexes are built from the new data file */
+      org_header_length= share->pack.header_length;
+      sort_info.org_data_file_type= share->data_file_type;
+      sort_info.filelength= share->state.state.data_file_length;
+      sort_param.fix_datafile=0;
+    }
+    else
+      share->state.state.data_file_length=sort_param.max_pos;
+
+    param->read_cache.file= info->dfile.file;	/* re-init read cache */
+    reinit_io_cache(&param->read_cache,READ_CACHE,share->pack.header_length,
+                    1,1);
+  }
+
+  if (param->testflag & T_WRITE_LOOP)
+  {
+    VOID(fputs("          \r",stdout)); VOID(fflush(stdout));
+  }
+
+  if (rep_quick && del+sort_info.dupp != share->state.state.del)
+  {
+    _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
+                          "Found wrong number of deleted records");
+    _ma_check_print_error(param,"Run recovery again without -q");
+    got_error=1;
+    param->retry_repair=1;
+    param->testflag|=T_RETRY_WITHOUT_QUICK;
+    goto err;
+  }
+
+  if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
+  {
+    /*
+      The conditional expression must be inside its own parentheses: '+'
+      binds tighter than '?:', so without them the sum would be used as
+      the condition and skr would be MEMMAP_EXTRA_MARGIN or 0 instead of
+      the data file length plus an optional margin.
+    */
+    my_off_t skr= (share->state.state.data_file_length +
+                   ((sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+                    MEMMAP_EXTRA_MARGIN : 0));
+#ifdef USE_RELOC
+    if (sort_info.org_data_file_type == STATIC_RECORD &&
+	skr < share->base.reloc*share->base.min_pack_length)
+      skr=share->base.reloc*share->base.min_pack_length;
+#endif
+    if (skr != sort_info.filelength)
+      if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+	_ma_check_print_warning(param,
+			       "Can't change size of datafile,  error: %d",
+			       my_errno);
+  }
+
+  if (param->testflag & T_CALC_CHECKSUM)
+    share->state.state.checksum=param->glob_crc;
+
+  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0,
+                MYF(0)))
+    _ma_check_print_warning(param,
+			   "Can't change size of indexfile, error: %d",
+			   my_errno);
+
+  if (!(param->testflag & T_SILENT))
+  {
+    if (start_records != share->state.state.records)
+      printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
+  }
+  if (sort_info.dupp)
+    _ma_check_print_warning(param,
+                            "%s records have been removed",
+                            llstr(sort_info.dupp,llbuff));
+  got_error=0;
+  /* If invoked by external program that uses thr_lock */
+  if (&share->state.state != info->state)
+    *info->state= *info->state_start= share->state.state;
+
+  /* Common exit; got_error tells if we got here because of a failure */
+err:
+  if (scan_inited)
+    maria_scan_end(sort_info.info);
+  _ma_reset_state(info);
+
+  VOID(end_io_cache(&sort_info.new_info->rec_cache));
+  VOID(end_io_cache(&param->read_cache));
+  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  if (got_error)
+  {
+    if (! param->error_printed)
+      _ma_check_print_error(param,"%d when fixing table",my_errno);
+    (void)_ma_flush_table_files_before_swap(param, info);
+    if (sort_info.new_info && sort_info.new_info != sort_info.info)
+    {
+      unuse_data_file_descriptor(sort_info.new_info);
+      maria_close(sort_info.new_info);
+    }
+    if (new_file >= 0)
+    {
+      VOID(my_close(new_file,MYF(0)));
+      VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+    }
+    maria_mark_crashed_on_repair(info);
+  }
+  else
+  {
+    if (key_map == share->state.key_map)
+      share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+    /*
+      Now that we have flushed and forced everything, we can bump
+      create_rename_lsn:
+    */
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash_repair",
+                    {
+                      DBUG_PRINT("maria_crash_repair", ("now"));
+                      DBUG_ABORT();
+                    });
+  }
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+  if (!rep_quick)
+    share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                             STATE_NOT_MOVABLE);
+
+  /* If caller had disabled logging it's not up to us to re-enable it */
+  if (reenable_logging)
+    _ma_reenable_logging_for_table(info, FALSE);
+  restore_table_state_after_repair(info, &backup_share);
+
+  my_free(sort_param.rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.key_block, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_RETURN(got_error);
+}
+
+
+/*
+  Threaded repair of table using sorting
+
+  SYNOPSIS
+    maria_repair_parallel()
+    param		Repair parameters
+    info		MARIA handler to repair
+    name		Name of table (for warnings)
+    rep_quick		set to <> 0 if we should not change data file
+
+  DESCRIPTION
+    Same as maria_repair_by_sort but do it multithreaded
+    Each key is handled by a separate thread.
+    TODO: make the number of threads a parameter
+
+    In parallel repair we use one thread per index. There are two modes:
+
+    Quick
+
+      Only the indexes are rebuilt. All threads share a read buffer.
+      Every thread that needs fresh data in the buffer enters the shared
+      cache lock. The last thread joining the lock reads the buffer from
+      the data file and wakes all other threads.
+
+    Non-quick
+
+      The data file is rebuilt and all indexes are rebuilt to point to
+      the new record positions. One thread is the master thread. It
+      reads from the old data file and writes to the new data file. It
+      also creates one of the indexes. The other threads read from a
+      buffer which is filled by the master. If they need fresh data,
+      they enter the shared cache lock. If the master's write buffer is
+      full, it flushes it to the new data file and enters the shared
+      cache lock too. When all threads joined in the lock, the master
+      copies its write buffer to the read buffer for the other threads
+      and wakes them.
+
+  RESULT
+    0	ok
+    <>0	Error
+*/
+
+int maria_repair_parallel(HA_CHECK *param, register MARIA_HA *info,
+			const char * name, my_bool rep_quick)
+{
+#ifndef THREAD
+  return maria_repair_by_sort(param, info, name, rep_quick);
+#else
+  int got_error;
+  uint i,key, total_key_length, istep;
+  ha_rows start_records;
+  my_off_t new_header_length,del;
+  File new_file;
+  MARIA_SORT_PARAM *sort_param=0, tmp_sort_param;
+  MARIA_SHARE *share= info->s;
+  double  *rec_per_key_part;
+  HA_KEYSEG *keyseg;
+  char llbuff[22];
+  IO_CACHE new_data_cache; /* For non-quick repair. */
+  IO_CACHE_SHARE io_share;
+  MARIA_SORT_INFO sort_info;
+  MARIA_SHARE backup_share;
+  ulonglong key_map;
+  pthread_attr_t thr_attr;
+  myf sync_dir= ((share->now_transactional && !share->temporary) ?
+                 MY_SYNC_DIR : 0);
+  my_bool reenable_logging= 0;
+  DBUG_ENTER("maria_repair_parallel");
+  LINT_INIT(key_map);
+
+  got_error= 1;
+  new_file= -1;
+  start_records= share->state.state.records;
+  if (!(param->testflag & T_SILENT))
+  {
+    printf("- parallel recovering (with sort) Aria-table '%s'\n",name);
+    printf("Data records: %s\n", llstr(start_records, llbuff));
+  }
+
+  if (initialize_variables_for_repair(param, &sort_info, &tmp_sort_param, info,
+                                      rep_quick, &backup_share))
+    goto err;
+
+  if ((reenable_logging= share->now_transactional))
+    _ma_tmp_disable_logging_for_table(info, 0);
+
+  new_header_length= ((param->testflag & T_UNPACK) ? 0 :
+                      share->pack.header_length);
+
+  /*
+    Quick repair (not touching data file, rebuilding indexes):
+    {
+      Read cache is (HA_CHECK *param)->read_cache using info->dfile.file.
+    }
+
+    Non-quick repair (rebuilding data file and indexes):
+    {
+      Master thread:
+
+        Read  cache is (HA_CHECK *param)->read_cache using info->dfile.file.
+        Write cache is (MARIA_HA *info)->rec_cache using new_file.
+
+      Slave threads:
+
+        Read cache is new_data_cache synced to master rec_cache.
+
+      The final assignment of the filedescriptor for rec_cache is done
+      after the cache creation.
+
+      Don't check file size on new_data_cache, as the resulting file size
+      is not known yet.
+
+      As rec_cache and new_data_cache are synced, write_buffer_length is
+      used for the read cache 'new_data_cache'. Both start at the same
+      position 'new_header_length'.
+    }
+  */
+  DBUG_PRINT("info", ("is quick repair: %d", (int) rep_quick));
+
+  /* Init pthread structs, destroyed at err:. NOTE(review): a goto err precedes */
+  pthread_mutex_init(&sort_info.mutex, MY_MUTEX_INIT_FAST);
+  pthread_cond_init(&sort_info.cond, 0);
+
+  if (!(sort_info.key_block=
+	alloc_key_blocks(param, (uint) param->sort_key_blocks,
+			 share->base.max_key_block_length)) ||
+      init_io_cache(&param->read_cache, info->dfile.file,
+                    (uint) param->read_buffer_length,
+                    READ_CACHE, share->pack.header_length, 1, MYF(MY_WME)) ||
+      (!rep_quick &&
+       (init_io_cache(&info->rec_cache, info->dfile.file,
+                      (uint) param->write_buffer_length,
+                      WRITE_CACHE, new_header_length, 1,
+                      MYF(MY_WME | MY_WAIT_IF_FULL) & param->myf_rw) ||
+        init_io_cache(&new_data_cache, -1,
+                      (uint) param->write_buffer_length,
+                      READ_CACHE, new_header_length, 1,
+                      MYF(MY_WME | MY_DONT_CHECK_FILESIZE)))))
+    goto err;
+  sort_info.key_block_end=sort_info.key_block+param->sort_key_blocks;
+  info->opt_flag|=WRITE_CACHE_USED;
+  info->rec_cache.file= info->dfile.file;         /* for sort_delete_record */
+
+  if (!rep_quick)
+  {
+    /* Get real path for data file */
+    if ((new_file= my_create(fn_format(param->temp_filename,
+                                       share->data_file_name.str, "",
+                                       DATA_TMP_EXT,
+                                       2+4),
+                             0,param->tmpfile_createflag,
+                             MYF(0))) < 0)
+    {
+      _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+			   param->temp_filename);
+      goto err;
+    }
+    if (new_header_length &&
+        maria_filecopy(param, new_file, info->dfile.file,0L,new_header_length,
+                       "datafile-header"))
+      goto err;
+    if (param->testflag & T_UNPACK)
+      restore_data_file_type(share);
+    share->state.dellink= HA_OFFSET_ERROR;
+    info->rec_cache.file=new_file;
+  }
+
+  /* Optionally drop indexes and optionally modify the key_map. */
+  maria_drop_all_indexes(param, info, FALSE);
+  key_map= share->state.key_map;
+  if (param->testflag & T_CREATE_MISSING_KEYS)
+  {
+    /* Invert the copied key_map to recreate all disabled indexes. */
+    key_map= ~key_map;
+  }
+
+  param->read_cache.end_of_file= sort_info.filelength;
+
+  /*
+    +1 below is required hack for parallel repair mode.
+    The share->state.state.records value, that is compared later
+    to sort_info.max_records and cannot exceed it, is
+    increased in sort_key_write. In maria_repair_by_sort, sort_key_write
+    is called after sort_key_read, where the comparison is performed,
+    but in parallel mode master thread can call sort_key_write
+    before some other repair thread calls sort_key_read.
+    Furthermore I'm not even sure +1 would be enough.
+    Maybe sort_info.max_records should be always set to max value in
+    parallel mode.
+  */
+  sort_info.max_records++;
+
+  del=share->state.state.del;
+
+  if (!(sort_param=(MARIA_SORT_PARAM *)
+        my_malloc((uint) share->base.keys *
+		  (sizeof(MARIA_SORT_PARAM) + share->base.pack_reclength),
+		  MYF(MY_ZEROFILL))))
+  {
+    _ma_check_print_error(param,"Not enough memory for key!");
+    goto err;
+  }
+  total_key_length=0;
+  rec_per_key_part= param->new_rec_per_key_part;
+  share->state.state.records=share->state.state.del=share->state.split=0;
+  share->state.state.empty=0;
+
+  for (i=key=0, istep=1 ; key < share->base.keys ;
+       rec_per_key_part+=sort_param[i].keyinfo->keysegs, i+=istep, key++)
+  {
+    sort_param[i].key=key;
+    sort_param[i].keyinfo=share->keyinfo+key;
+    sort_param[i].seg=sort_param[i].keyinfo->seg;
+    /*
+      Skip this index if it is marked disabled in the copied
+      (and possibly inverted) key_map.
+    */
+    if (! maria_is_key_active(key_map, key))
+    {
+      /* Remember old statistics for key */
+      memcpy((char*) rec_per_key_part,
+	     (char*) (share->state.rec_per_key_part+
+		      (uint) (rec_per_key_part - param->new_rec_per_key_part)),
+	     sort_param[i].keyinfo->keysegs*sizeof(*rec_per_key_part));
+      istep=0;
+      continue;
+    }
+    istep=1;
+    if ((!(param->testflag & T_SILENT)))
+      printf ("- Fixing index %d\n",key+1);
+    if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+    {
+      sort_param[i].key_read=sort_maria_ft_key_read;
+      sort_param[i].key_write=sort_maria_ft_key_write;
+    }
+    else
+    {
+      sort_param[i].key_read=sort_key_read;
+      sort_param[i].key_write=sort_key_write;
+    }
+    sort_param[i].key_cmp=sort_key_cmp;
+    sort_param[i].lock_in_memory=maria_lock_memory;
+    sort_param[i].tmpdir=param->tmpdir;
+    sort_param[i].sort_info=&sort_info;
+    sort_param[i].master=0;
+    sort_param[i].fix_datafile=0;
+    sort_param[i].calc_checksum= 0;
+
+    sort_param[i].filepos=new_header_length;
+    sort_param[i].max_pos=sort_param[i].pos=share->pack.header_length;
+
+    sort_param[i].record= (((uchar *)(sort_param+share->base.keys))+
+                          (share->base.pack_reclength * i));
+    if (_ma_alloc_buffer(&sort_param[i].rec_buff, &sort_param[i].rec_buff_size,
+                         share->base.default_rec_buff_size))
+    {
+      _ma_check_print_error(param,"Not enough memory!");
+      goto err;
+    }
+    sort_param[i].key_length=share->rec_reflength;
+    for (keyseg=sort_param[i].seg; keyseg->type != HA_KEYTYPE_END;
+	 keyseg++)
+    {
+      sort_param[i].key_length+=keyseg->length;
+      if (keyseg->flag & HA_SPACE_PACK)
+        sort_param[i].key_length+=get_pack_length(keyseg->length);
+      if (keyseg->flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+        sort_param[i].key_length+=2 + test(keyseg->length >= 127);
+      if (keyseg->flag & HA_NULL_PART)
+        sort_param[i].key_length++;
+    }
+    total_key_length+=sort_param[i].key_length;
+
+    if (sort_param[i].keyinfo->flag & HA_FULLTEXT)
+    {
+      uint ft_max_word_len_for_sort=
+        (FT_MAX_WORD_LEN_FOR_SORT *
+         sort_param[i].keyinfo->seg->charset->mbmaxlen);
+      sort_param[i].key_length+=ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+      init_alloc_root(&sort_param[i].wordroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0);
+    }
+  }
+  sort_info.total_keys=i;
+  sort_param[0].master= 1;
+  sort_param[0].fix_datafile= ! rep_quick;
+  sort_param[0].calc_checksum= test(param->testflag & T_CALC_CHECKSUM);
+
+  if (!maria_ftparser_alloc_param(info))
+    goto err;
+
+  sort_info.got_error=0;
+  pthread_mutex_lock(&sort_info.mutex);
+
+  /*
+    Initialize the I/O cache share for use with the read caches and, in
+    case of non-quick repair, the write cache. When all threads join on
+    the cache lock, the writer copies the write cache contents to the
+    read caches.
+  */
+  if (i > 1)
+  {
+    if (rep_quick)
+      init_io_cache_share(&param->read_cache, &io_share, NULL, i);
+    else
+      init_io_cache_share(&new_data_cache, &io_share, &info->rec_cache, i);
+  }
+  else
+    io_share.total_threads= 0; /* share not used */
+
+  (void) pthread_attr_init(&thr_attr);
+  (void) pthread_attr_setdetachstate(&thr_attr,PTHREAD_CREATE_DETACHED);
+
+  for (i=0 ; i < sort_info.total_keys ; i++)
+  {
+    /*
+      Copy the properly initialized IO_CACHE structure so that every
+      thread has its own copy. In quick mode param->read_cache is shared
+      for use by all threads. In non-quick mode all threads but the
+      first copy the shared new_data_cache, which is synchronized to the
+      write cache of the first thread. The first thread copies
+      param->read_cache, which is not shared.
+    */
+    sort_param[i].read_cache= ((rep_quick || !i) ? param->read_cache :
+                               new_data_cache);
+    DBUG_PRINT("io_cache_share", ("thread: %u  read_cache: 0x%lx",
+                                  i, (long) &sort_param[i].read_cache));
+
+    /*
+      two approaches: the same amount of memory for each thread
+      or the memory for the same number of keys for each thread...
+      In the second one all the threads will fill their sort_buffers
+      (and call write_keys) at the same time, putting more stress on i/o.
+    */
+    sort_param[i].sortbuff_size=
+#ifndef USING_SECOND_APPROACH
+      param->sort_buffer_length/sort_info.total_keys;
+#else
+      param->sort_buffer_length*sort_param[i].key_length/total_key_length;
+#endif
+    if (pthread_create(&sort_param[i].thr, &thr_attr,
+		       _ma_thr_find_all_keys,
+		       (void *) (sort_param+i)))
+    {
+      _ma_check_print_error(param,"Cannot start a repair thread");
+      /* Cleanup: Detach from the share. Avoid others to be blocked. */
+      if (io_share.total_threads)
+        remove_io_thread(&sort_param[i].read_cache);
+      DBUG_PRINT("error", ("Cannot start a repair thread"));
+      sort_info.got_error=1;
+    }
+    else
+      sort_info.threads_running++;
+  }
+  (void) pthread_attr_destroy(&thr_attr);
+
+  /* waiting for all threads to finish */
+  while (sort_info.threads_running)
+    pthread_cond_wait(&sort_info.cond, &sort_info.mutex);
+  pthread_mutex_unlock(&sort_info.mutex);
+
+  if ((got_error= _ma_thr_write_keys(sort_param)))
+  {
+    param->retry_repair=1;
+    goto err;
+  }
+  got_error=1;				/* Assume the following may go wrong */
+
+  if (_ma_flush_table_files_before_swap(param, info))
+    goto err;
+
+  if (sort_param[0].fix_datafile)
+  {
+    /*
+      Append some nulls to the end of a memory mapped file. Destroy the
+      write cache. The master thread did already detach from the share
+      by remove_io_thread() in sort.c:thr_find_all_keys().
+    */
+    if (maria_write_data_suffix(&sort_info,1) ||
+        end_io_cache(&info->rec_cache))
+      goto err;
+    if (param->testflag & T_SAFE_REPAIR)
+    {
+      /* Don't repair if we lost more than one row */
+      if (share->state.state.records+1 < start_records)
+      {
+        share->state.state.records=start_records;
+        goto err;
+      }
+    }
+    share->state.state.data_file_length= sort_param->filepos;
+    /* Only whole records */
+    share->state.version= (ulong) time((time_t*) 0);
+    /*
+      Exchange the data file descriptor of the table, so that we use the
+      new file from now on.
+     */
+    my_close(info->dfile.file, MYF(0));
+    info->dfile.file= new_file;
+    share->pack.header_length=(ulong) new_header_length;
+  }
+  else
+    share->state.state.data_file_length=sort_param->max_pos;
+
+  if (rep_quick && del+sort_info.dupp != share->state.state.del)
+  {
+    _ma_check_print_error(param,"Couldn't fix table with quick recovery: "
+                          "Found wrong number of deleted records");
+    _ma_check_print_error(param,"Run recovery again without -q");
+    param->retry_repair=1;
+    param->testflag|=T_RETRY_WITHOUT_QUICK;
+    goto err;
+  }
+
+  if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
+  {             /* '+' binds tighter than '?:', so the ?: must be parenthesized */
+    my_off_t skr= (share->state.state.data_file_length +
+                   ((sort_info.org_data_file_type == COMPRESSED_RECORD) ?
+                    MEMMAP_EXTRA_MARGIN : 0));
+#ifdef USE_RELOC
+    if (sort_info.org_data_file_type == STATIC_RECORD &&
+	skr < share->base.reloc*share->base.min_pack_length)
+      skr=share->base.reloc*share->base.min_pack_length;
+#endif
+    if (skr != sort_info.filelength)
+      if (my_chsize(info->dfile.file, skr, 0, MYF(0)))
+	_ma_check_print_warning(param,
+			       "Can't change size of datafile,  error: %d",
+			       my_errno);
+  }
+  if (param->testflag & T_CALC_CHECKSUM)
+    share->state.state.checksum=param->glob_crc;
+
+  if (my_chsize(share->kfile.file, share->state.state.key_file_length, 0,
+                MYF(0)))
+    _ma_check_print_warning(param,
+			   "Can't change size of indexfile, error: %d",
+                            my_errno);
+
+  if (!(param->testflag & T_SILENT))
+  {
+    if (start_records != share->state.state.records)
+      printf("Data records: %s\n", llstr(share->state.state.records,llbuff));
+  }
+  if (sort_info.dupp)
+    _ma_check_print_warning(param,
+                            "%s records have been removed",
+                            llstr(sort_info.dupp,llbuff));
+  got_error=0;
+  /* If invoked by external program that uses thr_lock */
+  if (&share->state.state != info->state)
+    *info->state= *info->state_start= share->state.state;
+
+err:
+  _ma_reset_state(info);
+
+  /*
+    Destroy the write cache. The master thread did already detach from
+    the share by remove_io_thread() or it was not yet started (if the
+    error happened before creating the thread).
+  */
+  VOID(end_io_cache(&sort_info.new_info->rec_cache));
+  VOID(end_io_cache(&param->read_cache));
+  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  sort_info.new_info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  /*
+    Destroy the new data cache in case of non-quick repair. All slave
+    threads did either detach from the share by remove_io_thread()
+    already or they were not yet started (if the error happened before
+    creating the threads).
+  */
+  if (!rep_quick)
+    VOID(end_io_cache(&new_data_cache));
+  if (!got_error)
+  {
+    /* Replace the actual file with the temporary file */
+    if (new_file >= 0)
+    {
+      my_close(new_file,MYF(0));
+      info->dfile.file= new_file= -1;
+      if (maria_change_to_newfile(share->data_file_name.str, MARIA_NAME_DEXT,
+                                  DATA_TMP_EXT,
+                                  MYF((param->testflag & T_BACKUP_DATA ?
+                                       MY_REDEL_MAKE_BACKUP : 0) |
+                                      sync_dir)) ||
+	  _ma_open_datafile(info,share, NullS, -1))
+	got_error=1;
+    }
+  }
+  if (got_error)
+  {
+    if (! param->error_printed)
+      _ma_check_print_error(param,"%d when fixing table",my_errno);
+    (void)_ma_flush_table_files_before_swap(param, info);
+    if (new_file >= 0)
+    {
+      VOID(my_close(new_file,MYF(0)));
+      VOID(my_delete(param->temp_filename, MYF(MY_WME)));
+      if (info->dfile.file == new_file)
+	info->dfile.file= -1;
+    }
+    maria_mark_crashed_on_repair(info);
+  }
+  else if (key_map == share->state.key_map)
+    share->state.changed&= ~STATE_NOT_OPTIMIZED_KEYS;
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+  if (!rep_quick)
+    share->state.changed&= ~(STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                             STATE_NOT_MOVABLE);
+
+  pthread_cond_destroy (&sort_info.cond);
+  pthread_mutex_destroy(&sort_info.mutex);
+
+  /* If caller had disabled logging it's not up to us to re-enable it */
+  if (reenable_logging)
+    _ma_reenable_logging_for_table(info, FALSE);
+  restore_table_state_after_repair(info, &backup_share);
+
+  my_free(sort_info.ft_buf, MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.key_block,MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_param,MYF(MY_ALLOW_ZERO_PTR));
+  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+  if (!got_error && (param->testflag & T_UNPACK))
+    restore_data_file_type(share);
+  DBUG_RETURN(got_error);
+#endif /* THREAD */
+}
+
+/* Read next record and build its sort key; 0 = ok, -1 = EOF, > 0 = error */
+
+static int sort_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+  MARIA_SORT_INFO *sinfo= sort_param->sort_info;
+  MARIA_HA *handler= sinfo->info;
+  MARIA_KEY made_key;
+  int res;
+  DBUG_ENTER("sort_key_read");
+
+  res= sort_get_next_record(sort_param);
+  if (res != 0)
+    DBUG_RETURN(res);
+  /* Refuse to go on once the record count has reached max_records */
+  if (handler->s->state.state.records == sinfo->max_records)
+  {
+    _ma_check_print_error(sinfo->param,
+                          "Key %d - Found too many records; Can't continue",
+                          sort_param->key + 1);
+    DBUG_RETURN(1);
+  }
+  if (_ma_sort_write_record(sort_param) != 0)
+    DBUG_RETURN(1);
+  (*handler->s->keyinfo[sort_param->key].make_key)(handler, &made_key,
+                                                   sort_param->key, key,
+                                                   sort_param->record,
+                                                   sort_param->current_filepos, 0);
+  sort_param->real_key_length= made_key.data_length + made_key.ref_length;
+#ifdef HAVE_valgrind
+  bzero(key + sort_param->real_key_length,
+        sort_param->key_length - sort_param->real_key_length);
+#endif
+  DBUG_RETURN(0);
+} /* sort_key_read */
+
+
+static int sort_maria_ft_key_read(MARIA_SORT_PARAM *sort_param, uchar *key)
+{
+  MARIA_HA *handler= sort_param->sort_info->info;
+  FT_WORD *word= 0;
+  MARIA_KEY made_key;
+  int res= 0;
+  DBUG_ENTER("sort_maria_ft_key_read");
+
+  if (sort_param->wordlist)
+    word= (FT_WORD*) sort_param->wordptr;   /* continue the pending word list */
+  else
+  {
+    /* Fetch records until one of them yields at least one word */
+    do
+    {
+      free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+      res= sort_get_next_record(sort_param);
+      if (res)
+        DBUG_RETURN(res);
+      res= _ma_sort_write_record(sort_param);
+      if (res)
+        DBUG_RETURN(res);
+      word= _ma_ft_parserecord(handler, sort_param->key, sort_param->record,
+                               &sort_param->wordroot);
+      if (!word)
+        DBUG_RETURN(1);
+    } while (!word->pos);
+    sort_param->wordlist= word;
+    sort_param->wordptr= word;
+  }
+
+  /* Build the key for the current word, then step to the next word */
+  _ma_ft_make_key(handler, &made_key, sort_param->key, key, word,
+                  sort_param->current_filepos);
+  word++;
+  sort_param->real_key_length= made_key.data_length + made_key.ref_length;
+
+#ifdef HAVE_valgrind
+  if (sort_param->real_key_length < sort_param->key_length)
+    bzero(key + sort_param->real_key_length,
+          sort_param->key_length - sort_param->real_key_length);
+#endif
+  if (word->pos)
+    sort_param->wordptr= (void*) word;
+  else
+  {
+    free_root(&sort_param->wordroot, MYF(MY_MARK_BLOCKS_FREE));
+    sort_param->wordlist= 0;
+  }
+
+  DBUG_RETURN(res);
+} /* sort_maria_ft_key_read */
+
+
+/*
+  Read next record from file using parameters in sort_info.
+
+  SYNOPSIS
+    sort_get_next_record()
+      sort_param                Information about and for the sort process
+
+  NOTES
+    Dynamic Records With Non-Quick Parallel Repair
+
+    For non-quick parallel repair we use a synchronized read/write
+    cache. This means that one thread is the master who fixes the data
+    file by reading each record from the old data file and writing it
+    to the new data file. By doing this the records in the new data
+    file are written contiguously. Whenever the write buffer is full,
+    it is copied to the read buffer. The slaves read from the read
+    buffer, which is not associated with a file. Thus read_cache.file
+    is -1. When using _ma_read_cache(), the slaves must always set
+    flag to READING_NEXT so that the function never tries to read from
+    file. This is safe because the records are contiguous. There is no
+    need to read outside the cache. This condition is evaluated in the
+    variable 'parallel_flag' for quick reference. read_cache.file must
+    be >= 0 in every other case.
+
+  RETURN
+    -1          end of file
+    0           ok
+                sort_param->current_filepos points to record position.
+                sort_param->record contains record
+                sort_param->max_pos contains position to last byte read
+    > 0         error
+*/
+
+static int sort_get_next_record(MARIA_SORT_PARAM *sort_param)
+{
+  int searching;
+  int parallel_flag;
+  uint found_record,b_type,left_length;
+  my_off_t pos;
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_HA *info=sort_info->info;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  DBUG_ENTER("sort_get_next_record");
+
+  if (_ma_killed_ptr(param))
+    DBUG_RETURN(1);
+
+  switch (sort_info->org_data_file_type) {
+  case BLOCK_RECORD:
+  {
+    for (;;)
+    {
+      int flag;
+      /*
+        Assume table is transactional and it had LSN pages in the
+        cache. Repair has flushed them, left data pages stay in
+        cache, and disabled transactionality (so share's current page
+        type is PLAIN); page cache would assert if it finds a cached LSN page
+        while _ma_scan_block_record() requested a PLAIN page. So we use
+        UNKNOWN.
+      */
+      enum pagecache_page_type save_page_type= share->page_type;
+      share->page_type= PAGECACHE_READ_UNKNOWN_PAGE;
+      if (info != sort_info->new_info)
+      {
+        /* Safe scanning */
+        flag= _ma_safe_scan_block_record(sort_info, info,
+                                         sort_param->record);
+      }
+      else
+      {
+        /*
+          Scan on clean table.
+          It requires a reliable data_file_length so we set it.
+        */
+        share->state.state.data_file_length= sort_info->filelength;
+        info->cur_row.trid= 0;
+        flag= _ma_scan_block_record(info, sort_param->record,
+                                    info->cur_row.nextpos, 1);
+        set_if_bigger(param->max_found_trid, info->cur_row.trid);
+        if (info->cur_row.trid > param->max_trid)
+        {
+          _ma_check_print_not_visible_error(param, info->cur_row.trid);
+          flag= HA_ERR_ROW_NOT_VISIBLE;
+        }
+      }
+      share->page_type= save_page_type;
+      if (!flag)
+      {
+	if (sort_param->calc_checksum)
+        {
+          ha_checksum checksum;
+          checksum= (*share->calc_check_checksum)(info, sort_param->record);
+          if (share->calc_checksum &&
+              info->cur_row.checksum != (checksum & 255))
+          {
+            if (param->testflag & T_VERBOSE)
+            {
+              record_pos_to_txt(info, info->cur_row.lastpos, llbuff);
+              _ma_check_print_info(param,
+                                   "Found record with wrong checksum at %s",
+                                   llbuff);
+            }
+            continue;
+          }
+          info->cur_row.checksum= checksum;
+	  param->glob_crc+= checksum;
+        }
+        sort_param->start_recpos= sort_param->current_filepos=
+          info->cur_row.lastpos;
+        DBUG_RETURN(0);
+      }
+      if (flag == HA_ERR_END_OF_FILE)
+      {
+        sort_param->max_pos= share->state.state.data_file_length;
+        DBUG_RETURN(-1);
+      }
+      /* Retry only if wrong record, not if disk error */
+      if (flag != HA_ERR_WRONG_IN_RECORD)
+      {
+        retry_if_quick(sort_param, flag);
+        DBUG_RETURN(flag);
+      }
+    }
+    break;                                      /* Impossible */
+  }
+  case STATIC_RECORD:
+    for (;;)
+    {
+      if (my_b_read(&sort_param->read_cache,sort_param->record,
+		    share->base.pack_reclength))
+      {
+	if (sort_param->read_cache.error)
+	  param->out_flag |= O_DATA_LOST;
+        retry_if_quick(sort_param, my_errno);
+	DBUG_RETURN(-1);
+      }
+      sort_param->start_recpos=sort_param->pos;
+      if (!sort_param->fix_datafile)
+      {
+	sort_param->current_filepos= sort_param->pos;
+        if (sort_param->master)
+	  share->state.split++;
+      }
+      sort_param->max_pos=(sort_param->pos+=share->base.pack_reclength);
+      if (*sort_param->record)
+      {
+	if (sort_param->calc_checksum)
+	  param->glob_crc+= (info->cur_row.checksum=
+			     _ma_static_checksum(info,sort_param->record));
+	DBUG_RETURN(0);
+      }
+      if (!sort_param->fix_datafile && sort_param->master)
+      {
+	share->state.state.del++;
+	share->state.state.empty+=share->base.pack_reclength;
+      }
+    }
+  case DYNAMIC_RECORD:
+  {
+    uchar *to;
+    ha_checksum checksum= 0;
+    LINT_INIT(to);
+
+    pos=sort_param->pos;
+    searching=(sort_param->fix_datafile && (param->testflag & T_EXTEND));
+    parallel_flag= (sort_param->read_cache.file < 0) ? READING_NEXT : 0;
+    for (;;)
+    {
+      found_record=block_info.second_read= 0;
+      left_length=1;
+      if (searching)
+      {
+	pos=MY_ALIGN(pos,MARIA_DYN_ALIGN_SIZE);
+        param->testflag|=T_RETRY_WITHOUT_QUICK;
+	sort_param->start_recpos=pos;
+      }
+      do
+      {
+	if (pos > sort_param->max_pos)
+	  sort_param->max_pos=pos;
+	if (pos & (MARIA_DYN_ALIGN_SIZE-1))
+	{
+	  if ((param->testflag & T_VERBOSE) || searching == 0)
+	    _ma_check_print_info(param,"Wrong aligned block at %s",
+				llstr(pos,llbuff));
+	  if (searching)
+	    goto try_next;
+	}
+	if (found_record && pos == param->search_after_block)
+	  _ma_check_print_info(param,"Block: %s used by record at %s",
+		     llstr(param->search_after_block,llbuff),
+		     llstr(sort_param->start_recpos,llbuff2));
+	if (_ma_read_cache(&sort_param->read_cache,
+                           block_info.header, pos,
+			   MARIA_BLOCK_INFO_HEADER_LENGTH,
+			   (! found_record ? READING_NEXT : 0) |
+			   parallel_flag | READING_HEADER))
+	{
+	  if (found_record)
+	  {
+	    _ma_check_print_info(param,
+				"Can't read whole record at %s (errno: %d)",
+				llstr(sort_param->start_recpos,llbuff),errno);
+	    goto try_next;
+	  }
+	  DBUG_RETURN(-1);
+	}
+	if (searching && ! sort_param->fix_datafile)
+	{
+	  param->error_printed=1;
+          param->retry_repair=1;
+          param->testflag|=T_RETRY_WITHOUT_QUICK;
+	  DBUG_RETURN(1);	/* Something wrong with data */
+	}
+	b_type= _ma_get_block_info(&block_info,-1,pos);
+	if ((b_type & (BLOCK_ERROR | BLOCK_FATAL_ERROR)) ||
+	   ((b_type & BLOCK_FIRST) &&
+	     (block_info.rec_len < (uint) share->base.min_pack_length ||
+	      block_info.rec_len > (uint) share->base.max_pack_length)))
+	{
+	  uint i;
+	  if (param->testflag & T_VERBOSE || searching == 0)
+	    _ma_check_print_info(param,
+				"Wrong bytesec: %3d-%3d-%3d at %10s; Skipped",
+		       block_info.header[0],block_info.header[1],
+		       block_info.header[2],llstr(pos,llbuff));
+	  if (found_record)
+	    goto try_next;
+	  block_info.second_read=0;
+	  searching=1;
+	  /* Search after block in read header string */
+	  for (i=MARIA_DYN_ALIGN_SIZE ;
+	       i < MARIA_BLOCK_INFO_HEADER_LENGTH ;
+	       i+= MARIA_DYN_ALIGN_SIZE)
+	    if (block_info.header[i] >= 1 &&
+		block_info.header[i] <= MARIA_MAX_DYN_HEADER_BYTE)
+	      break;
+	  pos+=(ulong) i;
+	  sort_param->start_recpos=pos;
+	  continue;
+	}
+	if (b_type & BLOCK_DELETED)
+	{
+	  my_bool error=0;
+	  if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+	      share->base.min_block_length)
+	  {
+	    if (!searching)
+	      _ma_check_print_info(param,
+                                   "Deleted block with impossible length %lu "
+                                   "at %s",
+                                   block_info.block_len,llstr(pos,llbuff));
+	    error=1;
+	  }
+	  else
+	  {
+	    if ((block_info.next_filepos != HA_OFFSET_ERROR &&
+		 block_info.next_filepos >=
+		 share->state.state.data_file_length) ||
+		(block_info.prev_filepos != HA_OFFSET_ERROR &&
+		 block_info.prev_filepos >=
+                 share->state.state.data_file_length))
+	    {
+	      if (!searching)
+		_ma_check_print_info(param,
+				    "Delete link points outside datafile at "
+                                     "%s",
+                                     llstr(pos,llbuff));
+	      error=1;
+	    }
+	  }
+	  if (error)
+	  {
+	    if (found_record)
+	      goto try_next;
+	    searching=1;
+	    pos+= MARIA_DYN_ALIGN_SIZE;
+	    sort_param->start_recpos=pos;
+	    block_info.second_read=0;
+	    continue;
+	  }
+	}
+	else
+	{
+	  if (block_info.block_len+ (uint) (block_info.filepos-pos) <
+	      share->base.min_block_length ||
+	      block_info.block_len > (uint) share->base.max_pack_length+
+	      MARIA_SPLIT_LENGTH)
+	  {
+	    if (!searching)
+	      _ma_check_print_info(param,
+                                   "Found block with impossible length %lu "
+                                   "at %s; Skipped",
+                                   block_info.block_len+
+                                   (uint) (block_info.filepos-pos),
+                                   llstr(pos,llbuff));
+	    if (found_record)
+	      goto try_next;
+	    searching=1;
+	    pos+= MARIA_DYN_ALIGN_SIZE;
+	    sort_param->start_recpos=pos;
+	    block_info.second_read=0;
+	    continue;
+	  }
+	}
+	if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+	{
+          if (!sort_param->fix_datafile && sort_param->master &&
+              (b_type & BLOCK_DELETED))
+	  {
+	    share->state.state.empty+=block_info.block_len;
+	    share->state.state.del++;
+	    share->state.split++;
+	  }
+	  if (found_record)
+	    goto try_next;
+	  if (searching)
+	  {
+	    pos+=MARIA_DYN_ALIGN_SIZE;
+	    sort_param->start_recpos=pos;
+	  }
+	  else
+	    pos=block_info.filepos+block_info.block_len;
+	  block_info.second_read=0;
+	  continue;
+	}
+
+	if (!sort_param->fix_datafile && sort_param->master)
+	  share->state.split++;
+	if (! found_record++)
+	{
+	  sort_param->find_length=left_length=block_info.rec_len;
+	  sort_param->start_recpos=pos;
+	  if (!sort_param->fix_datafile)
+	    sort_param->current_filepos= sort_param->start_recpos;
+	  if (sort_param->fix_datafile && (param->testflag & T_EXTEND))
+	    sort_param->pos=block_info.filepos+1;
+	  else
+	    sort_param->pos=block_info.filepos+block_info.block_len;
+	  if (share->base.blobs)
+	  {
+	    if (_ma_alloc_buffer(&sort_param->rec_buff,
+                                 &sort_param->rec_buff_size,
+                                 block_info.rec_len +
+                                 share->base.extra_rec_buff_size))
+
+	    {
+	      if (param->max_record_length >= block_info.rec_len)
+	      {
+		_ma_check_print_error(param,"Not enough memory for blob at %s "
+                                      "(need %lu)",
+				     llstr(sort_param->start_recpos,llbuff),
+				     (ulong) block_info.rec_len);
+		DBUG_RETURN(1);
+	      }
+	      else
+	      {
+		_ma_check_print_info(param,"Not enough memory for blob at %s "
+                                     "(need %lu); Row skipped",
+				    llstr(sort_param->start_recpos,llbuff),
+				    (ulong) block_info.rec_len);
+		goto try_next;
+	      }
+	    }
+	  }
+          to= sort_param->rec_buff;
+	}
+	if (left_length < block_info.data_len || ! block_info.data_len)
+	{
+	  _ma_check_print_info(param,
+			      "Found block with too small length at %s; "
+                               "Skipped",
+                               llstr(sort_param->start_recpos,llbuff));
+	  goto try_next;
+	}
+	if (block_info.filepos + block_info.data_len >
+	    sort_param->read_cache.end_of_file)
+	{
+	  _ma_check_print_info(param,
+			      "Found block that points outside data file "
+                               "at %s",
+                               llstr(sort_param->start_recpos,llbuff));
+	  goto try_next;
+	}
+        /*
+          Copy information that is already read. Avoid accessing data
+          below the cache start. This could happen if the header
+          stretched over the end of the previous buffer contents.
+        */
+        {
+          uint header_len= (uint) (block_info.filepos - pos);
+          uint prefetch_len= (MARIA_BLOCK_INFO_HEADER_LENGTH - header_len);
+
+          if (prefetch_len > block_info.data_len)
+            prefetch_len= block_info.data_len;
+          if (prefetch_len)
+          {
+            memcpy(to, block_info.header + header_len, prefetch_len);
+            block_info.filepos+= prefetch_len;
+            block_info.data_len-= prefetch_len;
+            left_length-= prefetch_len;
+            to+= prefetch_len;
+          }
+        }
+        if (block_info.data_len &&
+            _ma_read_cache(&sort_param->read_cache,to,block_info.filepos,
+                           block_info.data_len,
+                           (found_record == 1 ? READING_NEXT : 0) |
+                           parallel_flag))
+	{
+	  _ma_check_print_info(param,
+			      "Read error for block at: %s (error: %d); "
+                               "Skipped",
+			      llstr(block_info.filepos,llbuff),my_errno);
+	  goto try_next;
+	}
+	left_length-=block_info.data_len;
+	to+=block_info.data_len;
+	pos=block_info.next_filepos;
+	if (pos == HA_OFFSET_ERROR && left_length)
+	{
+	  _ma_check_print_info(param,
+                               "Wrong block with wrong total length "
+                               "starting at %s",
+			      llstr(sort_param->start_recpos,llbuff));
+	  goto try_next;
+	}
+	if (pos + MARIA_BLOCK_INFO_HEADER_LENGTH >
+            sort_param->read_cache.end_of_file)
+	{
+	  _ma_check_print_info(param,
+                               "Found link that points at %s (outside data "
+                               "file) at %s",
+			      llstr(pos,llbuff2),
+			      llstr(sort_param->start_recpos,llbuff));
+	  goto try_next;
+	}
+      } while (left_length);
+
+      if (_ma_rec_unpack(info,sort_param->record,sort_param->rec_buff,
+			 sort_param->find_length) != MY_FILE_ERROR)
+      {
+	if (sort_param->read_cache.error < 0)
+	  DBUG_RETURN(1);
+	if (sort_param->calc_checksum)
+	  checksum= (share->calc_check_checksum)(info, sort_param->record);
+	if ((param->testflag & (T_EXTEND | T_REP)) || searching)
+	{
+	  if (_ma_rec_check(info, sort_param->record, sort_param->rec_buff,
+                            sort_param->find_length,
+                            (param->testflag & T_QUICK) &&
+                            sort_param->calc_checksum &&
+                            test(share->calc_checksum), checksum))
+	  {
+	    _ma_check_print_info(param,"Found wrong packed record at %s",
+				llstr(sort_param->start_recpos,llbuff));
+	    goto try_next;
+	  }
+	}
+	if (sort_param->calc_checksum)
+	  param->glob_crc+= checksum;
+	DBUG_RETURN(0);
+      }
+      if (!searching)
+        _ma_check_print_info(param,"Key %d - Found wrong stored record at %s",
+                            sort_param->key+1,
+                            llstr(sort_param->start_recpos,llbuff));
+    try_next:
+      pos=(sort_param->start_recpos+=MARIA_DYN_ALIGN_SIZE);
+      searching=1;
+    }
+  }
+  case COMPRESSED_RECORD:
+    for (searching=0 ;; searching=1, sort_param->pos++)
+    {
+      if (_ma_read_cache(&sort_param->read_cache, block_info.header,
+			 sort_param->pos,
+			 share->pack.ref_length,READING_NEXT))
+	DBUG_RETURN(-1);
+      if (searching && ! sort_param->fix_datafile)
+      {
+	param->error_printed=1;
+        param->retry_repair=1;
+        param->testflag|=T_RETRY_WITHOUT_QUICK;
+	DBUG_RETURN(1);		/* Something wrong with data */
+      }
+      sort_param->start_recpos=sort_param->pos;
+      if (_ma_pack_get_block_info(info, &sort_param->bit_buff, &block_info,
+                                  &sort_param->rec_buff,
+                                  &sort_param->rec_buff_size, -1,
+                                  sort_param->pos))
+	DBUG_RETURN(-1);
+      if (!block_info.rec_len &&
+	  sort_param->pos + MEMMAP_EXTRA_MARGIN ==
+	  sort_param->read_cache.end_of_file)
+	DBUG_RETURN(-1);
+      if (block_info.rec_len < (uint) share->min_pack_length ||
+	  block_info.rec_len > (uint) share->max_pack_length)
+      {
+	if (! searching)
+	  _ma_check_print_info(param,
+                               "Found block with wrong recordlength: %lu "
+                               "at %s\n",
+                               block_info.rec_len,
+                               llstr(sort_param->pos,llbuff));
+	continue;
+      }
+      if (_ma_read_cache(&sort_param->read_cache, sort_param->rec_buff,
+			 block_info.filepos, block_info.rec_len,
+			 READING_NEXT))
+      {
+	if (! searching)
+	  _ma_check_print_info(param,"Couldn't read whole record from %s",
+			      llstr(sort_param->pos,llbuff));
+	continue;
+      }
+#ifdef HAVE_valgrind
+      bzero(sort_param->rec_buff + block_info.rec_len,
+            share->base.extra_rec_buff_size);
+#endif
+      if (_ma_pack_rec_unpack(info, &sort_param->bit_buff, sort_param->record,
+                              sort_param->rec_buff, block_info.rec_len))
+      {
+	if (! searching)
+	  _ma_check_print_info(param,"Found wrong record at %s",
+			      llstr(sort_param->pos,llbuff));
+	continue;
+      }
+      if (!sort_param->fix_datafile)
+      {
+	sort_param->current_filepos= sort_param->pos;
+        if (sort_param->master)
+	  share->state.split++;
+      }
+      sort_param->max_pos= (sort_param->pos=block_info.filepos+
+                            block_info.rec_len);
+      info->packed_length=block_info.rec_len;
+
+      if (sort_param->calc_checksum)
+      {
+        info->cur_row.checksum= (*share->calc_check_checksum)(info,
+                                                                sort_param->
+                                                                record);
+	param->glob_crc+= info->cur_row.checksum;
+      }
+      DBUG_RETURN(0);
+    }
+  }
+  DBUG_RETURN(1);		/* Impossible */
+}
+
+
+/**
+   @brief Write record to new file.
+
+   @fn    _ma_sort_write_record()
+   @param sort_param                Sort parameters.
+
+   @details
+   If sort_param->fix_datafile is set, the row is appended to the new
+   data file (sort_info->new_info) in the row format selected by
+   sort_info->new_data_file_type, and sort_param->filepos is updated.
+   BLOCK_RECORD, STATIC_RECORD and DYNAMIC_RECORD rows are built from
+   sort_param->record; COMPRESSED_RECORD rows are copied from
+   sort_param->rec_buff using the length in info->packed_length.
+   If fix_datafile is not set, nothing is written.
+   If sort_param->master is set, share->state.state.records is
+   incremented and, when T_WRITE_LOOP is set, the counter is printed to
+   stdout every WRITE_COUNT records.
+
+   @note
+   This is only called by a master thread if parallel repair is used.
+
+   @return
+   @retval  0   OK
+                sort_param->current_filepos points to inserted record for
+                block_records and to the place for the next record for
+                other row types.
+                sort_param->filepos points to end of file
+  @retval   1   Error
+*/
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param)
+{
+  int flag;
+  uint length;
+  ulong block_length,reclength;
+  uchar *from;
+  uchar block_buff[8];
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param= sort_info->param;
+  MARIA_HA *info= sort_info->new_info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_sort_write_record");
+
+  if (sort_param->fix_datafile)
+  {
+    sort_param->current_filepos= sort_param->filepos;
+    switch (sort_info->new_data_file_type) {
+    case BLOCK_RECORD:
+      if ((sort_param->current_filepos=
+           (*share->write_record_init)(info, sort_param->record)) ==
+          HA_OFFSET_ERROR)
+        DBUG_RETURN(1);
+      /* Pointer to end of file */
+      sort_param->filepos= share->state.state.data_file_length;
+      break;
+    case STATIC_RECORD:
+      if (my_b_write(&info->rec_cache,sort_param->record,
+		     share->base.pack_reclength))
+      {
+	_ma_check_print_error(param,"%d when writing to datafile",my_errno);
+	DBUG_RETURN(1);
+      }
+      sort_param->filepos+=share->base.pack_reclength;
+      share->state.split++;
+      break;
+    case DYNAMIC_RECORD:
+      if (! info->blobs)
+	from=sort_param->rec_buff;
+      else
+      {
+	/*
+	  must be sure that local buffer is big enough
+
+	  sort_info->buff is grown to hold the packed record, all blob
+	  data and the header/split overhead added below.
+	  NOTE(review): reclength is ulong but is cast to uint for
+	  my_realloc(); confirm that the total can never exceed the
+	  range of uint.
+	*/
+	reclength=share->base.pack_reclength+
+	  _ma_calc_total_blob_length(info,sort_param->record)+
+	  ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+	  MARIA_DYN_DELETE_BLOCK_HEADER;
+	if (sort_info->buff_length < reclength)
+	{
+	  if (!(sort_info->buff=my_realloc(sort_info->buff, (uint) reclength,
+					   MYF(MY_FREE_ON_ERROR |
+					       MY_ALLOW_ZERO_PTR))))
+	    DBUG_RETURN(1);
+	  sort_info->buff_length=reclength;
+	}
+	from= (uchar *) sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER);
+      }
+      /*
+        We can use info->cur_row.checksum here as only one thread calls
+        this
+      */
+      info->cur_row.checksum= (*share->calc_check_checksum)(info,
+                                                              sort_param->
+                                                              record);
+      reclength= _ma_rec_pack(info,from,sort_param->record);
+      flag=0;
+      /*
+        Write the packed row as one or more blocks; the loop runs until
+        reclength is 0.  _ma_write_part_record() is given &from and
+        &reclength, presumably to advance them past the part written
+        (confirm against its definition).
+      */
+      do
+      {
+	block_length=reclength+ 3 + test(reclength >= (65520-3));
+	if (block_length < share->base.min_block_length)
+	  block_length=share->base.min_block_length;
+	info->update|=HA_STATE_WRITE_AT_END;
+	block_length=MY_ALIGN(block_length,MARIA_DYN_ALIGN_SIZE);
+	if (block_length > MARIA_MAX_BLOCK_LENGTH)
+	  block_length=MARIA_MAX_BLOCK_LENGTH;
+	if (_ma_write_part_record(info,0L,block_length,
+				  sort_param->filepos+block_length,
+				  &from,&reclength,&flag))
+	{
+	  _ma_check_print_error(param,"%d when writing to datafile",my_errno);
+	  DBUG_RETURN(1);
+	}
+	sort_param->filepos+=block_length;
+	share->state.split++;
+      } while (reclength);
+      break;
+    case COMPRESSED_RECORD:
+      reclength=info->packed_length;
+      length= _ma_save_pack_length((uint) share->pack.version, block_buff,
+                               reclength);
+      if (share->base.blobs)
+	length+= _ma_save_pack_length((uint) share->pack.version,
+	                          block_buff + length, info->blob_length);
+      if (my_b_write(&info->rec_cache,block_buff,length) ||
+	  my_b_write(&info->rec_cache, sort_param->rec_buff, reclength))
+      {
+	_ma_check_print_error(param,"%d when writing to datafile",my_errno);
+	DBUG_RETURN(1);
+      }
+      sort_param->filepos+=reclength+length;
+      share->state.split++;
+      break;
+    }
+  }
+  if (sort_param->master)
+  {
+    share->state.state.records++;
+    if ((param->testflag & T_WRITE_LOOP) &&
+        (share->state.state.records % WRITE_COUNT) == 0)
+    { /* progress output: row count followed by '\r' */
+      char llbuff[22];
+      printf("%s\r", llstr(share->state.state.records,llbuff));
+      VOID(fflush(stdout));
+    }
+  }
+  DBUG_RETURN(0);
+} /* _ma_sort_write_record */
+
+
+/*
+  Ordering callback for the keys handled by _ma_create_index_by_sort.
+
+  'a' and 'b' each point at a pointer to key data.  The two keys are
+  compared with ha_key_cmp() over sort_param->seg, using USE_WHOLE_KEY
+  and SEARCH_SAME, and the result of ha_key_cmp() is returned unchanged.
+*/
+
+static int sort_key_cmp(MARIA_SORT_PARAM *sort_param, const void *a,
+			const void *b)
+{
+  uint diff_pos_unused[2];              /* output of ha_key_cmp, ignored */
+  uchar *const *left_key= (uchar* const *) a;
+  uchar *const *right_key= (uchar* const *) b;
+
+  return ha_key_cmp(sort_param->seg, *left_key, *right_key,
+                    USE_WHOLE_KEY, SEARCH_SAME, diff_pos_unused);
+} /* sort_key_cmp */
+
+/*
+  Write one key to the index that is being built from sorted keys.
+
+  If a previous key exists (sort_info->key_block->inited), the new key
+  'a' is compared with key_block->lastkey and sort_param->unique[] is
+  updated from diff_pos[0].  How diff_pos[0] is obtained depends on
+  param->stats_method.
+
+  If the index is HA_NOSAME and the key equals the previous one, the
+  duplicate is counted in sort_info->dupp, a warning is printed,
+  T_RETRY_WITHOUT_QUICK is set and the row is removed through
+  sort_delete_record(), whose result is returned.
+
+  Otherwise the key is handed to sort_insert_key() and its result is
+  returned.  In builds without DBUG_OFF, keys arriving out of order
+  (cmp > 0) give an error and a return value of 1.
+*/
+static int sort_key_write(MARIA_SORT_PARAM *sort_param, const uchar *a)
+{
+  uint diff_pos[2];
+  char llbuff[22],llbuff2[22];
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param= sort_info->param;
+  int cmp;
+
+  if (sort_info->key_block->inited)
+  { /* not the first key: compare with key_block->lastkey */
+    cmp= ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey,
+                    a, USE_WHOLE_KEY,
+                    SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT,
+                    diff_pos);
+    if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
+      ha_key_cmp(sort_param->seg, sort_info->key_block->lastkey,
+                 a, USE_WHOLE_KEY,
+                 SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos); /*
+                 return value ignored; the call only refreshes diff_pos,
+                 cmp keeps the result of the first comparison */
+    else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+    {
+      diff_pos[0]= maria_collect_stats_nonulls_next(sort_param->seg,
+                                                 sort_param->notnull,
+                                                 sort_info->key_block->lastkey,
+                                                 a);
+    }
+    sort_param->unique[diff_pos[0]-1]++; /* presumably diff_pos[0] is the
+                                            1-based number of the first
+                                            differing key part - confirm
+                                            against ha_key_cmp() */
+  }
+  else
+  { /* first key for this index: nothing to compare with */
+    cmp= -1;
+    if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
+      maria_collect_stats_nonulls_first(sort_param->seg, sort_param->notnull,
+                                        a);
+  }
+  if ((sort_param->keyinfo->flag & HA_NOSAME) && cmp == 0)
+  { /* duplicate in a unique index: report it and delete the row */
+    sort_info->dupp++;
+    sort_info->info->cur_row.lastpos= get_record_for_key(sort_param->keyinfo,
+                                                         a);
+    _ma_check_print_warning(param,
+			   "Duplicate key %2u for record at %10s against "
+                            "record at %10s",
+                            sort_param->key + 1,
+                            llstr(sort_info->info->cur_row.lastpos, llbuff),
+                            llstr(get_record_for_key(sort_param->keyinfo,
+                                                     sort_info->key_block->
+                                                     lastkey),
+                                  llbuff2));
+    param->testflag|=T_RETRY_WITHOUT_QUICK;
+    if (sort_info->param->testflag & T_VERBOSE)
+      _ma_print_keydata(stdout,sort_param->seg, a, USE_WHOLE_KEY);
+    return (sort_delete_record(sort_param));
+  }
+#ifndef DBUG_OFF
+  if (cmp > 0)
+  {
+    _ma_check_print_error(param,
+			 "Internal error: Keys are not in order from sort");
+    return(1);
+  }
+#endif
+  return (sort_insert_key(sort_param, sort_info->key_block,
+			  a, HA_OFFSET_ERROR));
+} /* sort_key_write */
+
+/*
+  Flush the full-text sort buffer (sort_info->ft_buf) to the index.
+
+  Two cases, selected by ft_buf->buf:
+  - buf != 0: the values collected for the current key are still in the
+    buffer.  lastkey is inserted once for each of them, with the value
+    part of lastkey (at offset val_off) overwritten by the next buffered
+    value before each further insert.
+  - buf == 0: the values were already put in a second-level tree.  Its
+    pending blocks are flushed, lastkey gets -count and the root of that
+    tree as value, the first-level tree state is restored in
+    sort_info/sort_param and lastkey is inserted there.
+
+  Returns 0 on success, otherwise the error from sort_insert_key() or
+  _ma_flush_pending_blocks().
+*/
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param)
+{
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  SORT_KEY_BLOCKS *key_block=sort_info->key_block;
+  MARIA_SHARE *share=sort_info->info->s;
+  uint val_off, val_len;
+  int error;
+  SORT_FT_BUF *maria_ft_buf=sort_info->ft_buf;
+  uchar *from, *to;
+
+  val_len=share->ft2_keyinfo.keylength;
+  get_key_full_length_rdonly(val_off, maria_ft_buf->lastkey);
+  to= maria_ft_buf->lastkey+val_off;
+
+  if (maria_ft_buf->buf)
+  {
+    /*
+      flushing first-level tree
+      'to' is the value slot inside lastkey; 'from' walks the values
+      stored after it, up to maria_ft_buf->buf
+    */
+    error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+                           HA_OFFSET_ERROR);
+    for (from=to+val_len;
+         !error && from < maria_ft_buf->buf;
+         from+= val_len)
+    {
+      memcpy(to, from, val_len);
+      error= sort_insert_key(sort_param,key_block,maria_ft_buf->lastkey,
+                             HA_OFFSET_ERROR);
+    }
+    return error;
+  }
+  /* flushing second-level tree keyblocks */
+  error=_ma_flush_pending_blocks(sort_param);
+  /* updating lastkey with second-level tree info */
+  ft_intXstore(maria_ft_buf->lastkey+val_off, -maria_ft_buf->count);
+  _ma_dpointer(sort_info->info->s, maria_ft_buf->lastkey+val_off+HA_FT_WLEN,
+      share->state.key_root[sort_param->key]);
+  /* restoring first level tree data in sort_info/sort_param */
+  sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks;
+  sort_param->keyinfo=share->keyinfo+sort_param->key;
+  share->state.key_root[sort_param->key]=HA_OFFSET_ERROR;
+  /*
+    writing lastkey in first-level tree
+    (skipped when flushing the pending blocks failed; that error is
+    returned instead)
+  */
+  return error ? error :
+                 sort_insert_key(sort_param,sort_info->key_block,
+                                 maria_ft_buf->lastkey,HA_OFFSET_ERROR);
+}
+
+/*
+  Write one full-text key, collecting keys with equal text in a buffer.
+
+  'a' holds the key text (length a_len) followed by a value of val_len
+  bytes (HA_FT_WLEN + rec_reflength).
+
+  - First call (no sort_info->ft_buf yet): try to allocate the buffer.
+    If it is not allocated, sort_param->key_write is switched to
+    sort_key_write() and the key is passed on to it.
+  - Key text equal to ft_buf->lastkey: the value is appended to the
+    buffer, or inserted directly in the second-level tree when one is
+    already in use (ft_buf->buf == 0).  When the buffer reaches
+    ft_buf->end, its values are moved to a second-level tree.
+  - Different key text: the buffer is flushed with
+    _ma_sort_ft_buf_flush() and restarted with the new key.
+
+  Returns 0 on success, otherwise the error from the called function.
+*/
+static int sort_maria_ft_key_write(MARIA_SORT_PARAM *sort_param,
+                                   const uchar *a)
+{
+  uint a_len, val_off, val_len, error;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  SORT_FT_BUF *ft_buf= sort_info->ft_buf;
+  SORT_KEY_BLOCKS *key_block= sort_info->key_block;
+  MARIA_SHARE *share= sort_info->info->s;
+
+  val_len=HA_FT_WLEN+share->base.rec_reflength;
+  get_key_full_length_rdonly(a_len, a);
+
+  if (!ft_buf)
+  {
+    /*
+      use two-level tree only if key_reflength fits in rec_reflength place
+      and row format is NOT static - for _ma_dpointer not to garble offsets
+     */
+    if ((share->base.key_reflength <=
+         share->base.rec_reflength) &&
+        (share->options &
+          (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)))
+      ft_buf= (SORT_FT_BUF *)my_malloc(sort_param->keyinfo->block_length +
+                                       sizeof(SORT_FT_BUF), MYF(MY_WME));
+
+    if (!ft_buf)
+    { /* no buffer (not wanted or out of memory): use plain key writes */
+      sort_param->key_write=sort_key_write;
+      return sort_key_write(sort_param, a);
+    }
+    sort_info->ft_buf= ft_buf;
+    goto word_init_ft_buf;              /* no need to duplicate the code */
+  }
+  get_key_full_length_rdonly(val_off, ft_buf->lastkey);
+  /* Is the text of 'a' equal to the text of the buffered key? */
+  if (ha_compare_text(sort_param->seg->charset,
+                      a+1,a_len-1,
+                      ft_buf->lastkey+1,val_off-1, 0, 0)==0)
+  {
+    uchar *p;
+    if (!ft_buf->buf)                   /* store in second-level tree */
+    {
+      ft_buf->count++;
+      return sort_insert_key(sort_param,key_block,
+                             a + a_len, HA_OFFSET_ERROR);
+    }
+
+    /* storing the key in the buffer. */
+    memcpy (ft_buf->buf, (const char *)a+a_len, val_len);
+    ft_buf->buf+=val_len;
+    if (ft_buf->buf < ft_buf->end)
+      return 0;
+
+    /* converting to two-level tree */
+    p=ft_buf->lastkey+val_off;
+    /* use the first key block that is not yet in use */
+    while (key_block->inited)
+      key_block++;
+    sort_info->key_block=key_block;
+    sort_param->keyinfo= &share->ft2_keyinfo;
+    ft_buf->count=(ft_buf->buf - p)/val_len;
+
+    /* flushing buffer to second-level tree */
+    for (error=0; !error && p < ft_buf->buf; p+= val_len)
+      error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR);
+    ft_buf->buf=0;
+    return error;
+  }
+
+  /* flushing buffer */
+  if ((error=_ma_sort_ft_buf_flush(sort_param)))
+    return error;
+
+word_init_ft_buf:
+  a_len+=val_len;
+  memcpy(ft_buf->lastkey, a, a_len);
+  ft_buf->buf=ft_buf->lastkey+a_len;
+  /*
+    32 is just a safety margin here
+    (at least max(val_len, sizeof(nod_flag)) should be there).
+    May be better performance could be achieved if we'd put
+      (sort_info->keyinfo->block_length-32)/XXX
+      instead.
+        TODO: benchmark the best value for XXX.
+  */
+  ft_buf->end= ft_buf->lastkey+ (sort_param->keyinfo->block_length-32);
+  return 0;
+} /* sort_maria_ft_key_write */
+
+
+/*
+  Return the row position stored in a key.
+
+  key_data is wrapped in a temporary MARIA_KEY whose data_length comes
+  from _ma_keylength(); _ma_row_pos_from_key() then extracts the
+  position.
+*/
+
+static my_off_t get_record_for_key(MARIA_KEYDEF *keyinfo,
+				   const uchar *key_data)
+{
+  MARIA_KEY lookup_key;
+
+  lookup_key.data= (uchar*) key_data;
+  lookup_key.keyinfo= keyinfo;
+  lookup_key.data_length= _ma_keylength(keyinfo, key_data);
+
+  return _ma_row_pos_from_key(&lookup_key);
+} /* get_record_for_key */
+
+
+/*
+  Insert a key in sort-key-blocks
+
+  sort_param   Sort parameters; sort_param->keyinfo describes the key.
+  key_block    Block (tree level) to insert into.  Every block except
+               sort_info->key_block is treated as a node block and gets
+               a page pointer (nod_flag = key_reflength).
+  key          Key to insert.
+  prev_block   Page address stored in front of the key in node blocks.
+
+  The key is packed and appended to the page image in key_block->buff.
+  If the page still fits in share->max_index_block_size, the key is
+  remembered in key_block->lastkey and 0 is returned.  Otherwise the
+  page image without the new key is written to a position obtained from
+  _ma_new(), key_block->lastkey is inserted in the next block
+  (key_block+1) together with that position, and the new key is
+  inserted again into the now re-initialized block.
+
+  Returns 0 on success, 1 on error.
+*/
+
+static int sort_insert_key(MARIA_SORT_PARAM *sort_param,
+			   register SORT_KEY_BLOCKS *key_block,
+                           const uchar *key,
+			   my_off_t prev_block)
+{
+  uint a_length,t_length,nod_flag;
+  my_off_t filepos,key_file_length;
+  uchar *anc_buff,*lastkey;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEY tmp_key;
+  MARIA_HA *info= sort_info->info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("sort_insert_key");
+
+  anc_buff= key_block->buff;
+  lastkey=key_block->lastkey;
+  nod_flag= (key_block == sort_info->key_block ? 0 :
+	     share->base.key_reflength);
+  /*
+    First key for this block: start an empty page image.
+    NOTE(review): inited is set to 1 before key_block is compared with
+    sort_info->key_block_end; confirm that touching the key_block_end
+    element is safe.
+  */
+  if (!key_block->inited)
+  {
+    key_block->inited=1;
+    if (key_block == sort_info->key_block_end)
+    {
+      _ma_check_print_error(param,
+                            "To many key-block-levels; "
+                            "Try increasing sort_key_blocks");
+      DBUG_RETURN(1);
+    }
+    a_length= share->keypage_header + nod_flag;
+    key_block->end_pos= anc_buff + share->keypage_header;
+    bzero(anc_buff, share->keypage_header);
+    _ma_store_keynr(share, anc_buff, (uint) (sort_param->keyinfo -
+                                            share->keyinfo));
+    lastkey=0;					/* No previous key in block */
+  }
+  else
+    a_length= _ma_get_page_used(share, anc_buff);
+
+	/* Save pointer to previous block */
+  if (nod_flag)
+  {
+    _ma_store_keypage_flag(share, anc_buff, KEYPAGE_FLAG_ISNOD);
+    _ma_kpointer(info,key_block->end_pos,prev_block);
+  }
+  /* Describe the key: data_length excludes the trailing row reference */
+  tmp_key.keyinfo= keyinfo;
+  tmp_key.data= (uchar*) key;
+  tmp_key.data_length= _ma_keylength(keyinfo, key) - share->base.rec_reflength;
+  tmp_key.ref_length=  share->base.rec_reflength;
+  /* Pack the key relative to lastkey and store it after the page pointer */
+  t_length= (*keyinfo->pack_key)(&tmp_key, nod_flag,
+                                 (uchar*) 0, lastkey, lastkey, &s_temp);
+  (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp);
+  a_length+=t_length;
+  _ma_store_page_used(share, anc_buff, a_length);
+  key_block->end_pos+=t_length;
+  if (a_length <= share->max_index_block_size)
+  { /* key fits: remember it as lastkey and the page length before it */
+    MARIA_KEY tmp_key2;
+    tmp_key2.data= key_block->lastkey;
+    _ma_copy_key(&tmp_key2, &tmp_key);
+    key_block->last_length=a_length-t_length;
+    DBUG_RETURN(0);
+  }
+
+  /*
+    Fill block with end-zero and write filled block
+    The used length is set back to last_length, i.e. the page is written
+    without the key that did not fit.
+  */
+  _ma_store_page_used(share, anc_buff, key_block->last_length);
+  bzero(anc_buff+key_block->last_length,
+	keyinfo->block_length- key_block->last_length);
+  key_file_length=share->state.state.key_file_length; /* NOTE(review):
+                                  this value is never read in this
+                                  function */
+  if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR)
+    DBUG_RETURN(1);
+  _ma_fast_unlock_key_del(info);
+
+  /* If we read the page from the key cache, we have to write it back to it */
+  if (page_link->changed)
+  {
+    MARIA_PAGE page;
+    pop_dynamic(&info->pinned_pages);
+    _ma_page_setup(&page, info, keyinfo, filepos, anc_buff);
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK, DFLT_INIT_HITS))
+      DBUG_RETURN(1);
+  }
+  else
+  { /* otherwise add the CRC and write the page directly to the key file */
+    put_crc(anc_buff, filepos, share);
+    if (my_pwrite(share->kfile.file, anc_buff,
+                  (uint) keyinfo->block_length, filepos, param->myf_rw))
+      DBUG_RETURN(1);
+  }
+  DBUG_DUMP("buff", anc_buff, _ma_get_page_used(share, anc_buff));
+
+	/* Write separator-key to block in next level */
+  if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos))
+    DBUG_RETURN(1);
+
+	/* clear old block and write new key in it */
+  key_block->inited=0;
+  DBUG_RETURN(sort_insert_key(sort_param, key_block,key,prev_block));
+} /* sort_insert_key */
+
+
+/*
+  Delete record when we found a duplicated key
+
+  The row to remove is the one at sort_info->info->cur_row.lastpos.
+  It is read through sort_info->new_info, its keys for the indexes
+  0 .. sort_info->current_key-1 are deleted with _ma_ck_delete(), the
+  row itself is deleted and the record count is decremented.
+
+  The function refuses to work (error message, return 1) when T_QUICK
+  is set without T_FORCE_UNIQUENESS, and for tables with
+  HA_OPTION_COMPRESS_RECORD.
+
+  While the row is processed, row_info->dfile.file is temporarily
+  replaced by row_info->rec_cache.file.  The original descriptor is
+  restored on every exit path taken after the replacement.
+
+  Returns 0 on success, != 0 on error.
+*/
+
+static int sort_delete_record(MARIA_SORT_PARAM *sort_param)
+{
+  uint i;
+  int old_file,error;
+  uchar *key;
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_HA *row_info= sort_info->new_info, *key_info= sort_info->info;
+  DBUG_ENTER("sort_delete_record");
+
+  if ((param->testflag & (T_FORCE_UNIQUENESS|T_QUICK)) == T_QUICK)
+  {
+    _ma_check_print_error(param,
+                          "Quick-recover aborted; Run recovery without switch "
+                          "-q or with switch -qq");
+    DBUG_RETURN(1);
+  }
+  if (key_info->s->options & HA_OPTION_COMPRESS_RECORD)
+  {
+    _ma_check_print_error(param,
+                          "Recover aborted; Can't run standard recovery on "
+                          "compressed tables with errors in data-file. "
+                          "Use 'aria_chk --safe-recover' to fix it");
+    DBUG_RETURN(1);
+  }
+
+  old_file= row_info->dfile.file;
+  /* This only affects static and dynamic row formats */
+  row_info->dfile.file= row_info->rec_cache.file;
+  if (flush_io_cache(&row_info->rec_cache))
+  {
+    /*
+      Restore the real data file descriptor here too, as all the other
+      error exits below do; otherwise dfile.file would keep pointing at
+      the cache file after a failed flush.
+    */
+    row_info->dfile.file= old_file;
+    DBUG_RETURN(1);
+  }
+
+  /* Build the keys in the second half of the lastkey buffer */
+  key= key_info->lastkey_buff + key_info->s->base.max_key_length;
+  if ((error=(*row_info->s->read_record)(row_info, sort_param->record,
+                                         key_info->cur_row.lastpos)) &&
+      error != HA_ERR_RECORD_DELETED)
+  {
+    _ma_check_print_error(param,"Can't read record to be removed");
+    row_info->dfile.file= old_file;
+    DBUG_RETURN(1);
+  }
+  row_info->cur_row.lastpos= key_info->cur_row.lastpos;
+
+  /* Remove the keys already written for the indexes before current_key */
+  for (i=0 ; i < sort_info->current_key ; i++)
+  {
+    MARIA_KEY tmp_key;
+    (*key_info->s->keyinfo[i].make_key)(key_info, &tmp_key, i, key,
+                                        sort_param->record,
+                                        key_info->cur_row.lastpos, 0);
+    if (_ma_ck_delete(key_info, &tmp_key))
+    {
+      _ma_check_print_error(param,
+                            "Can't delete key %d from record to be removed",
+                            i+1);
+      row_info->dfile.file= old_file;
+      DBUG_RETURN(1);
+    }
+  }
+  /* The row no longer counts towards the global checksum */
+  if (sort_param->calc_checksum)
+    param->glob_crc-=(*key_info->s->calc_check_checksum)(key_info,
+                                                         sort_param->record);
+  error= (*row_info->s->delete_record)(row_info, sort_param->record);
+  if (error)
+    _ma_check_print_error(param,"Got error %d when deleting record",
+                          my_errno);
+  row_info->dfile.file= old_file;           /* restore actual value */
+  row_info->s->state.state.records--;
+  DBUG_RETURN(error);
+} /* sort_delete_record */
+
+
+/*
+  Fix all pending blocks and flush everything to disk
+
+  Walks the key blocks from sort_info->key_block for as long as they
+  are marked inited.  Each block is marked unused, its page image is
+  zero-filled after the used length and written to a position obtained
+  from _ma_new().  From the second block on, the position of the page
+  written just before is stored at key_block->end_pos as page pointer.
+  The position of the last page written becomes
+  state.key_root[sort_param->key]; it is HA_OFFSET_ERROR if no block
+  was in use.
+
+  Returns 0 on success, 1 on error.
+*/
+
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param)
+{
+  uint nod_flag,length;
+  my_off_t filepos,key_file_length;
+  SORT_KEY_BLOCKS *key_block;
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  myf myf_rw=sort_info->param->myf_rw;
+  MARIA_HA *info=sort_info->info;
+  MARIA_KEYDEF *keyinfo=sort_param->keyinfo;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  DBUG_ENTER("_ma_flush_pending_blocks");
+
+  filepos= HA_OFFSET_ERROR;			/* if empty file */
+  nod_flag=0;
+  for (key_block=sort_info->key_block ; key_block->inited ; key_block++)
+  {
+    key_block->inited=0;
+    length= _ma_get_page_used(info->s, key_block->buff);
+    if (nod_flag)
+      _ma_kpointer(info,key_block->end_pos,filepos);
+    key_file_length= info->s->state.state.key_file_length; /* NOTE(review):
+                                  this value is never read in this
+                                  function */
+    bzero(key_block->buff+length, keyinfo->block_length-length);
+    if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+        HA_OFFSET_ERROR)
+      goto err;
+
+    /* If we read the page from the key cache, we have to write it back */
+    if (page_link->changed)
+    {
+      MARIA_PAGE page;
+      pop_dynamic(&info->pinned_pages);
+
+      _ma_page_setup(&page, info, keyinfo, filepos, key_block->buff);
+      if (_ma_write_keypage(&page, PAGECACHE_LOCK_WRITE_UNLOCK,
+                            DFLT_INIT_HITS))
+	goto err;
+    }
+    else
+    { /* otherwise add the CRC and write the page directly to the key file */
+      put_crc(key_block->buff, filepos, info->s);
+      if (my_pwrite(info->s->kfile.file, key_block->buff,
+                    (uint) keyinfo->block_length,filepos, myf_rw))
+        goto err;
+    }
+    DBUG_DUMP("buff",key_block->buff,length);
+    nod_flag=1; /* every following block gets a pointer to this page */
+  }
+  info->s->state.key_root[sort_param->key]=filepos; /* Last is root for tree */
+  _ma_fast_unlock_key_del(info);
+  DBUG_RETURN(0);
+
+err:
+  _ma_fast_unlock_key_del(info);
+  DBUG_RETURN(1);
+} /* _ma_flush_pending_blocks */
+
+/*
+  Alloc space and pointers for key_blocks
+
+  A single my_malloc() area holds an array of 'blocks' SORT_KEY_BLOCKS
+  followed by 'blocks' buffers of buffer_length+IO_SIZE bytes each.
+  Every array element is marked as not inited and gets its 'buff'
+  pointed at its own buffer.
+
+  Returns the array, or 0 if the allocation failed (an error message
+  has then been printed through param).
+*/
+
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
+                                         uint buffer_length)
+{
+  reg1 uint i;
+  SORT_KEY_BLOCKS *block;
+  DBUG_ENTER("alloc_key_blocks");
+
+  if (!(block= (SORT_KEY_BLOCKS*) my_malloc((sizeof(SORT_KEY_BLOCKS)+
+                                             buffer_length+IO_SIZE)*blocks,
+                                            MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for sort-key-blocks");
+    /* Leave through DBUG_RETURN so that DBUG_ENTER above stays balanced */
+    DBUG_RETURN(0);
+  }
+  for (i=0 ; i < blocks ; i++)
+  {
+    block[i].inited=0;
+    /* the buffers start right after the array of block headers */
+    block[i].buff= (uchar*) (block+blocks)+(buffer_length+IO_SIZE)*i;
+  }
+  DBUG_RETURN(block);
+} /* alloc_key_blocks */
+
+
+/*
+  Check if file is almost full
+
+  Compressed tables (HA_OPTION_COMPRESS_RECORD) always give 0.
+  Otherwise the result is 1 when 9/10 of the current end position of
+  the key file exceeds base.max_key_file_length, or when 9/10 of the
+  current end position of the data file exceeds
+  base.max_data_file_length; else 0.  The data file is only examined
+  when the key file test did not already give 1.
+*/
+
+int maria_test_if_almost_full(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t key_file_end, data_file_end;
+
+  if (share->options & HA_OPTION_COMPRESS_RECORD)
+    return 0;
+
+  key_file_end= my_seek(share->kfile.file, 0L, MY_SEEK_END,
+                        MYF(MY_THREADSAFE));
+  if (key_file_end / 10 * 9 > (my_off_t) share->base.max_key_file_length)
+    return 1;
+
+  data_file_end= my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0));
+  return data_file_end / 10 * 9 > (my_off_t) share->base.max_data_file_length;
+}
+
+
+/* Recreate the table with more space allocated for record data */
+
+int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename)
+{
+  int error;
+  MARIA_HA info;
+  MARIA_SHARE share;
+  MARIA_KEYDEF *keyinfo,*key,*key_end;
+  HA_KEYSEG *keysegs,*keyseg;
+  MARIA_COLUMNDEF *columndef,*column,*end;
+  MARIA_UNIQUEDEF *uniquedef,*u_ptr,*u_end;
+  MARIA_STATUS_INFO status_info;
+  uint unpack,key_parts;
+  ha_rows max_records;
+  ulonglong file_length,tmp_length;
+  MARIA_CREATE_INFO create_info;
+  DBUG_ENTER("maria_recreate_table");
+
+  error=1;					/* Default error */
+  info= **org_info;
+  status_info= (*org_info)->state[0];
+  info.state= &status_info;
+  share= *(*org_info)->s;
+  unpack= ((share.data_file_type == COMPRESSED_RECORD) &&
+           (param->testflag & T_UNPACK));
+  if (!(keyinfo=(MARIA_KEYDEF*) my_alloca(sizeof(MARIA_KEYDEF) *
+                                          share.base.keys)))
+    DBUG_RETURN(0);
+  memcpy((uchar*) keyinfo,(uchar*) share.keyinfo,
+	 (size_t) (sizeof(MARIA_KEYDEF)*share.base.keys));
+
+  key_parts= share.base.all_key_parts;
+  if (!(keysegs=(HA_KEYSEG*) my_alloca(sizeof(HA_KEYSEG)*
+				       (key_parts+share.base.keys))))
+  {
+    my_afree(keyinfo);
+    DBUG_RETURN(1);
+  }
+  if (!(columndef=(MARIA_COLUMNDEF*)
+	my_alloca(sizeof(MARIA_COLUMNDEF)*(share.base.fields+1))))
+  {
+    my_afree(keyinfo);
+    my_afree(keysegs);
+    DBUG_RETURN(1);
+  }
+  if (!(uniquedef=(MARIA_UNIQUEDEF*)
+	my_alloca(sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques+1))))
+  {
+    my_afree(columndef);
+    my_afree(keyinfo);
+    my_afree(keysegs);
+    DBUG_RETURN(1);
+  }
+
+  /* Copy the column definitions in their original order */
+  for (column= share.columndef, end= share.columndef+share.base.fields;
+       column != end ;
+       column++)
+    columndef[column->column_nr]= *column;
+
+  /* Change the new key to point at the saved key segments */
+  memcpy((uchar*) keysegs,(uchar*) share.keyparts,
+	 (size_t) (sizeof(HA_KEYSEG)*(key_parts+share.base.keys+
+				      share.state.header.uniques)));
+  keyseg=keysegs;
+  for (key=keyinfo,key_end=keyinfo+share.base.keys; key != key_end ; key++)
+  {
+    key->seg=keyseg;
+    for (; keyseg->type ; keyseg++)
+    {
+      if (param->language)
+	keyseg->language=param->language;	/* change language */
+    }
+    keyseg++;					/* Skip end pointer */
+  }
+
+  /*
+    Copy the unique definitions and change them to point at the new key
+    segments
+  */
+  memcpy((uchar*) uniquedef,(uchar*) share.uniqueinfo,
+	 (size_t) (sizeof(MARIA_UNIQUEDEF)*(share.state.header.uniques)));
+  for (u_ptr=uniquedef,u_end=uniquedef+share.state.header.uniques;
+       u_ptr != u_end ; u_ptr++)
+  {
+    u_ptr->seg=keyseg;
+    keyseg+=u_ptr->keysegs+1;
+  }
+
+  file_length=(ulonglong) my_seek(info.dfile.file, 0L, MY_SEEK_END, MYF(0));
+  if (share.options & HA_OPTION_COMPRESS_RECORD)
+    share.base.records=max_records=info.state->records;
+  else if (share.base.min_pack_length)
+    max_records=(ha_rows) (file_length / share.base.min_pack_length);
+  else
+    max_records=0;
+  share.options&= ~HA_OPTION_TEMP_COMPRESS_RECORD;
+
+  tmp_length= file_length+file_length/10;
+  set_if_bigger(file_length,param->max_data_file_length);
+  set_if_bigger(file_length,tmp_length);
+  set_if_bigger(file_length,(ulonglong) share.base.max_data_file_length);
+
+  VOID(maria_close(*org_info));
+
+  bzero((char*) &create_info,sizeof(create_info));
+  create_info.max_rows=max(max_records,share.base.records);
+  create_info.reloc_rows=share.base.reloc;
+  create_info.old_options=(share.options |
+			   (unpack ? HA_OPTION_TEMP_COMPRESS_RECORD : 0));
+
+  create_info.data_file_length=file_length;
+  create_info.auto_increment=share.state.auto_increment;
+  create_info.language = (param->language ? param->language :
+			  share.state.header.language);
+  create_info.key_file_length=  status_info.key_file_length;
+  create_info.org_data_file_type= ((enum data_file_type)
+                                   share.state.header.org_data_file_type);
+
+  /*
+    Allow for creating an auto_increment key. This has an effect only if
+    an auto_increment key exists in the original table.
+  */
+  create_info.with_auto_increment= TRUE;
+  create_info.null_bytes= share.base.null_bytes;
+  create_info.transactional= share.base.born_transactional;
+
+  /*
+    We don't have to handle symlinks here because we are using
+    HA_DONT_TOUCH_DATA
+  */
+  if (maria_create(filename, share.data_file_type,
+                   share.base.keys - share.state.header.uniques,
+                   keyinfo, share.base.fields, columndef,
+                   share.state.header.uniques, uniquedef,
+                   &create_info,
+                   HA_DONT_TOUCH_DATA))
+  {
+    _ma_check_print_error(param,
+                          "Got error %d when trying to recreate indexfile",
+                          my_errno);
+    goto end;
+  }
+  *org_info= maria_open(filename,O_RDWR,
+                        (HA_OPEN_FOR_REPAIR |
+                         ((param->testflag & T_WAIT_FOREVER) ?
+                          HA_OPEN_WAIT_IF_LOCKED :
+                          (param->testflag & T_DESCRIPT) ?
+                          HA_OPEN_IGNORE_IF_LOCKED :
+                          HA_OPEN_ABORT_IF_LOCKED)));
+  if (!*org_info)
+  {
+    _ma_check_print_error(param,
+                          "Got error %d when trying to open re-created "
+                          "indexfile", my_errno);
+    goto end;
+  }
+  /* We are modifing */
+  (*org_info)->s->options&= ~HA_OPTION_READ_ONLY_DATA;
+  VOID(_ma_readinfo(*org_info,F_WRLCK,0));
+  (*org_info)->s->state.state.records= info.state->records;
+  if (share.state.create_time)
+    (*org_info)->s->state.create_time=share.state.create_time;
+#ifdef EXTERNAL_LOCKING
+  (*org_info)->s->state.unique= (*org_info)->this_unique= share.state.unique;
+#endif
+  (*org_info)->s->state.state.checksum= info.state->checksum;
+  (*org_info)->s->state.state.del= info.state->del;
+  (*org_info)->s->state.dellink= share.state.dellink;
+  (*org_info)->s->state.state.empty= info.state->empty;
+  (*org_info)->s->state.state.data_file_length= info.state->data_file_length;
+  *(*org_info)->state= (*org_info)->s->state.state;
+  if (maria_update_state_info(param,*org_info,UPDATE_TIME | UPDATE_STAT |
+                              UPDATE_OPEN_COUNT))
+    goto end;
+  error=0;
+end:
+  my_afree(uniquedef);
+  my_afree(keyinfo);
+  my_afree(columndef);
+  my_afree(keysegs);
+  DBUG_RETURN(error);
+}
+
+
+	/* write suffix to data file if needed */
+
+/*
+  For a COMPRESSED_RECORD table, and only when 'fix_datafile' is set,
+  append MEMMAP_EXTRA_MARGIN zero bytes through the record cache of
+  sort_info->new_info and advance read_cache.end_of_file by the same amount.
+
+  RETURN
+    0  ok (or nothing to do)
+    1  the write failed; the error has been reported
+*/
+
+int maria_write_data_suffix(MARIA_SORT_INFO *sort_info, my_bool fix_datafile)
+{
+  MARIA_HA *info= sort_info->new_info;
+  uchar zero_fill[MEMMAP_EXTRA_MARGIN];
+
+  if (info->s->data_file_type != COMPRESSED_RECORD || !fix_datafile)
+    return 0;
+
+  bzero(zero_fill, sizeof(zero_fill));
+  if (my_b_write(&info->rec_cache, zero_fill, sizeof(zero_fill)))
+  {
+    _ma_check_print_error(sort_info->param,
+                          "%d when writing to datafile",my_errno);
+    return 1;
+  }
+  sort_info->param->read_cache.end_of_file+= sizeof(zero_fill);
+  return 0;
+}
+
+
+/*
+  Update state and maria_chk time of indexfile
+
+  'update' is a bit mask of UPDATE_* flags that selects what is refreshed:
+  UPDATE_OPEN_COUNT resets the open counter, UPDATE_STAT copies the new
+  rec_per_key_part values from 'param', UPDATE_TIME sets the check time.
+  Any of UPDATE_STAT, UPDATE_SORT, UPDATE_TIME or UPDATE_AUTO_INC causes the
+  state to be written with _ma_state_info_write().
+
+  Returns 0 on success. On failure an error is printed through 'param' and
+  1 is returned.
+*/
+
+int maria_update_state_info(HA_CHECK *param, MARIA_HA *info,uint update)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_update_state_info");
+
+  if (update & UPDATE_OPEN_COUNT)
+  {
+    share->state.open_count=0;
+    share->global_changed=0;
+  }
+  if (update & UPDATE_STAT)
+  {
+    uint i, key_parts= mi_uint2korr(share->state.header.key_parts);
+    share->state.records_at_analyze= share->state.state.records;
+    share->state.changed&= ~STATE_NOT_ANALYZED;
+    if (share->state.state.records)
+    {
+      for (i=0; i<key_parts; i++)
+      {
+        /* A zero estimate marks the table as not analyzed again */
+        if (!(share->state.rec_per_key_part[i]=param->new_rec_per_key_part[i]))
+          share->state.changed|= STATE_NOT_ANALYZED;
+      }
+    }
+  }
+  if (update & (UPDATE_STAT | UPDATE_SORT | UPDATE_TIME | UPDATE_AUTO_INC))
+  {
+    if (update & UPDATE_TIME)
+    {
+      share->state.check_time= time((time_t*) 0);
+      if (!share->state.create_time)            /* No create time stored yet */
+	share->state.create_time= share->state.check_time;
+    }
+    if (_ma_state_info_write(share,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                             MA_STATE_INFO_WRITE_FULL_INFO))
+      goto err;
+    share->changed=0;
+  }
+  {						/* Force update of status */
+    int error;
+    uint r_locks=share->r_locks,w_locks=share->w_locks; /* saved counters */
+    share->r_locks= share->w_locks= share->tot_locks= 0;
+    error= _ma_writeinfo(info,WRITEINFO_NO_UNLOCK);
+    share->r_locks=r_locks;                     /* put the counters back */
+    share->w_locks=w_locks;
+    share->tot_locks=r_locks+w_locks;
+    if (!error)
+      DBUG_RETURN(0);
+  }
+err:
+  _ma_check_print_error(param,"%d when updating keyfile",my_errno);
+  DBUG_RETURN(1);
+}
+
+/*
+  Update auto increment value for a table
+  When setting the 'repair_only' flag we only want to change the
+  old auto_increment value if it is wrong (smaller than some given key).
+  The reason is that we shouldn't change the auto_increment value
+  for a table without good reason when only doing a repair; If the
+  user has inserted and deleted rows, the auto_increment value
+  may be bigger than the biggest current row and this is ok.
+
+  If repair_only is not set, the stored auto_increment value is also raised
+  to param->auto_increment_value when that value is bigger; if the index is
+  empty it is simply set to param->auto_increment_value.
+
+  Nothing is returned; problems are reported through 'param'.
+*/
+
+void _ma_update_auto_increment_key(HA_CHECK *param, MARIA_HA *info,
+                                   my_bool repair_only)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *record;
+  DBUG_ENTER("update_auto_increment_key");
+
+  if (!share->base.auto_key ||
+      ! maria_is_key_active(share->state.key_map, share->base.auto_key - 1))
+  {
+    if (!(param->testflag & T_VERY_SILENT))
+      _ma_check_print_info(param,
+			  "Table: %s doesn't have an auto increment key\n",
+			  param->isam_file_name);
+    DBUG_VOID_RETURN;
+  }
+  if (!(param->testflag & T_SILENT) &&
+      !(param->testflag & T_REP))
+    printf("Updating Aria file: %s\n", param->isam_file_name);
+  /*
+    We have to use an allocated buffer instead of info->rec_buff as
+    _ma_put_key_in_record() may use info->rec_buff
+  */
+  if (!(record= (uchar*) my_malloc((size_t) share->base.default_rec_buff_size,
+                                   MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for extra record");
+    DBUG_VOID_RETURN;
+  }
+
+  maria_extra(info,HA_EXTRA_KEYREAD,0);
+  if (maria_rlast(info, record, share->base.auto_key-1)) /* last key entry */
+  {
+    if (my_errno != HA_ERR_END_OF_FILE)         /* A real read error */
+    {
+      maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+      my_free((char*) record, MYF(0));
+      _ma_check_print_error(param,"%d when reading last record",my_errno);
+      DBUG_VOID_RETURN;
+    }
+    if (!repair_only)                           /* No key entries found */
+      share->state.auto_increment=param->auto_increment_value;
+  }
+  else
+  {
+    const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+    ulonglong auto_increment=
+      ma_retrieve_auto_increment(record + keyseg->start, keyseg->type);
+    set_if_bigger(share->state.auto_increment,auto_increment);
+    if (!repair_only)
+      set_if_bigger(share->state.auto_increment, param->auto_increment_value);
+  }
+  maria_extra(info,HA_EXTRA_NO_KEYREAD,0);
+  my_free((char*) record, MYF(0));
+  maria_update_state_info(param, info, UPDATE_AUTO_INC); /* result ignored */
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Update statistics for each part of an index
+
+  SYNOPSIS
+    maria_update_key_parts()
+      keyinfo           IN  Index information (only key->keysegs used)
+      rec_per_key_part  OUT Store statistics here
+      unique            IN  Array of (#distinct tuples)
+      notnull_tuples    IN  Array of (#tuples), or NULL
+      records               Number of records in the table
+
+  DESCRIPTION
+    This function is called to produce index statistics values from unique and
+    notnull_tuples arrays after these arrays were produced with sequential
+    index scan (the scan is done in two places: chk_index() and
+    sort_key_write()).
+
+    This function handles all 3 index statistics collection methods.
+
+    Unique is an array:
+      unique[0]= (#different values of {keypart1}) - 1
+      unique[1]= (#different values of {keypart1,keypart2} tuple)-unique[0]-1
+      ...
+
+    For MI_STATS_METHOD_IGNORE_NULLS method, notnull_tuples is an array too:
+      notnull_tuples[0]= (#of {keypart1} tuples such that keypart1 is not NULL)
+      notnull_tuples[1]= (#of {keypart1,keypart2} tuples such that all
+                          keypart{i} are not NULL)
+      ...
+    For all other statistics collection methods notnull_tuples==NULL.
+
+    Output is an array:
+    rec_per_key_part[k] =
+     = E(#records in the table such that keypart_1=c_1 AND ... AND
+         keypart_k=c_k for arbitrary constants c_1 ... c_k)
+
+     = {assuming that values have uniform distribution and index contains all
+        tuples from the domain (or that {c_1, ..., c_k} tuple is chosen from
+        index tuples)}
+
+     = #tuples-in-the-index / #distinct-tuples-in-the-index.
+
+    The #tuples-in-the-index and #distinct-tuples-in-the-index have different
+    meaning depending on which statistics collection method is used:
+
+    MI_STATS_METHOD_*  how are nulls compared?  which tuples are counted?
+     NULLS_EQUAL            NULL == NULL           all tuples in table
+     NULLS_NOT_EQUAL        NULL != NULL           all tuples in table
+     IGNORE_NULLS               n/a             tuples that don't have NULLs
+*/
+
+void maria_update_key_parts(MARIA_KEYDEF *keyinfo, double *rec_per_key_part,
+                      ulonglong *unique, ulonglong *notnull,
+                      ulonglong records)
+{
+  ulonglong distinct_sum= 0;            /* running sum of unique[0..part] */
+  uint part;
+
+  for (part= 0 ; part < keyinfo->keysegs ; part++)
+  {
+    ulonglong tuples= records;
+    ulonglong unique_tuples;
+    double estimate;
+
+    distinct_sum+= unique[part];
+    unique_tuples= distinct_sum + 1;
+    if (notnull)
+    {
+      /*
+        Only tuples without NULLs are counted: take them as the tuple count
+        and remove the tuples with NULLs from the number of distinct tuples.
+      */
+      tuples= notnull[part];
+      unique_tuples-= (records - notnull[part]);
+    }
+
+    if (unique_tuples == 0)
+      estimate= 1;
+    else if (distinct_sum == 0)
+      estimate= ulonglong2double(tuples);       /* 1 unique tuple */
+    else
+      estimate= ulonglong2double(tuples) / ulonglong2double(unique_tuples);
+
+    /*
+      The estimate can end up below 1 for some keys (e.g. FULLTEXT);
+      never store less than 1.
+    */
+    set_if_bigger(estimate,1);
+
+    rec_per_key_part[part]= estimate;
+  }
+}
+
+
+/*
+  Checksum over 'length' bytes of 'buf': for every byte the running value
+  is shifted left one bit, the byte is added, and 1 is added when the top
+  bit of the previous value was set.
+*/
+static ha_checksum maria_byte_checksum(const uchar *buf, uint length)
+{
+  const ha_checksum top_bit= ((ha_checksum) 1) << (8*sizeof(ha_checksum)-1);
+  ha_checksum crc= 0;
+  uint pos;
+
+  for (pos= 0; pos < length; pos++)
+    crc= ((crc << 1) + buf[pos]) + test(crc & top_bit);
+  return crc;
+}
+
+/*
+  Returns non-zero if the key is HA_SPATIAL, or if it is a binary packed,
+  variable length or fulltext key for which rows * (max key length) is
+  bigger than maria_max_temp_length. For fulltext keys the max key length
+  is first adjusted by the sort word length for the key's character set.
+*/
+static my_bool maria_too_big_key_for_sort(MARIA_KEYDEF *key, ha_rows rows)
+{
+  uint key_maxlength= key->maxlength;
+
+  if (key->flag & HA_FULLTEXT)
+  {
+    uint ft_max_word_len_for_sort=FT_MAX_WORD_LEN_FOR_SORT*
+                                  key->seg->charset->mbmaxlen;
+    key_maxlength+= ft_max_word_len_for_sort-HA_FT_MAXBYTELEN;
+  }
+
+  if (key->flag & HA_SPATIAL)
+    return 1;
+  if (!(key->flag & (HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY | HA_FULLTEXT)))
+    return 0;
+  return ((ulonglong) rows * key_maxlength >
+          (ulonglong) maria_max_temp_length);
+}
+
+/*
+  Deactivate the non-unique indexes that can be recreated fast
+
+  A key is cleared from state.key_map when it has none of the HA_NOSAME,
+  HA_SPATIAL, HA_AUTO_KEY or HA_RTREE_INDEX flags, is not reported as too
+  big by maria_too_big_key_for_sort() and is not the auto increment key.
+  Note that 'rows' may be zero for the case when we don't know how many
+  rows we will put into the file.
+ */
+
+void maria_disable_non_unique_index(MARIA_HA *info, ha_rows rows)
+{
+  MARIA_SHARE *share= info->s;
+  uint key_nr;
+
+  DBUG_ASSERT(share->state.state.records == 0 &&
+              (!rows || rows >= MARIA_MIN_ROWS_TO_DISABLE_INDEXES));
+  for (key_nr= 0 ; key_nr < share->base.keys ; key_nr++)
+  {
+    MARIA_KEYDEF *key= share->keyinfo + key_nr;
+
+    if (key->flag & (HA_NOSAME | HA_SPATIAL | HA_AUTO_KEY | HA_RTREE_INDEX))
+      continue;                                 /* Keep this kind of key */
+    if (maria_too_big_key_for_sort(key,rows) ||
+        share->base.auto_key == key_nr+1)
+      continue;
+
+    maria_clear_key_active(share->state.key_map, key_nr);
+    info->update|= HA_STATE_CHANGED;
+  }
+}
+
+
+/*
+  Return TRUE if repair by sorting can be used
+
+  FALSE is returned when no key in 'key_map' is active, or when 'force' is
+  not set and maria_too_big_key_for_sort() reports some key of the table
+  as too big. With 'force' set the size check is skipped, so sorting is
+  used even if the temporary file would be quite big.
+*/
+
+my_bool maria_test_if_sort_rep(MARIA_HA *info, ha_rows rows,
+                               ulonglong key_map, my_bool force)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *key, *key_end;
+
+  /*
+    maria_repair_by_sort only works if we have at least one key. If we don't
+    have any keys, we should use the normal repair.
+  */
+  if (! maria_is_any_key_active(key_map))
+    return FALSE;                               /* Can't use sort */
+  if (force)
+    return TRUE;                                /* Key sizes don't matter */
+
+  for (key= share->keyinfo, key_end= key + share->base.keys ;
+       key != key_end ;
+       key++)
+  {
+    if (maria_too_big_key_for_sort(key,rows))
+      return FALSE;
+  }
+  return TRUE;
+}
+
+
+/**
+   @brief Create a new handle for manipulation the new record file
+
+   The table of param->sort_info->info is opened a second time with
+   HA_OPEN_COPY | HA_OPEN_FOR_REPAIR; the result is stored in
+   sort_info->new_info and its data file descriptor is replaced by new_file.
+
+   @param  param     sort parameters; param->filepos is set to the data
+                     file length of the new handle
+   @param  new_file  file descriptor to use as data file of the new handle
+
+   @return 0 ok, 1 error
+
+   @note
+   It's ok for Recovery to have two MARIA_SHARE on the same index file
+   because the one we create here is not transactional
+*/
+
+static my_bool create_new_data_handle(MARIA_SORT_PARAM *param, File new_file)
+{
+
+  MARIA_SORT_INFO *sort_info= param->sort_info;
+  MARIA_HA *info= sort_info->info;
+  MARIA_HA *new_info;
+  DBUG_ENTER("create_new_data_handle");
+
+  if (!(sort_info->new_info= maria_open(info->s->open_file_name.str, O_RDWR,
+                                        HA_OPEN_COPY | HA_OPEN_FOR_REPAIR)))
+    DBUG_RETURN(1);
+
+  new_info= sort_info->new_info;
+  _ma_bitmap_set_pagecache_callbacks(&new_info->s->bitmap.file,
+                                     new_info->s);
+  _ma_set_data_pagecache_callbacks(&new_info->dfile, new_info->s);
+  change_data_file_descriptor(new_info, new_file);
+  maria_lock_database(new_info, F_EXTRA_LCK);
+  if ((sort_info->param->testflag & T_UNPACK) &&
+      info->s->data_file_type == COMPRESSED_RECORD)
+  {
+    /* Unpacking: end the current row format and set up the original one */
+    (*new_info->s->once_end)(new_info->s);
+    (*new_info->s->end)(new_info);
+    restore_data_file_type(new_info->s);
+    _ma_setup_functions(new_info->s);
+    if ((*new_info->s->once_init)(new_info->s, new_file) ||
+        (*new_info->s->init)(new_info))
+      DBUG_RETURN(1);                   /* new_info stays in sort_info */
+  }
+  _ma_reset_status(new_info);
+  if (_ma_initialize_data_file(new_info->s, new_file))
+    DBUG_RETURN(1);                     /* new_info stays in sort_info */
+
+  /* Take into account any bitmap page created above: */
+  param->filepos= new_info->s->state.state.data_file_length;
+
+  /* Use new virtual functions for key generation */
+  info->s->keypos_to_recpos= new_info->s->keypos_to_recpos;
+  info->s->recpos_to_keypos= new_info->s->recpos_to_keypos;
+  DBUG_RETURN(0);
+}
+
+
+static void
+set_data_file_type(MARIA_SORT_INFO *sort_info, MARIA_SHARE *share)
+{               /* Store in sort_info the data file type the repair writes */
+  if ((sort_info->new_data_file_type=share->data_file_type) ==
+      COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
+  {
+    MARIA_SHARE tmp;                    /* scratch copy, discarded on return */
+    sort_info->new_data_file_type= share->state.header.org_data_file_type;
+    /*
+      Set delete_function for sort_delete_record()
+
+      The functions are set up on a copy of the share and only
+      delete_record is copied back.
+      NOTE(review): tmp.options is assigned ~HA_OPTION_COMPRESS_RECORD
+      (plain '='), while restore_data_file_type() clears the bit with '&=';
+      confirm that setting all other option bits here is intended.
+    */
+    tmp= *share;
+    tmp.state.header.data_file_type= tmp.state.header.org_data_file_type;
+    tmp.options= ~HA_OPTION_COMPRESS_RECORD;
+    _ma_setup_functions(&tmp);
+    share->delete_record=tmp.delete_record;
+  }
+}
+
+static void restore_data_file_type(MARIA_SHARE *share)
+{               /* Switch the share back to its original data file type */
+  MARIA_SHARE tmp_share;
+  share->options&= ~HA_OPTION_COMPRESS_RECORD;
+  mi_int2store(share->state.header.options,share->options);
+  share->state.header.data_file_type=
+    share->state.header.org_data_file_type;
+  share->data_file_type= share->state.header.data_file_type;
+  share->pack.header_length= 0;
+
+  /*
+    Use new virtual functions for key generation
+
+    The functions are set up on a copy of the share; only the two position
+    conversion functions are copied back.
+  */
+  tmp_share= *share;
+  _ma_setup_functions(&tmp_share);
+  share->keypos_to_recpos= tmp_share.keypos_to_recpos;
+  share->recpos_to_keypos= tmp_share.recpos_to_keypos;
+}
+
+
+/*
+  Close the current data file of 'info' and use 'new_file' instead, both
+  for the handler's data file and for the share's bitmap file; the bitmap
+  cache is reset afterwards.
+*/
+static void change_data_file_descriptor(MARIA_HA *info, File new_file)
+{
+  MARIA_SHARE *share= info->s;
+
+  my_close(info->dfile.file, MYF(MY_WME));
+  share->bitmap.file.file= new_file;
+  info->dfile.file= new_file;
+  _ma_bitmap_reset_cache(share);
+}
+
+
+/**
+   @brief Mark the data file to not be used
+
+   Sets the data file descriptor of the handler and the bitmap file
+   descriptor of its share to -1 and resets the bitmap cache.
+
+   @note
+   This is used in repair when we want to ensure the handler will not
+   write anything to the data file anymore
+*/
+
+static void unuse_data_file_descriptor(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  share->bitmap.file.file= -1;
+  info->dfile.file= -1;
+  _ma_bitmap_reset_cache(share);
+}
+
+
+/*
+  Copy the parts of a table state that describe the data file
+
+  NOTES
+    Used to bring the state of the data file generated by repair over to
+    the original handler. Copied are the row counters and data file length
+    of the embedded status, plus split, dellink and
+    first_bitmap_with_space.
+*/
+
+static void copy_data_file_state(MARIA_STATE_INFO *to,
+                                 MARIA_STATE_INFO *from)
+{
+  MARIA_STATUS_INFO *to_status= &to->state;
+  MARIA_STATUS_INFO *from_status= &from->state;
+
+  to_status->records=          from_status->records;
+  to_status->del=              from_status->del;
+  to_status->empty=            from_status->empty;
+  to_status->data_file_length= from_status->data_file_length;
+
+  to->split=                   from->split;
+  to->dellink=                 from->dellink;
+  to->first_bitmap_with_space= from->first_bitmap_with_space;
+}
+
+
+/*
+  Read 'safely' next record while scanning table.
+
+  SYNOPSIS
+    _ma_safe_scan_block_record()
+    info                Maria handler
+    record              Store found here
+
+  NOTES
+    - One must have called mi_scan() before this
+
+    Differences compared to  _ma_scan_block_records() are:
+    - We read all blocks, not only blocks marked by the bitmap to be safe
+    - In case of errors, next read will read next record.
+    - More sanity checks
+
+  RETURN
+    0   ok
+    HA_ERR_END_OF_FILE  End of file
+    #   error number
+*/
+
+
+static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info,
+                                      MARIA_HA *info, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_RECORD_POS record_pos= info->cur_row.nextpos; /* next dir entry */
+  pgcache_page_no_t page= sort_info->page;      /* page currently scanned */
+  DBUG_ENTER("_ma_safe_scan_block_record");
+
+  for (;;)
+  {
+    /* Find next row in current page */
+    if (likely(record_pos < info->scan.number_of_rows))
+    {
+      uint length, offset;
+      uchar *data, *end_of_data;
+      char llbuff[22];
+
+      while (!(offset= uint2korr(info->scan.dir))) /* offset 0: skip entry */
+      {
+        info->scan.dir-= DIR_ENTRY_SIZE;
+        record_pos++;
+        if (info->scan.dir < info->scan.dir_end) /* walked past directory */
+        {
+          _ma_check_print_info(sort_info->param,
+                               "Wrong directory on page %s",
+                               llstr(page, llbuff));
+          goto read_next_page;
+        }
+      }
+      /* found row */
+      info->cur_row.lastpos= info->scan.row_base_page + record_pos;
+      info->cur_row.nextpos= record_pos + 1;
+      data= info->scan.page_buff + offset;
+      length= uint2korr(info->scan.dir + 2);
+      end_of_data= data + length;
+      info->scan.dir-= DIR_ENTRY_SIZE;          /* Point to previous row */
+
+      /* Sanity check of the entry; a bad one is reported and skipped */
+      if (end_of_data > info->scan.dir_end ||
+          offset < PAGE_HEADER_SIZE || length < share->base.min_block_length)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Wrong directory entry %3u at page %s",
+                             (uint) record_pos, llstr(page, llbuff));
+        record_pos++;
+        continue;
+      }
+      else
+      {
+        DBUG_PRINT("info", ("rowid: %lu", (ulong) info->cur_row.lastpos));
+        DBUG_RETURN(_ma_read_block_record2(info, record, data, end_of_data));
+      }
+    }
+
+read_next_page:
+    /* Read until we find next head page */
+    for (;;)
+    {
+      uint page_type;
+      char llbuff[22];
+
+      sort_info->page++;                        /* In case of errors */
+      page++;
+      if (!(page % share->bitmap.pages_covered))
+      {
+        /* Skip bitmap */
+        page++;
+        sort_info->page++;
+      }
+      if ((my_off_t) (page + 1) * share->block_size > sort_info->filelength)
+        DBUG_RETURN(HA_ERR_END_OF_FILE);        /* page ends after the file */
+      if (!(pagecache_read(share->pagecache,
+                           &info->dfile,
+                           page, 0, info->scan.page_buff,
+                           PAGECACHE_READ_UNKNOWN_PAGE,
+                           PAGECACHE_LOCK_LEFT_UNLOCKED, 0)))
+      {
+        if (my_errno == HA_ERR_WRONG_CRC)       /* report and try next page */
+        {
+          _ma_check_print_info(sort_info->param,
+                               "Wrong CRC on datapage at %s",
+                               llstr(page, llbuff));
+          continue;
+        }
+        DBUG_RETURN(my_errno);
+      }
+      page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] &
+                  PAGE_TYPE_MASK);
+      if (page_type == HEAD_PAGE)
+      {
+        if ((info->scan.number_of_rows=
+             (uint) (uchar) info->scan.page_buff[DIR_COUNT_OFFSET]) != 0)
+          break;                                /* head page with entries */
+        _ma_check_print_info(sort_info->param,
+                             "Wrong head page at page %s",
+                             llstr(page, llbuff));
+      }
+      else if (page_type >= MAX_PAGE_TYPE)
+      {
+        _ma_check_print_info(sort_info->param,
+                             "Found wrong page type: %d at page %s",
+                             page_type, llstr(page, llbuff));
+      }
+    }
+
+    /* New head page: point 'dir' at the entry nearest the page suffix */
+    info->scan.dir= (info->scan.page_buff + share->block_size -
+                     PAGE_SUFFIX_SIZE - DIR_ENTRY_SIZE);
+    info->scan.dir_end= (info->scan.dir -
+                         (info->scan.number_of_rows - 1) *
+                         DIR_ENTRY_SIZE);
+    info->scan.row_base_page= ma_recordpos(page, 0);
+    record_pos= 0;
+  }
+}
+
+
+/**
+   @brief Writes a LOGREC_REPAIR_TABLE record and updates create_rename_lsn
+   if needed (so that maria_read_log does not redo the repair).
+
+   @param  param            description of the REPAIR operation
+   @param  info             table
+
+   @return Operation status
+     @retval 0      ok
+     @retval 1      error (disk problem)
+*/
+
+my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  /* in case this is maria_chk or recovery... */
+  if (translog_status == TRANSLOG_OK && !maria_in_recovery &&
+      share->base.born_transactional)
+  {
+    my_bool save_now_transactional= share->now_transactional;
+
+    /*
+      For now this record is only informative. It could serve when applying
+      logs to a backup, but that needs more thought. Assume table became
+      corrupted. It is repaired, then some writes happen to it.
+      Later we restore an old backup, and want to apply this REDO_REPAIR_TABLE
+      record. For it to give the same result as originally, the table should
+      be corrupted the same way, so applying previous REDOs should produce the
+      same corruption; that's really not guaranteed (different execution paths
+      in execution of REDOs vs runtime code so not same bugs hit, temporary
+      hardware issues not repeatable etc). Corruption may not be repeatable.
+      A reasonable solution is to execute the REDO_REPAIR_TABLE record and
+      check if the checksum of the resulting table matches what it was at the
+      end of the original repair (should be stored in log record); or execute
+      the REDO_REPAIR_TABLE if the checksum of the table-before-repair matches
+      what it was at the start of the original repair (should be stored in log
+      record).
+    */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[FILEID_STORE_SIZE + 8 + 8];
+    LSN lsn;
+    my_bool failed;
+
+    /*
+      testflag gives an idea of what REPAIR did (in particular T_QUICK
+      or not: did it touch the data file or not?).
+    */
+    int8store(log_data + FILEID_STORE_SIZE, param->testflag);
+    /* org_key_map is used when recreating index after a load data infile */
+    int8store(log_data + FILEID_STORE_SIZE + 8, param->org_key_map);
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    /* Force the table to be seen as transactional while logging */
+    share->now_transactional= 1;
+    failed= (unlikely(translog_write_record(&lsn, LOGREC_REDO_REPAIR_TABLE,
+                                            &dummy_transaction_object, info,
+                                            (translog_size_t)
+                                            sizeof(log_data),
+                                            sizeof(log_array)/
+                                            sizeof(log_array[0]),
+                                            log_array, log_data, NULL) ||
+                      translog_flush(lsn)) != 0);
+    /*
+      The table's existence was made durable earlier (MY_SYNC_DIR passed to
+      maria_change_to_newfile()). All pages have been flushed, state too, we
+      need to force it to disk. Old REDOs should not be applied to the table,
+      which is already enforced as skip_redos_lsn was increased in
+      protect_against_repair_crash(). But if this is an explicit repair,
+      even UNDO phase should ignore this table: create_rename_lsn should be
+      increased, and this also serves for the REDO_REPAIR to be ignored by
+      maria_read_log.
+      The fully correct order would be: sync data and index file, remove crash
+      mark and update LSNs then write state and sync index file. But at this
+      point state (without crash mark) is already written.
+    */
+    if (!failed &&
+        ((!(param->testflag & T_NO_CREATE_RENAME_LSN) &&
+          _ma_update_state_lsns(share, lsn, share->state.create_trid, FALSE,
+                                FALSE)) ||
+         _ma_sync_table_files(info)))
+      failed= TRUE;
+
+    /* Restore the saved flag on the error paths too, not only on success */
+    share->now_transactional= save_now_transactional;
+    if (failed)
+      return TRUE;
+  }
+  return FALSE;
+}
+
+
+/**
+  Writes an UNDO record which if executed in UNDO phase, will empty the
+  table. Such record is thus logged only in certain cases of bulk insert
+  (table needs to be empty etc).
+
+  The record is a LOGREC_UNDO_BULK_INSERT written for info->trn; it carries
+  the transaction's current undo_lsn and the log is flushed up to the
+  record.
+
+  @return 0 ok, 1 if writing or flushing the record failed
+*/
+my_bool write_log_record_for_bulk_insert(MARIA_HA *info)
+{
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE];
+  LEX_CUSTRING *data_part= &log_array[TRANSLOG_INTERNAL_PARTS + 0];
+  LSN lsn;
+
+  lsn_store(log_data, info->trn->undo_lsn);
+  data_part->str=    log_data;
+  data_part->length= sizeof(log_data);
+
+  if (translog_write_record(&lsn, LOGREC_UNDO_BULK_INSERT,
+                            info->trn, info,
+                            (translog_size_t) data_part->length,
+                            TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                            log_data + LSN_STORE_SIZE, NULL))
+    return 1;
+  return translog_flush(lsn) != 0; /* WAL */
+}
+
+
+/*
+  Give error message why reading of key page failed
+
+  'position' is the byte offset of the index page; it is reported as a page
+  number (position / block size). The text depends on whether my_errno is
+  HA_ERR_CRASHED or something else.
+*/
+
+static void report_keypage_fault(HA_CHECK *param, MARIA_HA *info,
+                                 my_off_t position)
+{
+  /*
+    Same size as the other llstr() buffers in this file; 11 bytes is too
+    small for a page number of more than 10 digits.
+  */
+  char buff[22];
+  uint32 block_size= info->s->block_size;
+
+  if (my_errno == HA_ERR_CRASHED)
+    _ma_check_print_error(param,
+                          "Wrong base information on indexpage at page: %s",
+                          llstr(position / block_size, buff));
+  else
+    _ma_check_print_error(param,
+                          "Can't read indexpage from page: %s, "
+                          "error: %d",
+                          llstr(position / block_size, buff), my_errno);
+}
+
+
+/**
+  When we want to check a table, we verify that the transaction ids of rows
+  and keys are not bigger than the biggest id generated by Maria so far, which
+  is returned by the function below.
+
+  @note If control file is not open, 0 may be returned; to not confuse
+  this with a valid max trid of 0, the caller should notice that it failed to
+  open the control file (ma_control_file_inited() can serve for that).
+*/
+
+static TrID max_trid_in_system(void)
+{
+  /* 0 if transac manager not initialized */
+  TrID trnman_max= trnman_get_max_trid();
+
+  /* The trnman value may be far bigger, if last shutdown is old */
+  if (trnman_max > max_trid_in_control_file)
+    return trnman_max;
+  return max_trid_in_control_file;
+}
+
+
+/*
+  Report a row whose transaction id should not be visible
+
+  param->not_visible_rows_found is incremented on every call, but a message
+  is only printed on the first one: a warning if no control file is in use,
+  otherwise an error that also shows param->max_trid.
+*/
+static void _ma_check_print_not_visible_error(HA_CHECK *param, TrID used_trid)
+{
+  char trid_str[22], max_trid_str[22];
+
+  if (param->not_visible_rows_found++)
+    return;                                     /* Already reported */
+
+  if (ma_control_file_inited())
+  {
+    _ma_check_print_error(param,
+                          "Found row with transaction id %s when max "
+                          "transaction id according to aria_control_file "
+                          "is %s",
+                          llstr(used_trid, trid_str),
+                          llstr(param->max_trid, max_trid_str));
+    return;
+  }
+  _ma_check_print_warning(param,
+                          "Found row with transaction id %s but no "
+                          "aria_control_file was used or specified.  "
+                          "The table may be corrupted",
+                          llstr(used_trid, trid_str));
+}
+
+
+/**
+  Mark that we can retry normal repair if we used quick repair
+
+  The retry is requested only when the data file is not being fixed
+  (sort_param->fix_datafile is not set) and 'error' is at least
+  HA_ERR_FIRST. We shouldn't do this in case of disk error as in this case
+  we are likely to lose much more than expected.
+*/
+
+void retry_if_quick(MARIA_SORT_PARAM *sort_param, int error)
+{
+  HA_CHECK *param= sort_param->sort_info->param;
+
+  if (sort_param->fix_datafile || error < HA_ERR_FIRST)
+    return;
+
+  param->retry_repair= 1;
+  param->testflag|= T_RETRY_WITHOUT_QUICK;
+}
diff --git a/storage/maria/ma_check_standalone.h b/storage/maria/ma_check_standalone.h
new file mode 100644
index 00000000000..8cda285bb99
--- /dev/null
+++ b/storage/maria/ma_check_standalone.h
@@ -0,0 +1,104 @@
+/* Copyright (C) 2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  All standalone programs which need to use functions from ma_check.c
+  (like maria_repair()) must define their version of _ma_killed_ptr()
+  and _ma_check_print_info|warning|error(). Indeed, linking with ma_check.o
+  brings in the dependencies of ma_check.o which are definitions of the above
+  functions; if the program does not define them then the ones of
+  ha_maria.o are used i.e. ha_maria.o is linked into the program, and this
+  brings dependencies of ha_maria.o on mysqld.o into the program's linking
+  which thus fails, as the program is not linked with mysqld.o.
+  This file contains the versions of these functions used by maria_chk and
+  maria_read_log.
+*/
+
+/*
+  Check if check/repair operation was killed by a signal.
+  This standalone version never reports a kill: it always returns 0.
+*/
+int _ma_killed_ptr(HA_CHECK *param __attribute__((unused)))
+{
+  return 0;
+}
+
+	/* print warnings and errors */
+	/* VARARGS: writes the printf-style message plus a newline to stdout */
+
+void _ma_check_print_info(HA_CHECK *param __attribute__((unused)),
+			 const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_info");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  va_start(args,fmt);
+  VOID(vfprintf(stdout, fmt, args));
+  VOID(fputc('\n',stdout));
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
+
+/* VARARGS. Prints "<progname>: warning: <message>" to stderr; for the first
+   warning/error of this check it also sets O_DATA_LOST in param->out_flag. */
+void _ma_check_print_warning(HA_CHECK *param, const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_warning");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  fflush(stdout);
+  if (!param->warning_printed && !param->error_printed)
+  {
+    if (param->testflag & T_SILENT)
+      fprintf(stderr,"%s: Aria file %s\n",my_progname_short,
+	      param->isam_file_name);
+    param->out_flag|= O_DATA_LOST;
+  }
+  param->warning_printed=1;
+  va_start(args,fmt);
+  fprintf(stderr,"%s: warning: ",my_progname_short);
+  VOID(vfprintf(stderr, fmt, args));
+  VOID(fputc('\n',stderr));
+  fflush(stderr);
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
+
+/* VARARGS. Prints "<progname>: error: <message>" to stderr; for the first
+   warning/error of this check it also sets O_DATA_LOST in param->out_flag. */
+void _ma_check_print_error(HA_CHECK *param, const char *fmt,...)
+{
+  va_list args;
+  DBUG_ENTER("_ma_check_print_error");
+  DBUG_PRINT("enter", ("format: %s", fmt));
+
+  fflush(stdout);
+  if (!param->warning_printed && !param->error_printed)
+  {
+    if (param->testflag & T_SILENT)
+      fprintf(stderr,"%s: Aria file %s\n",my_progname_short,param->isam_file_name);
+    param->out_flag|= O_DATA_LOST;
+  }
+  param->error_printed|=1;
+  va_start(args,fmt);
+  fprintf(stderr,"%s: error: ",my_progname_short);
+  VOID(vfprintf(stderr, fmt, args));
+  VOID(fputc('\n',stderr));
+  fflush(stderr);
+  va_end(args);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
new file mode 100644
index 00000000000..cf13cee9452
--- /dev/null
+++ b/storage/maria/ma_checkpoint.c
@@ -0,0 +1,1196 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3071 Maria checkpoint
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+/** @todo RECOVERY BUG this is unreviewed code */
+/*
+  Summary:
+  checkpoints are done either by a background thread (checkpoint every Nth
+  second) or by a client.
+  In ha_maria, it's not made available to clients, and will soon be done by a
+  background thread (periodically taking checkpoints and flushing dirty
+  pages).
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "ma_loghandler_lsn.h"
+#include "ma_servicethread.h"
+
+
+/** @brief type of checkpoint currently running */
+static CHECKPOINT_LEVEL checkpoint_in_progress= CHECKPOINT_NONE;
+/** @brief protects checkpoint_in_progress; statistics are updated under it */
+static pthread_mutex_t LOCK_checkpoint;
+/** @brief signals end of a checkpoint; also for killing background thread */
+static pthread_cond_t  COND_checkpoint;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL checkpoint_control=
+  {THREAD_DEAD, FALSE, &LOCK_checkpoint, &COND_checkpoint};
+/* pages to flush in background; is ulong like pagecache->blocks_changed */
+static ulong pages_to_flush_before_next_checkpoint;
+static PAGECACHE_FILE *dfiles, /**< data files to flush in background */
+  *dfiles_end; /**< list of data files ends here */
+static PAGECACHE_FILE *kfiles, /**< index files to flush in background */
+  *kfiles_end; /**< list of index files ends here */
+/* the two statistics below could serve in SHOW GLOBAL STATUS */
+static uint checkpoints_total= 0, /**< all checkpoint requests made */
+  checkpoints_ok_total= 0; /**< all checkpoints which succeeded */
+
+struct st_filter_param
+{
+  LSN up_to_lsn; /**< only pages with rec_lsn <= this LSN */
+  uint max_pages; /**< stop after flushing this number of pages */
+}; /**< information to determine which dirty pages should be flushed */
+
+static enum pagecache_flush_filter_result
+filter_flush_file_medium(enum pagecache_page_type type,
+                         pgcache_page_no_t page,
+                         LSN rec_lsn, void *arg);
+static enum pagecache_flush_filter_result
+filter_flush_file_full(enum pagecache_page_type type,
+                       pgcache_page_no_t page,
+                       LSN rec_lsn, void *arg);
+static enum pagecache_flush_filter_result
+filter_flush_file_evenly(enum pagecache_page_type type,
+                         pgcache_page_no_t pageno,
+                         LSN rec_lsn, void *arg);
+static int really_execute_checkpoint(void);
+pthread_handler_t ma_checkpoint_background(void *arg);
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon);
+
+/**
+   @brief Does a checkpoint
+
+   @param  level               what level of checkpoint to do
+   @param  no_wait             if another checkpoint of same or stronger level
+                               is already running, consider our job done
+
+   @note In ha_maria, there can never be two threads trying a checkpoint at
+   the same time.
+
+   @return Operation status
+    @retval 0 ok, or nothing done (module not inited, or no_wait applied)
+    @retval !=0 error
+*/
+
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait)
+{
+  int result= 0;
+  DBUG_ENTER("ma_checkpoint_execute");
+
+  if (!checkpoint_control.inited)
+  {
+    /*
+      If ha_maria failed to start, maria_panic_hton is called, we come here.
+    */
+    DBUG_RETURN(0);
+  }
+  DBUG_ASSERT(level > CHECKPOINT_NONE);
+
+  /* look for already running checkpoints */
+  pthread_mutex_lock(&LOCK_checkpoint);
+  while (checkpoint_in_progress != CHECKPOINT_NONE)
+  {
+    if (no_wait && (checkpoint_in_progress >= level))
+    {
+      /*
+        If we are the checkpoint background thread, we don't wait (it's
+        smarter to flush pages instead of waiting here while the other thread
+        finishes its checkpoint).
+      */
+      pthread_mutex_unlock(&LOCK_checkpoint);
+      goto end;
+    }
+    pthread_cond_wait(&COND_checkpoint, &LOCK_checkpoint);
+  }
+
+  checkpoint_in_progress= level;
+  pthread_mutex_unlock(&LOCK_checkpoint);
+  /* from then on, we are sure to be and stay the only checkpointer */
+
+  result= really_execute_checkpoint();
+  pthread_cond_broadcast(&COND_checkpoint); /* wake up waiters in loop above */
+end:
+  DBUG_RETURN(result);
+}
+
+
+/**
+   @brief Does a checkpoint, really; expects no other checkpoints
+   running.
+
+   Checkpoint level requested is read from checkpoint_in_progress, which is
+   reset to CHECKPOINT_NONE (under LOCK_checkpoint) before returning.
+   @return Operation status
+    @retval 0   ok
+    @retval !=0 error
+*/
+
+static int really_execute_checkpoint(void)
+{
+  uint i, error= 0;
+  /* used to read the leading 4-byte page count of record_pieces[3] */
+  char *ptr;
+  LEX_STRING record_pieces[4]; /**< only malloc-ed pieces */
+  LSN min_page_rec_lsn, min_trn_rec_lsn, min_first_undo_lsn;
+  TRANSLOG_ADDRESS checkpoint_start_log_horizon;
+  char checkpoint_start_log_horizon_char[LSN_STORE_SIZE];
+  DBUG_ENTER("really_execute_checkpoint");
+  DBUG_PRINT("enter", ("level: %d", checkpoint_in_progress));
+  bzero(&record_pieces, sizeof(record_pieces));
+
+  /*
+    STEP 1: record current end-of-log position using log's lock. It is
+    critical for the correctness of Checkpoint (related to memory visibility
+    rules, the log's lock is a mutex).
+    "Horizon" is a lower bound of the LSN of the next log record.
+  */
+  checkpoint_start_log_horizon= translog_get_horizon();
+  DBUG_PRINT("info",("checkpoint_start_log_horizon (%lu,0x%lx)",
+                     LSN_IN_PARTS(checkpoint_start_log_horizon)));
+  lsn_store(checkpoint_start_log_horizon_char, checkpoint_start_log_horizon);
+
+  /*
+    STEP 2: fetch information about transactions.
+    We must fetch transactions before dirty pages. Indeed, a transaction
+    first sets its rec_lsn then sets the page's rec_lsn then sets its rec_lsn
+    to 0. If we fetched pages first, we may see no dirty page yet, then we
+    fetch transactions but the transaction has already reset its rec_lsn to 0
+    so we miss rec_lsn again.
+    For a similar reason (over-allocated bitmap pages) we have to fetch
+    transactions before flushing bitmap pages.
+
+    min_trn_rec_lsn will serve to lower the starting point of the REDO phase
+    (down from checkpoint_start_log_horizon).
+ */
+  if (unlikely(trnman_collect_transactions(&record_pieces[0],
+                                           &record_pieces[1],
+                                           &min_trn_rec_lsn,
+                                           &min_first_undo_lsn)))
+    goto err;
+
+
+  /* STEP 3: fetch information about table files */
+  if (unlikely(collect_tables(&record_pieces[2],
+                              checkpoint_start_log_horizon)))
+    goto err;
+
+
+  /* STEP 4: fetch information about dirty pages */
+  /*
+    It's better to do it _after_ having flushed some data pages (which
+    collect_tables() may have done), because those are now non-dirty and so we
+    have a more up-to-date dirty pages list to put into the checkpoint record,
+    and thus we will have less work at Recovery.
+  */
+  /* Using default pagecache for now */
+  if (unlikely(pagecache_collect_changed_blocks_with_lsn(maria_pagecache,
+                                                         &record_pieces[3],
+                                                         &min_page_rec_lsn)))
+    goto err;
+
+
+  /* LAST STEP: now write the checkpoint log record */
+  {
+    LSN lsn;
+    translog_size_t total_rec_length;
+    /*
+      the log handler is allowed to modify "str" and "length" (but not "*str")
+      of its argument, so we must not pass it record_pieces directly,
+      otherwise we would later not know what memory pieces to my_free().
+    */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 5];
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
+      (uchar*) checkpoint_start_log_horizon_char;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= total_rec_length=
+      sizeof(checkpoint_start_log_horizon_char);
+    for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+    {
+      log_array[TRANSLOG_INTERNAL_PARTS + 1 + i]=
+        *(LEX_CUSTRING *)&record_pieces[i];
+      total_rec_length+= (translog_size_t) record_pieces[i].length;
+    }
+    if (unlikely(translog_write_record(&lsn, LOGREC_CHECKPOINT,
+                                       &dummy_transaction_object, NULL,
+                                       total_rec_length,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+      goto err;
+    translog_lock();
+    /*
+      This cannot be done as an inwrite_rec_hook of LOGREC_CHECKPOINT, because
+      such hook would be called before translog_flush (and we must be sure
+      that log was flushed before we write to the control file).
+    */
+    if (unlikely(ma_control_file_write_and_force(lsn, last_logno,
+                                                 max_trid_in_control_file,
+                                                 recovery_failures)))
+    {
+      translog_unlock();
+      goto err;
+    }
+    translog_unlock();
+  }
+
+  /*
+    Note that we should not alter memory structures until we have successfully
+    written the checkpoint record and control file.
+  */
+  /* checkpoint succeeded; record_pieces[3] starts with a 4-byte page count */
+  ptr= record_pieces[3].str;
+  pages_to_flush_before_next_checkpoint= uint4korr(ptr);
+  DBUG_PRINT("checkpoint",("%u pages to flush before next checkpoint",
+                           (uint)pages_to_flush_before_next_checkpoint));
+
+  /* compute log's low-water mark */
+  {
+    TRANSLOG_ADDRESS log_low_water_mark= min_page_rec_lsn;
+    set_if_smaller(log_low_water_mark, min_trn_rec_lsn);
+    set_if_smaller(log_low_water_mark, min_first_undo_lsn);
+    set_if_smaller(log_low_water_mark, checkpoint_start_log_horizon);
+    /**
+       Now purge unneeded logs.
+       As some systems have an unreliable fsync (drive lying), we could try to
+       be robust against that: remember a few previous checkpoints in the
+       control file, and not purge logs immediately... Think about it.
+    */
+    if (translog_purge(log_low_water_mark))
+      ma_message_no_user(0, "log purging failed");
+  }
+
+  goto end;
+
+err:
+  error= 1;
+  ma_message_no_user(0, "checkpoint failed");
+  /* we were possibly not able to determine what pages to flush */
+  pages_to_flush_before_next_checkpoint= 0;
+
+end:
+  for (i= 0; i < (sizeof(record_pieces)/sizeof(record_pieces[0])); i++)
+    my_free(record_pieces[i].str, MYF(MY_ALLOW_ZERO_PTR));
+  pthread_mutex_lock(&LOCK_checkpoint);
+  checkpoint_in_progress= CHECKPOINT_NONE;
+  checkpoints_total++;
+  checkpoints_ok_total+= !error;
+  pthread_mutex_unlock(&LOCK_checkpoint);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   @brief Initializes the checkpoint module
+
+   @param  interval           If one wants the module to create a
+                              thread which will periodically do
+                              checkpoints, and flush dirty pages, in the
+                              background, it should specify a non-zero
+                              interval in seconds. The thread will then be
+                              created and will take checkpoints separated by
+                              approximately 'interval' second.
+
+   @note A checkpoint is taken only if there has been some significant
+   activity since the previous checkpoint. Between checkpoint N and N+1 the
+   thread flushes all dirty pages which were already dirty at the time of
+   checkpoint N.
+
+   @return Operation status
+    @retval 0   ok
+    @retval !=0 error
+*/
+
+int ma_checkpoint_init(ulong interval)
+{
+  pthread_t thread_id;
+  int error;
+  DBUG_ENTER("ma_checkpoint_init");
+
+  if (ma_service_thread_control_init(&checkpoint_control))
+    DBUG_RETURN(1);
+  if (interval == 0)
+    DBUG_RETURN(0); /* no background thread was requested */
+
+  /* 'interval' is handed to the thread through its void* argument */
+  compile_time_assert(sizeof(void *) >= sizeof(ulong));
+  error= pthread_create(&thread_id, NULL, ma_checkpoint_background,
+                        (void *)interval);
+  if (!error)
+    checkpoint_control.status= THREAD_RUNNING; /* lives, must be killed */
+  DBUG_RETURN(error);
+}
+
+
+#ifndef DBUG_OFF
+/**
+   Function used to test recovery: flush some table pieces and then caller
+   crashes. Only tables whose share is now_transactional are flushed.
+
+   @param  what_to_flush   0: data and index files (incl. current bitmap)
+                           1: state
+                           2: all bitmap pages
+*/
+static void flush_all_tables(int what_to_flush)
+{
+  int res= 0;
+  LIST *pos; /**< to iterate over open tables */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (pos= maria_open_list; pos; pos= pos->next)
+  {
+    MARIA_HA *info= (MARIA_HA*)pos->data;
+    if (info->s->now_transactional)
+    {
+      switch (what_to_flush)
+      {
+      case 0:
+        res= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                   FLUSH_KEEP, FLUSH_KEEP);
+        break;
+      case 1:
+        res= _ma_state_info_write(info->s,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+                                  MA_STATE_INFO_WRITE_LOCK);
+        DBUG_PRINT("maria_flush_states",
+                   ("is_of_horizon: LSN (%lu,0x%lx)",
+                    LSN_IN_PARTS(info->s->state.is_of_horizon)));
+        break;
+      case 2:
+        res= _ma_bitmap_flush_all(info->s);
+        break;
+      }
+    }
+    DBUG_ASSERT(res == 0);
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+}
+#endif
+
+
+/**
+   @brief Destroys the checkpoint module: ends the service thread control
+   and frees dfiles/kfiles, if the module was inited
+*/
+void ma_checkpoint_end(void)
+{
+  DBUG_ENTER("ma_checkpoint_end");
+  /*
+    Some intentional crash methods, usually triggered by
+    SET MARIA_CHECKPOINT_INTERVAL=X
+  */
+  DBUG_EXECUTE_IF("maria_flush_bitmap",
+                  {
+                    DBUG_PRINT("maria_flush_bitmap", ("now"));
+                    flush_all_tables(2);
+                  });
+  DBUG_EXECUTE_IF("maria_flush_whole_page_cache",
+                  {
+                    DBUG_PRINT("maria_flush_whole_page_cache", ("now"));
+                    flush_all_tables(0);
+                  });
+  DBUG_EXECUTE_IF("maria_flush_whole_log",
+                  {
+                    DBUG_PRINT("maria_flush_whole_log", ("now"));
+                    translog_flush(translog_get_horizon());
+                  });
+  /*
+    Note that for WAL reasons, maria_flush_states requires
+    maria_flush_whole_log.
+  */
+  DBUG_EXECUTE_IF("maria_flush_states",
+                  {
+                    DBUG_PRINT("maria_flush_states", ("now"));
+                    flush_all_tables(1);
+                  });
+  DBUG_EXECUTE_IF("maria_crash",
+                  { DBUG_PRINT("maria_crash", ("now")); DBUG_ABORT(); });
+
+  if (checkpoint_control.inited)
+  {
+    ma_service_thread_control_end(&checkpoint_control);
+    my_free((uchar *)dfiles, MYF(MY_ALLOW_ZERO_PTR));
+    my_free((uchar *)kfiles, MYF(MY_ALLOW_ZERO_PTR));
+    dfiles= kfiles= NULL;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief dirty-page filtering criteria for MEDIUM checkpoint.
+
+   We flush data/index pages which have been dirty since the previous
+   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
+   to start from earlier than the next-to-last checkpoint).
+   Bitmap pages are handled by _ma_bitmap_flush_all().
+
+   @param  type                Page's type
+   @param  pageno              Page's number (unused)
+   @param  rec_lsn             Page's rec_lsn
+   @param  arg                 filter_param (only up_to_lsn is used)
+   @return 1 if type is PAGECACHE_LSN_PAGE and rec_lsn <= up_to_lsn, else 0
+*/
+static enum pagecache_flush_filter_result
+filter_flush_file_medium(enum pagecache_page_type type,
+                         pgcache_page_no_t pageno __attribute__ ((unused)),
+                         LSN rec_lsn, void *arg)
+{
+  struct st_filter_param *param= (struct st_filter_param *)arg;
+  return (type == PAGECACHE_LSN_PAGE) &&
+    (cmp_translog_addr(rec_lsn, param->up_to_lsn) <= 0);
+}
+
+
+/**
+   @brief dirty-page filtering criteria for FULL checkpoint.
+
+   We flush all dirty data/index pages.
+   Bitmap pages are handled by _ma_bitmap_flush_all().
+
+   @param  type                Page's type
+   @param  pageno              Page's number (unused)
+   @param  rec_lsn             Page's rec_lsn (unused)
+   @param  arg                 filter_param (unused)
+   @return 1 if type is PAGECACHE_LSN_PAGE, else 0
+*/
+static enum pagecache_flush_filter_result
+filter_flush_file_full(enum pagecache_page_type type,
+                       pgcache_page_no_t pageno __attribute__ ((unused)),
+                       LSN rec_lsn __attribute__ ((unused)),
+                       void *arg __attribute__ ((unused)))
+{
+  return (type == PAGECACHE_LSN_PAGE);
+}
+
+
+/**
+   @brief dirty-page filtering criteria for background flushing thread.
+
+   We flush data/index pages which have been dirty since the previous
+   checkpoint (this is the two-checkpoint rule: the REDO phase will not have
+   to start from earlier than the next-to-last checkpoint), and no
+   bitmap pages. But we flush no more than a certain number of pages (to have
+   an even flushing, no write burst).
+   The reason to not flush bitmap pages is that they may not be in a flushable
+   state at this moment and we don't want to wait for them.
+
+   @param  type                Page's type
+   @param  pageno              Page's number
+   @param  rec_lsn             Page's rec_lsn
+   @param  arg                 filter_param
+*/
+
+static enum pagecache_flush_filter_result
+filter_flush_file_evenly(enum pagecache_page_type type,
+                         pgcache_page_no_t pageno __attribute__ ((unused)),
+                         LSN rec_lsn, void *arg)
+{
+  struct st_filter_param *filter= (struct st_filter_param *)arg;
+
+  if (unlikely(filter->max_pages == 0))
+    return FLUSH_FILTER_SKIP_ALL; /* quota of this bunch is used up */
+  if ((type != PAGECACHE_LSN_PAGE) ||
+      (cmp_translog_addr(rec_lsn, filter->up_to_lsn) > 0))
+    return FLUSH_FILTER_SKIP_TRY_NEXT; /* not eligible, look at next page */
+
+  filter->max_pages--; /* page accepted: it consumes one unit of the quota */
+  return FLUSH_FILTER_OK;
+}
+
+
+/**
+   @brief Background thread which does checkpoints and flushes periodically.
+
+   Takes a checkpoint. After this, all pages dirty at the time of that
+   checkpoint are flushed evenly until it is time to take another checkpoint.
+   This ensures that the REDO phase starts at earliest (in LSN time) at the
+   next-to-last checkpoint record ("two-checkpoint rule").
+
+   @note MikaelR questioned why the same thread does two different jobs, the
+   risk could be that while a checkpoint happens no LRD flushing happens.
+   @param  arg   the checkpoint interval (an ulong cast to void*), must be > 0
+*/
+pthread_handler_t ma_checkpoint_background(void *arg)
+{
+  /** @brief At least this many log/page bytes written between checkpoints */
+  const uint checkpoint_min_activity= 2*1024*1024;
+  /*
+    If the interval could be changed by the user while we are in this thread,
+    it could be annoying: for example it could cause "case 2" to be executed
+    right after "case 0", thus having 'dfile' unset. So the thread cares only
+    about the interval's value when it started.
+  */
+  const ulong interval= (ulong)arg;
+  uint sleeps, sleep_time;
+  TRANSLOG_ADDRESS log_horizon_at_last_checkpoint=
+    translog_get_horizon();
+  ulonglong pagecache_flushes_at_last_checkpoint=
+    maria_pagecache->global_cache_write;
+  uint pages_bunch_size;
+  struct st_filter_param filter_param;
+  PAGECACHE_FILE *dfile; /**< data file currently being flushed */
+  PAGECACHE_FILE *kfile; /**< index file currently being flushed */
+  LINT_INIT(kfile);
+  LINT_INIT(dfile);
+  LINT_INIT(pages_bunch_size);
+
+  my_thread_init();
+  DBUG_PRINT("info",("Maria background checkpoint thread starts"));
+  DBUG_ASSERT(interval > 0);
+
+  /*
+    Recovery ended with all tables closed and a checkpoint: no need to take
+    one immediately.
+  */
+  sleeps= 1;
+  pages_to_flush_before_next_checkpoint= 0;
+
+  for(;;) /* iterations of checkpoints and dirty page flushing */
+  {
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+    sleeps=0;
+#endif
+    switch (sleeps % interval)
+    {
+    case 0:
+      /*
+        With background flushing evenly distributed over the time
+        between two checkpoints, we should have only little flushing to do
+        in the checkpoint.
+      */
+      /*
+        No checkpoint if little work of interest for recovery was done
+        since last checkpoint. Such work includes log writing (lengthens
+        recovery, checkpoint would shorten it), page flushing (checkpoint
+        would decrease the amount of read pages in recovery).
+        In case of one short statement per minute (very low load), we don't
+        want to checkpoint every minute, hence the positive
+        checkpoint_min_activity.
+      */
+      if (((translog_get_horizon() - log_horizon_at_last_checkpoint) +
+           (maria_pagecache->global_cache_write -
+            pagecache_flushes_at_last_checkpoint) *
+           maria_pagecache->block_size) < checkpoint_min_activity)
+      {
+        /* don't take checkpoint, so don't know what to flush */
+        pages_to_flush_before_next_checkpoint= 0;
+        sleep_time= interval;
+        break;
+      }
+      sleep_time= 1;
+      ma_checkpoint_execute(CHECKPOINT_MEDIUM, TRUE);
+      /*
+        Snapshot this kind of "state" of the engine. Note that the value below
+        is possibly greater than last_checkpoint_lsn.
+      */
+      log_horizon_at_last_checkpoint= translog_get_horizon();
+      pagecache_flushes_at_last_checkpoint=
+        maria_pagecache->global_cache_write;
+      /*
+        If the checkpoint above succeeded it has set d|kfiles and
+        d|kfiles_end. If it has failed, it has set
+        pages_to_flush_before_next_checkpoint to 0 so we will skip flushing
+        and sleep until the next checkpoint.
+      */
+      break;
+    case 1:
+      /* set up flushing parameters (never reached if interval == 1) */
+      filter_param.up_to_lsn= last_checkpoint_lsn;
+      pages_bunch_size= pages_to_flush_before_next_checkpoint / interval;
+      dfile= dfiles;
+      kfile= kfiles;
+      /* fall through */
+    default:
+      if (pages_bunch_size > 0)
+      {
+        DBUG_PRINT("checkpoint",
+                   ("Maria background checkpoint thread: %u pages",
+                    pages_bunch_size));
+        /* flush a bunch of dirty pages */
+        filter_param.max_pages= pages_bunch_size;
+        while (dfile != dfiles_end)
+        {
+          /*
+            We use FLUSH_KEEP_LAZY: if a file is already in flush, it's
+            smarter to move to the next file than wait for this one to be
+            completely flushed, which may take long.
+            StaleFilePointersInFlush: notice how below we use "dfile" which
+            is an OS file descriptor plus some function and MARIA_SHARE
+            pointers; this data dates from a previous checkpoint; since then,
+            the table may have been closed (so MARIA_SHARE* became stale), and
+            the file descriptor reassigned to another table which does not
+            have the same CRC-read-set callbacks: it is thus important that
+            flush_pagecache_blocks_with_filter() does not use the pointers,
+            only the OS file descriptor.
+          */
+          int res=
+            flush_pagecache_blocks_with_filter(maria_pagecache,
+                                               dfile, FLUSH_KEEP_LAZY,
+                                               filter_flush_file_evenly,
+                                               &filter_param);
+          if (unlikely(res & PCFLUSH_ERROR))
+            ma_message_no_user(0, "background data page flush failed");
+          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+            break; /* and we will continue with the same file */
+          dfile++; /* otherwise all this file is flushed, move to next file */
+          /*
+            MikaelR noted that he observed that Linux's file cache may never
+            fsync to  disk until this cache is full, at which point it decides
+            to empty the cache, making the machine very slow. A solution was
+            to fsync after writing 2 MB. So we might want to fsync() here if
+            we wrote enough pages.
+          */
+        }
+        while (kfile != kfiles_end)
+        {
+          int res=
+            flush_pagecache_blocks_with_filter(maria_pagecache,
+                                               kfile, FLUSH_KEEP_LAZY,
+                                               filter_flush_file_evenly,
+                                               &filter_param);
+          if (unlikely(res & PCFLUSH_ERROR))
+            ma_message_no_user(0, "background index page flush failed");
+          if (filter_param.max_pages == 0) /* bunch all flushed, sleep */
+            break; /* and we will continue with the same file */
+          kfile++; /* otherwise all this file is flushed, move to next file */
+        }
+        sleep_time= 1;
+      }
+      else
+      {
+        /* Can directly sleep until the next checkpoint moment */
+        sleep_time= interval - (sleeps % interval);
+      }
+    }
+    if (my_service_thread_sleep(&checkpoint_control,
+                                sleep_time * 1000000000ULL))
+      break;
+    sleeps+= sleep_time;
+  }
+  DBUG_PRINT("info",("Maria background checkpoint thread ends"));
+  {
+    CHECKPOINT_LEVEL level= CHECKPOINT_FULL;
+    /*
+      That's the final one, which guarantees that a clean shutdown always ends
+      with a checkpoint.
+    */
+    DBUG_EXECUTE_IF("maria_checkpoint_indirect", level= CHECKPOINT_INDIRECT;);
+    ma_checkpoint_execute(level, FALSE);
+  }
+  my_service_thread_signal_end(&checkpoint_control);
+  my_thread_end();
+  return 0;
+}
+
+
+/**
+   @brief Allocates buffer and stores in it some info about open tables,
+   does some flushing on those.
+
+   Does the allocation because the caller cannot know the size itself.
+   Memory freeing is to be done by the caller (if the "str" member of the
+   LEX_STRING is not NULL).
+   The caller is taking a checkpoint.
+
+   The function works in three phases:
+   - under THR_LOCK_maria, count and then "pin" (MARIA_CHECKPOINT_LOOKS_AT_ME)
+     every distinct transactional, non-temporary, writable share;
+   - without THR_LOCK_maria, for each pinned share: store its id, LSN and
+     name in the buffer, write its state, flush its bitmap and pages, sync
+     its files, then release the pin;
+   - on error, release the pins which are still held.
+
+   @param[out]  str        pointer to where the allocated buffer,
+                           and its size, will be put; buffer will be filled
+                           with info about open tables
+   @param       checkpoint_start_log_horizon  Of the in-progress checkpoint
+                                              record.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static int collect_tables(LEX_STRING *str, LSN checkpoint_start_log_horizon)
+{
+  MARIA_SHARE **distinct_shares= NULL;
+  char *ptr;
+  uint error= 1, sync_error= 0, nb, nb_stored, i;
+  my_bool unmark_tables= TRUE;
+  uint total_names_length;
+  LIST *pos; /**< to iterate over open tables */
+  struct st_state_copy {
+    uint index;
+    MARIA_STATE_INFO state;
+  };
+  struct st_state_copy *state_copies= NULL, /**< fixed-size cache of states */
+    *state_copies_end, /**< cache ends here */
+    *state_copy; /**< iterator in cache */
+  TRANSLOG_ADDRESS state_copies_horizon; /**< horizon of states' _copies_ */
+  struct st_filter_param filter_param;
+  PAGECACHE_FLUSH_FILTER filter;
+  DBUG_ENTER("collect_tables");
+
+  LINT_INIT(state_copies_horizon);
+  /* let's make a list of distinct shares */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (nb= 0, pos= maria_open_list; pos; pos= pos->next)
+  {
+    MARIA_HA *info= (MARIA_HA*)pos->data;
+    MARIA_SHARE *share= info->s;
+    /* the first three variables below can never change */
+    if (share->base.born_transactional && !share->temporary &&
+        share->mode != O_RDONLY &&
+        !(share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP))
+    {
+      /*
+        Apart from us, only maria_close() reads/sets in_checkpoint but cannot
+        run now as we hold THR_LOCK_maria.
+      */
+      /*
+        This table is relevant for checkpoint and not already seen. Mark it,
+        so that it is not seen again in the loop.
+      */
+      nb++;
+      DBUG_ASSERT(share->in_checkpoint == 0);
+      /* This flag ensures that we count only _distinct_ shares. */
+      share->in_checkpoint= MARIA_CHECKPOINT_SEEN_IN_LOOP;
+    }
+  }
+  if (unlikely((distinct_shares=
+                (MARIA_SHARE **)my_malloc(nb * sizeof(MARIA_SHARE *),
+                                          MYF(MY_WME))) == NULL))
+  {
+    /*
+      We still hold THR_LOCK_maria and have no array to walk, so the generic
+      cleanup at "err" (which locks THR_LOCK_maria and reads
+      distinct_shares[]) must not run. Take down the marks set by the loop
+      above through the open list instead, so that the next checkpoint
+      counts these shares again, then release the mutex ourselves.
+    */
+    for (pos= maria_open_list; pos; pos= pos->next)
+    {
+      MARIA_SHARE *share= ((MARIA_HA*)pos->data)->s;
+      share->in_checkpoint&= ~MARIA_CHECKPOINT_SEEN_IN_LOOP;
+    }
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    unmark_tables= FALSE;
+    goto err;
+  }
+  for (total_names_length= 0, i= 0, pos= maria_open_list; pos; pos= pos->next)
+  {
+    MARIA_HA *info= (MARIA_HA*)pos->data;
+    MARIA_SHARE *share= info->s;
+    if (share->in_checkpoint & MARIA_CHECKPOINT_SEEN_IN_LOOP)
+    {
+      distinct_shares[i++]= share;
+      /*
+        With this we prevent the share from going away while we later flush
+        and force it without holding THR_LOCK_maria. For example if the share
+        could be my_free()d by maria_close() we would have a problem when we
+        access it to flush the table. We "pin" the share pointer.
+        And we also take down MARIA_CHECKPOINT_SEEN_IN_LOOP, so that it is
+        not seen again in the loop.
+      */
+      share->in_checkpoint= MARIA_CHECKPOINT_LOOKS_AT_ME;
+      total_names_length+= share->open_file_name.length;
+    }
+  }
+
+  DBUG_ASSERT(i == nb);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_PRINT("info",("found %u table shares", nb));
+
+  str->length=
+    4 +               /* number of tables */
+    (2 +              /* short id */
+     LSN_STORE_SIZE + /* first_log_write_at_lsn */
+     1                /* end-of-name 0 */
+     ) * nb + total_names_length;
+  if (unlikely((str->str= my_malloc(str->length, MYF(MY_WME))) == NULL))
+    goto err;
+
+  ptr= str->str;
+  ptr+= 4; /* real number of stored tables is not yet known */
+
+  /* only possible checkpointer, so can do the read below without mutex */
+  filter_param.up_to_lsn= last_checkpoint_lsn;
+  switch(checkpoint_in_progress)
+  {
+  case CHECKPOINT_MEDIUM:
+    filter= &filter_flush_file_medium;
+    break;
+  case CHECKPOINT_FULL:
+    filter= &filter_flush_file_full;
+    break;
+  case CHECKPOINT_INDIRECT:
+    filter= NULL;
+    break;
+  default:
+    DBUG_ASSERT(0);
+    goto err;
+  }
+
+  /*
+    The principle of reading/writing the state below is explained in
+    ma_recovery.c, look for "Recovery of the state".
+  */
+#define STATE_COPIES 1024
+  state_copies= (struct st_state_copy *)
+    my_malloc(STATE_COPIES * sizeof(struct st_state_copy), MYF(MY_WME));
+  dfiles= (PAGECACHE_FILE *)my_realloc((uchar *)dfiles,
+                                       /* avoid size of 0 for my_realloc */
+                                       max(1, nb) * sizeof(PAGECACHE_FILE),
+                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+  kfiles= (PAGECACHE_FILE *)my_realloc((uchar *)kfiles,
+                                       /* avoid size of 0 for my_realloc */
+                                       max(1, nb) * sizeof(PAGECACHE_FILE),
+                                       MYF(MY_WME | MY_ALLOW_ZERO_PTR));
+  if (unlikely((state_copies == NULL) ||
+               (dfiles == NULL) || (kfiles == NULL)))
+    goto err;
+  state_copy= state_copies_end= NULL;
+  dfiles_end= dfiles;
+  kfiles_end= kfiles;
+
+  for (nb_stored= 0, i= 0; i < nb; i++)
+  {
+    MARIA_SHARE *share= distinct_shares[i];
+    PAGECACHE_FILE kfile, dfile;
+    my_bool ignore_share;
+    if (!(share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+    {
+      /*
+        No need for a mutex to read the above, only us can write *this* bit of
+        the in_checkpoint bitmap
+      */
+      continue;
+    }
+    /**
+       @todo We should not look at tables which didn't change since last
+       checkpoint.
+    */
+    DBUG_PRINT("info",("looking at table '%s'", share->open_file_name.str));
+    if (state_copy == state_copies_end) /* we have no more cached states */
+    {
+      /*
+        Collect and cache a bunch of states. We do this for many states at a
+        time, to not lock/unlock the log's lock too often.
+      */
+      uint j, bound= min(nb, i + STATE_COPIES);
+      state_copy= state_copies;
+      /* part of the state is protected by log's lock */
+      translog_lock();
+      state_copies_horizon= translog_get_horizon_no_lock();
+      for (j= i; j < bound; j++)
+      {
+        MARIA_SHARE *share2= distinct_shares[j];
+        if (!(share2->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME))
+          continue;
+        state_copy->index= j;
+        state_copy->state= share2->state; /* we copy the state */
+        state_copy++;
+        /*
+          data_file_length is not updated under log's lock by the bitmap
+          code, but writing a wrong data_file_length is ok: a next
+          maria_close() will correct it; if we crash before, Recovery will
+          set it to the true physical size.
+        */
+      }
+      translog_unlock();
+      /**
+         We are going to flush these states.
+         Before, all records describing how to undo such state must be
+         in the log (WAL). Usually this means UNDOs. In the special case of
+         data|key_file_length, recovery just needs to open the table to fix the
+         length, so any LOGREC_FILE_ID/REDO/UNDO allowing recovery to
+         understand it must open a table, is enough; so as long as
+         data|key_file_length is updated after writing any log record it's ok:
+         if we copied new value above, it means the record was before
+         state_copies_horizon and we flush such record below.
+         Apart from data|key_file_length which are easily recoverable from the
+         real file's size, all other state members must be updated only when
+         writing the UNDO; otherwise, if updated before, if their new value is
+         flushed by a checkpoint and there is a crash before UNDO is written,
+         their REDO group will be missing or at least incomplete and skipped
+         by recovery, so bad state value will stay. For example, setting
+         key_root before writing the UNDO: the table would have old index
+         pages (they were pinned at time of crash) and a new, thus wrong,
+         key_root.
+         @todo RECOVERY BUG check that all code honours that.
+      */
+      if (translog_flush(state_copies_horizon))
+        goto err;
+      /* now we have cached states and they are WAL-safe*/
+      state_copies_end= state_copy;
+      state_copy= state_copies;
+    }
+
+    /* locate our state among these cached ones */
+    for ( ; state_copy->index != i; state_copy++)
+      DBUG_ASSERT(state_copy < state_copies_end);
+
+    /* OS file descriptors are ints which we stored in 4 bytes */
+    compile_time_assert(sizeof(int) <= 4);
+    /*
+      Protect against maria_close() (which does some memory freeing in
+      MARIA_FILE_BITMAP) with close_lock. intern_lock is not
+      sufficient as we, as well as maria_close(), are going to unlock
+      intern_lock in the middle of manipulating the table. Serializing us and
+      maria_close() should help avoid problems.
+    */
+    pthread_mutex_lock(&share->close_lock);
+    pthread_mutex_lock(&share->intern_lock);
+    /*
+      Tables in a normal state have their two file descriptors open.
+      In some rare cases like REPAIR, some descriptor may be closed or even
+      -1. If that happened, the _ma_state_info_write() may fail. This is
+      prevented by enclosing all places which close/change kfile.file with
+      intern_lock.
+    */
+    kfile= share->kfile;
+    dfile= share->bitmap.file;
+    /*
+      Ignore table which has no logged writes (all its future log records will
+      be found naturally by Recovery). Ignore obsolete shares (_before_
+      setting themselves to last_version=0 they already did all flush and
+      sync; if we flush their state now we may be flushing an obsolete state
+      onto a newer one (assuming the table has been reopened with a different
+      share but of course same physical index file).
+    */
+    ignore_share= (share->id == 0) | (share->last_version == 0);
+    DBUG_PRINT("info", ("ignore_share: %d", ignore_share));
+    if (!ignore_share)
+    {
+      uint open_file_name_len= share->open_file_name.length + 1;
+      /* remember the descriptors for background flush */
+      *(dfiles_end++)= dfile;
+      *(kfiles_end++)= kfile;
+      /* we will store this table in the record */
+      nb_stored++;
+      int2store(ptr, share->id);
+      ptr+= 2;
+      lsn_store(ptr, share->lsn_of_file_id);
+      ptr+= LSN_STORE_SIZE;
+      /*
+        first_bitmap_with_space is not updated under log's lock, and is
+        important. We would need the bitmap's lock to get it right. Recovery
+        of this is not clear, so we just play safe: write it out as
+        unknown: if crash, _ma_bitmap_init() at next open (for example in
+        Recovery) will convert it to 0 and thus the first insertion will
+        search for free space from the file's first bitmap (0) -
+        under-optimal but safe.
+        If no crash, maria_close() will write the exact value.
+      */
+      state_copy->state.first_bitmap_with_space= ~(ulonglong)0;
+      memcpy(ptr, share->open_file_name.str, open_file_name_len);
+      ptr+= open_file_name_len;
+      if (cmp_translog_addr(share->state.is_of_horizon,
+                            checkpoint_start_log_horizon) >= 0)
+      {
+        /*
+          State was flushed recently, it does not hold down the log's
+          low-water mark and will not give avoidable work to Recovery. So we
+          needn't flush it. Also, it is possible that while we copied the
+          state above (under log's lock, without intern_lock) it was being
+          modified in memory or flushed to disk (without log's lock, under
+          intern_lock, like in maria_extra()), so our copy may be incorrect
+          and we should not flush it.
+          It may also be a share which got last_version==0 since we checked
+          last_version; in this case, it flushed its state and the LSN test
+          above will catch it.
+        */
+      }
+      else
+      {
+        /*
+          We could do the state flush only if share->changed, but it's
+          tricky.
+          Consider a maria_write() which has written REDO,UNDO, and before it
+          calls _ma_writeinfo() (setting share->changed=1), checkpoint
+          happens and sees share->changed=0, does not flush state. It is
+          possible that Recovery does not start from before the REDO and thus
+          the state is not recovered. A solution may be to set
+          share->changed=1 under log mutex when writing log records.
+          But as anyway we have another problem below, this optimization would
+          be of little use.
+        */
+        /** @todo flush state only if changed since last checkpoint */
+        DBUG_ASSERT(share->last_version != 0);
+        state_copy->state.is_of_horizon= share->state.is_of_horizon=
+          state_copies_horizon;
+        if (kfile.file >= 0)
+          sync_error|=
+            _ma_state_info_write_sub(kfile.file, &state_copy->state,
+                                     MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
+        /*
+          We don't set share->changed=0 because it may interfere with a
+          concurrent _ma_writeinfo() doing share->changed=1 (cancel its
+          effect). The sad consequence is that we will flush the same state at
+          each checkpoint if the table was once written and then not anymore.
+        */
+      }
+    }
+    /*
+      _ma_bitmap_flush_all() may wait, so don't keep intern_lock as
+      otherwise this would deadlock with allocate_and_write_block_record()
+      calling _ma_set_share_data_file_length()
+    */
+    pthread_mutex_unlock(&share->intern_lock);
+
+    if (!ignore_share)
+    {
+      /*
+        share->bitmap is valid because it's destroyed under close_lock which
+        we hold.
+      */
+      if (_ma_bitmap_flush_all(share))
+      {
+        sync_error= 1;
+        /** @todo all write failures should mark table corrupted */
+        ma_message_no_user(0, "checkpoint bitmap page flush failed");
+      }
+      DBUG_ASSERT(share->pagecache == maria_pagecache);
+    }
+    /*
+      Clean up any unused states.
+      TODO: Only do this call if there has been # (10?) ended transactions
+      since last call.
+      We had to release intern_lock to respect lock order with LOCK_trn_list.
+    */
+    _ma_remove_not_visible_states_with_lock(share, FALSE);
+
+    if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+    {
+      /*
+        maria_close() left us free the share. When it ran it set share->id
+        to 0. As it ran before we locked close_lock, we should have seen this
+        and so this assertion should be true:
+      */
+      DBUG_ASSERT(ignore_share);
+      pthread_mutex_destroy(&share->intern_lock);
+      pthread_mutex_unlock(&share->close_lock);
+      pthread_mutex_destroy(&share->close_lock);
+      my_free((uchar *)share, MYF(0));
+    }
+    else
+    {
+      /* share goes back to normal state */
+      share->in_checkpoint= 0;
+      pthread_mutex_unlock(&share->close_lock);
+    }
+    /*
+      From now on the share is not ours anymore: either we just freed it, or
+      it is unpinned and maria_close() is allowed to free it at any time.
+      Forget the pointer so that the cleanup at "err" (reached if a later
+      share fails, or because of sync_error) does not touch it again.
+    */
+    distinct_shares[i]= NULL;
+
+    /*
+      We do the big disk writes out of intern_lock to not block other
+      users of this table (intern_lock is taken at the start and end of
+      every statement). This means that file descriptors may be invalid
+      (files may have been closed for example by HA_EXTRA_PREPARE_FOR_*
+      under Windows, or REPAIR). This should not be a problem as we use
+      MY_IGNORE_BADFD. Descriptors may even point to other files but then
+      the old blocks (of before the close) must have been flushed for sure,
+      so our flush will flush new blocks (of after the latest open) and that
+      should do no harm.
+    */
+    /*
+      If CHECKPOINT_MEDIUM, this big flush below may result in a
+      serious write burst. Realize that all pages dirtied between the
+      last checkpoint and the one we are doing now, will be flushed at
+      next checkpoint, except those evicted by LRU eviction (depending on
+      the size of the page cache compared to the size of the working data
+      set, eviction may be rare or frequent).
+      We avoid that burst by anticipating: those pages are flushed
+      in bunches spanned regularly over the time interval between now and
+      the next checkpoint, by a background thread. Thus the next checkpoint
+      will have only little flushing to do (CHECKPOINT_MEDIUM should thus be
+      only a little slower than CHECKPOINT_INDIRECT).
+    */
+
+    /*
+      PageCacheFlushConcurrencyBugs
+      Inside the page cache, calls to flush_pagecache_blocks_int() on the same
+      file are serialized. Examples of concurrency bugs which happened when we
+      didn't have this serialization:
+      - maria_chk_size() (via CHECK TABLE) happens concurrently with
+      Checkpoint: Checkpoint is flushing a page: it pins the page and is
+      pre-empted, maria_chk_size() wants to flush this page too so gets an
+      error because Checkpoint pinned this page. Such error makes
+      maria_chk_size() mark the table as corrupted.
+      - maria_close() happens concurrently with Checkpoint:
+      Checkpoint is flushing a page: it registers a request on the page, is
+      pre-empted ; maria_close() flushes this page too with FLUSH_RELEASE:
+      FLUSH_RELEASE will cause a free_block() which assumes the page is in the
+      LRU, but it is not (as Checkpoint registered a request). Crash.
+      - one thread is evicting a page of the file out of the LRU: it marks it
+      iPC_BLOCK_IN_SWITCH and is pre-empted. Then two other threads do flushes
+      of the same file concurrently (like above). Then one flusher sees the
+      page is in switch, removes it from changed_blocks[] and puts it in its
+      first_in_switch, so the other flusher will not see the page at all and
+      return too early. If it's maria_close() which returns too early, then
+      maria_close() may close the file descriptor, and the other flusher, and
+      the evicter will fail to write their page: corruption.
+    */
+
+    if (!ignore_share)
+    {
+      if (filter != NULL)
+      {
+        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+                                                &dfile, FLUSH_KEEP_LAZY,
+                                                filter, &filter_param) &
+             PCFLUSH_ERROR))
+          ma_message_no_user(0, "checkpoint data page flush failed");
+        if ((flush_pagecache_blocks_with_filter(maria_pagecache,
+                                                &kfile, FLUSH_KEEP_LAZY,
+                                                filter, &filter_param) &
+             PCFLUSH_ERROR))
+          ma_message_no_user(0, "checkpoint index page flush failed");
+      }
+      /*
+        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync
+        per second, so if you have touched 1000 files it's 7 seconds).
+      */
+      sync_error|=
+        my_sync(dfile.file, MYF(MY_WME | MY_IGNORE_BADFD)) |
+        my_sync(kfile.file, MYF(MY_WME | MY_IGNORE_BADFD));
+      /*
+        in case of error, we continue because writing other tables to disk is
+        still useful.
+      */
+    }
+  }
+
+  if (sync_error)
+    goto err;
+  /* We maybe over-estimated (due to share->id==0 or last_version==0) */
+  DBUG_ASSERT(str->length >= (uint)(ptr - str->str));
+  str->length= (uint)(ptr - str->str);
+  /*
+    As we support max 65k tables open at a time (2-byte short id), we
+    assume uint is enough for the cumulated length of table names; and
+    LEX_STRING::length is uint.
+  */
+  int4store(str->str, nb_stored);
+  error= unmark_tables= 0;
+
+err:
+  if (unlikely(unmark_tables))
+  {
+    /* maria_close() uses THR_LOCK_maria from start to end */
+    pthread_mutex_lock(&THR_LOCK_maria);
+    for (i= 0; i < nb; i++)
+    {
+      MARIA_SHARE *share= distinct_shares[i];
+      if (share == NULL)
+        continue; /* already released (maybe freed) by the main loop */
+      if (share->in_checkpoint & MARIA_CHECKPOINT_SHOULD_FREE_ME)
+      {
+        /*
+          maria_close() left us to free the share. Destroy both mutexes, as
+          the main loop and maria_close() do when they free a share.
+          NOTE(review): unlike the main loop we do not hold close_lock here;
+          maria_close() releases THR_LOCK_maria before intern_lock and
+          close_lock, so it may still hold them at this point - confirm
+          whether close_lock should be taken first.
+        */
+        pthread_mutex_destroy(&share->intern_lock);
+        pthread_mutex_destroy(&share->close_lock);
+        my_free((uchar *)share, MYF(0));
+      }
+      else
+      {
+        /* share goes back to normal state */
+        share->in_checkpoint= 0;
+      }
+    }
+    pthread_mutex_unlock(&THR_LOCK_maria);
+  }
+  my_free((uchar *)distinct_shares, MYF(MY_ALLOW_ZERO_PTR));
+  my_free((uchar *)state_copies, MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_RETURN(error);
+}
diff --git a/storage/maria/ma_checkpoint.h b/storage/maria/ma_checkpoint.h
new file mode 100644
index 00000000000..126f8111a23
--- /dev/null
+++ b/storage/maria/ma_checkpoint.h
@@ -0,0 +1,92 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3071 Maria checkpoint
+  First version written by Guilhem Bichot on 2006-04-27.
+  Does not compile yet.
+*/
+
+/* This is the interface of this module. */
+
+typedef enum enum_ma_checkpoint_level {
+  /* zero value, none of the three levels below */
+  CHECKPOINT_NONE= 0,
+  /* lightest level: write dirty_pages, transactions table and sync files */
+  CHECKPOINT_INDIRECT= 1,
+  /* as above, plus flush the pages which were already dirty at the
+     previous checkpoint */
+  CHECKPOINT_MEDIUM= 2,
+  /* as above, plus flush every dirty page */
+  CHECKPOINT_FULL= 3
+} CHECKPOINT_LEVEL;
+
+C_MODE_START /* NOTE(review): presumably opens a C-linkage section - confirm */
+int ma_checkpoint_init(ulong interval); /* defined outside this header;
+                                           NOTE(review): meaning and unit of
+                                           'interval' to be confirmed there */
+void ma_checkpoint_end(void); /* NOTE(review): presumably the counterpart of
+                                 ma_checkpoint_init() - confirm */
+int ma_checkpoint_execute(CHECKPOINT_LEVEL level, my_bool no_wait); /* takes
+                                 one of the CHECKPOINT_LEVEL values above */
+C_MODE_END
+
+/**
+   @brief reads some LSNs with special trickery
+
+   If a 64-bit variable transitions between both halves being zero to both
+   halves being non-zero, and back, this function can be used to do a read of
+   it (without mutex, without atomic load) which always produces a correct
+   (though maybe slightly old) value (even on 32-bit CPUs). The value is at
+   least as new as the latest mutex unlock done by the calling thread.
+   The assumption is that the system sets both 4-byte halves either at the
+   same time, or one after the other (in any order), but NOT some bytes of the
+   first half then some bytes of the second half then the rest of bytes of the
+   first half. With this assumption, the function can detect when it is
+   seeing an inconsistent value.
+
+   @param x                the LSN variable to read. Pass the variable
+                           itself (an lvalue), not a pointer to it: the
+                           32-bit variant takes its address.
+
+   @return LSN part (most significant byte always 0)
+*/
+#if ( SIZEOF_CHARP >= 8 )
+/* 64-bit CPU, 64-bit reads are atomic */
+#define lsn_read_non_atomic LSN_WITH_FLAGS_TO_LSN
+#else
+static inline LSN lsn_read_non_atomic_32(const volatile LSN *x)
+{
+  /*
+    32-bit CPU, 64-bit reads may give a mixed of old half and new half (old
+    low bits and new high bits, or the contrary).
+  */
+  for (;;) /* loop until no atomicity problems */
+  {
+    /*
+      Remove most significant byte in case this is a LSN_WITH_FLAGS object.
+      Those flags in TRN::first_undo_lsn break the condition on transitions so
+      they must be removed below.
+    */
+    LSN y= LSN_WITH_FLAGS_TO_LSN(*x);
+    /* a torn read is neither LSN_IMPOSSIBLE nor valid: read again */
+    if (likely((y == LSN_IMPOSSIBLE) || LSN_VALID(y)))
+      return y;
+  }
+}
+/*
+  The argument is parenthesized so that '&' applies to the whole expression
+  passed by the caller, whatever operators it contains.
+*/
+#define lsn_read_non_atomic(x) lsn_read_non_atomic_32(&(x))
+#endif
+
+/**
+   prints a message from a task not connected to any user (checkpoint
+   and recovery for example).
+
+   Expands to a my_printf_error() call with error code HA_ERR_GENERIC; the
+   text is prefixed with "Aria engine: ". The sentence is passed as the
+   argument of a "%s" conversion, so it is not itself used as a format
+   string.
+
+   @param  level           0 if error, ME_JUST_WARNING if warning,
+                           ME_JUST_INFO if info; it is wrapped in MYF()
+   @param  sentence        text to write
+*/
+#define ma_message_no_user(level, sentence)                               \
+  my_printf_error(HA_ERR_GENERIC, "Aria engine: %s", MYF(level), sentence)
diff --git a/storage/maria/ma_checksum.c b/storage/maria/ma_checksum.c
new file mode 100644
index 00000000000..61ec638053a
--- /dev/null
+++ b/storage/maria/ma_checksum.c
@@ -0,0 +1,89 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Calculate a checksum for a row */
+
+#include "maria_def.h"
+
+/**
+   Compute the checksum of one record
+
+   _ma_checksum()
+   @param info		Maria handler
+   @param record	Record, in the table's in-memory row layout
+
+   @note
+     Fields are visited through share->column_nr[], i.e. in the original
+     field order, so that the result does not depend on the row format.
+     The null bytes (if any) are summed first; fields whose null bit is set
+     are skipped. For blobs and varchars only the data actually used is
+     summed, not the length prefix.
+
+   @return  checksum
+*/
+
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *columns= share->columndef;
+  uint16 *field_order= share->column_nr;
+  ha_checksum sum= 0;
+  uint field, field_count;
+
+  if (share->base.null_bytes)
+    sum= my_checksum(sum, record, share->base.null_bytes);
+
+  field_count= share->base.fields;
+  for (field= 0 ; field < field_count ; field++)
+  {
+    MARIA_COLUMNDEF *def= columns + field_order[field];
+    const uchar *data;
+    ulong data_length;
+
+    if (record[def->null_pos] & def->null_bit)
+      continue;                                 /* Field is NULL */
+
+    data= record + def->offset;
+
+    if (def->type == FIELD_BLOB)
+    {
+      /* The record holds the blob length followed by a pointer to the data */
+      uint size_bytes= def->length- portable_sizeof_char_ptr;
+      data_length= _ma_calc_blob_length(size_bytes, data);
+      if (data_length)
+      {
+        memcpy((char*) &data, data + size_bytes, sizeof(char*));
+        sum= my_checksum(sum, data, data_length);
+      }
+      continue;                                 /* Empty blobs add nothing */
+    }
+
+    if (def->type == FIELD_VARCHAR)
+    {
+      /* Length is stored in front of the data, in 1 or 2 bytes */
+      uint pack_length= def->fill_length;
+      if (pack_length == 1)
+        data_length= (ulong) *data;
+      else
+        data_length= uint2korr(data);
+      data+= pack_length;                       /* Step over the length */
+    }
+    else
+      data_length= def->length;                 /* Fixed size field */
+
+    sum= my_checksum(sum, data, data_length);
+  }
+  return sum;
+}
+
+
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *pos)
+{
+  /* Checksum of the base.reclength bytes starting at pos, seeded with 0 */
+  MARIA_SHARE *share= info->s;
+
+  return my_checksum(0, pos, share->base.reclength);
+}
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
new file mode 100644
index 00000000000..df525d45d14
--- /dev/null
+++ b/storage/maria/ma_close.c
@@ -0,0 +1,208 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* close a isam-database */
+/*
+  TODO:
+   We need to have a separate mutex on the closed file to allow other threads
+   to open other files during the time we flush the cache and close this file
+*/
+
+#include "maria_def.h"
+
+int maria_close(register MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool free_share_here= FALSE;
+  int last_close;
+  int close_error= 0;
+  DBUG_ENTER("maria_close");
+  DBUG_PRINT("enter",("base: 0x%lx  reopen: %u  locks: %u",
+                      (long) info, (uint) share->reopen,
+                      (uint) share->tot_locks));
+
+  /* Key delete-links must have been unlocked by now */
+  DBUG_ASSERT(info->key_del_used == 0);
+
+  /*
+    Lock order of this function: THR_LOCK_maria, then share->close_lock,
+    then share->intern_lock.
+  */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (info->lock_type == F_EXTRA_LCK)
+    info->lock_type= F_UNLCK;                   /* HA_EXTRA_NO_USER_CHANGE */
+
+  if (share->reopen == 1 && share->kfile.file >= 0)
+    _ma_decrement_open_count(info);
+
+  if (info->lock_type != F_UNLCK && maria_lock_database(info, F_UNLCK))
+    close_error= my_errno;
+
+  pthread_mutex_lock(&share->close_lock);
+  pthread_mutex_lock(&share->intern_lock);
+
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    share->r_locks--;
+    share->tot_locks--;
+  }
+  if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+  {
+    if (end_io_cache(&info->rec_cache))
+      close_error= my_errno;
+    info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  }
+
+  /* One handler less on this share; are we the last one? */
+  share->reopen--;
+  last_close= (share->reopen == 0);
+  maria_open_list= list_delete(maria_open_list, &info->open_list);
+
+  my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+  (*share->end)(info);
+
+  if (last_close)
+  {
+    /* Nobody else uses the share: flush everything and tear it down */
+
+    /* The transaction must not keep any pointer to this share */
+    DBUG_ASSERT(share->in_trans == 0);
+
+    if (share->kfile.file >= 0)
+    {
+      if ((*share->once_end)(share))
+        close_error= my_errno;
+      if (flush_pagecache_blocks(share->pagecache, &share->kfile,
+                                 ((share->temporary || share->deleting) ?
+                                  FLUSH_IGNORE_CHANGED :
+                                  FLUSH_RELEASE)))
+        close_error= my_errno;
+#ifdef HAVE_MMAP
+      if (share->file_map)
+        _ma_unmap_file(info);
+#endif
+      /*
+        The state was not written when the table was unlocked, so it is
+        written here if a transactional table changed. For a crashed table
+        writing is safe too, as it cannot alter the crashed state.
+        In other cases the state can NOT be written, as other threads
+        may be using the file at this point
+        IF using --external-locking, which does not apply to Maria.
+      */
+      if (((share->changed && share->base.born_transactional) ||
+           maria_is_crashed(info)) &&
+          _ma_state_info_write(share, MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
+        close_error= my_errno;
+      /*
+        The file leaves maria_open_list, so future Checkpoints will not know
+        about it: it has to be synced now.
+      */
+      if (share->now_transactional && my_sync(share->kfile.file, MYF(MY_WME)))
+        close_error= my_errno;
+      if (my_close(share->kfile.file, MYF(0)))
+        close_error= my_errno;
+    }
+#ifdef THREAD
+    thr_lock_delete(&share->lock);
+    (void) pthread_mutex_destroy(&share->key_del_lock);
+    {
+      int key_no, key_count;
+      key_count= share->state.header.keys;
+      VOID(rwlock_destroy(&share->mmap_lock));
+      for (key_no= 0; key_no < key_count; key_no++)
+      {
+        VOID(rwlock_destroy(&share->keyinfo[key_no].root_lock));
+      }
+    }
+#endif
+    DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+    /*
+      -1 tells a concurrent checkpoint (if there is one) that it has nothing
+      to flush for this file; without a concurrent checkpoint the value is
+      not needed here either.
+    */
+    share->kfile.file= -1;
+
+    /*
+      Remember share->history for future opens
+
+      intern_lock has to be released and then re-taken after
+      LOCK_trn_list (trnman_lock()) to avoid dead locks.
+    */
+    pthread_mutex_unlock(&share->intern_lock);
+    _ma_remove_not_visible_states_with_lock(share, TRUE);
+    pthread_mutex_lock(&share->intern_lock);
+
+    if (share->in_checkpoint & MARIA_CHECKPOINT_LOOKS_AT_ME)
+    {
+      /*
+        Checkpoint holds a pointer to this share: freeing it here would
+        leave Checkpoint with a bad pointer, so ask Checkpoint to free it.
+      */
+      share->in_checkpoint|= MARIA_CHECKPOINT_SHOULD_FREE_ME;
+    }
+    else
+      free_share_here= TRUE;
+
+    if (share->state_history)
+    {
+      /*
+        An out-of-memory condition is ignored here (unlikely). In the worst
+        case what happens is that any transaction that tries to access this
+        table will get a wrong status information.
+      */
+      MARIA_STATE_HISTORY_CLOSED *history=
+        (MARIA_STATE_HISTORY_CLOSED *) my_malloc(sizeof(*history),
+                                                 MYF(MY_WME));
+      if (history)
+      {
+        history->create_rename_lsn= share->state.create_rename_lsn;
+        history->state_history= share->state_history;
+        if (my_hash_insert(&maria_stored_state, (uchar*) history))
+          my_free(history, MYF(0));
+      }
+      /* Marker for concurrent checkpoint */
+      share->state_history= 0;
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  pthread_mutex_unlock(&share->intern_lock);
+  pthread_mutex_unlock(&share->close_lock);
+
+  if (free_share_here)
+  {
+    /*
+      When the share is not freed here, it is because a checkpoint has
+      recorded it and is soon going to look at some of its content
+      (share->in_checkpoint/id/last_version); the checkpoint then frees it.
+    */
+    (void) pthread_mutex_destroy(&share->intern_lock);
+    (void) pthread_mutex_destroy(&share->close_lock);
+    (void) pthread_cond_destroy(&share->key_del_cond);
+    my_free((uchar *)share, MYF(0));
+  }
+
+  my_free(info->ftparser_param, MYF(MY_ALLOW_ZERO_PTR));
+  if (info->dfile.file >= 0)
+  {
+    /*
+      Done outside of any mutex, which would confuse a concurrent
+      Checkpoint. Fortunately in BLOCK_RECORD we close earlier under mutex.
+    */
+    if (my_close(info->dfile.file, MYF(0)))
+      close_error= my_errno;
+  }
+
+  delete_dynamic(&info->pinned_pages);
+  my_free(info, MYF(0));
+
+  if (close_error)
+  {
+    DBUG_PRINT("error", ("Got error on close: %d", my_errno));
+    DBUG_RETURN(my_errno= close_error);
+  }
+  DBUG_RETURN(0);
+} /* maria_close */
diff --git a/storage/maria/ma_commit.c b/storage/maria/ma_commit.c
new file mode 100644
index 00000000000..70bc668a220
--- /dev/null
+++ b/storage/maria/ma_commit.c
@@ -0,0 +1,129 @@
+/* Copyright (C) 2007-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+   Writes a COMMIT record to log, flushes it and commits transaction in
+   memory; a transaction which did no work is rolled back instead.
+
+   @param  trn              transaction
+   @return Operation status
+     @retval 0      ok
+     @retval #      error (disk error or out of memory)
+*/
+
+int ma_commit(TRN *trn)
+{
+  int res;
+  LSN commit_lsn;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS];
+  DBUG_ENTER("ma_commit");
+
+  DBUG_ASSERT(trn->rec_lsn == LSN_IMPOSSIBLE);
+  if (trn->undo_lsn == 0) /* no work done, rollback (cheaper than commit) */
+    DBUG_RETURN(trnman_rollback_trn(trn));
+  /*
+    - if COMMIT record is written before trnman_commit_trn():
+    if Checkpoint comes in the middle it will see trn is not committed,
+    then if crash, Recovery might roll back trn (if min(rec_lsn) is after
+    COMMIT record) and this is not an issue as
+    * transaction's updates were not made visible to other transactions
+    * "commit ok" was not sent to client
+    Alternatively, Recovery might commit trn (if min(rec_lsn) is before COMMIT
+    record), which is ok too. All in all it means that "trn committed" is not
+    100% equal to "COMMIT record written".
+    - if COMMIT record is written after trnman_commit_trn():
+    if crash happens between the two, trn will be rolled back which is an
+    issue (transaction's updates were made visible to other transactions).
+    So we need to go the first way.
+
+    The log write and flush are chained with ||, not |: operands of | may be
+    evaluated in any order, but translog_flush() needs the LSN which
+    translog_write_record() stores in commit_lsn (no flush if write fails).
+  */
+
+  /*
+    We do not store "thd->transaction.xid_state.xid" for now, it will be
+    needed only when we support XA.
+  */
+  res= (translog_write_record(&commit_lsn, LOGREC_COMMIT,
+                             trn, NULL, 0,
+                             sizeof(log_array)/sizeof(log_array[0]),
+                             log_array, NULL, NULL) ||
+        translog_flush(commit_lsn));
+
+  DBUG_EXECUTE_IF("maria_sleep_in_commit",
+                  {
+                    DBUG_PRINT("info", ("maria_sleep_in_commit"));
+                    sleep(3);
+                  });
+  res|= trnman_commit_trn(trn);
+
+  /*
+    Note: if trnman_commit_trn() fails above, the COMMIT record is already
+    written, so Checkpoint and Recovery will see the transaction as committed.
+  */
+  DBUG_RETURN(res);
+}
+
+
+/**
+   Writes a COMMIT record for the transaction associated with a file
+
+   @param  info              Maria handler
+   @retval 0      ok (always the case for a non-transactional table)
+   @retval #      error (disk error or out of memory)
+*/
+
+int maria_commit(MARIA_HA *info)
+{
+  if (!info->s->now_transactional)
+    return 0;
+  return ma_commit(info->trn);
+}
+
+
+/**
+   Starts a transaction on a file handle; a no-op unless the table is
+   transactional, else attaches a new trnman_new_trn() transaction to it.
+
+   @param  info              Maria handler
+
+   @retval 0                  ok
+   @retval HA_ERR_OUT_OF_MEM  no transaction could be created
+
+   @note this can be used only in single-threaded programs (tests),
+   because we create a transaction (trnman_new_trn) with WT_THD=0.
+   XXX it needs to be fixed when we'll start using maria_begin from SQL.
+*/
+
+int maria_begin(MARIA_HA *info)
+{
+  TRN *new_trn;
+  DBUG_ENTER("maria_begin");
+
+  /* Only transactional tables get a transaction object */
+  if (!info->s->now_transactional)
+    DBUG_RETURN(0);
+
+  if (unlikely(!(new_trn= trnman_new_trn(0))))
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+  DBUG_PRINT("info", ("TRN set to 0x%lx", (ulong) new_trn));
+  _ma_set_trn_for_table(info, new_trn);
+  DBUG_RETURN(0);
+}
+
diff --git a/storage/maria/ma_commit.h b/storage/maria/ma_commit.h
new file mode 100644
index 00000000000..2c57c73fd7a
--- /dev/null
+++ b/storage/maria/ma_commit.h
@@ -0,0 +1,18 @@
+/* Copyright (C) 2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+C_MODE_START
+int ma_commit(TRN *trn);
+C_MODE_END
diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c
new file mode 100644
index 00000000000..6f9018885e9
--- /dev/null
+++ b/storage/maria/ma_control_file.c
@@ -0,0 +1,607 @@
+/* Copyright (C) 2007 MySQL AB & Guilhem Bichot & Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3234 Maria control file
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+#ifndef EXTRACT_DEFINITIONS
+#include "maria_def.h"
+#include "ma_checkpoint.h"
+#endif
+
+/*
+  A control file contains the following objects:
+
+Start of create time variables (at start of file):
+  - Magic string (including version number of Maria control file)
+  - Uuid
+  - Size of create time part
+  - Size of dynamic part
+  - Maria block size
+.....  Here we can add new variables without changing format
+  - Checksum of create time part (last of block)
+
+Start of changeable part:
+  - Checksum of changeable part
+  - LSN of last checkpoint
+  - Number of last log file
+  - Max trid in control file (since Maria 1.5 May 2008)
+  - Number of consecutive recovery failures (since Maria 1.5 May 2008)
+.....  Here we can add new variables without changing format
+
+The idea is that one can add new variables to the control file and still
+use it with old program versions. If one needs to do an incompatible change
+one should increment the control file version number.
+*/
+
+/* Total size must be <= sector size (512) for atomic write operation */
+#define CF_MAX_SIZE 512
+#define CF_MIN_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \
+                     CF_CHECKSUM_SIZE * 2 + CF_LSN_SIZE + CF_FILENO_SIZE)
+
+/* Create time variables: offsets and sizes from the start of the file */
+#define CF_MAGIC_STRING "\xfe\xfe\xc"
+#define CF_MAGIC_STRING_OFFSET 0
+#define CF_MAGIC_STRING_SIZE   (sizeof(CF_MAGIC_STRING)-1)
+#define CF_VERSION_OFFSET      (CF_MAGIC_STRING_OFFSET + CF_MAGIC_STRING_SIZE)
+#define CF_VERSION_SIZE        1
+#define CF_UUID_OFFSET         (CF_VERSION_OFFSET + CF_VERSION_SIZE)
+#define CF_UUID_SIZE           MY_UUID_SIZE
+#define CF_CREATE_TIME_SIZE_OFFSET  (CF_UUID_OFFSET + CF_UUID_SIZE)
+#define CF_SIZE_SIZE           2
+#define CF_CHANGEABLE_SIZE_OFFSET   (CF_CREATE_TIME_SIZE_OFFSET + CF_SIZE_SIZE)
+#define CF_BLOCKSIZE_OFFSET    (CF_CHANGEABLE_SIZE_OFFSET + CF_SIZE_SIZE)
+#define CF_BLOCKSIZE_SIZE      2
+
+#define CF_CREATE_TIME_TOTAL_SIZE (CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + \
+                                   CF_CHECKSUM_SIZE)
+
+/*
+  Start of the part that changes during execution; offsets are relative to
+  its start, which is at uint2korr(file + CF_CREATE_TIME_SIZE_OFFSET)
+*/
+#define CF_CHECKSUM_OFFSET 0
+#define CF_CHECKSUM_SIZE 4
+#define CF_LSN_OFFSET (CF_CHECKSUM_OFFSET + CF_CHECKSUM_SIZE)
+#define CF_LSN_SIZE LSN_STORE_SIZE
+#define CF_FILENO_OFFSET (CF_LSN_OFFSET + CF_LSN_SIZE)
+#define CF_FILENO_SIZE 4
+#define CF_MAX_TRID_OFFSET (CF_FILENO_OFFSET + CF_FILENO_SIZE)
+#define CF_MAX_TRID_SIZE TRANSID_SIZE
+#define CF_RECOV_FAIL_OFFSET (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE)
+#define CF_RECOV_FAIL_SIZE 1
+#define CF_CHANGEABLE_TOTAL_SIZE (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE)
+
+/*
+  The following values should not be changed, except when changing version
+  number of the maria control file. These are the minimum sizes of the
+  parts the code can handle.
+*/
+
+#define CF_MIN_CREATE_TIME_TOTAL_SIZE \
+(CF_BLOCKSIZE_OFFSET + CF_BLOCKSIZE_SIZE + CF_CHECKSUM_SIZE)
+#define CF_MIN_CHANGEABLE_TOTAL_SIZE \
+(CF_FILENO_OFFSET + CF_FILENO_SIZE)
+
+#ifndef EXTRACT_DEFINITIONS
+
+/* This module owns the control file state variables defined below. */
+/**
+   This LSN serves for the two-checkpoint rule, and also to find the
+   checkpoint record when doing a recovery.
+*/
+LSN    last_checkpoint_lsn= LSN_IMPOSSIBLE;
+uint32 last_logno=          FILENO_IMPOSSIBLE;
+/**
+   The maximum transaction id given to a transaction. It is only updated at
+   clean shutdown (in case of crash, logs have better information).
+*/
+TrID   max_trid_in_control_file= 0;
+
+/**
+  Number of consecutive log or recovery failures. Reset to 0 after recovery's
+  success.
+*/
+uint8 recovery_failures= 0;
+
+/**
+   @brief If log's lock should be asserted when writing to control file.
+
+   Can be re-used by any function which needs to be thread-safe except when
+   it is called at startup.
+*/
+my_bool maria_multi_threaded= FALSE;
+/** @brief if currently doing a recovery */
+my_bool maria_in_recovery= FALSE;
+
+/**
+  Descriptor of the open control file, or -1 when closed. The file is kept
+  within 512 bytes (a disk sector), to be as atomic as possible
+*/
+static int control_file_fd= -1;
+
+static uint cf_create_time_size; /* size of the create-time part in use */
+static uint cf_changeable_size; /* size of the changeable part in use */
+
+/**
+   @brief Create Maria control file and write its initial content
+
+   @param  name        path of the file to create
+   @param  open_flags  flags given to my_create()
+   @return CONTROL_FILE_OK, or CONTROL_FILE_UNKNOWN_ERROR on any failure
+*/
+
+static CONTROL_FILE_ERROR create_control_file(const char *name,
+                                              int open_flags)
+{
+  uint32 sum;
+  uchar buffer[CF_CREATE_TIME_TOTAL_SIZE];
+  DBUG_ENTER("create_control_file");
+
+  if ((control_file_fd= my_create(name, 0, open_flags,
+                                  MYF(MY_SYNC_DIR | MY_WME))) < 0)
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  /* Reset variables, as we are creating the file */
+  cf_create_time_size= CF_CREATE_TIME_TOTAL_SIZE;
+  cf_changeable_size=  CF_CHANGEABLE_TOTAL_SIZE;
+
+  /* Create unique uuid for the control file */
+  my_uuid_init((ulong) &buffer, (ulong) &maria_uuid);
+  my_uuid(maria_uuid);
+
+  /* Prepare the create-time part: magic, version, uuid, sizes, block size */
+  memcpy(buffer, CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE);
+  buffer[CF_VERSION_OFFSET]= CONTROL_FILE_VERSION;
+  memcpy(buffer + CF_UUID_OFFSET, maria_uuid, CF_UUID_SIZE);
+  int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, cf_create_time_size);
+  int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, cf_changeable_size);
+  int2store(buffer + CF_BLOCKSIZE_OFFSET, maria_block_size);
+
+  /* Store checksum for create time parts */
+  sum= (uint32) my_checksum(0, buffer, cf_create_time_size -
+                            CF_CHECKSUM_SIZE);
+  int4store(buffer + cf_create_time_size - CF_CHECKSUM_SIZE, sum);
+
+  if (my_pwrite(control_file_fd, buffer, cf_create_time_size,
+                0, MYF(MY_FNABP |  MY_WME)))
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  /*
+    To be safer we should make sure that there are no logs or data/index
+    files around (indeed it could be that the control file alone was deleted
+    or not restored, and we should not go on with life at this point).
+
+    Things should still be relatively safe as if someone tries to use an old
+    table with a new control file the different uuid:s between the files will
+    cause ma_open() to generate an HA_ERR_OLD_FILE error. When used from
+    mysqld this will cause the table to be open in repair mode which will
+    remove all dependencies between the table and the old control file.
+
+    We could have a tool which can rebuild the control file, by reading the
+    directory of logs, finding the newest log, reading it to find last
+    checkpoint... Slow but can save your db. For this to be possible, we
+    must always write to the control file right after writing the checkpoint
+    log record, and do nothing in between (i.e. the checkpoint must be
+    usable as soon as it has been written to the log).
+  */
+  /* Init with "undefined" values; the int status is mapped to our enum */
+  DBUG_RETURN(ma_control_file_write_and_force(LSN_IMPOSSIBLE,
+                                              FILENO_IMPOSSIBLE, 0, 0) ?
+              CONTROL_FILE_UNKNOWN_ERROR : CONTROL_FILE_OK);
+}
+
+
+/**
+  Locks control file exclusively. This is kept for the duration of the engine
+  process, to prevent another Maria instance to write to our logs or control
+  file. Nothing is locked on Windows.
+  @param  name  name of the control file, for the error message
+  @return 0 ok, 1 if the lock was not obtained within the retry period
+*/
+
+static int lock_control_file(const char *name)
+{
+  uint retry= 0;
+  /*
+    On Windows, my_lock() uses locking() which is mandatory locking and so
+    prevents maria-recovery.test from copying the control file. And in case of
+    crash, it may take a while for Windows to unlock file, causing downtime.
+  */
+  /**
+    @todo BUG We should explore my_sopen(_SH_DENYWRD) to open or create the
+    file under Windows.
+  */
+#ifndef __WIN__
+  /* Can't use my_lock()'s automatic wait: alarm thread may not exist yet */
+  while (my_lock(control_file_fd, F_WRLCK, 0L, F_TO_EOF,
+                 MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK | MY_NO_WAIT)))
+  {
+    if (retry == 0)
+      my_printf_error(HA_ERR_INITIALIZATION,
+                      "Can't lock aria control file '%s' for exclusive use, "
+                      "error: %d. Will retry for %d seconds", 0,
+                      name, my_errno, MARIA_MAX_CONTROL_FILE_LOCK_RETRY);
+    /* >= so that we sleep as many seconds as announced, not one more */
+    if (retry++ >= MARIA_MAX_CONTROL_FILE_LOCK_RETRY)
+      return 1;
+    sleep(1);
+  }
+#endif
+  return 0;
+}
+
+
+/**
+  @brief Initialize control file subsystem
+
+  Looks for the control file. If none and creation is requested, creates file.
+  If present, locks and validates it, then sets last_checkpoint_lsn,
+  last_logno and the other global variables stored in the file.
+  Does nothing if the control file is already open.
+  Called at engine's start.
+
+  @note The format of the control file is defined in the comments and
+    defines at the start of this file.
+
+  @param create_if_missing create file if not found
+  @param print_error       report a failure with my_printf_error()
+
+  @return CONTROL_FILE_OK (0), or the CONTROL_FILE_ERROR code of the failure
+          (in which case the file is left closed)
+*/
+
+CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
+                                        my_bool print_error)
+{
+  uchar buffer[CF_MAX_SIZE];
+  char name[FN_REFLEN], errmsg_buff[256];
+  const char *errmsg, *lock_failed_errmsg= "Could not get an exclusive lock;"
+    " file is probably in use by another process";
+  uint new_cf_create_time_size, new_cf_changeable_size, new_block_size;
+  my_off_t file_size;
+  int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR;
+  int error= CONTROL_FILE_UNKNOWN_ERROR;
+  DBUG_ENTER("ma_control_file_open");
+
+  /*
+    If you change sizes in the #defines, you at least have to change the
+    "*store" and "*korr" calls in this file, and can even create backward
+    compatibility problems. Beware!
+  */
+  DBUG_ASSERT(CF_LSN_SIZE == (3+4));
+  DBUG_ASSERT(CF_FILENO_SIZE == 4);
+
+  if (control_file_fd >= 0) /* already open */
+    DBUG_RETURN(0);
+
+  if (fn_format(name, CONTROL_FILE_BASE_NAME,
+                maria_data_root, "", MYF(MY_WME)) == NullS)
+    DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR);
+
+  if (my_access(name,F_OK))
+  {
+    CONTROL_FILE_ERROR create_error;
+    if (!create_if_missing)
+    {
+      error= CONTROL_FILE_MISSING;
+      errmsg= "Can't find file";
+      goto err;
+    }
+    if ((create_error= create_control_file(name, open_flags)))
+    {
+      error= create_error;
+      errmsg= "Can't create file";
+      goto err;
+    }
+    if (lock_control_file(name))
+    {
+      errmsg= lock_failed_errmsg;
+      goto err;
+    }
+    goto ok;
+  }
+
+  /* Otherwise, file exists */
+
+  if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0)
+  {
+    errmsg= "Can't open file";
+    goto err;
+  }
+
+  if (lock_control_file(name)) /* lock it before reading content */
+  {
+    errmsg= lock_failed_errmsg;
+    goto err;
+  }
+
+  file_size= my_seek(control_file_fd, 0, SEEK_END, MYF(MY_WME));
+  if (file_size == MY_FILEPOS_ERROR)
+  {
+    errmsg= "Can't read size";
+    goto err;
+  }
+  if (file_size < CF_MIN_SIZE)
+  {
+    /*
+      Given that normally we write only a sector and it's atomic, the only
+      possibility for a file to be of too short size is if we crashed at the
+      very first startup, between file creation and file write. Quite unlikely
+      (and can be made even more unlikely by doing this: create a temp file,
+      write it, and then rename it to be the control file).
+      What's more likely is if someone forgot to restore the control file,
+      just did a "touch control" to try to get Maria to start, or if the
+      disk/filesystem has a problem.
+      So let's be rigid.
+    */
+    error= CONTROL_FILE_TOO_SMALL;
+    errmsg= "Size of control file is smaller than expected";
+    goto err;
+  }
+
+  /* Check if control file is unexpectedly big */
+  if (file_size > CF_MAX_SIZE)
+  {
+    error= CONTROL_FILE_TOO_BIG;
+    errmsg= "File size bigger than expected";
+    goto err;
+  }
+
+  if (my_pread(control_file_fd, buffer, (size_t)file_size, 0, MYF(MY_FNABP)))
+  {
+    errmsg= "Can't read file";
+    goto err;
+  }
+
+  if (memcmp(buffer + CF_MAGIC_STRING_OFFSET,
+             CF_MAGIC_STRING, CF_MAGIC_STRING_SIZE))
+  {
+    error= CONTROL_FILE_BAD_MAGIC_STRING;
+    errmsg= "Missing valid id at start of file. File is not a valid aria control file";
+    goto err;
+  }
+
+  if (buffer[CF_VERSION_OFFSET] > CONTROL_FILE_VERSION)
+  {
+    error= CONTROL_FILE_BAD_VERSION;
+    sprintf(errmsg_buff, "File is from a future aria system: %d. Current version is: %d",
+            (int) buffer[CF_VERSION_OFFSET], CONTROL_FILE_VERSION);
+    errmsg= errmsg_buff;
+    goto err;
+  }
+
+  new_cf_create_time_size= uint2korr(buffer + CF_CREATE_TIME_SIZE_OFFSET);
+  new_cf_changeable_size=  uint2korr(buffer + CF_CHANGEABLE_SIZE_OFFSET);
+
+  if (new_cf_create_time_size < CF_MIN_CREATE_TIME_TOTAL_SIZE ||
+      new_cf_changeable_size <  CF_MIN_CHANGEABLE_TOTAL_SIZE ||
+      new_cf_create_time_size + new_cf_changeable_size != file_size)
+  {
+    error= CONTROL_FILE_INCONSISTENT_INFORMATION;
+    errmsg= "Sizes stored in control file are inconsistent";
+    goto err;
+  }
+
+  new_block_size= uint2korr(buffer + CF_BLOCKSIZE_OFFSET);
+  if (new_block_size != maria_block_size && maria_block_size)
+  {
+    error= CONTROL_FILE_WRONG_BLOCKSIZE;
+    sprintf(errmsg_buff,
+            "Block size in control file (%u) is different than given aria_block_size: %u",
+            new_block_size, (uint) maria_block_size);
+    errmsg= errmsg_buff;
+    goto err;
+  }
+  maria_block_size= new_block_size;
+
+  if (my_checksum(0, buffer, new_cf_create_time_size - CF_CHECKSUM_SIZE) !=
+      uint4korr(buffer + new_cf_create_time_size - CF_CHECKSUM_SIZE))
+  {
+    error= CONTROL_FILE_BAD_HEAD_CHECKSUM;
+    errmsg= "Fixed part checksum mismatch";
+    goto err;
+  }
+
+  if (my_checksum(0, buffer + new_cf_create_time_size + CF_CHECKSUM_SIZE,
+                  new_cf_changeable_size - CF_CHECKSUM_SIZE) !=
+      uint4korr(buffer + new_cf_create_time_size))
+  {
+    error= CONTROL_FILE_BAD_CHECKSUM;
+    errmsg= "Changeable part (end of control file) checksum mismatch";
+    goto err;
+  }
+
+  memcpy(maria_uuid, buffer + CF_UUID_OFFSET, CF_UUID_SIZE);
+  cf_create_time_size= new_cf_create_time_size;
+  cf_changeable_size=  new_cf_changeable_size;
+  last_checkpoint_lsn= lsn_korr(buffer + new_cf_create_time_size +
+                                CF_LSN_OFFSET);
+  last_logno= uint4korr(buffer + new_cf_create_time_size + CF_FILENO_OFFSET);
+  if (new_cf_changeable_size >= (CF_MAX_TRID_OFFSET + CF_MAX_TRID_SIZE))
+    max_trid_in_control_file=
+      transid_korr(buffer + new_cf_create_time_size + CF_MAX_TRID_OFFSET);
+  if (new_cf_changeable_size >= (CF_RECOV_FAIL_OFFSET + CF_RECOV_FAIL_SIZE))
+    recovery_failures=
+      (buffer + new_cf_create_time_size + CF_RECOV_FAIL_OFFSET)[0];
+
+ok:
+  DBUG_RETURN(0);
+
+err:
+  if (print_error)
+    my_printf_error(HA_ERR_INITIALIZATION,
+                    "Got error '%s' when trying to use aria control file "
+                    "'%s'", 0, errmsg, name);
+  ma_control_file_end(); /* will unlock file if needed */
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Writes the changeable part of the control file durably
+
+  The part is built in a buffer, checksummed and written with one single
+  my_pwrite() (to be as atomic as possible); the file is then synced,
+  unless the only change is a non-zero recovery_failures. On success the
+  values are also stored into the last_checkpoint_lsn, last_logno,
+  max_trid_in_control_file and recovery_failures global variables.
+
+  Called when we have created a new log (after syncing this log's creation),
+  when we have written a checkpoint (after syncing this log record), at
+  shutdown (for storing trid in case logs are soon removed by user), and
+  before and after recovery (to store recovery_failures).
+  Variables last_checkpoint_lsn and last_logno must be protected by caller
+  using log's lock, unless this function is called at startup.
+
+  @param last_checkpoint_lsn_arg  LSN of last checkpoint
+  @param last_logno_arg           last log file number
+  @param max_trid_arg             maximum transaction longid
+  @param recovery_failures_arg    consecutive recovery failures
+
+  @retval 0  OK
+  @retval 1  Error (file not open, write or sync failure)
+*/
+
+int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg,
+                                    uint32 last_logno_arg,
+                                    TrID max_trid_arg,
+                                    uint8 recovery_failures_arg)
+{
+  uchar buffer[CF_MAX_SIZE];
+  uint32 checksum;
+  my_bool sync_needed;
+  DBUG_ENTER("ma_control_file_write_and_force");
+
+  if (control_file_fd < 0)
+    DBUG_RETURN(1);
+
+  /*
+    The sync is skipped if this is just an increase of recovery_failures:
+    it's even good if that counter is not increased on disk in case of
+    power or hardware failure (less false positives when removing logs).
+  */
+  sync_needed= ((last_checkpoint_lsn != last_checkpoint_lsn_arg) ||
+                (last_logno != last_logno_arg) ||
+                (max_trid_in_control_file != max_trid_arg) ||
+                (recovery_failures_arg == 0));
+
+#ifndef DBUG_OFF
+  if (maria_multi_threaded)
+    translog_lock_handler_assert_owner();
+#endif
+
+  /* Fill the buffer; its first CF_CHECKSUM_SIZE bytes are set last */
+  lsn_store(buffer + CF_LSN_OFFSET, last_checkpoint_lsn_arg);
+  int4store(buffer + CF_FILENO_OFFSET, last_logno_arg);
+  transid_store(buffer + CF_MAX_TRID_OFFSET, max_trid_arg);
+  buffer[CF_RECOV_FAIL_OFFSET]= recovery_failures_arg;
+
+  if (cf_changeable_size <= CF_CHANGEABLE_TOTAL_SIZE)
+  {
+    /* not enough room for what we need to store: enlarge */
+    cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE;
+  }
+  else
+  {
+    /*
+      More room than needed for us. Must be a newer version. Clear part which
+      we cannot maintain, so that any future version notices we didn't
+      maintain its extra data.
+    */
+    uint unknown_len= cf_changeable_size - CF_CHANGEABLE_TOTAL_SIZE;
+    char warning[150];
+    bzero(buffer + CF_CHANGEABLE_TOTAL_SIZE, unknown_len);
+    my_snprintf(warning, sizeof(warning),
+                "Control file must be from a newer version; zero-ing out %u"
+                " unknown bytes in control file at offset %u", unknown_len,
+                cf_changeable_size + cf_create_time_size);
+    ma_message_no_user(ME_JUST_WARNING, warning);
+  }
+  /* Note that the create-time portion is not touched */
+
+  /* Checksum is stored first */
+  compile_time_assert(CF_CHECKSUM_OFFSET == 0);
+  checksum= my_checksum(0, buffer + CF_CHECKSUM_SIZE,
+                        cf_changeable_size - CF_CHECKSUM_SIZE);
+  int4store(buffer, checksum);
+
+  if (my_pwrite(control_file_fd, buffer, cf_changeable_size,
+                cf_create_time_size, MYF(MY_FNABP |  MY_WME)))
+    DBUG_RETURN(1);
+  if (sync_needed && my_sync(control_file_fd, MYF(MY_WME)))
+    DBUG_RETURN(1);
+
+  /* Written: the globals can now reflect the file's content */
+  last_checkpoint_lsn= last_checkpoint_lsn_arg;
+  last_logno= last_logno_arg;
+  max_trid_in_control_file= max_trid_arg;
+  recovery_failures= recovery_failures_arg;
+
+  cf_changeable_size= CF_CHANGEABLE_TOTAL_SIZE; /* no more warning */
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Free resources taken by control file subsystem
+
+  Unlocks (except on Windows) and closes the control file if it is open,
+  and resets the variables owned by this module.
+
+  @return 0 if the file was not open, else what my_close() returned
+*/
+
+int ma_control_file_end(void)
+{
+  int status= 0;
+  DBUG_ENTER("ma_control_file_end");
+
+  if (control_file_fd >= 0)
+  {
+#ifndef __WIN__
+    (void) my_lock(control_file_fd, F_UNLCK, 0L, F_TO_EOF,
+                   MYF(MY_SEEK_NOT_DONE | MY_FORCE_LOCK));
+#endif
+    status= my_close(control_file_fd, MYF(MY_WME));
+    /*
+      The descriptor is marked closed whatever my_close() returned, as
+      my_close() frees its structures even if close() fails.
+    */
+    control_file_fd= -1;
+
+    /* The module is closed: forbid access to its variables (a safety) */
+    last_checkpoint_lsn= LSN_IMPOSSIBLE;
+    last_logno= FILENO_IMPOSSIBLE;
+    recovery_failures= 0;
+    max_trid_in_control_file= 0;
+  }
+
+  DBUG_RETURN(status);
+}
+
+
+/**
+  @return TRUE if the control file is open (control_file_fd set), else FALSE
+*/
+
+my_bool ma_control_file_inited(void)
+{
+  return control_file_fd < 0 ? FALSE : TRUE;
+}
+
+#endif /* EXTRACT_DEFINITIONS */
diff --git a/storage/maria/ma_control_file.h b/storage/maria/ma_control_file.h
new file mode 100644
index 00000000000..f828ae69c6d
--- /dev/null
+++ b/storage/maria/ma_control_file.h
@@ -0,0 +1,74 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3234 Maria control file
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+#ifndef _ma_control_file_h
+#define _ma_control_file_h
+
+#define CONTROL_FILE_BASE_NAME "aria_log_control"
+/*
+  Major version for control file. Should only be changed when doing
+  big changes that made the new control file incompatible with all
+  older versions of Maria.
+*/
+#define CONTROL_FILE_VERSION   1
+
+/* Here is the interface of this module */
+
+/*
+  LSN of the last checkpoint
+  (if last_checkpoint_lsn == LSN_IMPOSSIBLE then there was never a checkpoint)
+*/
+extern LSN last_checkpoint_lsn;
+/*
+  Last log number (if last_logno == FILENO_IMPOSSIBLE then there is no log
+  file yet)
+*/
+extern uint32 last_logno;
+
+extern TrID max_trid_in_control_file;
+
+extern uint8 recovery_failures;
+
+extern my_bool maria_multi_threaded, maria_in_recovery;
+
+typedef enum enum_control_file_error {
+  CONTROL_FILE_OK= 0,
+  CONTROL_FILE_TOO_SMALL, /* file is shorter than CF_MIN_SIZE */
+  CONTROL_FILE_TOO_BIG, /* file is longer than CF_MAX_SIZE */
+  CONTROL_FILE_BAD_MAGIC_STRING, /* file does not start with the magic */
+  CONTROL_FILE_BAD_VERSION, /* version is above CONTROL_FILE_VERSION */
+  CONTROL_FILE_BAD_CHECKSUM, /* changeable part checksum mismatch */
+  CONTROL_FILE_BAD_HEAD_CHECKSUM, /* create-time part checksum mismatch */
+  CONTROL_FILE_MISSING, /* no file, and creation was not requested */
+  CONTROL_FILE_INCONSISTENT_INFORMATION, /* stored sizes are inconsistent */
+  CONTROL_FILE_WRONG_BLOCKSIZE, /* differs from the given maria_block_size */
+  CONTROL_FILE_UNKNOWN_ERROR /* any other error */
+} CONTROL_FILE_ERROR;
+
+C_MODE_START
+CONTROL_FILE_ERROR ma_control_file_open(my_bool create_if_missing,
+                                        my_bool print_error);
+int ma_control_file_write_and_force(LSN last_checkpoint_lsn_arg,
+                                    uint32 last_logno_arg, TrID max_trid_arg,
+                                    uint8 recovery_failures_arg);
+int ma_control_file_end(void);
+my_bool ma_control_file_inited(void);
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c
new file mode 100644
index 00000000000..9cf042ed21e
--- /dev/null
+++ b/storage/maria/ma_create.c
@@ -0,0 +1,1419 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Create a MARIA table */
+
+#include "ma_ftdefs.h"
+#include "ma_sp_defs.h"
+#include <my_bit.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h>			/* Prototype for getpid */
+#endif
+#endif
+#include <m_ctype.h>
+
+static int compare_columns(MARIA_COLUMNDEF **a, MARIA_COLUMNDEF **b);
+
+/*
+  Old options are used when recreating a table, from maria_chk
+*/
+
+int maria_create(const char *name, enum data_file_type datafile_type,
+                 uint keys,MARIA_KEYDEF *keydefs,
+                 uint columns, MARIA_COLUMNDEF *columndef,
+                 uint uniques, MARIA_UNIQUEDEF *uniquedefs,
+                 MARIA_CREATE_INFO *ci,uint flags)
+{
+  register uint i,j;
+  File dfile,file;
+  int errpos,save_errno, create_mode= O_RDWR | O_TRUNC, res;
+  myf create_flag;
+  uint length,max_key_length,packed,pack_bytes,pointer,real_length_diff,
+       key_length,info_length,key_segs,options,min_key_length,
+       base_pos,long_varchar_count,varchar_length,
+       unique_key_parts,fulltext_keys,offset, not_block_record_extra_length;
+  uint max_field_lengths, extra_header_size, column_nr;
+  ulong reclength, real_reclength,min_pack_length;
+  char filename[FN_REFLEN], linkname[FN_REFLEN], *linkname_ptr;
+  ulong pack_reclength;
+  ulonglong tot_length,max_rows, tmp;
+  enum en_fieldtype type;
+  enum data_file_type org_datafile_type= datafile_type;
+  MARIA_SHARE share;
+  MARIA_KEYDEF *keydef,tmp_keydef;
+  MARIA_UNIQUEDEF *uniquedef;
+  HA_KEYSEG *keyseg,tmp_keyseg;
+  MARIA_COLUMNDEF *column, *end_column;
+  double *rec_per_key_part;
+  ulong  *nulls_per_key_part;
+  uint16 *column_array;
+  my_off_t key_root[HA_MAX_POSSIBLE_KEY], kfile_size_before_extension;
+  MARIA_CREATE_INFO tmp_create_info;
+  my_bool tmp_table= FALSE; /* cache for presence of HA_OPTION_TMP_TABLE */
+  my_bool forced_packed;
+  myf     sync_dir=  0;
+  uchar   *log_data= NULL;
+  DBUG_ENTER("maria_create");
+  DBUG_PRINT("enter", ("keys: %u  columns: %u  uniques: %u  flags: %u",
+                      keys, columns, uniques, flags));
+
+  DBUG_ASSERT(maria_inited);
+  LINT_INIT(dfile);
+  LINT_INIT(file);
+
+  if (!ci)
+  {
+    bzero((char*) &tmp_create_info,sizeof(tmp_create_info));
+    ci=&tmp_create_info;
+  }
+
+  if (keys + uniques > MARIA_MAX_KEY)
+  {
+    DBUG_RETURN(my_errno=HA_WRONG_CREATE_OPTION);
+  }
+  errpos=0;
+  options=0;
+  bzero((uchar*) &share,sizeof(share));
+
+  if (flags & HA_DONT_TOUCH_DATA)
+  {
+    /* We come here from recreate table */
+    org_datafile_type= ci->org_data_file_type;
+    if (!(ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD))
+      options= (ci->old_options &
+                (HA_OPTION_COMPRESS_RECORD | HA_OPTION_PACK_RECORD |
+                 HA_OPTION_READ_ONLY_DATA | HA_OPTION_CHECKSUM |
+                 HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
+                 HA_OPTION_LONG_BLOB_PTR | HA_OPTION_PAGE_CHECKSUM));
+    else
+    {
+      /* Uncompressing rows */
+      options= (ci->old_options &
+                (HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE |
+                 HA_OPTION_DELAY_KEY_WRITE | HA_OPTION_LONG_BLOB_PTR |
+                 HA_OPTION_PAGE_CHECKSUM));
+    }
+  }
+  else
+  {
+    /* Transactional tables must be of type BLOCK_RECORD */
+    if (ci->transactional)
+      datafile_type= BLOCK_RECORD;
+  }
+
+  if (ci->reloc_rows > ci->max_rows)
+    ci->reloc_rows=ci->max_rows;		/* Check if wrong parameter */
+
+  if (!(rec_per_key_part=
+	(double*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(double) +
+                            (keys + uniques)*HA_MAX_KEY_SEG*sizeof(ulong) +
+                            sizeof(uint16) * columns,
+                            MYF(MY_WME | MY_ZEROFILL))))
+    DBUG_RETURN(my_errno);
+  nulls_per_key_part= (ulong*) (rec_per_key_part +
+                                (keys + uniques) * HA_MAX_KEY_SEG);
+  column_array= (uint16*) (nulls_per_key_part +
+                           (keys + uniques) * HA_MAX_KEY_SEG);
+
+
+  /* Start by checking fields and field-types used */
+  varchar_length=long_varchar_count=packed= not_block_record_extra_length=
+    pack_reclength= max_field_lengths= 0;
+  reclength= min_pack_length= ci->null_bytes;
+  forced_packed= 0;
+  column_nr= 0;
+
+  for (column= columndef, end_column= column + columns ;
+       column != end_column ;
+       column++)
+  {
+    /* Fill in not used struct parts */
+    column->column_nr= column_nr++;
+    column->offset= reclength;
+    column->empty_pos= 0;
+    column->empty_bit= 0;
+    column->fill_length= column->length;
+    if (column->null_bit)
+      options|= HA_OPTION_NULL_FIELDS;
+
+    reclength+= column->length;
+    type= column->type;
+    if (datafile_type == BLOCK_RECORD)
+    {
+      if (type == FIELD_SKIP_PRESPACE)
+        type= column->type= FIELD_NORMAL; /* SKIP_PRESPACE not supported */
+      if (type == FIELD_NORMAL &&
+          column->length > FULL_PAGE_SIZE(maria_block_size))
+      {
+        /* FIELD_NORMAL can't be split over many blocks, convert to a CHAR */
+        type= column->type= FIELD_SKIP_ENDSPACE;
+      }
+    }
+
+    if (type != FIELD_NORMAL && type != FIELD_CHECK)
+    {
+      column->empty_pos= packed/8;
+      column->empty_bit= (1 << (packed & 7));
+      if (type == FIELD_BLOB)
+      {
+        forced_packed= 1;
+        packed++;
+	share.base.blobs++;
+	if (pack_reclength != INT_MAX32)
+	{
+	  if (column->length == 4+portable_sizeof_char_ptr)
+	    pack_reclength= INT_MAX32;
+	  else
+          {
+            /* Add max possible blob length */
+	    pack_reclength+= (1 << ((column->length-
+                                     portable_sizeof_char_ptr)*8));
+          }
+	}
+        max_field_lengths+= (column->length - portable_sizeof_char_ptr);
+      }
+      else if (type == FIELD_SKIP_PRESPACE ||
+	       type == FIELD_SKIP_ENDSPACE)
+      {
+        forced_packed= 1;
+        max_field_lengths+= column->length > 255 ? 2 : 1;
+        not_block_record_extra_length++;
+        packed++;
+      }
+      else if (type == FIELD_VARCHAR)
+      {
+	varchar_length+= column->length-1; /* Used for min_pack_length */
+	pack_reclength++;
+        not_block_record_extra_length++;
+        max_field_lengths++;
+        packed++;
+        column->fill_length= 1;
+        options|= HA_OPTION_NULL_FIELDS;        /* Use ma_checksum() */
+
+        /* We must test for 257 as length includes pack-length */
+        if (test(column->length >= 257))
+	{
+	  long_varchar_count++;
+          max_field_lengths++;
+          column->fill_length= 2;
+	}
+      }
+      else if (type == FIELD_SKIP_ZERO)
+        packed++;
+      else
+      {
+        if (!column->null_bit)
+          min_pack_length+= column->length;
+        else
+        {
+          /* Only BLOCK_RECORD skips NULL fields for all field values */
+          not_block_record_extra_length+= column->length;
+        }
+        column->empty_pos= 0;
+        column->empty_bit= 0;
+      }
+    }
+    else					/* FIELD_NORMAL */
+    {
+      if (!column->null_bit)
+      {
+        min_pack_length+= column->length;
+        share.base.fixed_not_null_fields++;
+        share.base.fixed_not_null_fields_length+= column->length;
+      }
+      else
+        not_block_record_extra_length+= column->length;
+    }
+  }
+
+  if (datafile_type == STATIC_RECORD && forced_packed)
+  {
+    /* Can't use fixed length records, revert to block records */
+    datafile_type= BLOCK_RECORD;
+  }
+
+  if (datafile_type == DYNAMIC_RECORD)
+    options|= HA_OPTION_PACK_RECORD;	/* Must use packed records */
+
+  if (datafile_type == STATIC_RECORD)
+  {
+    /* We can't use checksum with static length rows */
+    flags&= ~HA_CREATE_CHECKSUM;
+    options&= ~HA_OPTION_CHECKSUM;
+    min_pack_length= reclength;
+    packed= 0;
+  }
+  else if (datafile_type != BLOCK_RECORD)
+    min_pack_length+= not_block_record_extra_length;
+  else
+    min_pack_length+= 5;                        /* Min row overhead */
+
+  if (flags & HA_CREATE_TMP_TABLE)
+  {
+    options|= HA_OPTION_TMP_TABLE;
+    tmp_table= TRUE;
+    create_mode|= O_NOFOLLOW;
+    /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */
+    ci->transactional= FALSE;
+    flags&= ~HA_CREATE_PAGE_CHECKSUM;
+  }
+  share.base.null_bytes= ci->null_bytes;
+  share.base.original_null_bytes= ci->null_bytes;
+  share.base.born_transactional= ci->transactional;
+  share.base.max_field_lengths= max_field_lengths;
+  share.base.field_offsets= 0;                  /* for future */
+
+  if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM))
+  {
+    options|= HA_OPTION_CHECKSUM;
+    min_pack_length++;
+    pack_reclength++;
+  }
+  if (pack_reclength < INT_MAX32)
+    pack_reclength+= max_field_lengths + long_varchar_count;
+  else
+    pack_reclength= INT_MAX32;
+
+  if (flags & HA_CREATE_DELAY_KEY_WRITE)
+    options|= HA_OPTION_DELAY_KEY_WRITE;
+  if (flags & HA_CREATE_RELIES_ON_SQL_LAYER)
+    options|= HA_OPTION_RELIES_ON_SQL_LAYER;
+  if (flags & HA_CREATE_PAGE_CHECKSUM)
+    options|= HA_OPTION_PAGE_CHECKSUM;
+
+  pack_bytes= (packed + 7) / 8;
+  if (pack_reclength != INT_MAX32)
+    pack_reclength+= reclength+pack_bytes +
+      test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_OPTION_PACK_RECORD));
+  min_pack_length+= pack_bytes;
+  /* Calculate min possible row length for rows-in-block */
+  extra_header_size= MAX_FIXED_HEADER_SIZE;
+  if (ci->transactional)
+  {
+    extra_header_size= TRANS_MAX_FIXED_HEADER_SIZE;
+    DBUG_PRINT("info",("creating a transactional table"));
+  }
+  share.base.min_block_length= (extra_header_size + share.base.null_bytes +
+                                pack_bytes);
+  if (!ci->data_file_length && ci->max_rows)
+  {
+    if (pack_reclength == INT_MAX32 ||
+             (~(ulonglong) 0)/ci->max_rows < (ulonglong) pack_reclength)
+      ci->data_file_length= ~(ulonglong) 0;
+    else
+      ci->data_file_length=(ulonglong) ci->max_rows*pack_reclength;
+  }
+  else if (!ci->max_rows)
+  {
+    if (datafile_type == BLOCK_RECORD)
+    {
+      uint rows_per_page= ((maria_block_size - PAGE_OVERHEAD_SIZE) /
+                           (min_pack_length + extra_header_size +
+                            DIR_ENTRY_SIZE));
+      ulonglong data_file_length= ci->data_file_length;
+      if (!data_file_length)
+        data_file_length= ((((ulonglong) 1 << ((BLOCK_RECORD_POINTER_SIZE-1) *
+                                               8)) -1) * maria_block_size);
+      if (rows_per_page > 0)
+      {
+        set_if_smaller(rows_per_page, MAX_ROWS_PER_PAGE);
+        ci->max_rows= data_file_length / maria_block_size * rows_per_page;
+      }
+      else
+        ci->max_rows= data_file_length / (min_pack_length +
+                                          extra_header_size +
+                                          DIR_ENTRY_SIZE);
+    }
+    else
+      ci->max_rows=(ha_rows) (ci->data_file_length/(min_pack_length +
+                                                    ((options &
+                                                      HA_OPTION_PACK_RECORD) ?
+                                                     3 : 0)));
+  }
+  max_rows= (ulonglong) ci->max_rows;
+  if (datafile_type == BLOCK_RECORD)
+  {
+    /*
+      The + 1 is for record position withing page
+      The / 2 is because we need one bit for knowing if there is transid's
+      after the row pointer
+    */
+    pointer= maria_get_pointer_length((ci->data_file_length /
+                                       (maria_block_size * 2)), 3) + 1;
+    set_if_smaller(pointer, BLOCK_RECORD_POINTER_SIZE);
+
+    if (!max_rows)
+      max_rows= (((((ulonglong) 1 << ((pointer-1)*8)) -1) * maria_block_size) /
+                 min_pack_length / 2);
+                                      }
+  else
+  {
+    if (datafile_type != STATIC_RECORD)
+      pointer= maria_get_pointer_length(ci->data_file_length,
+                                        maria_data_pointer_size);
+    else
+      pointer= maria_get_pointer_length(ci->max_rows, maria_data_pointer_size);
+    if (!max_rows)
+      max_rows= ((((ulonglong) 1 << (pointer*8)) -1) / min_pack_length);
+  }
+
+  real_reclength=reclength;
+  if (datafile_type == STATIC_RECORD)
+  {
+    if (reclength <= pointer)
+      reclength=pointer+1;		/* reserve place for delete link */
+  }
+  else
+    reclength+= long_varchar_count;	/* We need space for varchar! */
+
+  max_key_length=0; tot_length=0 ; key_segs=0;
+  fulltext_keys=0;
+  share.state.rec_per_key_part=   rec_per_key_part;
+  share.state.nulls_per_key_part= nulls_per_key_part;
+  share.state.key_root=key_root;
+  share.state.key_del= HA_OFFSET_ERROR;
+  if (uniques)
+    max_key_length= MARIA_UNIQUE_HASH_LENGTH + pointer;
+
+  for (i=0, keydef=keydefs ; i < keys ; i++ , keydef++)
+  {
+    share.state.key_root[i]= HA_OFFSET_ERROR;
+    length= real_length_diff= 0;
+    min_key_length= key_length= pointer;
+
+    if (keydef->key_alg == HA_KEY_ALG_RTREE)
+      keydef->flag|= HA_RTREE_INDEX;            /* For easier tests */
+
+    if (keydef->flag & HA_SPATIAL)
+    {
+#ifdef HAVE_SPATIAL
+      /* BAR TODO to support 3D and more dimensions in the future */
+      uint sp_segs=SPDIMS*2;
+      keydef->flag=HA_SPATIAL;
+
+      if (flags & HA_DONT_TOUCH_DATA)
+      {
+        /*
+          Called by maria_chk - i.e. table structure was taken from
+          MYI file and SPATIAL key *does have* additional sp_segs keysegs.
+          keydef->seg here points right at the GEOMETRY segment,
+          so we only need to decrease keydef->keysegs.
+          (see maria_recreate_table() in _ma_check.c)
+        */
+        keydef->keysegs-=sp_segs-1;
+      }
+
+      for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+	   j++, keyseg++)
+      {
+        if (keyseg->type != HA_KEYTYPE_BINARY &&
+	    keyseg->type != HA_KEYTYPE_VARBINARY1 &&
+            keyseg->type != HA_KEYTYPE_VARBINARY2)
+        {
+          my_errno=HA_WRONG_CREATE_OPTION;
+          goto err_no_lock;
+        }
+      }
+      keydef->keysegs+=sp_segs;
+      key_length+=SPLEN*sp_segs;
+      length++;                              /* At least one length uchar */
+      min_key_length++;
+#else
+      my_errno= HA_ERR_UNSUPPORTED;
+      goto err_no_lock;
+#endif /*HAVE_SPATIAL*/
+    }
+    else if (keydef->flag & HA_FULLTEXT)
+    {
+      keydef->flag=HA_FULLTEXT | HA_PACK_KEY | HA_VAR_LENGTH_KEY;
+      options|=HA_OPTION_PACK_KEYS;             /* Using packed keys */
+
+      for (j=0, keyseg=keydef->seg ; (int) j < keydef->keysegs ;
+	   j++, keyseg++)
+      {
+        if (keyseg->type != HA_KEYTYPE_TEXT &&
+	    keyseg->type != HA_KEYTYPE_VARTEXT1 &&
+            keyseg->type != HA_KEYTYPE_VARTEXT2)
+        {
+          my_errno=HA_WRONG_CREATE_OPTION;
+          goto err_no_lock;
+        }
+        if (!(keyseg->flag & HA_BLOB_PART) &&
+	    (keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+             keyseg->type == HA_KEYTYPE_VARTEXT2))
+        {
+          /* Make a flag that this is a VARCHAR */
+          keyseg->flag|= HA_VAR_LENGTH_PART;
+          /* Store in bit_start number of bytes used to pack the length */
+          keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1)?
+                              1 : 2);
+        }
+      }
+
+      fulltext_keys++;
+      key_length+= HA_FT_MAXBYTELEN+HA_FT_WLEN;
+      length++;                              /* At least one length uchar */
+      min_key_length+= 1 + HA_FT_WLEN;
+      real_length_diff=HA_FT_MAXBYTELEN-FT_MAX_WORD_LEN_FOR_SORT;
+    }
+    else
+    {
+      /* Test if prefix compression */
+      if (keydef->flag & HA_PACK_KEY)
+      {
+	/* Can't use space_compression on number keys */
+	if ((keydef->seg[0].flag & HA_SPACE_PACK) &&
+	    keydef->seg[0].type == (int) HA_KEYTYPE_NUM)
+	  keydef->seg[0].flag&= ~HA_SPACE_PACK;
+
+	/* Only use HA_PACK_KEY when first segment is a variable length key */
+	if (!(keydef->seg[0].flag & (HA_SPACE_PACK | HA_BLOB_PART |
+				     HA_VAR_LENGTH_PART)))
+	{
+	  /* pack relative to previous key */
+	  keydef->flag&= ~HA_PACK_KEY;
+	  keydef->flag|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY;
+	}
+	else
+	{
+	  keydef->seg[0].flag|=HA_PACK_KEY;	/* for easyer intern test */
+	  keydef->flag|=HA_VAR_LENGTH_KEY;
+	  options|=HA_OPTION_PACK_KEYS;		/* Using packed keys */
+	}
+      }
+      if (keydef->flag & HA_BINARY_PACK_KEY)
+	options|=HA_OPTION_PACK_KEYS;		/* Using packed keys */
+
+      if (keydef->flag & HA_AUTO_KEY && ci->with_auto_increment)
+	share.base.auto_key=i+1;
+      for (j=0, keyseg=keydef->seg ; j < keydef->keysegs ; j++, keyseg++)
+      {
+	/* numbers are stored with high by first to make compression easier */
+	switch (keyseg->type) {
+	case HA_KEYTYPE_SHORT_INT:
+	case HA_KEYTYPE_LONG_INT:
+	case HA_KEYTYPE_FLOAT:
+	case HA_KEYTYPE_DOUBLE:
+	case HA_KEYTYPE_USHORT_INT:
+	case HA_KEYTYPE_ULONG_INT:
+	case HA_KEYTYPE_LONGLONG:
+	case HA_KEYTYPE_ULONGLONG:
+	case HA_KEYTYPE_INT24:
+	case HA_KEYTYPE_UINT24:
+	case HA_KEYTYPE_INT8:
+	  keyseg->flag|= HA_SWAP_KEY;
+          break;
+        case HA_KEYTYPE_VARTEXT1:
+        case HA_KEYTYPE_VARTEXT2:
+        case HA_KEYTYPE_VARBINARY1:
+        case HA_KEYTYPE_VARBINARY2:
+          if (!(keyseg->flag & HA_BLOB_PART))
+          {
+            /* Make a flag that this is a VARCHAR */
+            keyseg->flag|= HA_VAR_LENGTH_PART;
+            /* Store in bit_start number of bytes used to pack the length */
+            keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+                                 keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+                                1 : 2);
+          }
+          break;
+	default:
+	  break;
+	}
+	if (keyseg->flag & HA_SPACE_PACK)
+	{
+          DBUG_ASSERT(!(keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)));
+	  keydef->flag |= HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY;
+	  options|=HA_OPTION_PACK_KEYS;		/* Using packed keys */
+	  length++;				/* At least one length uchar */
+          if (!keyseg->null_bit)
+            min_key_length++;
+          key_length+= keyseg->length;
+	  if (keyseg->length >= 255)
+	  {
+            /* prefix may be 3 bytes */
+	    length+= 2;
+	  }
+	}
+	else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+	{
+          DBUG_ASSERT(!test_all_bits(keyseg->flag,
+                                    (HA_VAR_LENGTH_PART | HA_BLOB_PART)));
+	  keydef->flag|=HA_VAR_LENGTH_KEY;
+	  length++;				/* At least one length uchar */
+          if (!keyseg->null_bit)
+            min_key_length++;
+	  options|=HA_OPTION_PACK_KEYS;		/* Using packed keys */
+          key_length+= keyseg->length;
+	  if (keyseg->length >= 255)
+	  {
+            /* prefix may be 3 bytes */
+	    length+= 2;
+	  }
+	}
+        else
+        {
+          key_length+= keyseg->length;
+          if (!keyseg->null_bit)
+            min_key_length+= keyseg->length;
+        }
+	if (keyseg->null_bit)
+	{
+	  key_length++;
+          /* min key part is 1 byte */
+          min_key_length++;
+	  options|=HA_OPTION_PACK_KEYS;
+	  keyseg->flag|=HA_NULL_PART;
+	  keydef->flag|=HA_VAR_LENGTH_KEY | HA_NULL_PART_KEY;
+	}
+      }
+    } /* if HA_FULLTEXT */
+    key_segs+=keydef->keysegs;
+    if (keydef->keysegs > HA_MAX_KEY_SEG)
+    {
+      my_errno=HA_WRONG_CREATE_OPTION;
+      goto err_no_lock;
+    }
+    /*
+      key_segs may be 0 in the case when we only want to be able to
+      add on row into the table. This can happen with some DISTINCT queries
+      in MySQL
+    */
+    if ((keydef->flag & (HA_NOSAME | HA_NULL_PART_KEY)) == HA_NOSAME &&
+	key_segs)
+      share.state.rec_per_key_part[key_segs-1]=1L;
+    length+=key_length;
+    /*
+      A key can't be longer than than half a index block (as we have
+      to be able to put at least 2 keys on an index block for the key
+      algorithms to work).
+    */
+    if (length > maria_max_key_length())
+    {
+      my_errno=HA_WRONG_CREATE_OPTION;
+      goto err_no_lock;
+    }
+    keydef->block_length= (uint16) maria_block_size;
+    keydef->keylength= (uint16) key_length;
+    keydef->minlength= (uint16) min_key_length;
+    keydef->maxlength= (uint16) length;
+
+    if (length > max_key_length)
+      max_key_length= length;
+    tot_length+= ((max_rows/(ulong) (((uint) maria_block_size -
+                                      MAX_KEYPAGE_HEADER_SIZE -
+                                      KEYPAGE_CHECKSUM_SIZE)/
+                                     (length*2))) *
+                  maria_block_size);
+  }
+
+  unique_key_parts=0;
+  for (i=0, uniquedef=uniquedefs ; i < uniques ; i++ , uniquedef++)
+  {
+    uniquedef->key=keys+i;
+    unique_key_parts+=uniquedef->keysegs;
+    share.state.key_root[keys+i]= HA_OFFSET_ERROR;
+    tot_length+= (max_rows/(ulong) (((uint) maria_block_size -
+                                     MAX_KEYPAGE_HEADER_SIZE -
+                                     KEYPAGE_CHECKSUM_SIZE) /
+                         ((MARIA_UNIQUE_HASH_LENGTH + pointer)*2)))*
+                         (ulong) maria_block_size;
+  }
+  keys+=uniques;				/* Each unique has 1 key */
+  key_segs+=uniques;				/* Each unique has 1 key seg */
+
+  base_pos=(MARIA_STATE_INFO_SIZE + keys * MARIA_STATE_KEY_SIZE +
+	    key_segs * MARIA_STATE_KEYSEG_SIZE);
+  info_length= base_pos+(uint) (MARIA_BASE_INFO_SIZE+
+                                keys * MARIA_KEYDEF_SIZE+
+                                uniques * MARIA_UNIQUEDEF_SIZE +
+                                (key_segs + unique_key_parts)*HA_KEYSEG_SIZE+
+                                columns*(MARIA_COLUMNDEF_SIZE + 2));
+
+ DBUG_PRINT("info", ("info_length: %u", info_length));
+  /* There are only 16 bits for the total header length. */
+  if (info_length > 65535)
+  {
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "Aria table '%s' has too many columns and/or "
+                    "indexes and/or unique constraints.",
+                    MYF(0), name + dirname_length(name));
+    my_errno= HA_WRONG_CREATE_OPTION;
+    goto err_no_lock;
+  }
+
+  bmove(share.state.header.file_version, maria_file_magic, 4);
+  ci->old_options=options | (ci->old_options & HA_OPTION_TEMP_COMPRESS_RECORD ?
+                             HA_OPTION_COMPRESS_RECORD |
+                             HA_OPTION_TEMP_COMPRESS_RECORD: 0);
+  mi_int2store(share.state.header.options,ci->old_options);
+  mi_int2store(share.state.header.header_length,info_length);
+  mi_int2store(share.state.header.state_info_length,MARIA_STATE_INFO_SIZE);
+  mi_int2store(share.state.header.base_info_length,MARIA_BASE_INFO_SIZE);
+  mi_int2store(share.state.header.base_pos,base_pos);
+  share.state.header.data_file_type= share.data_file_type= datafile_type;
+  share.state.header.org_data_file_type= org_datafile_type;
+  share.state.header.language= (ci->language ?
+				ci->language : default_charset_info->number);
+
+  share.state.dellink = HA_OFFSET_ERROR;
+  share.state.first_bitmap_with_space= 0;
+#ifdef EXTERNAL_LOCKING
+  share.state.process=	(ulong) getpid();
+#endif
+  share.state.version=	(ulong) time((time_t*) 0);
+  share.state.sortkey=  (ushort) ~0;
+  share.state.auto_increment=ci->auto_increment;
+  share.options=options;
+  share.base.rec_reflength=pointer;
+  share.base.block_size= maria_block_size;
+
+  /*
+    Get estimate for index file length (this may be wrong for FT keys)
+    This is used for pointers to other key pages.
+  */
+  tmp= (tot_length + maria_block_size * keys *
+	MARIA_INDEX_BLOCK_MARGIN) / maria_block_size;
+
+  /*
+    use maximum of key_file_length we calculated and key_file_length value we
+    got from MAI file header (see also mariapack.c:save_state)
+  */
+  share.base.key_reflength=
+    maria_get_pointer_length(max(ci->key_file_length,tmp),3);
+  share.base.keys= share.state.header.keys= keys;
+  share.state.header.uniques= uniques;
+  share.state.header.fulltext_keys= fulltext_keys;
+  mi_int2store(share.state.header.key_parts,key_segs);
+  mi_int2store(share.state.header.unique_key_parts,unique_key_parts);
+
+  maria_set_all_keys_active(share.state.key_map, keys);
+
+  share.base.keystart = share.state.state.key_file_length=
+    MY_ALIGN(info_length, maria_block_size);
+  share.base.max_key_block_length= maria_block_size;
+  share.base.max_key_length=ALIGN_SIZE(max_key_length+4);
+  share.base.records=ci->max_rows;
+  share.base.reloc=  ci->reloc_rows;
+  share.base.reclength=real_reclength;
+  share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM);
+  share.base.max_pack_length=pack_reclength;
+  share.base.min_pack_length=min_pack_length;
+  share.base.pack_bytes= pack_bytes;
+  share.base.fields= columns;
+  share.base.pack_fields= packed;
+
+  if (share.data_file_type == BLOCK_RECORD)
+  {
+    /*
+      we are going to create a first bitmap page, set data_file_length
+      to reflect this, before the state goes to disk
+    */
+    share.state.state.data_file_length= maria_block_size;
+    /* Add length of packed fields + length */
+    share.base.pack_reclength+= share.base.max_field_lengths+3;
+
+    /* Adjust max_pack_length, to be used if we have short rows */
+    if (share.base.max_pack_length < maria_block_size)
+    {
+      share.base.max_pack_length+= FLAG_SIZE;
+      if (ci->transactional)
+        share.base.max_pack_length+= TRANSID_SIZE * 2;
+    }
+  }
+
+  /* max_data_file_length and max_key_file_length are recalculated on open */
+  if (tmp_table)
+    share.base.max_data_file_length= (my_off_t) ci->data_file_length;
+  else if (ci->transactional && translog_status == TRANSLOG_OK &&
+           !maria_in_recovery)
+  {
+    /*
+      we have checked translog_inited above, because maria_chk may call us
+      (via maria_recreate_table()) and it does not have a log.
+    */
+    sync_dir= MY_SYNC_DIR;
+    /*
+      If crash between _ma_state_info_write_sub() and
+      _ma_update_state__lsns_sub(), table should be ignored by Recovery (or
+      old REDOs would fail), so we cannot let LSNs be 0:
+    */
+    share.state.skip_redo_lsn= share.state.is_of_horizon=
+      share.state.create_rename_lsn= LSN_MAX;
+  }
+
+  if (datafile_type == DYNAMIC_RECORD)
+  {
+    share.base.min_block_length=
+      (share.base.pack_reclength+3 < MARIA_EXTEND_BLOCK_LENGTH &&
+       ! share.base.blobs) ?
+      max(share.base.pack_reclength,MARIA_MIN_BLOCK_LENGTH) :
+      MARIA_EXTEND_BLOCK_LENGTH;
+  }
+  else if (datafile_type == STATIC_RECORD)
+    share.base.min_block_length= share.base.pack_reclength;
+
+  if (! (flags & HA_DONT_TOUCH_DATA))
+    share.state.create_time= time((time_t*) 0);
+
+  pthread_mutex_lock(&THR_LOCK_maria);
+
+  /*
+    NOTE: For test_if_reopen() we need a real path name. Hence we need
+    MY_RETURN_REAL_PATH for every fn_format(filename, ...).
+  */
+  if (ci->index_file_name)
+  {
+    char *iext= strrchr(ci->index_file_name, '.');
+    int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+    if (tmp_table)
+    {
+      char *path;
+      /* chop off the table name, tempory tables use generated name */
+      if ((path= strrchr(ci->index_file_name, FN_LIBCHAR)))
+        *path= '\0';
+      fn_format(filename, name, ci->index_file_name, MARIA_NAME_IEXT,
+                MY_REPLACE_DIR | MY_UNPACK_FILENAME |
+                MY_RETURN_REAL_PATH | MY_APPEND_EXT);
+    }
+    else
+    {
+      fn_format(filename, ci->index_file_name, "", MARIA_NAME_IEXT,
+                MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+                (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+    }
+    fn_format(linkname, name, "", MARIA_NAME_IEXT,
+              MY_UNPACK_FILENAME|MY_APPEND_EXT);
+    linkname_ptr= linkname;
+    /*
+      Don't create the table if the link or file exists to ensure that one
+      doesn't accidently destroy another table.
+      Don't sync dir now if the data file has the same path.
+    */
+    create_flag=
+      (ci->data_file_name &&
+       !strcmp(ci->index_file_name, ci->data_file_name)) ? 0 : sync_dir;
+  }
+  else
+  {
+    char *iext= strrchr(name, '.');
+    int have_iext= iext && !strcmp(iext, MARIA_NAME_IEXT);
+    fn_format(filename, name, "", MARIA_NAME_IEXT,
+              MY_UNPACK_FILENAME | MY_RETURN_REAL_PATH |
+              (have_iext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+    linkname_ptr= NullS;
+    /*
+      Replace the current file.
+      Don't sync dir now if the data file has the same path.
+    */
+    create_flag=  (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD;
+    create_flag|= (!ci->data_file_name ? 0 : sync_dir);
+  }
+
+  /*
+    If a MRG_MARIA table is in use, the mapped MARIA tables are open,
+    but no entry is made in the table cache for them.
+    A TRUNCATE command checks for the table in the cache only and could
+    be fooled to believe, the table is not open.
+    Pull the emergency brake in this situation. (Bug #8306)
+
+
+    NOTE: The filename is compared against unique_file_name of every
+    open table. Hence we need a real path here.
+  */
+  if (_ma_test_if_reopen(filename))
+  {
+    my_printf_error(0, "Aria table '%s' is in use "
+                    "(most likely by a MERGE table). Try FLUSH TABLES.",
+                    MYF(0), name + dirname_length(name));
+    my_errno= HA_ERR_TABLE_EXIST;
+    goto err;
+  }
+
+  if ((file= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+				    MYF(MY_WME|create_flag))) < 0)
+    goto err;
+  errpos=1;
+
+  DBUG_PRINT("info", ("write state info and base info"));
+  if (_ma_state_info_write_sub(file, &share.state,
+                               MA_STATE_INFO_WRITE_FULL_INFO) ||
+      _ma_base_info_write(file, &share.base))
+    goto err;
+  DBUG_PRINT("info", ("base_pos: %d  base_info_size: %d",
+                      base_pos, MARIA_BASE_INFO_SIZE));
+  DBUG_ASSERT(my_tell(file,MYF(0)) == base_pos+ MARIA_BASE_INFO_SIZE);
+
+  /* Write key and keyseg definitions */
+  DBUG_PRINT("info", ("write key and keyseg definitions"));
+  for (i=0 ; i < share.base.keys - uniques; i++)
+  {
+    uint sp_segs=(keydefs[i].flag & HA_SPATIAL) ? 2*SPDIMS : 0;
+
+    if (_ma_keydef_write(file, &keydefs[i]))
+      goto err;
+    for (j=0 ; j < keydefs[i].keysegs-sp_segs ; j++)
+      if (_ma_keyseg_write(file, &keydefs[i].seg[j]))
+       goto err;
+#ifdef HAVE_SPATIAL
+    for (j=0 ; j < sp_segs ; j++)
+    {
+      HA_KEYSEG sseg;
+      sseg.type=SPTYPE;
+      sseg.language= 7;                         /* Binary */
+      sseg.null_bit=0;
+      sseg.bit_start=0;
+      sseg.bit_end=0;
+      sseg.bit_length= 0;
+      sseg.bit_pos= 0;
+      sseg.length=SPLEN;
+      sseg.null_pos=0;
+      sseg.start=j*SPLEN;
+      sseg.flag= HA_SWAP_KEY;
+      if (_ma_keyseg_write(file, &sseg))
+        goto err;
+    }
+#endif
+  }
+  /* Create extra keys for unique definitions */
+  offset= real_reclength - uniques*MARIA_UNIQUE_HASH_LENGTH;
+  bzero((char*) &tmp_keydef,sizeof(tmp_keydef));
+  bzero((char*) &tmp_keyseg,sizeof(tmp_keyseg));
+  for (i=0; i < uniques ; i++)
+  {
+    tmp_keydef.keysegs=1;
+    tmp_keydef.flag=		HA_UNIQUE_CHECK;
+    tmp_keydef.block_length=	(uint16) maria_block_size;
+    tmp_keydef.keylength=	MARIA_UNIQUE_HASH_LENGTH + pointer;
+    tmp_keydef.minlength=tmp_keydef.maxlength=tmp_keydef.keylength;
+    tmp_keyseg.type=		MARIA_UNIQUE_HASH_TYPE;
+    tmp_keyseg.length=		MARIA_UNIQUE_HASH_LENGTH;
+    tmp_keyseg.start=		offset;
+    offset+=			MARIA_UNIQUE_HASH_LENGTH;
+    if (_ma_keydef_write(file,&tmp_keydef) ||
+	_ma_keyseg_write(file,(&tmp_keyseg)))
+      goto err;
+  }
+
+  /* Save unique definition */
+  DBUG_PRINT("info", ("write unique definitions"));
+  for (i=0 ; i < share.state.header.uniques ; i++)
+  {
+    HA_KEYSEG *keyseg_end;
+    keyseg= uniquedefs[i].seg;
+    if (_ma_uniquedef_write(file, &uniquedefs[i]))
+      goto err;
+    for (keyseg= uniquedefs[i].seg, keyseg_end= keyseg+ uniquedefs[i].keysegs;
+         keyseg < keyseg_end;
+         keyseg++)
+    {
+      switch (keyseg->type) {
+      case HA_KEYTYPE_VARTEXT1:
+      case HA_KEYTYPE_VARTEXT2:
+      case HA_KEYTYPE_VARBINARY1:
+      case HA_KEYTYPE_VARBINARY2:
+        if (!(keyseg->flag & HA_BLOB_PART))
+        {
+          keyseg->flag|= HA_VAR_LENGTH_PART;
+          keyseg->bit_start= ((keyseg->type == HA_KEYTYPE_VARTEXT1 ||
+                               keyseg->type == HA_KEYTYPE_VARBINARY1) ?
+                              1 : 2);
+        }
+        break;
+      default:
+        DBUG_ASSERT((keyseg->flag & HA_VAR_LENGTH_PART) == 0);
+        break;
+      }
+      if (_ma_keyseg_write(file, keyseg))
+	goto err;
+    }
+  }
+  DBUG_PRINT("info", ("write field definitions"));
+  if (datafile_type == BLOCK_RECORD)
+  {
+    /* Store columns in a more efficient order */
+    MARIA_COLUMNDEF **col_order, **pos;
+    if (!(col_order= (MARIA_COLUMNDEF**) my_malloc(share.base.fields *
+                                                   sizeof(MARIA_COLUMNDEF*),
+                                                   MYF(MY_WME))))
+      goto err;
+    for (column= columndef, pos= col_order ;
+         column != end_column ;
+         column++, pos++)
+      *pos= column;
+    qsort(col_order, share.base.fields, sizeof(*col_order),
+          (qsort_cmp) compare_columns);
+    for (i=0 ; i < share.base.fields ; i++)
+    {
+      column_array[col_order[i]->column_nr]= i;
+      if (_ma_columndef_write(file, col_order[i]))
+      {
+        my_free(col_order, MYF(0));
+        goto err;
+      }
+    }
+    my_free(col_order, MYF(0));
+  }
+  else
+  {
+    for (i=0 ; i < share.base.fields ; i++)
+    {
+      column_array[i]= (uint16) i;
+      if (_ma_columndef_write(file, &columndef[i]))
+        goto err;
+    }
+  }
+  if (_ma_column_nr_write(file, column_array, columns))
+    goto err;
+
+  if ((kfile_size_before_extension= my_tell(file,MYF(0))) == MY_FILEPOS_ERROR)
+    goto err;
+#ifndef DBUG_OFF
+  if (kfile_size_before_extension != info_length)
+    DBUG_PRINT("warning",("info_length: %u  != used_length: %u",
+			  info_length, (uint)kfile_size_before_extension));
+#endif
+
+  if (sync_dir)
+  {
+    /*
+      we log the first bytes and then the size to which we extend; this is
+      to not log 1 KB of mostly zeroes if this is a small table.
+    */
+    char empty_string[]= "";
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+    translog_size_t total_rec_length= 0;
+    uint k;
+    LSN lsn;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 +
+      (uint) kfile_size_before_extension;
+    /* we are needing maybe 64 kB, so don't use the stack */
+    log_data= my_malloc(log_array[TRANSLOG_INTERNAL_PARTS + 1].length, MYF(0));
+    if ((log_data == NULL) ||
+        my_pread(file, 1 + 2 + 2 + log_data,
+                 (size_t) kfile_size_before_extension, 0, MYF(MY_NABP)))
+      goto err;
+    /*
+      remember if the data file was created or not, to know if Recovery can
+      do it or not, in the future
+    */
+    log_data[0]= test(flags & HA_DONT_TOUCH_DATA);
+    int2store(log_data + 1, kfile_size_before_extension);
+    int2store(log_data + 1 + 2, share.base.keystart);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar *)name;
+    /* we store the end-zero, for Recovery to just pass it to my_create() */
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= log_data;
+    /* symlink description is also needed for re-creation by Recovery: */
+    {
+      const char *s= ci->data_file_name ? ci->data_file_name : empty_string;
+      log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (uchar*)s;
+      log_array[TRANSLOG_INTERNAL_PARTS + 2].length= strlen(s) + 1;
+      s= ci->index_file_name ? ci->index_file_name : empty_string;
+      log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (uchar*)s;
+      log_array[TRANSLOG_INTERNAL_PARTS + 3].length= strlen(s) + 1;
+    }
+    for (k= TRANSLOG_INTERNAL_PARTS;
+         k < (sizeof(log_array)/sizeof(log_array[0])); k++)
+      total_rec_length+= (translog_size_t) log_array[k].length;
+    /**
+       For this record to be of any use for Recovery, we need the upper
+       MySQL layer to be crash-safe, which it is not now (that would require
+       work using the ddl_log of sql/sql_table.cc); when it is, we should
+       reconsider the moment of writing this log record (before or after op,
+       under THR_LOCK_maria or not...), how to use it in Recovery.
+       For now this record can serve when we apply logs to a backup,
+       so we sync it. This happens before the data file is created. If the
+       data file was created before, and we crashed before writing the log
+       record, at restart the table may be used, so we would not have a
+       trustable history in the log (impossible to apply this log to a
+       backup). The way we do it, if we crash before writing the log record
+       then there is no data file and the table cannot be used.
+       @todo Note that in case of TRUNCATE TABLE we also come here; for
+       Recovery to be able to finish TRUNCATE TABLE, instead of leaving a
+       half-truncated table, we should log the record at start of
+       maria_create(); for that we shouldn't write to the index file but to a
+       buffer (DYNAMIC_STRING), put the buffer into the record, then put the
+       buffer into the index file (so, change _ma_keydef_write() etc). That
+       would also enable Recovery to finish a CREATE TABLE. The final result
+       would be that we would be able to finish what the SQL layer has asked
+       for: it would be atomic.
+       When in CREATE/TRUNCATE (or DROP or RENAME or REPAIR) we have not
+       called external_lock(), so have no TRN. It does not matter, as all
+       these operations are non-transactional and sync their files.
+    */
+    if (unlikely(translog_write_record(&lsn,
+                                       LOGREC_REDO_CREATE_TABLE,
+                                       &dummy_transaction_object, NULL,
+                                       total_rec_length,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+      goto err;
+    share.kfile.file= file;
+    DBUG_EXECUTE_IF("maria_flush_whole_log",
+                    {
+                      DBUG_PRINT("maria_flush_whole_log", ("now"));
+                      translog_flush(translog_get_horizon());
+                    });
+    DBUG_EXECUTE_IF("maria_crash_create_table",
+                    {
+                      DBUG_PRINT("maria_crash_create_table", ("now"));
+                      DBUG_ABORT();
+                    });
+    /*
+      store LSN into file, needed for Recovery to not be confused if a
+      DROP+CREATE happened (applying REDOs to the wrong table).
+    */
+    if (_ma_update_state_lsns_sub(&share, lsn, trnman_get_min_safe_trid(),
+                                  FALSE, TRUE))
+      goto err;
+    my_free(log_data, MYF(0));
+  }
+
+  if (!(flags & HA_DONT_TOUCH_DATA))
+  {
+    if (ci->data_file_name)
+    {
+      char *dext= strrchr(ci->data_file_name, '.');
+      int have_dext= dext && !strcmp(dext, MARIA_NAME_DEXT);
+
+      if (tmp_table)
+      {
+        char *path;
+        /* chop off the table name, temporary tables use generated name */
+        if ((path= strrchr(ci->data_file_name, FN_LIBCHAR)))
+          *path= '\0';
+        fn_format(filename, name, ci->data_file_name, MARIA_NAME_DEXT,
+                  MY_REPLACE_DIR | MY_UNPACK_FILENAME | MY_APPEND_EXT);
+      }
+      else
+      {
+        fn_format(filename, ci->data_file_name, "", MARIA_NAME_DEXT,
+                  MY_UNPACK_FILENAME |
+                  (have_dext ? MY_REPLACE_EXT : MY_APPEND_EXT));
+      }
+      fn_format(linkname, name, "",MARIA_NAME_DEXT,
+                MY_UNPACK_FILENAME | MY_APPEND_EXT);
+      linkname_ptr= linkname;
+      create_flag=0;
+    }
+    else
+    {
+      fn_format(filename,name,"", MARIA_NAME_DEXT,
+                MY_UNPACK_FILENAME | MY_APPEND_EXT);
+      linkname_ptr= NullS;
+      create_flag= (flags & HA_CREATE_KEEP_FILES) ? 0 : MY_DELETE_OLD;
+    }
+    if ((dfile=
+         my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+                                MYF(MY_WME | create_flag | sync_dir))) < 0)
+      goto err;
+    errpos=3;
+
+    if (_ma_initialize_data_file(&share, dfile))
+      goto err;
+  }
+
+	/* Enlarge files */
+  DBUG_PRINT("info", ("enlarge to keystart: %lu",
+                      (ulong) share.base.keystart));
+  if (my_chsize(file,(ulong) share.base.keystart,0,MYF(0)))
+    goto err;
+
+  if (sync_dir && my_sync(file, MYF(0)))
+    goto err;
+
+  if (! (flags & HA_DONT_TOUCH_DATA))
+  {
+#ifdef USE_RELOC
+    if (my_chsize(dfile,share.base.min_pack_length*ci->reloc_rows,0,MYF(0)))
+      goto err;
+#endif
+    if (sync_dir && my_sync(dfile, MYF(0)))
+      goto err;
+    if (my_close(dfile,MYF(0)))
+      goto err;
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  res= 0;
+  my_free((char*) rec_per_key_part,MYF(0));
+  errpos=0;
+  if (my_close(file,MYF(0)))
+    res= my_errno;
+  DBUG_RETURN(res);
+
+err:
+  pthread_mutex_unlock(&THR_LOCK_maria);
+
+err_no_lock:
+  save_errno=my_errno;
+  switch (errpos) {
+  case 3:
+    VOID(my_close(dfile,MYF(0)));
+    /* fall through */
+  case 2:
+  if (! (flags & HA_DONT_TOUCH_DATA))
+    my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_DEXT,
+                                     MY_UNPACK_FILENAME | MY_APPEND_EXT),
+			   sync_dir);
+    /* fall through */
+  case 1:
+    VOID(my_close(file,MYF(0)));
+    if (! (flags & HA_DONT_TOUCH_DATA))
+      my_delete_with_symlink(fn_format(filename,name,"",MARIA_NAME_IEXT,
+                                       MY_UNPACK_FILENAME | MY_APPEND_EXT),
+			     sync_dir);
+  }
+  my_free(log_data, MYF(MY_ALLOW_ZERO_PTR));
+  my_free((char*) rec_per_key_part, MYF(0));
+  DBUG_RETURN(my_errno=save_errno);		/* return the fatal errno */
+}
+
+/* Pointer length in bytes for a file of file_length bytes; def if it is 0 */
+uint maria_get_pointer_length(ulonglong file_length, uint def)
+{
+  DBUG_ASSERT(def >= 2 && def <= 7);
+  if (file_length)				/* 0 means: keep the caller's def */
+  {
+#ifdef NOT_YET_READY_FOR_8_BYTE_POINTERS
+    if (file_length >= (ULL(1) << 56))
+      def=8;
+    else
+#endif
+      if (file_length >= (ULL(1) << 48))
+      def=7;                            /* largest value unless 8 is enabled */
+    else if (file_length >= (ULL(1) << 40))
+      def=6;
+    else if (file_length >= (ULL(1) << 32))
+      def=5;
+    else if (file_length >= (ULL(1) << 24))
+      def=4;
+    else if (file_length >= (ULL(1) << 16))
+      def=3;
+    else
+      def=2;                            /* file_length below 2^16 */
+  }
+  return def;
+}
+
+
+/*
+  Sort columns for records-in-block
+
+  IMPLEMENTATION
+   Sort columns in following order:
+
+   Fixed size, not null columns
+   Fixed length, null fields
+   Numbers (zero fill fields)
+   Variable length fields (CHAR, VARCHAR) according to length
+   Blobs
+
+   For same kind of fields, keep fields in original order
+*/
+
+static inline int sign(long a)          /* Map a to -1, 0 or 1 by its sign */
+{
+  return a < 0 ? -1 : (a > 0 ? 1 : 0);
+}
+
+/* qsort() comparator for the 'Sort columns for records-in-block' order above */
+static int compare_columns(MARIA_COLUMNDEF **a_ptr, MARIA_COLUMNDEF **b_ptr)
+{
+  MARIA_COLUMNDEF *a= *a_ptr, *b= *b_ptr;
+  enum en_fieldtype a_type, b_type;
+
+  a_type= (a->type == FIELD_CHECK) ? FIELD_NORMAL : a->type; /* CHECK as NORMAL */
+  b_type= (b->type == FIELD_CHECK) ? FIELD_NORMAL : b->type;
+
+  if (a_type == FIELD_NORMAL && !a->null_bit)   /* a is NORMAL without null bit */
+  {
+    if (b_type != FIELD_NORMAL || b->null_bit)
+      return -1;
+    return sign((long) a->offset - (long) b->offset); /* same class: by offset */
+  }
+  if (b_type == FIELD_NORMAL && !b->null_bit)
+    return 1;
+  if (a_type == b_type)
+    return sign((long) a->offset - (long) b->offset);
+  if (a_type == FIELD_NORMAL)                   /* NORMAL with null bit is next */
+    return -1;
+  if (b_type == FIELD_NORMAL)
+    return 1;
+  if (a_type == FIELD_SKIP_ZERO)                /* then FIELD_SKIP_ZERO */
+    return -1;
+  if (b_type == FIELD_SKIP_ZERO)
+    return 1;
+  if (a->type != FIELD_BLOB && b->type != FIELD_BLOB)
+    if (a->length != b->length)                 /* non-blobs: shorter first */
+      return sign((long) a->length - (long) b->length);
+  if (a_type == FIELD_BLOB)                     /* blobs sort last */
+    return 1;
+  if (b_type == FIELD_BLOB)
+    return -1;
+  return sign((long) a->offset - (long) b->offset);
+}
+
+
+/**
+   @brief Initialize a freshly created data file
+
+   For BLOCK_RECORD, points share->bitmap at dfile and returns the result of
+   _ma_bitmap_create_first(); for other formats does nothing and returns 0.
+   @note In BLOCK_RECORD a new datafile is one page long, otherwise 0-byte.
+ */
+
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile)
+{
+  if (share->data_file_type == BLOCK_RECORD)
+  {
+    share->bitmap.block_size= share->base.block_size;
+    share->bitmap.file.file = dfile;
+    return _ma_bitmap_create_first(share);
+  }
+  return 0;
+}
+
+
+/**
+   @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk,
+   can force.
+
+   This is for special cases where:
+   - we don't want to write the full state to disk (so, not call
+   _ma_state_info_write()) because some parts of the state may be
+   currently inconsistent, or because it would be overkill
+   - we must sync these LSNs immediately for correctness.
+   It acquires intern_lock to protect the LSNs and state write.
+
+   @param  share           table's share
+   @param  lsn             LSN to store; see _ma_update_state_lsns_sub()
+   @param  create_trid     Trid to be used as state.create_trid
+   @param  do_sync         if the write should be forced to disk
+   @param  update_create_rename_lsn if this LSN should be updated or not
+
+   @return Operation status
+     @retval 0      ok
+     @retval 1      error (disk problem)
+*/
+
+int _ma_update_state_lsns(MARIA_SHARE *share, LSN lsn, TrID create_trid,
+                          my_bool do_sync, my_bool update_create_rename_lsn)
+{
+  int res;
+  pthread_mutex_lock(&share->intern_lock);
+  res= _ma_update_state_lsns_sub(share, lsn, create_trid, do_sync,
+                                 update_create_rename_lsn); /* does the work */
+  pthread_mutex_unlock(&share->intern_lock);
+  return res;
+}
+
+
+/**
+   @brief Writes create_rename_lsn, skip_redo_lsn and is_of_horizon to disk,
+   can force.
+
+   Shortcut of _ma_update_state_lsns() when we know that intern_lock is not
+   needed (when creating a table or opening it for the first time).
+
+   @param  share           table's share
+   @param  lsn             LSN to write to state; if LSN_IMPOSSIBLE, write
+                           a LOGREC_IMPORTED_TABLE and use its LSN as lsn.
+   @param  create_trid     Trid to be used as state.create_trid
+   @param  do_sync         if the write should be forced to disk
+   @param  update_create_rename_lsn if this LSN should be updated or not
+
+   @return Operation status
+     @retval 0      ok
+     @retval 1      error (disk problem)
+*/
+
+#if (_MSC_VER == 1310)
+/*
+ Visual Studio 2003 compiler produces internal compiler error
+ in this function. Disable optimizations to workaround.
+*/
+#pragma optimize("",off)
+#endif
+int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn, TrID create_trid,
+                              my_bool do_sync,
+                              my_bool update_create_rename_lsn)
+{
+  uchar buf[LSN_STORE_SIZE * 3], *ptr;          /* room for three stored LSNs */
+  uchar trid_buff[8];
+  File file= share->kfile.file;
+  DBUG_ASSERT(file >= 0);
+
+  if (lsn == LSN_IMPOSSIBLE)                    /* no LSN given: log a record */
+  {
+    int res;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    /* table name is logged only for information */
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=
+      (uchar *)(share->open_file_name.str);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length=
+      share->open_file_name.length + 1;
+    if ((res= translog_write_record(&lsn, LOGREC_IMPORTED_TABLE,
+                                    &dummy_transaction_object, NULL,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length,
+                                    sizeof(log_array)/sizeof(log_array[0]),
+                                    log_array, NULL, NULL)))
+      return res;                               /* lsn was not obtained */
+  }
+
+  for (ptr= buf; ptr < (buf + sizeof(buf)); ptr+= LSN_STORE_SIZE)
+    lsn_store(ptr, lsn);                        /* all three slots get lsn */
+  share->state.skip_redo_lsn= share->state.is_of_horizon= lsn;
+  share->state.create_trid= create_trid;
+  mi_int8store(trid_buff, create_trid);
+  if (update_create_rename_lsn)
+  {
+    share->state.create_rename_lsn= lsn;
+    if (share->id != 0)
+    {
+      /*
+        If OP is the operation which is calling us, if table is later written,
+        we could see in the log:
+        FILE_ID ... REDO_OP ... REDO_INSERT.
+        (that can happen in real life at least with OP=REPAIR).
+        As FILE_ID will be ignored by Recovery because it is <
+        create_rename_lsn, REDO_INSERT would be ignored too, wrongly.
+        To avoid that, we force a LOGREC_FILE_ID to be logged at next write:
+      */
+      translog_deassign_id_from_share(share);
+    }
+  }
+  else
+    lsn_store(buf, share->state.create_rename_lsn); /* first slot keeps old */
+  return (my_pwrite(file, buf, sizeof(buf),
+                    sizeof(share->state.header) +
+                    MARIA_FILE_CREATE_RENAME_LSN_OFFSET, MYF(MY_NABP)) ||
+          my_pwrite(file, trid_buff, sizeof(trid_buff),
+                    sizeof(share->state.header) +
+                    MARIA_FILE_CREATE_TRID_OFFSET, MYF(MY_NABP)) ||
+          (do_sync && my_sync(file, MYF(0)))); /* non-zero if any step fails */
+}
+#if (_MSC_VER == 1310)
+#pragma optimize("",on)
+#endif /*VS2003 compiler bug workaround*/
diff --git a/storage/maria/ma_dbug.c b/storage/maria/ma_dbug.c
new file mode 100644
index 00000000000..af90a108e2a
--- /dev/null
+++ b/storage/maria/ma_dbug.c
@@ -0,0 +1,201 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Support routines which are used with dbug */
+
+#include "maria_def.h"
+
+void _ma_print_key(FILE *stream, MARIA_KEY *key)
+{ /* Print key->data using the segments of key->keyinfo */
+  _ma_print_keydata(stream, key->keyinfo->seg, key->data, key->data_length);
+}
+
+
+/* Print a key in a user understandable format: the segments are written to */
+/* stream as 'Key: "seg-seg-..."' followed by a newline. */
+void _ma_print_keydata(FILE *stream, register HA_KEYSEG *keyseg,
+                       const uchar *key, uint length)
+{
+  int flag;
+  short int s_1;
+  long	int l_1;
+  float f_1;
+  double d_1;
+  const uchar *end;
+  const uchar *key_end= key + length;
+
+  VOID(fputs("Key: \"",stream));
+  flag=0;
+  for (; keyseg->type && key < key_end ;keyseg++) /* until type 0 or key end */
+  {
+    if (flag++)                                 /* not before first segment */
+      VOID(putc('-',stream));
+    end= key+ keyseg->length;
+    if (keyseg->flag & HA_NULL_PART)
+    {
+      /* A NULL value is encoded by a 1-byte flag. Zero means NULL. */
+      if (! *(key++))
+      {
+	fprintf(stream,"NULL");
+	continue;
+      }
+      end++;                                    /* account for the flag byte */
+    }
+
+    switch (keyseg->type) {
+    case HA_KEYTYPE_BINARY:
+      if (!(keyseg->flag & HA_SPACE_PACK) && keyseg->length == 1)
+      {						/* packed binary digit */
+	VOID(fprintf(stream,"%d",(uint) *key++));
+	break;
+      }
+      /* fall through */
+    case HA_KEYTYPE_TEXT:
+    case HA_KEYTYPE_NUM:
+      if (keyseg->flag & HA_SPACE_PACK)         /* first byte is the length */
+      {
+	VOID(fprintf(stream,"%.*s",(int) *key,key+1));
+	key+= (int) *key+1;
+      }
+      else
+      {
+	VOID(fprintf(stream,"%.*s",(int) keyseg->length,key));
+	key=end;
+      }
+      break;
+    case HA_KEYTYPE_INT8:
+      VOID(fprintf(stream,"%d",(int) *((const signed char*) key)));
+      key=end;
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      s_1= mi_sint2korr(key);
+      VOID(fprintf(stream,"%d",(int) s_1));
+      key=end;
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      {
+	ushort u_1;
+	u_1= mi_uint2korr(key);
+	VOID(fprintf(stream,"%u",(uint) u_1));
+	key=end;
+	break;
+      }
+    case HA_KEYTYPE_LONG_INT:
+      l_1=mi_sint4korr(key);
+      VOID(fprintf(stream,"%ld",l_1));
+      key=end;
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      l_1=mi_uint4korr(key);
+      VOID(fprintf(stream,"%lu",(ulong) l_1));
+      key=end;
+      break;
+    case HA_KEYTYPE_INT24:
+      VOID(fprintf(stream,"%ld",(long) mi_sint3korr(key)));
+      key=end;
+      break;
+    case HA_KEYTYPE_UINT24:
+      VOID(fprintf(stream,"%lu",(ulong) mi_uint3korr(key)));
+      key=end;
+      break;
+    case HA_KEYTYPE_FLOAT:
+      mi_float4get(f_1,key);
+      VOID(fprintf(stream,"%g",(double) f_1));
+      key=end;
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      mi_float8get(d_1,key);
+      VOID(fprintf(stream,"%g",d_1));
+      key=end;
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+    {
+      char buff[21];
+      longlong10_to_str(mi_sint8korr(key),buff,-10);
+      VOID(fprintf(stream,"%s",buff));
+      key=end;
+      break;
+    }
+    case HA_KEYTYPE_ULONGLONG:
+    {
+      char buff[21];
+      longlong10_to_str(mi_sint8korr(key),buff,10);
+      VOID(fprintf(stream,"%s",buff));
+      key=end;
+      break;
+    }
+#endif
+    case HA_KEYTYPE_BIT:
+    {
+      uint i;
+      fputs("0x",stream);
+      for (i=0 ; i < keyseg->length ; i++)      /* one hex pair per byte */
+        fprintf(stream, "%02x", (uint) *key++);
+      key= end;
+      break;
+    }
+    case HA_KEYTYPE_VARTEXT1:                   /* VARCHAR and TEXT */
+    case HA_KEYTYPE_VARTEXT2:                   /* VARCHAR and TEXT */
+    case HA_KEYTYPE_VARBINARY1:                 /* VARBINARY and BLOB */
+    case HA_KEYTYPE_VARBINARY2:                 /* VARBINARY and BLOB */
+    {
+      uint tmp_length;
+      get_key_length(tmp_length,key);
+      /*
+	The following command sometimes gives a warning from valgrind.
+	Not yet sure if the bug is in valgrind, glibc or mysqld
+      */
+      VOID(fprintf(stream,"%.*s",(int) tmp_length,key));
+      key+=tmp_length;
+      break;
+    }
+    default: break;			/* This never happens */
+    }
+  }
+  VOID(fputs("\"\n",stream));
+  return;
+} /* print_key */
+
+
+#ifdef EXTRA_DEBUG
+/* Return 1 and warn if an open share for name has last_version set, else 0 */
+my_bool _ma_check_table_is_closed(const char *name, const char *where)
+{
+  char filename[FN_REFLEN];
+  LIST *pos;
+  DBUG_ENTER("_ma_check_table_is_closed");
+
+  (void) fn_format(filename,name,"",MARIA_NAME_IEXT,4+16+32);
+  pthread_mutex_lock(&THR_LOCK_maria);          /* held while walking the list */
+  for (pos=maria_open_list ; pos ; pos=pos->next)
+  {
+    MARIA_HA *info=(MARIA_HA*) pos->data;
+    MARIA_SHARE *share= info->s;
+    if (!strcmp(share->unique_file_name.str, filename))
+    {
+      if (share->last_version)
+      {
+	fprintf(stderr,"Warning:  Table: %s is open on %s\n", name,where);
+	DBUG_PRINT("warning",("Table: %s is open on %s", name,where));
+        pthread_mutex_unlock(&THR_LOCK_maria);
+	DBUG_RETURN(1);
+      }
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_RETURN(0);
+}
+#endif /* EXTRA_DEBUG */
diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c
new file mode 100644
index 00000000000..5c04f358b14
--- /dev/null
+++ b/storage/maria/ma_delete.c
@@ -0,0 +1,1650 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2009-2010 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
+                    MARIA_PAGE *page);
+static int del(MARIA_HA *info, MARIA_KEY *key,
+               MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+	       uchar *keypos, my_off_t next_block, uchar *ret_key_buff);
+static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+		     MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+		     uchar *keypos);
+static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag,
+                       uchar *keypos, uchar *lastkey, uchar *page_end,
+		       my_off_t *next_block, MARIA_KEY_PARAM *s_temp);
+
+/* @brief Remove a row from a MARIA table. Returns 0 if ok, otherwise the */
+/* error code, which is also stored in my_errno. */
+int maria_delete(MARIA_HA *info,const uchar *record)
+{
+  uint i;
+  uchar *old_key;
+  int save_errno;
+  char lastpos[8];
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_delete");
+
+  /* Test if record is in datafile */
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(share, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  DBUG_EXECUTE_IF("my_error_test_undefined_error",
+                  maria_print_error(share, INT_MAX);
+                  DBUG_RETURN(my_errno= INT_MAX););
+  if (!(info->update & HA_STATE_AKTIV))
+  {
+    DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);	/* No database read */
+  }
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+  if ((*share->compare_record)(info,record))
+    goto err;				/* Error on read-check */
+
+  if (_ma_mark_file_changed(info))
+    goto err;
+
+  /* Ensure we don't change the autoincrement value */
+  info->last_auto_increment= ~(ulonglong) 0;
+  /* Remove all keys from the index file */
+
+  old_key= info->lastkey_buff2;
+
+  for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+  {
+    if (maria_is_key_active(share->state.key_map, i))
+    {
+      keyinfo->version++;
+      if (keyinfo->flag & HA_FULLTEXT)
+      {
+        if (_ma_ft_del(info, i, old_key, record, info->cur_row.lastpos))
+          goto err;
+      }
+      else
+      {
+        MARIA_KEY key;
+        if (keyinfo->ck_delete(info,
+                               (*keyinfo->make_key)(info, &key, i, old_key,
+                                                    record,
+                                                    info->cur_row.lastpos,
+                                                    info->cur_row.trid)))
+          goto err;
+      }
+      /* The above changed info->lastkey2. Inform maria_rnext_same(). */
+      info->update&= ~HA_STATE_RNEXT_SAME;
+    }
+  }
+
+  if (share->calc_checksum)
+  {
+    /*
+      We can't use the row based checksum as this doesn't have enough
+      precision.
+    */
+    info->cur_row.checksum= (*share->calc_checksum)(info, record);
+  }
+
+  if ((*share->delete_record)(info, record))
+    goto err;				/* Remove record from database */
+
+  info->state->checksum-= info->cur_row.checksum;
+  info->state->records--;
+  info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED;
+  share->state.changed|= (STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_MOVABLE |
+                          STATE_NOT_ZEROFILLED);
+  info->state->changed=1;
+
+  mi_sizestore(lastpos, info->cur_row.lastpos);
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (delete)",
+                        share->open_file_name.str));
+    (*info->invalidator)(share->open_file_name.str);
+    info->invalidator=0;
+  }
+  DBUG_RETURN(0);
+
+err:
+  save_errno= my_errno;                 /* later calls may change my_errno */
+  DBUG_ASSERT(save_errno);
+  if (!save_errno)
+    save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
+
+  mi_sizestore(lastpos, info->cur_row.lastpos);
+  if (save_errno != HA_ERR_RECORD_CHANGED)
+  {
+    maria_print_error(share, HA_ERR_CRASHED);
+    maria_mark_crashed(info);		/* mark table crashed */
+  }
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  info->update|=HA_STATE_WRITTEN;	/* Buffer changed */
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  if (save_errno == HA_ERR_KEY_NOT_FOUND)
+  {
+    maria_print_error(share, HA_ERR_CRASHED);
+    save_errno= HA_ERR_CRASHED;         /* report a missing key as a crash */
+  }
+  DBUG_RETURN(my_errno= save_errno);
+} /* maria_delete */
+
+
+/*
+  Remove a key from the btree index
+  Returns 0 if ok, 1 on error
+  TODO:
+   Change ma_ck_real_delete() to use another buffer for changed keys instead
+   of key->data. This would allow us to remove the copying of the key here.
+*/
+
+my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key)
+{
+  MARIA_SHARE *share= info->s;
+  int res;
+  LSN lsn= LSN_IMPOSSIBLE;
+  my_off_t new_root= share->state.key_root[key->keyinfo->key_nr];
+  uchar key_buff[MARIA_MAX_KEY_BUFF], *save_key_data;
+  MARIA_KEY org_key;
+  DBUG_ENTER("_ma_ck_delete");
+
+  LINT_INIT_STRUCT(org_key);
+
+  save_key_data= key->data;
+  if (share->now_transactional)
+  {
+    /* Save original value as the key may change */
+    memcpy(key_buff, key->data, key->data_length + key->ref_length);
+    org_key= *key;
+    key->data= key_buff;                /* the delete works on the copy */
+  }
+
+  if ((res= _ma_ck_real_delete(info, key, &new_root)))
+  {
+    /* We have to mark the table crashed before unpin_all_pages() */
+    maria_mark_crashed(info);
+  }
+
+  key->data= save_key_data;             /* give the caller its buffer back */
+  if (!res && share->now_transactional)
+    res= _ma_write_undo_key_delete(info, &org_key, new_root, &lsn);
+  else                                  /* not transactional, or res != 0 */
+  {
+    share->state.key_root[key->keyinfo->key_nr]= new_root;
+    _ma_fast_unlock_key_del(info);
+  }
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res != 0);
+} /* _ma_ck_delete */
+
+/* Delete key from the tree at *root (may change *root); 0 = ok, 1 = error */
+my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
+                           my_off_t *root)
+{
+  int error;
+  my_bool result= 0;
+  my_off_t old_root;
+  uchar *root_buff;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_ck_real_delete");
+
+  if ((old_root=*root) == HA_OFFSET_ERROR)
+  {
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(1);
+  }
+  if (!(root_buff= (uchar*)  my_alloca((uint) keyinfo->block_length+
+                                       MARIA_MAX_KEY_BUFF*2)))
+  {
+    DBUG_PRINT("error",("Couldn't allocate memory"));
+    my_errno=ENOMEM;
+    DBUG_RETURN(1);
+  }
+  DBUG_PRINT("info",("root_page: %lu",
+                     (ulong) (old_root / keyinfo->block_length)));
+  if (_ma_fetch_keypage(&page, info, keyinfo, old_root,
+                        PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, root_buff, 0))
+  {
+    result= 1;
+    goto err;
+  }
+  if ((error= d_search(info, key, (keyinfo->flag & HA_FULLTEXT ?
+                                   SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT:
+                                   SEARCH_SAME),
+                       &page)))
+  {
+    if (error < 0)                      /* d_search() failed */
+      result= 1;
+    else if (error == 2)                /* d_search(): page data too big */
+    {
+      DBUG_PRINT("test",("Enlarging of root when deleting"));
+      if (_ma_enlarge_root(info, key, root))
+        result= 1;
+    }
+    else /* error == 1 */
+    {
+      MARIA_SHARE *share= info->s;
+
+      page_mark_changed(info, &page);
+
+      if (page.size <= page.node + share->keypage_header + 1)
+      {
+	if (page.node)
+	  *root= _ma_kpos(page.node, root_buff +share->keypage_header +
+                          page.node);
+	else
+	  *root=HA_OFFSET_ERROR;
+	if (_ma_dispose(info, old_root, 0))
+	  result= 1;
+      }
+      else if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                 DFLT_INIT_HITS))
+        result= 1;
+    }
+  }
+err:
+  my_afree(root_buff);                  /* releases the my_alloca() buffer */
+  DBUG_PRINT("exit",("Return: %d",result));
+  DBUG_RETURN(result);
+} /* _ma_ck_real_delete */
+
+
+/**
+   @brief Remove key below key root
+
+   @param info       Maria handler
+   @param key        Key to delete.  Will contain new key if block was enlarged
+   @param comp_flag  Search flags handed to the key's bin_search function
+   @param anc_page   Page to search. If the key is not on this page and the
+                     page is a node, the child page is searched recursively
+
+   @note
+     The buffer for the child page comes from my_alloca(); every exit taken
+     after it has been allocated has to pass through my_afree().
+
+   @return
+   @retval 0   ok (anc_page is not changed)
+   @retval 1   If data on page is too small; In this case anc_buff is not saved
+   @retval 2   If data on page is too big
+   @retval -1  On errors
+*/
+
+static int d_search(MARIA_HA *info, MARIA_KEY *key, uint32 comp_flag,
+                    MARIA_PAGE *anc_page)
+{
+  int flag,ret_value,save_flag;
+  uint nod_flag, page_flag;
+  my_bool last_key;
+  uchar *leaf_buff,*keypos;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  MARIA_KEY_PARAM s_temp;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE leaf_page;
+  DBUG_ENTER("d_search");
+  DBUG_DUMP("page", anc_page->buff, anc_page->size);
+
+  flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos, lastkey,
+                              &last_key);
+  if (flag == MARIA_FOUND_WRONG_KEY)
+  {
+    DBUG_PRINT("error",("Found wrong key"));
+    DBUG_RETURN(-1);
+  }
+  page_flag= anc_page->flag;
+  nod_flag=  anc_page->node;
+
+  if (!flag && (keyinfo->flag & HA_FULLTEXT))
+  {
+    uint off;
+    int  subkeys;
+
+    get_key_full_length_rdonly(off, lastkey);
+    subkeys=ft_sintXkorr(lastkey+off);
+    DBUG_ASSERT(info->ft1_to_ft2==0 || subkeys >=0);
+    comp_flag=SEARCH_SAME;
+    if (subkeys >= 0)
+    {
+      /* normal word, one-level tree structure */
+      if (info->ft1_to_ft2)
+      {
+        /* we're in ft1->ft2 conversion mode. Saving key data */
+        insert_dynamic(info->ft1_to_ft2, (lastkey+off));
+      }
+      else
+      {
+        /* we need exact match only if not in ft1->ft2 conversion mode */
+        flag=(*keyinfo->bin_search)(key, anc_page, comp_flag, &keypos,
+                                    lastkey, &last_key);
+      }
+      /* fall through to normal delete */
+    }
+    else
+    {
+      /* popular word. two-level tree. going down */
+      uint tmp_key_length;
+      my_off_t root;
+      uchar *kpos=keypos;
+      MARIA_KEY tmp_key;
+
+      tmp_key.data=    lastkey;
+      tmp_key.keyinfo= keyinfo;
+
+      if (!(tmp_key_length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag,
+                                               &kpos)))
+      {
+        my_errno= HA_ERR_CRASHED;
+        DBUG_RETURN(-1);
+      }
+      root= _ma_row_pos_from_key(&tmp_key);
+      if (subkeys == -1)
+      {
+        /* the last entry in sub-tree */
+        if (_ma_dispose(info, root, 1))
+          DBUG_RETURN(-1);
+        /* fall through to normal delete */
+      }
+      else
+      {
+        MARIA_KEY word_key;
+        keyinfo=&share->ft2_keyinfo;
+        /* we'll modify key entry 'in vivo' */
+        kpos-=keyinfo->keylength+nod_flag;
+        get_key_full_length_rdonly(off, key->data);
+
+        word_key.data=        key->data + off;
+        word_key.keyinfo=     &share->ft2_keyinfo;
+        word_key.data_length= HA_FT_WLEN;
+        word_key.ref_length= 0;
+        word_key.flag= 0;
+        ret_value= _ma_ck_real_delete(info, &word_key, &root);
+        _ma_dpointer(share, kpos+HA_FT_WLEN, root);
+        subkeys++;
+        ft_intXstore(kpos, subkeys);
+        if (!ret_value)
+        {
+          page_mark_changed(info, anc_page);
+          ret_value= _ma_write_keypage(anc_page,
+                                       PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                       DFLT_INIT_HITS);
+        }
+        /*
+          _ma_ck_real_delete() and _ma_write_keypage() report failure as a
+          non-zero value. Returning that as-is would make our caller take
+          it for "1 = underflow" and rebalance pages instead of failing, so
+          map any failure to -1.
+        */
+        if (ret_value)
+          ret_value= -1;
+        DBUG_PRINT("exit",("Return: %d",ret_value));
+        DBUG_RETURN(ret_value);
+      }
+    }
+  }
+  leaf_buff=0;
+  if (nod_flag)
+  {
+    /* Read left child page */
+    leaf_page.pos= _ma_kpos(nod_flag,keypos);
+    if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+                                       MARIA_MAX_KEY_BUFF*2)))
+    {
+      DBUG_PRINT("error", ("Couldn't allocate memory"));
+      my_errno=ENOMEM;
+      DBUG_RETURN(-1);
+    }
+    if (_ma_fetch_keypage(&leaf_page, info,keyinfo, leaf_page.pos,
+                          PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff,
+                          0))
+      goto err;
+  }
+
+  if (flag != 0)
+  {
+    if (!nod_flag)
+    {
+      DBUG_PRINT("error",("Didn't find key"));
+      my_errno=HA_ERR_CRASHED;		/* This should never happen */
+      goto err;
+    }
+    save_flag=0;
+    ret_value= d_search(info, key, comp_flag, &leaf_page);
+  }
+  else
+  {						/* Found key */
+    uint tmp;
+    uint anc_buff_length= anc_page->size;
+    uint anc_page_flag=   anc_page->flag;
+    my_off_t next_block;
+
+    if (!(tmp= remove_key(keyinfo, anc_page_flag, nod_flag, keypos, lastkey,
+                          anc_page->buff + anc_buff_length,
+                          &next_block, &s_temp)))
+      goto err;
+
+    page_mark_changed(info, anc_page);
+    anc_buff_length-= tmp;
+    anc_page->size= anc_buff_length;
+    page_store_size(share, anc_page);
+
+    /*
+      Log initial changes on pages
+      If there is an underflow, there will be more changes logged to the
+      page
+    */
+    if (share->now_transactional &&
+        _ma_log_delete(anc_page, s_temp.key_pos,
+                       s_temp.changed_length, s_temp.move_length,
+                       0, KEY_OP_DEBUG_LOG_DEL_CHANGE_1))
+      goto err;                         /* leaf_buff may be allocated here */
+
+    if (!nod_flag)
+    {						/* On leaf page */
+      /* leaf_buff is 0 on a leaf page, so direct returns are safe here */
+      if (anc_buff_length <= (info->quick_mode ?
+                              MARIA_MIN_KEYBLOCK_LENGTH :
+                              (uint) keyinfo->underflow_block_length))
+      {
+        /* Page will be written by caller if we return 1 */
+        DBUG_RETURN(1);
+      }
+      if (_ma_write_keypage(anc_page,
+                            PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+	DBUG_RETURN(-1);
+      DBUG_RETURN(0);
+    }
+    save_flag=1;                         /* Mark that anc_buff is changed */
+    ret_value= del(info, key, anc_page, &leaf_page,
+                   keypos, next_block, lastkey);
+  }
+  if (ret_value >0)
+  {
+    save_flag= 2;
+    if (ret_value == 1)
+      ret_value= underflow(info, keyinfo, anc_page, &leaf_page, keypos);
+    else
+    {
+      /* This can only happen with variable length keys */
+      MARIA_KEY anc_last_key;
+      DBUG_PRINT("test",("Enlarging of key when deleting"));
+
+      anc_last_key.data=    lastkey;
+      anc_last_key.keyinfo= keyinfo;
+      if (!_ma_get_last_key(&anc_last_key, anc_page, keypos))
+	goto err;
+      ret_value= _ma_insert(info, key, anc_page, keypos,
+                            anc_last_key.data,
+                            (MARIA_PAGE*) 0, (uchar*) 0, (my_bool) 0);
+
+      if (_ma_write_keypage(&leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                            DFLT_INIT_HITS))
+        ret_value= -1;
+    }
+  }
+  if (ret_value == 0 && anc_page->size > share->max_index_block_size)
+  {
+    /*
+      parent buffer got too big ; We have to split the page.
+      The | 2 is there to force write of anc page below
+    */
+    save_flag= 3;
+    ret_value= _ma_split_page(info, key, anc_page,
+                              share->max_index_block_size,
+                              (uchar*) 0, 0, 0, lastkey, 0) | 2;
+    DBUG_ASSERT(anc_page->org_size == anc_page->size);
+  }
+  if (save_flag && ret_value != 1)
+  {
+    page_mark_changed(info, anc_page);
+    if (_ma_write_keypage(anc_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS))
+      ret_value= -1;
+  }
+  else
+  {
+    DBUG_DUMP("page", anc_page->buff, anc_page->size);
+  }
+  my_afree(leaf_buff);
+  DBUG_PRINT("exit",("Return: %d",ret_value));
+  DBUG_RETURN(ret_value);
+
+err:
+  my_afree(leaf_buff);
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN (-1);
+} /* d_search */
+
+
+/**
+   @brief Remove a key that has a page-reference
+
+   @param info           Maria handler
+   @param key            Buffer for key to be inserted at upper level
+   @param anc_page       Page (nod) where deleted key was
+   @param leaf_page      Page for nod before the deleted key
+   @param keypos         Pos to where deleted key was on anc_page
+   @param next_block     Page address for nod after deleted key
+   @param ret_key_buff   Key before keypos in anc_page
+
+   @notes
+      leaf_page must be written to disk if retval > 0
+      anc_page  is not updated on disk. Caller should do this
+
+      If leaf_page is itself a node, the function recurses into the page
+      that its last pointer refers to. The buffer for that page comes from
+      my_alloca() and is released on every exit from that branch.
+
+   @return
+   @retval < 0   Error
+   @retval 0     OK.    leaf_page is written to disk
+
+   @retval 1     key contains key to upper level (from balance page)
+                 leaf_page has underflow
+   @retval 2     key contains key to upper level (from split space)
+*/
+
+static int del(MARIA_HA *info, MARIA_KEY *key,
+               MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+	       uchar *keypos, my_off_t next_block, uchar *ret_key_buff)
+{
+  int ret_value,length;
+  uint a_length, page_flag, nod_flag, leaf_length, new_leaf_length;
+  uchar keybuff[MARIA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+  uchar *anc_buff;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_KEY tmp_key;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_KEY ret_key;
+  MARIA_PAGE next_page;
+  DBUG_ENTER("del");
+  DBUG_PRINT("enter",("leaf_page: %lu  keypos: 0x%lx",
+                      (ulong) (leaf_page->pos / share->block_size),
+		      (ulong) keypos));
+  DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);
+
+  page_flag=   leaf_page->flag;
+  leaf_length= leaf_page->size;
+  nod_flag=    leaf_page->node;
+
+  endpos= leaf_page->buff + leaf_length;
+  tmp_key.keyinfo= keyinfo;
+  tmp_key.data=    keybuff;
+
+  if (!(key_start= _ma_get_last_key(&tmp_key, leaf_page, endpos)))
+    DBUG_RETURN(-1);
+
+  if (nod_flag)
+  {
+    /* leaf_page is a node; go down to the page after its last key */
+    next_page.pos= _ma_kpos(nod_flag,endpos);
+    if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+					MARIA_MAX_KEY_BUFF*2)))
+    {
+      DBUG_PRINT("error", ("Couldn't allocate memory"));
+      my_errno= ENOMEM;
+      DBUG_RETURN(-1);
+    }
+    if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+                          PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, next_buff, 0))
+      ret_value= -1;
+    else
+    {
+      DBUG_DUMP("next_page", next_page.buff, next_page.size);
+      if ((ret_value= del(info, key, anc_page, &next_page,
+                          keypos, next_block, ret_key_buff)) >0)
+      {
+        /* Get new length after key was deleted */
+	endpos= leaf_page->buff+ leaf_page->size;
+	if (ret_value == 1)
+	{
+          /* underflow writes "next_page" to disk */
+	  ret_value= underflow(info, keyinfo, leaf_page, &next_page,
+                               endpos);
+	  if (ret_value == 0 && leaf_page->size >
+              share->max_index_block_size)
+	  {
+	    ret_value= (_ma_split_page(info, key, leaf_page,
+                                       share->max_index_block_size,
+                                       (uchar*) 0, 0, 0,
+                                       ret_key_buff, 0) | 2);
+	  }
+	}
+	else
+	{
+          if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                DFLT_INIT_HITS))
+            goto err_free;
+	  DBUG_PRINT("test",("Inserting of key when deleting"));
+	  if (!_ma_get_last_key(&tmp_key, leaf_page, endpos))
+	    goto err_free;
+	  ret_value= _ma_insert(info, key, leaf_page, endpos,
+                                tmp_key.data, (MARIA_PAGE *) 0, (uchar*) 0,
+                                0);
+	}
+      }
+      page_mark_changed(info, leaf_page);
+      /*
+        If ret_value <> 0, then leaf_page underflowed and caller will have
+        to handle underflow and write leaf_page to disk.
+        We can't write it here, as if leaf_page is empty we get an assert
+        in _ma_write_keypage.
+      */
+      if (ret_value == 0 && _ma_write_keypage(leaf_page,
+                                              PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                              DFLT_INIT_HITS))
+	goto err_free;
+    }
+    my_afree(next_buff);
+    DBUG_RETURN(ret_value);
+
+err_free:
+    /*
+      Error after next_buff was allocated. Release it here, as the common
+      'err' exit below knows nothing about this buffer.
+    */
+    my_afree(next_buff);
+    DBUG_RETURN(-1);
+  }
+
+  /*
+    Remove last key from leaf page
+    Note that leaf_page page may only have had one key (can normally only
+    happen in quick mode), in which case it will now temporary have 0 keys
+    on it. This will be corrected by the caller as we will return 0.
+  */
+  new_leaf_length= (uint) (key_start - leaf_page->buff);
+  leaf_page->size= new_leaf_length;
+  page_store_size(share, leaf_page);
+
+  if (share->now_transactional &&
+      _ma_log_suffix(leaf_page, leaf_length, new_leaf_length))
+    goto err;
+
+  page_mark_changed(info, leaf_page);           /* Safety */
+  if (new_leaf_length <= (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+                          (uint) keyinfo->underflow_block_length))
+  {
+    /* Underflow, leaf_page will be written by caller */
+    ret_value= 1;
+  }
+  else
+  {
+    ret_value= 0;
+    if (_ma_write_keypage(leaf_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS))
+      goto err;
+  }
+
+  /* Place last key in ancestor page on deleted key position */
+  a_length= anc_page->size;
+  anc_buff= anc_page->buff;
+  endpos=   anc_buff + a_length;
+
+  ret_key.keyinfo= keyinfo;
+  ret_key.data=    ret_key_buff;
+
+  prev_key= 0;
+  if (keypos != anc_buff+share->keypage_header + share->base.key_reflength)
+  {
+    /* keypos is not the first key on the page; fetch the key before it */
+    if (!_ma_get_last_key(&ret_key, anc_page, keypos))
+      goto err;
+    prev_key= ret_key.data;
+  }
+  length= (*keyinfo->pack_key)(&tmp_key, share->base.key_reflength,
+                               keypos == endpos ? (uchar*) 0 : keypos,
+                               prev_key, prev_key,
+                               &s_temp);
+  /* Make room for (length > 0) or close the gap left by the stored key */
+  if (length > 0)
+    bmove_upp(endpos+length,endpos,(uint) (endpos-keypos));
+  else
+    bmove(keypos,keypos-length, (int) (endpos-keypos)+length);
+  (*keyinfo->store_key)(keyinfo,keypos,&s_temp);
+  key_start= keypos;
+  if (tmp_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
+                      SEARCH_PAGE_KEY_HAS_TRANSID))
+  {
+    _ma_mark_page_with_transid(share, anc_page);
+  }
+
+  /* Save pointer to next leaf on parent page */
+  if (!(*keyinfo->get_key)(&ret_key, page_flag, share->base.key_reflength,
+                           &keypos))
+    goto err;
+  _ma_kpointer(info,keypos - share->base.key_reflength,next_block);
+  anc_page->size= a_length + length;
+  page_store_size(share, anc_page);
+
+  if (share->now_transactional &&
+      _ma_log_add(anc_page, a_length,
+                  key_start, s_temp.changed_length, s_temp.move_length, 1,
+                  KEY_OP_DEBUG_LOG_ADD_2))
+    goto err;
+
+  DBUG_RETURN(new_leaf_length <=
+              (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+               (uint) keyinfo->underflow_block_length));
+err:
+  DBUG_RETURN(-1);
+} /* del */
+
+
+/**
+   @brief Balances adjacent pages if underflow occurs
+
+   @fn    underflow()
+   @param anc_buff        Ancestor page data
+   @param leaf_page       Leaf page (page that underflowed)
+   @param leaf_page_link  Pointer to pin information about leaf page
+   @param keypos          Position after current key in anc_buff
+
+   @note
+     This function writes redo entries for all changes
+     leaf_page is saved to disk
+     Caller must save anc_buff
+
+   @return
+   @retval  0  ok
+   @retval  1  ok, but anc_buff did underflow
+   @retval -1  error
+ */
+
+static int underflow(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+		     MARIA_PAGE *anc_page, MARIA_PAGE *leaf_page,
+		     uchar *keypos)
+{
+  int t_length;
+  uint anc_length,buff_length,leaf_length,p_length,s_length,nod_flag;
+  uint next_buff_length, new_buff_length, key_reflength;
+  uint unchanged_leaf_length, new_leaf_length, new_anc_length;
+  uint anc_page_flag, page_flag;
+  uchar anc_key_buff[MARIA_MAX_KEY_BUFF], leaf_key_buff[MARIA_MAX_KEY_BUFF];
+  uchar *endpos, *next_keypos, *anc_pos, *half_pos, *prev_key;
+  uchar *anc_buff, *leaf_buff;
+  uchar *after_key, *anc_end_pos;
+  MARIA_KEY_PARAM key_deleted, key_inserted;
+  MARIA_SHARE *share= info->s;
+  my_bool first_key;
+  MARIA_KEY tmp_key, anc_key, leaf_key;
+  MARIA_PAGE next_page;
+  DBUG_ENTER("underflow");
+  DBUG_PRINT("enter",("leaf_page: %lu  keypos: 0x%lx",
+                      (ulong) (leaf_page->pos / share->block_size),
+		      (ulong) keypos));
+  DBUG_DUMP("anc_buff", anc_page->buff,  anc_page->size);
+  DBUG_DUMP("leaf_buff", leaf_page->buff, leaf_page->size);
+
+  anc_page_flag= anc_page->flag;
+  anc_buff= anc_page->buff;
+  leaf_buff= leaf_page->buff;
+  info->keyread_buff_used=1;
+  next_keypos=keypos;
+  nod_flag= leaf_page->node;
+  p_length= nod_flag+share->keypage_header;
+  anc_length= anc_page->size;
+  leaf_length= leaf_page->size;
+  key_reflength= share->base.key_reflength;
+  if (share->keyinfo+info->lastinx == keyinfo)
+    info->page_changed=1;
+  first_key= keypos == anc_buff + share->keypage_header + key_reflength;
+
+  tmp_key.data=  info->buff;
+  anc_key.data=  anc_key_buff;
+  leaf_key.data= leaf_key_buff;
+  tmp_key.keyinfo= leaf_key.keyinfo= anc_key.keyinfo= keyinfo;
+
+  if ((keypos < anc_buff + anc_length && (info->state->records & 1)) ||
+      first_key)
+  {
+    size_t tmp_length;
+    uint next_page_flag;
+    /* Use page right of anc-page */
+    DBUG_PRINT("test",("use right page"));
+
+    /*
+      Calculate position after the current key. Note that keydata itself is
+      not used
+    */
+    if (keyinfo->flag & HA_BINARY_PACK_KEY)
+    {
+      if (!(next_keypos= _ma_get_key(&tmp_key, anc_page, keypos)))
+	goto err;
+    }
+    else
+    {
+      /* Avoid length error check if packed key */
+      tmp_key.data[0]= tmp_key.data[1]= 0;
+      /* Got to end of found key */
+      if (!(*keyinfo->get_key)(&tmp_key, anc_page_flag, key_reflength,
+                               &next_keypos))
+        goto err;
+    }
+    next_page.pos= _ma_kpos(key_reflength, next_keypos);
+    if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+                          PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0))
+      goto err;
+    next_buff_length= next_page.size;
+    next_page_flag=   next_page.flag;
+    DBUG_DUMP("next", next_page.buff, next_page.size);
+
+    /* find keys to make a big key-page */
+    bmove(next_keypos-key_reflength, next_page.buff + share->keypage_header,
+          key_reflength);
+
+    if (!_ma_get_last_key(&anc_key, anc_page, next_keypos) ||
+	!_ma_get_last_key(&leaf_key, leaf_page, leaf_buff+leaf_length))
+      goto err;
+
+    /* merge pages and put parting key from anc_page between */
+    prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data);
+    t_length= (*keyinfo->pack_key)(&anc_key, nod_flag, next_page.buff+p_length,
+                                   prev_key, prev_key, &key_inserted);
+    tmp_length= next_buff_length - p_length;
+    endpos= next_page.buff + tmp_length + leaf_length + t_length;
+    /* next_page.buff will always be larger than before !*/
+    bmove_upp(endpos, next_page.buff + next_buff_length, tmp_length);
+    memcpy(next_page.buff, leaf_buff,(size_t) leaf_length);
+    (*keyinfo->store_key)(keyinfo, next_page.buff+leaf_length, &key_inserted);
+    buff_length= (uint) (endpos - next_page.buff);
+
+    /* Set page flag from combination of both key pages and parting key */
+    page_flag= next_page_flag | leaf_page->flag;
+    if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
+                        SEARCH_PAGE_KEY_HAS_TRANSID))
+      page_flag|= KEYPAGE_FLAG_HAS_TRANSID;
+
+    next_page.size= buff_length;
+    next_page.flag= page_flag;
+    page_store_info(share, &next_page);
+
+    /* remove key from anc_page */
+    if (!(s_length=remove_key(keyinfo, anc_page_flag, key_reflength, keypos,
+                              anc_key_buff, anc_buff+anc_length,
+                              (my_off_t *) 0, &key_deleted)))
+      goto err;
+
+    new_anc_length= anc_length - s_length;
+    anc_page->size= new_anc_length;
+    page_store_size(share, anc_page);
+
+    if (buff_length <= share->max_index_block_size)
+    {
+      /* All keys fitted into one page */
+      page_mark_changed(info, &next_page);
+      if (_ma_dispose(info, next_page.pos, 0))
+       goto err;
+
+      memcpy(leaf_buff, next_page.buff, (size_t) buff_length);
+      leaf_page->size= next_page.size;
+      leaf_page->flag= next_page.flag;
+
+      if (share->now_transactional)
+      {
+        /*
+          Log changes to parent page. Note that this page may have been
+          temporarily bigger than block_size.
+         */
+        if (_ma_log_delete(anc_page, key_deleted.key_pos,
+                           key_deleted.changed_length,
+                           key_deleted.move_length,
+                           anc_length - anc_page->org_size,
+                           KEY_OP_DEBUG_LOG_DEL_CHANGE_2))
+          goto err;
+        /*
+          Log changes to leaf page. Data for leaf page is in leaf_buff
+          which contains original leaf_buff, parting key and next_buff
+        */
+        if (_ma_log_suffix(leaf_page, leaf_length, buff_length))
+          goto err;
+      }
+    }
+    else
+    {
+      /*
+        Balancing didn't free a page, so we have to split 'buff' into two
+        pages:
+        - Find key in middle of buffer
+        - Store everything before key in 'leaf_page'
+        - Pack key into anc_page at position of deleted key
+          Note that anc_page may overflow! (is handled by caller)
+        - Store remaining keys in next_page (buff)
+      */
+      MARIA_KEY_PARAM anc_key_inserted;
+
+      anc_end_pos= anc_buff + new_anc_length;
+
+      DBUG_PRINT("test",("anc_buff: 0x%lx  anc_end_pos: 0x%lx",
+                         (long) anc_buff, (long) anc_end_pos));
+
+      if (!first_key && !_ma_get_last_key(&anc_key, anc_page, keypos))
+	goto err;
+      if (!(half_pos= _ma_find_half_pos(&leaf_key, &next_page, &after_key)))
+	goto err;
+      new_leaf_length= (uint) (half_pos - next_page.buff);
+      memcpy(leaf_buff, next_page.buff, (size_t) new_leaf_length);
+
+      leaf_page->size= new_leaf_length;
+      leaf_page->flag= page_flag;
+      page_store_info(share, leaf_page);
+
+      /* Correct new keypointer to leaf_page */
+      half_pos=after_key;
+      _ma_kpointer(info,
+                   leaf_key.data + leaf_key.data_length + leaf_key.ref_length,
+                   next_page.pos);
+
+      /* Save key in anc_page */
+      prev_key= (first_key  ? (uchar*) 0 : anc_key.data);
+      t_length= (*keyinfo->pack_key)(&leaf_key, key_reflength,
+                                     (keypos == anc_end_pos ? (uchar*) 0 :
+                                      keypos),
+                                     prev_key, prev_key, &anc_key_inserted);
+      if (t_length >= 0)
+	bmove_upp(anc_end_pos+t_length, anc_end_pos,
+                  (uint) (anc_end_pos - keypos));
+      else
+	bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length);
+      (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted);
+      new_anc_length+= t_length;
+      anc_page->size= new_anc_length;
+      page_store_size(share, anc_page);
+
+      if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
+                           SEARCH_PAGE_KEY_HAS_TRANSID))
+        _ma_mark_page_with_transid(share, anc_page);
+
+      /* Store key first in new page */
+      if (nod_flag)
+	bmove(next_page.buff + share->keypage_header, half_pos-nod_flag,
+              (size_t) nod_flag);
+      if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos))
+	goto err;
+      t_length=(int) (*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0,
+					  (uchar*) 0, (uchar*) 0,
+					  &key_inserted);
+      /* t_length will always be > 0 for a new page !*/
+      tmp_length= (size_t) ((next_page.buff + buff_length) - half_pos);
+      bmove(next_page.buff + p_length + t_length, half_pos, tmp_length);
+      (*keyinfo->store_key)(keyinfo, next_page.buff + p_length, &key_inserted);
+      new_buff_length= tmp_length + t_length + p_length;
+      next_page.size= new_buff_length;
+      page_store_size(share, &next_page);
+      /* keypage flag is already up to date */
+
+      if (share->now_transactional)
+      {
+        /*
+          Log changes to parent page
+          This has one key deleted from it and one key inserted to it at
+          keypos
+
+          ma_log_add ensures that we don't log changes that is outside of
+          key block size, as the REDO code can't handle that
+        */
+        if (_ma_log_add(anc_page, anc_length, keypos,
+                        anc_key_inserted.move_length +
+                        max(anc_key_inserted.changed_length -
+                            anc_key_inserted.move_length,
+                            key_deleted.changed_length),
+                        anc_key_inserted.move_length -
+                        key_deleted.move_length, 1,
+                        KEY_OP_DEBUG_LOG_ADD_3))
+          goto err;
+
+        /*
+          Log changes to leaf page.
+          This contains original data with new data added at end
+        */
+        DBUG_ASSERT(leaf_length <= new_leaf_length);
+        if (_ma_log_suffix(leaf_page, leaf_length, new_leaf_length))
+          goto err;
+        /*
+          Log changes to next page
+
+          This contains original data with some prefix data deleted and
+          some compressed data at start possible extended
+
+          Data in buff was originally:
+          org_leaf_buff     [leaf_length]
+          separator_key     [buff_key_inserted.move_length]
+          next_key_changes  [buff_key_inserted.changed_length -move_length]
+          next_page_data    [next_buff_length - p_length -
+                            (buff_key_inserted.changed_length -move_length)]
+
+          After changes it's now:
+          unpacked_key      [key_inserted.changed_length]
+          next_suffix       [next_buff_length - key_inserted.changed_length]
+
+        */
+        DBUG_ASSERT(new_buff_length <= next_buff_length);
+        if (_ma_log_prefix(&next_page, key_inserted.changed_length,
+                           (int) (new_buff_length - next_buff_length),
+                           KEY_OP_DEBUG_LOG_PREFIX_1))
+          goto err;
+      }
+      page_mark_changed(info, &next_page);
+      if (_ma_write_keypage(&next_page,
+                            PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+	goto err;
+    }
+
+    page_mark_changed(info, leaf_page);
+    if (_ma_write_keypage(leaf_page,
+                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+      goto err;
+    DBUG_RETURN(new_anc_length <=
+                ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+                  (uint) keyinfo->underflow_block_length)));
+  }
+
+  DBUG_PRINT("test",("use left page"));
+
+  keypos= _ma_get_last_key(&anc_key, anc_page, keypos);
+  if (!keypos)
+    goto err;
+  next_page.pos= _ma_kpos(key_reflength,keypos);
+  if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+                        PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, info->buff, 0))
+    goto err;
+  buff_length= next_page.size;
+  endpos= next_page.buff + buff_length;
+  DBUG_DUMP("prev", next_page.buff, next_page.size);
+
+  /* find keys to make a big key-page */
+  bmove(next_keypos - key_reflength, leaf_buff + share->keypage_header,
+        key_reflength);
+  next_keypos=keypos;
+  if (!(*keyinfo->get_key)(&anc_key, anc_page_flag, key_reflength,
+                           &next_keypos))
+    goto err;
+  if (!_ma_get_last_key(&leaf_key, &next_page, endpos))
+    goto err;
+
+  /* merge pages and put parting key from anc_page between */
+  prev_key= (leaf_length == p_length ? (uchar*) 0 : leaf_key.data);
+  t_length=(*keyinfo->pack_key)(&anc_key, nod_flag,
+				(leaf_length == p_length ?
+                                 (uchar*) 0 : leaf_buff+p_length),
+				prev_key, prev_key,
+				&key_inserted);
+  if (t_length >= 0)
+    bmove(endpos+t_length, leaf_buff+p_length,
+          (size_t) (leaf_length-p_length));
+  else						/* We gained space */
+    bmove(endpos,leaf_buff+((int) p_length-t_length),
+	  (size_t) (leaf_length-p_length+t_length));
+  (*keyinfo->store_key)(keyinfo,endpos, &key_inserted);
+
+  /* Remember for logging how many bytes of leaf_buff that are not changed */
+  DBUG_ASSERT((int) key_inserted.changed_length >= key_inserted.move_length);
+  unchanged_leaf_length= (leaf_length - p_length -
+                          (key_inserted.changed_length -
+                           key_inserted.move_length));
+
+  new_buff_length= buff_length + leaf_length - p_length + t_length;
+
+#ifdef EXTRA_DEBUG
+  /* Ensure that unchanged_leaf_length is correct */
+  DBUG_ASSERT(bcmp(next_page.buff + new_buff_length - unchanged_leaf_length,
+                   leaf_buff + leaf_length - unchanged_leaf_length,
+                   unchanged_leaf_length) == 0);
+#endif
+
+  page_flag= next_page.flag | leaf_page->flag;
+  if (anc_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
+                       SEARCH_PAGE_KEY_HAS_TRANSID))
+    page_flag|= KEYPAGE_FLAG_HAS_TRANSID;
+
+  next_page.size= new_buff_length;
+  next_page.flag= page_flag;
+  page_store_info(share, &next_page);
+
+  /* remove key from anc_page */
+  if (!(s_length= remove_key(keyinfo, anc_page_flag, key_reflength, keypos,
+                             anc_key_buff,
+                             anc_buff+anc_length, (my_off_t *) 0,
+                             &key_deleted)))
+    goto err;
+
+  new_anc_length= anc_length - s_length;
+  anc_page->size= new_anc_length;
+  page_store_size(share, anc_page);
+
+  if (new_buff_length <= share->max_index_block_size)
+  {
+    /* All keys fitted into one page */
+    page_mark_changed(info, leaf_page);
+    if (_ma_dispose(info, leaf_page->pos, 0))
+      goto err;
+
+    if (share->now_transactional)
+    {
+      /*
+        Log changes to parent page. Note that this page may have been
+        temporarily bigger than block_size.
+      */
+      if (_ma_log_delete(anc_page, key_deleted.key_pos,
+                         key_deleted.changed_length, key_deleted.move_length,
+                         anc_length - anc_page->org_size,
+                         KEY_OP_DEBUG_LOG_DEL_CHANGE_3))
+        goto err;
+      /*
+        Log changes to next page. Data for leaf page is in buff
+        that contains original leaf_buff, parting key and next_buff
+      */
+      if (_ma_log_suffix(&next_page, buff_length, new_buff_length))
+        goto err;
+    }
+  }
+  else
+  {
+    /*
+      Balancing didn't free a page, so we have to split 'next_page' into two
+      pages
+      - Find key in middle of buffer (buff)
+      - Pack key at half_buff into anc_page at position of deleted key
+        Note that anc_page may overflow! (is handled by caller)
+      - Move everything after middlekey to 'leaf_buff'
+      - Shorten buff at 'endpos'
+    */
+    MARIA_KEY_PARAM anc_key_inserted;
+    size_t tmp_length;
+
+    if (keypos == anc_buff + share->keypage_header + key_reflength)
+      anc_pos= 0;				/* First key */
+    else
+    {
+      if (!_ma_get_last_key(&anc_key, anc_page, keypos))
+        goto err;
+      anc_pos= anc_key.data;
+    }
+    if (!(endpos= _ma_find_half_pos(&leaf_key, &next_page, &half_pos)))
+      goto err;
+
+    /* Correct new keypointer to leaf_page */
+    _ma_kpointer(info,leaf_key.data + leaf_key.data_length +
+                 leaf_key.ref_length, leaf_page->pos);
+
+    /* Save key in anc_page */
+    DBUG_DUMP("anc_buff", anc_buff, new_anc_length);
+    DBUG_DUMP_KEY("key_to_anc", &leaf_key);
+    anc_end_pos= anc_buff + new_anc_length;
+    t_length=(*keyinfo->pack_key)(&leaf_key, key_reflength,
+				  keypos == anc_end_pos ? (uchar*) 0
+				  : keypos,
+				  anc_pos, anc_pos,
+				  &anc_key_inserted);
+    if (t_length >= 0)
+      bmove_upp(anc_end_pos+t_length, anc_end_pos,
+                (uint) (anc_end_pos-keypos));
+    else
+      bmove(keypos,keypos-t_length,(uint) (anc_end_pos-keypos)+t_length);
+    (*keyinfo->store_key)(keyinfo,keypos, &anc_key_inserted);
+    new_anc_length+= t_length;
+    anc_page->size= new_anc_length;
+    page_store_size(share, anc_page);
+
+    if (leaf_key.flag & (SEARCH_USER_KEY_HAS_TRANSID |
+                         SEARCH_PAGE_KEY_HAS_TRANSID))
+      _ma_mark_page_with_transid(share, anc_page);
+
+    /* Store first key on new page */
+    if (nod_flag)
+      bmove(leaf_buff + share->keypage_header, half_pos-nod_flag,
+            (size_t) nod_flag);
+    if (!(*keyinfo->get_key)(&leaf_key, page_flag, nod_flag, &half_pos))
+      goto err;
+    DBUG_DUMP_KEY("key_to_leaf", &leaf_key);
+    t_length=(*keyinfo->pack_key)(&leaf_key, nod_flag, (uchar*) 0,
+				  (uchar*) 0, (uchar*) 0, &key_inserted);
+    /* t_length will always be > 0 for a new page !*/
+    tmp_length= (size_t) ((next_page.buff + new_buff_length) - half_pos);
+    DBUG_PRINT("info",("t_length: %d  length: %d",t_length, (int) tmp_length));
+    bmove(leaf_buff+p_length+t_length, half_pos, tmp_length);
+    (*keyinfo->store_key)(keyinfo,leaf_buff+p_length, &key_inserted);
+    new_leaf_length= tmp_length + t_length + p_length;
+
+    leaf_page->size= new_leaf_length;
+    leaf_page->flag= page_flag;
+    page_store_info(share, leaf_page);
+
+    new_buff_length= (uint) (endpos - next_page.buff);
+    next_page.size= new_buff_length;
+    page_store_size(share, &next_page);
+
+    if (share->now_transactional)
+    {
+      /*
+        Log changes to parent page
+        This has one key deleted from it and one key inserted to it at
+        keypos
+
+        ma_log_add() ensures that we don't log changes that is outside of
+        key block size, as the REDO code can't handle that
+      */
+      if (_ma_log_add(anc_page, anc_length, keypos,
+                      anc_key_inserted.move_length +
+                      max(anc_key_inserted.changed_length -
+                          anc_key_inserted.move_length,
+                          key_deleted.changed_length),
+                      anc_key_inserted.move_length -
+                      key_deleted.move_length, 1,KEY_OP_DEBUG_LOG_ADD_4))
+        goto err;
+
+      /*
+        Log changes to leaf page.
+        This contains original data with new data added first
+      */
+      DBUG_ASSERT(leaf_length <= new_leaf_length);
+      DBUG_ASSERT(new_leaf_length >= unchanged_leaf_length);
+      if (_ma_log_prefix(leaf_page, new_leaf_length - unchanged_leaf_length,
+                         (int) (new_leaf_length - leaf_length),
+                         KEY_OP_DEBUG_LOG_PREFIX_2))
+        goto err;
+      /*
+        Log changes to next page
+        This contains original data with some suffix data deleted
+
+      */
+      DBUG_ASSERT(new_buff_length <= buff_length);
+      if (_ma_log_suffix(&next_page, buff_length, new_buff_length))
+        goto err;
+    }
+
+    page_mark_changed(info, leaf_page);
+    if (_ma_write_keypage(leaf_page,
+                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+      goto err;
+  }
+  page_mark_changed(info, &next_page);
+  if (_ma_write_keypage(&next_page,
+                        PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+    goto err;
+
+  DBUG_RETURN(new_anc_length <=
+              ((info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH :
+                (uint) keyinfo->underflow_block_length)));
+
+err:
+  DBUG_RETURN(-1);
+} /* underflow */
+
+
+/**
+  @brief Remove a key from page
+
+  @fn remove_key()
+    keyinfo	          Key handle
+    page_flag             Key page flags (KEYPAGE_FLAG_HAS_TRANSID is checked)
+    nod_flag              Length of node ptr
+    keypos	          Where on page key starts
+    lastkey	          Buffer for storing keys to be removed
+    page_end	          Pointer to end of page
+    next_block	          If <> 0 and node-page, this is set to address of
+    		          next page
+    s_temp	          Information about what changes were done on the page:
+    s_temp.key_pos        Start of key
+    s_temp.move_length    Number of bytes removed at keypos
+    s_temp.changed_length Number of bytes changed at keypos
+  @todo
+    The current code doesn't handle the case that the next key may be
+    packed better against the previous key if there is a case difference
+
+  @return
+  @retval 0  error
+  @retval #  How many chars were removed
+*/
+
+static uint remove_key(MARIA_KEYDEF *keyinfo, uint page_flag, uint nod_flag,
+		       uchar *keypos, uchar *lastkey,
+		       uchar *page_end, my_off_t *next_block,
+                       MARIA_KEY_PARAM *s_temp)
+{
+  int s_length;
+  uchar *start;
+  DBUG_ENTER("remove_key");
+  DBUG_PRINT("enter", ("keypos: %p  page_end: %p",
+                       (void*) keypos, (void*) page_end));
+
+  start= s_temp->key_pos= keypos;
+  s_temp->changed_length= 0;		/* Set if the next key is rewritten */
+  if (!(keyinfo->flag &
+	(HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+	 HA_BINARY_PACK_KEY)) &&
+      !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    /* Static length key */
+    s_length=(int) (keyinfo->keylength+nod_flag);
+    if (next_block && nod_flag)
+      *next_block= _ma_kpos(nod_flag,keypos+s_length);
+  }
+  else
+  {
+    /* Let keypos point at next key */
+    MARIA_KEY key;
+
+    /* Calculate length of key */
+    key.keyinfo= keyinfo;
+    key.data=    lastkey;
+    if (!(*keyinfo->get_key)(&key, page_flag, nod_flag, &keypos))
+      DBUG_RETURN(0);				/* Error */
+
+    if (next_block && nod_flag)
+      *next_block= _ma_kpos(nod_flag,keypos);
+    s_length=(int) (keypos-start);
+    if (keypos != page_end)			/* A key follows the removed one */
+    {
+      if (keyinfo->flag & HA_BINARY_PACK_KEY)
+      {
+	uchar *old_key= start;
+	uint next_length,prev_length,prev_pack_length;
+
+        /* keypos points here on start of next key */
+	get_key_length(next_length,keypos);
+	get_key_pack_length(prev_length,prev_pack_length,old_key);
+	if (next_length > prev_length)
+	{
+          uint diff= (next_length-prev_length);
+	  /* We have to copy data from the current key to the next key */
+	  keypos-= diff + prev_pack_length;
+	  store_key_length(keypos, prev_length);
+          bmove(keypos + prev_pack_length, lastkey + prev_length, diff);
+	  s_length=(int) (keypos-start);
+          s_temp->changed_length= diff + prev_pack_length;
+	}
+      }
+      else
+      {
+	/* Check if a variable length first key part */
+	if ((keyinfo->seg->flag & HA_PACK_KEY) && *keypos & 128)
+	{
+	  /* Next key is packed against the current one */
+	  uint next_length,prev_length,prev_pack_length,lastkey_length,
+	    rest_length;
+	  if (keyinfo->seg[0].length >= 127)
+	  {
+	    if (!(prev_length=mi_uint2korr(start) & 32767))
+	      goto end;
+	    next_length=mi_uint2korr(keypos) & 32767;
+	    keypos+=2;
+	    prev_pack_length=2;
+	  }
+	  else
+	  {
+	    if (!(prev_length= *start & 127))
+	      goto end;				/* Same key as previous*/
+	    next_length= *keypos & 127;
+	    keypos++;
+	    prev_pack_length=1;
+	  }
+	  if (!(*start & 128))
+	    prev_length=0;			/* prev key not packed */
+	  if (keyinfo->seg[0].flag & HA_NULL_PART)
+	    lastkey++;				/* Skip null marker */
+	  get_key_length(lastkey_length,lastkey);
+	  if (!next_length)			/* Same key after */
+	  {
+	    next_length=lastkey_length;
+	    rest_length=0;
+	  }
+	  else
+	    get_key_length(rest_length,keypos);
+
+	  if (next_length >= prev_length)
+	  {
+            /* Next key is based on deleted key */
+            uint pack_length;
+            uint diff= (next_length-prev_length);
+
+            /* keypos points to data of next key (after key length) */
+	    bmove(keypos - diff, lastkey + prev_length, diff);
+	    rest_length+= diff;
+	    pack_length= prev_length ? get_pack_length(rest_length): 0;
+	    keypos-= diff + pack_length + prev_pack_length;
+	    s_length=(int) (keypos-start);
+	    if (prev_length)			/* Pack against prev key */
+	    {
+	      *keypos++= start[0];
+	      if (prev_pack_length == 2)
+		*keypos++= start[1];
+	      store_key_length(keypos,rest_length);
+	    }
+	    else
+	    {
+	      /* Next key is not packed anymore */
+	      if (keyinfo->seg[0].flag & HA_NULL_PART)
+	      {
+		rest_length++;			/* Mark not null */
+	      }
+	      if (prev_pack_length == 2)
+	      {
+		mi_int2store(keypos,rest_length);
+	      }
+	      else
+		*keypos= rest_length;
+	    }
+            s_temp->changed_length= diff + pack_length + prev_pack_length;
+	  }
+	}
+      }
+    }
+  }
+  end:				/* Move rest of page over the removed bytes */
+  bmove(start, start+s_length, (uint) (page_end-start-s_length));
+  s_temp->move_length= s_length;
+  DBUG_RETURN((uint) s_length);
+} /* remove_key */
+
+
+/****************************************************************************
+  Logging of redos
+****************************************************************************/
+
+/**
+   @brief
+   log entry where some parts are deleted and some things are changed
+   and some data could be added last.
+
+   @fn _ma_log_delete()
+   @param ma_page         Changed page; its buff, pos, size, org_size are used
+   @param key_pos         Start of change area
+   @param changed_length  How many bytes were changed at key_pos
+   @param move_length     How many bytes were deleted at key_pos
+   @param append_length	  Length of data added last (only used in assertions)
+		          This is taken from end of ma_page->buff
+   @param debug_marker    Logged only if EXTRA_DEBUG_KEY_CHANGES is defined
+
+   This is mainly used when a key is deleted. The append happens
+   when we delete a key from a page with data > block_size kept in
+   memory and we have to add back the data that was stored > block_size
+   @return 0 ok, 1 if translog_write_record() failed
+*/
+
+my_bool _ma_log_delete(MARIA_PAGE *ma_page, const uchar *key_pos,
+                       uint changed_length, uint move_length,
+                       uint append_length __attribute__((unused)),
+                       enum en_key_debug debug_marker __attribute__((unused)))
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 5+ 2 + 3 + 3 + 6 + 3 + 7];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 7];
+  uint translog_parts, current_size, extra_length;
+  uint offset= (uint) (key_pos - ma_page->buff);
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  my_off_t page= ma_page->pos / share->block_size;
+  DBUG_ENTER("_ma_log_delete");
+  DBUG_PRINT("enter", ("page: %lu  changed_length: %u  move_length: %d",
+                       (ulong) page, changed_length, move_length));
+  DBUG_ASSERT(share->now_transactional && move_length);
+  DBUG_ASSERT(offset + changed_length <= ma_page->size);
+  DBUG_ASSERT(ma_page->org_size - move_length + append_length == ma_page->size);
+  DBUG_ASSERT(move_length <= ma_page->org_size - share->keypage_header);
+
+  /* Store page number of the changed page */
+  page_store(log_data + FILEID_STORE_SIZE, page);
+  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+  current_size= ma_page->org_size;	/* Size the page has in the log */
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  *log_pos++= KEY_OP_DEBUG;
+  *log_pos++= debug_marker;
+
+  *log_pos++= KEY_OP_DEBUG_2;
+  int2store(log_pos,   ma_page->org_size);
+  int2store(log_pos+2, ma_page->size);
+  log_pos+=4;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  log_pos[0]= KEY_OP_OFFSET;
+  int2store(log_pos+1, offset);
+  log_pos+= 3;
+  translog_parts= TRANSLOG_INTERNAL_PARTS + 1;
+  extra_length= 0;
+
+  if (changed_length)
+  {
+    if (offset + changed_length >= share->max_index_block_size)
+    {
+      changed_length= share->max_index_block_size - offset;
+      move_length= 0;                           /* Nothing to move */
+      current_size= share->max_index_block_size;
+    }
+
+    log_pos[0]= KEY_OP_CHANGE;
+    int2store(log_pos+1, changed_length);
+    log_pos+= 3;
+    log_array[translog_parts].str=    ma_page->buff + offset;
+    log_array[translog_parts].length= changed_length;
+    translog_parts++;
+
+    /* We only have to move things after offset+changed_length */
+    offset+= changed_length;
+  }
+  /* First part: header + ops so far; later ops are added as own parts */
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);
+
+  if (move_length)
+  {
+    uint log_length;
+    if (offset + move_length < share->max_index_block_size)
+    {
+      /*
+        Move down things that is on page.
+        page_offset in apply_redo_index() will be at original offset
+        + changed_length.
+      */
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, - (int) move_length);
+      log_length= 3;
+      current_size-= move_length;
+    }
+    else
+    {
+      /* Delete to end of page */
+      uint tmp= current_size - offset;
+      current_size= offset;
+      log_pos[0]= KEY_OP_DEL_SUFFIX;
+      int2store(log_pos+1, tmp);
+      log_length= 3;
+    }
+    log_array[translog_parts].str=    log_pos;
+    log_array[translog_parts].length= log_length;
+    translog_parts++;
+    log_pos+= log_length;
+    extra_length+= log_length;
+  }
+
+  if (current_size != ma_page->size &&
+      current_size != share->max_index_block_size)
+  {
+    /* Append data that didn't fit on the page before */
+    uint length= (min(ma_page->size, share->max_index_block_size) -
+                  current_size);
+    uchar *data= ma_page->buff + current_size;
+
+    DBUG_ASSERT(length <= append_length);
+
+    log_pos[0]= KEY_OP_ADD_SUFFIX;
+    int2store(log_pos+1, length);
+    log_array[translog_parts].str=        log_pos;
+    log_array[translog_parts].length=     3;
+    log_array[translog_parts + 1].str=    data;
+    log_array[translog_parts + 1].length= length;
+    log_pos+= 3;
+    translog_parts+= 2;
+    current_size+= length;
+    extra_length+= 3 + length;
+  }
+
+  _ma_log_key_changes(ma_page,
+                      log_array + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= current_size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            (translog_size_t)
+                            log_array[TRANSLOG_INTERNAL_PARTS].length +
+                            changed_length + extra_length, translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+  Logging of undos
+****************************************************************************/
+/* Log the UNDO of a key delete; returns 0 if the record was written */
+my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key,
+                                  my_off_t new_root, LSN *res_lsn)
+{
+  MARIA_SHARE *share= info->s;
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                 KEY_NR_STORE_SIZE + PAGE_STORE_SIZE], *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  struct st_msg_to_write_hook_for_undo_key msg;
+  enum translog_record_type log_type= LOGREC_UNDO_KEY_DELETE;
+  uint keynr= key->keyinfo->key_nr;
+  /* Layout: LSN, FILEID_STORE_SIZE bytes (not set here), key nr, [root] */
+  lsn_store(log_data, info->trn->undo_lsn);
+  key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, keynr);
+  log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE;
+
+  /**
+    @todo BUG if we had concurrent insert/deletes, reading state's key_root
+    like this would be unsafe.
+  */
+  if (new_root != share->state.key_root[keynr])
+  {
+    my_off_t page;
+    page= ((new_root == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO :
+           new_root / share->block_size);
+    page_store(log_pos, page);
+    log_pos+= PAGE_STORE_SIZE;
+    log_type= LOGREC_UNDO_KEY_DELETE_WITH_ROOT;	/* Root page is logged */
+  }
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key->data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (key->data_length +
+                                                  key->ref_length);
+
+  msg.root= &share->state.key_root[keynr];
+  msg.value= new_root;			/* msg is passed to the log write */
+  /*
+    set autoincrement to 1 if this is an auto_increment key
+    This is only used if we are now in a rollback of a duplicate key
+  */
+  msg.auto_increment= share->base.auto_key == keynr + 1;
+
+  return translog_write_record(res_lsn, log_type,
+                               info->trn, info,
+                               (translog_size_t)
+                               (log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+                                log_array[TRANSLOG_INTERNAL_PARTS + 1].length),
+                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                               log_data + LSN_STORE_SIZE, &msg) ? -1 : 0;
+}
diff --git a/storage/maria/ma_delete_all.c b/storage/maria/ma_delete_all.c
new file mode 100644
index 00000000000..4661ea0ab59
--- /dev/null
+++ b/storage/maria/ma_delete_all.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Remove all rows from a MARIA table */
+/* This clears the status information and truncates files */
+
+#include "maria_def.h"
+#include "trnman.h"
+
+/**
+   @brief deletes all rows from a table
+
+   @param  info             Maria handler
+
+   @note It is important that this function does not rely on the state
+   information, as it may be called by ma_apply_undo_bulk_insert() on an
+   inconsistent table left by a crash.
+
+   @return Operation status
+     @retval 0      ok
+     @retval #      error; the value of my_errno (EACCES if table read-only)
+*/
+
+int maria_delete_all_rows(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool log_record;
+  LSN lsn;
+  DBUG_ENTER("maria_delete_all_rows");
+
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  /**
+     @todo LOCK take X-lock on table here.
+     When we have versioning, if some other thread is looking at this table,
+     we cannot shrink the file like this.
+  */
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+  log_record= share->now_transactional && !share->temporary;
+  if (_ma_mark_file_changed(info))
+    goto err;
+
+  if (log_record)
+  {
+    /*
+      This record will be used by Recovery to finish the deletion if it
+      crashed. We force it to have a complete history in the log.
+    */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[FILEID_STORE_SIZE];
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DELETE_ALL,
+                                       info->trn, info, 0,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, log_data, NULL) ||
+                 translog_flush(lsn)))
+      goto err;
+    /*
+      If we fail in this function after this point, log and table will be
+      inconsistent.
+    */
+  }
+  else
+  {
+    /* Other branch called function below when writing log record, in hook */
+    _ma_reset_status(info);
+  }
+  /* Remove old history as the table is now empty for everyone */
+  _ma_reset_state(info);
+
+  /*
+    If we are using delayed keys or if the user has done changes to the tables
+    since it was locked then there may be key blocks in the page cache. Or
+    there may be data blocks there. We need to throw them away or they may
+    re-enter the emptied table or another table later.
+  */
+
+#ifdef HAVE_MMAP
+  if (share->file_map)
+    _ma_unmap_file(info);
+#endif
+  /* Drop cached pages; cut data file to 0 and index file to keystart */
+  if (_ma_flush_table_files(info, MARIA_FLUSH_DATA|MARIA_FLUSH_INDEX,
+                            FLUSH_IGNORE_CHANGED, FLUSH_IGNORE_CHANGED) ||
+      my_chsize(info->dfile.file, 0, 0, MYF(MY_WME)) ||
+      my_chsize(share->kfile.file, share->base.keystart, 0, MYF(MY_WME)))
+    goto err;
+
+  if (_ma_initialize_data_file(share, info->dfile.file))
+    goto err;
+
+  if (log_record)
+  {
+    /*
+      Because LOGREC_REDO_DELETE_ALL does not operate on pages, it has the
+      following problem:
+      delete_all; inserts (redo_insert); all pages get flushed; checkpoint:
+      the dirty pages list will be empty. In recovery, delete_all is executed,
+      but redo_insert are skipped (dirty pages list is empty).
+      To avoid this, we need to set skip_redo_lsn now, and thus need to sync
+      files.
+      Also fixes the problem of:
+      bulk insert; insert; delete_all; crash:
+      "bulk insert" is skipped (no REDOs), so if "insert" would not be skipped
+      (if we didn't update skip_redo_lsn below) then "insert" would be tried
+      and fail, saying that it sees that the first page has to be created
+      though the inserted row has rownr>0.
+    */
+    my_bool error= _ma_state_info_write(share,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                        MA_STATE_INFO_WRITE_LOCK) ||
+      _ma_update_state_lsns(share, lsn, trnman_get_min_trid(), FALSE, FALSE) ||
+      _ma_sync_table_files(info);
+    info->trn->rec_lsn= LSN_IMPOSSIBLE;
+    if (error)
+      goto err;
+  }
+
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+#ifdef HAVE_MMAP
+  /* Map again */
+  if (share->file_map)
+    _ma_dynmap_file(info, (my_off_t) 0);
+#endif
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  DBUG_RETURN(0);
+
+err:
+  {
+    int save_errno=my_errno;		/* Keep error over _ma_writeinfo() */
+    VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+    info->update|=HA_STATE_WRITTEN;	/* Buffer changed */
+    allow_break();			/* Allow SIGHUP & SIGINT */
+    DBUG_RETURN(my_errno=save_errno);
+  }
+} /* maria_delete_all_rows */
+
+
+/*
+  Make the in-memory state describe an empty table
+
+  SYNOPSIS
+    _ma_reset_status()
+    info	Maria handler
+
+  DESCRIPTION
+    Only the state kept in memory for the share and the handler is reset
+    (row counts, file lengths, delete links, key roots); no file is touched.
+*/
+
+void _ma_reset_status(MARIA_HA *info)
+{
+  MARIA_SHARE *table_share= info->s;
+  MARIA_STATE_INFO *full_state= &table_share->state;
+  uint keynr= table_share->base.keys;
+  DBUG_ENTER("_ma_reset_status");
+
+  /* Status part as for an empty table; the handler gets a copy of it */
+  full_state->state.records= 0;
+  full_state->state.del= 0;
+  full_state->state.empty= 0;
+  full_state->state.key_empty= 0;
+  full_state->state.checksum= 0;
+  full_state->state.data_file_length= 0;
+  full_state->state.key_file_length= table_share->base.keystart;
+  *info->state= full_state->state;
+  /* Fields outside the status part: no delete chains and no key roots */
+  full_state->split= 0;
+  full_state->changed= 0;                        /* File is optimized */
+  full_state->sortkey= (ushort) ~0;
+  full_state->dellink= HA_OFFSET_ERROR;
+  full_state->key_del= HA_OFFSET_ERROR;
+  while (keynr--)
+    full_state->key_root[keynr]= HA_OFFSET_ERROR;
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_delete_table.c b/storage/maria/ma_delete_table.c
new file mode 100644
index 00000000000..0237bb884c5
--- /dev/null
+++ b/storage/maria/ma_delete_table.c
@@ -0,0 +1,107 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+   @brief drops (deletes) a table
+
+   @param  name             table's name
+
+   @return Operation status
+     @retval 0      ok
+     @retval #      error: 1 if logging failed, else my_errno of the delete
+*/
+
+int maria_delete_table(const char *name)
+{
+  char from[FN_REFLEN];
+#ifdef USE_RAID
+  uint raid_type=0,raid_chunks=0;
+#endif
+  MARIA_HA *info;
+  myf sync_dir;
+  DBUG_ENTER("maria_delete_table");
+
+#ifdef EXTRA_DEBUG
+  _ma_check_table_is_closed(name,"delete");
+#endif
+  /** @todo LOCK take X-lock on table */
+  /*
+    We need to know if this table is transactional.
+    When built with RAID support, we also need to determine if this table
+    makes use of the raid feature. If yes, we need to remove all raid
+    chunks. This is done with my_raid_delete(). Unfortunately it is
+    necessary to open the table just to check this. We use
+    'open_for_repair' to be able to open even a crashed table. If even
+    this open fails, we assume no raid configuration for this table
+    and try to remove the normal data file only. This may however
+    leave the raid chunks behind.
+  */
+  if (!(info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR)))
+  {
+#ifdef USE_RAID
+    raid_type= 0;
+#endif
+    sync_dir= 0;			/* Open failed: nothing is logged */
+  }
+  else
+  {
+#ifdef USE_RAID
+    raid_type=      info->s->base.raid_type;
+    raid_chunks=    info->s->base.raid_chunks;
+#endif
+    sync_dir= (info->s->now_transactional && !info->s->temporary &&
+               !maria_in_recovery) ?
+      MY_SYNC_DIR : 0;
+    maria_close(info);			/* Was opened only to read the flags */
+  }
+
+  if (sync_dir)
+  {
+    /*
+      For this log record to be of any use for Recovery, we need the upper
+      MySQL layer to be crash-safe in DDLs.
+      For now this record can serve when we apply logs to a backup, so we sync
+      it.
+    */
+    LSN lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)name;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= strlen(name) + 1;
+    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_DROP_TABLE,
+                                       &dummy_transaction_object, NULL,
+                                       (translog_size_t)
+                                       log_array[TRANSLOG_INTERNAL_PARTS +
+                                                 0].length,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+      DBUG_RETURN(1);
+  }
+
+  fn_format(from,name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  if (my_delete_with_symlink(from, MYF(MY_WME | sync_dir)))
+    DBUG_RETURN(my_errno);
+  fn_format(from,name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+#ifdef USE_RAID
+  if (raid_type)
+    DBUG_RETURN(my_raid_delete(from, raid_chunks, MYF(MY_WME | sync_dir)) ?
+                my_errno : 0);
+#endif
+  DBUG_RETURN(my_delete_with_symlink(from, MYF(MY_WME | sync_dir)) ?
+              my_errno : 0);
+}
diff --git a/storage/maria/ma_dynrec.c b/storage/maria/ma_dynrec.c
new file mode 100644
index 00000000000..57b76b713f4
--- /dev/null
+++ b/storage/maria/ma_dynrec.c
@@ -0,0 +1,2042 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Functions to handle space-packed-records and blobs
+
+  A row may be stored in one or more linked blocks.
+  The block size is between MARIA_MIN_BLOCK_LENGTH and MARIA_MAX_BLOCK_LENGTH.
+  Each block is aligned on MARIA_DYN_ALIGN_SIZE.
+  The reason for the max block size is to not have too many different types
+  of blocks.  For the different block types, look at _ma_get_block_info()
+*/
+
+#include "maria_def.h"
+
+static my_bool write_dynamic_record(MARIA_HA *info,const uchar *record,
+                                    ulong reclength);
+static int _ma_find_writepos(MARIA_HA *info,ulong reclength,my_off_t *filepos,
+			     ulong *length);
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+                                     uchar *record, ulong reclength);
+static my_bool delete_dynamic_record(MARIA_HA *info,MARIA_RECORD_POS filepos,
+                                     uint second_read);
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+                              uint length);
+
+#ifdef THREAD
+/* Play it safe; We have a small stack when using threads */
+#undef my_alloca
+#undef my_afree
+#define my_alloca(A) my_malloc((A),MYF(0))
+#define my_afree(A) my_free((A),MYF(0))
+#endif
+
+	/* Interface function from MARIA_HA */
+
+#ifdef HAVE_MMAP
+
+/*
+  Create mmaped area for MARIA handler
+
+  SYNOPSIS
+    _ma_dynmap_file()
+    info		MARIA handler
+    size		Number of bytes of the data file to map
+
+  NOTES
+    The mapping is created MEMMAP_EXTRA_MARGIN bytes longer than 'size'.
+    If my_mmap() fails, both file_map and mmaped_length are reset so that
+    the share never advertises a mapped length without a mapping behind it.
+    If 'size' is rejected as too large, the share is left untouched.
+
+  RETURN
+    0  ok
+    1  error.
+*/
+
+my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size)
+{
+  DBUG_ENTER("_ma_dynmap_file");
+  /* size + MEMMAP_EXTRA_MARGIN must be representable as a size_t */
+  if (size > (my_off_t) (~((size_t) 0)) - MEMMAP_EXTRA_MARGIN)
+  {
+    DBUG_PRINT("warning", ("File is too large for mmap"));
+    DBUG_RETURN(1);
+  }
+  /*
+    Ingo wonders if it is good to use MAP_NORESERVE. From the Linux man page:
+    MAP_NORESERVE
+      Do not reserve swap space for this mapping. When swap space is
+      reserved, one has the guarantee that it is possible to modify the
+      mapping. When swap space is not reserved one might get SIGSEGV
+      upon a write if no physical memory is available.
+  */
+  info->s->file_map= (uchar*)
+                  my_mmap(0, (size_t)(size + MEMMAP_EXTRA_MARGIN),
+                          info->s->mode==O_RDONLY ? PROT_READ :
+                          PROT_READ | PROT_WRITE,
+                          MAP_SHARED | MAP_NORESERVE,
+                          info->dfile.file, 0L);
+  if (info->s->file_map == (uchar*) MAP_FAILED)
+  {
+    info->s->file_map= NULL;
+    /*
+      Without this a stale mmaped_length could satisfy the range test in
+      _ma_mmap_pread()/_ma_mmap_pwrite() and make them copy through NULL.
+    */
+    info->s->mmaped_length= 0;
+    DBUG_RETURN(1);
+  }
+#if defined(HAVE_MADVISE)
+  madvise((char*) info->s->file_map, size, MADV_RANDOM);
+#endif
+  info->s->mmaped_length= size;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Resize mmaped area for MARIA handler
+
+  SYNOPSIS
+    _ma_remap_file()
+    info		MARIA handler
+    size		New number of bytes of the data file to map
+
+  NOTES
+    Does nothing if the share has no mapping.
+    The old mapping is dropped before the new one is created. If the new
+    mapping cannot be created the share is left without a mapping
+    (file_map == NULL, mmaped_length == 0).
+
+  RETURN
+    none
+*/
+
+void _ma_remap_file(MARIA_HA *info, my_off_t size)
+{
+  if (info->s->file_map)
+  {
+    VOID(my_munmap((char*) info->s->file_map,
+                   (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+    /*
+      Forget the old mapping right away: _ma_dynmap_file() can return
+      early without touching the share, which would otherwise leave a
+      dangling file_map and a stale mmaped_length behind.
+    */
+    info->s->file_map= NULL;
+    info->s->mmaped_length= 0;
+    /* A failure here is not an error for the caller; see NOTES */
+    _ma_dynmap_file(info, size);
+  }
+}
+#endif
+
+
+/*
+  Read bytes from the MARIA data file, using mmap or pread
+
+  SYNOPSIS
+    _ma_mmap_pread()
+    info		MARIA handler
+    Buffer              Destination buffer
+    Count               Number of bytes to read
+    offset              Start position in the data file
+    MyFlags             Flags passed on to my_pread()
+
+  NOTES
+    mmap_lock is taken for reading only when lock_key_trees is set.
+
+  RETURN
+    0 when the request was served from the mapped area; otherwise the
+    return value of my_pread().
+*/
+
+size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
+		      size_t Count, my_off_t offset, myf MyFlags)
+{
+  DBUG_PRINT("info", ("maria_read with mmap %d\n", info->dfile.file));
+  if (info->s->lock_key_trees)
+    rw_rdlock(&info->s->mmap_lock);
+
+  if (info->s->mmaped_length < offset + Count)
+  {
+    /*
+      The request is not fully inside the mapped area. This happens when:
+      - We failed to remap a memory area (fragmented memory?)
+      - This thread has done some writes, but not yet extended the
+        memory mapped area.
+      Read from the file instead.
+    */
+    if (info->s->lock_key_trees)
+      rw_unlock(&info->s->mmap_lock);
+    return my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
+  }
+
+  /* Fully inside the mapped area: plain memory copy */
+  memcpy(Buffer, info->s->file_map + offset, Count);
+  if (info->s->lock_key_trees)
+    rw_unlock(&info->s->mmap_lock);
+  return 0;
+}
+
+
+        /* Plain my_pread() on the data file; used when mmap isn't used */
+
+size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
+			size_t Count, my_off_t offset, myf MyFlags)
+{
+  size_t res;
+
+  res= my_pread(info->dfile.file, Buffer, Count, offset, MyFlags);
+  return res;
+}
+
+
+/*
+  Write bytes to the MARIA data file, using mmap or pwrite
+
+  SYNOPSIS
+    _ma_mmap_pwrite()
+    info		MARIA handler
+    Buffer              Source buffer
+    Count               Number of bytes to write
+    offset              Start position in the data file
+    MyFlags             Flags passed on to my_pwrite()
+
+  NOTES
+    mmap_lock is taken for reading only when lock_key_trees is set.
+    Writes that have to go through my_pwrite() are counted in
+    nonmmaped_inserts.
+
+  RETURN
+    0  ok
+    !=0  error.  In this case return error from pwrite
+*/
+
+size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer,
+		       size_t Count, my_off_t offset, myf MyFlags)
+{
+  DBUG_PRINT("info", ("maria_write with mmap %d\n", info->dfile.file));
+  if (info->s->lock_key_trees)
+    rw_rdlock(&info->s->mmap_lock);
+
+  if (info->s->mmaped_length < offset + Count)
+  {
+    /*
+      The request is not fully inside the mapped area. This happens when:
+      - We failed to remap a memory area (fragmented memory?)
+      - This thread has done some writes, but not yet extended the
+        memory mapped area.
+      Write to the file instead.
+    */
+    info->s->nonmmaped_inserts++;
+    if (info->s->lock_key_trees)
+      rw_unlock(&info->s->mmap_lock);
+    return my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+  }
+
+  /* Fully inside the mapped area: plain memory copy */
+  memcpy(info->s->file_map + offset, Buffer, Count);
+  if (info->s->lock_key_trees)
+    rw_unlock(&info->s->mmap_lock);
+  return 0;
+}
+
+
+        /* Plain my_pwrite() on the data file; used when mmap isn't used */
+
+size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
+			 size_t Count, my_off_t offset, myf MyFlags)
+{
+  size_t res;
+
+  res= my_pwrite(info->dfile.file, Buffer, Count, offset, MyFlags);
+  return res;
+}
+
+
+/* Pack 'record' into the handler's record buffer and write it as a new row */
+
+my_bool _ma_write_dynamic_record(MARIA_HA *info, const uchar *record)
+{
+  uchar *packed= info->rec_buff + MARIA_REC_BUFF_OFFSET;
+  ulong packed_length;
+
+  packed_length= _ma_rec_pack(info, packed, record);
+  return write_dynamic_record(info, packed, packed_length);
+}
+
+/*
+  Pack 'record' into the handler's record buffer and store it over the
+  row at 'pos'. 'oldrec' is not used.
+*/
+
+my_bool _ma_update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+                                  const uchar *oldrec __attribute__ ((unused)),
+                                  const uchar *record)
+{
+  uchar *packed= info->rec_buff + MARIA_REC_BUFF_OFFSET;
+  uint packed_length;
+
+  packed_length= _ma_rec_pack(info, packed, record);
+  return update_dynamic_record(info, pos, packed, packed_length);
+}
+
+
+/*
+  Write a new row that contains blobs.
+
+  A temporary buffer big enough for the packed row plus all blob data is
+  allocated, the row is packed into it and written, and the buffer is
+  freed again.
+
+  RETURN
+    0  ok
+    1  error (my_errno is HA_ERR_OUT_OF_MEM if the buffer allocation failed)
+*/
+
+my_bool _ma_write_blob_record(MARIA_HA *info, const uchar *record)
+{
+  uchar *rec_buff, *pack_start;
+  int error;
+  ulong extra, buff_length, packed_length;
+
+  /* Room in front of and behind the packed row for block headers */
+  extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+	  MARIA_DYN_DELETE_BLOCK_HEADER+1);
+  buff_length= (info->s->base.pack_reclength +
+		_ma_calc_total_blob_length(info,record)+ extra);
+  rec_buff= (uchar*) my_alloca(buff_length);
+  if (!rec_buff)
+  {
+    my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
+    return(1);
+  }
+
+  /* The packed row starts after the space reserved for a block header */
+  pack_start= rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER);
+  packed_length= _ma_rec_pack(info, pack_start, record);
+  DBUG_PRINT("info",("reclength: %lu  reclength2: %lu",
+		     buff_length, packed_length));
+  DBUG_ASSERT(packed_length <= buff_length);
+
+  error= write_dynamic_record(info, pack_start, packed_length);
+  my_afree(rec_buff);
+  return(error != 0);
+}
+
+
+/*
+  Update the row at 'pos' with a row that contains blobs.
+
+  A temporary buffer big enough for the packed row plus all blob data is
+  allocated, the row is packed into it and stored, and the buffer is
+  freed again. 'oldrec' is not used.
+
+  RETURN
+    0  ok
+    1  error (my_errno is HA_ERR_OUT_OF_MEM if the buffer allocation failed)
+*/
+
+my_bool _ma_update_blob_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+                               const uchar *oldrec __attribute__ ((unused)),
+                               const uchar *record)
+{
+  uchar *rec_buff, *pack_start;
+  int error;
+  ulong extra, buff_length, packed_length;
+
+  /* Room in front of and behind the packed row for block headers */
+  extra= (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER)+MARIA_SPLIT_LENGTH+
+	  MARIA_DYN_DELETE_BLOCK_HEADER);
+  buff_length= (info->s->base.pack_reclength+
+		_ma_calc_total_blob_length(info,record)+ extra);
+#ifdef NOT_USED					/* We now support big rows */
+  if (buff_length > MARIA_DYN_MAX_ROW_LENGTH)
+  {
+    my_errno=HA_ERR_TO_BIG_ROW;
+    return 1;
+  }
+#endif
+  rec_buff= (uchar*) my_alloca(buff_length);
+  if (!rec_buff)
+  {
+    my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
+    return(1);
+  }
+
+  /* The packed row starts after the space reserved for a block header */
+  pack_start= rec_buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER);
+  packed_length= _ma_rec_pack(info, pack_start, record);
+
+  error=update_dynamic_record(info, pos, pack_start, packed_length);
+  my_afree(rec_buff);
+  return(error != 0);
+}
+
+
+/* Delete the current row (info->cur_row.lastpos). 'record' is not used */
+
+my_bool _ma_delete_dynamic_record(MARIA_HA *info,
+                                  const uchar *record __attribute__ ((unused)))
+{
+  my_bool res;
+
+  res= delete_dynamic_record(info, info->cur_row.lastpos, 0);
+  return res;
+}
+
+
+/**
+  Write record to data-file.
+
+  The packed record is written as a chain of one or more blocks; each
+  round of the loop asks _ma_find_writepos() for a block and lets
+  _ma_write_part_record() fill it, until nothing is left of the record.
+
+  @param info       MARIA handler
+  @param record     Packed record
+  @param reclength  Length of packed record
+
+  @return 0 ok, 1 error
+
+  @todo it's cheating: it casts "const uchar*" to uchar*.
+*/
+
+static my_bool write_dynamic_record(MARIA_HA *info, const uchar *record,
+                                    ulong reclength)
+{
+  int flag= 0;
+  ulong length;
+  my_off_t filepos;
+  DBUG_ENTER("write_dynamic_record");
+
+  /*
+    Is there room for the new record?
+    The first (cheap) test only looks at the space left at the end of the
+    file and is enough in the usual case. Only if it fails is the space
+    in deleted blocks taken into account. Even that is not exact, as
+    MARIA_MAX_DYN_BLOCK_HEADER is always used although the real header is
+    smaller in most cases.
+  */
+  if (unlikely(info->s->base.max_data_file_length -
+               info->state->data_file_length <
+               reclength + MARIA_MAX_DYN_BLOCK_HEADER) &&
+      (info->s->base.max_data_file_length - info->state->data_file_length +
+       info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER <
+       reclength + MARIA_MAX_DYN_BLOCK_HEADER))
+  {
+    my_errno=HA_ERR_RECORD_FILE_FULL;
+    DBUG_RETURN(1);
+  }
+
+  do
+  {
+    my_off_t next_block;
+
+    if (_ma_find_writepos(info,reclength,&filepos,&length))
+      DBUG_RETURN(1);
+    /* Must be read after _ma_find_writepos(), which may change dellink */
+    next_block= (info->append_insert_at_end ?
+                 HA_OFFSET_ERROR : info->s->state.dellink);
+    if (_ma_write_part_record(info,filepos,length,next_block,
+			      (uchar**) &record,&reclength,&flag))
+      DBUG_RETURN(1);
+  } while (reclength);
+
+  DBUG_RETURN(0);
+}
+
+
+	/*
+	  Get a block for data ; The given data-area must be used !!
+
+	  Either the head of the delete list is taken, or, if there is
+	  none or append_insert_at_end is set, a new block is reserved
+	  at the end of the data file.
+
+	  Returns 0 on success and -1 on error (my_errno is set).
+	*/
+
+static int _ma_find_writepos(MARIA_HA *info,
+			     ulong reclength, /* record length */
+			     my_off_t *filepos, /* Return file pos */
+			     ulong *length)   /* length of block at filepos */
+{
+  MARIA_BLOCK_INFO block_info;
+  ulong tmp;
+  DBUG_ENTER("_ma_find_writepos");
+
+  if (info->s->state.dellink == HA_OFFSET_ERROR ||
+      info->append_insert_at_end)
+  {
+    /* No deleted block to use;  Allocate a new block at end of file */
+    *filepos=info->state->data_file_length;
+
+    /* Record + short header, one more byte for long records */
+    tmp= reclength+3 + test(reclength >= (65520-3));
+    if (tmp < info->s->base.min_block_length)
+      tmp= info->s->base.min_block_length;
+    else
+      tmp= ((tmp+MARIA_DYN_ALIGN_SIZE-1) &
+	    (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)));
+
+    if (info->state->data_file_length >
+	(info->s->base.max_data_file_length - tmp))
+    {
+      my_errno=HA_ERR_RECORD_FILE_FULL;
+      DBUG_RETURN(-1);
+    }
+    if (tmp > MARIA_MAX_BLOCK_LENGTH)
+      tmp=MARIA_MAX_BLOCK_LENGTH;
+
+    *length= tmp;
+    info->state->data_file_length+= tmp;
+    info->s->state.split++;
+    info->update|=HA_STATE_WRITE_AT_END;
+    DBUG_RETURN(0);
+  }
+
+  /* Deleted blocks exists;  Take the head of the delete list */
+  *filepos=info->s->state.dellink;
+  block_info.second_read=0;
+  info->rec_cache.seek_not_done=1;
+  if (!(_ma_get_block_info(&block_info, info->dfile.file,
+                           info->s->state.dellink) &
+        BLOCK_DELETED))
+  {
+    DBUG_PRINT("error",("Delete link crashed"));
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    DBUG_RETURN(-1);
+  }
+  /* The block is not free any more: advance the list and the counters */
+  info->s->state.dellink=block_info.next_filepos;
+  info->state->del--;
+  info->state->empty-= block_info.block_len;
+  *length= block_info.block_len;
+  DBUG_RETURN(0);
+} /* _ma_find_writepos */
+
+
+
+/*
+  Unlink a deleted block from the deleted list.
+  This block will be combined with the preceding or next block to form
+  a big block.
+
+  SYNOPSIS
+    unlink_deleted_block()
+    info		MARIA handler
+    block_info		Block to unlink. filepos, prev_filepos, next_filepos
+			and block_len are read from it.
+
+  NOTES
+    Besides the links on disk, the counters del, empty and split are
+    decremented for the removed block.
+
+  RETURN
+    0  ok
+    1  error (a neighbour was not a deleted block, or a write failed)
+*/
+
+static my_bool unlink_deleted_block(MARIA_HA *info,
+                                    MARIA_BLOCK_INFO *block_info)
+{
+  DBUG_ENTER("unlink_deleted_block");
+  if (block_info->filepos == info->s->state.dellink)
+  {
+    /* First deleted block;  We can just use this ! */
+    info->s->state.dellink=block_info->next_filepos;
+  }
+  else
+  {
+    MARIA_BLOCK_INFO tmp;
+    tmp.second_read=0;
+    /* Unlink block from the previous block */
+    if (!(_ma_get_block_info(&tmp, info->dfile.file, block_info->prev_filepos)
+	  & BLOCK_DELETED))
+      DBUG_RETURN(1);				/* Something is wrong */
+    /*
+      The 8 byte link at offset 4 in the block at prev_filepos is made to
+      point at our next_filepos, i.e. past the block being removed.
+    */
+    mi_sizestore(tmp.header+4,block_info->next_filepos);
+    if (info->s->file_write(info, tmp.header+4,8,
+		  block_info->prev_filepos+4, MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    /* Unlink block from next block */
+    if (block_info->next_filepos != HA_OFFSET_ERROR)
+    {
+      if (!(_ma_get_block_info(&tmp, info->dfile.file,
+                               block_info->next_filepos)
+	    & BLOCK_DELETED))
+	DBUG_RETURN(1);				/* Something is wrong */
+      /*
+        The 8 byte link at offset 12 in the block at next_filepos is made
+        to point at our prev_filepos.
+      */
+      mi_sizestore(tmp.header+12,block_info->prev_filepos);
+      if (info->s->file_write(info, tmp.header+12,8,
+		    block_info->next_filepos+12,
+		    MYF(MY_NABP)))
+	DBUG_RETURN(1);
+    }
+  }
+  /* We now have one less deleted block */
+  info->state->del--;
+  info->state->empty-= block_info->block_len;
+  info->s->state.split--;
+
+  /*
+    If this was a block that we where accessing through table scan
+    (maria_rrnd() or maria_scan(), then ensure that we skip over this block
+    when doing next maria_rrnd() or maria_scan().
+  */
+  if (info->cur_row.nextpos == block_info->filepos)
+    info->cur_row.nextpos+= block_info->block_len;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Add a backward link to delete block
+
+  SYNOPSIS
+    update_backward_delete_link()
+    info		MARIA handler
+    delete_block	Position to delete block to update.
+			If this is 'HA_OFFSET_ERROR', nothing will be done
+    filepos		Position to block that 'delete_block' should point to
+
+  NOTES
+    The link is the 8 bytes stored at offset 12 in the deleted block.
+
+  RETURN
+    0  ok
+    1  error.  If 'delete_block' is not a deleted block, my_errno is set
+       to HA_ERR_WRONG_IN_RECORD.
+*/
+
+static my_bool update_backward_delete_link(MARIA_HA *info,
+                                           my_off_t delete_block,
+                                           MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  uchar buff[8];
+  DBUG_ENTER("update_backward_delete_link");
+
+  /* Nothing to link to */
+  if (delete_block == HA_OFFSET_ERROR)
+    DBUG_RETURN(0);
+
+  block_info.second_read=0;
+  if (!(_ma_get_block_info(&block_info, info->dfile.file, delete_block)
+        & BLOCK_DELETED))
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    DBUG_RETURN(1);				/* Wrong delete link */
+  }
+
+  mi_sizestore(buff,filepos);
+  if (info->s->file_write(info,buff, 8, delete_block+12, MYF(MY_NABP)))
+    DBUG_RETURN(1);				/* Error on write */
+  DBUG_RETURN(0);
+}
+
+/*
+  Delete datarecord from database
+
+  SYNOPSIS
+    delete_dynamic_record()
+    info		MARIA handler
+    filepos		Position of the first block of the record
+    second_read		Stored in block_info.second_read before the blocks
+			of the record are read
+
+  NOTES
+    info->rec_cache.seek_not_done is updated in cmp_record
+
+    Each block of the record is overwritten with a 20 byte deleted-block
+    header and becomes the new head of the delete list. A deleted block
+    directly after it is merged into it when the combined length stays
+    below MARIA_DYN_MAX_BLOCK_LENGTH.
+
+  RETURN
+    0  ok
+    1  error
+*/
+
+static my_bool delete_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+                                     uint second_read)
+{
+  uint length,b_type;
+  MARIA_BLOCK_INFO block_info,del_block;
+  int error;
+  my_bool remove_next_block;
+  DBUG_ENTER("delete_dynamic_record");
+
+  /* First add a link from the last block to the new one */
+  /* A failure here is remembered and returned after the loop */
+  error= update_backward_delete_link(info, info->s->state.dellink, filepos);
+
+  block_info.second_read=second_read;
+  do
+  {
+    /* Remove block at 'filepos' */
+    /* 'length' is the full size of the block, header included */
+    if ((b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+	& (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+	   BLOCK_FATAL_ERROR) ||
+	(length=(uint) (block_info.filepos-filepos) +block_info.block_len) <
+	MARIA_MIN_BLOCK_LENGTH)
+    {
+      my_errno=HA_ERR_WRONG_IN_RECORD;
+      DBUG_RETURN(1);
+    }
+    /* Check if next block is a delete block */
+    del_block.second_read=0;
+    remove_next_block=0;
+    if (_ma_get_block_info(&del_block, info->dfile.file, filepos + length) &
+	BLOCK_DELETED && del_block.block_len+length <
+        MARIA_DYN_MAX_BLOCK_LENGTH)
+    {
+      /* We can't remove this yet as this block may be the head block */
+      remove_next_block=1;
+      length+=del_block.block_len;
+    }
+
+    /*
+      Build the deleted-block header: a 0 byte, 3 bytes of block length,
+      8 bytes with the current head of the delete list and 8 bytes that
+      are either all 255 (last block of the record) or the position of
+      the next block of the record.
+    */
+    block_info.header[0]=0;
+    mi_int3store(block_info.header+1,length);
+    mi_sizestore(block_info.header+4,info->s->state.dellink);
+    if (b_type & BLOCK_LAST)
+      bfill(block_info.header+12,8,255);
+    else
+      mi_sizestore(block_info.header+12,block_info.next_filepos);
+    if (info->s->file_write(info, block_info.header, 20, filepos,
+		  MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    /* The block just written is the new head of the delete list */
+    info->s->state.dellink = filepos;
+    info->state->del++;
+    info->state->empty+=length;
+    /* Continue with the next block of the record */
+    filepos=block_info.next_filepos;
+
+    /* Now it's safe to unlink the deleted block directly after this one */
+    if (remove_next_block && unlink_deleted_block(info,&del_block))
+      error=1;
+  } while (!(b_type & BLOCK_LAST));
+
+  DBUG_RETURN(error);
+}
+
+
+	/*
+	  Write a block to datafile
+
+	  Writes as much of *record as fits into the block at 'filepos'
+	  and advances *record and *reclength past the part written.
+	  If the block is much bigger than what is needed, the unused
+	  tail is split off and put first in the delete list.
+
+	  The block header is built in front of *record, so the caller
+	  must provide writable space before *record. Bytes after the
+	  written part of the record are temporarily overwritten and
+	  restored before returning.
+
+	  Returns 0 on success and 1 on error.
+	*/
+
+int _ma_write_part_record(MARIA_HA *info,
+			  my_off_t filepos,	/* points at empty block */
+			  ulong length,		/* length of block */
+			  my_off_t next_filepos,/* Next empty block */
+			  uchar **record,	/* pointer to record ptr */
+			  ulong *reclength,	/* length of *record */
+			  int *flag)		/* *flag == 0 if header */
+{
+  ulong head_length,res_length,extra_length,long_block,del_length;
+  uchar *pos,*record_end;
+  my_off_t  next_delete_block;
+  /* Used first to build the header, then to save overwritten bytes */
+  uchar temp[MARIA_SPLIT_LENGTH+MARIA_DYN_DELETE_BLOCK_HEADER];
+  DBUG_ENTER("_ma_write_part_record");
+
+  next_delete_block=HA_OFFSET_ERROR;
+
+  res_length=extra_length=0;
+  if (length > *reclength + MARIA_SPLIT_LENGTH)
+  {						/* Split big block */
+    /* res_length is the part of the block that will be given back */
+    res_length=MY_ALIGN(length- *reclength - MARIA_EXTEND_BLOCK_LENGTH,
+			MARIA_DYN_ALIGN_SIZE);
+    length-= res_length;			/* Use this for first part */
+  }
+  /* Long blocks need one more byte (3 instead of 2) per stored length */
+  long_block= (length < 65520L && *reclength < 65520L) ? 0 : 1;
+  if (length == *reclength+ 3 + long_block)
+  {
+    /* Block is exactly of the right length */
+    temp[0]=(uchar) (1+ *flag)+(uchar) long_block;	/* Flag is 0 or 6 */
+    if (long_block)
+    {
+      mi_int3store(temp+1,*reclength);
+      head_length=4;
+    }
+    else
+    {
+      mi_int2store(temp+1,*reclength);
+      head_length=3;
+    }
+  }
+  else if (length-long_block < *reclength+4)
+  {						/* Too short block */
+    /*
+      The record continues in another block. If the caller did not say
+      where, it will be the head of the delete list or, failing that,
+      the end of the data file.
+    */
+    if (next_filepos == HA_OFFSET_ERROR)
+      next_filepos= (info->s->state.dellink != HA_OFFSET_ERROR &&
+                     !info->append_insert_at_end ?
+                     info->s->state.dellink : info->state->data_file_length);
+    if (*flag == 0)				/* First block */
+    {
+      if (*reclength > MARIA_MAX_BLOCK_LENGTH)
+      {
+	/* Header with 4 byte record length for very long records */
+	head_length= 16;
+	temp[0]=13;
+	mi_int4store(temp+1,*reclength);
+	mi_int3store(temp+5,length-head_length);
+	mi_sizestore(temp+8,next_filepos);
+      }
+      else
+      {
+	/* Record length, data length in this block, next block position */
+	head_length=5+8+long_block*2;
+	temp[0]=5+(uchar) long_block;
+	if (long_block)
+	{
+	  mi_int3store(temp+1,*reclength);
+	  mi_int3store(temp+4,length-head_length);
+	  mi_sizestore(temp+7,next_filepos);
+	}
+	else
+	{
+	  mi_int2store(temp+1,*reclength);
+	  mi_int2store(temp+3,length-head_length);
+	  mi_sizestore(temp+5,next_filepos);
+	}
+      }
+    }
+    else
+    {
+      /* Continuation block: data length in this block, next position */
+      head_length=3+8+long_block;
+      temp[0]=11+(uchar) long_block;
+      if (long_block)
+      {
+	mi_int3store(temp+1,length-head_length);
+	mi_sizestore(temp+4,next_filepos);
+      }
+      else
+      {
+	mi_int2store(temp+1,length-head_length);
+	mi_sizestore(temp+3,next_filepos);
+      }
+    }
+  }
+  else
+  {					/* Block with empty info last */
+    /* The header also holds, in one byte, the number of unused bytes */
+    head_length=4+long_block;
+    extra_length= length- *reclength-head_length;
+    temp[0]= (uchar) (3+ *flag)+(uchar) long_block; /* 3,4 or 9,10 */
+    if (long_block)
+    {
+      mi_int3store(temp+1,*reclength);
+      temp[4]= (uchar) (extra_length);
+    }
+    else
+    {
+      mi_int2store(temp+1,*reclength);
+      temp[3]= (uchar) (extra_length);
+    }
+    length= *reclength+head_length;	/* Write only what is needed */
+  }
+  DBUG_DUMP("header", temp, head_length);
+
+	/* Make a long block for one write */
+  record_end= *record+length-head_length;
+  del_length=(res_length ? MARIA_DYN_DELETE_BLOCK_HEADER : 0);
+  /* Put the header directly in front of the record data */
+  bmove((*record-head_length), temp, head_length);
+  /*
+    Save the bytes after the part to write; they are overwritten below by
+    the zero fill and the delete header and are restored after the write.
+  */
+  memcpy(temp,record_end,(size_t) (extra_length+del_length));
+  bzero(record_end, extra_length);
+
+  if (res_length)
+  {
+    /* Check first if we can join this block with the next one */
+    MARIA_BLOCK_INFO del_block;
+    my_off_t next_block=filepos+length+extra_length+res_length;
+
+    del_block.second_read=0;
+    if (next_block < info->state->data_file_length &&
+	info->s->state.dellink != HA_OFFSET_ERROR)
+    {
+      if ((_ma_get_block_info(&del_block, info->dfile.file, next_block)
+	   & BLOCK_DELETED) &&
+	  res_length + del_block.block_len < MARIA_DYN_MAX_BLOCK_LENGTH)
+      {
+	if (unlink_deleted_block(info,&del_block))
+	  goto err;
+	res_length+=del_block.block_len;
+      }
+    }
+
+    /* Create a delete link of the last part of the block */
+    pos=record_end+extra_length;
+    pos[0]= '\0';
+    mi_int3store(pos+1,res_length);
+    mi_sizestore(pos+4,info->s->state.dellink);
+    bfill(pos+12,8,255);			/* End link */
+    /* Remember the old head; its backward link is updated after the write */
+    next_delete_block=info->s->state.dellink;
+    info->s->state.dellink= filepos+length+extra_length;
+    info->state->del++;
+    info->state->empty+=res_length;
+    info->s->state.split++;
+  }
+  /* Header, data, fill and delete header go out in one write */
+  if (info->opt_flag & WRITE_CACHE_USED &&
+      info->update & HA_STATE_WRITE_AT_END)
+  {
+    if (info->update & HA_STATE_EXTEND_BLOCK)
+    {
+      info->update&= ~HA_STATE_EXTEND_BLOCK;
+      if (my_block_write(&info->rec_cache, *record-head_length,
+			 length+extra_length+del_length,filepos))
+      goto err;
+    }
+    else if (my_b_write(&info->rec_cache, *record-head_length,
+			length+extra_length+del_length))
+      goto err;
+  }
+  else
+  {
+    info->rec_cache.seek_not_done=1;
+    if (info->s->file_write(info, *record-head_length,
+                            length+extra_length+
+                            del_length,filepos,info->s->write_flag))
+      goto err;
+  }
+  /* Restore the record bytes that were saved in temp */
+  memcpy(record_end,temp,(size_t) (extra_length+del_length));
+  /* Advance to the part of the record that is still unwritten */
+  *record=record_end;
+  *reclength-=(length-head_length);
+  /* Any further block is a continuation block */
+  *flag=6;
+
+  if (del_length)
+  {
+    /* link the next delete block to this */
+    if (update_backward_delete_link(info, next_delete_block,
+				    info->s->state.dellink))
+      goto err;
+  }
+
+  DBUG_RETURN(0);
+err:
+  DBUG_PRINT("exit",("errno: %d",my_errno));
+  DBUG_RETURN(1);
+} /* _ma_write_part_record */
+
+
+	/*
+	  update record from datafile
+
+	  The new packed record is written over the blocks of the old
+	  record, starting at 'filepos'. A block that is too small is
+	  first extended if it is last in the file or is followed by a
+	  deleted block. When the old blocks are used up, the rest is
+	  written to blocks from _ma_find_writepos(). Old blocks that
+	  are left over are deleted.
+
+	  Returns 0 on success and 1 on error.
+	*/
+
+static my_bool update_dynamic_record(MARIA_HA *info, MARIA_RECORD_POS filepos,
+                                     uchar *record, ulong reclength)
+{
+  int flag;
+  uint error;
+  ulong length;
+  MARIA_BLOCK_INFO block_info;
+  DBUG_ENTER("update_dynamic_record");
+
+  flag=block_info.second_read=0;
+  /*
+    next_filepos is read after the loop and is passed to
+    _ma_write_part_record(); give it a defined value for the paths
+    that do not go through _ma_get_block_info() first.
+  */
+  block_info.next_filepos= HA_OFFSET_ERROR;
+  /*
+    Check if we have enough room for the record.
+    First we do simplified check to make usual case faster.
+    Then we do more precise check for the space left.
+    Though it still is not absolutely precise, as
+    we always use MARIA_MAX_DYN_BLOCK_HEADER while it can be
+    less in the most of the cases.
+  */
+
+  /*
+    compare with just the reclength as we're going
+    to get some space from the old replaced record
+  */
+  if (unlikely(info->s->base.max_data_file_length -
+        info->state->data_file_length < reclength))
+  {
+    /* If new record isn't longer, we can go on safely */
+    if (info->cur_row.total_length < reclength)
+    {
+      if (info->s->base.max_data_file_length - info->state->data_file_length +
+          info->state->empty - info->state->del * MARIA_MAX_DYN_BLOCK_HEADER <
+          reclength - info->cur_row.total_length + MARIA_MAX_DYN_BLOCK_HEADER)
+      {
+        my_errno=HA_ERR_RECORD_FILE_FULL;
+        goto err;
+      }
+    }
+  }
+  /* Remember length for updated row if it's updated again */
+  info->cur_row.total_length= reclength;
+
+  while (reclength > 0)
+  {
+    if (filepos != info->s->state.dellink)
+    {
+      /* Reuse the old block at 'filepos' */
+      block_info.next_filepos= HA_OFFSET_ERROR;
+      if ((error= _ma_get_block_info(&block_info, info->dfile.file, filepos))
+	  & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+	     BLOCK_FATAL_ERROR))
+      {
+	DBUG_PRINT("error",("Got wrong block info"));
+	if (!(error & BLOCK_FATAL_ERROR))
+	  my_errno=HA_ERR_WRONG_IN_RECORD;
+	goto err;
+      }
+      /* Full size of the old block, header included */
+      length=(ulong) (block_info.filepos-filepos) + block_info.block_len;
+      if (length < reclength)
+      {
+	uint tmp=MY_ALIGN(reclength - length + 3 +
+			  test(reclength >= 65520L),MARIA_DYN_ALIGN_SIZE);
+	/* Don't create a block bigger than MARIA_MAX_BLOCK_LENGTH */
+	tmp= min(length+tmp, MARIA_MAX_BLOCK_LENGTH)-length;
+	/* Check if we can extend this block */
+	if (block_info.filepos + block_info.block_len ==
+	    info->state->data_file_length &&
+	    info->state->data_file_length <
+	    info->s->base.max_data_file_length-tmp)
+	{
+	  /* extend file */
+	  DBUG_PRINT("info",("Extending file with %u bytes",tmp));
+	  if (info->cur_row.nextpos == info->state->data_file_length)
+	    info->cur_row.nextpos+= tmp;
+	  info->state->data_file_length+= tmp;
+	  info->update|= HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK;
+	  length+=tmp;
+	}
+	else if (length < MARIA_MAX_BLOCK_LENGTH - MARIA_MIN_BLOCK_LENGTH)
+	{
+	  /*
+	    Check if next block is a deleted block
+	    Above we have MARIA_MIN_BLOCK_LENGTH to avoid the problem where
+	    the next block is so small it can't be split, which could
+	    cause problems
+	  */
+
+	  MARIA_BLOCK_INFO del_block;
+	  del_block.second_read=0;
+	  if (_ma_get_block_info(&del_block, info->dfile.file,
+				 block_info.filepos + block_info.block_len) &
+	      BLOCK_DELETED)
+	  {
+	    /* Use; Unlink it and extend the current block */
+	    DBUG_PRINT("info",("Extending current block"));
+	    if (unlink_deleted_block(info,&del_block))
+	      goto err;
+	    if ((length+=del_block.block_len) > MARIA_MAX_BLOCK_LENGTH)
+	    {
+	      /*
+		New block was too big, link overflow part back to
+		delete list
+	      */
+	      my_off_t next_pos;
+	      ulong rest_length= length-MARIA_MAX_BLOCK_LENGTH;
+	      set_if_bigger(rest_length, MARIA_MIN_BLOCK_LENGTH);
+	      next_pos= del_block.filepos+ del_block.block_len - rest_length;
+
+	      if (update_backward_delete_link(info, info->s->state.dellink,
+					      next_pos))
+		DBUG_RETURN(1);
+
+	      /* create delete link for data that didn't fit into the page */
+	      del_block.header[0]=0;
+	      mi_int3store(del_block.header+1, rest_length);
+	      mi_sizestore(del_block.header+4,info->s->state.dellink);
+	      bfill(del_block.header+12,8,255);
+	      if (info->s->file_write(info, del_block.header, 20,
+                                      next_pos, MYF(MY_NABP)))
+		DBUG_RETURN(1);
+	      info->s->state.dellink= next_pos;
+	      info->s->state.split++;
+	      info->state->del++;
+	      info->state->empty+= rest_length;
+	      length-= rest_length;
+	    }
+	  }
+	}
+      }
+    }
+    else
+    {
+      /* No old block left to reuse; get a block for the rest */
+      if (_ma_find_writepos(info,reclength,&filepos,&length))
+	goto err;
+    }
+    if (_ma_write_part_record(info,filepos,length,block_info.next_filepos,
+			      &record,&reclength,&flag))
+      goto err;
+    if ((filepos=block_info.next_filepos) == HA_OFFSET_ERROR)
+    {
+      /* Start writing data on deleted blocks */
+      filepos=info->s->state.dellink;
+    }
+  }
+
+  /* The new record needed fewer blocks: delete the rest of the old ones */
+  if (block_info.next_filepos != HA_OFFSET_ERROR)
+    if (delete_dynamic_record(info,block_info.next_filepos,1))
+      goto err;
+
+  DBUG_RETURN(0);
+err:
+  DBUG_RETURN(1);
+}
+
+
+	/*
+	  Pack a record. Return new reclength
+
+	  Layout written to 'to':
+	    - base.pack_bytes bytes of flag bits, one bit for each
+	      blob, skip-zero and skip-space column
+	    - base.null_bytes bytes copied unchanged from 'from'
+	    - the columns, in column order
+	    - one checksum byte, if the table calculates checksums
+
+	  Blob lengths are taken from info->blobs[], which must have
+	  been filled in by the caller before this function is called.
+	*/
+
+uint _ma_rec_pack(MARIA_HA *info, register uchar *to,
+                  register const uchar *from)
+{
+  uint		length,new_length,flag,bit,i;
+  const uchar   *pos,*end;
+  uchar         *startpos,*packpos;
+  enum en_fieldtype type;
+  reg3 MARIA_COLUMNDEF *column;
+  MARIA_BLOB	*blob;
+  DBUG_ENTER("_ma_rec_pack");
+
+  flag= 0;
+  bit= 1;
+  /* packpos walks over the flag bytes, 'to' over the data after them */
+  startpos= packpos=to;
+  to+= info->s->base.pack_bytes;
+  blob= info->blobs;
+  column= info->s->columndef;
+  if (info->s->base.null_bytes)
+  {
+    memcpy(to, from, info->s->base.null_bytes);
+    from+= info->s->base.null_bytes;
+    to+=   info->s->base.null_bytes;
+  }
+
+  for (i=info->s->base.fields ; i-- > 0; from+= length, column++)
+  {
+    length=(uint) column->length;
+    if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+    {
+      if (type == FIELD_BLOB)
+      {
+	/* Empty blob: only the flag bit is stored */
+	if (!blob->length)
+	  flag|=bit;
+	else
+	{
+	  /* Length bytes first, then the data the blob pointer points at */
+	  char *temp_pos;
+	  size_t tmp_length=length-portable_sizeof_char_ptr;
+	  memcpy(to,from,tmp_length);
+	  memcpy_fixed(&temp_pos,from+tmp_length,sizeof(char*));
+	  memcpy(to+tmp_length,temp_pos,(size_t) blob->length);
+	  to+=tmp_length+blob->length;
+	}
+	blob++;
+      }
+      else if (type == FIELD_SKIP_ZERO)
+      {
+	/* All-zero column: only the flag bit is stored */
+	if (memcmp(from, maria_zero_string, length) == 0)
+	  flag|=bit;
+	else
+	{
+          memcpy(to, from, (size_t) length);
+          to+=length;
+	}
+      }
+      else if (type == FIELD_SKIP_ENDSPACE ||
+	       type == FIELD_SKIP_PRESPACE)
+      {
+	pos= from; end= from + length;
+	if (type == FIELD_SKIP_ENDSPACE)
+	{					/* Pack trailing spaces */
+	  while (end > from && *(end-1) == ' ')
+	    end--;
+	}
+	else
+	{					/* Pack pref-spaces */
+	  while (pos < end && *pos == ' ')
+	    pos++;
+	}
+	new_length=(uint) (end-pos);
+	/* Only pack if length byte(s) + data is shorter than the column */
+	if (new_length +1 + test(column->length > 255 && new_length > 127)
+	    < length)
+	{
+	  if (column->length > 255 && new_length > 127)
+	  {
+	    /* Two length bytes: low 7 bits with the high bit set, then rest */
+            to[0]= (uchar) ((new_length & 127) + 128);
+            to[1]= (uchar) (new_length >> 7);
+	    to+=2;
+	  }
+	  else
+	    *to++= (uchar) new_length;
+	  memcpy(to, pos, (size_t) new_length); to+=new_length;
+	  flag|=bit;
+	}
+	else
+	{
+	  memcpy(to,from,(size_t) length); to+=length;
+	}
+      }
+      else if (type == FIELD_VARCHAR)
+      {
+	/* Length prefix followed by only the used part of the data */
+        uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+	uint tmp_length;
+        if (pack_length == 1)
+        {
+          tmp_length= (uint) *from;
+          *to++= *from;
+        }
+        else
+        {
+          tmp_length= uint2korr(from);
+          store_key_length_inc(to,tmp_length);
+        }
+        memcpy(to, from+pack_length,tmp_length);
+        to+= tmp_length;
+        continue;				/* Uses no flag bit */
+      }
+      else
+      {
+	memcpy(to,from,(size_t) length); to+=length;
+	continue;				/* Normal field */
+      }
+      /* Next flag bit; a full flag byte is stored and a new one started */
+      if ((bit= bit << 1) >= 256)
+      {
+	*packpos++ = (uchar) flag;
+	bit=1; flag=0;
+      }
+    }
+    else
+    {
+      memcpy(to,from,(size_t) length); to+=length;
+    }
+  }
+  /* Store the last, partly used, flag byte */
+  if (bit != 1)
+    *packpos= (uchar) flag;
+  if (info->s->calc_checksum)
+    *to++= (uchar) info->cur_row.checksum;
+  DBUG_PRINT("exit",("packed length: %d",(int) (to-startpos)));
+  DBUG_RETURN((uint) (to-startpos));
+} /* _ma_rec_pack */
+
+
+
+/*
+  Check if a record was correctly packed. Used only by maria_chk.
+  Walks 'record' and rec_buff column by column; returns 0 if ok, else 1.
+*/
+
+my_bool _ma_rec_check(MARIA_HA *info,const uchar *record, uchar *rec_buff,
+                      ulong packed_length, my_bool with_checksum,
+                      ha_checksum checksum)
+{
+  uint		length,new_length,flag,bit,i;
+  const uchar   *pos,*end;
+  uchar         *packpos,*to;
+  enum en_fieldtype type;
+  reg3 MARIA_COLUMNDEF *column;
+  DBUG_ENTER("_ma_rec_check");
+
+  packpos=rec_buff; to= rec_buff+info->s->base.pack_bytes; /* flags, data */
+  column= info->s->columndef;
+  flag= *packpos; bit=1;			/* First flag byte, first bit */
+  record+= info->s->base.null_bytes;	/* Skip null bytes in both images */
+  to+= info->s->base.null_bytes;
+
+  for (i=info->s->base.fields ; i-- > 0; record+= length, column++)
+  {
+    length=(uint) column->length;
+    if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL)
+    {
+      if (type == FIELD_BLOB)
+      {
+	uint blob_length=
+	  _ma_calc_blob_length(length-portable_sizeof_char_ptr,record);
+	if (!blob_length && !(flag & bit))	/* Empty blob must be flagged */
+	  goto err;
+	if (blob_length)			/* Length bytes + blob data */
+	  to+=length - portable_sizeof_char_ptr+ blob_length;
+      }
+      else if (type == FIELD_SKIP_ZERO)
+      {
+	if (memcmp(record, maria_zero_string, length) == 0)
+	{
+	  if (!(flag & bit))			/* Zero field must be flagged */
+	    goto err;
+	}
+	else
+	  to+=length;
+      }
+      else if (type == FIELD_SKIP_ENDSPACE ||
+	       type == FIELD_SKIP_PRESPACE)
+      {
+	pos= record; end= record + length;
+	if (type == FIELD_SKIP_ENDSPACE)
+	{					/* Pack trailing spaces */
+	  while (end > record && *(end-1) == ' ')
+	    end--;
+	}
+	else
+	{					/* Pack pre-spaces */
+	  while (pos < end && *pos == ' ')
+	    pos++;
+	}
+	new_length=(uint) (end-pos);
+	if (new_length +1 + test(column->length > 255 && new_length > 127)
+	    < length)				/* Packed form is shorter */
+	{
+	  if (!(flag & bit))
+	    goto err;
+	  if (column->length > 255 && new_length > 127)
+	  {
+            /* purecov: begin inspected */
+            if (to[0] != (uchar) ((new_length & 127) + 128) ||
+                to[1] != (uchar) (new_length >> 7))
+	      goto err;				/* Bad 2 byte length */
+	    to+=2;
+            /* purecov: end */
+	  }
+	  else if (*to++ != (uchar) new_length)	/* Bad 1 byte length */
+	    goto err;
+	  to+=new_length;
+	}
+	else
+	  to+=length;				/* Stored unpacked */
+      }
+      else if (type == FIELD_VARCHAR)
+      {
+        uint pack_length= HA_VARCHAR_PACKLENGTH(column->length -1);
+	uint tmp_length;
+        if (pack_length == 1)
+        {
+          tmp_length= (uint) *record;
+          to+= 1+ tmp_length;
+          continue;				/* VARCHAR uses no flag bit */
+        }
+        else
+        {
+          tmp_length= uint2korr(record);
+          to+= get_pack_length(tmp_length)+tmp_length;
+        }
+        continue;				/* VARCHAR uses no flag bit */
+      }
+      else
+      {
+	to+=length;
+	continue;				/* Normal field */
+      }
+      if ((bit= bit << 1) >= 256)		/* Flag byte used up */
+      {
+	flag= *++packpos;
+	bit=1;
+      }
+    }
+    else
+      to+= length;
+  }
+  if (packed_length != (uint) (to - rec_buff) +	/* Length must match and */
+      test(info->s->calc_checksum) || (bit != 1 && (flag & ~(bit - 1))))
+    goto err;				/* no unused flag bit may be set */
+  if (with_checksum && ((uchar) checksum != (uchar) *to))
+  {
+    DBUG_PRINT("error",("wrong checksum for row"));
+    goto err;
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Unpacks a record
+  @param info          Open table; column definitions come from info->s
+  @param to            Output row buffer (base.reclength bytes are filled)
+  @param from          Packed row, found_length bytes long
+  @return found_length if ok; MY_FILE_ERROR (== -1) on error, in which
+          case my_errno is set to HA_ERR_WRONG_IN_RECORD
+*/
+
+ulong _ma_rec_unpack(register MARIA_HA *info, register uchar *to, uchar *from,
+		     ulong found_length)
+{
+  uint flag,bit,length,min_pack_length, column_length;
+  enum en_fieldtype type;
+  uchar *from_end,*to_end,*packpos;
+  reg3 MARIA_COLUMNDEF *column, *end_column;
+  DBUG_ENTER("_ma_rec_unpack");
+
+  to_end=to + info->s->base.reclength;
+  from_end=from+found_length;
+  flag= (uchar) *from; bit=1; packpos=from;	/* First flag byte */
+  if (found_length < info->s->base.min_pack_length)
+    goto err;					/* Too short to be a row */
+  from+= info->s->base.pack_bytes;		/* Data follows flag bytes */
+  min_pack_length= info->s->base.min_pack_length - info->s->base.pack_bytes;
+
+  if ((length= info->s->base.null_bytes))	/* Null bytes copied as is */
+  {
+    memcpy(to, from, length);
+    from+= length;
+    to+= length;
+    min_pack_length-= length;
+  }
+
+  for (column= info->s->columndef, end_column= column + info->s->base.fields;
+       column < end_column ; to+= column_length, column++)
+  {
+    column_length= column->length;
+    if ((type = (enum en_fieldtype) column->type) != FIELD_NORMAL &&
+	(type != FIELD_CHECK))
+    {
+      if (type == FIELD_VARCHAR)
+      {
+        uint pack_length= HA_VARCHAR_PACKLENGTH(column_length-1);
+        if (pack_length == 1)
+        {
+          length= (uint) *(uchar*) from; /* Read before from_end is checked */
+          if (length > column_length-1)
+            goto err;
+          *to= *from++;
+        }
+        else
+        {
+          get_key_length(length, from);	/* presumably advances 'from' */
+          if (length > column_length-2)
+            goto err;
+          int2store(to,length);
+        }
+        if (from+length > from_end)
+          goto err;
+        memcpy(to+pack_length, from, length);
+        from+= length;
+        min_pack_length--;
+        continue;				/* VARCHAR uses no flag bit */
+      }
+      if (flag & bit)				/* Field is empty or packed */
+      {
+	if (type == FIELD_BLOB || type == FIELD_SKIP_ZERO)
+	  bzero(to, column_length);		/* Nothing was stored */
+	else if (type == FIELD_SKIP_ENDSPACE ||
+		 type == FIELD_SKIP_PRESPACE)
+	{
+	  if (column->length > 255 && *from & 128) /* 2 byte length */
+	  {
+	    if (from + 1 >= from_end)
+	      goto err;
+	    length= (*from & 127)+ ((uint) (uchar) *(from+1) << 7); from+=2;
+	  }
+	  else
+	  {
+	    if (from == from_end)
+	      goto err;
+	    length= (uchar) *from++;
+	  }
+	  min_pack_length--;
+	  if (length >= column_length ||
+	      min_pack_length + length > (uint) (from_end - from))
+	    goto err;
+	  if (type == FIELD_SKIP_ENDSPACE)
+	  {					/* Data, then space fill */
+	    memcpy(to, from, (size_t) length);
+	    bfill(to+length, column_length-length, ' ');
+	  }
+	  else
+	  {					/* Space fill, then data */
+	    bfill(to, column_length-length, ' ');
+	    memcpy(to+column_length-length, from, (size_t) length);
+	  }
+	  from+=length;
+	}
+      }
+      else if (type == FIELD_BLOB)
+      {
+	uint size_length=column_length- portable_sizeof_char_ptr;
+	ulong blob_length= _ma_calc_blob_length(size_length,from);
+        ulong from_left= (ulong) (from_end - from);
+        if (from_left < size_length ||
+            from_left - size_length < blob_length ||
+            from_left - size_length - blob_length < min_pack_length)
+	  goto err;
+	memcpy(to, from, (size_t) size_length);	/* Blob length bytes */
+	from+=size_length;
+	memcpy_fixed(to+size_length,(uchar*) &from,sizeof(char*)); /* ptr */
+	from+=blob_length;		/* Blob data stays in 'from' buffer */
+      }
+      else
+      {					/* Bit not set: stored unpacked */
+	if (type == FIELD_SKIP_ENDSPACE || type == FIELD_SKIP_PRESPACE)
+	  min_pack_length--;
+	if (min_pack_length + column_length > (uint) (from_end - from))
+	  goto err;
+	memcpy(to, from, (size_t) column_length); from+=column_length;
+      }
+      if ((bit= bit << 1) >= 256)		/* Flag byte used up */
+      {
+	flag= (uchar) *++packpos; bit=1;
+      }
+    }
+    else
+    {						/* Fixed length field */
+      if (min_pack_length > (uint) (from_end - from))
+	goto err;
+      min_pack_length-=column_length;
+      memcpy(to, from, (size_t) column_length);
+      from+=column_length;
+    }
+  }
+  if (info->s->calc_checksum)		/* Read without a from_end check */
+    info->cur_row.checksum= (uint) (uchar) *from++;
+  if (to == to_end && from == from_end && (bit == 1 || !(flag & ~(bit-1))))
+    DBUG_RETURN(found_length);	/* Both buffers used up, no stray bits */
+
+err:
+  my_errno= HA_ERR_WRONG_IN_RECORD;
+  DBUG_PRINT("error",("to_end: 0x%lx -> 0x%lx  from_end: 0x%lx -> 0x%lx",
+		      (long) to, (long) to_end, (long) from, (long) from_end));
+  DBUG_DUMP("from", info->rec_buff, info->s->base.min_pack_length);
+  DBUG_RETURN(MY_FILE_ERROR);
+} /* _ma_rec_unpack */
+
+
+	/* Return the total length of all blobs in 'record'. As a side effect */
+	/* the length of each blob is stored in its info->blobs[] entry. */
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record)
+{
+  ulong length;
+  MARIA_BLOB *blob,*end;
+
+  for (length=0, blob= info->blobs, end=blob+info->s->base.blobs ;
+       blob != end;
+       blob++)
+  {
+    blob->length= _ma_calc_blob_length(blob->pack_length,
+                                       record + blob->offset);
+    length+=blob->length;
+  }
+  return length;
+}
+
+	/* Read a blob length stored in 'length' (1-4) bytes at pos */
+ulong _ma_calc_blob_length(uint length, const uchar *pos)
+{
+  switch (length) {
+  case 1:
+    return (uint) (uchar) *pos;
+  case 2:
+    return (uint) uint2korr(pos);
+  case 3:
+    return uint3korr(pos);
+  case 4:
+    return uint4korr(pos);
+  default:
+    break;
+  }
+  return 0; /* Impossible; any other 'length' gives 0 */
+}
+
+	/* Store 'length' at pos in pack_length (1-4) bytes; else do nothing */
+void _ma_store_blob_length(uchar *pos,uint pack_length,uint length)
+{
+  switch (pack_length) {
+  case 1:
+    *pos= (uchar) length;
+    break;
+  case 2:
+    int2store(pos,length);
+    break;
+  case 3:
+    int3store(pos,length);
+    break;
+  case 4:
+    int4store(pos,length);			/* Falls through to default */
+  default:
+    break;
+  }
+  return;
+}
+
+
+/*
+  Read record from datafile.
+
+  SYNOPSIS
+    _ma_read_dynamic_record()
+      info                      MARIA_HA pointer to table.
+      buf                       Destination for record.
+      filepos                   From where to read the record.
+
+  NOTE
+    If a write buffer is active, it needs to be flushed if its contents
+    intersect with the record to read. We always check if the position
+    of the first uchar of the write buffer is lower than the position
+    past the last uchar to read. In theory this is also true if the write
+    buffer is completely below the read segment. That is, if there is no
+    intersection. But this case is unusual. We flush anyway. Only if the
+    first uchar in the write buffer is above the last uchar to read, we do
+    not flush.
+
+    A dynamic record may need several reads. So this check must be done
+    before every read. Reading a dynamic record starts with reading the
+    block header. If the record does not fit into the free space of the
+    header, the block may be longer than the header. In this case a
+    second read is necessary. These one or two reads repeat for every
+    part of the record.
+
+  RETURN
+    0          OK
+    #          Error number (value of my_errno)
+*/
+
+int _ma_read_dynamic_record(MARIA_HA *info, uchar *buf,
+                            MARIA_RECORD_POS filepos)
+{
+  int block_of_record;
+  uint b_type;
+  MARIA_BLOCK_INFO block_info;
+  File file;
+  uchar *to;
+  uint left_length;
+  DBUG_ENTER("_ma_read_dynamic_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    goto err;				/* NOTE: my_errno is not set here */
+
+  LINT_INIT(to);
+  LINT_INIT(left_length);
+  file= info->dfile.file;
+  block_of_record= 0;   /* First block of record is numbered as zero. */
+  block_info.second_read= 0;
+  do
+  {
+    /* A corrupted table can have wrong pointers. (Bug# 19835) */
+    if (filepos == HA_OFFSET_ERROR)
+      goto panic;
+    if (info->opt_flag & WRITE_CACHE_USED &&	/* Flush before header read */
+        (info->rec_cache.pos_in_file < filepos +
+         MARIA_BLOCK_INFO_HEADER_LENGTH) &&
+        flush_io_cache(&info->rec_cache))
+      goto err;
+    info->rec_cache.seek_not_done=1;
+    if ((b_type= _ma_get_block_info(&block_info, file, filepos)) &
+        (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+         BLOCK_FATAL_ERROR))
+    {
+      if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+        my_errno=HA_ERR_RECORD_DELETED;
+      goto err;
+    }
+    if (block_of_record++ == 0)			/* First block */
+    {
+      info->cur_row.total_length= block_info.rec_len;
+      if (block_info.rec_len > (uint) info->s->base.max_pack_length)
+        goto panic;
+      if (info->s->base.blobs)		/* Buffer is resized only with blobs */
+      {
+        if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+                             block_info.rec_len +
+                             info->s->base.extra_rec_buff_size))
+          goto err;
+      }
+      to= info->rec_buff;
+      left_length=block_info.rec_len;
+    }
+    if (left_length < block_info.data_len || ! block_info.data_len)
+      goto panic;			/* Wrong linked record */
+    /* copy information that is already read */
+    {
+      uint offset= (uint) (block_info.filepos - filepos); /* Header size */
+      uint prefetch_len= (sizeof(block_info.header) - offset);
+      filepos+= sizeof(block_info.header);	/* End of what was read */
+
+      if (prefetch_len > block_info.data_len)
+        prefetch_len= block_info.data_len;
+      if (prefetch_len)
+      {
+        memcpy(to, block_info.header + offset, prefetch_len);
+        block_info.data_len-= prefetch_len;
+        left_length-= prefetch_len;
+        to+= prefetch_len;
+      }
+    }
+    /* read rest of record from file */
+    if (block_info.data_len)
+    {
+      if (info->opt_flag & WRITE_CACHE_USED &&
+          info->rec_cache.pos_in_file < filepos + block_info.data_len &&
+          flush_io_cache(&info->rec_cache))
+        goto err;
+      /*
+        What a pity that this method is not called 'file_pread' and that
+        there is no equivalent without seeking. We are at the right
+        position already. :(
+      */
+      if (info->s->file_read(info, to, block_info.data_len,
+                             filepos, MYF(MY_NABP)))
+        goto panic;
+      left_length-=block_info.data_len;
+      to+=block_info.data_len;
+    }
+    filepos= block_info.next_filepos;		/* Next part of the record */
+  } while (left_length);
+
+  info->update|= HA_STATE_AKTIV;	/* We have an active record */
+  fast_ma_writeinfo(info);
+  DBUG_RETURN(_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+              MY_FILE_ERROR ? 0 : my_errno);
+
+err:
+  fast_ma_writeinfo(info);
+  DBUG_RETURN(my_errno);
+
+panic:
+  my_errno=HA_ERR_WRONG_IN_RECORD;
+  goto err;
+}
+
+	/* Compare unique constraint 'def' between 'record' and the row */
+	/* stored at 'pos'. Returns 1 on read error or non-zero compare. */
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                               const uchar *record, MARIA_RECORD_POS pos)
+{
+  uchar *old_rec_buff,*old_record;
+  size_t old_rec_buff_size;
+  my_bool error;
+  DBUG_ENTER("_ma_cmp_dynamic_unique");
+
+  if (!(old_record=my_alloca(info->s->base.reclength)))
+    DBUG_RETURN(1);
+
+  /* Don't let the compare destroy blobs that may be in use */
+  old_rec_buff=      info->rec_buff;
+  old_rec_buff_size= info->rec_buff_size;
+
+  if (info->s->base.blobs)
+  {
+    info->rec_buff= 0;			/* Force the read to allocate anew */
+    info->rec_buff_size= 0;
+  }
+  error= _ma_read_dynamic_record(info, old_record, pos) != 0;
+  if (!error)
+    error=_ma_unique_comp(def, record, old_record, def->null_are_equal) != 0;
+  if (info->s->base.blobs)
+  {
+    my_free(info->rec_buff, MYF(MY_ALLOW_ZERO_PTR)); /* Temporary buffer */
+    info->rec_buff=      old_rec_buff;
+    info->rec_buff_size= old_rec_buff_size;
+  }
+  my_afree(old_record);
+  DBUG_RETURN(error);
+}
+
+
+	/* Compare record on disk (at info->cur_row.lastpos) with 'record' */
+	/* after packing it in memory. Returns 0 if equal, 1 otherwise. */
+my_bool _ma_cmp_dynamic_record(register MARIA_HA *info,
+                               register const uchar *record)
+{
+  uint flag, reclength, b_type,cmp_length;
+  my_off_t filepos;
+  uchar *buffer;
+  MARIA_BLOCK_INFO block_info;
+  my_bool error= 1;
+  DBUG_ENTER("_ma_cmp_dynamic_record");
+
+	/* We are going to do changes; don't let anybody disturb */
+  dont_break();				/* Don't allow SIGHUP or SIGINT */
+
+  if (info->opt_flag & WRITE_CACHE_USED)
+  {
+    info->update&= ~(HA_STATE_WRITE_AT_END | HA_STATE_EXTEND_BLOCK);
+    if (flush_io_cache(&info->rec_cache))
+      DBUG_RETURN(1);
+  }
+  info->rec_cache.seek_not_done=1;
+
+	/* If nobody has touched the database we don't have to test rec */
+
+  buffer=info->rec_buff;
+  if ((info->opt_flag & READ_CHECK_USED))
+  {						/* If check isn't disabled  */
+    if (info->s->base.blobs)
+    {					/* Need room for the blob data too */
+      if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+
+				     _ma_calc_total_blob_length(info,record))))
+	DBUG_RETURN(1);
+    }
+    reclength= _ma_rec_pack(info,buffer,record);
+    record= buffer;			/* Compare against the packed image */
+
+    filepos= info->cur_row.lastpos;
+    flag=block_info.second_read=0;
+    block_info.next_filepos=filepos;
+    while (reclength > 0)
+    {
+      if ((b_type= _ma_get_block_info(&block_info, info->dfile.file,
+				    block_info.next_filepos))
+	  & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+	     BLOCK_FATAL_ERROR))
+      {
+	if (b_type & (BLOCK_SYNC_ERROR | BLOCK_DELETED))
+	  my_errno=HA_ERR_RECORD_CHANGED;
+	goto err;
+      }
+      if (flag == 0)				/* First block */
+      {
+	flag=1;
+	if (reclength != block_info.rec_len)
+	{
+	  my_errno=HA_ERR_RECORD_CHANGED;
+	  goto err;
+	}
+      } else if (reclength < block_info.data_len)
+      {					/* Block longer than what is left */
+	my_errno=HA_ERR_WRONG_IN_RECORD;
+	goto err;
+      }
+      reclength-= block_info.data_len;
+      cmp_length= block_info.data_len;
+      if (!reclength && info->s->calc_checksum)
+        cmp_length--;        /* 'record' may not contain checksum */
+
+      if (_ma_cmp_buffer(info->dfile.file, record, block_info.filepos,
+			 cmp_length))
+      {
+	my_errno=HA_ERR_RECORD_CHANGED;
+	goto err;
+      }
+      flag=1;
+      record+=block_info.data_len;
+    }
+  }
+  my_errno=0;
+  error= 0;
+err:
+  if (buffer != info->rec_buff)		/* Free only our own my_alloca() */
+    my_afree(buffer);
+  DBUG_PRINT("exit", ("result: %d", error));
+  DBUG_RETURN(error);
+}
+
+
+	/* Compare 'length' bytes of file at filepos with buff. */
+	/* Returns 0 if equal, 1 if different or on read error. */
+static my_bool _ma_cmp_buffer(File file, const uchar *buff, my_off_t filepos,
+                              uint length)
+{
+  uint next_length;
+  uchar temp_buff[IO_SIZE*2];
+  DBUG_ENTER("_ma_cmp_buffer");
+
+  /* First chunk is shortened by the offset of filepos within IO_SIZE */
+  next_length= IO_SIZE*2 - (uint) (filepos & (IO_SIZE-1));
+
+  while (length > IO_SIZE*2)
+  {
+    if (my_pread(file,temp_buff,next_length,filepos, MYF(MY_NABP)) ||
+	memcmp(buff, temp_buff, next_length))
+      goto err;
+    filepos+=next_length;
+    buff+=next_length;
+    length-= next_length;
+    next_length=IO_SIZE*2;
+  }
+  if (my_pread(file,temp_buff,length,filepos,MYF(MY_NABP))) /* The rest */
+    goto err;
+  DBUG_RETURN(memcmp(buff, temp_buff, length) != 0);
+err:
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Read next record from datafile during table scan.
+
+  SYNOPSIS
+    _ma_read_rnd_dynamic_record()
+      info                      MARIA_HA pointer to table.
+      buf                       Destination for record.
+      filepos                   From where to read the record.
+      skip_deleted_blocks       If to repeat reading until a non-deleted
+                                record is found.
+
+  NOTE
+    This is identical to _ma_read_dynamic_record(), except the following
+    cases:
+
+    - If there is no active row at 'filepos', continue scanning for
+      an active row. (This is because the previous
+      _ma_read_rnd_dynamic_record() call stored the next block position
+      in filepos, but this position may not be a start block for a row.)
+    - We may have READ_CACHING enabled, in which case we use the cache
+      to read rows.
+
+   For other comments, check _ma_read_dynamic_record()
+
+  RETURN
+    0           OK
+    != 0        Error number
+*/
+
+int _ma_read_rnd_dynamic_record(MARIA_HA *info,
+                                uchar *buf,
+                                MARIA_RECORD_POS filepos,
+				my_bool skip_deleted_blocks)
+{
+  int block_of_record, info_read;
+  uint left_len,b_type;
+  uchar *to;
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_read_rnd_dynamic_record");
+
+  info_read=0;
+  LINT_INIT(to);
+
+  if (info->lock_type == F_UNLCK)
+  {
+#ifndef UNSAFE_LOCKING
+#else
+    info->tmp_lock_type=F_RDLCK;
+#endif
+  }
+  else
+    info_read=1;				/* memory-keyinfoblock is ok */
+
+  block_of_record= 0;   /* First block of record is numbered as zero. */
+  block_info.second_read= 0;
+  left_len=1;				/* Non-zero so 'continue' loops again */
+  do
+  {
+    if (filepos >= info->state->data_file_length)
+    {
+      if (!info_read)
+      {						/* Check if changed */
+	info_read=1;
+	info->rec_cache.seek_not_done=1;
+	if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+	  goto panic;
+      }
+      if (filepos >= info->state->data_file_length)
+      {
+	my_errno= HA_ERR_END_OF_FILE;
+	goto err;
+      }
+    }
+    if (info->opt_flag & READ_CACHE_USED)
+    {					/* Header comes from the read cache */
+      if (_ma_read_cache(&info->rec_cache, block_info.header, filepos,
+			 sizeof(block_info.header),
+			 (!block_of_record && skip_deleted_blocks ?
+                          READING_NEXT : 0) | READING_HEADER))
+	goto panic;
+      b_type= _ma_get_block_info(&block_info,-1,filepos); /* -1: no read */
+    }
+    else
+    {
+      if (info->opt_flag & WRITE_CACHE_USED &&
+	  info->rec_cache.pos_in_file < filepos + MARIA_BLOCK_INFO_HEADER_LENGTH &&
+	  flush_io_cache(&info->rec_cache))
+	DBUG_RETURN(my_errno);	/* NOTE: skips fast_ma_writeinfo() at err */
+      info->rec_cache.seek_not_done=1;
+      b_type= _ma_get_block_info(&block_info, info->dfile.file, filepos);
+    }
+
+    if (b_type & (BLOCK_DELETED | BLOCK_ERROR | BLOCK_SYNC_ERROR |
+		  BLOCK_FATAL_ERROR))
+    {
+      if ((b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+	  && skip_deleted_blocks)
+      {
+	filepos=block_info.filepos+block_info.block_len;
+	block_info.second_read=0;
+	continue;		/* Search after next_record */
+      }
+      if (b_type & (BLOCK_DELETED | BLOCK_SYNC_ERROR))
+      {
+	my_errno= HA_ERR_RECORD_DELETED;
+	info->cur_row.lastpos= block_info.filepos;
+	info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+      }
+      goto err;
+    }
+    if (block_of_record == 0)				/* First block */
+    {
+      info->cur_row.total_length= block_info.rec_len;
+      if (block_info.rec_len > (uint) share->base.max_pack_length)
+	goto panic;
+      info->cur_row.lastpos= filepos;
+      if (share->base.blobs)
+      {
+	if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+                             block_info.rec_len +
+                             info->s->base.extra_rec_buff_size))
+	  goto err;
+      }
+      to= info->rec_buff;
+      left_len=block_info.rec_len;
+    }
+    if (left_len < block_info.data_len)	/* NOTE: no !data_len check here */
+      goto panic;				/* Wrong linked record */
+
+    /* copy information that is already read */
+    {
+      uint offset=(uint) (block_info.filepos - filepos); /* Header size */
+      uint tmp_length= (sizeof(block_info.header) - offset);
+      filepos=block_info.filepos;
+
+      if (tmp_length > block_info.data_len)
+	tmp_length= block_info.data_len;
+      if (tmp_length)
+      {
+	memcpy(to, block_info.header+offset, tmp_length);
+	block_info.data_len-=tmp_length;
+	left_len-=tmp_length;
+	to+=tmp_length;
+	filepos+=tmp_length;
+     }
+    }
+    /* read rest of record from file */
+    if (block_info.data_len)
+    {
+      if (info->opt_flag & READ_CACHE_USED)
+      {
+	if (_ma_read_cache(&info->rec_cache, to,filepos,
+			   block_info.data_len,
+			   (!block_of_record && skip_deleted_blocks) ?
+                           READING_NEXT : 0))
+	  goto panic;
+      }
+      else
+      {
+        if (info->opt_flag & WRITE_CACHE_USED &&
+            info->rec_cache.pos_in_file <
+            block_info.filepos + block_info.data_len &&
+            flush_io_cache(&info->rec_cache))
+          goto err;
+	/* VOID(my_seek(info->dfile.file, filepos, MY_SEEK_SET, MYF(0))); */
+	if (my_read(info->dfile.file, to, block_info.data_len, MYF(MY_NABP)))
+	{
+	  if (my_errno == HA_ERR_FILE_TOO_SHORT)
+	    my_errno= HA_ERR_WRONG_IN_RECORD;	/* Unexpected end of file */
+	  goto err;
+	}
+      }
+    }
+    /*
+      Increment block-of-record counter. If it was the first block,
+      remember the position behind the block for the next call.
+    */
+    if (block_of_record++ == 0)
+    {
+      info->cur_row.nextpos= block_info.filepos+block_info.block_len;
+      skip_deleted_blocks=0;
+    }
+    left_len-=block_info.data_len;
+    to+=block_info.data_len;
+    filepos=block_info.next_filepos;
+  } while (left_len);
+
+  info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+  fast_ma_writeinfo(info);
+  if (_ma_rec_unpack(info,buf,info->rec_buff,block_info.rec_len) !=
+      MY_FILE_ERROR)
+    DBUG_RETURN(0);
+  DBUG_RETURN(my_errno);			/* Wrong record */
+
+panic:
+  my_errno=HA_ERR_WRONG_IN_RECORD;		/* Something is fatally wrong */
+err:
+  fast_ma_writeinfo(info);
+  DBUG_RETURN(my_errno);
+}
+
+
+	/* Read (if file >= 0) and decode a dynamic-record block header at */
+	/* filepos into *info. Returns BLOCK_* flags; BLOCK_ERROR if garbage */
+uint _ma_get_block_info(MARIA_BLOCK_INFO *info, File file, my_off_t filepos)
+{
+  uint return_val=0;
+  uchar *header=info->header;
+
+  if (file >= 0)			/* Else header is already filled in */
+  {
+    /*
+      We do not use my_pread() here because we want to have the file
+      pointer set to the end of the header after this function.
+      my_pread() may leave the file pointer untouched.
+    */
+    VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+    if (my_read(file, header, sizeof(info->header),MYF(0)) !=
+	sizeof(info->header))
+      goto err;
+  }
+  DBUG_DUMP("header",header,MARIA_BLOCK_INFO_HEADER_LENGTH);
+  if (info->second_read)		/* Expect a continuation (7-12) */
+  {
+    if (info->header[0] <= 6 || info->header[0] == 13)
+      return_val=BLOCK_SYNC_ERROR;
+  }
+  else					/* Expect a first block (0-6, 13) */
+  {
+    if (info->header[0] > 6 && info->header[0] != 13)
+      return_val=BLOCK_SYNC_ERROR;
+  }
+  info->next_filepos= HA_OFFSET_ERROR; /* Dummy if no next block */
+
+  switch (info->header[0]) {
+  case 0:				/* 3 byte length, next and prev */
+    if ((info->block_len=(uint) mi_uint3korr(header+1)) <
+	MARIA_MIN_BLOCK_LENGTH ||
+	(info->block_len & (MARIA_DYN_ALIGN_SIZE -1)))
+      goto err;
+    info->filepos=filepos;
+    info->next_filepos=mi_sizekorr(header+4);
+    info->prev_filepos=mi_sizekorr(header+12);
+#if SIZEOF_OFF_T == 4
+    if ((mi_uint4korr(header+4) != 0 &&
+	 (mi_uint4korr(header+4) != (ulong) ~0 ||
+	  info->next_filepos != (ulong) ~0)) ||
+	(mi_uint4korr(header+12) != 0 &&
+	 (mi_uint4korr(header+12) != (ulong) ~0 ||
+	  info->prev_filepos != (ulong) ~0)))
+      goto err;
+#endif
+    return return_val | BLOCK_DELETED;		/* Deleted block */
+
+  case 1:				/* Whole record, 2 byte length */
+    info->rec_len=info->data_len=info->block_len=mi_uint2korr(header+1);
+    info->filepos=filepos+3;
+    return return_val | BLOCK_FIRST | BLOCK_LAST;
+  case 2:				/* Whole record, 3 byte length */
+    info->rec_len=info->data_len=info->block_len=mi_uint3korr(header+1);
+    info->filepos=filepos+4;
+    return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+  case 13:			/* First part, 4 byte rec_len, has next */
+    info->rec_len=mi_uint4korr(header+1);
+    info->block_len=info->data_len=mi_uint3korr(header+5);
+    info->next_filepos=mi_sizekorr(header+8);
+    info->second_read=1;
+    info->filepos=filepos+16;
+    return return_val | BLOCK_FIRST;
+
+  case 3:			/* As 1, plus 1 byte of extra block length */
+    info->rec_len=info->data_len=mi_uint2korr(header+1);
+    info->block_len=info->rec_len+ (uint) header[3];
+    info->filepos=filepos+4;
+    return return_val | BLOCK_FIRST | BLOCK_LAST;
+  case 4:			/* As 2, plus 1 byte of extra block length */
+    info->rec_len=info->data_len=mi_uint3korr(header+1);
+    info->block_len=info->rec_len+ (uint) header[4];
+    info->filepos=filepos+5;
+    return return_val | BLOCK_FIRST | BLOCK_LAST;
+
+  case 5:			/* First part, 2 byte lengths, has next */
+    info->rec_len=mi_uint2korr(header+1);
+    info->block_len=info->data_len=mi_uint2korr(header+3);
+    info->next_filepos=mi_sizekorr(header+5);
+    info->second_read=1;
+    info->filepos=filepos+13;
+    return return_val | BLOCK_FIRST;
+  case 6:			/* First part, 3 byte lengths, has next */
+    info->rec_len=mi_uint3korr(header+1);
+    info->block_len=info->data_len=mi_uint3korr(header+4);
+    info->next_filepos=mi_sizekorr(header+7);
+    info->second_read=1;
+    info->filepos=filepos+15;
+    return return_val | BLOCK_FIRST;
+
+    /* The following blocks are identical to 1-6 without rec_len */
+  case 7:
+    info->data_len=info->block_len=mi_uint2korr(header+1);
+    info->filepos=filepos+3;
+    return return_val | BLOCK_LAST;
+  case 8:
+    info->data_len=info->block_len=mi_uint3korr(header+1);
+    info->filepos=filepos+4;
+    return return_val | BLOCK_LAST;
+
+  case 9:
+    info->data_len=mi_uint2korr(header+1);
+    info->block_len=info->data_len+ (uint) header[3];
+    info->filepos=filepos+4;
+    return return_val | BLOCK_LAST;
+  case 10:
+    info->data_len=mi_uint3korr(header+1);
+    info->block_len=info->data_len+ (uint) header[4];
+    info->filepos=filepos+5;
+    return return_val | BLOCK_LAST;
+
+  case 11:
+    info->data_len=info->block_len=mi_uint2korr(header+1);
+    info->next_filepos=mi_sizekorr(header+3);
+    info->second_read=1;
+    info->filepos=filepos+11;
+    return return_val;			/* Middle part: no FIRST or LAST */
+  case 12:
+    info->data_len=info->block_len=mi_uint3korr(header+1);
+    info->next_filepos=mi_sizekorr(header+4);
+    info->second_read=1;
+    info->filepos=filepos+12;
+    return return_val;			/* Middle part: no FIRST or LAST */
+  }
+
+err:					/* Also reached for unknown types */
+  my_errno=HA_ERR_WRONG_IN_RECORD;	 /* Garbage */
+  return BLOCK_ERROR;
+}
diff --git a/storage/maria/ma_extra.c b/storage/maria/ma_extra.c
new file mode 100644
index 00000000000..7a30b613ea5
--- /dev/null
+++ b/storage/maria/ma_extra.c
@@ -0,0 +1,637 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#include "ma_blockrec.h"
+
+static void maria_extra_keyflag(MARIA_HA *info,
+                                enum ha_extra_function function);
+
+/**
+   @brief Set options and buffers to optimize table handling
+
+   @param  info             open table
+   @param  function         operation
+   @param  extra_arg        Pointer to extra argument (normally pointer to
+                            ulong); used when function is one of:
+                            HA_EXTRA_WRITE_CACHE
+                            HA_EXTRA_CACHE
+                            HA_EXTRA_PRELOAD_BUFFER_SIZE (must not be NULL)
+
+   @return Operation status
+     @retval 0      ok
+     @retval !=0    error
+*/
+
+int maria_extra(MARIA_HA *info, enum ha_extra_function function,
+                void *extra_arg)
+{
+  int error= 0;
+  ulong cache_size;
+  MARIA_SHARE *share= info->s;
+  my_bool block_records= share->data_file_type == BLOCK_RECORD;
+  DBUG_ENTER("maria_extra");
+  DBUG_PRINT("enter",("function: %d",(int) function));
+
+  switch (function) {
+  case HA_EXTRA_RESET_STATE:		/* Reset state (don't free buffers) */
+    info->lastinx= 0;			/* Use first index as def */
+    info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+    info->page_changed= 1;
+					/* Next/prev gives first/last */
+    if (info->opt_flag & READ_CACHE_USED)
+    {
+      reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+		      (pbool) (info->lock_type != F_UNLCK),
+		      (pbool) test(info->update & HA_STATE_ROW_CHANGED)
+		      );
+    }
+    info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+		   HA_STATE_PREV_FOUND);
+    break;
+  case HA_EXTRA_CACHE:
+    if (block_records)
+      break;                                    /* Not supported */
+
+    if (info->lock_type == F_UNLCK &&
+	(share->options & HA_OPTION_PACK_RECORD))
+    {
+      error= 1;			/* Not possible if not locked */
+      my_errno= EACCES;
+      break;
+    }
+    if (info->s->file_map) /* Don't use cache if mmap */
+      break;
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+    if ((share->options & HA_OPTION_COMPRESS_RECORD))
+    {
+      pthread_mutex_lock(&share->intern_lock);
+      if (_ma_memmap_file(info))
+      {
+	/* We don't need MADV_SEQUENTIAL if small file */
+	madvise((char*) share->file_map, share->state.state.data_file_length,
+		share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ?
+		MADV_RANDOM : MADV_SEQUENTIAL);
+	pthread_mutex_unlock(&share->intern_lock);
+	break;
+      }
+      pthread_mutex_unlock(&share->intern_lock);
+    }
+#endif
+    if (info->opt_flag & WRITE_CACHE_USED)
+    {
+      info->opt_flag&= ~WRITE_CACHE_USED;
+      if ((error= end_io_cache(&info->rec_cache)))
+	break;
+    }
+    if (!(info->opt_flag &
+	  (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED)))
+    {
+      cache_size= (extra_arg ? *(ulong*) extra_arg :
+		   my_default_record_cache_size);
+      if (!(init_io_cache(&info->rec_cache, info->dfile.file,
+			 (uint) min(share->state.state.data_file_length+1,
+				    cache_size),
+			  READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK),
+			  MYF(share->write_flag & MY_WAIT_IF_FULL))))
+      {
+	info->opt_flag|= READ_CACHE_USED;
+	info->update&=   ~HA_STATE_ROW_CHANGED;
+      }
+      if (share->non_transactional_concurrent_insert)
+	info->rec_cache.end_of_file= info->state->data_file_length;
+    }
+    break;
+  case HA_EXTRA_REINIT_CACHE:
+    if (info->opt_flag & READ_CACHE_USED)
+    {
+      reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos,
+		      (pbool) (info->lock_type != F_UNLCK),
+		      (pbool) test(info->update & HA_STATE_ROW_CHANGED));
+      info->update&= ~HA_STATE_ROW_CHANGED;
+      if (share->non_transactional_concurrent_insert)
+	info->rec_cache.end_of_file= info->state->data_file_length;
+    }
+    break;
+  case HA_EXTRA_WRITE_CACHE:
+    if (info->lock_type == F_UNLCK)
+    {
+      error= 1;                        	/* Not possible if not locked */
+      break;
+    }
+    if (block_records)
+      break;                            /* Not supported */
+
+    cache_size= (extra_arg ? *(ulong*) extra_arg :
+		 my_default_record_cache_size);
+    if (!(info->opt_flag &
+	  (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) &&
+	!share->state.header.uniques)
+      if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size,
+			 WRITE_CACHE,share->state.state.data_file_length,
+			  (pbool) (info->lock_type != F_UNLCK),
+			  MYF(share->write_flag & MY_WAIT_IF_FULL))))
+      {
+	info->opt_flag|= WRITE_CACHE_USED;
+	info->update&=   ~(HA_STATE_ROW_CHANGED |
+                           HA_STATE_WRITE_AT_END |
+                           HA_STATE_EXTEND_BLOCK);
+      }
+    break;
+  case HA_EXTRA_PREPARE_FOR_UPDATE:
+    if (info->s->data_file_type != DYNAMIC_RECORD)
+      break;
+    /* Remove read/write cache if dynamic rows; fall through */
+  case HA_EXTRA_NO_CACHE:
+    if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+    {
+      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+      error= end_io_cache(&info->rec_cache);
+      /* Sergei will insert full text index caching here */
+    }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+    if (info->opt_flag & MEMMAP_USED)
+      madvise((char*) share->file_map, share->state.state.data_file_length,
+              MADV_RANDOM);
+#endif
+    break;
+  case HA_EXTRA_FLUSH_CACHE:
+    if (info->opt_flag & WRITE_CACHE_USED)
+    {
+      if ((error= flush_io_cache(&info->rec_cache)))
+      {
+        maria_print_error(info->s, HA_ERR_CRASHED);
+	maria_mark_crashed(info);			/* Fatal error found */
+      }
+    }
+    break;
+  case HA_EXTRA_NO_READCHECK:
+    info->opt_flag&= ~READ_CHECK_USED;		/* No readcheck */
+    break;
+  case HA_EXTRA_READCHECK:
+    info->opt_flag|= READ_CHECK_USED;
+    break;
+  case HA_EXTRA_KEYREAD:			/* Read only keys to record */
+  case HA_EXTRA_REMEMBER_POS:
+    info->opt_flag|= REMEMBER_OLD_POS;
+    bmove(info->last_key.data + share->base.max_key_length*2,
+	  info->last_key.data,
+          info->last_key.data_length + info->last_key.ref_length);
+    info->save_update=	info->update;
+    info->save_lastinx= info->lastinx;
+    info->save_lastpos= info->cur_row.lastpos;
+    info->save_lastkey_data_length= info->last_key.data_length;
+    info->save_lastkey_ref_length= info->last_key.ref_length;
+    if (function == HA_EXTRA_REMEMBER_POS)
+      break;
+    /* fall through */
+  case HA_EXTRA_KEYREAD_CHANGE_POS:
+    info->opt_flag|= KEY_READ_USED;
+    info->read_record= _ma_read_key_record;
+    break;
+  case HA_EXTRA_NO_KEYREAD:
+  case HA_EXTRA_RESTORE_POS:
+    if (info->opt_flag & REMEMBER_OLD_POS)
+    {
+      bmove(info->last_key.data,
+	    info->last_key.data + share->base.max_key_length*2,
+	    info->save_lastkey_data_length + info->save_lastkey_ref_length);
+      info->update=	info->save_update | HA_STATE_WRITTEN;
+      info->lastinx=	info->save_lastinx;
+      info->cur_row.lastpos= info->save_lastpos;
+      info->last_key.data_length= info->save_lastkey_data_length;
+      info->last_key.ref_length= info->save_lastkey_ref_length;
+      info->last_key.flag= 0;
+    }
+    info->read_record=	share->read_record;
+    info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+    break;
+  case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked against changes */
+    info->lock_type= F_EXTRA_LCK; /* Simulate as locked */
+    break;
+  case HA_EXTRA_WAIT_LOCK:
+    info->lock_wait= 0;
+    break;
+  case HA_EXTRA_NO_WAIT_LOCK:
+    info->lock_wait= MY_SHORT_WAIT;
+    break;
+  case HA_EXTRA_NO_KEYS:
+    /* we're going to modify pieces of the state, stall Checkpoint */
+    pthread_mutex_lock(&share->intern_lock);
+    if (info->lock_type == F_UNLCK)
+    {
+      pthread_mutex_unlock(&share->intern_lock);
+      error= 1;					/* Not possible if not locked */
+      break;
+    }
+    if (maria_is_any_key_active(share->state.key_map))
+    {
+      MARIA_KEYDEF *key= share->keyinfo;
+      uint i;
+      for (i =0 ; i < share->base.keys ; i++,key++)
+      {
+        if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1)
+        {
+          maria_clear_key_active(share->state.key_map, i);
+          info->update|= HA_STATE_CHANGED;
+        }
+      }
+
+      if (!share->changed)
+      {
+	share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
+	share->changed= 1;			/* Update on close */
+	if (!share->global_changed)
+	{
+	  share->global_changed= 1;
+	  share->state.open_count++;
+	}
+      }
+      if (!share->now_transactional)
+        share->state.state= *info->state;
+      /*
+        That state write to disk must be done, even for transactional tables;
+        indeed the table's share is going to be lost (there was a
+        HA_EXTRA_FORCE_REOPEN before, which set share->last_version to
+        0), and so the only way it leaves information (share->state.key_map)
+        for the posterity is by writing it to disk.
+      */
+      DBUG_ASSERT(!maria_in_recovery);
+      error= _ma_state_info_write(share,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                  MA_STATE_INFO_WRITE_FULL_INFO);
+    }
+    pthread_mutex_unlock(&share->intern_lock);
+    break;
+  case HA_EXTRA_FORCE_REOPEN:
+    /*
+      MySQL uses this case after it has closed all other instances
+      of this table.
+      We however do a flush here for additional safety.
+    */
+    /** @todo consider porting these flush-es to MyISAM */
+    DBUG_ASSERT(share->reopen == 1);
+    error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                 FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE);
+    if (!error && share->changed)
+    {
+      pthread_mutex_lock(&share->intern_lock);
+      if (!(error= _ma_state_info_write(share,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET|
+                                        MA_STATE_INFO_WRITE_FULL_INFO)))
+        share->changed= 0;
+      pthread_mutex_unlock(&share->intern_lock);
+    }
+    pthread_mutex_lock(&THR_LOCK_maria);
+    pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */
+    /* this makes the share not be re-used next time the table is opened */
+    share->last_version= 0L;			/* Impossible version */
+    pthread_mutex_unlock(&share->intern_lock);
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    break;
+  case HA_EXTRA_PREPARE_FOR_DROP:
+    /* Signals about intent to delete this table */
+    share->deleting= TRUE;
+    share->global_changed= FALSE;     /* force writing changed flag */
+    /* To force repair if reopened */
+    _ma_mark_file_changed(info);
+    /* Fall through */
+  case HA_EXTRA_PREPARE_FOR_RENAME:
+  {
+    my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP);
+    enum flush_type type;
+    pthread_mutex_lock(&THR_LOCK_maria);
+    /*
+      This share, to have last_version=0, needs to save all its data/index
+      blocks to disk if this is not for a DROP TABLE. Otherwise they would be
+      invisible to future openers; and they could even go to disk late and
+      cancel the work of future openers.
+    */
+    if (info->lock_type != F_UNLCK && !info->was_locked)
+    {
+      info->was_locked= info->lock_type;
+      if (maria_lock_database(info, F_UNLCK))
+        error= my_errno;
+      info->lock_type= F_UNLCK;
+    }
+    /*
+      We don't need to call _ma_decrement_open_count() if we are
+      dropping the table, as the files will be removed anyway. If we
+      are aborted before the files are removed, it's better to not
+      call it as in that case the automatic repair on open will add
+      the missing index entries
+    */
+    pthread_mutex_lock(&share->intern_lock);
+    if (share->kfile.file >= 0 && function != HA_EXTRA_PREPARE_FOR_DROP)
+      _ma_decrement_open_count(info);
+    if (info->trn)
+    {
+      _ma_remove_table_from_trnman(share, info->trn);
+      /* Ensure we don't point to the deleted data in trn */
+      info->state= info->state_start= &share->state.state;
+    }
+
+    type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED;
+    if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                              type, type))
+    {
+      error=my_errno;
+      share->changed= 1;
+    }
+    if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+    {
+      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+      if (end_io_cache(&info->rec_cache))
+        error= 1;
+    }
+    if (share->kfile.file >= 0)
+    {
+      if (do_flush)
+      {
+        /* Save the state so that others can find it from disk. */
+        if ((share->changed &&
+             _ma_state_info_write(share,
+                                  MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                  MA_STATE_INFO_WRITE_FULL_INFO)) ||
+            my_sync(share->kfile.file, MYF(0)))
+          error= my_errno;
+        else
+          share->changed= 0;
+      }
+      else
+      {
+        /* be sure that state is not tried for write as file may be closed */
+        share->changed= 0;
+      }
+    }
+    if (share->data_file_type == BLOCK_RECORD &&
+        share->bitmap.file.file >= 0)
+    {
+      if (do_flush && my_sync(share->bitmap.file.file, MYF(0)))
+        error= my_errno;
+    }
+    /* For protection against Checkpoint, we set under intern_lock: */
+    share->last_version= 0L;			/* Impossible version */
+    pthread_mutex_unlock(&share->intern_lock);
+    pthread_mutex_unlock(&THR_LOCK_maria);
+    break;
+  }
+  case HA_EXTRA_PREPARE_FOR_FORCED_CLOSE:
+    if (info->trn)
+    {
+      pthread_mutex_lock(&share->intern_lock);
+      _ma_remove_table_from_trnman(share, info->trn);
+      /* Ensure we don't point to the deleted data in trn */
+      info->state= info->state_start= &share->state.state;
+      pthread_mutex_unlock(&share->intern_lock);    
+    }
+    break;
+  case HA_EXTRA_FLUSH:
+    if (!share->temporary)
+      error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                   FLUSH_KEEP, FLUSH_KEEP);
+#ifdef HAVE_PWRITE
+    _ma_decrement_open_count(info);
+#endif
+    if (share->not_flushed)
+    {
+      share->not_flushed= 0;
+      if (_ma_sync_table_files(info))
+	error= my_errno;
+      if (error)
+      {
+	share->changed= 1;
+        maria_print_error(info->s, HA_ERR_CRASHED);
+	maria_mark_crashed(info);			/* Fatal error found */
+      }
+    }
+    break;
+  case HA_EXTRA_NORMAL:				/* These aren't in use */
+    info->quick_mode= 0;
+    break;
+  case HA_EXTRA_QUICK:
+    info->quick_mode= 1;
+    break;
+  case HA_EXTRA_NO_ROWS:
+    if (!share->state.header.uniques)
+      info->opt_flag|= OPT_NO_ROWS;
+    break;
+  case HA_EXTRA_PRELOAD_BUFFER_SIZE:
+    info->preload_buff_size= *((ulong *) extra_arg); /* extra_arg required */
+    break;
+  case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+  case HA_EXTRA_CHANGE_KEY_TO_DUP:
+    maria_extra_keyflag(info, function);
+    break;
+  case HA_EXTRA_MMAP:
+#ifdef HAVE_MMAP
+    if (block_records)
+      break;                                    /* Not supported */
+    pthread_mutex_lock(&share->intern_lock);
+    /*
+      Memory map the data file if it is not already mapped. It is safe
+      to memory map a file while other threads are using file I/O on it.
+      Assigning a new address to a function pointer is an atomic
+      operation. intern_lock prevents that two or more mappings are done
+      at the same time.
+    */
+    if (!share->file_map)
+    {
+      if (_ma_dynmap_file(info, share->state.state.data_file_length))
+      {
+        DBUG_PRINT("warning",("mmap failed: errno: %d",errno));
+        error= my_errno= errno;
+      }
+      else
+      {
+        share->file_read=  _ma_mmap_pread;
+        share->file_write= _ma_mmap_pwrite;
+      }
+    }
+    pthread_mutex_unlock(&share->intern_lock);
+#endif
+    break;
+  case HA_EXTRA_MARK_AS_LOG_TABLE:
+    pthread_mutex_lock(&share->intern_lock);
+    share->is_log_table= TRUE;
+    pthread_mutex_unlock(&share->intern_lock);
+    break;
+  case HA_EXTRA_KEY_CACHE:
+  case HA_EXTRA_NO_KEY_CACHE:
+  default:
+    break;
+  }
+  DBUG_RETURN(error);
+} /* maria_extra */
+
+/* Attach an index condition callback and its argument to the handler */
+void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func,
+                            void *func_arg)
+{
+  info->index_cond_func_arg= func_arg;
+  info->index_cond_func= func;
+}
+
+
+/*
+  Set or clear HA_NOSAME on every key of the table (WL#1648: start/stop
+  inserting duplicates). Any other function value leaves the keys untouched.
+*/
+
+static void maria_extra_keyflag(MARIA_HA *info,
+                                enum ha_extra_function function)
+{
+  MARIA_SHARE *share= info->s;
+  uint  keynr;
+
+  if (function != HA_EXTRA_CHANGE_KEY_TO_UNIQUE &&
+      function != HA_EXTRA_CHANGE_KEY_TO_DUP)
+    return;                                     /* Nothing to change */
+
+  for (keynr= 0; keynr < share->base.keys; keynr++)
+  {
+    if (function == HA_EXTRA_CHANGE_KEY_TO_UNIQUE)
+      share->keyinfo[keynr].flag|= HA_NOSAME;
+    else
+      share->keyinfo[keynr].flag&= ~(HA_NOSAME);
+  }
+}
+
+
+int maria_reset(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  int res= 0;
+  DBUG_ENTER("maria_reset");
+
+  /*
+    Undo the effects of EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD and
+    EXTRA_QUICK and release buffers that grew while the table was in use.
+    The return value is the result of ending the record cache (0 if none).
+  */
+  if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+  {
+    info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+    res= end_io_cache(&info->rec_cache);
+  }
+
+  /* Shrink oversized record and blob buffers (tables with blobs only) */
+  if (share->base.blobs &&
+      info->rec_buff_size > share->base.default_rec_buff_size)
+  {
+    info->rec_buff_size= 1;                     /* Force realloc */
+    _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
+                     share->base.default_rec_buff_size);
+  }
+  if (share->base.blobs && info->blob_buff_size > MARIA_SMALL_BLOB_BUFFER)
+  {
+    info->blob_buff_size= 1;                    /* Force realloc */
+    _ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
+                     MARIA_SMALL_BLOB_BUFFER);
+  }
+#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
+  if (info->opt_flag & MEMMAP_USED)
+    madvise((char*) share->file_map, share->state.state.data_file_length,
+            MADV_RANDOM);
+#endif
+
+  /* Reset the positioning state; the first index is the default one */
+  info->lastinx= 0;
+  info->quick_mode= 0;
+  info->page_changed= 1;
+  info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
+  info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
+  info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
+                 HA_STATE_PREV_FOUND);
+  DBUG_RETURN(res);
+}
+
+/* Sync the data file, then the index file; returns 0 only if both succeed */
+int _ma_sync_table_files(const MARIA_HA *info)
+{
+  int failed= my_sync(info->dfile.file, MYF(MY_WME)) != 0;
+  return failed || my_sync(info->s->kfile.file, MYF(MY_WME)) != 0;
+}
+
+
+/**
+   @brief flushes the data and/or index file of a table
+
+   This is useful when one wants to read a table using OS syscalls (like
+   my_copy()) and first wants to be sure that MySQL-level caches go down to
+   the OS so that OS syscalls can see all data. It can flush rec_cache,
+   bitmap, pagecache of data file, pagecache of index file.
+
+   @param  info                table
+   @param  flush_data_or_index one or two of these flags:
+                               MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX
+   @param  flush_type_for_data
+   @param  flush_type_for_index
+
+   @note does not sync files (@see _ma_sync_table_files()).
+   @note Progressively this function will be used in all places where we flush
+   the index but not the data file (probable bugs).
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+                          enum flush_type flush_type_for_data,
+                          enum flush_type flush_type_for_index)
+{
+  MARIA_SHARE *share= info->s;
+  my_bool discard= (flush_type_for_data == FLUSH_IGNORE_CHANGED);
+  int failed= 0;
+
+  /* The data file goes first, as it is the more critical one */
+  if (flush_data_or_index & MARIA_FLUSH_DATA)
+  {
+    if (!discard && (info->opt_flag & WRITE_CACHE_USED) &&
+        flush_io_cache(&info->rec_cache))
+      failed= 1;
+    if (share->data_file_type == BLOCK_RECORD)
+    {
+      if (discard)
+      {
+        /* FLUSH_IGNORE_CHANGED: skip the flush, mark the bitmap unchanged */
+        pthread_mutex_lock(&share->bitmap.bitmap_lock);
+        share->bitmap.changed= 0;
+        pthread_mutex_unlock(&share->bitmap.bitmap_lock);
+      }
+      else if (_ma_bitmap_flush(share))
+        failed= 1;
+      if (flush_pagecache_blocks(share->pagecache, &info->dfile,
+                                 flush_type_for_data))
+        failed= 1;
+    }
+  }
+  if ((flush_data_or_index & MARIA_FLUSH_INDEX) &&
+      flush_pagecache_blocks(share->pagecache, &share->kfile,
+                             flush_type_for_index))
+    failed= 1;
+
+  if (failed)
+  {
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);
+  }
+  return failed;
+}
+
diff --git a/storage/maria/ma_ft_boolean_search.c b/storage/maria/ma_ft_boolean_search.c
new file mode 100644
index 00000000000..0783f679843
--- /dev/null
+++ b/storage/maria/ma_ft_boolean_search.c
@@ -0,0 +1,1042 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*  TODO: add caching - pre-read several index entries at once */
+
+/*
+  Added optimization for full-text queries with plus-words. It was
+  implemented by sharing maximal document id (max_docid) variable
+  inside plus subtree. max_docid could be used by any word in plus
+  subtree, but it could be updated by plus-word only.
+
+  Fulltext "smarter index merge" optimization assumes that rows
+  it gets are ordered by doc_id. That is not the case when we
+  search for a word with truncation operator. It may return
+  rows in random order. Thus we may not use "smarter index merge"
+  optimization with "trunc-words".
+
+  The idea is: there is no need to search for docid smaller than
+  biggest docid inside current plus subtree or any upper plus subtree.
+
+  Examples:
+  +word1 word2
+    share same max_docid
+    max_docid updated by word1
+  +word1 +(word2 word3)
+    share same max_docid
+    max_docid updated by word1
+  +(word1 -word2) +(+word3 word4)
+    share same max_docid
+    max_docid updated by word3
+   +word1 word2 (+word3 word4 (+word5 word6))
+    three subexpressions (including the top-level one),
+    every one has its own max_docid, updated by its plus word.
+    but for the search word6 uses
+    max(word1.max_docid, word3.max_docid, word5.max_docid),
+    while word4 uses, accordingly,
+    max(word1.max_docid, word3.max_docid).
+*/
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with boolean queries */
+
+static double _wghts[11]=
+{
+  0.131687242798354,
+  0.197530864197531,
+  0.296296296296296,
+  0.444444444444444,
+  0.666666666666667,
+  1.000000000000000,
+  1.500000000000000,
+  2.250000000000000,
+  3.375000000000000,
+  5.062500000000000,
+  7.593750000000000};
+static double *wghts=_wghts+5; /* wghts[i] = 1.5**i, valid for -5 <= i <= 5 */
+
+static double _nwghts[11]=
+{
+ -0.065843621399177,
+ -0.098765432098766,
+ -0.148148148148148,
+ -0.222222222222222,
+ -0.333333333333334,
+ -0.500000000000000,
+ -0.750000000000000,
+ -1.125000000000000,
+ -1.687500000000000,
+ -2.531250000000000,
+ -3.796875000000000};
+static double *nwghts=_nwghts+5; /* nwghts[i] = -0.5*1.5**i, -5 <= i <= 5 */
+
+#define FTB_FLAG_TRUNC 1
+/* At most one of the following flags can be set */
+#define FTB_FLAG_YES   2
+#define FTB_FLAG_NO    4
+#define FTB_FLAG_WONLY 8
+
+typedef struct st_ftb_expr FTB_EXPR;
+struct st_ftb_expr
+{
+  FTB_EXPR *up;                   /* enclosing (parent) expression */
+  uint      flags;                /* FTB_FLAG_* bits */
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+  my_off_t  docid[2];
+  my_off_t  max_docid;
+  float     weight;               /* taken from wghts[] / nwghts[] */
+  float     cur_weight;
+  LIST     *phrase;               /* phrase words */
+  LIST     *document;             /* for phrase search */
+  uint      yesses;               /* number of "yes" words matched */
+  uint      nos;                  /* number of "no"  words matched */
+  uint      ythresh;              /* number of "yes" words in expr */
+  uint      yweaks;               /* number of "yes" words for scan only */
+};
+
+typedef struct st_ftb_word
+{
+  FTB_EXPR  *up;                   /* expression this word belongs to */
+  uint       flags;                /* FTB_FLAG_* bits */
+/* ^^^^^^^^^^^^^^^^^^ FTB_{EXPR,WORD} common section */
+  my_off_t   docid[2];             /* for index search and for scan */
+  my_off_t   key_root;             /* root of the index tree searched */
+  FTB_EXPR  *max_docid_expr;       /* nearest expr without FTB_FLAG_YES, or top */
+  MARIA_KEYDEF *keyinfo;
+  struct st_ftb_word *prev;        /* word added just before this one */
+  float      weight;
+  uint       ndepth;               /* nesting depth, +1 for a "no" word */
+  uint       len;                  /* word length + 1 (length byte included) */
+  uchar      off;
+  uchar      word[1];              /* word[0] holds the length, text follows */
+} FTB_WORD;
+
+typedef struct st_ft_info
+{
+  struct _ft_vft *please;
+  MARIA_HA   *info;                /* table being searched */
+  CHARSET_INFO *charset;
+  FTB_EXPR  *root;                 /* top-level expression of the query */
+  FTB_WORD **list;
+  FTB_WORD  *last_word;            /* most recently added word (chained by prev) */
+  MEM_ROOT   mem_root;             /* words, expressions and lists live here */
+  QUEUE      queue;
+  TREE       no_dupes;
+  my_off_t   lastpos;
+  uint       keynr;                /* number of the index used for the search */
+  uchar      with_scan;            /* bit 1: truncated word, bit 2: phrase */
+  enum { UNINITIALIZED, READY, INDEX_SEARCH, INDEX_DONE } state;
+} FTB;
+
+static int FTB_WORD_cmp(my_off_t *v, FTB_WORD *a, FTB_WORD *b)
+{
+  int order;
+
+  /* A word positioned on the current document (*v) always sorts first */
+  if (v && *v == a->docid[0])
+    return -1;
+
+  /* Otherwise: ORDER BY docid, ndepth DESC */
+  order= CMP_NUM(a->docid[0], b->docid[0]);
+  if (order)
+    return order;
+  return CMP_NUM(b->ndepth, a->ndepth);
+}
+
+static int FTB_WORD_cmp_list(CHARSET_INFO *cs, FTB_WORD **a, FTB_WORD **b)
+{
+  FTB_WORD *lhs= *a, *rhs= *b;
+  int order;
+  /* ORDER BY word (the text after the length byte), then by ndepth */
+  order= ha_compare_text(cs, (uchar*) lhs->word + 1, lhs->len - 1,
+                             (uchar*) rhs->word + 1, rhs->len - 1, 0, 0);
+  return order ? order : CMP_NUM(lhs->ndepth, rhs->ndepth);
+}
+
+
+typedef struct st_my_ftb_param
+{
+  FTB *ftb;                        /* search being built */
+  FTB_EXPR *ftbe;                  /* expression currently being filled */
+  uchar *up_quot;                  /* non-NULL while inside a quoted phrase */
+  uint depth;                      /* current parenthesis nesting depth */
+} MY_FTB_PARAM;
+
+
+static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
+                              const uchar *word, mysql_ft_size_t word_len,
+                              MYSQL_FTPARSER_BOOLEAN_INFO *info)
+{
+  MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+  FTB_WORD *ftbw;
+  FTB_EXPR *ftbe, *tmp_expr;
+  FT_WORD *phrase_word;
+  LIST *tmp_element;
+  int r= info->weight_adjust;
+  float weight= (float)
+        (info->wasign ? nwghts : wghts)[(r>5)?5:((r<-5)?-5:r)];
+
+  switch (info->type) {
+    case FT_TOKEN_WORD:
+      ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
+                                   sizeof(FTB_WORD) +
+                                   (info->trunc ? MARIA_MAX_KEY_BUFF :
+                                    word_len * ftb_param->ftb->charset->mbmaxlen +
+                                    HA_FT_WLEN +
+                                    ftb_param->ftb->info->s->rec_reflength));
+      ftbw->len= word_len + 1;     /* +1 for the length byte in word[0] */
+      ftbw->flags= 0;
+      ftbw->off= 0;
+      if (info->yesno > 0) ftbw->flags|= FTB_FLAG_YES;
+      if (info->yesno < 0) ftbw->flags|= FTB_FLAG_NO;
+      if (info->trunc) ftbw->flags|= FTB_FLAG_TRUNC;
+      ftbw->weight= weight;
+      ftbw->up= ftb_param->ftbe;
+      ftbw->docid[0]= ftbw->docid[1]= HA_OFFSET_ERROR;
+      ftbw->ndepth= (info->yesno < 0) + ftb_param->depth;
+      ftbw->key_root= HA_OFFSET_ERROR;
+      memcpy(ftbw->word + 1, word, word_len);
+      ftbw->word[0]= word_len;
+      if (info->yesno > 0) ftbw->up->ythresh++;
+      ftb_param->ftb->queue.max_elements++;
+      ftbw->prev= ftb_param->ftb->last_word;
+      ftb_param->ftb->last_word= ftbw;
+      ftb_param->ftb->with_scan|= (info->trunc & FTB_FLAG_TRUNC);
+      for (tmp_expr= ftb_param->ftbe; tmp_expr->up; tmp_expr= tmp_expr->up)
+        if (! (tmp_expr->flags & FTB_FLAG_YES))
+          break;
+      ftbw->max_docid_expr= tmp_expr;
+      /* fall through */
+    case FT_TOKEN_STOPWORD:
+      if (! ftb_param->up_quot) break;          /* no enclosing phrase */
+      phrase_word= (FT_WORD *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+      tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+      phrase_word->pos= (uchar*) word;
+      phrase_word->len= word_len;
+      tmp_element->data= (void *)phrase_word;
+      ftb_param->ftbe->phrase= list_add(ftb_param->ftbe->phrase, tmp_element);
+      /* Allocate the document list element at this point.
+         This avoids a huge number of allocs/frees for each row. */
+      tmp_element= (LIST *)alloc_root(&ftb_param->ftb->mem_root, sizeof(LIST));
+      tmp_element->data= alloc_root(&ftb_param->ftb->mem_root, sizeof(FT_WORD));
+      ftb_param->ftbe->document=
+        list_add(ftb_param->ftbe->document, tmp_element);
+      break;
+    case FT_TOKEN_LEFT_PAREN:
+      ftbe=(FTB_EXPR *)alloc_root(&ftb_param->ftb->mem_root, sizeof(FTB_EXPR));
+      ftbe->flags= 0;
+      if (info->yesno > 0) ftbe->flags|= FTB_FLAG_YES;
+      if (info->yesno < 0) ftbe->flags|= FTB_FLAG_NO;
+      ftbe->weight= weight;
+      ftbe->up= ftb_param->ftbe;
+      ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+      ftbe->docid[0]= ftbe->docid[1]= HA_OFFSET_ERROR;
+      ftbe->phrase= NULL;
+      ftbe->document= 0;
+      if (info->quot) ftb_param->ftb->with_scan|= 2;
+      if (info->yesno > 0) ftbe->up->ythresh++;
+      ftb_param->ftbe= ftbe;
+      ftb_param->depth++;
+      ftb_param->up_quot= (uchar*) info->quot;
+      break;
+    case FT_TOKEN_RIGHT_PAREN:
+      if (ftb_param->ftbe->document)
+      {
+        /* Make the document list circular: link its tail back to its head */
+        for (tmp_element= ftb_param->ftbe->document;
+             tmp_element->next; tmp_element= tmp_element->next) /* no-op */;
+        tmp_element->next= ftb_param->ftbe->document;
+        ftb_param->ftbe->document->prev= tmp_element;
+      }
+      info->quot= 0;
+      if (ftb_param->ftbe->up)
+      {
+        DBUG_ASSERT(ftb_param->depth);
+        ftb_param->ftbe= ftb_param->ftbe->up;
+        ftb_param->depth--;
+        ftb_param->up_quot= 0;
+      }
+      break;
+    case FT_TOKEN_EOF:
+    default:
+      break;
+  }
+  return(0);
+}
+
+
+static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
+                                    const uchar *query, mysql_ft_size_t len)
+{
+  MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
+  CHARSET_INFO *cs= ftb_param->ftb->charset;
+  const uchar *query_end= query + len;
+  MYSQL_FTPARSER_BOOLEAN_INFO bool_info;
+  FT_WORD word;
+
+  bool_info.quot= 0;
+  bool_info.prev= ' ';
+  /* Hand every token found in the query to the add-word callback */
+  while (maria_ft_get_word(cs, &query, query_end, &word, &bool_info))
+    param->mysql_add_word(param, word.pos, word.len, &bool_info);
+  return(0);
+}
+
+
+/*
+  Parse a boolean-mode query into the expression tree of 'ftb'.
+
+  Does nothing (and returns 0) unless ftb->state is UNINITIALIZED.
+  Otherwise obtains a parser parameter block from
+  maria_ftparser_call_initializer(), wires in the boolean-query callbacks
+  and runs parser->parse() over the query text.
+
+  RETURN VALUE
+    0      handle was already parsed, or parser->parse() returned 0
+    1      the parser parameter block could not be obtained
+    other  whatever parser->parse() returned
+*/
+
+static int _ftb_parse_query(FTB *ftb, uchar *query, size_t len,
+                            struct st_mysql_ftparser *parser)
+{
+  MY_FTB_PARAM query_state;
+  MYSQL_FTPARSER_PARAM *parser_param;
+  int parse_result;
+  DBUG_ENTER("_ftb_parse_query");
+  DBUG_ASSERT(parser);
+
+  if (ftb->state != UNINITIALIZED)
+    DBUG_RETURN(0);
+
+  parser_param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0);
+  if (!parser_param)
+    DBUG_RETURN(1);
+
+  /* Parsing starts at the root expression, outside of any quotes */
+  query_state.ftb= ftb;
+  query_state.ftbe= ftb->root;
+  query_state.depth= 0;
+  query_state.up_quot= 0;
+
+  /* What to parse ... */
+  parser_param->doc= query;
+  parser_param->length= len;
+  parser_param->cs= ftb->charset;
+  parser_param->flags= 0;
+  parser_param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
+
+  /* ... and where the parser reports what it found */
+  parser_param->mysql_parse= ftb_parse_query_internal;
+  parser_param->mysql_add_word= ftb_query_add_word;
+  parser_param->mysql_ftparam= (void *)&query_state;
+
+  parse_result= parser->parse(parser_param);
+  DBUG_RETURN(parse_result);
+}
+
+
+/*
+  Comparison callback for the ftb->no_dupes tree: orders two my_off_t
+  values numerically. The first argument is not used.
+*/
+
+static int _ftb_no_dupes_cmp(void* not_used __attribute__((unused)),
+                             const void *a,const void *b)
+{
+  my_off_t left= *((my_off_t*) a);
+  my_off_t right= *((my_off_t*) b);
+
+  return CMP_NUM(left, right);
+}
+
+
+/*
+  Position a query word on its next matching index entry.
+
+  SYNOPSIS
+    _ft2_search()
+    ftb          search handle
+    ftbw         query word whose index position is advanced
+    init_search  non-zero: start from the root of the first-level tree
+                 with the word itself as the search key;
+                 zero: continue after the key saved in ftbw->word
+
+  DESCRIPTION
+    On success ftbw->docid[0] is set to the position found and the key
+    is saved in ftbw->word for the next call. When the entry found has a
+    negative subkey count, the search descends into the second-level
+    tree rooted at that entry. When nothing (more) is found,
+    ftbw->docid[0] is set to HA_OFFSET_ERROR.
+
+  RETURN VALUE
+    returns 1 if the search was finished (must-word wasn't found),
+    0 otherwise.
+*/
+
+static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
+{
+  int r;
+  int subkeys=1;
+  my_bool can_go_down;
+  MARIA_HA *info=ftb->info;
+  uint off, extra=HA_FT_WLEN+info->s->base.rec_reflength;
+  uchar *lastkey_buf= ftbw->word+ftbw->off;
+  MARIA_KEY key;
+  LINT_INIT(off);
+
+  if (ftbw->flags & FTB_FLAG_TRUNC)
+    lastkey_buf+=ftbw->len;
+
+  if (init_search)
+  {
+    /* Fresh start: look the word up in the first-level tree */
+    ftbw->key_root=info->s->state.key_root[ftb->keynr];
+    ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+    key.keyinfo= ftbw->keyinfo;
+    key.data= ftbw->word;
+    key.data_length= ftbw->len;
+    key.ref_length= 0;
+    key.flag= 0;
+
+    r= _ma_search(info, &key, SEARCH_FIND | SEARCH_BIGGER, ftbw->key_root);
+  }
+  else
+  {
+    uint sflag= SEARCH_BIGGER;
+    my_off_t max_docid=0;
+    FTB_EXPR *tmp;
+
+    /* Largest max_docid recorded on any enclosing expression */
+    for (tmp= ftbw->max_docid_expr; tmp; tmp= tmp->up)
+      set_if_bigger(max_docid, tmp->max_docid);
+
+    if (ftbw->docid[0] < max_docid)
+    {
+      /*
+        This word is behind max_docid: store max_docid as the row
+        pointer of the saved key and allow an equal key to match.
+      */
+      sflag|= SEARCH_SAME;
+      _ma_dpointer(info->s, (uchar*) (ftbw->word + ftbw->len + HA_FT_WLEN),
+                   max_docid);
+    }
+
+    key.keyinfo= ftbw->keyinfo;
+    key.data= lastkey_buf;
+    key.data_length= USE_WHOLE_KEY;
+    key.ref_length= 0;
+    key.flag= 0;
+
+    r= _ma_search(info, &key, sflag, ftbw->key_root);
+  }
+
+  can_go_down=(!ftbw->off && (init_search || (ftbw->flags & FTB_FLAG_TRUNC)));
+  /* Skip rows inserted by concurrent insert */
+  while (!r)
+  {
+    if (can_go_down)
+    {
+      /* going down ? */
+      off= info->last_key.data_length + info->last_key.ref_length - extra;
+      subkeys=ft_sintXkorr(info->last_key.data + off);
+    }
+    if (subkeys<0 || info->cur_row.lastpos < info->state->data_file_length)
+      break;
+    r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, ftbw->key_root);
+  }
+
+  if (!r && !ftbw->off)
+  {
+    /*
+      In the first-level tree: check that the key found still matches
+      the query word (as a prefix when FTB_FLAG_TRUNC is set).
+    */
+    r= ha_compare_text(ftb->charset,
+                       info->last_key.data+1,
+                       info->last_key.data_length + info->last_key.ref_length-
+                       extra-1,
+                       (uchar*) ftbw->word+1,
+                       ftbw->len-1,
+                       (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0);
+  }
+
+  if (r) /* not found */
+  {
+    if (!ftbw->off || !(ftbw->flags & FTB_FLAG_TRUNC))
+    {
+      ftbw->docid[0]=HA_OFFSET_ERROR;
+      if ((ftbw->flags & FTB_FLAG_YES) && ftbw->up->up==0)
+      {
+        /*
+          This word MUST BE present in every document returned,
+          so we can stop the search right now
+        */
+        ftb->state=INDEX_DONE;
+        return 1; /* search is done */
+      }
+      else
+        return 0;
+    }
+
+    /* going up to the first-level tree to continue search there */
+    _ma_dpointer(info->s, (lastkey_buf+HA_FT_WLEN), ftbw->key_root);
+    ftbw->key_root=info->s->state.key_root[ftb->keynr];
+    ftbw->keyinfo=info->s->keyinfo+ftb->keynr;
+    ftbw->off=0;
+    return _ft2_search(ftb, ftbw, 0);
+  }
+
+  /* matching key found */
+  memcpy(lastkey_buf, info->last_key.data,
+         info->last_key.data_length + info->last_key.ref_length);
+  if (lastkey_buf == ftbw->word)
+    ftbw->len= info->last_key.data_length + info->last_key.ref_length - extra;
+
+  /* going down ? */
+  if (subkeys<0)
+  {
+    /*
+      yep, going down, to the second-level tree
+      TODO here: subkey-based optimization
+    */
+    ftbw->off=off;
+    ftbw->key_root= info->cur_row.lastpos;
+    ftbw->keyinfo=& info->s->ft2_keyinfo;
+    r= _ma_search_first(info, ftbw->keyinfo, ftbw->key_root);
+    DBUG_ASSERT(r==0);  /* found something */
+    memcpy(lastkey_buf+off, info->last_key.data,
+           info->last_key.data_length + info->last_key.ref_length);
+  }
+  ftbw->docid[0]= info->cur_row.lastpos;
+  /* A non-truncated +word raises the lower bound used by later calls */
+  if (ftbw->flags & FTB_FLAG_YES && !(ftbw->flags & FTB_FLAG_TRUNC))
+    ftbw->max_docid_expr->max_docid= info->cur_row.lastpos;
+  return 0;
+}
+
+/*
+  Prepare a parsed boolean search for reading rows through the index.
+
+  Does nothing when the handle is still UNINITIALIZED or when there is
+  no full-text key (NO_SUCH_KEY). Otherwise switches the handle to
+  INDEX_SEARCH and positions every queued word on its first index entry
+  with _ft2_search(). Truncated words that cannot add rows to the result
+  are excluded from the index search (see the comment in the loop).
+  The queue is re-ordered at the end, unless _ft2_search() reported that
+  the whole search is already finished.
+*/
+
+static void _ftb_init_index_search(FT_INFO *ftb)
+{
+  int i;
+  FTB_WORD   *ftbw;
+
+  if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
+    return;
+  ftb->state=INDEX_SEARCH;
+
+  for (i= queue_last_element(&ftb->queue);
+       (int) i >= (int) queue_first_element(&ftb->queue);
+       i--)
+  {
+    ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i));
+
+    if (ftbw->flags & FTB_FLAG_TRUNC)
+    {
+      /*
+        special treatment for truncation operator
+        1. there are some (besides this) +words
+           | no need to search in the index, it can never ADD new rows
+           | to the result, and to remove half-matched rows we do scan anyway
+        2. -trunc*
+           | same as 1.
+        3. in 1 and 2, +/- need not be on the same expr. level,
+           but can be on any upper level, as in +word +(trunc1* trunc2*)
+        4. otherwise
+           | We have to index-search for this prefix.
+           | It may cause duplicates, as in the index (sorted by <word,docid>)
+           |   <aaaa,row1>
+           |   <aabb,row2>
+           |   <aacc,row1>
+           | Searching for "aa*" will find row1 twice...
+      */
+      FTB_EXPR *ftbe;
+      /*
+        Walk up from the word, marking each parent with FTB_FLAG_TRUNC,
+        until the root or an already marked parent is reached.
+      */
+      for (ftbe=(FTB_EXPR*)ftbw;
+           ftbe->up && !(ftbe->up->flags & FTB_FLAG_TRUNC);
+           ftbe->up->flags|= FTB_FLAG_TRUNC, ftbe=ftbe->up)
+      {
+        if (ftbe->flags & FTB_FLAG_NO ||                     /* 2 */
+            ftbe->up->ythresh - ftbe->up->yweaks >
+            (uint) test(ftbe->flags & FTB_FLAG_YES))         /* 1 */
+        {
+          /*
+            Cases 1 and 2: take the word out of the index search and
+            count it as "weak" on every level up to top_ftbe.
+          */
+          FTB_EXPR *top_ftbe=ftbe->up;
+          ftbw->docid[0]=HA_OFFSET_ERROR;
+          for (ftbe=(FTB_EXPR *)ftbw;
+               ftbe != top_ftbe && !(ftbe->flags & FTB_FLAG_NO);
+               ftbe=ftbe->up)
+              ftbe->up->yweaks++;
+          ftbe=0;
+          break;
+        }
+      }
+      if (!ftbe)
+        continue;
+      /* 4 */
+      if (!is_tree_inited(& ftb->no_dupes))
+        init_tree(& ftb->no_dupes,0,0,sizeof(my_off_t),
+            _ftb_no_dupes_cmp,0,0,0);
+      else
+        reset_tree(& ftb->no_dupes);
+    }
+
+    ftbw->off=0; /* in case of reinit */
+    if (_ft2_search(ftb, ftbw, 1))
+      return;
+  }
+  queue_fix(& ftb->queue);
+}
+
+
+/*
+  Create a boolean-mode full-text search handle for a query.
+
+  SYNOPSIS
+    maria_ft_init_boolean_search()
+    info       table handle the search runs against
+    keynr      number of the full-text key, or NO_SUCH_KEY; with
+               NO_SUCH_KEY the query is parsed with ft_default_parser
+    query      query text
+    query_len  length of the query text in bytes
+    cs         character set stored in the handle and used for parsing
+
+  DESCRIPTION
+    Allocates the handle, creates the root expression, parses the query
+    into the expression tree and then builds two views of the query
+    words, both allocated from the handle's mem_root: a priority queue
+    and an array (ftb->list) sorted with FTB_WORD_cmp_list.
+
+  RETURN VALUE
+    The new handle in state READY, or 0 if memory allocation or query
+    parsing failed. Everything allocated so far is released on failure.
+*/
+
+FT_INFO * maria_ft_init_boolean_search(MARIA_HA *info, uint keynr,
+                                       uchar *query, size_t query_len,
+                                       CHARSET_INFO *cs)
+{
+  FTB       *ftb;
+  FTB_EXPR  *ftbe;
+  FTB_WORD  *ftbw;
+
+  if (!(ftb=(FTB *)my_malloc(sizeof(FTB), MYF(MY_WME))))
+    return 0;
+  ftb->please= (struct _ft_vft *) & _ma_ft_vft_boolean;
+  ftb->state=UNINITIALIZED;
+  ftb->info=info;
+  ftb->keynr=keynr;
+  ftb->charset=cs;
+  DBUG_ASSERT(keynr==NO_SUCH_KEY || cs == info->s->keyinfo[keynr].seg->charset);
+  ftb->with_scan=0;
+  ftb->lastpos=HA_OFFSET_ERROR;
+  bzero(& ftb->no_dupes, sizeof(TREE));
+  ftb->last_word= 0;
+
+  init_alloc_root(&ftb->mem_root, 1024, 1024);
+  /* Must be set before any 'goto err': the parser counts words in it */
+  ftb->queue.max_elements= 0;
+  if (!(ftbe=(FTB_EXPR *)alloc_root(&ftb->mem_root, sizeof(FTB_EXPR))))
+    goto err;
+  ftbe->weight=1;
+  ftbe->flags=FTB_FLAG_YES;
+  ftbe->nos=1;
+  ftbe->up=0;
+  ftbe->max_docid= ftbe->ythresh= ftbe->yweaks= 0;
+  ftbe->docid[0]=ftbe->docid[1]=HA_OFFSET_ERROR;
+  ftbe->phrase= NULL;
+  ftbe->document= 0;
+  ftb->root=ftbe;
+  if (unlikely(_ftb_parse_query(ftb, query, query_len,
+                                keynr == NO_SUCH_KEY ? &ft_default_parser :
+                                info->s->keyinfo[keynr].parser)))
+    goto err;
+  /*
+    Hack: instead of init_queue, we'll use reinit queue to be able
+    to alloc queue with alloc_root()
+  */
+  if (! (ftb->queue.root= (uchar **)alloc_root(&ftb->mem_root,
+                                              (ftb->queue.max_elements + 1) *
+                                              sizeof(void *))))
+    goto err;
+  reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
+               (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0);
+  for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
+    queue_insert(&ftb->queue, (uchar *)ftbw);
+  /*
+    The sorted word array is written to right away, so a failed
+    allocation must be caught here instead of crashing in memcpy().
+  */
+  if (!(ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
+                                          sizeof(FTB_WORD *)*
+                                          ftb->queue.elements)))
+    goto err;
+  /* Queue slot 0 is not an element, hence root+1 */
+  memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
+  my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
+            (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset);
+  if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
+  ftb->state=READY;
+  return ftb;
+err:
+  free_root(& ftb->mem_root, MYF(0));
+  my_free(ftb, MYF(0));
+  return 0;
+}
+
+
+/*
+  State shared between _ftb_check_phrase() and the ftb_phrase_add_word()
+  callback while one document buffer is checked against a phrase.
+*/
+typedef struct st_my_ftb_phrase_param
+{
+  LIST *phrase;          /* words of the phrase being looked for */
+  LIST *document;        /* current slot in the list of recent document words */
+  CHARSET_INFO *cs;      /* charset used to compare words */
+  uint phrase_length;    /* number of words in the phrase */
+  uint document_length;  /* document words collected so far; starts at 1 */
+  uint match;            /* becomes non-zero once the phrase was found */
+} MY_FTB_PHRASE_PARAM;
+
+
+/*
+  Word callback used while checking a document for a phrase.
+
+  Stores the new document word in the current slot of the document
+  list and steps the current slot to the previous element. Until as many
+  words as the phrase contains have been collected, nothing else is done.
+  After that the phrase words are compared one by one with the document
+  words, starting from the element after the (new) current slot;
+  phrase_param->match is incremented when all of them are equal.
+
+  NOTE(review): relies on the document list being circular; it is
+  closed into a ring when the right parenthesis of the expression is
+  processed by the query parser - confirm it has exactly as many
+  elements as the phrase has words.
+
+  RETURN VALUE
+    Always 0.
+*/
+
+static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
+                               const uchar *word, mysql_ft_size_t word_len,
+                               MYSQL_FTPARSER_BOOLEAN_INFO
+                               *boolean_info __attribute__((unused)))
+{
+  MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
+  FT_WORD *w= (FT_WORD *)phrase_param->document->data;
+  LIST *phrase, *document;
+  /* Remember the word (by reference, the text is not copied) */
+  w->pos= (uchar*) word;
+  w->len= word_len;
+  phrase_param->document= phrase_param->document->prev;
+  if (phrase_param->phrase_length > phrase_param->document_length)
+  {
+    /* Not enough words seen yet for the phrase to fit */
+    phrase_param->document_length++;
+    return 0;
+  }
+  /* TODO: rewrite phrase search to avoid
+     comparing the same word twice. */
+  for (phrase= phrase_param->phrase, document= phrase_param->document->next;
+       phrase; phrase= phrase->next, document= document->next)
+  {
+    FT_WORD *phrase_word= (FT_WORD *)phrase->data;
+    FT_WORD *document_word= (FT_WORD *)document->data;
+    if (my_strnncoll(phrase_param->cs, (uchar*) phrase_word->pos,
+                     phrase_word->len,
+                     (uchar*) document_word->pos, document_word->len))
+      return 0;
+  }
+  phrase_param->match++;
+  return 0;
+}
+
+
+/*
+  Built-in tokenizer used for phrase checking.
+
+  Feeds the words of [document, document + len), as returned by
+  maria_ft_simple_get_word(), to param->mysql_add_word() and stops as soon
+  as the phrase state reports a match or the buffer is exhausted.
+
+  RETURN VALUE
+    Always 0.
+*/
+
+static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
+                                     const uchar *document,
+                                     mysql_ft_size_t len)
+{
+  MY_FTB_PHRASE_PARAM *phrase_state= param->mysql_ftparam;
+  const uchar *document_end= document + len;
+  FT_WORD token;
+
+  for (;;)
+  {
+    if (!maria_ft_simple_get_word(phrase_state->cs, &document,
+                                  document_end, &token, FALSE))
+      break;                                    /* no more words */
+    param->mysql_add_word(param, token.pos, token.len, 0);
+    if (phrase_state->match)
+      break;                                    /* phrase found */
+  }
+  return 0;
+}
+
+
+/*
+  Checks if given buffer matches phrase list.
+
+  SYNOPSIS
+    _ftb_check_phrase()
+    ftb       search handle; supplies table, key number and charset
+    document  start of buffer
+    len       length of buffer in bytes
+    ftbe      expression holding the phrase broken into a list
+              (ftbe->phrase) and the list used to keep the most recent
+              document words (ftbe->document)
+    parser    full-text parser used to split the buffer into words
+
+  RETURN VALUE
+    1 is returned if phrase found, 0 else (also when the parser
+    parameter block cannot be obtained).
+    -1 is returned if the parser reports an error.
+*/
+
+static int _ftb_check_phrase(FTB *ftb, const uchar *document, size_t len,
+                             FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
+{
+  MY_FTB_PHRASE_PARAM ftb_param;
+  MYSQL_FTPARSER_PARAM *param;
+  DBUG_ENTER("_ftb_check_phrase");
+  DBUG_ASSERT(parser);
+
+  if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 1)))
+    DBUG_RETURN(0);
+  ftb_param.phrase= ftbe->phrase;
+  ftb_param.document= ftbe->document;
+  ftb_param.cs= ftb->charset;
+  ftb_param.phrase_length= list_length(ftbe->phrase);
+  ftb_param.document_length= 1;
+  ftb_param.match= 0;
+
+  param->mysql_parse= ftb_check_phrase_internal;
+  param->mysql_add_word= ftb_phrase_add_word;
+  param->mysql_ftparam= (void *)&ftb_param;
+  param->cs= ftb->charset;
+  param->doc= document;
+  param->length= len;
+  param->flags= 0;
+  /* Stopwords are needed here: they count as words inside a phrase */
+  param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
+  /* Every exit after DBUG_ENTER must go through DBUG_RETURN */
+  if (unlikely(parser->parse(param)))
+    DBUG_RETURN(-1);
+  DBUG_RETURN(ftb_param.match ? 1 : 0);
+}
+
+
+/*
+  Propagate a matched query word up the expression tree.
+
+  SYNOPSIS
+    _ftb_climb_the_tree()
+    ftb        search handle
+    ftbw       query word that matched the current document
+    ftsi_orig  0 when called from the index search (docid[0] and the
+               ythresh reduced by yweaks are used); otherwise a segment
+               iterator over the current record (docid[1] is used and
+               phrases are verified against the record text)
+
+  DESCRIPTION
+    Walks from the word's parent towards the root, updating cur_weight,
+    yesses and nos of every expression reached. The walk stops at the
+    first expression that is not (yet) satisfied or is already excluded.
+
+  RETURN VALUE
+    0  ok
+    1  _ftb_check_phrase() reported an error
+*/
+
+static int _ftb_climb_the_tree(FTB *ftb, FTB_WORD *ftbw, FT_SEG_ITERATOR *ftsi_orig)
+{
+  FT_SEG_ITERATOR ftsi;
+  FTB_EXPR *ftbe;
+  float weight=ftbw->weight;
+  int  yn_flag= ftbw->flags, ythresh, mode=(ftsi_orig != 0);
+  my_off_t curdoc=ftbw->docid[mode];
+  struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+                                    &ft_default_parser :
+                                    ftb->info->s->keyinfo[ftb->keynr].parser;
+
+  for (ftbe=ftbw->up; ftbe; ftbe=ftbe->up)
+  {
+    ythresh = ftbe->ythresh - (mode ? 0 : ftbe->yweaks);
+    if (ftbe->docid[mode] != curdoc)
+    {
+      /* First visit of this expression for curdoc: reset its counters */
+      ftbe->cur_weight=0;
+      ftbe->yesses=ftbe->nos=0;
+      ftbe->docid[mode]=curdoc;
+    }
+    if (ftbe->nos)
+      break;
+    if (yn_flag & FTB_FLAG_YES)
+    {
+      weight /= ftbe->ythresh;
+      ftbe->cur_weight += weight;
+      if ((int) ++ftbe->yesses == ythresh)
+      {
+        /* All required children matched: the expression itself matches */
+        yn_flag=ftbe->flags;
+        weight=ftbe->cur_weight*ftbe->weight;
+        if (mode && ftbe->phrase)
+        {
+          int found= 0;
+
+          /* Work on a copy so that the caller's iterator is not advanced */
+          memcpy(&ftsi, ftsi_orig, sizeof(ftsi));
+          while (_ma_ft_segiterator(&ftsi) && !found)
+          {
+            if (!ftsi.pos)
+              continue;
+            found= _ftb_check_phrase(ftb, ftsi.pos, ftsi.len, ftbe, parser);
+            if (unlikely(found < 0))
+              return 1;
+          }
+          if (!found)
+            break;
+        } /* ftbe->phrase */
+      }
+      else
+        break;
+    }
+    else
+    if (yn_flag & FTB_FLAG_NO)
+    {
+      /*
+        NOTE: special sort function of queue assures that all
+        (yn_flag & FTB_FLAG_NO) != 0
+        events for every particular subexpression will
+        "auto-magically" happen BEFORE all the
+        (yn_flag & FTB_FLAG_YES) != 0 events. So no
+        already matched expression can become not-matched again.
+      */
+      ++ftbe->nos;
+      break;
+    }
+    else
+    {
+      /* Neither + nor -: the word only contributes weight */
+      if (ftbe->ythresh)
+        weight/=3;
+      ftbe->cur_weight +=  weight;
+      if ((int) ftbe->yesses < ythresh)
+        break;
+      if (!(yn_flag & FTB_FLAG_WONLY))
+        yn_flag= ((int) ftbe->yesses++ == ythresh) ? ftbe->flags : FTB_FLAG_WONLY ;
+      weight*= ftbe->weight;
+    }
+  }
+  return 0;
+}
+
+
+/*
+  Read the next row matching a boolean search through the index.
+
+  SYNOPSIS
+    maria_ft_boolean_read_next()
+    ftb     search handle; must be in state INDEX_SEARCH or INDEX_DONE
+    record  buffer the matching row is read into
+
+  DESCRIPTION
+    Repeatedly takes the smallest current document position from the
+    word queue, feeds every word positioned on it to
+    _ftb_climb_the_tree() and advances those words with _ft2_search().
+    When the root expression matches, the row is read; if the query
+    requires a scan (ftb->with_scan), the row is additionally verified
+    with maria_ft_boolean_find_relevance().
+
+  RETURN VALUE
+    0                    a matching row was read into 'record'
+    -1                   handle is in the wrong state
+    HA_ERR_END_OF_FILE   no (more) matching rows
+    other                value of my_errno after the failing call
+*/
+
+int maria_ft_boolean_read_next(FT_INFO *ftb, char *record)
+{
+  FTB_EXPR  *ftbe;
+  FTB_WORD  *ftbw;
+  MARIA_HA   *info=ftb->info;
+  my_off_t   curdoc;
+
+  if (ftb->state != INDEX_SEARCH && ftb->state != INDEX_DONE)
+    return -1;
+
+  /* black magic ON */
+  if ((int) _ma_check_index(info, ftb->keynr) < 0)
+    return my_errno;
+  if (_ma_readinfo(info, F_RDLCK, 1))
+    return my_errno;
+  /* black magic OFF */
+
+  if (!ftb->queue.elements)
+    return my_errno=HA_ERR_END_OF_FILE;
+
+  /* Attention!!! Address of a local variable is used here! See err: label */
+  ftb->queue.first_cmp_arg=(void *)&curdoc;
+
+  while (ftb->state == INDEX_SEARCH &&
+         (curdoc=((FTB_WORD *)queue_top(& ftb->queue))->docid[0]) !=
+         HA_OFFSET_ERROR)
+  {
+    /* Process all words that are positioned on curdoc */
+    while (curdoc == (ftbw=(FTB_WORD *)queue_top(& ftb->queue))->docid[0])
+    {
+      if (unlikely(_ftb_climb_the_tree(ftb, ftbw, 0)))
+      {
+        my_errno= HA_ERR_OUT_OF_MEM;
+        goto err;
+      }
+
+      /* update queue */
+      _ft2_search(ftb, ftbw, 0);
+      queue_replace_top(&ftb->queue);
+    }
+
+    ftbe=ftb->root;
+    if (ftbe->docid[0]==curdoc && ftbe->cur_weight>0 &&
+        ftbe->yesses>=(ftbe->ythresh-ftbe->yweaks) && !ftbe->nos)
+    {
+      /* curdoc matched ! */
+      if (is_tree_inited(&ftb->no_dupes) &&
+          tree_insert(&ftb->no_dupes, &curdoc, 0,
+                      ftb->no_dupes.custom_arg)->count >1)
+        /* but it managed already to get past this line once */
+        continue;
+
+      info->cur_row.lastpos= curdoc;
+      /* Clear all states, except that the table was updated */
+      info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+      if (!(*info->read_record)(info, (uchar *) record, curdoc))
+      {
+        info->update|= HA_STATE_AKTIV;          /* Record is read */
+        if (ftb->with_scan &&
+            maria_ft_boolean_find_relevance(ftb, (uchar *) record, 0)==0)
+            continue; /* no match */
+        my_errno=0;
+        goto err;
+      }
+      /* read_record failed: return the my_errno it left behind */
+      goto err;
+    }
+  }
+  ftb->state=INDEX_DONE;
+  my_errno=HA_ERR_END_OF_FILE;
+err:
+  /* curdoc goes out of scope: do not leave a pointer to it in the queue */
+  ftb->queue.first_cmp_arg=(void *)0;
+  return my_errno;
+}
+
+
+/*
+  State passed from maria_ft_boolean_find_relevance() to the
+  ftb_find_relevance_parse() / ftb_find_relevance_add_word() callbacks.
+*/
+typedef struct st_my_ftb_find_param
+{
+  FT_INFO *ftb;            /* search handle the record is evaluated for */
+  FT_SEG_ITERATOR *ftsi;   /* segment iterator handed to _ftb_climb_the_tree() */
+} MY_FTB_FIND_PARAM;
+
+
+/*
+  Word callback used while computing the relevance of one record.
+
+  Looks the document word up in the sorted array of query words
+  (ftb->list) and, for every query word it matches that has not been
+  counted for the current record yet, marks the query word as seen
+  (docid[1]) and propagates the match with _ftb_climb_the_tree().
+
+  RETURN VALUE
+    0  ok
+    1  _ftb_climb_the_tree() failed
+*/
+
+static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
+                                       const uchar *word, mysql_ft_size_t len,
+                                       MYSQL_FTPARSER_BOOLEAN_INFO
+                                       *boolean_info __attribute__((unused)))
+{
+  MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
+  FT_INFO *ftb= ftb_param->ftb;
+  FTB_WORD *ftbw;
+  int a, b, c;
+  /*
+    Find right-most element in the array of query words matching this
+    word from a document.
+  */
+  for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
+  {
+    ftbw= ftb->list[c];
+    if (ha_compare_text(ftb->charset, (uchar*)word, len,
+                        (uchar*)ftbw->word+1, ftbw->len-1,
+                        (my_bool)(ftbw->flags&FTB_FLAG_TRUNC), 0) < 0)
+      b= c;
+    else
+      a= c;
+  }
+  /*
+    If there were no words with truncation operator, we iterate to the
+    beginning of an array until array element is equal to the word from
+    a document. This is done mainly because the same word may be
+    mentioned twice (or more) in the query.
+
+    In case query has words with truncation operator we must iterate
+    to the beginning of the array. There may be non-matching query words
+    between matching word with truncation operator and the right-most
+    matching element. E.g., if we're looking for 'aaa15' in an array of
+    'aaa1* aaa14 aaa15 aaa16'.
+
+    Worse of that there still may be match even if the binary search
+    above didn't find matching element. E.g., if we're looking for
+    'aaa15' in an array of 'aaa1* aaa14 aaa16'. The binary search will
+    stop at 'aaa16'.
+  */
+  for (; c >= 0; c--)
+  {
+    ftbw= ftb->list[c];
+    if (ha_compare_text(ftb->charset, (uchar*)word, len,
+                        (uchar*)ftbw->word + 1,ftbw->len - 1,
+                        (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
+    {
+      /* No match: keep going only if the query has truncated words */
+      if (ftb->with_scan & FTB_FLAG_TRUNC)
+        continue;
+      else
+        break;
+    }
+    /* This query word was already counted for the current record */
+    if (ftbw->docid[1] == ftb->info->cur_row.lastpos)
+      continue;
+    ftbw->docid[1]= ftb->info->cur_row.lastpos;
+    if (unlikely(_ftb_climb_the_tree(ftb, ftbw, ftb_param->ftsi)))
+      return 1;
+  }
+  return(0);
+}
+
+
+/*
+  Built-in tokenizer used while computing the relevance of a record.
+
+  Splits [doc, doc + len) into words with maria_ft_simple_get_word() and
+  passes each of them to param->mysql_add_word(); the callback's return
+  value is not examined.
+
+  RETURN VALUE
+    Always 0.
+*/
+
+static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
+                                    const uchar *doc, mysql_ft_size_t len)
+{
+  MY_FTB_FIND_PARAM *find_state= param->mysql_ftparam;
+  CHARSET_INFO *charset= find_state->ftb->charset;
+  const uchar *doc_end= doc + len;
+  FT_WORD token;
+
+  for (;;)
+  {
+    if (!maria_ft_simple_get_word(charset, &doc, doc_end, &token, TRUE))
+      break;
+    param->mysql_add_word(param, token.pos, token.len, 0);
+  }
+  return 0;
+}
+
+
+/*
+  Compute the relevance of a record for a boolean search by scanning
+  the record text.
+
+  SYNOPSIS
+    maria_ft_boolean_find_relevance()
+    ftb     search handle
+    record  the row to evaluate; its position is taken from
+            ftb->info->cur_row.lastpos
+    length  length of record; only used when there is no full-text key
+            (keynr == NO_SUCH_KEY)
+
+  RETURN VALUE
+    -2.0  the current row position is HA_OFFSET_ERROR
+    0     the row does not match, the query has no words, or the parser
+          could not be initialized or reported an error
+    > 0   weight of the root expression for a matching row
+*/
+
+float maria_ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
+{
+  FTB_EXPR *ftbe;
+  FT_SEG_ITERATOR ftsi, ftsi2;
+  MARIA_RECORD_POS docid= ftb->info->cur_row.lastpos;
+  MY_FTB_FIND_PARAM ftb_param;
+  MYSQL_FTPARSER_PARAM *param;
+  struct st_mysql_ftparser *parser= ftb->keynr == NO_SUCH_KEY ?
+                                    &ft_default_parser :
+                                    ftb->info->s->keyinfo[ftb->keynr].parser;
+
+  if (docid == HA_OFFSET_ERROR)
+    return -2.0;
+  if (!ftb->queue.elements)
+    return 0;
+  if (! (param= maria_ftparser_call_initializer(ftb->info, ftb->keynr, 0)))
+    return 0;
+
+  if (ftb->state != INDEX_SEARCH && docid <= ftb->lastpos)
+  {
+    /*
+      Not in an index search and the position did not advance since the
+      last call: forget the per-record marks left on words and
+      expressions.
+    */
+    FTB_EXPR *x;
+    uint i;
+
+    for (i=0; i < ftb->queue.elements; i++)
+    {
+      ftb->list[i]->docid[1]=HA_OFFSET_ERROR;
+      for (x=ftb->list[i]->up; x; x=x->up)
+        x->docid[1]=HA_OFFSET_ERROR;
+    }
+  }
+
+  ftb->lastpos=docid;
+
+  if (ftb->keynr==NO_SUCH_KEY)
+    _ma_ft_segiterator_dummy_init(record, length, &ftsi);
+  else
+    _ma_ft_segiterator_init(ftb->info, ftb->keynr, record, &ftsi);
+  /* Pristine copy of the iterator, handed to the word callback */
+  memcpy(&ftsi2, &ftsi, sizeof(ftsi));
+
+  ftb_param.ftb= ftb;
+  ftb_param.ftsi= &ftsi2;
+  param->mysql_parse= ftb_find_relevance_parse;
+  param->mysql_add_word= ftb_find_relevance_add_word;
+  param->mysql_ftparam= (void *)&ftb_param;
+  param->flags= 0;
+  param->cs= ftb->charset;
+  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+
+  /* Parse every non-NULL segment of the record */
+  while (_ma_ft_segiterator(&ftsi))
+  {
+    if (!ftsi.pos)
+      continue;
+    param->doc= ftsi.pos;
+    param->length= ftsi.len;
+    if (unlikely(parser->parse(param)))
+      return 0;
+  }
+  ftbe=ftb->root;
+  if (ftbe->docid[1]==docid && ftbe->cur_weight>0 &&
+      ftbe->yesses>=ftbe->ythresh && !ftbe->nos)
+  { /* row matched ! */
+    return ftbe->cur_weight;
+  }
+  else
+  { /* match failed ! */
+    return 0.0;
+  }
+}
+
+
+/*
+  Release a boolean search handle: the duplicate-elimination tree (if it
+  was ever initialized), everything allocated from the handle's mem_root,
+  and the handle itself. 'ftb' must not be used afterwards.
+*/
+
+void maria_ft_boolean_close_search(FT_INFO *ftb)
+{
+  TREE *dupes= &ftb->no_dupes;
+
+  if (is_tree_inited(dupes))
+    delete_tree(dupes);
+
+  free_root(&ftb->mem_root, MYF(0));
+  my_free(ftb, MYF(0));
+}
+
+
+/*
+  Return the weight currently stored in the root expression of the
+  search (ftb->root->cur_weight). Nothing is recomputed here.
+*/
+
+float maria_ft_boolean_get_relevance(FT_INFO *ftb)
+{
+  return ftb->root->cur_weight;
+}
+
+
+/*
+  (Re)start the index search of a boolean search handle; simply
+  delegates to _ftb_init_index_search().
+*/
+
+void maria_ft_boolean_reinit_search(FT_INFO *ftb)
+{
+  _ftb_init_index_search(ftb);
+}
diff --git a/storage/maria/ma_ft_eval.c b/storage/maria/ma_ft_eval.c
new file mode 100644
index 00000000000..5fc67c6c664
--- /dev/null
+++ b/storage/maria/ma_ft_eval.c
@@ -0,0 +1,254 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+   added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_eval.h"
+#include <stdarg.h>
+#include <my_getopt.h>
+
+static void print_error(int exit_code, const char *fmt,...);
+static void get_options(int argc, char *argv[]);
+static int create_record(char *pos, FILE *file);
+static void usage();
+
+/*
+  Command line options. Only short options are defined (all long names
+  and help texts are empty); they are handled in get_one_option():
+    -s <file>  read the stopword list from <file>
+    -q         be silent
+    -S         do not use the precompiled stopword list
+    -# <arg>   passed to DBUG_PUSH
+    -V -? -h   print usage and exit
+*/
+static struct my_option my_long_options[] =
+{
+  {"", 's', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'q', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '#', "", 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+/*
+  Full-text evaluation driver.
+
+  Creates a table with a document-id column and a blob column covered by
+  a full-text key, loads it with the records read from the document file
+  (df), then runs every record of the query file (qf) as a natural
+  language query and prints "<query no> <docid> <relevance>" for each
+  row found.
+
+  Returns 0 on success and 1 after printing my_errno on failure.
+*/
+int main(int argc, char *argv[])
+{
+  MARIA_HA *file;
+  int i,j;
+
+  MY_INIT(argv[0]);
+  get_options(argc,argv);
+  bzero((char*)recinfo,sizeof(recinfo));
+
+  maria_init();
+  /* First define 2 columns */
+  recinfo[0].type=FIELD_SKIP_ENDSPACE;
+  recinfo[0].length=docid_length;
+  recinfo[1].type=FIELD_BLOB;
+  recinfo[1].length= 4+portable_sizeof_char_ptr;
+
+  /* Define a full-text key over the second (blob) column */
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  keyinfo[0].seg[0].type= HA_KEYTYPE_TEXT;
+  keyinfo[0].seg[0].flag= HA_BLOB_PART;
+  keyinfo[0].seg[0].start=recinfo[0].length;
+  keyinfo[0].seg[0].length=key_length;
+  keyinfo[0].seg[0].null_bit=0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].bit_start=4;
+  keyinfo[0].seg[0].language=MY_CHARSET_CURRENT;
+  keyinfo[0].flag = HA_FULLTEXT;
+
+  if (!silent)
+    printf("- Creating isam-file\n");
+  if (maria_create(filename,1,keyinfo,2,recinfo,0,NULL,(MARIA_CREATE_INFO*) 0,0))
+    goto err;
+  if (!(file=maria_open(filename,2,0)))
+    goto err;
+  if (!silent)
+    printf("Initializing stopwords\n");
+  maria_ft_init_stopwords(stopwordlist);
+
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  /* Load all documents; a failed write is reported but not fatal */
+  my_errno=0;
+  i=0;
+  while (create_record(record,df))
+  {
+    error=maria_write(file,record);
+    if (error)
+      printf("I= %2d  maria_write: %d  errno: %d\n",i,error,my_errno);
+    i++;
+  }
+  fclose(df);
+
+  if (maria_close(file)) goto err;
+  if (!silent)
+    printf("- Reopening file\n");
+  if (!(file=maria_open(filename,2,0))) goto err;
+  if (!silent)
+    printf("- Reading rows with key\n");
+  /* Each record of the query file is one query; its text is in blob_record */
+  for (i=1;create_record(record,qf);i++)
+  {
+    FT_DOCLIST *result;
+    double w;
+    int t, err;
+
+    result=maria_ft_nlq_init_search(file,0,blob_record,(uint) strlen(blob_record),1);
+    if (!result)
+    {
+      printf("Query %d failed with errno %3d\n",i,my_errno);
+      goto err;
+    }
+    if (!silent)
+      printf("Query %d. Found: %d.\n",i,result->ndocs);
+    for (j=0;(err=maria_ft_nlq_read_next(result, read_record))==0;j++)
+    {
+      /* The row starts with the 2-byte length of the document id */
+      t=uint2korr(read_record);
+      w=maria_ft_nlq_get_relevance(result);
+      printf("%d %.*s %f\n",i,t,read_record+2,w);
+    }
+    if (err != HA_ERR_END_OF_FILE)
+    {
+      printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+      goto err;
+    }
+    maria_ft_nlq_close_search(result);
+  }
+
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return (0);
+
+ err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  return 1;			/* skip warning */
+
+}
+
+
+/*
+  Per-option callback for handle_options().
+
+    's'            load the stopword list from the file 'argument', one
+                   word per line, unless a custom list is already loaded
+    'q'            set silent
+    'S'            drop the precompiled stopword list
+    '#'            pass 'argument' to DBUG_PUSH
+    'V', '?', 'h'  print usage and exit(1)
+
+  Every failure while loading the stopword file is fatal (print_error()
+  exits the program).
+
+  RETURN VALUE
+    Always 0.
+*/
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument)
+{
+  switch (optid) {
+  case 's':
+    if (stopwordlist && stopwordlist != maria_ft_precompiled_stopwords)
+      break;
+    {
+      FILE *f; char s[HA_FT_MAXLEN]; int i=0,n=SWL_INIT;
+
+      /* sizeof yields size_t: cast to match the format specifier */
+      if (!(stopwordlist=(const char**) malloc(n*sizeof(char *))))
+	print_error(1,"malloc(%lu)",(unsigned long) (n*sizeof(char *)));
+      if (!(f=fopen(argument,"r")))
+	print_error(1,"fopen(%s)",argument);
+      /*
+        Loop on fgets() itself: feof() only becomes true after a read
+        has hit the end of file, so testing it first made the regular
+        end of a newline-terminated file look like a read error.
+      */
+      while (fgets(s,HA_FT_MAXLEN,f))
+      {
+	size_t len= strlen(s);
+
+	/* A stopword stored with its line terminator can never match */
+	while (len && (s[len-1] == '\n' || s[len-1] == '\r'))
+	  s[--len]= 0;
+	if (!len)
+	  continue;				/* skip empty lines */
+	if (!(stopwordlist[i++]=strdup(s)))
+	  print_error(1,"strdup(%s)",s);
+	if (i >= n)
+	{
+	  /* Grow early so that there is always room for the final NULL */
+	  n+=SWL_PLUS;
+	  if (!(stopwordlist=(const char**) realloc((char*) stopwordlist,
+						    n*sizeof(char *))))
+	    print_error(1,"realloc(%lu)",(unsigned long) (n*sizeof(char *)));
+	}
+      }
+      if (ferror(f))
+	print_error(1,"fgets(s,%d,%s)",HA_FT_MAXLEN,argument);
+      fclose(f);
+      stopwordlist[i]=NULL;
+      break;
+    }
+  case 'q': silent=1; break;
+  case 'S': if (stopwordlist==maria_ft_precompiled_stopwords) stopwordlist=NULL; break;
+  case '#':
+    DBUG_PUSH (argument);
+    break;
+  case 'V':
+  case '?':
+  case 'h':
+    usage();
+    exit(1);
+  }
+  return 0;
+}
+
+
+/*
+  Process the command line and open the two input files.
+
+  Options are handled by handle_options() / get_one_option(); a non-zero
+  result from handle_options() is used as the exit code. The document
+  file name is taken from argv[optind] and the query file name from
+  argv[optind+1]; both files are opened for reading into df and qf.
+  A missing name or a file that cannot be opened is fatal.
+
+  NOTE(review): handle_options() receives &argc/&argv and may rearrange
+  them, while optind is not set by it as far as this file shows - verify
+  that argv[optind] really is the first non-option argument.
+*/
+
+static void get_options(int argc, char *argv[])
+{
+  int ho_error= handle_options(&argc, &argv, my_long_options, get_one_option);
+
+  if (ho_error)
+    exit(ho_error);
+
+  /* Document file */
+  d_file= argv[optind];
+  if (!d_file)
+    print_error(1,"No d_file");
+  df= fopen(d_file,"r");
+  if (!df)
+    print_error(1,"fopen(%s)",d_file);
+
+  /* Query file */
+  q_file= argv[optind+1];
+  if (!q_file)
+    print_error(1,"No q_file");
+  qf= fopen(q_file,"r");
+  if (!qf)
+    print_error(1,"fopen(%s)",q_file);
+} /* get options */
+
+
+/*
+  Build one table row from the next two lines of 'file'.
+
+  The first line becomes the document id column: its text is stored at
+  pos+2 and its length minus one (the line terminator) in the two bytes
+  before it. The second line is read into the global blob_record; the
+  blob column, located recinfo[0].length bytes into the row, receives
+  the 4-byte length of that line followed by a pointer to blob_record.
+
+  RETURN VALUE
+    1  a row was built
+    0  end of file reached before the first line could be read
+  Any other read failure terminates the program through print_error().
+*/
+
+static int create_record(char *pos, FILE *file)
+{
+  char *docid_text= pos + 2;
+  char *blob_col;
+  char *blob_ptr= blob_record;
+  uint docid_len, blob_len;
+
+  bzero((char *)pos,MAX_REC_LENGTH);
+
+  /* column 1 - VARCHAR */
+  if (!fgets(docid_text,MAX_REC_LENGTH-32,file))
+  {
+    if (feof(file))
+      return 0;
+    print_error(1,"fgets(docid) - 1");
+  }
+  docid_len= (uint) strlen(docid_text)-1;
+  int2store(pos,docid_len);
+
+  /* column 2 - BLOB */
+  blob_col= pos + recinfo[0].length;
+  if (!fgets(blob_record,MAX_BLOB_LENGTH,file))
+    print_error(1,"fgets(docid) - 2");
+  blob_len= (uint) strlen(blob_record);
+  int4store(blob_col,blob_len);
+  memcpy_fixed(blob_col+4,&blob_ptr,sizeof(char*));
+  return 1;
+}
+
+/*
+  Print "<program name>: error: <formatted message>" followed by a
+  newline to stderr, flush it and terminate the program with exit_code.
+  'fmt' and the remaining arguments are interpreted as by printf().
+  Never returns.
+*/
+/* VARARGS */
+
+static void print_error(int exit_code, const char *fmt,...)
+{
+  va_list ap;
+
+  fprintf(stderr,"%s: error: ",my_progname);
+
+  va_start(ap,fmt);
+  VOID(vfprintf(stderr, fmt, ap));
+  va_end(ap);
+
+  VOID(fputc('\n',stderr));
+  fflush(stderr);
+  exit(exit_code);
+}
+
+
+/*
+  Print the program name followed by the help text and the current
+  variable values that my_getopt generates for my_long_options.
+*/
+static void usage()
+{
+  printf("%s [options]\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_eval.h b/storage/maria/ma_ft_eval.h
new file mode 100644
index 00000000000..481943dfb0b
--- /dev/null
+++ b/storage/maria/ma_ft_eval.h
@@ -0,0 +1,41 @@
+/* Copyright (C) 2006 MySQL AB & Sergei A. Golubchik
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+  NOTE: this header DEFINES (not merely declares) the globals below, so
+  it can be included from exactly one translation unit.
+*/
+
+/* Stopword list in use; initially the precompiled one */
+const char **stopwordlist=maria_ft_precompiled_stopwords;
+
+#define MAX_REC_LENGTH 128
+#define MAX_BLOB_LENGTH 60000
+/* Row buffers: 'record' for rows being written, 'read_record' for rows read */
+char record[MAX_REC_LENGTH], read_record[MAX_REC_LENGTH+MAX_BLOB_LENGTH];
+/* Text of the blob column of the row being built */
+char blob_record[MAX_BLOB_LENGTH+20*20];
+
+/* Name of the table that is created and queried */
+char *filename= (char*) "EVAL";
+
+/* silent: set by the 'q' option; error: result of the last write */
+int silent=0, error=0;
+
+/* Length of the full-text key segment and of the document id column */
+uint key_length=MAX_BLOB_LENGTH,docid_length=32;
+/* Names and streams of the document file and of the query file */
+char *d_file, *q_file;
+FILE *df,*qf;
+
+/* Table definition filled in at start-up: columns, keys, key segments */
+MARIA_COLUMNDEF recinfo[3];
+MARIA_KEYDEF keyinfo[2];
+HA_KEYSEG keyseg[10];
+
+/* Initial size of the stopword array and the step it grows by */
+#define SWL_INIT 500
+#define SWL_PLUS 50
+
+#define MAX_LINE_LENGTH 128
+char line[MAX_LINE_LENGTH];
diff --git a/storage/maria/ma_ft_nlq_search.c b/storage/maria/ma_ft_nlq_search.c
new file mode 100644
index 00000000000..3bb7defcaaf
--- /dev/null
+++ b/storage/maria/ma_ft_nlq_search.c
@@ -0,0 +1,380 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define FT_CORE
+#include "ma_ftdefs.h"
+
+/* search with natural language queries */
+
+typedef struct ft_doc_rec               /* one matched document of a result list */
+{
+  my_off_t  dpos;                       /* record position, taken from info->cur_row.lastpos */
+  double    weight;                     /* relevance accumulated over the matched query words */
+} FT_DOC;
+
+struct st_ft_info                       /* result of a natural-language search */
+{
+  struct _ft_vft *please;               /* set to &_ma_ft_vft_nlq by maria_ft_init_nlq_search() */
+  MARIA_HA  *info;                      /* table handler the search ran on */
+  int       ndocs;                      /* number of valid entries in doc[] */
+  int       curdoc;                     /* index of the current entry; -1 before the first read */
+  FT_DOC    doc[1];                     /* really ndocs entries, allocated past the end of the struct */
+};
+
+typedef struct st_all_in_one            /* context handed to walk_and_match() through tree_walk() */
+{
+  MARIA_HA    *info;
+  uint	      keynr;                    /* number of the fulltext key being searched */
+  CHARSET_INFO *charset;                /* charset of that key's first segment */
+  uchar      *keybuff;                  /* buffer the search key is built in */
+  TREE	      dtree;                    /* matched documents (FT_SUPERDOC), ordered by dpos */
+} ALL_IN_ONE;
+
+typedef struct st_ft_superdoc           /* dtree element: a document plus one pending word weight */
+{
+    FT_DOC   doc;
+    FT_WORD *word_ptr;                  /* query word that matched this document most recently */
+    double   tmp_weight;                /* index weight read for word_ptr, not yet added to doc.weight */
+} FT_SUPERDOC;
+
+
+static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
+			   FT_SUPERDOC *p1, FT_SUPERDOC *p2)
+{
+  /* Tree comparator: orders super-documents by ascending record position. */
+  const my_off_t left= p1->doc.dpos, right= p2->doc.dpos;
+  if (left == right)
+    return 0;
+  return left < right ? -1 : 1;
+}
+
+static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio) /* tree_walk() action: one call per query word */
+{
+  FT_WEIGTH    subkeys;                 /* value stored after the word in the index key (read as .i or .f) */
+  int          r;                       /* result of the last index search; non-zero ends the scan */
+  uint	       doc_cnt;                 /* documents matched for this word so far */
+  FT_SUPERDOC  sdoc, *sptr;
+  TREE_ELEMENT *selem;
+  double       gweight=1;
+  MARIA_HA     *info= aio->info;
+  uchar        *keybuff= aio->keybuff;
+  MARIA_KEYDEF *keyinfo= info->s->keyinfo+aio->keynr;
+  my_off_t     key_root=info->s->state.key_root[aio->keynr];
+  uint         extra=HA_FT_WLEN+info->s->base.rec_reflength;
+  MARIA_KEY    key;
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+  float tmp_weight;
+#else
+#error
+#endif
+  DBUG_ENTER("walk_and_match");
+
+  word->weight=LWS_FOR_QUERY;
+
+  _ma_ft_make_key(info, &key, aio->keynr, keybuff, word, 0);
+  key.data_length-= HA_FT_WLEN;         /* search on the word only, without the trailing weight */
+  doc_cnt=0;
+
+  /* Skip rows inserted by the current insert (row position >= data_file_length) */
+  for (r= _ma_search(info, &key, SEARCH_FIND, key_root) ;
+       !r &&
+         (subkeys.i= ft_sintXkorr(info->last_key.data +
+                                  info->last_key.data_length +
+                                  info->last_key.ref_length - extra)) > 0 &&
+         info->cur_row.lastpos >= info->state->data_file_length ;
+       r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root))
+    ;
+
+  info->update|= HA_STATE_AKTIV;              /* for _ma_test_if_changed() */
+
+  /* The following should be safe, even if we compare doubles */
+  while (!r && gweight)
+  {
+
+    if (key.data_length &&
+        ha_compare_text(aio->charset,
+                        info->last_key.data+1,
+                        info->last_key.data_length +
+                        info->last_key.ref_length - extra - 1,
+                        key.data+1, key.data_length-1, 0, 0))
+     break;                             /* the found key no longer holds the searched word */
+
+    if (subkeys.i < 0)                  /* negative value: continue in the ft2_keyinfo index */
+    {
+      if (doc_cnt)
+        DBUG_RETURN(1); /* index is corrupted */
+      /*
+        TODO here: unsafe optimization, should this word
+        be skipped (based on subkeys) ?
+      */
+      keybuff+= key.data_length;
+      keyinfo= &info->s->ft2_keyinfo;
+      key_root= info->cur_row.lastpos;  /* the row position field holds the root of that index */
+      key.data_length= 0;               /* disables the word comparison above from now on */
+      r= _ma_search_first(info, keyinfo, key_root);
+      goto do_skip;
+    }
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+    /* The weight we read was actually a float */
+    tmp_weight= subkeys.f;
+#else
+#error
+#endif
+  /* The following should be safe, even if we compare doubles */
+    if (tmp_weight==0)
+      DBUG_RETURN(doc_cnt); /* stopword, doc_cnt should be 0 */
+
+    sdoc.doc.dpos= info->cur_row.lastpos;
+
+    /* saving document matched into dtree */
+    if (!(selem=tree_insert(&aio->dtree, &sdoc, 0, aio->dtree.custom_arg)))
+      DBUG_RETURN(1);
+
+    sptr=(FT_SUPERDOC *)ELEMENT_KEY((&aio->dtree), selem);
+
+    if (selem->count==1) /* document's first match */
+      sptr->doc.weight=0;
+    else                                /* seen before: fold in the previous word's pending weight */
+      sptr->doc.weight+=sptr->tmp_weight*sptr->word_ptr->weight;
+
+    sptr->word_ptr=word;
+    sptr->tmp_weight=tmp_weight;
+
+    doc_cnt++;
+
+    gweight=word->weight*GWS_IN_USE;
+    if (gweight < 0 || doc_cnt > 2000000)
+      gweight=0;                        /* ends the while loop above */
+
+    if (_ma_test_if_changed(info) == 0)
+	r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
+    else
+	r= _ma_search(info, &info->last_key, SEARCH_BIGGER, key_root);
+do_skip:
+    while ((subkeys.i= ft_sintXkorr(info->last_key.data +
+                                    info->last_key.data_length +
+                                    info->last_key.ref_length - extra)) > 0 &&
+           !r && info->cur_row.lastpos >= info->state->data_file_length)
+      r= _ma_search_next(info, &info->last_key, SEARCH_BIGGER, key_root);
+
+  }
+  word->weight=gweight;
+
+  DBUG_RETURN(0);
+}
+
+
+static int walk_and_copy(FT_SUPERDOC *from,
+			 uint32 count __attribute__((unused)), FT_DOC **to)
+{
+  FT_DOC *slot= (*to)++;                /* take the next output entry and advance the cursor */
+  DBUG_ENTER("walk_and_copy");
+  from->doc.weight+= from->tmp_weight * from->word_ptr->weight; /* pending last-word weight */
+  slot->dpos= from->doc.dpos;
+  slot->weight= from->doc.weight;
+  DBUG_RETURN(0);
+}
+
+static int walk_and_push(FT_SUPERDOC *from,
+			 uint32 count __attribute__((unused)), QUEUE *best)
+{
+  DBUG_ENTER("walk_and_push");          /* trace tag must name this function */
+  from->doc.weight+=from->tmp_weight*from->word_ptr->weight; /* add the pending last-word weight */
+  set_if_smaller(best->elements, ft_query_expansion_limit-1); /* cap the count before inserting */
+  queue_insert(best, (uchar *)& from->doc);
+  DBUG_RETURN(0);
+}
+
+/* Compare two documents by weight in reverse (b - a); assumes sgn() yields the sign of its argument. */
+static int FT_DOC_cmp(void *unused __attribute__((unused)),
+                      FT_DOC *a, FT_DOC *b)
+{
+  return sgn(b->weight - a->weight);
+}
+
+/* Run a natural-language fulltext query on key keynr; returns a my_malloc'ed FT_INFO, or NULL on failure. */
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *info, uint keynr, uchar *query,
+                                  size_t query_len, uint flags, uchar *record)
+{
+  TREE	      wtree;                    /* words parsed from the query (and from expansion rows) */
+  ALL_IN_ONE  aio;
+  FT_DOC     *dptr;
+  FT_INFO    *dlist=NULL;               /* stays NULL on every error path */
+  MARIA_RECORD_POS saved_lastpos= info->cur_row.lastpos;
+  struct st_mysql_ftparser *parser;
+  MYSQL_FTPARSER_PARAM *ftparser_param;
+  DBUG_ENTER("maria_ft_init_nlq_search");
+
+  /* black magic ON */
+  if ((int) (keynr = _ma_check_index(info,keynr)) < 0)
+    DBUG_RETURN(NULL);
+  if (_ma_readinfo(info,F_RDLCK,1))
+    DBUG_RETURN(NULL);
+  /* black magic OFF */
+
+  aio.info=info;
+  aio.keynr=keynr;
+  aio.charset=info->s->keyinfo[keynr].seg->charset;
+  aio.keybuff= info->lastkey_buff2;
+  parser= info->s->keyinfo[keynr].parser;
+
+  /* Both trees must be valid before the first "goto err": err deletes them */
+  bzero(&wtree,sizeof(wtree));
+  init_tree(&aio.dtree,0,0,sizeof(FT_SUPERDOC),(qsort_cmp2)&FT_SUPERDOC_cmp,0,
+            NULL, NULL);
+
+  if (! (ftparser_param= maria_ftparser_call_initializer(info, keynr, 0)))
+    goto err;
+
+  maria_ft_parse_init(&wtree, aio.charset);
+  ftparser_param->flags= 0;
+  if (maria_ft_parse(&wtree, query, query_len, parser, ftparser_param,
+               &wtree.mem_root))
+    goto err;
+
+  if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+		left_root_right))
+    goto err;
+
+  if (flags & FT_EXPAND && ft_query_expansion_limit)
+  {
+    QUEUE best;                         /* top documents whose text is added to the query */
+    init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
+	       0, 0, 0);
+    tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
+              &best, left_root_right);
+    while (best.elements)
+    {
+      my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos;
+      if (!(*info->read_record)(info, record, docid))
+      {
+        info->update|= HA_STATE_AKTIV;
+        ftparser_param->flags= MYSQL_FTFLAGS_NEED_COPY; /* record is reused, so words must be copied */
+        if (unlikely(_ma_ft_parse(&wtree, info, keynr, record, ftparser_param,
+                                  &wtree.mem_root)))
+        {
+          delete_queue(&best);
+          goto err;
+        }
+      }
+    }
+    delete_queue(&best);
+    reset_tree(&aio.dtree);             /* rematch from scratch with the enlarged word set */
+    if (tree_walk(&wtree, (tree_walk_action)&walk_and_match, &aio,
+                  left_root_right))
+      goto err;
+
+  }
+
+  /*
+    If ndocs == 0, this will not allocate RAM for FT_INFO.doc[],
+    so if ndocs == 0, FT_INFO.doc[] must not be accessed.
+   */
+  dlist=(FT_INFO *)my_malloc(sizeof(FT_INFO)+
+			     sizeof(FT_DOC)*
+			     (int)(aio.dtree.elements_in_tree-1),
+			     MYF(0));
+  if (!dlist)
+    goto err;
+
+  dlist->please= (struct _ft_vft *) & _ma_ft_vft_nlq;
+  dlist->ndocs=aio.dtree.elements_in_tree;
+  dlist->curdoc=-1;
+  dlist->info=aio.info;
+  dptr=dlist->doc;
+
+  tree_walk(&aio.dtree, (tree_walk_action) &walk_and_copy,
+	    &dptr, left_root_right);         /* fills doc[] in dtree order, i.e. by dpos */
+
+  if (flags & FT_SORTED)
+    my_qsort2(dlist->doc, dlist->ndocs, sizeof(FT_DOC),
+              (qsort2_cmp)&FT_DOC_cmp, 0);
+
+err:                                    /* also reached on success: common cleanup */
+  delete_tree(&aio.dtree);
+  delete_tree(&wtree);
+  info->cur_row.lastpos= saved_lastpos;
+  DBUG_RETURN(dlist);
+}
+
+/* Advance to the next document of the result list and read its row into record. */
+int maria_ft_nlq_read_next(FT_INFO *handler, char *record)
+{
+  MARIA_HA *info= (MARIA_HA *) handler->info;
+
+  if (++handler->curdoc >= handler->ndocs)
+  {
+    --handler->curdoc;                  /* undo the increment before reporting end of list */
+    return HA_ERR_END_OF_FILE;
+  }
+
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED); /* keep only these two state bits */
+
+  info->cur_row.lastpos= handler->doc[handler->curdoc].dpos;
+  if (!(*info->read_record)(info, (uchar *) record, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;		/* Record is read */
+    return 0;
+  }
+  return my_errno;                      /* read_record reported failure */
+}
+
+/* Return the stored weight for the handler's current row position, or 0.0 when it is not in the list. */
+float maria_ft_nlq_find_relevance(FT_INFO *handler,
+			    uchar *record __attribute__((unused)),
+			    uint length __attribute__((unused)))
+{
+  int a,b,c;
+  FT_DOC  *docs=handler->doc;
+  MARIA_RECORD_POS docid= handler->info->cur_row.lastpos;
+
+  if (docid == HA_POS_ERROR)
+    return -5.0;                        /* no valid current row position */
+
+  /* Assuming docs[] is sorted by dpos... NOTE(review): a list built with FT_SORTED is ordered by weight */
+
+  for (a=0, b=handler->ndocs, c=(a+b)/2; b-a>1; c=(a+b)/2) /* binary search over [a,b) */
+  {
+    if (docs[c].dpos > docid)
+      b=c;
+    else
+      a=c;
+  }
+  /* bounds check to avoid accessing unallocated handler->doc  */
+  if (a < handler->ndocs && docs[a].dpos == docid)
+    return (float) docs[a].weight;
+  else
+    return 0.0;
+}
+
+/* Release a search result; the handler and its doc[] array are a single my_malloc block. */
+void maria_ft_nlq_close_search(FT_INFO *handler)
+{
+  my_free(handler, MYF(0));
+}
+
+/* Weight of the current document. NOTE(review): no bounds check, curdoc is -1 until the first read_next. */
+float maria_ft_nlq_get_relevance(FT_INFO *handler)
+{
+  return (float) handler->doc[handler->curdoc].weight;
+}
+
+/* Rewind the result list so that the next maria_ft_nlq_read_next() starts at doc[0]. */
+void maria_ft_nlq_reinit_search(FT_INFO *handler)
+{
+  handler->curdoc=-1;
+}
+
diff --git a/storage/maria/ma_ft_parser.c b/storage/maria/ma_ft_parser.c
new file mode 100644
index 00000000000..b35c2227ca2
--- /dev/null
+++ b/storage/maria/ma_ft_parser.c
@@ -0,0 +1,417 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#include "ma_ftdefs.h"
+
+typedef struct st_maria_ft_docstat {    /* accumulator used while linearizing a word tree */
+  FT_WORD *list;                        /* next free slot of the output array */
+  uint uniq;                            /* number of distinct words (elements_in_tree) */
+  double sum;                           /* sum of the word weights copied so far */
+} FT_DOCSTAT;
+
+
+typedef struct st_my_maria_ft_parser_param /* passed to the callbacks via param->mysql_ftparam */
+{
+  TREE *wtree;                          /* tree that receives the parsed words */
+  MEM_ROOT *mem_root;                   /* arena used when a word has to be copied */
+} MY_FT_PARSER_PARAM;
+
+/* Word-tree comparator: compares the text of two FT_WORDs with ha_compare_text() in charset cs. */
+static int FT_WORD_cmp(CHARSET_INFO* cs, FT_WORD *w1, FT_WORD *w2)
+{
+  return ha_compare_text(cs, (uchar*) w1->pos, w1->len,
+                         (uchar*) w2->pos, w2->len, 0, 0);
+}
+
+static int walk_and_copy(FT_WORD *word,uint32 count,FT_DOCSTAT *docstat) /* tree_walk() action */
+{
+    word->weight=LWS_IN_USE;            /* NOTE(review): count is not named here; presumably the macro uses it */
+    docstat->sum+=word->weight;
+    memcpy_fixed((docstat->list)++,word,sizeof(FT_WORD)); /* append a copy, advance the cursor */
+    return 0;
+}
+
+/* Transforms the tree of words into a NULL-pos-terminated array, applying normalization; frees the tree. */
+
+FT_WORD * maria_ft_linearize(TREE *wtree, MEM_ROOT *mem_root)
+{
+  FT_WORD *wlist,*p;
+  FT_DOCSTAT docstat;
+  DBUG_ENTER("maria_ft_linearize");
+
+  if ((wlist=(FT_WORD *) alloc_root(mem_root, sizeof(FT_WORD)*
+                                    (1+wtree->elements_in_tree)))) /* +1 for the terminator */
+  {
+    docstat.list=wlist;
+    docstat.uniq=wtree->elements_in_tree;
+    docstat.sum=0;
+    tree_walk(wtree,(tree_walk_action)&walk_and_copy,&docstat,left_root_right);
+  }
+  delete_tree(wtree);                   /* the tree is released whether or not the allocation worked */
+  if (!wlist)
+    DBUG_RETURN(NULL);
+
+  docstat.list->pos=NULL;               /* terminator entry after the last copied word */
+
+  for (p=wlist;p->pos;p++)
+  {
+    p->weight=PRENORM_IN_USE;           /* NOTE(review): presumably expands to an expression over p/docstat */
+  }
+
+  for (p=wlist;p->pos;p++)
+  {
+    p->weight/=NORM_IN_USE;
+  }
+
+  DBUG_RETURN(wlist);
+}
+
+my_bool maria_ft_boolean_check_syntax_string(const uchar *str) /* returns 0 if str is acceptable, 1 otherwise */
+{
+  uint i, j;
+
+  if (!str ||
+      (strlen((const char *) str) + 1 != sizeof(ft_boolean_syntax)) || /* must have the exact length */
+      (str[0] != ' ' && str[1] != ' ')) /* one of the first two characters must be a space */
+    return 1;
+  for (i=0; i<sizeof(ft_boolean_syntax); i++)
+  {
+    /* limiting to 7-bit ascii only */
+    if ((unsigned char)(str[i]) > 127 ||
+        my_isalnum(default_charset_info, str[i]))
+      return 1;
+    for (j=0; j<i; j++)
+      if (str[i] == str[j] && (i != 11 || j != 10)) /* no duplicates, except positions 10 and 11 */
+        return 1;
+  }
+  return 0;
+}
+
+/*
+  Next token of a boolean query in [*start,end); RETURN VALUE (== param->type):
+  0 - eof
+  1 - word found
+  2 - left bracket (or opening quote)
+  3 - right bracket (or closing quote, also an unterminated quote at end)
+  4 - stopword found
+*/
+uchar maria_ft_get_word(CHARSET_INFO *cs, const uchar **start,
+                        const uchar *end,
+                        FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
+{
+  const uchar *doc= *start;
+  int ctype;
+  uint mwc, length;                     /* mwc: trailing misc_word_char count; length: characters seen */
+  int mbl;                              /* byte length reported by cs->cset->ctype() */
+
+  param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+  param->weight_adjust= param->wasign= 0;
+  param->type= FT_TOKEN_EOF;
+
+  while (doc<end)
+  {
+    /* Phase 1: consume operators and separators up to the first word character */
+    for (; doc < end; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      if (true_word_char(ctype, *doc))
+        break;
+      if (*doc == FTB_RQUOT && param->quot)
+      {
+        param->quot= (char *) doc;
+        *start=doc+1;
+        param->type= FT_TOKEN_RIGHT_PAREN;
+        goto ret;
+      }
+      if (!param->quot)
+      {
+        if (*doc == FTB_LBR || *doc == FTB_RBR || *doc == FTB_LQUOT)
+        {
+          /* param->prev=' '; */
+          *start=doc+1;
+          if (*doc == FTB_LQUOT)
+            param->quot= (char *) *start;
+          param->type= (*doc == FTB_RBR ? FT_TOKEN_RIGHT_PAREN : FT_TOKEN_LEFT_PAREN);
+          goto ret;
+        }
+        if (param->prev == ' ')         /* operators only count directly after a space */
+        {
+          if (*doc == FTB_YES ) { param->yesno=+1;    continue; } else
+          if (*doc == FTB_EGAL) { param->yesno= 0;    continue; } else
+          if (*doc == FTB_NO  ) { param->yesno=-1;    continue; } else
+          if (*doc == FTB_INC ) { param->weight_adjust++; continue; } else
+          if (*doc == FTB_DEC ) { param->weight_adjust--; continue; } else
+          if (*doc == FTB_NEG ) { param->wasign= !param->wasign; continue; }
+        }
+      }
+      param->prev=*doc;
+      param->yesno=(FTB_YES==' ') ? 1 : (param->quot != 0);
+      param->weight_adjust= param->wasign= 0;
+    }
+
+    mwc=length=0;                       /* Phase 2: collect the word itself */
+    for (word->pos= doc; doc < end; length++,
+         doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      if (true_word_char(ctype, *doc))
+        mwc=0;
+      else if (!misc_word_char(*doc) || mwc)
+        break;
+      else
+        mwc++;
+    }
+    param->prev='A'; /* be sure *prev is true_word_char */
+    word->len= (uint)(doc-word->pos) - mwc;
+    if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
+      doc++;
+
+    if (((length >= ft_min_word_len && !is_stopword((char *) word->pos,
+                                                    word->len))
+         || param->trunc) && length < ft_max_word_len)
+    {
+      *start=doc;
+      param->type= FT_TOKEN_WORD;
+      goto ret;
+    }
+    else if (length) /* make sure length > 0 (if start contains spaces only) */
+    {
+      *start= doc;
+      param->type= FT_TOKEN_STOPWORD;
+      goto ret;
+    }
+  }
+  if (param->quot)                      /* input ended inside a quoted phrase: close it implicitly */
+  {
+    param->quot= (char *)(*start= doc);
+    param->type= FT_TOKEN_RIGHT_PAREN;  /* same token the explicit FTB_RQUOT branch reports */
+    goto ret;
+  }
+ret:
+  return param->type;
+}
+
+uchar maria_ft_simple_get_word(CHARSET_INFO *cs, const uchar **start,
+                               const uchar *end, FT_WORD *word,
+                               my_bool skip_stopwords) /* returns 1 with *word set, 0 at end of input */
+{
+  const uchar *doc= *start;
+  uint mwc, length;                     /* mwc: trailing misc_word_char count; length: characters seen */
+  int ctype, mbl;
+  DBUG_ENTER("maria_ft_simple_get_word");
+
+  do
+  {
+    for (;; doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1))) /* skip to the first word character */
+    {
+      if (doc >= end)
+        DBUG_RETURN(0);
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      if (true_word_char(ctype, *doc))
+        break;
+    }
+
+    mwc= length= 0;
+    for (word->pos= doc; doc < end; length++,
+         doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
+    {
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
+      if (true_word_char(ctype, *doc))
+        mwc= 0;
+      else if (!misc_word_char(*doc) || mwc) /* a second consecutive misc char ends the word */
+        break;
+      else
+        mwc++;
+    }
+
+    word->len= (uint)(doc-word->pos) - mwc; /* byte length without the trailing misc char */
+
+    if (skip_stopwords == FALSE ||
+        (length >= ft_min_word_len && length < ft_max_word_len &&
+         !is_stopword((char *) word->pos, word->len)))
+    {
+      *start= doc;                      /* caller resumes after this word */
+      DBUG_RETURN(1);
+    }
+  } while (doc < end);
+  DBUG_RETURN(0);
+}
+
+void maria_ft_parse_init(TREE *wtree, CHARSET_INFO *cs) /* set up wtree as an FT_WORD tree unless already done */
+{
+  DBUG_ENTER("maria_ft_parse_init");
+  if (!is_tree_inited(wtree))
+    init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0, NULL,
+              (void*) cs);              /* cs becomes custom_arg, i.e. FT_WORD_cmp's first argument */
+  DBUG_VOID_RETURN;
+}
+
+/* mysql_add_word callback: insert one word into the word tree; returns 0 on success, 1 on failure. */
+static int maria_ft_add_word(MYSQL_FTPARSER_PARAM *param,
+                             const uchar *word, mysql_ft_size_t word_len,
+                             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info
+                             __attribute__((unused)))
+{
+  TREE *wtree;
+  FT_WORD w;
+  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+  DBUG_ENTER("maria_ft_add_word");
+  wtree= ft_param->wtree;
+  if (param->flags & MYSQL_FTFLAGS_NEED_COPY)
+  {
+    uchar *ptr;
+    DBUG_ASSERT(wtree->with_delete == 0);
+    if (!(ptr= (uchar *)alloc_root(ft_param->mem_root, word_len)))
+      goto err;                         /* out of memory: do not memcpy() into NULL */
+    memcpy(ptr, word, word_len);
+    w.pos= ptr;
+  }
+  else
+    w.pos= word;                        /* caller's buffer outlives the tree, no copy needed */
+  w.len= word_len;
+  if (tree_insert(wtree, &w, 0, wtree->custom_arg))
+    DBUG_RETURN(0);
+err:                                    /* any failure releases the whole tree, as before */
+  delete_tree(wtree);
+  DBUG_RETURN(1);
+}
+
+/* mysql_parse callback: split the document into words and feed each to param->mysql_add_word(). */
+static int maria_ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
+                                   const uchar *doc_arg,
+                                   mysql_ft_size_t doc_len)
+{
+  const uchar *doc= doc_arg;
+  const uchar *end= doc + doc_len;
+  MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
+  TREE *wtree= ft_param->wtree;
+  FT_WORD w;
+  DBUG_ENTER("maria_ft_parse_internal");
+
+  while (maria_ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE)) /* custom_arg is the charset */
+    if (param->mysql_add_word(param, w.pos, w.len, 0))
+      DBUG_RETURN(1);                   /* stop at the first word that cannot be added */
+  DBUG_RETURN(0);
+}
+
+/* Fill in param for simple-mode parsing of doc into wtree and run parser->parse(); returns its result. */
+int maria_ft_parse(TREE *wtree, uchar *doc, size_t doclen,
+                   struct st_mysql_ftparser *parser,
+                   MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+  MY_FT_PARSER_PARAM my_param;          /* on the stack: only valid during this call */
+  DBUG_ENTER("maria_ft_parse");
+  DBUG_ASSERT(parser);
+  my_param.wtree= wtree;
+  my_param.mem_root= mem_root;
+
+  param->mysql_parse= maria_ft_parse_internal;
+  param->mysql_add_word= maria_ft_add_word;
+  param->mysql_ftparam= &my_param;
+  param->cs= wtree->custom_arg;         /* the tree's custom_arg holds the charset */
+  param->doc= doc;
+  param->length= doclen;
+  param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
+  DBUG_RETURN(parser->parse(param));
+}
+
+
+#define MAX_PARAM_NR 2
+
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info) /* lazily allocates info->ftparser_param */
+{
+  if (!info->ftparser_param)
+  {
+    /* 
+      info->ftparser_param can not be zero after the initialization,
+      because it always includes built-in fulltext parser. And built-in
+      parser can be called even if the table has no fulltext indexes and
+      no varchar/text fields.
+
+      ftb_find_relevance... parser (ftb_find_relevance_parse,
+      ftb_find_relevance_add_word) calls ftb_check_phrase... parser
+      (ftb_check_phrase_internal, ftb_phrase_add_word). Thus MAX_PARAM_NR=2.
+    */
+    info->ftparser_param= (MYSQL_FTPARSER_PARAM *)
+      my_malloc(MAX_PARAM_NR * sizeof(MYSQL_FTPARSER_PARAM) *
+                info->s->ftkeys, MYF(MY_WME | MY_ZEROFILL)); /* zero-filled: mysql_add_word == 0 everywhere */
+    init_alloc_root(&info->ft_memroot, FTPARSER_MEMROOT_ALLOC_SIZE, 0); /* runs even if my_malloc failed */
+  }
+  return info->ftparser_param;          /* NULL when the allocation failed */
+}
+
+/* Return the parser parameter slot for (keynr, paramnr), running the parser's init hook on first use. */
+MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+                                                      uint keynr, uint paramnr)
+{
+  uint32 ftparser_nr;
+  struct st_mysql_ftparser *parser;
+  
+  if (!maria_ftparser_alloc_param(info))
+    return 0;
+
+  if (keynr == NO_SUCH_KEY)             /* no key given: slot 0 with the default parser */
+  {
+    ftparser_nr= 0;
+    parser= &ft_default_parser;
+  }
+  else
+  {
+    ftparser_nr= info->s->keyinfo[keynr].ftkey_nr;
+    parser= info->s->keyinfo[keynr].parser;
+  }
+  DBUG_ASSERT(paramnr < MAX_PARAM_NR);
+  ftparser_nr= ftparser_nr*MAX_PARAM_NR + paramnr; /* MAX_PARAM_NR consecutive slots per fulltext key */
+  if (! info->ftparser_param[ftparser_nr].mysql_add_word)
+  {
+    /* Note, that mysql_add_word is used here as a flag:
+       mysql_add_word == 0 - parser is not initialized
+       mysql_add_word != 0 - parser is initialized, or no
+                             initialization needed. */
+    info->ftparser_param[ftparser_nr].mysql_add_word=
+      (int (*)(struct st_mysql_ftparser_param *, const uchar *,
+               mysql_ft_size_t, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
+    if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
+      return 0;                         /* NOTE(review): the flag stays set although init failed */
+  }
+  return &info->ftparser_param[ftparser_nr];
+}
+
+/* Free the fulltext memroot and run the deinit hook of every initialized fulltext parser slot. */
+void maria_ftparser_call_deinitializer(MARIA_HA *info)
+{
+  uint i, j, keys= info->s->state.header.keys;
+  free_root(&info->ft_memroot, MYF(0)); /* done even when no parameters were ever allocated */
+  if (! info->ftparser_param)
+    return;
+  for (i= 0; i < keys; i++)
+  {
+    MARIA_KEYDEF *keyinfo= &info->s->keyinfo[i];
+    for (j=0; j < MAX_PARAM_NR; j++)
+    {
+      MYSQL_FTPARSER_PARAM *ftparser_param=
+        &info->ftparser_param[keyinfo->ftkey_nr*MAX_PARAM_NR + j];
+      if (keyinfo->flag & HA_FULLTEXT && ftparser_param->mysql_add_word)
+      {
+        if (keyinfo->parser->deinit)
+          keyinfo->parser->deinit(ftparser_param);
+        ftparser_param->mysql_add_word= 0; /* mark the slot as uninitialized again */
+      }
+      else
+        break;                          /* not a fulltext key, or first unused slot: next key */
+    }
+  }
+}
diff --git a/storage/maria/ma_ft_stem.c b/storage/maria/ma_ft_stem.c
new file mode 100644
index 00000000000..06fc0b2df6c
--- /dev/null
+++ b/storage/maria/ma_ft_stem.c
@@ -0,0 +1,18 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* multilingual stem */
diff --git a/storage/maria/ma_ft_test1.c b/storage/maria/ma_ft_test1.c
new file mode 100644
index 00000000000..4c98e766234
--- /dev/null
+++ b/storage/maria/ma_ft_test1.c
@@ -0,0 +1,317 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+   added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include "maria_ft_test1.h"
+#include <my_getopt.h>
+
+static int key_field=FIELD_VARCHAR,extra_field=FIELD_SKIP_ENDSPACE; /* column types of the two test columns */
+static uint key_length=200,extra_length=50;
+static int key_type=HA_KEYTYPE_TEXT;
+static int verbose=0,silent=0,skip_update=0,
+	   no_keys=0,no_stopwords=0,no_search=0,no_fulltext=0; /* set from the command line */
+static int create_flag=0,error=0;
+
+#define MAX_REC_LENGTH 300              /* size of record[], read_record[] and blob_key[] */
+static char record[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_record(char *, int);
+static void usage();
+
+static struct my_option my_long_options[] = /* short options only: every name and help text is empty */
+{
+  {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '?', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'h', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'V', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'v', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, /* NOTE(review): duplicate of the first entry */
+  {"", 's', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'N', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'S', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'K', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'F', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", 'U', "", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"", '#', "", 0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} /* terminator */
+};
+
+int main(int argc, char *argv[])        /* parse options, init maria, run the test on table "FT1" */
+{
+  MY_INIT(argv[0]);
+
+  get_options(argc,argv);
+  maria_init();
+
+  exit(run_test("FT1"));                /* run_test() result becomes the exit status */
+}
+
+static MARIA_COLUMNDEF recinfo[3];      /* column definitions; run_test() fills entries 0 and 1 */
+static MARIA_KEYDEF keyinfo[2];         /* key definitions; run_test() fills entry 0 */
+static HA_KEYSEG keyseg[10];            /* segment storage referenced by keyinfo[0].seg */
+
+static int run_test(const char *filename) /* create, fill, update and query a test table; 0 on success, 1 on error */
+{
+  MARIA_HA *file;
+  int i,j;
+  my_off_t pos;
+
+  bzero((char*) recinfo,sizeof(recinfo));
+
+  /* First define 2 columns */
+  recinfo[0].type=extra_field;
+  recinfo[0].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr :
+	      extra_length);
+  if (extra_field == FIELD_VARCHAR)
+    recinfo[0].length+= HA_VARCHAR_PACKLENGTH(extra_length);
+  recinfo[1].type=key_field;
+  recinfo[1].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+		      key_length);
+  if (key_field == FIELD_VARCHAR)
+    recinfo[1].length+= HA_VARCHAR_PACKLENGTH(key_length);
+
+  /* Define a key over the second column (it starts right after column 0) */
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  keyinfo[0].seg[0].type= key_type;
+  keyinfo[0].seg[0].flag= (key_field == FIELD_BLOB) ? HA_BLOB_PART:
+			  (key_field == FIELD_VARCHAR) ? HA_VAR_LENGTH_PART:0;
+  keyinfo[0].seg[0].start=recinfo[0].length;
+  keyinfo[0].seg[0].length=key_length;
+  keyinfo[0].seg[0].null_bit= 0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].language= default_charset_info->number;
+  keyinfo[0].flag = (no_fulltext?HA_PACK_KEY:HA_FULLTEXT);
+
+  if (!silent)
+    printf("- Creating isam-file\n");
+  if (maria_create(filename,(no_keys?0:1),keyinfo,2,recinfo,0,NULL,
+		(MARIA_CREATE_INFO*) 0, create_flag))
+    goto err;
+  if (!(file=maria_open(filename,2,0)))
+    goto err;
+
+  if (!silent)
+    printf("- %s stopwords\n",no_stopwords?"Skipping":"Initializing");
+  maria_ft_init_stopwords(no_stopwords?NULL:maria_ft_precompiled_stopwords);
+
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  my_errno=0;
+  for (i=NUPD ; i<NDATAS; i++ )         /* rows 0..NUPD-1 are written later by the update pass */
+  {
+    create_record(record,i);
+    error=maria_write(file,record);
+    if (verbose || error)
+      printf("I= %2d  maria_write: %d  errno: %d, record: %s\n",
+	i,error,my_errno,data[i].f0);
+  }
+
+  if (!skip_update)
+  {
+    if (!silent)
+      printf("- Updating rows\n");
+
+    /* Read through all rows and update them */
+    pos=(ha_rows) 0;
+    i=0;
+    while ((error=maria_rrnd(file,read_record,pos)) == 0)
+    {
+      create_record(record,NUPD-i-1);   /* replace the row just read with data[NUPD-i-1] */
+      if (maria_update(file,read_record,record))
+      {
+	printf("Can't update row: %.*s, error: %d\n",
+	       keyinfo[0].seg[0].length,record,my_errno);
+      }
+      if(++i == NUPD) break;
+      pos=HA_OFFSET_ERROR;              /* presumably requests the next row on later calls; TODO confirm */
+    }
+    if (i != NUPD)
+      printf("Found %d of %d rows\n", i,NUPD);
+  }
+
+  if (maria_close(file)) goto err;
+  if(no_search) return 0;               /* returns without calling maria_end()/my_end() */
+  if (!silent)
+    printf("- Reopening file\n");
+  if (!(file=maria_open(filename,2,0))) goto err;
+  if (!silent)
+    printf("- Reading rows with key\n");
+  for (i=0 ; i < NQUERIES ; i++)
+  {
+    FT_DOCLIST *result;
+    result=maria_ft_nlq_init_search(file,0,(char*) query[i],strlen(query[i]),1); /* NOTE(review): confirm this name and signature still exist */
+    if(!result)
+    {
+      printf("Query %d: `%s' failed with errno %3d\n",i,query[i],my_errno);
+      continue;
+    }
+    printf("Query %d: `%s'. Found: %d. Top five documents:\n",
+           i,query[i],result->ndocs);
+    for (j=0;j<5;j++)
+    {
+      double w; int err;
+      err= maria_ft_nlq_read_next(result, read_record);
+      if (err==HA_ERR_END_OF_FILE)
+      {
+        printf("No more matches!\n");
+        break;
+      }
+      else if (err)
+      {
+        printf("maria_ft_read_next %d failed with errno %3d\n",j,my_errno);
+        break;
+      }
+      w=maria_ft_nlq_get_relevance(result);
+      if (key_field == FIELD_VARCHAR)
+      {
+        uint l;
+        char *p;
+        p=recinfo[0].length+read_record; /* start of the key column inside the row */
+        l=uint2korr(p);                 /* NOTE(review): reads a 2-byte length; create_record() may store 1 byte */
+        printf("%10.7f: %.*s\n",w,(int) l,p+2);
+      }
+      else
+        printf("%10.7f: %.*s\n",w,recinfo[1].length,
+               recinfo[0].length+read_record);
+    }
+    maria_ft_nlq_close_search(result);
+  }
+
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return (0);
+err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  return 1;			/* skip warning */
+}
+
+static char blob_key[MAX_REC_LENGTH];   /* out-of-row storage create_record() points BLOB columns at */
+/* static char blob_record[MAX_REC_LENGTH+20*20]; */
+
+void create_record(char *pos, int n)    /* build row n (data[n].f0, data[n].f2) in the MAX_REC_LENGTH buffer pos */
+{
+  bzero((char*) pos,MAX_REC_LENGTH);
+  if (recinfo[0].type == FIELD_BLOB)
+  {
+    uint tmp;
+    char *ptr;
+    strnmov(blob_key,data[n].f0,keyinfo[0].seg[0].length);
+    tmp=strlen(blob_key);
+    int4store(pos,tmp);                 /* 4-byte length, then the pointer to the data */
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {
+    uint tmp;
+    /* -1 is here because pack_length is stored in seg->length */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+    strnmov(pos+pack_length,data[n].f0,keyinfo[0].seg[0].length);
+    tmp=strlen(pos+pack_length);
+    if (pack_length == 1)
+      *pos= (char) tmp;
+    else
+      int2store(pos,tmp);
+    pos+=recinfo[0].length;
+  }
+  else
+  {
+    strnmov(pos,data[n].f0,keyinfo[0].seg[0].length);
+    pos+=recinfo[0].length;
+  }
+  if (recinfo[1].type == FIELD_BLOB)
+  {
+    uint tmp;
+    char *ptr;
+    strnmov(blob_key,data[n].f2,keyinfo[0].seg[0].length); /* NOTE(review): same blob_key buffer as column 0 */
+    tmp=strlen(blob_key);
+    int4store(pos,tmp);
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[1].length;
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {
+    uint tmp;
+    /* -1 is here because pack_length is stored in seg->length */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(keyinfo[0].seg[0].length-1);
+    strnmov(pos+pack_length,data[n].f2,keyinfo[0].seg[0].length);
+    tmp=strlen(pos+pack_length);        /* measure where the text was stored, also for 2-byte lengths */
+    if (pack_length == 1)
+      *pos= (char) tmp;
+    else
+      int2store(pos,tmp);
+    pos+=recinfo[1].length;
+  }
+  else
+  {
+    strnmov(pos,data[n].f2,keyinfo[0].seg[0].length);
+    pos+=recinfo[1].length;
+  }
+}
+
+/* Option callback given to handle_options(): records each switch in its static flag; always returns 0. */
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument)
+{
+  switch(optid) {
+  case 'v': verbose=1; break;
+  case 's': silent=1; break;
+  case 'F': no_fulltext=1; no_search=1; /* NOTE(review): no break, so 'F' also sets skip_update; confirm intended */
+  case 'U': skip_update=1; break;
+  case 'K': no_keys=no_search=1; break;
+  case 'N': no_search=1; break;
+  case 'S': no_stopwords=1; break;
+  case '#':
+    DBUG_PUSH (argument);
+    break;
+  case 'V':
+  case '?':
+  case 'h':
+    usage();
+    exit(1);                            /* help and version requests end the program */
+  }
+  return 0;
+}
+
+/* Read options: run handle_options() over argv and exit with its error code if it fails */
+
+static void get_options(int argc,char *argv[])
+{
+  int ho_error;
+
+  if ((ho_error=handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(ho_error);
+  return;
+} /* get options */
+
+/* Print the synopsis line, then pass the option table to my_print_help() and my_print_variables(). */
+static void usage()
+{
+  printf("%s [options]\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_ft_test1.h b/storage/maria/ma_ft_test1.h
new file mode 100644
index 00000000000..5883c42f5c5
--- /dev/null
+++ b/storage/maria/ma_ft_test1.h
@@ -0,0 +1,420 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+#define NUPD            20
+#define NDATAS          389
+struct { const char *f0, *f2; } data[NDATAS] = {
+   {"1", "General Information about MySQL"},
+   {"1.1", "What is MySQL?"},
+   {"1.2", "About this manual"},
+   {"1.3", "History of MySQL"},
+   {"1.4", "The main features of MySQL"},
+   {"1.5", "General SQL information and tutorials"},
+   {"1.6", "Useful MySQL-related links"},
+   {"1.7", "What are stored procedures and triggers and so on?"},
+   {"2", "MySQL mailing lists and how to ask questions/give error (bug) reports"},
+   {"2.1", "Subscribing to/un-subscribing from the MySQL mailing list"},
+   {"2.2", "Asking questions or reporting bugs"},
+   {"2.3", "I think I have found a bug. What information do you need to help me?"},
+   {"2.3.1", "MySQL keeps crashing"},
+   {"2.4", "Guidelines for answering questions on the mailing list"},
+   {"3", "Licensing or When do I have/want to pay for MySQL?"},
+   {"3.1", "How much does MySQL cost?"},
+   {"3.2", "How do I get commercial support?"},
+   {"3.2.1", "Types of commercial support"},
+   {"3.2.1.1", "Basic email support"},
+   {"3.2.1.2", "Extended email support"},
+/*------------------------------- NUPD=20 -------------------------------*/
+   {"3.2.1.3", "Asking: Login support"},
+   {"3.2.1.4", "Extended login support"},
+   {"3.3", "How do I pay for licenses/support?"},
+   {"3.4", "Who do I contact when I want more information about licensing/support?"},
+   {"3.5", "What Copyright does MySQL use?"},
+   {"3.6", "When may I distribute MySQL commercially without a fee?"},
+   {"3.7", "I want to sell a product that can be configured to use MySQL"},
+   {"3.8", "I am running a commercial web server using MySQL"},
+   {"3.9", "Do I need a license to sell commercial Perl/tcl/PHP/Web+ etc applications?"},
+   {"3.10", "Possible future changes in the licensing"},
+   {"4", "Compiling and installing MySQL"},
+   {"4.1", "How do I get MySQL?"},
+   {"4.2", "Which MySQL version should I use?"},
+   {"4.3", "How/when will you release updates?"},
+   {"4.4", "What operating systems does MySQL support?"},
+   {"4.5", "Compiling MySQL from source code"},
+   {"4.5.1", "Quick installation overview"},
+   {"4.5.2", "Usual configure switches"},
+   {"4.5.3", "Applying a patch"},
+   {"4.6", "Problems compiling?"},
+   {"4.7", "General compilation notes"},
+   {"4.8", "MIT-pthreads notes (FreeBSD)"},
+   {"4.9", "Perl installation comments"},
+   {"4.10", "Special things to consider for some machine/OS combinations"},
+   {"4.10.1", "Solaris notes"},
+   {"4.10.2", "SunOS 4 notes"},
+   {"4.10.3", "Linux notes for all versions"},
+   {"4.10.3.1", "Linux-x86 notes"},
+   {"4.10.3.2", "RedHat 5.0"},
+   {"4.10.3.3", "RedHat 5.1"},
+   {"4.10.3.4", "Linux-Sparc notes"},
+   {"4.10.3.5", "Linux-Alpha notes"},
+   {"4.10.3.6", "MkLinux notes"},
+   {"4.10.4", "Alpha-DEC-Unix notes"},
+   {"4.10.5", "Alpha-DEC-OSF1 notes"},
+   {"4.10.6", "SGI-IRIX notes"},
+   {"4.10.7", "FreeBSD notes"},
+   {"4.10.7.1", "FreeBSD-3.0 notes"},
+   {"4.10.8", "BSD/OS 2.# notes"},
+   {"4.10.8.1", "BSD/OS 3.# notes"},
+   {"4.10.9", "SCO notes"},
+   {"4.10.10", "SCO Unixware 7.0 notes"},
+   {"4.10.11", "IBM-AIX notes"},
+   {"4.10.12", "HP-UX notes"},
+   {"4.11", "TcX binaries"},
+   {"4.12", "Win32 notes"},
+   {"4.13", "Installation instructions for MySQL binary releases"},
+   {"4.13.1", "How to get MySQL Perl support working"},
+   {"4.13.2", "Linux notes"},
+   {"4.13.3", "HP-UX notes"},
+   {"4.13.4", "Linking client libraries"},
+   {"4.14", "Problems running mysql_install_db"},
+   {"4.15", "Problems starting MySQL"},
+   {"4.16", "Automatic start/stop of MySQL"},
+   {"4.17", "Option files"},
+   {"5", "How standards-compatible is MySQL?"},
+   {"5.1", "What extensions has MySQL to ANSI SQL92?"},
+   {"5.2", "What functionality is missing in MySQL?"},
+   {"5.2.1", "Sub-selects"},
+   {"5.2.2", "SELECT INTO TABLE"},
+   {"5.2.3", "Transactions"},
+   {"5.2.4", "Triggers"},
+   {"5.2.5", "Foreign Keys"},
+   {"5.2.5.1", "Some reasons NOT to use FOREIGN KEYS"},
+   {"5.2.6", "Views"},
+   {"5.2.7", "-- as start of a comment"},
+   {"5.3", "What standards does MySQL follow?"},
+   {"5.4", "What functions exist only for compatibility?"},
+   {"5.5", "Limitations of BLOB and TEXT types"},
+   {"5.6", "How to cope without COMMIT-ROLLBACK"},
+   {"6", "The MySQL access privilege system"},
+   {"6.1", "What the privilege system does"},
+   {"6.2", "Connecting to the MySQL server"},
+   {"6.2.1", "Keeping your password secure"},
+   {"6.3", "Privileges provided by MySQL"},
+   {"6.4", "How the privilege system works"},
+   {"6.5", "The privilege tables"},
+   {"6.6", "Setting up the initial MySQL privileges"},
+   {"6.7", "Adding new user privileges to MySQL"},
+   {"6.8", "An example permission setup"},
+   {"6.9", "Causes of Access denied errors"},
+   {"6.10", "How to make MySQL secure against crackers"},
+   {"7", "MySQL language reference"},
+   {"7.1", "Literals: how to write strings and numbers"},
+   {"7.1.1", "Strings"},
+   {"7.1.2", "Numbers"},
+   {"7.1.3", "NULL values"},
+   {"7.1.4", "Database, table, index, column and alias names"},
+   {"7.1.4.1", "Case sensitivity in names"},
+   {"7.2", "Column types"},
+   {"7.2.1", "Column type storage requirements"},
+   {"7.2.5", "Numeric types"},
+   {"7.2.6", "Date and time types"},
+   {"7.2.6.1", "The DATE type"},
+   {"7.2.6.2", "The TIME type"},
+   {"7.2.6.3", "The DATETIME type"},
+   {"7.2.6.4", "The TIMESTAMP type"},
+   {"7.2.6.5", "The YEAR type"},
+   {"7.2.6.6", "Miscellaneous date and time properties"},
+   {"7.2.7", "String types"},
+   {"7.2.7.1", "The CHAR and VARCHAR types"},
+   {"7.2.7.2", "The BLOB and TEXT types"},
+   {"7.2.7.3", "The ENUM type"},
+   {"7.2.7.4", "The SET type"},
+   {"7.2.8", "Choosing the right type for a column"},
+   {"7.2.9", "Column indexes"},
+   {"7.2.10", "Multiple-column indexes"},
+   {"7.2.11", "Using column types from other database engines"},
+   {"7.3", "Functions for use in SELECT and WHERE clauses"},
+   {"7.3.1", "Grouping functions"},
+   {"7.3.2", "Normal arithmetic operations"},
+   {"7.3.3", "Bit functions"},
+   {"7.3.4", "Logical operations"},
+   {"7.3.5", "Comparison operators"},
+   {"7.3.6", "String comparison functions"},
+   {"7.3.7", "Control flow functions"},
+   {"7.3.8", "Mathematical functions"},
+   {"7.3.9", "String functions"},
+   {"7.3.10", "Date and time functions"},
+   {"7.3.11", "Miscellaneous functions"},
+   {"7.3.12", "Functions for use with GROUP BY clauses"},
+   {"7.4", "CREATE DATABASE syntax"},
+   {"7.5", "DROP DATABASE syntax"},
+   {"7.6", "CREATE TABLE syntax"},
+   {"7.7", "ALTER TABLE syntax"},
+   {"7.8", "OPTIMIZE TABLE syntax"},
+   {"7.9", "DROP TABLE syntax"},
+   {"7.10", "DELETE syntax"},
+   {"7.11", "SELECT syntax"},
+   {"7.12", "JOIN syntax"},
+   {"7.13", "INSERT syntax"},
+   {"7.14", "REPLACE syntax"},
+   {"7.15", "LOAD DATA INFILE syntax"},
+   {"7.16", "UPDATE syntax"},
+   {"7.17", "USE syntax"},
+   {"7.18", "SHOW syntax (Get information about tables, columns...)"},
+   {"7.19", "EXPLAIN syntax (Get information about a SELECT)"},
+   {"7.20", "DESCRIBE syntax (Get information about columns)"},
+   {"7.21", "LOCK TABLES/UNLOCK TABLES syntax"},
+   {"7.22", "SET OPTION syntax"},
+   {"7.23", "GRANT syntax (Compatibility function)"},
+   {"7.24", "CREATE INDEX syntax (Compatibility function)"},
+   {"7.25", "DROP INDEX syntax (Compatibility function)"},
+   {"7.26", "Comment syntax"},
+   {"7.27", "CREATE FUNCTION/DROP FUNCTION syntax"},
+   {"7.28", "Is MySQL picky about reserved words?"},
+   {"8", "Example SQL queries"},
+   {"8.1", "Queries from twin project"},
+   {"8.1.1", "Find all non-distributed twins"},
+   {"8.1.2", "Show a table on twin pair status"},
+   {"9", "How safe/stable is MySQL?"},
+   {"9.1", "How stable is MySQL?"},
+   {"9.2", "Why are there is so many releases of MySQL?"},
+   {"9.3", "Checking a table for errors"},
+   {"9.4", "How to repair tables"},
+   {"9.5", "Is there anything special to do when upgrading/downgrading MySQL?"},
+   {"9.5.1", "Upgrading from a 3.21 version to 3.22"},
+   {"9.5.2", "Upgrading from a 3.20 version to 3.21"},
+   {"9.5.3", "Upgrading to another architecture"},
+   {"9.6", "Year 2000 compliance"},
+   {"10", "MySQL Server functions"},
+   {"10.1", "What languages are supported by MySQL?"},
+   {"10.1.1", "Character set used for data &#38; sorting"},
+   {"10.2", "The update log"},
+   {"10.3", "How big can MySQL tables be?"},
+   {"11", "Getting maximum performance from MySQL"},
+   {"11.1", "How does one change the size of MySQL buffers?"},
+   {"11.2", "How compiling and linking affects the speed of MySQL"},
+   {"11.3", "How does MySQL use memory?"},
+   {"11.4", "How does MySQL use indexes?"},
+   {"11.5", "What optimizations are done on WHERE clauses?"},
+   {"11.6", "How does MySQL open &#38; close tables?"},
+   {"11.6.0.1", "What are the drawbacks of creating possibly thousands of tables in a database?"},
+   {"11.7", "How does MySQL lock tables?"},
+   {"11.8", "How should I arrange my table to be as fast/small as possible?"},
+   {"11.9", "What affects the speed of INSERT statements?"},
+   {"11.10", "What affects the speed DELETE statements?"},
+   {"11.11", "How do I get MySQL to run at full speed?"},
+   {"11.12", "What are the different row formats? Or, when should VARCHAR/CHAR be used?"},
+   {"11.13", "Why so many open tables?"},
+   {"12", "MySQL benchmark suite"},
+   {"13", "MySQL Utilites"},
+   {"13.1", "Overview of the different MySQL programs"},
+   {"13.2", "The MySQL table check, optimize and repair program"},
+   {"13.2.1", "isamchk memory use"},
+   {"13.2.2", "Getting low-level table information"},
+   {"13.3", "The MySQL compressed read-only table generator"},
+   {"14", "Adding new functions to MySQL"},
+   {"15", "MySQL ODBC Support"},
+   {"15.1", "Operating systems supported by MyODBC"},
+   {"15.2", "How to report problems with MyODBC"},
+   {"15.3", "Programs known to work with MyODBC"},
+   {"15.4", "How to fill in the various fields in the ODBC administrator program"},
+   {"15.5", "How to get the value of an AUTO_INCREMENT column in ODBC"},
+   {"16", "Problems and common errors"},
+   {"16.1", "Some common errors when using MySQL"},
+   {"16.1.1", "MySQL server has gone away error"},
+   {"16.1.2", "Can't connect to local MySQL server error"},
+   {"16.1.3", "Out of memory error"},
+   {"16.1.4", "Packet too large error"},
+   {"16.1.5", "The table is full error"},
+   {"16.1.6", "Commands out of sync error in client"},
+   {"16.1.7", "Removing user error"},
+   {"16.2", "How MySQL handles a full disk"},
+   {"16.3", "How to run SQL commands from a text file"},
+   {"16.4", "Where MySQL stores temporary files"},
+   {"16.5", "Access denied error"},
+   {"16.6", "How to run MySQL as a normal user"},
+   {"16.7", "Problems with file permissions"},
+   {"16.8", "File not found"},
+   {"16.9", "Problems using DATE columns"},
+   {"16.10", "Case sensitivity in searches"},
+   {"16.11", "Problems with NULL values"},
+   {"17", "Solving some common problems with MySQL"},
+   {"17.1", "Database replication"},
+   {"17.2", "Database backups"},
+   {"18", "MySQL client tools and API's"},
+   {"18.1", "MySQL C API"},
+   {"18.2", "C API datatypes"},
+   {"18.3", "C API function overview"},
+   {"18.4", "C API function descriptions"},
+   {"18.4.1", "mysql_affected_rows()"},
+   {"18.4.2", "mysql_close()"},
+   {"18.4.3", "mysql_connect()"},
+   {"18.4.4", "mysql_create_db()"},
+   {"18.4.5", "mysql_data_seek()"},
+   {"18.4.6", "mysql_debug()"},
+   {"18.4.7", "mysql_drop_db()"},
+   {"18.4.8", "mysql_dump_debug_info()"},
+   {"18.4.9", "mysql_eof()"},
+   {"18.4.10", "mysql_errno()"},
+   {"18.4.11", "mysql_error()"},
+   {"18.4.12", "mysql_escape_string()"},
+   {"18.4.13", "mysql_fetch_field()"},
+   {"18.4.14", "mysql_fetch_fields()"},
+   {"18.4.15", "mysql_fetch_field_direct()"},
+   {"18.4.16", "mysql_fetch_lengths()"},
+   {"18.4.17", "mysql_fetch_row()"},
+   {"18.4.18", "mysql_field_seek()"},
+   {"18.4.19", "mysql_field_tell()"},
+   {"18.4.20", "mysql_free_result()"},
+   {"18.4.21", "mysql_get_client_info()"},
+   {"18.4.22", "mysql_get_host_info()"},
+   {"18.4.23", "mysql_get_proto_info()"},
+   {"18.4.24", "mysql_get_server_info()"},
+   {"18.4.25", "mysql_info()"},
+   {"18.4.26", "mysql_init()"},
+   {"18.4.27", "mysql_insert_id()"},
+   {"18.4.28", "mysql_kill()"},
+   {"18.4.29", "mysql_list_dbs()"},
+   {"18.4.30", "mysql_list_fields()"},
+   {"18.4.31", "mysql_list_processes()"},
+   {"18.4.32", "mysql_list_tables()"},
+   {"18.4.33", "mysql_num_fields()"},
+   {"18.4.34", "mysql_num_rows()"},
+   {"18.4.35", "mysql_query()"},
+   {"18.4.36", "mysql_real_connect()"},
+   {"18.4.37", "mysql_real_query()"},
+   {"18.4.38", "mysql_reload()"},
+   {"18.4.39", "mysql_row_tell()"},
+   {"18.4.40", "mysql_select_db()"},
+   {"18.4.41", "mysql_shutdown()"},
+   {"18.4.42", "mysql_stat()"},
+   {"18.4.43", "mysql_store_result()"},
+   {"18.4.44", "mysql_thread_id()"},
+   {"18.4.45", "mysql_use_result()"},
+   {"18.4.46", "Why is it that after mysql_query() returns success, mysql_store_result() sometimes returns NULL?"},
+   {"18.4.47", "What results can I get from a query?"},
+   {"18.4.48", "How can I get the unique ID for the last inserted row?"},
+   {"18.4.49", "Problems linking with the C API"},
+   {"18.4.50", "How to make a thread-safe client"},
+   {"18.5", "MySQL Perl API's"},
+   {"18.5.1", "DBI with DBD::mysql"},
+   {"18.5.1.1", "The DBI interface"},
+   {"18.5.1.2", "More DBI/DBD information"},
+   {"18.6", "MySQL Java connectivity (JDBC)"},
+   {"18.7", "MySQL PHP API's"},
+   {"18.8", "MySQL C++ API's"},
+   {"18.9", "MySQL Python API's"},
+   {"18.10", "MySQL TCL API's"},
+   {"19", "How MySQL compares to other databases"},
+   {"19.1", "How MySQL compares to mSQL"},
+   {"19.1.1", "How to convert mSQL tools for MySQL"},
+   {"19.1.2", "How mSQL and MySQL client/server communications protocols differ"},
+   {"19.1.3", "How mSQL 2.0 SQL syntax differs from MySQL"},
+   {"19.2", "How MySQL compares to PostgreSQL"},
+   {"A", "Some users of MySQL"},
+   {"B", "Contributed programs"},
+   {"C", "Contributors to MySQL"},
+   {"D", "MySQL change history"},
+   {"19.3", "Changes in release 3.22.x (Alpha version)"},
+   {"19.3.1", "Changes in release 3.22.7"},
+   {"19.3.2", "Changes in release 3.22.6"},
+   {"19.3.3", "Changes in release 3.22.5"},
+   {"19.3.4", "Changes in release 3.22.4"},
+   {"19.3.5", "Changes in release 3.22.3"},
+   {"19.3.6", "Changes in release 3.22.2"},
+   {"19.3.7", "Changes in release 3.22.1"},
+   {"19.3.8", "Changes in release 3.22.0"},
+   {"19.4", "Changes in release 3.21.x"},
+   {"19.4.1", "Changes in release 3.21.33"},
+   {"19.4.2", "Changes in release 3.21.32"},
+   {"19.4.3", "Changes in release 3.21.31"},
+   {"19.4.4", "Changes in release 3.21.30"},
+   {"19.4.5", "Changes in release 3.21.29"},
+   {"19.4.6", "Changes in release 3.21.28"},
+   {"19.4.7", "Changes in release 3.21.27"},
+   {"19.4.8", "Changes in release 3.21.26"},
+   {"19.4.9", "Changes in release 3.21.25"},
+   {"19.4.10", "Changes in release 3.21.24"},
+   {"19.4.11", "Changes in release 3.21.23"},
+   {"19.4.12", "Changes in release 3.21.22"},
+   {"19.4.13", "Changes in release 3.21.21a"},
+   {"19.4.14", "Changes in release 3.21.21"},
+   {"19.4.15", "Changes in release 3.21.20"},
+   {"19.4.16", "Changes in release 3.21.19"},
+   {"19.4.17", "Changes in release 3.21.18"},
+   {"19.4.18", "Changes in release 3.21.17"},
+   {"19.4.19", "Changes in release 3.21.16"},
+   {"19.4.20", "Changes in release 3.21.15"},
+   {"19.4.21", "Changes in release 3.21.14b"},
+   {"19.4.22", "Changes in release 3.21.14a"},
+   {"19.4.23", "Changes in release 3.21.13"},
+   {"19.4.24", "Changes in release 3.21.12"},
+   {"19.4.25", "Changes in release 3.21.11"},
+   {"19.4.26", "Changes in release 3.21.10"},
+   {"19.4.27", "Changes in release 3.21.9"},
+   {"19.4.28", "Changes in release 3.21.8"},
+   {"19.4.29", "Changes in release 3.21.7"},
+   {"19.4.30", "Changes in release 3.21.6"},
+   {"19.4.31", "Changes in release 3.21.5"},
+   {"19.4.32", "Changes in release 3.21.4"},
+   {"19.4.33", "Changes in release 3.21.3"},
+   {"19.4.34", "Changes in release 3.21.2"},
+   {"19.4.35", "Changes in release 3.21.0"},
+   {"19.5", "Changes in release 3.20.x"},
+   {"19.5.1", "Changes in release 3.20.18"},
+   {"19.5.2", "Changes in release 3.20.17"},
+   {"19.5.3", "Changes in release 3.20.16"},
+   {"19.5.4", "Changes in release 3.20.15"},
+   {"19.5.5", "Changes in release 3.20.14"},
+   {"19.5.6", "Changes in release 3.20.13"},
+   {"19.5.7", "Changes in release 3.20.11"},
+   {"19.5.8", "Changes in release 3.20.10"},
+   {"19.5.9", "Changes in release 3.20.9"},
+   {"19.5.10", "Changes in release 3.20.8"},
+   {"19.5.11", "Changes in release 3.20.7"},
+   {"19.5.12", "Changes in release 3.20.6"},
+   {"19.5.13", "Changes in release 3.20.3"},
+   {"19.5.14", "Changes in release 3.20.0"},
+   {"19.6", "Changes in release 3.19.x"},
+   {"19.6.1", "Changes in release 3.19.5"},
+   {"19.6.2", "Changes in release 3.19.4"},
+   {"19.6.3", "Changes in release 3.19.3"},
+   {"E", "Known errors and design deficiencies in MySQL"},
+   {"F", "List of things we want to add to MySQL in the future (The TODO)"},
+   {"19.7", "Things that must done in the real near future"},
+   {"19.8", "Things that have to be done sometime"},
+   {"19.9", "Some things we don't have any plans to do"},
+   {"G", "Comments on porting to other systems"},
+   {"19.10", "Debugging MySQL"},
+   {"19.11", "Comments about RTS threads"},
+   {"19.12", "What is the difference between different thread packages?"},
+   {"H", "Description of MySQL regular expression syntax"},
+   {"I", "What is Unireg?"},
+   {"J", "The MySQL server license"},
+   {"K", "The MySQL license for Microsoft operating systems"},
+   {"*", "SQL command, type and function index"},
+   {"*", "Concept Index"}
+};
+
+#define NQUERIES 5
+const char *query[NQUERIES]={
+   "mysql information and manual",
+   "upgrading from previous version",
+   "column indexes",
+   "against about after more right the with/without", /* stopwords test */
+   "mysql license and copyright"
+};
diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c
new file mode 100644
index 00000000000..f38990efab9
--- /dev/null
+++ b/storage/maria/ma_ft_update.c
@@ -0,0 +1,379 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* functions to work with full-text indices */
+
+#include "ma_ftdefs.h"
+#include <math.h>
+
+void _ma_ft_segiterator_init(MARIA_HA *info, uint keynr, const uchar *record,
+			     FT_SEG_ITERATOR *ftsi)
+{
+  DBUG_ENTER("_ma_ft_segiterator_init");
+  /* Prime ftsi for key keynr of info: segment count, segment pointer and the record to read */
+  ftsi->num=info->s->keyinfo[keynr].keysegs;
+  ftsi->seg=info->s->keyinfo[keynr].seg;
+  ftsi->rec=record;                     /* pos and len are not set here */
+  DBUG_VOID_RETURN;
+}
+
+void _ma_ft_segiterator_dummy_init(const uchar *record, uint len,
+				   FT_SEG_ITERATOR *ftsi)
+{
+  DBUG_ENTER("_ma_ft_segiterator_dummy_init");
+  /* One-step iterator over a raw buffer: with seg == 0, _ma_ft_segiterator() returns pos/len as set here */
+  ftsi->num=1;
+  ftsi->seg=0;
+  ftsi->pos=record;
+  ftsi->len=len;
+  DBUG_VOID_RETURN;
+}
+
+/*
+  This function breaks convention "return 0 in success"
+  but it's easier to use like this
+
+     while(_ma_ft_segiterator())
+
+  so "1" means "OK", "0" means "EOF"
+*/
+
+uint _ma_ft_segiterator(register FT_SEG_ITERATOR *ftsi)
+{
+  DBUG_ENTER("_ma_ft_segiterator");
+  /* Advance ftsi by one step and leave the current part in ftsi->pos / ftsi->len */
+  if (!ftsi->num)
+    DBUG_RETURN(0);                            /* nothing left: EOF */
+
+  ftsi->num--;
+  if (!ftsi->seg)
+    DBUG_RETURN(1);                            /* no segment info: pos/len stay as preset */
+
+  ftsi->seg--;                                 /* step back to the preceding segment */
+
+  if (ftsi->seg->null_bit &&
+      (ftsi->rec[ftsi->seg->null_pos] & ftsi->seg->null_bit))
+  {
+    ftsi->pos=0;                               /* null bit set in the record: pos == 0, len untouched */
+    DBUG_RETURN(1);
+  }
+  ftsi->pos= ftsi->rec+ftsi->seg->start;
+  if (ftsi->seg->flag & HA_VAR_LENGTH_PART)
+  {
+    uint pack_length= (ftsi->seg->bit_start);  /* size of the length prefix: 1 byte, else read as 2 */
+    ftsi->len= (pack_length == 1 ? (uint) * ftsi->pos :
+                uint2korr(ftsi->pos));
+    ftsi->pos+= pack_length;			 /* Skip VARCHAR length */
+    DBUG_RETURN(1);
+  }
+  if (ftsi->seg->flag & HA_BLOB_PART)
+  {
+    ftsi->len= _ma_calc_blob_length(ftsi->seg->bit_start,ftsi->pos);
+    memcpy_fixed((char*) &ftsi->pos, ftsi->pos+ftsi->seg->bit_start,
+		 sizeof(char*));                 /* pos becomes the pointer stored after the length bytes */
+    DBUG_RETURN(1);
+  }
+  ftsi->len=ftsi->seg->length;                 /* neither VARCHAR nor BLOB: length from the segment */
+  DBUG_RETURN(1);
+}
+
+
+/* parses a document i.e. calls maria_ft_parse for every keyseg; returns 1 on the first failure, else 0 */
+
+uint _ma_ft_parse(TREE *parsed, MARIA_HA *info, uint keynr, const uchar *record,
+                  MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
+{
+  FT_SEG_ITERATOR ftsi;
+  struct st_mysql_ftparser *parser;
+  DBUG_ENTER("_ma_ft_parse");
+
+  _ma_ft_segiterator_init(info, keynr, record, &ftsi);
+
+  maria_ft_parse_init(parsed, info->s->keyinfo[keynr].seg->charset);
+  parser= info->s->keyinfo[keynr].parser;
+  while (_ma_ft_segiterator(&ftsi))
+  {
+    /** @todo this casts ftsi.pos (const) to non-const */
+    if (ftsi.pos)                    /* pos == 0 is what the iterator reports for a NULL segment */
+      if (maria_ft_parse(parsed, (uchar *)ftsi.pos, ftsi.len, parser, param,
+                         mem_root))
+        DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+FT_WORD * _ma_ft_parserecord(MARIA_HA *info, uint keynr, const uchar *record,
+                             MEM_ROOT *mem_root)
+{
+  TREE ptree;                    /* filled by _ma_ft_parse(), then handed to maria_ft_linearize() */
+  MYSQL_FTPARSER_PARAM *param;
+  DBUG_ENTER("_ma_ft_parserecord");
+  if (! (param= maria_ftparser_call_initializer(info, keynr, 0)))
+    DBUG_RETURN(NULL);           /* no parser parameter block available */
+  bzero((char*) &ptree, sizeof(ptree));
+  param->flags= 0;
+  if (_ma_ft_parse(&ptree, info, keynr, record, param, mem_root))
+    DBUG_RETURN(NULL);           /* parsing one of the segments failed */
+
+  DBUG_RETURN(maria_ft_linearize(&ptree, mem_root));  /* result for record's key keynr */
+}
+
+static int _ma_ft_store(MARIA_HA *info, uint keynr, uchar *keybuf,
+			FT_WORD *wlist, my_off_t filepos)
+{
+  DBUG_ENTER("_ma_ft_store");
+  /* Write one key per word of wlist (the list ends at pos == 0); returns 1 at the first failed write */
+  for (; wlist->pos; wlist++)
+  {
+    MARIA_KEY key;
+    _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos);
+    if (_ma_ck_write(info, &key))
+      DBUG_RETURN(1);
+   }
+   DBUG_RETURN(0);
+}
+
+static int _ma_ft_erase(MARIA_HA *info, uint keynr, uchar *keybuf,
+			FT_WORD *wlist, my_off_t filepos)
+{
+  uint err=0;
+  DBUG_ENTER("_ma_ft_erase");
+  /* Delete the key of every word in wlist; unlike _ma_ft_store() it keeps going after a failure */
+  for (; wlist->pos; wlist++)
+  {
+    MARIA_KEY key;
+    _ma_ft_make_key(info, &key, keynr, keybuf, wlist, filepos);
+    if (_ma_ck_delete(info, &key))
+      err=1;                     /* remember the failure, continue with the next word */
+   }
+   DBUG_RETURN(err);
+}
+
+/*
+  Compares the appropriate parts of two WORD_KEY keys directly out of records
+  returns 1 if they are different
+*/
+
+#define THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT 1
+#define GEE_THEY_ARE_ABSOLUTELY_IDENTICAL	 0
+
+int _ma_ft_cmp(MARIA_HA *info, uint keynr, const uchar *rec1, const uchar *rec2)
+{
+  FT_SEG_ITERATOR ftsi1, ftsi2;
+  CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+  DBUG_ENTER("_ma_ft_cmp");
+  /* Both iterators are set up on the same key, so they step through the segments in lockstep */
+  _ma_ft_segiterator_init(info, keynr, rec1, &ftsi1);
+  _ma_ft_segiterator_init(info, keynr, rec2, &ftsi2);
+
+  while (_ma_ft_segiterator(&ftsi1) && _ma_ft_segiterator(&ftsi2))
+  {
+    if ((ftsi1.pos != ftsi2.pos) &&            /* equal pointers (including both 0) count as equal */
+        (!ftsi1.pos || !ftsi2.pos ||           /* exactly one side is 0: different */
+         ha_compare_text(cs, ftsi1.pos,ftsi1.len,
+                         ftsi2.pos,ftsi2.len,0,0)))
+      DBUG_RETURN(THOSE_TWO_DAMN_KEYS_ARE_REALLY_DIFFERENT);
+  }
+  DBUG_RETURN(GEE_THEY_ARE_ABSOLUTELY_IDENTICAL);
+}
+
+
+/* update a document entry: only words that differ between oldrec and newrec are deleted/written */
+
+int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+                  const uchar *oldrec, const uchar *newrec, my_off_t pos)
+{
+  int error= -1;
+  FT_WORD *oldlist,*newlist, *old_word, *new_word;
+  CHARSET_INFO *cs=info->s->keyinfo[keynr].seg->charset;
+  int cmp, cmp2;
+  DBUG_ENTER("_ma_ft_update");
+
+  if (!(old_word=oldlist=_ma_ft_parserecord(info, keynr, oldrec,
+                                            &info->ft_memroot)) ||
+      !(new_word=newlist=_ma_ft_parserecord(info, keynr, newrec,
+                                            &info->ft_memroot)))
+    goto err;
+  /* Walk both lists in step; assumes both are ordered by word -- TODO confirm against maria_ft_linearize() */
+  error=0;
+  while(old_word->pos && new_word->pos)
+  {
+    cmp= ha_compare_text(cs, (uchar*) old_word->pos,old_word->len,
+                             (uchar*) new_word->pos,new_word->len,0,0);
+    cmp2= cmp ? 0 : (fabs(old_word->weight - new_word->weight) > 1.e-5);
+
+    if (cmp < 0 || cmp2)         /* old word sorts first, or same word with changed weight: delete old key */
+    {
+      MARIA_KEY key;
+      _ma_ft_make_key(info, &key, keynr, keybuf, old_word, pos);
+      if (_ma_ck_delete(info, &key))
+      {
+        error= -1;
+        goto err;
+      }
+    }
+    if (cmp > 0 || cmp2)         /* new word sorts first, or same word with changed weight: write new key */
+    {
+      MARIA_KEY key;
+      _ma_ft_make_key(info, &key, keynr, keybuf, new_word,pos);
+      if ((error= _ma_ck_write(info, &key)))
+        goto err;
+    }
+    if (cmp<=0) old_word++;
+    if (cmp>=0) new_word++;
+ }
+ if (old_word->pos)              /* old list has leftovers: erase them */
+   error= _ma_ft_erase(info,keynr,keybuf,old_word,pos);
+ else if (new_word->pos)         /* new list has leftovers: store them */
+   error= _ma_ft_store(info,keynr,keybuf,new_word,pos);
+
+err:
+  free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));  /* reached on success and on failure */
+  DBUG_RETURN(error);
+}
+
+
+/* adds a document to the collection; returns -1 if the record cannot be parsed, else the _ma_ft_store() result */
+
+int _ma_ft_add(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+	       my_off_t pos)
+{
+  int error= -1;
+  FT_WORD *wlist;
+  DBUG_ENTER("_ma_ft_add");
+  DBUG_PRINT("enter",("keynr: %d",keynr));
+
+  if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+    error= _ma_ft_store(info,keynr,keybuf,wlist,pos);
+  free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));  /* done whether or not parsing succeeded */
+  DBUG_PRINT("exit",("Return: %d",error));
+  DBUG_RETURN(error);
+}
+
+
+/* removes a document from the collection; returns -1 if the record cannot be parsed, else the _ma_ft_erase() result */
+
+int _ma_ft_del(MARIA_HA *info, uint keynr, uchar *keybuf, const uchar *record,
+	       my_off_t pos)
+{
+  int error= -1;
+  FT_WORD *wlist;
+  DBUG_ENTER("_ma_ft_del");
+  DBUG_PRINT("enter",("keynr: %d",keynr));
+
+  if ((wlist= _ma_ft_parserecord(info, keynr, record, &info->ft_memroot)))
+    error= _ma_ft_erase(info,keynr,keybuf,wlist,pos);
+  free_root(&info->ft_memroot, MYF(MY_MARK_BLOCKS_FREE));  /* done whether or not parsing succeeded */
+  DBUG_PRINT("exit",("Return: %d",error));
+  DBUG_RETURN(error);
+}
+
+
+MARIA_KEY *_ma_ft_make_key(MARIA_HA *info, MARIA_KEY *key, uint keynr,
+                           uchar *keybuf,
+                           FT_WORD *wptr, my_off_t filepos)
+{
+  uchar buf[HA_FT_MAXBYTELEN+16];
+  DBUG_ENTER("_ma_ft_make_key");
+  /* buf: float weight (0 if filepos is HA_OFFSET_ERROR), 2-byte length at HA_FT_WLEN, then the word bytes */
+#if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
+  {
+    float weight=(float) ((filepos==HA_OFFSET_ERROR) ? 0 : wptr->weight);
+    mi_float4store(buf,weight);
+  }
+#else
+#error
+#endif
+
+  int2store(buf+HA_FT_WLEN,wptr->len);
+  memcpy(buf+HA_FT_WLEN+2,wptr->pos,wptr->len);  /* assumes len fits in buf -- TODO confirm bound */
+  /* Can't be spatial so it's ok to call _ma_make_key directly here */
+  DBUG_RETURN(_ma_make_key(info, key, keynr, keybuf, buf, filepos, 0));
+}
+
+
+/*
+  convert key value to ft2: keys gathered in info->ft1_to_ft2 get a tree of their own.
+  Returns 1 if a step fails, else the result of the final word key write */
+
+my_bool _ma_ft_convert_to_ft2(MARIA_HA *info, MARIA_KEY *key)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t root;
+  DYNAMIC_ARRAY *da=info->ft1_to_ft2;
+  MARIA_KEYDEF *keyinfo=&share->ft2_keyinfo;
+  uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end;
+  uint length, key_length;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEY tmp_key;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_ft_convert_to_ft2");
+
+  /* we'll generate one pageful at once, and insert the rest one-by-one */
+  /* calculating the length of this page ...*/
+  length=(keyinfo->block_length-2) / keyinfo->keylength;
+  set_if_smaller(length, da->elements);
+  length=length * keyinfo->keylength;
+
+  get_key_full_length_rdonly(key_length, key->data);
+  while (_ma_ck_delete(info, key) == 0)
+  {
+    /*
+      nothing to do here.
+      _ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys
+    */
+  }
+  /* NOTE(review): key_ptr and length were taken before these deletes added to the array -- confirm still valid */
+  /* creating pageful of keys */
+  bzero(info->buff, share->keypage_header);
+  _ma_store_keynr(share, info->buff, keyinfo->key_nr);
+  _ma_store_page_used(share, info->buff, length + share->keypage_header);
+  memcpy(info->buff + share->keypage_header, key_ptr, length);
+  info->keyread_buff_used= info->page_changed=1;      /* info->buff is used */
+  /**
+    @todo RECOVERY BUG this is not logged yet. Ok as this code is never
+    called, but soon it will be.
+  */
+  if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR)
+    DBUG_RETURN(1);
+
+  _ma_page_setup(&page, info, keyinfo, root, info->buff);
+  if (_ma_write_keypage(&page, page_link->write_lock, DFLT_INIT_HITS))
+    DBUG_RETURN(1);
+
+  /* inserting the rest of key values: tmp_key describes one array element in terms of the ft2 keyinfo */
+  end= (uchar*) dynamic_array_ptr(da, da->elements);
+  tmp_key.keyinfo= keyinfo;
+  tmp_key.data_length= keyinfo->keylength;
+  tmp_key.ref_length= 0;
+  tmp_key.flag= 0;
+  for (key_ptr+=length; key_ptr < end; key_ptr+=keyinfo->keylength)
+  {
+    tmp_key.data= key_ptr;
+    if (_ma_ck_real_write_btree(info, &tmp_key, &root, SEARCH_SAME))
+      DBUG_RETURN(1);
+  }
+
+  /* now, writing the word key entry */
+  ft_intXstore(key->data + key_length, - (int) da->elements);
+  _ma_dpointer(share, key->data + key_length + HA_FT_WLEN, root);
+
+  DBUG_RETURN(_ma_ck_real_write_btree(info, key,
+                                      &share->state.key_root[key->keyinfo->
+                                                             key_nr],
+                                      SEARCH_SAME));
+}
diff --git a/storage/maria/ma_ftdefs.h b/storage/maria/ma_ftdefs.h
new file mode 100644
index 00000000000..4ce4e9e22ba
--- /dev/null
+++ b/storage/maria/ma_ftdefs.h
@@ -0,0 +1,156 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "ma_fulltext.h"
+#include <m_ctype.h>
+#include <my_tree.h>
+#include <queues.h>
+#include <mysql/plugin.h>
+
+#define true_word_char(ctype, character) \
+                      ((ctype) & (_MY_U | _MY_L | _MY_NMR) || \
+                       (character) == '_')
+#define misc_word_char(X)	0
+
+#define FT_MAX_WORD_LEN_FOR_SORT 31
+
+#define FTPARSER_MEMROOT_ALLOC_SIZE 65536
+
+#define COMPILE_STOPWORDS_IN
+
+/* Interested readers may consult SMART
+   (ftp://ftp.cs.cornell.edu/pub/smart/smart.11.0.tar.Z)
+   for an excellent implementation of vector space model we use.
+   It also demonstrates the usage of different weighting techniques.
+   This code, though, is completely original and is not based on the
+   SMART code but was in some cases inspired by it.
+
+   NORM_PIVOT was taken from the article
+   A.Singhal, C.Buckley, M.Mitra, "Pivoted Document Length Normalization",
+   ACM SIGIR'96, 21-29, 1996
+ */
+
+#define LWS_FOR_QUERY					  LWS_TF
+#define LWS_IN_USE					 LWS_LOG
+#define PRENORM_IN_USE				     PRENORM_AVG
+#define NORM_IN_USE				      NORM_PIVOT
+#define GWS_IN_USE					GWS_PROB
+/*==============================================================*/
+#define LWS_TF						  (count)
+#define LWS_BINARY					(count>0)
+#define LWS_SQUARE				    (count*count)
+#define LWS_LOG				 (count?(log( (double) count)+1):0)
+/*--------------------------------------------------------------*/
+#define PRENORM_NONE				      (p->weight)
+#define PRENORM_MAX			  (p->weight/docstat.max)
+#define PRENORM_AUG		  (0.4+0.6*p->weight/docstat.max)
+#define PRENORM_AVG	     (p->weight/docstat.sum*docstat.uniq)
+#define PRENORM_AVGLOG ((1+log(p->weight))/(1+log(docstat.sum/docstat.uniq)))
+/*--------------------------------------------------------------*/
+#define NORM_NONE					      (1)
+#define NORM_SUM				   (docstat.nsum)
+#define NORM_COS			    (sqrt(docstat.nsum2))
+
+#define PIVOT_VAL (0.0115)
+#define NORM_PIVOT  (1+PIVOT_VAL*docstat.uniq)
+/*---------------------------------------------------------------*/
+#define GWS_NORM				     (1/sqrt(sum2))
+#define GWS_GFIDF				      (sum/doc_cnt)
+/* Mysterious, but w/o (double) GWS_IDF performs better :-o */
+#define GWS_IDF		   log(aio->info->state->records/doc_cnt)
+#define GWS_IDF1	   log((double)aio->info->state->records/doc_cnt)
+#define GWS_PROB ((aio->info->state->records > doc_cnt) ? log(((double)(aio->info->state->records-doc_cnt))/doc_cnt) : 0 )
+#define GWS_FREQ					(1.0/doc_cnt)
+#define GWS_SQUARED pow(log((double)aio->info->state->records/doc_cnt),2)
+#define GWS_CUBIC   pow(log((double)aio->info->state->records/doc_cnt),3)
+#define GWS_ENTROPY (1-(suml/sum-log(sum))/log(aio->info->state->records))
+/*=================================================================*/
+
+/* Boolean search operators */
+#define FTB_YES   (ft_boolean_syntax[0])
+#define FTB_EGAL  (ft_boolean_syntax[1])
+#define FTB_NO    (ft_boolean_syntax[2])
+#define FTB_INC   (ft_boolean_syntax[3])
+#define FTB_DEC   (ft_boolean_syntax[4])
+#define FTB_LBR   (ft_boolean_syntax[5])
+#define FTB_RBR   (ft_boolean_syntax[6])
+#define FTB_NEG   (ft_boolean_syntax[7])
+#define FTB_TRUNC (ft_boolean_syntax[8])
+#define FTB_LQUOT (ft_boolean_syntax[10])
+#define FTB_RQUOT (ft_boolean_syntax[11])
+
+typedef struct st_maria_ft_word {
+  const uchar * pos;   /* presumably start of the word's bytes - TODO confirm */
+  uint	 len;           /* presumably byte length of the word - TODO confirm */
+  double weight;       /* word weight; cf. PRENORM_* macros reading p->weight */
+} FT_WORD;
+
+int is_stopword(char *word, uint len);
+
+MARIA_KEY *_ma_ft_make_key(MARIA_HA *, MARIA_KEY *, uint , uchar *, FT_WORD *,
+                           my_off_t);
+
+uchar maria_ft_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+                        FT_WORD *, MYSQL_FTPARSER_BOOLEAN_INFO *);
+uchar maria_ft_simple_get_word(CHARSET_INFO *, const uchar **, const uchar *,
+                               FT_WORD *, my_bool);
+
+typedef struct _st_maria_ft_seg_iterator {
+  uint        num, len;   /* NOTE(review): presumably segs left / current length */
+  HA_KEYSEG  *seg;        /* presumably the key segment being visited - confirm */
+  const uchar *rec, *pos; /* presumably source record / current data - confirm */
+} FT_SEG_ITERATOR;
+
+void _ma_ft_segiterator_init(MARIA_HA *, uint, const uchar *, FT_SEG_ITERATOR *);
+void _ma_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
+uint _ma_ft_segiterator(FT_SEG_ITERATOR *);
+
+void maria_ft_parse_init(TREE *, CHARSET_INFO *);
+int maria_ft_parse(TREE *, uchar *, size_t, struct st_mysql_ftparser *parser,
+             MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+FT_WORD * maria_ft_linearize(TREE *, MEM_ROOT *);
+FT_WORD * _ma_ft_parserecord(MARIA_HA *, uint, const uchar *, MEM_ROOT *);
+uint _ma_ft_parse(TREE *, MARIA_HA *, uint, const uchar *,
+                  MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
+
+FT_INFO *maria_ft_init_nlq_search(MARIA_HA *, uint, uchar *, size_t, uint,
+                                  uchar *);
+FT_INFO *maria_ft_init_boolean_search(MARIA_HA *, uint, uchar *, size_t,
+                                      CHARSET_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_nlq;
+int maria_ft_nlq_read_next(FT_INFO *, char *);
+float maria_ft_nlq_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_nlq_close_search(FT_INFO *);
+float maria_ft_nlq_get_relevance(FT_INFO *);
+my_off_t maria_ft_nlq_get_docid(FT_INFO *);
+void maria_ft_nlq_reinit_search(FT_INFO *);
+
+extern const struct _ft_vft _ma_ft_vft_boolean;
+int maria_ft_boolean_read_next(FT_INFO *, char *);
+float maria_ft_boolean_find_relevance(FT_INFO *, uchar *, uint);
+void maria_ft_boolean_close_search(FT_INFO *);
+float maria_ft_boolean_get_relevance(FT_INFO *);
+my_off_t maria_ft_boolean_get_docid(FT_INFO *);
+void maria_ft_boolean_reinit_search(FT_INFO *);
+MYSQL_FTPARSER_PARAM* maria_ftparser_alloc_param(MARIA_HA *info);
+extern MYSQL_FTPARSER_PARAM *maria_ftparser_call_initializer(MARIA_HA *info,
+                                                             uint keynr,
+                                                             uint paramnr);
+extern void maria_ftparser_call_deinitializer(MARIA_HA *info);
diff --git a/storage/maria/ma_fulltext.h b/storage/maria/ma_fulltext.h
new file mode 100644
index 00000000000..6e087990bd2
--- /dev/null
+++ b/storage/maria/ma_fulltext.h
@@ -0,0 +1,27 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/* some definitions for full-text indices */
+
+#include "maria_def.h"
+#include "ft_global.h"
+
+int  _ma_ft_cmp(MARIA_HA *, uint, const uchar *, const uchar *);
+int  _ma_ft_add(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+int  _ma_ft_del(MARIA_HA *, uint, uchar *, const uchar *, my_off_t);
+
+my_bool _ma_ft_convert_to_ft2(MARIA_HA *, MARIA_KEY *);
diff --git a/storage/maria/ma_info.c b/storage/maria/ma_info.c
new file mode 100644
index 00000000000..1bbfa3cbf7e
--- /dev/null
+++ b/storage/maria/ma_info.c
@@ -0,0 +1,142 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Return useful base information for an open table */
+
+#include "maria_def.h"
+#ifdef	__WIN__
+#include <sys/stat.h>
+#endif
+
+	/* Get position to last record */
+
+MARIA_RECORD_POS maria_position(MARIA_HA *info)
+{
+  return info->cur_row.lastpos;         /* lastpos of the handler's current row */
+}
+
+
+/* Get information about the table */
+/* If flag == 2 one gets the current info (no sync from database) */
+
+int maria_status(MARIA_HA *info, register MARIA_INFO *x, uint flag)
+{
+  MY_STAT st;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_status");
+
+  x->recpos= info->cur_row.lastpos;             /* Filled in for every flag */
+  if (flag == HA_STATUS_POS)
+    DBUG_RETURN(0);                             /* Compatible with ISAM */
+  if ((flag & HA_STATUS_NO_LOCK) == 0)
+  {
+    pthread_mutex_lock(&share->intern_lock);
+    VOID(_ma_readinfo(info, F_RDLCK, 0));
+    fast_ma_writeinfo(info);
+    pthread_mutex_unlock(&share->intern_lock);
+  }
+  if (flag & HA_STATUS_VARIABLE)
+  {
+    x->records= info->state->records;
+    x->deleted= share->state.state.del;
+    x->delete_length= share->state.state.empty;
+    x->data_file_length= share->state.state.data_file_length;
+    x->index_file_length= share->state.state.key_file_length;
+    x->keys= share->state.header.keys;
+    x->check_time= share->state.check_time;
+    if (x->records)
+      x->mean_reclength= (ulong) ((x->data_file_length - x->delete_length) /
+                                  x->records);
+    else
+      x->mean_reclength= (ulong) share->min_pack_length;
+  }
+  if (flag & HA_STATUS_ERRKEY)
+  {
+    x->errkey= info->errkey;
+    x->dup_key_pos= info->dup_key_pos;
+  }
+  if (flag & HA_STATUS_CONST)
+  {
+    x->reclength= share->base.reclength;
+    x->max_data_file_length= share->base.max_data_file_length;
+    x->max_index_file_length= share->base.max_key_file_length;
+    x->filenr= info->dfile.file;
+    x->options= share->options;
+    x->create_time= share->state.create_time;
+    x->reflength= maria_get_pointer_length(share->base.max_data_file_length,
+                                           maria_data_pointer_size);
+    x->record_offset= (share->data_file_type == STATIC_RECORD ?
+                       share->base.pack_reclength : 0);
+    x->sortkey= -1;                             /* No clustering */
+    x->rec_per_key= share->state.rec_per_key_part;
+    x->key_map= share->state.key_map;
+    x->data_file_name= share->data_file_name.str;
+    x->index_file_name= share->index_file_name.str;
+    x->data_file_type= share->data_file_type;
+  }
+  x->update_time= 0;                    /* Kept unless asked for and fstat is ok */
+  if ((flag & HA_STATUS_TIME) && !my_fstat(info->dfile.file, &st, MYF(0)))
+    x->update_time= st.st_mtime;
+  if (flag & HA_STATUS_AUTO)
+  {
+    x->auto_increment= share->state.auto_increment + 1;
+    if (x->auto_increment == 0)                 /* This shouldn't happen */
+      x->auto_increment= ~(ulonglong) 0;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write a message to the error log.
+
+  SYNOPSIS
+    _ma_report_error()
+    errcode                     Error number.
+    name                        Name of table file (e.g. index_file_name).
+
+  DESCRIPTION
+    This function supplies my_error() with a table name. Most error
+    messages need one. Since string arguments in error messages are limited
+    to 64 characters by convention, we ensure that in case of truncation,
+    that the end of the index file path is in the message. This contains
+    the most valuable information (the table name and the database name).
+
+  RETURN
+    void
+*/
+
+void _ma_report_error(int errcode, const LEX_STRING *name)
+{
+  const char *shown= name->str;
+  size_t remaining= name->length;
+  DBUG_ENTER("_ma_report_error");
+  DBUG_PRINT("enter",("errcode %d, table '%s'", errcode, shown));
+
+  if (remaining > 64)
+  {
+    /* Too long for the message: start by dropping the directory part */
+    size_t dir_length= dirname_length(shown);
+    shown+= dir_length;
+    remaining-= dir_length;
+
+    /* If that was not enough, keep only the last 64 characters */
+    if (remaining > 64)
+      shown+= remaining - 64;
+  }
+
+  my_error(errcode, MYF(ME_NOREFRESH), shown);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_init.c b/storage/maria/ma_init.c
new file mode 100644
index 00000000000..902f06d93e5
--- /dev/null
+++ b/storage/maria/ma_init.c
@@ -0,0 +1,184 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Initialize a maria-database */
+
+#include "maria_def.h"
+#include <ft_global.h>
+#include "ma_blockrec.h"
+#include "trnman_public.h"
+#include "ma_checkpoint.h"
+#include <hash.h>
+
+void history_state_free(MARIA_STATE_HISTORY_CLOSED *closed_history)
+{
+  MARIA_STATE_HISTORY *node= closed_history->state_history;
+
+  /*
+    Free all remaining history entries. In case of maria_open() the list
+    should be empty, as the history is moved to handler->share.
+  */
+  while (node)
+  {
+    MARIA_STATE_HISTORY *following= node->next; /* Read before node is freed */
+    my_free(node, MYF(0));
+    node= following;
+  }
+  my_free(closed_history, MYF(0));
+}
+
+
+static int dummy_maria_create_trn_hook(MARIA_HA *info __attribute__((unused)))
+{
+  return 0;                             /* Placeholder hook: info is not touched */
+}
+
+/*
+  Initialize maria
+
+  SYNOPSIS
+    maria_init()
+
+  TODO
+    Open log files and do recovery if need
+
+  RETURN
+  0  ok
+  #  error number
+*/
+
+int maria_init(void)
+{
+  DBUG_ASSERT(maria_block_size &&
+              maria_block_size % MARIA_MIN_KEY_BLOCK_LENGTH == 0);
+  if (!maria_inited)                    /* Skipped when already initialized */
+  {
+    maria_inited= TRUE;
+    pthread_mutex_init(&THR_LOCK_maria,MY_MUTEX_INIT_SLOW);
+    _ma_init_block_record_data();
+    trnman_end_trans_hook= _ma_trnman_end_trans_hook;
+    maria_create_trn_hook= dummy_maria_create_trn_hook; /* No-op default */
+    my_handler_error_register();
+  }
+  hash_init(&maria_stored_state, &my_charset_bin, 32, /* NOTE: not guarded */
+            0, sizeof(LSN), 0, (hash_free_key) history_state_free, 0);
+  DBUG_PRINT("info",("dummy_transaction_object: %p",
+                     &dummy_transaction_object));
+  return 0;                             /* This function never reports an error */
+}
+
+
+void maria_end(void)
+{
+  TrID max_trid;
+
+  if (!maria_inited)
+    return;                                     /* Nothing to shut down */
+  maria_inited= maria_multi_threaded= FALSE;
+  ft_free_stopwords();
+  ma_checkpoint_end();
+  if (translog_status == TRANSLOG_OK)
+  {
+    translog_soft_sync_end();
+    translog_sync();
+  }
+  max_trid= trnman_get_max_trid();
+  if (max_trid > max_trid_in_control_file)
+  {
+    /*
+      Remember the max transaction id in the control file: logs may be
+      removed by the user, and maria_chk cannot process REDOs to find it.
+    */
+    (void) ma_control_file_write_and_force(last_checkpoint_lsn, last_logno,
+                                           max_trid, recovery_failures);
+  }
+  trnman_destroy();
+  if (translog_status == TRANSLOG_OK)
+    translog_destroy();
+  end_pagecache(maria_log_pagecache, TRUE);
+  end_pagecache(maria_pagecache, TRUE);
+  ma_control_file_end();
+  pthread_mutex_destroy(&THR_LOCK_maria);
+  hash_free(&maria_stored_state);
+}
+
+/**
+   Upgrade from older Aria versions:
+
+  - In MariaDB 5.1, the name of the control file and log files had the
+    'maria' prefix, now they have the 'aria' prefix.
+
+  @return: 0 ok
+           1 error
+
+*/
+
+my_bool maria_upgrade()
+{
+  char name[FN_REFLEN], new_name[FN_REFLEN];
+  MY_DIR *dir;
+  uint i;
+  DBUG_ENTER("maria_upgrade");
+
+  /* Look for a control file that still carries the old 'maria' prefix */
+  fn_format(name, "maria_log_control", maria_data_root, "", MYF(MY_WME));
+  if (my_access(name, F_OK))
+    DBUG_RETURN(0);                             /* Nothing to upgrade */
+
+  /*
+    Old style control file found; Rename the control file and the log files.
+    We start by renaming all log files, so that if we get a crash
+    we will continue from where we left.
+  */
+  if (!(dir= my_dir(maria_data_root, MYF(MY_WME))))
+    DBUG_RETURN(1);
+
+  my_message(HA_ERR_INITIALIZATION,
+             "Found old style Maria log files; "
+             "Converting them to Aria names",
+             MYF(ME_JUST_INFO));
+
+  for (i= 0; i < dir->number_off_files; i++)
+  {
+    char old_logname[FN_REFLEN], new_logname[FN_REFLEN];
+    const char *file= dir->dir_entry[i].name;
+    uint pos;
+
+    /* Only "maria_log." followed by exactly eight digits is a log file */
+    if (strncmp(file, "maria_log.", 10))
+      continue;
+    for (pos= 10; pos < 18 && file[pos] >= '0' && file[pos] <= '9'; pos++)
+    {}
+    if (pos != 18 || file[18] != '\0')
+      continue;
+
+    /* Remove the 'm' in 'maria' */
+    fn_format(old_logname, file, maria_data_root, "", MYF(0));
+    fn_format(new_logname, file+1, maria_data_root, "", MYF(0));
+    if (my_rename(old_logname, new_logname, MYF(MY_WME)))
+    {
+      my_dirend(dir);
+      DBUG_RETURN(1);
+    }
+  }
+  my_dirend(dir);
+
+  /* All log files are renamed; the control file itself goes last */
+  fn_format(new_name, CONTROL_FILE_BASE_NAME, maria_data_root, "", MYF(0));
+  if (my_rename(name, new_name, MYF(MY_WME)))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(0);
+}
diff --git a/storage/maria/ma_key.c b/storage/maria/ma_key.c
new file mode 100644
index 00000000000..ac23bf5fef6
--- /dev/null
+++ b/storage/maria/ma_key.c
@@ -0,0 +1,775 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Functions to handle keys */
+
+#include "maria_def.h"
+#include "m_ctype.h"
+#include "ma_sp_defs.h"
+#include "ma_blockrec.h"                        /* For ROW_FLAG_TRANSID */
+#include "trnman.h"
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>
+#endif
+
+#define CHECK_KEYS                              /* Enable safety checks */
+
+static int _ma_put_key_in_record(MARIA_HA *info, uint keynr,
+                                 my_bool unpack_blobs, uchar *record);
+
+#define FIX_LENGTH(cs, pos, length, char_length)                            \
+            do { /* Updates char_length in place; result is never > length */ \
+              if (length > char_length)                                     \
+                char_length= (uint) my_charpos(cs, pos, pos+length, char_length); \
+              set_if_smaller(char_length,length);                           \
+            } while(0)
+
+
+/**
+  Store trid in a packed format as part of a key
+
+  @fn    transid_store_packed
+  @param info   Maria handler
+  @param to     End of key to which we should store a packed transid
+  @param trid   Trid to be stored
+
+  @notes
+
+  Keys that have a transid have the lowest bit set for the last byte of the key
+  This function sets this bit for the key.
+
+  Trid is max 6 bytes long
+
+  First the trid is converted to a smaller number by using
+  trid= trid - create_trid.
+  The trid is then shifted up one bit so that we can use the
+  lowest bit as a marker if it's followed by another trid.
+
+  Trid is then stored as follows:
+
+  if trid < 256-12
+    one byte
+  else
+    one byte prefix length_of_trid_in_bytes + 249 followed by data
+    in high-byte-first order
+
+  Prefix bytes 244 to 249 are reserved for negative transid, that can be used
+  when we pack transid relative to each other on a key block.
+
+  We have to store transid in high-byte-first order so that we can compare
+  them unpacked byte per byte and as soon we find a difference we know
+  which is smaller.
+
+  For example, assuming we have the following data:
+
+  key_data:               1                (4 byte integer)
+  pointer_to_row:         2 << 8 + 3 = 515 (page 2, row 3)
+  table_create_transid    1000             Defined at create table time and
+                                           stored in table definition
+  transid                 1010	           Transaction that created row
+  delete_transid          2011             Transaction that deleted row
+
+  In addition we assume the table is created with a data pointer length
+  of 4 bytes (this is automatically calculated based on the medium
+  length of rows and the given max number of rows)
+
+  The binary data for the key would then look like this in hex:
+
+  00 00 00 01     Key data (1 stored high byte first)
+  00 00 04 07     (515 << 1) + 1         ;  The last 1 is marker that key cont.
+  15              ((1010-1000) << 1) + 1 ;  The last 1 is marker that key cont.
+  FB 07 E6        Length byte (FB = 249 + 2 means 2 bytes) and
+                  ((2011 - 1000) << 1) = 07 E6
+*/
+
+uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid)
+{
+  ulonglong rest;
+  uint length, idx;
+  DBUG_ASSERT(trid < (LL(1) << (MARIA_MAX_PACK_TRANSID_SIZE*8)));
+  DBUG_ASSERT(trid >= info->s->state.create_trid);
+
+  /* Pack relative to create_trid; the low bit is left free as a marker */
+  trid= (trid - info->s->state.create_trid) << 1;
+
+  /* Mark that key contains transid */
+  to[-1]|= 1;
+
+  if (trid < MARIA_MIN_TRANSID_PACK_OFFSET)
+  {
+    to[0]= (uchar) trid;                        /* Fits in a single byte */
+    return 1;
+  }
+
+  /* Count how many bytes are needed to hold the value */
+  length= 0;
+  rest= trid;
+  do
+  {
+    length++;
+    rest>>= 8;
+  } while (rest);
+
+  /*
+    Length prefix first, then the value in high-byte-first order:
+    the lowest byte lands in to[length], the highest one in to[1]
+  */
+  to[0]= (uchar) (length + MARIA_TRANSID_PACK_OFFSET);
+  for (idx= length; idx > 0; idx--)
+  {
+    to[idx]= (uchar) trid;
+    trid>>= 8;
+  }
+  return length + 1;
+}
+
+
+/**
+   Read packed transid
+
+   @fn    transid_get_packed
+   @param info   Maria handler
+   @param from	 Transid is stored here
+
+   See transid_store_packed() for how transid is packed
+
+*/
+
+ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from)
+{
+  ulonglong packed= (ulonglong) from[0];
+
+  if (from[0] >= MARIA_MIN_TRANSID_PACK_OFFSET)
+  {
+    /* from[0] is a length prefix; the value follows, high byte first */
+    uint left= (uint) (from[0] - MARIA_TRANSID_PACK_OFFSET);
+    const uchar *byte= from + 1;
+
+    packed= (ulonglong) *byte++;
+    while (--left)
+      packed= (packed << 8) + ((ulonglong) *byte++);
+  }
+
+  /* Drop the marker bit and make the id absolute again */
+  return (packed >> 1) + share->state.create_trid;
+}
+
+
+/*
+  Make a normal (not spatial or fulltext) intern key from a record
+
+  SYNOPSIS
+    _ma_make_key()
+    info		MARIA handler
+    int_key		Store created key here
+    keynr		key number
+    key			Buffer used to store key data
+    record		Record
+    filepos		Position to record in the data file
+
+  NOTES
+    This is used to generate keys from the record on insert, update and delete
+
+  RETURN
+    key
+*/
+
+MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
+                        uchar *key, const uchar *record,
+                        MARIA_RECORD_POS filepos, ulonglong trid)
+{
+  const uchar *pos;
+  reg1 HA_KEYSEG *keyseg;
+  my_bool is_ft;
+  DBUG_ENTER("_ma_make_key");
+
+  int_key->data= key;                           /* Key is built in 'key' buffer */
+  int_key->flag= 0;                             /* Always return full key */
+  int_key->keyinfo= info->s->keyinfo + keynr;
+
+  is_ft= int_key->keyinfo->flag & HA_FULLTEXT;
+  for (keyseg= int_key->keyinfo->seg ; keyseg->type ;keyseg++)
+  {
+    enum ha_base_keytype type=(enum ha_base_keytype) keyseg->type;
+    uint length=keyseg->length;
+    uint char_length;
+    CHARSET_INFO *cs=keyseg->charset;
+
+    if (keyseg->null_bit)
+    {
+      if (record[keyseg->null_pos] & keyseg->null_bit)
+      {
+	*key++= 0;				/* NULL in key */
+	continue;
+      }
+      *key++=1;					/* Not NULL */
+    }
+
+    char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+                  length);
+
+    pos= record+keyseg->start;                  /* Segment data in the record */
+    if (type == HA_KEYTYPE_BIT)
+    {
+      if (keyseg->bit_length)
+      {
+        uchar bits= get_rec_bits(record + keyseg->bit_pos,
+                                 keyseg->bit_start, keyseg->bit_length);
+        *key++= (char) bits;
+        length--;                               /* One key byte holds the bits */
+      }
+      memcpy(key, pos, length);
+      key+= length;
+      continue;
+    }
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      if (type != HA_KEYTYPE_NUM)
+      {
+        length= (uint) cs->cset->lengthsp(cs, (const char*)pos, length);
+      }
+      else
+      {                                         /* Number: skip leading spaces */
+        const uchar *end= pos + length;
+	while (pos < end && pos[0] == ' ')
+	  pos++;
+	length= (uint) (end-pos);
+      }
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key, pos, (size_t) char_length);
+      key+=char_length;
+      continue;
+    }
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint pack_length= (keyseg->bit_start == 1 ? 1 : 2); /* Prefix size */
+      uint tmp_length= (pack_length == 1 ? (uint) *pos :
+                        uint2korr(pos));
+      pos+= pack_length;			/* Skip VARCHAR length */
+      set_if_smaller(length,tmp_length);
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key,pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      uint tmp_length= _ma_calc_blob_length(keyseg->bit_start,pos);
+      uchar *blob_pos;                          /* Pointer read from the record */
+      memcpy_fixed(&blob_pos, pos+keyseg->bit_start,sizeof(char*));
+      set_if_smaller(length,tmp_length);
+      FIX_LENGTH(cs, blob_pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key, blob_pos, (size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    {						/* Numerical column */
+#ifdef HAVE_ISNAN
+      if (type == HA_KEYTYPE_FLOAT)
+      {
+	float nr;
+	float4get(nr,pos);
+	if (isnan(nr))
+	{
+	  /* Replace NAN with zero */
+	  bzero(key,length);
+	  key+=length;
+	  continue;
+	}
+      }
+      else if (type == HA_KEYTYPE_DOUBLE)
+      {
+	double nr;
+	float8get(nr,pos);
+	if (isnan(nr))
+	{
+	  bzero(key,length);                    /* Replace NAN with zero */
+	  key+=length;
+	  continue;
+	}
+      }
+#endif
+      pos+=length;                              /* Copy bytes in reverse order */
+      while (length--)
+      {
+	*key++ = *--pos;
+      }
+      continue;
+    }
+    FIX_LENGTH(cs, pos, length, char_length);
+    memcpy(key, pos, char_length);
+    if (length > char_length)                   /* Pad the rest with spaces */
+      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+    key+= length;
+  }
+  _ma_dpointer(info->s, key, filepos);          /* filepos goes after key data */
+  int_key->data_length= (key - int_key->data);
+  int_key->ref_length= info->s->rec_reflength;
+  int_key->flag= 0;                             /* Same value as set above */
+  if (_ma_have_versioning(info) && trid)
+  {
+    int_key->ref_length+= transid_store_packed(info,
+                                               key + int_key->ref_length,
+                                               (TrID) trid);
+    int_key->flag|= SEARCH_USER_KEY_HAS_TRANSID;
+  }
+
+  DBUG_PRINT("exit",("keynr: %d",keynr));
+  DBUG_DUMP_KEY("key", int_key);
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, int_key););
+  DBUG_RETURN(int_key);
+} /* _ma_make_key */
+
+
+/*
+  Pack a key to intern format from given format (c_rkey)
+
+  SYNOPSIS
+    _ma_pack_key()
+    info		MARIA handler
+    int_key		Store key here
+    keynr		key number
+    key			Buffer for key data
+    old			Original not packed key
+    keypart_map         bitmap of used keyparts
+    last_used_keyseg	out parameter.  May be NULL
+
+   RETURN
+   int_key
+
+     last_used_keyseg   Store pointer to the keyseg after the last used one
+*/
+
+MARIA_KEY *_ma_pack_key(register MARIA_HA *info, MARIA_KEY *int_key,
+                        uint keynr, uchar *key,
+                        const uchar *old, key_part_map keypart_map,
+                        HA_KEYSEG **last_used_keyseg)
+{
+  HA_KEYSEG *keyseg;
+  my_bool is_ft;
+  DBUG_ENTER("_ma_pack_key");
+
+  int_key->data= key;                           /* Key is built in 'key' buffer */
+  int_key->keyinfo= info->s->keyinfo + keynr;
+
+  /* "one part" rtree key is 2*SPDIMS part key in Maria */
+  if (int_key->keyinfo->key_alg == HA_KEY_ALG_RTREE)
+    keypart_map= (((key_part_map)1) << (2*SPDIMS)) - 1;
+
+  /* only key prefixes are supported */
+  DBUG_ASSERT(((keypart_map+1) & keypart_map) == 0);
+
+  is_ft= int_key->keyinfo->flag & HA_FULLTEXT;
+  for (keyseg=int_key->keyinfo->seg ; keyseg->type && keypart_map;
+       old+= keyseg->length, keyseg++)          /* Step to the next key part */
+  {
+    enum ha_base_keytype type= (enum ha_base_keytype) keyseg->type;
+    uint length= keyseg->length;
+    uint char_length;
+    const uchar *pos;
+    CHARSET_INFO *cs=keyseg->charset;
+
+    keypart_map>>= 1;                           /* This key part is consumed */
+    if (keyseg->null_bit)
+    {
+      if (!(*key++= (char) 1-*old++))			/* Copy null marker */
+      {
+        if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+          old+= 2;                              /* Skip the 2 length bytes */
+	continue;					/* Found NULL */
+      }
+    }
+    char_length= ((!is_ft && cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen :
+                  length);
+    pos= old;
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      const uchar *end= pos + length;
+      if (type == HA_KEYTYPE_NUM)
+      {                                         /* Number: skip leading spaces */
+	while (pos < end && pos[0] == ' ')
+	  pos++;
+      }
+      else if (type != HA_KEYTYPE_BINARY)
+      {                                         /* Text: drop trailing spaces */
+	while (end > pos && end[-1] == ' ')
+	  end--;
+      }
+      length=(uint) (end-pos);
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      memcpy(key,pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART))
+    {
+      /* Length of key-part used with maria_rkey() always 2 */
+      uint tmp_length=uint2korr(pos);
+      pos+=2;
+      set_if_smaller(length,tmp_length);	/* Safety */
+      FIX_LENGTH(cs, pos, length, char_length);
+      store_key_length_inc(key,char_length);
+      old+=2;					/* Skip length */
+      memcpy(key, pos,(size_t) char_length);
+      key+= char_length;
+      continue;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    {						/* Numerical column */
+      pos+=length;                              /* Copy bytes in reverse order */
+      while (length--)
+	*key++ = *--pos;
+      continue;
+    }
+    FIX_LENGTH(cs, pos, length, char_length);
+    memcpy(key, pos, char_length);
+    if (length > char_length)                   /* Pad the rest with spaces */
+      cs->cset->fill(cs, (char*) key+char_length, length-char_length, ' ');
+    key+= length;
+  }
+  if (last_used_keyseg)
+    *last_used_keyseg= keyseg;
+
+  /* set flag to SEARCH_PART_KEY if we are not using all key parts */
+  int_key->flag= keyseg->type ? SEARCH_PART_KEY : 0;
+  int_key->ref_length= 0;                       /* Nothing is stored after key */
+  int_key->data_length= (key - int_key->data);
+
+  DBUG_PRINT("exit", ("length: %u", int_key->data_length));
+  DBUG_RETURN(int_key);
+} /* _ma_pack_key */
+
+
+/**
+   Copy a key: data_length + ref_length bytes at 'data' plus the key metadata
+*/
+
+void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from)
+{
+  memcpy(to->data, from->data, from->data_length + from->ref_length);
+  to->keyinfo=     from->keyinfo;
+  to->data_length= from->data_length;
+  to->ref_length=  from->ref_length;
+  to->flag=        from->flag;
+}
+
+
+/*
+  Store found key in record
+
+  SYNOPSIS
+    _ma_put_key_in_record()
+    info		MARIA handler
+    keynr		Key number that was used
+    unpack_blobs        TRUE  <=> Unpack blob columns
+                        FALSE <=> Skip them. This is used by index condition
+                                  pushdown check function
+    record 		Store key here
+
+    Last read key is in info->last_key
+
+ NOTES
+   Used when only-keyread is wanted
+
+ RETURN
+   0   ok
+   1   error (only detected when compiled with CHECK_KEYS)
+*/
+
+static int _ma_put_key_in_record(register MARIA_HA *info, uint keynr,
+				 my_bool unpack_blobs, uchar *record)
+{
+  reg2 uchar *key;
+  uchar *pos,*key_end;
+  reg1 HA_KEYSEG *keyseg;
+  uchar *blob_ptr;
+  DBUG_ENTER("_ma_put_key_in_record");
+
+  blob_ptr= info->lastkey_buff2;         /* Place to put blob parts */
+  key= info->last_key.data;               /* Key that was read */
+  key_end= key + info->last_key.data_length;
+  for (keyseg=info->s->keyinfo[keynr].seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->null_bit)
+    {
+      if (!*key++)                              /* Key part is NULL */
+      {
+	record[keyseg->null_pos]|= keyseg->null_bit;
+	continue;
+      }
+      record[keyseg->null_pos]&= ~keyseg->null_bit;
+    }
+    if (keyseg->type == HA_KEYTYPE_BIT)
+    {
+      uint length= keyseg->length;
+
+      if (keyseg->bit_length)
+      {
+        uchar bits= *key++;
+        set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start,
+                     keyseg->bit_length);
+        length--;
+      }
+      else
+      {
+        clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start,
+                     keyseg->bit_length);
+      }
+      memcpy(record + keyseg->start, key, length);
+      key+= length;
+      continue;
+    }
+    if (keyseg->flag & HA_SPACE_PACK)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+	goto err;
+#endif
+      pos= record+keyseg->start;
+      if (keyseg->type != (int) HA_KEYTYPE_NUM)
+      {
+        memcpy(pos,key,(size_t) length);        /* Data first, then pad */
+        keyseg->charset->cset->fill(keyseg->charset,
+                                    (char*) pos + length,
+                                    keyseg->length - length,
+                                    ' ');
+      }
+      else
+      {
+	bfill(pos,keyseg->length-length,' ');	/* Pad first, then data */
+	memcpy(pos+keyseg->length-length,key,(size_t) length);
+      }
+      key+=length;
+      continue;
+    }
+
+    if (keyseg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+	goto err;
+#endif
+      /* Store key length; bit_start is used as size of the length prefix */
+      if (keyseg->bit_start == 1)
+        *(uchar*) (record+keyseg->start)= (uchar) length;
+      else
+        int2store(record+keyseg->start, length);
+      /* And key data */
+      memcpy(record+keyseg->start + keyseg->bit_start, key, length);
+      key+= length;
+    }
+    else if (keyseg->flag & HA_BLOB_PART)
+    {
+      uint length;
+      get_key_length(length,key);
+#ifdef CHECK_KEYS
+      if (length > keyseg->length || key+length > key_end)
+	goto err;
+#endif
+      if (unpack_blobs)
+      {
+        memcpy(record+keyseg->start+keyseg->bit_start,
+               (char*) &blob_ptr,sizeof(char*));
+        memcpy(blob_ptr,key,length);
+        blob_ptr+=length;
+
+        /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). */
+        info->update&= ~HA_STATE_RNEXT_SAME;
+
+        _ma_store_blob_length(record+keyseg->start,
+                              (uint) keyseg->bit_start,length);
+      }
+      key+=length;
+    }
+    else if (keyseg->flag & HA_SWAP_KEY)
+    {
+      uchar *to=  record+keyseg->start+keyseg->length;
+      uchar *end= key+keyseg->length;
+#ifdef CHECK_KEYS
+      if (end > key_end)
+	goto err;
+#endif
+      do                                        /* Copy bytes in reverse order */
+      {
+	 *--to= *key++;
+      } while (key != end);
+      continue;
+    }
+    else
+    {
+#ifdef CHECK_KEYS
+      if (key+keyseg->length > key_end)
+	goto err;
+#endif
+      memcpy(record+keyseg->start, key, (size_t) keyseg->length);
+      key+= keyseg->length;
+    }
+  }
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("info",("error"));
+  DBUG_RETURN(1);				/* Crashed row */
+} /* _ma_put_key_in_record */
+
+
+	/* Here when key reads are used; returns 0 if ok, -1 on error */
+
+int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+  fast_ma_writeinfo(info);
+  if (filepos != HA_OFFSET_ERROR)
+  {
+    if (info->lastinx >= 0)
+    {				/* Read only key */
+      if (_ma_put_key_in_record(info, (uint)info->lastinx, TRUE, buf))
+      {
+        maria_print_error(info->s, HA_ERR_CRASHED);
+	my_errno=HA_ERR_CRASHED;
+	return -1;
+      }
+      info->update|= HA_STATE_AKTIV; /* We should find a record */
+      return 0;
+    }
+    my_errno=HA_ERR_WRONG_INDEX;	/* No index is active */
+  }
+  return(-1);				/* Wrong data to read */
+}
+
+
+
+/*
+  Save current key tuple to record and call index condition check function
+
+  SYNOPSIS
+    ma_check_index_cond()
+      info    MARIA handler
+      keynr   Index we're running a scan on
+      record  Record buffer to use (it is assumed that index check function
+              will look for column values there)
+
+  RETURN
+    ICP_ERROR         Error
+    ICP_NO_MATCH      Index condition is not satisfied, continue scanning
+    ICP_MATCH         Index condition is satisfied
+    ICP_OUT_OF_RANGE  Index condition is not satisfied, end the scan.
+*/
+
+int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record)
+{
+  if (info->index_cond_func)
+  {
+    if (_ma_put_key_in_record(info, keynr, FALSE, record)) /* blobs skipped */
+    {
+      maria_print_error(info->s, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      return -1;                      /* presumably ICP_ERROR -- TODO confirm */
+    }
+    return info->index_cond_func(info->index_cond_func_arg);
+  }
+  return 1;       /* no condition set; presumably ICP_MATCH -- TODO confirm */
+}
+
+
+/*
+  Retrieve auto_increment info
+
+  SYNOPSIS
+    ma_retrieve_auto_increment()
+    key                         Auto-increment key
+    key_type                    Key's type (one of the HA_KEYTYPE_* values)
+
+  NOTE
+    'key' should be in "record" format, that is, how it is packed in a record
+    (this matters with HA_SWAP_KEY).
+
+  IMPLEMENTATION
+    For signed columns we don't retrieve the auto increment value if it's
+    less than zero; 0 is returned instead. Negative floats also give 0.
+*/
+
+ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type)
+{
+  ulonglong value= 0;			/* Store unsigned values here */
+  longlong s_value= 0;			/* Store signed values here */
+
+  switch (key_type) {
+  case HA_KEYTYPE_INT8:
+    s_value= (longlong) *(const signed char*)key; /* plain char may be unsigned */
+    break;
+  case HA_KEYTYPE_BINARY:
+    value=(ulonglong)  *key;
+    break;
+  case HA_KEYTYPE_SHORT_INT:
+    s_value= (longlong) sint2korr(key);
+    break;
+  case HA_KEYTYPE_USHORT_INT:
+    value=(ulonglong) uint2korr(key);
+    break;
+  case HA_KEYTYPE_LONG_INT:
+    s_value= (longlong) sint4korr(key);
+    break;
+  case HA_KEYTYPE_ULONG_INT:
+    value=(ulonglong) uint4korr(key);
+    break;
+  case HA_KEYTYPE_INT24:
+    s_value= (longlong) sint3korr(key);
+    break;
+  case HA_KEYTYPE_UINT24:
+    value=(ulonglong) uint3korr(key);
+    break;
+  case HA_KEYTYPE_FLOAT:                        /* This shouldn't be used */
+  {
+    float f_1;
+    float4get(f_1,key);
+    /* Ignore negative values */
+    value = (f_1 < (float) 0.0) ? 0 : (ulonglong) f_1;
+    break;
+  }
+  case HA_KEYTYPE_DOUBLE:                       /* This shouldn't be used */
+  {
+    double f_1;
+    float8get(f_1,key);
+    /* Ignore negative values */
+    value = (f_1 < 0.0) ? 0 : (ulonglong) f_1;
+    break;
+  }
+  case HA_KEYTYPE_LONGLONG:
+    s_value= sint8korr(key);
+    break;
+  case HA_KEYTYPE_ULONGLONG:
+    value= uint8korr(key);
+    break;
+  default:
+    DBUG_ASSERT(0);
+    value=0;                                    /* Error */
+    break;
+  }
+
+  /*
+    The following code works because if s_value < 0 then value is 0
+    and if s_value == 0 then value will contain either s_value or the
+    correct value.
+  */
+  return (s_value > 0) ? (ulonglong) s_value : value;
+}
diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c
new file mode 100644
index 00000000000..6de5253a2dd
--- /dev/null
+++ b/storage/maria/ma_key_recover.c
@@ -0,0 +1,1432 @@
+/* Copyright (C) 2007 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Redo of index */
+
+#include "maria_def.h"
+#include "ma_blockrec.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_rt_index.h"
+
+/****************************************************************************
+  Some helper functions used both by key page logging and block page logging
+****************************************************************************/
+
+/**
+  @brief Unpin all pinned pages
+
+  @fn _ma_unpin_all_pages()
+  @param info	   Maria handler
+  @param undo_lsn  LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write
+                   undo (like on duplicate key errors)
+
+  info->pinned_pages is the list of pages to unpin. Each member of the list
+  must have its 'changed' saying if the page was changed or not.
+
+  @note
+    We unpin pages in the reverse order as they were pinned; This is not
+    necessary now, but may simplify things in the future.
+
+  @return
+    Nothing (the function is void). info->pinned_pages is left empty
+    (elements is reset to 0).
+*/
+
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn)
+{
+  MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*)
+                                 dynamic_array_ptr(&info->pinned_pages, 0));
+  MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements;
+  DBUG_ENTER("_ma_unpin_all_pages");
+  DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn));
+
+  if (!info->s->now_transactional)
+    DBUG_ASSERT(undo_lsn == LSN_IMPOSSIBLE || maria_in_recovery);
+
+  while (pinned_page-- != page_link)            /* Walk from last to first */
+  {
+    /*
+      Note this assert fails if we got a disk error or the record file
+      is corrupted, which means we should have this enabled only in debug
+      builds.
+    */
+#ifdef EXTRA_DEBUG
+    DBUG_ASSERT((!pinned_page->changed ||
+                 undo_lsn != LSN_IMPOSSIBLE || !info->s->now_transactional) ||
+                (info->s->state.changed & STATE_CRASHED));
+#endif
+    pagecache_unlock_by_link(info->s->pagecache, pinned_page->link,
+                             pinned_page->unlock, PAGECACHE_UNPIN,
+                             info->trn->rec_lsn, undo_lsn,
+                             pinned_page->changed, FALSE);
+  }
+
+  info->pinned_pages.elements= 0;
+  DBUG_VOID_RETURN;
+}
+
+/* Write a LOGREC_CLR_END record for an undone record of type 'undo_type' */
+my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn,
+                      enum translog_record_type undo_type,
+                      my_bool store_checksum, ha_checksum checksum,
+                      LSN *res_lsn, void *extra_msg)
+{
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE +
+                 HA_CHECKSUM_STORE_SIZE+ KEY_NR_STORE_SIZE + PAGE_STORE_SIZE];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  struct st_msg_to_write_hook_for_clr_end msg;
+  my_bool res;
+  DBUG_ENTER("_ma_write_clr");
+
+  /* undo_lsn must be first for compression to work */
+  lsn_store(log_data, undo_lsn);
+  clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, undo_type);
+  log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE;
+
+  /* Extra_msg is handled in write_hook_for_clr_end() */
+  msg.undone_record_type= undo_type;
+  msg.previous_undo_lsn=  undo_lsn;
+  msg.extra_msg= extra_msg;
+  msg.checksum_delta= 0;
+
+  if (store_checksum)
+  {
+    msg.checksum_delta= checksum;
+    ha_checksum_store(log_pos, checksum);
+    log_pos+= HA_CHECKSUM_STORE_SIZE;
+  }
+  else if (undo_type == LOGREC_UNDO_KEY_INSERT_WITH_ROOT ||
+           undo_type == LOGREC_UNDO_KEY_DELETE_WITH_ROOT)
+  {
+    /* Key root changed. Store new key root */
+    struct st_msg_to_write_hook_for_undo_key *undo_msg= extra_msg;
+    pgcache_page_no_t page;
+    key_nr_store(log_pos, undo_msg->keynr);
+    page= (undo_msg->value == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO :
+           undo_msg->value / info->s->block_size);
+    page_store(log_pos + KEY_NR_STORE_SIZE, page);
+    log_pos+= KEY_NR_STORE_SIZE + PAGE_STORE_SIZE;
+  }
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data);
+
+
+  /*
+    We need intern_lock mutex for calling _ma_state_info_write in the trigger.
+    We do it here to have the same sequence of mutexes locking everywhere
+    (first intern_lock then transactional log  buffer lock)
+  */
+  if (undo_type == LOGREC_UNDO_BULK_INSERT)
+    pthread_mutex_lock(&info->s->intern_lock);
+
+  res= translog_write_record(res_lsn, LOGREC_CLR_END,
+                             info->trn, info,
+                             (translog_size_t)
+                             log_array[TRANSLOG_INTERNAL_PARTS + 0].length,
+                             TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                             log_data + LSN_STORE_SIZE, &msg);
+  if (undo_type == LOGREC_UNDO_BULK_INSERT)
+    pthread_mutex_unlock(&info->s->intern_lock);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Sets transaction's undo_lsn, first_undo_lsn if needed
+
+   @return 0 on success; for LOGREC_UNDO_BULK_INSERT, non-zero on failure
+*/
+
+my_bool write_hook_for_clr_end(enum translog_record_type type
+                               __attribute__ ((unused)),
+                               TRN *trn, MARIA_HA *tbl_info,
+                               LSN *lsn __attribute__ ((unused)),
+                               void *hook_arg)
+{
+  MARIA_SHARE *share= tbl_info->s;
+  struct st_msg_to_write_hook_for_clr_end *msg=
+    (struct st_msg_to_write_hook_for_clr_end *)hook_arg;
+  my_bool error= FALSE;
+  DBUG_ENTER("write_hook_for_clr_end");
+  DBUG_ASSERT(trn->trid != 0);
+  trn->undo_lsn= msg->previous_undo_lsn;
+
+  switch (msg->undone_record_type) {
+  case LOGREC_UNDO_ROW_DELETE:
+    share->state.state.records++;
+    share->state.state.checksum+= msg->checksum_delta;
+    break;
+  case LOGREC_UNDO_ROW_INSERT:
+    share->state.state.records--;
+    share->state.state.checksum+= msg->checksum_delta;
+    break;
+  case LOGREC_UNDO_ROW_UPDATE:
+    share->state.state.checksum+= msg->checksum_delta;
+    break;
+  case LOGREC_UNDO_KEY_INSERT_WITH_ROOT:
+  case LOGREC_UNDO_KEY_DELETE_WITH_ROOT:
+  {
+    /* Update key root */
+    struct st_msg_to_write_hook_for_undo_key *extra_msg=
+      (struct st_msg_to_write_hook_for_undo_key *) msg->extra_msg;
+    *extra_msg->root= extra_msg->value;
+    break;
+  }
+  case LOGREC_UNDO_KEY_INSERT:
+  case LOGREC_UNDO_KEY_DELETE:
+    break;
+  case LOGREC_UNDO_BULK_INSERT:
+    safe_mutex_assert_owner(&share->intern_lock);
+    error= (maria_enable_indexes(tbl_info) ||
+            /* we enabled indices, need '2' below */
+            _ma_state_info_write(share,
+                                 MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                 MA_STATE_INFO_WRITE_FULL_INFO));
+    /* no need for _ma_reset_status(): REDO_DELETE_ALL is just before us */
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
+  if (trn->undo_lsn == LSN_IMPOSSIBLE) /* has fully rolled back */
+    trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  @brief write hook for undo key: sets *msg->root, then calls generic undo hook
+*/
+
+my_bool write_hook_for_undo_key(enum translog_record_type type,
+                                TRN *trn, MARIA_HA *tbl_info,
+                                LSN *lsn, void *hook_arg)
+{
+  struct st_msg_to_write_hook_for_undo_key *msg=
+    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;
+
+  *msg->root= msg->value;                       /* Store new key root */
+  _ma_fast_unlock_key_del(tbl_info);
+  return write_hook_for_undo(type, trn, tbl_info, lsn, 0); /* no hook_arg */
+}
+
+
+/**
+   Updates "auto_increment" and calls the generic UNDO_KEY hook
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_key_insert(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  struct st_msg_to_write_hook_for_undo_key *msg=
+    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;
+  MARIA_SHARE *share= tbl_info->s;
+  if (msg->auto_increment > 0)
+  {
+    /*
+      Only reason to set it here is to have a mutex protect from checkpoint
+      reading at the same time (would see a corrupted value).
+
+      The purpose of the following code is to set auto_increment if the row
+      has an auto_increment value higher than the current one. We also
+      want to be able to restore the old value, in case of rollback,
+      if no one else has tried to set the value.
+
+      The logic used is that we only restore the auto_increment value if
+      tbl_info->last_auto_increment == share->last_auto_increment
+      when it's time to do the rollback.
+    */
+    DBUG_PRINT("info",("auto_inc: %lu new auto_inc: %lu",
+                       (ulong)share->state.auto_increment,
+                       (ulong)msg->auto_increment));
+    if (share->state.auto_increment < msg->auto_increment)
+    {
+      /* Remember the original value, in case of rollback */
+      tbl_info->last_auto_increment= share->last_auto_increment=
+        share->state.auto_increment;
+      share->state.auto_increment= msg->auto_increment;
+    }
+    else
+    {
+      /*
+        If the current value would have affected the original auto_increment
+        value, set it to an impossible value so that it's not restored on
+        rollback
+      */
+      if (msg->auto_increment > share->last_auto_increment)
+        share->last_auto_increment= ~(ulonglong) 0;
+    }
+  }
+  return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/**
+   @brief Restores "share->state.auto_increment" in case of abort and calls
+   generic UNDO_KEY hook
+
+   @return Operation status, always 0 (success)
+*/
+
+my_bool write_hook_for_undo_key_delete(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg)
+{
+  struct st_msg_to_write_hook_for_undo_key *msg=
+    (struct st_msg_to_write_hook_for_undo_key *) hook_arg;
+  MARIA_SHARE *share= tbl_info->s;
+  if (msg->auto_increment > 0)                  /* If auto increment key */
+  {
+    /* Restore auto increment if no one has changed it in between */
+    if (share->last_auto_increment == tbl_info->last_auto_increment &&
+        tbl_info->last_auto_increment != ~(ulonglong) 0)
+      share->state.auto_increment= tbl_info->last_auto_increment;
+  }
+  return write_hook_for_undo_key(type, trn, tbl_info, lsn, hook_arg);
+}
+
+
+/*****************************************************************************
+  Functions for logging of key page changes
+*****************************************************************************/
+
+/**
+   @brief
+   Write log entry for page that has got data added or deleted at start of page
+*/
+
+my_bool _ma_log_prefix(MARIA_PAGE *ma_page, uint changed_length,
+                       int move_length,
+                       enum en_key_debug debug_marker __attribute__((unused)))
+{
+  uint translog_parts;
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 7 + 7 + 2 + 2];
+  uchar *log_pos;
+  uchar *buff= ma_page->buff;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  MARIA_HA *info= ma_page->info;
+  pgcache_page_no_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_prefix");
+  DBUG_PRINT("enter", ("page: %lu  changed_length: %u  move_length: %d",
+                       (ulong) page, changed_length, move_length));
+
+  DBUG_ASSERT(ma_page->size == ma_page->org_size + move_length);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  (*log_pos++)= KEY_OP_DEBUG;
+  (*log_pos++)= debug_marker;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  if (move_length < 0)
+  {
+    /* Delete prefix */
+    log_pos[0]= KEY_OP_DEL_PREFIX;
+    int2store(log_pos+1, -move_length);
+    log_pos+= 3;
+    if (changed_length)
+    {
+      /*
+        We don't need a KEY_OP_OFFSET as KEY_OP_DEL_PREFIX has an implicit
+        offset
+      */
+      log_pos[0]= KEY_OP_CHANGE;
+      int2store(log_pos+1, changed_length);
+      log_pos+= 3;
+    }
+  }
+  else
+  {
+    /* Add prefix */
+    DBUG_ASSERT(changed_length >0 && (int) changed_length >= move_length);
+    log_pos[0]= KEY_OP_ADD_PREFIX;
+    int2store(log_pos+1, move_length);
+    int2store(log_pos+3, changed_length);
+    log_pos+= 5;
+  }
+
+  translog_parts= 1;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  if (changed_length)
+  {
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    (buff +
+                                                    info->s->keypage_header);
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length;
+    translog_parts= 2;
+  }
+
+  _ma_log_key_changes(ma_page, log_array + TRANSLOG_INTERNAL_PARTS +
+                      translog_parts, log_pos, &changed_length,
+                      &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length + changed_length,
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+/**
+   @brief
+   Write log entry for page that has got data added or deleted at end of page
+*/
+
+my_bool _ma_log_suffix(MARIA_PAGE *ma_page, uint org_length, uint new_length)
+{
+  LSN lsn;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 10 + 7 + 2], *log_pos;
+  uchar *buff= ma_page->buff;
+  int diff;
+  uint translog_parts, extra_length;
+  MARIA_HA *info= ma_page->info;
+  pgcache_page_no_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_suffix");
+  DBUG_PRINT("enter", ("page: %lu  org_length: %u  new_length: %u",
+                       (ulong) page, org_length, new_length));
+  DBUG_ASSERT(ma_page->size == new_length);
+  DBUG_ASSERT(ma_page->org_size == org_length);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  if ((diff= (int) (new_length - org_length)) < 0)  /* Page got shorter */
+  {
+    log_pos[0]= KEY_OP_DEL_SUFFIX;
+    int2store(log_pos+1, -diff);
+    log_pos+= 3;
+    translog_parts= 1;
+    extra_length= 0;
+  }
+  else
+  {
+    log_pos[0]= KEY_OP_ADD_SUFFIX;
+    int2store(log_pos+1, diff);
+    log_pos+= 3;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    buff + org_length;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (uint) diff;
+    translog_parts= 2;
+    extra_length= (uint) diff;
+  }
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length + extra_length,
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+/**
+   @brief Log that a key was added to the page
+
+   @param ma_page          Changed page
+   @param org_page_length  Length of data in page before key was added
+			   Final length in ma_page->size
+
+   @note
+     If handle_overflow is set, then we have to protect against
+     logging changes that are outside of the page.
+     This may happen during underflow() handling where the buffer
+     in memory temporarily contains more data than block_size
+
+     ma_page may be a page that was previously logged and cut down
+     because it's too big. (org_page_length > ma_page->org_size)
+*/
+
+my_bool _ma_log_add(MARIA_PAGE *ma_page,
+                    uint org_page_length __attribute__ ((unused)),
+                    uchar *key_pos, uint changed_length, int move_length,
+                    my_bool handle_overflow __attribute__ ((unused)),
+                    enum en_key_debug debug_marker __attribute__((unused)))
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 3 + 3 + 3 + 3 + 7 +
+                 3 + 2];
+  uchar *log_pos;
+  uchar *buff= ma_page->buff;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+  MARIA_HA *info= ma_page->info;
+  uint offset= (uint) (key_pos - buff);
+  uint max_page_size= info->s->max_index_block_size;
+  uint translog_parts, current_size;
+  pgcache_page_no_t page_pos= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_add");
+  DBUG_PRINT("enter", ("page: %lu  org_page_length: %u  changed_length: %u  "
+                       "move_length: %d",
+                       (ulong) page_pos, org_page_length, changed_length,
+                       move_length));
+  DBUG_ASSERT(info->s->now_transactional);
+  DBUG_ASSERT(move_length <= (int) changed_length);
+  DBUG_ASSERT(ma_page->org_size == min(org_page_length, max_page_size));
+  DBUG_ASSERT(ma_page->size == org_page_length + move_length);
+  DBUG_ASSERT(offset <= ma_page->org_size);
+
+  /*
+    Write REDO entry that contains the logical operations we need
+    to do the page
+  */
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page_pos);
+  current_size= ma_page->org_size;
+  log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  *log_pos++= KEY_OP_DEBUG;
+  *log_pos++= debug_marker;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  /*
+    Don't overwrite page boundary
+    It's ok to cut this as we will append the data at end of page
+    in the next log entry
+  */
+  if (offset + changed_length > max_page_size)
+  {
+    DBUG_ASSERT(handle_overflow);
+    changed_length= max_page_size - offset;   /* Update to end of page */
+    move_length= 0;                             /* Nothing to move */
+    /* Extend the page to max length on recovery */
+    *log_pos++= KEY_OP_MAX_PAGELENGTH;
+    current_size= max_page_size;
+  }
+
+  /* Check if adding the key made the page overflow */
+  if (current_size + move_length > max_page_size)
+  {
+    /*
+      Adding the key caused an overflow. Cut away the part of the
+      page that doesn't fit.
+    */
+    uint diff;
+    DBUG_ASSERT(handle_overflow);
+    diff= current_size + move_length - max_page_size;
+    log_pos[0]= KEY_OP_DEL_SUFFIX;
+    int2store(log_pos+1, diff);
+    log_pos+= 3;
+    current_size= max_page_size - move_length;
+  }
+
+  if (offset == current_size)
+  {
+    log_pos[0]= KEY_OP_ADD_SUFFIX;
+    current_size+= changed_length;
+  }
+  else
+  {
+    log_pos[0]= KEY_OP_OFFSET;
+    int2store(log_pos+1, offset);
+    log_pos+= 3;
+    if (move_length)
+    {
+      if (move_length < 0)
+      {
+        DBUG_ASSERT(offset - move_length <= org_page_length);
+        if (offset - move_length > current_size)
+        {
+          /*
+            Truncate to end of page. We will add data to it from
+            the page buffer below
+          */
+          move_length= (int) offset - (int) current_size;
+        }
+      }
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, move_length);
+      log_pos+= 3;
+      current_size+= move_length;
+    }
+    /*
+      Handle case where page was shortened but 'changed_length' goes over
+      'current_size'. This can only happen when there was a page overflow
+      and we will below add back the overflow part
+    */
+    if (offset + changed_length > current_size)
+    {
+      DBUG_ASSERT(offset + changed_length <= ma_page->size);
+      changed_length= current_size - offset;
+    }
+    log_pos[0]= KEY_OP_CHANGE;
+  }
+  int2store(log_pos+1, changed_length);  /* Length for ADD_SUFFIX or CHANGE */
+  log_pos+= 3;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key_pos;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length;
+  translog_parts= TRANSLOG_INTERNAL_PARTS + 2;
+
+  /*
+    If page was originally > block_size before operation and now all data
+    fits, append the end data that was not part of the previous logged
+    page to it.
+  */
+  DBUG_ASSERT(current_size <= max_page_size && current_size <= ma_page->size);
+  if (current_size != ma_page->size && current_size != max_page_size)
+  {
+    uint length= min(ma_page->size, max_page_size) - current_size;
+    uchar *data= ma_page->buff + current_size;
+
+    log_pos[0]= KEY_OP_ADD_SUFFIX;
+    int2store(log_pos+1, length);
+    log_array[translog_parts].str=      log_pos;
+    log_array[translog_parts].length=   3;
+    log_array[translog_parts+1].str=    data;
+    log_array[translog_parts+1].length= length;
+    log_pos+= 3;
+    translog_parts+= 2;
+    current_size+=   length;
+    changed_length+= length + 3;
+  }
+
+  _ma_log_key_changes(ma_page, log_array + translog_parts,
+                      log_pos, &changed_length, &translog_parts);
+  /*
+    Remember new page length for future log entries for same page
+    Note that this can be different from ma_page->size in case of page
+    overflow!
+  */
+  ma_page->org_size= current_size;
+  DBUG_ASSERT(ma_page->org_size == min(ma_page->size, max_page_size));
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            (translog_size_t)
+                            log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+                            changed_length, translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(-1);                    /* Non-zero: error, as a my_bool */
+  DBUG_RETURN(0);
+}
+
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+
+/* Log checksum and optionally key page to log (EXTRA_DEBUG_KEY_CHANGES only) */
+/* Fills log_array[0] (and [1]); updates *changed_length and *translog_parts */
+void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array,
+                         uchar *log_pos, uint *changed_length,
+                         uint *translog_parts)
+{
+  MARIA_SHARE *share= ma_page->info->s;
+  int page_length= min(ma_page->size, share->max_index_block_size);
+  uint org_length;
+  ha_checksum crc;
+  /* page_length: the size that is logged and checksummed, capped at max size */
+  DBUG_ASSERT(ma_page->flag == (uint) ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET]);
+
+  /* We have to change length as the page may have been shortened; restored below */
+  org_length= _ma_get_page_used(share, ma_page->buff);
+  _ma_store_page_used(share, ma_page->buff, page_length);
+  crc= my_checksum(0, ma_page->buff + LSN_STORE_SIZE,
+                   page_length - LSN_STORE_SIZE);
+  _ma_store_page_used(share, ma_page->buff, org_length);
+  /* KEY_OP_CHECK entry: op byte at +0, page length at +1, checksum at +3 */
+  log_pos[0]= KEY_OP_CHECK;
+  int2store(log_pos+1, page_length);
+  int4store(log_pos+3, crc);
+  /* 7 == length of the KEY_OP_CHECK entry stored at log_pos above */
+  log_array[0].str=    log_pos;
+  log_array[0].length= 7;
+  (*changed_length)+=  7;
+  (*translog_parts)++;
+#ifdef EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
+  log_array[1].str=    ma_page->buff;
+  log_array[1].length= page_length;
+  (*changed_length)+=  page_length;
+  (*translog_parts)++;
+#endif /* EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES */
+}
+
+#endif /* EXTRA_DEBUG_KEY_CHANGES */
+
+/****************************************************************************
+  Redo of key pages
+****************************************************************************/
+
+/**
+   @brief Apply LOGREC_REDO_INDEX_NEW_PAGE
+   @param  info            Maria handler
+   @param  lsn             Redo LSN; a read page with stored LSN >= lsn is skipped
+   @param  header          Header (without FILEID), followed by the page data
+   @param  length          Length of header + page data in bytes
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn,
+                                   const uchar *header, uint length)
+{
+  pgcache_page_no_t root_page= page_korr(header);
+  pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE);
+  uint      key_nr=    key_nr_korr(header + PAGE_STORE_SIZE * 2);
+  my_bool   page_type_flag= header[PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE];
+  enum pagecache_page_lock unlock_method;
+  enum pagecache_page_pin unpin_method;
+  MARIA_PINNED_PAGE page_link;
+  my_off_t file_size;
+  uchar *buff;
+  uint result;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_apply_redo_index_new_page");
+  DBUG_PRINT("enter", ("root_page: %lu  free_page: %lu",
+                       (ulong) root_page, (ulong) free_page));
+
+  /* Flag the table state as modified by this redo */
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  /* Set header to point at key data */
+  header+= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1;
+  length-= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1;
+  /* file_size: index file length needed to hold root_page */
+  file_size= (my_off_t) (root_page + 1) * share->block_size;
+  if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0)
+  {
+    /* free_page is 0 if we shouldn't set key_del */
+    if (free_page)
+    {
+      if (free_page != IMPOSSIBLE_PAGE_NO)
+        share->state.key_del= (my_off_t) free_page * share->block_size;
+      else
+        share->state.key_del= HA_OFFSET_ERROR;
+    }
+    if (page_type_flag)     /* root page */
+      share->state.key_root[key_nr]= file_size - share->block_size;
+  }
+  /* Page is beyond the known end of the index file: build it in keyread_buff */
+  if (file_size > share->state.state.key_file_length)
+  {
+    share->state.state.key_file_length= file_size;
+    buff= info->keyread_buff;
+    info->keyread_buff_used= 1;
+    unlock_method= PAGECACHE_LOCK_WRITE;
+    unpin_method=  PAGECACHE_PIN;
+  }
+  else
+  {
+    if (!(buff= pagecache_read(share->pagecache, &share->kfile,
+                               root_page, 0, 0,
+                               PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                               &page_link.link)))
+    {
+      if (my_errno != HA_ERR_FILE_TOO_SHORT &&
+          my_errno != HA_ERR_WRONG_CRC)
+      {
+        result= 1;
+        goto err;
+      }
+      buff= pagecache_block_link_to_buffer(page_link.link);
+    }
+    else if (lsn_korr(buff) >= lsn)
+    {
+      /* Already applied */
+      DBUG_PRINT("info", ("Page is up to date, skipping redo"));
+      result= 0;
+      goto err;
+    }
+    unlock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    unpin_method=  PAGECACHE_PIN_LEFT_PINNED;
+  }
+
+  /* Build page: zero LSN, copy logged data, zero the rest, fill checksum with 255 */
+  bzero(buff, LSN_STORE_SIZE);
+  memcpy(buff + LSN_STORE_SIZE, header, length);
+  bzero(buff + LSN_STORE_SIZE + length,
+        share->max_index_block_size - LSN_STORE_SIZE -  length);
+  bfill(buff + share->block_size - KEYPAGE_CHECKSUM_SIZE,
+        KEYPAGE_CHECKSUM_SIZE, (uchar) 255);
+  /* Only a page built in keyread_buff (PAGECACHE_LOCK_WRITE) is written here */
+  result= 0;
+  if (unlock_method == PAGECACHE_LOCK_WRITE &&
+      pagecache_write(share->pagecache,
+                      &share->kfile, root_page, 0,
+                      buff, PAGECACHE_PLAIN_PAGE,
+                      unlock_method, unpin_method,
+                      PAGECACHE_WRITE_DELAY, &page_link.link,
+                      LSN_IMPOSSIBLE))
+    result= 1;
+
+  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  DBUG_RETURN(result);
+
+err:
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  DBUG_RETURN(result);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_INDEX_FREE_PAGE
+
+   @param  info            Maria handler
+   @param  lsn             Redo LSN; a page with stored LSN >= lsn is skipped
+   @param  header          Header (without FILEID): page, then free_page
+   @return Operation status
+     @retval 0      OK
+     @retval #      Error: my_errno of the failed page read
+*/
+
+uint _ma_apply_redo_index_free_page(MARIA_HA *info,
+                                    LSN lsn,
+                                    const uchar *header)
+{
+  pgcache_page_no_t page= page_korr(header);
+  pgcache_page_no_t free_page= page_korr(header + PAGE_STORE_SIZE);
+  my_off_t old_link;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  uchar *buff;
+  int result;                      /* NOTE(review): int, but function returns uint */
+  DBUG_ENTER("_ma_apply_redo_index_free_page");
+  DBUG_PRINT("enter", ("page: %lu  free_page: %lu",
+                       (ulong) page, (ulong) free_page));
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  /* Point key_del at the freed page when lsn compares >= is_of_horizon */
+  if (cmp_translog_addr(lsn, share->state.is_of_horizon) >= 0)
+    share->state.key_del= (my_off_t) page * share->block_size;
+  /* old_link: file offset of free_page; it is stored in the freed page below */
+  old_link=  ((free_page != IMPOSSIBLE_PAGE_NO) ?
+              (my_off_t) free_page * share->block_size :
+              HA_OFFSET_ERROR);
+  if (!(buff= pagecache_read(share->pagecache, &share->kfile,
+                             page, 0, 0,
+                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+  {
+    result= (uint) my_errno;
+    goto err;
+  }
+  if (lsn_korr(buff) >= lsn)
+  {
+    /* Already applied */
+    result= 0;
+    goto err;
+  }
+  /* Free page: clear header after LSN, mark as deleted key, store old_link */
+  bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE);
+  _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR);
+  _ma_store_page_used(share, buff, share->keypage_header + 8);
+  mi_sizestore(buff + share->keypage_header, old_link);
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+  {
+    bzero(buff + share->keypage_header + 8,
+          share->block_size - share->keypage_header - 8 -
+          KEYPAGE_CHECKSUM_SIZE);
+  }
+#endif
+
+  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  DBUG_RETURN(0);
+
+err:
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  DBUG_RETURN(result);
+}
+
+
+/**
+   @brief Apply LOGREC_REDO_INDEX
+   @fn _ma_apply_redo_index()
+   @param  info            Maria handler
+   @param  lsn             Redo LSN; a page with stored LSN >= lsn is skipped
+   @param  header          Header (without FILEID)
+   @param  head_length     Length of header in bytes
+   @notes
+     Data for this part is a set of logical instructions of how to
+     construct the key page.
+
+   Information of the layout of the components for REDO_INDEX:
+
+   Name              Parameters (in byte) Information
+   KEY_OP_OFFSET     2                    Set position for next operations
+   KEY_OP_SHIFT      2 (signed int)       How much to shift down or up
+   KEY_OP_CHANGE     2 length,  data      Data to replace at 'pos'
+   KEY_OP_ADD_PREFIX 2 move-length        How much data should be moved up
+                     2 change-length      Data to be replaced at page start
+   KEY_OP_DEL_PREFIX 2 length             Bytes to be deleted at page start
+                                          Sets position to start of key data
+   KEY_OP_ADD_SUFFIX 2 length, data       Add data to end of page
+   KEY_OP_DEL_SUFFIX 2 length             Reduce page length with this
+   KEY_OP_CHECK      6 page_length[2],CRC Used only when debugging. May be followed
+                                          by page_length of data (to end of record)
+   KEY_OP_MULTI_COPY 2 length, 2 list-size Then (to,from) pairs, 2 bytes each
+   KEY_OP_COMPACT_PAGE  6 transid
+   KEY_OP_SET_PAGEFLAG  1 flag for page
+   KEY_OP_MAX_PAGELENGTH 0                Set page to max length
+   KEY_OP_DEBUG      1                    Info where logging was done
+   KEY_OP_DEBUG_2    4                    org_page_length[2], new_page_length[2]
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error (the table is also marked as crashed)
+*/
+
+long my_counter= 0;     /* NOTE(review): not used in the code visible here */
+
+uint _ma_apply_redo_index(MARIA_HA *info,
+                          LSN lsn, const uchar *header, uint head_length)
+{
+  MARIA_SHARE *share= info->s;
+  pgcache_page_no_t page_pos= page_korr(header);
+  MARIA_PINNED_PAGE page_link;
+  uchar *buff;
+  const uchar *header_end= header + head_length;
+  uint page_offset= 0, org_page_length;
+  uint nod_flag, page_length, keypage_header, keynr;
+  uint max_page_size= share->max_index_block_size;
+  int result;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_apply_redo_index");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page_pos));
+
+  /* Set header to point at key data */
+  header+= PAGE_STORE_SIZE;
+
+  if (!(buff= pagecache_read(share->pagecache, &share->kfile,
+                             page_pos, 0, 0,
+                             PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE,
+                             &page_link.link)))
+  {
+    result= 1;
+    goto err;
+  }
+  if (lsn_korr(buff) >= lsn)
+  {
+    /* Already applied */
+    DBUG_PRINT("info", ("Page is up to date, skipping redo"));
+    result= 0;
+    goto err;
+  }
+
+  keynr= _ma_get_keynr(share, buff);
+  _ma_page_setup(&page, info, share->keyinfo + keynr, page_pos, buff);
+  nod_flag=    page.node;
+  org_page_length= page_length= page.size;
+
+  keypage_header= share->keypage_header;
+  DBUG_PRINT("redo", ("page_length: %u", page_length));
+
+  /* Apply modifications to page; each entry starts with a one-byte operation */
+  do
+  {
+    switch ((enum en_key_op) (*header++)) {
+    case KEY_OP_OFFSET:                         /* 1 */
+      page_offset= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_offset: %u", page_offset));
+      DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length);
+      break;
+    case KEY_OP_SHIFT:                          /* 2 */
+    {
+      int length= sint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_shift: %d", length));
+      DBUG_ASSERT(page_offset != 0 && page_offset <= page_length &&
+                  page_length + length <= max_page_size);
+
+      if (length < 0)
+      {
+        DBUG_ASSERT(page_offset - length <= page_length);
+        bmove(buff + page_offset, buff + page_offset - length,
+              page_length - page_offset + length);
+      }
+      else if (page_length != page_offset)
+        bmove_upp(buff + page_length + length, buff + page_length,
+                  page_length - page_offset);
+      page_length+= length;
+      break;
+    }
+    case KEY_OP_CHANGE:                         /* 3 */
+    {
+      uint length= uint2korr(header);
+      DBUG_PRINT("redo", ("key_op_change: %u", length));
+      DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length);
+
+      memcpy(buff + page_offset, header + 2 , length);
+      page_offset+= length;           /* Put offset after changed length */
+      header+= 2 + length;
+      break;
+    }
+    case KEY_OP_ADD_PREFIX:                     /* 4 */
+    {
+      uint insert_length= uint2korr(header);
+      uint changed_length= uint2korr(header+2);
+      DBUG_PRINT("redo", ("key_op_add_prefix: %u  %u",
+                          insert_length, changed_length));
+
+      DBUG_ASSERT(insert_length <= changed_length &&
+                  page_length + changed_length <= max_page_size);
+
+      bmove_upp(buff + page_length + insert_length, buff + page_length,
+                page_length - keypage_header);
+      memcpy(buff + keypage_header, header + 4 , changed_length);
+      header+= 4 + changed_length;
+      page_length+= insert_length;
+      break;
+    }
+    case KEY_OP_DEL_PREFIX:                     /* 5 */
+    {
+      uint length= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_del_prefix: %u", length));
+      DBUG_ASSERT(length <= page_length - keypage_header);
+
+      bmove(buff + keypage_header, buff + keypage_header +
+            length, page_length - keypage_header - length);
+      page_length-= length;
+
+      page_offset= keypage_header;              /* Prepare for change */
+      break;
+    }
+    case KEY_OP_ADD_SUFFIX:                     /* 6 */
+    {
+      uint insert_length= uint2korr(header);
+      DBUG_PRINT("redo", ("key_op_add_suffix: %u", insert_length));
+      DBUG_ASSERT(page_length + insert_length <= max_page_size);
+      memcpy(buff + page_length, header+2, insert_length);
+
+      page_length+= insert_length;
+      header+= 2 + insert_length;
+      break;
+    }
+    case KEY_OP_DEL_SUFFIX:                     /* 7 */
+    {
+      uint del_length= uint2korr(header);
+      header+= 2;
+      DBUG_PRINT("redo", ("key_op_del_suffix: %u", del_length));
+      DBUG_ASSERT(page_length - del_length >= keypage_header);
+      page_length-= del_length;
+      break;
+    }
+    case KEY_OP_CHECK:                          /* 8 */
+    {
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+      uint check_page_length;
+      ha_checksum crc;
+      check_page_length= uint2korr(header);
+      crc=               uint4korr(header+2);
+      _ma_store_page_used(share, buff, page_length);
+      if (check_page_length != page_length ||
+          crc != (uint32) my_checksum(0, buff + LSN_STORE_SIZE,
+                                      page_length - LSN_STORE_SIZE))
+      {
+        DBUG_DUMP("KEY_OP_CHECK bad page", buff, page_length);
+        if (header + 6 + check_page_length <= header_end)
+        {
+          DBUG_DUMP("KEY_OP_CHECK org page", header + 6, check_page_length);
+        }
+        DBUG_ASSERT("crc failure in REDO_INDEX" == 0);
+      }
+#endif
+      DBUG_PRINT("redo", ("key_op_check"));
+      /*
+        This is the last entry in the block and it can contain page_length
+        data or not
+      */
+      DBUG_ASSERT(header + 6 == header_end ||
+                  header + 6 + page_length == header_end);
+      header= header_end;
+      break;
+    }
+    case KEY_OP_DEBUG:
+      DBUG_PRINT("redo", ("Debug: %u", (uint) header[0]));
+      header++;
+      break;
+    case KEY_OP_DEBUG_2:
+      DBUG_PRINT("redo", ("org_page_length: %u  new_page_length: %u",
+                          uint2korr(header), uint2korr(header+2)));
+      header+= 4;
+      break;
+    case KEY_OP_MAX_PAGELENGTH:
+      DBUG_PRINT("redo", ("key_op_max_page_length"));
+      page_length= max_page_size;
+      break;
+    case KEY_OP_MULTI_COPY:                     /* 9 */
+    {
+      /*
+        List of fixed-len memcpy() operations with their source located inside
+        the page. The log record's piece looks like:
+        first the length 'full_length' to be used by memcpy()
+        then the number of bytes used by the list of (to,from) pairs
+        then the (to,from) pairs, so we do:
+        for (t,f) in [list of (to,from) pairs]:
+            memcpy(t, f, full_length).
+      */
+      uint full_length, log_memcpy_length;
+      const uchar *log_memcpy_end;
+
+      DBUG_PRINT("redo", ("key_op_multi_copy"));
+      full_length= uint2korr(header);
+      header+= 2;
+      log_memcpy_length= uint2korr(header);
+      header+= 2;
+      log_memcpy_end= header + log_memcpy_length;
+      DBUG_ASSERT(full_length <= max_page_size);
+      while (header < log_memcpy_end)
+      {
+        uint to, from;
+        to= uint2korr(header);
+        header+= 2;
+        from= uint2korr(header);
+        header+= 2;
+        /* "from" is a place in the existing page */
+        DBUG_ASSERT(max(from, to) < max_page_size);
+        memcpy(buff + to, buff + from, full_length);
+      }
+      break;
+    }
+    case KEY_OP_SET_PAGEFLAG:
+      DBUG_PRINT("redo", ("key_op_set_pageflag"));
+      buff[KEYPAGE_TRANSFLAG_OFFSET]= *header++;
+      break;
+    case KEY_OP_COMPACT_PAGE:
+    {
+      TrID transid= transid_korr(header);
+      DBUG_PRINT("redo", ("key_op_compact_page"));
+      header+= TRANSID_SIZE;
+      if (_ma_compact_keypage(&page, transid))
+      {
+        result= 1;
+        goto err;
+      }
+      page_length= page.size;       /* pick up the page size after compaction */
+      break;                        /* must not fall into the error case below */
+    }
+    case KEY_OP_NONE:
+    default:
+      DBUG_ASSERT(0);
+      result= 1;
+      goto err;
+    }
+  } while (header < header_end);
+  DBUG_ASSERT(header == header_end);
+
+  /* Write modified page */
+  page.size= page_length;
+  _ma_store_page_used(share, buff, page_length);
+
+  /*
+    Clean old stuff up. Gives us better compression if we archive things
+    and makes things easier to debug
+  */
+  if (page_length < org_page_length)
+    bzero(buff + page_length, org_page_length-page_length);
+
+  /* Mark page to be unlocked and written at _ma_unpin_all_pages() */
+  page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+  page_link.changed= 1;
+  push_dynamic(&info->pinned_pages, (void*) &page_link);
+  DBUG_RETURN(0);
+
+err:
+  pagecache_unlock_by_link(share->pagecache, page_link.link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_UNPIN, LSN_IMPOSSIBLE,
+                           LSN_IMPOSSIBLE, 0, FALSE);
+  if (result)
+    _ma_mark_file_crashed(share);
+  DBUG_RETURN(result);
+}
+
+
+/****************************************************************************
+  Undo of key block changes
+****************************************************************************/
+
+/**
+   @brief Undo of insert of key (ie, delete the inserted key) and write a CLR
+*/
+
+my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length)
+{
+  LSN lsn;
+  my_bool res;
+  uint keynr;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  my_off_t new_root;
+  struct st_msg_to_write_hook_for_undo_key msg;
+  DBUG_ENTER("_ma_apply_undo_key_insert");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  keynr= key_nr_korr(header);
+  length-= KEY_NR_STORE_SIZE;
+
+  /* We have to copy key as _ma_ck_real_delete() may change it */
+  memcpy(key_buff, header + KEY_NR_STORE_SIZE, length);
+  DBUG_DUMP("key_buff", key_buff, length);
+  /* new_root: passed to the delete below, then compared with the stored root */
+  new_root= share->state.key_root[keynr];
+  /*
+    Change the key to an internal structure.
+    It's safe to have SEARCH_USER_KEY_HAS_TRANSID even if there isn't
+    a transaction id, as ha_key_cmp() will stop comparison when key length
+    is reached.
+    For index with transid flag, the ref_length of the key is not correct.
+    This should however be safe as long as this key is only used for
+    comparison against other keys (not for packing or for read-next etc, as
+    in this case we use data_length + ref_length, which is correct).
+  */
+  key.keyinfo=     share->keyinfo + keynr;
+  key.data=        key_buff;
+  key.data_length= length - share->rec_reflength;
+  key.ref_length=  share->rec_reflength;
+  key.flag=        SEARCH_USER_KEY_HAS_TRANSID;
+
+  res= ((share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ?
+        maria_rtree_real_delete(info, &key, &new_root) :
+        _ma_ck_real_delete(info, &key, &new_root));
+  if (res)
+    _ma_mark_file_crashed(share);
+  msg.root= &share->state.key_root[keynr];
+  msg.value= new_root;
+  msg.keynr= keynr;
+  /* Use the _WITH_ROOT record type if the stored root differs from new_root */
+  if (_ma_write_clr(info, undo_lsn, *msg.root == msg.value ?
+                    LOGREC_UNDO_KEY_INSERT : LOGREC_UNDO_KEY_INSERT_WITH_ROOT,
+                    0, 0, &lsn, (void*) &msg))
+    res= 1;
+  /* NOTE(review): lsn comes from _ma_write_clr(); verify it is set on failure */
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Undo of delete of key (ie, insert the deleted key) and write a CLR
+   @param  header          Key number, root page (if with_root), then the key
+   @param  with_root       If the UNDO is UNDO_KEY_DELETE_WITH_ROOT
+*/
+
+my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length,
+                                  my_bool with_root)
+{
+  LSN lsn;
+  my_bool res;
+  uint keynr, skip_bytes;
+  uchar key_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_SHARE *share= info->s;
+  my_off_t new_root;
+  struct st_msg_to_write_hook_for_undo_key msg;
+  MARIA_KEY key;
+  DBUG_ENTER("_ma_apply_undo_key_delete");
+
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS |
+                          STATE_NOT_SORTED_PAGES | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+  keynr= key_nr_korr(header);
+  skip_bytes= KEY_NR_STORE_SIZE + (with_root ? PAGE_STORE_SIZE : 0);
+  header+= skip_bytes;
+  length-= skip_bytes;
+
+  /* We have to copy key as _ma_ck_real_write_btree() may change it */
+  memcpy(key_buff, header, length);
+  DBUG_DUMP("key", key_buff, length);
+
+  key.keyinfo=     share->keyinfo + keynr;
+  key.data=        key_buff;
+  key.data_length= length - share->rec_reflength;
+  key.ref_length=  share->rec_reflength;
+  key.flag=        SEARCH_USER_KEY_HAS_TRANSID;
+  /* Insert the key back; rtree and btree indexes use different functions */
+  new_root= share->state.key_root[keynr];
+  res= (share->keyinfo[keynr].key_alg == HA_KEY_ALG_RTREE) ?
+    maria_rtree_insert_level(info, &key, -1, &new_root) :
+    _ma_ck_real_write_btree(info, &key, &new_root,
+                            share->keyinfo[keynr].write_comp_flag |
+                            key.flag);
+  if (res)
+    _ma_mark_file_crashed(share);
+
+  msg.root= &share->state.key_root[keynr];
+  msg.value= new_root;
+  msg.keynr= keynr;
+  if (_ma_write_clr(info, undo_lsn,
+                    *msg.root == msg.value ?
+                    LOGREC_UNDO_KEY_DELETE : LOGREC_UNDO_KEY_DELETE_WITH_ROOT,
+                    0, 0, &lsn,
+                    (void*) &msg))
+    res= 1;
+  /* NOTE(review): lsn comes from _ma_write_clr(); verify it is set on failure */
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res);
+}
+
+
+/****************************************************************************
+  Handle some local variables
+****************************************************************************/
+
+/**
+  @brief lock key_del for other threads usage
+
+  @fn     _ma_lock_key_del()
+  @param  info            Maria handler
+  @param  insert_at_end   Set to 1 if we are doing an insert
+
+  @note
+    To allow higher concurrency in the common case where we do inserts
+    and we don't have any linked blocks we do the following:
+    - Mark in info->key_del_used (value 2) that we are not using key_del
+    - Return at once (without marking key_del as used)
+
+    This is safe as we in this case don't write key_del_current into
+    the redo log and during recover we are not updating key_del.
+
+  @retval 1  Use page at end of file
+  @retval 0  Use page at share->key_del_current
+*/
+
+my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end)
+{
+  MARIA_SHARE *share= info->s;
+
+  /*
+    info->key_del_used is 0 initially.
+    If the caller needs a block (_ma_new()), we look at the free list:
+    - looks empty? then caller will create a new block at end of file and
+    remember (through info->key_del_used==2) that it will not change
+    state.key_del and does not need to wake up waiters as nobody will wait for
+    it.
+    - non-empty? then we wait for other users of the state.key_del list to
+    have finished, then we lock this list (through share->key_del_used==1)
+    because we need to prevent some other thread to also read state.key_del
+    and use the same page as ours. We remember through info->key_del_used==1
+    that we will have to set state.key_del at unlock time and wake up
+    waiters.
+    If the caller wants to free a block (_ma_dispose()), "empty" and
+    "non-empty" are treated as "non-empty" is treated above.
+    When we are ready to unlock, we copy share->key_del_current into
+    state.key_del. Unlocking happens when writing the UNDO log record, that
+    can make a long lock time.
+    Why we wrote "*looks* empty": because we are looking at state.key_del
+    which may be slightly old (share->key_del_current may be more recent and
+    exact): when we want a new page, we tolerate to treat "there was no free
+    page 1 millisecond ago"  as "there is no free page". It's ok to non-pop
+    (_ma_new(), page will be found later anyway) but it's not ok to non-push
+    (_ma_dispose(), page would be lost).
+    When we leave this function, info->key_del_used is always 1 or 2.
+  */
+  if (info->key_del_used != 1)                /* 1: list already locked by us */
+  {
+    pthread_mutex_lock(&share->key_del_lock);
+    if (share->state.key_del == HA_OFFSET_ERROR && insert_at_end)
+    {
+      pthread_mutex_unlock(&share->key_del_lock);
+      info->key_del_used= 2;                  /* insert-with-append */
+      return 1;
+    }
+#ifdef THREAD
+    while (share->key_del_used)               /* wait for the current user */
+      pthread_cond_wait(&share->key_del_cond, &share->key_del_lock);
+#endif
+    info->key_del_used= 1;
+    share->key_del_used= 1;
+    share->key_del_current= share->state.key_del;
+    pthread_mutex_unlock(&share->key_del_lock);
+  }
+  return share->key_del_current == HA_OFFSET_ERROR;
+}
+
+
+/**
+  @brief copy changes to key_del and unlock it
+  @param  info  Maria handler; info->key_del_used must be non-zero (asserted)
+  @notes
+  In case of many threads using the maria table, we always have a lock
+  on the translog when coming here.
+*/
+
+void _ma_unlock_key_del(MARIA_HA *info)
+{
+  DBUG_ASSERT(info->key_del_used);
+  if (info->key_del_used == 1)                  /* Ignore insert-with-append */
+  {
+    MARIA_SHARE *share= info->s;
+    pthread_mutex_lock(&share->key_del_lock);
+    share->key_del_used= 0;
+    share->state.key_del= share->key_del_current;
+    pthread_mutex_unlock(&share->key_del_lock);
+    pthread_cond_signal(&share->key_del_cond);  /* wakes _ma_lock_key_del() */
+  }
+  info->key_del_used= 0;
+}
diff --git a/storage/maria/ma_key_recover.h b/storage/maria/ma_key_recover.h
new file mode 100644
index 00000000000..d6b69010d5d
--- /dev/null
+++ b/storage/maria/ma_key_recover.h
@@ -0,0 +1,122 @@
+/* Copyright (C) 2007 Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  When we have finished the write/update/delete of a row, we have cleanups to
+  do. For now it is signalling to Checkpoint that all dirtied pages have
+  their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called),
+  and that bitmap pages are correct (_ma_bitmap_release_unused() has been
+  called).
+*/
+
+/* Struct for clr_end; presumably the hook_arg of write_hook_for_clr_end() */
+
+struct st_msg_to_write_hook_for_clr_end
+{
+  LSN previous_undo_lsn;
+  enum translog_record_type undone_record_type;
+  ha_checksum checksum_delta;
+  void *extra_msg;      /* NOTE(review): looks like _ma_write_clr()'s extra_msg */
+};
+/* Message filled in by _ma_apply_undo_key_insert()/_ma_apply_undo_key_delete() */
+struct st_msg_to_write_hook_for_undo_key
+{
+  my_off_t *root;               /* points at share->state.key_root[keynr] */
+  my_off_t value;               /* root position after the undo (new_root) */
+  uint keynr;                   /* number of the index */
+  ulonglong auto_increment;     /* NOTE(review): not set by the undo functions */
+};
+
+
+/* Declarations of the key logging, redo and undo functions */
+
+my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn,
+                      enum translog_record_type undo_type,
+                      my_bool store_checksum, ha_checksum checksum,
+                      LSN *res_lsn, void *extra_msg);
+int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key,
+                              my_off_t *root, my_off_t new_root,
+                              LSN *res_lsn);
+my_bool _ma_write_undo_key_delete(MARIA_HA *info, const MARIA_KEY *key,
+                                  my_off_t new_root, LSN *res_lsn);
+my_bool write_hook_for_clr_end(enum translog_record_type type,
+                               TRN *trn, MARIA_HA *tbl_info, LSN *lsn,
+                               void *hook_arg);
+extern my_bool write_hook_for_undo_key(enum translog_record_type type,
+                                       TRN *trn, MARIA_HA *tbl_info,
+                                       LSN *lsn, void *hook_arg);
+extern my_bool write_hook_for_undo_key_insert(enum translog_record_type type,
+                                              TRN *trn, MARIA_HA *tbl_info,
+                                              LSN *lsn, void *hook_arg);
+extern my_bool write_hook_for_undo_key_delete(enum translog_record_type type,
+                                              TRN *trn, MARIA_HA *tbl_info,
+                                              LSN *lsn, void *hook_arg);
+
+my_bool _ma_log_prefix(MARIA_PAGE *page, uint changed_length, int move_length,
+                       enum en_key_debug debug_marker);
+my_bool _ma_log_suffix(MARIA_PAGE *page, uint org_length,
+                       uint new_length);
+my_bool _ma_log_add(MARIA_PAGE *page, uint buff_length, uchar *key_pos,
+                    uint changed_length, int move_length,
+                    my_bool handle_overflow,
+                    enum en_key_debug debug_marker);
+my_bool _ma_log_delete(MARIA_PAGE *page, const uchar *key_pos,
+                       uint changed_length, uint move_length,
+                       uint append_length, enum en_key_debug debug_marker);
+my_bool _ma_log_change(MARIA_PAGE *page, const uchar *key_pos, uint length,
+                       enum en_key_debug debug_marker);
+my_bool _ma_log_new(MARIA_PAGE *page, my_bool root_page);
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+void _ma_log_key_changes(MARIA_PAGE *ma_page, LEX_CUSTRING *log_array,
+                         uchar *log_pos, uint *changed_length,
+                         uint *translog_parts);
+#else
+#define _ma_log_key_changes(A,B,C,D,E)
+#endif
+
+uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn,
+                                   const uchar *header, uint length);
+uint _ma_apply_redo_index_free_page(MARIA_HA *info, LSN lsn,
+                                    const uchar *header);
+uint _ma_apply_redo_index(MARIA_HA *info,
+                          LSN lsn, const uchar *header, uint length);
+
+my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length);
+my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn,
+                                  const uchar *header, uint length,
+                                  my_bool with_root);
+/* Row cleanup (see note at top of this header): set trn->rec_lsn to LSN_IMPOSSIBLE */
+static inline void _ma_finalize_row(MARIA_HA *info)
+{
+  info->trn->rec_lsn= LSN_IMPOSSIBLE;
+}
+
+/* Unpinning is often the last operation before finalizing, so this does both */
+
+static inline void _ma_unpin_all_pages_and_finalize_row(MARIA_HA *info,
+                                                        LSN undo_lsn)
+{
+  _ma_unpin_all_pages(info, undo_lsn);  /* undo_lsn is passed through unchanged */
+  _ma_finalize_row(info);
+}
+
+extern my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end);
+extern void _ma_unlock_key_del(MARIA_HA *info);
+static inline void _ma_fast_unlock_key_del(MARIA_HA *info)
+{
+  if (info->key_del_used)               /* 0: _ma_unlock_key_del() not needed */
+    _ma_unlock_key_del(info);
+}
diff --git a/storage/maria/ma_keycache.c b/storage/maria/ma_keycache.c
new file mode 100644
index 00000000000..39fc7d421ae
--- /dev/null
+++ b/storage/maria/ma_keycache.c
@@ -0,0 +1,164 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Key cache assignments
+*/
+
+#include "maria_def.h"
+
+/*
+  Assign pages of the index file for a table to a key cache
+
+  SYNOPSIS
+    maria_assign_to_pagecache()
+      info          open table
+      key_map       map of indexes to assign to the key cache
+                    (currently unused, see NOTES)
+      pagecache     key cache to assign the index pages to
+
+  PREREQUISITES
+    One must have a READ lock or a WRITE lock on the table when calling
+    the function to ensure that there are no other writers to it.
+
+    The caller must also ensure that one doesn't call this function from
+    two different threads with the same table.
+
+  NOTES
+    At present pages for all indexes must be assigned to the same key cache.
+    In future only pages for indexes specified in the key_map parameter
+    of the table will be assigned to the specified key cache.
+
+  RETURN VALUE
+    0  On success
+    #  Error code
+*/
+
+int maria_assign_to_pagecache(MARIA_HA *info,
+                              ulonglong key_map __attribute__((unused)),
+                              PAGECACHE *pagecache)
+{
+  int error= 0;
+  MARIA_SHARE* share= info->s;
+  DBUG_ENTER("maria_assign_to_pagecache");
+  DBUG_PRINT("enter",
+             ("old_pagecache_handle: 0x%lx  new_pagecache_handle: 0x%lx",
+             (long) share->pagecache, (long) pagecache));
+
+  /*
+    Skip operation if we didn't change key cache. This can happen if we
+    call this for all open instances of the same table
+  */
+  if (share->pagecache == pagecache)
+    DBUG_RETURN(0);
+
+  /*
+    First flush all blocks for the table in the old key cache.
+    This is to ensure that the disk is consistent with the data pages
+    in memory (which may not be the case if the table uses delayed_key_write)
+
+    Note that some other read thread may still fill in the key cache with
+    new blocks during this call and after, but this doesn't matter as
+    all threads will start using the new key cache for their next call to
+    maria library and we know that there will not be any changed blocks
+    in the old key cache.
+  */
+
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+  {
+    error= my_errno;
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);		/* Mark that table must be checked */
+  }
+
+  /*
+    Flush the new key cache for this file.  This is needed to ensure
+    that there are no old blocks (with outdated data) left in the new key
+    cache from an earlier assign_to_keycache operation
+
+    (This can never fail as there is never any unwritten data in the
+    new key cache)
+  */
+  (void) flush_pagecache_blocks(pagecache, &share->kfile, FLUSH_RELEASE);
+
+  /*
+    ensure that setting the key cache and changing the multi_pagecache
+    is done atomically
+  */
+  pthread_mutex_lock(&share->intern_lock);
+  /*
+    Tell all threads to use the new key cache
+    This should be seen at the latest by the next call to a maria function.
+  */
+  share->pagecache= pagecache;
+
+  /* store the key cache in the global hash structure for future opens */
+  if (multi_pagecache_set((uchar*) share->unique_file_name.str,
+                          share->unique_file_name.length,
+			  share->pagecache))
+    error= my_errno;
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Change all MARIA entries that use one key cache to use another key cache
+
+  SYNOPSIS
+    maria_change_pagecache()
+    old_pagecache	Old key cache
+    new_pagecache	New key cache
+
+  NOTES
+    This is used when we delete one key cache.
+
+    To handle the case where some other thread tries to open a MARIA
+    table associated with the to-be-deleted key cache while this operation
+    is running, we have to call 'multi_pagecache_change()' from this
+    function while we have a lock on the MARIA table list structure.
+
+    This is safe as long as it's only MARIA that is using this specific
+    key cache.
+*/
+
+
+void maria_change_pagecache(PAGECACHE *old_pagecache,
+                            PAGECACHE *new_pagecache)
+{
+  LIST *element;
+  DBUG_ENTER("maria_change_pagecache");
+
+  /*
+    Hold the open-table list mutex so no table can be closed while we walk it
+  */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (element= maria_open_list; element != NULL; element= element->next)
+  {
+    MARIA_HA *handler= (MARIA_HA*) element->data;
+
+    if (handler->s->pagecache == old_pagecache)
+      maria_assign_to_pagecache(handler, (ulonglong) ~0, new_pagecache);
+  }
+
+  /*
+    The global mapping must be switched while the list mutex is still held,
+    so that another thread cannot open a new table that would end up
+    associated with the old key cache
+  */
+  multi_pagecache_change(old_pagecache, new_pagecache);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c
new file mode 100644
index 00000000000..6bb308e5959
--- /dev/null
+++ b/storage/maria/ma_locking.c
@@ -0,0 +1,554 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Locking of Maria-tables.
+  Must be the first request before any further calls to any Maria function.
+  It allows many processes to use the same non-transactional Maria table.
+*/
+
+#include "ma_ftdefs.h"
+
+	/* lock table by F_UNLCK, F_RDLCK or F_WRLCK */
+
+int maria_lock_database(MARIA_HA *info, int lock_type)
+{
+  int error;
+  uint count;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_lock_database");
+  DBUG_PRINT("enter",("lock_type: %d  old lock %d  r_locks: %u  w_locks: %u "
+                      "global_changed:  %d  open_count: %u  name: '%s'",
+                      lock_type, info->lock_type, share->r_locks,
+                      share->w_locks,
+                      share->global_changed, share->state.open_count,
+                      share->index_file_name.str));
+  if (share->options & HA_OPTION_READ_ONLY_DATA ||
+      info->lock_type == lock_type)
+    DBUG_RETURN(0);
+  if (lock_type == F_EXTRA_LCK)                 /* Used by TMP tables */
+  {
+    ++share->w_locks;
+    ++share->tot_locks;
+    info->lock_type= lock_type;
+    DBUG_RETURN(0);
+  }
+
+  error=0;
+  pthread_mutex_lock(&share->intern_lock);
+  if (share->kfile.file >= 0)		/* May only be false on windows */
+  {
+    switch (lock_type) {
+    case F_UNLCK:
+      maria_ftparser_call_deinitializer(info);
+      if (info->lock_type == F_RDLCK)
+      {
+	count= --share->r_locks;
+        if (share->lock_restore_status)
+          (*share->lock_restore_status)(info);
+      }
+      else
+      {
+	count= --share->w_locks;
+        if (share->lock.update_status)
+          _ma_update_status_with_lock(info);
+      }
+      --share->tot_locks;
+      if (info->lock_type == F_WRLCK && !share->w_locks)
+      {
+        /* pages of transactional tables get flushed at Checkpoint */
+        if (!share->base.born_transactional && !share->temporary &&
+            _ma_flush_table_files(info,
+                                  share->delay_key_write ? MARIA_FLUSH_DATA :
+                                  MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                  FLUSH_KEEP, FLUSH_KEEP))
+          error= my_errno;
+      }
+      if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
+      {
+	if (end_io_cache(&info->rec_cache))
+	{
+	  error=my_errno;
+          maria_print_error(info->s, HA_ERR_CRASHED);
+	  maria_mark_crashed(info);
+	}
+      }
+      if (!count)                       /* counter of released lock type is 0 */
+      {
+	DBUG_PRINT("info",("changed: %u  w_locks: %u",
+			   (uint) share->changed, share->w_locks));
+	if (share->changed && !share->w_locks)
+	{
+#ifdef HAVE_MMAP
+          if ((share->mmaped_length !=
+               share->state.state.data_file_length) &&
+              (share->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+          {
+            if (share->lock_key_trees)
+              rw_wrlock(&share->mmap_lock);
+            _ma_remap_file(info, share->state.state.data_file_length);
+            share->nonmmaped_inserts= 0;
+            if (share->lock_key_trees)
+              rw_unlock(&share->mmap_lock);
+          }
+#endif
+#ifdef EXTERNAL_LOCKING /* NOTE(review): cf. MARIA_EXTERNAL_LOCKING below */
+	  share->state.process= share->last_process=share->this_process;
+	  share->state.unique=   info->last_unique=  info->this_unique;
+	  share->state.update_count= info->last_loop= ++info->this_loop;
+#endif
+          /* transactional tables rather flush their state at Checkpoint */
+          if (!share->base.born_transactional)
+          {
+            if (_ma_state_info_write_sub(share->kfile.file, &share->state,
+                                         MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET))
+              error= my_errno;
+            else
+            {
+              /* A value of 0 below means "state flushed" */
+              share->changed= 0;
+            }
+          }
+	  if (maria_flush)
+	  {
+            if (_ma_sync_table_files(info))
+	      error= my_errno;
+	  }
+	  else
+	    share->not_flushed=1;
+	  if (error)
+          {
+            maria_print_error(info->s, HA_ERR_CRASHED);
+	    maria_mark_crashed(info);
+          }
+	}
+      }
+      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+      info->lock_type= F_UNLCK;
+      break;
+    case F_RDLCK:
+      if (info->lock_type == F_WRLCK)
+      {
+        /*
+          Change RW to READONLY
+
+          mysqld does not turn write locks to read locks,
+          so we're never here in mysqld.
+        */
+	share->w_locks--;
+	share->r_locks++;
+	info->lock_type=lock_type;
+	break;
+      }
+#ifdef MARIA_EXTERNAL_LOCKING
+      if (!share->r_locks && !share->w_locks)
+      {
+        /* note that a transactional table should not do this */
+	if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+	{
+	  error=my_errno;
+	  break;
+	}
+      }
+#endif
+      VOID(_ma_test_if_changed(info));
+      share->r_locks++;
+      share->tot_locks++;
+      info->lock_type=lock_type;
+      break;
+    case F_WRLCK:
+      if (info->lock_type == F_RDLCK)
+      {						/* Change READONLY to RW */
+	if (share->r_locks == 1)
+	{
+	  share->r_locks--;
+	  share->w_locks++;
+	  info->lock_type=lock_type;
+	  break;
+	}
+      }
+#ifdef MARIA_EXTERNAL_LOCKING
+      if (!(share->options & HA_OPTION_READ_ONLY_DATA))
+      {
+	if (!share->w_locks)
+	{
+	  if (!share->r_locks)
+	  {
+            /*
+              Note that transactional tables should not do this.
+              If we enabled this code, we should make sure to skip it if
+              born_transactional is true. We should not test
+              now_transactional to decide if we can call
+              _ma_state_info_read_dsk(), because it can temporarily be 0
+              (TRUNCATE on a partitioned table) and thus it would make a state
+              modification below without mutex, confusing a concurrent
+              checkpoint running.
+              Even if this code was enabled only for non-transactional tables:
+              in scenario LOCK TABLE t1 WRITE; INSERT INTO t1; DELETE FROM t1;
+              state on disk read by DELETE is obsolete as it was not flushed
+              at the end of INSERT. MyISAM same. It however causes no issue as
+              maria_delete_all_rows() calls _ma_reset_status() thus is not
+              influenced by the obsolete read values.
+            */
+	    if (_ma_state_info_read_dsk(share->kfile.file, &share->state))
+	    {
+	      error=my_errno;
+	      break;
+	    }
+	  }
+	}
+      }
+#endif /* defined(MARIA_EXTERNAL_LOCKING) */
+      VOID(_ma_test_if_changed(info));
+
+      info->lock_type=lock_type;
+      info->invalidator=share->invalidator;
+      share->w_locks++;
+      share->tot_locks++;
+      break;
+    default:
+      DBUG_ASSERT(0);
+      break;				/* Impossible */
+    }
+  }
+#ifdef __WIN__
+  else
+  {
+    /*
+       Check for bad file descriptors if this table is part
+       of a merge union. Failing to capture this may cause
+       a crash on windows if the table is renamed and
+       later on referenced by the merge table.
+     */
+    if( info->owned_by_merge && (info->s)->kfile.file < 0 )
+    {
+      error = HA_ERR_NO_SUCH_TABLE;
+    }
+  }
+#endif
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+} /* maria_lock_database */
+
+
+/****************************************************************************
+ ** functions to read / write the state
+****************************************************************************/
+
+int _ma_readinfo(register MARIA_HA *info __attribute__ ((unused)),
+                 int lock_type __attribute__ ((unused)),
+                 int check_keybuffer __attribute__ ((unused)))
+{
+#ifndef MARIA_EXTERNAL_LOCKING
+  return 0;                             /* No external state to re-read */
+#else
+  DBUG_ENTER("_ma_readinfo");
+  if (info->lock_type != F_UNLCK)
+  {
+    if (lock_type == F_WRLCK && info->lock_type == F_RDLCK)
+    {
+      my_errno= EACCES;                 /* Not allowed to change */
+      DBUG_RETURN(-1);                  /* when have read_lock() */
+    }
+  }
+  else
+  {
+    MARIA_SHARE *share= info->s;
+    /* should not be done for transactional tables */
+    if (!share->tot_locks &&
+        _ma_state_info_read_dsk(share->kfile.file, &share->state))
+    {
+      if (!my_errno)
+        my_errno= HA_ERR_FILE_TOO_SHORT;
+      DBUG_RETURN(1);
+    }
+    if (check_keybuffer)
+      VOID(_ma_test_if_changed(info));
+    info->invalidator= share->invalidator;
+  }
+  DBUG_RETURN(0);
+#endif /* !defined(MARIA_EXTERNAL_LOCKING) */
+} /* _ma_readinfo */
+
+
+/*
+  Every function that updates the database MUST end with this
+  request
+
+  NOTES
+    my_errno is not changed if this succeeds!
+*/
+
+int _ma_writeinfo(register MARIA_HA *info, uint operation)
+{
+  int error,olderror;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_writeinfo");
+  DBUG_PRINT("info",("operation: %u  tot_locks: %u", operation,
+		     share->tot_locks));
+
+  error=0;
+  if (share->tot_locks == 0 && !share->base.born_transactional)
+  {
+    /* transactional tables flush their state at Checkpoint */
+    if (operation)
+    {					/* Two threads can't be here */
+      olderror= my_errno;               /* Remember last error */
+
+#ifdef EXTERNAL_LOCKING
+      /*
+        The following only makes sense if we want to allow two different
+        processes to access the same table at the same time
+      */
+      share->state.process= share->last_process=   share->this_process;
+      share->state.unique=  info->last_unique=	   info->this_unique;
+      share->state.update_count= info->last_loop= ++info->this_loop;
+#endif
+
+      if ((error=
+           _ma_state_info_write_sub(share->kfile.file,
+                                    &share->state,
+                                    MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)))
+	olderror=my_errno;
+#ifdef __WIN__
+      if (maria_flush)
+      {
+	_commit(share->kfile.file);
+	_commit(info->dfile.file);
+      }
+#endif
+      my_errno=olderror;
+    }
+  }
+  else if (operation)
+    share->changed= 1;			/* Mark keyfile changed */
+  DBUG_RETURN(error);
+} /* _ma_writeinfo */
+
+
+/*
+  Test if an external process has changed the database
+  (Should be called after readinfo)
+*/
+
+int _ma_test_if_changed(register MARIA_HA *info)
+{
+#ifdef EXTERNAL_LOCKING
+  MARIA_SHARE *share= info->s;
+  if (!(share->state.process == share->last_process &&
+        share->state.unique == info->last_unique &&
+        share->state.update_count == info->last_loop))
+  {                                             /* Keyfile has changed */
+    DBUG_PRINT("info",("index file changed"));
+    if (share->state.process != share->this_process)
+      VOID(flush_pagecache_blocks(share->pagecache, &share->kfile,
+                                  FLUSH_RELEASE));
+    share->last_process= share->state.process;
+    info->last_unique= share->state.unique;
+    info->last_loop= share->state.update_count;
+    info->update|= HA_STATE_WRITTEN;            /* Must use file on next */
+    info->data_changed= 1;                      /* For maria_is_changed */
+    return 1;
+  }
+#endif
+  return (!(info->update & HA_STATE_AKTIV) ? 1 :
+          (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED |
+                           HA_STATE_KEY_CHANGED)) != 0);
+} /* _ma_test_if_changed */
+
+
+/*
+  Put a mark in the .MAI file that someone is updating the table
+
+  DOCUMENTATION
+  state.open_count in the .MAI file is used the following way:
+  - For the first change of the .MAI file in this process open_count is
+    incremented by _ma_mark_file_changed(). (We have a write lock on the file
+    when this happens)
+  - In maria_close() it's decremented by _ma_decrement_open_count() if it
+    was incremented in the same process.
+
+  This means that if we are the only process using the file, the open_count
+  tells us if the MARIA file wasn't properly closed. (This is true if
+  my_disable_locking is set).
+
+  open_count is not maintained on disk for temporary tables.
+*/
+
+#define _MA_ALREADY_MARKED_FILE_CHANGED                                 \
+  ((share->state.changed & STATE_CHANGED) && share->global_changed)
+
+int _ma_mark_file_changed(MARIA_HA *info)
+{
+  uchar buff[3];
+  register MARIA_SHARE *share= info->s;
+  int error= 1;                         /* cleared only if all steps succeed */
+  DBUG_ENTER("_ma_mark_file_changed");
+
+  if (_MA_ALREADY_MARKED_FILE_CHANGED)
+    DBUG_RETURN(0);
+  pthread_mutex_lock(&share->intern_lock); /* recheck under mutex */
+  if (! _MA_ALREADY_MARKED_FILE_CHANGED)
+  {
+    share->state.changed|=(STATE_CHANGED | STATE_NOT_ANALYZED |
+			   STATE_NOT_OPTIMIZED_KEYS);
+    if (!share->global_changed)
+    {
+      share->global_changed=1;
+      share->state.open_count++;
+    }
+    /*
+      Temp tables don't need an open_count as they are removed on crash.
+      In theory transactional tables are fixed by log-based recovery, so don't
+      need an open_count either, but if recovery has failed and logs have been
+      removed (by maria-force-start-after-recovery-failures), we still need to
+      detect dubious tables.
+      If we didn't maintain open_count on disk for a table, after a crash
+      we wouldn't know if it was closed at crash time (thus does not need a
+      check) or not. So we would have to check all tables: overkill.
+    */
+    if (!share->temporary)
+    {
+      mi_int2store(buff,share->state.open_count);
+      buff[2]=1;				/* Mark that it's changed */
+      if (my_pwrite(share->kfile.file, buff, sizeof(buff),
+                    sizeof(share->state.header) +
+                    MARIA_FILE_OPEN_COUNT_OFFSET,
+                    MYF(MY_NABP)))
+        goto err;
+    }
+    /* Set uuid of file if not yet set (zerofilled file) */
+    if (share->base.born_transactional &&
+        !(share->state.changed & STATE_NOT_MOVABLE))
+    {
+      /* Lock table to current installation */
+      if (_ma_set_uuid(info, 0) ||
+          (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS &&
+           _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE,
+                                     trnman_get_min_trid(),
+                                     TRUE, TRUE)))
+        goto err;
+      share->state.changed|= STATE_NOT_MOVABLE;
+    }
+  }
+  error= 0;
+err:
+  pthread_mutex_unlock(&share->intern_lock);
+  DBUG_RETURN(error);
+#undef _MA_ALREADY_MARKED_FILE_CHANGED
+}
+
+/*
+  Check that a region is all zero; returns 0 if it is, 1 if any byte is set
+
+  SYNOPSIS
+    _ma_check_if_zero()
+    pos		Start of memory to check
+    length	Length of memory region in bytes
+
+  NOTES
+    Used mainly to detect rows with wrong extent information
+*/
+
+my_bool _ma_check_if_zero(uchar *pos, size_t length)
+{
+  size_t i;
+  for (i= 0; i < length; i++)
+    if (pos[i])
+      return 1;
+  return 0;
+}
+
+/*
+  This is only called by close or by extra(HA_FLUSH) if the OS has the pwrite()
+  call.  In these context the following code should be safe!
+ */
+
+int _ma_decrement_open_count(MARIA_HA *info)
+{
+  uchar buff[2];
+  register MARIA_SHARE *share= info->s;
+  uint old_lock= info->lock_type;
+  int lock_error= 0, write_error= 0;
+  if (!share->global_changed)
+    return 0;                           /* This handler never marked the file */
+  share->global_changed= 0;
+  /* Failing to get the lock is not fatal: the count is updated anyway */
+  if (!my_disable_locking)
+    lock_error= maria_lock_database(info, F_WRLCK);
+  if (share->state.open_count > 0)
+  {
+    share->state.open_count--;
+    share->changed= 1;                          /* We have to update state */
+    if (!share->temporary)
+    {
+      mi_int2store(buff, share->state.open_count);
+      write_error= (int) my_pwrite(share->kfile.file, buff, sizeof(buff),
+                                   sizeof(share->state.header) +
+                                   MARIA_FILE_OPEN_COUNT_OFFSET,
+                                   MYF(MY_NABP));
+    }
+  }
+  if (!lock_error && !my_disable_locking)
+    lock_error= maria_lock_database(info, old_lock);
+  return test(lock_error || write_error);
+}
+
+
+/** @brief mark file as crashed */
+
+void _ma_mark_file_crashed(MARIA_SHARE *share)
+{
+  uchar buff[2];
+  const size_t changed_pos= (sizeof(share->state.header) +
+                             MARIA_FILE_CHANGED_OFFSET);
+  DBUG_ENTER("_ma_mark_file_crashed");
+
+  share->state.changed|= STATE_CRASHED;
+  mi_int2store(buff, share->state.changed);
+  /*
+    Best effort only: a failed write is ignored because there is nothing
+    more we can do here, and the user should already have got an error
+    that the table was crashed.
+  */
+  (void) my_pwrite(share->kfile.file, buff, sizeof(buff), changed_pos,
+                   MYF(MY_NABP));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Set uuid for a Maria file
+   @fn _ma_set_uuid()
+   @param info         Maria handler
+   @param reset_uuid   Instead of setting file to maria_uuid, set it to
+                       0 to mark it as movable
+   @return 0 if the uuid was written, 1 if the write failed
+*/
+
+my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid)
+{
+  uchar buff[MY_UUID_SIZE], *uuid;
+
+  uuid= maria_uuid;
+  if (reset_uuid)
+  {
+    bzero(buff, sizeof(buff));                  /* all-zero uuid == movable */
+    uuid= buff;
+  }
+  /* Normalize with test(): never narrow the size_t result into a my_bool */
+  return test(my_pwrite(info->s->kfile.file, uuid, MY_UUID_SIZE,
+                        mi_uint2korr(info->s->state.header.base_pos),
+                        MYF(MY_NABP)));
+}
diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c
new file mode 100644
index 00000000000..dc99554a08d
--- /dev/null
+++ b/storage/maria/ma_loghandler.c
@@ -0,0 +1,9316 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_blockrec.h" /* for some constants and in-write hooks */
+#include "ma_key_recover.h" /* For some in-write hooks */
+#include "ma_checkpoint.h"
+#include "ma_servicethread.h"
+
+/*
+  On Windows, neither my_open() nor my_sync() works for directories.
+  Also there is no need to flush filesystem changes, i.e. to sync()
+  directories.
+*/
+#ifdef __WIN__
+#define sync_dir(A,B) 0
+#else
+#define sync_dir(A,B) my_sync(A,B)
+#endif
+
+/**
+   @file
+   @brief Module which writes and reads to a transaction log
+*/
+
+/* 0xFF can never be valid first byte of a chunk */
+#define TRANSLOG_FILLER 0xFF
+
+/* number of opened log files in the pagecache (should be at least 2) */
+#define OPENED_FILES_NUM 3
+#define CACHED_FILES_NUM 5
+#define CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT 7
+#if CACHED_FILES_NUM > CACHED_FILES_NUM_DIRECT_SEARCH_LIMIT
+#include <hash.h>
+#include <m_ctype.h>
+#endif
+
+/** @brief protects checkpoint_in_progress */
+static pthread_mutex_t LOCK_soft_sync;
+/** @brief for killing the background checkpoint thread */
+static pthread_cond_t  COND_soft_sync;
+/** @brief control structure for checkpoint background thread */
+static MA_SERVICE_THREAD_CONTROL soft_sync_control=
+  {THREAD_DEAD, FALSE, &LOCK_soft_sync, &COND_soft_sync};
+
+
+/* transaction log file descriptor */
+typedef struct st_translog_file
+{
+  uint32 number;           /* NOTE(review): presumably the log file number */
+  PAGECACHE_FILE handler;
+  my_bool was_recovered;
+  my_bool is_sync;         /* NOTE(review): presumably "synced to disk" flag */
+} TRANSLOG_FILE;
+
+/* records buffer size (should be TRANSLOG_PAGE_SIZE * n) */
+#define TRANSLOG_WRITE_BUFFER (1024*1024)
+/*
+  pagecache_read/write/inject() use bmove512() on their buffers so those must
+  be long-aligned, which we guarantee by using the type below:
+*/
+typedef union
+{
+  ulonglong dummy;                      /* present only to force alignment */
+  uchar buffer[TRANSLOG_PAGE_SIZE];     /* the page-sized buffer itself */
+} TRANSLOG_PAGE_SIZE_BUFF;
+
+/* min chunk length */
+#define TRANSLOG_MIN_CHUNK 3
+/*
+  Number of buffers used by loghandler
+
+  Should be at least 4, because one thread can block up to 2 buffers in
+  normal circumstances (less than half of one and the full other, or just
+  switched one and other). But if we meet end of the file in the middle and
+  have to switch buffer it will be 3.  + 1 buffer for flushing/writing.
+  We have a bigger number here for higher concurrency and to make division
+  faster.
+
+  The number should be power of 2 to be fast.
+*/
+#define TRANSLOG_BUFFERS_NO 8
+/* number of bytes (+ header) which can be unused on first page in sequence */
+#define TRANSLOG_MINCHUNK_CONTENT 1
+/* version of log file */
+#define TRANSLOG_VERSION_ID 10000               /* 1.00.00 */
+
+#define TRANSLOG_PAGE_FLAGS 6 /* transaction log page flags offset */
+
+/* Maximum length of compressed LSNs (the worst case of whole LSN storing) */
+#define COMPRESSED_LSN_MAX_STORE_SIZE (2 + LSN_STORE_SIZE)
+#define MAX_NUMBER_OF_LSNS_PER_RECORD 2
+
+
+/* max lsn calculation for buffer */
+#define BUFFER_MAX_LSN(B)  \
+  ((B)->last_lsn == LSN_IMPOSSIBLE ? (B)->prev_last_lsn : (B)->last_lsn)
+
+/* log write buffer descriptor */
+struct st_translog_buffer
+{
+  /*
+    Cache for current log. Comes first to be aligned for bmove512() in
+    pagecache_inject()
+  */
+  uchar buffer[TRANSLOG_WRITE_BUFFER];
+  /*
+    Maximum LSN of records which end in this buffer (or LSN_IMPOSSIBLE
+    if no LSN ends here)
+  */
+  LSN last_lsn;
+  /* last_lsn of previous buffer or LSN_IMPOSSIBLE if it is very first one */
+  LSN prev_last_lsn;
+  /* This buffer offset in the file */
+  TRANSLOG_ADDRESS offset;
+  /*
+    Next buffer offset in the file (it is not always offset + size,
+    in case of flush by LSN it can be offset + size - TRANSLOG_PAGE_SIZE)
+  */
+  TRANSLOG_ADDRESS next_buffer_offset;
+  /* Previous buffer offset, used to detect when its flush has finished */
+  TRANSLOG_ADDRESS prev_buffer_offset;
+  /*
+    If the buffer was forced to close, the value of its horizon is saved
+    here; otherwise LSN_IMPOSSIBLE
+  */
+  TRANSLOG_ADDRESS pre_force_close_horizon;
+  /*
+     How much is written (or will be written when copy_to_buffer_in_progress
+     becomes 0) to this buffer
+  */
+  translog_size_t size;
+  /*
+     When moving from one log buffer to another, we write the last of the
+     previous buffer to file and then move to start using the new log
+     buffer.  In the case of a partially filled last page, this page is not
+     moved to the start of the new buffer but instead we set the
+     'skipped_data' variable to tell us how much data at the beginning of
+     the buffer is not relevant.
+  */
+  uint skipped_data;
+  /* File handler for this buffer */
+  TRANSLOG_FILE *file;
+  /* Threads which are waiting for buffer filling/freeing */
+  pthread_cond_t waiting_filling_buffer;
+  /* Number of records which are in copy progress */
+  uint copy_to_buffer_in_progress;
+  /* list of waiting buffer ready threads */
+  struct st_my_thread_var *waiting_flush;
+  /*
+    If true then the previous buffer overlaps with this one (due to a flush
+    of the loghandler, the last page of that buffer is the same as the first
+    page of this buffer) and has to be written first (because it contains
+    the old content of the page which is present in both buffers)
+  */
+  my_bool overlay;
+  uint buffer_no;
+  /*
+    Lock for the buffer.
+
+    Current buffer also locks the whole handler (if one wants to lock the
+    handler one should lock the current buffer).
+
+    Buffers are locked only in one direction (with overflow and beginning
+    from the first buffer). If we keep lock on buffer N we can lock only
+    buffer N+1 (never N-1).
+
+    One thread does not lock more than 2 buffers at a time, so to make a
+    dead lock it would take N threads (where N equals the number of buffers)
+    each taking one buffer and trying to lock the next. But it is impossible
+    because there are only 2 cases when a thread takes 2 buffers: 1) one
+    thread finishes current buffer (where horizon is) and starts next (to
+    which horizon moves).  2) flush starts from buffer after current (oldest)
+    and goes till the current crabbing by buffer sequence. And there is only
+    one flush at a time (they are serialised).
+
+   Because of the above we can't get a dead lock (it is impossible to get
+   all TRANSLOG_BUFFERS_NO buffers locked simultaneously).
+  */
+  pthread_mutex_t mutex;
+  /*
+    Some thread is going to close the buffer and it should be
+    done only by that thread
+  */
+  my_bool is_closing_buffer;
+  /*
+    Version of the buffer; increases every time the buffer is flushed.
+    With file and offset it allows detecting buffer changes
+  */
+  uint8 ver;
+
+  /*
+    When previous buffer sent to disk it set its address here to allow
+    to detect when it is done
+    (we have to keep it in this buffer to lock buffers only in one direction).
+  */
+  TRANSLOG_ADDRESS prev_sent_to_disk;
+  pthread_cond_t prev_sent_to_disk_cond;
+};
+
+
+struct st_buffer_cursor
+{
+  /* pointer into the buffer */
+  uchar *ptr;
+  /* current buffer */
+  struct st_translog_buffer *buffer;
+  /* How many bytes we wrote on the current page */
+  uint16 current_page_fill;
+  /*
+    How many times we wrote the page to disk during the flushing process
+    (for sector protection).
+  */
+  uint16 write_counter;
+  /* previous write offset */
+  uint16 previous_offset;
+  /* Number of the current buffer */
+  uint8 buffer_no;
+  /*
+    True if it is just filling the buffer after advancing the pointer to
+    the horizon.
+  */
+  my_bool chaser;
+  /*
+    Is the current page of the cursor already finished (sector protection
+    should be applied if it is needed)
+  */
+  my_bool protected;
+};
+
+
+/* Bit mask with one bit per log buffer (see dirty_buffer_mask below) */
+typedef uint8 dirty_buffer_mask_t;
+
+/*
+  State of the log handler: its parameters, the log buffers, the horizon
+  and the locks which protect them.
+*/
+struct st_translog_descriptor
+{
+  /* *** Parameters of the log handler *** */
+
+  /* Page cache for the log reads */
+  PAGECACHE *pagecache;
+  uint flags;
+  /* File open flags */
+  uint open_flags;
+  /* max size of one log file (for new logs creation) */
+  uint32 log_file_max_size;
+  /* server version (it is written to the header of every log file) */
+  uint32 server_version;
+  /* server ID (used for replication) */
+  uint32 server_id;
+  /* Loghandler's buffer capacity in case of chunk 2 filling */
+  uint32 buffer_capacity_chunk_2;
+  /*
+    Half of the buffer capacity in case of chunk 2 filling,
+    used to decide whether we write a record in one group or in many.
+    It is stored in a variable just to avoid a division every
+    time we need it.
+  */
+  uint32 half_buffer_capacity_chunk_2;
+  /* Page overhead calculated by flags (whether CRC is enabled, etc) */
+  uint16 page_overhead;
+  /*
+    Page capacity ("useful load") calculated by flags
+    (TRANSLOG_PAGE_SIZE - page_overhead-1)
+  */
+  uint16 page_capacity_chunk_2;
+  /* Path to the directory where we store log files */
+  char directory[FN_REFLEN];
+
+  /* *** Current state of the log handler *** */
+  /* list of opened files */
+  DYNAMIC_ARRAY open_files;
+  /* min/max number of file in the array */
+  uint32 max_file, min_file;
+  /* the opened files list guard */
+  rw_lock_t open_files_lock;
+
+  /*
+    File descriptor of the directory where we store log files (used for
+    syncing the directory).
+  */
+  File directory_fd;
+  /* buffers for log writing */
+  struct st_translog_buffer buffers[TRANSLOG_BUFFERS_NO];
+  /* Mask where 1 in position N means that buffer N is not flushed */
+  dirty_buffer_mask_t dirty_buffer_mask;
+  /* The above variable protection */
+  pthread_mutex_t dirty_buffer_mask_lock;
+  /*
+     horizon - visible end of the log (here it is the absolute end of the
+     log: the position where the next chunk can start)
+  */
+  TRANSLOG_ADDRESS horizon;
+  /* horizon buffer cursor */
+  struct st_buffer_cursor bc;
+  /* maximum LSN of the current (not finished) file */
+  LSN max_lsn;
+
+  /*
+    Last flushed LSN (protected by log_flush_lock).
+    Pointers in the log are ordered like this:
+    last_lsn_checked <= flushed <= sent_to_disk <= in_buffers_only <=
+    max_lsn <= horizon
+  */
+  LSN flushed;
+  /* Last LSN sent to the disk (but maybe not written yet) */
+  LSN sent_to_disk;
+  /* Horizon from which log started after initialization */
+  TRANSLOG_ADDRESS log_start;
+  TRANSLOG_ADDRESS previous_flush_horizon;
+  /* Everything after this address is not sent to disk yet */
+  TRANSLOG_ADDRESS in_buffers_only;
+  /* protection of sent_to_disk and in_buffers_only */
+  pthread_mutex_t sent_to_disk_lock;
+  /*
+    Protects flushed (see above) and serializes flushes (will
+    be removed in v1.5)
+  */
+  pthread_mutex_t log_flush_lock;
+  pthread_cond_t log_flush_cond;
+  pthread_cond_t new_goal_cond;
+
+  /* Protects changing of headers of finished files (max_lsn) */
+  pthread_mutex_t file_header_lock;
+
+  /*
+    Sorted array (with protection) of files where we started the writing
+    process and so we can't give the last LSN yet
+  */
+  pthread_mutex_t unfinished_files_lock;
+  DYNAMIC_ARRAY unfinished_files;
+
+  /*
+    minimum number of the still needed file, calculated during the last
+    translog_purge call
+  */
+  uint32 min_need_file;
+  /* Purger data: minimum file in the log (or 0 if unknown) */
+  uint32 min_file_number;
+  /* Protects the purger from concurrent calls, and its data */
+  pthread_mutex_t purger_lock;
+  /* last low water mark checked */
+  LSN last_lsn_checked;
+  /**
+    Must be set to 0 under loghandler lock every time a new LSN
+    is generated.
+  */
+  my_bool is_everything_flushed;
+  /* True when flush pass is in progress */
+  my_bool flush_in_progress;
+  /* The flush number (used to distinguish two flushes going one by one) */
+  volatile int flush_no;
+  /* Next flush pass variables */
+  TRANSLOG_ADDRESS next_pass_max_lsn;
+  pthread_t max_lsn_requester;
+};
+
+/* The log handler state (single instance, private to this file) */
+static struct st_translog_descriptor log_descriptor;
+
+/*
+  Log handler settings: non-static variables, given their initial values
+  here
+*/
+ulong log_purge_type= TRANSLOG_PURGE_IMMIDIATE;
+ulong log_file_size= TRANSLOG_FILE_SIZE;
+/* sync() of log files directory mode */
+ulong sync_log_dir= TRANSLOG_SYNC_DIR_NEWFILE;
+ulong maria_group_commit= TRANSLOG_GCOMMIT_NONE;
+ulong maria_group_commit_interval= 0;
+
+/* Marker for end of log (END_OF_LOG expands to the address of end_of_log) */
+static uchar end_of_log= 0;
+#define END_OF_LOG &end_of_log
+/**
+  Switch for "soft" sync (no real sync() but periodical sync by service
+  thread)
+*/
+static volatile my_bool soft_sync= FALSE;
+/**
+  Switch for "hard" group commit mode
+*/
+static volatile my_bool hard_group_commit= FALSE;
+/**
+  Interval of file numbers which have to be sync()ed
+*/
+static uint32 soft_sync_min= 0;
+static uint32 soft_sync_max= 0;
+static uint32 soft_need_sync= 1;
+/**
+  stores interval in microseconds
+*/
+static uint32 group_commit_wait= 0;
+
+/* Current status of the log handler; starts as TRANSLOG_UNINITED */
+enum enum_translog_status translog_status= TRANSLOG_UNINITED;
+ulonglong translog_syncs= 0; /* Number of sync()s */
+
+/* time of last flush */
+static ulonglong flush_start= 0;
+
+/*
+  chunk types
+
+  The chunk type takes the two highest bits of the byte (mask
+  TRANSLOG_CHUNK_TYPE, 0xC0), the record type the six lowest bits (mask
+  TRANSLOG_REC_TYPE, 0x3F).
+*/
+#define TRANSLOG_CHUNK_LSN   0x00      /* 0 chunk refer as LSN (head or tail) */
+#define TRANSLOG_CHUNK_FIXED (1 << 6)  /* 1 (pseudo)fixed record (also LSN) */
+#define TRANSLOG_CHUNK_NOHDR (2 << 6)  /* 2 no head chunk (till page end) */
+#define TRANSLOG_CHUNK_LNGTH (3 << 6)  /* 3 chunk with chunk length */
+#define TRANSLOG_CHUNK_TYPE  (3 << 6)  /* Mask to get chunk type */
+#define TRANSLOG_REC_TYPE    0x3F      /* Mask to get record type */
+#define TRANSLOG_CHUNK_0_CONT 0x3F     /* the type to mark chunk 0 continue */
+
+/* compressed (relative) LSN constants */
+#define TRANSLOG_CLSN_LEN_BITS 0xC0    /* Mask to get compressed LSN length */
+
+
+#include <my_atomic.h>
+/* an array that maps id of a MARIA_SHARE to this MARIA_SHARE */
+static MARIA_SHARE **id_to_share= NULL;
+/* lock for id_to_share */
+static my_atomic_rwlock_t LOCK_id_to_share;
+
+static my_bool translog_dummy_callback(uchar *page,
+                                       pgcache_page_no_t page_no,
+                                       uchar* data_ptr);
+static my_bool translog_page_validator(uchar *page,
+                                       pgcache_page_no_t page_no,
+                                       uchar* data_ptr);
+
+static my_bool translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner);
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected);
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);
+
+
+/*
+  Table of log record type descriptors, indexed by record type.
+  It is filled in by translog_table_init().
+*/
+
+LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+
+
+#ifndef DBUG_OFF
+
+#define translog_buffer_lock_assert_owner(B) \
+  safe_mutex_assert_owner(&(B)->mutex)
+#define translog_lock_assert_owner() \
+  safe_mutex_assert_owner(&log_descriptor.bc.buffer->mutex)
+/*
+  Function form of translog_lock_assert_owner(): asserts (with
+  safe_mutex_assert_owner) on the mutex of the buffer the horizon cursor
+  log_descriptor.bc points to. Defined only when DBUG_OFF is not set.
+*/
+void translog_lock_handler_assert_owner()
+{
+  translog_lock_assert_owner();
+}
+
+/**
+  @brief Sanity check of the log record type descriptor table
+
+  Verifies (with DBUG_ASSERT) that entries 0..num are consistent with their
+  record class and that all entries after num are marked as not allowed.
+
+  @param num             index of the last filled record type
+*/
+
+static void check_translog_description_table(int num)
+{
+  int type;
+  const LOG_DESC *desc;
+  DBUG_ENTER("check_translog_description_table");
+  DBUG_PRINT("enter", ("last record: %d", num));
+  DBUG_ASSERT(num > 0);
+  /* last is reserved for extending the table */
+  DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1);
+  DBUG_ASSERT(log_record_type_descriptor[0].rclass == LOGRECTYPE_NOT_ALLOWED);
+
+  /* filled part of the table: every entry must match its class */
+  for (type= 0, desc= log_record_type_descriptor; type <= num; type++, desc++)
+  {
+    DBUG_PRINT("info",
+               ("record type: %d  class: %d  fixed: %u  header: %u  LSNs: %u  "
+                "name: %s",
+                type, desc->rclass,
+                (uint)desc->fixed_length,
+                (uint)desc->read_header_len,
+                (uint)desc->compressed_LSN,
+                desc->name));
+    switch (desc->rclass) {
+    case LOGRECTYPE_NOT_ALLOWED:
+      /* only the very first (reserved) type may be "not allowed" here */
+      DBUG_ASSERT(type == 0);
+      break;
+    case LOGRECTYPE_VARIABLE_LENGTH:
+      DBUG_ASSERT(desc->fixed_length == 0);
+      /* the header has to be long enough to hold the compressed LSNs */
+      DBUG_ASSERT((desc->compressed_LSN == 0) ||
+                  ((desc->compressed_LSN == 1) &&
+                   (desc->read_header_len >= LSN_STORE_SIZE)) ||
+                  ((desc->compressed_LSN == 2) &&
+                   (desc->read_header_len >= LSN_STORE_SIZE * 2)));
+      break;
+    case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+      DBUG_ASSERT(desc->fixed_length == desc->read_header_len);
+      DBUG_ASSERT(desc->compressed_LSN > 0);
+      DBUG_ASSERT(desc->compressed_LSN <= 2);
+      break;
+    case LOGRECTYPE_FIXEDLENGTH:
+      DBUG_ASSERT(desc->fixed_length == desc->read_header_len);
+      DBUG_ASSERT(desc->compressed_LSN == 0);
+      break;
+    default:
+      DBUG_ASSERT(0);
+    }
+  }
+
+  /* unused tail of the table: everything must be "not allowed" */
+  type= num + 1;
+  while (type < LOGREC_NUMBER_OF_TYPES)
+  {
+    DBUG_ASSERT(log_record_type_descriptor[type].rclass ==
+                LOGRECTYPE_NOT_ALLOWED);
+    type++;
+  }
+  DBUG_VOID_RETURN;
+}
+#else
+#define translog_buffer_lock_assert_owner(B) {}
+#define translog_lock_assert_owner() {}
+#endif
+
+/*
+  Initial values for the entries of log_record_type_descriptor[]: one
+  static LOG_DESC per record type. translog_table_init() copies them into
+  the table.
+
+  NOTE(review): LOG_DESC is declared outside of this part of the file.
+  Judging by the checks in check_translog_description_table() the first
+  three positional values appear to be rclass, fixed_length and
+  read_header_len, the 7th compressed_LSN and the 8th name -- confirm
+  against the declaration of the structure.
+*/
+static LOG_DESC INIT_LOGREC_RESERVED_FOR_CHUNKS23=
+{LOGRECTYPE_NOT_ALLOWED, 0, 0, NULL, NULL, NULL, 0,
+ "reserved", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL };
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_TAIL=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_new_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_NEW_ROW_TAIL=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_new_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INSERT_ROW_BLOBS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, FILEID_STORE_SIZE, NULL,
+ write_hook_for_redo, NULL, 0,
+ "redo_insert_row_blobs", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_HEAD=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_PURGE_ROW_TAIL=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_purge_row_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_FREE_BLOCKS=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGERANGE_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_free_blocks", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL=
+{LOGRECTYPE_FIXEDLENGTH,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_free_head_or_tail", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/* not yet used; for when we have versioning */
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_delete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+/** @todo RECOVERY BUG unused, remove? */
+static LOG_DESC INIT_LOGREC_REDO_UPDATE_ROW_HEAD=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_update_row_head", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0,
+ "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ NULL, write_hook_for_redo, NULL, 0,
+ "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW=
+{LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0,
+ "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CLR_END=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+ CLR_TYPE_STORE_SIZE, NULL, write_hook_for_clr_end, NULL, 1,
+ "clr_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PURGE_END=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, NULL, NULL, 1,
+ "purge_end", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_insert, NULL, 1,
+ "undo_row_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_delete, NULL, 1,
+ "undo_row_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+ NULL, write_hook_for_undo_row_update, NULL, 1,
+ "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
+ NULL, write_hook_for_undo_key_insert, NULL, 1,
+ "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+/* This will never be in the log, only in the clr */
+static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_undo_key, NULL, 1,
+ "undo_key_insert_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE,
+ NULL, write_hook_for_undo_key_delete, NULL, 1,
+ "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE,
+ NULL, write_hook_for_undo_key_delete, NULL, 1,
+ "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_PREPARE_WITH_UNDO_PURGE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, LSN_STORE_SIZE, NULL, NULL, NULL, 1,
+ "prepare_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT=
+{LOGRECTYPE_FIXEDLENGTH, 0, 0, NULL,
+ write_hook_for_commit, NULL, 0, "commit", LOGREC_IS_GROUP_ITSELF, NULL,
+ NULL};
+
+static LOG_DESC INIT_LOGREC_COMMIT_WITH_UNDO_PURGE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 5, 5, NULL, write_hook_for_commit, NULL, 1,
+ "commit_with_undo_purge", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_CHECKPOINT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "checkpoint", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_CREATE_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 1 + 2, NULL, NULL, NULL, 0,
+"redo_create_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_RENAME_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_rename_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DROP_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "redo_drop_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_DELETE_ALL=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
+ NULL, write_hook_for_redo_delete_all, NULL, 0,
+ "redo_delete_all", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_REDO_REPAIR_TABLE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + 8 + 8, FILEID_STORE_SIZE + 8 + 8,
+ NULL, NULL, NULL, 0,
+ "redo_repair_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FILE_ID=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 2, NULL, write_hook_for_file_id, NULL, 0,
+ "file_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_LONG_TRANSACTION_ID=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "long_transaction_id", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_INCOMPLETE_LOG=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE, FILEID_STORE_SIZE,
+ NULL, NULL, NULL, 0,
+ "incomplete_log", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_INCOMPLETE_GROUP=
+{LOGRECTYPE_FIXEDLENGTH, 0, 0,
+ NULL, NULL, NULL, 0,
+ "incomplete_group", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_UNDO_BULK_INSERT=
+{LOGRECTYPE_VARIABLE_LENGTH, 0,
+ LSN_STORE_SIZE + FILEID_STORE_SIZE,
+ NULL, write_hook_for_undo_bulk_insert, NULL, 1,
+ "undo_bulk_insert", LOGREC_LAST_IN_GROUP, NULL, NULL};
+
+/* NOTE(review): the name string below differs from the type name */
+static LOG_DESC INIT_LOGREC_REDO_BITMAP_NEW_PAGE=
+{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2,
+ NULL, NULL, NULL, 0,
+ "redo_create_bitmap", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_IMPORTED_TABLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "imported_table", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_DEBUG_INFO=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0,
+ "info", LOGREC_IS_GROUP_ITSELF, NULL, NULL};
+
+const myf log_write_flags= MY_WME | MY_NABP | MY_WAIT_IF_FULL;
+
+void translog_table_init()
+{
+  int i;
+  log_record_type_descriptor[LOGREC_RESERVED_FOR_CHUNKS23]=
+    INIT_LOGREC_RESERVED_FOR_CHUNKS23;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_HEAD]=
+    INIT_LOGREC_REDO_INSERT_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_TAIL]=
+    INIT_LOGREC_REDO_INSERT_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_HEAD]=
+    INIT_LOGREC_REDO_NEW_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_NEW_ROW_TAIL]=
+    INIT_LOGREC_REDO_NEW_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_INSERT_ROW_BLOBS]=
+    INIT_LOGREC_REDO_INSERT_ROW_BLOBS;
+  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_HEAD]=
+    INIT_LOGREC_REDO_PURGE_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_PURGE_ROW_TAIL]=
+    INIT_LOGREC_REDO_PURGE_ROW_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_FREE_BLOCKS]=
+    INIT_LOGREC_REDO_FREE_BLOCKS;
+  log_record_type_descriptor[LOGREC_REDO_FREE_HEAD_OR_TAIL]=
+    INIT_LOGREC_REDO_FREE_HEAD_OR_TAIL;
+  log_record_type_descriptor[LOGREC_REDO_DELETE_ROW]=
+    INIT_LOGREC_REDO_DELETE_ROW;
+  log_record_type_descriptor[LOGREC_REDO_UPDATE_ROW_HEAD]=
+    INIT_LOGREC_REDO_UPDATE_ROW_HEAD;
+  log_record_type_descriptor[LOGREC_REDO_INDEX]=
+    INIT_LOGREC_REDO_INDEX;
+  log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]=
+    INIT_LOGREC_REDO_INDEX_NEW_PAGE;
+  log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]=
+    INIT_LOGREC_REDO_INDEX_FREE_PAGE;
+  log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]=
+    INIT_LOGREC_REDO_UNDELETE_ROW;
+  log_record_type_descriptor[LOGREC_CLR_END]=
+    INIT_LOGREC_CLR_END;
+  log_record_type_descriptor[LOGREC_PURGE_END]=
+    INIT_LOGREC_PURGE_END;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_INSERT]=
+    INIT_LOGREC_UNDO_ROW_INSERT;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_DELETE]=
+    INIT_LOGREC_UNDO_ROW_DELETE;
+  log_record_type_descriptor[LOGREC_UNDO_ROW_UPDATE]=
+    INIT_LOGREC_UNDO_ROW_UPDATE;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT]=
+    INIT_LOGREC_UNDO_KEY_INSERT;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_INSERT_WITH_ROOT]=
+    INIT_LOGREC_UNDO_KEY_INSERT_WITH_ROOT;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]=
+    INIT_LOGREC_UNDO_KEY_DELETE;
+  log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]=
+    INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT;
+  log_record_type_descriptor[LOGREC_PREPARE]=
+    INIT_LOGREC_PREPARE;
+  log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]=
+    INIT_LOGREC_PREPARE_WITH_UNDO_PURGE;
+  log_record_type_descriptor[LOGREC_COMMIT]=
+    INIT_LOGREC_COMMIT;
+  log_record_type_descriptor[LOGREC_COMMIT_WITH_UNDO_PURGE]=
+    INIT_LOGREC_COMMIT_WITH_UNDO_PURGE;
+  log_record_type_descriptor[LOGREC_CHECKPOINT]=
+    INIT_LOGREC_CHECKPOINT;
+  log_record_type_descriptor[LOGREC_REDO_CREATE_TABLE]=
+    INIT_LOGREC_REDO_CREATE_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_RENAME_TABLE]=
+    INIT_LOGREC_REDO_RENAME_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_DROP_TABLE]=
+    INIT_LOGREC_REDO_DROP_TABLE;
+  log_record_type_descriptor[LOGREC_REDO_DELETE_ALL]=
+    INIT_LOGREC_REDO_DELETE_ALL;
+  log_record_type_descriptor[LOGREC_REDO_REPAIR_TABLE]=
+    INIT_LOGREC_REDO_REPAIR_TABLE;
+  log_record_type_descriptor[LOGREC_FILE_ID]=
+    INIT_LOGREC_FILE_ID;
+  log_record_type_descriptor[LOGREC_LONG_TRANSACTION_ID]=
+    INIT_LOGREC_LONG_TRANSACTION_ID;
+  log_record_type_descriptor[LOGREC_INCOMPLETE_LOG]=
+    INIT_LOGREC_INCOMPLETE_LOG;
+  log_record_type_descriptor[LOGREC_INCOMPLETE_GROUP]=
+    INIT_LOGREC_INCOMPLETE_GROUP;
+  log_record_type_descriptor[LOGREC_UNDO_BULK_INSERT]=
+    INIT_LOGREC_UNDO_BULK_INSERT;
+  log_record_type_descriptor[LOGREC_REDO_BITMAP_NEW_PAGE]=
+    INIT_LOGREC_REDO_BITMAP_NEW_PAGE;
+  log_record_type_descriptor[LOGREC_IMPORTED_TABLE]=
+    INIT_LOGREC_IMPORTED_TABLE;
+  log_record_type_descriptor[LOGREC_DEBUG_INFO]=
+    INIT_LOGREC_DEBUG_INFO;
+
+  for (i= LOGREC_FIRST_FREE; i < LOGREC_NUMBER_OF_TYPES; i++)
+    log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
+#ifndef DBUG_OFF
+  check_translog_description_table(LOGREC_FIRST_FREE -1);
+#endif
+}
+
+
+/* all possible flags page overheads */
+static uint page_overhead[TRANSLOG_FLAGS_NUM];
+
+/*
+  NOTE(review): presumably the data handed to translog_page_validator()
+  through its data_ptr argument; the users of this structure are outside
+  of this part of the file -- confirm.
+*/
+typedef struct st_translog_validator_data
+{
+  /* pointer to a log address */
+  TRANSLOG_ADDRESS *addr;
+  /* flag; NOTE(review): by its name, set when the page was recovered */
+  my_bool was_recovered;
+} TRANSLOG_VALIDATOR_DATA;
+
+
+/**
+  @brief Check cursor/buffer consistency
+
+  The function consists of DBUG_ASSERTs only, which is why the
+  parameter is marked as unused.
+
+  @param cursor          cursor which will be checked
+*/
+
+static void translog_check_cursor(struct st_buffer_cursor *cursor
+                                 __attribute__((unused)))
+{
+  /* unless it is a chaser, the cursor stands at the end of the buffer data */
+  DBUG_ASSERT(cursor->chaser ||
+              ((ulong) (cursor->ptr - cursor->buffer->buffer) ==
+               cursor->buffer->size));
+  /* the cursor and its buffer agree about the buffer number */
+  DBUG_ASSERT(cursor->buffer->buffer_no == cursor->buffer_no);
+  /* the offset inside the page matches the recorded fill of the page */
+  DBUG_ASSERT((cursor->ptr -cursor->buffer->buffer) %TRANSLOG_PAGE_SIZE ==
+              cursor->current_page_fill % TRANSLOG_PAGE_SIZE);
+  /* a page can't be filled with more than its size */
+  DBUG_ASSERT(cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+}
+
+
+/**
+  @brief Switch the loghandler to read only mode in case of write error
+
+  If the log handler is shutting down its status becomes TRANSLOG_UNINITED,
+  otherwise TRANSLOG_READONLY. The log is marked as completely flushed and
+  files will be opened read only from now on.
+
+  @note The function ends with DBUG_ASSERT(0).
+*/
+
+void translog_stop_writing()
+{
+  DBUG_ENTER("translog_stop_writing");
+  DBUG_PRINT("error", ("errno: %d   my_errno: %d", errno, my_errno));
+  if (translog_status == TRANSLOG_SHUTDOWN)
+    translog_status= TRANSLOG_UNINITED;
+  else
+    translog_status= TRANSLOG_READONLY;
+  log_descriptor.open_flags= O_BINARY | O_RDONLY;
+  log_descriptor.is_everything_flushed= 1;
+  DBUG_ASSERT(0);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Get file name of the log by log number
+
+  The name is built as <log directory>aria_log.NNNNNNNN: the template
+  "aria_log.0000000" is appended to the directory and then the decimal
+  file number is written so that its last digit takes the place of the
+  terminating zero of the template.
+
+  NOTE(review): a number of more than 8 decimal digits would start before
+  the zeros of the template, while the assertion below allows numbers up
+  to 0xfffffff.
+
+  @param file_no         Number of the log we want to open
+  @param path            Pointer to buffer where file name will be
+                         stored (must be FN_REFLEN bytes at least)
+
+  @return pointer to path
+*/
+
+char *translog_filename_by_fileno(uint32 file_no, char *path)
+{
+  char digits[11];
+  char *tail;
+  uint digits_len;
+  DBUG_ENTER("translog_filename_by_fileno");
+  DBUG_ASSERT(file_no <= 0xfffffff);
+
+  /* log_descriptor.directory is already formatted */
+  tail= strxmov(path, log_descriptor.directory, "aria_log.0000000", NullS);
+  digits_len= (uint) (int10_to_str(file_no, digits, 10) - digits);
+  /* right-align the number: its last digit lands where the name ended */
+  strmov(tail - digits_len + 1, digits);
+
+  DBUG_PRINT("info", ("Path: '%s'  path: 0x%lx", path, (ulong) path));
+  DBUG_RETURN(path);
+}
+
+
+/**
+  @brief Create log file with given number without cache
+
+  The file is created with my_create(); if sync_log_dir is at least
+  TRANSLOG_SYNC_DIR_NEWFILE the log directory is synced as well. If the
+  creation or the directory sync fails the log handler is switched to
+  read only mode with translog_stop_writing().
+
+  @param file_no         Number of the log we want to open
+
+  @retval -1  error (also when translog_status is not TRANSLOG_OK)
+  @retval #   file descriptor number
+*/
+
+static File create_logfile_by_number_no_cache(uint32 file_no)
+{
+  File file;
+  char path[FN_REFLEN];
+  DBUG_ENTER("create_logfile_by_number_no_cache");
+
+  if (translog_status != TRANSLOG_OK)
+     DBUG_RETURN(-1);
+
+  /* TODO: add O_DIRECT to open flags (when buffer is aligned) */
+  if ((file= my_create(translog_filename_by_fileno(file_no, path),
+                       0, O_BINARY | O_RDWR, MYF(MY_WME))) < 0)
+  {
+    DBUG_PRINT("error", ("Error %d during creating file '%s'", errno, path));
+    translog_stop_writing();
+    DBUG_RETURN(-1);
+  }
+  if (sync_log_dir >= TRANSLOG_SYNC_DIR_NEWFILE &&
+      sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD)))
+  {
+    int save_errno;
+    DBUG_PRINT("error", ("Error %d during syncing directory '%s'",
+                         errno, log_descriptor.directory));
+    translog_stop_writing();
+    /*
+      The caller gets -1 and so never sees the descriptor: close it here
+      to avoid leaking it. Keep my_errno of the failed sync for the caller.
+    */
+    save_errno= my_errno;
+    (void) my_close(file, MYF(0));
+    my_errno= save_errno;
+    DBUG_RETURN(-1);
+  }
+  DBUG_PRINT("info", ("File: '%s'  handler: %d", path, file));
+  DBUG_RETURN(file);
+}
+
+/**
+  @brief Open (not create) log file with given number without cache
+
+  The file is opened with the current open flags of the log handler
+  (log_descriptor.open_flags).
+
+  @param file_no         Number of the log we want to open
+
+  @retval -1  error
+  @retval #   file descriptor number
+*/
+
+static File open_logfile_by_number_no_cache(uint32 file_no)
+{
+  char path[FN_REFLEN];
+  File fd;
+  DBUG_ENTER("open_logfile_by_number_no_cache");
+
+  /* fills "path" with the name of the file */
+  translog_filename_by_fileno(file_no, path);
+
+  /* TODO: add O_DIRECT to open flags (when buffer is aligned) */
+  /* TODO: use my_create() */
+  fd= my_open(path, log_descriptor.open_flags, MYF(MY_WME));
+  if (fd < 0)
+  {
+    DBUG_PRINT("error", ("Error %d during opening file '%s'", errno, path));
+    DBUG_RETURN(-1);
+  }
+  DBUG_PRINT("info", ("File: '%s'  handler: %d", path, fd));
+  DBUG_RETURN(fd);
+}
+
+
+/**
+  @brief get file descriptor by given number using cache
+
+  The array of opened files is indexed from the newest file backwards:
+  the file number N is found at position (max_file - N). The lookup is
+  done under the read lock of the array.
+
+  @param file_no         Number of the log we want to open
+
+  @retval #    file descriptor
+  @retval NULL file is not opened
+*/
+
+static TRANSLOG_FILE *get_logfile_by_number(uint32 file_no)
+{
+  TRANSLOG_FILE *found;
+  uint32 position;
+  DBUG_ENTER("get_logfile_by_number");
+  rw_rdlock(&log_descriptor.open_files_lock);
+  /* unsigned arithmetic: file_no > max_file gives a huge position */
+  position= log_descriptor.max_file - file_no;
+  if (position >= log_descriptor.open_files.elements)
+  {
+    DBUG_PRINT("info", ("File #%u is not opened", file_no));
+    rw_unlock(&log_descriptor.open_files_lock);
+    DBUG_RETURN(NULL);
+  }
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  DBUG_ASSERT(log_descriptor.max_file >= file_no);
+  DBUG_ASSERT(log_descriptor.min_file <= file_no);
+
+  found= *dynamic_element(&log_descriptor.open_files,
+                          position, TRANSLOG_FILE **);
+  rw_unlock(&log_descriptor.open_files_lock);
+  DBUG_PRINT("info", ("File 0x%lx File no: %lu, File handler: %d",
+                      (ulong)found, (ulong)file_no,
+                      (found ? found->handler.file : -1)));
+  DBUG_ASSERT(!found || found->number == file_no);
+  DBUG_RETURN(found);
+}
+
+
+/**
+  @brief get current file descriptor
+
+  The current file is the first element of the array of opened files;
+  it is read under the read lock of the array.
+
+  @retval # file descriptor
+*/
+
+static TRANSLOG_FILE *get_current_logfile()
+{
+  TRANSLOG_FILE *current;
+  DBUG_ENTER("get_current_logfile");
+
+  rw_rdlock(&log_descriptor.open_files_lock);
+  DBUG_PRINT("info", ("max_file: %lu  min_file: %lu  open_files: %lu",
+                      (ulong) log_descriptor.max_file,
+                      (ulong) log_descriptor.min_file,
+                      (ulong) log_descriptor.open_files.elements));
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  current= *dynamic_element(&log_descriptor.open_files, 0, TRANSLOG_FILE **);
+  rw_unlock(&log_descriptor.open_files_lock);
+
+  DBUG_RETURN(current);
+}
+
+/* Magic sequence which starts the header page of every log file */
+uchar	NEAR maria_trans_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 11, '\001', 'M', 'A', 'R', 'I', 'A',
+ 'L', 'O', 'G' };
+/*
+  Size of the data in the log file header, in the order in which
+  translog_write_file_header() stores it: magic, timestamp (8),
+  loghandler version (4), server version (4), server id (4),
+  page size - 1 (2), file number (3) and max LSN of the file.
+*/
+#define LOG_HEADER_DATA_SIZE (sizeof(maria_trans_file_magic) + \
+                              8 + 4 + 4 + 4 + 2 + 3 + \
+                              LSN_STORE_SIZE)
+
+
+/**
+  @brief Write the header page into the just opened new log file
+
+  The header consists of the file magic, the current time, the loghandler
+  version, the server version, the server id, the page size (stored as
+  size - 1), the file number (taken from the horizon) and the place for the
+  max LSN of the file (LSN_IMPOSSIBLE for now). The rest of the page is
+  filled with TRANSLOG_FILLER.
+
+  @note The first page is just a marker page; we don't store any real log
+  data in it.
+
+  @retval 0 OK
+  @retval 1 ERROR
+*/
+
+static my_bool translog_write_file_header()
+{
+  TRANSLOG_FILE *log_file;
+  ulonglong now;
+  uchar page_buff[TRANSLOG_PAGE_SIZE];
+  uchar *pos= page_buff;
+  my_bool rc;
+  DBUG_ENTER("translog_write_file_header");
+
+  /* file tag */
+  memcpy(pos, maria_trans_file_magic, sizeof(maria_trans_file_magic));
+  pos+= sizeof(maria_trans_file_magic);
+
+  /* timestamp */
+  now= my_getsystime();
+  int8store(pos, now);
+  pos+= 8;
+
+  /* maria version */
+  int4store(pos, TRANSLOG_VERSION_ID);
+  pos+= 4;
+
+  /* mysql version (MYSQL_VERSION_ID) */
+  int4store(pos, log_descriptor.server_version);
+  pos+= 4;
+
+  /* server ID */
+  int4store(pos, log_descriptor.server_id);
+  pos+= 4;
+
+  /* loghandler page_size */
+  int2store(pos, TRANSLOG_PAGE_SIZE - 1);
+  pos+= 2;
+
+  /* file number */
+  int3store(pos, LSN_FILE_NO(log_descriptor.horizon));
+  pos+= 3;
+
+  /* max LSN of the file: not known yet */
+  lsn_store(pos, LSN_IMPOSSIBLE);
+  pos+= LSN_STORE_SIZE;
+
+  /* fill the unused rest of the page */
+  memset(pos, TRANSLOG_FILLER, sizeof(page_buff) - (pos - page_buff));
+
+  log_file= get_current_logfile();
+  rc= 0;
+  if (my_pwrite(log_file->handler.file, page_buff, sizeof(page_buff), 0,
+                log_write_flags) != 0)
+    rc= 1;
+  /*
+    Dropping the flag in this way can give a false alarm (signalling that
+    the file is not synced when it is), but the situation is quite rare and
+    protection with mutexes would give much more overhead to the whole engine
+  */
+  log_file->is_sync= 0;
+  DBUG_RETURN(rc);
+}
+
+/**
+  @brief write the new LSN on the given file header
+
+  The LSN takes the last LSN_STORE_SIZE bytes of the header data
+  (LOG_HEADER_DATA_SIZE); after a successful write the file is synced.
+
+  @param file            The file descriptor
+  @param lsn             That LSN which should be written
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_max_lsn_to_header(File file, LSN lsn)
+{
+  uchar lsn_buff[LSN_STORE_SIZE];
+  my_bool rc= 1;
+  DBUG_ENTER("translog_max_lsn_to_header");
+  DBUG_PRINT("enter", ("File descriptor: %ld  lsn: (%lu,0x%lx)",
+                       (long) file,
+                       LSN_IN_PARTS(lsn)));
+
+  lsn_store(lsn_buff, lsn);
+
+  /* the sync is attempted only if the write succeeded */
+  if (my_pwrite(file, lsn_buff,
+                LSN_STORE_SIZE,
+                (LOG_HEADER_DATA_SIZE - LSN_STORE_SIZE),
+                log_write_flags) == 0 &&
+      my_sync(file, MYF(MY_WME)) == 0)
+    rc= 0;
+  /*
+    We should not increase the counter in case of an error above, but it is
+    so unlikely that we can ignore this case
+  */
+  translog_syncs++;
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Information from transaction log file header.
+
+  Filled by translog_interpret_file_header() from the raw bytes of the
+  first page of a log file.
+*/
+
+typedef struct st_loghandler_file_info
+{
+  /*
+    LSN_IMPOSSIBLE for current file (not finished file).
+    Otherwise: maximum LSN of the records which have parts stored in
+    the file.
+  */
+  LSN max_lsn;
+  ulonglong timestamp;   /* Time stamp (8 bytes in the header) */
+  ulong maria_version;   /* Version of maria loghandler */
+  ulong mysql_version;   /* Version of mysql server */
+  ulong server_id;       /* Server ID */
+  ulong page_size;       /* Loghandler page size (stored value + 1) */
+  ulong file_number;     /* Number of the file (from the file header) */
+} LOGHANDLER_FILE_INFO;
+
+/*
+  @brief Extract header file information from a loghandler file page
+
+  The fields are decoded in exactly the order and with exactly the widths
+  used by translog_write_file_header(): magic tag, timestamp (8),
+  loghandler version (4), server version (4), server id (4),
+  page size - 1 (2), file number (3), max LSN (LSN_STORE_SIZE).
+
+  @param desc header information descriptor to be filled with information
+  @param page_buff buffer with the page content; must hold at least the
+                   header fields listed above
+*/
+
+static void translog_interpret_file_header(LOGHANDLER_FILE_INFO *desc,
+                                           uchar *page_buff)
+{
+  uchar *ptr;
+
+  /* Skip the magic tag, it is not stored in the descriptor */
+  ptr= page_buff + sizeof(maria_trans_file_magic);
+  desc->timestamp= uint8korr(ptr);
+  ptr+= 8;
+  desc->maria_version= uint4korr(ptr);
+  ptr+= 4;
+  desc->mysql_version= uint4korr(ptr);
+  ptr+= 4;
+  /* Server id directly follows the server version (no gap in between) */
+  desc->server_id= uint4korr(ptr);
+  ptr+= 4;
+  /* The writer stores the page size decremented by one */
+  desc->page_size= uint2korr(ptr) + 1;
+  ptr+= 2;
+  desc->file_number= uint3korr(ptr);
+  ptr+=3;
+  desc->max_lsn= lsn_korr(ptr);
+}
+
+
+/*
+  @brief Read the header of a loghandler file and decode it
+
+  Reads LOG_HEADER_DATA_SIZE bytes from the start of the file and passes
+  them to translog_interpret_file_header().
+
+  @param desc header information descriptor to be filled with information
+  @param file file descriptor to read
+
+  @retval 0 OK
+  @retval 1 Error (the header could not be read; desc is left untouched)
+*/
+
+my_bool translog_read_file_header(LOGHANDLER_FILE_INFO *desc, File file)
+{
+  uchar header[LOG_HEADER_DATA_SIZE];
+  DBUG_ENTER("translog_read_file_header");
+
+  if (my_pread(file, header, sizeof(header), 0,
+               MYF(MY_FNABP | MY_WME)) != 0)
+  {
+    DBUG_PRINT("info", ("log read fail error: %d", my_errno));
+    DBUG_RETURN(1);
+  }
+
+  translog_interpret_file_header(desc, header);
+  DBUG_PRINT("info", ("timestamp: %llu  aria ver: %lu mysql ver: %lu  "
+                      "server id %lu page size %lu file number %lu  "
+                      "max lsn: (%lu,0x%lx)",
+                      (ulonglong) desc->timestamp,
+                      (ulong) desc->maria_version,
+                      (ulong) desc->mysql_version,
+                      (ulong) desc->server_id,
+                      desc->page_size, (ulong) desc->file_number,
+                      LSN_IN_PARTS(desc->max_lsn)));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  @brief set the lsn to the files from_file - to_file if it is greater
+  than the one written in the file
+
+  For the current (unfinished) file only log_descriptor.max_lsn is
+  updated; for every other file in the range the header is read and, if
+  the given lsn is greater than the stored one, rewritten.
+
+  @param from_file       first file number (min)
+  @param to_file         last file number (max)
+  @param lsn             the lsn for writing
+  @param is_locked       true if current thread locked the log handler
+
+  @note On error translog_stop_writing() is called and
+  log_descriptor.file_header_lock is released before returning.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_set_lsn_for_files(uint32 from_file, uint32 to_file,
+                                          LSN lsn, my_bool is_locked)
+{
+  uint32 file;
+  DBUG_ENTER("translog_set_lsn_for_files");
+  DBUG_PRINT("enter", ("From: %lu  to: %lu  lsn: (%lu,0x%lx)  locked: %d",
+                       (ulong) from_file, (ulong) to_file,
+                       LSN_IN_PARTS(lsn),
+                       is_locked));
+  DBUG_ASSERT(from_file <= to_file);
+  DBUG_ASSERT(from_file > 0); /* we have not file 0 */
+
+  /* Checks the current file (not finished yet file) */
+  if (!is_locked)
+    translog_lock();
+  if (to_file == (uint32) LSN_FILE_NO(log_descriptor.horizon))
+  {
+    if (likely(cmp_translog_addr(lsn, log_descriptor.max_lsn) > 0))
+      log_descriptor.max_lsn= lsn;
+    /* The current file is handled in memory only; skip it in the loop */
+    to_file--;
+  }
+  if (!is_locked)
+    translog_unlock();
+
+  /* Checks finished files if they are */
+  pthread_mutex_lock(&log_descriptor.file_header_lock);
+  for (file= from_file; file <= to_file; file++)
+  {
+    LOGHANDLER_FILE_INFO info;
+    File fd;
+    LINT_INIT(info.max_lsn);
+
+    fd= open_logfile_by_number_no_cache(file);
+    /*
+      Note the bitwise '|' before my_close(): the file must be closed
+      even if reading or updating the header failed, so the close call
+      must not be short-circuited away.
+    */
+    if ((fd < 0) ||
+        ((translog_read_file_header(&info, fd) ||
+          (cmp_translog_addr(lsn, info.max_lsn) > 0 &&
+           translog_max_lsn_to_header(fd, lsn))) |
+          my_close(fd, MYF(MY_WME))))
+    {
+      translog_stop_writing();
+      /* Do not leave with the header lock held */
+      pthread_mutex_unlock(&log_descriptor.file_header_lock);
+      DBUG_RETURN(1);
+    }
+  }
+  pthread_mutex_unlock(&log_descriptor.file_header_lock);
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Descriptor of a file in log_descriptor.unfinished_files.
+  One element exists per file that has writes in progress; it is removed
+  by translog_mark_file_finished() when the counter drops to zero.
+*/
+struct st_file_counter
+{
+  uint32 file;            /* file number */
+  uint32 counter;         /* counter for started writes */
+};
+
+
+/*
+  @brief mark file "in progress" (for multi-group records)
+
+  log_descriptor.unfinished_files is kept ordered by ascending file
+  number (translog_get_file_max_lsn_stored() relies on element 0 being
+  the minimum). If the file is already present its counter is increased,
+  otherwise a new element with counter 1 is inserted at the position
+  that keeps the order.
+
+  @param file            log file number
+
+  @note If the array can't be extended (insert_dynamic() failure) the
+  array is left unchanged; the function has no way to report it.
+*/
+
+static void translog_mark_file_unfinished(uint32 file)
+{
+  int place, i;
+  struct st_file_counter fc, *fc_ptr;
+
+  DBUG_ENTER("translog_mark_file_unfinished");
+  DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+  fc.file= file; fc.counter= 1;
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+  if (log_descriptor.unfinished_files.elements == 0)
+  {
+    insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc);
+    DBUG_PRINT("info", ("The first element inserted"));
+    goto end;
+  }
+
+  /*
+    Search from the end for the last element with file number <= file.
+    place == -1 after the loop means that all elements are greater.
+  */
+  for (place= log_descriptor.unfinished_files.elements - 1;
+       place >= 0;
+       place--)
+  {
+    fc_ptr= dynamic_element(&log_descriptor.unfinished_files,
+                            place, struct st_file_counter *);
+    if (fc_ptr->file <= file)
+      break;
+  }
+
+  if (place >= 0 && fc_ptr->file == file)
+  {
+     fc_ptr->counter++;
+     DBUG_PRINT("info", ("counter increased"));
+     goto end;
+  }
+
+  /*
+    Extend the array by one element. The local copy is appended (and not
+    an element of the array itself) because insert_dynamic() may
+    reallocate the buffer, which would invalidate a pointer into it.
+  */
+  if (insert_dynamic(&log_descriptor.unfinished_files, (uchar*) &fc))
+    goto end;
+
+  if (place == (int) log_descriptor.unfinished_files.elements - 2)
+  {
+    /* The new element belongs after the old last one: it is in place */
+    DBUG_PRINT("info", ("The last element inserted"));
+    goto end;
+  }
+
+  /*
+    Shift elements place + 1 .. (old last) one position up, going from
+    the end so that nothing is overwritten before it is copied, then
+    put the new element into the freed slot.
+  */
+  for (i= log_descriptor.unfinished_files.elements - 1; i > place + 1; i--)
+  {
+    /* we do not use set_dynamic() to avoid unneeded checks */
+    memcpy(dynamic_element(&log_descriptor.unfinished_files,
+                           i, struct st_file_counter *),
+           dynamic_element(&log_descriptor.unfinished_files,
+                           i - 1, struct st_file_counter *),
+           sizeof(struct st_file_counter));
+  }
+  memcpy(dynamic_element(&log_descriptor.unfinished_files,
+                         place + 1, struct st_file_counter *),
+         &fc, sizeof(struct st_file_counter));
+end:
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief remove file mark "in progress" (for multi-group records)
+
+  Decrements the counter of the file in log_descriptor.unfinished_files
+  and removes the element when the counter reaches zero. The file is
+  expected to be present in the array (asserted in debug builds).
+
+  @param file            log file number
+*/
+
+static void translog_mark_file_finished(uint32 file)
+{
+  int idx;
+  struct st_file_counter *entry;
+  DBUG_ENTER("translog_mark_file_finished");
+  DBUG_PRINT("enter", ("file: %lu", (ulong) file));
+
+  LINT_INIT(entry);
+
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+
+  DBUG_ASSERT(log_descriptor.unfinished_files.elements > 0);
+
+  /* Linear search for the element describing this file */
+  idx= 0;
+  while (idx < (int) log_descriptor.unfinished_files.elements)
+  {
+    entry= dynamic_element(&log_descriptor.unfinished_files,
+                           idx, struct st_file_counter *);
+    if (entry->file == file)
+      break;
+    idx++;
+  }
+  DBUG_ASSERT(idx < (int) log_descriptor.unfinished_files.elements);
+
+  /* Drop the element together with its last reference */
+  entry->counter--;
+  if (entry->counter == 0)
+    delete_dynamic_element(&log_descriptor.unfinished_files, idx);
+
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief get max LSN of the record which parts stored in this file
+
+  A file is treated as still "in progress" if its number is not less
+  than the smallest file number in log_descriptor.unfinished_files or,
+  when that array is empty, not less than the file of the current
+  horizon. For other files the value is read from the file header.
+
+  @param file            file number
+
+  @return requested LSN or LSN_IMPOSSIBLE/LSN_ERROR
+    @retval LSN_IMPOSSIBLE File is still not finished
+    @retval LSN_ERROR Error opening or reading the file
+    @retval # LSN of the record which parts stored in this file
+*/
+
+LSN translog_get_file_max_lsn_stored(uint32 file)
+{
+  uint32 first_unfinished= FILENO_IMPOSSIBLE;
+  LOGHANDLER_FILE_INFO header;
+  File fd;
+  DBUG_ENTER("translog_get_file_max_lsn_stored");
+  DBUG_PRINT("enter", ("file: %lu", (ulong)file));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  /* Element 0 of unfinished_files holds the minimal number in progress */
+  pthread_mutex_lock(&log_descriptor.unfinished_files_lock);
+  if (log_descriptor.unfinished_files.elements > 0)
+  {
+    struct st_file_counter *lowest;
+    lowest= dynamic_element(&log_descriptor.unfinished_files,
+                            0, struct st_file_counter *);
+    first_unfinished= lowest->file;
+  }
+  pthread_mutex_unlock(&log_descriptor.unfinished_files_lock);
+
+  /*
+    Nothing is marked as "in progress": then the file of the horizon is
+    the one which is unfinished for sure.
+  */
+  if (first_unfinished == FILENO_IMPOSSIBLE)
+  {
+    TRANSLOG_ADDRESS horizon= translog_get_horizon();
+    first_unfinished= LSN_FILE_NO(horizon);
+  }
+
+  if (file >= first_unfinished)
+  {
+    DBUG_PRINT("info", ("The file in in progress"));
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+  }
+
+  LINT_INIT_STRUCT(header);
+  fd= open_logfile_by_number_no_cache(file);
+  /* Bitwise '|': the file is closed regardless of the read result */
+  if ((fd < 0) ||
+      (translog_read_file_header(&header, fd) | my_close(fd, MYF(MY_WME))))
+  {
+    DBUG_PRINT("error", ("Can't read file header"));
+    DBUG_RETURN(LSN_ERROR);
+  }
+  DBUG_PRINT("info", ("Max lsn: (%lu,0x%lx)",
+                       LSN_IN_PARTS(header.max_lsn)));
+  DBUG_RETURN(header.max_lsn);
+}
+
+/*
+  Initialize transaction log file buffer
+
+  SYNOPSIS
+    translog_buffer_init()
+    buffer               The buffer to initialize
+    num                  Number of this buffer
+
+  NOTE
+    On failure the buffer is left partially initialized: the function
+    returns as soon as a condition variable or the mutex can't be
+    created.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_buffer_init(struct st_translog_buffer *buffer, int num)
+{
+  DBUG_ENTER("translog_buffer_init");
+  buffer->last_lsn= LSN_IMPOSSIBLE;
+  buffer->prev_last_lsn= LSN_IMPOSSIBLE;
+  buffer->pre_force_close_horizon= LSN_IMPOSSIBLE;
+  DBUG_PRINT("info", ("last_lsn  and prev_last_lsn set to 0  buffer: 0x%lx",
+                      (ulong) buffer));
+
+  buffer->buffer_no= (uint8) num;
+  /* No file is assigned to the buffer yet */
+  buffer->file= NULL;
+  buffer->overlay= 0;
+  /* Nothing is stored in the buffer yet */
+  buffer->size= 0;
+  buffer->skipped_data= 0;
+  /* Cache for the current log: start from a page full of filler */
+  memset(buffer->buffer, TRANSLOG_FILLER, TRANSLOG_WRITE_BUFFER);
+
+  /* Condition for threads which wait for the buffer to be filled */
+  if (pthread_cond_init(&buffer->waiting_filling_buffer, 0) != 0)
+    DBUG_RETURN(1);
+
+  /* List of threads waiting for the buffer to be flushed */
+  buffer->waiting_flush= 0;
+  /* Number of records which are being copied to the buffer */
+  buffer->copy_to_buffer_in_progress= 0;
+
+  /*
+    Every buffer is protected by its own mutex. The buffers form a logical
+    ring (the first one follows the last one), and this makes the deadlock
+    detector raise false alarms. A real deadlock is impossible here: all
+    locking is concentrated around the current buffer, the only exception
+    being the flushing thread (and there is only one such thread), and no
+    thread takes more than 2 buffer locks at once.
+
+    To avoid the false alarms the deadlock detection is switched off for
+    one buffer in the middle of the chain. Excluding only one of eight
+    buffers can hardly hide other possible problems with these mutexes.
+  */
+  if (my_pthread_mutex_init(&buffer->mutex, MY_MUTEX_INIT_FAST,
+                            "translog_buffer->mutex",
+                            (num == TRANSLOG_BUFFERS_NO - 2 ?
+                             MYF_NO_DEADLOCK_DETECTION : 0)) != 0 ||
+      pthread_cond_init(&buffer->prev_sent_to_disk_cond, 0) != 0)
+    DBUG_RETURN(1);
+
+  buffer->ver= 0;
+  buffer->is_closing_buffer= 0;
+  buffer->prev_buffer_offset= LSN_IMPOSSIBLE;
+  buffer->prev_sent_to_disk= LSN_IMPOSSIBLE;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  @brief close transaction log file by descriptor
+
+  Flushes and releases the pagecache blocks of the file, syncs the file
+  if it is not marked as synced, closes it and frees the descriptor.
+  The descriptor must not be used after the call.
+
+  @param file            pagecache file descriptor reference
+
+  @return Operation status
+    @retval 0  OK
+    @retval 1  Error (sync or close failed)
+*/
+
+static my_bool translog_close_log_file(TRANSLOG_FILE *file)
+{
+  int sync_res= 0;
+  int close_res;
+
+  flush_pagecache_blocks(log_descriptor.pagecache, &file->handler,
+                         FLUSH_RELEASE);
+  /*
+    Sync file when we close it
+    TODO: sync only we have changed the log
+  */
+  if (!file->is_sync)
+  {
+    sync_res= my_sync(file->handler.file, MYF(MY_WME));
+    translog_syncs++;
+  }
+  /* The file is closed and the descriptor freed even if the sync failed */
+  close_res= my_close(file->handler.file, MYF(MY_WME));
+  my_free(file, MYF(0));
+  return test(sync_res | close_res);
+}
+
+
+/**
+  @brief Dummy write-failure callback (the log does not use pagecache
+  writing, so there is nothing to do on such a failure)
+
+  @param data            ignored
+*/
+
+void translog_dummy_write_failure(uchar *data __attribute__((unused)))
+{
+  /* Intentionally empty */
+}
+
+
+/**
+  @brief Initializes TRANSLOG_FILE structure
+
+  Sets up the pagecache callbacks of the file handler and fills the
+  bookkeeping fields. The file descriptor itself (handler.file) is not
+  assigned here.
+
+  @param file            reference on the file to initialize
+  @param number          file number
+  @param is_sync         is file synced on disk
+*/
+
+static void translog_file_init(TRANSLOG_FILE *file, uint32 number,
+                               my_bool is_sync)
+{
+  pagecache_file_init(file->handler,
+                      &translog_page_validator,
+                      &translog_dummy_callback,
+                      &translog_dummy_write_failure,
+                      maria_flush_log_for_page_none,
+                      file);
+  file->is_sync= is_sync;
+  file->was_recovered= 0;
+  file->number= number;
+}
+
+
+/**
+  @brief Create and fill header of new file.
+
+  Finishes the previous file by writing log_descriptor.max_lsn into its
+  header, creates the new file, registers it as element 0 of
+  log_descriptor.open_files, writes the new file header and stores the
+  new file number in the control file.
+
+  @note the caller must call it right after it has increased
+   log_descriptor.horizon to the new file
+   (log_descriptor.horizon+= LSN_ONE_FILE)
+
+  @note The TRANSLOG_FILE descriptor allocated here is owned by
+   log_descriptor.open_files once it has been inserted there; on the
+   error paths taken before the insertion it is freed here.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_create_new_file()
+{
+  TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
+                                                 MYF(0));
+
+  TRANSLOG_FILE *old= get_current_logfile();
+  uint32 file_no= LSN_FILE_NO(log_descriptor.horizon);
+  DBUG_ENTER("translog_create_new_file");
+
+  if (file == NULL)
+    goto error;
+
+  /*
+    Writes max_lsn to the file header before finishing it (there is no need
+    to lock file header buffer because it is still unfinished file, so only
+    one thread can finish the file and nobody interested of LSN of current
+    (unfinished) file, because no one can purge it).
+  */
+  if (translog_max_lsn_to_header(old->handler.file, log_descriptor.max_lsn))
+    goto error;
+
+  rw_wrlock(&log_descriptor.open_files_lock);
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  DBUG_ASSERT(file_no == log_descriptor.max_file + 1);
+  /* Reserve room for one more element so the insert below can't fail */
+  if (allocate_dynamic(&log_descriptor.open_files,
+                       log_descriptor.max_file - log_descriptor.min_file + 2))
+    goto error_lock;
+  if ((file->handler.file=
+       create_logfile_by_number_no_cache(file_no)) == -1)
+    goto error_lock;
+  translog_file_init(file, file_no, 0);
+
+  /* this call just expand the array */
+  insert_dynamic(&log_descriptor.open_files, (uchar*)&file);
+  log_descriptor.max_file++;
+  {
+    /* Move all old elements one slot up to free element 0 */
+    char *start= (char*) dynamic_element(&log_descriptor.open_files, 0,
+                                         TRANSLOG_FILE**);
+    memmove(start + sizeof(TRANSLOG_FILE*), start,
+            sizeof(TRANSLOG_FILE*) *
+            (log_descriptor.max_file - log_descriptor.min_file + 1 - 1));
+  }
+  /* can't fail we because we expanded array */
+  set_dynamic(&log_descriptor.open_files, (uchar*)&file, 0);
+  DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+              log_descriptor.open_files.elements);
+  rw_unlock(&log_descriptor.open_files_lock);
+
+  DBUG_PRINT("info", ("file_no: %lu", (ulong)file_no));
+
+  /* From here on the descriptor is owned by open_files: don't free it */
+  if (translog_write_file_header())
+    DBUG_RETURN(1);
+
+  if (ma_control_file_write_and_force(last_checkpoint_lsn, file_no,
+                                      max_trid_in_control_file,
+                                      recovery_failures))
+  {
+    translog_stop_writing();
+    DBUG_RETURN(1);
+  }
+
+  DBUG_RETURN(0);
+
+error_lock:
+  rw_unlock(&log_descriptor.open_files_lock);
+error:
+  translog_stop_writing();
+  /* The descriptor was not registered anywhere, release it */
+  if (file != NULL)
+    my_free(file, MYF(0));
+  DBUG_RETURN(1);
+}
+
+
+/**
+  @brief Locks the loghandler buffer.
+
+  Takes the mutex of the buffer; returns nothing.
+
+  @param buffer          This buffer which should be locked
+
+  @note See comment before buffer 'mutex' variable.
+*/
+
+static void translog_buffer_lock(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_lock");
+  DBUG_PRINT("enter", ("Lock buffer #%u: (0x%lx)",
+                       (uint) buffer->buffer_no, (ulong) buffer));
+
+  pthread_mutex_lock(&buffer->mutex);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unlock the loghandler buffer
+
+  SYNOPSIS
+    translog_buffer_unlock()
+    buffer               This buffer which should be unlocked
+
+  NOTE
+    Releases the mutex of the buffer; returns nothing.
+*/
+
+static void translog_buffer_unlock(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_unlock");
+  DBUG_PRINT("enter",
+             ("Unlock buffer... #%u (0x%lx)",
+              (uint) buffer->buffer_no, (ulong) buffer));
+  pthread_mutex_unlock(&buffer->mutex);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Write a header on the page
+
+  SYNOPSIS
+    translog_new_page_header()
+    horizon              Address of the page start; on return it is
+                         increased by the length of the written header
+    cursor               Cursor whose 'ptr' points to the place in the
+                         buffer where the page starts; on return 'ptr'
+                         points right after the header
+
+  DESCRIPTION
+    The header consists of: page number (3 bytes), file number (3 bytes),
+    flags (1 byte), then, depending on log_descriptor.flags, room for the
+    CRC (CRC_SIZE bytes) and the sector protection table
+    (TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE bytes).
+
+  NOTE
+    - space for page header should be checked before
+*/
+
+/* Source of the value put into the first byte of the protection table */
+static uchar translog_sector_random;
+
+static void translog_new_page_header(TRANSLOG_ADDRESS *horizon,
+                                     struct st_buffer_cursor *cursor)
+{
+  uchar *ptr;
+
+  DBUG_ENTER("translog_new_page_header");
+  DBUG_ASSERT(cursor->ptr);
+
+  /* A new page is started, so it is not finished/protected yet */
+  cursor->protected= 0;
+
+  ptr= cursor->ptr;
+  /* Page number */
+  int3store(ptr, LSN_OFFSET(*horizon) / TRANSLOG_PAGE_SIZE);
+  ptr+= 3;
+  /* File number */
+  int3store(ptr, LSN_FILE_NO(*horizon));
+  ptr+= 3;
+  /* Flags byte: a copy of the log handler flags */
+  DBUG_ASSERT(TRANSLOG_PAGE_FLAGS == (ptr - cursor->ptr));
+  cursor->ptr[TRANSLOG_PAGE_FLAGS]= (uchar) log_descriptor.flags;
+  ptr++;
+  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+  {
+#ifndef DBUG_OFF
+    DBUG_PRINT("info", ("write  0x11223344 CRC to (%lu,0x%lx)",
+                        LSN_IN_PARTS(*horizon)));
+    /* This will be overwritten by real CRC; This is just for debugging */
+    int4store(ptr, 0x11223344);
+#endif
+    /* CRC will be put when page is finished */
+    ptr+= CRC_SIZE;
+  }
+  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+  {
+    /*
+      translog_sector_random works like a "random" value producer: such
+      "randomness" is enough for this purpose and it does not interfere
+      with the higher level pseudo random value generator.
+      Only the first byte of the table is set here; the whole table is
+      skipped.
+    */
+    ptr[0]= translog_sector_random++;
+    ptr+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+  }
+  {
+    /* Account for the header in the horizon, page fill and buffer size */
+    uint len= (ptr - cursor->ptr);
+    (*horizon)+= len; /* increasing the offset part of the address */
+    cursor->current_page_fill= len;
+    /* The buffer size is not changed on behalf of a chaser cursor */
+    if (!cursor->chaser)
+      cursor->buffer->size+= len;
+  }
+  cursor->ptr= ptr;
+  DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx  chaser: %d  Size: %lu (%lu)  "
+                      "Horizon: (%lu,0x%lx)",
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer),
+                      LSN_IN_PARTS(*horizon)));
+  translog_check_cursor(cursor);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Put sector protection on the page image
+
+  SYNOPSIS
+    translog_put_sector_protection()
+    page                 reference on the page content
+    cursor               cursor of the buffer
+
+  DESCRIPTION
+    For every protected sector the first byte of the sector is saved in
+    the protection table of the page header and replaced in the page by
+    the protection value (first byte of the table + write counter of the
+    cursor).
+
+  NOTES
+    We put a sector protection on all following sectors on the page,
+    except the first sector that is protected by page header.
+*/
+
+static void translog_put_sector_protection(uchar *page,
+                                           struct st_buffer_cursor *cursor)
+{
+  /*
+    The table occupies the TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE
+    bytes which end at offset log_descriptor.page_overhead of the page
+  */
+  uchar *table= page + log_descriptor.page_overhead -
+    TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+  uint i, offset;
+  /* Sector which holds the byte just before previous_offset */
+  uint16 last_protected_sector= ((cursor->previous_offset - 1) /
+                                 DISK_DRIVE_SECTOR_SIZE);
+  /* Sector which holds previous_offset itself */
+  uint16 start_sector= cursor->previous_offset / DISK_DRIVE_SECTOR_SIZE;
+  /* uint8 arithmetic: the value wraps around at 256 */
+  uint8 value= table[0] + cursor->write_counter;
+  DBUG_ENTER("translog_put_sector_protection");
+
+  if (start_sector == 0)
+  {
+    /* First sector is protected by file & page numbers in the page header. */
+    start_sector= 1;
+  }
+
+  DBUG_PRINT("enter", ("Write counter:%u  value:%u  offset:%u, "
+                       "last protected:%u  start sector:%u",
+                       (uint) cursor->write_counter,
+                       (uint) value,
+                       (uint) cursor->previous_offset,
+                       (uint) last_protected_sector, (uint) start_sector));
+  if (last_protected_sector == start_sector)
+  {
+    /*
+      previous_offset is inside the start sector (not on its boundary):
+      put the saved byte back from the table before the loop below saves
+      and replaces the first byte of this sector again
+    */
+    i= last_protected_sector;
+    offset= last_protected_sector * DISK_DRIVE_SECTOR_SIZE;
+    /* restore data, because we modified sector which was protected */
+    if (offset < cursor->previous_offset)
+      page[offset]= table[i];
+  }
+  /* Save the first byte of each sector and put the protection value */
+  for (i= start_sector, offset= start_sector * DISK_DRIVE_SECTOR_SIZE;
+       i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+       i++, (offset+= DISK_DRIVE_SECTOR_SIZE))
+  {
+    DBUG_PRINT("info", ("sector:%u  offset:%u  data 0x%x",
+                        i, offset, (uint) page[offset]));
+    table[i]= page[offset];
+    page[offset]= value;
+    DBUG_PRINT("info", ("sector:%u  offset:%u  data 0x%x",
+                        i, offset, (uint) page[offset]));
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Calculate CRC32 of given area
+
+  SYNOPSIS
+    translog_crc()
+    area                 Pointer to the beginning of the area
+    length               Length of the area in bytes
+
+  RETURN
+    CRC32 of the area, calculated by crc32() with initial value 0
+*/
+
+static uint32 translog_crc(uchar *area, uint length)
+{
+  uint32 checksum;
+  DBUG_ENTER("translog_crc");
+  checksum= crc32(0L, (unsigned char*) area, length);
+  DBUG_RETURN(checksum);
+}
+
+
+/*
+  Finish current page: pad it with filler bytes and protect it
+
+  SYNOPSIS
+    translog_finish_page()
+    horizon              \ horizon & buffer pointers
+    cursor               /
+
+  DESCRIPTION
+    Fills the unused rest of the page with TRANSLOG_FILLER, advances the
+    horizon and the cursor to the end of the page and then, depending on
+    log_descriptor.flags, puts sector protection and CRC on the page.
+    Does nothing if the cursor is already marked as protected.
+*/
+
+static void translog_finish_page(TRANSLOG_ADDRESS *horizon,
+                                 struct st_buffer_cursor *cursor)
+{
+  /* Number of unused bytes at the end of the page */
+  uint16 left= TRANSLOG_PAGE_SIZE - cursor->current_page_fill;
+  /* Beginning of the current page in the buffer */
+  uchar *page= cursor->ptr - cursor->current_page_fill;
+  DBUG_ENTER("translog_finish_page");
+  DBUG_PRINT("enter", ("Buffer: #%u 0x%lx  "
+                       "Buffer addr: (%lu,0x%lx)  "
+                       "Page addr: (%lu,0x%lx)  "
+                       "size:%lu (%lu)  Pg:%u  left:%u",
+                       (uint) cursor->buffer_no, (ulong) cursor->buffer,
+                       LSN_IN_PARTS(cursor->buffer->offset),
+                       (ulong) LSN_FILE_NO(*horizon),
+                       (ulong) (LSN_OFFSET(*horizon) -
+                                cursor->current_page_fill),
+                       (ulong) cursor->buffer->size,
+                       (ulong) (cursor->ptr -cursor->buffer->buffer),
+                       (uint) cursor->current_page_fill, (uint) left));
+  DBUG_ASSERT(LSN_FILE_NO(*horizon) == LSN_FILE_NO(cursor->buffer->offset));
+  translog_check_cursor(cursor);
+  if (cursor->protected)
+  {
+    DBUG_PRINT("info", ("Already protected and finished"));
+    DBUG_VOID_RETURN;
+  }
+  cursor->protected= 1;
+
+  DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+  if (left != 0)
+  {
+    DBUG_PRINT("info", ("left: %u", (uint) left));
+    memset(cursor->ptr, TRANSLOG_FILLER, left);
+    cursor->ptr+= left;
+    (*horizon)+= left; /* offset increasing */
+    /* The buffer size is not changed on behalf of a chaser cursor */
+    if (!cursor->chaser)
+      cursor->buffer->size+= left;
+    /* We are finishing the page so reset the counter */
+    cursor->current_page_fill= 0;
+    DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx  "
+                        "chaser: %d  Size: %lu (%lu)",
+                        (uint) cursor->buffer->buffer_no,
+                        (ulong) cursor->buffer, cursor->chaser,
+                        (ulong) cursor->buffer->size,
+                        (ulong) (cursor->ptr - cursor->buffer->buffer)));
+    translog_check_cursor(cursor);
+  }
+  /*
+    When we are finishing the page other thread might not finish the page
+    header yet (in case if we started from the middle of the page) so we
+    have to read log_descriptor.flags but not the flags from the page.
+  */
+  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+  {
+    translog_put_sector_protection(page, cursor);
+    DBUG_PRINT("info", ("drop write_counter"));
+    cursor->write_counter= 0;
+    cursor->previous_offset= 0;
+  }
+  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+  {
+    /* The CRC covers the page content which follows the page header */
+    uint32 crc= translog_crc(page + log_descriptor.page_overhead,
+                             TRANSLOG_PAGE_SIZE -
+                             log_descriptor.page_overhead);
+    DBUG_PRINT("info", ("CRC: %lx", (ulong) crc));
+    /* We have page number, file number and flag before crc */
+    int4store(page + 3 + 3 + 1, crc);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Wait until all threads have finished closing this buffer.
+
+  Sleeps on the 'waiting_filling_buffer' condition of the buffer for as
+  long as its 'is_closing_buffer' flag is set.
+
+  @param buffer          The buffer to wait for; its mutex must be held
+                         by the caller
+*/
+
+static void translog_wait_for_closing(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_wait_for_closing");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx  copies in progress: %u  "
+                       "is closing %u  File: %d  size: %lu",
+                       (uint) buffer->buffer_no, (ulong) buffer,
+                       (uint) buffer->copy_to_buffer_in_progress,
+                       (uint) buffer->is_closing_buffer,
+                       (buffer->file ? buffer->file->handler.file : -1),
+                       (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  for (;;)
+  {
+    if (!buffer->is_closing_buffer)
+      break;
+    DBUG_PRINT("info", ("wait for writers... buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+    DBUG_ASSERT(buffer->file != NULL);
+    /* The flag is re-checked after every wake-up */
+    pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+    DBUG_PRINT("info", ("wait for writers done buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Wait until all threads have finished filling this buffer.
+
+  Sleeps on the 'waiting_filling_buffer' condition of the buffer for as
+  long as its 'copy_to_buffer_in_progress' counter is not zero.
+
+  @param buffer          The buffer to wait for; its mutex must be held
+                         by the caller
+*/
+
+static void translog_wait_for_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_wait_for_writers");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx  copies in progress: %u  "
+                       "is closing %u  File: %d  size: %lu",
+                       (uint) buffer->buffer_no, (ulong) buffer,
+                       (uint) buffer->copy_to_buffer_in_progress,
+                       (uint) buffer->is_closing_buffer,
+                       (buffer->file ? buffer->file->handler.file : -1),
+                       (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  for (;;)
+  {
+    if (!buffer->copy_to_buffer_in_progress)
+      break;
+    DBUG_PRINT("info", ("wait for writers... buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+    DBUG_ASSERT(buffer->file != NULL);
+    /* The counter is re-checked after every wake-up */
+    pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+    DBUG_PRINT("info", ("wait for writers done buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Wait for buffer to become free
+
+  SYNOPSIS
+    translog_wait_for_buffer_free()
+    buffer               The buffer we are waiting for
+
+  DESCRIPTION
+    First waits until nobody copies data to the buffer. If the buffer
+    got another offset, file or version in the meantime it was already
+    freed (and possibly reused), so there is nothing more to wait for.
+    Otherwise waits until the buffer has no file assigned.
+
+  NOTE
+    - this buffer should be locked
+*/
+
+static void translog_wait_for_buffer_free(struct st_translog_buffer *buffer)
+{
+  TRANSLOG_ADDRESS seen_offset= buffer->offset;
+  TRANSLOG_FILE *seen_file= buffer->file;
+  uint8 seen_ver= buffer->ver;
+  DBUG_ENTER("translog_wait_for_buffer_free");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx  copies in progress: %u  "
+                       "is closing %u  File: %d  size: %lu",
+                       (uint) buffer->buffer_no, (ulong) buffer,
+                       (uint) buffer->copy_to_buffer_in_progress,
+                       (uint) buffer->is_closing_buffer,
+                       (buffer->file ? buffer->file->handler.file : -1),
+                       (ulong) buffer->size));
+
+  translog_wait_for_writers(buffer);
+
+  /* The identity of the buffer changed while we waited: already freed */
+  if (!(seen_offset == buffer->offset &&
+        seen_file == buffer->file &&
+        seen_ver == buffer->ver))
+    DBUG_VOID_RETURN;
+
+  for (;;)
+  {
+    if (buffer->file == NULL)
+      break;
+    DBUG_PRINT("info", ("wait for writers... buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+    pthread_cond_wait(&buffer->waiting_filling_buffer, &buffer->mutex);
+    DBUG_PRINT("info", ("wait for writers done. buffer: #%u  0x%lx",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+  DBUG_ASSERT(buffer->copy_to_buffer_in_progress == 0);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Initialize the cursor for a buffer
+
+  SYNOPSIS
+    translog_cursor_init()
+    cursor               The cursor to initialize
+    buffer               The buffer the cursor is put on (at its start)
+    buffer_no            Number of buffer
+
+  NOTE
+    Every cursor except log_descriptor.bc is marked as a "chaser".
+*/
+
+static void translog_cursor_init(struct st_buffer_cursor *cursor,
+                                 struct st_translog_buffer *buffer,
+                                 uint8 buffer_no)
+{
+  DBUG_ENTER("translog_cursor_init");
+  /* Bind the cursor to the beginning of the buffer */
+  cursor->buffer= buffer;
+  cursor->buffer_no= buffer_no;
+  cursor->ptr= buffer->buffer;
+  /* Nothing is written on the current page yet */
+  cursor->current_page_fill= 0;
+  cursor->previous_offset= 0;
+  cursor->write_counter= 0;
+  cursor->protected= 0;
+  cursor->chaser= (cursor != &log_descriptor.bc);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Initialize buffer for the current file, and a cursor for this buffer.
+
+  The buffer starts at the current log_descriptor.horizon and is bound
+  to the current log file; its bit is set in
+  log_descriptor.dirty_buffer_mask.
+
+  @param buffer          The buffer
+  @param cursor          It's cursor
+  @param buffer_no       Number of buffer
+*/
+
+static void translog_start_buffer(struct st_translog_buffer *buffer,
+                                  struct st_buffer_cursor *cursor,
+                                  uint buffer_no)
+{
+  DBUG_ENTER("translog_start_buffer");
+  DBUG_PRINT("enter",
+             ("Assign buffer: #%u (0x%lx) offset: 0x%lx(%lu)",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              (ulong) LSN_OFFSET(log_descriptor.horizon),
+              (ulong) LSN_OFFSET(log_descriptor.horizon)));
+  DBUG_ASSERT(buffer_no == buffer->buffer_no);
+
+  /* No LSN has been stored in the buffer yet */
+  buffer->last_lsn= LSN_IMPOSSIBLE;
+  buffer->prev_last_lsn= LSN_IMPOSSIBLE;
+  buffer->pre_force_close_horizon= LSN_IMPOSSIBLE;
+  DBUG_PRINT("info", ("last_lsn and prev_last_lsn set to 0  buffer: 0x%lx",
+                      (ulong) buffer));
+
+  /* The buffer is empty and starts at the current horizon */
+  buffer->size= 0;
+  buffer->skipped_data= 0;
+  buffer->overlay= 0;
+  buffer->next_buffer_offset= LSN_IMPOSSIBLE;
+  buffer->offset= log_descriptor.horizon;
+  buffer->file= get_current_logfile();
+
+  translog_cursor_init(cursor, buffer, buffer_no);
+  DBUG_PRINT("info", ("file: #%ld (%d)  init cursor #%u: 0x%lx  "
+                      "chaser: %d  Size: %lu (%lu)",
+                      (long) (buffer->file ? buffer->file->number : 0),
+                      (buffer->file ? buffer->file->handler.file : -1),
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer)));
+  translog_check_cursor(cursor);
+
+  /* Set the bit of this buffer in the mask of dirty buffers */
+  pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock);
+  log_descriptor.dirty_buffer_mask|= (1 << buffer->buffer_no);
+  pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Switch to the next buffer in a chain.
+
+  The next buffer is the one following cursor->buffer_no, wrapping around at
+  TRANSLOG_BUFFERS_NO. For a non-chasing cursor the new buffer is locked,
+  waited on until it is free and then started with translog_start_buffer();
+  for a chasing cursor only the cursor is re-pointed with
+  translog_cursor_init().
+
+  @param horizon         \ Pointers on current position in file and buffer
+  @param cursor          /
+  @param new_file        Also start new file
+
+  @note
+   - loghandler should be locked
+   - after return new and old buffer still are locked
+   - NOTE(review): on the error return the new buffer has already been
+     locked by this function (non-chasing case); confirm callers expect it
+
+  @retval 0 OK
+  @retval 1 Error (translog_create_new_file() failed)
+*/
+
+static my_bool translog_buffer_next(TRANSLOG_ADDRESS *horizon,
+                                    struct st_buffer_cursor *cursor,
+                                    my_bool new_file)
+{
+  uint old_buffer_no= cursor->buffer_no;
+  /* buffers are used cyclically */
+  uint new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+  struct st_translog_buffer *new_buffer= log_descriptor.buffers + new_buffer_no;
+  /* remembered here because the cursor is re-initialized below */
+  my_bool chasing= cursor->chaser;
+  DBUG_ENTER("translog_buffer_next");
+
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)  chasing: %d",
+                      LSN_IN_PARTS(log_descriptor.horizon), chasing));
+
+  /* the given horizon must not be ahead of the global one */
+  DBUG_ASSERT(cmp_translog_addr(log_descriptor.horizon, *horizon) >= 0);
+
+  translog_finish_page(horizon, cursor);
+
+  if (!chasing)
+  {
+    translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+    {
+      /*
+        Debug builds only: snapshot the buffer identity so that the state
+        after the wait can be verified by the assertion below
+      */
+      TRANSLOG_ADDRESS offset= new_buffer->offset;
+      TRANSLOG_FILE *file= new_buffer->file;
+      uint8 ver= new_buffer->ver;
+      translog_lock_assert_owner();
+#endif
+      translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+      /*
+        We keep the handler locked so nobody can start this new buffer.
+        The buffer must now be free (file == NULL); its version is unchanged
+        if it was already free before the wait and one higher otherwise.
+      */
+      DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+                  (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+    }
+#endif
+  }
+  else
+    DBUG_ASSERT(new_buffer->file != NULL);
+
+  if (new_file)
+  {
+    /* move the horizon to the next file and its header page */
+    (*horizon)+= LSN_ONE_FILE;
+    (*horizon)= LSN_REPLACE_OFFSET(*horizon, TRANSLOG_PAGE_SIZE);
+    /* only the non-chasing cursor creates the file */
+    if (!chasing && translog_create_new_file())
+    {
+      DBUG_RETURN(1);
+    }
+  }
+
+  /* prepare next page */
+  if (chasing)
+    translog_cursor_init(cursor, new_buffer, new_buffer_no);
+  else
+  {
+    translog_lock_assert_owner();
+    translog_start_buffer(new_buffer, cursor, new_buffer_no);
+    /* link the new buffer back to the one we are leaving */
+    new_buffer->prev_buffer_offset=
+      log_descriptor.buffers[old_buffer_no].offset;
+    new_buffer->prev_last_lsn=
+      BUFFER_MAX_LSN(log_descriptor.buffers + old_buffer_no);
+  }
+  /* and link the old buffer forward to the new one */
+  log_descriptor.buffers[old_buffer_no].next_buffer_offset= new_buffer->offset;
+  DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
+                      (ulong) new_buffer));
+  translog_new_page_header(horizon, cursor);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Sets max LSN sent to file, and address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_set_sent_to_disk()
+    buffer               buffer which we have sent to disk
+
+  DESCRIPTION
+    Under log_descriptor.sent_to_disk_lock, stores buffer->last_lsn in
+    log_descriptor.sent_to_disk and advances log_descriptor.in_buffers_only
+    to buffer->next_buffer_offset if that address is greater than the
+    current value.
+
+  TODO: use atomic operations if possible (64bit architectures?)
+*/
+
+static void translog_set_sent_to_disk(struct st_translog_buffer *buffer)
+{
+  LSN lsn= buffer->last_lsn;
+  TRANSLOG_ADDRESS in_buffers= buffer->next_buffer_offset;
+
+  DBUG_ENTER("translog_set_sent_to_disk");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  /* Arguments are listed in the same order as the labels in the format */
+  DBUG_PRINT("enter", ("lsn: (%lu,0x%lx) in_buffers: (%lu,0x%lx)  "
+                       "in_buffers_only: (%lu,0x%lx)  start: (%lu,0x%lx)  "
+                       "sent_to_disk: (%lu,0x%lx)",
+                       LSN_IN_PARTS(lsn),
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only),
+                       LSN_IN_PARTS(log_descriptor.log_start),
+                       LSN_IN_PARTS(log_descriptor.sent_to_disk)));
+  /*
+    We write sequentially (first part of following assert) but we rewrite
+    the same page in case we started mysql and shut it down immediately
+    (second part of the following assert)
+  */
+  DBUG_ASSERT(cmp_translog_addr(lsn, log_descriptor.sent_to_disk) >= 0 ||
+              cmp_translog_addr(lsn, log_descriptor.log_start) < 0);
+  log_descriptor.sent_to_disk= lsn;
+  /* LSN_IMPOSSIBLE == 0 => it will work for very first time */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0)
+  {
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Sets address from which data is only in the buffer
+
+  SYNOPSIS
+    translog_set_only_in_buffers()
+    in_buffers           to assign to in_buffers_only
+
+  DESCRIPTION
+    Under log_descriptor.sent_to_disk_lock, advances
+    log_descriptor.in_buffers_only to in_buffers when in_buffers is greater
+    than the current value and translog_status is TRANSLOG_OK. In every
+    other case the stored value is left untouched.
+*/
+
+static void translog_set_only_in_buffers(TRANSLOG_ADDRESS in_buffers)
+{
+  DBUG_ENTER("translog_set_only_in_buffers");
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  DBUG_PRINT("enter", ("in_buffers: (%lu,0x%lx)  "
+                       "in_buffers_only: (%lu,0x%lx)",
+                       LSN_IN_PARTS(in_buffers),
+                       LSN_IN_PARTS(log_descriptor.in_buffers_only)));
+  /*
+    LSN_IMPOSSIBLE == 0 => it will work for very first time.
+    The status test is part of the condition (instead of an early return)
+    so that the mutex taken above is released on every path.
+  */
+  if (cmp_translog_addr(in_buffers, log_descriptor.in_buffers_only) > 0 &&
+      translog_status == TRANSLOG_OK)
+  {
+    log_descriptor.in_buffers_only= in_buffers;
+    DBUG_PRINT("info", ("set new in_buffers_only"));
+  }
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Read the address from which data is only in the buffers
+
+  SYNOPSIS
+    translog_only_in_buffers()
+
+  DESCRIPTION
+    Takes a copy of log_descriptor.in_buffers_only while holding
+    log_descriptor.sent_to_disk_lock.
+
+  RETURN
+    address from which data is only in the buffer
+*/
+
+static TRANSLOG_ADDRESS translog_only_in_buffers()
+{
+  TRANSLOG_ADDRESS snapshot;
+  DBUG_ENTER("translog_only_in_buffers");
+
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  snapshot= log_descriptor.in_buffers_only;
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+
+  DBUG_RETURN(snapshot);
+}
+
+
+/*
+  Read the max LSN sent to file
+
+  SYNOPSIS
+    translog_get_sent_to_disk()
+
+  DESCRIPTION
+    Takes a copy of log_descriptor.sent_to_disk while holding
+    log_descriptor.sent_to_disk_lock.
+
+  RETURN
+    max LSN sent to file
+*/
+
+static LSN translog_get_sent_to_disk()
+{
+  LSN snapshot;
+  DBUG_ENTER("translog_get_sent_to_disk");
+
+  pthread_mutex_lock(&log_descriptor.sent_to_disk_lock);
+  snapshot= log_descriptor.sent_to_disk;
+  DBUG_PRINT("info", ("sent to disk up to (%lu,0x%lx)",
+                      LSN_IN_PARTS(snapshot)));
+  pthread_mutex_unlock(&log_descriptor.sent_to_disk_lock);
+
+  DBUG_RETURN(snapshot);
+}
+
+
+/*
+  Get first chunk address on the given page
+
+  SYNOPSIS
+    translog_get_first_chunk_offset()
+    page                 The page where to find first chunk
+
+  DESCRIPTION
+    The offset is the page_overhead[] entry selected by the flags byte of
+    the page.
+
+  NOTE(review)
+    The function is declared as returning my_bool although the value is an
+    offset; confirm that every page_overhead[] entry fits in my_bool.
+
+  RETURN
+    first chunk offset
+*/
+
+static my_bool translog_get_first_chunk_offset(uchar *page)
+{
+  uchar page_flags= page[TRANSLOG_PAGE_FLAGS];
+  DBUG_ENTER("translog_get_first_chunk_offset");
+  DBUG_ASSERT(page_flags < TRANSLOG_FLAGS_NUM);
+  DBUG_RETURN(page_overhead[page_flags]);
+}
+
+
+/*
+  Write coded length of record
+
+  SYNOPSIS
+    translog_write_variable_record_1group_code_len
+    dst                  Destination buffer pointer
+    length               Length which should be coded
+    header_len           Calculated total header length
+
+  DESCRIPTION
+    header_len selects the encoding:
+       6  one byte holding the length itself (asserted <= 250)
+       8  marker byte 251 followed by a 2 byte length
+       9  marker byte 252 followed by a 3 byte length
+      10  marker byte 253 followed by a 4 byte length
+    Any other header_len triggers an assertion and writes nothing.
+*/
+
+static void
+translog_write_variable_record_1group_code_len(uchar *dst,
+                                               translog_size_t length,
+                                               uint16 header_len)
+{
+  switch (header_len) {
+  case 6:                                      /* (5 + 1) */
+    DBUG_ASSERT(length <= 250);
+    dst[0]= (uint8) length;
+    break;
+  case 8:                                      /* (5 + 3) */
+    DBUG_ASSERT(length <= 0xFFFF);
+    dst[0]= 251;
+    int2store(dst + 1, length);
+    break;
+  case 9:                                      /* (5 + 4) */
+    DBUG_ASSERT(length <= (ulong) 0xFFFFFF);
+    dst[0]= 252;
+    int3store(dst + 1, length);
+    break;
+  case 10:                                     /* (5 + 5) */
+    dst[0]= 253;
+    int4store(dst + 1, length);
+    break;
+  default:
+    DBUG_ASSERT(0);
+    break;
+  }
+}
+
+
+/*
+  Decode record data length and advance given pointer to the next field
+
+  SYNOPSIS
+    translog_variable_record_1group_decode_len()
+    src                  The pointer to the pointer to the length beginning
+
+  DESCRIPTION
+    The first byte selects the encoding: 251, 252 and 253 are followed by a
+    2, 3 or 4 byte length respectively; 254 and 255 are reserved; any other
+    value is the length itself. *src is moved past the bytes consumed
+    (it is left untouched for the reserved values).
+
+  RETURN
+    decoded length
+*/
+
+static translog_size_t translog_variable_record_1group_decode_len(uchar **src)
+{
+  uchar *field= *src;
+  uint8 marker= (uint8) field[0];
+
+  switch (marker) {
+  case 251:
+    *src= field + 3;
+    return (uint2korr(field + 1));
+  case 252:
+    *src= field + 4;
+    return (uint3korr(field + 1));
+  case 253:
+    *src= field + 5;
+    return (uint4korr(field + 1));
+  case 254:
+  case 255:
+    DBUG_ASSERT(0);                             /* reserved for future use */
+    return (0);
+  default:
+    *src= field + 1;
+    return (marker);
+  }
+}
+
+
+/*
+  Get total length of this chunk (not only body)
+
+  SYNOPSIS
+    translog_get_total_chunk_length()
+    page                 The page where chunk placed
+    offset               Offset of the chunk on this place
+
+  DESCRIPTION
+    The chunk type is taken from the first byte of the chunk and decides
+    how the length is found: decoded from the chunk header, looked up in
+    log_record_type_descriptor[], or "till the end of the page".
+
+  RETURN
+    total length of the chunk
+    0 for an unknown chunk type (after an assertion)
+*/
+
+static uint16 translog_get_total_chunk_length(uchar *page, uint16 offset)
+{
+  DBUG_ENTER("translog_get_total_chunk_length");
+  switch (page[offset] & TRANSLOG_CHUNK_TYPE) {
+  case TRANSLOG_CHUNK_LSN:
+  {
+    /* 0 chunk referred as LSN (head or tail) */
+    uchar *chunk_start= page + offset;
+    uchar *pos= chunk_start + 1 + 2; /* skip chunk type and short trid */
+    translog_size_t record_length;
+    uint16 chunk_length, header_length, rest_of_page;
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+    record_length= translog_variable_record_1group_decode_len(&pos);
+    chunk_length= uint2korr(pos);
+    /* the header ends after the 2 byte chunk length field */
+    header_length= (uint16) (pos - chunk_start) + 2;
+    DBUG_PRINT("info", ("rec len: %lu  chunk len: %u  header len: %u",
+                        (ulong) record_length, (uint) chunk_length,
+                        (uint) header_length));
+    if (chunk_length != 0)
+    {
+      DBUG_PRINT("info", ("chunk len: %u + %u = %u",
+                          (uint) header_length, (uint) chunk_length,
+                          (uint) (chunk_length + header_length)));
+      DBUG_RETURN(chunk_length + header_length);
+    }
+    /* no explicit chunk length: the chunk is limited by the page end */
+    rest_of_page= TRANSLOG_PAGE_SIZE - offset;
+    DBUG_PRINT("info", ("page_rest %u", (uint) rest_of_page));
+    if (record_length + header_length >= rest_of_page)
+      DBUG_RETURN(rest_of_page);
+    DBUG_RETURN(record_length + header_length);
+  }
+  case TRANSLOG_CHUNK_FIXED:
+  {
+    uint type= page[offset] & TRANSLOG_REC_TYPE;
+    uchar *lsn_pos;
+    uint total;
+    int lsn_no;
+    /* 1 (pseudo)fixed record (also LSN) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED"));
+    DBUG_ASSERT(log_record_type_descriptor[type].rclass ==
+                LOGRECTYPE_FIXEDLENGTH ||
+                log_record_type_descriptor[type].rclass ==
+                LOGRECTYPE_PSEUDOFIXEDLENGTH);
+    if (log_record_type_descriptor[type].rclass == LOGRECTYPE_FIXEDLENGTH)
+    {
+      DBUG_PRINT("info",
+                 ("Fixed length: %u",
+                  (uint) (log_record_type_descriptor[type].fixed_length + 3)));
+      DBUG_RETURN(log_record_type_descriptor[type].fixed_length + 3);
+    }
+
+    /*
+      Pseudo-fixed record: start from the full length and correct it by
+      the number of bytes each stored LSN really occupies
+    */
+    total= log_record_type_descriptor[type].fixed_length + 3;
+    lsn_pos= page + offset + 3;        /* first compressed LSN */
+    for (lsn_no= 0;
+         lsn_no < log_record_type_descriptor[type].compressed_LSN;
+         lsn_no++)
+    {
+      /* first 2 bits is length - 2 */
+      uint stored= (((uint8) (*lsn_pos)) >> 6) + 2;
+      if (lsn_pos[0] == 0 && ((uint8) lsn_pos[1]) == 1)
+        stored+= LSN_STORE_SIZE; /* case of full LSN storing */
+      lsn_pos+= stored;
+      /* subtract saved bytes */
+      total-= (LSN_STORE_SIZE - stored);
+    }
+    DBUG_PRINT("info", ("Pseudo-fixed length: %u", total));
+    DBUG_RETURN(total);
+  }
+  case TRANSLOG_CHUNK_NOHDR:
+    /* 2 no header chunk (till page end) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR  length: %u",
+                        (uint) (TRANSLOG_PAGE_SIZE - offset)));
+    DBUG_RETURN(TRANSLOG_PAGE_SIZE - offset);
+  case TRANSLOG_CHUNK_LNGTH:                   /* 3 chunk with chunk length */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH"));
+    DBUG_ASSERT(TRANSLOG_PAGE_SIZE - offset >= 3);
+    DBUG_PRINT("info", ("length: %u", uint2korr(page + offset + 1) + 3));
+    DBUG_RETURN(uint2korr(page + offset + 1) + 3);
+  default:
+    DBUG_ASSERT(0);
+    DBUG_RETURN(0);
+  }
+}
+
+/*
+  @brief Waits until the flush of the previous buffer has finished
+
+  The caller must own the buffer mutex. While waiting on
+  prev_sent_to_disk_cond the buffer may get flushed and reused by another
+  thread; this is detected by comparing file, offset and version with the
+  values seen on entry.
+
+  @param buffer          buffer for check
+
+  @retval 0 previous buffer flushed and this thread has to flush this one
+            (the buffer is still locked)
+  @retval 1 previous buffer flushed and this buffer was flushed by another
+            thread too (the buffer has been unlocked by this function)
+*/
+
+my_bool translog_prev_buffer_flush_wait(struct st_translog_buffer *buffer)
+{
+  TRANSLOG_ADDRESS start_offset= buffer->offset;
+  TRANSLOG_FILE *start_file= buffer->file;
+  uint8 start_ver= buffer->ver;
+  DBUG_ENTER("translog_prev_buffer_flush_wait");
+  DBUG_PRINT("enter", ("buffer: 0x%lx  #%u  offset: (%lu,0x%lx)  "
+                       "prev sent: (%lu,0x%lx) prev offset: (%lu,0x%lx)",
+                       (ulong) buffer, (uint) buffer->buffer_no,
+                       LSN_IN_PARTS(buffer->offset),
+                       LSN_IN_PARTS(buffer->prev_sent_to_disk),
+                       LSN_IN_PARTS(buffer->prev_buffer_offset)));
+  translog_buffer_lock_assert_owner(buffer);
+  /*
+    A buffer that was never used has prev_sent_to_disk == LSN_IMPOSSIBLE,
+    and then prev_buffer_offset has to be LSN_IMPOSSIBLE as well
+  */
+  DBUG_ASSERT(buffer->prev_sent_to_disk != LSN_IMPOSSIBLE ||
+              buffer->prev_buffer_offset == LSN_IMPOSSIBLE);
+
+  while (buffer->prev_buffer_offset != buffer->prev_sent_to_disk)
+  {
+    pthread_cond_wait(&buffer->prev_sent_to_disk_cond, &buffer->mutex);
+    if (buffer->file != start_file || buffer->offset != start_offset ||
+        buffer->ver != start_ver)
+    {
+      /* another thread flushed the buffer already */
+      translog_buffer_unlock(buffer);
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Flush given buffer
+
+  SYNOPSIS
+    translog_buffer_flush()
+    buffer               This buffer should be flushed
+
+  DESCRIPTION
+    The caller must own the buffer mutex. The function waits for writers
+    and for a concurrent close of the buffer, re-checking after every wait
+    that the buffer is still the same one (file, offset, version). It then
+    sends the content page by page to the page cache, writes it to the file
+    with my_pwrite(), updates the "sent to disk" / "only in buffers"
+    addresses, informs the next buffer and finally marks this buffer free.
+    A buffer with no file attached (already free) is left alone.
+
+  RETURN
+    0  OK (also when there was nothing to do or another thread flushed
+       the buffer)
+    1  Error
+*/
+
+static my_bool translog_buffer_flush(struct st_translog_buffer *buffer)
+{
+  uint32 i, pg;
+  TRANSLOG_ADDRESS offset= buffer->offset;
+  TRANSLOG_FILE *file= buffer->file;
+  uint8 ver= buffer->ver;
+  uint skipped_data;
+  DBUG_ENTER("translog_buffer_flush");
+  /* buffer->file may be NULL here (checked below), so guard the access */
+  DBUG_PRINT("enter",
+             ("Buffer: #%u 0x%lx file: %d  offset: (%lu,0x%lx)  size: %lu",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              (buffer->file ? buffer->file->handler.file : -1),
+              LSN_IN_PARTS(buffer->offset),
+              (ulong) buffer->size));
+  translog_buffer_lock_assert_owner(buffer);
+
+  /* a free buffer has nothing to flush */
+  if (buffer->file == NULL)
+    DBUG_RETURN(0);
+
+  translog_wait_for_writers(buffer);
+
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some other thread flushed the buffer already */
+
+  if (buffer->is_closing_buffer)
+  {
+    /* some other flush in progress */
+    translog_wait_for_closing(buffer);
+  }
+
+  if (buffer->file != file || buffer->offset != offset || buffer->ver != ver)
+    DBUG_RETURN(0); /* some other thread flushed the buffer already */
+
+  if (buffer->overlay && translog_prev_buffer_flush_wait(buffer))
+    DBUG_RETURN(0); /* some other thread flushed the buffer already */
+
+  /*
+    Send page by page in the pagecache what we are going to write on the
+    disk
+  */
+  file= buffer->file;
+  skipped_data= buffer->skipped_data;
+  DBUG_ASSERT(skipped_data < TRANSLOG_PAGE_SIZE);
+  for (i= 0, pg= LSN_OFFSET(buffer->offset) / TRANSLOG_PAGE_SIZE;
+       i < buffer->size;
+       i+= TRANSLOG_PAGE_SIZE, pg++)
+  {
+    TRANSLOG_ADDRESS addr= (buffer->offset + i);
+    TRANSLOG_VALIDATOR_DATA data;
+    DBUG_PRINT("info", ("send log form %lu till %lu  address: (%lu,0x%lx)  "
+                        "page #: %lu  buffer size: %lu  buffer: 0x%lx",
+                        (ulong) i, (ulong) (i + TRANSLOG_PAGE_SIZE),
+                        LSN_IN_PARTS(addr), (ulong) pg, (ulong) buffer->size,
+                        (ulong) buffer));
+    data.addr= &addr;
+    DBUG_ASSERT(log_descriptor.pagecache->block_size == TRANSLOG_PAGE_SIZE);
+    DBUG_ASSERT(i + TRANSLOG_PAGE_SIZE <= buffer->size);
+    if (translog_status != TRANSLOG_OK && translog_status != TRANSLOG_SHUTDOWN)
+      DBUG_RETURN(1);
+    /* only the first page can have skipped data at its beginning */
+    if (pagecache_write_part(log_descriptor.pagecache,
+                        &file->handler, pg, 3,
+                        buffer->buffer + i,
+                        PAGECACHE_PLAIN_PAGE,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        PAGECACHE_PIN_LEFT_UNPINNED,
+                        PAGECACHE_WRITE_DONE, 0,
+                        LSN_IMPOSSIBLE,
+                        skipped_data,
+                        TRANSLOG_PAGE_SIZE - skipped_data))
+    {
+      DBUG_PRINT("error",
+                 ("Can't write page (%lu,0x%lx) to pagecache, error: %d",
+                  (ulong) buffer->file->number,
+                  (ulong) (LSN_OFFSET(buffer->offset)+ i),
+                  my_errno));
+      translog_stop_writing();
+      DBUG_RETURN(1);
+    }
+    skipped_data= 0;
+  }
+  file->is_sync= 0;
+  /* write everything after the skipped part with one call */
+  if (my_pwrite(file->handler.file, buffer->buffer + buffer->skipped_data,
+                buffer->size - buffer->skipped_data,
+                LSN_OFFSET(buffer->offset) + buffer->skipped_data,
+                log_write_flags))
+  {
+    DBUG_PRINT("error", ("Can't write buffer (%lu,0x%lx) size %lu "
+                         "to the disk (%d)",
+                         (ulong) file->handler.file,
+                         (ulong) LSN_OFFSET(buffer->offset),
+                         (ulong) buffer->size, errno));
+    translog_stop_writing();
+    DBUG_RETURN(1);
+  }
+  /*
+    Dropping the flag in such way can make false alarm: signalling than the
+    file in not sync when it is sync, but the situation is quite rare and
+    protections with mutexes give much more overhead to the whole engine
+  */
+  file->is_sync= 0;
+
+  if (LSN_OFFSET(buffer->last_lsn) != 0)    /* if buffer->last_lsn is set */
+  {
+    if (translog_prev_buffer_flush_wait(buffer))
+      DBUG_RETURN(0); /* some other thread flushed the buffer already */
+    translog_set_sent_to_disk(buffer);
+  }
+  else
+    translog_set_only_in_buffers(buffer->next_buffer_offset);
+
+  /* say to next buffer that we are finished */
+  {
+    struct st_translog_buffer *next_buffer=
+      log_descriptor.buffers + ((buffer->buffer_no + 1) % TRANSLOG_BUFFERS_NO);
+    if (likely(translog_status == TRANSLOG_OK)){
+      translog_buffer_lock(next_buffer);
+      next_buffer->prev_sent_to_disk= buffer->offset;
+      translog_buffer_unlock(next_buffer);
+      pthread_cond_broadcast(&next_buffer->prev_sent_to_disk_cond);
+    }
+    else
+    {
+      /*
+        It is shutdown =>
+          1) there is only one thread
+          2) mutexes of other buffers can be destroyed => we can't use them
+      */
+      next_buffer->prev_sent_to_disk= buffer->offset;
+    }
+  }
+  /* Free buffer: detach the file and bump the version so waiters notice */
+  buffer->file= NULL;
+  buffer->overlay= 0;
+  buffer->ver++;
+  pthread_mutex_lock(&log_descriptor.dirty_buffer_mask_lock);
+  log_descriptor.dirty_buffer_mask&= ~(1 << buffer->buffer_no);
+  pthread_mutex_unlock(&log_descriptor.dirty_buffer_mask_lock);
+  pthread_cond_broadcast(&buffer->waiting_filling_buffer);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Recover page with sector protection (wipe out failed chunks)
+
+  SYNOPSIS
+    translog_recover_page_up_to_sector()
+    page                 reference on the page
+    offset               offset of failed sector
+
+  DESCRIPTION
+    Chunks that start before 'offset' are in the trusted area: they are
+    walked and any inconsistency there is an error. Chunks after that are
+    accepted only while they end within the sector that starts at 'offset'.
+    Everything after the last accepted chunk is overwritten with
+    TRANSLOG_FILLER.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_recover_page_up_to_sector(uchar *page, uint16 offset)
+{
+  uint16 chunk_offset= translog_get_first_chunk_offset(page), valid_chunk_end;
+  DBUG_ENTER("translog_recover_page_up_to_sector");
+  DBUG_PRINT("enter", ("offset: %u  first chunk: %u",
+                       (uint) offset, (uint) chunk_offset));
+
+  /*
+    The position is tested before the page byte is read, so a chunk that
+    ends exactly at the stop position never causes a read behind it
+  */
+  while (chunk_offset < offset && page[chunk_offset] != TRANSLOG_FILLER)
+  {
+    uint16 chunk_length;
+    if ((chunk_length=
+         translog_get_total_chunk_length(page, chunk_offset)) == 0)
+    {
+      DBUG_PRINT("error", ("cant get chunk length (offset %u)",
+                           (uint) chunk_offset));
+      DBUG_RETURN(1);
+    }
+    DBUG_PRINT("info", ("chunk: offset: %u  length %u",
+                        (uint) chunk_offset, (uint) chunk_length));
+    if (((ulong) chunk_offset) + ((ulong) chunk_length) > TRANSLOG_PAGE_SIZE)
+    {
+      DBUG_PRINT("error", ("damaged chunk (offset %u) in trusted area",
+                           (uint) chunk_offset));
+      DBUG_RETURN(1);
+    }
+    chunk_offset+= chunk_length;
+  }
+
+  valid_chunk_end= chunk_offset;
+  /*
+    end of trusted area - sector parsing.
+    chunk_offset can be equal to TRANSLOG_PAGE_SIZE here (the chunks fill
+    the page completely), so the bound is tested before the byte is read.
+  */
+  while (chunk_offset < TRANSLOG_PAGE_SIZE &&
+         page[chunk_offset] != TRANSLOG_FILLER)
+  {
+    uint16 chunk_length;
+    if ((chunk_length=
+         translog_get_total_chunk_length(page, chunk_offset)) == 0)
+      break;
+
+    DBUG_PRINT("info", ("chunk: offset: %u  length %u",
+                        (uint) chunk_offset, (uint) chunk_length));
+    /* a chunk that runs past the failed sector can not be trusted */
+    if (((ulong) chunk_offset) + ((ulong) chunk_length) >
+        (uint) (offset + DISK_DRIVE_SECTOR_SIZE))
+      break;
+
+    chunk_offset+= chunk_length;
+    valid_chunk_end= chunk_offset;
+  }
+  DBUG_PRINT("info", ("valid chunk end offset: %u", (uint) valid_chunk_end));
+
+  /* wipe out everything behind the last valid chunk */
+  memset(page + valid_chunk_end, TRANSLOG_FILLER,
+         TRANSLOG_PAGE_SIZE - valid_chunk_end);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Dummy write callback: ignores all its arguments.
+
+  @return always 0
+*/
+
+static my_bool
+translog_dummy_callback(uchar *page __attribute__((unused)),
+                        pgcache_page_no_t page_no __attribute__((unused)),
+                        uchar* data_ptr __attribute__((unused)))
+{
+  const my_bool nothing_to_do= 0;
+  return nothing_to_do;
+}
+
+
+/**
+  @brief Checks and removes sector protection.
+
+  The first byte of every sector after the first one is compared with the
+  first byte of the previous sector. When the step between them is too big
+  the sector is taken as not written: the page is recovered up to that
+  sector and file->was_recovered is set. Otherwise the original byte saved
+  in the protection table is put back on the page.
+
+  @param page            reference on the page content.
+  @param file            transaction log descriptor.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool
+translog_check_sector_protection(uchar *page, TRANSLOG_FILE *file)
+{
+  uint i, offset;
+  /* the protection table is the last part of the page header */
+  uchar *table= page + page_overhead[page[TRANSLOG_PAGE_FLAGS]] -
+    TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+  uint8 current= table[0];
+  DBUG_ENTER("translog_check_sector_protection");
+
+  for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++)
+  {
+    /*
+      TODO: add chunk counting for "suspecting" sectors (difference is
+      more than 1-2), if difference more then present chunks then it is
+      the problem.
+    */
+    uint8 test;
+    uint distance;
+    offset= i * DISK_DRIVE_SECTOR_SIZE;
+    test= page[offset];
+    DBUG_PRINT("info", ("sector: #%u  offset: %u  current: %lx "
+                        "read: 0x%x  stored: 0x%x%x",
+                        i, offset, (ulong) current,
+                        (uint) uint2korr(page + offset), (uint) table[i],
+                        (uint) table[i + 1]));
+    /*
+      3 is minimal possible record length. So we can have "distance"
+      between 2 sectors value more then DISK_DRIVE_SECTOR_SIZE / 3
+      only if it is old value, i.e. the sector was not written.
+    */
+    if (test < current)
+      distance= (uint)(0xFFL - current + test);   /* the value wrapped */
+    else
+      distance= (uint)(test - current);
+    if (distance > DISK_DRIVE_SECTOR_SIZE / 3)
+    {
+      if (translog_recover_page_up_to_sector(page, offset))
+        DBUG_RETURN(1);
+      file->was_recovered= 1;
+      DBUG_RETURN(0);
+    }
+
+    /* Restore value on the page */
+    page[offset]= table[i];
+    current= test;
+    DBUG_PRINT("info", ("sector: #%u  offset: %u  current: %lx  "
+                        "read: 0x%x  stored: 0x%x",
+                        i, offset, (ulong) current,
+                        (uint) page[offset], (uint) table[i]));
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Log page validator (read callback)
+
+  Checks, in this order: the page and file numbers stored at the page start,
+  the flags byte (no unknown bits may be set), the page CRC when the page
+  has one, and the sector protection when the page uses it.
+  data->was_recovered is reset on entry and may be set by the sector
+  protection check.
+
+  @param page            The page data to check
+  @param page_no         The page number (<offset>/<page length>)
+  @param data_ptr        Read callback data pointer (pointer to TRANSLOG_FILE)
+
+  @todo: add turning loghandler to read-only mode after merging with
+  that patch.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_page_validator(uchar *page,
+                                       pgcache_page_no_t page_no,
+                                       uchar* data_ptr)
+{
+  uint this_page_page_overhead;
+  uint flags;
+  uchar *page_pos;
+  TRANSLOG_FILE *data= (TRANSLOG_FILE *) data_ptr;
+#ifndef DBUG_OFF
+  /* used by the debug traces only */
+  pgcache_page_no_t offset= page_no * TRANSLOG_PAGE_SIZE;
+#endif
+  DBUG_ENTER("translog_page_validator");
+
+  data->was_recovered= 0;
+
+  /* the page starts with 3 bytes of page number and 3 bytes of file number */
+  if ((pgcache_page_no_t) uint3korr(page) != page_no ||
+      (uint32) uint3korr(page + 3) != data->number)
+  {
+    DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                         "page address written in the page is incorrect: "
+                         "File %lu instead of %lu or page %lu instead of %lu",
+                         (ulong) data->number, (ulong) offset,
+                         (ulong) uint3korr(page + 3), (ulong) data->number,
+                         (ulong) uint3korr(page),
+                         (ulong) page_no));
+    DBUG_RETURN(1);
+  }
+  flags= (uint)(page[TRANSLOG_PAGE_FLAGS]);
+  if (flags & ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+                TRANSLOG_RECORD_CRC))
+  {
+    DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                         "Garbage in the page flags field detected : %x",
+                         (ulong) data->number, (ulong) offset,
+                         (uint) flags));
+    DBUG_RETURN(1);
+  }
+  /*
+    Only now, when it is known that no unknown bits are set, the flags are
+    used as index in page_overhead[]; a garbage flags byte must never be
+    used for the lookup.
+  */
+  this_page_page_overhead= page_overhead[flags];
+  page_pos= page + (3 + 3 + 1);
+  if (flags & TRANSLOG_PAGE_CRC)
+  {
+    /* the CRC covers everything behind the page header */
+    uint32 crc= translog_crc(page + this_page_page_overhead,
+                             TRANSLOG_PAGE_SIZE -
+                             this_page_page_overhead);
+    if (crc != uint4korr(page_pos))
+    {
+      DBUG_PRINT("error", ("Page (%lu,0x%lx): "
+                           "CRC mismatch: calculated: %lx on the page %lx",
+                           (ulong) data->number, (ulong) offset,
+                           (ulong) crc, (ulong) uint4korr(page_pos)));
+      DBUG_RETURN(1);
+    }
+    page_pos+= CRC_SIZE;                      /* Skip crc */
+  }
+  if (flags & TRANSLOG_SECTOR_PROTECTION &&
+      translog_check_sector_protection(page, data))
+  {
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Locks the loghandler.
+
+  Locking the loghandler means locking the current buffer. The current
+  buffer can change while we wait for its lock, so after the lock is taken
+  the buffer number is checked again and the attempt is repeated with the
+  new current buffer if it has changed.
+*/
+
+void translog_lock()
+{
+  uint8 locked_no;
+  DBUG_ENTER("translog_lock");
+
+  /*
+    log_descriptor.bc.buffer_no is only one byte so its reading is
+    an atomic operation
+  */
+  locked_no= log_descriptor.bc.buffer_no;
+  translog_buffer_lock(log_descriptor.buffers + locked_no);
+  while (log_descriptor.bc.buffer_no != locked_no)
+  {
+    /* the current buffer was switched while we were waiting; try again */
+    translog_buffer_unlock(log_descriptor.buffers + locked_no);
+    locked_no= log_descriptor.bc.buffer_no;
+    translog_buffer_lock(log_descriptor.buffers + locked_no);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unlock the loghandler
+
+  SYNOPSIS
+    translog_unlock()
+
+  DESCRIPTION
+    Unlocks the buffer the loghandler cursor (log_descriptor.bc) currently
+    points to. The function does not return a value.
+*/
+
+void translog_unlock()
+{
+  struct st_translog_buffer *current= log_descriptor.bc.buffer;
+  translog_buffer_unlock(current);
+}
+
+
+/**
+  @brief Get log page by file number and offset of the beginning of the page
+
+  @param data            validator data, which contains the page address
+  @param buffer          buffer for page placing
+                         (might not be used in some cache implementations)
+  @param direct_link     if it is not NULL then caller can accept direct
+                         link to the page cache
+
+  @retval NULL Error
+  @retval #    pointer to the page cache which should be used to read this page
+*/
+
+static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer,
+                                PAGECACHE_BLOCK_LINK **direct_link)
+{
+  TRANSLOG_ADDRESS addr= *(data->addr), in_buffers;
+  uint32 file_no= LSN_FILE_NO(addr);
+  TRANSLOG_FILE *file;
+  DBUG_ENTER("translog_get_page");
+  DBUG_PRINT("enter", ("File: %lu  Offset: %lu(0x%lx)",
+                       (ulong) file_no,
+                       (ulong) LSN_OFFSET(addr),
+                       (ulong) LSN_OFFSET(addr)));
+
+  /* it is really page address */
+  DBUG_ASSERT(LSN_OFFSET(addr) % TRANSLOG_PAGE_SIZE == 0);
+  if (direct_link)
+    *direct_link= NULL;
+
+restart:
+
+  in_buffers= translog_only_in_buffers();
+  DBUG_PRINT("info", ("in_buffers: (%lu,0x%lx)",
+                      LSN_IN_PARTS(in_buffers)));
+  if (in_buffers != LSN_IMPOSSIBLE &&
+      cmp_translog_addr(addr, in_buffers) >= 0)
+  {
+    translog_lock();
+    DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0);
+    /* recheck with locked loghandler */
+    in_buffers= translog_only_in_buffers();
+    if (cmp_translog_addr(addr, in_buffers) >= 0)
+    {
+      uint16 buffer_no= log_descriptor.bc.buffer_no;
+#ifndef DBUG_OFF
+      uint16 buffer_start= buffer_no;
+#endif
+      struct st_translog_buffer *buffer_unlock= log_descriptor.bc.buffer;
+      struct st_translog_buffer *curr_buffer= log_descriptor.bc.buffer;
+      for (;;)
+      {
+        /*
+          if the page is in the buffer and it is the last version of the
+          page (in case of division the page by buffer flush)
+        */
+        if (curr_buffer->file != NULL &&
+            cmp_translog_addr(addr, curr_buffer->offset) >= 0 &&
+            cmp_translog_addr(addr,
+                              (curr_buffer->next_buffer_offset ?
+                               curr_buffer->next_buffer_offset:
+                               curr_buffer->offset + curr_buffer->size)) < 0)
+        {
+          TRANSLOG_ADDRESS offset= curr_buffer->offset;
+          TRANSLOG_FILE *fl= curr_buffer->file;
+          uchar *from, *table= NULL;
+          int is_last_unfinished_page;
+          uint last_protected_sector= 0;
+          uint skipped_data= curr_buffer->skipped_data;
+          TRANSLOG_FILE file_copy;
+          uint8 ver= curr_buffer->ver;
+          translog_wait_for_writers(curr_buffer);
+          if (offset != curr_buffer->offset || fl != curr_buffer->file ||
+              ver != curr_buffer->ver)
+          {
+            DBUG_ASSERT(buffer_unlock == curr_buffer);
+            translog_buffer_unlock(buffer_unlock);
+            goto restart;
+          }
+          DBUG_ASSERT(LSN_FILE_NO(addr) ==  LSN_FILE_NO(curr_buffer->offset));
+          from= curr_buffer->buffer + (addr - curr_buffer->offset);
+          if (skipped_data && addr == curr_buffer->offset)
+          {
+            /*
+              We read page part of which is not present in buffer,
+              so we should read absent part from file (page cache actually)
+            */
+            file= get_logfile_by_number(file_no);
+            DBUG_ASSERT(file != NULL);
+            /*
+              it's ok to not lock the page because:
+                - The log handler has it's own page cache.
+                - There is only one thread that can access the log
+                cache at a time
+            */
+            if (!(buffer= pagecache_read(log_descriptor.pagecache,
+                                         &file->handler,
+                                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                                         3, buffer,
+                                         PAGECACHE_PLAIN_PAGE,
+                                         PAGECACHE_LOCK_LEFT_UNLOCKED,
+                                         NULL)))
+              DBUG_RETURN(NULL);
+          }
+          else
+            skipped_data= 0;  /* Read after skipped in buffer data */
+          /*
+            Now we have correct data in buffer up to 'skipped_data'. The
+            following memcpy() will move the data from the internal buffer
+            that was not yet on disk.
+          */
+          memcpy(buffer + skipped_data, from + skipped_data,
+                 TRANSLOG_PAGE_SIZE - skipped_data);
+          /*
+            We can use copy then in translog_page_validator() because it
+            do not put it permanently somewhere.
+            We have to use copy because after releasing log lock we can't
+            guaranty that the file still be present (in real life it will be
+            present but theoretically possible that it will be released
+            already from last files cache);
+          */
+          file_copy= *(curr_buffer->file);
+          file_copy.handler.callback_data= (uchar*) &file_copy;
+          is_last_unfinished_page= ((log_descriptor.bc.buffer ==
+                                     curr_buffer) &&
+                                    (log_descriptor.bc.ptr >= from) &&
+                                    (log_descriptor.bc.ptr <
+                                     from + TRANSLOG_PAGE_SIZE));
+          if (is_last_unfinished_page &&
+              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION))
+          {
+            last_protected_sector= ((log_descriptor.bc.previous_offset - 1) /
+                                    DISK_DRIVE_SECTOR_SIZE);
+            table= buffer + log_descriptor.page_overhead -
+              TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+          }
+
+          DBUG_ASSERT(buffer_unlock == curr_buffer);
+          translog_buffer_unlock(buffer_unlock);
+          if (is_last_unfinished_page)
+          {
+            uint i;
+            /*
+              This is last unfinished page => we should not check CRC and
+              remove only that protection which already installed (no need
+              to check it)
+
+              We do not check the flag of sector protection, because if
+              (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION) is
+              not set then last_protected_sector will be 0 so following loop
+              will be never executed
+            */
+            DBUG_PRINT("info", ("This is last unfinished page, "
+                                "last protected sector %u",
+                                last_protected_sector));
+            for (i= 1; i <= last_protected_sector; i++)
+            {
+              uint offset= i * DISK_DRIVE_SECTOR_SIZE;
+              DBUG_PRINT("info", ("Sector %u: 0x%02x <- 0x%02x",
+                                  i, buffer[offset],
+                                  table[i]));
+              buffer[offset]= table[i];
+            }
+          }
+          else
+          {
+            /*
+              This IF should be true because we use in-memory data which
+              supposed to be correct.
+            */
+            if (translog_page_validator(buffer,
+                                        LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                                        (uchar*) &file_copy))
+            {
+              DBUG_ASSERT(0);
+              buffer= NULL;
+            }
+          }
+          DBUG_RETURN(buffer);
+        }
+        buffer_no= (buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+        curr_buffer= log_descriptor.buffers + buffer_no;
+        translog_buffer_lock(curr_buffer);
+        translog_buffer_unlock(buffer_unlock);
+        buffer_unlock= curr_buffer;
+        /* we can't make a full circle */
+        DBUG_ASSERT(buffer_start != buffer_no);
+      }
+    }
+    translog_unlock();
+  }
+  file= get_logfile_by_number(file_no);
+  DBUG_ASSERT(file != NULL);
+  buffer= pagecache_read(log_descriptor.pagecache, &file->handler,
+                         LSN_OFFSET(addr) / TRANSLOG_PAGE_SIZE,
+                         3, (direct_link ? NULL : buffer),
+                         PAGECACHE_PLAIN_PAGE,
+                         (direct_link ?
+                          PAGECACHE_LOCK_READ :
+                          PAGECACHE_LOCK_LEFT_UNLOCKED),
+                         direct_link);
+  DBUG_PRINT("info", ("Direct link is assigned to : 0x%lx * 0x%lx",
+                      (ulong) direct_link,
+                      (ulong)(direct_link ? *direct_link : NULL)));
+  data->was_recovered= file->was_recovered;
+  DBUG_RETURN(buffer);
+}
+
+
+/**
+  @brief Releases a direct log page link.
+
+  @param direct_link  link to read-unlock and unpin in the log page cache;
+                      a NULL link is accepted and nothing is released
+*/
+
+static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link)
+{
+  DBUG_ENTER("translog_free_link");
+  DBUG_PRINT("info", ("Direct link: 0x%lx", (ulong) direct_link));
+  if (!direct_link)
+    DBUG_VOID_RETURN;                           /* nothing was linked */
+  pagecache_unlock_by_link(log_descriptor.pagecache, direct_link,
+                           PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN,
+                           LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, 0, FALSE);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Finds the address of the last full page of the given log file.
+
+  @param addr            in: address whose file number selects the log file;
+                         out: address of the last full page of that file
+                         (offset 0 when the file is not longer than a page)
+  @param last_page_ok    out: 1 if the file is longer than one page and its
+                         length is a multiple of the page size, otherwise 0
+  @param no_errors       suppress messages about non-critical errors
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr,
+                                           my_bool *last_page_ok,
+                                           my_bool no_errors)
+{
+  char path[FN_REFLEN];
+  uint32 rec_offset;
+  my_off_t file_size;
+  uint32 file_no= LSN_FILE_NO(*addr);
+  TRANSLOG_FILE *file;
+#ifndef DBUG_OFF
+  char buff[21];
+#endif
+  DBUG_ENTER("translog_get_last_page_addr");
+
+  file= get_logfile_by_number(file_no);
+  if (likely(file != NULL))
+  {
+    /*
+      Callers are loghandler initialization and the scanner, both of which
+      need this part of the log, so the file is open and stays open while
+      we measure it.
+    */
+    file_size= my_seek(file->handler.file, 0, SEEK_END, MYF(0));
+  }
+  else
+  {
+    /*
+      This branch is taken only during very early initialization, when
+      the log files are not opened yet: open the file just to measure it.
+    */
+    const char *name= translog_filename_by_fileno(file_no, path);
+    File fd= my_open(name, O_RDONLY,
+                     (no_errors ? MYF(0) : MYF(MY_WME)));
+    if (fd < 0)
+    {
+      my_errno= errno;
+      DBUG_PRINT("error", ("Error %d during opening file #%d",
+                           errno, file_no));
+      DBUG_RETURN(1);
+    }
+    file_size= my_seek(fd, 0, SEEK_END, MYF(0));
+    my_close(fd, MYF(0));
+  }
+  DBUG_PRINT("info", ("File size: %s", llstr(file_size, buff)));
+  if (file_size == MY_FILEPOS_ERROR)
+    DBUG_RETURN(1);
+  DBUG_ASSERT(file_size < ULL(0xffffffff));
+
+  /* Default: a file of at most one page has no usable last page */
+  *last_page_ok= 0;
+  rec_offset= 0;
+  if (((uint32) file_size) > TRANSLOG_PAGE_SIZE)
+  {
+    uint32 size32= (uint32) file_size;
+    rec_offset= (size32 / TRANSLOG_PAGE_SIZE - 1) * TRANSLOG_PAGE_SIZE;
+    *last_page_ok= (size32 == rec_offset + TRANSLOG_PAGE_SIZE);
+  }
+  *addr= MAKE_LSN(file_no, rec_offset);
+  DBUG_PRINT("info", ("Last page: 0x%lx  ok: %d", (ulong) rec_offset,
+                      *last_page_ok));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Number of bytes needed to store an encoded record length
+
+  @param length          Record length which will be encoded
+
+  @return 1, 3, 4 or 5, chosen by the thresholds 250, 0xFFFF and 0xFFFFFF
+*/
+
+static uint translog_variable_record_length_bytes(translog_size_t length)
+{
+  if (length >= (ulong) 0xFFFFFF)
+    return 5;
+  if (length >= 0xFFFF)
+    return 4;
+  if (length >= 250)
+    return 3;
+  return 1;
+}
+
+
+/**
+  @brief Gets the length of the header of a chunk.
+
+  @param chunk           The pointer to the chunk beginning
+
+  @retval # length of the chunk header in bytes
+  @retval 0 Error
+*/
+
+static uint16 translog_get_chunk_header_length(uchar *chunk)
+{
+  uint16 header_len= 0;                         /* 0 means error */
+  DBUG_ENTER("translog_get_chunk_header_length");
+  switch (*chunk & TRANSLOG_CHUNK_TYPE) {
+  case TRANSLOG_CHUNK_LSN:
+  {
+    /* 0 chunk referred as LSN (head or tail) */
+    translog_size_t rec_len;
+    uint16 chunk_len;
+    uchar *pos= chunk + 1 + 2;                  /* encoded record length */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LSN"));
+    rec_len= translog_variable_record_1group_decode_len(&pos);
+    chunk_len= uint2korr(pos);
+    header_len= (uint16) (pos - chunk) + 2;
+    DBUG_PRINT("info", ("rec len: %lu  chunk len: %u  header len: %u",
+                        (ulong) rec_len, (uint) chunk_len, (uint) header_len));
+    if (chunk_len)
+    {
+      /* TODO: find header end */
+      /*
+        The last chunk of multi-group record can be base for it header
+        calculation (we skip to the first group to read the header) so if we
+        stuck here something is wrong.
+      */
+      DBUG_ASSERT(0);
+      header_len= 0;                            /* Keep compiler happy */
+    }
+    break;
+  }
+  case TRANSLOG_CHUNK_FIXED:
+    /* 1 (pseudo)fixed record (also LSN) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_FIXED = 3"));
+    header_len= 3;
+    break;
+  case TRANSLOG_CHUNK_NOHDR:
+    /* 2 no header chunk (till page end) */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_NOHDR = 1"));
+    header_len= 1;
+    break;
+  case TRANSLOG_CHUNK_LNGTH:
+    /* 3 chunk with chunk length */
+    DBUG_PRINT("info", ("TRANSLOG_CHUNK_LNGTH = 3"));
+    header_len= 3;
+    break;
+  default:
+    DBUG_ASSERT(0);
+    break;
+  }
+  DBUG_RETURN(header_len);
+}
+
+
+/**
+  @brief Truncate the log to the given address. Used during the startup if the
+         end of log is corrupted. Removes the files after addr's file, fills
+         the rest of addr's page with TRANSLOG_FILLER and resets the cursor.
+
+  @param addr            new horizon (asserted to be before the current one)
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_truncate_log(TRANSLOG_ADDRESS addr)
+{
+  uchar *page;
+  TRANSLOG_ADDRESS current_page;
+  uint32 next_page_offset, page_rest;
+  uint32 i;
+  File fd;
+  int rc;
+  TRANSLOG_VALIDATOR_DATA data;
+  char path[FN_REFLEN];
+  uchar page_buff[TRANSLOG_PAGE_SIZE];
+  DBUG_ENTER("translog_truncate_log");
+  /* TODO: write warning to the client */
+  DBUG_PRINT("warning", ("removing all records from (%lu,0x%lx) "
+                         "till (%lu,0x%lx)",
+                         LSN_IN_PARTS(addr),
+                         LSN_IN_PARTS(log_descriptor.horizon)));
+  DBUG_ASSERT(cmp_translog_addr(addr, log_descriptor.horizon) < 0);
+  /* remove files between the address and horizon */
+  for (i= LSN_FILE_NO(addr) + 1; i <= LSN_FILE_NO(log_descriptor.horizon); i++)
+    if (my_delete(translog_filename_by_fileno(i, path),  MYF(MY_WME)))
+    {
+      translog_unlock();  /* NOTE(review): other error paths do not unlock */
+      DBUG_RETURN(1);
+    }
+  /* truncate the last file up to the last page: next_page_offset is addr's */
+  /* offset rounded up to a page boundary (unchanged if already aligned) */
+  next_page_offset= LSN_OFFSET(addr);
+  next_page_offset= (next_page_offset -
+                     ((next_page_offset - 1) % TRANSLOG_PAGE_SIZE + 1) +
+                     TRANSLOG_PAGE_SIZE);
+  page_rest= next_page_offset - LSN_OFFSET(addr);
+  memset(page_buff, TRANSLOG_FILLER, page_rest);
+  rc= ((fd= open_logfile_by_number_no_cache(LSN_FILE_NO(addr))) < 0 ||
+       ((my_chsize(fd, next_page_offset, TRANSLOG_FILLER, MYF(MY_WME)) ||
+         (page_rest && my_pwrite(fd, page_buff, page_rest, LSN_OFFSET(addr),
+                                 log_write_flags)) ||
+         my_sync(fd, MYF(MY_WME)))));
+  translog_syncs++;
+  rc|= (fd >= 0 && my_close(fd, MYF(MY_WME)));    /* 0 is a valid fd too */
+  if (sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS)
+  {
+    rc|= sync_dir(log_descriptor.directory_fd, MYF(MY_WME | MY_IGNORE_BADFD));
+    translog_syncs++;
+  }
+  if (rc)
+    DBUG_RETURN(1);
+
+  /* fix the horizon */
+  log_descriptor.horizon= addr;
+  /* fix the buffer data */
+  current_page= MAKE_LSN(LSN_FILE_NO(addr), (next_page_offset -
+                                             TRANSLOG_PAGE_SIZE));
+  data.addr= &current_page;
+  if ((page= translog_get_page(&data, log_descriptor.buffers->buffer, NULL)) ==
+      NULL)
+    DBUG_RETURN(1);
+  if (page != log_descriptor.buffers->buffer)
+    memcpy(log_descriptor.buffers->buffer, page, TRANSLOG_PAGE_SIZE);
+  log_descriptor.bc.buffer->offset= current_page;
+  log_descriptor.bc.buffer->size= LSN_OFFSET(addr) - LSN_OFFSET(current_page);
+  log_descriptor.bc.ptr=
+    log_descriptor.buffers->buffer + log_descriptor.bc.buffer->size;
+  log_descriptor.bc.current_page_fill= log_descriptor.bc.buffer->size;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Applies function 'callback' to all files (in a directory) which
+  name looks like a log's name (aria_log.[0-9]{8}).
+  If 'callback' returns TRUE this interrupts the walk and returns
+  TRUE. Otherwise FALSE is returned after processing all log files,
+  or at once if the directory cannot be read. It cannot just use
+  log_descriptor.directory because that may not yet have been initialized.
+
+  @param  directory        directory to scan
+  @param  callback         function to apply; is passed directory and base
+                           name of found file
+*/
+
+my_bool translog_walk_filenames(const char *directory,
+                                my_bool (*callback)(const char *,
+                                                    const char *))
+{
+  static const char prefix[]= "aria_log.";
+  const uint prefix_len= sizeof(prefix) - 1;    /* 9: without trailing NUL */
+  const uint digits_no= 8;
+  MY_DIR *dirp;
+  uint i, digits;
+  my_bool rc= FALSE;
+
+  if (!(dirp = my_dir(directory, MYF(MY_DONT_SORT))))
+    return FALSE;
+
+  for (i= 0; i < dirp->number_off_files; i++)
+  {
+    char *file= dirp->dir_entry[i].name;
+    if (strncmp(file, prefix, prefix_len) != 0)
+      continue;
+    /* stops at the first non-digit, so it never reads past the name's NUL */
+    for (digits= 0; digits < digits_no; digits++)
+      if (file[prefix_len + digits] < '0' || file[prefix_len + digits] > '9')
+        break;
+    if (digits == digits_no && file[prefix_len + digits] == '\0' &&
+        (*callback)(directory, file))
+    {
+      rc= TRUE;
+      break;
+    }
+  }
+  my_dirend(dirp);
+  return rc;
+}
+
+
+/**
+  @brief Fills page_overhead[]: page header length for each page flags value
+*/
+
+static void translog_fill_overhead_table()
+{
+  uint flags;
+  for (flags= 0; flags < TRANSLOG_FLAGS_NUM; flags++)
+  {
+    uint overhead= 7;                           /* always present part */
+    if (flags & TRANSLOG_PAGE_CRC)
+      overhead+= CRC_SIZE;
+    if (flags & TRANSLOG_SECTOR_PROTECTION)
+      overhead+= TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+    page_overhead[flags]= overhead;
+  }
+}
+
+
+/**
+  translog_walk_filenames() callback: accept the very first log file found.
+*/
+
+static my_bool
+translog_callback_search_first(const char *directory __attribute__((unused)),
+                               const char *filename __attribute__((unused)))
+{
+  /* TRUE interrupts the walk, so one matching file is enough */
+  return TRUE;
+}
+
+
+/**
+  @brief Tells whether a chunk is an LSN one
+
+  @param type            chunk type byte (chunk type and record type bits)
+
+  @retval 1 fixed chunk, or LSN chunk not of type TRANSLOG_CHUNK_0_CONT
+  @retval 0 any other chunk
+*/
+
+static my_bool translog_is_LSN_chunk(uchar type)
+{
+  uint chunk= type & TRANSLOG_CHUNK_TYPE, rec= type & TRANSLOG_REC_TYPE;
+  DBUG_ENTER("translog_is_LSN_chunk");
+  DBUG_PRINT("info", ("byte: %x  chunk type: %u  record type: %u",
+                      type, type >> 6, type & TRANSLOG_REC_TYPE));
+  DBUG_RETURN(chunk == TRANSLOG_CHUNK_FIXED ||
+              (chunk == TRANSLOG_CHUNK_LSN && rec != TRANSLOG_CHUNK_0_CONT));
+}
+
+
+/**
+  @brief Initialize transaction log
+
+  @param directory       Directory where log files are put
+  @param log_file_max_size max size of one log file (for new logs creation)
+  @param server_version  version of MySQL server (MYSQL_VERSION_ID)
+  @param server_id       server ID (replication & Co)
+  @param pagecache       Page cache for the log reads
+  @param flags           flags (TRANSLOG_PAGE_CRC, TRANSLOG_SECTOR_PROTECTION
+                           TRANSLOG_RECORD_CRC)
+  @param readonly        Put transaction log in read-only mode
+  @param init_table_func function to initialize record descriptors table
+  @param no_errors       suppress messages about non-critical errors
+
+  @todo
+    Free used resources in case of error.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool translog_init_with_table(const char *directory,
+                                 uint32 log_file_max_size,
+                                 uint32 server_version,
+                                 uint32 server_id, PAGECACHE *pagecache,
+                                 uint flags, my_bool readonly,
+                                 void (*init_table_func)(),
+                                 my_bool no_errors)
+{
+  int i;
+  int old_log_was_recovered= 0, logs_found= 0;
+  uint old_flags= flags;
+  uint32 start_file_num= 1;
+  TRANSLOG_ADDRESS sure_page, last_page, last_valid_page, checkpoint_lsn;
+  my_bool version_changed= 0;
+  DBUG_ENTER("translog_init_with_table");
+
+  translog_syncs= 0;
+  flush_start= 0;
+  id_to_share= NULL;
+
+  log_descriptor.directory_fd= -1;
+  log_descriptor.is_everything_flushed= 1;
+  log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no= 0;
+  log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+
+  (*init_table_func)();
+  compile_time_assert(sizeof(log_descriptor.dirty_buffer_mask) * 8 >=
+                      TRANSLOG_BUFFERS_NO);
+  log_descriptor.dirty_buffer_mask= 0;
+  if (readonly)
+    log_descriptor.open_flags= O_BINARY | O_RDONLY;
+  else
+    log_descriptor.open_flags= O_BINARY | O_RDWR;
+  if (pthread_mutex_init(&log_descriptor.sent_to_disk_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_mutex_init(&log_descriptor.file_header_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_mutex_init(&log_descriptor.unfinished_files_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_mutex_init(&log_descriptor.purger_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_mutex_init(&log_descriptor.log_flush_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_mutex_init(&log_descriptor.dirty_buffer_mask_lock,
+                         MY_MUTEX_INIT_FAST) ||
+      pthread_cond_init(&log_descriptor.log_flush_cond, 0) ||
+      pthread_cond_init(&log_descriptor.new_goal_cond, 0) ||
+      my_rwlock_init(&log_descriptor.open_files_lock,
+                     NULL) ||
+      my_init_dynamic_array(&log_descriptor.open_files,
+                            sizeof(TRANSLOG_FILE*), 10, 10) ||
+      my_init_dynamic_array(&log_descriptor.unfinished_files,
+                            sizeof(struct st_file_counter),
+                            10, 10))
+    goto err;
+  log_descriptor.min_need_file= 0;
+  log_descriptor.min_file_number= 0;
+  log_descriptor.last_lsn_checked= LSN_IMPOSSIBLE;
+
+  /* Directory to store files */
+  unpack_dirname(log_descriptor.directory, directory);
+#ifndef __WIN__
+  if ((log_descriptor.directory_fd= my_open(log_descriptor.directory,
+                                            O_RDONLY, MYF(MY_WME))) < 0)
+  {
+    my_errno= errno;
+    DBUG_PRINT("error", ("Error %d during opening directory '%s'",
+                         errno, log_descriptor.directory));
+    goto err;
+  }
+#endif
+  log_descriptor.in_buffers_only= LSN_IMPOSSIBLE;
+  DBUG_ASSERT(log_file_max_size % TRANSLOG_PAGE_SIZE == 0 &&
+              log_file_max_size >= TRANSLOG_MIN_FILE_SIZE);
+  /* max size of one log size (for new logs creation) */
+  log_file_size= log_descriptor.log_file_max_size=
+    log_file_max_size;
+  /* server version */
+  log_descriptor.server_version= server_version;
+  /* server ID */
+  log_descriptor.server_id= server_id;
+  /* Page cache for the log reads */
+  log_descriptor.pagecache= pagecache;
+  /* Flags */
+  DBUG_ASSERT((flags &
+               ~(TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION |
+                 TRANSLOG_RECORD_CRC)) == 0);
+  log_descriptor.flags= flags;
+  translog_fill_overhead_table();
+  log_descriptor.page_overhead= page_overhead[flags];
+  log_descriptor.page_capacity_chunk_2=
+    TRANSLOG_PAGE_SIZE - log_descriptor.page_overhead - 1;
+  compile_time_assert(TRANSLOG_WRITE_BUFFER % TRANSLOG_PAGE_SIZE == 0);
+  log_descriptor.buffer_capacity_chunk_2=
+    (TRANSLOG_WRITE_BUFFER / TRANSLOG_PAGE_SIZE) *
+    log_descriptor.page_capacity_chunk_2;
+  log_descriptor.half_buffer_capacity_chunk_2=
+    log_descriptor.buffer_capacity_chunk_2 / 2;
+  DBUG_PRINT("info",
+             ("Overhead: %u  pc2: %u  bc2: %u,  bc2/2: %u",
+              log_descriptor.page_overhead,
+              log_descriptor.page_capacity_chunk_2,
+              log_descriptor.buffer_capacity_chunk_2,
+              log_descriptor.half_buffer_capacity_chunk_2));
+
+  /* Just to init it somehow (hack for bootstrap)*/
+  {
+    TRANSLOG_FILE *file= 0;
+    log_descriptor.min_file = log_descriptor.max_file= 1;
+    insert_dynamic(&log_descriptor.open_files, (uchar *)&file);
+    translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+    pop_dynamic(&log_descriptor.open_files);
+  }
+
+  /* Buffers for log writing */
+  for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)
+  {
+    if (translog_buffer_init(log_descriptor.buffers + i, i))
+      goto err;
+    DBUG_PRINT("info", ("translog_buffer buffer #%u: 0x%lx",
+                        i, (ulong) log_descriptor.buffers + i));
+  }
+
+  /*
+    last_logno and last_checkpoint_lsn were set in
+    ma_control_file_create_or_open()
+  */
+  logs_found= (last_logno != FILENO_IMPOSSIBLE);
+
+  translog_status= (readonly ? TRANSLOG_READONLY : TRANSLOG_OK);
+  checkpoint_lsn= last_checkpoint_lsn;
+
+  if (logs_found)
+  {
+    my_bool pageok;
+    DBUG_PRINT("info", ("log found..."));
+    /*
+      TODO: scan directory for aria_log.XXXXXXXX files and find
+       highest XXXXXXXX & set logs_found
+      TODO: check that last checkpoint within present log addresses space
+
+      find the log end
+    */
+    if (LSN_FILE_NO(last_checkpoint_lsn) == FILENO_IMPOSSIBLE)
+    {
+      DBUG_ASSERT(LSN_OFFSET(last_checkpoint_lsn) == 0);
+      /* only last log needs to be checked */
+      sure_page= MAKE_LSN(last_logno, TRANSLOG_PAGE_SIZE);
+    }
+    else
+    {
+      sure_page= last_checkpoint_lsn;
+      DBUG_ASSERT(LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE != 0);
+      sure_page-= LSN_OFFSET(sure_page) % TRANSLOG_PAGE_SIZE;
+    }
+    /* Set horizon to the beginning of the last file first */
+    log_descriptor.horizon= last_page= MAKE_LSN(last_logno, 0);
+    if (translog_get_last_page_addr(&last_page, &pageok, no_errors))
+    {
+      if (!translog_walk_filenames(log_descriptor.directory,
+                                   &translog_callback_search_first))
+      {
+        /*
+          Files was deleted, just start from the next log number, so that
+          existing tables are in the past.
+        */
+        start_file_num= last_logno + 1;
+        checkpoint_lsn= LSN_IMPOSSIBLE; /* no log so no checkpoint */
+        logs_found= 0;
+      }
+      else
+        goto err;
+    }
+    else if (LSN_OFFSET(last_page) == 0)
+    {
+      if (LSN_FILE_NO(last_page) == 1)
+      {
+        logs_found= 0;                          /* file #1 has no pages */
+        DBUG_PRINT("info", ("log found. But is is empty => no log assumed"));
+      }
+      else
+      {
+        last_page-= LSN_ONE_FILE;
+        if (translog_get_last_page_addr(&last_page, &pageok, 0))
+          goto err;
+      }
+    }
+    if (logs_found)
+    {
+      uint32 i;
+      log_descriptor.min_file= translog_first_file(log_descriptor.horizon, 1);
+      log_descriptor.max_file= last_logno;
+      /* Open all files */
+      if (allocate_dynamic(&log_descriptor.open_files,
+                           log_descriptor.max_file -
+                           log_descriptor.min_file + 1))
+        goto err;
+      for (i = log_descriptor.max_file; i >= log_descriptor.min_file; i--)
+      {
+        /*
+          We can't allocate all file together because they will be freed
+          one by one
+        */
+        TRANSLOG_FILE *file= (TRANSLOG_FILE *)my_malloc(sizeof(TRANSLOG_FILE),
+                                                        MYF(0));
+
+        compile_time_assert(MY_FILEPOS_ERROR > ULL(0xffffffff));
+        if (file == NULL ||
+            (file->handler.file=
+             open_logfile_by_number_no_cache(i)) < 0 ||
+            my_seek(file->handler.file, 0, SEEK_END, MYF(0)) >=
+            ULL(0xffffffff))
+        {
+          int j;
+          for (j= i - log_descriptor.min_file - 1; j > 0; j--)
+          {
+            TRANSLOG_FILE *el=
+              *dynamic_element(&log_descriptor.open_files, j,
+                               TRANSLOG_FILE **);
+            my_close(el->handler.file, MYF(MY_WME));
+            my_free(el, MYF(0));
+          }
+          if (file)
+          {
+            free(file);
+            goto err;
+          }
+          else
+            goto err;
+        }
+        translog_file_init(file, i, 1);
+        /* we allocated space so it can't fail */
+        insert_dynamic(&log_descriptor.open_files, (uchar *)&file);
+      }
+      DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                  log_descriptor.open_files.elements);
+    }
+  }
+  else if (readonly)
+  {
+    /* There is no logs and there is read-only mode => nothing to read */
+    DBUG_PRINT("error", ("No logs and read-only mode"));
+    goto err;
+  }
+
+  if (logs_found)
+  {
+    TRANSLOG_ADDRESS current_page= sure_page;
+    my_bool pageok;
+
+    DBUG_PRINT("info", ("The log is really present"));
+    DBUG_ASSERT(sure_page <= last_page);
+
+    /* TODO: check page size */
+
+    last_valid_page= LSN_IMPOSSIBLE;
+    /*
+      Scans and validate pages. We need it to show "outside" only for sure
+      valid part of the log. If the log was damaged then fixed we have to
+      cut off damaged part before some other process start write something
+      in the log.
+    */
+    do
+    {
+      TRANSLOG_ADDRESS current_file_last_page;
+      current_file_last_page= current_page;
+      if (translog_get_last_page_addr(&current_file_last_page, &pageok, 0))
+        goto err;
+      if (!pageok)
+      {
+        DBUG_PRINT("error", ("File %lu have no complete last page",
+                             (ulong) LSN_FILE_NO(current_file_last_page)));
+        old_log_was_recovered= 1;
+        /* This file is not written till the end so it should be last */
+        last_page= current_file_last_page;
+        /* TODO: issue warning */
+      }
+      do
+      {
+        TRANSLOG_VALIDATOR_DATA data;
+        TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+        uchar *page;
+        data.addr= &current_page;
+        if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL)
+          goto err;
+        if (data.was_recovered)
+        {
+          DBUG_PRINT("error", ("file no: %lu (%d)  "
+                               "rec_offset: 0x%lx (%lu) (%d)",
+                               (ulong) LSN_FILE_NO(current_page),
+                               (uint3korr(page + 3) !=
+                                LSN_FILE_NO(current_page)),
+                               (ulong) LSN_OFFSET(current_page),
+                               (ulong) (LSN_OFFSET(current_page) /
+                                        TRANSLOG_PAGE_SIZE),
+                               (uint3korr(page) !=
+                                LSN_OFFSET(current_page) /
+                                TRANSLOG_PAGE_SIZE)));
+          old_log_was_recovered= 1;
+          break;
+        }
+        old_flags= page[TRANSLOG_PAGE_FLAGS];
+        last_valid_page= current_page;
+        current_page+= TRANSLOG_PAGE_SIZE; /* increase offset */
+      } while (current_page <= current_file_last_page);
+      current_page+= LSN_ONE_FILE;
+      current_page= LSN_REPLACE_OFFSET(current_page, TRANSLOG_PAGE_SIZE);
+    } while (LSN_FILE_NO(current_page) <= LSN_FILE_NO(last_page) &&
+             !old_log_was_recovered);
+    if (last_valid_page == LSN_IMPOSSIBLE)
+    {
+      /* Panic!!! Even page which should be valid is invalid */
+      /* TODO: issue error */
+      goto err;
+    }
+    DBUG_PRINT("info", ("Last valid page is in file: %lu  "
+                        "offset: %lu (0x%lx)  "
+                        "Logs found: %d  was recovered: %d  "
+                        "flags match: %d",
+                        (ulong) LSN_FILE_NO(last_valid_page),
+                        (ulong) LSN_OFFSET(last_valid_page),
+                        (ulong) LSN_OFFSET(last_valid_page),
+                        logs_found, old_log_was_recovered,
+                        (old_flags == flags)));
+
+    /* TODO: check server ID */
+    if (logs_found && !old_log_was_recovered && old_flags == flags)
+    {
+      TRANSLOG_VALIDATOR_DATA data;
+      TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+      uchar *page;
+      uint16 chunk_offset;
+      data.addr= &last_valid_page;
+      /* continue old log */
+      DBUG_ASSERT(LSN_FILE_NO(last_valid_page)==
+                  LSN_FILE_NO(log_descriptor.horizon));
+      if ((page= translog_get_page(&data, psize_buff.buffer, NULL)) == NULL ||
+          (chunk_offset= translog_get_first_chunk_offset(page)) == 0)
+        goto err;
+
+      /* Puts filled part of old page in the buffer */
+      log_descriptor.horizon= last_valid_page;
+      translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+      /*
+         Free space if filled with TRANSLOG_FILLER and first uchar of
+         real chunk can't be TRANSLOG_FILLER
+      */
+      while (chunk_offset < TRANSLOG_PAGE_SIZE &&
+             page[chunk_offset] != TRANSLOG_FILLER)
+      {
+        uint16 chunk_length;
+        if ((chunk_length=
+             translog_get_total_chunk_length(page, chunk_offset)) == 0)
+          goto err;
+        DBUG_PRINT("info", ("chunk: offset: %u  length: %u",
+                            (uint) chunk_offset, (uint) chunk_length));
+        chunk_offset+= chunk_length;
+
+        /* chunk can't cross the page border */
+        DBUG_ASSERT(chunk_offset <= TRANSLOG_PAGE_SIZE);
+      }
+      memcpy(log_descriptor.buffers->buffer, page, chunk_offset);
+      log_descriptor.bc.buffer->size+= chunk_offset;
+      log_descriptor.bc.ptr+= chunk_offset;
+      log_descriptor.bc.current_page_fill= chunk_offset;
+      log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+                                                 (chunk_offset +
+                                                  LSN_OFFSET(last_valid_page)));
+      DBUG_PRINT("info", ("Move Page #%u: 0x%lx  chaser: %d  Size: %lu (%lu)",
+                          (uint) log_descriptor.bc.buffer_no,
+                          (ulong) log_descriptor.bc.buffer,
+                          log_descriptor.bc.chaser,
+                          (ulong) log_descriptor.bc.buffer->size,
+                          (ulong) (log_descriptor.bc.ptr - log_descriptor.bc.
+                                   buffer->buffer)));
+      translog_check_cursor(&log_descriptor.bc);
+    }
+    if (!old_log_was_recovered && old_flags == flags)
+    {
+      LOGHANDLER_FILE_INFO info;
+      LINT_INIT_STRUCT(info);
+
+      /*
+        Accessing &log_descriptor.open_files without mutex is safe
+        because it is initialization
+      */
+      if (translog_read_file_header(&info,
+                                    (*dynamic_element(&log_descriptor.
+                                                      open_files,
+                                                      0, TRANSLOG_FILE **))->
+                                    handler.file))
+        goto err;
+      version_changed= (info.maria_version != TRANSLOG_VERSION_ID);
+    }
+  }
+  DBUG_PRINT("info", ("Logs found: %d  was recovered: %d",
+                      logs_found, old_log_was_recovered));
+  if (!logs_found)
+  {
+    TRANSLOG_FILE *file= (TRANSLOG_FILE*)my_malloc(sizeof(TRANSLOG_FILE),
+                                                   MYF(0));
+    DBUG_PRINT("info", ("The log is not found => we will create new log"));
+    if (file == NULL)
+       goto err;
+    /* Start new log system from scratch */
+    log_descriptor.horizon= MAKE_LSN(start_file_num,
+                                     TRANSLOG_PAGE_SIZE); /* header page */
+    if ((file->handler.file=
+         create_logfile_by_number_no_cache(start_file_num)) == -1)
+      goto err;
+    translog_file_init(file, start_file_num, 0);
+    if (insert_dynamic(&log_descriptor.open_files, (uchar*)&file))
+      goto err;
+    log_descriptor.min_file= log_descriptor.max_file= start_file_num;
+    if (translog_write_file_header())
+      goto err;
+    DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                log_descriptor.open_files.elements);
+
+    if (ma_control_file_write_and_force(checkpoint_lsn, start_file_num,
+                                        max_trid_in_control_file,
+                                        recovery_failures))
+      goto err;
+    /* assign buffer 0 */
+    translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+  }
+  else if ((old_log_was_recovered || old_flags != flags || version_changed) &&
+           !readonly)
+  {
+    /* leave the damaged file untouched */
+    log_descriptor.horizon+= LSN_ONE_FILE;
+    /* header page */
+    log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+                                               TRANSLOG_PAGE_SIZE);
+    if (translog_create_new_file())
+      goto err;
+    /*
+      Buffer system left untouched after recovery => we should init it
+      (starting from buffer 0)
+    */
+    translog_start_buffer(log_descriptor.buffers, &log_descriptor.bc, 0);
+    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+  }
+
+  /* all LSNs that are on disk are flushed */
+  log_descriptor.log_start= log_descriptor.sent_to_disk=
+    log_descriptor.flushed= log_descriptor.horizon;
+  log_descriptor.in_buffers_only= log_descriptor.bc.buffer->offset;
+  log_descriptor.max_lsn= LSN_IMPOSSIBLE; /* set to 0 */
+  /*
+    Now 'flushed' is set to the 'horizon' value, but 'horizon' is
+    (potentially) the address of the next LSN. We want to indicate that all
+    LSNs already on the disk are flushed, so we decrease 'flushed' and
+    'sent_to_disk' by 1 (we are sure that there is no LSN on the disk which
+    is greater than 'flushed', and no LSN will be created that is equal to
+    or less than the value of 'flushed').
+  */
+  log_descriptor.flushed--; /* offset decreased */
+  log_descriptor.sent_to_disk--; /* offset decreased */
+  /*
+    Log records will refer to a MARIA_SHARE by a unique 2-byte id; set up
+    structures for generating 2-byte ids:
+  */
+  my_atomic_rwlock_init(&LOCK_id_to_share);
+  id_to_share= (MARIA_SHARE **) my_malloc(SHARE_ID_MAX * sizeof(MARIA_SHARE*),
+                                          MYF(MY_WME | MY_ZEROFILL));
+  if (unlikely(!id_to_share))
+    goto err;
+  id_to_share--; /* min id is 1 */
+
+  /* Check the last LSN record integrity */
+  if (logs_found)
+  {
+    TRANSLOG_SCANNER_DATA scanner;
+    TRANSLOG_ADDRESS page_addr;
+    LSN last_lsn= LSN_IMPOSSIBLE;
+    /*
+      take very last page address and try to find LSN record on it
+      if it fail take address of previous page and so on
+    */
+    page_addr= (log_descriptor.horizon -
+                ((log_descriptor.horizon - 1) % TRANSLOG_PAGE_SIZE + 1));
+    if (translog_scanner_init(page_addr, 1, &scanner, 1))
+      goto err;
+    scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]];
+    for (;;)
+    {
+      uint chunk_1byte;
+      chunk_1byte= scanner.page[scanner.page_offset];
+      while (!translog_is_LSN_chunk(chunk_1byte) &&
+             scanner.page != END_OF_LOG &&
+             scanner.page[scanner.page_offset] != TRANSLOG_FILLER &&
+             scanner.page_addr == page_addr)
+      {
+        if (translog_get_next_chunk(&scanner))
+        {
+          translog_destroy_scanner(&scanner);
+          goto err;
+        }
+        if (scanner.page != END_OF_LOG)
+          chunk_1byte= scanner.page[scanner.page_offset];
+      }
+      if (translog_is_LSN_chunk(chunk_1byte))
+      {
+        last_lsn= scanner.page_addr + scanner.page_offset;
+        if (translog_get_next_chunk(&scanner))
+        {
+          translog_destroy_scanner(&scanner);
+          goto err;
+        }
+        if (scanner.page == END_OF_LOG)
+          break; /* it was the last record */
+        chunk_1byte= scanner.page[scanner.page_offset];
+        continue; /* try to find other record on this page */
+      }
+
+      if (last_lsn != LSN_IMPOSSIBLE)
+        break; /* there is no more records on the page */
+
+      /* We have to make step back */
+      if (unlikely(LSN_OFFSET(page_addr) == TRANSLOG_PAGE_SIZE))
+      {
+        uint32 file_no= LSN_FILE_NO(page_addr);
+        my_bool last_page_ok;
+        /* it is beginning of the current file */
+        if (unlikely(file_no == 1))
+        {
+          /*
+            It is beginning of the log => there is no LSNs in the log =>
+            There is no harm in leaving it "as-is".
+          */
+          log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+          DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+                              LSN_IN_PARTS(log_descriptor.
+                                           previous_flush_horizon)));
+          DBUG_RETURN(0);
+        }
+        file_no--;
+        page_addr= MAKE_LSN(file_no, TRANSLOG_PAGE_SIZE);
+        translog_get_last_page_addr(&page_addr, &last_page_ok, 0);
+        /* page should be OK as it is not the last file */
+        DBUG_ASSERT(last_page_ok);
+      }
+      else
+      {
+         page_addr-= TRANSLOG_PAGE_SIZE;
+      }
+      translog_destroy_scanner(&scanner);
+      if (translog_scanner_init(page_addr, 1, &scanner, 1))
+        goto err;
+      scanner.page_offset= page_overhead[scanner.page[TRANSLOG_PAGE_FLAGS]];
+    }
+    translog_destroy_scanner(&scanner);
+
+    /* Now scanner points to the last LSN chunk, lets check it */
+    {
+      TRANSLOG_HEADER_BUFFER rec;
+      translog_size_t rec_len;
+      int len;
+      uchar buffer[1];
+      DBUG_PRINT("info", ("going to check the last found record (%lu,0x%lx)",
+                          LSN_IN_PARTS(last_lsn)));
+
+      len=
+        translog_read_record_header(last_lsn, &rec);
+      if (unlikely (len == RECHEADER_READ_ERROR ||
+                    len == RECHEADER_READ_EOF))
+      {
+        DBUG_PRINT("error", ("unexpected end of log or record during "
+                             "reading record header: (%lu,0x%lx)  len: %d",
+                             LSN_IN_PARTS(last_lsn), len));
+        if (readonly)
+          log_descriptor.log_start= log_descriptor.horizon= last_lsn;
+        else if (translog_truncate_log(last_lsn))
+        {
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      else
+      {
+        DBUG_ASSERT(last_lsn == rec.lsn);
+        if (likely(rec.record_length != 0))
+        {
+          /*
+            Reading the last byte of record will trigger scanning all
+            record chunks for now
+          */
+          rec_len= translog_read_record(rec.lsn, rec.record_length - 1, 1,
+                                        buffer, NULL);
+          if (rec_len != 1)
+          {
+            DBUG_PRINT("error", ("unexpected end of log or record during "
+                                 "reading record body: (%lu,0x%lx)  len: %d",
+                                 LSN_IN_PARTS(rec.lsn),
+                                 len));
+            if (readonly)
+              log_descriptor.log_start= log_descriptor.horizon= last_lsn;
+
+            else if (translog_truncate_log(last_lsn))
+            {
+              translog_free_record_header(&rec);
+              goto err;
+            }
+          }
+        }
+      }
+      translog_free_record_header(&rec);
+    }
+  }
+  log_descriptor.previous_flush_horizon= log_descriptor.horizon;
+  DBUG_PRINT("info", ("previous_flush_horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.previous_flush_horizon)));
+  DBUG_RETURN(0);
+err:
+  ma_message_no_user(0, "log initialization failed");
+  DBUG_RETURN(1);
+}
+
+
+/*
+  @brief Flush a log buffer (if it has a file), destroy its mutex and cond.
+
+  @param buffer          The buffer to flush and destroy
+*/
+
+static void translog_buffer_destroy(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_destroy");
+  DBUG_PRINT("enter",
+             ("Buffer #%u: 0x%lx  file: %d  offset: (%lu,0x%lx)  size: %lu",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              (buffer->file ? buffer->file->handler.file : -1),
+              LSN_IN_PARTS(buffer->offset),
+              (ulong) buffer->size));
+  if (buffer->file != NULL)
+  {
+    /*
+      We ignore errors here, because we can't do anything about them
+      (it is shutting down).
+
+      We also have to take the lock even if there can't be any other
+      threads running, because translog_buffer_flush()
+      requires that we have the buffer locked.
+    */
+    translog_buffer_lock(buffer);
+    translog_buffer_flush(buffer);            /* result deliberately ignored */
+    translog_buffer_unlock(buffer);
+  }
+  DBUG_PRINT("info", ("Destroy mutex: 0x%lx", (ulong) &buffer->mutex));
+  pthread_mutex_destroy(&buffer->mutex);
+  pthread_cond_destroy(&buffer->waiting_filling_buffer);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Free log handler resources (buffers, open files, locks, id_to_share)
+
+  SYNOPSIS
+    translog_destroy()
+*/
+
+void translog_destroy()
+{
+  TRANSLOG_FILE **file;
+  uint i;
+  uint8 current_buffer;
+  DBUG_ENTER("translog_destroy");
+
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  translog_lock();
+  current_buffer= log_descriptor.bc.buffer_no;
+  translog_status= (translog_status == TRANSLOG_READONLY ?
+                    TRANSLOG_UNINITED :
+                    TRANSLOG_SHUTDOWN);
+  if (log_descriptor.bc.buffer->file != NULL)
+    translog_finish_page(&log_descriptor.horizon, &log_descriptor.bc);
+  translog_unlock();
+
+  for (i= 0; i < TRANSLOG_BUFFERS_NO; i++)  /* current buffer comes last */
+  {
+    struct st_translog_buffer *buffer= (log_descriptor.buffers +
+                                        ((i + current_buffer + 1) %
+                                         TRANSLOG_BUFFERS_NO));
+    translog_buffer_destroy(buffer);
+  }
+  translog_status= TRANSLOG_UNINITED;
+
+  /* close all log files that are still registered in open_files */
+  while ((file= (TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files)))
+    translog_close_log_file(*file);
+  pthread_mutex_destroy(&log_descriptor.sent_to_disk_lock);
+  pthread_mutex_destroy(&log_descriptor.file_header_lock);
+  pthread_mutex_destroy(&log_descriptor.unfinished_files_lock);
+  pthread_mutex_destroy(&log_descriptor.purger_lock);
+  pthread_mutex_destroy(&log_descriptor.log_flush_lock);
+  pthread_mutex_destroy(&log_descriptor.dirty_buffer_mask_lock);
+  pthread_cond_destroy(&log_descriptor.log_flush_cond);
+  pthread_cond_destroy(&log_descriptor.new_goal_cond);
+  rwlock_destroy(&log_descriptor.open_files_lock);
+  delete_dynamic(&log_descriptor.open_files);
+  delete_dynamic(&log_descriptor.unfinished_files);
+
+  if (log_descriptor.directory_fd >= 0)
+    my_close(log_descriptor.directory_fd, MYF(MY_WME));
+  my_atomic_rwlock_destroy(&LOCK_id_to_share);
+  if (id_to_share != NULL)
+    my_free((id_to_share + 1), MYF(MY_WME)); /* init stored pointer - 1 */
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  @brief Starts new page.
+
+  @param horizon         \ Position in file and buffer where we are
+  @param cursor          /
+  @param prev_buffer     Buffer which should be flushed will be assigned here.
+                         This is always set (to NULL if nothing to flush).
+
+  @note We do not want to flush the buffer immediately because we want to
+  let the caller of this function first advance 'horizon' pointer and unlock
+  the loghandler and only then flush the log, which can take some time.
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_page_next(TRANSLOG_ADDRESS *horizon,
+                                  struct st_buffer_cursor *cursor,
+                                  struct st_translog_buffer **prev_buffer)
+{
+  struct st_translog_buffer *buffer= cursor->buffer;
+  DBUG_ENTER("translog_page_next");
+
+  *prev_buffer= NULL;
+  if ((cursor->ptr + TRANSLOG_PAGE_SIZE >
+       cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER) ||
+      (LSN_OFFSET(*horizon) >
+       log_descriptor.log_file_max_size - TRANSLOG_PAGE_SIZE))
+  {
+    DBUG_PRINT("info", ("Switch to next buffer  Buffer Size: %lu (%lu) => %d  "
+                        "File size: %lu  max: %lu => %d",
+                        (ulong) cursor->buffer->size,
+                        (ulong) (cursor->ptr - cursor->buffer->buffer),
+                        (cursor->ptr + TRANSLOG_PAGE_SIZE >
+                         cursor->buffer->buffer + TRANSLOG_WRITE_BUFFER),
+                        (ulong) LSN_OFFSET(*horizon),
+                        (ulong) log_descriptor.log_file_max_size,
+                        (LSN_OFFSET(*horizon) >
+                         (log_descriptor.log_file_max_size -
+                          TRANSLOG_PAGE_SIZE))));
+    if (translog_buffer_next(horizon, cursor,
+                             LSN_OFFSET(*horizon) >
+                             (log_descriptor.log_file_max_size -
+                              TRANSLOG_PAGE_SIZE)))
+      DBUG_RETURN(1);
+    *prev_buffer= buffer;       /* the buffer we just left; caller flushes */
+    DBUG_PRINT("info", ("Buffer #%u (0x%lx): have to be flushed",
+                        (uint) buffer->buffer_no, (ulong) buffer));
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Use the same buffer #%u (0x%lx): "
+                        "Buffer Size: %lu (%lu)",
+                        (uint) buffer->buffer_no,
+                        (ulong) buffer,
+                        (ulong) cursor->buffer->size,
+                        (ulong) (cursor->ptr - cursor->buffer->buffer)));
+    translog_finish_page(horizon, cursor);
+    translog_new_page_header(horizon, cursor);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write data of given length to the current page
+
+  SYNOPSIS
+    translog_write_data_on_page()
+    horizon              \ Pointers on file and buffer
+    cursor               /
+    length               IN     length of the chunk
+    buffer               buffer with data
+
+  RETURN
+    0  OK (the only value the current code returns)
+    1  Error
+*/
+
+static my_bool translog_write_data_on_page(TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor,
+                                           translog_size_t length,
+                                           uchar *buffer)
+{
+  DBUG_ENTER("translog_write_data_on_page");
+  DBUG_PRINT("enter", ("Chunk length: %lu  Page size %u",
+                       (ulong) length, (uint) cursor->current_page_fill));
+  DBUG_ASSERT(length > 0);
+  DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+  DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer +
+              TRANSLOG_WRITE_BUFFER);
+
+  memcpy(cursor->ptr, buffer, length);
+  cursor->ptr+= length;
+  (*horizon)+= length; /* adds offset */
+  cursor->current_page_fill+= length;
+  if (!cursor->chaser)           /* NOTE(review): for a chaser the size was */
+    cursor->buffer->size+= length; /* presumably reserved already - confirm */
+  DBUG_PRINT("info", ("Write data buffer #%u: 0x%lx  "
+                      "chaser: %d  Size: %lu (%lu)",
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer)));
+  translog_check_cursor(cursor);
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Write data from parts of given length to the current page
+
+  SYNOPSIS
+    translog_write_parts_on_page()
+    horizon              \ Pointers on file and buffer
+    cursor               /
+    length               IN     length of the chunk
+    parts                IN/OUT chunk source (consumed parts are advanced)
+
+  RETURN
+    0  OK (the only value the current code returns)
+    1  Error
+*/
+
+static my_bool translog_write_parts_on_page(TRANSLOG_ADDRESS *horizon,
+                                            struct st_buffer_cursor *cursor,
+                                            translog_size_t length,
+                                            struct st_translog_parts *parts)
+{
+  translog_size_t left= length;
+  uint cur= (uint) parts->current;
+  DBUG_ENTER("translog_write_parts_on_page");
+  DBUG_PRINT("enter", ("Chunk length: %lu  parts: %u of %u. Page size: %u  "
+                       "Buffer size: %lu (%lu)",
+                       (ulong) length,
+                       (uint) (cur + 1), (uint) parts->elements,
+                       (uint) cursor->current_page_fill,
+                       (ulong) cursor->buffer->size,
+                       (ulong) (cursor->ptr - cursor->buffer->buffer)));
+  DBUG_ASSERT(length > 0);
+  DBUG_ASSERT(length + cursor->current_page_fill <= TRANSLOG_PAGE_SIZE);
+  DBUG_ASSERT(length + cursor->ptr <= cursor->buffer->buffer +
+              TRANSLOG_WRITE_BUFFER);
+
+  do
+  {
+    translog_size_t len;
+    LEX_CUSTRING *part;
+    const uchar *buff;
+
+    DBUG_ASSERT(cur < parts->elements);
+    part= parts->parts + cur;
+    buff= part->str;
+    DBUG_PRINT("info", ("Part: %u  Length: %lu  left: %lu  buff: 0x%lx",
+                        (uint) (cur + 1), (ulong) part->length, (ulong) left,
+                        (ulong) buff));
+
+    if (part->length > left)
+    {
+      /* we should write less than the current part; keep its remainder */
+      len= left;
+      part->length-= len;
+      part->str+= len;
+      DBUG_PRINT("info", ("Set new part: %u  Length: %lu",
+                          (uint) (cur + 1), (ulong) part->length));
+    }
+    else
+    {
+      len= (translog_size_t) part->length;
+      cur++;
+      DBUG_PRINT("info", ("moved to next part (len: %lu)", (ulong) len));
+    }
+    DBUG_PRINT("info", ("copy: 0x%lx <- 0x%lx  %u",
+                        (ulong) cursor->ptr, (ulong)buff, (uint)len));
+    if (likely(len))
+    {
+      memcpy(cursor->ptr, buff, len);
+      left-= len;
+      cursor->ptr+= len;
+    }
+  } while (left);
+
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)  Length %lu(0x%lx)",
+                      LSN_IN_PARTS(*horizon),
+                      (ulong) length, (ulong) length));
+  parts->current= cur;
+  (*horizon)+= length; /* offset increasing */
+  cursor->current_page_fill+= length;
+  if (!cursor->chaser)
+    cursor->buffer->size+= length;
+  /*
+    We are not updating parts->total_record_length here because it is
+    needed only before writing the record, to know the total length
+  */
+  DBUG_PRINT("info", ("Write parts buffer #%u: 0x%lx  "
+                      "chaser: %d  Size: %lu (%lu)  "
+                      "Horizon: (%lu,0x%lx)  buff offset: 0x%lx",
+                      (uint) cursor->buffer->buffer_no, (ulong) cursor->buffer,
+                      cursor->chaser, (ulong) cursor->buffer->size,
+                      (ulong) (cursor->ptr - cursor->buffer->buffer),
+                      LSN_IN_PARTS(*horizon),
+                      (ulong) (LSN_OFFSET(cursor->buffer->offset) +
+                               cursor->buffer->size)));
+  translog_check_cursor(cursor);
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Put 1 group chunk type 0 header into parts array (as the new first part)
+
+  SYNOPSIS
+    translog_write_variable_record_1group_header()
+    parts                Descriptor of record source parts
+    type                 The log record type
+    short_trid           Short transaction ID or 0 if it makes no sense
+    header_length        Calculated header length of chunk type 0
+    chunk0_header        Buffer for the chunk header writing
+*/
+
+static void
+translog_write_variable_record_1group_header(struct st_translog_parts *parts,
+                                             enum translog_record_type type,
+                                             SHORT_TRANSACTION_ID short_trid,
+                                             uint16 header_length,
+                                             uchar *chunk0_header)
+{
+  LEX_CUSTRING *part;
+  DBUG_ASSERT(parts->current != 0);     /* first part is left for header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (translog_size_t) (part->length= header_length);
+  part->str= chunk0_header;
+  /* puts chunk type (byte 0) */
+  *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
+  int2store(chunk0_header + 1, short_trid);     /* bytes 1-2: short trid */
+  /* puts record length (starting at byte 3) */
+  translog_write_variable_record_1group_code_len(chunk0_header + 3,
+                                                 parts->record_length,
+                                                 header_length);
+  /* puts 0 as chunk length (last 2 bytes) which indicate 1 group record */
+  int2store(chunk0_header + header_length - 2, 0);
+}
+
+
+/*
+  Increase number of writers for this buffer
+
+  SYNOPSIS
+    translog_buffer_increase_writers()
+    buffer               target buffer (caller must hold the buffer lock)
+*/
+
+static inline void
+translog_buffer_increase_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_increase_writers");
+  translog_buffer_lock_assert_owner(buffer);
+  buffer->copy_to_buffer_in_progress++;
+  DBUG_PRINT("info", ("copy_to_buffer_in_progress. Buffer #%u  0x%lx  progress: %d",
+                      (uint) buffer->buffer_no, (ulong) buffer,
+                      buffer->copy_to_buffer_in_progress));
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Decrease number of writers for this buffer; wake up waiters when it is 0
+
+  SYNOPSIS
+    translog_buffer_decrease_writers()
+    buffer               target buffer (caller must hold the buffer lock)
+*/
+
+static void translog_buffer_decrease_writers(struct st_translog_buffer *buffer)
+{
+  DBUG_ENTER("translog_buffer_decrease_writers");
+  translog_buffer_lock_assert_owner(buffer);
+  buffer->copy_to_buffer_in_progress--;
+  DBUG_PRINT("info",
+             ("copy_to_buffer_in_progress. Buffer #%u  0x%lx  progress: %d",
+              (uint) buffer->buffer_no, (ulong) buffer,
+              buffer->copy_to_buffer_in_progress));
+  if (buffer->copy_to_buffer_in_progress == 0)  /* last writer has finished */
+    pthread_cond_broadcast(&buffer->waiting_filling_buffer);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Skip to the next page for chaser (thread which advanced horizon
+  pointer and is now filling the buffer)
+
+  @param horizon         \ Pointers on file position and buffer
+  @param cursor          /
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+static my_bool translog_chaser_page_next(TRANSLOG_ADDRESS *horizon,
+                                         struct st_buffer_cursor *cursor)
+{
+  struct st_translog_buffer *buffer_to_flush;
+  my_bool rc;
+  DBUG_ENTER("translog_chaser_page_next");
+  DBUG_ASSERT(cursor->chaser);
+  rc= translog_page_next(horizon, cursor, &buffer_to_flush);
+  if (buffer_to_flush != NULL)        /* set only when we switched buffers */
+  {
+    translog_buffer_lock(buffer_to_flush);
+    translog_buffer_decrease_writers(buffer_to_flush);
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+  }
+  DBUG_RETURN(rc);
+}
+
+/*
+  Put chunk 2 from new page beginning (1 type byte + a full page of data)
+
+  SYNOPSIS
+    translog_write_variable_record_chunk2_page()
+    parts                Descriptor of record source parts
+    horizon              \ Pointers on file position and buffer
+    cursor               /
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk2_page(struct st_translog_parts *parts,
+                                           TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor)
+{
+  uchar chunk2_header[1];
+  DBUG_ENTER("translog_write_variable_record_chunk2_page");
+  chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+  if (translog_chaser_page_next(horizon, cursor))
+    DBUG_RETURN(1);
+
+  /* Puts chunk type (result not checked: the helper always returns 0) */
+  translog_write_data_on_page(horizon, cursor, 1, chunk2_header);
+  /* Puts chunk body (result not checked: the helper always returns 0) */
+  translog_write_parts_on_page(horizon, cursor,
+                               log_descriptor.page_capacity_chunk_2, parts);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Put chunk 3 of requested length in the buffer from new page beginning
+
+  SYNOPSIS
+    translog_write_variable_record_chunk3_page()
+    parts                Descriptor of record source parts
+    length               Chunk data length, header excluded (0: new page only)
+    horizon              \ Pointers on file position and buffer
+    cursor               /
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_write_variable_record_chunk3_page(struct st_translog_parts *parts,
+                                           uint16 length,
+                                           TRANSLOG_ADDRESS *horizon,
+                                           struct st_buffer_cursor *cursor)
+{
+  LEX_CUSTRING *part;
+  uchar chunk3_header[1 + 2];             /* type byte + 2-byte chunk length */
+  DBUG_ENTER("translog_write_variable_record_chunk3_page");
+
+  if (translog_chaser_page_next(horizon, cursor))
+    DBUG_RETURN(1);
+
+  if (length == 0)
+  {
+    /* It was call to write page header only (no data for chunk 3) */
+    DBUG_PRINT("info", ("It is a call to make page header only"));
+    DBUG_RETURN(0);
+  }
+
+  DBUG_ASSERT(parts->current != 0);       /* first part is left for header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (translog_size_t) (part->length= 1 + 2);
+  part->str= chunk3_header;      /* stack buffer: copied by the write below */
+  /* Puts chunk type */
+  *chunk3_header= (uchar) (TRANSLOG_CHUNK_LNGTH);
+  /* Puts chunk length */
+  int2store(chunk3_header + 1, length);
+
+  translog_write_parts_on_page(horizon, cursor, length + 1 + 2, parts);
+  DBUG_RETURN(0);
+}
+
+/*
+  Move log pointer (horizon) forward by the given number of pages, starting
+  from the next page, plus the given amount of data on the last page
+
+  SYNOPSIS
+    translog_advance_pointer()
+    pages                Number of full pages starting from the next one
+    last_page_data       Plus this data on the last page
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_advance_pointer(int pages, uint16 last_page_data)
+{
+  translog_size_t last_page_offset= (log_descriptor.page_overhead +
+                                     last_page_data);
+  translog_size_t offset= (TRANSLOG_PAGE_SIZE -
+                           log_descriptor.bc.current_page_fill +
+                           pages * TRANSLOG_PAGE_SIZE + last_page_offset);
+  translog_size_t buffer_end_offset, file_end_offset, min_offset;
+  DBUG_ENTER("translog_advance_pointer");
+  DBUG_PRINT("enter", ("Pointer:  (%lu, 0x%lx) + %u + %u pages + %u + %u",
+                       LSN_IN_PARTS(log_descriptor.horizon),
+                       (uint) (TRANSLOG_PAGE_SIZE -
+                               log_descriptor.bc.current_page_fill),
+                       pages, (uint) log_descriptor.page_overhead,
+                       (uint) last_page_data));
+  translog_lock_assert_owner();
+
+  if (pages == -1)
+  {
+    /*
+      It is special case when we advance the pointer on the same page.
+      It can happened when we write last part of multi-group record.
+    */
+    DBUG_ASSERT(last_page_data + log_descriptor.bc.current_page_fill <=
+                TRANSLOG_PAGE_SIZE);
+    offset= last_page_data;
+    last_page_offset= log_descriptor.bc.current_page_fill + last_page_data;
+    goto end;
+  }
+  DBUG_PRINT("info", ("last_page_offset %lu", (ulong) last_page_offset));
+  DBUG_ASSERT(last_page_offset <= TRANSLOG_PAGE_SIZE);
+
+  /*
+    The loop will be executed 1-3 times. Usually we advance the
+    pointer to fill only the current buffer (if we have more than 1/2 of
+    the buffer free) or 2 buffers (rest of current and all of the next).
+    In case of a really huge record end, where we write the last group with
+    "table of content" of all groups and ignore buffer borders, we can
+    occupy 3 buffers.
+  */
+  for (;;)
+  {
+    uint8 new_buffer_no;
+    struct st_translog_buffer *new_buffer;
+    struct st_translog_buffer *old_buffer;
+    buffer_end_offset= TRANSLOG_WRITE_BUFFER - log_descriptor.bc.buffer->size;
+    if (likely(log_descriptor.log_file_max_size >=
+               LSN_OFFSET(log_descriptor.horizon)))
+      file_end_offset= (log_descriptor.log_file_max_size -
+                        LSN_OFFSET(log_descriptor.horizon));
+    else
+    {
+      /*
+        We already have written more then current file limit allow,
+        So we will finish this page and start new file
+      */
+      file_end_offset= (TRANSLOG_PAGE_SIZE -
+                        log_descriptor.bc.current_page_fill);
+    }
+    DBUG_PRINT("info", ("offset: %lu  buffer_end_offs: %lu, "
+                        "file_end_offs:  %lu",
+                        (ulong) offset, (ulong) buffer_end_offset,
+                        (ulong) file_end_offset));
+    DBUG_PRINT("info", ("Buff #%u %u (0x%lx) offset 0x%lx + size 0x%lx = "
+                        "0x%lx (0x%lx)",
+                        (uint) log_descriptor.bc.buffer->buffer_no,
+                        (uint) log_descriptor.bc.buffer_no,
+                        (ulong) log_descriptor.bc.buffer,
+                        (ulong) LSN_OFFSET(log_descriptor.bc.buffer->offset),
+                        (ulong) log_descriptor.bc.buffer->size,
+                        (ulong) (LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+                                 log_descriptor.bc.buffer->size),
+                        (ulong) LSN_OFFSET(log_descriptor.horizon)));
+    DBUG_ASSERT(LSN_OFFSET(log_descriptor.bc.buffer->offset) +
+                log_descriptor.bc.buffer->size ==
+                LSN_OFFSET(log_descriptor.horizon));
+
+    if (offset <= buffer_end_offset && offset <= file_end_offset)
+      break;
+    old_buffer= log_descriptor.bc.buffer;
+    new_buffer_no= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+    new_buffer= log_descriptor.buffers + new_buffer_no;
+
+    translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+    {
+      TRANSLOG_ADDRESS offset= new_buffer->offset;
+      TRANSLOG_FILE *file= new_buffer->file;
+      uint8 ver= new_buffer->ver;
+      translog_lock_assert_owner();
+#endif
+      translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+      /* We keep the handler locked so nobody can start this new buffer */
+      DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+                  (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+    }
+#endif
+
+    min_offset= min(buffer_end_offset, file_end_offset);
+    /* TODO: check is it ptr or size enough */
+    log_descriptor.bc.buffer->size+= min_offset;
+    log_descriptor.bc.ptr+= min_offset;
+    DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx  chaser: %d  Size: %lu (%lu)",
+                        (uint) log_descriptor.bc.buffer->buffer_no,
+                        (ulong) log_descriptor.bc.buffer,
+                        log_descriptor.bc.chaser,
+                        (ulong) log_descriptor.bc.buffer->size,
+                        (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+                                 buffer->buffer)));
+    DBUG_ASSERT((ulong) (log_descriptor.bc.ptr -
+                         log_descriptor.bc.buffer->buffer) ==
+                log_descriptor.bc.buffer->size);
+    DBUG_ASSERT(log_descriptor.bc.buffer->buffer_no ==
+                log_descriptor.bc.buffer_no);
+    translog_buffer_increase_writers(log_descriptor.bc.buffer);
+
+    if (file_end_offset <= buffer_end_offset)
+    {
+      log_descriptor.horizon+= LSN_ONE_FILE;
+      log_descriptor.horizon= LSN_REPLACE_OFFSET(log_descriptor.horizon,
+                                                 TRANSLOG_PAGE_SIZE);
+      DBUG_PRINT("info", ("New file: %lu",
+                          (ulong) LSN_FILE_NO(log_descriptor.horizon)));
+      if (translog_create_new_file())
+      {
+        DBUG_RETURN(1);
+      }
+    }
+    else
+    {
+      DBUG_PRINT("info", ("The same file"));
+      log_descriptor.horizon+= min_offset; /* offset increasing */
+    }
+    translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+    old_buffer->next_buffer_offset= new_buffer->offset;
+    new_buffer->prev_buffer_offset= old_buffer->offset;
+    translog_buffer_unlock(old_buffer);
+    offset-= min_offset;
+  }
+  DBUG_PRINT("info", ("drop write_counter"));
+  log_descriptor.bc.write_counter= 0;
+  log_descriptor.bc.previous_offset= 0;
+end:
+  log_descriptor.bc.ptr+= offset;
+  log_descriptor.bc.buffer->size+= offset;
+  translog_buffer_increase_writers(log_descriptor.bc.buffer);
+  log_descriptor.horizon+= offset; /* offset increasing */
+  log_descriptor.bc.current_page_fill= last_page_offset;
+  DBUG_PRINT("info", ("NewP buffer #%u: 0x%lx  chaser: %d  Size: %lu (%lu)  "
+                      "offset: %u  last page: %u",
+                      (uint) log_descriptor.bc.buffer->buffer_no,
+                      (ulong) log_descriptor.bc.buffer,
+                      log_descriptor.bc.chaser,
+                      (ulong) log_descriptor.bc.buffer->size,
+                      (ulong) (log_descriptor.bc.ptr -
+                               log_descriptor.bc.buffer->
+                               buffer), (uint) offset,
+                      (uint) last_page_offset));
+  DBUG_PRINT("info",
+             ("pointer moved to: (%lu, 0x%lx)",
+              LSN_IN_PARTS(log_descriptor.horizon)));
+  translog_check_cursor(&log_descriptor.bc);
+  log_descriptor.bc.protected= 0;
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Returns the free space remaining on the current log page.
+
+   The value is TRANSLOG_PAGE_SIZE minus the fill level recorded in the
+   global buffer cursor (log_descriptor.bc.current_page_fill).
+
+   @note The loghandler should be locked by the caller.
+
+   @return Number of bytes left on the current page
+*/
+
+static uint translog_get_current_page_rest()
+{
+  uint page_fill= log_descriptor.bc.current_page_fill;
+  return (TRANSLOG_PAGE_SIZE - page_fill);
+}
+
+
+/**
+   @brief Returns how many whole pages still fit in the current buffer.
+
+   The number of bytes already consumed in the buffer (distance from the
+   start of the buffer's data area to the cursor's write pointer) is
+   subtracted from TRANSLOG_WRITE_BUFFER, and the remainder is converted to
+   pages of TRANSLOG_PAGE_SIZE bytes with integer division.
+
+   @note The loghandler should be locked by the caller.
+
+   @return Number of full pages left in the current buffer
+*/
+
+static uint translog_get_current_buffer_rest()
+{
+  return ((TRANSLOG_WRITE_BUFFER -
+           (log_descriptor.bc.ptr - log_descriptor.bc.buffer->buffer)) /
+          TRANSLOG_PAGE_SIZE);
+}
+
+/**
+   @brief Calculates the possible group size without the first (current)
+          page.
+
+   The full pages left in the current buffer are converted to a byte
+   capacity using log_descriptor.page_capacity_chunk_2.  If that capacity is
+   below log_descriptor.half_buffer_capacity_chunk_2, the capacity of one
+   more whole buffer (log_descriptor.buffer_capacity_chunk_2) is added, so
+   the group may continue into the next buffer.
+
+   @note The loghandler should be locked by the caller.
+
+   @return Group size without the first (current) page
+*/
+
+static translog_size_t translog_get_current_group_size()
+{
+  /* starts as a count of full pages, becomes a capacity in bytes below */
+  translog_size_t group_size= translog_get_current_buffer_rest();
+  DBUG_ENTER("translog_get_current_group_size");
+  DBUG_PRINT("info", ("buffer_rest in pages: %u", group_size));
+
+  group_size*= log_descriptor.page_capacity_chunk_2;
+
+  if (group_size >= log_descriptor.half_buffer_capacity_chunk_2)
+  {
+    /* At least half of the buffer is free: stay within this buffer */
+    DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) group_size));
+    DBUG_RETURN(group_size);
+  }
+
+  /* Less than half of the buffer is free: use this and the next buffer */
+  DBUG_PRINT("info", ("buffer_rest: %lu -> add %lu",
+                      (ulong) group_size,
+                      (ulong) log_descriptor.buffer_capacity_chunk_2));
+  group_size+= log_descriptor.buffer_capacity_chunk_2;
+
+  DBUG_PRINT("info", ("buffer_rest: %lu", (ulong) group_size));
+  DBUG_RETURN(group_size);
+}
+
+
+/**
+   @brief Stores a newly generated LSN and marks the log as not fully
+          flushed.
+
+   @param  lsn    Where to store the LSN
+   @param  value  The LSN to store
+
+   @note The caller must own the translog lock (this is asserted).
+*/
+
+static inline void set_lsn(LSN *lsn, LSN value)
+{
+  DBUG_ENTER("set_lsn");
+  translog_lock_assert_owner();
+
+  *lsn= value;
+
+  /* A new LSN exists, so the log now holds something that is not flushed */
+  log_descriptor.is_everything_flushed= 0;
+
+  DBUG_PRINT("info", ("new LSN appeared: (%lu,0x%lx)", LSN_IN_PARTS(value)));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Write variable record in 1 group.
+
+   The LSN of the record is the log horizon at entry.  Space for the whole
+   record is reserved by advancing the shared cursor (log_descriptor.bc)
+   while the translog lock is held; the lock is then released and the
+   record is copied in through a private copy of the cursor that was taken
+   before the advance.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  tbl_info        MARIA_HA pointer handed to the in-write hook of
+                           this record type
+   @param  short_trid      Short transaction ID or 0 if it has no sense
+   @param  parts           Descriptor of record source parts
+   @param  buffer_to_flush Buffer which have to be flushed if it is not 0
+   @param  header_length   Calculated header length of chunk type 0
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  hook_arg        Argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @note
+     We must have a translog_lock() when entering this function
+     We must have buffer_to_flush locked (if not null)
+     The translog lock is released by this function (translog_unlock() is
+     called both on the normal path and on the first error path).
+     buffer_to_flush is unlocked after the pointer has been advanced; it is
+     not unlocked on the first error path (failure of
+     translog_set_lsn_for_files() or of the in-write hook).
+     NOTE(review): the error returns taken while the pages are being filled
+     leave before translog_buffer_decrease_writers(cursor.buffer) is
+     called -- confirm that this is intended.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static my_bool
+translog_write_variable_record_1group(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush, uint16 header_length,
+                                      TRN *trn, void *hook_arg)
+{
+  TRANSLOG_ADDRESS horizon;
+  struct st_buffer_cursor cursor;
+  int rc= 0;
+  uint i;
+  translog_size_t record_rest, full_pages, first_page;
+  uint additional_chunk3_page= 0;
+  uchar chunk0_header[1 + 2 + 5 + 2];
+  DBUG_ENTER("translog_write_variable_record_1group");
+  translog_lock_assert_owner();
+  if (buffer_to_flush)
+    translog_buffer_lock_assert_owner(buffer_to_flush);
+
+  set_lsn(lsn, horizon= log_descriptor.horizon); /* LSN is the horizon */
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+                                 *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, hook_arg)))
+  {
+    translog_unlock();
+    DBUG_RETURN(1);
+  }
+  cursor= log_descriptor.bc;            /* private copy, taken pre-advance */
+  cursor.chaser= 1;
+
+  /* Advance the pointer so that the loghandler can be unlocked */
+  first_page= translog_get_current_page_rest();
+  record_rest= parts->record_length - (first_page - header_length);
+  full_pages= record_rest / log_descriptor.page_capacity_chunk_2;
+  record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
+
+  if (record_rest + 1 == log_descriptor.page_capacity_chunk_2)
+  {
+    DBUG_PRINT("info", ("2 chunks type 3 is needed"));
+    /* We will write 2 chunks type 3 at the end of this group */
+    additional_chunk3_page= 1;
+    record_rest= 1;
+  }
+
+  DBUG_PRINT("info", ("first_page: %u (%u)  full_pages: %u (%lu)  "
+                      "additional: %u (%u)  rest %u = %u",
+                      first_page, first_page - header_length,
+                      full_pages,
+                      (ulong) full_pages *
+                      log_descriptor.page_capacity_chunk_2,
+                      additional_chunk3_page,
+                      additional_chunk3_page *
+                      (log_descriptor.page_capacity_chunk_2 - 1),
+                      record_rest, parts->record_length));
+  /* record_rest + 3 is chunk type 3 overhead + record_rest */
+  rc|= translog_advance_pointer((int)(full_pages + additional_chunk3_page),
+                                (record_rest ? record_rest + 3 : 0));
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
+                      (ulong) log_descriptor.bc.buffer));
+
+  translog_unlock();
+
+  /*
+     Check if we switched buffer and need to process it (the loghandler
+     is unlocked already => we will not delay other threads)
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+  }
+  if (rc)
+    DBUG_RETURN(1);
+
+  translog_write_variable_record_1group_header(parts, type, short_trid,
+                                               header_length, chunk0_header);
+
+  /* fill the pages, starting with the rest of the current page */
+  translog_write_parts_on_page(&horizon, &cursor, first_page, parts);
+
+  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon),
+                      LSN_IN_PARTS(horizon)));
+
+  for (i= 0; i < full_pages; i++)
+  {
+    if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+      DBUG_RETURN(1);
+
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon)));
+  }
+
+  if (additional_chunk3_page)
+  {
+    if (translog_write_variable_record_chunk3_page(parts,
+                                                   log_descriptor.
+                                                   page_capacity_chunk_2 - 2,
+                                                   &horizon, &cursor))
+      DBUG_RETURN(1);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon)));
+    DBUG_ASSERT(cursor.current_page_fill == TRANSLOG_PAGE_SIZE);
+  }
+
+  if (translog_write_variable_record_chunk3_page(parts,
+                                                 record_rest,
+                                                 &horizon, &cursor))
+    DBUG_RETURN(1);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                        (ulong) LSN_FILE_NO(log_descriptor.horizon),
+                        (ulong) LSN_OFFSET(log_descriptor.horizon),
+                        (ulong) LSN_FILE_NO(horizon),
+                        (ulong) LSN_OFFSET(horizon)));
+
+  translog_buffer_lock(cursor.buffer);
+  translog_buffer_decrease_writers(cursor.buffer); /* our copy is finished */
+  translog_buffer_unlock(cursor.buffer);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief Write variable record in 1 chunk.
+
+   The whole record (parts->total_record_length bytes) is written through
+   the shared cursor (log_descriptor.bc) while the translog lock is still
+   held, and the log horizon is passed to the page writer directly.  The
+   LSN of the record is the log horizon at entry.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  tbl_info        MARIA_HA pointer handed to the in-write hook of
+                           this record type
+   @param  short_trid      Short transaction ID or 0 if it has no sense
+   @param  parts           Descriptor of record source parts
+   @param  buffer_to_flush Buffer which have to be flushed if it is not 0
+   @param  header_length   Calculated header length of chunk type 0
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  hook_arg        Argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @note
+     We must have a translog_lock() when entering this function
+     We must have buffer_to_flush locked (if not null)
+     The translog lock is released by this function on every path.
+     buffer_to_flush is unlocked after the record has been written; it is
+     not unlocked on the early error path (failure of
+     translog_set_lsn_for_files() or of the in-write hook).
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static my_bool
+translog_write_variable_record_1chunk(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush, uint16 header_length,
+                                      TRN *trn, void *hook_arg)
+{
+  int rc;
+  uchar chunk0_header[1 + 2 + 5 + 2];
+  DBUG_ENTER("translog_write_variable_record_1chunk");
+  translog_lock_assert_owner();
+  if (buffer_to_flush)
+    translog_buffer_lock_assert_owner(buffer_to_flush);
+
+  translog_write_variable_record_1group_header(parts, type, short_trid,
+                                               header_length, chunk0_header);
+  set_lsn(lsn, log_descriptor.horizon);   /* LSN is the current horizon */
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+                                 *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, hook_arg)))
+  {
+    translog_unlock();
+    DBUG_RETURN(1);
+  }
+
+  rc= translog_write_parts_on_page(&log_descriptor.horizon,
+                                   &log_descriptor.bc,
+                                   parts->total_record_length, parts);
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
+                      (ulong) log_descriptor.bc.buffer));
+  translog_unlock();
+
+  /*
+     Check if we switched buffer and need to process it (the loghandler
+     is unlocked already => we will not delay other threads)
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+  }
+
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Calculates and writes LSN difference (compressed LSN).
+
+  @param base_lsn        LSN from which we calculate difference
+  @param lsn             LSN for coding (must be less than base_lsn)
+  @param dst             Result will be written to dst[-pack_length] .. dst[-1]
+
+  @note To store an LSN in a compact way we will use the following compression:
+    If a log record has LSN1, and it contains the LSN2 as a back reference,
+    Instead of LSN2 we write LSN1-LSN2, encoded as:
+     two bits     the number N (see below)
+     14 bits
+     N bytes
+     That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+     is stored in the first two bits.
+
+  @note If the difference does not fit in 38 bits (the 5 byte form), the
+  two bytes 0x00 0x01 are written followed by the full LSN
+  (LSN_STORE_SIZE bytes).  The pair 0x00 0x01 is otherwise the 2 byte
+  encoding of a difference of 1, which is treated as impossible here.
+
+  @note The function writes the result in backward direction for no
+  special reason: both directions are equal in complexity.
+
+  @return pointer to the first byte of the coded LSN (the decreased dst)
+*/
+
+static uchar *translog_put_LSN_diff(LSN base_lsn, LSN lsn, uchar *dst)
+{
+  uint64 diff;
+  DBUG_ENTER("translog_put_LSN_diff");
+  DBUG_PRINT("enter", ("Base: (%lu,0x%lx)  val: (%lu,0x%lx)  dst: 0x%lx",
+                       LSN_IN_PARTS(base_lsn), LSN_IN_PARTS(lsn),
+                       (ulong) dst));
+  DBUG_ASSERT(base_lsn > lsn);
+  diff= base_lsn - lsn;
+  DBUG_PRINT("info", ("Diff: 0x%llx", (ulonglong) diff));
+  if (diff <= 0x3FFF)
+  {
+    dst-= 2;
+    /*
+      Note we store the high uchar first to ensure that the first uchar has
+      0 in the 2 upper bits (length code 0).
+    */
+    dst[0]= (uchar)(diff >> 8);
+    dst[1]= (uchar)(diff & 0xFF);
+  }
+  else if (diff <= 0x3FFFFFL)
+  {
+    dst-= 3;
+    dst[0]= (uchar)(0x40 | (diff >> 16));       /* length code 1 */
+    int2store(dst + 1, diff & 0xFFFF);
+  }
+  else if (diff <= 0x3FFFFFFFL)
+  {
+    dst-= 4;
+    dst[0]= (uchar)(0x80 | (diff >> 24));       /* length code 2 */
+    int3store(dst + 1, diff & 0xFFFFFFL);
+  }
+  else if (diff <= LL(0x3FFFFFFFFF))
+
+  {
+    dst-= 5;
+    dst[0]= (uchar)(0xC0 | (diff >> 32));       /* length code 3 */
+    int4store(dst + 1, diff & 0xFFFFFFFFL);
+  }
+  else
+  {
+    /*
+      It is the full LSN after the special diff of 1 (which is impossible
+      in real life)
+    */
+    dst-= 2 + LSN_STORE_SIZE;
+    dst[0]= 0;
+    dst[1]= 1;
+    lsn_store(dst + 2, lsn);
+  }
+  DBUG_PRINT("info", ("new dst: 0x%lx", (ulong) dst));
+  DBUG_RETURN(dst);
+}
+
+
+/*
+  Get LSN from LSN-difference (compressed LSN)
+
+  SYNOPSIS
+    translog_get_LSN_from_diff()
+    base_lsn             LSN from which we calculate difference
+    src                  pointer to coded lsn
+    dst                  pointer to buffer where to write 7byte LSN
+
+  NOTE:
+    To store an LSN in a compact way we will use the following compression:
+
+    If a log record has LSN1, and it contains the LSN2 as a back reference,
+    Instead of LSN2 we write LSN1-LSN2, encoded as:
+
+     two bits     the number N (see below)
+     14 bits
+     N bytes
+
+    That is, LSN is encoded in 2..5 bytes, and the number of bytes minus 2
+    is stored in the first two bits.
+
+    The two bytes 0x00 0x01 (length code 0, difference 1) are an escape:
+    they are followed by a full LSN of LSN_STORE_SIZE bytes, which is
+    copied to dst unchanged.
+
+    For length codes 0..2 the decoded LSN is in the same file as base_lsn
+    and only the offset is reduced.  For length code 3 the remaining 6
+    bits of the first byte are subtracted from the file number of
+    base_lsn, with one more file borrowed when the 4 byte difference is
+    larger than the offset of base_lsn.
+
+  RETURN
+    pointer to the first byte in src after the coded LSN
+    (NULL for an unknown length code, which cannot happen because the
+    code is only 2 bits wide)
+*/
+
+static uchar *translog_get_LSN_from_diff(LSN base_lsn, uchar *src, uchar *dst)
+{
+  LSN lsn;
+  uint32 diff;
+  uint32 first_byte;
+  uint32 file_no, rec_offset;
+  uint8 code;
+  DBUG_ENTER("translog_get_LSN_from_diff");
+  DBUG_PRINT("enter", ("Base: (%lu,0x%lx)  src: 0x%lx  dst 0x%lx",
+                       LSN_IN_PARTS(base_lsn), (ulong) src, (ulong) dst));
+  first_byte= *((uint8*) src);
+  code= first_byte >> 6; /* Length is in 2 most significant bits */
+  first_byte&= 0x3F;
+  src++;                                        /* Skip the first byte */
+  file_no= LSN_FILE_NO(base_lsn);               /* Assume the same file */
+  DBUG_PRINT("info", ("code: %u  first byte: %lu",
+                      (uint) code, (ulong) first_byte));
+  switch (code) {
+  case 0:
+    if (first_byte == 0 && *((uint8*)src) == 1)
+    {
+      /*
+        It is the full LSN after the special diff of 1 (which is
+        impossible in real life)
+      */
+      memcpy(dst, src + 1, LSN_STORE_SIZE);
+      DBUG_PRINT("info", ("Special case of full LSN, new src: 0x%lx",
+                          (ulong) (src + 1 + LSN_STORE_SIZE)));
+      DBUG_RETURN(src + 1 + LSN_STORE_SIZE);
+    }
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 8) + *((uint8*)src));
+    break;
+  case 1:
+    diff= uint2korr(src);
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 16) + diff);
+    break;
+  case 2:
+    diff= uint3korr(src);
+    rec_offset= LSN_OFFSET(base_lsn) - ((first_byte << 24) + diff);
+    break;
+  case 3:
+  {
+    ulonglong base_offset= LSN_OFFSET(base_lsn);
+    diff= uint4korr(src);
+    if (diff > LSN_OFFSET(base_lsn))
+    {
+      /* borrow 1 from the file number to keep the offset non-negative */
+      first_byte++;
+      base_offset+= LL(0x100000000);
+    }
+    file_no= LSN_FILE_NO(base_lsn) - first_byte;
+    DBUG_ASSERT(base_offset - diff <= UINT_MAX);
+    rec_offset= (uint32)(base_offset - diff);
+    break;
+  }
+  default:
+    DBUG_ASSERT(0);
+    DBUG_RETURN(NULL);
+  }
+  lsn= MAKE_LSN(file_no, rec_offset);
+  src+= code + 1;                       /* skip the rest of the coded LSN */
+  lsn_store(dst, lsn);
+  DBUG_PRINT("info", ("new src: 0x%lx", (ulong) src));
+  DBUG_RETURN(src);
+}
+
+
+/**
+  @brief Encodes relative LSNs listed in the parameters.
+
+  The first lsns * LSN_STORE_SIZE bytes of the record, starting at part
+  parts->current, are taken as full LSNs.  Each of them is replaced by its
+  compressed difference from base_lsn (see translog_put_LSN_diff()).  The
+  compressed LSNs are written at the end of compressed_LSNs, and on return
+  the part at parts->current points to them.  parts->record_length and
+  parts->total_record_length are reduced by the number of bytes saved.
+
+  If the LSNs are spread over several parts they are first gathered in a
+  local buffer; parts that are consumed completely get length 0 and
+  parts->current is advanced past them.  If they all lie in the current
+  part, that part is shortened from the front and parts->current is moved
+  one slot back to hold the compressed data (hence parts->current must
+  not be 0 on entry).
+
+  @param parts           Parts list with encoded LSN(s)
+  @param base_lsn        LSN which is base for encoding
+  @param lsns            number of LSN(s) to encode
+  @param compressed_LSNs buffer which can be used for storing compressed LSN(s);
+                         it is filled backwards from offset
+                         MAX_NUMBER_OF_LSNS_PER_RECORD *
+                         COMPRESSED_LSN_MAX_STORE_SIZE, so it has to be at
+                         least that large
+*/
+
+static void  translog_relative_LSN_encode(struct st_translog_parts *parts,
+                                          LSN base_lsn,
+                                          uint lsns, uchar *compressed_LSNs)
+{
+  LEX_CUSTRING *part;
+  uint lsns_len= lsns * LSN_STORE_SIZE;
+  uchar buffer_src[MAX_NUMBER_OF_LSNS_PER_RECORD * LSN_STORE_SIZE];
+  uchar *buffer= buffer_src;
+  const uchar *cbuffer;
+
+  DBUG_ENTER("translog_relative_LSN_encode");
+
+  DBUG_ASSERT(parts->current != 0);
+  part= parts->parts + parts->current;
+
+  /* collect all LSN(s) in one chunk if they are divided between parts */
+  if (part->length < lsns_len)
+  {
+    uint copied= part->length;
+    LEX_CUSTRING *next_part;
+    DBUG_PRINT("info", ("Using buffer: 0x%lx", (ulong) compressed_LSNs));
+    memcpy(buffer, part->str, part->length);
+    next_part= parts->parts + parts->current + 1;
+    do
+    {
+      DBUG_ASSERT(next_part < parts->parts + parts->elements);
+      if ((next_part->length + copied) < lsns_len)
+      {
+        /* the whole next part is LSN data: consume it and move on */
+        memcpy(buffer + copied, next_part->str,
+               next_part->length);
+        copied+= next_part->length;
+        next_part->length= 0; next_part->str= 0;
+        /* delete_dynamic_element(&parts->parts, parts->current + 1); */
+        next_part++;
+        parts->current++;
+        part= parts->parts + parts->current;
+      }
+      else
+      {
+        /* the next part holds the end of the LSNs: take only that prefix */
+        uint len= lsns_len - copied;
+        memcpy(buffer + copied, next_part->str, len);
+        copied= lsns_len;
+        next_part->str+= len;
+        next_part->length-= len;
+      }
+    } while (copied < lsns_len);
+    cbuffer= buffer;
+  }
+  else
+  {
+    cbuffer= part->str;
+    part->str+= lsns_len;
+    part->length-= lsns_len;
+    parts->current--;                   /* slot for the compressed LSNs */
+    part= parts->parts + parts->current;
+  }
+
+  {
+    /* Compress */
+    LSN ref;
+    int economy;
+    const uchar *src_ptr;
+    uchar *dst_ptr= compressed_LSNs + (MAX_NUMBER_OF_LSNS_PER_RECORD *
+                                      COMPRESSED_LSN_MAX_STORE_SIZE);
+    /*
+      We write the result in backward direction for no special reason:
+      both directions are equal in complexity
+    */
+    for (src_ptr= cbuffer + lsns_len - LSN_STORE_SIZE;
+         src_ptr >= (const uchar*)cbuffer;
+         src_ptr-= LSN_STORE_SIZE)
+    {
+      ref= lsn_korr(src_ptr);
+      dst_ptr= translog_put_LSN_diff(base_lsn, ref, dst_ptr);
+    }
+    part->length= (uint)((compressed_LSNs +
+                          (MAX_NUMBER_OF_LSNS_PER_RECORD *
+                           COMPRESSED_LSN_MAX_STORE_SIZE)) -
+                         dst_ptr);
+    parts->record_length-= (economy= lsns_len - part->length);
+    DBUG_PRINT("info", ("new length of LSNs: %lu  economy: %d",
+                        (ulong)part->length, economy));
+    parts->total_record_length-= economy;
+    part->str= dst_ptr;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Write multi-group variable-size record.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  short_trid      Short transaction ID or 0 if it has no sense
+   @param  parts           Descriptor of record source parts
+   @param  buffer_to_flush Buffer which have to be flushed if it is not 0
+   @param  header_length   Header length calculated for 1 group
+   @param  buffer_rest     Beginning from which we plan to write in full pages
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  hook_arg        Argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @note
+     We must have a translog_lock() when entering this function
+
+     We must have buffer_to_flush locked (if not null)
+     buffer_to_flush should *NOT* be locked when calling this function.
+     (This note is here as this is different from most other
+     translog_write...() functions which require the buffer to be locked)
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static my_bool
+translog_write_variable_record_mgroup(LSN *lsn,
+                                      enum translog_record_type type,
+                                      MARIA_HA *tbl_info,
+                                      SHORT_TRANSACTION_ID short_trid,
+                                      struct st_translog_parts *parts,
+                                      struct st_translog_buffer
+                                      *buffer_to_flush,
+                                      uint16 header_length,
+                                      translog_size_t buffer_rest,
+                                      TRN *trn, void *hook_arg)
+{
+  TRANSLOG_ADDRESS horizon;
+  struct st_buffer_cursor cursor;
+  int rc= 0;
+  uint i, chunk2_page, full_pages;
+  uint curr_group= 0;
+  translog_size_t record_rest, first_page, chunk3_pages, chunk0_pages= 1;
+  translog_size_t done= 0;
+  struct st_translog_group_descriptor group;
+  DYNAMIC_ARRAY groups;
+  uint16 chunk3_size;
+  uint16 page_capacity= log_descriptor.page_capacity_chunk_2 + 1;
+  uint16 last_page_capacity;
+  my_bool new_page_before_chunk0= 1, first_chunk0= 1;
+  uchar chunk0_header[1 + 2 + 5 + 2 + 2], group_desc[7 + 1];
+  uchar chunk2_header[1];
+  uint header_fixed_part= header_length + 2;
+  uint groups_per_page= (page_capacity - header_fixed_part) / (7 + 1);
+  uint file_of_the_first_group;
+  int pages_to_skip;
+  struct st_translog_buffer *buffer_of_last_lsn;
+  DBUG_ENTER("translog_write_variable_record_mgroup");
+  translog_lock_assert_owner();
+
+  chunk2_header[0]= TRANSLOG_CHUNK_NOHDR;
+
+  if (my_init_dynamic_array(&groups,
+                            sizeof(struct st_translog_group_descriptor),
+                            10, 10))
+  {
+    translog_unlock();
+    DBUG_PRINT("error", ("init array failed"));
+    DBUG_RETURN(1);
+  }
+
+  first_page= translog_get_current_page_rest();
+  record_rest= parts->record_length - (first_page - 1);
+  DBUG_PRINT("info", ("Record Rest: %lu", (ulong) record_rest));
+
+  if (record_rest < buffer_rest)
+  {
+    /*
+      The record (group 1 type) is larger than the free space on the page
+      - we need to split it in two. But when we split it in two, the first
+      part is big enough to hold all the data of the record (because the
+      header of the first part of the split is smaller than the header of
+      the record as a whole when it takes only one chunk)
+    */
+    DBUG_PRINT("info", ("too many free space because changing header"));
+    buffer_rest-= log_descriptor.page_capacity_chunk_2;
+    DBUG_ASSERT(record_rest >= buffer_rest);
+  }
+
+  file_of_the_first_group= LSN_FILE_NO(log_descriptor.horizon);
+  translog_mark_file_unfinished(file_of_the_first_group);
+  do
+  {
+    group.addr= horizon= log_descriptor.horizon;
+    cursor= log_descriptor.bc;
+    cursor.chaser= 1;
+    if ((full_pages= buffer_rest / log_descriptor.page_capacity_chunk_2) > 255)
+    {
+      /* sizeof(uint8) == 256 is max number of chunk in multi-chunks group */
+      full_pages= 255;
+      buffer_rest= full_pages * log_descriptor.page_capacity_chunk_2;
+    }
+    /*
+       group chunks =
+       full pages + first page (which actually can be full, too).
+       But here we assign number of chunks - 1
+    */
+    group.num= full_pages;
+    if (insert_dynamic(&groups, (uchar*) &group))
+    {
+      DBUG_PRINT("error", ("insert into array failed"));
+      goto err_unlock;
+    }
+
+    DBUG_PRINT("info", ("chunk: #%u  first_page: %u (%u)  "
+                        "full_pages: %lu (%lu)  "
+                        "Left %lu",
+                        groups.elements,
+                        first_page, first_page - 1,
+                        (ulong) full_pages,
+                        (ulong) (full_pages *
+                                 log_descriptor.page_capacity_chunk_2),
+                        (ulong)(parts->record_length - (first_page - 1 +
+                                                        buffer_rest) -
+                                done)));
+    rc|= translog_advance_pointer((int)full_pages, 0);
+
+    translog_unlock();
+
+    if (buffer_to_flush != NULL)
+    {
+      translog_buffer_decrease_writers(buffer_to_flush);
+      if (!rc)
+        rc= translog_buffer_flush(buffer_to_flush);
+      translog_buffer_unlock(buffer_to_flush);
+      buffer_to_flush= NULL;
+    }
+    if (rc)
+    {
+      DBUG_PRINT("error", ("flush of unlock buffer failed"));
+      goto err;
+    }
+
+    translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
+    translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)  "
+                        "Left  %lu",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon),
+                        (ulong) (parts->record_length - (first_page - 1) -
+                                 done)));
+
+    for (i= 0; i < full_pages; i++)
+    {
+      if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+        goto err;
+
+      DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  "
+                          "local: (%lu,0x%lx)  "
+                          "Left: %lu",
+                          LSN_IN_PARTS(log_descriptor.horizon),
+                          LSN_IN_PARTS(horizon),
+                          (ulong) (parts->record_length - (first_page - 1) -
+                                   i * log_descriptor.page_capacity_chunk_2 -
+                                   done)));
+    }
+
+    done+= (first_page - 1 + buffer_rest);
+
+    if (translog_chaser_page_next(&horizon, &cursor))
+    {
+      DBUG_PRINT("error", ("flush of unlock buffer failed"));
+      goto err;
+    }
+    translog_buffer_lock(cursor.buffer);
+    translog_buffer_decrease_writers(cursor.buffer);
+    translog_buffer_unlock(cursor.buffer);
+
+    translog_lock();
+
+    /* Check that we have place for chunk type 2 */
+    first_page= translog_get_current_page_rest();
+    if (first_page <= 1)
+    {
+      if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+                             &buffer_to_flush))
+        goto err_unlock;
+      first_page= translog_get_current_page_rest();
+    }
+    buffer_rest= translog_get_current_group_size();
+  } while ((translog_size_t)(first_page + buffer_rest) <
+           (translog_size_t)(parts->record_length - done));
+
+  group.addr= horizon= log_descriptor.horizon;
+  cursor= log_descriptor.bc;
+  cursor.chaser= 1;
+  group.num= 0;                       /* 0 because it does not matter */
+  if (insert_dynamic(&groups, (uchar*) &group))
+  {
+    DBUG_PRINT("error", ("insert into array failed"));
+    goto err_unlock;
+  }
+  record_rest= parts->record_length - done;
+  DBUG_PRINT("info", ("Record rest: %lu", (ulong) record_rest));
+  if (first_page > record_rest + 1)
+  {
+    /*
+      We have not so much data to fill all first page
+      (no speaking about full pages)
+      so it will be:
+      <chunk0 <data>>
+      or
+      <chunk0>...<chunk0><chunk0 <data>>
+      or
+      <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>>
+    */
+    chunk2_page= full_pages= 0;
+    last_page_capacity= first_page;
+    pages_to_skip= -1;
+  }
+  else
+  {
+    /*
+      We will have:
+      <chunk2 <data>>...<chunk2 <data>><chunk0 <data>>
+      or
+      <chunk2 <data>>...<chunk2 <data>><chunk0>...<chunk0><chunk0 <data>>
+      or
+      <chunk3 <data>><chunk0>...<chunk0><chunk0 <possible data of 1 byte>>
+    */
+    chunk2_page= 1;
+    record_rest-= (first_page - 1);
+    pages_to_skip= full_pages=
+      record_rest / log_descriptor.page_capacity_chunk_2;
+    record_rest= (record_rest % log_descriptor.page_capacity_chunk_2);
+    last_page_capacity= page_capacity;
+  }
+  chunk3_size= 0;
+  chunk3_pages= 0;
+  if (last_page_capacity > record_rest + 1 && record_rest != 0)
+  {
+    if (last_page_capacity >
+        record_rest + header_fixed_part + groups.elements * (7 + 1))
+    {
+      /* 1 record of type 0 */
+      chunk3_pages= 0;
+    }
+    else
+    {
+      pages_to_skip++;
+      chunk3_pages= 1;
+      if (record_rest + 2 == last_page_capacity)
+      {
+        chunk3_size= record_rest - 1;
+        record_rest= 1;
+      }
+      else
+      {
+        chunk3_size= record_rest;
+        record_rest= 0;
+      }
+    }
+  }
+  /*
+     A first non-full page will hold type 0 chunk only if it fit in it with
+     all its headers
+  */
+  while (page_capacity <
+         record_rest + header_fixed_part +
+         (groups.elements - groups_per_page * (chunk0_pages - 1)) * (7 + 1))
+    chunk0_pages++;
+  DBUG_PRINT("info", ("chunk0_pages: %u  groups %u  groups per full page: %u  "
+                      "Group on last page: %u",
+                      chunk0_pages, groups.elements,
+                      groups_per_page,
+                      (groups.elements -
+                       ((page_capacity - header_fixed_part) / (7 + 1)) *
+                       (chunk0_pages - 1))));
+  DBUG_PRINT("info", ("first_page: %u  chunk2: %u  full_pages: %u (%lu)  "
+                      "chunk3: %u (%u)  rest: %u",
+                      first_page,
+                      chunk2_page, full_pages,
+                      (ulong) full_pages *
+                      log_descriptor.page_capacity_chunk_2,
+                      chunk3_pages, (uint) chunk3_size, (uint) record_rest));
+  rc= translog_advance_pointer(pages_to_skip + (int)(chunk0_pages - 1),
+                               record_rest + header_fixed_part +
+                               (groups.elements -
+                                ((page_capacity -
+                                  header_fixed_part) / (7 + 1)) *
+                                (chunk0_pages - 1)) * (7 + 1));
+  buffer_of_last_lsn= log_descriptor.bc.buffer;
+  translog_unlock();
+
+  if (buffer_to_flush != NULL)
+  {
+    translog_buffer_decrease_writers(buffer_to_flush);
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+    buffer_to_flush= NULL;
+  }
+  if (rc)
+  {
+    DBUG_PRINT("error", ("flush of unlock buffer failed"));
+    goto err;
+  }
+
+  if (rc)
+    goto err;
+
+  if (chunk2_page)
+  {
+    DBUG_PRINT("info", ("chunk 2 to finish first page"));
+    translog_write_data_on_page(&horizon, &cursor, 1, chunk2_header);
+    translog_write_parts_on_page(&horizon, &cursor, first_page - 1, parts);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx) "
+                        "Left: %lu",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon),
+                        (ulong) (parts->record_length - (first_page - 1) -
+                                 done)));
+  }
+  else if (chunk3_pages)
+  {
+    uchar chunk3_header[3];
+    DBUG_PRINT("info", ("chunk 3"));
+    DBUG_ASSERT(full_pages == 0);
+    chunk3_pages= 0;
+    chunk3_header[0]= TRANSLOG_CHUNK_LNGTH;
+    int2store(chunk3_header + 1, chunk3_size);
+    translog_write_data_on_page(&horizon, &cursor, 3, chunk3_header);
+    translog_write_parts_on_page(&horizon, &cursor, chunk3_size, parts);
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx) "
+                        "Left: %lu",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon),
+                        (ulong) (parts->record_length - chunk3_size - done)));
+  }
+  else
+  {
+    DBUG_PRINT("info", ("no new_page_before_chunk0"));
+    new_page_before_chunk0= 0;
+  }
+
+  for (i= 0; i < full_pages; i++)
+  {
+    DBUG_ASSERT(chunk2_page != 0);
+    if (translog_write_variable_record_chunk2_page(parts, &horizon, &cursor))
+      goto err;
+
+    DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx) "
+                        "Left: %lu",
+                        LSN_IN_PARTS(log_descriptor.horizon),
+                        LSN_IN_PARTS(horizon),
+                        (ulong) (parts->record_length - (first_page - 1) -
+                                 i * log_descriptor.page_capacity_chunk_2 -
+                                 done)));
+  }
+
+  if (chunk3_pages &&
+      translog_write_variable_record_chunk3_page(parts,
+                                                 chunk3_size,
+                                                 &horizon, &cursor))
+    goto err;
+  DBUG_PRINT("info", ("absolute horizon: (%lu,0x%lx)  local: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon),
+                      LSN_IN_PARTS(horizon)));
+
+  *chunk0_header= (uchar) (type | TRANSLOG_CHUNK_LSN);
+  int2store(chunk0_header + 1, short_trid);
+  translog_write_variable_record_1group_code_len(chunk0_header + 3,
+                                                 parts->record_length,
+                                                 header_length);
+  do
+  {
+    int limit;
+    if (new_page_before_chunk0 &&
+        translog_chaser_page_next(&horizon, &cursor))
+    {
+      DBUG_PRINT("error", ("flush of unlock buffer failed"));
+      goto err;
+    }
+    new_page_before_chunk0= 1;
+
+    if (first_chunk0)
+    {
+      first_chunk0= 0;
+
+      /*
+        We can drop "log_descriptor.is_everything_flushed" earlier when have
+        lock on loghandler and assign initial value of "horizon" variable or
+        before unlocking loghandler (because we will increase writers
+        counter on the buffer and every thread which wanted flush the buffer
+        will wait till we finish with it). But IMHO better here take short
+        lock and do not bother other threads with waiting.
+      */
+      translog_lock();
+      set_lsn(lsn, horizon);
+      buffer_of_last_lsn->last_lsn= *lsn;
+      DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                          LSN_IN_PARTS(buffer_of_last_lsn->last_lsn),
+                          (ulong) buffer_of_last_lsn));
+      if (log_record_type_descriptor[type].inwrite_hook &&
+          (*log_record_type_descriptor[type].inwrite_hook) (type, trn,
+                                                            tbl_info,
+                                                            lsn, hook_arg))
+        goto err_unlock;
+      translog_unlock();
+    }
+
+    /*
+       A first non-full page will hold type 0 chunk only if it fit in it with
+       all its headers => the fist page is full or number of groups less then
+       possible number of full page.
+    */
+    limit= (groups_per_page < groups.elements - curr_group ?
+            groups_per_page : groups.elements - curr_group);
+    DBUG_PRINT("info", ("Groups: %u  curr: %u  limit: %u",
+                        (uint) groups.elements, (uint) curr_group,
+                        (uint) limit));
+
+    if (chunk0_pages == 1)
+    {
+      DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) + %u = %u",
+                          (uint) limit, (uint) record_rest,
+                          (uint) (2 + limit * (7 + 1) + record_rest)));
+      int2store(chunk0_header + header_length - 2,
+                2 + limit * (7 + 1) + record_rest);
+    }
+    else
+    {
+      DBUG_PRINT("info", ("chunk_len: 2 + %u * (7+1) = %u",
+                          (uint) limit, (uint) (2 + limit * (7 + 1))));
+      int2store(chunk0_header + header_length - 2, 2 + limit * (7 + 1));
+    }
+    int2store(chunk0_header + header_length, groups.elements - curr_group);
+    translog_write_data_on_page(&horizon, &cursor, header_fixed_part,
+                                chunk0_header);
+    for (i= curr_group; i < limit + curr_group; i++)
+    {
+      struct st_translog_group_descriptor *grp_ptr;
+      grp_ptr= dynamic_element(&groups, i,
+                               struct st_translog_group_descriptor *);
+      lsn_store(group_desc, grp_ptr->addr);
+      group_desc[7]= grp_ptr->num;
+      translog_write_data_on_page(&horizon, &cursor, (7 + 1), group_desc);
+    }
+
+    if (chunk0_pages == 1 && record_rest != 0)
+      translog_write_parts_on_page(&horizon, &cursor, record_rest, parts);
+
+    chunk0_pages--;
+    curr_group+= limit;
+    /* put special type to indicate that it is not LSN chunk */
+    *chunk0_header= (uchar) (TRANSLOG_CHUNK_LSN | TRANSLOG_CHUNK_0_CONT);
+  } while (chunk0_pages != 0);
+  translog_buffer_lock(cursor.buffer);
+  translog_buffer_decrease_writers(cursor.buffer);
+  translog_buffer_unlock(cursor.buffer);
+  rc= 0;
+
+  if (translog_set_lsn_for_files(file_of_the_first_group, LSN_FILE_NO(*lsn),
+                                 *lsn, FALSE))
+    goto err;
+
+  translog_mark_file_finished(file_of_the_first_group);
+
+  delete_dynamic(&groups);
+  DBUG_RETURN(rc);
+
+err_unlock:
+
+  translog_unlock();
+
+err:
+  if (buffer_to_flush != NULL)
+  {
+    /* This is to prevent locking buffer forever in case of error */
+    translog_buffer_decrease_writers(buffer_to_flush);
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+    buffer_to_flush= NULL;
+  }
+
+
+  translog_mark_file_finished(file_of_the_first_group);
+
+  delete_dynamic(&groups);
+  DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Write the variable length log record.
+
+   Takes the loghandler lock, moves to the next page if the chunk header
+   together with the readable part of the record header does not fit in the
+   rest of the current page, encodes the LSNs of the record relative to the
+   horizon and then hands the record over to one of the three writers
+   (1 chunk, 1 group, multi-group) depending on how much room is left.
+   The chosen writer releases the loghandler lock.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  tbl_info        Table handler, passed on to the chosen writer
+   @param  short_trid      Short transaction ID or 0 if it has no sense
+   @param  parts           Descriptor of record source parts
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  hook_arg        Argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static my_bool translog_write_variable_record(LSN *lsn,
+                                              enum translog_record_type type,
+                                              MARIA_HA *tbl_info,
+                                              SHORT_TRANSACTION_ID short_trid,
+                                              struct st_translog_parts *parts,
+                                              TRN *trn, void *hook_arg)
+{
+  struct st_translog_buffer *buffer_to_flush= NULL;
+  /* type byte + short_trid + chunk length + coded record length */
+  uint header_length1= 1 + 2 + 2 +
+    translog_variable_record_length_bytes(parts->record_length);
+  ulong buffer_rest;
+  uint page_rest;
+  /* Max number of such LSNs per record is 2 */
+  uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+    COMPRESSED_LSN_MAX_STORE_SIZE];
+  my_bool res;
+  DBUG_ENTER("translog_write_variable_record");
+
+  translog_lock();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon)));
+  page_rest= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+  DBUG_PRINT("info", ("header length: %u  page_rest: %u",
+                      header_length1, page_rest));
+
+  /*
+    header and part which we should read have to fit in one chunk
+    TODO: allow to divide readable header
+  */
+  if (page_rest <
+      (header_length1 + log_record_type_descriptor[type].read_header_len))
+  {
+    DBUG_PRINT("info",
+               ("Next page, size: %u  header: %u + %u",
+                log_descriptor.bc.current_page_fill,
+                header_length1,
+                log_record_type_descriptor[type].read_header_len));
+    if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+                           &buffer_to_flush))
+    {
+      /*
+        Switching to the next page failed: do not go on writing through a
+        cursor in unknown state. Release what we hold in the same way as
+        translog_write_fixed_record() does for this failure.
+      */
+      DBUG_PRINT("error", ("switching to the next page failed"));
+      translog_unlock();
+      if (buffer_to_flush != NULL)
+        translog_buffer_unlock(buffer_to_flush);
+      DBUG_RETURN(1);
+    }
+    /* Chunk 2 header is 1 byte, so full page capacity will be one uchar more */
+    page_rest= log_descriptor.page_capacity_chunk_2 + 1;
+    DBUG_PRINT("info", ("page_rest: %u", page_rest));
+  }
+
+  /*
+     To minimize compressed size we will compress always relative to
+     very first chunk address (log_descriptor.horizon for now)
+  */
+  if (log_record_type_descriptor[type].compressed_LSN > 0)
+  {
+    translog_relative_LSN_encode(parts, log_descriptor.horizon,
+                                 log_record_type_descriptor[type].
+                                 compressed_LSN, compressed_LSNs);
+    /* recalculate header length after compression */
+    header_length1= 1 + 2 + 2 +
+      translog_variable_record_length_bytes(parts->record_length);
+    DBUG_PRINT("info", ("after compressing LSN(s) header length: %u  "
+                        "record length: %lu",
+                        header_length1, (ulong)parts->record_length));
+  }
+
+  /* TODO: check space on current page for header + few bytes */
+  if (page_rest >= parts->record_length + header_length1)
+  {
+    /* following function makes translog_unlock(); */
+    res= translog_write_variable_record_1chunk(lsn, type, tbl_info,
+                                               short_trid,
+                                               parts, buffer_to_flush,
+                                               header_length1, trn, hook_arg);
+    DBUG_RETURN(res);
+  }
+
+  buffer_rest= translog_get_current_group_size();
+
+  if (buffer_rest >= parts->record_length + header_length1 - page_rest)
+  {
+    /* following function makes translog_unlock(); */
+    res= translog_write_variable_record_1group(lsn, type, tbl_info,
+                                               short_trid,
+                                               parts, buffer_to_flush,
+                                               header_length1, trn, hook_arg);
+    DBUG_RETURN(res);
+  }
+  /* following function makes translog_unlock(); */
+  res= translog_write_variable_record_mgroup(lsn, type, tbl_info,
+                                             short_trid,
+                                             parts, buffer_to_flush,
+                                             header_length1,
+                                             buffer_rest, trn, hook_arg);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Write the fixed and pseudo-fixed log record.
+
+   The record is written as one chunk (3 bytes of chunk header followed by
+   the record parts) on a single page while the loghandler lock is held.
+   If the rest of the current page is too small the horizon is first moved
+   to the next page.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  tbl_info        Table handler; only passed on to the in-write
+                           hook of this record type
+   @param  short_trid      Short transaction ID or 0 if it has no sense
+   @param  parts           Descriptor of record source parts
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  hook_arg        Argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+static my_bool translog_write_fixed_record(LSN *lsn,
+                                           enum translog_record_type type,
+                                           MARIA_HA *tbl_info,
+                                           SHORT_TRANSACTION_ID short_trid,
+                                           struct st_translog_parts *parts,
+                                           TRN *trn, void *hook_arg)
+{
+  struct st_translog_buffer *buffer_to_flush= NULL;
+  /* chunk header: 1 byte of type/chunk flags + 2 bytes of short_trid */
+  uchar chunk1_header[1 + 2];
+  /* Max number of such LSNs per record is 2 */
+  uchar compressed_LSNs[MAX_NUMBER_OF_LSNS_PER_RECORD *
+    COMPRESSED_LSN_MAX_STORE_SIZE];
+  LEX_CUSTRING *part;
+  /* stays 1 (error) until the parts are written; checked in the exit path */
+  int rc= 1;
+  DBUG_ENTER("translog_write_fixed_record");
+  DBUG_ASSERT((log_record_type_descriptor[type].rclass ==
+               LOGRECTYPE_FIXEDLENGTH &&
+               parts->record_length ==
+               log_record_type_descriptor[type].fixed_length) ||
+              (log_record_type_descriptor[type].rclass ==
+               LOGRECTYPE_PSEUDOFIXEDLENGTH &&
+               parts->record_length ==
+               log_record_type_descriptor[type].fixed_length));
+
+  translog_lock();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.horizon)));
+
+  DBUG_ASSERT(log_descriptor.bc.current_page_fill <= TRANSLOG_PAGE_SIZE);
+  DBUG_PRINT("info",
+             ("Page size: %u  record: %u  next cond: %d",
+              log_descriptor.bc.current_page_fill,
+              (parts->record_length +
+               log_record_type_descriptor[type].compressed_LSN * 2 + 3),
+              ((((uint) log_descriptor.bc.current_page_fill) +
+                (parts->record_length +
+                 log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+               TRANSLOG_PAGE_SIZE)));
+  /*
+    check that there is enough place on current page.
+    NOTE: compressing may increase page LSN size on two bytes for every LSN
+    (the "+ 3" is the size of the chunk header written below)
+  */
+  if ((((uint) log_descriptor.bc.current_page_fill) +
+       (parts->record_length +
+        log_record_type_descriptor[type].compressed_LSN * 2 + 3)) >
+      TRANSLOG_PAGE_SIZE)
+  {
+    DBUG_PRINT("info", ("Next page"));
+    if (translog_page_next(&log_descriptor.horizon, &log_descriptor.bc,
+                           &buffer_to_flush))
+      goto err;                                 /* rc == 1 */
+    /* a buffer handed back here is expected to be locked by this thread */
+    if (buffer_to_flush)
+      translog_buffer_lock_assert_owner(buffer_to_flush);
+  }
+
+  /* the record starts at the current horizon, so that is its LSN */
+  set_lsn(lsn, log_descriptor.horizon);
+  if (translog_set_lsn_for_files(LSN_FILE_NO(*lsn), LSN_FILE_NO(*lsn),
+                             *lsn, TRUE) ||
+      (log_record_type_descriptor[type].inwrite_hook &&
+       (*log_record_type_descriptor[type].inwrite_hook)(type, trn, tbl_info,
+                                                        lsn, hook_arg)))
+    goto err;
+
+  /* compress LSNs */
+  if (log_record_type_descriptor[type].rclass ==
+      LOGRECTYPE_PSEUDOFIXEDLENGTH)
+  {
+    DBUG_ASSERT(log_record_type_descriptor[type].compressed_LSN > 0);
+    translog_relative_LSN_encode(parts, *lsn,
+                                 log_record_type_descriptor[type].
+                                 compressed_LSN, compressed_LSNs);
+  }
+
+  /*
+    Write the whole record at once (we know that there is enough place on
+    the destination page)
+  */
+  DBUG_ASSERT(parts->current != 0);       /* first part is left for header */
+  /* step back one part and make it point to the chunk header */
+  part= parts->parts + (--parts->current);
+  parts->total_record_length+= (translog_size_t) (part->length= 1 + 2);
+  part->str= chunk1_header;
+  *chunk1_header= (uchar) (type | TRANSLOG_CHUNK_FIXED);
+  int2store(chunk1_header + 1, short_trid);
+
+  rc= translog_write_parts_on_page(&log_descriptor.horizon,
+                                   &log_descriptor.bc,
+                                   parts->total_record_length, parts);
+
+  log_descriptor.bc.buffer->last_lsn= *lsn;
+  DBUG_PRINT("info", ("last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn),
+                      (ulong) log_descriptor.bc.buffer));
+
+err:
+  /* common exit: reached on success too, rc tells which case it is */
+  translog_unlock();
+
+  /*
+    check if we switched buffer and need process it (current buffer is
+    unlocked already => we will not delay other threads).
+    The old buffer is flushed only if everything went fine so far, but it
+    is unlocked in any case.
+  */
+  if (buffer_to_flush != NULL)
+  {
+    if (!rc)
+      rc= translog_buffer_flush(buffer_to_flush);
+    translog_buffer_unlock(buffer_to_flush);
+  }
+
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief Writes the log record
+
+   If share has no 2-byte-id yet, gives an id to the share and logs
+   LOGREC_FILE_ID. If transaction has not logged LOGREC_LONG_TRANSACTION_ID
+   yet, logs it (through a nested call of this function).
+
+   After that the parts and the record length are counted if the caller did
+   not do it, the pre-write hook of the record type (if any) is called and
+   the record is passed to the writer which matches the record class.
+
+   @param  lsn             LSN of the record will be written here
+   @param  type            the log record type
+   @param  trn             Transaction structure pointer for hooks by
+                           record log type, for short_id
+   @param  tbl_info        MARIA_HA of table or NULL
+   @param  rec_len         record length or 0 (count it)
+   @param  part_no         number of parts or 0 (count it)
+   @param  parts_data      zero ended (in case of number of parts is 0)
+                           array of LEX_STRINGs (parts), first
+                           TRANSLOG_INTERNAL_PARTS positions in the log
+                           should be unused (need for loghandler)
+   @param  store_share_id  if tbl_info!=NULL then share's id will
+                           automatically be stored in the two first bytes
+                           pointed (so pointer is assumed to be !=NULL)
+   @param  hook_arg        argument which will be passed to pre-write and
+                           in-write hooks of this record.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+my_bool translog_write_record(LSN *lsn,
+                              enum translog_record_type type,
+                              TRN *trn, MARIA_HA *tbl_info,
+                              translog_size_t rec_len,
+                              uint part_no,
+                              LEX_CUSTRING *parts_data,
+                              uchar *store_share_id,
+                              void *hook_arg)
+{
+  struct st_translog_parts parts;
+  LEX_CUSTRING *part;
+  int rc;
+  uint short_trid= trn->short_id;
+  DBUG_ENTER("translog_write_record");
+  DBUG_PRINT("enter", ("type: %u (%s)  ShortTrID: %u  rec_len: %lu",
+                       (uint) type, log_record_type_descriptor[type].name,
+                       (uint) short_trid, (ulong) rec_len));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  /* anything but TRANSLOG_OK (e.g. read-only log) refuses the write */
+  if (unlikely(translog_status != TRANSLOG_OK))
+  {
+    DBUG_PRINT("error", ("Transaction log is write protected"));
+    DBUG_RETURN(1);
+  }
+
+  if (tbl_info)
+  {
+    MARIA_SHARE *share= tbl_info->s;
+    DBUG_ASSERT(share->now_transactional);
+    if (unlikely(share->id == 0))
+    {
+      /*
+        First log write for this MARIA_SHARE; give it a short id.
+        When the lock manager is enabled and needs a short id, it should be
+        assigned in the lock manager (because row locks will be taken before
+        log records are written; for example SELECT FOR UPDATE takes locks but
+        writes no log record).
+      */
+      if (unlikely(translog_assign_id_to_share(tbl_info, trn)))
+        DBUG_RETURN(1);
+    }
+    /* put the share's id where the caller asked for it */
+    fileid_store(store_share_id, share->id);
+  }
+  if (unlikely(!(trn->first_undo_lsn & TRANSACTION_LOGGED_LONG_ID)))
+  {
+    /* The long transaction id was not logged yet: log it first */
+    LSN dummy_lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar log_data[6];
+    DBUG_ASSERT(trn->undo_lsn == LSN_IMPOSSIBLE);
+    int6store(log_data, trn->trid);
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    /*
+      The flag is set before the nested call so that the nested call does
+      not come into this branch again.
+      NOTE(review): the flag stays set if the nested write fails - confirm
+      that this is intended.
+    */
+    trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID; /* no recursion */
+    if (unlikely(translog_write_record(&dummy_lsn, LOGREC_LONG_TRANSACTION_ID,
+                                       trn, NULL, sizeof(log_data),
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL)))
+      DBUG_RETURN(1);
+  }
+
+  parts.parts= parts_data;
+
+  /* count parts if they are not counted by upper level */
+  if (part_no == 0)
+  {
+    /* the array is terminated by a part of zero length */
+    for (part_no= TRANSLOG_INTERNAL_PARTS;
+         parts_data[part_no].length != 0;
+         part_no++);
+  }
+  parts.elements= part_no;
+  parts.current= TRANSLOG_INTERNAL_PARTS;
+
+  /* clear TRANSLOG_INTERNAL_PARTS (only element 0 is reset here) */
+  compile_time_assert(TRANSLOG_INTERNAL_PARTS != 0);
+  parts_data[0].str= 0;
+  parts_data[0].length= 0;
+
+  /* count length of the record */
+  if (rec_len == 0)
+  {
+    for(part= parts_data + TRANSLOG_INTERNAL_PARTS;\
+        part < parts_data + part_no;
+        part++)
+    {
+      rec_len+= (translog_size_t) part->length;
+    }
+  }
+  parts.record_length= rec_len;
+
+#ifndef DBUG_OFF
+  {
+    /*
+      Debug builds only: sum the lengths of the caller's parts again and
+      check that the sum agrees with rec_len
+    */
+    uint i;
+    uint len= 0;
+#ifdef HAVE_valgrind
+    ha_checksum checksum= 0;
+#endif
+    for (i= TRANSLOG_INTERNAL_PARTS; i < part_no; i++)
+    {
+#ifdef HAVE_valgrind
+      /* Find uninitialized bytes early */
+      checksum+= my_checksum(checksum, parts_data[i].str,
+                             parts_data[i].length);
+#endif
+      len+= parts_data[i].length;
+    }
+    DBUG_ASSERT(len == rec_len);
+  }
+#endif
+  /*
+    Start total_record_length from record_length; the overhead will be
+    added to it later
+  */
+  parts.total_record_length= parts.record_length;
+  DBUG_PRINT("info", ("record length: %lu", (ulong) parts.record_length));
+
+  /*
+    process this parts: a non-zero result of the pre-write hook becomes rc
+    and nothing is written in that case
+  */
+  if (!(rc= (log_record_type_descriptor[type].prewrite_hook &&
+             (*log_record_type_descriptor[type].prewrite_hook) (type, trn,
+                                                                tbl_info,
+                                                                hook_arg))))
+  {
+    switch (log_record_type_descriptor[type].rclass) {
+    case LOGRECTYPE_VARIABLE_LENGTH:
+      rc= translog_write_variable_record(lsn, type, tbl_info,
+                                         short_trid, &parts, trn, hook_arg);
+      break;
+    case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+    case LOGRECTYPE_FIXEDLENGTH:
+      rc= translog_write_fixed_record(lsn, type, tbl_info,
+                                      short_trid, &parts, trn, hook_arg);
+      break;
+    case LOGRECTYPE_NOT_ALLOWED:
+    default:
+      /* a record of this class must never be written */
+      DBUG_ASSERT(0);
+      rc= 1;
+    }
+  }
+
+  DBUG_PRINT("info", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(*lsn)));
+  DBUG_RETURN(rc);
+}
+
+
+/*
+  Decode compressed (relative) LSN(s)
+
+  SYNOPSIS
+   translog_relative_LSN_decode()
+   base_lsn              LSN handed to translog_get_LSN_from_diff() for
+                         every decoded LSN
+   src                   Decode LSN(s) from here
+   dst                   Put decoded LSNs here, LSN_STORE_SIZE bytes apart
+   lsns                  number of LSN(s)
+
+   RETURN
+     position in sources after decoded LSN(s)
+     (src itself if lsns is 0)
+*/
+
+static uchar *translog_relative_LSN_decode(LSN base_lsn,
+                                          uchar *src, uchar *dst, uint lsns)
+{
+  uchar *out= dst;
+  uint remaining;
+  /* one call per LSN; each call tells where the next source bytes start */
+  for (remaining= lsns; remaining != 0; remaining--)
+  {
+    src= translog_get_LSN_from_diff(base_lsn, src, out);
+    out+= LSN_STORE_SIZE;
+  }
+  return src;
+}
+
+/**
+   @brief Read the header of a fixed/pseudo-fixed length record into the
+   header buffer
+
+   For a pseudo-fixed record the compressed LSNs at the start of the record
+   are decoded into the header first; the rest of the record is copied as is.
+
+   @param page            Pointer to the buffer with page where LSN chunk is
+                          placed
+   @param page_offset     Offset of the first chunk in the page
+   @param buff            Buffer to be filled with header data
+
+   @return Length of header
+     @retval #  number of bytes in TRANSLOG_HEADER_BUFFER::header where
+                stored decoded part of the header
+*/
+
+static int translog_fixed_length_header(uchar *page,
+                                        translog_size_t page_offset,
+                                        TRANSLOG_HEADER_BUFFER *buff)
+{
+  struct st_log_record_type_descriptor *rec_desc=
+    &log_record_type_descriptor[buff->type];
+  uchar *chunk= page + page_offset;
+  /* record data follows the 3 bytes in front of it in the chunk */
+  uchar *in= chunk + 3;
+  uchar *out= buff->header;
+  uint to_copy= rec_desc->fixed_length;
+  DBUG_ENTER("translog_fixed_length_header");
+
+  buff->record_length= to_copy;
+
+  if (rec_desc->rclass != LOGRECTYPE_PSEUDOFIXEDLENGTH)
+    buff->compressed_LSN_economy= 0;
+  else
+  {
+    uchar *compressed_start= in;
+    int lsn_count= rec_desc->compressed_LSN;
+    int decoded_size;
+    DBUG_ASSERT(lsn_count > 0);
+    in= translog_relative_LSN_decode(buff->lsn, in, out, lsn_count);
+    /* decoded LSNs take LSN_STORE_SIZE bytes each in the header */
+    decoded_size= lsn_count * LSN_STORE_SIZE;
+    out+= decoded_size;
+    to_copy-= decoded_size;
+    /* difference between decoded size and the size taken on the page */
+    buff->compressed_LSN_economy= (decoded_size -
+                                   (int) (in - compressed_start));
+  }
+
+  memcpy(out, in, to_copy);
+  buff->non_header_data_start_offset= (uint16) (page_offset +
+                                                ((in + to_copy) - chunk));
+  buff->non_header_data_len= 0;
+  DBUG_RETURN(buff->record_length);
+}
+
+
+/*
+  Free resources used by TRANSLOG_HEADER_BUFFER
+
+  SYNOPSIS
+    translog_free_record_header();
+    buff                 Header buffer whose groups array is freed
+
+  NOTE
+    Nothing is done if the buffer has no groups (groups_no == 0).
+*/
+
+void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff)
+{
+  DBUG_ENTER("translog_free_record_header");
+  if (buff->groups_no == 0)
+  {
+    /* no groups => nothing to free */
+    DBUG_VOID_RETURN;
+  }
+  my_free(buff->groups, MYF(0));
+  buff->groups_no= 0;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Returns the current horizon at the end of the current log
+
+   The horizon is read from the log descriptor while the loghandler lock is
+   held; the lock is taken and released inside this function.
+
+   @return Horizon
+   @retval #             Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon()
+{
+  TRANSLOG_ADDRESS current_horizon;
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  translog_lock();
+  current_horizon= log_descriptor.horizon;
+  translog_unlock();
+
+  return current_horizon;
+}
+
+
+/**
+   @brief Returns the current horizon at the end of the current log, caller is
+   assumed to already hold the lock
+
+   The horizon is read straight from the log descriptor; the function only
+   asserts that the loghandler lock is owned, it does not take it.
+
+   @return Horizon
+   @retval #             Horizon
+*/
+
+TRANSLOG_ADDRESS translog_get_horizon_no_lock()
+{
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  /* the caller has to hold the loghandler lock */
+  translog_lock_assert_owner();
+  return log_descriptor.horizon;
+}
+
+
+/*
+  Set last page in the scanner data structure
+
+  SYNOPSIS
+    translog_scanner_set_last_page()
+    scanner              Information about current chunk during scanning
+
+  NOTE
+    If the scanned page is in the same file as the horizon the address is
+    calculated from the horizon, otherwise it is asked from
+    translog_get_last_page_addr().
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_scanner_set_last_page(TRANSLOG_SCANNER_DATA *scanner)
+{
+  my_bool page_ok;
+  uint step_back;
+
+  if (LSN_FILE_NO(scanner->page_addr) != LSN_FILE_NO(scanner->horizon))
+  {
+    /* Not the file of the horizon => look the last page up */
+    scanner->last_file_page= scanner->page_addr;
+    return (translog_get_last_page_addr(&scanner->last_file_page,
+                                        &page_ok, 0));
+  }
+
+  /* It is last file => we can easy find last page address by horizon */
+  step_back= LSN_OFFSET(scanner->horizon) % TRANSLOG_PAGE_SIZE;
+  if (step_back == 0)
+  {
+    /* horizon is exactly on a page border: go back one whole page */
+    step_back= TRANSLOG_PAGE_SIZE;
+  }
+  scanner->last_file_page= (scanner->horizon - step_back);
+  return (0);
+}
+
+
+/**
+  @brief Get the page the scanner points to (scanner->page_addr) with
+  translog_get_page() and remember it in scanner->page
+
+  The scanner's direct link is passed to translog_get_page() only if the
+  scanner was set up to use direct links, otherwise NULL is passed.
+
+  @param scanner         The scanner data
+
+  @return operation status
+  @retval 0 OK
+  @retval 1 Error (no page was returned)
+*/
+
+static my_bool
+translog_scanner_get_page(TRANSLOG_SCANNER_DATA *scanner)
+{
+  TRANSLOG_VALIDATOR_DATA data;
+  DBUG_ENTER("translog_scanner_get_page");
+  data.addr= &scanner->page_addr;
+  data.was_recovered= 0;
+
+  if (scanner->use_direct_link)
+    scanner->page= translog_get_page(&data, scanner->buffer,
+                                     &scanner->direct_link);
+  else
+    scanner->page= translog_get_page(&data, scanner->buffer, NULL);
+
+  DBUG_RETURN(scanner->page == NULL);
+}
+
+
+/**
+  @brief Initialize reader scanner.
+
+  Positions the scanner on the page which contains the given LSN, remembers
+  the current horizon, finds the last page of the file and reads the page.
+
+  @param lsn             LSN with which it have to be inited
+  @param fixed_horizon   true if it is OK do not read records which was written
+                         after scanning beginning
+  @param scanner         scanner which have to be inited
+  @param use_direct      prefer using direct links from page handler
+                         where it is possible.
+
+  @note If direct link was used translog_destroy_scanner should be
+        called after it using
+
+  @return status of the operation
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool translog_scanner_init(LSN lsn,
+                              my_bool fixed_horizon,
+                              TRANSLOG_SCANNER_DATA *scanner,
+                              my_bool use_direct)
+{
+  DBUG_ENTER("translog_scanner_init");
+  DBUG_PRINT("enter", ("Scanner: 0x%lx  LSN: (%lu,0x%lx)",
+                       (ulong) scanner, LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  /* offset of the LSN inside its page */
+  scanner->page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+
+  scanner->fixed_horizon= fixed_horizon;
+  scanner->use_direct_link= use_direct;
+  scanner->direct_link= NULL;
+
+  scanner->horizon= translog_get_horizon();
+  DBUG_PRINT("info", ("horizon: (%lu,0x%lx)", LSN_IN_PARTS(scanner->horizon)));
+
+  /* lsn <= horizon */
+  DBUG_ASSERT(lsn <= scanner->horizon);
+
+  /* address of the beginning of the page which holds the LSN */
+  scanner->page_addr= lsn;
+  scanner->page_addr-= scanner->page_offset; /*decrease offset */
+
+  if (translog_scanner_set_last_page(scanner))
+    DBUG_RETURN(1);
+
+  if (translog_scanner_get_page(scanner))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Destroy scanner object;
+
+  Releases the direct link stored in the scanner and forgets it, so that
+  destroying the same scanner once more does not release the link twice.
+
+  @param scanner         The scanner object to destroy
+*/
+
+void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_destroy_scanner");
+  DBUG_PRINT("enter", ("Scanner: 0x%lx", (ulong)scanner));
+  translog_free_link(scanner->direct_link);
+  /* the link is not ours any more, do not keep a stale pointer */
+  scanner->direct_link= NULL;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Checks whether the scanner has reached the end of the log
+
+  SYNOPSIS
+    translog_scanner_eol()
+    scanner              Information about current chunk during scanning
+
+  NOTE
+    If the horizon stored in the scanner is reached and the horizon is not
+    fixed, the horizon is re-read before the decision is made.
+
+  RETURN
+    1  End of the Log
+    0  OK
+*/
+
+static my_bool translog_scanner_eol(TRANSLOG_SCANNER_DATA *scanner)
+{
+  my_bool eol;
+  DBUG_ENTER("translog_scanner_eol");
+  DBUG_PRINT("enter",
+             ("Horizon: (%lu, 0x%lx)  Current: (%lu, 0x%lx+0x%x=0x%lx)",
+              LSN_IN_PARTS(scanner->horizon),
+              LSN_IN_PARTS(scanner->page_addr),
+              (uint) scanner->page_offset,
+              (ulong) (LSN_OFFSET(scanner->page_addr) + scanner->page_offset)));
+
+  if (scanner->horizon <= (scanner->page_addr + scanner->page_offset))
+  {
+    /* the remembered horizon is reached */
+    if (scanner->fixed_horizon)
+    {
+      DBUG_PRINT("info", ("Horizon is fixed and reached"));
+      DBUG_RETURN(1);
+    }
+    /* the log could have grown since the horizon was read last time */
+    scanner->horizon= translog_get_horizon();
+    eol= (scanner->horizon <= (scanner->page_addr + scanner->page_offset));
+    DBUG_PRINT("info", ("Horizon is re-read, EOL: %d", (int) eol));
+    DBUG_RETURN(eol);
+  }
+
+  DBUG_PRINT("info", ("Horizon is not reached"));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Checks End of the Page
+
+  The page is finished when the offset is beyond the page or when the byte
+  at the current offset is the filler.
+
+  @param scanner         Information about current chunk during scanning
+
+  @retval 1  End of the Page
+  @retval 0  OK
+*/
+
+static my_bool translog_scanner_eop(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_scanner_eop");
+  /* check the offset first: the page must not be read beyond its end */
+  if (scanner->page_offset >= TRANSLOG_PAGE_SIZE)
+    DBUG_RETURN(1);
+  DBUG_RETURN(scanner->page[scanner->page_offset] == TRANSLOG_FILLER);
+}
+
+
+/**
+  @brief Checks End of the File (i.e. we are scanning the last page of the
+    file, which does not mean that the end of this page is reached)
+
+  @param scanner         Information about current chunk during scanning
+
+  @retval 1 End of the File
+  @retval 0 OK
+*/
+
+static my_bool translog_scanner_eof(TRANSLOG_SCANNER_DATA *scanner)
+{
+  DBUG_ENTER("translog_scanner_eof");
+  /* the last page address has to be known for the file we are scanning */
+  DBUG_ASSERT(LSN_FILE_NO(scanner->page_addr) ==
+              LSN_FILE_NO(scanner->last_file_page));
+  DBUG_PRINT("enter", ("curr Page: 0x%lx  last page: 0x%lx  "
+                       "normal EOF: %d",
+                       (ulong) LSN_OFFSET(scanner->page_addr),
+                       (ulong) LSN_OFFSET(scanner->last_file_page),
+                       LSN_OFFSET(scanner->page_addr) ==
+                       LSN_OFFSET(scanner->last_file_page)));
+  /*
+     TODO: detect damaged file EOF,
+     TODO: issue warning if damaged file EOF detected
+  */
+  if (scanner->last_file_page == scanner->page_addr)
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+/*
+  Move scanner to the next chunk
+
+  SYNOPSIS
+    translog_get_next_chunk()
+    scanner              Information about current chunk during scanning
+
+  NOTE
+    If the end of the log is reached, scanner->page is set to END_OF_LOG
+    and 0 is returned, so the caller has to check scanner->page.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool
+translog_get_next_chunk(TRANSLOG_SCANNER_DATA *scanner)
+{
+  uint16 len;
+  DBUG_ENTER("translog_get_next_chunk");
+
+  /* skip the rest of the page or the current chunk */
+  if (translog_scanner_eop(scanner))
+    len= TRANSLOG_PAGE_SIZE - scanner->page_offset;
+  else if ((len= translog_get_total_chunk_length(scanner->page,
+                                                 scanner->page_offset)) == 0)
+    DBUG_RETURN(1);
+  scanner->page_offset+= len;
+
+  if (translog_scanner_eol(scanner))
+  {
+    scanner->page= END_OF_LOG;
+    scanner->page_offset= 0;
+    DBUG_RETURN(0);
+  }
+  if (translog_scanner_eop(scanner))
+  {
+    /* before reading next page we should unpin current one if it was pinned */
+    translog_free_link(scanner->direct_link);
+    /*
+      Forget the released link: on the error returns below the scanner is
+      left to the caller, which may destroy it and would otherwise release
+      the same link once more.
+    */
+    scanner->direct_link= NULL;
+    if (translog_scanner_eof(scanner))
+    {
+      DBUG_PRINT("info", ("horizon: (%lu,0x%lx)  pageaddr: (%lu,0x%lx)",
+                          LSN_IN_PARTS(scanner->horizon),
+                          LSN_IN_PARTS(scanner->page_addr)));
+      /* if it is log end it have to be caught before */
+      DBUG_ASSERT(LSN_FILE_NO(scanner->horizon) >
+                  LSN_FILE_NO(scanner->page_addr));
+      /* go to the page at offset TRANSLOG_PAGE_SIZE of the next file */
+      scanner->page_addr+= LSN_ONE_FILE;
+      scanner->page_addr= LSN_REPLACE_OFFSET(scanner->page_addr,
+                                             TRANSLOG_PAGE_SIZE);
+      if (translog_scanner_set_last_page(scanner))
+        DBUG_RETURN(1);
+    }
+    else
+    {
+      scanner->page_addr+= TRANSLOG_PAGE_SIZE; /* offset increased */
+    }
+
+    if (translog_scanner_get_page(scanner))
+      DBUG_RETURN(1);
+
+    scanner->page_offset= translog_get_first_chunk_offset(scanner->page);
+    if (translog_scanner_eol(scanner))
+    {
+      scanner->page= END_OF_LOG;
+      scanner->page_offset= 0;
+      DBUG_RETURN(0);
+    }
+    DBUG_ASSERT(scanner->page[scanner->page_offset] != TRANSLOG_FILLER);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Get header of variable length record and call hook for it processing
+
+   @param page            Pointer to the buffer with page where LSN chunk is
+                          placed
+   @param page_offset     Offset of the first chunk in the page
+   @param buff            Buffer to be filled with header data
+   @param scanner         If present should be moved to the header page if
+                          it differ from LSN page
+
+   @note For a multi-group record buff->groups is allocated here; on the
+   error and EOF returns it is freed and buff->groups_no is reset to 0.
+
+   @return                Length of header or operation status
+     @retval RECHEADER_READ_ERROR  error
+     @retval RECHEADER_READ_EOF    End of the log reached during the read
+     @retval #                     number of bytes in
+                                   TRANSLOG_HEADER_BUFFER::header where
+                                   stored decoded part of the header
+*/
+
+static int
+translog_variable_length_header(uchar *page, translog_size_t page_offset,
+                                TRANSLOG_HEADER_BUFFER *buff,
+                                TRANSLOG_SCANNER_DATA *scanner)
+{
+  struct st_log_record_type_descriptor *desc= (log_record_type_descriptor +
+                                               buff->type);
+  /* skip the chunk type byte and the 2 bytes of short transaction id */
+  uchar *src= page + page_offset + 1 + 2;
+  uchar *dst= buff->header;
+  LSN base_lsn;
+  uint lsns= desc->compressed_LSN;
+  uint16 chunk_len;
+  uint16 length= desc->read_header_len;
+  uint16 buffer_length= length;
+  uint16 body_len;
+  int rc;
+  TRANSLOG_SCANNER_DATA internal_scanner;
+  DBUG_ENTER("translog_variable_length_header");
+
+  buff->record_length= translog_variable_record_1group_decode_len(&src);
+  chunk_len= uint2korr(src);
+  DBUG_PRINT("info", ("rec len: %lu  chunk len: %u  length: %u  bufflen: %u",
+                      (ulong) buff->record_length, (uint) chunk_len,
+                      (uint) length, (uint) buffer_length));
+  if (chunk_len == 0)
+  {
+    /* zero chunk length: the record consists of one group */
+    uint16 page_rest;
+    DBUG_PRINT("info", ("1 group"));
+    src+= 2;
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+
+    base_lsn= buff->lsn;
+    body_len= min(page_rest, buff->record_length);
+  }
+  else
+  {
+    uint grp_no, curr;
+    uint header_to_skip;
+    uint16 page_rest;
+
+    DBUG_PRINT("info", ("multi-group"));
+    grp_no= buff->groups_no= uint2korr(src + 2);
+    if (!(buff->groups=
+          (TRANSLOG_GROUP*) my_malloc(sizeof(TRANSLOG_GROUP) * grp_no,
+                                      MYF(0))))
+      DBUG_RETURN(RECHEADER_READ_ERROR);
+    DBUG_PRINT("info", ("Groups: %u", (uint) grp_no));
+    src+= (2 + 2);
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    curr= 0;
+    header_to_skip= src - (page + page_offset);
+    buff->chunk0_pages= 0;
+
+    /*
+      Read the group descriptors (7 bytes of address + 1 byte of chunks
+      number each); they can continue in the following chunks.
+    */
+    for (;;)
+    {
+      uint i, read_length= grp_no;
+
+      buff->chunk0_pages++;
+      if (page_rest < grp_no * (7 + 1))
+        read_length= page_rest / (7 + 1);
+      DBUG_PRINT("info", ("Read chunk0 page#%u  read: %u  left: %u  "
+                          "start from: %u",
+                          buff->chunk0_pages, read_length, grp_no, curr));
+      for (i= 0; i < read_length; i++, curr++)
+      {
+        DBUG_ASSERT(curr < buff->groups_no);
+        buff->groups[curr].addr= lsn_korr(src + i * (7 + 1));
+        buff->groups[curr].num= src[i * (7 + 1) + 7];
+        DBUG_PRINT("info", ("group #%u (%lu,0x%lx)  chunks: %u",
+                            curr,
+                            LSN_IN_PARTS(buff->groups[curr].addr),
+                            (uint) buff->groups[curr].num));
+      }
+      grp_no-= read_length;
+      if (grp_no == 0)
+      {
+        /* all descriptors are read, remember where the chunk 0 data is */
+        if (scanner)
+        {
+          buff->chunk0_data_addr= scanner->page_addr;
+          /* offset increased */
+          buff->chunk0_data_addr+= (page_offset + header_to_skip +
+                                    read_length * (7 + 1));
+        }
+        else
+        {
+          buff->chunk0_data_addr= buff->lsn;
+          /* offset increased */
+          buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1));
+        }
+        buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1);
+        DBUG_PRINT("info", ("Data address: (%lu,0x%lx)  len: %u",
+                            LSN_IN_PARTS(buff->chunk0_data_addr),
+                            buff->chunk0_data_len));
+        break;
+      }
+      if (scanner == NULL)
+      {
+        DBUG_PRINT("info", ("use internal scanner for header reading"));
+        scanner= &internal_scanner;
+        if (translog_scanner_init(buff->lsn, 1, scanner, 0))
+        {
+          rc= RECHEADER_READ_ERROR;
+          goto exit_and_free;
+        }
+      }
+      if (translog_get_next_chunk(scanner))
+      {
+        if (scanner == &internal_scanner)
+          translog_destroy_scanner(scanner);
+        rc= RECHEADER_READ_ERROR;
+        goto exit_and_free;
+      }
+      if (scanner->page == END_OF_LOG)
+      {
+        if (scanner == &internal_scanner)
+          translog_destroy_scanner(scanner);
+        rc= RECHEADER_READ_EOF;
+        goto exit_and_free;
+      }
+      page= scanner->page;
+      page_offset= scanner->page_offset;
+      src= page + page_offset + header_to_skip;
+      chunk_len= uint2korr(src - 2 - 2);
+      DBUG_PRINT("info", ("Chunk len: %u", (uint) chunk_len));
+      page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    }
+
+    if (scanner == NULL)
+    {
+      DBUG_PRINT("info", ("use internal scanner"));
+      scanner= &internal_scanner;
+    }
+    else
+    {
+      translog_destroy_scanner(scanner);
+    }
+    base_lsn= buff->groups[0].addr;
+    if (translog_scanner_init(base_lsn, 1, scanner,
+                              scanner == &internal_scanner))
+    {
+      /*
+        The page of the first group could not be read, so scanner->page
+        must not be used; handled like the failed initialization above.
+      */
+      rc= RECHEADER_READ_ERROR;
+      goto exit_and_free;
+    }
+    /* first group chunk is always chunk type 2 */
+    page= scanner->page;
+    page_offset= scanner->page_offset;
+    src= page + page_offset + 1;
+    page_rest= (uint16) (TRANSLOG_PAGE_SIZE - (src - page));
+    body_len= page_rest;
+    if (scanner == &internal_scanner)
+      translog_destroy_scanner(scanner);
+  }
+  if (lsns)
+  {
+    /* decode the relative LSNs stored at the beginning of the header */
+    uchar *start= src;
+    src= translog_relative_LSN_decode(base_lsn, src, dst, lsns);
+    lsns*= LSN_STORE_SIZE;
+    dst+= lsns;
+    length-= lsns;
+    buff->record_length+= (buff->compressed_LSN_economy=
+                           (int) (lsns - (src - start)));
+    DBUG_PRINT("info", ("lsns: %u  length: %u  economy: %d  new length: %lu",
+                        lsns / LSN_STORE_SIZE, (uint) length,
+                        (int) buff->compressed_LSN_economy,
+                        (ulong) buff->record_length));
+    body_len-= (uint16) (src - start);
+  }
+  else
+    buff->compressed_LSN_economy= 0;
+
+  DBUG_ASSERT(body_len >= length);
+  body_len-= length;
+  memcpy(dst, src, length);
+  buff->non_header_data_start_offset= (uint16) (src + length - page);
+  buff->non_header_data_len= body_len;
+  DBUG_PRINT("info", ("non_header_data_start_offset: %u  len: %u  buffer: %u",
+                      buff->non_header_data_start_offset,
+                      buff->non_header_data_len, buffer_length));
+  DBUG_RETURN(buffer_length);
+
+exit_and_free:
+  my_free(buff->groups, MYF(0));
+  buff->groups_no= 0; /* prevent try to use of buff->groups */
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief Read record header from the given buffer
+
+   @param page            page content buffer
+   @param page_offset     offset of the chunk in the page
+   @param buff            destination buffer
+   @param scanner         If this is set the scanner will be moved to the
+                          record header page (differ from LSN page in case of
+                          multi-group records)
+
+   @note buff->lsn is expected to be set by the caller, it is used (and
+   printed) here but not assigned.
+
+   @return Length of header or operation status
+     @retval RECHEADER_READ_ERROR  error
+     @retval #                     number of bytes in
+                                   TRANSLOG_HEADER_BUFFER::header where
+                                   stored decoded part of the header
+*/
+
+int translog_read_record_header_from_buffer(uchar *page,
+                                            uint16 page_offset,
+                                            TRANSLOG_HEADER_BUFFER *buff,
+                                            TRANSLOG_SCANNER_DATA *scanner)
+{
+  /* same type as the return value: it can hold a RECHEADER_READ_* status */
+  int res;
+  DBUG_ENTER("translog_read_record_header_from_buffer");
+  DBUG_PRINT("info", ("page byte: 0x%x  offset: %u",
+                      (uint) page[page_offset], (uint) page_offset));
+  DBUG_ASSERT(translog_is_LSN_chunk(page[page_offset]));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  buff->type= (page[page_offset] & TRANSLOG_REC_TYPE);
+  buff->short_trid= uint2korr(page + page_offset + 1);
+  DBUG_PRINT("info", ("Type %u, Short TrID %u, LSN (%lu,0x%lx)",
+                      (uint) buff->type, (uint)buff->short_trid,
+                      LSN_IN_PARTS(buff->lsn)));
+  /* Read required bytes from the header and call hook */
+  switch (log_record_type_descriptor[buff->type].rclass) {
+  case LOGRECTYPE_VARIABLE_LENGTH:
+    res= translog_variable_length_header(page, page_offset, buff,
+                                         scanner);
+    break;
+  case LOGRECTYPE_PSEUDOFIXEDLENGTH:
+  case LOGRECTYPE_FIXEDLENGTH:
+    res= translog_fixed_length_header(page, page_offset, buff);
+    break;
+  default:
+    DBUG_ASSERT(0); /* we read some junk (got no LSN) */
+    res= RECHEADER_READ_ERROR;
+    break;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Read record header and some fixed part of a record (the part depend
+   on record type).
+
+   @param lsn             log record serial number (address of the record)
+   @param buff            log record header buffer
+
+   @note Some type of record can be read completely by this call
+   @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+   LSN can be translated to absolute one), some fields can be added (like
+   actual header length in the record if the header has variable length)
+
+   @return Length of header or operation status
+     @retval RECHEADER_READ_ERROR  error
+     @retval #                     number of bytes in
+                                   TRANSLOG_HEADER_BUFFER::header where
+                                   stored decoded part of the header
+*/
+
+int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff)
+{
+  TRANSLOG_PAGE_SIZE_BUFF psize_buff;
+  uchar *page;
+  /* same type as the return value: it can hold a RECHEADER_READ_* status */
+  int res;
+  translog_size_t page_offset= LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE;
+  /*
+    Start from "no link" so that translog_free_link() below gets a defined
+    value even if the page read fails before the link is stored.
+  */
+  PAGECACHE_BLOCK_LINK *direct_link= NULL;
+  TRANSLOG_ADDRESS addr;
+  TRANSLOG_VALIDATOR_DATA data;
+  DBUG_ENTER("translog_read_record_header");
+  DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(LSN_OFFSET(lsn) % TRANSLOG_PAGE_SIZE != 0);
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  buff->lsn= lsn;
+  buff->groups_no= 0;
+  data.addr= &addr;
+  data.was_recovered= 0;
+  /* address of the beginning of the page which contains the record */
+  addr= lsn;
+  addr-= page_offset; /* offset decreasing */
+  res= (!(page= translog_get_page(&data, psize_buff.buffer, &direct_link))) ?
+    RECHEADER_READ_ERROR :
+    translog_read_record_header_from_buffer(page, page_offset, buff, 0);
+  translog_free_link(direct_link);
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Read record header and some fixed part of a record (the part depend
+   on record type).
+
+   @param scanner         scanner position to read
+   @param buff            log record header buffer
+   @param move_scanner    request to move scanner to the header position
+
+   @note Some type of record can be read completely by this call
+   @note "Decoded" header stored in TRANSLOG_HEADER_BUFFER::header (relative
+   LSN can be translated to absolute one), some fields can be added (like
+   actual header length in the record if the header has variable length)
+
+   @return Length of header or operation status
+     @retval RECHEADER_READ_ERROR  error
+     @retval #                     number of bytes in
+                                   TRANSLOG_HEADER_BUFFER::header where stored
+                                   decoded part of the header
+*/
+
+int translog_read_record_header_scan(TRANSLOG_SCANNER_DATA *scanner,
+                                     TRANSLOG_HEADER_BUFFER *buff,
+                                     my_bool move_scanner)
+{
+  /* same type as the return value: it can hold a RECHEADER_READ_* status */
+  int res;
+  DBUG_ENTER("translog_read_record_header_scan");
+  DBUG_PRINT("enter", ("Scanner: Cur: (%lu,0x%lx)  Hrz: (%lu,0x%lx)  "
+                       "Lst: (%lu,0x%lx)  Offset: %u(%x)  fixed %d",
+                       LSN_IN_PARTS(scanner->page_addr),
+                       LSN_IN_PARTS(scanner->horizon),
+                       LSN_IN_PARTS(scanner->last_file_page),
+                       (uint) scanner->page_offset,
+                       (uint) scanner->page_offset, scanner->fixed_horizon));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  buff->groups_no= 0;
+  /* LSN of the record is the address of the chunk the scanner points to */
+  buff->lsn= scanner->page_addr;
+  buff->lsn+= scanner->page_offset; /* offset increasing */
+  res= translog_read_record_header_from_buffer(scanner->page,
+                                               scanner->page_offset,
+                                               buff,
+                                               (move_scanner ?
+                                                scanner : 0));
+  DBUG_RETURN(res);
+}
+
+
+/**
+   @brief Read record header and some fixed part of the next record (the part
+   depend on record type).
+
+   @param scanner         data for scanning; the scanner is moved forward
+                          chunk by chunk until a chunk which starts a
+                          record (or the filler) is found.
+                          The scanner must not be NULL, it is used
+                          unconditionally.
+
+   @param buff            log record header buffer
+
+   @note At the end of the log buff->lsn is set to LSN_IMPOSSIBLE.
+
+   @return Length of header or operation status
+     @retval RECHEADER_READ_ERROR  error
+     @retval RECHEADER_READ_EOF    EOF
+     @retval #                     number of bytes in
+                                   TRANSLOG_HEADER_BUFFER::header where
+                                   stored decoded part of the header
+*/
+
+int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+                                     TRANSLOG_HEADER_BUFFER *buff)
+{
+  /* same type as the return value: it can hold a RECHEADER_READ_* status */
+  int res;
+
+  DBUG_ENTER("translog_read_next_record_header");
+  buff->groups_no= 0;        /* to be sure that we will free it right */
+  DBUG_PRINT("enter", ("scanner: 0x%lx", (ulong) scanner));
+  DBUG_PRINT("info", ("Scanner: Cur: (%lu,0x%lx)  Hrz: (%lu,0x%lx)  "
+                      "Lst: (%lu,0x%lx)  Offset: %u(%x)  fixed: %d",
+                      LSN_IN_PARTS(scanner->page_addr),
+                      LSN_IN_PARTS(scanner->horizon),
+                      LSN_IN_PARTS(scanner->last_file_page),
+                      (uint) scanner->page_offset,
+                      (uint) scanner->page_offset, scanner->fixed_horizon));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  /* skip chunks until one which starts a record (or the filler) is met */
+  do
+  {
+    if (translog_get_next_chunk(scanner))
+      DBUG_RETURN(RECHEADER_READ_ERROR);
+    if (scanner->page == END_OF_LOG)
+    {
+       DBUG_PRINT("info", ("End of file from the scanner"));
+       /* Last record was read */
+       buff->lsn= LSN_IMPOSSIBLE;
+       DBUG_RETURN(RECHEADER_READ_EOF);
+    }
+    DBUG_PRINT("info", ("Page: (%lu,0x%lx)  offset: %lu  byte: %x",
+                        LSN_IN_PARTS(scanner->page_addr),
+                        (ulong) scanner->page_offset,
+                        (uint) scanner->page[scanner->page_offset]));
+  } while (!translog_is_LSN_chunk(scanner->page[scanner->page_offset]) &&
+           scanner->page[scanner->page_offset] != TRANSLOG_FILLER);
+
+  if (scanner->page[scanner->page_offset] == TRANSLOG_FILLER)
+  {
+    DBUG_PRINT("info", ("End of file"));
+    /* Last record was read */
+    buff->lsn= LSN_IMPOSSIBLE;
+    /* Return 'end of log' marker */
+    res= RECHEADER_READ_EOF;
+  }
+  else
+    res= translog_read_record_header_scan(scanner, buff, 0);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Moves record data reader to the next chunk and fill the data reader
+  information about that chunk.
+
+  SYNOPSIS
+    translog_record_read_next_chunk()
+    data                 data cursor
+
+  NOTE
+    1 is returned also when the end of the record is reached (data->eor
+    is set, or it was set by a previous call).
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_record_read_next_chunk(TRANSLOG_READER_DATA *data)
+{
+  translog_size_t new_current_offset= data->current_offset + data->chunk_size;
+  uint16 chunk_header_len, chunk_len;
+  uint8 type;
+  DBUG_ENTER("translog_record_read_next_chunk");
+
+  if (data->eor)
+  {
+    DBUG_PRINT("info", ("end of the record flag set"));
+    DBUG_RETURN(1);
+  }
+
+  if (data->header.groups_no &&
+      data->header.groups_no - 1 != data->current_group &&
+      data->header.groups[data->current_group].num == data->current_chunk)
+  {
+    /* Goto next group */
+    data->current_group++;
+    data->current_chunk= 0;
+    DBUG_PRINT("info", ("skip to group: #%u", data->current_group));
+    translog_destroy_scanner(&data->scanner);
+    /* without the page of the group the scanner cannot be used below */
+    if (translog_scanner_init(data->header.groups[data->current_group].addr,
+                              1, &data->scanner, 1))
+      DBUG_RETURN(1);
+  }
+  else
+  {
+    data->current_chunk++;
+    if (translog_get_next_chunk(&data->scanner))
+      DBUG_RETURN(1);
+     if (data->scanner.page == END_OF_LOG)
+     {
+       /*
+         Actually it should not happened, but we want to quit nicely in case
+         of a truncated log
+       */
+       DBUG_RETURN(1);
+     }
+  }
+  type= data->scanner.page[data->scanner.page_offset] & TRANSLOG_CHUNK_TYPE;
+
+  if (type == TRANSLOG_CHUNK_LSN && data->header.groups_no)
+  {
+    /* we are back on the header chunk: read the data stored in chunk 0 */
+    DBUG_PRINT("info",
+               ("Last chunk: data len: %u  offset: %u  group: %u of %u",
+                data->header.chunk0_data_len, data->scanner.page_offset,
+                data->current_group, data->header.groups_no - 1));
+    DBUG_ASSERT(data->header.groups_no - 1 == data->current_group);
+    DBUG_ASSERT(data->header.lsn ==
+                data->scanner.page_addr + data->scanner.page_offset);
+    translog_destroy_scanner(&data->scanner);
+    /* the chunk 0 data cannot be read without its page */
+    if (translog_scanner_init(data->header.chunk0_data_addr, 1,
+                              &data->scanner, 1))
+      DBUG_RETURN(1);
+    data->chunk_size= data->header.chunk0_data_len;
+    data->body_offset= data->scanner.page_offset;
+    data->current_offset= new_current_offset;
+    data->eor= 1;
+    DBUG_RETURN(0);
+  }
+
+  if (type == TRANSLOG_CHUNK_LSN || type == TRANSLOG_CHUNK_FIXED)
+  {
+    data->eor= 1;
+    DBUG_RETURN(1);                             /* End of record */
+  }
+
+  chunk_header_len=
+    translog_get_chunk_header_length(data->scanner.page +
+                                     data->scanner.page_offset);
+  chunk_len= translog_get_total_chunk_length(data->scanner.page,
+                                             data->scanner.page_offset);
+  data->chunk_size= chunk_len - chunk_header_len;
+  data->body_offset= data->scanner.page_offset + chunk_header_len;
+  data->current_offset= new_current_offset;
+  DBUG_PRINT("info", ("grp: %u  chunk: %u  body_offset: %u  chunk_size: %u  "
+                      "current_offset: %lu",
+                      (uint) data->current_group,
+                      (uint) data->current_chunk,
+                      (uint) data->body_offset,
+                      (uint) data->chunk_size, (ulong) data->current_offset));
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Initialize record reader data from LSN
+
+  SYNOPSIS
+    translog_init_reader_data()
+    lsn                  reference to LSN we should start from
+    data                 reader data to initialize
+
+  NOTE
+    The end of the log met while the header is read is treated as an
+    error: in this case the header reader has not filled the fields which
+    are copied from data->header below.
+
+  RETURN
+    0  OK
+    1  Error
+*/
+
+static my_bool translog_init_reader_data(LSN lsn,
+                                         TRANSLOG_READER_DATA *data)
+{
+  int read_header;
+  DBUG_ENTER("translog_init_reader_data");
+  if (translog_scanner_init(lsn, 1, &data->scanner, 1))
+    DBUG_RETURN(1);
+  read_header= translog_read_record_header_scan(&data->scanner,
+                                                &data->header, 1);
+  /*
+    NOTE(review): on this return the scanner stays initialized and is not
+    destroyed here -- confirm who releases its direct link.
+  */
+  if (read_header == RECHEADER_READ_ERROR ||
+      read_header == RECHEADER_READ_EOF)
+    DBUG_RETURN(1);
+  data->read_header= read_header;
+  data->body_offset= data->header.non_header_data_start_offset;
+  data->chunk_size= data->header.non_header_data_len;
+  data->current_offset= data->read_header;
+  data->current_group= 0;
+  data->current_chunk= 0;
+  data->eor= 0;
+  DBUG_PRINT("info", ("read_header: %u  "
+                      "body_offset: %u  chunk_size: %u  current_offset: %lu",
+                      (uint) data->read_header,
+                      (uint) data->body_offset,
+                      (uint) data->chunk_size, (ulong) data->current_offset));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Destroy reader data object
+
+  Destroys the scanner of the reader and frees the record header stored
+  in it.
+
+  @param data            reader data to destroy
+*/
+
+static void translog_destroy_reader_data(TRANSLOG_READER_DATA *data)
+{
+  translog_destroy_scanner(&data->scanner);
+  translog_free_record_header(&data->header);
+}
+
+
+/*
+  Read a part of the record.
+
+  SYNOPSIS
+    translog_read_record()
+    lsn                  log record serial number (address of the record)
+    offset               From the beginning of the record beginning (read
+                         by translog_read_record_header).
+    length               Length of record part which have to be read.
+    buffer               Buffer where to read the record part (have to be at
+                         least 'length' bytes length)
+    data                 Reader data to use; if it is NULL an internal one
+                         is used (lsn has to be given in this case)
+
+  NOTE
+    The reader data is (re)initialized from lsn if lsn is given, or if the
+    requested offset is before the current position of the reader and the
+    requested part does not lie in the already read header.
+    The reader data is destroyed before every return except the one taken
+    when its initialization fails.
+
+  RETURN
+    length of data actually read (0 if the reader data cannot be
+    initialized)
+*/
+
+translog_size_t translog_read_record(LSN lsn,
+                                     translog_size_t offset,
+                                     translog_size_t length,
+                                     uchar *buffer,
+                                     TRANSLOG_READER_DATA *data)
+{
+  translog_size_t requested_length= length;
+  translog_size_t end= offset + length;
+  TRANSLOG_READER_DATA internal_data;
+  DBUG_ENTER("translog_read_record");
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (data == NULL)
+  {
+    /* the internal reader data is not initialized yet, so lsn is required */
+    DBUG_ASSERT(lsn != LSN_IMPOSSIBLE);
+    data= &internal_data;
+  }
+  if (lsn ||
+      (offset < data->current_offset &&
+       !(offset < data->read_header && offset + length < data->read_header)))
+  {
+    if (translog_init_reader_data(lsn, data))
+      DBUG_RETURN(0);
+  }
+  DBUG_PRINT("info", ("Offset: %lu  length: %lu  "
+                      "Scanner: Cur: (%lu,0x%lx)  Hrz: (%lu,0x%lx)  "
+                      "Lst: (%lu,0x%lx)  Offset: %u(%x)  fixed: %d",
+                      (ulong) offset, (ulong) length,
+                      LSN_IN_PARTS(data->scanner.page_addr),
+                      LSN_IN_PARTS(data->scanner.horizon),
+                      LSN_IN_PARTS(data->scanner.last_file_page),
+                      (uint) data->scanner.page_offset,
+                      (uint) data->scanner.page_offset,
+                      data->scanner.fixed_horizon));
+  if (offset < data->read_header)
+  {
+    /* the beginning of the requested part is in the decoded header */
+    uint16 len= min(data->read_header, end) - offset;
+    DBUG_PRINT("info",
+               ("enter header offset: %lu  length: %lu",
+                (ulong) offset, (ulong) length));
+    memcpy(buffer, data->header.header + offset, len);
+    length-= len;
+    if (length == 0)
+    {
+      translog_destroy_reader_data(data);
+      DBUG_RETURN(requested_length);
+    }
+    offset+= len;
+    buffer+= len;
+    DBUG_PRINT("info",
+               ("len: %u  offset: %lu   curr: %lu  length: %lu",
+                len, (ulong) offset, (ulong) data->current_offset,
+                (ulong) length));
+  }
+  /* TODO: find first page which we should read by offset */
+
+  /* read the record chunk by chunk */
+  for(;;)
+  {
+    /* record offset just after the data of the current chunk */
+    uint page_end= data->current_offset + data->chunk_size;
+    DBUG_PRINT("info",
+               ("enter body offset: %lu  curr: %lu  "
+                "length: %lu  page_end: %lu",
+                (ulong) offset, (ulong) data->current_offset, (ulong) length,
+                (ulong) page_end));
+    if (offset < page_end)
+    {
+      /* a part of the requested data is in the current chunk, copy it */
+      uint len= page_end - offset;
+      set_if_smaller(len, length); /* in case we read beyond record's end */
+      DBUG_ASSERT(offset >= data->current_offset);
+      memcpy(buffer,
+              data->scanner.page + data->body_offset +
+              (offset - data->current_offset), len);
+      length-= len;
+      if (length == 0)
+      {
+        translog_destroy_reader_data(data);
+        DBUG_RETURN(requested_length);
+      }
+      offset+= len;
+      buffer+= len;
+      DBUG_PRINT("info",
+                 ("len: %u  offset: %lu  curr: %lu  length: %lu",
+                  len, (ulong) offset, (ulong) data->current_offset,
+                  (ulong) length));
+    }
+    if (translog_record_read_next_chunk(data))
+    {
+      /* error or end of the record: return what has been read so far */
+      translog_destroy_reader_data(data);
+      DBUG_RETURN(requested_length - length);
+    }
+  }
+}
+
+
+/*
+  @brief Force skipping to the next buffer
+
+  Closes the buffer the log cursor (log_descriptor.bc) currently writes to
+  and restarts the cursor in the next buffer of the ring
+  ((buffer_no + 1) % TRANSLOG_BUFFERS_NO).
+
+  'data' points to the beginning of the page the cursor is on and 'left' is
+  the number of bytes still free on that page. If the page is only partly
+  filled (left != 0) its free tail in the old buffer is filled with
+  TRANSLOG_FILLER, and the new buffer is started as an overlay whose offset
+  is the beginning of this page, so writing can continue on the same page in
+  the new buffer. Sector protection and page CRC (when enabled in
+  log_descriptor.flags) are applied to the old copy of the page; the filled
+  part of the page is then copied to the new buffer, or marked as
+  'skipped_data' when no protection is enabled.
+
+  @note The caller must satisfy translog_lock_assert_owner().
+  NOTE(review): the caller seen in translog_flush_buffers() unlocks the old
+  buffer itself after this call, so presumably the old buffer stays locked
+  on return - confirm.
+
+  @todo Do not copy old page content if all page protections are switched off
+  (because we do not need to calculate anything or change old parts of the
+  page)
+*/
+
+static void translog_force_current_buffer_to_finish()
+{
+  TRANSLOG_ADDRESS new_buff_beginning;
+  uint16 old_buffer_no= log_descriptor.bc.buffer_no;
+  uint16 new_buffer_no= (old_buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+  struct st_translog_buffer *new_buffer= (log_descriptor.buffers +
+                                          new_buffer_no);
+  struct st_translog_buffer *old_buffer= log_descriptor.bc.buffer;
+  uchar *data= log_descriptor.bc.ptr - log_descriptor.bc.current_page_fill;
+  uint16 left= TRANSLOG_PAGE_SIZE - log_descriptor.bc.current_page_fill;
+  uint16 current_page_fill, write_counter, previous_offset;
+  DBUG_ENTER("translog_force_current_buffer_to_finish");
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx  "
+                       "Buffer addr: (%lu,0x%lx)  "
+                       "Page addr: (%lu,0x%lx)  "
+                       "size: %lu (%lu)  Pg: %u  left: %u  in progress %u",
+                       (uint) old_buffer_no,
+                       (ulong) old_buffer,
+                       LSN_IN_PARTS(old_buffer->offset),
+                       (ulong) LSN_FILE_NO(log_descriptor.horizon),
+                       (ulong) (LSN_OFFSET(log_descriptor.horizon) -
+                                log_descriptor.bc.current_page_fill),
+                       (ulong) old_buffer->size,
+                       (ulong) (log_descriptor.bc.ptr -log_descriptor.bc.
+                                buffer->buffer),
+                       (uint) log_descriptor.bc.current_page_fill,
+                       (uint) left,
+                       (uint) old_buffer->
+                       copy_to_buffer_in_progress));
+  translog_lock_assert_owner();
+  LINT_INIT(current_page_fill);
+  new_buff_beginning= old_buffer->offset;
+  new_buff_beginning+= old_buffer->size; /* increase offset: old buffer end */
+
+  DBUG_ASSERT(log_descriptor.bc.ptr !=NULL);
+  DBUG_ASSERT(LSN_FILE_NO(log_descriptor.horizon) ==
+              LSN_FILE_NO(old_buffer->offset));
+  translog_check_cursor(&log_descriptor.bc);
+  DBUG_ASSERT(left < TRANSLOG_PAGE_SIZE);
+  if (left)
+  {
+    /*
+       TODO: if 'left' is so small that can't hold any other record
+       then do not move the page
+    */
+    DBUG_PRINT("info", ("left: %u", (uint) left));
+
+    old_buffer->pre_force_close_horizon=
+      old_buffer->offset + old_buffer->size;
+    /* decrease offset: the new buffer begins at the start of this page */
+    new_buff_beginning-= log_descriptor.bc.current_page_fill;
+    current_page_fill= log_descriptor.bc.current_page_fill;
+
+    memset(log_descriptor.bc.ptr, TRANSLOG_FILLER, left);
+    old_buffer->size+= left;
+    DBUG_PRINT("info", ("Finish Page buffer #%u: 0x%lx  "
+                        "Size: %lu",
+                        (uint) old_buffer->buffer_no,
+                        (ulong) old_buffer,
+                        (ulong) old_buffer->size));
+    DBUG_ASSERT(old_buffer->buffer_no ==
+                log_descriptor.bc.buffer_no);
+  }
+  else
+  {
+    log_descriptor.bc.current_page_fill= 0;
+  }
+
+  translog_buffer_lock(new_buffer);
+#ifndef DBUG_OFF
+  {
+    TRANSLOG_ADDRESS offset= new_buffer->offset;
+    TRANSLOG_FILE *file= new_buffer->file;
+    uint8 ver= new_buffer->ver;
+    translog_lock_assert_owner();
+#endif
+    translog_wait_for_buffer_free(new_buffer);
+#ifndef DBUG_OFF
+    /*
+      We keep the handler locked so nobody can start this new buffer.
+      The assertion checks that the wait left the offset unchanged, that the
+      buffer has no file attached, and that 'ver' advanced by one only if a
+      file was attached before the wait.
+    */
+    DBUG_ASSERT(offset == new_buffer->offset && new_buffer->file == NULL &&
+                (file == NULL ? ver : (uint8)(ver + 1)) == new_buffer->ver);
+  }
+#endif
+
+  write_counter= log_descriptor.bc.write_counter;
+  previous_offset= log_descriptor.bc.previous_offset;
+  translog_start_buffer(new_buffer, &log_descriptor.bc, new_buffer_no);
+  /*
+    Fix buffer offset (which was incorrectly set to horizon) and restore the
+    cursor counters saved just before translog_start_buffer()
+  */
+  log_descriptor.bc.buffer->offset= new_buff_beginning;
+  log_descriptor.bc.write_counter= write_counter;
+  log_descriptor.bc.previous_offset= previous_offset;
+  new_buffer->prev_last_lsn= BUFFER_MAX_LSN(old_buffer);
+  DBUG_PRINT("info", ("prev_last_lsn set to (%lu,0x%lx)  buffer: 0x%lx",
+                      LSN_IN_PARTS(new_buffer->prev_last_lsn),
+                      (ulong) new_buffer));
+
+  /*
+    Advance the log pointer, increase the writers count and let other
+    threads write to the log while we process the old page content.
+    For a partly filled page the cursor is moved past the part of the page
+    that is already filled and the new buffer is marked as an overlay;
+    otherwise a new page header is written.
+  */
+  if (left)
+  {
+    log_descriptor.bc.ptr+= current_page_fill;
+    log_descriptor.bc.buffer->size= log_descriptor.bc.current_page_fill=
+      current_page_fill;
+    new_buffer->overlay= 1;
+  }
+  else
+    translog_new_page_header(&log_descriptor.horizon, &log_descriptor.bc);
+  translog_buffer_increase_writers(new_buffer);
+  translog_buffer_unlock(new_buffer);
+
+  /*
+    We have to wait until all writers finish before we start changing the
+    pages by applying protection and copying the page content into the
+    new buffer.
+  */
+#ifndef DBUG_OFF
+  {
+    TRANSLOG_ADDRESS offset= old_buffer->offset;
+    TRANSLOG_FILE *file= old_buffer->file;
+    uint8 ver= old_buffer->ver;
+#endif
+    /*
+      Only one thread at a time can flush the log (a buffer can be flushed
+      by many threads, but the log flush, where this function is used, can
+      be done by only one thread), so no other thread can set
+      is_closing_buffer.
+    */
+    DBUG_ASSERT(!old_buffer->is_closing_buffer);
+    old_buffer->is_closing_buffer= 1; /* Other flushes will wait */
+    DBUG_PRINT("enter", ("Buffer #%u 0x%lx  is_closing_buffer set",
+                         (uint) old_buffer->buffer_no, (ulong) old_buffer));
+    translog_wait_for_writers(old_buffer);
+#ifndef DBUG_OFF
+    /* We blocked flushing this buffer so the buffer should not have changed */
+    DBUG_ASSERT(offset == old_buffer->offset && file == old_buffer->file &&
+                ver == old_buffer->ver);
+  }
+#endif
+
+  if (log_descriptor.flags & TRANSLOG_SECTOR_PROTECTION)
+  {
+    translog_put_sector_protection(data, &log_descriptor.bc);
+    if (left)
+    {
+      log_descriptor.bc.write_counter++;
+      log_descriptor.bc.previous_offset= current_page_fill;
+    }
+    else
+    {
+      DBUG_PRINT("info", ("drop write_counter"));
+      log_descriptor.bc.write_counter= 0;
+      log_descriptor.bc.previous_offset= 0;
+    }
+  }
+
+  if (log_descriptor.flags & TRANSLOG_PAGE_CRC)
+  {
+    uint32 crc= translog_crc(data + log_descriptor.page_overhead,
+                             TRANSLOG_PAGE_SIZE -
+                             log_descriptor.page_overhead);
+    DBUG_PRINT("info", ("CRC: 0x%lx", (ulong) crc));
+    int4store(data + 3 + 3 + 1, crc); /* NOTE(review): CRC slot offset 7? */
+  }
+  old_buffer->is_closing_buffer= 0;
+  DBUG_PRINT("enter", ("Buffer #%u 0x%lx  is_closing_buffer cleared",
+                       (uint) old_buffer->buffer_no, (ulong) old_buffer));
+  pthread_cond_broadcast(&old_buffer->waiting_filling_buffer);
+
+  if (left)
+  {
+    if (log_descriptor.flags &
+        (TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION))
+      memcpy(new_buffer->buffer, data, current_page_fill);
+    else
+    {
+      /*
+        This page header does not change if we add more data to the page,
+        so we can skip copying it; the skipped part is recorded in
+        skipped_data (in debug builds it is filled with 0xa5).
+        NOTE(review): presumably the flush code leaves the skipped part
+        untouched on disk - confirm against the code that uses skipped_data.
+      */
+      new_buffer->skipped_data= current_page_fill;
+#ifndef DBUG_OFF
+      memset(new_buffer->buffer, 0xa5, current_page_fill);
+#endif
+      DBUG_ASSERT(new_buffer->skipped_data < TRANSLOG_PAGE_SIZE);
+    }
+  }
+  old_buffer->next_buffer_offset= new_buffer->offset;
+  translog_buffer_lock(new_buffer);
+  new_buffer->prev_buffer_offset= old_buffer->offset;
+  translog_buffer_decrease_writers(new_buffer); /* pairs with increase above */
+  translog_buffer_unlock(new_buffer);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Blocks until the log is flushed at least up to the given LSN
+
+  @param  lsn            log record serial number up to which (inclusive)
+                         the log has to be flushed
+
+  @note The caller must own log_descriptor.log_flush_lock. The mutex is
+  handed to pthread_cond_wait() on log_flush_cond, so it is released while
+  waiting and owned again when the function returns.
+*/
+
+void  translog_flush_wait_for_end(LSN lsn)
+{
+  DBUG_ENTER("translog_flush_wait_for_end");
+  DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
+  for (;;)
+  {
+    /* Done as soon as the flushed position has reached the requested LSN */
+    if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+      break;
+    pthread_cond_wait(&log_descriptor.log_flush_cond,
+                      &log_descriptor.log_flush_lock);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Registers a goal for the next flush pass and waits until the
+  current pass is over.
+
+  If the given LSN is bigger than the goal already registered for the next
+  pass, it becomes the new goal, the calling thread is recorded as its
+  requester and new_goal_cond is broadcast. The function then waits on
+  log_flush_cond until log_descriptor.flush_no differs from the value seen
+  on entry.
+
+  @param  lsn            log record serial number up to which (inclusive)
+                         the log has to be flushed
+
+  @note The caller must own log_descriptor.log_flush_lock.
+*/
+
+void translog_flush_set_new_goal_and_wait(TRANSLOG_ADDRESS lsn)
+{
+  int entry_flush_no= log_descriptor.flush_no;
+  DBUG_ENTER("translog_flush_set_new_goal_and_wait");
+  DBUG_PRINT("enter", ("LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  safe_mutex_assert_owner(&log_descriptor.log_flush_lock);
+
+  if (cmp_translog_addr(lsn, log_descriptor.next_pass_max_lsn) > 0)
+  {
+    /* Our request is the biggest one so far: publish it */
+    log_descriptor.next_pass_max_lsn= lsn;
+    log_descriptor.max_lsn_requester= pthread_self();
+    pthread_cond_broadcast(&log_descriptor.new_goal_cond);
+  }
+
+  for (;;)
+  {
+    /* A changed flush_no means the pass running at entry has ended */
+    if (log_descriptor.flush_no != entry_flush_no)
+      break;
+    pthread_cond_wait(&log_descriptor.log_flush_cond,
+                      &log_descriptor.log_flush_lock);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief sync() a range of log files (inclusive) and, on request, the
+  log directory
+
+  Files already marked as synced (is_sync) are skipped. If syncing a file
+  fails, log writing is stopped with translog_stop_writing() and the
+  function returns immediately: remaining files and the directory are not
+  synced. When group_commit_wait is not zero, flush_start is set to the
+  current my_micro_time() before the files are processed.
+
+  @param min             min internal file number to flush
+  @param max             max internal file number to flush
+  @param sync_dir        need sync directory
+
+  return Operation status
+    @retval 0      OK
+    @retval 1      Error
+*/
+
+static my_bool translog_sync_files(uint32 min, uint32 max,
+                                   my_bool sync_dir)
+{
+  uint file_no;
+  my_bool res= 0;
+  ulonglong interval;
+  DBUG_ENTER("translog_sync_files");
+  DBUG_PRINT("info", ("min: %lu  max: %lu  sync dir: %d",
+                      (ulong) min, (ulong) max, (int) sync_dir));
+  DBUG_ASSERT(min <= max);
+
+  interval= group_commit_wait;
+  if (interval)
+    flush_start= my_micro_time();
+
+  for (file_no= min; file_no <= max; file_no++)
+  {
+    TRANSLOG_FILE *log_file= get_logfile_by_number(file_no);
+    DBUG_ASSERT(log_file != NULL);
+    if (log_file->is_sync)
+      continue;                                 /* nothing to do */
+    if (my_sync(log_file->handler.file, MYF(MY_WME)))
+    {
+      /* Can't guarantee durability any more: stop the log */
+      translog_stop_writing();
+      DBUG_RETURN(1);
+    }
+    translog_syncs++;
+    log_file->is_sync= 1;
+  }
+
+  if (sync_dir)
+  {
+    res= sync_dir(log_descriptor.directory_fd,
+                  MYF(MY_WME | MY_IGNORE_BADFD));
+    if (!res)
+      translog_syncs++;
+  }
+
+  DBUG_RETURN(res);
+}
+
+
+/*
+  @brief Flushes buffers with LSNs in them less or equal address <lsn>
+
+  @param lsn             address up to which all LSNs should be flushed,
+                         can be reset to real last LSN address
+  @param sent_to_disk    returns 'sent to disk' position
+  @param flush_horizon   in/out: the incoming value is read by an assertion
+                         before it is replaced, so the caller has to
+                         initialize it; returns horizon of the flush
+
+  @note About terminology see comment to translog_flush().
+
+  @note A lock held by the caller is released here on both paths: either
+  translog_unlock() is called, or, after the current buffer has been closed
+  with translog_force_current_buffer_to_finish(), that buffer is unlocked
+  with translog_buffer_unlock(). The caller in translog_flush() calls
+  translog_lock() before each call of this function.
+*/
+
+void translog_flush_buffers(TRANSLOG_ADDRESS *lsn,
+                               TRANSLOG_ADDRESS *sent_to_disk,
+                               TRANSLOG_ADDRESS *flush_horizon)
+{
+  dirty_buffer_mask_t dirty_buffer_mask;
+  uint i;
+  uint8 last_buffer_no, start_buffer_no;
+  DBUG_ENTER("translog_flush_buffers");
+
+  /*
+    We will recheck information when will lock buffers one by
+    one so we can use unprotected read here (this is just for
+    speed up buffers processing).
+    The loop below starts right after the current buffer and walks the ring
+    until it meets a buffer whose bit is set in the mask, or comes back to
+    the current buffer; that position is the first buffer to check.
+  */
+  dirty_buffer_mask= log_descriptor.dirty_buffer_mask;
+  DBUG_PRINT("info", ("Dirty buffer mask: %lx  current buffer: %u",
+                      (ulong) dirty_buffer_mask,
+                      (uint) log_descriptor.bc.buffer_no));
+  for (i= (log_descriptor.bc.buffer_no + 1) % TRANSLOG_BUFFERS_NO;
+       i != log_descriptor.bc.buffer_no && !(dirty_buffer_mask & (1 << i));
+       i= (i + 1) % TRANSLOG_BUFFERS_NO) {} /* empty body: search only */
+  start_buffer_no= i;
+
+  DBUG_PRINT("info",
+             ("start from: %u  current: %u  prev last lsn: (%lu,0x%lx)",
+              (uint) start_buffer_no, (uint) log_descriptor.bc.buffer_no,
+              LSN_IN_PARTS(log_descriptor.bc.buffer->prev_last_lsn)));
+
+
+  /*
+    If the LSN up to which we have to flush is bigger than the maximum LSN
+    of the previous buffer and at least one LSN was saved in the current
+    buffer (last_lsn != LSN_IMPOSSIBLE) then we have to close the current
+    buffer; it becomes the last buffer to flush. Otherwise the last buffer
+    to flush is the one before the current buffer.
+  */
+  if (cmp_translog_addr(*lsn, log_descriptor.bc.buffer->prev_last_lsn) > 0 &&
+      log_descriptor.bc.buffer->last_lsn != LSN_IMPOSSIBLE)
+  {
+    struct st_translog_buffer *buffer= log_descriptor.bc.buffer;
+    *lsn= log_descriptor.bc.buffer->last_lsn; /* fix lsn if it was horizon */
+    DBUG_PRINT("info", ("LSN to flush fixed to last lsn: (%lu,0x%lx)",
+                        LSN_IN_PARTS(log_descriptor.bc.buffer->last_lsn)));
+    last_buffer_no= log_descriptor.bc.buffer_no;
+    log_descriptor.is_everything_flushed= 1;
+    translog_force_current_buffer_to_finish();
+    translog_buffer_unlock(buffer); /* 'buffer' is the old (closed) buffer */
+  }
+  else
+  {
+    last_buffer_no= ((log_descriptor.bc.buffer_no + TRANSLOG_BUFFERS_NO -1) %
+                     TRANSLOG_BUFFERS_NO);
+    translog_unlock();
+  }
+
+  /*
+    flush buffers: only needed if something up to *lsn is not yet sent to
+    disk
+  */
+  *sent_to_disk= translog_get_sent_to_disk();
+  if (cmp_translog_addr(*lsn, *sent_to_disk) > 0)
+  {
+
+    DBUG_PRINT("info", ("Start buffer #: %u  last buffer #: %u",
+                        (uint) start_buffer_no, (uint) last_buffer_no));
+    last_buffer_no= (last_buffer_no + 1) % TRANSLOG_BUFFERS_NO; /* end mark */
+    i= start_buffer_no;
+    do
+    {
+      struct st_translog_buffer *buffer= log_descriptor.buffers + i;
+      translog_buffer_lock(buffer);
+      DBUG_PRINT("info", ("Check buffer: 0x%lx  #: %u  "
+                          "prev last LSN: (%lu,0x%lx)  "
+                          "last LSN: (%lu,0x%lx)  status: %s",
+                          (ulong)(buffer),
+                          (uint) i,
+                          LSN_IN_PARTS(buffer->prev_last_lsn),
+                          LSN_IN_PARTS(buffer->last_lsn),
+                          (buffer->file ?
+                           "dirty" : "closed")));
+      if (buffer->prev_last_lsn <= *lsn &&
+          buffer->file != NULL)
+      {
+        DBUG_ASSERT(*flush_horizon <= buffer->offset + buffer->size);
+        *flush_horizon= (buffer->pre_force_close_horizon != LSN_IMPOSSIBLE ?
+                         buffer->pre_force_close_horizon :
+                         buffer->offset + buffer->size);
+        /*
+          pre_force_close_horizon is reset during new buffer start; when it
+          is set it is used instead of the end of the buffer
+        */
+        DBUG_PRINT("info", ("flush_horizon: (%lu,0x%lx)",
+                            LSN_IN_PARTS(*flush_horizon)));
+        DBUG_ASSERT(*flush_horizon <= log_descriptor.horizon);
+
+        translog_buffer_flush(buffer);
+      }
+      translog_buffer_unlock(buffer);
+      i= (i + 1) % TRANSLOG_BUFFERS_NO;
+    } while (i != last_buffer_no);
+    *sent_to_disk= translog_get_sent_to_disk();
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Flush the log up to given LSN (included)
+
+  @param  lsn            log record serial number up to which (inclusive)
+                         the log has to be flushed
+
+  @return Operation status
+    @retval 0      OK
+    @retval 1      Error
+
+  @note
+
+  - Non group commit logic: Commits made in passes. Thread which started
+  flush first is performing actual flush, other threads sets new goal (LSN)
+  of the next pass (if it is maximum) and waits for the pass end or just
+  wait for the pass end.
+
+  - If hard group commit enabled and rate set to zero:
+  The first thread sends all changed buffers to disk. This is repeated
+  as long as there are new LSNs added. The process can not loop
+  forever because we have limited number of threads and they will wait
+  for the data to be synced.
+  Pseudo code:
+
+   do
+     send changed buffers to disk
+   while new_goal
+   sync
+
+  - If hard group commit switched ON and less than rate microseconds has
+  passed from last sync, then after buffers have been sent to disk
+  wait until rate microseconds has passed since last sync, do sync and return.
+  This ensures that if we call sync infrequently we don't do any waits.
+
+  - If soft group commit enabled everything works as with 'non group commit'
+  but the thread doesn't do any real sync(). If rate is not zero the
+  sync() will be performed by a service thread with the given rate
+  when needed (new LSN appears).
+
+  @note Terminology:
+  'sent to disk' means written to disk but not sync()ed,
+  'flushed' mean sent to disk and synced().
+
+  @note Units: the group commit interval is compared with differences of
+  my_micro_time() values, i.e. it is handled in microseconds, while
+  set_timespec_nsec() takes nanoseconds. The remaining wait is therefore
+  converted to nanoseconds before the timed wait is armed.
+*/
+
+my_bool translog_flush(TRANSLOG_ADDRESS lsn)
+{
+  struct timespec abstime;
+  ulonglong flush_interval;
+  ulonglong time_spent;
+  ulonglong wait_nsec;
+  LSN sent_to_disk= LSN_IMPOSSIBLE;
+  TRANSLOG_ADDRESS flush_horizon;
+  my_bool rc= 0;
+  my_bool hgroup_commit_at_start;
+  DBUG_ENTER("translog_flush");
+  DBUG_PRINT("enter", ("Flush up to LSN: (%lu,0x%lx)", LSN_IN_PARTS(lsn)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+  LINT_INIT(sent_to_disk);
+  LINT_INIT(flush_interval);
+
+  pthread_mutex_lock(&log_descriptor.log_flush_lock);
+  DBUG_PRINT("info", ("Everything is flushed up to (%lu,0x%lx)",
+                      LSN_IN_PARTS(log_descriptor.flushed)));
+  if (cmp_translog_addr(log_descriptor.flushed, lsn) >= 0)
+  {
+    /* Nothing to do: the requested LSN is already flushed */
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+    DBUG_RETURN(0);
+  }
+  if (log_descriptor.flush_in_progress)
+  {
+    /*
+      Another thread is running a flush pass: register our goal for the
+      next pass and wait for the current pass to end.
+    */
+    translog_lock();
+    /* fix lsn if it was horizon */
+    if (cmp_translog_addr(lsn, log_descriptor.bc.buffer->last_lsn) > 0)
+      lsn= BUFFER_MAX_LSN(log_descriptor.bc.buffer);
+    translog_unlock();
+    translog_flush_set_new_goal_and_wait(lsn);
+    if (!pthread_equal(log_descriptor.max_lsn_requester, pthread_self()))
+    {
+      /*
+        We are not the requester of the biggest goal, so some other thread
+        performs the next pass.
+        translog_flush_wait_for_end() release log_flush_lock while is
+        waiting then acquire it again
+      */
+      translog_flush_wait_for_end(lsn);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_RETURN(0);
+    }
+    /* We requested the biggest goal: take it and do the pass ourselves */
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+  }
+  log_descriptor.flush_in_progress= 1;
+  flush_horizon= log_descriptor.previous_flush_horizon;
+  DBUG_PRINT("info", ("flush_in_progress is set, flush_horizon: (%lu,0x%lx)",
+                      LSN_IN_PARTS(flush_horizon)));
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+  /* Sample the mode once so that the whole pass uses one setting */
+  hgroup_commit_at_start= hard_group_commit;
+  if (hgroup_commit_at_start)
+    flush_interval= group_commit_wait;
+
+  translog_lock();
+  if (log_descriptor.is_everything_flushed)
+  {
+    DBUG_PRINT("info", ("everything is flushed"));
+    translog_unlock();
+    pthread_mutex_lock(&log_descriptor.log_flush_lock);
+    goto out;
+  }
+
+  for (;;)
+  {
+    /* Following function flushes buffers and makes translog_unlock() */
+    translog_flush_buffers(&lsn, &sent_to_disk, &flush_horizon);
+
+    if (!hgroup_commit_at_start)
+      break;  /* flush pass is ended */
+
+retest:
+    /*
+      We do not check time here because pthread_mutex_lock rarely takes
+      a lot of time so we can sacrifice a bit precision to performance
+      (taking into account that my_micro_time() might be expensive call).
+    */
+    if (flush_interval == 0)
+      break;  /* flush pass is ended */
+
+    pthread_mutex_lock(&log_descriptor.log_flush_lock);
+    if (log_descriptor.next_pass_max_lsn == LSN_IMPOSSIBLE)
+    {
+      if (flush_interval == 0 ||
+          (time_spent= (my_micro_time() - flush_start)) >= flush_interval)
+      {
+        pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+        break;
+      }
+      DBUG_PRINT("info", ("flush waits: %llu  interval: %llu  spent: %llu",
+                          flush_interval - time_spent,
+                          flush_interval, time_spent));
+      /*
+        Wait for the rest of the interval or for the next goal.
+        The remainder is in microseconds but set_timespec_nsec() expects
+        nanoseconds, so convert it. The value is passed as a plain
+        identifier so that the macro gets a single token as its argument.
+      */
+      wait_nsec= (flush_interval - time_spent) * 1000;
+      set_timespec_nsec(abstime, wait_nsec);
+      pthread_cond_timedwait(&log_descriptor.new_goal_cond,
+                             &log_descriptor.log_flush_lock,
+                             &abstime);
+      pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+      DBUG_PRINT("info", ("retest conditions"));
+      goto retest;
+    }
+
+    /* take next goal */
+    lsn= log_descriptor.next_pass_max_lsn;
+    log_descriptor.next_pass_max_lsn= LSN_IMPOSSIBLE;
+    /* prevent other thread from continue */
+    log_descriptor.max_lsn_requester= pthread_self();
+    DBUG_PRINT("info", ("flush took next goal: (%lu,0x%lx)",
+                        LSN_IN_PARTS(lsn)));
+    pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+
+    /* next flush pass */
+    DBUG_PRINT("info", ("next flush pass"));
+    translog_lock();
+  }
+
+  /*
+    sync() files from previous flush till current one.
+    The directory is synced too if sync_log_dir asks for it and the flush
+    horizon moved to another file or another page since the previous flush.
+  */
+  if (!soft_sync || hgroup_commit_at_start)
+  {
+    if ((rc=
+         translog_sync_files(LSN_FILE_NO(log_descriptor.flushed),
+                             LSN_FILE_NO(lsn),
+                             sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS &&
+                             (LSN_FILE_NO(log_descriptor.
+                                          previous_flush_horizon) !=
+                              LSN_FILE_NO(flush_horizon) ||
+                              (LSN_OFFSET(log_descriptor.
+                                          previous_flush_horizon) /
+                               TRANSLOG_PAGE_SIZE) !=
+                              (LSN_OFFSET(flush_horizon) /
+                               TRANSLOG_PAGE_SIZE)))))
+    {
+      /* Sync failed: do not advance the 'flushed' position */
+      sent_to_disk= LSN_IMPOSSIBLE;
+      pthread_mutex_lock(&log_descriptor.log_flush_lock);
+      goto out;
+    }
+    /* keep values for soft sync() and forced sync() actual */
+    {
+      uint32 fileno= LSN_FILE_NO(lsn);
+      soft_sync_min= fileno;
+      soft_sync_max= fileno;
+    }
+  }
+  else
+  {
+    /* Soft sync: only record what has to be synced later */
+    soft_sync_max= LSN_FILE_NO(lsn);
+    soft_need_sync= 1;
+  }
+
+  DBUG_ASSERT(flush_horizon <= log_descriptor.horizon);
+
+  pthread_mutex_lock(&log_descriptor.log_flush_lock);
+  log_descriptor.previous_flush_horizon= flush_horizon;
+out:
+  /* Here log_flush_lock is held on every path */
+  if (sent_to_disk != LSN_IMPOSSIBLE)
+    log_descriptor.flushed= sent_to_disk;
+  log_descriptor.flush_in_progress= 0;
+  log_descriptor.flush_no++;
+  DBUG_PRINT("info", ("flush_in_progress is dropped"));
+  pthread_mutex_unlock(&log_descriptor.log_flush_lock);
+  pthread_cond_broadcast(&log_descriptor.log_flush_cond);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief Gives a 2-byte-id to MARIA_SHARE and logs this fact
+
+   If a MARIA_SHARE does not yet have a 2-byte-id (unique over all currently
+   open MARIA_SHAREs), give it one and record this assignment in the log
+   (LOGREC_FILE_ID log record).
+
+   A free slot of id_to_share[] is searched starting from an index derived
+   from the share's key file descriptor; if none is free up to SHARE_ID_MAX
+   the search is repeated from 1 until a slot is obtained. The whole
+   operation runs under share->intern_lock.
+
+   @param  tbl_info        table
+   @param  trn             calling transaction
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+
+   @note Can be called even if share already has an id (then will do nothing)
+
+   @note If writing the log record fails, 1 is returned but share->id and
+   the id_to_share[] slot taken above are left assigned.
+*/
+
+int translog_assign_id_to_share(MARIA_HA *tbl_info, TRN *trn)
+{
+  MARIA_SHARE *share= tbl_info->s;
+  /*
+    If you give an id to a non-BLOCK_RECORD table, you also need to release
+    this id somewhere. Then you can change the assertion.
+  */
+  DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+  /* re-check under mutex to avoid having 2 ids for the same share */
+  pthread_mutex_lock(&share->intern_lock);
+  if (unlikely(share->id == 0))
+  {
+    LSN lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+    uchar log_data[FILEID_STORE_SIZE]; /* NOTE(review): not filled here */
+    /*
+      Inspired by set_short_trid() of trnman.c.
+      The first scan starts at a position derived from the file descriptor.
+    */
+    uint i= share->kfile.file % SHARE_ID_MAX + 1;
+    do
+    {
+      my_atomic_rwlock_wrlock(&LOCK_id_to_share);
+      for ( ; i <= SHARE_ID_MAX ; i++) /* the range is [1..SHARE_ID_MAX] */
+      {
+        void *tmp= NULL;
+        if (id_to_share[i] == NULL &&
+            my_atomic_casptr((void **)&id_to_share[i], &tmp, share))
+        {
+          share->id= (uint16)i;
+          break;
+        }
+      }
+      my_atomic_rwlock_wrunlock(&LOCK_id_to_share);
+      i= 1; /* scan the whole array */
+    } while (share->id == 0);
+    DBUG_PRINT("info", ("id_to_share: 0x%lx -> %u", (ulong)share, share->id));
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    /*
+      open_file_name is an unresolved name (symlinks are not resolved, datadir
+      is not realpath-ed, etc) which is good: the log can be moved to another
+      directory and continue working.
+      The length is increased by one; NOTE(review): presumably to store the
+      terminating zero of the name as well - confirm.
+    */
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=
+      (uchar *)share->open_file_name.str;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length=
+      share->open_file_name.length + 1;
+    /*
+      We can't unlock share->intern_lock before the log entry is written to
+      ensure no one uses the id before it's logged.
+    */
+    if (unlikely(translog_write_record(&lsn, LOGREC_FILE_ID, trn, tbl_info,
+                                       (translog_size_t)
+                                       (sizeof(log_data) +
+                                        log_array[TRANSLOG_INTERNAL_PARTS +
+                                                  1].length),
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, log_data, NULL)))
+    {
+      pthread_mutex_unlock(&share->intern_lock);
+      return 1;
+    }
+  }
+  pthread_mutex_unlock(&share->intern_lock);
+  return 0;
+}
+
+
+/**
+   @brief Recycles a MARIA_SHARE's short id.
+
+   Clears the share's slot in id_to_share[] and resets share->id and
+   share->lsn_of_file_id.
+
+   @param  share           table
+
+   @note Must be called only if share has an id (i.e. id != 0)
+   @note The caller must own share->intern_lock
+*/
+
+void translog_deassign_id_from_share(MARIA_SHARE *share)
+{
+  void **slot;
+  DBUG_PRINT("info", ("id_to_share: 0x%lx id %u -> 0",
+                      (ulong)share, share->id));
+  /*
+    No mutex is needed against writers: we are called only when the last
+    instance of the table is closed or at the end of REPAIR, so no writes
+    can be happening. A Checkpoint may however be reading share->id, which
+    is why the caller has to hold intern_lock.
+  */
+  safe_mutex_assert_owner(&share->intern_lock);
+
+  my_atomic_rwlock_rdlock(&LOCK_id_to_share);
+  slot= (void **)&id_to_share[share->id];
+  my_atomic_storeptr(slot, 0);
+  my_atomic_rwlock_rdunlock(&LOCK_id_to_share);
+
+  share->id= 0;
+  /* not strictly needed, done for safety: */
+  share->lsn_of_file_id= LSN_IMPOSSIBLE;
+}
+
+
+/*
+  Gives the share the short id read during recovery and registers the
+  share in id_to_share[] under that id. Asserts that this happens in
+  single-threaded recovery, that the table is a BLOCK_RECORD one, that the
+  share has no id yet and that the slot is free.
+*/
+void translog_assign_id_to_share_from_recovery(MARIA_SHARE *share,
+                                               uint16 id)
+{
+  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+  DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+  DBUG_ASSERT(share->id == 0);
+  DBUG_ASSERT(id_to_share[id] == NULL);
+  share->id= id;
+  id_to_share[id]= share;
+}
+
+
+/**
+   @brief Tells whether the log file with the given number exists
+
+   The file name is built with translog_filename_by_fileno() and probed
+   with my_stat() (no error is reported if the file is missing).
+
+   @param file_no number of the file to test
+
+   @retval 0 no such file
+   @retval 1 there is file with such number
+*/
+
+my_bool translog_is_file(uint file_no)
+{
+  MY_STAT stat_buff;
+  char path[FN_REFLEN];
+  const char *file_name= translog_filename_by_fileno(file_no, path);
+  return (test(my_stat(file_name, &stat_buff, MYF(0))));
+}
+
+
+/**
+  @brief returns minimum log file number
+
+  If the cached log_descriptor.min_file_number is set and that file still
+  exists it is returned. Otherwise the first existing file is looked up
+  with a binary search between 0 and the file number of the horizon, and
+  the result is stored in the cache.
+
+  @param horizon         the end of the log
+  @param is_protected    true if it is under purge_log protection (the
+                         caller already holds log_descriptor.purger_lock);
+                         otherwise the lock is taken and released here
+
+  @retval minimum file number
+  @retval 0 no files found
+
+  @note NOTE(review): debug builds assert that the result of the search is
+  at least 1, which does not match the documented 0 return - confirm which
+  one is intended.
+*/
+
+static uint32 translog_first_file(TRANSLOG_ADDRESS horizon, int is_protected)
+{
+  uint min_file= 0, max_file;
+  uint32 cached_min_file;
+  DBUG_ENTER("translog_first_file");
+  if (!is_protected)
+    pthread_mutex_lock(&log_descriptor.purger_lock);
+  /*
+    Read the cached value once while purger_lock is held, so that the value
+    which was checked is the value which is returned (it must not be re-read
+    after the lock has been released).
+  */
+  cached_min_file= log_descriptor.min_file_number;
+  if (cached_min_file && translog_is_file(cached_min_file))
+  {
+    DBUG_PRINT("info", ("cached %lu", (ulong) cached_min_file));
+    if (!is_protected)
+      pthread_mutex_unlock(&log_descriptor.purger_lock);
+    DBUG_RETURN(cached_min_file);
+  }
+
+  max_file= LSN_FILE_NO(horizon);
+
+  /*
+    Binary search for the first existing file: 'max_file' is the lowest
+    number treated as existing so far (no probe is made for the initial
+    horizon value), 'min_file' is 0 or a number found missing.
+  */
+  while (min_file != max_file && min_file != (max_file - 1))
+  {
+    uint probe= (min_file + max_file) / 2;
+    DBUG_PRINT("info", ("min_file: %u  test: %u  max_file: %u",
+                        min_file, probe, max_file));
+    if (probe == max_file)
+      probe--;                                  /* defensive, keeps progress */
+    if (translog_is_file(probe))
+      max_file= probe;
+    else
+      min_file= probe;
+  }
+  log_descriptor.min_file_number= max_file;
+  if (!is_protected)
+    pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_PRINT("info", ("first file :%lu", (ulong) max_file));
+  DBUG_ASSERT(max_file >= 1);
+  DBUG_RETURN(max_file);
+}
+
+
+/**
+  @brief returns the closest LSN higher than the given chunk address
+
+  Scans the log chunk by chunk starting from addr until a chunk for which
+  translog_is_LSN_chunk() is true, the page filler (TRANSLOG_FILLER) or the
+  end of the log is met. The scanner is destroyed on every path that
+  initialized it.
+
+  @param addr the chunk address to start from
+  @param horizon the horizon if it is known or LSN_IMPOSSIBLE (then it is
+                 taken from translog_get_horizon())
+
+  @retval LSN_ERROR Error
+  @retval LSN_IMPOSSIBLE no LSNs after the address
+  @retval # LSN of the closest LSN higher than the given chunk address
+
+  @note NOTE(review): the result of translog_scanner_init() is not checked
+  here; if it can fail this should be handled - confirm against its
+  declaration.
+*/
+
+LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon)
+{
+  TRANSLOG_SCANNER_DATA scanner;
+  LSN result;
+  DBUG_ENTER("translog_next_LSN");
+
+  if (horizon == LSN_IMPOSSIBLE)
+    horizon= translog_get_horizon();
+
+  if (addr == horizon)
+    DBUG_RETURN(LSN_IMPOSSIBLE); /* nothing after addr; scanner not inited */
+
+  translog_scanner_init(addr, 0, &scanner, 1);
+  /*
+    addr may point not to a chunk beginning but to a page end, i.e. to the
+    beginning of the next page.
+  */
+  if (addr % TRANSLOG_PAGE_SIZE == 0)
+  {
+    /*
+      We are emulating the page end which caused such horizon value to
+      trigger translog_scanner_eop().
+
+      We can't just increase addr on page header overhead because it
+      can be file end so we allow translog_get_next_chunk() to skip
+      to the next page in correct way
+    */
+    scanner.page_addr-= TRANSLOG_PAGE_SIZE;
+    scanner.page_offset= TRANSLOG_PAGE_SIZE;
+#ifndef DBUG_OFF
+    scanner.page= NULL; /* prevent using incorrect page content */
+#endif
+  }
+  /* addr can point not to a chunk beginning but to a page end */
+  if (translog_scanner_eop(&scanner))
+  {
+    if (translog_get_next_chunk(&scanner))
+    {
+      result= LSN_ERROR;
+      goto out;
+    }
+    if (scanner.page == END_OF_LOG)
+    {
+      result= LSN_IMPOSSIBLE;
+      goto out;
+    }
+  }
+
+  while (!translog_is_LSN_chunk(scanner.page[scanner.page_offset]) &&
+         scanner.page[scanner.page_offset] != TRANSLOG_FILLER)
+  {
+    if (translog_get_next_chunk(&scanner))
+    {
+      result= LSN_ERROR;
+      goto out;
+    }
+    if (scanner.page == END_OF_LOG)
+    {
+      result= LSN_IMPOSSIBLE;
+      goto out;
+    }
+  }
+
+  if (scanner.page[scanner.page_offset] == TRANSLOG_FILLER)
+    result= LSN_IMPOSSIBLE; /* reached page filler */
+  else
+    result= scanner.page_addr + scanner.page_offset;
+out:
+  translog_destroy_scanner(&scanner);
+  DBUG_RETURN(result);
+}
+
+
+/**
+   @brief returns the LSN of the first record starting in this log
+
+   Finds the first log file, reads its first page, takes the offset of the
+   first chunk on that page and lets translog_next_LSN() find the first
+   LSN from there.
+
+   @retval LSN_ERROR Error
+   @retval LSN_IMPOSSIBLE no log or the log is empty
+   @retval # LSN of the first record
+*/
+
+LSN translog_first_lsn_in_log()
+{
+  TRANSLOG_ADDRESS addr, horizon= translog_get_horizon();
+  TRANSLOG_VALIDATOR_DATA data;
+  uint first_file;
+  uint16 chunk_offset;
+  uchar *first_page;
+  DBUG_ENTER("translog_first_lsn_in_log");
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(horizon)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  first_file= translog_first_file(horizon, 0);
+  if (first_file == 0)
+  {
+    /* log has no records yet */
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+  }
+
+  /* the first page of the file */
+  addr= MAKE_LSN(first_file, TRANSLOG_PAGE_SIZE);
+  data.addr= &addr;
+  {
+    /* The page buffer is needed only while the chunk offset is read */
+    TRANSLOG_PAGE_SIZE_BUFF page_buff;
+    first_page= translog_get_page(&data, page_buff.buffer, NULL);
+    if (first_page == NULL)
+      DBUG_RETURN(LSN_ERROR);
+    chunk_offset= translog_get_first_chunk_offset(first_page);
+    if (chunk_offset == 0)
+      DBUG_RETURN(LSN_ERROR);
+  }
+  addr+= chunk_offset;
+
+  DBUG_RETURN(translog_next_LSN(addr, horizon));
+}
+
+
+/**
+   @brief Returns theoretical first LSN if first log is present
+
+   The result is the address right after the page header of the first page
+   of log file 1. If the horizon is still at the beginning of that page the
+   header size comes from log_descriptor.page_overhead, otherwise the page
+   is read and the size is looked up by the flags stored in it.
+
+   @retval LSN_ERROR Error
+   @retval LSN_IMPOSSIBLE no log
+   @retval # LSN of the first record
+*/
+
+LSN translog_first_theoretical_lsn()
+{
+  TRANSLOG_ADDRESS horizon= translog_get_horizon();
+  /* the first page of the file */
+  TRANSLOG_ADDRESS addr= MAKE_LSN(1, TRANSLOG_PAGE_SIZE);
+  TRANSLOG_PAGE_SIZE_BUFF page_buff;
+  uchar *first_page;
+  TRANSLOG_VALIDATOR_DATA data;
+  DBUG_ENTER("translog_first_theoretical_lsn");
+  DBUG_PRINT("info", ("Horizon: (%lu,0x%lx)", LSN_IN_PARTS(horizon)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (!translog_is_file(1))
+    DBUG_RETURN(LSN_IMPOSSIBLE);
+
+  if (horizon == addr)
+  {
+    /* log has no records yet */
+    DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+                         log_descriptor.page_overhead));
+  }
+
+  data.addr= &addr;
+  first_page= translog_get_page(&data, page_buff.buffer, NULL);
+  if (first_page == NULL)
+    DBUG_RETURN(LSN_ERROR);
+
+  DBUG_RETURN(MAKE_LSN(1, TRANSLOG_PAGE_SIZE +
+                       page_overhead[first_page[TRANSLOG_PAGE_FLAGS]]));
+}
+
+
+/**
+  @brief Checks the given low water mark and purges files if needed
+
+  @param low the last (minimum) address which is still needed
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool translog_purge(TRANSLOG_ADDRESS low)
+{
+  uint32 last_need_file= LSN_FILE_NO(low);
+  uint32 min_unsync;
+  int soft;
+  TRANSLOG_ADDRESS horizon= translog_get_horizon();
+  int rc= 0;
+  DBUG_ENTER("translog_purge");
+  DBUG_PRINT("enter", ("low: (%lu,0x%lx)", LSN_IN_PARTS(low)));
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  soft= soft_sync;
+  min_unsync= soft_sync_min;
+  DBUG_PRINT("info", ("min_unsync: %lu", (ulong) min_unsync));
+  if (soft && min_unsync < last_need_file)
+  {
+    last_need_file= min_unsync;
+    DBUG_PRINT("info", ("last_need_file set to %lu", (ulong)last_need_file));
+  }
+
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+  DBUG_PRINT("info", ("last_lsn_checked file: %lu:",
+                      (ulong) log_descriptor.last_lsn_checked));
+  if (LSN_FILE_NO(log_descriptor.last_lsn_checked) < last_need_file)
+  {
+    uint32 i;
+    uint32 min_file= translog_first_file(horizon, 1);
+    DBUG_ASSERT(min_file != 0); /* log is already started */
+    DBUG_PRINT("info", ("min_file:  %lu:",(ulong) min_file));
+    for(i= min_file; i < last_need_file && rc == 0; i++)
+    {
+      LSN lsn= translog_get_file_max_lsn_stored(i);
+      if (lsn == LSN_IMPOSSIBLE)
+        break;   /* files are still in writing */
+      if (lsn == LSN_ERROR)
+      {
+        rc= 1;
+        break;
+      }
+      if (cmp_translog_addr(lsn, low) >= 0)
+        break;
+
+      DBUG_PRINT("info", ("purge file %lu", (ulong) i));
+
+      /* remove file descriptor from the cache */
+      /*
+        log_descriptor.min_file can be changed only here during execution
+        and the function is serialized, so we can access it without problems
+      */
+      if (i >= log_descriptor.min_file)
+      {
+        TRANSLOG_FILE *file;
+        rw_wrlock(&log_descriptor.open_files_lock);
+        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                    log_descriptor.open_files.elements);
+        DBUG_ASSERT(log_descriptor.min_file == i);
+        file= *((TRANSLOG_FILE **)pop_dynamic(&log_descriptor.open_files));
+        DBUG_PRINT("info", ("Files : %d", log_descriptor.open_files.elements));
+        DBUG_ASSERT(i == file->number);
+        log_descriptor.min_file++;
+        DBUG_ASSERT(log_descriptor.max_file - log_descriptor.min_file + 1 ==
+                    log_descriptor.open_files.elements);
+        rw_unlock(&log_descriptor.open_files_lock);
+        translog_close_log_file(file);
+      }
+      if (log_purge_type == TRANSLOG_PURGE_IMMIDIATE)
+      {
+        char path[FN_REFLEN], *file_name;
+        file_name= translog_filename_by_fileno(i, path);
+        rc= test(my_delete(file_name, MYF(MY_WME)));
+      }
+    }
+    if (unlikely(rc == 1))
+      log_descriptor.min_need_file= 0; /* impossible value */
+    else
+      log_descriptor.min_need_file= i;
+  }
+
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_RETURN(rc);
+}
+
+
+/**
+  @brief Deletes the log files below the stored minimum needed file when
+    the purge type is "on demand"
+
+  @note Does real work only if log_purge_type is TRANSLOG_PURGE_ONDEMAND,
+    the log is not read only and log_descriptor.min_need_file is not 0
+    (0 means that no usable value is stored there)
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool translog_purge_at_flush()
+{
+  uint32 no, first;
+  int error= 0;
+  DBUG_ENTER("translog_purge_at_flush");
+  DBUG_ASSERT(translog_status == TRANSLOG_OK ||
+              translog_status == TRANSLOG_READONLY);
+
+  if (unlikely(translog_status == TRANSLOG_READONLY))
+  {
+    DBUG_PRINT("info", ("The log is read only => exit"));
+    DBUG_RETURN(0);
+  }
+
+  if (log_purge_type != TRANSLOG_PURGE_ONDEMAND)
+  {
+    DBUG_PRINT("info", ("It is not \"at_flush\" => exit"));
+    DBUG_RETURN(0);
+  }
+
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+
+  if (unlikely(log_descriptor.min_need_file == 0))
+  {
+    DBUG_PRINT("info", ("No info about min need file => exit"));
+  }
+  else
+  {
+    first= translog_first_file(translog_get_horizon(), 1);
+    DBUG_ASSERT(first != 0); /* log is already started */
+    for (no= first; !error && no < log_descriptor.min_need_file; no++)
+    {
+      char path[FN_REFLEN];
+      DBUG_PRINT("info", ("purge file %lu\n", (ulong) no));
+      error= test(my_delete(translog_filename_by_fileno(no, path),
+                            MYF(MY_WME)));
+    }
+  }
+
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  DBUG_RETURN(error);
+}
+
+
+/**
+  @brief Gets min file number
+
+  @param horizon         the end of the log
+
+  @return result of translog_first_file(horizon, 0): min file number or 0
+*/
+
+uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon)
+{
+  uint32 first_file= translog_first_file(horizon, 0);
+  return first_file;
+}
+
+
+/**
+  @brief Reads log_descriptor.min_need_file under the purger lock
+
+  @retval minimum file number
+  @retval 0 unknown
+*/
+
+uint32 translog_get_first_needed_file()
+{
+  uint32 needed;
+  pthread_mutex_lock(&log_descriptor.purger_lock);
+  needed= log_descriptor.min_need_file;
+  pthread_mutex_unlock(&log_descriptor.purger_lock);
+  return needed;
+}
+
+
+/**
+  @brief Reads the transaction log file size limit under the log lock
+
+  @return current value of log_descriptor.log_file_max_size
+*/
+
+uint32 translog_get_file_size()
+{
+  uint32 max_size;
+  translog_lock();
+  max_size= log_descriptor.log_file_max_size;
+  translog_unlock();
+  return max_size;
+}
+
+
+/**
+  @brief Sets transaction log file size (the function returns nothing)
+
+  @param size  new max file size: page-size multiple, >= TRANSLOG_MIN_FILE_SIZE
+*/
+
+void translog_set_file_size(uint32 size)
+{
+  struct st_translog_buffer *old_buffer= NULL;
+  DBUG_ENTER("translog_set_file_size");
+  translog_lock();
+  DBUG_PRINT("enter", ("Size: %lu", (ulong) size));
+  DBUG_ASSERT(size % TRANSLOG_PAGE_SIZE == 0 &&
+              size >= TRANSLOG_MIN_FILE_SIZE);
+  log_descriptor.log_file_max_size= size;
+  /* if the current file already reached the new limit, finish it */
+  if (LSN_OFFSET(log_descriptor.horizon) >=  log_descriptor.log_file_max_size)
+  {
+    old_buffer= log_descriptor.bc.buffer; /* remembered to flush it below */
+    translog_buffer_next(&log_descriptor.horizon, &log_descriptor.bc, 1);
+    translog_buffer_unlock(old_buffer);
+  }
+  translog_unlock();
+  if (old_buffer)
+  {
+    translog_buffer_lock(old_buffer); /* flush is done after translog_unlock */
+    translog_buffer_flush(old_buffer);
+    translog_buffer_unlock(old_buffer);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   Writes a LOGREC_DEBUG_INFO record if EXTRA_DEBUG is defined, else returns 0
+*/
+
+my_bool translog_log_debug_info(TRN *trn __attribute__((unused)),
+                                enum translog_debug_info_type type
+                                __attribute__((unused)),
+                                uchar *info __attribute__((unused)),
+                                size_t length __attribute__((unused)))
+{
+#ifndef EXTRA_DEBUG
+  return 0;
+#else
+  LSN lsn;
+  uchar type_byte= (uchar) type;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 2];
+
+  /*
+    Without an active transaction the record is written on behalf of
+    the dummy transaction object
+  */
+  if (!trn)
+    trn= &dummy_transaction_object;
+
+  /* first part: one byte with the debug info type */
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= &type_byte;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 1;
+  /* second part: the data passed by the caller */
+  parts[TRANSLOG_INTERNAL_PARTS + 1].str= info;
+  parts[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  return translog_write_record(&lsn, LOGREC_DEBUG_INFO,
+                               trn, NULL,
+                               (translog_size_t) (1+ length),
+                               sizeof(parts)/sizeof(parts[0]),
+                               parts, NULL, NULL);
+#endif
+}
+
+
+
+/**
+  Sets soft sync mode (stores the flag in soft_sync)
+
+  @param mode            TRUE to switch soft sync on, FALSE to switch it off
+*/
+
+void translog_soft_sync(my_bool mode)
+{
+  soft_sync= mode;
+}
+
+
+/**
+  Sets hard group commit (stores the flag in hard_group_commit)
+
+  @param mode            TRUE to switch hard group commit on, FALSE for off
+*/
+
+void translog_hard_group_commit(my_bool mode)
+{
+  hard_group_commit= mode;
+}
+
+
+/**
+  @brief forced log sync (used when we are switching modes)
+*/
+
+void translog_sync()
+{
+  uint32 max= get_current_logfile()->number; /* number of the current file */
+  uint32 min;
+  DBUG_ENTER("translog_sync"); /* tag now matches the function name */
+
+  min= soft_sync_min;
+  if (!min)
+    min= max; /* soft_sync_min is 0: sync only the current file */
+
+  translog_sync_files(min, max, sync_log_dir >= TRANSLOG_SYNC_DIR_ALWAYS);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Sets the rate for group commit (stored in group_commit_wait)
+
+  @param interval            interval to set.
+
+  @note This function with its own variable is used because the service
+  thread has to be restarted with the new value, which can't be done inside
+  the variable changing routine (update_maria_group_commit_interval)
+*/
+
+void translog_set_group_commit_interval(uint32 interval)
+{
+  DBUG_ENTER("translog_set_group_commit_interval");
+  group_commit_wait= interval;
+  DBUG_PRINT("info", ("wait: %llu",
+                      (ulonglong)group_commit_wait));
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Syncing service thread: syncs requested log files once per interval
+*/
+
+static pthread_handler_t
+ma_soft_sync_background( void *arg __attribute__((unused)))
+{
+
+  my_thread_init();
+  {
+    DBUG_ENTER("ma_soft_sync_background");
+    for(;;)
+    {
+      ulonglong prev_loop= my_micro_time(); /* start time of this round */
+      ulonglong time, sleep;
+      uint32 min, max, sync_request;
+      min= soft_sync_min; /* copy the pending file range and request flag */
+      max= soft_sync_max;
+      sync_request= soft_need_sync;
+      soft_sync_min= max; /* the next range starts at the current max */
+      soft_need_sync= 0; /* the request is taken */
+
+      sleep= group_commit_wait;
+      if (sync_request)
+        translog_sync_files(min, max, FALSE);
+      time= my_micro_time() - prev_loop; /* time spent in this round */
+      if (time > sleep)
+        sleep= 0;
+      else
+        sleep-= time; /* wait only for the rest of the interval */
+      if (my_service_thread_sleep(&soft_sync_control, sleep))
+        break; /* non-zero result ends the thread loop */
+    }
+    my_service_thread_signal_end(&soft_sync_control);
+    my_thread_end();
+    DBUG_RETURN(0);
+  }
+}
+
+
+/**
+  @brief Prepares the soft sync variables and starts the syncing thread
+*/
+
+int translog_soft_sync_start(void)
+{
+  pthread_t th;
+  int res;
+  uint32 min= soft_sync_min;
+  uint32 max= soft_sync_max;
+  DBUG_ENTER("translog_soft_sync_start");
+
+  /* file bounds which are still 0 are set from the current log file */
+  if (max == 0)
+    soft_sync_max= max= get_current_logfile()->number;
+  if (min == 0)
+    soft_sync_min= max;
+  soft_need_sync= 1;
+
+  res= ma_service_thread_control_init(&soft_sync_control);
+  if (!res &&
+      !(res= pthread_create(&th, NULL, ma_soft_sync_background, NULL)))
+    soft_sync_control.status= THREAD_RUNNING;
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Stops syncing thread (does nothing if its control was not inited)
+*/
+
+void translog_soft_sync_end(void)
+{
+  DBUG_ENTER("translog_soft_sync_end");
+  if (!soft_sync_control.inited)
+    DBUG_VOID_RETURN;
+  /* the control structure was initialized: shut the service down */
+  ma_service_thread_control_end(&soft_sync_control);
+  DBUG_VOID_RETURN;
+}
+
+
+#ifdef MARIA_DUMP_LOG
+#include <my_getopt.h>
+extern void translog_example_table_init();
+static const char *load_default_groups[]= { "aria_dump_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:i:O,\\aria_dump_log.trace";
+#else
+const char *default_dbug_option= "d:t:i:o,/tmp/aria_dump_log.trace";
+#endif
+#endif
+static ulonglong opt_offset;       /* --offset: where reading starts */
+static ulong opt_pages;            /* --pages: number of pages to read */
+static const char *opt_file= NULL; /* --file: path of the file to read */
+static File handler= -1;           /* descriptor of opt_file, set in main() */
+static my_bool opt_unit= 0;        /* --unit-test: use unit test table */
+static struct my_option my_long_options[] = /* options of the dump tool */
+{
+#ifdef IMPLTMENTED /* NOTE(review): misspelled guard; no opt_body in view */
+  {"body", 'b',
+   "Print chunk body dump",
+   (uchar **) &opt_body, (uchar **) &opt_body, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"file", 'f', "Path to file which will be read",
+    (uchar**) &opt_file, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "offset", 'o', "Start reading log from this offset",
+    (uchar**) &opt_offset, (uchar**) &opt_offset,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  { "pages", 'n', "Number of pages to read",
+    (uchar**) &opt_pages, (uchar**) &opt_pages, 0,
+    GET_ULONG, REQUIRED_ARG, (long) ~(ulong) 0,
+    (long) 1, (long) ~(ulong) 0, (long) 0,
+    (long) 1, 0},
+  {"unit-test", 'U',
+   "Use unit test record table (for logs created by unittests)",
+   (uchar **) &opt_unit, (uchar **) &opt_unit, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} /* end marker */
+};
+
+
+static void print_version(void) /* prints one line with the tool version */
+{
+  VOID(printf("%s Ver 1.0 for %s on %s\n", /* name, system, machine */
+              my_progname_short, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void) /* prints version, license note and option help */
+{
+  print_version();
+  puts("Copyright (C) 2008 MySQL AB");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Dump content of aria log pages.");
+  VOID(printf("\nUsage: %s -f file OPTIONS\n", my_progname_short));
+  my_print_help(my_long_options); /* help text of every option */
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options); /* current option values */
+}
+
+
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+               const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  /* help and version requests print their text and end the program */
+  if (optid == '?' || optid == 'V')
+  {
+    if (optid == '?')
+      usage();
+    else
+      print_version();
+    exit(0);
+  }
+#ifndef DBUG_OFF
+  if (optid == '#')
+    DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+#endif
+  return 0;
+}
+
+
+static void get_options(int *argc,char ***argv)
+{
+  int ho_error= handle_options(argc, argv, my_long_options, get_one_option);
+
+  if (ho_error != 0)
+    exit(ho_error);
+  /* the file to read must be given: otherwise show the help and fail */
+  if (!opt_file)
+  {
+    usage();
+    exit(1);
+  }
+}
+
+
+/**
+  @brief Dump information about file header page (buff holds the page).
+*/
+
+static void dump_header_page(uchar *buff)
+{
+  LOGHANDLER_FILE_INFO desc;
+  char strbuff[21]; /* receives the timestamp text produced by llstr() */
+  LINT_INIT_STRUCT(desc);
+  translog_interpret_file_header(&desc, buff); /* presumably fills desc */
+  printf("  This can be header page:\n"
+         "    Timestamp: %s\n"
+         "    Aria log version: %lu\n"
+         "    Server version: %lu\n"
+         "    Server id %lu\n"
+         "    Page size %lu\n",
+         llstr(desc.timestamp, strbuff),
+         desc.maria_version,
+         desc.mysql_version,
+         desc.server_id,
+         desc.page_size);
+  if (desc.page_size != TRANSLOG_PAGE_SIZE) /* differs from this build */
+    printf("      WARNING: page size is not equal compiled in one %lu!!!\n",
+           (ulong) TRANSLOG_PAGE_SIZE);
+  printf("    File number %lu\n"
+         "    Max lsn: (%lu,0x%lx)\n",
+         desc.file_number,
+         LSN_IN_PARTS(desc.max_lsn));
+}
+
+static const char *record_class_string[]= /* indexed by descriptor rclass */
+{
+  "LOGRECTYPE_NOT_ALLOWED",
+  "LOGRECTYPE_VARIABLE_LENGTH",
+  "LOGRECTYPE_PSEUDOFIXEDLENGTH",
+  "LOGRECTYPE_FIXEDLENGTH"
+};
+
+
+/**
+  @brief dump information about transaction log chunk
+
+  @param buffer          reference to the whole page
+  @param ptr             pointer to the chunk
+
+  @retval # reference to the next chunk
+  @retval NULL can't interpret data (a warning has been printed)
+*/
+
+static uchar *dump_chunk(uchar *buffer, uchar *ptr)
+{
+  uint length;
+  if (*ptr == TRANSLOG_FILLER)
+  {
+    printf("  Filler till the page end\n");
+    for (; ptr < buffer + TRANSLOG_PAGE_SIZE; ptr++)
+    {
+      if (*ptr != TRANSLOG_FILLER)
+      {
+        printf("    WARNING: non filler character met before page end "
+               "(page + 0x%04x: 0x%02x) (stop interpretation)!!!\n",
+               (uint) (ptr - buffer), (uint) ptr[0]);
+        return NULL;
+      }
+    }
+    return ptr; /* points to the page end here */
+  }
+  if (*ptr == 0 || *ptr == 0xFF)
+  {
+    printf("    WARNING: chunk can't start from 0x0 "
+           "(stop interpretation)!!!\n");
+    return NULL;
+  }
+  switch (ptr[0] & TRANSLOG_CHUNK_TYPE) {
+  case TRANSLOG_CHUNK_LSN:
+    printf("    LSN chunk type 0 (variable length)\n");
+    if (likely((ptr[0] & TRANSLOG_REC_TYPE) != TRANSLOG_CHUNK_0_CONT))
+    {
+      printf("      Record type %u: %s  record class %s compressed LSNs: %u\n",
+             ptr[0] & TRANSLOG_REC_TYPE,
+             (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ?
+              log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name :
+              "NULL"),
+             record_class_string[log_record_type_descriptor[ptr[0] &
+                                                            TRANSLOG_REC_TYPE].
+                                                            rclass],
+             log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].
+             compressed_LSN);
+      if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+          LOGRECTYPE_VARIABLE_LENGTH)
+      {
+        printf("        WARNING: this record class here can't be used "
+               "(stop interpretation)!!!\n");
+        return NULL; /* really stop, as the message says */
+      }
+    }
+    else
+      printf("      Continuation of previous chunk 0 header \n");
+    printf("      Short transaction id: %u\n", (uint) uint2korr(ptr + 1));
+    {
+      uchar *hdr_ptr= ptr + 1 + 2; /* chunk type and short trid */
+      uint16 chunk_len;
+      printf ("      Record length: %lu\n",
+              (ulong) translog_variable_record_1group_decode_len(&hdr_ptr));
+      chunk_len= uint2korr(hdr_ptr);
+      if (chunk_len == 0)
+        printf ("      It is 1 group record (chunk length == 0)\n");
+      else
+      {
+        uint16 groups, i;
+        printf ("      Chunk length %u\n", (uint) chunk_len);
+        groups= uint2korr(hdr_ptr + 2);
+        hdr_ptr+= 4;
+        printf ("      Number of groups left to the end %u:\n", (uint) groups);
+        /* an entry is LSN_STORE_SIZE + 1 bytes: all must be inside the page */
+        for(i= 0;
+            i < groups &&
+            hdr_ptr + LSN_STORE_SIZE < buffer + TRANSLOG_PAGE_SIZE;
+            i++, hdr_ptr+= LSN_STORE_SIZE + 1)
+        {
+          TRANSLOG_ADDRESS gpr_addr= lsn_korr(hdr_ptr);
+          printf ("        Group +#%u: (%lu,0x%lx)  pages: %u\n",
+                  (uint) i, LSN_IN_PARTS(gpr_addr),
+                  (uint) hdr_ptr[LSN_STORE_SIZE]);
+        }
+      }
+    }
+    break;
+  case TRANSLOG_CHUNK_FIXED:
+    printf("    LSN chunk type 1 (fixed size)\n");
+    printf("      Record type %u: %s  record class %s compressed LSNs: %u\n",
+           ptr[0] & TRANSLOG_REC_TYPE,
+           (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name ?
+            log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].name :
+            "NULL"),
+           record_class_string[log_record_type_descriptor[ptr[0] &
+                                                          TRANSLOG_REC_TYPE].
+                                                          rclass],
+           log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].
+           compressed_LSN);
+    if (log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+        LOGRECTYPE_PSEUDOFIXEDLENGTH &&
+        log_record_type_descriptor[ptr[0] & TRANSLOG_REC_TYPE].rclass !=
+        LOGRECTYPE_FIXEDLENGTH)
+    {
+      printf("        WARNING: this record class here can't be used "
+             "(stop interpretation)!!!\n");
+      return NULL; /* really stop, as the message says */
+    }
+    printf("      Short transaction id: %u\n", (uint) uint2korr(ptr + 1));
+    break;
+  case TRANSLOG_CHUNK_NOHDR:
+    printf("    No header chunk type 2(till the end of the page)\n");
+    if (ptr[0] & TRANSLOG_REC_TYPE)
+    {
+      printf("      WARNING: chunk header content record type: 0x%02x "
+             "(stop interpretation)!!!\n",
+             (uint) ptr[0]);
+      return NULL;
+    }
+    break;
+  case TRANSLOG_CHUNK_LNGTH:
+    printf("    Chunk with length type 3\n");
+    if (ptr[0] & TRANSLOG_REC_TYPE)
+    {
+      printf("      WARNING: chunk header content record type: 0x%02x "
+             "(stop interpretation)!!!\n",
+             (uint) ptr[0]);
+      return NULL;
+    }
+    break;
+  }
+  {
+    intptr offset= ptr - buffer; /* position of the chunk inside the page */
+    DBUG_ASSERT(offset >= 0 && offset <= UINT_MAX16);
+    length= translog_get_total_chunk_length(buffer, (uint16)offset);
+  }
+  printf("      Length %u\n", length);
+  ptr+= length; /* the next chunk starts right after this one */
+  return ptr;
+}
+
+
+/**
+  @brief Dumps header, flags, CRC/sector checks and chunks of a data page.
+*/
+
+static void dump_datapage(uchar *buffer)
+{
+  uchar *ptr;
+  ulong offset;
+  uint32 page, file;
+  uint header_len;
+  printf("  Page: %lu  File number: %lu\n",
+         (ulong) (page= uint3korr(buffer)),
+         (ulong) (file= uint3korr(buffer + 3)));
+  if (page == 0)
+    printf("    WARNING: page == 0!!!\n");
+  if (file == 0)
+    printf("    WARNING: file == 0!!!\n");
+  offset= (ulong) page * TRANSLOG_PAGE_SIZE; /* widen before multiplying */
+  printf("  Flags (0x%x):\n", (uint) buffer[TRANSLOG_PAGE_FLAGS]);
+  if (buffer[TRANSLOG_PAGE_FLAGS])
+  {
+    if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC)
+      printf("    Page CRC\n");
+    if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+      printf("    Sector protection\n");
+    if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_RECORD_CRC)
+      printf("    Record CRC (WARNING: not yet implemented!!!)\n");
+    if (buffer[TRANSLOG_PAGE_FLAGS] & ~(TRANSLOG_PAGE_CRC |
+                                        TRANSLOG_SECTOR_PROTECTION |
+                                        TRANSLOG_RECORD_CRC))
+    {
+      printf("    WARNING: unknown flags (stop interpretation)!!!\n");
+      return;
+    }
+  }
+  else
+    printf("    No flags\n");
+  printf("  Page header length: %u\n",
+         (header_len= page_overhead[buffer[TRANSLOG_PAGE_FLAGS]]));
+  if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_PAGE_CRC) /* page CRC check */
+  {
+    uint32 crc= uint4korr(buffer + TRANSLOG_PAGE_FLAGS + 1);
+    uint32 ccrc;
+    printf ("  Page CRC 0x%04lx\n", (ulong) crc);
+    ccrc= translog_crc(buffer + header_len, TRANSLOG_PAGE_SIZE - header_len);
+    if (crc != ccrc)
+      printf("    WARNING: calculated CRC: 0x%04lx!!!\n", (ulong) ccrc);
+  }
+  if (buffer[TRANSLOG_PAGE_FLAGS] & TRANSLOG_SECTOR_PROTECTION)
+  {
+    TRANSLOG_FILE tfile;
+    {
+      uchar *table= buffer + header_len -
+        TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE;
+      uint i;
+      printf("    Sector protection current value: 0x%02x\n", (uint) table[0]);
+      for (i= 1; i < TRANSLOG_PAGE_SIZE / DISK_DRIVE_SECTOR_SIZE; i++)
+      {
+         printf("    Sector protection in sector: 0x%02x  saved value 0x%02x\n",
+                (uint)buffer[i * DISK_DRIVE_SECTOR_SIZE],
+                (uint)table[i]);
+      }
+    }
+    tfile.number= file; /* temporary descriptor just for the check below */
+    tfile.handler.file= handler;
+    pagecache_file_init(tfile.handler, NULL, NULL, NULL, NULL, NULL);
+    tfile.was_recovered= 0;
+    tfile.is_sync= 1;
+    if (translog_check_sector_protection(buffer, &tfile))
+      printf("    WARNING: sector protection found problems!!!\n");
+  }
+  ptr= buffer + header_len; /* chunks start right after the page header */
+  while (ptr && ptr < buffer + TRANSLOG_PAGE_SIZE)
+  {
+    printf("  Chunk (%lu,0x%lx):\n",
+           (ulong)file, (ulong) offset + (ptr - buffer));
+    ptr= dump_chunk(buffer, ptr); /* NULL stops the loop */
+  }
+}
+
+
+/**
+  @brief Dumps one page: as a file header if the magic matches, then as data.
+*/
+
+static void dump_page(uchar *buffer)
+{
+  printf("Page by offset %llu (0x%llx)\n", opt_offset, opt_offset);
+  if (memcmp(maria_trans_file_magic, buffer, /* binary magic: not a string */
+             sizeof(maria_trans_file_magic)) == 0)
+  {
+    dump_header_page(buffer);
+  }
+  dump_datapage(buffer); /* done for a possible header page too */
+}
+
+
+/**
+  @brief maria_dump_log main function: dumps opt_pages pages of opt_file.
+*/
+
+int main(int argc, char **argv)
+{
+  char **default_argv;
+  uchar buffer[TRANSLOG_PAGE_SIZE];
+  MY_INIT(argv[0]);
+
+  load_defaults("my", load_default_groups, &argc, &argv);
+  default_argv= argv;
+  get_options(&argc, &argv);
+
+  if (opt_unit)
+    translog_example_table_init();
+  else
+    translog_table_init();
+  translog_fill_overhead_table();
+
+  maria_data_root= (char *)".";
+
+  if ((handler= my_open(opt_file, O_RDONLY, MYF(MY_WME))) < 0)
+  {
+    fprintf(stderr, "Can't open file: '%s'  errno: %d\n",
+            opt_file, my_errno);
+    goto err;
+  }
+  if (my_seek(handler, opt_offset, SEEK_SET, MYF(MY_WME)) !=
+      opt_offset)
+  {
+     fprintf(stderr, "Can't set position %llu  file: '%s'  errno: %d\n",
+             opt_offset, opt_file, my_errno);
+     goto err;
+  }
+  for (; opt_pages; opt_offset+= TRANSLOG_PAGE_SIZE, opt_pages--)
+  {
+    if (my_pread(handler, buffer, TRANSLOG_PAGE_SIZE, opt_offset,
+                 MYF(MY_NABP)))
+    {
+      if (my_errno == HA_ERR_FILE_TOO_SHORT)
+        goto end; /* no full page left to read: normal end of the dump */
+      fprintf(stderr, "Can't read page at position %llu  file: '%s'  "
+              "errno: %d\n", opt_offset, opt_file, my_errno);
+      goto err;
+    }
+    dump_page(buffer);
+  }
+
+end:
+  my_close(handler, MYF(0));
+  free_defaults(default_argv);
+  exit(0);
+  return 0;				/* No compiler warning */
+
+err:
+  /* handler is negative if my_open() failed: nothing to close then */
+  if (handler >= 0)
+    my_close(handler, MYF(0));
+  fprintf(stderr, "%s: FAILED\n", my_progname_short);
+  free_defaults(default_argv);
+  exit(1);
+}
+#endif
diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h
new file mode 100644
index 00000000000..698a8ead7b6
--- /dev/null
+++ b/storage/maria/ma_loghandler.h
@@ -0,0 +1,506 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _ma_loghandler_h
+#define _ma_loghandler_h
+
+#define MB (1024UL*1024)
+
+/* transaction log default cache size  (TODO: make it global variable) */
+#define TRANSLOG_PAGECACHE_SIZE (2*MB)
+/* transaction log default file size */
+#define TRANSLOG_FILE_SIZE (1024U*MB)
+/* minimum possible transaction log size */
+#define TRANSLOG_MIN_FILE_SIZE (8*MB)
+/* transaction log default flags (TODO: make it global variable) */
+#define TRANSLOG_DEFAULT_FLAGS 0
+
+/*
+  Transaction log flags.
+
+  We allow all kinds of protection to be switched on together for people who
+  are really unsure about their hardware/OS.
+*/
+#define TRANSLOG_PAGE_CRC              1
+#define TRANSLOG_SECTOR_PROTECTION     (1<<1)
+#define TRANSLOG_RECORD_CRC            (1<<2)
+#define TRANSLOG_FLAGS_NUM ((TRANSLOG_PAGE_CRC | TRANSLOG_SECTOR_PROTECTION | \
+                           TRANSLOG_RECORD_CRC) + 1)
+
+#define RECHEADER_READ_ERROR -1
+#define RECHEADER_READ_EOF   -2
+
+/*
+  Page size in transaction log
+  It should be Power of 2 and multiple of DISK_DRIVE_SECTOR_SIZE
+  (DISK_DRIVE_SECTOR_SIZE * 2^N)
+*/
+#define TRANSLOG_PAGE_SIZE (8U*1024)
+
+#include "ma_loghandler_lsn.h"
+#include "trnman_public.h"
+
+/* short transaction ID type */
+typedef uint16 SHORT_TRANSACTION_ID;
+
+struct st_maria_handler;
+
+/* Changing one of the "SIZE" below will break backward-compatibility! */
+/* Stored sizes of the page and count parts of a row extent */
+#define ROW_EXTENT_PAGE_SIZE	5
+#define ROW_EXTENT_COUNT_SIZE   2
+/* Size of file id in logs */
+#define FILEID_STORE_SIZE 2
+/* Size of page reference in log */
+#define PAGE_STORE_SIZE ROW_EXTENT_PAGE_SIZE
+/* Size of page ranges in log */
+#define PAGERANGE_STORE_SIZE ROW_EXTENT_COUNT_SIZE
+#define DIRPOS_STORE_SIZE 1
+#define CLR_TYPE_STORE_SIZE 1
+/* If table has live checksum we store its changes in UNDOs */
+#define HA_CHECKSUM_STORE_SIZE 4
+#define KEY_NR_STORE_SIZE 1
+#define PAGE_LENGTH_STORE_SIZE 2
+
+/* Store methods to match the above sizes */
+#define fileid_store(T,A) int2store(T,A)
+#define page_store(T,A)   int5store(T,((ulonglong)(A)))
+#define dirpos_store(T,A) ((*(uchar*) (T)) = A)
+#define pagerange_store(T,A) int2store(T,A)
+#define clr_type_store(T,A) ((*(uchar*) (T)) = A)
+#define key_nr_store(T, A) ((*(uchar*) (T)) = A)
+#define ha_checksum_store(T,A) int4store(T,A)
+#define fileid_korr(P) uint2korr(P)
+#define page_korr(P)   uint5korr(P)
+#define dirpos_korr(P) (*(const uchar *) (P))
+#define pagerange_korr(P) uint2korr(P)
+#define clr_type_korr(P) (*(const uchar *) (P))
+#define key_nr_korr(P) (*(const uchar *) (P))
+#define ha_checksum_korr(P) uint4korr(P)
+
+/*
+  Length of disk drive sector size (we assume that writing it
+  to disk is an atomic operation)
+*/
+#define DISK_DRIVE_SECTOR_SIZE 512U
+
+/* position reserved in an array of parts of a log record */
+#define TRANSLOG_INTERNAL_PARTS 2
+
+/* types of records in the transaction log */
+/* TODO: Set numbers for these when we have all entries figured out */
+
+enum translog_record_type
+{
+  LOGREC_RESERVED_FOR_CHUNKS23= 0,
+  LOGREC_REDO_INSERT_ROW_HEAD,
+  LOGREC_REDO_INSERT_ROW_TAIL,
+  LOGREC_REDO_NEW_ROW_HEAD,
+  LOGREC_REDO_NEW_ROW_TAIL,
+  LOGREC_REDO_INSERT_ROW_BLOBS,
+  LOGREC_REDO_PURGE_ROW_HEAD,
+  LOGREC_REDO_PURGE_ROW_TAIL,
+  LOGREC_REDO_FREE_BLOCKS,
+  LOGREC_REDO_FREE_HEAD_OR_TAIL,
+  LOGREC_REDO_DELETE_ROW, /* unused */
+  LOGREC_REDO_UPDATE_ROW_HEAD, /* unused */
+  LOGREC_REDO_INDEX,
+  LOGREC_REDO_INDEX_NEW_PAGE,
+  LOGREC_REDO_INDEX_FREE_PAGE,
+  LOGREC_REDO_UNDELETE_ROW,
+  LOGREC_CLR_END,
+  LOGREC_PURGE_END,
+  LOGREC_UNDO_ROW_INSERT,
+  LOGREC_UNDO_ROW_DELETE,
+  LOGREC_UNDO_ROW_UPDATE,
+  LOGREC_UNDO_KEY_INSERT,
+  LOGREC_UNDO_KEY_INSERT_WITH_ROOT,
+  LOGREC_UNDO_KEY_DELETE,
+  LOGREC_UNDO_KEY_DELETE_WITH_ROOT,
+  LOGREC_PREPARE,
+  LOGREC_PREPARE_WITH_UNDO_PURGE,
+  LOGREC_COMMIT,
+  LOGREC_COMMIT_WITH_UNDO_PURGE,
+  LOGREC_CHECKPOINT,
+  LOGREC_REDO_CREATE_TABLE,
+  LOGREC_REDO_RENAME_TABLE,
+  LOGREC_REDO_DROP_TABLE,
+  LOGREC_REDO_DELETE_ALL,
+  LOGREC_REDO_REPAIR_TABLE,
+  LOGREC_FILE_ID,
+  LOGREC_LONG_TRANSACTION_ID,
+  LOGREC_INCOMPLETE_LOG,
+  LOGREC_INCOMPLETE_GROUP,
+  LOGREC_UNDO_BULK_INSERT,
+  LOGREC_REDO_BITMAP_NEW_PAGE,
+  LOGREC_IMPORTED_TABLE,
+  LOGREC_DEBUG_INFO,
+  LOGREC_FIRST_FREE,
+  LOGREC_RESERVED_FUTURE_EXTENSION= 63
+};
+#define LOGREC_NUMBER_OF_TYPES 64              /* Maximum, can't be extended */
+
+/* Type of operations in LOGREC_REDO_INDEX */
+
+enum en_key_op
+{
+  KEY_OP_NONE,		/* Not used */
+  KEY_OP_OFFSET,	/* Set current position */
+  KEY_OP_SHIFT,		/* Shift up/or down at current position */
+  KEY_OP_CHANGE,	/* Change data at current position */
+  KEY_OP_ADD_PREFIX,    /* Insert data at start of page */
+  KEY_OP_DEL_PREFIX,	/* Delete data at start of page */
+  KEY_OP_ADD_SUFFIX,    /* Insert data at end of page */
+  KEY_OP_DEL_SUFFIX,    /* Delete data at end of page */
+  KEY_OP_CHECK,         /* For debugging; CRC of used part of page */
+  KEY_OP_MULTI_COPY,    /* List of memcpy()s with fixed-len sources in page */
+  KEY_OP_SET_PAGEFLAG,  /* Set pageflag from next byte */
+  KEY_OP_COMPACT_PAGE,	/* Compact key page */
+  KEY_OP_MAX_PAGELENGTH, /* Set page to max page length */
+  KEY_OP_DEBUG,		/* Entry for storing what triggered redo_index */
+  KEY_OP_DEBUG_2	/* Entry for pagelengths */
+};
+
+enum en_key_debug
+{
+  KEY_OP_DEBUG_RTREE_COMBINE, 		/* 0 */
+  KEY_OP_DEBUG_RTREE_SPLIT,		/* 1 */
+  KEY_OP_DEBUG_RTREE_SET_KEY,		/* 2 */
+  KEY_OP_DEBUG_FATHER_CHANGED_1,	/* 3 */
+  KEY_OP_DEBUG_FATHER_CHANGED_2,	/* 4 */
+  KEY_OP_DEBUG_LOG_SPLIT,		/* 5 */
+  KEY_OP_DEBUG_LOG_ADD_1,		/* 6 */
+  KEY_OP_DEBUG_LOG_ADD_2,		/* 7 */
+  KEY_OP_DEBUG_LOG_ADD_3,		/* 8 */
+  KEY_OP_DEBUG_LOG_ADD_4,		/* 9 */
+  KEY_OP_DEBUG_LOG_PREFIX_1,		/* 10 */
+  KEY_OP_DEBUG_LOG_PREFIX_2,		/* 11 */
+  KEY_OP_DEBUG_LOG_PREFIX_3,		/* 12 */
+  KEY_OP_DEBUG_LOG_PREFIX_4,		/* 13 */
+  KEY_OP_DEBUG_LOG_PREFIX_5,		/* 14 */
+  KEY_OP_DEBUG_LOG_DEL_CHANGE_1,	/* 15 */
+  KEY_OP_DEBUG_LOG_DEL_CHANGE_2,	/* 16 */
+  KEY_OP_DEBUG_LOG_DEL_CHANGE_3,	/* 17 */
+  KEY_OP_DEBUG_LOG_DEL_CHANGE_RT,	/* 18 */
+  KEY_OP_DEBUG_LOG_DEL_PREFIX,		/* 19 */
+  KEY_OP_DEBUG_LOG_MIDDLE		/* 20 */
+};
+
+
+enum translog_debug_info_type
+{
+  LOGREC_DEBUG_INFO_QUERY
+};
+
+/* Size of log file; One log file is restricted to 4G */
+typedef uint32 translog_size_t;
+
+#define TRANSLOG_RECORD_HEADER_MAX_SIZE 1024U
+
+typedef struct st_translog_group_descriptor
+{
+  TRANSLOG_ADDRESS addr;
+  uint8 num;
+} TRANSLOG_GROUP;
+
+
+typedef struct st_translog_header_buffer
+{
+  /* LSN of the read record */
+  LSN lsn;
+  /* array of group descriptors; can be used only if groups_no > 0 */
+  TRANSLOG_GROUP *groups;
+  /* short transaction ID, or 0 if it is meaningless for the record */
+  SHORT_TRANSACTION_ID short_trid;
+  /*
+     The record length in buffer (including the read header, but excluding
+     the hidden part of the record (type, short TrID, length))
+  */
+  translog_size_t record_length;
+  /*
+     Buffer the decoded header of the record is written to (content depends
+     on the record type)
+  */
+  uchar header[TRANSLOG_RECORD_HEADER_MAX_SIZE];
+  /* number of groups listed in 'groups' */
+  uint groups_no;
+  /* in multi-group number of chunk0 pages (valid only if groups_no > 0) */
+  uint chunk0_pages;
+  /* type of the read record */
+  enum translog_record_type type;
+  /* chunk 0 data address (valid only if groups_no > 0) */
+  TRANSLOG_ADDRESS chunk0_data_addr;
+  /*
+     Real compressed LSN(s) size economy (<number of LSN(s)>*7 - <real_size>)
+  */
+  int16 compressed_LSN_economy;
+  /* offset at which the non-header (body) data starts -- TODO confirm base */
+  uint16 non_header_data_start_offset;
+  /* length of the not yet read body data in this first chunk */
+  uint16 non_header_data_len;
+  /* chunk 0 data size (valid only if groups_no > 0) */
+  uint16 chunk0_data_len;
+} TRANSLOG_HEADER_BUFFER;
+
+
+typedef struct st_translog_scanner_data
+{
+  uchar buffer[TRANSLOG_PAGE_SIZE];             /* buffer for page content */
+  TRANSLOG_ADDRESS page_addr;                  /* current page address */
+  /* end of the log as we saw it last time */
+  TRANSLOG_ADDRESS horizon;
+  TRANSLOG_ADDRESS last_file_page;             /* last page in this file */
+  uchar *page;                                  /* page content pointer */
+  /* direct link to the current page, or NULL if not supported/requested */
+  PAGECACHE_BLOCK_LINK *direct_link;
+  /* offset of the chunk in the page */
+  translog_size_t page_offset;
+  /* set horizon only once, at init */
+  my_bool fixed_horizon;
+  /* try to get a direct link to the page if possible */
+  my_bool use_direct_link;
+} TRANSLOG_SCANNER_DATA;
+
+
+typedef struct st_translog_reader_data
+{
+  TRANSLOG_HEADER_BUFFER header;                /* Header */
+  TRANSLOG_SCANNER_DATA scanner;                /* chunks scanner */
+  translog_size_t body_offset;                  /* current chunk body offset */
+  /* data offset from the record beginning */
+  translog_size_t current_offset;
+  /* number of bytes read in header */
+  uint16 read_header;
+  uint16 chunk_size;                            /* current chunk size */
+  uint current_group;                           /* current group */
+  uint current_chunk;                           /* current chunk in the group */
+  my_bool eor;                                  /* end of the record */
+} TRANSLOG_READER_DATA;
+
+C_MODE_START
+
+/* Records types for unittests */
+#define LOGREC_FIXED_RECORD_0LSN_EXAMPLE 1
+#define LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE 2
+#define LOGREC_FIXED_RECORD_1LSN_EXAMPLE 3
+#define LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE 4
+#define LOGREC_FIXED_RECORD_2LSN_EXAMPLE 5
+#define LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE 6
+
+extern void translog_example_table_init();
+extern void translog_table_init();
+#define translog_init(D,M,V,I,C,F,R) \
+  translog_init_with_table(D,M,V,I,C,F,R,&translog_table_init,0)
+extern my_bool translog_init_with_table(const char *directory,
+                                        uint32 log_file_max_size,
+                                        uint32 server_version,
+                                        uint32 server_id,
+                                        PAGECACHE *pagecache,
+                                        uint flags,
+                                        my_bool readonly,
+                                        void (*init_table_func)(),
+                                        my_bool no_error);
+
+extern my_bool
+translog_write_record(LSN *lsn, enum translog_record_type type, TRN *trn,
+                      MARIA_HA *tbl_info,
+                      translog_size_t rec_len, uint part_no,
+                      LEX_CUSTRING *parts_data, uchar *store_share_id,
+                      void *hook_arg);
+
+extern void translog_destroy();
+
+extern int translog_read_record_header(LSN lsn, TRANSLOG_HEADER_BUFFER *buff);
+
+extern void translog_free_record_header(TRANSLOG_HEADER_BUFFER *buff);
+
+extern translog_size_t translog_read_record(LSN lsn,
+					    translog_size_t offset,
+					    translog_size_t length,
+					    uchar *buffer,
+					    struct st_translog_reader_data
+					    *data);
+
+extern my_bool translog_flush(TRANSLOG_ADDRESS lsn);
+
+extern my_bool translog_scanner_init(LSN lsn,
+				     my_bool fixed_horizon,
+				     struct st_translog_scanner_data *scanner,
+                                     my_bool use_direct_link);
+extern void translog_destroy_scanner(TRANSLOG_SCANNER_DATA *scanner);
+
+extern int translog_read_next_record_header(TRANSLOG_SCANNER_DATA *scanner,
+                                            TRANSLOG_HEADER_BUFFER *buff);
+extern LSN translog_get_file_max_lsn_stored(uint32 file);
+extern my_bool translog_purge(TRANSLOG_ADDRESS low);
+extern my_bool translog_is_file(uint file_no);
+extern void translog_lock();
+extern void translog_unlock();
+extern void translog_lock_handler_assert_owner();
+extern TRANSLOG_ADDRESS translog_get_horizon();
+extern TRANSLOG_ADDRESS translog_get_horizon_no_lock();
+extern int translog_assign_id_to_share(struct st_maria_handler *tbl_info,
+                                       TRN *trn);
+extern void translog_deassign_id_from_share(struct st_maria_share *share);
+extern void
+translog_assign_id_to_share_from_recovery(struct st_maria_share *share,
+                                          uint16 id);
+extern my_bool translog_walk_filenames(const char *directory,
+                                       my_bool (*callback)(const char *,
+                                                           const char *));
+extern my_bool translog_log_debug_info(TRN *trn,
+                                       enum translog_debug_info_type type,
+                                       uchar *info, size_t length);
+
+enum enum_translog_status
+{
+  TRANSLOG_UNINITED, /* no initialization done or error during initialization */
+  TRANSLOG_OK,       /* transaction log is functioning */
+  TRANSLOG_READONLY, /* read only mode due to write errors */
+  TRANSLOG_SHUTDOWN  /* going to shutdown the loghandler */
+};
+extern enum enum_translog_status translog_status;
+extern ulonglong translog_syncs; /* Number of sync()s */
+
+void translog_soft_sync(my_bool mode);
+void translog_hard_group_commit(my_bool mode);
+int translog_soft_sync_start(void);
+void  translog_soft_sync_end(void);
+void translog_sync();
+void translog_set_group_commit_interval(uint32 interval);
+
+/*
+  all the rest added because of recovery; should we make
+  ma_loghandler_for_recovery.h ?
+*/
+
+#define SHARE_ID_MAX 65535 /* array's size */
+
+extern LSN translog_first_lsn_in_log();
+extern LSN translog_first_theoretical_lsn();
+extern LSN translog_next_LSN(TRANSLOG_ADDRESS addr, TRANSLOG_ADDRESS horizon);
+extern my_bool translog_purge_at_flush();
+extern uint32 translog_get_first_file(TRANSLOG_ADDRESS horizon);
+extern uint32 translog_get_first_needed_file();
+extern char *translog_filename_by_fileno(uint32 file_no, char *path);
+extern void translog_set_file_size(uint32 size);
+
+/* record parts descriptor */
+struct st_translog_parts
+{
+  /* full record length */
+  translog_size_t record_length;
+  /* full record length with chunk headers */
+  translog_size_t total_record_length;
+  /* current part index */
+  uint current;
+  /* total number of elements in parts */
+  uint elements;
+  /* array of parts */
+  LEX_CUSTRING *parts;
+};
+
+typedef my_bool(*prewrite_rec_hook) (enum translog_record_type type,
+                                     TRN *trn,
+                                     struct st_maria_handler *tbl_info,
+                                     void *hook_arg);
+
+typedef my_bool(*inwrite_rec_hook) (enum translog_record_type type,
+                                    TRN *trn,
+                                    struct st_maria_handler *tbl_info,
+                                    LSN *lsn, void *hook_arg);
+
+typedef uint16(*read_rec_hook) (enum translog_record_type type,
+                                uint16 read_length, uchar *read_buff,
+                                uchar *decoded_buff);
+
+
+/* record classes */
+enum record_class
+{
+  LOGRECTYPE_NOT_ALLOWED,
+  LOGRECTYPE_VARIABLE_LENGTH,
+  LOGRECTYPE_PSEUDOFIXEDLENGTH,
+  LOGRECTYPE_FIXEDLENGTH
+};
+
+enum enum_record_in_group {
+  LOGREC_NOT_LAST_IN_GROUP= 0, LOGREC_LAST_IN_GROUP, LOGREC_IS_GROUP_ITSELF
+};
+
+/*
+  Descriptor of log record type
+*/
+typedef struct st_log_record_type_descriptor
+{
+  /* internal class of the record */
+  enum record_class rclass;
+  /*
+    length for fixed-size record, pseudo-fixed record
+    length with uncompressed LSNs
+  */
+  uint16 fixed_length;
+  /* how much record body (belonged to headers too) read with headers */
+  uint16 read_header_len;
+  /* HOOK for writing the record called before lock */
+  prewrite_rec_hook prewrite_hook;
+  /* HOOK for writing the record called when LSN is known, inside lock */
+  inwrite_rec_hook inwrite_hook;
+  /* HOOK for reading headers */
+  read_rec_hook read_hook;
+  /*
+    For pseudo fixed records number of compressed LSNs followed by
+    system header
+  */
+  int16 compressed_LSN;
+  /*  the rest is for maria_read_log & Recovery */
+  /** @brief for debug error messages or "maria_read_log" command-line tool */
+  const char *name;
+  enum enum_record_in_group record_in_group;
+  /* a function to execute when we see the record during the REDO phase */
+  int (*record_execute_in_redo_phase)(const TRANSLOG_HEADER_BUFFER *);
+  /* a function to execute when we see the record during the UNDO phase */
+  int (*record_execute_in_undo_phase)(const TRANSLOG_HEADER_BUFFER *, TRN *);
+} LOG_DESC;
+
+extern LOG_DESC log_record_type_descriptor[LOGREC_NUMBER_OF_TYPES];
+
+typedef enum
+{
+  TRANSLOG_GCOMMIT_NONE,
+  TRANSLOG_GCOMMIT_HARD,
+  TRANSLOG_GCOMMIT_SOFT
+} enum_maria_group_commit;
+extern ulong maria_group_commit;
+extern ulong maria_group_commit_interval;
+typedef enum
+{
+  TRANSLOG_PURGE_IMMIDIATE,
+  TRANSLOG_PURGE_EXTERNAL,
+  TRANSLOG_PURGE_ONDEMAND
+} enum_maria_translog_purge_type;
+extern ulong log_purge_type;
+extern ulong log_file_size;
+
+typedef enum
+{
+  TRANSLOG_SYNC_DIR_NEVER,
+  TRANSLOG_SYNC_DIR_NEWFILE,
+  TRANSLOG_SYNC_DIR_ALWAYS
+} enum_maria_sync_log_dir;
+extern ulong sync_log_dir;
+
+C_MODE_END
+#endif
diff --git a/storage/maria/ma_loghandler_lsn.h b/storage/maria/ma_loghandler_lsn.h
new file mode 100644
index 00000000000..7fa53bc0a50
--- /dev/null
+++ b/storage/maria/ma_loghandler_lsn.h
@@ -0,0 +1,111 @@
+/* Copyright (C) 2007 MySQL AB & Sanja Belkin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _ma_loghandler_lsn_h
+#define _ma_loghandler_lsn_h
+
+/*
+  Transaction log record address:
+  file_no << 32 | offset
+  file_no is only 3 bytes so we can use signed integer to make
+  comparison simpler.
+*/
+typedef int64 TRANSLOG_ADDRESS;
+
+/*
+  Compare addresses
+    A1 >  A2 -> result  > 0
+    A1 == A2 -> 0
+    A1 <  A2 -> result < 0
+*/
+#define cmp_translog_addr(A1,A2) ((A1) - (A2))
+
+/*
+  TRANSLOG_ADDRESS is just address of some byte in the log (usually some
+    chunk)
+  LSN used where address of some record in the log needed (not just any
+    address)
+*/
+typedef TRANSLOG_ADDRESS LSN;
+
+/* Gets file number part of a LSN/log address */
+#define LSN_FILE_NO(L) (uint32) ((L) >> 32)
+
+/* Gets raw file number part of a LSN/log address */
+#define LSN_FILE_NO_PART(L) ((L) & ((int64)0xFFFFFF00000000LL))
+
+/* Parts of LSN for printing */
+#define LSN_IN_PARTS(L) (ulong)LSN_FILE_NO(L),(ulong)LSN_OFFSET(L)
+
+/* Gets record offset of a LSN/log address */
+#define LSN_OFFSET(L) (ulong) ((L) & 0xFFFFFFFFL)
+
+/* Makes lsn/log address from file number and record offset */
+#define MAKE_LSN(F,S) ((LSN) ((((uint64)(F)) << 32) | (S)))
+
+/* checks LSN */
+#define LSN_VALID(L)                                    \
+  ((LSN_FILE_NO_PART(L) != FILENO_IMPOSSIBLE) &&        \
+   (LSN_OFFSET(L) != LOG_OFFSET_IMPOSSIBLE))
+
+/* size of stored LSN on a disk, don't change it! */
+#define LSN_STORE_SIZE 7
+
+/* Puts LSN into buffer (dst) */
+#define lsn_store(dst, lsn) \
+  do { \
+    int3store((dst), LSN_FILE_NO(lsn)); \
+    int4store((char*)(dst) + 3, LSN_OFFSET(lsn)); \
+  } while (0)
+
+/* Unpacks LSN from the buffer (P) */
+#define lsn_korr(P) MAKE_LSN(uint3korr(P), uint4korr((const char*)(P) + 3))
+
+/* value to add to an LSN to advance it by one file */
+#define LSN_ONE_FILE ((int64)0x100000000LL)
+
+#define LSN_REPLACE_OFFSET(L, S) (LSN_FILE_NO_PART(L) | (S))
+
+/*
+  An 8-byte type whose most significant byte is used for "flags"; the 7
+  other bytes are an LSN. The argument is parenthesized so any expression works.
+*/
+typedef LSN LSN_WITH_FLAGS;
+#define LSN_WITH_FLAGS_TO_LSN(x)   ((x) & ULL(0x00FFFFFFFFFFFFFF))
+#define LSN_WITH_FLAGS_TO_FLAGS(x) ((x) & ULL(0xFF00000000000000))
+
+#define FILENO_IMPOSSIBLE     0 /**< log file's numbering starts at 1 */
+#define LOG_OFFSET_IMPOSSIBLE 0 /**< log always has a header */
+#define LSN_IMPOSSIBLE        ((LSN)0)
+/* following LSN also is impossible */
+#define LSN_ERROR             ((LSN)1)
+
+/** @brief some impossible LSNs serve as markers */
+
+/**
+   When table is modified by maria_chk, or auto-zerofilled, old REDOs don't
+   apply, table is freshly born again somehow: its state's LSNs need to be
+   updated to the new instance which receives this table.
+*/
+#define LSN_NEEDS_NEW_STATE_LSNS ((LSN)2)
+
+/**
+   @brief the maximum valid LSN.
+   Unlike ULONGLONG_MAX, it can be safely used in comparison with valid LSNs
+   (ULONGLONG_MAX is too big for correctness of cmp_translog_addr()).
+*/
+#define LSN_MAX (LSN)ULL(0x00FFFFFFFFFFFFFF)
+
+#endif
diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c
new file mode 100644
index 00000000000..63e1801a39a
--- /dev/null
+++ b/storage/maria/ma_open.c
@@ -0,0 +1,1945 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Open a Maria table */
+
+#include "ma_fulltext.h"
+#include "ma_sp_defs.h"
+#include "ma_rt_index.h"
+#include "ma_blockrec.h"
+#include <m_ctype.h>
+
+#if defined(MSDOS) || defined(__WIN__)
+#ifdef __WIN__
+#include <fcntl.h>
+#else
+#include <process.h>			/* Prototype for getpid */
+#endif
+#endif
+
+static void setup_key_functions(MARIA_KEYDEF *keyinfo);
+static my_bool maria_scan_init_dummy(MARIA_HA *info);
+static void maria_scan_end_dummy(MARIA_HA *info);
+static my_bool maria_once_init_dummy(MARIA_SHARE *, File);
+static my_bool maria_once_end_dummy(MARIA_SHARE *);
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base);
+static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state);
+
+#define get_next_element(to,pos,size) { memcpy((char*) to,pos,(size_t) size); \
+					pos+=size;}
+
+
+#define disk_pos_assert(pos, end_pos) \
+if (pos > end_pos)             \
+{                              \
+  my_errno=HA_ERR_CRASHED;     \
+  goto err;                    \
+}
+
+
+/******************************************************************************
+** Return an open handler for 'filename' with last_version set, else 0.
+** In MySQL the server will handle version issues.
+******************************************************************************/
+
+MARIA_HA *_ma_test_if_reopen(const char *filename)
+{
+  LIST *node= maria_open_list;
+  while (node)
+  {
+    MARIA_HA *candidate= (MARIA_HA*) node->data;
+    if (strcmp(candidate->s->unique_file_name.str, filename) == 0 &&
+        candidate->s->last_version)
+      return candidate;
+    node= node->next;
+  }
+  return 0;
+}
+
+
+/*
+  Open a new instance of an already opened Maria table
+
+  SYNOPSIS
+    maria_clone_internal()
+    share	Share of already open table
+    name        Name handed to _ma_open_datafile() when data_file < 0
+    mode	Mode of table (O_RDONLY | O_RDWR)
+    data_file   File descriptor of data file to use; < 0 if we should open it
+
+ RETURN
+    #   Maria handler
+    0   Error (my_errno is set)
+*/
+
+
+static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, const char *name,
+                                      int mode, File data_file)
+{
+  int save_errno;
+  uint errpos;
+  MARIA_HA info,*m_info;
+  my_bitmap_map *changed_fields_bitmap;
+  DBUG_ENTER("maria_clone_internal");
+
+  errpos= 0;
+  bzero((uchar*) &info,sizeof(info));
+
+  if (mode == O_RDWR && share->mode == O_RDONLY)
+  {
+    my_errno=EACCES;				/* Can't open in write mode */
+    goto err;
+  }
+  if (data_file >= 0)
+    info.dfile.file= data_file;
+  else if (_ma_open_datafile(&info, share, name, -1))
+    goto err;
+  errpos= 5;                          /* from here: data file cleanup on error */
+
+  /* alloc and set up private structure parts (one allocation for all) */
+  if (!my_multi_malloc(MY_WME,
+		       &m_info,sizeof(MARIA_HA),
+		       &info.blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+		       &info.buff,(share->base.max_key_block_length*2+
+				   share->base.max_key_length),
+		       &info.lastkey_buff,share->base.max_key_length*2+1,
+		       &info.first_mbr_key, share->base.max_key_length,
+		       &info.maria_rtree_recursion_state,
+                       share->have_rtree ? 1024 : 0,
+                       &changed_fields_bitmap,
+                       bitmap_buffer_size(share->base.fields),
+		       NullS))
+    goto err;
+  errpos= 6;                          /* from here: m_info is freed on error */
+
+  memcpy(info.blobs,share->blobs,sizeof(MARIA_BLOB)*share->base.blobs);
+  info.lastkey_buff2= info.lastkey_buff + share->base.max_key_length;
+  info.last_key.data= info.lastkey_buff;
+
+  info.s=share;
+  info.cur_row.lastpos= HA_OFFSET_ERROR;
+  info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND);
+  info.opt_flag=READ_CHECK_USED;
+  info.this_unique= (ulong) info.dfile.file; /* Uniq number in process */
+#ifdef EXTERNAL_LOCKING
+  if (share->data_file_type == COMPRESSED_RECORD)
+    info.this_unique= share->state.unique;
+  info.this_loop=0;				/* Update counter */
+  info.last_unique= share->state.unique;
+  info.last_loop=   share->state.update_count;
+#endif
+  info.errkey= -1;
+  info.page_changed=1;
+  info.keyread_buff= info.buff + share->base.max_key_block_length;
+
+  info.lock_type= F_UNLCK;
+  if (share->options & HA_OPTION_TMP_TABLE)
+    info.lock_type= F_WRLCK;
+
+  _ma_set_data_pagecache_callbacks(&info.dfile, share);
+  bitmap_init(&info.changed_fields, changed_fields_bitmap,
+              share->base.fields, 0);
+  if ((*share->init)(&info))
+    goto err;
+
+  /* The following should be big enough for all pinning purposes */
+  if (my_init_dynamic_array(&info.pinned_pages,
+                            sizeof(MARIA_PINNED_PAGE),
+                            max(share->base.blobs*2 + 4,
+                                MARIA_MAX_TREE_LEVELS*3), 16))
+    goto err;
+
+
+  pthread_mutex_lock(&share->intern_lock);
+  info.read_record= share->read_record;
+  share->reopen++;
+  share->write_flag=MYF(MY_NABP | MY_WAIT_IF_FULL);
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    info.lock_type=F_RDLCK;
+    share->r_locks++;
+    share->tot_locks++;
+  }
+  if ((share->options & HA_OPTION_DELAY_KEY_WRITE) &&
+      maria_delay_key_write)
+    share->delay_key_write=1;
+
+  if (!share->base.born_transactional)   /* Not born transactional */
+  {
+    /* give it the dummy trn; the else branch (transactional) sets none */
+    _ma_set_trn_for_table(&info, &dummy_transaction_object);
+    info.state= &share->state.state;	/* Change global values by default */
+  }
+  else
+  {
+    info.state=  &share->state.common;
+    *info.state= share->state.state;            /* Initial values */
+  }
+  info.state_start= info.state;                 /* Initial values */
+
+  pthread_mutex_unlock(&share->intern_lock);
+
+  /* Allocate buffer for one record */
+  /* prerequisites: info.rec_buff == 0 && info.rec_buff_size == 0 (bzero above) */
+  if (_ma_alloc_buffer(&info.rec_buff, &info.rec_buff_size,
+                       share->base.default_rec_buff_size))
+    goto err;
+
+  bzero(info.rec_buff, share->base.default_rec_buff_size);
+
+  *m_info=info;                         /* copy the set-up handler to the heap */
+#ifdef THREAD
+  thr_lock_data_init(&share->lock,&m_info->lock,(void*) m_info);
+#endif
+  m_info->open_list.data=(void*) m_info;
+  maria_open_list=list_add(maria_open_list,&m_info->open_list);
+
+  DBUG_RETURN(m_info);
+
+err:
+  DBUG_PRINT("error", ("error: %d", my_errno));
+  save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+  if ((save_errno == HA_ERR_CRASHED) ||
+      (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+      (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+    _ma_report_error(save_errno, &share->open_file_name);
+  switch (errpos) {                     /* undo in reverse order of set-up */
+  case 6:
+    (*share->end)(&info);
+    delete_dynamic(&info.pinned_pages);
+    my_free(m_info, MYF(0));
+    /* fall through */
+  case 5:
+    if (data_file < 0)                  /* close only a file we opened here */
+      VOID(my_close(info.dfile.file, MYF(0)));
+    break;
+  }
+  my_errno=save_errno;
+  DBUG_RETURN (NULL);
+} /* maria_clone_internal */
+
+
+/* Clone a maria table; BLOCK_RECORD tables reuse the bitmap's data file */
+MARIA_HA *maria_clone(MARIA_SHARE *share, int mode)
+{
+  MARIA_HA *clone;
+  File data_file= -1;                   /* < 0: the clone opens its own file */
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (share->data_file_type == BLOCK_RECORD)
+    data_file= share->bitmap.file.file;
+  clone= maria_clone_internal(share, NullS, mode, data_file);
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  return clone;
+}
+
+
+/******************************************************************************
+  open a MARIA table
+
+  See my_base.h for the handle_locking argument
+  if handle_locking and HA_OPEN_ABORT_IF_CRASHED then abort if the table
+  is marked crashed or if we are not using locking and the table doesn't
+  have an open count of 0.
+
+  name        Table name; the index file name is built from it with
+              fn_format() and MARIA_NAME_IEXT.
+  mode        Open mode. If opening read/write fails with EROFS or EACCES
+              and mode is O_RDONLY, the index file is reopened read only.
+  open_flags  Bitmap of HA_OPEN_xxx flags.
+
+  The function holds THR_LOCK_maria from the reopen test until it returns.
+  Unless HA_OPEN_COPY is given, an already open table found by
+  _ma_test_if_reopen() is reused and only a new handler is created for it.
+  Otherwise the header, key, unique and column definitions are read from
+  the index file into a newly allocated MARIA_SHARE.
+
+  'errpos' records how far the open has come so that the cleanup code at
+  'err' knows which resources to release:
+    1  index file is open
+    3  disk_cache (header buffer + temporary statistic arrays) is allocated
+    4  share is allocated
+    5  data file may be open, share functions are (or will be) set up
+
+  Returns a new handler from maria_clone_internal(), or NULL with my_errno
+  set on failure.
+******************************************************************************/
+
+MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
+{
+  int kfile,open_mode,save_errno;
+  uint i,j,len,errpos,head_length,base_pos,keys, realpath_err,
+    key_parts,unique_key_parts,fulltext_keys,uniques;
+  size_t info_length;
+  char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN],
+       data_name[FN_REFLEN];
+  uchar *disk_cache, *disk_pos, *end_pos;
+  MARIA_HA info,*m_info,*old_info;
+  MARIA_SHARE share_buff,*share;
+  double *rec_per_key_part;
+  ulong  *nulls_per_key_part;
+  my_off_t key_root[HA_MAX_POSSIBLE_KEY];
+  ulonglong max_key_file_length, max_data_file_length;
+  my_bool versioning= 1;
+  File data_file= -1;
+  DBUG_ENTER("maria_open");
+
+  LINT_INIT(m_info);
+  kfile= -1;
+  errpos= 0;
+  head_length=sizeof(share_buff.state.header);
+  bzero((uchar*) &info,sizeof(info));
+
+  realpath_err= my_realpath(name_buff, fn_format(org_name, name, "",
+                                                 MARIA_NAME_IEXT,
+                                                 MY_UNPACK_FILENAME),MYF(0));
+  /* Refuse symlinks that can't be resolved or that fail the symlink test */
+  if (my_is_symlink(org_name) &&
+      (realpath_err || (*maria_test_invalid_symlink)(name_buff)))
+  {
+    my_errno= HA_WRONG_CREATE_OPTION;
+    DBUG_RETURN(0);
+  }
+
+  pthread_mutex_lock(&THR_LOCK_maria);
+  old_info= 0;
+  if ((open_flags & HA_OPEN_COPY) ||
+      !(old_info=_ma_test_if_reopen(name_buff)))
+  {
+    /*
+      No reusable open instance: build the share on the stack first
+      (share_buff) and copy it to the heap once all sizes are known.
+    */
+    share= &share_buff;
+    bzero((uchar*) &share_buff,sizeof(share_buff));
+    share_buff.state.key_root=key_root;
+    share_buff.pagecache= multi_pagecache_search((uchar*) name_buff,
+						 (uint) strlen(name_buff),
+                                                 maria_pagecache);
+
+    DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_open",
+                    if (strstr(name, "/t1"))
+                    {
+                      my_errno= HA_ERR_CRASHED;
+                      goto err;
+                    });
+    if ((kfile=my_open(name_buff,(open_mode=O_RDWR) | O_SHARE,MYF(0))) < 0)
+    {
+      if ((errno != EROFS && errno != EACCES) ||
+	  mode != O_RDONLY ||
+	  (kfile=my_open(name_buff,(open_mode=O_RDONLY) | O_SHARE,MYF(0))) < 0)
+	goto err;
+    }
+    share->mode=open_mode;
+    errpos= 1;
+    if (my_pread(kfile,share->state.header.file_version, head_length, 0,
+                 MYF(MY_NABP)))
+    {
+      my_errno= HA_ERR_NOT_A_TABLE;
+      goto err;
+    }
+    if (memcmp(share->state.header.file_version, maria_file_magic, 4))
+    {
+      DBUG_PRINT("error",("Wrong header in %s",name_buff));
+      DBUG_DUMP("error_dump", share->state.header.file_version,
+		head_length);
+      my_errno=HA_ERR_NOT_A_TABLE;
+      goto err;
+    }
+    share->options= mi_uint2korr(share->state.header.options);
+    /* Reject files that use option bits this code doesn't know about */
+    if (share->options &
+	~(HA_OPTION_PACK_RECORD | HA_OPTION_PACK_KEYS |
+	  HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA |
+	  HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM |
+          HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
+          HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS |
+          HA_OPTION_PAGE_CHECKSUM))
+    {
+      DBUG_PRINT("error",("wrong options: 0x%lx", share->options));
+      my_errno=HA_ERR_NEW_FILE;
+      goto err;
+    }
+    if ((share->options & HA_OPTION_RELIES_ON_SQL_LAYER) &&
+        ! (open_flags & HA_OPEN_FROM_SQL_LAYER))
+    {
+      DBUG_PRINT("error", ("table cannot be opened from non-sql layer"));
+      my_errno= HA_ERR_UNSUPPORTED;
+      goto err;
+    }
+    /* Don't call realpath() if the name can't be a link */
+    if (!strcmp(name_buff, org_name) ||
+        my_readlink(index_name, org_name, MYF(0)) == -1)
+      (void) strmov(index_name, org_name);
+    *strrchr(org_name, FN_EXTCHAR)= '\0';
+    (void) fn_format(data_name,org_name,"",MARIA_NAME_DEXT,
+                     MY_APPEND_EXT|MY_UNPACK_FILENAME|MY_RESOLVE_SYMLINKS);
+
+    info_length=mi_uint2korr(share->state.header.header_length);
+    base_pos= mi_uint2korr(share->state.header.base_pos);
+
+    /*
+      Allocate space for header information and for data that is too
+      big to keep on stack
+    */
+    if (!my_multi_malloc(MY_WME,
+                         &disk_cache, info_length+128,
+                         &rec_per_key_part,
+                         (sizeof(*rec_per_key_part) * HA_MAX_POSSIBLE_KEY *
+                          HA_MAX_KEY_SEG),
+                         &nulls_per_key_part,
+                         (sizeof(*nulls_per_key_part) * HA_MAX_POSSIBLE_KEY *
+                          HA_MAX_KEY_SEG),
+                         NullS))
+    {
+      my_errno=ENOMEM;
+      goto err;
+    }
+    share_buff.state.rec_per_key_part=   rec_per_key_part;
+    share_buff.state.nulls_per_key_part= nulls_per_key_part;
+
+    end_pos=disk_cache+info_length;
+    errpos= 3;
+    if (my_pread(kfile, disk_cache, info_length, 0L, MYF(MY_NABP)))
+    {
+      my_errno=HA_ERR_CRASHED;
+      goto err;
+    }
+    len=mi_uint2korr(share->state.header.state_info_length);
+    keys=    (uint) share->state.header.keys;
+    uniques= (uint) share->state.header.uniques;
+    fulltext_keys= (uint) share->state.header.fulltext_keys;
+    key_parts= mi_uint2korr(share->state.header.key_parts);
+    unique_key_parts= mi_uint2korr(share->state.header.unique_key_parts);
+    if (len != MARIA_STATE_INFO_SIZE)
+    {
+      DBUG_PRINT("warning",
+		 ("saved_state_info_length: %d  state_info_length: %d",
+		  len,MARIA_STATE_INFO_SIZE));
+    }
+    share->state_diff_length=len-MARIA_STATE_INFO_SIZE;
+
+    _ma_state_info_read(disk_cache, &share->state);
+    len= mi_uint2korr(share->state.header.base_info_length);
+    if (len != MARIA_BASE_INFO_SIZE)
+    {
+      DBUG_PRINT("warning",("saved_base_info_length: %d  base_info_length: %d",
+			    len,MARIA_BASE_INFO_SIZE));
+    }
+    disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base);
+    share->state.state_length=base_pos;
+
+    if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
+	((share->state.changed & STATE_CRASHED) ||
+	 ((open_flags & HA_OPEN_ABORT_IF_CRASHED) &&
+	  (my_disable_locking && share->state.open_count))))
+    {
+      DBUG_PRINT("error",("Table is marked as crashed. open_flags: %u  "
+                          "changed: %u  open_count: %u  !locking: %d",
+                          open_flags, share->state.changed,
+                          share->state.open_count, my_disable_locking));
+      my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+		HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+      goto err;
+    }
+
+    /*
+      We can ignore testing uuid if STATE_NOT_MOVABLE is set, as in this
+      case the uuid will be set in _ma_mark_file_changed()
+    */
+    if ((share->state.changed & STATE_NOT_MOVABLE) &&
+        share->base.born_transactional &&
+        ((!(open_flags & HA_OPEN_IGNORE_MOVED_STATE) &&
+          memcmp(share->base.uuid, maria_uuid, MY_UUID_SIZE)) ||
+         (share->state.create_trid > trnman_get_max_trid() &&
+          !maria_in_recovery)))
+    {
+      DBUG_PRINT("warning", ("table is moved from another system.  uuid_diff: %d  create_trid: %lu  max_trid: %lu",
+                            memcmp(share->base.uuid, maria_uuid,
+                                   MY_UUID_SIZE) != 0,
+                             (ulong) share->state.create_trid,
+                             (ulong) trnman_get_max_trid()));
+      if (open_flags & HA_OPEN_FOR_REPAIR)
+        share->state.changed|= STATE_MOVED;
+      else
+      {
+        my_errno= HA_ERR_OLD_FILE;
+        goto err;
+      }
+    }
+
+    /* sanity check */
+    if (share->base.keystart > 65535 || share->base.rec_reflength > 8)
+    {
+      my_errno=HA_ERR_CRASHED;
+      goto err;
+    }
+
+    key_parts+=fulltext_keys*FT_SEGS;
+    if (share->base.max_key_length > maria_max_key_length() ||
+        keys > MARIA_MAX_KEY || key_parts > MARIA_MAX_KEY * HA_MAX_KEY_SEG)
+    {
+      DBUG_PRINT("error",("Wrong key info:  Max_key_length: %d  keys: %d  key_parts: %d", share->base.max_key_length, keys, key_parts));
+      my_errno=HA_ERR_UNSUPPORTED;
+      goto err;
+    }
+
+    /* Ensure we have space in the key buffer for transaction id's */
+    if (share->base.born_transactional)
+      share->base.max_key_length= ALIGN_SIZE(share->base.max_key_length +
+                                             MARIA_MAX_PACK_TRANSID_SIZE);
+
+    /*
+      If page cache is not initialized, then assume we will create the
+      page_cache after the table is opened!
+      This is only used by maria_check to allow it to check/repair tables
+      with different block sizes.
+    */
+    if (share->base.block_size != maria_block_size &&
+        share_buff.pagecache->inited != 0)
+    {
+      DBUG_PRINT("error", ("Wrong block size %u; Expected %u",
+                           (uint) share->base.block_size,
+                           (uint) maria_block_size));
+      my_errno=HA_ERR_UNSUPPORTED;
+      goto err;
+    }
+
+    /* Correct max_file_length based on length of sizeof(off_t) */
+    max_data_file_length=
+      (share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD)) ?
+      (((ulonglong) 1 << (share->base.rec_reflength*8))-1) :
+      (_ma_safe_mul(share->base.pack_reclength,
+		   (ulonglong) 1 << (share->base.rec_reflength*8))-1);
+
+    max_key_file_length=
+      _ma_safe_mul(maria_block_size,
+		  ((ulonglong) 1 << (share->base.key_reflength*8))-1);
+#if SIZEOF_OFF_T == 4
+    set_if_smaller(max_data_file_length, INT_MAX32);
+    set_if_smaller(max_key_file_length, INT_MAX32);
+#endif
+    share->base.max_data_file_length=(my_off_t) max_data_file_length;
+    share->base.max_key_file_length=(my_off_t) max_key_file_length;
+
+    if (share->options & HA_OPTION_COMPRESS_RECORD)
+      share->base.max_key_length+=2;	/* For safety */
+    /* Add space for node pointer */
+    share->base.max_key_length+= share->base.key_reflength;
+
+    share->unique_file_name.length= strlen(name_buff);
+    share->index_file_name.length=  strlen(index_name);
+    share->data_file_name.length=   strlen(data_name);
+    share->open_file_name.length=   strlen(name);
+    /*
+      Allocate the real share and all its arrays in one block.
+      Note that 'share' still points to share_buff while the argument list
+      is evaluated, so the array pointers are stored in share_buff and
+      reach the new share through the structure copy below.
+    */
+    if (!my_multi_malloc(MY_WME,
+			 &share,sizeof(*share),
+			 &share->state.rec_per_key_part,
+                         sizeof(double) * key_parts,
+                         &share->state.nulls_per_key_part,
+                         sizeof(long)* key_parts,
+			 &share->keyinfo,keys*sizeof(MARIA_KEYDEF),
+			 &share->uniqueinfo,uniques*sizeof(MARIA_UNIQUEDEF),
+			 &share->keyparts,
+			 (key_parts+unique_key_parts+keys+uniques) *
+			 sizeof(HA_KEYSEG),
+			 &share->columndef,
+			 (share->base.fields+1)*sizeof(MARIA_COLUMNDEF),
+                         &share->column_nr, share->base.fields*sizeof(uint16),
+			 &share->blobs,sizeof(MARIA_BLOB)*share->base.blobs,
+			 &share->unique_file_name.str,
+			 share->unique_file_name.length+1,
+			 &share->index_file_name.str,
+                         share->index_file_name.length+1,
+			 &share->data_file_name.str,
+                         share->data_file_name.length+1,
+                         &share->open_file_name.str,
+                         share->open_file_name.length+1,
+			 &share->state.key_root,keys*sizeof(my_off_t),
+			 &share->mmap_lock,sizeof(rw_lock_t),
+			 NullS))
+      goto err;
+    errpos= 4;
+
+    *share=share_buff;
+    memcpy((char*) share->state.rec_per_key_part,
+	   (char*) rec_per_key_part, sizeof(double)*key_parts);
+    memcpy((char*) share->state.nulls_per_key_part,
+	   (char*) nulls_per_key_part, sizeof(long)*key_parts);
+    memcpy((char*) share->state.key_root,
+	   (char*) key_root, sizeof(my_off_t)*keys);
+    strmov(share->unique_file_name.str, name_buff);
+    strmov(share->index_file_name.str, index_name);
+    strmov(share->data_file_name.str,  data_name);
+    strmov(share->open_file_name.str,  name);
+
+    share->block_size= share->base.block_size;   /* Convenience */
+    share->max_index_block_size= share->block_size - KEYPAGE_CHECKSUM_SIZE;
+    {
+      /* Read key and unique definitions; 'pos' walks share->keyparts */
+      HA_KEYSEG *pos=share->keyparts;
+      uint32 ftkey_nr= 1;
+      for (i=0 ; i < keys ; i++)
+      {
+        share->keyinfo[i].share= share;
+	disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]);
+        share->keyinfo[i].key_nr= i;
+        disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE,
+ 			end_pos);
+        if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE)
+          share->have_rtree= 1;
+	share->keyinfo[i].seg=pos;
+	for (j=0 ; j < share->keyinfo[i].keysegs; j++,pos++)
+	{
+	  disk_pos=_ma_keyseg_read(disk_pos, pos);
+	  if (pos->type == HA_KEYTYPE_TEXT ||
+              pos->type == HA_KEYTYPE_VARTEXT1 ||
+              pos->type == HA_KEYTYPE_VARTEXT2)
+	  {
+	    if (!pos->language)
+	      pos->charset=default_charset_info;
+	    else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+	    {
+	      my_errno=HA_ERR_UNKNOWN_CHARSET;
+	      goto err;
+	    }
+	  }
+	  else if (pos->type == HA_KEYTYPE_BINARY)
+	    pos->charset= &my_charset_bin;
+	}
+	if (share->keyinfo[i].flag & HA_SPATIAL)
+	{
+#ifdef HAVE_SPATIAL
+	  uint sp_segs=SPDIMS*2;
+	  share->keyinfo[i].seg=pos-sp_segs;
+	  share->keyinfo[i].keysegs--;
+          versioning= 0;
+#else
+	  my_errno=HA_ERR_UNSUPPORTED;
+	  goto err;
+#endif
+	}
+        else if (share->keyinfo[i].flag & HA_FULLTEXT)
+	{
+          versioning= 0;
+          DBUG_ASSERT(fulltext_keys);
+          {
+            uint k;
+            share->keyinfo[i].seg=pos;
+            for (k=0; k < FT_SEGS; k++)
+            {
+              *pos= ft_keysegs[k];
+              pos[0].language= pos[-1].language;
+              if (!(pos[0].charset= pos[-1].charset))
+              {
+                my_errno=HA_ERR_CRASHED;
+                goto err;
+              }
+              pos++;
+            }
+          }
+          if (!share->ft2_keyinfo.seg)
+          {
+            memcpy(&share->ft2_keyinfo, &share->keyinfo[i],
+                   sizeof(MARIA_KEYDEF));
+            share->ft2_keyinfo.keysegs=1;
+            share->ft2_keyinfo.flag=0;
+            share->ft2_keyinfo.keylength=
+            share->ft2_keyinfo.minlength=
+            share->ft2_keyinfo.maxlength=HA_FT_WLEN+share->base.rec_reflength;
+            share->ft2_keyinfo.seg=pos-1;
+            share->ft2_keyinfo.end=pos;
+            setup_key_functions(& share->ft2_keyinfo);
+          }
+          share->keyinfo[i].ftkey_nr= ftkey_nr++;
+	}
+        setup_key_functions(share->keyinfo+i);
+	share->keyinfo[i].end=pos;
+	pos->type=HA_KEYTYPE_END;			/* End */
+	pos->length=share->base.rec_reflength;
+	pos->null_bit=0;
+	pos->flag=0;					/* For purify */
+	pos++;
+      }
+      for (i=0 ; i < uniques ; i++)
+      {
+	disk_pos=_ma_uniquedef_read(disk_pos, &share->uniqueinfo[i]);
+        disk_pos_assert(disk_pos + share->uniqueinfo[i].keysegs *
+			HA_KEYSEG_SIZE, end_pos);
+	share->uniqueinfo[i].seg=pos;
+	for (j=0 ; j < share->uniqueinfo[i].keysegs; j++,pos++)
+	{
+	  disk_pos=_ma_keyseg_read(disk_pos, pos);
+	  if (pos->type == HA_KEYTYPE_TEXT ||
+              pos->type == HA_KEYTYPE_VARTEXT1 ||
+              pos->type == HA_KEYTYPE_VARTEXT2)
+	  {
+	    if (!pos->language)
+	      pos->charset=default_charset_info;
+	    else if (!(pos->charset= get_charset(pos->language, MYF(MY_WME))))
+	    {
+	      my_errno=HA_ERR_UNKNOWN_CHARSET;
+	      goto err;
+	    }
+	  }
+	}
+	share->uniqueinfo[i].end=pos;
+	pos->type=HA_KEYTYPE_END;			/* End */
+	pos->null_bit=0;
+	pos->flag=0;
+	pos++;
+      }
+      share->ftkeys= ftkey_nr;
+    }
+    share->data_file_type= share->state.header.data_file_type;
+    share->base_length= (BASE_ROW_HEADER_SIZE +
+                         share->base.is_nulls_extended +
+                         share->base.null_bytes +
+                         share->base.pack_bytes +
+                         test(share->options & HA_OPTION_CHECKSUM));
+    share->keypage_header= ((share->base.born_transactional ?
+                             LSN_STORE_SIZE + TRANSID_SIZE :
+                             0) + KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE +
+                            KEYPAGE_USED_SIZE);
+    share->kfile.file= kfile;
+
+    if (open_flags & HA_OPEN_COPY)
+    {
+      /*
+        this instance will be a temporary one used just to create a data
+        file for REPAIR. Don't do logging. This base information will not go
+        to disk.
+      */
+      share->base.born_transactional= FALSE;
+    }
+    if (share->base.born_transactional)
+    {
+      share->page_type= PAGECACHE_LSN_PAGE;
+      if (share->state.create_rename_lsn == LSN_NEEDS_NEW_STATE_LSNS)
+      {
+        /*
+          Was repaired with maria_chk, maybe later maria_pack-ed. Some sort of
+          import into the server. It starts its existence (from the point of
+          view of the server, including server's recovery) now.
+        */
+        if (((open_flags & HA_OPEN_FROM_SQL_LAYER) &&
+             (share->state.changed & STATE_NOT_MOVABLE)) || maria_in_recovery)
+          _ma_update_state_lsns_sub(share, LSN_IMPOSSIBLE,
+                                    trnman_get_min_safe_trid(), TRUE, TRUE);
+      }
+      else if ((!LSN_VALID(share->state.create_rename_lsn) ||
+                !LSN_VALID(share->state.is_of_horizon) ||
+                (cmp_translog_addr(share->state.create_rename_lsn,
+                                   share->state.is_of_horizon) > 0) ||
+                !LSN_VALID(share->state.skip_redo_lsn) ||
+                (cmp_translog_addr(share->state.create_rename_lsn,
+                                   share->state.skip_redo_lsn) > 0)) &&
+               !(open_flags & HA_OPEN_FOR_REPAIR))
+      {
+        /*
+          If in Recovery, it will not work. If LSN is invalid and not
+          LSN_NEEDS_NEW_STATE_LSNS, header must be corrupted.
+          In both cases, must repair.
+        */
+        my_errno=((share->state.changed & STATE_CRASHED_ON_REPAIR) ?
+                  HA_ERR_CRASHED_ON_REPAIR : HA_ERR_CRASHED_ON_USAGE);
+        goto err;
+      }
+    }
+    else
+      share->page_type= PAGECACHE_PLAIN_PAGE;
+    share->now_transactional= share->base.born_transactional;
+
+    /* Use pack_reclength as we don't want to modify base.pack_reclength */
+    if (share->state.header.org_data_file_type == DYNAMIC_RECORD)
+    {
+      /* add bits used to pack data to pack_reclength for faster allocation */
+      share->base.pack_reclength+= share->base.pack_bytes;
+      share->base.extra_rec_buff_size=
+        (ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER) + MARIA_SPLIT_LENGTH +
+         MARIA_REC_BUFF_OFFSET);
+    }
+    if (share->data_file_type == COMPRESSED_RECORD)
+    {
+      /* Need some extra bytes for decode_bytes */
+      share->base.extra_rec_buff_size+= 7;
+    }
+    share->base.default_rec_buff_size= max(share->base.pack_reclength +
+                                           share->base.extra_rec_buff_size,
+                                           share->base.max_key_length);
+
+    disk_pos_assert(disk_pos + share->base.fields *MARIA_COLUMNDEF_SIZE,
+                    end_pos);
+    /* Read column definitions; 'j' counts the blob columns found so far */
+    for (i= j= 0 ; i < share->base.fields ; i++)
+    {
+      disk_pos=_ma_columndef_read(disk_pos,&share->columndef[i]);
+      share->columndef[i].pack_type=0;
+      share->columndef[i].huff_tree=0;
+      if (share->columndef[i].type == FIELD_BLOB)
+      {
+	share->blobs[j].pack_length=
+	  share->columndef[i].length-portable_sizeof_char_ptr;
+	share->blobs[j].offset= share->columndef[i].offset;
+	j++;
+      }
+    }
+    share->columndef[i].type= FIELD_LAST;	/* End marker */
+    disk_pos= _ma_column_nr_read(disk_pos, share->column_nr,
+                                 share->base.fields);
+
+    if ((share->data_file_type == BLOCK_RECORD ||
+         share->data_file_type == COMPRESSED_RECORD))
+    {
+      if (_ma_open_datafile(&info, share, name, -1))
+        goto err;
+      data_file= info.dfile.file;
+    }
+    errpos= 5;
+
+    if (open_flags & HA_OPEN_DELAY_KEY_WRITE)
+      share->options|= HA_OPTION_DELAY_KEY_WRITE;
+    if (mode == O_RDONLY)
+      share->options|= HA_OPTION_READ_ONLY_DATA;
+    share->is_log_table= FALSE;
+
+    if (open_flags & HA_OPEN_TMP_TABLE)
+    {
+      share->options|= HA_OPTION_TMP_TABLE;
+      share->temporary= share->delay_key_write= 1;
+      share->write_flag=MYF(MY_NABP);
+      share->w_locks++;			/* We don't have to update status */
+      share->tot_locks++;
+    }
+
+    _ma_set_index_pagecache_callbacks(&share->kfile, share);
+    share->this_process=(ulong) getpid();
+#ifdef EXTERNAL_LOCKING
+    share->last_process= share->state.process;
+#endif
+    share->base.key_parts=key_parts;
+    share->base.all_key_parts=key_parts+unique_key_parts;
+    if (!(share->last_version=share->state.version))
+      share->last_version=1;			/* Safety */
+    share->rec_reflength=share->base.rec_reflength; /* May be changed */
+    share->base.margin_key_file_length=(share->base.max_key_file_length -
+					(keys ? MARIA_INDEX_BLOCK_MARGIN *
+					 share->block_size * keys : 0));
+    share->block_size= share->base.block_size;
+    my_free(disk_cache, MYF(0));
+    /*
+      errpos stays at 5 from here on, and the cleanup code at 'err' falls
+      through to the case that frees disk_cache. Clear the pointer so that
+      a later failure doesn't free the same block twice.
+    */
+    disk_cache= 0;
+    _ma_setup_functions(share);
+    if ((*share->once_init)(share, info.dfile.file))
+      goto err;
+    if (share->now_transactional)
+    {
+      /* Setup initial state that is visible for all */
+      MARIA_STATE_HISTORY_CLOSED *history;
+      if ((history= (MARIA_STATE_HISTORY_CLOSED *)
+           hash_search(&maria_stored_state,
+                       (uchar*) &share->state.create_rename_lsn, 0)))
+      {
+        /*
+          Move history from hash to share. This is safe to do as we
+          don't have a lock on share->intern_lock.
+        */
+        share->state_history=
+          _ma_remove_not_visible_states(history->state_history, 0, 0);
+        history->state_history= 0;
+        (void) hash_delete(&maria_stored_state, (uchar*) history);
+      }
+      else
+      {
+        /* Table is not part of any active transaction; Create new history */
+        if (!(share->state_history= (MARIA_STATE_HISTORY *)
+              my_malloc(sizeof(*share->state_history), MYF(MY_WME))))
+          goto err;
+        share->state_history->trid= 0;          /* Visible by all */
+        share->state_history->state= share->state.state;
+        share->state_history->next= 0;
+      }
+    }
+#ifdef THREAD
+    thr_lock_init(&share->lock);
+    pthread_mutex_init(&share->intern_lock, MY_MUTEX_INIT_FAST);
+    pthread_mutex_init(&share->key_del_lock, MY_MUTEX_INIT_FAST);
+    pthread_cond_init(&share->key_del_cond, 0);
+    pthread_mutex_init(&share->close_lock, MY_MUTEX_INIT_FAST);
+    for (i=0; i<keys; i++)
+      VOID(my_rwlock_init(&share->keyinfo[i].root_lock, NULL));
+    VOID(my_rwlock_init(&share->mmap_lock, NULL));
+
+    share->row_is_visible= _ma_row_visible_always;
+    share->lock.get_status= _ma_reset_update_flag;
+    if (!thr_lock_inited)
+    {
+      /* Probably a single threaded program; Don't use concurrent inserts */
+      maria_concurrent_insert=0;
+    }
+    else if (maria_concurrent_insert)
+    {
+      share->non_transactional_concurrent_insert=
+	((share->options & (HA_OPTION_READ_ONLY_DATA | HA_OPTION_TMP_TABLE |
+                            HA_OPTION_COMPRESS_RECORD |
+                            HA_OPTION_TEMP_COMPRESS_RECORD)) ||
+	 (open_flags & HA_OPEN_TMP_TABLE) ||
+         share->data_file_type == BLOCK_RECORD ||
+	 share->have_rtree) ? 0 : 1;
+      if (share->non_transactional_concurrent_insert ||
+          (!share->temporary && share->now_transactional && versioning))
+      {
+        share->lock_key_trees= 1;
+        if (share->data_file_type == BLOCK_RECORD)
+        {
+          DBUG_ASSERT(share->now_transactional);
+          share->have_versioning= 1;
+          share->row_is_visible=     _ma_row_visible_transactional_table;
+          share->lock.get_status=    _ma_block_get_status;
+          share->lock.check_status=  _ma_block_check_status;
+          share->lock.start_trans=   _ma_block_start_trans;
+          /*
+            We can for the moment only allow multiple concurrent inserts
+            only if there is no auto-increment key.  To lift this restriction
+            we have to:
+            - Extend statement base replication to support auto-increment
+            intervals.
+            - Fix that we allocate auto-increment in intervals and that
+              it's properly reset if the interval was not used
+          */
+          share->lock.allow_multiple_concurrent_insert=
+            share->base.auto_key == 0;
+          share->lock_restore_status= 0;
+        }
+        else
+        {
+          share->row_is_visible=      _ma_row_visible_non_transactional_table;
+          share->lock.get_status=     _ma_get_status;
+          share->lock.copy_status=    _ma_copy_status;
+          share->lock.update_status=  _ma_update_status;
+          share->lock.restore_status= _ma_restore_status;
+          share->lock.check_status=   _ma_check_status;
+          share->lock_restore_status= _ma_restore_status;
+        }
+      }
+      else if (share->now_transactional)
+      {
+        DBUG_ASSERT(share->data_file_type == BLOCK_RECORD);
+        share->lock.start_trans=    _ma_block_start_trans_no_versioning;
+      }
+    }
+#endif
+    /*
+      Memory mapping can only be requested after initializing intern_lock.
+    */
+    if (open_flags & HA_OPEN_MMAP)
+    {
+      info.s= share;
+      maria_extra(&info, HA_EXTRA_MMAP, 0);
+    }
+  }
+  else
+  {
+    /* Table is already open; reuse its share (errpos stays 0) */
+    share= old_info->s;
+    if (share->data_file_type == BLOCK_RECORD)
+      data_file= share->bitmap.file.file;       /* Only opened once */
+  }
+
+  if (!(m_info= maria_clone_internal(share, name, mode, data_file)))
+    goto err;
+
+  if (maria_is_crashed(m_info))
+    DBUG_PRINT("warning", ("table is crashed: changed: %u",
+                           share->state.changed));
+
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_RETURN(m_info);
+
+err:
+  DBUG_PRINT("error", ("error: %d  errpos: %d", my_errno, errpos));
+  save_errno=my_errno ? my_errno : HA_ERR_END_OF_FILE;
+  if ((save_errno == HA_ERR_CRASHED) ||
+      (save_errno == HA_ERR_CRASHED_ON_USAGE) ||
+      (save_errno == HA_ERR_CRASHED_ON_REPAIR))
+  {
+    LEX_STRING tmp_name;
+    tmp_name.str= (char*) name;
+    tmp_name.length= strlen(name);
+    _ma_report_error(save_errno, &tmp_name);
+  }
+  if (save_errno == HA_ERR_OLD_FILE) /* uuid is different ? */
+    save_errno= HA_ERR_CRASHED_ON_USAGE; /* the code to trigger auto-repair */
+  /* Release resources in reverse order of acquisition */
+  switch (errpos) {
+  case 5:
+    if (data_file >= 0)
+      VOID(my_close(data_file, MYF(0)));
+    if (old_info)
+      break;					/* Don't remove open table */
+    (*share->once_end)(share);
+    /* fall through */
+  case 4:
+    my_free(share,MYF(0));
+    /* fall through */
+  case 3:
+    /* disk_cache is 0 here if it was already released on the normal path */
+    my_free(disk_cache, MYF(MY_ALLOW_ZERO_PTR));
+    /* fall through */
+  case 1:
+    VOID(my_close(kfile,MYF(0)));
+    /* fall through */
+  case 0:
+  default:
+    break;
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  my_errno= save_errno;
+  DBUG_RETURN (NULL);
+} /* maria_open */
+
+
+/*
+  Make sure a heap buffer is at least 'new_size' bytes long.
+
+  The buffer is only ever enlarged. If *old_size already covers new_size,
+  nothing is touched. When the buffer is enlarged, *old_addr and *old_size
+  are updated to describe the new buffer; if my_realloc() fails they are
+  left unmodified.
+
+  Returns 0 on success and 1 if the reallocation failed.
+*/
+
+my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+                         size_t new_size)
+{
+  uchar *grown;
+
+  if (new_size <= *old_size)
+    return 0;                                   /* Already big enough */
+
+  grown= (uchar*) my_realloc(*old_addr, new_size, MYF(MY_ALLOW_ZERO_PTR));
+  if (!grown)
+    return 1;
+
+  *old_addr= grown;
+  *old_size= new_size;
+  return 0;
+}
+
+
+/*
+  Multiply two ulonglong values without wrapping around.
+
+  Returns a*b when 'a' is non-zero and the product fits in an ulonglong.
+  Returns the largest ulonglong value when 'a' is 0 or when the product
+  would overflow.
+*/
+
+ulonglong _ma_safe_mul(ulonglong a, ulonglong b)
+{
+  const ulonglong limit= ~ (ulonglong) 0;	/* All bits set */
+
+  if (a && b <= limit / a)
+    return a*b;
+  return limit;
+}
+
+/*
+  Set up the function pointers in a share.
+
+  First installs defaults that apply to every row format, then replaces
+  or adds the handlers that belong to share->data_file_type. Finally the
+  checksum functions are cleared for tables that are neither created with
+  HA_OPTION_CHECKSUM nor compressed; calc_check_checksum keeps the
+  function chosen for the row format in either case.
+*/
+
+void _ma_setup_functions(register MARIA_SHARE *share)
+{
+  /* Defaults; some of these are overridden in the switch below */
+  share->once_init= maria_once_init_dummy;
+  share->once_end= maria_once_end_dummy;
+  share->init= maria_scan_init_dummy;
+  share->end= maria_scan_end_dummy;
+  share->scan_init= maria_scan_init_dummy;      /* Compat. dummy function */
+  share->scan_end= maria_scan_end_dummy;        /* Compat. dummy function */
+  share->scan_remember_pos= _ma_def_scan_remember_pos;
+  share->scan_restore_pos= _ma_def_scan_restore_pos;
+  share->write_record_init= _ma_write_init_default;
+  share->write_record_abort= _ma_write_abort_default;
+  share->keypos_to_recpos= _ma_transparent_recpos;
+  share->recpos_to_keypos= _ma_transparent_recpos;
+
+  switch (share->data_file_type) {
+  case BLOCK_RECORD:
+    share->once_init= _ma_once_init_block_record;
+    share->once_end= _ma_once_end_block_record;
+    share->init= _ma_init_block_record;
+    share->end= _ma_end_block_record;
+    share->write_record_init= _ma_write_init_block_record;
+    share->write_record_abort= _ma_write_abort_block_record;
+    share->scan_init= _ma_scan_init_block_record;
+    share->scan_end= _ma_scan_end_block_record;
+    share->scan= _ma_scan_block_record;
+    share->scan_remember_pos= _ma_scan_remember_block_record;
+    share->scan_restore_pos= _ma_scan_restore_block_record;
+    share->read_record= _ma_read_block_record;
+    share->delete_record= _ma_delete_block_record;
+    share->compare_record= _ma_compare_block_record;
+    share->update_record= _ma_update_block_record;
+    share->write_record= _ma_write_block_record;
+    share->compare_unique= _ma_cmp_block_unique;
+    share->calc_checksum= _ma_checksum;
+    share->keypos_to_recpos= _ma_transaction_keypos_to_recpos;
+    share->recpos_to_keypos= _ma_transaction_recpos_to_keypos;
+    /*
+      write_block_record() will calculate the checksum; Tell maria_write()
+      that it doesn't have to do this.
+    */
+    share->calc_write_checksum= 0;
+    break;
+
+  case STATIC_RECORD:
+    share->read_record= _ma_read_static_record;
+    share->scan= _ma_read_rnd_static_record;
+    share->delete_record= _ma_delete_static_record;
+    share->compare_record= _ma_cmp_static_record;
+    share->update_record= _ma_update_static_record;
+    share->write_record= _ma_write_static_record;
+    share->compare_unique= _ma_cmp_static_unique;
+    share->keypos_to_recpos= _ma_static_keypos_to_recpos;
+    share->recpos_to_keypos= _ma_static_recpos_to_keypos;
+    if (share->state.header.org_data_file_type != STATIC_RECORD ||
+        (share->options & HA_OPTION_NULL_FIELDS))
+      share->calc_checksum= _ma_checksum;
+    else
+      share->calc_checksum= _ma_static_checksum;
+    break;
+
+  case DYNAMIC_RECORD:
+    share->read_record= _ma_read_dynamic_record;
+    share->scan= _ma_read_rnd_dynamic_record;
+    share->delete_record= _ma_delete_dynamic_record;
+    share->compare_record= _ma_cmp_dynamic_record;
+    share->compare_unique= _ma_cmp_dynamic_unique;
+    share->calc_write_checksum= _ma_checksum;
+    share->calc_checksum= _ma_checksum;
+    if (!share->base.blobs)
+    {
+      share->write_record= _ma_write_dynamic_record;
+      share->update_record= _ma_update_dynamic_record;
+    }
+    else
+    {
+      share->update_record= _ma_update_blob_record;
+      share->write_record= _ma_write_blob_record;
+    }
+    break;
+
+  case COMPRESSED_RECORD:
+    share->read_record= _ma_read_pack_record;
+    share->scan= _ma_read_rnd_pack_record;
+    share->once_init= _ma_once_init_pack_row;
+    share->once_end= _ma_once_end_pack_row;
+    /*
+      The checksum is calculated from the data of the original, not the
+      compressed, row.
+    */
+    if (share->state.header.org_data_file_type != STATIC_RECORD ||
+        (share->options & HA_OPTION_NULL_FIELDS))
+      share->calc_checksum= _ma_checksum;
+    else
+      share->calc_checksum= _ma_static_checksum;
+    share->calc_write_checksum= share->calc_checksum;
+    break;
+  }
+
+  share->file_read= _ma_nommap_pread;
+  share->file_write= _ma_nommap_pwrite;
+  /* Remember the row format's checksum function before it may be cleared */
+  share->calc_check_checksum= share->calc_checksum;
+
+  if ((share->options & HA_OPTION_CHECKSUM) ||
+      share->data_file_type == COMPRESSED_RECORD)
+    return;
+
+  /* No checksums are maintained for this table */
+  share->calc_write_checksum= 0;
+  share->calc_checksum= 0;
+}
+
+
+/*
+  Set up the function pointers and the write compare flag of one key.
+
+  Chooses, from keyinfo->key_alg, keyinfo->flag and the first key segment:
+  - the insert/delete functions (ck_insert, ck_delete)
+  - the key building function (make_key)
+  - the page search and key (un)packing functions
+    (bin_search, get_key, skip_key, pack_key, store_key)
+  - write_comp_flag, the search flags used when writing a key
+*/
+
+static void setup_key_functions(register MARIA_KEYDEF *keyinfo)
+{
+  const uint key_flag= keyinfo->flag;
+  uint comp_flag;
+
+  if (keyinfo->key_alg != HA_KEY_ALG_RTREE)
+  {
+    keyinfo->ck_insert = _ma_ck_write;
+    keyinfo->ck_delete = _ma_ck_delete;
+  }
+  else
+  {
+#ifdef HAVE_RTREE_KEYS
+    keyinfo->ck_insert = maria_rtree_insert;
+    keyinfo->ck_delete = maria_rtree_delete;
+#else
+    DBUG_ASSERT(0); /* maria_open should check it never happens */
+#endif
+  }
+
+  if (!(key_flag & HA_SPATIAL))
+    keyinfo->make_key= _ma_make_key;
+  else
+    keyinfo->make_key= _ma_sp_make_key;
+
+  if (key_flag & HA_BINARY_PACK_KEY)
+  {
+    /* Simple prefix compression */
+    keyinfo->bin_search= _ma_seq_search;
+    keyinfo->get_key= _ma_get_binary_pack_key;
+    keyinfo->skip_key= _ma_skip_binary_pack_key;
+    keyinfo->pack_key= _ma_calc_bin_pack_key_length;
+    keyinfo->store_key= _ma_store_bin_pack_key;
+  }
+  else if (!(key_flag & HA_VAR_LENGTH_KEY))
+  {
+    /* Fixed length keys */
+    keyinfo->bin_search= _ma_bin_search;
+    keyinfo->get_key= _ma_get_static_key;
+    keyinfo->skip_key= _ma_skip_static_key;
+    keyinfo->pack_key= _ma_calc_static_key_length;
+    keyinfo->store_key= _ma_store_static_key;
+  }
+  else
+  {
+    keyinfo->get_key=  _ma_get_pack_key;
+    keyinfo->skip_key= _ma_skip_pack_key;
+    if (!(keyinfo->seg[0].flag & HA_PACK_KEY))
+    {
+      /* Variable length key without prefix compression */
+      keyinfo->bin_search= _ma_seq_search;
+      keyinfo->pack_key= _ma_calc_var_key_length;
+      keyinfo->store_key= _ma_store_static_key;
+    }
+    else
+    {
+      /* Prefix compression */
+      HA_KEYSEG *first_seg= keyinfo->seg;
+      /*
+        _ma_prefix_search() compares end-space against ASCII blank (' ').
+        It cannot be used for character sets that do not encode the
+        blank character like ASCII does. UCS2 is an example. All
+        character sets with a fixed width > 1 or a minimum width > 1
+        cannot represent blank like ASCII does. In these cases we have
+        to use _ma_seq_search() for the search.
+      */
+      if (first_seg->charset && !use_strnxfrm(first_seg->charset) &&
+          !(first_seg->flag & HA_NULL_PART) &&
+          !(first_seg->charset->mbminlen > 1))
+        keyinfo->bin_search= _ma_prefix_search;
+      else
+        keyinfo->bin_search= _ma_seq_search;
+      keyinfo->pack_key= _ma_calc_var_pack_key_length;
+      keyinfo->store_key= _ma_store_var_pack_key;
+    }
+  }
+
+  /* Work out keyinfo->write_comp_flag */
+  if (key_flag & HA_SORT_ALLOWS_SAME)
+    comp_flag= SEARCH_BIGGER;                   /* Put after same key */
+  else if (!(key_flag & (HA_NOSAME | HA_FULLTEXT)))
+    comp_flag= SEARCH_SAME;                     /* Keys in rec-pos order */
+  else
+  {
+    comp_flag= SEARCH_FIND | SEARCH_UPDATE;     /* No duplicates */
+    if (key_flag & HA_NULL_ARE_EQUAL)
+      comp_flag|= SEARCH_NULL_ARE_EQUAL;
+  }
+  keyinfo->write_comp_flag= comp_flag | SEARCH_INSERT;
+}
+
+
+/**
+   @brief Function to save and store the header in the index file (.MYI)
+
+   Operates under MARIA_SHARE::intern_lock if requested.
+   Sets MARIA_SHARE::MARIA_STATE_INFO::is_of_horizon if transactional table.
+   Then calls _ma_state_info_write_sub().
+
+   @param  share           table
+   @param  pWrite          bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+                           is set my_pwrite() is used otherwise my_write();
+                           if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+                           about keys is written (should only be needed
+                           after ALTER TABLE ENABLE/DISABLE KEYS, and
+                           REPAIR/OPTIMIZE); if 4 (MA_STATE_INFO_WRITE_LOCK)
+                           is set, MARIA_SHARE::intern_lock is taken.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite)
+{
+  uint write_failed;
+  uint lock_requested= pWrite & MA_STATE_INFO_WRITE_LOCK;
+
+  /* Nothing is ever written for read-only tables */
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+    return 0;
+
+  /* Either take intern_lock here, or check that the caller holds it */
+  if (lock_requested)
+    pthread_mutex_lock(&share->intern_lock);
+  else if (maria_multi_threaded)
+  {
+    safe_mutex_assert_owner(&share->intern_lock);
+  }
+
+  /*
+    In a recovery, we want to set is_of_horizon to the LSN of the last
+    record executed by Recovery, not the current EOF of the log (which
+    is too new). Recovery does it by itself, so it is skipped here.
+  */
+  if (share->base.born_transactional && translog_status == TRANSLOG_OK &&
+      !maria_in_recovery)
+  {
+    share->state.is_of_horizon= translog_get_horizon();
+    DBUG_PRINT("info", ("is_of_horizon set to LSN (%lu,0x%lx)",
+                        LSN_IN_PARTS(share->state.is_of_horizon)));
+  }
+
+  write_failed= _ma_state_info_write_sub(share->kfile.file, &share->state,
+                                         pWrite);
+
+  if (lock_requested)
+    pthread_mutex_unlock(&share->intern_lock);
+
+  /* The flag is cleared regardless of the write result */
+  share->changed= 0;
+  return write_failed;
+}
+
+
+/**
+   @brief Function to save and store the header in the index file (.MYI).
+
+   Shortcut to use instead of _ma_state_info_write() when appropriate.
+
+   @param  file            descriptor of the index file to write
+   @param  state           state information to write to the file
+   @param  pWrite          bitmap: if 1 (MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET)
+                           is set my_pwrite() is used otherwise my_write();
+                           if 2 (MA_STATE_INFO_WRITE_FULL_INFO) is set, info
+                           about keys is written (should only be needed
+                           after ALTER TABLE ENABLE/DISABLE KEYS, and
+                           REPAIR/OPTIMIZE).
+
+   @notes
+     For transactional multiuser tables, this function is called
+     with intern_lock & translog_lock or when the last thread who
+     is using the table is closing it.
+     Because of the translog_lock we don't need to have a lock on
+     key_del_lock.
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+
+uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite)
+{
+  /*
+    Serialises *state field by field into the stack buffer 'buff' and then
+    writes the used part of the buffer to 'file'. The order of the stores
+    below defines the layout; _ma_state_info_read() reads the fields back in
+    the same order, so the two functions must be changed together.
+  */
+  uchar  buff[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
+  uchar *ptr=buff;
+  uint	i, keys= (uint) state->header.keys;
+  size_t res;
+  DBUG_ENTER("_ma_state_info_write_sub");
+
+  /* The header is copied as-is */
+  memcpy_fixed(ptr,&state->header,sizeof(state->header));
+  ptr+=sizeof(state->header);
+
+  /* open_count must be first because of _ma_mark_file_changed ! */
+  mi_int2store(ptr,state->open_count);			ptr+= 2;
+  /* changed must be second, because of _ma_mark_file_crashed */
+  mi_int2store(ptr,state->changed);			ptr+= 2;
+
+  /*
+    If you change the offset of these LSNs, note that some functions do a
+    direct write of them without going through this function.
+  */
+  lsn_store(ptr, state->create_rename_lsn);		ptr+= LSN_STORE_SIZE;
+  lsn_store(ptr, state->is_of_horizon);			ptr+= LSN_STORE_SIZE;
+  lsn_store(ptr, state->skip_redo_lsn);			ptr+= LSN_STORE_SIZE;
+  mi_rowstore(ptr,state->state.records);		ptr+= 8;
+  mi_rowstore(ptr,state->state.del);			ptr+= 8;
+  mi_rowstore(ptr,state->split);			ptr+= 8;
+  mi_sizestore(ptr,state->dellink);			ptr+= 8;
+  mi_sizestore(ptr,state->first_bitmap_with_space);	ptr+= 8;
+  mi_sizestore(ptr,state->state.key_file_length);	ptr+= 8;
+  mi_sizestore(ptr,state->state.data_file_length);	ptr+= 8;
+  mi_sizestore(ptr,state->state.empty);			ptr+= 8;
+  mi_sizestore(ptr,state->state.key_empty);		ptr+= 8;
+  mi_int8store(ptr,state->auto_increment);		ptr+= 8;
+  mi_int8store(ptr,(ulonglong) state->state.checksum);	ptr+= 8;
+  mi_int8store(ptr,state->create_trid);			ptr+= 8;
+  mi_int4store(ptr,state->status);			ptr+= 4;
+  mi_int4store(ptr,state->update_count);		ptr+= 4;
+  *ptr++= state->sortkey;
+  *ptr++= 0;                                    /* Reserved */
+  /*
+    Skip state_diff_length bytes without storing anything.
+    NOTE(review): buff has no initialiser, so the skipped bytes go to the
+    file with whatever the stack held -- confirm this is intended.
+  */
+  ptr+=	state->state_diff_length;
+
+  /* One root position per key, followed by key_del */
+  for (i=0; i < keys; i++)
+  {
+    mi_sizestore(ptr,state->key_root[i]);		ptr+= 8;
+  }
+  mi_sizestore(ptr,state->key_del);	        	ptr+= 8;
+  /* The remaining fields are only stored when full info is requested */
+  if (pWrite & MA_STATE_INFO_WRITE_FULL_INFO)	/* From maria_chk */
+  {
+    uint key_parts= mi_uint2korr(state->header.key_parts);
+    mi_int4store(ptr,state->sec_index_changed); 	ptr+= 4;
+    mi_int4store(ptr,state->sec_index_used);		ptr+= 4;
+    mi_int4store(ptr,state->version);			ptr+= 4;
+    mi_int8store(ptr,state->key_map);			ptr+= 8;
+    mi_int8store(ptr,(ulonglong) state->create_time);	ptr+= 8;
+    mi_int8store(ptr,(ulonglong) state->recover_time);	ptr+= 8;
+    mi_int8store(ptr,(ulonglong) state->check_time);	ptr+= 8;
+    mi_sizestore(ptr, state->records_at_analyze);	ptr+= 8;
+    /* reserve place for some information per key */
+    bzero(ptr, keys*4); 				ptr+= keys*4;
+    /* Per key part statistics: 8 + 4 bytes for each key part */
+    for (i=0 ; i < key_parts ; i++)
+    {
+      float8store(ptr, state->rec_per_key_part[i]);  	ptr+= 8;
+      mi_int4store(ptr, state->nulls_per_key_part[i]);  ptr+= 4;
+    }
+  }
+
+  /*
+    With MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET the buffer is written with
+    my_pwrite() at offset 0, otherwise with my_write(). Only the bytes
+    between buff and ptr are written.
+  */
+  res= (pWrite & MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET) ?
+    my_pwrite(file, buff, (size_t) (ptr-buff), 0L,
+              MYF(MY_NABP | MY_THREADSAFE)) :
+    my_write(file,  buff, (size_t) (ptr-buff),
+             MYF(MY_NABP));
+  /* Any non-zero result from the write call is reported as 1 */
+  DBUG_RETURN(res != 0);
+}
+
+
+/*
+  Fill *state from a serialised state image.
+
+  Reads the fields in the order in which _ma_state_info_write_sub() stores
+  them. Unlike the writer, the fields after key_del are always read here,
+  independent of any flag.
+
+  @param  ptr    start of the serialised state
+  @param  state  state to fill in; key_root, rec_per_key_part and
+                 nulls_per_key_part are written through, so they must
+                 already point to arrays that are large enough
+
+  @return pointer to the first byte after the data that was read
+*/
+static uchar *_ma_state_info_read(uchar *ptr, MARIA_STATE_INFO *state)
+{
+  uint i,keys,key_parts;
+  /* The header is copied first; it provides the key and key part counts */
+  memcpy_fixed(&state->header,ptr, sizeof(state->header));
+  ptr+= sizeof(state->header);
+  keys= (uint) state->header.keys;
+  key_parts= mi_uint2korr(state->header.key_parts);
+
+  state->open_count = mi_uint2korr(ptr);		ptr+= 2;
+  state->changed= mi_uint2korr(ptr);			ptr+= 2;
+  state->create_rename_lsn= lsn_korr(ptr);		ptr+= LSN_STORE_SIZE;
+  state->is_of_horizon= lsn_korr(ptr);			ptr+= LSN_STORE_SIZE;
+  state->skip_redo_lsn= lsn_korr(ptr);			ptr+= LSN_STORE_SIZE;
+  state->state.records= mi_rowkorr(ptr);		ptr+= 8;
+  state->state.del = mi_rowkorr(ptr);			ptr+= 8;
+  state->split	= mi_rowkorr(ptr);			ptr+= 8;
+  state->dellink= mi_sizekorr(ptr);			ptr+= 8;
+  state->first_bitmap_with_space= mi_sizekorr(ptr);	ptr+= 8;
+  state->state.key_file_length = mi_sizekorr(ptr);	ptr+= 8;
+  state->state.data_file_length= mi_sizekorr(ptr);	ptr+= 8;
+  state->state.empty	= mi_sizekorr(ptr);		ptr+= 8;
+  state->state.key_empty= mi_sizekorr(ptr);		ptr+= 8;
+  state->auto_increment=mi_uint8korr(ptr);		ptr+= 8;
+  state->state.checksum=(ha_checksum) mi_uint8korr(ptr);ptr+= 8;
+  state->create_trid= mi_uint8korr(ptr);		ptr+= 8;
+  state->status = mi_uint4korr(ptr);			ptr+= 4;
+  state->update_count=mi_uint4korr(ptr);		ptr+= 4;
+  state->sortkey= 					(uint) *ptr++;
+  ptr++;                                                /* reserved */
+
+  /*
+    Skip state_diff_length bytes. The value used is the one already in
+    *state; it is not read from the image by this function.
+  */
+  ptr+= state->state_diff_length;
+
+  /* One root position per key, followed by key_del */
+  for (i=0; i < keys; i++)
+  {
+    state->key_root[i]= mi_sizekorr(ptr);		ptr+= 8;
+  }
+  state->key_del= mi_sizekorr(ptr);			ptr+= 8;
+  state->sec_index_changed = mi_uint4korr(ptr); 	ptr+= 4;
+  state->sec_index_used =    mi_uint4korr(ptr); 	ptr+= 4;
+  state->version     = mi_uint4korr(ptr);		ptr+= 4;
+  state->key_map     = mi_uint8korr(ptr);		ptr+= 8;
+  state->create_time = (time_t) mi_sizekorr(ptr);	ptr+= 8;
+  state->recover_time =(time_t) mi_sizekorr(ptr);	ptr+= 8;
+  state->check_time =  (time_t) mi_sizekorr(ptr);	ptr+= 8;
+  state->records_at_analyze=    mi_sizekorr(ptr);	ptr+= 8;
+  ptr+= keys * 4;                               /* Skip reserved bytes */
+  /* Per key part statistics: 8 + 4 bytes for each key part */
+  for (i=0 ; i < key_parts ; i++)
+  {
+    float8get(state->rec_per_key_part[i], ptr);		ptr+= 8;
+    state->nulls_per_key_part[i]= mi_uint4korr(ptr);	ptr+= 4;
+  }
+  return ptr;
+}
+
+
+/**
+   @brief Fills the state by reading its copy on disk.
+
+   Should not be called for transactional tables, as their state on disk is
+   rarely current and so is often misleading for a reader.
+   Does nothing in single user mode.
+
+   @param  file            file to read from
+   @param  state           state which will be filled
+*/
+
+uint _ma_state_info_read_dsk(File file __attribute__((unused)),
+                             MARIA_STATE_INFO *state __attribute__((unused)))
+{
+  /*
+    Without EXTERNAL_LOCKING this function does nothing and returns 0.
+    With it, returns 1 if reading the state from the file failed.
+  */
+#ifdef EXTERNAL_LOCKING
+  uchar	disk_image[MARIA_STATE_INFO_SIZE + MARIA_STATE_EXTRA_SIZE];
+
+  /* trick to detect transactional tables */
+  DBUG_ASSERT(state->create_rename_lsn == LSN_IMPOSSIBLE);
+
+  /* Nothing to refresh in single user mode */
+  if (maria_single_user)
+    return 0;
+
+  /* Read state_length bytes from the start of the file and decode them */
+  if (my_pread(file, disk_image, state->state_length, 0L, MYF(MY_NABP)))
+    return 1;
+  _ma_state_info_read(disk_image, state);
+#endif
+  return 0;
+}
+
+
+/****************************************************************************
+**  store and read of MARIA_BASE_INFO
+****************************************************************************/
+
+/*
+  Serialise a MARIA_BASE_INFO and write it to 'file' with my_write().
+
+  The image starts with maria_uuid and is exactly MARIA_BASE_INFO_SIZE bytes
+  long (asserted below). _ma_base_info_read() reads the fields back in the
+  same order, so the two functions must be changed together.
+
+  @param  file  descriptor to write to
+  @param  base  base info to store
+
+  @return 0 on success, 1 if my_write() reported an error
+*/
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
+{
+  uchar buff[MARIA_BASE_INFO_SIZE], *ptr=buff;
+
+  bmove(ptr, maria_uuid, MY_UUID_SIZE);
+  ptr+= MY_UUID_SIZE;
+  mi_sizestore(ptr,base->keystart);			ptr+= 8;
+  mi_sizestore(ptr,base->max_data_file_length);		ptr+= 8;
+  mi_sizestore(ptr,base->max_key_file_length);		ptr+= 8;
+  mi_rowstore(ptr,base->records);			ptr+= 8;
+  mi_rowstore(ptr,base->reloc);				ptr+= 8;
+  mi_int4store(ptr,base->mean_row_length);		ptr+= 4;
+  mi_int4store(ptr,base->reclength);			ptr+= 4;
+  mi_int4store(ptr,base->pack_reclength);		ptr+= 4;
+  mi_int4store(ptr,base->min_pack_length);		ptr+= 4;
+  mi_int4store(ptr,base->max_pack_length);		ptr+= 4;
+  mi_int4store(ptr,base->min_block_length);		ptr+= 4;
+  mi_int2store(ptr,base->fields);			ptr+= 2;
+  mi_int2store(ptr,base->fixed_not_null_fields);	ptr+= 2;
+  mi_int2store(ptr,base->fixed_not_null_fields_length);	ptr+= 2;
+  mi_int2store(ptr,base->max_field_lengths);		ptr+= 2;
+  mi_int2store(ptr,base->pack_fields);			ptr+= 2;
+  /*
+    The statement terminator was missing here; the line only parsed as long
+    as mi_int2store() expanded to a brace-enclosed block.
+  */
+  mi_int2store(ptr,base->extra_options);		ptr+= 2;
+  mi_int2store(ptr,base->null_bytes);                   ptr+= 2;
+  mi_int2store(ptr,base->original_null_bytes);	        ptr+= 2;
+  mi_int2store(ptr,base->field_offsets);	        ptr+= 2;
+  mi_int2store(ptr,0);				        ptr+= 2; /* reserved */
+  mi_int2store(ptr,base->block_size);	        	ptr+= 2;
+  *ptr++= base->rec_reflength;
+  *ptr++= base->key_reflength;
+  *ptr++= base->keys;
+  *ptr++= base->auto_key;
+  *ptr++= base->born_transactional;
+  *ptr++= 0;                                    /* Reserved */
+  mi_int2store(ptr,base->pack_bytes);			ptr+= 2;
+  mi_int2store(ptr,base->blobs);			ptr+= 2;
+  mi_int2store(ptr,base->max_key_block_length);		ptr+= 2;
+  mi_int2store(ptr,base->max_key_length);		ptr+= 2;
+  mi_int2store(ptr,base->extra_alloc_bytes);		ptr+= 2;
+  *ptr++= base->extra_alloc_procent;
+  bzero(ptr,16);					ptr+= 16; /* extra */
+  /* Guards against the layout and MARIA_BASE_INFO_SIZE drifting apart */
+  DBUG_ASSERT((ptr - buff) == MARIA_BASE_INFO_SIZE);
+  return my_write(file, buff, (size_t) (ptr-buff), MYF(MY_NABP)) != 0;
+}
+
+
+/*
+  Fill *base from a serialised base info image.
+
+  Reads the fields in the order in which _ma_base_info_write() stores them.
+  The leading MY_UUID_SIZE bytes are copied into base->uuid.
+
+  @param  ptr   start of the serialised base info
+  @param  base  structure to fill in
+
+  @return pointer to the first byte after the base info
+*/
+static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
+{
+  bmove(base->uuid, ptr, MY_UUID_SIZE);                 ptr+= MY_UUID_SIZE;
+  base->keystart= mi_sizekorr(ptr);			ptr+= 8;
+  base->max_data_file_length= mi_sizekorr(ptr); 	ptr+= 8;
+  base->max_key_file_length= mi_sizekorr(ptr);		ptr+= 8;
+  base->records=  (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
+  base->reloc= (ha_rows) mi_sizekorr(ptr);		ptr+= 8;
+  base->mean_row_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->reclength= mi_uint4korr(ptr);			ptr+= 4;
+  base->pack_reclength= mi_uint4korr(ptr);		ptr+= 4;
+  base->min_pack_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->max_pack_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->min_block_length= mi_uint4korr(ptr);		ptr+= 4;
+  base->fields= mi_uint2korr(ptr);			ptr+= 2;
+  base->fixed_not_null_fields= mi_uint2korr(ptr);       ptr+= 2;
+  base->fixed_not_null_fields_length= mi_uint2korr(ptr);ptr+= 2;
+  base->max_field_lengths= mi_uint2korr(ptr);	        ptr+= 2;
+  base->pack_fields= mi_uint2korr(ptr);			ptr+= 2;
+  base->extra_options= mi_uint2korr(ptr);		ptr+= 2;
+  base->null_bytes= mi_uint2korr(ptr);			ptr+= 2;
+  base->original_null_bytes= mi_uint2korr(ptr);		ptr+= 2;
+  base->field_offsets= mi_uint2korr(ptr);		ptr+= 2;
+  /* Skip the 2 reserved bytes that the writer stores as 0 */
+                                                        ptr+= 2;
+  base->block_size= mi_uint2korr(ptr);			ptr+= 2;
+
+  base->rec_reflength= *ptr++;
+  base->key_reflength= *ptr++;
+  base->keys=	       *ptr++;
+  base->auto_key=      *ptr++;
+  base->born_transactional= *ptr++;
+  /* Skip the reserved byte */
+  ptr++;
+  base->pack_bytes= mi_uint2korr(ptr);			ptr+= 2;
+  base->blobs= mi_uint2korr(ptr);			ptr+= 2;
+  base->max_key_block_length= mi_uint2korr(ptr);	ptr+= 2;
+  base->max_key_length= mi_uint2korr(ptr);		ptr+= 2;
+  base->extra_alloc_bytes= mi_uint2korr(ptr);		ptr+= 2;
+  base->extra_alloc_procent= *ptr++;
+  /* Skip the 16 "extra" bytes that the writer zero-fills */
+  ptr+= 16;
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------
+  maria_keydef
+---------------------------------------------------------------------------*/
+
+/*
+  Serialise one key definition and write it to 'file'.
+
+  Layout: keysegs (1 byte), key_alg (1 byte), then flag, block_length,
+  keylength, minlength and maxlength as 2 bytes each.
+
+  Returns 0 on success, 1 if my_write() reported an error.
+*/
+my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef)
+{
+  uchar image[MARIA_KEYDEF_SIZE];
+  uchar *out;
+
+  image[0]= (uchar) keydef->keysegs;
+  image[1]= keydef->key_alg;			/* Rtree or Btree */
+  out= image + 2;
+
+  mi_int2store(out, keydef->flag);
+  out+= 2;
+  mi_int2store(out, keydef->block_length);
+  out+= 2;
+  mi_int2store(out, keydef->keylength);
+  out+= 2;
+  mi_int2store(out, keydef->minlength);
+  out+= 2;
+  mi_int2store(out, keydef->maxlength);
+  out+= 2;
+
+  return my_write(file, image, (size_t) (out - image), MYF(MY_NABP)) != 0;
+}
+
+/*
+  Fill *keydef from a serialised key definition (see _ma_keydef_write())
+  and initialise the members that are not stored.
+
+  Returns a pointer to the first byte after the key definition.
+*/
+uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef)
+{
+  /* Members that are not part of the stored image */
+  keydef->version=  0;				/* Not saved */
+  keydef->parser=   &ft_default_parser;
+  keydef->ftkey_nr= 0;
+
+  /* Two single byte members */
+  keydef->keysegs= (uint) ptr[0];
+  keydef->key_alg= ptr[1];			/* Rtree or Btree */
+  ptr+= 2;
+
+  /* Five 2 byte members */
+  keydef->flag= mi_uint2korr(ptr);
+  ptr+= 2;
+  keydef->block_length= mi_uint2korr(ptr);
+  ptr+= 2;
+  keydef->keylength= mi_uint2korr(ptr);
+  ptr+= 2;
+  keydef->minlength= mi_uint2korr(ptr);
+  ptr+= 2;
+  keydef->maxlength= mi_uint2korr(ptr);
+  ptr+= 2;
+
+  /* Derived from the block length that was just read */
+  keydef->underflow_block_length= keydef->block_length/3;
+  return ptr;
+}
+
+/***************************************************************************
+**  maria_keyseg
+***************************************************************************/
+
+/*
+  Serialise one key segment and write it to 'file'.
+
+  The last 4 bytes hold null_pos when the segment has a null bit and
+  bit_pos otherwise; _ma_keyseg_read() undoes this.
+
+  Returns 0 on success, 1 if my_write() reported an error.
+*/
+my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg)
+{
+  uchar image[HA_KEYSEG_SIZE];
+  uchar *out;
+  ulong position;
+
+  /* null_pos and bit_pos share one slot in the stored image */
+  if (keyseg->null_bit)
+    position= keyseg->null_pos;
+  else
+    position= keyseg->bit_pos;
+
+  /* Six single byte members */
+  image[0]= keyseg->type;
+  image[1]= keyseg->language;
+  image[2]= keyseg->null_bit;
+  image[3]= keyseg->bit_start;
+  image[4]= keyseg->bit_end;
+  image[5]= keyseg->bit_length;
+  out= image + 6;
+
+  mi_int2store(out, keyseg->flag);
+  out+= 2;
+  mi_int2store(out, keyseg->length);
+  out+= 2;
+  mi_int4store(out, keyseg->start);
+  out+= 4;
+  mi_int4store(out, position);
+  out+= 4;
+
+  return my_write(file, image, (size_t) (out - image), MYF(MY_NABP)) != 0;
+}
+
+
+/*
+  Fill *keyseg from a serialised key segment (see _ma_keyseg_write()).
+
+  charset is set to 0 and has to be filled in by the caller later.
+
+  Returns a pointer to the first byte after the key segment.
+*/
+uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg)
+{
+  /* Six single byte members */
+  keyseg->type=       ptr[0];
+  keyseg->language=   ptr[1];
+  keyseg->null_bit=   ptr[2];
+  keyseg->bit_start=  ptr[3];
+  keyseg->bit_end=    ptr[4];
+  keyseg->bit_length= ptr[5];
+  ptr+= 6;
+
+  keyseg->flag= mi_uint2korr(ptr);
+  ptr+= 2;
+  keyseg->length= mi_uint2korr(ptr);
+  ptr+= 2;
+  keyseg->start= mi_uint4korr(ptr);
+  ptr+= 4;
+  keyseg->null_pos= mi_uint4korr(ptr);
+  ptr+= 4;
+
+  keyseg->charset= 0;				/* Will be filled in later */
+
+  if (!keyseg->null_bit)
+  {
+    /* No null bit: the stored position was really bit_pos */
+    keyseg->bit_pos= (uint16) keyseg->null_pos;
+    keyseg->null_pos= 0;
+  }
+  else
+  {
+    /* bit_pos is null_pos, plus one when null_bit equals 7 */
+    keyseg->bit_pos= (uint16) (keyseg->null_pos + (keyseg->null_bit == 7));
+  }
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------
+  maria_uniquedef
+---------------------------------------------------------------------------*/
+
+/*
+  Serialise one unique definition and write it to 'file'.
+
+  Layout: keysegs (2 bytes), key (1 byte), null_are_equal (1 byte).
+
+  Returns 0 on success, 1 if my_write() reported an error.
+*/
+my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *def)
+{
+  uchar image[MARIA_UNIQUEDEF_SIZE];
+  uchar *out= image;
+
+  mi_int2store(out, def->keysegs);
+  out+= 2;
+  out[0]= (uchar) def->key;
+  out[1]= (uchar) def->null_are_equal;
+  out+= 2;
+
+  return my_write(file, image, (size_t) (out - image), MYF(MY_NABP)) != 0;
+}
+
+/*
+  Fill *def from a serialised unique definition (see _ma_uniquedef_write()).
+
+  Returns a pointer to the first byte after the 4 bytes that were read.
+*/
+uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *def)
+{
+  def->keysegs= mi_uint2korr(ptr);
+  ptr+= 2;
+  def->key= *ptr++;
+  def->null_are_equal= *ptr++;
+  return ptr;
+}
+
+/***************************************************************************
+**  MARIA_COLUMNDEF
+***************************************************************************/
+
+/*
+  Serialise one column definition and write it to 'file'.
+
+  Layout: seven 2 byte members, null_bit and empty_bit as one byte each,
+  then 4 zero bytes kept for future use.
+
+  Returns 0 on success, 1 if my_write() reported an error.
+*/
+my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef)
+{
+  uchar image[MARIA_COLUMNDEF_SIZE];
+  uchar *out= image;
+
+  mi_int2store(out, (ulong) columndef->column_nr);
+  out+= 2;
+  mi_int2store(out, (ulong) columndef->offset);
+  out+= 2;
+  mi_int2store(out, columndef->type);
+  out+= 2;
+  mi_int2store(out, columndef->length);
+  out+= 2;
+  mi_int2store(out, columndef->fill_length);
+  out+= 2;
+  mi_int2store(out, columndef->null_pos);
+  out+= 2;
+  mi_int2store(out, columndef->empty_pos);
+  out+= 2;
+
+  out[0]= columndef->null_bit;
+  out[1]= columndef->empty_bit;
+  out+= 2;
+
+  /* For future */
+  bzero(out, 4);
+  out+= 4;
+
+  return my_write(file, image, (size_t) (out - image), MYF(MY_NABP)) != 0;
+}
+
+/*
+  Fill *columndef from a serialised column definition
+  (see _ma_columndef_write()).
+
+  Returns a pointer to the first byte after the 20 bytes of the definition;
+  the last 4 of them are skipped without being looked at.
+*/
+uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef)
+{
+  /* Seven 2 byte members at offsets 0..12; type is read as signed */
+  columndef->column_nr=   mi_uint2korr(ptr);
+  columndef->offset=      mi_uint2korr(ptr + 2);
+  columndef->type=        mi_sint2korr(ptr + 4);
+  columndef->length=      mi_uint2korr(ptr + 6);
+  columndef->fill_length= mi_uint2korr(ptr + 8);
+  columndef->null_pos=    mi_uint2korr(ptr + 10);
+  columndef->empty_pos=   mi_uint2korr(ptr + 12);
+
+  /* Two single byte members */
+  columndef->null_bit=  (uint8) ptr[14];
+  columndef->empty_bit= (uint8) ptr[15];
+
+  /* 16 bytes of data plus 4 unused bytes */
+  return ptr + 20;
+}
+
+/*
+  Write 'columns' uint16 values from 'offsets' to 'file', 2 bytes each,
+  stored with int2store().
+
+  Returns 0 on success, 1 if the temporary buffer could not be allocated
+  or my_write() reported an error.
+*/
+my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns)
+{
+  size_t size= columns*2;
+  uchar *image, *out, *image_end;
+  my_bool failed;
+
+  image= (uchar*) my_alloca(size);
+  if (!image)
+    return 1;
+
+  out= image;
+  image_end= image + size;
+  while (out < image_end)
+  {
+    int2store(out, *offsets);
+    out+= 2;
+    offsets++;
+  }
+
+  failed= my_write(file, image, size, MYF(MY_NABP)) != 0;
+  my_afree(image);
+  return failed;
+}
+
+
+/*
+  Read 'columns' 2 byte values from 'ptr' into 'offsets' with uint2korr()
+  (the counterpart of _ma_column_nr_write()).
+
+  Returns a pointer to the first byte after the values that were read.
+*/
+uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns)
+{
+  size_t size= columns*2;
+  uchar *stop= ptr + size;
+
+  while (ptr < stop)
+  {
+    *offsets= uint2korr(ptr);
+    offsets++;
+    ptr+= 2;
+  }
+  return ptr;
+}
+
+/**
+   @brief Set callbacks for data pages
+
+   @note
+   We don't use pagecache_file_init here, as we want to keep the
+   code readable
+*/
+
+void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file,
+                                      MARIA_SHARE *share)
+{
+  /* The share is handed to the callbacks as their data */
+  file->callback_data= (uchar*) share;
+  /* Default: do nothing; replaced below for transactional tables */
+  file->flush_log_callback= &maria_flush_log_for_page_none;
+
+  if (share->temporary)
+  {
+    /* Temporary tables use the "none" read and write callbacks */
+    file->read_callback=  &maria_page_crc_check_none;
+    file->write_callback= &maria_page_filler_set_none;
+    return;
+  }
+
+  file->read_callback= &maria_page_crc_check_data;
+
+  /* The write callback depends on whether page checksums are enabled */
+  if (!(share->options & HA_OPTION_PAGE_CHECKSUM))
+    file->write_callback= &maria_page_filler_set_normal;
+  else
+    file->write_callback= &maria_page_crc_set_normal;
+
+  if (share->now_transactional)
+    file->flush_log_callback= maria_flush_log_for_page;
+}
+
+
+/**
+   @brief Set callbacks for index pages
+
+   @note
+   We don't use pagecache_file_init here, as we want to keep the
+   code readable
+*/
+
+void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file,
+                                       MARIA_SHARE *share)
+{
+  /* The share is handed to the callbacks as their data */
+  file->callback_data= (uchar*) share;
+  /* Default: do nothing; replaced below for transactional tables */
+  file->flush_log_callback= &maria_flush_log_for_page_none;
+  file->write_fail= maria_page_write_failure;
+
+  if (share->temporary)
+  {
+    /* Temporary tables use the "none" read and write callbacks */
+    file->read_callback=  &maria_page_crc_check_none;
+    file->write_callback= &maria_page_filler_set_none;
+    return;
+  }
+
+  file->read_callback= &maria_page_crc_check_index;
+
+  /* The write callback depends on whether page checksums are enabled */
+  if (!(share->options & HA_OPTION_PAGE_CHECKSUM))
+    file->write_callback= &maria_page_filler_set_normal;
+  else
+    file->write_callback= &maria_page_crc_set_index;
+
+  if (share->now_transactional)
+    file->flush_log_callback= maria_flush_log_for_page;
+}
+
+
+/**************************************************************************
+ Open data file
+  We can't use dup() here as the data file descriptors need to have different
+  active seek-positions.
+
+  The argument file_to_dup is here for the future if there would on some OS
+  exist a dup()-like call that would give us two different file descriptors.
+*************************************************************************/
+
+/*
+  Open the data file of a table and store the descriptor in both
+  info->dfile.file and share->bitmap.file.file.
+
+  @param  info         handler that gets the new descriptor
+  @param  share        share of the table; provides the default data file
+                       name and the open mode
+  @param  org_name     if not NULL, name from which the data file name is
+                       built with fn_format() and MARIA_NAME_DEXT. When that
+                       file is a symlink, the link is resolved, checked with
+                       maria_test_invalid_symlink and the resolved name is
+                       the one that is opened
+  @param  file_to_dup  unused
+
+  @return 0 on success, 1 on error. When the symlink cannot be resolved or
+          is rejected, my_errno is set to HA_WRONG_CREATE_OPTION.
+*/
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, const char *org_name,
+                      File file_to_dup __attribute__((unused)))
+{
+  char *data_name= share->data_file_name.str;
+  char real_data_name[FN_REFLEN];
+
+  if (org_name)
+  {
+    fn_format(real_data_name, org_name, "", MARIA_NAME_DEXT, 4);
+    if (my_is_symlink(real_data_name))
+    {
+      if (my_realpath(real_data_name, real_data_name, MYF(0)) ||
+          (*maria_test_invalid_symlink)(real_data_name))
+      {
+        my_errno= HA_WRONG_CREATE_OPTION;
+        return 1;
+      }
+      data_name= real_data_name;
+    }
+  }
+
+  /*
+    Open the name selected above. Opening share->data_file_name.str here
+    would throw away the resolved and validated symlink target and leave
+    data_name as a dead variable.
+  */
+  info->dfile.file= share->bitmap.file.file=
+    my_open(data_name, share->mode | O_SHARE,
+            MYF(MY_WME));
+  return info->dfile.file >= 0 ? 0 : 1;
+}
+
+
+/*
+  Open the index file named by share->unique_file_name and store the
+  descriptor in share->kfile.file.
+
+  Returns 0 on success, 1 if the file could not be opened.
+*/
+int _ma_open_keyfile(MARIA_SHARE *share)
+{
+  /*
+    share->kfile is only changed while holding intern_lock, to protect
+    against a concurrent checkpoint.
+  */
+  pthread_mutex_lock(&share->intern_lock);
+  share->kfile.file= my_open(share->unique_file_name.str,
+                             share->mode | O_SHARE, MYF(MY_WME));
+  pthread_mutex_unlock(&share->intern_lock);
+
+  if (share->kfile.file < 0)
+    return 1;
+  return 0;
+}
+
+
+/*
+  Disable all indexes.
+
+  SYNOPSIS
+    maria_disable_indexes()
+    info        A pointer to the MARIA storage engine MARIA_HA struct.
+
+  DESCRIPTION
+    Marks every key of the table as not active in the key map of the
+    share's state.
+
+  RETURN
+    0  ok (always)
+*/
+
+int maria_disable_indexes(MARIA_HA *info)
+{
+  maria_clear_all_keys_active(info->s->state.key_map);
+  return 0;
+}
+
+
+/*
+  Enable all indexes
+
+  SYNOPSIS
+    maria_enable_indexes()
+    info        A pointer to the MARIA storage engine MARIA_HA struct.
+
+  DESCRIPTION
+    Enable all indexes. The indexes might have been disabled
+    by maria_disable_indexes() before.
+    The function works only if both data and indexes are empty,
+    otherwise a repair is required.
+    To be sure, call handler::delete_all_rows() before.
+
+    "Empty" means that the data file length is 0 (one block for
+    BLOCK_RECORD tables) and the key file length equals base.keystart.
+
+  RETURN
+    0  ok
+    HA_ERR_CRASHED data or index is non-empty.
+*/
+
+int maria_enable_indexes(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_enable_indexes");
+
+  /* Both files still have their initial length: just flip the key map */
+  if ((share->state.state.data_file_length ==
+       (share->data_file_type == BLOCK_RECORD ? share->block_size : 0)) &&
+      (share->state.state.key_file_length == share->base.keystart))
+  {
+    maria_set_all_keys_active(share->state.key_map, share->base.keys);
+    DBUG_RETURN(0);
+  }
+
+  DBUG_PRINT("error", ("data_file_length: %lu  key_file_length: %lu",
+                       (ulong) share->state.state.data_file_length,
+                       (ulong) share->state.state.key_file_length));
+  maria_print_error(info->s, HA_ERR_CRASHED);
+  DBUG_RETURN(HA_ERR_CRASHED);
+}
+
+
+/*
+  Test if indexes are disabled.
+
+  SYNOPSIS
+    maria_indexes_are_disabled()
+    info        A pointer to the MARIA storage engine MARIA_HA struct.
+
+  DESCRIPTION
+    Test if indexes are disabled, based on the key map of the share's state
+    and the number of keys of the table.
+
+  RETURN
+    0  indexes are not disabled
+    1  all indexes are disabled
+    2  non-unique indexes are disabled
+*/
+
+int maria_indexes_are_disabled(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+
+  /* The table has no keys, or every key is active: nothing is disabled */
+  if (!share->base.keys ||
+      (maria_is_all_keys_active(share->state.key_map, share->base.keys)))
+    return 0;
+
+  /*
+    All are disabled: not a single key is active.
+    The test must be negated; without the '!' a table with some active keys
+    was reported as fully disabled and a fully disabled one as partly
+    disabled.
+  */
+  if (!maria_is_any_key_active(share->state.key_map))
+    return 1;
+
+  /*
+    We have keys. Some enabled, some disabled.
+    Don't check for any non-unique disabled but return directly 2
+  */
+  return 2;
+}
+
+
+/* Scan init callback that does nothing; always returns 0 */
+static my_bool
+maria_scan_init_dummy(MARIA_HA *info __attribute__((unused)))
+{
+  return (my_bool) 0;
+}
+
+/* Scan end callback that does nothing */
+static void
+maria_scan_end_dummy(MARIA_HA *info __attribute__((unused)))
+{
+  return;
+}
+
+/* Once-init callback that does nothing; always returns 0 */
+static my_bool
+maria_once_init_dummy(MARIA_SHARE *share __attribute__((unused)),
+                      File dfile __attribute__((unused)))
+{
+  return (my_bool) 0;
+}
+
+/* Once-end callback that does nothing; always returns 0 */
+static my_bool
+maria_once_end_dummy(MARIA_SHARE *share __attribute__((unused)))
+{
+  return (my_bool) 0;
+}
diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c
new file mode 100644
index 00000000000..4df00d9bb88
--- /dev/null
+++ b/storage/maria/ma_packrec.c
@@ -0,0 +1,1723 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+	/* Functions to handle compressed records */
+
+#include "maria_def.h"
+
+/*
+  Flag in a 16-bit decode tree entry: when set, the entry holds a value
+  (leaf); when clear, the entry is a relative offset to another entry.
+*/
+#define IS_CHAR ((uint) 32768)		/* Bit if char (not offset) in tree */
+
+/* Some definitions to keep in sync with maria_pack.c */
+#define HEAD_LENGTH	32              /* Length of fixed header */
+
+/*
+  BITS_SAVED is the number of bits handled per bit buffer refill;
+  MAX_QUICK_TABLE_BITS is the upper limit applied to
+  maria_quick_table_bits in _ma_read_pack_info().
+*/
+#if INT_MAX > 32767
+#define BITS_SAVED 32
+#define MAX_QUICK_TABLE_BITS 9		/* Because we may shift in 24 bits */
+#else
+#define BITS_SAVED 16
+#define MAX_QUICK_TABLE_BITS 6
+#endif
+
+/*
+  get_bit(BU)
+    Consume one bit from bit buffer BU; the result is non-zero if the bit
+    is set. When no bits are left (BU->bits == 0), fill_buffer() is called,
+    BU->bits is set to BITS_SAVED-1 and the top bit of current_byte is tested.
+
+  skip_to_next_byte(BU)
+    Round the number of remaining bits down to a multiple of 8.
+
+  get_bits(BU,count)
+    Consume 'count' bits and yield them as a number. Falls back to
+    fill_and_get_bits() when fewer than 'count' bits are available.
+    'count' is expanded unparenthesized and more than once, so pass only
+    simple expressions without side effects.
+*/
+#define get_bit(BU) ((BU)->bits ? \
+		     (BU)->current_byte & ((maria_bit_type) 1 << --(BU)->bits) :\
+		     (fill_buffer(BU), (BU)->bits= BITS_SAVED-1,\
+		      (BU)->current_byte & ((maria_bit_type) 1 << (BITS_SAVED-1))))
+#define skip_to_next_byte(BU) ((BU)->bits&=~7)
+#define get_bits(BU,count) (((BU)->bits >= count) ? (((BU)->current_byte >> ((BU)->bits-=count)) & mask[count]) : fill_and_get_bits(BU,count))
+
+/*
+  Statement fragment, not an expression: it expects the variables
+  'low_byte', 'pos' and 'bits' in the expanding scope and an enclosing
+  loop or switch for the 'break'.
+  If bit (7-bit) of low_byte is set, pos is advanced by one entry. If the
+  entry at pos is a value (IS_CHAR), bit+1 bits are accounted for and the
+  search ends; otherwise pos follows the relative offset stored at *pos.
+  NOTE(review): presumably used by decode_bytes(), which is not part of
+  this excerpt - confirm.
+*/
+#define decode_bytes_test_bit(bit) \
+  if (low_byte & (1 << (7-bit))) \
+    pos++; \
+  if (*pos & IS_CHAR) \
+  { bits-=(bit+1); break; } \
+  pos+= *pos
+
+/*
+  Size in uint16 of a Huffman tree for uchar compression of 256 uchar values
+*/
+#define OFFSET_TABLE_SIZE 512
+
+/* Forward declarations of file-local (static) helpers. */
+
+/* Reading the pack header and building the decode tables. */
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+                                  pbool fix_keys);
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+                            MARIA_DECODE_TREE *decode_tree,
+			    uint16 **decode_table,uchar **intervall_buff,
+			    uint16 *tmp_buff);
+static void make_quick_table(uint16 *to_table,uint16 *decode_table,
+			     uint *next_free,uint value,uint bits,
+			     uint max_bits);
+static void fill_quick_table(uint16 *table,uint bits, uint max_bits,
+			     uint value);
+static uint copy_decode_table(uint16 *to_pos,uint offset,
+			      uint16 *decode_table);
+static uint find_longest_bitstream(uint16 *table, uint16 *end);
+/*
+  get_unpack_function() selects one of the uf_*() / decode_bytes() routines
+  below for a column; all of them share the same signature.
+*/
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))(MARIA_COLUMNDEF *field,
+                                                         MARIA_BIT_BUFF *buff,
+                                                         uchar *to,
+                                                         uchar *end);
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+                                  MARIA_BIT_BUFF *bit_buff,
+                                  uchar *to,uchar *end);
+static void uf_skip_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+                         uchar *to,uchar *end);
+static void uf_space_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			    uchar *to,uchar *end);
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+				       uchar *to, uchar *end);
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+				 uchar *to,uchar *end);
+static void uf_space_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			      uchar *to,uchar *end);
+static void uf_endspace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			uchar *to,uchar *end);
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+				       uchar *to, uchar *end);
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+				 uchar *to,uchar *end);
+static void uf_space_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			      uchar *to,uchar *end);
+static void uf_prespace(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			uchar *to,uchar *end);
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			       uchar *to,uchar *end);
+static void uf_constant(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			uchar *to,uchar *end);
+static void uf_intervall(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			 uchar *to,uchar *end);
+static void uf_zero(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+		    uchar *to,uchar *end);
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+		    uchar *to, uchar *end);
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end);
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end);
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+			 uchar *to,uchar *end);
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+                       MARIA_DECODE_TREE *decode_tree);
+/* Bit buffer primitives used by the get_bit()/get_bits() macros. */
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,uchar *buffer,
+                            uint length);
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff,uint count);
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff);
+static uint max_bit(uint value);
+static uint read_pack_length(uint version, const uchar *buf, ulong *length);
+#ifdef HAVE_MMAP
+static uchar *_ma_mempack_get_block_info(MARIA_HA *maria,
+                                         MARIA_BIT_BUFF *bit_buff,
+                                         MARIA_BLOCK_INFO *info,
+                                         uchar **rec_buff_p,
+                                         size_t *rec_buff_size_p,
+					 uchar *header);
+#endif
+
+/*
+  mask[n] has the n lowest bits set. Indexed with the bit count by the
+  get_bits() macro. Entries for n > 16 exist only when BITS_SAVED > 16.
+*/
+static maria_bit_type mask[]=
+{
+   0x00000000,
+   0x00000001, 0x00000003, 0x00000007, 0x0000000f,
+   0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
+   0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
+   0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff,
+#if BITS_SAVED > 16
+   0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff,
+   0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
+   0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff,
+   0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff,
+#endif
+};
+
+
+/*
+  One-time initialization for a table with packed rows.
+
+  Marks the table data as read-only and reads the pack header from dfile.
+  Key lengths are fixed up only when neither HA_OPTION_PACK_RECORD nor
+  HA_OPTION_TEMP_COMPRESS_RECORD is set in share->options.
+
+  RETURN
+    Result of _ma_read_pack_info(): 0 on success, 1 on error.
+*/
+my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile)
+{
+  pbool fix_keys;
+
+  share->options|= HA_OPTION_READ_ONLY_DATA;
+  fix_keys= (pbool) test(!(share->options &
+                           (HA_OPTION_PACK_RECORD |
+                            HA_OPTION_TEMP_COMPRESS_RECORD)));
+  return _ma_read_pack_info(share, dfile, fix_keys);
+}
+
+
+/*
+  Release the decode trees and decode tables of a packed table.
+  Nothing is freed when share->decode_trees is not set. Always returns 0.
+*/
+my_bool _ma_once_end_pack_row(MARIA_SHARE *share)
+{
+  if (!share->decode_trees)
+    return 0;
+
+  my_free(share->decode_trees,MYF(0));
+  my_free(share->decode_tables,MYF(0));
+  return 0;
+}
+
+
+/*
+  Read all packed info, allocate memory and fix field structs
+
+  SYNOPSIS
+    _ma_read_pack_info()
+      share                     Table share; receives pack header values,
+                                decode trees and decode tables.
+      file                      File to read the pack header from.
+      fix_keys                  If set, adopt the record reference length
+                                from the header and adjust key lengths.
+
+  RETURN
+    0           OK.
+    1           Error. my_errno is HA_ERR_END_OF_FILE (short read of the
+                fixed header without other error) or
+                HA_ERR_WRONG_IN_RECORD (bad magic or broken tables);
+                other failures leave my_errno as set by the failing call.
+*/
+
+static my_bool _ma_read_pack_info(MARIA_SHARE *share, File file,
+                                  pbool fix_keys)
+{
+  int diff_length;
+  uint i,trees,huff_tree_bits,rec_reflength,length;
+  uint16 *decode_table,*tmp_buff;
+  ulong elements,intervall_length;
+  uchar *disk_cache;
+  uchar *intervall_buff;
+  uchar header[HEAD_LENGTH];
+  MARIA_BIT_BUFF bit_buff;
+  DBUG_ENTER("_ma_read_pack_info");
+
+  /* Clamp the global quick table size to 4..MAX_QUICK_TABLE_BITS bits. */
+  if (maria_quick_table_bits < 4)
+    maria_quick_table_bits=4;
+  else if (maria_quick_table_bits > MAX_QUICK_TABLE_BITS)
+    maria_quick_table_bits=MAX_QUICK_TABLE_BITS;
+
+  my_errno=0;
+  if (my_read(file, header, sizeof(header), MYF(MY_NABP)))
+  {
+    if (!my_errno)
+      my_errno=HA_ERR_END_OF_FILE;
+    goto err0;
+  }
+  /* Only the first three bytes of magic number are independent of version. */
+  if (memcmp(header, maria_pack_file_magic, 3))
+  {
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err0;
+  }
+  /*
+    Fixed header layout as decoded below (byte offsets):
+      3  version            4  total header length   8  min pack length
+     12  max pack length   16  elements of all trees
+     20  distinct values bytes   24  number of trees
+     26  bytes for record length   27  record pointer length
+  */
+  share->pack.version= header[3]; /* fourth uchar of magic number */
+  share->pack.header_length=	uint4korr(header+4);
+  share->min_pack_length=(uint) uint4korr(header+8);
+  share->max_pack_length=(uint) uint4korr(header+12);
+  set_if_bigger(share->base.default_rec_buff_size,
+                share->max_pack_length + 7);
+  elements=uint4korr(header+16);
+  intervall_length=uint4korr(header+20);
+  trees=uint2korr(header+24);
+  share->pack.ref_length=header[26];
+  rec_reflength=header[27];
+  /* Difference to the reference length the key definitions were made for. */
+  diff_length=(int) rec_reflength - (int) share->base.rec_reflength;
+  if (fix_keys)
+    share->rec_reflength=rec_reflength;
+  DBUG_PRINT("info", ("fixed header length:   %u", HEAD_LENGTH));
+  DBUG_PRINT("info", ("total header length:   %lu", share->pack.header_length));
+  DBUG_PRINT("info", ("pack file version:     %u", share->pack.version));
+  DBUG_PRINT("info", ("min pack length:       %lu", share->min_pack_length));
+  DBUG_PRINT("info", ("max pack length:       %lu", share->max_pack_length));
+  DBUG_PRINT("info", ("elements of all trees: %lu", elements));
+  DBUG_PRINT("info", ("distinct values bytes: %lu", intervall_length));
+  DBUG_PRINT("info", ("number of code trees:  %u", trees));
+  DBUG_PRINT("info", ("bytes for record lgt:  %u", share->pack.ref_length));
+  DBUG_PRINT("info", ("record pointer length: %u", rec_reflength));
+
+
+  /*
+    Memory segment #1:
+    - Decode tree heads
+    - Distinct column values
+  */
+  if (!(share->decode_trees=(MARIA_DECODE_TREE*)
+	my_malloc((uint) (trees*sizeof(MARIA_DECODE_TREE)+
+			  intervall_length*sizeof(uchar)),
+		  MYF(MY_WME))))
+    goto err0;
+  /* The distinct column values are stored right behind the tree heads. */
+  intervall_buff=(uchar*) (share->decode_trees+trees);
+
+  /*
+    Memory segment #2:
+    - Decode tables
+    - Quick decode tables
+    - Temporary decode table
+    - Compressed data file header cache
+    This segment will be reallocated after construction of the tables.
+  */
+  length=(uint) (elements*2+trees*(1 << maria_quick_table_bits));
+  if (!(share->decode_tables=(uint16*)
+	my_malloc((length+OFFSET_TABLE_SIZE)*sizeof(uint16)+
+		  (uint) (share->pack.header_length - sizeof(header)) +
+                  share->base.extra_rec_buff_size,
+		  MYF(MY_WME | MY_ZEROFILL))))
+    goto err1;
+  /* Layout: [decode tables: length] [tmp_buff: OFFSET_TABLE_SIZE] [cache] */
+  tmp_buff=share->decode_tables+length;
+  disk_cache=(uchar*) (tmp_buff+OFFSET_TABLE_SIZE);
+
+  /* Read the variable part of the header (everything after HEAD_LENGTH). */
+  if (my_read(file,disk_cache,
+	      (uint) (share->pack.header_length-sizeof(header)),
+	      MYF(MY_NABP)))
+    goto err2;
+#ifdef HAVE_valgrind
+  /* Zero bytes accessed by fill_buffer */
+  bzero(disk_cache + (share->pack.header_length-sizeof(header)),
+        share->base.extra_rec_buff_size);
+#endif
+
+  /* Number of bits used in the header to store a tree number. */
+  huff_tree_bits=max_bit(trees ? trees-1 : 0);
+  init_bit_buffer(&bit_buff, disk_cache,
+		  (uint) (share->pack.header_length-sizeof(header)));
+	/* Read new info for each field */
+  for (i=0 ; i < share->base.fields ; i++)
+  {
+    /* Per column: 5 bits type, 6 bits pack type, 5 bits length bits, tree */
+    share->columndef[i].base_type=(enum en_fieldtype) get_bits(&bit_buff,5);
+    share->columndef[i].pack_type=(uint) get_bits(&bit_buff,6);
+    share->columndef[i].space_length_bits=get_bits(&bit_buff,5);
+    share->columndef[i].huff_tree=share->decode_trees+(uint) get_bits(&bit_buff,
+								huff_tree_bits);
+    share->columndef[i].unpack= get_unpack_function(share->columndef + i);
+    DBUG_PRINT("info", ("col: %2u  type: %2u  pack: %u  slbits: %2u",
+                        i, share->columndef[i].base_type,
+                        share->columndef[i].pack_type,
+                        share->columndef[i].space_length_bits));
+  }
+  skip_to_next_byte(&bit_buff);
+  /*
+    Construct the decoding tables from the file header. Keep track of
+    the used memory.
+  */
+  decode_table=share->decode_tables;
+  for (i=0 ; i < trees ; i++)
+    if (read_huff_table(&bit_buff,share->decode_trees+i,&decode_table,
+                        &intervall_buff,tmp_buff))
+      goto err3;
+  /*
+    Reallocate the decoding tables to the used size. This drops the
+    temporary decode table and the header cache behind the used part.
+    The result is used unchecked; MY_HOLD_ON_ERROR is passed to my_realloc().
+  */
+  decode_table=(uint16*)
+    my_realloc((uchar*) share->decode_tables,
+	       (uint) ((uchar*) decode_table - (uchar*) share->decode_tables),
+	       MYF(MY_HOLD_ON_ERROR));
+  /* Fix the table addresses in the tree heads. */
+  {
+    my_ptrdiff_t diff= PTR_BYTE_DIFF(decode_table,share->decode_tables);
+    share->decode_tables=decode_table;
+    for (i=0 ; i < trees ; i++)
+      share->decode_trees[i].table=ADD_TO_PTR(share->decode_trees[i].table,
+                                              diff, uint16*);
+  }
+
+	/* Fix record-ref-length for keys */
+  if (fix_keys)
+  {
+    for (i=0 ; i < share->base.keys ; i++)
+    {
+      MARIA_KEYDEF *keyinfo= &share->keyinfo[i];
+      keyinfo->keylength+= (uint16) diff_length;
+      keyinfo->minlength+= (uint16) diff_length;
+      keyinfo->maxlength+= (uint16) diff_length;
+      /* The segment behind the key segments gets the new reference length. */
+      keyinfo->seg[keyinfo->flag & HA_FULLTEXT ?
+                   FT_SEGS : keyinfo->keysegs].length= (uint16) rec_reflength;
+    }
+    if (share->ft2_keyinfo.seg)
+    {
+      MARIA_KEYDEF *ft2_keyinfo= &share->ft2_keyinfo;
+      ft2_keyinfo->keylength+= (uint16) diff_length;
+      ft2_keyinfo->minlength+= (uint16) diff_length;
+      ft2_keyinfo->maxlength+= (uint16) diff_length;
+    }
+  }
+
+  /* The header must have been consumed completely and without error. */
+  if (bit_buff.error || bit_buff.pos < bit_buff.end)
+    goto err3;
+
+  DBUG_RETURN(0);
+
+  /* Error unwinding: each label releases what was acquired before it. */
+err3:
+  my_errno=HA_ERR_WRONG_IN_RECORD;
+err2:
+  my_free(share->decode_tables, MYF(0));
+err1:
+  my_free(share->decode_trees, MYF(0));
+err0:
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Read a huff-code-table from datafile.
+
+  SYNOPSIS
+    read_huff_table()
+      bit_buff                  Bit buffer pointing at start of the
+                                decoding table in the file header cache.
+      decode_tree               Pointer to the decode tree head.
+      decode_table      IN/OUT  Address of a pointer to the next free space.
+      intervall_buff    IN/OUT  Address of a pointer to the next unused values.
+      tmp_buff                  Buffer for temporary extraction of a full
+                                decoding table as read from bit_buff.
+
+  DESCRIPTION
+    The first bit selects the kind of tree:
+    0  Byte value compression. The tree is read into tmp_buff and then
+       converted into a quick table plus remaining tree at *decode_table.
+    1  Distinct column value compression. The tree is read directly into
+       *decode_table and the distinct values are copied to *intervall_buff.
+
+  RETURN
+    0           OK.
+    1           Error.
+*/
+static uint read_huff_table(MARIA_BIT_BUFF *bit_buff,
+                            MARIA_DECODE_TREE *decode_tree,
+			    uint16 **decode_table, uchar **intervall_buff,
+			    uint16 *tmp_buff)
+{
+  uint min_chr,elements,char_bits,offset_bits,size,intervall_length,table_bits,
+  next_free_offset;
+  uint16 *ptr,*end;
+  DBUG_ENTER("read_huff_table");
+
+  if (!get_bits(bit_buff,1))
+  {
+    /* Byte value compression. */
+    min_chr=get_bits(bit_buff,8);
+    elements=get_bits(bit_buff,9);
+    char_bits=get_bits(bit_buff,5);
+    offset_bits=get_bits(bit_buff,5);
+    intervall_length=0;
+    ptr=tmp_buff;
+    DBUG_PRINT("info", ("byte value compression"));
+    DBUG_PRINT("info", ("minimum uchar value:    %u", min_chr));
+    DBUG_PRINT("info", ("number of tree nodes:  %u", elements));
+    DBUG_PRINT("info", ("bits for values:       %u", char_bits));
+    DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+    /* 256 elements need 510 entries, which fit in OFFSET_TABLE_SIZE. */
+    if (elements > 256)
+    {
+      DBUG_PRINT("error", ("ERROR: illegal number of tree elements: %u",
+                           elements));
+      DBUG_RETURN(1);
+    }
+  }
+  else
+  {
+    /* Distinct column value compression. */
+    min_chr=0;
+    elements=get_bits(bit_buff,15);
+    intervall_length=get_bits(bit_buff,16);
+    char_bits=get_bits(bit_buff,5);
+    offset_bits=get_bits(bit_buff,5);
+    decode_tree->quick_table_bits=0;
+    ptr= *decode_table;
+    DBUG_PRINT("info", ("distinct column value compression"));
+    DBUG_PRINT("info", ("number of tree nodes:  %u", elements));
+    DBUG_PRINT("info", ("value buffer length:   %u", intervall_length));
+    DBUG_PRINT("info", ("bits for value index:  %u", char_bits));
+    DBUG_PRINT("info", ("bits for tree offsets: %u", offset_bits));
+  }
+  /*
+    A tree with n elements is stored as 2*n-2 entries.
+    NOTE(review): elements == 0 would make this unsigned expression wrap
+    to a huge value; confirm that the packer never writes such a tree.
+  */
+  size=elements*2-2;
+  DBUG_PRINT("info", ("tree size in uint16:   %u", size));
+  DBUG_PRINT("info", ("tree size in bytes:    %u",
+                      size * (uint) sizeof(uint16)));
+
+  /*
+    Each entry is preceded by a flag bit: 1 = relative offset to another
+    entry (offset_bits wide), 0 = value (char_bits wide, stored relative
+    to min_chr and flagged with IS_CHAR).
+  */
+  for (end=ptr+size ; ptr < end ; ptr++)
+  {
+    if (get_bit(bit_buff))
+    {
+      *ptr= (uint16) get_bits(bit_buff,offset_bits);
+      /* An offset must be non-zero and stay inside the tree. */
+      if ((ptr + *ptr >= end) || !*ptr)
+      {
+        DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+        DBUG_RETURN(1);
+      }
+    }
+    else
+      *ptr= (uint16) (IS_CHAR + (get_bits(bit_buff,char_bits) + min_chr));
+  }
+  skip_to_next_byte(bit_buff);
+
+  decode_tree->table= *decode_table;
+  decode_tree->intervalls= *intervall_buff;
+  if (! intervall_length)
+  {
+    /* Byte value compression. ptr started from tmp_buff. */
+    /* Find longest Huffman code from begin to end of tree in bits. */
+    table_bits= find_longest_bitstream(tmp_buff, ptr);
+    if (table_bits >= OFFSET_TABLE_SIZE)
+      DBUG_RETURN(1);
+    if (table_bits > maria_quick_table_bits)
+      table_bits=maria_quick_table_bits;
+    DBUG_PRINT("info", ("table bits:            %u", table_bits));
+
+    /* The remaining tree segments are placed behind the quick table. */
+    next_free_offset= (1 << table_bits);
+    make_quick_table(*decode_table,tmp_buff,&next_free_offset,0,table_bits,
+		     table_bits);
+    (*decode_table)+= next_free_offset;
+    decode_tree->quick_table_bits=table_bits;
+  }
+  else
+  {
+    /* Distinct column value compression. ptr started from *decode_table */
+    (*decode_table)=end;
+    /*
+      get_bits() moves some bytes to a cache buffer in advance. May need
+      to step back.
+    */
+    bit_buff->pos-= bit_buff->bits/8;
+    /* Copy the distinct column values from the buffer. */
+    memcpy(*intervall_buff,bit_buff->pos,(size_t) intervall_length);
+    (*intervall_buff)+=intervall_length;
+    bit_buff->pos+=intervall_length;
+    bit_buff->bits=0;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Make a quick_table for faster decoding.
+
+  SYNOPSIS
+    make_quick_table()
+      to_table                  Target quick_table and remaining decode table.
+      decode_table              Source Huffman (sub-)tree within tmp_buff.
+      next_free_offset   IN/OUT Next free offset from to_table.
+                                Starts behind quick_table on the top-level.
+      value                     Huffman bits found so far.
+      bits                      Remaining bits to be collected.
+      max_bits                  Total number of bits to collect (table_bits).
+
+  DESCRIPTION
+
+    The quick table is an array of 16-bit values with one slot for each
+    code representable by max_bits (table_bits) bits. In most cases
+    table_bits is 9, which gives 512 slots.
+
+    A slot with the high-order bit (16, IS_CHAR) set holds a complete
+    Huffman code: bits 1..8 are the resulting uchar value and bits 9..14
+    are the length of the code, that is the number of input bits that
+    were needed for this value. The remaining input bits belong to later
+    codes. Consequently every code shorter than table_bits occupies
+    multiple entries in the array, which differ just in the unused bits.
+
+    A slot with the high-order bit clear holds the position of the
+    remaining Huffman decode tree segment behind the quick table.
+
+  RETURN
+    void
+*/
+
+static void make_quick_table(uint16 *to_table, uint16 *decode_table,
+			     uint *next_free_offset, uint value, uint bits,
+			     uint max_bits)
+{
+  uint side;
+  DBUG_ENTER("make_quick_table");
+
+  if (bits == 0)
+  {
+    /*
+      All table bits are collected but the code is not complete. Store
+      the position of the remaining subtree in the slot and re-construct
+      that subtree at next_free_offset in to_table.
+    */
+    to_table[value]= (uint16) *next_free_offset;
+    *next_free_offset= copy_decode_table(to_table, *next_free_offset,
+                                         decode_table);
+    DBUG_VOID_RETURN;
+  }
+  bits--;
+
+  /*
+    Handle the left branch (side 0, bit clear) and then the right branch
+    (side 1, bit set) of the node pair at decode_table.
+  */
+  for (side= 0 ; side < 2 ; side++, decode_table++)
+  {
+    if (side)
+      value|= (1 << bits);
+
+    if (*decode_table & IS_CHAR)
+    {
+      /*
+        A leaf, the code is complete. Fill every slot whose bit string
+        starts with this code.
+      */
+      fill_quick_table(to_table + value, bits, max_bits,
+                       (uint) *decode_table);
+    }
+    else
+    {
+      /* Not a leaf. Follow the relative pointer. */
+      make_quick_table(to_table, decode_table + *decode_table,
+                       next_free_offset, value, bits, max_bits);
+    }
+  }
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Fill quick_table for all possible values starting with this Huffman code.
+
+  SYNOPSIS
+    fill_quick_table()
+      table                     Target quick_table position.
+      bits                      Unused bits from max_bits.
+      max_bits                  Total number of bits to collect (table_bits).
+      value                     The uchar encoded by the found Huffman code.
+
+  DESCRIPTION
+
+    Writes the same entry into 2**bits consecutive slots starting at
+    'table': one slot for every combination of the unused bits.
+
+    Example: with 9 table bits and a 3-bit Huffman code there are 6
+    unused bits, so 2**6 = 64 slots are filled.
+
+  RETURN
+    void
+*/
+
+static void fill_quick_table(uint16 *table, uint bits, uint max_bits,
+			     uint value)
+{
+  my_ptrdiff_t slot;
+  my_ptrdiff_t slots= (my_ptrdiff_t) 1 << bits;
+  uint16 entry;
+  DBUG_ENTER("fill_quick_table");
+
+  /*
+    Entry layout:
+    bits 1..8   decoded uchar value
+    bits 9..14  length of the Huffman code for this value
+    bit  16     flag for a valid code (IS_CHAR)
+  */
+  entry= (uint16) (value | ((max_bits - bits) << 8) | IS_CHAR);
+
+  for (slot= 0 ; slot < slots ; slot++)
+    table[slot]= entry;
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Reconstruct a decode subtree at the target position.
+
+  SYNOPSIS
+    copy_decode_table()
+      to_pos                    Target quick_table and remaining decode table.
+      offset                    Next free offset from to_pos.
+      decode_table              Source Huffman subtree within tmp_buff.
+
+  NOTE
+    Pointers in the decode tree are relative to the position of the
+    pointer itself. The node pair of this subtree is written to
+    to_pos[offset] (left) and to_pos[offset+1] (right).
+
+  RETURN
+    next free offset from to_pos.
+*/
+
+static uint copy_decode_table(uint16 *to_pos, uint offset,
+			      uint16 *decode_table)
+{
+  uint16 *left= decode_table;
+  uint16 *right= decode_table + 1;
+  uint node= offset;
+  uint next_free;
+  DBUG_ENTER("copy_decode_table");
+
+  /* Left side. */
+  if (*left & IS_CHAR)
+  {
+    /* A value: copy it, the next free slot is behind this node pair. */
+    to_pos[node]= *left;
+    next_free= node + 2;
+  }
+  else
+  {
+    /* A branch: its subtree is placed directly behind this node pair. */
+    to_pos[node]= 2;
+    next_free= copy_decode_table(to_pos, node + 2, left + *left);
+  }
+
+  /* Right side. */
+  if (*right & IS_CHAR)
+  {
+    /* A value: copy it. */
+    to_pos[node + 1]= *right;
+  }
+  else
+  {
+    /* A branch: point at the next free slot and copy the subtree there. */
+    to_pos[node + 1]= (uint16) (next_free - node - 1);
+    next_free= copy_decode_table(to_pos, next_free, right + *right);
+  }
+  DBUG_RETURN(next_free);
+}
+
+
+/*
+  Find the length of the longest Huffman code in this table in bits.
+
+  SYNOPSIS
+    find_longest_bitstream()
+      table                     Code (sub-)table start.
+      end                       End of code table.
+
+  IMPLEMENTATION
+
+    Both entries of the node pair at 'table' are examined, left first.
+    An entry that is a branch (no IS_CHAR) is followed recursively and
+    contributes the depth of its subtree plus one. The deepest branch
+    wins; a pair of two values has length 1.
+
+    'end' is used for error checking only. A clean tree terminates
+    before reaching 'end'. Hence the exact value of 'end' is not too
+    important. However having it higher than necessary could lead to
+    misbehaviour should 'next' jump into the dirty area.
+
+  RETURN
+    length                  Length of longest Huffman code in bits.
+    >= OFFSET_TABLE_SIZE    Error, broken tree. It does not end before 'end'.
+*/
+
+static uint find_longest_bitstream(uint16 *table, uint16 *end)
+{
+  uint longest= 1;
+  uint side;
+
+  for (side= 0 ; side < 2 ; side++, table++)
+  {
+    uint16 *next;
+    uint depth;
+
+    if (*table & IS_CHAR)
+      continue;                                 /* A value, not a branch */
+
+    next= table + *table;
+    if (next > end || next == table)
+    {
+      DBUG_PRINT("error", ("ERROR: illegal pointer in decode tree"));
+      return OFFSET_TABLE_SIZE;
+    }
+    depth= find_longest_bitstream(next, end) + 1;
+    if (depth > longest)
+      longest= depth;
+  }
+  return longest;
+}
+
+
+/*
+  Read record from datafile.
+
+  SYNOPSIS
+    _ma_read_pack_record()
+    info                        A pointer to MARIA_HA.
+    buf                 RETURN  The buffer to receive the record.
+    filepos                     File offset of the record.
+
+  DESCRIPTION
+    Reads the block info at filepos, reads the rest of the packed record
+    into info->rec_buff and unpacks it into buf.
+
+  RETURN
+    0   On success
+    #   Error number
+*/
+
+int _ma_read_pack_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+  MARIA_BLOCK_INFO block_info;
+  File file;
+  DBUG_ENTER("_ma_read_pack_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno);          /* _search() didn't find record */
+
+  file= info->dfile.file;
+  if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+                              &info->rec_buff, &info->rec_buff_size, file,
+                              filepos))
+    goto err;
+  /* The first block_info.offset bytes of rec_buff are skipped here. */
+  if (my_read(file, info->rec_buff + block_info.offset ,
+	      block_info.rec_len - block_info.offset, MYF(MY_NABP)))
+    goto panic;
+  info->update|= HA_STATE_AKTIV;
+  DBUG_RETURN(_ma_pack_rec_unpack(info,&info->bit_buff, buf,
+                                  info->rec_buff, block_info.rec_len));
+panic:
+  /* A short or failed read of the record body is reported as corruption. */
+  my_errno=HA_ERR_WRONG_IN_RECORD;
+err:
+  DBUG_RETURN(my_errno);
+}
+
+
+
+/*
+  Unpack a packed record.
+
+  The leading null bytes (share->base.null_bytes) are copied verbatim
+  from 'from' to 'to'. The rest of the record is decoded column by column
+  through the unpack function stored in each column definition.
+
+  RETURN
+    0                       OK, the input was consumed exactly.
+    HA_ERR_WRONG_IN_RECORD  Decode error or input not consumed exactly;
+                            my_errno is set and HA_STATE_AKTIV is cleared.
+*/
+int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+                        register uchar *to, uchar *from, ulong reclength)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_COLUMNDEF *column;
+  MARIA_COLUMNDEF *columns_end;
+  DBUG_ENTER("_ma_pack_rec_unpack");
+
+  if (share->base.null_bytes)
+  {
+    memcpy(to, from, share->base.null_bytes);
+    to+=   share->base.null_bytes;
+    from+= share->base.null_bytes;
+    reclength-= share->base.null_bytes;
+  }
+  init_bit_buffer(bit_buff, from, reclength);
+
+  column= share->columndef;
+  columns_end= column + share->base.fields;
+  while (column < columns_end)
+  {
+    uchar *field_end= to + column->length;
+    (*column->unpack)(column, bit_buff, to, field_end);
+    to= field_end;
+    column++;
+  }
+
+  if (bit_buff->error ||
+      bit_buff->pos - bit_buff->bits / 8 != bit_buff->end)
+  {
+    info->update&= ~HA_STATE_AKTIV;
+    DBUG_RETURN(my_errno=HA_ERR_WRONG_IN_RECORD);
+  }
+  DBUG_RETURN(0);
+} /* _ma_pack_rec_unpack */
+
+
+/*
+  Return the function to unpack a field.
+
+  The choice depends on rec->base_type and on the PACK_TYPE_* flags in
+  rec->pack_type. For FIELD_VARCHAR it depends on rec->length instead.
+  Returns 0 for FIELD_LAST and for unknown types.
+*/
+
+static void (*get_unpack_function(MARIA_COLUMNDEF *rec))
+  (MARIA_COLUMNDEF *, MARIA_BIT_BUFF *, uchar *, uchar *)
+{
+  switch (rec->base_type) {
+  case FIELD_SKIP_ZERO:
+    return (rec->pack_type & PACK_TYPE_ZERO_FILL) ?
+      &uf_zerofill_skip_zero : &uf_skip_zero;
+
+  case FIELD_NORMAL:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+      return &uf_space_normal;
+    return (rec->pack_type & PACK_TYPE_ZERO_FILL) ?
+      &uf_zerofill_normal : &decode_bytes;
+
+  case FIELD_SKIP_ENDSPACE:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+      return (rec->pack_type & PACK_TYPE_SELECTED) ?
+        &uf_space_endspace_selected : &uf_space_endspace;
+    return (rec->pack_type & PACK_TYPE_SELECTED) ?
+      &uf_endspace_selected : &uf_endspace;
+
+  case FIELD_SKIP_PRESPACE:
+    if (rec->pack_type & PACK_TYPE_SPACE_FIELDS)
+      return (rec->pack_type & PACK_TYPE_SELECTED) ?
+        &uf_space_prespace_selected : &uf_space_prespace;
+    return (rec->pack_type & PACK_TYPE_SELECTED) ?
+      &uf_prespace_selected : &uf_prespace;
+
+  case FIELD_CONSTANT:
+    return &uf_constant;
+  case FIELD_INTERVALL:
+    return &uf_intervall;
+  case FIELD_ZERO:
+  case FIELD_CHECK:
+    return &uf_zero;
+  case FIELD_BLOB:
+    return &uf_blob;
+  case FIELD_VARCHAR:
+    /* 255 + 1 uchar length */
+    return (rec->length <= 256) ? &uf_varchar1 : &uf_varchar2;
+  case FIELD_LAST:
+  default:
+    return 0;			/* This should never happen */
+  }
+}
+
+	/* The different functions to unpack a field */
+
+/*
+  Unpack a field that is either all zero (flag bit set) or consists of
+  decoded bytes followed by rec->space_length_bits zero bytes.
+*/
+static void uf_zerofill_skip_zero(MARIA_COLUMNDEF *rec,
+                                  MARIA_BIT_BUFF *bit_buff,
+                                  uchar *to, uchar *end)
+{
+  uchar *data_end;
+
+  if (get_bit(bit_buff))
+  {
+    bzero((char*) to,(uint) (end-to));
+    return;
+  }
+  data_end= end - rec->space_length_bits;
+  decode_bytes(rec,bit_buff,to,data_end);
+  bzero((char*) data_end,rec->space_length_bits);
+}
+
+/* Unpack a field that is all zero if the flag bit is set, else decoded. */
+static void uf_skip_zero(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{
+  if (get_bit(bit_buff))
+  {
+    bzero((char*) to,(uint) (end-to));
+    return;
+  }
+  decode_bytes(rec,bit_buff,to,end);
+}
+
+/* Unpack a field that is all spaces if the flag bit is set, else decoded. */
+static void uf_space_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                            uchar *to, uchar *end)
+{
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end-to), ' ');
+    return;
+  }
+  decode_bytes(rec,bit_buff,to,end);
+}
+
+/*
+  Unpack a field with optional end-space compression.
+  First flag bit set: the whole field is spaces.
+  Second flag bit set: a count of trailing spaces follows, the bytes in
+  front of them are decoded. Otherwise the whole field is decoded.
+  A space count larger than the field sets bit_buff->error.
+*/
+static void uf_space_endspace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+				       uchar *to, uchar *end)
+{
+  uint spaces;
+
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end-to), ' ');
+    return;
+  }
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec,bit_buff,to,end);
+    return;
+  }
+  spaces= get_bits(bit_buff,rec->space_length_bits);
+  if (to + spaces > end)
+  {
+    bit_buff->error=1;
+    return;
+  }
+  if (to + spaces != end)
+    decode_bytes(rec,bit_buff,to,end-spaces);
+  bfill(end - spaces, spaces, ' ');
+}
+
+/*
+  Unpack a field with optional end-space compression.
+  Flag bit set: a count of trailing spaces follows, the bytes in front of
+  them are decoded. Otherwise the whole field is decoded.
+  A space count larger than the field sets bit_buff->error.
+*/
+static void uf_endspace_selected(MARIA_COLUMNDEF *rec,
+                                 MARIA_BIT_BUFF *bit_buff,
+				 uchar *to, uchar *end)
+{
+  uint spaces;
+
+  if (!get_bit(bit_buff))
+  {
+    decode_bytes(rec,bit_buff,to,end);
+    return;
+  }
+  spaces= get_bits(bit_buff,rec->space_length_bits);
+  if (to + spaces > end)
+  {
+    bit_buff->error=1;
+    return;
+  }
+  if (to + spaces != end)
+    decode_bytes(rec,bit_buff,to,end-spaces);
+  bfill(end - spaces, spaces, ' ');
+}
+
+/*
+  Unpack a field with end-space compression.
+  Flag bit set: the whole field is spaces. Otherwise a count of trailing
+  spaces follows and the bytes in front of them are decoded.
+  A space count larger than the field sets bit_buff->error.
+*/
+static void uf_space_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                              uchar *to, uchar *end)
+{
+  uint spaces;
+
+  if (get_bit(bit_buff))
+  {
+    bfill(to, (end-to), ' ');
+    return;
+  }
+  spaces= get_bits(bit_buff,rec->space_length_bits);
+  if (to + spaces > end)
+  {
+    bit_buff->error=1;
+    return;
+  }
+  if (to + spaces != end)
+    decode_bytes(rec,bit_buff,to,end-spaces);
+  bfill(end - spaces, spaces, ' ');
+}
+
+static void uf_endspace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end)
+{
+  uint pad= get_bits(bit_buff,rec->space_length_bits);
+  if (pad+to > end)
+  {                                     /* Count is larger than the field */
+    bit_buff->error=1;
+    return;
+  }
+  if (to+pad != end)
+    decode_bytes(rec,bit_buff,to,end-pad);
+  bfill(end - pad, pad, ' ');           /* Restore the trailing spaces */
+}
+
+static void uf_space_prespace_selected(MARIA_COLUMNDEF *rec,
+                                       MARIA_BIT_BUFF *bit_buff,
+                                       uchar *to, uchar *end)
+{
+  uint pad;                             /* Number of leading spaces */
+  if (get_bit(bit_buff))                /* Whole field is spaces */
+    bfill(to, (end-to), ' ');
+  else if (get_bit(bit_buff))           /* Space count follows */
+  {
+    pad= get_bits(bit_buff,rec->space_length_bits);
+    if (pad+to > end)
+    {                                   /* Count is larger than the field */
+      bit_buff->error=1;
+      return;
+    }
+    bfill(to, pad, ' ');
+    if (to+pad != end)
+      decode_bytes(rec,bit_buff,to+pad,end);
+  }
+  else
+  {                                     /* Every byte is stored */
+    decode_bytes(rec,bit_buff,to,end);
+  }
+}
+
+
+static void uf_prespace_selected(MARIA_COLUMNDEF *rec,
+                                 MARIA_BIT_BUFF *bit_buff,
+                                 uchar *to, uchar *end)
+{
+  if (get_bit(bit_buff))                /* Space count follows */
+  {
+    uint pad= get_bits(bit_buff,rec->space_length_bits);
+    if (pad+to > end)
+    {                                   /* Count is larger than the field */
+      bit_buff->error=1;
+      return;
+    }
+    bfill(to, pad, ' ');                /* Restore the leading spaces */
+    if (to+pad != end)
+      decode_bytes(rec,bit_buff,to+pad,end);
+  }
+  else
+    decode_bytes(rec,bit_buff,to,end);  /* Every byte is stored */
+}
+
+
+static void uf_space_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                              uchar *to, uchar *end)
+{
+  uint pad;                             /* Number of leading spaces */
+
+  if (get_bit(bit_buff))                /* Whole field is spaces */
+  { bfill(to, (end-to), ' '); return; }
+
+  pad= get_bits(bit_buff,rec->space_length_bits);
+  if (pad+to > end)
+  {                                     /* Count is larger than the field */
+    bit_buff->error=1;
+    return;
+  }
+  bfill(to, pad, ' ');
+  if (to+pad != end)
+    decode_bytes(rec,bit_buff,to+pad,end);
+}
+
+static void uf_prespace(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end)
+{
+  uint pad= get_bits(bit_buff,rec->space_length_bits);
+  if (pad+to > end)
+  {                                     /* Count is larger than the field */
+    bit_buff->error=1;
+    return;
+  }
+  bfill(to, pad, ' ');                  /* Restore the leading spaces */
+  if (to+pad != end)
+    decode_bytes(rec,bit_buff,to+pad,end);
+}
+
+static void uf_zerofill_normal(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                               uchar *to, uchar *end)
+{
+  uchar *fill_start= end - rec->space_length_bits; /* Start of the zero tail */
+  decode_bytes(rec,bit_buff, to, fill_start);
+  bzero((char*) fill_start,rec->space_length_bits);
+}
+
+static void uf_constant(MARIA_COLUMNDEF *rec,
+			MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+			uchar *to, uchar *end)
+{					/* No bits are read from bit_buff */
+  memcpy(to,rec->huff_tree->intervalls,(size_t) (end-to)); /* Fixed value */
+}
+
+static void uf_intervall(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{
+  MARIA_DECODE_TREE *tree= rec->huff_tree;
+  uint width= (uint) (end-to);          /* Size of one interval entry */
+  uint slot= decode_pos(bit_buff, tree);        /* Index of the entry */
+
+  memcpy(to, tree->intervalls + width*slot, (size_t) width);
+}
+
+
+/*ARGSUSED*/
+static void uf_zero(MARIA_COLUMNDEF *rec __attribute__((unused)),
+		    MARIA_BIT_BUFF *bit_buff __attribute__((unused)),
+		    uchar *to, uchar *end)
+{					/* Field is always zero; reads no bits */
+  bzero(to, (uint) (end-to));
+}
+
+static void uf_blob(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                    uchar *to, uchar *end)
+{
+  if (get_bit(bit_buff))                /* Flag bit set: zero whole field */
+    bzero(to, (uint) (end-to));
+  else
+  {
+    ulong blob_len= get_bits(bit_buff,rec->space_length_bits);
+    uint len_bytes= (uint) (end-to)-portable_sizeof_char_ptr;
+    uchar *blob_stop= bit_buff->blob_pos + blob_len;
+    if (blob_stop > bit_buff->blob_end)
+    {                                   /* Blob does not fit in blob area */
+      bit_buff->error=1;
+      bzero(to, (end-to));
+      return;
+    }
+    decode_bytes(rec, bit_buff, bit_buff->blob_pos, blob_stop);
+    _ma_store_blob_length(to, len_bytes, blob_len);
+    memcpy_fixed((uchar*) to+len_bytes,(uchar*) &bit_buff->blob_pos,
+                 sizeof(uchar*));
+    bit_buff->blob_pos= blob_stop;      /* Next blob goes after this one */
+  }
+}
+
+
+static void uf_varchar1(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end __attribute__((unused)))
+{
+  ulong data_len;
+  if (get_bit(bit_buff))
+  { to[0]= 0; return; }                 /* Zero length */
+
+  data_len= get_bits(bit_buff,rec->space_length_bits);
+  to[0]= (char) data_len;               /* 1 byte length prefix */
+  to++;
+  decode_bytes(rec,bit_buff,to,to+data_len);
+}
+
+
+static void uf_varchar2(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                        uchar *to, uchar *end __attribute__((unused)))
+{
+  ulong data_len;
+  if (get_bit(bit_buff))
+  { to[0]=to[1]=0; return; }            /* Zero length */
+
+  data_len= get_bits(bit_buff,rec->space_length_bits);
+  int2store(to,data_len);               /* 2 byte length prefix */
+  to+= 2;
+  decode_bytes(rec,bit_buff,to,to+data_len);
+}
+
+	/* Functions to decode a buffer of bits */
+
+#if BITS_SAVED == 64
+
+static void decode_bytes(MARIA_COLUMNDEF *rec,MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{					/* Decodes bytes into to..end */
+  reg1 uint bits,low_byte;
+  reg3 uint16 *pos;
+  reg4 uint table_bits,table_and;
+  MARIA_DECODE_TREE *decode_tree;
+
+  decode_tree=rec->huff_tree;		/* Decode tree of this column */
+  bits=bit_buff->bits;			/* Save in reg for quicker access */
+  table_bits=decode_tree->quick_table_bits;
+  table_and= (1 << table_bits)-1;	/* Mask for one quick table index */
+
+  do					/* One output byte per iteration */
+  {
+    if (bits <= 32)
+    {					/* Refill with 4 more input bytes */
+      if (bit_buff->pos > bit_buff->end+4)
+      {
+	bit_buff->error=1;
+	return;				/* Can't be right */
+      }
+      bit_buff->current_byte= (bit_buff->current_byte << 32) +
+	((((uint) bit_buff->pos[3])) +
+	 (((uint) bit_buff->pos[2]) << 8) +
+	 (((uint) bit_buff->pos[1]) << 16) +
+	 (((uint) bit_buff->pos[0]) << 24));
+      bit_buff->pos+=4;
+      bits+=32;
+    }
+    /*
+      First use info in quick_table.
+
+      The quick table is an array of 16-bit values. There exists one
+      value for each possible code representable by table_bits bits.
+      In most cases table_bits is 9. So there are 512 16-bit values.
+
+      If the high-order bit (16) is set (IS_CHAR) then the array slot
+      for this value is a valid Huffman code for a resulting uchar value.
+
+      The low-order 8 bits (1..8) are the resulting uchar value.
+
+      Bits 9..14 are the length of the Huffman code for this uchar value.
+      This means so many bits from the input stream were needed to
+      represent this uchar value. The remaining bits belong to later
+      Huffman codes. This also means that for every Huffman code shorter
+      than table_bits there are multiple entries in the array, which
+      differ just in the unused bits.
+
+      If the high-order bit (16) is clear (0) then the remaining bits are
+      the position of the remaining Huffman decode tree segment behind the
+      quick table.
+    */
+    low_byte=(uint) (bit_buff->current_byte >> (bits - table_bits)) & table_and;
+    low_byte=decode_tree->table[low_byte];
+    if (low_byte & IS_CHAR)
+    {
+      /*
+        All Huffman codes of less or equal table_bits length are in the
+        quick table. This is one of them.
+      */
+      *to++ = (char) (low_byte & 255);		/* Found char in quick table */
+      bits-=  ((low_byte >> 8) & 31);	/* Remove bits used */
+    }
+    else
+    {					/* Map through rest of decode-table */
+      /* This means that the Huffman code must be longer than table_bits. */
+      pos=decode_tree->table+low_byte;
+      bits-=table_bits;
+      /* NOTE: decode_bytes_test_bit() is a macro which contains a break !!! */
+      for (;;)
+      {
+	low_byte=(uint) (bit_buff->current_byte >> (bits-8));
+	decode_bytes_test_bit(0);
+	decode_bytes_test_bit(1);
+	decode_bytes_test_bit(2);
+	decode_bytes_test_bit(3);
+	decode_bytes_test_bit(4);
+	decode_bytes_test_bit(5);
+	decode_bytes_test_bit(6);
+	decode_bytes_test_bit(7);
+	bits-=8;
+      }
+      *to++ = (char) *pos;
+    }
+  } while (to != end);
+
+  bit_buff->bits=bits;			/* Store back the cached bit count */
+  return;
+}
+
+#else
+
+static void decode_bytes(MARIA_COLUMNDEF *rec, MARIA_BIT_BUFF *bit_buff,
+                         uchar *to, uchar *end)
+{					/* Decodes bytes into to..end */
+  reg1 uint bits,low_byte;
+  reg3 uint16 *pos;
+  reg4 uint table_bits,table_and;
+  MARIA_DECODE_TREE *decode_tree;
+
+  decode_tree=rec->huff_tree;
+  bits=bit_buff->bits;			/* Save in reg for quicker access */
+  table_bits=decode_tree->quick_table_bits;
+  table_and= (1 << table_bits)-1;	/* Mask for one quick table index */
+
+  do					/* One output byte per iteration */
+  {
+    if (bits < table_bits)
+    {					/* Too few bits left; read more input */
+      if (bit_buff->pos > bit_buff->end+1)
+      {
+	bit_buff->error=1;
+	return;				/* Can't be right */
+      }
+#if BITS_SAVED == 32
+      bit_buff->current_byte= (bit_buff->current_byte << 24) +
+	(((uint) ((uchar) bit_buff->pos[2]))) +
+	  (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+	    (((uint) ((uchar) bit_buff->pos[0])) << 16);
+      bit_buff->pos+=3;
+      bits+=24;
+#else
+      if (bits)				/* We must have at least 9 bits */
+      {
+	bit_buff->current_byte=  (bit_buff->current_byte << 8) +
+	  (uint) ((uchar) bit_buff->pos[0]);
+	bit_buff->pos++;
+	bits+=8;
+      }
+      else
+      {
+	bit_buff->current_byte= ((uint) ((uchar) bit_buff->pos[0]) << 8) +
+	  ((uint) ((uchar) bit_buff->pos[1]));
+	bit_buff->pos+=2;
+	bits+=16;
+      }
+#endif
+    }
+	/* First use info in quick_table */
+    low_byte=(bit_buff->current_byte >> (bits - table_bits)) & table_and;
+    low_byte=decode_tree->table[low_byte];
+    if (low_byte & IS_CHAR)
+    {
+      *to++ = (low_byte & 255);		/* Found char in quick table */
+      bits-=  ((low_byte >> 8) & 31);	/* Remove bits used */
+    }
+    else
+    {					/* Map through rest of decode-table */
+      pos=decode_tree->table+low_byte;
+      bits-=table_bits;
+      for (;;)				/* Left via break in the macro below */
+      {
+	if (bits < 8)
+	{				/* We don't need to check end */
+#if BITS_SAVED == 32
+	  bit_buff->current_byte= (bit_buff->current_byte << 24) +
+	    (((uint) ((uchar) bit_buff->pos[2]))) +
+	      (((uint) ((uchar) bit_buff->pos[1])) << 8) +
+		(((uint) ((uchar) bit_buff->pos[0])) << 16);
+	  bit_buff->pos+=3;
+	  bits+=24;
+#else
+	  bit_buff->current_byte=  (bit_buff->current_byte << 8) +
+	    (uint) ((uchar) bit_buff->pos[0]);
+	  bit_buff->pos+=1;
+	  bits+=8;
+#endif
+	}
+	low_byte=(uint) (bit_buff->current_byte >> (bits-8)); /* Next 8 bits */
+	decode_bytes_test_bit(0);
+	decode_bytes_test_bit(1);
+	decode_bytes_test_bit(2);
+	decode_bytes_test_bit(3);
+	decode_bytes_test_bit(4);
+	decode_bytes_test_bit(5);
+	decode_bytes_test_bit(6);
+	decode_bytes_test_bit(7);
+	bits-=8;
+      }
+      *to++ = (char) *pos;
+    }
+  } while (to != end);
+
+  bit_buff->bits=bits;			/* Store back the cached bit count */
+  return;
+}
+#endif /* BIT_SAVED == 64 */
+
+
+static uint decode_pos(MARIA_BIT_BUFF *bit_buff,
+                       MARIA_DECODE_TREE *decode_tree)
+{
+  uint16 *node= decode_tree->table;     /* Start at the first table entry */
+  for (;; node+= *node)                 /* Not a leaf: follow the offset */
+  {
+    if (get_bit(bit_buff))
+      node++;                           /* Set bit selects the next slot */
+    if (*node & IS_CHAR)
+      break;
+  }
+  return (uint) (*node & ~IS_CHAR);     /* Leaf value without the flag */
+}
+
+
+int _ma_read_rnd_pack_record(MARIA_HA *info,
+                             uchar *buf,
+			     register MARIA_RECORD_POS filepos,
+			     my_bool skip_deleted_blocks)
+{					/* Returns 0 or an error code */
+  File file;
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_read_rnd_pack_record");
+
+  if (filepos >= info->state->data_file_length)
+  {					/* Position is past the data */
+    my_errno= HA_ERR_END_OF_FILE;
+    goto err;
+  }
+
+  file= info->dfile.file;
+  if (info->opt_flag & READ_CACHE_USED)
+  {					/* Get the header through the cache */
+    if (_ma_read_cache(&info->rec_cache, block_info.header,
+                       filepos, share->pack.ref_length,
+                       skip_deleted_blocks ? READING_NEXT : 0))
+      goto err;
+    file= -1;				/* Header is already in block_info */
+  }
+  if (_ma_pack_get_block_info(info, &info->bit_buff, &block_info,
+                              &info->rec_buff, &info->rec_buff_size,
+                              file, filepos))
+    goto err;					/* Error code is already set */
+#ifndef DBUG_OFF
+  if (block_info.rec_len > share->max_pack_length)
+  {					/* Only checked in debug builds */
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+#endif
+
+  if (info->opt_flag & READ_CACHE_USED)
+  {					/* Whole packed row from the cache */
+    if (_ma_read_cache(&info->rec_cache, info->rec_buff,
+                       block_info.filepos, block_info.rec_len,
+                       skip_deleted_blocks ? READING_NEXT : 0))
+      goto err;
+  }
+  else
+  {					/* First 'offset' bytes are in place */
+    if (my_read(info->dfile.file, info->rec_buff + block_info.offset,
+		block_info.rec_len-block_info.offset,
+		MYF(MY_NABP)))
+      goto err;
+  }
+  info->packed_length=   block_info.rec_len;
+  info->cur_row.lastpos= filepos;
+  info->cur_row.nextpos= block_info.filepos+block_info.rec_len;
+  info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+  DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                   info->rec_buff, block_info.rec_len));
+ err:
+  DBUG_RETURN(my_errno);
+}
+
+
+	/* Read and process header from a huff-record-file */
+
+uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+                             MARIA_BLOCK_INFO *info,
+                             uchar **rec_buff_p, size_t *rec_buff_size_p,
+                             File file, my_off_t filepos)
+{					/* Returns 0 or BLOCK_FATAL_ERROR */
+  uchar *header= info->header;
+  uint head_length,ref_length;
+  LINT_INIT(ref_length);
+
+  if (file >= 0)
+  {					/* file < 0: header already filled in */
+    ref_length=maria->s->pack.ref_length;
+    /*
+      We can't use my_pread() here because _ma_read_rnd_pack_record assumes
+      position is ok
+    */
+    VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0)));
+    if (my_read(file, header,ref_length,MYF(MY_NABP)))
+      return BLOCK_FATAL_ERROR;
+    DBUG_DUMP("header", header, ref_length);
+  }
+  head_length= read_pack_length((uint) maria->s->pack.version, header,
+                                &info->rec_len);
+  if (maria->s->base.blobs)
+  {					/* A blob length follows the row length */
+    head_length+= read_pack_length((uint) maria->s->pack.version,
+                                   header + head_length, &info->blob_len);
+    /*
+      Ensure that the record buffer is big enough for the compressed
+      record plus all expanded blobs. [We do not have an extra buffer
+      for the resulting blobs. Sigh.]
+    */
+    if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+                         info->rec_len + info->blob_len +
+                         maria->s->base.extra_rec_buff_size))
+      return BLOCK_FATAL_ERROR;			/* not enough memory */
+    bit_buff->blob_pos= *rec_buff_p + info->rec_len;
+    bit_buff->blob_end= bit_buff->blob_pos + info->blob_len;
+    maria->blob_length=info->blob_len;
+  }
+  info->filepos=filepos+head_length;	/* Packed row starts after header */
+  if (file >= 0)			/* Same test as for the read above */
+  {					/* Keep row bytes read with header */
+    info->offset=min(info->rec_len, ref_length - head_length);
+    memcpy(*rec_buff_p, header + head_length, info->offset);
+  }
+  return 0;
+}
+
+
+	/* Routines for the bit buffer */
+	/* Note: the buffer must be 6 uchar bigger than the longest row */
+
+static void init_bit_buffer(MARIA_BIT_BUFF *bit_buff,
+                            uchar *buffer, uint length)
+{
+  bit_buff->current_byte= 0;            /* Avoid purify errors */
+  bit_buff->error= bit_buff->bits= 0;   /* No bits cached, no error yet */
+  bit_buff->end=   buffer + length;
+  bit_buff->pos=   buffer;
+}
+
+static uint fill_and_get_bits(MARIA_BIT_BUFF *bit_buff, uint count)
+{
+  uint have= bit_buff->bits;            /* Bits still in current_byte */
+  uint need= count - have;              /* Bits to take after the refill */
+  uint high= (bit_buff->current_byte & mask[have]) << need;
+  fill_buffer(bit_buff);
+  bit_buff->bits= BITS_SAVED - need;
+  return high+(bit_buff->current_byte >> (BITS_SAVED - need));
+}
+
+	/* Fill in empty bit_buff->current_byte from buffer */
+	/* Sets bit_buff->error if buffer is exhausted */
+
+static void fill_buffer(MARIA_BIT_BUFF *bit_buff)
+{					/* pos[0] ends up as the highest byte */
+  if (bit_buff->pos >= bit_buff->end)
+  {					/* No input left */
+    bit_buff->error= 1;
+    bit_buff->current_byte=0;
+    return;
+  }
+#if BITS_SAVED == 64
+  bit_buff->current_byte=  ((((uint) ((uchar) bit_buff->pos[7]))) +
+			     (((uint) ((uchar) bit_buff->pos[6])) << 8) +
+			     (((uint) ((uchar) bit_buff->pos[5])) << 16) +
+			     (((uint) ((uchar) bit_buff->pos[4])) << 24) +
+			     ((ulonglong)
+			      ((((uint) ((uchar) bit_buff->pos[3]))) +
+			       (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+			       (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+			       (((uint) ((uchar) bit_buff->pos[0])) << 24)) << 32));
+  bit_buff->pos+=8;			/* 8 input bytes consumed */
+#else
+#if BITS_SAVED == 32
+  bit_buff->current_byte=  (((uint) ((uchar) bit_buff->pos[3])) +
+			     (((uint) ((uchar) bit_buff->pos[2])) << 8) +
+			     (((uint) ((uchar) bit_buff->pos[1])) << 16) +
+			     (((uint) ((uchar) bit_buff->pos[0])) << 24));
+  bit_buff->pos+=4;			/* 4 input bytes consumed */
+#else
+  bit_buff->current_byte=  (uint) (((uint) ((uchar) bit_buff->pos[1]))+
+				    (((uint) ((uchar) bit_buff->pos[0])) << 8));
+  bit_buff->pos+=2;			/* 2 input bytes consumed */
+#endif
+#endif
+}
+
+	/* Get number of bits needed to represent value */
+
+static uint max_bit(register uint value)
+{
+  uint bits_needed;                     /* Even 0 takes one bit */
+
+  for (bits_needed= 1; (value>>= 1) != 0; bits_needed++)
+    ;
+  return bits_needed;
+}
+
+
+/*****************************************************************************
+	Some redefined functions to handle files when we are using memmap
+*****************************************************************************/
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+#ifdef HAVE_MMAP
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+                                   MARIA_RECORD_POS filepos);
+static int _ma_read_rnd_mempack_record(MARIA_HA*, uchar *, MARIA_RECORD_POS,
+                                       my_bool);
+
+my_bool _ma_memmap_file(MARIA_HA *info)
+{					/* Returns 1 if memmap is now used */
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_memmap_file");
+
+  if (!info->s->file_map)
+  {					/* Not mapped yet */
+    if (my_seek(info->dfile.file, 0L, MY_SEEK_END, MYF(0)) <
+        share->state.state.data_file_length+MEMMAP_EXTRA_MARGIN)
+    {					/* File lacks the extra margin */
+      DBUG_PRINT("warning",("File isn't extended for memmap"));
+      DBUG_RETURN(0);
+    }
+    if (_ma_dynmap_file(info, share->state.state.data_file_length))
+      DBUG_RETURN(0);
+  }
+  info->opt_flag|= MEMMAP_USED;
+  info->read_record= share->read_record= _ma_read_mempack_record;
+  share->scan= _ma_read_rnd_mempack_record; /* Use the memmap readers */
+  DBUG_RETURN(1);
+}
+
+
+void _ma_unmap_file(MARIA_HA *info)
+{					/* Unmap file_map incl. extra margin */
+  VOID(my_munmap((char*) info->s->file_map,
+                 (size_t) info->s->mmaped_length + MEMMAP_EXTRA_MARGIN));
+}
+
+
+static uchar *
+_ma_mempack_get_block_info(MARIA_HA *maria,
+                           MARIA_BIT_BUFF *bit_buff,
+                           MARIA_BLOCK_INFO *info,
+                           uchar **rec_buff_p,
+                           size_t *rec_buff_size_p,
+                           uchar *header)
+{					/* Returns 0 on error */
+  header+= read_pack_length((uint) maria->s->pack.version, header,
+                            &info->rec_len);
+  if (maria->s->base.blobs)
+  {					/* A blob length follows the row length */
+    header+= read_pack_length((uint) maria->s->pack.version, header,
+                              &info->blob_len);
+    /* NOTE(review): _ma_alloc_buffer presumably sets my_errno on error */
+    if (_ma_alloc_buffer(rec_buff_p, rec_buff_size_p,
+                         info->blob_len + maria->s->base.extra_rec_buff_size))
+      return 0;				/* not enough memory */
+    bit_buff->blob_pos= *rec_buff_p;	/* Blobs are unpacked into rec_buff */
+    bit_buff->blob_end= *rec_buff_p + info->blob_len;
+  }
+  return header;			/* First byte after the length header */
+}
+
+
+static int _ma_read_mempack_record(MARIA_HA *info, uchar *buf,
+                                   MARIA_RECORD_POS filepos)
+{					/* Unpacks the row at file_map+filepos */
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos;
+  DBUG_ENTER("maria_read_mempack_record");
+
+  if (filepos == HA_OFFSET_ERROR)
+    DBUG_RETURN(my_errno);          /* _search() didn't find record */
+
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                &block_info, &info->rec_buff,
+                                                &info->rec_buff_size,
+						(uchar*) share->file_map+
+						filepos)))
+    DBUG_RETURN(my_errno);		/* Header could not be processed */
+  DBUG_RETURN(_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                  pos, block_info.rec_len));
+}
+
+
+/*ARGSUSED*/
+static int _ma_read_rnd_mempack_record(MARIA_HA *info,
+                                       uchar *buf,
+				       register MARIA_RECORD_POS filepos,
+				       my_bool skip_deleted_blocks
+				       __attribute__((unused)))
+{					/* Returns 0 or an error code */
+  MARIA_BLOCK_INFO block_info;
+  MARIA_SHARE *share= info->s;
+  uchar *pos,*start;
+  DBUG_ENTER("_ma_read_rnd_mempack_record");
+
+  if (filepos >= share->state.state.data_file_length)
+  {					/* Position is past the data */
+    my_errno=HA_ERR_END_OF_FILE;
+    goto err;
+  }
+  if (!(pos= (uchar*) _ma_mempack_get_block_info(info, &info->bit_buff,
+                                                &block_info,
+                                                &info->rec_buff,
+                                                &info->rec_buff_size,
+						(uchar*)
+                                                (start= share->file_map +
+						 filepos))))
+    goto err;
+#ifndef DBUG_OFF
+  if (block_info.rec_len > info->s->max_pack_length)
+  {					/* Only checked in debug builds */
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+    goto err;
+  }
+#endif
+  info->packed_length=block_info.rec_len;
+  info->cur_row.lastpos= filepos;
+  info->cur_row.nextpos= filepos+(uint) (pos-start)+block_info.rec_len;
+  info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+
+  DBUG_RETURN (_ma_pack_rec_unpack(info, &info->bit_buff, buf,
+                                   pos, block_info.rec_len));
+ err:
+  DBUG_RETURN(my_errno);
+}
+
+#endif /* HAVE_MMAP */
+
+	/* Save length of row */
+
+uint _ma_save_pack_length(uint version, uchar *block_buff, ulong length)
+{
+  if (length < 254)
+  {                                     /* The length is the first byte */
+    block_buff[0]= (uchar) length;
+    return 1;
+  }
+  if (length <= 65535)
+  {                                     /* Marker 254 + 2 byte length */
+    block_buff[0]= 254;
+    int2store(block_buff+1,(uint) length);
+    return 3;
+  }
+
+  block_buff[0]= 255;                   /* Marker for the long formats */
+  if (version != 1)
+  {                                     /* New format: 4 byte length */
+    int4store(block_buff + 1, (ulong) length);
+    return 5;
+  }
+
+  /* old format: only 3 bytes are available for the length */
+  DBUG_ASSERT(length <= 0xFFFFFF);
+  int3store(block_buff + 1, (ulong) length);
+  return 4;
+}
+
+
+static uint read_pack_length(uint version, const uchar *buf, ulong *length)
+{
+  switch (buf[0])
+  {
+  case 254:                             /* 2 byte length follows */
+    *length= uint2korr(buf + 1);
+    return 3;
+
+  case 255:
+    if (version == 1)                   /* old format */
+    {
+      *length= uint3korr(buf + 1);
+      return 4;
+    }
+    *length= uint4korr(buf + 1);
+    return 5;
+
+  default:                              /* Values below 254 */
+    *length= buf[0];
+    break;
+  }
+  return 1;
+}
+
+
+uint _ma_calc_pack_length(uint version, ulong length)
+{ /* Number of bytes _ma_save_pack_length() uses for this length */
+  return length >= 65536 ? (version == 1 ? 4 : 5) : length >= 254 ? 3 : 1;
+}
diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c
new file mode 100644
index 00000000000..a4423133270
--- /dev/null
+++ b/storage/maria/ma_page.c
@@ -0,0 +1,619 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Read and write key blocks
+
+  The basic structure of a key block is as follows:
+
+  LSN		7 (LSN_STORE_SIZE);     Log number for last change;
+  		Only for transactional pages
+  PACK_TRANSID  6 (TRANSID_SIZE);       Relative transid to pack page transid's
+  		Only for transactional pages
+  KEYNR		1 (KEYPAGE_KEYID_SIZE)  Which index this page belongs to
+  FLAG          1 (KEYPAGE_FLAG_SIZE)   Flags for page
+  PAGE_SIZE	2 (KEYPAGE_USED_SIZE)   How much of the page is used.
+  					high-byte-first
+
+  The flag is a combination of the following values:
+
+   KEYPAGE_FLAG_ISNOD            Page is a node
+   KEYPAGE_FLAG_HAS_TRANSID      There may be a transid on the page.
+
+  After this we store key data, either packed or not packed, directly
+  after each other.  If the page has the node flag set, there is a pointer
+  to the next key page at page start and after each key.
+
+  At end of page the last KEYPAGE_CHECKSUM_SIZE bytes are reserved for a
+  page checksum.
+*/
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+/**
+   Fill MARIA_PAGE structure for usage with _ma_write_keypage
+*/
+
+void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info,
+                    const MARIA_KEYDEF *keyinfo, my_off_t pos, uchar *buff)
+{
+  MARIA_SHARE *share= info->s;
+
+  page->info= info;
+  page->keyinfo= keyinfo;
+  page->pos= pos;
+  page->buff= buff;
+  /* Used length and flags are read from the page buffer */
+  page->flag= _ma_get_keypage_flag(share, buff);
+  page->org_size= page->size= _ma_get_page_used(share, buff);
+  page->node= 0;
+  if (page->flag & KEYPAGE_FLAG_ISNOD)
+    page->node= share->base.key_reflength;
+}
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page)
+{
+  uint length= page->size;		/* Used part of the page */
+  DBUG_ASSERT(length <= share->max_index_block_size);
+  bzero(page->buff + length, share->block_size - length); /* Zero the rest */
+}
+#endif
+
+
+/**
+  Fetch a key-page in memory
+
+  @fn _ma_fetch_keypage()
+  @param page		Fill this struct with information about read page
+  @param info		Maria handler
+  @param keyinfo        Key definition for used key
+  @param pos		Position for page (in bytes)
+  @param lock		Lock type for page
+  @param level		Importance of page; Priority for page cache
+  @param buff	        Buffer to use for page
+  @param return_buffer  Set to 1 if we want to force usage of buff
+
+  @return
+  @retval 0  ok
+  @retval 1  error
+*/
+
+my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info,
+                          const MARIA_KEYDEF *keyinfo,
+                          my_off_t pos, enum pagecache_page_lock lock,
+                          int level, uchar *buff,
+                          my_bool return_buffer __attribute__ ((unused)))
+{
+  uchar *tmp;
+  MARIA_PINNED_PAGE page_link;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_fetch_keypage");
+  DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size)));
+
+  tmp= pagecache_read(share->pagecache, &share->kfile,
+                      (pgcache_page_no_t) (pos / block_size), level, buff,
+                      share->page_type, lock, &page_link.link);
+
+  if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED)
+  {                                     /* Remember page for later unlock */
+    DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE || lock == PAGECACHE_LOCK_READ);
+    page_link.unlock= (lock == PAGECACHE_LOCK_WRITE ?
+                       PAGECACHE_LOCK_WRITE_UNLOCK :
+                       PAGECACHE_LOCK_READ_UNLOCK);
+    page_link.changed= 0;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+    page->link_offset= info->pinned_pages.elements-1;
+  }
+
+  if (tmp == info->buff)
+    info->keyread_buff_used=1;
+  else if (!tmp)
+  {                                     /* Read failed */
+    DBUG_PRINT("error",("Got errno: %d from pagecache_read",my_errno));
+    info->last_keypage=HA_OFFSET_ERROR;
+    maria_print_error(share, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(1);
+  }
+  info->last_keypage= pos;
+
+  /*
+    Setup page structure to make pages easy to use
+    This is the same as _ma_page_setup(), but inlined here as this is
+    used so often.
+  */
+  page->info=    info;
+  page->keyinfo= keyinfo;
+  page->buff=    tmp;
+  page->pos=     pos;
+  page->size=    _ma_get_page_used(share, tmp);
+  page->org_size= page->size;                    /* For debugging */
+  page->flag=    _ma_get_keypage_flag(share, tmp);
+  page->node=   ((page->flag & KEYPAGE_FLAG_ISNOD) ?
+                 share->base.key_reflength : 0);
+
+#ifdef EXTRA_DEBUG
+  {
+    uint page_size= page->size;
+    if (page_size < 4 || page_size > share->max_index_block_size ||
+        _ma_get_keynr(share, tmp) != keyinfo->key_nr)
+    {                                   /* Bad length or wrong index nr */
+      DBUG_PRINT("error",("page %lu had wrong page length: %u  keynr: %u",
+                          (ulong) (pos / block_size), page_size,
+                          _ma_get_keynr(share, tmp)));
+      DBUG_DUMP("page", tmp, page_size);
+      info->last_keypage = HA_OFFSET_ERROR;
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno= HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+  }
+#endif
+  DBUG_RETURN(0);
+} /* _ma_fetch_keypage */
+
+
+/* Write a key-page on disk */
+
+my_bool _ma_write_keypage(MARIA_PAGE *page, enum pagecache_page_lock lock,
+                          int level)
+{					/* Returns result of pagecache_write */
+  MARIA_SHARE *share= page->info->s;
+  uint block_size= share->block_size;
+  uchar *buff= page->buff;
+  my_bool res;
+  MARIA_PINNED_PAGE page_link;
+  DBUG_ENTER("_ma_write_keypage");
+
+  /*
+    The following ensures that for transactional tables we have logged
+    all changes that changes the page size (as the logging code sets
+    page->org_size)
+  */
+  DBUG_ASSERT(!share->now_transactional || page->size == page->org_size);
+
+#ifdef EXTRA_DEBUG				/* Safety check */
+  {
+    uint page_length, nod_flag;
+    page_length= _ma_get_page_used(share, buff);
+    nod_flag=    _ma_test_if_nod(share, buff);
+
+    DBUG_ASSERT(page->size == page_length);
+    DBUG_ASSERT(page->flag == _ma_get_keypage_flag(share, buff));
+
+    if (page->pos < share->base.keystart ||
+        page->pos+block_size > share->state.state.key_file_length ||
+        (page->pos & (maria_block_size-1)))
+    {					/* Out of range or unaligned pos */
+      DBUG_PRINT("error",("Trying to write inside key status region: "
+                          "key_start: %lu  length: %lu  page_pos: %lu",
+                          (long) share->base.keystart,
+                          (long) share->state.state.key_file_length,
+                          (long) page->pos));
+      my_errno=EINVAL;
+      DBUG_ASSERT(0);
+      DBUG_RETURN(1);
+    }
+    DBUG_PRINT("page",("write page at: %lu",(ulong) (page->pos / block_size)));
+    DBUG_DUMP("buff", buff, page_length);
+    DBUG_ASSERT(page_length >= share->keypage_header + nod_flag +
+                page->keyinfo->minlength || maria_in_recovery);
+  }
+#endif
+
+  /* Verify that keynr is correct */
+  DBUG_ASSERT(_ma_get_keynr(share, buff) == page->keyinfo->key_nr);
+
+#if defined(EXTRA_DEBUG) && defined(HAVE_valgrind) && defined(NOT_ANYMORE)
+  {
+    /* This is here to catch uninitialized bytes */
+    uint length= page->size;
+    ulong crc= my_checksum(0, buff, length);
+    int4store(buff + block_size - KEYPAGE_CHECKSUM_SIZE, crc);
+  }
+#endif
+
+  page_cleanup(share, page);
+  res= pagecache_write(share->pagecache,
+                       &share->kfile,
+                       (pgcache_page_no_t) (page->pos / block_size),
+                       level, buff, share->page_type,
+                       lock,
+                       lock == PAGECACHE_LOCK_LEFT_WRITELOCKED ?
+                       PAGECACHE_PIN_LEFT_PINNED :
+                       (lock == PAGECACHE_LOCK_WRITE_UNLOCK ?
+                        PAGECACHE_UNPIN : PAGECACHE_PIN),
+                       PAGECACHE_WRITE_DELAY, &page_link.link,
+		       LSN_IMPOSSIBLE);
+
+  if (lock == PAGECACHE_LOCK_WRITE)
+  {
+    /* It was not locked before, we have to unlock it when we unpin pages */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&page->info->pinned_pages, (void*) &page_link);
+  }
+  DBUG_RETURN(res);			/* Push result is not checked */
+}
+
+
+/**
+  @brief Put page in free list
+
+  @fn    _ma_dispose()
+  @param info		Maria handle
+  @param pos	 	Address to page
+  @param page_not_read  1 if page has not yet been read
+
+  @note
+    The page at 'pos' must have been read with a write lock.
+    This function does logging (unlike _ma_new()).
+
+  @return
+  @retval  0    ok
+  @retval  1    error
+
+*/
+
+int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read)
+{
+  my_off_t old_link;                    /* key_del_current before this call */
+  uchar buff[MAX_KEYPAGE_HEADER_SIZE+ 8 + 2];
+  ulonglong page_no;
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE page_link;
+  uint block_size= share->block_size;
+  int result= 0;
+  enum pagecache_page_lock lock_method;
+  enum pagecache_page_pin pin_method;
+  DBUG_ENTER("_ma_dispose");
+  DBUG_PRINT("enter",("page: %lu", (ulong) (pos / block_size)));
+  DBUG_ASSERT(pos % block_size == 0);
+
+  (void) _ma_lock_key_del(info, 0);     /* return value is ignored */
+
+  old_link= share->key_del_current;
+  share->key_del_current= pos;          /* disposed page is the new list head */
+  page_no= pos / block_size;
+  bzero(buff, share->keypage_header);
+  _ma_store_keynr(share, buff, (uchar) MARIA_DELETE_KEY_NR);
+  _ma_store_page_used(share, buff, share->keypage_header + 8);
+  mi_sizestore(buff + share->keypage_header, old_link); /* link to old head */
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+
+  if (share->now_transactional)
+  {
+    LSN lsn;
+    uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2];
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    my_off_t page;
+
+    /* Store address of deleted page */
+    page_store(log_data + FILEID_STORE_SIZE, page_no);
+
+    /* Store link to next unused page (the link that is written to page) */
+    page= (old_link == HA_OFFSET_ERROR ? IMPOSSIBLE_PAGE_NO :
+           old_link / block_size);
+    page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+    if (translog_write_record(&lsn, LOGREC_REDO_INDEX_FREE_PAGE,
+                              info->trn, info,
+                              (translog_size_t) sizeof(log_data),
+                              TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                              log_data, NULL))
+      result= 1;                        /* noted; the page is still written */
+  }
+
+  if (page_not_read)
+  {
+    lock_method= PAGECACHE_LOCK_WRITE;
+    pin_method= PAGECACHE_PIN;
+  }
+  else
+  {
+    lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED;
+    pin_method= PAGECACHE_PIN_LEFT_PINNED;
+  }
+
+  if (pagecache_write_part(share->pagecache,
+                           &share->kfile, (pgcache_page_no_t) page_no,
+                           PAGECACHE_PRIORITY_LOW, buff,
+                           share->page_type,
+                           lock_method, pin_method,
+                           PAGECACHE_WRITE_DELAY, &page_link.link,
+			   LSN_IMPOSSIBLE,
+                           0, share->keypage_header + 8))
+    result= 1;
+
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+  {
+    uchar *page_buff= pagecache_block_link_to_buffer(page_link.link);
+    bzero(page_buff + share->keypage_header + 8,
+          block_size - share->keypage_header - 8 - KEYPAGE_CHECKSUM_SIZE);
+  }
+#endif
+
+  if (page_not_read)
+  {
+    /* It was not locked before, we have to unlock it when we unpin pages */
+    page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK;
+    page_link.changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) &page_link);
+  }
+
+  DBUG_RETURN(result);
+} /* _ma_dispose */
+
+
+/**
+  @brief Get address for free page to use
+
+  @fn     _ma_new()
+  @param  info		Maria handle
+  @param  level         Type of key block (caching priority for pagecache)
+  @param  page_link	Pointer to page in page cache if read. One can
+                        check if this is used by checking if
+                        page_link->changed != 0
+
+  @note Logging of this is left to the caller (so that the "new"ing and the
+  first changes done to this new page can be logged as one single entry - one
+  single _ma_log_new()) call).
+
+  @return
+    HA_OFFSET_ERROR     File is full or page read error
+    #		        Page address to use
+*/
+
+my_off_t _ma_new(register MARIA_HA *info, int level,
+                 MARIA_PINNED_PAGE **page_link)
+
+{
+  my_off_t pos;
+  MARIA_SHARE *share= info->s;
+  uint block_size= share->block_size;
+  DBUG_ENTER("_ma_new");
+
+  if (_ma_lock_key_del(info, 1))        /* non-zero: take a page at file end */
+  {
+    pthread_mutex_lock(&share->intern_lock);
+    pos= share->state.state.key_file_length;
+    if (pos >= share->base.max_key_file_length - block_size)
+    {
+      my_errno=HA_ERR_INDEX_FILE_FULL;
+      pthread_mutex_unlock(&share->intern_lock);
+      DBUG_RETURN(HA_OFFSET_ERROR);
+    }
+    share->state.state.key_file_length+= block_size;
+    /* Following is for not transactional tables */
+    info->state->key_file_length= share->state.state.key_file_length;
+    pthread_mutex_unlock(&share->intern_lock);
+    (*page_link)->changed= 0;           /* page cache was not used */
+    (*page_link)->write_lock= PAGECACHE_LOCK_WRITE;
+  }
+  else
+  {
+    uchar *buff;
+    pos= share->key_del_current;                /* Protected */
+    DBUG_ASSERT(share->pagecache->block_size == block_size);
+    if (!(buff= pagecache_read(share->pagecache,
+                               &share->kfile,
+                               (pgcache_page_no_t) (pos / block_size), level,
+                               0, share->page_type,
+                               PAGECACHE_LOCK_WRITE, &(*page_link)->link)))
+      pos= HA_OFFSET_ERROR;             /* page_link is still pushed below */
+    else
+    {
+      /*
+        Next deleted page's number is in the header of the present page
+        (single linked list):
+      */
+#ifndef DBUG_OFF
+      my_off_t key_del_current;
+#endif
+      share->key_del_current= mi_sizekorr(buff+share->keypage_header);
+#ifndef DBUG_OFF
+      key_del_current= share->key_del_current;
+      DBUG_ASSERT((key_del_current != 0) &&
+                  ((key_del_current == HA_OFFSET_ERROR) ||
+                   (key_del_current <=
+                    (share->state.state.key_file_length - block_size))));
+#endif
+    }
+
+    (*page_link)->unlock=     PAGECACHE_LOCK_WRITE_UNLOCK;
+    (*page_link)->write_lock= PAGECACHE_LOCK_WRITE;
+    /*
+      We have to mark it changed as _ma_flush_pending_blocks() uses
+      'changed' to know if we used the page cache or not
+    */
+    (*page_link)->changed= 1;
+    push_dynamic(&info->pinned_pages, (void*) *page_link);
+    *page_link= dynamic_element(&info->pinned_pages,  /* point at stored copy */
+                                info->pinned_pages.elements-1,
+                                MARIA_PINNED_PAGE *);
+  }
+  share->state.changed|= STATE_NOT_SORTED_PAGES;
+  DBUG_PRINT("exit",("Pos: %ld",(long) pos));
+  DBUG_RETURN(pos);
+} /* _ma_new */
+
+
+/**
+   Log compaction of an index page
+*/
+
+static my_bool _ma_log_compact_keypage(MARIA_PAGE *ma_page,
+                                       TrID min_read_from)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 7 + TRANSID_SIZE];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  uint translog_parts, extra_length;
+  my_off_t page= ma_page->pos;
+  DBUG_ENTER("_ma_log_compact_keypage");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) (page / share->block_size)));
+
+  /* Store page number of the page being compacted */
+  page/= share->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page);
+
+  log_pos= log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+  log_pos[0]= KEY_OP_COMPACT_PAGE;
+  transid_store(log_pos + 1, min_read_from);
+  log_pos+= 1 + TRANSID_SIZE;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  translog_parts= 1;   /* NOTE(review): the slot handed to the call below is */
+  extra_length= 0;     /* one past the end of log_array; confirm it is unused */
+
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            log_array[TRANSLOG_INTERNAL_PARTS +
+                                      0].length + extra_length,
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   Remove all transaction id's less than given one from a key page
+
+   @fn    _ma_compact_keypage()
+   @param ma_page        Key page to compact (info, keyinfo, buff, size, flag)
+   @param min_read_from  Remove all trids from page less than this; the code
+                         strips an id when this is ~0 or < the stored id
+                         (NOTE(review): confirm the intended direction)
+
+   @retval 0             Ok
+   @retval 1             Error;  my_errno contains the error
+*/
+
+my_bool _ma_compact_keypage(MARIA_PAGE *ma_page, TrID min_read_from)
+{
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  uchar *page, *endpos, *start_of_empty_space;
+  uint page_flag, nod_flag, saved_space;
+  my_bool page_has_transid;
+  DBUG_ENTER("_ma_compact_keypage");
+
+  page_flag= ma_page->flag;
+  if (!(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+    DBUG_RETURN(0);                    /* No transaction id on page */
+
+  nod_flag= ma_page->node;
+  page=    ma_page->buff;
+  endpos= page + ma_page->size;
+  key.data= info->lastkey_buff;
+  key.keyinfo= (MARIA_KEYDEF*) ma_page->keyinfo;
+
+  page_has_transid= 0;
+  page+= share->keypage_header + nod_flag;
+  key.data[0]= 0;                             /* safety */
+  start_of_empty_space= 0;
+  saved_space= 0;
+  do
+  {
+    if (!(page= (*ma_page->keyinfo->skip_key)(&key, 0, 0, page)))
+    {
+      DBUG_PRINT("error",("Couldn't find last key:  page_pos: 0x%lx",
+                          (long) page));      /* page is 0 at this point */
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(1);
+    }
+    if (key_has_transid(page-1))
+    {
+      uint transid_length;
+      transid_length= transid_packed_length(page);
+
+      if (min_read_from == ~(TrID) 0 ||
+          min_read_from < transid_get_packed(share, page))
+      {
+        page[-1]&= 254;                           /* Remove transid marker */
+        transid_length= transid_packed_length(page);
+        if (start_of_empty_space)
+        {
+          /* Move block before the transid up in page */
+          uint copy_length= (uint) (page - start_of_empty_space) - saved_space;
+          memmove(start_of_empty_space, start_of_empty_space + saved_space,
+                  copy_length);
+          start_of_empty_space+= copy_length;
+        }
+        else
+          start_of_empty_space= page;         /* first removed transid */
+        saved_space+= transid_length;
+      }
+      else
+        page_has_transid= 1;                /* At least one id left */
+      page+= transid_length;
+    }
+    page+= nod_flag;
+  } while (page < endpos);
+
+  DBUG_ASSERT(page == endpos);
+
+  if (start_of_empty_space)
+  {
+    /*
+      Move last block down
+      This is always true if any transid was removed
+    */
+    uint copy_length= (uint) (endpos - start_of_empty_space) - saved_space;
+
+    if (copy_length)
+      memmove(start_of_empty_space, start_of_empty_space + saved_space,
+              copy_length);
+    ma_page->size= (uint) (start_of_empty_space + copy_length - ma_page->buff);
+    page_store_size(share, ma_page);
+  }
+
+  if (!page_has_transid)
+  {
+    ma_page->flag&= ~KEYPAGE_FLAG_HAS_TRANSID;
+    _ma_store_keypage_flag(share, ma_page->buff, ma_page->flag);
+    /* Clear packed transid (in case of zerofill) */
+    bzero(ma_page->buff + LSN_STORE_SIZE, TRANSID_SIZE);
+  }
+
+  if (share->now_transactional)
+  {
+    if (_ma_log_compact_keypage(ma_page, min_read_from))
+      DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c
new file mode 100644
index 00000000000..441310a60ea
--- /dev/null
+++ b/storage/maria/ma_pagecache.c
@@ -0,0 +1,5104 @@
+/* Copyright (C) 2000-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  These functions handle page caching for Maria tables.
+
+  One cache can handle many files.
+  It must contain buffers of the same blocksize.
+  init_pagecache() should be used to init cache handler.
+
+  The free list (free_block_list) is a stack like structure.
+  When a block is freed by free_block(), it is pushed onto the stack.
+  When a new block is required it is first tried to pop one from the stack.
+  If the stack is empty, it is tried to get a never-used block from the pool.
+  If this is empty too, then a block is taken from the LRU ring, flushing it
+  to disk, if necessary. This is handled in find_block().
+  With the new free list, the blocks can have three temperatures:
+  hot, warm and cold (which is free). This is remembered in the block header
+  by the enum PCBLOCK_TEMPERATURE temperature variable. Remembering the
+  temperature is necessary to correctly count the number of warm blocks,
+  which is required to decide when blocks are allowed to become hot. Whenever
+  a block is inserted to another (sub-)chain, we take the old and new
+  temperature into account to decide if we got one more or less warm block.
+  blocks_unused is the sum of never used blocks in the pool and of currently
+  free blocks. blocks_used is the number of blocks fetched from the pool and
+  as such gives the maximum number of in-use blocks at any time.
+
+  TODO: Write operation locks whole cache till the end of the operation.
+    Should be fixed.
+*/
+
+#include "maria_def.h"
+#include <m_string.h>
+#include "ma_pagecache.h"
+#include "ma_blockrec.h"
+#include <my_bit.h>
+#include <errno.h>
+
+/*
+  Some compilation flags have been added specifically for this module
+  to control the following:
+  - not to let a thread to yield the control when reading directly
+    from page cache, which might improve performance in many cases;
+    to enable this add:
+    #define SERIALIZED_READ_FROM_CACHE
+  - to set an upper bound for number of threads simultaneously
+    using the page cache; this setting helps to determine an optimal
+    size for hash table and improve performance when the number of
+    blocks in the page cache much less than the number of threads
+    accessing it;
+    to set this number equal to <N> add
+      #define MAX_THREADS <N>
+  - to substitute calls of pthread_cond_wait for calls of
+    pthread_cond_timedwait (wait with timeout set up);
+    this setting should be used only when you want to trap a deadlock
+    situation, which theoretically should not happen;
+    to set timeout equal to <T> seconds add
+      #define PAGECACHE_TIMEOUT <T>
+  - to enable the module traps and to send debug information from
+    page cache module to a special debug log add:
+      #define PAGECACHE_DEBUG
+    the name of this debug log file <LOG NAME> can be set through:
+      #define PAGECACHE_DEBUG_LOG  <LOG NAME>
+    if the name is not defined, it's set by default;
+    if the PAGECACHE_DEBUG flag is not set up and we are in a debug
+    mode, i.e. when ! defined(DBUG_OFF), the debug information from the
+    module is sent to the regular debug log.
+
+  Example of the settings:
+    #define SERIALIZED_READ_FROM_CACHE
+    #define MAX_THREADS   100
+    #define PAGECACHE_TIMEOUT  1
+    #define PAGECACHE_DEBUG
+    #define PAGECACHE_DEBUG_LOG  "my_pagecache_debug.log"
+*/
+
+/*
+  In the key cache we have external raw locking; here we use
+  SERIALIZED_READ_FROM_CACHE to avoid the problem of reading
+  inconsistent data from the page.
+  (keycache functions (key_cache_read(), key_cache_insert() and
+  key_cache_write()) rely on external MyISAM lock, we don't)
+*/
+#define SERIALIZED_READ_FROM_CACHE yes
+
+#define PCBLOCK_INFO(B) /* print the state of block B to the "info" log */ \
+  DBUG_PRINT("info", \
+             ("block: 0x%lx  fd: %lu  page: %lu  s: %0x  hshL: " \
+              " 0x%lx  req: %u/%u wrlocks: %u  rdlocks %u  " \
+              "rdlocks_q: %u  pins: %u  status: %u  type: %s", \
+              (ulong)(B), \
+              (ulong)((B)->hash_link ? \
+                      (B)->hash_link->file.file : \
+                      0), \
+              (ulong)((B)->hash_link ? \
+                      (B)->hash_link->pageno : \
+                      0), \
+              (B)->status, \
+              (ulong)(B)->hash_link, \
+              (uint) (B)->requests, \
+              (uint)((B)->hash_link ? \
+                     (B)->hash_link->requests : \
+                       0), \
+              (B)->wlocks, (B)->rlocks, (B)->rlocks_queue, \
+              (uint)(B)->pins, (uint)(B)->status, \
+              page_cache_page_type_str[(B)->type]))
+
+/* TODO: put it to my_static.c */
+my_bool my_disable_flush_pagecache_blocks= 0;
+
+#define STRUCT_PTR(TYPE, MEMBER, a)                                           \
+          (TYPE *) ((char *) (a) - offsetof(TYPE, MEMBER))
+
+/* types of condition variables */
+#define  COND_FOR_REQUESTED 0  /* queue of thread waiting for read operation */
+#define  COND_FOR_SAVED     1  /* queue of thread waiting for flush */
+#define  COND_FOR_WRLOCK    2  /* queue of write lock */
+#define  COND_SIZE          3  /* number of COND_* queues */
+
+typedef pthread_cond_t KEYCACHE_CONDVAR;
+
+/* descriptor of the page in the page cache block buffer */
+struct st_pagecache_page
+{
+  PAGECACHE_FILE file;    /* file to which the page belongs to  */
+  pgcache_page_no_t pageno; /* number of the page in the file   */
+};
+
+/* element in the chain of a hash table bucket */
+struct st_pagecache_hash_link
+{
+  struct st_pagecache_hash_link
+    *next, **prev;                   /* to connect links in the same bucket  */
+  struct st_pagecache_block_link
+    *block;                          /* reference to the block for the page  */
+  PAGECACHE_FILE file;               /* file the page belongs to             */
+  pgcache_page_no_t pageno;            /* number of the page in that file      */
+  uint requests;                     /* number of requests for the page      */
+};
+
+/* simple states of a block */
+#define PCBLOCK_ERROR       1 /* an error occurred when performing disk i/o  */
+#define PCBLOCK_READ        2 /* the page is in the block buffer             */
+#define PCBLOCK_IN_SWITCH   4 /* block is preparing to read new page         */
+#define PCBLOCK_REASSIGNED  8 /* block does not accept requests for old page */
+#define PCBLOCK_IN_FLUSH   16 /* block is in flush operation                 */
+#define PCBLOCK_CHANGED    32 /* block buffer contains a dirty page          */
+#define PCBLOCK_DIRECT_W   64 /* possible direct write to the block          */
+
+/* page status, returned by find_block */
+#define PAGE_READ               0
+#define PAGE_TO_BE_READ         1
+#define PAGE_WAIT_TO_BE_READ    2
+
+/* block temperature determines in which (sub-)chain the block currently is */
+enum PCBLOCK_TEMPERATURE { PCBLOCK_COLD /*free*/ , PCBLOCK_WARM , PCBLOCK_HOT };
+
+/* debug info */
+#ifndef DBUG_OFF
+static const char *page_cache_page_type_str[]=
+{
+  /* used only for control page type changing during debugging */
+  "EMPTY",
+  "PLAIN",
+  "LSN",
+  "READ_UNKNOWN"
+};
+
+static const char *page_cache_page_write_mode_str[]=
+{
+  "DELAY",
+  "DONE"
+};
+
+static const char *page_cache_page_lock_str[]=
+{
+  "free -> free",
+  "read -> read",
+  "write -> write",
+  "free -> read",
+  "free -> write",
+  "read -> free",
+  "write -> free",
+  "write -> read"
+};
+
+static const char *page_cache_page_pin_str[]=
+{
+  "pinned -> pinned",
+  "unpinned -> unpinned",
+  "unpinned -> pinned",
+  "pinned -> unpinned"
+};
+
+
+typedef struct st_pagecache_pin_info
+{
+  struct st_pagecache_pin_info *next, **prev;
+  struct st_my_thread_var *thread;
+}  PAGECACHE_PIN_INFO;
+
+/*
+  st_pagecache_lock_info structure should be kept in next, prev, thread part
+  compatible with st_pagecache_pin_info to be compatible in functions.
+*/
+
+typedef struct st_pagecache_lock_info
+{
+  struct st_pagecache_lock_info *next, **prev;
+  struct st_my_thread_var *thread;
+  my_bool write_lock;
+} PAGECACHE_LOCK_INFO;
+
+
+/* service functions maintain debugging info about pin & lock */
+
+
+/*
+  Links information about thread pinned/locked the block to the list
+
+  SYNOPSIS
+    info_link()
+    list                 the list to link in
+    node                 the node which should be linked
+*/
+
+static void info_link(PAGECACHE_PIN_INFO **list, PAGECACHE_PIN_INFO *node)
+{
+  node->prev= list;                     /* node is reached through the head */
+  if ((node->next= *list) != 0)         /* old head, if any, follows node */
+    node->next->prev= &node->next;
+  *list= node;
+}
+
+
+/*
+  Unlinks information about thread pinned/locked the block from the list
+
+  SYNOPSIS
+    info_unlink()
+    node                 the node which should be unlinked
+*/
+
+static void info_unlink(PAGECACHE_PIN_INFO *node)
+{
+  PAGECACHE_PIN_INFO *successor= node->next, **back_ref= node->prev;
+  if ((*back_ref= successor) != 0) successor->prev= back_ref;
+}
+
+
+/*
+  Finds information about given thread in the list of threads which
+  pinned/locked this block.
+
+  SYNOPSIS
+    info_find()
+    list                 the list where to find the thread
+    thread               thread ID (reference to the st_my_thread_var
+                         of the thread)
+    any                  return any thread of the list
+
+  RETURN
+    0 - the thread was not found
+    pointer to the information node of the thread in the list, or, if 'any',
+    to any thread of the list.
+*/
+
+static PAGECACHE_PIN_INFO *info_find(PAGECACHE_PIN_INFO *list,
+                                     struct st_my_thread_var *thread,
+                                     my_bool any)
+{
+  PAGECACHE_PIN_INFO *node;
+  /* With 'any' set the list head is returned, even if the list is empty */
+  if (any)
+    return list;
+  for (node= list; node != 0 && node->thread != thread; node= node->next)
+  {}
+  return node;
+}
+
+#endif /* !DBUG_OFF */
+
+/* page cache block */
+struct st_pagecache_block_link
+{
+  struct st_pagecache_block_link
+    *next_used, **prev_used;   /* to connect links in the LRU chain (ring)   */
+  struct st_pagecache_block_link
+    *next_changed, **prev_changed; /* for lists of file dirty/clean blocks   */
+  struct st_pagecache_hash_link
+    *hash_link;           /* backward ptr to referring hash_link             */
+#ifndef DBUG_OFF
+  PAGECACHE_PIN_INFO *pin_list;   /* debug only: threads that pinned block   */
+  PAGECACHE_LOCK_INFO *lock_list; /* debug only: threads that locked block   */
+#endif
+  KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event    */
+  uchar *buffer;           /* buffer for the block page                      */
+  pthread_t write_locker;  /* presumably the thread holding the write lock   */
+
+  ulonglong last_hit_time; /* timestamp of the last hit                      */
+  WQUEUE
+    wqueue[COND_SIZE];    /* queues on waiting requests for new/old pages    */
+  uint32 requests;        /* number of requests for the block                */
+  uint32 pins;            /* pin counter                                     */
+  uint32 wlocks;          /* write locks counter                             */
+  uint32 rlocks;          /* read locks counter                              */
+  uint32 rlocks_queue;    /* rd. locks waiting wr. lock of this thread       */
+  uint16 status;          /* state of the block                              */
+  int16  error;           /* error code for block in case of error */
+  enum PCBLOCK_TEMPERATURE temperature; /* block temperature: cold, warm, hot*/
+  enum pagecache_page_type type; /* type of the block                        */
+  uint hits_left;         /* number of hits left until promotion             */
+  /** @brief LSN when first became dirty; LSN_MAX means "not yet set"        */
+  LSN rec_lsn;
+};
+
+/** @brief information describing a run of flush_pagecache_blocks_int() */
+struct st_file_in_flush
+{
+  File file;              /* presumably the file this flush run works on */
+  /**
+     @brief threads waiting for the thread currently flushing this file to be
+     done
+  */
+  WQUEUE flush_queue;
+  /**
+     @brief if the thread currently flushing the file has a non-empty
+     first_in_switch list.
+  */
+  my_bool first_in_switch;
+};
+
+#ifndef DBUG_OFF
+/* debug checks */
+
+#ifdef NOT_USED
+static my_bool info_check_pin(PAGECACHE_BLOCK_LINK *block,
+                              enum pagecache_page_pin mode
+                              __attribute__((unused)))
+{
+  struct st_my_thread_var *thread= my_thread_var;
+  PAGECACHE_PIN_INFO *info= info_find(block->pin_list, thread, 0);
+  DBUG_ENTER("info_check_pin");  /* returns 1 if 'mode' contradicts pin_list */
+  DBUG_PRINT("enter", ("thread: 0x%lx  pin: %s",
+                       (ulong) thread, page_cache_page_pin_str[mode]));
+  if (info)                      /* this thread has an entry in pin_list */
+  {
+    if (mode == PAGECACHE_PIN_LEFT_UNPINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; LEFT_UNPINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_PIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; PIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  else                           /* no entry for this thread in pin_list */
+  {
+    if (mode == PAGECACHE_PIN_LEFT_PINNED)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; LEFT_PINNED!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+    else if (mode == PAGECACHE_UNPIN)
+    {
+      DBUG_PRINT("info",
+                 ("info_check_pin: thread: 0x%lx  block: 0x%lx  ; UNPIN!!!",
+                  (ulong)thread, (ulong)block));
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Debug function which checks current lock/pin state and requested changes
+
+  SYNOPSIS
+    info_check_lock()
+    lock                 requested lock changes
+    pin                  requested pin changes
+
+  RETURN
+    0 - OK
+    1 - Error
+*/
+
+static my_bool info_check_lock(PAGECACHE_BLOCK_LINK *block,
+                               enum pagecache_page_lock lock,
+                               enum pagecache_page_pin pin)
+{
+  struct st_my_thread_var *thread= my_thread_var;
+  PAGECACHE_LOCK_INFO *info=     /* this thread's entry in lock_list, or 0 */
+    (PAGECACHE_LOCK_INFO *) info_find((PAGECACHE_PIN_INFO *) block->lock_list,
+                                      thread, 0);
+  DBUG_ENTER("info_check_lock");
+  switch(lock) {
+  case PAGECACHE_LOCK_LEFT_UNLOCKED:
+    if (pin != PAGECACHE_PIN_LEFT_UNPINNED ||
+        info)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_LEFT_READLOCKED:
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_PIN_LEFT_PINNED) ||
+        info == 0 || info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_LEFT_WRITELOCKED:
+    if (pin != PAGECACHE_PIN_LEFT_PINNED ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_READ:
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_PIN) ||
+        info != 0)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE:
+    if (pin != PAGECACHE_PIN ||
+        info != 0)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_READ_UNLOCK:
+    if ((pin != PAGECACHE_PIN_LEFT_UNPINNED &&
+         pin != PAGECACHE_UNPIN) ||
+        info == 0 || info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE_UNLOCK:
+    if (pin != PAGECACHE_UNPIN ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  case PAGECACHE_LOCK_WRITE_TO_READ:
+    if ((pin != PAGECACHE_PIN_LEFT_PINNED &&
+         pin != PAGECACHE_UNPIN) ||
+        info == 0 || !info->write_lock)
+      goto error;
+    break;
+  }
+  DBUG_RETURN(0);
+error:                           /* requested change contradicts lock_list */
+  DBUG_PRINT("info",
+             ("info_check_lock: thread: 0x%lx block 0x%lx: info: %d wrt: %d,"
+              "to lock: %s, to pin: %s",
+              (ulong)thread, (ulong)block, test(info),
+              (info ? info->write_lock : 0),
+              page_cache_page_lock_str[lock],
+              page_cache_page_pin_str[pin]));
+  DBUG_RETURN(1);
+}
+#endif /* NOT_USED */
+#endif /* !DBUG_OFF */
+
+#define FLUSH_CACHE         2000            /* sort this many blocks at once */
+
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block);
+#ifndef DBUG_OFF
+static void test_key_cache(PAGECACHE *pagecache,
+                           const char *where, my_bool lock);
+#endif
+
+#define PAGECACHE_HASH(p, f, pos) (((ulong) (pos) +                          \
+                                    (ulong) (f).file) & (p->hash_entries-1))
+#define FILE_HASH(f) ((uint) (f).file & (PAGECACHE_CHANGED_BLOCKS_HASH - 1))
+
+#define DEFAULT_PAGECACHE_DEBUG_LOG  "pagecache_debug.log"
+
+#if defined(PAGECACHE_DEBUG) && ! defined(PAGECACHE_DEBUG_LOG)
+#define PAGECACHE_DEBUG_LOG  DEFAULT_PAGECACHE_DEBUG_LOG
+#endif
+
+#if defined(PAGECACHE_DEBUG_LOG)
+static FILE *pagecache_debug_log= NULL;
+static void pagecache_debug_print _VARARGS((const char *fmt, ...));
+#define PAGECACHE_DEBUG_OPEN                                                  \
+          if (!pagecache_debug_log)                                           \
+          {  /* a failed fopen() leaves the log disabled */                   \
+            if ((pagecache_debug_log= fopen(PAGECACHE_DEBUG_LOG, "w")))       \
+              (void) setvbuf(pagecache_debug_log, NULL, _IOLBF, BUFSIZ);      \
+          }
+
+#define PAGECACHE_DEBUG_CLOSE                                                 \
+          if (pagecache_debug_log)                                            \
+          {                                                                   \
+            fclose(pagecache_debug_log);                                      \
+            pagecache_debug_log= 0;                                           \
+          }
+#else
+#define PAGECACHE_DEBUG_OPEN
+#define PAGECACHE_DEBUG_CLOSE
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#if defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG)
+#define KEYCACHE_DBUG_PRINT(l, m)                                             \
+            { if (pagecache_debug_log)                                        \
+                fprintf(pagecache_debug_log, "%s: ", l);                      \
+              pagecache_debug_print m; }
+
+#define KEYCACHE_DBUG_ASSERT(a)                                               \
+            { if (! (a) && pagecache_debug_log)                               \
+                fclose(pagecache_debug_log);                                  \
+              assert(a); }
+#else
+#define KEYCACHE_DBUG_PRINT(l, m)  DBUG_PRINT(l, m)
+#define KEYCACHE_DBUG_ASSERT(a)    DBUG_ASSERT(a)
+#endif /* defined(PAGECACHE_DEBUG_LOG) && defined(PAGECACHE_DEBUG) */
+
+/*
+  Thread tracing helpers. They produce output only when PAGECACHE_DEBUG is
+  defined or DBUG is enabled; otherwise they expand to nothing.
+*/
+#if defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF)
+#ifdef THREAD
+/*
+  Id stored by the most recent KEYCACHE_THREAD_TRACE_BEGIN. This is a single
+  file-scope variable, not per-thread storage.
+*/
+static long pagecache_thread_id;
+/* Print "|thread <id>" under keyword l, using the last stored id */
+#define KEYCACHE_THREAD_TRACE(l)                                              \
+             KEYCACHE_DBUG_PRINT(l,("|thread %ld",pagecache_thread_id))
+
+/* Store the id of the current thread and print "[thread <id>" */
+#define KEYCACHE_THREAD_TRACE_BEGIN(l)                                        \
+            { struct st_my_thread_var *thread_var= my_thread_var;             \
+              pagecache_thread_id= thread_var->id;                            \
+              KEYCACHE_DBUG_PRINT(l,("[thread %ld",pagecache_thread_id)) }
+
+/* Print "]thread <id>" under keyword l, using the last stored id */
+#define KEYCACHE_THREAD_TRACE_END(l)                                          \
+            KEYCACHE_DBUG_PRINT(l,("]thread %ld",pagecache_thread_id))
+#else /* THREAD */
+/* Without THREAD there is no thread id to report: print an empty message */
+#define KEYCACHE_THREAD_TRACE(l)        KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_BEGIN(l)  KEYCACHE_DBUG_PRINT(l,(""))
+#define KEYCACHE_THREAD_TRACE_END(l)    KEYCACHE_DBUG_PRINT(l,(""))
+#endif /* THREAD */
+#else
+/* Tracing compiled out */
+#define KEYCACHE_THREAD_TRACE_BEGIN(l)
+#define KEYCACHE_THREAD_TRACE_END(l)
+#define KEYCACHE_THREAD_TRACE(l)
+#endif /* defined(PAGECACHE_DEBUG) || !defined(DBUG_OFF) */
+
+/*
+  Index helpers (used for debug output).
+
+  PCBLOCK_NUMBER(p, b)              index of block b counted from
+                                    p->block_root
+  PAGECACHE_HASH_LINK_NUMBER(p, h)  index of hash link h counted from
+                                    p->hash_link_root
+
+  Both compute a byte distance divided by the element size and cast the
+  result to uint. Every macro argument is parenthesized so that any pointer
+  expression (not just a plain identifier) can be passed for p.
+*/
+#define PCBLOCK_NUMBER(p, b)                                                    \
+  ((uint) (((char*)(b)-(char *) (p)->block_root)/sizeof(PAGECACHE_BLOCK_LINK)))
+#define PAGECACHE_HASH_LINK_NUMBER(p, h)                                      \
+  ((uint) (((char*)(h)-(char *) (p)->hash_link_root)/                         \
+           sizeof(PAGECACHE_HASH_LINK)))
+
+/*
+  pagecache_pthread_cond_wait is a static function of this file (only
+  declared here) when PAGECACHE_TIMEOUT is defined and __WIN__ is not, or
+  when PAGECACHE_DEBUG is defined. In all other builds it is simply another
+  name for pthread_cond_wait.
+*/
+#if (defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)) || defined(PAGECACHE_DEBUG)
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+                                      pthread_mutex_t *mutex);
+#else
+#define  pagecache_pthread_cond_wait pthread_cond_wait
+#endif
+
+/*
+  Mutex / condition wrappers.
+  With PAGECACHE_DEBUG each wrapper first emits a DBUG_PRINT under the
+  "lock" keyword with the object address and the source line of the call
+  site, and then calls the matching ___pagecache_pthread_* function declared
+  below. These debug variants expand to braced blocks, so they are
+  statements and yield no value.
+  Without PAGECACHE_DEBUG they are plain aliases of the pthread functions.
+*/
+#if defined(PAGECACHE_DEBUG)
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex);
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex);
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond);
+#define pagecache_pthread_mutex_lock(M) \
+{ DBUG_PRINT("lock", ("mutex lock 0x%lx %u", (ulong)(M), __LINE__)); \
+  ___pagecache_pthread_mutex_lock(M);}
+#define pagecache_pthread_mutex_unlock(M) \
+{ DBUG_PRINT("lock", ("mutex unlock 0x%lx %u", (ulong)(M), __LINE__)); \
+  ___pagecache_pthread_mutex_unlock(M);}
+#define pagecache_pthread_cond_signal(M) \
+{ DBUG_PRINT("lock", ("signal 0x%lx %u", (ulong)(M), __LINE__)); \
+  ___pagecache_pthread_cond_signal(M);}
+#else
+#define pagecache_pthread_mutex_lock pthread_mutex_lock
+#define pagecache_pthread_mutex_unlock pthread_mutex_unlock
+#define pagecache_pthread_cond_signal pthread_cond_signal
+#endif /* defined(PAGECACHE_DEBUG) */
+
+extern my_bool translog_flush(TRANSLOG_ADDRESS lsn);
+
+/**
+  @brief Write one page of a file to disk
+
+  Calls the file's flush_log_callback and then its write_callback on the
+  buffer; if both succeed, writes pagecache->block_size bytes at file offset
+  (pageno << pagecache->shift). If the write itself fails, the file's
+  write_fail hook is called with the callback data.
+
+  @param pagecache  page cache pointer
+  @param filedesc   pagecache file descriptor structure
+  @param buffer     buffer which we will write
+  @param pageno     number of the page inside the file
+  @param type       page type (plain or with LSN); only checked by an assert
+  @param flags      MYF() flags handed to my_pwrite()
+
+  @return Operation status
+    @retval 0  OK
+    @retval 1  Error (one of the callbacks or the write failed)
+*/
+
+static my_bool pagecache_fwrite(PAGECACHE *pagecache,
+                                PAGECACHE_FILE *filedesc,
+                                uchar *buffer,
+                                pgcache_page_no_t pageno,
+                                enum pagecache_page_type type
+                                __attribute__((unused)),
+                                myf flags)
+{
+  my_off_t file_offset;
+  DBUG_ENTER("pagecache_fwrite");
+  DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+
+  /* Todo: Integrate this with write_callback so we have only one callback */
+  if (filedesc->flush_log_callback(buffer, pageno, filedesc->callback_data))
+  {
+    DBUG_RETURN(1);
+  }
+
+  DBUG_PRINT("info", ("write_callback: 0x%lx  data: 0x%lx",
+                      (ulong) filedesc->write_callback,
+                      (ulong) filedesc->callback_data));
+  if (filedesc->write_callback(buffer, pageno, filedesc->callback_data))
+  {
+    DBUG_PRINT("error", ("write callback problem"));
+    DBUG_RETURN(1);
+  }
+
+  /* Pages are laid out back to back: page N starts at N * block_size */
+  file_offset= (my_off_t) pageno << pagecache->shift;
+  if (my_pwrite(filedesc->file, buffer, pagecache->block_size,
+                file_offset, flags) != 0)
+  {
+    filedesc->write_fail(filedesc->callback_data);
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read page from the disk
+
+  SYNOPSIS
+    pagecache_fread()
+    pagecache - page cache pointer
+    filedesc  - pagecache file descriptor structure
+    buffer    - buffer in which we will read
+    pageno    - page number
+    flags     - MYF() flags
+
+  NOTES
+    Reads pagecache->block_size bytes from offset
+    (pageno << pagecache->shift) and evaluates to the result of my_pread().
+    All arguments are parenthesized in the expansion so that arbitrary
+    expressions can be passed; in particular the my_off_t cast now applies
+    to the whole pageno expression. The pagecache argument is used twice in
+    the expansion, so it should not have side effects.
+*/
+#define pagecache_fread(pagecache, filedesc, buffer, pageno, flags) \
+  my_pread((filedesc)->file, (buffer), (pagecache)->block_size,     \
+           ((my_off_t) (pageno) << (pagecache)->shift), (flags))
+
+
+/**
+  @brief Record the first REDO LSN of a block unless one is already stored
+
+  A rec_lsn equal to LSN_MAX means "not set yet"; in that case the given
+  LSN is stored. Otherwise the stored value is kept, and debug builds
+  assert that it does not compare greater than the new one.
+
+  @param block                   block where to set rec_lsn
+  @param first_REDO_LSN_for_page the LSN to set
+*/
+
+static inline void pagecache_set_block_rec_lsn(PAGECACHE_BLOCK_LINK *block,
+                                               LSN first_REDO_LSN_for_page)
+{
+  if (block->rec_lsn != LSN_MAX)
+  {
+    /* Already set: only verify the ordering, never overwrite */
+    DBUG_ASSERT(cmp_translog_addr(block->rec_lsn,
+                                  first_REDO_LSN_for_page) <= 0);
+    return;
+  }
+  block->rec_lsn= first_REDO_LSN_for_page;
+}
+
+
+/*
+  next_power(value) returns my_round_up_to_next_power(value) doubled.
+  For a power of two this is 2 at the power of (1+floor(log2(value))),
+  e.g. next_power(2)=4.
+  NOTE(review): the previous comment also claimed next_power(3)=4. If
+  my_round_up_to_next_power() rounds 3 up to 4, the result here would be 8;
+  confirm against that helper before relying on the value for
+  non-powers of two.
+*/
+static inline uint next_power(uint value)
+{
+  uint rounded= (uint) my_round_up_to_next_power((uint32) value);
+  return rounded << 1;
+}
+
+
+/*
+  Initialize a page cache
+
+  SYNOPSIS
+    init_pagecache()
+    pagecache			pointer to a page cache data structure
+    use_mem                     total memory to use for the page cache
+    division_limit		division limit (may be zero)
+    age_threshold		age threshold (may be zero)
+    block_size                  size of block (should be power of 2,
+                                at least 512)
+    my_readwrite_flags		Flags used for all pread/pwrite calls
+			        Usually MY_WME in case of recovery
+
+  RETURN VALUE
+    number of blocks in the page cache, if successful,
+    0 - otherwise.
+
+  NOTES.
+    If pagecache->inited != 0 and the cache currently holds blocks
+    (disk_blocks > 0), nothing is done and 0 is returned.
+    If pagecache->inited != 0 but there are no blocks, only the memory is
+    (re)allocated; the mutex and the files_in_flush hash are not
+    initialized again.
+    This is for now used by myisamchk, but shouldn't
+    be something that a program should rely on!
+
+    It's assumed that no two threads call this function simultaneously
+    referring to the same page cache handle.
+
+    On failure my_errno is preserved across the cleanup, can_be_used is
+    reset to 0 and any memory allocated by this call is released.
+*/
+
+ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+                     uint division_limit, uint age_threshold,
+                     uint block_size, myf my_readwrite_flags)
+{
+  ulong blocks, hash_links, length;
+  int error;
+  DBUG_ENTER("init_pagecache");
+  DBUG_ASSERT(block_size >= 512);
+
+  PAGECACHE_DEBUG_OPEN;
+  if (pagecache->inited && pagecache->disk_blocks > 0)
+  {
+    DBUG_PRINT("warning",("key cache already in use"));
+    DBUG_RETURN(0);
+  }
+
+  pagecache->global_cache_w_requests= pagecache->global_cache_r_requests= 0;
+  pagecache->global_cache_read= pagecache->global_cache_write= 0;
+  pagecache->disk_blocks= -1;
+  if (! pagecache->inited)
+  {
+    /* First call for this handle: set up the lock and the flush registry */
+    if (pthread_mutex_init(&pagecache->cache_lock, MY_MUTEX_INIT_FAST) ||
+        hash_init(&pagecache->files_in_flush, &my_charset_bin, 32,
+                  offsetof(struct st_file_in_flush, file),
+                  sizeof(((struct st_file_in_flush *)NULL)->file),
+                  NULL, NULL, 0))
+      goto err;
+    pagecache->inited= 1;
+    pagecache->in_init= 0;
+    pagecache->resize_queue.last_thread= NULL;
+  }
+
+  pagecache->mem_size= use_mem;
+  pagecache->block_size= block_size;
+  pagecache->shift= my_bit_log2(block_size);
+  pagecache->readwrite_flags= my_readwrite_flags | MY_NABP | MY_WAIT_IF_FULL;
+  pagecache->org_readwrite_flags= pagecache->readwrite_flags;
+  DBUG_PRINT("info", ("block_size: %u", block_size));
+  DBUG_ASSERT(((uint)(1 << pagecache->shift)) == block_size);
+
+  /* First estimate: memory divided by the approximate cost of one block */
+  blocks= (ulong) (use_mem / (sizeof(PAGECACHE_BLOCK_LINK) +
+                              2 * sizeof(PAGECACHE_HASH_LINK) +
+                              sizeof(PAGECACHE_HASH_LINK*) *
+                              5/4 + block_size));
+  /*
+    Try to allocate; on allocation failure retry with 3/4 of the blocks
+    until fewer than 8 blocks would remain.
+  */
+  for ( ; ; )
+  {
+    if (blocks < 8)
+    {
+      my_errno= ENOMEM;
+      goto err;
+    }
+    /* Set my_hash_entries to the next bigger 2 power */
+    if ((pagecache->hash_entries= next_power(blocks)) <
+        (blocks) * 5/4)
+      pagecache->hash_entries<<= 1;
+    hash_links= 2 * blocks;
+#if defined(MAX_THREADS)
+    if (hash_links < MAX_THREADS + blocks - 1)
+      hash_links= MAX_THREADS + blocks - 1;
+#endif
+    /*
+      Shrink the number of blocks until control structures plus page
+      buffers fit into use_mem. hash_links and hash_entries are not
+      recomputed here, so their part of 'length' is a fixed cost; if that
+      fixed cost alone exceeds use_mem we must stop at zero instead of
+      letting the unsigned counter wrap around.
+    */
+    while (blocks > 0 &&
+           (length= (ALIGN_SIZE(blocks * sizeof(PAGECACHE_BLOCK_LINK)) +
+                     ALIGN_SIZE(hash_links * sizeof(PAGECACHE_HASH_LINK)) +
+                     ALIGN_SIZE(sizeof(PAGECACHE_HASH_LINK*) *
+                                pagecache->hash_entries))) +
+           (blocks << pagecache->shift) > use_mem)
+      blocks--;
+    if (blocks == 0)
+    {
+      /* Not even a single block fits next to the control structures */
+      my_errno= ENOMEM;
+      goto err;
+    }
+    /* Allocate memory for cache page buffers */
+    if ((pagecache->block_mem=
+         my_large_malloc((ulong) blocks * pagecache->block_size,
+                         MYF(MY_WME))))
+    {
+      /*
+        Allocate memory for blocks, hash_links and hash entries;
+        For each block 2 hash links are allocated
+      */
+      if ((pagecache->block_root=
+           (PAGECACHE_BLOCK_LINK*) my_malloc((size_t) length, MYF(0))))
+        break;
+      my_large_free(pagecache->block_mem, MYF(0));
+      pagecache->block_mem= 0;
+    }
+    blocks= blocks / 4*3;
+  }
+  pagecache->blocks_unused= blocks;
+  pagecache->disk_blocks= (long) blocks;
+  pagecache->hash_links= hash_links;
+  /*
+    Layout of the control area: block links, then the hash table (array of
+    bucket pointers), then the hash links.
+  */
+  pagecache->hash_root=
+    (PAGECACHE_HASH_LINK**) ((char*) pagecache->block_root +
+                             ALIGN_SIZE(blocks*sizeof(PAGECACHE_BLOCK_LINK)));
+  pagecache->hash_link_root=
+    (PAGECACHE_HASH_LINK*) ((char*) pagecache->hash_root +
+                            ALIGN_SIZE((sizeof(PAGECACHE_HASH_LINK*) *
+                                        pagecache->hash_entries)));
+  bzero((uchar*) pagecache->block_root,
+        pagecache->disk_blocks * sizeof(PAGECACHE_BLOCK_LINK));
+  bzero((uchar*) pagecache->hash_root,
+        pagecache->hash_entries * sizeof(PAGECACHE_HASH_LINK*));
+  bzero((uchar*) pagecache->hash_link_root,
+        pagecache->hash_links * sizeof(PAGECACHE_HASH_LINK));
+  pagecache->hash_links_used= 0;
+  pagecache->free_hash_list= NULL;
+  pagecache->blocks_used= pagecache->blocks_changed= 0;
+
+  pagecache->global_blocks_changed= 0;
+  pagecache->blocks_available=0;		/* For debugging */
+
+  /* The LRU chain is empty after initialization */
+  pagecache->used_last= NULL;
+  pagecache->used_ins= NULL;
+  pagecache->free_block_list= NULL;
+  pagecache->time= 0;
+  pagecache->warm_blocks= 0;
+  /* A zero limit/threshold means "use the number of blocks" */
+  pagecache->min_warm_blocks= (division_limit ?
+                               blocks * division_limit / 100 + 1 :
+                               blocks);
+  pagecache->age_threshold= (age_threshold ?
+                             blocks * age_threshold / 100 :
+                             blocks);
+
+  pagecache->cnt_for_resize_op= 0;
+  pagecache->resize_in_flush= 0;
+  pagecache->can_be_used= 1;
+
+  pagecache->waiting_for_hash_link.last_thread= NULL;
+  pagecache->waiting_for_block.last_thread= NULL;
+  DBUG_PRINT("exit",
+             ("disk_blocks: %ld  block_root: 0x%lx  hash_entries: %ld\
+ hash_root: 0x%lx  hash_links: %ld  hash_link_root: 0x%lx",
+              pagecache->disk_blocks, (long) pagecache->block_root,
+              pagecache->hash_entries, (long) pagecache->hash_root,
+              pagecache->hash_links, (long) pagecache->hash_link_root));
+  bzero((uchar*) pagecache->changed_blocks,
+        sizeof(pagecache->changed_blocks[0]) *
+        PAGECACHE_CHANGED_BLOCKS_HASH);
+  bzero((uchar*) pagecache->file_blocks,
+        sizeof(pagecache->file_blocks[0]) *
+        PAGECACHE_CHANGED_BLOCKS_HASH);
+
+  pagecache->blocks= pagecache->disk_blocks > 0 ? pagecache->disk_blocks : 0;
+  DBUG_RETURN((ulong) pagecache->disk_blocks);
+
+err:
+  /* Keep the original error code: the frees below may change my_errno */
+  error= my_errno;
+  pagecache->disk_blocks= 0;
+  pagecache->blocks=  0;
+  if (pagecache->block_mem)
+  {
+    my_large_free(pagecache->block_mem, MYF(0));
+    pagecache->block_mem= NULL;
+  }
+  if (pagecache->block_root)
+  {
+    my_free(pagecache->block_root, MYF(0));
+    pagecache->block_root= NULL;
+  }
+  my_errno= error;
+  pagecache->can_be_used= 0;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Flush all blocks in the page cache to disk
+
+  As long as the cache reports changed blocks, walk the LRU chain starting
+  after used_last, pick the first block that has a hash link and flush the
+  file that block belongs to with FLUSH_RELEASE.
+
+  RETURN
+    0  all flushes succeeded
+    1  a flush failed
+*/
+
+#ifdef NOT_USED
+static int flush_all_key_blocks(PAGECACHE *pagecache)
+{
+#if defined(PAGECACHE_DEBUG)
+  uint cnt=0;
+#endif
+  while (pagecache->blocks_changed > 0)
+  {
+    PAGECACHE_BLOCK_LINK *candidate= pagecache->used_last->next_used;
+
+    /* Skip blocks without a hash link; give up after a full round */
+    while (!candidate->hash_link && candidate != pagecache->used_last)
+      candidate= candidate->next_used;
+
+    if (candidate->hash_link)
+    {
+#if defined(PAGECACHE_DEBUG)
+      cnt++;
+      KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+      if (flush_pagecache_blocks_int(pagecache, &candidate->hash_link->file,
+                                     FLUSH_RELEASE, NULL, NULL))
+        return 1;
+    }
+  }
+  return 0;
+}
+#endif /* NOT_USED */
+
+/*
+  Resize a page cache
+
+  SYNOPSIS
+    resize_pagecache()
+    pagecache                   pointer to a page cache data structure
+    use_mem			total memory to use for the new page cache
+    division_limit		new division limit (if not zero)
+    age_threshold		new age threshold (if not zero)
+
+  RETURN VALUE
+    number of blocks in the page cache, if successful,
+    0 - otherwise.
+
+  NOTES.
+    The function first compares the memory size parameter
+    with the page cache value.
+
+    If they are equal, only the midpoint insertion parameters are updated.
+
+    If they differ, the function flushes all blocks, frees the memory
+    allocated for the old page cache blocks by calling end_pagecache() and
+    then rebuilds the page cache with new blocks by calling
+    init_pagecache() with the existing block size and read/write flags.
+
+    The function starts the operation only when all other threads
+    performing operations with the page cache let it proceed
+    (when cnt_for_resize_op=0).
+
+     Before being usable, this function needs:
+     - to receive fixes for BUG#17332 "changing key_buffer_size on a running
+     server can crash under load" similar to those done to the key cache
+     - to have us (Sanja) look at the additional constraints placed on
+     resizing, due to the page locking specific to this page cache.
+     So we disable it for now.
+
+     The guard below uses #ifdef, like the guard around
+     flush_all_key_blocks() which this function calls, so that both are
+     enabled or disabled together.
+*/
+#ifdef NOT_USED /* keep disabled until code is fixed see above !! */
+ulong resize_pagecache(PAGECACHE *pagecache,
+                       size_t use_mem, uint division_limit,
+                       uint age_threshold)
+{
+  ulong blocks;
+#ifdef THREAD
+  struct st_my_thread_var *thread;
+  WQUEUE *wqueue;
+
+#endif
+  DBUG_ENTER("resize_pagecache");
+
+  if (!pagecache->inited)
+    DBUG_RETURN(pagecache->disk_blocks);
+
+  if(use_mem == pagecache->mem_size)
+  {
+    /* Same size: nothing to reallocate, only retune the parameters */
+    change_pagecache_param(pagecache, division_limit, age_threshold);
+    DBUG_RETURN(pagecache->disk_blocks);
+  }
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+#ifdef THREAD
+  /* Queue up behind any resize request that is already in progress */
+  wqueue= &pagecache->resize_queue;
+  thread= my_thread_var;
+  wqueue_link_into_queue(wqueue, thread);
+
+  while (wqueue->last_thread->next != thread)
+  {
+    pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+  }
+#endif
+
+  pagecache->resize_in_flush= 1;
+  if (flush_all_key_blocks(pagecache))
+  {
+    /* TODO: if this happens, we should write a warning in the log file ! */
+    pagecache->resize_in_flush= 0;
+    blocks= 0;
+    pagecache->can_be_used= 0;
+    goto finish;
+  }
+  pagecache->resize_in_flush= 0;
+  pagecache->can_be_used= 0;
+#ifdef THREAD
+  /* Wait until every operation that blocks resizing has finished */
+  while (pagecache->cnt_for_resize_op)
+  {
+    KEYCACHE_DBUG_PRINT("resize_pagecache: wait",
+                        ("suspend thread %ld", thread->id));
+    pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock);
+  }
+#else
+  KEYCACHE_DBUG_ASSERT(pagecache->cnt_for_resize_op == 0);
+#endif
+
+  end_pagecache(pagecache, 0);			/* Don't free mutex */
+  /*
+    The following will work even if use_mem is 0.
+    Argument order follows init_pagecache(): use_mem, division_limit,
+    age_threshold, block_size, flags.
+  */
+  blocks= init_pagecache(pagecache, use_mem,
+			 division_limit, age_threshold,
+                         pagecache->block_size,
+                         pagecache->readwrite_flags);
+
+finish:
+#ifdef THREAD
+  wqueue_unlink_from_queue(wqueue, thread);
+  /* Signal for the next resize request to proceed if any */
+  if (wqueue->last_thread)
+  {
+    KEYCACHE_DBUG_PRINT("resize_pagecache: signal",
+                        ("thread %ld", wqueue->last_thread->next->id));
+    pagecache_pthread_cond_signal(&wqueue->last_thread->next->suspend);
+  }
+#endif
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  DBUG_RETURN(blocks);
+}
+#endif /* NOT_USED */
+
+
+/*
+  Register one more operation that blocks resizing of the page cache
+  (bumps pagecache->cnt_for_resize_op by one).
+*/
+static inline void inc_counter_for_resize_op(PAGECACHE *pagecache)
+{
+  ++pagecache->cnt_for_resize_op;
+}
+
+
+/*
+  Unregister one operation that blocks resizing of the page cache.
+  In threaded builds, when the counter drops to zero and a thread is queued
+  in resize_queue, the first thread of that queue is signalled.
+*/
+static inline void dec_counter_for_resize_op(PAGECACHE *pagecache)
+{
+  pagecache->cnt_for_resize_op--;
+#ifdef THREAD
+  if (!pagecache->cnt_for_resize_op)
+  {
+    struct st_my_thread_var *queue_tail= pagecache->resize_queue.last_thread;
+    if (queue_tail)
+    {
+      /* The queue is circular: the element after the tail is the head */
+      KEYCACHE_DBUG_PRINT("dec_counter_for_resize_op: signal",
+                          ("thread %ld", queue_tail->next->id));
+      pagecache_pthread_cond_signal(&queue_tail->next->suspend);
+    }
+  }
+#endif
+}
+
+/*
+  Change the page cache parameters
+
+  SYNOPSIS
+    change_pagecache_param()
+    pagecache			pointer to a page cache data structure
+    division_limit		new division limit (if not zero)
+    age_threshold		new age threshold (if not zero)
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Only the parameters of the midpoint insertion strategy are touched:
+    min_warm_blocks is derived from division_limit and age_threshold from
+    age_threshold, both as a percentage of disk_blocks. A zero argument
+    leaves the corresponding setting unchanged. The update is done while
+    holding cache_lock.
+*/
+
+void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+			    uint age_threshold)
+{
+  DBUG_ENTER("change_pagecache_param");
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  if (division_limit != 0)
+  {
+    pagecache->min_warm_blocks= (pagecache->disk_blocks *
+				division_limit / 100 + 1);
+  }
+  if (age_threshold != 0)
+  {
+    pagecache->age_threshold=   (pagecache->disk_blocks *
+				age_threshold / 100);
+  }
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Removes page cache from memory. Does NOT flush pages to disk.
+
+  SYNOPSIS
+    end_pagecache()
+    pagecache		page cache handle
+    cleanup		Complete free (Free also mutex for page cache)
+
+  RETURN VALUE
+    none
+
+  NOTES
+    Does nothing for a handle that was never initialized. With cleanup set,
+    the files_in_flush hash and the cache mutex are destroyed as well and
+    the handle is marked as not initialized.
+*/
+
+void end_pagecache(PAGECACHE *pagecache, my_bool cleanup)
+{
+  DBUG_ENTER("end_pagecache");
+  DBUG_PRINT("enter", ("key_cache: 0x%lx", (long) pagecache));
+
+  if (!pagecache->inited)
+  {
+    DBUG_VOID_RETURN;
+  }
+
+  if (pagecache->disk_blocks > 0)
+  {
+    if (pagecache->block_mem != NULL)
+    {
+      /* Page buffers first, then the control structures */
+      my_large_free(pagecache->block_mem, MYF(0));
+      my_free(pagecache->block_root, MYF(0));
+      pagecache->block_mem= NULL;
+      pagecache->block_root= NULL;
+    }
+    /* Reset blocks_changed to be safe if flush_all_key_blocks is called */
+    pagecache->blocks_changed= 0;
+    pagecache->disk_blocks= -1;
+  }
+
+  DBUG_PRINT("status", ("used: %lu  changed: %lu  w_requests: %lu  "
+                        "writes: %lu  r_requests: %lu  reads: %lu",
+                        pagecache->blocks_used,
+                        pagecache->global_blocks_changed,
+                        (ulong) pagecache->global_cache_w_requests,
+                        (ulong) pagecache->global_cache_write,
+                        (ulong) pagecache->global_cache_r_requests,
+                        (ulong) pagecache->global_cache_read));
+
+  if (!cleanup)
+  {
+    DBUG_VOID_RETURN;
+  }
+
+  hash_free(&pagecache->files_in_flush);
+  pthread_mutex_destroy(&pagecache->cache_lock);
+  pagecache->can_be_used= 0;
+  pagecache->inited= 0;
+  PAGECACHE_DEBUG_CLOSE;
+  DBUG_VOID_RETURN;
+} /* end_pagecache */
+
+
+/*
+  Take a block out of the dirty/clean chain it is currently linked in.
+  The chain uses a (**prev, *next) scheme: prev_changed points at the
+  pointer that refers to this block.
+*/
+
+static inline void unlink_changed(PAGECACHE_BLOCK_LINK *block)
+{
+  PAGECACHE_BLOCK_LINK *follower= block->next_changed;
+  PAGECACHE_BLOCK_LINK **referrer= block->prev_changed;
+
+  if (follower != NULL)
+    follower->prev_changed= referrer;
+  *referrer= follower;
+}
+
+
+/*
+  Insert a block at the front of the dirty/clean chain whose head pointer
+  is *phead.
+*/
+
+static inline void link_changed(PAGECACHE_BLOCK_LINK *block,
+                                PAGECACHE_BLOCK_LINK **phead)
+{
+  PAGECACHE_BLOCK_LINK *old_head;
+
+  block->prev_changed= phead;
+  old_head= *phead;
+  block->next_changed= old_head;
+  if (old_head != NULL)
+  {
+    /* The former head is now referenced through our next pointer */
+    old_head->prev_changed= &block->next_changed;
+  }
+  *phead= block;
+}
+
+
+/*
+  Move a block to the chain of clean blocks of the given file.
+  If unlink_flag is set the block is first removed from the chain it is
+  in. A block that was marked PCBLOCK_CHANGED loses that mark, gets its
+  rec_lsn reset to LSN_MAX and is subtracted from the changed counters.
+*/
+
+static void link_to_file_list(PAGECACHE *pagecache,
+                              PAGECACHE_BLOCK_LINK *block,
+                              PAGECACHE_FILE *file, my_bool unlink_flag)
+{
+  if (unlink_flag)
+  {
+    unlink_changed(block);
+  }
+  link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]);
+
+  if (!(block->status & PCBLOCK_CHANGED))
+    return;
+
+  /* The block is clean from now on */
+  block->status&= ~PCBLOCK_CHANGED;
+  block->rec_lsn= LSN_MAX;
+  pagecache->global_blocks_changed--;
+  pagecache->blocks_changed--;
+}
+
+
+/*
+  Move a block from the chain it is currently in to the chain of dirty
+  blocks of its file (taken from block->hash_link->file), mark it
+  PCBLOCK_CHANGED and count it as changed.
+*/
+
+static inline void link_to_changed_list(PAGECACHE *pagecache,
+                                        PAGECACHE_BLOCK_LINK *block)
+{
+  PAGECACHE_BLOCK_LINK **dirty_head;
+
+  unlink_changed(block);
+  dirty_head= &pagecache->changed_blocks[FILE_HASH(block->hash_link->file)];
+  link_changed(block, dirty_head);
+  block->status|=PCBLOCK_CHANGED;
+  pagecache->global_blocks_changed++;
+  pagecache->blocks_changed++;
+}
+
+
+/*
+  Link a block to the LRU chain at the beginning or at the end of
+  one of two parts.
+
+  SYNOPSIS
+    link_block()
+      pagecache            pointer to a page cache data structure
+      block               pointer to the block to link to the LRU chain
+      hot                 <-> to link the block into the hot subchain
+      at_end              <-> to link the block at the end of the subchain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    The LRU chain is represented by a circular list of block structures.
+    The list is double-linked of the type (**prev,*next) type.
+    The LRU chain is divided into two parts - hot and warm.
+    There are two pointers to access the last blocks of these two
+    parts. The beginning of the warm part follows right after the
+    end of the hot part.
+    Only blocks of the warm part can be used for replacement.
+    The first block from the beginning of this subchain is always
+    taken for eviction (pagecache->used_last->next_used)
+
+    In threaded builds, if the block is not hot and threads are waiting for
+    a free block, the block is NOT linked into the chain: it is handed over
+    directly to the waiting threads instead (see below).
+
+    LRU chain:       +------+   H O T    +------+
+                +----| end  |----...<----| beg  |----+
+                |    +------+last        +------+    |
+                v<-link in latest hot (new end)      |
+                |     link in latest warm (new end)->^
+                |    +------+  W A R M   +------+    |
+                +----| beg  |---->...----| end  |----+
+                     +------+            +------+ins
+                  first for eviction
+*/
+
+static void link_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+                       my_bool hot, my_bool at_end)
+{
+  PAGECACHE_BLOCK_LINK *ins;
+  PAGECACHE_BLOCK_LINK **ptr_ins;
+
+  PCBLOCK_INFO(block);
+  KEYCACHE_DBUG_ASSERT(! (block->hash_link && block->hash_link->requests));
+#ifdef THREAD
+  if (!hot && pagecache->waiting_for_block.last_thread)
+  {
+    /* Signal that in the LRU warm sub-chain an available block has appeared */
+    struct st_my_thread_var *last_thread=
+                               pagecache->waiting_for_block.last_thread;
+    /* The wait queue is circular: the element after the last is the first */
+    struct st_my_thread_var *first_thread= last_thread->next;
+    struct st_my_thread_var *next_thread= first_thread;
+    /* The hash link the first waiter is waiting for */
+    PAGECACHE_HASH_LINK *hash_link=
+      (PAGECACHE_HASH_LINK *) first_thread->opt_info;
+    struct st_my_thread_var *thread;
+    do
+    {
+      thread= next_thread;
+      /* Read the successor before the thread may be unlinked below */
+      next_thread= thread->next;
+      /*
+         We notify about the event all threads that ask
+         for the same page as the first thread in the queue
+      */
+      if ((PAGECACHE_HASH_LINK *) thread->opt_info == hash_link)
+      {
+        KEYCACHE_DBUG_PRINT("link_block: signal", ("thread: %ld", thread->id));
+        pagecache_pthread_cond_signal(&thread->suspend);
+        wqueue_unlink_from_queue(&pagecache->waiting_for_block, thread);
+        /* Each woken thread counts as one request for the block */
+        block->requests++;
+      }
+    }
+    while (thread != last_thread);
+    /* Give the block to the hash link the waiters were asking for */
+    hash_link->block= block;
+    KEYCACHE_THREAD_TRACE("link_block: after signaling");
+#if defined(PAGECACHE_DEBUG)
+    KEYCACHE_DBUG_PRINT("link_block",
+        ("linked,unlinked block: %u  status: %x  #requests: %u  #available: %u",
+         PCBLOCK_NUMBER(pagecache, block), block->status,
+         block->requests, pagecache->blocks_available));
+#endif
+    return;
+  }
+#else /* THREAD */
+  KEYCACHE_DBUG_ASSERT(! (!hot && pagecache->waiting_for_block.last_thread));
+  /* Condition not transformed using DeMorgan, to keep the text identical */
+#endif /* THREAD */
+  /* Insertion point: used_ins for the hot part, used_last for the warm one */
+  ptr_ins= hot ? &pagecache->used_ins : &pagecache->used_last;
+  ins= *ptr_ins;
+  if (ins)
+  {
+    /* Splice the block in right after ins */
+    ins->next_used->prev_used= &block->next_used;
+    block->next_used= ins->next_used;
+    block->prev_used= &ins->next_used;
+    ins->next_used= block;
+    /* Linking at the end makes the block the new end of the sub-chain */
+    if (at_end)
+      *ptr_ins= block;
+  }
+  else
+  {
+    /* The LRU chain is empty */
+    pagecache->used_last= pagecache->used_ins= block->next_used= block;
+    block->prev_used= &block->next_used;
+  }
+  KEYCACHE_THREAD_TRACE("link_block");
+#if defined(PAGECACHE_DEBUG)
+  pagecache->blocks_available++;
+  KEYCACHE_DBUG_PRINT("link_block",
+                      ("linked block: %u:%1u  status: %x  #requests: %u  #available: %u",
+                       PCBLOCK_NUMBER(pagecache, block), at_end, block->status,
+                       block->requests, pagecache->blocks_available));
+  KEYCACHE_DBUG_ASSERT((ulong) pagecache->blocks_available <=
+                       pagecache->blocks_used);
+#endif
+}
+
+
+/*
+  Unlink a block from the LRU chain
+
+  SYNOPSIS
+    unlink_block()
+      pagecache            pointer to a page cache data structure
+      block               pointer to the block to unlink from the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    See NOTES for link_block.
+    If the block was the end of the warm or hot part (used_last/used_ins),
+    that pointer is moved to the block's predecessor. After the call
+    block->next_used is NULL.
+*/
+
+static void unlink_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("unlink_block");
+  DBUG_PRINT("unlink_block", ("unlink 0x%lx", (ulong)block));
+  DBUG_ASSERT(block->next_used != NULL);
+  if (block->next_used != block)
+  {
+    PAGECACHE_BLOCK_LINK *successor= block->next_used;
+    PAGECACHE_BLOCK_LINK **back_ref= block->prev_used;
+
+    successor->prev_used= back_ref;
+    *back_ref= successor;
+    /* back_ref is the next_used member of the predecessor block */
+    if (pagecache->used_last == block)
+      pagecache->used_last= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+                                       next_used, back_ref);
+    if (pagecache->used_ins == block)
+      pagecache->used_ins= STRUCT_PTR(PAGECACHE_BLOCK_LINK,
+                                      next_used, back_ref);
+  }
+  else
+  {
+    /* The block was the only member: the chain becomes empty */
+    pagecache->used_last= pagecache->used_ins= NULL;
+  }
+  block->next_used= NULL;
+
+  KEYCACHE_THREAD_TRACE("unlink_block");
+#if defined(PAGECACHE_DEBUG)
+  KEYCACHE_DBUG_ASSERT(pagecache->blocks_available != 0);
+  pagecache->blocks_available--;
+  KEYCACHE_DBUG_PRINT("unlink_block",
+                      ("unlinked block: 0x%lx (%u)  status: %x   #requests: %u  #available: %u",
+                       (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                       block->status,
+                       block->requests, pagecache->blocks_available));
+  PCBLOCK_INFO(block);
+#endif
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Register requests for a block
+
+  SYNOPSIS
+    reg_requests()
+    pagecache            this page cache reference
+    block                the block we request reference
+    count                how many requests we register (it is 1 everywhere)
+
+  NOTE
+  A block with registered requests is in use, so when the first request
+  arrives the block is taken out of the LRU chain.
+*/
+static void reg_requests(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block,
+                         int count)
+{
+  DBUG_ENTER("reg_requests");
+  DBUG_PRINT("enter", ("block: 0x%lx (%u)  status: %x  reqs: %u",
+		       (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                       block->status, block->requests));
+  PCBLOCK_INFO(block);
+  if (block->requests == 0)
+  {
+    /* Nobody used the block so far: remove it from the LRU chain */
+    unlink_block(pagecache, block);
+  }
+  block->requests+= count;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unregister request for a block
+  linking it to the LRU chain if it's the last request
+
+  SYNOPSIS
+    unreg_request()
+    pagecache            pointer to a page cache data structure
+    block               pointer to the block to link to the LRU chain
+    at_end              <-> to link the block at the end of the LRU chain
+
+  RETURN VALUE
+    none
+
+  NOTES.
+    Every linking to the LRU chain decrements by one a special block
+    counter (if it's positive). If the at_end parameter is TRUE the block is
+    added either at the end of warm sub-chain or at the end of hot sub-chain.
+    It is added to the hot subchain if its counter is zero and number of
+    blocks in warm sub-chain is not less than some low limit (determined by
+    the division_limit parameter). Otherwise the block is added to the warm
+    sub-chain. If the at_end parameter is FALSE the block is always added
+    at beginning of the warm sub-chain.
+    Thus a warm block can be promoted to the hot sub-chain when its counter
+    becomes zero for the first time.
+    At the same time  the block at the very beginning of the hot subchain
+    might be moved to the beginning of the warm subchain if it stays untouched
+    for a too long time (this time is determined by parameter age_threshold).
+*/
+
+static void unreg_request(PAGECACHE *pagecache,
+                          PAGECACHE_BLOCK_LINK *block, int at_end)
+{
+  DBUG_ENTER("unreg_request");
+  DBUG_PRINT("enter", ("block 0x%lx (%u)  status: %x  reqs: %u",
+		       (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                       block->status, block->requests));
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->requests > 0);
+  /* Only the release of the last request puts the block back into the LRU */
+  if (! --block->requests)
+  {
+    my_bool hot;
+    /* hits_left is the per-block counter described in the NOTES above */
+    if (block->hits_left)
+      block->hits_left--;
+    hot= !block->hits_left && at_end &&
+      pagecache->warm_blocks > pagecache->min_warm_blocks;
+    if (hot)
+    {
+      /* Promotion: a block leaving the warm part is no longer counted there */
+      if (block->temperature == PCBLOCK_WARM)
+        pagecache->warm_blocks--;
+      block->temperature= PCBLOCK_HOT;
+      KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+                           pagecache->warm_blocks));
+    }
+    link_block(pagecache, block, hot, (my_bool)at_end);
+    /* pagecache->time is a logical clock that ticks once per such linking */
+    block->last_hit_time= pagecache->time;
+    pagecache->time++;
+
+    /* From here on 'block' refers to the block at used_ins, not the argument */
+    block= pagecache->used_ins;
+    /* Check if we should link a hot block to the warm block */
+    if (block && pagecache->time - block->last_hit_time >
+	pagecache->age_threshold)
+    {
+      unlink_block(pagecache, block);
+      link_block(pagecache, block, 0, 0);
+      if (block->temperature != PCBLOCK_WARM)
+      {
+        pagecache->warm_blocks++;
+        block->temperature= PCBLOCK_WARM;
+      }
+      KEYCACHE_DBUG_PRINT("unreg_request", ("#warm_blocks: %lu",
+                           pagecache->warm_blocks));
+    }
+  }
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Remove a reader of the page in block: drop one request from the block's
+  hash link. In threaded builds, when that was the last request and a
+  condition variable is registered in block->condvar, it is signalled.
+*/
+
+static inline void remove_reader(PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("remove_reader");
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->hash_link->requests > 0);
+  block->hash_link->requests--;
+#ifdef THREAD
+  if (block->hash_link->requests == 0 && block->condvar)
+  {
+    pagecache_pthread_cond_signal(block->condvar);
+  }
+#endif
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Wait until no requests are left on the hash link of the block.
+  While waiting, the calling thread's suspend condition is published in
+  block->condvar and the thread sleeps on it with cache_lock.
+  Without THREAD there can be nobody to wait for, which is only asserted.
+*/
+
+static inline void wait_for_readers(PAGECACHE *pagecache
+                                    __attribute__((unused)),
+                                    PAGECACHE_BLOCK_LINK *block)
+{
+#ifdef THREAD
+  struct st_my_thread_var *self= my_thread_var;
+  while (block->hash_link->requests != 0)
+  {
+    KEYCACHE_DBUG_PRINT("wait_for_readers: wait",
+                        ("suspend thread: %ld  block: %u",
+                         self->id, PCBLOCK_NUMBER(pagecache, block)));
+    /* Publish our condition only for the duration of the wait */
+    block->condvar= &self->suspend;
+    pagecache_pthread_cond_wait(&self->suspend, &pagecache->cache_lock);
+    block->condvar= NULL;
+  }
+#else
+  KEYCACHE_DBUG_ASSERT(block->hash_link->requests == 0);
+#endif
+}
+
+
+/*
+  Insert a hash link at the head of a bucket chain of the hash table
+
+  SYNOPSIS
+    link_hash()
+    start                address of the bucket head pointer
+    hash_link            link to insert; becomes the new head
+
+  NOTES
+    'prev' of every link holds the address of the pointer that refers to
+    the link (the bucket head or the 'next' member of the predecessor).
+*/
+
+static inline void link_hash(PAGECACHE_HASH_LINK **start,
+                             PAGECACHE_HASH_LINK *hash_link)
+{
+  PAGECACHE_HASH_LINK *old_head= *start;
+
+  /* The former head, if any, is now referenced through our 'next' */
+  if (old_head != NULL)
+    old_head->prev= &hash_link->next;
+  hash_link->next= old_head;
+  hash_link->prev= start;
+  *start= hash_link;
+}
+
+
+/*
+  Remove a hash link from the hash table
+
+  SYNOPSIS
+    unlink_hash()
+    pagecache            Pagecache reference
+    hash_link            hash link to unlink; asserted to have no requests
+
+  NOTES
+    The link is taken out of its bucket chain and detached from its block.
+    THREAD builds: if threads are queued in waiting_for_hash_link, the link
+    is immediately re-used for the page wanted by the first queued thread:
+    every queued thread asking for that same (file, pageno) is signalled and
+    removed from the queue, and the link is inserted into the bucket of the
+    new page. Otherwise the link is pushed onto pagecache->free_hash_list.
+*/
+
+static void unlink_hash(PAGECACHE *pagecache, PAGECACHE_HASH_LINK *hash_link)
+{
+  KEYCACHE_DBUG_PRINT("unlink_hash", ("fd: %u  pos_ %lu  #requests=%u",
+      (uint) hash_link->file.file, (ulong) hash_link->pageno,
+      hash_link->requests));
+  KEYCACHE_DBUG_ASSERT(hash_link->requests == 0);
+  /* Make the referring pointer skip this link; fix the successor's 'prev' */
+  if ((*hash_link->prev= hash_link->next))
+    hash_link->next->prev= hash_link->prev;
+  hash_link->block= NULL;
+#ifdef THREAD
+  if (pagecache->waiting_for_hash_link.last_thread)
+  {
+    /* Signal that a free hash link has appeared */
+    struct st_my_thread_var *last_thread=
+                               pagecache->waiting_for_hash_link.last_thread;
+    /* The successor of the last queued thread is taken as the first one */
+    struct st_my_thread_var *first_thread= last_thread->next;
+    struct st_my_thread_var *next_thread= first_thread;
+    /* opt_info of a waiting thread points to the page it asked for */
+    PAGECACHE_PAGE *first_page= (PAGECACHE_PAGE *) (first_thread->opt_info);
+    struct st_my_thread_var *thread;
+
+    /* Re-assign the link to the page requested by the first waiter */
+    hash_link->file= first_page->file;
+    DBUG_ASSERT(first_page->pageno < ((ULL(1)) << 40));
+    hash_link->pageno= first_page->pageno;
+    do
+    {
+      PAGECACHE_PAGE *page;
+      thread= next_thread;
+      page= (PAGECACHE_PAGE *) thread->opt_info;
+      /* Fetch the successor before 'thread' may be unlinked below */
+      next_thread= thread->next;
+      /*
+         We notify about the event all threads that ask
+         for the same page as the first thread in the queue
+      */
+      if (page->file.file == hash_link->file.file &&
+          page->pageno == hash_link->pageno)
+      {
+        KEYCACHE_DBUG_PRINT("unlink_hash: signal", ("thread %ld", thread->id));
+        pagecache_pthread_cond_signal(&thread->suspend);
+        wqueue_unlink_from_queue(&pagecache->waiting_for_hash_link, thread);
+      }
+    }
+    while (thread != last_thread);
+    /* Insert the re-used link into the bucket of its new page */
+    link_hash(&pagecache->hash_root[PAGECACHE_HASH(pagecache,
+                                                   hash_link->file,
+                                                   hash_link->pageno)],
+              hash_link);
+    return;
+  }
+#else /* THREAD */
+  KEYCACHE_DBUG_ASSERT(! (pagecache->waiting_for_hash_link.last_thread));
+#endif /* THREAD */
+  /* Nobody is waiting: put the link on the list of free hash links */
+  hash_link->next= pagecache->free_hash_list;
+  pagecache->free_hash_list= hash_link;
+}
+
+
+/*
+  Get the hash link for the page if it is in the cache (do not put the
+  page in the cache if it is absent there)
+
+  SYNOPSIS
+    get_present_hash_link()
+    pagecache            Pagecache reference
+    file                 file ID
+    pageno               page number in the file
+    start                where to put pointer to found hash bucket (for
+                         direct referring it); always set, also when the
+                         page is not found
+
+  NOTES
+    When a link is found its request counter is incremented.
+
+  RETURN
+    found hashlink pointer, or NULL if the bucket chain has no link for
+    the pair (file, pageno)
+*/
+
+static PAGECACHE_HASH_LINK *get_present_hash_link(PAGECACHE *pagecache,
+                                                  PAGECACHE_FILE *file,
+                                                  pgcache_page_no_t pageno,
+                                                  PAGECACHE_HASH_LINK ***start)
+{
+  reg1 PAGECACHE_HASH_LINK *hash_link;
+#if defined(PAGECACHE_DEBUG)
+  int cnt;
+#endif
+  DBUG_ENTER("get_present_hash_link");
+
+  KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u  pos: %lu",
+                      (uint) file->file, (ulong) pageno));
+
+  /*
+     Find the bucket in the hash table for the pair (file, pageno);
+     start contains the head of the bucket list,
+     hash_link points to the first member of the list
+  */
+  hash_link= *(*start= &pagecache->hash_root[PAGECACHE_HASH(pagecache,
+                                                            *file, pageno)]);
+#if defined(PAGECACHE_DEBUG)
+  cnt= 0;
+#endif
+  /* Look for an element for the pair (file, pageno) in the bucket chain */
+  while (hash_link &&
+         (hash_link->pageno != pageno ||
+          hash_link->file.file != file->file))
+  {
+    hash_link= hash_link->next;
+#if defined(PAGECACHE_DEBUG)
+    cnt++;
+    /*
+      More links were walked than are in use at all: print the visited
+      entries of the chain before the assertion below fails
+    */
+    if (! (cnt <= pagecache->hash_links_used))
+    {
+      int i;
+      for (i=0, hash_link= **start ;
+           i < cnt ; i++, hash_link= hash_link->next)
+      {
+        KEYCACHE_DBUG_PRINT("get_present_hash_link", ("fd: %u  pos: %lu",
+            (uint) hash_link->file.file, (ulong) hash_link->pageno));
+      }
+    }
+    KEYCACHE_DBUG_ASSERT(cnt <= pagecache->hash_links_used);
+#endif
+  }
+  if (hash_link)
+  {
+    /* Register the request for the page */
+    hash_link->requests++;
+  }
+  /*
+    As soon as the caller will release the page cache's lock, "hash_link"
+    will be potentially obsolete (unusable) information.
+  */
+  DBUG_RETURN(hash_link);
+}
+
+
+/*
+  Get the hash link for a page
+
+  SYNOPSIS
+    get_hash_link()
+    pagecache            Pagecache reference
+    file                 file ID
+    pageno               page number in the file
+
+  NOTES
+    If the page has no hash link yet, one is taken from the list of free
+    hash links or, failing that, from the never used part of
+    hash_link_root, and inserted into the bucket of the page.
+    If no link is available, THREAD builds queue the caller in
+    waiting_for_hash_link, wait on the thread's suspend condition and then
+    repeat the lookup; non-THREAD builds assert.
+
+  RETURN
+    hash link for the page, with its request counter incremented
+*/
+
+static PAGECACHE_HASH_LINK *get_hash_link(PAGECACHE *pagecache,
+                                          PAGECACHE_FILE *file,
+                                          pgcache_page_no_t pageno)
+{
+  reg1 PAGECACHE_HASH_LINK *hash_link;
+  PAGECACHE_HASH_LINK **start;
+
+  KEYCACHE_DBUG_PRINT("get_hash_link", ("fd: %u  pos: %lu",
+                      (uint) file->file, (ulong) pageno));
+
+restart:
+  /* try to find the page in the cache */
+  hash_link= get_present_hash_link(pagecache, file, pageno,
+                                   &start);
+  if (!hash_link)
+  {
+    /* There is no hash link in the hash table for the pair (file, pageno) */
+    if (pagecache->free_hash_list)
+    {
+      /* Re-use a previously released hash link */
+      hash_link= pagecache->free_hash_list;
+      pagecache->free_hash_list= hash_link->next;
+    }
+    else if (pagecache->hash_links_used < pagecache->hash_links)
+    {
+      /* Take the next never used element of the hash link array */
+      hash_link= &pagecache->hash_link_root[pagecache->hash_links_used++];
+    }
+    else
+    {
+#ifdef THREAD
+      /* Wait for a free hash link */
+      struct st_my_thread_var *thread= my_thread_var;
+      PAGECACHE_PAGE page;
+      KEYCACHE_DBUG_PRINT("get_hash_link", ("waiting"));
+      /*
+        Describe the wanted page in a local and publish it through
+        opt_info; unlink_hash() reads it from the queued threads
+      */
+      page.file= *file;
+      page.pageno= pageno;
+      thread->opt_info= (void *) &page;
+      wqueue_link_into_queue(&pagecache->waiting_for_hash_link, thread);
+      KEYCACHE_DBUG_PRINT("get_hash_link: wait",
+                        ("suspend thread %ld", thread->id));
+      pagecache_pthread_cond_wait(&thread->suspend,
+                                 &pagecache->cache_lock);
+      thread->opt_info= NULL;
+#else
+      KEYCACHE_DBUG_ASSERT(0);
+#endif
+      DBUG_PRINT("info", ("restarting..."));
+      /* Repeat the lookup from the beginning */
+      goto restart;
+    }
+    hash_link->file= *file;
+    DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+    hash_link->pageno= pageno;
+    link_hash(start, hash_link);
+    /* Register the request for the page */
+    hash_link->requests++;
+  }
+
+  return hash_link;
+}
+
+
+/*
+  Get a block for the file page requested by a pagecache read/write operation;
+  If the page is not in the cache return a free block, if there is none
+  return the lru block after saving its buffer if the page is dirty.
+
+  SYNOPSIS
+
+    find_block()
+      pagecache            pointer to a page cache data structure
+      file                handler for the file to read page from
+      pageno              number of the page in the file
+      init_hits_left      how to initialize the block counter for the page
+      wrmode              <-> get for writing
+      reg_req             Register request to the page
+      page_st        out  {PAGE_READ,PAGE_TO_BE_READ,PAGE_WAIT_TO_BE_READ}
+
+  RETURN VALUE
+    Pointer to the found block if successful, 0 - otherwise
+
+  NOTES.
+    For the page from file positioned at pageno the function checks whether
+    the page is in the key cache specified by the first parameter.
+    If this is the case it immediately returns the block.
+    If not, the function first chooses a block for this page. If there are
+    no unused blocks left in the key cache, the function takes the block
+    at the very beginning of the warm sub-chain. It saves the page in that
+    block if it's dirty before returning the pointer to it.
+    The function returns in the page_st parameter the following values:
+      PAGE_READ         - if page already in the block,
+      PAGE_TO_BE_READ   - if it is to be read yet by the current thread
+      PAGE_WAIT_TO_BE_READ - if it is to be read by another thread
+    If an error occurs the PCBLOCK_ERROR bit is set in the block status.
+    It might happen that there are no blocks in LRU chain (in warm part) -
+    all blocks  are unlinked for some read/write operations. Then the function
+    waits until first of this operations links any block back.
+*/
+
+static PAGECACHE_BLOCK_LINK *find_block(PAGECACHE *pagecache,
+                                        PAGECACHE_FILE *file,
+                                        pgcache_page_no_t pageno,
+                                        int init_hits_left,
+                                        my_bool wrmode,
+                                        my_bool reg_req,
+                                        int *page_st)
+{
+  PAGECACHE_HASH_LINK *hash_link;
+  PAGECACHE_BLOCK_LINK *block;
+  int error= 0;
+  int page_status;
+
+  DBUG_ENTER("find_block");
+  KEYCACHE_THREAD_TRACE("find_block:begin");
+  DBUG_PRINT("enter", ("fd: %d  pos: %lu  wrmode: %d",
+                       file->file, (ulong) pageno, wrmode));
+  KEYCACHE_DBUG_PRINT("find_block", ("fd: %d  pos: %lu  wrmode: %d",
+                                     file->file, (ulong) pageno,
+                                     wrmode));
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+  DBUG_EXECUTE("check_pagecache",
+               test_key_cache(pagecache, "start of find_block", 0););
+#endif
+
+restart:
+  /* Find the hash link for the requested page (file, pageno) */
+  hash_link= get_hash_link(pagecache, file, pageno);
+
+  /* -1 means "not decided yet"; asserted to be replaced before returning */
+  page_status= -1;
+  if ((block= hash_link->block) &&
+      block->hash_link == hash_link && (block->status & PCBLOCK_READ))
+    page_status= PAGE_READ;
+
+  if (wrmode && pagecache->resize_in_flush)
+  {
+    /* This is a write request during the flush phase of a resize operation */
+
+    /*
+      Every exit from this branch goes through DBUG_RETURN so that the
+      call frame opened by DBUG_ENTER above is always closed.
+    */
+    if (page_status != PAGE_READ)
+    {
+      /* We don't need the page in the cache: we are going to write on disk */
+      DBUG_ASSERT(hash_link->requests > 0);
+      hash_link->requests--;
+      unlink_hash(pagecache, hash_link);
+      DBUG_RETURN(0);
+    }
+    if (!(block->status & PCBLOCK_IN_FLUSH))
+    {
+      DBUG_ASSERT(hash_link->requests > 0);
+      hash_link->requests--;
+      /*
+        Remove block to invalidate the page in the block buffer
+        as we are going to write directly on disk.
+        Although we have an exclusive lock for the updated key part
+        the control can be yielded by the current thread as we might
+        have unfinished readers of other key parts in the block
+        buffer. Still we are guaranteed not to have any readers
+        of the key part we are writing into until the block is
+        removed from the cache as we set the PCBLOCK_REASSIGNED
+        flag (see the code below that handles reading requests).
+      */
+      free_block(pagecache, block);
+      DBUG_RETURN(0);
+    }
+    /* Wait until the page is flushed on disk */
+    DBUG_ASSERT(hash_link->requests > 0);
+    hash_link->requests--;
+    {
+#ifdef THREAD
+      struct st_my_thread_var *thread= my_thread_var;
+      wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+      do
+      {
+        KEYCACHE_DBUG_PRINT("find_block: wait",
+                            ("suspend thread %ld", thread->id));
+        pagecache_pthread_cond_wait(&thread->suspend,
+                                   &pagecache->cache_lock);
+      }
+      while(thread->next);
+#else
+      KEYCACHE_DBUG_ASSERT(0);
+      /*
+        Given the use of "resize_in_flush", it seems impossible
+        that this whole branch is ever entered in single-threaded case
+        because "(wrmode && pagecache->resize_in_flush)" cannot be true.
+        TODO: Check this, and then put the whole branch into the
+        "#ifdef THREAD" guard.
+      */
+#endif
+    }
+    /* Invalidate page in the block if it has not been done yet */
+    if (block->status)
+      free_block(pagecache, block);
+    DBUG_RETURN(0);
+  }
+
+  if (page_status == PAGE_READ &&
+      (block->status & (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)))
+  {
+    /* This is a request for a page to be removed from cache */
+
+    KEYCACHE_DBUG_PRINT("find_block",
+                        ("request for old page in block: %u  "
+                         "wrmode: %d  block->status: %d",
+                         PCBLOCK_NUMBER(pagecache, block), wrmode,
+                         block->status));
+    /*
+       Only reading requests can proceed until the old dirty page is flushed,
+       all others are to be suspended, then resubmitted
+    */
+    if (!wrmode && !(block->status & PCBLOCK_REASSIGNED))
+    {
+      if (reg_req)
+        reg_requests(pagecache, block, 1);
+    }
+    else
+    {
+      DBUG_ASSERT(hash_link->requests > 0);
+      hash_link->requests--;
+      KEYCACHE_DBUG_PRINT("find_block",
+                          ("request waiting for old page to be saved"));
+      {
+#ifdef THREAD
+        struct st_my_thread_var *thread= my_thread_var;
+        /* Put the request into the queue of those waiting for the old page */
+        wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+        /* Wait until the request can be resubmitted */
+        do
+        {
+          KEYCACHE_DBUG_PRINT("find_block: wait",
+                              ("suspend thread %ld", thread->id));
+          pagecache_pthread_cond_wait(&thread->suspend,
+                                     &pagecache->cache_lock);
+        }
+        while(thread->next);
+#else
+        KEYCACHE_DBUG_ASSERT(0);
+          /* No parallel requests in single-threaded case */
+#endif
+      }
+      KEYCACHE_DBUG_PRINT("find_block",
+                          ("request for old page resubmitted"));
+      DBUG_PRINT("info", ("restarting..."));
+      /* Resubmit the request */
+      goto restart;
+    }
+  }
+  else
+  {
+    /* This is a request for a new page or for a page not to be removed */
+    if (! block)
+    {
+      /* No block is assigned for the page yet */
+      if (pagecache->blocks_unused)
+      {
+        if (pagecache->free_block_list)
+        {
+          /* There is a block in the free list. */
+          block= pagecache->free_block_list;
+          pagecache->free_block_list= block->next_used;
+          block->next_used= NULL;
+        }
+        else
+        {
+          /* There are some never used blocks, take first of them */
+          block= &pagecache->block_root[pagecache->blocks_used];
+          block->buffer= ADD_TO_PTR(pagecache->block_mem,
+                                    ((ulong) pagecache->blocks_used*
+                                     pagecache->block_size),
+                                    uchar*);
+          pagecache->blocks_used++;
+        }
+        pagecache->blocks_unused--;
+        DBUG_ASSERT(block->wlocks == 0);
+        DBUG_ASSERT(block->rlocks == 0);
+        DBUG_ASSERT(block->rlocks_queue == 0);
+        DBUG_ASSERT(block->pins == 0);
+        block->status= 0;
+#ifndef DBUG_OFF
+        block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+        block->requests= 1;
+        block->temperature= PCBLOCK_COLD;
+        block->hits_left= init_hits_left;
+        block->last_hit_time= 0;
+        block->rec_lsn= LSN_MAX;
+        link_to_file_list(pagecache, block, file, 0);
+        block->hash_link= hash_link;
+        hash_link->block= block;
+        page_status= PAGE_TO_BE_READ;
+        DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+                            (ulong)block));
+        KEYCACHE_DBUG_PRINT("find_block",
+                            ("got free or never used block %u",
+                             PCBLOCK_NUMBER(pagecache, block)));
+      }
+      else
+      {
+        /* There are no never used blocks, use a block from the LRU chain */
+
+        /*
+          Wait until a new block is added to the LRU chain;
+          several threads might wait here for the same page,
+          all of them must get the same block
+        */
+
+#ifdef THREAD
+        if (! pagecache->used_last)
+        {
+          struct st_my_thread_var *thread= my_thread_var;
+          thread->opt_info= (void *) hash_link;
+          wqueue_link_into_queue(&pagecache->waiting_for_block, thread);
+          do
+          {
+            KEYCACHE_DBUG_PRINT("find_block: wait",
+                                ("suspend thread %ld", thread->id));
+            pagecache_pthread_cond_wait(&thread->suspend,
+                                       &pagecache->cache_lock);
+          }
+          while (thread->next);
+          thread->opt_info= NULL;
+        }
+#else
+        KEYCACHE_DBUG_ASSERT(pagecache->used_last);
+#endif
+        /* A block may have been attached to our hash link while we waited */
+        block= hash_link->block;
+        if (! block)
+        {
+          /*
+             Take the first block from the LRU chain
+             unlinking it from the chain
+          */
+          block= pagecache->used_last->next_used;
+          block->hits_left= init_hits_left;
+          block->last_hit_time= 0;
+          if (reg_req)
+            reg_requests(pagecache, block, 1);
+          hash_link->block= block;
+        }
+        PCBLOCK_INFO(block);
+        DBUG_ASSERT(block->wlocks == 0);
+        DBUG_ASSERT(block->rlocks == 0);
+        DBUG_ASSERT(block->rlocks_queue == 0);
+        DBUG_ASSERT(block->pins == 0);
+
+        if (block->hash_link != hash_link &&
+            ! (block->status & PCBLOCK_IN_SWITCH) )
+        {
+          /* this is a primary request for a new page */
+          DBUG_ASSERT(block->wlocks == 0);
+          DBUG_ASSERT(block->rlocks == 0);
+          DBUG_ASSERT(block->rlocks_queue == 0);
+          DBUG_ASSERT(block->pins == 0);
+          block->status|= PCBLOCK_IN_SWITCH;
+
+          KEYCACHE_DBUG_PRINT("find_block",
+                              ("got block %u for new page",
+                               PCBLOCK_NUMBER(pagecache, block)));
+
+          if (block->status & PCBLOCK_CHANGED)
+          {
+            /* The block contains a dirty page - push it out of the cache */
+
+            KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+            /* The cache lock is released for the duration of the write */
+            pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+            /*
+              The call is thread safe because only the current
+              thread might change the block->hash_link value
+            */
+            DBUG_ASSERT(block->pins == 0);
+            error= pagecache_fwrite(pagecache,
+                                    &block->hash_link->file,
+                                    block->buffer,
+                                    block->hash_link->pageno,
+                                    block->type,
+                                    pagecache->readwrite_flags);
+            pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+            pagecache->global_cache_write++;
+          }
+
+          block->status|= PCBLOCK_REASSIGNED;
+          if (block->hash_link)
+          {
+            /*
+              Wait until all pending read requests
+              for this page are executed
+              (we could have avoided this waiting, if we had read
+              a page in the cache in a sweep, without yielding control)
+            */
+            wait_for_readers(pagecache, block);
+
+            /* Remove the hash link for this page from the hash table */
+            unlink_hash(pagecache, block->hash_link);
+            /* All pending requests for this page must be resubmitted */
+#ifdef THREAD
+            if (block->wqueue[COND_FOR_SAVED].last_thread)
+              wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+          }
+          link_to_file_list(pagecache, block, file,
+                            (my_bool)(block->hash_link ? 1 : 0));
+          PCBLOCK_INFO(block);
+          /* A failed write of the old page is recorded in the block */
+          block->status= error ? PCBLOCK_ERROR : 0;
+          block->error=  (int16) my_errno;
+#ifndef DBUG_OFF
+          block->type= PAGECACHE_EMPTY_PAGE;
+          if (error)
+            my_debug_put_break_here();
+#endif
+          block->hash_link= hash_link;
+          page_status= PAGE_TO_BE_READ;
+          DBUG_PRINT("info", ("page to be read set for page 0x%lx",
+                              (ulong)block));
+
+          KEYCACHE_DBUG_ASSERT(block->hash_link->block == block);
+          KEYCACHE_DBUG_ASSERT(hash_link->block->hash_link == hash_link);
+        }
+        else
+        {
+          /* This is for secondary requests for a new page only */
+          KEYCACHE_DBUG_PRINT("find_block",
+                              ("block->hash_link: %p  hash_link: %p  "
+                               "block->status: %u", block->hash_link,
+                               hash_link, block->status ));
+          page_status= (((block->hash_link == hash_link) &&
+                         (block->status & PCBLOCK_READ)) ?
+                        PAGE_READ : PAGE_WAIT_TO_BE_READ);
+        }
+      }
+    }
+    else
+    {
+      /* A block is already assigned to the hash link of the page */
+      if (reg_req)
+        reg_requests(pagecache, block, 1);
+      KEYCACHE_DBUG_PRINT("find_block",
+                          ("block->hash_link: %p  hash_link: %p  "
+                           "block->status: %u", block->hash_link,
+                           hash_link, block->status ));
+      page_status= (((block->hash_link == hash_link) &&
+                     (block->status & PCBLOCK_READ)) ?
+                    PAGE_READ : PAGE_WAIT_TO_BE_READ);
+    }
+  }
+
+  KEYCACHE_DBUG_ASSERT(page_status != -1);
+  *page_st= page_status;
+  DBUG_PRINT("info",
+             ("block: 0x%lx  fd: %u  pos: %lu  block->status: %u  page_status: %u",
+              (ulong) block, (uint) file->file,
+              (ulong) pageno, block->status, (uint) page_status));
+  KEYCACHE_DBUG_PRINT("find_block",
+                      ("block: 0x%lx  fd: %d  pos: %lu  block->status: %u  page_status: %d",
+                       (ulong) block,
+                       file->file, (ulong) pageno, block->status,
+                       page_status));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+  DBUG_EXECUTE("check_pagecache",
+               test_key_cache(pagecache, "end of find_block",0););
+#endif
+  KEYCACHE_THREAD_TRACE("find_block:end");
+  DBUG_RETURN(block);
+}
+
+
+/*
+  Add one pin to the block
+
+  In builds with DBUG enabled a PAGECACHE_PIN_INFO record holding the
+  calling thread is allocated and linked into block->pin_list as well.
+*/
+
+static void add_pin(PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("add_pin");
+  DBUG_PRINT("enter", ("block: 0x%lx  pins: %u",
+                       (ulong) block, block->pins));
+  PCBLOCK_INFO(block);
+  block->pins+= 1;
+#ifndef DBUG_OFF
+  {
+    PAGECACHE_PIN_INFO *pin_rec;
+    pin_rec= (PAGECACHE_PIN_INFO *) my_malloc(sizeof(*pin_rec), MYF(0));
+    pin_rec->thread= my_thread_var;
+    info_link(&block->pin_list, pin_rec);
+  }
+#endif
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Remove one pin from the block
+
+  In builds with DBUG enabled the matching record is looked up in
+  block->pin_list with info_find() (to which 'any' is passed on), then
+  unlinked and freed. With DBUG_OFF 'any' is not used.
+*/
+
+static void remove_pin(PAGECACHE_BLOCK_LINK *block, my_bool any
+#ifdef DBUG_OFF
+                       __attribute__((unused))
+#endif
+                       )
+{
+  DBUG_ENTER("remove_pin");
+  DBUG_PRINT("enter", ("block: 0x%lx  pins: %u  any: %d",
+                       (ulong) block, block->pins, (int) any));
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->pins > 0);
+  block->pins-= 1;
+#ifndef DBUG_OFF
+  {
+    PAGECACHE_PIN_INFO *pin_rec;
+    pin_rec= info_find(block->pin_list, my_thread_var, any);
+    DBUG_ASSERT(pin_rec != 0);
+    info_unlink(pin_rec);
+    my_free(pin_rec, MYF(0));
+  }
+#endif
+  DBUG_VOID_RETURN;
+}
+#ifndef DBUG_OFF
+/*
+  Lock bookkeeping used only when DBUG is enabled; with DBUG_OFF the three
+  names below are defined as empty macros.
+  The records are PAGECACHE_LOCK_INFO objects kept in block->lock_list and
+  handled with the PAGECACHE_PIN_INFO list helpers through casts.
+*/
+
+/* Allocate a lock record for the calling thread and link it to the block */
+static void info_add_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+  PAGECACHE_LOCK_INFO *info=
+    (PAGECACHE_LOCK_INFO *)my_malloc(sizeof(PAGECACHE_LOCK_INFO), MYF(0));
+  info->thread= my_thread_var;
+  info->write_lock= wl;
+  info_link((PAGECACHE_PIN_INFO **)&block->lock_list,
+	    (PAGECACHE_PIN_INFO *)info);
+}
+/* Find the lock record of the calling thread, unlink it and free it */
+static void info_remove_lock(PAGECACHE_BLOCK_LINK *block)
+{
+  PAGECACHE_LOCK_INFO *info=
+    (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+                                     my_thread_var, FALSE);
+  DBUG_ASSERT(info != 0);
+  info_unlink((PAGECACHE_PIN_INFO *)info);
+  my_free(info, MYF(0));
+}
+/*
+  Set the write_lock flag in the lock record of the calling thread to wl;
+  asserts that the flag really changes
+*/
+static void info_change_lock(PAGECACHE_BLOCK_LINK *block, my_bool wl)
+{
+  PAGECACHE_LOCK_INFO *info=
+    (PAGECACHE_LOCK_INFO *)info_find((PAGECACHE_PIN_INFO *)block->lock_list,
+                                     my_thread_var, FALSE);
+  DBUG_ASSERT(info != 0);
+  DBUG_ASSERT(info->write_lock != wl);
+  info->write_lock= wl;
+}
+#else
+/* No lock bookkeeping without DBUG */
+#define info_add_lock(B,W)
+#define info_remove_lock(B)
+#define info_change_lock(B,W)
+#endif
+
+
+/**
+  @brief waiting for lock for read and write lock
+
+  @param pagecache       pointer to a page cache data structure
+  @param block           the block to work with
+  @param file            file of the block when it was locked
+  @param pageno          page number of the block when it was locked
+  @param lock_type       MY_PTHREAD_LOCK_READ or MY_PTHREAD_LOCK_WRITE
+
+  @note In THREAD builds the caller is queued in
+        block->wqueue[COND_FOR_WRLOCK] and waits on its suspend condition
+        until it has been taken out of the queue. After the wait the block
+        is checked: if it is being switched/reassigned or no longer holds
+        the page (file, pageno), the caller must retry.
+
+  @retval 0 OK
+  @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool pagecache_wait_lock(PAGECACHE *pagecache,
+                                  PAGECACHE_BLOCK_LINK *block,
+                                  PAGECACHE_FILE file,
+                                  pgcache_page_no_t pageno,
+                                  uint lock_type)
+{
+  /* Lock failed we will wait */
+#ifdef THREAD
+  struct st_my_thread_var *thread= my_thread_var;
+#endif
+  /*
+    DBUG_ENTER stays outside the THREAD guard: the DBUG_RETURN calls at
+    the end of the function are compiled in every configuration.
+  */
+  DBUG_ENTER("pagecache_wait_lock");
+#ifdef THREAD
+  DBUG_PRINT("info", ("fail to lock, waiting... 0x%lx", (ulong)block));
+  thread->lock_type= lock_type;
+  wqueue_add_to_queue(&block->wqueue[COND_FOR_WRLOCK], thread);
+  dec_counter_for_resize_op(pagecache);
+  do
+  {
+    KEYCACHE_DBUG_PRINT("get_wrlock: wait",
+                        ("suspend thread %ld", thread->id));
+    pagecache_pthread_cond_wait(&thread->suspend,
+                                &pagecache->cache_lock);
+  }
+  while(thread->next);
+#else
+  DBUG_ASSERT(0);
+#endif
+  PCBLOCK_INFO(block);
+  /* The block may have been given to another page while we were waiting */
+  if ((block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH)) ||
+      file.file != block->hash_link->file.file ||
+      pageno != block->hash_link->pageno)
+  {
+    DBUG_PRINT("info", ("the block 0x%lx changed => need retry "
+                        "status: %x  files %d != %d or pages %lu != %lu",
+                        (ulong)block, block->status,
+                        file.file, block->hash_link->file.file,
+                        (ulong) pageno, (ulong) block->hash_link->pageno));
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+}
+
+/**
+  @brief Put on the block write lock
+
+  @param pagecache       pointer to a page cache data structure
+  @param block           the block to work with
+
+  @note We have loose scheme for locking by the same thread:
+    * Downgrade to read lock if no other locks are taken
+    * Our scheme of locking allow for the same thread
+      - the same kind of lock
+      - taking read lock if write lock present
+      - downgrading to read lock if still other place the same
+        thread keep write lock
+    * But unlock operation number should be the same to lock operation.
+    * If we try to get read lock having active write locks we put read
+      locks to queue, and as soon as write lock(s) gone the read locks
+      from queue came in force.
+    * If read lock is unlocked earlier then it came to force it
+      just removed from the queue
+
+  @note The file and page number of the block are sampled on entry and
+    passed to pagecache_wait_lock(), which uses them to detect that the
+    block changed during the wait.
+
+  @retval 0 OK
+  @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool get_wrlock(PAGECACHE *pagecache,
+                          PAGECACHE_BLOCK_LINK *block)
+{
+  PAGECACHE_FILE file= block->hash_link->file;
+  pgcache_page_no_t pageno= block->hash_link->pageno;
+  pthread_t locker= pthread_self();
+  DBUG_ENTER("get_wrlock");
+  DBUG_PRINT("info", ("the block 0x%lx "
+                      "files %d(%d)  pages %lu(%lu)",
+                      (ulong) block,
+                      file.file, block->hash_link->file.file,
+                      (ulong) pageno, (ulong) block->hash_link->pageno));
+  PCBLOCK_INFO(block);
+  /*
+    We assume that the same thread will try write lock on block on which it
+    has already read lock.
+    NOTE(review): the loop below waits while block->rlocks is non-zero
+    without checking who holds the read locks - confirm whether the
+    sentence above was meant to read "will not try".
+  */
+  while ((block->wlocks && !pthread_equal(block->write_locker, locker)) ||
+         block->rlocks)
+  {
+    /* Lock failed we will wait */
+    if (pagecache_wait_lock(pagecache, block, file, pageno,
+                           MY_PTHREAD_LOCK_WRITE))
+      DBUG_RETURN(1);
+  }
+  /* we are doing it by global cache mutex protection, so it is OK */
+  block->wlocks++;
+  block->write_locker= locker;
+  DBUG_PRINT("info", ("WR lock set, block 0x%lx", (ulong)block));
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Put on the block read lock
+
+  @param pagecache       pointer to a page cache data structure
+  @param block           the block to work with
+
+  @note see note for get_wrlock().
+  @note If the calling thread itself holds the write lock, the read lock
+    is only counted in block->rlocks_queue; otherwise block->rlocks is
+    incremented.
+
+  @retval 0 OK
+  @retval 1 Can't lock this block, need retry
+*/
+
+static my_bool get_rdlock(PAGECACHE *pagecache,
+                          PAGECACHE_BLOCK_LINK *block)
+{
+  PAGECACHE_FILE file= block->hash_link->file;
+  pgcache_page_no_t pageno= block->hash_link->pageno;
+  pthread_t locker= pthread_self();
+  DBUG_ENTER("get_rdlock");
+  DBUG_PRINT("info", ("the block 0x%lx "
+                      "files %d(%d)  pages %lu(%lu)",
+                      (ulong) block,
+                      file.file, block->hash_link->file.file,
+                      (ulong) pageno, (ulong) block->hash_link->pageno));
+  PCBLOCK_INFO(block);
+  /* Wait only for write locks that are held by another thread */
+  while (block->wlocks && !pthread_equal(block->write_locker, locker))
+  {
+    /* Lock failed we will wait */
+    if (pagecache_wait_lock(pagecache, block, file, pageno,
+                           MY_PTHREAD_LOCK_READ))
+      DBUG_RETURN(1);
+  }
+  /* we are doing it by global cache mutex protection, so it is OK */
+  if (block->wlocks)
+  {
+    /* Our own write lock is active: queue the read lock */
+    DBUG_ASSERT(pthread_equal(block->write_locker, locker));
+    block->rlocks_queue++;
+    DBUG_PRINT("info", ("RD lock put into queue, block 0x%lx", (ulong)block));
+  }
+  else
+  {
+    block->rlocks++;
+    DBUG_PRINT("info", ("RD lock set, block 0x%lx", (ulong)block));
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Remove write lock from the block
+
+  @param block           the block to work with
+  @param read_lock       downgrade to read lock
+
+  @note see note for get_wrlock().
+  @note When the last write lock is released, the queued read locks
+    (including the one added for a downgrade) become active read locks and,
+    in THREAD builds, waiters in block->wqueue[COND_FOR_WRLOCK] are released.
+*/
+
+static void release_wrlock(PAGECACHE_BLOCK_LINK *block, my_bool read_lock)
+{
+  DBUG_ENTER("release_wrlock");
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->wlocks > 0);
+  DBUG_ASSERT(block->rlocks == 0);
+  DBUG_ASSERT(block->pins > 0);
+  /* A downgrade is counted as one more queued read lock */
+  if (read_lock)
+    block->rlocks_queue++;
+  if (block->wlocks == 1)
+  {
+    /* Last write lock: the queued read locks come into force */
+    block->rlocks= block->rlocks_queue;
+    block->rlocks_queue= 0;
+  }
+  block->wlocks--;
+  if (block->wlocks > 0)
+    DBUG_VOID_RETURN;                      /* Multiple write locked */
+  DBUG_PRINT("info", ("WR lock reset, block 0x%lx", (ulong)block));
+#ifdef THREAD
+  /* release all threads waiting for read lock or one waiting for write */
+  if (block->wqueue[COND_FOR_WRLOCK].last_thread)
+    wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]);
+#endif
+  PCBLOCK_INFO(block);
+  DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Remove read lock from the block
+
+  @param block           the block to work with
+
+  @note see note for get_wrlock().
+  @note While a write lock is present the read lock is still queued, so it
+    is just taken out of block->rlocks_queue. Otherwise block->rlocks is
+    decreased and, when it reaches zero, waiters in
+    block->wqueue[COND_FOR_WRLOCK] are released (THREAD builds).
+*/
+
+static void release_rdlock(PAGECACHE_BLOCK_LINK *block)
+{
+  DBUG_ENTER("release_rdlock");
+  PCBLOCK_INFO(block);
+  if (block->wlocks)
+  {
+    /* The read lock never came into force: drop it from the queue */
+    DBUG_ASSERT(pthread_equal(block->write_locker, pthread_self()));
+    DBUG_ASSERT(block->rlocks == 0);
+    DBUG_ASSERT(block->rlocks_queue > 0);
+    block->rlocks_queue--;
+    DBUG_PRINT("info", ("RD lock queue decreased, block 0x%lx", (ulong)block));
+    DBUG_VOID_RETURN;
+  }
+  DBUG_ASSERT(block->rlocks > 0);
+  DBUG_ASSERT(block->rlocks_queue == 0);
+  block->rlocks--;
+  DBUG_PRINT("info", ("RD lock decreased, block 0x%lx", (ulong)block));
+  if (block->rlocks > 0)
+    DBUG_VOID_RETURN;                      /* Multiple read locked */
+  DBUG_PRINT("info", ("RD lock reset, block 0x%lx", (ulong)block));
+#ifdef THREAD
+  /* release all threads waiting for read lock or one waiting for write */
+  if (block->wqueue[COND_FOR_WRLOCK].last_thread)
+    wqueue_release_one_locktype_from_queue(&block->wqueue[COND_FOR_WRLOCK]);
+#endif
+  PCBLOCK_INFO(block);
+  DBUG_VOID_RETURN;
+}
+
+/**
+  @brief Try to lock/unlock and pin/unpin the block
+
+  @param pagecache       pointer to a page cache data structure
+  @param block           the block to work with
+  @param lock            lock change mode
+  @param pin             pin change mode; only PAGECACHE_PIN and
+                         PAGECACHE_UNPIN are tested by this function
+  @param any             allow unpinning block pinned by any thread; possible
+                         only if not locked, see pagecache_unlock_by_link()
+
+  @retval 0 OK
+  @retval 1 Locking failed; one request was removed from block->hash_link
+*/
+
+static my_bool make_lock_and_pin(PAGECACHE *pagecache,
+                                 PAGECACHE_BLOCK_LINK *block,
+                                 enum pagecache_page_lock lock,
+                                 enum pagecache_page_pin pin,
+                                 my_bool any)
+{
+  DBUG_ENTER("make_lock_and_pin");
+
+  DBUG_PRINT("enter", ("block: 0x%lx", (ulong)block));
+#ifndef DBUG_OFF
+  if (block)
+  {
+    DBUG_PRINT("enter", ("block: 0x%lx (%u)  wrlocks: %u  rdlocks: %u  "
+                         "rdlocks_q: %u  pins: %u  lock: %s  pin: %s any %d",
+                         (ulong)block, PCBLOCK_NUMBER(pagecache, block),
+                         block->wlocks, block->rlocks, block->rlocks_queue,
+                         block->pins,
+                         page_cache_page_lock_str[lock],
+                         page_cache_page_pin_str[pin], (int)any));
+    PCBLOCK_INFO(block);
+  }
+#endif
+
+  DBUG_ASSERT(!any ||
+              ((lock == PAGECACHE_LOCK_LEFT_UNLOCKED) &&
+               (pin == PAGECACHE_UNPIN)));
+
+  switch (lock) {
+  case PAGECACHE_LOCK_WRITE:               /* free  -> write */
+    /* Write lock and pin the buffer (pinned whatever 'pin' says) */
+    if (get_wrlock(pagecache, block))
+    {
+      /* Couldn't lock because block changed status => need retry */
+      goto retry;
+    }
+
+    /* The cache is locked so there is nothing to be afraid of */
+    add_pin(block);
+    info_add_lock(block, 1);
+    break;
+  case PAGECACHE_LOCK_WRITE_TO_READ:       /* write -> read  */
+  case PAGECACHE_LOCK_WRITE_UNLOCK:        /* write -> free  */
+    /* Removes write lock and, for WRITE_TO_READ, puts read lock */
+    release_wrlock(block, lock == PAGECACHE_LOCK_WRITE_TO_READ);
+    /* fall through */
+  case PAGECACHE_LOCK_READ_UNLOCK:         /* read  -> free  */
+    if (lock == PAGECACHE_LOCK_READ_UNLOCK)
+      release_rdlock(block);
+    /* fall through */
+  case PAGECACHE_LOCK_LEFT_READLOCKED:     /* read  -> read  */
+    if (pin == PAGECACHE_UNPIN)
+    {
+      remove_pin(block, FALSE);
+    }
+    if (lock == PAGECACHE_LOCK_WRITE_TO_READ)
+    {
+      info_change_lock(block, 0);
+    }
+    else if (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+             lock == PAGECACHE_LOCK_READ_UNLOCK)
+    {
+      info_remove_lock(block);
+    }
+    break;
+  case PAGECACHE_LOCK_READ:                /* free  -> read  */
+    if (get_rdlock(pagecache, block))
+    {
+      /* Couldn't lock because block changed status => need retry */
+      goto retry;
+    }
+
+    if (pin == PAGECACHE_PIN)
+    {
+      /* The cache is locked so there is nothing to be afraid of */
+      add_pin(block);
+    }
+    info_add_lock(block, 0);
+    break;
+  case PAGECACHE_LOCK_LEFT_UNLOCKED:       /* free  -> free  */
+    if (pin == PAGECACHE_UNPIN)
+    {
+      remove_pin(block, any);
+    }
+    /* fall through */
+  case PAGECACHE_LOCK_LEFT_WRITELOCKED:    /* write -> write */
+    break; /* do nothing */
+  default:
+    DBUG_ASSERT(0); /* Should never happen */
+  }
+
+#ifndef DBUG_OFF
+  if (block)
+    PCBLOCK_INFO(block);
+#endif
+  DBUG_RETURN(0);
+retry:
+  DBUG_PRINT("INFO", ("Retry block 0x%lx", (ulong)block));
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block->hash_link->requests > 0);
+  block->hash_link->requests--;            /* drop one registered request */
+  PCBLOCK_INFO(block);
+  DBUG_RETURN(1);
+
+}
+
+
+/*
+  Read into a page cache block buffer from disk.
+
+  SYNOPSIS
+
+    read_block()
+      pagecache           pointer to a page cache data structure
+      block               block to which buffer the data is to be read
+      primary             <-> the current thread will read the data
+
+  RETURN VALUE
+    None
+
+  NOTES.
+    The function either reads a page data from file to the block buffer,
+    or waits until another thread reads it. What page to read is determined
+    by a block parameter - reference to a hash link for this page.
+    If an error occurs the PCBLOCK_ERROR bit is set in the block status.
+
+    On entry cache_lock is locked (it is released during the file read)
+*/
+
+static void read_block(PAGECACHE *pagecache,
+                       PAGECACHE_BLOCK_LINK *block,
+                       my_bool primary)
+{
+
+  DBUG_ENTER("read_block");
+  DBUG_PRINT("enter", ("read block: 0x%lx  primary: %d",
+                       (ulong)block, primary));
+  if (primary)
+  {
+    size_t error;
+    /*
+      This code is executed only by threads
+      that submitted primary requests
+    */
+
+    pagecache->global_cache_read++;
+    /* Page is not in buffer yet, is to be read from disk */
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+    /*
+      Here other threads may step in and register as secondary readers.
+      They will register in block->wqueue[COND_FOR_REQUESTED].
+    */
+    error= pagecache_fread(pagecache, &block->hash_link->file,
+                           block->buffer,
+                           block->hash_link->pageno,
+                           pagecache->readwrite_flags);
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    if (error)
+    {
+      block->status|= PCBLOCK_ERROR;
+      block->error=   (int16) my_errno;
+      my_debug_put_break_here();
+    }
+    else
+    {
+      block->status|= PCBLOCK_READ;
+      /* Let the file's read callback check the page; non-zero is an error */
+      if ((*block->hash_link->file.read_callback)(block->buffer,
+                                                  block->hash_link->pageno,
+                                                  block->hash_link->
+                                                  file.callback_data))
+      {
+        DBUG_PRINT("error", ("read callback problem"));
+        block->status|= PCBLOCK_ERROR;
+        block->error=  (int16) my_errno;
+        my_debug_put_break_here();
+      }
+    }
+    DBUG_PRINT("read_block",
+               ("primary request: new page in cache"));
+    /* Signal that all pending requests for this page now can be processed */
+#ifdef THREAD
+    if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+      wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+  }
+  else
+  {
+    /*
+      This code is executed only by threads
+      that submitted secondary requests
+    */
+
+#ifdef THREAD
+      struct st_my_thread_var *thread= my_thread_var;
+      /* Put the request into a queue and wait until it can be processed */
+      wqueue_add_to_queue(&block->wqueue[COND_FOR_REQUESTED], thread);
+      do
+      {
+        DBUG_PRINT("read_block: wait",
+                  ("suspend thread %ld", thread->id));
+        pagecache_pthread_cond_wait(&thread->suspend,
+                                   &pagecache->cache_lock);
+      }
+      while (thread->next);                /* loop while thread->next is set */
+#else
+      KEYCACHE_DBUG_ASSERT(0);
+      /* No parallel requests in single-threaded case */
+#endif
+    DBUG_PRINT("read_block",
+               ("secondary request: new page in cache"));
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   @brief Set LSN on the page to the given one if the given LSN is bigger
+
+   @param  pagecache        pointer to a page cache data structure
+   @param  lsn              LSN to set
+   @param  block            block whose buffer holds the LSN to check and set
+*/
+
+static void check_and_set_lsn(PAGECACHE *pagecache,
+                              LSN lsn, PAGECACHE_BLOCK_LINK *block)
+{
+  LSN old;
+  DBUG_ENTER("check_and_set_lsn");
+  /*
+    In recovery, we can call _ma_unpin_all_pages() to put an LSN on a page
+    even though the page is a PAGECACHE_PLAIN_PAGE (transactionality is
+    temporarily disabled to not log REDOs).
+  */
+  DBUG_ASSERT((block->type == PAGECACHE_LSN_PAGE) || maria_in_recovery);
+  old= lsn_korr(block->buffer);
+  DBUG_PRINT("info", ("old lsn: (%lu, 0x%lx)  new lsn: (%lu, 0x%lx)",
+                      LSN_IN_PARTS(old), LSN_IN_PARTS(lsn)));
+  if (cmp_translog_addr(lsn, old) > 0)
+  {
+
+    DBUG_ASSERT(block->type != PAGECACHE_READ_UNKNOWN_PAGE);
+    lsn_store(block->buffer, lsn);
+    /* we stored the LSN in the page, so we dirtied it */
+    if (!(block->status & PCBLOCK_CHANGED))
+      link_to_changed_list(pagecache, block);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Unlock/unpin page and put LSN stamp if it is needed
+
+  @param pagecache      pointer to a page cache data structure
+  @param file           handler for the file for the block of data to be read
+  @param pageno         number of the block of data in the file
+  @param lock           lock change
+  @param pin            pin page
+  @param first_REDO_LSN_for_page do not set it if it is zero
+  @param lsn            if it is not LSN_IMPOSSIBLE (0) and it
+                        is bigger than LSN on the page it will be written on
+                        the page
+  @param was_changed    should be true if the page was write locked with
+                        direct link giving and the page was changed
+
+  @note
+    Pinning uses the request registration mechanism; it works this way:
+                                | beginning   | ending        |
+                                | of func.    | of func.      |
+    ----------------------------+-------------+---------------+
+    PAGECACHE_PIN_LEFT_PINNED   |      -      |       -       |
+    PAGECACHE_PIN_LEFT_UNPINNED | reg request | unreg request |
+    PAGECACHE_PIN               | reg request |       -       |
+    PAGECACHE_UNPIN             |      -      | unreg request |
+
+
+*/
+
+void pagecache_unlock(PAGECACHE *pagecache,
+                      PAGECACHE_FILE *file,
+                      pgcache_page_no_t pageno,
+                      enum pagecache_page_lock lock,
+                      enum pagecache_page_pin pin,
+                      LSN first_REDO_LSN_for_page,
+                      LSN lsn, my_bool was_changed)
+{
+  PAGECACHE_BLOCK_LINK *block;
+  int page_st;
+  DBUG_ENTER("pagecache_unlock");
+  DBUG_PRINT("enter", ("fd: %u  page: %lu  %s  %s",
+                       (uint) file->file, (ulong) pageno,
+                       page_cache_page_lock_str[lock],
+                       page_cache_page_pin_str[pin]));
+  /* we do not allow any lock/pin increasing here */
+  DBUG_ASSERT(pin != PAGECACHE_PIN);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  /*
+    As long as we hold a lock the cache can be used, and we do hold one
+    because we want to unlock.
+  */
+  DBUG_ASSERT(pagecache->can_be_used);
+
+  inc_counter_for_resize_op(pagecache);
+  /* See NOTE for pagecache_unlock about registering requests */
+  block= find_block(pagecache, file, pageno, 0, 0,
+                    pin == PAGECACHE_PIN_LEFT_UNPINNED, &page_st);
+  PCBLOCK_INFO(block);
+  DBUG_ASSERT(block != 0 && page_st == PAGE_READ);
+  if (first_REDO_LSN_for_page)
+  {
+    DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK);
+    DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+    pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+  }
+  if (lsn != LSN_IMPOSSIBLE)
+    check_and_set_lsn(pagecache, lsn, block);
+
+  /* if we lock for write we must link the block to changed blocks */
+  DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 ||
+              (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+               lock == PAGECACHE_LOCK_WRITE_TO_READ ||
+               lock == PAGECACHE_LOCK_LEFT_WRITELOCKED));
+  /*
+    if was_changed then status should be PCBLOCK_DIRECT_W or marked
+    as dirty
+  */
+  DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) ||
+              (block->status & PCBLOCK_CHANGED));
+  if ((block->status & PCBLOCK_DIRECT_W) &&
+      (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+       lock == PAGECACHE_LOCK_WRITE_TO_READ))
+  {
+    if (!(block->status & PCBLOCK_CHANGED) && was_changed)
+      link_to_changed_list(pagecache, block);
+    block->status&= ~PCBLOCK_DIRECT_W;
+    DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+                        (ulong) block));
+  }
+
+  if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+  {
+    DBUG_ASSERT(0); /* should not happen */
+  }
+
+  remove_reader(block);
+  /*
+    Link the block into the LRU chain if it's the last submitted request
+    for the block and block will not be pinned.
+    See NOTE for pagecache_unlock about registering requests.
+  */
+  if (pin != PAGECACHE_PIN_LEFT_PINNED)
+    unreg_request(pagecache, block, 1);
+
+  dec_counter_for_resize_op(pagecache);
+
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unpin page
+
+  SYNOPSIS
+    pagecache_unpin()
+    pagecache           pointer to a page cache data structure
+    file                handler for the file for the block of data to be read
+    pageno              number of the block of data in the file
+    lsn                 if it is not LSN_IMPOSSIBLE (0) and it
+                        is bigger than LSN on the page it will be written on
+                        the page
+*/
+
+void pagecache_unpin(PAGECACHE *pagecache,
+                     PAGECACHE_FILE *file,
+                     pgcache_page_no_t pageno,
+                     LSN lsn)
+{
+  PAGECACHE_BLOCK_LINK *block;
+  int page_st;
+  DBUG_ENTER("pagecache_unpin");
+  DBUG_PRINT("enter", ("fd: %u  page: %lu",
+                       (uint) file->file, (ulong) pageno));
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  /*
+    As long as we hold a lock the cache can be used, and we do hold one
+    because we want to unlock.
+  */
+  DBUG_ASSERT(pagecache->can_be_used);
+
+  inc_counter_for_resize_op(pagecache);
+  /* See NOTE for pagecache_unlock about registering requests */
+  block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st);
+  DBUG_ASSERT(block != 0);
+  DBUG_ASSERT(page_st == PAGE_READ);
+  /* we can't unpin such a page without unlocking it */
+  DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+  if (lsn != LSN_IMPOSSIBLE)
+    check_and_set_lsn(pagecache, lsn, block);
+
+  /*
+    we can just unpin only with keeping read lock because:
+    a) we can't pin without any lock
+    b) we can't unpin keeping write lock
+  */
+  if (make_lock_and_pin(pagecache, block,
+                        PAGECACHE_LOCK_LEFT_READLOCKED,
+                        PAGECACHE_UNPIN, FALSE))
+    DBUG_ASSERT(0);                           /* should not happen */
+
+  remove_reader(block);
+  /*
+    Link the block into the LRU chain if it's the last submitted request
+    for the block and block will not be pinned.
+    See NOTE for pagecache_unlock about registering requests
+  */
+  unreg_request(pagecache, block, 1);
+
+  dec_counter_for_resize_op(pagecache);
+
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+  @brief Unlock/unpin page and put LSN stamp if it is needed
+  (uses direct block/page pointer)
+
+  @param pagecache       pointer to a page cache data structure
+  @param link            direct link to page (returned by read or write)
+  @param lock            lock change
+  @param pin             pin page
+  @param first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0)
+  @param lsn             if it is not LSN_IMPOSSIBLE and it is bigger than
+                         LSN on the page it will be written on the page
+  @param was_changed     should be true if the page was write locked with
+                         direct link giving and the page was changed
+  @param any             allow unpinning block pinned by any thread; possible
+                         only if not locked
+
+  @note 'any' is a hack so that _ma_bitmap_unpin_all() is allowed to unpin
+  non-locked bitmap pages pinned by other threads. Because it always uses
+  PAGECACHE_LOCK_LEFT_UNLOCKED and PAGECACHE_UNPIN
+  (see write_changed_bitmap()), the hack is limited to these conditions.
+*/
+
+void pagecache_unlock_by_link(PAGECACHE *pagecache,
+                              PAGECACHE_BLOCK_LINK *block,
+                              enum pagecache_page_lock lock,
+                              enum pagecache_page_pin pin,
+                              LSN first_REDO_LSN_for_page,
+                              LSN lsn, my_bool was_changed,
+                              my_bool any)
+{
+  DBUG_ENTER("pagecache_unlock_by_link");
+  DBUG_PRINT("enter", ("block: 0x%lx  fd: %u  page: %lu  changed: %d  %s  %s",
+                       (ulong) block,
+                       (uint) block->hash_link->file.file,
+                       (ulong) block->hash_link->pageno, was_changed,
+                       page_cache_page_lock_str[lock],
+                       page_cache_page_pin_str[pin]));
+  /*
+    No lock/pin increase is allowed here (a direct link keeps the page pinned).
+    NOTE(review): LEFT_UNPINNED is asserted away yet tested below - confirm.
+  */
+  DBUG_ASSERT(pin != PAGECACHE_PIN);
+  DBUG_ASSERT(pin != PAGECACHE_PIN_LEFT_UNPINNED);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_READ);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_WRITE);
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  if (pin == PAGECACHE_PIN_LEFT_UNPINNED &&
+      lock == PAGECACHE_LOCK_READ_UNLOCK)
+  {
+    if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+      DBUG_ASSERT(0);                         /* should not happen */
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+    DBUG_VOID_RETURN;
+  }
+
+  /*
+    As long as we hold a lock the cache can be used, and we do hold one
+    because we want to unlock.
+  */
+  DBUG_ASSERT(pagecache->can_be_used);
+
+  inc_counter_for_resize_op(pagecache);
+  if (was_changed)
+  {
+    if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE)
+    {
+      /*
+        LOCK_READ_UNLOCK is ok here as the page may have first been locked
+        with a WRITE lock that was temporarily converted to a READ lock
+        before it's unpinned
+      */
+      DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+                  lock == PAGECACHE_LOCK_READ_UNLOCK);
+      DBUG_ASSERT(pin == PAGECACHE_UNPIN);
+      pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+    }
+    if (lsn != LSN_IMPOSSIBLE)
+      check_and_set_lsn(pagecache, lsn, block);
+    /*
+      Reset error flag. Mark also that page is active; This may not have
+      been the case if there was an error reading the page
+    */
+    block->status= (block->status & ~PCBLOCK_ERROR) | PCBLOCK_READ;
+  }
+
+  /* if we lock for write we must link the block to changed blocks */
+  DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 ||
+              (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+               lock == PAGECACHE_LOCK_WRITE_TO_READ ||
+               lock == PAGECACHE_LOCK_LEFT_WRITELOCKED));
+  /*
+    If was_changed then status should be PCBLOCK_DIRECT_W or marked
+    as dirty
+  */
+  DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) ||
+              (block->status & PCBLOCK_CHANGED));
+  if ((block->status & PCBLOCK_DIRECT_W) &&
+      (lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+       lock == PAGECACHE_LOCK_WRITE_TO_READ))
+  {
+    if (!(block->status & PCBLOCK_CHANGED) && was_changed)
+      link_to_changed_list(pagecache, block);
+    block->status&= ~PCBLOCK_DIRECT_W;
+    DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+                        (ulong) block));
+  }
+
+  if (make_lock_and_pin(pagecache, block, lock, pin, any))
+    DBUG_ASSERT(0);                           /* should not happen */
+
+  /*
+    Link the block into the LRU chain if it's the last submitted request
+    for the block and block will not be pinned.
+    See NOTE for pagecache_unlock about registering requests.
+  */
+  if (pin != PAGECACHE_PIN_LEFT_PINNED)
+    unreg_request(pagecache, block, 1);
+
+  dec_counter_for_resize_op(pagecache);
+
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Unpin page
+  (uses direct block/page pointer)
+
+  SYNOPSIS
+    pagecache_unpin_by_link()
+    pagecache           pointer to a page cache data structure
+    link                direct link to page (returned by read or write)
+    lsn                 if it is not LSN_IMPOSSIBLE (0) and it
+                        is bigger than LSN on the page it will be written on
+                        the page
+*/
+
+void pagecache_unpin_by_link(PAGECACHE *pagecache,
+                             PAGECACHE_BLOCK_LINK *block,
+                             LSN lsn)
+{
+  DBUG_ENTER("pagecache_unpin_by_link");
+  DBUG_PRINT("enter", ("block: 0x%lx  fd: %u page: %lu",
+                       (ulong) block,
+                       (uint) block->hash_link->file.file,
+                       (ulong) block->hash_link->pageno));
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  /*
+    As long as we hold a lock the cache can be used, and we do hold one
+    because we want to unlock.
+  */
+  DBUG_ASSERT(pagecache->can_be_used);
+  /* we can't unpin such a page without unlocking it */
+  DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+  inc_counter_for_resize_op(pagecache);
+
+  if (lsn != LSN_IMPOSSIBLE)
+    check_and_set_lsn(pagecache, lsn, block);
+
+  /*
+    We can just unpin only with keeping read lock because:
+    a) we can't pin without any lock
+    b) we can't unpin keeping write lock
+  */
+  if (make_lock_and_pin(pagecache, block,
+                        PAGECACHE_LOCK_LEFT_READLOCKED,
+                        PAGECACHE_UNPIN, FALSE))
+    DBUG_ASSERT(0); /* should not happen */
+
+  /*
+    Link the block into the LRU chain if it's the last submitted request
+    for the block and block will not be pinned.
+    See NOTE for pagecache_unlock about registering requests.
+  */
+  unreg_request(pagecache, block, 1);
+
+  dec_counter_for_resize_op(pagecache);
+
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+  DBUG_VOID_RETURN;
+}
+
+/* How the page lock is changed before and after a read/write, per lock mode */
+struct rw_lock_change
+{
+  my_bool need_lock_change; /* non-zero: lock must be changed at the end */
+  enum pagecache_page_lock new_lock; /* lock change at the beginning */
+  enum pagecache_page_lock unlock_lock; /* lock change at the end */
+};
+
+/* How the page pin is changed before and after a read/write, per lock mode */
+struct rw_pin_change
+{
+  enum pagecache_page_pin new_pin; /* pin change at the beginning */
+  enum pagecache_page_pin unlock_pin; /* pin change at the end */
+};
+
+/**
+  Depending on the lock which the user wants in pagecache_read(), we
+  need to acquire a first type of lock at the start of pagecache_read(),
+  and change it to a second type of lock at the end. For example, if the
+  user asked for no lock (PAGECACHE_LOCK_LEFT_UNLOCKED) this translates
+  into first taking a read lock PAGECACHE_LOCK_READ (to rightfully block
+  on existing write locks), then reading, then releasing the lock with
+  PAGECACHE_LOCK_READ_UNLOCK (the "1" below tells that a change is
+  needed). The table is indexed by the requested pagecache_page_lock.
+*/
+
+static struct rw_lock_change lock_to_read[8]=
+{
+  { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+    1,
+    PAGECACHE_LOCK_READ, PAGECACHE_LOCK_READ_UNLOCK
+  },
+  { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+    0,
+    PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_LEFT_READLOCKED
+  },
+  { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+    0,
+    PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_LEFT_WRITELOCKED
+  },
+  { /*PAGECACHE_LOCK_READ*/
+    1,
+    PAGECACHE_LOCK_READ, PAGECACHE_LOCK_LEFT_READLOCKED
+  },
+  { /*PAGECACHE_LOCK_WRITE*/
+    1,
+    PAGECACHE_LOCK_WRITE, PAGECACHE_LOCK_LEFT_WRITELOCKED
+  },
+  { /*PAGECACHE_LOCK_READ_UNLOCK*/
+    1,
+    PAGECACHE_LOCK_LEFT_READLOCKED, PAGECACHE_LOCK_READ_UNLOCK
+  },
+  { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+    1,
+    PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_UNLOCK
+  },
+  { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+    1,
+    PAGECACHE_LOCK_LEFT_WRITELOCKED, PAGECACHE_LOCK_WRITE_TO_READ
+  }
+};
+
+/**
+  Two sets of pin modes (like lock_to_read above, but for pinning), selected
+  by buff==0 in pagecache_read(): set 0 is used when the caller supplies a
+  buffer, set 1 when the caller is given a reference to the block instead.
+*/
+
+static struct rw_pin_change lock_to_pin[2][8]=
+{
+  {
+    { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED
+    },
+    { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED,
+    },
+    { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_PIN_LEFT_PINNED
+    },
+    { /*PAGECACHE_LOCK_READ*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED
+    },
+    { /*PAGECACHE_LOCK_WRITE*/
+      PAGECACHE_PIN,
+      PAGECACHE_PIN_LEFT_PINNED
+    },
+    { /*PAGECACHE_LOCK_READ_UNLOCK*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED
+    },
+    { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_UNPIN
+    },
+    { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_UNPIN
+    }
+  },
+  {
+    { /*PAGECACHE_LOCK_LEFT_UNLOCKED*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED
+    },
+    { /*PAGECACHE_LOCK_LEFT_READLOCKED*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED,
+    },
+    { /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_PIN_LEFT_PINNED
+    },
+    { /*PAGECACHE_LOCK_READ*/
+      PAGECACHE_PIN,
+      PAGECACHE_PIN_LEFT_PINNED
+    },
+    { /*PAGECACHE_LOCK_WRITE*/
+      PAGECACHE_PIN,
+      PAGECACHE_PIN_LEFT_PINNED
+    },
+    { /*PAGECACHE_LOCK_READ_UNLOCK*/
+      PAGECACHE_PIN_LEFT_UNPINNED,
+      PAGECACHE_PIN_LEFT_UNPINNED
+    },
+    { /*PAGECACHE_LOCK_WRITE_UNLOCK*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_UNPIN
+    },
+    { /*PAGECACHE_LOCK_WRITE_TO_READ*/
+      PAGECACHE_PIN_LEFT_PINNED,
+      PAGECACHE_PIN_LEFT_PINNED,
+    }
+  }
+};
+
+
+/*
+  @brief Read a block of data from a cached file into a buffer;
+
+  @param pagecache      pointer to a page cache data structure
+  @param file           handler for the file for the block of data to be read
+  @param pageno         number of the block of data in the file
+  @param level          determines the weight of the data
+  @param buff           buffer to where the data must be placed
+  @param type           type of the page
+  @param lock           lock change
+  @param link           link to the page if we pin it
+
+  @return address from where the data is placed if successful, 0 - otherwise.
+
+  @note Pin will be chosen according to lock parameter (see lock_to_pin)
+
+  @note 'buff', if not NULL, must be long-aligned.
+
+  @note  If buff==0 then we provide reference on the page so should keep the
+  page pinned.
+*/
+
+uchar *pagecache_read(PAGECACHE *pagecache,
+                      PAGECACHE_FILE *file,
+                      pgcache_page_no_t pageno,
+                      uint level,
+                      uchar *buff,
+                      enum pagecache_page_type type,
+                      enum pagecache_page_lock lock,
+                      PAGECACHE_BLOCK_LINK **page_link)
+{
+  my_bool error= 0;
+  enum pagecache_page_pin
+    new_pin= lock_to_pin[buff==0][lock].new_pin,
+    unlock_pin= lock_to_pin[buff==0][lock].unlock_pin;
+  PAGECACHE_BLOCK_LINK *fake_link;
+  my_bool reg_request;
+#ifndef DBUG_OFF
+  char llbuf[22];
+  DBUG_ENTER("pagecache_read");
+  DBUG_PRINT("enter", ("fd: %u  page: %s  buffer: 0x%lx level: %u  "
+                       "t:%s  (%d)%s->%s  %s->%s",
+                       (uint) file->file, ullstr(pageno, llbuf),
+                       (ulong) buff, level,
+                       page_cache_page_type_str[type],
+                       lock_to_read[lock].need_lock_change,
+                       page_cache_page_lock_str[lock_to_read[lock].new_lock],
+                       page_cache_page_lock_str[lock_to_read[lock].unlock_lock],
+                       page_cache_page_pin_str[new_pin],
+                       page_cache_page_pin_str[unlock_pin]));
+  DBUG_ASSERT(buff != 0 || (buff == 0 && (unlock_pin == PAGECACHE_PIN ||
+                                          unlock_pin == PAGECACHE_PIN_LEFT_PINNED)));
+  DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+#endif
+
+  if (!page_link)
+    page_link= &fake_link;
+  *page_link= 0;                                 /* Catch errors */
+
+restart:
+
+  if (pagecache->can_be_used)
+  {
+    /* Key cache is used */
+    PAGECACHE_BLOCK_LINK *block;
+    uint status;
+    int page_st;
+
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    if (!pagecache->can_be_used)
+    {
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      goto no_key_cache;
+    }
+
+    inc_counter_for_resize_op(pagecache);
+    pagecache->global_cache_r_requests++;
+    /* See NOTE for pagecache_unlock about registering requests. */
+    reg_request= ((new_pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+                  (new_pin == PAGECACHE_PIN));
+    block= find_block(pagecache, file, pageno, level,
+                      lock == PAGECACHE_LOCK_WRITE,
+                      reg_request, &page_st);
+    DBUG_PRINT("info", ("Block type: %s current type %s",
+                        page_cache_page_type_str[block->type],
+                        page_cache_page_type_str[type]));
+    if (((block->status & PCBLOCK_ERROR) == 0) && (page_st != PAGE_READ))
+    {
+      /* The requested page is to be read into the block buffer */
+      read_block(pagecache, block,
+                 (my_bool)(page_st == PAGE_TO_BE_READ));
+      DBUG_PRINT("info", ("read is done"));
+    }
+    /*
+      Assert after block is read. Imagine two concurrent SELECTs on same
+      table (thread1 and 2), which want to pagecache_read() the same
+      pageno/fileno. Thread1 calls find_block(), decides to evict a dirty
+      page from LRU; while it's writing this dirty page to disk, it is
+      pre-empted and thread2 runs its find_block(), gets the block (in
+      PAGE_TO_BE_READ state). This block is still containing the in-eviction
+      dirty page so has an its type, which cannot be tested.
+      So thread2 has to wait for read_block() to finish (when it wakes up in
+      read_block(), it's woken up by read_block() of thread1, which implies
+      that block's type was set to EMPTY by thread1 as part of find_block()).
+    */
+    DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+                block->type == type ||
+                type == PAGECACHE_LSN_PAGE ||
+                type == PAGECACHE_READ_UNKNOWN_PAGE ||
+                block->type == PAGECACHE_READ_UNKNOWN_PAGE);
+    if (type != PAGECACHE_READ_UNKNOWN_PAGE ||
+        block->type == PAGECACHE_EMPTY_PAGE)
+      block->type= type;
+
+    if (make_lock_and_pin(pagecache, block, lock_to_read[lock].new_lock,
+                          new_pin, FALSE))
+    {
+      /*
+        We failed to write lock the block, cache is unlocked,
+        we will try to get the block again.
+      */
+      if (reg_request)
+        unreg_request(pagecache, block, 1);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      DBUG_PRINT("info", ("restarting..."));
+      goto restart;
+    }
+
+    status= block->status;
+    if (!buff)
+    {
+      buff=  block->buffer;
+      /* possibly we will write here (resolved on unlock) */
+      if ((lock == PAGECACHE_LOCK_WRITE ||
+           lock == PAGECACHE_LOCK_LEFT_WRITELOCKED) &&
+          !(block->status & PCBLOCK_CHANGED))
+      {
+        block->status|= PCBLOCK_DIRECT_W;
+        DBUG_PRINT("info", ("Set PCBLOCK_DIRECT_W for block: 0x%lx",
+                            (ulong) block));
+      }
+    }
+    else
+    {
+      if (!(status & PCBLOCK_ERROR))
+      {
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+        pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+#endif
+
+        DBUG_ASSERT((pagecache->block_size & 511) == 0);
+        /* Copy data from the cache buffer */
+        bmove512(buff, block->buffer, pagecache->block_size);
+
+#if !defined(SERIALIZED_READ_FROM_CACHE)
+        pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+#endif
+      }
+      else
+        my_errno= block->error;
+    }
+
+    remove_reader(block);
+    if (lock_to_read[lock].need_lock_change)
+    {
+      if (make_lock_and_pin(pagecache, block,
+                            lock_to_read[lock].unlock_lock,
+                            unlock_pin, FALSE))
+        DBUG_ASSERT(0);
+    }
+    /*
+      Link the block into the LRU chain if it's the last submitted request
+      for the block and block will not be pinned.
+      See NOTE for pagecache_unlock about registering requests.
+    */
+    if (unlock_pin == PAGECACHE_PIN_LEFT_UNPINNED ||
+        unlock_pin == PAGECACHE_UNPIN)
+      unreg_request(pagecache, block, 1);
+    else
+      *page_link= block;
+
+    dec_counter_for_resize_op(pagecache);
+
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+    if (status & PCBLOCK_ERROR)
+    {
+      DBUG_ASSERT(my_errno != 0);
+      DBUG_PRINT("error", ("Got error %d when doing page read", my_errno));
+      DBUG_RETURN((uchar *) 0);
+    }
+
+    DBUG_RETURN(buff);
+  }
+
+no_key_cache:					/* Key cache is not used */
+
+  /* We can't use mutex here as the key cache may not be initialized */
+  pagecache->global_cache_r_requests++;
+  pagecache->global_cache_read++;
+  if (pagecache_fread(pagecache, file, buff, pageno,
+                      pagecache->readwrite_flags))
+    error= 1;
+  DBUG_RETURN(error ? (uchar*) 0 : buff);
+}
+
+
+/*
+  @brief Delete page from the buffer (common part for link and file/page)
+
+  @param pagecache      pointer to a page cache data structure
+  @param block          direct link to page (returned by read or write)
+  @param page_link      hash link of the block
+  @param flush          flush page if it is dirty
+
+  @retval 0 deleted or was not present at all
+  @retval 1 error
+
+  @note Expects pagecache->cache_lock to be held by the caller. The lock is
+  released only around the write of a dirty page and is held again when the
+  function returns.
+
+  @note dec_counter_for_resize_op() is called on every exit path (also on
+  error).
+
+  @note If writing a dirty page fails, the block gets PCBLOCK_ERROR and
+  my_errno recorded and the function returns without unlocking, unpinning
+  or freeing the block.
+*/
+
+static my_bool pagecache_delete_internal(PAGECACHE *pagecache,
+                                         PAGECACHE_BLOCK_LINK *block,
+                                         PAGECACHE_HASH_LINK *page_link,
+                                         my_bool flush)
+{
+  my_bool error= 0;
+  if (block->status & PCBLOCK_CHANGED)
+  {
+    if (flush)
+    {
+      /* The block contains a dirty page - push it out of the cache */
+
+      KEYCACHE_DBUG_PRINT("find_block", ("block is dirty"));
+
+      /* Do not hold the cache lock while the page is written */
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      /*
+        The call is thread safe because only the current
+        thread might change the block->hash_link value
+      */
+      DBUG_ASSERT(block->pins == 1);
+      error= pagecache_fwrite(pagecache,
+                              &block->hash_link->file,
+                              block->buffer,
+                              block->hash_link->pageno,
+                              block->type,
+                              pagecache->readwrite_flags);
+      pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+      /* The write is counted whether or not it succeeded */
+      pagecache->global_cache_write++;
+
+      if (error)
+      {
+        /* Remember the failure on the block; the block is kept as it is */
+        block->status|= PCBLOCK_ERROR;
+        block->error=   (int16) my_errno;
+        my_debug_put_break_here();
+        goto err;
+      }
+    }
+    /* The page is going away (flushed or dropped): no longer "changed" */
+    pagecache->blocks_changed--;
+    pagecache->global_blocks_changed--;
+    /*
+      free_block() will change the status and rec_lsn of the block so no
+      need to change them here.
+    */
+  }
+  /* Cache is locked, so we can release page before freeing it */
+  if (make_lock_and_pin(pagecache, block,
+                        PAGECACHE_LOCK_WRITE_UNLOCK,
+                        PAGECACHE_UNPIN, FALSE))
+    DBUG_ASSERT(0);
+  DBUG_ASSERT(block->hash_link->requests > 0);
+  /* Drop the hash link request that the caller registered */
+  page_link->requests--;
+  /* See NOTE for pagecache_unlock about registering requests. */
+  free_block(pagecache, block);
+
+err:
+  dec_counter_for_resize_op(pagecache);
+  return error;
+}
+
+
+/*
+  @brief Delete page from the buffer by link
+
+  @param pagecache      pointer to a page cache data structure
+  @param block          direct link to page (returned by read or write)
+  @param lock           lock change
+  @param flush          flush page if it is dirty
+
+  @retval 0 deleted, or the page cache cannot be used
+  @retval 1 error
+
+  @note lock  can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was
+  write locked before) or PAGECACHE_LOCK_WRITE (delete will write
+  lock page before delete)
+
+  @note The block must already be pinned by the caller.
+*/
+
+my_bool pagecache_delete_by_link(PAGECACHE *pagecache,
+                                 PAGECACHE_BLOCK_LINK *block,
+                                 enum pagecache_page_lock lock,
+                                 my_bool flush)
+{
+  my_bool error= 0;
+  /* The caller already holds the pin, so the pin state is left as it is */
+  enum pagecache_page_pin pin= PAGECACHE_PIN_LEFT_PINNED;
+  DBUG_ENTER("pagecache_delete_by_link");
+  DBUG_PRINT("enter", ("fd: %d block 0x%lx  %s  %s",
+                       block->hash_link->file.file,
+                       (ulong) block,
+                       page_cache_page_lock_str[lock],
+                       page_cache_page_pin_str[pin]));
+  DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE ||
+              lock == PAGECACHE_LOCK_LEFT_WRITELOCKED);
+  DBUG_ASSERT(block->pins != 0); /* should be pinned */
+
+  if (pagecache->can_be_used)
+  {
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    /* Re-check under the lock: the cache may have been disabled meanwhile */
+    if (!pagecache->can_be_used)
+      goto end;
+
+    /*
+      This block should be pinned (i.e. has not zero request counter) =>
+      Such block can't be chosen for eviction.
+    */
+    DBUG_ASSERT((block->status &
+                 (PCBLOCK_IN_SWITCH | PCBLOCK_REASSIGNED)) == 0);
+
+    /*
+      Register this operation in the resize counter.
+      pagecache_delete_internal(), called below, always does the matching
+      dec_counter_for_resize_op(); without this increment the counter
+      would be decremented once too often.
+    */
+    inc_counter_for_resize_op(pagecache);
+
+    /*
+      make_lock_and_pin() can't fail here, because we are keeping pin on the
+      block and it can't be evicted (which is cause of lock fail and retry)
+    */
+    if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+      DBUG_ASSERT(0);
+
+    /*
+      get_present_hash_link() side effect emulation before call
+      pagecache_delete_internal()
+    */
+    block->hash_link->requests++;
+
+    error= pagecache_delete_internal(pagecache, block, block->hash_link,
+                                     flush);
+end:
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  }
+
+  DBUG_RETURN(error);
+}
+
+
+/**
+  @brief Returns "hits" for promotion
+
+  @param block          direct link to page (returned by read or write)
+
+  @return the block's current hits_left value
+*/
+
+uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block)
+{
+  /* Read without taking the page cache lock */
+  uint hits= block->hits_left;
+  return hits;
+}
+
+/*
+  @brief Adds "hits" to the page
+
+  @param block          direct link to page (returned by read or write)
+  @param level          number of "hits" which we add to the page
+
+  @note The block is expected to be pinned by the caller.
+*/
+
+void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block,
+                                 uint level)
+{
+  /* Only pinned blocks may be passed in */
+  DBUG_ASSERT(block->pins != 0);
+  /*
+    The counter is used only for statistics, so a lost update caused by a
+    concurrent increment is acceptable; therefore the page cache is not
+    locked here.
+  */
+  block->hits_left= block->hits_left + level;
+}
+
+/*
+  @brief Delete page from the buffer
+
+  @param pagecache      pointer to a page cache data structure
+  @param file           handler for the file for the block of data to be read
+  @param pageno         number of the block of data in the file
+  @param lock           lock change
+  @param flush          flush page if it is dirty
+
+  @retval 0 deleted or was not present at all
+  @retval 1 error
+
+  @note lock  can be only PAGECACHE_LOCK_LEFT_WRITELOCKED (page was
+  write locked before) or PAGECACHE_LOCK_WRITE (delete will write
+  lock page before delete)
+*/
+/*
+  Pin change that pagecache_delete() uses for a given lock change.
+  Indexed by enum pagecache_page_lock; the comment after each entry names
+  the lock value that the slot corresponds to.
+*/
+static enum pagecache_page_pin lock_to_pin_one_phase[8]=
+{
+  PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+  PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+  PAGECACHE_PIN_LEFT_PINNED   /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+  PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ*/,
+  PAGECACHE_PIN               /*PAGECACHE_LOCK_WRITE*/,
+  PAGECACHE_PIN_LEFT_UNPINNED /*PAGECACHE_LOCK_READ_UNLOCK*/,
+  PAGECACHE_UNPIN             /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+  PAGECACHE_UNPIN             /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+/*
+  Delete one page, looked up by file and page number, from the page cache.
+
+  Returns 0 when the page was deleted, was not present in the cache, is
+  being reassigned/switched by another thread, or the cache cannot be used.
+  Otherwise returns the result of pagecache_delete_internal() (1 on error).
+
+  Every inc_counter_for_resize_op() done here is balanced on each exit
+  path: either directly or by pagecache_delete_internal().
+*/
+my_bool pagecache_delete(PAGECACHE *pagecache,
+                         PAGECACHE_FILE *file,
+                         pgcache_page_no_t pageno,
+                         enum pagecache_page_lock lock,
+                         my_bool flush)
+{
+  my_bool error= 0;
+  enum pagecache_page_pin pin= lock_to_pin_one_phase[lock];
+  DBUG_ENTER("pagecache_delete");
+  DBUG_PRINT("enter", ("fd: %u  page: %lu  %s  %s",
+                       (uint) file->file, (ulong) pageno,
+                       page_cache_page_lock_str[lock],
+                       page_cache_page_pin_str[pin]));
+  DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE ||
+              lock == PAGECACHE_LOCK_LEFT_WRITELOCKED);
+  DBUG_ASSERT(pin == PAGECACHE_PIN ||
+              pin == PAGECACHE_PIN_LEFT_PINNED);
+restart:
+
+  DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+  if (pagecache->can_be_used)
+  {
+    /* Key cache is used */
+    reg1 PAGECACHE_BLOCK_LINK *block;
+    PAGECACHE_HASH_LINK **unused_start, *page_link;
+
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    /* Re-check under the lock: the cache may have been disabled meanwhile */
+    if (!pagecache->can_be_used)
+      goto end;
+
+    inc_counter_for_resize_op(pagecache);
+    page_link= get_present_hash_link(pagecache, file, pageno, &unused_start);
+    if (!page_link)
+    {
+      DBUG_PRINT("info", ("There is no such page in the cache"));
+      /* Nothing to delete: undo the resize registration done above */
+      dec_counter_for_resize_op(pagecache);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      DBUG_RETURN(0);
+    }
+    block= page_link->block;
+    DBUG_ASSERT(block != 0);
+    if (block->status & (PCBLOCK_REASSIGNED | PCBLOCK_IN_SWITCH))
+    {
+      DBUG_PRINT("info", ("Block 0x%0lx already is %s",
+                          (ulong) block,
+                          ((block->status & PCBLOCK_REASSIGNED) ?
+                           "reassigned" : "in switch")));
+      PCBLOCK_INFO(block);
+      /* Give back the hash link request taken by get_present_hash_link() */
+      page_link->requests--;
+      /* pagecache_delete_internal() is not reached: balance the counter */
+      dec_counter_for_resize_op(pagecache);
+      goto end;
+    }
+    /* See NOTE for pagecache_unlock about registering requests. */
+    if (pin == PAGECACHE_PIN)
+      reg_requests(pagecache, block, 1);
+    if (make_lock_and_pin(pagecache, block, lock, pin, FALSE))
+    {
+      /*
+        We failed to writelock the block, cache is unlocked, and last write
+        lock is released, we will try to get the block again.
+      */
+      if (pin == PAGECACHE_PIN)
+        unreg_request(pagecache, block, 1);
+      /*
+        The next pass increments the resize counter again, so release this
+        pass's registration before retrying.
+        NOTE(review): the hash link request taken by
+        get_present_hash_link() is not given back on this path - confirm
+        whether that is intended.
+      */
+      dec_counter_for_resize_op(pagecache);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      DBUG_PRINT("info", ("restarting..."));
+      goto restart;
+    }
+
+    /* we can't delete with opened direct link for write */
+    DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0);
+
+    /* Decrements the resize counter on all of its exit paths */
+    error= pagecache_delete_internal(pagecache, block, page_link, flush);
+end:
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  }
+
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Delete 'page_count' consecutive pages, starting at 'pageno', by calling
+  pagecache_delete() for each of them with the given lock and flush
+  arguments.
+
+  Returns 1 as soon as one pagecache_delete() call fails (the remaining
+  pages are not processed), 0 if all calls succeeded.
+*/
+my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+                               PAGECACHE_FILE *file,
+                               pgcache_page_no_t pageno,
+                               uint page_count,
+                               enum pagecache_page_lock lock,
+                               my_bool flush)
+{
+  /* First page number past the range to delete */
+  pgcache_page_no_t stop_page= pageno + page_count;
+  DBUG_ENTER("pagecache_delete_pages");
+  DBUG_ASSERT(page_count > 0);
+
+  for (;;)
+  {
+    /* Give up on the first page that cannot be deleted */
+    if (pagecache_delete(pagecache, file, pageno, lock, flush))
+      DBUG_RETURN(1);
+    if (++pageno == stop_page)
+      break;
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Writes a buffer into a cached file.
+
+  @param pagecache       pointer to a page cache data structure
+  @param file            handler for the file to write data to
+  @param pageno          number of the block of data in the file
+  @param level           determines the weight of the data
+  @param buff            buffer with the data
+  @param type            type of the page
+  @param lock            lock change
+  @param pin             pin page
+  @param write_mode      how to write page
+  @param page_link       link to the page if we pin it
+  @param first_REDO_LSN_for_page the lsn to set rec_lsn
+  @param offset          offset in the page
+  @param size            size of data
+  @param validator       read page validator
+  @param validator_data  the validator data
+
+  @retval 0 if a success.
+  @retval 1 Error.
+*/
+
+/*
+  Lock transitions used by pagecache_write_part(), indexed by its 'lock'
+  argument (enum pagecache_page_lock; the comment after each entry names
+  the index it stands for).
+  pagecache_write_part() reads the members need_lock_change, new_lock and
+  unlock_lock: new_lock is applied before the data is copied and, when
+  need_lock_change is set, unlock_lock is applied afterwards.
+  NOTE(review): the initializers are positional; presumably the member
+  order is {need_lock_change, new_lock, unlock_lock} - confirm against the
+  definition of struct rw_lock_change.
+*/
+static struct rw_lock_change write_lock_change_table[]=
+{
+  {1,
+   PAGECACHE_LOCK_WRITE,
+   PAGECACHE_LOCK_WRITE_UNLOCK} /*PAGECACHE_LOCK_LEFT_UNLOCKED*/,
+  {0, /*unsupported (we can't write having the block read locked) */
+   PAGECACHE_LOCK_LEFT_UNLOCKED,
+   PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_LEFT_READLOCKED*/,
+  {0, PAGECACHE_LOCK_LEFT_WRITELOCKED, 0} /*PAGECACHE_LOCK_LEFT_WRITELOCKED*/,
+  {1,
+   PAGECACHE_LOCK_WRITE,
+   PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_READ*/,
+  {0, PAGECACHE_LOCK_WRITE, 0} /*PAGECACHE_LOCK_WRITE*/,
+  {0, /*unsupported (we can't write having the block read locked) */
+   PAGECACHE_LOCK_LEFT_UNLOCKED,
+   PAGECACHE_LOCK_LEFT_UNLOCKED} /*PAGECACHE_LOCK_READ_UNLOCK*/,
+  {1,
+   PAGECACHE_LOCK_LEFT_WRITELOCKED,
+   PAGECACHE_LOCK_WRITE_UNLOCK } /*PAGECACHE_LOCK_WRITE_UNLOCK*/,
+  {1,
+   PAGECACHE_LOCK_LEFT_WRITELOCKED,
+   PAGECACHE_LOCK_WRITE_TO_READ} /*PAGECACHE_LOCK_WRITE_TO_READ*/
+};
+
+
+/*
+  Pin transitions used by pagecache_write_part() when a two-step lock
+  change is needed, indexed by its 'pin' argument (enum pagecache_page_pin;
+  the comment after each entry names the index it stands for).
+  pagecache_write_part() applies new_pin together with the first lock
+  change and unlock_pin together with the second one.
+  NOTE(review): the initializers are positional; presumably the member
+  order is {new_pin, unlock_pin} - confirm against the definition of
+  struct rw_pin_change.
+*/
+static struct rw_pin_change write_pin_change_table[]=
+{
+  {PAGECACHE_PIN_LEFT_PINNED,
+   PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN_LEFT_PINNED*/,
+  {PAGECACHE_PIN,
+   PAGECACHE_UNPIN} /*PAGECACHE_PIN_LEFT_UNPINNED*/,
+  {PAGECACHE_PIN,
+   PAGECACHE_PIN_LEFT_PINNED} /*PAGECACHE_PIN*/,
+  {PAGECACHE_PIN_LEFT_PINNED,
+   PAGECACHE_UNPIN} /*PAGECACHE_UNPIN*/
+};
+
+
+/**
+  Writes 'size' bytes from 'buff' at 'offset' into the cached page
+  'pageno' of 'file'.
+
+  @return 0 on success, 1 if the block is (or becomes) marked with
+  PCBLOCK_ERROR or if the direct file I/O of the uncached path fails.
+
+  @note 'buff', if not NULL, must be long-aligned.
+
+  @note If the block cannot be locked, the request is released and the
+  whole operation is retried from 'restart'. Each attempt registers itself
+  with inc_counter_for_resize_op(), so a failed attempt must unregister
+  itself before retrying.
+
+  @note *page_link is set to the block only when the page stays pinned
+  after the call; otherwise it is left as 0.
+*/
+
+my_bool pagecache_write_part(PAGECACHE *pagecache,
+                             PAGECACHE_FILE *file,
+                             pgcache_page_no_t pageno,
+                             uint level,
+                             uchar *buff,
+                             enum pagecache_page_type type,
+                             enum pagecache_page_lock lock,
+                             enum pagecache_page_pin pin,
+                             enum pagecache_write_mode write_mode,
+                             PAGECACHE_BLOCK_LINK **page_link,
+                             LSN first_REDO_LSN_for_page,
+                             uint offset, uint size)
+{
+  PAGECACHE_BLOCK_LINK *block= NULL;
+  PAGECACHE_BLOCK_LINK *fake_link;
+  my_bool error= 0;
+  int need_lock_change= write_lock_change_table[lock].need_lock_change;
+  my_bool reg_request;
+#ifndef DBUG_OFF
+  char llbuf[22];
+  DBUG_ENTER("pagecache_write_part");
+  DBUG_PRINT("enter", ("fd: %u  page: %s  level: %u  type: %s  lock: %s  "
+                       "pin: %s   mode: %s  offset: %u  size %u",
+                       (uint) file->file, ullstr(pageno, llbuf), level,
+                       page_cache_page_type_str[type],
+                       page_cache_page_lock_str[lock],
+                       page_cache_page_pin_str[pin],
+                       page_cache_page_write_mode_str[write_mode],
+                       offset, size));
+  DBUG_ASSERT(type != PAGECACHE_READ_UNKNOWN_PAGE);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_LEFT_READLOCKED);
+  DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK);
+  DBUG_ASSERT(offset + size <= pagecache->block_size);
+  DBUG_ASSERT(pageno < ((ULL(1)) << 40));
+#endif
+
+  /* Callers not interested in the link may pass NULL */
+  if (!page_link)
+    page_link= &fake_link;
+  *page_link= 0;
+
+restart:
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+  DBUG_EXECUTE("check_pagecache",
+               test_key_cache(pagecache, "start of key_cache_write", 1););
+#endif
+
+  if (pagecache->can_be_used)
+  {
+    /* Key cache is used */
+    int page_st;
+    my_bool need_page_ready_signal= FALSE;
+
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    /* Re-check under the lock: the cache may have been disabled meanwhile */
+    if (!pagecache->can_be_used)
+    {
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      goto no_key_cache;
+    }
+
+    inc_counter_for_resize_op(pagecache);
+    pagecache->global_cache_w_requests++;
+    /* See NOTE for pagecache_unlock about registering requests. */
+    reg_request= ((pin == PAGECACHE_PIN_LEFT_UNPINNED) ||
+                  (pin == PAGECACHE_PIN));
+    block= find_block(pagecache, file, pageno, level,
+                      TRUE,
+                      reg_request, &page_st);
+    if (!block)
+    {
+      DBUG_ASSERT(write_mode != PAGECACHE_WRITE_DONE);
+      /* It happens only for requests submitted during resize operation */
+      dec_counter_for_resize_op(pagecache);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      /* Write to the disk key cache is in resize at the moment*/
+      goto no_key_cache;
+    }
+    DBUG_PRINT("info", ("page status: %d", page_st));
+    /*
+      The old page content is needed when only a part of the page is
+      written, or when another thread is already reading the page.
+    */
+    if (!(block->status & PCBLOCK_ERROR) &&
+        ((page_st == PAGE_TO_BE_READ &&
+          (offset || size < pagecache->block_size)) ||
+         (page_st == PAGE_WAIT_TO_BE_READ)))
+    {
+      /* The requested page is to be read into the block buffer */
+      read_block(pagecache, block,
+                 (my_bool)(page_st == PAGE_TO_BE_READ));
+      DBUG_PRINT("info", ("read is done"));
+    }
+    else if (page_st == PAGE_TO_BE_READ)
+    {
+      /*
+        The read is skipped, so threads waiting for the page to become
+        ready have to be woken up after the data is copied (see below).
+      */
+      need_page_ready_signal= TRUE;
+    }
+
+    DBUG_ASSERT(block->type == PAGECACHE_EMPTY_PAGE ||
+                block->type == PAGECACHE_READ_UNKNOWN_PAGE ||
+                block->type == type ||
+                /* this is for when going to non-trans to trans */
+                (block->type == PAGECACHE_PLAIN_PAGE &&
+                 type == PAGECACHE_LSN_PAGE));
+    block->type= type;
+    /* we write to the page so it has no sense to keep the flag */
+    block->status&= ~PCBLOCK_DIRECT_W;
+    DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx",
+                        (ulong) block));
+
+    if (make_lock_and_pin(pagecache, block,
+                          write_lock_change_table[lock].new_lock,
+                          (need_lock_change ?
+                           write_pin_change_table[pin].new_pin :
+                           pin), FALSE))
+    {
+      /*
+        We failed to writelock the block, cache is unlocked, and last write
+        lock is released, we will try to get the block again.
+      */
+      if (reg_request)
+        unreg_request(pagecache, block, 1);
+      /*
+        The next attempt calls inc_counter_for_resize_op() again, so this
+        attempt's registration has to be released first; otherwise the
+        resize counter grows by one for every retry.
+      */
+      dec_counter_for_resize_op(pagecache);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      DBUG_PRINT("info", ("restarting..."));
+      goto restart;
+    }
+
+    if (write_mode == PAGECACHE_WRITE_DONE)
+    {
+      if (block->status & PCBLOCK_ERROR)
+      {
+        my_debug_put_break_here();
+        DBUG_PRINT("warning", ("Writing on page with error"));
+      }
+      else
+      {
+        /* Copy data from buff */
+        if (!(size & 511))
+          bmove512(block->buffer + offset, buff, size);
+        else
+          memcpy(block->buffer + offset, buff, size);
+        block->status= PCBLOCK_READ;
+        /*
+          The read_callback can change the page content (removing page
+          protection) so it have to be called
+        */
+        DBUG_PRINT("info", ("read_callback: 0x%lx  data: 0x%lx",
+                            (ulong) block->hash_link->file.read_callback,
+                            (ulong) block->hash_link->file.callback_data));
+        if ((*block->hash_link->file.read_callback)(block->buffer,
+                                                    block->hash_link->pageno,
+                                                    block->hash_link->
+                                                    file.callback_data))
+        {
+          DBUG_PRINT("error", ("read callback problem"));
+          block->status|= PCBLOCK_ERROR;
+          block->error= (int16) my_errno;
+          my_debug_put_break_here();
+        }
+        KEYCACHE_DBUG_PRINT("key_cache_insert",
+                            ("Page injection"));
+#ifdef THREAD
+        /* Signal that all pending requests for this now can be processed. */
+        if (block->wqueue[COND_FOR_REQUESTED].last_thread)
+          wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+      }
+    }
+    else
+    {
+      /* The page becomes dirty: put it on the changed list if needed */
+      if (! (block->status & PCBLOCK_CHANGED))
+          link_to_changed_list(pagecache, block);
+
+      if (!(size & 511))
+        bmove512(block->buffer + offset, buff, size);
+      else
+        memcpy(block->buffer + offset, buff, size);
+      block->status|= PCBLOCK_READ;
+      /* Page is correct again if we made a full write in it */
+      if (size == pagecache->block_size)
+        block->status&= ~PCBLOCK_ERROR;
+    }
+
+#ifdef THREAD
+    if (need_page_ready_signal &&
+        block->wqueue[COND_FOR_REQUESTED].last_thread)
+      wqueue_release_queue(&block->wqueue[COND_FOR_REQUESTED]);
+#endif
+
+    if (first_REDO_LSN_for_page)
+    {
+      /* single write action of the last write action */
+      DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK ||
+                  lock == PAGECACHE_LOCK_LEFT_UNLOCKED);
+      DBUG_ASSERT(pin == PAGECACHE_UNPIN ||
+                  pin == PAGECACHE_PIN_LEFT_UNPINNED);
+      pagecache_set_block_rec_lsn(block, first_REDO_LSN_for_page);
+    }
+
+    if (need_lock_change)
+    {
+      /*
+        We don't set rec_lsn of the block; this is ok as for the
+        Maria-block-record's pages, we always keep pages pinned here.
+      */
+      if (make_lock_and_pin(pagecache, block,
+                            write_lock_change_table[lock].unlock_lock,
+                            write_pin_change_table[pin].unlock_pin, FALSE))
+        DBUG_ASSERT(0);
+    }
+
+    /* Unregister the request */
+    DBUG_ASSERT(block->hash_link->requests > 0);
+    block->hash_link->requests--;
+    /* See NOTE for pagecache_unlock about registering requests. */
+    if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN)
+      unreg_request(pagecache, block, 1);
+    else
+      *page_link= block;
+
+    if (block->status & PCBLOCK_ERROR)
+    {
+      error= 1;
+      my_debug_put_break_here();
+    }
+
+    dec_counter_for_resize_op(pagecache);
+
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+
+    goto end;
+  }
+
+no_key_cache:
+  /*
+    We can't by pass the normal page cache operations because need
+    whole page for calling callbacks & so on.
+    This branch should not be used for now (but it is fixed as it
+    should be just to avoid confusing)
+  */
+  DBUG_ASSERT(0);
+  /* Key cache is not used */
+  if (write_mode == PAGECACHE_WRITE_DELAY)
+  {
+    /* We can't use mutex here as the key cache may not be initialized */
+    pagecache->global_cache_w_requests++;
+    pagecache->global_cache_write++;
+    if (offset != 0 || size != pagecache->block_size)
+    {
+      /* Partial write: read the page, patch it, then write it back */
+      uchar *page_buffer= (uchar *) alloca(pagecache->block_size);
+
+      pagecache->global_cache_read++;
+      if ((error= (pagecache_fread(pagecache, file,
+                                   page_buffer,
+                                   pageno,
+                                   pagecache->readwrite_flags) != 0)))
+        goto end;
+      if ((file->read_callback)(page_buffer, pageno, file->callback_data))
+      {
+        DBUG_PRINT("error", ("read callback problem"));
+        error= 1;
+        goto end;
+      }
+      memcpy((char *)page_buffer + offset, buff, size);
+      buff= page_buffer;
+    }
+    if (pagecache_fwrite(pagecache, file, buff, pageno, type,
+                         pagecache->readwrite_flags))
+      error= 1;
+  }
+
+end:
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+  DBUG_EXECUTE("exec",
+               test_key_cache(pagecache, "end of key_cache_write", 1););
+#endif
+  if (block)
+    PCBLOCK_INFO(block);
+  else
+    DBUG_PRINT("info", ("No block"));
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Free block: remove reference to it from hash table,
+  remove it from the chain file of dirty/clean blocks
+  and add it to the free list.
+
+  The block must have no write locks, read locks, queued read locks or
+  pins left (asserted below). Threads waiting on the block's
+  COND_FOR_SAVED queue are released at the end.
+*/
+
+static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
+{
+  KEYCACHE_THREAD_TRACE("free block");
+  KEYCACHE_DBUG_PRINT("free_block",
+                      ("block: %u  hash_link 0x%lx",
+                       PCBLOCK_NUMBER(pagecache, block),
+                       (long) block->hash_link));
+  if (block->hash_link)
+  {
+    /*
+      While waiting for readers to finish, new readers might request the
+      block. But since we set block->status|= PCBLOCK_REASSIGNED, they
+      will wait on block->wqueue[COND_FOR_SAVED]. They must be signalled
+      later.
+    */
+    block->status|= PCBLOCK_REASSIGNED;
+    wait_for_readers(pagecache, block);
+    unlink_hash(pagecache, block->hash_link);
+  }
+
+  /* Take the block out of the chain of dirty/clean blocks */
+  unlink_changed(block);
+  DBUG_ASSERT(block->wlocks == 0);
+  DBUG_ASSERT(block->rlocks == 0);
+  DBUG_ASSERT(block->rlocks_queue == 0);
+  DBUG_ASSERT(block->pins == 0);
+  /* Clears all status flags, including PCBLOCK_REASSIGNED set above */
+  block->status= 0;
+#ifndef DBUG_OFF
+  /* The type is reset only in debug builds */
+  block->type= PAGECACHE_EMPTY_PAGE;
+#endif
+  block->rec_lsn= LSN_MAX;
+  KEYCACHE_THREAD_TRACE("free block");
+  KEYCACHE_DBUG_PRINT("free_block",
+                      ("block is freed"));
+  unreg_request(pagecache, block, 0);
+  block->hash_link= NULL;
+
+  /* Remove the free block from the LRU ring. */
+  unlink_block(pagecache, block);
+  /* Keep the count of warm blocks in step before marking the block cold */
+  if (block->temperature == PCBLOCK_WARM)
+    pagecache->warm_blocks--;
+  block->temperature= PCBLOCK_COLD;
+  /* Insert the free block in the free list. */
+  block->next_used= pagecache->free_block_list;
+  pagecache->free_block_list= block;
+  /* Keep track of the number of currently unused blocks. */
+  pagecache->blocks_unused++;
+
+#ifdef THREAD
+  /* All pending requests for this page must be resubmitted. */
+  if (block->wqueue[COND_FOR_SAVED].last_thread)
+    wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+}
+
+
+/*
+  Comparison function for qsort(): orders pointers to blocks by the page
+  number stored in each block's hash link, ascending.
+  Returns -1, 0 or 1.
+*/
+static int cmp_sec_link(PAGECACHE_BLOCK_LINK **a, PAGECACHE_BLOCK_LINK **b)
+{
+  PAGECACHE_HASH_LINK *left= (*a)->hash_link;
+  PAGECACHE_HASH_LINK *right= (*b)->hash_link;
+
+  if (left->pageno < right->pageno)
+    return -1;
+  if (left->pageno > right->pageno)
+    return 1;
+  return 0;
+}
+
+
+/**
+  @brief Flush a portion of changed blocks to disk, free used blocks
+  if requested
+
+  @param pagecache       This page cache reference.
+  @param file            File which should be flushed
+  @param cache           Beginning of array of the block.
+  @param end             Reference to the block after last in the array.
+  @param type            Type of the flush.
+  @param first_errno     Where to store first errno of the flush.
+
+
+  @return Operation status
+  @retval PCFLUSH_OK OK
+  @retval PCFLUSH_ERROR There was errors during the flush process.
+  @retval PCFLUSH_PINNED Pinned blocks was met and skipped.
+  @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
+
+  @note Expects pagecache->cache_lock to be held by the caller. The lock is
+  released while the array is sorted and around each page write, and is
+  held again when the function returns.
+
+  @note The array 'cache' is sorted in place by page number.
+
+  @note *first_errno is reset to 0 on entry. A skipped (pinned or write
+  locked) block sets it to HA_ERR_INTERNAL_ERROR, a failed write sets it to
+  my_errno (or -1 if my_errno is 0); only the first such value is kept.
+*/
+
+static int flush_cached_blocks(PAGECACHE *pagecache,
+                               PAGECACHE_FILE *file,
+                               PAGECACHE_BLOCK_LINK **cache,
+                               PAGECACHE_BLOCK_LINK **end,
+                               enum flush_type type,
+                               int *first_errno)
+{
+  int rc= PCFLUSH_OK;
+  my_bool error;
+  uint count= (uint) (end-cache);
+  DBUG_ENTER("flush_cached_blocks");
+  *first_errno= 0;
+
+  /* Don't lock the cache during the flush */
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  /*
+     As all blocks referred in 'cache' are marked by PCBLOCK_IN_FLUSH
+     we are guaranteed that no thread will change them
+  */
+  /* Order the blocks by page number (see cmp_sec_link()) */
+  qsort((uchar*) cache, count, sizeof(*cache), (qsort_cmp) cmp_sec_link);
+
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+  for (; cache != end; cache++)
+  {
+    PAGECACHE_BLOCK_LINK *block= *cache;
+
+    /*
+      In the case of non_transactional tables we want to flush also
+      block pinned with reads. This is because we may have other
+      threads reading the block during flush, as non transactional
+      tables can have many readers while the one writer is doing the
+      flush.
+      We don't want to do flush pinned blocks during checkpoint.
+      We detect the checkpoint case by checking if type is LAZY.
+    */
+    if ((type == FLUSH_KEEP_LAZY && block->pins) || block->wlocks)
+    {
+      KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+                          ("block: %u (0x%lx)  pinned",
+                           PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+      DBUG_PRINT("info", ("block: %u (0x%lx)  pinned",
+                          PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+      PCBLOCK_INFO(block);
+      /* undo the mark put by flush_pagecache_blocks_int(): */
+      block->status&= ~PCBLOCK_IN_FLUSH;
+      rc|= PCFLUSH_PINNED;
+      DBUG_PRINT("warning", ("Page pinned"));
+      unreg_request(pagecache, block, 1);
+      /* A skipped block is reported to the caller as an internal error */
+      if (!*first_errno)
+        *first_errno= HA_ERR_INTERNAL_ERROR;
+      continue;
+    }
+    /* Read lock and pin the block for the duration of the write */
+    if (make_lock_and_pin(pagecache, block,
+                          PAGECACHE_LOCK_READ, PAGECACHE_PIN, FALSE))
+      DBUG_ASSERT(0);
+
+    KEYCACHE_DBUG_PRINT("flush_cached_blocks",
+                        ("block: %u (0x%lx)  to be flushed",
+                         PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+    DBUG_PRINT("info", ("block: %u (0x%lx)  to be flushed",
+                        PCBLOCK_NUMBER(pagecache, block), (ulong)block));
+    PCBLOCK_INFO(block);
+    /* The cache lock is not held while the page is written */
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+    DBUG_PRINT("info", ("block: %u (0x%lx)  pins: %u",
+                        PCBLOCK_NUMBER(pagecache, block), (ulong)block,
+                        block->pins));
+    /**
+       @todo IO If page is contiguous with next page to flush, group flushes
+       in one single my_pwrite().
+    */
+    /**
+      It is important to use block->hash_link->file below and not 'file', as
+      the first one is right and the second may have different out-of-date
+      content (see StaleFilePointersInFlush in ma_checkpoint.c).
+      @todo change argument of functions to be File.
+    */
+    error= pagecache_fwrite(pagecache, &block->hash_link->file,
+                            block->buffer,
+                            block->hash_link->pageno,
+                            block->type,
+                            pagecache->readwrite_flags);
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+    /* Release the read lock and pin taken before the write */
+    if (make_lock_and_pin(pagecache, block,
+                          PAGECACHE_LOCK_READ_UNLOCK,
+                          PAGECACHE_UNPIN, FALSE))
+      DBUG_ASSERT(0);
+
+    /* The write is counted whether or not it succeeded */
+    pagecache->global_cache_write++;
+    if (error)
+    {
+      block->status|= PCBLOCK_ERROR;
+      block->error=   (int16) my_errno;
+      my_debug_put_break_here();
+      if (!*first_errno)
+        *first_errno= my_errno ? my_errno : -1;
+      rc|= PCFLUSH_ERROR;
+    }
+#ifdef THREAD
+    /*
+      Let to proceed for possible waiting requests to write to the block page.
+      It might happen only during an operation to resize the key cache.
+    */
+    if (block->wqueue[COND_FOR_SAVED].last_thread)
+      wqueue_release_queue(&block->wqueue[COND_FOR_SAVED]);
+#endif
+    /* type will never be FLUSH_IGNORE_CHANGED here */
+    if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
+           type == FLUSH_FORCE_WRITE))
+    {
+      /* The flush type asks for the block to be released as well */
+      pagecache->blocks_changed--;
+      pagecache->global_blocks_changed--;
+      free_block(pagecache, block);
+    }
+    else
+    {
+      /* Keep the block in the cache: clear the flush mark and relink it */
+      block->status&= ~PCBLOCK_IN_FLUSH;
+      link_to_file_list(pagecache, block, file, 1);
+      unreg_request(pagecache, block, 1);
+    }
+  }
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief flush all blocks for a file to disk but don't do any mutex locks
+
+   @param  pagecache       pointer to a pagecache data structure
+   @param  file            handler for the file to flush to
+   @param  type            type of the flush
+   @param  filter          optional function which tells what blocks to flush;
+                           can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY
+                           or FLUSH_FORCE_WRITE.
+   @param  filter_arg      an argument to pass to 'filter'. Information about
+                           the block will be passed too.
+
+   @note
+     Flushes all blocks having the same OS file descriptor as 'file->file', so
+     can flush blocks having '*block->hash_link->file' != '*file'.
+
+   @note
+     This function doesn't do any mutex locks because it needs to be called
+     both from flush_pagecache_blocks and flush_all_key_blocks (the latter one
+     does the mutex lock in the resize_pagecache() function).
+
+   @note
+     This function can cause problems if two threads call it
+     concurrently on the same file (look for "PageCacheFlushConcurrencyBugs"
+     in ma_checkpoint.c); to avoid them, it has internal logic to serialize in
+     this situation.
+
+   @return Operation status
+   @retval PCFLUSH_OK OK
+   @retval PCFLUSH_ERROR There were errors during the flush process.
+   @retval PCFLUSH_PINNED Pinned blocks were met and skipped.
+   @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
+*/
+
+static int flush_pagecache_blocks_int(PAGECACHE *pagecache,
+                                      PAGECACHE_FILE *file,
+                                      enum flush_type type,
+                                      PAGECACHE_FLUSH_FILTER filter,
+                                      void *filter_arg)
+{
+  PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache;
+  int last_errno= 0;
+  int rc= PCFLUSH_OK;
+  DBUG_ENTER("flush_pagecache_blocks_int");
+  DBUG_PRINT("enter",
+             ("fd: %d  blocks_used: %lu  blocks_changed: %lu  type: %d",
+              file->file, pagecache->blocks_used, pagecache->blocks_changed,
+              type));
+
+#if !defined(DBUG_OFF) && defined(EXTRA_DEBUG)
+    DBUG_EXECUTE("check_pagecache",
+                 test_key_cache(pagecache,
+                                "start of flush_pagecache_blocks", 0););
+#endif
+
+  cache= cache_buff;
+  if (pagecache->disk_blocks > 0 &&
+      (!my_disable_flush_pagecache_blocks ||
+       (type != FLUSH_KEEP && type != FLUSH_KEEP_LAZY)))
+  {
+    /*
+      Page cache exists. If my_disable_flush_pagecache_blocks is true it
+      disables only the FLUSH_KEEP[_LAZY] flushes; other flushes still
+      need to be allowed: FLUSH_RELEASE has to free blocks, and
+      FLUSH_FORCE_WRITE is to overrule my_disable_flush_pagecache_blocks.
+    */
+    int error= 0;
+    uint count= 0;
+    PAGECACHE_BLOCK_LINK **pos, **end;
+    PAGECACHE_BLOCK_LINK *first_in_switch= NULL;
+    PAGECACHE_BLOCK_LINK *block, *next;
+#if defined(PAGECACHE_DEBUG)
+    uint cnt= 0;
+#endif
+
+#ifdef THREAD
+    struct st_file_in_flush us_flusher, *other_flusher;
+    us_flusher.file= file->file;
+    us_flusher.flush_queue.last_thread= NULL;
+    us_flusher.first_in_switch= FALSE;
+    while ((other_flusher= (struct st_file_in_flush *)
+            hash_search(&pagecache->files_in_flush, (uchar *)&file->file,
+                        sizeof(file->file))))
+    {
+      /*
+        File is in flush already: wait, unless FLUSH_KEEP_LAZY, in which
+        case we return 0 at once without flushing anything. "Flusher"
+        means "who can mark PCBLOCK_IN_FLUSH", i.e. caller of
+        flush_pagecache_blocks_int().
+        We queue ourselves on the other flusher's flush_queue and sleep on
+        our own condition variable (the wait releases cache_lock) until
+        thread->next is NULL - presumably cleared when the queue is
+        released - and then search the hash again.
+      */
+      struct st_my_thread_var *thread;
+      if (type == FLUSH_KEEP_LAZY)
+      {
+        DBUG_PRINT("info",("FLUSH_KEEP_LAZY skips"));
+        DBUG_RETURN(0);
+      }
+      thread= my_thread_var;
+      wqueue_add_to_queue(&other_flusher->flush_queue, thread);
+      do
+      {
+        KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait1",
+                            ("suspend thread %ld", thread->id));
+        pagecache_pthread_cond_wait(&thread->suspend,
+                                    &pagecache->cache_lock);
+      }
+      while (thread->next);
+    }
+    /* we are the only flusher of this file now; register ourselves */
+    while (my_hash_insert(&pagecache->files_in_flush, (uchar *)&us_flusher))
+    {
+      /*
+        Out of memory, wait for flushers to empty the hash and retry; should
+        rarely happen. Other threads are flushing the file; when done, they
+        are going to remove themselves from the hash, and thus memory will
+        appear again. However, this memory may be stolen by yet another thread
+        (for a purpose unrelated to page cache), before we retry
+        hash_insert(). So the loop may run for long. Only if the thread was
+        killed do we abort the loop, returning 1 (error) which can cause the
+        table to be marked as corrupted (cf maria_chk_size(), maria_close())
+        and thus require a table check.
+        Debug builds stop at the assertion below; otherwise cache_lock is
+        released, we sleep 10 seconds, re-take the lock and retry.
+      */
+      DBUG_ASSERT(0);
+      pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+      if (my_thread_var->abort)
+        DBUG_RETURN(1);		/* End if aborted by user */
+      sleep(10);
+      pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    }
+#endif
+
+    if (type != FLUSH_IGNORE_CHANGED)
+    {
+      /*
+        Count how many changed blocks of this file we have to cache to be
+        able to flush all dirty pages with minimum seek moves.
+      */
+      for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+           block;
+           block= block->next_changed)
+      {
+        if (block->hash_link->file.file == file->file)
+        {
+          count++;
+          KEYCACHE_DBUG_ASSERT(count<= pagecache->blocks_used);
+        }
+      }
+      /*
+        Allocate a new buffer only if it's bigger than the one we have.
+        If the allocation fails, fall back to the on-stack cache_buff and
+        flush in batches of FLUSH_CACHE blocks.
+      */
+      if (count > FLUSH_CACHE &&
+          !(cache=
+            (PAGECACHE_BLOCK_LINK**)
+            my_malloc(sizeof(PAGECACHE_BLOCK_LINK*)*count, MYF(0))))
+      {
+        cache= cache_buff;
+        count= FLUSH_CACHE;
+      }
+    }
+
+    /* Collect the blocks to be flushed into the 'cache' buffer */
+restart:
+    end= (pos= cache)+count;
+    for (block= pagecache->changed_blocks[FILE_HASH(*file)] ;
+         block;
+         block= next)
+    {
+#if defined(PAGECACHE_DEBUG)
+      cnt++;
+      KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+      next= block->next_changed;
+      if (block->hash_link->file.file != file->file)
+        continue;
+      if (filter != NULL)
+      {
+        int filter_res= (*filter)(block->type, block->hash_link->pageno,
+                                  block->rec_lsn, filter_arg);
+        DBUG_PRINT("info",("filter returned %d", filter_res));
+        if (filter_res == FLUSH_FILTER_SKIP_TRY_NEXT)
+          continue;
+        if (filter_res == FLUSH_FILTER_SKIP_ALL)
+          break;
+        DBUG_ASSERT(filter_res == FLUSH_FILTER_OK);
+      }
+      {
+        /*
+           Mark the block with PCBLOCK_IN_FLUSH in order not to let
+           other threads use it for new pages and interfere with
+           our sequence of flushing dirty file pages
+        */
+        block->status|= PCBLOCK_IN_FLUSH;
+
+        if (! (block->status & PCBLOCK_IN_SWITCH))
+        {
+	  /*
+	    We care only for the blocks for which flushing was not
+	    initiated by other threads as a result of page swapping
+          */
+          reg_requests(pagecache, block, 1);
+          if (type != FLUSH_IGNORE_CHANGED)
+          {
+	    /* It's not a temporary file */
+            if (pos == end)
+            {
+	      /*
+		The buffer is full. This happens only if there was not
+		enough memory for the big block: flush what we have so far
+              */
+              if ((rc|= flush_cached_blocks(pagecache, file, cache,
+                                            end, type, &error)) &
+                  (PCFLUSH_ERROR | PCFLUSH_PINNED))
+                last_errno=error;
+              DBUG_PRINT("info", ("restarting..."));
+              /*
+		Restart the scan as some other thread might have changed
+		the changed blocks chain: the blocks that were in switch
+		state before the flush started have to be excluded
+              */
+              goto restart;
+            }
+            *pos++= block;
+          }
+          else
+          {
+            /* It's a temporary file: drop the block without writing it */
+            pagecache->blocks_changed--;
+	    pagecache->global_blocks_changed--;
+            free_block(pagecache, block);
+          }
+        }
+        else if (type != FLUSH_KEEP_LAZY)
+        {
+          /*
+            Link the block into a list of blocks 'in switch', and then we will
+            wait for this list to be empty, which means they have been flushed
+          */
+          unlink_changed(block);
+          link_changed(block, &first_in_switch);
+          us_flusher.first_in_switch= TRUE;
+        }
+      }
+    }
+    if (pos != cache)
+    {
+      if ((rc|= flush_cached_blocks(pagecache, file, cache, pos, type,
+                                    &error)) &
+          (PCFLUSH_ERROR | PCFLUSH_PINNED))
+        last_errno= error;
+    }
+    /*
+      Wait until list of blocks in switch is empty.
+      NOTE(review): nothing in this loop removes blocks from
+      first_in_switch; presumably the thread doing the switch unlinks the
+      block before waking the COND_FOR_SAVED queue - confirm.
+    */
+    while (first_in_switch)
+    {
+#if defined(PAGECACHE_DEBUG)
+      cnt= 0;
+#endif
+      block= first_in_switch;
+      {
+#ifdef THREAD
+        struct st_my_thread_var *thread= my_thread_var;
+        wqueue_add_to_queue(&block->wqueue[COND_FOR_SAVED], thread);
+        do
+        {
+          KEYCACHE_DBUG_PRINT("flush_pagecache_blocks_int: wait2",
+                              ("suspend thread %ld", thread->id));
+          pagecache_pthread_cond_wait(&thread->suspend,
+                                     &pagecache->cache_lock);
+        }
+        while (thread->next);
+#else
+        KEYCACHE_DBUG_ASSERT(0);
+        /* No parallel requests in single-threaded case */
+#endif
+      }
+#if defined(PAGECACHE_DEBUG)
+      cnt++;
+      KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+    }
+    us_flusher.first_in_switch= FALSE;
+    /*
+      The following happens very seldom: for flush types that do not keep
+      the blocks, also free the blocks of this file found in file_blocks
+      that are not changed (or all of them if FLUSH_IGNORE_CHANGED)
+    */
+    if (! (type == FLUSH_KEEP || type == FLUSH_KEEP_LAZY ||
+           type == FLUSH_FORCE_WRITE))
+    {
+      /*
+        this code would free all blocks while the filter may have handled
+        only a few; that is not possible.
+      */
+      DBUG_ASSERT(filter == NULL);
+#if defined(PAGECACHE_DEBUG)
+      cnt=0;
+#endif
+      for (block= pagecache->file_blocks[FILE_HASH(*file)] ;
+           block;
+           block= next)
+      {
+#if defined(PAGECACHE_DEBUG)
+        cnt++;
+        KEYCACHE_DBUG_ASSERT(cnt <= pagecache->blocks_used);
+#endif
+        next= block->next_changed;
+        if (block->hash_link->file.file == file->file &&
+            (! (block->status & PCBLOCK_CHANGED)
+             || type == FLUSH_IGNORE_CHANGED))
+        {
+          reg_requests(pagecache, block, 1);
+          free_block(pagecache, block);
+        }
+      }
+    }
+#ifdef THREAD
+    /*
+      We are done: unregister from files_in_flush and wake up others
+      waiting to flush this file
+    */
+    hash_delete(&pagecache->files_in_flush, (uchar *)&us_flusher);
+    if (us_flusher.flush_queue.last_thread)
+      wqueue_release_queue(&us_flusher.flush_queue);
+#endif
+  }
+
+#ifndef DBUG_OFF
+  DBUG_EXECUTE("check_pagecache",
+               test_key_cache(pagecache, "end of flush_pagecache_blocks", 0););
+#endif
+  if (cache != cache_buff)
+    my_free(cache, MYF(0));
+  if (rc != 0)
+  {
+    if (last_errno)
+      my_errno= last_errno;                /* Return first error */
+    DBUG_PRINT("error", ("Got error: %d", my_errno));
+  }
+  DBUG_RETURN(rc);
+}
+
+
+/**
+   @brief Flush the blocks of a file to disk, taking the page cache lock
+
+   Locking wrapper around flush_pagecache_blocks_int(): the call is made
+   with pagecache->cache_lock held and bracketed by
+   inc_counter_for_resize_op() / dec_counter_for_resize_op().
+
+   @param  pagecache       pointer to a pagecache data structure
+   @param  file            handler for the file to flush to
+   @param  type            type of the flush
+   @param  filter          optional function which tells what blocks to flush;
+                           can be non-NULL only if FLUSH_KEEP, FLUSH_KEEP_LAZY
+                           or FLUSH_FORCE_WRITE.
+   @param  filter_arg      an argument to pass to 'filter'. Information about
+                           the block will be passed too.
+
+   @return Operation status; 0 without taking the lock if the page cache
+           has no disk blocks
+   @retval PCFLUSH_OK OK
+   @retval PCFLUSH_ERROR There were errors during the flush process.
+   @retval PCFLUSH_PINNED Pinned blocks were met and skipped.
+   @retval PCFLUSH_PINNED_AND_ERROR PCFLUSH_ERROR and PCFLUSH_PINNED.
+*/
+
+int flush_pagecache_blocks_with_filter(PAGECACHE *pagecache,
+                                       PAGECACHE_FILE *file,
+                                       enum flush_type type,
+                                       PAGECACHE_FLUSH_FILTER filter,
+                                       void *filter_arg)
+{
+  int flush_status= 0;
+  DBUG_ENTER("flush_pagecache_blocks_with_filter");
+  DBUG_PRINT("enter", ("pagecache: 0x%lx", (long) pagecache));
+
+  if (pagecache->disk_blocks > 0)
+  {
+    pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+    inc_counter_for_resize_op(pagecache);
+    flush_status= flush_pagecache_blocks_int(pagecache, file, type,
+                                             filter, filter_arg);
+    dec_counter_for_resize_op(pagecache);
+    pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  }
+  DBUG_RETURN(flush_status);
+}
+
+
+/*
+  Zero the statistics counters of a page cache.
+
+  SYNOPSIS
+    reset_pagecache_counters()
+    name       the name of the cache (only used in debug trace messages)
+    pagecache  pointer to the pagecache to be reset
+
+  DESCRIPTION
+    Clears the global_* request/read/write/changed counters of the given
+    cache. A cache that is not inited is left untouched.
+
+  RETURN
+    0 always (the operation cannot fail)
+*/
+
+int reset_pagecache_counters(const char *name __attribute__((unused)),
+                             PAGECACHE *pagecache)
+{
+  DBUG_ENTER("reset_pagecache_counters");
+  if (pagecache->inited)
+  {
+    DBUG_PRINT("info", ("Resetting counters for key cache %s.", name));
+
+    pagecache->global_cache_write= 0;      /* Key_writes */
+    pagecache->global_cache_w_requests= 0; /* Key_write_requests */
+    pagecache->global_cache_read= 0;       /* Key_reads */
+    pagecache->global_cache_r_requests= 0; /* Key_read_requests */
+    pagecache->global_blocks_changed= 0;   /* Key_blocks_not_flushed */
+  }
+  else
+  {
+    DBUG_PRINT("info", ("Key cache %s not initialized.", name));
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Allocates a buffer and stores in it some info about all dirty pages
+
+   Does the allocation because the caller cannot know the size itself.
+   Memory freeing is to be done by the caller (if the "str" member of the
+   LEX_STRING is not NULL).
+   Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
+   are not interesting for a checkpoint record.
+   The caller has the intention of doing checkpoints.
+
+   Layout of the buffer: the number of stored pages (8 bytes), then for each
+   stored page: table id (2 bytes), a flag (1 byte) which is 1 if the page
+   belongs to the share's kfile and 0 otherwise, the page number
+   (PAGE_STORE_SIZE bytes) and the page's rec_lsn (LSN_STORE_SIZE bytes).
+
+   @param       pagecache   pointer to the page cache
+   @param[out]  str         pointer to where the allocated buffer, and
+                            its size, will be put; str->str must be NULL
+                            on entry
+   @param[out]  min_rec_lsn pointer to where the minimum rec_lsn of all
+                            relevant dirty pages will be put; LSN_MAX if no
+                            stored page has a rec_lsn other than LSN_MAX
+                            (this includes the error case)
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error (the buffer could not be allocated)
+*/
+
+my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+                                                  LEX_STRING *str,
+                                                  LSN *min_rec_lsn)
+{
+  my_bool error= 0;
+  ulong stored_list_size= 0;
+  uint file_hash;
+  char *ptr;
+  LSN minimum_rec_lsn= LSN_MAX;
+  DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");
+
+  DBUG_ASSERT(NULL == str->str);
+  /*
+    We lock the entire cache but will be quick, just reading/writing a few MBs
+    of memory at most.
+  */
+  pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+#ifdef THREAD
+  for (;;)
+  {
+    /* Look for a flusher which has moved blocks to its 'in switch' list */
+    struct st_file_in_flush *other_flusher;
+    for (file_hash= 0;
+         (other_flusher= (struct st_file_in_flush *)
+          hash_element(&pagecache->files_in_flush, file_hash)) != NULL &&
+           !other_flusher->first_in_switch;
+         file_hash++)
+    {}
+    if (other_flusher == NULL)
+      break;
+    /*
+      other_flusher.first_in_switch is true: some thread is flushing a file
+      and has removed dirty blocks from changed_blocks[] while they were still
+      dirty (they were being evicted (=>flushed) by yet another thread, which
+      may not have flushed the block yet so it may still be dirty).
+      If Checkpoint proceeds now, it will not see the page. If there is a
+      crash right after writing the checkpoint record, before the page is
+      flushed, at recovery the page will be wrongly ignored because it won't
+      be in the dirty pages list in the checkpoint record. So wait, then
+      scan the hash of flushers again from the start.
+    */
+    {
+      struct st_my_thread_var *thread= my_thread_var;
+      wqueue_add_to_queue(&other_flusher->flush_queue, thread);
+      do
+      {
+        KEYCACHE_DBUG_PRINT("pagecache_collect_changed_blocks_with_lsn: wait",
+                            ("suspend thread %ld", thread->id));
+        pagecache_pthread_cond_wait(&thread->suspend,
+                                    &pagecache->cache_lock);
+      }
+      while (thread->next);
+    }
+  }
+#endif
+
+  /* Count how many dirty pages are interesting */
+  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+  {
+    PAGECACHE_BLOCK_LINK *block;
+    for (block= pagecache->changed_blocks[file_hash] ;
+         block;
+         block= block->next_changed)
+    {
+      /*
+        Q: is there something subtle with block->hash_link: can it be NULL?
+        does it have to be == hash_link->block... ?
+      */
+      DBUG_ASSERT(block->hash_link != NULL);
+      DBUG_ASSERT(block->status & PCBLOCK_CHANGED);
+      /*
+        Note that we don't store bitmap pages, or pages from non-transactional
+        (like temporary) tables. Don't checkpoint during Recovery which uses
+        PAGECACHE_PLAIN_PAGE.
+      */
+      if (block->type != PAGECACHE_LSN_PAGE)
+        continue; /* no need to store it */
+      stored_list_size++;
+    }
+  }
+
+  compile_time_assert(sizeof(pagecache->blocks) <= 8);
+  /*
+    The per-page size must use the same constants as the store loop below
+    (which advances by PAGE_STORE_SIZE and LSN_STORE_SIZE), otherwise the
+    buffer and what is written into it could get out of step.
+  */
+  str->length= 8 + /* number of dirty pages */
+    (2 + /* table id */
+     1 + /* data or index file */
+     PAGE_STORE_SIZE + /* pageno */
+     LSN_STORE_SIZE /* rec_lsn */
+     ) * stored_list_size;
+  if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
+  {
+    error= 1;
+    goto end;
+  }
+  ptr= str->str;
+  int8store(ptr, (ulonglong)stored_list_size);
+  ptr+= 8;
+  DBUG_PRINT("info", ("found %lu dirty pages", stored_list_size));
+  if (stored_list_size == 0)
+    goto end;
+  /* Second pass (still under cache_lock): store one entry per LSN page */
+  for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+  {
+    PAGECACHE_BLOCK_LINK *block;
+    for (block= pagecache->changed_blocks[file_hash] ;
+         block;
+         block= block->next_changed)
+    {
+      uint16 table_id;
+      MARIA_SHARE *share;
+      if (block->type != PAGECACHE_LSN_PAGE)
+        continue; /* no need to store it in the checkpoint record */
+      share= (MARIA_SHARE *)(block->hash_link->file.callback_data);
+      table_id= share->id;
+      int2store(ptr, table_id);
+      ptr+= 2;
+      /* 1 if the page is in the share's kfile, 0 otherwise */
+      ptr[0]= (share->kfile.file == block->hash_link->file.file);
+      ptr++;
+      DBUG_ASSERT(block->hash_link->pageno < ((ULL(1)) << 40));
+      page_store(ptr, block->hash_link->pageno);
+      ptr+= PAGE_STORE_SIZE;
+      lsn_store(ptr, block->rec_lsn);
+      ptr+= LSN_STORE_SIZE;
+      if (block->rec_lsn != LSN_MAX)
+      {
+        DBUG_ASSERT(LSN_VALID(block->rec_lsn));
+        if (cmp_translog_addr(block->rec_lsn, minimum_rec_lsn) < 0)
+          minimum_rec_lsn= block->rec_lsn;
+      } /* otherwise, some trn->rec_lsn should hold the correct info */
+    }
+  }
+end:
+  pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+  *min_rec_lsn= minimum_rec_lsn;
+  DBUG_RETURN(error);
+}
+
+
+#ifndef DBUG_OFF
+
+/**
+  Debug check that a file has no dirty pages.
+
+  Walks the changed_blocks chain selected by FILE_HASH(*file); every block
+  found there whose file descriptor equals file->file is traced and trips
+  an assertion.
+*/
+
+void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file)
+{
+  PAGECACHE_BLOCK_LINK *block= pagecache->changed_blocks[FILE_HASH(*file)];
+  File fd= file->file;
+
+  while (block != NULL)
+  {
+    if (block->hash_link->file.file == fd)
+    {
+      DBUG_PRINT("info", ("pagecache_file_not_in error"));
+      PCBLOCK_INFO(block);
+      DBUG_ASSERT(0);
+    }
+    block= block->next_changed;
+  }
+}
+
+
+/*
+  Test if disk-cache is ok.
+
+  Not implemented yet: the body is empty and all three parameters are
+  ignored, so calling this function has no effect.
+*/
+static void test_key_cache(PAGECACHE *pagecache __attribute__((unused)),
+                           const char *where __attribute__((unused)),
+                           my_bool lock __attribute__((unused)))
+{
+  /* TODO: implement the consistency check */
+}
+#endif
+
+/* Accessor: returns the 'buffer' member of a page cache block */
+uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block)
+{
+  uchar *block_buffer= block->buffer;
+  return block_buffer;
+}
+
+#if defined(PAGECACHE_TIMEOUT)
+
+#define KEYCACHE_DUMP_FILE  "pagecache_dump.txt"
+#define MAX_QUEUE_LEN  100
+
+
+/*
+  Debugging aid: write the state of a page cache to KEYCACHE_DUMP_FILE.
+
+  Dumps the queue of threads waiting for a hash link, the queue of threads
+  waiting for a block, every used block together with its wait queues, and
+  the LRU chain. At most MAX_QUEUE_LEN threads are printed per queue.
+  Does nothing if the dump file cannot be opened.
+*/
+static void pagecache_dump(PAGECACHE *pagecache)
+{
+  FILE *pagecache_dump_file=fopen(KEYCACHE_DUMP_FILE, "w");
+  struct st_my_thread_var *last;
+  struct st_my_thread_var *thread;
+  PAGECACHE_BLOCK_LINK *block;
+  PAGECACHE_HASH_LINK *hash_link;
+  PAGECACHE_PAGE *page;
+  uint i;
+  uint queue_len;                 /* threads printed for the current queue */
+
+  if (!pagecache_dump_file)
+    return;                       /* nowhere to write the dump to */
+
+  /* Identify the thread producing the dump ('thread' was unset before) */
+  thread= my_thread_var;
+  fprintf(pagecache_dump_file, "thread:%u\n", (uint) thread->id);
+
+  /*
+    NOTE(review): the two wait queues below are assumed to be members of
+    PAGECACHE (the original referenced them as bare identifiers) - confirm
+    against the PAGECACHE declaration.
+  */
+  queue_len= 0;
+  thread=last=pagecache->waiting_for_hash_link.last_thread;
+  fprintf(pagecache_dump_file, "queue of threads waiting for hash link\n");
+  if (thread)
+    do
+    {
+      thread= thread->next;
+      page= (PAGECACHE_PAGE *) thread->opt_info;
+      fprintf(pagecache_dump_file,
+              "thread:%u, (file,pageno)=(%u,%lu)\n",
+              (uint) thread->id,(uint) page->file.file,(ulong) page->pageno);
+      if (++queue_len == MAX_QUEUE_LEN)
+        break;
+    }
+    while (thread != last);
+
+  queue_len= 0;
+  thread=last=pagecache->waiting_for_block.last_thread;
+  fprintf(pagecache_dump_file, "queue of threads waiting for block\n");
+  if (thread)
+    do
+    {
+      thread=thread->next;
+      hash_link= (PAGECACHE_HASH_LINK *) thread->opt_info;
+      fprintf(pagecache_dump_file,
+        "thread:%u hash_link:%u (file,pageno)=(%u,%lu)\n",
+        (uint) thread->id,
+        (uint) PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link),
+        (uint) hash_link->file.file,(ulong) hash_link->pageno);
+      if (++queue_len == MAX_QUEUE_LEN)
+        break;
+    }
+    while (thread != last);
+
+  for (i=0 ; i < pagecache->blocks_used ; i++)
+  {
+    int j;
+    block= &pagecache->block_root[i];
+    hash_link= block->hash_link;
+    fprintf(pagecache_dump_file,
+            "block:%u hash_link:%d status:%x #requests=%u waiting_for_readers:%d\n",
+            i, (int) (hash_link ?
+                      PAGECACHE_HASH_LINK_NUMBER(pagecache, hash_link) :
+                      -1),
+            block->status, block->requests, block->condvar ? 1 : 0);
+    for (j=0 ; j < COND_SIZE; j++)
+    {
+      PAGECACHE_WQUEUE *wqueue=&block->wqueue[j];
+      thread= last= wqueue->last_thread;
+      fprintf(pagecache_dump_file, "queue #%d\n", j);
+      if (thread)
+      {
+        /*
+          Count with a separate variable: incrementing 'i' here would
+          corrupt the index of the enclosing loop over blocks.
+        */
+        queue_len= 0;
+        do
+        {
+          thread=thread->next;
+          fprintf(pagecache_dump_file,
+                  "thread:%u\n", (uint) thread->id);
+          if (++queue_len == MAX_QUEUE_LEN)
+            break;
+        }
+        while (thread != last);
+      }
+    }
+  }
+  fprintf(pagecache_dump_file, "LRU chain:");
+  block= pagecache->used_last;
+  if (block)
+  {
+    do
+    {
+      block= block->next_used;
+      fprintf(pagecache_dump_file,
+              "block:%u, ", PCBLOCK_NUMBER(pagecache, block));
+    }
+    while (block != pagecache->used_last);
+  }
+  fprintf(pagecache_dump_file, "\n");
+
+  fclose(pagecache_dump_file);
+}
+
+#endif /* defined(PAGECACHE_TIMEOUT) */
+
+#if defined(PAGECACHE_TIMEOUT) && !defined(__WIN__)
+
+
+/*
+  Wait on a condition variable for at most PAGECACHE_TIMEOUT seconds.
+
+  Wrapper around pthread_cond_timedwait() used to detect hangs: a timeout
+  is treated as fatal (log + abort() under PAGECACHE_DEBUG, otherwise a
+  dump attempt followed by a failing assert).
+
+  cond   condition variable to wait on
+  mutex  mutex associated with 'cond'; must be held by the caller
+
+  Returns the result of pthread_cond_timedwait().
+*/
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+                                      pthread_mutex_t *mutex)
+{
+  int rc;
+  struct timeval  now;            /* time when we started waiting        */
+  struct timespec timeout;        /* timeout value for the wait function */
+  struct timezone tz;
+#if defined(PAGECACHE_DEBUG)
+  /*
+    Must persist across calls, otherwise "cnt % 100 == 0" below can never
+    be true. It is only updated before the wait, i.e. while the caller
+    still holds 'mutex'.
+  */
+  static int cnt=0;
+#endif
+
+  /* Get current time */
+  gettimeofday(&now, &tz);
+  /* Prepare timeout value */
+  timeout.tv_sec= now.tv_sec + PAGECACHE_TIMEOUT;
+ /*
+   timeval uses microseconds.
+   timespec uses nanoseconds.
+   1 microsecond = 1000 nanoseconds
+ */
+  timeout.tv_nsec= now.tv_usec * 1000;
+  KEYCACHE_THREAD_TRACE_END("started waiting");
+#if defined(PAGECACHE_DEBUG)
+  cnt++;
+  if (cnt % 100 == 0)
+  {
+    fprintf(pagecache_debug_log, "waiting...\n");
+    fflush(pagecache_debug_log);
+  }
+#endif
+  rc= pthread_cond_timedwait(cond, mutex, &timeout);
+  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+  if (rc == ETIMEDOUT || rc == ETIME)
+  {
+#if defined(PAGECACHE_DEBUG)
+    fprintf(pagecache_debug_log,"aborted by pagecache timeout\n");
+    fclose(pagecache_debug_log);
+    abort();
+#endif
+    /*
+      NOTE(review): pagecache_dump() is declared with a PAGECACHE *
+      parameter but no page cache is available in this function, and under
+      PAGECACHE_DEBUG this call is unreachable after abort() - confirm how
+      the dump is meant to be invoked.
+    */
+    pagecache_dump();
+  }
+
+#if defined(PAGECACHE_DEBUG)
+  KEYCACHE_DBUG_ASSERT(rc != ETIMEDOUT);
+#else
+  assert(rc != ETIMEDOUT);
+#endif
+  return rc;
+}
+#else
+#if defined(PAGECACHE_DEBUG)
+/*
+  Debug variant without timeout: waits on 'cond' with 'mutex' and emits a
+  thread trace before and after the wait. Returns the result of
+  pthread_cond_wait().
+*/
+static int pagecache_pthread_cond_wait(pthread_cond_t *cond,
+                                      pthread_mutex_t *mutex)
+{
+  int wait_status;
+
+  KEYCACHE_THREAD_TRACE_END("started waiting");
+  wait_status= pthread_cond_wait(cond, mutex);
+  KEYCACHE_THREAD_TRACE_BEGIN("finished waiting");
+
+  return wait_status;
+}
+#endif
+#endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */
+
+#if defined(PAGECACHE_DEBUG)
+/*
+  Debug wrapper: locks 'mutex', then emits a thread-trace "begin" record.
+  Returns the result of pthread_mutex_lock().
+*/
+static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+  int lock_status= pthread_mutex_lock(mutex);
+
+  KEYCACHE_THREAD_TRACE_BEGIN("");
+  return lock_status;
+}
+
+
+/*
+  Debug wrapper: emits a thread-trace "end" record, then unlocks 'mutex'.
+  The result of pthread_mutex_unlock() is not reported.
+*/
+static void ___pagecache_pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+  KEYCACHE_THREAD_TRACE_END("");
+  (void) pthread_mutex_unlock(mutex);
+}
+
+
+/*
+  Debug wrapper: emits a "signal" thread-trace record, then signals 'cond'.
+  Returns the result of pthread_cond_signal().
+*/
+static int ___pagecache_pthread_cond_signal(pthread_cond_t *cond)
+{
+  KEYCACHE_THREAD_TRACE("signal");
+  return pthread_cond_signal(cond);
+}
+
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+/*
+  printf-style output of one line to the page cache debug log.
+  A newline is appended after the formatted text. Nothing is written when
+  pagecache_debug_log is not set.
+*/
+static void pagecache_debug_print(const char * fmt, ...)
+{
+  va_list args;
+
+  if (!pagecache_debug_log)
+    return;
+
+  va_start(args,fmt);
+  VOID(vfprintf(pagecache_debug_log, fmt, args));
+  VOID(fputc('\n',pagecache_debug_log));
+  va_end(args);
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#if defined(PAGECACHE_DEBUG_LOG)
+
+
+/*
+  Close the page cache debug log, if it is open.
+
+  The stream pointer is reset afterwards so that the
+  "if (pagecache_debug_log)" guards keep working: later debug prints become
+  no-ops instead of writing to a closed stream, and calling this function
+  twice does not close the stream twice.
+*/
+void pagecache_debug_log_close(void)
+{
+  if (pagecache_debug_log)
+  {
+    fclose(pagecache_debug_log);
+    pagecache_debug_log= NULL;
+  }
+}
+#endif /* defined(PAGECACHE_DEBUG_LOG) */
+
+#endif /* defined(PAGECACHE_DEBUG) */
diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h
new file mode 100644
index 00000000000..821728ef374
--- /dev/null
+++ b/storage/maria/ma_pagecache.h
@@ -0,0 +1,325 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Page cache variable structures */
+
+#ifndef _ma_pagecache_h
+#define _ma_pagecache_h
+C_MODE_START
+
+#include "ma_loghandler_lsn.h"
+#include <m_string.h>
+#include <hash.h>
+
+/* Type of the page */
+enum pagecache_page_type
+{
+  /*
+    Used only to control page type changes during debugging. This value
+    should only be used when using DBUG.
+  */
+  PAGECACHE_EMPTY_PAGE,
+  /* the page does not contain an LSN */
+  PAGECACHE_PLAIN_PAGE,
+  /* the page contains an LSN (maria tablespace page) */
+  PAGECACHE_LSN_PAGE,
+  /* Page type used when scanning a file and we don't care about the type */
+  PAGECACHE_READ_UNKNOWN_PAGE
+};
+
+/*
+  This enum describes lock status transitions (old state -> new state).
+  Every type of page cache will interpret WRITE/READ locks as it needs.
+*/
+enum pagecache_page_lock
+{
+  PAGECACHE_LOCK_LEFT_UNLOCKED,       /* free  -> free  */
+  PAGECACHE_LOCK_LEFT_READLOCKED,     /* read  -> read  */
+  PAGECACHE_LOCK_LEFT_WRITELOCKED,    /* write -> write */
+  PAGECACHE_LOCK_READ,                /* free  -> read  */
+  PAGECACHE_LOCK_WRITE,               /* free  -> write */
+  PAGECACHE_LOCK_READ_UNLOCK,         /* read  -> free  */
+  PAGECACHE_LOCK_WRITE_UNLOCK,        /* write -> free  */
+  PAGECACHE_LOCK_WRITE_TO_READ        /* write -> read  */
+};
+/*
+  This enum describes pin status transitions (old state -> new state)
+*/
+enum pagecache_page_pin
+{
+  PAGECACHE_PIN_LEFT_PINNED,   /* pinned   -> pinned   */
+  PAGECACHE_PIN_LEFT_UNPINNED, /* unpinned -> unpinned */
+  PAGECACHE_PIN,               /* unpinned -> pinned   */
+  PAGECACHE_UNPIN              /* pinned   -> unpinned */
+};
+/* How to write the page */
+enum pagecache_write_mode
+{
+  /* do not write immediately, i.e. it will be a dirty page */
+  PAGECACHE_WRITE_DELAY,
+  /* the page is already in the file (key cache insert analogue) */
+  PAGECACHE_WRITE_DONE
+};
+
+/* page number for maria */
+typedef ulonglong pgcache_page_no_t;
+
+/* file descriptor for Maria, together with its per-file page callbacks */
+typedef struct st_pagecache_file
+{
+  File file;
+  /** Cannot be NULL */
+  my_bool (*read_callback)(uchar *page, pgcache_page_no_t offset,
+                           uchar *data);
+  /** Cannot be NULL */
+  my_bool (*write_callback)(uchar *page, pgcache_page_no_t offset,
+                            uchar *data);
+  void (*write_fail)(uchar *data);      /* WF of pagecache_file_init() */
+  /** Cannot be NULL */
+  my_bool (*flush_log_callback)(uchar *page, pgcache_page_no_t offset,
+                                uchar *data);
+  uchar *callback_data;  /* presumably the callbacks' 'data' arg - confirm */
+} PAGECACHE_FILE;
+
+/* declare structures that are used by st_pagecache */
+
+struct st_pagecache_block_link;
+typedef struct st_pagecache_block_link PAGECACHE_BLOCK_LINK;
+struct st_pagecache_page;
+typedef struct st_pagecache_page PAGECACHE_PAGE;
+struct st_pagecache_hash_link;
+typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK;
+
+#include <wqueue.h>
+
+#define PAGECACHE_CHANGED_BLOCKS_HASH 128  /* must be power of 2 */
+#define PAGECACHE_PRIORITY_LOW 0
+#define PAGECACHE_PRIORITY_DEFAULT 3
+#define PAGECACHE_PRIORITY_HIGH 6
+
+/*
+  The page cache structure.
+  It also contains read-only statistics parameters.
+*/
+
+typedef struct st_pagecache
+{
+  size_t mem_size;               /* specified size of the cache memory       */
+  ulong min_warm_blocks;         /* min number of warm blocks;               */
+  ulong age_threshold;           /* age threshold for hot blocks             */
+  ulonglong time;                /* total number of block link operations    */
+  ulong hash_entries;            /* max number of entries in the hash table  */
+  long hash_links;               /* max number of hash links                 */
+  long hash_links_used;   /* number of hash links taken from free links pool */
+  long disk_blocks;              /* max number of blocks in the cache        */
+  ulong blocks_used;           /* maximum number of concurrently used blocks */
+  ulong blocks_unused;           /* number of currently unused blocks        */
+  ulong blocks_changed;          /* number of currently dirty blocks         */
+  ulong warm_blocks;             /* number of blocks in warm sub-chain       */
+  ulong cnt_for_resize_op;       /* counter to block resize operation        */
+  ulong blocks_available;     /* number of blocks available in the LRU chain */
+  long blocks;                   /* max number of blocks in the cache        */
+  uint32 block_size;             /* size of the page buffer of a cache block */
+  PAGECACHE_HASH_LINK **hash_root;/* arr. of entries into hash table buckets */
+  PAGECACHE_HASH_LINK *hash_link_root;/* memory for hash table links         */
+  PAGECACHE_HASH_LINK *free_hash_list;/* list of free hash links             */
+  PAGECACHE_BLOCK_LINK *free_block_list;/* list of free blocks               */
+  PAGECACHE_BLOCK_LINK *block_root;/* memory for block links                 */
+  uchar HUGE_PTR *block_mem;     /* memory for block buffers                 */
+  PAGECACHE_BLOCK_LINK *used_last;/* ptr to the last block of the LRU chain  */
+  PAGECACHE_BLOCK_LINK *used_ins;/* ptr to the insertion block in LRU chain  */
+  pthread_mutex_t cache_lock;    /* to lock access to the cache structure    */
+  WQUEUE resize_queue; /* threads waiting during resize operation  */
+  WQUEUE waiting_for_hash_link;/* waiting for a free hash link     */
+  WQUEUE waiting_for_block;   /* requests waiting for a free block */
+  /* hash for dirty file blocks */
+  PAGECACHE_BLOCK_LINK *changed_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+  /* hash for other file blocks */
+  PAGECACHE_BLOCK_LINK *file_blocks[PAGECACHE_CHANGED_BLOCKS_HASH];
+
+  /*
+    The following variables are used to hold parameters for
+    initializing the page cache.
+  */
+
+  ulonglong param_buff_size;    /* size the memory allocated for the cache  */
+  ulong param_block_size;       /* size of the blocks in the page cache     */
+  ulong param_division_limit;   /* min. percentage of warm blocks           */
+  ulong param_age_threshold;    /* determines when hot block is downgraded  */
+
+  /* Statistics variables. These are reset in reset_pagecache_counters().    */
+  ulong global_blocks_changed;	/* number of currently dirty blocks          */
+  ulonglong global_cache_w_requests;/* number of write requests (write hits) */
+  ulonglong global_cache_write;     /* number of writes from cache to files  */
+  ulonglong global_cache_r_requests;/* number of read requests (read hits)   */
+  ulonglong global_cache_read;      /* number of reads from files to cache   */
+
+  uint shift;                       /* block size = 2 ^ shift                */
+  myf  readwrite_flags;             /* Flags to pread/pwrite() */
+  myf  org_readwrite_flags;         /* Flags to pread/pwrite() at init */
+  my_bool inited;
+  my_bool resize_in_flush;       /* true during flush of resize operation    */
+  my_bool can_be_used;           /* usage of cache for read/write is allowed */
+  my_bool in_init;		/* Set to 1 in MySQL during init/resize     */
+  HASH    files_in_flush;       /**< files in flush_pagecache_blocks_int() */
+} PAGECACHE;
+
+/** @brief Return values of a PAGECACHE_FLUSH_FILTER callback */
+enum pagecache_flush_filter_result
+{
+  FLUSH_FILTER_SKIP_TRY_NEXT= 0,/**< skip page and move on to next one */
+  FLUSH_FILTER_OK,              /**< flush page and move on to next one */
+  FLUSH_FILTER_SKIP_ALL         /**< skip page and all following ones */
+};
+/** @brief a filter function type for flush_pagecache_blocks_with_filter() */
+typedef enum pagecache_flush_filter_result
+(*PAGECACHE_FLUSH_FILTER)(enum pagecache_page_type type,
+                          pgcache_page_no_t page,
+                          LSN rec_lsn, void *arg);
+
+/* The default page cache */
+extern PAGECACHE dflt_pagecache_var, *dflt_pagecache;
+
+extern ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem,
+                            uint division_limit, uint age_threshold,
+                            uint block_size, myf my_read_flags);
+extern ulong resize_pagecache(PAGECACHE *pagecache,
+                              size_t use_mem, uint division_limit,
+                              uint age_threshold);
+extern void change_pagecache_param(PAGECACHE *pagecache, uint division_limit,
+                                   uint age_threshold);
+
+extern uchar *pagecache_read(PAGECACHE *pagecache,
+                             PAGECACHE_FILE *file,
+                             pgcache_page_no_t pageno,
+                             uint level,
+                             uchar *buff,
+                             enum pagecache_page_type type,
+                             enum pagecache_page_lock lock,
+                             PAGECACHE_BLOCK_LINK **link);
+/* Whole-page write: pagecache_write_part() with offset 0, size block_size */
+#define  pagecache_write(P,F,N,L,B,T,O,I,M,K,R) \
+   pagecache_write_part(P,F,N,L,B,T,O,I,M,K,R,0,(P)->block_size)
+/* Whole-page pagecache_write_part() with mode fixed to PAGECACHE_WRITE_DONE */
+#define  pagecache_inject(P,F,N,L,B,T,O,I,K,R) \
+   pagecache_write_part(P,F,N,L,B,T,O,I,PAGECACHE_WRITE_DONE, \
+                        K,R,0,(P)->block_size)
+
+extern my_bool pagecache_write_part(PAGECACHE *pagecache,
+                                    PAGECACHE_FILE *file,
+                                    pgcache_page_no_t pageno,
+                                    uint level,
+                                    uchar *buff,
+                                    enum pagecache_page_type type,
+                                    enum pagecache_page_lock lock,
+                                    enum pagecache_page_pin pin,
+                                    enum pagecache_write_mode write_mode,
+                                    PAGECACHE_BLOCK_LINK **link,
+                                    LSN first_REDO_LSN_for_page,
+                                    uint offset,
+                                    uint size);
+extern void pagecache_unlock(PAGECACHE *pagecache,
+                             PAGECACHE_FILE *file,
+                             pgcache_page_no_t pageno,
+                             enum pagecache_page_lock lock,
+                             enum pagecache_page_pin pin,
+                             LSN first_REDO_LSN_for_page,
+                             LSN lsn, my_bool was_changed);
+extern void pagecache_unlock_by_link(PAGECACHE *pagecache,
+                                     PAGECACHE_BLOCK_LINK *block,
+                                     enum pagecache_page_lock lock,
+                                     enum pagecache_page_pin pin,
+                                     LSN first_REDO_LSN_for_page,
+                                     LSN lsn, my_bool was_changed,
+                                     my_bool any);
+extern void pagecache_unpin(PAGECACHE *pagecache,
+                            PAGECACHE_FILE *file,
+                            pgcache_page_no_t pageno,
+                            LSN lsn);
+extern void pagecache_unpin_by_link(PAGECACHE *pagecache,
+                                    PAGECACHE_BLOCK_LINK *link,
+                                    LSN lsn);
+
+
+/* Results of flush operation (bit field in fact) */
+
+/* The flush is done. */
+#define PCFLUSH_OK 0
+/* There were errors during the flush process. */
+#define PCFLUSH_ERROR 1
+/* Pinned blocks were met and skipped. */
+#define PCFLUSH_PINNED 2
+/* PCFLUSH_ERROR and PCFLUSH_PINNED. */
+#define PCFLUSH_PINNED_AND_ERROR (PCFLUSH_ERROR|PCFLUSH_PINNED)
+/* Set every callback of PAGECACHE_FILE F (an lvalue); F.file is untouched */
+#define pagecache_file_init(F,RC,WC,WF,GLC,D) \
+  do{ \
+    (F).read_callback= (RC); (F).write_callback= (WC); \
+    (F).write_fail= (WF); \
+    (F).flush_log_callback= (GLC); (F).callback_data= (uchar*)(D); \
+  } while(0)
+/* Flush without a filter: filter and filter_arg are both passed as NULL */
+#define flush_pagecache_blocks(A,B,C)                   \
+  flush_pagecache_blocks_with_filter(A,B,C,NULL,NULL)
+extern int flush_pagecache_blocks_with_filter(PAGECACHE *keycache,
+                                              PAGECACHE_FILE *file,
+                                              enum flush_type type,
+                                              PAGECACHE_FLUSH_FILTER filter,
+                                              void *filter_arg);
+extern my_bool pagecache_delete(PAGECACHE *pagecache,
+                                PAGECACHE_FILE *file,
+                                pgcache_page_no_t pageno,
+                                enum pagecache_page_lock lock,
+                                my_bool flush);
+extern my_bool pagecache_delete_by_link(PAGECACHE *pagecache,
+					PAGECACHE_BLOCK_LINK *link,
+					enum pagecache_page_lock lock,
+					my_bool flush);
+extern my_bool pagecache_delete_pages(PAGECACHE *pagecache,
+                                      PAGECACHE_FILE *file,
+                                      pgcache_page_no_t pageno,
+                                      uint page_count,
+                                      enum pagecache_page_lock lock,
+                                      my_bool flush);
+extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
+extern my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache,
+                                                         LEX_STRING *str,
+                                                         LSN *min_lsn);
+extern int reset_pagecache_counters(const char *name, PAGECACHE *pagecache);
+extern uchar *pagecache_block_link_to_buffer(PAGECACHE_BLOCK_LINK *block);
+
+extern uint pagecache_pagelevel(PAGECACHE_BLOCK_LINK *block);
+extern void pagecache_add_level_by_link(PAGECACHE_BLOCK_LINK *block,
+					uint level);
+
+/* Functions to handle multiple page caches */
+extern my_bool multi_pagecache_init(void);
+extern void multi_pagecache_free(void);
+extern PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+                                         PAGECACHE *def);
+extern my_bool multi_pagecache_set(const uchar *key, uint length,
+				   PAGECACHE *pagecache);
+extern void multi_pagecache_change(PAGECACHE *old_data,
+				   PAGECACHE *new_data);
+extern int reset_pagecache_counters(const char *name,
+                                    PAGECACHE *pagecache);
+#ifndef DBUG_OFF
+void pagecache_file_no_dirty_page(PAGECACHE *pagecache, PAGECACHE_FILE *file);
+#else
+#define pagecache_file_no_dirty_page(A,B) {}
+#endif
+
+C_MODE_END
+#endif /* _ma_pagecache_h */
diff --git a/storage/maria/ma_pagecaches.c b/storage/maria/ma_pagecaches.c
new file mode 100644
index 00000000000..8a1423ee0d7
--- /dev/null
+++ b/storage/maria/ma_pagecaches.c
@@ -0,0 +1,104 @@
+/* Copyright (C) 2003-2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Handling of multiple page caches
+
+  The idea is to have a thread safe hash on the table name,
+  with a default page cache value that is returned if the table name is not
+  in the hash.
+*/
+
+#include "maria_def.h"
+#include "ma_pagecache.h"
+#include <hash.h>
+#include <m_string.h>
+#include "../../mysys/my_safehash.h"
+
+/*****************************************************************************
+  Functions to handle the pagecache objects
+*****************************************************************************/
+
+/* Variable to store all page cache objects */
+static SAFE_HASH pagecache_hash;
+
+/* Init pagecache_hash; maria_pagecache is presumably its default value. */
+my_bool multi_pagecache_init(void)
+{
+  return safe_hash_init(&pagecache_hash, 16, (uchar*) maria_pagecache);
+}
+
+/* Release pagecache_hash via safe_hash_free(). */
+void multi_pagecache_free(void)
+{
+  safe_hash_free(&pagecache_hash);
+}
+
+/*
+  Look up the page cache assigned to a key
+
+  SYNOPSIS
+    multi_pagecache_search()
+    key				Key to look up (usually a table path)
+    length			Length of key
+    def				Page cache to return when there is no match
+
+  NOTES
+    While the hash holds no records we return 'def' without calling
+    safe_hash_search() at all, so this also works when
+    multi_pagecache_init() has never been called.
+
+  RETURN
+    Page cache to use
+*/
+
+PAGECACHE *multi_pagecache_search(uchar *key, uint length,
+                                  PAGECACHE *def)
+{
+  /* An empty hash can only ever produce the default */
+  return (pagecache_hash.hash.records ?
+          (PAGECACHE*) safe_hash_search(&pagecache_hash, key, length,
+                                        (void*) def) : def);
+}
+
+
+/*
+  Associate a page cache with a key
+
+
+  SYNOPSIS
+    multi_pagecache_set()
+    key				key (path to table etc..)
+    length			Length of key
+    pagecache			cache to associate with the table
+
+  NOTES
+    This can be used both to insert a new entry and change an existing
+    entry
+*/
+
+
+my_bool multi_pagecache_set(const uchar *key, uint length,
+			    PAGECACHE *pagecache)
+{
+  return safe_hash_set(&pagecache_hash, key, length, (uchar*) pagecache);
+}
+
+/* Forward old_data/new_data to safe_hash_change() on pagecache_hash. */
+void multi_pagecache_change(PAGECACHE *old_data,
+			    PAGECACHE *new_data)
+{
+  safe_hash_change(&pagecache_hash, (uchar*) old_data, (uchar*) new_data);
+}
diff --git a/storage/maria/ma_pagecrc.c b/storage/maria/ma_pagecrc.c
new file mode 100644
index 00000000000..640bb8880f4
--- /dev/null
+++ b/storage/maria/ma_pagecrc.c
@@ -0,0 +1,378 @@
+/* Copyright (C) 2007-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+
+/**
+  @brief Compute the CRC of a page, never returning a reserved value
+
+  @param start           Seed for the CRC (callers pass the page number)
+  @param data            Start of the bytes to checksum
+  @param length          Number of bytes to checksum
+
+  @return CRC of the data, clamped below the MARIA_NO_CRC_* markers
+*/
+
+static uint32 maria_page_crc(uint32 start, uchar *data, uint length)
+{
+  uint32 checksum= crc32(start, data, length);
+
+  /* The clamp below needs the two markers to be the two largest values */
+  compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE ==
+                      MARIA_NO_CRC_NORMAL_PAGE - 1 &&
+                      MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff);
+
+  /* A real checksum must never look like a "page has no CRC" marker */
+  return (checksum >= MARIA_NO_CRC_BITMAP_PAGE ?
+          MARIA_NO_CRC_BITMAP_PAGE - 1 : checksum);
+}
+
+/**
+  @brief Common helper of the Maria read callbacks (checks the page CRC)
+
+  @param page            The page data to check
+  @param page_no         The page number (<offset>/<page length>)
+  @param share           pointer to MARIA_SHARE
+  @param no_crc_val      Value which means CRC absence
+                         (MARIA_NO_CRC_NORMAL_PAGE or MARIA_NO_CRC_BITMAP_PAGE)
+  @param data_length     length of data to calculate CRC over
+
+  @retval 0 OK
+  @retval 1 Error (my_errno is set to HA_ERR_WRONG_CRC)
+*/
+
+static my_bool maria_page_crc_check(uchar *page,
+                                    pgcache_page_no_t page_no,
+                                    MARIA_SHARE *share,
+                                    uint32 no_crc_val,
+                                    int data_length)
+{
+  uint32 crc= uint4korr(page + share->block_size - CRC_SIZE), new_crc;
+  my_bool res;
+  DBUG_ENTER("maria_page_crc_check");
+
+  DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE);
+
+  /* we need this assert to get the following comparison working */
+  compile_time_assert(MARIA_NO_CRC_BITMAP_PAGE ==
+                      MARIA_NO_CRC_NORMAL_PAGE - 1 &&
+                      MARIA_NO_CRC_NORMAL_PAGE == 0xffffffff);
+  /*
+    A stored value in the reserved range means the page has no crc, so
+    there is nothing to check except that it is the expected no_crc_val.
+  */
+  if (crc >= MARIA_NO_CRC_BITMAP_PAGE)
+  {
+    DBUG_PRINT("info", ("No crc: %lu  crc: %lu  page: %lu  ",
+                        (ulong) no_crc_val, (ulong) crc, (ulong) page_no));
+    if (crc != no_crc_val)
+    {
+      my_errno= HA_ERR_WRONG_CRC;
+      DBUG_PRINT("error", ("Wrong no CRC value"));
+      DBUG_RETURN(1);
+    }
+    DBUG_RETURN(0);
+  }
+  new_crc= maria_page_crc((uint32) page_no, page, data_length);
+  DBUG_ASSERT(new_crc != no_crc_val);
+  res= test(new_crc != crc);
+  if (res)
+  {
+    /*
+      Bitmap pages may be totally zero filled in some cases.
+      This happens when we get a crash after the pagecache has written
+      out a page that is on a newly created bitmap page and we get
+      a crash before the bitmap page is written out.
+
+      We handle this case with the following logic:
+      When reading, approve of bitmap pages where all bytes are zero
+      (This is after all a bitmap page where no data is reserved and
+      the CRC will be corrected at next write)
+    */
+    if (no_crc_val == MARIA_NO_CRC_BITMAP_PAGE &&
+        crc == 0 && _ma_check_if_zero(page, data_length))
+    {
+      DBUG_PRINT("warning", ("Found bitmap page that was not initialized"));
+      DBUG_RETURN(0);
+    }
+
+    DBUG_PRINT("error", ("Page: %lu  crc: %lu  calculated crc: %lu",
+                         (ulong) page_no, (ulong) crc, (ulong) new_crc));
+    my_errno= HA_ERR_WRONG_CRC;
+  }
+  DBUG_RETURN(res);
+}
+
+
+/**
+  @brief Maria pages write callback: checksums everything before the last
+  CRC_SIZE bytes of the page and stores the CRC in those last bytes
+
+  @param page            The page data to set
+  @param page_no         The page number; its low 32 bits seed the CRC
+  @param data_ptr        Write callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK (this callback never fails)
+*/
+
+my_bool maria_page_crc_set_normal(uchar *page,
+                                  pgcache_page_no_t page_no,
+                                  uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  int data_length= share->block_size - CRC_SIZE;
+  uint32 crc= maria_page_crc((uint32) page_no, page, data_length);
+  DBUG_ENTER("maria_page_crc_set_normal");
+  DBUG_PRINT("info", ("Page %lu  crc: %lu", (ulong) page_no, (ulong)crc));
+
+  /* crc is on the stack so it is aligned, pagecache buffer is aligned, too */
+  int4store_aligned(page + data_length, crc);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Maria pages write callback (CRC over the used part of a key page)
+
+  @param page            The page data to set
+  @param page_no         The page number; its low 32 bits seed the CRC
+  @param data_ptr        Write callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK (this callback never fails)
+*/
+
+my_bool maria_page_crc_set_index(uchar *page,
+                                 pgcache_page_no_t page_no,
+                                 uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  int data_length= _ma_get_page_used(share, page);
+  uint32 crc= maria_page_crc((uint32) page_no, page, data_length);
+  DBUG_ENTER("maria_page_crc_set_index");
+  DBUG_PRINT("info", ("Page %lu  crc: %lu",
+                      (ulong) page_no, (ulong) crc));
+  DBUG_ASSERT((uint)data_length <= share->block_size - CRC_SIZE);
+  /* Both sides aligned. NOTE(review): assert above runs after crc is done */
+  int4store_aligned(page + share->block_size - CRC_SIZE, crc);
+  DBUG_RETURN(0);
+}
+
+
+/* interface functions */
+
+
+/**
+  @brief Maria pages read callback (checks the page CRC) for index/data pages
+
+  @param page            The page data to check
+  @param page_no         The page number (narrowed to uint32 before use)
+  @param data_ptr        Read callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_data(uchar *page,
+                                  pgcache_page_no_t page_no,
+                                  uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  return (maria_page_crc_check(page, (uint32) page_no, share,
+                               MARIA_NO_CRC_NORMAL_PAGE,
+                               share->block_size - CRC_SIZE));
+}
+
+
+/**
+  @brief Maria pages read callback (checks the page CRC) for bitmap pages
+
+  @param page            The page data to check
+  @param page_no         The page number (narrowed to uint32 before use)
+  @param data_ptr        Read callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool maria_page_crc_check_bitmap(uchar *page,
+                                    pgcache_page_no_t page_no,
+                                    uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  return (maria_page_crc_check(page, (uint32) page_no, share,
+                               MARIA_NO_CRC_BITMAP_PAGE,
+                               share->block_size - CRC_SIZE));
+}
+
+
+/**
+  @brief Maria pages read callback (checks the page CRC) for index pages
+
+  @param page            The page data to check
+  @param page_no         The page number (<offset>/<page length>)
+  @param data_ptr        Read callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK
+  @retval 1 Error (bad used-length or bad CRC; my_errno= HA_ERR_WRONG_CRC)
+*/
+
+my_bool maria_page_crc_check_index(uchar *page,
+                                   pgcache_page_no_t page_no,
+                                   uchar *data_ptr)
+{
+  MARIA_SHARE *share= (MARIA_SHARE *)data_ptr;
+  uint length= _ma_get_page_used(share, page);
+  if (length > share->block_size - CRC_SIZE)
+  {
+    DBUG_PRINT("error", ("Wrong page length: %u", length));
+    my_errno= HA_ERR_WRONG_CRC;         /* error code goes to my_errno only */
+    return 1;                           /* not squeezed through my_bool     */
+  }
+  return maria_page_crc_check(page, (uint32) page_no, share,
+                              MARIA_NO_CRC_NORMAL_PAGE, length);
+}
+
+
+/**
+  @brief Maria pages dummy read callback for temporary tables
+
+  All arguments are ignored.
+  @retval 0 Always
+*/
+
+my_bool maria_page_crc_check_none(uchar *page __attribute__((unused)),
+                                  pgcache_page_no_t page_no
+                                  __attribute__((unused)),
+                                  uchar *data_ptr __attribute__((unused)))
+{
+  return 0;
+}
+
+
+/**
+  @brief Maria pages write callback (sets the page filler for index/data)
+
+  @param page            Page whose last CRC_SIZE bytes get the filler
+  @param page_no         The page number (only used in a debug assert)
+  @param data_ptr        Write callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK
+*/
+
+my_bool maria_page_filler_set_normal(uchar *page,
+                                     pgcache_page_no_t page_no
+                                     __attribute__((unused)),
+                                     uchar *data_ptr)
+{
+  DBUG_ENTER("maria_page_filler_set_normal");
+  DBUG_ASSERT(page_no != 0);                    /* Catches some simple bugs */
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    MARIA_NO_CRC_NORMAL_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Maria pages write callback (sets the page filler for bitmap)
+
+  @param page            Page whose last CRC_SIZE bytes get the filler
+  @param page_no         The page number (not used)
+  @param data_ptr        Write callback data pointer (pointer to MARIA_SHARE)
+
+  @retval 0 OK
+*/
+
+my_bool maria_page_filler_set_bitmap(uchar *page,
+                                     pgcache_page_no_t page_no
+                                     __attribute__((unused)),
+                                     uchar *data_ptr)
+{
+  DBUG_ENTER("maria_page_filler_set_bitmap");
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    MARIA_NO_CRC_BITMAP_PAGE);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  @brief Maria pages dummy write callback for temporary tables
+
+  @retval 0 OK (with HAVE_valgrind the CRC slot is first set to 0)
+*/
+
+my_bool maria_page_filler_set_none(uchar *page __attribute__((unused)),
+                                   pgcache_page_no_t page_no
+                                   __attribute__((unused)),
+                                   uchar *data_ptr __attribute__((unused)))
+{
+#ifdef HAVE_valgrind
+  int4store_aligned(page + ((MARIA_SHARE *)data_ptr)->block_size - CRC_SIZE,
+                    0);
+#endif
+  return 0;
+}
+
+
+/**
+  @brief Write failure callback (marks the share as crashed)
+
+  @param data_ptr        Callback data pointer (pointer to MARIA_SHARE)
+*/
+
+void maria_page_write_failure(uchar* data_ptr)
+{
+  maria_mark_crashed_share((MARIA_SHARE *)data_ptr);
+}
+
+
+/**
+  @brief Flush the log up to the LSN read from the page
+
+  @param page            The page whose LSN is read with lsn_korr()
+  @param page_no         The page number (not used)
+  @param data_ptr        Pointer to MARIA_SHARE (only used in a debug assert)
+
+  @retval 0  OK
+  @retval 1  error (translog_flush() failed)
+*/
+
+my_bool maria_flush_log_for_page(uchar *page,
+                                 pgcache_page_no_t page_no
+                                 __attribute__((unused)),
+                                 uchar *data_ptr __attribute__((unused)))
+{
+  LSN lsn;
+#ifndef DBUG_OFF
+  const MARIA_SHARE *share= (MARIA_SHARE*) data_ptr;
+#endif
+  DBUG_ENTER("maria_flush_log_for_page");
+  /* share is 0 here only in unittest */
+  DBUG_ASSERT(!share || (share->page_type == PAGECACHE_LSN_PAGE &&
+                         share->now_transactional));
+  lsn= lsn_korr(page);
+  if (translog_flush(lsn))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+/* Dummy flush-log callback: ignores all arguments and returns 0. */
+my_bool maria_flush_log_for_page_none(uchar *page __attribute__((unused)),
+                                      pgcache_page_no_t page_no
+                                      __attribute__((unused)),
+                                      uchar *data_ptr __attribute__((unused)))
+{
+  return 0;
+}
diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c
new file mode 100644
index 00000000000..a86563f31fb
--- /dev/null
+++ b/storage/maria/ma_panic.c
@@ -0,0 +1,140 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "ma_fulltext.h"
+
+/*
+  Stop usage of Maria
+
+  SYNOPSIS
+     maria_panic()
+     flag	HA_PANIC_CLOSE:  All maria files (tables and log) are closed.
+				 maria_end() is called.
+                HA_PANIC_WRITE:  All maria files are unlocked and
+                                 all changed data in single user maria is
+                                 written to file
+                HA_PANIC_READ    All maria files that were locked when
+			         maria_panic(HA_PANIC_WRITE) was done are
+                                 locked again. A maria_readinfo() is done
+                                 for all single user files to get changes
+                                 in database
+
+  RETURN
+    0  ok
+    #  error number in case of error
+*/
+
+int maria_panic(enum ha_panic_function flag)
+{
+  int error=0;
+  LIST *list_element,*next_open;
+  MARIA_HA *info;
+  DBUG_ENTER("maria_panic");
+
+  if (!maria_inited)
+    DBUG_RETURN(0);
+  pthread_mutex_lock(&THR_LOCK_maria);
+  for (list_element=maria_open_list ; list_element ; list_element=next_open)
+  {
+    next_open=list_element->next;		/* Save if close */
+    info=(MARIA_HA*) list_element->data;
+    switch (flag) {
+    case HA_PANIC_CLOSE:
+      /*
+        If bad luck (if some tables would be used now, which normally does not
+        happen in MySQL), as we release the mutex, the list may change and so
+        we may crash.
+      */
+      pthread_mutex_unlock(&THR_LOCK_maria);
+      if (maria_close(info))
+	error=my_errno;
+      pthread_mutex_lock(&THR_LOCK_maria);
+      break;
+    case HA_PANIC_WRITE:		/* Do this to free databases */
+#ifdef CANT_OPEN_FILES_TWICE
+      if (info->s->options & HA_OPTION_READ_ONLY_DATA)
+	break;
+#endif
+      if (flush_pagecache_blocks(info->s->pagecache, &info->s->kfile,
+                                 FLUSH_RELEASE))
+	error=my_errno;
+      if (info->opt_flag & WRITE_CACHE_USED)
+	if (flush_io_cache(&info->rec_cache))
+	  error=my_errno;
+      if (info->opt_flag & READ_CACHE_USED)
+      {
+	if (flush_io_cache(&info->rec_cache))
+	  error=my_errno;
+	reinit_io_cache(&info->rec_cache,READ_CACHE,0,
+		       (pbool) (info->lock_type != F_UNLCK),1);
+      }
+      if (info->lock_type != F_UNLCK && ! info->was_locked)
+      {
+	info->was_locked=info->lock_type;
+	if (maria_lock_database(info,F_UNLCK))
+	  error=my_errno;
+      }
+#ifdef CANT_OPEN_FILES_TWICE
+      if (info->s->kfile.file >= 0 && my_close(info->s->kfile.file, MYF(0)))
+	error = my_errno;
+      if (info->dfile.file >= 0 && my_close(info->dfile.file, MYF(0)))
+	error = my_errno;
+      info->s->kfile.file= info->dfile.file= -1;/* Files aren't open anymore */
+#endif
+      break;				/* Never fall through into READ */
+    case HA_PANIC_READ:			/* Restore to before WRITE */
+#ifdef CANT_OPEN_FILES_TWICE
+      {					/* Open closed files */
+	char name_buff[FN_REFLEN];
+        MARIA_SHARE *share= info->s;
+	if (share->kfile.file < 0)
+        {
+
+	  if ((share->kfile.file= my_open(fn_format(name_buff,
+                                                    info->filename, "",
+                                                    N_NAME_IEXT,4),
+                                          info->mode,
+                                          MYF(MY_WME))) < 0)
+	    error = my_errno;  
+        }
+	if (info->dfile.file < 0)
+	{
+	  if ((info->dfile.file= my_open(fn_format(name_buff, info->filename,
+                                                   "", N_NAME_DEXT, 4),
+                                         info->mode,
+                                         MYF(MY_WME))) < 0)
+	    error = my_errno;
+	  info->rec_cache.file= info->dfile.file;
+	}
+	if (share->bitmap.file.file < 0)
+	  share->bitmap.file.file= info->dfile.file;
+      }
+#endif
+      if (info->was_locked)
+      {
+	if (maria_lock_database(info, info->was_locked))
+	  error=my_errno;
+	info->was_locked=0;
+      }
+      break;
+    }
+  }
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  if (flag == HA_PANIC_CLOSE)
+    maria_end();
+  if (!error)
+    DBUG_RETURN(0);
+  DBUG_RETURN(my_errno=error);
+} /* maria_panic */
diff --git a/storage/maria/ma_preload.c b/storage/maria/ma_preload.c
new file mode 100644
index 00000000000..6dfb4e437b6
--- /dev/null
+++ b/storage/maria/ma_preload.c
@@ -0,0 +1,116 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Preload indexes into key cache
+*/
+
+#include "maria_def.h"
+
+
+/*
+  Preload pages of the index file for a table into the key cache
+
+  SYNOPSIS
+    maria_preload()
+      info          open table
+      key_map       map of indexes to preload into key cache
+      ignore_leaves only non-leaves pages are to be preloaded
+
+  RETURN VALUE
+    0 if a success. error code - otherwise.
+
+  NOTES.
+    Every page of the index file is read through the page cache. Pages of
+    indexes not in key_map (and leaf pages if ignore_leaves is set) are
+    removed from the cache again when we were the ones who put them there.
+    We don't yet use preload_buff_size (we read page after page).
+*/
+
+int maria_preload(MARIA_HA *info, ulonglong key_map, my_bool ignore_leaves)
+{
+  ulong block_length= 0;
+  uchar *buff;
+  MARIA_SHARE* share= info->s;
+  uint keynr;
+  my_off_t key_file_length= share->state.state.key_file_length;
+  pgcache_page_no_t page_no, page_no_max;
+  PAGECACHE_BLOCK_LINK *page_link;
+  DBUG_ENTER("maria_preload");
+
+  if (!share->state.header.keys || !maria_is_any_key_active(key_map) ||
+      (key_file_length == share->base.keystart))
+    DBUG_RETURN(0);
+
+  block_length= share->pagecache->block_size;
+
+  if (!(buff= (uchar *) my_malloc(block_length, MYF(MY_WME))))
+    DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
+
+  if (flush_pagecache_blocks(share->pagecache, &share->kfile, FLUSH_RELEASE))
+    goto err;
+
+  /*
+    Currently when we come here all other open instances of the table have
+    been closed, and we flushed all pages of our own instance, so there
+    cannot be any page of this table in the cache. Thus my_pread() would be
+    safe. But in the future, we will allow more concurrency during
+    preloading, so we use pagecache_read() instead of my_pread() because we
+    observed that on some Linux, concurrent pread() and pwrite() (which
+    could be from a page eviction by another thread) to the same page can
+    make pread() see a half-written page.
+    In this future, we should find a way to read state.key_file_length
+    reliably, handle concurrent shrinks (delete_all_rows()) etc.
+  */
+  for ((page_no= share->base.keystart / block_length),
+         (page_no_max= key_file_length / block_length);
+       page_no < page_no_max; page_no++)
+  {
+    /**
+      @todo instead of reading pages one by one we could have a call
+      pagecache_read_several_pages() which does a single my_pread() for many
+      consecutive pages (like the my_pread() in mi_preload()).
+    */
+    if (pagecache_read(share->pagecache, &share->kfile, page_no,
+                       DFLT_INIT_HITS, buff, share->page_type,
+                       PAGECACHE_LOCK_WRITE, &page_link) == NULL)
+      goto err;
+    keynr= _ma_get_keynr(share, buff);
+    if (((ignore_leaves && !_ma_test_if_nod(share, buff)) ||
+         keynr == MARIA_DELETE_KEY_NR ||
+         !(key_map & ((ulonglong) 1 << keynr))) &&
+        (pagecache_pagelevel(page_link) == DFLT_INIT_HITS))
+    {
+      /*
+        This page is not interesting, and (last condition above) we are the
+        ones who put it in the cache, so nobody else is interested in it.
+      */
+      if (pagecache_delete_by_link(share->pagecache, page_link,
+                                   PAGECACHE_LOCK_LEFT_WRITELOCKED, FALSE))
+        goto err;
+    }
+    else /* otherwise it stays in cache: */
+      pagecache_unlock_by_link(share->pagecache, page_link,
+                               PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN,
+                               LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, FALSE, FALSE);
+  }
+
+  my_free(buff, MYF(0));
+  DBUG_RETURN(0);
+
+err:
+  my_free(buff, MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_RETURN(my_errno= errno);
+}
diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c
new file mode 100644
index 00000000000..5dc4e3a9959
--- /dev/null
+++ b/storage/maria/ma_range.c
@@ -0,0 +1,312 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Gives an approximate number of how many records there are between two keys.
+  Used when optimizing queries.
+ */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+static ha_rows _ma_record_pos(MARIA_HA *,const uchar *, key_part_map,
+			      enum ha_rkey_function);
+static double _ma_search_pos(MARIA_HA *, MARIA_KEY *, uint32, my_off_t);
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key);
+
+
+/**
+   @brief Estimate how many records there are in a given range
+
+   @param  info            MARIA handler
+   @param  inx             Index to use
+   @param  min_key         Min key. Is = 0 if no min range
+   @param  max_key         Max key. Is = 0 if no max range
+
+   @note
+     We should ONLY return 0 if there are no rows in range
+
+   @return Estimated number of rows or error
+     @retval HA_POS_ERROR  error (or we can't estimate number of rows)
+     @retval number        Estimated number of rows
+*/
+
+ha_rows maria_records_in_range(MARIA_HA *info, int inx, key_range *min_key,
+                            key_range *max_key)
+{
+  ha_rows start_pos,end_pos,res;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY key;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_records_in_range");
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(HA_POS_ERROR);
+
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(HA_POS_ERROR);
+  info->update&= (HA_STATE_CHANGED+HA_STATE_ROW_CHANGED);
+  keyinfo= share->keyinfo + inx;
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+
+  switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+  case HA_KEY_ALG_RTREE:
+  {
+    uchar *key_buff;
+
+    /*
+      The problem is that the optimizer doesn't support
+      RTree keys properly at the moment.
+      Hope this will be fixed some day.
+      But now NULL in the min_key means that we
+      didn't make the task for the RTree key
+      and expect BTree functionality from it.
+      As it's not able to handle such request
+      we return the error.
+    */
+    if (!min_key)
+    {
+      res= HA_POS_ERROR;
+      break;
+    }
+    key_buff= info->last_key.data + share->base.max_key_length;
+    _ma_pack_key(info, &key, inx, key_buff,
+                 min_key->key, min_key->keypart_map,
+                 (HA_KEYSEG**) 0);
+    res= maria_rtree_estimate(info, &key, maria_read_vec[min_key->flag]);
+    res= res ? res : 1;                       /* Don't return 0 */
+    break;
+  }
+#endif
+  case HA_KEY_ALG_BTREE:
+  default:
+    start_pos= (min_key ?
+                _ma_record_pos(info, min_key->key, min_key->keypart_map,
+                               min_key->flag) :
+                (ha_rows) 0);
+    end_pos=   (max_key ?
+                _ma_record_pos(info, max_key->key, max_key->keypart_map,
+                               max_key->flag) :
+                info->state->records + (ha_rows) 1);
+    res= (end_pos < start_pos ? (ha_rows) 0 :
+          (end_pos == start_pos ? (ha_rows) 1 : end_pos-start_pos));
+    if (start_pos == HA_POS_ERROR || end_pos == HA_POS_ERROR)
+      res=HA_POS_ERROR;
+  }
+
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+  fast_ma_writeinfo(info);
+
+  /**
+     @todo LOCK
+     If res==0 (no rows), if we need to guarantee repeatability of the search,
+     we will need to set a next-key lock in this statement.
+     Also SELECT COUNT(*)...
+  */
+
+  DBUG_PRINT("info",("records: %ld",(ulong) (res)));
+  DBUG_RETURN(res);
+}
+
+
+/* Find relative position (in records) for key in index-tree. */
+/* Returns HA_POS_ERROR if _ma_search_pos() reports an error.  */
+static ha_rows _ma_record_pos(MARIA_HA *info, const uchar *key_data,
+                              key_part_map keypart_map,
+			      enum ha_rkey_function search_flag)
+{
+  uint inx= (uint) info->lastinx;
+  uint32 nextflag;
+  uchar *key_buff;
+  double pos;
+  MARIA_KEY key;
+  DBUG_ENTER("_ma_record_pos");
+  DBUG_PRINT("enter",("search_flag: %d",search_flag));
+  DBUG_ASSERT(keypart_map);
+
+  key_buff= info->lastkey_buff+info->s->base.max_key_length;
+  _ma_pack_key(info, &key, inx, key_buff, key_data, keypart_map,
+		       (HA_KEYSEG**) 0);
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key););
+  nextflag=maria_read_vec[search_flag];
+
+  /*
+    my_handler.c:ha_compare_text() has a flag 'skip_end_space'.
+    This is set in my_handler.c:ha_key_cmp() in dependence on the
+    compare flags 'nextflag' and the column type.
+
+    TEXT columns are of type HA_KEYTYPE_VARTEXT. In this case the
+    condition is skip_end_space= ((nextflag & (SEARCH_FIND |
+    SEARCH_UPDATE)) == SEARCH_FIND).
+
+    SEARCH_FIND is used for an exact key search. The combination
+    SEARCH_FIND | SEARCH_UPDATE is used in write/update/delete
+    operations with a comment like "Not real duplicates", whatever this
+    means. From the condition above we can see that 'skip_end_space' is
+    always false for these operations. The result is that trailing space
+    counts in key comparison and hence, empty strings ('', string length
+    zero, but not NULL) compare less than strings starting with control
+    characters and these in turn compare less than strings starting with
+    blanks.
+
+    When estimating the number of records in a key range, we request an
+    exact search for the minimum key. This translates into a plain
+    SEARCH_FIND flag. Using this alone would lead to a 'skip_end_space'
+    compare. Empty strings would be expected above control characters.
+    Their keys would not be found because they are located below control
+    characters.
+
+    This is the reason that we add the SEARCH_UPDATE flag here. It makes
+    the key estimation compare in the same way like key write operations
+    do. Only so we will find the keys where they have been inserted.
+
+    Adding the flag unconditionally does not hurt as it is used in the
+    above mentioned condition only. So it can safely be used together
+    with other flags.
+  */
+  pos= _ma_search_pos(info, &key,
+                      nextflag | SEARCH_SAVE_BUFF | SEARCH_UPDATE,
+                      info->s->state.key_root[inx]);
+  if (pos >= 0.0)
+  {
+    DBUG_PRINT("exit",("pos: %lu",(ulong) (pos*info->state->records)));
+    /* Cast to the return type: ulong may be narrower than ha_rows */
+    DBUG_RETURN((ha_rows) (pos*info->state->records+0.5));
+  }
+  DBUG_RETURN(HA_POS_ERROR);
+}
+
+
+/**
+  Find relative position of a key in the index (sub)tree rooted at pos
+
+  @note
+   Modified version of _ma_search(); recurses into child pages
+
+  @return Relative position, 0.5 if pos is HA_OFFSET_ERROR
+  @retval 0.0 <= x <= 1.0  ok;  < 0  error (page read failed or wrong key)
+*/
+
+static double _ma_search_pos(MARIA_HA *info, MARIA_KEY *key,
+			     uint32 nextflag, my_off_t pos)
+{
+  int flag;
+  uint keynr, max_keynr;
+  my_bool after_key;
+  uchar *keypos;
+  double offset;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_search_pos");
+  LINT_INIT(max_keynr);
+
+  if (pos == HA_OFFSET_ERROR)
+    DBUG_RETURN(0.5);
+
+  if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS,
+                        info->buff, 1))
+    goto err;
+  flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos,
+                               info->lastkey_buff, &after_key);
+  keynr= _ma_keynr(&page, keypos, &max_keynr);
+
+  if (flag)
+  {
+    if (flag == MARIA_FOUND_WRONG_KEY)
+      DBUG_RETURN(-1);				/* error */
+    /*
+      Didn't find a match. keypos points at next (bigger) key
+      Try to find a smaller, better matching key.
+      Matches keynr + [0-1]
+    */
+    if (flag > 0 && ! page.node)
+      offset= 1.0;
+    else if ((offset= _ma_search_pos(info, key, nextflag,
+                                     _ma_kpos(page.node,keypos))) < 0)
+      DBUG_RETURN(offset);
+  }
+  else
+  {
+    /*
+      Found match. Keypos points at the start of the found key
+      Matches keynr+1
+    */
+    offset=1.0;					/* Matches keynr+1 */
+    if ((nextflag & SEARCH_FIND) && page.node &&
+	((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+         (nextflag & (SEARCH_PREFIX | SEARCH_NO_FIND | SEARCH_LAST |
+                      SEARCH_PART_KEY))))
+    {
+      /*
+        There may be identical keys in the tree. Try to match one of those.
+        Matches keynr + [0-1]
+      */
+      if ((offset= _ma_search_pos(info, key, SEARCH_FIND,
+                                  _ma_kpos(page.node,keypos))) < 0)
+	DBUG_RETURN(offset);			/* Read error */
+    }
+  }
+  DBUG_PRINT("info",("keynr: %d  offset: %g  max_keynr: %d  nod: %d  flag: %d",
+		     keynr,offset,max_keynr,page.node,flag));
+  DBUG_RETURN((keynr+offset)/(max_keynr+1));
+err:
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN (-1.0);
+}
+
+
+/* Get key number of the key at keypos and the number of keys on the page */
+/* (*ret_max_key, set on every return path). Also returns 0 on skip error. */
+static uint _ma_keynr(MARIA_PAGE *page, uchar *keypos, uint *ret_max_key)
+{
+  uint page_flag, nod_flag, keynr, max_key;
+  uchar t_buff[MARIA_MAX_KEY_BUFF], *pos, *end;
+  const MARIA_KEYDEF *keyinfo= page->keyinfo;
+  MARIA_KEY key;
+
+  page_flag= page->flag;
+  nod_flag=  page->node;
+  pos= page->buff + page->info->s->keypage_header + nod_flag;
+  end= page->buff + page->size;
+
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+      ! (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    *ret_max_key= (uint) (end - pos)/(keyinfo->keylength+nod_flag);
+    return (uint) (keypos - pos)/(keyinfo->keylength+nod_flag);
+  }
+
+  max_key=keynr=0;
+  t_buff[0]=0;					/* Safety */
+  key.data= t_buff;
+  key.keyinfo= (MARIA_KEYDEF*) keyinfo;
+  while (pos < end)
+  {
+    if (!(pos= (*keyinfo->skip_key)(&key, page_flag, nod_flag, pos)))
+    {
+      DBUG_ASSERT(0);
+      *ret_max_key= max_key;			/* Caller reads it even on error */
+      return 0;					/* Error */
+    }
+    max_key++;
+    if (pos == keypos)
+      keynr= max_key;
+  }
+  *ret_max_key=max_key;
+  return(keynr);
+}
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000000..7a7286e26f9
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,3755 @@
+/* Copyright (C) 2006, 2007 MySQL AB
+   Copyright (C) 2010 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3072 Maria recovery
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* Here is the implementation of this module */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include "ma_blockrec.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_recovery_util.h"
+
+struct st_trn_for_recovery /* used only in the REDO phase */
+{
+  LSN group_start_lsn, undo_lsn, first_undo_lsn;
+  TrID long_trid;
+};
+struct st_table_for_recovery /* used in the REDO and UNDO phase */
+{
+  MARIA_HA *info;
+};
+/* Variables used by all functions of this module. Ok as single-threaded */
+static struct st_trn_for_recovery *all_active_trans; /**< SHORT_TRID_MAX+1 */
+static struct st_table_for_recovery *all_tables; /**< SHARE_ID_MAX+1 entries */
+static struct st_dirty_page *dirty_pages_pool;
+static LSN current_group_end_lsn;
+#ifndef DBUG_OFF
+/** Current group of REDOs is about this table and only this one */
+static MARIA_HA *current_group_table;
+#endif
+static TrID max_long_trid= 0; /**< max long trid seen by REDO phase */
+static my_bool skip_DDLs; /**< if REDO phase should skip DDL records */
+/** @brief to avoid writing a checkpoint if recovery did nothing. */
+static my_bool checkpoint_useful;
+static my_bool in_redo_phase; /**< TRUE from start to end of the REDO phase */
+static my_bool trns_created;
+static ulong skipped_undo_phase;
+static ulonglong now; /**< for tracking execution time of phases */
+static int (*save_error_handler_hook)(uint, const char *,myf); /**< old hook */
+static uint recovery_warnings; /**< count of warnings */
+static uint recovery_found_crashed_tables; /**< != 0 => result is DOUBTFUL */
+/* Macros expanding to the signature of the exec hook for log record type R */
+#define prototype_redo_exec_hook(R)                                          \
+  static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec)
+
+#define prototype_redo_exec_hook_dummy(R)                                    \
+  static int exec_REDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec        \
+                               __attribute__ ((unused)))
+
+#define prototype_undo_exec_hook(R)                                          \
+  static int exec_UNDO_LOGREC_ ## R(const TRANSLOG_HEADER_BUFFER *rec, TRN *trn)
+
+prototype_redo_exec_hook(LONG_TRANSACTION_ID);
+prototype_redo_exec_hook_dummy(CHECKPOINT);
+prototype_redo_exec_hook(REDO_CREATE_TABLE);
+prototype_redo_exec_hook(REDO_RENAME_TABLE);
+prototype_redo_exec_hook(REDO_REPAIR_TABLE);
+prototype_redo_exec_hook(REDO_DROP_TABLE);
+prototype_redo_exec_hook(FILE_ID);
+prototype_redo_exec_hook(INCOMPLETE_LOG);
+prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP);
+prototype_redo_exec_hook(UNDO_BULK_INSERT);
+prototype_redo_exec_hook(IMPORTED_TABLE);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL);
+prototype_redo_exec_hook(REDO_FREE_BLOCKS);
+prototype_redo_exec_hook(REDO_DELETE_ALL);
+prototype_redo_exec_hook(REDO_INDEX);
+prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE);
+prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE);
+prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE);
+prototype_redo_exec_hook(UNDO_ROW_INSERT);
+prototype_redo_exec_hook(UNDO_ROW_DELETE);
+prototype_redo_exec_hook(UNDO_ROW_UPDATE);
+prototype_redo_exec_hook(UNDO_KEY_INSERT);
+prototype_redo_exec_hook(UNDO_KEY_DELETE);
+prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_redo_exec_hook(COMMIT);
+prototype_redo_exec_hook(CLR_END);
+prototype_redo_exec_hook(DEBUG_INFO);
+prototype_undo_exec_hook(UNDO_ROW_INSERT);
+prototype_undo_exec_hook(UNDO_ROW_DELETE);
+prototype_undo_exec_hook(UNDO_ROW_UPDATE);
+prototype_undo_exec_hook(UNDO_KEY_INSERT);
+prototype_undo_exec_hook(UNDO_KEY_DELETE);
+prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+prototype_undo_exec_hook(UNDO_BULK_INSERT);
+
+static int run_redo_phase(LSN lsn, LSN end_lsn,
+                          enum maria_apply_log_way apply);
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase);
+static int run_undo_phase(uint uncommitted);
+static void display_record_position(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec,
+                                    uint number);
+static int display_and_apply_record(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec);
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec);
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon);
+static LSN parse_checkpoint_record(LSN lsn);
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+                            LSN first_undo_lsn);
+static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id);
+static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+                    struct st_dirty_page *dirty_page);
+static int close_all_tables(void);
+static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr);
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr);
+static void delete_all_transactions();
+
+/** @brief global [out] buffer for translog_read_record(); only grows */
+static struct
+{
+  /* uchar* needs fewer casts than char*, thus we don't use LEX_STRING */
+  uchar *str;
+  size_t length;
+} log_record_buffer;
+/* Grow log_record_buffer so that it can hold rec->record_length bytes.
+   If the allocation fails the buffer is left empty (str == NULL and
+   length == 0) and the old memory is released. */
+static void enlarge_buffer(const TRANSLOG_HEADER_BUFFER *rec)
+{
+  if (log_record_buffer.length < rec->record_length)
+  {
+    /* MY_FREE_ON_ERROR: the old buffer must not leak if realloc fails */
+    log_record_buffer.str= my_realloc(log_record_buffer.str, rec->record_length,
+                             MYF(MY_WME | MY_ALLOW_ZERO_PTR | MY_FREE_ON_ERROR));
+    log_record_buffer.length= log_record_buffer.str ? rec->record_length : 0;
+  }
+}
+/** @brief Tells what kind of progress message was printed to the error log */
+static enum recovery_message_type
+{
+  REC_MSG_NONE= 0, REC_MSG_REDO, REC_MSG_UNDO, REC_MSG_FLUSH
+} recovery_message_printed;
+
+
+/* Error hook used during recovery: first end any pending progress line */
+int maria_recover_error_handler_hook(uint error, const char *str,
+                                     myf flags)
+{
+  /* Progress output left stderr in the middle of a line; finish it */
+  if (procent_printed != 0)
+  {
+    fputc('\n', stderr);
+    fflush(stderr);
+    procent_printed= 0;
+  }
+  return save_error_handler_hook(error, str, flags);
+}
+
+/* Define this if you want gdb to break in some interesting situations */
+#define ALERT_USER()
+/* Announce the start of recovery (informational message) */
+static void print_preamble(void)
+{
+  ma_message_no_user(ME_JUST_INFO, "starting recovery");
+}
+
+
+/**
+   @brief Runs crash recovery starting from the last checkpoint.
+
+   The REDO phase is run with dedicated structures; afterwards the runtime
+   environment is prepared (transactions recreated inside trnman, tables
+   opened with their two-byte-id mapping), a checkpoint is taken and the
+   UNDO phase is run. All tables are closed at the end.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+int maria_recovery_from_log(void)
+{
+  int status;
+  FILE *trace;
+  uint n_warnings;
+#ifdef EXTRA_DEBUG
+  char trace_path[FN_REFLEN];
+#endif
+  DBUG_ENTER("maria_recovery_from_log");
+
+  DBUG_ASSERT(!maria_in_recovery);
+  maria_in_recovery= TRUE;
+
+#ifdef EXTRA_DEBUG
+  fn_format(trace_path, "aria_recovery.trace", maria_data_root, "", MYF(0));
+  trace= my_fopen(trace_path, O_WRONLY|O_APPEND|O_CREAT, MYF(MY_WME));
+#else
+  trace= NULL; /* tracing is skipped so that recovery stays fast */
+#endif
+  tprint(trace, "TRACE of the last Aria recovery from mysqld\n");
+  DBUG_ASSERT(maria_pagecache->inited);
+  status= maria_apply_log(LSN_IMPOSSIBLE, LSN_IMPOSSIBLE, MARIA_LOG_APPLY,
+                          trace, TRUE, TRUE, TRUE, &n_warnings);
+  if (status == 0)
+  {
+    if (n_warnings != 0 || recovery_found_crashed_tables != 0)
+      tprint(trace, "DOUBTFUL (%u warnings, check previous output)\n",
+             n_warnings);
+    else
+      tprint(trace, "SUCCESS\n");
+  }
+  if (trace != NULL)
+    my_fclose(trace, MYF(0));
+  maria_in_recovery= FALSE;
+  DBUG_RETURN(status);
+}
+
+
+/**
+   @brief Displays and/or applies the log
+
+   @param  from_lsn        LSN from which log reading/applying should start;
+                           LSN_IMPOSSIBLE means "use last checkpoint"
+   @param  end_lsn         Apply until this. LSN_IMPOSSIBLE means until end.
+   @param  apply           how log records should be applied or not
+   @param  trace_file      trace file where progress/debug messages will go
+   @param  skip_DDLs_arg   Should DDL records (CREATE/RENAME/DROP/REPAIR)
+                           be skipped by the REDO phase or not
+   @param  take_checkpoints Should we take checkpoints or not.
+   @param[out] warnings_count Count of warnings will be put there
+
+   @todo This trace_file thing is primitive; soon we will make it similar to
+   ma_check_print_warning() etc, and a successful recovery does not need to
+   create a trace file. But for debugging now it is useful.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+int maria_apply_log(LSN from_lsn, LSN end_lsn,
+                    enum maria_apply_log_way apply,
+                    FILE *trace_file,
+                    my_bool should_run_undo_phase, my_bool skip_DDLs_arg,
+                    my_bool take_checkpoints, uint *warnings_count)
+{
+  int error= 0;
+  uint uncommitted_trans;
+  ulonglong old_now;
+  my_bool abort_message_printed= 0;
+  DBUG_ENTER("maria_apply_log");
+
+  DBUG_ASSERT(apply == MARIA_LOG_APPLY || !should_run_undo_phase);
+  DBUG_ASSERT(!maria_multi_threaded);
+  recovery_warnings= recovery_found_crashed_tables= 0;
+  maria_recovery_changed_data= 0;
+  /* checkpoints can happen only if TRNs have been built */
+  DBUG_ASSERT(should_run_undo_phase || !take_checkpoints);
+  DBUG_ASSERT(end_lsn == LSN_IMPOSSIBLE || should_run_undo_phase == 0);
+
+  /*
+    Reset all per-run state before the first possible jump to 'err'.
+    The error and cleanup code below reads tracef, trns_created and
+    recovery_message_printed; if they were only set after the allocation and
+    checkpoint-init checks, an early failure would act on whatever a previous
+    call had left in them (for example a trace file which is no longer open).
+  */
+  recovery_message_printed= REC_MSG_NONE;
+  checkpoint_useful= trns_created= FALSE;
+  tracef= trace_file;
+#ifdef INSTANT_FLUSH_OF_MESSAGES
+  /* enable this for instant flush of messages to trace file */
+  if (tracef)                                   /* trace file is optional */
+    setbuf(tracef, NULL);
+#endif
+  skip_DDLs= skip_DDLs_arg;
+  skipped_undo_phase= 0;
+
+  all_active_trans= (struct st_trn_for_recovery *)
+    my_malloc((SHORT_TRID_MAX + 1) * sizeof(struct st_trn_for_recovery),
+              MYF(MY_ZEROFILL));
+  all_tables= (struct st_table_for_recovery *)
+    my_malloc((SHARE_ID_MAX + 1) * sizeof(struct st_table_for_recovery),
+              MYF(MY_ZEROFILL));
+
+  /* Route error messages through the recovery hook; restored at 'end' */
+  save_error_handler_hook= error_handler_hook;
+  error_handler_hook= maria_recover_error_handler_hook;
+
+  if (!all_active_trans || !all_tables)
+    goto err;
+
+  if (take_checkpoints && ma_checkpoint_init(0))
+    goto err;
+
+  /*
+    No explicit starting point given: start from the last checkpoint if
+    there is one, otherwise from the very first LSN of the log.
+  */
+  if (from_lsn == LSN_IMPOSSIBLE)
+  {
+    if (last_checkpoint_lsn == LSN_IMPOSSIBLE)
+    {
+      from_lsn= translog_first_lsn_in_log();
+      if (unlikely(from_lsn == LSN_ERROR))
+        goto err;
+    }
+    else
+    {
+      from_lsn= parse_checkpoint_record(last_checkpoint_lsn);
+      if (from_lsn == LSN_ERROR)
+        goto err;
+    }
+  }
+
+  /* REDO phase */
+  now= my_getsystime();
+  in_redo_phase= TRUE;
+  trnman_init(max_trid_in_control_file);
+  if (run_redo_phase(from_lsn, end_lsn, apply))
+  {
+    ma_message_no_user(0, "Redo phase failed");
+    trnman_destroy();
+    goto err;
+  }
+  trnman_destroy();
+
+  if (end_lsn != LSN_IMPOSSIBLE)
+  {
+    /*
+      A caller-supplied end_lsn always ends the run here with an error
+      status; we go to err2 to skip the generic "FAILED" trace line.
+    */
+    abort_message_printed= 1;
+    if (!trace_file)
+      fputc('\n', stderr);
+    my_message(HA_ERR_INITIALIZATION,
+               "Maria recovery aborted as end_lsn/end of file was reached",
+               MYF(0));
+    goto err2;
+  }
+
+  if ((uncommitted_trans=
+       end_of_redo_phase(should_run_undo_phase)) == (uint)-1)
+  {
+    ma_message_no_user(0, "End of redo phase failed");
+    goto err;
+  }
+  in_redo_phase= FALSE;
+
+  old_now= now;
+  now= my_getsystime();
+  if (recovery_message_printed == REC_MSG_REDO)
+  {
+    double phase_took= (now - old_now)/10000000.0;
+    /*
+      Detailed progress info goes to stderr, because ma_message_no_user()
+      cannot put several messages on one line.
+    */
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  /**
+     REDO phase does not fill blocks' rec_lsn, so a checkpoint now would be
+     wrong: if a future recovery used it, the REDO phase would always
+     start from the checkpoint and never from before, wrongly skipping REDOs
+     (tested). Another problem is that the REDO phase uses
+     PAGECACHE_PLAIN_PAGE, while Checkpoint only collects PAGECACHE_LSN_PAGE.
+
+     @todo fix this. pagecache_write() now can have a rec_lsn argument. And we
+     could make a function which goes through pages at end of REDO phase and
+     changes their type.
+  */
+#ifdef FIX_AND_ENABLE_LATER
+  if (take_checkpoints && checkpoint_useful)
+  {
+    /*
+      We take a checkpoint as it can save future recovery work if we crash
+      during the UNDO phase. But we don't flush pages, as UNDOs will change
+      them again probably.
+      If we wanted to take checkpoints in the middle of the REDO phase, at a
+      moment when we haven't reached the end of log so don't have exact data
+      about transactions, we could write a special checkpoint: containing only
+      the list of dirty pages, otherwise to be treated as if it was at the
+      same LSN as the last checkpoint.
+    */
+    if (ma_checkpoint_execute(CHECKPOINT_INDIRECT, FALSE))
+      goto err;
+  }
+#endif
+
+  /* UNDO phase */
+  if (should_run_undo_phase)
+  {
+    if (run_undo_phase(uncommitted_trans))
+    {
+      ma_message_no_user(0, "Undo phase failed");
+      goto err;
+    }
+  }
+  else if (uncommitted_trans > 0)
+  {
+    eprint(tracef, "***WARNING: %u uncommitted transactions; some tables may"
+           " be left inconsistent!***", uncommitted_trans);
+    recovery_warnings++;
+  }
+
+  if (skipped_undo_phase)
+  {
+    /*
+      We could want to print a list of tables for which UNDOs were skipped,
+      but not one line per skipped UNDO.
+    */
+    eprint(tracef, "***WARNING: %lu UNDO records skipped in UNDO phase; some"
+           " tables may be left inconsistent!***", skipped_undo_phase);
+    recovery_warnings++;
+  }
+
+  old_now= now;
+  now= my_getsystime();
+  if (recovery_message_printed == REC_MSG_UNDO)
+  {
+    double phase_took= (now - old_now)/10000000.0;
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  /*
+    we don't use maria_panic() because it would maria_end(), and Recovery does
+    not want that (we want to keep some modules initialized for runtime).
+  */
+  if (close_all_tables())
+  {
+    ma_message_no_user(0, "closing of tables failed");
+    goto err;
+  }
+
+  old_now= now;
+  now= my_getsystime();
+  if (recovery_message_printed == REC_MSG_FLUSH)
+  {
+    double phase_took= (now - old_now)/10000000.0;
+    procent_printed= 1;
+    fprintf(stderr, " (%.1f seconds); ", phase_took);
+    fflush(stderr);
+  }
+
+  if (take_checkpoints && checkpoint_useful)
+  {
+    /* No dirty pages, all tables are closed, no active transactions, save: */
+    if (ma_checkpoint_execute(CHECKPOINT_FULL, FALSE))
+      goto err;
+  }
+
+  goto end;
+err:
+  tprint(tracef, "\nRecovery of tables with transaction logs FAILED\n");
+err2:
+  if (trns_created)
+    delete_all_transactions();
+  error= 1;
+  if (close_all_tables())
+  {
+    ma_message_no_user(0, "closing of tables failed");
+  }
+end:
+  /* Common cleanup for success and failure: release everything we built */
+  error_handler_hook= save_error_handler_hook;
+  hash_free(&all_dirty_pages);
+  bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+  my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+  dirty_pages_pool= NULL;
+  my_free(all_tables, MYF(MY_ALLOW_ZERO_PTR));
+  all_tables= NULL;
+  my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+  all_active_trans= NULL;
+  my_free(log_record_buffer.str, MYF(MY_ALLOW_ZERO_PTR));
+  log_record_buffer.str= NULL;
+  log_record_buffer.length= 0;
+  ma_checkpoint_end();
+  *warnings_count= recovery_warnings + recovery_found_crashed_tables;
+  if (recovery_message_printed != REC_MSG_NONE)
+  {
+    if (procent_printed)
+    {
+      /* terminate the progress line which was left open on stderr */
+      procent_printed= 0;
+      fprintf(stderr, "\n");
+      fflush(stderr);
+    }
+    if (!error)
+    {
+      ma_message_no_user(ME_JUST_INFO, "recovery done");
+      maria_recovery_changed_data= 1;
+    }
+  }
+  else if (!error && max_trid_in_control_file != max_long_trid)
+  {
+    /*
+      maria_end() will set max trid in log file so that one can run
+      maria_chk on the tables
+    */
+    maria_recovery_changed_data= 1;
+  }
+
+  if (error && !abort_message_printed)
+  {
+    if (!trace_file)
+      fputc('\n', stderr);
+    my_message(HA_ERR_INITIALIZATION,
+               "Aria recovery failed. Please run aria_chk -r on all Aria "
+               "tables and delete all aria_log.######## files", MYF(0));
+  }
+  procent_printed= 0;
+  /*
+    We don't cleanly close tables if we hit some error (may corrupt them by
+    flushing some wrong blocks made from wrong REDOs). It also leaves their
+    open_count>0, which ensures that --aria-recover, if used, will try to
+    repair them.
+  */
+  DBUG_RETURN(error);
+}
+
+
+/*
+  Print very basic information about a record's header to the trace output.
+
+  'number' is the record's sequence number; 0 means we are going over
+  records which we had already seen and which form a group, so the line is
+  indented to sit below the group's end record.
+*/
+static void display_record_position(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec,
+                                    uint number)
+{
+  const char *indent= "";
+
+  if (number == 0)
+    indent= "   ";
+  tprint(tracef,
+         "%sRec#%u LSN (%lu,0x%lx) short_trid %u %s(num_type:%u) len %lu\n",
+         indent, number, LSN_IN_PARTS(rec->lsn), rec->short_trid,
+         log_desc->name, rec->type, (ulong)rec->record_length);
+
+  if (rec->type != LOGREC_DEBUG_INFO)
+    return;
+  /* Debug-info records: run their hook now to print some extra information */
+  (*log_desc->record_execute_in_redo_phase)(rec);
+}
+
+
+/*
+  Run the REDO-phase hook of one log record.
+
+  Returns 0 on success (or when there is nothing to do), 1 if the record
+  type has no hook, otherwise whatever non-zero value the hook returned.
+*/
+static int display_and_apply_record(const LOG_DESC *log_desc,
+                                    const TRANSLOG_HEADER_BUFFER *rec)
+{
+  int hook_result;
+
+  if (!log_desc->record_execute_in_redo_phase)
+  {
+    /* die on all not-yet-handled records :) */
+    DBUG_ASSERT("one more hook to write" == 0);
+    return 1;
+  }
+
+  /* Query already printed by display_record_position() */
+  if (rec->type == LOGREC_DEBUG_INFO)
+    return 0;
+
+  hook_result= (*log_desc->record_execute_in_redo_phase)(rec);
+  if (hook_result != 0)
+  {
+    eprint(tracef, "Got error %d when executing record %s",
+           my_errno, log_desc->name);
+  }
+  return hook_result;
+}
+
+
+/*
+  REDO hook for LONG_TRANSACTION_ID: registers the transaction named by the
+  record under the record's short id. Returns 1 if the short id is still
+  held by an older, unfinished transaction, 0 otherwise.
+*/
+prototype_redo_exec_hook(LONG_TRANSACTION_ID)
+{
+  const uint16 short_id= rec->short_trid;
+  struct st_trn_for_recovery *slot= &all_active_trans[short_id];
+  const TrID previous_long_id= slot->long_trid;
+
+  /*
+    Any incomplete group should be of an old crash which already had a
+    recovery and thus has logged INCOMPLETE_GROUP which we must have seen.
+  */
+  DBUG_ASSERT(slot->group_start_lsn == LSN_IMPOSSIBLE);
+
+  if (previous_long_id != 0)
+  {
+    const LSN previous_undo_lsn= slot->undo_lsn;
+    /*
+      If the first record of that transaction is after 'rec', it's probably
+      because that transaction was found in the checkpoint record, and then
+      it's ok, we can forget about that transaction (we'll meet it later
+      again in the REDO phase) and replace it with the one in 'rec'.
+      Only an undo LSN located before 'rec' is a real conflict.
+    */
+    if (previous_undo_lsn != LSN_IMPOSSIBLE &&
+        cmp_translog_addr(previous_undo_lsn, rec->lsn) < 0)
+    {
+      char llbuf[22];
+      llstr(previous_long_id, llbuf);
+      eprint(tracef, "Found an old transaction long_trid %s short_trid %u"
+             " with same short id as this new transaction, and has neither"
+             " committed nor rollback (undo_lsn: (%lu,0x%lx))",
+             llbuf, short_id, LSN_IN_PARTS(previous_undo_lsn));
+      ALERT_USER();
+      return 1;
+    }
+  }
+
+  /* The long transaction id is read from the record's header (6 bytes) */
+  new_transaction(short_id, uint6korr(rec->header),
+                  LSN_IMPOSSIBLE, LSN_IMPOSSIBLE);
+  return 0;
+}
+
+
+/*
+  Fill the recovery slot of short id 'sid' with a transaction's long id and
+  undo LSNs, trace the event, and raise max_long_trid if 'long_id' exceeds it.
+*/
+static void new_transaction(uint16 sid, TrID long_id, LSN undo_lsn,
+                            LSN first_undo_lsn)
+{
+  struct st_trn_for_recovery *trn= all_active_trans + sid;
+  char long_id_text[22];
+
+  llstr(long_id, long_id_text);
+  tprint(tracef, "Transaction long_trid %s short_trid %u starts,"
+         " undo_lsn (%lu,0x%lx) first_undo_lsn (%lu,0x%lx)\n",
+         long_id_text, sid, LSN_IN_PARTS(undo_lsn),
+         LSN_IN_PARTS(first_undo_lsn));
+
+  trn->long_trid= long_id;
+  trn->undo_lsn= undo_lsn;
+  trn->first_undo_lsn= first_undo_lsn;
+  set_if_bigger(max_long_trid, long_id);
+}
+
+
+prototype_redo_exec_hook_dummy(CHECKPOINT)
+{
+  /*
+    REDO-phase hook for CHECKPOINT records: deliberately does nothing.
+    The only checkpoint we care about was found via the control file, so a
+    checkpoint record met while reading the log is ignored and 0 is returned.
+  */
+  return 0;
+}
+
+
+prototype_redo_exec_hook_dummy(INCOMPLETE_GROUP)
+{
+  /*
+    REDO-phase hook for INCOMPLETE_GROUP records: deliberately does nothing
+    and returns 0, because the abort of the group was already made.
+    NOTE(review): presumably made by the earlier recovery which logged this
+    record - confirm.
+  */
+  return 0;
+}
+
+
+/*
+  REDO hook for INCOMPLETE_LOG: warns that the log does not contain all
+  changes made to a table, and marks that table as crashed. Always returns 0.
+*/
+prototype_redo_exec_hook(INCOMPLETE_LOG)
+{
+  MARIA_HA *table;
+
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    return 0;
+  }
+
+  table= get_MARIA_HA_from_REDO_record(rec);
+  /*
+    Nothing to warn about in any of these cases:
+    - no such table;
+    - the table is already marked as crashed;
+    - the table was repaired at a time after this log entry: we can assume
+      that all rows were inserted successfully, so we don't have to warn
+      that the inserted data was not logged.
+  */
+  if (table == NULL ||
+      maria_is_crashed(table) ||
+      table->s->state.is_of_horizon > rec->lsn)
+    return 0;
+
+  /*
+    Example of what can go wrong when replaying DDLs:
+    CREATE TABLE t (logged); INSERT INTO t VALUES(1) (logged);
+    ALTER TABLE t ... which does
+    CREATE a temporary table #sql... (logged)
+    INSERT data from t into #sql... (not logged)
+    RENAME #sql TO t (logged)
+    Removing tables by hand and replaying the log will leave in the
+    end an empty table "t": missing records. If after the RENAME an INSERT
+    into t was done, that row had number 1 in its page, executing the
+    REDO_INSERT_ROW_HEAD on the recreated empty t will fail (assertion
+    failure in _ma_apply_redo_insert_row_head_or_tail(): new data page is
+    created whereas rownr is not 0).
+    So when the server disables logging for ALTER TABLE or CREATE SELECT, it
+    logs LOGREC_INCOMPLETE_LOG to warn aria_read_log and then the user.
+
+    Another issue is that replaying of DDLs is not correct enough to work if
+    there was a crash during a DDL (see comment in execution of
+    REDO_RENAME_TABLE ).
+  */
+  eprint(tracef, "***WARNING: Aria engine currently logs no records "
+          "about insertion of data by ALTER TABLE and CREATE SELECT, "
+          "as they are not necessary for recovery; "
+          "present applying of log records to table '%s' may well not work."
+          "***", table->s->index_file_name.str);
+
+  /* Prevent using the table for anything else than undo repair */
+  _ma_mark_file_crashed(table->s);
+  recovery_warnings++;
+  return 0;
+}
+
+
+/*
+  Create the directory part of 'name' if it does not exist yet.
+
+  Returns 0 if there was nothing to do or the directory was created,
+  1 if creating the directory failed.
+*/
+static my_bool create_database_if_not_exists(const char *name)
+{
+  char db_dir[FN_REFLEN];
+  size_t db_dir_length;
+  MY_STAT db_dir_stat;
+  DBUG_ENTER("create_database_if_not_exists");
+
+  dirname_part(db_dir, name, &db_dir_length);
+  /*
+    Nothing to create when:
+    - the file has no directory part;
+    - the directory is a hard path. Safety: don't create files with hard
+      path; should never happen with MariaDB. If hard path, then error will
+      be detected when trying to create index file;
+    - the directory already exists.
+  */
+  if (db_dir_length == 0 ||
+      test_if_hard_path(db_dir) ||
+      my_stat(db_dir, &db_dir_stat, MYF(0)))
+    DBUG_RETURN(0);
+
+  tprint(tracef, "Creating not existing database '%s'\n", db_dir);
+  if (!my_mkdir(db_dir, 0777, MYF(MY_WME)))
+    DBUG_RETURN(0);
+
+  eprint(tracef, "***WARNING: Can't create not existing database '%s'",
+         db_dir);
+  DBUG_RETURN(1);
+}
+
+    
+
+
+
+/*
+  REDO hook for REDO_CREATE_TABLE: re-creates the index file (and, unless
+  the record says to leave data alone, an empty data file) of a table.
+
+  Record layout as parsed below: table name (NUL-terminated), one byte
+  "only touch index file" flag, 2 bytes size of the index file header,
+  2 bytes 'keystart' (the length the index file is sized to), the index
+  file header itself, then the DATA DIRECTORY and INDEX DIRECTORY names
+  (both NUL-terminated).
+
+  Returns 0 if the table was created or the record could be ignored,
+  non-zero on error.
+*/
+prototype_redo_exec_hook(REDO_CREATE_TABLE)
+{
+  File dfile= -1, kfile= -1;
+  char *linkname_ptr, filename[FN_REFLEN], *name, *ptr, *ptr2,
+    *data_file_name, *index_file_name;
+  uchar *kfile_header;
+  myf create_flag;
+  uint flags;
+  int error= 1, create_mode= O_RDWR | O_TRUNC, i;
+  MARIA_HA *info= NULL;
+  uint kfile_size_before_extension, keystart;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_CREATE_TABLE");
+
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  name= (char *)log_record_buffer.str;
+  /*
+    TRUNCATE TABLE and REPAIR USE_FRM call maria_create(), so below we can
+    find a REDO_CREATE_TABLE for a table which we have open, that's why we
+    need to look for any open instances and close them first.
+  */
+  if (close_one_table(name, rec->lsn))
+  {
+    eprint(tracef, "Table '%s' got error %d on close", name, my_errno);
+    ALERT_USER();
+    goto end;
+  }
+  /* we try hard to get create_rename_lsn, to avoid mistakes if possible */
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    /* check that we're not already using it */
+    if (share->reopen != 1)
+    {
+      eprint(tracef, "Table '%s is already open (reopen=%u)",
+             name, share->reopen);
+      ALERT_USER();
+      goto end;
+    }
+    DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+    if (!share->base.born_transactional)
+    {
+      /*
+        could be that transactional table was later dropped, and a non-trans
+        one was renamed to its name, thus create_rename_lsn is 0 and should
+        not be trusted.
+      */
+      tprint(tracef, "Table '%s' is not transactional, ignoring creation\n",
+             name);
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, "Table '%s' has create_rename_lsn (%lu,0x%lx) more "
+             "recent than record, ignoring creation",
+             name, LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      eprint(tracef, "Table '%s' is crashed, can't recreate it", name);
+      ALERT_USER();
+      goto end;
+    }
+    maria_close(info);
+    info= NULL;
+  }
+  else
+  {
+    /* one or two files absent, or header corrupted... */
+    tprint(tracef, "Table '%s' can't be opened (Error: %d)\n",
+           name, my_errno);
+  }
+  /* if does not exist, or is older, overwrite it */
+  ptr= name + strlen(name) + 1;
+  if ((flags= ptr[0] ? HA_DONT_TOUCH_DATA : 0))
+    tprint(tracef, ", we will only touch index file");
+  ptr++;
+  kfile_size_before_extension= uint2korr(ptr);
+  ptr+= 2;
+  keystart= uint2korr(ptr);
+  ptr+= 2;
+  kfile_header= (uchar *)ptr;
+  ptr+= kfile_size_before_extension;
+  /*
+    set header lsns: three consecutive LSN slots, starting at the
+    create_rename_lsn offset of the stored header, all get this record's LSN
+  */
+  ptr2= (char *) kfile_header + sizeof(info->s->state.header) +
+    MARIA_FILE_CREATE_RENAME_LSN_OFFSET;
+  for (i= 0; i<3; i++)
+  {
+    lsn_store(ptr2, rec->lsn);
+    ptr2+= LSN_STORE_SIZE;
+  }
+  data_file_name= ptr;
+  ptr+= strlen(data_file_name) + 1;
+  index_file_name= ptr;
+  ptr+= strlen(index_file_name) + 1;
+  /** @todo handle symlinks */
+  if (data_file_name[0] || index_file_name[0])
+  {
+    eprint(tracef, "Table '%s' DATA|INDEX DIRECTORY clauses are not handled",
+           name);
+    goto end;
+  }
+  if (create_database_if_not_exists(name))
+    goto end;
+  /*
+    The conditional must be parenthesized: '|' binds tighter than '?:', so
+    without the parentheses the whole "MY_UNPACK_FILENAME | (flags & ...)"
+    becomes the condition, MY_UNPACK_FILENAME is never passed and
+    MY_RETURN_REAL_PATH is always passed.
+  */
+  fn_format(filename, name, "", MARIA_NAME_IEXT,
+            MY_UNPACK_FILENAME |
+            ((flags & HA_DONT_TOUCH_DATA) ? MY_RETURN_REAL_PATH : 0) |
+            MY_APPEND_EXT);
+  linkname_ptr= NULL;
+  create_flag= MY_DELETE_OLD;
+  tprint(tracef, "Table '%s' creating as '%s'\n", name, filename);
+  if ((kfile= my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+                                     MYF(MY_WME|create_flag))) < 0)
+  {
+    eprint(tracef, "Failed to create index file");
+    goto end;
+  }
+  /* Write the stored header, then size the index file to 'keystart' bytes */
+  if (my_pwrite(kfile, kfile_header,
+                kfile_size_before_extension, 0, MYF(MY_NABP|MY_WME)) ||
+      my_chsize(kfile, keystart, 0, MYF(MY_WME)))
+  {
+    eprint(tracef, "Failed to write to index file");
+    goto end;
+  }
+  if (!(flags & HA_DONT_TOUCH_DATA))
+  {
+    fn_format(filename,name,"", MARIA_NAME_DEXT,
+              MY_UNPACK_FILENAME | MY_APPEND_EXT);
+    linkname_ptr= NULL;
+    create_flag=MY_DELETE_OLD;
+    if (((dfile=
+          my_create_with_symlink(linkname_ptr, filename, 0, create_mode,
+                                 MYF(MY_WME | create_flag))) < 0) ||
+        my_close(dfile, MYF(MY_WME)))
+    {
+      eprint(tracef, "Failed to create data file");
+      goto end;
+    }
+    /*
+      we now have an empty data file. To be able to
+      _ma_initialize_data_file() we need some pieces of the share to be
+      correctly filled. So we just open the table (fortunately, an empty
+      data file does not preclude this).
+    */
+    if (((info= maria_open(name, O_RDONLY, 0)) == NULL) ||
+        _ma_initialize_data_file(info->s, info->dfile.file))
+    {
+      eprint(tracef, "Failed to open new table or write to data file");
+      goto end;
+    }
+  }
+  error= 0;
+end:
+  /* Release whatever is still open; a failing close turns into an error */
+  if (kfile >= 0)
+    error|= my_close(kfile, MYF(MY_WME));
+  if (info != NULL)
+    error|= maria_close(info);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  REDO hook for REDO_RENAME_TABLE: renames table 'old_name' to 'new_name'
+  (both NUL-terminated names read from the record), or only drops
+  'old_name' when the 'new_name' table must not be overwritten.
+
+  Returns 0 if the rename/drop was done or the record could be ignored,
+  non-zero on error.
+*/
+prototype_redo_exec_hook(REDO_RENAME_TABLE)
+{
+  char *old_name, *new_name;
+  int error= 1, close_error;
+  MARIA_HA *info= NULL;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_RENAME_TABLE");
+
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    goto end;
+  }
+  old_name= (char *)log_record_buffer.str;
+  new_name= old_name + strlen(old_name) + 1;
+  tprint(tracef, "Table '%s' to rename to '%s'; old-name table ", old_name,
+         new_name);
+  /*
+    Here is why we skip CREATE/DROP/RENAME when doing a recovery from
+    ha_maria (whereas we do when called from aria_read_log). Consider:
+    CREATE TABLE t;
+    RENAME TABLE t to u;
+    DROP TABLE u;
+    RENAME TABLE v to u; # crash between index rename and data rename.
+    And do a Recovery (not removing tables beforehand).
+    Recovery replays CREATE, then RENAME: the maria_open("t") works,
+    maria_open("u") does not (no data file) so table "u" is considered
+    inexistent and so maria_rename() is done which overwrites u's index file,
+    which is lost. Ok, the data file (v.MAD) is still available, but only a
+    REPAIR USE_FRM can rebuild the index, which is unsafe and downtime.
+    So it is preferable to not execute RENAME, and leave the "mess" of files,
+    rather than possibly destroy a file. DBA will manually rename files.
+    A safe recovery method would probably require checking the existence of
+    the index file and of the data file separately (not via maria_open()), and
+    maybe also to store a create_rename_lsn in the data file too
+    For now, all we risk is to leave the mess (half-renamed files) left by the
+    crash. We however sync files and directories at each file rename. The SQL
+    layer is anyway not crash-safe for DDLs (except the repartitioning-related
+    ones).
+    We replay DDLs in aria_read_log to be able to recreate tables from
+    scratch. It means that "aria_read_log -a" should not be used on a
+    database which just crashed during a DDL. And also ALTER TABLE does not
+    log insertions of records into the temporary table, so replaying may
+    fail (grep for INCOMPLETE_LOG in files).
+  */
+  info= maria_open(old_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring renaming",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't rename it");
+      ALERT_USER();
+      goto end;
+    }
+    if (close_one_table(info->s->open_file_name.str, rec->lsn))
+      goto end;
+    /*
+      Forget the handle as soon as a close was attempted, so that 'end'
+      never closes it a second time if this close reports an error.
+      NOTE(review): assumes maria_close() releases the handle even when it
+      fails - confirm; if it does not, the only cost is a leak on this
+      error path.
+    */
+    close_error= maria_close(info);
+    info= NULL;
+    if (close_error)
+      goto end;
+    tprint(tracef, ", is ok for renaming; new-name table ");
+  }
+  else /* one or two files absent, or header corrupted... */
+  {
+    tprint(tracef, ", can't be opened, probably does not exist");
+    error= 0;
+    goto end;
+  }
+  /*
+    We must also check the create_rename_lsn of the 'new_name' table if it
+    exists: otherwise we may, with our rename which overwrites, destroy
+    another table. For example:
+    CREATE TABLE t;
+    RENAME t to u;
+    DROP TABLE u;
+    RENAME v to u; # v is an old table, its creation/insertions not in log
+    And start executing the log (without removing tables beforehand): creates
+    t, renames it to u (if not testing create_rename_lsn) thus overwriting
+    old-named v, drops u, and we are stuck, we have lost data.
+  */
+  info= maria_open(new_name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    /* We should not have open instances on this table. */
+    if (share->reopen != 1)
+    {
+      tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+      ALERT_USER();
+      goto end;
+    }
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring renaming\n");
+      ALERT_USER();
+      goto drop;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring renaming",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      /*
+        We have to drop the old_name table. Consider:
+        CREATE TABLE t;
+        CREATE TABLE v;
+        RENAME TABLE t to u;
+        DROP TABLE u;
+        RENAME TABLE v to u;
+        and apply the log without removing tables beforehand. t will be
+        created, v too; in REDO_RENAME u will be more recent, but we still
+        have to drop t otherwise it stays.
+      */
+      goto drop;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't rename it");
+      ALERT_USER();
+      goto end;
+    }
+    /* as above: never leave a handle behind after a close attempt */
+    close_error= maria_close(info);
+    info= NULL;
+    if (close_error)
+      goto end;
+    /* abnormal situation */
+    tprint(tracef, ", exists but is older than record, can't rename it");
+    goto end;
+  }
+  else /* one or two files absent, or header corrupted... */
+    tprint(tracef, ", can't be opened, probably does not exist");
+  tprint(tracef, ", renaming '%s'", old_name);
+  if (maria_rename(old_name, new_name))
+  {
+    eprint(tracef, "Failed to rename table");
+    goto end;
+  }
+  /* Stamp the renamed table with this record's LSN */
+  info= maria_open(new_name, O_RDONLY, 0);
+  if (info == NULL)
+  {
+    eprint(tracef, "Failed to open renamed table");
+    goto end;
+  }
+  if (_ma_update_state_lsns(info->s, rec->lsn, info->s->state.create_trid,
+                            TRUE, TRUE))
+    goto end;
+  /* as above: never leave a handle behind after a close attempt */
+  close_error= maria_close(info);
+  info= NULL;
+  if (close_error)
+    goto end;
+  error= 0;
+  goto end;
+drop:
+  /* 'info' (the new_name table) is still open here; it is closed at 'end' */
+  tprint(tracef, ", only dropping '%s'", old_name);
+  if (maria_delete_table(old_name))
+  {
+    eprint(tracef, "Failed to drop table");
+    goto end;
+  }
+  error= 0;
+end:
+  tprint(tracef, "\n");
+  if (info != NULL)
+    error|= maria_close(info);
+  DBUG_RETURN(error);
+}
+
+
+/*
+  REDO hook for REDO_REPAIR_TABLE: runs the repair again on the table the
+  record refers to. The record may come from REPAIR, ALTER TABLE ENABLE
+  KEYS, OPTIMIZE.
+
+  Returns 0 if the repair was done or skipped, 1 on error.
+*/
+prototype_redo_exec_hook(REDO_REPAIR_TABLE)
+{
+  MARIA_HA *info;
+  HA_CHECK param;
+  char *name;
+  my_bool quick_repair;
+  int repair_failed;
+  DBUG_ENTER("exec_REDO_LOGREC_REDO_REPAIR_TABLE");
+
+  if (skip_DDLs)
+  {
+    /*
+      REPAIR is not exactly a DDL, but it manipulates files without logging
+      insertions into them.
+    */
+    tprint(tracef, "we skip DDLs\n");
+    DBUG_RETURN(0);
+  }
+
+  info= get_MARIA_HA_from_REDO_record(rec);
+  if (info == NULL)
+    DBUG_RETURN(0);
+  if (maria_is_crashed(info))
+  {
+    tprint(tracef, "we skip repairing crashed table\n");
+    DBUG_RETURN(0);
+  }
+  /*
+    Otherwise, the mapping is newer than the table, and our record is newer
+    than the mapping, so we can repair.
+  */
+  tprint(tracef, "   repairing...\n");
+
+  /* The repair flags and the key map both come from the record's header */
+  maria_chk_init(&param);
+  name= info->s->open_file_name.str;
+  param.isam_file_name= name;
+  param.testflag= uint8korr(rec->header + FILEID_STORE_SIZE);
+  param.tmpdir= maria_tmpdir;
+  param.max_trid= max_long_trid;
+  DBUG_ASSERT(maria_tmpdir);
+
+  info->s->state.key_map= uint8korr(rec->header + FILEID_STORE_SIZE + 8);
+  quick_repair= test(param.testflag & T_QUICK);
+
+  /* Use the repair method which the stored flags ask for */
+  if (param.testflag & T_REP_PARALLEL)
+    repair_failed= maria_repair_parallel(&param, info, name, quick_repair);
+  else if (param.testflag & T_REP_BY_SORT)
+    repair_failed= maria_repair_by_sort(&param, info, name, quick_repair);
+  else
+    repair_failed= maria_repair(&param, info, name, quick_repair);
+  if (repair_failed)
+    DBUG_RETURN(1);
+
+  if (_ma_update_state_lsns(info->s, rec->lsn, trnman_get_min_safe_trid(),
+                            TRUE, !(param.testflag & T_NO_CREATE_RENAME_LSN)))
+    DBUG_RETURN(1);
+
+  DBUG_RETURN(0);
+}
+
+
+/*
+  REDO hook for REDO_DROP_TABLE: drops the table whose NUL-terminated name
+  is stored in the record, unless the table on disk is not transactional,
+  is more recent than the record, or is crashed.
+
+  Returns 0 if the table was dropped or the record could be ignored,
+  non-zero on error.
+*/
+prototype_redo_exec_hook(REDO_DROP_TABLE)
+{
+  char *name;
+  int error= 1, close_error;
+  MARIA_HA *info;
+  if (skip_DDLs)
+  {
+    tprint(tracef, "we skip DDLs\n");
+    return 0;
+  }
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+  name= (char *)log_record_buffer.str;
+  tprint(tracef, "Table '%s'", name);
+  info= maria_open(name, O_RDONLY, HA_OPEN_FOR_REPAIR);
+  if (info)
+  {
+    MARIA_SHARE *share= info->s;
+    if (!share->base.born_transactional)
+    {
+      tprint(tracef, ", is not transactional, ignoring removal\n");
+      ALERT_USER();
+      error= 0;
+      goto end;
+    }
+    if (cmp_translog_addr(share->state.create_rename_lsn, rec->lsn) >= 0)
+    {
+      tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+             " record, ignoring removal",
+             LSN_IN_PARTS(share->state.create_rename_lsn));
+      error= 0;
+      goto end;
+    }
+    if (maria_is_crashed(info))
+    {
+      tprint(tracef, ", is crashed, can't drop it");
+      ALERT_USER();
+      goto end;
+    }
+    if (close_one_table(info->s->open_file_name.str, rec->lsn))
+      goto end;
+    /*
+      Forget the handle as soon as a close was attempted, so that 'end'
+      never closes it a second time if this close reports an error.
+      NOTE(review): assumes maria_close() releases the handle even when it
+      fails - confirm; if it does not, the only cost is a leak on this
+      error path.
+    */
+    close_error= maria_close(info);
+    info= NULL;
+    if (close_error)
+      goto end;
+    /* if it is older, or its header is corrupted, drop it */
+    tprint(tracef, ", dropping '%s'", name);
+    if (maria_delete_table(name))
+    {
+      eprint(tracef, "Failed to drop table");
+      goto end;
+    }
+  }
+  else /* one or two files absent, or header corrupted... */
+    tprint(tracef,", can't be opened, probably does not exist");
+  error= 0;
+end:
+  tprint(tracef, "\n");
+  if (info != NULL)
+    error|= maria_close(info);
+  return error;
+}
+
+
+/*
+  REDO hook for FILE_ID: (re)binds the short table id stored in the record
+  to the table whose name follows it, closing any table which was bound to
+  that id before. Returns 0 on success, 1 on error.
+*/
+prototype_redo_exec_hook(FILE_ID)
+{
+  uint16 sid;
+  const char *name;
+  MARIA_HA *previous;
+  DBUG_ENTER("exec_REDO_LOGREC_FILE_ID");
+
+  if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0)
+  {
+    /*
+      If that mapping was still true at checkpoint time, it was found in
+      checkpoint record, no need to recreate it. If that mapping had ended at
+      checkpoint time (table was closed or repaired), a flush and force
+      happened and so mapping is not needed.
+    */
+    tprint(tracef, "ignoring because before checkpoint\n");
+    DBUG_RETURN(0);
+  }
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    DBUG_RETURN(1);
+  }
+
+  /* The record starts with the short id, the table name comes right after */
+  sid= fileid_korr(log_record_buffer.str);
+  previous= all_tables[sid].info;
+  if (previous != NULL)
+  {
+    /* The id is being reused: close the table it was mapped to so far */
+    tprint(tracef, "   Closing table '%s'\n", previous->s->open_file_name.str);
+    prepare_table_for_close(previous, rec->lsn);
+    if (maria_close(previous))
+    {
+      eprint(tracef, "Failed to close table");
+      DBUG_RETURN(1);
+    }
+    all_tables[sid].info= NULL;
+  }
+
+  name= (char *)log_record_buffer.str + FILEID_STORE_SIZE;
+  if (new_table(sid, name, rec->lsn))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Open the table named by a FILE_ID log record and remember it in
+  all_tables[sid] so that later records can find it through that id.
+
+  @param sid             index into all_tables[] for this table
+  @param name            table name taken from the log record; NULL or an
+                         empty string is treated as a corrupted record
+  @param lsn_of_file_id  LSN of the record mapping sid to name; it is
+                         stored in the share as lsn_of_file_id
+
+  @return 0 when the table was opened or intentionally skipped, 1 on error
+*/
+static int new_table(uint16 sid, const char *name, LSN lsn_of_file_id)
+{
+  /*
+    -1 (skip table): close table and return 0;
+    1 (error): close table and return 1;
+    0 (success): leave table open and return 0.
+  */
+  int error= 1;
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+  my_off_t dfile_len, kfile_len;
+  DBUG_ENTER("new_table");
+
+  checkpoint_useful= TRUE;
+  if ((name == NULL) || (name[0] == 0))
+  {
+    /*
+      we didn't use DBUG_ASSERT() because such record corruption could
+      silently pass in the "info == NULL" test below.
+    */
+    tprint(tracef, ", record is corrupted");
+    info= NULL;
+    recovery_warnings++;
+    goto end;
+  }
+  tprint(tracef, "Table '%s', id %u", name, sid);
+  info= maria_open(name, O_RDWR, HA_OPEN_FOR_REPAIR);
+  if (info == NULL)
+  {
+    tprint(tracef, ", is absent (must have been dropped later?)"
+           " or its header is so corrupted that we cannot open it;"
+           " we skip it");
+    /* A missing file (ENOENT) is not counted as a crashed table */
+    if (my_errno != ENOENT)
+      recovery_found_crashed_tables++;
+    error= 0;
+    goto end;
+  }
+  share= info->s;
+  /* check that we're not already using it */
+  if (share->reopen != 1)
+  {
+    tprint(tracef, ", is already open (reopen=%u)\n", share->reopen);
+    /*
+      It could be that we have in the log
+      FILE_ID(t1,10) ... (t1 was flushed) ... FILE_ID(t1,12);
+    */
+    if (close_one_table(share->open_file_name.str, lsn_of_file_id))
+      goto end;
+    /*
+      We should not try to get length of data/index files as the files
+      are not on disk yet.
+    */
+    _ma_tmp_disable_logging_for_table(info, FALSE);
+    goto set_lsn_of_file_id;
+  }
+  if (!share->base.born_transactional)
+  {
+    /*
+      This can happen if one converts a transactional table to a
+      not transactional table
+    */
+    tprint(tracef, ", is not transactional.  Ignoring open request");
+    error= -1;
+    recovery_warnings++;
+    goto end;
+  }
+  if (cmp_translog_addr(lsn_of_file_id, share->state.create_rename_lsn) <= 0)
+  {
+    tprint(tracef, ", has create_rename_lsn (%lu,0x%lx) more recent than"
+           " LOGREC_FILE_ID's LSN (%lu,0x%lx), ignoring open request",
+           LSN_IN_PARTS(share->state.create_rename_lsn),
+           LSN_IN_PARTS(lsn_of_file_id));
+    recovery_warnings++;
+    error= -1;
+    goto end;
+    /*
+      Note that we tested that before testing corruption; a recent corrupted
+      table is not a blocker for the present log record.
+    */
+  }
+  if (maria_is_crashed(info))
+  {
+    eprint(tracef, "Table '%s' is crashed, skipping it. Please repair it with"
+           " aria_chk -r", share->open_file_name.str);
+    recovery_found_crashed_tables++;
+    error= -1; /* not fatal, try with other tables */
+    goto end;
+    /*
+      Note that if a first recovery fails to apply a REDO, it marks the table
+      corrupted and stops the entire recovery. A second recovery will find the
+      table is marked corrupted and skip it (and thus possibly handle other
+      tables).
+    */
+  }
+  /* don't log any records for this work */
+  _ma_tmp_disable_logging_for_table(info, FALSE);
+  /* execution of some REDO records relies on data_file_length */
+  dfile_len= my_seek(info->dfile.file, 0, SEEK_END, MYF(MY_WME));
+  kfile_len= my_seek(info->s->kfile.file, 0, SEEK_END, MYF(MY_WME));
+  if ((dfile_len == MY_FILEPOS_ERROR) ||
+      (kfile_len == MY_FILEPOS_ERROR))
+  {
+    /* error is still 1 here, so this is reported to the caller as a failure */
+    tprint(tracef, ", length unknown\n");
+    recovery_warnings++;
+    goto end;
+  }
+  /* Make the in-memory state agree with the sizes just measured */
+  if (share->state.state.data_file_length != dfile_len)
+  {
+    tprint(tracef, ", has wrong state.data_file_length (fixing it)");
+    share->state.state.data_file_length= dfile_len;
+  }
+  if (share->state.state.key_file_length != kfile_len)
+  {
+    tprint(tracef, ", has wrong state.key_file_length (fixing it)");
+    share->state.state.key_file_length= kfile_len;
+  }
+  if ((dfile_len % share->block_size) || (kfile_len % share->block_size))
+  {
+    tprint(tracef, ", has too short last page\n");
+    /* Recovery will fix this, no error */
+    ALERT_USER();
+  }
+
+set_lsn_of_file_id:
+  /*
+    This LSN serves in this situation; assume log is:
+    FILE_ID(6->"t2") REDO_INSERT(6) FILE_ID(6->"t1") CHECKPOINT(6->"t1")
+    then crash, checkpoint record is parsed and opens "t1" with id 6; assume
+    REDO phase starts from the REDO_INSERT above: it will wrongly try to
+    update a page of "t1". With this LSN below, REDO_INSERT can realize the
+    mapping is newer than itself, and not execute.
+    Same example is possible with UNDO_INSERT (update of the state).
+  */
+  info->s->lsn_of_file_id= lsn_of_file_id;
+  all_tables[sid].info= info;
+  /*
+    We don't set info->s->id, it would be useless (no logging in REDO phase);
+    if you change that, know that some records in REDO phase call
+    _ma_update_state_lsns() which resets info->s->id.
+  */
+  tprint(tracef, ", opened");
+  error= 0;
+end:
+  tprint(tracef, "\n");
+  if (error)
+  {
+    /* Both "error" (1) and "skip" (-1) release the table if it was opened */
+    if (info != NULL)
+      maria_close(info);
+    /* A skipped table is not a failure for the caller */
+    if (error == -1)
+      error= 0;
+  }
+  DBUG_RETURN(error);
+}
+
+/*
+  NOTE
+  This is called for REDO_INSERT_ROW_HEAD and REDO_NEW_ROW_HEAD
+*/
+
+prototype_redo_exec_hook(REDO_INSERT_ROW_HEAD)
+{
+  uchar *page_and_dir, *row_data;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /*
+    No usable table is not an error: the table was skipped when it was
+    opened (dropped/renamed later, not transactional, or its
+    create_rename_lsn is newer than LOGREC_FILE_ID), or the record was
+    skipped because of skip_redo_lsn.
+  */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /*
+    REDO works per page, so the record is still considered even when its
+    transaction committed long ago and is unknown.
+
+    When the REDO's LSN is above the LSN of the page read from disk, the page
+    is modified and gets a new LSN. At runtime the UNDO's LSN is stored into
+    the page. Storing rec->lsn here would also work, as nothing is written to
+    the log and so nothing has to be flushed up to the UNDO's LSN. Using the
+    UNDO's LSN (current_group_end_lsn) keeps a table rebuilt from the log
+    closer to what runtime produced, which helps tests that compare both.
+  */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL)
+  {
+    eprint(tracef, "Failed to read allocate buffer for record");
+    return 1;
+  }
+  if (translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The file id comes first, then page + directory position, then data */
+  page_and_dir= log_record_buffer.str + FILEID_STORE_SIZE;
+  row_data= page_and_dir + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE;
+  if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+                                             HEAD_PAGE,
+                                             (rec->type ==
+                                              LOGREC_REDO_NEW_ROW_HEAD),
+                                             page_and_dir,
+                                             row_data,
+                                             rec->record_length -
+                                             (FILEID_STORE_SIZE +
+                                              PAGE_STORE_SIZE +
+                                              DIRPOS_STORE_SIZE)))
+    return 1;
+  return 0;
+}
+
+/*
+  NOTE
+  This is called for REDO_INSERT_ROW_TAIL and REDO_NEW_ROW_TAIL
+*/
+
+prototype_redo_exec_hook(REDO_INSERT_ROW_TAIL)
+{
+  uchar *page_and_dir, *row_data;
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* Read the complete record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The file id comes first, then page + directory position, then data */
+  page_and_dir= log_record_buffer.str + FILEID_STORE_SIZE;
+  row_data= page_and_dir + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE;
+  if (_ma_apply_redo_insert_row_head_or_tail(info, current_group_end_lsn,
+                                             TAIL_PAGE,
+                                             (rec->type ==
+                                              LOGREC_REDO_NEW_ROW_TAIL),
+                                             page_and_dir,
+                                             row_data,
+                                             rec->record_length -
+                                             (FILEID_STORE_SIZE +
+                                              PAGE_STORE_SIZE +
+                                              DIRPOS_STORE_SIZE)))
+    return 1;
+  return 0;
+}
+
+
+prototype_redo_exec_hook(REDO_INSERT_ROW_BLOBS)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+  pgcache_page_no_t first_page, last_page;
+  uint number_of_blobs, number_of_ranges;
+  char first_str[22], last_str[22];
+  int error= 1;
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL  || maria_is_crashed(info))
+    return 0;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+  }
+  else if (!_ma_apply_redo_insert_row_blobs(info, current_group_end_lsn,
+                                            log_record_buffer.str, rec->lsn,
+                                            &number_of_blobs,
+                                            &number_of_ranges,
+                                            &first_page, &last_page))
+  {
+    /* Applied: trace what the record covered */
+    llstr(first_page, first_str);
+    llstr(last_page, last_str);
+    tprint(tracef, " %u blobs %u ranges, first page %s last %s",
+           number_of_blobs, number_of_ranges, first_str, last_str);
+    error= 0;
+  }
+
+  /* The trace line is terminated on success and on failure alike */
+  tprint(tracef, " \n");
+  return error;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_HEAD)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* The record header is enough here; any apply failure is reported as 1 */
+  return _ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+                                               HEAD_PAGE,
+                                               rec->header +
+                                               FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* The record header is enough here; any apply failure is reported as 1 */
+  return _ma_apply_redo_purge_row_head_or_tail(info, current_group_end_lsn,
+                                               TAIL_PAGE,
+                                               rec->header +
+                                               FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_FREE_BLOCKS)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* Read the complete record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* Hand over everything that follows the file id */
+  return _ma_apply_redo_free_blocks(info, current_group_end_lsn,
+                                    log_record_buffer.str +
+                                    FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* The record header is enough here; any apply failure is reported as 1 */
+  return _ma_apply_redo_free_head_or_tail(info, current_group_end_lsn,
+                                          rec->header +
+                                          FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_DELETE_ALL)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /*
+    Only an unknown table is skipped; unlike the page-level REDO hooks in
+    this file, no crashed-table test is made here.
+  */
+  if (info == NULL)
+    return 0;
+
+  tprint(tracef, "   deleting all %lu rows\n",
+         (ulong)info->s->state.state.records);
+  return maria_delete_all_rows(info) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_INDEX)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* Read the complete record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* Pass on the part of the record that follows the file id */
+  return _ma_apply_redo_index(info, current_group_end_lsn,
+                              log_record_buffer.str + FILEID_STORE_SIZE,
+                              rec->record_length - FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* Read the complete record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* Pass on the part of the record that follows the file id */
+  return _ma_apply_redo_index_new_page(info, current_group_end_lsn,
+                                       log_record_buffer.str +
+                                       FILEID_STORE_SIZE,
+                                       rec->record_length -
+                                       FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* The record header is enough here; any apply failure is reported as 1 */
+  return _ma_apply_redo_index_free_page(info, current_group_end_lsn,
+                                        rec->header +
+                                        FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+prototype_redo_exec_hook(REDO_BITMAP_NEW_PAGE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec);
+
+  /* Nothing to apply when the table is not open or is marked crashed */
+  if (info == NULL || maria_is_crashed(info))
+    return 0;
+
+  /* The record is read even if it later turns out that it is not applied */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /*
+    A record older than the checkpoint start must not be applied: it is
+    unneeded and harmful, because the damage may never be corrected (REDOs
+    can be skipped due to the dirty-pages list).
+  */
+  if (cmp_translog_addr(rec->lsn, checkpoint_start) < 0)
+    return 0;
+
+  /*
+    Otherwise the record is potentially after the bitmap flush made by
+    Checkpoint and has to be replayed. It may overwrite a more recent state,
+    but the upcoming REDOs for data pages will correct that.
+  */
+  return _ma_apply_redo_bitmap_new_page(info, current_group_end_lsn,
+                                        log_record_buffer.str +
+                                        FILEID_STORE_SIZE) ? 1 : 0;
+}
+
+
+/*
+  Record lsn as the latest undo LSN of the transaction in slot short_trid
+  of all_active_trans[], and also as its first undo LSN if none is set yet.
+*/
+static inline void set_undo_lsn_for_active_trans(uint16 short_trid, LSN lsn)
+{
+  /*
+    A zero long_trid means the transaction is unknown, so it has committed
+    or fully rolled back long ago; its slot is left untouched.
+  */
+  if (all_active_trans[short_trid].long_trid != 0)
+  {
+    all_active_trans[short_trid].undo_lsn= lsn;
+    if (all_active_trans[short_trid].first_undo_lsn == LSN_IMPOSSIBLE)
+      all_active_trans[short_trid].first_undo_lsn= lsn;
+  }
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_INSERT)
+{
+  MARIA_SHARE *share;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+
+  /*
+    The undo LSN is recorded even when the table is unknown: if the
+    transaction is rolled back later, this UNDO is tried for execution and a
+    warning is produced, since info==NULL would then be abnormal.
+  */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+
+  share= info->s;
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    /* The stored state predates this record: bring it up to date */
+    tprint(tracef, "   state has LSN (%lu,0x%lx) older than record, updating"
+           " rows' count\n", LSN_IN_PARTS(share->state.is_of_horizon));
+    share->state.state.records++;
+    if (share->calc_checksum)
+    {
+      /* Only the checksum bytes of the record are fetched */
+      uchar checksum_buf[HA_CHECKSUM_STORE_SIZE];
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, checksum_buf, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(checksum_buf);
+    }
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+  /* Release every pinned page, stamping it with the UNDO's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_DELETE)
+{
+  MARIA_SHARE *share;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+
+  /* The undo LSN is recorded whether or not the table is known */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+
+  share= info->s;
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    /* The stored state predates this record: bring it up to date */
+    tprint(tracef, "   state older than record\n");
+    share->state.state.records--;
+    if (share->calc_checksum)
+    {
+      /* Only the checksum bytes of the record are fetched */
+      uchar checksum_buf[HA_CHECKSUM_STORE_SIZE];
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + 2 +
+                               PAGERANGE_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, checksum_buf, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(checksum_buf);
+    }
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                            STATE_NOT_MOVABLE);
+  }
+  tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+  /* Release every pinned page, stamping it with this record's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_ROW_UPDATE)
+{
+  MARIA_SHARE *share;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+
+  /* The undo LSN is recorded whether or not the table is known */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+
+  share= info->s;
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    /* The row count does not change here; only the checksum may */
+    if (share->calc_checksum)
+    {
+      /* Only the checksum bytes of the record are fetched */
+      uchar checksum_buf[HA_CHECKSUM_STORE_SIZE];
+      if (translog_read_record(rec->lsn, LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                               PAGE_STORE_SIZE + DIRPOS_STORE_SIZE,
+                               HA_CHECKSUM_STORE_SIZE, checksum_buf, NULL) !=
+          HA_CHECKSUM_STORE_SIZE)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      share->state.state.checksum+= ha_checksum_korr(checksum_buf);
+    }
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  /* Release every pinned page, stamping it with this record's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+/*
+  REDO-phase handling of an UNDO_KEY_INSERT record: remembers the record as
+  the transaction's latest undo LSN and, when the table state is not newer
+  than the record and the key is the table's auto-increment key, raises
+  state.auto_increment to the value found in the logged key.
+
+  Returns 0 on success or when the table is unknown, 1 if the record cannot
+  be read.
+*/
+prototype_redo_exec_hook(UNDO_KEY_INSERT)
+{
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (!(info= get_MARIA_HA_from_UNDO_record(rec)))
+    return 0;
+  share= info->s;
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    const uchar *ptr= rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE;
+    uint keynr= key_nr_korr(ptr);
+    if (share->base.auto_key == (keynr + 1)) /* it's auto-increment */
+    {
+      const HA_KEYSEG *keyseg= info->s->keyinfo[keynr].seg;
+      ulonglong value;
+      char llbuf[22];
+      /*
+        'reversed' is declared in this scope, not inside the HA_SWAP_KEY
+        branch, because 'to' may still point into it when
+        ma_retrieve_auto_increment() is called after that branch.
+      */
+      uchar reversed[MARIA_MAX_KEY_BUFF];
+      uchar *to;
+      tprint(tracef, "   state older than record\n");
+      /* we read the record to find the auto_increment value */
+      enlarge_buffer(rec);
+      if (log_record_buffer.str == NULL ||
+          translog_read_record(rec->lsn, 0, rec->record_length,
+                               log_record_buffer.str, NULL) !=
+          rec->record_length)
+      {
+        eprint(tracef, "Failed to read record");
+        return 1;
+      }
+      /* The key value follows the undo LSN, the file id and the key number */
+      to= log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+        KEY_NR_STORE_SIZE;
+      if (keyseg->flag & HA_SWAP_KEY)
+      {
+        /* We put key from log record to "data record" packing format... */
+        uchar *key_ptr= to;
+        uchar *key_end= key_ptr + keyseg->length;
+        /* Copy keyseg->length bytes in reverse order into 'reversed' */
+        to= reversed + keyseg->length;
+        do
+        {
+          *--to= *key_ptr++;
+        } while (key_ptr != key_end);
+        /* ... so that we can read it with: */
+      }
+      value= ma_retrieve_auto_increment(to, keyseg->type);
+      set_if_bigger(share->state.auto_increment, value);
+      llstr(share->state.auto_increment, llbuf);
+      tprint(tracef, "   auto-inc %s\n", llbuf);
+    }
+  }
+  /* Release every pinned page, stamping it with this record's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_KEY_DELETE)
+{
+  MARIA_HA *info;
+
+  /* The undo LSN is recorded before the table is even looked up */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+
+  /* No state is adjusted here; pages are only unpinned for a known table */
+  info= get_MARIA_HA_from_UNDO_record(rec);
+  if (info != NULL)
+    _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT)
+{
+  MARIA_SHARE *share;
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+
+  /* The undo LSN is recorded whether or not the table is known */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+  if (info == NULL)
+    return 0;
+
+  share= info->s;
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    /* In the header, key number and root page follow undo LSN and file id */
+    const uchar *key_info= rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE;
+    uint key_nr= key_nr_korr(key_info);
+    my_off_t page= page_korr(key_info + KEY_NR_STORE_SIZE);
+
+    /* The page number becomes a byte offset unless it is the "no page" mark */
+    if (page == IMPOSSIBLE_PAGE_NO)
+      share->state.key_root[key_nr]= HA_OFFSET_ERROR;
+    else
+      share->state.key_root[key_nr]= page * share->block_size;
+  }
+  /* Release every pinned page, stamping it with this record's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(UNDO_BULK_INSERT)
+{
+  /*
+    Only the undo LSN needs recording. The table state needs no adjustment:
+    a repair that finished has already written and synced the state, and one
+    that did not finish leads to the table being emptied, which fixes the
+    state.
+  */
+  set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn);
+
+  return 0;
+}
+
+
+prototype_redo_exec_hook(IMPORTED_TABLE)
+{
+  /* Read the complete record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The record starts with the table name; it is only traced */
+  tprint(tracef, "Table '%s' was imported (auto-zerofilled) in this Aria instance\n",
+         (char *)log_record_buffer.str);
+  return 0;
+}
+
+
+prototype_redo_exec_hook(COMMIT)
+{
+  char llbuf[22];
+  uint16 sid= rec->short_trid;
+  TrID long_trid= all_active_trans[sid].long_trid;
+
+  if (long_trid != 0)
+  {
+    llstr(long_trid, llbuf);
+    tprint(tracef, "Transaction long_trid %s short_trid %u committed\n",
+           llbuf, sid);
+  }
+  else
+  {
+    tprint(tracef, "We don't know about transaction with short_trid %u;"
+           "it probably committed long ago, forget it\n", sid);
+  }
+
+  /* Known or not, the transaction's slot is cleared */
+  bzero(&all_active_trans[sid], sizeof(all_active_trans[sid]));
+#ifdef MARIA_VERSIONING
+  /*
+    if real recovery:
+    a committed transaction should be moved to some separate list for later
+    purging (but not purged now! purging may have been started before, we
+    may find REDO_PURGE records soon).
+  */
+#endif
+  return 0;
+}
+
+/*
+  REDO-phase handling of a CLR_END record, which says that one UNDO record
+  of the transaction has been executed.
+
+  The transaction's undo LSN is moved back to the LSN stored at the start of
+  the record header. If the table is known and its state is not newer than
+  the record, the state (row count, checksum, key root, changed flags) is
+  adjusted according to the type of the UNDO that was executed.
+
+  Returns 0 on success or when the table is unknown, 1 if the record cannot
+  be read.
+*/
+prototype_redo_exec_hook(CLR_END)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  MARIA_SHARE *share;
+  LSN previous_undo_lsn;
+  enum translog_record_type undone_record_type;
+  const LOG_DESC *log_desc;
+  my_bool row_entry= 0;
+  uchar *logpos;
+  DBUG_ENTER("exec_REDO_LOGREC_CLR_END");
+
+  previous_undo_lsn= lsn_korr(rec->header);
+  undone_record_type=
+    clr_type_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE);
+  log_desc= &log_record_type_descriptor[undone_record_type];
+
+  /* Done before the table test, so it happens for unknown tables too */
+  set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn);
+  if (info == NULL)
+    DBUG_RETURN(0);
+  share= info->s;
+  tprint(tracef, "   CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n",
+         log_desc->name, LSN_IN_PARTS(previous_undo_lsn));
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    /* DBUG_ENTER was used above, so every exit must be a DBUG_RETURN */
+    DBUG_RETURN(1);
+  }
+  /* Type-specific data follows the undo LSN, the file id and the CLR type */
+  logpos= (log_record_buffer.str + LSN_STORE_SIZE + FILEID_STORE_SIZE +
+           CLR_TYPE_STORE_SIZE);
+
+  if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0)
+  {
+    tprint(tracef, "   state older than record\n");
+    switch (undone_record_type) {
+    case LOGREC_UNDO_ROW_DELETE:
+      /* the delete was undone: the row is back */
+      row_entry= 1;
+      share->state.state.records++;
+      break;
+    case LOGREC_UNDO_ROW_INSERT:
+      /* the insert was undone: the row is gone */
+      share->state.state.records--;
+      share->state.changed|= STATE_NOT_OPTIMIZED_ROWS;
+      row_entry= 1;
+      break;
+    case LOGREC_UNDO_ROW_UPDATE:
+      row_entry= 1;
+      break;
+    case LOGREC_UNDO_KEY_INSERT:
+    case LOGREC_UNDO_KEY_DELETE:
+      break;
+    case LOGREC_UNDO_KEY_INSERT_WITH_ROOT:
+    case LOGREC_UNDO_KEY_DELETE_WITH_ROOT:
+    {
+      /* The record carries the key number and the key's root page */
+      uint key_nr;
+      my_off_t page;
+      key_nr= key_nr_korr(logpos);
+      page=  page_korr(logpos + KEY_NR_STORE_SIZE);
+      share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ?
+                                      HA_OFFSET_ERROR :
+                                      page * share->block_size);
+      break;
+    }
+    case LOGREC_UNDO_BULK_INSERT:
+      break;
+    default:
+      DBUG_ASSERT(0);
+    }
+    /* For row records, logpos is read as a checksum value to add */
+    if (row_entry && share->calc_checksum)
+      share->state.state.checksum+= ha_checksum_korr(logpos);
+    share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                            STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+  }
+  if (row_entry)
+    tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+  /* Release every pinned page, stamping it with this record's LSN */
+  _ma_unpin_all_pages(info, rec->lsn);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   Hook to print debug information (like MySQL query)
+*/
+
+/*
+  Traces the content of a DEBUG_INFO record. The first byte of the record
+  selects the kind of information and the rest is its payload; only
+  LOGREC_DEBUG_INFO_QUERY is handled, which is printed as text.
+
+  Returns 0 on success, 1 if the record cannot be read.
+*/
+prototype_redo_exec_hook(DEBUG_INFO)
+{
+  uchar *data;
+  enum translog_debug_info_type debug_info;
+
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+      rec->record_length)
+  {
+    eprint(tracef, "Failed to read record debug record");
+    return 1;
+  }
+  debug_info= (enum translog_debug_info_type) log_record_buffer.str[0];
+  data= log_record_buffer.str + 1;
+  switch (debug_info) {
+  case LOGREC_DEBUG_INFO_QUERY:
+    /*
+      The payload is printed with an explicit length (record length minus
+      the type byte). "%.*s" takes its precision as an int, hence the cast.
+    */
+    tprint(tracef, "Query: %.*s\n", (int) rec->record_length - 1,
+           (char*) data);
+    break;
+  default:
+    DBUG_ASSERT(0);
+  }
+  return 0;
+}
+
+
+/**
+  In some cases we have to skip execution of an UNDO record during the UNDO
+  phase.
+*/
+
+static void skip_undo_record(LSN previous_undo_lsn, TRN *trn)
+{
+  /* Count the skip and step the transaction back to the preceding UNDO */
+  skipped_undo_phase++;
+  trn->undo_lsn= previous_undo_lsn;
+  if (previous_undo_lsn != LSN_IMPOSSIBLE)
+    return;
+
+  /* has fully rolled back: only the flags part of first_undo_lsn is kept */
+  trn->first_undo_lsn= LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn);
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_INSERT)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  const uchar *undo_data= rec->header;
+  MARIA_SHARE *share;
+  my_bool res;
+
+  /*
+    Unlike for REDOs, a skipped table is abnormal here: the transaction being
+    rolled back used this table and, not being rolled back yet, was supposed
+    to hold it, so the table should still exist. The record is skipped
+    anyway (the user may have repaired the table with maria_chk after a
+    previous recovery failed on heavy corruption), but a warning is due.
+  */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_OPTIMIZED_ROWS | STATE_NOT_ZEROFILLED |
+                          STATE_NOT_MOVABLE);
+
+  /*
+    rec->header is enough unless the table has a live checksum; in that case
+    the whole record is read, so that the checksum is present in the buffer
+    given to _ma_apply_undo_row_insert().
+  */
+  if (share->calc_checksum)
+  {
+    enlarge_buffer(rec);
+    if (log_record_buffer.str == NULL ||
+        translog_read_record(rec->lsn, 0, rec->record_length,
+                             log_record_buffer.str, NULL) !=
+        rec->record_length)
+    {
+      eprint(tracef, "Failed to read record");
+      return 1;
+    }
+    undo_data= log_record_buffer.str;
+  }
+
+  /* The table handle is tied to the transaction only during the call */
+  info->trn= trn;
+  res= _ma_apply_undo_row_insert(info, previous_undo_lsn,
+                                 undo_data + LSN_STORE_SIZE +
+                                 FILEID_STORE_SIZE);
+  info->trn= 0;
+
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, "   rows' count %lu\n", (ulong)share->state.state.records);
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return res;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_DELETE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+  my_bool res;
+
+  /* A missing or crashed table cannot be rolled back: skip this UNDO */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The complete record is needed to undo the delete */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The table handle is tied to the transaction only during the call */
+  info->trn= trn;
+  res= _ma_apply_undo_row_delete(info, previous_undo_lsn,
+                                 log_record_buffer.str + LSN_STORE_SIZE +
+                                 FILEID_STORE_SIZE,
+                                 rec->record_length -
+                                 (LSN_STORE_SIZE + FILEID_STORE_SIZE));
+  info->trn= 0;
+
+  tprint(tracef, "   rows' count %lu\n   undo_lsn now LSN (%lu,0x%lx)\n",
+         (ulong)share->state.state.records, LSN_IN_PARTS(trn->undo_lsn));
+  return res;
+}
+
+
+prototype_undo_exec_hook(UNDO_ROW_UPDATE)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+  my_bool res;
+
+  /* A missing or crashed table cannot be rolled back: skip this UNDO */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The complete record is needed to undo the update */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+       rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The table handle is tied to the transaction only during the call */
+  info->trn= trn;
+  res= _ma_apply_undo_row_update(info, previous_undo_lsn,
+                                 log_record_buffer.str + LSN_STORE_SIZE +
+                                 FILEID_STORE_SIZE,
+                                 rec->record_length -
+                                 (LSN_STORE_SIZE + FILEID_STORE_SIZE));
+  info->trn= 0;
+
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return res;
+}
+
+
+prototype_undo_exec_hook(UNDO_KEY_INSERT)
+{
+  MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec);
+  LSN previous_undo_lsn= lsn_korr(rec->header);
+  MARIA_SHARE *share;
+  my_bool res;
+
+  /* A missing or crashed table cannot be rolled back: skip this UNDO */
+  if (info == NULL || maria_is_crashed(info))
+  {
+    skip_undo_record(previous_undo_lsn, trn);
+    return 0;
+  }
+
+  share= info->s;
+  share->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                          STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The complete record is needed to undo the key insert */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) !=
+        rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* The table handle is tied to the transaction only during the call */
+  info->trn= trn;
+  res= _ma_apply_undo_key_insert(info, previous_undo_lsn,
+                                 log_record_buffer.str + LSN_STORE_SIZE +
+                                 FILEID_STORE_SIZE,
+                                 rec->record_length - LSN_STORE_SIZE -
+                                 FILEID_STORE_SIZE);
+  info->trn= 0;
+
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return res;
+}
+
+
+/*
+  Undo-phase hook for UNDO_KEY_DELETE.
+
+  Reads the complete log record into log_record_buffer and passes the part
+  that follows the stored LSN and file id to _ma_apply_undo_key_delete(),
+  with its last argument set to FALSE. When no table is available for the
+  record, or the table is crashed, the record is stepped over with
+  skip_undo_record() and 0 is returned.
+  Returns 1 if the record cannot be read, otherwise the result of
+  _ma_apply_undo_key_delete().
+*/
+prototype_undo_exec_hook(UNDO_KEY_DELETE)
+{
+  MARIA_HA *tbl= get_MARIA_HA_from_UNDO_record(rec);
+  LSN prev_undo_lsn= lsn_korr(rec->header);
+  translog_size_t body_len;
+  my_bool failed;
+
+  if (tbl == NULL || maria_is_crashed(tbl))
+  {
+    skip_undo_record(prev_undo_lsn, trn);
+    return 0;
+  }
+
+  tbl->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                           STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* Bring the whole record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) != rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* What remains once the stored LSN and file id are stepped over */
+  body_len= rec->record_length - LSN_STORE_SIZE - FILEID_STORE_SIZE;
+
+  /* The handler carries the transaction only while the undo is applied */
+  tbl->trn= trn;
+  failed= _ma_apply_undo_key_delete(tbl, prev_undo_lsn,
+                                    log_record_buffer.str + LSN_STORE_SIZE +
+                                    FILEID_STORE_SIZE,
+                                    body_len, FALSE);
+  tbl->trn= NULL;
+
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return failed;
+}
+
+
+/*
+  Undo-phase hook for UNDO_KEY_DELETE_WITH_ROOT.
+
+  Same steps as the UNDO_KEY_DELETE hook, except that
+  _ma_apply_undo_key_delete() is called with its last argument set to TRUE.
+  When no table is available for the record, or the table is crashed, the
+  record is stepped over with skip_undo_record() and 0 is returned.
+  Returns 1 if the record cannot be read, otherwise the result of
+  _ma_apply_undo_key_delete().
+*/
+prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT)
+{
+  MARIA_HA *tbl= get_MARIA_HA_from_UNDO_record(rec);
+  LSN prev_undo_lsn= lsn_korr(rec->header);
+  translog_size_t body_len;
+  my_bool failed;
+
+  if (tbl == NULL || maria_is_crashed(tbl))
+  {
+    skip_undo_record(prev_undo_lsn, trn);
+    return 0;
+  }
+
+  tbl->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                           STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* Bring the whole record into log_record_buffer */
+  enlarge_buffer(rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec->lsn, 0, rec->record_length,
+                           log_record_buffer.str, NULL) != rec->record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return 1;
+  }
+
+  /* What remains once the stored LSN and file id are stepped over */
+  body_len= rec->record_length - LSN_STORE_SIZE - FILEID_STORE_SIZE;
+
+  /* The handler carries the transaction only while the undo is applied */
+  tbl->trn= trn;
+  failed= _ma_apply_undo_key_delete(tbl, prev_undo_lsn,
+                                    log_record_buffer.str + LSN_STORE_SIZE +
+                                    FILEID_STORE_SIZE,
+                                    body_len, TRUE);
+  tbl->trn= NULL;
+
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return failed;
+}
+
+
+/*
+  Undo-phase hook for UNDO_BULK_INSERT.
+
+  Unlike the row and key hooks, the record body is not read and a crashed
+  table is not a reason to skip: only a missing table makes the record be
+  stepped over with skip_undo_record() (0 is then returned).
+  Otherwise returns the result of _ma_apply_undo_bulk_insert().
+*/
+prototype_undo_exec_hook(UNDO_BULK_INSERT)
+{
+  MARIA_HA *tbl= get_MARIA_HA_from_UNDO_record(rec);
+  LSN prev_undo_lsn= lsn_korr(rec->header);
+  my_bool failed;
+
+  /* Here we don't check for crashed as we can undo the bulk insert */
+  if (tbl == NULL)
+  {
+    skip_undo_record(prev_undo_lsn, trn);
+    return 0;
+  }
+
+  tbl->s->state.changed|= (STATE_CHANGED | STATE_NOT_ANALYZED |
+                           STATE_NOT_ZEROFILLED | STATE_NOT_MOVABLE);
+
+  /* The handler carries the transaction only while the undo is applied */
+  tbl->trn= trn;
+  failed= _ma_apply_undo_bulk_insert(tbl, prev_undo_lsn);
+  tbl->trn= NULL;
+
+  /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */
+  tprint(tracef, "   undo_lsn now LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(trn->undo_lsn));
+  return failed;
+}
+
+
+/**
+   @brief Scans the log forward from 'lsn' and processes it group by group.
+
+   First installs the per-record-type execution hooks into
+   log_record_type_descriptor[]. Then reads record headers one after the
+   other. A record which does not end a group only marks the start of a group
+   for its short transaction id. A record which is a group by itself, or
+   which ends a group, triggers the replay of the pending group (re-scanned
+   from its start with a second scanner) and is then applied itself.
+
+   @param  lsn      LSN to start from; if LSN_IMPOSSIBLE or equal to the log's
+                    horizon there is nothing to do
+   @param  lsn_end  if not LSN_IMPOSSIBLE, processing stops successfully as
+                    soon as a group to replay starts at or after this LSN
+   @param  apply    MARIA_LOG_APPLY executes records; MARIA_LOG_CHECK only
+                    reads the bodies of grouped records
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+static int run_redo_phase(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply)
+{
+  TRANSLOG_HEADER_BUFFER rec;
+  struct st_translog_scanner_data scanner;
+  int len;
+  uint i;
+  DBUG_ENTER("run_redo_phase");
+
+  /* install hooks for execution */
+#define install_redo_exec_hook(R)                                        \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \
+    exec_REDO_LOGREC_ ## R;
+#define install_redo_exec_hook_shared(R,S)                               \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_redo_phase= \
+    exec_REDO_LOGREC_ ## S;
+#define install_undo_exec_hook(R)                                        \
+  log_record_type_descriptor[LOGREC_ ## R].record_execute_in_undo_phase= \
+    exec_UNDO_LOGREC_ ## R;
+  install_redo_exec_hook(LONG_TRANSACTION_ID);
+  install_redo_exec_hook(CHECKPOINT);
+  install_redo_exec_hook(REDO_CREATE_TABLE);
+  install_redo_exec_hook(REDO_RENAME_TABLE);
+  install_redo_exec_hook(REDO_REPAIR_TABLE);
+  install_redo_exec_hook(REDO_DROP_TABLE);
+  install_redo_exec_hook(FILE_ID);
+  install_redo_exec_hook(INCOMPLETE_LOG);
+  install_redo_exec_hook(INCOMPLETE_GROUP);
+  install_redo_exec_hook(REDO_INSERT_ROW_HEAD);
+  install_redo_exec_hook(REDO_INSERT_ROW_TAIL);
+  install_redo_exec_hook(REDO_INSERT_ROW_BLOBS);
+  install_redo_exec_hook(REDO_PURGE_ROW_HEAD);
+  install_redo_exec_hook(REDO_PURGE_ROW_TAIL);
+  install_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL);
+  install_redo_exec_hook(REDO_FREE_BLOCKS);
+  install_redo_exec_hook(REDO_DELETE_ALL);
+  install_redo_exec_hook(REDO_INDEX);
+  install_redo_exec_hook(REDO_INDEX_NEW_PAGE);
+  install_redo_exec_hook(REDO_INDEX_FREE_PAGE);
+  install_redo_exec_hook(REDO_BITMAP_NEW_PAGE);
+  install_redo_exec_hook(UNDO_ROW_INSERT);
+  install_redo_exec_hook(UNDO_ROW_DELETE);
+  install_redo_exec_hook(UNDO_ROW_UPDATE);
+  install_redo_exec_hook(UNDO_KEY_INSERT);
+  install_redo_exec_hook(UNDO_KEY_DELETE);
+  install_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+  install_redo_exec_hook(COMMIT);
+  install_redo_exec_hook(CLR_END);
+  install_undo_exec_hook(UNDO_ROW_INSERT);
+  install_undo_exec_hook(UNDO_ROW_DELETE);
+  install_undo_exec_hook(UNDO_ROW_UPDATE);
+  install_undo_exec_hook(UNDO_KEY_INSERT);
+  install_undo_exec_hook(UNDO_KEY_DELETE);
+  install_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT);
+  /* REDO_NEW_ROW_HEAD shares entry with REDO_INSERT_ROW_HEAD */
+  install_redo_exec_hook_shared(REDO_NEW_ROW_HEAD, REDO_INSERT_ROW_HEAD);
+  /* REDO_NEW_ROW_TAIL shares entry with REDO_INSERT_ROW_TAIL */
+  install_redo_exec_hook_shared(REDO_NEW_ROW_TAIL, REDO_INSERT_ROW_TAIL);
+  install_redo_exec_hook(UNDO_BULK_INSERT);
+  install_undo_exec_hook(UNDO_BULK_INSERT);
+  install_redo_exec_hook(IMPORTED_TABLE);
+  install_redo_exec_hook(DEBUG_INFO);
+
+  current_group_end_lsn= LSN_IMPOSSIBLE;
+#ifndef DBUG_OFF
+  current_group_table= NULL;
+#endif
+
+  if (unlikely(lsn == LSN_IMPOSSIBLE || lsn == translog_get_horizon()))
+  {
+    tprint(tracef, "checkpoint address refers to the log end log or "
+           "log is empty, nothing to do.\n");
+    DBUG_RETURN(0);
+  }
+
+  len= translog_read_record_header(lsn, &rec);
+
+  if (len == RECHEADER_READ_ERROR)
+  {
+    eprint(tracef, "Failed to read header of the first record.");
+    DBUG_RETURN(1);
+  }
+  if (translog_scanner_init(lsn, 1, &scanner, 1))
+  {
+    tprint(tracef, "Scanner init failed\n");
+    DBUG_RETURN(1);
+  }
+  /* 'i' is only the running record number passed to the trace display */
+  for (i= 1;;i++)
+  {
+    uint16 sid= rec.short_trid;
+    const LOG_DESC *log_desc= &log_record_type_descriptor[rec.type];
+    display_record_position(log_desc, &rec, i);
+    /*
+      A complete group is a set of log records with an "end mark" record
+      (e.g. a set of REDOs for an operation, terminated by an UNDO for this
+      operation); if there is no "end mark" record the group is incomplete and
+      won't be executed.
+    */
+    if ((log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF) ||
+        (log_desc->record_in_group == LOGREC_LAST_IN_GROUP))
+    {
+      if (all_active_trans[sid].group_start_lsn != LSN_IMPOSSIBLE)
+      {
+        if (log_desc->record_in_group == LOGREC_IS_GROUP_ITSELF)
+        {
+          /*
+            Can happen if the transaction got a table write error, then
+            unlocked tables thus wrote a COMMIT record. Or can be an
+            INCOMPLETE_GROUP record written by a previous recovery.
+          */
+          tprint(tracef, "\nDiscarding incomplete group before this record\n");
+          all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+        }
+        else
+        {
+          struct st_translog_scanner_data scanner2;
+          TRANSLOG_HEADER_BUFFER rec2;
+          /*
+            There is a complete group for this transaction, containing more
+            than this event.
+          */
+          tprint(tracef, "   ends a group:\n");
+          len=
+            translog_read_record_header(all_active_trans[sid].group_start_lsn,
+                                        &rec2);
+          if (len < 0) /* EOF or error */
+          {
+            tprint(tracef, "Cannot find record where it should be\n");
+            goto err;
+          }
+          if (lsn_end != LSN_IMPOSSIBLE && rec2.lsn >= lsn_end)
+          {
+            tprint(tracef,
+                   "lsn_end reached at (%lu,0x%lx). "
+                   "Skipping rest of redo entries",
+                   LSN_IN_PARTS(rec2.lsn));
+            /*
+              rec2 was read successfully above, so release it like every
+              other exit path does; scanner2 is not initialized yet.
+            */
+            translog_free_record_header(&rec2);
+            translog_destroy_scanner(&scanner);
+            translog_free_record_header(&rec);
+            DBUG_RETURN(0);
+          }
+
+          if (translog_scanner_init(rec2.lsn, 1, &scanner2, 1))
+          {
+            tprint(tracef, "Scanner2 init failed\n");
+            goto err;
+          }
+          current_group_end_lsn= rec.lsn;
+          /* Walk from the group's first record up to (excluding) 'rec' */
+          do
+          {
+            if (rec2.short_trid == sid) /* it's in our group */
+            {
+              const LOG_DESC *log_desc2= &log_record_type_descriptor[rec2.type];
+              display_record_position(log_desc2, &rec2, 0);
+              if (apply == MARIA_LOG_CHECK)
+              {
+                translog_size_t read_len;
+                enlarge_buffer(&rec2);
+                read_len=
+                  translog_read_record(rec2.lsn, 0, rec2.record_length,
+                                       log_record_buffer.str, NULL);
+                if (read_len != rec2.record_length)
+                {
+                  tprint(tracef, "Cannot read record's body: read %u of"
+                         " %u bytes\n", read_len, rec2.record_length);
+                  translog_destroy_scanner(&scanner2);
+                  translog_free_record_header(&rec2);
+                  goto err;
+                }
+              }
+              if (apply == MARIA_LOG_APPLY &&
+                  display_and_apply_record(log_desc2, &rec2))
+              {
+                translog_destroy_scanner(&scanner2);
+                translog_free_record_header(&rec2);
+                goto err;
+              }
+            }
+            translog_free_record_header(&rec2);
+            len= translog_read_next_record_header(&scanner2, &rec2);
+            if (len < 0) /* EOF or error */
+            {
+              tprint(tracef, "Cannot find record where it should be\n");
+              translog_destroy_scanner(&scanner2);
+              translog_free_record_header(&rec2);
+              goto err;
+            }
+          }
+          while (rec2.lsn < rec.lsn);
+          /* group finished */
+          all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+          current_group_end_lsn= LSN_IMPOSSIBLE; /* for debugging */
+          display_record_position(log_desc, &rec, 0);
+          translog_destroy_scanner(&scanner2);
+          translog_free_record_header(&rec2);
+        }
+      }
+      /* The group-ending (or stand-alone) record itself */
+      if (apply == MARIA_LOG_APPLY &&
+          display_and_apply_record(log_desc, &rec))
+        goto err;
+#ifndef DBUG_OFF
+      current_group_table= NULL;
+#endif
+    }
+    else /* record does not end group */
+    {
+      /* just record the fact, can't know if can execute yet */
+      if (all_active_trans[sid].group_start_lsn == LSN_IMPOSSIBLE)
+      {
+        /* group not yet started */
+        all_active_trans[sid].group_start_lsn= rec.lsn;
+      }
+    }
+    translog_free_record_header(&rec);
+    len= translog_read_next_record_header(&scanner, &rec);
+    if (len < 0)
+    {
+      switch (len)
+      {
+      case RECHEADER_READ_EOF:
+        tprint(tracef, "EOF on the log\n");
+        break;
+      case RECHEADER_READ_ERROR:
+        tprint(tracef, "Error reading log\n");
+        goto err;
+      }
+      /* leaves the scanning loop (the 'break' above only left the switch) */
+      break;
+    }
+  }
+  translog_destroy_scanner(&scanner);
+  translog_free_record_header(&rec);
+  if (recovery_message_printed == REC_MSG_REDO)
+  {
+    fprintf(stderr, " 100%%");
+    fflush(stderr);
+    procent_printed= 1;
+  }
+  DBUG_RETURN(0);
+
+err:
+  translog_destroy_scanner(&scanner);
+  translog_free_record_header(&rec);
+  DBUG_RETURN(1);
+}
+
+
+/**
+   @brief Informs about any aborted groups or uncommitted transactions,
+   prepares for the UNDO phase if needed.
+
+   Releases the dirty pages hash and pool, reports incomplete groups and
+   uncommitted transactions found in all_active_trans[], frees that array,
+   then calls prepare_table_for_close() on every table still registered in
+   all_tables[].
+
+   @param  prepare_for_undo_phase  if true: trnman is initialized, a TRN is
+                                   recreated for each uncommitted transaction
+                                   and shares get their id re-assigned
+
+   @note Observe that it may init trnman.
+
+   @return Number of uncommitted transactions; on error the function returns
+   -1, which the 'uint' return type delivers as (uint) -1.
+*/
+static uint end_of_redo_phase(my_bool prepare_for_undo_phase)
+{
+  uint sid, uncommitted= 0;
+  char llbuf[22];
+  LSN addr;
+
+  hash_free(&all_dirty_pages);
+  /*
+    hash_free() can be called multiple times probably, but be safe if that
+    changes
+  */
+  bzero(&all_dirty_pages, sizeof(all_dirty_pages));
+  my_free(dirty_pages_pool, MYF(MY_ALLOW_ZERO_PTR));
+  dirty_pages_pool= NULL;
+
+  llstr(max_long_trid, llbuf);
+  tprint(tracef, "Maximum transaction long id seen: %s\n", llbuf);
+  llstr(max_trid_in_control_file, llbuf);
+  tprint(tracef, "Maximum transaction long id seen in control file: %s\n",
+         llbuf);
+  /*
+    If logs were deleted, or lost, trid in control file is needed to set
+    trnman's generator:
+  */
+  set_if_bigger(max_long_trid, max_trid_in_control_file);
+  if (prepare_for_undo_phase && trnman_init(max_long_trid))
+    return -1;
+
+  trns_created= TRUE;
+
+  /* Pass over every possible short transaction id */
+  for (sid= 0; sid <= SHORT_TRID_MAX; sid++)
+  {
+    TrID long_trid= all_active_trans[sid].long_trid;
+    LSN gslsn= all_active_trans[sid].group_start_lsn;
+    TRN *trn;
+    /* A group which was started but never got its end record */
+    if (gslsn != LSN_IMPOSSIBLE)
+    {
+      tprint(tracef, "Group at LSN (%lu,0x%lx) short_trid %u incomplete\n",
+             LSN_IN_PARTS(gslsn), sid);
+      all_active_trans[sid].group_start_lsn= LSN_IMPOSSIBLE;
+    }
+    /* A remaining undo_lsn is what marks the transaction as uncommitted */
+    if (all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE)
+    {
+      llstr(long_trid, llbuf);
+      tprint(tracef, "Transaction long_trid %s short_trid %u uncommitted\n",
+             llbuf, sid);
+      /*
+        dummy_transaction_object serves only for DDLs, where there is never a
+        rollback or incomplete group. And unknown transactions (which have
+        long_trid==0) should have undo_lsn==LSN_IMPOSSIBLE.
+      */
+      if (long_trid ==0)
+      {
+        eprint(tracef, "Transaction with long_trid 0 should not roll back");
+        ALERT_USER();
+        return -1;
+      }
+      if (prepare_for_undo_phase)
+      {
+        if ((trn= trnman_recreate_trn_from_recovery(sid, long_trid)) == NULL)
+          return -1;
+        trn->undo_lsn= all_active_trans[sid].undo_lsn;
+        trn->first_undo_lsn= all_active_trans[sid].first_undo_lsn |
+          TRANSACTION_LOGGED_LONG_ID; /* because trn is known in log */
+        /* gslsn still holds the group start saved before it was reset above */
+        if (gslsn != LSN_IMPOSSIBLE)
+        {
+          /*
+            UNDO phase will log some records. So, a future recovery may see:
+            REDO(from incomplete group) - REDO(from rollback) - CLR_END
+            and thus execute the first REDO (finding it in "a complete
+            group"). To prevent that:
+          */
+          LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS];
+          LSN lsn;
+          if (translog_write_record(&lsn, LOGREC_INCOMPLETE_GROUP,
+                                    trn, NULL, 0,
+                                    TRANSLOG_INTERNAL_PARTS, log_array,
+                                    NULL, NULL))
+            return -1;
+        }
+      }
+      uncommitted++;
+    }
+#ifdef MARIA_VERSIONING
+    /*
+      If real recovery: if transaction was committed, move it to some separate
+      list for soon purging.
+    */
+#endif
+  }
+
+  /* The per-short-id bookkeeping of the REDO phase is not needed anymore */
+  my_free(all_active_trans, MYF(MY_ALLOW_ZERO_PTR));
+  all_active_trans= NULL;
+
+  /*
+    The UNDO phase uses some normal run-time code of ROLLBACK: generates log
+    records, etc; prepare tables for that
+  */
+  addr= translog_get_horizon();
+  /* 'sid' is reused here as a table (share) id, no longer a transaction id */
+  for (sid= 0; sid <= SHARE_ID_MAX; sid++)
+  {
+    MARIA_HA *info= all_tables[sid].info;
+    if (info != NULL)
+    {
+      prepare_table_for_close(info, addr);
+      /*
+        But we don't close it; we leave it available for the UNDO phase;
+        it's likely that the UNDO phase will need it.
+      */
+      if (prepare_for_undo_phase)
+        translog_assign_id_to_share_from_recovery(info->s, sid);
+    }
+  }
+  return uncommitted;
+}
+
+
+/**
+   @brief Rolls back the transactions which the REDO phase left uncommitted.
+
+   For each of the 'uncommitted' transactions, obtained one by one from
+   trnman_get_any_trn(), follows trn->undo_lsn and runs the undo-phase hook
+   of each record until undo_lsn becomes 0, then calls trnman_rollback_trn().
+   The remaining count is printed to stderr before each transaction when the
+   recovery message is in REC_MSG_UNDO state.
+
+   @param  uncommitted  number of transactions to roll back
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error
+*/
+static int run_undo_phase(uint uncommitted)
+{
+  LSN last_undo;
+  DBUG_ENTER("run_undo_phase");
+
+  if (uncommitted > 0)
+  {
+    checkpoint_useful= TRUE;
+    if (tracef != stdout)
+    {
+      if (recovery_message_printed == REC_MSG_NONE)
+        print_preamble();
+      fprintf(stderr, "transactions to roll back:");
+      recovery_message_printed= REC_MSG_UNDO;
+    }
+    tprint(tracef, "%u transactions will be rolled back\n", uncommitted);
+    procent_printed= 1;
+    for( ; ; )
+    {
+      char llbuf[22];
+      TRN *trn;
+      /* The countdown is printed before the exit test, so 0 is shown too */
+      if (recovery_message_printed == REC_MSG_UNDO)
+      {
+        fprintf(stderr, " %u", uncommitted);
+        fflush(stderr);
+      }
+      if ((uncommitted--) == 0)
+        break;
+      trn= trnman_get_any_trn();
+      DBUG_ASSERT(trn != NULL);
+      /*
+        The assertion above vanishes in non-debug builds; fail the recovery
+        instead of dereferencing NULL if the transaction manager holds fewer
+        transactions than were counted.
+      */
+      if (unlikely(trn == NULL))
+      {
+        eprint(tracef, "No transaction left to roll back in the transaction"
+               " manager");
+        DBUG_RETURN(1);
+      }
+      llstr(trn->trid, llbuf);
+      tprint(tracef, "Rolling back transaction of long id %s\n", llbuf);
+      last_undo= trn->undo_lsn + 1;
+
+      /* Execute all undo entries */
+      while (trn->undo_lsn)
+      {
+        TRANSLOG_HEADER_BUFFER rec;
+        LOG_DESC *log_desc;
+        /* Each executed undo must move undo_lsn strictly backwards */
+        DBUG_ASSERT(trn->undo_lsn < last_undo);
+        last_undo= trn->undo_lsn;
+
+        if (translog_read_record_header(trn->undo_lsn, &rec) ==
+            RECHEADER_READ_ERROR)
+          DBUG_RETURN(1);
+        log_desc= &log_record_type_descriptor[rec.type];
+        display_record_position(log_desc, &rec, 0);
+        if (log_desc->record_execute_in_undo_phase(&rec, trn))
+        {
+          eprint(tracef, "Got error %d when executing undo %s", my_errno,
+                 log_desc->name);
+          translog_free_record_header(&rec);
+          DBUG_RETURN(1);
+        }
+        translog_free_record_header(&rec);
+      }
+
+      if (trnman_rollback_trn(trn))
+        DBUG_RETURN(1);
+      /* We could want to span a few threads (4?) instead of 1 */
+      /* In the future, we want to have this phase *online* */
+    }
+  }
+  procent_printed= 0;
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Empties the transaction manager after a failed recovery, so that this
+  module does not assert.
+
+  Every transaction returned by trnman_get_any_trn() gets its undo_lsn and
+  first_undo_lsn reset to LSN_IMPOSSIBLE and is then passed to
+  trnman_rollback_trn(), whose result is ignored.
+
+  @note no checkpoint should be taken as those transactions matter for the
+  next recovery (they still haven't been properly dealt with).
+*/
+
+static void delete_all_transactions()
+{
+  TRN *victim;
+
+  while ((victim= trnman_get_any_trn()) != NULL)
+  {
+    victim->first_undo_lsn= LSN_IMPOSSIBLE;
+    victim->undo_lsn= LSN_IMPOSSIBLE;
+    /* best effort: a failure here is deliberately not reported */
+    trnman_rollback_trn(victim);
+  }
+}
+
+
+/**
+   @brief re-enables transactionality, updates is_of_horizon
+
+   If both the share's is_of_horizon and its lsn_of_file_id are older than
+   'horizon', is_of_horizon is moved to 'horizon' and the state is written
+   to the index file. Then the handler's state is refreshed from the share,
+   logging is re-enabled for the table and the handler's trn is cleared.
+
+   @param  info                table
+   @param  horizon             address to set is_of_horizon
+*/
+
+static void prepare_table_for_close(MARIA_HA *info, TRANSLOG_ADDRESS horizon)
+{
+  MARIA_SHARE *sh= info->s;
+
+  /*
+    When the REDO phase ran fully forward (no checkpoint record), the state
+    is at least as recent as the LSN of the current record. It can even be
+    more recent: a LOGREC_FILE_ID may ask us to close a table which is
+    modified again later in the log.
+    With a parsed checkpoint record the log may instead look like:
+    FILE_ID(6->t2)... FILE_ID(6->t1)... CHECKPOINT(6->t1)
+    Parsing the checkpoint opened t1 under id 6, and the first FILE_ID above
+    will make t1 close. The first test below is then false (taking the
+    checkpoint raised is_of_horizon), so things work; the second test is
+    there for safety.
+  */
+  if (!(cmp_translog_addr(sh->state.is_of_horizon, horizon) >= 0 ||
+        cmp_translog_addr(sh->lsn_of_file_id, horizon) >= 0))
+  {
+    sh->state.is_of_horizon= horizon;
+    _ma_state_info_write_sub(sh->kfile.file, &sh->state,
+                             MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET);
+  }
+
+  /*
+    _ma_reenable_logging_for_table() depends on info->state being current,
+    so copy the share's state into it first
+  */
+  *info->state= sh->state.state;
+
+  /*
+    PAGECACHE_PLAIN_PAGE pages stay in the cache although the table turns
+    transactional again. The table thus holds a mix of page types, which is
+    fine provided no checkpoint is taken before all tables are closed at the
+    end of the UNDO phase.
+  */
+  _ma_reenable_logging_for_table(info, FALSE);
+
+  /* safety */
+  info->trn= NULL;
+}
+
+
+/*
+  Finds the table a REDO record applies to and decides whether the record
+  must be applied.
+
+  The table's short id is read from the start of the record's header and
+  looked up in all_tables[]. For record types which address one page, the
+  page number is read right after the file id.
+
+  Returns the table's handler if the record should be applied, NULL if it
+  must be skipped because:
+  - no table is registered under that id,
+  - the record is not newer than the share's lsn_of_file_id,
+  - the record is not newer than the share's skip_redo_lsn,
+  - _ma_redo_not_needed_for_page() says so for a single-page record.
+*/
+static MARIA_HA *get_MARIA_HA_from_REDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec)
+{
+  uint16 sid;
+  pgcache_page_no_t page;
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+  char llbuf[22];
+  my_bool index_page_redo_entry= FALSE, page_redo_entry= FALSE;
+  LINT_INIT(page);
+
+  print_redo_phase_progress(rec->lsn);
+  sid= fileid_korr(rec->header);
+  switch (rec->type) {
+    /* not all REDO records have a page: */
+  case LOGREC_REDO_INDEX_NEW_PAGE:
+  case LOGREC_REDO_INDEX:
+  case LOGREC_REDO_INDEX_FREE_PAGE:
+    index_page_redo_entry= TRUE;
+    /* Fall through */
+  case LOGREC_REDO_INSERT_ROW_HEAD:
+  case LOGREC_REDO_INSERT_ROW_TAIL:
+  case LOGREC_REDO_PURGE_ROW_HEAD:
+  case LOGREC_REDO_PURGE_ROW_TAIL:
+  case LOGREC_REDO_NEW_ROW_HEAD:
+  case LOGREC_REDO_NEW_ROW_TAIL:
+  case LOGREC_REDO_FREE_HEAD_OR_TAIL:
+    page_redo_entry= TRUE;
+    page= page_korr(rec->header + FILEID_STORE_SIZE);
+    llstr(page, llbuf);
+    break;
+    /*
+      For REDO_FREE_BLOCKS, no need to look at dirty pages list: it does not
+      read data pages, only reads/modifies bitmap page(s) which is cheap.
+    */
+  default:
+    break;
+  }
+  tprint(tracef, "   For table of short id %u", sid);
+  info= all_tables[sid].info;
+#ifndef DBUG_OFF
+  /* All records of one group are expected to be for the same table */
+  DBUG_ASSERT(current_group_table == NULL || current_group_table == info);
+  current_group_table= info;
+#endif
+  if (info == NULL)
+  {
+    tprint(tracef, ", table skipped, so skipping record\n");
+    return NULL;
+  }
+  share= info->s;
+  tprint(tracef, ", '%s'", share->open_file_name.str);
+  DBUG_ASSERT(in_redo_phase);
+  if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0)
+  {
+    /*
+      This can happen only if processing a record before the checkpoint
+      record.
+      id->name mapping is newer than REDO record: for sure the table subject
+      of the REDO has been flushed and forced (id re-assignment implies this);
+      REDO can be ignored (and must be, as we don't know what this subject
+      table was).
+    */
+    DBUG_ASSERT(cmp_translog_addr(rec->lsn, checkpoint_start) < 0);
+    /* Ends the trace line, like the other "skipping record" messages */
+    tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+           " than record, skipping record\n",
+           LSN_IN_PARTS(share->lsn_of_file_id));
+    return NULL;
+  }
+  if (cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0)
+  {
+    /* probably a bulk insert repair */
+    tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than"
+           " record, skipping record\n",
+           LSN_IN_PARTS(share->state.skip_redo_lsn));
+    return NULL;
+  }
+  /* detect if an open instance of a dropped table (internal bug) */
+  DBUG_ASSERT(share->last_version != 0);
+  if (page_redo_entry)
+  {
+    /*
+      Consult dirty pages list.
+      REDO_INSERT_ROW_BLOBS will consult list by itself, as it covers several
+      pages.
+    */
+    tprint(tracef, " page %s", llbuf);
+    if (_ma_redo_not_needed_for_page(sid, rec->lsn, page,
+                                     index_page_redo_entry))
+      return NULL;
+  }
+  /*
+    So we are going to read the page, and if its LSN is older than the
+    record's we will modify the page
+  */
+  tprint(tracef, ", applying record\n");
+  _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+  return info;
+}
+
+
+/*
+  Finds the table an UNDO record applies to and decides whether the record
+  must be applied.
+
+  The table's short id is read from the record's header, after the stored
+  LSN, and looked up in all_tables[].
+
+  Returns the table's handler if the record should be applied, NULL if it
+  must be skipped because:
+  - no table is registered under that id,
+  - the record is not newer than the share's lsn_of_file_id,
+  - in the REDO phase only: the record is not newer than the share's
+    skip_redo_lsn.
+*/
+static MARIA_HA *get_MARIA_HA_from_UNDO_record(const
+                                               TRANSLOG_HEADER_BUFFER *rec)
+{
+  uint16 sid;
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+
+  sid= fileid_korr(rec->header + LSN_STORE_SIZE);
+  tprint(tracef, "   For table of short id %u", sid);
+  info= all_tables[sid].info;
+#ifndef DBUG_OFF
+  /* The same-table-per-group check only holds during the REDO phase */
+  DBUG_ASSERT(!in_redo_phase ||
+              current_group_table == NULL || current_group_table == info);
+  current_group_table= info;
+#endif
+  if (info == NULL)
+  {
+    tprint(tracef, ", table skipped, so skipping record\n");
+    return NULL;
+  }
+  share= info->s;
+  tprint(tracef, ", '%s'", share->open_file_name.str);
+  if (cmp_translog_addr(rec->lsn, share->lsn_of_file_id) <= 0)
+  {
+    /* Ends the trace line, like the other "skipping record" messages */
+    tprint(tracef, ", table's LOGREC_FILE_ID has LSN (%lu,0x%lx) more recent"
+           " than record, skipping record\n",
+           LSN_IN_PARTS(share->lsn_of_file_id));
+    return NULL;
+  }
+  if (in_redo_phase &&
+      cmp_translog_addr(rec->lsn, share->state.skip_redo_lsn) <= 0)
+  {
+    /* probably a bulk insert repair */
+    tprint(tracef, ", has skip_redo_lsn (%lu,0x%lx) more recent than"
+           " record, skipping record\n",
+           LSN_IN_PARTS(share->state.skip_redo_lsn));
+    return NULL;
+  }
+  /* same check as for REDO records: the share must not be a dropped table */
+  DBUG_ASSERT(share->last_version != 0);
+  _ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE); /* to flush state on close */
+  tprint(tracef, ", applying record\n");
+  return info;
+}
+
+
+/**
+   @brief Parses checkpoint record.
+
+   Builds from it the dirty_pages list (a hash), opens tables and maps them to
+   their 2-byte IDs, recreates transactions (not real TRNs though).
+
+   The record body is read into log_record_buffer and then consumed
+   sequentially through 'ptr'; at the end 'ptr' must sit exactly at the end
+   of the buffer, otherwise the record is reported as corrupted.
+   As side effects this sets max_long_trid, checkpoint_start, all_dirty_pages
+   and dirty_pages_pool.
+
+   @param  lsn  LSN of the checkpoint record
+
+   @return LSN from where in the log the REDO phase should start
+     @retval LSN_ERROR error
+     @retval other     ok
+*/
+
+static LSN parse_checkpoint_record(LSN lsn)
+{
+  ulong i;
+  ulonglong nb_dirty_pages;
+  TRANSLOG_HEADER_BUFFER rec;
+  TRANSLOG_ADDRESS start_address;
+  int len;
+  uint nb_active_transactions, nb_committed_transactions, nb_tables;
+  uchar *ptr;
+  LSN minimum_rec_lsn_of_active_transactions, minimum_rec_lsn_of_dirty_pages;
+  struct st_dirty_page *next_dirty_page_in_pool;
+
+  tprint(tracef, "Loading data from checkpoint record at LSN (%lu,0x%lx)\n",
+         LSN_IN_PARTS(lsn));
+  if ((len= translog_read_record_header(lsn, &rec)) == RECHEADER_READ_ERROR)
+  {
+    tprint(tracef, "Cannot find checkpoint record where it should be\n");
+    return LSN_ERROR;
+  }
+
+  enlarge_buffer(&rec);
+  if (log_record_buffer.str == NULL ||
+      translog_read_record(rec.lsn, 0, rec.record_length,
+                           log_record_buffer.str, NULL) !=
+      rec.record_length)
+  {
+    eprint(tracef, "Failed to read record");
+    return LSN_ERROR;
+  }
+
+  /* The record starts with the checkpoint's start address */
+  ptr= log_record_buffer.str;
+  start_address= lsn_korr(ptr);
+  ptr+= LSN_STORE_SIZE;
+  tprint(tracef, "Checkpoint record has start_horizon at (%lu,0x%lx)\n",
+         LSN_IN_PARTS(start_address));
+
+  /* transactions */
+  /* 2-byte count, then the minimum rec_lsn, then the maximum long trid */
+  nb_active_transactions= uint2korr(ptr);
+  ptr+= 2;
+  tprint(tracef, "%u active transactions\n", nb_active_transactions);
+  minimum_rec_lsn_of_active_transactions= lsn_korr(ptr);
+  ptr+= LSN_STORE_SIZE;
+  max_long_trid= transid_korr(ptr);
+  ptr+= TRANSID_SIZE;
+
+  /*
+    how much brain juice and discussions there was to come to writing this
+    line. It may make start_address slightly decrease (only by the time it
+    takes to write one or a few rows, roughly).
+  */
+  tprint(tracef, "Checkpoint record has min_rec_lsn of active transactions"
+         " at (%lu,0x%lx)\n",
+         LSN_IN_PARTS(minimum_rec_lsn_of_active_transactions));
+  set_if_smaller(start_address, minimum_rec_lsn_of_active_transactions);
+
+  /*
+    One entry per active transaction: 2-byte short id, 6-byte long id,
+    undo_lsn, first_undo_lsn
+  */
+  for (i= 0; i < nb_active_transactions; i++)
+  {
+    uint16 sid= uint2korr(ptr);
+    TrID long_id;
+    LSN undo_lsn, first_undo_lsn;
+    ptr+= 2;
+    long_id= uint6korr(ptr);
+    ptr+= 6;
+    DBUG_ASSERT(sid > 0 && long_id > 0);
+    undo_lsn= lsn_korr(ptr);
+    ptr+= LSN_STORE_SIZE;
+    first_undo_lsn= lsn_korr(ptr);
+    ptr+= LSN_STORE_SIZE;
+    new_transaction(sid, long_id, undo_lsn, first_undo_lsn);
+  }
+  /* 4-byte count of committed transactions; their entries are stepped over */
+  nb_committed_transactions= uint4korr(ptr);
+  ptr+= 4;
+  tprint(tracef, "%lu committed transactions\n",
+         (ulong)nb_committed_transactions);
+  /* no purging => committed transactions are not important */
+  ptr+= (6 + LSN_STORE_SIZE) * nb_committed_transactions;
+
+  /* tables  */
+  /*
+    4-byte count, then one entry per table: 2-byte short id, LSN, and the
+    table's name as a NUL-terminated string
+  */
+  nb_tables= uint4korr(ptr);
+  ptr+= 4;
+  tprint(tracef, "%u open tables\n", nb_tables);
+  for (i= 0; i< nb_tables; i++)
+  {
+    char name[FN_REFLEN];
+    LSN first_log_write_lsn;
+    uint name_len;
+    uint16 sid= uint2korr(ptr);
+    ptr+= 2;
+    DBUG_ASSERT(sid > 0);
+    first_log_write_lsn= lsn_korr(ptr);
+    ptr+= LSN_STORE_SIZE;
+    /*
+      ptr advances by the full stored length (including the NUL), while the
+      local copy is limited to the size of 'name'
+    */
+    name_len= strlen((char *)ptr) + 1;
+    strmake(name, (char *)ptr, sizeof(name)-1);
+    ptr+= name_len;
+    if (new_table(sid, name, first_log_write_lsn))
+      return LSN_ERROR;
+  }
+
+  /* dirty pages */
+  nb_dirty_pages= uint8korr(ptr);
+
+  /* Ensure casts later will not lose significant bits. */
+  DBUG_ASSERT((nb_dirty_pages <= SIZE_T_MAX/sizeof(struct st_dirty_page)) &&
+              (nb_dirty_pages <= ULONG_MAX));
+
+  ptr+= 8;
+  tprint(tracef, "%lu dirty pages\n", (ulong) nb_dirty_pages);
+  /* The hash is keyed on the file_and_page_id member of st_dirty_page */
+  if (hash_init(&all_dirty_pages, &my_charset_bin, (ulong)nb_dirty_pages,
+                offsetof(struct st_dirty_page, file_and_page_id),
+                sizeof(((struct st_dirty_page *)NULL)->file_and_page_id),
+                NULL, NULL, 0))
+    return LSN_ERROR;
+  /* All hash elements come from this single allocation */
+  dirty_pages_pool=
+    (struct st_dirty_page *)my_malloc((size_t)nb_dirty_pages *
+                                      sizeof(struct st_dirty_page),
+                                      MYF(MY_WME));
+  if (unlikely(dirty_pages_pool == NULL))
+    return LSN_ERROR;
+  next_dirty_page_in_pool= dirty_pages_pool;
+  minimum_rec_lsn_of_dirty_pages= LSN_MAX;
+  if (maria_recovery_verbose)
+    tprint(tracef, "Table_id  Is_index       Page_id    Rec_lsn\n");
+  /*
+    One entry per dirty page: 2-byte table id, 1-byte index flag, page
+    number, rec_lsn
+  */
+  for (i= 0; i < nb_dirty_pages ; i++)
+  {
+    pgcache_page_no_t page_id;
+    LSN rec_lsn;
+    uint32 is_index;
+    uint16 table_id= uint2korr(ptr);
+    ptr+= 2;
+    is_index= ptr[0];
+    ptr++;
+    page_id= page_korr(ptr);
+    ptr+= PAGE_STORE_SIZE;
+    rec_lsn= lsn_korr(ptr);
+    ptr+= LSN_STORE_SIZE;
+    /* The index flag is placed above the 16 bits of the table id */
+    if (new_page((is_index << 16) | table_id,
+                 page_id, rec_lsn, next_dirty_page_in_pool++))
+      return LSN_ERROR;
+    if (maria_recovery_verbose)
+      tprint(tracef, "%8u  %8u  %12lu    %lu,0x%lx\n", (uint) table_id,
+             (uint) is_index, (ulong) page_id, LSN_IN_PARTS(rec_lsn));
+    set_if_smaller(minimum_rec_lsn_of_dirty_pages, rec_lsn);
+  }
+  /* after that, there will be no insert/delete into the hash */
+  /*
+    sanity check on record (did we screw up with all those "ptr+=", did the
+    checkpoint write code and checkpoint read code go out of sync?).
+  */
+  if (ptr != (log_record_buffer.str + log_record_buffer.length))
+  {
+    eprint(tracef, "checkpoint record corrupted\n");
+    return LSN_ERROR;
+  }
+
+  /*
+    start_address is now from where the dirty pages list can be ignored.
+    Find LSN higher or equal to this TRANSLOG_ADDRESS, suitable for
+    translog_read_record() functions.
+  */
+  start_address= checkpoint_start=
+    translog_next_LSN(start_address, LSN_IMPOSSIBLE);
+  tprint(tracef, "Checkpoint record start_horizon now adjusted to"
+         " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(start_address));
+  if (checkpoint_start == LSN_IMPOSSIBLE)
+  {
+    /*
+      There must be a problem, as our checkpoint record exists and is >= the
+      address which is stored in its first bytes, which is >= start_address.
+    */
+    return LSN_ERROR;
+  }
+  /* now, where the REDO phase should start reading log: */
+  tprint(tracef, "Checkpoint has min_rec_lsn of dirty pages at"
+         " LSN (%lu,0x%lx)\n", LSN_IN_PARTS(minimum_rec_lsn_of_dirty_pages));
+  /* checkpoint_start keeps the adjusted value; only the return value moves */
+  set_if_smaller(start_address, minimum_rec_lsn_of_dirty_pages);
+  DBUG_PRINT("info",
+             ("checkpoint_start: (%lu,0x%lx) start_address: (%lu,0x%lx)",
+              LSN_IN_PARTS(checkpoint_start), LSN_IN_PARTS(start_address)));
+  return start_address;
+}
+
+
+static int new_page(uint32 fileid, pgcache_page_no_t pageid, LSN rec_lsn,
+                    struct st_dirty_page *dirty_page)
+{
+  /* key used by the hash: file id shifted left by 40 bits, OR-ed with page */
+  const uint64 hash_key= (((uint64) fileid) << 40) | pageid;
+
+  dirty_page->file_and_page_id= hash_key;
+  dirty_page->rec_lsn= rec_lsn;
+  return my_hash_insert(&all_dirty_pages, (uchar *) dirty_page);
+}
+
+/**
+   @brief Closes all tables present in maria_open_list.
+
+   Each table is passed to prepare_table_for_close() with the current log
+   horizon and then to maria_close(). THR_LOCK_maria is held while the list
+   is walked and released around the close of each table. If tracef is not
+   stdout, a countdown of the tables still to be closed goes to stderr.
+
+   @return Bitwise OR of the maria_close() results
+     @retval 0      list was empty, or every maria_close() returned 0
+     @retval !=0    at least one maria_close() returned non-zero
+*/
+static int close_all_tables(void)
+{
+  int error= 0;
+  uint count= 0;              /* number of open tables, for the countdown */
+  LIST *list_element, *next_open;
+  MARIA_HA *info;
+  TRANSLOG_ADDRESS addr;
+  DBUG_ENTER("close_all_tables");
+
+  pthread_mutex_lock(&THR_LOCK_maria);
+  if (maria_open_list == NULL)
+    goto end;
+  tprint(tracef, "Closing all tables\n");
+  if (tracef != stdout)
+  {
+    if (recovery_message_printed == REC_MSG_NONE)
+      print_preamble();
+    for (count= 0, list_element= maria_open_list ;
+         list_element ; count++, (list_element= list_element->next))
+      ;                       /* empty body: the loop only counts elements */
+    fprintf(stderr, "tables to flush:");
+    recovery_message_printed= REC_MSG_FLUSH;
+  }
+  /*
+    Since the end of end_of_redo_phase(), we may have written new records
+    (if UNDO phase ran) and thus the state is newer than at
+    end_of_redo_phase(), we need to bump is_of_horizon again.
+  */
+  addr= translog_get_horizon();
+  for (list_element= maria_open_list ; ; list_element= next_open)
+  {
+    if (recovery_message_printed == REC_MSG_FLUSH)
+    {
+      /* printed before the end-of-list test, so the last pass prints too */
+      fprintf(stderr, " %u", count--);
+      fflush(stderr);
+    }
+    if (list_element == NULL)
+      break;
+    next_open= list_element->next; /* read now; not touched after the close */
+    info= (MARIA_HA*)list_element->data;
+    pthread_mutex_unlock(&THR_LOCK_maria); /* ok, UNDO phase not online yet */
+    /*
+      Tables which we see here are exactly those which were open at time of
+      crash. They might have open_count>0 as Checkpoint maybe flushed their
+      state while they were used. As Recovery corrected them, don't alarm the
+      user, don't ask for a table check:
+    */
+    if (info->s->state.open_count != 0)
+    {
+      /* let ma_close() mark the table properly closed */
+      info->s->state.open_count= 1;
+      info->s->global_changed= 1;
+    }
+    prepare_table_for_close(info, addr);
+    error|= maria_close(info);
+    pthread_mutex_lock(&THR_LOCK_maria); /* re-taken for the next pass */
+  }
+end:
+  pthread_mutex_unlock(&THR_LOCK_maria);
+  DBUG_RETURN(error);
+}
+
+
+/**
+   @brief Closes every instance of the named table found in all_tables.
+
+   Each slot of all_tables whose table has an open_file_name equal to
+   'name' is prepared for close, closed, and then emptied.
+
+   @param  name                Name of table
+   @param  addr                Log address passed to prepare_table_for_close()
+
+   @return Operation status
+     @retval 0      no maria_close() failed (also if no instance matched)
+     @retval 1      maria_close() failed for at least one instance
+*/
+
+static my_bool close_one_table(const char *name, TRANSLOG_ADDRESS addr)
+{
+  /* There are no other threads using the tables, so we don't need any locks */
+  my_bool failed= 0;
+  uint slot;
+
+  for (slot= 0; slot <= SHARE_ID_MAX; slot++)
+  {
+    struct st_table_for_recovery *entry= all_tables + slot;
+    MARIA_HA *table= entry->info;
+
+    /* skip empty slots and tables with another name */
+    if (table == NULL || strcmp(table->s->open_file_name.str, name) != 0)
+      continue;
+    prepare_table_for_close(table, addr);
+    if (maria_close(table))
+      failed= 1;
+    entry->info= NULL;
+  }
+  return failed;
+}
+
+
+/**
+   Temporarily disables logging for this table.
+
+   If that makes the log incomplete, writes a LOGREC_INCOMPLETE_LOG to the log
+   to warn log readers.
+
+   Besides clearing share->now_transactional, this points info->state at
+   share->state.common, sets info->switched_transactional, switches the
+   share to PAGECACHE_PLAIN_PAGE and re-installs the pagecache callbacks of
+   the data, index and bitmap files.
+
+   @param  info            table
+   @param  log_incomplete  if that disabling makes the log incomplete
+
+   @note for example in the REDO phase we disable logging but that does not
+   make the log incomplete.
+*/
+
+void _ma_tmp_disable_logging_for_table(MARIA_HA *info,
+                                       my_bool log_incomplete)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_tmp_disable_logging_for_table");
+  if (log_incomplete)
+  {
+    uchar log_data[FILEID_STORE_SIZE]; /* not filled in here; presumably the
+                                          log writer stores the file id in it
+                                          - TODO confirm */
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 1];
+    LSN lsn;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+    translog_write_record(&lsn, LOGREC_INCOMPLETE_LOG,
+                          &dummy_transaction_object, info,
+                          (translog_size_t) sizeof(log_data),
+                          TRANSLOG_INTERNAL_PARTS + 1, log_array,
+                          log_data, NULL); /* NOTE(review): result unchecked */
+  }
+
+  /* if we disabled before writing the record, record wouldn't reach log */
+  share->now_transactional= FALSE;
+
+  /*
+    Reset state pointers. This is needed as in ALTER table we may do
+    commit followed by _ma_reenable_logging_for_table and then
+    info->state may point to a state that was deleted by
+    _ma_trnman_end_trans_hook()
+   */
+  share->state.common= *info->state;
+  info->state= &share->state.common;
+  info->switched_transactional= TRUE;
+
+  /*
+    Some code in ma_blockrec.c assumes a trn even if !now_transactional but in
+    this case it only reads trn->rec_lsn, which has to be LSN_IMPOSSIBLE and
+    should be now. info->trn may be NULL in maria_chk.
+  */
+  if (info->trn == NULL)
+    info->trn= &dummy_transaction_object;
+  DBUG_ASSERT(info->trn->rec_lsn == LSN_IMPOSSIBLE);
+  share->page_type= PAGECACHE_PLAIN_PAGE;
+  /* Functions below will pick up now_transactional and change callbacks */
+  _ma_set_data_pagecache_callbacks(&info->dfile, share);
+  _ma_set_index_pagecache_callbacks(&share->kfile, share);
+  _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share);
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   Re-enables logging for a table which had it temporarily disabled.
+
+   Only the thread which disabled logging is allowed to reenable it. Indeed,
+   re-enabling logging affects all open instances, one must have exclusive
+   access to the table to do that. In practice, the one which disables has
+   such access.
+
+   Nothing is done unless this handle has switched_transactional set and the
+   share's now_transactional differs from base.born_transactional.
+
+   @param  info            table
+   @param  flush_pages     if function needs to flush pages first
+
+   @return Operation status
+     @retval 0      OK (also when there was nothing to do)
+     @retval 1      flushing the table files, writing the state or syncing
+                    the table files failed
+*/
+
+my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_reenable_logging_for_table");
+
+  if (share->now_transactional == share->base.born_transactional ||
+      !info->switched_transactional)
+    DBUG_RETURN(0);
+  info->switched_transactional= FALSE;
+
+  /* assignment, not comparison: go back to the mode the table was born with */
+  if ((share->now_transactional= share->base.born_transactional))
+  {
+    share->page_type= PAGECACHE_LSN_PAGE;
+
+    /*
+      Copy state information that was updated while the table was used
+      in not transactional mode
+    */
+    _ma_copy_nontrans_state_information(info);
+    _ma_reset_history(info->s);
+
+    if (flush_pages)
+    {
+      /*
+        We are going to change callbacks; if a page is flushed at this moment
+        this can cause race conditions, that's one reason to flush pages
+        now. Other reasons: a checkpoint could be running and miss pages; the
+        pages have type PAGECACHE_PLAIN_PAGE which should not remain. As
+        there are no REDOs for pages, them, bitmaps and the state also have to
+        be flushed and synced.
+      */
+      if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                                FLUSH_RELEASE, FLUSH_RELEASE) ||
+          _ma_state_info_write(share,
+                               MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                               MA_STATE_INFO_WRITE_LOCK) ||
+          _ma_sync_table_files(info))
+        DBUG_RETURN(1);
+    }
+    else if (!maria_in_recovery)
+    {
+      /*
+        Except in Recovery, we mustn't leave dirty pages (see comments above).
+        Note that this does not verify that the state was flushed, but hey.
+      */
+      pagecache_file_no_dirty_page(share->pagecache, &info->dfile);
+      pagecache_file_no_dirty_page(share->pagecache, &share->kfile);
+    }
+    _ma_set_data_pagecache_callbacks(&info->dfile, share);
+    _ma_set_index_pagecache_callbacks(&share->kfile, share);
+    _ma_bitmap_set_pagecache_callbacks(&share->bitmap.file, share);
+    /*
+      info->trn was not changed in the disable/enable combo, so that it's
+      still usable in this kind of combination:
+      external_lock;
+      start_bulk_insert; # table is empty, disables logging
+      end_bulk_insert;   # enables logging
+      start_bulk_insert; # table is not empty, logging stays
+                         # so rows insertion needs the real trn.
+      as happens during row-based replication on the slave.
+    */
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Prints to stderr how far through the log the given address is.
+
+   Does nothing when tracef is stdout. If no recovery message was printed
+   yet, prints the preamble and "recovered pages: 0%". The first call
+   remembers the log horizon as the end point and the distance to it as the
+   100% reference. A new percentage is printed each time it is at least 10
+   points above the last one printed.
+
+   @param  addr            log address to measure progress at
+*/
+static void print_redo_phase_progress(TRANSLOG_ADDRESS addr)
+{
+  static uint end_logno= FILENO_IMPOSSIBLE, percentage_printed= 0;
+  static ulong end_offset;
+  static ulonglong initial_remainder= ~(ulonglong) 0;
+
+  uint cur_logno;
+  ulong cur_offset;
+  ulonglong local_remainder;
+  uint percentage_done;
+
+  if (tracef == stdout)
+    return;
+  if (recovery_message_printed == REC_MSG_NONE)
+  {
+    print_preamble();
+    fprintf(stderr, "recovered pages: 0%%");
+    fflush(stderr);
+    procent_printed= 1;
+    recovery_message_printed= REC_MSG_REDO;
+  }
+  if (end_logno == FILENO_IMPOSSIBLE)
+  {
+    LSN end_addr= translog_get_horizon();
+    end_logno= LSN_FILE_NO(end_addr);
+    end_offset= LSN_OFFSET(end_addr);
+  }
+  cur_logno= LSN_FILE_NO(addr);
+  cur_offset= LSN_OFFSET(addr);
+  /*
+    NOTE(review): both log numbers are unsigned, so if max() is the usual
+    comparison macro the clamp to 0 presumably never takes effect - confirm.
+  */
+  local_remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) :
+    (((longlong)log_file_size) - cur_offset +
+     max(end_logno - cur_logno - 1, 0) * ((longlong)log_file_size) +
+     end_offset);
+  if (initial_remainder == (ulonglong)(-1))
+    initial_remainder= local_remainder;
+  /*
+    If the first measured distance was 0 there is no progress to report, and
+    the percentage below would divide by zero.
+  */
+  if (initial_remainder == 0)
+    return;
+  percentage_done= (uint) ((initial_remainder - local_remainder) * ULL(100) /
+                           initial_remainder);
+  if ((percentage_done - percentage_printed) >= 10)
+  {
+    percentage_printed= percentage_done;
+    fprintf(stderr, " %u%%", percentage_done);
+    fflush(stderr);
+    procent_printed= 1;
+  }
+}
+
+
+#ifdef MARIA_EXTERNAL_LOCKING
+#error Marias Checkpoint and Recovery are really not ready for it
+#endif
+
+/*
+Recovery of the state :  how it works
+=====================================
+
+Here we ignore Checkpoints for a start.
+
+The state (MARIA_HA::MARIA_SHARE::MARIA_STATE_INFO) is updated in
+memory frequently (at least at every row write/update/delete) but goes
+to disk at few moments: maria_close() when closing the last open
+instance, and a few rare places like CHECK/REPAIR/ALTER
+(non-transactional tables also do it at maria_lock_database() but we
+needn't cover them here).
+
+In case of crash, state on disk is likely to be older than what it was
+in memory, the REDO phase needs to recreate the state as it was in
+memory at the time of crash. When we say Recovery here we will always
+mean "REDO phase".
+
+For example MARIA_STATUS_INFO::records (count of records). It is updated at
+the end of every row write/update/delete/delete_all. When Recovery sees the
+sign of such row operation (UNDO or REDO), it may need to update the records'
+count if that count does not reflect that operation (is older). How to know
+the age of the state compared to the log record: every time the state
+goes to disk at runtime, its member "is_of_horizon" is updated to the
+current end-of-log horizon. So Recovery just needs to compare is_of_horizon
+and the record's LSN to know if it should modify "records".
+
+Other operations like ALTER TABLE DISABLE KEYS update the state but
+don't write log records, thus the REDO phase cannot repeat their
+effect on the state in case of crash. But we make them sync the state
+as soon as they have finished. This reduces the window for a problem.
+
+It looks like only one thread at a time updates the state in memory or
+on disk. We assume that the upper level (normally MySQL) has protection
+against issuing HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME) so that these
+are not issued while there are any running transactions on the given table.
+If this is not done, we may write a corrupted state to disk.
+
+With checkpoints
+================
+
+Checkpoint module needs to read the state in memory and write it to
+disk. This may happen while some other thread is modifying the state
+in memory or on disk. Checkpoint thus may be reading changing data, it
+needs a mutex to not have it corrupted, and concurrent modifiers of
+the state need that mutex too for the same reason.
+"records" is modified for every row write/update/delete, we don't want
+to add a mutex lock/unlock there. So we re-use the mutex lock/unlock
+which is already present in these moments, namely the log's mutex which is
+taken when UNDO_ROW_INSERT|UPDATE|DELETE is written: we update "records" in
+under-log-mutex hooks when writing these records (thus "records" is
+not updated at the end of maria_write/update/delete() anymore).
+Thus Checkpoint takes the log's lock and can read "records" from
+memory and write it to disk and release log's lock.
+We however want to avoid having the disk write under the log's
+lock. So it has to be under another mutex, natural choice is
+intern_lock (as Checkpoint needs it anyway to read MARIA_SHARE::kfile,
+and as maria_close() takes it too). All state writes to disk are
+changed to be protected with intern_lock.
+So Checkpoint takes intern_lock, log's lock, reads "records" from
+memory, releases log's lock, updates is_of_horizon and writes "records" to
+disk, release intern_lock.
+In practice, not only "records" needs to be written but the full
+state. So, Checkpoint reads the full state from memory. Some other
+thread may at this moment be modifying in memory some pieces of the
+state which are not protected by the log's lock (see ma_extra.c
+HA_EXTRA_NO_KEYS), and Checkpoint would be reading a corrupted state
+from memory; to guard against that we extend the intern_lock-zone to
+changes done to the state in memory by HA_EXTRA_NO_KEYS et al, and
+also any change made in memory to create_rename_lsn/state_is_of_horizon.
+Last, we don't want in Checkpoint to do
+ log lock; read state from memory; release log lock;
+for each table, it may hold the log's lock too much in total.
+So, we instead do
+ log lock; read N states from memory; release log lock;
+Thus, the sequence above happens outside of any intern_lock.
+But this re-introduces the problem that some other thread may be changing the
+state in memory and on disk under intern_lock, without log's lock, like
+HA_EXTRA_NO_KEYS, while we read the N states. However, when Checkpoint later
+comes to handling the table under intern_lock, which is serialized with
+HA_EXTRA_NO_KEYS, it can see that is_of_horizon is higher than when the state
+was read from memory under log's lock, and thus can decide to not flush the
+obsolete state it has, knowing that the other thread flushed a more recent
+state already. If on the other hand is_of_horizon is not higher, the read
+state is current and can be flushed. So we have a per-table sequence:
+ lock intern_lock; test if is_of_horizon is higher than when we read the state
+ under log's lock; if no then flush the read state to disk.
+*/
+
+/* some comments and pseudo-code which we keep for later */
+#if 0
+  /*
+    MikaelR suggests: support checkpoints during REDO phase too: do checkpoint
+    after a certain amount of log records have been executed. This helps
+    against repeated crashes. Those checkpoints could not be user-requested
+    (as engine is not communicating during the REDO phase), so they would be
+    automatic: this changes the original assumption that we don't write to the
+    log while in the REDO phase, but why not. How often should we checkpoint?
+  */
+
+  /*
+    We want to have two steps:
+    engine->recover_with_max_memory();
+    next_engine->recover_with_max_memory();
+    engine->init_with_normal_memory();
+    next_engine->init_with_normal_memory();
+    So: in recover_with_max_memory() allocate a giant page cache, do REDO
+    phase, then all page cache is flushed and emptied and freed (only retain
+    small structures like TM): take full checkpoint, which is useful if
+    next engine crashes in its recovery the next second.
+    Destroy all shares (maria_close()), then at init_with_normal_memory() we
+    do this:
+  */
+
+  /**** UNDO PHASE *****/
+
+  /*
+    Launch one or more threads to do the background rollback. Don't wait for
+    them to complete their rollback (background rollback; for debugging, we
+    can have an option which waits). Set a counter (total_of_rollback_threads)
+    to the number of threads to launch.
+
+    Note that InnoDB's rollback-in-background works as long as InnoDB is the
+    last engine to recover, otherwise MySQL will refuse new connections until
+    the last engine has recovered so it's not "background" from the user's
+    point of view. InnoDB is near top of sys_table_types so all others
+    (e.g. BDB) recover after it... So it's really "online rollback" only if
+    InnoDB is the only engine.
+  */
+
+  /* wake up delete/update handler */
+  /* tell the TM that it can now accept new transactions */
+
+  /*
+    mark that checkpoint requests are now allowed.
+  */
+#endif
diff --git a/storage/maria/ma_recovery.h b/storage/maria/ma_recovery.h
new file mode 100644
index 00000000000..0bfcdd17d39
--- /dev/null
+++ b/storage/maria/ma_recovery.h
@@ -0,0 +1,33 @@
+/* Copyright (C) 2006,2007 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3072 Maria recovery
+  First version written by Guilhem Bichot on 2006-04-27.
+*/
+
+/* This is the interface of this module. */
+
+/* Performs recovery of the engine at start */
+
+C_MODE_START
+enum maria_apply_log_way
+{ MARIA_LOG_APPLY, MARIA_LOG_DISPLAY_HEADER, MARIA_LOG_CHECK };
+int maria_recovery_from_log(void);
+int maria_apply_log(LSN lsn, LSN lsn_end, enum maria_apply_log_way apply,
+                    FILE *trace_file,
+                    my_bool execute_undo_phase, my_bool skip_DDLs,
+                    my_bool take_checkpoints, uint *warnings_count);
+C_MODE_END
diff --git a/storage/maria/ma_recovery_util.c b/storage/maria/ma_recovery_util.c
new file mode 100644
index 00000000000..19e61daf4ef
--- /dev/null
+++ b/storage/maria/ma_recovery_util.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2006,2007,2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Q: Why isn't ma_recovery_util.c simply moved to ma_recovery.c ?
+
+  A: ma_recovery.c, because it invokes objects from ma_check.c (like
+  maria_chk_init()) causes the following problem:
+  if a source file a.c of a program invokes a function defined in
+  ma_recovery.c, then a.o depends on ma_recovery.o which depends on
+  ma_check.o: linker thus brings in ma_check.o. That brings in the
+  dependencies of ma_check.o which are definitions of _ma_check_print_info()
+  etc; if a.o does not define them then the ones of ha_maria.o are used
+  i.e. ha_maria.o is linked into the program, and this brings in dependencies
+  of ha_maria.o on mysqld.o into the program's linking which thus fails, as
+  the program is not linked with mysqld.o.
+  Thus, while several functions defined in ma_recovery.c could be useful to
+  other files, they cannot be used by them.
+  So we are going to gradually move a great share of ma_recovery.c's exported
+  functions into the present file, to isolate the problematic components and
+  avoid the problem.
+*/
+
+#include "maria_def.h"
+
+HASH all_dirty_pages;
+struct st_dirty_page /* used only in the REDO phase */
+{
+  uint64 file_and_page_id;
+  LSN rec_lsn;
+};
+/*
+  LSN after which dirty pages list does not apply. Can be slightly before
+  when ma_checkpoint_execute() started.
+*/
+LSN checkpoint_start= LSN_IMPOSSIBLE;
+
+/** @todo looks like duplicate of recovery_message_printed */
+my_bool procent_printed;
+FILE *tracef; /**< trace file for debugging */
+
+
+/**
+   @brief Prints a formatted message to a trace file, if there is one.
+
+   In debug builds the message (cut to fit a 1024-byte buffer) also goes to
+   the DBUG trace, whether or not trace_file is NULL. If procent_printed is
+   set it is cleared and a newline is written before the message.
+*/
+void tprint(FILE *trace_file __attribute__ ((unused)),
+            const char *format __attribute__ ((unused)), ...)
+{
+  va_list ap;
+#ifndef DBUG_OFF
+  {
+    char dbug_text[1024];
+    va_start(ap, format);
+    vsnprintf(dbug_text, sizeof(dbug_text)-1, format, ap);
+    DBUG_PRINT("info", ("%s", dbug_text));
+    va_end(ap);
+  }
+#endif
+  if (trace_file == NULL)
+    return;
+
+  if (procent_printed)
+  {
+    fputc('\n', trace_file);
+    procent_printed= 0;
+  }
+  va_start(ap, format);
+  vfprintf(trace_file, format, ap);
+  va_end(ap);
+}
+
+
+/**
+   @brief Prints an error message to the trace file, or to stderr if there is
+   no trace file.
+
+   A newline is appended to the message. When the message went to a trace
+   file other than stderr it is also reported through my_printv_error() with
+   code HA_ERR_INITIALIZATION.
+*/
+void eprint(FILE *trace_file __attribute__ ((unused)),
+            const char *format __attribute__ ((unused)), ...)
+{
+  va_list args;
+  DBUG_PRINT("error", ("%s", format));
+  if (!trace_file)
+    trace_file= stderr;
+
+  if (procent_printed)
+  {
+    /* In silent mode, print on another line than the 0% 10% 20% line */
+    procent_printed= 0;
+    fputc('\n', trace_file);
+  }
+  va_start(args, format);
+  vfprintf(trace_file , format, args);
+  /* args is indeterminate after vfprintf(); end it before any re-start */
+  va_end(args);
+  fputc('\n', trace_file);
+  if (trace_file != stderr)
+  {
+    va_start(args, format);
+    my_printv_error(HA_ERR_INITIALIZATION, format, MYF(0), args);
+    va_end(args);
+  }
+  fflush(trace_file);
+}
+
+
+/**
+   Tells if the dirty pages list found in checkpoint record allows to ignore a
+   REDO for a certain page.
+
+   @param  shortid         short id of the table
+   @param  lsn             REDO record's LSN
+   @param  page            page number
+   @param  index           TRUE if index page, FALSE if data page
+
+   @return TRUE if the REDO can be ignored: its LSN is before
+   checkpoint_start and the page is either absent from all_dirty_pages or
+   listed there with a rec_lsn above the REDO's LSN. FALSE otherwise.
+*/
+
+my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn,
+                                     pgcache_page_no_t page,
+                                     my_bool index)
+{
+  uint64 hash_key;
+  struct st_dirty_page *found;
+
+  /* the list is only consulted for records older than checkpoint_start */
+  if (cmp_translog_addr(lsn, checkpoint_start) >= 0)
+    return FALSE;
+
+  /*
+    64-bit key is formed like this:
+    Most significant byte: 0 if data page, 1 if index page
+    Next 2 bytes: table's short id
+    Next 5 bytes: page number
+  */
+  hash_key= (((uint64)((index << 16) | shortid)) << 40) | page;
+  found= (struct st_dirty_page *)
+    hash_search(&all_dirty_pages, (uchar *)&hash_key, sizeof(hash_key));
+  DBUG_PRINT("info", ("in dirty pages list: %d", found != NULL));
+
+  /* a listed page whose rec_lsn is not above this LSN still needs the REDO */
+  if ((found != NULL) && cmp_translog_addr(lsn, found->rec_lsn) >= 0)
+    return FALSE;
+
+  tprint(tracef, ", ignoring because of dirty_pages list\n");
+  return TRUE;
+}
diff --git a/storage/maria/ma_recovery_util.h b/storage/maria/ma_recovery_util.h
new file mode 100644
index 00000000000..a35fea84fe9
--- /dev/null
+++ b/storage/maria/ma_recovery_util.h
@@ -0,0 +1,37 @@
+/* Copyright (C) 2006,2007,2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+struct st_dirty_page /* used only in the REDO phase */
+{
+  uint64 file_and_page_id;
+  LSN rec_lsn;
+};
+extern HASH all_dirty_pages;
+/*
+  LSN after which dirty pages list does not apply. Can be slightly before
+  when ma_checkpoint_execute() started.
+*/
+extern LSN checkpoint_start;
+extern my_bool procent_printed;
+extern FILE *tracef;
+
+
+my_bool _ma_redo_not_needed_for_page(uint16 shortid, LSN lsn,
+                                     pgcache_page_no_t page,
+                                     my_bool index);
+void tprint(FILE *trace_file, const char *format, ...)
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+void eprint(FILE *trace_file, const char *format, ...)
+  ATTRIBUTE_FORMAT(printf, 2, 3);
diff --git a/storage/maria/ma_rename.c b/storage/maria/ma_rename.c
new file mode 100644
index 00000000000..380f3da3c46
--- /dev/null
+++ b/storage/maria/ma_rename.c
@@ -0,0 +1,135 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Rename a table
+*/
+
+#include "ma_fulltext.h"
+#include "trnman_public.h"
+
+/**
+   @brief renames a table
+
+   The table is first opened to get at its share. If the table is
+   transactional, not temporary and we are not in recovery, a
+   LOGREC_REDO_RENAME_TABLE record is written and flushed, and its LSN is
+   stored in the table's state. The table is then closed and the index file
+   is renamed, followed by the data file; if the data file cannot be
+   renamed, an attempt is made to rename the index file back.
+
+   @param  old_name        current name of table
+   @param  new_name        table should be renamed to this name
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error
+*/
+
+int maria_rename(const char *old_name, const char *new_name)
+{
+  char from[FN_REFLEN],to[FN_REFLEN];
+  int data_file_rename_error;
+#ifdef USE_RAID
+  uint raid_type=0,raid_chunks=0;
+#endif
+  MARIA_HA *info;
+  MARIA_SHARE *share;
+  myf sync_dir;               /* MY_SYNC_DIR or 0, OR-ed into rename flags */
+  DBUG_ENTER("maria_rename");
+
+#ifdef EXTRA_DEBUG
+  _ma_check_table_is_closed(old_name,"rename old_table");
+  _ma_check_table_is_closed(new_name,"rename new table2");
+#endif
+  /** @todo LOCK take X-lock on table */
+  if (!(info= maria_open(old_name, O_RDWR, HA_OPEN_FOR_REPAIR)))
+    DBUG_RETURN(my_errno);
+  share= info->s;
+#ifdef USE_RAID
+  raid_type =      share->base.raid_type;
+  raid_chunks =    share->base.raid_chunks;
+#endif
+
+  /*
+    the renaming of an internal table to the final table (like in ALTER TABLE)
+    is the moment when this table receives its correct create_rename_lsn and
+    this is important; make sure transactionality has been re-enabled.
+  */
+  DBUG_ASSERT(share->now_transactional == share->base.born_transactional);
+  sync_dir= (share->now_transactional && !share->temporary &&
+             !maria_in_recovery) ? MY_SYNC_DIR : 0;
+  if (sync_dir)
+  {
+    LSN lsn;
+    LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+    /* both lengths include the terminating zero byte */
+    uint old_name_len= strlen(old_name)+1, new_name_len= strlen(new_name)+1;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (uchar*)old_name;
+    log_array[TRANSLOG_INTERNAL_PARTS + 0].length= old_name_len;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (uchar*)new_name;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= new_name_len;
+    /*
+      For this record to be of any use for Recovery, we need the upper
+      MySQL layer to be crash-safe, which it is not now (that would require
+      work using the ddl_log of sql/sql_table.cc); when it is, we should
+      reconsider the moment of writing this log record (before or after op,
+      under THR_LOCK_maria or not...), how to use it in Recovery.
+      For now it can serve to apply logs to a backup so we sync it.
+    */
+    if (unlikely(translog_write_record(&lsn, LOGREC_REDO_RENAME_TABLE,
+                                       &dummy_transaction_object, NULL,
+                                       old_name_len + new_name_len,
+                                       sizeof(log_array)/sizeof(log_array[0]),
+                                       log_array, NULL, NULL) ||
+                 translog_flush(lsn)))
+    {
+      maria_close(info);
+      DBUG_RETURN(1);
+    }
+    /*
+      store LSN into file, needed for Recovery to not be confused if a
+      RENAME happened (applying REDOs to the wrong table).
+    */
+    if (_ma_update_state_lsns(share, lsn, share->state.create_trid, TRUE,
+                              TRUE))
+    {
+      maria_close(info);
+      DBUG_RETURN(1);
+    }
+  }
+
+  maria_close(info);
+
+  fn_format(from,old_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  fn_format(to,new_name,"",MARIA_NAME_IEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  if (my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir))) /* index */
+    DBUG_RETURN(my_errno);
+  fn_format(from,old_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+  fn_format(to,new_name,"",MARIA_NAME_DEXT,MY_UNPACK_FILENAME|MY_APPEND_EXT);
+#ifdef USE_RAID
+  if (raid_type)
+    data_file_rename_error= my_raid_rename(from, to, raid_chunks,
+                                           MYF(MY_WME | sync_dir));
+  else
+#endif
+    data_file_rename_error=
+      my_rename_with_symlink(from, to, MYF(MY_WME | sync_dir));
+  if (data_file_rename_error)
+  {
+    /*
+      now we have a renamed index file and a non-renamed data file, try to
+      undo the rename of the index file.
+    */
+    data_file_rename_error= my_errno; /* saved before the calls below */
+    fn_format(from, old_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+    fn_format(to, new_name, "", MARIA_NAME_IEXT, MYF(MY_UNPACK_FILENAME|MY_APPEND_EXT));
+    /* arguments swapped: new name back to old; the result is not checked */
+    my_rename_with_symlink(to, from, MYF(MY_WME | sync_dir));
+  }
+  DBUG_RETURN(data_file_rename_error);
+
+}
diff --git a/storage/maria/ma_rfirst.c b/storage/maria/ma_rfirst.c
new file mode 100644
index 00000000000..226aaa551f0
--- /dev/null
+++ b/storage/maria/ma_rfirst.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+/* Read the first row in the order of index inx; the work is done by maria_rnext() */
+
+int maria_rfirst(MARIA_HA *info, uchar *buf, int inx)
+{
+  DBUG_ENTER("maria_rfirst");
+  info->update= info->update | HA_STATE_PREV_FOUND;  /* this flag plus ...          */
+  info->cur_row.lastpos= HA_OFFSET_ERROR;            /* ... no position: read first */
+  DBUG_RETURN(maria_rnext(info, buf, inx));
+} /* maria_rfirst */
diff --git a/storage/maria/ma_rkey.c b/storage/maria/ma_rkey.c
new file mode 100644
index 00000000000..24b275d0ba6
--- /dev/null
+++ b/storage/maria/ma_rkey.c
@@ -0,0 +1,215 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Read record based on a key */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+/**
+  Find the first row matching a key in index inx; read it if buf != NULL
+
+  @note Ordinary search_flag is 0 ; Give error if no record with key
+  @return 0 if a row was found (and read if requested), else my_errno
+*/
+
+int maria_rkey(MARIA_HA *info, uchar *buf, int inx, const uchar *key_data,
+               key_part_map keypart_map, enum ha_rkey_function search_flag)
+{
+  uchar *key_buff;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  HA_KEYSEG *last_used_keyseg;
+  uint32 nextflag;
+  MARIA_KEY key;
+  int icp_res= 1;              /* ma_check_index_cond() result; only 1 accepts a row */
+  DBUG_ENTER("maria_rkey");
+  DBUG_PRINT("enter", ("base: 0x%lx  buf: 0x%lx  inx: %d  search_flag: %d",
+                       (long) info, (long) buf, inx, search_flag));
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->last_key_func= search_flag;
+  keyinfo= share->keyinfo + inx;
+
+  key_buff= info->lastkey_buff+info->s->base.max_key_length;
+
+  if (info->once_flags & USE_PACKED_KEYS)
+  {
+    info->once_flags&= ~USE_PACKED_KEYS;	/* Reset flag */
+    /*
+      The key is already packed; this happens when we are using a MERGE
+      table. In this case 'keypart_map' is the length of the key!
+    */
+    bmove(key_buff, key_data, keypart_map);
+    key.data=    key_buff;
+    key.keyinfo= keyinfo;
+    key.data_length= keypart_map;
+    key.ref_length= 0;
+    key.flag= 0;
+
+    last_used_keyseg= keyinfo->seg + info->last_used_keyseg;
+  }
+  else
+  {
+    DBUG_ASSERT(keypart_map);
+    /* Save the packed key for later use in the second buffer of lastkey. */
+    _ma_pack_key(info, &key, inx, key_buff, key_data,
+                 keypart_map, &last_used_keyseg);
+    /* Save packed_key_length for use by the MERGE engine. */
+    info->pack_key_length= key.data_length;
+    info->last_used_keyseg= (uint16) (last_used_keyseg -
+                                      keyinfo->seg);
+    DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, &key););
+  }
+
+  if (fast_ma_readinfo(info))
+    goto err;
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+
+  nextflag= maria_read_vec[search_flag] | key.flag;
+  if (search_flag != HA_READ_KEY_EXACT ||
+      ((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME))
+    nextflag|= SEARCH_SAVE_BUFF;
+
+  switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+  case HA_KEY_ALG_RTREE:
+    if (maria_rtree_find_first(info, &key, nextflag) < 0)
+    {
+      maria_print_error(info->s, HA_ERR_CRASHED);
+      my_errno= HA_ERR_CRASHED;
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+    }
+    break;
+#endif
+  case HA_KEY_ALG_BTREE:
+  default:
+    if (!_ma_search(info, &key, nextflag, info->s->state.key_root[inx]))
+    {      
+      MARIA_KEY lastkey;
+      lastkey.keyinfo= keyinfo;
+      lastkey.data= info->lastkey_buff;
+      /*
+        Found a key, but it might not be usable. We cannot use rows that
+        are inserted by other threads after we got our table lock
+        ("concurrent inserts"). The record may not even be present yet.
+        Keys are inserted into the index(es) before the record is
+        inserted into the data file.
+
+        If an index condition is present, it must be either satisfied or
+        not satisfied with an out-of-range condition.
+      */
+      if ((*share->row_is_visible)(info) && 
+          ((icp_res= ma_check_index_cond(info, inx, buf)) != 0))
+        break;
+
+      /* The row is not visible to us, or the index condition returned 0. */
+      if (search_flag == HA_READ_KEY_EXACT &&
+          last_used_keyseg == keyinfo->seg + keyinfo->keysegs)
+      {
+        /* Simply ignore the key if it matches exactly. (Bug #29838) */
+        my_errno= HA_ERR_KEY_NOT_FOUND;
+        info->cur_row.lastpos= HA_OFFSET_ERROR;
+        break;
+      }
+      
+      do
+      {
+        uint not_used[2];
+        /*
+          Skip rows that are inserted by other threads since we got
+          a lock. Note that this can only happen if we are not
+          searching after a full length exact key, because the keys
+          are sorted according to position.
+        */
+        lastkey.data_length= info->last_key.data_length;
+        lastkey.ref_length=  info->last_key.ref_length;
+        lastkey.flag=        info->last_key.flag;
+        if  (_ma_search_next(info, &lastkey, maria_readnext_vec[search_flag],
+                             info->s->state.key_root[inx]))
+          break;                          /* purecov: inspected */
+        /*
+          Check that the found key does still match the search.
+          _ma_search_next() delivers the next key regardless of its
+          value.
+        */
+        if (!(nextflag & (SEARCH_BIGGER | SEARCH_SMALLER)) &&
+            ha_key_cmp(keyinfo->seg, info->last_key.data, key.data,
+                       key.data_length, SEARCH_FIND, not_used))
+        {
+          /* purecov: begin inspected */
+          my_errno= HA_ERR_KEY_NOT_FOUND;
+          info->cur_row.lastpos= HA_OFFSET_ERROR;
+          break;
+          /* purecov: end */
+        }
+
+      } while (!(*share->row_is_visible)(info) || 
+               ((icp_res= ma_check_index_cond(info, inx, buf)) == 0));
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR || (icp_res != 1))
+  {
+    if (icp_res == 2)          /* index condition reported "out of range" */
+    {
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+      my_errno= HA_ERR_KEY_NOT_FOUND;
+    }
+    fast_ma_writeinfo(info);
+    goto err;
+  }
+  
+  /* Calculate length of the found key;  Used by maria_rnext_same */
+  if ((keyinfo->flag & HA_VAR_LENGTH_KEY))
+    info->last_rkey_length= _ma_keylength_part(keyinfo, info->lastkey_buff,
+					       last_used_keyseg);
+  else
+    info->last_rkey_length= key.data_length;
+
+  /* No row buffer supplied: the caller only wants to know if a row exists */
+  if (!buf)
+  {
+    fast_ma_writeinfo(info);
+    DBUG_RETURN(0);
+  }
+  if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;		/* Record is read */
+    DBUG_RETURN(0);
+  }
+
+  info->cur_row.lastpos= HA_OFFSET_ERROR;      /* Didn't find row */
+
+err:
+  /* Store last used key as a base for read next */
+  memcpy(info->last_key.data, key_buff, key.data_length);
+  info->last_key.data_length= key.data_length;
+  info->last_key.ref_length=  info->s->base.rec_reflength;
+  info->last_key.flag= 0;
+  /* Create key with rowid 0 */
+  bzero((char*) info->last_key.data + info->last_key.data_length,
+        info->s->base.rec_reflength);
+
+  if (search_flag == HA_READ_AFTER_KEY)
+    info->update|=HA_STATE_NEXT_FOUND;		/* Previous gives last row */
+  DBUG_RETURN(my_errno);
+} /* maria_rkey */
diff --git a/storage/maria/ma_rlast.c b/storage/maria/ma_rlast.c
new file mode 100644
index 00000000000..a9a470d37d9
--- /dev/null
+++ b/storage/maria/ma_rlast.c
@@ -0,0 +1,26 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+/* Read the last row in the order of index inx; the work is done by maria_rprev() */
+
+int maria_rlast(MARIA_HA *info, uchar *buf, int inx)
+{
+  DBUG_ENTER("maria_rlast");
+  info->update= info->update | HA_STATE_NEXT_FOUND;  /* this flag plus ...         */
+  info->cur_row.lastpos= HA_OFFSET_ERROR;            /* ... no position: read last */
+  DBUG_RETURN(maria_rprev(info, buf, inx));
+} /* maria_rlast */
diff --git a/storage/maria/ma_rnext.c b/storage/maria/ma_rnext.c
new file mode 100644
index 00000000000..bdba5ff3a17
--- /dev/null
+++ b/storage/maria/ma_rnext.c
@@ -0,0 +1,130 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+#include "ma_rt_index.h"
+
+	/*
+	   Read next row in index order (or the first row, see 'flag' below).
+	   One may have done a write, update or delete of the previous row.
+	   NOTE! Even if one changes the previous row, the next read is done
+	   based on the position of the last used key!
+	*/
+
+int maria_rnext(MARIA_HA *info, uchar *buf, int inx)
+{
+  int error,changed;
+  uint flag;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  int icp_res= 1;              /* ma_check_index_cond() result; only 1 accepts a row */
+  DBUG_ENTER("maria_rnext");
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+  flag=SEARCH_BIGGER;				/* Read next */
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+      info->update & HA_STATE_PREV_FOUND)
+    flag=0;					/* Read first */
+
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+  keyinfo= share->keyinfo + inx;
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+  changed= _ma_test_if_changed(info);   /* if set, btree re-searches from last_key */
+  if (!flag)
+  {
+    switch (keyinfo->key_alg){
+#ifdef HAVE_RTREE_KEYS
+    case HA_KEY_ALG_RTREE:
+      error=maria_rtree_get_first(info, inx,
+                                  info->last_key.data_length +
+                                  info->last_key.ref_length);
+                                  
+      break;
+#endif
+    case HA_KEY_ALG_BTREE:
+    default:
+      error= _ma_search_first(info, keyinfo, share->state.key_root[inx]);
+      break;
+    }
+  }
+  else
+  {
+    switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+    case HA_KEY_ALG_RTREE:
+      /*
+	Note that rtree doesn't support that the table
+	may be changed since last call, so we do need
+	to skip rows inserted by other threads like in btree
+      */
+      error= maria_rtree_get_next(info, inx, info->last_key.data_length +
+                                  info->last_key.ref_length);
+      break;
+#endif
+    case HA_KEY_ALG_BTREE:
+    default:
+      if (!changed)
+	error= _ma_search_next(info, &info->last_key,
+                               flag | info->last_key.flag,
+			       share->state.key_root[inx]);
+      else
+	error= _ma_search(info, &info->last_key, flag | info->last_key.flag,
+                          share->state.key_root[inx]);
+    }
+  }
+
+  if (!error)
+  {
+    while (!(*share->row_is_visible)(info) ||
+           ((icp_res= ma_check_index_cond(info, inx, buf)) == 0))
+    {
+      /* Skip rows that are not visible to us or that got index condition 0 */
+      if  ((error= _ma_search_next(info, &info->last_key,
+                                   SEARCH_BIGGER,
+                                   share->state.key_root[inx])))
+        break;
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+
+	/* Don't clear if database-changed */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_NEXT_FOUND;
+  
+  if (icp_res == 2)
+    my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */
+
+  if (error || icp_res != 1)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;      /* running off the index is reported as EOF */
+  }
+  else if (!buf)
+  {
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;		/* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_PRINT("error",("Got error: %d,  errno: %d",error, my_errno));
+  DBUG_RETURN(my_errno);
+} /* maria_rnext */
diff --git a/storage/maria/ma_rnext_same.c b/storage/maria/ma_rnext_same.c
new file mode 100644
index 00000000000..f67a76a366f
--- /dev/null
+++ b/storage/maria/ma_rnext_same.c
@@ -0,0 +1,113 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "ma_rt_index.h"
+
+/*
+  Read next row with the same key as previous read, but abort if
+  the key changes.
+  One may have done a write, update or delete of the previous row.
+  Returns 0 on success, else my_errno (HA_ERR_END_OF_FILE on key change).
+  NOTE! Even if one changes the previous row, the next read is done
+  based on the position of the last used key!
+*/
+
+int maria_rnext_same(MARIA_HA *info, uchar *buf)
+{
+  int error;
+  uint inx,not_used[2];
+  MARIA_KEYDEF *keyinfo;
+  int icp_res= 1;              /* ma_check_index_cond() result; only 1 accepts a row */
+  DBUG_ENTER("maria_rnext_same");
+
+  if ((int) (inx= info->lastinx) < 0 ||         /* needs an index in use ...  */
+      info->cur_row.lastpos == HA_OFFSET_ERROR) /* ... and a current position */
+    DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+
+  keyinfo= info->s->keyinfo+inx;
+  if (info->s->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+
+  switch (keyinfo->key_alg) {
+#ifdef HAVE_RTREE_KEYS
+    case HA_KEY_ALG_RTREE:
+      if ((error=maria_rtree_find_next(info,inx,
+				 maria_read_vec[info->last_key_func])))
+      {
+	error=1;
+	my_errno=HA_ERR_END_OF_FILE;
+	info->cur_row.lastpos= HA_OFFSET_ERROR;
+	break;
+      }
+      break;
+#endif
+    case HA_KEY_ALG_BTREE:
+    default:
+      if (!(info->update & HA_STATE_RNEXT_SAME))
+      {
+        /* First rnext_same; Store old key */
+        memcpy(info->lastkey_buff2, info->last_key.data,
+               info->last_rkey_length);
+      }
+      for (;;)
+      {
+        if ((error= _ma_search_next(info, &info->last_key,
+                                    SEARCH_BIGGER,
+                                    info->s->state.key_root[inx])))
+          break;
+        if (ha_key_cmp(keyinfo->seg, info->last_key.data,
+                       info->lastkey_buff2,
+                       info->last_rkey_length, SEARCH_FIND,
+                       not_used))       /* non-zero: differs from the stored key */
+        {
+          error=1;
+          my_errno=HA_ERR_END_OF_FILE;
+          info->cur_row.lastpos= HA_OFFSET_ERROR;
+          break;
+        }
+        /* Stop on a visible row unless the index condition returned 0 */
+        if ((info->s->row_is_visible)(info) &&
+            ((icp_res= ma_check_index_cond(info, inx, buf)) != 0))
+          break;
+      }
+  }
+  if (info->s->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+	/* Don't clear if database-changed */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_NEXT_FOUND | HA_STATE_RNEXT_SAME;
+
+  if (icp_res == 2)
+    my_errno=HA_ERR_END_OF_FILE; /* got beyond the end of scanned range */
+
+  if (error || icp_res != 1)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;      /* running off the index is reported as EOF */
+  }
+  else if (!buf)
+  {
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;		/* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(my_errno);
+} /* maria_rnext_same */
diff --git a/storage/maria/ma_rprev.c b/storage/maria/ma_rprev.c
new file mode 100644
index 00000000000..b9f46d7c405
--- /dev/null
+++ b/storage/maria/ma_rprev.c
@@ -0,0 +1,86 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+	/*
+	   Read previous row in index order (or the last row, see 'flag' below).
+	   One may have done a write, update or delete of the previous row.
+	   NOTE! Even if one changes the previous row, the next read is done
+	   based on the position of the last used key!
+	*/
+
+int maria_rprev(MARIA_HA *info, uchar *buf, int inx)
+{
+  int error,changed;
+  register uint flag;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_rprev");
+
+  if ((inx = _ma_check_index(info,inx)) < 0)
+    DBUG_RETURN(my_errno);
+  flag=SEARCH_SMALLER;				/* Read previous */
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR &&
+      info->update & HA_STATE_NEXT_FOUND)
+    flag=0;					/* Read last */
+
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+  keyinfo= share->keyinfo + inx;
+  changed= _ma_test_if_changed(info);   /* if set, re-search from last_key below */
+  if (share->lock_key_trees)
+    rw_rdlock(&keyinfo->root_lock);
+  if (!flag)
+    error= _ma_search_last(info, keyinfo, share->state.key_root[inx]);
+  else if (!changed)
+    error= _ma_search_next(info, &info->last_key,
+                           flag | info->last_key.flag,
+                           share->state.key_root[inx]);
+  else
+    error= _ma_search(info, &info->last_key, flag | info->last_key.flag,
+                      share->state.key_root[inx]);
+
+  if (!error)
+  {
+    /* NOTE(review): no ma_check_index_cond() here, unlike maria_rnext(); confirm */
+    while (!(*share->row_is_visible)(info))
+    {
+      /* Skip rows that are inserted by other threads since we got a lock */
+      if  ((error= _ma_search_next(info, &info->last_key,
+                                   SEARCH_SMALLER,
+                                   share->state.key_root[inx])))
+        break;
+    }
+  }
+  if (share->lock_key_trees)
+    rw_unlock(&keyinfo->root_lock);
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  info->update|= HA_STATE_PREV_FOUND;
+  if (error)
+  {
+    if (my_errno == HA_ERR_KEY_NOT_FOUND)
+      my_errno=HA_ERR_END_OF_FILE;      /* running off the index is reported as EOF */
+  }
+  else if (!buf)
+  {
+    DBUG_RETURN(info->cur_row.lastpos == HA_OFFSET_ERROR ? my_errno : 0);
+  }
+  else if (!(*info->read_record)(info, buf, info->cur_row.lastpos))
+  {
+    info->update|= HA_STATE_AKTIV;		/* Record is read */
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(my_errno);
+} /* maria_rprev */
diff --git a/storage/maria/ma_rrnd.c b/storage/maria/ma_rrnd.c
new file mode 100644
index 00000000000..24c4bfdd467
--- /dev/null
+++ b/storage/maria/ma_rrnd.c
@@ -0,0 +1,44 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Read a record with random access. The position of the record must be
+   obtained via MARIA_HA. The next record can be read with pos= MARIA_POS_ERROR */
+
+
+#include "maria_def.h"
+
+/*
+  Read the row stored at a given position (random access).
+
+  RETURN
+    0   Ok.
+    HA_ERR_RECORD_DELETED  Record is deleted.
+    HA_ERR_END_OF_FILE	   EOF.
+*/
+
+int maria_rrnd(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS filepos)
+{
+  DBUG_ENTER("maria_rrnd");
+  DBUG_ASSERT(filepos != HA_OFFSET_ERROR);
+
+  /* Keep only the "changed" bits of the state flags */
+  info->update= info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  /* Flush the record write cache, if one is in use, before reading */
+  if ((info->opt_flag & WRITE_CACHE_USED) && flush_io_cache(&info->rec_cache))
+    DBUG_RETURN(my_errno);
+
+  info->cur_row.lastpos= filepos;               /* Remember for update */
+  DBUG_RETURN((*info->s->read_record)(info, buf, filepos));
+}
diff --git a/storage/maria/ma_rsame.c b/storage/maria/ma_rsame.c
new file mode 100644
index 00000000000..4bdbfd526ba
--- /dev/null
+++ b/storage/maria/ma_rsame.c
@@ -0,0 +1,78 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+/**
+  Find current row with read on position or read on key
+
+  @note If inx >= 0 the row is searched for through that key; inx == -1
+  reads by position only; any other negative inx is rejected.
+
+  @warning
+  This function is not row version safe.
+  This is not critical as this function is not used by MySQL
+
+  @return
+  @retval 0                      Ok
+  @retval HA_ERR_WRONG_INDEX     inx < -1, or the key is not active
+  @retval HA_ERR_KEY_NOT_FOUND   No current row, or row is deleted
+  @retval HA_ERR_END_OF_FILE     End of file
+*/
+
+int maria_rsame(MARIA_HA *info, uchar *record, int inx)
+{
+  DBUG_ENTER("maria_rsame");
+  /* Accept only -1 or an active key, as maria_rsame_with_pos() does */
+  if (inx < -1 ||
+      (inx >= 0 && ! maria_is_key_active(info->s->state.key_map, inx)))
+  {
+    DBUG_PRINT("error", ("wrong index usage"));
+    DBUG_RETURN(my_errno=HA_ERR_WRONG_INDEX);
+  }
+  if (info->cur_row.lastpos == HA_OFFSET_ERROR ||
+      info->update & HA_STATE_DELETED)
+  {
+    DBUG_PRINT("error", ("no current record"));
+    DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);	/* No current record */
+  }
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  /* Read row from data file */
+  if (fast_ma_readinfo(info))
+    DBUG_RETURN(my_errno);
+
+  if (inx >= 0)
+  {
+    MARIA_KEYDEF *keyinfo= info->s->keyinfo + inx;
+    info->lastinx= inx;
+    (*keyinfo->make_key)(info, &info->last_key, (uint) inx,
+                         info->lastkey_buff, record,
+                         info->cur_row.lastpos,
+                         info->cur_row.trid);
+    if (info->s->lock_key_trees)
+      rw_rdlock(&keyinfo->root_lock);
+    VOID(_ma_search(info, &info->last_key, SEARCH_SAME,
+		    info->s->state.key_root[inx]));  /* result is not checked */
+    if (info->s->lock_key_trees)
+      rw_unlock(&keyinfo->root_lock);
+  }
+
+  if (!(*info->read_record)(info, record, info->cur_row.lastpos))
+    DBUG_RETURN(0);
+  if (my_errno == HA_ERR_RECORD_DELETED)
+    my_errno=HA_ERR_KEY_NOT_FOUND;      /* a deleted row is reported as "not found" */
+  DBUG_PRINT("error", ("my_errno: %d", my_errno));
+  DBUG_RETURN(my_errno);
+} /* maria_rsame */
diff --git a/storage/maria/ma_rsamepos.c b/storage/maria/ma_rsamepos.c
new file mode 100644
index 00000000000..d2099e7b116
--- /dev/null
+++ b/storage/maria/ma_rsamepos.c
@@ -0,0 +1,63 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* read record through position and fix key-position */
+/* As maria_rsame but supply a position */
+
+#include "maria_def.h"
+
+
+/*
+  Read the row at a given position and make it the current row
+
+  @param inx      If inx >= 0, also build the key of the row for that index
+  @retval HA_ERR_WRONG_INDEX    inx < -1, or the key is not active
+  @return
+  @retval  0                    Ok
+  @retval HA_ERR_KEY_NOT_FOUND  Row is deleted
+  @retval HA_ERR_END_OF_FILE   End of file
+*/
+
+int maria_rsame_with_pos(MARIA_HA *info, uchar *record, int inx,
+                         MARIA_RECORD_POS filepos)
+{
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("maria_rsame_with_pos");
+  DBUG_PRINT("enter",("index: %d  filepos: %ld", inx, (long) filepos));
+
+  /* Only -1 (no index) or the number of an active key is acceptable */
+  if (inx < -1 ||
+      (inx >= 0 && !maria_is_key_active(share->state.key_map, inx)))
+    DBUG_RETURN(my_errno= HA_ERR_WRONG_INDEX);
+
+  info->update= info->update & (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  if ((*share->read_record)(info, record, filepos) != 0)
+  {
+    if (my_errno == HA_ERR_RECORD_DELETED)
+      my_errno= HA_ERR_KEY_NOT_FOUND;
+    DBUG_RETURN(my_errno);
+  }
+  info->cur_row.lastpos= filepos;
+  info->lastinx= inx;
+  if (inx < 0)
+    DBUG_RETURN(0);
+
+  /* Build the key of the row just read into last_key / lastkey_buff */
+  (*share->keyinfo[inx].make_key)(info, &info->last_key, (uint) inx,
+                                  info->lastkey_buff, record,
+                                  info->cur_row.lastpos, info->cur_row.trid);
+  info->update|= HA_STATE_KEY_CHANGED;		/* Don't use indexposition */
+  DBUG_RETURN(0);
+} /* maria_rsame_with_pos */
diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c
new file mode 100644
index 00000000000..62474dbbad8
--- /dev/null
+++ b/storage/maria/ma_rt_index.c
@@ -0,0 +1,1343 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+#define REINSERT_BUFFER_INC 10
+#define PICK_BY_AREA
+/*#define PICK_BY_PERIMETER*/
+
+/* One entry of the re-insert list: an index page and the level it was on */
+typedef struct st_page_level
+{
+  uint level;                   /* tree level stored together with the page */
+  my_off_t offs;                /* position of the page */
+} stPageLevel;
+
+/*
+  Growable array of stPageLevel entries, filled by
+  maria_rtree_fill_reinsert_list()
+*/
+typedef struct st_page_list
+{
+  uint n_pages;                 /* number of entries in use */
+  uint m_pages;                 /* number of entries allocated */
+  stPageLevel *pages;           /* the entries; NULL until first allocation */
+} stPageList;
+
+
+/*
+   Find next key in r-tree according to search_flag recursively
+
+   NOTES
+     Used in maria_rtree_find_first() and maria_rtree_find_next()
+
+     The search is resumable. For every level an offset into the page is
+     kept in info->maria_rtree_recursion_state, and
+     info->maria_rtree_recursion_depth tells down to which level these
+     saved offsets are to be used instead of starting at the first key.
+
+     On a leaf match the key is copied to info->last_key, the row position
+     is stored in info->cur_row.lastpos and the keys that follow the match
+     on the same page are copied to info->keyread_buff.
+
+   RETURN
+     -1	 Error
+     0   Found
+     1   Not found (my_errno is set to HA_ERR_KEY_NOT_FOUND)
+*/
+
+static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                                uint32 search_flag,
+                                uint nod_cmp_flag, my_off_t page_pos,
+                                int level)
+{
+  MARIA_SHARE *share= info->s;
+  uint nod_flag;
+  int res;
+  uchar *page_buf, *k, *last;
+  int key_data_length;
+  /* Slot of this level in the per-handler array of saved page offsets */
+  uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+  MARIA_PAGE page;
+
+  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    my_errno= HA_ERR_OUT_OF_MEM;
+    return -1;
+  }
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        DFLT_INIT_HITS, page_buf, 0))
+    goto err;
+  nod_flag= page.node;
+
+  key_data_length= keyinfo->keylength - share->base.rec_reflength;
+
+  if (info->maria_rtree_recursion_depth >= level)
+  {
+    /* Continuing an earlier search: resume at the offset saved for level */
+    k= page_buf + *saved_key;
+  }
+  else
+  {
+    /* First visit of this level: start at the first key of the page */
+    k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag);
+  }
+  last= rt_PAGE_END(&page);
+
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag))
+  {
+    if (nod_flag)
+    {
+      /* this is an internal node in the tree */
+      if (!(res= maria_rtree_key_cmp(keyinfo->seg,
+                                      info->first_mbr_key, k,
+                                      info->last_rkey_length, nod_cmp_flag)))
+      {
+        switch ((res= maria_rtree_find_req(info, keyinfo, search_flag,
+                                            nod_cmp_flag,
+                                            _ma_kpos(nod_flag, k),
+                                            level + 1)))
+        {
+          case 0: /* found - exit from recursion */
+            /* Remember which child was entered so that it can be resumed */
+            *saved_key= k - page_buf;
+            goto ok;
+          case 1: /* not found - continue searching */
+            /* Saved offsets below this level are no longer valid */
+            info->maria_rtree_recursion_depth= level;
+            break;
+          default: /* error */
+          case -1:
+            goto err;
+        }
+      }
+    }
+    else
+    {
+      /* this is a leaf */
+      if (!maria_rtree_key_cmp(keyinfo->seg, info->first_mbr_key,
+                               k, info->last_rkey_length, search_flag))
+      {
+        uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0);
+        MARIA_KEY tmp_key;
+        
+        /*
+          We don't need to set all MARIA_KEY elements here as
+          _ma_row_pos_from_key() only uses a few of them.
+         */
+        tmp_key.keyinfo= keyinfo;
+        tmp_key.data= k;
+        tmp_key.data_length= key_data_length;
+
+        info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+        info->last_key.keyinfo= keyinfo;
+        info->last_key.data_length= key_data_length;
+        info->last_key.ref_length=  share->base.rec_reflength;
+        info->last_key.flag= 0;
+        memcpy(info->last_key.data, k,
+               info->last_key.data_length + info->last_key.ref_length);
+        info->maria_rtree_recursion_depth= level;
+        /*
+          Save the end-of-page offset: a resumed descent into this leaf
+          then starts at 'last' and finds nothing, as the keys after the
+          match are handed over through keyread_buff below.
+        */
+        *saved_key= last - page_buf;
+
+        if (after_key < last)
+        {
+          /* Copy the rest of the page for maria_rtree_find_next() */
+          uchar *keyread_buff= info->keyread_buff;
+          info->int_keypos= keyread_buff;
+          info->int_maxpos= keyread_buff + (last - after_key);
+          memcpy(keyread_buff, after_key, last - after_key);
+          info->keyread_buff_used= 0;
+        }
+        else
+        {
+	  info->keyread_buff_used= 1;
+        }
+
+        res= 0;
+        goto ok;
+      }
+    }
+  }
+  /* Page exhausted without a match */
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  my_errno= HA_ERR_KEY_NOT_FOUND;
+  res= 1;
+
+ok:
+  my_afree(page_buf);
+  return res;
+
+err:
+  my_afree(page_buf);
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  return -1;
+}
+
+
+/*
+  Find first key in r-tree according to search_flag condition
+
+  SYNOPSIS
+   maria_rtree_find_first()
+   info			Handler to MARIA file
+   key			Key to search for
+   search_flag		Bitmap of flags how to do the search
+
+  NOTES
+    The search key is remembered in info->first_mbr_key so that
+    maria_rtree_find_next() can continue the same search.
+
+  RETURN
+    -1  Error (my_errno is HA_ERR_END_OF_FILE if the tree has no root)
+    0   Found
+    1   Not found
+*/
+
+int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint32 search_flag)
+{
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  my_off_t root= info->s->state.key_root[keyinfo->key_nr];
+  uint nod_cmp_flag;
+
+  if (root == HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    return -1;
+  }
+
+  /*
+    Save searched key, include data pointer.
+    The data pointer is required if the search_flag contains MBR_DATA.
+    (minimum bounding rectangle)
+  */
+  memcpy(info->first_mbr_key, key->data, key->data_length + key->ref_length);
+  info->last_rkey_length= key->data_length;
+
+  /* Start a fresh descent: no saved positions, nothing buffered */
+  info->maria_rtree_recursion_depth= -1;
+  info->keyread_buff_used= 1;
+
+  /* Flag used when comparing against keys of internal nodes */
+  if (search_flag & (MBR_EQUAL | MBR_WITHIN))
+    nod_cmp_flag= MBR_WITHIN;
+  else
+    nod_cmp_flag= MBR_INTERSECT;
+
+  return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root,
+                              0);
+}
+
+
+/*
+   Find next key in r-tree according to search_flag condition
+
+  SYNOPSIS
+   maria_rtree_find_next()
+   info			Handler to MARIA file
+   uint keynr		Key number to use
+   search_flag		Bitmap of flags how to do the search
+
+  NOTES
+    Continues the search started by maria_rtree_find_first(), using the
+    key saved in info->first_mbr_key. Keys left over from the last visited
+    leaf page (info->keyread_buff) are checked first; if none of them
+    matches, the tree is searched again from the root with the saved
+    recursion state.
+
+   RETURN
+     -1  Error
+     0   Found
+     1   Not found
+*/
+
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag)
+{
+  my_off_t root;
+  uint32 nod_cmp_flag;
+  MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr;
+  DBUG_ASSERT(info->last_key.keyinfo == keyinfo);
+
+  /* After a delete the search is restarted with last_key as search key */
+  if (info->update & HA_STATE_DELETED)
+    return maria_rtree_find_first(info, &info->last_key, search_flag);
+
+  if (!info->keyread_buff_used)
+  {
+    /* Scan the keys buffered from the last leaf page */
+    uchar *key= info->int_keypos;
+
+    while (key < info->int_maxpos)
+    {
+      if (!maria_rtree_key_cmp(keyinfo->seg,
+                               info->first_mbr_key, key,
+                               info->last_rkey_length, search_flag))
+      {
+        uchar *after_key= key + keyinfo->keylength;
+        MARIA_KEY tmp_key;
+        
+        /*
+          We don't need to set all MARIA_KEY elements here as
+          _ma_row_pos_from_key only uses a few of them.
+         */
+        tmp_key.keyinfo= keyinfo;
+        tmp_key.data= key;
+        tmp_key.data_length= keyinfo->keylength - info->s->base.rec_reflength;
+
+        info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+        /*
+          NOTE(review): only data_length bytes are copied here, while
+          maria_rtree_find_req() copies data_length + ref_length - confirm
+          that the row reference is not needed in last_key on this path.
+        */
+        memcpy(info->last_key.data, key, info->last_key.data_length);
+
+        /* Advance the buffer position, or mark the buffer as consumed */
+        if (after_key < info->int_maxpos)
+	  info->int_keypos= after_key;
+        else
+	  info->keyread_buff_used= 1;
+        return 0;
+      }
+      key+= keyinfo->keylength;
+    }
+  }
+  if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    return -1;
+  }
+
+  /* Flag used when comparing against keys of internal nodes */
+  nod_cmp_flag= (((search_flag & (MBR_EQUAL | MBR_WITHIN)) ?
+                  MBR_WITHIN : MBR_INTERSECT));
+  return maria_rtree_find_req(info, keyinfo, search_flag, nod_cmp_flag, root,
+                              0);
+}
+
+
+/*
+  Get next key in r-tree recursively
+
+  NOTES
+    Used in maria_rtree_get_first() and maria_rtree_get_next()
+
+    Walks the tree without comparing keys: the first leaf key reached is
+    returned. The position is kept per level in
+    info->maria_rtree_recursion_state so that a later call continues after
+    the key returned last. key_length is only passed on to the recursive
+    calls.
+
+    On success the key is copied to info->last_key and the row position is
+    stored in info->cur_row.lastpos. If more keys follow on the leaf page,
+    the whole page is copied to info->keyread_buff for
+    maria_rtree_get_next().
+
+  RETURN
+    -1  Error (my_errno is set)
+    0   Found
+    1   Not found (my_errno is set to HA_ERR_KEY_NOT_FOUND)
+*/
+
+static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                               uint key_length, my_off_t page_pos, int level)
+{
+  MARIA_SHARE *share= info->s;
+  uchar *page_buf, *last, *k;
+  uint nod_flag, key_data_length;
+  int res;
+  /* Slot of this level in the per-handler array of saved page offsets */
+  uint *saved_key= (uint*) (info->maria_rtree_recursion_state) + level;
+  MARIA_PAGE page;
+
+  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    /* Report the reason, as the other functions in this file do */
+    my_errno= HA_ERR_OUT_OF_MEM;
+    return -1;
+  }
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED,
+                        DFLT_INIT_HITS, page_buf, 0))
+    goto err;
+  nod_flag= page.node;
+
+  key_data_length= keyinfo->keylength - share->base.rec_reflength;
+
+  if (info->maria_rtree_recursion_depth >= level)
+  {
+    /* Continuing an earlier scan: resume at the offset saved for level */
+    k= page.buff + *saved_key;
+    if (!nod_flag)
+    {
+      /* Only leaf pages contain data references. */
+      /* Need to check next key with data reference. */
+      k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag);
+    }
+  }
+  else
+  {
+    /* First visit of this level: start at the first key of the page */
+    k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+  }
+  last= rt_PAGE_END(&page);
+
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length, nod_flag))
+  {
+    if (nod_flag)
+    {
+      /* this is an internal node in the tree */
+      switch ((res= maria_rtree_get_req(info, keyinfo, key_length,
+                                         _ma_kpos(nod_flag, k), level + 1)))
+      {
+        case 0: /* found - exit from recursion */
+          *saved_key= k - page.buff;
+          goto ok;
+        case 1: /* not found - continue searching */
+          info->maria_rtree_recursion_depth= level;
+          break;
+        default:
+        case -1: /* error */
+          goto err;
+      }
+    }
+    else
+    {
+      /* this is a leaf */
+      uchar *after_key= rt_PAGE_NEXT_KEY(share, k, key_data_length, 0);
+      MARIA_KEY tmp_key;
+
+      /*
+        We don't need to set all MARIA_KEY elements here as
+        _ma_row_pos_from_key() only uses a few of them.
+      */
+      tmp_key.keyinfo= keyinfo;
+      tmp_key.data= k;
+      tmp_key.data_length= key_data_length;
+
+      info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+      info->last_key.data_length= key_data_length;
+      info->last_key.ref_length= share->base.rec_reflength;
+
+      memcpy(info->last_key.data, k,
+             info->last_key.data_length + info->last_key.ref_length);
+
+      info->maria_rtree_recursion_depth= level;
+      *saved_key= k - page.buff;
+
+      if (after_key < last)
+      {
+        /*
+          More keys follow: keep a copy of the page and a pointer to the
+          saved offset so that maria_rtree_get_next() can step through it.
+        */
+        uchar *keyread_buff= info->keyread_buff;
+        info->last_rtree_keypos= saved_key;
+        memcpy(keyread_buff, page.buff, page.size);
+        info->int_maxpos= keyread_buff + page.size;
+        info->keyread_buff_used= 0;
+      }
+      else
+      {
+        info->keyread_buff_used= 1;
+      }
+
+      res= 0;
+      goto ok;
+    }
+  }
+  /* Page exhausted */
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  my_errno= HA_ERR_KEY_NOT_FOUND;
+  res= 1;
+
+ok:
+  my_afree(page_buf);
+  return res;
+
+err:
+  my_afree(page_buf);
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  return -1;
+}
+
+
+/*
+  Get first key in r-tree
+
+  Resets the saved scan position of the handler and starts a new scan at
+  the root page of index number keynr.
+
+  RETURN
+    -1	Error (my_errno is HA_ERR_END_OF_FILE if the tree has no root)
+    0	Found
+    1	Not found
+*/
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length)
+{
+  my_off_t root= info->s->state.key_root[keynr];
+
+  if (root == HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    return -1;
+  }
+
+  /* Nothing buffered and no saved positions: scan from the very start */
+  info->maria_rtree_recursion_depth= -1;
+  info->keyread_buff_used= 1;
+
+  return maria_rtree_get_req(info, info->s->keyinfo + keynr, key_length,
+                             root, 0);
+}
+
+
+/*
+  Get next key in r-tree
+
+  If keys are left in the copy of the last leaf page
+  (info->keyread_buff), the key following the one returned last is taken
+  from there. Otherwise the scan continues from the root with the saved
+  recursion state.
+
+  RETURN
+    -1	Error (my_errno is HA_ERR_END_OF_FILE if the tree has no root)
+    0	Found
+    1	Not found
+*/
+
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length)
+{
+  my_off_t root;
+  MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr;
+  uchar *keyread_buff= info->keyread_buff;
+
+  if (!info->keyread_buff_used)
+  {
+    uint key_data_length= keyinfo->keylength - info->s->base.rec_reflength;
+    /* rt_PAGE_NEXT_KEY(*info->last_rtree_keypos) */
+    uchar *key= keyread_buff + *info->last_rtree_keypos + keyinfo->keylength;
+    /* rt_PAGE_NEXT_KEY(key) */
+    uchar *after_key= key + keyinfo->keylength;
+    MARIA_KEY tmp_key;
+
+    tmp_key.keyinfo= keyinfo;
+    tmp_key.data= key;
+    tmp_key.data_length= key_data_length;
+    tmp_key.ref_length= info->s->base.rec_reflength;
+    tmp_key.flag= 0;
+
+    info->cur_row.lastpos= _ma_row_pos_from_key(&tmp_key);
+    _ma_copy_key(&info->last_key, &tmp_key);
+
+    /* Remember the offset of the key just returned */
+    *info->last_rtree_keypos= (uint) (key - keyread_buff);
+    if (after_key >= info->int_maxpos)
+    {
+      /* That was the last key of the buffered page */
+      info->keyread_buff_used= 1;
+    }
+
+    return 0;
+  }
+
+  if ((root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    return -1;
+  }
+
+  /*
+    keyinfo already points to the definition of key number keynr; it must
+    not be indexed with keynr a second time.
+  */
+  return maria_rtree_get_req(info, keyinfo, key_length, root, 0);
+}
+
+
+/*
+  Choose non-leaf better key for insertion
+
+  Returns a pointer inside the page_buf buffer.
+*/
+#ifdef PICK_BY_PERIMETER
+/*
+  Perimeter based variant of maria_rtree_pick_key(): picks the key on the
+  page with the smallest perimeter increase, ties being resolved by the
+  smaller perimeter. Only compiled if PICK_BY_PERIMETER is defined.
+
+  NOTE(review): this variant looks out of date compared to the
+  PICK_BY_AREA one and needs work before it can be enabled:
+  - 'keyinfo' and 'nod_flag' are used but not declared in this function
+  - it reads page->buf, while the rest of the file uses page->buff
+  - rt_PAGE_END() and rt_PAGE_NEXT_KEY() are called with a different
+    number of arguments than everywhere else in this file
+  - best_incr gets no starting value apart from LINT_INIT()
+*/
+static const uchar *maria_rtree_pick_key(const MARIA_KEY *key,
+                                         const MARIA_PAGE *page)
+{
+  double increase;
+  double best_incr;
+  double perimeter;
+  double best_perimeter;
+  uchar *best_key= NULL;
+  const MARIA_HA *info= page->info;
+
+  uchar *k= rt_PAGE_FIRST_KEY(info->s, page->buf, page->node);
+  uchar *last= rt_PAGE_END(info, page);
+
+  LINT_INIT(best_perimeter);
+  LINT_INIT(best_key);
+  LINT_INIT(best_incr);
+
+  for (; k < last; k= rt_PAGE_NEXT_KEY(k, key->data_length, nod_flag))
+  {
+    /* -1 from the helper is treated as an error */
+    if ((increase= maria_rtree_perimeter_increase(keyinfo->seg, k, key,
+                                                  &perimeter)) == -1)
+      return NULL;
+    if ((increase < best_incr)||
+	(increase == best_incr && perimeter < best_perimeter))
+    {
+      best_key= k;
+      best_perimeter= perimeter;
+      best_incr= increase;
+    }
+  }
+  return best_key;
+}
+
+#endif /*PICK_BY_PERIMETER*/
+
+#ifdef PICK_BY_AREA
+/*
+  Area based variant of maria_rtree_pick_key(): picks the key on the page
+  whose rectangle has the smallest area increase when combined with 'key',
+  ties being resolved by the smaller area.
+
+  Returns a pointer into page->buff, or NULL if the page holds no key or
+  maria_rtree_area_increase() returned -1.0.
+*/
+static const uchar *maria_rtree_pick_key(const MARIA_KEY *key,
+                                         const MARIA_PAGE *page)
+{
+  const MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  const uchar *pos= rt_PAGE_FIRST_KEY(share, page->buff, page->node);
+  const uchar *end= rt_PAGE_END(page);
+  const uchar *choice= NULL;
+  double least_growth= DBL_MAX;
+  double least_area;
+
+  LINT_INIT(least_area);
+
+  while (pos < end)
+  {
+    double area;
+    double growth= maria_rtree_area_increase(key->keyinfo->seg, pos,
+                                             key->data,
+                                             key->data_length +
+                                             key->ref_length,
+                                             &area);
+
+    /* The following is safe as -1.0 is an exact number */
+    if (growth == -1.0)
+      return NULL;
+
+    /* The following should be safe, even if we compare doubles */
+    if (!choice || growth < least_growth ||
+        ((growth == least_growth) && (area < least_area)))
+    {
+      choice= pos;
+      least_area= area;
+      least_growth= growth;
+    }
+    pos= rt_PAGE_NEXT_KEY(share, pos, key->data_length, page->node);
+  }
+  return choice;
+}
+
+#endif /*PICK_BY_AREA*/
+
+/*
+  Go down and insert key into tree
+
+  Descends from the page at page_pos until the insert level is reached
+  (a leaf if ins_level is -1, otherwise level ins_level), adds the key
+  there and adjusts the keys of the pages on the way back up. If the page
+  at page_pos was split, the position of the new page is passed back
+  through new_page by maria_rtree_add_key().
+
+  RETURN
+    -1	Error
+    0	Child was not split
+    1	Child was split
+*/
+
+static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEY *key,
+                                  my_off_t page_pos, my_off_t *new_page,
+                                  int ins_level, int level)
+{
+  uint nod_flag;
+  uint key_length= key->data_length;
+  int res;
+  uchar *page_buf, *k;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("maria_rtree_insert_req");
+
+  /*
+    The extra MARIA_MAX_KEY_BUFF bytes after the page are used as scratch
+    space for building a new key when a child was split.
+  */
+  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
+                                     MARIA_MAX_KEY_BUFF)))
+  {
+    my_errno= HA_ERR_OUT_OF_MEM;
+    DBUG_RETURN(-1); /* purecov: inspected */
+  }
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, page_buf, 0))
+    goto err;
+  nod_flag= page.node;
+  DBUG_PRINT("rtree", ("page: %lu  level: %d  ins_level: %d  nod_flag: %u",
+                       (ulong) page.pos, level, ins_level, nod_flag));
+
+  if ((ins_level == -1 && nod_flag) ||       /* key: go down to leaf */
+      (ins_level > -1 && ins_level > level)) /* branch: go down to ins_level */
+  {
+    if (!(k= (uchar *)maria_rtree_pick_key(key, &page)))
+      goto err;
+    /* k is now a pointer inside the page_buf buffer */
+    switch ((res= maria_rtree_insert_req(info, key,
+                                         _ma_kpos(nod_flag, k), new_page,
+                                         ins_level, level + 1)))
+    {
+      case 0: /* child was not split, most common case */
+      {
+        /* Enlarge the rectangle stored at k so that it covers the new key */
+        maria_rtree_combine_rect(keyinfo->seg, k, key->data, k, key_length);
+        if (share->now_transactional &&
+            _ma_log_change(&page, k, key_length,
+                           KEY_OP_DEBUG_RTREE_COMBINE))
+          goto err;
+        page_mark_changed(info, &page);
+        if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                              DFLT_INIT_HITS))
+          goto err;
+        goto ok;
+      }
+      case 1: /* child was split */
+      {
+        /* Set new_key to point to a free buffer area */
+        uchar *new_key_buff= page_buf + keyinfo->block_length + nod_flag;
+        MARIA_KEY new_key;
+        MARIA_KEY k_key;
+
+        DBUG_ASSERT(nod_flag);
+        k_key.keyinfo= new_key.keyinfo= keyinfo;
+        new_key.data= new_key_buff;
+        k_key.data= k;
+        k_key.data_length= new_key.data_length= key->data_length;
+        k_key.ref_length=  new_key.ref_length=  key->ref_length;
+        k_key.flag= new_key.flag= 0;            /* Safety */
+
+        /* set proper MBR for key */
+        if (maria_rtree_set_key_mbr(info, &k_key, _ma_kpos(nod_flag, k)))
+          goto err;
+        if (share->now_transactional &&
+            _ma_log_change(&page, k, key_length,
+                           KEY_OP_DEBUG_RTREE_SPLIT))
+          goto err;
+        /* add new key for new page */
+        /* The page pointer goes into the nod_flag bytes before the key data */
+        _ma_kpointer(info, new_key_buff - nod_flag, *new_page);
+        if (maria_rtree_set_key_mbr(info, &new_key, *new_page))
+          goto err;
+        /* res tells the caller whether this page was split in turn */
+        res= maria_rtree_add_key(&new_key, &page, new_page);
+        page_mark_changed(info, &page);
+        if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                              DFLT_INIT_HITS))
+          goto err;
+        goto ok;
+      }
+      default:
+      case -1: /* error */
+      {
+        goto err;
+      }
+    }
+  }
+  else
+  {
+    /* Insert level reached: add the key to this page */
+    res= maria_rtree_add_key(key, &page, new_page);
+    page_mark_changed(info, &page);
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS))
+      goto err;
+  }
+
+ok:
+  my_afree(page_buf);
+  DBUG_RETURN(res);
+
+err:
+  /* Errors share the cleanup at 'ok' */
+  res= -1;                                   /* purecov: inspected */
+  goto ok;                                   /* purecov: inspected */
+}
+
+
+/**
+  Insert key into the tree
+
+  If the tree is empty, a root page is created and the key is added to it.
+  Otherwise the key is inserted through maria_rtree_insert_req(); if that
+  splits the old root, a new root page is created that points to the old
+  root and to the page split off from it.
+
+  @param  info             table
+  @param  key              KEY to insert
+  @param  ins_level        at which level key insertion should start
+  @param  root             put new key_root there; only written when a
+                           root page was created
+
+  @return Operation result
+    @retval  -1 Error
+    @retval   0 Root was not split
+    @retval   1 Root was split
+*/
+
+int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key, int ins_level,
+                             my_off_t *root)
+{
+  my_off_t old_root;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  int res;
+  my_off_t new_page;
+  enum pagecache_page_lock write_lock;
+  DBUG_ENTER("maria_rtree_insert_level");
+
+  if ((old_root= share->state.key_root[keyinfo->key_nr]) == HA_OFFSET_ERROR)
+  {
+    /* Empty tree: build the first root page in info->buff */
+    MARIA_PINNED_PAGE tmp_page_link, *page_link;
+    MARIA_PAGE page;
+
+    page_link= &tmp_page_link;
+    if ((old_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+        HA_OFFSET_ERROR)
+      DBUG_RETURN(-1);
+    write_lock= page_link->write_lock;
+    info->keyread_buff_used= 1;
+    bzero(info->buff, share->block_size);
+    _ma_store_keynr(share, info->buff, keyinfo->key_nr);
+    _ma_store_page_used(share, info->buff, share->keypage_header);
+    _ma_page_setup(&page, info, keyinfo, old_root, info->buff);
+
+    /*
+      Failures must be reported as -1: 1 means "root was split" and is
+      not treated as an error by callers that test for -1.
+    */
+    if (share->now_transactional && _ma_log_new(&page, 1))
+      DBUG_RETURN(-1);
+
+    res= maria_rtree_add_key(key, &page, NULL);
+    if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS))
+      DBUG_RETURN(-1);
+    *root= old_root;
+    DBUG_RETURN(res);
+  }
+
+  switch ((res= maria_rtree_insert_req(info, key, old_root, &new_page,
+                                       ins_level, 0)))
+  {
+    case 0: /* root was not split */
+    {
+      break;
+    }
+    case 1: /* root was split, grow a new root; very rare */
+    {
+      uchar *new_root_buf, *new_key_buff;
+      my_off_t new_root;
+      uint nod_flag= share->base.key_reflength;
+      MARIA_PINNED_PAGE tmp_page_link, *page_link;
+      MARIA_KEY new_key;
+      MARIA_PAGE page;
+      page_link= &tmp_page_link;
+
+      DBUG_PRINT("rtree", ("root was split, grow a new root"));
+      /* Room for the page plus scratch space for the keys built below */
+      if (!(new_root_buf= (uchar*) my_alloca((uint) keyinfo->block_length +
+                                             MARIA_MAX_KEY_BUFF)))
+      {
+        my_errno= HA_ERR_OUT_OF_MEM;
+        DBUG_RETURN(-1); /* purecov: inspected */
+      }
+
+      bzero(new_root_buf, share->block_size);
+      _ma_store_keypage_flag(share, new_root_buf, KEYPAGE_FLAG_ISNOD);
+      _ma_store_keynr(share, new_root_buf, keyinfo->key_nr);
+      _ma_store_page_used(share, new_root_buf, share->keypage_header);
+      if ((new_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+          HA_OFFSET_ERROR)
+        goto err;
+      write_lock= page_link->write_lock;
+
+      _ma_page_setup(&page, info, keyinfo, new_root, new_root_buf);
+
+      if (share->now_transactional && _ma_log_new(&page, 1))
+        goto err;
+
+      /* Point to some free space */
+      new_key_buff= new_root_buf + keyinfo->block_length + nod_flag;
+      new_key.keyinfo=     keyinfo;
+      new_key.data=        new_key_buff;
+      new_key.data_length= key->data_length;
+      new_key.ref_length=  key->ref_length;
+      new_key.flag= 0;
+
+      /* First key of the new root: points to the old root */
+      _ma_kpointer(info, new_key_buff - nod_flag, old_root);
+      if (maria_rtree_set_key_mbr(info, &new_key, old_root))
+        goto err;
+      if (maria_rtree_add_key(&new_key, &page, NULL)
+          == -1)
+        goto err;
+      /* Second key: points to the page split off from the old root */
+      _ma_kpointer(info, new_key_buff - nod_flag, new_page);
+      if (maria_rtree_set_key_mbr(info, &new_key, new_page))
+        goto err;
+      if (maria_rtree_add_key(&new_key, &page, NULL)
+          == -1)
+        goto err;
+      if (_ma_write_keypage(&page, write_lock, DFLT_INIT_HITS))
+        goto err;
+      *root= new_root;
+      DBUG_PRINT("rtree", ("new root page: %lu  level: %d  nod_flag: %u",
+                           (ulong) new_root, 0, page.node));
+
+      my_afree(new_root_buf);
+      break;
+err:
+      my_afree(new_root_buf);
+      DBUG_RETURN(-1); /* purecov: inspected */
+    }
+    default:
+    case -1: /* error */
+    {
+      DBUG_ASSERT(0);
+      break;
+    }
+  }
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Insert key into the tree - interface function
+
+  Inserts the key at leaf level. For a transactional table the new root
+  is handed to _ma_write_undo_key_insert(); otherwise it is stored in the
+  share directly.
+
+  RETURN
+    1	Error
+    0	OK
+*/
+
+my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t *root_slot;
+  my_off_t new_root;
+  LSN lsn= LSN_IMPOSSIBLE;
+  int failed;
+  DBUG_ENTER("maria_rtree_insert");
+
+  if (!key)
+    DBUG_RETURN(1);                       /* _ma_sp_make_key failed */
+
+  root_slot= &share->state.key_root[key->keyinfo->key_nr];
+  new_root= *root_slot;
+
+  /* A failed insert returns at once, without the cleanup calls below */
+  if (maria_rtree_insert_level(info, key, -1, &new_root) == -1)
+    DBUG_RETURN(1);
+
+  if (share->now_transactional)
+    failed= _ma_write_undo_key_insert(info, key, root_slot, new_root, &lsn);
+  else
+  {
+    *root_slot= new_root;
+    _ma_fast_unlock_key_del(info);
+    failed= 0;
+  }
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(failed != 0);
+}
+
+
+/*
+  Fill reinsert page buffer
+
+  Appends (page, level) to ReinsertList. The array is grown by
+  REINSERT_BUFFER_INC entries when it is full. If growing fails, the list
+  is left exactly as it was, so that the entries already stored stay
+  reachable for the caller.
+
+  RETURN
+    1	Error
+    0	OK
+*/
+
+static my_bool maria_rtree_fill_reinsert_list(stPageList *ReinsertList,
+                                              my_off_t page, int level)
+{
+  DBUG_ENTER("maria_rtree_fill_reinsert_list");
+  DBUG_PRINT("rtree", ("page: %lu  level: %d", (ulong) page, level));
+  if (ReinsertList->n_pages == ReinsertList->m_pages)
+  {
+    /*
+      Grow through temporaries: pages and m_pages are only updated once
+      the reallocation has succeeded.
+    */
+    uint new_m_pages= ReinsertList->m_pages + REINSERT_BUFFER_INC;
+    stPageLevel *new_pages=
+      (stPageLevel*) my_realloc((uchar*) ReinsertList->pages,
+                                new_m_pages * sizeof(stPageLevel),
+                                MYF(MY_ALLOW_ZERO_PTR));
+    if (!new_pages)
+      DBUG_RETURN(1);                           /* purecov: inspected */
+    ReinsertList->pages= new_pages;
+    ReinsertList->m_pages= new_m_pages;
+  }
+  /* save page to ReinsertList */
+  ReinsertList->pages[ReinsertList->n_pages].offs= page;
+  ReinsertList->pages[ReinsertList->n_pages].level= level;
+  ReinsertList->n_pages++;
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Go down and delete key from the tree
+
+  Searches the page at page_pos and, through the internal keys whose
+  rectangle passes the MBR_WITHIN comparison with the key, the pages
+  below it. The key is removed from the leaf where it is found. On the
+  way back up, the key that points to the changed child is either
+  recalculated, or removed if the child became too small or empty. Too
+  small children are stored in ReinsertList.
+
+  page_size is set to the size of the last page from which a key was
+  removed; the caller one level up uses it to test the page filling.
+
+  RETURN
+    -1	Error
+    0	Deleted
+    1	Not found
+    2	Empty leaf
+*/
+
+static int maria_rtree_delete_req(MARIA_HA *info, const MARIA_KEY *key,
+                                  my_off_t page_pos, uint *page_size,
+                                  stPageList *ReinsertList, int level)
+{
+  uint nod_flag;
+  int res;
+  uchar *page_buf, *last, *k;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("maria_rtree_delete_req");
+
+  if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    my_errno= HA_ERR_OUT_OF_MEM;
+    DBUG_RETURN(-1); /* purecov: inspected */
+  }
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, page_buf, 0))
+    goto err;
+  nod_flag= page.node;
+  DBUG_PRINT("rtree", ("page: %lu  level: %d  nod_flag: %u",
+                       (ulong) page_pos, level, nod_flag));
+
+  k= rt_PAGE_FIRST_KEY(share, page_buf, nod_flag);
+  last= rt_PAGE_END(&page);
+
+  for (;
+       k < last;
+       k= rt_PAGE_NEXT_KEY(share, k, key->data_length, nod_flag))
+  {
+    if (nod_flag)
+    {
+      /* not leaf */
+      if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length,
+                               MBR_WITHIN))
+      {
+        switch ((res= maria_rtree_delete_req(info, key,
+                                             _ma_kpos(nod_flag, k),
+                                             page_size, ReinsertList,
+                                             level + 1)))
+        {
+          case 0: /* deleted */
+          {
+            /* test page filling */
+            if (*page_size + key->data_length >=
+                rt_PAGE_MIN_SIZE(keyinfo->block_length))
+            {
+              /* OK */
+              /* Calculate a new key value (MBR) for the shrunk block. */
+              MARIA_KEY tmp_key;
+              tmp_key.keyinfo= keyinfo;
+              tmp_key.data= k;
+              tmp_key.data_length= key->data_length;
+              tmp_key.ref_length=  key->ref_length;
+              tmp_key.flag= 0;                  /* Safety */
+
+              if (maria_rtree_set_key_mbr(info, &tmp_key,
+                                          _ma_kpos(nod_flag, k)))
+                goto err;
+              if (share->now_transactional &&
+                  _ma_log_change(&page, k, key->data_length,
+                                 KEY_OP_DEBUG_RTREE_SET_KEY))
+                goto err;
+              page_mark_changed(info, &page);
+              if (_ma_write_keypage(&page,
+                                    PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                    DFLT_INIT_HITS))
+                goto err;
+            }
+            else
+            {
+              /*
+                Too small: delete key & add its descendant to reinsert list.
+                Store position and level of the block so that it can be
+                accessed later for inserting the remaining keys.
+              */
+              DBUG_PRINT("rtree", ("too small. move block to reinsert list"));
+              if (maria_rtree_fill_reinsert_list(ReinsertList,
+                                                 _ma_kpos(nod_flag, k),
+                                                 level + 1))
+                goto err;
+              /*
+                Delete the key that references the block. This makes the
+                block disappear from the index. Hence we need to insert
+                its remaining keys later. Note: if the block is a branch
+                block, we do not only remove this block, but the whole
+                subtree. So we need to re-insert its keys on the same
+                level later to reintegrate the subtrees.
+              */
+              if (maria_rtree_delete_key(&page, k, key->data_length))
+                goto err;
+              page_mark_changed(info, &page);
+              if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                    DFLT_INIT_HITS))
+                goto err;
+              *page_size= page.size;
+            }
+
+            goto ok;
+          }
+          case 1: /* not found - continue searching */
+          {
+            break;
+          }
+          case 2: /* vacuous case: last key in the leaf */
+          {
+            /* The child is gone; remove the key that pointed to it */
+            if (maria_rtree_delete_key(&page, k, key->data_length))
+              goto err;
+            page_mark_changed(info, &page);
+            if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                  DFLT_INIT_HITS))
+              goto err;
+            *page_size= page.size;
+            res= 0;
+            goto ok;
+          }
+          default: /* error */
+          case -1:
+          {
+            goto err;
+          }
+        }
+      }
+    }
+    else
+    {
+      /* leaf */
+      if (!maria_rtree_key_cmp(keyinfo->seg, key->data, k, key->data_length,
+                               MBR_EQUAL | MBR_DATA))
+      {
+        page_mark_changed(info, &page);
+        if (maria_rtree_delete_key(&page, k, key->data_length))
+          goto err;
+        *page_size= page.size;
+        if (*page_size == info->s->keypage_header)
+        {
+          /* last key in the leaf */
+          res= 2;
+          if (_ma_dispose(info, page.pos, 0))
+            goto err;
+        }
+        else
+        {
+          res= 0;
+          if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                DFLT_INIT_HITS))
+            goto err;
+        }
+        goto ok;
+      }
+    }
+  }
+  /* No key on this page (or below it) matched */
+  res= 1;
+
+ok:
+  my_afree(page_buf);
+  DBUG_RETURN(res);
+
+err:
+  my_afree(page_buf);
+  DBUG_RETURN(-1); /* purecov: inspected */
+}
+
+
+/*
+  Delete key - interface function
+
+  Removes the key through maria_rtree_real_delete(). For a transactional
+  table the new root is handed to _ma_write_undo_key_delete(); otherwise
+  it is stored in the share directly. The cleanup calls at the end are
+  made on both success and failure.
+
+  RETURN
+    1	Error
+    0	Deleted
+*/
+
+my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key)
+{
+  MARIA_SHARE *share= info->s;
+  my_off_t new_root= share->state.key_root[key->keyinfo->key_nr];
+  LSN lsn= LSN_IMPOSSIBLE;
+  int res;
+  DBUG_ENTER("maria_rtree_delete");
+
+  res= maria_rtree_real_delete(info, key, &new_root);
+  if (!res)
+  {
+    /* The key is gone; record the root the tree has now */
+    if (share->now_transactional)
+      res= _ma_write_undo_key_delete(info, key, new_root, &lsn);
+    else
+      share->state.key_root[key->keyinfo->key_nr]= new_root;
+  }
+
+  _ma_fast_unlock_key_del(info);
+  _ma_unpin_all_pages_and_finalize_row(info, lsn);
+  DBUG_RETURN(res != 0);
+}
+
+
+/*
+  Delete a key from the R-tree and hand back the resulting root
+
+  NOTES
+    The share's key_root is only read here; the new root is returned
+    through *root (HA_OFFSET_ERROR when the tree became empty).
+    Pages put on the reinsert list by maria_rtree_delete_req() are read
+    back, every key on them is reinserted at the recorded level and the
+    page is disposed.  Afterwards a non-leaf root holding a single key is
+    replaced by its only child.
+    The reinsert list and the temporary page buffer are released on all
+    exit paths.
+
+  RETURN
+    0	Deleted
+    1	Error or key not found (my_errno is set for the empty-tree,
+        out-of-memory and not-found cases)
+*/
+
+my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
+                                my_off_t *root)
+{
+  uint page_size;
+  stPageList ReinsertList;
+  my_off_t old_root;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uint key_data_length= key->data_length;
+  DBUG_ENTER("maria_rtree_real_delete");
+
+  if ((old_root= share->state.key_root[keyinfo->key_nr]) ==
+      HA_OFFSET_ERROR)
+  {
+    my_errno= HA_ERR_END_OF_FILE;
+    DBUG_RETURN(1);                           /* purecov: inspected */
+  }
+  DBUG_PRINT("rtree", ("starting deletion at root page: %lu",
+                       (ulong) old_root));
+
+  ReinsertList.pages= NULL;
+  ReinsertList.n_pages= 0;
+  ReinsertList.m_pages= 0;
+
+  switch (maria_rtree_delete_req(info, key, old_root, &page_size,
+                                 &ReinsertList, 0)) {
+  case 2: /* empty */
+  {
+    *root= HA_OFFSET_ERROR;
+    break;
+  }
+  case 0: /* deleted */
+  {
+    uint nod_flag;
+    ulong i;
+    uchar *page_buf;
+    MARIA_PAGE page;
+    MARIA_KEY tmp_key;
+    tmp_key.keyinfo=     key->keyinfo;
+    tmp_key.data_length= key->data_length;
+    tmp_key.ref_length=  key->ref_length;
+    tmp_key.flag=        0;                     /* Safety */
+
+    if (ReinsertList.n_pages)
+    {
+      if (!(page_buf= (uchar*) my_alloca((uint) keyinfo->block_length)))
+      {
+        my_errno= HA_ERR_OUT_OF_MEM;
+        goto err;
+      }
+
+      for (i= 0; i < ReinsertList.n_pages; ++i)
+      {
+        uchar *k, *last;
+        if (_ma_fetch_keypage(&page, info, keyinfo, ReinsertList.pages[i].offs,
+                              PAGECACHE_LOCK_WRITE,
+                              DFLT_INIT_HITS, page_buf, 0))
+        {
+          /* Release the buffer here too, like the other failure paths */
+          my_afree(page_buf);
+          goto err;
+        }
+        nod_flag= page.node;
+        DBUG_PRINT("rtree", ("reinserting keys from "
+                             "page: %lu  level: %d  nod_flag: %u",
+                             (ulong) ReinsertList.pages[i].offs,
+                             ReinsertList.pages[i].level, nod_flag));
+
+        k= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+        last= rt_PAGE_END(&page);
+        for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, key_data_length,
+                                             nod_flag))
+        {
+          int res;
+          tmp_key.data= k;
+          if ((res= maria_rtree_insert_level(info, &tmp_key,
+                                             ReinsertList.pages[i].level,
+                                             root)) == -1)
+          {
+            my_afree(page_buf);
+            goto err;
+          }
+          if (res)
+          {
+            uint j;
+            DBUG_PRINT("rtree", ("root has been split, adjust levels"));
+            /* Current and all remaining pages are now one level deeper */
+            for (j= i; j < ReinsertList.n_pages; j++)
+            {
+              ReinsertList.pages[j].level++;
+              DBUG_PRINT("rtree", ("keys from page: %lu  now level: %d",
+                                   (ulong) ReinsertList.pages[j].offs,
+                                   ReinsertList.pages[j].level));
+            }
+          }
+        }
+        page_mark_changed(info, &page);
+        if (_ma_dispose(info, page.pos, 0))
+        {
+          my_afree(page_buf);
+          goto err;
+        }
+      }
+      my_afree(page_buf);
+      /* ReinsertList.pages is released at the common exits below */
+    }
+
+    /* check for redundant root (not leaf, 1 child) and eliminate */
+    if ((old_root= *root) == HA_OFFSET_ERROR)
+      goto err;
+    if (_ma_fetch_keypage(&page, info, keyinfo, old_root,
+                          PAGECACHE_LOCK_WRITE,
+                          DFLT_INIT_HITS, info->buff, 0))
+      goto err;
+    nod_flag= page.node;
+    if (nod_flag && (page.size == share->keypage_header + key_data_length +
+                     nod_flag))
+    {
+      *root= _ma_kpos(nod_flag,
+                      rt_PAGE_FIRST_KEY(share, info->buff, nod_flag));
+      page_mark_changed(info, &page);
+      if (_ma_dispose(info, page.pos, 0))
+        goto err;
+    }
+    info->update= HA_STATE_DELETED;
+    break;
+  }
+  case 1:                                     /* not found */
+  {
+    my_errno= HA_ERR_KEY_NOT_FOUND;
+    goto err;
+  }
+  case -1:                                    /* error */
+  default:
+    goto err;                                 /* purecov: inspected */
+  }
+  if (ReinsertList.pages)
+    my_free(ReinsertList.pages, MYF(0));
+  DBUG_RETURN(0);
+
+err:
+  /* The reinsert list may have been filled before the failure */
+  if (ReinsertList.pages)
+    my_free(ReinsertList.pages, MYF(0));
+  DBUG_RETURN(1);
+}
+
+
+/*
+  Estimate number of suitable keys in the tree
+
+  Only the root page is examined.  On a leaf root the matching keys are
+  counted directly.  On a node root every child contributes a fraction
+  (1 when the child's volume is zero), and the average fraction is
+  scaled by info->state->records.
+
+  RETURN
+    estimated value, or HA_POS_ERROR if no estimate could be made
+*/
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  my_off_t root;
+  uchar *page_buf, *pos, *page_end;
+  uint nod_flag, key_data_length;
+  uint n_keys= 0;
+  double fraction_sum= 0;
+  ha_rows matches= 0;
+  ha_rows estimate= HA_POS_ERROR;
+
+  if (flag & MBR_DISJOINT)
+    return info->state->records;
+
+  root= share->state.key_root[key->keyinfo->key_nr];
+  if (root == HA_OFFSET_ERROR)
+    return HA_POS_ERROR;
+  page_buf= (uchar*) my_alloca((uint) keyinfo->block_length);
+  if (!page_buf)
+    return HA_POS_ERROR;
+  if (_ma_fetch_keypage(&page, info, keyinfo, root,
+                        PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, page_buf,
+                        0))
+    goto end;
+
+  nod_flag= page.node;
+  key_data_length= key->data_length;
+  page_end= rt_PAGE_END(&page);
+
+  for (pos= rt_PAGE_FIRST_KEY(share, page.buff, nod_flag);
+       pos < page_end;
+       pos= rt_PAGE_NEXT_KEY(share, pos, key_data_length, nod_flag), n_keys++)
+  {
+    double child_area;
+
+    if (!nod_flag)
+    {
+      /* Leaf root: count exact matches */
+      if (!maria_rtree_key_cmp(keyinfo->seg, key->data, pos, key_data_length,
+                               flag))
+        ++matches;
+      continue;
+    }
+
+    child_area= maria_rtree_rect_volume(keyinfo->seg, pos, key_data_length);
+
+    /* Comparing child_area with 0 should be safe, even for doubles */
+    if (flag & (MBR_CONTAIN | MBR_INTERSECT))
+    {
+      if (child_area == 0)
+        fraction_sum+= 1;
+      else
+        fraction_sum+= maria_rtree_overlapping_area(keyinfo->seg, key->data,
+                                                    pos, key_data_length) /
+                       child_area;
+    }
+    else if (flag & (MBR_WITHIN | MBR_EQUAL))
+    {
+      if (!maria_rtree_key_cmp(keyinfo->seg, key->data, pos, key_data_length,
+                               MBR_WITHIN))
+      {
+        if (child_area == 0)
+          fraction_sum+= 1;
+        else
+          fraction_sum+= (maria_rtree_rect_volume(keyinfo->seg, key->data,
+                                                  key_data_length) /
+                          child_area);
+      }
+    }
+    else
+      goto end;                                 /* Unsupported search flag */
+  }
+
+  if (!nod_flag)
+    estimate= matches;
+  else if (n_keys)
+    estimate= (ha_rows) (fraction_sum / n_keys * info->state->records);
+  else
+    estimate= HA_POS_ERROR;
+
+end:
+  my_afree(page_buf);
+  return estimate;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_index.h b/storage/maria/ma_rt_index.h
new file mode 100644
index 00000000000..dacaa4389b7
--- /dev/null
+++ b/storage/maria/ma_rt_index.h
@@ -0,0 +1,46 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _rt_index_h
+#define _rt_index_h
+
+#ifdef HAVE_RTREE_KEYS
+
+/*
+  Helpers for walking the keys of an R-tree page.
+  Every macro argument is parenthesized so the expansion stays correct
+  when a caller passes a compound expression.
+*/
+#define rt_PAGE_FIRST_KEY(share, page, nod_flag) \
+  ((page) + (share)->keypage_header + (nod_flag))
+#define rt_PAGE_NEXT_KEY(share, key, key_length, nod_flag) \
+  ((key) + (key_length) + \
+   ((nod_flag) ? (nod_flag) : (share)->base.rec_reflength))
+#define rt_PAGE_END(page) ((page)->buff + (page)->size)
+
+/* One third of the block once the page checksum is subtracted */
+#define rt_PAGE_MIN_SIZE(block_length) \
+  ((uint) ((block_length) - KEYPAGE_CHECKSUM_SIZE) / 3)
+
+my_bool maria_rtree_insert(MARIA_HA *info, MARIA_KEY *key);
+my_bool maria_rtree_delete(MARIA_HA *info, MARIA_KEY *key);
+int maria_rtree_insert_level(MARIA_HA *info, MARIA_KEY *key,
+                             int ins_level, my_off_t *root);
+my_bool maria_rtree_real_delete(MARIA_HA *info, MARIA_KEY *key,
+                                my_off_t *root);
+int maria_rtree_find_first(MARIA_HA *info, MARIA_KEY *key, uint search_flag);
+int maria_rtree_find_next(MARIA_HA *info, uint keynr, uint32 search_flag);
+
+int maria_rtree_get_first(MARIA_HA *info, uint keynr, uint key_length);
+int maria_rtree_get_next(MARIA_HA *info, uint keynr, uint key_length);
+
+ha_rows maria_rtree_estimate(MARIA_HA *info, MARIA_KEY *key, uint32 flag);
+
+int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
+                           my_off_t *new_page_offs);
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_index_h */
diff --git a/storage/maria/ma_rt_key.c b/storage/maria/ma_rt_key.c
new file mode 100644
index 00000000000..fa173605cd3
--- /dev/null
+++ b/storage/maria/ma_rt_key.c
@@ -0,0 +1,120 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+/*
+  Add key to the page
+
+  The key is appended at the current end of the page when it fits into
+  block_length - KEYPAGE_CHECKSUM_SIZE; otherwise the page is handed to
+  maria_rtree_split_page().
+
+  RESULT VALUES
+    -1 	Error (logging the add failed, or the split failed)
+    0 	Not split
+    1	Split
+*/
+
+int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page,
+                        my_off_t *new_page)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint page_size= page->size;
+  uint nod_flag=  page->node;
+  uchar *key_pos= rt_PAGE_END(page);
+  uint tot_key_length= key->data_length + key->ref_length + nod_flag;
+  DBUG_ENTER("maria_rtree_add_key");
+
+  /*
+    The fit test uses the length including ref_length, even for node
+    pages where ref_length is subtracted again below.
+  */
+  if (page_size + tot_key_length <=
+      (uint)(key->keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE))
+  {
+    /* split won't be necessary */
+    if (nod_flag)
+    {
+      DBUG_ASSERT(_ma_kpos(nod_flag, key->data) <
+                  info->state->key_file_length);
+      /* We don't store reference to row on nod pages for rtree index */
+      tot_key_length-= key->ref_length;
+    }
+    /* save key; the copy starts nod_flag bytes before key->data */
+    memcpy(key_pos, key->data - nod_flag, tot_key_length);
+    page->size+= tot_key_length;
+    page_store_size(share, page);
+    /* Only transactional shares log the change */
+    if (share->now_transactional &&
+        _ma_log_add(page, key_pos - page->buff,
+                    key_pos, tot_key_length, tot_key_length, 0,
+                    KEY_OP_DEBUG_LOG_ADD_1))
+      DBUG_RETURN(-1);
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(maria_rtree_split_page(key, page, new_page) ? -1 : 1);
+}
+
+
+/*
+  Delete key from the page
+
+  Notes
+  key_length is only the data part of the key.  On a leaf page
+  (page->node == 0) share->base.rec_reflength is added to it; on a node
+  page the page->node bytes in front of the key are removed as well.
+
+  RETURN
+    0	Key removed
+    -1	Logging the delete failed (transactional share only)
+*/
+
+int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint key_length_with_nod_flag;
+  uchar *key_start;
+
+  /* The removed region starts page->node bytes before the key data */
+  key_start= key - page->node;
+  if (!page->node)
+    key_length+= share->base.rec_reflength;
+
+  /* Move everything that follows the key down over the removed region */
+  memmove(key_start, key + key_length, page->size - key_length -
+	  (key - page->buff));
+  key_length_with_nod_flag= key_length + page->node;
+  page->size-= key_length_with_nod_flag;
+  page_store_size(share, page);
+  if (share->now_transactional &&
+      _ma_log_delete(page, key_start, 0, key_length_with_nod_flag,
+                     0, KEY_OP_DEBUG_LOG_DEL_CHANGE_RT))
+    return -1;
+  return 0;
+}
+
+
+/*
+  Calculate and store key MBR into *key.
+
+  The child page is fetched into info->buff and then passed to
+  maria_rtree_page_mbr() together with key->data.
+
+  RETURN
+    -1	The child page could not be fetched
+    else	Return value of maria_rtree_page_mbr()
+*/
+
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key,
+                            my_off_t child_page)
+{
+  MARIA_PAGE child;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  int res= -1;
+  DBUG_ENTER("maria_rtree_set_key_mbr");
+
+  if (!_ma_fetch_keypage(&child, info, keyinfo, child_page,
+                         PAGECACHE_LOCK_LEFT_UNLOCKED,
+                         DFLT_INIT_HITS, info->buff, 0))
+    res= maria_rtree_page_mbr(keyinfo->seg, &child, key->data,
+                              key->data_length);
+
+  DBUG_RETURN(res);
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_key.h b/storage/maria/ma_rt_key.h
new file mode 100644
index 00000000000..948809f3d38
--- /dev/null
+++ b/storage/maria/ma_rt_key.h
@@ -0,0 +1,31 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Ramil Kalimullin, who has a shared copyright to this code */
+
+#ifndef _rt_key_h
+#define _rt_key_h
+
+#ifdef HAVE_RTREE_KEYS
+
+int maria_rtree_add_key(const MARIA_KEY *key, MARIA_PAGE *page,
+                        my_off_t *new_page);
+int maria_rtree_delete_key(MARIA_PAGE *page, uchar *key, uint key_length);
+int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEY *key,
+                            my_off_t child_page);
+
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_key_h */
diff --git a/storage/maria/ma_rt_mbr.c b/storage/maria/ma_rt_mbr.c
new file mode 100644
index 00000000000..b3e2b0ceab8
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.c
@@ -0,0 +1,818 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_mbr.h"
+
+/*
+  One-dimensional interval tests.  Each macro is true when the named
+  relation does NOT hold for the intervals [amin, amax] and [bmin, bmax]
+  (so DISJOINT_CMP is true when the intervals overlap).  RT_CMP turns a
+  true result into "return 1".
+*/
+#define INTERSECT_CMP(amin, amax, bmin, bmax) ((amin >  bmax) || (bmin >  amax))
+#define CONTAIN_CMP(amin, amax, bmin, bmax) ((bmin > amin)  || (bmax <  amax))
+#define WITHIN_CMP(amin, amax, bmin, bmax) ((amin > bmin)  || (amax <  bmax))
+#define DISJOINT_CMP(amin, amax, bmin, bmax) ((amin <= bmax) && (bmin <= amax))
+#define EQUAL_CMP(amin, amax, bmin, bmax) ((amin != bmin) || (amax != bmax))
+
+#define FCMP(A, B) ((int)(A) - (int)(B))
+#define p_inc(A, B, X)  {A += X; B += X;}
+
+/*
+  Apply the first comparison selected in nextflag (tested in the order
+  INTERSECT, CONTAIN, WITHIN, EQUAL, DISJOINT) to the locals
+  amin/amax/bmin/bmax of the enclosing scope.
+  Contains "return 1", so it can only be expanded inside a function
+  returning an integer.
+*/
+#define RT_CMP(nextflag) \
+  if (nextflag & MBR_INTERSECT) \
+  { \
+    if (INTERSECT_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_CONTAIN) \
+  { \
+    if (CONTAIN_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_WITHIN) \
+  { \
+    if (WITHIN_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_EQUAL)  \
+  { \
+    if (EQUAL_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  } \
+  else if (nextflag & MBR_DISJOINT) \
+  { \
+    if (DISJOINT_CMP(amin, amax, bmin, bmax)) \
+      return 1; \
+  }\
+  else /* if unknown comparison operator */ \
+  { \
+    DBUG_ASSERT(0); \
+  }
+
+/*
+  Decode min at a/b and max at a+len/b+len with a value-returning reader
+  (korr_func) and compare them.  Uses the pointers a and b of the
+  enclosing scope.
+*/
+#define RT_CMP_KORR(type, korr_func, len, nextflag) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  RT_CMP(nextflag); \
+}
+
+/* Same as RT_CMP_KORR for readers that store into their first argument */
+#define RT_CMP_GET(type, get_func, len, nextflag) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  RT_CMP(nextflag); \
+}
+
+/*
+ Compares two keys a and b depending on nextflag
+ nextflag can contain these flags:
+   MBR_INTERSECT(a,b)  a overlaps b
+   MBR_CONTAIN(a,b)    a contains b
+   MBR_DISJOINT(a,b)   a disjoint b
+   MBR_WITHIN(a,b)     a within   b
+   MBR_EQUAL(a,b)      All coordinates of MBRs are equal
+   MBR_DATA(a,b)       Data reference is the same
+ Note that the parameter list takes b before a.
+ Each dimension uses two key segments (keyseg advances by 2) and
+ consumes keyseg->length * 2 bytes of both keys (min followed by max).
+ Returns 0 on success.
+ Returns 1 when the selected relation fails for some dimension or the
+ key type is not supported; with MBR_DATA a byte difference of the
+ trailing data may be returned instead.
+*/
+
+int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *b, const uchar *a,
+                        uint key_length, uint32 nextflag)
+{
+  for (; (int) key_length > 0; keyseg += 2 )
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_CMP_KORR(int8, mi_sint1korr, 1, nextflag);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_CMP_KORR(uint8, mi_uint1korr, 1, nextflag);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_CMP_KORR(int16, mi_sint2korr, 2, nextflag);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_CMP_KORR(uint16, mi_uint2korr, 2, nextflag);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_CMP_KORR(int32, mi_sint3korr, 3, nextflag);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_CMP_KORR(uint32, mi_uint3korr, 3, nextflag);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_CMP_KORR(int32, mi_sint4korr, 4, nextflag);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_CMP_KORR(uint32, mi_uint4korr, 4, nextflag);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_CMP_KORR(longlong, mi_sint8korr, 8, nextflag)
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_CMP_KORR(ulonglong, mi_uint8korr, 8, nextflag)
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      /* The following should be safe, even if we compare doubles */
+      RT_CMP_GET(float, mi_float4get, 4, nextflag);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_CMP_GET(double, mi_float8get, 8, nextflag);
+      break;
+    case HA_KEYTYPE_END:
+      goto end;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+
+end:
+  if (nextflag & MBR_DATA)
+  {
+    /*
+      Compare the keyseg->length bytes that follow the coordinates.
+      NOTE(review): the do/while reads at least one byte, so this
+      presumably relies on keyseg->length being non-zero here - confirm.
+    */
+    const uchar *end= a + keyseg->length;
+    do
+    {
+      if (*a++ != *b++)
+        return FCMP(a[-1], b[-1]);
+    } while (a != end);
+  }
+  return 0;
+}
+
+/*
+  Multiply res by the extent (max - min) of one dimension.  Both values
+  are converted with 'cast' before the subtraction.  Uses the locals
+  a and res of the enclosing scope.
+*/
+#define RT_VOL_KORR(type, korr_func, len, cast) \
+{ \
+  type amin, amax; \
+  amin= korr_func(a); \
+  amax= korr_func(a+len); \
+  res *= (cast(amax) - cast(amin)); \
+}
+
+/* Same as RT_VOL_KORR for readers that store into their first argument */
+#define RT_VOL_GET(type, get_func, len, cast) \
+{ \
+  type amin, amax; \
+  get_func(amin, a); \
+  get_func(amax, a+len); \
+  res *= (cast(amax) - cast(amin)); \
+}
+
+/*
+ Calculates rectangle volume
+
+ The volume is the product of (max - min) over all dimensions of key a.
+ Returns -1 if a key segment has an unsupported type.
+*/
+double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar *a, uint key_length)
+{
+  double res= 1;
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_VOL_KORR(int8, mi_sint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_VOL_KORR(uint8, mi_uint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_VOL_KORR(int16, mi_sint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_VOL_KORR(uint16, mi_uint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_VOL_KORR(int32, mi_sint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_VOL_KORR(uint32, mi_uint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_VOL_KORR(int32, mi_sint4korr, 4, (double));
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_VOL_KORR(uint32, mi_uint4korr, 4, (double));
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_VOL_KORR(longlong, mi_sint8korr, 8, (double));
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      /* Read as signed, then converted through ulonglong2double */
+      RT_VOL_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_VOL_GET(float, mi_float4get, 4, (double));
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_VOL_GET(double, mi_float8get, 8, (double));
+      break;
+    case HA_KEYTYPE_END:
+      /* Terminates the loop once the subtraction below has been done */
+      key_length= 0;
+      break;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+  }
+  return res;
+}
+
+/*
+  Append min and max of one dimension, converted with 'cast', to *res
+  and advance res by two.  Uses the locals a and res of the enclosing
+  scope.
+*/
+#define RT_D_MBR_KORR(type, korr_func, len, cast) \
+{ \
+  type amin, amax; \
+  amin= korr_func(a); \
+  amax= korr_func(a+len); \
+  *res++= cast(amin); \
+  *res++= cast(amax); \
+}
+
+/* Same as RT_D_MBR_KORR for readers that store into their first argument */
+#define RT_D_MBR_GET(type, get_func, len, cast) \
+{ \
+  type amin, amax; \
+  get_func(amin, a); \
+  get_func(amax, a+len); \
+  *res++= cast(amin); \
+  *res++= cast(amax); \
+}
+
+
+/*
+  Creates an MBR as an array of doubles.
+  Fills *res with two doubles (min, max) per dimension; the caller must
+  provide room for them.
+
+  RETURN
+    0	ok
+    1	A key segment has an unsupported type
+*/
+
+int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a,
+                      uint key_length, double *res)
+{
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_D_MBR_KORR(int8, mi_sint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_D_MBR_KORR(uint8, mi_uint1korr, 1, (double));
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_D_MBR_KORR(int16, mi_sint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_D_MBR_KORR(uint16, mi_uint2korr, 2, (double));
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_D_MBR_KORR(int32, mi_sint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_D_MBR_KORR(uint32, mi_uint3korr, 3, (double));
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_D_MBR_KORR(int32, mi_sint4korr, 4, (double));
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_D_MBR_KORR(uint32, mi_uint4korr, 4, (double));
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_D_MBR_KORR(longlong, mi_sint8korr, 8, (double));
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      /* Read as signed, then converted through ulonglong2double */
+      RT_D_MBR_KORR(longlong, mi_sint8korr, 8, ulonglong2double);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_D_MBR_GET(float, mi_float4get, 4, (double));
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_D_MBR_GET(double, mi_float8get, 8, (double));
+      break;
+    case HA_KEYTYPE_END:
+      /* Terminates the loop once the subtraction below has been done */
+      key_length= 0;
+      break;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+  }
+  return 0;
+}
+
+/*
+  Store min(amin, bmin) at c and max(amax, bmax) at c+len for one
+  dimension.  Uses the pointers a, b and c of the enclosing scope.
+*/
+#define RT_COMB_KORR(type, korr_func, store_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  amin= min(amin, bmin); \
+  amax= max(amax, bmax); \
+  store_func(c, amin); \
+  store_func(c+len, amax); \
+}
+
+/* Same as RT_COMB_KORR for readers that store into their first argument */
+#define RT_COMB_GET(type, get_func, store_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  amin= min(amin, bmin); \
+  amax= max(amax, bmax); \
+  store_func(c, amin); \
+  store_func(c+len, amax); \
+}
+
+/*
+  Creates common minimal bounding rectangle
+  for two input rectangles a and b
+  Result is written to c
+
+  RETURN
+    0	ok
+    1	A key segment has an unsupported type
+*/
+
+int maria_rtree_combine_rect(const HA_KEYSEG *keyseg, const uchar* a,
+                             const uchar* b, uchar* c,
+                             uint key_length)
+{
+  for ( ; (int) key_length > 0 ; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_COMB_KORR(int8, mi_sint1korr, mi_int1store, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_COMB_KORR(uint8, mi_uint1korr, mi_int1store, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_COMB_KORR(int16, mi_sint2korr, mi_int2store, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_COMB_KORR(uint16, mi_uint2korr, mi_int2store, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_COMB_KORR(int32, mi_sint3korr, mi_int3store, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_COMB_KORR(uint32, mi_uint3korr, mi_int3store, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_COMB_KORR(int32, mi_sint4korr, mi_int4store, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_COMB_KORR(uint32, mi_uint4korr, mi_int4store, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_COMB_KORR(longlong, mi_sint8korr, mi_int8store, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_COMB_KORR(ulonglong, mi_uint8korr, mi_int8store, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_COMB_GET(float, mi_float4get, mi_float4store, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_COMB_GET(double, mi_float8get, mi_float8store, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return 0;
+    default:
+      return 1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+    c+= keyseg_length;
+  }
+  return 0;
+}
+
+
+/*
+  Intersect one dimension of a and b and multiply res by the length of
+  the intersection; makes the enclosing function return 0 when the
+  intersection is empty.  The length is computed in double so that wide
+  signed extents (e.g. INT_MIN..INT_MAX) cannot overflow the integer
+  type.  Uses the locals a, b and res of the enclosing scope.
+*/
+#define RT_OVL_AREA_KORR(type, korr_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(a); \
+  bmin= korr_func(b); \
+  amax= korr_func(a+len); \
+  bmax= korr_func(b+len); \
+  amin= max(amin, bmin); \
+  amax= min(amax, bmax); \
+  if (amin >= amax) \
+    return 0; \
+  res *= ((double) amax - (double) amin); \
+}
+
+/* Same as RT_OVL_AREA_KORR for floating point readers */
+#define RT_OVL_AREA_GET(type, get_func, len) \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, a); \
+  get_func(bmin, b); \
+  get_func(amax, a+len); \
+  get_func(bmax, b+len); \
+  amin= max(amin, bmin); \
+  amax= min(amax, bmax); \
+  if (amin >= amax)  \
+    return 0; \
+  res *= amax - amin; \
+}
+
+/*
+Calculates overlapping area of two MBRs a & b
+
+RETURN
+  Product of the per-dimension intersection lengths
+  0   The MBRs do not overlap in some dimension
+  -1  A key segment has an unsupported type
+*/
+double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+                             uint key_length)
+{
+  double res= 1;
+  for (; (int) key_length > 0 ; keyseg += 2)
+  {
+    uint32 keyseg_length;
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_OVL_AREA_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_OVL_AREA_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_OVL_AREA_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_OVL_AREA_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_OVL_AREA_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_OVL_AREA_KORR(uint32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_OVL_AREA_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_OVL_AREA_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      /*
+        NOTE(review): unsigned values are decoded as signed here, so
+        min/max are ordered as signed numbers - confirm this is intended.
+      */
+      RT_OVL_AREA_KORR(longlong, mi_sint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_OVL_AREA_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_OVL_AREA_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return res;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+  return res;
+}
+
+/*
+  Multiply a_area by the extent of a and loc_ab_area by the extent of
+  the combined interval of a and b for one dimension.  Uses the locals
+  a, b, a_area and loc_ab_area of the enclosing scope.
+*/
+#define RT_AREA_INC_KORR(type, korr_func, len) \
+{ \
+   type amin, amax, bmin, bmax; \
+   amin= korr_func(a); \
+   bmin= korr_func(b); \
+   amax= korr_func(a+len); \
+   bmax= korr_func(b+len); \
+   a_area *= (((double)amax) - ((double)amin)); \
+   loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/* Same as RT_AREA_INC_KORR for readers that store into their first argument */
+#define RT_AREA_INC_GET(type, get_func, len)\
+{\
+   type amin, amax, bmin, bmax; \
+   get_func(amin, a); \
+   get_func(bmin, b); \
+   get_func(amax, a+len); \
+   get_func(bmax, b+len); \
+   a_area *= (((double)amax) - ((double)amin)); \
+   loc_ab_area *= ((double)max(amax, bmax) - (double)min(amin, bmin)); \
+}
+
+/*
+  Calculates MBR_AREA(a+b) - MBR_AREA(a)
+  Fills *ab_area.
+  Note: when 'a' and 'b' objects are far from each other,
+  the area increase can be really big, so this function
+  can return 'inf' as a result.
+
+  Returns -1 when a key segment is nullable (null_bit set) or has an
+  unsupported type; in that case *ab_area is left at 1.0.
+*/
+
+double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a,
+                                 const uchar *b,
+                                 uint key_length, double *ab_area)
+{
+  double a_area= 1.0;
+  double loc_ab_area= 1.0;
+
+  *ab_area= 1.0;
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+
+    if (keyseg->null_bit)                       /* Handle NULL part */
+      return -1;
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_AREA_INC_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_AREA_INC_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_AREA_INC_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_AREA_INC_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_AREA_INC_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      /* A 3-byte unsigned value, held in an int32 */
+      RT_AREA_INC_KORR(int32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_AREA_INC_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_AREA_INC_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      /*
+        NOTE(review): unsigned values are decoded as signed here, so
+        min/max are ordered as signed numbers - confirm this is intended.
+      */
+      RT_AREA_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_AREA_INC_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_AREA_INC_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      goto safe_end;
+    default:
+      return -1;
+    }
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+safe_end:
+  *ab_area= loc_ab_area;
+  return loc_ab_area - a_area;
+}
+
+/*
+  Adds one dimension's extent to the perimeter sums, for integer key types
+  decoded with korr_func: a_perim gets the extent of 'a', *ab_perim the
+  extent of the union of 'a' and 'b'.
+  Uses the expanding function's a, b, a_perim and ab_perim variables.
+*/
+#define RT_PERIM_INC_KORR(type, korr_func, len) \
+{ \
+   type a_lo= korr_func(a); \
+   type a_hi= korr_func(a+len); \
+   type b_lo= korr_func(b); \
+   type b_hi= korr_func(b+len); \
+   a_perim+= (double) a_hi - (double) a_lo; \
+   *ab_perim+= (double) max(a_hi, b_hi) - (double) min(a_lo, b_lo); \
+}
+
+/*
+  Same as RT_PERIM_INC_KORR, for key types that are fetched with a
+  get_func(variable, pointer) style accessor (used for float and double).
+  Uses the expanding function's a, b, a_perim and ab_perim variables.
+*/
+#define RT_PERIM_INC_GET(type, get_func, len)\
+{\
+   type a_lo, a_hi, b_lo, b_hi; \
+   get_func(a_lo, a); \
+   get_func(a_hi, a+len); \
+   get_func(b_lo, b); \
+   get_func(b_hi, b+len); \
+   a_perim+= (double) a_hi - (double) a_lo; \
+   *ab_perim+= (double) max(a_hi, b_hi) - (double) min(a_lo, b_lo); \
+}
+
+/*
+  Calculates MBR_PERIMETER(a+b) - MBR_PERIMETER(a).
+
+  The "perimeter" here is the sum over all dimensions of (max - min), as
+  accumulated by the RT_PERIM_INC_* macros.
+
+  keyseg      Key segments of the index; the loop advances two segments
+              per iteration
+  a, b        Packed keys; each iteration consumes 2 * keyseg->length bytes
+              of both
+  key_length  Number of key bytes to process
+  ab_perim    OUT: perimeter of the combined rectangle. It is accumulated
+              in place, so it holds a partial sum when -1 is returned.
+
+  Returns the perimeter increase, or -1 if a segment is NULL-able or has an
+  unsupported key type.
+*/
+double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+                                      uint key_length, double *ab_perim)
+{
+  double a_perim= 0.0;
+
+  *ab_perim= 0.0;
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    uint32 keyseg_length;
+
+    if (keyseg->null_bit)                       /* Handle NULL part */
+      return -1;
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_PERIM_INC_KORR(int8, mi_sint1korr, 1);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_PERIM_INC_KORR(uint8, mi_uint1korr, 1);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_PERIM_INC_KORR(int16, mi_sint2korr, 2);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_PERIM_INC_KORR(uint16, mi_uint2korr, 2);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_PERIM_INC_KORR(int32, mi_sint3korr, 3);
+      break;
+    case HA_KEYTYPE_UINT24:
+      /* Same type as maria_rtree_page_mbr() uses; values are unchanged */
+      RT_PERIM_INC_KORR(uint32, mi_uint3korr, 3);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_PERIM_INC_KORR(int32, mi_sint4korr, 4);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_PERIM_INC_KORR(uint32, mi_uint4korr, 4);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_PERIM_INC_KORR(longlong, mi_sint8korr, 8);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      /*
+        Must be decoded as unsigned (as maria_rtree_page_mbr() does):
+        a signed decode turns values >= 2^63 into negative coordinates.
+      */
+      RT_PERIM_INC_KORR(ulonglong, mi_uint8korr, 8);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_PERIM_INC_GET(float, mi_float4get, 4);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_PERIM_INC_GET(double, mi_float8get, 8);
+      break;
+    case HA_KEYTYPE_END:
+      return *ab_perim - a_perim;
+    default:
+      return -1;
+    }
+    /* Step over this dimension's two coordinates in both keys */
+    keyseg_length= keyseg->length * 2;
+    key_length-= keyseg_length;
+    a+= keyseg_length;
+    b+= keyseg_length;
+  }
+  return *ab_perim - a_perim;
+}
+
+
+/*
+  Computes, for one dimension with an integer key type, the smallest minimum
+  and the largest maximum over the keys of a page, and writes both to 'to'
+  with store_func.
+
+  Works on variables of the expanding function: k (current key), last (loop
+  limit), inc (byte offset of this dimension inside a key), k_len and
+  nod_flag (both passed on to rt_PAGE_NEXT_KEY). The first key is read
+  before k is compared with last. On exit 'to' and inc have each advanced
+  by 2 * len.
+*/
+#define RT_PAGE_MBR_KORR(share, type, korr_func, store_func, len, to)    \
+{ \
+  type amin, amax, bmin, bmax; \
+  amin= korr_func(k + inc); \
+  amax= korr_func(k + inc + len); \
+  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag);            \
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag))       \
+{ \
+    bmin= korr_func(k + inc); \
+    bmax= korr_func(k + inc + len); \
+    if (amin > bmin) \
+      amin= bmin; \
+    if (amax < bmax) \
+      amax= bmax; \
+} \
+  store_func(to, amin); \
+  to+= len; \
+  store_func(to, amax); \
+  to += len;           \
+  inc += 2 * len; \
+}
+
+/*
+  Same as RT_PAGE_MBR_KORR, for key types fetched with a
+  get_func(variable, pointer) style accessor (used for float and double).
+
+  Works on the expanding function's k, last, inc, k_len and nod_flag.
+  The first key is read before k is compared with last. On exit 'to' and
+  inc have each advanced by 2 * len.
+*/
+#define RT_PAGE_MBR_GET(share, type, get_func, store_func, len, to)      \
+{ \
+  type amin, amax, bmin, bmax; \
+  get_func(amin, k + inc); \
+  get_func(amax, k + inc + len); \
+  k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag);            \
+  for (; k < last; k= rt_PAGE_NEXT_KEY(share, k, k_len, nod_flag))       \
+{ \
+    get_func(bmin, k + inc); \
+    get_func(bmax, k + inc + len); \
+    if (amin > bmin) \
+      amin= bmin; \
+    if (amax < bmax) \
+      amax= bmax; \
+} \
+  store_func(to, amin); \
+  to+= len; \
+  store_func(to, amax); \
+  to+= len; \
+  inc += 2 * len; \
+}
+
+/*
+  Calculates key page total MBR= MBR(key1) + MBR(key2) + ...
+  Stores into *to.
+
+  keyseg      Key segments of the index; two segments are consumed per
+              loop iteration
+  page        Page whose keys are scanned
+  to          OUT: receives min and max of every dimension, in key format
+  key_length  Length of one key; also passed to rt_PAGE_NEXT_KEY as k_len
+
+  Returns 0 on success (also when HA_KEYTYPE_END is met), 1 if a segment is
+  NULL-able or has an unsupported key type.
+
+  The RT_PAGE_MBR_* macros use the locals k, last, inc, k_len and nod_flag
+  directly, so these names must not be changed independently.
+*/
+int maria_rtree_page_mbr(const HA_KEYSEG *keyseg,
+                         MARIA_PAGE *page,
+                         uchar *to, uint key_length)
+{
+  MARIA_HA *info= page->info;
+  MARIA_SHARE *share= info->s;
+  uint inc= 0;                      /* offset of current dimension in a key */
+  uint k_len= key_length;
+  uint nod_flag= page->node;
+  const uchar *k;
+  const uchar *last= rt_PAGE_END(page);
+
+  for (; (int)key_length > 0; keyseg += 2)
+  {
+    key_length -= keyseg->length * 2;
+
+    /* Handle NULL part */
+    if (keyseg->null_bit)
+    {
+      return 1;
+    }
+
+    /* Every dimension rescans the page from its first key */
+    k= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+
+    switch ((enum ha_base_keytype) keyseg->type) {
+    case HA_KEYTYPE_INT8:
+      RT_PAGE_MBR_KORR(share, int8, mi_sint1korr, mi_int1store, 1, to);
+      break;
+    case HA_KEYTYPE_BINARY:
+      RT_PAGE_MBR_KORR(share, uint8, mi_uint1korr, mi_int1store, 1, to);
+      break;
+    case HA_KEYTYPE_SHORT_INT:
+      RT_PAGE_MBR_KORR(share, int16, mi_sint2korr, mi_int2store, 2, to);
+      break;
+    case HA_KEYTYPE_USHORT_INT:
+      RT_PAGE_MBR_KORR(share, uint16, mi_uint2korr, mi_int2store, 2, to);
+      break;
+    case HA_KEYTYPE_INT24:
+      RT_PAGE_MBR_KORR(share, int32, mi_sint3korr, mi_int3store, 3, to);
+      break;
+    case HA_KEYTYPE_UINT24:
+      RT_PAGE_MBR_KORR(share, uint32, mi_uint3korr, mi_int3store, 3, to);
+      break;
+    case HA_KEYTYPE_LONG_INT:
+      RT_PAGE_MBR_KORR(share, int32, mi_sint4korr, mi_int4store, 4, to);
+      break;
+    case HA_KEYTYPE_ULONG_INT:
+      RT_PAGE_MBR_KORR(share, uint32, mi_uint4korr, mi_int4store, 4, to);
+      break;
+#ifdef HAVE_LONG_LONG
+    case HA_KEYTYPE_LONGLONG:
+      RT_PAGE_MBR_KORR(share, longlong, mi_sint8korr, mi_int8store, 8, to);
+      break;
+    case HA_KEYTYPE_ULONGLONG:
+      RT_PAGE_MBR_KORR(share, ulonglong, mi_uint8korr, mi_int8store, 8, to);
+      break;
+#endif
+    case HA_KEYTYPE_FLOAT:
+      RT_PAGE_MBR_GET(share, float, mi_float4get, mi_float4store, 4, to);
+      break;
+    case HA_KEYTYPE_DOUBLE:
+      RT_PAGE_MBR_GET(share, double, mi_float8get, mi_float8store, 8, to);
+      break;
+    case HA_KEYTYPE_END:
+      return 0;
+    default:
+      return 1;
+    }
+  }
+  return 0;
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_mbr.h b/storage/maria/ma_rt_mbr.h
new file mode 100644
index 00000000000..8fcd3d37b99
--- /dev/null
+++ b/storage/maria/ma_rt_mbr.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _rt_mbr_h
+#define _rt_mbr_h
+
+#ifdef HAVE_RTREE_KEYS
+
+/* Helpers working on the MBRs stored in packed R-tree keys */
+int maria_rtree_key_cmp(HA_KEYSEG *keyseg, const uchar *a, const uchar *b,
+                        uint key_length, uint32 nextflag);
+int maria_rtree_combine_rect(const HA_KEYSEG *keyseg,
+                             const uchar *, const uchar *, uchar*,
+                             uint key_length);
+double maria_rtree_rect_volume(HA_KEYSEG *keyseg, uchar*, uint key_length);
+/*
+  NOTE(review): ma_rt_split.c passes an array of 2 * n_dim doubles as 'res';
+  presumably it receives the key's coordinates as doubles - confirm in
+  ma_rt_mbr.c.
+*/
+int maria_rtree_d_mbr(const HA_KEYSEG *keyseg, const uchar *a,
+                      uint key_length, double *res);
+double maria_rtree_overlapping_area(HA_KEYSEG *keyseg, uchar *a, uchar *b,
+                                    uint key_length);
+/*
+  Returns MBR_AREA(a+b) - MBR_AREA(a) and stores the combined area in
+  *ab_area; returns -1 for a NULL-able or unsupported key segment.
+*/
+double maria_rtree_area_increase(const HA_KEYSEG *keyseg, const uchar *a,
+                                 const uchar *b,
+                                 uint key_length, double *ab_area);
+/*
+  Returns MBR_PERIMETER(a+b) - MBR_PERIMETER(a) and accumulates the combined
+  perimeter in *ab_perim; returns -1 for a NULL-able or unsupported segment.
+*/
+double maria_rtree_perimeter_increase(HA_KEYSEG *keyseg, uchar* a, uchar* b,
+                                      uint key_length, double *ab_perim);
+/*
+  Stores the MBR of all keys of 'page' into 'key'; returns 0 on success and
+  1 for a NULL-able or unsupported key segment.
+*/
+int maria_rtree_page_mbr(const HA_KEYSEG *keyseg, MARIA_PAGE *page,
+                         uchar *key, uint key_length);
+#endif /*HAVE_RTREE_KEYS*/
+#endif /* _rt_mbr_h */
diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c
new file mode 100644
index 00000000000..856edc60490
--- /dev/null
+++ b/storage/maria/ma_rt_split.c
@@ -0,0 +1,554 @@
+/* Copyright (C) 2006 MySQL AB & Alexey Botchkov & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+#include "ma_rt_key.h"
+#include "ma_rt_mbr.h"
+
+/*
+  Working data for one key taking part in a node split: the keys already in
+  the page plus the key being inserted each get one element.
+*/
+typedef struct
+{
+  double square;      /* count_square() of 'coords', set when split starts */
+  int n_node;         /* 0 = not assigned yet, otherwise group 1 or 2 */
+  const uchar *key;   /* the key's bytes (inside the page, or the new key) */
+  double *coords;     /* 2 * n_dim doubles: (min, max) for each dimension */
+} SplitStruct;
+
+/*
+  Hands out the next 2 * n_dim doubles of the buffer *d_buffer points into
+  and moves *d_buffer past them. No bounds checking is done here.
+*/
+inline static double *reserve_coords(double **d_buffer, int n_dim)
+{
+  double *const reserved= d_buffer[0];
+
+  d_buffer[0]= reserved + n_dim * 2;
+  return reserved;
+}
+
+/*
+  Extends rectangle 'a' in place so that it also covers rectangle 'b'.
+  Both are arrays of n_dim (min, max) pairs; n_dim must be at least 1.
+*/
+static void mbr_join(double *a, const double *b, int n_dim)
+{
+  int dims_left= n_dim;
+
+  do
+  {
+    const double b_min= b[0];
+    const double b_max= b[1];
+
+    if (b_min < a[0])
+      a[0]= b_min;
+    if (b_max > a[1])
+      a[1]= b_max;
+
+    a+= 2;
+    b+= 2;
+  } while (--dims_left != 0);
+}
+
+/*
+  Returns the square (product of the extents in all dimensions) of the
+  rectangle that covers both a and b. Neither input is modified.
+  n_dim must be at least 1.
+*/
+static double mbr_join_square(const double *a, const double *b, int n_dim)
+{
+  double product= 1.0;
+  int dims_left= n_dim;
+
+  do
+  {
+    const double lo= (b[0] < a[0]) ? b[0] : a[0];
+    const double hi= (b[1] > a[1]) ? b[1] : a[1];
+
+    product*= hi - lo;
+    a+= 2;
+    b+= 2;
+  } while (--dims_left != 0);
+
+  return product;
+}
+
+/*
+  Returns the square of one rectangle: the product of (max - min) over its
+  n_dim dimensions. n_dim must be at least 1.
+*/
+static double count_square(const double *a, int n_dim)
+{
+  double product= 1.0;
+  int dims_left= n_dim;
+
+  do
+  {
+    product*= a[1] - a[0];
+    a+= 2;
+  } while (--dims_left != 0);
+
+  return product;
+}
+
+/* Copies the 2 * n_dim coordinates of one rectangle from src to dst */
+inline static void copy_coords(double *dst, const double *src, int n_dim)
+{
+  const size_t n_bytes= (size_t) (n_dim * 2) * sizeof(*dst);
+
+  memcpy(dst, src, n_bytes);
+}
+
+/**
+  Select two nodes to collect group upon.
+
+  Every pair of entries is tried; the pair for which the joined square
+  minus both entries' own squares is the largest wins. On ties the first
+  pair found is kept. *seed_a and *seed_b are left untouched when there are
+  fewer than two entries.
+
+  Note that such function uses 'double' arithmetic so may behave differently
+  on different platforms/builds. There are others in this file.
+*/
+static void pick_seeds(SplitStruct *node, int n_entries,
+     SplitStruct **seed_a, SplitStruct **seed_b, int n_dim)
+{
+  double best_waste= -DBL_MAX;
+  int i;
+  int j;
+
+  for (i= 0; i < n_entries - 1; i++)
+  {
+    SplitStruct *first= node + i;
+
+    for (j= i + 1; j < n_entries; j++)
+    {
+      SplitStruct *second= node + j;
+      double waste= mbr_join_square(first->coords, second->coords, n_dim) -
+                    first->square - second->square;
+
+      if (waste > best_waste)
+      {
+        best_waste= waste;
+        *seed_a= first;
+        *seed_b= second;
+      }
+    }
+  }
+}
+
+/*
+  Select next node and group where to add.
+
+  Among the entries not yet assigned to a group (n_node == 0), picks the one
+  for which joining it to g1 versus g2 gives the most different squares.
+  *n_group becomes 2 when joining g1 gives the bigger square, else 1.
+  *choice and *n_group are left untouched if no entry is unassigned.
+*/
+static void pick_next(SplitStruct *node, int n_entries, double *g1, double *g2,
+    SplitStruct **choice, int *n_group, int n_dim)
+{
+  double best= -DBL_MAX;
+  int i;
+
+  for (i= 0; i < n_entries; i++)
+  {
+    SplitStruct *entry= node + i;
+
+    if (entry->n_node == 0)
+    {
+      double diff= mbr_join_square(g1, entry->coords, n_dim) -
+                   mbr_join_square(g2, entry->coords, n_dim);
+      double magnitude= fabs(diff);
+
+      if (magnitude > best)
+      {
+        best= magnitude;
+        *choice= entry;
+        *n_group= (diff > 0) ? 2 : 1;
+      }
+    }
+  }
+}
+
+/*
+  Mark not-in-group entries as n_group: every entry whose n_node is still 0
+  gets n_group, the others are left alone.
+*/
+static void mark_all_entries(SplitStruct *node, int n_entries, int n_group)
+{
+  int i;
+
+  for (i= 0; i < n_entries; i++)
+  {
+    if (node[i].n_node == 0)
+      node[i].n_node= n_group;
+  }
+}
+
+/*
+  Distributes the entries of 'node' over two groups by setting n_node of
+  every entry to 1 or 2.
+
+  Two seed entries are chosen with pick_seeds(); the remaining ones are
+  assigned one at a time with pick_next(), unless one group can not take
+  another key without the other one ending up below min_size, in which case
+  all unassigned entries go to the other group at once.
+
+  Two rectangles of n_dim dimensions are taken from *d_buffer for the
+  groups' MBRs (this happens even when 1 is returned).
+
+  Returns 0 on success, 1 if all_size is too small for two groups of
+  min_size.
+*/
+static int split_maria_rtree_node(SplitStruct *node, int n_entries,
+                                  int all_size, /* Total key's size */
+                                  int key_size,
+                                  int min_size, /* Minimal group size */
+                                  int size1, int size2 /* initial group sizes */,
+                                  double **d_buffer, int n_dim)
+{
+  SplitStruct *seed1;
+  SplitStruct *seed2;
+  SplitStruct *picked;
+  int picked_group;
+  int idx;
+  int remaining;
+  double *mbr1= reserve_coords(d_buffer, n_dim);
+  double *mbr2= reserve_coords(d_buffer, n_dim);
+  LINT_INIT(seed1);
+  LINT_INIT(seed2);
+  LINT_INIT(picked);
+  LINT_INIT(picked_group);
+
+  if (all_size < min_size * 2)
+    return 1;
+
+  /* Nothing is assigned yet; remember every entry's own square */
+  for (idx= 0; idx < n_entries; idx++)
+  {
+    node[idx].square= count_square(node[idx].coords, n_dim);
+    node[idx].n_node= 0;
+  }
+
+  /* Each group starts with one seed entry */
+  pick_seeds(node, n_entries, &seed1, &seed2, n_dim);
+  seed1->n_node= 1;
+  seed2->n_node= 2;
+  copy_coords(mbr1, seed1->coords, n_dim);
+  copy_coords(mbr2, seed2->coords, n_dim);
+  size1+= key_size;
+  size2+= key_size;
+
+  for (remaining= n_entries - 2; remaining > 0; remaining--)
+  {
+    /* Can't write into group 2: the rest goes to group 1 */
+    if (all_size - (size2 + key_size) < min_size)
+    {
+      mark_all_entries(node, n_entries, 1);
+      break;
+    }
+
+    /* Can't write into group 1: the rest goes to group 2 */
+    if (all_size - (size1 + key_size) < min_size)
+    {
+      mark_all_entries(node, n_entries, 2);
+      break;
+    }
+
+    pick_next(node, n_entries, mbr1, mbr2, &picked, &picked_group, n_dim);
+    picked->n_node= picked_group;
+    if (picked_group == 1)
+    {
+      mbr_join(mbr1, picked->coords, n_dim);
+      size1+= key_size;
+    }
+    else
+    {
+      mbr_join(mbr2, picked->coords, n_dim);
+      size2+= key_size;
+    }
+  }
+
+  return 0;
+}
+
+
+/**
+  Logs key reorganization done in a split page (new page is logged elsewhere).
+
+  The effect of a split on the split page is three changes:
+  - some pieces of the page move to different places inside this page (we are
+  not interested here in the pieces which move to the new page)
+  - the key is inserted into the page or not (could be in the new page)
+  - page is shrunk
+  All this is uniquely determined by a few parameters:
+  - the key (starting at 'key-nod_flag', for 'full_length' bytes
+  (maria_rtree_split_page() seems to depend on its parameters key&key_length
+  but in fact it reads more (to the left: nod_flag, and to the right:
+  full_length)
+  - the binary content of the page
+  - some variables in the share
+  - double arithmetic, which is unpredictable from machine to machine and
+  from build to build (see pick_seeds() above: it has a comparison between
+  double-s 'if (d > max_d)' so the comparison can go differently from machine
+  to machine or build to build, it has happened in real life).
+  If one day we use precision-math instead of double-math, in GIS, then the
+  last parameter would become constant across machines and builds and we
+  could do some cheap logging: just log the few parameters above.
+  Until then, we log the list of memcpy() operations (fortunately, we often do
+  not have to log the source bytes, as they can be found in the page before
+  applying the REDO; the only source bytes to log are the key), the key if it
+  was inserted into this page, and the shrinking.
+
+  @param  page             the split page (post-split); its table, share and
+                           position in the file are taken from it
+  @param  key_with_nod_flag pointer to key-nod_flag
+  @param  full_length      length of (key + (nod_flag (if node) or rowid (if
+                           leaf)))
+  @param  log_internal_copy encoded list of memcpy() operations done on
+                           split page, having their source in the page
+  @param  log_internal_copy_length length of above list, in bytes
+  @param  log_key_copy     operation describing the key's copy, or NULL if the
+                           inserted key was not put into the page (was put in
+                           new page, so does not have to be logged here)
+  @param  length_diff      by how much the page has shrunk during split
+
+  @return 0 on success, 1 if translog_write_record() failed
+*/
+
+static my_bool _ma_log_rt_split(MARIA_PAGE *page,
+                                const uchar *key_with_nod_flag,
+                                uint full_length,
+                                const uchar *log_internal_copy,
+                                uint log_internal_copy_length,
+                                const uchar *log_key_copy,
+                                uint length_diff)
+{
+  MARIA_HA    *info=  page->info;
+  MARIA_SHARE *share= info->s;
+  LSN lsn;
+  /*
+    Fixed part of the record: file id, page number, KEY_OP_DEL_SUFFIX with a
+    2-byte length and KEY_OP_MULTI_COPY with two 2-byte lengths. The last 7
+    bytes are not part of log_array[0]; log_pos points at them when it is
+    handed to _ma_log_key_changes() (presumably as room for what that
+    function appends - confirm there).
+  */
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 1 + 2 + 1 + 2 + 2 + 7],
+    *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+  uint translog_parts, extra_length= 0;
+  my_off_t page_pos;
+  DBUG_ENTER("_ma_log_rt_split");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+  DBUG_ASSERT(share->now_transactional);
+  page_pos= page->pos / share->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page_pos);
+  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+  /* The page shrinks by length_diff bytes */
+  log_pos[0]= KEY_OP_DEL_SUFFIX;
+  log_pos++;
+  DBUG_ASSERT((int)length_diff > 0);
+  int2store(log_pos, length_diff);
+  log_pos+= 2;
+  /* Copies inside the page: element length, then length of the list */
+  log_pos[0]= KEY_OP_MULTI_COPY;
+  log_pos++;
+  int2store(log_pos, full_length);
+  log_pos+= 2;
+  int2store(log_pos, log_internal_copy_length);
+  log_pos+= 2;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data) - 7;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    log_internal_copy;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= log_internal_copy_length;
+  translog_parts= 2;
+  if (log_key_copy != NULL) /* need to store key into record */
+  {
+    /* 1 + 2 + 1 + 2 bytes of operation header, followed by the key itself */
+    log_array[TRANSLOG_INTERNAL_PARTS + 2].str=    log_key_copy;
+    log_array[TRANSLOG_INTERNAL_PARTS + 2].length= 1 + 2 + 1 + 2;
+    log_array[TRANSLOG_INTERNAL_PARTS + 3].str=    key_with_nod_flag;
+    log_array[TRANSLOG_INTERNAL_PARTS + 3].length= full_length;
+    extra_length= 1 + 2 + 1 + 2 + full_length;
+    translog_parts+= 2;
+  }
+
+  _ma_log_key_changes(page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  page->org_size= page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            (translog_size_t) ((log_pos - log_data) +
+                                               log_internal_copy_length +
+                                               extra_length),
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+/**
+  Splits a full R-tree page in two while inserting 'key'.
+
+  The keys of the page plus the new key are distributed over two groups by
+  split_maria_rtree_node(). Group 1 is compacted into 'page' (page->size is
+  updated), group 2 is written to a newly allocated page. For transactional
+  tables the change to 'page' and the new page are logged.
+
+  @param  key            key being inserted; nod_flag bytes before key->data
+                         and the bytes after the key data, up to full_length,
+                         are read too
+  @param  page           page to split; shortened in place
+  @param  new_page_offs  OUT: position of the new page, as returned by
+                         _ma_new()
+
+  @return
+   0 ok; the created page is put into page cache; the shortened one is not (up
+   to the caller to do it)
+   1 or -1: error.
+
+  @note This comment used to say that new_page_offs==NULL means "don't
+  create the new page (for redo phase)", but the code dereferences
+  new_page_offs unconditionally, so a valid pointer is required.
+*/
+
+int maria_rtree_split_page(const MARIA_KEY *key, MARIA_PAGE *page,
+                           my_off_t *new_page_offs)
+{
+  MARIA_HA   *info= page->info;
+  MARIA_SHARE *share= info->s;
+  const my_bool transactional= share->now_transactional;
+  int n1, n2; /* Number of items in groups */
+  SplitStruct *task;
+  SplitStruct *cur;
+  SplitStruct *stop;
+  double *coord_buf;
+  double *next_coord;
+  double *old_coord;
+  int n_dim;
+  uchar *source_cur, *cur1, *cur2;
+  uchar *new_page_buff, *log_internal_copy, *log_internal_copy_ptr,
+    *log_key_copy= NULL;
+  int err_code= 0;
+  uint new_page_length;
+  uint nod_flag= page->node;
+  uint org_length= page->size;
+  uint full_length= key->data_length + (nod_flag ? nod_flag :
+                                        key->ref_length);
+  uint key_data_length= key->data_length;
+  int max_keys= ((org_length - share->keypage_header) / (full_length));
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  DBUG_ENTER("maria_rtree_split_page");
+  DBUG_PRINT("rtree", ("splitting block"));
+
+  n_dim= keyinfo->keysegs / 2;
+
+  /*
+    One buffer holds the coordinates (one rectangle per key, one for the new
+    key and 4 spare ones) followed by the SplitStruct array.
+  */
+  if (!(coord_buf= (double*) my_alloca(n_dim * 2 * sizeof(double) *
+                                       (max_keys + 1 + 4) +
+                                       sizeof(SplitStruct) * (max_keys + 1))))
+    DBUG_RETURN(-1); /* purecov: inspected */
+
+  task= (SplitStruct *)(coord_buf + n_dim * 2 * (max_keys + 1 + 4));
+
+  next_coord= coord_buf;
+
+  stop= task + max_keys;
+  source_cur= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+
+  /* One task entry for every key currently in the page */
+  for (cur= task;
+       cur < stop;
+       cur++, source_cur= rt_PAGE_NEXT_KEY(share, source_cur, key_data_length,
+                                           nod_flag))
+  {
+    cur->coords= reserve_coords(&next_coord, n_dim);
+    cur->key= source_cur;
+    maria_rtree_d_mbr(keyinfo->seg, source_cur, key_data_length, cur->coords);
+  }
+
+  /* The last task entry is the key being inserted */
+  cur->coords= reserve_coords(&next_coord, n_dim);
+  maria_rtree_d_mbr(keyinfo->seg, key->data, key_data_length, cur->coords);
+  cur->key= key->data;
+
+  old_coord= next_coord;
+
+  if (split_maria_rtree_node(task, max_keys + 1,
+                             page->size + full_length + 2,
+                             full_length,
+       rt_PAGE_MIN_SIZE(keyinfo->block_length),
+       2, 2, &next_coord, n_dim))
+  {
+    err_code= 1;
+    goto split_err;
+  }
+
+  /* Allocate buffer for new page and piece of log record */
+  if (!(new_page_buff= (uchar*) my_alloca((uint)keyinfo->block_length +
+                                          (transactional ?
+                                           (max_keys * (2 + 2) +
+                                            1 + 2 + 1 + 2) : 0))))
+  {
+    err_code= -1;
+    goto split_err;
+  }
+  log_internal_copy= log_internal_copy_ptr= new_page_buff +
+    keyinfo->block_length;
+  bzero(new_page_buff, share->block_size);
+
+  stop= task + (max_keys + 1);
+  cur1= rt_PAGE_FIRST_KEY(share, page->buff, nod_flag);
+  cur2= rt_PAGE_FIRST_KEY(share, new_page_buff, nod_flag);
+
+  /* Group 1 is packed into the old page, group 2 into the new one */
+  n1= n2= 0;
+  for (cur= task; cur < stop; cur++)
+  {
+    uchar *to;
+    const uchar *cur_key= cur->key;
+    my_bool log_this_change;
+    DBUG_ASSERT(log_key_copy == NULL);
+    if (cur->n_node == 1)
+    {
+      to= cur1;
+      cur1= rt_PAGE_NEXT_KEY(share, cur1, key_data_length, nod_flag);
+      n1++;
+      log_this_change= transactional;
+    }
+    else
+    {
+      to= cur2;
+      cur2= rt_PAGE_NEXT_KEY(share, cur2, key_data_length, nod_flag);
+      n2++;
+      log_this_change= FALSE;
+    }
+    if (to != cur_key)
+    {
+      uchar *to_with_nod_flag= to - nod_flag;
+      const uchar *cur_key_with_nod_flag= cur_key - nod_flag;
+      memcpy(to_with_nod_flag, cur_key_with_nod_flag, full_length);
+      if (log_this_change)
+      {
+        uint to_with_nod_flag_offs= to_with_nod_flag - page->buff;
+        if (likely(cur_key != key->data))
+        {
+          /* this memcpy() is internal to the page (source in the page) */
+          uint cur_key_with_nod_flag_offs= cur_key_with_nod_flag - page->buff;
+          int2store(log_internal_copy_ptr, to_with_nod_flag_offs);
+          log_internal_copy_ptr+= 2;
+          int2store(log_internal_copy_ptr, cur_key_with_nod_flag_offs);
+          log_internal_copy_ptr+= 2;
+        }
+        else
+        {
+          /* last iteration, and this involves *key: source is external */
+          log_key_copy= log_internal_copy_ptr;
+          log_key_copy[0]= KEY_OP_OFFSET;
+          int2store(log_key_copy + 1, to_with_nod_flag_offs);
+          log_key_copy[3]= KEY_OP_CHANGE;
+          int2store(log_key_copy + 4, full_length);
+          /* _ma_log_rt_split() will store *key, right after */
+        }
+      }
+    }
+  }
+  { /* verify that above loop didn't touch header bytes */
+    uint i;
+    for (i= 0; i < share->keypage_header; i++)
+      DBUG_ASSERT(new_page_buff[i]==0);
+  }
+
+  if (nod_flag)
+    _ma_store_keypage_flag(share, new_page_buff, KEYPAGE_FLAG_ISNOD);
+  _ma_store_keynr(share, new_page_buff, keyinfo->key_nr);
+  new_page_length= share->keypage_header + n2 * full_length;
+  _ma_store_page_used(share, new_page_buff, new_page_length);
+  page->size= share->keypage_header + n1 * full_length;
+  page_store_size(share, page);
+
+  if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) ==
+      HA_OFFSET_ERROR)
+    err_code= -1;
+  else
+  {
+    MARIA_PAGE new_page;
+    _ma_page_setup(&new_page, info, keyinfo, *new_page_offs, new_page_buff);
+
+    if (transactional &&
+        ( /* log change to split page */
+         _ma_log_rt_split(page, key->data - nod_flag,
+                          full_length, log_internal_copy,
+                          log_internal_copy_ptr - log_internal_copy,
+                          log_key_copy, org_length - page->size) ||
+         /* and to new page */
+         _ma_log_new(&new_page, 0)))
+      err_code= -1;
+
+    if (_ma_write_keypage(&new_page, page_link->write_lock,
+                          DFLT_INIT_HITS))
+      err_code= -1;
+  }
+  DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs));
+
+  /*
+    Release the buffer obtained with my_alloca() above. This must name the
+    buffer itself: 'new_page' is a MARIA_PAGE that only exists inside the
+    else branch above.
+  */
+  my_afree(new_page_buff);
+
+split_err:
+  my_afree(coord_buf);
+  DBUG_RETURN(err_code);
+}
+
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_rt_test.c b/storage/maria/ma_rt_test.c
new file mode 100644
index 00000000000..af54e6b27be
--- /dev/null
+++ b/storage/maria/ma_rt_test.c
@@ -0,0 +1,692 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Testing of the basic functions of a MARIA rtree table         */
+/* Written by Alex Barkov who has a shared copyright to this code */
+
+
+#include "maria_def.h"
+#include "ma_control_file.h"
+#include "ma_loghandler.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+#include <my_getopt.h>
+
+#ifdef HAVE_RTREE_KEYS
+
+#include "ma_rt_index.h"
+
+#define MAX_REC_LENGTH 1024     /* Size of the row buffers used by the test */
+#define ndims 2                 /* Dimensions; each row holds 2*ndims doubles */
+#define KEYALG HA_KEY_ALG_RTREE /* Algorithm of the single test key */
+/* Forward declarations of the file-local functions */
+static int read_with_pos(MARIA_HA * file);
+static void create_record(uchar *record,uint rownr);
+static void create_record1(uchar *record,uint rownr);
+static void print_record(uchar * record,my_off_t offs,const char * tail);
+static  int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void usage();
+
+static double rt_data[]=  /* Test rows, 4 doubles each; see create_record() */
+{
+  /*1*/  0,10,0,10,
+  /*2*/  5,15,0,10,
+  /*3*/  0,10,5,15,
+  /*4*/  10,20,10,20,
+  /*5*/  0,10,0,10,
+  /*6*/  5,15,0,10,
+  /*7*/  0,10,5,15,
+  /*8*/  10,20,10,20,
+  /*9*/  0,10,0,10,
+  /*10*/  5,15,0,10,
+  /*11*/  0,10,5,15,
+  /*12*/  10,20,10,20,
+  /*13*/  0,10,0,10,
+  /*14*/  5,15,0,10,
+  /*15*/  0,10,5,15,
+  /*16*/  10,20,10,20,
+  /*17*/  5,15,0,10,
+  /*18*/  0,10,5,15,
+  /*19*/  10,20,10,20,
+  /*20*/  0,10,0,10,
+
+  /*1*/  100,110,0,10,
+  /*2*/  105,115,0,10,
+  /*3*/  100,110,5,15,
+  /*4*/  110,120,10,20,
+  /*5*/  100,110,0,10,
+  /*6*/  105,115,0,10,
+  /*7*/  100,110,5,15,
+  /*8*/  110,120,10,20,
+  /*9*/  100,110,0,10,
+  /*10*/  105,115,0,10,
+  /*11*/  100,110,5,15,
+  /*12*/  110,120,10,20,
+  /*13*/  100,110,0,10,
+  /*14*/  105,115,0,10,
+  /*15*/  100,110,5,15,
+  /*16*/  110,120,10,20,
+  /*17*/  105,115,0,10,
+  /*18*/  100,110,5,15,
+  /*19*/  110,120,10,20,
+  /*20*/  100,110,0,10,
+  -1   /* Extra element; run_test()'s sizeof division does not count it */
+};
+
+static int testflag, checkpoint, create_flag;
+static int die_in_middle_of_transaction; /* GET_INT target: must be an int */
+static my_bool silent, transactional, opt_versioning;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+/* Parse options, start the Maria subsystems and run the test on "rt_test" */
+int main(int argc, char *argv[])
+{
+  MY_INIT(argv[0]);
+  get_options(argc, argv);
+  maria_data_root= (char *)".";
+  /* Maria requires that we always have a page cache */
+  if (maria_init() ||
+      (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
+                      maria_block_size, MY_WME) == 0) ||
+      ma_control_file_open(TRUE, TRUE) ||
+      (init_pagecache(maria_log_pagecache,
+                      TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                      TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+                    0, 0, maria_log_pagecache,
+                    TRANSLOG_DEFAULT_FLAGS, 0) ||
+      (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
+  {
+    fprintf(stderr, "Error in initialization\n");
+    exit(1);
+  }
+
+  exit(run_test("rt_test"));    /* run_test() returns 0 if ok, 1 on error */
+}
+
+/* Run the R-tree test on table "filename"; returns 0 if ok, 1 on error */
+static int run_test(const char *filename)
+{
+  MARIA_HA        *file;
+  MARIA_UNIQUEDEF   uniquedef;
+  MARIA_CREATE_INFO create_info;
+  MARIA_COLUMNDEF   recinfo[20];
+  MARIA_KEYDEF      keyinfo[20];
+  HA_KEYSEG      keyseg[20];
+  key_range	range;
+
+  int opt_unique=0;
+  int key_type=HA_KEYTYPE_DOUBLE;
+  int key_length=8;
+  int null_fields=0;
+  int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 40 */
+  int rec_length=0;
+  int uniques=0;
+  int i, max_i;
+  int error;
+  int row_count=0;
+  uchar record[MAX_REC_LENGTH];
+  uchar read_record[MAX_REC_LENGTH];
+  int upd= 10;
+  ha_rows hrows;
+
+  bzero(&uniquedef, sizeof(uniquedef));
+  bzero(&create_info, sizeof(create_info));
+  bzero(recinfo, sizeof(recinfo));
+  bzero(keyinfo, sizeof(keyinfo));
+  bzero(keyseg, sizeof(keyseg));
+
+  /* Define a column for NULLs and DEL markers*/
+
+  recinfo[0].type=FIELD_NORMAL;
+  recinfo[0].length=1; /* For NULL bits */
+  rec_length=1;
+
+  /* Define 2*ndims columns for coordinates*/
+
+  for (i=1; i<=2*ndims ;i++)
+  {
+    recinfo[i].type=FIELD_NORMAL;
+    recinfo[i].length=key_length;
+    rec_length+=key_length;
+  }
+
+  /* Define a key with 2*ndims segments */
+
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=2*ndims;
+  keyinfo[0].flag=0;
+  keyinfo[0].key_alg=KEYALG;
+
+  for (i=0; i<2*ndims; i++)
+  {
+    keyinfo[0].seg[i].type= key_type;
+    keyinfo[0].seg[i].flag=0;          /* Things like HA_REVERSE_SORT */
+    keyinfo[0].seg[i].start= (key_length*i)+1;
+    keyinfo[0].seg[i].length=key_length;
+    keyinfo[0].seg[i].null_bit= null_fields ? 2 : 0;
+    keyinfo[0].seg[i].null_pos=0;
+    keyinfo[0].seg[i].language=default_charset_info->number;
+  }
+
+  if (!silent)
+    printf("- Creating isam-file\n");
+
+  create_info.max_rows=10000000;
+  create_info.transactional= transactional;
+
+  if (maria_create(filename,
+                   record_type,
+                   1,            /*  keys   */
+                   keyinfo,
+                   1+2*ndims+opt_unique, /* columns */
+                   recinfo,uniques,&uniquedef,&create_info,create_flag))
+    goto err;
+
+  if (!silent)
+    printf("- Open isam-file\n");
+
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  maria_begin(file);
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  if (testflag == 1)
+    goto end;
+  if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  for (i=0; i<nrecords; i++ )
+  {
+    create_record(record,i);
+    error=maria_write(file,record);
+    print_record(record,maria_position(file),"\n");
+    if (!error)
+    {
+      row_count++;
+    }
+    else
+    {
+      fprintf(stderr, "maria_write: %d\n", error);
+      goto err;
+    }
+  }
+
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+  if ((error=read_with_pos(file)))
+    goto err;
+  maria_scan_end(file);
+
+  if (!silent)
+    printf("- Reading rows with key\n");
+
+  for (i=0 ; i < nrecords ; i++)
+  {
+    my_errno=0;
+    create_record(record,i);
+
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,HA_READ_MBR_EQUAL);
+
+    if (error && error!=HA_ERR_KEY_NOT_FOUND)
+    {
+      fprintf(stderr,"     maria_rkey: %3d  errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    if (error == HA_ERR_KEY_NOT_FOUND)
+    {
+      print_record(record,maria_position(file),"  NOT FOUND\n");
+      continue;
+    }
+    print_record(read_record,maria_position(file),"\n");
+  }
+
+  if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 2)
+    goto end;
+
+  if (!silent)
+    printf("- Deleting rows\n");
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+
+  for (i=0; i < nrecords/4; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_scan(file,read_record);
+    if (error)
+    {
+      fprintf(stderr, "pos: %2d  maria_rrnd: %3d  errno: %3d\n", i, error,
+              my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"\n");
+
+    error=maria_delete(file,read_record);
+    if (error)
+    {
+      fprintf(stderr, "pos: %2d maria_delete: %3d errno: %3d\n", i, error,
+              my_errno);
+      goto err;
+    }
+  }
+  maria_scan_end(file);
+
+  if (testflag == 3)
+    goto end;
+  if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (!silent)
+    printf("- Updating rows with position\n");
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+
+  /* We are looking for nrecords-nrecords/2 non-deleted records */
+  for (i=0, max_i= nrecords - nrecords/2; i < max_i ; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_scan(file,read_record);
+    if (error)
+    {
+      if (error==HA_ERR_RECORD_DELETED)
+      {
+        if (!silent)
+          printf("found deleted record\n");
+        /*
+          In BLOCK_RECORD format, maria_scan() never returns deleted records,
+          while in DYNAMIC format it can. Don't count such record:
+        */
+        max_i++;
+        continue;
+      }
+      fprintf(stderr, "pos: %2d  maria_rrnd: %3d  errno: %3d\n",i , error,
+              my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"");
+    create_record1(record,i+nrecords*upd);
+    if (!silent)
+      printf("\t-> ");
+    print_record(record,maria_position(file),"\n");
+    error=maria_update(file,read_record,record);
+    if (error)
+    {
+      fprintf(stderr, "pos: %2d  maria_update: %3d  errno: %3d\n",i, error,
+              my_errno);
+      goto err;
+    }
+  }
+  maria_scan_end(file);         /* Close this scan like the two scans above */
+  if (testflag == 4)
+    goto end;
+  if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+  if ((error=read_with_pos(file)))
+    goto err;
+  maria_scan_end(file);
+
+  if (!silent)
+    printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+  create_record(record, nrecords*4/5);
+  print_record(record,0,"  search for\n");
+
+  if ((error=maria_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,
+                        HA_READ_MBR_INTERSECT)))
+  {
+    fprintf(stderr, "maria_rkey: %3d  errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  print_record(read_record,maria_position(file),"  maria_rkey\n");
+  row_count=1;
+
+  for (;;)
+  {
+    if ((error=maria_rnext_same(file,read_record)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      fprintf(stderr, "maria_next: %3d  errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"  maria_rnext_same\n");
+      row_count++;
+  }
+  if (!silent)
+    printf("     %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+  error=maria_rfirst(file,read_record,0);
+  if (error)
+  {
+    fprintf(stderr, "maria_rfirst: %3d  errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  row_count=1;
+  print_record(read_record,maria_position(file),"  maria_frirst\n");
+
+  for (i=0;i<nrecords;i++)
+  {
+    if ((error=maria_rnext(file,read_record,0)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      fprintf(stderr, "maria_next: %3d  errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"  maria_rnext\n");
+    row_count++;
+  }
+  if (!silent)
+    printf("     %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_records_in_range()\n");
+
+  create_record1(record, nrecords*4/5);
+  print_record(record,0,"\n");
+
+  range.key= record+1;
+  range.length= 1000;                           /* Big enough */
+  range.flag= HA_READ_MBR_INTERSECT;
+  hrows= maria_records_in_range(file,0, &range, (key_range*) 0);
+  if (!silent)
+    printf("     %ld rows\n", (long) hrows);
+
+end:
+  maria_scan_end(file);
+  if (die_in_middle_of_transaction)
+  {
+    /* see similar code in ma_test2.c for comments */
+    switch (die_in_middle_of_transaction) {
+    case 1:
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                            FLUSH_RELEASE, FLUSH_RELEASE);
+      break;
+    case 2:
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    case 3:
+      break;
+    case 4:
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+                            FLUSH_RELEASE);
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    }
+    if (!silent)
+      printf("Dying on request without maria_commit()/maria_close()\n");
+    exit(0);
+  }
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return 0;
+
+err:
+  fprintf(stderr, "got error: %3d when using maria-database\n",my_errno);
+  return 1;           /* skip warning */
+}
+
+
+/* Print every row maria_scan() returns until end of file; 0 or scan error */
+static int read_with_pos (MARIA_HA * file)
+{
+  uchar row_buff[MAX_REC_LENGTH];
+  int scan_res, pos;
+
+  if (!silent)
+    printf("- Reading rows with position\n");
+  for (pos= 0; ; pos++)
+  {
+    my_errno= 0;
+    bzero((char*) row_buff, MAX_REC_LENGTH);
+    scan_res= maria_scan(file, row_buff);
+    if (!scan_res)
+    {
+      print_record(row_buff, maria_position(file), "\n");
+      continue;
+    }
+    if (scan_res == HA_ERR_END_OF_FILE)
+      break;
+    if (scan_res == HA_ERR_RECORD_DELETED)
+      continue;                                 /* Skip rows marked deleted */
+    fprintf(stderr, "pos: %2d  maria_rrnd: %3d  errno: %3d\n", pos, scan_res,
+            my_errno);
+    return scan_res;
+  }
+  return 0;
+}
+
+/* Compiled out: prints record[0] and the following 32 bytes as hex, then tail */
+#ifdef NOT_USED
+static void bprint_record(char * record,
+			  my_off_t offs __attribute__((unused)),
+			  const char * tail)
+{
+  int i;
+  char * pos;
+  if (silent)
+    return;
+  i=(unsigned char)record[0];
+  printf("%02X ",i);
+
+  for( pos=record+1, i=0; i<32; i++,pos++){
+    int b=(unsigned char)*pos;
+    printf("%02X",b);
+  }
+  printf("%s",tail);
+}
+#endif
+
+/* Print the marker byte, the 2*ndims coordinates, the row position and tail */
+static void print_record(uchar *record,
+			 my_off_t offs __attribute__((unused)),
+			 const char * tail)
+{
+  int i;
+  uchar *pos;
+  double c;
+
+  if (silent)
+    return;
+  printf("     rec=(%d)",(unsigned char)record[0]);
+  for (pos= record+1, i= 0; i < 2*ndims; i++)
+  {
+    /* float8get() alone decodes the stored double; no raw memcpy is needed */
+    float8get(c,pos);
+    printf(" %.14g ",c);
+    pos+= sizeof(c);
+  }
+  printf("pos=%ld",(long int)offs);
+  printf("%s",tail);
+}
+
+
+/* Build a row whose 2*ndims coordinates all have the value rownr+10 */
+static void create_record1(uchar *record, uint rownr)
+{
+   int i;
+   uchar *pos;
+   double c=rownr+10;
+
+   bzero((char*) record,MAX_REC_LENGTH);
+   record[0]=0x01; /* DEL marker */
+
+   for ( pos=record+1, i=0; i<2*ndims; i++)
+   {
+      /* float8store() writes the whole double itself; no raw memcpy needed */
+      float8store(pos,c);
+      pos+=sizeof(c);
+   }
+}
+
+#ifdef NOT_USED
+/* Compiled out: build a row of ndims coordinate pairs (0, rownr+10) */
+static void create_record0(char *record,uint rownr)
+{
+   int i;
+   char * pos;
+   double c=rownr+10;
+   double c0=0;
+
+   bzero((char*) record,MAX_REC_LENGTH);
+   record[0]=0x01; /* DEL marker */
+
+   for ( pos=record+1, i=0; i<ndims; i++)
+   {
+      /* float8store() writes the whole double itself; no raw memcpy needed */
+      float8store(pos,c0);
+      pos+=sizeof(c0);
+      /* Second value of the pair */
+      float8store(pos,c);
+      pos+=sizeof(c);
+   }
+}
+
+#endif
+/* Build test row number rownr from the doubles at rt_data[rownr*4] onwards */
+static void create_record(uchar *record, uint rownr)
+{
+   double *coords= &rt_data[rownr * 4];
+   uchar *dst= record + 1;
+   int seg;
+
+   record[0]= 0x01; /* DEL marker */
+   for (seg= 0; seg < ndims*2; seg++, dst+= 8)
+   {
+      float8store(dst, coords[seg]);
+   }
+}
+
+/* Options; those without a variable are acted on in get_one_option() */
+static struct my_option my_long_options[] =
+{
+  {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint,
+   (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"checksum", 'c', "Undocumented",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Undocumented",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"help", '?', "Display help and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"row-fixed-size", 'S', "Fixed size records",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"rows-in-block", 'M', "Store rows in block format",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Undocumented",
+   (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0,
+   0, 0},
+  {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag,
+   (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test-undo", 'A',
+   "Abort hard. Used for testing recovery with undo",
+   (uchar**) &die_in_middle_of_transaction,
+   (uchar**) &die_in_middle_of_transaction,
+   0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"transactional", 'T',
+   "Test in transactional mode. (Only works with block format)",
+   (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"versioning", 'C', "Use row versioning (only works with block format)",
+   (uchar**) &opt_versioning,  (uchar**) &opt_versioning, 0, GET_BOOL,
+   NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+/* handle_options() callback: act on the options that have no variable */
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument __attribute__((unused)))
+{
+  if (optid == 'c')
+    create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
+  else if (optid == 'M')
+    record_type= BLOCK_RECORD;
+  else if (optid == 'S')
+    record_type= STATIC_RECORD;
+  else if (optid == '#')
+  {
+    DBUG_PUSH(argument);
+  }
+  else if (optid == '?')
+  {
+    /* Help requested: print it and stop */
+    usage();
+    exit(1);
+  }
+
+  return 0;
+}
+
+
+/* Parse the command line; exits with handle_options()'s code on failure */
+
+static void get_options(int argc, char *argv[])
+{
+  int parse_error= handle_options(&argc, &argv, my_long_options,
+                                  get_one_option);
+
+  if (parse_error)
+    exit(parse_error);
+  return;
+} /* get options */
+
+/* Print a usage line, then my_print_help() and my_print_variables() output */
+static void usage()
+{
+  printf("Usage: %s [options]\n\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
+
+#else /* !HAVE_RTREE_KEYS: nothing to test, exit with status 0 */
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+  exit(0);
+}
+#endif /*HAVE_RTREE_KEYS*/
diff --git a/storage/maria/ma_scan.c b/storage/maria/ma_scan.c
new file mode 100644
index 00000000000..cbac463a2c8
--- /dev/null
+++ b/storage/maria/ma_scan.c
@@ -0,0 +1,74 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Read through all rows sequentially */
+
+#include "maria_def.h"
+/* Prepare info for a scan with maria_scan(); returns 0 or my_errno */
+int maria_scan_init(register MARIA_HA *info)
+{
+  int failed;
+  DBUG_ENTER("maria_scan_init");
+
+  info->lastinx= -1;                            /* Can't forward or backward */
+  info->cur_row.nextpos= info->s->pack.header_length;   /* Read first record */
+
+  failed= (((info->opt_flag & WRITE_CACHE_USED) &&
+            flush_io_cache(&info->rec_cache)) ||
+           (*info->s->scan_init)(info));
+  DBUG_RETURN(failed ? my_errno : 0);
+}
+
+/*
+  Read the next row of a scan, starting at info->cur_row.nextpos.
+
+  SYNOPSIS
+    maria_scan()
+    info		Maria handler
+    record		Read data here
+
+  RETURN
+    0  			   ok
+    HA_ERR_END_OF_FILE     End of file
+    HA_ERR_RECORD_DELETED  Record was deleted (depends on the record format)
+    #			   Error code
+*/
+
+int maria_scan(MARIA_HA *info, uchar *record)
+{
+  DBUG_ENTER("maria_scan");
+  /* Keep only the two "changed" state flags; clear all other update flags */
+  info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+  DBUG_RETURN((*info->s->scan)(info, record, info->cur_row.nextpos, 1));
+}
+
+/* End a scan by calling the scan_end() function stored in the share */
+void maria_scan_end(MARIA_HA *info)
+{
+  (*info->s->scan_end)(info);
+}
+
+/* Copy info->cur_row.lastpos to *lastpos; always returns 0 */
+int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos)
+{
+  *lastpos= info->cur_row.lastpos;
+  return 0;
+}
+
+/* Set cur_row.nextpos, which maria_scan() reads from, to lastpos */
+void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos)
+{
+  info->cur_row.nextpos= lastpos;
+}
diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c
new file mode 100644
index 00000000000..9f1e8e2554b
--- /dev/null
+++ b/storage/maria/ma_search.c
@@ -0,0 +1,2397 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* key handling functions */
+
+#include "ma_fulltext.h"
+#include "m_ctype.h"
+/* Forward declarations of file-local helpers */
+static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
+                              uint32 nextflag, register my_off_t pos,
+                              MARIA_PINNED_PAGE **res_page_link,
+                              uchar **res_page_buff);
+static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page,
+                                uchar *keypos);
+
+
+/* Validate inx and make it the current index; returns inx, or -1 on error */
+
+int _ma_check_index(MARIA_HA *info, int inx)
+{
+  if (inx >= 0 && maria_is_key_active(info->s->state.key_map, inx))
+  {
+    if (inx != info->lastinx)                   /* Index changed */
+    {
+      info->lastinx= inx;
+      info->page_changed= 1;
+      info->update= ((info->update &
+                      (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED)) |
+                     HA_STATE_NEXT_FOUND | HA_STATE_PREV_FOUND);
+    }
+    return ((info->opt_flag & WRITE_CACHE_USED) &&
+            flush_io_cache(&info->rec_cache)) ? -1 : inx;
+  }
+  my_errno= HA_ERR_WRONG_INDEX;                 /* Negative or inactive index */
+  return -1;
+} /* _ma_check_index */
+
+
+/**
+   @brief Search after row by a key, optionally saving the found key page
+
+   @note
+     Position to row is stored in info->cur_row.lastpos
+
+   @return
+   @retval  0   ok (key found)
+   @retval -1   Not found
+   @retval  1   If one should continue search on higher level
+*/
+
+int _ma_search(register MARIA_HA *info, MARIA_KEY *key, uint32 nextflag,
+               my_off_t pos)
+{
+  int res;
+  MARIA_PINNED_PAGE *page_link;
+  uchar *page_buff;
+
+  info->page_changed= 1;                        /* If page not saved */
+  res= _ma_search_no_save(info, key, nextflag, pos, &page_link, &page_buff);
+  if (res == 0 && (nextflag & SEARCH_SAVE_BUFF))
+  {
+    /* Copy the found page into the handler's own key read buffer */
+    bmove512(info->keyread_buff, page_buff, info->s->block_size);
+
+    /* Save position for a possible read next / previous */
+    info->int_keypos= info->keyread_buff + info->keypos_offset;
+    info->int_maxpos= info->keyread_buff + info->maxpos_offset;
+    info->int_keytree_version= key->keyinfo->version;
+    info->last_search_keypage= info->last_keypage;
+    info->page_changed= 0;
+    info->keyread_buff_used= 0;
+  }
+
+  /* Done on success and on failure alike */
+  _ma_unpin_all_pages(info, LSN_IMPOSSIBLE);
+  return res;
+}
+
+/**
+   @brief Search after row by a key, recursing down from the page at 'pos'
+
+   res_page_link, res_page_buff   Out: the page where the key was found
+
+   @note
+     Position to row is stored in info->cur_row.lastpos
+
+   @return
+   @retval  0   ok (key found)
+   @retval -1   Not found
+   @retval  1   If one should continue search on higher level
+*/
+
+static int _ma_search_no_save(register MARIA_HA *info, MARIA_KEY *key,
+                              uint32 nextflag, register my_off_t pos,
+                              MARIA_PINNED_PAGE **res_page_link,
+                              uchar **res_page_buff)
+{
+  my_bool last_key_not_used;
+  int error,flag;
+  uint page_flag, nod_flag, used_length;
+  uchar *keypos,*maxpos;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  MARIA_PINNED_PAGE *page_link;
+  DBUG_ENTER("_ma_search");
+  DBUG_PRINT("enter",("page: %lu  nextflag: %u  lastpos: %lu",
+                      (ulong) (pos / info->s->block_size),
+                      nextflag, (ulong) info->cur_row.lastpos));
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+  if (pos == HA_OFFSET_ERROR)
+  {
+    my_errno=HA_ERR_KEY_NOT_FOUND;                      /* Didn't find key */
+    info->cur_row.lastpos= HA_OFFSET_ERROR;
+    if (!(nextflag & (SEARCH_SMALLER | SEARCH_BIGGER | SEARCH_LAST)))
+      DBUG_RETURN(-1);                          /* Not found ; return error */
+    DBUG_RETURN(1);                             /* Search at upper levels */
+  }
+
+  if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+                        PAGECACHE_LOCK_READ, DFLT_INIT_HITS, 0, 0))
+    goto err;
+  page_link= dynamic_element(&info->pinned_pages,
+                             info->pinned_pages.elements-1,
+                             MARIA_PINNED_PAGE*);
+  DBUG_DUMP("page", page.buff, page.size);
+
+  flag= (*keyinfo->bin_search)(key, &page, nextflag, &keypos, lastkey,
+                               &last_key_not_used);
+  if (flag == MARIA_FOUND_WRONG_KEY)
+    goto err;                           /* Bad key on page: reset like errors */
+  page_flag=   page.flag;
+  used_length= page.size;
+  nod_flag=    page.node;
+  maxpos=      page.buff + used_length -1;
+
+  if (flag)
+  {
+    if ((error= _ma_search_no_save(info, key, nextflag,
+                                   _ma_kpos(nod_flag,keypos),
+                                   res_page_link, res_page_buff)) <= 0)
+      DBUG_RETURN(error);
+
+    if (flag >0)
+    {
+      if (nextflag & (SEARCH_SMALLER | SEARCH_LAST) &&
+          keypos == page.buff + info->s->keypage_header + nod_flag)
+        DBUG_RETURN(1);                                 /* Bigger than key */
+    }
+    else if (nextflag & SEARCH_BIGGER && keypos >= maxpos)
+      DBUG_RETURN(1);                                   /* Smaller than key */
+  }
+  else
+  {
+    /* Found matching key */
+    if ((nextflag & SEARCH_FIND) && nod_flag &&
+	((keyinfo->flag & (HA_NOSAME | HA_NULL_PART)) != HA_NOSAME ||
+	 (key->flag & SEARCH_PART_KEY) || info->s->base.born_transactional))
+    {
+      if ((error= _ma_search_no_save(info, key, (nextflag | SEARCH_FIND) &
+                                     ~(SEARCH_BIGGER | SEARCH_SMALLER |
+                                       SEARCH_LAST),
+                                     _ma_kpos(nod_flag,keypos),
+                                     res_page_link, res_page_buff)) >= 0 ||
+          my_errno != HA_ERR_KEY_NOT_FOUND)
+        DBUG_RETURN(error);
+    }
+  }
+
+  info->last_key.keyinfo= keyinfo;
+  if ((nextflag & (SEARCH_SMALLER | SEARCH_LAST)) && flag != 0)
+  {
+    uint not_used[2];
+    if (_ma_get_prev_key(&info->last_key, &page, keypos))
+      goto err;
+    /*
+      We have to use key->flag >> 1 here to transform
+      SEARCH_PAGE_KEY_HAS_TRANSID to SEARCH_USER_KEY_HAS_TRANSID
+    */
+    if (!(nextflag & SEARCH_SMALLER) &&
+        ha_key_cmp(keyinfo->seg, info->last_key.data, key->data,
+                   key->data_length + key->ref_length,
+                   SEARCH_FIND | (key->flag >> 1) | info->last_key.flag,
+                   not_used))
+    {
+      my_errno=HA_ERR_KEY_NOT_FOUND;                    /* Didn't find key */
+      goto err;
+    }
+  }
+  else
+  {
+    /* Set info->last_key to temporarily point to last key value */
+    info->last_key.data= lastkey;
+    /* Get key value (if not packed key) and position after key */
+    if (!(*keyinfo->get_key)(&info->last_key, page_flag, nod_flag, &keypos))
+      goto err;
+    memcpy(info->lastkey_buff, lastkey,
+           info->last_key.data_length + info->last_key.ref_length);
+    info->last_key.data= info->lastkey_buff;
+  }
+  info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+  info->cur_row.trid=    _ma_trid_from_key(&info->last_key);
+
+  /* Store offset to key */
+  info->keypos_offset= (uint) (keypos - page.buff);
+  info->maxpos_offset= (uint) (maxpos - page.buff);
+  info->int_nod_flag= nod_flag;
+  info->last_keypage= pos;
+  *res_page_link= page_link;
+  *res_page_buff= page.buff;
+  
+  DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+  DBUG_RETURN(0);
+
+err:
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  info->page_changed=1;
+  DBUG_RETURN (-1);
+}
+
+
+/**
+  Search after key in page-block
+
+  @fn    _ma_bin_search
+  @param key		Search after this key
+  @param ma_page	Key page to search in
+  @param comp_flag	How key should be compared
+  @param ret_pos	Out: see @retval ret_pos
+  @param buff		Buffer for holding a key (not used here)
+  @param last_key	Out: see @retval last_key
+
+  @note
+   If keys are packed, then smaller or identical key is stored in buff
+
+  @return
+  @retval <0, 0 , >0 depending on if the key found is smaller, equal or bigger
+          than 'key'
+  @retval ret_pos   Points to where the identical or bigger key starts
+  @retval last_key  Set to 1 if key is the last key in the page.
+*/
+
+int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                   uint32 comp_flag, uchar **ret_pos,
+                   uchar *buff __attribute__((unused)), my_bool *last_key)
+{
+  int flag;
+  uint page_flag;
+  uint start, mid, end, save_end, totlength, nod_flag;
+  uint not_used[2];
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share=  keyinfo->share;
+  uchar *page;
+  DBUG_ENTER("_ma_bin_search");
+
+  LINT_INIT(flag);
+
+  page_flag= ma_page->flag;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    /* Keys have varying length, can't use binary search */
+    DBUG_RETURN(_ma_seq_search(key, ma_page, comp_flag, ret_pos, buff,
+                               last_key));
+  }
+
+  nod_flag=    ma_page->node;
+  totlength= keyinfo->keylength + nod_flag;     /* Size of one page entry */
+  DBUG_ASSERT(ma_page->size >= share->keypage_header + nod_flag + totlength);
+
+  start=0;
+  mid=1;
+  save_end= end= ((ma_page->size - nod_flag - share->keypage_header) /
+                  totlength-1);                 /* Index of last entry */
+  DBUG_PRINT("test",("page_length: %u  end: %u", ma_page->size, end));
+  page= ma_page->buff + share->keypage_header + nod_flag;
+
+  while (start != end)
+  {
+    mid= (start+end)/2;
+    if ((flag=ha_key_cmp(keyinfo->seg, page + (uint) mid * totlength,
+                         key->data, key->data_length + key->ref_length,
+                         comp_flag, not_used))
+        >= 0)
+      end=mid;
+    else
+      start=mid+1;
+  }
+  if (mid != start)             /* Last compare was not done at 'start' */
+    flag=ha_key_cmp(keyinfo->seg, page + (uint) start * totlength,
+                    key->data, key->data_length + key->ref_length, comp_flag,
+                    not_used);
+  if (flag < 0)
+    start++;                    /* point at next, bigger key */
+  *ret_pos= (page + (uint) start * totlength);
+  *last_key= end == save_end;
+  DBUG_PRINT("exit",("flag: %d  keypos: %d",flag,start));
+  DBUG_RETURN(flag);
+} /* _ma_bin_search */
+
+
+/**
+   Locate a packed key in a key page.
+
+   @fn    _ma_seq_search()
+   @param key                       Search key.
+   @param ma_page                   Key page to search.
+   @param comp_flag                 Search flags like SEARCH_SAME etc.
+   @param ret_pos                   Out: position in the page, see below.
+   @param buff                      Buffer for holding temp keys
+   @param last_key                  Out: see below.
+
+   @description
+   Used instead of _ma_bin_search() when key is packed.
+   Puts smaller or identical key in buff.
+   Key is searched sequentially.
+
+   @todo
+   Don't copy key to buffer if we are not using key with prefix packing
+
+   @return
+   @retval > 0         Key in 'buff' is smaller than search key.
+   @retval 0           Key in 'buff' is identical to search key.
+   @retval < 0         Not found.
+   @retval MARIA_FOUND_WRONG_KEY
+                       get_key() returned 0 for a key or moved past the
+                       end of the page. my_errno is set to HA_ERR_CRASHED.
+
+   @retval ret_pos   Points to where the identical or bigger key starts
+   @retval last_key  Set to 1 if key is the last key in the page
+   @retval buff      Copy of previous or identical unpacked key
+*/
+
+int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                   uint32 comp_flag, uchar **ret_pos,
+                   uchar *buff, my_bool *last_key)
+{
+  int flag;
+  uint page_flag, nod_flag, length, not_used[2];
+  uchar t_buff[MARIA_MAX_KEY_BUFF], *end;
+  uchar *page;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share= keyinfo->share;
+  MARIA_KEY tmp_key;
+  DBUG_ENTER("_ma_seq_search");
+
+  LINT_INIT(flag);
+  LINT_INIT(length);
+
+  page_flag= ma_page->flag;
+  nod_flag=  ma_page->node;
+  page=      ma_page->buff;
+  end= page + ma_page->size;
+  page+= share->keypage_header + nod_flag;      /* Start of first key */
+  *ret_pos= page;
+  t_buff[0]=0;                                  /* Avoid bugs */
+
+  /* Each key on the page is unpacked into t_buff through tmp_key */
+  tmp_key.data= t_buff;
+  tmp_key.keyinfo= keyinfo;
+  while (page < end)
+  {
+    /* get_key() advances 'page' to the start of the following key */
+    length=(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &page);
+    if (length == 0 || page > end)
+    {
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_PRINT("error",
+                 ("Found wrong key:  length: %u  page: 0x%lx  end: 0x%lx",
+                  length, (long) page, (long) end));
+      DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+    }
+    if ((flag= ha_key_cmp(keyinfo->seg, t_buff, key->data,
+                          key->data_length + key->ref_length,
+                          comp_flag | tmp_key.flag,
+                          not_used)) >= 0)
+      break;                                    /* Found same or bigger key */
+    /*
+      The unpacked key is binary data without a terminating NUL, so the
+      printout is limited to the length returned by get_key().
+    */
+    DBUG_PRINT("loop_extra",("page: 0x%lx  key: '%.*s'  flag: %d",
+                             (long) page, (int) length, t_buff, flag));
+    memcpy(buff,t_buff,length);                 /* Remember smaller key */
+    *ret_pos=page;
+  }
+  if (flag == 0)
+    memcpy(buff,t_buff,length);                 /* Result is first key */
+  *last_key= page == end;
+  DBUG_PRINT("exit",("flag: %d  ret_pos: 0x%lx", flag, (long) *ret_pos));
+  DBUG_RETURN(flag);
+} /* _ma_seq_search */
+
+
+/**
+   Search for key on key page with string prefix compression
+
+   @notes
+   This is an optimized function compared to calling _ma_get_pack_key()
+   for each key in the buffer
+
+   Same interface as for _ma_seq_search()
+
+   If a key is found to extend past the end of the page, my_errno is set
+   to HA_ERR_CRASHED and MARIA_FOUND_WRONG_KEY is returned.
+*/
+
+int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *ma_page,
+                      uint32 nextflag, uchar **ret_pos, uchar *buff,
+                      my_bool *last_key)
+{
+  /*
+    my_flag is raw comparison result to be changed according to
+    SEARCH_NO_FIND,SEARCH_LAST and HA_REVERSE_SORT flags.
+    flag is the value returned by ha_key_cmp and is treated as final
+  */
+  int flag=0, my_flag=-1;
+  uint nod_flag, length, len, matched, cmplen, kseg_len;
+  uint page_flag, prefix_len,suffix_len;
+  int key_len_skip, seg_len_pack, key_len_left;
+  uchar *end, *vseg, *saved_vseg, *saved_from;
+  uchar *page;
+  uchar tt_buff[MARIA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
+  uchar  *saved_to;
+  const uchar *kseg;
+  uint  saved_length=0, saved_prefix_len=0;
+  uint  length_pack;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share= keyinfo->share;
+  const uchar *sort_order= keyinfo->seg->charset->sort_order;
+  DBUG_ENTER("_ma_prefix_search");
+
+  LINT_INIT(length);
+  LINT_INIT(prefix_len);
+  LINT_INIT(seg_len_pack);
+  LINT_INIT(saved_from);
+  LINT_INIT(saved_to);
+  LINT_INIT(saved_vseg);
+
+  t_buff[0]=0;                                  /* Avoid bugs */
+  page_flag=   ma_page->flag;
+  nod_flag=    ma_page->node;
+  page_flag&= KEYPAGE_FLAG_HAS_TRANSID;         /* For faster test in loop */
+  page= ma_page->buff;
+  end= page + ma_page->size;
+  page+= share->keypage_header + nod_flag;
+  *ret_pos= page;
+  kseg= key->data;
+
+  get_key_pack_length(kseg_len, length_pack, kseg);
+  key_len_skip=length_pack+kseg_len;
+  key_len_left=(int) (key->data_length + key->ref_length) - (int) key_len_skip;
+  /* If key_len is 0, then length_pack is 1, then key_len_left is -1. */
+  cmplen= ((key_len_left>=0) ? kseg_len :
+           (key->data_length + key->ref_length - length_pack));
+  DBUG_PRINT("info",("key: '%.*s'",kseg_len,kseg));
+
+  /*
+    Keys are compressed the following way:
+
+    If the max length of first key segment <= 127 bytes the prefix is
+    1 uchar else it's 2 bytes
+
+    (prefix) length  The high bit is set if this is a prefix for the prev key.
+    [suffix length]  Packed length of suffix if the previous was a prefix.
+    (suffix) data    Key data bytes (past the common prefix or whole segment).
+    [next-key-seg]   Next key segments (([packed length], data), ...)
+    pointer          Reference to the data file (last_keyseg->length).
+  */
+
+  matched=0;  /* how many char's from prefix were already matched */
+  len=0;      /* length of previous key unpacked */
+
+  while (page < end)
+  {
+    uint packed= *page & 128;
+    uint key_flag;
+
+    vseg= page;
+    if (keyinfo->seg->length >= 127)
+    {
+      suffix_len=mi_uint2korr(vseg) & 32767;
+      vseg+=2;
+    }
+    else
+      suffix_len= *vseg++ & 127;
+
+    if (packed)
+    {
+      if (suffix_len == 0)
+      {
+        /* == 0x80 or 0x8000, same key, prefix length == old key length. */
+        prefix_len=len;
+      }
+      else
+      {
+        /* > 0x80 or 0x8000, this is prefix lgt, packed suffix lgt follows. */
+        prefix_len=suffix_len;
+        get_key_length(suffix_len,vseg);
+      }
+    }
+    else
+    {
+      /* Not packed. No prefix used from last key. */
+      prefix_len=0;
+    }
+
+    len=prefix_len+suffix_len;
+    seg_len_pack=get_pack_length(len);
+    t_buff=tt_buff+3-seg_len_pack;
+    store_key_length(t_buff,len);
+
+    if (prefix_len > saved_prefix_len)
+      memcpy(t_buff+seg_len_pack+saved_prefix_len,saved_vseg,
+             prefix_len-saved_prefix_len);
+    saved_vseg=vseg;
+    saved_prefix_len=prefix_len;
+
+    DBUG_PRINT("loop",("page: '%.*s%.*s'",prefix_len,t_buff+seg_len_pack,
+		       suffix_len,vseg));
+    {
+      /* Calculate length of one key */
+      uchar *from= vseg+suffix_len;
+      HA_KEYSEG *keyseg;
+
+      for (keyseg=keyinfo->seg+1 ; keyseg->type ; keyseg++ )
+      {
+        if (keyseg->flag & HA_NULL_PART)
+        {
+          if (!(*from++))
+            continue;
+        }
+        if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+        {
+          uint key_part_length;
+          get_key_length(key_part_length,from);
+          from+= key_part_length;
+        }
+        else
+          from+= keyseg->length;
+      }
+      from+= keyseg->length;
+      key_flag=0;
+
+      if (page_flag && key_has_transid(from-1))
+      {
+        from+= transid_packed_length(from);
+        key_flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+      }
+      page= from + nod_flag;
+      length= (uint) (from-vseg);
+    }
+
+    if (page > end)
+    {
+      maria_print_error(share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_PRINT("error",
+                 ("Found wrong key:  length: %u  page: 0x%lx  end: %lx",
+                  length, (long) page, (long) end));
+      DBUG_RETURN(MARIA_FOUND_WRONG_KEY);
+    }
+
+    if (matched >= prefix_len)
+    {
+      /* We have to compare. But we can still skip part of the key */
+      uint  left;
+      const uchar *k= kseg+prefix_len;
+
+      /*
+        If prefix_len > cmplen then we are in the end-space comparison
+        phase. Do not try to access the key any more ==> left= 0.
+      */
+      left= ((len <= cmplen) ? suffix_len :
+             ((prefix_len < cmplen) ? cmplen - prefix_len : 0));
+
+      matched=prefix_len+left;
+
+      if (sort_order)
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) sort_order[*vseg++] - (int) sort_order[*k++]))
+            break;
+      }
+      else
+      {
+        for (my_flag=0;left;left--)
+          if ((my_flag= (int) *vseg++ - (int) *k++))
+            break;
+      }
+
+      if (my_flag>0)      /* mismatch */
+        break;
+      if (my_flag==0) /* match */
+      {
+	/*
+        **  len cmplen seg_left_len more_segs
+        **     <                               matched=len; continue search
+        **     >      =                        prefix ? found : (matched=len;
+        *                                      continue search)
+        **     >      <                 -      ok, found
+        **     =      <                 -      ok, found
+        **     =      =                 -      ok, found
+        **     =      =                 +      next seg
+        */
+        if (len < cmplen)
+        {
+	  if ((keyinfo->seg->type != HA_KEYTYPE_TEXT &&
+	       keyinfo->seg->type != HA_KEYTYPE_VARTEXT1 &&
+               keyinfo->seg->type != HA_KEYTYPE_VARTEXT2))
+	    my_flag= -1;
+	  else
+	  {
+	    /* We have to compare k and vseg as if they were space extended */
+	    const uchar *k_end= k+ (cmplen - len);
+	    for ( ; k < k_end && *k == ' '; k++) ;
+	    if (k == k_end)
+	      goto cmp_rest;		/* should never happen */
+	    if ((uchar) *k < (uchar) ' ')
+	    {
+	      my_flag= 1;		/* Compared string is smaller */
+	      break;
+	    }
+	    my_flag= -1;		/* Continue searching */
+	  }
+        }
+        else if (len > cmplen)
+        {
+	  uchar *vseg_end;
+	  if ((nextflag & SEARCH_PREFIX) && key_len_left == 0)
+	    goto fix_flag;
+
+	  /* We have to compare k and vseg as if they were space extended */
+	  for (vseg_end= vseg + (len-cmplen) ;
+	       vseg < vseg_end && *vseg == (uchar) ' ';
+	       vseg++, matched++) ;
+	  DBUG_ASSERT(vseg < vseg_end);
+
+	  if ((uchar) *vseg > (uchar) ' ')
+	  {
+	    my_flag= 1;			/* Compared string is smaller */
+	    break;
+	  }
+	  my_flag= -1;			/* Continue searching */
+        }
+        else
+	{
+      cmp_rest:
+	  if (key_len_left>0)
+	  {
+	    uint not_used[2];
+	    if ((flag = ha_key_cmp(keyinfo->seg+1,vseg,
+				   k, key_len_left, nextflag | key_flag,
+                                   not_used)) >= 0)
+	      break;
+	  }
+	  else
+	  {
+	    /*
+	      at this line flag==-1 if the following lines were already
+	      visited and 0 otherwise,  i.e. flag <=0 here always !!!
+	    */
+	fix_flag:
+	    DBUG_ASSERT(flag <= 0);
+	    if (nextflag & (SEARCH_NO_FIND | SEARCH_LAST))
+	      flag=(nextflag & (SEARCH_BIGGER | SEARCH_LAST)) ? -1 : 1;
+	    if (flag>=0)
+	      break;
+	  }
+	}
+      }
+      matched-=left;
+    }
+    /* else (matched < prefix_len) ---> do nothing. */
+
+    memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+    saved_to= buff+saved_length;
+    saved_from= saved_vseg;
+    saved_length=length;
+    *ret_pos=page;
+  }
+  if (my_flag)
+    flag=(keyinfo->seg->flag & HA_REVERSE_SORT) ? -my_flag : my_flag;
+  if (flag == 0)
+  {
+    memcpy(buff,t_buff,saved_length=seg_len_pack+prefix_len);
+    saved_to= buff+saved_length;
+    saved_from= saved_vseg;
+    saved_length=length;
+  }
+  if (saved_length)
+    memcpy(saved_to, saved_from, saved_length);
+
+  *last_key= page == end;
+
+  DBUG_PRINT("exit",("flag: %d  ret_pos: 0x%lx", flag, (long) *ret_pos));
+  DBUG_RETURN(flag);
+} /* _ma_prefix_search */
+
+
+/*
+  Get pos to a key_block
+
+  The node pointer is read from the 'nod_flag' bytes that end at
+  'after_key' and is multiplied with maria_block_size.
+
+  Returns HA_OFFSET_ERROR if nod_flag is 0 (leaf page) or is not one of
+  the handled pointer lengths.
+*/
+
+my_off_t _ma_kpos(uint nod_flag, const uchar *after_key)
+{
+  const uchar *ptr= after_key - nod_flag;       /* First byte of pointer */
+  my_off_t block_pos;
+
+  switch (nod_flag) {
+#if SIZEOF_OFF_T > 4
+  case 7:
+    block_pos= mi_uint7korr(ptr)*maria_block_size;
+    break;
+  case 6:
+    block_pos= mi_uint6korr(ptr)*maria_block_size;
+    break;
+  case 5:
+    block_pos= mi_uint5korr(ptr)*maria_block_size;
+    break;
+#else
+  case 7:
+    ptr++;
+    /* fall through */
+  case 6:
+    ptr++;
+    /* fall through */
+  case 5:
+    ptr++;                                      /* Read only last 4 bytes */
+    /* fall through */
+#endif
+  case 4:
+    block_pos= ((my_off_t) mi_uint4korr(ptr))*maria_block_size;
+    break;
+  case 3:
+    block_pos= ((my_off_t) mi_uint3korr(ptr))*maria_block_size;
+    break;
+  case 2:
+    block_pos= (my_off_t) (mi_uint2korr(ptr)*maria_block_size);
+    break;
+  case 1:
+    block_pos= (uint) (*ptr)*maria_block_size;
+    break;
+  case 0:                                       /* At leaf page */
+  default:                                      /* Impossible */
+    block_pos= HA_OFFSET_ERROR;
+    break;
+  }
+  return block_pos;
+} /* _ma_kpos */
+
+
+/*
+  Save pos to a key_block
+
+  'pos' is divided by maria_block_size and the result is stored in 'buff'
+  using info->s->base.key_reflength bytes. Calls abort() if that length
+  is not in the range 1-7.
+*/
+
+void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos)
+{
+  my_off_t page_no= pos / maria_block_size;
+
+  switch (info->s->base.key_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 7:
+    mi_int7store(buff,page_no);
+    break;
+  case 6:
+    mi_int6store(buff,page_no);
+    break;
+  case 5:
+    mi_int5store(buff,page_no);
+    break;
+#else
+  /* Only 4 bytes can be filled; the leading bytes are stored as zero */
+  case 7:
+    *buff++=0;
+    /* fall through */
+  case 6:
+    *buff++=0;
+    /* fall through */
+  case 5:
+    *buff++=0;
+    /* fall through */
+#endif
+  case 4:
+    mi_int4store(buff,page_no);
+    break;
+  case 3:
+    mi_int3store(buff,page_no);
+    break;
+  case 2:
+    mi_int2store(buff,(uint) page_no);
+    break;
+  case 1:
+    buff[0]= (uchar) page_no;
+    break;
+  default:
+    abort();                                    /* impossible */
+  }
+} /* _ma_kpointer */
+
+
+/*
+  Calc pos to a data-record from a key
+
+  The stored position is read from the share->rec_reflength bytes that
+  follow the key data and is then converted with share->keypos_to_recpos.
+  For a pointer length that is not handled, position 0 is converted.
+*/
+
+MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key)
+{
+  MARIA_SHARE *share= key->keyinfo->share;
+  const uchar *ref= key->data + key->data_length;   /* Start of pointer */
+  my_off_t keypos;
+
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8:
+    keypos= (my_off_t) mi_uint8korr(ref);
+    break;
+  case 7:
+    keypos= (my_off_t) mi_uint7korr(ref);
+    break;
+  case 6:
+    keypos= (my_off_t) mi_uint6korr(ref);
+    break;
+  case 5:
+    keypos= (my_off_t) mi_uint5korr(ref);
+    break;
+#else
+  /* Only the last 4 bytes of the pointer are read */
+  case 8:
+    keypos= (my_off_t) mi_uint4korr(ref+4);
+    break;
+  case 7:
+    keypos= (my_off_t) mi_uint4korr(ref+3);
+    break;
+  case 6:
+    keypos= (my_off_t) mi_uint4korr(ref+2);
+    break;
+  case 5:
+    keypos= (my_off_t) mi_uint4korr(ref+1);
+    break;
+#endif
+  case 4:
+    keypos= (my_off_t) mi_uint4korr(ref);
+    break;
+  case 3:
+    keypos= (my_off_t) mi_uint3korr(ref);
+    break;
+  case 2:
+    keypos= (my_off_t) mi_uint2korr(ref);
+    break;
+  default:
+    keypos= 0L;                                 /* Shut compiler up */
+    break;
+  }
+  return (*share->keypos_to_recpos)(share, keypos);
+}
+
+
+/**
+   Get trid from a key
+
+   The packed trid is read from the position after the key data and
+   the record reference (share->rec_reflength bytes).
+
+   @param key	Maria key read from a page
+
+   @retval 0    If key doesn't have a trid
+   @retval trid
+*/
+
+TrID _ma_trid_from_key(const MARIA_KEY *key)
+{
+  if (key->flag & (SEARCH_PAGE_KEY_HAS_TRANSID |
+                   SEARCH_USER_KEY_HAS_TRANSID))
+  {
+    MARIA_SHARE *share= key->keyinfo->share;
+    uchar *trid_pos= key->data + key->data_length + share->rec_reflength;
+
+    return transid_get_packed(share, trid_pos);
+  }
+  return 0;                                     /* No trid stored in key */
+}
+
+
+/*
+  Calc position from a record pointer ( in delete link chain )
+
+  Reads share->rec_reflength bytes from 'ptr'. If all bits of the value
+  that was read are set, this is the end of list marker and
+  HA_OFFSET_ERROR is returned. Otherwise the value is converted with
+  share->keypos_to_recpos. Calls abort() for a pointer length that is
+  not handled.
+*/
+
+MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr)
+{
+  my_off_t pos;
+  my_off_t end_marker;                          /* Value marking end of list */
+
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8:
+    pos= (my_off_t) mi_uint8korr(ptr);
+    end_marker= HA_OFFSET_ERROR;
+    break;
+  case 7:
+    pos= (my_off_t) mi_uint7korr(ptr);
+    end_marker= (((my_off_t) 1) << 56) -1;
+    break;
+  case 6:
+    pos= (my_off_t) mi_uint6korr(ptr);
+    end_marker= (((my_off_t) 1) << 48) -1;
+    break;
+  case 5:
+    pos= (my_off_t) mi_uint5korr(ptr);
+    end_marker= (((my_off_t) 1) << 40) -1;
+    break;
+#else
+  case 8:
+  case 7:
+  case 6:
+  case 5:
+    ptr+= (share->rec_reflength-4);             /* Use only last 4 bytes */
+    /* fall through */
+#endif
+  case 4:
+    pos= (my_off_t) mi_uint4korr(ptr);
+    end_marker= (my_off_t) (uint32) ~0L;
+    break;
+  case 3:
+    pos= (my_off_t) mi_uint3korr(ptr);
+    end_marker= (my_off_t) (1 << 24) -1;
+    break;
+  case 2:
+    pos= (my_off_t) mi_uint2korr(ptr);
+    end_marker= (my_off_t) (1 << 16) -1;
+    break;
+  default:
+    abort();                                    /* Impossible */
+  }
+
+  if (pos == end_marker)
+    return HA_OFFSET_ERROR;                     /* end of list */
+  return (*share->keypos_to_recpos)(share, pos);
+}
+
+
+/*
+  save position to record
+
+  'pos' is converted with share->recpos_to_keypos, unless it is
+  HA_OFFSET_ERROR which is stored unconverted, and is written to 'buff'
+  using share->rec_reflength bytes. Calls abort() for a pointer length
+  that is not handled.
+*/
+
+void _ma_dpointer(MARIA_SHARE *share, uchar *buff, my_off_t pos)
+{
+  my_off_t keypos= pos;
+
+  if (keypos != HA_OFFSET_ERROR)
+    keypos= (*share->recpos_to_keypos)(share, keypos);
+
+  switch (share->rec_reflength) {
+#if SIZEOF_OFF_T > 4
+  case 8:
+    mi_int8store(buff,keypos);
+    break;
+  case 7:
+    mi_int7store(buff,keypos);
+    break;
+  case 6:
+    mi_int6store(buff,keypos);
+    break;
+  case 5:
+    mi_int5store(buff,keypos);
+    break;
+#else
+  /* Only 4 bytes can be filled; the leading bytes are stored as zero */
+  case 8:
+    *buff++=0;
+    /* fall through */
+  case 7:
+    *buff++=0;
+    /* fall through */
+  case 6:
+    *buff++=0;
+    /* fall through */
+  case 5:
+    *buff++=0;
+    /* fall through */
+#endif
+  case 4:
+    mi_int4store(buff,keypos);
+    break;
+  case 3:
+    mi_int3store(buff,keypos);
+    break;
+  case 2:
+    mi_int2store(buff,(uint) keypos);
+    break;
+  default:
+    abort();                                    /* Impossible */
+  }
+} /* _ma_dpointer */
+
+
+/* Multiply a position stored in a key with share->base.pack_reclength */
+
+my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos)
+{
+  my_off_t recpos= pos;
+
+  recpos*= share->base.pack_reclength;
+  return recpos;
+}
+
+
+/* Divide a record position with share->base.pack_reclength */
+
+my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos)
+{
+  my_off_t keypos= pos;
+
+  keypos/= share->base.pack_reclength;
+  return keypos;
+}
+
+/* Position conversion that returns the given position unchanged */
+
+my_off_t _ma_transparent_recpos(MARIA_SHARE *share __attribute__((unused)),
+                                my_off_t pos)
+{
+  my_off_t same_pos= pos;
+
+  return same_pos;
+}
+
+/*
+  Convert a position stored in a key to a record position
+
+  We need one bit to store if there is transid's after position, so the
+  stored value is shifted one bit to the right to get the position.
+*/
+
+my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *share
+                                          __attribute__((unused)),
+                                          my_off_t pos)
+{
+  my_off_t recpos= pos;
+
+  recpos>>= 1;
+  return recpos;
+}
+
+/* Shift a record position one bit to the left to get the key position */
+
+my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *share
+                                          __attribute__((unused)),
+                                          my_off_t pos)
+{
+  my_off_t keypos= pos;
+
+  keypos<<= 1;
+  return keypos;
+}
+
+/*
+  @brief Get key from key-block
+
+  @param key         Key to fill. key->keyinfo must be set and key->data
+                     must point to the buffer that receives the key
+  @param page_flag   Flag on page block
+  @param nod_flag    Is set to nod length if we on nod
+  @param page        Points at the key to read; It's advanced to point at
+                     next key
+
+  @notes
+    Same as _ma_get_key but used with fixed length keys
+
+    key->data_length, key->ref_length and key->flag are set. The node
+    pointer, if any, is copied to key->data after the key.
+
+  @return
+  @retval key_length + length of data pointer (without nod length)
+ */
+
+uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+                        register uchar **page)
+{
+  register MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uchar *key_start= *page;
+  size_t stored_length= keyinfo->keylength;
+
+  key->flag= 0;
+  key->ref_length=  keyinfo->share->rec_reflength;
+  key->data_length= stored_length - key->ref_length;
+
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *key_end= key_start + keyinfo->keylength;
+    if (key_has_transid(key_end-1))
+    {
+      /* Transid is counted as part of the reference */
+      uint trid_length= transid_packed_length(key_end);
+      key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+      key->ref_length+= trid_length;
+      stored_length+= trid_length;
+    }
+  }
+
+  memcpy(key->data, key_start, stored_length + nod_flag);
+  *page= key_start + stored_length + nod_flag;
+  return stored_length;
+} /* _ma_get_static_key */
+
+
+/**
+   Skip over static length key from key-block
+
+  @fn _ma_skip_static_key()
+  @param key        Key whose keyinfo gives the key length
+  @param page_flag  Flag on page block
+  @param nod_flag   If nod: Length of node pointer, else zero.
+  @param page       Points at key
+
+  @retval pointer to next key
+*/
+
+uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag,
+                           uint nod_flag, uchar *page)
+{
+  uchar *key_end= page + key->keyinfo->keylength;
+
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    if (key_has_transid(key_end-1))
+      key_end+= transid_packed_length(key_end);
+  }
+  return key_end + nod_flag;
+}
+
+
+/*
+  get key which is packed against previous key or key with a NULL column.
+
+  SYNOPSIS
+    _ma_get_pack_key()
+    @param int_key   Should contain previous key. Will contain new key
+    @param page_flag page_flag from page
+    @param nod_flag  If nod: Length of node pointer, else zero.
+    @param page_pos  Points at previous key; Its advanced to point at next key
+
+    @notes
+    int_key->data_length, int_key->ref_length and int_key->flag are set.
+    The data pointer, a transid if there is one, and the node pointer are
+    copied to int_key->data after the key segments.
+
+    @return
+    @retval key_length + length of data pointer
+    @retval 0  A stored length was bigger than the length of the key
+               segment. my_errno is set to HA_ERR_CRASHED.
+*/
+
+uint _ma_get_pack_key(MARIA_KEY *int_key, uint page_flag,
+                      uint nod_flag, uchar **page_pos)
+{
+  reg1 HA_KEYSEG *keyseg;
+  uchar *page= *page_pos;
+  uint length;
+  uchar *key= int_key->data;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+
+  for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->flag & HA_PACK_KEY)
+    {
+      /* key with length, packed to previous key */
+      uchar *start= key;
+      uint packed= *page & 128,tot_length,rest_length;
+      if (keyseg->length >= 127)
+      {
+        length=mi_uint2korr(page) & 32767;
+        page+=2;
+      }
+      else
+        length= *page++ & 127;
+
+      if (packed)
+      {
+	if (length > (uint) keyseg->length)
+	{
+          maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+	  my_errno=HA_ERR_CRASHED;
+	  return 0;				/* Error */
+	}
+	if (length == 0)			/* Same key */
+	{
+	  if (keyseg->flag & HA_NULL_PART)
+	    *key++=1;				/* Can't be NULL */
+	  get_key_length(length,key);
+	  key+= length;				/* Same diff_key as prev */
+	  if (length > keyseg->length)
+	  {
+	    DBUG_PRINT("error",
+                       ("Found too long null packed key: %u of %u at 0x%lx",
+                        length, keyseg->length, (long) *page_pos));
+	    DBUG_DUMP("key", *page_pos, 16);
+            maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+	    my_errno=HA_ERR_CRASHED;
+	    return 0;
+	  }
+	  continue;
+	}
+	if (keyseg->flag & HA_NULL_PART)
+	{
+	  key++;				/* Skip null marker*/
+	  start++;
+	}
+
+	get_key_length(rest_length,page);
+	tot_length=rest_length+length;
+
+	/* If the stored length has changed, we must move the key */
+	if (tot_length >= 255 && *start != 255)
+	{
+	  /* length prefix changed from a length of one to a length of 3 */
+	  bmove_upp(key+length+3, key+length+1, length);
+	  *key=255;
+	  mi_int2store(key+1,tot_length);
+	  key+=3+length;
+	}
+	else if (tot_length < 255 && *start == 255)
+	{
+	  /* length prefix changed from a length of 3 to a length of one */
+	  bmove(key+1,key+3,length);
+	  *key=tot_length;
+	  key+=1+length;
+	}
+	else
+	{
+	  store_key_length_inc(key,tot_length);
+	  key+=length;
+	}
+	memcpy(key,page,rest_length);
+	page+=rest_length;
+	key+=rest_length;
+	continue;
+      }
+      else
+      {
+        /* Key that is not packed against previous key */
+        if (keyseg->flag & HA_NULL_PART)
+        {
+          if (!length--)                        /* Null part */
+          {
+            *key++=0;
+            continue;
+          }
+          *key++=1;                             /* Not null */
+        }
+      }
+      if (length > (uint) keyseg->length)
+      {
+        DBUG_PRINT("error",("Found too long packed key: %u of %u at 0x%lx",
+                            length, keyseg->length, (long) *page_pos));
+        DBUG_DUMP("key", *page_pos, 16);
+        maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+        my_errno=HA_ERR_CRASHED;
+        return 0;                               /* Error */
+      }
+      store_key_length_inc(key,length);
+    }
+    else
+    {
+      if (keyseg->flag & HA_NULL_PART)
+      {
+        if (!(*key++ = *page++))
+          continue;
+      }
+      if (keyseg->flag &
+          (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+      {
+        uchar *tmp=page;
+        get_key_length(length,tmp);
+        length+=(uint) (tmp-page);
+      }
+      else
+        length=keyseg->length;
+    }
+    memcpy(key, page,(size_t) length);
+    key+=length;
+    page+=length;
+  }
+
+  int_key->data_length= (key - int_key->data);
+  int_key->flag= 0;
+  length= keyseg->length;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *end= page + length;
+    if (key_has_transid(end-1))
+    {
+      length+= transid_packed_length(end);
+      int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+    }
+  }
+  int_key->ref_length= length;
+  length+= nod_flag;
+  bmove(key, page, length);
+  *page_pos= page+length;
+
+  return (int_key->data_length + int_key->ref_length);
+} /* _ma_get_pack_key */
+
+
+/**
+  skip key which is packed against previous key or key with a NULL column.
+
+  @fn _ma_skip_pack_key()
+  @param key        Key whose keyinfo describes the key segments
+  @param page_flag  page_flag from page
+  @param nod_flag   If nod: Length of node pointer, else zero.
+  @param page       Points at key
+
+  @note
+  This is in principle a simpler version of _ma_get_pack_key()
+
+  @retval pointer to next key
+*/
+
+uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag,
+                         uint nod_flag, uchar *page)
+{
+  reg1 HA_KEYSEG *keyseg= key->keyinfo->seg;
+
+  while (keyseg->type)
+  {
+    if (!(keyseg->flag & HA_PACK_KEY))
+    {
+      /* The null marker, if there is one, is always stepped over */
+      if (!(keyseg->flag & HA_NULL_PART) || *page++)
+      {
+        if (keyseg->flag &
+            (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+        {
+          uint seg_length;
+          get_key_length(seg_length, page);
+          page+= seg_length;
+        }
+        else
+          page+= keyseg->length;
+      }
+    }
+    else
+    {
+      /* key with length, packed to previous key */
+      uint is_packed= *page & 128;
+      uint stored_length;
+
+      if (keyseg->length >= 127)
+      {
+        stored_length= mi_uint2korr(page) & 32767;
+        page+= 2;
+      }
+      else
+        stored_length= *page++ & 127;
+
+      if (is_packed)
+      {
+        /* Length 0 means same key as previous; nothing more is stored */
+        if (stored_length != 0)
+        {
+          get_key_length(stored_length, page);
+          page+= stored_length;
+        }
+      }
+      else
+      {
+        /*
+          Keys that can have null use length+1 as the length for data as
+          the number 0 is reserved for keys that have a NULL value
+        */
+        if ((keyseg->flag & HA_NULL_PART) && stored_length)
+          stored_length--;
+        page+= stored_length;
+      }
+    }
+    keyseg++;
+  }
+
+  /* The end segment holds the length of the data pointer */
+  page+= keyseg->length;
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    if (key_has_transid(page-1))
+      page+= transid_packed_length(page);
+  }
+  return page + nod_flag;
+}
+
+
+/*
+  Read key that is packed relatively to previous
+
+  int_key->data is used as the source of the common prefix when the
+  stored prefix length is not zero, and receives the unpacked key.
+  int_key->data_length, int_key->ref_length and int_key->flag are set and
+  *page_pos is advanced past the key, a transid if there is one, and the
+  node pointer.
+
+  Returns int_key->data_length + int_key->ref_length, or 0 with my_errno
+  set to HA_ERR_CRASHED if the key could not be unpacked.
+*/
+
+uint _ma_get_binary_pack_key(MARIA_KEY *int_key, uint page_flag, uint nod_flag,
+                             register uchar **page_pos)
+{
+  reg1 HA_KEYSEG *keyseg;
+  uchar *page, *page_end, *from, *from_end, *key;
+  uint length,tmp;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+  DBUG_ENTER("_ma_get_binary_pack_key");
+
+  page= *page_pos;
+  page_end=page + MARIA_MAX_KEY_BUFF + 1;
+  key= int_key->data;
+
+  /*
+    Keys are compressed the following way:
+
+    prefix length      Packed length of prefix common with prev key.
+                       (1 or 3 bytes)
+    for each key segment:
+      [is null]        Null indicator if can be null (1 byte, zero means null)
+      [length]         Packed length if varlength (1 or 3 bytes)
+      key segment      'length' bytes of key segment value
+    pointer          Reference to the data file (last_keyseg->length).
+
+    get_key_length() is a macro. It gets the prefix length from 'page'
+    and puts it into 'length'. It increments 'page' by 1 or 3, depending
+    on the packed length of the prefix length.
+  */
+  get_key_length(length,page);
+  if (length)
+  {
+    if (length > keyinfo->maxlength)
+    {
+      DBUG_PRINT("error",
+                 ("Found too long binary packed key: %u of %u at 0x%lx",
+                  length, keyinfo->maxlength, (long) *page_pos));
+      DBUG_DUMP("key", *page_pos, 16);
+      maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);                                 /* Wrong key */
+    }
+    /* Key is packed against prev key, take prefix from prev key. */
+    from= key;
+    from_end= key + length;
+  }
+  else
+  {
+    /* Key is not packed against prev key, take all from page buffer. */
+    from= page;
+    from_end= page_end;
+  }
+
+  /*
+    The trouble is that key can be split in two parts:
+      The first part (prefix) is in from .. from_end - 1.
+      The second part starts at page.
+    The split can be at every byte position. So we need to check for
+    the end of the first part before using every byte.
+  */
+  for (keyseg=keyinfo->seg ; keyseg->type ;keyseg++)
+  {
+    if (keyseg->flag & HA_NULL_PART)
+    {
+      /* If prefix is used up, switch to rest. */
+      if (from == from_end)
+      {
+        from=page;
+        from_end=page_end;
+      }
+      if (!(*key++ = *from++))
+        continue;                               /* Null part */
+    }
+    if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK))
+    {
+      /* If prefix is used up, switch to rest. */
+      if (from == from_end) { from=page;  from_end=page_end; }
+      /* Get length of dynamic length key part */
+      if ((length= (uint) (uchar) (*key++ = *from++)) == 255)
+      {
+        /* If prefix is used up, switch to rest. */
+        if (from == from_end) { from=page;  from_end=page_end; }
+        length= ((uint) (uchar) ((*key++ = *from++))) << 8;
+        /* If prefix is used up, switch to rest. */
+        if (from == from_end) { from=page;  from_end=page_end; }
+        length+= (uint) (uchar) ((*key++ = *from++));
+      }
+    }
+    else
+      length=keyseg->length;
+
+    if ((tmp=(uint) (from_end-from)) <= length)
+    {
+      key+=tmp;                                 /* Use old key */
+      length-=tmp;
+      from=page; from_end=page_end;
+    }
+    DBUG_ASSERT((int) length >= 0);
+    DBUG_PRINT("info",("key: 0x%lx  from: 0x%lx  length: %u",
+		       (long) key, (long) from, length));
+    memmove(key, from, (size_t) length);
+    key+=length;
+    from+=length;
+  }
+  /*
+    Last segment (type == 0) contains length of data pointer.
+    If we have mixed key blocks with data pointer and key block pointer,
+    we have to copy both.
+  */
+  int_key->data_length= (key - int_key->data);
+  int_key->ref_length= length= keyseg->length;
+  int_key->flag= 0;
+  if ((tmp=(uint) (from_end-from)) <= length)
+  {
+    /* Skip over the last common part of the data */
+    key+= tmp;
+    length-= tmp;
+    from= page;
+  }
+  else
+  {
+    /*
+      Remaining length is greater than max possible length.
+      This can happen only if we switched to the new key bytes already.
+      'page_end' is calculated with MARIA_MAX_KEY_BUFF. So it can be far
+      behind the real end of the key.
+    */
+    if (from_end != page_end)
+    {
+      DBUG_PRINT("error",("Error when unpacking key"));
+      maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);                                 /* Error */
+    }
+  }
+  if (page_flag & KEYPAGE_FLAG_HAS_TRANSID)
+  {
+    uchar *end= from + length;
+    if (key_has_transid(end-1))
+    {
+      uint trans_length= transid_packed_length(end);
+      length+= trans_length;
+      int_key->ref_length+= trans_length;
+      int_key->flag= SEARCH_PAGE_KEY_HAS_TRANSID;
+    }
+  }
+
+  /* Copy rest of data ptr and, if appropriate, trans_id and node_ptr */
+  memcpy(key, from, length + nod_flag);
+  *page_pos= from + length + nod_flag;
+  
+  DBUG_RETURN(int_key->data_length + int_key->ref_length);
+}
+
+/**
+  skip key which is prefix packed against previous key
+
+  @fn _ma_skip_binary_pack_key()
+  @param key        Keyinfo and buffer that can be used
+  @param page_flag  page_flag from page
+  @param nod_flag   If nod: Length of node pointer, else zero.
+  @param page       Points at key
+
+  @note
+  We have to copy the key as otherwise we don't know how much left
+  data there is of the key.
+
+  @todo
+  Implement more efficient version of this. We can ignore to copy any rest
+  key parts that are not null or not packed. We also don't have to copy
+  rowid or transid.
+
+  @retval pointer to next key
+  @retval 0  if _ma_get_binary_pack_key() failed
+*/
+
+uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag,
+                                uint nod_flag, uchar *page)
+{
+  uchar *next_key= page;
+
+  /* Unpacking the key moves next_key to the key that follows */
+  if (_ma_get_binary_pack_key(key, page_flag, nod_flag, &next_key))
+    return next_key;
+  return 0;
+}
+
+
+/**
+  @brief Get key at position without knowledge of previous key
+
+  @param key      Key to fill in; key->keyinfo selects the key definition
+                  and key->data receives the key
+  @param ma_page  Key page that contains 'keypos'
+  @param keypos   Position of the wanted key on the page
+
+  @return pointer to next key
+  @retval 0  A key could not be unpacked; my_errno is set to HA_ERR_CRASHED
+*/
+
+uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *keypos)
+{
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uint page_flag= ma_page->flag;
+  uint nod_flag=  ma_page->node;
+  uchar *page;
+  DBUG_ENTER("_ma_get_key");
+
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+      !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    /* Fixed size keys: the key can be copied directly from keypos */
+    const uint copy_length= keyinfo->keylength + nod_flag;
+
+    bmove(key->data, keypos, copy_length);
+    key->ref_length=  keyinfo->share->rec_reflength;
+    key->data_length= keyinfo->keylength - key->ref_length;
+    key->flag= 0;
+    DBUG_RETURN(keypos + copy_length);
+  }
+
+  /*
+    Variable length keys, or keys that may carry a transid: unpack every
+    key from the start of the page until we have passed keypos.
+  */
+  page= ma_page->buff + keyinfo->share->keypage_header + nod_flag;
+  key->data[0]= 0;                              /* safety */
+  while (page <= keypos)
+  {
+    if ((*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+      continue;                                 /* Key unpacked ok */
+
+    maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(0);
+  }
+  DBUG_PRINT("exit",("page: 0x%lx  length: %u", (long) page,
+                     key->data_length + key->ref_length));
+  DBUG_RETURN(page);
+} /* _ma_get_key */
+
+
+/*
+  @brief Get the key that is stored before 'keypos' on a key page
+
+  @param key      Key to fill in; key->keyinfo selects the key definition
+                  and key->data receives the key
+  @param ma_page  Key page that contains 'keypos'
+  @param keypos   Position on the page. When keys have to be unpacked it
+                  must not be the position of the first key on the page
+                  (asserted in debug builds)
+
+  @return
+  @retval 0  ok
+  @retval 1  error; my_errno is set to HA_ERR_CRASHED
+*/
+
+static my_bool _ma_get_prev_key(MARIA_KEY *key, MARIA_PAGE *ma_page,
+                                uchar *keypos)
+{
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uint page_flag= ma_page->flag;
+  uint nod_flag=  ma_page->node;
+  uchar *page;
+  DBUG_ENTER("_ma_get_prev_key");
+
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+      !(page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    /* Fixed size keys: step back over one key and one node pointer */
+    uchar *prev_key= keypos - keyinfo->keylength - nod_flag;
+
+    bmove(key->data, prev_key, keyinfo->keylength);
+    key->ref_length=  keyinfo->share->rec_reflength;
+    key->data_length= keyinfo->keylength - key->ref_length;
+    key->flag= 0;
+    DBUG_RETURN(0);
+  }
+
+  /* Unpack keys from the start of the page until we reach keypos */
+  page= ma_page->buff + keyinfo->share->keypage_header + nod_flag;
+  key->data[0]= 0;                              /* safety */
+  DBUG_ASSERT(page != keypos);
+  while (page < keypos)
+  {
+    if ((*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+      continue;                                 /* Key unpacked ok */
+
+    maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(1);
+  }
+  DBUG_RETURN(0);
+} /* _ma_get_prev_key */
+
+
+/*
+  @brief Get last key from key-page before 'endpos'
+
+  @param key      Key to fill in; key->keyinfo selects the key definition
+                  and key->data receives the key
+  @param ma_page  Key page to read from
+  @param endpos   Position to stop at
+
+  @note
+  endpos may be either end of buffer or start of a key
+
+  @return
+  @retval pointer to where key starts
+  @retval 0  A key could not be unpacked; my_errno is set to HA_ERR_CRASHED
+*/
+
+uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *ma_page, uchar *endpos)
+{
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  uint page_flag, nod_flag;
+  uchar *page, *lastpos;
+  DBUG_ENTER("_ma_get_last_key");
+  DBUG_PRINT("enter",("page: 0x%lx  endpos: 0x%lx", (long) ma_page->buff,
+                      (long) endpos));
+
+  page_flag= ma_page->flag;
+  nod_flag=  ma_page->node;
+  page= ma_page->buff + keyinfo->share->keypage_header + nod_flag;
+
+  if ((keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) ||
+      (page_flag & KEYPAGE_FLAG_HAS_TRANSID))
+  {
+    /*
+      Keys have to be unpacked one by one from the start of the page;
+      remember where the most recently unpacked key started.
+    */
+    key->data[0]=0;                             /* safety */
+    for (lastpos= page; page < endpos; )
+    {
+      lastpos= page;
+      if ((*keyinfo->get_key)(key, page_flag, nod_flag, &page))
+        continue;                               /* Key unpacked ok */
+
+      DBUG_PRINT("error",("Couldn't find last key:  page: 0x%lx",
+                          (long) page));
+      maria_print_error(keyinfo->share, HA_ERR_CRASHED);
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);
+    }
+  }
+  else
+  {
+    /* Fixed size keys: the last key ends exactly at endpos */
+    lastpos= endpos - keyinfo->keylength - nod_flag;
+    key->ref_length=  keyinfo->share->rec_reflength;
+    key->data_length= keyinfo->keylength - key->ref_length;
+    key->flag= 0;
+    if (lastpos >= page)
+      bmove(key->data, lastpos, keyinfo->keylength + nod_flag);
+  }
+  DBUG_PRINT("exit",("lastpos: 0x%lx  length: %u", (ulong) lastpos,
+                     key->data_length + key->ref_length));
+  DBUG_RETURN(lastpos);
+} /* _ma_get_last_key */
+
+
+/**
+   Calculate length of unpacked key
+
+   @param keyinfo      key handler
+   @param key          data for key
+
+   @notes
+     This function is very seldom used.  It's mainly used for debugging
+     or when calculating a key length from a stored key in batch insert.
+
+     This function does *NOT* calculate length of transid size!
+     This function can't be used against a prefix packed key on a page
+
+     For keys that are neither variable length nor binary packed,
+     keyinfo->keylength is returned directly. Otherwise the key segments
+     are walked and the 'length' of the terminating segment (the one with
+     type 0) is added to the bytes used by the key parts.
+
+   @return
+   @retval total length for key
+*/
+
+uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key)
+{
+  reg1 HA_KEYSEG *keyseg;
+  const uchar *key_start= key;
+
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)))
+    return (keyinfo->keylength);
+
+  keyseg= keyinfo->seg;
+  while (keyseg->type)
+  {
+    /*
+      A nullable part starts with a marker byte; if the marker is 0 the
+      part has no more data stored in the key.
+    */
+    if (!(keyseg->flag & HA_NULL_PART) || *key++)
+    {
+      if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+      {
+        uint part_length;
+        get_key_length(part_length,key);
+        key+= part_length;
+      }
+      else
+        key+= keyseg->length;
+    }
+    keyseg++;
+  }
+  return((uint) (key-key_start)+keyseg->length);
+} /* _ma_keylength */
+
+
+/*
+  Calculate length of part key.
+
+  Used in maria_rkey() to find the key found for the key-part that was used.
+  This is needed in case of multi-byte character sets where we may search
+  after '0xDF' but find 'ss'
+
+  The key segments from keyinfo->seg up to, but not including, 'end' are
+  walked and the number of bytes they use in 'key' is returned.
+*/
+
+uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, register const uchar *key,
+                        HA_KEYSEG *end)
+{
+  reg1 HA_KEYSEG *keyseg= keyinfo->seg;
+  const uchar *key_start= key;
+
+  while (keyseg != end)
+  {
+    /*
+      A nullable part starts with a marker byte; if the marker is 0 the
+      part has no more data stored in the key.
+    */
+    if (!(keyseg->flag & HA_NULL_PART) || *key++)
+    {
+      if (keyseg->flag & (HA_SPACE_PACK | HA_BLOB_PART | HA_VAR_LENGTH_PART))
+      {
+        uint part_length;
+        get_key_length(part_length,key);
+        key+= part_length;
+      }
+      else
+        key+= keyseg->length;
+    }
+    keyseg++;
+  }
+  return (uint) (key-key_start);
+}
+
+
+/*
+  Find next/previous record with same key
+
+  SYNOPSIS
+    _ma_search_next()
+    info      Maria handler; the position of the last read key is taken
+              from info->int_keypos and the page from info->keyread_buff
+    key       Key to continue from
+    nextflag  Search flags. If SEARCH_BIGGER is set the next key is read,
+              otherwise the previous key
+    pos       Page address given to _ma_search() when a full search from
+              that page is needed
+
+  WARNING
+    This can't be used when database is touched after last read
+
+  RETURN
+    0     Key found. info->last_key, info->cur_row.lastpos and
+          info->cur_row.trid are updated
+    -1    Error when reading the key page or unpacking a key
+    other The value returned by _ma_search() when the search had to be
+          handed over to it
+*/
+
+int _ma_search_next(register MARIA_HA *info, MARIA_KEY *key,
+                    uint32 nextflag, my_off_t pos)
+{
+  int error;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_KEY tmp_key;
+  MARIA_PAGE page;
+  DBUG_ENTER("_ma_search_next");
+  DBUG_PRINT("enter",("nextflag: %u  lastpos: %lu  int_keypos: 0x%lx  page_changed %d  keyread_buff_used: %d",
+                      nextflag, (ulong) info->cur_row.lastpos,
+                      (ulong) info->int_keypos,
+                      info->page_changed, info->keyread_buff_used));
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+  /*
+    Force full read if we are at last key or if we are not on a leaf
+    and the key tree has changed since we used it last time
+    Note that even if the key tree has changed since last read, we can use
+    the last read data from the leaf if we haven't used the buffer for
+    something else.
+  */
+
+  if (((nextflag & SEARCH_BIGGER) && info->int_keypos >= info->int_maxpos) ||
+      info->page_changed ||
+      (info->int_keytree_version != keyinfo->version &&
+       (info->int_nod_flag || info->keyread_buff_used)))
+    DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+                           pos));
+
+  if (info->keyread_buff_used)
+  {
+    /* The buffer was used for something else; read the page again */
+    if (_ma_fetch_keypage(&page, info, keyinfo, info->last_search_keypage,
+                          PAGECACHE_LOCK_LEFT_UNLOCKED,
+                          DFLT_INIT_HITS, info->keyread_buff, 0))
+      DBUG_RETURN(-1);
+    info->keyread_buff_used=0;
+  }
+  else
+  {
+    /* Last used buffer is in info->keyread_buff */
+    /* Todo:  Add info->keyread_page to keep track of this */
+    _ma_page_setup(&page, info, keyinfo, 0, info->keyread_buff);
+  }
+
+  /* tmp_key unpacks into the local buffer so that info->last_key is kept */
+  tmp_key.data=   lastkey;
+  info->last_key.keyinfo= tmp_key.keyinfo= keyinfo;
+
+  if (nextflag & SEARCH_BIGGER)                                 /* Next key */
+  {
+    if (page.node)
+    {
+      /* Not a leaf: continue the search in the page int_keypos refers to */
+      my_off_t tmp_pos= _ma_kpos(page.node, info->int_keypos);
+
+      if ((error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+                             tmp_pos)) <=0)
+        DBUG_RETURN(error);
+    }
+    /*
+      For packed keys get_key() is given the previous key in the buffer it
+      unpacks into, so copy 'key' to last_key if it isn't there already.
+    */
+    if (keyinfo->flag & (HA_PACK_KEY | HA_BINARY_PACK_KEY) &&
+        info->last_key.data != key->data)
+      memcpy(info->last_key.data, key->data,
+             key->data_length + key->ref_length);
+    if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node,
+                             &info->int_keypos))
+      DBUG_RETURN(-1);
+  }
+  else                                                  /* Previous key */
+  {
+    /* Find start of previous key */
+    info->int_keypos= _ma_get_last_key(&tmp_key, &page, info->int_keypos);
+    if (!info->int_keypos)
+      DBUG_RETURN(-1);
+    if (info->int_keypos == info->keyread_buff + info->s->keypage_header)
+    {
+      /* Previous key was first key, read key before this one */
+      DBUG_RETURN(_ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+                             pos));
+    }
+    if (page.node &&
+        (error= _ma_search(info, key, nextflag | SEARCH_SAVE_BUFF,
+                           _ma_kpos(page.node,info->int_keypos))) <= 0)
+      DBUG_RETURN(error);
+
+    /* QQ: We should be able to optimize away the following call */
+    if (! _ma_get_last_key(&info->last_key, &page, info->int_keypos))
+      DBUG_RETURN(-1);
+  }
+  /* Publish row position and transaction id of the key that was found */
+  info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+  info->cur_row.trid=    _ma_trid_from_key(&info->last_key);
+  DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+  DBUG_RETURN(0);
+} /* _ma_search_next */
+
+
+/**
+  Search after position for the first row in an index
+
+  @param info     Maria handler
+  @param keyinfo  Key definition of the index
+  @param pos      Address of the key page to start from
+
+  @return
+  Found row is stored in info->cur_row.lastpos
+  @retval 0   ok
+  @retval -1  pos was HA_OFFSET_ERROR (my_errno is set to
+              HA_ERR_KEY_NOT_FOUND), a key page could not be fetched or
+              the first key could not be unpacked
+*/
+
+int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                     my_off_t pos)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_PAGE page;
+  uchar *first_pos;
+  DBUG_ENTER("_ma_search_first");
+
+  if (pos == HA_OFFSET_ERROR)
+  {
+    my_errno=HA_ERR_KEY_NOT_FOUND;
+    info->cur_row.lastpos= HA_OFFSET_ERROR;
+    DBUG_RETURN(-1);
+  }
+
+  /*
+    Follow the pointer stored before the first key of each page until
+    _ma_kpos() reports that there is no page to continue to.
+  */
+  for (;;)
+  {
+    if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+                          PAGECACHE_LOCK_LEFT_UNLOCKED,
+                          DFLT_INIT_HITS, info->keyread_buff, 0))
+    {
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+      DBUG_RETURN(-1);
+    }
+    first_pos= page.buff + share->keypage_header + page.node;
+    pos= _ma_kpos(page.node, first_pos);
+    if (pos == HA_OFFSET_ERROR)
+      break;
+  }
+
+  info->last_key.keyinfo= keyinfo;
+
+  if (!(*keyinfo->get_key)(&info->last_key, page.flag, page.node, &first_pos))
+    DBUG_RETURN(-1);                            /* Crashed */
+
+  /* Remember where we are on the page for following next/prev reads */
+  info->int_keypos=   first_pos;
+  info->int_maxpos=   (page.buff + page.size -1);
+  info->int_nod_flag= page.node;
+  info->int_keytree_version= keyinfo->version;
+  info->last_search_keypage= info->last_keypage;
+  info->keyread_buff_used= 0;
+  info->page_changed= 0;
+  info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+  info->cur_row.trid=    _ma_trid_from_key(&info->last_key);
+
+  DBUG_PRINT("exit",("found key at %lu", (ulong) info->cur_row.lastpos));
+  DBUG_RETURN(0);
+} /* _ma_search_first */
+
+
+/**
+  Search after position for the last row in an index
+
+  @param info     Maria handler
+  @param keyinfo  Key definition of the index
+  @param pos      Address of the key page to start from
+
+  @return
+  Found row is stored in info->cur_row.lastpos
+  @retval 0   ok
+  @retval -1  pos was HA_OFFSET_ERROR (my_errno is set to
+              HA_ERR_KEY_NOT_FOUND), a key page could not be fetched or
+              the last key could not be read
+*/
+
+int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                    my_off_t pos)
+{
+  MARIA_PAGE page;
+  uchar *end_of_page;
+  DBUG_ENTER("_ma_search_last");
+
+  if (pos == HA_OFFSET_ERROR)
+  {
+    my_errno=HA_ERR_KEY_NOT_FOUND;                      /* Didn't find key */
+    info->cur_row.lastpos= HA_OFFSET_ERROR;
+    DBUG_RETURN(-1);
+  }
+
+  /*
+    Follow the pointer stored at the end of each page until _ma_kpos()
+    reports that there is no page to continue to.
+  */
+  for (;;)
+  {
+    if (_ma_fetch_keypage(&page, info, keyinfo, pos,
+                          PAGECACHE_LOCK_LEFT_UNLOCKED,
+                          DFLT_INIT_HITS, info->keyread_buff, 0))
+    {
+      info->cur_row.lastpos= HA_OFFSET_ERROR;
+      DBUG_RETURN(-1);
+    }
+    end_of_page= page.buff + page.size;
+    pos= _ma_kpos(page.node, end_of_page);
+    if (pos == HA_OFFSET_ERROR)
+      break;
+  }
+
+  info->last_key.keyinfo= keyinfo;
+
+  if (!_ma_get_last_key(&info->last_key, &page, end_of_page))
+    DBUG_RETURN(-1);
+
+  /* Publish the found row and remember where we are on the page */
+  info->cur_row.lastpos= _ma_row_pos_from_key(&info->last_key);
+  info->cur_row.trid=    _ma_trid_from_key(&info->last_key);
+  info->int_maxpos=      end_of_page;
+  info->int_keypos=      end_of_page;
+  info->int_nod_flag=    page.node;
+  info->int_keytree_version= keyinfo->version;
+  info->last_search_keypage= info->last_keypage;
+  info->keyread_buff_used= 0;
+  info->page_changed= 0;
+
+  DBUG_PRINT("exit",("found key at %lu",(ulong) info->cur_row.lastpos));
+  DBUG_RETURN(0);
+} /* _ma_search_last */
+
+
+
+/****************************************************************************
+**
+** Functions to store and pack a key in a page
+**
+** The _ma_calc_xxx_key_length functions take the following arguments:
+**  nod_flag    If nod: Length of nod-pointer
+**  next_key    Position of the key that follows the new key in buffer
+**  org_key     Key that was before the next key in buffer
+**  prev_key    Last key before current key
+**  key         Key that will be stored
+**  s_temp      Information how next key will be packed
+****************************************************************************/
+
+/*
+  Static length key
+
+  The key is stored as is: s_temp->key is set to point at the key data and
+  the length is data_length + ref_length + nod_flag.
+  The length is saved in s_temp->move_length and returned.
+*/
+
+int
+_ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag,
+                           uchar *next_pos  __attribute__((unused)),
+                           uchar *org_key  __attribute__((unused)),
+                           uchar *prev_key __attribute__((unused)),
+                           MARIA_KEY_PARAM *s_temp)
+{
+  s_temp->key= key->data;
+  s_temp->move_length= key->data_length + key->ref_length + nod_flag;
+  return (int) s_temp->move_length;
+}
+
+/*
+  Variable length key
+
+  The key is stored as is: s_temp->key is set to point at the key data and
+  the length is data_length + ref_length + nod_flag.
+  The length is saved in s_temp->move_length and returned.
+*/
+
+int
+_ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag,
+                        uchar *next_pos  __attribute__((unused)),
+                        uchar *org_key  __attribute__((unused)),
+                        uchar *prev_key __attribute__((unused)),
+                        MARIA_KEY_PARAM *s_temp)
+{
+  s_temp->key= key->data;
+  s_temp->move_length= key->data_length + key->ref_length + nod_flag;
+  return (int) s_temp->move_length;
+}
+
+/**
+   @brief Calc length needed to store prefixed compressed keys
+
+  @param int_key   Key that will be stored
+  @param nod_flag  If nod: Length of nod-pointer
+  @param next_key  Position of the key after the new key in buffer,
+                   or 0 if there is none
+  @param org_key   Key that was before the next key in buffer
+  @param prev_key  Last key before current key, or 0 if there is none
+  @param s_temp    Filled in with information on how this key, and the
+                   next key if it has to be changed, will be packed
+
+  @info
+    Variable length first segment which is prefix compressed
+    (maria_chk reports 'packed + stripped')
+
+    Keys are compressed the following way:
+
+    If the max length of first key segment <= 127 bytes the prefix is
+    1 uchar else it's 2 byte
+
+    prefix byte(s) The high bit is set if this is a prefix for the prev key
+    length         Packed length if the previous was a prefix byte
+    [data_length]  data bytes ('length' bytes)
+    next-key-seg   Next key segments
+
+    If the first segment can have NULL:
+       If key was packed
+         data_length is length of rest of key
+       If key was not packed
+         The data_length is 0 for NULLS and 1+data_length for not null columns
+
+  @return The calculated length, which is also saved in s_temp->move_length
+*/
+
+int
+_ma_calc_var_pack_key_length(const MARIA_KEY *int_key, uint nod_flag,
+                             uchar *next_key, uchar *org_key, uchar *prev_key,
+                             MARIA_KEY_PARAM *s_temp)
+{
+  reg1 HA_KEYSEG *keyseg;
+  int length;
+  uint key_length,ref_length,org_key_length=0,
+       length_pack,new_key_length,diff_flag,pack_marker;
+  const uchar *key, *start, *end, *key_end;
+  const uchar *sort_order;
+  my_bool same_length;
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+
+  key= int_key->data;
+  length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0;
+  same_length=0; keyseg=keyinfo->seg;
+  key_length= int_key->data_length + int_key->ref_length + nod_flag;
+
+  /*
+    For fulltext keys on a text segment whose charset doesn't use strnxfrm,
+    bytes are compared through the charset's sort_order table below
+  */
+  sort_order=0;
+  if ((keyinfo->flag & HA_FULLTEXT) &&
+      ((keyseg->type == HA_KEYTYPE_TEXT) ||
+       (keyseg->type == HA_KEYTYPE_VARTEXT1) ||
+       (keyseg->type == HA_KEYTYPE_VARTEXT2)) &&
+      !use_strnxfrm(keyseg->charset))
+    sort_order= keyseg->charset->sort_order;
+
+  /* diff flag contains how many bytes is needed to pack key */
+  if (keyseg->length >= 127)
+  {
+    diff_flag=2;
+    pack_marker=32768;
+  }
+  else
+  {
+    diff_flag= 1;
+    pack_marker=128;
+  }
+  s_temp->pack_marker=pack_marker;
+
+  /* Handle the case that the first part have NULL values */
+  if (keyseg->flag & HA_NULL_PART)
+  {
+    if (!*key++)
+    {
+      /* The first key part is NULL; the key is stored without packing */
+      s_temp->key= key;
+      s_temp->key_length= 0;
+      s_temp->totlength= key_length-1+diff_flag;
+      s_temp->next_key_pos= 0;                   /* No next key */
+      return (s_temp->move_length= s_temp->totlength);
+    }
+    s_temp->store_not_null=1;
+    key_length--;                               /* We don't store NULL */
+    if (prev_key && !*prev_key++)
+      org_key=prev_key=0;                       /* Can't pack against prev */
+    else if (org_key)
+      org_key++;                                /* Skip NULL */
+  }
+  else
+    s_temp->store_not_null=0;
+  s_temp->prev_key= org_key;
+
+  /* The key part will start with a packed length */
+
+  get_key_pack_length(new_key_length,length_pack,key);
+  end= key_end= key+ new_key_length;
+  start= key;
+
+  /* Calc how many characters are identical between this and the prev. key */
+  if (prev_key)
+  {
+    get_key_length(org_key_length,prev_key);
+    s_temp->prev_key=prev_key;          /* Pointer at data */
+    /* Don't use key-pack if length == 0 */
+    if (new_key_length && new_key_length == org_key_length)
+      same_length=1;
+    else if (new_key_length > org_key_length)
+      end= key + org_key_length;
+
+    if (sort_order)                             /* SerG */
+    {
+      while (key < end &&
+             sort_order[*key] == sort_order[*prev_key])
+      {
+        key++; prev_key++;
+      }
+    }
+    else
+    {
+      while (key < end && *key == *prev_key)
+      {
+        key++; prev_key++;
+      }
+    }
+  }
+
+  /* 'key' now points at the first byte that differs from the prev. key */
+  s_temp->key=key;
+  s_temp->key_length= (uint) (key_end-key);
+
+  if (same_length && key == key_end)
+  {
+    /* identical variable length key */
+    s_temp->ref_length= pack_marker;
+    length=(int) key_length-(int) (key_end-start)-length_pack;
+    length+= diff_flag;
+    if (next_key)
+    {                                           /* Can't combine with next */
+      s_temp->n_length= *next_key;              /* Needed by _ma_store_key */
+      next_key=0;
+    }
+  }
+  else
+  {
+    if (start != key)
+    {                                           /* Starts as prev key */
+      ref_length= (uint) (key-start);
+      s_temp->ref_length= ref_length + pack_marker;
+      length= (int) (key_length - ref_length);
+
+      length-= length_pack;
+      length+= diff_flag;
+      length+= ((new_key_length-ref_length) >= 255) ? 3 : 1;/* Rest_of_key */
+    }
+    else
+    {
+      s_temp->key_length+=s_temp->store_not_null;       /* If null */
+      length= key_length - length_pack+ diff_flag;
+    }
+  }
+  s_temp->totlength=(uint) length;
+  s_temp->prev_length=0;
+  DBUG_PRINT("test",("tot_length: %u  length: %d  uniq_key_length: %u",
+                     key_length, length, s_temp->key_length));
+
+        /* If something after that hasn't length=0, test if we can combine */
+  if ((s_temp->next_key_pos=next_key))
+  {
+    uint packed,n_length;
+
+    /* Read the pack header of the next key */
+    packed = *next_key & 128;
+    if (diff_flag == 2)
+    {
+      n_length= mi_uint2korr(next_key) & 32767; /* Length of next key */
+      next_key+=2;
+    }
+    else
+      n_length= *next_key++ & 127;
+    if (!packed)
+      n_length-= s_temp->store_not_null;
+
+    if (n_length || packed)             /* Don't pack 0 length keys */
+    {
+      uint next_length_pack, new_ref_length=s_temp->ref_length;
+
+      if (packed)
+      {
+        /* If first key and next key is packed (only on delete) */
+        if (!prev_key && org_key)
+        {
+          get_key_length(org_key_length,org_key);
+          key=start;
+          if (sort_order)                       /* SerG */
+          {
+            while (key < end &&
+                   sort_order[*key] == sort_order[*org_key])
+            {
+              key++; org_key++;
+            }
+          }
+          else
+          {
+            while (key < end && *key == *org_key)
+            {
+              key++; org_key++;
+            }
+          }
+          if ((new_ref_length= (uint) (key - start)))
+            new_ref_length+=pack_marker;
+        }
+
+        if (!n_length)
+        {
+          /*
+            We put a different key between two identical variable length keys
+            Extend next key to have same prefix as this key
+          */
+          if (new_ref_length)                   /* prefix of previous key */
+          {                                     /* make next key longer */
+            s_temp->part_of_prev_key= new_ref_length;
+            s_temp->prev_length=          org_key_length -
+              (new_ref_length-pack_marker);
+            s_temp->n_ref_length= s_temp->part_of_prev_key;
+            s_temp->n_length= s_temp->prev_length;
+            n_length=             get_pack_length(s_temp->prev_length);
+            s_temp->prev_key+=    (new_ref_length - pack_marker);
+            length+=              s_temp->prev_length + n_length;
+          }
+          else
+          {                                     /* Can't use prev key */
+            s_temp->part_of_prev_key=0;
+            s_temp->prev_length= org_key_length;
+            s_temp->n_ref_length=s_temp->n_length=  org_key_length;
+            length+=           org_key_length;
+          }
+          return (s_temp->move_length= (int) length);
+        }
+
+        ref_length=n_length;
+        /* Get information about not packed key suffix */
+        get_key_pack_length(n_length,next_length_pack,next_key);
+
+        /* Test if new keys has fewer characters that match the previous key */
+        if (!new_ref_length)
+        {                                       /* Can't use prev key */
+          s_temp->part_of_prev_key=     0;
+          s_temp->prev_length=          ref_length;
+          s_temp->n_ref_length= s_temp->n_length= n_length+ref_length;
+          return s_temp->move_length= ((int) length+ref_length-
+                                       next_length_pack);
+        }
+        if (ref_length+pack_marker > new_ref_length)
+        {
+          uint new_pack_length=new_ref_length-pack_marker;
+          /* We must copy characters from the original key to the next key */
+          s_temp->part_of_prev_key= new_ref_length;
+          s_temp->prev_length=      ref_length - new_pack_length;
+          s_temp->n_ref_length=s_temp->n_length=n_length + s_temp->prev_length;
+          s_temp->prev_key+=        new_pack_length;
+          length-= (next_length_pack - get_pack_length(s_temp->n_length));
+          return s_temp->move_length= ((int) length + s_temp->prev_length);
+        }
+      }
+      else
+      {
+        /* Next key wasn't a prefix of previous key */
+        ref_length=0;
+        next_length_pack=0;
+     }
+      DBUG_PRINT("test",("length: %d  next_key: 0x%lx", length,
+                         (long) next_key));
+
+      /* Count how many more bytes of the next key are equal to the new key */
+      {
+        uint tmp_length;
+        key=(start+=ref_length);
+        if (key+n_length < key_end)             /* Normalize length based */
+          key_end= key+n_length;
+        if (sort_order)                         /* SerG */
+        {
+          while (key < key_end &&
+                 sort_order[*key] == sort_order[*next_key])
+          {
+            key++; next_key++;
+          }
+        }
+        else
+        {
+          while (key < key_end && *key == *next_key)
+          {
+            key++; next_key++;
+          }
+        }
+        if (!(tmp_length=(uint) (key-start)))
+        {                                       /* Key can't be re-packed */
+          s_temp->next_key_pos=0;
+          return (s_temp->move_length= length);
+        }
+        ref_length+=tmp_length;
+        n_length-=tmp_length;
+        length-=tmp_length+next_length_pack;    /* We gained these chars */
+      }
+      if (n_length == 0 && ref_length == new_key_length)
+      {
+        s_temp->n_ref_length=pack_marker;       /* Same as prev key */
+      }
+      else
+      {
+        s_temp->n_ref_length=ref_length | pack_marker;
+        length+= get_pack_length(n_length);
+        s_temp->n_length=n_length;
+      }
+    }
+  }
+  return (s_temp->move_length= length);
+}
+
+
+/**
+  Length of key which is prefix compressed
+
+  @param int_key   Key that will be stored
+  @param nod_flag  If nod: Length of nod-pointer
+  @param next_key  Position of the key after the new key in buffer,
+                   or 0 if there is none
+  @param org_key   Key that was before the next key in buffer
+  @param prev_key  Last key before current key, or 0 if the new key is
+                   the first key in the block
+  @param s_temp    Filled in with information on how this key, and the
+                   next key if it has to be changed, will be packed
+
+  @return The calculated length, which is also saved in s_temp->move_length
+*/
+
+int _ma_calc_bin_pack_key_length(const MARIA_KEY *int_key,
+                                 uint nod_flag,
+                                 uchar *next_key,
+                                 uchar *org_key, uchar *prev_key,
+                                 MARIA_KEY_PARAM *s_temp)
+{
+  uint length,key_length,ref_length;
+  const uchar *key= int_key->data;
+
+  s_temp->totlength= key_length= (int_key->data_length + int_key->ref_length+
+                                  nod_flag);
+#ifdef HAVE_valgrind
+  s_temp->n_length= s_temp->n_ref_length=0;	/* For valgrind */
+#endif
+  s_temp->key=key;
+  s_temp->prev_key=org_key;
+  if (prev_key)                                 /* If not first key in block */
+  {
+    /* pack key against previous key */
+    /*
+      As keys may be identical when running a sort in maria_chk, we
+      have to guard against the case where keys may be identical
+    */
+    const uchar *end;
+    end=key+key_length;
+    /* Test the bound first so that the byte at 'end' is never read */
+    for ( ; key < end && *key == *prev_key; key++,prev_key++) ;
+    s_temp->ref_length= ref_length=(uint) (key-s_temp->key);
+    length=key_length - ref_length + get_pack_length(ref_length);
+  }
+  else
+  {
+    /* No previous key */
+    s_temp->ref_length=ref_length=0;
+    length=key_length+1;
+  }
+  if ((s_temp->next_key_pos=next_key))          /* If another key after */
+  {
+    /* pack key against next key */
+    uint next_length,next_length_pack;
+    get_key_pack_length(next_length,next_length_pack,next_key);
+
+    /* If first key and next key is packed (only on delete) */
+    if (!prev_key && org_key && next_length)
+    {
+      const uchar *end;
+      /* Test the bound first so that the byte at 'end' is never read */
+      for (key= s_temp->key, end=key+next_length ;
+           key < end && *key == *org_key;
+           key++,org_key++) ;
+      ref_length= (uint) (key - s_temp->key);
+    }
+
+    if (next_length > ref_length)
+    {
+      /*
+        We put a key with different case between two keys with the same prefix
+        Extend next key to have same prefix as this key
+      */
+      s_temp->n_ref_length= ref_length;
+      s_temp->prev_length=  next_length-ref_length;
+      s_temp->prev_key+=    ref_length;
+      return s_temp->move_length= ((int) (length+ s_temp->prev_length -
+                                          next_length_pack +
+                                          get_pack_length(ref_length)));
+    }
+    /* Check how many characters are identical to next key */
+    key= s_temp->key+next_length;
+    s_temp->prev_length= 0;
+    /*
+      NOTE(review): this scan has no explicit end; it presumably relies on
+      the new key and the next key differing somewhere - confirm.
+    */
+    while (*key++ == *next_key++) ;
+    if ((ref_length= (uint) (key - s_temp->key)-1) == next_length)
+    {
+      s_temp->next_key_pos=0;
+      return (s_temp->move_length= length);  /* Can't pack next key */
+    }
+    s_temp->n_ref_length=ref_length;
+    return s_temp->move_length= (int) (length-(ref_length - next_length) -
+                                       next_length_pack +
+                                       get_pack_length(ref_length));
+  }
+  return (s_temp->move_length= (int) length);
+}
+
+
+/*
+** store a key packed with _ma_calc_xxx_key_length in page-buffer
+*/
+
+/*
+  store key without compression
+
+  Copies s_temp->move_length bytes from s_temp->key to key_pos and
+  reports the same length in s_temp->changed_length.
+*/
+
+void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)),
+                          register uchar *key_pos,
+                          register MARIA_KEY_PARAM *s_temp)
+{
+  const size_t copy_length= (size_t) s_temp->move_length;
+
+  memcpy(key_pos, s_temp->key, copy_length);
+  s_temp->changed_length= s_temp->move_length;
+}
+
+
+/* store variable length key with prefix compression */
+
+#define store_pack_length(test,pos,length) { \
+  if (test) { *((pos)++) = (uchar) (length); } else \
+  { *((pos)++) = (uchar) ((length) >> 8); *((pos)++) = (uchar) (length);  } }
+
+
+/*
+  Store a variable length key with prefix compression, as described by
+  s_temp, at key_pos. If there is a following key (s_temp->next_key_pos)
+  its pack header is rewritten too.
+  The number of bytes written is saved in s_temp->changed_length.
+*/
+void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo  __attribute__((unused)),
+                            register uchar *key_pos,
+                            register MARIA_KEY_PARAM *s_temp)
+{
+  uchar *start_pos= key_pos;
+  uint rest_length;
+
+  /* Pack header of the new key */
+  if (!s_temp->ref_length)
+  {
+    /* Not packed against previous key */
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length);
+  }
+  else
+  {
+    /* Packed against previous key */
+    store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->ref_length);
+    /* If not same key after */
+    if (s_temp->ref_length != s_temp->pack_marker)
+    {
+      store_key_length_inc(key_pos,s_temp->key_length);
+    }
+  }
+
+  /* Rest of the key: what is left of totlength after the header */
+  rest_length= s_temp->totlength - (uint) (key_pos - start_pos);
+  bmove(key_pos, s_temp->key, rest_length);
+  key_pos+= rest_length;
+
+  if (s_temp->next_key_pos)                     /* There is a following key */
+  {
+    if (s_temp->prev_length)
+    {
+      /* Extend next key because new key didn't have same prefix as prev key */
+      if (s_temp->part_of_prev_key)
+      {
+        store_pack_length(s_temp->pack_marker == 128,key_pos,
+                          s_temp->part_of_prev_key);
+        store_key_length_inc(key_pos,s_temp->n_length);
+      }
+      else
+      {
+        s_temp->n_length+= s_temp->store_not_null;
+        store_pack_length(s_temp->pack_marker == 128,key_pos,
+                          s_temp->n_length);
+      }
+      memcpy(key_pos, s_temp->prev_key, s_temp->prev_length);
+      key_pos+= s_temp->prev_length;
+    }
+    else if (s_temp->n_ref_length)
+    {
+      store_pack_length(s_temp->pack_marker == 128,key_pos,
+                        s_temp->n_ref_length);
+      if (s_temp->n_ref_length != s_temp->pack_marker)
+      {
+        /* Not identical key */
+        store_key_length_inc(key_pos,s_temp->n_length);
+      }
+    }
+    else
+    {
+      s_temp->n_length+= s_temp->store_not_null;
+      store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length);
+    }
+  }
+
+  s_temp->changed_length= (uint) (key_pos - start_pos);
+}
+
+
+/*
+  variable length key with prefix compression
+
+  Stores s_temp->ref_length as a packed length followed by the part of the
+  key after the first ref_length bytes. If there is a following key
+  (s_temp->next_key_pos), its new reference length is stored too, followed
+  by s_temp->prev_length bytes from s_temp->prev_key when the next key has
+  to be extended.
+  The number of bytes written is saved in s_temp->changed_length.
+*/
+
+void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo  __attribute__((unused)),
+                            register uchar *key_pos,
+                            register MARIA_KEY_PARAM *s_temp)
+{
+  uchar *start_pos= key_pos;
+  const uchar *suffix= s_temp->key + s_temp->ref_length;
+  size_t suffix_length= s_temp->totlength - s_temp->ref_length;
+
+  store_key_length_inc(key_pos,s_temp->ref_length);
+  memcpy(key_pos, suffix, suffix_length);
+  key_pos+= suffix_length;
+
+  if (s_temp->next_key_pos)
+  {
+    store_key_length_inc(key_pos,s_temp->n_ref_length);
+    if (s_temp->prev_length)                    /* If we must extend key */
+    {
+      memcpy(key_pos,s_temp->prev_key,s_temp->prev_length);
+      key_pos+= s_temp->prev_length;
+    }
+  }
+  s_temp->changed_length= (uint) (key_pos - start_pos);
+}
diff --git a/storage/maria/ma_servicethread.c b/storage/maria/ma_servicethread.c
new file mode 100644
index 00000000000..a8099c998e9
--- /dev/null
+++ b/storage/maria/ma_servicethread.c
@@ -0,0 +1,134 @@
+#include "maria_def.h"
+#include "ma_servicethread.h"
+
+/**
+   Initializes the service thread control block
+
+   @param control        control block; LOCK_control and COND_control are
+                         used as given, so they must already point to storage
+
+   @return Operation status
+    @retval 0 OK
+    @retval 1 mutex or condition initialization failed
+*/
+int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control)
+{
+  int failed;
+  DBUG_ENTER("ma_service_thread_control_init");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  control->inited= TRUE;
+  control->status= THREAD_DEAD; /* no thread started yet, so it counts as dead */
+  failed= (pthread_mutex_init(control->LOCK_control, MY_MUTEX_INIT_SLOW) ||
+           pthread_cond_init(control->COND_control, 0));
+  DBUG_PRINT("info", ("init: %s", (failed ? "Error" : "OK")));
+  DBUG_RETURN(failed);
+}
+
+
+/**
+   Kill the service thread
+
+   @param control        control block
+
+   @note The service thread should react to the condition and to status
+   THREAD_DYING by setting status to THREAD_DEAD, signalling the control
+   thread via the condition and exiting. The usual way to do so is to use
+   my_service_thread_sleep() and my_service_thread_signal_end()
+*/
+
+void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control)
+{
+  DBUG_ENTER("ma_service_thread_control_end");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  DBUG_ASSERT(control->inited);
+  pthread_mutex_lock(control->LOCK_control);
+  if (control->status != THREAD_DEAD) /* thread was started OK */
+  {
+    DBUG_PRINT("info",("killing Maria background thread"));
+    control->status= THREAD_DYING; /* ask it to die */
+    /* status is THREAD_DYING here, so the loop body runs at least once */
+    while (control->status != THREAD_DEAD)
+    {
+      /* wake it up if it was in a sleep */
+      pthread_cond_broadcast(control->COND_control);
+      DBUG_PRINT("info",("waiting for Maria background thread to die"));
+      pthread_cond_wait(control->COND_control, control->LOCK_control);
+    }
+  }
+  pthread_mutex_unlock(control->LOCK_control);
+  pthread_mutex_destroy(control->LOCK_control);
+  pthread_cond_destroy(control->COND_control);
+  control->inited= FALSE;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+   Sleep for the given number of nanoseconds, waking up early on thread kill
+
+   @param control        control block
+   @param sleep_time     time to sleep, in nanoseconds; 0 means no wait
+
+   @return Operation status
+    @retval FALSE Sleep ended and no kill was requested
+    @retval TRUE  Thread should be killed (status is THREAD_DYING)
+*/
+
+my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control,
+                                ulonglong sleep_time)
+{
+  struct timespec abstime;
+  my_bool res= FALSE;
+  DBUG_ENTER("my_service_thread_sleep");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  pthread_mutex_lock(control->LOCK_control);
+  if (control->status == THREAD_DYING)
+  {
+    pthread_mutex_unlock(control->LOCK_control);
+    DBUG_RETURN(TRUE);
+  }
+#if 0 /* good for testing, to do a lot of checkpoints, finds a lot of bugs */
+  pthread_mutex_unlock(control->LOCK_control); /* already a pointer: no '&' */
+  my_sleep(100000); /* a tenth of a second */
+  pthread_mutex_lock(control->LOCK_control);
+#else
+  /* To have a killable sleep, we use timedwait like our SQL GET_LOCK() */
+  DBUG_PRINT("info", ("sleeping %llu nano seconds", sleep_time));
+  if (sleep_time)
+  {
+    set_timespec_nsec(abstime, sleep_time);
+    pthread_cond_timedwait(control->COND_control,
+                           control->LOCK_control, &abstime);
+  }
+#endif
+  if (control->status == THREAD_DYING)
+    res= TRUE;
+  pthread_mutex_unlock(control->LOCK_control);
+  DBUG_RETURN(res);
+}
+
+
+/**
+  Inform the control thread that the service thread is exiting
+
+  @param control        control block
+*/
+
+void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control)
+{
+  DBUG_ENTER("my_service_thread_signal_end");
+  DBUG_PRINT("init", ("control 0x%lx", (ulong) control));
+  pthread_mutex_lock(control->LOCK_control);
+  control->status = THREAD_DEAD; /* indicate that we are dead */
+  /*
+    wake up ma_service_thread_control_end which may be waiting for
+    our death
+  */
+  pthread_cond_broadcast(control->COND_control);
+  /*
+    the broadcast is done before the unlock because, once woken up,
+    ma_service_thread_control_end destroys the mutex and the condition
+  */
+  pthread_mutex_unlock(control->LOCK_control);
+  DBUG_VOID_RETURN;
+}
diff --git a/storage/maria/ma_servicethread.h b/storage/maria/ma_servicethread.h
new file mode 100644
index 00000000000..153ff9ebd14
--- /dev/null
+++ b/storage/maria/ma_servicethread.h
@@ -0,0 +1,22 @@
+#include <my_pthread.h>
+
+enum ma_service_thread_state {THREAD_RUNNING, THREAD_DYING, THREAD_DEAD};
+
+typedef struct st_ma_service_thread_control
+{
+  /** thread state; THREAD_DYING is the 'kill' request for the thread */
+  enum ma_service_thread_state status;
+  /** if thread module was inited or not */
+  my_bool inited;
+  /** held while status is read or changed by the functions in this module */
+  pthread_mutex_t *LOCK_control;
+  /** wakes up a sleeping service thread and reports its death */
+  pthread_cond_t *COND_control;
+} MA_SERVICE_THREAD_CONTROL;
+
+
+int ma_service_thread_control_init(MA_SERVICE_THREAD_CONTROL *control);
+void ma_service_thread_control_end(MA_SERVICE_THREAD_CONTROL *control);
+my_bool my_service_thread_sleep(MA_SERVICE_THREAD_CONTROL *control,
+                                ulonglong sleep_time);
+void my_service_thread_signal_end(MA_SERVICE_THREAD_CONTROL *control);
diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c
new file mode 100644
index 00000000000..f7f79f90cf0
--- /dev/null
+++ b/storage/maria/ma_sort.c
@@ -0,0 +1,1077 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Creates an index for a database by reading keys, sorting them and outputting
+  them in sorted order through MARIA_SORT_INFO functions.
+*/
+
+#include "ma_fulltext.h"
+#if defined(MSDOS) || defined(__WIN__)
+#include <fcntl.h>
+#else
+#include <stddef.h>
+#endif
+#include <queues.h>
+
+/* static variables */
+
+#undef MIN_SORT_MEMORY
+#undef MYF_RW
+#undef DISK_BUFFER_SIZE
+
+#define MERGEBUFF 15
+#define MERGEBUFF2 31
+#define MIN_SORT_MEMORY (4096-MALLOC_OVERHEAD)
+#define MYF_RW  MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL)
+#define DISK_BUFFER_SIZE (IO_SIZE*16)
+
+
+/*
+ Pointers of functions for store and read keys from temp file
+*/
+
+extern void print_error _VARARGS((const char *fmt,...));
+
+/* Functions defined in this file */
+
+static ha_rows find_all_keys(MARIA_SORT_PARAM *info,uint keys,
+                             uchar **sort_keys,
+                             DYNAMIC_ARRAY *buffpek,int *maxbuffer,
+                             IO_CACHE *tempfile,
+                             IO_CACHE *tempfile_for_exceptions);
+static int write_keys(MARIA_SORT_PARAM *info, uchar **sort_keys,
+                      uint count, BUFFPEK *buffpek,IO_CACHE *tempfile);
+static int write_key(MARIA_SORT_PARAM *info, uchar *key,
+                     IO_CACHE *tempfile);
+static int write_index(MARIA_SORT_PARAM *info, uchar **sort_keys,
+                       uint count);
+static int merge_many_buff(MARIA_SORT_PARAM *info,uint keys,
+                           uchar **sort_keys,
+                           BUFFPEK *buffpek,int *maxbuffer,
+                           IO_CACHE *t_file);
+static uint read_to_buffer(IO_CACHE *fromfile,BUFFPEK *buffpek,
+                           uint sort_length);
+static int merge_buffers(MARIA_SORT_PARAM *info,uint keys,
+                         IO_CACHE *from_file, IO_CACHE *to_file,
+                         uchar **sort_keys, BUFFPEK *lastbuff,
+                         BUFFPEK *Fb, BUFFPEK *Tb);
+static int merge_index(MARIA_SORT_PARAM *,uint, uchar **,BUFFPEK *, int,
+                       IO_CACHE *);
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info);
+
+static int write_keys_varlen(MARIA_SORT_PARAM *info, uchar **sort_keys,
+                             uint count, BUFFPEK *buffpek,
+                             IO_CACHE *tempfile);
+static uint read_to_buffer_varlen(IO_CACHE *fromfile,BUFFPEK *buffpek,
+                                  uint sort_length);
+static int write_merge_key(MARIA_SORT_PARAM *info, IO_CACHE *to_file,
+                           uchar *key, uint sort_length, uint count);
+static int write_merge_key_varlen(MARIA_SORT_PARAM *info,
+                                  IO_CACHE *to_file, uchar *key,
+                                  uint sort_length, uint count);
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs);
+
+/*
+  Creates an index of sorted keys
+
+  SYNOPSIS
+    _ma_create_index_by_sort()
+    info		Sort parameters
+    no_messages		Set to 1 if no output
+    sortbuff_size	Size of sortbuffer to allocate
+
+  RESULT
+    0	ok
+   -1	Error
+*/
+
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+                             size_t sortbuff_size)
+{
+  int error,maxbuffer,skr;
+  size_t memavl,old_memavl;
+  uint keys,sort_length;
+  DYNAMIC_ARRAY buffpek;
+  ha_rows records;
+  uchar **sort_keys;
+  IO_CACHE tempfile, tempfile_for_exceptions;
+  DBUG_ENTER("_ma_create_index_by_sort");
+  DBUG_PRINT("enter",("sort_buff_size: %lu  sort_length: %d  max_records: %lu",
+                      (ulong) sortbuff_size, info->key_length,
+                      (ulong) info->sort_info->max_records));
+
+  if (info->keyinfo->flag & HA_VAR_LENGTH_KEY)  /* pick temp-file routines */
+  {
+    info->write_keys= write_keys_varlen;
+    info->read_to_buffer=read_to_buffer_varlen;
+    info->write_key=write_merge_key_varlen;
+  }
+  else
+  {
+    info->write_keys= write_keys;
+    info->read_to_buffer=read_to_buffer;
+    info->write_key=write_merge_key;
+  }
+
+  my_b_clear(&tempfile);
+  my_b_clear(&tempfile_for_exceptions);
+  bzero((char*) &buffpek,sizeof(buffpek));
+  sort_keys= (uchar **) NULL; error= 1;         /* error is cleared at the end */
+  maxbuffer=1;
+
+  memavl=max(sortbuff_size,MIN_SORT_MEMORY);
+  records=	info->sort_info->max_records;
+  sort_length=	info->key_length;
+  LINT_INIT(keys);
+
+  while (memavl >= MIN_SORT_MEMORY)             /* retry with less memory */
+  {
+    if ((records < UINT_MAX32) &&
+       ((my_off_t) (records + 1) *
+        (sort_length + sizeof(char*)) <= (my_off_t) memavl))
+      keys= (uint)records+1;
+    else
+      do
+      {
+	skr=maxbuffer;
+	if (memavl < sizeof(BUFFPEK)*(uint) maxbuffer ||
+	    (keys=(memavl-sizeof(BUFFPEK)*(uint) maxbuffer)/
+             (sort_length+sizeof(char*))) <= 1 ||
+            keys < (uint) maxbuffer)
+	{
+	  _ma_check_print_error(info->sort_info->param,
+			       "aria_sort_buffer_size is too small");
+	  goto err;
+	}
+      }
+      while ((maxbuffer= (int) (records/(keys-1)+1)) != skr);
+
+    if ((sort_keys=(uchar**) my_malloc(keys*(sort_length+sizeof(char*))+
+                                      HA_FT_MAXBYTELEN, MYF(0))))
+    {
+      if (my_init_dynamic_array(&buffpek, sizeof(BUFFPEK), maxbuffer,
+			     maxbuffer/2))
+      {
+	my_free(sort_keys,MYF(0));
+        sort_keys= 0;
+      }
+      else
+	break;
+    }
+    old_memavl=memavl;                          /* shrink by a quarter */
+    if ((memavl=memavl/4*3) < MIN_SORT_MEMORY && old_memavl > MIN_SORT_MEMORY)
+      memavl=MIN_SORT_MEMORY;
+  }
+  if (memavl < MIN_SORT_MEMORY)
+  {
+    _ma_check_print_error(info->sort_info->param, "Aria sort buffer"
+                          " too small"); /* purecov: tested */
+    goto err; /* purecov: tested */
+  }
+  (*info->lock_in_memory)(info->sort_info->param);/* Everything is allocated */
+
+  if (!no_messages)                             /* keys is unsigned: use %u */
+    printf("  - Searching for keys, allocating buffer for %u keys\n",keys);
+
+  if ((records=find_all_keys(info,keys,sort_keys,&buffpek,&maxbuffer,
+                                  &tempfile,&tempfile_for_exceptions))
+      == HA_POS_ERROR)
+    goto err; /* purecov: tested */
+  if (maxbuffer == 0)                           /* nothing was flushed to disk */
+  {
+    if (!no_messages)
+      printf("  - Dumping %lu keys\n", (ulong) records);
+    if (write_index(info,sort_keys, (uint) records))
+      goto err; /* purecov: inspected */
+  }
+  else
+  {
+    keys=(keys*(sort_length+sizeof(char*)))/sort_length;
+    if (maxbuffer >= MERGEBUFF2)
+    {
+      if (!no_messages)
+	printf("  - Merging %lu keys\n", (ulong) records); /* purecov: tested */
+      if (merge_many_buff(info,keys,sort_keys,
+                  dynamic_element(&buffpek,0,BUFFPEK *),&maxbuffer,&tempfile))
+	goto err;				/* purecov: inspected */
+    }
+    if (flush_io_cache(&tempfile) ||
+	reinit_io_cache(&tempfile,READ_CACHE,0L,0,0))
+      goto err;					/* purecov: inspected */
+    if (!no_messages)
+      printf("  - Last merge and dumping keys\n"); /* purecov: tested */
+    if (merge_index(info,keys,sort_keys,dynamic_element(&buffpek,0,BUFFPEK *),
+                    maxbuffer,&tempfile))
+      goto err;					/* purecov: inspected */
+  }
+
+  if (flush_maria_ft_buf(info) || _ma_flush_pending_blocks(info))
+    goto err;
+
+  if (my_b_inited(&tempfile_for_exceptions))    /* keys too long to sort */
+  {
+    MARIA_HA *idx=info->sort_info->info;
+    uint16    key_length;
+    MARIA_KEY key;
+    key.keyinfo= idx->s->keyinfo + info->key;
+
+    if (!no_messages)
+      printf("  - Adding exceptions\n"); /* purecov: tested */
+    if (flush_io_cache(&tempfile_for_exceptions) ||
+	reinit_io_cache(&tempfile_for_exceptions,READ_CACHE,0L,0,0))
+      goto err;
+
+    while (!my_b_read(&tempfile_for_exceptions,(uchar*)&key_length,
+		      sizeof(key_length))
+        && !my_b_read(&tempfile_for_exceptions,(uchar*)sort_keys,
+		      (uint) key_length))
+    {
+      key.data=       (uchar*) sort_keys;
+      key.ref_length= idx->s->rec_reflength;
+      key.data_length= key_length - key.ref_length;
+      key.flag= 0;
+      if (_ma_ck_write(idx, &key))
+        goto err;
+    }
+  }
+
+  error =0;
+
+err:
+  my_free(sort_keys, MYF(MY_ALLOW_ZERO_PTR));
+  delete_dynamic(&buffpek);
+  close_cached_file(&tempfile);
+  close_cached_file(&tempfile_for_exceptions);
+
+  DBUG_RETURN(error ? -1 : 0);
+} /* _ma_create_index_by_sort */
+
+
+/* Read all keys with info->key_read; a full buffer is sorted and flushed to
+   tempfile. Returns the number of buffered keys, HA_POS_ERROR on failure. */
+static ha_rows find_all_keys(MARIA_SORT_PARAM *info, uint keys,
+                             uchar **sort_keys, DYNAMIC_ARRAY *buffpek,
+                             int *maxbuffer, IO_CACHE *tempfile,
+                             IO_CACHE *tempfile_for_exceptions)
+{
+  int error;
+  uint idx;
+  BUFFPEK *chunk;                       /* descriptor of the flushed run */
+  DBUG_ENTER("find_all_keys");
+
+  idx=error=0;
+  sort_keys[0]= (uchar*) (sort_keys+keys);
+
+  while (!(error=(*info->key_read)(info,sort_keys[idx])))
+  {
+    if (info->real_key_length > info->key_length)
+    {
+      if (write_key(info,sort_keys[idx],tempfile_for_exceptions))
+        DBUG_RETURN(HA_POS_ERROR);		/* purecov: inspected */
+      continue;
+    }
+
+    if (++idx == keys)
+    {
+      if (!(chunk= (BUFFPEK *) alloc_dynamic(buffpek)) || /* out of memory */
+          info->write_keys(info,sort_keys,idx-1,chunk,tempfile))
+        DBUG_RETURN(HA_POS_ERROR);		/* purecov: inspected */
+
+      sort_keys[0]=(uchar*) (sort_keys+keys);
+      memcpy(sort_keys[0],sort_keys[idx-1],(size_t) info->key_length);
+      idx=1;
+    }
+    sort_keys[idx]=sort_keys[idx-1]+info->key_length;
+  }
+  if (error > 0)
+    DBUG_RETURN(HA_POS_ERROR);		/* Aborted by get_key */ /* purecov: inspected */
+  if (buffpek->elements)
+  {
+    if (!(chunk= (BUFFPEK *) alloc_dynamic(buffpek)) ||
+        info->write_keys(info,sort_keys,idx,chunk,tempfile))
+      DBUG_RETURN(HA_POS_ERROR);		/* purecov: inspected */
+    *maxbuffer=buffpek->elements-1;
+  }
+  else
+    *maxbuffer=0;
+
+  DBUG_RETURN((*maxbuffer)*(keys-1)+idx);
+} /* find_all_keys */
+
+
+#ifdef THREAD
+/* Thread variant of find_all_keys(); it also allocates the sort buffer */
+
+pthread_handler_t _ma_thr_find_all_keys(void *arg)
+{
+  MARIA_SORT_PARAM *sort_param= (MARIA_SORT_PARAM*) arg;
+  int error;
+  size_t memavl,old_memavl;
+  uint sort_length;
+  ulong idx, maxbuffer, keys;
+  uchar **sort_keys=0;
+  BUFFPEK *chunk;                       /* descriptor of the flushed run */
+
+  LINT_INIT(keys);
+
+  error=1;
+
+  if (my_thread_init())
+    goto err;
+
+  { /* Add extra block since DBUG_ENTER declare variables */
+    DBUG_ENTER("_ma_thr_find_all_keys");
+    DBUG_PRINT("enter", ("master: %d", sort_param->master));
+    if (sort_param->sort_info->got_error)
+      goto err;
+
+    if (sort_param->keyinfo->flag & HA_VAR_LENGTH_KEY)
+    {
+      sort_param->write_keys=     write_keys_varlen;
+      sort_param->read_to_buffer= read_to_buffer_varlen;
+      sort_param->write_key=      write_merge_key_varlen;
+    }
+    else
+    {
+      sort_param->write_keys=     write_keys;
+      sort_param->read_to_buffer= read_to_buffer;
+      sort_param->write_key=      write_merge_key;
+    }
+
+    my_b_clear(&sort_param->tempfile);
+    my_b_clear(&sort_param->tempfile_for_exceptions);
+    bzero((char*) &sort_param->buffpek,sizeof(sort_param->buffpek));
+    bzero((char*) &sort_param->unique, sizeof(sort_param->unique));
+
+    memavl=       max(sort_param->sortbuff_size, MIN_SORT_MEMORY);
+    idx=          (uint)sort_param->sort_info->max_records;
+    sort_length=  sort_param->key_length;
+    maxbuffer=    1;
+
+    while (memavl >= MIN_SORT_MEMORY)           /* retry with less memory */
+    {
+      if ((my_off_t) (idx+1)*(sort_length+sizeof(char*)) <= (my_off_t) memavl)
+        keys= idx+1;
+      else
+      {
+        ulong skr;
+        do
+        {
+          skr= maxbuffer;
+          if (memavl < sizeof(BUFFPEK)*maxbuffer ||
+              (keys=(memavl-sizeof(BUFFPEK)*maxbuffer)/
+               (sort_length+sizeof(char*))) <= 1 ||
+              keys < maxbuffer)
+          {
+            _ma_check_print_error(sort_param->sort_info->param,
+                                  "aria_sort_buffer_size is too small");
+            goto err;
+          }
+        }
+        while ((maxbuffer= (int) (idx/(keys-1)+1)) != skr);
+      }
+      if ((sort_keys= (uchar **)
+           my_malloc(keys*(sort_length+sizeof(char*))+
+                     ((sort_param->keyinfo->flag & HA_FULLTEXT) ?
+                      HA_FT_MAXBYTELEN : 0), MYF(0))))
+      {
+        if (my_init_dynamic_array(&sort_param->buffpek, sizeof(BUFFPEK),
+                                  maxbuffer, maxbuffer/2))
+        {
+          my_free(sort_keys, MYF(0));
+          sort_keys= (uchar **) NULL;            /* for err: label */
+        }
+        else
+          break;
+      }
+      old_memavl= memavl;
+      if ((memavl= memavl/4*3) < MIN_SORT_MEMORY &&
+          old_memavl > MIN_SORT_MEMORY)
+        memavl= MIN_SORT_MEMORY;
+    }
+    if (memavl < MIN_SORT_MEMORY)
+    {
+      _ma_check_print_error(sort_param->sort_info->param,
+                            "Aria sort buffer too small");
+      goto err; /* purecov: tested */
+    }
+
+    if (sort_param->sort_info->param->testflag & T_VERBOSE)
+      printf("Key %d - Allocating buffer for %lu keys\n",
+             sort_param->key+1, (ulong) keys);
+    sort_param->sort_keys= sort_keys;
+
+    idx= error= 0;
+    sort_keys[0]= (uchar*) (sort_keys+keys);
+
+    DBUG_PRINT("info", ("reading keys"));
+    while (!(error= sort_param->sort_info->got_error) &&
+           !(error= (*sort_param->key_read)(sort_param, sort_keys[idx])))
+    {
+      if (sort_param->real_key_length > sort_param->key_length)
+      {
+        if (write_key(sort_param,sort_keys[idx],
+                      &sort_param->tempfile_for_exceptions))
+          goto err;
+        continue;
+      }
+
+      if (++idx == keys)
+      {
+        if (!(chunk= (BUFFPEK *) alloc_dynamic(&sort_param->buffpek)) ||
+            sort_param->write_keys(sort_param, sort_keys, idx - 1, chunk,
+                                   &sort_param->tempfile))
+          goto err;
+        sort_keys[0]= (uchar*) (sort_keys+keys);
+        memcpy(sort_keys[0], sort_keys[idx - 1],
+               (size_t) sort_param->key_length);
+        idx= 1;
+      }
+      sort_keys[idx]=sort_keys[idx - 1] + sort_param->key_length;
+    }
+    if (error > 0)
+      goto err;
+    if (sort_param->buffpek.elements)
+    {
+      /* Some keys are already in tempfile: flush the rest as a last run */
+      if (!(chunk= (BUFFPEK *) alloc_dynamic(&sort_param->buffpek)) ||
+          sort_param->write_keys(sort_param, sort_keys, idx, chunk,
+                                 &sort_param->tempfile))
+        goto err;
+      sort_param->keys= (sort_param->buffpek.elements - 1) * (keys - 1) + idx;
+    }
+    else
+      sort_param->keys= idx;
+
+    sort_param->sort_keys_length= keys;
+    goto ok;
+
+err:
+    DBUG_PRINT("error", ("got some error"));
+    sort_param->sort_info->got_error= 1; /* no need to protect with a mutex */
+    my_free(sort_keys,MYF(MY_ALLOW_ZERO_PTR));
+    sort_param->sort_keys=0;
+    delete_dynamic(& sort_param->buffpek);
+    close_cached_file(&sort_param->tempfile);
+    close_cached_file(&sort_param->tempfile_for_exceptions);
+
+ok:
+    free_root(&sort_param->wordroot, MYF(0));
+    /*
+      Detach from the share if the writer is involved. Avoid others to
+      be blocked. This includes a flush of the write buffer. This will
+      also indicate EOF to the readers.
+    */
+    if (sort_param->sort_info->info->rec_cache.share)
+      remove_io_thread(&sort_param->sort_info->info->rec_cache);
+
+    /* Readers detach from the share if any. Avoid others to be blocked. */
+    if (sort_param->read_cache.share)
+      remove_io_thread(&sort_param->read_cache);
+
+    pthread_mutex_lock(&sort_param->sort_info->mutex);
+    if (!--sort_param->sort_info->threads_running)
+      pthread_cond_signal(&sort_param->sort_info->cond);
+    pthread_mutex_unlock(&sort_param->sort_info->mutex);
+    DBUG_PRINT("exit", ("======== ending thread ========"));
+  }
+  my_thread_end();
+  return NULL;
+}
+
+/* Write the keys gathered per index by the sort threads; returns got_error */
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param)
+{
+  MARIA_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  ulong length, keys;
+  double *rec_per_key_part= param->new_rec_per_key_part;
+  int got_error=sort_info->got_error;
+  uint i;
+  MARIA_HA *info=sort_info->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_SORT_PARAM *sinfo;
+  uchar *mergebuf=0;
+  DBUG_ENTER("_ma_thr_write_keys");
+  LINT_INIT(length);
+
+  for (i= 0, sinfo= sort_param ;
+       i < sort_info->total_keys ;
+       i++, rec_per_key_part+=sinfo->keyinfo->keysegs, sinfo++)
+  {
+    if (!sinfo->sort_keys)              /* e.g. _ma_thr_find_all_keys() failed */
+    {
+      got_error=1;
+      my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+      continue;
+    }
+    if (!got_error)
+    {
+      maria_set_key_active(share->state.key_map, sinfo->key);
+
+      if (!sinfo->buffpek.elements)     /* nothing was flushed to tempfile */
+      {
+        if (param->testflag & T_VERBOSE)
+        {
+          printf("Key %d  - Dumping %u keys\n",sinfo->key+1, sinfo->keys);
+          fflush(stdout);
+        }
+        if (write_index(sinfo, sinfo->sort_keys, sinfo->keys) ||
+            flush_maria_ft_buf(sinfo) || _ma_flush_pending_blocks(sinfo))
+          got_error=1;
+      }
+      if (!got_error && param->testflag & T_STATISTICS)
+        maria_update_key_parts(sinfo->keyinfo, rec_per_key_part, sinfo->unique,
+                               param->stats_method ==
+                               MI_STATS_METHOD_IGNORE_NULLS ?
+                               sinfo->notnull : NULL,
+                               (ulonglong) share->state.state.records);
+    }
+    my_free(sinfo->sort_keys,MYF(0));
+    my_free(sinfo->rec_buff, MYF(MY_ALLOW_ZERO_PTR));
+    sinfo->sort_keys=0;
+  }
+
+  for (i= 0, sinfo= sort_param ;        /* cleanup below runs on 'continue' too */
+       i < sort_info->total_keys ;
+       i++,
+	 delete_dynamic(&sinfo->buffpek),
+	 close_cached_file(&sinfo->tempfile),
+	 close_cached_file(&sinfo->tempfile_for_exceptions),
+	 sinfo++)
+  {
+    if (got_error)
+      continue;
+    if (sinfo->keyinfo->flag & HA_VAR_LENGTH_KEY)
+    {
+      sinfo->write_keys=write_keys_varlen;
+      sinfo->read_to_buffer=read_to_buffer_varlen;
+      sinfo->write_key=write_merge_key_varlen;
+    }
+    else
+    {
+      sinfo->write_keys=write_keys;
+      sinfo->read_to_buffer=read_to_buffer;
+      sinfo->write_key=write_merge_key;
+    }
+    if (sinfo->buffpek.elements)        /* keys were flushed to tempfile */
+    {
+      uint maxbuffer=sinfo->buffpek.elements-1;
+      if (!mergebuf)                    /* allocated once, reused for each key */
+      {
+        length=param->sort_buffer_length;
+        while (length >= MIN_SORT_MEMORY)
+        {
+          if ((mergebuf= my_malloc(length, MYF(0))))
+              break;
+          length=length*3/4;
+        }
+        if (!mergebuf)
+        {
+          got_error=1;
+          continue;
+        }
+      }
+      keys=length/sinfo->key_length;
+      if (maxbuffer >= MERGEBUFF2)
+      {
+        if (param->testflag & T_VERBOSE)
+          printf("Key %d  - Merging %u keys\n",sinfo->key+1, sinfo->keys);
+        if (merge_many_buff(sinfo, keys, (uchar **) mergebuf,
+			    dynamic_element(&sinfo->buffpek, 0, BUFFPEK *),
+			    (int*) &maxbuffer, &sinfo->tempfile))
+        {
+          got_error=1;
+          continue;
+        }
+      }
+      if (flush_io_cache(&sinfo->tempfile) ||
+          reinit_io_cache(&sinfo->tempfile,READ_CACHE,0L,0,0))
+      {
+        got_error=1;
+        continue;
+      }
+      if (param->testflag & T_VERBOSE)
+        printf("Key %d  - Last merge and dumping keys\n", sinfo->key+1);
+      if (merge_index(sinfo, keys, (uchar**) mergebuf,
+                      dynamic_element(&sinfo->buffpek,0,BUFFPEK *),
+                      maxbuffer,&sinfo->tempfile) ||
+          flush_maria_ft_buf(sinfo) ||
+	  _ma_flush_pending_blocks(sinfo))
+      {
+        got_error=1;
+        continue;
+      }
+    }
+    if (my_b_inited(&sinfo->tempfile_for_exceptions)) /* keys > key_length */
+    {
+      uint16 key_length;
+
+      if (param->testflag & T_VERBOSE)
+        printf("Key %d  - Dumping 'long' keys\n", sinfo->key+1);
+
+      if (flush_io_cache(&sinfo->tempfile_for_exceptions) ||
+          reinit_io_cache(&sinfo->tempfile_for_exceptions,READ_CACHE,0L,0,0))
+      {
+        got_error=1;
+        continue;
+      }
+
+      while (!got_error &&
+	     !my_b_read(&sinfo->tempfile_for_exceptions,(uchar*)&key_length,
+			sizeof(key_length)))
+      {
+        uchar maria_ft_buf[HA_FT_MAXBYTELEN + HA_FT_WLEN + 10];
+        if (key_length > sizeof(maria_ft_buf) || /* would overflow the buffer */
+            my_b_read(&sinfo->tempfile_for_exceptions, (uchar*)maria_ft_buf,
+                      (uint) key_length))
+          got_error= 1;
+        else
+        {
+          MARIA_KEY tmp_key;
+          tmp_key.keyinfo= info->s->keyinfo + sinfo->key;
+          tmp_key.data= maria_ft_buf;
+          tmp_key.ref_length= info->s->rec_reflength;
+          tmp_key.data_length= key_length - info->s->rec_reflength;
+          tmp_key.flag= 0;
+          if (_ma_ck_write(info, &tmp_key))
+            got_error=1;
+        }
+      }
+    }
+  }
+  my_free(mergebuf,MYF(MY_ALLOW_ZERO_PTR));
+  DBUG_RETURN(got_error);
+}
+#endif /* THREAD */
+
+
+/* Sort the keys held in memory and append them to tempfile for later merge */
+
+static int write_keys(MARIA_SORT_PARAM *info, register uchar **sort_keys,
+                      uint count, BUFFPEK *buffpek, IO_CACHE *tempfile)
+{
+  uint i;
+  uint key_size= info->key_length;
+  DBUG_ENTER("write_keys");
+
+  my_qsort2((uchar*) sort_keys, count, sizeof(uchar*),
+            (qsort2_cmp) info->key_cmp, info);
+  if (!my_b_inited(tempfile) &&
+      open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  buffpek->file_pos= my_b_tell(tempfile);       /* where this run starts */
+  buffpek->count= count;
+
+  for (i= 0 ; i < count ; i++)
+  {
+    if (my_b_write(tempfile, sort_keys[i], (uint) key_size))
+      DBUG_RETURN(1); /* purecov: inspected */
+  }
+  DBUG_RETURN(0);
+} /* write_keys */
+
+
+static inline int
+my_var_write(MARIA_SORT_PARAM *info, IO_CACHE *to_file, uchar *bufs)
+{
+  /* Record format: uint16 key length followed by that many key bytes */
+  uint16 key_len= _ma_keylength(info->keyinfo, bufs);
+  int res;
+
+  /* The following is safe as this is a local file */
+  res= my_b_write(to_file, (uchar*) &key_len, sizeof(key_len));
+  if (!res)
+    res= my_b_write(to_file, bufs, (uint) key_len);
+  return res;
+}
+
+
+/* As write_keys(), but every key is written with a uint16 length prefix */
+static int write_keys_varlen(MARIA_SORT_PARAM *info,
+                             register uchar **sort_keys, uint count,
+                             BUFFPEK *buffpek, IO_CACHE *tempfile)
+{
+  uint i;
+  int res;
+  DBUG_ENTER("write_keys_varlen");
+
+  my_qsort2((uchar*) sort_keys, count, sizeof(uchar*),
+            (qsort2_cmp) info->key_cmp, info);
+  if (!my_b_inited(tempfile) &&
+      open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1); /* purecov: inspected */
+
+  buffpek->file_pos= my_b_tell(tempfile);       /* where this run starts */
+  buffpek->count= count;
+  for (i= 0 ; i < count ; i++)
+  {
+    if ((res= my_var_write(info, tempfile, sort_keys[i])))
+      DBUG_RETURN(res);
+  }
+  DBUG_RETURN(0);
+} /* write_keys_varlen */
+
+
+/* Append one key (uint16 length, then that many key bytes) to tempfile */
+static int write_key(MARIA_SORT_PARAM *info, uchar *key,
+                     IO_CACHE *tempfile)
+{
+  uint16 length= info->real_key_length;
+  int failed;
+  DBUG_ENTER("write_key");
+
+  if (!my_b_inited(tempfile) &&         /* file is opened on first use */
+      open_cached_file(tempfile, my_tmpdir(info->tmpdir), "ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1);
+  failed= (my_b_write(tempfile, (uchar*) &length, sizeof(length)) ||
+           my_b_write(tempfile, key, (uint) length));
+  DBUG_RETURN(failed);
+} /* write_key */
+
+
+/* Sort the keys held in memory and pass each one to info->key_write */
+
+static int write_index(MARIA_SORT_PARAM *info,
+                       register uchar **sort_keys,
+                       register uint count)
+{
+  DBUG_ENTER("write_index");
+
+  my_qsort2((uchar*) sort_keys, (size_t) count, sizeof(uchar*),
+            (qsort2_cmp) info->key_cmp, info);
+  for ( ; count != 0 ; count--, sort_keys++)
+  {
+    if ((*info->key_write)(info, *sort_keys))
+      DBUG_RETURN(-1); /* purecov: inspected */
+  }
+  DBUG_RETURN(0);
+} /* write_index */
+
+
+/* Merge buffers until *maxbuffer < MERGEBUFF2.
+   Returns 0 on success, 1 if the merge was interrupted by an error. */
+static int merge_many_buff(MARIA_SORT_PARAM *info, uint keys,
+                                  uchar **sort_keys, BUFFPEK *buffpek,
+                                  int *maxbuffer, IO_CACHE *t_file)
+{
+  register int i;
+  IO_CACHE t_file2, *from_file, *to_file, *temp;
+  BUFFPEK *lastbuff;
+  DBUG_ENTER("merge_many_buff");
+
+  if (*maxbuffer < MERGEBUFF2)
+    DBUG_RETURN(0);                             /* purecov: inspected */
+  if (flush_io_cache(t_file) ||
+      open_cached_file(&t_file2,my_tmpdir(info->tmpdir),"ST",
+                       DISK_BUFFER_SIZE, info->sort_info->param->myf_rw))
+    DBUG_RETURN(1);                             /* purecov: inspected */
+
+  from_file= t_file ; to_file= &t_file2;
+  while (*maxbuffer >= MERGEBUFF2)
+  {
+    if (reinit_io_cache(from_file,READ_CACHE,0L,0,0) ||
+        reinit_io_cache(to_file,WRITE_CACHE,0L,0,0))
+      goto cleanup;                             /* cache could not be reset */
+    for (i=0, lastbuff=buffpek ; i <= *maxbuffer-MERGEBUFF*3/2 ; i+=MERGEBUFF)
+    {
+      if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+                        buffpek+i,buffpek+i+MERGEBUFF-1))
+        goto cleanup;
+    }
+    if (merge_buffers(info,keys,from_file,to_file,sort_keys,lastbuff++,
+                      buffpek+i,buffpek+ *maxbuffer))
+      goto cleanup; /* purecov: inspected */
+    if (flush_io_cache(to_file))
+      goto cleanup;                             /* purecov: inspected */
+    temp=from_file; from_file=to_file; to_file=temp;
+    *maxbuffer= (int) (lastbuff-buffpek)-1;
+  }
+cleanup:
+  close_cached_file(to_file);                   /* This holds old result */
+  if (to_file == t_file)
+    *t_file=t_file2;                            /* Copy result file */
+
+  DBUG_RETURN(*maxbuffer >= MERGEBUFF2);        /* Return 1 if interrupted */
+} /* merge_many_buff */
+
+
+/*
+   Read data to buffer
+
+  SYNOPSIS
+    read_to_buffer()
+    fromfile		File to read from
+    buffpek		Where to read from
+    sort_length		length of one key in the buffer
+  RESULT
+    >= 0	Number of bytes read (0 if there are no more keys)
+    -1	Error
+*/
+
+static uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
+                                  uint sort_length)
+{
+  uint nkeys= (uint) min((ha_rows) buffpek->max_keys,buffpek->count);
+  uint nbytes= sort_length*nkeys;
+
+  if (!nkeys)
+    return 0;                                   /* No keys left to read */
+  if (my_pread(fromfile->file, buffpek->base, nbytes, buffpek->file_pos,
+               MYF_RW))
+    return((uint) -1);                        /* purecov: inspected */
+  /* The keys just read are consumed from the start of the buffer */
+  buffpek->key=buffpek->base;
+  buffpek->file_pos+= nbytes;                 /* New filepos */
+  buffpek->count-=    nkeys;
+  buffpek->mem_count= nkeys;
+  return nbytes;
+} /* read_to_buffer */
+
+static uint read_to_buffer_varlen(IO_CACHE *fromfile, BUFFPEK *buffpek,
+                                         uint sort_length)
+{
+  register uint count;
+  uint idx;
+  uchar *buffp;
+  /* Read at most max_keys keys, or whatever is left of this run */
+  if ((count=(uint) min((ha_rows) buffpek->max_keys,buffpek->count)))
+  {
+    buffp= buffpek->base;
+    /* Stored as uint16 length + key; length is not checked vs sort_length */
+    for (idx=1;idx<=count;idx++)
+    {
+      uint16 length_of_key;
+      if (my_pread(fromfile->file,(uchar*)&length_of_key,sizeof(length_of_key),
+                   buffpek->file_pos,MYF_RW))
+        return((uint) -1);
+      buffpek->file_pos+=sizeof(length_of_key);
+      if (my_pread(fromfile->file, buffp, length_of_key,
+                   buffpek->file_pos,MYF_RW))
+        return((uint) -1);
+      buffpek->file_pos+=length_of_key;
+      buffp = buffp + sort_length;      /* In memory: sort_length per key */
+    }
+    buffpek->key=buffpek->base;
+    buffpek->count-=    count;
+    buffpek->mem_count= count;
+  }
+  return (count*sort_length);           /* 0 when no keys were left */
+} /* read_to_buffer_varlen */
+
+
+static int write_merge_key_varlen(MARIA_SORT_PARAM *info,
+                                  IO_CACHE *to_file, uchar* key,
+                                  uint sort_length, uint count)
+{
+  uchar *slot;
+
+  /* Keys sit sort_length bytes apart; write each one with my_var_write() */
+  for (slot= key ; count ; count--, slot+= sort_length)
+  {
+    int res= my_var_write(info, to_file, slot);
+    if (res)
+      return res;                       /* Pass the write error through */
+  }
+
+  return(0);
+}
+
+/* Write count keys of sort_length bytes each with a single my_b_write() */
+static int write_merge_key(MARIA_SORT_PARAM *info __attribute__((unused)),
+				  IO_CACHE *to_file, uchar *key,
+				  uint sort_length, uint count)
+{
+  return my_b_write(to_file, key, (size_t) sort_length*count);
+}
+
+/*
+  Merge the runs Fb..Tb, read from from_file, into one sorted run.
+  Keys go to to_file through info->write_key, or, if to_file == 0, to
+  info->key_write.  Returns 0 on success and non-zero on error.
+*/
+static int NEAR_F
+merge_buffers(MARIA_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
+              IO_CACHE *to_file, uchar **sort_keys, BUFFPEK *lastbuff,
+              BUFFPEK *Fb, BUFFPEK *Tb)
+{
+  int error;
+  uint sort_length,maxcount;
+  ha_rows count;
+  my_off_t to_start_filepos;
+  uchar *strpos;
+  BUFFPEK *buffpek,**refpek;
+  QUEUE queue;
+  DBUG_ENTER("merge_buffers");
+
+  count=error=0;
+  maxcount=keys/((uint) (Tb-Fb) +1);            /* Key slots per input run */
+  DBUG_ASSERT(maxcount > 0);
+  LINT_INIT(to_start_filepos);
+  if (to_file)
+    to_start_filepos=my_b_tell(to_file);
+  strpos= (uchar*) sort_keys;
+  sort_length=info->key_length;
+
+  if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0,
+                 (int (*)(void*, uchar *,uchar*)) info->key_cmp,
+                 (void*) info, 0, 0))
+    DBUG_RETURN(1); /* purecov: inspected */
+  /* Give each run a slice of sort_keys, load it and queue the run */
+  for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
+  {
+    count+= buffpek->count;
+    buffpek->base= strpos;
+    buffpek->max_keys=maxcount;
+    strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek,
+                                                      sort_length));
+    if (error == -1)
+      goto err; /* purecov: inspected */
+    queue_insert(&queue,(uchar*) buffpek);
+  }
+  /* Merge as long as at least two runs still have keys */
+  while (queue.elements > 1)
+  {
+    for (;;)
+    {
+      buffpek=(BUFFPEK*) queue_top(&queue);
+      if (to_file)
+      {
+        if (info->write_key(info,to_file, buffpek->key,
+                            (uint) sort_length,1))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+      else
+      {
+        if ((*info->key_write)(info,(void*) buffpek->key))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+      buffpek->key+=sort_length;
+      if (! --buffpek->mem_count)
+      {
+        /* It's enough to check for killedptr before a slow operation */
+        if (_ma_killed_ptr(info->sort_info->param))
+        {
+          error=1;
+          goto err;
+        }
+        if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length)))
+        {
+          uchar *base= buffpek->base;
+          uint max_keys=buffpek->max_keys;
+
+          VOID(queue_remove_top(&queue));
+
+          /* Put room used by buffer to use in other buffer */
+          for (refpek= (BUFFPEK**) &queue_top(&queue);
+               refpek <= (BUFFPEK**) &queue_end(&queue);
+               refpek++)
+          {
+            buffpek= *refpek;
+            if (buffpek->base+buffpek->max_keys*sort_length == base)
+            {
+              buffpek->max_keys+=max_keys;
+              break;
+            }
+            else if (base+max_keys*sort_length == buffpek->base)
+            {
+              buffpek->base=base;
+              buffpek->max_keys+=max_keys;
+              break;
+            }
+          }
+          break;                /* One buffer have been removed */
+        }
+        else if (error == -1)       /* Refill failed: do not requeue the run */
+          goto err;               /* purecov: inspected */
+      }
+      queue_replace_top(&queue);   /* Top element has been replaced */
+    }
+  }
+  buffpek=(BUFFPEK*) queue_top(&queue);         /* The only run left */
+  buffpek->base= (uchar*) sort_keys;
+  buffpek->max_keys=keys;                       /* It gets the whole buffer */
+  do
+  {
+    if (to_file)
+    {
+      if (info->write_key(info, to_file, buffpek->key,
+                         sort_length,buffpek->mem_count))
+      {
+        error=1; goto err; /* purecov: inspected */
+      }
+    }
+    else
+    {
+      register uchar *end;
+      strpos= buffpek->key;
+      for (end= strpos+buffpek->mem_count*sort_length;
+           strpos != end ;
+           strpos+=sort_length)
+      {
+        if ((*info->key_write)(info, strpos))
+        {
+          error=1; goto err; /* purecov: inspected */
+        }
+      }
+    }
+  }
+  while ((error=(int) info->read_to_buffer(from_file,buffpek,sort_length)) !=
+         -1 && error != 0);
+
+  lastbuff->count=count;                        /* Describe the merged run */
+  if (to_file)
+    lastbuff->file_pos=to_start_filepos;
+err:
+  delete_queue(&queue);
+  DBUG_RETURN(error);
+} /* merge_buffers */
+
+
+        /* Do a merge to output-file (save only positions) */
+
+static int NEAR_F
+merge_index(MARIA_SORT_PARAM *info, uint keys, uchar **sort_keys,
+            BUFFPEK *buffpek, int maxbuffer, IO_CACHE *tempfile)
+{
+  int failed;                   /* to_file is 0: keys go to info->key_write */
+  DBUG_ENTER("merge_index");
+  failed= merge_buffers(info, keys, tempfile, (IO_CACHE*) 0, sort_keys,
+                        buffpek, buffpek, buffpek+maxbuffer) != 0;
+  DBUG_RETURN(failed);
+} /* merge_index */
+
+
+static int flush_maria_ft_buf(MARIA_SORT_PARAM *info)
+{
+  int res;
+
+  if (!info->sort_info->ft_buf)
+    return 0;                                   /* No buffer to flush */
+  res= _ma_sort_ft_buf_flush(info);
+  my_free(info->sort_info->ft_buf, MYF(0));
+  info->sort_info->ft_buf=0;                    /* Buffer is released */
+  return res;
+}
diff --git a/storage/maria/ma_sp_defs.h b/storage/maria/ma_sp_defs.h
new file mode 100644
index 00000000000..398bf99c52e
--- /dev/null
+++ b/storage/maria/ma_sp_defs.h
@@ -0,0 +1,48 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin & MySQL Finland AB
+   & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _SP_DEFS_H
+#define _SP_DEFS_H
+
+#define SPDIMS 2
+#define SPTYPE HA_KEYTYPE_DOUBLE
+#define SPLEN  8
+
+#ifdef HAVE_SPATIAL
+
+enum wkbType                     /* Geometry type code of a WKB object */
+{
+  wkbPoint = 1,
+  wkbLineString = 2,
+  wkbPolygon = 3,
+  wkbMultiPoint = 4,
+  wkbMultiLineString = 5,
+  wkbMultiPolygon = 6,
+  wkbGeometryCollection = 7
+};
+
+enum wkbByteOrder                /* Value of the byte-order byte in WKB */
+{
+  wkbXDR = 0,    /* Big Endian    */
+  wkbNDR = 1     /* Little Endian */
+};
+
+MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr,
+                           uchar *key, const uchar *record, my_off_t filepos,
+                           ulonglong trid);
+
+#endif /*HAVE_SPATIAL*/
+#endif /* _SP_DEFS_H */
diff --git a/storage/maria/ma_sp_key.c b/storage/maria/ma_sp_key.c
new file mode 100644
index 00000000000..22944a5db0a
--- /dev/null
+++ b/storage/maria/ma_sp_key.c
@@ -0,0 +1,305 @@
+/* Copyright (C) 2006 MySQL AB & Ramil Kalimullin
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "ma_blockrec.h"                        /* For ROW_FLAG_TRANSID */
+#include "trnman.h"
+
+#ifdef HAVE_SPATIAL
+
+#include "ma_sp_defs.h"
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                             uchar byte_order, double *mbr);
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                           uchar byte_order, double *mbr);
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                                uchar byte_order, double *mbr);
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                             uchar byte_order, double *mbr);
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                              double *mbr, int top);
+static int sp_mbr_from_wkb(uchar (*wkb), uint size, uint n_dims, double *mbr);
+
+
+/**
+   Create spatial key
+*/
+
+MARIA_KEY *_ma_sp_make_key(MARIA_HA *info, MARIA_KEY *ret_key, uint keynr,
+                           uchar *key, const uchar *record, my_off_t filepos,
+                           ulonglong trid)
+{
+  HA_KEYSEG *keyseg;
+  MARIA_KEYDEF *keyinfo = &info->s->keyinfo[keynr];
+  uint len = 0;
+  const uchar *pos;
+  uint dlen;
+  uchar *dptr;
+  double mbr[SPDIMS * 2];
+  uint i;
+  DBUG_ENTER("_ma_sp_make_key");
+  /* The segment just before seg[0] locates the geometry blob in the record */
+  keyseg = &keyinfo->seg[-1];
+  pos = record + keyseg->start;
+  ret_key->data= key;
+  /* The blob pointer is read bit_start bytes into the field */
+  dlen = _ma_calc_blob_length(keyseg->bit_start, pos);
+  memcpy_fixed(&dptr, pos + keyseg->bit_start, sizeof(char*));
+  if (!dptr)
+  {
+    my_errno= HA_ERR_NULL_IN_SPATIAL;
+    DBUG_RETURN(0);
+  }
+  /* NOTE(review): result is ignored and dlen >= 4 is assumed - confirm */
+  sp_mbr_from_wkb(dptr + 4, dlen - 4, SPDIMS, mbr);	/* SRID */
+
+  for (i = 0, keyseg = keyinfo->seg; keyseg->type; keyseg++, i++)
+  {
+    uint length = keyseg->length, start= keyseg->start;
+    double val;
+
+    DBUG_ASSERT(length == 8);
+    DBUG_ASSERT(!(start % 8));
+    DBUG_ASSERT(start < sizeof(mbr));
+    DBUG_ASSERT(keyseg->type == HA_KEYTYPE_DOUBLE);
+
+    val= mbr[start / sizeof (double)];      /* start selects the mbr slot */
+#ifdef HAVE_ISNAN
+    if (isnan(val))
+    {
+      bzero(key, length);                   /* NaN is stored as zero bytes */
+      key+= length;
+      len+= length;
+      continue;
+    }
+#endif
+
+    if (keyseg->flag & HA_SWAP_KEY)
+    {
+      mi_float8store(key, val);
+    }
+    else
+    {
+      float8store((uchar *)key, val);
+    }
+    key += length;
+    len+= length;
+  }
+  _ma_dpointer(info->s, key, filepos);  /* presumably stores the row pointer */
+  ret_key->keyinfo= keyinfo;
+  ret_key->data_length= len;
+  ret_key->ref_length= info->s->rec_reflength;
+  ret_key->flag= 0;
+  if (_ma_have_versioning(info) && trid)
+  {
+    ret_key->ref_length+= transid_store_packed(info,
+                                               key + ret_key->ref_length,
+                                               trid);
+  }
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, ret_key););
+  DBUG_RETURN(ret_key);
+}
+
+
+/*
+  Calculate minimal bounding rectangle (mbr) of the spatial object
+  stored in "well-known binary representation" (wkb) format.
+*/
+
+static int sp_mbr_from_wkb(uchar *wkb, uint size, uint n_dims, double *mbr)
+{
+  double *bound= mbr;
+  double *bounds_end= mbr + n_dims * 2;
+  /* Start every (min, max) pair inverted so the first point replaces both */
+  for ( ; bound < bounds_end ; bound+= 2)
+  {
+    bound[0]= DBL_MAX;
+    bound[1]= -DBL_MAX;
+  }
+  return sp_get_geometry_mbr(&wkb, wkb + size, n_dims, mbr, 1);
+}
+
+/*
+  Add one point stored in wkb to mbr
+*/
+
+static int sp_add_point_to_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+			       uchar byte_order __attribute__((unused)),
+			       double *mbr)
+{
+  double ord;
+  double *mbr_end= mbr + n_dims * 2;
+  /* mbr holds a (min, max) pair per dimension; *wkb advances 8 bytes each */
+  while (mbr < mbr_end)
+  {
+    if ((*wkb) > end - 8)                /* Fewer than 8 bytes left: fail */
+      return -1;
+    float8get(ord, (const uchar*) *wkb);
+    (*wkb)+= 8;
+    if (ord < *mbr)                      /* Lower the minimum */
+      *mbr= ord;
+    mbr++;
+    if (ord > *mbr)                      /* Raise the maximum */
+      *mbr= ord;
+    mbr++;
+  }
+  return 0;
+}
+
+/* A point body is just its coordinates: pass it to sp_add_point_to_mbr() */
+static int sp_get_point_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                           uchar byte_order, double *mbr)
+{
+  return sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr);
+}
+
+
+static int sp_get_linestring_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                                  uchar byte_order, double *mbr)
+{
+  uint points_left= uint4korr(*wkb);    /* Point count precedes the points */
+
+  (*wkb) += 4;
+  while (points_left)
+  {
+    /* Add next point to mbr */
+    if (sp_add_point_to_mbr(wkb, end, n_dims, byte_order, mbr))
+      return -1;
+    points_left--;
+  }
+  return 0;
+}
+
+
+static int sp_get_polygon_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                               uchar byte_order, double *mbr)
+{
+  uint rings_left;
+
+  /* A polygon is stored as a ring count followed by the rings themselves */
+  rings_left= uint4korr((*wkb));
+  (*wkb) += 4;
+
+  while (rings_left)
+  {
+    /*
+      Each linear ring is a point count followed by that many points,
+      i.e. the same layout sp_get_linestring_mbr() consumes, so let it
+      advance *wkb over the ring and fold its points into mbr.
+    */
+    if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr))
+      return -1;
+    rings_left--;
+  }
+  return 0;
+}
+
+static int sp_get_geometry_mbr(uchar *(*wkb), uchar *end, uint n_dims,
+                              double *mbr, int top)
+{
+  int res;
+  uchar byte_order;
+  uint wkb_type;
+  /* Header: one byte-order byte followed by a 4-byte geometry type */
+  byte_order = *(*wkb);
+  ++(*wkb);
+
+  wkb_type = uint4korr((*wkb));
+  (*wkb) += 4;
+
+  switch ((enum wkbType) wkb_type)
+  {
+    case wkbPoint:
+      res = sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr);
+      break;
+    case wkbLineString:
+      res = sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr);
+      break;
+    case wkbPolygon:
+      res = sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr);
+      break;
+    case wkbMultiPoint:
+    {
+      uint n_items;
+      n_items = uint4korr((*wkb));
+      (*wkb) += 4;
+      for (; n_items > 0; --n_items)
+      {
+        byte_order = *(*wkb);
+        ++(*wkb);
+        (*wkb) += 4;            /* Skip 4 bytes, presumably the item type */
+        if (sp_get_point_mbr(wkb, end, n_dims, byte_order, mbr))
+          return -1;
+      }
+      res = 0;
+      break;
+    }
+    case wkbMultiLineString:
+    {
+      uint n_items;
+      n_items = uint4korr((*wkb));
+      (*wkb) += 4;
+      for (; n_items > 0; --n_items)
+      {
+        byte_order = *(*wkb);
+        ++(*wkb);
+        (*wkb) += 4;            /* Skip 4 bytes, presumably the item type */
+        if (sp_get_linestring_mbr(wkb, end, n_dims, byte_order, mbr))
+          return -1;
+      }
+      res = 0;
+      break;
+    }
+    case wkbMultiPolygon:
+    {
+      uint n_items;
+      n_items = uint4korr((*wkb));
+      (*wkb) += 4;
+      for (; n_items > 0; --n_items)
+      {
+        byte_order = *(*wkb);
+        ++(*wkb);
+        (*wkb) += 4;            /* Skip 4 bytes, presumably the item type */
+        if (sp_get_polygon_mbr(wkb, end, n_dims, byte_order, mbr))
+          return -1;
+      }
+      res = 0;
+      break;
+    }
+    case wkbGeometryCollection:
+    {
+      uint n_items;
+      /* A collection is rejected unless it is the top-level geometry */
+      if (!top)
+        return -1;
+
+      n_items = uint4korr((*wkb));
+      (*wkb) += 4;
+      for (; n_items > 0; --n_items)
+      {
+        if (sp_get_geometry_mbr(wkb, end, n_dims, mbr, 0))
+          return -1;
+      }
+      res = 0;
+      break;
+    }
+    default:
+      res = -1;                 /* Unknown geometry type */
+  }
+  return res;
+}
+
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_sp_test.c b/storage/maria/ma_sp_test.c
new file mode 100644
index 00000000000..b8c00753acb
--- /dev/null
+++ b/storage/maria/ma_sp_test.c
@@ -0,0 +1,568 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Testing of the basic functions of a MARIA spatial table        */
+/* Written by Alex Barkov, who has a shared copyright to this code */
+
+#include "maria.h"
+
+#ifdef HAVE_SPATIAL
+#include "ma_sp_defs.h"
+
+#define MAX_REC_LENGTH 1024
+#define KEYALG HA_KEY_ALG_RTREE
+
+static void create_linestring(uchar *record,uint rownr);
+static void print_record(uchar * record,my_off_t offs,const char * tail);
+
+static void create_key(uchar *key,uint rownr);
+static void print_key(const uchar *key,const char * tail);
+
+static int run_test(const char *filename);
+static int read_with_pos(MARIA_HA * file, int silent);
+
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+                                     uchar *wkb);
+static  void maria_rtree_PrintWKB(uchar *wkb, uint n_dims);
+
+static char blob_key[MAX_REC_LENGTH];
+
+/* Run MY_INIT and maria_init(), then exit with the status of run_test() */
+int main(int argc  __attribute__((unused)),char *argv[])
+{
+  MY_INIT(argv[0]);
+  maria_init();
+  exit(run_test("sp_test"));
+}
+
+/* Exercise a one-key spatial table; returns 0 on success, 1 on error */
+int run_test(const char *filename)
+{
+  MARIA_HA        *file;
+  MARIA_UNIQUEDEF   uniquedef;
+  MARIA_CREATE_INFO create_info;
+  MARIA_COLUMNDEF   recinfo[20];
+  MARIA_KEYDEF      keyinfo[20];
+  HA_KEYSEG      keyseg[20];
+  key_range	 min_range, max_range;
+  int silent=0;
+  int create_flag=0;
+  int null_fields=0;
+  int nrecords=30;
+  int uniques=0;
+  int i;
+  int error;
+  int row_count=0;
+  uchar record[MAX_REC_LENGTH];
+  uchar key[MAX_REC_LENGTH];
+  uchar read_record[MAX_REC_LENGTH];
+  int upd=10;                   /* Updated rows use rownr i+nrecords*upd */
+  ha_rows hrows;
+
+  /* Define a column for NULLs and DEL markers*/
+
+  recinfo[0].type=FIELD_NORMAL;
+  recinfo[0].length=1; /* For NULL bits */
+
+
+  /* Define spatial column  */
+
+  recinfo[1].type=FIELD_BLOB;
+  recinfo[1].length=4 + portable_sizeof_char_ptr;  /* Length + pointer */
+
+
+
+  /* Define a key with 1 spatial segment */
+
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].flag=HA_SPATIAL;
+  keyinfo[0].key_alg=KEYALG;
+
+  keyinfo[0].seg[0].type= HA_KEYTYPE_BINARY;
+  keyinfo[0].seg[0].flag=0;
+  keyinfo[0].seg[0].start= 1;
+  keyinfo[0].seg[0].length=1; /* Spatial ignores it anyway */
+  keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].language=default_charset_info->number;
+  keyinfo[0].seg[0].bit_start=4; /* Long BLOB */
+
+
+  if (!silent)
+    printf("- Creating isam-file\n");
+
+  bzero((char*) &create_info,sizeof(create_info));
+  create_info.max_rows=10000000;
+
+  if (maria_create(filename,
+                   DYNAMIC_RECORD,
+                   1,            /*  keys   */
+                   keyinfo,
+                   2, /* columns */
+                   recinfo,uniques,&uniquedef,&create_info,create_flag))
+    goto err;
+
+  if (!silent)
+    printf("- Open isam-file\n");
+
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  for (i=0; i<nrecords; i++ )
+  {
+    create_linestring(record,i);
+    error=maria_write(file,record);
+    print_record(record,maria_position(file),"\n");
+    if (!error)
+    {
+      row_count++;
+    }
+    else
+    {
+      printf("maria_write: %d\n", error);
+      goto err;
+    }
+  }
+
+  if ((error=read_with_pos(file,silent)))
+    goto err;
+
+  if (!silent)
+    printf("- Deleting rows with position\n");
+  for (i=0; i < nrecords/4; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+    if (error)
+    {
+      printf("pos: %2d  maria_rrnd: %3d  errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"\n");
+    error=maria_delete(file,read_record);
+    if (error)
+    {
+      printf("pos: %2d maria_delete: %3d errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+  }
+
+  if (!silent)
+    printf("- Updating rows with position\n");
+  for (i=0; i < nrecords/2 ; i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+    if (error)
+    {
+      if (error==HA_ERR_RECORD_DELETED)
+        continue;               /* Rows deleted above are skipped */
+      printf("pos: %2d  maria_rrnd: %3d  errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"");
+    create_linestring(record,i+nrecords*upd);
+    printf("\t-> ");
+    print_record(record,maria_position(file),"\n");
+    error=maria_update(file,read_record,record);
+    if (error)
+    {
+      printf("pos: %2d  maria_update: %3d  errno: %3d\n",i,error,my_errno);
+      goto err;
+    }
+  }
+
+  if ((error=read_with_pos(file,silent)))
+    goto err;
+
+  if (!silent)
+    printf("- Test maria_rkey then a sequence of maria_rnext_same\n");
+
+  create_key(key, nrecords*4/5);
+  print_key(key,"  search for INTERSECT\n");
+
+  if ((error=maria_rkey(file,read_record,0,key,0,HA_READ_MBR_INTERSECT)))
+  {
+    printf("maria_rkey: %3d  errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  print_record(read_record,maria_position(file),"  maria_rkey\n");
+  row_count=1;                  /* The row found by maria_rkey */
+
+  for (;;)
+  {
+    if ((error=maria_rnext_same(file,read_record)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      printf("maria_next: %3d  errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"  maria_rnext_same\n");
+      row_count++;
+  }
+  printf("     %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_rfirst then a sequence of maria_rnext\n");
+
+  error=maria_rfirst(file,read_record,0);
+  if (error)
+  {
+    printf("maria_rfirst: %3d  errno: %3d\n",error,my_errno);
+    goto err;
+  }
+  row_count=1;                  /* The row found by maria_rfirst */
+  print_record(read_record,maria_position(file),"  maria_frirst\n");
+
+  for(i=0;i<nrecords;i++) {
+    if ((error=maria_rnext(file,read_record,0)))
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;
+      printf("maria_next: %3d  errno: %3d\n",error,my_errno);
+      goto err;
+    }
+    print_record(read_record,maria_position(file),"  maria_rnext\n");
+    row_count++;
+  }
+  printf("     %d rows\n",row_count);
+
+  if (!silent)
+    printf("- Test maria_records_in_range()\n");
+
+  create_key(key, nrecords*upd);
+  print_key(key," INTERSECT\n");
+  min_range.key= key;
+  min_range.length= 1000;                       /* Big enough */
+  min_range.flag= HA_READ_MBR_INTERSECT;
+  max_range.key= record+1;
+  max_range.length= 1000;                       /* Big enough */
+  max_range.flag= HA_READ_KEY_EXACT;
+  hrows= maria_records_in_range(file,0, &min_range, &max_range);
+  printf("     %ld rows\n", (long) hrows);
+
+  if (maria_close(file)) goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return 0;
+
+err:                            /* NOTE(review): no maria_close() on this path */
+  printf("got error: %3d when using maria-database\n",my_errno);
+  maria_end();
+  return 1;           /* skip warning */
+}
+
+
+static int read_with_pos (MARIA_HA * file,int silent)
+{
+  int error;
+  int i;
+  uchar read_record[MAX_REC_LENGTH];
+  int rows=0;                   /* Rows read without error */
+
+  if (!silent)
+    printf("- Reading rows with position\n");
+  for (i=0;;i++)
+  {
+    my_errno=0;
+    bzero((char*) read_record,MAX_REC_LENGTH);
+    error=maria_rrnd(file,read_record,i == 0 ? 0L : HA_OFFSET_ERROR);
+    if (error)
+    {
+      if (error==HA_ERR_END_OF_FILE)
+        break;                  /* Normal end of the scan */
+      if (error==HA_ERR_RECORD_DELETED)
+        continue;               /* Deleted rows are not counted */
+      printf("pos: %2d  maria_rrnd: %3d  errno: %3d\n",i,error,my_errno);
+      return error;
+    }
+    rows++;
+    print_record(read_record,maria_position(file),"\n");
+  }
+  printf("     %d rows\n",rows);
+  return 0;
+}
+
+
+#ifdef NOT_USED
+static void bprint_record(uchar * record,
+			  my_off_t offs __attribute__((unused)),
+			  const char * tail)
+{
+  int i;
+  char * pos;
+  i=(unsigned char)record[0];   /* First byte is printed on its own */
+  printf("%02X ",i);
+
+  for( pos=record+1, i=0; i<32; i++,pos++)      /* Hex dump of next 32 bytes */
+  {
+    int b=(unsigned char)*pos;
+    printf("%02X",b);
+  }
+  printf("%s",tail);
+}
+#endif
+
+
+static void print_record(uchar * record, my_off_t offs,const char * tail)
+{
+  uchar *pos;
+  char *ptr;
+  uint len;
+
+  printf("     rec=(%d)",(unsigned char)record[0]);
+  pos=record+1;                         /* Blob field follows the first byte */
+  len=sint4korr(pos);                   /* 4-byte blob length */
+  pos+=4;
+  printf(" len=%d ",len);
+  memcpy_fixed(&ptr,pos,sizeof(char*)); /* Then the pointer to the blob data */
+  if (ptr)
+    maria_rtree_PrintWKB((uchar*) ptr,SPDIMS);
+  else
+    printf("<NULL> ");
+  printf(" offs=%ld ",(long int)offs);
+  printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+static void create_point(uchar *record,uint rownr)
+{
+   uint tmp;
+   char *ptr;
+   char *pos=record;
+   double x[200];
+   int i;
+   /* Every coordinate of the point is rownr */
+   for(i=0;i<SPDIMS;i++)
+     x[i]=rownr;
+
+   bzero((char*) record,MAX_REC_LENGTH);
+   *pos=0x01; /* DEL marker */
+   pos++;
+   /* The WKB is built in the shared static blob_key buffer */
+   memset(blob_key,0,sizeof(blob_key));
+   tmp=maria_rtree_CreatePointWKB(x,SPDIMS,blob_key);
+
+   int4store(pos,tmp);          /* Blob length */
+   pos+=4;
+
+   ptr=blob_key;
+   memcpy_fixed(pos,&ptr,sizeof(char*));        /* Pointer to the blob data */
+}
+#endif
+
+
+static void create_linestring(uchar *record,uint rownr)
+{
+   uint tmp;
+   char *ptr;
+   uchar *pos= record;
+   double x[200];
+   int i,j;
+   int npoints=2;
+   /* Point j has every coordinate equal to rownr*j */
+   for(j=0;j<npoints;j++)
+     for(i=0;i<SPDIMS;i++)
+       x[i+j*SPDIMS]=rownr*j;
+
+   bzero((char*) record,MAX_REC_LENGTH);
+   *pos=0x01; /* DEL marker */
+   pos++;
+   /* The WKB is built in the shared static blob_key buffer */
+   memset(blob_key,0,sizeof(blob_key));
+   tmp=maria_rtree_CreateLineStringWKB(x,SPDIMS,npoints, (uchar*) blob_key);
+
+   int4store(pos,tmp);          /* Blob length */
+   pos+=4;
+
+   ptr=blob_key;
+   memcpy_fixed(pos,&ptr,sizeof(char*));        /* Pointer to the blob data */
+}
+
+
+static void create_key(uchar *key,uint rownr)
+{
+   double coord= rownr;
+   uchar *out= key;
+   uchar *out_end= key + 2*SPDIMS*sizeof(coord);
+   /* Clear the key, then store rownr as each of the 2*SPDIMS doubles */
+   bzero(key,MAX_REC_LENGTH);
+   while (out != out_end)
+   {
+     float8store(out,coord);
+     out+= sizeof(coord);
+   }
+}
+
+static void print_key(const uchar *key,const char * tail)
+{
+  double value;
+  const uchar *key_end= key + 2*SPDIMS*sizeof(value);
+
+  /* Print the 2*SPDIMS doubles of the key, then the caller's tail */
+  printf("     key=");
+  for ( ; key != key_end ; key+= sizeof(value))
+  {
+    float8get(value,key);               /* Decode one stored double */
+    printf("%.14g ",value);
+  }
+  printf("%s",tail);
+}
+
+
+#ifdef NOT_USED
+
+static int maria_rtree_CreatePointWKB(double *ords, uint n_dims, uchar *wkb)
+{
+  uint i;
+  /* Header: byte-order byte, then the 4-byte type */
+  *wkb = wkbXDR;
+  ++wkb;
+  int4store(wkb, wkbPoint);
+  wkb += 4;
+
+  for (i=0; i < n_dims; ++i)
+  {
+    float8store(wkb, ords[i]);
+    wkb += 8;
+  }
+  return 5 + n_dims * 8;        /* Header bytes + 8 bytes per ordinate */
+}
+#endif
+
+
+static int maria_rtree_CreateLineStringWKB(double *ords, uint n_dims, uint n_points,
+				     uchar *wkb)
+{
+  uint i;
+  uint n_ords = n_dims * n_points;
+  /* Header: byte-order byte, 4-byte type, 4-byte point count */
+  *wkb = wkbXDR;        /* NOTE(review): confirm XDR matches the stores below */
+  ++wkb;
+  int4store(wkb, wkbLineString);
+  wkb += 4;
+  int4store(wkb, n_points);
+  wkb += 4;
+  for (i=0; i < n_ords; ++i)
+  {
+    float8store(wkb, ords[i]);
+    wkb += 8;
+  }
+  return 9 + n_points * n_dims * 8;     /* Header bytes + 8 per ordinate */
+}
+
+
+static void maria_rtree_PrintWKB(uchar *wkb, uint n_dims)
+{
+  uint wkb_type;
+
+  ++wkb;                        /* Skip the first byte (byte order) */
+  wkb_type = uint4korr(wkb);
+  wkb += 4;
+
+  switch ((enum wkbType)wkb_type)
+  {
+    case wkbPoint:
+    {
+      uint i;
+      double ord;
+
+      printf("POINT(");
+      for (i=0; i < n_dims; ++i)
+      {
+        float8get(ord, wkb);
+        wkb += 8;
+        printf("%.14g", ord);
+        if (i < n_dims - 1)
+          printf(" ");
+        else
+          printf(")");
+      }
+      break;
+    }
+    case wkbLineString:
+    {
+      uint p, i;
+      uint n_points;
+      double ord;
+
+      printf("LineString(");
+      n_points = uint4korr(wkb);
+      wkb += 4;
+      for (p=0; p < n_points; ++p)
+      {
+        for (i=0; i < n_dims; ++i)
+        {
+          float8get(ord, wkb);
+          wkb += 8;
+          printf("%.14g", ord);
+          if (i < n_dims - 1)
+            printf(" ");
+        }
+        if (p < n_points - 1)
+          printf(", ");
+        else
+          printf(")");
+      }
+      break;
+    }
+    case wkbPolygon:            /* Remaining types print a placeholder only */
+    {
+      printf("POLYGON(...)");
+      break;
+    }
+    case wkbMultiPoint:
+    {
+      printf("MULTIPOINT(...)");
+      break;
+    }
+    case wkbMultiLineString:
+    {
+      printf("MULTILINESTRING(...)");
+      break;
+    }
+    case wkbMultiPolygon:
+    {
+      printf("MULTIPOLYGON(...)");
+      break;
+    }
+    case wkbGeometryCollection:
+    {
+      printf("GEOMETRYCOLLECTION(...)");
+      break;
+    }
+    default:
+    {
+      printf("UNKNOWN GEOMETRY TYPE");
+      break;
+    }
+  }
+}
+
+#else
+int main(int argc __attribute__((unused)),char *argv[] __attribute__((unused)))
+{
+  exit(0);                      /* Nothing to test without HAVE_SPATIAL */
+}
+#endif /*HAVE_SPATIAL*/
diff --git a/storage/maria/ma_state.c b/storage/maria/ma_state.c
new file mode 100644
index 00000000000..ca94d58264b
--- /dev/null
+++ b/storage/maria/ma_state.c
@@ -0,0 +1,795 @@
+/* Copyright (C) 2008 Sun AB and Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Functions to maintain live statistics for Maria transactional tables
+  and versioning for not transactional tables
+
+  See WL#3138; Maria - fast "SELECT COUNT(*) FROM t;" and "CHECKSUM TABLE t"
+  for details about live number of rows and live checksums
+
+  TODO
+   - Allocate MARIA_USED_TABLES and MARIA_STATE_HISTORY from a global pool
+     (to avoid calls to malloc())
+   - In _ma_trnman_end_trans_hook(), don't call
+     _ma_remove_not_visible_states() every time. One could for example call
+     it if there have been more than 10 ended transactions since the last
+     time it was called.
+*/
+
+#include "maria_def.h"
+#include "trnman.h"
+#include "ma_blockrec.h"
+
+/**
+   @brief Setup initial start-of-transaction state for a table
+
+   @fn     _ma_setup_live_state
+   @param info		Maria handler
+
+   @notes
+     This function ensures that trn->used_tables contains a list of
+     start and live states for tables that are part of the transaction
+     and that info->state points to the current live state for the table.
+
+   @TODO
+     Change trn->table_list to a hash and share->state_history to a binary tree
+
+   @return
+   @retval 0  ok
+   @retval 1  error (out of memory)
+*/
+
+my_bool _ma_setup_live_state(MARIA_HA *info)
+{
+  TRN *trn;
+  MARIA_SHARE *share= info->s;
+  MARIA_USED_TABLES *tables;
+  MARIA_STATE_HISTORY *history;
+  DBUG_ENTER("_ma_setup_live_state");
+
+  /* A non-zero result from the hook is reported as an error to the caller */
+  if (maria_create_trn_hook(info))
+    DBUG_RETURN(1);
+
+  trn= info->trn;
+  /* Search the transaction's used_tables list for an entry for this share */
+  for (tables= (MARIA_USED_TABLES*) info->trn->used_tables;
+       tables;
+       tables= tables->next)
+  {
+    if (tables->share == share)
+    {
+      /* Table is already used by transaction */
+      goto end;
+    }
+  }
+
+  /* Table was not used before, create new table state entry */
+  if (!(tables= (MARIA_USED_TABLES*) my_malloc(sizeof(*tables),
+                                               MYF(MY_WME | MY_ZEROFILL))))
+    DBUG_RETURN(1);
+  /* Link the new (zero filled) entry first in the transaction's list */
+  tables->next= trn->used_tables;
+  trn->used_tables= tables;
+  tables->share= share;
+
+  pthread_mutex_lock(&share->intern_lock);
+  share->in_trans++;
+  DBUG_PRINT("info", ("share: 0x%lx  in_trans: %d",
+                      (ulong) share, share->in_trans));
+
+  history= share->state_history;
+
+  /*
+    We must keep share locked to ensure that we don't access a history
+    link that is deleted by concurrently running checkpoint.
+
+    It's enough to compare trids here (instead of calling
+    tranman_can_read_from) as history->trid is a commit_trid
+  */
+  /*
+    NOTE(review): the loop has no NULL check; it relies on the list always
+    holding an entry with trid < trn->trid - confirm against the callers.
+  */
+  while (trn->trid <= history->trid)
+    history= history->next;
+  pthread_mutex_unlock(&share->intern_lock);
+  /* The current item can't be deleted as it's the first one visible for us */
+  tables->state_start=  tables->state_current= history->state;
+  tables->state_current.changed= tables->state_current.no_transid= 0;
+
+  DBUG_PRINT("info", ("records: %ld", (ulong) tables->state_start.records));
+
+end:
+  /* Point the handler at this transaction's private copies of the state */
+  info->state_start= &tables->state_start;
+  info->state= &tables->state_current;
+
+  /*
+    Mark in transaction state if we are not using transid (versioning)
+    on rows. If not, then we will in _ma_trnman_end_trans_hook()
+    ensure that the state is visible for all at end of transaction
+  */
+  tables->state_current.no_transid|= !(info->row_flag & ROW_FLAG_TRANSID);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Remove states that are not visible by anyone
+
+   @fn   _ma_remove_not_visible_states()
+   @param org_history    List to history
+   @param all            1 if we should delete the first state if it's
+                         visible for all.  For the moment this is only used
+                         on close() of table.
+   @param trnman_is_locked  Set to 1 if we have already a lock on trnman.
+
+   @notes
+     The assumption is that items in the history list is ordered by
+     commit_trid.
+
+     A state is not visible anymore if there is no new transaction
+     that has been started between the commit_trid's of two states
+
+     As long as some states exists, we keep the newest = (last commit)
+     state as first state in the history.  This is to allow us to just move
+     the history from the global list to the share when we open the table.
+
+     Note that if 'all' is set trnman_is_locked must be 0, because
+     trnman_get_min_trid() will take a lock on trnman.
+
+   @return
+   @retval Pointer to new history list
+*/
+
+MARIA_STATE_HISTORY
+*_ma_remove_not_visible_states(MARIA_STATE_HISTORY *org_history,
+                               my_bool all,
+                               my_bool trnman_is_locked)
+{
+  TrID last_trid;
+  MARIA_STATE_HISTORY *history, **parent, *next;
+  DBUG_ENTER("_ma_remove_not_visible_states");
+
+  if (!org_history)
+    DBUG_RETURN(0);                          /* Not versioned table */
+
+  /*
+    The first element is always kept by this loop. 'parent' is the link
+    that should point at the next kept element and 'last_trid' is the trid
+    of the last kept element.
+  */
+  last_trid= org_history->trid;
+  parent= &org_history->next;
+  for (history= org_history->next; history; history= next)
+  {
+    next= history->next;                     /* Saved as history may be freed */
+    if (!trnman_exists_active_transactions(history->trid, last_trid,
+                                           trnman_is_locked))
+    {
+      DBUG_PRINT("info", ("removing history->trid: %lu  next: %lu",
+                          (ulong) history->trid, (ulong) last_trid));
+      my_free(history, MYF(0));
+      continue;
+    }
+    /* Keep this element: link it after the previously kept one */
+    *parent= history;
+    parent= &history->next;
+    last_trid= history->trid;
+  }
+  *parent= 0;                                /* Terminate the kept list */
+
+  /* parent is unchanged only if no element after the first one was kept */
+  if (all && parent == &org_history->next)
+  {
+    /* There is only one state left. Delete this if it's visible for all */
+    if (last_trid < trnman_get_min_trid())
+    {
+      my_free(org_history, MYF(0));
+      org_history= 0;
+    }
+  }
+  DBUG_RETURN(org_history);
+}
+
+
+/**
+   @brief Remove not used state history
+
+   @param share          Maria table information
+   @param all            1 if we should delete the first state if it's
+                         visible for all.  For the moment this is only used
+                         on close() of table.
+
+   @notes
+   share and trnman are not locked.
+
+   We must first lock trnman and then share->intern_lock. This is because
+   _ma_trnman_end_trans_hook() has a lock on trnman and then
+   takes share->intern_lock.
+*/
+
+void _ma_remove_not_visible_states_with_lock(MARIA_SHARE *share,
+                                             my_bool all)
+{
+  MARIA_STATE_HISTORY *pruned;
+  /* trnman is only locked if it has been initialized */
+  const my_bool trnman_locked= trman_is_inited();
+
+  /* Lock order: trnman first, then the share */
+  if (trnman_locked)
+    trnman_lock();
+  pthread_mutex_lock(&share->intern_lock);
+
+  pruned= _ma_remove_not_visible_states(share->state_history, all, 1);
+  share->state_history= pruned;
+
+  /* Release in the reverse order */
+  pthread_mutex_unlock(&share->intern_lock);
+  if (trnman_locked)
+    trnman_unlock();
+}
+
+
+/*
+  Collapse share->state_history to a single entry holding the current state
+
+  SYNOPSIS
+    _ma_reset_state()
+    info        Maria handler
+
+  NOTES
+    Used after repair as then all rows are visible for everyone.
+    Does nothing if the share has no state history.
+*/
+
+void _ma_reset_state(MARIA_HA *info)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_STATE_HISTORY *first= share->state_history;
+  MARIA_STATE_HISTORY *old, *following;
+
+  if (!first)
+    return;
+
+  /* The first history entry takes over the current state of the share */
+  first->state= share->state.state;
+  /* Let this handler use that entry both as start and as live state */
+  info->state= info->state_start= &first->state;
+
+  /* Free all older entries */
+  old= first->next;
+  while (old)
+  {
+    following= old->next;
+    my_free(old, MYF(0));
+    old= following;
+  }
+  first->next= 0;
+  first->trid= 0;                               /* Visible for all */
+}
+
+
+/****************************************************************************
+  The following functions are called by thr_lock() in threaded applications
+  for not transactional tables
+****************************************************************************/
+
+/*
+  Give the handler a private copy of the current status of the table
+
+  SYNOPSIS
+    _ma_get_status()
+    param		Pointer to Maria handler (MARIA_HA)
+    concurrent_insert	Set to 1 if we are going to do concurrent inserts
+			(THR_WRITE_CONCURRENT_INSERT was used)
+
+  NOTES
+    The shared status is copied to info->state_save, info->state is pointed
+    at that copy and its 'changed' flag is cleared.
+*/
+
+void _ma_get_status(void* param, my_bool concurrent_insert)
+{
+  MARIA_HA *info= (MARIA_HA*) param;
+  MARIA_STATUS_INFO *share_state= &info->s->state.state;
+  DBUG_ENTER("_ma_get_status");
+  DBUG_PRINT("info",("key_file: %ld  data_file: %ld  concurrent_insert: %d",
+                     (long) share_state->key_file_length,
+                     (long) share_state->data_file_length,
+                     concurrent_insert));
+#ifndef DBUG_OFF
+  /* Trace a warning if the handler's old view is ahead of the shared one */
+  if (share_state->key_file_length < info->state->key_file_length ||
+      share_state->data_file_length < info->state->data_file_length)
+    DBUG_PRINT("warning",("old info:  key_file: %ld  data_file: %ld",
+                          (long) info->state->key_file_length,
+                          (long) info->state->data_file_length));
+#endif
+  info->state_save= *share_state;
+  info->state= &info->state_save;
+  info->state_save.changed= 0;
+  info->append_insert_at_end= concurrent_insert;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Copy the handler's private status back to the share
+
+  SYNOPSIS
+    _ma_update_status()
+    param		Pointer to Maria handler (MARIA_HA)
+
+  NOTES
+    The copy is only done if info->state points at info->state_save.
+    info->append_insert_at_end is always reset.
+*/
+
+void _ma_update_status(void* param)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  /*
+    Because someone may have closed the table we point at, we only
+    update the state if its our own state.  This isn't a problem as
+    we are always pointing at our own lock or at a read lock.
+    (This is enforced by thr_multi_lock.c)
+  */
+  if (info->state == &info->state_save)
+  {
+    MARIA_SHARE *share= info->s;
+#ifndef DBUG_OFF
+    DBUG_PRINT("info",("updating status:  key_file: %ld  data_file: %ld",
+		       (long) info->state->key_file_length,
+		       (long) info->state->data_file_length));
+    /* Trace a warning if our copy is behind the shared state */
+    if (info->state->key_file_length < share->state.state.key_file_length ||
+	info->state->data_file_length < share->state.state.data_file_length)
+      DBUG_PRINT("warning",("old info:  key_file: %ld  data_file: %ld",
+			    (long) share->state.state.key_file_length,
+			    (long) share->state.state.data_file_length));
+#endif
+    /*
+      we are going to modify the state without lock's log, this would break
+      recovery if done with a transactional table.
+    */
+    DBUG_ASSERT(!info->s->base.born_transactional);
+    /* Publish our copy and go back to using the shared state directly */
+    share->state.state= *info->state;
+    info->state= &share->state.state;
+#ifdef HAVE_QUERY_CACHE
+    DBUG_PRINT("info", ("invalidator... '%s' (status update)",
+                        info->s->data_file_name.str));
+    DBUG_ASSERT(info->s->chst_invalidator != NULL);
+    /* Call the share's invalidator callback with the data file name */
+    (*info->s->chst_invalidator)((const char *)info->s->data_file_name.str);
+#endif
+
+  }
+  info->append_insert_at_end= 0;
+}
+
+
+/*
+  Same as _ma_update_status() but takes the table lock mutex first, to
+  protect against someone calling _ma_get_status() from thr_lock() at the
+  same time.
+
+  NOTES
+    The mutex is only taken if the handler uses its private state copy
+    (info->state_save). The update itself is done through the
+    share's lock.update_status callback.
+*/
+
+void _ma_update_status_with_lock(MARIA_HA *info)
+{
+  /* Decided before the callback runs, as the callback may change info->state */
+  const my_bool need_mutex= (info->state == &info->state_save);
+
+  if (need_mutex)
+    pthread_mutex_lock(&info->s->lock.mutex);
+
+  (*info->s->lock.update_status)(info);
+
+  if (need_mutex)
+    pthread_mutex_unlock(&info->s->lock.mutex);
+}
+
+
+/*
+  Make the handler use the shared status of the table again
+
+  SYNOPSIS
+    _ma_restore_status()
+    param		Pointer to Maria handler (MARIA_HA)
+*/
+
+void _ma_restore_status(void *param)
+{
+  MARIA_HA *handler= (MARIA_HA*) param;
+
+  handler->append_insert_at_end= 0;
+  handler->state= &handler->s->state.state;
+}
+
+
+/*
+  Let one handler use the saved status of another handler
+
+  SYNOPSIS
+    _ma_copy_status()
+    to			Pointer to Maria handler that gets a new state pointer
+    from		Pointer to Maria handler whose state_save is used
+
+  NOTES
+    Only the pointer is set; no status data is copied.
+*/
+
+void _ma_copy_status(void* to, void *from)
+{
+  MARIA_HA *target= (MARIA_HA*) to;
+  MARIA_HA *source= (MARIA_HA*) from;
+
+  target->state= &source->state_save;
+}
+
+
+/*
+  Clear the 'changed' flag in the status the handler currently points at
+
+  SYNOPSIS
+    _ma_reset_update_flag()
+    param		Pointer to Maria handler (MARIA_HA)
+    concurrent_insert	Not used
+*/
+
+void _ma_reset_update_flag(void *param,
+                           my_bool concurrent_insert __attribute__((unused)))
+{
+  ((MARIA_HA*) param)->state->changed= 0;
+}
+
+
+/**
+   @brief Check if concurrent inserts should be allowed
+
+   @param param   Pointer to Maria handler (MARIA_HA)
+
+   @implementation
+     Concurrent inserts are allowed if the table has no hole
+     (state.dellink == HA_OFFSET_ERROR), or if maria_concurrent_insert == 2,
+     there are active read locks and there is no other write lock.
+     In this last case the new row(s) are inserted at end of file instead
+     of filling up the hole.
+
+     The last case is to allow inserts into a heavily read-used table
+     even if there are holes.
+
+   @notes
+     If there are rtree indexes in the table, concurrent inserts are
+     disabled in maria_open()
+
+  @return
+  @retval 0  ok to use concurrent inserts
+  @retval 1  not ok
+*/
+
+my_bool _ma_check_status(void *param)
+{
+  MARIA_HA *info= (MARIA_HA*) param;
+  DBUG_PRINT("info",("dellink: %ld  r_locks: %u  w_locks: %u",
+                     (long) info->s->state.dellink, (uint) info->s->r_locks,
+                     (uint) info->s->w_locks));
+
+  /* No hole in the table */
+  if (info->s->state.dellink == HA_OFFSET_ERROR)
+    return (my_bool) 0;
+
+  /*
+    The test for w_locks == 1 is here because this thread has already done
+    an external lock (in other words: w_locks == 1 means no other thread
+    has a write lock)
+  */
+  if (maria_concurrent_insert == 2 && info->s->r_locks &&
+      info->s->w_locks == 1)
+    return (my_bool) 0;
+
+  return (my_bool) 1;
+}
+
+
+/**
+   @brief Hook run at end of transaction to store status for all used tables
+
+   @param trn                  Transaction that is ending
+   @param commit               Non-zero if the transaction commits; if 0 no
+                               table state is published
+   @param active_transactions  Non-zero if a new history entry may be
+                               needed (tested together with
+                               share->now_transactional and
+                               trnman_exists_active_transactions())
+
+   @notes
+   This function must be called under trnman_lock in trnman_end_trn()
+   because of the following reasons:
+   - After trnman_end_trn() is called, the current transaction will be
+   regarded as committed and all used tables state_history will be
+   visible to other transactions.  To do this, we loop over all used
+   tables and create/update a history entries that contains the correct
+   state_history for them.
+
+   All entries in trn->used_tables are freed and the list is reset,
+   whether commit is set or not.
+
+   @return
+   @retval 0  ok
+   @retval 1  error (out of memory when allocating a history entry)
+*/
+
+my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
+                                  my_bool active_transactions)
+{
+  my_bool error= 0;
+  MARIA_USED_TABLES *tables, *next;
+  DBUG_ENTER("_ma_trnman_end_trans_hook");
+  
+  for (tables= (MARIA_USED_TABLES*) trn->used_tables;
+       tables;
+       tables= next)
+  {
+    MARIA_SHARE *share= tables->share;
+    next= tables->next;                 /* Saved as tables is freed below */
+    if (commit)
+    {
+      MARIA_STATE_HISTORY *history;
+
+      pthread_mutex_lock(&share->intern_lock);
+
+      /* We only have to update history state if something changed */
+      if (tables->state_current.changed)
+      {
+        if (tables->state_current.no_transid)
+        {
+          /*
+            The change was done without using transid on rows (like in
+            bulk insert). In this case this thread is the only one
+            that is using the table and all rows will be visible
+            for all transactions.
+          */
+          _ma_reset_history(share);
+        }
+        else
+        {
+          if (active_transactions && share->now_transactional &&
+              trnman_exists_active_transactions(share->state_history->trid,
+                                                trn->commit_trid, 1))
+          {
+            /*
+              There exist transactions that are still using the current
+              share->state_history.  Create a new history item for this
+              commit and add it first in the state_history list. This
+              ensures that all history items are stored in the list in
+              decreasing trid order.
+            */
+            if (!(history= my_malloc(sizeof(*history), MYF(MY_WME))))
+            {
+              /* purecov: begin inspected */
+              error= 1;
+              /*
+                The table entry is freed here even though no history could
+                be stored. Release the share from this transaction too, as
+                is done on the normal path; otherwise in_trans would stay
+                one too high.
+              */
+              share->in_trans--;
+              pthread_mutex_unlock(&share->intern_lock);
+              my_free(tables, MYF(0));
+              continue;
+              /* purecov: end */
+            }
+            /* Start from the newest state and link the new entry first */
+            history->state= share->state_history->state;
+            history->next= share->state_history;
+            share->state_history= history;
+          }
+          else
+          {
+            /* Previous history can't be seen by anyone, reuse old memory */
+            history= share->state_history;
+            DBUG_PRINT("info", ("removing history->trid: %lu  new: %lu",
+                                (ulong) history->trid,
+                                (ulong) trn->commit_trid));
+          }
+
+          /* Apply the changes made by this transaction (current - start) */
+          history->state.records+= (tables->state_current.records -
+                                    tables->state_start.records);
+          history->state.checksum+= (tables->state_current.checksum -
+                                     tables->state_start.checksum);
+          history->trid= trn->commit_trid;
+
+          share->state.last_change_trn= trn->commit_trid;
+
+          if (history->next)
+          {
+            /* Remove not visible states */
+            share->state_history= _ma_remove_not_visible_states(history, 0, 1);
+          }
+          DBUG_PRINT("info", ("share: 0x%lx  in_trans: %d",
+                              (ulong) share, share->in_trans));
+        }
+      }
+      share->in_trans--;
+      pthread_mutex_unlock(&share->intern_lock);
+    }
+    else
+    {
+#ifndef DBUG_OFF
+      /*
+        We need to keep share->in_trans correct in the debug library
+        because of the assert in maria_close()
+      */
+      pthread_mutex_lock(&share->intern_lock);
+      share->in_trans--;
+      pthread_mutex_unlock(&share->intern_lock);
+#endif
+    }
+    my_free(tables, MYF(0));
+  }
+  trn->used_tables= 0;
+  DBUG_RETURN(error);
+}
+
+
+/**
+   Remove the entry for a table from the used_tables list of a transaction
+
+   @param share   Table to remove
+   @param trn     Transaction whose used_tables list is searched
+
+   @notes
+     This is used when we unlock a table from a group of locked tables
+     just before doing a rename or drop table.
+
+     share->intern_lock must be locked when function is called.
+
+     If an entry is found it is unlinked and freed and share->in_trans
+     is decremented. At most one entry is removed.
+*/
+
+void _ma_remove_table_from_trnman(MARIA_SHARE *share, TRN *trn)
+{
+  MARIA_USED_TABLES *entry, **link;
+  DBUG_ENTER("_ma_remove_table_from_trnman");
+  DBUG_PRINT("enter", ("share: 0x%lx  in_trans: %d",
+                       (ulong) share, share->in_trans));
+
+  safe_mutex_assert_owner(&share->intern_lock);
+
+  /* 'link' is the pointer that points at the entry we are looking at */
+  link= (MARIA_USED_TABLES**) (char*) &trn->used_tables;
+  while ((entry= *link))
+  {
+    if (entry->share != share)
+    {
+      link= &entry->next;
+      continue;
+    }
+    *link= entry->next;
+    share->in_trans--;
+    DBUG_PRINT("info", ("in_trans: %d", share->in_trans));
+    my_free(entry, MYF(0));
+    break;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+
+/****************************************************************************
+  The following functions are called by thr_lock() in threaded applications
+  for transactional tables.
+****************************************************************************/
+
+/*
+  Set up row format information in the handler for the lock that was taken
+
+  SYNOPSIS
+    _ma_block_get_status()
+    param		Pointer to Maria handler (MARIA_HA)
+    concurrent_insert	Set to 1 if we are going to do concurrent inserts
+			(THR_WRITE_CONCURRENT_INSERT was used)
+
+  NOTES
+    row_flag and row_base_length are reset to the defaults of the share.
+    For concurrent inserts ROW_FLAG_TRANSID is added and the base length
+    is increased by TRANSID_SIZE.
+*/
+
+void _ma_block_get_status(void* param, my_bool concurrent_insert)
+{
+  MARIA_HA *info= (MARIA_HA*) param;
+  DBUG_ENTER("_ma_block_get_status");
+  DBUG_PRINT("enter", ("concurrent_insert %d", concurrent_insert));
+
+  info->row_flag= info->s->base.default_row_flag;
+  info->row_base_length= info->s->base_length;
+
+  if (!concurrent_insert)
+  {
+    DBUG_ASSERT(info->lock.type != TL_WRITE_CONCURRENT_INSERT);
+    DBUG_VOID_RETURN;
+  }
+
+  DBUG_ASSERT(info->lock.type == TL_WRITE_CONCURRENT_INSERT);
+  info->row_flag|= ROW_FLAG_TRANSID;
+  info->row_base_length+= TRANSID_SIZE;
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Prepare the handler for a new transaction/statement on a block format table
+
+  SYNOPSIS
+    _ma_block_start_trans()
+    param		Pointer to Maria handler (MARIA_HA)
+
+  RETURN
+    0  ok
+    1  error (result of _ma_setup_live_state() or maria_create_trn_hook())
+*/
+
+my_bool _ma_block_start_trans(void* param)
+{
+  MARIA_HA *info= (MARIA_HA*) param;
+  my_bool res= 0;
+
+  if (info->s->lock_key_trees)
+  {
+    /*
+      Assume for now that this doesn't fail (It can only fail in
+      out of memory conditions)
+      TODO: Fix this by having one extra state pre-allocated
+    */
+    res= _ma_setup_live_state(info);
+  }
+  else if (info->s->base.born_transactional && !info->trn)
+  {
+    /*
+      Info->trn is set if this table is already handled and we are
+      called from maria_versioning(); then there is nothing to do.
+
+      Assume for now that this doesn't fail (It can only fail in
+      out of memory conditions)
+    */
+    res= (maria_create_trn_hook(info) != 0);
+  }
+  return res;
+}
+
+
+/* Intentionally empty: the parameter is unused and nothing is updated */
+void _ma_block_update_status(void *param __attribute__((unused)))
+{
+}
+
+/* Intentionally empty: the parameter is unused and nothing is restored */
+void _ma_block_restore_status(void *param __attribute__((unused)))
+{
+}
+
+
+/**
+  Check if concurrent inserts should be allowed
+
+  @param param   Not used
+
+  @note This version always answers "ok"
+
+  @return
+  @retval 0  ok to use concurrent inserts
+  @retval 1  not ok
+*/
+
+my_bool _ma_block_check_status(void *param __attribute__((unused)))
+{
+  const my_bool not_ok= 0;
+  return not_ok;
+}
+
+
+/*
+  Start of transaction/statement for tables that are transactional but
+  not versioned
+
+  SYNOPSIS
+    _ma_block_start_trans_no_versioning()
+    param		Pointer to Maria handler (MARIA_HA)
+
+  NOTES
+    Clears the 'changed' flag of the current state and, if the handler
+    has no transaction yet, calls maria_create_trn_hook().
+
+  RETURN
+    0  ok
+    1  error (maria_create_trn_hook() returned non-zero)
+*/
+
+my_bool _ma_block_start_trans_no_versioning(void* param)
+{
+  MARIA_HA *info=(MARIA_HA*) param;
+  /* The trace tag must match the function name to be useful in DBUG logs */
+  DBUG_ENTER("_ma_block_start_trans_no_versioning");
+  DBUG_ASSERT(info->s->base.born_transactional);
+
+  info->state->changed= 0;              /* from _ma_reset_update_flag() */
+  if (!info->trn)
+  {
+    /*
+      Assume for now that this doesn't fail (It can only fail in
+      out of memory conditions)
+
+      The hook returns int; normalize it to 0/1 as in
+      _ma_block_start_trans() so that no non-zero value can be truncated
+      to 0 when converted to my_bool.
+    */
+    DBUG_RETURN(maria_create_trn_hook(info) != 0);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Enable/disable versioning
+
+  @param info        Maria handler
+  @param versioning  Non-zero to set the handler up as for concurrent
+                     insert (rows with transid), 0 as for a normal write
+
+  @note Does nothing if the share does not have versioning.
+*/
+
+void maria_versioning(MARIA_HA *info, my_bool versioning)
+{
+  enum thr_lock_type saved_type;
+  MARIA_SHARE *share= info->s;
+
+  /* For now, this is a hack */
+  if (!share->have_versioning)
+    return;
+
+  /* Assume is a non threaded application (for now) */
+  share->lock_key_trees= 0;
+
+  /* Set up info->lock.type temporary for _ma_block_get_status() */
+  saved_type= info->lock.type;
+  info->lock.type= versioning ? TL_WRITE_CONCURRENT_INSERT : TL_WRITE;
+  _ma_block_get_status((void*) info, versioning);
+  info->lock.type= saved_type;
+
+  info->state= info->state_start= &share->state.common;
+}
+
+
+/**
+   Grow the data_file_length stored in the share to a new length
+
+   @param share       Maria table information
+   @param new_length  Proposed new length; ignored unless it is bigger
+                      than the stored length
+
+   NOTES
+     Only used by block records.
+     The update is done under share->intern_lock.
+*/
+
+void _ma_set_share_data_file_length(MARIA_SHARE *share, ulonglong new_length)
+{
+  my_off_t *stored_length= &share->state.state.data_file_length;
+
+  pthread_mutex_lock(&share->intern_lock);
+  if (new_length > *stored_length)
+    *stored_length= new_length;
+  pthread_mutex_unlock(&share->intern_lock);
+}
+
+
+/**
+   Copy state information that was updated while the table was used
+   in not transactional mode
+
+   @param info   Maria handler; the records and checksum values of
+                 info->state are stored in the state of the share
+*/
+
+void _ma_copy_nontrans_state_information(MARIA_HA *info)
+{
+  const MARIA_STATUS_INFO *from= info->state;
+  MARIA_STATUS_INFO *to= &info->s->state.state;
+
+  to->records=  from->records;
+  to->checksum= from->checksum;
+}
+
+
+/*
+  Reduce share->state_history to one entry that is visible by all
+
+  SYNOPSIS
+    _ma_reset_history()
+    share		Maria table information
+
+  NOTES
+    The first history entry gets the current state of the share and
+    trid 0. All other entries are freed.
+    share->state_history is dereferenced without a NULL check.
+*/
+
+void _ma_reset_history(MARIA_SHARE *share)
+{
+  MARIA_STATE_HISTORY *head= share->state_history;
+  MARIA_STATE_HISTORY *old, *following;
+  DBUG_ENTER("_ma_reset_history");
+
+  head->trid= 0;                        /* Visible by all */
+  head->state= share->state.state;
+
+  /* Detach the rest of the list before freeing it */
+  old= head->next;
+  head->next= 0;
+
+  while (old)
+  {
+    following= old->next;
+    my_free(old, MYF(0));
+    old= following;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  Virtual functions to check if row is visible
+****************************************************************************/
+
+/**
+   Row visibility check that accepts every row
+   This is for tables without concurrent insert
+
+   @param info   Not used
+
+   @return 1 (always visible)
+*/
+
+my_bool _ma_row_visible_always(MARIA_HA *info __attribute__((unused)))
+{
+  return (my_bool) 1;
+}
+
+
+/**
+   Row visibility for non transactional tables with concurrent insert
+
+   @param info   Maria handler; info->cur_row.lastpos is the position of
+                 the row to check
+
+   @implementation
+   When we got our table lock, we saved the current
+   data_file_length. Concurrent inserts always go to the end of the
+   file. So a row positioned at or after that length is a new record
+   that we should not see.
+
+   @return Non-zero if the row position is below info->state->data_file_length
+*/
+
+my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info)
+{
+  return info->state->data_file_length > info->cur_row.lastpos;
+}
+
+
+/**
+   Row visibility for transactional tables with versioning
+
+   @param info   Maria handler; the check is done for info->trn against
+                 the transaction id of the current row (info->cur_row.trid)
+
+   @TODO
+   Add test if found key was marked deleted and it was deleted by
+   us. In that case we should return 0
+
+   @return Result of trnman_can_read_from(), converted to my_bool
+*/
+
+my_bool _ma_row_visible_transactional_table(MARIA_HA *info)
+{
+  my_bool visible= trnman_can_read_from(info->trn, info->cur_row.trid);
+  return visible;
+}
diff --git a/storage/maria/ma_state.h b/storage/maria/ma_state.h
new file mode 100644
index 00000000000..03ce5c2ea8c
--- /dev/null
+++ b/storage/maria/ma_state.h
@@ -0,0 +1,86 @@
+/* Copyright (C) 2008 Sun AB & Michael Widenius
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  Status counters for one table. Embedded in MARIA_USED_TABLES (the list
+  of tables in use by one transaction) and in MARIA_STATE_HISTORY.
+*/
+
+typedef struct st_maria_status_info
+{
+  ha_rows records;                      /* Rows in table */
+  ha_rows del;                          /* Removed rows */
+  my_off_t empty;                       /* lost space in datafile */
+  my_off_t key_empty;                   /* lost space in indexfile */
+  my_off_t key_file_length;             /* Length of index file */
+  my_off_t data_file_length;            /* Length of data file */
+  ha_checksum checksum;                 /* Table checksum */
+  uint32 changed:1,                     /* Set if table was changed */
+         no_transid:1;                  /* Set if no transid was set on rows */
+} MARIA_STATUS_INFO;
+
+
+/* One entry per table used by a transaction (linked from trn->used_tables) */
+
+typedef struct st_used_tables {
+  struct st_used_tables *next;          /* Next table used by transaction */
+  struct st_maria_share *share;         /* Table this entry is for */
+  MARIA_STATUS_INFO state_current;      /* Live state for the transaction */
+  MARIA_STATUS_INFO state_start;        /* State when table was first used */
+} MARIA_USED_TABLES;
+
+
+/* Struct to store commit state at different times */
+
+typedef struct st_state_history {
+  struct st_state_history *next;        /* Next (older) history entry */
+  TrID trid;                            /* Commit trid; 0 = visible for all */
+  MARIA_STATUS_INFO state;              /* Table state as of this commit */
+} MARIA_STATE_HISTORY;
+
+
+/* struct to remember history for closed tables */
+
+typedef struct st_state_history_closed {
+  /* NOTE(review): presumably identifies the closed table - confirm */
+  LSN create_rename_lsn;
+  MARIA_STATE_HISTORY *state_history;   /* History list kept for the table */
+} MARIA_STATE_HISTORY_CLOSED;
+
+
+my_bool _ma_setup_live_state(MARIA_HA *info);
+MARIA_STATE_HISTORY *_ma_remove_not_visible_states(MARIA_STATE_HISTORY
+                                                   *org_history,
+                                                   my_bool all,
+                                                   my_bool trman_is_locked);
+void _ma_reset_state(MARIA_HA *info);
+void _ma_get_status(void* param, my_bool concurrent_insert);
+void _ma_update_status(void* param);
+void _ma_update_status_with_lock(MARIA_HA *info);
+void _ma_restore_status(void *param);
+void _ma_copy_status(void* to, void *from);
+void _ma_reset_update_flag(void *param, my_bool concurrent_insert);
+my_bool _ma_check_status(void *param);
+void _ma_block_get_status(void* param, my_bool concurrent_insert);
+void _ma_block_update_status(void *param);
+void _ma_block_restore_status(void *param);
+my_bool _ma_block_check_status(void *param);
+void maria_versioning(MARIA_HA *info, my_bool versioning);
+void _ma_set_share_data_file_length(struct st_maria_share *share,
+                                    ulonglong new_length);
+void _ma_copy_nontrans_state_information(MARIA_HA *info);
+my_bool _ma_trnman_end_trans_hook(TRN *trn, my_bool commit,
+                                  my_bool active_transactions);
+my_bool _ma_row_visible_always(MARIA_HA *info);
+my_bool _ma_row_visible_non_transactional_table(MARIA_HA *info);
+my_bool _ma_row_visible_transactional_table(MARIA_HA *info);
+void _ma_remove_not_visible_states_with_lock(struct st_maria_share *share,
+                                             my_bool all);
+void _ma_remove_table_from_trnman(struct st_maria_share *share, TRN *trn);
+void _ma_reset_history(struct st_maria_share *share);
diff --git a/storage/maria/ma_static.c b/storage/maria/ma_static.c
new file mode 100644
index 00000000000..917385f9568
--- /dev/null
+++ b/storage/maria/ma_static.c
@@ -0,0 +1,109 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+/*
+  Static variables for MARIA library. All defined here for easy making of
+  a shared library
+*/
+
+#ifndef _global_h
+#include "maria_def.h"
+#include "trnman.h"
+#endif
+
+/* NOTE(review): presumably the list of open Maria tables - confirm */
+LIST	*maria_open_list=0;
+/* 4 byte magic markers; the pack file one differs in the last two bytes */
+uchar	maria_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 9, '\003', };
+uchar	maria_pack_file_magic[]=
+{ (uchar) 254, (uchar) 254, (uchar) 10, '\001', };
+/* Unique number for this maria instance */
+uchar   maria_uuid[MY_UUID_SIZE];
+uint	maria_quick_table_bits=9;
+ulong	maria_block_size= MARIA_KEY_BLOCK_LENGTH;
+my_bool maria_flush= 0, maria_single_user= 0;
+my_bool maria_delay_key_write= 0, maria_page_checksums= 1;
+my_bool maria_inited= FALSE;
+my_bool maria_in_ha_maria= FALSE; /* If used from ha_maria or not */
+my_bool maria_recovery_changed_data= 0, maria_recovery_verbose= 0;
+pthread_mutex_t THR_LOCK_maria;
+/* Concurrent insert defaults to 2 only with THREAD and rw locks, else 0 */
+#if defined(THREAD) && !defined(DONT_USE_RW_LOCKS)
+ulong maria_concurrent_insert= 2;
+#else
+ulong maria_concurrent_insert= 0;
+#endif
+my_off_t maria_max_temp_length= MAX_FILE_SIZE;
+ulong    maria_bulk_insert_tree_size=8192*1024;
+ulong    maria_data_pointer_size= 4;
+
+/* The page cache pointers initially point at the static objects below */
+PAGECACHE maria_pagecache_var;
+PAGECACHE *maria_pagecache= &maria_pagecache_var;
+
+PAGECACHE maria_log_pagecache_var;
+PAGECACHE *maria_log_pagecache= &maria_log_pagecache_var;
+MY_TMPDIR *maria_tmpdir;                        /* Tempdir for redo */
+char *maria_data_root;
+HASH maria_stored_state;
+/* No initializer: NULL until a hook is installed elsewhere */
+int (*maria_create_trn_hook)(MARIA_HA *);
+
+/**
+   @brief when transactionality does not matter we can use this transaction
+
+   Used in external programs like ma_test*, and also internally inside
+   libmaria when there is no transaction around and the operation isn't
+   transactional (CREATE/DROP/RENAME/OPTIMIZE/REPAIR).
+*/
+TRN dummy_transaction_object;
+
+/* a WT_RESOURCE_TYPE for transactions waiting on a unique key conflict */
+WT_RESOURCE_TYPE ma_rc_dup_unique={ wt_resource_id_memcmp, 0};
+
+/* Enough for comparing if number is zero */
+uchar maria_zero_string[]= {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+/*
+  read_vec[] is used for converting between HA_READ_KEY.. and SEARCH_
+  flags. There is one entry per search mode, in the order given by the
+  HA_READ_* name in the comment on each entry.
+*/
+
+uint32 maria_read_vec[]=
+{
+  SEARCH_FIND,                               /* HA_READ_KEY_EXACT */
+  SEARCH_FIND | SEARCH_BIGGER,               /* HA_READ_KEY_OR_NEXT */
+  SEARCH_FIND | SEARCH_SMALLER,              /* HA_READ_KEY_OR_PREV */
+  SEARCH_NO_FIND | SEARCH_BIGGER,            /* HA_READ_AFTER_KEY */
+  SEARCH_NO_FIND | SEARCH_SMALLER,	     /* HA_READ_BEFORE_KEY */
+  SEARCH_FIND | SEARCH_PART_KEY,	     /* HA_READ_PREFIX */
+  SEARCH_LAST,                               /* HA_READ_PREFIX_LAST */
+  SEARCH_LAST | SEARCH_SMALLER,              /* HA_READ_PREFIX_LAST_OR_PREV */
+  MBR_CONTAIN,                               /* HA_READ_MBR_CONTAIN */
+  MBR_INTERSECT,                             /* HA_READ_MBR_INTERSECT */
+  MBR_WITHIN,                                /* HA_READ_MBR_WITHIN */
+  MBR_DISJOINT,                              /* HA_READ_MBR_DISJOINT */
+  MBR_EQUAL                                  /* HA_READ_MBR_EQUAL */
+};
+
+/*
+  Search direction (SEARCH_BIGGER or SEARCH_SMALLER) per search mode.
+  NOTE(review): the 8 entries presumably parallel the first 8 entries of
+  maria_read_vec[] (HA_READ_KEY_EXACT .. HA_READ_PREFIX_LAST_OR_PREV) -
+  confirm against the users of this table.
+*/
+uint32 maria_readnext_vec[]=
+{
+  SEARCH_BIGGER, SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_BIGGER, SEARCH_SMALLER,
+  SEARCH_BIGGER, SEARCH_SMALLER, SEARCH_SMALLER
+};
+
+/* Symlink check that ignores the file name and always returns 0 */
+static int always_valid(const char *filename __attribute__((unused)))
+{
+  const int result= 0;
+  return result;
+}
+
+int (*maria_test_invalid_symlink)(const char *filename)= always_valid;
diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c
new file mode 100644
index 00000000000..0aa3a3acbc1
--- /dev/null
+++ b/storage/maria/ma_statrec.c
@@ -0,0 +1,302 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+	/* Functions to handle fixed-length-records */
+
+#include "maria_def.h"
+
+/* Write a fixed-length row, reusing a deleted slot when one is linked */
+my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record)
+{
+  uchar temp[8];                                 /* row pointer / zero fill */
+  if (info->s->state.dellink != HA_OFFSET_ERROR &&
+      !info->append_insert_at_end)
+  {                                             /* Reuse deleted slot */
+    my_off_t filepos=info->s->state.dellink;
+    info->rec_cache.seek_not_done=1;		/* We have done a seek */
+    if (info->s->file_read(info, &temp[0],info->s->base.rec_reflength,
+		info->s->state.dellink+1,
+		 MYF(MY_NABP)))
+      goto err;
+    info->s->state.dellink= _ma_rec_pos(info->s, temp); /* Next deleted slot */
+    info->state->del--;
+    info->state->empty-=info->s->base.pack_reclength;
+    if (info->s->file_write(info, record, info->s->base.reclength,
+                            filepos, MYF(MY_NABP)))
+      goto err;
+  }
+  else
+  {                                             /* Append at end of data */
+    if (info->state->data_file_length > info->s->base.max_data_file_length-
+	info->s->base.pack_reclength)
+    {
+      my_errno=HA_ERR_RECORD_FILE_FULL;
+      return(2);                                /* Data file is full */
+    }
+    if (info->opt_flag & WRITE_CACHE_USED)
+    {				/* Cache in use */
+      if (my_b_write(&info->rec_cache, record,
+		     info->s->base.reclength))
+	goto err;
+      if (info->s->base.pack_reclength != info->s->base.reclength)
+      {                                         /* Zero-fill up to pack_reclength */
+	uint length=info->s->base.pack_reclength - info->s->base.reclength;
+	bzero(temp,length);       /* NOTE(review): assumes length <= sizeof(temp) */
+	if (my_b_write(&info->rec_cache, temp,length))
+	  goto err;
+      }
+    }
+    else
+    {
+      info->rec_cache.seek_not_done=1;		/* We have done a seek */
+      if (info->s->file_write(info, record, info->s->base.reclength,
+                              info->state->data_file_length,
+                              info->s->write_flag))
+        goto err;
+      if (info->s->base.pack_reclength != info->s->base.reclength)
+      {                                         /* Zero-fill up to pack_reclength */
+	uint length=info->s->base.pack_reclength - info->s->base.reclength;
+	bzero(temp,length);       /* NOTE(review): assumes length <= sizeof(temp) */
+	if (info->s->file_write(info, temp,length,
+		      info->state->data_file_length+
+		      info->s->base.reclength,
+		      info->s->write_flag))
+    goto err;
+      }
+    }
+    info->state->data_file_length+=info->s->base.pack_reclength;
+    info->s->state.split++;
+  }
+  return 0;                                     /* Row written */
+ err:
+  return 1;                                     /* Read or write failed */
+}
+/* Overwrite the row at 'pos' with 'record'; returns 1 if the write fails */
+my_bool _ma_update_static_record(MARIA_HA *info, MARIA_RECORD_POS pos,
+                                 const uchar *oldrec __attribute__ ((unused)),
+                                 const uchar *record)
+{
+  info->rec_cache.seek_not_done=1;		/* We have done a seek */
+  return (info->s->file_write(info,
+                              record, info->s->base.reclength,
+		    pos,
+		    MYF(MY_NABP)) != 0);
+}
+
+/* Mark the current row (cur_row.lastpos) deleted and link it into dellink */
+my_bool _ma_delete_static_record(MARIA_HA *info,
+                                 const uchar *record __attribute__ ((unused)))
+{
+  uchar temp[9];                                 /* 1 + max pointer length */
+  info->state->del++;
+  info->state->empty+=info->s->base.pack_reclength;
+  temp[0]= '\0';			/* Mark that record is deleted */
+  _ma_dpointer(info->s, temp+1, info->s->state.dellink); /* Old list head */
+  info->s->state.dellink= info->cur_row.lastpos;          /* New list head */
+  info->rec_cache.seek_not_done=1;
+  return (info->s->file_write(info, temp, 1+info->s->rec_reflength,
+		    info->cur_row.lastpos, MYF(MY_NABP)) != 0);
+}
+
+/* Verify that the current row on disk equals 'old'; returns 1 if not */
+my_bool _ma_cmp_static_record(register MARIA_HA *info,
+                              register const uchar *old)
+{
+  DBUG_ENTER("_ma_cmp_static_record");
+
+  /* We are going to do changes; don't let anybody disturb */
+  dont_break();				/* Don't allow SIGHUP or SIGINT */
+
+  if (info->opt_flag & WRITE_CACHE_USED)
+  {
+    if (flush_io_cache(&info->rec_cache))
+    {
+      DBUG_RETURN(1);
+    }
+    info->rec_cache.seek_not_done=1;		/* We have done a seek */
+  }
+
+  if ((info->opt_flag & READ_CHECK_USED))
+  {						/* If check isn't disabled  */
+    info->rec_cache.seek_not_done=1;		/* We have done a seek */
+    if (info->s->file_read(info, info->rec_buff, info->s->base.reclength,
+                           info->cur_row.lastpos, MYF(MY_NABP)))
+      DBUG_RETURN(1);
+    if (memcmp(info->rec_buff, old, (uint) info->s->base.reclength))
+    {
+      DBUG_DUMP("read",old,info->s->base.reclength);
+      DBUG_DUMP("disk",info->rec_buff,info->s->base.reclength);
+      my_errno=HA_ERR_RECORD_CHANGED;		/* Record has changed */
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+/* Read row at 'pos' into rec_buff; return _ma_unique_comp() vs 'record' */
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                              const uchar *record, MARIA_RECORD_POS pos)
+{
+  DBUG_ENTER("_ma_cmp_static_unique");
+
+  info->rec_cache.seek_not_done=1;		/* We have done a seek */
+  if (info->s->file_read(info, info->rec_buff, info->s->base.reclength,
+	       pos, MYF(MY_NABP)))
+    DBUG_RETURN(1);                             /* Read failed */
+  DBUG_RETURN(_ma_unique_comp(def, record, info->rec_buff,
+                              def->null_are_equal));
+}
+
+
+/*
+  Read a fixed-length-record
+
+  RETURN
+    0                      Ok
+    HA_ERR_RECORD_DELETED  Record is deleted (also stored in my_errno)
+    my_errno               On flush/read error or if pos is HA_OFFSET_ERROR
+*/
+
+int _ma_read_static_record(register MARIA_HA *info, register uchar *record,
+                           MARIA_RECORD_POS pos)
+{
+  int error;
+  DBUG_ENTER("_ma_read_static_record");
+
+  if (pos != HA_OFFSET_ERROR)
+  {
+    if (info->opt_flag & WRITE_CACHE_USED &&
+	info->rec_cache.pos_in_file <= pos &&
+	flush_io_cache(&info->rec_cache))
+      DBUG_RETURN(my_errno);
+    info->rec_cache.seek_not_done=1;		/* We have done a seek */
+
+    error= (int) info->s->file_read(info, record,info->s->base.reclength,
+                                    pos, MYF(MY_NABP));
+    if (! error)
+    {
+      fast_ma_writeinfo(info);
+      if (!*record)
+      {
+        /* Record is deleted (first byte of the row is zero) */
+        DBUG_PRINT("warning", ("Record is deleted"));
+	DBUG_RETURN((my_errno=HA_ERR_RECORD_DELETED));
+      }
+      info->update|= HA_STATE_AKTIV;		/* Record is read */
+      DBUG_RETURN(0);
+    }
+  }
+  fast_ma_writeinfo(info);			/* No such record */
+  DBUG_RETURN(my_errno);
+}
+
+
+/**
+   @brief  Read record from given position or next record
+   @return 0 if ok, else error (HA_ERR_END_OF_FILE at end of data file)
+   @note
+     When scanning, this function will return HA_ERR_RECORD_DELETED
+     for deleted rows even if skip_deleted_blocks is set.
+     The reason for this is to allow the caller to calculate the record
+     position without having to call maria_position() for each record.
+*/
+
+int _ma_read_rnd_static_record(MARIA_HA *info, uchar *buf,
+                               MARIA_RECORD_POS filepos,
+                               my_bool skip_deleted_blocks)
+{
+  int locked,error,cache_read;
+  uint cache_length;
+  MARIA_SHARE *share= info->s;
+  DBUG_ENTER("_ma_read_rnd_static_record");
+
+  cache_read=0;
+  cache_length=0;
+  if (info->opt_flag & READ_CACHE_USED)
+  {						/* Cache in use */
+    if (filepos == my_b_tell(&info->rec_cache) &&
+	(skip_deleted_blocks || !filepos))
+    {
+      cache_read=1;				/* Read record using cache */
+      cache_length= (uint) (info->rec_cache.read_end -
+                            info->rec_cache.read_pos);
+    }
+    else
+      info->rec_cache.seek_not_done=1;		/* Filepos is changed */
+  }
+  locked=0;
+  if (info->lock_type == F_UNLCK)
+  {
+    if (filepos >= info->state->data_file_length)
+    {						/* Test if new records */
+      if (_ma_readinfo(info,F_RDLCK,0))
+	DBUG_RETURN(my_errno);
+      locked=1;
+    }
+    else
+    {						/* We don't need new info */
+#ifndef UNSAFE_LOCKING
+      if ((! cache_read || share->base.reclength > cache_length) &&
+	  share->tot_locks == 0)
+      {						/* record not in cache */
+	locked=1;
+      }
+#else
+      info->tmp_lock_type=F_RDLCK;
+#endif
+    }
+  }
+  if (filepos >= info->state->data_file_length)
+  {
+    DBUG_PRINT("test",("filepos: %ld (%ld)  records: %ld  del: %ld",
+		       (long) filepos/share->base.reclength, (long) filepos,
+		       (long) info->state->records, (long) info->state->del));
+    fast_ma_writeinfo(info);
+    DBUG_RETURN(my_errno=HA_ERR_END_OF_FILE);
+  }
+  info->cur_row.lastpos= filepos;
+  info->cur_row.nextpos= filepos+share->base.pack_reclength;
+
+  if (! cache_read)			/* No caching */
+  {
+    error= _ma_read_static_record(info, buf, filepos);
+    DBUG_RETURN(error);
+  }
+
+	/* Read record with caching */
+  error=my_b_read(&info->rec_cache, buf, share->base.reclength);
+  if (info->s->base.pack_reclength != info->s->base.reclength && !error)
+  {
+    uchar tmp[8];				/* Skip fill bytes */
+    error=my_b_read(&info->rec_cache, tmp,
+		    info->s->base.pack_reclength - info->s->base.reclength);
+  }
+  if (locked)
+    VOID(_ma_writeinfo(info,0));		/* Unlock keyfile */
+  if (!error)
+  {
+    if (!buf[0])
+    {						/* Record is removed */
+      DBUG_RETURN(my_errno=HA_ERR_RECORD_DELETED);
+    }
+						/* Found and may be updated */
+    info->update|= HA_STATE_AKTIV | HA_STATE_KEY_CHANGED;
+    DBUG_RETURN(0);
+  }
+  /* my_errno should be set if rec_cache.error == -1 */
+  if (info->rec_cache.error != -1 || my_errno == 0)
+    my_errno=HA_ERR_WRONG_IN_RECORD;
+  DBUG_RETURN(my_errno);			/* Something wrong (EOF?) */
+}
diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c
new file mode 100644
index 00000000000..affa3a71634
--- /dev/null
+++ b/storage/maria/ma_test1.c
@@ -0,0 +1,899 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Testing of the basic functions of a MARIA table */
+
+#include "maria_def.h"
+#include <my_getopt.h>
+#include <m_string.h>
+#include "ma_control_file.h"
+#include "ma_loghandler.h"
+#include "ma_checkpoint.h"
+#include "trnman.h"
+
+extern PAGECACHE *maria_log_pagecache;
+extern char *maria_data_root;
+
+#define MAX_REC_LENGTH 1024                     /* Size of row/key buffers */
+
+static void usage();
+
+static int rec_pointer_size=0, flags[50], testflag, checkpoint;
+static int key_field=FIELD_SKIP_PRESPACE,extra_field=FIELD_SKIP_ENDSPACE;
+static int key_type=HA_KEYTYPE_NUM;
+static int create_flag=0;
+static ulong blob_length;                       /* --key-blob */
+static enum data_file_type record_type= DYNAMIC_RECORD;
+
+static uint insert_count, update_count, remove_count; /* Stop after N ops */
+static uint pack_keys=0, pack_seg=0, key_length;
+static uint unique_key=HA_NOSAME;
+static uint die_in_middle_of_transaction;
+static my_bool pagecacheing, null_fields, silent, skip_update, opt_unique;
+static my_bool verbose, skip_delete, transactional;
+static my_bool opt_versioning= 0;
+static MARIA_COLUMNDEF recinfo[4];
+static MARIA_KEYDEF keyinfo[10];
+static HA_KEYSEG keyseg[10];
+static HA_KEYSEG uniqueseg[10];
+
+static int run_test(const char *filename);
+static void get_options(int argc, char *argv[]);
+static void create_key(uchar *key,uint rownr);
+static void create_record(uchar *record,uint rownr);
+static void update_record(uchar *record);
+
+
+/*
+  These are here only for testing of recovery with undo. They are declared
+  here because this test is also meant to be an example of how to use maria
+  outside of the maria directory (maria_def.h is included above regardless).
+*/
+
+extern int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+                                 enum flush_type flush_type_for_data,
+                                 enum flush_type flush_type_for_index);
+#define MARIA_FLUSH_DATA  1
+
+/* Parse options, initialize maria and its logs, then run the test */
+int main(int argc,char *argv[])
+{
+#if defined(SAFE_MUTEX) && defined(THREAD)
+  safe_mutex_deadlock_detector= 1;
+#endif
+  MY_INIT(argv[0]);
+  get_options(argc,argv);
+  maria_data_root= (char *)".";
+  /* Maria requires that we always have a page cache */
+  if (maria_init() ||
+      (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0,
+                      maria_block_size, MY_WME) == 0) ||
+      ma_control_file_open(TRUE, TRUE) ||
+      (init_pagecache(maria_log_pagecache,
+                      TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                      TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+                    0, 0, maria_log_pagecache,
+                    TRANSLOG_DEFAULT_FLAGS, 0) ||
+      (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
+  {
+    fprintf(stderr, "Error in initialization\n");
+    exit(1);
+  }
+  if (opt_versioning)
+    init_thr_lock();
+
+  exit(run_test("test1"));                      /* Exit code is test result */
+}
+
+
+static int run_test(const char *filename)
+{
+  MARIA_HA *file;
+  int i,j= 0,error,deleted,rec_length,uniques=0;
+  uint offset_to_key;
+  ha_rows found,row_count;
+  uchar record[MAX_REC_LENGTH],key[MAX_REC_LENGTH],read_record[MAX_REC_LENGTH];
+  MARIA_UNIQUEDEF uniquedef;
+  MARIA_CREATE_INFO create_info;
+
+  if (die_in_middle_of_transaction)
+    null_fields= 1;
+
+  bzero((char*) recinfo,sizeof(recinfo));
+  bzero((char*) &create_info,sizeof(create_info));
+
+  /* First define 2 columns */
+  create_info.null_bytes= 1;
+  recinfo[0].type= key_field;
+  recinfo[0].length= (key_field == FIELD_BLOB ? 4+portable_sizeof_char_ptr :
+		      key_length);
+  if (key_field == FIELD_VARCHAR)
+    recinfo[0].length+= HA_VARCHAR_PACKLENGTH(key_length);
+  recinfo[1].type=extra_field;
+  recinfo[1].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24);
+  if (extra_field == FIELD_VARCHAR)
+    recinfo[1].length+= HA_VARCHAR_PACKLENGTH(recinfo[1].length);
+  recinfo[1].null_bit= null_fields ? 2 : 0;
+
+  if (opt_unique)
+  {
+    recinfo[2].type=FIELD_CHECK;
+    recinfo[2].length=MARIA_UNIQUE_HASH_LENGTH;
+  }
+  rec_length= recinfo[0].length + recinfo[1].length + recinfo[2].length +
+    create_info.null_bytes;
+
+  if (key_type == HA_KEYTYPE_VARTEXT1 &&
+      key_length > 255)
+    key_type= HA_KEYTYPE_VARTEXT2;
+
+  /* Define a key over the first column */
+  keyinfo[0].seg=keyseg;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[0].seg[0].type= key_type;
+  keyinfo[0].seg[0].flag= pack_seg;
+  keyinfo[0].seg[0].start=1;
+  keyinfo[0].seg[0].length=key_length;
+  keyinfo[0].seg[0].null_bit= null_fields ? 2 : 0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].seg[0].language= default_charset_info->number;
+  if (pack_seg & HA_BLOB_PART)
+  {
+    keyinfo[0].seg[0].bit_start=4;		/* Length of blob length */
+  }
+  keyinfo[0].flag = (uint8) (pack_keys | unique_key);
+
+  bzero((uchar*) flags,sizeof(flags));
+  if (opt_unique)
+  {
+    uint start;
+    uniques=1;
+    bzero((char*) &uniquedef,sizeof(uniquedef));
+    bzero((char*) uniqueseg,sizeof(uniqueseg));
+    uniquedef.seg=uniqueseg;
+    uniquedef.keysegs=2;
+
+    /* Make a unique over all columns (except first NULL fields) */
+    for (i=0, start=1 ; i < 2 ; i++)
+    {
+      uniqueseg[i].start=start;
+      start+=recinfo[i].length;
+      uniqueseg[i].length=recinfo[i].length;
+      uniqueseg[i].language= default_charset_info->number;
+    }
+    uniqueseg[0].type= key_type;
+    uniqueseg[0].null_bit= null_fields ? 2 : 0;
+    uniqueseg[1].type= HA_KEYTYPE_TEXT;
+    if (extra_field == FIELD_BLOB)
+    {
+      uniqueseg[1].length=0;			/* The whole blob */
+      uniqueseg[1].bit_start=4;			/* long blob */
+      uniqueseg[1].flag|= HA_BLOB_PART;
+    }
+    else if (extra_field == FIELD_VARCHAR)
+    {
+      uniqueseg[1].flag|= HA_VAR_LENGTH_PART;
+      uniqueseg[1].type= (HA_VARCHAR_PACKLENGTH(recinfo[1].length-1) == 1 ?
+                          HA_KEYTYPE_VARTEXT1 : HA_KEYTYPE_VARTEXT2);
+    }
+  }
+  else
+    uniques=0;
+
+  offset_to_key= test(null_fields);
+  if (key_field == FIELD_BLOB || key_field == FIELD_VARCHAR)
+    offset_to_key+= 2;
+
+  if (!silent)
+    printf("- Creating maria file\n");
+  create_info.max_rows=(ulong) (rec_pointer_size ?
+				(1L << (rec_pointer_size*8))/40 :
+				0);
+  create_info.transactional= transactional;
+  if (maria_create(filename, record_type, 1, keyinfo,2+opt_unique,recinfo,
+		uniques, &uniquedef, &create_info,
+		create_flag))
+    goto err;
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  if (!silent)
+    printf("- Writing key:s\n");
+
+  if (maria_begin(file))
+    goto err;
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  my_errno=0;
+  row_count=deleted=0;
+  for (i=49 ; i>=1 ; i-=2 )
+  {
+    if (insert_count-- == 0)
+    {
+      if (testflag)
+        break;
+      VOID(maria_close(file));
+      exit(0);
+    }
+    j=i%25 +1;
+    create_record(record,j);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    flags[j]=1;
+    if (verbose || error)
+      printf("J= %2d  maria_write: %d  errno: %d\n", j,error,my_errno);
+  }
+
+  if (maria_commit(file) || maria_begin(file))
+    goto err;
+
+  if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 1)
+    goto end;
+
+  /* Insert 2 rows with null values */
+  if (null_fields)
+  {
+    create_record(record,0);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    if (verbose || error)
+      printf("J= NULL  maria_write: %d  errno: %d\n", error,my_errno);
+    error=maria_write(file,record);
+    if (!error)
+      row_count++;
+    if (verbose || error)
+      printf("J= NULL  maria_write: %d  errno: %d\n", error,my_errno);
+    flags[0]=2;
+  }
+
+  if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 2)
+  {
+    printf("Terminating after inserts\n");
+    goto end;
+  }
+
+  if (maria_commit(file) || maria_begin(file))
+    goto err;
+
+  if (!skip_update)
+  {
+    if (opt_unique)
+    {
+      if (!silent)
+	printf("- Checking unique constraint\n");
+      create_record(record,j);                  /* Check last created row */
+      if (!maria_write(file,record) || my_errno != HA_ERR_FOUND_DUPP_UNIQUE)
+      {
+	printf("unique check failed\n");
+      }
+    }
+    if (!silent)
+      printf("- Updating rows\n");
+
+    /* Update first last row to force extend of file */
+    if (maria_rsame(file,read_record,-1))
+    {
+      printf("Can't find last row with maria_rsame\n");
+    }
+    else
+    {
+      memcpy(record,read_record,rec_length);
+      update_record(record);
+      if (maria_update(file,read_record,record))
+      {
+	printf("Can't update last row: %.*s\n",
+	       keyinfo[0].seg[0].length,read_record+1);
+      }
+    }
+
+    /* Read through all rows and update them */
+    assert(maria_scan_init(file) == 0);
+
+    found=0;
+    while ((error= maria_scan(file,read_record)) == 0)
+    {
+      if (--update_count == 0) { VOID(maria_close(file)) ; exit(0) ; }
+      memcpy(record,read_record,rec_length);
+      update_record(record);
+      if (maria_update(file,read_record,record))
+      {
+	printf("Can't update row: %.*s, error: %d\n",
+	       keyinfo[0].seg[0].length,record+1,my_errno);
+      }
+      found++;
+    }
+    if (found != row_count)
+      printf("Found %ld of %ld rows\n", (ulong) found, (ulong) row_count);
+    maria_scan_end(file);
+  }
+
+  if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 3)
+  {
+    printf("Terminating after updates\n");
+    goto end;
+  }
+  if (!silent)
+    printf("- Reopening file\n");
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file))
+    goto err;
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  if (maria_begin(file))
+    goto err;
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  if (!skip_delete)
+  {
+    if (!silent)
+      printf("- Removing keys\n");
+
+    for (i=0 ; i <= 10 ; i++)
+    {
+      /*
+        If you want to debug the problem in ma_test_recovery with BLOBs
+        (see @todo there), you can break out of the loop after just one
+        delete, it is enough, like this:
+        if (i==1) break;
+      */
+      /* testing */
+      if (remove_count-- == 0)
+      {
+        fprintf(stderr,
+                "delete-rows number of rows deleted; Going down hard!\n");
+        goto end;
+      }
+      j=i*2;
+      if (!flags[j])
+	continue;
+      create_key(key,j);
+      my_errno=0;
+      if ((error = maria_rkey(file, read_record, 0, key,
+                              HA_WHOLE_KEY, HA_READ_KEY_EXACT)))
+      {
+	if (verbose || (flags[j] >= 1 ||
+			(error && my_errno != HA_ERR_KEY_NOT_FOUND)))
+	  printf("key: '%.*s'  maria_rkey:  %3d  errno: %3d\n",
+		 (int) key_length,key+offset_to_key,error,my_errno);
+      }
+      else
+      {
+	error=maria_delete(file,read_record);
+	if (verbose || error)
+	  printf("key: '%.*s'  maria_delete: %3d  errno: %3d\n",
+		 (int) key_length, key+offset_to_key, error, my_errno);
+	if (! error)
+	{
+	  deleted++;
+	  flags[j]--;
+	}
+      }
+    }
+  }
+
+  if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (testflag == 4)
+  {
+    printf("Terminating after deletes\n");
+    goto end;
+  }
+
+  if (!silent)
+    printf("- Reading rows with key\n");
+  record[1]= 0;                                 /* For nicer printf */
+  for (i=0 ; i <= 25 ; i++)
+  {
+    create_key(key,i);
+    my_errno=0;
+    error=maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT);
+    if (verbose ||
+	(error == 0 && flags[i] == 0 && unique_key) ||
+	(error && (flags[i] != 0 || my_errno != HA_ERR_KEY_NOT_FOUND)))
+    {
+      printf("key: '%.*s'  maria_rkey: %3d  errno: %3d  record: %s\n",
+	     (int) key_length,key+offset_to_key,error,my_errno,record+1);
+    }
+  }
+
+  if (!silent)
+    printf("- Reading rows with position\n");
+  if (maria_scan_init(file))
+  {
+    fprintf(stderr, "maria_scan_init failed\n");
+    goto err;
+  }
+
+  for (i=1,found=0 ; i <= 30 ; i++)
+  {
+    my_errno=0;
+    if ((error= maria_scan(file, read_record)) == HA_ERR_END_OF_FILE)
+    {
+      if (found != row_count-deleted)
+	printf("Found only %ld of %ld rows\n", (ulong) found,
+	       (ulong) (row_count - deleted));
+      break;
+    }
+    if (!error)
+      found++;
+    if (verbose || (error != 0 && error != HA_ERR_RECORD_DELETED &&
+		    error != HA_ERR_END_OF_FILE))
+    {
+      printf("pos: %2d  maria_rrnd: %3d  errno: %3d  record: %s\n",
+	     i-1,error,my_errno,read_record+1);
+    }
+  }
+  maria_scan_end(file);
+
+end:
+  if (die_in_middle_of_transaction)
+  {
+    /* As commit record is not done, UNDO entries needs to be rolled back */
+    switch (die_in_middle_of_transaction) {
+    case 1:
+      /*
+        Flush changed pages go to disk. That will also flush log. Recovery
+        will skip REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                            FLUSH_RELEASE, FLUSH_RELEASE);
+      break;
+    case 2:
+      /*
+        Just flush log. Pages are likely to not be on disk. Recovery will
+        then execute REDOs and UNDOs.
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    case 3:
+      /*
+        Flush nothing. Pages and log are likely to not be on disk. Recovery
+        will then do nothing.
+      */
+      break;
+    case 4:
+      /*
+        Flush changed data pages go to disk. Changed index pages are not
+        flushed. Recovery will skip some REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+                            FLUSH_RELEASE);
+      /*
+        We have to flush log separately as the redo for the last key page
+        may not be flushed
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    }
+    printf("Dying on request without maria_commit()/maria_close()\n");
+    exit(0);
+  }
+
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file))
+    goto err;
+  maria_end();
+  my_end(MY_CHECK_ERROR);
+
+  return (0);
+err:
+  printf("got error: %3d when using maria-database\n",my_errno);
+  return 1;			/* skip warning */
+}
+
+/* Write the key text for row 'rownr' into 'key', formatted per key type */
+static void create_key_part(uchar *key,uint rownr)
+{
+  if (!unique_key)
+    rownr&=7;					/* Some identical keys */
+  if (keyinfo[0].seg[0].type == HA_KEYTYPE_NUM)
+  {                                             /* Right-aligned number */
+    sprintf((char*) key,"%*d",keyinfo[0].seg[0].length,rownr);
+  }
+  else if (keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT1 ||
+           keyinfo[0].seg[0].type == HA_KEYTYPE_VARTEXT2)
+  {						/* Varchar key */
+    /* Create a key that may be easily packed */
+    bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B');
+    sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr);
+    if ((rownr & 7) == 0)
+    {
+      /* Change the key to force a unpack of the next key */
+      bfill(key+3,keyinfo[0].seg[0].length-5,rownr < 10 ? 'a' : 'b');
+    }
+  }
+  else
+  {						/* Alpha record */
+    if (keyinfo[0].seg[0].flag & HA_SPACE_PACK)
+      sprintf((char*) key,"%-*d",keyinfo[0].seg[0].length,rownr);
+    else
+    {
+      /* Create a key that may be easily packed */
+      bfill(key,keyinfo[0].seg[0].length,rownr < 10 ? 'A' : 'B');
+      sprintf((char*) key+keyinfo[0].seg[0].length-2,"%-2d",rownr);
+      if ((rownr & 7) == 0)
+      {
+	/* Change the key to force a unpack of the next key */
+	key[1]= (rownr < 10 ? 'a' : 'b');
+      }
+    }
+  }
+}
+
+/* Build the search key for row 'rownr': null byte, 2-byte length, key text */
+static void create_key(uchar *key,uint rownr)
+{
+  if (keyinfo[0].seg[0].null_bit)
+  {
+    if (rownr == 0)
+    {
+      key[0]=1;					/* null key */
+      key[1]=0;					/* For easy print of key */
+      return;
+    }
+    *key++=0;                                   /* Key is not null */
+  }
+  if (keyinfo[0].seg[0].flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART))
+  {
+    uint tmp;
+    create_key_part(key+2,rownr);               /* Text after length bytes */
+    tmp=strlen((char*) key+2);
+    int2store(key,tmp);                         /* Length prefix */
+  }
+  else
+    create_key_part(key,rownr);
+}
+
+
+static uchar blob_key[MAX_REC_LENGTH];          /* Data of a blob key column */
+static uchar blob_record[MAX_REC_LENGTH+20*20]; /* Data of the blob column */
+
+/* Fill 'record' with delete marker, key column and "... row: N" column */
+static void create_record(uchar *record,uint rownr)
+{
+  uchar *pos;
+  bzero((char*) record,MAX_REC_LENGTH);
+  record[0]=1;					/* delete marker */
+  if (rownr == 0 && keyinfo[0].seg[0].null_bit)
+    record[0]|=keyinfo[0].seg[0].null_bit;	/* Null key */
+
+  pos=record+1;
+  if (recinfo[0].type == FIELD_BLOB)
+  {                                             /* 4-byte length + pointer */
+    uint tmp;
+    uchar *ptr;
+    create_key_part(blob_key,rownr);
+    tmp=strlen((char*) blob_key);
+    int4store(pos,tmp);
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {                                             /* Length prefix + data */
+    uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+    create_key_part(pos+pack_length,rownr);
+    tmp= strlen((char*) pos+pack_length);
+    if (pack_length == 1)
+      *(uchar*) pos= (uchar) tmp;
+    else
+      int2store(pos,tmp);
+    pos+= recinfo[0].length;
+  }
+  else
+  {
+    create_key_part(pos,rownr);
+    pos+=recinfo[0].length;
+  }
+  if (recinfo[1].type == FIELD_BLOB)
+  {                                             /* 4-byte length + pointer */
+    uint tmp;
+    uchar *ptr;;
+    sprintf((char*) blob_record,"... row: %d", rownr);
+    strappend((char*) blob_record,max(MAX_REC_LENGTH-rownr,10),' ');
+    tmp=strlen((char*) blob_record);
+    int4store(pos,tmp);
+    ptr=blob_record;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {                                             /* Length prefix + data */
+    uint tmp, pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+    sprintf((char*) pos+pack_length, "... row: %d", rownr);
+    tmp= strlen((char*) pos+pack_length);
+    if (pack_length == 1)
+      *pos= (uchar) tmp;
+    else
+      int2store(pos,tmp);
+  }
+  else
+  {
+    sprintf((char*) pos,"... row: %d", rownr);
+    strappend((char*) pos,recinfo[1].length,' ');
+  }
+}
+
+/* Change row to test re-packing of rows and reallocation of keys: */
+/* runs casedn() over the key column and fills the second column with '.' */
+static void update_record(uchar *record)
+{
+  uchar *pos=record+1;
+  if (recinfo[0].type == FIELD_BLOB)
+  {
+    uchar *column,*ptr;
+    int length;
+    length=uint4korr(pos);			/* Long blob */
+    memcpy_fixed(&column,pos+4,sizeof(char*));
+    memcpy(blob_key,column,length);		/* Move old key */
+    ptr=blob_key;
+    memcpy_fixed(pos+4,&ptr,sizeof(char*));	/* Store pointer to new key */
+    if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+      default_charset_info->cset->casedn(default_charset_info,
+                                         (char*) blob_key, length,
+                                         (char*) blob_key, length);
+    pos+=recinfo[0].length;
+  }
+  else if (recinfo[0].type == FIELD_VARCHAR)
+  {
+    uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[0].length-1);
+    uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+    default_charset_info->cset->casedn(default_charset_info,
+                                       (char*) pos + pack_length, length,
+                                       (char*) pos + pack_length, length);
+    pos+=recinfo[0].length;
+  }
+  else
+  {
+    if (keyinfo[0].seg[0].type != HA_KEYTYPE_NUM)
+      default_charset_info->cset->casedn(default_charset_info,
+                                         (char*) pos, keyinfo[0].seg[0].length,
+                                         (char*) pos, keyinfo[0].seg[0].length);
+    pos+=recinfo[0].length;
+  }
+
+  if (recinfo[1].type == FIELD_BLOB)
+  {
+    uchar *column;
+    int length;
+    length=uint4korr(pos);
+    memcpy_fixed(&column,pos+4,sizeof(char*));
+    memcpy(blob_record,column,length);
+    bfill(blob_record+length,20,'.');	/* Make it larger */
+    length+=20;
+    int4store(pos,length);
+    column=blob_record;
+    memcpy_fixed(pos+4,&column,sizeof(char*));
+  }
+  else if (recinfo[1].type == FIELD_VARCHAR)
+  {
+    /* Second field is longer than 10 characters */
+    uint pack_length= HA_VARCHAR_PACKLENGTH(recinfo[1].length-1);
+    uint length= pack_length == 1 ? (uint) *(uchar*) pos : uint2korr(pos);
+    pos= record+ recinfo[1].offset;
+    bfill(pos+pack_length+length,recinfo[1].length-length-pack_length,'.');
+    length=recinfo[1].length-pack_length;       /* Now filled to full length */
+    if (pack_length == 1)
+      *(uchar*) pos= (uchar) length;
+    else
+      int2store(pos,length);
+  }
+  else
+  {
+    bfill(pos+recinfo[1].length-10,10,'.');     /* Overwrite last 10 bytes */
+  }
+}
+
+
+static struct my_option my_long_options[] =
+{  /* Entries with no variable pointer are acted on in get_one_option() */
+  {"checkpoint", 'H', "Checkpoint at specified stage", (uchar**) &checkpoint,
+   (uchar**) &checkpoint, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"checksum", 'c', "Undocumented",  /* ORs the checksum flags into create_flag */
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Undocumented",  /* argument is passed to DBUG_PUSH() */
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"delete-rows", 'd', "Abort after this many rows has been deleted",
+   (uchar**) &remove_count, (uchar**) &remove_count, 0, GET_UINT, REQUIRED_ARG,
+   1000, 0, 0, 0, 0, 0},
+  {"help", '?', "Display help and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"insert-rows", 'i', "Undocumented", (uchar**) &insert_count,
+   (uchar**) &insert_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+  {"key-alpha", 'a', "Use a key of type HA_KEYTYPE_TEXT",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-binary-pack", 'B', "Undocumented",  /* pack_keys= HA_BINARY_PACK_KEY */
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-blob", 'b', "Undocumented",  /* key and extra field become FIELD_BLOB */
+   (uchar**) &blob_length, (uchar**) &blob_length,
+   0, GET_ULONG, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-cache", 'K', "Undocumented", (uchar**) &pagecacheing,
+   (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-length", 'k', "Undocumented", (uchar**) &key_length,  /* 4..HA_MAX_KEY_LENGTH */
+   (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0},
+  {"key-multiple", 'm', "Don't use unique keys",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-prefix_pack", 'P', "Undocumented",  /* pack_keys= HA_PACK_KEY */
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-space_pack", 'p', "Undocumented",  /* HA_PACK_KEY plus HA_SPACE_PACK, text key */
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"key-varchar", 'w', "Test VARCHAR keys",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"null-fields", 'N', "Define fields with NULL",
+   (uchar**) &null_fields, (uchar**) &null_fields, 0, GET_BOOL, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"row-fixed-size", 'S', "Fixed size records",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"rows-in-block", 'M', "Store rows in block format",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"row-pointer-size", 'R', "Undocumented", (uchar**) &rec_pointer_size,  /* > 3 is reset to 0 */
+   (uchar**) &rec_pointer_size, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Undocumented",
+   (uchar**) &silent, (uchar**) &silent, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0,
+   0, 0},
+  {"skip-delete", 'D', "Don't test deletes", (uchar**) &skip_delete,
+   (uchar**) &skip_delete, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"skip-update", 'U', "Don't test updates", (uchar**) &skip_update,
+   (uchar**) &skip_update, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"testflag", 't', "Stop test at specified stage", (uchar**) &testflag,
+   (uchar**) &testflag, 0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test-undo", 'A',
+   "Abort hard. Used for testing recovery with undo",
+   (uchar**) &die_in_middle_of_transaction,
+   (uchar**) &die_in_middle_of_transaction,
+   0, GET_INT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"transactional", 'T',
+   "Test in transactional mode. (Only works with block format)",
+   (uchar**) &transactional, (uchar**) &transactional, 0, GET_BOOL, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"unique", 'E', "Check unique handling", (uchar**) &opt_unique,
+   (uchar**) &opt_unique, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"update-rows", 'u', "Max number of rows to update", (uchar**) &update_count,
+   (uchar**) &update_count, 0, GET_UINT, REQUIRED_ARG, 1000, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Be more verbose", (uchar**) &verbose,
+   (uchar**) &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version number and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"versioning", 'C', "Use row versioning (only works with block format)",
+   (uchar**) &opt_versioning,  (uchar**) &opt_versioning, 0, GET_BOOL,
+   NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}  /* all-zero last entry */
+};
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument __attribute__((unused)))
+{
+  /*
+    handle_options() callback for option 'optid'; returns 0 or exits.
+  */
+  switch (optid) {
+  case '?':
+    usage();
+    exit(1);
+  case 'V':
+    printf("test1 Ver 1.2 \n");
+    exit(0);
+  case '#':
+    DBUG_PUSH(argument);
+    break;
+  case 'c':
+    create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
+    break;
+  case 'K':				/* Use key cacheing */
+    pagecacheing= 1;
+    break;
+  case 'R':				/* Length of record pointer */
+    if (rec_pointer_size > 3)
+      rec_pointer_size= 0;
+    break;
+  case 'a':
+    key_type= HA_KEYTYPE_TEXT;
+    break;
+  case 'B':				/* Use binary compression */
+    pack_keys= HA_BINARY_PACK_KEY;
+    break;
+  case 'P':				/* Use prefix compression */
+    pack_keys= HA_PACK_KEY;
+    break;
+  case 'p':				/* Use prefix + space packing */
+    key_type= HA_KEYTYPE_TEXT;
+    pack_keys= HA_PACK_KEY;
+    pack_seg= HA_SPACE_PACK;
+    break;
+  case 'm':				/* Don't use unique keys */
+    unique_key= 0;
+    break;
+  case 'k':				/* Accept only 4..HA_MAX_KEY_LENGTH */
+    if (key_length >= 4 && key_length <= HA_MAX_KEY_LENGTH)
+      break;
+    fprintf(stderr,"Wrong key length\n");
+    exit(1);
+  case 'b':				/* blob key */
+    key_type= HA_KEYTYPE_VARTEXT1;
+    pack_seg|= HA_BLOB_PART;
+    key_field= FIELD_BLOB;
+    extra_field= FIELD_BLOB;
+    if (record_type == STATIC_RECORD)
+      record_type= DYNAMIC_RECORD;
+    break;
+  case 'w':				/* varchar keys */
+    key_type= HA_KEYTYPE_VARTEXT1;
+    pack_seg|= HA_VAR_LENGTH_PART;
+    key_field= FIELD_VARCHAR;
+    extra_field= FIELD_VARCHAR;
+    if (record_type == STATIC_RECORD)
+      record_type= DYNAMIC_RECORD;
+    break;
+  case 'M':
+    record_type= BLOCK_RECORD;
+    break;
+  case 'S':
+    if (key_field != FIELD_VARCHAR)
+    {
+      if (key_field == FIELD_BLOB)
+        break;				/* Nothing changes while the key is a blob */
+      key_field= FIELD_NORMAL;		/* static-size record */
+      extra_field= FIELD_NORMAL;
+    }
+    else
+      create_flag= 0;			/* Static sized varchar */
+    record_type= STATIC_RECORD;
+    break;
+  }
+  return 0;
+}
+
+
+/*
+  Parse the command line through handle_options(); a non-zero result is used
+  as the exit code.  --transactional forces record_type to BLOCK_RECORD.
+*/
+static void get_options(int argc, char *argv[])
+{
+  int ho_error= handle_options(&argc, &argv, my_long_options, get_one_option);
+  if (ho_error)
+    exit(ho_error);
+  if (transactional)
+    record_type= BLOCK_RECORD;
+}
+
+
+static void usage(void)	/* Print the usage banner, then describe my_long_options */
+{
+  printf("Usage: %s [options]\n\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c
new file mode 100644
index 00000000000..9e2f32f767b
--- /dev/null
+++ b/storage/maria/ma_test2.c
@@ -0,0 +1,1246 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Test of isam database: large test */
+
+#ifndef USE_MY_FUNC		/* We want to be able to dbug this !! */
+#define USE_MY_FUNC
+#endif
+#include "maria_def.h"
+#include "trnman.h"
+#include <m_ctype.h>
+#include <my_bit.h>
+#include "ma_checkpoint.h"
+
+#define STANDARD_LENGTH 37		/* Bytes in columns 0..4 of a test row */
+#define MARIA_KEYS 6			/* Keys set up in keyinfo[] by main() */
+#define MAX_PARTS 4			/* Key segments reserved per key */
+/* labs() is deliberately left as declared by <stdlib.h>: aliasing it to abs()
+   would convert the long values main() passes to it to int before taking the
+   absolute value. */
+
+static void get_options(int argc, char *argv[]);
+static void copy_key(MARIA_HA *info, uint inx, uchar *record, uchar *key);
+static void put_blob_in_record(uchar *blob_pos, char **blob_buffer,
+                               ulong *length);
+static void fix_length(uchar *record, uint length);
+static uint rnd(uint max_value);
+
+/* Test-run control: verbosity, stop/checkpoint stage, update/delete limits */
+static int verbose= 0, silent= 0, testflag= 0, checkpoint= 0, srand_arg= 0;
+static int skip_update= 0, remove_count= -1, die_in_middle_of_transaction= 0;
+/* Table creation, key packing, caching and locking choices */
+static int first_key= 0, create_flag= 0, rec_pointer_size= 0, pack_fields= 1;
+static int pack_seg= HA_SPACE_PACK, pack_type= HA_PACK_KEY, transactional= 0;
+static int async_io= 0, pagecacheing= 0, write_cacheing= 0, do_locking= 0;
+static int opt_quick_mode= 0;
+static my_bool opt_versioning= 0;
+static uint use_blob= 0, update_count= 0, keys= MARIA_KEYS, recant= 1000;
+static ulong pagecache_size= 8192*32;
+static enum data_file_type record_type= DYNAMIC_RECORD;
+static uint16 key1[1001], key3[5000];	/* key1[n]: live rows with key n; key3[n]: unique key n in use */
+static uchar record[300], record2[300], key[100], key2[100];
+static uchar read_record[300], read_record2[300], read_record3[300];
+static HA_KEYSEG glob_keyseg[MARIA_KEYS][MAX_PARTS];
+
+		/* Test program */
+
+int main(int argc, char *argv[])
+{
+  uint i;
+  int j,n1,n2,n3,error,k;
+  uint write_count,update,dupp_keys,opt_delete,start,length,blob_pos,
+       reclength,ant,found_parts;
+  my_off_t lastpos;
+  ha_rows range_records,records;
+  MARIA_HA *file;
+  MARIA_KEYDEF keyinfo[10];
+  MARIA_COLUMNDEF recinfo[10];
+  MARIA_INFO info;
+  const char *filename;
+  char *blob_buffer;
+  MARIA_CREATE_INFO create_info;
+
+#if defined(SAFE_MUTEX) && defined(THREAD)
+  safe_mutex_deadlock_detector= 1;
+#endif
+  MY_INIT(argv[0]);
+
+  filename= "test2";
+  get_options(argc,argv);
+  if (! async_io)
+    my_disable_async_io=1;
+
+  /* If we sync or not have no affect on this test */
+  my_disable_sync= 1;
+
+  maria_data_root= (char *)".";
+  /* Maria requires that we always have a page cache */
+  if (maria_init() ||
+      (init_pagecache(maria_pagecache, pagecache_size, 0, 0,
+		      maria_block_size, MY_WME) == 0) ||
+      ma_control_file_open(TRUE, TRUE) ||
+      (init_pagecache(maria_log_pagecache,
+		      TRANSLOG_PAGECACHE_SIZE, 0, 0,
+		      TRANSLOG_PAGE_SIZE, MY_WME) == 0) ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+		    0, 0, maria_log_pagecache,
+		    TRANSLOG_DEFAULT_FLAGS, 0) ||
+      (transactional && (trnman_init(0) || ma_checkpoint_init(0))))
+  {
+    fprintf(stderr, "Error in initialization");
+    exit(1);
+  }
+  if (opt_versioning)
+    init_thr_lock();
+
+  reclength=STANDARD_LENGTH+60+(use_blob ? 8 : 0);
+  blob_pos=STANDARD_LENGTH+60;
+  keyinfo[0].seg= &glob_keyseg[0][0];
+  keyinfo[0].seg[0].start=0;
+  keyinfo[0].seg[0].length=6;
+  keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[0].seg[0].language= default_charset_info->number;
+  keyinfo[0].seg[0].flag=(uint8) pack_seg;
+  keyinfo[0].seg[0].null_bit=0;
+  keyinfo[0].seg[0].null_pos=0;
+  keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].flag = pack_type;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  keyinfo[1].seg= &glob_keyseg[1][0];
+  keyinfo[1].seg[0].start=7;
+  keyinfo[1].seg[0].length=6;
+  keyinfo[1].seg[0].type=HA_KEYTYPE_BINARY;
+  keyinfo[1].seg[0].flag=0;
+  keyinfo[1].seg[0].null_bit=0;
+  keyinfo[1].seg[0].null_pos=0;
+  keyinfo[1].seg[1].start=0;			/* two part key */
+  keyinfo[1].seg[1].length=6;
+  keyinfo[1].seg[1].type=HA_KEYTYPE_NUM;
+  keyinfo[1].seg[1].flag=HA_REVERSE_SORT;
+  keyinfo[1].seg[1].null_bit=0;
+  keyinfo[1].seg[1].null_pos=0;
+  keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[1].keysegs=2;
+  keyinfo[1].flag =0;
+  keyinfo[1].block_length= MARIA_MIN_KEY_BLOCK_LENGTH;  /* Diff blocklength */
+  keyinfo[2].seg= &glob_keyseg[2][0];
+  keyinfo[2].seg[0].start=12;
+  keyinfo[2].seg[0].length=8;
+  keyinfo[2].seg[0].type=HA_KEYTYPE_BINARY;
+  keyinfo[2].seg[0].flag=HA_REVERSE_SORT;
+  keyinfo[2].seg[0].null_bit=0;
+  keyinfo[2].seg[0].null_pos=0;
+  keyinfo[2].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[2].keysegs=1;
+  keyinfo[2].flag =HA_NOSAME;
+  keyinfo[2].block_length= 0;                   /* Default block length */
+  keyinfo[3].seg= &glob_keyseg[3][0];
+  keyinfo[3].seg[0].start=0;
+  keyinfo[3].seg[0].length=reclength-(use_blob ? 8 : 0);
+  keyinfo[3].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[3].seg[0].language=default_charset_info->number;
+  keyinfo[3].seg[0].flag=(uint8) pack_seg;
+  keyinfo[3].seg[0].null_bit=0;
+  keyinfo[3].seg[0].null_pos=0;
+  keyinfo[3].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[3].keysegs=1;
+  keyinfo[3].flag = pack_type;
+  keyinfo[3].block_length= 0;                   /* Default block length */
+  keyinfo[4].seg= &glob_keyseg[4][0];
+  keyinfo[4].seg[0].start=0;
+  keyinfo[4].seg[0].length=5;
+  keyinfo[4].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[4].seg[0].language=default_charset_info->number;
+  keyinfo[4].seg[0].flag=0;
+  keyinfo[4].seg[0].null_bit=0;
+  keyinfo[4].seg[0].null_pos=0;
+  keyinfo[4].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[4].keysegs=1;
+  keyinfo[4].flag = pack_type;
+  keyinfo[4].block_length= 0;                   /* Default block length */
+  keyinfo[5].seg= &glob_keyseg[5][0];
+  keyinfo[5].seg[0].start=0;
+  keyinfo[5].seg[0].length=4;
+  keyinfo[5].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[5].seg[0].language=default_charset_info->number;
+  keyinfo[5].seg[0].flag=pack_seg;
+  keyinfo[5].seg[0].null_bit=0;
+  keyinfo[5].seg[0].null_pos=0;
+  keyinfo[5].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[5].keysegs=1;
+  keyinfo[5].flag = pack_type;
+  keyinfo[5].block_length= 0;                   /* Default block length */
+
+  recinfo[0].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+  recinfo[0].length=7;
+  recinfo[0].null_bit=0;
+  recinfo[0].null_pos=0;
+  recinfo[1].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+  recinfo[1].length=5;
+  recinfo[1].null_bit=0;
+  recinfo[1].null_pos=0;
+  recinfo[2].type=pack_fields ? FIELD_SKIP_PRESPACE : 0;
+  recinfo[2].length=9;
+  recinfo[2].null_bit=0;
+  recinfo[2].null_pos=0;
+  recinfo[3].type=FIELD_NORMAL;
+  recinfo[3].length=STANDARD_LENGTH-7-5-9-4;
+  recinfo[3].null_bit=0;
+  recinfo[3].null_pos=0;
+  recinfo[4].type=pack_fields ? FIELD_SKIP_ZERO : 0;
+  recinfo[4].length=4;
+  recinfo[4].null_bit=0;
+  recinfo[4].null_pos=0;
+  recinfo[5].type=pack_fields ? FIELD_SKIP_ENDSPACE : 0;
+  recinfo[5].length=60;
+  recinfo[5].null_bit=0;
+  recinfo[5].null_pos=0;
+  if (use_blob)
+  {
+    recinfo[6].type=FIELD_BLOB;
+    recinfo[6].length=4+portable_sizeof_char_ptr;
+    recinfo[6].null_bit=0;
+    recinfo[6].null_pos=0;
+  }
+
+  write_count=update=dupp_keys=opt_delete=0;
+  blob_buffer=0;
+
+  for (i=1000 ; i>0 ; i--) key1[i]=0;
+  for (i=4999 ; i>0 ; i--) key3[i]=0;
+
+  if (!silent)
+    printf("- Creating maria-file\n");
+  file= 0;
+  bzero((char*) &create_info,sizeof(create_info));
+  create_info.max_rows=(ha_rows) (rec_pointer_size ?
+				  (1L << (rec_pointer_size*8))/
+				  reclength : 0);
+  create_info.reloc_rows=(ha_rows) 100;
+  create_info.transactional= transactional;
+  if (maria_create(filename, record_type, keys,&keyinfo[first_key],
+		use_blob ? 7 : 6, &recinfo[0],
+		0,(MARIA_UNIQUEDEF*) 0,
+		&create_info,create_flag))
+    goto err;
+  if (!(file=maria_open(filename,2,HA_OPEN_ABORT_IF_LOCKED)))
+    goto err;
+  maria_begin(file);
+  if (opt_versioning)
+    maria_versioning(file, 1);
+  if (testflag == 1)
+    goto end;
+  if (checkpoint == 1 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+  if (!silent)
+    printf("- Writing key:s\n");
+  if (do_locking)
+    maria_lock_database(file,F_WRLCK);
+  if (write_cacheing)
+    maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+  if (opt_quick_mode)
+    maria_extra(file,HA_EXTRA_QUICK,0);
+
+  for (i=0 ; i < recant ; i++)
+  {
+    ulong blob_length;
+    n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+    sprintf((char*) record,"%6d:%4d:%8d:Pos: %4d    ",n1,n2,n3,write_count);
+    int4store(record+STANDARD_LENGTH-4,(long) i);
+    fix_length(record,(uint) STANDARD_LENGTH+rnd(60));
+    put_blob_in_record(record+blob_pos,&blob_buffer, &blob_length);
+    DBUG_PRINT("test",("record: %d  blob_length: %lu", i, blob_length));
+
+    if (maria_write(file,record))
+    {
+      if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+      {
+	printf("Error: %d in write at record: %d\n",my_errno,i);
+	goto err;
+      }
+      if (verbose) printf("   Double key: %d at record# %d\n", n3, i);
+    }
+    else
+    {
+      if (key3[n3] == 1 && first_key <3 && first_key+keys >= 3)
+      {
+	printf("Error: Didn't get error when writing second key: '%8d'\n",n3);
+	goto err;
+      }
+      write_count++; key1[n1]++; key3[n3]=1;
+    }
+
+    /* Check if we can find key without flushing database */
+    if (i % 10 == 0)
+    {
+      for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+      if (!j)
+	for (j=999 ; j>0 && key1[j] == 0 ; j--) ;
+      sprintf((char*) key,"%6d",j);
+      if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      {
+	printf("Test in loop: Can't find key: \"%s\"\n",key);
+	goto err;
+      }
+    }
+  }
+  if (checkpoint == 2 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (write_cacheing)
+  {
+    if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+    {
+      puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+      goto err;
+    }
+  }
+
+  if (testflag == 2)
+    goto end;
+
+#ifdef REMOVE_WHEN_WE_HAVE_RESIZE
+  if (pagecacheing)
+    resize_pagecache(maria_pagecache, maria_block_size,
+                     pagecache_size * 2, 0, 0);
+#endif
+  if (!silent)
+    printf("- Delete\n");
+  if (srand_arg)
+    srand(srand_arg);
+  if (!update_count)
+    update_count= recant/10;
+
+  for (i=0 ; i < update_count ; i++)
+  {
+    for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+    if (j != 0)
+    {
+      sprintf((char*) key,"%6d",j);
+      if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      {
+	printf("can't find key1: \"%s\"\n",key);
+	goto err;
+      }
+      if (bcmp(read_record+keyinfo[0].seg[0].start,
+               key, keyinfo[0].seg[0].length))
+      {
+	printf("Found wrong record when searching for key: \"%s\"\n",key);
+	goto err;
+      }
+      if (opt_delete == (uint) remove_count)		/* While testing */
+	goto end;
+      if (maria_delete(file,read_record))
+      {
+	printf("error: %d; can't delete record: \"%s\"\n", my_errno,read_record);
+	goto err;
+      }
+      opt_delete++;
+      key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--;
+      key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0;
+    }
+    else
+    {
+      puts("Warning: Skipping delete test because no dupplicate keys");
+      break;
+    }
+  }
+  if (testflag == 3)
+    goto end;
+  if (checkpoint == 3 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (!silent)
+    printf("- Update\n");
+  if (srand_arg)
+    srand(srand_arg);
+  if (!update_count)
+    update_count= recant/10;
+
+  for (i=0 ; i < update_count ; i++)
+  {
+    n1=rnd(1000); n2=rnd(100); n3=rnd(5000);
+    sprintf((char*) record2,"%6d:%4d:%8d:XXX: %4d     ",n1,n2,n3,update);
+    int4store(record2+STANDARD_LENGTH-4,(long) i);
+    fix_length(record2,(uint) STANDARD_LENGTH+rnd(60));
+
+    for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+    if (j != 0)
+    {
+      sprintf((char*) key,"%6d",j);
+      if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      {
+	printf("can't find key1: \"%s\"\n", (char*) key);
+	goto err;
+      }
+      if (bcmp(read_record+keyinfo[0].seg[0].start,
+               key, keyinfo[0].seg[0].length))
+      {
+	printf("Found wrong record when searching for key: \"%s\"; Found \"%.*s\"\n",
+               key, keyinfo[0].seg[0].length,
+               read_record+keyinfo[0].seg[0].start);
+	goto err;
+      }
+      if (use_blob)
+      {
+        ulong blob_length;
+	if (i & 1)
+	  put_blob_in_record(record2+blob_pos,&blob_buffer, &blob_length);
+	else
+	  bmove(record2+blob_pos, read_record+blob_pos, 4 + sizeof(char*));
+      }
+      if (skip_update)
+        continue;
+      if (maria_update(file,read_record,record2))
+      {
+	if (my_errno != HA_ERR_FOUND_DUPP_KEY || key3[n3] == 0)
+	{
+	  printf("error: %d; can't update:\nFrom: \"%s\"\nTo:   \"%s\"\n",
+		 my_errno,read_record,record2);
+	  goto err;
+	}
+	if (verbose)
+	  printf("Double key when tried to update:\nFrom: \"%s\"\nTo:   \"%s\"\n",record,record2);
+      }
+      else
+      {
+	key1[atoi((char*) read_record+keyinfo[0].seg[0].start)]--;
+	key3[atoi((char*) read_record+keyinfo[2].seg[0].start)]=0;
+	key1[n1]++; key3[n3]=1;
+	update++;
+      }
+    }
+  }
+  if (testflag == 4)
+    goto end;
+  if (checkpoint == 4 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  for (i=999, dupp_keys=j=0 ; i>0 ; i--)
+  {
+    if (key1[i] > dupp_keys)
+    {
+      dupp_keys=key1[i]; j=i;
+    }
+  }
+  sprintf((char*) key,"%6d",j);
+  start=keyinfo[0].seg[0].start;
+  length=keyinfo[0].seg[0].length;
+  if (dupp_keys)
+  {
+    if (!silent)
+      printf("- Same key: first - next -> last - prev -> first\n");
+    DBUG_PRINT("progpos",("first - next -> last - prev -> first"));
+    if (verbose) printf("	 Using key: \"%s\"  Keys: %d\n",key,dupp_keys);
+
+    if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      goto err;
+    if (maria_rsame(file,read_record2,-1))
+      goto err;
+    if (memcmp(read_record,read_record2,reclength) != 0)
+    {
+      printf("maria_rsame didn't find same record\n");
+      goto err;
+    }
+    info.recpos=maria_position(file);
+    if (maria_rfirst(file,read_record2,0) ||
+	maria_rsame_with_pos(file,read_record2,0,info.recpos) ||
+	memcmp(read_record,read_record2,reclength) != 0)
+    {
+      printf("maria_rsame_with_pos didn't find same record\n");
+      goto err;
+    }
+    {
+      int skr;
+      info.recpos= maria_position(file);
+      skr= maria_rnext(file,read_record2,0);
+      if ((skr && my_errno != HA_ERR_END_OF_FILE) ||
+	  maria_rprev(file,read_record2,0) ||
+	  memcmp(read_record,read_record2,reclength) != 0 ||
+          info.recpos != maria_position(file))
+      {
+	printf("maria_rsame_with_pos lost position\n");
+	goto err;
+      }
+    }
+    ant=1;
+    while (maria_rnext(file,read_record2,0) == 0 &&
+	   memcmp(read_record2+start,key,length) == 0) ant++;
+    if (ant != dupp_keys)
+    {
+      printf("next: Found: %d keys of %d\n",ant,dupp_keys);
+      goto err;
+    }
+    ant=0;
+    while (maria_rprev(file,read_record3,0) == 0 &&
+	   bcmp(read_record3+start,key,length) == 0) ant++;
+    if (ant != dupp_keys)
+    {
+      printf("prev: Found: %d records of %d\n",ant,dupp_keys);
+      goto err;
+    }
+
+    /* Check of maria_rnext_same */
+    if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      goto err;
+    ant=1;
+    while (!maria_rnext_same(file,read_record3) && ant < dupp_keys+10)
+      ant++;
+    if (ant != dupp_keys || my_errno != HA_ERR_END_OF_FILE)
+    {
+      printf("maria_rnext_same: Found: %d records of %d\n",ant,dupp_keys);
+      goto err;
+    }
+  }
+
+  if (!silent)
+    printf("- All keys: first - next -> last - prev -> first\n");
+  DBUG_PRINT("progpos",("All keys: first - next -> last - prev -> first"));
+  ant=1;
+  if (maria_rfirst(file,read_record,0))
+  {
+    printf("Can't find first record\n");
+    goto err;
+  }
+  while ((error=maria_rnext(file,read_record3,0)) == 0 && ant < write_count+10)
+    ant++;
+  if (ant != write_count - opt_delete || error != HA_ERR_END_OF_FILE)
+  {
+    printf("next: I found: %d records of %d (error: %d)\n",
+	   ant, write_count - opt_delete, error);
+    goto err;
+  }
+  if (maria_rlast(file,read_record2,0) ||
+      bcmp(read_record2,read_record3,reclength))
+  {
+    printf("Can't find last record\n");
+    DBUG_DUMP("record2", read_record2, reclength);
+    DBUG_DUMP("record3", read_record3, reclength);
+    goto err;
+  }
+  ant=1;
+  while (maria_rprev(file,read_record3,0) == 0 && ant < write_count+10)
+    ant++;
+  if (ant != write_count - opt_delete)
+  {
+    printf("prev: I found: %d records of %d\n",ant,write_count);
+    goto err;
+  }
+  if (bcmp(read_record,read_record3,reclength))
+  {
+    printf("Can't find first record\n");
+    goto err;
+  }
+
+  if (!silent)
+    printf("- Test if: Read first - next - prev - prev - next == first\n");
+  DBUG_PRINT("progpos",("- Read first - next - prev - prev - next == first"));
+  if (maria_rfirst(file,read_record,0) ||
+      maria_rnext(file,read_record3,0) ||
+      maria_rprev(file,read_record3,0) ||
+      maria_rprev(file,read_record3,0) == 0 ||
+      maria_rnext(file,read_record3,0))
+      goto err;
+  if (bcmp(read_record,read_record3,reclength) != 0)
+     printf("Can't find first record\n");
+
+  if (!silent)
+    printf("- Test if: Read last - prev - next - next - prev == last\n");
+  DBUG_PRINT("progpos",("Read last - prev - next - next - prev == last"));
+  if (maria_rlast(file,read_record2,0) ||
+      maria_rprev(file,read_record3,0) ||
+      maria_rnext(file,read_record3,0) ||
+      maria_rnext(file,read_record3,0) == 0 ||
+      maria_rprev(file,read_record3,0))
+      goto err;
+  if (bcmp(read_record2,read_record3,reclength))
+     printf("Can't find last record\n");
+#ifdef NOT_ANYMORE
+  if (!silent)
+    puts("- Test read key-part");
+  strmov(key2,key);
+  for(i=strlen(key2) ; i-- > 1 ;)
+  {
+    key2[i]=0;
+
+    /* The following row is just to catch some bugs in the key code */
+    bzero((char*) file->lastkey,file->s->base.max_key_length*2);
+    if (maria_rkey(file,read_record,0,key2,(uint) i,HA_READ_PREFIX))
+      goto err;
+    if (bcmp(read_record+start,key,(uint) i))
+    {
+      puts("Didn't find right record");
+      goto err;
+    }
+  }
+#endif
+  if (dupp_keys > 2)
+  {
+    if (!silent)
+      printf("- Read key (first) - next - delete - next -> last\n");
+    DBUG_PRINT("progpos",("first - next - delete - next -> last"));
+    if (maria_rkey(file,read_record,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      goto err;
+    if (maria_rnext(file,read_record3,0)) goto err;
+    if (maria_delete(file,read_record3)) goto err;
+    opt_delete++;
+    ant=1;
+    while (maria_rnext(file,read_record3,0) == 0 &&
+	   bcmp(read_record3+start,key,length) == 0) ant++;
+    if (ant != dupp_keys-1)
+    {
+      printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-1);
+      goto err;
+    }
+  }
+  if (dupp_keys>4)
+  {
+    if (!silent)
+      printf("- Read last of key - prev - delete - prev -> first\n");
+    DBUG_PRINT("progpos",("last - prev - delete - prev -> first"));
+    if (maria_rprev(file,read_record3,0)) goto err;
+    if (maria_rprev(file,read_record3,0)) goto err;
+    if (maria_delete(file,read_record3)) goto err;
+    opt_delete++;
+    ant=1;
+    while (maria_rprev(file,read_record3,0) == 0 &&
+	   bcmp(read_record3+start,key,length) == 0) ant++;
+    if (ant != dupp_keys-2)
+    {
+      printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-2);
+      goto err;
+    }
+  }
+  if (dupp_keys > 6)
+  {
+    if (!silent)
+      printf("- Read first - delete - next -> last\n");
+    DBUG_PRINT("progpos",("first - delete - next -> last"));
+    if (maria_rkey(file,read_record3,0,key,HA_WHOLE_KEY,HA_READ_KEY_EXACT))
+      goto err;
+    if (maria_delete(file,read_record3)) goto err;
+    opt_delete++;
+    ant=1;
+    if (maria_rnext(file,read_record,0))
+      goto err;					/* Skall finnas poster */
+    while (maria_rnext(file,read_record3,0) == 0 &&
+	   bcmp(read_record3+start,key,length) == 0) ant++;
+    if (ant != dupp_keys-3)
+    {
+      printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-3);
+      goto err;
+    }
+
+    if (!silent)
+      printf("- Read last - delete - prev -> first\n");
+    DBUG_PRINT("progpos",("last - delete - prev -> first"));
+    if (maria_rprev(file,read_record3,0)) goto err;
+    if (maria_delete(file,read_record3)) goto err;
+    opt_delete++;
+    ant=0;
+    while (maria_rprev(file,read_record3,0) == 0 &&
+	   bcmp(read_record3+start,key,length) == 0) ant++;
+    if (ant != dupp_keys-4)
+    {
+      printf("next: I can only find: %d keys of %d\n",ant,dupp_keys-4);
+      goto err;
+    }
+  }
+
+  if (!silent)
+    puts("- Test if: Read rrnd - same");
+  DBUG_PRINT("progpos",("Read rrnd - same"));
+  assert(maria_scan_init(file) == 0);
+  for (i=0 ; i < write_count ; i++)
+  {
+    int tmp;
+    if ((tmp= maria_scan(file,read_record)) &&
+        tmp != HA_ERR_END_OF_FILE &&
+        tmp != HA_ERR_RECORD_DELETED)
+    {
+      printf("Got error %d when scanning table\n", tmp);
+      break;
+    }
+    if (!tmp)
+    {
+      /* Remember position to last found row */
+      info.recpos= maria_position(file);
+      bmove(read_record2,read_record,reclength);
+    }
+  }
+  maria_scan_end(file);
+  if (i != write_count && i != write_count - opt_delete)
+  {
+    printf("Found wrong number of rows while scanning table\n");
+    goto err;
+  }
+
+  if (maria_rsame_with_pos(file,read_record,0,info.recpos))
+    goto err;
+  if (bcmp(read_record,read_record2,reclength) != 0)
+  {
+    printf("maria_rsame_with_pos didn't find same record\n");
+    goto err;
+  }
+
+  for (i=min(2,keys) ; i-- > 0 ;)
+  {
+    if (maria_rsame(file,read_record2,(int) i)) goto err;
+    if (bcmp(read_record,read_record2,reclength) != 0)
+    {
+      printf("maria_rsame didn't find same record\n");
+      goto err;
+    }
+  }
+  if (!silent)
+    puts("- Test maria_records_in_range");
+  maria_status(file,&info,HA_STATUS_VARIABLE);
+  for (i=0 ; i < info.keys ; i++)
+  {
+    key_range min_key, max_key;
+    if (maria_rfirst(file,read_record,(int) i) ||
+	maria_rlast(file,read_record2,(int) i))
+      goto err;
+    copy_key(file,(uint) i, read_record,  key);
+    copy_key(file,(uint) i, read_record2, key2);
+    min_key.key= key;
+    min_key.keypart_map= HA_WHOLE_KEY;
+    min_key.flag= HA_READ_KEY_EXACT;
+    max_key.key= key2;
+    max_key.keypart_map= HA_WHOLE_KEY;
+    max_key.flag= HA_READ_AFTER_KEY;
+
+    range_records= maria_records_in_range(file,(int) i, &min_key, &max_key);
+    if (range_records < info.records*8/10 ||
+	range_records > info.records*12/10)
+    {
+      printf("maria_records_range returned %ld; Should be about %ld\n",
+	     (long) range_records,(long) info.records);
+      goto err;
+    }
+    if (verbose)
+    {
+      printf("maria_records_range returned %ld;  Exact is %ld  (diff: %4.2g %%)\n",
+	     (long) range_records, (long) info.records,
+	     labs((long) range_records - (long) info.records)*100.0/
+	     info.records);
+    }
+  }
+  for (i=0 ; i < 5 ; i++)
+  {
+    for (j=rnd(1000)+1 ; j>0 && key1[j] == 0 ; j--) ;
+    for (k=rnd(1000)+1 ; k>0 && key1[k] == 0 ; k--) ;
+    if (j != 0 && k != 0)
+    {
+      key_range min_key, max_key;
+      if (j > k)
+	swap_variables(int, j, k);
+      sprintf((char*) key,"%6d",j);
+      sprintf((char*) key2,"%6d",k);
+
+      min_key.key= key;
+      min_key.keypart_map= HA_WHOLE_KEY;
+      min_key.flag= HA_READ_AFTER_KEY;
+      max_key.key= key2;
+      max_key.keypart_map= HA_WHOLE_KEY;
+      max_key.flag= HA_READ_BEFORE_KEY;
+      range_records= maria_records_in_range(file, 0, &min_key, &max_key);
+      records=0;
+      for (j++ ; j < k ; j++)
+	records+=key1[j];
+      if ((long) range_records < (long) records*7/10-2 ||
+	  (long) range_records > (long) records*14/10+2)
+      {
+	printf("maria_records_range for key: %d returned %lu; Should be about %lu\n",
+	       i, (ulong) range_records, (ulong) records);
+	goto err;
+      }
+      if (verbose && records)
+      {
+	printf("maria_records_range returned %lu;  Exact is %lu  (diff: %4.2g %%)\n",
+	       (ulong) range_records, (ulong) records,
+	       labs((long) range_records-(long) records)*100.0/records);
+
+      }
+    }
+    }
+
+  if (!silent)
+    printf("- maria_info\n");
+  maria_status(file,&info,HA_STATUS_VARIABLE | HA_STATUS_CONST);
+  if (info.records != write_count-opt_delete || info.deleted > opt_delete + update
+      || info.keys != keys)
+  {
+    puts("Wrong info from maria_info");
+    printf("Got: records: %lu  delete: %lu  i_keys: %d\n",
+	   (ulong) info.records, (ulong) info.deleted, info.keys);
+    goto err;
+  }
+  if (verbose)
+  {
+    char buff[80];
+    get_date(buff,3,info.create_time);
+    printf("info: Created %s\n",buff);
+    get_date(buff,3,info.check_time);
+    printf("info: checked %s\n",buff);
+    get_date(buff,3,info.update_time);
+    printf("info: Modified %s\n",buff);
+  }
+
+  maria_panic(HA_PANIC_WRITE);
+  maria_panic(HA_PANIC_READ);
+  if (maria_is_changed(file))
+    puts("Warning: maria_is_changed reported that datafile was changed");
+
+  if (!silent)
+    printf("- maria_extra(CACHE) + maria_rrnd.... + maria_extra(NO_CACHE)\n");
+  if (maria_reset(file) || maria_extra(file,HA_EXTRA_CACHE,0))
+  {
+    if (do_locking || (!use_blob && !pack_fields))
+    {
+      puts("got error from maria_extra(HA_EXTRA_CACHE)");
+      goto err;
+    }
+  }
+  ant=0;
+  assert(maria_scan_init(file) == 0);
+  while ((error= maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+	 ant < write_count + 10)
+    ant+= error ? 0 : 1;
+  maria_scan_end(file);
+  if (ant != write_count-opt_delete)
+  {
+    printf("scan with cache: I can only find: %d records of %d\n",
+	   ant,write_count-opt_delete);
+    maria_scan_end(file);
+    goto err;
+  }
+  if (maria_extra(file,HA_EXTRA_NO_CACHE,0))
+  {
+    puts("got error from maria_extra(HA_EXTRA_NO_CACHE)");
+    maria_scan_end(file);
+    goto err;
+  }
+  maria_scan_end(file);
+
+  ant=0;
+  maria_scan_init(file);
+  while ((error=maria_scan(file,record)) != HA_ERR_END_OF_FILE &&
+	 ant < write_count + 10)
+	ant+= error ? 0 : 1;
+  if (ant != write_count-opt_delete)
+  {
+    printf("scan with cache: I can only find: %d records of %d\n",
+	   ant,write_count-opt_delete);
+    maria_scan_end(file);
+    goto err;
+  }
+  maria_scan_end(file);
+
+  if (testflag == 5)
+    goto end;
+  if (checkpoint == 5 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+  if (!silent)
+    printf("- Removing keys\n");
+  DBUG_PRINT("progpos",("Removing keys"));
+  lastpos = HA_OFFSET_ERROR;
+  /* DBUG_POP(); */
+  maria_reset(file);
+  found_parts=0;
+  maria_scan_init(file);
+  while ((error= maria_scan(file,read_record)) != HA_ERR_END_OF_FILE)
+  {
+    info.recpos=maria_position(file);
+    if (lastpos >= info.recpos && lastpos != HA_OFFSET_ERROR)
+    {
+      printf("maria_rrnd didn't advance filepointer; old: %ld, new: %ld\n",
+	     (long) lastpos, (long) info.recpos);
+      goto err;
+    }
+    lastpos=info.recpos;
+    if (error == 0)
+    {
+      if (opt_delete == (uint) remove_count)		/* While testing */
+	goto end;
+      if (rnd(2) == 1 && maria_rsame(file,read_record,-1))
+      {
+	printf("can't find record %lx\n",(long) info.recpos);
+	goto err;
+      }
+      if (use_blob)
+      {
+	ulong blob_length,pos;
+	uchar *ptr;
+	memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr));
+        blob_length= uint4korr(read_record+blob_pos);
+	for (pos=0 ; pos < blob_length ; pos++)
+	{
+	  if (ptr[pos] != (uchar) (blob_length+pos))
+	  {
+	    printf("Found blob with wrong info at %ld\n",(long) lastpos);
+            maria_scan_end(file);
+            my_errno= 0;
+	    goto err;
+	  }
+	}
+      }
+      if (maria_delete(file,read_record))
+      {
+	printf("can't delete record: %6.6s, delete_count: %d\n",
+	       read_record, opt_delete);
+        maria_scan_end(file);
+	goto err;
+      }
+      opt_delete++;
+    }
+    else
+      found_parts++;
+  }
+  if (my_errno != HA_ERR_END_OF_FILE && my_errno != HA_ERR_RECORD_DELETED)
+    printf("error: %d from maria_rrnd\n",my_errno);
+  if (write_count != opt_delete)
+  {
+    printf("Deleted only %d of %d records (%d parts)\n",opt_delete,write_count,
+	   found_parts);
+    maria_scan_end(file);
+    goto err;
+  }
+  if (testflag == 6)
+    goto end;
+  if (checkpoint == 6 && ma_checkpoint_execute(CHECKPOINT_MEDIUM, FALSE))
+    goto err;
+
+end:
+  maria_scan_end(file);
+  if (die_in_middle_of_transaction)
+  {
+    /* As commit record is not done, UNDO entries needs to be rolled back */
+    switch (die_in_middle_of_transaction) {
+    case 1:
+      /*
+        Flush changed data and index pages go to disk
+        That will also flush log. Recovery will skip REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
+                            FLUSH_RELEASE, FLUSH_RELEASE);
+      break;
+    case 2:
+      /*
+        Just flush log. Pages are likely to not be on disk. Recovery will
+        then execute REDOs and UNDOs.
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    case 3:
+      /*
+        Flush nothing. Pages and log are likely to not be on disk. Recovery
+        will then do nothing.
+      */
+      break;
+    case 4:
+      /*
+        Flush changed data pages go to disk. Changed index pages are not
+        flushed. Recovery will skip some REDOs and apply UNDOs.
+      */
+      _ma_flush_table_files(file, MARIA_FLUSH_DATA, FLUSH_RELEASE,
+                            FLUSH_RELEASE);
+      /*
+        We have to flush log separately as the redo for the last key page
+        may not be flushed
+      */
+      if (translog_flush(file->trn->undo_lsn))
+        goto err;
+      break;
+    }
+    printf("Dying on request without maria_commit()/maria_close()\n");
+    exit(0);
+  }
+  if (maria_commit(file))
+    goto err;
+  if (maria_close(file))
+  {
+    file= 0;
+    goto err;
+  }
+  file= 0;
+  maria_panic(HA_PANIC_CLOSE);			/* Should close log */
+  if (!silent)
+  {
+    printf("\nFollowing test have been made:\n");
+    printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete);
+    if (rec_pointer_size)
+      printf("Record pointer size:  %d\n",rec_pointer_size);
+    printf("maria_block_size:    %lu\n", maria_block_size);
+    if (write_cacheing)
+      puts("Key cache resized");
+    if (write_cacheing)
+      puts("Write cacheing used");
+    if (write_cacheing)
+      puts("quick mode");
+    if (async_io && do_locking)
+      puts("Asyncron io with locking used");
+    else if (do_locking)
+      puts("Locking used");
+    if (use_blob)
+      puts("blobs used");
+    printf("key cache status: \n\
+blocks used:%10lu\n\
+not flushed:%10lu\n\
+w_requests: %10lu\n\
+writes:     %10lu\n\
+r_requests: %10lu\n\
+reads:      %10lu\n",
+           maria_pagecache->blocks_used,
+           maria_pagecache->global_blocks_changed,
+           (ulong) maria_pagecache->global_cache_w_requests,
+           (ulong) maria_pagecache->global_cache_write,
+           (ulong) maria_pagecache->global_cache_r_requests,
+           (ulong) maria_pagecache->global_cache_read);
+  }
+  maria_end();
+  my_free(blob_buffer, MYF(MY_ALLOW_ZERO_PTR));
+  my_end(silent ? MY_CHECK_ERROR : MY_CHECK_ERROR | MY_GIVE_INFO);
+  return(0);
+err:
+  printf("got error: %d when using MARIA-database\n",my_errno);
+  if (file)
+  {
+    if (maria_commit(file))
+      goto err;
+    VOID(maria_close(file));
+  }
+  maria_end();
+  return(1);
+} /* main */
+
+
+/*
+  Read options
+
+  Parses the command line and stores the result in variables that are
+  declared outside this function.  Parsing stops at the first argument
+  that does not start with '-'.  Only the first character after '-' selects
+  the option; where an option takes a value, the value must follow the
+  option letter directly in the same argument (for example -m100).
+
+  -?, -I and -V print version and usage information and call exit(0).
+  An unknown option letter is reported on stdout and otherwise ignored.
+*/
+
+static void get_options(int argc, char **argv)
+{
+  char *pos,*progname;
+
+  progname= argv[0];
+
+  /* pos is left pointing at the option letter by the switch expression */
+  while (--argc >0 && *(pos = *(++argv)) == '-' ) {
+    switch(*++pos) {
+    case 'B':
+      pack_type= HA_BINARY_PACK_KEY;
+      break;
+    case 'b':
+      /* 1000 is used when no value is attached to the option */
+      use_blob= 1000;
+      if (*++pos)
+        use_blob= atol(pos);
+      break;
+    case 'K':				/* Use key cacheing */
+      pagecacheing=1;
+      if (*++pos)
+	pagecache_size=atol(pos);
+      break;
+    case 'W':				/* Use write cacheing */
+      write_cacheing=1;
+      if (*++pos)
+	my_default_record_cache_size=atoi(pos);
+      break;
+    case 'd':
+      remove_count= atoi(++pos);
+      break;
+    case 'i':
+      /* Seed rand(); the seed is also remembered in srand_arg */
+      if (*++pos)
+	srand(srand_arg= atoi(pos));
+      break;
+    case 'L':
+      do_locking=1;
+      break;
+    case 'a':				/* use asyncron io */
+      async_io=1;
+      if (*++pos)
+	my_default_record_cache_size=atoi(pos);
+      break;
+    case 'v':				/* verbose */
+      verbose=1;
+      break;
+    case 'm':				/* records */
+      /* The check uses the testflag value seen so far, so -t must precede -m */
+      if ((recant=atoi(++pos)) < 10 && testflag > 2)
+      {
+	fprintf(stderr,"record count must be >= 10 (if testflag > 2)\n");
+	exit(1);
+      }
+      break;
+    case 'e':				/* maria_block_length */
+    case 'E':
+      /* Reject values outside the allowed range, then round up */
+      if ((maria_block_size= atoi(++pos)) < MARIA_MIN_KEY_BLOCK_LENGTH ||
+	  maria_block_size > MARIA_MAX_KEY_BLOCK_LENGTH)
+      {
+	fprintf(stderr,"Wrong maria_block_length\n");
+	exit(1);
+      }
+      maria_block_size= my_round_up_to_next_power(maria_block_size);
+      break;
+    case 'f':
+      /* Out of range values silently fall back to 0 */
+      if ((first_key=atoi(++pos)) < 0 || first_key >= MARIA_KEYS)
+	first_key=0;
+      break;
+    case 'H':
+      checkpoint= atoi(++pos);
+      break;
+    case 'k':
+      /* Out of range values fall back to all keys from first_key onwards */
+      if ((keys=(uint) atoi(++pos)) < 1 ||
+	   keys > (uint) (MARIA_KEYS-first_key))
+	keys=MARIA_KEYS-first_key;
+      break;
+    case 'M':
+      record_type= BLOCK_RECORD;
+      break;
+    case 'P':
+      pack_type=0;			/* Don't use DIFF_LENGTH */
+      pack_seg=0;
+      break;
+    case 'R':				/* Length of record pointer */
+      rec_pointer_size=atoi(++pos);
+      if (rec_pointer_size > 7)
+	rec_pointer_size=0;
+      break;
+    case 'S':
+      pack_fields=0;			/* Static-length-records */
+      record_type= STATIC_RECORD;
+      break;
+    case 's':
+      silent=1;
+      break;
+    case 't':
+      testflag=atoi(++pos);		/* testmod */
+      break;
+    case 'T':
+      transactional= 1;
+      break;
+    case 'A':
+      die_in_middle_of_transaction= atoi(++pos);
+      break;
+    case 'u':
+      /* A count of 0 (or a missing value) also sets skip_update */
+      update_count=atoi(++pos);
+      if (!update_count)
+        skip_update= 1;
+      break;
+    case 'q':
+      opt_quick_mode=1;
+      break;
+    case 'c':
+      create_flag|= HA_CREATE_CHECKSUM | HA_CREATE_PAGE_CHECKSUM;
+      break;
+    case 'D':
+      create_flag|=HA_CREATE_DELAY_KEY_WRITE;
+      break;
+    case 'g':
+      skip_update= TRUE;
+      break;
+    case 'C':
+      opt_versioning= 1;
+      break;
+    case '?':
+    case 'I':
+    case 'V':
+      printf("%s  Ver 1.2 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+      puts("By Monty, for testing Maria\n");
+      printf("Usage: %s [-?AbBcCDIKLPRqSsTVWltv] [-k#] [-f#] [-m#] [-e#] [-E#] [-t#]\n",
+	     progname);
+      exit(0);
+    case '#':
+      DBUG_PUSH (++pos);
+      break;
+    default:
+      /* Unknown options are reported but do not stop parsing */
+      printf("Illegal option: '%c'\n",*pos);
+      break;
+    }
+  }
+  return;
+} /* get options */
+
+	/* Get a random value 0 <= x <= n */
+
+static uint rnd(uint max_value)
+{
+  /*
+    Scale the low 15 bits of rand() into the closed range [0, max_value];
+    max_value itself is only returned when all 15 bits are set.
+  */
+  const int low_bits= rand() & 32767;
+
+  return (uint) (low_bits / 32767.0 * max_value);
+} /* rnd */
+
+
+	/* Create a variable length record */
+
+static void fix_length(uchar *rec, uint length)
+{
+  /*
+    The bytes from STANDARD_LENGTH up to 'length' are taken from a string of
+    digits; strfill() is then asked to put STANDARD_LENGTH+60-length spaces
+    at rec+length.
+  */
+  static const char digits[]=
+    "0123456789012345678901234567890123456789012345678901234567890";
+  uchar *tail= rec + length;
+
+  bmove(rec+STANDARD_LENGTH, digits, length-STANDARD_LENGTH);
+  strfill((char*) tail, STANDARD_LENGTH+60-length, ' ');
+} /* fix_length */
+
+
+/*
+  Put maybe a blob in record
+
+  Does nothing except setting *blob_length to 0 when use_blob is 0.
+  Otherwise *blob_buffer is allocated on first use (use_blob bytes) and,
+  when rnd(10) returns 0, filled with a length-dependent byte pattern.
+  The 4 byte length and the buffer pointer are then stored at blob_pos and
+  blob_pos+4.  In all other cases a zero length is stored at blob_pos.
+
+  If the allocation fails, use_blob is reset to 0 and the record is left
+  untouched.
+*/
+
+/* Zero until the first blob has been generated */
+static int first_entry;
+
+static void put_blob_in_record(uchar *blob_pos, char **blob_buffer,
+                               ulong *blob_length)
+{
+  ulong idx,len;
+
+  *blob_length= 0;
+  if (!use_blob)
+    return;
+
+  if (! *blob_buffer)
+  {
+    *blob_buffer= my_malloc((uint) use_blob,MYF(MY_WME));
+    if (! *blob_buffer)
+    {
+      use_blob= 0;
+      return;
+    }
+  }
+
+  if (rnd(10) != 0)
+  {
+    int4store(blob_pos,0);
+    return;
+  }
+
+  if (first_entry++ == 0)
+  {
+    /* Ensure we have at least one blob of max length in file */
+    len= use_blob;
+  }
+  else
+    len= rnd(use_blob);
+
+  for (idx= 0 ; idx < len ; idx++)
+    (*blob_buffer)[idx]= (char) (len+idx);
+  int4store(blob_pos,len);
+  /* Copies the pointer value held in *blob_buffer, not the blob data */
+  memcpy_fixed(blob_pos+4,(char*) blob_buffer,sizeof(char*));
+  *blob_length= len;
+}
+
+
+/*
+  Copy the key segments of index 'inx' from the row 'rec' into 'key_buff',
+  one after another.  The segment array is walked until an entry whose
+  type is 0.
+*/
+static void copy_key(MARIA_HA *info,uint inx,uchar *rec,uchar *key_buff)
+{
+  HA_KEYSEG *seg= info->s->keyinfo[inx].seg;
+  uchar *out= key_buff;
+
+  while (seg->type)
+  {
+    memcpy(out, rec + seg->start, (size_t) seg->length);
+    out+= seg->length;
+    seg++;
+  }
+}
diff --git a/storage/maria/ma_test3.c b/storage/maria/ma_test3.c
new file mode 100644
index 00000000000..040d6fa78c2
--- /dev/null
+++ b/storage/maria/ma_test3.c
@@ -0,0 +1,501 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Test av locking */
+
+#if !(defined (__NETWARE__) || defined (_WIN32)) /*no fork() in Windows*/
+
+#include "maria.h"
+#include <sys/types.h>
+#ifdef HAVE_SYS_WAIT_H
+# include <sys/wait.h>
+#endif
+#ifndef WEXITSTATUS
+# define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
+#endif
+#ifndef WIFEXITED
+# define WIFEXITED(stat_val) (((stat_val) & 255) == 0)
+#endif
+
+
+/*
+  rnd(X) expands to a pseudo-random value modulo X and rnd_init(X) seeds
+  the generator.  lrand48()/srand48() are used when HAVE_LRAND48 is defined,
+  random()/srandom() otherwise.  The argument is parenthesized so that an
+  expression such as rnd(a+b) is reduced modulo the whole expression.
+*/
+#if defined(HAVE_LRAND48)
+#define rnd(X) (lrand48() % (X))
+#define rnd_init(X) srand48(X)
+#else
+#define rnd(X) (random() % (X))
+#define rnd_init(X) srandom(X)
+#endif
+
+
+const char *filename= "test3";
+uint tests=10,forks=10,pagecacheing=0;
+
+static void get_options(int argc, char *argv[]);
+void start_test(int id);
+int test_read(MARIA_HA *,int),test_write(MARIA_HA *,int,int),
+    test_update(MARIA_HA *,int,int),test_rrnd(MARIA_HA *,int);
+
+/*
+  Row layout of the test table, and the global row buffer used by the test
+  functions.  main() defines key 0 on bytes 0..7 and key 1 on bytes 8..11.
+*/
+struct record {
+  uchar id[8];				/* filled with the pid by test_write() */
+  uchar nr[4];				/* written with int4store() */
+  uchar text[10];
+} record;
+
+
+/*
+  Create the test table, start 'forks' child processes that each run
+  start_test(), and wait for every child that was actually started.
+
+  The status collected by wait() is not examined.
+
+  Returns 0 when all children could be started and 1 if fork() failed.
+  Calls exit(1) if the table cannot be created.
+*/
+int main(int argc,char **argv)
+{
+  int status,wait_ret;
+  uint i=0, started=0;
+  MARIA_KEYDEF keyinfo[10];
+  MARIA_COLUMNDEF recinfo[10];
+  HA_KEYSEG keyseg[10][2];
+  MY_INIT(argv[0]);
+  get_options(argc,argv);
+
+  fprintf(stderr, "WARNING! this program is to test 'external locking'"
+          " (when several processes share a table through file locking)"
+          " which is not supported by Maria at all; expect errors."
+          " We may soon remove this program.\n");
+  maria_init();
+  bzero((char*) keyinfo,sizeof(keyinfo));
+  bzero((char*) recinfo,sizeof(recinfo));
+  bzero((char*) keyseg,sizeof(keyseg));
+  /* Key 0: 8 bytes of text at record offset 0 */
+  keyinfo[0].seg= &keyseg[0][0];
+  keyinfo[0].seg[0].start=0;
+  keyinfo[0].seg[0].length=8;
+  keyinfo[0].seg[0].type=HA_KEYTYPE_TEXT;
+  keyinfo[0].seg[0].flag=HA_SPACE_PACK;
+  keyinfo[0].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[0].keysegs=1;
+  keyinfo[0].flag = (uint8) HA_PACK_KEY;
+  keyinfo[0].block_length= 0;                   /* Default block length */
+  /* Key 1: 4 byte integer at record offset 8, flagged HA_NOSAME */
+  keyinfo[1].seg= &keyseg[1][0];
+  keyinfo[1].seg[0].start=8;
+  keyinfo[1].seg[0].length=4;		/* Long is always 4 in maria */
+  keyinfo[1].seg[0].type=HA_KEYTYPE_LONG_INT;
+  keyinfo[1].seg[0].flag=0;
+  keyinfo[1].key_alg=HA_KEY_ALG_BTREE;
+  keyinfo[1].keysegs=1;
+  keyinfo[1].flag =HA_NOSAME;
+  keyinfo[1].block_length= 0;                   /* Default block length */
+
+  recinfo[0].type=0;
+  recinfo[0].length=sizeof(record.id);
+  recinfo[1].type=0;
+  recinfo[1].length=sizeof(record.nr);
+  recinfo[2].type=0;
+  recinfo[2].length=sizeof(record.text);
+
+  puts("- Creating maria-file");
+  my_delete(filename,MYF(0));		/* Remove old locks under gdb */
+  if (maria_create(filename,BLOCK_RECORD, 2, &keyinfo[0],2,&recinfo[0],0,
+                   (MARIA_UNIQUEDEF*) 0, (MARIA_CREATE_INFO*) 0,0))
+    exit(1);
+
+  rnd_init(0);
+  printf("- Starting %d processes\n",forks); fflush(stdout);
+  for (i=0 ; i < forks; i++)
+  {
+    pid_t pid= fork();
+    if (pid < 0)
+    {
+      /* Stop forking; only the children started so far are waited for */
+      fprintf(stderr,"Can't fork process %u (errno: %d)\n", i+1, errno);
+      break;
+    }
+    if (pid == 0)
+    {
+      start_test(i+1);
+      sleep(1);
+      return 0;
+    }
+    started++;
+    /* Advance the generator so that the next child gets another sequence */
+    VOID(rnd(1));
+  }
+
+  /*
+    Reap one child per iteration.  wait() is only retried when it was
+    interrupted by a signal; any other failure (for example ECHILD) ends
+    the wait for that child instead of looping forever.
+  */
+  for (i=0 ; i < started ; i++)
+    while ((wait_ret=wait(&status)) == -1 && errno == EINTR) ;
+  maria_end();
+  return (started == forks) ? 0 : 1;
+}
+
+
+/*
+  Parse the command line into the global variables forks, tests and
+  pagecacheing.  Parsing stops at the first argument that does not start
+  with '-'.  Values for -f, -t and -# must follow the option letter
+  directly in the same argument.  -?, -I and -V print usage and exit(0).
+*/
+static void get_options(int argc, char **argv)
+{
+  char *pos,*progname;
+
+  progname= argv[0];
+
+  for (argc--, argv++ ; argc > 0 ; argc--, argv++)
+  {
+    pos= *argv;
+    if (*pos != '-')
+      break;
+    pos++;				/* Now at the option letter */
+    switch (*pos) {
+    case 'f':
+      forks= atoi(pos+1);
+      break;
+    case 't':
+      tests= atoi(pos+1);
+      break;
+    case 'K':				/* Use key cacheing */
+    case 'A':				/* All flags */
+      pagecacheing= 1;
+      break;
+    case '?':
+    case 'I':
+    case 'V':
+      printf("%s  Ver 1.0 for %s at %s\n",progname,SYSTEM_TYPE,MACHINE_TYPE);
+      puts("By Monty, for your professional use\n");
+      puts("Test av locking with threads\n");
+      printf("Usage: %s [-?lKA] [-f#] [-t#]\n",progname);
+      exit(0);
+    case '#':
+      DBUG_PUSH (pos+1);
+      break;
+    default:
+      printf("Illegal option: '%c'\n",*pos);
+      break;
+    }
+  }
+}
+
+
+/*
+  Body of one test process.
+
+  Opens the table twice, then runs 'tests' iterations.  Each iteration
+  picks one of the two handles and one of test_read(), test_rrnd(),
+  test_write() and test_update() with rnd(); when rnd(10) returns 0 a lock
+  is first taken on one of the two handles and released after the test.
+
+  id is only used as a label in the printed messages.
+
+  Calls exit(1) if the table cannot be opened or if any test reported an
+  error; otherwise it returns after closing both handles.
+*/
+void start_test(int id)
+{
+  uint i;
+  int error,lock_type;
+  MARIA_INFO isam_info;
+  MARIA_HA *file,*file1,*file2=0,*lock;
+
+  if (!(file1=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)) ||
+      !(file2=maria_open(filename,O_RDWR,HA_OPEN_WAIT_IF_LOCKED)))
+  {
+    fprintf(stderr,"Can't open isam-file: %s\n",filename);
+    exit(1);
+  }
+  if (pagecacheing && rnd(2) == 0)
+    init_pagecache(maria_pagecache, 65536L, 0, 0, MARIA_KEY_BLOCK_LENGTH,
+                   MY_WME);
+  printf("Process %d, pid: %ld\n",id,(long) getpid()); fflush(stdout);
+
+  for (error=i=0 ; i < tests && !error; i++)
+  {
+    file= (rnd(2) == 1) ? file1 : file2;
+    lock=0 ; lock_type=0;
+    if (rnd(10) == 0)
+    {
+      /*
+        NOTE(review): the two rnd() calls below are in different arguments
+        of the same call, so C does not specify which one is evaluated
+        first.
+      */
+      if (maria_lock_database(lock=(rnd(2) ? file1 : file2),
+			   lock_type=(rnd(2) == 0 ? F_RDLCK : F_WRLCK)))
+      {
+	fprintf(stderr,"%2d: start: Can't lock table %d\n",id,my_errno);
+	error=1;
+	break;
+      }
+    }
+    switch (rnd(4)) {
+    case 0: error=test_read(file,id); break;
+    case 1: error=test_rrnd(file,id); break;
+    case 2: error=test_write(file,id,lock_type); break;
+    case 3: error=test_update(file,id,lock_type); break;
+    }
+    /* lock is only set when the lock above was taken successfully */
+    if (lock)
+      maria_lock_database(lock,F_UNLCK);
+  }
+  if (!error)
+  {
+    maria_status(file1,&isam_info,HA_STATUS_VARIABLE);
+    printf("%2d: End of test.  Records:  %ld  Deleted:  %ld\n",
+	   id,(long) isam_info.records, (long) isam_info.deleted);
+    fflush(stdout);
+  }
+
+  /* Both handles are closed here, whether or not a test failed */
+  maria_close(file1);
+  maria_close(file2);
+  if (error)
+  {
+    printf("%2d: Aborted\n",id); fflush(stdout);
+    exit(1);
+  }
+}
+
+
+/*
+  Read test: 100 exact-match lookups of random values on key 1.
+
+  When a value is not found the function falls back to maria_rnext() and,
+  if that reports end of file, to maria_rprev().  The table is read-locked
+  around the lookups when rnd(2) returns 0.  The number of successful
+  reads of each kind is printed at the end.
+
+  file  Handle to read from
+  id    Process number, used as a label in messages
+
+  Returns 0 on success and 1 on any unexpected error.
+*/
+int test_read(MARIA_HA *file,int id)
+{
+  uint i,lock,found,next,prev;
+  uint32 tmp;
+  char find[4];
+
+  lock=0;
+  if (rnd(2) == 0)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_RDLCK))
+    {
+      fprintf(stderr,"%2d: Can't lock table %d\n",id,my_errno);
+      return 1;
+    }
+  }
+
+  found=next=prev=0;
+  for (i=0 ; i < 100 ; i++)
+  {
+    /*
+      Key 1 is written to the row with int4store() (see test_write()), so
+      the search key is built the same way.  Passing the address of a
+      native ulong instead would depend on the size and byte order of
+      ulong on the host.
+    */
+    tmp=rnd(100000);
+    int4store(find,tmp);
+    if (!maria_rkey(file,record.id,1,(uchar*) find, HA_WHOLE_KEY,
+                    HA_READ_KEY_EXACT))
+      found++;
+    else
+    {
+      if (my_errno != HA_ERR_KEY_NOT_FOUND)
+      {
+	fprintf(stderr,"%2d: Got error %d from read in read\n",id,my_errno);
+	return 1;
+      }
+      else if (!maria_rnext(file,record.id,1))
+	next++;
+      else
+      {
+	if (my_errno != HA_ERR_END_OF_FILE)
+	{
+	  fprintf(stderr,"%2d: Got error %d from rnext in read\n",id,my_errno);
+	  return 1;
+	}
+	else if (!maria_rprev(file,record.id,1))
+	  prev++;
+	else
+	{
+	  if (my_errno != HA_ERR_END_OF_FILE)
+	  {
+	    fprintf(stderr,"%2d: Got error %d from rprev in read\n",
+		    id,my_errno);
+	    return 1;
+	  }
+	}
+      }
+    }
+  }
+  if (lock)
+  {
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      return 1;
+    }
+  }
+  printf("%2d: read:   found: %5d  next: %5d   prev: %5d\n",
+	 id,found,next,prev);
+  fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Scan test: read rows with maria_rrnd() until it fails and print the
+  number of rows that were read.
+
+  When rnd(2) returns 0 the table is read-locked for the scan and, with a
+  second rnd(2) == 0, HA_EXTRA_CACHE is switched on.
+
+  file  Handle to read from.  It is never closed here: start_test() closes
+        both of its handles itself, so closing it in this function would
+        make the caller close the same handle a second time.
+  id    Process number, used as a label in messages
+
+  Returns 0 on success and 1 on any error, including a failed unlock.
+*/
+int test_rrnd(MARIA_HA *file,int id)
+{
+  uint count,lock;
+
+  lock=0;
+  if (rnd(2) == 0)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_RDLCK))
+    {
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      return 1;
+    }
+    if (rnd(2) == 0)
+      maria_extra(file,HA_EXTRA_CACHE,0);
+  }
+
+  count=0;
+  if (maria_rrnd(file,record.id,0L))
+  {
+    /* End of file on the first read is not an error; count stays 0 */
+    if (my_errno == HA_ERR_END_OF_FILE)
+      goto end;
+    fprintf(stderr,"%2d: Can't read first record (%d)\n",id,my_errno);
+    return 1;
+  }
+  for (count=1 ; !maria_rrnd(file,record.id,HA_OFFSET_ERROR) ;count++) ;
+  if (my_errno != HA_ERR_END_OF_FILE)
+  {
+    fprintf(stderr,"%2d: Got error %d from rrnd\n",id,my_errno);
+    return 1;
+  }
+
+end:
+  if (lock)
+  {
+    maria_extra(file,HA_EXTRA_NO_CACHE,0);
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      /* Report the failure to the caller, as test_read() does */
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      return 1;
+    }
+  }
+  printf("%2d: rrnd:   %5d\n",id,count); fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Write test: insert rnd(100)+10 rows whose 'id' is the pid of this
+  process and whose 'nr' is a random value, and print how many inserts
+  succeeded.  Duplicate key errors are expected and ignored.
+
+  A write lock is taken when rnd(2) returns 0 or when the caller already
+  holds a read lock (lock_type == F_RDLCK).  In the latter case an EDEADLK
+  failure is reported as "deadlock" and is not treated as an error.
+
+  file       Handle to write to.  It is never closed here: start_test()
+             closes both of its handles itself, so closing it in this
+             function would make the caller close the same handle twice.
+  id         Process number, used as a label in messages
+  lock_type  Lock already taken by the caller, or 0
+
+  Returns 0 on success and 1 on any error, including a failed unlock.
+*/
+int test_write(MARIA_HA *file,int id,int lock_type)
+{
+  uint i,tries,count,lock;
+
+  lock=0;
+  if (rnd(2) == 0 || lock_type == F_RDLCK)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_WRLCK))
+    {
+      if (lock_type == F_RDLCK && my_errno == EDEADLK)
+      {
+	printf("%2d: write:  deadlock\n",id); fflush(stdout);
+	return 0;
+      }
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      return 1;
+    }
+    if (rnd(2) == 0)
+      maria_extra(file,HA_EXTRA_WRITE_CACHE,0);
+  }
+
+  /* NOTE(review): record.id is 8 bytes; this assumes a pid of at most 7 digits */
+  sprintf((char*) record.id,"%7ld", (long) getpid());
+  strnmov((char*) record.text,"Testing...", sizeof(record.text));
+
+  tries=(uint) rnd(100)+10;
+  for (i=count=0 ; i < tries ; i++)
+  {
+    uint32 tmp=rnd(80000)+20000;
+    int4store(record.nr,tmp);
+    if (!maria_write(file,record.id))
+      count++;
+    else
+    {
+      if (my_errno != HA_ERR_FOUND_DUPP_KEY)
+      {
+	fprintf(stderr,"%2d: Got error %d (errno %d) from write\n",id,my_errno,
+		errno);
+	return 1;
+      }
+    }
+  }
+  if (lock)
+  {
+    maria_extra(file,HA_EXTRA_NO_CACHE,0);
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      /* Report the failure to the caller, as test_read() does */
+      fprintf(stderr,"%2d: Can't unlock table\n",id);
+      return 1;
+    }
+  }
+  printf("%2d: write:  %5d\n",id,count); fflush(stdout);
+  return 0;
+}
+
+
+/*
+  Update test: 100 times, position on a row through key 1 (exact match on
+  a random value, else maria_rnext(), else maria_rprev()) and update it
+  with a new random 'nr' and the text "Updated".  The number of successful
+  updates is printed at the end.
+
+  A write lock is taken when rnd(2) returns 0 or when the caller already
+  holds a read lock (lock_type == F_RDLCK).  In the latter case an EDEADLK
+  failure is reported as "deadlock" and is not treated as an error.
+
+  file       Handle to work on
+  id         Process number, used as a label in messages
+  lock_type  Lock already taken by the caller, or 0
+
+  Returns 0 on success and 1 on any unexpected error.
+*/
+int test_update(MARIA_HA *file,int id,int lock_type)
+{
+  uint i,lock,found,next,prev,update;
+  uint32 tmp;
+  char find[4];
+  struct record new_record;
+
+  lock=0;
+  if (rnd(2) == 0 || lock_type == F_RDLCK)
+  {
+    lock=1;
+    if (maria_lock_database(file,F_WRLCK))
+    {
+      if (lock_type == F_RDLCK && my_errno == EDEADLK)
+      {
+	printf("%2d: update: deadlock\n",id); fflush(stdout);
+	return 0;
+      }
+      fprintf(stderr,"%2d: Can't lock table (%d)\n",id,my_errno);
+      return 1;
+    }
+  }
+  bzero((char*) &new_record,sizeof(new_record));
+  strmov((char*) new_record.text,"Updated");
+
+  found=next=prev=update=0;
+  for (i=0 ; i < 100 ; i++)
+  {
+    tmp=rnd(100000);
+    int4store(find,tmp);
+    if (!maria_rkey(file,record.id,1,(uchar*) find, HA_WHOLE_KEY,
+                    HA_READ_KEY_EXACT))
+      found++;
+    else
+    {
+      if (my_errno != HA_ERR_KEY_NOT_FOUND)
+      {
+	fprintf(stderr,"%2d: Got error %d from read in update\n",id,my_errno);
+	return 1;
+      }
+      else if (!maria_rnext(file,record.id,1))
+	next++;
+      else
+      {
+	if (my_errno != HA_ERR_END_OF_FILE)
+	{
+	  fprintf(stderr,"%2d: Got error %d from rnext in update\n",
+		  id,my_errno);
+	  return 1;
+	}
+	else if (!maria_rprev(file,record.id,1))
+	  prev++;
+	else
+	{
+	  if (my_errno != HA_ERR_END_OF_FILE)
+	  {
+	    fprintf(stderr,"%2d: Got error %d from rprev in update\n",
+		    id,my_errno);
+	    return 1;
+	  }
+	  /* No row could be read at all: nothing to update this round */
+	  continue;
+	}
+      }
+    }
+    memcpy_fixed(new_record.id,record.id,sizeof(record.id));
+    tmp=rnd(20000)+40000;
+    int4store(new_record.nr,tmp);
+    if (!maria_update(file,record.id,new_record.id))
+      update++;
+    else
+    {
+      /* These three errors are tolerated; anything else aborts the test */
+      if (my_errno != HA_ERR_RECORD_CHANGED &&
+	  my_errno != HA_ERR_RECORD_DELETED &&
+	  my_errno != HA_ERR_FOUND_DUPP_KEY)
+      {
+	fprintf(stderr,"%2d: Got error %d from update\n",id,my_errno);
+	return 1;
+      }
+    }
+  }
+  if (lock)
+  {
+    if (maria_lock_database(file,F_UNLCK))
+    {
+      fprintf(stderr,"%2d: Can't unlock table, error %d\n",id,my_errno);
+      return 1;
+    }
+  }
+  printf("%2d: update: %5d\n",id,update); fflush(stdout);
+  return 0;
+}
+
+#else /* __NETWARE__ || __WIN__ */
+
+#include <stdio.h>
+
+int main()
+{
+	/* Stub used when the fork() based test is compiled out */
+	fputs("this test has not been ported to Netware or Windows\n", stderr);
+	return 0;
+}
+
+#endif /* __NETWARE__|| __WIN__ */
diff --git a/storage/maria/ma_test_all.res b/storage/maria/ma_test_all.res
new file mode 100644
index 00000000000..586aaf68020
--- /dev/null
+++ b/storage/maria/ma_test_all.res
@@ -0,0 +1,14 @@
+Running tests with dynamic row format
+Running tests with static row format
+Running tests with block row format
+Running tests with block row format and transactions
+ma_test2 -s -L -K -R1 -m2000 ;  Should give error 135
+Error: 135 in write at record: 1099
+got error: 135 when using MARIA-database
+./maria_chk -sm test2 will warn that 'Datafile is almost full'
+maria_chk: MARIA file test2
+maria_chk: warning: Datafile is almost full,      65516 of      65534 used
+MARIA-table 'test2' is usable but should be fixed
+MARIA RECOVERY TESTS
+ALL RECOVERY TESTS OK
+!!!!!!!! BUT REMEMBER to FIX this BLOB issue !!!!!!!
diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh
new file mode 100755
index 00000000000..041fbf3abe6
--- /dev/null
+++ b/storage/maria/ma_test_all.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+#
+# This file is now deprecated and has been replaced by
+# unittest/ma_test_all-t
+#
+# It is kept as a thin wrapper: with arguments it runs
+# unittest/ma_test_all-t directly, without arguments it runs it
+# through unit.pl.
+
+if test -n "$1"; then
+
+  # unit.pl can't pass options to ma_test_all-t, so if anything
+  # was passed as an argument, assume the purpose was to pass
+  # them to ma_test_all-t and call it directly.
+  # "$@" is quoted so that each argument is passed on unchanged.
+  unittest/ma_test_all-t "$@"
+else
+  perl ../../unittest/unit.pl run unittest/ma_test_all-t
+fi
diff --git a/storage/maria/ma_test_big.sh b/storage/maria/ma_test_big.sh
new file mode 100644
index 00000000000..6419d05e3a4
--- /dev/null
+++ b/storage/maria/ma_test_big.sh
@@ -0,0 +1,22 @@
+#!/bin/sh
+#
+# This test is good for finding bugs in the redo/undo handling and in
+# blob handling.  Output is discarded with the POSIX "> file 2>&1" form.
+#
+
+set -e
+a=15
+while test $a -le 5000
+do
+  echo $a
+  rm -f maria_log*
+  ma_test2 -s -L -K -W -P -M -T -c -b32768 -t4 -A1 -m$a > /dev/null
+  maria_read_log -a -s > /dev/null 2>&1
+  maria_chk -es test2
+  maria_read_log -a -s > /dev/null 2>&1
+  maria_chk -es test2
+  rm test2.MA?
+  maria_read_log -a -s > /dev/null 2>&1
+  maria_chk -es test2
+  a=$((a+1))
+done
diff --git a/storage/maria/ma_test_force_start.pl b/storage/maria/ma_test_force_start.pl
new file mode 100755
index 00000000000..8148b2f212b
--- /dev/null
+++ b/storage/maria/ma_test_force_start.pl
@@ -0,0 +1,238 @@
+#!/usr/bin/env perl
+
+
+use strict;
+use warnings;
+
+my $usage= <<EOF;
+This program tests that the options
+--aria-force-start-after-recovery-failures --aria-recover work as
+expected.
+It has to be run from directory mysql-test, and works with non-debug
+and debug binaries.
+Pass it option -d or -i (to test corruption of data or index file).
+EOF
+
+# -d currently exhibits BUG#36578
+# "Maria: maria-recover may fail to autorepair a table"
+
+# An option is mandatory: print the usage text and stop if none was given
+die($usage) if (@ARGV == 0);
+
+# 1: corrupt the index file (-i), 0: corrupt the data file (-d)
+my $corrupt_index;
+
+if ($ARGV[0] eq '-d')
+  {
+    $corrupt_index= 0;
+  }
+elsif ($ARGV[0] eq '-i')
+  {
+    $corrupt_index= 1;
+  }
+else
+  {
+    die($usage);
+  }
+
+# Value passed to --aria-force-start-after-recovery-failures; the server is
+# expected to fail this many restarts before it starts again
+my $force_after= 3;
+# Extension of the table file that gets corrupted
+my $corrupt_file= $corrupt_index ? "MAI" : "MAD";
+# Regular expression searched for in the server error log
+my $corrupt_message= 
+  "\\[ERROR\\] mysqld(.exe)*: Table '..test.t1' is marked as crashed and should be repaired";
+
+my $sql_name= "./var/tmp/create_table.sql";
+my $error_log_name= "./var/log/master.err";
+my @cmd_output;
+my $whatever; # garbage data
+$ENV{MTR_VERSION} = 1; # MTR2 does not have --start-and-exit
+my $base_server_cmd= "perl mysql-test-run.pl --mysqld=--aria-force-start-after-recovery-failures=$force_after --suite=maria maria.maria-recover ";
+if ($^O =~ /^mswin/i)
+  {
+    print <<EOF;
+WARNING: with Activestate Perl, mysql-test-run.pl --start-and-exit has a bug:
+it does not exit; cygwin perl recommended
+EOF
+  }
+# True for any OS name containing "win" except those containing "darwin"
+my $iswindows= ( $^O =~ /win/i  && $^O !~ /darwin/i );
+$base_server_cmd.= ($iswindows ? "--mysqld=--console" : "--mem");
+my $server_cmd;
+my $server_pid_name="./var/run/master.pid";
+my $server_pid;
+my $i; # count of server restarts
+# Forward declaration; the sub is defined at the end of the file
+sub kill_server;
+
+# Locate the 'mysql' client: use the first candidate directory that holds
+# the executable and give up if none does.
+my $suffix= ($iswindows ? ".exe" : "");
+my $client_exe_path;
+foreach my $candidate ("../client/release", "../client/relwithdebinfo",
+                       "../client/debug", "../client")
+  {
+    # we use -f, sometimes -x is unexpectedly false in Cygwin
+    if ( -f "$candidate/mysql$suffix" )
+      {
+        $client_exe_path= $candidate;
+        last;
+      }
+  }
+die("Cannot find 'mysql' executable\n") unless defined($client_exe_path);
+
+# Start the server once and take its port from the mysql-test-run.pl output
+print "starting mysqld\n";
+$server_cmd= $base_server_cmd . " --start-and-exit 2>&1";
+@cmd_output=`$server_cmd`;
+die if $?;
+my $master_port= (grep (/Using MASTER_MYPORT .*= (\d+)$/, @cmd_output))[0];
+$master_port =~ s/.*= //;
+chomp $master_port;
+die unless $master_port > 0;
+
+# The client always executes whatever is currently stored in $sql_name
+my $client_cmd= "$client_exe_path/mysql -u root -h 127.0.0.1 -P $master_port test < $sql_name";
+
+open(FILE, ">", $sql_name) or die;
+
+# To exhibit BUG#36578 with -d, we don't create an index if -d. This is
+# because the presence of an index will cause repair-by-sort to be used,
+# where sort_get_next_record() is only called inside
+#_ma_create_index_by_sort(), so the latter function fails and in this
+# case retry_repair is set, so bug does not happen. Whereas without
+# an index, repair-with-key-cache is called, which calls
+# sort_get_next_record() whose failure itself does not cause a retry.
+
+print FILE "create table t1 (a varchar(1000)".
+  ($corrupt_index ? ", index(a)" : "") .") engine=aria;\n";
+print FILE <<EOF;
+insert into t1 values("ThursdayMorningsMarket");
+# If Recovery executes REDO_INDEX_NEW_PAGE it will overwrite our
+# intentional corruption; we make Recovery skip this record by bumping
+# create_rename_lsn using OPTIMIZE TABLE. This also makes sure to put
+# the pages on disk, so that we can corrupt them.
+optimize table t1;
+# mark table open, so that --aria-recover repairs it
+insert into t1 select concat(a,'b') from t1 limit 1;
+EOF
+close FILE;
+
+print "creating table\n";
+`$client_cmd`;
+die if $?;
+
+print "killing mysqld hard\n";
+kill_server(9);
+
+# Overwrite 100 bytes of the table's index or data file with 0xAB
+print "ruining " .
+  ($corrupt_index ? "first page of keys" : "bitmap page") .
+  " in table to test aria-recover\n";
+open(FILE, "+<", "./var/master-data/test/t1.$corrupt_file") or die;
+$whatever= ("\xAB" x 100);
+sysseek (FILE, $corrupt_index ? 8192 : (8192-100-100), 0) or die;
+syswrite (FILE, $whatever) or die;
+close FILE;
+
+# Overwrite 8192 bytes of the first log file, starting at offset 99
+print "ruining log to make recovery fail; mysqld should fail the $force_after first restarts\n";
+open(FILE, "+<", "./var/tmp/aria_log.00000001") or die;
+$whatever= ("\xAB" x 8192);
+sysseek (FILE, 99, 0) or die;
+syswrite (FILE, $whatever) or die;
+close FILE;
+
+# Each of the first $force_after restarts must fail with exit code 1 and
+# leave both expected error lines in the error log
+$server_cmd= $base_server_cmd . " --start-dirty 2>&1";
+for($i= 1; $i <= $force_after; $i= $i + 1)
+  {
+    print "mysqld restart number $i... ";
+    unlink($error_log_name) or die;
+    `$server_cmd`;
+    # mysqld should return 1 when can't read log
+    die unless (($? >> 8) == 1);
+    open(FILE, "<", $error_log_name) or die;
+    @cmd_output= <FILE>;
+    close FILE;
+    die unless grep(/\[ERROR\] mysqld(.exe)*: Aria engine: log initialization failed/, @cmd_output);
+    die unless grep(/\[ERROR\] Plugin 'Aria' init function returned error./, @cmd_output);
+    print "failed - ok\n";
+  }
+
+# The next restart ($i is now $force_after + 1) must succeed and the error
+# log must show that the logs were removed
+print "mysqld restart number $i... ";
+unlink($error_log_name) or die;
+@cmd_output=`$server_cmd`;
+die if $?;
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die unless grep(/\[Warning\] mysqld(.exe)*: Aria engine: removed all logs after [\d]+ consecutive failures of recovery from logs/, @cmd_output);
+die unless grep(/\[ERROR\] mysqld(.exe)*: File '.*tmp.aria_log.00000001' not found \(Errcode: 2\)/, @cmd_output);
+print "success - ok\n";
+
+# Replace the SQL script; $client_cmd will now run these statements
+open(FILE, ">", $sql_name) or die;
+print FILE <<EOF;
+set global aria_recover=normal;
+insert into t1 values('aaa');
+EOF
+close FILE;
+
+# verify corruption has not yet been noticed
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die if grep(/$corrupt_message/, @cmd_output);
+
+print "inserting in table\n";
+`$client_cmd`;
+die if $?;
+print "table is usable - ok\n";
+
+# After the insert the log must show both the corruption and the repair
+open(FILE, "<", $error_log_name) or die;
+@cmd_output= <FILE>;
+close FILE;
+die unless grep(/$corrupt_message/, @cmd_output);
+die unless grep(/\[Warning\] Recovering table: '..test.t1'/, @cmd_output);
+print "was corrupted and automatically repaired - ok\n";
+
+# remove our traces
+kill_server(15);
+
+print "TEST ALL OK\n";
+
+# kills mysqld with signal given in parameter
+#
+# The process id is read from the first line of $server_pid_name and stored
+# in the file-level $server_pid.  The sub dies if the pid file cannot be
+# read, if the pid is not positive, or if the kill itself fails.  It then
+# polls every 0.1 second until the process is gone.
+# On Windows the signal number is ignored: taskkill /F is always used.
+sub kill_server
+  {
+    my ($sig)= @_;
+    my $wait_count= 0;
+    my $kill_cmd;
+    my @kill_output;
+    open(FILE, "<", $server_pid_name) or die;
+    @cmd_output= <FILE>;
+    close FILE;
+    $server_pid= $cmd_output[0];
+    chomp $server_pid;
+    die unless $server_pid > 0;
+    if ($iswindows)
+      {
+        # On Windows, server_pid_name is not the "main" process id
+        # so perl's kill() does not see this process id.
+        # But taskkill works, though only with /F ("-9"-style kill).
+        $kill_cmd= "taskkill /F /PID $server_pid 2>&1";
+        @kill_output= `$kill_cmd`;
+        die unless grep(/has been terminated/, @kill_output);
+      }
+    else
+      {
+        kill($sig, $server_pid) or die;
+      }
+    while (1) # wait until mysqld process gone
+      {
+        if ($iswindows)
+          {
+            # Re-run taskkill until it reports that the pid was not found
+            @kill_output= `$kill_cmd`;
+            last if grep(/not found/, @kill_output);
+          }
+        else
+          {
+            # Signal 0 only tests whether the process can still be signalled
+            kill (0, $server_pid) or last;
+          }
+        # Printed on every further iteration once 30 polls have passed
+        print "waiting for mysqld to die\n" if ($wait_count > 30);
+        $wait_count= $wait_count + 1;
+        select(undef, undef, undef, 0.1);
+      }
+  }
diff --git a/storage/maria/ma_test_recovery b/storage/maria/ma_test_recovery
new file mode 100755
index 00000000000..0b20264c434
--- /dev/null
+++ b/storage/maria/ma_test_recovery
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+# Remove comment from next line if this script fails and you need more
+# information of what's going on
+
+# This file is deprecated and has been replaced with ma_test_recovery.pl
+
+unittest/ma_test_recovery.pl $@
diff --git a/storage/maria/ma_unique.c b/storage/maria/ma_unique.c
new file mode 100644
index 00000000000..a90578c2162
--- /dev/null
+++ b/storage/maria/ma_unique.c
@@ -0,0 +1,244 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Functions to check if a row is unique */
+
+#include "maria_def.h"
+#include <m_ctype.h>
+
+/**
+  Check if there exists another row with the same unique value
+
+  @param info         Maria handler
+  @param def          Definition of the unique constraint to check
+  @param record       Row to check. The hash is stored into the row at the
+                      start of the first segment of the unique's key.
+  @param unique_hash  Hash of the unique columns of 'record'
+  @param disk_pos     Row position that is never reported as a duplicate
+
+  @notes
+  This function is not versioning safe. For the moment this is not a problem
+  as it's only used for internal temporary tables in MySQL for which there
+  isn't any versioning information.
+
+  info->cur_row.lastpos is restored before returning and info->page_changed
+  is set to 1 on every path.
+
+  @return
+  @retval 0  No matching row
+  @retval 1  Duplicate found. my_errno is HA_ERR_FOUND_DUPP_UNIQUE and
+             info->errkey and info->dup_key_pos identify the duplicate.
+*/
+
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def, uchar *record,
+                         ha_checksum unique_hash, my_off_t disk_pos)
+{
+  MARIA_KEY hash_key;
+  my_bool duplicate= 0;
+  uchar *hash_buff= info->lastkey_buff2;
+  MARIA_KEYDEF *hash_index= &info->s->keyinfo[def->key];
+  my_off_t saved_lastpos= info->cur_row.lastpos;
+  DBUG_ENTER("_ma_check_unique");
+  DBUG_PRINT("enter",("unique_hash: %lu", (ulong) unique_hash));
+
+  maria_unique_store(record + hash_index->seg->start, unique_hash);
+  /* Can't be spatial so it's ok to call _ma_make_key directly here */
+  _ma_make_key(info, &hash_key, def->key, hash_buff, record, 0, 0);
+
+  /* info->lastkey_buff2 was just overwritten. Inform maria_rnext_same(). */
+  info->update&= ~HA_STATE_RNEXT_SAME;
+
+  DBUG_ASSERT(hash_key.data_length == MARIA_UNIQUE_HASH_LENGTH);
+  if (!_ma_search(info, &hash_key, SEARCH_FIND,
+                  info->s->state.key_root[def->key]))
+  {
+    /* Walk over all index entries that carry the same hash */
+    do
+    {
+      if (info->cur_row.lastpos != disk_pos &&
+          !(*info->s->compare_unique)(info, def, record,
+                                      info->cur_row.lastpos))
+      {
+        my_errno= HA_ERR_FOUND_DUPP_UNIQUE;
+        info->errkey= (int) def->key;
+        info->dup_key_pos= info->cur_row.lastpos;
+        DBUG_PRINT("info",("Found duplicate"));
+        duplicate= 1;                           /* Found identical */
+        break;
+      }
+      DBUG_ASSERT(info->last_key.data_length == MARIA_UNIQUE_HASH_LENGTH);
+    } while (!_ma_search_next(info, &info->last_key, SEARCH_BIGGER,
+                              info->s->state.key_root[def->key]) &&
+             !bcmp(info->last_key.data, hash_buff, MARIA_UNIQUE_HASH_LENGTH));
+  }
+
+  info->page_changed= 1;                        /* Can't optimize read next */
+  info->cur_row.lastpos= saved_lastpos;
+  DBUG_RETURN(duplicate);
+}
+
+
+/*
+  Calculate a hash over the unique columns of a row
+
+  SYNOPSIS
+    _ma_unique_hash()
+    def       Unique definition; segments def->seg up to def->end are hashed
+    record    Row to hash
+
+  NOTES
+    A NULL column contributes a fixed value instead of its content.
+    Text segments are hashed through the collation's hash_sort(); all
+    other segments are folded in byte by byte.
+
+  TODO
+    Add support for bit fields
+
+  RETURN
+    The hash value
+*/
+
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *record)
+{
+  HA_KEYSEG *seg;
+  ulong nr1= 0, nr2= 4;
+  ha_checksum hash= 0;
+  /* Shift that moves the topmost byte of the hash down to the lowest byte */
+  const uint top_byte_shift= 8*sizeof(ha_checksum)-8;
+
+  for (seg= def->seg ; seg < def->end ; seg++)
+  {
+    const uchar *data, *data_end;
+    uint length= seg->length;
+
+    if (seg->null_bit && (record[seg->null_pos] & seg->null_bit))
+    {
+      /*
+        Change hash in a way different from an empty string or 0.
+        (This is an optimisation;  The code will work even if this isn't
+        done)
+      */
+      hash= ((hash << 8) + 511 + (hash >> top_byte_shift));
+      continue;
+    }
+
+    data= record + seg->start;
+    if (seg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint pack_length= seg->bit_start;
+      uint stored_length;
+      if (pack_length == 1)
+        stored_length= (uint) *data;
+      else
+        stored_length= uint2korr(data);
+      data+= pack_length;                       /* Skip VARCHAR length */
+      if (length > stored_length)
+        length= stored_length;
+    }
+    else if (seg->flag & HA_BLOB_PART)
+    {
+      uint blob_length= _ma_calc_blob_length(seg->bit_start, data);
+      /* The blob data pointer is stored after the length bytes */
+      memcpy_fixed((uchar*) &data, data + seg->bit_start, sizeof(char*));
+      if (!length || length > blob_length)
+        length= blob_length;                    /* The whole blob */
+    }
+
+    switch ((enum ha_base_keytype) seg->type) {
+    case HA_KEYTYPE_TEXT:
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      seg->charset->coll->hash_sort(seg->charset,
+                                    (const uchar*) data, length, &nr1,
+                                    &nr2);
+      hash^= nr1;
+      break;
+    default:
+      for (data_end= data + length ; data != data_end ; data++)
+        hash= ((hash << 8) + ((uchar) *data)) + (hash >> top_byte_shift);
+      break;
+    }
+  }
+  return hash;
+}
+
+
+/*
+  Compare the unique columns of two rows
+
+  SYNOPSIS
+    _ma_unique_comp()
+    def             Unique definition; segments def->seg up to def->end
+    a               First row
+    b               Second row
+    null_are_equal  If set, a column that is NULL in both rows counts as
+                    equal; otherwise such a column makes the rows differ
+
+  TODO
+    Add support for bit fields
+
+  RETURN
+    0   if both rows have equal unique value
+    1   Rows are different
+*/
+
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+                        my_bool null_are_equal)
+{
+  HA_KEYSEG *seg;
+
+  for (seg= def->seg ; seg < def->end ; seg++)
+  {
+    const uchar *data_a, *data_b;
+    uint length_a= seg->length;
+    uint length_b= seg->length;
+
+    if (seg->null_bit)
+    {
+      uint a_is_null= (a[seg->null_pos] & seg->null_bit);
+      uint b_is_null= (uint) (b[seg->null_pos] & seg->null_bit);
+      if (a_is_null != b_is_null)
+        return 1;                               /* Only one side is NULL */
+      if (a_is_null)
+      {
+        if (!null_are_equal)
+          return 1;
+        continue;                               /* Both NULL; next segment */
+      }
+    }
+
+    data_a= a + seg->start;
+    data_b= b + seg->start;
+    if (seg->flag & HA_VAR_LENGTH_PART)
+    {
+      uint pack_length= seg->bit_start;
+      if (pack_length != 1)
+      {
+        length_a= uint2korr(data_a);
+        length_b= uint2korr(data_b);
+        data_a+= 2;                             /* Skip VARCHAR length */
+        data_b+= 2;
+      }
+      else
+      {
+        length_a= (uint) *data_a++;
+        length_b= (uint) *data_b++;
+      }
+      /* Safety: never look past the defined segment length */
+      if (length_a > seg->length)
+        length_a= seg->length;
+      if (length_b > seg->length)
+        length_b= seg->length;
+    }
+    else if (seg->flag & HA_BLOB_PART)
+    {
+      length_a= _ma_calc_blob_length(seg->bit_start, data_a);
+      length_b= _ma_calc_blob_length(seg->bit_start, data_b);
+      if (seg->length)
+      {
+        /*
+          A non zero segment length means that we are only interested in
+          comparing the first seg->length bytes of the blob.
+        */
+        if (length_a > seg->length)
+          length_a= seg->length;
+        if (length_b > seg->length)
+          length_b= seg->length;
+      }
+      /* The blob data pointers are stored after the length bytes */
+      memcpy_fixed((uchar*) &data_a, data_a + seg->bit_start, sizeof(char*));
+      memcpy_fixed((uchar*) &data_b, data_b + seg->bit_start, sizeof(char*));
+    }
+
+    switch ((enum ha_base_keytype) seg->type) {
+    case HA_KEYTYPE_TEXT:
+    case HA_KEYTYPE_VARTEXT1:
+    case HA_KEYTYPE_VARTEXT2:
+      if (ha_compare_text(seg->charset, data_a, length_a,
+                          data_b, length_b, 0, 1))
+        return 1;
+      break;
+    default:
+    {
+      uint idx;
+      if (length_a != length_b)
+        return 1;
+      for (idx= 0 ; idx < length_a ; idx++)
+      {
+        if (data_a[idx] != data_b[idx])
+          return 1;
+      }
+      break;
+    }
+    }
+  }
+  return 0;
+}
diff --git a/storage/maria/ma_update.c b/storage/maria/ma_update.c
new file mode 100644
index 00000000000..7b9e006ec43
--- /dev/null
+++ b/storage/maria/ma_update.c
@@ -0,0 +1,253 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+
+/**
+   Update an old row in a MARIA table
+
+   @param info    Maria handler; the row at info->cur_row.lastpos is updated
+   @param oldrec  Current content of the row. It is verified with
+                  share->compare_record() before anything is changed.
+   @param newrec  New content of the row. _ma_check_unique() stores the
+                  unique hash into it, so it may be modified.
+
+   @note
+     If changing a key fails with HA_ERR_FOUND_DUPP_KEY, HA_ERR_OUT_OF_MEM or
+     HA_ERR_RECORD_FILE_FULL, the keys changed so far are put back to their
+     old values. Any other failure after the first key change marks the
+     table as crashed.
+
+   @return
+   @retval 0      ok
+   @retval != 0   error; my_errno holds the same value
+*/
+
+int maria_update(register MARIA_HA *info, const uchar *oldrec, uchar *newrec)
+{
+  int flag,key_changed,save_errno;
+  reg3 my_off_t pos;
+  uint i;
+  uchar old_key_buff[MARIA_MAX_KEY_BUFF],*new_key_buff;
+  my_bool auto_key_changed= 0;
+  ulonglong changed;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_update");
+  LINT_INIT(new_key_buff);
+  LINT_INIT(changed);
+
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(info->s, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  /* Cheap sanity checks that need no lock and leave nothing to undo */
+  if (!(info->update & HA_STATE_AKTIV))
+  {
+    DBUG_RETURN(my_errno=HA_ERR_KEY_NOT_FOUND);
+  }
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (share->state.state.key_file_length >= share->base.margin_key_file_length)
+  {
+    DBUG_RETURN(my_errno=HA_ERR_INDEX_FILE_FULL);
+  }
+  pos= info->cur_row.lastpos;
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+
+  if ((*share->compare_record)(info,oldrec))
+  {
+    save_errno= my_errno;
+    DBUG_PRINT("warning", ("Got error from compare record"));
+    goto err_end;                       /* Record has changed */
+  }
+
+  /*
+    Calculate and check all unique constraints. The index lookup is only
+    needed for constraints whose columns differ between old and new row.
+  */
+  key_changed=0;
+  for (i=0 ; i < share->state.header.uniques ; i++)
+  {
+    MARIA_UNIQUEDEF *def=share->uniqueinfo+i;
+    if (_ma_unique_comp(def, newrec, oldrec,1) &&
+        _ma_check_unique(info, def, newrec, _ma_unique_hash(def, newrec),
+                         pos))
+    {
+      save_errno=my_errno;
+      goto err_end;
+    }
+  }
+  if (_ma_mark_file_changed(info))
+  {
+    save_errno=my_errno;
+    goto err_end;
+  }
+
+  /* Ensure we don't try to restore auto_increment if it doesn't change */
+  info->last_auto_increment= ~(ulonglong) 0;
+
+  /* Check which keys changed from the original row */
+
+  new_key_buff= info->lastkey_buff2;
+  changed=0;
+  for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+  {
+    if (maria_is_key_active(share->state.key_map, i))
+    {
+      if (keyinfo->flag & HA_FULLTEXT )
+      {
+        if (_ma_ft_cmp(info,i,oldrec, newrec))
+        {
+          if ((int) i == info->lastinx)
+          {
+          /*
+            We are changing the index we are reading on.  Mark that
+            the index data has changed and we need to do a full search
+            when doing read-next
+          */
+            key_changed|=HA_STATE_WRITTEN;
+          }
+          changed|=((ulonglong) 1 << i);
+          if (_ma_ft_update(info,i,old_key_buff,oldrec,newrec,pos))
+            goto err;
+        }
+      }
+      else
+      {
+        MARIA_KEY new_key, old_key;
+
+        (*keyinfo->make_key)(info,&new_key, i, new_key_buff, newrec,
+                             pos, info->trn->trid);
+        (*keyinfo->make_key)(info,&old_key, i, old_key_buff,
+                             oldrec, pos, info->cur_row.trid);
+
+        /*
+          The above changed info->lastkey_buff2 (used as new_key_buff).
+          Inform maria_rnext_same().
+        */
+        info->update&= ~HA_STATE_RNEXT_SAME;
+
+        if (new_key.data_length != old_key.data_length ||
+            memcmp(old_key.data, new_key.data, new_key.data_length))
+        {
+          if ((int) i == info->lastinx)
+            key_changed|=HA_STATE_WRITTEN;      /* Mark that keyfile changed */
+          /* Bit i is set before the change so that err: can undo it */
+          changed|=((ulonglong) 1 << i);
+          keyinfo->version++;
+          if (keyinfo->ck_delete(info,&old_key))
+            goto err;
+          if (keyinfo->ck_insert(info,&new_key))
+            goto err;
+          if (share->base.auto_key == i+1)
+            auto_key_changed=1;
+        }
+      }
+    }
+  }
+
+  if (share->calc_checksum)
+  {
+    /*
+      We can't use the row based checksum as this doesn't have enough
+      precision (one byte, while the table's is more bytes).
+      At least _ma_check_unique() modifies the 'newrec' record, so checksum
+      has to be computed _after_ it. Nobody apparently modifies 'oldrec'.
+      We need to pass the old row's checksum down to (*update_record)(), we do
+      this via info->new_row.checksum (not intuitive but existing code
+      mandated that cur_row is the new row).
+      If (*update_record)() fails, table will be marked corrupted so no need
+      to revert the live checksum change.
+    */
+    info->cur_row.checksum= (*share->calc_checksum)(info, newrec);
+    info->new_row.checksum= (*share->calc_checksum)(info, oldrec);
+    info->state->checksum+= info->cur_row.checksum - info->new_row.checksum;
+  }
+
+  if ((*share->update_record)(info, pos, oldrec, newrec))
+    goto err;
+
+  /*
+    Logical AND of two booleans: only non transactional tables track the
+    auto_increment value here.
+  */
+  if (auto_key_changed && !share->now_transactional)
+  {
+    const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+    const uchar *key= newrec + keyseg->start;
+    set_if_bigger(share->state.auto_increment,
+                  ma_retrieve_auto_increment(key, keyseg->type));
+  }
+
+  /*
+    We can't yet have HA_STATE_AKTIV here, as block_record doesn't support it
+  */
+  info->update= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED | key_changed);
+  share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED;
+  info->state->changed= 1;
+
+  /*
+    Every Maria function that updates Maria table must end with
+    call to _ma_writeinfo(). If operation (second param of
+    _ma_writeinfo()) is not 0 it sets share->changed to 1, that is
+    flags that data has changed. If operation is 0, this function
+    equals to no-op in this case.
+
+    ma_update() must always pass !0 value as operation, since even if
+    there is no index change there could be data change.
+  */
+  VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+  allow_break();                                /* Allow SIGHUP & SIGINT */
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (update)",
+                        share->open_file_name.str));
+    (*info->invalidator)(share->open_file_name.str);
+    info->invalidator=0;
+  }
+  DBUG_RETURN(0);
+
+err:
+  /* A key change or the row update failed; i is the key we stopped at */
+  DBUG_PRINT("error",("key: %d  errno: %d",i,my_errno));
+  save_errno= my_errno;
+  DBUG_ASSERT(save_errno);
+  if (!save_errno)
+    save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
+
+  if (my_errno == HA_ERR_FOUND_DUPP_KEY || my_errno == HA_ERR_OUT_OF_MEM ||
+      my_errno == HA_ERR_RECORD_FILE_FULL)
+  {
+    /*
+      Recoverable error: walk back from key i to key 0 and restore the old
+      value of every key whose bit is set in 'changed'. 'flag' is 0 only for
+      the first key handled, for which the new key is not deleted.
+    */
+    info->errkey= (int) i;
+    flag=0;
+    do
+    {
+      if (((ulonglong) 1 << i) & changed)
+      {
+        if (share->keyinfo[i].flag & HA_FULLTEXT)
+        {
+          if ((flag++ && _ma_ft_del(info,i,new_key_buff,newrec,pos)) ||
+              _ma_ft_add(info,i,old_key_buff,oldrec,pos))
+            break;
+        }
+        else
+        {
+          MARIA_KEY new_key, old_key;
+          (*share->keyinfo[i].make_key)(info, &new_key, i, new_key_buff,
+                                        newrec, pos,
+                                        info->trn->trid);
+          (*share->keyinfo[i].make_key)(info, &old_key, i, old_key_buff,
+                                        oldrec, pos, info->cur_row.trid);
+          if ((flag++ && _ma_ck_delete(info, &new_key)) ||
+              _ma_ck_write(info, &old_key))
+            break;
+        }
+      }
+    } while (i-- != 0);
+  }
+  else
+  {
+    maria_print_error(share, HA_ERR_CRASHED);
+    maria_mark_crashed(info);
+  }
+  info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_ROW_CHANGED |
+                 key_changed);
+
+ err_end:
+  /* Common exit for all errors detected after _ma_readinfo() succeeded */
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();                                /* Allow SIGHUP & SIGINT */
+  if (save_errno == HA_ERR_KEY_NOT_FOUND)
+  {
+    /* Reported to the caller as a crashed table */
+    maria_print_error(share, HA_ERR_CRASHED);
+    save_errno=HA_ERR_CRASHED;
+  }
+  DBUG_RETURN(my_errno=save_errno);
+} /* maria_update */
diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c
new file mode 100644
index 00000000000..02eeec754ee
--- /dev/null
+++ b/storage/maria/ma_write.c
@@ -0,0 +1,2461 @@
+/* Copyright (C) 2004-2008 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+   Copyright (C) 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Write a row to a MARIA table */
+
+#include "ma_fulltext.h"
+#include "ma_rt_index.h"
+#include "trnman.h"
+#include "ma_key_recover.h"
+#include "ma_blockrec.h"
+
+#define MAX_POINTER_LENGTH 8
+
+	/* Functions declared in this file */
+
+static int w_search(MARIA_HA *info, uint32 comp_flag,
+                    MARIA_KEY *key, my_off_t page,
+		    MARIA_PAGE *father_page, uchar *father_keypos,
+		    my_bool insert_last);
+static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+			    MARIA_KEY *key, MARIA_PAGE *curr_page,
+                            MARIA_PAGE *father_page,
+                            uchar *father_key_pos, MARIA_KEY_PARAM *s_temp);
+static uchar *_ma_find_last_pos(MARIA_KEY *int_key,
+                                MARIA_PAGE *page, uchar **after_key);
+static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key);
+static my_bool _ma_ck_write_btree(register MARIA_HA *info, MARIA_KEY *key);
+static my_bool _ma_ck_write_btree_with_log(MARIA_HA *, MARIA_KEY *, my_off_t *,
+                                           uint32);
+static my_bool _ma_log_split(MARIA_PAGE *page, uint org_length,
+                             uint new_length,
+                             const uchar *key_pos,
+                             uint key_length, int move_length,
+                             enum en_key_op prefix_or_suffix,
+                             const uchar *data, uint data_length,
+                             uint changed_length);
+static my_bool _ma_log_del_prefix(MARIA_PAGE *page,
+                                  uint org_length, uint new_length,
+                                  const uchar *key_pos, uint key_length,
+                                  int move_length);
+static my_bool _ma_log_key_middle(MARIA_PAGE *page,
+                                  uint new_length,
+                                  uint data_added_first,
+                                  uint data_changed_first,
+                                  uint data_deleted_last,
+                                  const uchar *key_pos,
+                                  uint key_length, int move_length);
+
+/*
+  @brief Default handler for returning the position of a new row
+
+  @param info    Maria handler
+  @param record  Not used
+
+  @note
+    This is only called for non transactional tables and not for block format
+    which is why we use info->state here.
+
+  @return
+    The head of the deleted-row chain (share->state.dellink) if there is one
+    and the handler is not set to append at end; otherwise the current
+    data file length.
+*/
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info,
+                                        const uchar *record
+                                        __attribute__((unused)))
+{
+  if (info->append_insert_at_end ||
+      info->s->state.dellink == HA_OFFSET_ERROR)
+    return info->state->data_file_length;
+  return info->s->state.dellink;
+}
+
+/*
+  @brief Default handler for aborting the write of a row
+
+  @param info  Not used
+
+  @return 0 always
+*/
+
+my_bool _ma_write_abort_default(MARIA_HA *info __attribute__((unused)))
+{
+  const my_bool abort_failed= 0;
+  return abort_failed;
+}
+
+
+/*
+  Write new record to a table
+
+  SYNOPSIS
+    maria_write()
+    info      Maria handler
+    record    Row to write. It is passed to _ma_check_unique(), which
+              stores the unique hash into the row it is given.
+
+  NOTES
+    Order of work: check unique constraints, reserve/write the row through
+    share->write_record_init, insert one key per active index, then call
+    share->write_record.
+
+    If a key insert or the row write fails with one of the errors listed at
+    the 'err' label, the keys inserted so far are deleted again in reverse
+    order. Any other error at that stage marks the table as crashed.
+
+  RETURN
+    0     ok
+    #     error; my_errno holds the same value
+*/
+
+int maria_write(MARIA_HA *info, uchar *record)
+{
+  MARIA_SHARE *share= info->s;
+  uint i;
+  int save_errno;
+  MARIA_RECORD_POS filepos;
+  uchar *buff;
+  my_bool lock_tree= share->lock_key_trees;
+  my_bool fatal_error;
+  MARIA_KEYDEF *keyinfo;
+  DBUG_ENTER("maria_write");
+  DBUG_PRINT("enter",("index_file: %d  data_file: %d",
+                      share->kfile.file, info->dfile.file));
+
+  DBUG_EXECUTE_IF("maria_pretend_crashed_table_on_usage",
+                  maria_print_error(info->s, HA_ERR_CRASHED);
+                  DBUG_RETURN(my_errno= HA_ERR_CRASHED););
+  if (share->options & HA_OPTION_READ_ONLY_DATA)
+  {
+    DBUG_RETURN(my_errno=EACCES);
+  }
+  if (_ma_readinfo(info,F_WRLCK,1))
+    DBUG_RETURN(my_errno);
+  dont_break();				/* Don't allow SIGHUP or SIGINT */
+
+  /*
+    From here on every failure must leave through err or err2 so that
+    _ma_writeinfo() and allow_break() are called.
+  */
+  if (share->base.reloc == (ha_rows) 1 &&
+      share->base.records == (ha_rows) 1 &&
+      share->state.state.records == (ha_rows) 1)
+  {						/* System file */
+    my_errno=HA_ERR_RECORD_FILE_FULL;
+    goto err2;
+  }
+  if (share->state.state.key_file_length >= share->base.margin_key_file_length)
+  {
+    my_errno=HA_ERR_INDEX_FILE_FULL;
+    goto err2;
+  }
+  if (_ma_mark_file_changed(info))
+    goto err2;
+
+  /* Calculate and check all unique constraints */
+  for (i=0 ; i < share->state.header.uniques ; i++)
+  {
+    /* HA_OFFSET_ERROR: there is no existing row position to exclude */
+    if (_ma_check_unique(info,share->uniqueinfo+i,record,
+                         _ma_unique_hash(share->uniqueinfo+i,record),
+                         HA_OFFSET_ERROR))
+      goto err2;
+  }
+
+  /* Ensure we don't try to restore auto_increment if it doesn't change */
+  info->last_auto_increment= ~(ulonglong) 0;
+
+  if ((info->opt_flag & OPT_NO_ROWS))
+    filepos= HA_OFFSET_ERROR;
+  else
+  {
+    /*
+      This may either calculate a record id, or write the record and return
+      the record id
+    */
+    if ((filepos= (*share->write_record_init)(info, record)) ==
+        HA_OFFSET_ERROR)
+      goto err2;
+  }
+
+  /* Write all keys to indextree */
+  buff= info->lastkey_buff2;
+  for (i=0, keyinfo= share->keyinfo ; i < share->base.keys ; i++, keyinfo++)
+  {
+    MARIA_KEY int_key;
+    if (maria_is_key_active(share->state.key_map, i))
+    {
+      /*
+        The key tree is only locked if the share asks for it and the key
+        does not go through an initialized bulk insert tree.
+      */
+      my_bool local_lock_tree= (lock_tree &&
+                                !(info->bulk_insert &&
+                                  is_tree_inited(&info->bulk_insert[i])));
+      if (local_lock_tree)
+      {
+	rw_wrlock(&keyinfo->root_lock);
+	keyinfo->version++;
+      }
+      if (keyinfo->flag & HA_FULLTEXT )
+      {
+        if (_ma_ft_add(info,i, buff,record,filepos))
+        {
+	  if (local_lock_tree)
+	    rw_unlock(&keyinfo->root_lock);
+          DBUG_PRINT("error",("Got error: %d on write",my_errno));
+          goto err;
+        }
+      }
+      else
+      {
+        /*
+          Retry loop: the insert is repeated after we have successfully
+          waited for a transaction that holds a conflicting unique key.
+          Every 'goto err' below is taken with root_lock released.
+        */
+        while (keyinfo->ck_insert(info,
+                                  (*keyinfo->make_key)(info, &int_key, i,
+                                                       buff, record, filepos,
+                                                       info->trn->trid)))
+        {
+          TRN *blocker;
+          DBUG_PRINT("error",("Got error: %d on write",my_errno));
+          /*
+            explicit check to filter out temp tables, they aren't
+            transactional and don't have a proper TRN so the code
+            below doesn't work for them.
+            Also, filter out non-thread maria use, and table modified in
+            the same transaction.
+            At last, filter out non-dup-unique errors.
+          */
+          if (!local_lock_tree)
+            goto err;
+          if (info->dup_key_trid == info->trn->trid ||
+              my_errno != HA_ERR_FOUND_DUPP_KEY)
+          {
+	    rw_unlock(&keyinfo->root_lock);
+            goto err;
+          }
+          /* Different TrIDs: table must be transactional */
+          DBUG_ASSERT(share->base.born_transactional);
+          /*
+            If transactions are disabled, and dup_key_trid is different from
+            our TrID, it must be ALTER TABLE with dup_key_trid==0 (no
+            transaction). ALTER TABLE does have MARIA_HA::TRN not dummy but
+            puts TrID=0 in rows/keys.
+          */
+          DBUG_ASSERT(share->now_transactional ||
+                      (info->dup_key_trid == 0));
+          /*
+            The code below unlocks blocker->state_lock on every path where
+            blocker is not NULL.
+            NOTE(review): this implies trnman_trid_to_trn() returns the TRN
+            with state_lock held - confirm in trnman.
+          */
+          blocker= trnman_trid_to_trn(info->trn, info->dup_key_trid);
+          /*
+            if blocker TRN was not found, it means that the conflicting
+            transaction was committed long time ago. It could not be
+            aborted, as it would have to wait on the key tree lock
+            to remove the conflicting key it has inserted.
+          */
+          if (!blocker || blocker->commit_trid != ~(TrID)0)
+          { /* committed */
+            if (blocker)
+              pthread_mutex_unlock(& blocker->state_lock);
+            rw_unlock(&keyinfo->root_lock);
+            goto err;
+          }
+          /* Release the key tree lock before waiting for the blocker */
+          rw_unlock(&keyinfo->root_lock);
+          {
+            /* running. now we wait */
+            WT_RESOURCE_ID rc;
+            int res;
+            const char *old_proc_info; 
+
+            rc.type= &ma_rc_dup_unique;
+            /* TODO savepoint id when we'll have them */
+            rc.value= (intptr)blocker;
+            res= wt_thd_will_wait_for(info->trn->wt, blocker->wt, & rc);
+            if (res != WT_OK)
+            {
+              pthread_mutex_unlock(& blocker->state_lock);
+              my_errno= HA_ERR_LOCK_DEADLOCK;
+              goto err;
+            }
+            old_proc_info= proc_info_hook(0,
+                                          "waiting for a resource",
+                                          __func__, __FILE__, __LINE__);
+            res= wt_thd_cond_timedwait(info->trn->wt, & blocker->state_lock);
+            proc_info_hook(0, old_proc_info, __func__, __FILE__, __LINE__);
+
+            pthread_mutex_unlock(& blocker->state_lock);
+            if (res != WT_OK)
+            {
+              my_errno= res == WT_TIMEOUT ? HA_ERR_LOCK_WAIT_TIMEOUT
+                                          : HA_ERR_LOCK_DEADLOCK;
+              goto err;
+            }
+          }
+          /* Wait succeeded: take the tree lock again and retry the insert */
+          rw_wrlock(&keyinfo->root_lock);
+#ifndef MARIA_CANNOT_ROLLBACK
+          keyinfo->version++;
+#endif
+        }
+      }
+
+      /* The above changed info->lastkey_buff2. Inform maria_rnext_same(). */
+      info->update&= ~HA_STATE_RNEXT_SAME;
+
+      if (local_lock_tree)
+        rw_unlock(&keyinfo->root_lock);
+    }
+  }
+  if (share->calc_write_checksum)
+    info->cur_row.checksum= (*share->calc_write_checksum)(info,record);
+  /* filepos is HA_OFFSET_ERROR only when OPT_NO_ROWS is set (see above) */
+  if (filepos != HA_OFFSET_ERROR)
+  {
+    if ((*share->write_record)(info,record))
+      goto err;
+    info->state->checksum+= info->cur_row.checksum;
+  }
+  if (!share->now_transactional)
+  {
+    if (share->base.auto_key != 0)
+    {
+      /* Keep the stored auto_increment value at least as big as this row's */
+      const HA_KEYSEG *keyseg= share->keyinfo[share->base.auto_key-1].seg;
+      const uchar *key= record + keyseg->start;
+      set_if_bigger(share->state.auto_increment,
+                    ma_retrieve_auto_increment(key, keyseg->type));
+    }
+  }
+  info->state->records++;
+  info->update= (HA_STATE_CHANGED | HA_STATE_AKTIV | HA_STATE_WRITTEN |
+		 HA_STATE_ROW_CHANGED);
+  share->state.changed|= STATE_NOT_MOVABLE | STATE_NOT_ZEROFILLED;
+  info->state->changed= 1;
+
+  info->cur_row.lastpos= filepos;
+  VOID(_ma_writeinfo(info, WRITEINFO_UPDATE_KEYFILE));
+  if (info->invalidator != 0)
+  {
+    DBUG_PRINT("info", ("invalidator... '%s' (update)",
+                        share->open_file_name.str));
+    (*info->invalidator)(share->open_file_name.str);
+    info->invalidator=0;
+  }
+
+  /*
+    Update status of the table. We need to do so after each row write
+    for the log tables, as we want the new row to become visible to
+    other threads as soon as possible. We don't lock mutex here
+    (as it is required by pthread memory visibility rules) as (1) it's
+    not critical to use outdated share->is_log_table value (2) locking
+    mutex here for every write is too expensive.
+  */
+  if (share->is_log_table)
+    _ma_update_status((void*) info);
+
+  allow_break();				/* Allow SIGHUP & SIGINT */
+  DBUG_RETURN(0);
+
+err:
+  /*
+    A key insert or the row write failed. i is the index of the key that
+    failed, or share->base.keys if write_record failed.
+  */
+  save_errno= my_errno;
+  fatal_error= 0;
+  if (my_errno == HA_ERR_FOUND_DUPP_KEY ||
+      my_errno == HA_ERR_RECORD_FILE_FULL ||
+      my_errno == HA_ERR_LOCK_DEADLOCK ||
+      my_errno == HA_ERR_LOCK_WAIT_TIMEOUT ||
+      my_errno == HA_ERR_NULL_IN_SPATIAL ||
+      my_errno == HA_ERR_OUT_OF_MEM)
+  {
+    if (info->bulk_insert)
+    {
+      uint j;
+      for (j=0 ; j < share->base.keys ; j++)
+        maria_flush_bulk_insert(info, j);
+    }
+    info->errkey= (int) i;
+    /*
+      We delete keys in the reverse order of insertion. This is the order that
+      a rollback would do and is important for CLR_ENDs generated by
+      _ma_ft|ck_delete() and write_record_abort() to work (with any other
+      order they would cause wrong jumps in the chain).
+    */
+    while ( i-- > 0)
+    {
+      if (maria_is_key_active(share->state.key_map, i))
+      {
+	my_bool local_lock_tree= (lock_tree &&
+                                  !(info->bulk_insert &&
+                                    is_tree_inited(&info->bulk_insert[i])));
+        keyinfo= share->keyinfo + i;
+	if (local_lock_tree)
+	  rw_wrlock(&keyinfo->root_lock);
+        /**
+           @todo RECOVERY BUG
+           The key deletes below should generate CLR_ENDs
+        */
+	if (keyinfo->flag & HA_FULLTEXT)
+        {
+          if (_ma_ft_del(info,i,buff,record,filepos))
+	  {
+	    if (local_lock_tree)
+	      rw_unlock(&keyinfo->root_lock);
+            break;
+	  }
+        }
+        else
+	{
+	  MARIA_KEY key;
+	  if (_ma_ck_delete(info,
+                            (*keyinfo->make_key)(info, &key, i, buff, record,
+                                                 filepos, info->trn->trid)))
+	  {
+	    if (local_lock_tree)
+	      rw_unlock(&keyinfo->root_lock);
+	    break;
+	  }
+	}
+	if (local_lock_tree)
+	  rw_unlock(&keyinfo->root_lock);
+      }
+    }
+  }
+  else
+    fatal_error= 1;
+
+  if ((*share->write_record_abort)(info))
+    fatal_error= 1;
+  if (fatal_error)
+  {
+    maria_print_error(info->s, HA_ERR_CRASHED);
+    maria_mark_crashed(info);
+  }
+
+  info->update= (HA_STATE_CHANGED | HA_STATE_WRITTEN | HA_STATE_ROW_CHANGED);
+  /* Restore the original error; the cleanup above may have changed it */
+  my_errno=save_errno;
+err2:
+  /* Entered directly by failures that happened before any key was written */
+  save_errno=my_errno;
+  DBUG_ASSERT(save_errno);
+  if (!save_errno)
+    save_errno= HA_ERR_INTERNAL_ERROR;          /* Should never happen */
+  DBUG_PRINT("error", ("got error: %d", save_errno));
+  VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE));
+  allow_break();			/* Allow SIGHUP & SIGINT */
+  DBUG_RETURN(my_errno=save_errno);
+} /* maria_write */
+
+
+/*
+  Write one key, either to the bulk insert tree or to the btree
+
+  SYNOPSIS
+    _ma_ck_write()
+    info     Maria handler
+    key      Key to write
+
+  NOTES
+    The key goes to the bulk insert tree if bulk insert is active and the
+    tree for the key's index is initialized; otherwise it is inserted
+    into the btree.
+
+  TODO
+    Remove this function and have bulk insert change keyinfo->ck_insert
+    to point to the right function
+
+  RETURN
+    Result of the write function that was called
+*/
+
+my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key)
+{
+  my_bool res;
+  DBUG_ENTER("_ma_ck_write");
+
+  if (!info->bulk_insert ||
+      !is_tree_inited(&info->bulk_insert[key->keyinfo->key_nr]))
+    res= _ma_ck_write_btree(info, key);
+  else
+    res= _ma_ck_write_tree(info, key);
+  DBUG_RETURN(res);
+} /* _ma_ck_write */
+
+
+/**********************************************************************
+  Insert key into btree (normal case)
+
+  The key is written to the root stored in share->state.key_root[] for the
+  key's index. If info->ft1_to_ft2 is set after the write, the fulltext
+  conversion is run (only if the write succeeded) and the dynamic array is
+  always freed and reset.
+
+  Returns the error status of the write, or of the conversion if that ran.
+**********************************************************************/
+
+static my_bool _ma_ck_write_btree(MARIA_HA *info, MARIA_KEY *key)
+{
+  my_bool res;
+  MARIA_KEYDEF *index_def= key->keyinfo;
+  my_off_t *root_pos= &info->s->state.key_root[index_def->key_nr];
+  uint32 search_flag= index_def->write_comp_flag | key->flag;
+  DBUG_ENTER("_ma_ck_write_btree");
+
+  res= _ma_ck_write_btree_with_log(info, key, root_pos, search_flag);
+  if (!info->ft1_to_ft2)
+    DBUG_RETURN(res);
+
+  if (!res)
+    res= _ma_ft_convert_to_ft2(info, key);
+  delete_dynamic(info->ft1_to_ft2);
+  my_free(info->ft1_to_ft2, MYF(0));
+  info->ft1_to_ft2= 0;
+  DBUG_RETURN(res);
+} /* _ma_ck_write_btree */
+
+
+/**
+  @brief Write a key to the b-tree and, for transactional tables, log an undo
+
+  The key is written with _ma_ck_real_write_btree() against a local copy of
+  the root. For a transactional table a copy of the original key is taken
+  first (the write may change the key) and, if the write succeeded, that
+  copy is passed to _ma_write_undo_key_insert() together with the old and
+  new root. In all other cases *root is updated directly and
+  _ma_fast_unlock_key_del() is called. Pinned pages are always released
+  with _ma_unpin_all_pages_and_finalize_row().
+
+  @param info       Maria handler
+  @param key        Key to write
+  @param root       In/out: root page position of the index
+  @param comp_flag  Compare flags for the search
+
+  @retval 1   error
+  @retval 0    ok
+*/
+
+static my_bool _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEY *key,
+                                           my_off_t *root, uint32 comp_flag)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEY saved_key;
+  uchar saved_key_data[MARIA_MAX_KEY_BUFF];
+  my_off_t root_after_write= *root;
+  LSN undo_lsn= LSN_IMPOSSIBLE;
+  int failed;
+  DBUG_ENTER("_ma_ck_write_btree_with_log");
+
+  LINT_INIT_STRUCT(saved_key);
+  if (share->now_transactional)
+  {
+    /* Keep the original key and its data; the write below may change them */
+    saved_key= *key;
+    memcpy(saved_key_data, key->data, key->data_length + key->ref_length);
+  }
+
+  failed= _ma_ck_real_write_btree(info, key, &root_after_write, comp_flag);
+
+  if (failed || !share->now_transactional)
+  {
+    *root= root_after_write;
+    _ma_fast_unlock_key_del(info);
+  }
+  else
+  {
+    /* Log the original value */
+    *key= saved_key;
+    key->data= saved_key_data;
+    failed= _ma_write_undo_key_insert(info, key, root, root_after_write,
+                                      &undo_lsn);
+  }
+  _ma_unpin_all_pages_and_finalize_row(info, undo_lsn);
+
+  DBUG_RETURN(failed != 0);
+} /* _ma_ck_write_btree_with_log */
+
+
+/**
+  @brief Write a key to the b-tree
+
+  Searches the tree from *root with w_search(). A new root page is created
+  with _ma_enlarge_root() when the tree is empty (*root is HA_OFFSET_ERROR)
+  or when w_search() reports that a key has to be stored one level up.
+
+  @param info       Maria handler
+  @param key        Key to write
+  @param root       In/out: root page position of the index
+  @param comp_flag  Compare flags for the search
+
+  @retval 1   error
+  @retval 0    ok
+*/
+
+my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key, my_off_t *root,
+                            uint32 comp_flag)
+{
+  int error= 1;                         /* > 0: a new root is needed */
+  DBUG_ENTER("_ma_ck_real_write_btree");
+
+  if (*root != HA_OFFSET_ERROR)
+    error= w_search(info, comp_flag, key, *root, (MARIA_PAGE *) 0,
+                    (uchar*) 0, 1);
+  if (error > 0)
+    error= _ma_enlarge_root(info, key, root);
+
+  DBUG_RETURN(error != 0);
+} /* _ma_ck_real_write_btree */
+
+
+/**
+  @brief Make a new root with key as only pointer
+
+  The new root page is built in info->buff, a page is allocated for it with
+  _ma_new(), *root is set to the new page and the page is written with
+  _ma_write_keypage(). For transactional tables the new page is also logged
+  with _ma_log_new().
+
+  @param info  Maria handler; info->buff is overwritten
+  @param key   Key to store as the only key on the new root page
+  @param root  In: current root, HA_OFFSET_ERROR if there is none.
+               Out: position of the new root page (unchanged if _ma_new()
+               fails)
+
+  @retval 1   error
+  @retval 0    ok
+*/
+
+my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key, my_off_t *root)
+{
+  uint t_length, nod_flag;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_SHARE *share= info->s;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  my_bool res= 0;
+  DBUG_ENTER("_ma_enlarge_root");
+
+  page.info=    info;
+  page.keyinfo= keyinfo;
+  page.buff=    info->buff;
+  page.flag=    0;
+
+  /* The new page is a node page only if there already was a root */
+  nod_flag= (*root != HA_OFFSET_ERROR) ?  share->base.key_reflength : 0;
+  /* Store pointer to prev page if nod */
+  _ma_kpointer(info, page.buff + share->keypage_header, *root);
+  t_length= (*keyinfo->pack_key)(key, nod_flag, (uchar*) 0,
+                                 (uchar*) 0, (uchar*) 0, &s_temp);
+  page.size= share->keypage_header + t_length + nod_flag;
+
+  /* Clear only the page header; the pointer stored above lies after it */
+  bzero(page.buff, share->keypage_header);
+  _ma_store_keynr(share, page.buff, keyinfo->key_nr);
+  if (nod_flag)
+    page.flag|= KEYPAGE_FLAG_ISNOD;
+  if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID))
+    page.flag|= KEYPAGE_FLAG_HAS_TRANSID;
+  /* Store the key after the header and the (optional) node pointer */
+  (*keyinfo->store_key)(keyinfo, page.buff + share->keypage_header +
+                        nod_flag, &s_temp);
+
+  /* Mark that info->buff was used */
+  info->keyread_buff_used= info->page_changed= 1;
+  if ((page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) ==
+      HA_OFFSET_ERROR)
+    DBUG_RETURN(1);
+  *root= page.pos;
+
+  page_store_info(share, &page);
+
+  /*
+    Clear uninitialized part of page to avoid valgrind/purify warnings
+    and to get a clean page that is easier to compress and compare with
+    pages generated with redo
+  */
+  bzero(page.buff + page.size, share->block_size - page.size);
+
+  if (share->now_transactional && _ma_log_new(&page, 1))
+    res= 1;
+
+  /* The page is written even if logging failed; res keeps the error */
+  if (_ma_write_keypage(&page, page_link->write_lock,
+                        PAGECACHE_PRIORITY_HIGH))
+    res= 1;
+
+  DBUG_RETURN(res);
+} /* _ma_enlarge_root */
+
+
+/**
+  @brief Search after a position for a key and store it there
+
+  Fetches the page at page_pos, locates the key with keyinfo->bin_search and
+  recurses into the child page. When there is no child page, or the
+  recursion reports that a key has to be stored on this level, the key is
+  inserted on this page with _ma_insert() and the page is written.
+
+  If an identical key is found:
+  - for a fulltext index, a "normal word" is searched again with
+    SEARCH_SAME, while a "popular word" (negative subkey count) gets the
+    remainder of the key written into its second-level tree;
+  - for any other index, info->dup_key_trid and info->dup_key_pos are set,
+    my_errno is set to HA_ERR_FOUND_DUPP_KEY and -1 is returned.
+
+  TODO:
+  Change this to use pagecache directly instead of creating a copy
+  of the page. To do this, we must however change write-key-on-page
+  algorithm to not overwrite the buffer but instead store any overflow
+  key in a separate buffer.
+
+  @param info           Maria handler
+  @param comp_flag      Compare flags passed to keyinfo->bin_search
+  @param key            Key to insert
+  @param page_pos       Position of the page to search
+  @param father_page    Parent page, 0 when called for the root page
+  @param father_keypos  Position in parent page, passed on to _ma_insert()
+  @param insert_last    Cleared as soon as the key is not the last key on
+                        a page; passed on to _ma_insert()
+
+  @return
+  @retval -1   error
+  @retval 0    ok
+  @retval > 0  Key should be stored in higher tree
+*/
+
+static int w_search(register MARIA_HA *info, uint32 comp_flag, MARIA_KEY *key,
+                    my_off_t page_pos,
+                    MARIA_PAGE *father_page, uchar *father_keypos,
+                    my_bool insert_last)
+{
+  int error,flag;
+  uchar *temp_buff,*keypos;
+  uchar keybuff[MARIA_MAX_KEY_BUFF];
+  my_bool was_last_key;
+  my_off_t next_page, dup_key_pos;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_PAGE page;
+  DBUG_ENTER("w_search");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) (page_pos/keyinfo->block_length)));
+
+  if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
+                                      MARIA_MAX_KEY_BUFF*2)))
+    DBUG_RETURN(-1);
+  if (_ma_fetch_keypage(&page, info, keyinfo, page_pos, PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, temp_buff, 0))
+    goto err;
+
+  flag= (*keyinfo->bin_search)(key, &page, comp_flag, &keypos,
+                               keybuff, &was_last_key);
+  if (flag == 0)
+  {
+    MARIA_KEY tmp_key;
+    /* get position to record with duplicated key */
+
+    tmp_key.keyinfo= keyinfo;
+    tmp_key.data= keybuff;
+
+    if ((*keyinfo->get_key)(&tmp_key, page.flag, page.node, &keypos))
+      dup_key_pos= _ma_row_pos_from_key(&tmp_key);
+    else
+      dup_key_pos= HA_OFFSET_ERROR;
+
+    if (keyinfo->flag & HA_FULLTEXT)
+    {
+      uint off;
+      int  subkeys;
+
+      get_key_full_length_rdonly(off, keybuff);
+      subkeys=ft_sintXkorr(keybuff+off);
+      comp_flag=SEARCH_SAME;
+      if (subkeys >= 0)
+      {
+        /* normal word, one-level tree structure */
+        flag=(*keyinfo->bin_search)(key, &page, comp_flag,
+                                    &keypos, keybuff, &was_last_key);
+      }
+      else
+      {
+        /* popular word. two-level tree. going down */
+        my_off_t root=dup_key_pos;
+        MARIA_KEY subkey;
+
+        /*
+          The second-level tree is keyed on what follows the word in the
+          key data. 'key' is a MARIA_KEY, not a byte buffer, so the word
+          length has to be read from key->data and the sub-key has to be
+          described by its own MARIA_KEY.
+        */
+        get_key_full_length_rdonly(off, key->data);
+        subkey.keyinfo= keyinfo= &share->ft2_keyinfo;
+        subkey.data= key->data + off;
+        subkey.data_length= key->data_length - off;
+        subkey.ref_length= key->ref_length;
+        subkey.flag= key->flag;
+
+        /* we'll modify key entry 'in vivo' */
+        keypos-= keyinfo->keylength + page.node;
+        error= _ma_ck_real_write_btree(info, &subkey, &root, comp_flag);
+        _ma_dpointer(share, keypos+HA_FT_WLEN, root);
+        subkeys--; /* should there be underflow protection ? */
+        DBUG_ASSERT(subkeys < 0);
+        ft_intXstore(keypos, subkeys);
+        if (!error)
+        {
+          page_mark_changed(info, &page);
+          if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                                DFLT_INIT_HITS))
+            goto err;
+        }
+        my_afree(temp_buff);
+        /*
+          _ma_ck_real_write_btree() returns 1 on error. A positive value
+          from w_search() means "store key in higher tree", so an error
+          has to be reported as -1.
+        */
+        DBUG_RETURN(error ? -1 : 0);
+      }
+    }
+    else /* not HA_FULLTEXT, normal HA_NOSAME key */
+    {
+      /*
+        TODO
+        When the index will support true versioning - with multiple
+        identical values in the UNIQUE index, invisible to each other -
+        the following should be changed to "continue inserting keys, at the
+        end (of the row or statement) wait". We need to wait on *all*
+        unique conflicts at once, not one-at-a-time, because we need to
+        know all blockers in advance, otherwise we'll have incomplete wait-for
+        graph.
+      */
+      /*
+        transaction that has inserted the conflicting key may be in progress.
+        the caller will wait for it to be committed or aborted.
+      */
+      info->dup_key_trid= _ma_trid_from_key(&tmp_key);
+      info->dup_key_pos= dup_key_pos;
+      my_errno= HA_ERR_FOUND_DUPP_KEY;
+      DBUG_PRINT("warning",
+                 ("Duplicate key. dup_key_trid: %lu  pos %lu  visible: %d",
+                  (ulong) info->dup_key_trid,
+                  (ulong) info->dup_key_pos,
+                  info->trn ? trnman_can_read_from(info->trn,
+                                                   info->dup_key_trid) : 2));
+      goto err;
+    }
+  }
+  if (flag == MARIA_FOUND_WRONG_KEY)
+    goto err;
+  if (!was_last_key)
+    insert_last=0;
+  next_page= _ma_kpos(page.node, keypos);
+  if (next_page == HA_OFFSET_ERROR ||
+      (error= w_search(info, comp_flag, key, next_page,
+                       &page, keypos, insert_last)) > 0)
+  {
+    /* Leaf page reached, or a key came up from the level below */
+    error= _ma_insert(info, key, &page, keypos, keybuff,
+                      father_page, father_keypos, insert_last);
+    page_mark_changed(info, &page);
+    if (_ma_write_keypage(&page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS))
+      goto err;
+  }
+  my_afree(temp_buff);
+  DBUG_RETURN(error);
+err:
+  my_afree(temp_buff);
+  DBUG_PRINT("exit",("Error: %d",my_errno));
+  DBUG_RETURN(-1);
+} /* w_search */
+
+
+/*
+  Insert new key.
+
+  SYNOPSIS
+    _ma_insert()
+    info                        Open table information.
+    key                         New key (key->keyinfo is the key definition)
+    anc_page                    Key page (beginning)
+    key_pos                     Position in key page where to insert.
+    key_buff                    Copy of previous key if keys where packed.
+    father_page                 Parent key page, 0 if there is none.
+    father_key_pos              position in parent key page for balancing.
+    insert_last                 If to append at end of page.
+
+  DESCRIPTION
+    Insert new key at right of key_pos.
+    Note that caller must save anc_buff
+
+    This function writes log records for all changed pages
+    (Including anc_buff and father page)
+
+    If the key fits on the page and the page is an almost full fulltext
+    page whose first key is the same word as the new key, a conversion to
+    a two-level fulltext tree is prepared by filling info->ft1_to_ft2.
+    The conversion is optional: if memory for info->ft1_to_ft2 can't be
+    allocated it is skipped and the key simply stays on the page.
+
+    If the key doesn't fit, the page is balanced with a neighbour
+    (_ma_balance_page()) or split (_ma_split_page()).
+
+  RETURN
+    < 0         Error.
+    0           OK
+    1           If key contains key to upper level (from balance page)
+    2           If key contains key to upper level (from split space)
+*/
+
+int _ma_insert(register MARIA_HA *info, MARIA_KEY *key,
+               MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff,
+               MARIA_PAGE *father_page, uchar *father_key_pos,
+               my_bool insert_last)
+{
+  uint a_length, nod_flag, org_anc_length;
+  int t_length;
+  uchar *endpos, *prev_key, *anc_buff;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  DBUG_ENTER("_ma_insert");
+  DBUG_PRINT("enter",("key_pos: 0x%lx", (ulong) key_pos));
+  DBUG_EXECUTE("key", _ma_print_key(DBUG_FILE, key););
+
+  /*
+    Note that anc_page->size can be bigger then block_size in case of
+    delete key that caused increase of page length
+  */
+  org_anc_length= a_length= anc_page->size;
+  nod_flag= anc_page->node;
+
+  anc_buff= anc_page->buff;
+  endpos= anc_buff+ a_length;
+  /* There is no previous key when inserting at the start of the page */
+  prev_key= (key_pos == anc_buff + share->keypage_header + nod_flag ?
+             (uchar*) 0 : key_buff);
+  t_length= (*keyinfo->pack_key)(key, nod_flag,
+                                 (key_pos == endpos ? (uchar*) 0 : key_pos),
+                                 prev_key, prev_key, &s_temp);
+#ifndef DBUG_OFF
+  if (prev_key && (keyinfo->flag & (HA_BINARY_PACK_KEY | HA_PACK_KEY)))
+  {
+    DBUG_DUMP("prev_key", prev_key, _ma_keylength(keyinfo,prev_key));
+  }
+  if (keyinfo->flag & HA_PACK_KEY)
+  {
+    DBUG_PRINT("test",("t_length: %d  ref_len: %d",
+                       t_length,s_temp.ref_length));
+    DBUG_PRINT("test",("n_ref_len: %d  n_length: %d  key_pos: 0x%lx",
+                       s_temp.n_ref_length, s_temp.n_length, (long) s_temp.key));
+  }
+#endif
+  if (t_length > 0)
+  {
+    /* Page grows: move the keys after key_pos up to make room */
+    if (t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+    {
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(-1);
+    }
+    bmove_upp(endpos+t_length, endpos, (uint) (endpos-key_pos));
+  }
+  else
+  {
+    /* Page shrinks (or keeps its size): move the keys after key_pos down */
+    if (-t_length >= keyinfo->maxlength*2+MAX_POINTER_LENGTH)
+    {
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(-1);
+    }
+    bmove(key_pos,key_pos-t_length,(uint) (endpos-key_pos)+t_length);
+  }
+  (*keyinfo->store_key)(keyinfo,key_pos,&s_temp);
+  a_length+=t_length;
+
+  if (key->flag & (SEARCH_USER_KEY_HAS_TRANSID | SEARCH_PAGE_KEY_HAS_TRANSID))
+  {
+    _ma_mark_page_with_transid(share, anc_page);
+  }
+  anc_page->size= a_length;
+  page_store_size(share, anc_page);
+
+  /*
+    Check if the new key fits totally into the the page
+    (anc_buff is big enough to contain a full page + one key)
+  */
+  if (a_length <= share->max_index_block_size)
+  {
+    if (share->max_index_block_size - a_length < 32 &&
+        (keyinfo->flag & HA_FULLTEXT) && key_pos == endpos &&
+        share->base.key_reflength <= share->base.rec_reflength &&
+        share->options & (HA_OPTION_PACK_RECORD | HA_OPTION_COMPRESS_RECORD))
+    {
+      /*
+        Normal word. One-level tree. Page is almost full.
+        Let's consider converting.
+        We'll compare 'key' and the first key at anc_buff
+      */
+      const uchar *a= key->data;
+      const uchar *b= anc_buff + share->keypage_header + nod_flag;
+      uint alen, blen, ft2len= share->ft2_keyinfo.keylength;
+      /* the very first key on the page is always unpacked */
+      DBUG_ASSERT((*b & 128) == 0);
+#if HA_FT_MAXLEN >= 127
+      blen= mi_uint2korr(b); b+=2;
+      When you enable this code, as part of the MyISAM->Maria merge of
+ChangeSet@1.2562, 2008-04-09 07:41:40+02:00, serg@janus.mylan +9 -0
+  restore ft2 functionality, fix bugs.
+      Then this will enable two-level fulltext index, which is not totally
+      recoverable yet.
+      So remove this text and inform Guilhem so that he fixes the issue.
+#else
+      blen= *b++;
+#endif
+      get_key_length(alen,a);
+      DBUG_ASSERT(info->ft1_to_ft2==0);
+      if (alen == blen &&
+          ha_compare_text(keyinfo->seg->charset, a, alen,
+                          b, blen, 0, 0) == 0)
+      {
+        /* Yup. converting */
+        info->ft1_to_ft2=(DYNAMIC_ARRAY *)
+          my_malloc(sizeof(DYNAMIC_ARRAY), MYF(MY_WME));
+        /*
+          The conversion is only an optimization. If we are out of memory
+          (my_malloc() was asked to report that through MY_WME) we skip it:
+          info->ft1_to_ft2 stays 0, so the caller will not try to convert,
+          and the key stays on the page as for any other word.
+        */
+        if (info->ft1_to_ft2)
+        {
+          my_init_dynamic_array(info->ft1_to_ft2, ft2len, 300, 50);
+
+          /*
+            Now, adding all keys from the page to dynarray
+            if the page is a leaf (if not keys will be deleted later)
+          */
+          if (!nod_flag)
+          {
+            /*
+              Let's leave the first key on the page, though, because
+              we cannot easily dispatch an empty page here
+            */
+            b+=blen+ft2len+2;
+            for (a=anc_buff+a_length ; b < a ; b+=ft2len+2)
+              insert_dynamic(info->ft1_to_ft2, b);
+
+            /* fixing the page's length - it contains only one key now */
+            anc_page->size= share->keypage_header + blen + ft2len + 2;
+            page_store_size(share, anc_page);
+          }
+          /* the rest will be done when we're back from recursion */
+        }
+      }
+    }
+    else
+    {
+      if (share->now_transactional &&
+          _ma_log_add(anc_page, org_anc_length,
+                      key_pos, s_temp.changed_length, t_length, 1,
+                      KEY_OP_DEBUG_LOG_ADD_1))
+        DBUG_RETURN(-1);
+    }
+    DBUG_RETURN(0);				/* There is room on page */
+  }
+  /* Page is full */
+  if (nod_flag)
+    insert_last=0;
+  /*
+    TODO:
+    Remove 'born_transactional' here.
+    The only reason for having it here is that the current
+    _ma_balance_page_ can't handle variable length keys.
+  */
+  if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) &&
+      father_page && !insert_last && !info->quick_mode &&
+      !info->s->base.born_transactional)
+  {
+    s_temp.key_pos= key_pos;
+    page_mark_changed(info, father_page);
+    DBUG_RETURN(_ma_balance_page(info, keyinfo, key, anc_page,
+                                 father_page, father_key_pos,
+                                 &s_temp));
+  }
+  DBUG_RETURN(_ma_split_page(info, key, anc_page,
+                             min(org_anc_length,
+                                 info->s->max_index_block_size),
+                             key_pos, s_temp.changed_length, t_length,
+                             key_buff, insert_last));
+} /* _ma_insert */
+
+
+/**
+  @brief split a full page in two and assign emerging item to key
+
+  The split position is found with _ma_find_last_pos() (when
+  insert_last_key is set) or _ma_find_half_pos(). split_page is shortened
+  to end before the key at that position, that key is copied to 'key'
+  followed by a pointer to the new page, and the keys after it are stored
+  on a new page that is built in info->buff and written with
+  _ma_write_keypage().
+
+  @fn _ma_split_page()
+    info	     Maria handler
+    key		     In: key->keyinfo is the key definition
+                     Out: middle key + pointer to the new page
+    split_page       Page that should be split
+    org_split_length Original length of split_page before key was inserted
+    inserted_key_pos Address in buffer where key was inserted
+    changed_length   Number of bytes changed at 'inserted_key_pos'
+    move_length	     Number of bytes buffer was moved when key was inserted
+    key_buff	     Key buffer to use for temporary storage of key
+    insert_last_key  If we are insert key on rightmost key page
+
+  @note
+    split_buff is not stored on disk    (caller has to do this)
+
+  @note
+    NOTE(review): on the early error returns after the split position was
+    found, split_page->size has already been changed to the new length.
+
+  @return
+  @retval 2   ok  (Middle key up from _ma_insert())
+  @retval -1  error
+*/
+
+int _ma_split_page(MARIA_HA *info, MARIA_KEY *key, MARIA_PAGE *split_page,
+                   uint org_split_length,
+                   uchar *inserted_key_pos, uint changed_length,
+                   int move_length,
+                   uchar *key_buff, my_bool insert_last_key)
+{
+  uint length,a_length,key_ref_length,t_length,nod_flag,key_length;
+  uint page_length, split_length, page_flag;
+  uchar *key_pos,*pos, *after_key;
+  MARIA_KEY_PARAM s_temp;
+  MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_KEY tmp_key;
+  MARIA_PAGE new_page;
+  int res;
+  DBUG_ENTER("_ma_split_page");
+
+  LINT_INIT(after_key);
+  DBUG_DUMP("buff", split_page->buff, split_page->size);
+
+  info->page_changed=1;			/* Info->buff is used */
+  info->keyread_buff_used=1;
+  page_flag= split_page->flag;
+  nod_flag=  split_page->node;
+  key_ref_length= share->keypage_header + nod_flag;
+
+  /* The new page is built in info->buff */
+  new_page.info= info;
+  new_page.buff= info->buff;
+  new_page.keyinfo= keyinfo;
+
+  /* tmp_key (stored in key_buff) receives the key at the split position */
+  tmp_key.data=   key_buff;
+  tmp_key.keyinfo= keyinfo;
+  if (insert_last_key)
+    key_pos= _ma_find_last_pos(&tmp_key, split_page, &after_key);
+  else
+    key_pos= _ma_find_half_pos(&tmp_key, split_page, &after_key);
+  if (!key_pos)
+    DBUG_RETURN(-1);
+
+  /* split_page keeps everything before the key at the split position */
+  key_length= tmp_key.data_length + tmp_key.ref_length;
+  split_length= (uint) (key_pos - split_page->buff);
+  a_length= split_page->size;
+  split_page->size= split_length;
+  page_store_size(share, split_page);
+
+  key_pos=after_key;
+  if (nod_flag)
+  {
+    DBUG_PRINT("test",("Splitting nod"));
+    /* Copy the node pointer in front of after_key to the new page */
+    pos=key_pos-nod_flag;
+    memcpy(new_page.buff + share->keypage_header, pos, (size_t) nod_flag);
+  }
+
+  /* Move middle item to key and pointer to new page */
+  if ((new_page.pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) ==
+      HA_OFFSET_ERROR)
+    DBUG_RETURN(-1);
+
+  _ma_copy_key(key, &tmp_key);
+  _ma_kpointer(info, key->data + key_length, new_page.pos);
+
+  /* Store new page */
+  /* Unpack the first key for the new page; key_pos moves past it */
+  if (!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &key_pos))
+    DBUG_RETURN(-1);
+
+  /* Pack that key as the first key on a page (no previous key given) */
+  t_length=(*keyinfo->pack_key)(&tmp_key, nod_flag, (uchar *) 0,
+				(uchar*) 0, (uchar*) 0, &s_temp);
+  /* Copy the rest of the old page after the room left for the first key */
+  length=(uint) ((split_page->buff + a_length) - key_pos);
+  memcpy(new_page.buff + key_ref_length + t_length, key_pos,
+	 (size_t) length);
+  (*keyinfo->store_key)(keyinfo,new_page.buff+key_ref_length,&s_temp);
+  page_length= length + t_length + key_ref_length;
+
+  /* Clear only the header; the node pointer copied above lies after it */
+  bzero(new_page.buff, share->keypage_header);
+  /* Copy KEYFLAG_FLAG_ISNODE and KEYPAGE_FLAG_HAS_TRANSID from parent page */
+  new_page.flag= page_flag;
+  new_page.size= page_length;
+  page_store_info(share, &new_page);
+
+  /* Copy key number */
+  new_page.buff[share->keypage_header - KEYPAGE_USED_SIZE -
+                KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]=
+    split_page->buff[share->keypage_header - KEYPAGE_USED_SIZE -
+                     KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE];
+
+  res= 2;                                       /* Middle key up */
+  if (share->now_transactional && _ma_log_new(&new_page, 0))
+    res= -1;
+
+  /*
+    Clear uninitialized part of page to avoid valgrind/purify warnings
+    and to get a clean page that is easier to compress and compare with
+    pages generated with redo
+  */
+  bzero(new_page.buff + page_length, share->block_size - page_length);
+
+  /* The new page is written even if logging failed; res keeps the error */
+  if (_ma_write_keypage(&new_page, page_link->write_lock,
+                        DFLT_INIT_HITS))
+    res= -1;
+
+  /* Save changes to split pages */
+  if (share->now_transactional &&
+      _ma_log_split(split_page, org_split_length, split_length,
+                    inserted_key_pos, changed_length, move_length,
+                    KEY_OP_NONE, (uchar*) 0, 0, 0))
+    res= -1;
+
+  DBUG_DUMP_KEY("middle_key", key);
+  DBUG_RETURN(res);
+} /* _ma_split_page */
+
+
+/**
+  @brief Find the key at which a page should be split in two halves
+
+  For keys of fixed length on a page without transaction ids the position
+  is calculated; otherwise the keys are unpacked one by one with
+  keyinfo->get_key until the half-way mark has been passed.
+
+  @param key        In: key->keyinfo and the buffer key->data must be set.
+                    Out: the key at the returned position
+  @param ma_page    Page to examine
+  @param after_key  Out: position where the next key starts
+
+  @return Pointer to start of the found key
+  @retval 0  A key could not be unpacked
+*/
+
+uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *ma_page,
+                         uchar **after_key)
+{
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  MARIA_SHARE *share= ma_page->info->s;
+  uint nod_flag= ma_page->node;
+  uint page_flag= ma_page->flag;
+  uint header_length= share->keypage_header + nod_flag;
+  uint keys_length= ma_page->size - header_length;
+  uchar *pos= ma_page->buff + header_length;      /* Point to first key */
+  uchar *half, *key_start;
+  DBUG_ENTER("_ma_find_half_pos");
+
+  if (!((keyinfo->flag &
+         (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+          HA_BINARY_PACK_KEY)) ||
+        (page_flag & KEYPAGE_FLAG_HAS_TRANSID)))
+  {
+    /* All entries have the same length; the position can be calculated */
+    uint entry_length= keyinfo->keylength + nod_flag;
+    uint keys_before= keys_length / (entry_length * 2);
+
+    half= pos + keys_before * entry_length;
+    key->data_length= keyinfo->keylength - share->rec_reflength;
+    key->ref_length=  share->rec_reflength;
+    key->flag= 0;
+    *after_key= half + entry_length;
+    memcpy(key->data, half, entry_length);
+    DBUG_RETURN(half);
+  }
+
+  half= pos + keys_length / 2 - header_length;    /* This is aprox. half */
+  key->data[0]= 0;                                /* Safety */
+  for (;;)
+  {
+    key_start= pos;
+    if (!(*keyinfo->get_key)(key, page_flag, nod_flag, &pos))
+      DBUG_RETURN(0);
+    if (pos >= half)
+      break;
+  }
+  *after_key= pos;
+  DBUG_PRINT("exit",("returns: 0x%lx  page: 0x%lx  half: 0x%lx",
+                     (long) key_start, (long) pos, (long) half));
+  DBUG_RETURN(key_start);
+} /* _ma_find_half_pos */
+
+
+/**
+  Find second to last key on leaf page
+
+  @notes
+  Used to split buffer at last key.  In this case the next to last
+  key will be moved to parent page and last key will be on it's own page.
+
+  @TODO
+  Add one argument for 'last key value' to get_key so that one can
+  do the loop without having to copy the found key the whole time
+
+  @param int_key    In: int_key->keyinfo and the buffer int_key->data must
+                    be set. Out: the key at the returned position
+  @param ma_page    Page to examine
+  @param after_key  Out: position where the key after the returned one starts
+
+  @return Pointer to the start of the key before the last key
+  @retval 0  A key could not be unpacked; my_errno is set to HA_ERR_CRASHED
+*/
+
+static uchar *_ma_find_last_pos(MARIA_KEY *int_key, MARIA_PAGE *ma_page,
+                                uchar **after_key)
+{
+  MARIA_KEYDEF *keyinfo= int_key->keyinfo;
+  MARIA_SHARE *share= ma_page->info->s;
+  uint header_length= share->keypage_header;
+  uint page_flag= ma_page->flag;
+  uint length= ma_page->size - header_length;
+  uchar *pos= ma_page->buff + header_length;
+  uchar *limit, *cur_key, *prev_key;
+  uchar unpack_buff[MARIA_MAX_KEY_BUFF];
+  MARIA_KEY unpacked;
+  DBUG_ENTER("_ma_find_last_pos");
+
+  if (!((keyinfo->flag &
+         (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY |
+          HA_BINARY_PACK_KEY)) ||
+        (page_flag & KEYPAGE_FLAG_HAS_TRANSID)))
+  {
+    /* All keys have the same length; the position can be calculated */
+    uint keys_before= length / keyinfo->keylength - 2;
+
+    length= keyinfo->keylength;
+    limit= pos + keys_before * length;
+    int_key->data_length= length - share->rec_reflength;
+    int_key->ref_length=  share->rec_reflength;
+    int_key->flag= 0;
+    *after_key= limit + length;
+    memcpy(int_key->data, limit, length);
+    DBUG_RETURN(limit);
+  }
+
+  limit= pos + length - header_length;
+  cur_key= pos;
+  unpacked.keyinfo= int_key->keyinfo;
+  unpacked.data= unpack_buff;
+  unpack_buff[0]= 0;                            /* Safety */
+
+  /* We know that there are at least 2 keys on the page */
+
+  length= (*keyinfo->get_key)(&unpacked, page_flag, 0, &pos);
+  if (!length)
+  {
+    my_errno=HA_ERR_CRASHED;
+    DBUG_RETURN(0);
+  }
+
+  for (;;)
+  {
+    prev_key= cur_key;
+    cur_key= pos;
+    /* Remember the key unpacked last before unpacking the next one */
+    int_key->data_length= unpacked.data_length;
+    int_key->ref_length=  unpacked.ref_length;
+    int_key->flag=        unpacked.flag;
+    memcpy(int_key->data, unpack_buff, length);
+    length= (*keyinfo->get_key)(&unpacked, page_flag, 0, &pos);
+    if (!length)
+    {
+      my_errno=HA_ERR_CRASHED;
+      DBUG_RETURN(0);
+    }
+    if (pos >= limit)
+      break;
+  }
+
+  *after_key= cur_key;
+  DBUG_PRINT("exit",("returns: 0x%lx  page: 0x%lx  end: 0x%lx",
+                     (long) prev_key,(long) pos,(long) limit));
+  DBUG_RETURN(prev_key);
+} /* _ma_find_last_pos */
+
+
+/**
+  @brief Balance page with static size keys with page on right/left
+
+  @param key 	Middle key will be stored here
+
+  @notes
+    Father_buff will always be changed
+    Caller must handle saving of curr_buff
+
+  @return
+  @retval  0   Balance was done (father buff is saved)
+  @retval  1   Middle key up    (father buff is not saved)
+  @retval  -1  Error
+*/
+
+static int _ma_balance_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+			    MARIA_KEY *key, MARIA_PAGE *curr_page,
+                            MARIA_PAGE *father_page,
+                            uchar *father_key_pos, MARIA_KEY_PARAM *s_temp)
+{
+  MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link;
+  MARIA_SHARE *share= info->s;
+  my_bool right;
+  uint k_length,father_length,father_keylength,nod_flag,curr_keylength;
+  uint right_length,left_length,new_right_length,new_left_length,extra_length;
+  uint keys, tmp_length, extra_buff_length;
+  uchar *pos, *extra_buff, *parting_key;
+  uchar tmp_part_key[MARIA_MAX_KEY_BUFF];
+  MARIA_PAGE next_page, extra_page, *left_page, *right_page;
+  DBUG_ENTER("_ma_balance_page");
+
+  k_length= keyinfo->keylength;
+  father_length= father_page->size;
+  father_keylength= k_length + share->base.key_reflength;
+  nod_flag= curr_page->node;
+  curr_keylength= k_length+nod_flag;
+  info->page_changed=1;
+
+  if ((father_key_pos != father_page->buff+father_length &&
+       (info->state->records & 1)) ||
+      father_key_pos == father_page->buff+ share->keypage_header +
+      share->base.key_reflength)
+  {
+    right=1;
+    next_page.pos= _ma_kpos(share->base.key_reflength,
+                            father_key_pos+father_keylength);
+    left_page=  curr_page;
+    right_page= &next_page;
+    DBUG_PRINT("info", ("use right page: %lu",
+                        (ulong) (next_page.pos / keyinfo->block_length)));
+  }
+  else
+  {
+    right=0;
+    father_key_pos-=father_keylength;
+    next_page.pos= _ma_kpos(share->base.key_reflength,father_key_pos);
+    left_page=  &next_page;
+    right_page= curr_page;
+    DBUG_PRINT("info", ("use left page: %lu",
+                        (ulong) (next_page.pos / keyinfo->block_length)));
+  }					/* father_key_pos ptr to parting key */
+
+  if (_ma_fetch_keypage(&next_page, info, keyinfo, next_page.pos,
+                        PAGECACHE_LOCK_WRITE,
+                        DFLT_INIT_HITS, info->buff, 0))
+    goto err;
+  page_mark_changed(info, &next_page);
+  DBUG_DUMP("next", next_page.buff, next_page.size);
+
+  /* Test if there is room to share keys */
+  left_length= left_page->size;
+  right_length= right_page->size;
+  keys= ((left_length+right_length-share->keypage_header*2-nod_flag*2)/
+         curr_keylength);
+
+  if ((right ? right_length : left_length) + curr_keylength <=
+      share->max_index_block_size)
+  {
+    /* Enough space to hold all keys in the two buffers ; Balance bufferts */
+    new_left_length= share->keypage_header+nod_flag+(keys/2)*curr_keylength;
+    new_right_length=share->keypage_header+nod_flag+(((keys+1)/2)*
+                                                       curr_keylength);
+    left_page->size=  new_left_length;
+    page_store_size(share, left_page);
+    right_page->size= new_right_length;
+    page_store_size(share, right_page);
+
+    DBUG_PRINT("info", ("left_length: %u -> %u  right_length: %u -> %u",
+                        left_length, new_left_length,
+                        right_length, new_right_length));
+    if (left_length < new_left_length)
+    {
+      uint length;
+      DBUG_PRINT("info", ("move keys to end of buff"));
+
+      /* Move keys right_page -> left_page */
+      pos= left_page->buff+left_length;
+      memcpy(pos,father_key_pos, (size_t) k_length);
+      memcpy(pos+k_length, right_page->buff + share->keypage_header,
+	     (size_t) (length=new_left_length - left_length - k_length));
+      pos= right_page->buff + share->keypage_header + length;
+      memcpy(father_key_pos, pos, (size_t) k_length);
+      bmove(right_page->buff + share->keypage_header,
+            pos + k_length, new_right_length);
+
+      if (share->now_transactional)
+      {
+        if (right)
+        {
+          /*
+            Log changes to page on left
+            The original page is on the left and stored in left_page->buff
+            We have on the page the newly inserted key and data
+            from buff added last on the page
+          */
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_left_length,
+                            s_temp->key_pos, s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_ADD_SUFFIX,
+                            curr_page->buff + left_length,
+                            new_left_length - left_length,
+                            new_left_length - left_length+ k_length))
+            goto err;
+          /*
+            Log changes to page on right
+            This contains the original data with some keys deleted from
+            start of page
+          */
+          if (_ma_log_prefix(&next_page, 0,
+                             ((int) new_right_length - (int) right_length),
+                             KEY_OP_DEBUG_LOG_PREFIX_3))
+            goto err;
+        }
+        else
+        {
+          /*
+            Log changes to page on right (the original page) which is in buff
+            Data is removed from start of page
+            The inserted key may be in buff or moved to curr_buff
+          */
+          if (_ma_log_del_prefix(curr_page,
+                                 right_length - s_temp->changed_length,
+                                 new_right_length,
+                                 s_temp->key_pos, s_temp->changed_length,
+                                 s_temp->move_length))
+            goto err;
+          /*
+            Log changes to page on left, which has new data added last
+          */
+          if (_ma_log_suffix(&next_page, left_length, new_left_length))
+            goto err;
+        }
+      }
+    }
+    else
+    {
+      uint length;
+      DBUG_PRINT("info", ("move keys to start of right_page"));
+
+      bmove_upp(right_page->buff + new_right_length,
+                right_page->buff + right_length,
+		right_length - share->keypage_header);
+      length= new_right_length -right_length - k_length;
+      memcpy(right_page->buff + share->keypage_header + length, father_key_pos,
+             (size_t) k_length);
+      pos= left_page->buff + new_left_length;
+      memcpy(father_key_pos, pos, (size_t) k_length);
+      memcpy(right_page->buff + share->keypage_header, pos+k_length,
+             (size_t) length);
+
+      if (share->now_transactional)
+      {
+        if (right)
+        {
+          /*
+            Log changes to page on left
+            The original page is on the left and stored in curr_buff
+            The page is shortened from end and the key may be on the page
+          */
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_left_length,
+                            s_temp->key_pos, s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_NONE, (uchar*) 0, 0, 0))
+            goto err;
+          /*
+            Log changes to page on right
+            This contains the original data, with some data from cur_buff
+            added first
+          */
+          if (_ma_log_prefix(&next_page,
+                             (uint) (new_right_length - right_length),
+                             (int) (new_right_length - right_length),
+                             KEY_OP_DEBUG_LOG_PREFIX_4))
+            goto err;
+        }
+        else
+        {
+          /*
+            Log changes to page on right (the original page) which is in buff
+            We have on the page the newly inserted key and data
+            from buff added first on the page
+          */
+          uint diff_length= new_right_length - right_length;
+          if (_ma_log_split(curr_page,
+                            left_length - s_temp->move_length,
+                            new_right_length,
+                            s_temp->key_pos + diff_length,
+                            s_temp->changed_length,
+                            s_temp->move_length,
+                            KEY_OP_ADD_PREFIX,
+                            curr_page->buff + share->keypage_header,
+                            diff_length, diff_length + k_length))
+            goto err;
+          /*
+            Log changes to page on left, which is shortened from end
+          */
+          if (_ma_log_suffix(&next_page, left_length, new_left_length))
+            goto err;
+        }
+      }
+    }
+
+    /* Log changes to father (one level up) page */
+
+    if (share->now_transactional &&
+        _ma_log_change(father_page, father_key_pos, k_length,
+                       KEY_OP_DEBUG_FATHER_CHANGED_1))
+      goto err;
+
+    /*
+      next_page_link->changed is marked as true above and fathers
+      page_link->changed is marked as true in caller
+    */
+    if (_ma_write_keypage(&next_page, PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                          DFLT_INIT_HITS) ||
+        _ma_write_keypage(father_page,
+                          PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS))
+      goto err;
+    DBUG_RETURN(0);
+  }
+
+  /* left_page and right_page are full, lets split and make new nod */
+
+  extra_buff= info->buff+share->base.max_key_block_length;
+  new_left_length= new_right_length= (share->keypage_header + nod_flag +
+                                      (keys+1) / 3 * curr_keylength);
+  extra_page.info=    info;
+  extra_page.keyinfo= keyinfo;
+  extra_page.buff=    extra_buff;
+
+  /*
+    5 is the minum number of keys we can have here. This comes from
+    the fact that each full page can store at least 2 keys and in this case
+    we have a 'split' key, ie 2+2+1 = 5
+  */
+  if (keys == 5)				/* Too few keys to balance */
+    new_left_length-=curr_keylength;
+  extra_length= (nod_flag + left_length + right_length -
+                 new_left_length - new_right_length - curr_keylength);
+  extra_buff_length= extra_length + share->keypage_header;
+  DBUG_PRINT("info",("left_length: %d  right_length: %d  new_left_length: %d  new_right_length: %d  extra_length: %d",
+                     left_length, right_length,
+                     new_left_length, new_right_length,
+                     extra_length));
+
+  left_page->size= new_left_length;
+  page_store_size(share, left_page);
+  right_page->size= new_right_length;
+  page_store_size(share, right_page);
+
+  bzero(extra_buff, share->keypage_header);
+  extra_page.flag= nod_flag ? KEYPAGE_FLAG_ISNOD : 0;
+  extra_page.size= extra_buff_length;
+  page_store_info(share, &extra_page);
+
+  /* Copy key number */
+  extra_buff[share->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE -
+             KEYPAGE_FLAG_SIZE]= keyinfo->key_nr;
+
+  /* move first largest keys to new page  */
+  pos= right_page->buff + right_length-extra_length;
+  memcpy(extra_buff + share->keypage_header, pos, extra_length);
+  /* Zero old data from buffer */
+  bzero(extra_buff + extra_buff_length,
+        share->block_size - extra_buff_length);
+
+  /* Save new parting key between buff and extra_buff */
+  memcpy(tmp_part_key, pos-k_length,k_length);
+  /* Make place for new keys */
+  bmove_upp(right_page->buff + new_right_length, pos - k_length,
+            right_length - extra_length - k_length - share->keypage_header);
+  /* Copy keys from left page */
+  pos= left_page->buff + new_left_length;
+  memcpy(right_page->buff + share->keypage_header, pos + k_length,
+         (size_t) (tmp_length= left_length - new_left_length - k_length));
+  /* Copy old parting key */
+  parting_key= right_page->buff + share->keypage_header + tmp_length;
+  memcpy(parting_key, father_key_pos, (size_t) k_length);
+
+  /* Move new parting keys up to caller */
+  memcpy((right ? key->data : father_key_pos),pos,(size_t) k_length);
+  memcpy((right ? father_key_pos : key->data),tmp_part_key, k_length);
+
+  if ((extra_page.pos= _ma_new(info, DFLT_INIT_HITS, &new_page_link))
+      == HA_OFFSET_ERROR)
+    goto err;
+  _ma_kpointer(info,key->data+k_length, extra_page.pos);
+  /* This is safe as long we are using not keys with transid */
+  key->data_length= k_length - info->s->rec_reflength;
+  key->ref_length= info->s->rec_reflength;
+
+  if (right)
+  {
+    /*
+      Page order according to key values:
+      orignal_page (curr_page = left_page), next_page (buff), extra_buff
+
+      Move page positions so that we store data in extra_page where
+      next_page was and next_page will be stored at the new position
+    */
+    swap_variables(my_off_t, extra_page.pos, next_page.pos);
+  } 
+
+  if (share->now_transactional)
+  {
+    if (right)
+    {
+      /*
+        left_page is shortened,
+        right_page is getting new keys at start and shortened from end.
+        extra_page is new page
+
+        Note that extra_page (largest key parts) will be stored at the
+        place of the original 'right' page (next_page) and right page
+        will be stored at the new page position
+
+        This makes the log entries smaller as right_page contains all
+        data to generate the data extra_buff
+      */
+
+      /*
+        Log changes to page on left (page shortened page at end)
+      */
+      if (_ma_log_split(curr_page,
+                        left_length - s_temp->move_length, new_left_length,
+                        s_temp->key_pos, s_temp->changed_length,
+                        s_temp->move_length,
+                        KEY_OP_NONE, (uchar*) 0, 0, 0))
+        goto err;
+      /*
+        Log changes to right page (stored at next page)
+        This contains the last 'extra_buff' from 'buff'
+      */
+      if (_ma_log_prefix(&extra_page,
+                         0, (int) (extra_buff_length - right_length),
+                         KEY_OP_DEBUG_LOG_PREFIX_5))
+        goto err;
+
+      /*
+        Log changes to middle page, which is stored at the new page
+        position
+      */
+      if (_ma_log_new(&next_page, 0))
+        goto err;
+    }
+    else
+    {
+      /*
+        Log changes to page on right (the original page) which is in buff
+        This contains the original data, with some data from curr_buff
+        added first and shortened at end
+      */
+      int data_added_first= left_length - new_left_length;
+      if (_ma_log_key_middle(right_page,
+                             new_right_length,
+                             data_added_first,
+                             data_added_first,
+                             extra_length,
+                             s_temp->key_pos,
+                             s_temp->changed_length,
+                             s_temp->move_length))
+        goto err;
+
+      /* Log changes to page on left, which is shortened from end */
+      if (_ma_log_suffix(left_page, left_length, new_left_length))
+        goto err;
+
+      /* Log change to rightmost (new) page */
+      if (_ma_log_new(&extra_page, 0))
+        goto err;
+    }
+
+    /* Log changes to father (one level up) page */
+    if (share->now_transactional &&
+        _ma_log_change(father_page, father_key_pos, k_length,
+                       KEY_OP_DEBUG_FATHER_CHANGED_2))
+      goto err;
+  }
+
+  if (_ma_write_keypage(&next_page,
+                        (right ? new_page_link->write_lock :
+                         PAGECACHE_LOCK_LEFT_WRITELOCKED),
+                        DFLT_INIT_HITS) ||
+      _ma_write_keypage(&extra_page,
+                        (!right ? new_page_link->write_lock :
+                         PAGECACHE_LOCK_LEFT_WRITELOCKED),
+                        DFLT_INIT_HITS))
+    goto err;
+
+  DBUG_RETURN(1);				/* Middle key up */
+
+err:
+  DBUG_RETURN(-1);
+} /* _ma_balance_page */
+
+
+/**********************************************************************
+ *                Bulk insert code                                    *
+ **********************************************************************/
+
+/*
+  Argument block handed to the bulk-insert tree callbacks (keys_compare,
+  keys_free); one is filled in per tree by maria_init_bulk_insert()
+*/
+typedef struct {
+  MARIA_HA *info;                       /* Handler that owns the tree */
+  uint keynr;                           /* Index into share->keyinfo */
+} bulk_insert_param;
+
+
+/*
+  Buffer a key in the bulk-insert tree of its index instead of writing
+  it to the b-tree directly.
+
+  Returns 1 if tree_insert() could not store the key, 0 otherwise.
+*/
+static my_bool _ma_ck_write_tree(register MARIA_HA *info, MARIA_KEY *key)
+{
+  TREE *tree= &info->bulk_insert[key->keyinfo->key_nr];
+  uint total_length= key->data_length + key->ref_length;
+  DBUG_ENTER("_ma_ck_write_tree");
+
+  /* ref_length is the same for every key, so remember it once per handler */
+  info->bulk_insert_ref_length= key->ref_length;
+
+  if (!tree_insert(tree, key->data, total_length, tree->custom_arg))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+} /* _ma_ck_write_tree */
+
+
+/* typeof(_ma_keys_compare)=qsort_cmp2 */
+
+/*
+  Tree comparison callback: orders two buffered keys with ha_key_cmp()
+  using the key segments of index param->keynr.
+*/
+static int keys_compare(bulk_insert_param *param, uchar *key1, uchar *key2)
+{
+  uint diff_pos[2];                     /* Filled by ha_key_cmp(); ignored */
+  MARIA_SHARE *share= param->info->s;
+  HA_KEYSEG *keyseg= share->keyinfo[param->keynr].seg;
+
+  return ha_key_cmp(keyseg, key1, key2, USE_WHOLE_KEY, SEARCH_SAME, diff_pos);
+}
+
+
+/*
+  Tree element-free callback used to flush buffered keys to the b-tree.
+
+  free_init   Take the root write lock (if key trees are locked) and bump
+              the key version
+  free_free   Write one buffered key with _ma_ck_write_btree()
+  free_end    Release the root lock taken in free_init
+
+  Returns 0 for free_init/free_end, the result of _ma_ck_write_btree()
+  for free_free and 1 for any other mode.
+*/
+static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
+{
+  MARIA_HA *info= param->info;
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= &share->keyinfo[param->keynr];
+
+  if (mode == free_init)
+  {
+    if (share->lock_key_trees)
+    {
+      rw_wrlock(&keyinfo->root_lock);
+      keyinfo->version++;
+    }
+    return 0;
+  }
+
+  if (mode == free_end)
+  {
+    if (share->lock_key_trees)
+      rw_unlock(&keyinfo->root_lock);
+    return 0;
+  }
+
+  if (mode == free_free)
+  {
+    /*
+      Work on a private copy of the key: _ma_ck_write_btree() may need the
+      key buffer for copying the middle key up if the tree is growing.
+      A local buffer is used instead of info->lastkey to be on the safe side.
+    */
+    uchar key_copy[MARIA_MAX_KEY_BUFF];
+    MARIA_KEY tmp_key;
+    uint ref_length= info->bulk_insert_ref_length;
+
+    tmp_key.keyinfo=     keyinfo;
+    tmp_key.data=        key_copy;
+    /* Note: _ma_keylength() doesn't include transid lengths */
+    tmp_key.data_length= _ma_keylength(keyinfo, key) - share->rec_reflength;
+    tmp_key.ref_length=  ref_length;
+    if (ref_length == share->rec_reflength)
+      tmp_key.flag= 0;
+    else
+      tmp_key.flag= SEARCH_USER_KEY_HAS_TRANSID;
+
+    memcpy(key_copy, key, tmp_key.data_length + tmp_key.ref_length);
+    return _ma_ck_write_btree(info, &tmp_key);
+  }
+
+  return 1;
+}
+
+
+/**
+  @brief Set up the per-key in-memory trees that buffer keys for bulk insert
+
+  @param info        Maria handler; info->bulk_insert must be unset on entry
+  @param cache_size  Memory budget shared by all trees
+  @param rows        Expected number of rows; 0 is accepted and skips the
+                     row-based sizing of the trees
+
+  @note
+    Only keys that are active, are not HA_NOSAME and are not the
+    auto_increment key get a tree.  If there is no such key, or cache_size
+    is smaller than MARIA_MIN_SIZE_BULK_INSERT_TREE per tree, nothing is
+    allocated and info->bulk_insert is left unset.
+
+  @retval 0                  ok (trees created, or bulk insert not used)
+  @retval HA_ERR_OUT_OF_MEM  could not allocate the tree array
+*/
+int maria_init_bulk_insert(MARIA_HA *info, ulong cache_size, ha_rows rows)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *key=share->keyinfo;
+  bulk_insert_param *params;
+  uint i, num_keys, total_keylength;
+  ulonglong key_map;
+  /* Tag must match the function name so DBUG function filters find it */
+  DBUG_ENTER("maria_init_bulk_insert");
+  DBUG_PRINT("enter",("cache_size: %lu", cache_size));
+
+  DBUG_ASSERT(!info->bulk_insert &&
+              (!rows || rows >= MARIA_MIN_ROWS_TO_USE_BULK_INSERT));
+
+  /* Collect the keys that will be buffered and their combined size */
+  maria_clear_all_keys_active(key_map);
+  for (i=total_keylength=num_keys=0 ; i < share->base.keys ; i++)
+  {
+    if (! (key[i].flag & HA_NOSAME) && (share->base.auto_key != i + 1) &&
+        maria_is_key_active(share->state.key_map, i))
+    {
+      num_keys++;
+      maria_set_key_active(key_map, i);
+      total_keylength+=key[i].maxlength+TREE_ELEMENT_EXTRA_SIZE;
+    }
+  }
+
+  if (num_keys==0 ||
+      num_keys * MARIA_MIN_SIZE_BULK_INSERT_TREE > cache_size)
+    DBUG_RETURN(0);
+
+  /*
+    From here on cache_size is a number of keys per tree, not bytes:
+    either the expected row count (if all rows fit in the budget) or a
+    16th of what the budget can hold.  total_keylength is > 0 here as
+    num_keys > 0.
+  */
+  if (rows && rows*total_keylength < cache_size)
+    cache_size= (ulong)rows;
+  else
+    cache_size/=total_keylength*16;
+
+  /* One TREE per key in the table, followed by one param per used key */
+  info->bulk_insert=(TREE *)
+    my_malloc((sizeof(TREE)*share->base.keys+
+               sizeof(bulk_insert_param)*num_keys),MYF(0));
+
+  if (!info->bulk_insert)
+    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+
+  params=(bulk_insert_param *)(info->bulk_insert+share->base.keys);
+  for (i=0 ; i < share->base.keys ; i++)
+  {
+    if (maria_is_key_active(key_map, i))
+    {
+      params->info=info;
+      params->keynr=i;
+      /* Only allocate a 16'th of the buffer at a time */
+      init_tree(&info->bulk_insert[i],
+                cache_size * key[i].maxlength,
+                cache_size * key[i].maxlength, 0,
+                (qsort_cmp2)keys_compare, 0,
+                (tree_element_free) keys_free, (void *)params++);
+    }
+    else
+     info->bulk_insert[i].root=0;       /* Marks the tree as not inited */
+  }
+
+  DBUG_RETURN(0);
+}
+
+/*
+  Reset the bulk-insert tree of key 'inx', if bulk insert is active and
+  that key has a tree.
+*/
+void maria_flush_bulk_insert(MARIA_HA *info, uint inx)
+{
+  TREE *tree;
+
+  if (!info->bulk_insert)
+    return;
+
+  tree= &info->bulk_insert[inx];
+  if (is_tree_inited(tree))
+    reset_tree(tree);
+}
+
+/*
+  End bulk insert: delete every initialized tree and free the tree array.
+  If the table is being deleted, the element-free callback is removed
+  first so that deleting the tree does not invoke it.
+*/
+void maria_end_bulk_insert(MARIA_HA *info)
+{
+  uint keynr;
+  DBUG_ENTER("maria_end_bulk_insert");
+
+  if (!info->bulk_insert)
+  {
+    DBUG_VOID_RETURN;
+  }
+
+  for (keynr= 0 ; keynr < info->s->base.keys ; keynr++)
+  {
+    TREE *tree= &info->bulk_insert[keynr];
+
+    if (!is_tree_inited(tree))
+      continue;
+    if (info->s->deleting)
+      reset_free_element(tree);
+    delete_tree(tree);
+  }
+
+  my_free(info->bulk_insert, MYF(0));
+  info->bulk_insert= 0;
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  Dedicated functions that generate log entries
+****************************************************************************/
+
+
+/**
+  @brief Write a LOGREC_UNDO_KEY_INSERT record for an inserted key
+
+  @param info      Maria handler; info->trn->undo_lsn is stored in the record
+  @param key       Inserted key; data_length + ref_length bytes are logged
+  @param root      Passed on to the write hook as msg.root
+  @param new_root  Passed on to the write hook as msg.value
+  @param res_lsn   Handed to translog_write_record() as its LSN argument
+
+  @retval  0  ok
+  @retval -1  translog_write_record() reported an error
+*/
+int _ma_write_undo_key_insert(MARIA_HA *info, const MARIA_KEY *key,
+                              my_off_t *root, my_off_t new_root, LSN *res_lsn)
+{
+  MARIA_SHARE *share= info->s;
+  MARIA_KEYDEF *keyinfo= key->keyinfo;
+  /* Fixed header: previous undo LSN, room for the file id, key number */
+  uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE +
+                 KEY_NR_STORE_SIZE];
+  const uchar *key_value;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  struct st_msg_to_write_hook_for_undo_key msg;
+  uint key_length;
+
+  /* Save if we need to write a clr record */
+  lsn_store(log_data, info->trn->undo_lsn);
+  /*
+    The FILEID_STORE_SIZE bytes between the LSN and the key number are not
+    written here (NOTE(review): presumably filled in by
+    translog_write_record() via the log_data + LSN_STORE_SIZE argument
+    below - confirm)
+  */
+  key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE,
+               keyinfo->key_nr);
+  key_length= key->data_length + key->ref_length;
+  /* Part 0 is the fixed header, part 1 the key itself */
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key->data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+
+  msg.root= root;
+  msg.value= new_root;
+  msg.auto_increment= 0;
+  key_value= key->data;
+  /* auto_key is 1-based; this is true when 'key' is the auto_increment key */
+  if (share->base.auto_key == ((uint) keyinfo->key_nr + 1))
+  {
+    const HA_KEYSEG *keyseg= keyinfo->seg;
+    uchar reversed[MARIA_MAX_KEY_BUFF];
+    if (keyseg->flag & HA_SWAP_KEY)
+    {
+      /* We put key from log record to "data record" packing format... */
+      const uchar *key_ptr= key->data, *key_end= key->data + keyseg->length;
+      uchar *to= reversed + keyseg->length;
+      /*
+        Copy the first key segment byte-reversed into 'reversed'.
+        The do-while always copies at least one byte, so this relies on
+        keyseg->length being > 0.
+      */
+      do
+      {
+        *--to= *key_ptr++;
+      } while (key_ptr != key_end);
+      /* After the loop 'to' is back at the start of 'reversed' */
+      key_value= to;
+    }
+    /* ... so that we can read it with: */
+    msg.auto_increment=
+      ma_retrieve_auto_increment(key_value, keyseg->type);
+    /* and write_hook_for_undo_key_insert() will pick this. */
+  }
+
+  /* Record length is the fixed header plus the key bytes */
+  return translog_write_record(res_lsn, LOGREC_UNDO_KEY_INSERT,
+                               info->trn, info,
+                               (translog_size_t)
+                               log_array[TRANSLOG_INTERNAL_PARTS + 0].length +
+                               key_length,
+                               TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                               log_data + LSN_STORE_SIZE, &msg) ? -1 : 0;
+}
+
+
+/**
+  @brief Log creation of new page
+
+  @param ma_page    Page to log; everything from LSN_STORE_SIZE up to
+                    ma_page->size is copied into the log entry
+  @param root_page  Stored as the last byte of the log entry header
+
+  @note
+    We don't have to store the page_length into the log entry as we can
+    calculate this from the length of the log entry
+
+  @note
+    Sets ma_page->org_size to ma_page->size
+
+  @retval 1   error
+  @retval 0    ok
+*/
+
+my_bool _ma_log_new(MARIA_PAGE *ma_page, my_bool root_page)
+{
+  LSN lsn;
+  /* Header: file id, page number, key_del page number, key nr, root flag */
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE
+                 +1];
+  uint page_length;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 2];
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  my_off_t page= ma_page->pos / share->block_size;
+  DBUG_ENTER("_ma_log_new");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+  DBUG_ASSERT(share->now_transactional);
+
+  /* Store page number of the new page (file offset / block size) */
+  page_store(log_data + FILEID_STORE_SIZE, page);
+
+  /* Store link to next unused page ('page' is reused for this value) */
+  if (info->key_del_used == 2)
+    page= 0;                                    /* key_del not changed */
+  else
+    page= ((share->key_del_current == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO :
+           share->key_del_current / share->block_size);
+
+  page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page);
+  key_nr_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE*2,
+               ma_page->keyinfo->key_nr);
+  log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE*2 + KEY_NR_STORE_SIZE]=
+    (uchar) root_page;
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data);
+
+  /* The page content is logged without its leading LSN bytes */
+  page_length= ma_page->size - LSN_STORE_SIZE;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=   ma_page->buff + LSN_STORE_SIZE;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= page_length;
+
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX_NEW_PAGE,
+                            info->trn, info,
+                            (translog_size_t)
+                            (sizeof(log_data) + page_length),
+                            TRANSLOG_INTERNAL_PARTS + 2, log_array,
+                            log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief
+   Log when some part of the key page changes
+
+   @param ma_page       Page that is changed; its size must be unchanged
+                        (org_size == size)
+   @param key_pos       Start of the changed bytes; points into ma_page->buff
+   @param length        Number of changed bytes at key_pos
+   @param debug_marker  Written to the log entry only when
+                        EXTRA_DEBUG_KEY_CHANGES is defined
+
+   @retval 0  ok
+   @retval 1  error
+*/
+
+my_bool _ma_log_change(MARIA_PAGE *ma_page, const uchar *key_pos, uint length,
+                       enum en_key_debug debug_marker __attribute__((unused)))
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 6 + 7], *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  /* Position of the change relative to the start of the page buffer */
+  uint offset= (uint) (key_pos - ma_page->buff), translog_parts;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_change");
+  DBUG_PRINT("enter", ("page: %lu  length: %u", (ulong) page, length));
+
+  DBUG_ASSERT(info->s->now_transactional);
+  DBUG_ASSERT(offset + length <= ma_page->size);
+  DBUG_ASSERT(ma_page->org_size == ma_page->size);
+
+  /*
+    Store page number of the changed page
+    (the assignment repeats the initializer of 'page' above)
+  */
+  page= ma_page->pos / info->s->block_size;
+  page_store(log_data + FILEID_STORE_SIZE, page);
+  log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  (*log_pos++)= KEY_OP_DEBUG;
+  (*log_pos++)= debug_marker;
+#endif
+
+  /* Two 3-byte entries: <KEY_OP_OFFSET><offset> <KEY_OP_CHANGE><length> */
+  log_pos[0]= KEY_OP_OFFSET;
+  int2store(log_pos+1, offset);
+  log_pos[3]= KEY_OP_CHANGE;
+  int2store(log_pos+4, length);
+  log_pos+= 6;
+
+  /* Part 0 is the header built above, part 1 the changed bytes */
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (log_pos - log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key_pos;
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length;
+  translog_parts= 2;
+
+  /*
+    NOTE(review): _ma_log_key_changes() is defined outside this block; it
+    receives pointers to 'length' and 'translog_parts' and so may update
+    both - confirm
+  */
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &length, &translog_parts);
+
+  if (translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                            info->trn, info,
+                            (translog_size_t) (log_pos - log_data) + length,
+                            TRANSLOG_INTERNAL_PARTS + translog_parts,
+                            log_array, log_data, NULL))
+    DBUG_RETURN(1);
+  DBUG_RETURN(0);
+}
+
+
+/**
+   @brief Write log entry for page splitting
+
+   @fn     _ma_log_split()
+   @param
+     ma_page		Page that is changed
+     org_length	        Original length of page. Can be bigger than block_size
+                        for block that overflowed
+     new_length		New length of page
+     key_pos		Where key is inserted on page (may be 0 if no key)
+     key_length		Number of bytes changed at key_pos
+     move_length	Number of bytes moved at key_pos to make room for key
+     prefix_or_suffix   KEY_OP_NONE	    Ignored
+   			KEY_OP_ADD_PREFIX   Add data to start of page
+			KEY_OP_ADD_SUFFIX   Add data to end of page
+     data		What data was added
+     data_length	Number of bytes added first or last
+     changed_length	Number of bytes changed first or last.
+
+   @note
+     Write log entry for page that has got a key added to the page under
+     one and only one of the following scenarios:
+     - Page is shortened from end
+     - Data is added to end of page
+     - Data added at front of page
+
+   @note
+     Sets ma_page->org_size to ma_page->size
+
+   @return  Result of translog_write_record() (0 means ok)
+*/
+
+static my_bool _ma_log_split(MARIA_PAGE *ma_page,
+                             uint org_length, uint new_length,
+                             const uchar *key_pos, uint key_length,
+                             int move_length, enum en_key_op prefix_or_suffix,
+                             const uchar *data, uint data_length,
+                             uint changed_length)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+3+3+3+3+2 +7];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+  uint offset= (uint) (key_pos - ma_page->buff);
+  uint translog_parts, extra_length;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_split");
+  DBUG_PRINT("enter", ("page: %lu  org_length: %u  new_length: %u",
+                       (ulong) page, org_length, new_length));
+
+  DBUG_ASSERT(changed_length >= data_length);
+  DBUG_ASSERT(org_length <= info->s->max_index_block_size);
+  DBUG_ASSERT(new_length == ma_page->size);
+  DBUG_ASSERT(org_length == ma_page->org_size);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  (*log_pos++)= KEY_OP_DEBUG;
+  (*log_pos++)= KEY_OP_DEBUG_LOG_SPLIT;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  if (new_length <= offset || !key_pos)
+  {
+    /*
+      Page was split before inserted key. Write redo entry where
+      we just cut current page at page_length
+    */
+    uint length_offset= org_length - new_length;
+    log_pos[0]= KEY_OP_DEL_SUFFIX;
+    int2store(log_pos+1, length_offset);
+    log_pos+= 3;
+    translog_parts= 1;
+    extra_length= 0;
+    DBUG_ASSERT(data_length == 0);
+  }
+  else
+  {
+    /* Key was added to page which was split after the inserted key */
+    uint max_key_length;
+
+    /*
+      Handle case when split happened directly after the newly inserted key.
+      Only the part of the key that is still on the page is logged.
+    */
+    max_key_length= new_length - offset;
+    extra_length= min(key_length, max_key_length);
+    if (offset + move_length > new_length)
+    {
+      /* This is true when move_length includes changes for next packed key */
+      move_length= new_length - offset;
+    }
+
+    if ((int) new_length < (int) (org_length + move_length + data_length))
+    {
+      /* Shorten page */
+      uint diff= org_length + move_length + data_length - new_length;
+      log_pos[0]= KEY_OP_DEL_SUFFIX;
+      int2store(log_pos + 1, diff);
+      log_pos+= 3;
+      DBUG_ASSERT(data_length == 0);            /* Page is shortened */
+      DBUG_ASSERT(offset <= org_length - diff);
+    }
+    else
+    {
+      DBUG_ASSERT(new_length == org_length + move_length + data_length);
+      DBUG_ASSERT(offset <= org_length);
+    }
+
+    log_pos[0]= KEY_OP_OFFSET;
+    int2store(log_pos+1, offset);
+    log_pos+= 3;
+
+    if (move_length)
+    {
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, move_length);
+      log_pos+= 3;
+    }
+
+    log_pos[0]= KEY_OP_CHANGE;
+    int2store(log_pos+1, extra_length);
+    log_pos+= 3;
+
+    /* Point to original inserted key data */
+    if (prefix_or_suffix == KEY_OP_ADD_PREFIX)
+      key_pos+= data_length;
+
+    translog_parts= 2;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key_pos;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extra_length;
+  }
+
+  if (data_length)
+  {
+    /* Add prefix or suffix */
+    log_pos[0]= prefix_or_suffix;
+    int2store(log_pos+1, data_length);
+    log_pos+= 3;
+    if (prefix_or_suffix == KEY_OP_ADD_PREFIX)
+    {
+      /*
+        An ADD_PREFIX entry is <op><data_length><changed_length>.
+        log_pos already points past <op><data_length>, so changed_length
+        must be stored exactly here; storing it one byte further would
+        leave an unset byte in the header and put the high byte outside
+        the logged header.
+      */
+      int2store(log_pos, changed_length);
+      log_pos+= 2;
+      data_length= changed_length;
+    }
+    log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].str=    data;
+    log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].length= data_length;
+    translog_parts++;
+    extra_length+= data_length;
+  }
+
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  /* Record length is the header plus all data parts added above */
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length + extra_length,
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+/**
+   @brief
+   Write log entry for page that has got a key added to the page
+   and page is shortened from start of page
+
+   @fn _ma_log_del_prefix()
+   @param ma_page	Page that is changed
+   @param org_length	Length of buffer when read
+   @param new_length	Final length
+   @param key_pos	Where on page buffer key was added. This is position
+			before prefix was removed
+   @param key_length    How many bytes was changed at 'key_pos'
+   @param move_length   How many bytes was moved up when key was added
+
+   @note
+   Sets ma_page->org_size to ma_page->size
+
+   @return
+   @retval  0  ok
+   @retval  1  error
+*/
+
+static my_bool _ma_log_del_prefix(MARIA_PAGE *ma_page,
+                                  uint org_length, uint new_length,
+                                  const uchar *key_pos, uint key_length,
+                                  int move_length)
+{
+  LSN lsn;
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 12 + 7];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  uint offset= (uint) (key_pos - ma_page->buff);
+  /* Bytes removed from the start of the page, counting the added key */
+  uint diff_length= org_length + move_length - new_length;
+  uint translog_parts, extra_length;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_del_prefix");
+  DBUG_PRINT("enter", ("page: %lu  org_length: %u  new_length: %u",
+                       (ulong) page, org_length, new_length));
+
+  DBUG_ASSERT((int) diff_length > 0);
+  DBUG_ASSERT(ma_page->org_size == org_length);
+  DBUG_ASSERT(ma_page->size == new_length);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+  /* Defaults for the case where only the header is logged */
+  translog_parts= 1;
+  extra_length= 0;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  *log_pos++= KEY_OP_DEBUG;
+  *log_pos++= KEY_OP_DEBUG_LOG_DEL_PREFIX;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  if (offset < diff_length + info->s->keypage_header)
+  {
+    /*
+      Key is not anymore on page. Move data down, but take into account that
+      the original page had grown with 'move_length bytes'
+    */
+    DBUG_ASSERT(offset + key_length <= diff_length + info->s->keypage_header);
+
+    log_pos[0]= KEY_OP_DEL_PREFIX;
+    int2store(log_pos+1, diff_length - move_length);
+    log_pos+= 3;
+  }
+  else
+  {
+    /*
+      Correct position to key, as data before key has been deleted and key
+      has thus been moved down
+    */
+    offset-= diff_length;
+    key_pos-= diff_length;
+
+    /* Move data down */
+    log_pos[0]= KEY_OP_DEL_PREFIX;
+    int2store(log_pos+1, diff_length);
+    log_pos+= 3;
+
+    log_pos[0]= KEY_OP_OFFSET;
+    int2store(log_pos+1, offset);
+    log_pos+= 3;
+
+    /* A shift entry is only logged when something was moved */
+    if (move_length)
+    {
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, move_length);
+      log_pos+= 3;
+    }
+    log_pos[0]= KEY_OP_CHANGE;
+    int2store(log_pos+1, key_length);
+    log_pos+= 3;
+    /* The changed key bytes are sent as a second log part */
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    key_pos;
+    log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length;
+    translog_parts= 2;
+    extra_length= key_length;
+  }
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    log_array[TRANSLOG_INTERNAL_PARTS +
+                                              0].length + extra_length,
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+/**
+   @brief
+   Write log entry for page that has got data added first and
+   data deleted last. Old changed key may be part of page
+
+   @param ma_page             Changed key page. Its size must already equal
+                              new_length; org_size is updated on return.
+   @param new_length          Length of the page after the change
+   @param data_added_first    Number of bytes added at the start of the page
+                              (logged with KEY_OP_ADD_PREFIX)
+   @param data_changed_first  Number of bytes, starting right after the key
+                              page header, whose content is written to the log
+   @param data_deleted_last   Number of bytes removed at the end of the page
+                              (logged with KEY_OP_DEL_SUFFIX)
+   @param key_pos             Position of the changed key in ma_page->buff,
+                              before the shift caused by data_added_first
+   @param key_length          Length of the changed key
+   @param move_length         If not 0, logged as KEY_OP_SHIFT at the key
+
+   @return Return value of translog_write_record()
+*/
+
+static my_bool _ma_log_key_middle(MARIA_PAGE *ma_page,
+                                  uint new_length,
+                                  uint data_added_first,
+                                  uint data_changed_first,
+                                  uint data_deleted_last,
+                                  const uchar *key_pos,
+                                  uint key_length, int move_length)
+{
+  LSN lsn;
+  /*
+    Room for the fixed-size entries written below: 2 (debug marker),
+    2 (page flag), 3 (DEL_SUFFIX), 5 (ADD_PREFIX), 3+3+3 (OFFSET, SHIFT,
+    CHANGE). The last 7 bytes are presumably reserved for what
+    _ma_log_key_changes() appends at log_pos - TODO confirm.
+  */
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 2 + 2 + 3+5+3+3+3 + 7];
+  uchar *log_pos;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 6];
+  uint key_offset;
+  uint translog_parts, extra_length;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  DBUG_ENTER("_ma_log_key_middle");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+  DBUG_ASSERT(ma_page->size == new_length);
+
+  /* new place of key after changes */
+  key_pos+= data_added_first;
+  key_offset= (uint) (key_pos - ma_page->buff);
+  if (key_offset < new_length)
+  {
+    /* key is on page; Calculate how much of the key is there */
+    uint max_key_length= new_length - key_offset;
+    if (max_key_length < key_length)
+    {
+      /* Key is last on page */
+      key_length= max_key_length;
+      move_length= 0;
+    }
+    /*
+      Take into account that new data was added as part of original key
+      that also needs to be removed from page
+    */
+    data_deleted_last+= move_length;
+  }
+
+  /* First log changes to page */
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+#ifdef EXTRA_DEBUG_KEY_CHANGES
+  *log_pos++= KEY_OP_DEBUG;
+  *log_pos++= KEY_OP_DEBUG_LOG_MIDDLE;
+#endif
+
+  /* Store keypage_flag */
+  *log_pos++= KEY_OP_SET_PAGEFLAG;
+  *log_pos++= ma_page->buff[KEYPAGE_TRANSFLAG_OFFSET];
+
+  /* The suffix removal is logged before the prefix addition */
+  log_pos[0]= KEY_OP_DEL_SUFFIX;
+  int2store(log_pos+1, data_deleted_last);
+  log_pos+= 3;
+
+  log_pos[0]= KEY_OP_ADD_PREFIX;
+  int2store(log_pos+1, data_added_first);
+  int2store(log_pos+3, data_changed_first);
+  log_pos+= 5;
+
+  /* Part 0: the operations stored so far; part 1: new data at page start */
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    (ma_page->buff +
+                                                  info->s->keypage_header);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first;
+  translog_parts= 2;
+  extra_length= data_changed_first;
+
+  /* If changed key is on page, log those changes too */
+
+  if (key_offset < new_length)
+  {
+    /*
+      The key operations continue in log_data after part 0, but are sent
+      as a separate part (2) because part 1 has to come between them.
+    */
+    uchar *start_log_pos= log_pos;
+
+    log_pos[0]= KEY_OP_OFFSET;
+    int2store(log_pos+1, key_offset);
+    log_pos+= 3;
+    if (move_length)
+    {
+      log_pos[0]= KEY_OP_SHIFT;
+      int2store(log_pos+1, move_length);
+      log_pos+= 3;
+    }
+    log_pos[0]= KEY_OP_CHANGE;
+    int2store(log_pos+1, key_length);
+    log_pos+= 3;
+
+    log_array[TRANSLOG_INTERNAL_PARTS + 2].str=    start_log_pos;
+    log_array[TRANSLOG_INTERNAL_PARTS + 2].length= (uint) (log_pos -
+                                                           start_log_pos);
+
+    /* Part 3: the key itself, possibly cut at the end of the page */
+    log_array[TRANSLOG_INTERNAL_PARTS + 3].str=    key_pos;
+    log_array[TRANSLOG_INTERNAL_PARTS + 3].length= key_length;
+    translog_parts+=2;
+    extra_length+= (uint) (log_array[TRANSLOG_INTERNAL_PARTS + 2].length +
+                           key_length);
+  }
+
+  /* May add further parts; extra_length and translog_parts are updated */
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  /* Record length is part 0 plus the sum of all later parts */
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    (log_array[TRANSLOG_INTERNAL_PARTS +
+                                               0].length + extra_length),
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+
+
+#ifdef NOT_NEEDED
+
+/**
+   @brief
+   Write log entry for page that has got data added first and
+   data deleted last
+
+   @note
+   Only compiled if NOT_NEEDED is defined. Kept in sync with
+   _ma_log_key_middle(): same MARIA_PAGE members, same log part type and
+   the same operation (KEY_OP_DEL_SUFFIX) for data removed at the end of
+   the page.
+
+   @param ma_page             Changed key page; org_size is updated on return
+   @param data_added_first    Number of bytes added at the start of the page
+   @param data_changed_first  Number of bytes, starting right after the key
+                              page header, whose content is written to the log
+   @param data_deleted_last   Number of bytes removed at the end of the page
+
+   @return Return value of translog_write_record()
+*/
+
+static my_bool _ma_log_middle(MARIA_PAGE *ma_page,
+                              uint data_added_first, uint data_changed_first,
+                              uint data_deleted_last)
+{
+  LSN lsn;
+  LEX_CUSTRING log_array[TRANSLOG_INTERNAL_PARTS + 4];
+  uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 5 + 7], *log_pos;
+  MARIA_HA *info= ma_page->info;
+  my_off_t page= ma_page->pos / info->s->block_size;
+  uint translog_parts, extra_length;
+  DBUG_ENTER("_ma_log_middle");
+  DBUG_PRINT("enter", ("page: %lu", (ulong) page));
+
+  DBUG_ASSERT(ma_page->org_size + data_added_first - data_deleted_last ==
+              ma_page->size);
+
+  log_pos= log_data + FILEID_STORE_SIZE;
+  page_store(log_pos, page);
+  log_pos+= PAGE_STORE_SIZE;
+
+  /* data_deleted_last is removed from the end of the page: a suffix */
+  log_pos[0]= KEY_OP_DEL_SUFFIX;
+  int2store(log_pos+1, data_deleted_last);
+  log_pos+= 3;
+
+  log_pos[0]= KEY_OP_ADD_PREFIX;
+  int2store(log_pos+1, data_added_first);
+  int2store(log_pos+3, data_changed_first);
+  log_pos+= 5;
+
+  /* Part 0: the operations stored above */
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].str=    log_data;
+  log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos -
+                                                         log_data);
+
+  /* Part 1: the new data at the start of the page, after the page header */
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].str=    (ma_page->buff +
+                                                  info->s->keypage_header);
+  log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first;
+  translog_parts= 2;
+  extra_length= data_changed_first;
+
+  /* May add further parts; extra_length and translog_parts are updated */
+  _ma_log_key_changes(ma_page,
+                      log_array + TRANSLOG_INTERNAL_PARTS + translog_parts,
+                      log_pos, &extra_length, &translog_parts);
+  /* Remember new page length for future log entries for same page */
+  ma_page->org_size= ma_page->size;
+
+  DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX,
+                                    info->trn, info,
+                                    (translog_size_t)
+                                    (log_array[TRANSLOG_INTERNAL_PARTS +
+                                               0].length + extra_length),
+                                    TRANSLOG_INTERNAL_PARTS + translog_parts,
+                                    log_array, log_data, NULL));
+}
+#endif
diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c
new file mode 100644
index 00000000000..4e19d5878ea
--- /dev/null
+++ b/storage/maria/maria_chk.c
@@ -0,0 +1,2008 @@
+/* Copyright (C) 2006-2003 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Describe, check and repair of MARIA tables */
+
+#include "ma_fulltext.h"
+#include <myisamchk.h>
+#include <my_bit.h>
+#include <m_ctype.h>
+#include <stdarg.h>
+#include <my_getopt.h>
+#ifdef HAVE_SYS_VADVICE_H
+#include <sys/vadvise.h>
+#endif
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+SET_STACK_SIZE(9000)			/* Minimum stack size for program */
+
+#ifndef USE_RAID
+#define my_raid_create(A,B,C,D,E,F,G) my_create(A,B,C,G)
+#define my_raid_delete(A,B,C) my_delete(A,B)
+#endif
+
+/* Target of --decode_bits; copied to maria_quick_table_bits in main() */
+static uint decode_bits;
+/* Handed to free_defaults() when main() finishes */
+static char **default_argv;
+/* Option file groups; shown by print_defaults() in usage() */
+static const char *load_default_groups[]= { "aria_chk", 0 };
+/* Targets of --set-collation, --tmpdir and --logdir */
+static const char *set_collation_name, *opt_tmpdir, *opt_log_dir;
+static CHARSET_INFO *set_collation;
+static int stopwords_inited= 0;
+/* Released with free_tmpdir() when main() finishes */
+static MY_TMPDIR maria_chk_tmpdir;
+/*
+  opt_transaction_logging:  target of --transaction-log
+  opt_require_control_file: target of --require-control-file
+*/
+static my_bool opt_transaction_logging, opt_debug, opt_require_control_file;
+/* Target of --warning-for-wrong-transaction-id (option default is 1) */
+static my_bool opt_warning_for_wrong_transid;
+
+/*
+  Tables of display strings. Presumably indexed by the corresponding
+  type / flag values when describing a table - their users are not part
+  of this section of the file.
+*/
+static const char *type_names[]=
+{
+  "impossible","char","binary", "short", "long", "float",
+  "double","number","unsigned short",
+  "unsigned long","longlong","ulonglong","int24",
+  "uint24","int8","varchar", "varbin", "varchar2", "varbin2", "bit",
+  "?","?"
+};
+
+/*
+  NOTE(review): prefix_packed_txt holds "packed " and bin_packed_txt holds
+  "prefix "; the names and the texts look swapped - confirm against the
+  code that prints them before changing anything.
+*/
+static const char *prefix_packed_txt="packed ",
+		  *bin_packed_txt="prefix ",
+		  *diff_txt="stripped ",
+		  *null_txt="NULL",
+		  *blob_txt="BLOB ";
+
+static const char *field_pack[]=
+{
+  "","no endspace", "no prespace",
+ "no zeros", "blob", "constant", "table-lockup",
+ "always zero","varchar","unique-hash","?","?"
+};
+
+static const char *record_formats[]=
+{
+  "Fixed length", "Packed", "Compressed", "Block", "?"
+};
+
+static const char *bitmap_description[]=
+{
+  "Empty page", "Part filled head page","Part filled head page",
+  "Part filled head page", "Full head page",
+  "Part filled tail page","Part filled tail page",
+  "Full tail or blob page"
+};
+
+/* Target of --stats_method; "nulls_unequal" until changed by the option */
+static const char *maria_stats_method_str="nulls_unequal";
+/* printf-style formats: an int followed by a table name */
+static char default_open_errmsg[]=  "%d when opening Aria table '%s'";
+static char default_close_errmsg[]= "%d when closing Aria table '%s'";
+
+/* Functions defined later in this file */
+static void get_options(int *argc,char * * *argv);
+static void print_version(void);
+static void usage(void);
+static int maria_chk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name);
+static int maria_sort_records(HA_CHECK *param, register MARIA_HA *info,
+                              char *name, uint sort_key,
+                              my_bool write_info, my_bool update_index);
+static int sort_record_index(MARIA_SORT_PARAM *sort_param, MARIA_PAGE *page,
+			     uint sortkey, File new_file,
+                             my_bool update_index);
+static my_bool write_log_record(HA_CHECK *param);
+
+/*
+  Parameters shared by all checks; set up by maria_chk_init() in main()
+  and referenced directly by the option table.
+*/
+HA_CHECK check_param;
+
+	/* Main program */
+
+/*
+  Program entry point.
+
+  Parses the options, opens the control file, optionally starts
+  transaction logging and then runs maria_chk() on every remaining
+  command line argument.
+
+  The process ends through exit(error), where error is 1 if the
+  initialization failed and otherwise the OR of the per-table results.
+*/
+
+int main(int argc, char **argv)
+{
+  int error;
+  MY_INIT(argv[0]);
+
+  /* Defaults that get_options() may override (--datadir, --logdir) */
+  opt_log_dir= maria_data_root= (char *)".";
+  maria_chk_init(&check_param);
+  check_param.opt_lock_memory= 1;		/* Lock memory if possible */
+  check_param.using_global_keycache = 0;
+  get_options(&argc,(char***) &argv);
+  maria_quick_table_bits=decode_bits;
+  error=0;
+  maria_init();
+
+  maria_block_size= 0;                 /* Use block size from control file */
+  /*
+    A non-zero result from ma_control_file_open() is fatal only if the
+    control file is required or a repair is to be written to the log.
+  */
+  if (ma_control_file_open(FALSE, opt_require_control_file ||
+                           !(check_param.testflag & T_SILENT)) &&
+      (opt_require_control_file ||
+       (opt_transaction_logging && (check_param.testflag & T_REP_ANY))))
+  {
+    error= 1;
+    goto end;
+  }
+
+  /*
+    If we are doing a repair, user may want to store this repair into the log
+    so that the log has a complete history and can be used to replay.
+  */
+  if (opt_transaction_logging && (check_param.testflag & T_REP_ANY))
+  {
+    if (init_pagecache(maria_log_pagecache,
+                       TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                       TRANSLOG_PAGE_SIZE, MY_WME) == 0 ||
+        translog_init(opt_log_dir, TRANSLOG_FILE_SIZE,
+                      0, 0, maria_log_pagecache,
+                      TRANSLOG_DEFAULT_FLAGS, 0))
+    {
+      _ma_check_print_error(&check_param,
+                            "Can't initialize transaction logging. Run "
+                            "recovery with switch --skip-transaction-log");
+      error= 1;
+      goto end;
+    }
+  }
+
+  /* What is left in argv after get_options() is treated as table names */
+  while (--argc >= 0)
+  {
+    int new_error=maria_chk(&check_param, *(argv++));
+    if ((check_param.testflag & T_REP_ANY) != T_REP)
+      check_param.testflag&= ~T_REP;
+    VOID(fflush(stdout));
+    VOID(fflush(stderr));
+    /*
+      With T_FORCE_CREATE set: if something was reported for the table and
+      no repair or sort was requested, run maria_chk() once more on the
+      same table with a repair flag set.
+    */
+    if ((check_param.error_printed | check_param.warning_printed) &&
+	(check_param.testflag & T_FORCE_CREATE) &&
+	(!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS |
+				   T_SORT_INDEX))))
+    {
+      /* The flag changes are for this rerun only and are restored below */
+      ulonglong old_testflag=check_param.testflag;
+      if (!(check_param.testflag & T_REP))
+	check_param.testflag|= T_REP_BY_SORT;
+      check_param.testflag&= ~T_EXTEND;			/* Not needed  */
+      /* Only the result of the rerun is added to error in this case */
+      error|=maria_chk(&check_param, argv[-1]);
+      check_param.testflag= old_testflag;
+      VOID(fflush(stdout));
+      VOID(fflush(stderr));
+    }
+    else
+      error|=new_error;
+    /* Separator line between tables, not printed after the last one */
+    if (argc && (!(check_param.testflag & T_SILENT) ||
+                 check_param.testflag & T_INFO))
+    {
+      puts("\n---------\n");
+      VOID(fflush(stdout));
+    }
+  }
+end:
+  if (check_param.total_files > 1)
+  {					/* Only if descript */
+    char buff[22],buff2[22];
+    if (!(check_param.testflag & T_SILENT) || check_param.testflag & T_INFO)
+      puts("\n---------");
+    printf("\nTotal of all %d Aria-files:\nData records: %9s   Deleted blocks: %9s\n",check_param.total_files,llstr(check_param.total_records,buff),
+	   llstr(check_param.total_deleted,buff2));
+  }
+  /* Cleanup is shared by the normal path and the 'goto end' error paths */
+  free_defaults(default_argv);
+  free_tmpdir(&maria_chk_tmpdir);
+  maria_end();
+  my_end(check_param.testflag & T_INFO ?
+         MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error);
+#ifndef _lint
+  return 0;				/* No compiler warning */
+#endif
+} /* main */
+
+/*
+  Ids for options that are not identified by a character in
+  my_long_options. The first id is 256 and the following ones are
+  numbered in sequence, so the order of the enumerators defines their
+  values.
+*/
+enum options_mc
+{
+  OPT_CHARSETS_DIR= 256,
+  OPT_SET_COLLATION,
+  OPT_START_CHECK_POS,
+  OPT_CORRECT_CHECKSUM,
+  OPT_PAGE_BUFFER_SIZE,
+  OPT_KEY_CACHE_BLOCK_SIZE,
+  OPT_MARIA_BLOCK_SIZE,
+  OPT_READ_BUFFER_SIZE,
+  OPT_WRITE_BUFFER_SIZE,
+  OPT_SORT_BUFFER_SIZE,
+  OPT_SORT_KEY_BLOCKS,
+  OPT_DECODE_BITS,
+  OPT_FT_MIN_WORD_LEN,
+  OPT_FT_MAX_WORD_LEN,
+  OPT_FT_STOPWORD_FILE,
+  OPT_MAX_RECORD_LENGTH,
+  OPT_AUTO_CLOSE,
+  OPT_STATS_METHOD,
+  OPT_TRANSACTION_LOG,
+  OPT_SKIP_SAFEMALLOC,
+  OPT_ZEROFILL_KEEP_LSN,
+  OPT_REQUIRE_CONTROL_FILE,
+  OPT_LOG_DIR,
+  OPT_DATADIR,
+  OPT_WARNING_FOR_WRONG_TRANSID
+};
+
+/*
+  Command line options of aria_chk.
+
+  The second member of each entry is the option id: either the short
+  option character or a value from enum options_mc. Entries without a
+  value pointer (0 as fourth member) only set flags and are expected to
+  be acted upon in get_one_option().
+  The array is ended by the entry with a 0 name.
+*/
+static struct my_option my_long_options[] =
+{
+  {"analyze", 'a',
+   "Analyze distribution of keys. Will make some joins in MySQL faster. You can check the calculated distribution.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifdef __NETWARE__
+  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"block-search", 'b',
+   "No help available.",
+   0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"backup", 'B',
+   "Make a backup of the .MAD file as 'filename-time.BAK'.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR,
+   "Directory where character sets are.",
+   (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"check", 'c',
+   "Check table for errors.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"check-only-changed", 'C',
+   "Check only tables that have changed since last check. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"correct-checksum", OPT_CORRECT_CHECKSUM,
+   "Correct checksum information for table.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#',
+   "Output debug log. Often this is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"description", 'd',
+   "Prints some information about table.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"data-file-length", 'D',
+   "Max length of data file (when recreating data-file when it's full).",
+   &check_param.max_data_file_length,
+   &check_param.max_data_file_length,
+   0, GET_LL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"extend-check", 'e',
+   "If used when checking a table, ensure that the table is 100 percent consistent, which will take a long time. If used when repairing a table, try to recover every possible row from the data file. Normally this will also find a lot of garbage rows; Don't use this option with repair if you are not totally desperate.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"fast", 'F',
+   "Check only tables that haven't been closed properly. It also applies to other requested actions (e.g. --analyze will be ignored if the table is already analyzed).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"force", 'f',
+   "Restart with -r if there are any errors in the table. States will be updated as with --update-state.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"HELP", 'H',
+   "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?',
+   "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"information", 'i',
+   "Print statistics information about table that is checked.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"keys-used", 'k',
+   "Tell Aria to update only some specific keys. # is a bit mask of which keys to use. This can be used to get faster inserts.",
+   &check_param.keys_in_use,
+   &check_param.keys_in_use,
+   0, GET_ULL, REQUIRED_ARG, -1, 0, 0, 0, 0, 0},
+  {"datadir", OPT_DATADIR,
+   "Path for control file (and logs if --logdir not used).",
+   &maria_data_root, 0, 0, GET_STR, REQUIRED_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"logdir", OPT_LOG_DIR,
+   "Path for log files.",
+   (char**) &opt_log_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"max-record-length", OPT_MAX_RECORD_LENGTH,
+   "Skip rows bigger than this if aria_chk can't allocate memory to hold it",
+   &check_param.max_record_length,
+   &check_param.max_record_length,
+   0, GET_ULL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
+  {"medium-check", 'm',
+   "Faster than extend-check, but only finds 99.99% of all errors. Should be good enough for most cases.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"quick", 'q', "Faster repair by not modifying the data file.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"read-only", 'T',
+   "Don't mark table as checked.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"recover", 'r',
+   "Can fix almost anything except unique keys that aren't unique.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"parallel-recover", 'p',
+   "Same as '-r' but creates all the keys in parallel.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"safe-recover", 'o',
+   "Uses old recovery method; Slower than '-r' but can handle a couple of cases where '-r' reports that it can't fix the data file.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"sort-recover", 'n',
+   "Force recovering with sorting even if the temporary file was very big.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "require-control-file", OPT_REQUIRE_CONTROL_FILE,
+    "Abort if cannot find control file",
+    (uchar**)&opt_require_control_file, 0, 0, GET_BOOL, NO_ARG,
+    0, 0, 0, 0, 0, 0},
+#ifdef DEBUG
+  {"start-check-pos", OPT_START_CHECK_POS,
+   "No help available.",
+   0, 0, 0, GET_ULL, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"set-auto-increment", 'A',
+   "Force auto_increment to start at this or higher value. If no value is given, then sets the next auto_increment value to the highest used value for the auto key + 1.",
+   &check_param.auto_increment_value,
+   &check_param.auto_increment_value,
+   0, GET_ULL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"set-collation", OPT_SET_COLLATION,
+   "Change the collation used by the index",
+   (char**) &set_collation_name, 0, 0, GET_STR, REQUIRED_ARG,
+   0, 0, 0, 0, 0, 0},
+  {"silent", 's',
+   "Only print errors. One can use two -s to make aria_chk very silent.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+#ifdef SAFEMALLOC
+  {"skip-safemalloc", OPT_SKIP_SAFEMALLOC,
+   "Don't use the memory allocation checking.", 0, 0, 0, GET_NO_ARG, NO_ARG,
+   0, 0, 0, 0, 0, 0},
+#endif
+#endif
+  {"sort-index", 'S',
+   "Sort index blocks. This speeds up 'read-next' in applications.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"sort-records", 'R',
+   "Sort records according to an index. This makes your data much more localized and may speed up things. (It may be VERY slow to do a sort the first time!)",
+   &check_param.opt_sort_key,
+   &check_param.opt_sort_key,
+   0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 't', "Path for temporary files.", (char**) &opt_tmpdir,
+   0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"transaction-log", OPT_TRANSACTION_LOG,
+   "Log repair command to transaction log",
+   &opt_transaction_logging, &opt_transaction_logging,
+   0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"update-state", 'U',
+   "Mark tables as crashed if any errors were found and clean if check didn't "
+   "find any errors. This allows one to get rid of warnings like 'table not "
+   "properly closed'",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"unpack", 'u',
+   "Unpack file packed with aria_pack.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v',
+   "Print more information. This can be used with --description and --check. Use many -v for more verbosity!",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"wait", 'w', "Wait if table is locked.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  /* The two string parts are concatenated; keep the space after "bigger" */
+  {"warning-for-wrong-transaction-id", OPT_WARNING_FOR_WRONG_TRANSID,
+   "Give a warning if we find a transaction id in the table that is bigger "
+   "than what exists in the control file. Use --skip-... to disable warning",
+   &opt_warning_for_wrong_transid, &opt_warning_for_wrong_transid,
+   0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+  { "page_buffer_size", OPT_PAGE_BUFFER_SIZE,
+    "Size of page buffer. Used by --safe-recover",
+    &check_param.use_buffers, &check_param.use_buffers, 0,
+    GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT, 1024L*1024L,
+    (long) ~0L, (long) MALLOC_OVERHEAD, (long) IO_SIZE, 0},
+  { "read_buffer_size", OPT_READ_BUFFER_SIZE,
+    "Read buffer size for sequential reads during scanning",
+    &check_param.read_buffer_length,
+    &check_param.read_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+    (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+    (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+  { "write_buffer_size", OPT_WRITE_BUFFER_SIZE,
+    "Write buffer size for sequential writes during repair of fixed size or dynamic size rows",
+    &check_param.write_buffer_length,
+    &check_param.write_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+    (long) READ_BUFFER_INIT, (long) MALLOC_OVERHEAD,
+    (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+  { "sort_buffer_size", OPT_SORT_BUFFER_SIZE,
+    "Size of sort buffer. Used by --recover",
+    &check_param.sort_buffer_length,
+    &check_param.sort_buffer_length, 0, GET_ULONG, REQUIRED_ARG,
+    (long) SORT_BUFFER_INIT, (long) (MIN_SORT_BUFFER + MALLOC_OVERHEAD),
+    (long) ~0L, (long) MALLOC_OVERHEAD, (long) 1L, 0},
+  { "sort_key_blocks", OPT_SORT_KEY_BLOCKS,
+    "Internal buffer for sorting keys; Don't touch :)",
+    &check_param.sort_key_blocks,
+    &check_param.sort_key_blocks, 0, GET_ULONG, REQUIRED_ARG,
+    BUFFERS_WHEN_SORTING, 4L, 100L, 0L, 1L, 0},
+  { "decode_bits", OPT_DECODE_BITS, "", &decode_bits,
+    &decode_bits, 0, GET_UINT, REQUIRED_ARG, 9L, 4L, 17L, 0L, 1L, 0},
+  { "ft_min_word_len", OPT_FT_MIN_WORD_LEN, "", &ft_min_word_len,
+    &ft_min_word_len, 0, GET_ULONG, REQUIRED_ARG, 4, 1, HA_FT_MAXCHARLEN,
+    0, 1, 0},
+  { "ft_max_word_len", OPT_FT_MAX_WORD_LEN, "", &ft_max_word_len,
+    &ft_max_word_len, 0, GET_ULONG, REQUIRED_ARG, HA_FT_MAXCHARLEN, 10,
+    HA_FT_MAXCHARLEN, 0, 1, 0},
+  { "aria_ft_stopword_file", OPT_FT_STOPWORD_FILE,
+    "Use stopwords from this file instead of built-in list.",
+    (char**) &ft_stopword_file, (char**) &ft_stopword_file, 0, GET_STR,
+    REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  { "stats_method", OPT_STATS_METHOD,
+    "Specifies how index statistics collection code should treat NULLs. "
+    "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), "
+    "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".",
+    (char**) &maria_stats_method_str, (char**) &maria_stats_method_str, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  { "zerofill", 'z',
+    "Fill empty space in data and index files with zeroes,",
+    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "zerofill-keep-lsn", OPT_ZEROFILL_KEEP_LSN,
+    "Like --zerofill but does not zero out LSN of data/index pages;"
+    " used only for testing and debugging",
+    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+#include <help_start.h>
+
+/*
+  Print the program name, the version number and the system and machine
+  type this binary was built for.
+*/
+static void print_version(void)
+{
+  const char *system_type= SYSTEM_TYPE;
+  const char *machine_type= MACHINE_TYPE;
+
+  printf("%s  Ver 1.0 for %s at %s\n", my_progname, system_type,
+         machine_type);
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+/*
+  Print the version line, a description of all options grouped by
+  action, and finally the option file defaults and the current values
+  of the variables in my_long_options.
+
+  The option texts are maintained here by hand; keep them in sync with
+  the help strings in my_long_options.
+*/
+static void usage(void)
+{
+  print_version();
+  puts("By Monty, for your professional use");
+  puts("This software comes with NO WARRANTY: see the PUBLIC for details.\n");
+  puts("Description, check and repair of Aria tables.");
+  puts("Used without options all tables on the command will be checked for errors");
+  printf("Usage: %s [OPTIONS] tables[.MAI]\n", my_progname_short);
+  printf("\nGlobal options:\n");
+#ifndef DBUG_OFF
+  printf("\
+  -#, --debug=...     Output debug log. Often this is 'd:t:o,filename'.\n");
+#endif
+  printf("\
+  -H, --HELP          Display this help and exit.\n\
+  -?, --help          Display this help and exit.\n\
+  --datadir=path      Path for control file (and logs if --logdir not used)\n\
+  --logdir=path       Path for log files\n\
+  --require-control-file  Abort if we can't find/read the maria_log_control\n\
+                          file\n\
+  -s, --silent	      Only print errors.  One can use two -s to make\n\
+		      aria_chk very silent.\n\
+  -t, --tmpdir=path   Path for temporary files. Multiple paths can be\n\
+                      specified, separated by ");
+  /* The path separator depends on the platform */
+#if defined( __WIN__) || defined(__NETWARE__)
+   printf("semicolon (;)");
+#else
+   printf("colon (:)");
+#endif
+   printf(", they will be used\n\
+                      in a round-robin fashion.\n\
+  -v, --verbose       Print more information. This can be used with\n\
+                      --description and --check. Use many -v for more verbosity.\n\
+  -V, --version       Print version and exit.\n\
+  -w, --wait          Wait if table is locked.\n\n");
+#ifdef DEBUG
+  puts("  --start-check-pos=# Start reading file at given offset.\n");
+#endif
+
+  puts("Check options (check is the default action for aria_chk):\n\
+  -c, --check	      Check table for errors.\n\
+  -e, --extend-check  Check the table VERY thoroughly.  Only use this in\n\
+                      extreme cases as aria_chk should normally be able to\n\
+                      find out if the table is ok even without this switch.\n\
+  -F, --fast	      Check only tables that haven't been closed properly.\n\
+  -C, --check-only-changed\n\
+		      Check only tables that have changed since last check.\n\
+  -f, --force         Restart with '-r' if there are any errors in the table.\n\
+		      States will be updated as with '--update-state'.\n\
+  -i, --information   Print statistics information about table that is checked.\n\
+  -m, --medium-check  Faster than extend-check, but only finds 99.99% of\n\
+		      all errors.  Should be good enough for most cases.\n\
+  -U, --update-state  Mark tables as crashed if you find any errors.\n\
+  -T, --read-only     Don't mark table as checked.\n");
+
+  puts("\
+Recover (repair)/ options (When using '--recover' or '--safe-recover'):\n\
+  -B, --backup	      Make a backup of the .MAD file as 'filename-time.BAK'.\n\
+  --correct-checksum  Correct checksum information for table.\n\
+  -D, --data-file-length=#  Max length of data file (when recreating data\n\
+                      file when it's full).\n\
+  -e, --extend-check  Try to recover every possible row from the data file\n\
+		      Normally this will also find a lot of garbage rows;\n\
+		      Don't use this option if you are not totally desperate.\n\
+  -f, --force         Overwrite old temporary files.\n\
+  -k, --keys-used=#   Tell Aria to update only some specific keys. # is a\n\
+	              bit mask of which keys to use. This can be used to\n\
+		      get faster inserts.\n\
+  --max-record-length=#\n\
+                      Skip rows bigger than this if aria_chk can't allocate\n\
+		      memory to hold it.\n\
+  -r, --recover       Can fix almost anything except unique keys that aren't\n\
+                      unique.\n\
+  -n, --sort-recover  Forces recovering with sorting even if the temporary\n\
+		      file would be very big.\n\
+  -p, --parallel-recover\n\
+                      Uses the same technique as '-r' and '-n', but creates\n\
+                      all the keys in parallel, in different threads.");
+  puts("\
+  -o, --safe-recover  Uses old recovery method; Slower than '-r' but can\n\
+		      handle a couple of cases where '-r' reports that it\n\
+		      can't fix the data file.\n\
+  --transaction-log   Log repair command to transaction log. This is needed\n\
+                      if one wants to use the aria_read_log to repeat the \n\
+                      repair\n\
+  --character-sets-dir=...\n\
+                      Directory where character sets are.\n\
+  --set-collation=name\n\
+ 		      Change the collation used by the index.\n\
+  -q, --quick         Faster repair by not modifying the data file.\n\
+                      One can give a second '-q' to force aria_chk to\n\
+		      modify the original datafile in case of duplicate keys.\n\
+		      NOTE: Tables where the data file is corrupted can't be\n\
+		      fixed with this option.\n\
+  -u, --unpack        Unpack file packed with aria_pack.\n\
+");
+
+  puts("Other actions:\n\
+  -a, --analyze	      Analyze distribution of keys. Will make some joins in\n\
+		      MariaDB faster.  You can check the calculated distribution\n\
+		      by using '--description --verbose table_name'.\n\
+  --stats_method=name Specifies how index statistics collection code should\n\
+                      treat NULLs. Possible values of name are \"nulls_unequal\"\n\
+                      (default for 4.1/5.0), \"nulls_equal\" (emulate 4.0), and \n\
+                      \"nulls_ignored\".\n\
+  -d, --description   Prints some information about table.\n\
+  -A, --set-auto-increment[=value]\n\
+		      Force auto_increment to start at this or higher value\n\
+		      If no value is given, then sets the next auto_increment\n\
+		      value to the highest used value for the auto key + 1.\n\
+  -S, --sort-index    Sort index blocks.  This speeds up 'read-next' in\n\
+		      applications.\n\
+  -R, --sort-records=#\n\
+		      Sort records according to an index.  This makes your\n\
+		      data much more localized and may speed up things\n\
+		      (It may be VERY slow to do a sort the first time!).\n\
+  -b,  --block-search=#\n\
+                      Find a record, a block at given offset belongs to.\n\
+  -z,  --zerofill     Fill empty space in data and index files with zeroes\n\
+  --zerofill-keep-lsn Like --zerofill but does not zero out LSN of\n\
+                      data/index pages.");
+
+  puts("Variables:\n\
+--page_buffer_size=#   Size of page buffer. Used by --safe-recover\n\
+--read_buffer_size=#   Read buffer size for sequential reads during scanning\n\
+--sort_buffer_size=#   Size of sort buffer. Used by --recover\n\
+--sort_key_blocks=#    Internal buffer for sorting keys; Don't touch :)\n\
+--write_buffer_size=#  Write buffer size for sequential writes during repair");
+
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+/*
+  Names accepted for --stats_method (see the help text of that option).
+  The list is ended by NullS, which is not counted in the typelib below.
+*/
+const char *maria_stats_method_names[] = {"nulls_unequal", "nulls_equal",
+                                           "nulls_ignored", NullS};
+/* Typelib over the names above; the count excludes the ending NullS */
+TYPELIB maria_stats_method_typelib= {
+  array_elements(maria_stats_method_names) - 1, "",
+  maria_stats_method_names, NULL};
+
+	 /* Read options */
+
+/*
+  Callback given to handle_options(); called for every option found.
+
+  SYNOPSIS
+    get_one_option()
+    optid      Option identifier: a short option character or an OPT_* value
+    opt        Option description (unused)
+    argument   Option value. Compared against disabled_my_option to detect
+               a disabled option; may be NULL for options handled with an
+               optional value (see 'A' and '#')
+
+  NOTES
+    Most options only set or clear bits in check_param.testflag.
+    The process is terminated with exit() for -V, -H, -? and when
+    --sort-records or --stats_method get an invalid value.
+
+  RETURN
+    0   Always
+*/
+static my_bool
+get_one_option(int optid,
+	       const struct my_option *opt __attribute__((unused)),
+	       char *argument)
+{
+  switch (optid) {
+#ifdef __NETWARE__
+  case OPT_AUTO_CLOSE:
+    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+    break;
+#endif
+  case 'a':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_STATISTICS;
+    else
+      check_param.testflag|= T_STATISTICS;
+    break;
+  case 'A':
+    if (argument)
+      check_param.auto_increment_value= strtoull(argument, NULL, 0);
+    else
+      check_param.auto_increment_value= 0;	/* Set to max used value */
+    check_param.testflag|= T_AUTO_INC;
+    break;
+  case 'b':
+    check_param.search_after_block= strtoul(argument, NULL, 10);
+    break;
+  case 'B':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_BACKUP_DATA;
+    else
+      check_param.testflag|= T_BACKUP_DATA;
+    break;
+  case 'c':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_CHECK;
+    else
+      check_param.testflag|= T_CHECK;
+    break;
+  case 'C':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~(T_CHECK | T_CHECK_ONLY_CHANGED);
+    else
+      check_param.testflag|= T_CHECK | T_CHECK_ONLY_CHANGED;
+    break;
+  case 'D':
+    check_param.max_data_file_length=strtoll(argument, NULL, 10);
+    break;
+  case 's':				/* silent */
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~(T_SILENT | T_VERY_SILENT);
+    else
+    {
+      /* A second -s upgrades silent to very silent */
+      if (check_param.testflag & T_SILENT)
+	check_param.testflag|= T_VERY_SILENT;
+      check_param.testflag|= T_SILENT;
+      check_param.testflag&= ~T_WRITE_LOOP;
+    }
+    break;
+  case 'w':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_WAIT_FOREVER;
+    else
+      check_param.testflag|= T_WAIT_FOREVER;
+    break;
+  case 'd':				/* description if isam-file */
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_DESCRIPT;
+    else
+      check_param.testflag|= T_DESCRIPT;
+    break;
+  case 'e':				/* extend check */
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_EXTEND;
+    else
+      check_param.testflag|= T_EXTEND;
+    break;
+  case 'i':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_INFO;
+    else
+      check_param.testflag|= T_INFO;
+    break;
+  case 'f':
+    if (argument == disabled_my_option)
+    {
+      check_param.tmpfile_createflag= O_RDWR | O_TRUNC | O_EXCL;
+      check_param.testflag&= ~(T_FORCE_CREATE | T_UPDATE_STATE);
+    }
+    else
+    {
+      check_param.tmpfile_createflag= O_RDWR | O_TRUNC;
+      check_param.testflag|= T_FORCE_CREATE | T_UPDATE_STATE;
+    }
+    break;
+  case 'F':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_FAST;
+    else
+      check_param.testflag|= T_FAST;
+    break;
+  case 'k':
+    check_param.keys_in_use= (ulonglong) strtoll(argument, NULL, 10);
+    break;
+  case 'm':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_MEDIUM;
+    else
+      check_param.testflag|= T_MEDIUM;		/* Medium check */
+    break;
+  /* The repair options below first clear any previously chosen repair mode */
+  case 'r':				/* Repair table */
+    check_param.testflag&= ~T_REP_ANY;
+    if (argument != disabled_my_option)
+      check_param.testflag|= T_REP_BY_SORT;
+    break;
+  case 'p':
+    check_param.testflag&= ~T_REP_ANY;
+    if (argument != disabled_my_option)
+      check_param.testflag|= T_REP_PARALLEL;
+    break;
+  case 'o':
+    check_param.testflag&= ~T_REP_ANY;
+    check_param.force_sort= 0;
+    if (argument != disabled_my_option)
+    {
+      check_param.testflag|= T_REP;
+      my_disable_async_io= 1;		/* More safety */
+    }
+    break;
+  case 'n':
+    check_param.testflag&= ~T_REP_ANY;
+    if (argument == disabled_my_option)
+      check_param.force_sort= 0;
+    else
+    {
+      check_param.testflag|= T_REP_BY_SORT;
+      check_param.force_sort= 1;
+    }
+    break;
+  case 'q':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~(T_QUICK | T_FORCE_UNIQUENESS);
+    else
+      /* First -q sets T_QUICK, a repeated -q adds T_FORCE_UNIQUENESS */
+      check_param.testflag|=
+        (check_param.testflag & T_QUICK) ? T_FORCE_UNIQUENESS : T_QUICK;
+    break;
+  case 'u':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_UNPACK;
+    else
+    {
+      check_param.testflag|= T_UNPACK;
+      /* Unpacking needs a repair; default to repair by sort */
+      if (!(check_param.testflag & T_REP_ANY))
+        check_param.testflag|= T_REP_BY_SORT;
+    }
+    break;
+  case 'v':				/* Verbose */
+    if (argument == disabled_my_option)
+    {
+      check_param.testflag&= ~T_VERBOSE;
+      check_param.verbose=0;
+    }
+    else
+    {
+      check_param.testflag|= T_VERBOSE;
+      check_param.verbose++;
+    }
+    break;
+  case 'R':				/* Sort records */
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_SORT_RECORDS;
+    else
+    {
+      check_param.testflag|= T_SORT_RECORDS;
+      /*
+        The user gives a 1-based key number. A value of 0 wraps around in
+        the unsigned subtraction and is rejected by the range test below.
+      */
+      check_param.opt_sort_key= (uint) atoi(argument) - 1;
+      if (check_param.opt_sort_key >= MARIA_MAX_KEY)
+      {
+	fprintf(stderr,
+		"The value of the sort key is bigger than max key: %d.\n",
+		MARIA_MAX_KEY);
+	exit(1);
+      }
+    }
+    break;
+  case 'S':			      /* Sort index */
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_SORT_INDEX;
+    else
+      check_param.testflag|= T_SORT_INDEX;
+    break;
+  case 'T':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_READONLY;
+    else
+      check_param.testflag|= T_READONLY;
+    break;
+  case 'U':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_UPDATE_STATE;
+    else
+      check_param.testflag|= T_UPDATE_STATE;
+    break;
+  case '#':
+    DBUG_SET_INITIAL(argument ? argument : "d:t:o,/tmp/aria_chk.trace");
+    opt_debug= 1;
+    break;
+  case OPT_SKIP_SAFEMALLOC:
+#ifdef SAFEMALLOC
+    sf_malloc_quick=1;
+#endif
+    break;
+  case 'V':
+    print_version();
+    exit(0);
+  case OPT_CORRECT_CHECKSUM:
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_CALC_CHECKSUM;
+    else
+      check_param.testflag|= T_CALC_CHECKSUM;
+    break;
+  case OPT_STATS_METHOD:
+  {
+    int method;
+    enum_handler_stats_method method_conv;
+    LINT_INIT(method_conv);
+    maria_stats_method_str= argument;
+    if ((method=find_type(argument, &maria_stats_method_typelib, 2)) <= 0)
+    {
+      fprintf(stderr, "Invalid value of stats_method: %s.\n", argument);
+      exit(1);
+    }
+    /*
+      method-1 is the position of the matched name in
+      maria_stats_method_names:
+      0 = "nulls_unequal", 1 = "nulls_equal", 2 = "nulls_ignored"
+    */
+    switch (method-1) {
+    case 0:
+      method_conv= MI_STATS_METHOD_NULLS_NOT_EQUAL;
+      break;
+    case 1:
+      method_conv= MI_STATS_METHOD_NULLS_EQUAL;
+      break;
+    case 2:
+      method_conv= MI_STATS_METHOD_IGNORE_NULLS;
+      break;
+    default: assert(0);                         /* Impossible */
+    }
+    check_param.stats_method= method_conv;
+    break;
+  }
+#ifdef DEBUG					/* Only useful if debugging */
+  case OPT_START_CHECK_POS:
+    check_param.start_check_pos= strtoull(argument, NULL, 0);
+    break;
+#endif
+  case 'z':
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~T_ZEROFILL;
+    else
+      check_param.testflag|= T_ZEROFILL;
+    break;
+  case OPT_ZEROFILL_KEEP_LSN:
+    if (argument == disabled_my_option)
+      check_param.testflag&= ~(T_ZEROFILL_KEEP_LSN | T_ZEROFILL);
+    else
+      check_param.testflag|= (T_ZEROFILL_KEEP_LSN | T_ZEROFILL);
+    break;
+  case 'H':
+    my_print_help(my_long_options);
+    exit(0);
+  case '?':
+    usage();
+    exit(0);
+  }
+  return 0;
+}
+
+
+/*
+  Read options from the defaults files and the command line, then verify
+  that the resulting combination of flags can be used.
+
+  SYNOPSIS
+    get_options()
+    argc   Pointer to argument count; passed on to load_defaults() and
+           handle_options()
+    argv   Pointer to argument vector; passed on likewise
+
+  NOTES
+    Terminates the process with exit() if option parsing fails, if no
+    arguments are left after parsing, if conflicting flags were given, or
+    if the tmpdir list or the requested collation can't be set up.
+*/
+static void get_options(register int *argc,register char ***argv)
+{
+  int parse_status;
+
+  load_defaults("my", load_default_groups, argc, argv);
+  default_argv= *argv;
+
+  /* T_WRITE_LOOP is only turned on when stdout is a terminal */
+  if (isatty(fileno(stdout)))
+    check_param.testflag|= T_WRITE_LOOP;
+
+  parse_status= handle_options(argc, argv, my_long_options, get_one_option);
+  if (parse_status)
+    exit(parse_status);
+
+  /* A repair combined with --update-state also updates the checksum */
+  if ((check_param.testflag & T_REP_ANY) &&
+      (check_param.testflag & T_UPDATE_STATE))
+    check_param.testflag|= T_CALC_CHECKSUM;
+
+  /* Nothing left on the command line means no table was named */
+  if (!*argc)
+  {
+    usage();
+    exit(-1);
+  }
+
+  if (check_param.testflag & T_UNPACK)
+  {
+    if (check_param.testflag & (T_QUICK | T_SORT_RECORDS))
+    {
+      VOID(fprintf(stderr,
+                   "%s: --unpack can't be used with --quick or --sort-records\n",
+                   my_progname_short));
+      exit(1);
+    }
+  }
+
+  if (check_param.testflag & T_READONLY)
+  {
+    /* Every flag listed here implies that the table would be modified */
+    if (check_param.testflag & (T_REP_ANY | T_STATISTICS | T_AUTO_INC |
+                                T_SORT_RECORDS | T_SORT_INDEX |
+                                T_FORCE_CREATE))
+    {
+      VOID(fprintf(stderr,
+                   "%s: Can't use --readonly when repairing or sorting\n",
+                   my_progname_short));
+      exit(1);
+    }
+  }
+
+  if (!opt_debug)
+  {
+    DEBUGGER_OFF;                               /* Speed up things a bit */
+  }
+
+  if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
+    exit(1);
+  check_param.tmpdir= &maria_chk_tmpdir;
+
+  /* Resolve the collation name, if one was given */
+  if (set_collation_name &&
+      !(set_collation= get_charset_by_name(set_collation_name, MYF(MY_WME))))
+    exit(1);
+} /* get options */
+
+
+	/* Check table */
+
+/*
+  Check, repair, sort, zerofill or describe one Aria table, depending on
+  the flags in param->testflag.
+
+  SYNOPSIS
+    maria_chk()
+    param      Check parameters; also receives counters and status flags
+    filename   Name of the table to open (also used in messages)
+
+  RETURN
+    0      ok
+    1      Table could not be opened or closed
+    -1     maria_recreate_table() failed
+    other  Error status collected from the check/repair steps that were run
+*/
+static int maria_chk(HA_CHECK *param, char *filename)
+{
+  int error,lock_type,recreate;
+  my_bool rep_quick= test(param->testflag & (T_QUICK | T_FORCE_UNIQUENESS));
+  MARIA_HA *info;
+  File datafile;
+  char llbuff[22],llbuff2[22];
+  my_bool state_updated=0;
+  my_bool born_transactional;           /* Copy that survives maria_close() */
+  MARIA_SHARE *share;
+  DBUG_ENTER("maria_chk");
+
+  param->out_flag=error=param->warning_printed=param->error_printed=
+    recreate=0;
+  datafile=0;
+  param->isam_file_name=filename;		/* For error messages */
+  if (!(info=maria_open(filename,
+                        (param->testflag & (T_DESCRIPT | T_READONLY)) ?
+                        O_RDONLY : O_RDWR,
+                        HA_OPEN_FOR_REPAIR |
+                        ((param->testflag & T_WAIT_FOREVER) ?
+                         HA_OPEN_WAIT_IF_LOCKED :
+                         (param->testflag & T_DESCRIPT) ?
+                         HA_OPEN_IGNORE_IF_LOCKED : HA_OPEN_ABORT_IF_LOCKED))))
+  {
+    /* Avoid twice printing of isam file name */
+    param->error_printed=1;
+    switch (my_errno) {
+    case HA_ERR_CRASHED:
+      _ma_check_print_error(param,"'%s' doesn't have a correct index definition. You need to recreate it before you can do a repair",filename);
+      break;
+    case HA_ERR_NOT_A_TABLE:
+      _ma_check_print_error(param,"'%s' is not a Aria table",filename);
+      break;
+    case HA_ERR_CRASHED_ON_USAGE:
+      _ma_check_print_error(param,"'%s' is marked as crashed",filename);
+      break;
+    case HA_ERR_CRASHED_ON_REPAIR:
+      _ma_check_print_error(param,"'%s' is marked as crashed after last repair",filename);
+      break;
+    case HA_ERR_OLD_FILE:
+      _ma_check_print_error(param,"'%s' is a old type of Aria table", filename);
+      break;
+    case HA_ERR_NEW_FILE:
+      _ma_check_print_error(param,"'%s' uses new features not supported by this version of the Aria library", filename);
+      break;
+    case HA_ERR_END_OF_FILE:
+      _ma_check_print_error(param,"Couldn't read complete header from '%s'", filename);
+      break;
+    case EAGAIN:
+      _ma_check_print_error(param,"'%s' is locked. Use -w to wait until unlocked",filename);
+      break;
+    case ENOENT:
+      _ma_check_print_error(param,"File '%s' doesn't exist",filename);
+      break;
+    case EACCES:
+      _ma_check_print_error(param,"You don't have permission to use '%s'",
+                            filename);
+      break;
+    default:
+      _ma_check_print_error(param,"%d when opening Aria table '%s'",
+		  my_errno,filename);
+      break;
+    }
+    DBUG_RETURN(1);
+  }
+  share= info->s;
+  share->tot_locks-= share->r_locks;
+  share->r_locks=0;
+  maria_block_size= share->base.block_size;
+
+  if (share->data_file_type == BLOCK_RECORD ||
+      ((param->testflag & T_UNPACK) &&
+       share->state.header.org_data_file_type == BLOCK_RECORD))
+  {
+    if (param->testflag & T_SORT_RECORDS)
+    {
+      _ma_check_print_error(param,
+                            "Record format used by '%s' is is not yet supported with sort-records",
+                            filename);
+      param->error_printed= 0;
+      error= 1;
+      goto end2;
+    }
+    /* We can't do parallel repair with BLOCK_RECORD yet */
+    if (param->testflag & T_REP_PARALLEL)
+    {
+      param->testflag&= ~T_REP_PARALLEL;
+      param->testflag|= T_REP_BY_SORT;
+    }
+  }
+
+  /*
+    Skip the checking of the file if:
+    We are using --fast and the table is closed properly
+    We are using --check-only-changed-tables and the table hasn't changed
+  */
+  if (param->testflag & (T_FAST | T_CHECK_ONLY_CHANGED))
+  {
+    my_bool need_to_check= (maria_is_crashed(info) ||
+                            share->state.open_count != 0);
+
+    if ((param->testflag & (T_REP_ANY | T_SORT_RECORDS)) &&
+	((share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+				  STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR) ||
+	  !(param->testflag & T_CHECK_ONLY_CHANGED))))
+      need_to_check=1;
+
+    if (info->s->base.keys && info->state->records)
+    {
+      if ((param->testflag & T_STATISTICS) &&
+          (share->state.changed & STATE_NOT_ANALYZED))
+        need_to_check=1;
+      if ((param->testflag & T_SORT_INDEX) &&
+          (share->state.changed & STATE_NOT_SORTED_PAGES))
+        need_to_check=1;
+      if ((param->testflag & T_REP_BY_SORT) &&
+          (share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+        need_to_check=1;
+    }
+    if ((param->testflag & T_CHECK_ONLY_CHANGED) &&
+	(share->state.changed & (STATE_CHANGED | STATE_CRASHED |
+				 STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR)))
+      need_to_check=1;
+    if (!need_to_check)
+    {
+      if (!(param->testflag & T_SILENT) || param->testflag & T_INFO)
+	printf("Aria file: %s is already checked\n",filename);
+      if (maria_close(info))
+      {
+	_ma_check_print_error(param,"%d when closing Aria table '%s'",
+			     my_errno,filename);
+	DBUG_RETURN(1);
+      }
+      DBUG_RETURN(0);
+    }
+  }
+  /*
+    Recreate the table first if a modifying operation was requested and the
+    on-disk layout, key set, fill level, file version or collation requires
+    it.
+  */
+  if ((param->testflag & (T_REP_ANY | T_STATISTICS |
+			  T_SORT_RECORDS | T_SORT_INDEX)) &&
+      (((param->testflag & T_UNPACK) &&
+	share->data_file_type == COMPRESSED_RECORD) ||
+       mi_uint2korr(share->state.header.state_info_length) !=
+       MARIA_STATE_INFO_SIZE ||
+       mi_uint2korr(share->state.header.base_info_length) !=
+       MARIA_BASE_INFO_SIZE ||
+       maria_is_any_intersect_keys_active(param->keys_in_use, share->base.keys,
+                                       ~share->state.key_map) ||
+       maria_test_if_almost_full(info) ||
+       info->s->state.header.file_version[3] != maria_file_magic[3] ||
+       (set_collation &&
+        set_collation->number != share->state.header.language)))
+  {
+    if (set_collation)
+      param->language= set_collation->number;
+    if (maria_recreate_table(param, &info,filename))
+    {
+      VOID(fprintf(stderr,
+		   "Aria table '%s' is not fixed because of errors\n",
+	      filename));
+      /* Leave through DBUG_RETURN to match DBUG_ENTER above */
+      DBUG_RETURN(-1);
+    }
+    recreate=1;
+    if (!(param->testflag & T_REP_ANY))
+    {
+      param->testflag|=T_REP_BY_SORT;		/* if only STATISTICS */
+      if (!(param->testflag & T_SILENT))
+	printf("- '%s' has old table-format. Recreating index\n",filename);
+      rep_quick= 1;
+    }
+    /* info was replaced by maria_recreate_table(); refresh the share */
+    share= info->s;
+    share->tot_locks-= share->r_locks;
+    share->r_locks=0;
+  }
+
+  if (param->testflag & T_DESCRIPT)
+  {
+    param->total_files++;
+    param->total_records+=info->state->records;
+    param->total_deleted+=info->state->del;
+    descript(param, info, filename);
+    maria_close(info);                          /* Should always succeed */
+    /* Leave through DBUG_RETURN to match DBUG_ENTER above */
+    DBUG_RETURN(0);
+  }
+
+  if (!stopwords_inited++)
+    ft_init_stopwords();
+
+  if (!(param->testflag & T_READONLY))
+    lock_type = F_WRLCK;			/* table is changed */
+  else
+    lock_type= F_RDLCK;
+  if (info->lock_type == F_RDLCK)
+    info->lock_type=F_UNLCK;			/* Read only table */
+  if (_ma_readinfo(info,lock_type,0))
+  {
+    _ma_check_print_error(param,"Can't lock indexfile of '%s', error: %d",
+                          filename,my_errno);
+    param->error_printed=0;
+    error= 1;
+    goto end2;
+  }
+  /*
+    _ma_readinfo() has locked the table.
+    We mark the table as locked (without doing file locks) to be able to
+    use functions that only works on locked tables (like row caching).
+  */
+  maria_lock_database(info, F_EXTRA_LCK);
+  datafile= info->dfile.file;
+  if (init_pagecache(maria_pagecache, (size_t) param->use_buffers, 0, 0,
+                     maria_block_size, MY_WME) == 0)
+  {
+    _ma_check_print_error(param, "Can't initialize page cache with %lu memory",
+                          (ulong) param->use_buffers);
+    error= 1;
+    goto end2;
+  }
+
+  if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+                         T_ZEROFILL))
+  {
+    /*
+      Mark table as not transactional to avoid logging. Should not be needed,
+      maria_repair and maria_zerofill do it already.
+    */
+    _ma_tmp_disable_logging_for_table(info, FALSE);
+
+    if (param->testflag & T_REP_ANY)
+    {
+      ulonglong tmp=share->state.key_map;
+      maria_copy_keys_active(share->state.key_map, share->base.keys,
+                             param->keys_in_use);
+      if (tmp != share->state.key_map)
+        info->update|=HA_STATE_CHANGED;
+
+      if (rep_quick &&
+          maria_chk_del(param, info, param->testflag & ~T_VERBOSE))
+      {
+        if (param->testflag & T_FORCE_CREATE)
+        {
+          rep_quick=0;
+          _ma_check_print_info(param,"Creating new data file\n");
+        }
+        else
+        {
+          error=1;
+          _ma_check_print_error(param,
+                                "Quick-recover aborted; Run recovery without switch 'q'");
+        }
+      }
+    }
+    if (!error)
+    {
+      /*
+        Unless this was only --zerofill-keep-lsn, old REDOs are not
+        applicable, tell the server's Recovery to ignore them; we don't
+        know what the log's end LSN is now, so we just let the server know
+        that it will have to find and store it.
+        This is the only case where create_rename_lsn can be a horizon and not
+        a LSN.
+        If this was only --zerofill-keep-lsn, the table can be used in
+        Recovery and especially in this scenario: do a dirty-copy-based backup
+        (snapshot-like), --zerofill-keep-lsn on the copies to achieve better
+        compression, compress the copies with an external tool, and after a
+        restore, Recovery still works (because pages and state still have
+        their correct LSNs).
+      */
+      if (share->base.born_transactional &&
+          ((param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+                               T_ZEROFILL | T_ZEROFILL_KEEP_LSN)) !=
+           (T_ZEROFILL | T_ZEROFILL_KEEP_LSN)))
+        share->state.create_rename_lsn= share->state.is_of_horizon=
+          share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+    }
+    if (!error && (param->testflag & T_REP_ANY))
+    {
+      if ((param->testflag & (T_REP_BY_SORT | T_REP_PARALLEL)) &&
+          (maria_is_any_key_active(share->state.key_map) ||
+           (rep_quick && !param->keys_in_use && !recreate)) &&
+          maria_test_if_sort_rep(info, info->state->records,
+                                 info->s->state.key_map,
+                                 param->force_sort))
+      {
+        if (param->testflag & T_REP_BY_SORT)
+          error=maria_repair_by_sort(param,info,filename,rep_quick);
+        else
+          error=maria_repair_parallel(param,info,filename,rep_quick);
+        state_updated=1;
+      }
+      else
+        error=maria_repair(param, info,filename,rep_quick);
+    }
+    if (!error && (param->testflag & T_SORT_RECORDS))
+    {
+      /*
+        The data file is nowadays reopened in the repair code so we should
+        soon remove the following reopen-code
+      */
+#ifndef TO_BE_REMOVED
+      if (param->out_flag & O_NEW_DATA)
+      {			/* Change temp file to org file */
+        VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close new file */
+        error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+                                       MYF(0));
+        if (_ma_open_datafile(info,info->s, NullS, -1))
+          error=1;
+        param->out_flag&= ~O_NEW_DATA; /* We are using new datafile */
+        param->read_cache.file= info->dfile.file;
+      }
+#endif
+      if (! error)
+      {
+        uint key;
+        /*
+          We can't update the index in maria_sort_records if we have a
+          prefix compressed or fulltext index
+        */
+        my_bool update_index=1;
+        for (key=0 ; key < share->base.keys; key++)
+          if (share->keyinfo[key].flag & (HA_BINARY_PACK_KEY|HA_FULLTEXT))
+            update_index=0;
+
+        error=maria_sort_records(param,info,filename,param->opt_sort_key,
+                                 /* what is the following parameter for ? */
+                                 (my_bool) !(param->testflag & T_REP),
+                                 update_index);
+        datafile= info->dfile.file;	/* This is now locked */
+        if (!error && !update_index)
+        {
+          if (param->verbose)
+            puts("Table had a compressed index;  We must now recreate the index");
+          error=maria_repair_by_sort(param,info,filename,1);
+        }
+      }
+    }
+    if (!error && (param->testflag & T_SORT_INDEX))
+      error= maria_sort_index(param,info,filename);
+    if (!error && (param->testflag & T_ZEROFILL))
+      error= maria_zerofill(param, info, filename);
+    if (!error)
+    {
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+    }
+    else
+      maria_mark_crashed(info);
+  }
+  else if ((param->testflag & T_CHECK) || !(param->testflag & T_AUTO_INC))
+  {
+    if (!(param->testflag & T_VERY_SILENT) || param->testflag & T_INFO)
+      printf("Checking Aria file: %s\n",filename);
+    if (!(param->testflag & T_SILENT))
+      printf("Data records: %7s   Deleted blocks: %7s\n",
+             llstr(info->state->records,llbuff),
+             llstr(info->state->del,llbuff2));
+    maria_chk_init_for_check(param, info);
+    if (opt_warning_for_wrong_transid == 0)
+      param->max_trid= ~ (ulonglong) 0;
+    error= maria_chk_status(param,info);
+    maria_intersect_keys_active(share->state.key_map, param->keys_in_use);
+    error|= maria_chk_size(param,info);
+    if (!error || !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+      error|=maria_chk_del(param, info,param->testflag);
+    if ((!error || (!(param->testflag & (T_FAST | T_FORCE_CREATE)) &&
+                    !param->start_check_pos)))
+    {
+      error|=maria_chk_key(param, info);
+      if (!error && (param->testflag & (T_STATISTICS | T_AUTO_INC)))
+        error=maria_update_state_info(param, info,
+                                      ((param->testflag & T_STATISTICS) ?
+                                       UPDATE_STAT : 0) |
+                                      ((param->testflag & T_AUTO_INC) ?
+                                       UPDATE_AUTO_INC : 0));
+    }
+    if ((!rep_quick && !error) ||
+        !(param->testflag & (T_FAST | T_FORCE_CREATE)))
+    {
+      VOID(init_io_cache(&param->read_cache,datafile,
+                         (uint) param->read_buffer_length,
+                         READ_CACHE,
+                         (param->start_check_pos ?
+                          param->start_check_pos :
+                          share->pack.header_length),
+                         1,
+                         MYF(MY_WME)));
+      maria_lock_memory(param);
+      if ((info->s->data_file_type != STATIC_RECORD) ||
+          (param->testflag & (T_EXTEND | T_MEDIUM)))
+        error|=maria_chk_data_link(param, info,
+                                   test(param->testflag & T_EXTEND));
+      VOID(end_io_cache(&param->read_cache));
+    }
+    if (!error)
+    {
+      if (((share->state.changed &
+            (STATE_CHANGED | STATE_CRASHED | STATE_CRASHED_ON_REPAIR |
+             STATE_IN_REPAIR)) ||
+           share->state.open_count != 0)
+          && (param->testflag & T_UPDATE_STATE))
+        info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+      DBUG_PRINT("info", ("Reseting crashed state"));
+      share->state.changed&= ~(STATE_CHANGED | STATE_CRASHED |
+                               STATE_CRASHED_ON_REPAIR | STATE_IN_REPAIR);
+    }
+    else if (!maria_is_crashed(info) &&
+             (param->testflag & T_UPDATE_STATE))
+    {						/* Mark crashed */
+      maria_mark_crashed(info);
+      info->update|=HA_STATE_CHANGED | HA_STATE_ROW_CHANGED;
+    }
+  }
+
+  if ((param->testflag & T_AUTO_INC) ||
+      ((param->testflag & T_REP_ANY) && info->s->base.auto_key))
+    _ma_update_auto_increment_key(param, info,
+                                  (my_bool) !test(param->testflag & T_AUTO_INC));
+
+  if (info->update & HA_STATE_CHANGED && ! (param->testflag & T_READONLY))
+    error|=maria_update_state_info(param, info,
+                                   UPDATE_OPEN_COUNT |
+                                   (((param->testflag & T_REP_ANY) ?
+                                     UPDATE_TIME : 0) |
+                                    (state_updated ? UPDATE_STAT : 0) |
+                                    ((param->testflag & T_SORT_RECORDS) ?
+                                     UPDATE_SORT : 0)));
+  info->update&= ~HA_STATE_CHANGED;
+  _ma_reenable_logging_for_table(info, FALSE);
+  maria_lock_database(info, F_UNLCK);
+
+end2:
+  /*
+    maria_close() may release the share, so copy what is needed from it
+    before the table is closed.
+  */
+  born_transactional= share->base.born_transactional;
+  end_pagecache(maria_pagecache, 1);
+  if (maria_close(info))
+  {
+    _ma_check_print_error(param, default_close_errmsg, my_errno, filename);
+    DBUG_RETURN(1);
+  }
+  if (error == 0)
+  {
+    /* Put the newly written data/index files in place of the old ones */
+    if (param->out_flag & O_NEW_DATA)
+      error|=maria_change_to_newfile(filename,MARIA_NAME_DEXT,DATA_TMP_EXT,
+                                     ((param->testflag & T_BACKUP_DATA) ?
+                                      MYF(MY_REDEL_MAKE_BACKUP) : MYF(0)));
+    if (param->out_flag & O_NEW_INDEX)
+      error|=maria_change_to_newfile(filename,MARIA_NAME_IEXT,INDEX_TMP_EXT,
+                                     MYF(0));
+  }
+  if (opt_transaction_logging &&
+      born_transactional && !error &&
+      (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+                          T_ZEROFILL)))
+    error= write_log_record(param);
+
+  if (param->not_visible_rows_found && (param->testflag & T_VERBOSE))
+  {
+    char buff[22];
+    printf("Max transaction id found: %s\n",
+           llstr(param->max_found_trid, buff));
+  }
+
+  VOID(fflush(stdout)); VOID(fflush(stderr));
+
+  if (param->error_printed)
+  {
+    if (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX))
+    {
+      VOID(fprintf(stderr,
+		   "Aria table '%s' is not fixed because of errors\n",
+		   filename));
+      if (param->testflag & T_REP_ANY)
+	VOID(fprintf(stderr,
+		     "Try fixing it by using the --safe-recover (-o), the --force (-f) option or by not using the --quick (-q) flag\n"));
+    }
+    else if (!(param->error_printed & 2) &&
+	     !(param->testflag & T_FORCE_CREATE))
+      VOID(fprintf(stderr,
+      "Aria table '%s' is corrupted\nFix it using switch \"-r\" or \"-o\"\n",
+	      filename));
+  }
+  else if (param->warning_printed &&
+	   ! (param->testflag & (T_REP_ANY | T_SORT_RECORDS | T_SORT_INDEX |
+			  T_FORCE_CREATE)))
+    VOID(fprintf(stderr, "Aria table '%s' is usable but should be fixed\n",
+		 filename));
+  VOID(fflush(stderr));
+  DBUG_RETURN(error);
+} /* maria_chk */
+
+
+/* Write info about table */
+
+static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
+{
+  uint key,keyseg_nr,field;
+  reg3 MARIA_KEYDEF *keyinfo;
+  reg2 HA_KEYSEG *keyseg;
+  reg4 const char *text;
+  char buff[200],length[10],*pos,*end;
+  enum en_fieldtype type;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  DBUG_ENTER("descript");
+
+  if (param->testflag & T_VERY_SILENT)
+  {
+    longlong checksum= info->state->checksum;
+    if (!(share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD)))
+      checksum= 0;
+    printf("%s %s %s\n", name, llstr(info->state->records,llbuff),
+           llstr(checksum, llbuff2));
+    DBUG_VOID_RETURN;
+  }
+
+  printf("Aria file:          %s\n",name);
+  printf("Record format:       %s\n", record_formats[share->data_file_type]);
+  printf("Crashsafe:           %s\n",
+         share->base.born_transactional ? "yes" : "no");
+  printf("Character set:       %s (%d)\n",
+	 get_charset_name(share->state.header.language),
+	 share->state.header.language);
+
+  if (param->testflag & T_VERBOSE)
+  {
+    printf("File-version:        %d\n",
+	   (int) share->state.header.file_version[3]);
+    if (share->state.create_time)
+    {
+      get_date(buff,1,share->state.create_time);
+      printf("Creation time:       %s\n",buff);
+    }
+    if (share->state.check_time)
+    {
+      get_date(buff,1,share->state.check_time);
+      printf("Recover time:        %s\n",buff);
+    }
+    if (share->base.born_transactional)
+    {
+      printf("LSNs:                create_rename (%lu,0x%lx),"
+             " state_horizon (%lu,0x%lx), skip_redo (%lu,0x%lx)\n",
+             LSN_IN_PARTS(share->state.create_rename_lsn),
+             LSN_IN_PARTS(share->state.is_of_horizon),
+             LSN_IN_PARTS(share->state.skip_redo_lsn));
+    }
+    compile_time_assert((MY_UUID_STRING_LENGTH + 1) <= sizeof(buff));
+    buff[MY_UUID_STRING_LENGTH]= 0;
+    my_uuid2str(share->base.uuid, buff);
+    printf("UUID:                %s\n", buff);
+    pos=buff;
+    if (share->state.changed & STATE_CRASHED)
+      strmov(buff,"crashed");
+    else
+    {
+      if (share->state.open_count)
+	pos=strmov(pos,"open,");
+      if (share->state.changed & STATE_CHANGED)
+	pos=strmov(pos,"changed,");
+      else
+	pos=strmov(pos,"checked,");
+      if (!(share->state.changed & STATE_NOT_ANALYZED))
+	pos=strmov(pos,"analyzed,");
+      if (!(share->state.changed & STATE_NOT_OPTIMIZED_KEYS))
+	pos=strmov(pos,"optimized keys,");
+      if (!(share->state.changed & STATE_NOT_SORTED_PAGES))
+	pos=strmov(pos,"sorted index pages,");
+      if (!(share->state.changed & STATE_NOT_ZEROFILLED))
+	pos=strmov(pos,"zerofilled,");
+      if (!(share->state.changed & STATE_NOT_MOVABLE))
+	pos=strmov(pos,"movable,");
+      pos[-1]=0;				/* Remove extra ',' */
+    }
+    printf("Status:              %s\n",buff);
+    if (share->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
+      printf("Checksum:  %26s\n",llstr(info->state->checksum,llbuff));
+;
+    if (share->options & HA_OPTION_DELAY_KEY_WRITE)
+      printf("Keys are only flushed at close\n");
+
+    if (share->options & HA_OPTION_PAGE_CHECKSUM)
+      printf("Page checksums are used\n");
+    if (share->base.auto_key)
+    {
+      printf("Auto increment key:  %16d  Last value:         %18s\n",
+	     share->base.auto_key,
+	     llstr(share->state.auto_increment,llbuff));
+    }
+  }
+  printf("Data records:        %16s  Deleted blocks:     %18s\n",
+	 llstr(info->state->records,llbuff),llstr(info->state->del,llbuff2));
+  if (param->testflag & T_SILENT)
+    DBUG_VOID_RETURN;				/* This is enough */
+
+  if (param->testflag & T_VERBOSE)
+  {
+#ifdef USE_RELOC
+    printf("Init-relocation:     %16s\n",llstr(share->base.reloc,llbuff));
+#endif
+    printf("Datafile parts:      %16s  Deleted data:       %18s\n",
+	   llstr(share->state.split,llbuff),
+	   llstr(info->state->empty,llbuff2));
+    printf("Datafile pointer (bytes): %11d  Keyfile pointer (bytes): %13d\n",
+	   share->rec_reflength,share->base.key_reflength);
+    printf("Datafile length:     %16s  Keyfile length:     %18s\n",
+	   llstr(info->state->data_file_length,llbuff),
+	   llstr(info->state->key_file_length,llbuff2));
+
+    if (info->s->base.reloc == 1L && info->s->base.records == 1L)
+      puts("This is a one-record table");
+    else
+    {
+      if (share->base.max_data_file_length != HA_OFFSET_ERROR ||
+	  share->base.max_key_file_length != HA_OFFSET_ERROR)
+	printf("Max datafile length: %16s  Max keyfile length: %18s\n",
+	       llstr(share->base.max_data_file_length-1,llbuff),
+	       llstr(share->base.max_key_file_length-1,llbuff2));
+    }
+  }
+  printf("Block_size:          %16d\n",(int) share->block_size);
+  printf("Recordlength:        %16d\n",(int) share->base.pack_reclength);
+  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+  {
+    longlong2str(share->state.key_map,buff,2,1);
+    printf("Using only keys '%s' of %d possibly keys\n",
+	   buff, share->base.keys);
+  }
+  puts("\nTable description:");
+  printf("Key Start Len Index   Type");
+  if (param->testflag & T_VERBOSE)
+    printf("                     Rec/key         Root  Blocksize");
+  VOID(putchar('\n'));
+
+  for (key=keyseg_nr=0, keyinfo= &share->keyinfo[0] ;
+       key < share->base.keys;
+       key++,keyinfo++)
+  {
+    keyseg=keyinfo->seg;
+    if (keyinfo->flag & HA_NOSAME) text="unique ";
+    else if (keyinfo->flag & HA_FULLTEXT) text="fulltext ";
+    else text="multip.";
+
+    pos=buff;
+    if (keyseg->flag & HA_REVERSE_SORT)
+      *pos++ = '-';
+    pos=strmov(pos,type_names[keyseg->type]);
+    *pos++ = ' ';
+    *pos=0;
+    if (keyinfo->flag & HA_PACK_KEY)
+      pos=strmov(pos,prefix_packed_txt);
+    if (keyinfo->flag & HA_BINARY_PACK_KEY)
+      pos=strmov(pos,bin_packed_txt);
+    if (keyseg->flag & HA_SPACE_PACK)
+      pos=strmov(pos,diff_txt);
+    if (keyseg->flag & HA_BLOB_PART)
+      pos=strmov(pos,blob_txt);
+    if (keyseg->flag & HA_NULL_PART)
+      pos=strmov(pos,null_txt);
+    *pos=0;
+
+    printf("%-4d%-6ld%-3d %-8s%-23s",
+	   key+1,(long) keyseg->start+1,keyseg->length,text,buff);
+    if (share->state.key_root[key] != HA_OFFSET_ERROR)
+      llstr(share->state.key_root[key],buff);
+    else
+      buff[0]=0;
+    if (param->testflag & T_VERBOSE)
+      printf("%9.0f %12s %10d",
+	     share->state.rec_per_key_part[keyseg_nr++],
+	     buff,keyinfo->block_length);
+    VOID(putchar('\n'));
+    while ((++keyseg)->type != HA_KEYTYPE_END)
+    {
+      pos=buff;
+      if (keyseg->flag & HA_REVERSE_SORT)
+	*pos++ = '-';
+      pos=strmov(pos,type_names[keyseg->type]);
+      *pos++= ' ';
+      if (keyseg->flag & HA_SPACE_PACK)
+	pos=strmov(pos,diff_txt);
+      if (keyseg->flag & HA_BLOB_PART)
+	pos=strmov(pos,blob_txt);
+      if (keyseg->flag & HA_NULL_PART)
+	pos=strmov(pos,null_txt);
+      *pos=0;
+      printf("    %-6ld%-3d         %-21s",
+	     (long) keyseg->start+1,keyseg->length,buff);
+      if (param->testflag & T_VERBOSE)
+	printf("%11.0f", share->state.rec_per_key_part[keyseg_nr++]);
+      VOID(putchar('\n'));
+    }
+    keyseg++;
+  }
+  if (share->state.header.uniques)
+  {
+    MARIA_UNIQUEDEF *uniqueinfo;
+    puts("\nUnique  Key  Start  Len  Nullpos  Nullbit  Type");
+    for (key=0,uniqueinfo= &share->uniqueinfo[0] ;
+	 key < share->state.header.uniques; key++, uniqueinfo++)
+    {
+      my_bool new_row=0;
+      char null_bit[8],null_pos[8];
+      printf("%-8d%-5d",key+1,uniqueinfo->key+1);
+      for (keyseg=uniqueinfo->seg ; keyseg->type != HA_KEYTYPE_END ; keyseg++)
+      {
+	if (new_row)
+	  fputs("             ",stdout);
+	null_bit[0]=null_pos[0]=0;
+	if (keyseg->null_bit)
+	{
+	  sprintf(null_bit,"%d",keyseg->null_bit);
+	  sprintf(null_pos,"%ld",(long) keyseg->null_pos+1);
+	}
+	printf("%-7ld%-5d%-9s%-10s%-30s\n",
+	       (long) keyseg->start+1,keyseg->length,
+	       null_pos,null_bit,
+	       type_names[keyseg->type]);
+	new_row=1;
+      }
+    }
+  }
+  if (param->verbose > 1)
+  {
+    char null_bit[8],null_pos[8];
+    printf("\nField Start Length Nullpos Nullbit Type");
+    if (share->options & HA_OPTION_COMPRESS_RECORD)
+      printf("                         Huff tree  Bits");
+    VOID(putchar('\n'));
+
+    for (field=0 ; field < share->base.fields ; field++)
+    {
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+	type=share->columndef[field].base_type;
+      else
+	type=(enum en_fieldtype) share->columndef[field].type;
+      end=strmov(buff,field_pack[type]);
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+      {
+	if (share->columndef[field].pack_type & PACK_TYPE_SELECTED)
+	  end=strmov(end,", not_always");
+	if (share->columndef[field].pack_type & PACK_TYPE_SPACE_FIELDS)
+	  end=strmov(end,", no empty");
+	if (share->columndef[field].pack_type & PACK_TYPE_ZERO_FILL)
+	{
+	  sprintf(end,", zerofill(%d)",share->columndef[field].space_length_bits);
+	  end=strend(end);
+	}
+      }
+      if (buff[0] == ',')
+	strmov(buff,buff+2);
+      int10_to_str((long) share->columndef[field].length,length,10);
+      null_bit[0]=null_pos[0]=0;
+      if (share->columndef[field].null_bit)
+      {
+	sprintf(null_bit,"%d",share->columndef[field].null_bit);
+	sprintf(null_pos,"%d",share->columndef[field].null_pos+1);
+      }
+      printf("%-6d%-6u%-7s%-8s%-8s%-35s",field+1,
+             (uint) share->columndef[field].offset+1,
+             length, null_pos, null_bit, buff);
+      if (share->options & HA_OPTION_COMPRESS_RECORD)
+      {
+	if (share->columndef[field].huff_tree)
+	  printf("%3d    %2d",
+		 (uint) (share->columndef[field].huff_tree-share->decode_trees)+1,
+		 share->columndef[field].huff_tree->quick_table_bits);
+      }
+      VOID(putchar('\n'));
+    }
+    if (share->data_file_type == BLOCK_RECORD)
+    {
+      uint i;
+      puts("\nBitmap  Data size  Description");
+      for (i=0 ; i <= 7 ; i++)
+        printf("%u           %5u  %s\n", i, share->bitmap.sizes[i],
+               bitmap_description[i]);
+    }
+  }
+  DBUG_VOID_RETURN;
+} /* describe */
+
+
+	/*
+	  Sort records according to one key
+
+	  Writes all rows to a new temporary data file in the order in which
+	  sort_record_index() finds them when walking index 'sort_key'. On
+	  success the old data file is closed, info->dfile.file is switched
+	  to the new file and O_NEW_DATA is set in param->out_flag.
+
+	  The sort is refused with a warning (and 0 is returned) if the key
+	  is not active, is a FULLTEXT key, has HA_BINARY_PACK_KEY set, or if
+	  the table uses COMPRESSED_RECORD. 0 is also returned when the key
+	  has no root page.
+
+	  param         Check parameters; also receives warnings and errors
+	  info          Open table handler
+	  name          Table name; used in messages and to build the name
+	                of the temporary file
+	  sort_key      Number of the key to sort by (printed as sort_key+1)
+	  write_info    If set (and T_SILENT is not), print record counts
+	  update_index  Passed on to sort_record_index()
+
+	  Returns 0 on success or when there was nothing to do, 1 on error.
+	*/
+
+static int maria_sort_records(HA_CHECK *param,
+			   register MARIA_HA *info, char *name,
+			   uint sort_key,
+			   my_bool write_info,
+			   my_bool update_index)
+{
+  int got_error;
+  uint key;
+  MARIA_KEYDEF *keyinfo;
+  File new_file;
+  uchar *temp_buff;
+  ha_rows old_record_count;
+  MARIA_SHARE *share= info->s;
+  char llbuff[22],llbuff2[22];
+  MARIA_SORT_INFO sort_info;
+  MARIA_SORT_PARAM sort_param;
+  MARIA_PAGE page;
+  DBUG_ENTER("sort_records");
+
+  bzero((char*)&sort_info,sizeof(sort_info));
+  bzero((char*)&sort_param,sizeof(sort_param));
+  sort_param.sort_info=&sort_info;
+  sort_info.param=param;
+  keyinfo= &share->keyinfo[sort_key];
+  got_error=1;                                  /* Cleared only at the end */
+  temp_buff=0;
+  new_file= -1;                                 /* No temp file created yet */
+
+  if (! maria_is_key_active(share->state.key_map, sort_key))
+  {
+    _ma_check_print_warning(param,
+			   "Can't sort table '%s' on key %d;  No such key",
+		name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0);				/* Nothing to do */
+  }
+  if (keyinfo->flag & HA_FULLTEXT)
+  {
+    _ma_check_print_warning(param,"Can't sort table '%s' on FULLTEXT key %d",
+			   name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0);				/* Nothing to do */
+  }
+  if (keyinfo->flag & HA_BINARY_PACK_KEY)
+  {
+    _ma_check_print_warning(param,
+                            "Can't sort table '%s' on a key with prefix "
+                            "packing %d",
+                            name,sort_key+1);
+    param->error_printed=0;
+    DBUG_RETURN(0);                             /* Nothing to do */
+  }
+
+
+  if (share->data_file_type == COMPRESSED_RECORD)
+  {
+    _ma_check_print_warning(param,"Can't sort read-only table '%s'", name);
+    param->error_printed=0;
+    DBUG_RETURN(0);				/* Nothing to do */
+  }
+  if (!(param->testflag & T_SILENT))
+  {
+    printf("- Sorting records for Aria table '%s'\n",name);
+    if (write_info)
+      printf("Data records: %9s   Deleted: %9s\n",
+	     llstr(info->state->records,llbuff),
+	     llstr(info->state->del,llbuff2));
+  }
+  if (share->state.key_root[sort_key] == HA_OFFSET_ERROR)
+    DBUG_RETURN(0);				/* Nothing to do */
+
+  if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
+		   WRITE_CACHE,share->pack.header_length,1,
+		   MYF(MY_WME | MY_WAIT_IF_FULL)))
+    goto err;
+  info->opt_flag|=WRITE_CACHE_USED;
+
+  if (!(temp_buff=(uchar*) my_alloca((uint) keyinfo->block_length)))
+  {
+    _ma_check_print_error(param,"Not enough memory for key block");
+    goto err;
+  }
+
+  if (!(sort_param.record=
+        (uchar*) my_malloc((uint) share->base.default_rec_buff_size, MYF(0))))
+  {
+    _ma_check_print_error(param,"Not enough memory for record");
+    goto err;
+  }
+
+  fn_format(param->temp_filename,name,"", MARIA_NAME_DEXT,2+4+32);
+  new_file= my_create(fn_format(param->temp_filename,
+                                param->temp_filename,"",
+                                DATA_TMP_EXT,
+                                MY_REPLACE_EXT | MY_UNPACK_FILENAME),
+                      0, param->tmpfile_createflag,
+                      MYF(0));
+  if (new_file < 0)
+  {
+    _ma_check_print_error(param,"Can't create new tempfile: '%s'",
+			 param->temp_filename);
+    goto err;
+  }
+  if (share->pack.header_length)                /* Copy old file header first */
+    if (maria_filecopy(param, new_file, info->dfile.file, 0L,
+                       share->pack.header_length,
+                       "datafile-header"))
+      goto err;
+  info->rec_cache.file=new_file;		/* Use this file for caching */
+
+  maria_lock_memory(param);
+  for (key=0 ; key < share->base.keys ; key++)
+    share->keyinfo[key].flag|= HA_SORT_ALLOWS_SAME; /* Not cleared here */
+
+  if (my_pread(share->kfile.file, temp_buff,    /* Read root page of sort_key */
+	       (uint) keyinfo->block_length,
+	       share->state.key_root[sort_key],
+	       MYF(MY_NABP+MY_WME)))
+  {
+    _ma_check_print_error(param, "Can't read indexpage from filepos: %s",
+                          llstr(share->state.key_root[sort_key], llbuff));
+    goto err;
+  }
+
+  /* Setup param for _ma_sort_write_record */
+  sort_info.info=info;
+  sort_info.new_data_file_type=share->data_file_type;
+  sort_param.fix_datafile=1;
+  sort_param.master=1;
+  sort_param.filepos=share->pack.header_length; /* Rows start after header */
+  old_record_count=info->state->records;        /* Compared with count below */
+  info->state->records=0;
+  if (sort_info.new_data_file_type != COMPRESSED_RECORD)
+    info->state->checksum=0;
+
+  _ma_page_setup(&page, info, keyinfo, share->state.key_root[sort_key],
+                 temp_buff);
+  if (sort_record_index(&sort_param, &page, sort_key,new_file,update_index) ||
+      maria_write_data_suffix(&sort_info,1) ||
+      flush_io_cache(&info->rec_cache))
+    goto err;
+
+  if (info->state->records != old_record_count)
+  {
+    _ma_check_print_error(param,"found %s of %s records",
+		llstr(info->state->records,llbuff),
+		llstr(old_record_count,llbuff2));
+    goto err;
+  }
+
+  VOID(my_close(info->dfile.file, MYF(MY_WME))); /* Close old data file */
+  param->out_flag|=O_NEW_DATA;			/* Data in new file */
+  info->dfile.file= new_file;                   /* Use new datafile */
+  _ma_set_data_pagecache_callbacks(&info->dfile, info->s);
+
+  info->state->del=0;
+  info->state->empty=0;
+  share->state.dellink= HA_OFFSET_ERROR;
+  info->state->data_file_length=sort_param.filepos;
+  share->state.split=info->state->records;	/* Only whole (unsplit) records */
+  share->state.version=(ulong) time((time_t*) 0);
+
+  info->update= (short) (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
+
+  if (param->testflag & T_WRITE_LOOP)
+  {
+    VOID(fputs("          \r",stdout)); VOID(fflush(stdout));
+  }
+  got_error=0;
+
+err:
+  if (got_error && new_file >= 0)               /* Failed: drop the temp file */
+  {
+    VOID(end_io_cache(&info->rec_cache));       /* Ended once more below */
+    (void) my_close(new_file,MYF(MY_WME));
+    (void) my_delete(param->temp_filename, MYF(MY_WME));
+  }
+  if (temp_buff)
+  {
+    my_afree(temp_buff);
+  }
+  my_free(sort_param.record,MYF(MY_ALLOW_ZERO_PTR));
+  info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
+  VOID(end_io_cache(&info->rec_cache));
+  my_free(sort_info.buff,MYF(MY_ALLOW_ZERO_PTR));
+  sort_info.buff=0;
+  share->state.sortkey=sort_key;                /* Set on success and on error */
+  DBUG_RETURN(got_error);
+} /* sort_records */
+
+
+/*
+  Sort records recursively using one index
+
+  Walks the index page 'ma_page'. On every loop iteration, if the page is
+  a node page, the child page found with _ma_kpos() at the current key
+  position is read and handled by a recursive call first; this happens
+  before the end-of-page test, so a child is also visited on the iteration
+  that ends the loop. Then the next key is unpacked, the row it points to
+  is read with share->read_record and written with _ma_sort_write_record().
+
+  If 'update_index' is set and the row position differs from
+  sort_param->filepos, the row pointer stored on the page is replaced with
+  _ma_dpointer() and maria_movepoint() is called for the old and new
+  positions.
+
+  When all keys are done, the unused tail of the page buffer is zeroed and
+  the page is written back to the index file.
+
+  'new_file' is not used here except for being passed to the recursive
+  call.
+
+  Returns 0 on success, 1 on error and -1 if the buffer for child pages
+  could not be allocated.
+*/
+
+static int sort_record_index(MARIA_SORT_PARAM *sort_param,
+                             MARIA_PAGE *ma_page, uint sort_key,
+			     File new_file,my_bool update_index)
+{
+  MARIA_HA *info= ma_page->info;
+  MARIA_SHARE *share= info->s;
+  uint	page_flag, nod_flag,used_length;
+  uchar *temp_buff,*keypos,*endpos;
+  my_off_t next_page,rec_pos;
+  uchar lastkey[MARIA_MAX_KEY_BUFF];
+  char llbuff[22];
+  MARIA_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
+  MARIA_KEY tmp_key;
+  MARIA_PAGE new_page;
+  const MARIA_KEYDEF *keyinfo= ma_page->keyinfo;
+  DBUG_ENTER("sort_record_index");
+
+  page_flag= ma_page->flag;
+  nod_flag=  ma_page->node;
+  temp_buff=0;
+  tmp_key.keyinfo= (MARIA_KEYDEF*) keyinfo;
+  tmp_key.data=    lastkey;
+
+  if (nod_flag)                         /* Node page: need child page buffer */
+  {
+    if (!(temp_buff= (uchar*) my_alloca(tmp_key.keyinfo->block_length)))
+    {
+      _ma_check_print_error(param,"Not Enough memory");
+      DBUG_RETURN(-1);                  /* NOTE(review): other errors return 1 */
+    }
+  }
+  used_length= ma_page->size;
+  keypos= ma_page->buff + share->keypage_header + nod_flag;
+  endpos= ma_page->buff + used_length;
+  for ( ;; )
+  {
+    _sanity(__FILE__,__LINE__);
+    if (nod_flag)
+    {
+      next_page= _ma_kpos(nod_flag, keypos);    /* Child page at keypos */
+      if (my_pread(share->kfile.file, temp_buff,
+		  (uint) tmp_key.keyinfo->block_length, next_page,
+		   MYF(MY_NABP+MY_WME)))
+      {
+	_ma_check_print_error(param,"Can't read keys from filepos: %s",
+		    llstr(next_page,llbuff));
+	goto err;
+      }
+      _ma_page_setup(&new_page, info, ma_page->keyinfo, next_page, temp_buff);
+
+      if (sort_record_index(sort_param, &new_page, sort_key,
+			    new_file, update_index))
+	goto err;
+    }
+    _sanity(__FILE__,__LINE__);
+    if (keypos >= endpos ||
+	!(*keyinfo->get_key)(&tmp_key, page_flag, nod_flag, &keypos))
+      break;                            /* Page done, or get_key returned 0 */
+    rec_pos= _ma_row_pos_from_key(&tmp_key);
+
+    if ((*share->read_record)(info,sort_param->record,rec_pos))
+    {
+      _ma_check_print_error(param,"%d when reading datafile",my_errno);
+      goto err;
+    }
+    if (rec_pos != sort_param->filepos && update_index) /* Row gets new pos */
+    {
+      _ma_dpointer(share, keypos - nod_flag - tmp_key.ref_length,
+		   sort_param->filepos);
+      if (maria_movepoint(info,sort_param->record,rec_pos,sort_param->filepos,
+                          sort_key))
+      {
+	_ma_check_print_error(param,"%d when updating key-pointers",my_errno);
+	goto err;
+      }
+    }
+    if (_ma_sort_write_record(sort_param))
+      goto err;
+  }
+  /* Clear end of block to get better compression if the table is backed up */
+  bzero(ma_page->buff + used_length, keyinfo->block_length - used_length);
+  if (my_pwrite(share->kfile.file, ma_page->buff, (uint)keyinfo->block_length,
+		ma_page->pos, param->myf_rw))
+  {
+    _ma_check_print_error(param,"%d when updating keyblock",my_errno);
+    goto err;
+  }
+  if (temp_buff)
+    my_afree(temp_buff);
+  DBUG_RETURN(0);
+err:
+  if (temp_buff)
+    my_afree(temp_buff);
+  DBUG_RETURN(1);
+} /* sort_record_index */
+
+
+/*
+  Write the log record for a finished repair.
+
+  Meant to run once all operations, including O_NEW_DATA|INDEX, have been
+  done successfully. The table named by param->isam_file_name is opened,
+  the record is written with write_log_record_for_repair() and the table
+  is closed again.
+
+  Returns FALSE on success. Returns TRUE, after printing an error, if the
+  open, the log write or the close failed. When the log write fails the
+  table is not closed here.
+*/
+static my_bool write_log_record(HA_CHECK *param)
+{
+  MARIA_HA *table;
+
+  table= maria_open(param->isam_file_name, O_RDWR, 0);
+  if (!table)
+  {
+    _ma_check_print_error(param, default_open_errmsg, my_errno,
+                          param->isam_file_name);
+    return TRUE;
+  }
+
+  if (write_log_record_for_repair(param, table))
+  {
+    _ma_check_print_error(param, "%d when writing log record for"
+                          " Aria table '%s'", my_errno,
+                          param->isam_file_name);
+    return TRUE;
+  }
+
+  if (maria_close(table))
+  {
+    _ma_check_print_error(param, default_close_errmsg, my_errno,
+                          param->isam_file_name);
+    return TRUE;
+  }
+
+  return FALSE;
+}
+
+#include "ma_check_standalone.h"
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
new file mode 100644
index 00000000000..ba97684b1aa
--- /dev/null
+++ b/storage/maria/maria_def.h
@@ -0,0 +1,1267 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* This file is included by all internal maria files */
+
+#include "maria.h"				/* Structs & some defines */
+#include <myisampack.h>				/* packing of keys */
+#include <my_tree.h>
+#include <my_bitmap.h>
+#ifdef THREAD
+#include <my_pthread.h>
+#include <thr_lock.h>
+#else
+#include <my_no_pthread.h>
+#endif
+#include <hash.h>
+#include "ma_loghandler.h"
+#include "ma_control_file.h"
+#include "ma_state.h"
+#include <waiting_threads.h>
+
+/* For testing recovery; only enabled when TO_BE_REMOVED is defined */
+#ifdef TO_BE_REMOVED
+#define IDENTICAL_PAGES_AFTER_RECOVERY 1
+#endif
+/* Do extra sanity checking */
+#define SANITY_CHECKS 1
+#ifdef EXTRA_DEBUG
+#define EXTRA_DEBUG_KEY_CHANGES
+#define EXTRA_STORE_FULL_PAGE_IN_KEY_CHANGES
+#endif
+
+#define MAX_NONMAPPED_INSERTS 1000
+#define MARIA_MAX_TREE_LEVELS 32
+
+/* maria_open() flag, specific for maria_pack */
+#define HA_OPEN_IGNORE_MOVED_STATE (1U << 30)
+
+struct st_transaction;                  /* Forward declaration only */
+
+/* Remove the my_write mapping from my_nosys; we need the disk-full test */
+#undef my_write
+
+#define CRC_SIZE 4
+
+typedef struct st_maria_state_info	/* Table state: file header + counters */
+{
+  struct
+  {					/* Fileheader (24 bytes) */
+    uchar file_version[4];
+    uchar options[2];
+    uchar header_length[2];
+    uchar state_info_length[2];
+    uchar base_info_length[2];
+    uchar base_pos[2];
+    uchar key_parts[2];			/* Key parts */
+    uchar unique_key_parts[2];		/* Key parts + unique parts */
+    uchar keys;				/* number of keys in file */
+    uchar uniques;			/* number of UNIQUE definitions */
+    uchar language;			/* Language for indexes */
+    uchar fulltext_keys;
+    uchar data_file_type;
+    /* Used by mariapack to store the original data_file_type */
+    uchar org_data_file_type;
+  } header;
+
+  MARIA_STATUS_INFO state;
+  /* maria_ha->state points here for crash-safe but not versioned tables */
+  MARIA_STATUS_INFO common;
+  ha_rows split;			/* number of split blocks */
+  my_off_t dellink;			/* Link to next removed block */
+  pgcache_page_no_t first_bitmap_with_space;
+  ulonglong auto_increment;
+  TrID create_trid;                     /* Minimum trid for file */
+  TrID last_change_trn;                 /* self-descriptive */
+  ulong update_count;			/* Updated for each write lock */
+  ulong status;
+  double *rec_per_key_part;
+  ulong *nulls_per_key_part;
+  ha_checksum checksum;                 /* Table checksum */
+  my_off_t *key_root;			/* Start of key trees */
+  my_off_t key_del;			/* delete links for index pages */
+  my_off_t records_at_analyze;		/* Rows when calculating rec_per_key */
+
+  ulong sec_index_changed;		/* Updated when new sec_index */
+  ulong sec_index_used;			/* which extra index are in use */
+  ulonglong key_map;			/* Which keys are in use */
+  ulong version;			/* timestamp of create */
+  time_t create_time;			/* Time when created database */
+  time_t recover_time;			/* Time for last recover */
+  time_t check_time;			/* Time for last check */
+  uint sortkey;				/* sorted by this key (not used) */
+  uint open_count;
+  uint changed;                         /* STATE_* flags; changed since maria_chk */
+  /**
+     Birthday of the table: no record in the log before this LSN should ever
+     be applied to the table. Updated when created, renamed, explicitly
+     repaired (REPAIR|OPTIMIZE TABLE, ALTER TABLE ENABLE KEYS, maria_chk).
+  */
+  LSN create_rename_lsn;
+  /** @brief Log horizon when state was last updated on disk */
+  TRANSLOG_ADDRESS is_of_horizon;
+  /**
+     REDO phase should ignore any record before this LSN. UNDO phase
+     shouldn't, this is the difference with create_rename_lsn.
+     skip_redo_lsn >= create_rename_lsn.
+     The distinction is for these cases:
+     - after a repair at end of bulk insert (enabling indices), REDO phase
+     should skip the table but UNDO phase should not, so only skip_redo_lsn is
+     increased, not create_rename_lsn
+     - if one table is corrupted and so recovery fails, user may repair the
+     table with maria_chk and let recovery restart: that recovery should then
+     skip the repaired table even in the UNDO phase, so create_rename_lsn is
+     increased.
+  */
+  LSN skip_redo_lsn;
+
+  /* the following members are not saved on disk */
+  uint state_diff_length;		/* Should be 0 */
+  uint state_length;			/* Length of state header in file */
+  ulong *key_info;
+} MARIA_STATE_INFO;
+
+
+#define MARIA_STATE_INFO_SIZE	\
+  (24 + 2 + LSN_STORE_SIZE*3 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8) /* leading 24: presumably the file header -- confirm */
+#define MARIA_FILE_OPEN_COUNT_OFFSET 0
+#define MARIA_FILE_CHANGED_OFFSET 2
+#define MARIA_FILE_CREATE_RENAME_LSN_OFFSET 4
+#define MARIA_FILE_CREATE_TRID_OFFSET (4 + LSN_STORE_SIZE*3 + 11*8)
+
+#define MARIA_STATE_KEY_SIZE	(8 + 4)
+#define MARIA_STATE_KEYBLOCK_SIZE  8
+#define MARIA_STATE_KEYSEG_SIZE	12
+#define MARIA_STATE_EXTRA_SIZE (MARIA_MAX_KEY*MARIA_STATE_KEY_SIZE + MARIA_MAX_KEY*HA_MAX_KEY_SEG*MARIA_STATE_KEYSEG_SIZE)
+#define MARIA_KEYDEF_SIZE	(2+ 5*2)
+#define MARIA_UNIQUEDEF_SIZE	(2+1+1)
+#define HA_KEYSEG_SIZE		(6+ 2*2 + 4*2)
+#define MARIA_MAX_KEY_BUFF	(HA_MAX_KEY_BUFF + MARIA_MAX_PACK_TRANSID_SIZE)
+#define MARIA_COLUMNDEF_SIZE	(2*7+1+1+4)
+#define MARIA_BASE_INFO_SIZE	(MY_UUID_SIZE + 5*8 + 6*4 + 11*2 + 6 + 5*2 + 1 + 16)
+#define MARIA_INDEX_BLOCK_MARGIN 16	/* Safety margin for .MYI tables; NOTE(review): .MYI is the MyISAM index extension, presumably the Aria index file is meant -- confirm */
+/* Internal management bytes needed to store 2 transid/key on an index page */
+#define MARIA_MAX_PACK_TRANSID_SIZE   (TRANSID_SIZE+1)
+#define MARIA_TRANSID_PACK_OFFSET     (256- TRANSID_SIZE - 1)
+#define MARIA_MIN_TRANSID_PACK_OFFSET (MARIA_TRANSID_PACK_OFFSET-TRANSID_SIZE)
+#define MARIA_INDEX_OVERHEAD_SIZE     (MARIA_MAX_PACK_TRANSID_SIZE * 2)
+#define MARIA_DELETE_KEY_NR  255	/* keynr for deleted blocks */
+
+/*
+  Basic information about the Maria table. This is stored on disk
+  and not changed (unless we do DDL changes).
+*/
+
+typedef struct st_ma_base_info
+{
+  my_off_t keystart;                    /* Start of keys */
+  my_off_t max_data_file_length;
+  my_off_t max_key_file_length;
+  my_off_t margin_key_file_length;
+  ha_rows records, reloc;               /* Create information */
+  ulong mean_row_length;                /* Create information */
+  ulong reclength;                      /* length of unpacked record */
+  ulong pack_reclength;                 /* Length of full packed rec */
+  ulong min_pack_length;
+  ulong max_pack_length;                /* Max possible length of packed rec */
+  ulong min_block_length;
+  uint fields;                          /* fields in table */
+  uint fixed_not_null_fields;
+  uint fixed_not_null_fields_length;
+  uint max_field_lengths;
+  uint pack_fields;                     /* packed fields in table */
+  uint varlength_fields;                /* char/varchar/blobs */
+  /* Number of bytes in the index used to refer to a row (2-8) */
+  uint rec_reflength;
+  /* Number of bytes in the index used to refer to another index page (2-8) */
+  uint key_reflength;                   /* = 2-8 */
+  uint keys;                            /* same as in state.header */
+  uint auto_key;                        /* Auto-increment key nr + 1 (0 = none) */
+  uint blobs;                           /* Number of blobs */
+  /* Length of packed bits (when table was created first time) */
+  uint pack_bytes;
+  /* Length of null bits (when table was created first time) */
+  uint original_null_bytes;
+  uint null_bytes;                      /* Null bytes in record */
+  uint field_offsets;                   /* Number of field offsets */
+  uint max_key_block_length;            /* Max block length */
+  uint max_key_length;                  /* Max key length */
+  /* Extra allocation when using dynamic record format */
+  uint extra_alloc_bytes;
+  uint extra_alloc_procent;
+  uint is_nulls_extended;               /* 1 if new null bytes */
+  uint default_row_flag;                /* 0 or ROW_FLAG_NULLS_EXTENDED */
+  uint block_size;
+  /* Size of initial record buffer */
+  uint default_rec_buff_size;
+  /* Extra number of bytes the row format requires in the record buffer */
+  uint extra_rec_buff_size;
+  /* Tuning flags that can be ignored by older Maria versions */
+  uint extra_options;
+
+  /* The following are from the header */
+  uint key_parts, all_key_parts;
+  uchar uuid[MY_UUID_SIZE];
+  /**
+     @brief If false, we disable logging, versioning, transaction etc. Observe
+     difference with MARIA_SHARE::now_transactional
+  */
+  my_bool born_transactional;
+} MARIA_BASE_INFO;
+
+
+/* Structs used internally in the database */
+
+typedef struct st_maria_blob            /* Info about one blob in a record */
+{
+  ulong offset;                         /* Offset to blob in record */
+  uint pack_length;                     /* Type of packed length */
+  ulong length;                         /* Calculated for each record */
+} MARIA_BLOB;
+
+
+typedef struct st_maria_pack            /* Data about packed records */
+{
+  ulong header_length;                  /* Header bytes at start of data file */
+  uint ref_length;
+  uchar version;
+} MARIA_PACK;
+
+typedef struct st_maria_file_bitmap    /* Bitmap page state for a data file */
+{
+  uchar *map;
+  pgcache_page_no_t page;              /* Page number for current bitmap */
+  uint used_size;                      /* Size of bitmap head that is not 0 */
+  my_bool changed;                     /* 1 if page needs to be written */
+  my_bool changed_not_flushed;         /* 1 if some bitmap is not flushed */
+  uint flush_all_requested;            /**< If _ma_bitmap_flush_all waiting */
+  uint non_flushable;                  /**< 0 if bitmap and log are in sync */
+  PAGECACHE_FILE file;		       /* datafile where bitmap is stored */
+
+#ifdef THREAD
+  pthread_mutex_t bitmap_lock;
+  pthread_cond_t bitmap_cond;          /**< When bitmap becomes flushable */
+#endif
+  /* Constants, allocated when initializing bitmaps */
+  uint sizes[8];                      /* Size per bit combination */
+  uint total_size;		      /* Total usable size of bitmap page */
+  uint block_size;                    /* Block size of file */
+  ulong pages_covered;                /* Pages covered by bitmap + 1 */
+  DYNAMIC_ARRAY pinned_pages;         /**< not-yet-flushable bitmap pages */
+} MARIA_FILE_BITMAP;
+
+#define MARIA_CHECKPOINT_LOOKS_AT_ME 1    /* Distinct bit values; presumably for MARIA_SHARE::in_checkpoint -- confirm */
+#define MARIA_CHECKPOINT_SHOULD_FREE_ME 2
+#define MARIA_CHECKPOINT_SEEN_IN_LOOP 4
+
+typedef struct st_maria_share
+{					/* Shared between opens */
+  MARIA_STATE_INFO state;
+  MARIA_BASE_INFO base;
+  MARIA_STATE_HISTORY *state_history;
+  MARIA_KEYDEF ft2_keyinfo;		/* Second-level ft-key definition */
+  MARIA_KEYDEF *keyinfo;		/* Key definitions */
+  MARIA_UNIQUEDEF *uniqueinfo;		/* unique definitions */
+  HA_KEYSEG *keyparts;			/* key part info */
+  MARIA_COLUMNDEF *columndef;		/* Pointer to column information */
+  MARIA_PACK pack;			/* Data about packed records */
+  MARIA_BLOB *blobs;			/* Pointer to blobs */
+  uint16 *column_nr;			/* Original column order */
+  LEX_STRING unique_file_name;		/* realpath() of index file */
+  LEX_STRING data_file_name;		/* Resolved path names from symlinks */
+  LEX_STRING index_file_name;
+  LEX_STRING open_file_name;		/* parameter to open filename */
+  uchar *file_map;			/* mem-map of file if possible */
+  PAGECACHE *pagecache;			/* ref to the current key cache */
+  MARIA_DECODE_TREE *decode_trees;
+  /*
+    Previous auto-increment value. Used to verify if we can restore the
+    auto-increment counter if we have to abort an insert (duplicate key).
+  */
+  ulonglong last_auto_increment;
+  uint16 *decode_tables;
+  uint16 id; /**< 2-byte id by which log records refer to the table */
+  /* Called the first time the table instance is opened */
+  my_bool (*once_init)(struct st_maria_share *, File);
+  /* Called when the last instance of the table is closed */
+  my_bool (*once_end)(struct st_maria_share *);
+  /* Is called for every open of the table */
+  my_bool (*init)(MARIA_HA *);
+  /* Is called for every close of the table */
+  void (*end)(MARIA_HA *);
+  /* Called when we want to read a record from a specific position */
+  int (*read_record)(MARIA_HA *, uchar *, MARIA_RECORD_POS);
+  /* Initialize a scan */
+  my_bool (*scan_init)(MARIA_HA *);
+  /* Read next record while scanning */
+  int (*scan)(MARIA_HA *, uchar *, MARIA_RECORD_POS, my_bool);
+  /* End scan */
+  void (*scan_end)(MARIA_HA *);
+  int (*scan_remember_pos)(MARIA_HA *, MARIA_RECORD_POS*);
+  void (*scan_restore_pos)(MARIA_HA *, MARIA_RECORD_POS);
+  /* Pre-write of row (some handlers may do the actual write here) */
+  MARIA_RECORD_POS (*write_record_init)(MARIA_HA *, const uchar *);
+  /* Write record (or accept write_record_init) */
+  my_bool (*write_record)(MARIA_HA *, const uchar *);
+  /* Called when write failed */
+  my_bool (*write_record_abort)(MARIA_HA *);
+  my_bool (*update_record)(MARIA_HA *, MARIA_RECORD_POS,
+                           const uchar *, const uchar *);
+  my_bool (*delete_record)(MARIA_HA *, const uchar *record);
+  my_bool (*compare_record)(MARIA_HA *, const uchar *);
+  /* calculate checksum for a row */
+  ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
+  /*
+    Calculate checksum for a row during write. May be 0 if we calculate
+    the checksum in write_record_init()
+  */
+  ha_checksum(*calc_write_checksum)(MARIA_HA *, const uchar *);
+  /* calculate checksum for a row during check table */
+  ha_checksum(*calc_check_checksum)(MARIA_HA *, const uchar *);
+  /* Compare a row in memory with a row on disk */
+  my_bool (*compare_unique)(MARIA_HA *, MARIA_UNIQUEDEF *,
+                            const uchar *record, MARIA_RECORD_POS pos);
+  my_off_t (*keypos_to_recpos)(struct st_maria_share *share, my_off_t pos);
+  my_off_t (*recpos_to_keypos)(struct st_maria_share *share, my_off_t pos);
+  my_bool (*row_is_visible)(MARIA_HA *);
+
+  /* Mapings to read/write the data file */
+  size_t (*file_read)(MARIA_HA *, uchar *, size_t, my_off_t, myf);
+  size_t (*file_write)(MARIA_HA *, const uchar *, size_t, my_off_t, myf);
+  /* query cache invalidator for merged tables */
+  invalidator_by_filename invalidator;
+  /* query cache invalidator for changing state */
+  invalidator_by_filename chst_invalidator;
+  my_off_t key_del_current;		/* delete links for index pages */
+  ulong this_process;			/* processid */
+  ulong last_process;			/* For table-change-check */
+  ulong last_version;			/* Version on start */
+  ulong options;			/* Options used */
+  ulong min_pack_length;		/* These are used by packed data */
+  ulong max_pack_length;
+  ulong state_diff_length;
+  uint rec_reflength;			/* rec_reflength in use now */
+  uint keypage_header;
+  uint32 ftkeys;			/* Number of distinct full-text keys
+						   + 1 */
+  PAGECACHE_FILE kfile;			/* Shared keyfile */
+  File data_file;			/* Shared data file */
+  int mode;				/* mode of file on open */
+  uint reopen;				/* How many times opened */
+  uint in_trans;                        /* Number of references by trn */
+  uint w_locks, r_locks, tot_locks;	/* Number of read/write locks */
+  uint block_size;			/* block_size of keyfile & data file*/
+  uint max_index_block_size;            /* block_size - end_of_page_info */
+  /* Fixed length part of a packed row in BLOCK_RECORD format */
+  uint base_length;
+  myf write_flag;
+  enum data_file_type data_file_type;
+  enum pagecache_page_type page_type;   /* value depending transactional */
+  /**
+     if Checkpoint looking at table; protected by close_lock or THR_LOCK_maria
+  */
+  uint8 in_checkpoint;
+  my_bool temporary;
+  /* Below flag is needed to make log tables work with concurrent insert */
+  my_bool is_log_table;
+
+  my_bool changed,			/* If changed since lock */
+    global_changed,			/* If changed since open */
+    not_flushed;
+  my_bool lock_key_trees;               /* If we have to lock trees on read */
+  my_bool non_transactional_concurrent_insert;
+  my_bool delay_key_write;
+  my_bool have_rtree;
+  /**
+     @brief if the table is transactional right now. It may have been created
+     transactional (base.born_transactional==TRUE) but with transactionality
+     (logging) temporarily disabled (now_transactional==FALSE). The opposite
+     (FALSE, TRUE) is impossible.
+  */
+  my_bool now_transactional;
+  my_bool have_versioning;
+  my_bool key_del_used;                         /* != 0 if key_del is locked */
+  my_bool deleting;                     /* we are going to delete this table */
+#ifdef THREAD
+  THR_LOCK lock;
+  void (*lock_restore_status)(void *);
+  /**
+    Protects kfile, dfile, most members of the state, state disk writes,
+    versioning information (like in_trans, state_history).
+    @todo find the exhaustive list.
+  */
+  pthread_mutex_t intern_lock;	
+  pthread_mutex_t key_del_lock;
+  pthread_cond_t  key_del_cond;
+  /**
+    _Always_ held while closing table; prevents checkpoint from looking at
+    structures freed during closure (like bitmap). If you need close_lock and
+    intern_lock, lock them in this order.
+  */
+  pthread_mutex_t close_lock;
+#endif
+  my_off_t mmaped_length;
+  uint nonmmaped_inserts;		/* counter of writing in
+						   non-mmaped area */
+  MARIA_FILE_BITMAP bitmap;
+  rw_lock_t mmap_lock;
+  LSN lsn_of_file_id; /**< LSN of its last LOGREC_FILE_ID */
+} MARIA_SHARE;
+
+
+typedef uchar MARIA_BITMAP_BUFFER;
+
+typedef struct st_maria_bitmap_block
+{
+  pgcache_page_no_t page;                       /* Page number */
+  /* Number of continuous pages. TAIL_BIT is set if this is a tail page */
+  uint page_count;
+  uint empty_space;                     /* Set for head and tail pages */
+  /*
+    Number of BLOCKS for block-region (holds all non-blob-fields or one blob)
+  */
+  uint sub_blocks;
+  /* Set to non-zero in write_record() if this block was actually used */
+  uint8 used;
+  uint8 org_bitmap_value;
+} MARIA_BITMAP_BLOCK;
+
+
+typedef struct st_maria_bitmap_blocks
+{
+  MARIA_BITMAP_BLOCK *block;
+  uint count;
+  my_bool tail_page_skipped;            /* If some tail pages were not used */
+  my_bool page_skipped;                 /* If some full pages were not used */
+} MARIA_BITMAP_BLOCKS;
+
+
+/* Data about the currently read row */
+typedef struct st_maria_row
+{
+  MARIA_BITMAP_BLOCKS insert_blocks;
+  MARIA_BITMAP_BUFFER *extents;
+  MARIA_RECORD_POS lastpos, nextpos;
+  MARIA_RECORD_POS *tail_positions;
+  ha_checksum checksum;
+  LSN orig_undo_lsn;			/* Lsn at start of row insert */
+  TrID trid;                            /* Transaction id for current row */
+  uchar *empty_bits, *field_lengths;
+  uint *null_field_lengths;             /* All null field lengths */
+  ulong *blob_lengths;                  /* Length for each blob */
+  ulong min_length, normal_length, char_length, varchar_length;
+  ulong blob_length, total_length;
+  size_t extents_buffer_length;         /* Size of 'extents' buffer */
+  uint head_length, header_length;
+  uint field_lengths_length;            /* Length of data in field_lengths */
+  uint extents_count;                   /* number of extents in 'extents' */
+  uint full_page_count, tail_count;     /* For maria_chk */
+  uint space_on_head_page;
+} MARIA_ROW;
+
+/* Data to scan row in blocked format */
+typedef struct st_maria_block_scan
+{
+  uchar *bitmap_buff, *bitmap_pos, *bitmap_end, *page_buff;
+  uchar *dir, *dir_end;
+  pgcache_page_no_t bitmap_page, max_page;
+  ulonglong bits;
+  uint number_of_rows, bit_pos;
+  MARIA_RECORD_POS row_base_page;
+} MARIA_BLOCK_SCAN;
+
+typedef ICP_RESULT (*index_cond_func_t)(void *param);
+
+struct st_maria_handler
+{
+  MARIA_SHARE *s;			/* Shared between opens */
+  struct st_ma_transaction *trn;        /* Pointer to active transaction */
+  void *external_ptr;           	/* Pointer to THD in mysql */
+  MARIA_STATUS_INFO *state, state_save;
+  MARIA_STATUS_INFO *state_start;       /* State at start of transaction */
+  MARIA_ROW cur_row;                    /* The active row that we just read */
+  MARIA_ROW new_row;			/* Storage for a row during update */
+  MARIA_KEY last_key;                   /* Last found key */
+  MARIA_BLOCK_SCAN scan, *scan_save;
+  MARIA_BLOB *blobs;			/* Pointer to blobs */
+  MARIA_BIT_BUFF bit_buff;
+  DYNAMIC_ARRAY bitmap_blocks;
+  DYNAMIC_ARRAY pinned_pages;
+  /* accumulate indexfile changes between write's */
+  TREE *bulk_insert;
+  LEX_CUSTRING *log_row_parts;		/* For logging */
+  DYNAMIC_ARRAY *ft1_to_ft2;		/* used only in ft1->ft2 conversion */
+  MEM_ROOT      ft_memroot;             /* used by the parser               */
+  MYSQL_FTPARSER_PARAM *ftparser_param;	/* share info between init/deinit */
+  uchar *buff;				/* page buffer */
+  uchar *keyread_buff;                   /* Buffer for last key read */
+  uchar *lastkey_buff;			/* Last used search key */
+  uchar *lastkey_buff2;
+  uchar *first_mbr_key;			/* Searched spatial key */
+  uchar *rec_buff;			/* Temp buffer for recordpack */
+  uchar *blob_buff;                     /* Temp buffer for blobs */
+  uchar *int_keypos;			/* Save position for next/previous */
+  uchar *int_maxpos;			/* -""- */
+  uint keypos_offset;                   /* Tmp storage for offset int_keypos */
+  uint maxpos_offset;          		/* Tmp storage for offset int_maxpos */
+  uchar *update_field_data;		/* Used by update in rows-in-block */
+  uint int_nod_flag;			/* -""- */
+  uint32 int_keytree_version;		/* -""- */
+  int (*read_record)(MARIA_HA *, uchar*, MARIA_RECORD_POS);
+  invalidator_by_filename invalidator;	/* query cache invalidator */
+  ulonglong last_auto_increment;        /* auto value at start of statement */
+  ulong this_unique;			/* uniq filenumber or thread */
+  ulong last_unique;			/* last unique number */
+  ulong this_loop;			/* counter for this open */
+  ulong last_loop;			/* last used counter */
+  MARIA_RECORD_POS save_lastpos;
+  MARIA_RECORD_POS dup_key_pos;
+  TrID             dup_key_trid;
+  my_off_t pos;				/* Intern variable */
+  my_off_t last_keypage;		/* Last key page read */
+  my_off_t last_search_keypage;		/* Last keypage when searching */
+
+  /*
+    QQ: the following two xxx_length fields should be removed,
+     as they are not compatible with parallel repair
+  */
+  ulong packed_length, blob_length;	/* Length of found, packed record */
+  size_t rec_buff_size, blob_buff_size;
+  PAGECACHE_FILE dfile;			/* The datafile */
+  IO_CACHE rec_cache;			/* When caching records */
+  LIST open_list;
+  MY_BITMAP changed_fields;
+  ulong row_base_length;                /* Length of row header */
+  uint row_flag;                        /* Flag to store in row header */
+  uint opt_flag;			/* Optim. for space/speed */
+  uint update;				/* If file changed since open */
+  int lastinx;				/* Last used index */
+  uint last_rkey_length;		/* Last length in maria_rkey() */
+  uint *last_rtree_keypos;              /* Last key positions for rtrees */
+  uint bulk_insert_ref_length;          /* Length of row ref during bi */
+  uint non_flushable_state;
+  enum ha_rkey_function last_key_func;	/* CONTAIN, OVERLAP, etc */
+  uint save_lastkey_data_length;
+  uint save_lastkey_ref_length;
+  uint pack_key_length;			/* For MARIA_MRG */
+  myf lock_wait;			/* is 0 or MY_SHORT_WAIT */
+  int errkey;				/* Got last error on this key */
+  int lock_type;			/* How database was locked */
+  int tmp_lock_type;			/* When locked by readinfo */
+  uint data_changed;			/* Somebody has changed data */
+  uint save_update;			/* When using KEY_READ */
+  int save_lastinx;
+  uint preload_buff_size;		/* When preloading indexes */
+  uint16 last_used_keyseg;              /* For MARIAMRG */
+  uint8 key_del_used;                   /* != 0 if key_del is used */
+  my_bool was_locked;			/* Was locked in panic */
+  my_bool append_insert_at_end;		/* Set if concurrent insert */
+  my_bool quick_mode;
+  /* Marker if key_del_changed */
+  /* If info->keyread_buff can't be used for rnext */
+  my_bool page_changed;
+  /* If info->keyread_buff has to be re-read for rnext */
+  my_bool keyread_buff_used;
+  my_bool once_flags;			/* For MARIA_MRG */
+  /* For bulk insert enable/disable transactions control */
+  my_bool switched_transactional;
+#ifdef __WIN__
+  my_bool owned_by_merge;               /* This Maria table is part of a merge union */
+#endif
+#ifdef THREAD
+  THR_LOCK_DATA lock;
+#endif
+  uchar *maria_rtree_recursion_state;	/* For RTREE */
+  uchar length_buff[5];			/* temp buff to store blob lengths */
+  int maria_rtree_recursion_depth;
+
+  index_cond_func_t index_cond_func;   /* Index condition function */
+  void *index_cond_func_arg;           /* parameter for the func */
+};
+
+/* Some defines used by maria-functions */
+
+#define USE_WHOLE_KEY	65535         /* Use whole key in _search() */
+#define F_EXTRA_LCK	-1
+
+/* bits in opt_flag */
+#define MEMMAP_USED	32
+#define REMEMBER_OLD_POS 64
+
+#define WRITEINFO_UPDATE_KEYFILE	1
+#define WRITEINFO_NO_UNLOCK		2
+
+/* once_flags */
+#define USE_PACKED_KEYS         1
+#define RRND_PRESERVE_LASTINX   2
+
+/* bits in state.changed */
+
+#define STATE_CHANGED		 1
+#define STATE_CRASHED		 2
+#define STATE_CRASHED_ON_REPAIR  4
+#define STATE_NOT_ANALYZED	 8
+#define STATE_NOT_OPTIMIZED_KEYS 16
+#define STATE_NOT_SORTED_PAGES	 32
+#define STATE_NOT_OPTIMIZED_ROWS 64
+#define STATE_NOT_ZEROFILLED     128
+#define STATE_NOT_MOVABLE        256
+#define STATE_MOVED              512 /* set if base->uuid != maria_uuid */
+#define STATE_IN_REPAIR  	 1024 /* We are running repair on table */
+
+/* options to maria_read_cache */
+
+#define READING_NEXT	1
+#define READING_HEADER	2
+
+/* Number of bytes on key pages to indicate used size */
+#define KEYPAGE_USED_SIZE  2
+#define KEYPAGE_KEYID_SIZE 1
+#define KEYPAGE_FLAG_SIZE  1
+#define KEYPAGE_CHECKSUM_SIZE 4
+#define MAX_KEYPAGE_HEADER_SIZE (LSN_STORE_SIZE + KEYPAGE_USED_SIZE + \
+                                 KEYPAGE_KEYID_SIZE + KEYPAGE_FLAG_SIZE + \
+                                 TRANSID_SIZE)
+#define KEYPAGE_FLAG_ISNOD      1
+#define KEYPAGE_FLAG_HAS_TRANSID 2
+/* Position to KEYPAGE_FLAG for transactional tables (parenthesized: safe in any expression) */
+#define KEYPAGE_TRANSFLAG_OFFSET (LSN_STORE_SIZE + TRANSID_SIZE + KEYPAGE_KEYID_SIZE)
+
+#define _ma_get_page_used(share,x) \
+  ((uint) mi_uint2korr((x) + (share)->keypage_header - KEYPAGE_USED_SIZE))
+#define _ma_store_page_used(share,x,y) \
+  mi_int2store((x) + (share)->keypage_header - KEYPAGE_USED_SIZE, (y))
+#define _ma_get_keypage_flag(share,x) (x)[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]
+#define _ma_test_if_nod(share,x) \
+  ((_ma_get_keypage_flag(share,x) & KEYPAGE_FLAG_ISNOD) ? (share)->base.key_reflength : 0)
+/* Page-buffer argument 'x' is parenthesized so pointer expressions index correctly */
+#define _ma_store_keynr(share, x, nr) (x)[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE]= (nr)
+#define _ma_get_keynr(share, x) ((uchar) (x)[(share)->keypage_header - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE - KEYPAGE_USED_SIZE])
+#define _ma_store_transid(buff, transid) \
+  transid_store((buff) + LSN_STORE_SIZE, (transid))
+#define _ma_korr_transid(buff) \
+  transid_korr((buff) + LSN_STORE_SIZE)
+#define _ma_store_keypage_flag(share,x,flag) (x)[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (flag)
+#define _ma_mark_page_with_transid(share, page) \
+  do { (page)->flag|= KEYPAGE_FLAG_HAS_TRANSID; /* one statement: safe under if/else */ \
+    (page)->buff[(share)->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_FLAG_SIZE]= (page)->flag; } while (0)
+
+
+/*
+  TODO: write int4store_aligned as *((uint32 *) (T))= (uint32) (A) for
+  architectures where it is possible
+*/
+#define int4store_aligned(A,B) int4store((A),(B))
+
+#define maria_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
+    DBUG_PRINT("error", ("Marked table crashed"));                      \
+  }while(0)
+#define maria_mark_crashed_share(x)                                     \
+  do{(x)->state.changed|= STATE_CRASHED;                                \
+    DBUG_PRINT("error", ("Marked table crashed"));                      \
+  }while(0)
+#define maria_mark_crashed_on_repair(x) do{(x)->s->state.changed|=      \
+      STATE_CRASHED|STATE_CRASHED_ON_REPAIR;                            \
+    (x)->update|= HA_STATE_CHANGED;                                     \
+    DBUG_PRINT("error", ("Marked table crashed on repair"));            \
+  }while(0)
+#define maria_mark_in_repair(x) do{(x)->s->state.changed|=      \
+      STATE_CRASHED | STATE_IN_REPAIR;                          \
+    (x)->update|= HA_STATE_CHANGED;                             \
+    DBUG_PRINT("error", ("Marked table crashed for repair"));   \
+  }while(0)
+#define maria_is_crashed(x) ((x)->s->state.changed & STATE_CRASHED)
+#define maria_is_crashed_on_repair(x) ((x)->s->state.changed & STATE_CRASHED_ON_REPAIR)
+#define maria_in_repair(x) ((x)->s->state.changed & STATE_IN_REPAIR)
+
+#ifdef EXTRA_DEBUG
+/**
+  Brings additional information in certain debug builds and in standalone
+  (non-ha_maria) programs. To help debugging. Not in ha_maria, to not spam the
+  user (some messages can be produced many times per statement, or even
+  wrongly during some repair operations).
+*/
+#define maria_print_error(SHARE, ERRNO)                         \
+  do{ if (!maria_in_ha_maria)                                   \
+      _ma_report_error((ERRNO), &(SHARE)->index_file_name); }    \
+  while(0)
+#else
+#define maria_print_error(SHARE, ERRNO) while (0)
+#endif
+#define DBUG_DUMP_KEY(name, key) DBUG_DUMP(name, (key)->data, (key)->data_length + (key)->ref_length)
+
+
+/* Functions to store length of space packed keys, VARCHAR or BLOB keys */
+
+#define store_key_length(key,length) \
+{ if ((length) < 255) \
+  { *(key)=(length); } \
+  else \
+  { *(key)=255; mi_int2store((key)+1,(length)); } \
+}
+
+#define get_key_full_length(length,key) \
+  { if (*(const uchar*) (key) != 255)            \
+    length= ((uint) *(const uchar*) ((key)++))+1; \
+  else \
+  { length=mi_uint2korr((key)+1)+3; (key)+=3; } \
+}
+
+#define get_key_full_length_rdonly(length,key) \
+{ if (*(const uchar*) (key) != 255) \
+    length= ((uint) *(const uchar*) ((key)))+1; \
+  else \
+  { length=mi_uint2korr((key)+1)+3; } \
+}
+
+#define maria_max_key_length() ((maria_block_size - MAX_KEYPAGE_HEADER_SIZE)/2 - MARIA_INDEX_OVERHEAD_SIZE)
+#define get_pack_length(length) ((length) >= 255 ? 3 : 1)
+#define _ma_have_versioning(info) ((info)->row_flag & ROW_FLAG_TRANSID)
+
+/**
+   Sets table's trn and prints debug information
+   @param tbl              MARIA_HA of table
+   @param newtrn           what to put into tbl->trn
+   @note cast of newtrn is because %p of NULL gives warning (NULL is int)
+*/
+#define _ma_set_trn_for_table(tbl, newtrn) do {                         \
+    DBUG_PRINT("info",("table: %p trn: %p -> %p",                       \
+                       (tbl), (tbl)->trn, (void *)(newtrn)));           \
+    (tbl)->trn= (newtrn);                                               \
+  } while (0)
+
+
+#define MARIA_MIN_BLOCK_LENGTH	20		/* Because of delete-link */
+/* Don't use too small record-blocks */
+#define MARIA_EXTEND_BLOCK_LENGTH	20
+#define MARIA_SPLIT_LENGTH	((MARIA_EXTEND_BLOCK_LENGTH+4)*2)
+	/* Max prefix of record-block */
+#define MARIA_MAX_DYN_BLOCK_HEADER	20
+#define MARIA_BLOCK_INFO_HEADER_LENGTH 20
+#define MARIA_DYN_DELETE_BLOCK_HEADER 20    /* length of delete-block-header */
+#define MARIA_DYN_MAX_BLOCK_LENGTH	((1L << 24)-4L)
+#define MARIA_DYN_MAX_ROW_LENGTH	(MARIA_DYN_MAX_BLOCK_LENGTH - MARIA_SPLIT_LENGTH)
+#define MARIA_DYN_ALIGN_SIZE	  4	/* Align blocks on this */
+#define MARIA_MAX_DYN_HEADER_BYTE 13	/* max header uchar for dynamic rows */
+#define MARIA_MAX_BLOCK_LENGTH	((((ulong) 1 << 24)-1) & (~ (ulong) (MARIA_DYN_ALIGN_SIZE-1)))
+#define MARIA_REC_BUFF_OFFSET      ALIGN_SIZE(MARIA_DYN_DELETE_BLOCK_HEADER+sizeof(uint32))
+
+#define MEMMAP_EXTRA_MARGIN	7	/* Write this as a suffix for file */
+
+#define PACK_TYPE_SELECTED	1	/* Bits in field->pack_type */
+#define PACK_TYPE_SPACE_FIELDS	2
+#define PACK_TYPE_ZERO_FILL	4
+#define MARIA_FOUND_WRONG_KEY 32738	/* Impossible value from ha_key_cmp */
+
+#define MARIA_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size)  (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
+#define MARIA_MAX_KEYPTR_SIZE	5	/* For calculating block lengths */
+
+/* Marker for impossible delete link */
+#define IMPOSSIBLE_PAGE_NO LL(0xFFFFFFFFFF)
+
+/* The UNIQUE check is done with a hashed long key */
+
+#define MARIA_UNIQUE_HASH_TYPE	HA_KEYTYPE_ULONG_INT
+#define maria_unique_store(A,B)    mi_int4store((A),(B))
+
+#ifdef THREAD
+extern pthread_mutex_t THR_LOCK_maria;
+#endif
+#if !defined(THREAD) || defined(DONT_USE_RW_LOCKS)
+#define rw_wrlock(A) {}
+#define rw_rdlock(A) {}
+#define rw_unlock(A) {}
+#endif
+
+/* Some tuning parameters */
+#define MARIA_MIN_KEYBLOCK_LENGTH 50	/* When to split delete blocks */
+#define MARIA_MIN_SIZE_BULK_INSERT_TREE 16384	/* this is per key */
+#define MARIA_MIN_ROWS_TO_USE_BULK_INSERT 100
+#define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100
+#define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10
+/* Keep a small buffer for tables only using small blobs */
+#define MARIA_SMALL_BLOB_BUFFER 1024
+#define MARIA_MAX_CONTROL_FILE_LOCK_RETRY 30     /* Retry this many times */
+
+
+/* Some extern variables */
+extern LIST *maria_open_list;
+extern uchar maria_file_magic[], maria_pack_file_magic[];
+extern uchar maria_uuid[MY_UUID_SIZE];
+extern uint32 maria_read_vec[], maria_readnext_vec[];
+extern uint maria_quick_table_bits;
+extern char *maria_data_root;
+extern uchar maria_zero_string[];
+extern my_bool maria_inited, maria_in_ha_maria, maria_recovery_changed_data;
+extern my_bool maria_recovery_verbose;
+extern HASH maria_stored_state;
+extern int (*maria_create_trn_hook)(MARIA_HA *);
+
+/* This is used by _ma_calc_xxx_key_length and _ma_store_key */
+typedef struct st_maria_s_param
+{
+  const uchar *key;
+  uchar *prev_key, *next_key_pos;
+  uchar *key_pos;                               /* For balance page */
+  uint ref_length, key_length, n_ref_length;
+  uint n_length, totlength, part_of_prev_key, prev_length, pack_marker;
+  uint changed_length;
+  int move_length;                              /* For balance_page */
+  my_bool store_not_null;
+} MARIA_KEY_PARAM;
+
+
+/* Used to store reference to pinned page */
+typedef struct st_pinned_page
+{
+  PAGECACHE_BLOCK_LINK *link;
+  enum pagecache_page_lock unlock, write_lock;
+  my_bool changed;
+} MARIA_PINNED_PAGE;
+
+
+/* Keeps all information about a page and related to a page */
+typedef struct st_maria_page
+{
+  MARIA_HA *info;
+  const MARIA_KEYDEF *keyinfo;
+  uchar *buff;				/* Data for page */
+  my_off_t pos;                         /* Disk address to page */
+  uint     size;                        /* Size of data on page */
+  uint     org_size;                    /* Size of page at read or after log */
+  uint     node;      			/* 0 or share->base.key_reflength */
+  uint     flag;			/* Page flag */
+  uint     link_offset;
+} MARIA_PAGE;
+
+
+/* Prototypes for internal functions */
+extern int _ma_read_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_dynamic_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+                                       my_bool);
+extern my_bool _ma_write_dynamic_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_dynamic_record(MARIA_HA *, MARIA_RECORD_POS,
+                                         const uchar *, const uchar *);
+extern my_bool _ma_delete_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_dynamic_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_write_blob_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_blob_record(MARIA_HA *, MARIA_RECORD_POS,
+                                      const uchar *, const uchar *);
+extern int _ma_read_static_record(MARIA_HA *info, uchar *, MARIA_RECORD_POS);
+extern int _ma_read_rnd_static_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+                                      my_bool);
+extern my_bool _ma_write_static_record(MARIA_HA *, const uchar *);
+extern my_bool _ma_update_static_record(MARIA_HA *, MARIA_RECORD_POS,
+                                        const uchar *, const uchar *);
+extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record);
+extern my_bool _ma_ck_write(MARIA_HA *info, MARIA_KEY *key);
+extern my_bool _ma_enlarge_root(MARIA_HA *info, MARIA_KEY *key,
+                                MARIA_RECORD_POS *root);
+int _ma_insert(register MARIA_HA *info, MARIA_KEY *key,
+               MARIA_PAGE *anc_page, uchar *key_pos, uchar *key_buff,
+               MARIA_PAGE *father_page, uchar *father_key_pos,
+               my_bool insert_last);
+extern my_bool _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEY *key,
+                                   MARIA_RECORD_POS *root, uint32 comp_flag);
+extern int _ma_split_page(MARIA_HA *info, MARIA_KEY *key,
+                          MARIA_PAGE *split_page,
+                          uint org_split_length,
+                          uchar *inserted_key_pos, uint changed_length,
+                          int move_length,
+                          uchar *key_buff, my_bool insert_last_key);
+extern uchar *_ma_find_half_pos(MARIA_KEY *key, MARIA_PAGE *page,
+                                uchar ** after_key);
+extern int _ma_calc_static_key_length(const MARIA_KEY *key, uint nod_flag,
+                                      uchar *key_pos, uchar *org_key,
+                                      uchar *key_buff,
+                                      MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_key_length(const MARIA_KEY *key, uint nod_flag,
+                                   uchar *key_pos, uchar *org_key,
+                                   uchar *key_buff,
+                                   MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_var_pack_key_length(const MARIA_KEY *key,
+                                        uint nod_flag, uchar *next_key,
+                                        uchar *org_key, uchar *prev_key,
+                                        MARIA_KEY_PARAM *s_temp);
+extern int _ma_calc_bin_pack_key_length(const MARIA_KEY *key,
+                                        uint nod_flag, uchar *next_key,
+                                        uchar *org_key, uchar *prev_key,
+                                        MARIA_KEY_PARAM *s_temp);
+extern void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+                                 MARIA_KEY_PARAM *s_temp);
+extern void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+                                   MARIA_KEY_PARAM *s_temp);
+#ifdef NOT_USED
+extern void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+                               MARIA_KEY_PARAM *s_temp);
+#endif
+extern void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos,
+                                   MARIA_KEY_PARAM *s_temp);
+
+extern my_bool _ma_ck_delete(MARIA_HA *info, MARIA_KEY *key);
+extern my_bool _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEY *key,
+                                  my_off_t *root);
+extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer);
+extern int _ma_writeinfo(MARIA_HA *info, uint options);
+extern int _ma_test_if_changed(MARIA_HA *info);
+extern int _ma_mark_file_changed(MARIA_HA *info);
+extern void _ma_mark_file_crashed(MARIA_SHARE *share);
+extern my_bool _ma_set_uuid(MARIA_HA *info, my_bool reset_uuid);
+extern my_bool _ma_check_if_zero(uchar *pos, size_t size);
+extern int _ma_decrement_open_count(MARIA_HA *info);
+extern int _ma_check_index(MARIA_HA *info, int inx);
+extern int _ma_search(MARIA_HA *info, MARIA_KEY *key, uint32 nextflag,
+                      my_off_t pos);
+extern int _ma_bin_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+                          uint32 comp_flag, uchar **ret_pos, uchar *buff,
+                          my_bool *was_last_key);
+extern int _ma_seq_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+                          uint comp_flag, uchar ** ret_pos, uchar *buff,
+                          my_bool *was_last_key);
+extern int _ma_prefix_search(const MARIA_KEY *key, const MARIA_PAGE *page,
+                             uint32 comp_flag, uchar ** ret_pos, uchar *buff,
+                             my_bool *was_last_key);
+extern my_off_t _ma_kpos(uint nod_flag, const uchar *after_key);
+extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos);
+MARIA_RECORD_POS _ma_row_pos_from_key(const MARIA_KEY *key);
+TrID _ma_trid_from_key(const MARIA_KEY *key);
+extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *share, uchar *ptr);
+extern void _ma_dpointer(MARIA_SHARE *share, uchar *buff,
+                         MARIA_RECORD_POS pos);
+extern uint _ma_get_static_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+                               uchar **page);
+extern uchar *_ma_skip_static_key(MARIA_KEY *key, uint page_flag,
+                           uint nod_flag, uchar *page);
+extern uint _ma_get_pack_key(MARIA_KEY *key, uint page_flag, uint nod_flag,
+                             uchar **page);
+extern uchar *_ma_skip_pack_key(MARIA_KEY *key, uint page_flag,
+                                uint nod_flag, uchar *page);
+extern uint _ma_get_binary_pack_key(MARIA_KEY *key, uint page_flag,
+                                    uint nod_flag, uchar **page_pos);
+uchar *_ma_skip_binary_pack_key(MARIA_KEY *key, uint page_flag,
+                                uint nod_flag, uchar *page);
+extern uchar *_ma_get_last_key(MARIA_KEY *key, MARIA_PAGE *page,
+                               uchar *endpos);
+extern uchar *_ma_get_key(MARIA_KEY *key, MARIA_PAGE *page, uchar *keypos);
+extern uint _ma_keylength(MARIA_KEYDEF *keyinfo, const uchar *key);
+extern uint _ma_keylength_part(MARIA_KEYDEF *keyinfo, const uchar *key,
+                               HA_KEYSEG *end);
+extern int _ma_search_next(MARIA_HA *info, MARIA_KEY *key,
+                           uint32 nextflag, my_off_t pos);
+extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                            my_off_t pos);
+extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo,
+                           my_off_t pos);
+extern my_off_t _ma_static_keypos_to_recpos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_static_recpos_to_keypos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_transparent_recpos(MARIA_SHARE *share, my_off_t pos);
+extern my_off_t _ma_transaction_keypos_to_recpos(MARIA_SHARE *, my_off_t pos);
+extern my_off_t _ma_transaction_recpos_to_keypos(MARIA_SHARE *, my_off_t pos);
+
+extern void _ma_page_setup(MARIA_PAGE *page, MARIA_HA *info,
+                           const MARIA_KEYDEF *keyinfo, my_off_t pos,
+                           uchar *buff);
+extern my_bool _ma_fetch_keypage(MARIA_PAGE *page, MARIA_HA *info,
+                                 const MARIA_KEYDEF *keyinfo,
+                                 my_off_t pos, enum pagecache_page_lock lock,
+                                 int level, uchar *buff,
+                                 my_bool return_buffer);
+extern my_bool _ma_write_keypage(MARIA_PAGE *page,
+                                 enum pagecache_page_lock lock, int level);
+extern int _ma_dispose(MARIA_HA *info, my_off_t pos, my_bool page_not_read);
+extern my_off_t _ma_new(register MARIA_HA *info, int level,
+                        MARIA_PINNED_PAGE **page_link);
+extern my_bool _ma_compact_keypage(MARIA_PAGE *page, TrID min_read_from);
+extern uint transid_store_packed(MARIA_HA *info, uchar *to, ulonglong trid);
+extern ulonglong transid_get_packed(MARIA_SHARE *share, const uchar *from);
+#define transid_packed_length(data) \
+  ((data)[0] < MARIA_MIN_TRANSID_PACK_OFFSET ? 1 : \
+   (uint) ((uchar) (data)[0]) - (MARIA_TRANSID_PACK_OFFSET - 1))
+#define key_has_transid(key) (*(key) & 1)
+
+#define page_mark_changed(info, page) \
+  dynamic_element(&(info)->pinned_pages, (page)->link_offset,            \
+                  MARIA_PINNED_PAGE*)->changed= 1;
+#define page_store_size(share, page)                           \
+  _ma_store_page_used((share), (page)->buff, (page)->size);
+#define page_store_info(share, page) /* flag + used size, as one statement */ \
+  do { _ma_store_keypage_flag((share), (page)->buff, (page)->flag); \
+    _ma_store_page_used((share), (page)->buff, (page)->size); } while (0)
+#ifdef IDENTICAL_PAGES_AFTER_RECOVERY
+void page_cleanup(MARIA_SHARE *share, MARIA_PAGE *page); /* real function in this build */
+#else
+#define page_cleanup(A,B) while (0)
+#endif
+
+extern MARIA_KEY *_ma_make_key(MARIA_HA *info, MARIA_KEY *int_key, uint keynr,
+                               uchar *key, const uchar *record,
+                               MARIA_RECORD_POS filepos, ulonglong trid);
+extern MARIA_KEY *_ma_pack_key(MARIA_HA *info, MARIA_KEY *int_key,
+                               uint keynr, uchar *key,
+                               const uchar *old, key_part_map keypart_map,
+                               HA_KEYSEG ** last_used_keyseg);
+extern void _ma_copy_key(MARIA_KEY *to, const MARIA_KEY *from);
+extern int _ma_read_key_record(MARIA_HA *info, uchar *buf, MARIA_RECORD_POS);
+extern my_bool _ma_read_cache(IO_CACHE *info, uchar *buff,
+                              MARIA_RECORD_POS pos, size_t length,
+                              uint re_read_if_possibly);
+extern ulonglong ma_retrieve_auto_increment(const uchar *key, uint8 key_type);
+extern my_bool _ma_alloc_buffer(uchar **old_addr, size_t *old_size,
+                                size_t new_size);
+extern ulong _ma_rec_unpack(MARIA_HA *info, uchar *to, uchar *from,
+                            ulong reclength);
+extern my_bool _ma_rec_check(MARIA_HA *info, const uchar *record,
+                             uchar *packpos, ulong packed_length,
+                             my_bool with_checkum, ha_checksum checksum);
+extern int _ma_write_part_record(MARIA_HA *info, my_off_t filepos,
+                                 ulong length, my_off_t next_filepos,
+                                 uchar ** record, ulong *reclength,
+                                 int *flag);
+extern void _ma_print_key(FILE *stream, MARIA_KEY *key);
+extern void _ma_print_keydata(FILE *stream, HA_KEYSEG *keyseg,
+                              const uchar *key, uint length);
+extern my_bool _ma_once_init_pack_row(MARIA_SHARE *share, File dfile);
+extern my_bool _ma_once_end_pack_row(MARIA_SHARE *share);
+extern int _ma_read_pack_record(MARIA_HA *info, uchar *buf,
+                                MARIA_RECORD_POS filepos);
+extern int _ma_read_rnd_pack_record(MARIA_HA *, uchar *, MARIA_RECORD_POS,
+                                    my_bool);
+extern int _ma_pack_rec_unpack(MARIA_HA *info, MARIA_BIT_BUFF *bit_buff,
+                               uchar *to, uchar *from, ulong reclength);
+extern ulonglong _ma_safe_mul(ulonglong a, ulonglong b);
+extern int _ma_ft_update(MARIA_HA *info, uint keynr, uchar *keybuf,
+                         const uchar *oldrec, const uchar *newrec,
+                         my_off_t pos);
+
+/*
+  Parameter to _ma_get_block_info() and _ma_pack_get_block_info().
+  The dynamic row header is read into this struct. For an explanation of
+  the fields, look at the function _ma_get_block_info().
+*/
+
+typedef struct st_maria_block_info
+{
+  uchar header[MARIA_BLOCK_INFO_HEADER_LENGTH];
+  ulong rec_len;
+  ulong data_len;
+  ulong block_len;
+  ulong blob_len;
+  MARIA_RECORD_POS filepos;       /* NOTE(review): presumably positions of */
+  MARIA_RECORD_POS next_filepos;  /* this/next/previous block - confirm in */
+  MARIA_RECORD_POS prev_filepos;  /* _ma_get_block_info() */
+  uint second_read;
+  uint offset;
+} MARIA_BLOCK_INFO;
+
+
+/* bits in return from _ma_get_block_info */
+
+#define BLOCK_FIRST	1
+#define BLOCK_LAST	2
+#define BLOCK_DELETED	4
+#define BLOCK_ERROR	8			/* Wrong data */
+#define BLOCK_SYNC_ERROR 16			/* Right data at wrong place */
+#define BLOCK_FATAL_ERROR 32			/* hardware-error */
+
+#define NEED_MEM	((uint) 10*4*(IO_SIZE+32)+32) /* Needed for recursion */
+#define MAXERR			20
+#define BUFFERS_WHEN_SORTING	16		/* Alloc for sort-key-tree */
+#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
+#define INDEX_TMP_EXT		".TMM"
+#define DATA_TMP_EXT		".TMD"
+
+#define UPDATE_TIME		1
+#define UPDATE_STAT		2
+#define UPDATE_SORT		4
+#define UPDATE_AUTO_INC		8
+#define UPDATE_OPEN_COUNT	16
+
+#define USE_BUFFER_INIT		(((1024L*1024L*128-MALLOC_OVERHEAD)/8192)*8192)
+#define READ_BUFFER_INIT	(1024L*256L-MALLOC_OVERHEAD)
+#define SORT_BUFFER_INIT	(1024L*1024L*256-MALLOC_OVERHEAD)
+#define MIN_SORT_BUFFER		(4096-MALLOC_OVERHEAD)
+
+#define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0)
+#define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1)
+
+extern uint _ma_get_block_info(MARIA_BLOCK_INFO *, File, my_off_t);
+extern uint _ma_rec_pack(MARIA_HA *info, uchar *to, const uchar *from);
+extern uint _ma_pack_get_block_info(MARIA_HA *maria, MARIA_BIT_BUFF *bit_buff,
+                                    MARIA_BLOCK_INFO *info, uchar **rec_buff_p,
+                                    size_t *rec_buff_size,
+                                    File file, my_off_t filepos);
+extern void _ma_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _ma_report_error(int errcode, const LEX_STRING *file_name);
+extern my_bool _ma_memmap_file(MARIA_HA *info);
+extern void _ma_unmap_file(MARIA_HA *info);
+extern uint _ma_save_pack_length(uint version, uchar * block_buff,
+                                 ulong length);
+extern uint _ma_calc_pack_length(uint version, ulong length);
+extern ulong _ma_calc_blob_length(uint length, const uchar *pos);
+extern size_t _ma_mmap_pread(MARIA_HA *info, uchar *Buffer,
+			     size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_mmap_pwrite(MARIA_HA *info, const uchar *Buffer,
+			      size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_nommap_pread(MARIA_HA *info, uchar *Buffer,
+			       size_t Count, my_off_t offset, myf MyFlags);
+extern size_t _ma_nommap_pwrite(MARIA_HA *info, const uchar *Buffer,
+				size_t Count, my_off_t offset, myf MyFlags);
+
+/* my_pwrite instead of my_write used */
+#define MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET 1
+/* info should be written */
+#define MA_STATE_INFO_WRITE_FULL_INFO        2
+/* intern_lock taking is needed */
+#define MA_STATE_INFO_WRITE_LOCK             4
+uint _ma_state_info_write(MARIA_SHARE *share, uint pWrite);
+uint _ma_state_info_write_sub(File file, MARIA_STATE_INFO *state, uint pWrite);
+uint _ma_state_info_read_dsk(File file, MARIA_STATE_INFO *state);
+uint _ma_base_info_write(File file, MARIA_BASE_INFO *base);
+my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg);
+uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg);
+my_bool _ma_keydef_write(File file, MARIA_KEYDEF *keydef);
+uchar *_ma_keydef_read(uchar *ptr, MARIA_KEYDEF *keydef);
+my_bool _ma_uniquedef_write(File file, MARIA_UNIQUEDEF *keydef);
+uchar *_ma_uniquedef_read(uchar *ptr, MARIA_UNIQUEDEF *keydef);
+my_bool _ma_columndef_write(File file, MARIA_COLUMNDEF *columndef);
+uchar *_ma_columndef_read(uchar *ptr, MARIA_COLUMNDEF *columndef);
+my_bool _ma_column_nr_write(File file, uint16 *offsets, uint columns);
+uchar *_ma_column_nr_read(uchar *ptr, uint16 *offsets, uint columns);
+ulong _ma_calc_total_blob_length(MARIA_HA *info, const uchar *record);
+ha_checksum _ma_checksum(MARIA_HA *info, const uchar *buf);
+ha_checksum _ma_static_checksum(MARIA_HA *info, const uchar *buf);
+my_bool _ma_check_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                         uchar *record, ha_checksum unique_hash,
+                         MARIA_RECORD_POS pos);
+ha_checksum _ma_unique_hash(MARIA_UNIQUEDEF *def, const uchar *buf);
+my_bool _ma_cmp_static_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                              const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_cmp_dynamic_unique(MARIA_HA *info, MARIA_UNIQUEDEF *def,
+                               const uchar *record, MARIA_RECORD_POS pos);
+my_bool _ma_unique_comp(MARIA_UNIQUEDEF *def, const uchar *a, const uchar *b,
+                        my_bool null_are_equal);
+void _ma_get_status(void *param, my_bool concurrent_insert);
+void _ma_update_status(void *param);
+void _ma_restore_status(void *param);
+void _ma_copy_status(void *to, void *from);
+my_bool _ma_check_status(void *param);
+void _ma_restore_status(void *param);
+void _ma_reset_status(MARIA_HA *maria);
+int _ma_def_scan_remember_pos(MARIA_HA *info, MARIA_RECORD_POS *lastpos);
+void _ma_def_scan_restore_pos(MARIA_HA *info, MARIA_RECORD_POS lastpos);
+
+#include "ma_commit.h"
+
+extern MARIA_HA *_ma_test_if_reopen(const char *filename);
+my_bool _ma_check_table_is_closed(const char *name, const char *where);
+int _ma_open_datafile(MARIA_HA *info, MARIA_SHARE *share, const char *org_name,
+                      File file_to_dup);
+int _ma_open_keyfile(MARIA_SHARE *share);
+void _ma_setup_functions(register MARIA_SHARE *share);
+my_bool _ma_dynmap_file(MARIA_HA *info, my_off_t size);
+void _ma_remap_file(MARIA_HA *info, my_off_t size);
+
+MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record);
+my_bool _ma_write_abort_default(MARIA_HA *info);
+
+C_MODE_START
+#define MARIA_FLUSH_DATA  1
+#define MARIA_FLUSH_INDEX 2
+int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
+                          enum flush_type flush_type_for_data,
+                          enum flush_type flush_type_for_index);
+/*
+  Functions needed by _ma_check (are overridden in MySQL/ha_maria.cc).
+  See ma_check_standalone.h .
+*/
+int _ma_killed_ptr(HA_CHECK *param);
+void _ma_check_print_error _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+void _ma_check_print_warning _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+void _ma_check_print_info _VARARGS((HA_CHECK *param, const char *fmt, ...))
+  ATTRIBUTE_FORMAT(printf, 2, 3);
+my_bool write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info);
+C_MODE_END
+
+int _ma_flush_pending_blocks(MARIA_SORT_PARAM *param);
+int _ma_sort_ft_buf_flush(MARIA_SORT_PARAM *sort_param);
+int _ma_thr_write_keys(MARIA_SORT_PARAM *sort_param);
+#ifdef THREAD
+pthread_handler_t _ma_thr_find_all_keys(void *arg);
+#endif
+
+int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param);
+int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages,
+                             size_t);
+int _ma_sync_table_files(const MARIA_HA *info);
+int _ma_initialize_data_file(MARIA_SHARE *share, File dfile);
+int _ma_update_state_lsns(MARIA_SHARE *share,
+                          LSN lsn, TrID create_trid, my_bool do_sync,
+                          my_bool update_create_rename_lsn);
+int _ma_update_state_lsns_sub(MARIA_SHARE *share, LSN lsn,
+                              TrID create_trid, my_bool do_sync,
+                              my_bool update_create_rename_lsn);
+void _ma_set_data_pagecache_callbacks(PAGECACHE_FILE *file,
+                                      MARIA_SHARE *share);
+void _ma_set_index_pagecache_callbacks(PAGECACHE_FILE *file,
+                                       MARIA_SHARE *share);
+void _ma_tmp_disable_logging_for_table(MARIA_HA *info,
+                                       my_bool log_incomplete);
+my_bool _ma_reenable_logging_for_table(MARIA_HA *info, my_bool flush_pages);
+my_bool write_log_record_for_bulk_insert(MARIA_HA *info);
+void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn);
+
+#define MARIA_NO_CRC_NORMAL_PAGE 0xffffffff
+#define MARIA_NO_CRC_BITMAP_PAGE 0xfffffffe
+extern my_bool maria_page_crc_set_index(uchar *page,
+                                        pgcache_page_no_t page_no,
+                                        uchar *data_ptr);
+extern my_bool maria_page_crc_set_normal(uchar *page,
+                                         pgcache_page_no_t page_no,
+                                         uchar *data_ptr);
+extern my_bool maria_page_crc_check_bitmap(uchar *page,
+                                           pgcache_page_no_t page_no,
+                                           uchar *data_ptr);
+extern my_bool maria_page_crc_check_data(uchar *page,
+                                           pgcache_page_no_t page_no,
+                                           uchar *data_ptr);
+extern my_bool maria_page_crc_check_index(uchar *page,
+                                           pgcache_page_no_t page_no,
+                                           uchar *data_ptr);
+extern my_bool maria_page_crc_check_none(uchar *page,
+                                         pgcache_page_no_t page_no,
+                                         uchar *data_ptr);
+extern my_bool maria_page_filler_set_bitmap(uchar *page,
+                                            pgcache_page_no_t page_no,
+                                            uchar *data_ptr);
+extern my_bool maria_page_filler_set_normal(uchar *page,
+                                            pgcache_page_no_t page_no,
+                                            uchar *data_ptr);
+extern my_bool maria_page_filler_set_none(uchar *page,
+                                          pgcache_page_no_t page_no,
+                                          uchar *data_ptr);
+extern void maria_page_write_failure(uchar* data_ptr);
+extern my_bool maria_flush_log_for_page(uchar *page,
+                                        pgcache_page_no_t page_no,
+                                        uchar *data_ptr);
+extern my_bool maria_flush_log_for_page_none(uchar *page,
+                                             pgcache_page_no_t page_no,
+                                             uchar *data_ptr);
+extern PAGECACHE *maria_log_pagecache;
+extern void ma_set_index_cond_func(MARIA_HA *info, index_cond_func_t func,
+                                   void *func_arg);
+int ma_check_index_cond(register MARIA_HA *info, uint keynr, uchar *record);
diff --git a/storage/maria/maria_ftdump.c b/storage/maria/maria_ftdump.c
new file mode 100644
index 00000000000..870d07fa96e
--- /dev/null
+++ b/storage/maria/maria_ftdump.c
@@ -0,0 +1,282 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code
+   added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
+
+#include "ma_ftdefs.h"
+#include <my_getopt.h>
+
+static void usage();
+static void complain(int val);
+static my_bool get_one_option(int, const struct my_option *, char *);
+
+static int count=0, stats=0, dump=0, lstats=0;
+static my_bool verbose;
+static char *query=NULL;
+static uint lengths[256];
+
+#define MAX_LEN (HA_FT_MAXBYTELEN+10)
+#define HOW_OFTEN_TO_WRITE 10000
+
+static struct my_option my_long_options[] =
+{
+  {"help", 'h', "Display help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?', "Synonym for -h.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"count", 'c', "Calculate per-word stats (counts and global weights).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"dump", 'd', "Dump index (incl. data offsets and word weights).",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"length", 'l', "Report length distribution.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"stats", 's', "Report global stats.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Be verbose.",
+   &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+int main(int argc,char *argv[])
+{
+  int error=0;
+  uint keylen, keylen2=0, inx, doc_cnt=0;
+  float weight= 1.0;
+  double gws, min_gws=0, avg_gws=0;
+  MARIA_HA *info;
+  char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
+  ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0;
+  struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */
+  /* Dump and/or summarize one FULLTEXT index of an Aria table; see usage(). */
+  MY_INIT(argv[0]);
+  if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
+    exit(error);
+  maria_init();
+  if (count || dump)
+    verbose=0;
+  if (!count && !dump && !lstats && !query)
+    stats=1;
+
+  if (verbose)
+    setbuf(stdout,NULL);
+  /* From here on argv[0] is used as the table name, argv[1] as the index number. */
+  if (argc < 2)
+    usage();
+
+  {
+    char *end;
+    inx= (uint) strtoll(argv[1], &end, 10);
+    if (end == argv[1] || *end)                 /* reject empty or non-numeric */
+      usage();
+  }
+
+  init_pagecache(maria_pagecache, USE_BUFFER_INIT, 0, 0,
+                 MARIA_KEY_BLOCK_LENGTH, MY_WME);
+
+  if (!(info=maria_open(argv[0], O_RDONLY,
+                        HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
+  {
+    error=my_errno;
+    goto err;
+  }
+  /* Start with empty strings: the summary prints these even if no word was read. */
+  *buf2= *buf_maxlen= *buf_min_gws= 0;
+  aio->info=info;
+
+  if ((inx >= info->s->base.keys) ||
+      !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
+  {
+    printf("Key %u in table %s is not a FULLTEXT key\n", inx,
+           info->s->open_file_name.str);
+    goto err;
+  }
+
+  maria_lock_database(info, F_EXTRA_LCK);
+
+  info->cur_row.lastpos= HA_OFFSET_ERROR;
+  info->update|= HA_STATE_PREV_FOUND;
+  /* Key layout as read below: 1 length byte, the word, then the weight/subkeys value. */
+  while (!(error=maria_rnext(info,NULL,inx)))
+  {
+    FT_WEIGTH subkeys;
+    keylen=*(info->lastkey_buff);
+
+    subkeys.i= ft_sintXkorr(info->lastkey_buff + keylen + 1);
+    if (subkeys.i >= 0)
+      weight= subkeys.f;
+
+#ifdef HAVE_SNPRINTF
+    snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey_buff+1);
+#else
+    sprintf(buf,"%.*s",(int) keylen,info->lastkey_buff+1);
+#endif
+    my_casedn_str(default_charset_info,buf);
+    total++;
+    lengths[keylen]++;
+
+    if (count || stats)
+    {
+      if (strcmp(buf, buf2))                    /* a new word starts here */
+      {
+        if (*buf2)                              /* close the previous word */
+        {
+          uniq++;
+          avg_gws+=gws=GWS_IN_USE;
+          if (count)
+            printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+          if (maxlen<keylen2)
+          {
+            maxlen=keylen2;
+            strmov(buf_maxlen, buf2);
+          }
+          if (max_doc_cnt < doc_cnt)
+          {
+            max_doc_cnt=doc_cnt;
+            strmov(buf_min_gws, buf2);
+            min_gws=gws;
+          }
+        }
+        strmov(buf2, buf);
+        keylen2=keylen;
+        doc_cnt=0;
+      }
+      doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i);
+    }
+    if (dump)
+    {
+      if (subkeys.i >= 0)
+        printf("%9lx %20.7f %s\n", (ulong) info->cur_row.lastpos,weight,buf);
+      else
+        printf("%9lx => %17d %s\n",(ulong) info->cur_row.lastpos,-subkeys.i,
+               buf);
+    }
+    if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
+      printf("%10lu\r",total);
+  }
+  maria_lock_database(info, F_UNLCK);
+  /* The last word seen has not been accounted for yet; do it now. */
+  if (count || stats)
+  {
+    if (*buf2)
+    {
+      uniq++;
+      avg_gws+=gws=GWS_IN_USE;
+      if (count)
+        printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
+      if (maxlen<keylen2)
+      {
+        maxlen=keylen2;
+        strmov(buf_maxlen, buf2);
+      }
+      if (max_doc_cnt < doc_cnt)
+      {
+        max_doc_cnt=doc_cnt;
+        strmov(buf_min_gws, buf2);
+        min_gws=gws;
+      }
+    }
+  }
+
+  if (stats)
+  {
+    count=0;
+    for (inx=0;inx<256;inx++)
+    {
+      count+=lengths[inx];
+      if ((ulong) count >= total/2)
+        break;
+    }
+    printf("Total rows: %lu\nTotal words: %lu\n"
+           "Unique words: %lu\nLongest word: %lu chars (%s)\n"
+           "Median length: %u\n"
+           "Average global weight: %f\n"
+           "Most common word: %lu times, weight: %f (%s)\n",
+           (ulong) info->state->records, total, uniq, maxlen, buf_maxlen,
+           inx, uniq ? avg_gws/uniq : 0.0, max_doc_cnt, min_gws, buf_min_gws);
+  }
+  if (lstats)
+  {
+    count=0;
+    for (inx=0; inx<256; inx++)
+    {
+      count+=lengths[inx];
+      if (count && lengths[inx])
+        printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
+               (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count,
+               100.0*count/total);
+    }
+  }
+  /* NOTE(review): the process exit status is 0 even when an error was reported. */
+err:
+  if (error && error != HA_ERR_END_OF_FILE)
+    printf("got error %d\n",my_errno);
+  if (info)
+    maria_close(info);
+  maria_end();
+  return 0;
+}
+
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument __attribute__((unused)))
+{					/* Option callback passed to handle_options() in main() */
+  switch(optid) {
+  case 'd':
+    dump=1;
+    complain(count || query);		/* dump excludes count (and query) */
+    break;
+  case 's':
+    stats=1;
+    complain(query!=0);			/* query is never assigned in this file */
+    break;
+  case 'c':
+    count= 1;
+    complain(dump || query);		/* count excludes dump (and query) */
+    break;
+  case 'l':
+    lstats=1;
+    complain(query!=0);
+    break;
+  case '?':
+  case 'h':
+    usage();				/* does not return: usage() calls exit(1) */
+  }
+  return 0;
+}
+
+#include <help_start.h>
+
+static void usage()			/* Print the help text, then exit(1) */
+{
+  printf("Use: aria_ft_dump <table_name> <index_num>\n");
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+  NETWARE_SET_SCREEN_MODE(1);
+  exit(1);				/* also used for invalid arguments, hence non-zero */
+}
+
+#include <help_end.h>
+
+static void complain(int val) /* Print a message and exit(1) if val != 0 */
+{
+  if (val)
+  {
+    printf("You cannot use these options together!\n");
+    exit(1);
+  }
+}
diff --git a/storage/maria/maria_pack.c b/storage/maria/maria_pack.c
new file mode 100644
index 00000000000..1d2d3995bd8
--- /dev/null
+++ b/storage/maria/maria_pack.c
@@ -0,0 +1,3234 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Pack MARIA file */
+
+#ifndef USE_MY_FUNC
+#define USE_MY_FUNC			/* We need at least my_malloc */
+#endif
+
+#include "maria_def.h"
+#include <queues.h>
+#include <my_tree.h>
+#include "mysys_err.h"
+#ifdef MSDOS
+#include <io.h>
+#endif
+#ifndef __GNU_LIBRARY__
+#define __GNU_LIBRARY__			/* Skip warnings in getopt.h */
+#endif
+#include <my_getopt.h>
+#include <assert.h>
+
+#if SIZEOF_LONG_LONG > 4
+#define BITS_SAVED 64
+#else
+#define BITS_SAVED 32
+#endif
+
+#define IS_OFFSET ((uint) 32768)	/* Bit if offset or char in tree */
+#define HEAD_LENGTH	32
+#define ALLOWED_JOIN_DIFF	256	/* Diff allowed to join trees */
+
+#define DATA_TMP_EXT		".TMD"
+#define OLD_EXT			".OLD"
+#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
+
+struct st_file_buffer {
+  File file;
+  uchar *buffer,*pos,*end;
+  my_off_t pos_in_file;
+  int bits;
+  ulonglong bitbucket;
+};
+
+struct st_huff_tree;
+struct st_huff_element;
+
+typedef struct st_huff_counts {
+  uint	field_length,max_zero_fill;
+  uint	pack_type;
+  uint	max_end_space,max_pre_space,length_bits,min_space;
+  ulong max_length;
+  enum en_fieldtype field_type;
+  struct st_huff_tree *tree;		/* Tree for field */
+  my_off_t counts[256];
+  my_off_t end_space[8];
+  my_off_t pre_space[8];
+  my_off_t tot_end_space,tot_pre_space,zero_fields,empty_fields,bytes_packed;
+  TREE int_tree;        /* Tree for detecting distinct column values. */
+  uchar *tree_buff;      /* Column values, 'field_length' each. */
+  uchar *tree_pos;       /* Points to end of column values in 'tree_buff'. */
+} HUFF_COUNTS;
+
+typedef struct st_huff_element HUFF_ELEMENT;
+
+/*
+  WARNING: It is crucial for the optimizations in calc_packed_length()
+  that 'count' is the first element of 'HUFF_ELEMENT'.
+*/
+struct st_huff_element {
+  my_off_t count;
+  union un_element {
+    struct st_nod {
+      HUFF_ELEMENT *left,*right;
+    } nod;
+    struct st_leaf {
+      HUFF_ELEMENT *null;
+      uint	element_nr;		/* Number of element */
+    } leaf;
+  } a;
+};
+
+
+typedef struct st_huff_tree {
+  HUFF_ELEMENT *root,*element_buffer;
+  HUFF_COUNTS *counts;
+  uint tree_number;
+  uint elements;
+  my_off_t bytes_packed;
+  uint tree_pack_length;
+  uint min_chr,max_chr,char_bits,offset_bits,max_offset,height;
+  ulonglong *code;
+  uchar *code_len;
+} HUFF_TREE;
+
+
+typedef struct st_isam_mrg {
+  MARIA_HA **file,**current,**end;	/* file: array of 'count' open tables */
+  uint free_file;			/* 1 if 'file' was my_malloc()ed (open_maria_files), 0 in main() */
+  uint count;				/* Number of entries in 'file' */
+  uint	min_pack_length;		/* These are used by packed data */
+  uint	max_pack_length;
+  uint	ref_length;
+  uint	max_blob_length;
+  my_off_t records;
+  /* true if at least one source file has at least one disabled index */
+  my_bool src_file_has_indexes_disabled;
+} PACK_MRG_INFO;
+
+
+extern int main(int argc,char * *argv);
+static void get_options(int *argc,char ***argv);
+static MARIA_HA *open_maria_file(char *name,int mode);
+static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count);
+static int compress(PACK_MRG_INFO *file,char *join_name);
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records);
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees,
+					   uint trees,
+					   HUFF_COUNTS *huff_counts,
+					   uint fields);
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+			const uchar *s,const uchar *t);
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts);
+static void check_counts(HUFF_COUNTS *huff_counts,uint trees,
+			 my_off_t records);
+static int test_space_compress(HUFF_COUNTS *huff_counts,my_off_t records,
+			       uint max_space_length,my_off_t *space_counts,
+			       my_off_t tot_space_count,
+			       enum en_fieldtype field_type);
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_tree(HUFF_TREE *tree,HUFF_COUNTS *huff_counts);
+static int compare_huff_elements(void *not_used, uchar *a,uchar *b);
+static int save_counts_in_queue(uchar *key,element_count count,
+				    HUFF_TREE *tree);
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,uint flag);
+static uint join_same_trees(HUFF_COUNTS *huff_counts,uint trees);
+static int make_huff_decode_table(HUFF_TREE *huff_tree,uint trees);
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+				    HUFF_ELEMENT *element,uint size,
+				    ulonglong code);
+static int write_header(PACK_MRG_INFO *isam_file, uint header_length,uint trees,
+			my_off_t tot_elements,my_off_t filelength);
+static void write_field_info(HUFF_COUNTS *counts, uint fields,uint trees);
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree,uint trees);
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree,
+				       HUFF_ELEMENT *element,
+				       uint *offset);
+static uint max_bit(uint value);
+static int compress_maria_file(PACK_MRG_INFO *file,HUFF_COUNTS *huff_counts);
+static char *make_new_name(char *new_name,char *old_name);
+static char *make_old_name(char *new_name,char *old_name);
+static void init_file_buffer(File file,pbool read_buffer);
+static int flush_buffer(ulong neaded_length);
+static void end_file_buffer(void);
+static void write_bits(ulonglong value, uint bits);
+static void flush_bits(void);
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
+                      my_off_t new_length, ha_checksum crc);
+static int save_state_mrg(File file,PACK_MRG_INFO *isam_file,
+                          my_off_t new_length, ha_checksum crc);
+static int mrg_close(PACK_MRG_INFO *mrg);
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf);
+static void mrg_reset(PACK_MRG_INFO *mrg);
+#if !defined(DBUG_OFF)
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count);
+static int fakecmp(my_off_t **count1, my_off_t **count2);
+#endif
+
+
+static int error_on_write=0,test_only=0,verbose=0,silent=0,
+	   write_loop=0,force_pack=0, isamchk_neaded=0;
+static int tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
+static my_bool backup, opt_wait;
+/*
+  tree_buff_length is somewhat arbitrary. The bigger it is the better
+  the chance to win in terms of compression factor. On the other hand,
+  this table becomes part of the compressed file header. And its length
+  is coded with 16 bits in the header. Hence the limit is 2**16 - 1.
+*/
+static uint tree_buff_length= 65536 - MALLOC_OVERHEAD;
+static char tmp_dir[FN_REFLEN]={0},*join_table;
+static my_off_t intervall_length;
+static ha_checksum glob_crc;
+static struct st_file_buffer file_buffer;
+static QUEUE queue;
+static HUFF_COUNTS *global_count;
+static char zero_string[]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+static const char *load_default_groups[]= { "ariapack",0 };
+
+	/* The main program */
+
+int main(int argc, char **argv)
+{
+  int error,ok;
+  PACK_MRG_INFO merge;
+  char **default_argv;
+  MY_INIT(argv[0]);
+  /* Keep the argv set by load_defaults() so it can be given to free_defaults(). */
+  load_defaults("my",load_default_groups,&argc,&argv);
+  default_argv= argv;
+  get_options(&argc,&argv);
+  maria_init();
+  /* Either join all named tables into join_table, or pack each table on its own. */
+  error=ok=isamchk_neaded=0;
+  if (join_table)
+  {						/* Join files into one */
+    if (open_maria_files(&merge,argv,(uint) argc) ||
+	compress(&merge,join_table))
+      error=1;
+  }
+  else while (argc--)
+  {
+    MARIA_HA *isam_file;
+    if (!(isam_file=open_maria_file(*argv++,O_RDWR)))
+      error=1;
+    else
+    {
+      merge.file= &isam_file;		/* one-element "array" living on the stack */
+      merge.current=0;
+      merge.free_file=0;
+      merge.count=1;
+      if (compress(&merge,0))
+	error=1;
+      else
+	ok=1;
+    }
+  }
+  if (ok && isamchk_neaded && !silent)
+    puts("Remember to run aria_chk -rq on compressed tables");
+  VOID(fflush(stdout));
+  VOID(fflush(stderr));
+  free_defaults(default_argv);
+  maria_end();
+  my_end(verbose ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
+  exit(error ? 2 : 0);				/* 2 if any table failed */
+#ifndef _lint
+  return 0;					/* No compiler warning */
+#endif
+}
+
+enum options_mp {OPT_CHARSETS_DIR_MP=256, OPT_AUTO_CLOSE};
+
+static struct my_option my_long_options[] =
+{
+#ifdef __NETWARE__
+  {"autoclose", OPT_AUTO_CLOSE, "Auto close the screen on exit for Netware.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"backup", 'b', "Make a backup of the table as table_name.OLD.",
+   &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR_MP,
+   "Directory where character sets are.", (char**) &charsets_dir,
+   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+  {"force", 'f',
+   "Force packing of table even if it gets bigger or if tempfile exists.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"join", 'j',
+   "Join all given tables into 'new_table_name'. All tables MUST have identical layouts.",
+   &join_table, &join_table, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
+   0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"silent", 's', "Be more silent.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 'T', "Use temporary directory to store temporary table.",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"test", 't', "Don't pack table, only test packing it.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Write info about progress and packing result. Use many -v for more verbosity!",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Output version information and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"wait", 'w', "Wait and retry if table is in use.", &opt_wait,
+   &opt_wait, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+#include <help_start.h>
+
+static void print_version(void)
+{
+  VOID(printf("%s Ver 1.0 for %s on %s\n",
+              my_progname, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)
+{
+  print_version();
+  puts("Copyright 2002-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Pack a Aria-table to take much less space.");
+  puts("Keys are not updated, you must run aria_chk -rq on the index (.MAI) file");
+  puts("afterwards to update the keys.");
+  puts("You should give the .MAI file as the filename argument.");
+  puts("To unpack a packed table, run aria_chk -u on the table");
+
+  VOID(printf("\nUsage: %s [OPTIONS] filename...\n", my_progname));
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument)
+{
+  uint length;
+  /* Option callback for handle_options(): applies one option to the file-level flags. */
+  switch(optid) {
+#ifdef __NETWARE__
+  case OPT_AUTO_CLOSE:
+    setscreenmode(SCR_AUTOCLOSE_ON_EXIT);
+    break;
+#endif
+  case 'f':
+    force_pack= 1;
+    tmpfile_createflag= O_RDWR | O_TRUNC;	/* Same as the default minus O_EXCL */
+    break;
+  case 's':
+    write_loop= verbose= 0;
+    silent= 1;
+    break;
+  case 't':
+    test_only= 1;
+    /* Avoid to reset 'verbose' if it was already set > 1. */
+    if (! verbose)
+      verbose= 1;
+    break;
+  case 'T':					/* Bounded copy: keep room for FN_LIBCHAR + NUL */
+    length= (uint) (strmake(tmp_dir, argument, sizeof(tmp_dir)-2) - tmp_dir);
+    if (length != dirname_length(tmp_dir))
+    {						/* Append the missing directory separator */
+      tmp_dir[length]=FN_LIBCHAR;
+      tmp_dir[length+1]=0;
+    }
+    break;
+  case 'v':
+    verbose++; /* Allow for selecting the level of verbosity. */
+    silent= 0;
+    break;
+  case '#':
+    DBUG_PUSH(argument ? argument : "d:t:o,/tmp/aria_pack.trace");
+    break;
+  case 'V':
+    print_version();
+    exit(0);
+  case 'I':
+  case '?':
+    usage();
+    exit(0);
+  }
+  return 0;
+}
+
+	/* reads options */
+	/* Initiates DEBUG - but no debugging here ! */
+
+static void get_options(int *argc,char ***argv)
+{
+  int ho_error;
+  /* Progress output (write_loop) is enabled only when stdout is a terminal. */
+  my_progname= argv[0][0];
+  if (isatty(fileno(stdout)))
+    write_loop=1;
+  /* A non-zero result from handle_options() becomes the process exit status. */
+  if ((ho_error=handle_options(argc, argv, my_long_options, get_one_option)))
+    exit(ho_error);
+  /* At least one argument must remain after option parsing. */
+  if (!*argc)
+  {
+    usage();
+    exit(1);
+  }
+  if (join_table)
+  {
+    backup=0;					/* Not needed */
+    tmp_dir[0]=0;				/* --tmpdir is discarded when joining */
+  }
+  return;
+}
+
+
+/*
+  Open the table 'name' with maria_open() and prepare it for packing.
+
+  name  table name, as passed to maria_open()
+  mode  open mode, as passed to maria_open()
+
+  The table is closed again and rejected when
+  - it is already compressed, no join is requested and force_pack is
+    not set, or
+  - force_pack is not set and the non-empty table holds a single
+    record or less than 1024 bytes of data.
+
+  Returns the open handle after requesting an F_WRLCK lock on it, or 0
+  on failure (a message has then been written to stderr).
+*/
+static MARIA_HA *open_maria_file(char *name,int mode)
+{
+  MARIA_HA *table;
+  MARIA_SHARE *table_share;
+  DBUG_ENTER("open_maria_file");
+
+  table= maria_open(name, mode,
+                    HA_OPEN_IGNORE_MOVED_STATE |
+                    (opt_wait ? HA_OPEN_WAIT_IF_LOCKED :
+                     HA_OPEN_ABORT_IF_LOCKED));
+  if (!table)
+  {
+    VOID(fprintf(stderr, "%s gave error %d on open\n", name, my_errno));
+    DBUG_RETURN(0);
+  }
+  table_share= table->s;
+
+  if ((table_share->options & HA_OPTION_COMPRESS_RECORD) && !join_table)
+  {
+    if (!force_pack)
+    {
+      VOID(fprintf(stderr, "%s is already compressed\n", name));
+      VOID(maria_close(table));
+      DBUG_RETURN(0);
+    }
+    if (verbose)
+      puts("Recompressing already compressed table");
+    /* The table is going to be modified, so drop the read-only flag. */
+    table_share->options&= ~HA_OPTION_READ_ONLY_DATA;
+  }
+
+  if (!force_pack && table_share->state.state.records != 0)
+  {
+    if (table_share->state.state.records <= 1 ||
+        table_share->state.state.data_file_length < 1024)
+    {
+      VOID(fprintf(stderr, "%s is too small to compress\n", name));
+      VOID(maria_close(table));
+      DBUG_RETURN(0);
+    }
+  }
+
+  VOID(maria_lock_database(table,F_WRLCK));
+  maria_ignore_trids(table);
+  DBUG_RETURN(table);
+}
+
+
+/*
+  Open all source tables and check that they have the same layout.
+
+  mrg    merge descriptor to fill in; mrg->file is allocated here
+  names  names of the tables to open
+  count  number of entries in 'names'
+
+  Two neighbouring tables are considered identical when their record
+  length, number of fields and the type and length of every column
+  agree.
+
+  Returns 0 on success, with mrg->count set to 'count'.  Returns 1 on
+  failure; all tables opened so far are then closed and mrg->file is
+  freed.
+*/
+static my_bool open_maria_files(PACK_MRG_INFO *mrg,char **names,uint count)
+{
+  uint i,j;
+  mrg->count=0;
+  mrg->current=0;
+  mrg->file=(MARIA_HA**) my_malloc(sizeof(MARIA_HA*)*count,MYF(MY_FAE));
+  mrg->free_file=1;
+  mrg->src_file_has_indexes_disabled= 0;
+  for (i=0; i < count ; i++)
+  {
+    if (!(mrg->file[i]=open_maria_file(names[i],O_RDONLY)))
+      goto error;
+
+    mrg->src_file_has_indexes_disabled|=
+      ! maria_is_all_keys_active(mrg->file[i]->s->state.key_map,
+                              mrg->file[i]->s->base.keys);
+  }
+  /*
+    Check that files are identical.
+    'count' is unsigned: compare j+1 against it instead of j against
+    count-1, which would wrap around for count == 0 and read far
+    beyond the end of mrg->file.
+  */
+  for (j=0 ; j+1 < count ; j++)
+  {
+    MARIA_COLUMNDEF *m1,*m2,*end;
+    if (mrg->file[j]->s->base.reclength != mrg->file[j+1]->s->base.reclength ||
+	mrg->file[j]->s->base.fields != mrg->file[j+1]->s->base.fields)
+      goto diff_file;
+    m1=mrg->file[j]->s->columndef;
+    end=m1+mrg->file[j]->s->base.fields;
+    m2=mrg->file[j+1]->s->columndef;
+    for ( ; m1 != end ; m1++,m2++)
+    {
+      if (m1->type != m2->type || m1->length != m2->length)
+	goto diff_file;
+    }
+  }
+  mrg->count=count;
+  return 0;
+
+ diff_file:
+  VOID(fprintf(stderr, "%s: Tables '%s' and '%s' are not identical\n",
+               my_progname, names[j], names[j+1]));
+ error:
+  /* 'i' is the number of tables that were opened successfully. */
+  while (i--)
+    maria_close(mrg->file[i]);
+  my_free(mrg->file, MYF(0));
+  return 1;
+}
+
+
+/*
+  Pack the table(s) described by 'mrg'.
+
+  mrg           source table(s); mrg->file[0] is used as the example
+                for the table layout
+  result_table  name of the table to create when joining tables, or
+                NULL to replace the data file of the source table
+
+  The function gathers statistics over all records, builds the Huffman
+  trees, writes the compression header and the compressed records to a
+  new file and finally puts that file in place.  With test_only set no
+  file is created.
+
+  Returns 0 on success and -1 on failure.  'mrg' is closed with
+  mrg_close() on every path.
+*/
+static int compress(PACK_MRG_INFO *mrg,char *result_table)
+{
+  int error;
+  File new_file,join_maria_file;
+  MARIA_HA *isam_file;
+  MARIA_SHARE *share;
+  char org_name[FN_REFLEN],new_name[FN_REFLEN],temp_name[FN_REFLEN];
+  uint i,header_length,fields,trees,used_trees;
+  my_off_t old_length,new_length,tot_elements;
+  HUFF_COUNTS *huff_counts;
+  HUFF_TREE *huff_trees;
+  DBUG_ENTER("compress");
+
+  isam_file=mrg->file[0];			/* Take this as an example */
+  share=isam_file->s;
+  new_file=join_maria_file= -1;
+  trees=fields=0;
+  huff_trees=0;
+  huff_counts=0;
+  maria_block_size= isam_file->s->block_size;
+
+  /* Create temporary or join file */
+  if (backup)
+    VOID(fn_format(org_name,isam_file->s->open_file_name.str,
+                   "",MARIA_NAME_DEXT, 2));
+  else
+    VOID(fn_format(org_name,isam_file->s->open_file_name.str,
+                   "",MARIA_NAME_DEXT, 2+4+16));
+
+  if (init_pagecache(maria_pagecache, MARIA_MIN_PAGE_CACHE_SIZE, 0, 0,
+                     maria_block_size, MY_WME) == 0)
+  {
+    fprintf(stderr, "Can't initialize page cache\n");
+    goto err;
+  }
+
+  if (!test_only && result_table)
+  {
+    /* Make a new indexfile based on first file in list */
+    uint length;
+    uchar *buff;
+    strmov(org_name,result_table);		/* Fix error messages */
+    VOID(fn_format(new_name,result_table,"",MARIA_NAME_IEXT,2));
+    if ((join_maria_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME)))
+	< 0)
+      goto err;
+    length=(uint) share->base.keystart;
+    if (!(buff= (uchar*) my_malloc(length,MYF(MY_WME))))
+      goto err;
+    if (my_pread(share->kfile.file, buff, length, 0L, MYF(MY_WME | MY_NABP)) ||
+	my_write(join_maria_file,buff,length,
+		 MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+    {
+      my_free(buff,MYF(0));
+      goto err;
+    }
+    my_free(buff,MYF(0));
+    VOID(fn_format(new_name,result_table,"",MARIA_NAME_DEXT,2));
+  }
+  else if (!tmp_dir[0])
+    VOID(make_new_name(new_name,org_name));
+  else
+    VOID(fn_format(new_name,org_name,tmp_dir,DATA_TMP_EXT,1+2+4));
+  if (!test_only &&
+      (new_file=my_create(new_name,0,tmpfile_createflag,MYF(MY_WME))) < 0)
+    goto err;
+
+  /* Start calculating statistics */
+
+  mrg->records=0;
+  for (i=0 ; i < mrg->count ; i++)
+    mrg->records+=mrg->file[i]->s->state.state.records;
+
+  DBUG_PRINT("info", ("Compressing %s: (%lu records)",
+                      result_table ? new_name : org_name,
+                      (ulong) mrg->records));
+  if (write_loop || verbose)
+  {
+    VOID(printf("Compressing %s: (%lu records)\n",
+                result_table ? new_name : org_name, (ulong) mrg->records));
+  }
+  trees=fields=share->base.fields;
+  /*
+    init_huff_count() returns NULL when the array cannot be allocated.
+    get_statistic() dereferences the array, so give up here instead.
+  */
+  if (!(huff_counts=init_huff_count(isam_file,mrg->records)))
+    goto err;
+  QUICK_SAFEMALLOC;
+
+  /*
+    Read the whole data file(s) for statistics.
+  */
+  DBUG_PRINT("info", ("- Calculating statistics"));
+  if (write_loop || verbose)
+    VOID(printf("- Calculating statistics\n"));
+  if (get_statistic(mrg,huff_counts))
+    goto err;
+  NORMAL_SAFEMALLOC;
+  old_length=0;
+  for (i=0; i < mrg->count ; i++)
+    old_length+= (mrg->file[i]->s->state.state.data_file_length -
+		  mrg->file[i]->s->state.state.empty);
+
+  /*
+    Create a global priority queue in preparation for making
+    temporary Huffman trees.
+  */
+  if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
+    goto err;
+
+  /*
+    Check each column if we should use pre-space-compress, end-space-
+    compress, empty-field-compress or zero-field-compress.
+  */
+  check_counts(huff_counts,fields,mrg->records);
+
+  /*
+    Build a Huffman tree for each column.
+  */
+  huff_trees=make_huff_trees(huff_counts,trees);
+
+  /*
+    If the packed lengths of combined columns is less then the sum of
+    the non-combined columns, then create common Huffman trees for them.
+    We do this only for uchar compressed columns, not for distinct values
+    compressed columns.
+  */
+  if ((int) (used_trees=join_same_trees(huff_counts,trees)) < 0)
+    goto err;
+
+  /*
+    Assign codes to all uchar or column values.
+  */
+  if (make_huff_decode_table(huff_trees,fields))
+    goto err;
+
+  /* Prepare a file buffer. */
+  init_file_buffer(new_file,0);
+
+  /*
+    Reserve space in the target file for the fixed compressed file header.
+  */
+  file_buffer.pos_in_file=HEAD_LENGTH;
+  if (! test_only)
+    VOID(my_seek(new_file,file_buffer.pos_in_file,MY_SEEK_SET,MYF(0)));
+
+  /*
+    Write field infos: field type, pack type, length bits, tree number.
+  */
+  write_field_info(huff_counts,fields,used_trees);
+
+  /*
+    Write decode trees.
+  */
+  if (!(tot_elements=write_huff_tree(huff_trees,trees)))
+    goto err;
+
+  /*
+    Calculate the total length of the compression info header.
+    This includes the fixed compressed file header, the column compression
+    type descriptions, and the decode trees.
+  */
+  header_length=(uint) file_buffer.pos_in_file+
+    (uint) (file_buffer.pos-file_buffer.buffer);
+
+  /*
+    Compress the source file into the target file.
+  */
+  DBUG_PRINT("info", ("- Compressing file"));
+  if (write_loop || verbose)
+    VOID(printf("- Compressing file\n"));
+  error=compress_maria_file(mrg,huff_counts);
+  new_length=file_buffer.pos_in_file;
+  if (!error && !test_only)
+  {
+    uchar buff[MEMMAP_EXTRA_MARGIN];		/* End marginal for memmap */
+    bzero(buff,sizeof(buff));
+    error=my_write(file_buffer.file,buff,sizeof(buff),
+		   MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) != 0;
+  }
+
+  /*
+    Write the fixed compressed file header.
+  */
+  if (!error)
+    error=write_header(mrg,header_length,used_trees,tot_elements,
+		       new_length);
+
+  /* Flush the file buffer. */
+  end_file_buffer();
+
+  /* Display statistics. */
+  DBUG_PRINT("info", ("Min record length: %6d  Max length: %6d  "
+                      "Mean total length: %6ld",
+                      mrg->min_pack_length, mrg->max_pack_length,
+                      (ulong) (mrg->records ? (new_length/mrg->records) : 0)));
+  if (verbose && mrg->records)
+    VOID(printf("Min record length: %6d   Max length: %6d   "
+                "Mean total length: %6ld\n", mrg->min_pack_length,
+                mrg->max_pack_length, (ulong) (new_length/mrg->records)));
+
+  /* Close source and target file. */
+  if (!test_only)
+  {
+    error|=my_close(new_file,MYF(MY_WME));
+    if (!result_table)
+    {
+      error|=my_close(isam_file->dfile.file, MYF(MY_WME));
+      isam_file->dfile.file= -1;	/* Tell maria_close file is closed */
+      isam_file->s->bitmap.file.file= -1;
+    }
+  }
+
+  /* Cleanup. */
+  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+  if (! test_only && ! error)
+  {
+    if (result_table)
+    {
+      error=save_state_mrg(join_maria_file,mrg,new_length,glob_crc);
+    }
+    else
+    {
+      if (backup)
+      {
+	/* Keep the original under the "old" name, then install the new file. */
+	if (my_rename(org_name,make_old_name(temp_name,
+                                             isam_file->s->open_file_name.str),
+		      MYF(MY_WME)))
+	  error=1;
+	else
+	{
+	  if (tmp_dir[0])
+	    error=my_copy(new_name,org_name,MYF(MY_WME));
+	  else
+	    error=my_rename(new_name,org_name,MYF(MY_WME));
+	  if (!error)
+          {
+	    VOID(my_copystat(temp_name,org_name,MYF(MY_COPYTIME)));
+            if (tmp_dir[0])
+              VOID(my_delete(new_name,MYF(MY_WME)));
+          }
+	}
+      }
+      else
+      {
+	if (tmp_dir[0])
+        {
+	  error=my_copy(new_name,org_name,
+			MYF(MY_WME | MY_HOLD_ORIGINAL_MODES | MY_COPYTIME));
+          if (!error)
+            VOID(my_delete(new_name,MYF(MY_WME)));
+        }
+	else
+	  error=my_redel(org_name,new_name,MYF(MY_WME | MY_COPYTIME));
+      }
+      if (! error)
+	error=save_state(isam_file,mrg,new_length,glob_crc);
+    }
+  }
+  error|=mrg_close(mrg);
+  if (join_maria_file >= 0)
+    error|=my_close(join_maria_file,MYF(MY_WME));
+  if (error)
+  {
+    VOID(fprintf(stderr, "Aborting: %s is not compressed\n", org_name));
+    VOID(my_delete(new_name,MYF(MY_WME)));
+    DBUG_RETURN(-1);
+  }
+  if (write_loop || verbose)
+  {
+    if (old_length)
+      VOID(printf("%.4g%%     \n",
+                  (((longlong) (old_length - new_length)) * 100.0 /
+                   (longlong) old_length)));
+    else
+      puts("Empty file saved in compressed format");
+  }
+  DBUG_RETURN(0);
+
+ err:
+  /* Common failure exit: release everything acquired so far. */
+  end_pagecache(maria_pagecache, 1);
+  free_counts_and_tree_and_queue(huff_trees,trees,huff_counts,fields);
+  if (new_file >= 0)
+    VOID(my_close(new_file,MYF(0)));
+  if (join_maria_file >= 0)
+    VOID(my_close(join_maria_file,MYF(0)));
+  mrg_close(mrg);
+  VOID(fprintf(stderr, "Aborted: %s is not compressed\n", org_name));
+  DBUG_RETURN(-1);
+}
+
+	/* Init a huff_count-struct for each field and init it */
+
+/*
+  Allocate a zero-filled HUFF_COUNTS array with one element per column
+  of 'info' and initialise every element.
+
+  info     table whose column definitions are used
+  records  number of records; when zero no value buffers are allocated
+
+  Returns the array, or NULL if it could not be allocated.
+*/
+static HUFF_COUNTS *init_huff_count(MARIA_HA *info,my_off_t records)
+{
+  uint col;
+  HUFF_COUNTS *counts;
+
+  counts= (HUFF_COUNTS*) my_malloc(info->s->base.fields*
+                                   sizeof(HUFF_COUNTS),
+                                   MYF(MY_ZEROFILL | MY_WME));
+  if (!counts)
+    return counts;
+
+  for (col= 0 ; col < info->s->base.fields ; col++)
+  {
+    HUFF_COUNTS *cur= counts + col;
+    enum en_fieldtype type;
+
+    cur->field_length= info->s->columndef[col].length;
+    cur->field_type= (enum en_fieldtype) info->s->columndef[col].type;
+    type= cur->field_type;
+
+    /* These types are handled like normal fields in the checks below. */
+    switch (type) {
+    case FIELD_INTERVALL:
+    case FIELD_CONSTANT:
+    case FIELD_ZERO:
+      type= FIELD_NORMAL;
+      break;
+    default:
+      break;
+    }
+
+    if (cur->field_length <= 8 &&
+        (type == FIELD_NORMAL || type == FIELD_SKIP_ZERO))
+      cur->max_zero_fill= cur->field_length;
+
+    /*
+      Every column gets a tree that is used to detect distinct column
+      values. 'int_tree' works together with 'tree_buff' and 'tree_pos':
+      its keys are pointers into 'tree_buff', which is what the element
+      size of -1 stands for.
+    */
+    init_tree(&cur->int_tree,0,0,-1,(qsort_cmp2) compare_tree,0, NULL,
+              NULL);
+
+    /* Value buffers are only used for non-empty, fixed length columns. */
+    if (records && type != FIELD_BLOB && type != FIELD_VARCHAR)
+    {
+      cur->tree_buff=
+        my_malloc(cur->field_length > 1 ? tree_buff_length : 2,
+                  MYF(MY_WME));
+      cur->tree_pos= cur->tree_buff;
+    }
+  }
+  return counts;
+}
+
+
+	/* Free memory used by counts and trees */
+
+/*
+  Release the Huffman trees, the column counts and the global queue.
+
+  huff_trees   array of 'trees' trees, or NULL
+  trees        number of elements in huff_trees
+  huff_counts  array of 'fields' column counts, or NULL
+  fields       number of elements in huff_counts
+*/
+static void free_counts_and_tree_and_queue(HUFF_TREE *huff_trees, uint trees,
+					   HUFF_COUNTS *huff_counts,
+					   uint fields)
+{
+  if (huff_trees)
+  {
+    HUFF_TREE *tree= huff_trees;
+    HUFF_TREE *trees_end= huff_trees + trees;
+
+    for ( ; tree != trees_end ; tree++)
+    {
+      if (tree->element_buffer)
+        my_free(tree->element_buffer,MYF(0));
+      if (tree->code)
+        my_free(tree->code,MYF(0));
+    }
+    my_free(huff_trees,MYF(0));
+  }
+
+  if (huff_counts)
+  {
+    HUFF_COUNTS *column= huff_counts;
+    HUFF_COUNTS *columns_end= huff_counts + fields;
+
+    for ( ; column != columns_end ; column++)
+    {
+      /* The tree is only deleted together with its value buffer. */
+      if (!column->tree_buff)
+        continue;
+      my_free(column->tree_buff,MYF(0));
+      delete_tree(&column->int_tree);
+    }
+    my_free(huff_counts, MYF(0));
+  }
+
+  delete_queue(&queue);		/* This is safe to free */
+}
+
+	/* Read through old file and gather some statistics */
+
+/*
+  Read all records of the source table(s) and gather statistics for
+  every column.
+
+  mrg          source tables, read record by record with mrg_rrnd()
+  huff_counts  one element per column; the value counts, space counts,
+               zero counts and distinct value trees are updated
+
+  Besides huff_counts the function sets glob_crc, mrg->records and
+  mrg->max_blob_length.
+
+  Returns 0 when reading ended with HA_ERR_END_OF_FILE and non-zero
+  when it was stopped by another error.
+*/
+static int get_statistic(PACK_MRG_INFO *mrg,HUFF_COUNTS *huff_counts)
+{
+  int error;
+  uint length, null_bytes;
+  ulong reclength,max_blob_length;
+  uchar *record,*pos,*next_pos,*end_pos,*start_pos;
+  ha_rows record_count;
+  HUFF_COUNTS *count,*end_count;
+  TREE_ELEMENT *element;
+  ha_checksum(*calc_checksum)(MARIA_HA *, const uchar *);
+  DBUG_ENTER("get_statistic");
+
+  reclength=  mrg->file[0]->s->base.reclength;
+  null_bytes= mrg->file[0]->s->base.null_bytes;
+  record=(uchar*) my_alloca(reclength);
+  end_count=huff_counts+mrg->file[0]->s->base.fields;
+  record_count=0; glob_crc=0;
+  max_blob_length=0;
+
+  /* Check how to calculate checksum */
+  if (mrg->file[0]->s->data_file_type == STATIC_RECORD)
+    calc_checksum= _ma_static_checksum;
+  else
+    calc_checksum= _ma_checksum;
+
+  mrg_reset(mrg);
+  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
+  {
+    ulong tot_blob_length=0;
+    if (! error)
+    {
+      /* glob_crc is a checksum over all bytes of all records. */
+      glob_crc+= (*calc_checksum)(mrg->file[0],record);
+
+      /*
+        Count the incidence of values separately for every column.
+        Note that every 'continue' below advances to the next column
+        of the current record.
+      */
+      for (pos=record + null_bytes, count=huff_counts ;
+	   count < end_count ;
+	   count++,
+	   pos=next_pos)
+      {
+	next_pos=end_pos=(start_pos=pos)+count->field_length;
+
+	/*
+          Put the whole column value in a tree if there is room for it.
+          'int_tree' is used to quickly check for duplicate values.
+          'tree_buff' collects as many distinct column values as
+          possible. If the field length is > 1, it is tree_buff_length,
+          else 2 bytes. Each value is 'field_length' bytes big. If there
+          are more distinct column values than fit into the buffer, we
+          give up with this tree. BLOBs and VARCHARs do not have a
+          tree_buff as it can only be used with fixed length columns.
+          For the special case of field length == 1, we handle only the
+          case that there is only one distinct value in the table(s).
+          Otherwise, we can have a maximum of 256 distinct values. This
+          is then handled by the normal Huffman tree build.
+
+          Another limit for collecting distinct column values is the
+          number of values itself. Since we would need to build a
+          Huffman tree for the values, we are limited by the 'IS_OFFSET'
+          constant. This constant expresses a bit which is used to
+          determine if a tree element holds a final value or an offset
+          to a child element. Hence, all values and offsets need to be
+          smaller than 'IS_OFFSET'. A tree element is implemented with
+          two integer values, one for the left branch and one for the
+          right branch. For the extreme case that the first element
+          points to the last element, the number of integers in the tree
+          must be less or equal to IS_OFFSET. So the number of elements
+          must be less or equal to IS_OFFSET / 2.
+
+          WARNING: At first, we insert a pointer into the record buffer
+          as the key for the tree. If we got a new distinct value, which
+          is really inserted into the tree, instead of being counted
+          only, we will copy the column value from the record buffer to
+          'tree_buff' and adjust the key pointer of the tree accordingly.
+        */
+	if (count->tree_buff)
+	{
+	  global_count=count;
+	  if (!(element=tree_insert(&count->int_tree,pos, 0,
+				    count->int_tree.custom_arg)) ||
+	      (element->count == 1 &&
+	       (count->tree_buff + tree_buff_length <
+                count->tree_pos + count->field_length)) ||
+              (count->int_tree.elements_in_tree > IS_OFFSET / 2) ||
+	      (count->field_length == 1 &&
+	       count->int_tree.elements_in_tree > 1))
+	  {
+	    /* Give up collecting distinct values for this column. */
+	    delete_tree(&count->int_tree);
+	    my_free(count->tree_buff,MYF(0));
+	    count->tree_buff=0;
+	  }
+	  else
+	  {
+            /*
+              If tree_insert() succeeds, it either creates a new element
+              or increments the counter of an existing element.
+            */
+	    if (element->count == 1)
+	    {
+              /* Copy the new column value into 'tree_buff'. */
+	      memcpy(count->tree_pos,pos,(size_t) count->field_length);
+              /* Adjust the key pointer in the tree. */
+	      tree_set_pointer(element,count->tree_pos);
+              /* Point behind the last column value so far. */
+	      count->tree_pos+=count->field_length;
+	    }
+	  }
+	}
+
+	/* Save character counters and space-counts and zero-field-counts */
+	if (count->field_type == FIELD_NORMAL ||
+	    count->field_type == FIELD_SKIP_ENDSPACE)
+	{
+          /* Ignore trailing space. */
+	  for ( ; end_pos > pos ; end_pos--)
+	    if (end_pos[-1] != ' ')
+	      break;
+          /* Empty fields are just counted. Go to the next column. */
+	  if (end_pos == pos)
+	  {
+	    count->empty_fields++;
+	    count->max_zero_fill=0;
+	    continue;
+	  }
+          /*
+            Count the total of all trailing spaces and the number of
+            short trailing spaces. Remember the longest trailing space.
+          */
+	  length= (uint) (next_pos-end_pos);
+	  count->tot_end_space+=length;
+	  if (length < 8)
+	    count->end_space[length]++;
+	  if (count->max_end_space < length)
+	    count->max_end_space = length;
+	}
+
+	if (count->field_type == FIELD_NORMAL ||
+	    count->field_type == FIELD_SKIP_PRESPACE)
+	{
+          /* Ignore leading space. */
+	  for (pos=start_pos; pos < end_pos ; pos++)
+	    if (pos[0] != ' ')
+	      break;
+          /* Empty fields are just counted. Go to the next column. */
+	  if (end_pos == pos)
+	  {
+	    count->empty_fields++;
+	    count->max_zero_fill=0;
+	    continue;
+	  }
+          /*
+            Count the total of all leading spaces and the number of
+            short leading spaces. Remember the longest leading space.
+          */
+	  length= (uint) (pos-start_pos);
+	  count->tot_pre_space+=length;
+	  if (length < 8)
+	    count->pre_space[length]++;
+	  if (count->max_pre_space < length)
+	    count->max_pre_space = length;
+	}
+
+        /* Calculate pos, end_pos, and max_length for variable length fields. */
+	if (count->field_type == FIELD_BLOB)
+	{
+	  uint field_length=count->field_length -portable_sizeof_char_ptr;
+	  ulong blob_length= _ma_calc_blob_length(field_length, start_pos);
+	  /* The pointer to the blob data is stored behind the length bytes. */
+	  memcpy_fixed((char*) &pos,  start_pos+field_length,sizeof(char*));
+	  end_pos=pos+blob_length;
+	  tot_blob_length+=blob_length;
+	  set_if_bigger(count->max_length,blob_length);
+	}
+	else if (count->field_type == FIELD_VARCHAR)
+	{
+          uint pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
+	  length= (pack_length == 1 ? (uint) *(uchar*) start_pos :
+                   uint2korr(start_pos));
+	  pos= start_pos+pack_length;
+	  end_pos= pos+length;
+	  set_if_bigger(count->max_length,length);
+	}
+
+        /* Evaluate 'max_zero_fill' for short fields. */
+	if (count->field_length <= 8 &&
+	    (count->field_type == FIELD_NORMAL ||
+	     count->field_type == FIELD_SKIP_ZERO))
+	{
+	  uint i;
+          /* Zero fields are just counted. Go to the next column. */
+	  if (!memcmp(start_pos, zero_string, count->field_length))
+	  {
+	    count->zero_fields++;
+	    continue;
+	  }
+          /*
+            max_zero_fill starts with field_length. It is decreased every
+            time a shorter "zero trailer" is found. It is set to zero when
+            an empty field is found (see above). This suggests that the
+            variable should be called 'min_zero_fill'.
+          */
+	  for (i =0 ; i < count->max_zero_fill && ! end_pos[-1 - (int) i] ;
+	       i++) ;
+	  if (i < count->max_zero_fill)
+	    count->max_zero_fill=i;
+	}
+
+        /* Ignore zero fields and check fields. */
+	if (count->field_type == FIELD_ZERO ||
+	    count->field_type == FIELD_CHECK)
+	  continue;
+
+        /*
+          Count the incidence of every uchar value in the
+          significant field value.
+        */
+	for ( ; pos < end_pos ; pos++)
+	  count->counts[(uchar) *pos]++;
+
+        /* Step to next field. */
+      }
+
+      if (tot_blob_length > max_blob_length)
+	max_blob_length=tot_blob_length;
+      record_count++;
+      /* Show the progress counter every WRITE_COUNT records. */
+      if (write_loop && record_count % WRITE_COUNT == 0)
+      {
+	VOID(printf("%lu\r", (ulong) record_count));
+        VOID(fflush(stdout));
+      }
+    }
+    else if (error != HA_ERR_RECORD_DELETED)
+    {
+      /* Deleted records are skipped; any other error stops the scan. */
+      VOID(fprintf(stderr, "Got error %d while reading rows\n", error));
+      break;
+    }
+
+    /* Step to next record. */
+  }
+  if (write_loop)
+  {
+    /* Blank out the progress counter. */
+    VOID(printf("            \r"));
+    VOID(fflush(stdout));
+  }
+
+  /*
+    If --debug=d,fakebigcodes is set, fake the counts to get big Huffman
+    codes.
+  */
+  DBUG_EXECUTE_IF("fakebigcodes", fakebigcodes(huff_counts, end_count););
+
+  DBUG_PRINT("info", ("Found the following number of incidents "
+                      "of the uchar codes:"));
+  if (verbose >= 2)
+    VOID(printf("Found the following number of incidents "
+                "of the uchar codes:\n"));
+  for (count= huff_counts ; count < end_count; count++)
+  {
+    uint      idx;
+    my_off_t  total_count;
+    char      llbuf[32];
+
+    DBUG_PRINT("info", ("column: %3u", (uint) (count - huff_counts + 1)));
+    if (verbose >= 2)
+      VOID(printf("column: %3u\n", (uint) (count - huff_counts + 1)));
+    if (count->tree_buff)
+    {
+      DBUG_PRINT("info", ("number of distinct values: %u",
+                          (uint) ((count->tree_pos - count->tree_buff) /
+                                  count->field_length)));
+      if (verbose >= 2)
+        VOID(printf("number of distinct values: %u\n",
+                    (uint) ((count->tree_pos - count->tree_buff) /
+                            count->field_length)));
+    }
+    total_count= 0;
+    for (idx= 0; idx < 256; idx++)
+    {
+      if (count->counts[idx])
+      {
+        total_count+= count->counts[idx];
+        DBUG_PRINT("info", ("counts[0x%02x]: %12s", idx,
+                            llstr((longlong) count->counts[idx], llbuf)));
+        if (verbose >= 2)
+          VOID(printf("counts[0x%02x]: %12s\n", idx,
+                      llstr((longlong) count->counts[idx], llbuf)));
+      }
+    }
+    DBUG_PRINT("info", ("total:        %12s", llstr((longlong) total_count,
+                                                    llbuf)));
+    if ((verbose >= 2) && total_count)
+    {
+      VOID(printf("total:        %12s\n",
+                  llstr((longlong) total_count, llbuf)));
+    }
+  }
+
+  mrg->records=record_count;
+  mrg->max_blob_length=max_blob_length;
+  my_afree(record);
+  /* 'error' is HA_ERR_END_OF_FILE unless the read loop was aborted. */
+  DBUG_RETURN(error != HA_ERR_END_OF_FILE);
+}
+
+/*
+  Queue compare function: order two elements by the my_off_t value
+  that 'a' and 'b' point to.
+
+  Returns -1, 0 or 1 when the value at 'a' is smaller than, equal to or
+  bigger than the value at 'b'.
+*/
+static int compare_huff_elements(void *not_used __attribute__((unused)),
+				 uchar *a, uchar *b)
+{
+  my_off_t left= *((my_off_t*) a);
+  my_off_t right= *((my_off_t*) b);
+
+  if (left < right)
+    return -1;
+  return left == right ? 0 : 1;
+}
+
+	/* Check each tree if we should use pre-space-compress, end-space-
+	   compress, empty-field-compress or zero-field-compress */
+
+/*
+  Decide the field type and the pack type of every column.
+
+  huff_counts  array of column statistics; field_type, pack_type,
+               bytes_packed, length_bits and the value counts are
+               updated in place
+  trees        number of elements in huff_counts
+  records      record count the decisions are based on
+
+  A summary of the chosen field types is written to the debug trace
+  and, if verbose is set, to stdout.
+*/
+static void check_counts(HUFF_COUNTS *huff_counts, uint trees,
+			 my_off_t records)
+{
+  uint space_fields,fill_zero_fields,field_count[(int) FIELD_enum_val_count];
+  my_off_t old_length,new_length,length;
+  DBUG_ENTER("check_counts");
+
+  bzero((uchar*) field_count,sizeof(field_count));
+  space_fields=fill_zero_fields=0;
+
+  for (; trees-- ; huff_counts++)
+  {
+    /* BLOB, VARCHAR and CHECK columns keep their field type. */
+    if (huff_counts->field_type == FIELD_BLOB)
+    {
+      huff_counts->length_bits=max_bit(huff_counts->max_length);
+      goto found_pack;
+    }
+    else if (huff_counts->field_type == FIELD_VARCHAR)
+    {
+      huff_counts->length_bits=max_bit(huff_counts->max_length);
+      goto found_pack;
+    }
+    else if (huff_counts->field_type == FIELD_CHECK)
+    {
+      huff_counts->bytes_packed=0;
+      huff_counts->counts[0]=0;
+      goto found_pack;
+    }
+
+    /* Start from a normal field and look for a better packing below. */
+    huff_counts->field_type=FIELD_NORMAL;
+    huff_counts->pack_type=0;
+
+    /* Check for zero-filled records (in this column), or zero records. */
+    if (huff_counts->zero_fields || ! records)
+    {
+      my_off_t old_space_count;
+      /*
+        If there are only zero filled records (in this column),
+        or no records at all, we are done.
+      */
+      if (huff_counts->zero_fields == records)
+      {
+	huff_counts->field_type= FIELD_ZERO;
+	huff_counts->bytes_packed=0;
+	huff_counts->counts[0]=0;
+	goto found_pack;
+      }
+      /* Remember the number of significant spaces. */
+      old_space_count=huff_counts->counts[' '];
+      /* Add all leading and trailing spaces. */
+      huff_counts->counts[' ']+= (huff_counts->tot_end_space +
+                                  huff_counts->tot_pre_space +
+                                  huff_counts->empty_fields *
+                                  huff_counts->field_length);
+      /* Check, what the compressed length of this would be. */
+      old_length=calc_packed_length(huff_counts,0)+records/8;
+      /* Get the number of zero bytes. */
+      length=huff_counts->zero_fields*huff_counts->field_length;
+      /* Add it to the counts. */
+      huff_counts->counts[0]+=length;
+      /* Check, what the compressed length of this would be. */
+      new_length=calc_packed_length(huff_counts,0);
+      /* If the compression without the zeroes would be shorter, we are done. */
+      if (old_length < new_length && huff_counts->field_length > 1)
+      {
+	huff_counts->field_type=FIELD_SKIP_ZERO;
+	huff_counts->counts[0]-=length;
+	huff_counts->bytes_packed=old_length- records/8;
+	goto found_pack;
+      }
+      /* Remove the insignificant spaces, but keep the zeroes. */
+      huff_counts->counts[' ']=old_space_count;
+    }
+    /* Check, what the compressed length of this column would be. */
+    huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
+
+    /*
+      If there are enough empty records (in this column),
+      treating them specially may pay off.
+    */
+    if (huff_counts->empty_fields)
+    {
+      if (huff_counts->field_length > 2 &&
+	  huff_counts->empty_fields + (records - huff_counts->empty_fields)*
+	  (1+max_bit(max(huff_counts->max_pre_space,
+			 huff_counts->max_end_space))) <
+	  records * max_bit(huff_counts->field_length))
+      {
+	huff_counts->pack_type |= PACK_TYPE_SPACE_FIELDS;
+      }
+      else
+      {
+	/* Otherwise account the empty fields as full-length space runs. */
+	length=huff_counts->empty_fields*huff_counts->field_length;
+	if (huff_counts->tot_end_space || ! huff_counts->tot_pre_space)
+	{
+	  huff_counts->tot_end_space+=length;
+	  huff_counts->max_end_space=huff_counts->field_length;
+	  if (huff_counts->field_length < 8)
+	    huff_counts->end_space[huff_counts->field_length]+=
+	      huff_counts->empty_fields;
+	}
+	if (huff_counts->tot_pre_space)
+	{
+	  huff_counts->tot_pre_space+=length;
+	  huff_counts->max_pre_space=huff_counts->field_length;
+	  if (huff_counts->field_length < 8)
+	    huff_counts->pre_space[huff_counts->field_length]+=
+	      huff_counts->empty_fields;
+	}
+      }
+    }
+
+    /*
+      If there are enough trailing spaces (in this column),
+      treating them specially may pay off.
+    */
+    if (huff_counts->tot_end_space)
+    {
+      huff_counts->counts[' ']+=huff_counts->tot_pre_space;
+      if (test_space_compress(huff_counts,records,huff_counts->max_end_space,
+			      huff_counts->end_space,
+			      huff_counts->tot_end_space,FIELD_SKIP_ENDSPACE))
+	goto found_pack;
+      huff_counts->counts[' ']-=huff_counts->tot_pre_space;
+    }
+
+    /*
+      If there are enough leading spaces (in this column),
+      treating them specially may pay off.
+    */
+    if (huff_counts->tot_pre_space)
+    {
+      if (test_space_compress(huff_counts,records,huff_counts->max_pre_space,
+			      huff_counts->pre_space,
+			      huff_counts->tot_pre_space,FIELD_SKIP_PRESPACE))
+	goto found_pack;
+    }
+
+  found_pack:			/* Found field-packing */
+
+    /* Test if we can use zero-fill */
+
+    if (huff_counts->max_zero_fill &&
+	(huff_counts->field_type == FIELD_NORMAL ||
+	 huff_counts->field_type == FIELD_SKIP_ZERO))
+    {
+      huff_counts->counts[0]-=huff_counts->max_zero_fill*
+	(huff_counts->field_type == FIELD_SKIP_ZERO ?
+	 records - huff_counts->zero_fields : records);
+      huff_counts->pack_type|=PACK_TYPE_ZERO_FILL;
+      huff_counts->bytes_packed=calc_packed_length(huff_counts,0);
+    }
+
+    /* Test if intervall-field is better */
+
+    if (huff_counts->tree_buff)
+    {
+      HUFF_TREE tree;
+
+      DBUG_EXECUTE_IF("forceintervall",
+                      huff_counts->bytes_packed= ~ (my_off_t) 0;);
+      tree.element_buffer=0;
+      if (!make_huff_tree(&tree,huff_counts) &&
+	  tree.bytes_packed+tree.tree_pack_length < huff_counts->bytes_packed)
+      {
+	if (tree.elements == 1)
+	  huff_counts->field_type=FIELD_CONSTANT;
+	else
+	  huff_counts->field_type=FIELD_INTERVALL;
+	huff_counts->pack_type=0;
+      }
+      else
+      {
+	/* Not better: the distinct values are no longer needed. */
+	my_free(huff_counts->tree_buff,MYF(0));
+	delete_tree(&huff_counts->int_tree);
+	huff_counts->tree_buff=0;
+      }
+      if (tree.element_buffer)
+	my_free(tree.element_buffer,MYF(0));
+    }
+    /* Collect the totals for the summary below. */
+    if (huff_counts->pack_type & PACK_TYPE_SPACE_FIELDS)
+      space_fields++;
+    if (huff_counts->pack_type & PACK_TYPE_ZERO_FILL)
+      fill_zero_fields++;
+    field_count[huff_counts->field_type]++;
+  }
+  DBUG_PRINT("info", ("normal:    %3d  empty-space:     %3d  "
+                      "empty-zero:       %3d  empty-fill: %3d",
+                      field_count[FIELD_NORMAL],space_fields,
+                      field_count[FIELD_SKIP_ZERO],fill_zero_fields));
+  DBUG_PRINT("info", ("pre-space: %3d  end-space:       %3d  "
+                      "intervall-fields: %3d  zero:       %3d",
+                      field_count[FIELD_SKIP_PRESPACE],
+                      field_count[FIELD_SKIP_ENDSPACE],
+                      field_count[FIELD_INTERVALL],
+                      field_count[FIELD_ZERO]));
+  if (verbose)
+    VOID(printf("\nnormal:    %3d  empty-space:     %3d  "
+                "empty-zero:       %3d  empty-fill: %3d\n"
+                "pre-space: %3d  end-space:       %3d  "
+                "intervall-fields: %3d  zero:       %3d\n",
+                field_count[FIELD_NORMAL],space_fields,
+                field_count[FIELD_SKIP_ZERO],fill_zero_fields,
+                field_count[FIELD_SKIP_PRESPACE],
+                field_count[FIELD_SKIP_ENDSPACE],
+                field_count[FIELD_INTERVALL],
+                field_count[FIELD_ZERO]));
+  DBUG_VOID_RETURN;
+}
+
+
+/* Test if we can use space-compression and empty-field-compression (returns 1 if used, else 0) */
+
+static int
+test_space_compress(HUFF_COUNTS *huff_counts, my_off_t records,
+		    uint max_space_length, my_off_t *space_counts,
+		    my_off_t tot_space_count, enum en_fieldtype field_type)
+{
+  int min_pos;     /* -2: no space-compress, -1: always a space count, >= 0: min_space with flag */
+  uint length_bits,i;
+  my_off_t space_count,min_space_count,min_pack,new_length,skip;
+
+  length_bits=max_bit(max_space_length);
+
+		/* Default no end_space-packing */
+  space_count=huff_counts->counts[(uint) ' '];
+  min_space_count= (huff_counts->counts[(uint) ' ']+= tot_space_count);
+  min_pack=calc_packed_length(huff_counts,0);
+  min_pos= -2;
+  huff_counts->counts[(uint) ' ']=space_count;	/* Restore the saved space count */
+
+	/* Test with always space-count */
+  new_length=huff_counts->bytes_packed+length_bits*records/8;
+  if (new_length+1 < min_pack)
+  {
+    min_pos= -1;
+    min_pack=new_length;
+    min_space_count=space_count;
+  }
+	/* Test with length-flag */
+  for (skip=0L, i=0 ; i < 8 ; i++)
+  {
+    if (space_counts[i])
+    {
+      if (i)
+	huff_counts->counts[(uint) ' ']+=space_counts[i];
+      skip+=huff_counts->pre_space[i];
+      new_length=calc_packed_length(huff_counts,0)+
+	(records+(records-skip)*(1+length_bits))/8;
+      if (new_length < min_pack)
+      {
+	min_pos=(int) i;
+	min_pack=new_length;
+	min_space_count=huff_counts->counts[(uint) ' '];
+      }
+    }
+  }
+
+  huff_counts->counts[(uint) ' ']=min_space_count;	/* Keep the counts of the cheapest variant */
+  huff_counts->bytes_packed=min_pack;
+  switch (min_pos) {
+  case -2:
+    return(0);				/* No space-compress */
+  case -1:				/* Always space-count */
+    huff_counts->field_type=field_type;
+    huff_counts->min_space=0;
+    huff_counts->length_bits=max_bit(max_space_length);
+    break;
+  default:				/* Space count only when flagged */
+    huff_counts->field_type=field_type;
+    huff_counts->min_space=(uint) min_pos;
+    huff_counts->pack_type|=PACK_TYPE_SELECTED;
+    huff_counts->length_bits=max_bit(max_space_length);
+    break;
+  }
+  return(1);				/* Using space-compress */
+}
+
+
+	/* Make a huff_tree of each huff_count */
+
+static HUFF_TREE* make_huff_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+  uint idx;
+  HUFF_TREE *all_trees;
+  DBUG_ENTER("make_huff_trees");
+
+  all_trees= (HUFF_TREE*) my_malloc(trees*sizeof(HUFF_TREE),
+                                    MYF(MY_WME | MY_ZEROFILL));
+  if (!all_trees)
+    DBUG_RETURN(0);
+
+  for (idx= 0 ; idx < trees ; idx++)
+  {
+    if (!make_huff_tree(&all_trees[idx], &huff_counts[idx]))
+      continue;                         /* Tree built, go on with the next */
+    while (idx-- > 0)                   /* Failure: free the earlier trees */
+      my_free(all_trees[idx].element_buffer,MYF(0));
+    my_free(all_trees,MYF(0));
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(all_trees);
+}
+
+/*
+  Build a Huffman tree.
+
+  SYNOPSIS
+    make_huff_tree()
+    huff_tree                   The Huffman tree.
+    huff_counts                 The counts.
+
+  DESCRIPTION
+    Build a Huffman tree according to huff_counts->counts or
+    huff_counts->tree_buff. tree_buff, if non-NULL contains up to
+    tree_buff_length of distinct column values. In that case, whole
+    values can be Huffman encoded instead of single bytes.
+
+  RETURN
+    0           OK
+    != 0        Error (1: buffer allocation failed, -1: init_queue() failed)
+*/
+
+static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
+{
+  uint i,found,bits_packed,first,last;
+  my_off_t bytes_packed;
+  HUFF_ELEMENT *a,*b,*new_huff_el;
+
+  first=last=0;
+  if (huff_counts->tree_buff)
+  {
+    /* Calculate the number of distinct values in tree_buff. */
+    found= (uint) (huff_counts->tree_pos - huff_counts->tree_buff) /
+      huff_counts->field_length;
+    first=0; last=found-1;
+  }
+  else
+  {
+    /* Count the number of uchar codes found in the column. */
+    for (i=found=0 ; i < 256 ; i++)
+    {
+      if (huff_counts->counts[i])
+      {
+	if (! found++)
+	  first=i;
+	last=i;
+      }
+    }
+    if (found < 2)
+      found=2;				/* Room for the zero incidence filler added below */
+  }
+
+  /* When using 'tree_buff' we can have more than 256 values. */
+  if (queue.max_elements < found)
+  {
+    delete_queue(&queue);
+    if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
+      return -1;
+  }
+
+  /* Allocate or reallocate an element buffer for the Huffman tree. */
+  if (!huff_tree->element_buffer)
+  {
+    if (!(huff_tree->element_buffer=
+	 (HUFF_ELEMENT*) my_malloc(found*2*sizeof(HUFF_ELEMENT),MYF(MY_WME))))
+      return 1;
+  }
+  else
+  {
+    HUFF_ELEMENT *temp;
+    if (!(temp=
+	  (HUFF_ELEMENT*) my_realloc((uchar*) huff_tree->element_buffer,
+				     found*2*sizeof(HUFF_ELEMENT),
+				     MYF(MY_WME))))
+      return 1;
+    huff_tree->element_buffer=temp;
+  }
+
+  huff_counts->tree=huff_tree;
+  huff_tree->counts=huff_counts;
+  huff_tree->min_chr=first;
+  huff_tree->max_chr=last;
+  huff_tree->char_bits=max_bit(last-first);
+  huff_tree->offset_bits=max_bit(found-1)+1;
+
+  if (huff_counts->tree_buff)
+  {
+    huff_tree->elements=0;
+    huff_tree->tree_pack_length=(1+15+16+5+5+
+				 (huff_tree->char_bits+1)*found+
+				 (huff_tree->offset_bits+1)*
+				 (found-2)+7)/8 +
+				   (uint) (huff_tree->counts->tree_pos-
+					   huff_tree->counts->tree_buff);
+    /*
+      Put a HUFF_ELEMENT into the queue for every distinct column value.
+
+      tree_walk() calls save_counts_in_queue() for every element in
+      'int_tree'. This takes elements from the target trees element
+      buffer and places references to them into the buffer of the
+      priority queue. We insert in column value order, but the order is
+      in fact irrelevant here. We will establish the correct order
+      later.
+    */
+    tree_walk(&huff_counts->int_tree,
+	      (int (*)(void*, element_count,void*)) save_counts_in_queue,
+	      (uchar*) huff_tree, left_root_right);
+  }
+  else
+  {
+    huff_tree->elements=found;
+    huff_tree->tree_pack_length=(9+9+5+5+
+				 (huff_tree->char_bits+1)*found+
+				 (huff_tree->offset_bits+1)*
+				 (found-2)+7)/8;
+    /*
+      Put a HUFF_ELEMENT into the queue for every uchar code found in the column.
+
+      The elements are taken from the target trees element buffer.
+      Instead of using queue_insert(), we just place references to the
+      elements into the buffer of the priority queue. We insert in byte
+      value order, but the order is in fact irrelevant here. We will
+      establish the correct order later.
+    */
+    for (i=first, found=0 ; i <= last ; i++)
+    {
+      if (huff_counts->counts[i])
+      {
+	new_huff_el=huff_tree->element_buffer+(found++);
+	new_huff_el->count=huff_counts->counts[i];
+	new_huff_el->a.leaf.null=0;
+	new_huff_el->a.leaf.element_nr=i;
+	queue.root[found]=(uchar*) new_huff_el;
+      }
+    }
+    /*
+      If there is only a single uchar value in this field in all records,
+      add a second element with zero incidence. This is required to enter
+      the loop, which builds the Huffman tree.
+    */
+    while (found < 2)
+    {
+      new_huff_el=huff_tree->element_buffer+(found++);
+      new_huff_el->count=0;
+      new_huff_el->a.leaf.null=0;
+      if (last)
+	new_huff_el->a.leaf.element_nr=huff_tree->min_chr=last-1;
+      else
+	new_huff_el->a.leaf.element_nr=huff_tree->max_chr=last+1;
+      queue.root[found]=(uchar*) new_huff_el;
+    }
+  }
+
+  /* Make a queue from the queue buffer. */
+  queue.elements=found;
+
+  /*
+    Make a priority queue from the queue. Construct its index so that we
+    have a partially ordered tree.
+  */
+  queue_fix(&queue);
+
+  /* The Huffman algorithm. */
+  bytes_packed=0; bits_packed=0;
+  for (i=1 ; i < found ; i++)
+  {
+    /*
+      Pop the top element from the queue (the one with the least incidence).
+      Popping from a priority queue includes a re-ordering of the queue,
+      to get the next least incidence element to the top.
+    */
+    a=(HUFF_ELEMENT*) queue_remove_top(&queue);
+    /* Copy the next least incidence element */
+    b=(HUFF_ELEMENT*) queue_top(&queue);
+    /* Get a new element from the element buffer. */
+    new_huff_el=huff_tree->element_buffer+found+i;
+    /* The new element gets the sum of the two least incidence elements. */
+    new_huff_el->count=a->count+b->count;
+    /*
+      The Huffman algorithm assigns another bit to the code for a byte
+      every time that bytes incidence is combined (directly or indirectly)
+      to a new element as one of the two least incidence elements.
+      This means that one more bit per incidence of that uchar is required
+      in the resulting file. So we add the new combined incidence as the
+      number of bits by which the result grows.
+    */
+    bits_packed+=(uint) (new_huff_el->count & 7);
+    bytes_packed+=new_huff_el->count/8;
+    /* The new element points to its children, lesser in left.  */
+    new_huff_el->a.nod.left=a;
+    new_huff_el->a.nod.right=b;
+    /*
+      Replace the copied top element by the new element and re-order the
+      queue.
+    */
+    queue_top(&queue)= (uchar*) new_huff_el;
+    queue_replace_top(&queue);
+  }
+  huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
+  huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
+  return 0;
+}
+
+static int compare_tree(void* cmp_arg __attribute__((unused)),
+			register const uchar *s, register const uchar *t)
+{
+  const uchar *end= s + global_count->field_length; /* Compare this many bytes */
+  for ( ; s != end ; s++, t++)
+    if (*s != *t)
+      return (int) *s - (int) *t;	/* First differing byte decides */
+  return 0;
+}
+
+/*
+  Organize distinct column values and their incidences into a priority queue.
+
+  SYNOPSIS
+    save_counts_in_queue()
+    key                         The column value.
+    count                       The incidence of this value.
+    tree                        The Huffman tree to be built later.
+
+  DESCRIPTION
+    We use the element buffer of the targeted tree. The distinct column
+    values are organized in a priority queue first. The Huffman
+    algorithm will later organize the elements into a Huffman tree. For
+    the time being, we just place references to the elements into the
+    queue buffer. The buffer will later be organized into a priority
+    queue.
+
+  RETURN
+    0
+ */
+
+static int save_counts_in_queue(uchar *key, element_count count,
+				HUFF_TREE *tree)
+{
+  /* Take the next unused slot of the tree's element buffer as a leaf. */
+  HUFF_ELEMENT *leaf= tree->element_buffer + tree->elements;
+  tree->elements++;
+  leaf->count= count;
+  leaf->a.leaf.null= 0;
+  leaf->a.leaf.element_nr=
+    (uint) (key - tree->counts->tree_buff) / tree->counts->field_length;
+  queue.root[tree->elements]= (uchar*) leaf;	/* Slot number == new count */
+  return 0;
+}
+
+
+/*
+  Calculate length of file if given counts should be used.
+
+  SYNOPSIS
+    calc_packed_length()
+    huff_counts                 The counts for a column of the table(s).
+    add_tree_lenght             If the decode tree length should be added.
+
+  DESCRIPTION
+    We need to follow the Huffman algorithm until we know, how many bits
+    are required for each uchar code. But we do not need the resulting
+    Huffman tree. Hence, we can leave out some steps which are essential
+    in make_huff_tree().
+
+  RETURN
+    Number of bytes required to compress this table column.
+*/
+
+static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
+				   uint add_tree_lenght)
+{
+  uint i,found,bits_packed,first,last;
+  my_off_t bytes_packed;
+  HUFF_ELEMENT element_buffer[256];	/* One per merge step; at most found-1 are used */
+  DBUG_ENTER("calc_packed_length");
+
+  /*
+    WARNING: We use a small hack for efficiency: Instead of placing
+    references to HUFF_ELEMENTs into the queue, we just insert
+    references to the counts of the uchar codes which appeared in this
+    table column. During the Huffman algorithm they are successively
+    replaced by references to HUFF_ELEMENTs. This works, because
+    HUFF_ELEMENTs have the incidence count at their beginning.
+    Regardless, whether the queue array contains references to counts of
+    type my_off_t or references to HUFF_ELEMENTs which have the count of
+    type my_off_t at their beginning, it always points to a count of the
+    same type.
+
+    Instead of using queue_insert(), we just copy the references into
+    the buffer of the priority queue. We insert in uchar value order, but
+    the order is in fact irrelevant here. We will establish the correct
+    order later.
+  */
+  first=last=0;
+  for (i=found=0 ; i < 256 ; i++)
+  {
+    if (huff_counts->counts[i])
+    {
+      if (! found++)
+	first=i;
+      last=i;
+      /* We start with root[1], which is the queues top element. */
+      queue.root[found]=(uchar*) &huff_counts->counts[i];
+    }
+  }
+  if (!found)
+    DBUG_RETURN(0);			/* Empty tree */
+  /*
+    If there is only a single uchar value in this field in all records,
+    add a second element with zero incidence. This is required to enter
+    the loop, which follows the Huffman algorithm.
+  */
+  if (found < 2)
+    queue.root[++found]=(uchar*) &huff_counts->counts[last ? 0 : 1];
+
+  /* Make a queue from the queue buffer. */
+  queue.elements=found;
+
+  bytes_packed=0; bits_packed=0;
+  /* Add the length of the coding table, which would become part of the file. */
+  if (add_tree_lenght)
+    bytes_packed=(8+9+5+5+(max_bit(last-first)+1)*found+
+		  (max_bit(found-1)+1+1)*(found-2) +7)/8;
+
+  /*
+    Make a priority queue from the queue. Construct its index so that we
+    have a partially ordered tree.
+  */
+  queue_fix(&queue);
+
+  /* The Huffman algorithm. */
+  for (i=0 ; i < found-1 ; i++)
+  {
+    my_off_t        *a;
+    my_off_t        *b;
+    HUFF_ELEMENT    *new_huff_el;
+
+    /*
+      Pop the top element from the queue (the one with the least
+      incidence). Popping from a priority queue includes a re-ordering
+      of the queue, to get the next least incidence element to the top.
+    */
+    a= (my_off_t*) queue_remove_top(&queue);
+    /* Copy the next least incidence element. */
+    b= (my_off_t*) queue_top(&queue);
+    /* Create a new element in a local (automatic) buffer. */
+    new_huff_el= element_buffer + i;
+    /* The new element gets the sum of the two least incidence elements. */
+    new_huff_el->count= *a + *b;
+    /*
+      The Huffman algorithm assigns another bit to the code for a byte
+      every time that bytes incidence is combined (directly or indirectly)
+      to a new element as one of the two least incidence elements.
+      This means that one more bit per incidence of that uchar is required
+      in the resulting file. So we add the new combined incidence as the
+      number of bits by which the result grows.
+    */
+    bits_packed+=(uint) (new_huff_el->count & 7);
+    bytes_packed+=new_huff_el->count/8;
+    /*
+      Replace the copied top element by the new element and re-order the
+      queue. This successively replaces the references to counts by
+      references to HUFF_ELEMENTs.
+    */
+    queue_top(&queue)= (uchar*) new_huff_el;
+    queue_replace_top(&queue);
+  }
+  DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
+}
+
+
+	/* Remove trees that don't give any compression */
+
+static uint join_same_trees(HUFF_COUNTS *huff_counts, uint trees)
+{
+  uint k,tree_number;
+  HUFF_COUNTS count,*i,*j,*last_count;
+
+  last_count=huff_counts+trees;
+  for (tree_number=0, i=huff_counts ; i < last_count ; i++)
+  {
+    if (!i->tree->tree_number)		/* Not numbered (and not joined) yet */
+    {
+      i->tree->tree_number= ++tree_number;
+      if (i->tree_buff)
+	continue;			/* Don't join intervall */
+      for (j=i+1 ; j < last_count ; j++)
+      {
+	if (! j->tree->tree_number && ! j->tree_buff)
+	{
+	  /* Only 'counts' of the scratch 'count' is filled in and used. */
+	  for (k=0 ; k < 256 ; k++)
+	    count.counts[k]=i->counts[k]+j->counts[k];
+	  if (calc_packed_length(&count,1) <=
+	      i->tree->bytes_packed + j->tree->bytes_packed+
+	      i->tree->tree_pack_length+j->tree->tree_pack_length+
+	      ALLOWED_JOIN_DIFF)
+	  {
+	    /* Column j gives up its own tree and shares the tree of i, */
+	    /* which is rebuilt from the summed counts (copied once).   */
+	    my_free((uchar*) j->tree->element_buffer,MYF(0));
+	    j->tree->element_buffer=0;
+	    j->tree=i->tree;
+	    bmove((uchar*) i->counts,(uchar*) count.counts,
+		  sizeof(count.counts[0])*256);
+	    if (make_huff_tree(i->tree,i))
+	      return (uint) -1;		/* Rebuilding the joined tree failed */
+	  }
+	}
+      }
+    }
+  }
+  DBUG_PRINT("info", ("Original trees:  %d  After join: %d",
+                      trees, tree_number));
+  if (verbose)
+    VOID(printf("Original trees:  %d  After join: %d\n", trees, tree_number));
+  return tree_number;			/* Return trees left */
+}
+
+
+/*
+  Fill in huff_tree encode tables.
+
+  SYNOPSIS
+    make_huff_decode_table()
+    huff_tree               An array of HUFF_TREE which are to be encoded.
+    trees                   The number of HUFF_TREE in the array.
+
+  RETURN
+    0           success
+    != 0        error
+*/
+
+static int make_huff_decode_table(HUFF_TREE *huff_tree, uint trees)
+{
+  HUFF_TREE *end= huff_tree + trees;
+  for ( ; huff_tree != end ; huff_tree++)
+  {
+    uint slots;
+    if (!(huff_tree->tree_number > 0))
+      continue;                         /* Tree without a number: no table */
+    slots= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
+    huff_tree->code= (ulonglong*) my_malloc(slots * (sizeof(ulonglong) +
+                                                     sizeof(uchar)),
+                                            MYF(MY_WME | MY_ZEROFILL));
+    if (!huff_tree->code)
+      return 1;
+    huff_tree->code_len= (uchar*) (huff_tree->code + slots); /* After codes */
+    make_traverse_code_tree(huff_tree, huff_tree->root,
+                            8 * sizeof(ulonglong), LL(0));
+  }
+  return 0;
+}
+
+
+static void make_traverse_code_tree(HUFF_TREE *huff_tree,
+				    HUFF_ELEMENT *element,
+				    uint size, ulonglong code)
+{
+  /* Store code and code length of every leaf below 'element'. */
+  if (!element->a.leaf.null)
+  {
+    uint chr= element->a.leaf.element_nr;
+    huff_tree->code_len[chr]= (uchar) (8 * sizeof(ulonglong) - size);
+    /* A root that is a leaf keeps the full width; that shift is undefined. */
+    huff_tree->code[chr]= size < 8 * sizeof(ulonglong) ? (code >> size) : 0;
+    if (huff_tree->height < 8 * sizeof(ulonglong) - size)
+        huff_tree->height= 8 * sizeof(ulonglong) - size;
+  }
+  else
+  {
+    size--;				/* One more code bit; right child sets it */
+    make_traverse_code_tree(huff_tree,element->a.nod.left,size,code);
+    make_traverse_code_tree(huff_tree, element->a.nod.right, size,
+			    code + (((ulonglong) 1) << size));
+  }
+}
+
+
+/*
+  Render the low order bits of a value as a string of '0' and '1'.
+
+  SYNOPSIS
+    bindigits()
+    value                       The value.
+    bits                        The number of low order bits to convert.
+
+  NOTE
+    The result lives in a static buffer that every call overwrites.
+    Do not use two results of this function in one expression.
+
+  RETURN
+    A pointer to a static NUL-terminated string.
+ */
+
+static char *bindigits(ulonglong value, uint bits)
+{
+  static char digits[72];
+  uint pos;
+
+  DBUG_ASSERT(bits < sizeof(digits));
+  /* The most significant of the requested bits is written first. */
+  for (pos= 0 ; pos < bits ; pos++)
+    digits[pos]= (char) ('0' + (int) ((value >> (bits - 1 - pos)) & 1));
+  digits[bits]= '\0';
+  return digits;
+}
+
+
+/*
+  Convert a value into hexadecimal digits.
+
+  SYNOPSIS
+    hexdigits()
+    value                       The value.
+
+  NOTE
+    The result string is in static storage. It is reused on every call.
+    So you cannot use it twice in one expression.
+
+  RETURN
+    A pointer to a static NUL-terminated string.
+ */
+
+static char *hexdigits(ulonglong value)
+{
+  static char digits[20];
+  const uint count= 2 * sizeof(value);  /* Two hex digits per byte. */
+  uint pos;
+
+  DBUG_ASSERT(count < sizeof(digits));
+  for (pos= 0 ; pos < count ; pos++)    /* Highest nibble comes first. */
+  {
+    uint nibble= (uint) (value >> (4 * (count - 1 - pos))) & 0xf;
+    digits[pos]= (char) (nibble < 10 ? '0' + nibble : 'a' + (nibble - 10));
+  }
+  digits[count]= '\0';
+  return digits;
+}
+
+
+	/* Write header to new packed data file */
+
+static int write_header(PACK_MRG_INFO *mrg,uint head_length,uint trees,
+			my_off_t tot_elements,my_off_t filelength)
+{
+  uchar *head= (uchar*) file_buffer.pos;
+
+  bzero(head,HEAD_LENGTH);
+  memcpy_fixed(head,maria_pack_file_magic,4);
+  int4store(head+4,head_length);
+  int4store(head+8, mrg->min_pack_length);
+  int4store(head+12,mrg->max_pack_length);
+  int4store(head+16,tot_elements);
+  int4store(head+20,intervall_length);
+  int2store(head+24,trees);
+  head[26]=(char) mrg->ref_length;
+	/* Last header byte filled here: the record pointer length */
+  head[27]= (uchar) maria_get_pointer_length((ulonglong) filelength,2);
+  if (test_only)
+    return 0;				/* Header is built but not written */
+  VOID(my_seek(file_buffer.file,0L,MY_SEEK_SET,MYF(0)));
+  return my_write(file_buffer.file,(const uchar *) file_buffer.pos,HEAD_LENGTH,
+		  MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)) ? 1 : 0;
+}
+
+	/* Write fieldinfo to new packed file */
+
+static void write_field_info(HUFF_COUNTS *counts, uint fields, uint trees)
+{
+  reg1 uint i;
+  uint huff_tree_bits;			/* Width of the tree number written per column */
+  huff_tree_bits=max_bit(trees ? trees-1 : 0);
+
+  DBUG_PRINT("info", (" "));
+  DBUG_PRINT("info", ("column types:"));
+  DBUG_PRINT("info", ("FIELD_NORMAL          0"));
+  DBUG_PRINT("info", ("FIELD_SKIP_ENDSPACE   1"));
+  DBUG_PRINT("info", ("FIELD_SKIP_PRESPACE   2"));
+  DBUG_PRINT("info", ("FIELD_SKIP_ZERO       3"));
+  DBUG_PRINT("info", ("FIELD_BLOB            4"));
+  DBUG_PRINT("info", ("FIELD_CONSTANT        5"));
+  DBUG_PRINT("info", ("FIELD_INTERVALL       6"));
+  DBUG_PRINT("info", ("FIELD_ZERO            7"));
+  DBUG_PRINT("info", ("FIELD_VARCHAR         8"));
+  DBUG_PRINT("info", ("FIELD_CHECK           9"));
+  DBUG_PRINT("info", (" "));
+  DBUG_PRINT("info", ("pack type as a set of flags:"));
+  DBUG_PRINT("info", ("PACK_TYPE_SELECTED      1"));
+  DBUG_PRINT("info", ("PACK_TYPE_SPACE_FIELDS  2"));
+  DBUG_PRINT("info", ("PACK_TYPE_ZERO_FILL     4"));
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)			/* Same legend as above, on stdout */
+  {
+    VOID(printf("\n"));
+    VOID(printf("column types:\n"));
+    VOID(printf("FIELD_NORMAL          0\n"));
+    VOID(printf("FIELD_SKIP_ENDSPACE   1\n"));
+    VOID(printf("FIELD_SKIP_PRESPACE   2\n"));
+    VOID(printf("FIELD_SKIP_ZERO       3\n"));
+    VOID(printf("FIELD_BLOB            4\n"));
+    VOID(printf("FIELD_CONSTANT        5\n"));
+    VOID(printf("FIELD_INTERVALL       6\n"));
+    VOID(printf("FIELD_ZERO            7\n"));
+    VOID(printf("FIELD_VARCHAR         8\n"));
+    VOID(printf("FIELD_CHECK           9\n"));
+    VOID(printf("\n"));
+    VOID(printf("pack type as a set of flags:\n"));
+    VOID(printf("PACK_TYPE_SELECTED      1\n"));
+    VOID(printf("PACK_TYPE_SPACE_FIELDS  2\n"));
+    VOID(printf("PACK_TYPE_ZERO_FILL     4\n"));
+    VOID(printf("\n"));
+  }
+  for (i=0 ; i++ < fields ; counts++)	/* i is already 1-based inside the body */
+  {
+    write_bits((ulonglong) (int) counts->field_type, 5);
+    write_bits(counts->pack_type,6);
+    if (counts->pack_type & PACK_TYPE_ZERO_FILL)
+      write_bits(counts->max_zero_fill,5);
+    else
+      write_bits(counts->length_bits,5);
+    write_bits((ulonglong) counts->tree->tree_number - 1, huff_tree_bits);
+    DBUG_PRINT("info", ("column: %3u  type: %2u  pack: %2u  zero: %4u  "
+                        "lbits: %2u  tree: %2u  length: %4u",
+                        i , counts->field_type, counts->pack_type,
+                        counts->max_zero_fill, counts->length_bits,
+                        counts->tree->tree_number, counts->field_length));
+    if (verbose >= 2)
+      VOID(printf("column: %3u  type: %2u  pack: %2u  zero: %4u  lbits: %2u  "
+                  "tree: %2u  length: %4u\n", i , counts->field_type,
+                  counts->pack_type, counts->max_zero_fill, counts->length_bits,
+                  counts->tree->tree_number, counts->field_length));
+  }
+  flush_bits();
+  return;
+}
+
+	/* Write all huff_trees to new datafile. Return tot count of
+	   elements in all trees
+	   Returns 0 on error */
+
+static my_off_t write_huff_tree(HUFF_TREE *huff_tree, uint trees)
+{
+  uint i,int_length;
+  uint tree_no;
+  uint codes;
+  uint errors= 0;
+  uint *packed_tree,*offset,length;
+  my_off_t elements;
+
+  /* Find the highest number of elements in the trees. */
+  for (i=length=0 ; i < trees ; i++)
+    if (huff_tree[i].tree_number > 0 && huff_tree[i].elements > length)
+      length=huff_tree[i].elements;
+  /*
+    Allocate a buffer for packing a decode tree. Two numbers per element
+    (left child and right child).
+  */
+  if (!(packed_tree=(uint*) my_alloca(sizeof(uint)*length*2)))
+  {
+    my_error(EE_OUTOFMEMORY,MYF(ME_BELL),sizeof(uint)*length*2);
+    return 0;
+  }
+
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)
+    VOID(printf("\n"));
+  tree_no= 0;
+  intervall_length=0;
+  for (elements=0; trees-- ; huff_tree++)
+  {
+    /* Skip columns that have been joined with other columns. */
+    if (huff_tree->tree_number == 0)
+      continue;				/* Deleted tree */
+    tree_no++;
+    DBUG_PRINT("info", (" "));
+    if (verbose >= 3)
+      VOID(printf("\n"));
+    /* Count the total number of elements (byte codes or column values). */
+    elements+=huff_tree->elements;
+    huff_tree->max_offset=2;
+    /* Build a tree of offsets and codes for decoding in 'packed_tree'. */
+    if (huff_tree->elements <= 1)
+      offset=packed_tree;		/* Nothing to pack: the decode tree stays empty */
+    else
+      offset=make_offset_code_tree(huff_tree,huff_tree->root,packed_tree);
+
+    /* This should be the same as 'length' above. */
+    huff_tree->offset_bits=max_bit(huff_tree->max_offset);
+
+    /*
+      Since we check this during collecting the distinct column values,
+      this should never happen.
+    */
+    if (huff_tree->max_offset >= IS_OFFSET)
+    {				/* This should be impossible */
+      VOID(fprintf(stderr, "Tree offset got too big: %d, aborted\n",
+                   huff_tree->max_offset));
+      my_afree(packed_tree);
+      return 0;
+    }
+
+    DBUG_PRINT("info", ("pos: %lu  elements: %u  tree-elements: %lu  "
+                        "char_bits: %u\n",
+                        (ulong) (file_buffer.pos - file_buffer.buffer),
+                        huff_tree->elements, (ulong) (offset - packed_tree),
+                        huff_tree->char_bits));
+    if (!huff_tree->counts->tree_buff)
+    {
+      /* We do a uchar compression on this column. Mark with bit 0. */
+      write_bits(0,1);
+      write_bits(huff_tree->min_chr,8);
+      write_bits(huff_tree->elements,9);
+      write_bits(huff_tree->char_bits,5);
+      write_bits(huff_tree->offset_bits,5);
+      int_length=0;
+    }
+    else
+    {
+      int_length=(uint) (huff_tree->counts->tree_pos -
+			 huff_tree->counts->tree_buff);
+      /* We have distinct column values for this column. Mark with bit 1. */
+      write_bits(1,1);
+      write_bits(huff_tree->elements,15);
+      write_bits(int_length,16);
+      write_bits(huff_tree->char_bits,5);
+      write_bits(huff_tree->offset_bits,5);
+      intervall_length+=int_length;
+    }
+    DBUG_PRINT("info", ("tree: %2u  elements: %4u  char_bits: %2u  "
+                        "offset_bits: %2u  %s: %5u  codelen: %2u",
+                        tree_no, huff_tree->elements, huff_tree->char_bits,
+                        huff_tree->offset_bits, huff_tree->counts->tree_buff ?
+                        "bufflen" : "min_chr", huff_tree->counts->tree_buff ?
+                        int_length : huff_tree->min_chr, huff_tree->height));
+    if (verbose >= 2)
+      VOID(printf("tree: %2u  elements: %4u  char_bits: %2u  offset_bits: %2u  "
+                  "%s: %5u  codelen: %2u\n", tree_no, huff_tree->elements,
+                  huff_tree->char_bits, huff_tree->offset_bits,
+                  huff_tree->counts->tree_buff ? "bufflen" : "min_chr",
+                  huff_tree->counts->tree_buff ? int_length :
+                  huff_tree->min_chr, huff_tree->height));
+
+    /* Check that the code tree length matches the element count. */
+    length=(uint) (offset-packed_tree);
+    if (length != huff_tree->elements*2-2)
+    {
+      VOID(fprintf(stderr, "error: Huff-tree-length: %d != calc_length: %d\n",
+                   length, huff_tree->elements * 2 - 2));
+      errors++;
+      break;
+    }
+
+    /* Write the packed decode tree: offsets and values use different widths. */
+    for (i=0 ; i < length ; i++)
+    {
+      if (packed_tree[i] & IS_OFFSET)
+	write_bits(packed_tree[i] - IS_OFFSET+ (1 << huff_tree->offset_bits),
+		   huff_tree->offset_bits+1);
+      else
+	write_bits(packed_tree[i]-huff_tree->min_chr,huff_tree->char_bits+1);
+      DBUG_PRINT("info", ("tree[0x%04x]: %s0x%04x",
+                          i, (packed_tree[i] & IS_OFFSET) ?
+                          " -> " : "", (packed_tree[i] & IS_OFFSET) ?
+                          packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+      if (verbose >= 3)
+        VOID(printf("tree[0x%04x]: %s0x%04x\n",
+                    i, (packed_tree[i] & IS_OFFSET) ? " -> " : "",
+                    (packed_tree[i] & IS_OFFSET) ?
+                    packed_tree[i] - IS_OFFSET + i : packed_tree[i]));
+    }
+    flush_bits();
+
+    /*
+      Display coding tables and check their correctness.
+    */
+    codes= huff_tree->counts->tree_buff ? huff_tree->elements : 256;
+    for (i= 0; i < codes; i++)
+    {
+      ulonglong code;
+      uint bits;
+      uint len;
+      uint idx;
+
+      if (! (len= huff_tree->code_len[i]))
+        continue;			/* Zero code length: nothing to check */
+      DBUG_PRINT("info", ("code[0x%04x]:      0x%s  bits: %2u  bin: %s", i,
+                          hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+                          bindigits(huff_tree->code[i],
+                                    huff_tree->code_len[i])));
+      if (verbose >= 3)
+        VOID(printf("code[0x%04x]:      0x%s  bits: %2u  bin: %s\n", i,
+                    hexdigits(huff_tree->code[i]), huff_tree->code_len[i],
+                    bindigits(huff_tree->code[i], huff_tree->code_len[i])));
+
+      /* Check that the encode table decodes correctly. */
+      code= 0;
+      bits= 0;
+      idx= 0;
+      DBUG_EXECUTE_IF("forcechkerr1", len--;);
+      DBUG_EXECUTE_IF("forcechkerr2", bits= 8 * sizeof(code););
+      DBUG_EXECUTE_IF("forcechkerr3", idx= length;);
+      for (;;)
+      {
+        if (! len)
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: code 0x%s with %u bits not found\n",
+                       hexdigits(huff_tree->code[i]), huff_tree->code_len[i]));
+          errors++;
+          break;
+        }
+        code<<= 1;
+        code|= (huff_tree->code[i] >> (--len)) & 1;
+        bits++;
+        if (bits > 8 * sizeof(code))
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: Huffman code too long: %u/%u\n",
+                       bits, (uint) (8 * sizeof(code))));
+          errors++;
+          break;
+        }
+        idx+= (uint) code & 1;		/* A 1 bit selects the second slot of the pair */
+        if (idx >= length)
+        {
+          VOID(fflush(stdout));
+          VOID(fprintf(stderr, "error: illegal tree offset: %u/%u\n",
+                       idx, length));
+          errors++;
+          break;
+        }
+        if (packed_tree[idx] & IS_OFFSET)
+          idx+= packed_tree[idx] & ~IS_OFFSET;
+        else
+          break; /* Hit a leaf. This contains the result value. */
+      }
+      if (errors)
+        break;
+
+      DBUG_EXECUTE_IF("forcechkerr4", packed_tree[idx]++;);
+      if (packed_tree[idx] != i)
+      {
+        VOID(fflush(stdout));
+        VOID(fprintf(stderr, "error: decoded value 0x%04x  should be: 0x%04x\n",
+                     packed_tree[idx], i));
+        errors++;
+        break;
+      }
+    } /*end for (codes)*/
+    if (errors)
+      break;
+
+    /* Write column values in case of distinct column value compression. */
+    if (huff_tree->counts->tree_buff)
+    {
+      for (i=0 ; i < int_length ; i++)
+      {
+ 	write_bits((ulonglong) (uchar) huff_tree->counts->tree_buff[i], 8);
+        DBUG_PRINT("info", ("column_values[0x%04x]: 0x%02x",
+                            i, (uchar) huff_tree->counts->tree_buff[i]));
+        if (verbose >= 3)
+          VOID(printf("column_values[0x%04x]: 0x%02x\n",
+                      i, (uchar) huff_tree->counts->tree_buff[i]));
+      }
+    }
+    flush_bits();
+  }
+  DBUG_PRINT("info", (" "));
+  if (verbose >= 2)
+    VOID(printf("\n"));
+  my_afree(packed_tree);
+  if (errors)
+  {
+    VOID(fprintf(stderr, "Error: Generated decode trees are corrupt. Stop.\n"));
+    return 0;
+  }
+  return elements;
+}
+
+
+/*
+  Flatten a Huffman (sub)tree into the packed offset array.
+
+  'element' is always an inner node (two children). It occupies two
+  consecutive cells starting at 'offset': cell 0 describes the left child,
+  cell 1 the right child. A cell holds either the leaf's element number or
+  IS_OFFSET plus the distance to the cells of the child node.
+
+  Returns a pointer to the first cell behind the cells used by this subtree.
+  Updates huff_tree->max_offset with the largest right-child distance seen.
+*/
+static uint *make_offset_code_tree(HUFF_TREE *huff_tree, HUFF_ELEMENT *element,
+                                   uint *offset)
+{
+  uint *node_cells= offset;
+  HUFF_ELEMENT *left_child= element->a.nod.left;
+  HUFF_ELEMENT *right_child= element->a.nod.right;
+  uint distance;
+
+  /*
+    'a.leaf.null' overlays 'a.nod.left': when it is null the child has no
+    children of its own, i.e. it is a leaf.
+  */
+  if (left_child->a.leaf.null)
+  {
+    /* Left child is a node: its cells follow directly behind ours. */
+    node_cells[0]= IS_OFFSET+2;
+    offset= make_offset_code_tree(huff_tree, left_child, offset+2);
+  }
+  else
+  {
+    /* Left child is a leaf: store the uchar code / column value index. */
+    node_cells[0]= (uint) left_child->a.leaf.element_nr;
+    offset+= 2;
+  }
+
+  if (!right_child->a.leaf.null)
+  {
+    /* Right child is a leaf: store the uchar code / column value index. */
+    node_cells[1]= right_child->a.leaf.element_nr;
+    return offset;
+  }
+
+  /* Right child is a node: it starts where the left subtree ended. */
+  distance= (uint) (offset-node_cells-1);
+  node_cells[1]= IS_OFFSET+ distance;
+  if (huff_tree->max_offset < distance)
+    huff_tree->max_offset= distance;
+  return make_offset_code_tree(huff_tree, right_child, offset);
+}
+
+	/* Get number of bits needed to represent value */
+
+static uint max_bit(register uint value)
+{
+  uint width;
+
+  /* One bit is always needed; add one for every further significant bit. */
+  for (width= 1 ; (value>>= 1) != 0 ; width++)
+    ;
+  return width;
+}
+
+
+/*
+  Compress all records of the source table(s) into the output file buffer.
+
+  SYNOPSIS
+    compress_maria_file()
+    mrg                         The (merged) source tables to read from.
+    huff_counts                 One HUFF_COUNTS per column, with its tree.
+
+  DESCRIPTION
+    Reads every record through mrg_rrnd(). For each record it reserves
+    room for the pack-length header, copies the null bytes verbatim and
+    encodes every column according to its field_type with the Huffman
+    codes of the column's tree. After the record is encoded the real
+    header is stored and, if it is shorter than the reserved room, the
+    record data is moved down. Deleted records are skipped.
+
+    On return mrg->ref_length, mrg->min_pack_length and
+    mrg->max_pack_length are set from the observed values.
+
+  RETURN
+    0           ok
+    -1          the record buffer could not be allocated
+    1           read error, write error or failed final flush
+*/
+static int compress_maria_file(PACK_MRG_INFO *mrg, HUFF_COUNTS *huff_counts)
+{
+  int error;
+  uint i,max_calc_length,pack_ref_length,min_record_length,max_record_length;
+  uint intervall,field_length,max_pack_length,pack_blob_length, null_bytes;
+  my_off_t record_count;
+  char llbuf[32];
+  ulong length,pack_length;
+  uchar *record,*pos,*end_pos,*record_pos,*start_pos;
+  HUFF_COUNTS *count,*end_count;
+  HUFF_TREE *tree;
+  MARIA_HA *isam_file=mrg->file[0];
+  uint pack_version= (uint) isam_file->s->pack.version;
+  DBUG_ENTER("compress_maria_file");
+
+  /* Allocate a buffer for the records (excluding blobs). */
+  if (!(record=(uchar*) my_alloca(isam_file->s->base.reclength)))
+    DBUG_RETURN(-1);            /* Must pair with DBUG_ENTER above. */
+
+  end_count=huff_counts+isam_file->s->base.fields;
+  min_record_length= (uint) ~0;
+  max_record_length=0;
+  null_bytes= isam_file->s->base.null_bytes;
+
+  /*
+    Calculate the maximum number of bits required to pack the records.
+    Remember to understand 'max_zero_fill' as 'min_zero_fill'.
+    The tree height determines the maximum number of bits per value.
+    Some fields skip leading or trailing spaces or zeroes. The skipped
+    number of bytes is encoded by 'length_bits' bits.
+    Empty blobs and varchar are encoded with a single 1 bit. Other blobs
+    and varchar get a leading 0 bit.
+  */
+  max_calc_length= null_bytes;
+  for (i= 0 ; i < isam_file->s->base.fields ; i++)
+  {
+    if (!(huff_counts[i].pack_type & PACK_TYPE_ZERO_FILL))
+      huff_counts[i].max_zero_fill=0;
+    if (huff_counts[i].field_type == FIELD_CONSTANT ||
+        huff_counts[i].field_type == FIELD_ZERO ||
+        huff_counts[i].field_type == FIELD_CHECK)
+      continue;
+    if (huff_counts[i].field_type == FIELD_INTERVALL)
+      max_calc_length+=huff_counts[i].tree->height;
+    else if (huff_counts[i].field_type == FIELD_BLOB ||
+             huff_counts[i].field_type == FIELD_VARCHAR)
+      max_calc_length+=huff_counts[i].tree->height*huff_counts[i].max_length + huff_counts[i].length_bits +1;
+    else
+      max_calc_length+=
+        (huff_counts[i].field_length - huff_counts[i].max_zero_fill)*
+          huff_counts[i].tree->height+huff_counts[i].length_bits;
+  }
+  max_calc_length= (max_calc_length + 7) / 8;
+  pack_ref_length= _ma_calc_pack_length(pack_version, max_calc_length);
+  record_count=0;
+  /* 'max_blob_length' is the max length of all blobs of a record. */
+  pack_blob_length= isam_file->s->base.blobs ?
+                    _ma_calc_pack_length(pack_version, mrg->max_blob_length) : 0;
+  max_pack_length=pack_ref_length+pack_blob_length;
+
+  DBUG_PRINT("fields", ("==="));
+  mrg_reset(mrg);
+  while ((error=mrg_rrnd(mrg,record)) != HA_ERR_END_OF_FILE)
+  {
+    ulong tot_blob_length=0;
+    if (! error)
+    {
+      if (flush_buffer((ulong) max_calc_length + (ulong) max_pack_length +
+                       null_bytes))
+        break;
+      /* Reserve room for the largest possible header; fixed up below. */
+      record_pos= file_buffer.pos;
+      file_buffer.pos+= max_pack_length;
+      if (null_bytes)
+      {
+        /* Copy null bits 'as is' */
+        memcpy(file_buffer.pos, record, null_bytes);
+        file_buffer.pos+= null_bytes;
+      }
+      for (start_pos=record+null_bytes, count= huff_counts;
+           count < end_count ;
+           count++)
+      {
+        end_pos=start_pos+(field_length=count->field_length);
+        tree=count->tree;
+
+        DBUG_PRINT("fields", ("column: %3lu  type: %2u  pack: %2u  zero: %4u  "
+                              "lbits: %2u  tree: %2u  length: %4u",
+                              (ulong) (count - huff_counts + 1),
+                              count->field_type,
+                              count->pack_type, count->max_zero_fill,
+                              count->length_bits, count->tree->tree_number,
+                              count->field_length));
+
+        /* Check if the column contains spaces only. */
+        if (count->pack_type & PACK_TYPE_SPACE_FIELDS)
+        {
+          /* Test the bound first so that *end_pos is never read. */
+          for (pos=start_pos ; pos < end_pos && *pos == ' ' ; pos++) ;
+          if (pos == end_pos)
+          {
+            DBUG_PRINT("fields",
+                       ("PACK_TYPE_SPACE_FIELDS spaces only, bits:  1"));
+            DBUG_PRINT("fields", ("---"));
+            write_bits(1,1);
+            start_pos=end_pos;
+            continue;
+          }
+          DBUG_PRINT("fields",
+                     ("PACK_TYPE_SPACE_FIELDS not only spaces, bits:  1"));
+          write_bits(0,1);
+        }
+        end_pos-=count->max_zero_fill;
+        field_length-=count->max_zero_fill;
+
+        switch (count->field_type) {
+        case FIELD_SKIP_ZERO:
+          if (!memcmp(start_pos, zero_string, field_length))
+          {
+            DBUG_PRINT("fields", ("FIELD_SKIP_ZERO zeroes only, bits:  1"));
+            write_bits(1,1);
+            start_pos=end_pos;
+            break;
+          }
+          DBUG_PRINT("fields", ("FIELD_SKIP_ZERO not only zeroes, bits:  1"));
+          write_bits(0,1);
+          /* Fall through */
+        case FIELD_NORMAL:
+          DBUG_PRINT("fields", ("FIELD_NORMAL %lu bytes",
+                                (ulong) (end_pos - start_pos)));
+          for ( ; start_pos < end_pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          break;
+        case FIELD_SKIP_ENDSPACE:
+          for (pos=end_pos ; pos > start_pos && pos[-1] == ' ' ; pos--) ;
+          length= (ulong) (end_pos - pos);
+          if (count->pack_type & PACK_TYPE_SELECTED)
+          {
+            if (length > count->min_space)
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE more than min_space, bits:  1"));
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+                          length, field_length, count->length_bits));
+              write_bits(1,1);
+              write_bits(length,count->length_bits);
+            }
+            else
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_ENDSPACE not more than min_space, "
+                          "bits:  1"));
+              write_bits(0,1);
+              pos=end_pos;
+            }
+          }
+          else
+          {
+            DBUG_PRINT("fields",
+                       ("FIELD_SKIP_ENDSPACE skip %lu/%u bytes, bits: %2u",
+                        length, field_length, count->length_bits));
+            write_bits(length,count->length_bits);
+          }
+          /* Encode all significant bytes. */
+          DBUG_PRINT("fields", ("FIELD_SKIP_ENDSPACE %lu bytes",
+                                (ulong) (pos - start_pos)));
+          for ( ; start_pos < pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          start_pos=end_pos;
+          break;
+        case FIELD_SKIP_PRESPACE:
+          for (pos=start_pos ; pos < end_pos && pos[0] == ' ' ; pos++) ;
+          length= (ulong) (pos - start_pos);
+          if (count->pack_type & PACK_TYPE_SELECTED)
+          {
+            if (length > count->min_space)
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE more than min_space, bits:  1"));
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+                          length, field_length, count->length_bits));
+              write_bits(1,1);
+              write_bits(length,count->length_bits);
+            }
+            else
+            {
+              DBUG_PRINT("fields",
+                         ("FIELD_SKIP_PRESPACE not more than min_space, "
+                          "bits:  1"));
+              pos=start_pos;
+              write_bits(0,1);
+            }
+          }
+          else
+          {
+            DBUG_PRINT("fields",
+                       ("FIELD_SKIP_PRESPACE skip %lu/%u bytes, bits: %2u",
+                        length, field_length, count->length_bits));
+            write_bits(length,count->length_bits);
+          }
+          /* Encode all significant bytes (those from 'pos' onwards). */
+          DBUG_PRINT("fields", ("FIELD_SKIP_PRESPACE %lu bytes",
+                                (ulong) (end_pos - pos)));
+          for (start_pos=pos ; start_pos < end_pos ; start_pos++)
+          {
+            DBUG_PRINT("fields",
+                       ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
+                        (uchar) *start_pos,
+                        hexdigits(tree->code[(uchar) *start_pos]),
+                        (uint) tree->code_len[(uchar) *start_pos],
+                        bindigits(tree->code[(uchar) *start_pos],
+                                  (uint) tree->code_len[(uchar) *start_pos])));
+            write_bits(tree->code[(uchar) *start_pos],
+                       (uint) tree->code_len[(uchar) *start_pos]);
+          }
+          break;
+        case FIELD_CONSTANT:
+        case FIELD_ZERO:
+        case FIELD_CHECK:
+          DBUG_PRINT("fields", ("FIELD_CONSTANT/ZERO/CHECK"));
+          start_pos=end_pos;
+          break;
+        case FIELD_INTERVALL:
+          global_count=count;
+          pos=(uchar*) tree_search(&count->int_tree, start_pos,
+                                  count->int_tree.custom_arg);
+          intervall=(uint) (pos - count->tree_buff)/field_length;
+          DBUG_PRINT("fields", ("FIELD_INTERVALL"));
+          DBUG_PRINT("fields", ("index: %4u code: 0x%s  bits: %2u",
+                                intervall, hexdigits(tree->code[intervall]),
+                                (uint) tree->code_len[intervall]));
+          write_bits(tree->code[intervall],(uint) tree->code_len[intervall]);
+          start_pos=end_pos;
+          break;
+        case FIELD_BLOB:
+        {
+          ulong blob_length= _ma_calc_blob_length(field_length-
+                                                 portable_sizeof_char_ptr,
+                                                 start_pos);
+          /* Empty blobs are encoded with a single 1 bit. */
+          if (!blob_length)
+          {
+            DBUG_PRINT("fields", ("FIELD_BLOB empty, bits:  1"));
+            write_bits(1,1);
+          }
+          else
+          {
+            uchar *blob,*blob_end;
+            DBUG_PRINT("fields", ("FIELD_BLOB not empty, bits:  1"));
+            write_bits(0,1);
+            /* Write the blob length. */
+            DBUG_PRINT("fields", ("FIELD_BLOB %lu bytes, bits: %2u",
+                                  blob_length, count->length_bits));
+            write_bits(blob_length,count->length_bits);
+            /* The record stores a pointer to the blob data behind the length. */
+            memcpy_fixed(&blob,end_pos-portable_sizeof_char_ptr,
+                         sizeof(char*));
+            blob_end=blob+blob_length;
+            /* Encode the blob bytes. */
+            for ( ; blob < blob_end ; blob++)
+            {
+              DBUG_PRINT("fields",
+                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
+                          (uchar) *blob, hexdigits(tree->code[(uchar) *blob]),
+                          (uint) tree->code_len[(uchar) *blob],
+                          bindigits(tree->code[(uchar) *blob],
+                                    (uint)tree->code_len[(uchar) *blob])));
+              write_bits(tree->code[(uchar) *blob],
+                         (uint) tree->code_len[(uchar) *blob]);
+            }
+            tot_blob_length+=blob_length;
+          }
+          start_pos= end_pos;
+          break;
+        }
+        case FIELD_VARCHAR:
+        {
+          uint var_pack_length= HA_VARCHAR_PACKLENGTH(count->field_length-1);
+          ulong col_length= (var_pack_length == 1 ?
+                             (uint) *(uchar*) start_pos :
+                             uint2korr(start_pos));
+          /* Empty varchar are encoded with a single 1 bit. */
+          if (!col_length)
+          {
+            DBUG_PRINT("fields", ("FIELD_VARCHAR empty, bits:  1"));
+            write_bits(1,1);                    /* Empty varchar */
+          }
+          else
+          {
+            uchar *end= start_pos + var_pack_length + col_length;
+            DBUG_PRINT("fields", ("FIELD_VARCHAR not empty, bits:  1"));
+            write_bits(0,1);
+            /* Write the varchar length. */
+            DBUG_PRINT("fields", ("FIELD_VARCHAR %lu bytes, bits: %2u",
+                                  col_length, count->length_bits));
+            write_bits(col_length,count->length_bits);
+            /* Encode the varchar bytes. */
+            for (start_pos+= var_pack_length ; start_pos < end ; start_pos++)
+            {
+              DBUG_PRINT("fields",
+                         ("value: 0x%02x  code: 0x%s  bits: %2u  bin: %s",
+                          (uchar) *start_pos,
+                          hexdigits(tree->code[(uchar) *start_pos]),
+                          (uint) tree->code_len[(uchar) *start_pos],
+                          bindigits(tree->code[(uchar) *start_pos],
+                                    (uint)tree->code_len[(uchar) *start_pos])));
+              write_bits(tree->code[(uchar) *start_pos],
+                         (uint) tree->code_len[(uchar) *start_pos]);
+            }
+          }
+          start_pos= end_pos;
+          break;
+        }
+        case FIELD_LAST:
+        case FIELD_enum_val_count:
+          abort();                              /* Impossible */
+        }
+        /* Step over the zero-fill bytes that were cut off from end_pos. */
+        start_pos+=count->max_zero_fill;
+        DBUG_PRINT("fields", ("---"));
+      }
+      flush_bits();
+      length=(ulong) (file_buffer.pos - record_pos) - max_pack_length;
+      pack_length= _ma_save_pack_length(pack_version, record_pos, length);
+      if (pack_blob_length)
+        pack_length+= _ma_save_pack_length(pack_version,
+                                           record_pos + pack_length,
+                                           tot_blob_length);
+      DBUG_PRINT("fields", ("record: %lu  length: %lu  blob-length: %lu  "
+                            "length-bytes: %lu", (ulong) record_count, length,
+                            tot_blob_length, pack_length));
+      DBUG_PRINT("fields", ("==="));
+
+      /* Correct file buffer if the header was smaller */
+      if (pack_length != max_pack_length)
+      {
+        bmove(record_pos+pack_length,record_pos+max_pack_length,length);
+        file_buffer.pos-= (max_pack_length-pack_length);
+      }
+      if (length < (ulong) min_record_length)
+        min_record_length=(uint) length;
+      if (length > (ulong) max_record_length)
+        max_record_length=(uint) length;
+      record_count++;
+      if (write_loop && record_count % WRITE_COUNT == 0)
+      {
+        VOID(printf("%lu\r", (ulong) record_count));
+        VOID(fflush(stdout));
+      }
+    }
+    else if (error != HA_ERR_RECORD_DELETED)
+      break;
+  }
+  if (error == HA_ERR_END_OF_FILE)
+    error=0;
+  else
+  {
+    VOID(fprintf(stderr, "%s: Got error %d reading records\n",
+                 my_progname, error));
+  }
+  if (verbose >= 2)
+    VOID(printf("wrote %s records.\n", llstr((longlong) record_count, llbuf)));
+
+  my_afree(record);
+  mrg->ref_length=max_pack_length;
+  mrg->min_pack_length=max_record_length ? min_record_length : 0;
+  mrg->max_pack_length=max_record_length;
+  DBUG_RETURN(error || error_on_write || flush_buffer(~(ulong) 0));
+}
+
+
+/*
+  Build the name of the temporary data file from 'old_name' into 'new_name'
+  by applying the DATA_TMP_EXT extension. Returns what fn_format() returns.
+*/
+static char *make_new_name(char *new_name, char *old_name)
+{
+  /* NOTE(review): 2+4 is presumably MY_REPLACE_EXT | MY_UNPACK_FILENAME. */
+  const int format_flags= 2+4;
+
+  return fn_format(new_name, old_name, "", DATA_TMP_EXT, format_flags);
+}
+
+/*
+  Build the backup file name from 'old_name' into 'new_name' by applying
+  the OLD_EXT extension. Returns what fn_format() returns.
+*/
+static char *make_old_name(char *new_name, char *old_name)
+{
+  /* NOTE(review): 2+4 is presumably MY_REPLACE_EXT | MY_UNPACK_FILENAME. */
+  const int format_flags= 2+4;
+
+  return fn_format(new_name, old_name, "", OLD_EXT, format_flags);
+}
+
+	/* Routines for the bit-writing buffer */
+
+/*
+  Set up the global file_buffer for 'file'.
+
+  The buffer is ALIGN_SIZE(RECORD_CACHE_SIZE) bytes; 'end' is placed 8 bytes
+  before the real end. For a read buffer the position starts at 'end' with
+  no bits available; for a write buffer it starts at the beginning with
+  BITS_SAVED free bits. Also clears error_on_write.
+
+  NOTE(review): the my_malloc() result is not checked here.
+*/
+static void init_file_buffer(File file, pbool read_buffer)
+{
+  error_on_write= 0;
+
+  file_buffer.file= file;
+  file_buffer.pos_in_file= 0;
+  file_buffer.bitbucket= 0;
+  file_buffer.buffer= (uchar*) my_malloc(ALIGN_SIZE(RECORD_CACHE_SIZE),
+                                         MYF(MY_WME));
+  file_buffer.end= file_buffer.buffer + ALIGN_SIZE(RECORD_CACHE_SIZE) - 8;
+
+  file_buffer.pos=  read_buffer ? file_buffer.end : file_buffer.buffer;
+  file_buffer.bits= read_buffer ? 0 : BITS_SAVED;
+}
+
+
+/*
+  Make sure there is room for 'neaded_length' more bytes in file_buffer.
+
+  If more than 'neaded_length' bytes are free before file_buffer.end,
+  nothing is done. Otherwise the buffered bytes are written to
+  file_buffer.file and the buffer is restarted; if the whole buffer is
+  still smaller than 'neaded_length' it is enlarged.
+
+  Passing ~(ulong) 0 always flushes and never enlarges the buffer.
+
+  RETURN
+    0   ok (also in test_only mode, where nothing is written)
+    1   write error (error_on_write is set) or realloc failure
+*/
+static int flush_buffer(ulong neaded_length)
+{
+  ulong length;
+
+  /*
+    file_buffer.end is 8 bytes lower than the real end of the buffer.
+    This is done so that the end-of-buffer condition does not need to be
+    checked for every uchar (see write_bits()). Consequently,
+    file_buffer.pos can become greater than file_buffer.end. The
+    algorithms in the other functions ensure that there will never be
+    more than 8 bytes written to the buffer without an end-of-buffer
+    check. So the buffer cannot be overrun. But we need to check for the
+    near-to-buffer-end condition to avoid a negative result, which is
+    cast to unsigned and thus becomes giant.
+  */
+  if ((file_buffer.pos < file_buffer.end) &&
+      ((ulong) (file_buffer.end - file_buffer.pos) > neaded_length))
+    return 0;
+  /* Account for the buffered bytes and restart at the buffer beginning. */
+  length=(ulong) (file_buffer.pos-file_buffer.buffer);
+  file_buffer.pos=file_buffer.buffer;
+  file_buffer.pos_in_file+=length;
+  /*
+    NOTE(review): in test_only mode we return before the enlargement
+    below, so the buffer keeps its size even if neaded_length exceeds it.
+  */
+  if (test_only)
+    return 0;
+  /* A previous write error is sticky: do not try to write again. */
+  if (error_on_write|| my_write(file_buffer.file,
+				(const uchar*) file_buffer.buffer,
+				length,
+				MYF(MY_WME | MY_NABP | MY_WAIT_IF_FULL)))
+  {
+    error_on_write=1;
+    return 1;
+  }
+
+  /* Enlarge the buffer if even an empty buffer could not hold the request. */
+  if (neaded_length != ~(ulong) 0 &&
+      (ulong) (file_buffer.end-file_buffer.buffer) < neaded_length)
+  {
+    uchar *tmp;
+    neaded_length+=256;				/* some margin */
+    tmp= (uchar*) my_realloc(file_buffer.buffer, neaded_length,MYF(MY_WME));
+    if (!tmp)
+      return 1;
+    /* Re-base pos (currently equal to buffer) onto the new allocation. */
+    file_buffer.pos=    (tmp + (ulong) (file_buffer.pos - file_buffer.buffer));
+    file_buffer.buffer= tmp;
+    file_buffer.end=    (tmp+neaded_length-8);
+  }
+  return 0;
+}
+
+
+/* Free the memory of the global file_buffer. Does not flush or close. */
+static void end_file_buffer(void)
+{
+  my_free(file_buffer.buffer, MYF(0));
+}
+
+	/* Output the low 'bits' bits of 'value' */
+
+/*
+  Append the low 'bits' bits of 'value' to the bit stream.
+
+  file_buffer.bitbucket collects bits from the most significant end;
+  file_buffer.bits is the number of bits still free in it. When 'value'
+  does not fit, the bucket is completed with the upper part of 'value',
+  stored in file_buffer most significant byte first, and a new bucket is
+  started with the remaining low bits.
+
+  'value' must not have bits set above 'bits' (asserted in debug builds).
+*/
+static void write_bits(register ulonglong value, register uint bits)
+{
+  DBUG_ASSERT(((bits < 8 * sizeof(value)) && ! (value >> bits)) ||
+              (bits == 8 * sizeof(value)));
+
+  if ((file_buffer.bits-= (int) bits) >= 0)
+  {
+    /* Everything fits: place the value just below the bits already used. */
+    file_buffer.bitbucket|= value << file_buffer.bits;
+  }
+  else
+  {
+    reg3 ulonglong bit_buffer;
+    /* From here on 'bits' is the number of bits that did NOT fit. */
+    bits= (uint) -file_buffer.bits;
+    /* Avoid shifting by the full width of 'value' (undefined in C). */
+    bit_buffer= (file_buffer.bitbucket |
+                 ((bits != 8 * sizeof(value)) ? (value >> bits) : 0));
+#if BITS_SAVED == 64
+    *file_buffer.pos++= (uchar) (bit_buffer >> 56);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 48);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 40);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 32);
+#endif
+    *file_buffer.pos++= (uchar) (bit_buffer >> 24);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 16);
+    *file_buffer.pos++= (uchar) (bit_buffer >> 8);
+    *file_buffer.pos++= (uchar) (bit_buffer);
+
+    /* Keep only the bits that still have to be stored. */
+    if (bits != 8 * sizeof(value))
+      value&= (((ulonglong) 1) << bits) - 1;
+    if (file_buffer.pos >= file_buffer.end)
+      VOID(flush_buffer(~ (ulong) 0));
+    /* Start a new bucket with the leftover bits at its top. */
+    file_buffer.bits=(int) (BITS_SAVED - bits);
+    file_buffer.bitbucket= value << (BITS_SAVED - bits);
+  }
+  return;
+}
+
+	/* Flush bits in bit_buffer to buffer */
+
+/*
+  Store the used part of the bit bucket in file_buffer, rounded up to
+  whole bytes, and start an empty bucket. Unused bits of the last byte
+  are zero because the bucket is cleared whenever it is restarted.
+*/
+static void flush_bits(void)
+{
+  int bits;
+  ulonglong bit_buffer;
+
+  /* Free bits rounded down to whole bytes: these bytes are not written. */
+  bits= file_buffer.bits & ~7;
+  bit_buffer= file_buffer.bitbucket >> bits;
+  /* Number of bits (a multiple of 8) that must be written. */
+  bits= BITS_SAVED - bits;
+  while (bits > 0)
+  {
+    bits-= 8;
+    *file_buffer.pos++= (uchar) (bit_buffer >> bits);
+  }
+  if (file_buffer.pos >= file_buffer.end)
+    VOID(flush_buffer(~ (ulong) 0));
+  file_buffer.bits= BITS_SAVED;
+  file_buffer.bitbucket= 0;
+}
+
+
+/****************************************************************************
+** functions to handle the joined files
+****************************************************************************/
+
+/*
+  Update the state of the table share to describe the packed data file
+  and write it to the index file.
+
+  SYNOPSIS
+    save_state()
+    isam_file                   The table whose share is updated.
+    mrg                         Supplies the record count.
+    new_length                  Length of the packed data file.
+    crc                         Checksum to store in the state.
+
+  DESCRIPTION
+    Marks the table as compressed and read-only, resets the delete
+    information, marks all keys as not active, resets all key roots,
+    cuts the index file at base.keystart and requests a later index
+    rebuild (isamchk_neaded) if the table has keys.
+
+  RETURN
+    The result of _ma_state_info_write_sub().
+*/
+static int save_state(MARIA_HA *isam_file,PACK_MRG_INFO *mrg,
+                      my_off_t new_length,
+		      ha_checksum crc)
+{
+  MARIA_SHARE *share=isam_file->s;
+  uint options=mi_uint2korr(share->state.header.options);
+  uint key;
+  DBUG_ENTER("save_state");
+
+  options|= HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA;
+  mi_int2store(share->state.header.options,options);
+  /* Save the original file type if we have to undo the packing later */
+  share->state.header.org_data_file_type= share->state.header.data_file_type;
+  share->state.header.data_file_type= COMPRESSED_RECORD;
+
+  share->state.state.data_file_length=new_length;
+  share->state.state.del=0;
+  share->state.state.empty=0;
+  share->state.dellink= HA_OFFSET_ERROR;
+  share->state.split=(ha_rows) mrg->records;
+  share->state.version=(ulong) time((time_t*) 0);
+  if (share->base.born_transactional)
+    share->state.create_rename_lsn= share->state.is_of_horizon=
+      share->state.skip_redo_lsn= LSN_NEEDS_NEW_STATE_LSNS;
+  if (! maria_is_all_keys_active(share->state.key_map, share->base.keys))
+  {
+    /*
+      Some indexes are disabled, cannot use current key_file_length value
+      as an estimate of upper bound of index file size. Use packed data file
+      size instead.
+    */
+    share->state.state.key_file_length= new_length;
+  }
+  /*
+    If there are no disabled indexes, keep key_file_length value from
+    original file so "aria_chk -rq" can use this value (this is necessary
+    because index size cannot be easily calculated for fulltext keys)
+  */
+  maria_clear_all_keys_active(share->state.key_map);
+  for (key=0 ; key < share->base.keys ; key++)
+    share->state.key_root[key]= HA_OFFSET_ERROR;
+  share->state.key_del= HA_OFFSET_ERROR;
+  share->state.state.checksum= crc;     /* Save crc in file */
+  share->changed=1;			/* Force write of header */
+  share->state.open_count=0;
+  share->global_changed=0;
+  /* Cut the index file at keystart; the result is deliberately ignored. */
+  VOID(my_chsize(share->kfile.file, share->base.keystart, 0, MYF(0)));
+  if (share->base.keys)
+    isamchk_neaded=1;
+  DBUG_RETURN(_ma_state_info_write_sub(share->kfile.file,
+                                       &share->state,
+                                       MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                       MA_STATE_INFO_WRITE_FULL_INFO));
+}
+
+
+/*
+  Write a state describing the packed data file to 'file', based on a
+  copy of the state of the first source table.
+
+  SYNOPSIS
+    save_state_mrg()
+    file                        File to write the state to.
+    mrg                         Source tables; supplies the record count.
+    new_length                  Length of the packed data file.
+    crc                         Checksum to store in the state.
+
+  DESCRIPTION
+    Works on a local copy of the share's state: marks it compressed and
+    read-only, resets the delete information, marks all keys as not
+    active and flags the table as changed and not analyzed.
+
+  RETURN
+    The result of _ma_state_info_write_sub().
+*/
+static int save_state_mrg(File file,PACK_MRG_INFO *mrg,my_off_t new_length,
+			  ha_checksum crc)
+{
+  MARIA_STATE_INFO state;
+  MARIA_HA *isam_file=mrg->file[0];
+  uint options;
+  DBUG_ENTER("save_state_mrg");
+
+  state= isam_file->s->state;
+  options= (mi_uint2korr(state.header.options) | HA_OPTION_COMPRESS_RECORD |
+	    HA_OPTION_READ_ONLY_DATA);
+  mi_int2store(state.header.options,options);
+  /* Save the original file type if we have to undo the packing later */
+  state.header.org_data_file_type= state.header.data_file_type;
+  state.header.data_file_type= COMPRESSED_RECORD;
+
+  state.state.data_file_length=new_length;
+  state.state.del=0;
+  state.state.empty=0;
+  state.state.records=state.split=(ha_rows) mrg->records;
+  state.create_rename_lsn= state.is_of_horizon= state.skip_redo_lsn=
+    LSN_NEEDS_NEW_STATE_LSNS;
+
+  /* See comment above in save_state about key_file_length handling. */
+  /*
+    NOTE(review): this updates the share's state, not the local copy
+    'state' taken above, so the value written below is unaffected.
+    Confirm whether state.state.key_file_length was intended.
+  */
+  if (mrg->src_file_has_indexes_disabled)
+  {
+    isam_file->s->state.state.key_file_length=
+      max(isam_file->s->state.state.key_file_length, new_length);
+  }
+  state.dellink= HA_OFFSET_ERROR;
+  state.version=(ulong) time((time_t*) 0);
+  maria_clear_all_keys_active(state.key_map);
+  state.state.checksum=crc;
+  if (isam_file->s->base.keys)
+    isamchk_neaded=1;
+  state.changed=STATE_CHANGED | STATE_NOT_ANALYZED; /* Force check of table */
+  DBUG_RETURN (_ma_state_info_write_sub(file, &state,
+                                        MA_STATE_INFO_WRITE_DONT_MOVE_OFFSET |
+                                        MA_STATE_INFO_WRITE_FULL_INFO));
+}
+
+
+/* reset for mrg_rrnd */
+
+static void mrg_reset(PACK_MRG_INFO *mrg)
+{
+  /* Nothing to do if no scan has been started. */
+  if (!mrg->current)
+    return;
+
+  /* Drop the cache of the table being scanned and forget the position. */
+  maria_extra(*mrg->current, HA_EXTRA_NO_CACHE, 0);
+  mrg->current= 0;
+}
+
+/*
+  Read the next record from the merged source tables.
+
+  SYNOPSIS
+    mrg_rrnd()
+    info                        The source tables and the scan position.
+    buf                         Buffer that receives the record.
+
+  DESCRIPTION
+    On the first call (info->current not set) the scan is started on the
+    first table. When a table is exhausted its scan is ended and the scan
+    continues with the next table.
+
+  RETURN
+    0                           A record was read into 'buf'.
+    HA_ERR_END_OF_FILE          All tables have been read.
+    other                       Error from maria_scan_init() or maria_scan().
+*/
+static int mrg_rrnd(PACK_MRG_INFO *info,uchar *buf)
+{
+  int error;
+  MARIA_HA *isam_info;
+
+  if (!info->current)
+  {
+    isam_info= *(info->current=info->file);
+    info->end=info->current+info->count;
+    maria_reset(isam_info);
+    maria_extra(isam_info, HA_EXTRA_CACHE, 0);
+    if ((error= maria_scan_init(isam_info)))
+      return(error);
+  }
+  else
+    isam_info= *info->current;
+
+  for (;;)
+  {
+    /* Return on success and on every error except end of this table. */
+    if (!(error= maria_scan(isam_info, buf)) ||
+        error != HA_ERR_END_OF_FILE)
+      return (error);
+    maria_scan_end(isam_info);
+    maria_extra(isam_info,HA_EXTRA_NO_CACHE, 0);
+    if (info->current+1 == info->end)
+      return(HA_ERR_END_OF_FILE);
+    /* Continue with the next table. */
+    info->current++;
+    isam_info= *info->current;
+    maria_reset(isam_info);
+    maria_extra(isam_info,HA_EXTRA_CACHE, 0);
+    if ((error= maria_scan_init(isam_info)))
+      return(error);
+  }
+}
+
+
+/*
+  Close all source tables and free the table array if it was allocated.
+  Returns the OR-ed results of all maria_close() calls (0 if all succeeded).
+*/
+static int mrg_close(PACK_MRG_INFO *mrg)
+{
+  int result= 0;
+  uint file_nr= 0;
+  DBUG_ENTER("mrg_close");
+
+  /* Close every table even if an earlier close failed. */
+  while (file_nr < mrg->count)
+  {
+    result|= maria_close(mrg->file[file_nr]);
+    file_nr++;
+  }
+  if (mrg->free_file)
+    my_free(mrg->file, MYF(0));
+  DBUG_RETURN(result);
+}
+
+
+#if !defined(DBUG_OFF)
+/*
+  Fake the counts to get big Huffman codes.
+
+  SYNOPSIS
+    fakebigcodes()
+    huff_counts                 A pointer to the counts array.
+    end_count                   A pointer past the counts array.
+
+  DESCRIPTION
+
+    Huffman coding works by removing the two least frequent values from
+    the list of values and add a new value with the sum of their
+    incidences in a loop until only one value is left. Every time a
+    value is reused for a new value, it gets one more bit for its
+    encoding. Hence, the least frequent values get the longest codes.
+
+    To get a maximum code length for a value, two of the values must
+    have an incidence of 1. As their sum is 2, the next infrequent value
+    must have at least an incidence of 2, then 4, 8, 16 and so on. This
+    means that one needs 2**n bytes (values) for a code length of n
+    bits. However, using more distinct values forces the use of longer
+    codes, or reaching the code length with less total bytes (values).
+
+    To get 64(32)-bit codes, I sort the counts by decreasing incidence.
+    I assign counts of 1 to the two most frequent values, a count of 2
+    for the next one, then 4, 8, and so on until 2**64-1(2**30-1). All
+    the remaining values get 1. That way every possible uchar has an
+    assigned code, though not all codes are used if not all uchar values
+    are present in the column.
+
+    This strategy would work with distinct column values too, but
+    requires that at least 64(32) values are present. To make things
+    easier here, I cancel all distinct column values and force byte
+    compression for all columns.
+
+  RETURN
+    void
+*/
+
+static void fakebigcodes(HUFF_COUNTS *huff_counts, HUFF_COUNTS *end_count)
+{
+  HUFF_COUNTS   *count;
+  my_off_t      *cur_count_p;
+  my_off_t      *end_count_p;
+  my_off_t      **cur_sort_p;
+  my_off_t      **end_sort_p;
+  my_off_t      *sort_counts[256];
+  my_off_t      total;
+  DBUG_ENTER("fakebigcodes");
+
+  for (count= huff_counts; count < end_count; count++)
+  {
+    /*
+      Remove distinct column values. This must be done for the column
+      being processed ('count'), not only for the first column of the
+      array, so that byte compression is forced for every column.
+    */
+    if (count->tree_buff)
+    {
+      my_free(count->tree_buff, MYF(0));
+      delete_tree(&count->int_tree);
+      count->tree_buff= NULL;
+      DBUG_PRINT("fakebigcodes", ("freed distinct column values"));
+    }
+
+    /*
+      Sort counts by decreasing incidence. sort_counts holds pointers
+      into count->counts, so the faked values assigned below are written
+      directly into the column's counts.
+    */
+    cur_count_p= count->counts;
+    end_count_p= cur_count_p + 256;
+    cur_sort_p= sort_counts;
+    while (cur_count_p < end_count_p)
+      *(cur_sort_p++)= cur_count_p++;
+    (void) my_qsort(sort_counts, 256, sizeof(my_off_t*), (qsort_cmp) fakecmp);
+
+    /*
+      Assign faked counts.
+    */
+    cur_sort_p= sort_counts;
+#if SIZEOF_LONG_LONG > 4
+    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 1;
+#else
+    end_sort_p= sort_counts + 8 * sizeof(ulonglong) - 2;
+#endif
+    /* Most frequent value gets a faked count of 1. */
+    **(cur_sort_p++)= 1;
+    total= 1;
+    /* The following values get 1, 2, 4, 8, ... */
+    while (cur_sort_p < end_sort_p)
+    {
+      **(cur_sort_p++)= total;
+      total<<= 1;
+    }
+    /* Set the last value. */
+    **(cur_sort_p++)= --total;
+    /*
+      Set the remaining counts.
+    */
+    end_sort_p= sort_counts + 256;
+    while (cur_sort_p < end_sort_p)
+      **(cur_sort_p++)= 1;
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Compare two counts for reverse sorting.
+
+  SYNOPSIS
+    fakecmp()
+    count1              One count.
+    count2              Another count.
+
+  RETURN
+    1                   count1  < count2
+    0                   count1 == count2
+    -1                  count1 >  count2
+*/
+
+static int fakecmp(my_off_t **count1, my_off_t **count2)
+{
+  const my_off_t first= **count1;
+  const my_off_t second= **count2;
+
+  /* Reverse order: the bigger count sorts first. */
+  if (first < second)
+    return 1;
+  if (first > second)
+    return -1;
+  return 0;
+}
+#endif
diff --git a/storage/maria/maria_read_log.c b/storage/maria/maria_read_log.c
new file mode 100644
index 00000000000..de45eb0bcb6
--- /dev/null
+++ b/storage/maria/maria_read_log.c
@@ -0,0 +1,308 @@
+/* Copyright (C) 2007 MySQL AB
+   Copyright (C) 2010 Monty Program Ab
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "maria_def.h"
+#include "ma_recovery.h"
+#include <my_getopt.h>
+
+#define LOG_FLAGS 0
+
+static const char *load_default_groups[]= { "aria_read_log",0 };
+static void get_options(int *argc,char * * *argv);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+const char *default_dbug_option= "d:t:O,\\aria_read_log.trace";
+#else
+const char *default_dbug_option= "d:t:o,/tmp/aria_read_log.trace";
+#endif
+#endif /* DBUG_OFF */
+static my_bool opt_display_only, opt_apply, opt_apply_undo, opt_silent;
+static my_bool opt_check, opt_start_from_checkpoint; /* all GET_BOOL options */
+static const char *opt_tmpdir;
+static ulong opt_page_buffer_size;
+static ulonglong opt_start_from_lsn, opt_end_lsn;    /* both GET_ULL options */
+static MY_TMPDIR maria_chk_tmpdir;
+
+
+int main(int argc, char **argv)     /* exit status: 0 on success, 1 via err: */
+{
+  LSN lsn;
+  char **default_argv;
+  uint warnings_count;
+  MY_INIT(argv[0]);
+  /* Defaults come from the "aria_read_log" group, then the command line. */
+  load_defaults("my", load_default_groups, &argc, &argv);
+  default_argv= argv;
+  maria_data_root= (char *)".";
+  get_options(&argc, &argv);
+
+  maria_in_recovery= TRUE;
+
+  if (maria_init())
+  {
+    fprintf(stderr, "Can't init Aria engine (%d)\n", errno);
+    goto err;
+  }
+  maria_block_size= 0;                          /* Use block size from file */
+  /* we don't want to create a control file, it MUST exist */
+  if (ma_control_file_open(FALSE, TRUE))
+  {
+    fprintf(stderr, "Can't open control file (%d)\n", errno);
+    goto err;
+  }
+  if (last_logno == FILENO_IMPOSSIBLE)
+  {
+    fprintf(stderr, "Can't find any log\n");
+    goto err;
+  }
+  if (init_pagecache(maria_pagecache, opt_page_buffer_size, 0, 0,
+                     maria_block_size, MY_WME) == 0)
+  {
+    fprintf(stderr, "Got error in init_pagecache() (errno: %d)\n", errno);
+    goto err;
+  }
+  /*
+    If log handler does not find the "last_logno" log it will return error,
+    which is good.
+    But if it finds a log and this log was crashed, it will create a new log,
+    which is useless. TODO: start log handler in read-only mode.
+  */
+  if (init_pagecache(maria_log_pagecache,
+                     TRANSLOG_PAGECACHE_SIZE, 0, 0,
+                     TRANSLOG_PAGE_SIZE, MY_WME) == 0 ||
+      translog_init(maria_data_root, TRANSLOG_FILE_SIZE,
+                    0, 0, maria_log_pagecache, TRANSLOG_DEFAULT_FLAGS,
+                    opt_display_only))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    goto err;
+  }
+
+  if (opt_display_only)
+    printf("You are using --display-only, NOTHING will be written to disk\n");
+  /* Pick the LSN at which reading of the log starts. */
+  lsn= translog_first_lsn_in_log();
+  if (lsn == LSN_ERROR)
+  {
+    fprintf(stderr, "Opening transaction log failed\n");
+    goto err;                          /* a failure must not exit with 0 */
+  }
+  if (lsn == LSN_IMPOSSIBLE)
+  {
+     fprintf(stdout, "The transaction log is empty\n");
+  }
+  if (opt_start_from_checkpoint && !opt_start_from_lsn &&
+      last_checkpoint_lsn != LSN_IMPOSSIBLE)
+  {
+    lsn= LSN_IMPOSSIBLE;             /* LSN set in maria_apply_log() */
+    fprintf(stdout, "Starting from checkpoint (%lu,0x%lx)\n",
+            LSN_IN_PARTS(last_checkpoint_lsn));
+  }
+  else
+    fprintf(stdout, "The transaction log starts from lsn (%lu,0x%lx)\n",
+            LSN_IN_PARTS(lsn));
+
+  if (opt_start_from_lsn)
+  {
+    if (opt_start_from_lsn < (ulonglong) lsn)
+    {
+      fprintf(stderr, "start_from_lsn is too small. Aborting\n");
+      maria_end();
+      goto err;
+    }
+    lsn= (LSN) opt_start_from_lsn;
+    fprintf(stdout, "Starting reading log from lsn (%lu,0x%lx)\n",
+            LSN_IN_PARTS(lsn));
+  }
+
+  if (opt_end_lsn != LSN_IMPOSSIBLE)
+  {
+    /* We can't apply undo if we use end_lsn */
+    opt_apply_undo= 0;
+  }
+  /* Mode precedence: --apply, else --check, else header display only. */
+  fprintf(stdout, "TRACE of the last aria_read_log\n");
+  if (maria_apply_log(lsn, opt_end_lsn, opt_apply ?  MARIA_LOG_APPLY :
+                      (opt_check ? MARIA_LOG_CHECK :
+                       MARIA_LOG_DISPLAY_HEADER), opt_silent ? NULL : stdout,
+                      opt_apply_undo, FALSE, FALSE, &warnings_count))
+    goto err;
+  if (warnings_count == 0)
+    fprintf(stdout, "%s: SUCCESS\n", my_progname_short);
+  else
+    fprintf(stdout, "%s: DOUBTFUL (%u warnings, check previous output)\n",
+            my_progname_short, warnings_count);
+
+  /* Normal termination: shut the engine down and free what was allocated. */
+  maria_end();
+  free_tmpdir(&maria_chk_tmpdir);
+  free_defaults(default_argv);
+  my_end(0);
+  exit(0);
+  return 0;				/* No compiler warning */
+
+err:
+  /* don't touch anything more, in case we hit a bug */
+  fprintf(stderr, "%s: FAILED\n", my_progname_short);
+  free_tmpdir(&maria_chk_tmpdir);
+  free_defaults(default_argv);
+  exit(1);
+}
+
+
+#include "ma_check_standalone.h"
+
+enum options_mc {       /* ids for options that have no short letter */
+  OPT_CHARSETS_DIR=256  /* first value outside the single-character range */
+};
+
+static struct my_option my_long_options[] = /* passed to handle_options() */
+{
+  {"apply", 'a',
+   "Apply log to tables: modifies tables! you should make a backup first! "
+   " Displays a lot of information if not run with --silent",
+   (uchar **) &opt_apply, (uchar **) &opt_apply, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"character-sets-dir", OPT_CHARSETS_DIR,
+   "Directory where character sets are.",
+   (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"check", 'c',
+   "if --display-only, check if record is fully readable (for debugging)",
+   (uchar **) &opt_check, (uchar **) &opt_check, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"display-only", 'd', "display brief info read from records' header",
+   &opt_display_only, &opt_display_only, 0, GET_BOOL,
+   NO_ARG,0, 0, 0, 0, 0, 0},
+  {"aria-log-dir-path", 'l',
+    "Path to the directory where to store transactional log",
+    (uchar **) &maria_data_root, (uchar **) &maria_data_root, 0,
+    GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  { "page-buffer-size", 'P', "Size of the page cache used for table pages",
+    &opt_page_buffer_size, &opt_page_buffer_size, 0,
+    GET_ULONG, REQUIRED_ARG, (long) USE_BUFFER_INIT,
+    (long) USE_BUFFER_INIT, (long) ~(ulong) 0, (long) MALLOC_OVERHEAD,
+    (long) IO_SIZE, 0},
+  { "start-from-lsn", 'o', "Start reading log from this lsn",
+    &opt_start_from_lsn, &opt_start_from_lsn,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  {"start-from-checkpoint", 'C', "Start applying from last checkpoint",
+   &opt_start_from_checkpoint, &opt_start_from_checkpoint, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { "end-lsn", 'e', "Stop applying at this lsn. If end-lsn is used, UNDO:s "
+    "will not be applied", &opt_end_lsn, &opt_end_lsn,
+    0, GET_ULL, REQUIRED_ARG, 0, 0, ~(longlong) 0, 0, 0, 0 },
+  {"silent", 's', "Print less information during apply/undo phase",
+   &opt_silent, &opt_silent, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"verbose", 'v', "Print more information during apply/undo phase",
+   &maria_recovery_verbose, &maria_recovery_verbose, 0,
+   GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"tmpdir", 't', "Path for temporary files. Multiple paths can be specified, "
+   "separated by "
+#if defined( __WIN__) || defined(__NETWARE__)
+   "semicolon (;)"
+#else
+   "colon (:)"
+#endif
+   , (char**) &opt_tmpdir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"undo", 'u', "Apply UNDO records to tables. (disable with --disable-undo)",
+   (uchar **) &opt_apply_undo, (uchar **) &opt_apply_undo, 0,
+   GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} /* terminator */
+};
+
+#include <help_start.h>
+
+static void print_version(void)   /* prints name, version, platform to stdout */
+{
+  VOID(printf("%s Ver 1.3 for %s on %s\n",
+              my_progname_short, SYSTEM_TYPE, MACHINE_TYPE));
+  NETWARE_SET_SCREEN_MODE(1);
+}
+
+
+static void usage(void)           /* prints the full --help text to stdout */
+{
+  print_version();
+  puts("Copyright (C) 2007 MySQL AB");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+  /* purpose of the tool, build caveat, then the option list and defaults */
+  puts("Display and apply log records from a Aria transaction log");
+  puts("found in the current directory (for now)");
+#ifndef IDENTICAL_PAGES_AFTER_RECOVERY
+  puts("\nNote: Aria is compiled without -DIDENTICAL_PAGES_AFTER_RECOVERY\n"
+       "which means that the table files are not byte-to-byte identical to\n"
+       "files created during normal execution. This should be ok, except for\n"
+       "test scripts that tries to compare files before and after recovery.");
+#endif
+  VOID(printf("\nUsage: %s OPTIONS\n", my_progname_short));
+  puts("You need to use one of -d or -a");
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+#include <help_end.h>
+
+static my_bool                    /* per-option callback for handle_options() */
+get_one_option(int optid __attribute__((unused)),
+               const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  switch (optid) {                /* other option ids fall through to return */
+  case '?':
+    usage();
+    exit(0);
+  case 'V':
+    print_version();
+    exit(0);
+#ifndef DBUG_OFF
+  case '#':                       /* the only user of 'argument' */
+    DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+    break;
+#endif
+  }
+  return 0;
+}
+
+static void get_options(int *argc,char ***argv)
+{
+  /* Parse options; a non-zero handle_options() result is the exit status. */
+  const int parse_status= handle_options(argc, argv, my_long_options,
+                                         get_one_option);
+  if (parse_status)
+    exit(parse_status);
+  /* UNDO records are only applied when --apply is in effect. */
+  opt_apply_undo= opt_apply ? opt_apply_undo : FALSE;
+  /* Exactly one of --display-only / --apply, and no stray arguments. */
+  if (opt_display_only + opt_apply != 1 || *argc > 0)
+  {
+    usage();
+    exit(1);
+  }
+  if (init_tmpdir(&maria_chk_tmpdir, opt_tmpdir))
+    exit(1);
+  maria_tmpdir= &maria_chk_tmpdir;
+}
diff --git a/storage/maria/maria_rename.sh b/storage/maria/maria_rename.sh
new file mode 100755
index 00000000000..fb20e47e635
--- /dev/null
+++ b/storage/maria/maria_rename.sh
@@ -0,0 +1,17 @@
+#!/bin/sh
+# Rewrites MyISAM identifiers to Maria ones in place, using 'replace'.
+replace myisam maria MYISAM MARIA MyISAM MARIA -- mysql-test/t/*maria*test mysql-test/r/*maria*result
+# File list is spelled out: POSIX sh does not do {a,b} brace expansion.
+FILES=`echo sql/ha_maria.cc sql/ha_maria.h include/maria*h storage/maria/*.c storage/maria/*.h`
+
+replace myisam maria MYISAM MARIA MyISAM MARIA myisam.h maria.h myisamdef.h maria_def.h mi_ maria_ ft_ maria_ft_ "Copyright (C) 2000" "Copyright (C) 2006" MI_ISAMINFO MARIA_INFO MI_CREATE_INFO MARIA_CREATE_INFO maria_isam_ maria_ MI_INFO MARIA_HA MI_ MARIA_ MARIACHK MARIA_CHK  rt_index.h ma_rt_index.h rtree_ maria_rtree rt_key.h ma_rt_key.h rt_mbr.h ma_rt_mbr.h -- $FILES
+
+replace check_table_is_closed _ma_check_table_is_closed test_if_reopen _ma_test_if_reopen my_n_base_info_read maria_n_base_info_read update_auto_increment _ma_update_auto_increment save_pack_length _ma_save_packlength calc_pack_length _ma_calc_pack_length -- $FILES
+
+replace mi_ ma_ ft_ ma_ft_ rt_ ma_rt_ myisam maria myisamchk maria_chk myisampack maria_pack myisamlog maria_log -- storage/maria/Makefile.am
+
+#
+# Restore wrong replaces
+#
+
+replace maria_sint1korr mi_sint1korr maria_uint1korr mi_uint1korr maria_sint2korr mi_sint2korr maria_sint3korr mi_sint3korr maria_sint4korr mi_sint4korr maria_sint8korr mi_sint8korr maria_uint2korr mi_uint2korr maria_uint3korr mi_uint3korr maria_uint4korr mi_uint4korr maria_uint5korr mi_uint5korr maria_uint6korr mi_uint6korr maria_uint7korr mi_uint7korr maria_uint8korr mi_uint8korr maria_int1store mi_int1store maria_int2store mi_int2store maria_int3store mi_int3store maria_int4store mi_int4store maria_int5store mi_int5store maria_int6store mi_int6store maria_int7store mi_int7store maria_int8store mi_int8store maria_float4store mi_float4store maria_float4get mi_float4get maria_float8store mi_float8store maria_float8get mi_float8get maria_rowstore mi_rowstore maria_rowkorr mi_rowkorr maria_sizestore mi_sizestore maria_sizekorr mi_sizekorr _maria_maria_ _maria MARIA_MAX_POSSIBLE_KEY HA_MAX_POSSIBLE_KEY MARIA_MAX_KEY_BUFF HA_MAX_KEY_BUFF MARIA_MAX_KEY_SEG HA_MAX_KEY_SEG maria_ft_sintXkorr ft_sintXkorr maria_ft_intXstore ft_intXstore maria_ft_boolean_syntax ft_boolean_syntax maria_ft_min_word_len ft_min_word_len maria_ft_max_word_len ft_max_word_len -- $FILES
diff --git a/storage/maria/plug.in b/storage/maria/plug.in
new file mode 100644
index 00000000000..008d82250c8
--- /dev/null
+++ b/storage/maria/plug.in
@@ -0,0 +1,19 @@
+MYSQL_STORAGE_ENGINE(aria,, [Aria Storage Engine],
+        [Crash-safe tables with MyISAM heritage], [default,max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(aria,  [storage/maria])
+MYSQL_PLUGIN_STATIC(aria,     [libaria.a])
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(aria, [ha_maria.cc])
+
+MYSQL_PLUGIN_ACTIONS(aria,  [
+# AC_CONFIG_FILES(storage/maria/unittest/Makefile)
+AC_ARG_WITH(aria-tmp-tables,
+    AC_HELP_STRING([--with-aria-tmp-tables],[Use Aria for internal temporary tables]),
+    [with_aria_tmp_tables=$withval],
+    [with_aria_tmp_tables=yes]
+)
+# The define is emitted only when the option value is yes, which is the default
+if test "$with_aria_tmp_tables" = "yes"
+then
+  AC_DEFINE([USE_MARIA_FOR_TMP_TABLES], [1], [Aria is used for internal temporary tables])
+fi
+])
diff --git a/storage/maria/tablockman.c b/storage/maria/tablockman.c
new file mode 100644
index 00000000000..1bb8889aaa7
--- /dev/null
+++ b/storage/maria/tablockman.c
@@ -0,0 +1,674 @@
+/* QQ: TODO - allocate everything from dynarrays !!! (benchmark) */
+/* QQ: automatically place S instead of LS if possible */
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <my_base.h>
+#include <hash.h>
+#include "tablockman.h"
+
+/*
+  Lock Manager for Table Locks
+
+  The code below handles locks on resources - but it is optimized for a
+  case when a number of resources is not very large, and there are many of
+  locks per resource - that is a resource is likely to be a table or a
+  database, but hardly a row in a table.
+
+  Locks belong to "lock owners". A Lock Owner is uniquely identified by a
+  16-bit number - loid (lock owner identifier). A function loid_to_tlo must
+  be provided by the application that takes such a number as an argument
+  and returns a TABLE_LOCK_OWNER structure.
+
+  Lock levels are completely defined by three tables. Lock compatibility
+  matrix specifies which locks can be held at the same time on a resource.
+  Lock combining matrix specifies what lock level has the same behaviour as
+  a pair of two locks of given levels. getlock_result matrix simplifies
+  intention locking and lock escalation for an application, basically it
+  defines which locks are intention locks and which locks are "loose"
+  locks.  It is only used to provide better diagnostics for the
+  application, lock manager itself does not differentiate between normal,
+  intention, and loose locks.
+
+  The assumptions are: few distinct resources, many locks are held at the
+  same time on one resource. Thus: a lock structure _per resource_ can be
+  rather large; a lock structure _per lock_ does not need to be very small
+  either; we need to optimize for _speed_. Operations we need are: place a
+  lock, check if a particular transaction already has a lock on this
+  resource, check if a conflicting lock exists, if yes - find who owns it.
+
+  Solution: every resource has a structure with
+  1. Hash of latest (see the lock upgrade section below) granted locks with
+     loid as a key. Thus, checking if a given transaction has a lock on
+     this resource is O(1) operation.
+  2. Doubly-linked lists of all granted locks - one list for every lock
+     type. Thus, checking if a conflicting lock exists is a check whether
+     an appropriate list head pointer is not null, also O(1).
+  3. Every lock has a loid of the owner, thus checking who owns a
+     conflicting lock is also O(1).
+  4. Deque of waiting locks. It's a deque (double-ended queue) not a fifo,
+     because for lock upgrades requests are added to the queue head, not
+     tail. This is the single place where it gets O(N) in the number
+     of locks - when a transaction wakes up from waiting on a condition,
+     it may need to scan the queue backward to the beginning to find
+     a conflicting lock. It is guaranteed though that "all transactions
+     before it" received the same - or earlier - signal.  In other words a
+     transaction needs to scan all transactions before it that received the
+     signal but didn't have a chance to resume the execution yet, so
+     practically OS scheduler won't let the scan to be O(N).
+
+  Waiting: if there is a conflicting lock or if wait queue is not empty, a
+  requested lock cannot be granted at once. It is added to the end of the
+  wait queue. If a queue was empty and there is a conflicting lock - the
+  "blocker" transaction is the owner of this lock. If a queue is not empty,
+  an owner of the previous lock in the queue is the "blocker". But if the
+  previous lock is compatible with the request, then the "blocker" is the
+  transaction that the owner of the lock at the end of the queue is waiting
+  for (in other words, our lock is added to the end of the wait queue, and
+  our blocker is the same as of the lock right before us).
+
+  Lock upgrades: when a thread that has a lock on a given resource,
+  requests a new lock on the same resource and the old lock is not enough
+  to satisfy new lock requirements (which is defined by
+  lock_combining_matrix[old_lock][new_lock] != old_lock), a new lock
+  (defined by lock_combining_matrix as above) is placed. Depending on
+  other granted locks it is immediately granted or it has to wait.  Here the
+  lock is added to the start of the waiting queue, not to the end.  The old
+  lock is removed from the hash, but not from the doubly-linked lists.
+  (indeed, a transaction checks "do I have a lock on this resource ?" by
+  looking in a hash, and it should find a latest lock, so old locks must be
+  removed; but a transaction checks "are there conflicting locks ?" by
+  checking doubly-linked lists, it doesn't matter if it will find an old
+  lock - if it would be removed, a new lock would be also a conflict).
+  So, a hash contains only "latest" locks - there can be only one latest
+  lock per resource per transaction. But doubly-linked lists contain all
+  locks, even "obsolete" ones, because it doesn't hurt. Note that old
+  locks can not be freed early, in particular they stay in the
+  'active_locks' list of a lock owner, because they may be "re-enabled"
+  on a savepoint rollback.
+
+  To better support table-row relations where one needs to lock the table
+  with an intention lock before locking the row, extended diagnostics is
+  provided.  When an intention lock (presumably on a table) is granted,
+  tablockman_getlock() returns one of GOT_THE_LOCK (no need to lock the row,
+  perhaps the thread already has a normal lock on this table),
+  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE (need to lock the row, as usual),
+  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE (only need to check
+  whether it's possible to lock the row, but no need to lock it - perhaps
+  the thread has a loose lock on this table). This is defined by
+  getlock_result[] table.
+
+  Instant duration locks are not supported. Though they're trivial to add,
+  they are normally only used on rows, not on tables. So, presumably,
+  they are not needed here.
+
+  Mutexes: there're table mutexes (LOCKED_TABLE::mutex), lock owner mutexes
+  (TABLE_LOCK_OWNER::mutex), and a pool mutex (TABLOCKMAN::pool_mutex).
+  table mutex protects operations on the table lock structures, and lock
+  owner pointers waiting_for and waiting_for_loid.
+  lock owner mutex is only used to wait on lock owner condition
+  (TABLE_LOCK_OWNER::cond), there's no need to protect owner's lock
+  structures, and only lock owner itself may access them.
+  The pool mutex protects a pool of unused locks. Note the locking order:
+  first the table mutex, then the owner mutex or a pool mutex.
+  Table mutex lock cannot be attempted when owner or pool mutex are locked.
+  No mutex lock can be attempted if owner or pool mutex are locked.
+*/
+
+/*
+  Lock compatibility matrix.
+
+  It's asymmetric. Read it as "Somebody has the lock <value in the row
+  label>, can I set the lock <value in the column label> ?"
+
+  Note: though you can take an LS lock while somebody has an S lock, it
+  makes no sense - it's simpler to take an S lock too.
+
+  1  - compatible
+  0  - incompatible
+  -1 - "impossible", so that we can assert the impossibility.
+*/
+static const int lock_compatibility_matrix[10][10]=
+{ /* N    S   X  IS  IX  SIX LS  LX  SLX LSIX          */
+  {  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, /* N    */
+  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* S    */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* X    */
+  {  -1,  1,  0,  1,  1,  1,  1,  1,  1,  1 }, /* IS   */
+  {  -1,  0,  0,  1,  1,  0,  1,  1,  0,  1 }, /* IX   */
+  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }, /* SIX  */
+  {  -1,  1,  0,  1,  0,  0,  1,  0,  0,  0 }, /* LS   */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* LX   */
+  {  -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 }, /* SLX  */
+  {  -1,  0,  0,  1,  0,  0,  1,  0,  0,  0 }  /* LSIX */
+};
+
+/*
+  Lock combining matrix.
+
+  It's symmetric. Read it as "what lock level L is identical to the
+  set of two locks A and B"
+
+  One should never get N from it, we assert the impossibility
+*/
+static const enum lockman_lock_type lock_combining_matrix[10][10]=
+{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
+  {    N,   N,  N,    N,    N,   N,    N,    N,   N,    N}, /* N    */
+  {    N,   S,  X,    S,  SIX, SIX,    S,  SLX, SLX,  SIX}, /* S    */
+  {    N,   X,  X,    X,    X,   X,    X,    X,   X,    X}, /* X    */
+  {    N,   S,  X,   IS,   IX, SIX,   LS,   LX, SLX, LSIX}, /* IS   */
+  {    N, SIX,  X,   IX,   IX, SIX, LSIX,   LX, SLX, LSIX}, /* IX   */
+  {    N, SIX,  X,  SIX,  SIX, SIX,  SIX,  SLX, SLX,  SIX}, /* SIX  */
+  {    N,   S,  X,   LS, LSIX, SIX,   LS,   LX, SLX, LSIX}, /* LS   */
+  {    N, SLX,  X,   LX,   LX, SLX,   LX,   LX, SLX,   LX}, /* LX   */
+  {    N, SLX,  X,  SLX,  SLX, SLX,  SLX,  SLX, SLX,  SLX}, /* SLX  */
+  {    N, SIX,  X, LSIX, LSIX, SIX, LSIX,   LX, SLX, LSIX}  /* LSIX */
+};
+
+/*
+  the return codes for tablockman_getlock
+
+  It's asymmetric. Read it as "I have the lock <value in the row label>,
+  what value should be returned for <value in the column label> ?"
+
+  0 means impossible combination (assert!)
+
+  Defines below help to preserve the table structure.
+  I/L/A values are self explanatory
+  x means the combination is possible (assert should not crash)
+    but it cannot happen in row locks, only in table locks (S,X),
+    or lock escalations (LS,LX)
+*/
+#define I GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+#define L GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+#define A GOT_THE_LOCK
+#define x GOT_THE_LOCK
+static const enum lockman_getlock_result getlock_result[10][10]=
+{/*    N    S   X    IS    IX  SIX    LS    LX   SLX   LSIX         */
+  {    0,   0,  0,    0,    0,   0,    0,    0,   0,    0}, /* N    */
+  {    0,   x,  0,    A,    0,   0,    x,    0,   0,    0}, /* S    */
+  {    0,   x,  x,    A,    A,   0,    x,    x,   0,    0}, /* X    */
+  {    0,   0,  0,    I,    0,   0,    0,    0,   0,    0}, /* IS   */
+  {    0,   0,  0,    I,    I,   0,    0,    0,   0,    0}, /* IX   */
+  {    0,   x,  0,    A,    I,   0,    x,    0,   0,    0}, /* SIX  */
+  {    0,   0,  0,    L,    0,   0,    x,    0,   0,    0}, /* LS   */
+  {    0,   0,  0,    L,    L,   0,    x,    x,   0,    0}, /* LX   */
+  {    0,   x,  0,    A,    L,   0,    x,    x,   0,    0}, /* SLX  */
+  {    0,   0,  0,    L,    I,   0,    x,    0,   0,    0}  /* LSIX */
+};
+#undef I
+#undef L
+#undef A
+#undef x
+
+/*
+  this structure is optimized for a case when there're many locks
+  on the same resource - e.g. a table
+*/
+
+struct st_table_lock {
+  /* list links (owner, queues). QQ: do we need upgraded_from ? */
+  struct st_table_lock *next_in_lo, *upgraded_from, *next, *prev;
+  struct st_locked_table *table;  /* the resource this lock is placed on */
+  uint16 loid;                    /* lock owner identifier */
+  uchar  lock_type;               /* holds an enum lockman_lock_type value */
+};
+
+#define hash_insert my_hash_insert /* for consistency :) */
+
+static inline
+TABLE_LOCK *find_by_loid(LOCKED_TABLE *table, uint16 loid)
+{
+  uchar *key= (uchar *) &loid;      /* latest_locks is searched by loid */
+  return (TABLE_LOCK *) hash_search(&table->latest_locks, key, sizeof(loid));
+}
+
+static inline
+void remove_from_wait_queue(TABLE_LOCK *lock, LOCKED_TABLE *table)
+{
+  /*
+    Unlink 'lock' from the wait queue of 'table'. The queue ends are
+    table->wait_queue_out (reached via prev) and table->wait_queue_in
+    (reached via next); a missing neighbour means 'lock' is that end.
+  */
+  TABLE_LOCK *out_side= lock->prev, *in_side= lock->next;
+
+  DBUG_ASSERT(table == lock->table);
+  DBUG_ASSERT((out_side == 0) == (table->wait_queue_out == lock));
+  DBUG_ASSERT((in_side == 0) == (table->wait_queue_in == lock));
+
+  /* detach from the neighbour towards wait_queue_out, or move that end */
+  if (out_side)
+    out_side->next= in_side;
+  else
+    table->wait_queue_out= in_side;
+  /* likewise for the wait_queue_in side */
+  if (in_side)
+    in_side->prev= out_side;
+  else
+    table->wait_queue_in= out_side;
+}
+
+/*
+  DESCRIPTION
+    tries to lock a resource 'table' with a lock level 'lock' on behalf of
+    lock owner 'lo'; waits on the blocker's condition while it is blocked.
+  RETURN
+    see enum lockman_getlock_result
+*/
+enum lockman_getlock_result
+tablockman_getlock(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo,
+                   LOCKED_TABLE *table, enum lockman_lock_type lock)
+{
+  TABLE_LOCK *old, *new, *blocker, *blocker2;
+  TABLE_LOCK_OWNER *wait_for;
+  struct timespec timeout;
+  enum lockman_lock_type new_lock;
+  enum lockman_getlock_result res;
+  int i;
+
+  DBUG_ASSERT(lo->waiting_lock == 0);
+  DBUG_ASSERT(lo->waiting_for == 0);
+  DBUG_ASSERT(lo->waiting_for_loid == 0);
+
+  pthread_mutex_lock(& table->mutex);
+  /* do we already have a lock on this resource ? */
+  old= find_by_loid(table, lo->loid);
+
+  /* calculate the level of the upgraded lock, if yes */
+  new_lock= old ? lock_combining_matrix[old->lock_type][lock] : lock;
+
+  /* and check if old lock is enough to satisfy the new request */
+  if (old && new_lock == old->lock_type)
+  {
+    /* yes */
+    res= getlock_result[old->lock_type][lock];
+    goto ret;
+  }
+
+  /* no, placing a new lock. first - take a free lock structure from the pool */
+  pthread_mutex_lock(& lm->pool_mutex);
+  new= lm->pool;
+  if (new)
+  {
+    lm->pool= new->next;
+    pthread_mutex_unlock(& lm->pool_mutex);
+  }
+  else
+  {
+    pthread_mutex_unlock(& lm->pool_mutex);
+    new= (TABLE_LOCK *)my_malloc(sizeof(*new), MYF(MY_WME));
+    if (unlikely(!new))
+    {
+      res= NO_MEMORY_FOR_LOCK;
+      goto ret;
+    }
+  }
+
+  new->loid= lo->loid;
+  new->lock_type= new_lock;
+  new->table= table;
+
+  /* and try to place it */
+  for (new->prev= table->wait_queue_in;;)
+  {
+    wait_for= 0;
+    if (!old)
+    {
+      /* not upgrading - a lock must be added to the _end_ of the wait queue */
+      for (blocker= new->prev; blocker && !wait_for; blocker= blocker->prev)
+      {
+        TABLE_LOCK_OWNER *tmp= lm->loid_to_tlo(blocker->loid);
+
+        /* find a blocking lock */
+        DBUG_ASSERT(table->wait_queue_out);
+        DBUG_ASSERT(table->wait_queue_in);
+        if (!lock_compatibility_matrix[blocker->lock_type][lock])
+        {
+          /* found! */
+          wait_for= tmp;
+          break;
+        }
+
+        /*
+          hmm, the lock before doesn't block us, let's look one step further.
+          the condition below means:
+
+            if we never waited on a condition yet
+            OR
+            the lock before ours (blocker) waits on a lock (blocker2) that is
+               present in the hash AND conflicts with 'blocker'
+
+            the condition after OR may fail if 'blocker2' was removed from
+            the hash, its signal woke us up, but 'blocker' itself didn't see
+            the signal yet.
+        */
+        if (!lo->waiting_lock ||
+            ((blocker2= find_by_loid(table, tmp->waiting_for_loid)) &&
+            !lock_compatibility_matrix[blocker2->lock_type]
+                                      [blocker->lock_type]))
+        {
+          /* but it's waiting for a real lock. we'll wait for the same lock */
+          wait_for= tmp->waiting_for;
+          /*
+            We don't really need tmp->waiting_for, as tmp->waiting_for_loid
+            is enough.  waiting_for is just a local cache to avoid calling
+            loid_to_tlo().
+            But it's essential that tmp->waiting_for pointer can ONLY
+            be dereferenced if find_by_loid() above returns a non-null
+            pointer, because a TABLE_LOCK_OWNER object that it points to
+            may've been freed when we come here after a signal.
+            In particular tmp->waiting_for_loid cannot be replaced
+            with tmp->waiting_for->loid.
+          */
+          DBUG_ASSERT(wait_for == lm->loid_to_tlo(tmp->waiting_for_loid));
+          break;
+        }
+
+        /*
+          otherwise - a lock it's waiting for doesn't exist.
+          We've no choice but to scan the wait queue backwards, looking
+          for a conflicting lock or a lock waiting for a real lock.
+          QQ is there a way to avoid this scanning ?
+        */
+      }
+    }
+
+    if (wait_for == 0)
+    {
+      /* checking for compatibility with existing locks */
+      for (blocker= 0, i= 0; i < LOCK_TYPES; i++)
+      {
+        if (table->active_locks[i] && !lock_compatibility_matrix[i+1][lock])
+        {
+          blocker= table->active_locks[i];
+          /* if the first lock in the list is our own - skip it */
+          if (blocker->loid == lo->loid)
+            blocker= blocker->next;
+          if (blocker) /* found a conflicting lock, need to wait */
+            break;
+        }
+      }
+      if (!blocker) /* free to go */
+        break;
+      wait_for= lm->loid_to_tlo(blocker->loid);
+    }
+
+    /* ok, we're here - the wait is inevitable */
+    lo->waiting_for= wait_for;
+    lo->waiting_for_loid= wait_for->loid;
+    if (!lo->waiting_lock) /* first iteration of the for() loop */
+    {
+      /* lock upgrade or new lock request ? */
+      if (old)
+      {
+        /* upgrade - add the lock to the _start_ of the wait queue */
+        new->prev= 0;
+        if ((new->next= table->wait_queue_out))
+          new->next->prev= new;
+        table->wait_queue_out= new;
+        if (!table->wait_queue_in)
+          table->wait_queue_in= table->wait_queue_out;
+      }
+      else
+      {
+        /* new lock - add the lock to the _end_ of the wait queue */
+        new->next= 0;
+        if ((new->prev= table->wait_queue_in))
+          new->prev->next= new;
+        table->wait_queue_in= new;
+        if (!table->wait_queue_out)
+          table->wait_queue_out= table->wait_queue_in;
+      }
+      lo->waiting_lock= new;
+
+      set_timespec_nsec(timeout,lm->lock_timeout * 1000000);
+      /* the deadline is set once; later loop iterations reuse it */
+    }
+
+    /*
+      prepare to wait.
+      we must lock blocker's mutex to wait on blocker's cond.
+      and we must release table's mutex.
+      note that blocker's mutex is locked _before_ table's mutex is released
+    */
+    pthread_mutex_lock(wait_for->mutex);
+    pthread_mutex_unlock(& table->mutex);
+
+    /* now really wait */
+    i= pthread_cond_timedwait(wait_for->cond, wait_for->mutex, & timeout);
+
+    pthread_mutex_unlock(wait_for->mutex);
+
+    if (i == ETIMEDOUT || i == ETIME)
+    {
+      /* we rely on the caller to rollback and release all locks */
+      res= LOCK_TIMEOUT;
+      goto ret2;                     /* table->mutex is not held here */
+    }
+
+    pthread_mutex_lock(& table->mutex);
+
+    /* ... and repeat from the beginning */
+  }
+  /* yeah! we can place the lock now */
+
+  /* remove the lock from the wait queue, if it was there */
+  if (lo->waiting_lock)
+  {
+    remove_from_wait_queue(new, table);
+    lo->waiting_lock= 0;
+    lo->waiting_for= 0;
+    lo->waiting_for_loid= 0;
+  }
+
+  /* add it to the list of all locks of this lock owner */
+  new->next_in_lo= lo->active_locks;
+  lo->active_locks= new;
+
+  /* and to the list of active locks of this lock type */
+  new->prev= 0;
+  if ((new->next= table->active_locks[new_lock-1]))
+    new->next->prev= new;
+  table->active_locks[new_lock-1]= new;
+
+  /* update the latest_locks hash */
+  if (old)
+    hash_delete(& table->latest_locks, (uchar *)old);
+  hash_insert(& table->latest_locks, (uchar *)new);
+
+  new->upgraded_from= old;
+
+  res= getlock_result[lock][lock];
+
+ret:
+  pthread_mutex_unlock(& table->mutex);
+ret2:
+  DBUG_ASSERT(res);
+  return res;
+}
+
+/*
+  DESCRIPTION
+    release all locks of a lock owner: its waiting lock (if any) and all
+    its granted locks. signal waiters to continue
+*/
+void tablockman_release_locks(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+  TABLE_LOCK *lock, *local_pool= 0, *local_pool_end;
+
+  /*
+    released locks are pushed onto a local list that joins the pool in one
+    short action (under a mutex); the first lock pushed becomes its tail
+  */
+  local_pool_end= lo->waiting_lock ? lo->waiting_lock : lo->active_locks;
+  if (!local_pool_end)
+    return; /* nothing granted and nothing waiting: no locks to release */
+
+  /* release a waiting lock, if any */
+  if ((lock= lo->waiting_lock))
+  {
+    DBUG_ASSERT(lock->loid == lo->loid);
+    pthread_mutex_lock(& lock->table->mutex);
+    remove_from_wait_queue(lock, lock->table);
+
+    /*
+      a special case: if this lock was not the last in the wait queue
+      and it's compatible with the next lock, then the next lock
+      is waiting for our blocker though really it waits for us, indirectly.
+      Signal our blocker to release this next lock (after we removed our
+      lock from the wait queue, of course).
+    */
+    /*
+      An example to clarify the above:
+        trn1> S-lock the table. Granted.
+        trn2> IX-lock the table. Added to the wait queue. trn2 waits on trn1
+        trn3> IS-lock the table.  The queue is not empty, so IS-lock is added
+              to the queue. It's compatible with the waiting IX-lock, so trn3
+              waits for trn2->waiting_for, that is trn1.
+      if trn1 releases the lock it signals trn1->cond and both waiting
+      transactions are awakened. But if trn2 times out, trn3 must be notified
+      too (as IS and S locks are compatible). So trn2 must signal trn1->cond.
+    */
+    if (lock->next &&
+        lock_compatibility_matrix[lock->next->lock_type][lock->lock_type])
+    {
+      pthread_mutex_lock(lo->waiting_for->mutex);
+      pthread_cond_broadcast(lo->waiting_for->cond);
+      pthread_mutex_unlock(lo->waiting_for->mutex);
+    }
+    lo->waiting_for= 0;
+    lo->waiting_for_loid= 0;
+    pthread_mutex_unlock(& lock->table->mutex);
+
+    lock->next= local_pool; /* local_pool is still 0 here: this is the tail */
+    local_pool= lock;
+  }
+
+  /* now release granted locks */
+  lock= lo->active_locks;
+  while (lock)
+  {
+    TABLE_LOCK *cur= lock;
+    pthread_mutex_t *mutex= & lock->table->mutex;
+    DBUG_ASSERT(cur->loid == lo->loid);
+
+    DBUG_ASSERT(lock != lock->next_in_lo);
+    lock= lock->next_in_lo;
+
+    /* TODO ? group locks by table to reduce the number of mutex locks */
+    pthread_mutex_lock(mutex);
+    hash_delete(& cur->table->latest_locks, (uchar *)cur);
+
+    /* unlink cur from the table's doubly-linked list of its lock type */
+    if (cur->prev)
+      cur->prev->next= cur->next;
+    if (cur->next)
+      cur->next->prev= cur->prev;
+    if (cur->table->active_locks[cur->lock_type-1] == cur)
+      cur->table->active_locks[cur->lock_type-1]= cur->next;
+
+    cur->next= local_pool; /* from now on 'next' links the local free list */
+    local_pool= cur;
+
+    pthread_mutex_unlock(mutex);
+  }
+
+  lo->waiting_lock= lo->active_locks= 0;
+
+  /*
+    okay, all locks released. now signal that we're leaving,
+    in case somebody's waiting for it
+  */
+  pthread_mutex_lock(lo->mutex);
+  pthread_cond_broadcast(lo->cond);
+  pthread_mutex_unlock(lo->mutex);
+
+  /* and push all freed locks to the lockman's pool */
+  pthread_mutex_lock(& lm->pool_mutex);
+  local_pool_end->next= lm->pool;
+  lm->pool= local_pool;
+  pthread_mutex_unlock(& lm->pool_mutex);
+}
+
+void tablockman_init(TABLOCKMAN *lm, loid_to_tlo_func *func, uint timeout)
+{
+  my_getsystime(); /* ensure that my_getsystime() is initialized */
+  pthread_mutex_init(& lm->pool_mutex, MY_MUTEX_INIT_FAST);
+  lm->lock_timeout= timeout;  /* in milliseconds */
+  lm->loid_to_tlo= func;      /* maps a loid to its TABLE_LOCK_OWNER */
+  lm->pool= 0;                /* the pool of free locks starts empty */
+}
+
+void tablockman_destroy(TABLOCKMAN *lm)
+{
+  TABLE_LOCK *cur= lm->pool, *following;
+  for (lm->pool= 0; cur; cur= following) /* free every pooled lock */
+  {
+    following= cur->next;                /* read the link before freeing */
+    my_free((void *)cur, MYF(0));
+  }
+  pthread_mutex_destroy(& lm->pool_mutex);
+}
+
+/*
+  initialize a LOCKED_TABLE structure
+
+  SYNOPSIS
+    lt                          a LOCKED_TABLE to initialize
+    initial_hash_size           initial size for 'latest_locks' hash
+*/
+void tablockman_init_locked_table(LOCKED_TABLE *lt, int initial_hash_size)
+{
+  bzero(lt, sizeof(*lt)); /* list heads and queue ends all start as 0 */
+  hash_init(& lt->latest_locks, & my_charset_bin, initial_hash_size,
+            offsetof(TABLE_LOCK, loid), /* the key is the loid member */
+            sizeof(((TABLE_LOCK*)0)->loid), 0, 0, 0);
+  pthread_mutex_init(& lt->mutex, MY_MUTEX_INIT_FAST);
+}
+
+void tablockman_destroy_locked_table(LOCKED_TABLE *lt)
+{
+  int type;
+  /* the table must be idle: empty wait queue, no granted locks, empty hash */
+  DBUG_ASSERT(lt->wait_queue_out == 0);
+  DBUG_ASSERT(lt->wait_queue_in == 0);
+  DBUG_ASSERT(lt->latest_locks.records == 0);
+  for (type= 0; type < LOCK_TYPES; type++)
+     DBUG_ASSERT(lt->active_locks[type] == 0);
+
+  pthread_mutex_destroy(& lt->mutex);
+  hash_free(& lt->latest_locks);
+}
+
+#ifdef EXTRA_DEBUG
+/* printable names of the lock types, in the order of lockman_lock_type */
+static const char *lock2str[LOCK_TYPES+1]= {"N", "S", "X", "IS", "IX", "SIX",
+  "LS", "LX", "SLX", "LSIX"};
+/* print an owner: "(..)" is its waiting lock, "!" flags a self-linked node */
+void tablockman_print_tlo(TABLE_LOCK_OWNER *lo)
+{
+  TABLE_LOCK *cur;
+
+  printf("lo%d>", lo->loid);
+  if ((cur= lo->waiting_lock) != 0)
+    printf(" (%s.0x%lx)", lock2str[cur->lock_type], (ulong)cur->table);
+  for (cur= lo->active_locks; cur && cur != cur->next_in_lo;
+       cur= cur->next_in_lo)
+    printf(" %s.0x%lx", lock2str[cur->lock_type], (ulong)cur->table);
+  if (cur) /* the loop stopped early: cur->next_in_lo points back to cur */
+    printf("!");
+  printf("\n");
+}
+#endif
+
diff --git a/storage/maria/tablockman.h b/storage/maria/tablockman.h
new file mode 100644
index 00000000000..e33d1aa44e8
--- /dev/null
+++ b/storage/maria/tablockman.h
@@ -0,0 +1,87 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _tablockman_h
+#define _tablockman_h
+
+/*
+  Lock levels:
+  ^^^^^^^^^^^
+
+  N    - "no lock", not a lock, used sometimes internally to simplify the code
+  S    - Shared
+  X    - eXclusive
+  IS   - Intention Shared
+  IX   - Intention eXclusive
+  SIX  - Shared + Intention eXclusive
+  LS   - Loose Shared
+  LX   - Loose eXclusive
+  SLX  - Shared + Loose eXclusive
+  LSIX - Loose Shared + Intention eXclusive
+*/
+#ifndef _lockman_h /* presumably lockman.h declares the same enums - verify */
+/* QQ: TODO remove N-locks */
+enum lockman_lock_type { N, S, X, IS, IX, SIX, LS, LX, SLX, LSIX, LOCK_TYPE_LAST };
+enum lockman_getlock_result {
+  NO_MEMORY_FOR_LOCK=1, DEADLOCK, LOCK_TIMEOUT, /* failures; no result is 0 */
+  GOT_THE_LOCK,
+  GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE,
+  GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+};
+#endif
+
+#define LOCK_TYPES (LOCK_TYPE_LAST-1) /* number of lock types, N not counted */
+
+typedef struct st_table_lock TABLE_LOCK;
+
+typedef struct st_table_lock_owner {
+  TABLE_LOCK *active_locks;          /* granted locks, linked by next_in_lo */
+  TABLE_LOCK *waiting_lock;                  /* waiting lock (one lock only) */
+  struct st_table_lock_owner *waiting_for; /* transaction we're waiting for  */
+  pthread_cond_t  *cond;      /* transactions waiting for us, wait on 'cond' */
+  pthread_mutex_t *mutex;                 /* mutex is required to use 'cond' */
+  uint16    loid, waiting_for_loid;   /* Lock Owner IDs: ours, our blocker's */
+} TABLE_LOCK_OWNER;
+
+typedef struct st_locked_table {
+  pthread_mutex_t mutex;                        /* mutex for everything below */
+  HASH latest_locks;                   /* latest lock per owner, keyed by loid */
+  TABLE_LOCK *active_locks[LOCK_TYPES];  /* dl-lists of locks, [lock_type-1] */
+  TABLE_LOCK *wait_queue_in, *wait_queue_out; /* wait deque: in=end, out=start*/
+} LOCKED_TABLE;
+
+typedef TABLE_LOCK_OWNER *loid_to_tlo_func(uint16);
+
+typedef struct {
+  pthread_mutex_t pool_mutex;               /* locked while 'pool' is updated */
+  TABLE_LOCK *pool;                                /* lifo pool of free locks */
+  uint lock_timeout;                          /* lock timeout in milliseconds */
+  loid_to_tlo_func *loid_to_tlo;      /* for mapping loid to TABLE_LOCK_OWNER */
+} TABLOCKMAN;
+
+void tablockman_init(TABLOCKMAN *, loid_to_tlo_func *, uint);
+void tablockman_destroy(TABLOCKMAN *);
+enum lockman_getlock_result tablockman_getlock(TABLOCKMAN *, TABLE_LOCK_OWNER *,
+                                               LOCKED_TABLE *, enum lockman_lock_type);
+void tablockman_release_locks(TABLOCKMAN *, TABLE_LOCK_OWNER *);
+void tablockman_init_locked_table(LOCKED_TABLE *, int);
+void tablockman_destroy_locked_table(LOCKED_TABLE *);
+
+#ifdef EXTRA_DEBUG
+void tablockman_print_tlo(TABLE_LOCK_OWNER *);
+#endif
+
+#endif
+
diff --git a/storage/maria/test_pack b/storage/maria/test_pack
new file mode 100755
index 00000000000..689645b1661
--- /dev/null
+++ b/storage/maria/test_pack
@@ -0,0 +1,10 @@
+silent="-s"  # NOTE(review): never referenced below; the commands hard-code -s
+suffix=""    # appended to the name of every tool invoked below
+
+ma_test1$suffix -s ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -S ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1 ;maria_chk$suffix -us test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -b ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -rqs test1 ; maria_chk$suffix -es test1
+ma_test1$suffix -s -w ; maria_pack$suffix --force -s test1 ; maria_chk$suffix -es test1 ; maria_chk$suffix -ros test1 ; maria_chk$suffix -es test1
+
+ma_test2$suffix -s -t4 ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
+ma_test2$suffix -s -t4 -b ; maria_pack$suffix --force -s test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -ros test2 ; maria_chk$suffix -es test2 ; maria_chk$suffix -s -u test2 ; maria_chk$suffix -sm test2
diff --git a/storage/maria/trnman.c b/storage/maria/trnman.c
new file mode 100644
index 00000000000..05330baed76
--- /dev/null
+++ b/storage/maria/trnman.c
@@ -0,0 +1,979 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "trnman.h"
+#include "ma_checkpoint.h"
+#include "ma_control_file.h"
+
+/*
+  status variables:
+  how many trns in the active list currently,
+  in the committed list currently, allocated since startup.
+*/
+uint trnman_active_transactions, trnman_committed_transactions,
+  trnman_allocated_transactions;
+
+/* list of active transactions in the trid order */
+static TRN active_list_min, active_list_max;
+/* list of committed transactions in the trid order */
+static TRN committed_list_min, committed_list_max;
+
+/* a counter, used to generate transaction ids */
+static TrID global_trid_generator;
+
+/*
+  The minimum existing transaction id for trnman_get_min_trid().
+  The default value is used when the transaction manager is not initialized
+  (probably when called from maria_chk).
+*/
+static TrID trid_min_read_from= MAX_TRID;
+
+/* the mutex for everything above */
+static pthread_mutex_t LOCK_trn_list;
+
+/* LIFO pool of unused TRN structures for reuse */
+static TRN *pool;
+
+/* a hash for committed transactions that maps trid to a TRN structure */
+static LF_HASH trid_to_trn;
+
+/* an array that maps short_id of an active transaction to a TRN structure */
+static TRN **short_trid_to_active_trn;
+
+/* locks for short_trid_to_active_trn and pool */
+static my_atomic_rwlock_t LOCK_short_trid_to_trn, LOCK_pool;
+static my_bool default_trnman_end_trans_hook(TRN *, my_bool, my_bool);
+static void trnman_free_trn(TRN *);
+
+my_bool (*trnman_end_trans_hook)(TRN *, my_bool, my_bool)=
+  default_trnman_end_trans_hook;
+
+/*
+  Simple interface functions
+  QQ: if they stay so simple, should we make them inline?
+*/
+
+uint trnman_increment_locked_tables(TRN *trn)
+{
+  return trn->locked_tables++; /* post-increment: yields the previous count */
+}
+
+uint trnman_has_locked_tables(TRN *trn)
+{
+  return trn->locked_tables; /* the counter itself, not a 0/1 flag */
+}
+
+uint trnman_decrement_locked_tables(TRN *trn)
+{
+  return --trn->locked_tables; /* pre-decrement: yields the new count */
+}
+
+void trnman_reset_locked_tables(TRN *trn, uint locked_tables)
+{
+  trn->locked_tables= locked_tables; /* overwrites the counter as given */
+}
+
+#ifdef EXTRA_DEBUG /* accessors for trn->flags, built only with EXTRA_DEBUG */
+uint16 trnman_get_flags(TRN *trn)
+{
+  return trn->flags;
+}
+
+void trnman_set_flags(TRN *trn, uint16 flags)
+{
+  trn->flags= flags; /* replaces all flags at once, no masking */
+}
+#endif /* EXTRA_DEBUG */
+
+/** Wake up threads waiting for this transaction (no-op if trn->wt is 0) */
+static void wt_thd_release_self(TRN *trn)
+{
+  WT_RESOURCE_ID self;
+
+  if (!trn->wt)
+    return;
+  self.value= (intptr)trn;
+  self.type= &ma_rc_dup_unique;
+  wt_thd_release(trn->wt, & self);
+  trn->wt= 0;
+}
+
+static my_bool
+default_trnman_end_trans_hook(TRN *trn __attribute__ ((unused)),
+                              my_bool commit __attribute__ ((unused)),
+                              my_bool active_transactions
+                              __attribute__ ((unused)))
+{
+  return 0; /* trnman_end_trn() treats a non-zero hook result as an error */
+}
+
+
+static uchar *trn_get_hash_key(const uchar *trn, size_t *len,
+                              my_bool unused __attribute__ ((unused)))
+{
+  TRN *element= *(TRN **)trn; /* the hash stores pointers to TRN */
+  *len= sizeof(TrID);
+  return (uchar *) &element->trid;
+}
+
+/**
+   @brief Initializes transaction manager.
+
+   @param  initial_trid        Generated TrIDs will start from initial_trid+1.
+
+   @return Operation status
+     @retval 0      OK
+     @retval !=0    Error (short_trid_to_active_trn could not be allocated)
+*/
+
+int trnman_init(TrID initial_trid)
+{
+  DBUG_ENTER("trnman_init");
+  DBUG_PRINT("enter", ("initial_trid: %lu", (ulong) initial_trid));
+
+  short_trid_to_active_trn= (TRN **)my_malloc(SHORT_TRID_MAX*sizeof(TRN*),
+                                     MYF(MY_WME|MY_ZEROFILL));
+  if (unlikely(!short_trid_to_active_trn))
+    DBUG_RETURN(1);
+  short_trid_to_active_trn--; /* min short_id is 1 */
+
+  /*
+    Initialize lists.
+    active_list_max.min_read_from must be larger than any trid,
+    so that when an active list is empty we could free
+    all committed list.
+    And  committed_list_max itself can not be freed so
+    committed_list_max.commit_trid must not be smaller than
+    active_list_max.min_read_from
+  */
+
+  active_list_max.trid= active_list_min.trid= 0;
+  active_list_max.min_read_from= MAX_TRID;
+  active_list_max.next= active_list_min.prev= 0;
+  active_list_max.prev= &active_list_min;
+  active_list_min.next= &active_list_max;
+
+  committed_list_max.commit_trid= MAX_TRID;
+  committed_list_max.next= committed_list_min.prev= 0;
+  committed_list_max.prev= &committed_list_min;
+  committed_list_min.next= &committed_list_max;
+
+  trnman_active_transactions= 0;
+  trnman_committed_transactions= 0;
+  trnman_allocated_transactions= 0;
+  /* This is needed for recovery and repair */
+  dummy_transaction_object.min_read_from= ~(TrID) 0;
+
+  pool= 0;
+  global_trid_generator= initial_trid; /* new_trid() pre-increments this */
+  trid_min_read_from= initial_trid;
+  lf_hash_init(&trid_to_trn, sizeof(TRN*), LF_HASH_UNIQUE,
+               0, 0, trn_get_hash_key, 0);
+  DBUG_PRINT("info", ("pthread_mutex_init LOCK_trn_list"));
+  pthread_mutex_init(&LOCK_trn_list, MY_MUTEX_INIT_FAST);
+  my_atomic_rwlock_init(&LOCK_short_trid_to_trn);
+  my_atomic_rwlock_init(&LOCK_pool);
+
+  DBUG_RETURN(0);
+}
+
+/*
+  NOTE
+    may only be called in the "idle" state, when no transaction is
+    running (see asserts below). Calling it again afterwards is a no-op.
+*/
+void trnman_destroy()
+{
+  TRN *trn, *next_free;
+  DBUG_ENTER("trnman_destroy");
+
+  if (!short_trid_to_active_trn) /* trnman already destroyed */
+    DBUG_VOID_RETURN;
+  DBUG_ASSERT(trid_to_trn.count == 0);
+  DBUG_ASSERT(trnman_active_transactions == 0);
+  DBUG_ASSERT(trnman_committed_transactions == 0);
+  DBUG_ASSERT(active_list_max.prev == &active_list_min);
+  DBUG_ASSERT(active_list_min.next == &active_list_max);
+  DBUG_ASSERT(committed_list_max.prev == &committed_list_min);
+  DBUG_ASSERT(committed_list_min.next == &committed_list_max);
+  for (trn= pool; trn; trn= next_free) /* really free all pooled TRNs */
+  {
+    next_free= trn->next;
+    DBUG_ASSERT(trn->wt == NULL);
+    pthread_mutex_destroy(&trn->state_lock);
+    my_free((void *)trn, MYF(0));
+  }
+  pool= 0;
+  lf_hash_destroy(&trid_to_trn);
+  DBUG_PRINT("info", ("pthread_mutex_destroy LOCK_trn_list"));
+  pthread_mutex_destroy(&LOCK_trn_list);
+  my_atomic_rwlock_destroy(&LOCK_short_trid_to_trn);
+  my_atomic_rwlock_destroy(&LOCK_pool);
+  my_free((void *)(short_trid_to_active_trn+1), MYF(0));
+  short_trid_to_active_trn= NULL;
+  DBUG_VOID_RETURN;
+}
+
+/*
+  NOTE
+    Returns the next transaction id; LOCK_trn_list must be held.
+    TrID is limited to 6 bytes. Initial value of the generator
+    is set by the recovery code (last checkpoint, or 1 on a first run).
+*/
+static TrID new_trid()
+{
+  DBUG_ENTER("new_trid");
+  DBUG_ASSERT(global_trid_generator < 0xffffffffffffLL); /* 6-byte limit */
+  DBUG_PRINT("info", ("safe_mutex_assert_owner LOCK_trn_list"));
+  safe_mutex_assert_owner(&LOCK_trn_list);
+  DBUG_RETURN(++global_trid_generator);
+}
+
+static uint get_short_trid(TRN *trn)
+{
+  int i= (int) ((global_trid_generator + (intptr)trn) * 312089 %
+                SHORT_TRID_MAX) + 1; /* pseudo-random slot to start from */
+  uint res=0;
+
+  for ( ; !res ; i= 1) /* no free slot up to the end: rescan from slot 1 */
+  {
+    my_atomic_rwlock_wrlock(&LOCK_short_trid_to_trn);
+    for ( ; i <= SHORT_TRID_MAX; i++) /* the range is [1..SHORT_TRID_MAX] */
+    {
+      void *tmp= NULL;
+      if (short_trid_to_active_trn[i] == NULL &&
+          my_atomic_casptr((void **)&short_trid_to_active_trn[i], &tmp, trn))
+      {
+        res= i; /* the slot is claimed: it now points to trn */
+        break;
+      }
+    }
+    my_atomic_rwlock_wrunlock(&LOCK_short_trid_to_trn);
+  }
+  return res; /* never 0: the loops above only end with a claimed slot */
+}
+
+/**
+  Allocates and initializes a new TRN object
+
+  @return the new TRN, or 0 on failure (no memory, no pins, hash insert)
+  @note 'wt' may be 0 only in single-threaded code (or where threads cannot
+  block each other), or the first deadlock detector call will sigsegv.
+*/
+
+TRN *trnman_new_trn(WT_THD *wt)
+{
+  int res;
+  TRN *trn;
+  union { TRN *trn; void *v; } tmp;
+  DBUG_ENTER("trnman_new_trn");
+
+  /*
+    we have a mutex, to do simple things under it - allocate a TRN,
+    increment trnman_active_transactions, set trn->min_read_from.
+
+    Note that all the above is fast. generating short_id may be slow,
+    as it involves scanning a large array - so it's done outside of the
+    mutex.
+  */
+
+  DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+  pthread_mutex_lock(&LOCK_trn_list);
+
+  /* Allocating a new TRN structure */
+  tmp.trn= pool;
+  /*
+    Popping an unused TRN from the pool
+    (ABA isn't possible, we're behind a mutex)
+  */
+  my_atomic_rwlock_wrlock(&LOCK_pool);
+  while (tmp.trn && !my_atomic_casptr((void **)(char*) &pool, &tmp.v,
+                                      (void *)tmp.trn->next))
+    /* no-op */;
+  my_atomic_rwlock_wrunlock(&LOCK_pool);
+
+  /* Nothing in the pool ? Allocate a new one */
+  if (!(trn= tmp.trn))
+  {
+    /*
+      trn should be completely initialized at create time to allow
+      one to keep a known state on it.
+      (Like redo_lns, which is assumed to be 0 at start of row handling
+      and reset to zero before end of row handling)
+    */
+    trn= (TRN *)my_malloc(sizeof(TRN), MYF(MY_WME | MY_ZEROFILL));
+    if (unlikely(!trn))
+    {
+      DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+      pthread_mutex_unlock(&LOCK_trn_list);
+      DBUG_RETURN(0); /* every exit after DBUG_ENTER must use DBUG_RETURN */
+    }
+    trnman_allocated_transactions++;
+    pthread_mutex_init(&trn->state_lock, MY_MUTEX_INIT_FAST);
+  }
+  trn->pins= lf_hash_get_pins(&trid_to_trn);
+  if (!trn->pins)
+  {
+    trnman_free_trn(trn);
+    pthread_mutex_unlock(&LOCK_trn_list);
+    DBUG_RETURN(0);
+  }
+  trn->wt= wt; /* not earlier: a TRN pooled above must keep wt == 0 */
+
+  trnman_active_transactions++;
+
+  trn->min_read_from= active_list_min.next->trid;
+
+  trn->trid= new_trid();
+
+  trn->next= &active_list_max;
+  trn->prev= active_list_max.prev;
+  active_list_max.prev= trn->prev->next= trn;
+  trid_min_read_from= active_list_min.next->min_read_from;
+  DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+  pthread_mutex_unlock(&LOCK_trn_list);
+
+  if (unlikely(!trn->min_read_from))
+  {
+    /*
+      We are the only transaction. Set min_read_from so that we can read
+      our own rows
+    */
+    trn->min_read_from= trn->trid + 1;
+  }
+
+  /* no other transaction can read changes done by this one */
+  trn->commit_trid=  MAX_TRID;
+  trn->rec_lsn= trn->undo_lsn= trn->first_undo_lsn= 0;
+  trn->used_tables= 0;
+
+  trn->locked_tables= 0;
+  trn->flags= 0;
+
+  /*
+    only after the following function TRN is considered initialized,
+    so it must be done the last
+  */
+  pthread_mutex_lock(&trn->state_lock);
+  trn->short_id= get_short_trid(trn);
+  pthread_mutex_unlock(&trn->state_lock);
+
+  res= lf_hash_insert(&trid_to_trn, trn->pins, &trn);
+  DBUG_ASSERT(res <= 0);
+  if (res)
+  {
+    trnman_end_trn(trn, 0); /* undoes the list insertion, releases wt/pins */
+    DBUG_RETURN(0);
+  }
+
+  DBUG_PRINT("exit", ("trn: 0x%lx  trid: 0x%lu",
+                      (ulong) trn, (ulong) trn->trid));
+
+  DBUG_RETURN(trn);
+}
+
+/*
+  remove a trn from the active list.
+  if necessary - move to committed list and set commit_trid
+
+  NOTE
+    Locks are released at the end. In particular, after placing the
+    transaction in commit list, and after setting commit_trid. It's
+    important, as commit_trid affects visibility.  Locks don't affect
+    anything they simply delay execution of other threads - they could be
+    released arbitrarily late. In other words, when locks are released it
+    serves as a start banner for other threads, they start to run. So
+    everything they may need must be ready at that point.
+
+  RETURN
+    0  ok
+    1  error (the trnman_end_trans_hook call reported a failure)
+*/
+my_bool trnman_end_trn(TRN *trn, my_bool commit)
+{
+  int res= 1; /* set to -1 below if the end_trans hook fails */
+  uint16 cached_short_id= trn->short_id; /* we have to cache it, see below */
+  TRN *free_me= 0;
+  LF_PINS *pins= trn->pins;
+  DBUG_ENTER("trnman_end_trn");
+  DBUG_PRINT("enter", ("trn=0x%lx commit=%d", (ulong) trn, commit));
+
+  /* if a rollback, all UNDO records should have been executed */
+  DBUG_ASSERT(commit || trn->undo_lsn == 0);
+  DBUG_ASSERT(trn != &dummy_transaction_object);
+  DBUG_PRINT("info", ("pthread_mutex_lock LOCK_trn_list"));
+
+  pthread_mutex_lock(&LOCK_trn_list);
+
+  /* remove from active list */
+  trn->next->prev= trn->prev;
+  trn->prev->next= trn->next;
+
+  /*
+    if trn was the oldest active transaction, now that it goes away there
+    may be committed transactions in the list which no active transaction
+    needs to bother about - clean up the committed list
+  */
+  if (trn->prev == &active_list_min)
+  {
+    uint free_me_count;
+    TRN *t;
+    for (t= committed_list_min.next, free_me_count= 0;
+         t->commit_trid < active_list_min.next->min_read_from;
+         t= t->next, free_me_count++) /* no-op */;
+
+    DBUG_ASSERT((t != committed_list_min.next && free_me_count > 0) ||
+                (t == committed_list_min.next && free_me_count == 0));
+    /* found transactions committed before the oldest active one */
+    if (t != committed_list_min.next)
+    {
+      free_me= committed_list_min.next;
+      committed_list_min.next= t;
+      t->prev->next= 0; /* terminate the detached free_me chain */
+      t->prev= &committed_list_min;
+      trnman_committed_transactions-= free_me_count;
+    }
+  }
+
+  pthread_mutex_lock(&trn->state_lock);
+  if (commit)
+    trn->commit_trid= global_trid_generator;
+  wt_thd_release_self(trn);
+  pthread_mutex_unlock(&trn->state_lock);
+
+  /*
+    if transaction is committed and it was not the only active transaction -
+    add it to the committed list
+  */
+  if (commit && active_list_min.next != &active_list_max)
+  {
+    trn->next= &committed_list_max;
+    trn->prev= committed_list_max.prev;
+    trnman_committed_transactions++;
+    committed_list_max.prev= trn->prev->next= trn;
+  }
+  else
+  {
+    trn->next= free_me; /* rollback or last active trn: free it right away */
+    free_me= trn;
+  }
+  trid_min_read_from= active_list_min.next->min_read_from;
+
+  if ((*trnman_end_trans_hook)(trn, commit,
+                               active_list_min.next != &active_list_max))
+    res= -1;
+  trnman_active_transactions--;
+
+  DBUG_PRINT("info", ("pthread_mutex_unlock LOCK_trn_list"));
+  pthread_mutex_unlock(&LOCK_trn_list);
+
+  /*
+    the rest is done outside of a critical section
+
+    note that we don't own trn anymore, it may be in a shared list now.
+    Thus, we cannot dereference it, and must use cached_short_id below.
+  */
+  my_atomic_rwlock_rdlock(&LOCK_short_trid_to_trn);
+  my_atomic_storeptr((void **)&short_trid_to_active_trn[cached_short_id], 0);
+  my_atomic_rwlock_rdunlock(&LOCK_short_trid_to_trn);
+
+  /*
+    we, under the mutex, removed going-in-free_me transactions from the
+    active and committed lists, thus nobody else may see them when it scans
+    those lists, and thus nobody may want to free them. Now we don't
+    need a mutex to access free_me list
+  */
+  /* QQ: send them to the purge thread */
+  while (free_me)
+  {
+    TRN *t= free_me;
+    free_me= free_me->next; /* advance first: trnman_free_trn() reuses next */
+
+    /* ignore OOM. it's harmless, and we can do nothing here anyway */
+    (void)lf_hash_delete(&trid_to_trn, pins, &t->trid, sizeof(TrID));
+
+    trnman_free_trn(t);
+  }
+
+  lf_hash_put_pins(pins);
+
+  DBUG_RETURN(res < 0);
+}
+
+/*
+  free a trn (add to the pool, that is)
+  note - we can never really free() a TRN if there's at least one other
+  running transaction - see, e.g., how lock waits are implemented in
+  lockman.c
+  The same is true for other lock-free data structures too. We may need some
+  kind of FLUSH command to reset them all - ensuring that no transactions are
+  running. It may even be called automatically on checkpoints if no
+  transactions are running.
+*/
+static void trnman_free_trn(TRN *trn)
+{
+  /*
+     union is to solve strict aliasing issue.
+     without it gcc 3.4.3 doesn't notice that updating *(void **)&tmp
+     modifies the value of tmp.
+  */
+  union { TRN *trn; void *v; } tmp;
+
+  pthread_mutex_lock(&trn->state_lock);
+  trn->short_id= 0; /* trnman_trid_to_trn() treats short_id 0 as a ghost */
+  pthread_mutex_unlock(&trn->state_lock);
+
+  tmp.trn= pool; /* first guess of the pool head, read before LOCK_pool */
+
+  my_atomic_rwlock_wrlock(&LOCK_pool);
+  do
+  {
+    /*
+      without this volatile cast gcc-3.4.4 moves the assignment
+      down after the loop at -O2
+    */
+    *(TRN * volatile *)&(trn->next)= tmp.trn;
+  } while (!my_atomic_casptr((void **)(char*)&pool, &tmp.v, trn));
+  my_atomic_rwlock_wrunlock(&LOCK_pool);
+}
+
+/*
+  NOTE
+    here we access the hash in a lock-free manner.
+    It's safe, a 'found' TRN can never be freed/reused before we access it.
+    In fact, it cannot be freed before 'trn' ends, because a 'found' TRN
+    can only be removed from the hash when:
+                found->commit_trid < ALL (trn->min_read_from)
+    that is, at least
+                found->commit_trid < trn->min_read_from
+    but
+                found->trid >= trn->min_read_from
+    and
+                found->commit_trid > found->trid
+
+  RETURN
+    1   can
+    0   cannot
+   -1   error (OOM)
+*/
+int trnman_can_read_from(TRN *trn, TrID trid)
+{
+  TRN **entry;
+  my_bool visible;
+  LF_REQUIRE_PINS(3);
+
+  /* older than trn->min_read_from: visible by all transactions */
+  if (trid < trn->min_read_from)
+    return 1;
+
+  /* the row is from the current transaction itself */
+  if (trid == trn->trid)
+    return 1;
+  /* the row is from a newer transaction: not visible */
+  if (trid > trn->trid)
+    return 0;
+
+  /*
+    trid lies between min_read_from and our own trid: the row is readable
+    only if that transaction's commit_trid is below our trid
+  */
+  entry= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid));
+  if (entry == MY_ERRPTR)
+    return -1; /* error (OOM) */
+  if (entry == NULL)
+    return 0; /* not in the hash of transactions = cannot read */
+
+  visible= (*entry)->commit_trid < trn->trid;
+  lf_hash_search_unpin(trn->pins);
+  return visible;
+}
+
+/**
+  Finds a TRN by its TrID
+
+  @param trn    current trn. Needed for pinning pointers (see lf_pin)
+  @param trid   trid to search for
+
+  @return found trn, or 0 (trid too old, not in the hash, or a pooled TRN)
+
+  @note that trn is returned with its state locked!
+*/
+TRN *trnman_trid_to_trn(TRN *trn, TrID trid)
+{
+  TRN **found, *hit;
+  LF_REQUIRE_PINS(3);
+
+  if (trid < trn->min_read_from)
+    return 0; /* it's committed eons ago */
+
+  found= lf_hash_search(&trid_to_trn, trn->pins, &trid, sizeof(trid));
+  if (found == NULL || found == MY_ERRPTR)
+    return 0; /* no luck */
+
+  /* copy the TRN pointer now: 'found' must not be read after the unpin */
+  hit= *found;
+  pthread_mutex_lock(&hit->state_lock);
+
+  if (hit->short_id == 0)
+  {
+    pthread_mutex_unlock(&hit->state_lock);
+    lf_hash_search_unpin(trn->pins);
+    return 0; /* but it was a ghost */
+  }
+  lf_hash_search_unpin(trn->pins);
+
+  return hit; /* Gotcha! (state_lock is still held for the caller) */
+}
+
+/* TODO: the stubs below are waiting for savepoints to be implemented */
+
+void trnman_new_statement(TRN *trn __attribute__ ((unused)))
+{ /* intentionally empty: a stub until savepoints are implemented */
+}
+
+void trnman_rollback_statement(TRN *trn __attribute__ ((unused)))
+{ /* intentionally empty: a stub until savepoints are implemented */
+}
+
+
+/**
+   @brief Allocates buffers and stores in them some info about transactions
+
+   Does the allocation because the caller cannot know the size itself.
+   Memory freeing is to be done by the caller (if the "str" member of the
+   LEX_STRING is not NULL); nothing is freed here, even when 1 is returned.
+   The caller has the intention of doing checkpoints.
+
+   @param[out]  str_act    pointer to where the allocated buffer,
+                           and its size, will be put; buffer will be filled
+                           with info about active transactions
+   @param[out]  str_com    same, but for committed transactions
+   @param[out]  min_rec_lsn  pointer to where the minimum non-zero rec_lsn of
+                           the active transactions (LSN_MAX if none) is put
+   @param[out]  min_first_undo_lsn pointer to where the minimum
+                           first_undo_lsn of all transactions will be put
+
+   @return Operation status
+     @retval 0      OK
+     @retval 1      Error (an allocation failed; LSN outputs are not set)
+*/
+
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+                                    LSN *min_rec_lsn, LSN *min_first_undo_lsn)
+{
+  my_bool error;
+  TRN *trn;
+  char *ptr;
+  uint stored_transactions= 0;
+  LSN minimum_rec_lsn= LSN_MAX, minimum_first_undo_lsn= LSN_MAX;
+  DBUG_ENTER("trnman_collect_transactions");
+
+  DBUG_ASSERT((NULL == str_act->str) && (NULL == str_com->str));
+
+  /* validate the use of read_non_atomic() in general: */
+  compile_time_assert((sizeof(LSN) == 8) && (sizeof(LSN_WITH_FLAGS) == 8));
+  pthread_mutex_lock(&LOCK_trn_list);
+  str_act->length= 2 + /* number of active transactions */
+    LSN_STORE_SIZE + /* minimum of their rec_lsn */
+    TRANSID_SIZE + /* current TrID generator value */
+    (2 + /* short id */
+     6 + /* long id */
+     LSN_STORE_SIZE + /* undo_lsn */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE /* first_undo_lsn */
+     ) * trnman_active_transactions;
+  str_com->length= 4 + /* number of committed transactions */
+    (6 + /* long id */
+#ifdef MARIA_VERSIONING /* not enabled yet */
+     LSN_STORE_SIZE + /* undo_purge_lsn */
+#endif
+     LSN_STORE_SIZE /* first_undo_lsn */
+     ) * trnman_committed_transactions;
+  if ((NULL == (str_act->str= my_malloc(str_act->length, MYF(MY_WME)))) ||
+      (NULL == (str_com->str= my_malloc(str_com->length, MYF(MY_WME)))))
+    goto err;
+  /* First, the active transactions; the 2 + LSN_STORE_SIZE header is filled later */
+  ptr= str_act->str + 2 + LSN_STORE_SIZE;
+  transid_store(ptr, global_trid_generator);
+  ptr+= TRANSID_SIZE;
+  for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
+  {
+    uint sid;
+    LSN rec_lsn, undo_lsn, first_undo_lsn;
+    pthread_mutex_lock(&trn->state_lock);
+    sid= trn->short_id;
+    pthread_mutex_unlock(&trn->state_lock);
+    if (sid == 0)
+    {
+      /*
+        Not even inited, has done nothing. Or it is the
+        dummy_transaction_object, which does only non-transactional
+        immediate-sync operations (CREATE/DROP/RENAME/REPAIR TABLE), and so
+        can be forgotten for Checkpoint.
+      */
+      continue;
+    }
+    /* needed for low-water mark calculation */
+    if (((rec_lsn= lsn_read_non_atomic(trn->rec_lsn)) > 0) &&
+        (cmp_translog_addr(rec_lsn, minimum_rec_lsn) < 0))
+      minimum_rec_lsn= rec_lsn;
+    /*
+      If trn has not logged LOGREC_LONG_TRANSACTION_ID, this trn will be
+      discovered when seeing that log record which is for sure located after
+      checkpoint_start_log_horizon.
+    */
+    if ((LSN_WITH_FLAGS_TO_FLAGS(trn->first_undo_lsn) &
+         TRANSACTION_LOGGED_LONG_ID) == 0)
+      continue;
+    /*
+      On the other hand, if undo_lsn is LSN_IMPOSSIBLE, trn may later log
+      records; so we must include trn in the checkpoint now, because we cannot
+      count on LOGREC_LONG_TRANSACTION_ID (as we are already past it).
+    */
+    undo_lsn= trn->undo_lsn;
+    stored_transactions++;
+    int2store(ptr, sid);
+    ptr+= 2;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+    lsn_store(ptr, undo_lsn); /* needed for rollback */
+    ptr+= LSN_STORE_SIZE;
+    /* needed for low-water mark calculation */
+    if (((first_undo_lsn= lsn_read_non_atomic(trn->first_undo_lsn)) > 0) &&
+        (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0))
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    /* to know where purging should start (last delete of this trn) */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    /**
+       @todo RECOVERY: add a comment explaining why we can dirtily read some
+       vars, inspired by the text of "assumption 8" in WL#3072
+    */
+  }
+  str_act->length= ptr - str_act->str; /* as we may have over-estimated */
+  ptr= str_act->str;
+  DBUG_PRINT("info",("collected %u active transactions",
+                     (uint)stored_transactions));
+  int2store(ptr, stored_transactions);
+  ptr+= 2;
+  /* this LSN influences how REDOs for any page can be ignored by Recovery */
+  lsn_store(ptr, minimum_rec_lsn);
+  /* one day there will also be a list of prepared transactions */
+  /* do the same for committed ones */
+  ptr= str_com->str;
+  int4store(ptr, trnman_committed_transactions);
+  ptr+= 4;
+  DBUG_PRINT("info",("collected %u committed transactions",
+                     (uint)trnman_committed_transactions));
+  for (trn= committed_list_min.next; trn != &committed_list_max;
+       trn= trn->next)
+  {
+    LSN first_undo_lsn;
+    int6store(ptr, trn->trid);
+    ptr+= 6;
+#ifdef MARIA_VERSIONING /* not enabled yet */
+    lsn_store(ptr, trn->undo_purge_lsn);
+    ptr+= LSN_STORE_SIZE;
+#endif
+    first_undo_lsn= LSN_WITH_FLAGS_TO_LSN(trn->first_undo_lsn);
+    if (cmp_translog_addr(first_undo_lsn, minimum_first_undo_lsn) < 0)
+      minimum_first_undo_lsn= first_undo_lsn;
+    lsn_store(ptr, first_undo_lsn);
+    ptr+= LSN_STORE_SIZE;
+  }
+  /*
+    TODO: if we see there exists no transaction (active and committed) we can
+    tell the lock-free structures to do some freeing (my_free()).
+  */
+  error= 0;
+  *min_rec_lsn= minimum_rec_lsn;
+  *min_first_undo_lsn= minimum_first_undo_lsn;
+  goto end;
+err:
+  error= 1;
+end:
+  pthread_mutex_unlock(&LOCK_trn_list);
+  DBUG_RETURN(error);
+}
+
+
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid)
+{ /* Recovery only: creates a TRN and gives it the logged short/long ids */
+  TrID old_trid_generator= global_trid_generator;
+  TRN *trn;
+  DBUG_ASSERT(maria_in_recovery && !maria_multi_threaded);
+  global_trid_generator= longid-1; /* force a correct trid in the new trn */
+  trn= trnman_new_trn(NULL);
+  global_trid_generator= old_trid_generator; /* undo, even if trn is NULL */
+  set_if_bigger(global_trid_generator, longid); /* never go below longid */
+  if (unlikely(trn == NULL))
+    return NULL;
+  short_trid_to_active_trn[trn->short_id]= 0; /* drop the auto-assigned slot */
+  DBUG_ASSERT(short_trid_to_active_trn[shortid] == NULL);
+  short_trid_to_active_trn[shortid]= trn;
+  trn->short_id= shortid;
+  return trn;
+}
+
+
+TRN *trnman_get_any_trn(void)
+{ /* first TRN of the active list (read without locking), NULL if empty */
+  TRN *trn= active_list_min.next;
+  return (trn != &active_list_max) ? trn : NULL;
+}
+
+
+/**
+  Returns the minimum existing transaction id (trid_min_read_from, read
+  without locking). May return a too small number in race conditions, but
+  this is ok as the value is used to remove not visible transid from index/rows.
+*/
+
+TrID trnman_get_min_trid(void)
+{
+  return trid_min_read_from;
+}
+
+
+/**
+  Returns the minimum possible transaction id
+
+  @note
+  If there are no transactions running, returns number for next running
+  transaction.
+  If one has an active transaction, the returned number will be less or
+  equal to this.  If one is not running in a transaction one will get the
+  number for the next started transaction.  This is used in create table
+  to get a safe minimum trid to use.
+*/
+
+TrID trnman_get_min_safe_trid(void)
+{
+  TrID trid;
+  pthread_mutex_lock(&LOCK_trn_list);
+  trid= min(active_list_min.next->min_read_from,
+            global_trid_generator);
+  pthread_mutex_unlock(&LOCK_trn_list);
+  return trid;
+}
+
+
+/**
+  Returns the maximum trid given to a transaction so far (0 if not inited).
+*/
+
+TrID trnman_get_max_trid(void)
+{
+  TrID id;
+  if (short_trid_to_active_trn == NULL)
+    return 0;
+  pthread_mutex_lock(&LOCK_trn_list);
+  id= global_trid_generator;
+  pthread_mutex_unlock(&LOCK_trn_list);
+  return id;
+}
+
+/**
+  @brief Tells whether some active transaction has its trid in a given range
+
+  The active list is walked under LOCK_trn_list; the mutex is taken (and
+  released) here unless the caller states that it already holds it.
+
+  @param min_id            exclusive lower bound for trn->trid
+  @param max_id            inclusive upper bound for trn->trid
+  @param trnman_is_locked  non-zero if the caller already owns LOCK_trn_list
+
+  @todo
+    Improve speed of this.
+      - Store transactions in tree or skip list
+      - Have function to copying all active transaction id's to b-tree
+        and use b-tree for checking states.  This could be a big win
+        for checkpoint that will call this function for a lot of objects.
+
+  @return
+    0   No transaction exists
+    1   There is at least one active transaction in the given range
+*/
+
+my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id,
+                                          my_bool trnman_is_locked)
+{
+  TRN *cur;
+  my_bool in_range= 0;
+
+  if (!trnman_is_locked)
+    pthread_mutex_lock(&LOCK_trn_list);
+  safe_mutex_assert_owner(&LOCK_trn_list);
+
+  /*
+    The upper bound is inclusive because max_id is a commit_trid, taken
+    from global_trid_generator, which grows with every new transaction,
+    while trn->trid is a transaction id.
+
+    Example with min_id = 5 and max_id = 10:
+    - trid 5 started before min_id 5 was committed, so it cannot see the
+      history event between 5 and 10;
+    - trid 10 started before the event with max_id 10 was committed, so
+      it cannot see that next event and must use this one.
+  */
+  for (cur= active_list_min.next;
+       !in_range && cur != &active_list_max;
+       cur= cur->next)
+    in_range= (min_id < cur->trid && cur->trid <= max_id);
+
+  if (!trnman_is_locked)
+    pthread_mutex_unlock(&LOCK_trn_list);
+  return in_range;
+}
+
+
+/**
+   Locks the transaction list mutex (LOCK_trn_list); pair with trnman_unlock()
+*/
+
+void trnman_lock(void)
+{
+  pthread_mutex_lock(&LOCK_trn_list);
+}
+
+
+/**
+   Unlocks the transaction list mutex (LOCK_trn_list) taken by trnman_lock()
+*/
+
+void trnman_unlock(void)
+{
+  pthread_mutex_unlock(&LOCK_trn_list);
+}
+
+
+/**
+  Tells whether trnman is initialized (short_trid_to_active_trn is not NULL)
+*/
+
+my_bool trman_is_inited(void)
+{
+  return (short_trid_to_active_trn != NULL);
+}
diff --git a/storage/maria/trnman.h b/storage/maria/trnman.h
new file mode 100644
index 00000000000..afe01d4ad10
--- /dev/null
+++ b/storage/maria/trnman.h
@@ -0,0 +1,67 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifndef _trnman_h
+#define _trnman_h
+
+C_MODE_START
+
+#include <lf.h>
+#include "trnman_public.h"
+#include "ma_loghandler_lsn.h"
+
+/**
+  trid - 6-byte transaction identifier. Assigned when a transaction
+  is created. A transaction can always be identified by its trid,
+  even after the transaction has ended.
+
+  short_id - 2-byte transaction identifier, identifies a running
+  transaction, is reassigned when the transaction ends.
+
+  when short_id is 0, TRN is not initialized, for all practical purposes
+  it could be considered unused.
+
+  when commit_trid is MAX_TRID the transaction is running, otherwise it's
+  committed.
+
+  state_lock mutex protects the state of a TRN, that is whether a TRN
+  is committed/running/unused. Meaning that modifications of short_id and
+  commit_trid happen under this mutex.
+*/
+
+struct st_ma_transaction
+{
+  LF_PINS              *pins;         /**< Pins given to lf_hash_search() */
+  WT_THD               *wt;
+  pthread_mutex_t      state_lock;    /**< See the description above */
+  void                 *used_tables;  /**< Tables used by transaction */
+  TRN                  *next, *prev;  /**< Links in a list of transactions */
+  TrID                 trid, min_read_from, commit_trid;
+  LSN		       rec_lsn, undo_lsn;
+  LSN_WITH_FLAGS       first_undo_lsn; /**< LSN plus flag bits (see below) */
+  uint                 locked_tables;
+  uint16               short_id;      /**< 0 means not initialized */
+  uint16               flags;         /**< Various flags */
+};
+
+#define TRANSACTION_LOGGED_LONG_ID ULL(0x8000000000000000) /* first_undo_lsn flag */
+#define MAX_TRID (~(TrID)0) /* commit_trid of a running transaction */
+
+extern WT_RESOURCE_TYPE ma_rc_dup_unique;
+
+C_MODE_END
+
+#endif
+
diff --git a/storage/maria/trnman_public.h b/storage/maria/trnman_public.h
new file mode 100644
index 00000000000..9523eb5de8f
--- /dev/null
+++ b/storage/maria/trnman_public.h
@@ -0,0 +1,85 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+/*
+  External definitions for trnman.h
+  We need to split this into two files as gcc 4.1.2 gives error if it tries
+  to include my_atomic.h in C++ code.
+*/
+
+#ifndef _trnman_public_h
+#define _trnman_public_h
+
+#include "ma_loghandler_lsn.h"
+#include <waiting_threads.h>
+
+C_MODE_START
+typedef uint64 TrID; /* our TrID is 6 bytes */
+typedef struct st_ma_transaction TRN;
+
+#define SHORT_TRID_MAX 65535
+
+extern uint trnman_active_transactions, trnman_allocated_transactions;
+extern TRN dummy_transaction_object;
+extern my_bool (*trnman_end_trans_hook)(TRN *trn, my_bool commit,
+                                        my_bool active_transactions);
+
+int trnman_init(TrID);
+void trnman_destroy(void);
+TRN *trnman_new_trn(WT_THD *wt);
+my_bool trnman_end_trn(TRN *trn, my_bool commit);
+#define trnman_commit_trn(T) trnman_end_trn(T, TRUE)
+#define trnman_abort_trn(T)  trnman_end_trn(T, FALSE)
+#define trnman_rollback_trn(T)  trnman_end_trn(T, FALSE)
+int trnman_can_read_from(TRN *trn, TrID trid);
+TRN *trnman_trid_to_trn(TRN *trn, TrID trid);
+void trnman_new_statement(TRN *trn);
+void trnman_rollback_statement(TRN *trn);
+my_bool trnman_collect_transactions(LEX_STRING *str_act, LEX_STRING *str_com,
+                                    LSN *min_rec_lsn,
+                                    LSN *min_first_undo_lsn);
+
+uint trnman_increment_locked_tables(TRN *trn);
+uint trnman_decrement_locked_tables(TRN *trn);
+uint trnman_has_locked_tables(TRN *trn);
+void trnman_reset_locked_tables(TRN *trn, uint locked_tables);
+TRN *trnman_recreate_trn_from_recovery(uint16 shortid, TrID longid);
+TRN *trnman_get_any_trn(void);
+TrID trnman_get_min_trid(void);
+TrID trnman_get_max_trid(void);
+TrID trnman_get_min_safe_trid(void);
+my_bool trnman_exists_active_transactions(TrID min_id, TrID max_id,
+                                          my_bool trnman_is_locked);
+#define TRANSID_SIZE		6 /* bytes written by transid_store() */
+#define transid_store(dst, id) int6store(dst,id)
+#define transid_korr(P) uint6korr(P)
+void trnman_lock(void);
+void trnman_unlock(void);
+my_bool trman_is_inited(void);
+#ifdef EXTRA_DEBUG
+uint16 trnman_get_flags(TRN *);
+void trnman_set_flags(TRN *, uint16 flags);
+#else
+#define trnman_get_flags(A) 0
+#define trnman_set_flags(A, B) do { } while (0)
+#endif
+
+/* Flag bits */
+#define TRN_STATE_INFO_LOGGED       1  /* Query is logged */
+#define TRN_STATE_TABLES_CAN_CHANGE 2  /* Things can change during trans. */
+
+C_MODE_END
+#endif
diff --git a/storage/maria/unittest/CMakeLists.txt b/storage/maria/unittest/CMakeLists.txt
new file mode 100644
index 00000000000..fe6327c6ea3
--- /dev/null
+++ b/storage/maria/unittest/CMakeLists.txt
@@ -0,0 +1,95 @@
+# Copyright (C) 2007 MySQL AB
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include ${CMAKE_SOURCE_DIR}/zlib
+                    ${CMAKE_SOURCE_DIR}/unittest/mytap)
+LINK_LIBRARIES(aria myisam mytap mysys dbug strings wsock32 zlib)
+
+ADD_EXECUTABLE(ma_control_file-t ma_control_file-t.c)
+ADD_EXECUTABLE(trnman-t trnman-t.c)
+ADD_EXECUTABLE(ma_test_loghandler-t
+	ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_multigroup-t
+	ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c)
+ADD_EXECUTABLE(ma_test_loghandler_multithread-t
+	ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_pagecache-t
+	ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_long-t
+	ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+SET_TARGET_PROPERTIES(ma_test_loghandler_long-t  PROPERTIES COMPILE_FLAGS "-DLONG_LOG_TEST")
+
+ADD_EXECUTABLE(ma_test_loghandler_noflush-t
+	ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_first_lsn-t
+	ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_max_lsn-t
+	ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_purge-t
+	ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+ADD_EXECUTABLE(ma_test_loghandler_readonly-t
+	ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c  sequence_storage.c)
+SET_TARGET_PROPERTIES(ma_test_loghandler_readonly-t  PROPERTIES COMPILE_FLAGS "-DREADONLY_TEST")
+ADD_EXECUTABLE(ma_test_loghandler_nologs-t
+	ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c)
+
+# Page cache tests: the same sources are built several times with different -D flags
+SET(ma_pagecache_single_src	ma_pagecache_single.c test_file.c test_file.h)
+SET(ma_pagecache_consist_src ma_pagecache_consist.c test_file.c test_file.h)
+SET(ma_pagecache_common_cppflags "-DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN")
+
+ADD_EXECUTABLE(ma_pagecache_single_1k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_1k-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024")
+
+ADD_EXECUTABLE(ma_pagecache_single_8k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_8k-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=8192")
+
+# -DBIG: same flags as ma_pagecache_single_64k_t_CPPFLAGS in Makefile.am
+ADD_EXECUTABLE(ma_pagecache_single_64k-t ${ma_pagecache_single_src})
+SET_TARGET_PROPERTIES(ma_pagecache_single_64k-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DBIG")
+
+ADD_EXECUTABLE(ma_pagecache_consist_1k-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1k-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024")
+
+ADD_EXECUTABLE(ma_pagecache_consist_64k-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64k-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536")
+
+ADD_EXECUTABLE(ma_pagecache_consist_1kHC-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kHC-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY")
+ADD_EXECUTABLE(ma_pagecache_consist_64kHC-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kHC-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY")
+ADD_EXECUTABLE(ma_pagecache_consist_1kRD-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kRD-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_READERS")
+ADD_EXECUTABLE(ma_pagecache_consist_64kRD-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kRD-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_READERS")
+ADD_EXECUTABLE(ma_pagecache_consist_1kWR-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_1kWR-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS")
+ADD_EXECUTABLE(ma_pagecache_consist_64kWR-t ${ma_pagecache_consist_src})
+SET_TARGET_PROPERTIES(ma_pagecache_consist_64kWR-t
+	PROPERTIES COMPILE_FLAGS "${ma_pagecache_common_cppflags} -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS")
+ADD_EXECUTABLE(ma_pagecache_rwconsist_1k-t ma_pagecache_rwconsist.c)
+SET_TARGET_PROPERTIES(ma_pagecache_rwconsist_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024")
+ADD_EXECUTABLE(ma_pagecache_rwconsist2_1k-t ma_pagecache_rwconsist2.c)
+SET_TARGET_PROPERTIES(ma_pagecache_rwconsist2_1k-t PROPERTIES COMPILE_FLAGS "-DTEST_PAGE_SIZE=1024")
diff --git a/storage/maria/unittest/Makefile.am b/storage/maria/unittest/Makefile.am
new file mode 100644
index 00000000000..b5bc8587066
--- /dev/null
+++ b/storage/maria/unittest/Makefile.am
@@ -0,0 +1,115 @@
+# Copyright (C) 2006-2008 MySQL AB
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+AM_CPPFLAGS      = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+                  -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+INCLUDES         = @ZLIB_INCLUDES@ -I$(top_builddir)/include \
+                  -I$(top_srcdir)/include -I$(top_srcdir)/unittest/mytap
+EXTRA_DIST=	ma_test_all-t CMakeLists.txt \
+		ma_test_recovery.pl ma_test_recovery.expected
+# The only reason to link with libmyisam.a here is that some fulltext pieces
+# live there (but soon we'll remove fulltext dependencies from Aria).
+LDADD=			$(top_builddir)/unittest/mytap/libmytap.a \
+			$(top_builddir)/storage/maria/libaria.a \
+                        $(top_builddir)/storage/myisam/libmyisam.a \
+			$(top_builddir)/mysys/libmysys.a \
+			$(top_builddir)/dbug/libdbug.a \
+			$(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@
+noinst_PROGRAMS =	ma_control_file-t trnman-t \
+			ma_pagecache_single_1k-t ma_pagecache_single_8k-t \
+			ma_pagecache_single_64k-t \
+			ma_pagecache_consist_1k-t \
+			ma_pagecache_consist_64k-t \
+			ma_pagecache_consist_1kHC-t \
+			ma_pagecache_consist_64kHC-t \
+			ma_pagecache_consist_1kRD-t \
+			ma_pagecache_consist_64kRD-t \
+			ma_pagecache_consist_1kWR-t \
+			ma_pagecache_consist_64kWR-t \
+                        ma_pagecache_rwconsist_1k-t \
+                        ma_pagecache_rwconsist2_1k-t \
+			ma_test_loghandler-t \
+                        ma_test_loghandler_multigroup-t \
+			ma_test_loghandler_multithread-t \
+			ma_test_loghandler_multiflush-t \
+			ma_test_loghandler_pagecache-t \
+			ma_test_loghandler_long-t \
+			ma_test_loghandler_noflush-t \
+			ma_test_loghandler_first_lsn-t \
+                        ma_test_loghandler_max_lsn-t \
+                        ma_test_loghandler_purge-t \
+			ma_test_loghandler_readonly-t\
+                        ma_test_loghandler_nologs-t
+
+ma_test_loghandler_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multigroup_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h
+ma_test_loghandler_multithread_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multiflush_t_SOURCES = ma_test_loghandler_multithread-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_multiflush_t_CPPFLAGS = -DMULTIFLUSH_TEST
+ma_test_loghandler_pagecache_t_SOURCES = ma_test_loghandler_pagecache-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_long_t_SOURCES = ma_test_loghandler-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_long_t_CPPFLAGS = -DLONG_LOG_TEST
+ma_test_loghandler_noflush_t_SOURCES = ma_test_loghandler_noflush-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_first_lsn_t_SOURCES = ma_test_loghandler_first_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_max_lsn_t_SOURCES = ma_test_loghandler_max_lsn-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_purge_t_SOURCES = ma_test_loghandler_purge-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+ma_test_loghandler_readonly_t_SOURCES = ma_test_loghandler_multigroup-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c sequence_storage.c sequence_storage.h
+ma_test_loghandler_readonly_t_CPPFLAGS = -DREADONLY_TEST
+ma_test_loghandler_nologs_t_SOURCES = ma_test_loghandler_nologs-t.c ma_maria_log_cleanup.c ma_loghandler_examples.c
+
+ma_pagecache_single_src =	ma_pagecache_single.c test_file.c test_file.h
+ma_pagecache_consist_src =	ma_pagecache_consist.c test_file.c test_file.h
+ma_pagecache_common_cppflags =	-DEXTRA_DEBUG -DPAGECACHE_DEBUG -DMAIN
+
+ma_pagecache_single_1k_t_SOURCES =	$(ma_pagecache_single_src)
+ma_pagecache_single_8k_t_SOURCES =	$(ma_pagecache_single_src)
+ma_pagecache_single_64k_t_SOURCES =	$(ma_pagecache_single_src)
+ma_pagecache_single_1k_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024
+ma_pagecache_single_8k_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=8192
+ma_pagecache_single_64k_t_CPPFLAGS =	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DBIG
+
+ma_pagecache_consist_1k_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_1k_t_CPPFLAGS =	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024
+ma_pagecache_consist_64k_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_64k_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536
+
+ma_pagecache_consist_1kHC_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_1kHC_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_HIGH_CONCURENCY
+ma_pagecache_consist_64kHC_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_64kHC_t_CPPFLAGS =	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_HIGH_CONCURENCY
+
+ma_pagecache_consist_1kRD_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_1kRD_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_READERS
+ma_pagecache_consist_64kRD_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_64kRD_t_CPPFLAGS =	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_READERS
+
+ma_pagecache_consist_1kWR_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_1kWR_t_CPPFLAGS = 	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=1024 -DTEST_WRITERS
+ma_pagecache_consist_64kWR_t_SOURCES =	$(ma_pagecache_consist_src)
+ma_pagecache_consist_64kWR_t_CPPFLAGS =	$(ma_pagecache_common_cppflags) -DTEST_PAGE_SIZE=65536 -DTEST_WRITERS
+
+ma_pagecache_rwconsist_1k_t_SOURCES =	ma_pagecache_rwconsist.c
+ma_pagecache_rwconsist_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024
+ma_pagecache_rwconsist2_1k_t_SOURCES =	ma_pagecache_rwconsist2.c
+ma_pagecache_rwconsist2_1k_t_CPPFLAGS = -DTEST_PAGE_SIZE=1024
+
+# The generic lock manager may not be used in the end, lockman1-t crashes
+# and lockman2-t takes at least a quarter of an hour, so lockman-t,
+# lockman1-t and lockman2-t are not built (none is listed in noinst_PROGRAMS).
+CLEANFILES =		aria_log_control page_cache_test_file_1 \
+			aria_log.????????
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/maria/unittest/lockman-t.c b/storage/maria/unittest/lockman-t.c
new file mode 100644
index 00000000000..9b54a3d8ff9
--- /dev/null
+++ b/storage/maria/unittest/lockman-t.c
@@ -0,0 +1,308 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  lockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+/*
+  Shared fixtures: Nlos lock owners, one mutex and one condition variable
+  per owner (main() attaches them), and the lock manager under test.
+*/
+#define Nlos 100
+LOCK_OWNER loarray[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKMAN lockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lockhash(X)       /* no-op */
+#define DIAG(X)                 /* no-op */
+#else
+#define DIAG(X) diag X          /* X is a parenthesised diag() argument list */
+#endif
+
+/*
+  Translate a lock owner id into its slot of loarray.
+  Ids are 1-based (main() stores i+1 in loarray[i].loid), so id N lives at
+  index N-1.  main() hands this function to lockman_init().
+*/
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+  return &loarray[loid - 1];
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks");              \
+  lockman_release_locks(&lockman, loid2lo(O));print_lockhash(&lockman)
+#define test_lock(O, R, L, S, RES)                                      \
+  ok(lockman_getlock(&lockman, loid2lo(O), R, L) == RES,                \
+     "lo" #O "> " S "lock resource " #R " with " #L "-lock");           \
+  print_lockhash(&lockman)
+#define lock_ok_a(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L)                                          \
+  test_lock(O, R, L, "cannot ", DIDNT_GET_THE_LOCK);
+/*
+  Single-threaded scenarios, one TAP check per lock request (33 in total).
+  The macro name encodes the result the test expects from lockman_getlock():
+    lock_ok_a      GOT_THE_LOCK
+    lock_ok_i      GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+    lock_ok_l      GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+    lock_conflict  DIDNT_GET_THE_LOCK
+  Arguments are (owner id, resource id, lock type).  unlock_all(N) releases
+  every lock held by owner N.
+*/
+void test_lockman_simple()
+{
+  /* simple */
+  lock_ok_a(1, 1, S);
+  lock_ok_i(2, 2, IS);
+  lock_ok_i(1, 2, IX);
+  /* lock escalation */
+  lock_ok_a(1, 1, X);
+  lock_ok_i(2, 2, IX);
+  /* failures */
+  lock_conflict(2, 1, X);
+  unlock_all(2);
+  lock_ok_a(1, 2, S);
+  lock_ok_a(1, 2, IS);
+  lock_ok_a(1, 2, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_a(2, 3, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_l(2, 3, IS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_conflict(2, 1, S);
+  lock_ok_a(1, 1, LS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_a(2, 1, LS);
+  lock_ok_a(1, 1, LS);
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(3, 1, IS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+
+  lock_ok_i(1, 4, IS);
+  lock_ok_i(2, 4, IS);
+  lock_ok_i(3, 4, IS);
+  lock_ok_a(3, 4, LS);
+  lock_ok_i(4, 4, IS);
+  lock_conflict(4, 4, IX);
+  lock_conflict(2, 4, IX);
+  lock_ok_a(1, 4, LS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+  unlock_all(4);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(2, 1, IX);
+  lock_conflict(1, 1, S);
+  lock_conflict(2, 1, X);
+  unlock_all(1);
+  unlock_all(2);
+}
+
+int rt_num_threads;     /* workers still running; decremented under rt_mutex */
+int litmus;             /* run_test() reports a failure when this is non-zero */
+int thread_number= 0, timeouts= 0;  /* both are reset at the start of a run */
+
+/*
+  Run one multi-threaded test and report it as a single TAP check.
+
+  test     name printed in the diagnostics
+  handler  thread function; it receives a pointer to the iteration count
+  n        number of threads to start
+  m        number of iterations for every thread
+
+  Aborts the process if memory or a thread cannot be obtained.
+*/
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+  pthread_t *threads;
+  ulonglong now= my_getsystime();
+  int i;
+
+  thread_number= timeouts= 0;
+  litmus= 0;
+
+  /*
+    The array holds pthread_t values, so size it by the element type:
+    pthread_t is not guaranteed to have the size of a pointer.
+  */
+  threads= (pthread_t *)my_malloc(sizeof(*threads)*n, MYF(0));
+  if (!threads)
+  {
+    diag("Out of memory");
+    abort();
+  }
+
+  diag("Running %s with %d threads, %d iterations... ", test, n, m);
+  rt_num_threads= n;
+  /* &m stays valid: every thread is joined before this function returns */
+  for (i= 0; i < n ; i++)
+    if (pthread_create(threads+i, 0, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  for (i= 0 ; i < n ; i++)
+    pthread_join(threads[i], 0);
+  now= my_getsystime()-now;
+  ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+  my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;   /* guards thread_number, rt_num_threads, timeouts */
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;   /* 0 means: never request a table lock */
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[4]= {
+  "DIDN'T GET THE LOCK",
+  "GOT THE LOCK",
+  "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+  "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+
+/*
+  Worker thread of the stress tests.
+
+  arg points to the number of iterations.  Each iteration derives a table,
+  a row and a lock level from a pseudo-random number and requests either a
+  table lock or an intention lock on the table followed by a row lock.
+  Whenever a request is refused the thread releases all its locks and
+  counts a timeout.  Always returns 0.
+*/
+pthread_handler_t test_lockman(void *arg)
+{
+  int    m= (*(int *)arg);
+  uint   x, loid, row, table, res, locklevel, timeout= 0;
+  LOCK_OWNER *lo;
+
+  /* claim a unique 1-based owner id, i.e. a private slot of loarray */
+  pthread_mutex_lock(&rt_mutex);
+  loid= ++thread_number;
+  pthread_mutex_unlock(&rt_mutex);
+  lo= loid2lo(loid);
+
+  /* the address of a local variable seeds the generator */
+  for (x= ((int)(intptr)(&m)); m > 0; m--)
+  {
+    /*
+      Three prime numbers.  The arithmetic is done in ulonglong: the
+      product x*3628273133 can exceed the range of a signed 64-bit integer,
+      which would be a signed overflow.
+    */
+    x= (uint) (((ulonglong) x*LL(3628273133) + LL(1500450271)) %
+               LL(9576890767));
+    row=  x % Nrows + Ntables;
+    table= row % Ntables;
+    locklevel= (x/Nrows) & 3;
+    if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+    { /* table lock */
+      res= lockman_getlock(&lockman, lo, table, lock_array[locklevel]);
+      DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+            lock2str[locklevel], res2str[res]));
+      if (res == DIDNT_GET_THE_LOCK)
+      {
+        lockman_release_locks(&lockman, lo);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      }
+      DBUG_ASSERT(res == GOT_THE_LOCK);
+    }
+    else
+    { /* row lock */
+      locklevel&= 1;
+      /* first an intention lock (IS or IX) on the table ... */
+      res= lockman_getlock(&lockman, lo, table, lock_array[locklevel + 4]);
+      DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+            lock2str[locklevel+4], res2str[res]));
+      switch (res)
+      {
+      case DIDNT_GET_THE_LOCK:
+        lockman_release_locks(&lockman, lo);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      case GOT_THE_LOCK:
+        continue;
+      case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+        /* not implemented, so take a regular lock */
+      case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+        /* ... then the matching S or X lock on the row itself */
+        res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+        DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+              lock2str[locklevel], res2str[res]));
+        if (res == DIDNT_GET_THE_LOCK)
+        {
+          lockman_release_locks(&lockman, lo);
+          DIAG(("loid %2d, release all locks", loid));
+          timeout++;
+          continue;
+        }
+        DBUG_ASSERT(res == GOT_THE_LOCK);
+        continue;
+      default:
+        DBUG_ASSERT(0);
+      }
+    }
+  }
+
+  lockman_release_locks(&lockman, lo);
+
+  /* the last thread to finish prints the accumulated timeout count */
+  pthread_mutex_lock(&rt_mutex);
+  rt_num_threads--;
+  timeouts+= timeout;
+  if (!rt_num_threads)
+    diag("number of timeouts: %d", timeouts);
+  pthread_mutex_unlock(&rt_mutex);
+
+  return 0;
+}
+/*
+  Test driver: sets up the lock manager and Nlos lock owners, runs the
+  single-threaded scenarios and two multi-threaded stress runs, then tears
+  everything down and returns the TAP exit status.
+*/
+int main()
+{
+  int i;
+
+  my_init();
+  pthread_mutex_init(&rt_mutex, 0);
+
+  plan(35); /* 33 checks in test_lockman_simple() + one per run_test() call */
+
+  if (my_atomic_initialize())
+    return exit_status();
+
+
+  lockman_init(&lockman, &loid2lo, 50);
+
+  for (i= 0; i < Nlos; i++)
+  {
+    loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+    loarray[i].all_locks= 0;
+    loarray[i].waiting_for= 0;
+    pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+    pthread_cond_init (&conds[i], 0);
+    loarray[i].mutex= &mutexes[i];
+    loarray[i].cond= &conds[i];
+    loarray[i].loid= i+1; /* ids are 1-based, see loid2lo() */
+  }
+
+  test_lockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line: every worker thread takes
+                        its own loarray slot, and there are only Nlos */
+
+  /* mixed load, stress-test with random locks */
+  Nrows= 100;
+  Ntables= 10;
+  table_lock_ratio= 10;
+  run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+  /* "real-life" simulation - many rows, no table locks */
+  Nrows= 1000000;
+  Ntables= 10;
+  table_lock_ratio= 0;
+  run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+  for (i= 0; i < Nlos; i++)
+  {
+    lockman_release_locks(&lockman, &loarray[i]);
+    pthread_mutex_destroy(loarray[i].mutex);
+    pthread_cond_destroy(loarray[i].cond);
+    lf_pinbox_put_pins(loarray[i].pins);
+  }
+
+  { /* time the destruction of the lock manager */
+    ulonglong now= my_getsystime();
+    lockman_destroy(&lockman);
+    now= my_getsystime()-now;
+    diag("lockman_destroy: %g secs", ((double)now)/1e7);
+  }
+
+  pthread_mutex_destroy(&rt_mutex);
+  my_end(0);
+  return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman1-t.c b/storage/maria/unittest/lockman1-t.c
new file mode 100644
index 00000000000..ca959c6e6e3
--- /dev/null
+++ b/storage/maria/unittest/lockman1-t.c
@@ -0,0 +1,334 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  lockman for row locks, tablockman for table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../lockman.h"
+#include "../tablockman.h"
+/*
+  Shared fixtures: Nlos owners for each of the two lock managers (the two
+  owner arrays share one mutex and one condition variable per index, see
+  main()), Ntbls lockable tables, and the two managers under test.
+*/
+#define Nlos 100
+#define Ntbls 10
+LOCK_OWNER loarray[Nlos];
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+LOCKMAN lockman;
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X)       /* no-op */
+#define DIAG(X)            /* no-op */
+#else
+#define DIAG(X) diag X     /* X is a parenthesised diag() argument list */
+#endif
+
+/*
+  Translate a lock owner id into its slot of loarray (row lock manager).
+  Ids are 1-based, so id N lives at index N-1.
+*/
+LOCK_OWNER *loid2lo(uint16 loid)
+{
+  return &loarray[loid - 1];
+}
+/*
+  Translate a lock owner id into its slot of loarray1 (table lock manager).
+  Ids are 1-based, so id N lives at index N-1.
+*/
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+  return &loarray1[loid - 1];
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks");              \
+  tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES)                                      \
+  ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES,   \
+     "lo" #O "> " S "lock resource " #R " with " #L "-lock");           \
+  print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L)                                          \
+  test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+/*
+  Single-threaded scenarios, one TAP check per lock request (33 in total).
+  The macro name encodes the result the test expects from
+  tablockman_getlock():
+    lock_ok_a      GOT_THE_LOCK
+    lock_ok_i      GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+    lock_ok_l      GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+    lock_conflict  LOCK_TIMEOUT
+  Arguments are (owner id, index into ltarray, lock type).  unlock_all(N)
+  releases every table lock held by owner N.
+*/
+void test_tablockman_simple()
+{
+  /* simple */
+  lock_ok_a(1, 1, S);
+  lock_ok_i(2, 2, IS);
+  lock_ok_i(1, 2, IX);
+  /* lock escalation */
+  lock_ok_a(1, 1, X);
+  lock_ok_i(2, 2, IX);
+  /* failures */
+  lock_conflict(2, 1, X);
+  unlock_all(2);
+  lock_ok_a(1, 2, S);
+  lock_ok_a(1, 2, IS);
+  lock_ok_a(1, 2, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_a(2, 3, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_l(2, 3, IS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_conflict(2, 1, S);
+  lock_ok_a(1, 1, LS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_a(2, 1, LS);
+  lock_ok_a(1, 1, LS);
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(3, 1, IS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+
+  lock_ok_i(1, 4, IS);
+  lock_ok_i(2, 4, IS);
+  lock_ok_i(3, 4, IS);
+  lock_ok_a(3, 4, LS);
+  lock_ok_i(4, 4, IS);
+  lock_conflict(4, 4, IX);
+  lock_conflict(2, 4, IX);
+  lock_ok_a(1, 4, LS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+  unlock_all(4);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(2, 1, IX);
+  lock_conflict(1, 1, S);
+  lock_conflict(2, 1, X);
+  unlock_all(1);
+  unlock_all(2);
+}
+
+int rt_num_threads;     /* workers still running; decremented under rt_mutex */
+int litmus;             /* run_test() reports a failure when this is non-zero */
+int thread_number= 0, timeouts= 0;  /* both are reset at the start of a run */
+
+/*
+  Run one multi-threaded test and report it as a single TAP check.
+
+  test     name printed in the diagnostics
+  handler  thread function; it receives a pointer to the iteration count
+  n        number of threads to start
+  m        number of iterations for every thread
+
+  Aborts the process if memory or a thread cannot be obtained.
+*/
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+  pthread_t *threads;
+  ulonglong now= my_getsystime();
+  int i;
+
+  thread_number= timeouts= 0;
+  litmus= 0;
+
+  /*
+    The array holds pthread_t values, so size it by the element type:
+    pthread_t is not guaranteed to have the size of a pointer.
+  */
+  threads= (pthread_t *)my_malloc(sizeof(*threads)*n, MYF(0));
+  if (!threads)
+  {
+    diag("Out of memory");
+    abort();
+  }
+
+  diag("Running %s with %d threads, %d iterations... ", test, n, m);
+  rt_num_threads= n;
+  /* &m stays valid: every thread is joined before this function returns */
+  for (i= 0; i < n ; i++)
+    if (pthread_create(threads+i, 0, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  for (i= 0 ; i < n ; i++)
+    pthread_join(threads[i], 0);
+  now= my_getsystime()-now;
+  ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+  my_free((void*)threads, MYF(0));
+}
+
+pthread_mutex_t rt_mutex;   /* guards thread_number, rt_num_threads, timeouts */
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;   /* 0 means: never request a table lock */
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+char *res2str[]= {
+  "DIDN'T GET THE LOCK",
+  "OUT OF MEMORY",
+  "DEADLOCK",
+  "LOCK TIMEOUT",
+  "GOT THE LOCK",
+  "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+  "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+
+/*
+  Worker thread of the stress tests.
+
+  arg points to the number of iterations.  Table locks and intention locks
+  go through tablockman, row locks through lockman.  Whenever a request is
+  refused the thread releases its locks in both managers and counts a
+  timeout.  Always returns 0.
+*/
+pthread_handler_t test_lockman(void *arg)
+{
+  int    m= (*(int *)arg);
+  uint   x, loid, row, table, res, locklevel, timeout= 0;
+  LOCK_OWNER *lo;
+  TABLE_LOCK_OWNER *lo1;
+  DBUG_ASSERT(Ntables <= Ntbls);
+
+  /* claim a unique 1-based owner id, i.e. a private slot in both arrays */
+  pthread_mutex_lock(&rt_mutex);
+  loid= ++thread_number;
+  pthread_mutex_unlock(&rt_mutex);
+  lo= loid2lo(loid);
+  lo1= loid2lo1(loid);
+
+  /* the address of a local variable seeds the generator */
+  for (x= ((int)(intptr)(&m)); m > 0; m--)
+  {
+    /*
+      Three prime numbers.  The arithmetic is done in ulonglong: the
+      product x*3628273133 can exceed the range of a signed 64-bit integer,
+      which would be a signed overflow.
+    */
+    x= (uint) (((ulonglong) x*LL(3628273133) + LL(1500450271)) %
+               LL(9576890767));
+    row=  x % Nrows + Ntables;
+    table= row % Ntables;
+    locklevel= (x/Nrows) & 3;
+    if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+    { /* table lock */
+      res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+                              lock_array[locklevel]);
+      DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+            lock2str[locklevel], res2str[res]));
+      if (res < GOT_THE_LOCK)
+      {
+        lockman_release_locks(&lockman, lo);
+        tablockman_release_locks(&tablockman, lo1);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      }
+      DBUG_ASSERT(res == GOT_THE_LOCK);
+    }
+    else
+    { /* row lock */
+      locklevel&= 1;
+      /* first an intention lock (IS or IX) on the table ... */
+      res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+                              lock_array[locklevel + 4]);
+      DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+            lock2str[locklevel+4], res2str[res]));
+      switch (res)
+      {
+      case GOT_THE_LOCK:
+        continue;
+      case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+        /* not implemented, so take a regular lock */
+      case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+        /* ... then the matching S or X lock on the row, via lockman */
+        res= lockman_getlock(&lockman, lo, row, lock_array[locklevel]);
+        DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+              lock2str[locklevel], res2str[res]));
+        if (res == DIDNT_GET_THE_LOCK)
+        {
+          lockman_release_locks(&lockman, lo);
+          tablockman_release_locks(&tablockman, lo1);
+          DIAG(("loid %2d, release all locks", loid));
+          timeout++;
+          continue;
+        }
+        DBUG_ASSERT(res == GOT_THE_LOCK);
+        continue;
+      default:
+        /* any other result is handled as a refused request */
+        lockman_release_locks(&lockman, lo);
+        tablockman_release_locks(&tablockman, lo1);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      }
+    }
+  }
+
+  lockman_release_locks(&lockman, lo);
+  tablockman_release_locks(&tablockman, lo1);
+
+  /* the last thread to finish prints the accumulated timeout count */
+  pthread_mutex_lock(&rt_mutex);
+  rt_num_threads--;
+  timeouts+= timeout;
+  if (!rt_num_threads)
+    diag("number of timeouts: %d", timeouts);
+  pthread_mutex_unlock(&rt_mutex);
+
+  return 0;
+}
+
+/*
+  Test driver: sets up both lock managers, Nlos owners for each of them and
+  Ntbls lockable tables, runs the single-threaded scenarios and two
+  multi-threaded stress runs, then tears everything down and returns the
+  TAP exit status.
+*/
+int main()
+{
+  int i;
+
+  my_init();
+  pthread_mutex_init(&rt_mutex, 0);
+
+  /* 33 checks in test_tablockman_simple() + one per run_test() call */
+  plan(35);
+
+  if (my_atomic_initialize())
+    return exit_status();
+
+
+  lockman_init(&lockman, &loid2lo, 50);
+  tablockman_init(&tablockman, &loid2lo1, 50);
+
+  for (i= 0; i < Nlos; i++)
+  {
+    pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+    pthread_cond_init (&conds[i], 0);
+
+    loarray[i].pins= lf_alloc_get_pins(&lockman.alloc);
+    loarray[i].all_locks= 0;
+    loarray[i].waiting_for= 0;
+    loarray[i].mutex= &mutexes[i];
+    loarray[i].cond= &conds[i];
+    loarray[i].loid= i+1;
+
+    /* the table lock owner shares mutex, condition and id with loarray[i] */
+    loarray1[i].active_locks= 0;
+    loarray1[i].waiting_lock= 0;
+    loarray1[i].waiting_for= 0;
+    loarray1[i].mutex= &mutexes[i];
+    loarray1[i].cond= &conds[i];
+    loarray1[i].loid= i+1;
+  }
+
+  for (i= 0; i < Ntbls; i++)
+  {
+    tablockman_init_locked_table(ltarray+i, Nlos);
+  }
+
+  test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line */
+
+  /* mixed load, stress-test with random locks */
+  Nrows= 100;
+  Ntables= 10;
+  table_lock_ratio= 10;
+  run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+
+  /* "real-life" simulation - many rows, no table locks */
+  Nrows= 1000000;
+  Ntables= 10;
+  table_lock_ratio= 0;
+  run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+
+  for (i= 0; i < Nlos; i++)
+  {
+    lockman_release_locks(&lockman, &loarray[i]);
+    tablockman_release_locks(&tablockman, &loarray1[i]);
+    /* mutex and condition are shared with loarray1[i]: destroy them once */
+    pthread_mutex_destroy(loarray[i].mutex);
+    pthread_cond_destroy(loarray[i].cond);
+    lf_pinbox_put_pins(loarray[i].pins);
+  }
+
+  {
+    ulonglong now= my_getsystime();
+    lockman_destroy(&lockman);
+    /* everything set up by the tablockman_init*() calls above goes too */
+    for (i= 0; i < Ntbls; i++)
+    {
+      tablockman_destroy_locked_table(ltarray+i);
+    }
+    tablockman_destroy(&tablockman);
+    now= my_getsystime()-now;
+    diag("lockman_destroy: %g secs", ((double)now)/1e7);
+  }
+
+  pthread_mutex_destroy(&rt_mutex);
+  my_end(0);
+  return exit_status();
+}
+
diff --git a/storage/maria/unittest/lockman2-t.c b/storage/maria/unittest/lockman2-t.c
new file mode 100644
index 00000000000..c1d40159500
--- /dev/null
+++ b/storage/maria/unittest/lockman2-t.c
@@ -0,0 +1,361 @@
+/* Copyright (C) 2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  tablockman for row and table locks
+*/
+
+/* #define EXTRA_VERBOSE */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include "../tablockman.h"
+/*
+  Shared fixtures: Nlos lock owners with one mutex and one condition
+  variable each (main() attaches them), Ntbls lockable entries and the
+  lock manager under test.  Rows are locked as entries of ltarray too
+  (test_lockman() indexes it by row), so Ntbls has to cover
+  Nrows + Ntables.
+*/
+#define Nlos 100
+#define Ntbls 110
+TABLE_LOCK_OWNER loarray1[Nlos];
+pthread_mutex_t mutexes[Nlos];
+pthread_cond_t conds[Nlos];
+LOCKED_TABLE ltarray[Ntbls];
+TABLOCKMAN tablockman;
+
+#ifndef EXTRA_VERBOSE
+#define print_lo1(X)       /* no-op */
+#define DIAG(X)            /* no-op */
+#else
+#define DIAG(X) diag X     /* X is a parenthesised diag() argument list */
+#endif
+
+/*
+  Translate a lock owner id into its slot of loarray1.
+  Ids are 1-based (main() stores i+1 in loarray1[i].loid), so id N lives at
+  index N-1.  main() hands this function to tablockman_init().
+*/
+TABLE_LOCK_OWNER *loid2lo1(uint16 loid)
+{
+  return &loarray1[loid - 1];
+}
+
+#define unlock_all(O) diag("lo" #O "> release all locks");              \
+  tablockman_release_locks(&tablockman, loid2lo1(O));
+#define test_lock(O, R, L, S, RES)                                      \
+  ok(tablockman_getlock(&tablockman, loid2lo1(O), &ltarray[R], L) == RES,   \
+     "lo" #O "> " S "lock resource " #R " with " #L "-lock");           \
+  print_lo1(loid2lo1(O));
+#define lock_ok_a(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK)
+#define lock_ok_i(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE)
+#define lock_ok_l(O, R, L)                                              \
+  test_lock(O, R, L, "", GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE)
+#define lock_conflict(O, R, L)                                          \
+  test_lock(O, R, L, "cannot ", LOCK_TIMEOUT);
+/*
+  Single-threaded scenarios, one TAP check per lock request (39 in total).
+  The macro name encodes the result the test expects from
+  tablockman_getlock():
+    lock_ok_a      GOT_THE_LOCK
+    lock_ok_i      GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE
+    lock_ok_l      GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE
+    lock_conflict  LOCK_TIMEOUT
+  Arguments are (owner id, index into ltarray, lock type).  unlock_all(N)
+  releases every lock held by owner N.
+*/
+void test_tablockman_simple()
+{
+  /* simple */
+  lock_ok_a(1, 1, S);
+  lock_ok_i(2, 2, IS);
+  lock_ok_i(1, 2, IX);
+  /* lock escalation */
+  lock_ok_a(1, 1, X);
+  lock_ok_i(2, 2, IX);
+  /* failures */
+  lock_conflict(2, 1, X);
+  unlock_all(2);
+  lock_ok_a(1, 2, S);
+  lock_ok_a(1, 2, IS);
+  lock_ok_a(1, 2, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_a(2, 3, LS);
+  lock_ok_i(1, 3, IX);
+  lock_ok_l(2, 3, IS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_conflict(2, 1, S);
+  lock_ok_a(1, 1, LS);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_a(2, 1, LS);
+  lock_ok_a(1, 1, LS);
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(3, 1, IS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+
+  lock_ok_i(1, 4, IS);
+  lock_ok_i(2, 4, IS);
+  lock_ok_i(3, 4, IS);
+  lock_ok_a(3, 4, LS);
+  lock_ok_i(4, 4, IS);
+  lock_conflict(4, 4, IX);
+  lock_conflict(2, 4, IX);
+  lock_ok_a(1, 4, LS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+  unlock_all(4);
+
+  lock_ok_i(1, 1, IX);
+  lock_ok_i(2, 1, IX);
+  lock_conflict(1, 1, S);
+  lock_conflict(2, 1, X);
+  unlock_all(1);
+  unlock_all(2);
+
+  lock_ok_i(1, 1, IS);
+  lock_conflict(2, 1, X);
+  lock_conflict(3, 1, IS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+
+  lock_ok_a(1, 1, S);
+  lock_conflict(2, 1, IX);
+  lock_conflict(3, 1, IS);
+  unlock_all(1);
+  unlock_all(2);
+  unlock_all(3);
+}
+
+int rt_num_threads;     /* workers still running; decremented under rt_mutex */
+int litmus;             /* run_test() reports a failure when this is non-zero */
+int thread_number= 0, timeouts= 0;  /* both are reset at the start of a run */
+
+/*
+  Run one multi-threaded test and report it as a single TAP check.
+
+  test     name printed in the diagnostics
+  handler  thread function; it receives a pointer to the iteration count
+  n        number of threads to start
+  m        number of iterations for every thread
+
+  Aborts the process if memory or a thread cannot be obtained.
+*/
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+  pthread_t *threads;
+  ulonglong now= my_getsystime();
+  int i;
+
+  thread_number= timeouts= 0;
+  litmus= 0;
+
+  /*
+    The array holds pthread_t values, so size it by the element type:
+    pthread_t is not guaranteed to have the size of a pointer.
+  */
+  threads= (pthread_t *)my_malloc(sizeof(*threads)*n, MYF(0));
+  if (!threads)
+  {
+    diag("Out of memory");
+    abort();
+  }
+
+  diag("Running %s with %d threads, %d iterations... ", test, n, m);
+  rt_num_threads= n;
+  /* &m stays valid: every thread is joined before this function returns */
+  for (i= 0; i < n ; i++)
+    if (pthread_create(threads+i, 0, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  for (i= 0 ; i < n ; i++)
+    pthread_join(threads[i], 0);
+  now= my_getsystime()-now;
+  ok(litmus == 0, "Finished %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+  my_free((void*)threads, MYF(0));
+}
+/*
+  Release every lock held by lo in lock manager lm.
+
+  The sections guarded by NOT_USED_YET are compiled out unless that macro
+  is defined.  When enabled they also destroy the owner's mutex and
+  condition variable, zero the structure, restore the mutex, cond and loid
+  members from a copy taken on entry, and initialise the mutex and the
+  condition variable again.
+*/
+static void reinit_tlo(TABLOCKMAN *lm, TABLE_LOCK_OWNER *lo)
+{
+#ifdef NOT_USED_YET
+  TABLE_LOCK_OWNER backup= *lo;
+#endif
+
+  tablockman_release_locks(lm, lo);
+#ifdef NOT_USED_YET
+  pthread_mutex_destroy(lo->mutex);
+  pthread_cond_destroy(lo->cond);
+  bzero(lo, sizeof(*lo));
+
+  lo->mutex= backup.mutex;
+  lo->cond= backup.cond;
+  lo->loid= backup.loid;
+  pthread_mutex_init(lo->mutex, MY_MUTEX_INIT_FAST);
+  pthread_cond_init(lo->cond, 0);
+#endif
+}
+
+pthread_mutex_t rt_mutex;   /* guards thread_number, rt_num_threads, timeouts */
+int Nrows= 100;
+int Ntables= 10;
+int table_lock_ratio= 10;   /* 0 means: never request a table lock */
+enum lockman_lock_type lock_array[6]= {S, X, LS, LX, IS, IX};
+const char *lock2str[6]= {"S", "X", "LS", "LX", "IS", "IX"};
+const char *res2str[]= {
+  0,
+  "OUT OF MEMORY",
+  "DEADLOCK",
+  "LOCK TIMEOUT",
+  "GOT THE LOCK",
+  "GOT THE LOCK NEED TO LOCK A SUBRESOURCE",
+  "GOT THE LOCK NEED TO INSTANT LOCK A SUBRESOURCE"};
+
+/*
+  Worker thread of the stress test.
+
+  arg points to the number of iterations.  Tables and rows are both
+  entries of ltarray and are locked through tablockman.  Whenever a
+  request is refused the thread releases all its locks (reinit_tlo()) and
+  counts a timeout.  Always returns 0.
+*/
+pthread_handler_t test_lockman(void *arg)
+{
+  int    m= (*(int *)arg);
+  uint   x, loid, row, table, res, locklevel, timeout= 0;
+  TABLE_LOCK_OWNER *lo1;
+  DBUG_ASSERT(Ntables <= Ntbls);
+  /* rows index ltarray as well, see the row lock branch below */
+  DBUG_ASSERT(Nrows + Ntables <= Ntbls);
+
+  /* claim a unique 1-based owner id, i.e. a private slot of loarray1 */
+  pthread_mutex_lock(&rt_mutex);
+  loid= ++thread_number;
+  pthread_mutex_unlock(&rt_mutex);
+  lo1= loid2lo1(loid);
+
+  /* the address of a local variable seeds the generator */
+  for (x= ((int)(intptr)(&m)); m > 0; m--)
+  {
+    /*
+      Three prime numbers.  The arithmetic is done in ulonglong: the
+      product x*3628273133 can exceed the range of a signed 64-bit integer,
+      which would be a signed overflow.
+    */
+    x= (uint) (((ulonglong) x*LL(3628273133) + LL(1500450271)) %
+               LL(9576890767));
+    row=  x % Nrows + Ntables;
+    table= row % Ntables;
+    locklevel= (x/Nrows) & 3;
+    if (table_lock_ratio && (x/Nrows/4) % table_lock_ratio == 0)
+    {
+      /* table lock */
+      res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+                              lock_array[locklevel]);
+      DIAG(("loid %2d, table %d, lock %s, res %s", loid, table,
+            lock2str[locklevel], res2str[res]));
+      if (res < GOT_THE_LOCK)
+      {
+        reinit_tlo(&tablockman, lo1);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      }
+      DBUG_ASSERT(res == GOT_THE_LOCK);
+    }
+    else
+    { /* row lock */
+      locklevel&= 1;
+      /* first an intention lock (IS or IX) on the table ... */
+      res= tablockman_getlock(&tablockman, lo1, ltarray+table,
+                              lock_array[locklevel + 4]);
+      DIAG(("loid %2d, row %d, lock %s, res %s", loid, row,
+            lock2str[locklevel+4], res2str[res]));
+      switch (res)
+      {
+      case GOT_THE_LOCK:
+        continue;
+      case GOT_THE_LOCK_NEED_TO_INSTANT_LOCK_A_SUBRESOURCE:
+        /* not implemented, so take a regular lock */
+      case GOT_THE_LOCK_NEED_TO_LOCK_A_SUBRESOURCE:
+        /* ... then the matching S or X lock on the row's ltarray entry */
+        res= tablockman_getlock(&tablockman, lo1, ltarray+row,
+                                lock_array[locklevel]);
+        DIAG(("loid %2d, ROW %d, lock %s, res %s", loid, row,
+              lock2str[locklevel], res2str[res]));
+        if (res < GOT_THE_LOCK)
+        {
+          reinit_tlo(&tablockman, lo1);
+          DIAG(("loid %2d, release all locks", loid));
+          timeout++;
+          continue;
+        }
+        DBUG_ASSERT(res == GOT_THE_LOCK);
+        continue;
+      default:
+        /* any other result is handled as a refused request */
+        reinit_tlo(&tablockman, lo1);
+        DIAG(("loid %2d, release all locks", loid));
+        timeout++;
+        continue;
+      }
+    }
+  }
+
+  reinit_tlo(&tablockman, lo1);
+
+  /* the last thread to finish prints the accumulated timeout count */
+  pthread_mutex_lock(&rt_mutex);
+  rt_num_threads--;
+  timeouts+= timeout;
+  if (!rt_num_threads)
+    diag("number of timeouts: %d", timeouts);
+  pthread_mutex_unlock(&rt_mutex);
+
+  return 0;
+}
+/*
+  Test driver: sets up the table lock manager, Nlos lock owners and Ntbls
+  lockable entries, runs the single-threaded scenarios and one
+  multi-threaded stress run (the second one is disabled with #if 0), then
+  tears everything down and returns the TAP exit status.
+*/
+int main(int argc __attribute__((unused)), char **argv)
+{
+  int i;
+  MY_INIT(argv[0]);
+
+  my_init(); /* NOTE(review): MY_INIT may already call my_init() - confirm */
+  pthread_mutex_init(&rt_mutex, 0);
+
+  plan(40); /* 39 checks in test_tablockman_simple() + 1 for run_test() */
+
+  if (my_atomic_initialize())
+    return exit_status();
+
+
+  tablockman_init(&tablockman, &loid2lo1, 50);
+
+  for (i= 0; i < Nlos; i++)
+  {
+    pthread_mutex_init(&mutexes[i], MY_MUTEX_INIT_FAST);
+    pthread_cond_init (&conds[i], 0);
+
+    loarray1[i].active_locks= 0;
+    loarray1[i].waiting_lock= 0;
+    loarray1[i].waiting_for= 0;
+    loarray1[i].mutex= &mutexes[i];
+    loarray1[i].cond= &conds[i];
+    loarray1[i].loid= i+1; /* ids are 1-based, see loid2lo1() */
+  }
+
+  for (i= 0; i < Ntbls; i++)
+  {
+    tablockman_init_locked_table(ltarray+i, Nlos);
+  }
+
+  test_tablockman_simple();
+
+#define CYCLES 10000
+#define THREADS Nlos /* don't change this line: every worker thread takes
+                        its own loarray1 slot, and there are only Nlos */
+
+  /* mixed load, stress-test with random locks */
+  Nrows= 100;
+  Ntables= 10;
+  table_lock_ratio= 10;
+  run_test("\"random lock\" stress test", test_lockman, THREADS, CYCLES);
+#if 0
+  /* "real-life" simulation - many rows, no table locks */
+  Nrows= 1000000;
+  Ntables= 10;
+  table_lock_ratio= 0;
+  run_test("\"real-life\" simulation test", test_lockman, THREADS, CYCLES*10);
+#endif
+  for (i= 0; i < Nlos; i++)
+  {
+    tablockman_release_locks(&tablockman, &loarray1[i]);
+    pthread_mutex_destroy(loarray1[i].mutex);
+    pthread_cond_destroy(loarray1[i].cond);
+  }
+
+  { /* time the destruction of the locked tables and the lock manager */
+    ulonglong now= my_getsystime();
+    for (i= 0; i < Ntbls; i++)
+    {
+      tablockman_destroy_locked_table(ltarray+i);
+    }
+    tablockman_destroy(&tablockman);
+    now= my_getsystime()-now;
+    diag("lockman_destroy: %g secs", ((double)now)/1e7);
+  }
+
+  pthread_mutex_destroy(&rt_mutex);
+  my_end(0);
+  return exit_status();
+}
+
diff --git a/storage/maria/unittest/ma_control_file-t.c b/storage/maria/unittest/ma_control_file-t.c
new file mode 100644
index 00000000000..164ea284f31
--- /dev/null
+++ b/storage/maria/unittest/ma_control_file-t.c
@@ -0,0 +1,592 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Unit test of the control file module of the Aria engine WL#3234 */
+
+/*
+  Note that it is not possible to test the durability of the write (can't
+  pull the plug programmatically :)
+*/
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <tap.h>
+
+#ifndef WITH_ARIA_STORAGE_ENGINE
+/*
+  If Aria is not compiled in, normally we don't come to building this test.
+*/
+#error "Aria engine is not compiled in, test cannot be built"
+#endif
+
+#include "maria.h"
+#include "../../../storage/maria/maria_def.h"
+#include <my_getopt.h>
+
+#define EXTRACT_DEFINITIONS
+#include "../ma_control_file.c"
+#undef EXTRACT_DEFINITIONS
+
+char file_name[FN_REFLEN];
+
+/* The values we'll set and expect the control file module to return */
+LSN    expect_checkpoint_lsn;
+uint32 expect_logno;
+TrID   expect_max_trid;
+uint8  expect_recovery_failures;
+
+static int delete_file(myf my_flags);
+/*
+  Those are test-specific wrappers around the module's API functions: after
+  calling the module's API functions they perform checks on the result.
+*/
+static int close_file(void); /* wraps ma_control_file_end */
+/* wraps ma_control_file_open_or_create */
+static int open_file(void);
+/* wraps ma_control_file_write_and_force */
+static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid,
+                      uint8 rec_failures);
+
+/* Tests */
+static int test_one_log_and_recovery_failures(void);
+static int test_five_logs_and_max_trid(void);
+static int test_3_checkpoints_and_2_logs(void);
+static int test_binary_content(void);
+static int test_start_stop(void);
+static int test_2_open_and_2_close(void);
+static int test_bad_magic_string(void);
+static int test_bad_checksum(void);
+static int test_bad_hchecksum(void);
+static int test_future_size(void);
+static int test_bad_blocksize(void);
+static int test_bad_size(void);
+
+/* Utility */
+static int verify_module_values_match_expected(void);
+static int verify_module_values_are_impossible(void);
+static void usage(void);
+static void get_options(int argc, char *argv[]);
+
+/*
+  If "expr" is FALSE, this macro makes the enclosing function print a
+  diagnostic message, hit assert(0), and (when assertions are compiled out)
+  return 1 so that the remaining tests can still run.
+  Wrapped in do/while(0) so it acts as one statement, e.g. before an "else".
+  RET_ERR means "return error".
+*/
+
+#define RET_ERR_UNLESS(expr) \
+  do {if (!(expr)) {diag("line %d: failure: '%s'", __LINE__, #expr); assert(0);return 1;}} while (0)
+
+
+/* Silent error hook: drops the message (DBUG trace only) and returns 0 */
+
+static int my_ignore_message(uint error __attribute__((unused)),
+                             const char *str __attribute__((unused)),
+                             myf MyFlags __attribute__((unused)))
+{
+  DBUG_ENTER("my_ignore_message");  /* label must name this function */
+  DBUG_PRINT("enter",("message: %s",str));
+  DBUG_RETURN(0);
+}
+
+int (*default_error_handler_hook)(uint my_err, const char *str,
+                                  myf MyFlags) = 0;
+
+
+/* Calls ma_control_file_open(TRUE, TRUE) with error messages suppressed */
+
+static CONTROL_FILE_ERROR local_ma_control_file_open(void)
+{
+  CONTROL_FILE_ERROR error;
+  error_handler_hook= my_ignore_message;          /* silence messages */
+  error= ma_control_file_open(TRUE, TRUE);
+  error_handler_hook= default_error_handler_hook; /* reinstall saved hook */
+  return error;
+}
+
+
+
+int main(int argc,char *argv[])
+{
+  MY_INIT(argv[0]);
+  my_init();
+
+  maria_data_root= (char *)".";  /* control file is created in the cwd */
+  default_error_handler_hook= error_handler_hook; /* saved for later restore */
+
+  plan(12);                      /* one ok() per test function below */
+
+  diag("Unit tests for control file");
+
+  get_options(argc,argv);
+
+  diag("Deleting control file at startup, if there is an old one");
+  RET_ERR_UNLESS(0 == delete_file(0)); /* if fails, can't continue */
+
+  diag("Tests of normal conditions");
+  ok(0 == test_one_log_and_recovery_failures(),
+     "test of creating one log and recording recovery failures");
+  ok(0 == test_five_logs_and_max_trid(),
+     "test of creating five logs and many transactions");
+  ok(0 == test_3_checkpoints_and_2_logs(),
+     "test of creating three checkpoints and two logs");
+  ok(0 == test_binary_content(), "test of the binary content of the file");
+  ok(0 == test_start_stop(), "test of multiple starts and stops");
+  diag("Tests of abnormal conditions");
+  ok(0 == test_2_open_and_2_close(),
+     "test of two open and two close (strange call sequence)");
+  ok(0 == test_bad_magic_string(), "test of bad magic string");
+  ok(0 == test_bad_checksum(), "test of bad checksum");
+  ok(0 == test_bad_hchecksum(), "test of bad hchecksum");
+  ok(0 == test_future_size(), "test of ability to handle future versions");
+  ok(0 == test_bad_blocksize(), "test of bad blocksize");
+  ok(0 == test_bad_size(), "test of too small/big file");
+
+  return exit_status();
+}
+
+
+static int delete_file(myf my_flags)
+{
+  RET_ERR_UNLESS(fn_format(file_name, CONTROL_FILE_BASE_NAME,
+                           maria_data_root, "", MYF(MY_WME)) != NullS);
+  /*
+    Maybe file does not exist, ignore error.
+    The error will however be printed on stderr.
+  */
+  my_delete(file_name, my_flags);
+  expect_checkpoint_lsn= LSN_IMPOSSIBLE;
+  expect_logno= FILENO_IMPOSSIBLE;
+  expect_max_trid= expect_recovery_failures= 0;
+
+  return 0;
+}
+
+/*
+  Verifies that global values last_checkpoint_lsn, last_logno,
+  max_trid_in_control_file, recovery_failures (module's) match what we expect.
+*/
+static int verify_module_values_match_expected(void)
+{
+  RET_ERR_UNLESS(last_logno == expect_logno);
+  RET_ERR_UNLESS(last_checkpoint_lsn == expect_checkpoint_lsn);
+  RET_ERR_UNLESS(max_trid_in_control_file == expect_max_trid);
+  RET_ERR_UNLESS(recovery_failures == expect_recovery_failures);
+  return 0;
+}
+
+
+/*
+  Verifies that module globals last_checkpoint_lsn and last_logno are
+  impossible and max_trid_in_control_file is 0 (used after the file is closed).
+*/
+static int verify_module_values_are_impossible(void)
+{
+  RET_ERR_UNLESS(last_logno == FILENO_IMPOSSIBLE);
+  RET_ERR_UNLESS(last_checkpoint_lsn == LSN_IMPOSSIBLE);
+  RET_ERR_UNLESS(max_trid_in_control_file == 0);
+  return 0;
+}
+
+
+static int close_file(void)
+{
+  /* Simulate shutdown */
+  ma_control_file_end();
+  /* Verify amnesia */
+  RET_ERR_UNLESS(verify_module_values_are_impossible() == 0);
+  return 0;
+}
+
+static int open_file(void)
+{
+  RET_ERR_UNLESS(local_ma_control_file_open() == CONTROL_FILE_OK);
+  /* Check that the module reports expected information */
+  RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+  return 0;
+}
+
+static int write_file(LSN checkpoint_lsn, uint32 logno, TrID trid,
+                      uint8 rec_failures)
+{
+  RET_ERR_UNLESS(ma_control_file_write_and_force(checkpoint_lsn, logno, trid,
+                                                 rec_failures)
+                 == 0);
+  /* Check that the module reports expected information */
+  RET_ERR_UNLESS(verify_module_values_match_expected() == 0);
+  return 0;
+}
+
+static int test_one_log_and_recovery_failures(void)
+{
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  expect_logno= 123;
+  RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+  expect_recovery_failures= 158;
+  RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            expect_recovery_failures) == 0);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_five_logs_and_max_trid(void)
+{
+  uint i;
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  expect_logno= 100;
+  expect_max_trid= ULL(14111978111);
+  for (i= 0; i<5; i++)
+  {
+    expect_logno*= 3;
+    RET_ERR_UNLESS(write_file(last_checkpoint_lsn, expect_logno,
+                              expect_max_trid,
+                              recovery_failures) == 0);
+  }
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_3_checkpoints_and_2_logs(void)
+{
+  /*
+    Simulate one checkpoint, one log creation, two checkpoints, one
+    log creation.
+  */
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  expect_checkpoint_lsn= MAKE_LSN(5, 10000);
+  RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+
+  expect_logno= 17;
+  RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+
+  expect_checkpoint_lsn= MAKE_LSN(17, 20000);
+  RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+
+  expect_checkpoint_lsn= MAKE_LSN(17, 45000);
+  RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+
+  expect_logno= 19;
+  RET_ERR_UNLESS(write_file(expect_checkpoint_lsn, expect_logno,
+                            max_trid_in_control_file,
+                            recovery_failures) == 0);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_binary_content(void)
+{
+  uint i;
+  int fd;
+
+  /*
+    TEST4: actually check by ourselves the content of the file.
+    Note that constants (offsets) are hard-coded here, precisely to prevent
+    someone from changing them in the control file module and breaking
+    backward-compatibility.
+    TODO: when we reach the format-freeze state, we may even just do a
+    comparison with a raw binary string, to not depend on any uint4korr
+    future change/breakage.
+  */
+
+  uchar buffer[45];
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_read(fd, buffer, 45, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  i= uint3korr(buffer + 34 );
+  RET_ERR_UNLESS(i == LSN_FILE_NO(last_checkpoint_lsn));
+  i= uint4korr(buffer + 37);
+  RET_ERR_UNLESS(i == LSN_OFFSET(last_checkpoint_lsn));
+  i= uint4korr(buffer + 41);
+  RET_ERR_UNLESS(i == last_logno);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_start_stop(void)
+{
+  /* TEST5: Simulate start/nothing/stop/start/nothing/stop/start */
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_2_open_and_2_close(void)
+{
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+
+static int test_bad_magic_string(void)
+{
+  uchar buffer[4];
+  int fd;
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+
+  /* Corrupt magic string */
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_pread(fd, buffer, 4, 0, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_pwrite(fd, (const uchar *)"papa", 4, 0,
+                           MYF(MY_FNABP |  MY_WME)) == 0);
+
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+             CONTROL_FILE_BAD_MAGIC_STRING);
+  /* Restore magic string */
+  RET_ERR_UNLESS(my_pwrite(fd, buffer, 4, 0, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+static int test_bad_checksum(void)
+{
+  uchar buffer[4];
+  int fd;
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+
+  /* Corrupt checksum */
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_pread(fd, buffer, 1, 30, MYF(MY_FNABP |  MY_WME)) == 0);
+  buffer[0]+= 3; /* mangle checksum */
+  RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP |  MY_WME)) == 0);
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+                 CONTROL_FILE_BAD_CHECKSUM);
+  /* Restore checksum */
+  buffer[0]-= 3;
+  RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 30, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+  return 0;
+}
+
+
+static int test_bad_blocksize(void)
+{
+  maria_block_size<<= 1;
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+                 CONTROL_FILE_WRONG_BLOCKSIZE);
+  /* Restore blocksize */
+  maria_block_size>>= 1;
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+  return 0;
+}
+
+
+static int test_future_size(void)
+{
+  /*
+    Here we only check the ability to add fields, so (unlike the hard-coded
+    offsets of test_binary_content) we can use the module's defined constants
+  */
+  uint32 sum;
+  int fd;
+  uchar buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 2];
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_read(fd, buffer,
+                         CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE,
+                         MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+  /* "add" new field of 1 byte (value 1) to header and variable part */
+  memmove(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1,
+          buffer + CF_CREATE_TIME_TOTAL_SIZE,
+          CF_CHANGEABLE_TOTAL_SIZE);
+  buffer[CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE]= '\1';
+  buffer[CF_CREATE_TIME_TOTAL_SIZE + CF_CHANGEABLE_TOTAL_SIZE + 1]= '\1';
+  /* fix the two stored part lengths (each part grew by 1 byte) */
+  int2store(buffer + CF_CREATE_TIME_SIZE_OFFSET, CF_CREATE_TIME_TOTAL_SIZE + 1);
+  int2store(buffer + CF_CHANGEABLE_SIZE_OFFSET, CF_CHANGEABLE_TOTAL_SIZE + 1);
+  /* recalculate checksums */
+  sum= (uint32) my_checksum(0, buffer, CF_CREATE_TIME_TOTAL_SIZE -
+                            CF_CHECKSUM_SIZE + 1);
+  int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1, sum);
+  sum= (uint32) my_checksum(0, buffer +  CF_CREATE_TIME_TOTAL_SIZE + 1 +
+                            CF_CHECKSUM_SIZE,
+                            CF_CHANGEABLE_TOTAL_SIZE - CF_CHECKSUM_SIZE + 1);
+  int4store(buffer + CF_CREATE_TIME_TOTAL_SIZE + 1, sum);
+  /* write the enlarged image back and check that the module accepts it */
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_pwrite(fd, buffer,
+                           CF_CREATE_TIME_TOTAL_SIZE +
+                           CF_CHANGEABLE_TOTAL_SIZE + 2,
+                           0, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+
+  return(0);
+}
+
+static int test_bad_hchecksum(void)
+{
+  uchar buffer[4];
+  int fd;
+
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+
+  /* Corrupt checksum */
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_pread(fd, buffer, 1, 26, MYF(MY_FNABP |  MY_WME)) == 0);
+  buffer[0]+= 3; /* mangle checksum */
+  RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP |  MY_WME)) == 0);
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+                 CONTROL_FILE_BAD_HEAD_CHECKSUM);
+  /* Restore checksum */
+  buffer[0]-= 3;
+  RET_ERR_UNLESS(my_pwrite(fd, buffer, 1, 26, MYF(MY_FNABP |  MY_WME)) == 0);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+  return 0;
+}
+
+
+static int test_bad_size(void)
+{
+  uchar buffer[]=
+    "123456789012345678901234567890123456789012345678901234567890123456";
+  int fd, i;
+
+  /* A too short file */
+  RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+  RET_ERR_UNLESS((fd= my_open(file_name,
+                          O_BINARY | O_RDWR | O_CREAT,
+                          MYF(MY_WME))) >= 0);
+  RET_ERR_UNLESS(my_write(fd, buffer, 10, MYF(MY_FNABP |  MY_WME)) == 0);
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+                 CONTROL_FILE_TOO_SMALL);
+  for (i= 0; i < 8; i++)
+  {
+    RET_ERR_UNLESS(my_write(fd, buffer, 66, MYF(MY_FNABP |  MY_WME)) == 0);
+  }
+  /* Check that control file module sees the problem */
+  RET_ERR_UNLESS(local_ma_control_file_open() ==
+                 CONTROL_FILE_TOO_BIG);
+  RET_ERR_UNLESS(my_close(fd, MYF(MY_WME)) == 0);
+
+  /* Leave a correct control file */
+  RET_ERR_UNLESS(delete_file(MYF(MY_WME)) == 0);
+  RET_ERR_UNLESS(open_file() == CONTROL_FILE_OK);
+  RET_ERR_UNLESS(close_file() == 0);
+
+  return 0;
+}
+
+
+static struct my_option my_long_options[] =
+{
+#ifndef DBUG_OFF
+  {"debug", '#', "Debug log.",
+   0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"help", '?', "Display help and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  {"version", 'V', "Print version number and exit",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+};
+
+
+static void version(void)
+{
+  printf("ma_control_file_test: unit test for the control file "
+         "module of the Aria storage engine. Ver 1.0 \n");
+}
+
+static my_bool
+get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
+	       char *argument __attribute__((unused)))
+{
+  switch(optid) {
+  case 'V':
+    version();
+    exit(0);
+  case '#':
+    DBUG_PUSH (argument);
+    break;
+  case '?':
+    version();
+    usage();
+    exit(0);
+  }
+  return 0;
+}
+
+
+/* Read options */
+
+static void get_options(int argc, char *argv[])
+{
+  int ho_error;
+
+  if ((ho_error=handle_options(&argc, &argv, my_long_options,
+                               get_one_option)))
+    exit(ho_error);
+
+  return;
+} /* get options */
+
+
+static void usage(void)
+{
+  printf("Usage: %s [options]\n\n", my_progname);
+  my_print_help(my_long_options);
+  my_print_variables(my_long_options);
+}
diff --git a/storage/maria/unittest/ma_loghandler_examples.c b/storage/maria/unittest/ma_loghandler_examples.c
new file mode 100644
index 00000000000..0c11a3b9a8e
--- /dev/null
+++ b/storage/maria/unittest/ma_loghandler_examples.c
@@ -0,0 +1,65 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_FIXEDLENGTH, 6, 6, NULL, NULL, NULL, 0,
+ "fixed0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, NULL, NULL, 0,
+"variable0example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 7, 7, NULL, NULL, NULL, 1,
+"fixed1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 12, NULL, NULL, NULL, 1,
+"variable1example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_PSEUDOFIXEDLENGTH, 23, 23, NULL, NULL, NULL, 2,
+"fixed2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+static LOG_DESC INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE=
+{LOGRECTYPE_VARIABLE_LENGTH, 0, 19, NULL, NULL, NULL, 2,
+"variable2example", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL};
+
+
+void translog_example_table_init()
+{
+  int i;
+  log_record_type_descriptor[LOGREC_FIXED_RECORD_0LSN_EXAMPLE]=
+    INIT_LOGREC_FIXED_RECORD_0LSN_EXAMPLE;
+  log_record_type_descriptor[LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE]=
+    INIT_LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE;
+  log_record_type_descriptor[LOGREC_FIXED_RECORD_1LSN_EXAMPLE]=
+    INIT_LOGREC_FIXED_RECORD_1LSN_EXAMPLE;
+  log_record_type_descriptor[LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE]=
+    INIT_LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE;
+  log_record_type_descriptor[LOGREC_FIXED_RECORD_2LSN_EXAMPLE]=
+    INIT_LOGREC_FIXED_RECORD_2LSN_EXAMPLE;
+  log_record_type_descriptor[LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE]=
+    INIT_LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE;
+  for (i= LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE + 1;
+       i < LOGREC_NUMBER_OF_TYPES;
+       i++)
+    log_record_type_descriptor[i].rclass= LOGRECTYPE_NOT_ALLOWED;
+}
+
+
+
diff --git a/storage/maria/unittest/ma_maria_log_cleanup.c b/storage/maria/unittest/ma_maria_log_cleanup.c
new file mode 100644
index 00000000000..f85c75b1a88
--- /dev/null
+++ b/storage/maria/unittest/ma_maria_log_cleanup.c
@@ -0,0 +1,64 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <my_dir.h>
+
+my_bool maria_log_remove()
+{
+  MY_DIR *dirp;
+  uint i;
+  MY_STAT stat_buff;
+  char file_name[FN_REFLEN];
+
+  /* Removes control file */
+  if (fn_format(file_name, CONTROL_FILE_BASE_NAME,
+                maria_data_root, "", MYF(MY_WME)) == NullS)
+    return 1;
+  if (my_stat(file_name, &stat_buff, MYF(0)) &&
+      my_delete(file_name, MYF(MY_WME)) != 0)
+    return 1;
+
+  /* Finds and removes transaction log files */
+  if (!(dirp = my_dir(maria_data_root, MYF(MY_DONT_SORT))))
+    return 1;
+
+  for (i= 0; i < dirp->number_off_files; i++)
+  {
+    char *file= dirp->dir_entry[i].name;
+    if (strncmp(file, "aria_log.", 9) == 0 &&
+        file[9] >= '0' && file[9] <= '9' &&
+        file[10] >= '0' && file[10] <= '9' &&
+        file[11] >= '0' && file[11] <= '9' &&
+        file[12] >= '0' && file[12] <= '9' &&
+        file[13] >= '0' && file[13] <= '9' &&
+        file[14] >= '0' && file[14] <= '9' &&
+        file[15] >= '0' && file[15] <= '9' &&
+        file[16] >= '0' && file[16] <= '9' &&
+        file[17] == '\0')
+    {
+      if (fn_format(file_name, file,
+                    maria_data_root, "", MYF(MY_WME)) == NullS ||
+          my_delete(file_name, MYF(MY_WME)) != 0)
+      {
+        my_dirend(dirp);
+        return 1;
+      }
+    }
+  }
+  my_dirend(dirp);
+  return 0;
+}
+
diff --git a/storage/maria/unittest/ma_pagecache_consist.c b/storage/maria/unittest/ma_pagecache_consist.c
new file mode 100644
index 00000000000..7dbdba433c6
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_consist.c
@@ -0,0 +1,498 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+  my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count;
+static pthread_mutex_t LOCK_thread_count;
+static uint thread_count;
+static PAGECACHE pagecache;
+
+#ifdef TEST_HIGH_CONCURENCY
+static uint number_of_readers= 10;
+static uint number_of_writers= 20;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#else /*TEST_HIGH_CONCURENCY*/
+#ifdef TEST_READERS
+static uint number_of_readers= 10;
+static uint number_of_writers= 1;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#else /*TEST_READERS*/
+#ifdef TEST_WRITERS
+static uint number_of_readers= 0;
+static uint number_of_writers= 10;
+static uint number_of_tests= 30000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20;
+static uint flush_divider= 1000;
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#else /*TEST_WRITERS*/
+static uint number_of_readers= 10;
+static uint number_of_writers= 10;
+static uint number_of_tests= 50000;
+static uint record_length_limit= TEST_PAGE_SIZE/200;
+static uint number_of_pages= 20000;
+static uint flush_divider= 1000;
+#endif /*TEST_WRITERS*/
+#endif /*TEST_READERS*/
+#endif /*TEST_HIGH_CONCURENCY*/
+
+
+/**
+  @brief Dummy pagecache callback.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+               pgcache_page_no_t page_no __attribute__((unused)),
+               uchar* data_ptr __attribute__((unused)))
+{
+  return 0;
+}
+
+
+/**
+  @brief Dummy pagecache failure callback (does nothing).
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+  return;
+}
+
+
+/*
+  Get pseudo-random length of the field in [0;limit)
+
+  SYNOPSIS
+    get_len()
+    limit                limit for generated value
+
+  RETURN
+    length where length >= 0 && length < limit
+*/
+
+static uint get_len(uint limit)
+{
+  return (uint)((ulonglong)rand()*(limit-1)/RAND_MAX);
+}
+
+
+/*
+  Check page's consistency: layout is
+  4 bytes: number 'num' of records in this page, then num occurrences of
+  { 4 bytes: record's length 'len'; then 4 bytes unchecked ('tag') then
+  'len' bytes each equal to the record's sequential number in this page,
+  modulo 256 }, then zeroes.
+ */
+uint check_page(uchar *buff, ulong offset, int page_locked, int page_no,
+                int tag)
+{
+  uint end= sizeof(uint);
+  uint num= uint4korr(buff);
+  uint i;
+  DBUG_ENTER("check_page");
+
+  for (i= 0; i < num; i++)
+  {
+    uint len= uint4korr(buff + end);
+    uint j;
+    end+= 4 + 4;
+    if (len + end > TEST_PAGE_SIZE)
+    {
+      diag("incorrect field header #%u by offset %lu\n", i, offset + end);
+      goto err;
+    }
+    for(j= 0; j < len; j++)
+    {
+      if (buff[end + j] != (uchar)((i+1) % 256))
+      {
+        diag("incorrect %lu byte\n", offset + end + j);
+        goto err;
+      }
+    }
+    end+= len;
+  }
+  for(i= end; i < TEST_PAGE_SIZE; i++)
+  {
+    if (buff[i] != 0)
+    {
+      int h;
+      DBUG_PRINT("err",
+                 ("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+                  offset + i, offset, i, page_no,
+                  (page_locked ? "locked" : "unlocked"),
+                  end, num, tag));
+      diag("byte %lu (%lu + %u), page %u (%s, end: %u, recs: %u, tag: %d) should be 0\n",
+           offset + i, offset, i, page_no,
+           (page_locked ? "locked" : "unlocked"),
+           end, num, tag);
+      h= my_open("wrong_page", O_CREAT | O_TRUNC | O_RDWR, MYF(0));
+      my_pwrite(h, (uchar*) buff, TEST_PAGE_SIZE, 0, MYF(0));
+      my_close(h, MYF(0));
+      goto err;
+    }
+  }
+  DBUG_RETURN(end);
+err:
+  DBUG_PRINT("err", ("try to flush"));
+  if (page_locked)
+  {
+    pagecache_delete(&pagecache, &file1, page_no,
+                     PAGECACHE_LOCK_LEFT_WRITELOCKED, 1);
+  }
+  else
+  {
+    flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+  }
+  exit(1);
+}
+
+void put_rec(uchar *buff, uint end, uint len, uint tag)
+{
+  uint i;
+  uint num;
+  num= uint4korr(buff);
+  if (!len)
+    len= 1;
+  if (end + 4*2 + len > TEST_PAGE_SIZE)
+    return;
+  int4store(buff + end, len);
+  end+=  4;
+  int4store(buff + end, tag);
+  end+=  4;
+  num++;
+  int4store(buff, num);
+  for (i= end; i < (len + end); i++)
+  {
+    buff[i]= (uchar) num % 256;
+  }
+}
+
+/*
+  Recreate and reopen the test file behind the global file1
+
+  SYNOPSIS
+    reset_file()
+    file                 Unused (by-value copy); the global file1 is reset
+    file_name            Path (and name) of file which should be reset
+*/
+
+void reset_file(PAGECACHE_FILE file __attribute__((unused)), char *file_name)
+{
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_RELEASE);
+  if (my_close(file1.file, MYF(0)) != 0)
+  {
+    diag("Got error during %s closing from close() (errno: %d)\n",
+         file_name, errno);
+    exit(1);
+  }
+  my_delete(file_name, MYF(0));
+  if ((file1.file= my_open(file_name,  /* keep the new fd in file1 itself */
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    diag("Got error during %s creation from open() (errno: %d)\n",
+         file_name, errno);
+    exit(1);
+  }
+}
+
+
+void reader(int num)
+{
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  uint i;
+
+  for (i= 0; i < number_of_tests; i++)
+  {
+    uint page= get_len(number_of_pages);
+    pagecache_read(&pagecache, &file1, page, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   0);
+    check_page(buffr, page * TEST_PAGE_SIZE, 0, page, -num);
+
+  }
+  free(buffr);
+}
+
+
+/*
+  Body of a writer task.
+
+  Runs number_of_tests iterations; each one picks a page number with
+  get_len(number_of_pages), reads the page under a write lock, validates it
+  with check_page(), appends a record with put_rec() and writes the page
+  back, releasing the lock and the pin.  Whenever the iteration number is a
+  multiple of flush_divider the file is flushed with FLUSH_FORCE_WRITE.
+
+  num    task number; used as the record tag
+
+  The process exits with status 1 if the page buffer cannot be allocated.
+*/
+void writer(int num)
+{
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  uint i;
+
+  if (buffr == NULL)
+  {
+    diag("Writer %d: got error from malloc() (errno: %d)\n", num, errno);
+    exit(1);
+  }
+
+  for (i= 0; i < number_of_tests; i++)
+  {
+    uint end;
+    uint page= get_len(number_of_pages);
+    pagecache_read(&pagecache, &file1, page, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_WRITE,
+                   0);
+    end= check_page(buffr, page * TEST_PAGE_SIZE, 1, page, num);
+    put_rec(buffr, end, get_len(record_length_limit), num);
+    pagecache_write(&pagecache, &file1, page, 3, buffr,
+                    PAGECACHE_PLAIN_PAGE,
+                    PAGECACHE_LOCK_WRITE_UNLOCK,
+                    PAGECACHE_UNPIN,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+
+    if (i % flush_divider == 0)
+      flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  }
+  free(buffr);
+}
+
+
+/*
+  Thread entry point for one reader task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_reader(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_reader");
+    DBUG_PRINT("enter", ("param: %d", task_no));
+
+    reader(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "reader%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Thread entry point for one writer task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_writer(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_writer");
+    DBUG_PRINT("enter", ("param: %d", task_no));
+
+    writer(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "writer%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Test driver.
+
+  Creates the test file, initializes the page cache, writes
+  number_of_pages zero-filled pages through the cache and flushes them,
+  then starts number_of_readers reader threads and number_of_writers
+  writer threads (detached) and waits until thread_count drops to zero.
+  Finally the page cache is shut down and the test file is closed and
+  deleted.
+
+  Any setup failure is reported with diag() and terminates the process
+  with status 1.  When at least one command line argument is given (and
+  DBUG is compiled in) DBUG tracing is switched on.
+
+  Returns the TAP exit status.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+
+  if ((file1.file= my_open(file1_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    diag( "Got error during file1 creation from open() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                      &dummy_fail_callback, &dummy_callback, NULL);
+  DBUG_PRINT("info", ("file1: %d", file1.file));
+  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+    exit(1);
+  my_pwrite(file1.file, (const uchar *)"test file", 9, 0, MYF(0));
+
+  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+  {
+    diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+  {
+    /* the failing call here is pthread_mutex_init, not pthread_cond_init */
+    diag( "LOCK_thread_count: %d from pthread_mutex_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+
+  if ((error= pthread_attr_init(&thr_attr)))
+  {
+    diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+  {
+    diag(
+         "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+  VOID(thr_setconcurrency(2));
+#endif
+
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TEST_PAGE_SIZE, 0)) == 0)
+  {
+    diag("Got error: init_pagecache() (errno: %d)\n",
+            errno);
+    exit(1);
+  }
+  DBUG_PRINT("info", ("Page cache %d pages", pagen));
+  {
+    /* populate the file with zero-filled pages through the cache */
+    unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+    uint i;
+    if (buffr == NULL)
+    {
+      diag("Got error: malloc() of page buffer (errno: %d)\n", errno);
+      exit(1);
+    }
+    memset(buffr, '\0', TEST_PAGE_SIZE);
+    for (i= 0; i < number_of_pages; i++)
+    {
+      pagecache_write(&pagecache, &file1, i, 3, buffr,
+                      PAGECACHE_PLAIN_PAGE,
+                      PAGECACHE_LOCK_LEFT_UNLOCKED,
+                      PAGECACHE_PIN_LEFT_UNPINNED,
+                      PAGECACHE_WRITE_DELAY,
+                      0, LSN_IMPOSSIBLE);
+    }
+    flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+    free(buffr);
+  }
+  /*
+    Threads are created while LOCK_thread_count is held, so none of them
+    can decrement thread_count before it has been incremented here.
+  */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (number_of_readers != 0 || number_of_writers != 0)
+  {
+    if (number_of_readers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_readers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_readers--;
+    }
+    if (number_of_writers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_writers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_writers--;
+    }
+  }
+  DBUG_PRINT("info", ("Thread started"));
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  pthread_attr_destroy(&thr_attr);
+
+  /* wait finishing */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (thread_count)
+  {
+    if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+      diag("COND_thread_count: %d from pthread_cond_wait\n",error);
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_PRINT("info", ("thread ended"));
+
+  end_pagecache(&pagecache, 1);
+  DBUG_PRINT("info", ("Page cache ended"));
+
+  if (my_close(file1.file, MYF(0)) != 0)
+  {
+    diag( "Got error during file1 closing from close() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  my_delete(file1_name, MYF(0));
+
+  DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+  DBUG_PRINT("info", ("Program end"));
+
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_rwconsist.c b/storage/maria/unittest/ma_pagecache_rwconsist.c
new file mode 100644
index 00000000000..a1a22b5e18d
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_rwconsist.c
@@ -0,0 +1,362 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+  my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+/* Size passed to init_pagecache() in main(): 8192 * TEST_PAGE_SIZE */
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+/* DBUG control string; assigned in main() and applied when argc > 1 */
+static const char* default_dbug_option;
+#endif
+
+
+/*
+  Short pause used by reader() and writer().
+  NOTE(review): the unit of the my_sleep() argument is not visible here.
+*/
+#define SLEEP my_sleep(5)
+
+/* Name of the test file; created in main() and deleted before it returns */
+static char *file1_name= (char*)"page_cache_test_file_1";
+/* Page cache handle of the test file */
+static PAGECACHE_FILE file1;
+/* Signalled by each test thread when it finishes; main() waits on it */
+static pthread_cond_t COND_thread_count;
+/* Protects thread_count */
+static pthread_mutex_t LOCK_thread_count;
+/* Number of started test threads that have not yet finished */
+static uint thread_count= 0;
+/* The page cache under test */
+static PAGECACHE pagecache;
+
+/* Reader threads still to be started; main() counts this down to 0 */
+static uint number_of_readers= 5;
+/* Writer threads still to be started; main() counts this down to 0 */
+static uint number_of_writers= 5;
+/* Iterations performed by each reader() */
+static uint number_of_read_tests= 2000;
+/* Iterations performed by each writer() */
+static uint number_of_write_tests= 1000;
+/* A reader executes rand() % read_sleep_limit SLEEPs after each read */
+static uint read_sleep_limit= 3;
+/* Progress is reported with diag() every report_divisor iterations */
+static uint report_divisor= 50;
+
+/**
+  @brief Pagecache callback that does nothing.
+
+  All arguments are ignored.
+
+  @return always 0
+*/
+
+static my_bool dummy_callback(uchar *page __attribute__((unused)),
+                              pgcache_page_no_t page_no __attribute__((unused)),
+                              uchar *data_ptr __attribute__((unused)))
+{
+  return (my_bool) 0;
+}
+
+
+/**
+  @brief Pagecache callback without a return value that does nothing.
+
+  The argument is ignored.
+*/
+
+static void dummy_fail_callback(uchar *data_ptr __attribute__((unused)))
+{
+  /* intentionally empty */
+}
+
+
+/**
+  @brief Checks page consistency
+
+  A page is consistent when all TEST_PAGE_SIZE bytes are equal to the
+  first byte.  On the first mismatch a diagnostic is printed and the
+  process exits with status 1.
+
+  @param buff            pointer to the page content
+  @param task            task ID (used in the diagnostic only)
+*/
+void check_page(uchar *buff, int task)
+{
+  uint pos= 1;
+  DBUG_ENTER("check_page");
+
+  /* advance to the first byte that differs from byte 0, if any */
+  while (pos < TEST_PAGE_SIZE && buff[pos] == buff[0])
+    pos++;
+
+  if (pos < TEST_PAGE_SIZE)
+  {
+    diag("Task %d char #%u '%u' != '%u'", task, pos, (uint) buff[0],
+         (uint) buff[pos]);
+    DBUG_PRINT("err", ("try to flush"));
+    exit(1);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+
+/*
+  Body of a reader task.
+
+  Performs number_of_read_tests iterations.  Each one reads page 0 under a
+  read lock directly from the cache (no copy buffer is supplied), checks it
+  with check_page(), releases lock and pin, and then pauses for
+  rand() % read_sleep_limit SLEEPs.
+
+  num    task number, used in progress and error messages
+*/
+void reader(int num)
+{
+  uint round;
+
+  for (round= 0; round < number_of_read_tests; round++)
+  {
+    PAGECACHE_BLOCK_LINK *link;
+    unsigned char *page;
+    int naps;
+
+    if ((round % report_divisor) == 0)
+      diag("Reader %d - %u", num, round);
+
+    page= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+                         PAGECACHE_PLAIN_PAGE,
+                         PAGECACHE_LOCK_READ,
+                         &link);
+    check_page(page, num);
+    pagecache_unlock_by_link(&pagecache, link,
+                             PAGECACHE_LOCK_READ_UNLOCK,
+                             PAGECACHE_UNPIN, 0, 0, 0, FALSE);
+
+    /* random pause before the next read */
+    naps= rand() % read_sleep_limit;
+    while (naps-- > 0)
+      SLEEP;
+  }
+}
+
+
+/*
+  Body of a writer task.
+
+  Performs number_of_write_tests iterations.  Each one takes page 0 under
+  a write lock, checks it, refills it with a random byte in two halves
+  with a SLEEP in between (so the page is temporarily inconsistent while
+  the write lock is held), checks it again and releases lock and pin.
+
+  num    task number, used in progress and error messages
+*/
+void writer(int num)
+{
+  uint round;
+
+  for (round= 0; round < number_of_write_tests; round++)
+  {
+    uchar fill= (uchar) (rand() % 256);
+    PAGECACHE_BLOCK_LINK *link;
+    uchar *page;
+
+    if ((round % report_divisor) == 0)
+      diag("Writer %d - %u", num, round);
+
+    page= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+                         PAGECACHE_PLAIN_PAGE,
+                         PAGECACHE_LOCK_WRITE,
+                         &link);
+    check_page(page, num);
+
+    /* first half, pause, second half */
+    bfill(page, TEST_PAGE_SIZE / 2, fill);
+    SLEEP;
+    bfill(page + TEST_PAGE_SIZE / 2, TEST_PAGE_SIZE / 2, fill);
+
+    check_page(page, num);
+    pagecache_unlock_by_link(&pagecache, link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, 0, 0, 1, FALSE);
+    SLEEP;
+  }
+}
+
+
+/*
+  Thread entry point for one reader task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_reader(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_reader");
+    DBUG_PRINT("enter", ("param: %d", task_no));
+
+    reader(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "reader%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Thread entry point for one writer task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_writer(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_writer");
+
+    writer(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "writer%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Test driver.
+
+  Creates the test file, initializes the page cache, writes one
+  zero-filled page 0 through the cache, then starts number_of_readers
+  reader threads and number_of_writers writer threads (detached) and waits
+  until thread_count drops to zero.  Finally the page cache is shut down
+  and the test file is closed and deleted.
+
+  Any setup failure is reported with diag() and terminates the process
+  with status 1.  When at least one command line argument is given (and
+  DBUG is compiled in) DBUG tracing is switched on.
+
+  Returns the TAP exit status.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+
+  if ((file1.file= my_open(file1_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    diag( "Got error during file1 creation from open() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                      &dummy_fail_callback, &dummy_callback, NULL);
+  DBUG_PRINT("info", ("file1: %d", file1.file));
+  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+    exit(1);
+  my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0));
+
+  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+  {
+    diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+  {
+    /* the failing call here is pthread_mutex_init, not pthread_cond_init */
+    diag( "LOCK_thread_count: %d from pthread_mutex_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+
+  if ((error= pthread_attr_init(&thr_attr)))
+  {
+    diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+  {
+    diag(
+         "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+  VOID(thr_setconcurrency(2));
+#endif
+
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TEST_PAGE_SIZE, 0)) == 0)
+  {
+    diag("Got error: init_pagecache() (errno: %d)\n",
+            errno);
+    exit(1);
+  }
+  DBUG_PRINT("info", ("Page cache %d pages", pagen));
+  {
+    /* seed page 0 with zeroes so that the first check_page() succeeds */
+    unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+    if (buffr == NULL)
+    {
+      diag("Got error: malloc() of page buffer (errno: %d)\n", errno);
+      exit(1);
+    }
+    memset(buffr, '\0', TEST_PAGE_SIZE);
+    pagecache_write(&pagecache, &file1, 0, 3, buffr,
+                    PAGECACHE_PLAIN_PAGE,
+                    PAGECACHE_LOCK_LEFT_UNLOCKED,
+                    PAGECACHE_PIN_LEFT_UNPINNED,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+    /*
+      NOTE(review): assumes pagecache_write() has copied the data into the
+      cache by the time it returns, so the buffer is no longer needed.
+    */
+    free(buffr);
+  }
+  /*
+    Threads are created while LOCK_thread_count is held, so none of them
+    can decrement thread_count before it has been incremented here.
+  */
+  pthread_mutex_lock(&LOCK_thread_count);
+
+  while (number_of_readers != 0 || number_of_writers != 0)
+  {
+    if (number_of_readers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_readers + number_of_writers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_readers--;
+    }
+    if (number_of_writers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_writers + number_of_readers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_writers--;
+    }
+  }
+  DBUG_PRINT("info", ("Thread started"));
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  pthread_attr_destroy(&thr_attr);
+
+  /* wait finishing */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (thread_count)
+  {
+    if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+      diag("COND_thread_count: %d from pthread_cond_wait\n", error);
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_PRINT("info", ("thread ended"));
+
+  end_pagecache(&pagecache, 1);
+  DBUG_PRINT("info", ("Page cache ended"));
+
+  if (my_close(file1.file, MYF(0)) != 0)
+  {
+    diag( "Got error during file1 closing from close() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  my_delete(file1_name, MYF(0));
+
+  DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+  DBUG_PRINT("info", ("Program end"));
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_rwconsist2.c b/storage/maria/unittest/ma_pagecache_rwconsist2.c
new file mode 100644
index 00000000000..34183a2d0ab
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_rwconsist2.c
@@ -0,0 +1,358 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+/**
+  @file This unit test checks the consistency of a long block write
+  performed under a write lock while the same block is read simultaneously
+  by read requests that do not take a read lock.
+*/
+
+/*
+  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+  my_atomic-t.c (see BUG#22320).
+*/
+
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+/* Size passed to init_pagecache() in main(): 8192 * TEST_PAGE_SIZE */
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*8)
+
+#ifndef DBUG_OFF
+/* DBUG control string; assigned in main() and applied when argc > 1 */
+static const char* default_dbug_option;
+#endif
+
+
+/*
+  Short pause used by writer().
+  NOTE(review): the unit of the my_sleep() argument is not visible here.
+*/
+#define SLEEP my_sleep(5)
+
+/* Name of the test file; created in main() and deleted before it returns */
+static char *file1_name= (char*)"page_cache_test_file_1";
+/* Page cache handle of the test file */
+static PAGECACHE_FILE file1;
+/* Signalled by each test thread when it finishes; main() waits on it */
+static pthread_cond_t COND_thread_count;
+/* Protects thread_count */
+static pthread_mutex_t LOCK_thread_count;
+/* Number of started test threads that have not yet finished */
+static uint thread_count= 0;
+/* The page cache under test */
+static PAGECACHE pagecache;
+
+/* Reader threads still to be started; main() counts this down to 0 */
+static uint number_of_readers= 5;
+/* Writer threads still to be started; main() counts this down to 0 */
+static uint number_of_writers= 5;
+/* Iterations performed by each reader() */
+static uint number_of_read_tests= 20000;
+/* Iterations performed by each writer() */
+static uint number_of_write_tests= 1000;
+/* Progress is reported with diag() every report_divisor iterations */
+static uint report_divisor= 50;
+
+/**
+  @brief Pagecache callback that does nothing.
+
+  All arguments are ignored.
+
+  @return always 0
+*/
+
+static my_bool dummy_callback(uchar *page __attribute__((unused)),
+                              pgcache_page_no_t page_no __attribute__((unused)),
+                              uchar *data_ptr __attribute__((unused)))
+{
+  return (my_bool) 0;
+}
+
+
+/**
+  @brief Pagecache callback without a return value that does nothing.
+
+  The argument is ignored.
+*/
+
+static void dummy_fail_callback(uchar *data_ptr __attribute__((unused)))
+{
+  /* intentionally empty */
+}
+
+
+/**
+  @brief Checks page consistency
+
+  A page is consistent when all TEST_PAGE_SIZE bytes are equal to the
+  first byte.  On the first mismatch a diagnostic is printed and the
+  process exits with status 1.
+
+  @param buff            pointer to the page content
+  @param task            task ID (used in the diagnostic only)
+*/
+void check_page(uchar *buff, int task)
+{
+  uint pos= 1;
+  DBUG_ENTER("check_page");
+
+  /* advance to the first byte that differs from byte 0, if any */
+  while (pos < TEST_PAGE_SIZE && buff[pos] == buff[0])
+    pos++;
+
+  if (pos < TEST_PAGE_SIZE)
+  {
+    diag("Task %d char #%u '%u' != '%u'", task, pos, (uint) buff[0],
+         (uint) buff[pos]);
+    DBUG_PRINT("err", ("try to flush"));
+    exit(1);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+
+/*
+  Body of a reader task.
+
+  Performs number_of_read_tests iterations.  Each one copies page 0 into a
+  local buffer through the page cache without taking a lock and checks the
+  copy with check_page().
+
+  num    task number, used in progress and error messages
+*/
+void reader(int num)
+{
+  unsigned char page_copy[TEST_PAGE_SIZE];
+  uint round= 0;
+
+  while (round < number_of_read_tests)
+  {
+    if ((round % report_divisor) == 0)
+      diag("Reader %d - %u", num, round);
+
+    pagecache_read(&pagecache, &file1, 0, 3, page_copy,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   NULL);
+    check_page(page_copy, num);
+    round++;
+  }
+}
+
+
+/*
+  Body of a writer task.
+
+  Performs number_of_write_tests iterations.  Each one takes page 0 under
+  a write lock, checks it, refills it with a random byte in two halves
+  with a SLEEP in between (so the page is temporarily inconsistent while
+  the write lock is held), checks it again and releases lock and pin.
+
+  num    task number, used in progress and error messages
+*/
+void writer(int num)
+{
+  uint round;
+
+  for (round= 0; round < number_of_write_tests; round++)
+  {
+    uchar fill= (uchar) (rand() % 256);
+    PAGECACHE_BLOCK_LINK *link;
+    uchar *page;
+
+    if ((round % report_divisor) == 0)
+      diag("Writer %d - %u", num, round);
+
+    page= pagecache_read(&pagecache, &file1, 0, 3, NULL,
+                         PAGECACHE_PLAIN_PAGE,
+                         PAGECACHE_LOCK_WRITE,
+                         &link);
+    check_page(page, num);
+
+    /* first half, pause, second half */
+    bfill(page, TEST_PAGE_SIZE / 2, fill);
+    SLEEP;
+    bfill(page + TEST_PAGE_SIZE / 2, TEST_PAGE_SIZE / 2, fill);
+
+    check_page(page, num);
+    pagecache_unlock_by_link(&pagecache, link,
+                             PAGECACHE_LOCK_WRITE_UNLOCK,
+                             PAGECACHE_UNPIN, 0, 0, 1, FALSE);
+    SLEEP;
+  }
+}
+
+
+/*
+  Thread entry point for one reader task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_reader(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_reader");
+    DBUG_PRINT("enter", ("param: %d", task_no));
+
+    reader(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "reader%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Thread entry point for one writer task.
+
+  arg points to an int holding the task number; the memory is released
+  with free() before the thread ends.  On completion one TAP result is
+  emitted, thread_count is decremented and COND_thread_count is signalled,
+  all under LOCK_thread_count.
+*/
+static void *test_thread_writer(void *arg)
+{
+  int *task_arg= (int *) arg;
+  int task_no= *task_arg;
+
+  my_thread_init();
+  {
+    DBUG_ENTER("test_writer");
+
+    writer(task_no);
+
+    DBUG_PRINT("info", ("Thread %s ended", my_thread_name()));
+
+    /* Tell main we are ready */
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "writer%d: done", task_no);
+    thread_count--;
+    VOID(pthread_cond_signal(&COND_thread_count));
+    pthread_mutex_unlock(&LOCK_thread_count);
+
+    free(task_arg);
+    my_thread_end();
+  }
+  return NULL;
+}
+
+
+/*
+  Test driver.
+
+  Creates the test file, initializes the page cache, writes one
+  zero-filled page 0 through the cache, then starts number_of_readers
+  reader threads and number_of_writers writer threads (detached) and waits
+  until thread_count drops to zero.  Finally the page cache is shut down
+  and the test file is closed and deleted.
+
+  Any setup failure is reported with diag() and terminates the process
+  with status 1.  When at least one command line argument is given (and
+  DBUG is compiled in) DBUG tracing is switched on.
+
+  Returns the TAP exit status.
+*/
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+
+  MY_INIT(argv[0]);
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_consist.trace";
+#else
+  default_dbug_option= "d:t:i:O,/tmp/test_pagecache_consist.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+  plan(number_of_writers + number_of_readers);
+  SKIP_BIG_TESTS(number_of_writers + number_of_readers)
+  {
+
+  if ((file1.file= my_open(file1_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    diag( "Got error during file1 creation from open() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                      &dummy_fail_callback, &dummy_callback, NULL);
+  DBUG_PRINT("info", ("file1: %d", file1.file));
+  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+    exit(1);
+  my_pwrite(file1.file, (const uchar*) "test file", 9, 0, MYF(0));
+
+  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+  {
+    diag( "COND_thread_count: %d from pthread_cond_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+  {
+    /* the failing call here is pthread_mutex_init, not pthread_cond_init */
+    diag( "LOCK_thread_count: %d from pthread_mutex_init (errno: %d)\n",
+          error, errno);
+    exit(1);
+  }
+
+  if ((error= pthread_attr_init(&thr_attr)))
+  {
+    diag("Got error: %d from pthread_attr_init (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+  {
+    diag(
+         "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+         error,errno);
+    exit(1);
+  }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+  VOID(thr_setconcurrency(2));
+#endif
+
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TEST_PAGE_SIZE, 0)) == 0)
+  {
+    diag("Got error: init_pagecache() (errno: %d)\n",
+            errno);
+    exit(1);
+  }
+  DBUG_PRINT("info", ("Page cache %d pages", pagen));
+  {
+    /* seed page 0 with zeroes so that the first check_page() succeeds */
+    unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+    if (buffr == NULL)
+    {
+      diag("Got error: malloc() of page buffer (errno: %d)\n", errno);
+      exit(1);
+    }
+    memset(buffr, '\0', TEST_PAGE_SIZE);
+    pagecache_write(&pagecache, &file1, 0, 3, buffr,
+                    PAGECACHE_PLAIN_PAGE,
+                    PAGECACHE_LOCK_LEFT_UNLOCKED,
+                    PAGECACHE_PIN_LEFT_UNPINNED,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+    /*
+      NOTE(review): assumes pagecache_write() has copied the data into the
+      cache by the time it returns, so the buffer is no longer needed.
+    */
+    free(buffr);
+  }
+  /*
+    Threads are created while LOCK_thread_count is held, so none of them
+    can decrement thread_count before it has been incremented here.
+  */
+  pthread_mutex_lock(&LOCK_thread_count);
+
+  while (number_of_readers != 0 || number_of_writers != 0)
+  {
+    if (number_of_readers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_readers + number_of_writers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_reader,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_readers--;
+    }
+    if (number_of_writers != 0)
+    {
+      /* ownership of param passes to the thread, which frees it */
+      param=(int*) malloc(sizeof(int));
+      if (param == NULL)
+      {
+        diag("Got error: malloc() of thread parameter (errno: %d)\n", errno);
+        exit(1);
+      }
+      *param= number_of_writers + number_of_readers;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                 (void*) param)))
+      {
+        diag("Got error: %d from pthread_create (errno: %d)\n",
+                error,errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_writers--;
+    }
+  }
+  DBUG_PRINT("info", ("Thread started"));
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  pthread_attr_destroy(&thr_attr);
+
+  /* wait finishing */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (thread_count)
+  {
+    if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+      diag("COND_thread_count: %d from pthread_cond_wait\n", error);
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_PRINT("info", ("thread ended"));
+
+  end_pagecache(&pagecache, 1);
+  DBUG_PRINT("info", ("Page cache ended"));
+
+  if (my_close(file1.file, MYF(0)) != 0)
+  {
+    diag( "Got error during file1 closing from close() (errno: %d)\n",
+          errno);
+    exit(1);
+  }
+  my_delete(file1_name, MYF(0));
+
+  DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+  DBUG_PRINT("info", ("Program end"));
+  } /* SKIP_BIG_TESTS */
+  my_end(0);
+
+  return exit_status();
+  }
+}
diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c
new file mode 100644
index 00000000000..32e588e165a
--- /dev/null
+++ b/storage/maria/unittest/ma_pagecache_single.c
@@ -0,0 +1,853 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  TODO: use pthread_join instead of wait_for_thread_count_to_be_zero, like in
+  my_atomic-t.c (see BUG#22320).
+  Use diag() instead of fprintf(stderr).
+*/
+#include <tap.h>
+#include <my_sys.h>
+#include <m_string.h>
+#include "test_file.h"
+#include <tap.h>
+
+#define PCACHE_SIZE (TEST_PAGE_SIZE*1024*10) /* init_pagecache() size */
+/* DBUG control string, assigned in main(); only exists in debug builds */
+#ifndef DBUG_OFF
+static const char* default_dbug_option;
+#endif
+/* Without BIG the SKIP_BIG_TESTS() guard is a no-op: guarded blocks run */
+#ifndef BIG
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#endif
+/* Test files, the page cache and the state main() and test_thread() share */
+static char *file1_name= (char*)"page_cache_test_file_1";
+static char *file2_name= (char*)"page_cache_test_file_2"; /* used by main() */
+static PAGECACHE_FILE file1;
+static pthread_cond_t COND_thread_count; /* signalled when test thread ends */
+static pthread_mutex_t LOCK_thread_count; /* held while using thread_count */
+static uint thread_count;                 /* test threads still running */
+static PAGECACHE pagecache;
+
+/*
+  File content descriptors: { length, content } entries ended by {0, 0}
+*/
+static struct file_desc simple_read_write_test_file[]=
+{
+  { TEST_PAGE_SIZE, '\1'},
+  {0, 0}
+};
+static struct file_desc simple_read_change_write_read_test_file[]=
+{
+  { TEST_PAGE_SIZE/2, '\65'},  /* half overwritten by the test step */
+  { TEST_PAGE_SIZE/2, '\1'},   /* half left from the preparation step */
+  {0, 0}
+};
+static struct file_desc simple_pin_test_file1[]=
+{
+  { TEST_PAGE_SIZE*2, '\1'},   /* checked while page 0 is still pinned */
+  {0, 0}
+};
+static struct file_desc simple_pin_test_file2[]=
+{
+  { TEST_PAGE_SIZE/2, '\1'},               /* page 0, first half */
+  { TEST_PAGE_SIZE/2, (unsigned char)129}, /* page 0, half filled with 129 */
+  { TEST_PAGE_SIZE, '\1'},                 /* page 1 */
+  {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file1[]=
+{
+  { TEST_PAGE_SIZE, '\4'},     /* content of the first write */
+  {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file2[]=
+{
+  { TEST_PAGE_SIZE, '\5'},     /* content of the second write */
+  {0, 0}
+};
+static struct file_desc simple_pin_no_lock_test_file3[]=
+{
+  { TEST_PAGE_SIZE, '\6'},     /* content of the third write */
+  {0, 0}
+};
+static struct file_desc simple_delete_forget_test_file[]=
+{
+  { TEST_PAGE_SIZE, '\1'},     /* old content: the deleted page is not written */
+  {0, 0}
+};
+static struct file_desc simple_delete_flush_test_file[]=
+{
+  { TEST_PAGE_SIZE, '\2'},     /* new content: delete was asked to flush */
+  {0, 0}
+};
+
+
+/**
+  @brief Dummy pagecache callback: ignores all arguments and returns 0.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+               pgcache_page_no_t page_no __attribute__((unused)),
+               uchar* data_ptr __attribute__((unused)))
+{
+  return 0;  /* registered for file1 by pagecache_file_init() in main() */
+}
+
+
+/**
+  @brief Dummy pagecache callback without a result: ignores data_ptr.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+  return;  /* registered for file1 by pagecache_file_init() in main() */
+}
+
+
+/*
+  Flush a test file with FLUSH_RELEASE, then close, delete and recreate it
+
+  SYNOPSIS
+    reset_file()
+    file                 Page cache file; file->file gets the new descriptor
+    file_name            Path (and name) of the file to delete and recreate
+*/
+
+void reset_file(PAGECACHE_FILE *file, const char *file_name)
+{
+  flush_pagecache_blocks(&pagecache, file, FLUSH_RELEASE);
+  if (my_close(file->file, MYF(MY_WME)) != 0)
+    exit(1);                    /* cannot go on without closing the old file */
+  my_delete(file_name, MYF(MY_WME));
+  file->file= my_open(file_name, O_CREAT | O_TRUNC | O_RDWR, MYF(0));
+  if (file->file == -1)
+  {
+    diag("Got error during %s creation from open() (errno: %d)\n",
+         file_name, my_errno);
+    exit(1);
+  }
+}
+
+/*
+  Write page 0 via the cache, read it back, then flush and check file on disk
+*/
+
+int simple_read_write_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  int res;                      /* stays 1 only while every check passes */
+  DBUG_ENTER("simple_read_write_test");
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  pagecache_read(&pagecache, &file1, 0, 3, buffr,
+                 PAGECACHE_PLAIN_PAGE,
+                 PAGECACHE_LOCK_LEFT_UNLOCKED,
+                 0);
+  ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)),
+     "Simple write-read page ");
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache\n");
+    exit(1);
+  }
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_read_write_test_file))),
+     "Simple write-read page file");
+  if (res)                      /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+  free(buffw);
+  free(buffr);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Prepare page, then read (and lock), change (write new value and unlock),
+  then check the page in the cache and on the disk.  Returns 1 if both pass.
+*/
+int simple_read_change_write_read_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  unsigned char *buffr= malloc(TEST_PAGE_SIZE);
+  int res, res2;                /* res: cache check, res2: file check */
+  DBUG_ENTER("simple_read_change_write_read_test");
+
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache\n");
+    exit(1);
+  }
+  /* test */
+  pagecache_read(&pagecache, &file1, 0, 3, buffw,
+                 PAGECACHE_PLAIN_PAGE,
+                 PAGECACHE_LOCK_WRITE,
+                 0);
+  bfill(buffw, TEST_PAGE_SIZE/2, '\65');  /* change only the first half */
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_WRITE_UNLOCK,
+                  PAGECACHE_UNPIN,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /* read the page again without a lock; it must equal the changed buffer */
+  pagecache_read(&pagecache, &file1, 0, 3, buffr,
+                 PAGECACHE_PLAIN_PAGE,
+                 PAGECACHE_LOCK_LEFT_UNLOCKED,
+                 0);
+  ok((res= test(memcmp(buffr, buffw, TEST_PAGE_SIZE) == 0)),
+     "Simple read-change-write-read page ");
+  DBUG_ASSERT(pagecache.blocks_changed == 1);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache\n");
+    exit(1);
+  }
+  DBUG_ASSERT(pagecache.blocks_changed == 0);
+  ok((res2= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_read_change_write_read_test_file))),
+     "Simple read-change-write-read page file");
+  if (res && res2)              /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+  free(buffw);
+  free(buffr);
+  DBUG_RETURN(res && res2);
+}
+
+
+/*
+  Prepare page 0, read it with a write lock (and pin), then write page 1
+  and page 0.  Flush the file: only page 1 should be flushed and the call
+  should report an error, because page 0 is still pinned.
+  Check file on the disk.
+  Unlock and unpin page 0, then flush again.
+  Check file on the disk.
+*/
+int simple_pin_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  int res;                      /* 0 as soon as a step or a check fails */
+  DBUG_ENTER("simple_pin_test");
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /* test */
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache\n");
+    exit(1);
+  }
+  pagecache_read(&pagecache, &file1, 0, 3, buffw,
+                 PAGECACHE_PLAIN_PAGE,
+                 PAGECACHE_LOCK_WRITE,
+                 0);
+  pagecache_write(&pagecache, &file1, 1, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129));
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                  PAGECACHE_PIN_LEFT_PINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /*
+    We have to get error because one page of the file is pinned,
+    other page should be flushed
+  */
+  if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Did not get error in flush_pagecache_blocks\n");
+    res= 0;
+    goto err;
+  }
+  ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2,
+                           simple_pin_test_file1))),
+     "Simple pin page file with pin");
+  pagecache_unlock(&pagecache,
+                   &file1,
+                   0,
+                   PAGECACHE_LOCK_WRITE_UNLOCK,
+                   PAGECACHE_UNPIN,
+                   0, 0, 0);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error in flush_pagecache_blocks\n");
+    res= 0;
+    goto err;
+  }
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE,
+                           simple_pin_test_file2))),
+     "Simple pin page result file");
+  if (res)                      /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+err:
+  free(buffw);
+  DBUG_RETURN(res);
+}
+
+/*
+  Like simple_pin_test, but the second write of page 0 turns the write
+  lock into a read lock (PAGECACHE_LOCK_WRITE_TO_READ).
+  A FLUSH_KEEP_LAZY flush must then report an error (page 0 is pinned).
+  Check file on the disk.
+  A FLUSH_FORCE_WRITE flush must succeed even before the page is released.
+  Read-unlock and unpin page 0, flush, check file on the disk.
+*/
+int simple_pin_test2()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  int res;                      /* 0 as soon as a step or a check fails */
+  DBUG_ENTER("simple_pin_test2");
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /* test */
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache\n");
+    exit(1);
+  }
+  pagecache_read(&pagecache, &file1, 0, 3, buffw,
+                 PAGECACHE_PLAIN_PAGE,
+                 PAGECACHE_LOCK_WRITE,
+                 0);
+  pagecache_write(&pagecache, &file1, 1, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  bfill(buffw + TEST_PAGE_SIZE/2, TEST_PAGE_SIZE/2, ((unsigned char) 129));
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_WRITE_TO_READ,
+                  PAGECACHE_PIN_LEFT_PINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /*
+    We have to get error because one page of the file is pinned,
+    other page should be flushed
+  */
+  if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+  {
+    diag("Did not get error in flush_pagecache_blocks 2\n");
+    res= 0;
+    goto err;
+  }
+  ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE*2,
+                           simple_pin_test_file1))),
+     "Simple pin page file with pin 2");
+
+  /* Test that a normal flush goes through */
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error in flush_pagecache_blocks 3\n");
+    res= 0;
+    goto err;
+  }
+  pagecache_unlock(&pagecache,
+                   &file1,
+                   0,
+                   PAGECACHE_LOCK_READ_UNLOCK,
+                   PAGECACHE_UNPIN,
+                   0, 0, 0);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error in flush_pagecache_blocks 4\n");
+    res= 0;
+    goto err;
+  }
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE*2, TEST_PAGE_SIZE,
+                           simple_pin_test_file2))),
+     "Simple pin page result file 2");
+  if (res)                      /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+err:
+  free(buffw);
+  DBUG_RETURN(res);
+}
+
+/*
+  Checks pins without lock: a pinned, unlocked page must fail a lazy flush.
+*/
+int simple_pin_no_lock_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  PAGECACHE_BLOCK_LINK *link;
+  int res;                      /* 1 only if every check below passed */
+  DBUG_ENTER("simple_pin_no_lock_test");
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\4');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /* test */
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error during flushing pagecache 2\n");
+    exit(1);
+  }
+  bfill(buffw, TEST_PAGE_SIZE, '\5');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /*
+    We have to get error because the only page of the file is pinned;
+    the file must still hold the content flushed during preparation
+  */
+  if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+  {
+    diag("Did not get error in flush_pagecache_blocks 2\n");
+    res= 0;
+    goto err;
+  }
+  ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_pin_no_lock_test_file1))),
+     "Simple pin (no lock) page file with pin 2");
+  pagecache_unlock(&pagecache,
+                   &file1,
+                   0,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   PAGECACHE_UNPIN,
+                   0, 0, 0);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error in flush_pagecache_blocks 2\n");
+    res= 0;
+    goto err;
+  }
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_pin_no_lock_test_file2))),
+     "Simple pin (no lock) page result file 2");
+  /* second round: pin with a write lock, drop the lock but keep the pin */
+  bfill(buffw, TEST_PAGE_SIZE, '\6');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_WRITE,
+                  PAGECACHE_PIN,
+                  PAGECACHE_WRITE_DELAY,
+                  &link, LSN_IMPOSSIBLE);
+  pagecache_unlock_by_link(&pagecache, link,
+                           PAGECACHE_LOCK_WRITE_UNLOCK,
+                           PAGECACHE_PIN_LEFT_PINNED, 0, 0, 1, FALSE);
+  if (!flush_pagecache_blocks(&pagecache, &file1, FLUSH_KEEP_LAZY))
+  {
+    diag("Did not get error in flush_pagecache_blocks 3\n");
+    res= 0;
+    goto err;
+  }
+  /* accumulate with &= so a failure of the first two checks is not lost */
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_pin_no_lock_test_file2))),
+     "Simple pin (no lock) page file with pin 3");
+  pagecache_unpin_by_link(&pagecache, link, 0);
+  if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE))
+  {
+    diag("Got error in flush_pagecache_blocks 3\n");
+    res= 0;
+    goto err;
+  }
+  ok((res&= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                           simple_pin_no_lock_test_file3))),
+     "Simple pin (no lock) page result file 3");
+  if (res)                      /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+err:
+  free(buffw);
+  DBUG_RETURN(res);
+}
+/*
+  Prepare page, write new value, then delete page from cache without flush,
+  on the disk should be page with old content written during preparation.
+  Returns 1 if the file check passed (file1 is then reset), 0 otherwise.
+*/
+
+int simple_delete_forget_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  int res;
+  DBUG_ENTER("simple_delete_forget_test");
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  /* test */
+  bfill(buffw, TEST_PAGE_SIZE, '\2');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_UNLOCKED,
+                  PAGECACHE_PIN_LEFT_UNPINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  /* delete the changed page; the '\2' content is expected never to hit disk */
+  pagecache_delete(&pagecache, &file1, 0,
+                   PAGECACHE_LOCK_WRITE, 0);
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                          simple_delete_forget_test_file))),
+     "Simple delete-forget page file");
+  if (res)
+    reset_file(&file1, file1_name);
+  free(buffw);
+  DBUG_RETURN(res);
+}
+
+/*
+  Prepare page with locking, write new content to the page,
+  delete page with flush and on existing lock,
+  check that page on disk contain new value.
+  Returns 1 if the file check passed (file1 is then reset), 0 otherwise.
+*/
+
+int simple_delete_flush_test()
+{
+  unsigned char *buffw= malloc(TEST_PAGE_SIZE);
+  PAGECACHE_BLOCK_LINK *link;
+  int res;
+  DBUG_ENTER("simple_delete_flush_test");
+  /* prepare the file */
+  bfill(buffw, TEST_PAGE_SIZE, '\1');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_WRITE,
+                  PAGECACHE_PIN,
+                  PAGECACHE_WRITE_DELAY,
+                  &link, LSN_IMPOSSIBLE);
+  /* result is not checked: the page is still pinned and write-locked here */
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  /* test */
+  bfill(buffw, TEST_PAGE_SIZE, '\2');
+  pagecache_write(&pagecache, &file1, 0, 3, buffw,
+                  PAGECACHE_PLAIN_PAGE,
+                  PAGECACHE_LOCK_LEFT_WRITELOCKED,
+                  PAGECACHE_PIN_LEFT_PINNED,
+                  PAGECACHE_WRITE_DELAY,
+                  0, LSN_IMPOSSIBLE);
+  if (pagecache_delete_by_link(&pagecache, link,
+			       PAGECACHE_LOCK_LEFT_WRITELOCKED, 1))
+  {
+    diag("simple_delete_flush_test: error during delete");
+    exit(1);
+  }
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  ok((res= test(test_file(file1, file1_name, TEST_PAGE_SIZE, TEST_PAGE_SIZE,
+                          simple_delete_flush_test_file))),
+     "Simple delete flush (link) page file");
+  if (res)
+    reset_file(&file1, file1_name);
+  free(buffw);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Write, then read back, a file bigger than the cache (4 ok() checks)
+*/
+
+int simple_big_test()
+{
+  unsigned char *buffw= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
+  unsigned char *buffr= (unsigned char *) my_malloc(TEST_PAGE_SIZE, MYF(MY_WME));
+  struct file_desc *desc= ((struct file_desc *)
+                           my_malloc((PCACHE_SIZE/(TEST_PAGE_SIZE/2) + 1) *
+                                     sizeof(struct file_desc), MYF(MY_WME)));
+  int res, i;
+  DBUG_ENTER("simple_big_test");
+
+  /* prepare a file twice as large as the cache; page i is filled with i&0xff */
+  for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++)
+  {
+    bfill(buffw, TEST_PAGE_SIZE, (unsigned char) (i & 0xff));
+    desc[i].length= TEST_PAGE_SIZE;
+    desc[i].content= (i & 0xff);
+    pagecache_write(&pagecache, &file1, i, 3, buffw,
+                    PAGECACHE_PLAIN_PAGE,
+                    PAGECACHE_LOCK_LEFT_UNLOCKED,
+                    PAGECACHE_PIN_LEFT_UNPINNED,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+  }
+  desc[i].length= 0;            /* {0, 0} entry ends the descriptor list */
+  desc[i].content= '\0';
+  ok(1, "Simple big file write");
+  /* check written pages by reading them sequentially */
+  for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE/2); i++)
+  {
+    int j;
+    pagecache_read(&pagecache, &file1, i, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   0);
+    for(j= 0; j < TEST_PAGE_SIZE; j++)
+    {
+      if (buffr[j] != (i & 0xff))
+      {
+        diag("simple_big_test seq: page %u byte %u mismatch\n", i, j);
+        res= 0;
+        goto err;
+      }
+    }
+  }
+  ok(1, "Simple big file sequential read");
+  /* check random reads */
+  for (i= 0; i < PCACHE_SIZE/(TEST_PAGE_SIZE); i++)
+  {
+    int j, page;
+    page= rand() % (PCACHE_SIZE/(TEST_PAGE_SIZE/2));
+    pagecache_read(&pagecache, &file1, page, 3, buffr,
+                   PAGECACHE_PLAIN_PAGE,
+                   PAGECACHE_LOCK_LEFT_UNLOCKED,
+                   0);
+    for(j= 0; j < TEST_PAGE_SIZE; j++)
+    {
+      if (buffr[j] != (page & 0xff))
+      {
+        diag("simple_big_test rnd: page %u byte %u mismatch\n", page, j);
+        res= 0;
+        goto err;
+      }
+    }
+  }
+  ok(1, "Simple big file random read");
+  flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+
+  ok((res= test(test_file(file1, file1_name, PCACHE_SIZE*2, TEST_PAGE_SIZE,
+                          desc))),
+     "Simple big file");
+  if (res)                      /* on success leave an empty file behind */
+    reset_file(&file1, file1_name);
+
+err:
+  my_free(buffw, 0);
+  my_free(buffr, 0);
+  my_free(desc, 0);
+  DBUG_RETURN(res);
+}
+
+
+/*
+  Thread function: runs every test in order, exit(1) on the first failure
+*/
+
+static void *test_thread(void *arg)
+{
+#ifndef DBUG_OFF
+  int param= *((int*) arg);     /* only used by the DBUG_PRINT below */
+#endif
+
+  my_thread_init();
+  {
+  DBUG_ENTER("test_thread");
+  DBUG_PRINT("enter", ("param: %d", param));
+
+  if (!simple_read_write_test() ||
+      !simple_read_change_write_read_test() ||
+      !simple_pin_test() ||
+      !simple_pin_test2() ||
+      !simple_pin_no_lock_test() ||
+      !simple_delete_forget_test() ||
+      !simple_delete_flush_test())
+    exit(1);
+
+  SKIP_BIG_TESTS(4)             /* simple_big_test() makes 4 ok() calls */
+  {
+    if (!simple_big_test())
+      exit(1);
+  }
+
+  DBUG_PRINT("info", ("Thread %s ended\n", my_thread_name()));
+  pthread_mutex_lock(&LOCK_thread_count);
+  thread_count--;
+  VOID(pthread_cond_signal(&COND_thread_count)); /* Tell main we are ready */
+  pthread_mutex_unlock(&LOCK_thread_count);
+  free((uchar*) arg);           /* arg was malloc'ed by main() */
+  my_thread_end();
+  DBUG_RETURN(0);
+  }
+}
+
+
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__((unused)))
+{
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error, pagen;
+  File tmp_file;
+  MY_INIT(argv[0]);
+  /* Debug builds: switch DBUG tracing on when any argument is given */
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\test_pagecache_single.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/test_pagecache_single.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+  {
+  DBUG_ENTER("main");
+  DBUG_PRINT("info", ("Main thread: %s\n", my_thread_name()));
+
+  plan(18);                     /* 14 checks in small tests + 4 in big test */
+  SKIP_BIG_TESTS(18)
+  {
+  /* file2 is only created here and removed again below; tests use file1 */
+  if ((tmp_file= my_open(file2_name, O_CREAT | O_TRUNC | O_RDWR,
+                         MYF(MY_WME))) < 0)
+    exit(1);
+
+  if ((file1.file= my_open(file1_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+	    errno);
+    exit(1);
+  }
+  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                      &dummy_fail_callback, &dummy_callback, NULL);
+  my_close(tmp_file, MYF(0));
+  my_delete(file2_name, MYF(0));
+
+  DBUG_PRINT("info", ("file1: %d", file1.file));
+  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+    exit(1);
+  my_pwrite(file1.file, (const uchar*)"test file", 9, 0, MYF(MY_WME));
+
+  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+  {
+    fprintf(stderr, "Got error: %d from pthread_cond_init (errno: %d)\n",
+	    error, errno);
+    exit(1);
+  }
+  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+  {
+    fprintf(stderr, "Got error: %d from pthread_mutex_init (errno: %d)\n",
+	    error, errno);
+    exit(1);
+  }
+
+  if ((error= pthread_attr_init(&thr_attr)))
+  {
+    fprintf(stderr,"Got error: %d from pthread_attr_init (errno: %d)\n",
+	    error,errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+  {
+    fprintf(stderr,
+	    "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+	    error,errno);
+    exit(1);
+  }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+  VOID(thr_setconcurrency(2));
+#endif
+
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TEST_PAGE_SIZE, MYF(MY_WME))) == 0)
+  {
+    fprintf(stderr,"Got error: init_pagecache() (errno: %d)\n",
+            errno);
+    exit(1);
+  }
+  DBUG_PRINT("info", ("Page cache %d pages", pagen));
+  /* Start the single test thread; it frees param itself when it ends */
+  pthread_mutex_lock(&LOCK_thread_count);
+  param=(int*) malloc(sizeof(int));
+  *param= 1;
+  if ((error= pthread_create(&tid, &thr_attr, test_thread, (void*) param)))
+  {
+    fprintf(stderr,"Got error: %d from pthread_create (errno: %d)\n",
+            error,errno);
+    exit(1);
+  }
+  thread_count++;
+  DBUG_PRINT("info", ("Thread started"));
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  pthread_attr_destroy(&thr_attr);
+  /* Wait until the test thread has brought thread_count back to zero */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (thread_count)
+  {
+    if ((error= pthread_cond_wait(&COND_thread_count,&LOCK_thread_count)))
+      fprintf(stderr,"Got error: %d from pthread_cond_wait\n",error);
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+  DBUG_PRINT("info", ("thread ended"));
+
+  end_pagecache(&pagecache, 1);
+  DBUG_PRINT("info", ("Page cache ended"));
+
+  if (my_close(file1.file, MYF(MY_WME)))
+    exit(1);
+
+  my_delete(file1_name, MYF(0));
+
+  } /* SKIP_BIG_TESTS */
+  DBUG_PRINT("info", ("file1 (%d) closed", file1.file));
+  DBUG_PRINT("info", ("Program end"));
+
+  my_end(0);
+
+  }
+  return exit_status();         /* process exit code comes from the TAP layer */
+}
diff --git a/storage/maria/unittest/ma_test_all-t b/storage/maria/unittest/ma_test_all-t
new file mode 100755
index 00000000000..0b11daf7f98
--- /dev/null
+++ b/storage/maria/unittest/ma_test_all-t
@@ -0,0 +1,710 @@
+#!/usr/bin/env perl
+#
+# Run various unit tests.
+#
+
+use Getopt::Long;
+use File::Basename;
+
+$|= 1;                # flush output after every print
+$^W = 1; # warnings, because env cannot parse 'perl -w'
+$VER= "1.4";
+
+$opt_version=         0;
+$opt_help=            0;
+$opt_verbose=         0;
+$opt_abort_on_error=  0; # --abort-on-error: ok() exits at the first failure
+$opt_valgrind=        "valgrind --alignment=8 --leak-check=yes"; # NOTE(review): parsed and shown in usage(), otherwise unused
+$opt_silent=          "-s"; # passed to the ma_test* programs
+$opt_number_of_tests= 0;
+$opt_run_tests=       undef(); # set by --run-tests or --start-from
+
+my $maria_path;       # path to "storage/maria"
+my $maria_exe_path;   # path to executables (ma_test1, aria_chk etc)
+my $my_progname= $0;
+$my_progname=~ s/.*[\/]//;  # keep only the part after the last '/'
+my $runtime_error= 0; # Return 1 if error(s) occur during run
+my $NEW_TEST= 0;      # Test group separator in an array of tests
+my $test_begin= 0;    # first test number to run (0: from the start)
+my $test_end= 0;      # last test number to run (0: no upper limit)
+my $test_counter= 0;  # incremented by every ok() and skip() call
+
+run_tests();
+
+####
+#### Parse options, count the tests, clean temporary files and run the tests
+####
+
+sub run_tests
+{
+  my $nr_tests= 0;
+  my $flag_exit= 0;
+
+  if (!GetOptions("help" => \$opt_help,
+                  "version" => \$opt_version,
+                  "verbose" => \$opt_verbose,
+                  "abort-on-error" => \$opt_abort_on_error,
+                  "valgrind=s" => \$opt_valgrind,
+                  "silent=s" => \$opt_silent,
+                  "number-of-tests" => \$opt_number_of_tests,
+                  "run-tests=s" => \$opt_run_tests,
+                  "start-from=s" => \$opt_run_tests))
+  {
+    $flag_exit= 1;
+  }
+  if ($opt_version)
+  {
+    print "$my_progname version $VER\n";
+    exit(0);
+  }
+  # Show help before locating executables so --help works in an unbuilt tree
+  usage() if ($opt_help || $flag_exit);
+  $maria_path= dirname($0) . "/..";
+
+  my $suffix= ( $^O =~ /win/i  && $^O !~ /darwin/i ) ? ".exe" : "";
+  $maria_exe_path= "$maria_path/release";
+  # we use -f, sometimes -x is unexpectedly false in Cygwin
+  if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+  {
+    $maria_exe_path= "$maria_path/relwithdebinfo";
+    if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+    {
+      $maria_exe_path= "$maria_path/debug";
+      if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+      {
+        $maria_exe_path= $maria_path;
+        if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+        {
+           die("Cannot find ma_test1 executable\n");
+        }
+      }
+    }
+  }
+
+  #
+  # IMPORTANT: If you modify this file, please read this:
+  #
+  # Count total number of tests. Make sure that the functions return
+  # number of unit tests correctly, e.g. calls to ok(). The last argument
+  # for each function is a flag counter and will return the number of
+  # unit tests in each. Please see comments on function ok() at the end.
+  #
+  # If you modify any functions or add any new ones, please make sure the
+  # unit tests are appropriately detected here. A wrong count will
+  # make the unit test fail during 'make test'. $nr_tests must be right.
+  #
+
+  $nr_tests+= run_check_tests(0, 0, 0, 0, 1) * 5;  # Each of these is
+  $nr_tests+= run_repair_tests(0, 0, 0, 0, 1) * 5; # called 5 times below,
+  $nr_tests+= run_pack_tests(0, 0, 0, 0, 1) * 5;   # once per row format
+  $nr_tests+= run_tests_on_warnings_and_errors(0, 0, 0, 1);
+  $nr_tests+= run_ma_test_recovery(0, 1);
+  $nr_tests+= run_tests_on_clrs(0, 0, 1);
+
+  if ($opt_number_of_tests)
+  {
+    print "Total number of tests is $nr_tests\n";
+    exit(0);
+  }
+
+  if (defined($opt_run_tests))
+  {
+    if ($opt_run_tests =~ m/^(\d+)$/ ||
+        $opt_run_tests =~ m/^(\d+)\.+$/)
+    {
+      $test_begin= $1;
+    }
+    elsif ($opt_run_tests =~ m/^(\d+)\.+(\d+)$/)
+    {
+      $test_begin= $1;
+      $test_end= $2;
+    }
+    else
+    {
+      print "Wrong syntax for option --run-tests=$opt_run_tests\n";
+      print "Please use --run-tests=<begin>..<end>\nwhere 'begin' is the ";
+      print "first test to be run and 'end' is the last.\n";
+      exit(1);
+    }
+    if ($test_begin > $nr_tests || $test_end > $nr_tests)
+    {
+      print "Test range ($test_begin..$test_end) out of range. ";
+      print "There are only $nr_tests in the test suite.\n";
+      exit(1);
+    }
+    $test_begin++ if (!$test_begin); # Handle zero, if user gave that
+    if ($test_end && $test_begin > $test_end)
+    {
+      print "Bad test range ($test_begin..$test_end)\n";
+      exit(1);
+    }
+    # Now adjust number of tests
+    $nr_tests= ($test_end ? $test_end : $nr_tests) - $test_begin + 1;
+  }
+
+  #
+  # clean-up
+  #
+
+  unlink <*.TMD aria_log*>;        # Delete temporary files
+
+  #
+  # Run tests
+  #
+
+  if (!$opt_verbose)
+  {
+    print "1..$nr_tests\n";
+  }
+  else
+  {
+    print "Total tests: $nr_tests\n";
+  }
+
+  if ($opt_verbose)
+  {
+    print "Running tests with dynamic row format\n";
+  }
+  run_check_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+  run_repair_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+  run_pack_tests($suffix, $opt_silent, "", $opt_verbose, 0);
+
+  if ($opt_verbose)
+  {
+    print "\nRunning tests with static row format\n";
+  }
+  run_check_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+  run_repair_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+  run_pack_tests($suffix, $opt_silent, "-S", $opt_verbose, 0);
+
+  if ($opt_verbose)
+  {
+    print "\nRunning tests with block row format\n";
+  }
+  run_check_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+  run_repair_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+  run_pack_tests($suffix, $opt_silent, "-M", $opt_verbose, 0);
+
+  if ($opt_verbose)
+  {
+    print "\nRunning tests with block row format and transactions\n";
+  }
+  run_check_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+  run_repair_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+  run_pack_tests($suffix, $opt_silent, "-M -T", $opt_verbose, 0);
+
+  if ($opt_verbose)
+  {
+    print "\nRunning tests with block row format, transactions and versioning\n";
+  }
+  run_check_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+  run_repair_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+  run_pack_tests($suffix, $opt_silent, "-M -T -C", $opt_verbose, 0);
+
+
+  if ($opt_verbose)
+  {
+    print "\nRunning tests with warnings and recovery\n";
+  }
+  run_tests_on_warnings_and_errors($suffix, $opt_silent, $opt_verbose, 0);
+  run_ma_test_recovery($opt_verbose, 0);
+  run_tests_on_clrs($suffix, $opt_verbose, 0);
+
+  exit($runtime_error);
+}
+
+####
+#### regular tests
+####
+
+sub run_check_tests
+{
+  my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+  my ($i, $nr_tests);
+  my @ma_test1_opt= ( ["","-se"],
+                      ["-N","-se"],
+                      ["-P --checksum","-se"],
+                      ["-P -N","-se"],
+                      ["-B -N -R2","-sm"],
+                      ["-a -k 480 --unique","-sm"],
+                      ["-a -N -R1 ","-sm"],
+                      ["-p","-sm"],
+                      ["-p -N --unique","-sm"],
+                      ["-p -N --key_length=127 --checksum","-sm"],
+                      ["-p -N --key_length=128","-sm"],
+                      ["-p --key_length=480","-sm"],
+                      ["-a -B","-sm"],
+                      ["-a -B --key_length=64 --unique","-sm"],
+                      ["-a -B -k 480 --checksum","-sm"],
+                      ["-a -B -k 480 -N --unique --checksum","-sm"],
+                      ["-a -m","-sm"],
+                      ["-a -m -P --unique --checksum","-sm"],
+                      ["-a -m -P --key_length=480 --key_cache","-sm"],
+                      ["-m -p","-sm"],
+                      ["-w --unique","-sm"],
+                      ["-a -w --key_length=64 --checksum","-sm"],
+                      ["-a -w -N --key_length=480","-sm"],
+                      ["-a -w --key_length=480 --checksum","-sm"],
+                      ["-a -b -N","-sm"],
+                      ["-a -b --key_length=480","-sm"],
+                      ["-p -B --key_length=480","-sm"],
+                      ["--checksum --unique","-se"],
+                      ["--unique","-se"],
+                      ["--key_multiple -N -S","-sm"],
+                      ["--key_multiple -a -p --key_length=480","-sm"],
+                      ["--key_multiple -a -B --key_length=480","-sm"],
+                      ["--key_multiple -P -S","-sm"] );
+  my @ma_test2_opt= ( ["-L -K -W -P","-sm"],
+                      ["-L -K -W -P -A","-sm"],
+                      ["-L -K -W -P -b32768", "-sm"],
+                      ["-L -K -W -P -M -T -c -b32768 -t4 -m300", "-sm"],
+                      ["-L -K -P -R3 -m50 -b1000000", "-sm"],
+                      ["-L -B","-sm"],
+                      ["-D -B -c","-sm"],
+                      ["-m10000 -e4096 -K","-sm"],
+                      ["-m10000 -e8192 -K","-sm"],
+                      ["-m10000 -e16384 -E16384 -K -L","-sm"],
+                      ["-L -K -W -P -b32768", "-se"],
+                      ["-c -b65000","-se"] );
+  my @ma_rt_test_opt= ( ); # (["--checksum", "-se"] );
+  # Each entry above is [options for the test program, options for aria_chk]
+  # With $count set, only return the number of ok() calls made below
+  if ($count)
+  {
+    $nr_tests= 2;                # Number of tests outside loops
+    for ($i= 0; defined($ma_test1_opt[$i]); $i++) { $nr_tests+=2; }
+    for ($i= 0; defined($ma_test2_opt[$i]); $i++) { $nr_tests+=2; }
+    for ($i= 0; defined($ma_rt_test_opt[$i]); $i++) { $nr_tests+=2; }
+    return $nr_tests;
+  }
+
+  for ($i= 0; defined($ma_test1_opt[$i]); $i++)
+  {
+    unlink <aria_log_control aria_log.*>;
+    ok("$maria_exe_path/ma_test1$suffix $silent $ma_test1_opt[$i][0] $row_type",
+       $verbose, $i + 1);
+    ok("$maria_exe_path/aria_chk$suffix $ma_test1_opt[$i][1] test1",
+       $verbose, $i + 1);
+  }
+  #
+  # These tests are outside the loops. Make sure to include them in
+  # nr_tests manually
+  #
+  ok("$maria_exe_path/aria_pack$suffix --force -s test1", $verbose, 0);
+  ok("$maria_exe_path/aria_chk$suffix -ess test1", $verbose, 0);
+
+  for ($i= 0; defined($ma_test2_opt[$i]); $i++)
+  {
+    unlink <aria_log_control aria_log.*>;
+    ok("$maria_exe_path/ma_test2$suffix $silent $ma_test2_opt[$i][0] $row_type",
+       $verbose, $i + 1);
+    ok("$maria_exe_path/aria_chk$suffix $ma_test2_opt[$i][1] test2",
+       $verbose, $i + 1);
+  }
+
+  for ($i= 0; defined($ma_rt_test_opt[$i]); $i++)
+  {
+    unlink <aria_log_control aria_log.*>;
+    ok("$maria_exe_path/ma_rt_test$suffix $silent $ma_rt_test_opt[$i][0] $row_type",
+       $verbose, $i + 1);
+    ok("$maria_exe_path/aria_chk$suffix $ma_rt_test_opt[$i][1] rt_test",
+       $verbose, $i + 1);
+  }
+
+  unlink <aria_log_control aria_log.*>;
+
+  return 0;
+}
+
+####
+#### repair tests
+####
+
+sub run_repair_tests
+{
+  my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+  # Runs the commands in @t, in order, through run_test_bunch().
+  # With $count set, only returns how many commands @t holds.
+  my @t= ($NEW_TEST,
+          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix --silent -re --transaction-log test1",
+          "$maria_exe_path/aria_chk$suffix -rs test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -rqs test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -rs --correct-checksum test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -rqs --correct-checksum test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -ros --correct-checksum test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -rqos --correct-checksum test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -sz test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
+          "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/aria_chk$suffix -s --parallel-recover --quick test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/ma_test2$suffix $silent -c $row_type",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/aria_chk$suffix -sr test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/ma_test2$suffix $silent -c -t4 -b32768 $row_type",
+          "$maria_exe_path/aria_chk$suffix -s --zerofill test1",
+          "$maria_exe_path/aria_chk$suffix -se test1"
+         );
+
+  return &count_tests(\@t) if ($count);
+  &run_test_bunch(\@t, $verbose, 0);
+  return 0;
+}
+
+####
+#### pack tests
+####
+
+sub run_pack_tests
+{
+  my ($suffix, $silent, $row_type, $verbose, $count)= @_;
+  # Runs the commands in @t, in order, through run_test_bunch().
+  # With $count set, only returns how many commands @t holds.
+  my @t= ($NEW_TEST,
+          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+          "$maria_exe_path/aria_pack$suffix --force -s test1",
+          "$maria_exe_path/aria_chk$suffix -ess test1",
+          "$maria_exe_path/aria_chk$suffix -rqs test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          "$maria_exe_path/aria_chk$suffix -rs test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          "$maria_exe_path/aria_chk$suffix -rus test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test1$suffix $silent --checksum $row_type",
+          "$maria_exe_path/aria_pack$suffix --force -s test1",
+          "$maria_exe_path/aria_chk$suffix -rus --safe-recover test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test1$suffix $silent --checksum -S $row_type",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          "$maria_exe_path/aria_chk$suffix -ros test1",
+          "$maria_exe_path/aria_chk$suffix -rqs test1",
+          "$maria_exe_path/aria_chk$suffix -se test1",
+          $NEW_TEST,
+          "$maria_exe_path/aria_pack$suffix --force -s test1",
+          "$maria_exe_path/aria_chk$suffix -rqs test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          "$maria_exe_path/aria_chk$suffix -rus test1",
+          "$maria_exe_path/aria_chk$suffix -es test1",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test2$suffix $silent -c -d1 $row_type",
+          "$maria_exe_path/aria_chk$suffix -s --parallel-recover test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          "$maria_exe_path/aria_pack$suffix --force -s test1",
+          "$maria_exe_path/aria_chk$suffix -s --unpack --parallel-recover test2",
+          "$maria_exe_path/aria_chk$suffix -se test2",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test1$suffix $silent -c $row_type",
+          "cp test1.MAD test2.MAD",
+          "cp test1.MAI test2.MAI",
+          "$maria_exe_path/aria_pack$suffix --force -s --join=test3 test1 test2",
+          "$maria_exe_path/aria_chk$suffix -s test3",
+          "$maria_exe_path/aria_chk$suffix -s --safe-recover test3",
+          "$maria_exe_path/aria_chk$suffix -s test3"
+         );
+
+  return &count_tests(\@t) if ($count);
+  &run_test_bunch(\@t, $verbose, 0);
+  return 0;
+}
+
+####
+#### Tests that gives warnings or errors
+####
+
+sub run_tests_on_warnings_and_errors
+{
+  my ($suffix, $silent, $verbose, $count)= @_;
+  my ($com);
+  # $com holds a command line assembled in two steps further down
+  return 9 if ($count);  # Number of tests in this function, e.g. calls to ok()
+
+  ok("$maria_exe_path/ma_test2$suffix $silent -L -K -W -P -S -R1 -m500",
+     $verbose, 0);
+  ok("$maria_exe_path/aria_chk$suffix -sm test2", $verbose, 0);
+  # ma_test2$suffix $silent -L -K -R1 -m2000 ;  Should give error 135
+  # In the following a failure is a success and success is a failure
+  $com=  "$maria_exe_path/ma_test2$suffix $silent -L -K -R1 -m2000 ";
+  $com.= ">ma_test2_message.txt 2>&1";
+  ok($com, $verbose, 0, 1);  # 4th argument: the exit status that must occur
+  ok("cat ma_test2_message.txt", $verbose, 0);
+  ok("grep \"Error: 135\" ma_test2_message.txt > /dev/null", $verbose, 0);
+  # maria_exe_path/aria_chk$suffix -sm test2 will warn that
+  # Datafile is almost full
+  ok("$maria_exe_path/aria_chk$suffix -sm test2 >ma_test2_message.txt 2>&1",
+     $verbose, 0);
+  ok("cat ma_test2_message.txt", $verbose, 0);
+  ok("grep \"warning: Datafile is almost full\" ma_test2_message.txt>/dev/null",
+     $verbose, 0);
+  unlink <ma_test2_message.txt>;
+  ok("$maria_exe_path/aria_chk$suffix -ssm test2", $verbose, 0);
+
+  return 0;
+}
+
+####
+#### Test that removing tables and applying the log leads to identical tables
+####
+
+sub run_ma_test_recovery
+{
+  my ($be_verbose, $only_count)= @_;
+  # Counting mode: report the single ok() call below without running it
+  return 1 if $only_count;
+  ok("$maria_path/unittest/ma_test_recovery.pl", $be_verbose, 0);
+  return 0;
+}
+
+####
+#### Tests on CLR's
+####
+
+sub run_tests_on_clrs
+{
+  my ($suffix, $verbose, $count)= @_;
+  # Runs @t via run_test_bunch() with log clean-up at each $NEW_TEST marker.
+  # NOTE(review): "cp tmp/aria_log_control ." presumes ./tmp is a directory.
+  my @t= ($NEW_TEST,
+          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
+          "cp aria_log_control tmp",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -s -e test2",
+          "cp tmp/aria_log_control .",
+          "rm test2.MA?",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -s -e test2",
+          "rm test2.MA?",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b -t2 -A1",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -s -e test2",
+          "rm test2.MA?",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -e -s test2",
+          "rm test2.MA?",
+          $NEW_TEST,
+          "$maria_exe_path/ma_test2$suffix -s -L -K -W -P -M -T -c -b32768 -t4 -A1",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -es test2",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -es test2",
+          "rm test2.MA?",
+          "$maria_exe_path/aria_read_log$suffix -a -s",
+          "$maria_exe_path/aria_chk$suffix -es test2",
+          "rm test2.MA?"
+         );
+
+  return &count_tests(\@t) if ($count);
+  &run_test_bunch(\@t, $verbose, 1);
+  return 0;
+}
+
+#
+# Print "ok" on success and "not ok" on error
+#
+# Note: Every time this function is called it will be counted
+# as a unit test.
+#
+# Args: $com:            The actual command run. Will be printed on a failure
+#       $verbose:        Be more verbose.
+#       $iteration:      Number of iterations in a loop when the error
+#                        occurred. If not in loop, this should be blank
+#                        (e.g. send zero).
+#       $expected_error: Optional; put here expected error code. Test
+#                        will pass with this result only.
+#
+# Return value:          1 on success; 0 on an error or when the test comes
+#                        before $test_begin. Exits once past $test_end.
+
+sub ok
+{
+  my ($com, $verbose, $iteration, $expected_error)= @_;
+  my ($msg, $output, $err, $len);
+
+  $test_counter++;
+  if ($test_begin > $test_counter)
+  {
+    return 0;
+  }
+  if ($test_end && $test_end < $test_counter)
+  {
+    exit($runtime_error);      # do not hide failures of the tests already run
+  }
+
+  $msg= "";
+  $expected_error= 0 if (!defined($expected_error));
+
+  if ($verbose)
+  {
+    print "$com ";
+  }
+  $output= `$com 2>&1`;
+  $err= $?;                    # take the status right after the command ran
+  $len= length($com);
+  if ($verbose && $len < 62)   # a negative repeat count can warn under $^W
+  {
+    print " " x (62 - $len);
+  }
+  if ((!$err && !$expected_error) ||
+      (($err >> 8) == $expected_error && $expected_error))
+  {
+    print "[ " if ($verbose);
+    print "ok";
+    if ($verbose)
+    {
+      print " ]";
+      print " " x (5 - length("$test_counter"));
+      print "$test_counter";
+    }
+    else
+    {
+      print " $test_counter - $com"
+    }
+    print "\n";
+    return 1;
+  }
+  print "[ " if ($verbose);
+  print "not ok";
+  print " ]" if ($verbose);
+  print " $test_counter - $com" unless $verbose;
+  print "\n";
+  if ($verbose && defined($output) && length($output))
+  {
+    print "$output\n";
+  }
+  if (!$verbose)
+  {
+    $msg= "\n"; # Get a nicer output in perl unit test mode
+  }
+  $msg.= "Failed test '$com' ";
+  if ($iteration)
+  {
+    $msg.= "(loop iteration $iteration.) ";
+  }
+  $msg.= "at line ";
+  $msg.= (caller)[2];
+  $msg.= "\n(errcode: $err, test: $test_counter)\n";
+  if ($expected_error)
+  {
+    $msg.= "Was expecting errcode: $expected_error\n";
+  }
+  warn $msg;
+  $runtime_error= 1;
+  if ($opt_abort_on_error)
+  {
+    exit 1;
+  }
+  return 0;
+}
+
+#
+# Print "skip" and the reason
+#
+# Note: Every time this function is called it will be counted
+# as a unit test.
+#
+# Args: $com:            The command that is skipped (printed if $verbose)
+#       $reason:         The reason to skip a test
+#       $verbose:        Be more verbose.
+# Return value:          1 if reported, 0 if before $test_begin
+
+sub skip
+{
+  my ($com, $reason, $verbose)= @_;
+
+  $test_counter++;
+  return 0 if $test_begin > $test_counter;
+  exit $runtime_error if $test_end && $test_end < $test_counter;
+  printf '%-64s[ skipped ]%5d', $com, $test_counter if $verbose;
+  print "ok $test_counter # skip $reason" unless $verbose;
+  print "\n";
+  return 1;
+}
+
+####
+#### Count tests
+#### Arguments: $t:    an array of the tests
+####
+
+sub count_tests
+{
+  my ($t)= @_;
+  my $total= 0;
+
+  # Stop at the first undefined slot; false entries are group separators
+  foreach my $entry (@$t) { last if (!defined($entry)); $total++ if ($entry); }
+  return $total;
+}
+
+####
+#### Run a bunch of tests
+#### Arguments: $t:       an array of the tests
+####            $verbose: to be passed to ok()
+####            $clear:   clear log files if set
+####
+
+sub run_test_bunch
+{
+  my ($t, $verbose, $clear)= @_;
+  my $pos= 0;                  # 1-based position, given to ok() as iteration
+
+  foreach my $cmd (@$t)
+  {
+    last if (!defined($cmd));
+    $pos++;
+    # A group separator only triggers the optional log clean-up
+    unlink <aria_log.* aria_log_control> if ($clear && $cmd eq $NEW_TEST);
+    next if ($cmd eq $NEW_TEST);
+
+    ok($cmd, $verbose, $pos);
+    # ok() records failures itself; its return value is not needed here
+  }
+}
+
+####
+#### usage: print the help text, then exit (status 1 unless --help was given)
+####
+
+sub usage
+{
+  print <<EOF;
+$my_progname version $VER
+
+Description:
+
+Run various Aria related tests. Typically used via make test as a unittest.
+
+Options
+--help             Show this help and exit.
+--abort-on-error   Abort at once in case of error.
+--number-of-tests  Print the total number of tests and exit.
+--run-tests=...    Test number(s) that should be run. You can give just
+                   one number or a range. For example 45..89. To run a specific
+                   test alone, for example test 215, use --run-tests=215..215
+                   Use this option with caution, because some of the tests
+                   might depend on previous ones.
+--start-from=...   Alias for --run-tests
+--silent=...       Silent option passed to ma_test* tests ('$opt_silent')
+--valgrind=...     Options for valgrind.
+                   ('$opt_valgrind')
+--verbose          Be more verbose. Will print each unittest on a line
+                   and result after. This mode cannot be used with unit.pl
+                   when running in normal unit test mode.
+--version          Show version number and exit.
+EOF
+  exit($opt_help ? 0 : 1);  # shown after an option error: report failure
+}
diff --git a/storage/maria/unittest/ma_test_loghandler-t.c b/storage/maria/unittest/ma_test_loghandler-t.c
new file mode 100644
index 00000000000..ffac9b04839
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler-t.c
@@ -0,0 +1,661 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void example_loghandler_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+static TRN *trn= &dummy_transaction_object;
+
+#define PCACHE_SIZE (1024*1024*10)
+
+#define LONG_BUFFER_SIZE (100 * 1024)
+
+#ifdef LONG_LOG_TEST
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE (1024L*1024L*8)
+#define ITERATIONS (1600*4)
+
+#else
+#undef SKIP_BIG_TESTS
+#define SKIP_BIG_TESTS(X) /* no-op */
+#define LOG_FLAGS (TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC)
+#define LOG_FILE_SIZE (1024L*1024L*8L)
+#define ITERATIONS 1600
+#endif
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*1024L
+#define ITERATIONS 181000
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+
+/*
+#define LOG_FLAGS 0
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+/* Generate a pseudo-random size in the range [0, LONG_BUFFER_SIZE + 1] */
+static uint32 rand_buffer_size()
+{
+  /* Multiply in ulonglong so the product is not limited to int range */
+  ulonglong scaled= (ulonglong) rand() * (LONG_BUFFER_SIZE + 1);
+  return (uint32) (scaled / RAND_MAX);
+}
+
+/*
+  Check that the buffer filled correctly
+
+  SYNOPSIS
+    check_content()
+    ptr                  Pointer to the buffer
+    length               length of the buffer
+
+  RETURN
+    0 - OK
+    1 - Error
+*/
+
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+  uchar expected[2];          /* bytes int2store() yields for (offset / 2) */
+  ulong pos;
+
+  for (pos= 0; pos < length; pos++)
+  {
+    if (!(pos & 1))           /* even offset: refresh the expected pair */
+      int2store(expected, pos >> 1);
+    if (ptr[pos] == expected[pos & 1])
+      continue;
+    fprintf(stderr, "Byte # %lu is %x instead of %x",
+            pos, (uint) ptr[pos], (uint) expected[pos & 1]);
+    return 1;
+  }
+  return 0;
+}
+
+
+/*
+  Report a passing test for a read operation (type and LSN are printed)
+
+  SYNOPSIS
+    read_ok()
+    rec                  the record header
+*/
+
+void read_ok(TRANSLOG_HEADER_BUFFER *rec)
+{
+  ok(1, "read record type: %u  LSN: (%lu,0x%lx)",
+     rec->type, LSN_IN_PARTS(rec->lsn));
+}
+
+/*
+  Read the whole record and verify its content after an initial offset
+
+  SYNOPSIS
+    read_and_check_content()
+    rec                  The record header buffer
+    buffer               The buffer to read the record in
+    skip                 Skip this number of bytes of the record content
+
+  RETURN
+    0 - OK
+    1 - Error (short read, or content that fails check_content())
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+                                      uchar *buffer, uint skip)
+{
+  DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE * 2 + 7 * 2 + 2);
+  if (translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL) ==
+      rec->record_length)
+    return check_content(buffer + skip, rec->record_length - skip);
+  return 1;
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  uint32 i;
+  uint32 rec_len;
+  uint pagen;
+  uchar long_tr_id[6];
+  uchar lsn_buff[23]=
+  {
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+  };
+  uchar long_buffer[LONG_BUFFER_SIZE * 2 + LSN_STORE_SIZE * 2 + 2];
+  PAGECACHE pagecache;
+  LSN lsn, lsn_base, first_lsn;
+  TRANSLOG_HEADER_BUFFER rec;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 3];
+  struct st_translog_scanner_data scanner;
+  int rc;
+
+  MY_INIT(argv[0]);
+
+  if (my_set_max_open_files(100) < 100)
+  {
+    fprintf(stderr, "can't allocate 100 file descriptors\n");
+    exit(1);
+  }
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+
+  for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i+= 2)
+  {
+    int2store(long_buffer + i, (i >> 1));
+    /* long_buffer[i]= (i & 0xFF); */
+  }
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TRANSLOG_PAGE_SIZE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppressing of automatic record writing */
+  trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  plan(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1);
+
+  SKIP_BIG_TESTS(((ITERATIONS - 1) * 4 + 1)*2 + ITERATIONS - 1 + 1)
+  {
+
+  srand(122334817L);
+
+  long_tr_id[5]= 0xff;
+
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  trn->short_id= 0;
+  trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+    translog_destroy();
+    ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+    exit(1);
+  }
+  ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+  lsn_base= first_lsn= lsn;
+
+  for (i= 1; i < ITERATIONS; i++)
+  {
+    trn->short_id= i % 0xFFFF;
+    if (i % 2)
+    {
+      lsn_store(lsn_buff, lsn_base);
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+      /* check auto-count feature */
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= NULL;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= 0;
+      if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn,
+                                NULL, LSN_STORE_SIZE, 0, parts, NULL, NULL))
+      {
+        fprintf(stderr, "1 Can't write reference defore record #%lu\n",
+                (ulong) i);
+        translog_destroy();
+        ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+      lsn_store(lsn_buff, lsn_base);
+      if ((rec_len= rand_buffer_size()) < 12)
+        rec_len= 12;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+      /* check record length auto-counting */
+      if (translog_write_record(&lsn,
+                                LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+                                trn, NULL, 0, TRANSLOG_INTERNAL_PARTS + 2,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "1 Can't write var reference defore record #%lu\n",
+                (ulong) i);
+        translog_destroy();
+        ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+    }
+    else
+    {
+      lsn_store(lsn_buff, lsn_base);
+      lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= 23;
+      if (translog_write_record(&lsn,
+                                LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+                                trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "0 Can't write reference defore record #%lu\n",
+                (ulong) i);
+        translog_destroy();
+        ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+      lsn_store(lsn_buff, lsn_base);
+      lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+      if ((rec_len= rand_buffer_size()) < 19)
+        rec_len= 19;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= 14;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+      if (translog_write_record(&lsn,
+                                LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+                                trn, NULL, 14 + rec_len,
+                                TRANSLOG_INTERNAL_PARTS + 2, parts, NULL,
+                                NULL))
+      {
+        fprintf(stderr, "0 Can't write var reference defore record #%lu\n",
+                (ulong) i);
+        translog_destroy();
+        ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+    }
+    int4store(long_tr_id, i);
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              trn, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write record #%lu\n", (ulong) i);
+      translog_destroy();
+      ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+      exit(1);
+    }
+    ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+    lsn_base= lsn;
+
+    if ((rec_len= rand_buffer_size()) < 9)
+      rec_len= 9;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+    if (translog_write_record(&lsn,
+                              LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+                              trn, NULL, rec_len,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write variable record #%lu\n", (ulong) i);
+      translog_destroy();
+      ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+      exit(1);
+    }
+    ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+    if (translog_flush(lsn))
+    {
+      fprintf(stderr, "Can't flush #%lu\n", (ulong) i);
+      translog_destroy();
+      ok(0, "flush");
+      exit(1);
+    }
+    ok(1, "flush");
+  }
+
+  if (translog_flush(translog_get_horizon()))
+  {
+    fprintf(stderr, "Can't flush up to horizon\n");
+    translog_destroy();
+    ok(0, "flush");
+    exit(1);
+  }
+  ok(1, "flush");
+
+  srand(122334817L);
+
+  rc= 1;
+
+  {
+    int len= translog_read_record_header(first_lsn, &rec);
+    if (len == RECHEADER_READ_ERROR)
+    {
+      fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+      goto err;
+    }
+    if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+        rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+        ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+        first_lsn != rec.lsn)
+    {
+      fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+              "data read(0)\n"
+              "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+              "lsn(%lu,0x%lx)\n",
+              (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+              (uint) uint4korr(rec.header), (uint) rec.header[4],
+              (uint) rec.header[5],
+              LSN_IN_PARTS(rec.lsn));
+      goto err;
+    }
+    read_ok(&rec);
+    translog_free_record_header(&rec);
+    lsn= first_lsn;
+    if (translog_scanner_init(first_lsn, 1, &scanner, 0))
+    {
+      fprintf(stderr, "scanner init failed\n");
+      goto err;
+    }
+    for (i= 1;; i++)
+    {
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+                i, errno);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        if (i != ITERATIONS)
+        {
+          fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+                  i, ITERATIONS);
+          goto err;
+        }
+        break;
+      }
+      if (i % 2)
+      {
+        LSN ref;
+        ref= lsn_korr(rec.header);
+        if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != 7 || ref != lsn)
+        {
+          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+                  "data read(%d) "
+                  "type: %u  strid: %u  len: %u"
+                  "ref: (%lu,0x%lx)  (%lu,0x%lx)  "
+                  "lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (uint) rec.record_length,
+                  LSN_IN_PARTS(ref), LSN_IN_PARTS(lsn),
+                  LSN_IN_PARTS(rec.lsn));
+          goto err;
+        }
+      }
+      else
+      {
+        LSN ref1, ref2;
+        ref1= lsn_korr(rec.header);
+        ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+        if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != 23 ||
+            ref1 != lsn ||
+            ref2 != first_lsn ||
+            ((uchar)rec.header[22]) != 0x55 ||
+            ((uchar)rec.header[21]) != 0xAA ||
+            ((uchar)rec.header[20]) != 0x55 ||
+            ((uchar)rec.header[19]) != 0xAA ||
+            ((uchar)rec.header[18]) != 0x55 ||
+            ((uchar)rec.header[17]) != 0xAA ||
+            ((uchar)rec.header[16]) != 0x55 ||
+            ((uchar)rec.header[15]) != 0xAA ||
+            ((uchar)rec.header[14]) != 0x55)
+        {
+          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+                  "data read(%d) "
+                  "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+                  "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+                  "lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (uint) rec.record_length,
+                  LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+                  (uint) rec.header[14], (uint) rec.header[15],
+                  (uint) rec.header[16], (uint) rec.header[17],
+                  (uint) rec.header[18], (uint) rec.header[19],
+                  (uint) rec.header[20], (uint) rec.header[21],
+                  (uint) rec.header[22],
+                  LSN_IN_PARTS(rec.lsn));
+          goto err;
+        }
+      }
+      read_ok(&rec);
+      translog_free_record_header(&rec);
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+                "failed (%d)\n", i, errno);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+                "instead of beginning of %u\n", i, ITERATIONS);
+        goto err;
+      }
+      if (i % 2)
+      {
+        LSN ref;
+        ref= lsn_korr(rec.header);
+        if ((rec_len= rand_buffer_size()) < 12)
+          rec_len= 12;
+        if (rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != rec_len + LSN_STORE_SIZE ||
+            len != 12 || ref != lsn ||
+            check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+        {
+          fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+                  "data read(%d)"
+                  "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+                  "hdr len: %u (%d), "
+                  "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+                  i, (uint) rec.type,
+                  rec.type != LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+                  (uint) rec.short_trid,
+                  rec.short_trid != (i % 0xFFFF),
+                  (ulong) rec.record_length, (ulong) rec_len,
+                  rec.record_length != rec_len + LSN_STORE_SIZE,
+                  (uint) len,
+                  len != 12,
+                  LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+                  (len != 12 || ref != lsn),
+                  check_content(rec.header + LSN_STORE_SIZE,
+                                len - LSN_STORE_SIZE));
+          goto err;
+        }
+        if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+                  "in whole rec read lsn(%lu,0x%lx)\n",
+                  LSN_IN_PARTS(rec.lsn));
+          goto err;
+        }
+      }
+      else
+      {
+        LSN ref1, ref2;
+        ref1= lsn_korr(rec.header);
+        ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+        if ((rec_len= rand_buffer_size()) < 19)
+          rec_len= 19;
+        if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+            len != 19 ||
+            ref1 != lsn ||
+            ref2 != first_lsn ||
+            check_content(rec.header + LSN_STORE_SIZE * 2,
+                          len - LSN_STORE_SIZE * 2))
+        {
+          fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                  "data read(%d) "
+                  "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+                  "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+                  "lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (ulong) rec.record_length, (ulong) rec_len,
+                  len, LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+                  LSN_IN_PARTS(rec.lsn));
+          goto err;
+        }
+        if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                  "in whole rec read lsn(%lu,0x%lx)\n",
+                  LSN_IN_PARTS(rec.lsn));
+          goto err;
+        }
+      }
+      read_ok(&rec);
+      translog_free_record_header(&rec);
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+                i, errno);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        fprintf(stderr, "EOL met at the middle of iteration %u "
+                "instead of beginning of %u\n", i, ITERATIONS);
+        goto err;
+      }
+      if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+          rec.short_trid != (i % 0xFFFF) ||
+          rec.record_length != 6 || uint4korr(rec.header) != i ||
+          ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+      {
+        fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+                "data read(%d)\n"
+                "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+                "lsn(%lu,0x%lx)\n",
+                i, (uint) rec.type, (uint) rec.short_trid,
+                (uint) rec.record_length,
+                (uint) uint4korr(rec.header), (uint) rec.header[4],
+                (uint) rec.header[5],
+                LSN_IN_PARTS(rec.lsn));
+        goto err;
+      }
+      lsn= rec.lsn;
+      read_ok(&rec);
+      translog_free_record_header(&rec);
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      if ((rec_len= rand_buffer_size()) < 9)
+        rec_len= 9;
+      if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+          rec.short_trid != (i % 0xFFFF) ||
+          rec.record_length != rec_len ||
+          len != 9 || check_content(rec.header, (uint)len))
+      {
+        fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+                "data read(%d) "
+                "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+                "lsn(%lu,0x%lx)\n",
+                i, (uint) rec.type, (uint) rec.short_trid,
+                (ulong) rec.record_length, (ulong) rec_len,
+                len, LSN_IN_PARTS(rec.lsn));
+        goto err;
+      }
+      if (read_and_check_content(&rec, long_buffer, 0))
+      {
+        fprintf(stderr,
+                "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                "in whole rec read lsn(%lu,0x%lx)\n",
+                LSN_IN_PARTS(rec.lsn));
+        goto err;
+      }
+      read_ok(&rec);
+      translog_free_record_header(&rec);
+    }
+  }
+
+  rc= 0;
+err:
+  if (rc)
+    ok(0, "read record");
+  } /* SKIP_BIG_TESTS */
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+
+  if (maria_log_remove())
+    exit(1);
+
+  return(test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
new file mode 100644
index 00000000000..06d9a00c04c
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_first_lsn-t.c
@@ -0,0 +1,160 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();          /* not defined in this file */
+extern void translog_example_table_init();  /* not defined in this file */
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;     /* assigned in main() */
+#endif
+/* Arguments main() passes to init_pagecache()/translog_init_with_table() */
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+/* Log file that main() deletes with my_delete() before the test starts */
+static char *first_translog_file= (char*)"maria_log.00000001";
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  uint pagen;
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN lsn, first_lsn, theor_lsn;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  /* Checks translog_first_lsn_in_log() on an empty log and after one write */
+  MY_INIT(argv[0]);
+
+  plan(2);                     /* "Empty log response", "Full log response" */
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+  /* be sure that we have no logs in the directory */
+  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+  my_delete(first_translog_file, MYF(0));
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)                /* any extra argument switches tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppressing of automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  theor_lsn= translog_first_theoretical_lsn();
+  if (theor_lsn == 1)          /* 1 is handled as the read-error value */
+  {
+    fprintf(stderr, "Error reading the first log file.\n");
+    translog_destroy();
+    exit(1);
+  }
+  if (theor_lsn == LSN_IMPOSSIBLE)
+  {
+    fprintf(stderr, "There is no first log file.\n");
+    translog_destroy();
+    exit(1);
+  }
+  first_lsn= translog_first_lsn_in_log();
+  if (first_lsn != LSN_IMPOSSIBLE)  /* nothing written yet: expect no LSN */
+  {
+    fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).\n",
+            LSN_IN_PARTS(first_lsn));
+    translog_destroy();
+    exit(1);
+  }
+  ok(1, "Empty log response");
+
+  /* Write one 6-byte fixed record so that the log is no longer empty */
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+    translog_destroy();
+    exit(1);
+  }
+
+  theor_lsn= translog_first_theoretical_lsn();
+  if (theor_lsn == 1)
+  {
+    fprintf(stderr, "Error reading the first log file\n");
+    translog_destroy();
+    exit(1);
+  }
+  if (theor_lsn == LSN_IMPOSSIBLE)
+  {
+    fprintf(stderr, "There is no first log file\n");
+    translog_destroy();
+    exit(1);
+  }
+  first_lsn= translog_first_lsn_in_log();
+  if (first_lsn != theor_lsn)  /* the first real LSN must be the theoretical */
+  {
+    fprintf(stderr, "Incorrect first lsn: (%lu,0x%lx)  "
+            " theoretical first: (%lu,0x%lx)\n",
+            LSN_IN_PARTS(first_lsn), LSN_IN_PARTS(theor_lsn));
+    translog_destroy();
+    exit(1);
+  }
+
+  ok(1, "Full log response");
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  if (maria_log_remove())
+    exit(1);
+  exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
new file mode 100644
index 00000000000..64f486b8cf1
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_max_lsn-t.c
@@ -0,0 +1,156 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();          /* not defined in this file */
+extern void translog_example_table_init();  /* not defined in this file */
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;     /* assigned in main() */
+#endif
+/* Arguments main() passes to init_pagecache()/translog_init_with_table() */
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)   /* also bounds main()'s write loop */
+#define LOG_FLAGS 0
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  ulong i;
+  uint pagen;
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN lsn, max_lsn, last_lsn= LSN_IMPOSSIBLE;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  /* Checks translog_get_file_max_lsn_stored(1) before and after many writes */
+  MY_INIT(argv[0]);
+
+  plan(2);                     /* "Empty log response", "First file max LSN" */
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)                /* any extra argument switches tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppressing of automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  max_lsn= translog_get_file_max_lsn_stored(1);
+  if (max_lsn == 1)            /* 1 is handled as the read-error value */
+  {
+    fprintf(stderr, "Error reading the first log file.\n");
+    translog_destroy();
+    exit(1);
+  }
+  if (max_lsn != LSN_IMPOSSIBLE)    /* nothing written yet: expect no LSN */
+  {
+    fprintf(stderr, "Incorrect first lsn response (%lu,0x%lx).\n",
+            LSN_IN_PARTS(max_lsn));
+    translog_destroy();
+    exit(1);
+  }
+  ok(1, "Empty log response");
+
+  /* last_lsn tracks the last LSN whose file number is still 1 */
+  /* write more than 1 file */
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  for(i= 0; i < LOG_FILE_SIZE/6; i++)
+  {
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              &dummy_transaction_object, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write record #%lu\n", i);
+      translog_destroy();
+      exit(1);
+    }
+    if (LSN_FILE_NO(lsn) == 1)
+      last_lsn= lsn;
+  }
+
+  /* The max LSN reported for file 1 must equal the last LSN written to it */
+  max_lsn= translog_get_file_max_lsn_stored(1);
+  if (max_lsn == 1)
+  {
+    fprintf(stderr, "Error reading the first log file\n");
+    translog_destroy();
+    exit(1);
+  }
+  if (max_lsn == LSN_IMPOSSIBLE)
+  {
+    fprintf(stderr, "Isn't first file still finished?!!\n");
+    translog_destroy();
+    exit(1);
+  }
+  if (max_lsn != last_lsn)
+  {
+    fprintf(stderr, "Incorrect max lsn: (%lu,0x%lx)  "
+            " last lsn on first file: (%lu,0x%lx)\n",
+            LSN_IN_PARTS(max_lsn), LSN_IN_PARTS(last_lsn));
+    translog_destroy();
+    exit(1);
+  }
+
+  ok(1, "First file max LSN");
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  if (maria_log_remove())
+    exit(1);
+  exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multigroup-t.c b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
new file mode 100644
index 00000000000..7ba7ce3176d
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multigroup-t.c
@@ -0,0 +1,746 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+#include "sequence_storage.h"
+#include <my_getopt.h>
+
+extern my_bool maria_log_remove();          /* not defined in this file */
+extern void translog_example_table_init();  /* not defined in this file */
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;     /* initialised further down */
+#endif
+static TRN *trn= &dummy_transaction_object; /* transaction used for writes */
+
+/* Test sizes; building with READONLY_TEST selects the smaller record set */
+#ifndef READONLY_TEST
+
+#define PCACHE_SIZE (1024*1024*10)
+#define LONG_BUFFER_SIZE ((1024L*1024L*1024L) + (1024L*1024L*512))
+#define MIN_REC_LENGTH (1024L*1024L + 1024L*512L + 1)
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 2
+#define READONLY 0
+
+#else
+
+#define PCACHE_SIZE (1024*1024*10)
+#define LONG_BUFFER_SIZE (1024L*1024L)  /* get_len() stays below this */
+#define MIN_REC_LENGTH (1024L)          /* smallest value from get_len() */
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define ITERATIONS 2
+#define READONLY 1
+
+#endif /*READONLY_TEST*/
+
+
+/*
+#define LOG_FILE_SIZE 1024L*1024L*3L
+#define ITERATIONS 1600
+*/
+/*
+#define LOG_FILE_SIZE 1024L*1024L*100L
+#define ITERATIONS 65000
+*/
+
+
+/*
+  Check that each byte i of the buffer equals byte i % 4 of int4store(i >> 2)
+
+  SYNOPSIS
+    check_content()
+    ptr                  Pointer to the buffer
+    length               Number of bytes to check
+
+  RETURN
+    0 - OK
+    1 - Error (the first mismatching byte is reported on stderr)
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+  ulong i;
+  uchar buff[4];                   /* expected bytes of the current 4-group */
+  DBUG_ENTER("check_content");
+  for (i= 0; i < length; i++)
+  {
+    if (i % 4 == 0)                /* i == 0 fills buff before its first use */
+      int4store(buff, (i >> 2));
+    if (ptr[i] != buff[i % 4])
+    {                              /* report, dump the neighbourhood, fail */
+      fprintf(stderr, "Byte # %lu is %x instead of %x\n",
+              i, (uint) ptr[i], (uint) buff[i % 4]);
+      DBUG_DUMP("mem", ptr +(ulong) (i > 16 ? i - 16 : 0),
+                (i > 16 ? 16 : i) + (i + 16 < length ? 16 : length - i));
+      DBUG_RETURN(1);
+    }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  Read a whole record back from the log and verify its content pattern
+
+  SYNOPSIS
+    read_and_check_content()
+    rec                  Header of the record to read (lsn, record_length)
+    buffer               Destination for rec->record_length bytes
+    skip                 Leading bytes of the record excluded from the check
+
+  RETURN
+    0 - OK
+    1 - Error (short read and/or content mismatch)
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+                                      uchar *buffer, uint skip)
+{
+  translog_size_t got;
+  int failed= 0;
+  DBUG_ENTER("read_and_check_content");
+  DBUG_ASSERT(rec->record_length < LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+  got= translog_read_record(rec->lsn, 0, rec->record_length, buffer, NULL);
+  if (got != rec->record_length)           /* fewer bytes than requested */
+  {
+    fprintf(stderr, "Requested %lu byte, read %lu\n",
+            (ulong) rec->record_length, (ulong) got);
+    failed= 1;
+  }
+  failed|= check_content(buffer + skip, rec->record_length - skip);
+  DBUG_RETURN(failed);                     /* content is checked either way */
+}
+
+static const char *load_default_groups[]= {"ma_unit_loghandler", 0};
+#ifndef DBUG_OFF
+static const char *default_dbug_option=      /* used when --debug has no arg */
+  IF_WIN("d:t:i:O,\\ma_test_loghandler.trace",
+         "d:t:i:o,/tmp/ma_test_loghandler.trace");
+#endif
+static const char *opt_wfile= NULL;          /* value of --write-seq / -w */
+static const char *opt_rfile= NULL;          /* value of --read-seq / -r */
+static struct my_option my_long_options[] =  /* passed to handle_options() */
+{
+#ifndef DBUG_OFF
+  {"debug", '#', "Output debug log. Often the argument is 'd:t:o,filename'.",
+   0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
+#endif
+  {"write-seq", 'w', "Path to file in which \"random\" sequence  used in the test will be written",
+    (uchar**) &opt_wfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"read-seq", 'r', "Path to file from which \"random\" sequence  used in the test will be read",
+    (uchar**) &opt_rfile, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+  {"help", '?', "Display this help and exit.",
+   0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+  { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0} /* all-zero last */
+};
+static SEQ_STORAGE seq;          /* read by get_len() when opt_rfile is set */
+
+static uint32 get_len()
+{
+  uint32 length;
+  DBUG_ENTER("get_len");
+  /* Without --read-seq the next length comes from rand() */
+  if (!opt_rfile)
+  {
+    /* Scale rand() into [MIN_REC_LENGTH, LONG_BUFFER_SIZE - 1] */
+    ulonglong span= LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1;
+    length= (uint32) ((ulonglong) rand() * span / RAND_MAX) + MIN_REC_LENGTH;
+    if (opt_wfile && seq_storage_write(opt_wfile, length))
+      exit(1);                   /* non-zero from the write aborts the test */
+  }
+  else
+    length= seq_storage_next(&seq);    /* take the next stored value */
+  DBUG_PRINT("info", ("length value : %lu", (ulong) length));
+  DBUG_RETURN(length);
+}
+
+static void usage(void)        /* prints banner, usage line and option info */
+{
+  puts("Copyright (C) 2008 MySQL AB");
+  puts("This software comes with ABSOLUTELY NO WARRANTY. This is free software,");
+  puts("and you are welcome to modify and redistribute it under the GPL license\n");
+
+  puts("Unit test of maria engine");
+  VOID(printf("\nUsage: %s [OPTIONS]\n", my_progname_short));
+  my_print_help(my_long_options);
+  print_defaults("my", load_default_groups);
+  my_print_variables(my_long_options);
+}
+
+
+static my_bool
+get_one_option(int optid __attribute__((unused)),
+               const struct my_option *opt __attribute__((unused)),
+               char *argument __attribute__((unused)))
+{
+  /* Option callback handed to handle_options(); it always returns 0 */
+  if (optid == '?')
+  {
+    usage();                             /* help requested: print and leave */
+    exit(0);
+  }
+#ifndef DBUG_OFF
+  if (optid == '#')                      /* no argument: use the default */
+    DBUG_SET_INITIAL(argument ? argument : default_dbug_option);
+#endif
+  return 0;
+}
+
+
+static void get_options(int *argc,char ***argv)
+{
+  /* Parse the command line; a non-zero parser result becomes the exit code */
+  const int rc= handle_options(argc, argv, my_long_options, get_one_option);
+  if (rc != 0)
+    exit(rc);
+  /* Giving both a read file and a write file is refused with the usage text */
+  if (opt_rfile != NULL && opt_wfile != NULL)
+  {
+    usage();
+    exit(1);
+  }
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  uint32 i;
+  uint32 rec_len;
+  uint pagen;
+  uchar long_tr_id[6];
+  uchar lsn_buff[23]=
+  {
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA,
+    0x55, 0xAA, 0x55, 0xAA, 0x55, 0xAA, 0x55
+  };
+  uchar *long_buffer= malloc(LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2);
+  char **default_argv;
+  PAGECACHE pagecache;
+  LSN lsn, lsn_base, first_lsn;
+  TRANSLOG_HEADER_BUFFER rec;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 2];
+  struct st_translog_scanner_data scanner;
+  int rc;
+
+  MY_INIT(argv[0]);
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  load_defaults("my", load_default_groups, &argc, &argv);
+  default_argv= argv;
+  get_options(&argc, &argv);
+
+  if (maria_log_remove())
+    exit(1);
+
+  {
+    uchar buff[4];
+    for (i= 0; i < (LONG_BUFFER_SIZE + LSN_STORE_SIZE * 2 + 2); i++)
+    {
+      if (i % 4 == 0)
+        int4store(buff, (i >> 2));
+      long_buffer[i]= buff[i % 4];
+    }
+  }
+
+  bzero(long_tr_id, 6);
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TRANSLOG_PAGE_SIZE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               0, 0, &translog_example_table_init, 0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppressing of automatic record writing */
+  trn->first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  plan(((ITERATIONS - 1) * 4 + 1) * 2);
+
+  if (opt_rfile &&
+      seq_storage_reader_init(&seq, opt_rfile))
+    exit(1);
+  srand(122334817L);
+
+  long_tr_id[5]= 0xff;
+
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  trn->short_id= 0;
+  trn->first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+  if (translog_write_record(&lsn, LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1, parts,
+                            NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%u\n", 0);
+    translog_destroy();
+    ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+    exit(1);
+  }
+  ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+  lsn_base= first_lsn= lsn;
+
+  for (i= 1; i < ITERATIONS; i++)
+  {
+    if (i % 2)
+    {
+      lsn_store(lsn_buff, lsn_base);
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+      trn->short_id= i % 0xFFFF;
+      if (translog_write_record(&lsn,
+                                LOGREC_FIXED_RECORD_1LSN_EXAMPLE, trn, NULL,
+                                LSN_STORE_SIZE, TRANSLOG_INTERNAL_PARTS + 1,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "1 Can't write reference before record #%u\n", i);
+        translog_destroy();
+        ok(0, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_FIXED_RECORD_1LSN_EXAMPLE");
+      lsn_store(lsn_buff, lsn_base);
+      rec_len= get_len();
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+      trn->short_id= i % 0xFFFF;
+      if (translog_write_record(&lsn,
+                                LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+                                trn, NULL, LSN_STORE_SIZE + rec_len,
+                                TRANSLOG_INTERNAL_PARTS + 2,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "1 Can't write var reference before record #%u\n", i);
+        translog_destroy();
+        ok(0, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE");
+    }
+    else
+    {
+      lsn_store(lsn_buff, lsn_base);
+      lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= 23;
+      trn->short_id= i % 0xFFFF;
+      if (translog_write_record(&lsn,
+                                LOGREC_FIXED_RECORD_2LSN_EXAMPLE,
+                                trn, NULL, 23, TRANSLOG_INTERNAL_PARTS + 1,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "0 Can't write reference before record #%u\n", i);
+        translog_destroy();
+        ok(0, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_FIXED_RECORD_2LSN_EXAMPLE");
+      lsn_store(lsn_buff, lsn_base);
+      lsn_store(lsn_buff + LSN_STORE_SIZE, first_lsn);
+      rec_len= get_len();
+      parts[TRANSLOG_INTERNAL_PARTS + 0].str= lsn_buff;
+      parts[TRANSLOG_INTERNAL_PARTS + 0].length= LSN_STORE_SIZE * 2;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].str= long_buffer;
+      parts[TRANSLOG_INTERNAL_PARTS + 1].length= rec_len;
+      trn->short_id= i % 0xFFFF;
+      if (translog_write_record(&lsn,
+                                LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE,
+                                trn, NULL, LSN_STORE_SIZE * 2 + rec_len,
+                                TRANSLOG_INTERNAL_PARTS + 2,
+                                parts, NULL, NULL))
+      {
+        fprintf(stderr, "0 Can't write var reference before record #%u\n", i);
+        translog_destroy();
+        ok(0, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+        exit(1);
+      }
+      ok(1, "write LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE");
+    }
+    int4store(long_tr_id, i);
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+    trn->short_id= i % 0xFFFF;
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              trn, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write record #%u\n", i);
+      translog_destroy();
+      ok(0, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+      exit(1);
+    }
+    ok(1, "write LOGREC_FIXED_RECORD_0LSN_EXAMPLE");
+
+    lsn_base= lsn;
+
+    rec_len= get_len();
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+    trn->short_id= i % 0xFFFF;
+    if (translog_write_record(&lsn,
+                              LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+                              trn, NULL, rec_len,
+                              TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write variable record #%u\n", i);
+      translog_destroy();
+      ok(0, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+      exit(1);
+    }
+    ok(1, "write LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE");
+  }
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+
+  if (ma_control_file_open(TRUE,TRUE))
+  {
+    fprintf(stderr, "pass2: Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TRANSLOG_PAGE_SIZE, 0)) == 0)
+  {
+    fprintf(stderr, "pass2: Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               0, READONLY, &translog_example_table_init, 0))
+  {
+    fprintf(stderr, "pass2: Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+
+
+  /* If we were writing sequence we need it only once */
+  opt_wfile= NULL;
+  if (opt_rfile)
+    seq_storage_rewind(&seq);
+  srand(122334817L);
+
+  rc= 1;
+
+  {
+    int len= translog_read_record_header(first_lsn, &rec);
+    if (len == RECHEADER_READ_ERROR)
+    {
+      fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+      translog_free_record_header(&rec);
+      goto err;
+    }
+    if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+        rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+        ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+        first_lsn != rec.lsn)
+    {
+      fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+              "data read(0)\n"
+              "type %u, strid %u, len %u, i: %u, 4: %u 5: %u, "
+              "lsn(0x%lu,0x%lx)\n",
+              (uint) rec.type, (uint) rec.short_trid, (uint) rec.record_length,
+              (uint)uint4korr(rec.header), (uint) rec.header[4],
+              (uint) rec.header[5],
+              LSN_IN_PARTS(rec.lsn));
+      translog_free_record_header(&rec);
+      goto err;
+    }
+    ok(1, "read record");
+    translog_free_record_header(&rec);
+    lsn= first_lsn;
+    if (translog_scanner_init(first_lsn, 1, &scanner, 0))
+    {
+      fprintf(stderr, "scanner init failed\n");
+      goto err;
+    }
+    for (i= 1;; i++)
+    {
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+                i, errno);
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        if (i != ITERATIONS)
+        {
+          fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+                  i, ITERATIONS);
+          translog_free_record_header(&rec);
+          goto err;
+        }
+        break;
+      }
+
+      if (i % 2)
+      {
+        LSN ref;
+        ref= lsn_korr(rec.header);
+        if (rec.type != LOGREC_FIXED_RECORD_1LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != LSN_STORE_SIZE || ref != lsn)
+        {
+          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_1LSN_EXAMPLE "
+                  "data read(%d)"
+                  "type %u, strid %u, len %u, ref(%lu,0x%lx), lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (uint) rec.record_length,
+                  LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      else
+      {
+        LSN ref1, ref2;
+        ref1= lsn_korr(rec.header);
+        ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+        if (rec.type != LOGREC_FIXED_RECORD_2LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != 23 ||
+            ref1 != lsn ||
+            ref2 != first_lsn ||
+            ((uchar)rec.header[22]) != 0x55 ||
+            ((uchar)rec.header[21]) != 0xAA ||
+            ((uchar)rec.header[20]) != 0x55 ||
+            ((uchar)rec.header[19]) != 0xAA ||
+            ((uchar)rec.header[18]) != 0x55 ||
+            ((uchar)rec.header[17]) != 0xAA ||
+            ((uchar)rec.header[16]) != 0x55 ||
+            ((uchar)rec.header[15]) != 0xAA ||
+            ((uchar)rec.header[14]) != 0x55)
+        {
+          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_2LSN_EXAMPLE "
+                  "data read(%d) "
+                  "type %u, strid %u, len %u, ref1(%lu,0x%lx), "
+                  "ref2(%lu,0x%lx) %x%x%x%x%x%x%x%x%x "
+                  "lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (uint) rec.record_length,
+                  LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+                  (uint) rec.header[14], (uint) rec.header[15],
+                  (uint) rec.header[16], (uint) rec.header[17],
+                  (uint) rec.header[18], (uint) rec.header[19],
+                  (uint) rec.header[20], (uint) rec.header[21],
+                  (uint) rec.header[22],
+                  LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          DBUG_ASSERT(0);
+          goto err;
+        }
+      }
+      ok(1, "read record");
+      translog_free_record_header(&rec);
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header (var) "
+                "failed (%d)\n", i, errno);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        fprintf(stderr, "EOL met at the middle of iteration (first var) %u "
+                "instead of beginning of %u\n", i, ITERATIONS);
+        goto err;
+      }
+      if (i % 2)
+      {
+        LSN ref;
+        ref= lsn_korr(rec.header);
+        rec_len= get_len();
+        if (rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != rec_len + LSN_STORE_SIZE ||
+            len != 12 || ref != lsn ||
+            check_content(rec.header + LSN_STORE_SIZE, len - LSN_STORE_SIZE))
+        {
+          fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+                  "data read(%d)"
+                  "type %u (%d), strid %u (%d), len %lu, %lu + 7 (%d), "
+                  "hdr len: %d (%d), "
+                  "ref(%lu,0x%lx), lsn(%lu,0x%lx) (%d), content: %d\n",
+                  i, (uint) rec.type,
+                  rec.type !=LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE,
+                  (uint) rec.short_trid,
+                  rec.short_trid != (i % 0xFFFF),
+                  (ulong) rec.record_length, (ulong) rec_len,
+                  rec.record_length != rec_len + LSN_STORE_SIZE,
+                  len,
+                  len != 12,
+                  LSN_IN_PARTS(ref), LSN_IN_PARTS(rec.lsn),
+                  (ref != lsn),
+                  check_content(rec.header + LSN_STORE_SIZE,
+                                len - LSN_STORE_SIZE));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+        if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_1LSN_EXAMPLE "
+                  "in whole rec read lsn(%lu,0x%lx)\n",
+                  LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      else
+      {
+        LSN ref1, ref2;
+        ref1= lsn_korr(rec.header);
+        ref2= lsn_korr(rec.header + LSN_STORE_SIZE);
+        rec_len= get_len();
+        if (rec.type != LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE ||
+            rec.short_trid != (i % 0xFFFF) ||
+            rec.record_length != rec_len + LSN_STORE_SIZE * 2 ||
+            len != 19 ||
+            ref1 != lsn ||
+            ref2 != first_lsn ||
+            check_content(rec.header + LSN_STORE_SIZE * 2,
+                          len - LSN_STORE_SIZE * 2))
+        {
+          fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                  " data read(%d) "
+                  "type %u, strid %u, len %lu != %lu + 14, hdr len: %d, "
+                  "ref1(%lu,0x%lx), ref2(%lu,0x%lx), "
+                  "lsn(%lu,0x%lx)\n",
+                  i, (uint) rec.type, (uint) rec.short_trid,
+                  (ulong) rec.record_length, (ulong) rec_len,
+                  len,
+                  LSN_IN_PARTS(ref1), LSN_IN_PARTS(ref2),
+                  LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+        if (read_and_check_content(&rec, long_buffer, LSN_STORE_SIZE * 2))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                  "in whole rec read lsn(%lu,0x%lx)\n",
+                  LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      ok(1, "read record");
+      translog_free_record_header(&rec);
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+                i, errno);
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        fprintf(stderr, "EOL met at the middle of iteration %u "
+                "instead of beginning of %u\n", i, ITERATIONS);
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      if (rec.type != LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+          rec.short_trid != (i % 0xFFFF) ||
+          rec.record_length != 6 || uint4korr(rec.header) != i ||
+          ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF)
+      {
+        fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+                "data read(%d)\n"
+                "type %u, strid %u, len %u, i: %u, 4: %u 5: %u "
+                "lsn(%lu,0x%lx)\n",
+                i, (uint) rec.type, (uint) rec.short_trid,
+                (uint) rec.record_length,
+                (uint)uint4korr(rec.header), (uint) rec.header[4],
+                (uint) rec.header[5],
+                LSN_IN_PARTS(rec.lsn));
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      ok(1, "read record");
+      translog_free_record_header(&rec);
+
+      lsn= rec.lsn;
+
+      len= translog_read_next_record_header(&scanner, &rec);
+      rec_len= get_len();
+      if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+          rec.short_trid != (i % 0xFFFF) ||
+          rec.record_length != rec_len ||
+          len != 9 || check_content(rec.header, len))
+      {
+        fprintf(stderr, "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+                "data read(%d) "
+                "type %u, strid %u, len %lu != %lu, hdr len: %d, "
+                "lsn(%lu,0x%lx)\n",
+                i, (uint) rec.type, (uint) rec.short_trid,
+                (ulong) rec.record_length, (ulong) rec_len,
+                len, LSN_IN_PARTS(rec.lsn));
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      if (read_and_check_content(&rec, long_buffer, 0))
+      {
+        fprintf(stderr,
+                "Incorrect LOGREC_VARIABLE_RECORD_2LSN_EXAMPLE "
+                "in whole rec read lsn(%lu,0x%lx)\n",
+                LSN_IN_PARTS(rec.lsn));
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      ok(1, "read record");
+      translog_free_record_header(&rec);
+    }
+  }
+
+  rc= 0;
+err:
+  if (rc)
+    ok(0, "read record");
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  free_defaults(default_argv);
+  seq_storage_destroy(&seq);
+  if (maria_log_remove())
+    exit(1);
+
+  return (test(exit_status()));
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_multithread-t.c b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
new file mode 100644
index 00000000000..354f5d12e08
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_multithread-t.c
@@ -0,0 +1,556 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)      /* bytes given to init_pagecache() */
+
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)     /* 1.5 GB */
+/*#define LOG_FLAGS TRANSLOG_SECTOR_PROTECTION | TRANSLOG_PAGE_CRC */
+#define LOG_FLAGS 0                /* flags for translog_init_with_table() */
+/*#define LONG_BUFFER_SIZE (1024L*1024L*1024L + 1024L*1024L*512)*/
+
+#ifdef MULTIFLUSH_TEST
+/* Many small records, with flusher threads running next to the writers */
+#define LONG_BUFFER_SIZE (16384L)
+#define MIN_REC_LENGTH 10
+#define SHOW_DIVIDER 20
+#define ITERATIONS 10000
+#define FLUSH_ITERATIONS 1000
+#define WRITERS 2
+#define FLUSHERS 10
+
+#else
+/* Default: a few records of up to 512M each and no flusher threads */
+#define LONG_BUFFER_SIZE (512L*1024L*1024L)     /* get_len() stays below it */
+#define MIN_REC_LENGTH 30                       /* smallest get_len() result */
+#define SHOW_DIVIDER 10                         /* not used in this file */
+#define ITERATIONS 3                            /* record pairs per writer */
+#define FLUSH_ITERATIONS 0                      /* flushes per flusher */
+#define WRITERS 3                               /* writer threads */
+#define FLUSHERS 0                              /* flusher threads */
+
+#endif
+
+static uint number_of_writers= WRITERS;   /* writers main still has to start */
+static uint number_of_flushers= FLUSHERS; /* flushers main still has to start */
+
+static pthread_cond_t COND_thread_count;  /* signalled when a thread finishes */
+static pthread_mutex_t LOCK_thread_count; /* guards thread_count, threads' ok() */
+static uint thread_count;                 /* threads started and not finished */
+
+static ulong lens[WRITERS][ITERATIONS];  /* variable record lengths written */
+static LSN lsns1[WRITERS][ITERATIONS];   /* LSNs of the fixed records */
+static LSN lsns2[WRITERS][ITERATIONS];   /* LSNs of the variable records */
+static uchar *long_buffer;               /* pattern buffer, set up by main */
+
+
+static LSN last_lsn; /* For test purposes dirty reads/writes of it are allowed */
+
+/*
+  Produce the next pseudo-random record length from rand()
+
+  SYNOPSIS
+    get_len()
+
+  RETURN
+    length in [MIN_REC_LENGTH .. LONG_BUFFER_SIZE - 1]
+*/
+
+static uint32 get_len()
+{
+  /* Scale rand() in ulonglong arithmetic before narrowing to uint32 */
+  ulonglong scaled= (ulonglong) rand();
+  scaled= scaled * (LONG_BUFFER_SIZE - MIN_REC_LENGTH - 1) / RAND_MAX;
+  return MIN_REC_LENGTH + (uint32) scaled;
+}
+
+
+/*
+  Check that the buffer holds the test pattern: byte k must equal k & 0xFF
+
+  SYNOPSIS
+    check_content()
+    ptr                  Pointer to the buffer
+    length               Number of bytes to check
+
+  RETURN
+    0 - OK
+    1 - Error (the first mismatch is reported on stderr, one full line)
+*/
+
+static my_bool check_content(uchar *ptr, ulong length)
+{
+  ulong i;
+  for (i= 0; i < length; i++)
+  {
+    if (((uchar)ptr[i]) != (i & 0xFF))
+    {
+      fprintf(stderr, "Byte # %lu is %x instead of %x\n",
+              i, (uint) ptr[i], (uint) (i & 0xFF));
+      return 1;
+    }
+  }
+  return 0;
+}
+
+
+/*
+  Read the whole record body and verify its byte pattern
+
+  SYNOPSIS
+    read_and_check_content()
+    rec                  Header of the record to read (lsn, record_length)
+    buffer               Destination for the record_length bytes read
+    skip                 Number of leading bytes excluded from the check
+
+  RETURN
+    0 - OK
+    1 - Short read and/or content mismatch
+*/
+
+static my_bool read_and_check_content(TRANSLOG_HEADER_BUFFER *rec,
+                                      uchar *buffer, uint skip)
+{
+  my_bool failed= 0;
+  translog_size_t got= translog_read_record(rec->lsn, 0, rec->record_length,
+                                            buffer, NULL);
+  if (got != rec->record_length)
+  {
+    failed= 1;
+    fprintf(stderr, "Requested %lu byte, read %lu\n",
+            (ulong) rec->record_length, (ulong) got);
+  }
+  /* The pattern check runs even when the read came up short */
+  if (check_content(buffer + skip, rec->record_length - skip))
+    failed= 1;
+  return failed;
+}
+
+/* Write ITERATIONS pairs (fixed + variable record) with short trid `num',
+   saving each length and LSN in lens/lsns1/lsns2 for the read-back check */
+void writer(int num)
+{
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar long_tr_id[6];
+  TRN trn;
+  LSN lsn;
+  uint iter;
+
+  trn.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+  trn.short_id= num;
+  for (iter= 0; iter < ITERATIONS; iter++)
+  {
+    uint rec_len= get_len();
+    lens[num][iter]= rec_len;
+    /* 6-byte payload: writer number (2 bytes) + iteration (4 bytes) */
+    int2store(long_tr_id, num);
+    int4store(long_tr_id + 2, iter);
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              &trn, NULL, 6, TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write LOGREC_FIXED_RECORD_0LSN_EXAMPLE record #%lu "
+              "thread %i\n", (ulong) iter, num);
+      translog_destroy();
+      pthread_mutex_lock(&LOCK_thread_count);
+      ok(0, "write records");
+      pthread_mutex_unlock(&LOCK_thread_count);
+      return;
+    }
+    lsns1[num][iter]= lsn;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= rec_len;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+    if (translog_write_record(&lsn,
+                              LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+                              &trn, NULL, rec_len, TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write variable record #%lu\n", (ulong) iter);
+      translog_destroy();
+      pthread_mutex_lock(&LOCK_thread_count);
+      ok(0, "write records");
+      pthread_mutex_unlock(&LOCK_thread_count);
+      return;
+    }
+    lsns2[num][iter]= lsn;
+    last_lsn= lsn;
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "write records");
+    pthread_mutex_unlock(&LOCK_thread_count);
+  }
+}
+
+/* Thread entry: run writer() with the number stored in *arg, then report */
+static void *test_thread_writer(void *arg)
+{
+  int *boxed_no= (int*) arg;
+  int writer_no= *boxed_no;
+
+  my_thread_init();
+  writer(writer_no);
+
+  /* Drop our slot in thread_count and wake main, all under the lock */
+  pthread_mutex_lock(&LOCK_thread_count);
+  thread_count--;
+  ok(1, "writer finished"); /* just to show progress */
+  VOID(pthread_cond_signal(&COND_thread_count));
+  pthread_mutex_unlock(&LOCK_thread_count);
+  free(boxed_no);      /* malloc()ed by main for this thread only */
+  my_thread_end();
+  return NULL;
+}
+
+/* Thread entry: flush the log up to last_lsn FLUSH_ITERATIONS times */
+static void *test_thread_flusher(void *arg)
+{
+  int *boxed_no= (int*) arg;
+  int flusher_no= *boxed_no;
+  int remaining= FLUSH_ITERATIONS;
+
+  my_thread_init();
+  while (remaining-- > 0)
+  {
+    /* last_lsn is read without locking; writers update it concurrently */
+    translog_flush(last_lsn);
+    pthread_mutex_lock(&LOCK_thread_count);
+    ok(1, "-- flush %d", flusher_no);
+    pthread_mutex_unlock(&LOCK_thread_count);
+  }
+
+  pthread_mutex_lock(&LOCK_thread_count);
+  thread_count--;
+  ok(1, "flusher finished"); /* just to show progress */
+  VOID(pthread_cond_signal(&COND_thread_count));   /* wake main's wait loop */
+  pthread_mutex_unlock(&LOCK_thread_count);
+  free(boxed_no);
+  my_thread_end();
+  return NULL;
+}
+
+/* Multithreaded log write / read-back test (disabled: see exit(0) below) */
+int main(int argc __attribute__((unused)),
+         char **argv __attribute__ ((unused)))
+{
+  uint32 i;
+  uint pagen;
+  PAGECACHE pagecache;
+  LSN first_lsn;
+  TRANSLOG_HEADER_BUFFER rec;
+  struct st_translog_scanner_data scanner;
+  pthread_t tid;
+  pthread_attr_t thr_attr;
+  int *param, error;
+  int rc;
+
+  /* Disabled until Sanja tests; nothing after exit(0) is ever executed */
+  plan(1);
+  ok(1, "disabled");
+  exit(0);
+  /* One ok() per finished thread, 3 per writer iteration, 1 per flush */
+  plan(WRITERS + FLUSHERS +
+       ITERATIONS * WRITERS * 3 + FLUSH_ITERATIONS * FLUSHERS );
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  long_buffer= malloc(LONG_BUFFER_SIZE + 7 * 2 + 2);
+  if (long_buffer == 0)
+  {
+    fprintf(stderr, "End of memory\n");
+    exit(1);
+  }
+  for (i= 0; i < (LONG_BUFFER_SIZE + 7 * 2 + 2); i++)
+    long_buffer[i]= (i & 0xFF);        /* pattern check_content() verifies */
+
+  MY_INIT(argv[0]);
+  if (maria_log_remove())
+    exit(1);
+
+
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  /* Synchronization objects and attributes for the worker threads */
+  if ((error= pthread_cond_init(&COND_thread_count, NULL)))
+  {
+    fprintf(stderr, "COND_thread_count: %d from pthread_cond_init "
+            "(errno: %d)\n", error, errno);
+    exit(1);
+  }
+  if ((error= pthread_mutex_init(&LOCK_thread_count, MY_MUTEX_INIT_FAST)))
+  {
+    fprintf(stderr, "LOCK_thread_count: %d from pthread_mutex_init "
+            "(errno: %d)\n", error, errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_init(&thr_attr)))
+  {
+    fprintf(stderr, "Got error: %d from pthread_attr_init "
+            "(errno: %d)\n", error, errno);
+    exit(1);
+  }
+  if ((error= pthread_attr_setdetachstate(&thr_attr, PTHREAD_CREATE_DETACHED)))
+  {
+    fprintf(stderr,
+            "Got error: %d from pthread_attr_setdetachstate (errno: %d)\n",
+            error, errno);
+    exit(1);
+  }
+
+#ifdef HAVE_THR_SETCONCURRENCY
+  VOID(thr_setconcurrency(2));
+#endif
+
+  my_thread_global_init();
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             TRANSLOG_PAGE_SIZE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppressing of automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  srand(122334817L);
+  {
+    LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+    uchar long_tr_id[6]=
+    {
+      0x11, 0x22, 0x33, 0x44, 0x55, 0x66
+    };
+
+    parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+    parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+    dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+    if (translog_write_record(&first_lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              &dummy_transaction_object, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write the first record\n");
+      translog_destroy();
+      exit(1);
+    }
+  }
+
+  /* Start the threads; each one gets its own malloc()ed number to free */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (number_of_writers != 0 || number_of_flushers != 0)
+  {
+    if (number_of_writers)
+    {
+      param= (int*) malloc(sizeof(int));
+      *param= number_of_writers - 1;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_writer,
+                                 (void*) param)))
+      {
+        fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n",
+                error, errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_writers--;
+    }
+    if (number_of_flushers)
+    {
+      param= (int*) malloc(sizeof(int));
+      *param= number_of_flushers - 1;
+      if ((error= pthread_create(&tid, &thr_attr, test_thread_flusher,
+                                 (void*) param)))
+      {
+        fprintf(stderr, "Got error: %d from pthread_create (errno: %d)\n",
+                error, errno);
+        exit(1);
+      }
+      thread_count++;
+      number_of_flushers--;
+    }
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  pthread_attr_destroy(&thr_attr);
+
+  /* wait finishing */
+  pthread_mutex_lock(&LOCK_thread_count);
+  while (thread_count)
+  {
+    if ((error= pthread_cond_wait(&COND_thread_count, &LOCK_thread_count)))
+      fprintf(stderr, "COND_thread_count: %d from pthread_cond_wait\n", error);
+  }
+  pthread_mutex_unlock(&LOCK_thread_count);
+
+  /* Find last LSN and flush up to it (all our log) */
+  {
+    LSN max= 0;
+    for (i= 0; i < WRITERS; i++)
+    {
+      if (cmp_translog_addr(lsns2[i][ITERATIONS - 1], max) > 0)
+        max= lsns2[i][ITERATIONS - 1];
+    }
+    translog_flush(max);
+  }
+
+  rc= 1;
+
+  {
+    uint indeces[WRITERS];
+    uint index, stage;
+    int len;
+    /* indeces[t]: number of records already verified for writer t */
+    bzero(indeces, sizeof(indeces));
+
+    if (translog_scanner_init(first_lsn, 1, &scanner, 0))
+    {
+      fprintf(stderr, "scanner init failed\n");
+      goto err;
+    }
+    for (i= 0;; i++)
+    {
+      len= translog_read_next_record_header(&scanner, &rec);
+
+      if (len == RECHEADER_READ_ERROR)
+      {
+        fprintf(stderr, "1-%d translog_read_next_record_header failed (%d)\n",
+                i, errno);
+        translog_free_record_header(&rec);
+        goto err;
+      }
+      if (len == RECHEADER_READ_EOF)
+      {
+        if (i != WRITERS * ITERATIONS * 2)
+        {
+          fprintf(stderr, "EOL met at iteration %u instead of %u\n",
+                  i, ITERATIONS * WRITERS * 2);
+          translog_free_record_header(&rec);
+          goto err;
+        }
+        break;
+      }
+      index= indeces[rec.short_trid] / 2;   /* iteration of that writer */
+      stage= indeces[rec.short_trid] % 2;   /* 0: fixed, 1: variable record */
+      if (stage == 0)
+      {
+        if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE ||
+            rec.record_length != 6 ||
+            uint2korr(rec.header) != rec.short_trid ||
+            index != uint4korr(rec.header + 2) ||
+            cmp_translog_addr(lsns1[rec.short_trid][index], rec.lsn) != 0)
+        {
+          fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+                  "data read(%d)\n"
+                  "type %u, strid %u %u, len %u, i: %u %u, "
+                  "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
+                  i, (uint) rec.type,
+                  (uint) rec.short_trid, (uint) uint2korr(rec.header),
+                  (uint) rec.record_length,
+                  (uint) index, (uint) uint4korr(rec.header + 2),
+                  LSN_IN_PARTS(rec.lsn),
+                  LSN_IN_PARTS(lsns1[rec.short_trid][index]));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      else
+      {
+        if (rec.type != LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE ||
+            len != 9 ||
+            rec.record_length != lens[rec.short_trid][index] ||
+            cmp_translog_addr(lsns2[rec.short_trid][index], rec.lsn) != 0 ||
+            check_content(rec.header, (uint)len))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+                  "data read(%d) "
+                  "thread: %d, iteration %d, stage %d\n"
+                  "type %u (%d), len %d, length %lu %lu (%d) "
+                  "lsn(%lu,0x%lx) (%lu,0x%lx)\n",
+                  i, (uint) rec.short_trid, index, stage,
+                  (uint) rec.type, (rec.type !=
+                                    LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE),
+                  len,
+                  (ulong) rec.record_length, lens[rec.short_trid][index],
+                  (rec.record_length != lens[rec.short_trid][index]),
+                  LSN_IN_PARTS(rec.lsn),
+                  LSN_IN_PARTS(lsns2[rec.short_trid][index]));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+        if (read_and_check_content(&rec, long_buffer, 0))
+        {
+          fprintf(stderr,
+                  "Incorrect LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE "
+                  "in whole rec read lsn(%lu,0x%lx)\n",
+                  LSN_IN_PARTS(rec.lsn));
+          translog_free_record_header(&rec);
+          goto err;
+        }
+      }
+      ok(1, "record read");
+      translog_free_record_header(&rec);
+      indeces[rec.short_trid]++;
+    }
+  }
+
+  rc= 0;
+err:
+  if (rc)
+    ok(0, "record read");
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  if (maria_log_remove())
+    exit(1);
+
+  return(exit_status());
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_noflush-t.c b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
new file mode 100644
index 00000000000..973dfd03bcf
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_noflush-t.c
@@ -0,0 +1,146 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+/* First log file of the test; deleted up front so the run starts clean */
+static char *first_translog_file= (char*)"aria_log.00000001";
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  uint pagen;
+  int rc= 1;                            /* exit code; set to 0 on success */
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN first_lsn;
+  TRANSLOG_HEADER_BUFFER rec;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  translog_size_t len;
+  /* Writes one 6-byte fixed record and reads its header straight back */
+  MY_INIT(argv[0]);
+
+  plan(1);
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+  /* Make sure no control file or first log is left in the directory */
+  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+  my_delete(first_translog_file, MYF(0));
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)                         /* any argument turns tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppress automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  int4store(long_tr_id, 0);
+  long_tr_id[5]= 0xff;                  /* marker byte, checked after read */
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&first_lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+    translog_destroy();
+    exit(1);
+  }
+  /* Read the header back; no explicit flush call precedes this read */
+  len= translog_read_record_header(first_lsn, &rec);
+  if (len == 0)
+  {
+    fprintf(stderr, "translog_read_record_header failed (%d)\n", errno);
+    goto err;
+  }
+  if (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE || rec.short_trid != 0 ||
+      rec.record_length != 6 || uint4korr(rec.header) != 0 ||
+      ((uchar)rec.header[4]) != 0 || ((uchar)rec.header[5]) != 0xFF ||
+      first_lsn != rec.lsn)
+  {
+    fprintf(stderr, "Incorrect LOGREC_FIXED_RECORD_0LSN_EXAMPLE "
+            "data read(0)\n"
+            "type: %u (%d)  strid: %u (%d)  len: %u (%d)  i: %u (%d), "
+            "4: %u (%d)  5: %u (%d)  "
+            "lsn(%lu,0x%lx) (%d)\n",
+            (uint) rec.type, (rec.type !=LOGREC_FIXED_RECORD_0LSN_EXAMPLE),
+            (uint) rec.short_trid, (rec.short_trid != 0),
+            (uint) rec.record_length, (rec.record_length != 6),
+            (uint) uint4korr(rec.header), (uint4korr(rec.header) != 0),
+            (uint) rec.header[4], (((uchar)rec.header[4]) != 0),
+            (uint) rec.header[5], (((uchar)rec.header[5]) != 0xFF),
+            LSN_IN_PARTS(rec.lsn), (first_lsn != rec.lsn));
+    goto err;
+  }
+
+  ok(1, "read OK");
+  rc= 0;
+
+err:
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  if (maria_log_remove())
+    exit(1);
+  /* NOTE(review): failure paths get here without emitting a "not ok" line */
+  exit(rc);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_nologs-t.c b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
new file mode 100644
index 00000000000..34508d1d751
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_nologs-t.c
@@ -0,0 +1,195 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+/* Passed to translog_init_with_table() in main(); defined outside this file */
+extern void translog_example_table_init();
+#ifndef DBUG_OFF
+static const char *default_dbug_option;
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)
+#define LOG_FLAGS 0
+#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2)
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  ulong i;
+  uint pagen;
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN lsn;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar *long_buffer= calloc(1, LONG_BUFFER_SIZE); /* zeroed; unused below */
+  /* Test: the log handler must start again after log files 1 and 2 vanish */
+  MY_INIT(argv[0]);
+
+  plan(2);
+  if (long_buffer == NULL)              /* allocation failed: give up */
+    exit(1);
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)                         /* any argument turns tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppress automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  /* Write 6-byte records until the log moves past file 1 */
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #0\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++)
+  {
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              &dummy_transaction_object, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write record #%lu\n", i + 1);
+      translog_destroy();
+      exit(1);
+    }
+  }
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+
+  {
+    char file_name[FN_REFLEN];
+    for (i= 1; i <= 2; i++)             /* both files must exist; drop them */
+    {
+      translog_filename_by_fileno(i, file_name);
+      if (my_access(file_name, W_OK))
+      {
+        fprintf(stderr, "No file '%s'\n", file_name);
+        exit(1);
+      }
+      if (my_delete(file_name, MYF(MY_WME)) != 0)
+      {
+        fprintf(stderr, "Error %d during removing file'%s'\n",
+                errno, file_name);
+        exit(1);
+      }
+    }
+  }
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               1))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppress automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  ok(1, "Log init OK");
+
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #0\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+
+  if (!translog_is_file(3))             /* the restarted log must be file 3 */
+  {
+    fprintf(stderr, "No file #3\n");
+    exit(1);
+  }
+
+  ok(1, "New log is OK");
+
+  if (maria_log_remove())
+    exit(1);
+  exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_pagecache-t.c b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
new file mode 100644
index 00000000000..1644aa4885c
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_pagecache-t.c
@@ -0,0 +1,200 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;  /* DBUG string, set in main() */
+#endif
+
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (1024L*1024L*1024L + 1024L*1024L*512)
+#define LOG_FLAGS 0
+/* First log file (size-checked by main) and the scratch data file */
+static char *first_translog_file= (char*)"aria_log.00000001";
+static char *file1_name= (char*)"page_cache_test_file_1";
+static PAGECACHE_FILE file1;
+
+
+/**
+  @brief No-op pagecache callback: ignores all arguments and returns 0.
+*/
+
+static my_bool
+dummy_callback(uchar *page __attribute__((unused)),
+               pgcache_page_no_t page_no __attribute__((unused)),
+               uchar* data_ptr __attribute__((unused)))
+{
+  return 0;
+}
+
+
+/**
+  @brief No-op pagecache callback: ignores its argument, returns nothing.
+*/
+
+static void
+dummy_fail_callback(uchar* data_ptr __attribute__((unused)))
+{
+  return;
+}
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  uint pagen;
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN lsn;
+  my_off_t file_size;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  /* Flushing an LSN-stamped page must grow the first log to two pages */
+  MY_INIT(argv[0]);
+
+  plan(1);
+
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+  /* Make sure no control file or first log is left in the directory */
+  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+  my_delete(first_translog_file, MYF(0));
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler_pagecache.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler_pagecache.trace";
+#endif
+  if (argc > 1)                         /* any argument turns tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppress automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  if ((file1.file= my_open(first_translog_file, O_RDONLY,  MYF(MY_WME))) < 0)
+  {
+    fprintf(stderr, "There is no %s (%d)\n", first_translog_file, errno);
+    exit(1);
+  }
+  file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME));
+  if (file_size != TRANSLOG_PAGE_SIZE)  /* fresh log: exactly one page */
+  {
+    fprintf(stderr,
+            "incorrect initial size of %s: %ld instead of %ld\n",
+            first_translog_file, (long)file_size, (long)TRANSLOG_PAGE_SIZE);
+    exit(1);
+  }
+  my_close(file1.file, MYF(MY_WME));
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  dummy_transaction_object.first_undo_lsn= TRANSACTION_LOGGED_LONG_ID;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+    translog_destroy();
+    exit(1);
+  }
+
+  if ((file1.file= my_open(file1_name,
+                           O_CREAT | O_TRUNC | O_RDWR, MYF(0))) == -1)
+  {
+    fprintf(stderr, "Got error during file1 creation from open() (errno: %d)\n",
+            errno);
+    exit(1);
+  }
+  pagecache_file_init(file1, &dummy_callback, &dummy_callback,
+                      &dummy_fail_callback, maria_flush_log_for_page, NULL);
+  if (my_chmod(file1_name, S_IRWXU | S_IRWXG | S_IRWXO, MYF(MY_WME)))
+    exit(1);
+
+  {
+    uchar page[PCACHE_PAGE];
+    /* Stamp the page with the record's LSN, then write and flush it */
+    bzero(page, PCACHE_PAGE);
+    lsn_store(page, lsn);
+    pagecache_write(&pagecache, &file1, 0, 3, page,
+                    PAGECACHE_LSN_PAGE,
+                    PAGECACHE_LOCK_LEFT_UNLOCKED,
+                    PAGECACHE_PIN_LEFT_UNPINNED,
+                    PAGECACHE_WRITE_DELAY,
+                    0, LSN_IMPOSSIBLE);
+    flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE);
+  }
+  my_close(file1.file, MYF(MY_WME));
+  if ((file1.file= my_open(first_translog_file, O_RDONLY, MYF(MY_WME))) < 0)
+  {
+    fprintf(stderr, "can't open %s (%d)\n", first_translog_file, errno);
+    exit(1);
+  }
+  file_size= my_seek(file1.file, 0, SEEK_END, MYF(MY_WME));
+  if (file_size != TRANSLOG_PAGE_SIZE * 2)  /* log must have grown a page */
+  {
+    fprintf(stderr,
+            "incorrect size of %s after flush: %ld instead of %ld\n",
+            first_translog_file,
+            (long)file_size, (long)(TRANSLOG_PAGE_SIZE * 2));
+    ok(0, "log triggered");
+    exit(1);
+  }
+  my_close(file1.file, MYF(MY_WME));
+  ok(1, "log triggered");
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  my_delete(CONTROL_FILE_BASE_NAME, MYF(0));
+  my_delete(first_translog_file, MYF(0));
+  my_delete(file1_name, MYF(0));
+
+  exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_loghandler_purge-t.c b/storage/maria/unittest/ma_test_loghandler_purge-t.c
new file mode 100644
index 00000000000..d37b45bc3ca
--- /dev/null
+++ b/storage/maria/unittest/ma_test_loghandler_purge-t.c
@@ -0,0 +1,192 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include <stdio.h>
+#include <errno.h>
+#include <tap.h>
+#include "../trnman.h"
+
+extern my_bool maria_log_remove();
+extern void translog_example_table_init();
+
+#ifndef DBUG_OFF
+static const char *default_dbug_option;  /* DBUG string, set in main() */
+#endif
+/* Sizes handed to init_pagecache() and translog_init_with_table() in main() */
+#define PCACHE_SIZE (1024*1024*10)
+#define PCACHE_PAGE TRANSLOG_PAGE_SIZE
+#define LOG_FILE_SIZE (8*1024L*1024L)
+#define LOG_FLAGS 0
+#define LONG_BUFFER_SIZE (LOG_FILE_SIZE + LOG_FILE_SIZE / 2)
+
+
+int main(int argc __attribute__((unused)), char *argv[])
+{
+  ulong i;
+  uint pagen;
+  uchar long_tr_id[6];
+  PAGECACHE pagecache;
+  LSN lsn;
+  LEX_CUSTRING parts[TRANSLOG_INTERNAL_PARTS + 1];
+  uchar *long_buffer= calloc(1, LONG_BUFFER_SIZE); /* zero-filled payload */
+  /* Test: checks which log files translog_purge() removes as the log grows */
+  MY_INIT(argv[0]);
+
+  plan(4);
+  if (long_buffer == NULL)              /* allocation failed: give up */
+    exit(1);
+  bzero(&pagecache, sizeof(pagecache));
+  maria_data_root= (char *)".";
+  if (maria_log_remove())
+    exit(1);
+
+  bzero(long_tr_id, 6);
+#ifndef DBUG_OFF
+#if defined(__WIN__)
+  default_dbug_option= "d:t:i:O,\\ma_test_loghandler.trace";
+#else
+  default_dbug_option= "d:t:i:o,/tmp/ma_test_loghandler.trace";
+#endif
+  if (argc > 1)                         /* any argument turns tracing on */
+  {
+    DBUG_SET(default_dbug_option);
+    DBUG_SET_INITIAL(default_dbug_option);
+  }
+#endif
+
+  if (ma_control_file_open(TRUE, TRUE))
+  {
+    fprintf(stderr, "Can't init control file (%d)\n", errno);
+    exit(1);
+  }
+  if ((pagen= init_pagecache(&pagecache, PCACHE_SIZE, 0, 0,
+                             PCACHE_PAGE, 0)) == 0)
+  {
+    fprintf(stderr, "Got error: init_pagecache() (errno: %d)\n", errno);
+    exit(1);
+  }
+  if (translog_init_with_table(".", LOG_FILE_SIZE, 50112, 0, &pagecache,
+                               LOG_FLAGS, 0, &translog_example_table_init,
+                               0))
+  {
+    fprintf(stderr, "Can't init loghandler (%d)\n", errno);
+    exit(1);
+  }
+  /* Suppress automatic record writing */
+  dummy_transaction_object.first_undo_lsn|= TRANSACTION_LOGGED_LONG_ID;
+
+  /* Step 1: after a single record, purging must keep file 1 */
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write record #%lu\n", (ulong) 0);
+    translog_destroy();
+    exit(1);
+  }
+
+  translog_purge(lsn);
+  if (!translog_is_file(1))
+  {
+    fprintf(stderr, "First file was removed after first record\n");
+    translog_destroy();
+    exit(1);
+  }
+  ok(1, "First is not removed");
+  /* Step 2: write until the log leaves file 1; purging must then drop it */
+  for(i= 0; i < LOG_FILE_SIZE/6 && LSN_FILE_NO(lsn) == 1; i++)
+  {
+    if (translog_write_record(&lsn,
+                              LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                              &dummy_transaction_object, NULL, 6,
+                              TRANSLOG_INTERNAL_PARTS + 1,
+                              parts, NULL, NULL))
+    {
+      fprintf(stderr, "Can't write record #%lu\n", (ulong) (i + 1));
+      translog_destroy();
+      exit(1);
+    }
+  }
+
+  translog_purge(lsn);
+  if (translog_is_file(1))
+  {
+    fprintf(stderr, "First file was not removed.\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  ok(1, "First file is removed");
+  /* Step 3: one record of 1.5 log files; files 2 and 3 must both survive */
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_buffer;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= LONG_BUFFER_SIZE;
+  if (translog_write_record(&lsn,
+                            LOGREC_VARIABLE_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, LONG_BUFFER_SIZE,
+                            TRANSLOG_INTERNAL_PARTS + 1, parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write variable record\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  translog_purge(lsn);
+  if (!translog_is_file(2) || !translog_is_file(3))
+  {
+    fprintf(stderr, "Second file (%d) or third file (%d) is not present.\n",
+            translog_is_file(2), translog_is_file(3));
+    translog_destroy();
+    exit(1);
+  }
+
+  ok(1, "Second and third files are not removed");
+  /* Step 4: after one more short record, purging must drop file 2 */
+  int4store(long_tr_id, 0);
+  parts[TRANSLOG_INTERNAL_PARTS + 0].str= long_tr_id;
+  parts[TRANSLOG_INTERNAL_PARTS + 0].length= 6;
+  if (translog_write_record(&lsn,
+                            LOGREC_FIXED_RECORD_0LSN_EXAMPLE,
+                            &dummy_transaction_object, NULL, 6,
+                            TRANSLOG_INTERNAL_PARTS + 1,
+                            parts, NULL, NULL))
+  {
+    fprintf(stderr, "Can't write last record\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  translog_purge(lsn);
+  if (translog_is_file(2))
+  {
+    fprintf(stderr, "Second file is not removed\n");
+    translog_destroy();
+    exit(1);
+  }
+
+  ok(1, "Second file is removed");
+
+  translog_destroy();
+  end_pagecache(&pagecache, 1);
+  ma_control_file_end();
+  if (maria_log_remove())
+    exit(1);
+  exit(0);
+}
diff --git a/storage/maria/unittest/ma_test_recovery.expected b/storage/maria/unittest/ma_test_recovery.expected
new file mode 100644
index 00000000000..5f7dd54e673
--- /dev/null
+++ b/storage/maria/unittest/ma_test_recovery.expected
@@ -0,0 +1,1578 @@
+Testing the REDO PHASE ALONE
+TEST WITH ma_test1 -s -M -T -c
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -M -T -c -b65000
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -M -T -c -b65000 -d800
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test1 -s -M -T -c -C
+applying log
+testing idempotency
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -d500 -C
+applying log
+testing idempotency
+applying log
+Testing the REDO AND UNDO PHASE
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N  -H1 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N  -H2 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c  -H1 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=1 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=1 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=1 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A1 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=2 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=2 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=2 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A2 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=3 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=3 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=3 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A3 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=1 (commit at end)
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H1 --testflag=2 --test-undo=4 (additional aborted work)
+Terminating after inserts
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=3 (commit at end)
+Terminating after updates
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --versioning --testflag=4 --test-undo=4 (additional aborted work)
+Terminating after deletes
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=2 (commit at end)
+Terminating after inserts
+TEST WITH ma_test1 -s -M -T -c -N -b32768 -H2 --testflag=3 --test-undo=4 (additional aborted work)
+Terminating after updates
+Dying on request without maria_commit()/maria_close()
+applying log
+testing idempotency
+applying log
+testing applying of CLRs to recreate table
+applying log
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t2 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t1 (commit at end)
+TEST WITH ma_test2 -s -L -K -W -P -M -T -c -b32768 -H1 -t6 -A4 (additional aborted work)
+Dying on request without maria_commit()/maria_close()
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing idempotency
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
+testing applying of CLRs to recreate table
+applying log
+Differences in aria_chk -dvv, recovery not yet perfect !
+========DIFF START=======
+6c6
+< Status:              checked,analyzed,optimized keys,sorted index pages,zerofilled,movable
+---
+> Status:              changed
+========DIFF END=======
diff --git a/storage/maria/unittest/ma_test_recovery.pl b/storage/maria/unittest/ma_test_recovery.pl
new file mode 100755
index 00000000000..d9be82f4e58
--- /dev/null
+++ b/storage/maria/unittest/ma_test_recovery.pl
@@ -0,0 +1,481 @@
+#!/usr/bin/env perl
+
+use Getopt::Long;
+use File::Copy;
+use File::Compare;
+use File::Basename;
+use Digest::MD5;
+
+$|= 1; # unbuffered output on the currently selected handle
+$^W = 1; # warnings, because env cannot parse 'perl -w'
+$VER= "1.2"; # printed by --version and in the usage text
+
+$opt_version= 0; # the $opt_* globals are filled in by GetOptions() in main()
+$opt_help=    0;
+$opt_verbose= 0;
+$opt_abort_on_error=0;
+
+my $silent= "-s"; # passed to every ma_test1/ma_test2 invocation
+my $maria_path;     # path to "storage/maria"
+my $maria_exe_path; # path to executables (ma_test1, aria_chk etc)
+my $tmp= "./tmp"; # scratch directory, created by main() if missing
+my $my_progname= $0;
+my $suffix; # executable suffix (".exe" or ""), set in main()
+my $zerofilled_tables= 0; # number of times physical_cmp() resorted to zerofill
+
+$my_progname=~ s/.*[\/]//; # keep only the part after the last "/"
+$maria_path= dirname($0) . "/.."; # parent of the directory holding this script
+
+main();
+
+####
+#### main function
+####
+
+sub main
+{
+  my ($res, $table);
+  # Parse options, locate the test programs, run all tests, diff the log.
+  if (!GetOptions("abort-on-error", "help", "version", "verbose"))
+  {
+    $flag_exit= 1; # bad option: usage() is called below
+  }
+  if ($opt_version)
+  {
+    print "$my_progname version $VER\n";
+    exit(0);
+  }
+  usage() if ($opt_help || $flag_exit);
+  # ".exe" suffix when $^O contains "win", except for "darwin"
+  $suffix= ( $^O =~ /win/i  && $^O !~ /darwin/i ) ? ".exe" : "";
+  $maria_exe_path= "$maria_path/release"; # then relwithdebinfo, debug, $maria_path
+  # we use -f, sometimes -x is unexpectedly false in Cygwin
+  if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+  {
+    $maria_exe_path= "$maria_path/relwithdebinfo";
+    if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+    {
+      $maria_exe_path= "$maria_path/debug";
+      if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+      {
+        $maria_exe_path= $maria_path;
+        if ( ! -f "$maria_exe_path/ma_test1$suffix" )
+        {
+          die("Cannot find ma_test1 executable\n");
+        }
+      }
+    }
+  }
+
+  # test data is always put in the current directory or a tmp subdirectory
+  # of it
+
+  if (! -d "$tmp")
+  {
+    mkdir $tmp;
+  }
+  print "ARIA RECOVERY TESTS\n";
+
+  # To not flood the screen, we redirect all the commands below to a text file
+  # and just give a final error if their output is not as expected
+
+  open (MY_LOG, ">$tmp/ma_test_recovery.output") or die "Can't open log file\n";
+  print MY_LOG "Testing the REDO PHASE ALONE\n";
+
+  # runs a program inserting/deleting rows, then moves the resulting table
+  # elsewhere; applies the log and checks that the data file is
+  # identical to the saved original.
+
+  my @t= ("ma_test1$suffix $silent -M -T -c",
+          "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500",
+          "ma_test2$suffix $silent -M -T -c -b65000",
+          "ma_test2$suffix $silent -M -T -c -b65000 -d800",
+          "ma_test1$suffix $silent -M -T -c -C",
+          "ma_test2$suffix $silent -L -K -W -P -M -T -c -d500 -C",
+          #"ma_rt_test$suffix $silent -M -T -c -C",
+          # @todo: also add to @t2
+         );
+
+  foreach my $prog (@t)
+  {
+    unlink <aria_log.* aria_log_control>; # each test starts without old logs
+    my $prog_no_suffix= $prog;
+    $prog_no_suffix=~ s/$suffix// if ($suffix); # log text must not contain ".exe"
+    print MY_LOG "TEST WITH $prog_no_suffix\n";
+    $res= my_exec("$maria_exe_path/$prog");
+    print MY_LOG $res;
+    # derive table's name from program's name
+    if ($prog =~ m/^ma_(\S+)\s.*/)
+    {
+      $table= $1;
+    }
+    else
+    {
+      die("can't guess table name");
+    }
+    # reference description of the table, read later by check_table_is_same()
+    $com=  "$maria_exe_path/aria_chk$suffix -dvv $table ";
+    $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\"";
+    $com.= "> $tmp/aria_chk_message.good.txt 2>&1";
+    my_exec($com);
+    my $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table");
+    move("$table.MAD", "$tmp/$table-good.MAD") ||
+      die "Can't move $table.MAD to $tmp/$table-good.MAD\n";
+    move("$table.MAI", "$tmp/$table-good.MAI") ||
+      die "Can't move $table.MAI to $tmp/$table-good.MAI\n";
+    apply_log($table, "shouldnotchangelog");
+    check_table_is_same($table, $checksum);
+    $res= physical_cmp($table, "$tmp/$table-good");
+    print MY_LOG $res;
+    print MY_LOG "testing idempotency\n";
+    apply_log($table, "shouldnotchangelog");
+    check_table_is_same($table, $checksum);
+    $res= physical_cmp($table, "$tmp/$table-good");
+    print MY_LOG $res;
+  }
+
+  print MY_LOG "Testing the REDO AND UNDO PHASE\n";
+  # The test programs look like:
+  # work; commit (time T1); work; exit-without-commit (time T2)
+  # We first run the test program and let it exit after T1's commit.
+  # Then we run it again and let it exit at T2. Then we compare
+  # and expect identity.
+
+  my @take_checkpoints= ("no", "yes");
+  my @blobs= ("", "-b32768");
+  my @test_undo= (1, 2, 3, 4);
+  # @t2 is a flat list of triples: program, commit-run args, abort-run args
+  my @t2= ("ma_test1$suffix $silent -M -T -c -N blob -H1",
+           "--testflag=1",
+           "--testflag=2 --test-undo=",
+           "ma_test1$suffix $silent -M -T -c -N blob -H2",
+           "--testflag=3",
+           "--testflag=4 --test-undo=",
+           "ma_test1$suffix $silent -M -T -c -N blob -H2 --versioning",
+           "--testflag=3",
+           "--testflag=4 --test-undo=",
+           "ma_test1$suffix $silent -M -T -c -N blob -H2",
+           "--testflag=2",
+           "--testflag=3 --test-undo=",
+           "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1",
+           "-t1",
+           "-t2 -A",
+           "ma_test2$suffix $silent -L -K -W -P -M -T -c blob -H1",
+           "-t1",
+           "-t6 -A");
+
+  foreach my $take_checkpoint (@take_checkpoints)
+  {
+    my ($i, $j, $k, $commit_run_args, $abort_run_args);
+    # we test table without blobs and then table with blobs
+    for ($i= 0; defined($blobs[$i]); $i++)
+    {
+      for ($j= 0; defined($test_undo[$j]); $j++)
+      {
+        # first iteration tests rollback of insert, second tests rollback of delete
+        # -N (create NULL fields) is needed because --test-undo adds it anyway
+        for ($k= 0; defined($t2[$k]); $k+= 3)
+        {
+          $prog= $t2[$k];
+          $prog=~ s/blob/$blobs[$i]/; # "blob" is a placeholder for the -b option
+          if ("$take_checkpoint" eq "no") {
+            $prog=~ s/\s+\-H[0-9]+//; # drop -H<n> (presumably the checkpoint option - confirm)
+          }
+          $commit_run_args= $t2[$k + 1];
+          $abort_run_args= $t2[$k + 2];
+          unlink <aria_log.* aria_log_control>;
+          my $prog_no_suffix= $prog;
+          $prog_no_suffix=~ s/$suffix// if ($suffix);
+          print MY_LOG "TEST WITH $prog_no_suffix $commit_run_args (commit at end)\n";
+          $res= my_exec("$maria_exe_path/$prog $commit_run_args");
+          print MY_LOG $res;
+          # derive table's name from program's name
+          if ($prog =~ m/^ma_(\S+)\s.*/)
+          {
+            $table= $1;
+          }
+          else
+          {
+            die("can't guess table name");
+          }
+          $com=  "$maria_exe_path/aria_chk$suffix -dvv $table ";
+          $com.= "| grep -v \"Creation time:\" | grep -v \"file length\" | grep -v \"LSNs:\" | grep -v \"UUID:\" ";
+          $com.= "> $tmp/aria_chk_message.good.txt 2>&1";
+          $res= my_exec($com);
+          print MY_LOG $res;
+          $checksum= my_exec("$maria_exe_path/aria_chk$suffix -dss $table");
+          move("$table.MAD", "$tmp/$table-good.MAD") ||
+            die "Can't move $table.MAD to $tmp/$table-good.MAD\n";
+          move("$table.MAI", "$tmp/$table-good.MAI") ||
+            die "Can't move $table.MAI to $tmp/$table-good.MAI\n";
+          unlink <aria_log.* aria_log_control>;
+          print MY_LOG "TEST WITH $prog_no_suffix $abort_run_args$test_undo[$j] (additional aborted work)\n";
+          $res= my_exec("$maria_exe_path/$prog $abort_run_args$test_undo[$j]");
+          print MY_LOG $res;
+          copy("$table.MAD", "$tmp/$table-before_undo.MAD") ||
+            die "Can't copy $table.MAD to $tmp/$table-before_undo.MAD\n";
+          copy("$table.MAI", "$tmp/$table-before_undo.MAI") ||
+            die "Can't copy $table.MAI to $tmp/$table-before_undo.MAI\n";
+
+          # The lines below seem unneeded, will be removed soon
+          # We have to copy and restore logs, as running aria_read_log will
+          # change the aria_control_file
+          #    rm -f $tmp/aria_log.* $tmp/aria_log_control
+          #    cp $maria_path/aria_log* $tmp
+
+          if ($test_undo[$j] != 3) {
+            apply_log($table, "shouldchangelog"); # should undo aborted work
+          } else {
+            # probably nothing to undo went to log or data file
+            apply_log($table, "dontknow");
+          }
+          copy("$table.MAD", "$tmp/$table-after_undo.MAD") ||
+            die "Can't copy $table.MAD to $tmp/$table-after_undo.MAD\n";
+          copy("$table.MAI", "$tmp/$table-after_undo.MAI") ||
+            die "Can't copy $table.MAI to $tmp/$table-after_undo.MAI\n";
+
+          # It is impossible to do a "cmp" between .good and .after_undo,
+          # because the UNDO phase generated log
+          # records whose LSN tagged pages. Another reason is that rolling back
+          # INSERT only marks the rows free, does not empty them (optimization), so
+          # traces of the INSERT+rollback remain.
+
+          check_table_is_same($table, $checksum);
+          print MY_LOG "testing idempotency\n";
+          apply_log($table, "shouldnotchangelog");
+          check_table_is_same($table, $checksum);
+          $res= physical_cmp($table, "$tmp/$table-after_undo");
+          print MY_LOG $res;
+          print MY_LOG "testing applying of CLRs to recreate table\n";
+          unlink <$table.MA?>; # remove the table so the log alone must recreate it
+          #    cp $tmp/aria_log* $maria_path  #unneeded
+          apply_log($table, "shouldnotchangelog");
+          check_table_is_same($table, $checksum);
+          $res= physical_cmp($table, "$tmp/$table-after_undo");
+          print MY_LOG $res;
+        }
+        unlink <$table.* $tmp/$table* $tmp/aria_chk_*.txt $tmp/aria_read_log_$table.txt>;
+      }
+    }
+  }
+  # NOTE(review): $? holds only the status of the most recent child process.
+  if ($? >> 8) {
+    print "Some test failed\n";
+    exit(1);
+  }
+
+  close(MY_LOG);
+  # also note that aria_chk -dvv shows differences for ma_test2 in UNDO phase,
+  # this is normal: removing records does not shrink the data/key file,
+  # does not put back the "analyzed,optimized keys"(etc) index state.
+  `diff -b $maria_path/unittest/ma_test_recovery.expected $tmp/ma_test_recovery.output`;
+  if ($? >> 8) {
+    print "UNEXPECTED OUTPUT OF TESTS, FAILED";
+    print " (zerofilled $zerofilled_tables tables)\n";
+    print "For more info, do diff -b $maria_path/unittest/ma_test_recovery.expected ";
+    print "$tmp/ma_test_recovery.output\n";
+    exit(1);
+  }
+  print "ALL RECOVERY TESTS OK (zerofilled $zerofilled_tables tables)\n";
+}
+
+####
+#### check_table_is_same
+####
+
+sub check_table_is_same
+{
+  my ($table, $checksum)= @_;
+  my $chk= "$maria_exe_path/aria_chk$suffix";
+  my $diff_file= "$tmp/aria_chk_diff.txt";
+
+  # Compares the recovered table with its reference state: the aria_chk -dss
+  # output must equal $checksum, and any difference in the aria_chk -dvv
+  # description (info from the index's header) is copied into the log.
+  # Lines that may legitimately vary are filtered out; data/key file length
+  # is random in ma_test2 (srand() may differ between machines).
+
+  print "checking if table $table has changed\n" if ($opt_verbose);
+
+  my $describe= "$chk -dvv $table";
+  foreach my $noise ("Creation time:", "file length", "LSNs:", "UUID:")
+  {
+    $describe.= " | grep -v \"$noise\"";
+  }
+  my $out= `$describe > $tmp/aria_chk_message.txt 2>&1`;
+  print MY_LOG $out;
+  $out= `$chk -ss -e --read-only $table`;
+  print MY_LOG $out;
+
+  # a checksum mismatch is reported and ends the check early
+  my $new_checksum= `$chk -dss $table`;
+  if ("$checksum" ne "$new_checksum")
+  {
+    print MY_LOG "checksum differs for $table before and after recovery\n";
+    return 1;
+  }
+
+  my $diff_cmd= "diff $tmp/aria_chk_message.good.txt $tmp/aria_chk_message.txt";
+  $out= `$diff_cmd > $diff_file || true`;
+  print MY_LOG $out;
+
+  if (-s $diff_file)
+  {
+    print MY_LOG "Differences in aria_chk -dvv, recovery not yet perfect !\n";
+    print MY_LOG "========DIFF START=======\n";
+    open(MY_FILE, "<$diff_file") || die "Can't open file aria_chk_diff.txt\n";
+    print MY_LOG $_ while (<MY_FILE>);
+    close(MY_FILE);
+    print MY_LOG "========DIFF END=======\n";
+  }
+}
+
+####
+#### apply_log
+####
+
+sub apply_log
+{
+  my ($table, $shouldchangelog)= @_;
+  my ($md5_before, $md5_after);
+
+  # Runs "aria_read_log -a" and checks, through MD5 sums of the aria_log.*
+  # files taken before and after, whether applying the log wrote to it.
+  # $shouldchangelog states the expectation: "shouldnotchangelog",
+  # "shouldchangelog", or "dontknow" (no verification).
+  # Returns 1 after logging a message when the argument is invalid or the
+  # expectation is not met.
+
+  if (!grep { $_ eq "$shouldchangelog" }
+      ("shouldnotchangelog", "shouldchangelog", "dontknow"))
+  {
+    print MY_LOG "bad argument '$shouldchangelog'\n";
+    return 1;
+  }
+
+  $md5_before.= md5_conv($_) foreach (<aria_log.*>);
+  print MY_LOG "applying log\n";
+  my_exec("$maria_exe_path/aria_read_log$suffix -a > $tmp/aria_read_log_$table.txt");
+  $md5_after.= md5_conv($_) foreach (<aria_log.*>);
+
+  # compare what happened to the log with what the caller expected
+  my $log_changed= ("$md5_before" ne "$md5_after");
+  if ($log_changed && "$shouldchangelog" eq "shouldnotchangelog")
+  {
+    print MY_LOG "aria_read_log should not have modified the log\n";
+    return 1;
+  }
+
+  if (!$log_changed && "$shouldchangelog" eq "shouldchangelog")
+  {
+    print MY_LOG "aria_read_log should have modified the log\n";
+    return 1;
+  }
+}
+
+####
+#### md5_conv
+####
+
+sub md5_conv
+{
+  # Returns the hex MD5 digest of the named file, followed by a newline.
+  my $file= shift;
+
+  die "Can't open '$file': $!\n" unless (open(FILE, $file));
+  binmode(FILE);
+  my $hex= Digest::MD5->new->addfile(FILE)->hexdigest;
+  close (FILE);
+  return "$hex\n";
+}
+
+####
+#### physical_cmp: compares two tables (MAI and MAD) physically;
+#### uses zerofill-keep-lsn to reduce irrelevant differences.
+####
+
+sub physical_cmp
+{
+  my ($table1, $table2)= @_; # table paths without the .MAD/.MAI extension
+  my ($zerofilled, $ret_text)= (0, "");
+  #return `cmp $table1.MAD $table2.MAD`.`cmp $table1.MAI $table2.MAI`;
+  foreach my $file_suffix ("MAD", "MAI")
+  {
+    my $file1= "$table1.$file_suffix";
+    my $file2= "$table2.$file_suffix";
+    my $res= File::Compare::compare($file1, $file2); # 0 equal, 1 differ, -1 error
+    die() if ($res == -1);
+    if ($res == 1 # they differ
+        and !$zerofilled)
+    {
+      # let's try with --zerofill-keep-lsn
+      $zerofilled= 1; # but no need to do it twice
+      $zerofilled_tables= $zerofilled_tables + 1;
+      my $table_no= 1;
+      foreach my $table ($table1, $table2)
+      {
+        # save original tables to restore them later
+        copy("$table.MAD", "$tmp/before_zerofill$table_no.MAD") || die();
+        copy("$table.MAI", "$tmp/before_zerofill$table_no.MAI") || die();
+        $com= "$maria_exe_path/aria_chk$suffix -ss --zerofill-keep-lsn $table";
+        $res= `$com`; # $res temporarily holds aria_chk's output
+        print MY_LOG $res;
+        $table_no= $table_no + 1;
+      }
+      $res= File::Compare::compare($file1, $file2); # compare again after zerofill
+      die() if ($res == -1);
+    }
+    $ret_text.= "$file1 and $file2 differ\n" if ($res != 0);
+  }
+  if ($zerofilled) # put back the tables saved before zerofill
+  {
+    my $table_no= 1;
+    foreach my $table ($table1, $table2)
+    {
+      move("$tmp/before_zerofill$table_no.MAD", "$table.MAD") || die();
+      move("$tmp/before_zerofill$table_no.MAI", "$table.MAI") || die();
+      $table_no= $table_no + 1;
+    }
+  }
+  return $ret_text; # "" when neither file pair was found to differ
+}
+
+
+sub my_exec
+{
+  # Runs $cmd through the shell and returns its captured standard output.
+  # With --verbose the command is echoed first; with --abort-on-error a
+  # non-zero wait status ends the whole script with exit code 1.
+  my $cmd= shift;
+
+  print "$cmd\n" if ($opt_verbose);
+  my $output= `$cmd`;
+  if ($opt_abort_on_error && $? != 0)
+  {
+    exit(1);
+  }
+  return $output;
+}
+
+
+####
+#### usage
+####
+
+sub usage # prints the help text on standard output, then exits
+{
+  print <<EOF;
+$my_progname version $VER
+
+Description:
+
+Run various Aria recovery tests and print the results
+
+Options
+--help             Show this help and exit.
+
+--abort-on-error   Abort at once in case of error.
+--verbose          Show commands while they are executing.
+--version          Show version number and exit.
+
+EOF
+  exit(0); # also the exit status when main() calls this for a bad option
+}
diff --git a/storage/maria/unittest/sequence_storage.c b/storage/maria/unittest/sequence_storage.c
new file mode 100644
index 00000000000..d5db20d31ca
--- /dev/null
+++ b/storage/maria/unittest/sequence_storage.c
@@ -0,0 +1,110 @@
+/* Copyright (C) 2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include "../maria_def.h"
+#include "sequence_storage.h"
+
+
+/**
+  @brief Initializes the sequence from the sequence file.
+
+  @param seq             Reference on the sequence storage.
+  @param file            Path to the file from which to read the sequence
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file)
+{
+  FILE *fd;
+  my_bool error= 1;                             /* pessimistic default */
+
+  seq->pos= 0;
+  if ((fd= my_fopen(file, O_RDONLY, MYF(MY_WME))) == NULL)
+    return 1;
+  if (my_init_dynamic_array(&seq->seq, sizeof(ulong), 10, 10))
+    goto end;                                   /* still have to close fd */
+
+  for(;;)
+  {
+    ulong num;
+    char line[22];
+    if (fgets(line, sizeof(line), fd) == NULL)
+    {
+      /* No more lines: everything read so far is the sequence */
+      error= 0;
+      break;
+    }
+    num= atol(line);
+    if (insert_dynamic(&seq->seq, (uchar*) &num))
+    {
+      /* Do not leave a half-filled array behind on failure */
+      delete_dynamic(&seq->seq);
+      break;
+    }
+  }
+
+end:
+  /*
+    The stream was opened with my_fopen(), so it is closed with its mysys
+    counterpart on every path, including the error ones.
+  */
+  my_fclose(fd, MYF(0));
+  return error;
+}
+
+
+/**
+  @brief Gets next number from the sequence storage
+
+  @param seq             Reference on the sequence storage.
+
+  @return Next number from the sequence.
+*/
+
+ulong seq_storage_next(SEQ_STORAGE *seq)
+{
+  ulong *current;
+
+  /* Debug builds verify that there is still an unread number */
+  DBUG_ASSERT(seq->seq.elements > 0);
+  DBUG_ASSERT(seq->pos < seq->seq.elements);
+
+  /* Fetch the element under the cursor, then advance the cursor */
+  current= dynamic_element(&seq->seq, seq->pos, ulong *);
+  seq->pos++;
+  return *current;
+}
+
+
+/**
+  @brief Frees resources allocated for the storage
+
+  @param seq             Reference on the sequence storage.
+*/
+
+void seq_storage_destroy(SEQ_STORAGE *seq)
+{
+  /* Release the array of numbers; seq->pos is not touched here */
+  delete_dynamic(&seq->seq);
+}
+
+
+/**
+  @brief Starts the sequence from the beginning
+
+  @param seq             Reference on the sequence storage.
+*/
+
+void seq_storage_rewind(SEQ_STORAGE *seq)
+{
+  /* Only the read position is reset; the stored numbers are kept */
+  seq->pos= 0;
+}
+
+/**
+  @brief Writes a number to the sequence file.
+
+  @param file            Path to the file where to write the sequence
+  @param num             Number to be written
+
+  @retval 0 OK
+  @retval 1 Error
+*/
+
+my_bool seq_storage_write(const char *file, ulong num)
+{
+  FILE *fd;
+  my_bool error;
+
+  /* The file is created if missing and the number is appended to it */
+  fd= my_fopen(file, O_CREAT | O_APPEND | O_WRONLY, MYF(MY_WME));
+  if (fd == NULL)
+    return 1;
+
+  error= (fprintf(fd, "%lu\n", num) < 0);
+
+  /*
+    Close the stream even when the write failed so that it is not leaked.
+    A failing close is reported as an error too.
+  */
+  if (my_fclose(fd, MYF(0)) != 0)
+    error= 1;
+  return error;
+}
diff --git a/storage/maria/unittest/sequence_storage.h b/storage/maria/unittest/sequence_storage.h
new file mode 100644
index 00000000000..78ce15a6253
--- /dev/null
+++ b/storage/maria/unittest/sequence_storage.h
@@ -0,0 +1,28 @@
+/* Copyright (C) 2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+
+#ifndef SEQUENCE_STORAGE_INCLUDED
+#define SEQUENCE_STORAGE_INCLUDED
+
+/* A sequence of numbers together with a read position into it */
+typedef struct st_seq_storage
+{
+  uint pos;                             /* index of the next number to return */
+  DYNAMIC_ARRAY seq;                    /* the numbers, stored as ulong */
+} SEQ_STORAGE;
+
+/* Load the sequence from a file; returns 0 on success, 1 on error */
+extern my_bool seq_storage_reader_init(SEQ_STORAGE *seq, const char *file);
+/* Return the number at the read position and advance the position */
+extern ulong seq_storage_next(SEQ_STORAGE *seq);
+/* Free the memory held by the sequence */
+extern void seq_storage_destroy(SEQ_STORAGE *seq);
+/* Move the read position back to the first number */
+extern void seq_storage_rewind(SEQ_STORAGE *seq);
+/* Append one number to a sequence file; returns 0 on success, 1 on error */
+extern my_bool seq_storage_write(const char *file, ulong num);
+
+#endif /* SEQUENCE_STORAGE_INCLUDED */
+
diff --git a/storage/maria/unittest/test_file.c b/storage/maria/unittest/test_file.c
new file mode 100644
index 00000000000..5f7e3939592
--- /dev/null
+++ b/storage/maria/unittest/test_file.c
@@ -0,0 +1,118 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <tap.h>
+#include <my_sys.h>
+#include <my_dir.h>
+#include "test_file.h"
+
+
+/*
+  Check that the file content corresponds to the descriptor
+
+  SYNOPSIS
+    test_file()
+    file                 File to test
+    file_name            Path (and name) of the file which is tested
+    size                 Expected size of the file
+    buff_size            Size of a buffer which is big enough to check the file
+    desc                 File content descriptor to check against
+
+  RETURN
+    1 the file is OK
+    0 error
+*/
+
+int test_file(PAGECACHE_FILE file, char *file_name,
+              off_t size, size_t buff_size, struct file_desc *desc)
+{
+  /* Scratch buffer for one chunk of the file; checked before first use */
+  unsigned char *buffr= my_malloc(buff_size, MYF(0));
+  off_t pos= 0;                                 /* offset of current chunk */
+  size_t byte;
+  int step= 0;                                  /* index into desc[] */
+  int res= 1;                                   /* ok */
+
+#ifdef __WIN__
+  /*
+    On Windows, the info returned by stat(), specifically file length
+    is not necessarily current, because this is the behavior of
+    underlying FindFirstFile() function.
+  */
+  WIN32_FILE_ATTRIBUTE_DATA file_attr;
+  LARGE_INTEGER li;
+  if(GetFileAttributesEx(file_name, GetFileExInfoStandard, &file_attr) == 0)
+  {
+    diag("Can't GetFileAttributesEx %s (errno: %d)\n", file_name,
+      (int) GetLastError());
+    res= 0;
+    goto err;
+  }
+  li.HighPart= file_attr.nFileSizeHigh;
+  li.LowPart=  file_attr.nFileSizeLow;
+  if(li.QuadPart !=  size)
+  {
+    /* Actual size first, expected size second, as the message says */
+    diag("file %s size is %llu (should be %llu)\n",
+      file_name, (ulonglong)li.QuadPart, (ulonglong)size);
+    res= 0;                                       /* failed */
+    /* continue to get more information */
+  }
+#else
+  MY_STAT stat_buff, *stat;
+  if ((stat= my_stat(file_name, &stat_buff, MYF(0))) == NULL)
+  {
+    diag("Can't stat() %s (errno: %d)\n", file_name, errno);
+    res= 0;
+    goto err;
+  }
+  if (stat->st_size != size)
+  {
+    diag("file %s size is %lu (should be %lu)\n",
+         file_name, (ulong) stat->st_size, (ulong) size);
+    res= 0;                                       /* failed */
+    /* continue to get more information */
+  }
+#endif
+
+  /* The content check needs the buffer; give up if it was not allocated */
+  if (buffr == NULL)
+  {
+    diag("Can't allocate %lu bytes to check %s\n",
+         (ulong) buff_size, file_name);
+    res= 0;
+    goto err;
+  }
+
+  /* check content */
+  my_seek(file.file, 0, SEEK_SET, MYF(MY_WME));
+  /* desc[] is terminated by an entry with length 0 */
+  while (desc[step].length != 0)
+  {
+    /* Never read more than the buffer can hold */
+    if (desc[step].length > buff_size)
+    {
+      diag("Can't check %u bytes of %s with a buffer of %lu bytes\n",
+           (uint) desc[step].length, file_name, (ulong) buff_size);
+      res= 0;
+      goto err;
+    }
+    if (my_read(file.file, buffr, desc[step].length, MYF(0)) !=
+        desc[step].length)
+    {
+      diag("Can't read %u bytes from %s (file: %d  errno: %d)\n",
+           (uint)desc[step].length, file_name, file.file, errno);
+      res= 0;
+      goto err;
+    }
+    /* Every byte of the chunk must have the value given by the descriptor */
+    for (byte= 0; byte < desc[step].length; byte++)
+    {
+      if (buffr[byte] != desc[step].content)
+      {
+        diag("content of %s mismatch 0x%x in position %lu instead of 0x%x\n",
+             file_name, (uint) buffr[byte], (ulong) (pos + byte),
+             desc[step].content);
+        res= 0;
+        goto err;
+      }
+    }
+    pos+= desc[step].length;
+    step++;
+  }
+
+err:
+  /* buffr is NULL here when the allocation failed; nothing to free then */
+  if (buffr)
+    my_free(buffr, 0);
+  return res;
+}
diff --git a/storage/maria/unittest/test_file.h b/storage/maria/unittest/test_file.h
new file mode 100644
index 00000000000..0a1ccf4ab54
--- /dev/null
+++ b/storage/maria/unittest/test_file.h
@@ -0,0 +1,29 @@
+/* Copyright (C) 2006-2008 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <m_string.h>
+#include "../ma_pagecache.h"
+
+#ifndef TEST_FILE_INCLUDED
+#define TEST_FILE_INCLUDED
+
+/*
+  File content descriptor
+
+  One entry describes a run of identical bytes.  test_file() walks an
+  array of these entries until it meets one whose length is 0.
+*/
+struct file_desc
+{
+  unsigned int length;                  /* number of bytes in the run */
+  unsigned char content;                /* value every byte must have */
+};
+
+/* Compare a file with its descriptor; returns 1 if it matches, 0 if not */
+int test_file(PAGECACHE_FILE file, char *file_name,
+              off_t size, size_t buff_size, struct file_desc *desc);
+
+#endif /* TEST_FILE_INCLUDED */
diff --git a/storage/maria/unittest/trnman-t.c b/storage/maria/unittest/trnman-t.c
new file mode 100644
index 00000000000..43cf982a7f2
--- /dev/null
+++ b/storage/maria/unittest/trnman-t.c
@@ -0,0 +1,175 @@
+/* Copyright (C) 2006-2008 MySQL AB, 2008-2009 Sun Microsystems, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#include <tap.h>
+
+#include <my_global.h>
+#include <my_sys.h>
+#include <my_atomic.h>
+#include <lf.h>
+#include <m_string.h>
+#include "../trnman.h"
+
+/* Mutex taken by the worker threads around their update of rt_num_threads */
+pthread_mutex_t rt_mutex;
+/* Thread attributes handed to pthread_create() for every worker */
+pthread_attr_t attr;
+/* Stack size obtained in main(); PTHREAD_STACK_MIN when none is reported */
+size_t stacksize= 0;
+/* NOTE(review): STACK_SIZE is not referenced in this file - confirm it is needed */
+#define STACK_SIZE (((int)stacksize-2048)*STACK_DIRECTION)
+
+/* Set to the thread count by run_test(); each worker decrements it on exit */
+int rt_num_threads;
+/* Failure counter: reset by run_test() and expected to still be 0 at its end */
+int litmus;
+
+/*
+  create and end (commit or rollback) transactions randomly
+*/
+#define MAX_ITER 100
+/*
+  Thread body: start batches of up to MAX_ITER-1 transactions and end each
+  of them with a pseudo-random commit/rollback flag, until the iteration
+  budget passed through arg (pointer to int) is used up.
+*/
+pthread_handler_t test_trnman(void *arg)
+{
+  uint   x, y, i, n;
+  TRN    *trn[MAX_ITER];
+  int    m= (*(int *)arg);
+
+  if (my_thread_init())
+    BAIL_OUT("my_thread_init failed!");
+
+  /* The address of a local variable seeds the pseudo-random sequence */
+  for (x= ((int)(intptr)(&m)); m > 0; )
+  {
+    y= x= (x*LL(3628273133) + LL(1500450271)) % LL(9576890767); /* three prime numbers */
+    m-= n= x % MAX_ITER;                /* size of this batch */
+    for (i= 0; i < n; i++)
+    {
+      trn[i]= trnman_new_trn(0);
+      if (!trn[i])
+      {
+        diag("trnman_new_trn() failed");
+        /* litmus is shared by all worker threads: update it under the mutex */
+        pthread_mutex_lock(&rt_mutex);
+        litmus++;
+        pthread_mutex_unlock(&rt_mutex);
+      }
+    }
+    for (i= 0; i < n; i++)
+    {
+      y= (y*19 + 7) % 31;
+      /*
+        Skip transactions which could not be started.
+        NOTE(review): trnman_end_trn() is assumed not to accept NULL - confirm.
+      */
+      if (trn[i])
+        trnman_end_trn(trn[i], y & 1);
+    }
+  }
+  pthread_mutex_lock(&rt_mutex);
+  rt_num_threads--;
+  pthread_mutex_unlock(&rt_mutex);
+
+  my_thread_end();
+
+  return 0;
+}
+#undef MAX_ITER
+
+/*
+  Run handler in n concurrent threads, each getting a pointer to m as its
+  argument, wait for all of them and report one TAP result which is ok
+  only if no thread incremented litmus.
+*/
+void run_test(const char *test, pthread_handler handler, int n, int m)
+{
+  pthread_t *threads;
+  ulonglong now= my_getsystime();
+  int i;
+
+  litmus= 0;
+
+  /* Size the array by its element type: pthread_t need not be pointer-sized */
+  threads= (pthread_t *)my_malloc(sizeof(*threads)*n, MYF(0));
+  if (!threads)
+  {
+    diag("Out of memory");
+    abort();
+  }
+
+  diag("Testing %s with %d threads, %d iterations... ", test, n, m);
+  rt_num_threads= n;
+  /* &m stays valid for the threads: they are all joined before returning */
+  for (i= 0; i < n ; i++)
+    if (pthread_create(threads+i, &attr, handler, &m))
+    {
+      diag("Could not create thread");
+      abort();
+    }
+  for (i= 0 ; i < n ; i++)
+    pthread_join(threads[i], 0);
+  now= my_getsystime()-now;
+  ok(litmus == 0, "Tested %s in %g secs (%d)", test, ((double)now)/1e7, litmus);
+  my_free((void*)threads, MYF(0));
+}
+
+/*
+  Helpers for test_trnman_read_from().  They expect the local variables
+  trn[], trid[] and i to be in scope at the place of use.  The two-statement
+  helpers are wrapped in do { } while (0) so that each behaves as a single
+  statement.
+*/
+#define ok_read_from(T1, T2, RES)                                       \
+  do {                                                                  \
+    i= trnman_can_read_from(trn[T1], trid[T2]);                         \
+    ok(i == (RES), "trn" #T1 " %s read from trn" #T2,                   \
+       i ? "can" : "cannot");                                           \
+  } while (0)
+#define start_transaction(T)                                            \
+  do {                                                                  \
+    trn[T]= trnman_new_trn(0);                                          \
+    trid[T]= trn[T]->trid;                                              \
+  } while (0)
+#define commit(T)               trnman_commit_trn(trn[T])
+/* From here on abort(T) ends transaction T; it is no longer the libc abort() */
+#define abort(T)                trnman_abort_trn(trn[T])
+
+#define Ntrns 4
+/*
+  Run a fixed scenario over four transactions (array slots 0..3) and check
+  six expectations about trnman_can_read_from(), one TAP result each.
+
+  NOTE(review): the trnN names in the trailing comments presumably refer to
+  the ids handed out by the transaction manager, not to the array slots -
+  confirm.
+*/
+void test_trnman_read_from()
+{
+  TRN *trn[Ntrns];                         /* transaction handles */
+  TrID trid[Ntrns];                        /* ids saved when each one started */
+  int i;                                   /* result holder used by ok_read_from */
+
+  start_transaction(0);                    /* start trn1 */
+  start_transaction(1);                    /* start trn2 */
+  ok_read_from(1, 0, 0);                   /* slot 0 not committed yet */
+  commit(0);                               /* commit trn1 */
+  start_transaction(2);                    /* start trn4 */
+  abort(2);                                /* abort trn4 */
+  start_transaction(3);                    /* start trn5 */
+  ok_read_from(3, 0, 1);                   /* slot 0 committed before slot 3 began */
+  ok_read_from(3, 1, 0);                   /* slot 1 still running */
+  ok_read_from(3, 2, 0);                   /* slot 2 was aborted */
+  ok_read_from(3, 3, 1);                   /* a transaction against its own id */
+  commit(1);                               /* commit trn2 */
+  ok_read_from(3, 1, 0);                   /* slot 1 committed after slot 3 began */
+  commit(3);                               /* commit trn5 */
+
+}
+
+/*
+  Test driver: announces 7 TAP results (6 from test_trnman_read_from() and
+  1 from run_test()), runs both tests and reports how long it takes to
+  shut the transaction manager down.
+*/
+int main(int argc __attribute__((unused)), char **argv)
+{
+  MY_INIT(argv[0]);
+
+  plan(7);
+
+  if (my_atomic_initialize())
+    return exit_status();
+
+  pthread_mutex_init(&rt_mutex, 0);
+  pthread_attr_init(&attr);
+  /* Without a usable stack size from the attributes use PTHREAD_STACK_MIN */
+#ifdef HAVE_PTHREAD_ATTR_GETSTACKSIZE
+  pthread_attr_getstacksize(&attr, &stacksize);
+  if (stacksize == 0)
+#endif
+    stacksize= PTHREAD_STACK_MIN;
+
+#define CYCLES 10000
+#define THREADS 10
+
+  trnman_init(0);
+
+  test_trnman_read_from();
+  run_test("trnman", test_trnman, THREADS, CYCLES);
+
+  diag("mallocs: %d", trnman_allocated_transactions);
+  {
+    ulonglong now= my_getsystime();
+    trnman_destroy();
+    now= my_getsystime()-now;
+    diag("trnman_destroy: %g", ((double)now)/1e7);
+  }
+
+  /* Release everything that was initialized above, attributes included */
+  pthread_attr_destroy(&attr);
+  pthread_mutex_destroy(&rt_mutex);
+  my_end(0);
+  return exit_status();
+}
+
diff --git a/storage/myisam/CMakeLists.txt b/storage/myisam/CMakeLists.txt
index 487fbded2df..5abd8c29e64 100644
--- a/storage/myisam/CMakeLists.txt
+++ b/storage/myisam/CMakeLists.txt
@@ -15,6 +15,7 @@
 
 SET(MYISAM_SOURCES  ft_boolean_search.c ft_nlq_search.c ft_parser.c ft_static.c
                                 ha_myisam.cc
+				ft_myisam.c 
 				ft_stopwords.c ft_update.c mi_cache.c mi_changed.c mi_check.c
 				mi_checksum.c mi_close.c mi_create.c mi_dbug.c mi_delete.c 
 				mi_delete_all.c mi_delete_table.c mi_dynrec.c mi_extra.c mi_info.c
@@ -65,3 +66,4 @@ ENDIF()
 IF (MSVC)
   SET_TARGET_PROPERTIES(myisamchk myisampack PROPERTIES LINK_FLAGS "setargv.obj")
 ENDIF()
+
diff --git a/storage/myisam/Makefile.am b/storage/myisam/Makefile.am
index 5c3370ac6c5..5d8a5e9753b 100644
--- a/storage/myisam/Makefile.am
+++ b/storage/myisam/Makefile.am
@@ -28,7 +28,7 @@ LDADD =
 DEFS =                  @DEFS@
 
 EXTRA_DIST =		mi_test_all.sh mi_test_all.res CMakeLists.txt plug.in
-pkgdata_DATA =		mi_test_all mi_test_all.res
+pkgdata_DATA =		
 
 pkglib_LIBRARIES =	libmyisam.a
 bin_PROGRAMS =		myisamchk myisamlog myisampack myisam_ftdump
@@ -94,8 +94,8 @@ libmyisam_a_SOURCES =	mi_open.c mi_extra.c mi_info.c mi_rkey.c \
 			mi_delete_table.c mi_rename.c  mi_check.c \
 			mi_keycache.c mi_preload.c \
 			ft_parser.c ft_stopwords.c ft_static.c \
-			ft_update.c ft_boolean_search.c ft_nlq_search.c sort.c \
-			ha_myisam.cc \
+			ft_update.c ft_boolean_search.c ft_nlq_search.c \
+			sort.c ha_myisam.cc ft_myisam.c \
 			rt_index.c rt_key.c rt_mbr.c rt_split.c sp_key.c
 CLEANFILES =		test?.MY? FT?.MY? isam.log mi_test_all rt_test.MY? sp_test.MY?
 
diff --git a/storage/myisam/ft_boolean_search.c b/storage/myisam/ft_boolean_search.c
index b54b4c6ce49..7d615d837d3 100644
--- a/storage/myisam/ft_boolean_search.c
+++ b/storage/myisam/ft_boolean_search.c
@@ -182,7 +182,7 @@ typedef struct st_my_ftb_param
 
 
 static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
-                              char *word, int word_len,
+                              const uchar *word, mysql_ft_size_t word_len,
                               MYSQL_FTPARSER_BOOLEAN_INFO *info)
 {
   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
@@ -198,7 +198,7 @@ static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
     case FT_TOKEN_WORD:
       ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
                                    sizeof(FTB_WORD) +
-                                   (info->trunc ? MI_MAX_KEY_BUFF :
+                                   (info->trunc ? HA_MAX_KEY_BUFF :
                                     word_len * ftb_param->ftb->charset->mbmaxlen +
                                     HA_FT_WLEN +
                                     ftb_param->ftb->info->s->rec_reflength));
@@ -284,24 +284,24 @@ static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
 
 
 static int ftb_parse_query_internal(MYSQL_FTPARSER_PARAM *param,
-                                    char *query, int len)
+                                    const uchar *query, mysql_ft_size_t len)
 {
   MY_FTB_PARAM *ftb_param= param->mysql_ftparam;
   MYSQL_FTPARSER_BOOLEAN_INFO info;
   CHARSET_INFO *cs= ftb_param->ftb->charset;
-  uchar **start= (uchar**) &query;
-  uchar *end= (uchar*) query + len;
+  const uchar **start= &query;
+  const uchar *end= query + len;
   FT_WORD w;
 
   info.prev= ' ';
   info.quot= 0;
   while (ft_get_word(cs, start, end, &w, &info))
-    param->mysql_add_word(param, (char*) w.pos, w.len, &info);
+    param->mysql_add_word(param, w.pos, w.len, &info);
   return(0);
 }
 
 
-static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
+static int _ftb_parse_query(FTB *ftb, uchar *query, mysql_ft_size_t len,
                             struct st_mysql_ftparser *parser)
 {
   MYSQL_FTPARSER_PARAM *param;
@@ -323,7 +323,7 @@ static int _ftb_parse_query(FTB *ftb, uchar *query, uint len,
   param->mysql_add_word= ftb_query_add_word;
   param->mysql_ftparam= (void *)&ftb_param;
   param->cs= ftb->charset;
-  param->doc= (char*) query;
+  param->doc= query;
   param->length= len;
   param->flags= 0;
   param->mode= MYSQL_FTPARSER_FULL_BOOLEAN_INFO;
@@ -484,16 +484,18 @@ static int _ft2_search(FTB *ftb, FTB_WORD *ftbw, my_bool init_search)
 
 static void _ftb_init_index_search(FT_INFO *ftb)
 {
-  int i;
+  uint i;
   FTB_WORD   *ftbw;
 
   if (ftb->state == UNINITIALIZED || ftb->keynr == NO_SUCH_KEY)
     return;
   ftb->state=INDEX_SEARCH;
 
-  for (i=ftb->queue.elements; i; i--)
+  for (i= queue_last_element(&ftb->queue);
+       (int) i >= (int) queue_first_element(&ftb->queue);
+       i--)
   {
-    ftbw=(FTB_WORD *)(ftb->queue.root[i]);
+    ftbw=(FTB_WORD *)(queue_element(&ftb->queue, i));
 
     if (ftbw->flags & FTB_FLAG_TRUNC)
     {
@@ -552,7 +554,7 @@ static void _ftb_init_index_search(FT_INFO *ftb)
 
 
 FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
-                                 uint query_len, CHARSET_INFO *cs)
+                                 mysql_ft_size_t query_len, CHARSET_INFO *cs)
 {
   FTB       *ftb;
   FTB_EXPR  *ftbe;
@@ -597,14 +599,14 @@ FT_INFO * ft_init_boolean_search(MI_INFO *info, uint keynr, uchar *query,
                                               sizeof(void *))))
     goto err;
   reinit_queue(&ftb->queue, ftb->queue.max_elements, 0, 0,
-                         (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0);
+               (int (*)(void*, uchar*, uchar*))FTB_WORD_cmp, 0, 0, 0);
   for (ftbw= ftb->last_word; ftbw; ftbw= ftbw->prev)
     queue_insert(&ftb->queue, (uchar *)ftbw);
   ftb->list=(FTB_WORD **)alloc_root(&ftb->mem_root,
                                      sizeof(FTB_WORD *)*ftb->queue.elements);
-  memcpy(ftb->list, ftb->queue.root+1, sizeof(FTB_WORD *)*ftb->queue.elements);
+  memcpy(ftb->list, &queue_top(&ftb->queue), sizeof(FTB_WORD *)*ftb->queue.elements);
   my_qsort2(ftb->list, ftb->queue.elements, sizeof(FTB_WORD *),
-            (qsort2_cmp)FTB_WORD_cmp_list, ftb->charset);
+            (qsort2_cmp)FTB_WORD_cmp_list, (void*) ftb->charset);
   if (ftb->queue.elements<2) ftb->with_scan &= ~FTB_FLAG_TRUNC;
   ftb->state=READY;
   return ftb;
@@ -627,7 +629,7 @@ typedef struct st_my_ftb_phrase_param
 
 
 static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
-                               char *word, int word_len,
+                               const uchar *word, mysql_ft_size_t word_len,
     MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
 {
   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
@@ -659,15 +661,16 @@ static int ftb_phrase_add_word(MYSQL_FTPARSER_PARAM *param,
 
 
 static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
-                                     char *document, int len)
+                                     const uchar *document,
+                                     mysql_ft_size_t len)
 {
   FT_WORD word;
   MY_FTB_PHRASE_PARAM *phrase_param= param->mysql_ftparam;
   const uchar *docend= (uchar*) document + len;
-  while (ft_simple_get_word(phrase_param->cs, (uchar**) &document, docend,
+  while (ft_simple_get_word(phrase_param->cs, &document, docend,
                             &word, FALSE))
   {
-    param->mysql_add_word(param, (char*) word.pos, word.len, 0);
+    param->mysql_add_word(param, word.pos, word.len, 0);
     if (phrase_param->match)
       break;
   }
@@ -690,8 +693,9 @@ static int ftb_check_phrase_internal(MYSQL_FTPARSER_PARAM *param,
     -1 is returned if error occurs.
 */
 
-static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
-                FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
+static int _ftb_check_phrase(FTB *ftb, const uchar *document,
+                             mysql_ft_size_t len,
+                             FTB_EXPR *ftbe, struct st_mysql_ftparser *parser)
 {
   MY_FTB_PHRASE_PARAM ftb_param;
   MYSQL_FTPARSER_PARAM *param;
@@ -712,7 +716,7 @@ static int _ftb_check_phrase(FTB *ftb, const uchar *document, uint len,
   param->mysql_add_word= ftb_phrase_add_word;
   param->mysql_ftparam= (void *)&ftb_param;
   param->cs= ftb->charset;
-  param->doc= (char *) document;
+  param->doc= document;
   param->length= len;
   param->flags= 0;
   param->mode= MYSQL_FTPARSER_WITH_STOPWORDS;
@@ -839,7 +843,7 @@ int ft_boolean_read_next(FT_INFO *ftb, char *record)
 
       /* update queue */
       _ft2_search(ftb, ftbw, 0);
-      queue_replaced(& ftb->queue);
+      queue_replace_top(&ftb->queue);
     }
 
     ftbe=ftb->root;
@@ -885,8 +889,9 @@ typedef struct st_my_ftb_find_param
 
 
 static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
-                                       char *word, int len,
-             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
+                                       const uchar *word, mysql_ft_size_t len,
+                                       MYSQL_FTPARSER_BOOLEAN_INFO
+                                       *boolean_info __attribute__((unused)))
 {
   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
   FT_INFO *ftb= ftb_param->ftb;
@@ -899,8 +904,8 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
   for (a= 0, b= ftb->queue.elements, c= (a+b)/2; b-a>1; c= (a+b)/2)
   {
     ftbw= ftb->list[c];
-    if (ha_compare_text(ftb->charset, (uchar*)word, len,
-                        (uchar*)ftbw->word+1, ftbw->len-1,
+    if (ha_compare_text(ftb->charset, word, len,
+                        ftbw->word+1, ftbw->len-1,
                         (my_bool) (ftbw->flags & FTB_FLAG_TRUNC), 0) < 0)
       b= c;
     else
@@ -926,8 +931,8 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
   for (; c >= 0; c--)
   {
     ftbw= ftb->list[c];
-    if (ha_compare_text(ftb->charset, (uchar*)word, len,
-                        (uchar*)ftbw->word + 1,ftbw->len - 1,
+    if (ha_compare_text(ftb->charset, word, len,
+                        ftbw->word + 1,ftbw->len - 1,
                         (my_bool)(ftbw->flags & FTB_FLAG_TRUNC), 0))
     {
       if (ftb->with_scan & FTB_FLAG_TRUNC)
@@ -946,14 +951,14 @@ static int ftb_find_relevance_add_word(MYSQL_FTPARSER_PARAM *param,
 
 
 static int ftb_find_relevance_parse(MYSQL_FTPARSER_PARAM *param,
-                                    char *doc, int len)
+                                    const uchar *doc, mysql_ft_size_t len)
 {
   MY_FTB_FIND_PARAM *ftb_param= param->mysql_ftparam;
   FT_INFO *ftb= ftb_param->ftb;
-  uchar *end= (uchar*) doc + len;
+  const uchar *end= doc + len;
   FT_WORD w;
-  while (ft_simple_get_word(ftb->charset, (uchar**) &doc, end, &w, TRUE))
-    param->mysql_add_word(param, (char*) w.pos, w.len, 0);
+  while (ft_simple_get_word(ftb->charset, &doc, end, &w, TRUE))
+    param->mysql_add_word(param, w.pos, w.len, 0);
   return(0);
 }
 
@@ -1009,7 +1014,7 @@ float ft_boolean_find_relevance(FT_INFO *ftb, uchar *record, uint length)
   {
     if (!ftsi.pos)
       continue;
-    param->doc= (char *)ftsi.pos;
+    param->doc= ftsi.pos;
     param->length= ftsi.len;
     if (unlikely(parser->parse(param)))
       return 0;
diff --git a/storage/myisam/ft_myisam.c b/storage/myisam/ft_myisam.c
new file mode 100644
index 00000000000..7bcc62d5bf5
--- /dev/null
+++ b/storage/myisam/ft_myisam.c
@@ -0,0 +1,36 @@
+/* Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* Written by Sergei A. Golubchik, who has a shared copyright to this code */
+
+/*
+  This file contains the interface functions between fulltext and MyISAM
+*/
+
+#include "ftdefs.h"
+
+/*
+  Start a fulltext search on a MyISAM table: a boolean search when FT_BOOL
+  is set in flags, a natural language search otherwise.
+*/
+FT_INFO *ft_init_search(uint flags, void *info, uint keynr,
+                        uchar *query, size_t query_len,
+                        CHARSET_INFO *cs, uchar *record)
+{
+  MI_INFO *mi_info= (MI_INFO *)info;
+
+  if (!(flags & FT_BOOL))
+    return ft_init_nlq_search(mi_info, keynr, query, query_len, flags,
+                              record);
+  return ft_init_boolean_search(mi_info, keynr, query, query_len, cs);
+}
diff --git a/storage/myisam/ft_nlq_search.c b/storage/myisam/ft_nlq_search.c
index 937bb6ffe19..e19765efe3e 100644
--- a/storage/myisam/ft_nlq_search.c
+++ b/storage/myisam/ft_nlq_search.c
@@ -63,7 +63,8 @@ static int FT_SUPERDOC_cmp(void* cmp_arg __attribute__((unused)),
 
 static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
 {
-  int	       subkeys, r;
+  FT_WEIGTH    subkeys;
+  int          r;
   uint	       keylen, doc_cnt;
   FT_SUPERDOC  sdoc, *sptr;
   TREE_ELEMENT *selem;
@@ -90,7 +91,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
   /* Skip rows inserted by current inserted */
   for (r=_mi_search(info, keyinfo, keybuff, keylen, SEARCH_FIND, key_root) ;
        !r &&
-         (subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
+         (subkeys.i= ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
          info->lastpos >= info->state->data_file_length ;
        r= _mi_search_next(info, keyinfo, info->lastkey,
                           info->lastkey_length, SEARCH_BIGGER, key_root))
@@ -107,7 +108,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
                         info->lastkey_length-extra-1, keybuff+1,keylen-1,0,0))
      break;
 
-    if (subkeys<0)
+    if (subkeys.i < 0)
     {
       if (doc_cnt)
         DBUG_RETURN(1); /* index is corrupted */
@@ -123,7 +124,8 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
       goto do_skip;
     }
 #if HA_FT_WTYPE == HA_KEYTYPE_FLOAT
-    ft_floatXget(tmp_weight, info->lastkey+info->lastkey_length-extra);
+    /* The weight we read was actually a float */
+    tmp_weight= subkeys.f;
 #else
 #error
 #endif
@@ -160,7 +162,7 @@ static int walk_and_match(FT_WORD *word, uint32 count, ALL_IN_ONE *aio)
 	r=_mi_search(info, keyinfo, info->lastkey, info->lastkey_length,
                      SEARCH_BIGGER, key_root);
 do_skip:
-    while ((subkeys=ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
+    while ((subkeys.i= ft_sintXkorr(info->lastkey+info->lastkey_length-extra)) > 0 &&
            !r && info->lastpos >= info->state->data_file_length)
       r= _mi_search_next(info, keyinfo, info->lastkey, info->lastkey_length,
                          SEARCH_BIGGER, key_root);
@@ -203,7 +205,8 @@ static int FT_DOC_cmp(void *unused __attribute__((unused)),
 
 
 FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, uchar *query,
-			    uint query_len, uint flags, uchar *record)
+			    mysql_ft_size_t query_len, uint flags,
+                            uchar *record)
 {
   TREE	      wtree;
   ALL_IN_ONE  aio;
@@ -248,12 +251,12 @@ FT_INFO *ft_init_nlq_search(MI_INFO *info, uint keynr, uchar *query,
   {
     QUEUE best;
     init_queue(&best,ft_query_expansion_limit,0,0, (queue_compare) &FT_DOC_cmp,
-	       0);
+	       0, 0, 0);
     tree_walk(&aio.dtree, (tree_walk_action) &walk_and_push,
               &best, left_root_right);
     while (best.elements)
     {
-      my_off_t docid=((FT_DOC *)queue_remove(& best, 0))->dpos;
+      my_off_t docid= ((FT_DOC *)queue_remove_top(&best))->dpos;
       if (!(*info->read_record)(info,docid,record))
       {
         info->update|= HA_STATE_AKTIV;
diff --git a/storage/myisam/ft_parser.c b/storage/myisam/ft_parser.c
index 663d7869f71..af16240bca2 100644
--- a/storage/myisam/ft_parser.c
+++ b/storage/myisam/ft_parser.c
@@ -106,10 +106,10 @@ my_bool ft_boolean_check_syntax_string(const uchar *str)
   3 - right bracket
   4 - stopword found
 */
-uchar ft_get_word(CHARSET_INFO *cs, uchar **start, uchar *end,
+uchar ft_get_word(CHARSET_INFO *cs, const uchar **start, const uchar *end,
                   FT_WORD *word, MYSQL_FTPARSER_BOOLEAN_INFO *param)
 {
-  uchar *doc=*start;
+  const uchar *doc= *start;
   int ctype;
   uint mwc, length;
   int mbl;
@@ -174,7 +174,7 @@ uchar ft_get_word(CHARSET_INFO *cs, uchar **start, uchar *end,
     if ((param->trunc=(doc<end && *doc == FTB_TRUNC)))
       doc++;
 
-    if (((length >= ft_min_word_len && !is_stopword((char*) word->pos,
+    if (((length >= ft_min_word_len && !is_stopword(word->pos,
                                                     word->len))
          || param->trunc) && length < ft_max_word_len)
     {
@@ -199,10 +199,11 @@ ret:
   return param->type;
 }
 
-uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
-                         FT_WORD *word, my_bool skip_stopwords)
+uchar ft_simple_get_word(CHARSET_INFO *cs, const uchar **start,
+                         const uchar *end, FT_WORD *word,
+                         my_bool skip_stopwords)
 {
-  uchar *doc= *start;
+  const uchar *doc= *start;
   uint mwc, length;
   int mbl;
   int ctype;
@@ -214,7 +215,7 @@ uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
     {
       if (doc >= end)
         DBUG_RETURN(0);
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         break;
     }
@@ -223,7 +224,7 @@ uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
     for (word->pos= doc; doc < end; length++,
          doc+= (mbl > 0 ? mbl : (mbl < 0 ? -mbl : 1)))
     {
-      mbl= cs->cset->ctype(cs, &ctype, (uchar*)doc, (uchar*)end);
+      mbl= cs->cset->ctype(cs, &ctype, doc, end);
       if (true_word_char(ctype, *doc))
         mwc= 0;
       else if (!misc_word_char(*doc) || mwc)
@@ -236,7 +237,7 @@ uchar ft_simple_get_word(CHARSET_INFO *cs, uchar **start, const uchar *end,
 
     if (skip_stopwords == FALSE ||
         (length >= ft_min_word_len && length < ft_max_word_len &&
-         !is_stopword((char*) word->pos, word->len)))
+         !is_stopword(word->pos, word->len)))
     {
       *start= doc;
       DBUG_RETURN(1);
@@ -249,14 +250,16 @@ void ft_parse_init(TREE *wtree, CHARSET_INFO *cs)
 {
   DBUG_ENTER("ft_parse_init");
   if (!is_tree_inited(wtree))
-    init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL, cs);
+    init_tree(wtree,0,0,sizeof(FT_WORD),(qsort_cmp2)&FT_WORD_cmp,0,NULL,
+              (void*) cs);
   DBUG_VOID_RETURN;
 }
 
 
 static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
-                       char *word, int word_len,
-             MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info __attribute__((unused)))
+                       const uchar *word, mysql_ft_size_t word_len,
+                       MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info
+                       __attribute__((unused)))
 {
   TREE *wtree;
   FT_WORD w;
@@ -284,23 +287,23 @@ static int ft_add_word(MYSQL_FTPARSER_PARAM *param,
 
 
 static int ft_parse_internal(MYSQL_FTPARSER_PARAM *param,
-                             char *doc_arg, int doc_len)
+                             const uchar *doc_arg, mysql_ft_size_t doc_len)
 {
-  uchar *doc= (uchar*) doc_arg;
-  uchar *end= doc + doc_len;
+  const uchar *doc= doc_arg;
+  const uchar *end= doc + doc_len;
   MY_FT_PARSER_PARAM *ft_param=param->mysql_ftparam;
   TREE *wtree= ft_param->wtree;
   FT_WORD w;
   DBUG_ENTER("ft_parse_internal");
 
   while (ft_simple_get_word(wtree->custom_arg, &doc, end, &w, TRUE))
-    if (param->mysql_add_word(param, (char*) w.pos, w.len, 0))
+    if (param->mysql_add_word(param, w.pos, w.len, 0))
       DBUG_RETURN(1);
   DBUG_RETURN(0);
 }
 
 
-int ft_parse(TREE *wtree, uchar *doc, int doclen,
+int ft_parse(TREE *wtree, const uchar *doc, mysql_ft_size_t doclen,
              struct st_mysql_ftparser *parser,
              MYSQL_FTPARSER_PARAM *param, MEM_ROOT *mem_root)
 {
@@ -315,7 +318,7 @@ int ft_parse(TREE *wtree, uchar *doc, int doclen,
   param->mysql_add_word= ft_add_word;
   param->mysql_ftparam= &my_param;
   param->cs= wtree->custom_arg;
-  param->doc= (char*) doc;
+  param->doc= doc;
   param->length= doclen;
   param->mode= MYSQL_FTPARSER_SIMPLE_MODE;
   DBUG_RETURN(parser->parse(param));
@@ -375,8 +378,8 @@ MYSQL_FTPARSER_PARAM *ftparser_call_initializer(MI_INFO *info,
        mysql_add_word != 0 - parser is initialized, or no
                              initialization needed. */
     info->ftparser_param[ftparser_nr].mysql_add_word=
-      (int (*)(struct st_mysql_ftparser_param *, char *, int,
-              MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
+      (int (*)(struct st_mysql_ftparser_param *, const uchar *,
+               mysql_ft_size_t, MYSQL_FTPARSER_BOOLEAN_INFO *)) 1;
     if (parser->init && parser->init(&info->ftparser_param[ftparser_nr]))
       return 0;
   }
diff --git a/storage/myisam/ft_static.c b/storage/myisam/ft_static.c
index 78fbc5781e9..6a0bda5b666 100644
--- a/storage/myisam/ft_static.c
+++ b/storage/myisam/ft_static.c
@@ -54,20 +54,6 @@ const struct _ft_vft _ft_vft_boolean= {
   ft_boolean_get_relevance,  ft_boolean_reinit_search
 };
 
-
-FT_INFO *ft_init_search(uint flags, void *info, uint keynr,
-                        uchar *query, uint query_len, CHARSET_INFO *cs,
-                        uchar *record)
-{
-  FT_INFO *res;
-  if (flags & FT_BOOL)
-    res= ft_init_boolean_search((MI_INFO *)info, keynr, query, query_len,cs);
-  else
-    res= ft_init_nlq_search((MI_INFO *)info, keynr, query, query_len, flags,
-			    record);
-  return res;
-}
-
 const char *ft_stopword_file= 0;
 const char *ft_precompiled_stopwords[]= {
 
diff --git a/storage/myisam/ft_stopwords.c b/storage/myisam/ft_stopwords.c
index e8d81cbbbb1..1079cf83417 100644
--- a/storage/myisam/ft_stopwords.c
+++ b/storage/myisam/ft_stopwords.c
@@ -24,8 +24,8 @@ static CHARSET_INFO *ft_stopword_cs= NULL;
 
 typedef struct st_ft_stopwords
 {
-  const char * pos;
-  uint   len;
+  const uchar* pos;
+  size_t len;
 } FT_STOPWORD;
 
 static TREE *stopwords3=NULL;
@@ -34,8 +34,8 @@ static int FT_STOPWORD_cmp(void* cmp_arg __attribute__((unused)),
 			   FT_STOPWORD *w1, FT_STOPWORD *w2)
 {
   return ha_compare_text(ft_stopword_cs,
-			 (uchar *)w1->pos,w1->len,
-			 (uchar *)w2->pos,w2->len,0,0);
+			 w1->pos, w1->len,
+			 w2->pos, w2->len, 0, 0);
 }
 
 static void FT_STOPWORD_free(FT_STOPWORD *w, TREE_FREE action,
@@ -48,17 +48,19 @@ static void FT_STOPWORD_free(FT_STOPWORD *w, TREE_FREE action,
 static int ft_add_stopword(const char *w)
 {
   FT_STOPWORD sw;
-  return !w ||
-         (((sw.len= (uint) strlen(sw.pos=w)) >= ft_min_word_len) &&
-          (tree_insert(stopwords3, &sw, 0, stopwords3->custom_arg)==NULL));
+  return (!w ||
+          (((sw.len= (uint) strlen((char*) (sw.pos=(const uchar *)w))) >= 
+            ft_min_word_len) &&
+           (tree_insert(stopwords3, &sw, 0, stopwords3->custom_arg)==NULL)));
 }
 
 int ft_init_stopwords()
 {
+  DBUG_ENTER("ft_init_stopwords");
   if (!stopwords3)
   {
     if (!(stopwords3=(TREE *)my_malloc(sizeof(TREE),MYF(0))))
-      return -1;
+      DBUG_RETURN(-1);
     init_tree(stopwords3,0,0,sizeof(FT_STOPWORD),(qsort_cmp2)&FT_STOPWORD_cmp,
               0,
               (ft_stopword_file ? (tree_element_free)&FT_STOPWORD_free : 0),
@@ -77,15 +79,16 @@ int ft_init_stopwords()
   {
     File fd;
     uint len;
-    uchar *buffer, *start, *end;
+    uchar *buffer;
+    const uchar *start, *end;
     FT_WORD w;
     int error=-1;
 
     if (!*ft_stopword_file)
-      return 0;
+      DBUG_RETURN(0);
 
     if ((fd=my_open(ft_stopword_file, O_RDONLY, MYF(MY_WME))) == -1)
-      return -1;
+      DBUG_RETURN(-1);
     len=(uint)my_seek(fd, 0L, MY_SEEK_END, MYF(0));
     my_seek(fd, 0L, MY_SEEK_SET, MYF(0));
     if (!(start=buffer=my_malloc(len+1, MYF(MY_WME))))
@@ -102,7 +105,7 @@ err1:
     my_free(buffer);
 err0:
     my_close(fd, MYF(MY_WME));
-    return error;
+    DBUG_RETURN(error);
   }
   else
   {
@@ -112,14 +115,15 @@ err0:
     for (;*sws;sws++)
     {
       if (ft_add_stopword(*sws))
-        return -1;
+        DBUG_RETURN(-1);
     }
     ft_stopword_file="(built-in)"; /* for SHOW VARIABLES */
   }
-  return 0;
+  DBUG_RETURN(0);
 }
 
-int is_stopword(char *word, uint len)
+
+int is_stopword(const uchar *word, size_t len)
 {
   FT_STOPWORD sw;
   sw.pos=word;
@@ -130,6 +134,8 @@ int is_stopword(char *word, uint len)
 
 void ft_free_stopwords()
 {
+  DBUG_ENTER("ft_free_stopwords");
+
   if (stopwords3)
   {
     delete_tree(stopwords3); /* purecov: inspected */
@@ -137,4 +143,5 @@ void ft_free_stopwords()
     stopwords3=0;
   }
   ft_stopword_file= 0;
+  DBUG_VOID_RETURN;
 }
diff --git a/storage/myisam/ftdefs.h b/storage/myisam/ftdefs.h
index ddcf1a8dc26..b26fa523b42 100644
--- a/storage/myisam/ftdefs.h
+++ b/storage/myisam/ftdefs.h
@@ -96,22 +96,23 @@
 #define FTB_RQUOT (ft_boolean_syntax[11])
 
 typedef struct st_ft_word {
-  uchar * pos;
-  uint	 len;
+  const uchar *pos;
   double weight;
+  size_t len;
 } FT_WORD;
 
-int is_stopword(char *word, uint len);
+int is_stopword(const uchar *word, size_t len);
 
 uint _ft_make_key(MI_INFO *, uint , uchar *, FT_WORD *, my_off_t);
 
-uchar ft_get_word(CHARSET_INFO *, uchar **, uchar *, FT_WORD *,
+uchar ft_get_word(CHARSET_INFO *, const uchar **, const uchar *, FT_WORD *,
                   MYSQL_FTPARSER_BOOLEAN_INFO *);
-uchar ft_simple_get_word(CHARSET_INFO *, uchar **, const uchar *,
+uchar ft_simple_get_word(CHARSET_INFO *, const uchar **, const uchar *,
                          FT_WORD *, my_bool);
 
 typedef struct _st_ft_seg_iterator {
-  uint        num, len;
+  uint        num;
+  mysql_ft_size_t len;
   HA_KEYSEG  *seg;
   const uchar *rec, *pos;
 } FT_SEG_ITERATOR;
@@ -121,15 +122,16 @@ void _mi_ft_segiterator_dummy_init(const uchar *, uint, FT_SEG_ITERATOR *);
 uint _mi_ft_segiterator(FT_SEG_ITERATOR *);
 
 void ft_parse_init(TREE *, CHARSET_INFO *);
-int ft_parse(TREE *, uchar *, int, struct st_mysql_ftparser *parser,
+int ft_parse(TREE *, const uchar *, int, struct st_mysql_ftparser *parser,
              MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
 FT_WORD * ft_linearize(TREE *, MEM_ROOT *);
 FT_WORD * _mi_ft_parserecord(MI_INFO *, uint, const uchar *, MEM_ROOT *);
 uint _mi_ft_parse(TREE *, MI_INFO *, uint, const uchar *,
                   MYSQL_FTPARSER_PARAM *, MEM_ROOT *);
 
-FT_INFO *ft_init_nlq_search(MI_INFO *, uint, uchar *, uint, uint, uchar *);
-FT_INFO *ft_init_boolean_search(MI_INFO *, uint, uchar *, uint, CHARSET_INFO *);
+FT_INFO *ft_init_nlq_search(MI_INFO *, uint, uchar *, mysql_ft_size_t, uint,
+                            uchar *);
+FT_INFO *ft_init_boolean_search(MI_INFO *, uint, uchar *, mysql_ft_size_t, CHARSET_INFO *);
 
 extern const struct _ft_vft _ft_vft_nlq;
 int ft_nlq_read_next(FT_INFO *, char *);
diff --git a/storage/myisam/fulltext.h b/storage/myisam/fulltext.h
index 853eb6362e6..9aef2d0d002 100644
--- a/storage/myisam/fulltext.h
+++ b/storage/myisam/fulltext.h
@@ -20,33 +20,8 @@
 #include "myisamdef.h"
 #include "ft_global.h"
 
-#define HA_FT_WTYPE  HA_KEYTYPE_FLOAT
-#define HA_FT_WLEN   4
-#define FT_SEGS      2
-
-/**
-  Accessor methods for the weight and the number of subkeys in a buffer.
-
-  The weight is of float type and subkeys number is of integer type. Both
-  are stored in the same position of the buffer and the stored object is
-  identified by the sign (bit): the weight value is positive whilst the
-  number of subkeys is negative.
-
-  In light of C's strict-aliasing rules, which roughly state that an object
-  must not be accessed through incompatible types, these methods are used to
-  avoid any problems arising from the type duality inside the buffer. The
-  values are retrieved using a character type which can access any object.
-*/
-#define ft_sintXkorr(A)    mi_sint4korr(A)
-#define ft_intXstore(T,A)  mi_int4store(T,A)
-#define ft_floatXget(V,M)  mi_float4get(V,M)
-
-
-extern const HA_KEYSEG ft_keysegs[FT_SEGS];
-
 int  _mi_ft_cmp(MI_INFO *, uint, const uchar *, const uchar *);
 int  _mi_ft_add(MI_INFO *, uint, uchar *, const uchar *, my_off_t);
 int  _mi_ft_del(MI_INFO *, uint, uchar *, const uchar *, my_off_t);
 
 uint _mi_ft_convert_to_ft2(MI_INFO *, uint, uchar *);
-
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 87de58076cd..fc6a9829072 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -132,13 +132,13 @@ static handler *myisam_create_handler(handlerton *hton,
 
 // collect errors printed by mi_check routines
 
-static void mi_check_print_msg(MI_CHECK *param,	const char* msg_type,
+static void mi_check_print_msg(HA_CHECK *param,	const char* msg_type,
 			       const char *fmt, va_list args)
 {
   THD* thd = (THD*)param->thd;
   Protocol *protocol= thd->protocol;
   size_t length, msg_length;
-  char msgbuf[MI_MAX_MSG_BUF];
+  char msgbuf[HA_MAX_MSG_BUF];
   char name[NAME_LEN*2+2];
 
   msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args);
@@ -313,7 +313,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
   record= table_arg->record[0];
   recpos= 0;
   recinfo_pos= recinfo;
-  while (recpos < (uint) share->reclength)
+  while (recpos < (uint) share->stored_rec_length)
   {
     Field **field, *found= 0;
     minpos= share->reclength;
@@ -339,30 +339,31 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
     DBUG_PRINT("loop", ("found: 0x%lx  recpos: %d  minpos: %d  length: %d",
                         (long) found, recpos, minpos, length));
     if (recpos != minpos)
-    { // Reserved space (Null bits?)
+    {
+      /* reserve space for null bits */
       bzero((char*) recinfo_pos, sizeof(*recinfo_pos));
-      recinfo_pos->type= (int) FIELD_NORMAL;
+      recinfo_pos->type= FIELD_NORMAL;
       recinfo_pos++->length= (uint16) (minpos - recpos);
     }
     if (!found)
       break;
 
     if (found->flags & BLOB_FLAG)
-      recinfo_pos->type= (int) FIELD_BLOB;
+      recinfo_pos->type= FIELD_BLOB;
     else if (found->type() == MYSQL_TYPE_VARCHAR)
       recinfo_pos->type= FIELD_VARCHAR;
     else if (!(options & HA_OPTION_PACK_RECORD))
-      recinfo_pos->type= (int) FIELD_NORMAL;
+      recinfo_pos->type= FIELD_NORMAL;
     else if (found->zero_pack())
-      recinfo_pos->type= (int) FIELD_SKIP_ZERO;
+      recinfo_pos->type= FIELD_SKIP_ZERO;
     else
-      recinfo_pos->type= (int) ((length <= 3 ||
-                                 (found->flags & ZEROFILL_FLAG)) ?
-                                  FIELD_NORMAL :
-                                  found->type() == MYSQL_TYPE_STRING ||
-                                  found->type() == MYSQL_TYPE_VAR_STRING ?
-                                  FIELD_SKIP_ENDSPACE :
-                                  FIELD_SKIP_PRESPACE);
+      recinfo_pos->type= ((length <= 3 ||
+                           (found->flags & ZEROFILL_FLAG)) ?
+                          FIELD_NORMAL :
+                          found->type() == MYSQL_TYPE_STRING ||
+                          found->type() == MYSQL_TYPE_VAR_STRING ?
+                          FIELD_SKIP_ENDSPACE :
+                          FIELD_SKIP_PRESPACE);
     if (found->null_ptr)
     {
       recinfo_pos->null_bit= found->null_bit;
@@ -388,7 +389,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
   Check for underlying table conformance
 
   SYNOPSIS
-    check_definition()
+    myisam_check_definition()
       t1_keyinfo       in    First table key definition
       t1_recinfo       in    First table record definition
       t1_keys          in    Number of keys in first table
@@ -557,13 +558,12 @@ int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo,
 
 extern "C" {
 
-volatile int *killed_ptr(MI_CHECK *param)
+int killed_ptr(HA_CHECK *param)
 {
-  /* In theory Unsafe conversion, but should be ok for now */
-  return (int*) &(((THD *)(param->thd))->killed);
+  return thd_killed((THD*)param->thd);
 }
 
-void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_error(HA_CHECK *param, const char *fmt,...)
 {
   param->error_printed|=1;
   param->out_flag|= O_DATA_LOST;
@@ -573,7 +573,7 @@ void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
   va_end(args);
 }
 
-void mi_check_print_info(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_info(HA_CHECK *param, const char *fmt,...)
 {
   va_list args;
   va_start(args, fmt);
@@ -581,7 +581,7 @@ void mi_check_print_info(MI_CHECK *param, const char *fmt,...)
   va_end(args);
 }
 
-void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_warning(HA_CHECK *param, const char *fmt,...)
 {
   param->warning_printed=1;
   param->out_flag|= O_DATA_LOST;
@@ -705,6 +705,9 @@ int ha_myisam::open(const char *name, int mode, uint test_if_locked)
 
   if (!(file=mi_open(name, mode, test_if_locked | HA_OPEN_FROM_SQL_LAYER)))
     return (my_errno ? my_errno : -1);
+
+  file->s->chst_invalidator= query_cache_invalidate_by_MyISAM_filename_ref;
+
   if (!table->s->tmp_table) /* No need to perform a check for tmp table */
   {
     if ((my_errno= table2myisam(table, &keyinfo, &recinfo, &recs)))
@@ -736,7 +739,19 @@ int ha_myisam::open(const char *name, int mode, uint test_if_locked)
   if (!table->s->db_record_offset)
     int_table_flags|=HA_REC_NOT_IN_SEQ;
   if (file->s->options & (HA_OPTION_CHECKSUM | HA_OPTION_COMPRESS_RECORD))
-    int_table_flags|=HA_HAS_CHECKSUM;
+  {
+    /*
+      Set which type of automatic checksum we have.
+      The old checksum and new checksum are identical if there are no
+      null fields.
+      Files with the new checksum have the HA_OPTION_NULL_FIELDS bit set.
+    */
+    if ((file->s->options & HA_OPTION_NULL_FIELDS) ||
+        !file->s->has_null_fields)
+      int_table_flags|= HA_HAS_NEW_CHECKSUM;
+    if (!(file->s->options & HA_OPTION_NULL_FIELDS))
+      int_table_flags|= HA_HAS_OLD_CHECKSUM;
+  }
 
   for (i= 0; i < table->s->keys; i++)
   {
@@ -763,14 +778,14 @@ int ha_myisam::open(const char *name, int mode, uint test_if_locked)
 int ha_myisam::close(void)
 {
   MI_INFO *tmp=file;
+  if (!tmp)
+    return 0;
   file=0;
   return mi_close(tmp);
 }
 
 int ha_myisam::write_row(uchar *buf)
 {
-  ha_statistic_increment(&SSV::ha_write_count);
-
   /* If we have a timestamp column, update it to the current time */
   if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
     table->timestamp_field->set_time();
@@ -792,10 +807,13 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
 {
   if (!file) return HA_ADMIN_INTERNAL_ERROR;
   int error;
-  MI_CHECK param;
+  HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
   MYISAM_SHARE* share = file->s;
   const char *old_proc_info=thd->proc_info;
 
+  if (!&param)
+    return HA_ADMIN_INTERNAL_ERROR;
+
   thd_proc_info(thd, "Checking table");
   myisamchk_init(&param);
   param.thd = thd;
@@ -803,7 +821,7 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
   param.db_name=    table->s->db.str;
   param.table_name= table->alias;
   param.testflag = check_opt->flags | T_CHECK | T_SILENT;
-  param.stats_method= (enum_mi_stats_method)THDVAR(thd, stats_method);
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd, stats_method);
 
   if (!(table->db_stat & HA_READ_ONLY))
     param.testflag|= T_STATISTICS;
@@ -832,13 +850,13 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
 	  (param.testflag & (T_EXTEND | T_MEDIUM)))) ||
 	mi_is_crashed(file))
     {
-      uint old_testflag=param.testflag;
+      ulonglong old_testflag= param.testflag;
       param.testflag|=T_MEDIUM;
       if (!(error= init_io_cache(&param.read_cache, file->dfile,
                                  my_default_record_cache_size, READ_CACHE,
                                  share->pack.header_length, 1, MYF(MY_WME))))
       {
-        error= chk_data_link(&param, file, param.testflag & T_EXTEND);
+        error= chk_data_link(&param, file, test(param.testflag & T_EXTEND));
         end_io_cache(&(param.read_cache));
       }
       param.testflag= old_testflag;
@@ -884,9 +902,12 @@ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
 int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt)
 {
   int error=0;
-  MI_CHECK param;
+  HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
   MYISAM_SHARE* share = file->s;
 
+  if (!&param)
+    return HA_ADMIN_INTERNAL_ERROR;
+
   myisamchk_init(&param);
   param.thd = thd;
   param.op_name=    "analyze";
@@ -895,7 +916,7 @@ int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt)
   param.testflag= (T_FAST | T_CHECK | T_SILENT | T_STATISTICS |
                    T_DONT_CHECK_CHECKSUM);
   param.using_global_keycache = 1;
-  param.stats_method= (enum_mi_stats_method)THDVAR(thd, stats_method);
+  param.stats_method= (enum_handler_stats_method)THDVAR(thd, stats_method);
 
   if (!(share->state.changed & STATE_NOT_ANALYZED))
     return HA_ADMIN_ALREADY_DONE;
@@ -916,10 +937,10 @@ int ha_myisam::analyze(THD *thd, HA_CHECK_OPT* check_opt)
 int ha_myisam::repair(THD* thd, HA_CHECK_OPT *check_opt)
 {
   int error;
-  MI_CHECK param;
+  HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
   ha_rows start_records;
 
-  if (!file) return HA_ADMIN_INTERNAL_ERROR;
+  if (!file || !&param) return HA_ADMIN_INTERNAL_ERROR;
 
   myisamchk_init(&param);
   param.thd = thd;
@@ -935,7 +956,9 @@ int ha_myisam::repair(THD* thd, HA_CHECK_OPT *check_opt)
     if (test_all_bits(param.testflag,
 		      (uint) (T_RETRY_WITHOUT_QUICK | T_QUICK)))
     {
-      param.testflag&= ~T_RETRY_WITHOUT_QUICK;
+      param.testflag&= ~(T_RETRY_WITHOUT_QUICK | T_QUICK);
+      /* Ensure we don't lose any rows when retrying without quick */
+      param.testflag|= T_SAFE_REPAIR;
       sql_print_information("Retrying repair of: '%s' without quick",
                             table->s->path.str);
       continue;
@@ -965,8 +988,9 @@ int ha_myisam::repair(THD* thd, HA_CHECK_OPT *check_opt)
 int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt)
 {
   int error;
-  if (!file) return HA_ADMIN_INTERNAL_ERROR;
-  MI_CHECK param;
+  HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
+
+  if (!file || !&param) return HA_ADMIN_INTERNAL_ERROR;
 
   myisamchk_init(&param);
   param.thd = thd;
@@ -985,10 +1009,10 @@ int ha_myisam::optimize(THD* thd, HA_CHECK_OPT *check_opt)
 }
 
 
-int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
+int ha_myisam::repair(THD *thd, HA_CHECK &param, bool do_optimize)
 {
   int error=0;
-  uint local_testflag=param.testflag;
+  ulonglong local_testflag= param.testflag;
   bool optimize_done= !do_optimize, statistics_done=0;
   const char *old_proc_info=thd->proc_info;
   char fixed_name[FN_REFLEN];
@@ -1024,7 +1048,7 @@ int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
     ulonglong key_map= ((local_testflag & T_CREATE_MISSING_KEYS) ?
 			mi_get_mask_all_keys_active(share->base.keys) :
 			share->state.key_map);
-    uint testflag=param.testflag;
+    ulonglong testflag= param.testflag;
     if (mi_test_if_sort_rep(file,file->state->records,key_map,0) &&
 	(local_testflag & T_REP_BY_SORT))
     {
@@ -1038,7 +1062,7 @@ int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
         my_snprintf(buf, 40, "Repair with %d threads", my_count_bits(key_map));
         thd_proc_info(thd, buf);
         error = mi_repair_parallel(&param, file, fixed_name,
-            param.testflag & T_QUICK);
+                                   test(param.testflag & T_QUICK));
         thd_proc_info(thd, "Repair done"); // to reset proc_info, as
                                       // it was pointing to local buffer
       }
@@ -1046,7 +1070,7 @@ int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
       {
         thd_proc_info(thd, "Repair by sorting");
         error = mi_repair_by_sort(&param, file, fixed_name,
-            param.testflag & T_QUICK);
+                                  test(param.testflag & T_QUICK));
       }
     }
     else
@@ -1054,9 +1078,9 @@ int ha_myisam::repair(THD *thd, MI_CHECK &param, bool do_optimize)
       thd_proc_info(thd, "Repair with keycache");
       param.testflag &= ~T_REP_BY_SORT;
       error=  mi_repair(&param, file, fixed_name,
-			param.testflag & T_QUICK);
+			test(param.testflag & T_QUICK));
     }
-    param.testflag=testflag;
+    param.testflag= testflag | (param.testflag & T_RETRY_WITHOUT_QUICK);
     optimize_done=1;
   }
   if (!error)
@@ -1160,7 +1184,10 @@ int ha_myisam::assign_to_keycache(THD* thd, HA_CHECK_OPT *check_opt)
   if (error != HA_ADMIN_OK)
   {
     /* Send error to user */
-    MI_CHECK param;
+    HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
+    if (!&param)
+      return HA_ADMIN_INTERNAL_ERROR;
+
     myisamchk_init(&param);
     param.thd= thd;
     param.op_name=    "assign_to_keycache";
@@ -1224,7 +1251,9 @@ int ha_myisam::preload_keys(THD* thd, HA_CHECK_OPT *check_opt)
 
  err:
   {
-    MI_CHECK param;
+    HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
+    if (!&param)
+      return HA_ADMIN_INTERNAL_ERROR;
     myisamchk_init(&param);
     param.thd= thd;
     param.op_name=    "preload_keys";
@@ -1334,8 +1363,12 @@ int ha_myisam::enable_indexes(uint mode)
   else if (mode == HA_KEY_SWITCH_NONUNIQ_SAVE)
   {
     THD *thd=current_thd;
-    MI_CHECK param;
+    HA_CHECK &param= *(HA_CHECK*) thd->alloc(sizeof(param));
     const char *save_proc_info=thd->proc_info;
+
+    if (!&param)
+      return HA_ADMIN_INTERNAL_ERROR;
+
     thd_proc_info(thd, "Creating index");
     myisamchk_init(&param);
     param.op_name= "recreating_index";
@@ -1343,7 +1376,7 @@ int ha_myisam::enable_indexes(uint mode)
                      T_CREATE_MISSING_KEYS);
     param.myf_rw&= ~MY_WAIT_IF_FULL;
     param.sort_buffer_length=  THDVAR(thd, sort_buffer_size);
-    param.stats_method= (enum_mi_stats_method)THDVAR(thd, stats_method);
+    param.stats_method= (enum_handler_stats_method)THDVAR(thd, stats_method);
     param.tmpdir=&mysql_tmpdir_list;
     if ((error= (repair(thd,param,0) != HA_ADMIN_OK)) && param.retry_repair)
     {
@@ -1471,7 +1504,7 @@ int ha_myisam::end_bulk_insert()
 {
   mi_end_bulk_insert(file);
   int err=mi_extra(file, HA_EXTRA_NO_CACHE, 0);
-  if (!err)
+  if (!err && !file->s->deleting)
   {
     if (can_enable_indexes)
     {
@@ -1539,7 +1572,6 @@ bool ha_myisam::is_crashed() const
 
 int ha_myisam::update_row(const uchar *old_data, uchar *new_data)
 {
-  ha_statistic_increment(&SSV::ha_update_count);
   if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
     table->timestamp_field->set_time();
   return mi_update(file,old_data,new_data);
@@ -1547,19 +1579,58 @@ int ha_myisam::update_row(const uchar *old_data, uchar *new_data)
 
 int ha_myisam::delete_row(const uchar *buf)
 {
-  ha_statistic_increment(&SSV::ha_delete_count);
   return mi_delete(file,buf);
 }
 
+
+C_MODE_START
+
+ICP_RESULT index_cond_func_myisam(void *arg) /* arg is a ha_myisam* */
+{
+  ha_myisam *h= (ha_myisam*)arg;
+  if (h->end_range) /* an end_range is set: compare against it first */
+  {
+    if (h->compare_key2(h->end_range) > 0)
+      return ICP_OUT_OF_RANGE; /* caller should return HA_ERR_END_OF_FILE already */
+  }
+  return (ICP_RESULT) test(h->pushed_idx_cond->val_int()); /* pushed cond */
+}
+
+C_MODE_END
+
+
+int ha_myisam::index_init(uint idx, bool sorted) /* sorted is not used here */
+{ 
+  active_index=idx;
+  if (pushed_idx_cond_keyno == idx) /* register callback for this key only */
+    mi_set_index_cond_func(file, index_cond_func_myisam, this);
+  return 0; 
+}
+
+
+int ha_myisam::index_end() /* always returns 0 */
+{
+  active_index=MAX_KEY;
+  //pushed_idx_cond_keyno= MAX_KEY;
+  mi_set_index_cond_func(file, NULL, 0); /* unregister the ICP callback */
+  in_range_check_pushed_down= FALSE;
+  ds_mrr.dsmrr_close();
+  return 0; 
+}
+
+int ha_myisam::rnd_end() /* only closes ds_mrr; always returns 0 */
+{
+  ds_mrr.dsmrr_close();
+  return 0;
+}
+
 int ha_myisam::index_read_map(uchar *buf, const uchar *key,
                               key_part_map keypart_map,
                               enum ha_rkey_function find_flag)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_key_count);
   int error=mi_rkey(file, buf, active_index, key, keypart_map, find_flag);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
@@ -1569,34 +1640,16 @@ int ha_myisam::index_read_idx_map(uchar *buf, uint index, const uchar *key,
                                   enum ha_rkey_function find_flag)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
-  ha_statistic_increment(&SSV::ha_read_key_count);
   int error=mi_rkey(file, buf, index, key, keypart_map, find_flag);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
 
-int ha_myisam::index_read_last_map(uchar *buf, const uchar *key,
-                                   key_part_map keypart_map)
-{
-  MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
-  DBUG_ENTER("ha_myisam::index_read_last");
-  DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_key_count);
-  int error=mi_rkey(file, buf, active_index, key, keypart_map,
-                    HA_READ_PREFIX_LAST);
-  table->status=error ? STATUS_NOT_FOUND: 0;
-  MYSQL_INDEX_READ_ROW_DONE(error);
-  DBUG_RETURN(error);
-}
-
 int ha_myisam::index_next(uchar *buf)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_next_count);
   int error=mi_rnext(file,buf,active_index);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
@@ -1605,9 +1658,7 @@ int ha_myisam::index_prev(uchar *buf)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_prev_count);
   int error=mi_rprev(file,buf, active_index);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
@@ -1616,9 +1667,7 @@ int ha_myisam::index_first(uchar *buf)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_first_count);
   int error=mi_rfirst(file, buf, active_index);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
@@ -1627,10 +1676,9 @@ int ha_myisam::index_last(uchar *buf)
 {
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
   DBUG_ASSERT(inited==INDEX);
-  ha_statistic_increment(&SSV::ha_read_last_count);
   int error=mi_rlast(file, buf, active_index);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
+#warning move that to wrappers
   return error;
 }
 
@@ -1641,12 +1689,10 @@ int ha_myisam::index_next_same(uchar *buf,
   int error;
   DBUG_ASSERT(inited==INDEX);
   MYSQL_INDEX_READ_ROW_START(table_share->db.str, table_share->table_name.str);
-  ha_statistic_increment(&SSV::ha_read_next_count);
   do
   {
     error= mi_rnext_same(file,buf);
   } while (error == HA_ERR_RECORD_DELETED);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_INDEX_READ_ROW_DONE(error);
   return error;
 }
@@ -1663,25 +1709,27 @@ int ha_myisam::rnd_next(uchar *buf)
 {
   MYSQL_READ_ROW_START(table_share->db.str, table_share->table_name.str,
                        TRUE);
-  ha_statistic_increment(&SSV::ha_read_rnd_next_count);
   int error=mi_scan(file, buf);
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_READ_ROW_DONE(error);
   return error;
 }
 
-int ha_myisam::restart_rnd_next(uchar *buf, uchar *pos)
+int ha_myisam::remember_rnd_pos()
 {
-  return rnd_pos(buf,pos);
+  position((uchar*) 0);
+  return 0;
+}
+
+int ha_myisam::restart_rnd_next(uchar *buf)
+{
+  return rnd_pos(buf, ref);
 }
 
 int ha_myisam::rnd_pos(uchar *buf, uchar *pos)
 {
   MYSQL_READ_ROW_START(table_share->db.str, table_share->table_name.str,
                        FALSE);
-  ha_statistic_increment(&SSV::ha_read_rnd_count);
   int error=mi_rrnd(file, buf, my_get_ptr(pos,ref_length));
-  table->status=error ? STATUS_NOT_FOUND: 0;
   MYSQL_READ_ROW_DONE(error);
   return error;
 }
@@ -1714,6 +1762,16 @@ int ha_myisam::info(uint flag)
     stats.max_data_file_length=  misam_info.max_data_file_length;
     stats.max_index_file_length= misam_info.max_index_file_length;
     stats.create_time= (ulong) misam_info.create_time;
+    /* 
+      We want the value of stats.mrr_length_per_rec to be platform independent.
+      The size of the chunk at the end of the join buffer used for MRR needs
+      is now calculated based on the values passed in the stats structure.
+      The remaining part of the join buffer is used for records. A different
+      number of records in the buffer results in a different number of buffer
+      refills and in a different order of records in the result set.
+    */
+    stats.mrr_length_per_rec= misam_info.reflength + 8; // 8=max(sizeof(void *))
+
     ref_length= misam_info.reflength;
     share->db_options_in_use= misam_info.options;
     stats.block_size= myisam_block_size;        /* record block size */
@@ -1769,8 +1827,13 @@ int ha_myisam::extra(enum ha_extra_function operation)
   return mi_extra(file, operation, 0);
 }
 
+
 int ha_myisam::reset(void)
 {
+  pushed_idx_cond= NULL;
+  pushed_idx_cond_keyno= MAX_KEY;
+  mi_set_index_cond_func(file, NULL, 0);
+  ds_mrr.dsmrr_close();
   return mi_reset(file);
 }
 
@@ -1901,7 +1964,7 @@ void ha_myisam::get_auto_increment(ulonglong offset, ulonglong increment,
 {
   ulonglong nr;
   int error;
-  uchar key[MI_MAX_KEY_LENGTH];
+  uchar key[HA_MAX_KEY_LENGTH];
 
   if (!table->s->next_number_key_offset)
   {						// Autoincrement at key-start
@@ -1985,8 +2048,6 @@ int ha_myisam::ft_read(uchar *buf)
 			&LOCK_status); // why ?
 
   error=ft_handler->please->read_next(ft_handler,(char*) buf);
-
-  table->status=error ? STATUS_NOT_FOUND: 0;
   return error;
 }
 
@@ -2016,6 +2077,27 @@ bool ha_myisam::check_if_incompatible_data(HA_CREATE_INFO *info,
   return COMPATIBLE_DATA_YES;
 }
 
+
+/**
+  Check if a table is incompatible with the current version.
+
+  Returns HA_ADMIN_NEEDS_ALTER when the table has varchar fields but neither
+  HA_OPTION_NULL_FIELDS nor HA_OPTION_PACK_RECORD is set, else HA_ADMIN_OK.
+*/
+
+int ha_myisam::check_for_upgrade(HA_CHECK_OPT *check_opt)
+{
+  if (!(file->s->options & HA_OPTION_NULL_FIELDS) &&
+      !(file->s->options & HA_OPTION_PACK_RECORD) &&
+      file->s->has_varchar_fields)
+  {
+    /* An ALTER is needed here to get the HA_OPTION_NULL_FIELDS flag set */
+    return HA_ADMIN_NEEDS_ALTER;
+  }
+  return HA_ADMIN_OK;
+}
+
+
 extern int mi_panic(enum ha_panic_function flag);
 int myisam_panic(handlerton *hton, ha_panic_function flag)
 {
@@ -2060,6 +2142,61 @@ static struct st_mysql_sys_var* myisam_sysvars[]= {
   0
 };
 
+/****************************************************************************
+ * MyISAM MRR implementation: use DS-MRR
+ ***************************************************************************/
+
+int ha_myisam::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                                     uint n_ranges, uint mode, 
+                                     HANDLER_BUFFER *buf)
+{
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+}
+
+int ha_myisam::multi_range_read_next(char **range_info)
+{
+  return ds_mrr.dsmrr_next(range_info);
+}
+
+ha_rows ha_myisam::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                               void *seq_init_param, 
+                                               uint n_ranges, uint *bufsz,
+                                               uint *flags, COST_VECT *cost)
+{
+  /*
+    This call is here because there is no location where this->table would
+    already be known.
+    TODO: consider moving it into some per-query initialization call.
+  */
+  ds_mrr.init(this, table);
+  return ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges, bufsz,
+                                 flags, cost);
+}
+
+ha_rows ha_myisam::multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                         uint *bufsz, uint *flags,
+                                         COST_VECT *cost)
+{
+  ds_mrr.init(this, table);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+}
+
+/* MyISAM MRR implementation ends */
+
+
+/* Index condition pushdown implementation */
+
+
+Item *ha_myisam::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
+{
+  pushed_idx_cond_keyno= keyno_arg;
+  pushed_idx_cond= idx_cond_arg;
+  in_range_check_pushed_down= TRUE;
+  if (active_index == pushed_idx_cond_keyno)
+    mi_set_index_cond_func(file, index_cond_func_myisam, this);
+  return NULL;
+}
+
 struct st_mysql_storage_engine myisam_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
@@ -2079,6 +2216,23 @@ mysql_declare_plugin(myisam)
   NULL
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(myisam)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &myisam_storage_engine,
+  "MyISAM",
+  "MySQL AB",
+  "Default engine as of MySQL 3.23 with great performance",
+  PLUGIN_LICENSE_GPL,
+  myisam_init, /* Plugin Init */
+  NULL, /* Plugin Deinit */
+  0x0100, /* 1.0 */
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
 
 
 #ifdef HAVE_QUERY_CACHE
diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h
index 1a8246ae882..61585013678 100644
--- a/storage/myisam/ha_myisam.h
+++ b/storage/myisam/ha_myisam.h
@@ -21,6 +21,7 @@
 /* class for the the myisam handler */
 
 #include <myisam.h>
+#include <myisamchk.h>
 #include <ft_global.h>
 #include "handler.h"                            /* handler */
 #include "table.h"                              /* TABLE_SHARE */
@@ -39,13 +40,17 @@ extern TYPELIB myisam_recover_typelib;
 extern const char *myisam_recover_names[];
 extern ulonglong myisam_recover_options;
 
+C_MODE_START
+ICP_RESULT index_cond_func_myisam(void *arg);
+C_MODE_END
+
 class ha_myisam: public handler
 {
   MI_INFO *file;
   ulonglong int_table_flags;
   char    *data_file_name, *index_file_name;
   bool can_enable_indexes;
-  int repair(THD *thd, MI_CHECK &param, bool optimize);
+  int repair(THD *thd, HA_CHECK &param, bool optimize);
 
  public:
   ha_myisam(handlerton *hton, TABLE_SHARE *table_arg);
@@ -55,15 +60,19 @@ class ha_myisam: public handler
   const char *index_type(uint key_number);
   const char **bas_ext() const;
   ulonglong table_flags() const { return int_table_flags; }
+  int index_init(uint idx, bool sorted);
+  int index_end();
+  int rnd_end();
+
   ulong index_flags(uint inx, uint part, bool all_parts) const
   {
     return ((table_share->key_info[inx].algorithm == HA_KEY_ALG_FULLTEXT) ?
             0 : HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE |
-            HA_READ_ORDER | HA_KEYREAD_ONLY);
+            HA_READ_ORDER | HA_KEYREAD_ONLY | HA_DO_INDEX_COND_PUSHDOWN);
   }
   uint max_supported_keys()          const { return MI_MAX_KEY; }
-  uint max_supported_key_length()    const { return MI_MAX_KEY_LENGTH; }
-  uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; }
+  uint max_supported_key_length()    const { return HA_MAX_KEY_LENGTH; }
+  uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; }
   uint checksum() const;
 
   int open(const char *name, int mode, uint test_if_locked);
@@ -76,7 +85,6 @@ class ha_myisam: public handler
   int index_read_idx_map(uchar *buf, uint index, const uchar *key,
                          key_part_map keypart_map,
                          enum ha_rkey_function find_flag);
-  int index_read_last_map(uchar *buf, const uchar *key, key_part_map keypart_map);
   int index_next(uchar * buf);
   int index_prev(uchar * buf);
   int index_first(uchar * buf);
@@ -99,7 +107,8 @@ class ha_myisam: public handler
   int rnd_init(bool scan);
   int rnd_next(uchar *buf);
   int rnd_pos(uchar * buf, uchar *pos);
-  int restart_rnd_next(uchar *buf, uchar *pos);
+  int remember_rnd_pos();
+  int restart_rnd_next(uchar *buf);
   void position(const uchar *record);
   int info(uint);
   int extra(enum ha_extra_function operation);
@@ -124,6 +133,7 @@ class ha_myisam: public handler
                                   ulonglong *nb_reserved_values);
   int rename_table(const char * from, const char * to);
   int delete_table(const char *name);
+  int check_for_upgrade(HA_CHECK_OPT *check_opt);
   int check(THD* thd, HA_CHECK_OPT* check_opt);
   int analyze(THD* thd,HA_CHECK_OPT* check_opt);
   int repair(THD* thd, HA_CHECK_OPT* check_opt);
@@ -134,6 +144,7 @@ class ha_myisam: public handler
   int assign_to_keycache(THD* thd, HA_CHECK_OPT* check_opt);
   int preload_keys(THD* thd, HA_CHECK_OPT* check_opt);
   bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes);
+  bool check_if_supported_virtual_columns(void) { return TRUE;}
 #ifdef HAVE_QUERY_CACHE
   my_bool register_query_cache_table(THD *thd, char *table_key,
                                      uint key_length,
@@ -145,4 +156,23 @@ class ha_myisam: public handler
   {
     return file;
   }
+public:
+  /**
+   * Multi Range Read interface
+   */
+  int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  int multi_range_read_next(char **range_info);
+  ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                      void *seq_init_param, 
+                                      uint n_ranges, uint *bufsz,
+                                      uint *flags, COST_VECT *cost);
+  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                uint *bufsz, uint *flags, COST_VECT *cost);
+  
+  /* Index condition pushdown implementation */
+  Item *idx_cond_push(uint keyno, Item* idx_cond);
+private:
+  DsMrr_impl ds_mrr;
+  friend ICP_RESULT index_cond_func_myisam(void *arg);
 };
diff --git a/storage/myisam/mi_cache.c b/storage/myisam/mi_cache.c
index 139a50a7c0d..ddbfda11326 100644
--- a/storage/myisam/mi_cache.c
+++ b/storage/myisam/mi_cache.c
@@ -98,8 +98,8 @@ int _mi_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos, uint length,
     DBUG_PRINT("error",
                ("Error %d reading next-multi-part block (Got %d bytes)",
                 my_errno, (int) read_length));
-    if (!my_errno || my_errno == -1)
-      my_errno=HA_ERR_WRONG_IN_RECORD;
+    if (!my_errno || my_errno == -1 || my_errno == HA_ERR_FILE_TOO_SHORT)
+      my_errno= HA_ERR_WRONG_IN_RECORD;
     DBUG_RETURN(1);
   }
   bzero(buff+read_length,MI_BLOCK_INFO_HEADER_LENGTH - in_buff_length -
diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c
index 6bf01cd63c7..89644e5f978 100644
--- a/storage/myisam/mi_check.c
+++ b/storage/myisam/mi_check.c
@@ -54,14 +54,14 @@
 
 	/* Functions defined in this file */
 
-static int check_k_link(MI_CHECK *param, MI_INFO *info,uint nr);
-static int chk_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
+static int check_k_link(HA_CHECK *param, MI_INFO *info,uint nr);
+static int chk_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
 		     my_off_t page, uchar *buff, ha_rows *keys,
 		     ha_checksum *key_checksum, uint level);
 static uint isam_key_length(MI_INFO *info,MI_KEYDEF *keyinfo);
 static ha_checksum calc_checksum(ha_rows count);
 static int writekeys(MI_SORT_PARAM *sort_param);
-static int sort_one_index(MI_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
+static int sort_one_index(HA_CHECK *param, MI_INFO *info,MI_KEYDEF *keyinfo,
 			  my_off_t pagepos, File new_file);
 static int sort_key_read(MI_SORT_PARAM *sort_param,void *key);
 static int sort_ft_key_read(MI_SORT_PARAM *sort_param,void *key);
@@ -75,19 +75,19 @@ static int sort_insert_key(MI_SORT_PARAM  *sort_param,
                            reg1 SORT_KEY_BLOCKS *key_block,
 			   uchar *key, my_off_t prev_block);
 static int sort_delete_record(MI_SORT_PARAM *sort_param);
-/*static int flush_pending_blocks(MI_CHECK *param);*/
-static SORT_KEY_BLOCKS	*alloc_key_blocks(MI_CHECK *param, uint blocks,
+/*static int flush_pending_blocks(HA_CHECK *param);*/
+static SORT_KEY_BLOCKS	*alloc_key_blocks(HA_CHECK *param, uint blocks,
 					  uint buffer_length);
 static ha_checksum mi_byte_checksum(const uchar *buf, uint length);
-static void set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share);
+static void set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share);
 
-void myisamchk_init(MI_CHECK *param)
+void myisamchk_init(HA_CHECK *param)
 {
   bzero((uchar*) param,sizeof(*param));
+  /* Set all params that are not 0 */
   param->opt_follow_links=1;
   param->keys_in_use= ~(ulonglong) 0;
   param->search_after_block=HA_OFFSET_ERROR;
-  param->auto_increment_value= 0;
   param->use_buffers=USE_BUFFER_INIT;
   param->read_buffer_length=READ_BUFFER_INIT;
   param->write_buffer_length=READ_BUFFER_INIT;
@@ -95,18 +95,14 @@ void myisamchk_init(MI_CHECK *param)
   param->sort_key_blocks=BUFFERS_WHEN_SORTING;
   param->tmpfile_createflag=O_RDWR | O_TRUNC | O_EXCL;
   param->myf_rw=MYF(MY_NABP | MY_WME | MY_WAIT_IF_FULL);
-  param->start_check_pos=0;
   param->max_record_length= LONGLONG_MAX;
   param->key_cache_block_size= KEY_CACHE_BLOCK_SIZE;
   param->stats_method= MI_STATS_METHOD_NULLS_NOT_EQUAL;
-#ifdef THREAD
-  param->need_print_msg_lock= 0;
-#endif
 }
 
 	/* Check the status flags for the table */
 
-int chk_status(MI_CHECK *param, register MI_INFO *info)
+int chk_status(HA_CHECK *param, register MI_INFO *info)
 {
   MYISAM_SHARE *share=info->s;
 
@@ -134,7 +130,7 @@ int chk_status(MI_CHECK *param, register MI_INFO *info)
 
 	/* Check delete links */
 
-int chk_del(MI_CHECK *param, register MI_INFO *info, uint test_flag)
+int chk_del(HA_CHECK *param, register MI_INFO *info, ulonglong test_flag)
 {
   reg2 ha_rows i;
   uint delete_link_length;
@@ -164,7 +160,7 @@ int chk_del(MI_CHECK *param, register MI_INFO *info, uint test_flag)
     empty=0;
     for (i= info->state->del ; i > 0L && next_link != HA_OFFSET_ERROR ; i--)
     {
-      if (*killed_ptr(param))
+      if (killed_ptr(param))
         DBUG_RETURN(1);
       if (test_flag & T_VERBOSE)
 	printf(" %9s",llstr(next_link,buff));
@@ -242,7 +238,7 @@ wrong:
 
 	/* Check delete links in index file */
 
-static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr)
+static int check_k_link(HA_CHECK *param, register MI_INFO *info, uint nr)
 {
   my_off_t next_link;
   uint block_size=(nr+1)*MI_MIN_KEY_BLOCK_LENGTH;
@@ -259,7 +255,7 @@ static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr)
   records= (ha_rows) (info->state->key_file_length / block_size);
   while (next_link != HA_OFFSET_ERROR && records > 0)
   {
-    if (*killed_ptr(param))
+    if (killed_ptr(param))
       DBUG_RETURN(1);
     if (param->testflag & T_VERBOSE)
       printf("%16s",llstr(next_link,llbuff));
@@ -320,7 +316,7 @@ static int check_k_link(MI_CHECK *param, register MI_INFO *info, uint nr)
 
 	/* Check sizes of files */
 
-int chk_size(MI_CHECK *param, register MI_INFO *info)
+int chk_size(HA_CHECK *param, register MI_INFO *info)
 {
   int error=0;
   register my_off_t skr,size;
@@ -331,7 +327,8 @@ int chk_size(MI_CHECK *param, register MI_INFO *info)
 
   /* The following is needed if called externally (not from myisamchk) */
   flush_key_blocks(info->s->key_cache,
-		   info->s->kfile, FLUSH_FORCE_WRITE);
+		   info->s->kfile, &info->s->dirty_part_map,
+                   FLUSH_FORCE_WRITE);
 
   size= mysql_file_seek(info->s->kfile, 0L, MY_SEEK_END, MYF(MY_THREADSAFE));
   if ((skr=(my_off_t) info->state->key_file_length) != size)
@@ -396,7 +393,7 @@ int chk_size(MI_CHECK *param, register MI_INFO *info)
 
 	/* Check keys */
 
-int chk_key(MI_CHECK *param, register MI_INFO *info)
+int chk_key(HA_CHECK *param, register MI_INFO *info)
 {
   uint key,found_keys=0,full_text_keys=0,result=0;
   ha_rows keys;
@@ -581,7 +578,7 @@ do_stat:
 } /* chk_key */
 
 
-static int chk_index_down(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int chk_index_down(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
                      my_off_t page, uchar *buff, ha_rows *keys,
                      ha_checksum *key_checksum, uint level)
 {
@@ -728,7 +725,7 @@ int mi_collect_stats_nonulls_next(HA_KEYSEG *keyseg, ulonglong *notnull,
 
 	/* Check if index is ok */
 
-static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int chk_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
 		     my_off_t page, uchar *buff, ha_rows *keys,
 		     ha_checksum *key_checksum, uint level)
 {
@@ -752,7 +749,10 @@ static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
   }
 
   if (keyinfo->flag & HA_NOSAME)
-    comp_flag=SEARCH_FIND | SEARCH_UPDATE;	/* Not real duplicates */
+  {
+    /* Not real duplicates */
+    comp_flag= SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT;
+  }
   else
     comp_flag=SEARCH_SAME;			/* Keys in positionorder */
   nod_flag=mi_test_if_nod(buff);
@@ -773,7 +773,7 @@ static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
   }
   for ( ;; )
   {
-    if (*killed_ptr(param))
+    if (killed_ptr(param))
       goto err;
     memcpy((char*) info->lastkey,(char*) key,key_length);
     info->lastkey_length=key_length;
@@ -797,9 +797,9 @@ static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
 	(flag=ha_key_cmp(keyinfo->seg,info->lastkey,key,key_length,
 			 comp_flag, diff_pos)) >=0)
     {
-      DBUG_DUMP("old",(uchar*) info->lastkey, info->lastkey_length);
-      DBUG_DUMP("new",(uchar*) key, key_length);
-      DBUG_DUMP("new_in_page",(uchar*) old_keypos,(uint) (keypos-old_keypos));
+      DBUG_DUMP("old",info->lastkey, info->lastkey_length);
+      DBUG_DUMP("new",key, key_length);
+      DBUG_DUMP("new_in_page",old_keypos,(uint) (keypos-old_keypos));
 
       if (comp_flag & SEARCH_FIND && flag == 0)
 	mi_check_print_error(param,"Found duplicated key at page %s",llstr(page,llbuff));
@@ -868,8 +868,8 @@ static int chk_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
       DBUG_PRINT("test",("page: %s  record: %s  filelength: %s",
 			 llstr(page,llbuff),llstr(record,llbuff2),
 			 llstr(info->state->data_file_length,llbuff3)));
-      DBUG_DUMP("key",(uchar*) key,key_length);
-      DBUG_DUMP("new_in_page",(uchar*) old_keypos,(uint) (keypos-old_keypos));
+      DBUG_DUMP("key",key,key_length);
+      DBUG_DUMP("new_in_page",old_keypos,(uint) (keypos-old_keypos));
       goto err;
     }
     param->record_checksum+=(ha_checksum) record;
@@ -931,7 +931,7 @@ static uint isam_key_length(MI_INFO *info, register MI_KEYDEF *keyinfo)
 
 	/* Check that record-link is ok */
 
-int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
+int chk_data_link(HA_CHECK *param, MI_INFO *info, my_bool extend)
 {
   int	error,got_error,flag;
   uint	key,UNINIT_VAR(left_length),b_type,field;
@@ -985,9 +985,12 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
   bzero((char*) key_checksum, info->s->base.keys * sizeof(key_checksum[0]));
   while (pos < info->state->data_file_length)
   {
-    if (*killed_ptr(param))
+    if (killed_ptr(param))
       goto err2;
     switch (info->s->data_file_type) {
+    case BLOCK_RECORD:
+      DBUG_ASSERT(0);                           /* Impossible */
+      break;
     case STATIC_RECORD:
       if (my_b_read(&param->read_cache,(uchar*) record,
 		    info->s->base.pack_reclength))
@@ -1001,7 +1004,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
 	del_length+=info->s->base.pack_reclength;
 	continue;					/* Record removed */
       }
-      param->glob_crc+= mi_static_checksum(info,record);
+      param->glob_crc+= (*info->s->calc_check_checksum)(info,record);
       used+=info->s->base.pack_reclength;
       break;
     case DYNAMIC_RECORD:
@@ -1155,7 +1158,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
 	}
 	else
 	{
-	  info->checksum=mi_checksum(info,record);
+	  info->checksum= (*info->s->calc_check_checksum)(info,record);
 	  if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE))
 	  {
 	    if (_mi_rec_check(info,record, info->rec_buff,block_info.rec_len,
@@ -1201,15 +1204,10 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
 			     llstr(start_recpos,llbuff));
 	got_error=1;
       }
-      if (static_row_size)
-	param->glob_crc+= mi_static_checksum(info,record);
-      else
-	param->glob_crc+= mi_checksum(info,record);
+      param->glob_crc+= (*info->s->calc_check_checksum)(info,record);
       link_used+= (block_info.filepos - start_recpos);
       used+= (pos-start_recpos);
       break;
-    case BLOCK_RECORD:
-      assert(0);                                /* Impossible */
     } /* switch */
     if (! got_error)
     {
@@ -1335,7 +1333,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
   if (splits != info->s->state.split)
   {
     mi_check_print_warning(param,
-			   "Found %10s key parts. Should be: %s",
+			   "Found %10s parts. Should be: %s",
 			   llstr(splits,llbuff),
 			   llstr(info->s->state.split,llbuff2));
   }
@@ -1428,7 +1426,7 @@ int chk_data_link(MI_CHECK *param, MI_INFO *info,int extend)
     then recrate all indexes.
 */
 
-static int mi_drop_all_indexes(MI_CHECK *param, MI_INFO *info, my_bool force)
+static int mi_drop_all_indexes(HA_CHECK *param, MI_INFO *info, my_bool force)
 {
   MYISAM_SHARE *share= info->s;
   MI_STATE_INFO *state= &share->state;
@@ -1472,6 +1470,7 @@ static int mi_drop_all_indexes(MI_CHECK *param, MI_INFO *info, my_bool force)
       */
       DBUG_PRINT("repair", ("all disabled are empty: create missing"));
       error= flush_key_blocks(share->key_cache, share->kfile,
+                              &share->dirty_part_map,
                               FLUSH_FORCE_WRITE);
       goto end;
     }
@@ -1486,6 +1485,7 @@ static int mi_drop_all_indexes(MI_CHECK *param, MI_INFO *info, my_bool force)
 
   /* Remove all key blocks of this index file from key cache. */
   if ((error= flush_key_blocks(share->key_cache, share->kfile,
+                               &share->dirty_part_map,
                                FLUSH_IGNORE_CHANGED)))
     goto end; /* purecov: inspected */
 
@@ -1511,7 +1511,7 @@ static int mi_drop_all_indexes(MI_CHECK *param, MI_INFO *info, my_bool force)
 	/* Recover old table by reading each record and writing all keys */
 	/* Save new datafile-name in temp_filename */
 
-int mi_repair(MI_CHECK *param, register MI_INFO *info,
+int mi_repair(HA_CHECK *param, register MI_INFO *info,
 	      char * name, int rep_quick)
 {
   int error,got_error;
@@ -1520,7 +1520,7 @@ int mi_repair(MI_CHECK *param, register MI_INFO *info,
   File new_file;
   MYISAM_SHARE *share=info->s;
   char llbuff[22],llbuff2[22];
-  SORT_INFO sort_info;
+  MI_SORT_INFO sort_info;
   MI_SORT_PARAM sort_param;
   DBUG_ENTER("mi_repair");
 
@@ -1547,7 +1547,7 @@ int mi_repair(MI_CHECK *param, register MI_INFO *info,
 
   if (!param->using_global_keycache)
     (void) init_key_cache(dflt_key_cache, param->key_cache_block_size,
-                        param->use_buffers, 0, 0);
+                        param->use_buffers, 0, 0, 0);
 
   if (init_io_cache(&param->read_cache,info->dfile,
 		    (uint) param->read_buffer_length,
@@ -1770,7 +1770,8 @@ err:
   (void) end_io_cache(&param->read_cache);
   info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
   (void) end_io_cache(&info->rec_cache);
-  got_error|=flush_blocks(param, share->key_cache, share->kfile);
+  got_error|=flush_blocks(param, share->key_cache, share->kfile,
+                          &share->dirty_part_map);
   if (!got_error && param->testflag & T_UNPACK)
   {
     share->state.header.options[0]&= (uchar) ~HA_OPTION_COMPRESS_RECORD;
@@ -1900,7 +1901,7 @@ int movepoint(register MI_INFO *info, uchar *record, my_off_t oldpos,
 
 	/* Tell system that we want all memory for our cache */
 
-void lock_memory(MI_CHECK *param __attribute__((unused)))
+void lock_memory(HA_CHECK *param __attribute__((unused)))
 {
 #ifdef SUN_OS				/* Key-cacheing thrases on sun 4.1 */
   if (param->opt_lock_memory)
@@ -1916,9 +1917,10 @@ void lock_memory(MI_CHECK *param __attribute__((unused)))
 
 	/* Flush all changed blocks to disk */
 
-int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file)
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file,
+                 ulonglong *dirty_part_map)
 {
-  if (flush_key_blocks(key_cache, file, FLUSH_RELEASE))
+  if (flush_key_blocks(key_cache, file, dirty_part_map, FLUSH_RELEASE))
   {
     mi_check_print_error(param,"%d when trying to write bufferts",my_errno);
     return(1);
@@ -1931,7 +1933,7 @@ int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file)
 
 	/* Sort index for more efficent reads */
 
-int mi_sort_index(MI_CHECK *param, register MI_INFO *info, char * name)
+int mi_sort_index(HA_CHECK *param, register MI_INFO *info, char * name)
 {
   reg2 uint key;
   reg1 MI_KEYDEF *keyinfo;
@@ -1987,7 +1989,8 @@ int mi_sort_index(MI_CHECK *param, register MI_INFO *info, char * name)
   }
 
   /* Flush key cache for this file if we are calling this outside myisamchk */
-  flush_key_blocks(share->key_cache,share->kfile, FLUSH_IGNORE_CHANGED);
+  flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+                   FLUSH_IGNORE_CHANGED);
 
   share->state.version=(ulong) time((time_t*) 0);
   old_state= share->state;			/* save state if not stored */
@@ -2034,7 +2037,7 @@ err2:
 
 	 /* Sort records recursive using one index */
 
-static int sort_one_index(MI_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
+static int sort_one_index(HA_CHECK *param, MI_INFO *info, MI_KEYDEF *keyinfo,
 			  my_off_t pagepos, File new_file)
 {
   uint length,nod_flag,used_length, key_length;
@@ -2145,12 +2148,12 @@ int change_to_newfile(const char * filename, const char * old_ext,
 	/* Locks a whole file */
 	/* Gives an error-message if file can't be locked */
 
-int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type,
+int lock_file(HA_CHECK *param, File file, my_off_t start, int lock_type,
 	      const char *filetype, const char *filename)
 {
   if (my_lock(file,lock_type,start,F_TO_EOF,
 	      param->testflag & T_WAIT_FOREVER ? MYF(MY_SEEK_NOT_DONE) :
-	      MYF(MY_SEEK_NOT_DONE |  MY_DONT_WAIT)))
+	      MYF(MY_SEEK_NOT_DONE |  MY_SHORT_WAIT)))
   {
     mi_check_print_error(param," %d when locking %s '%s'",my_errno,filetype,filename);
     param->error_printed=2;		/* Don't give that data is crashed */
@@ -2162,7 +2165,7 @@ int lock_file(MI_CHECK *param, File file, my_off_t start, int lock_type,
 
 	/* Copy a block between two files */
 
-int filecopy(MI_CHECK *param, File to,File from,my_off_t start,
+int filecopy(HA_CHECK *param, File to,File from,my_off_t start,
 	     my_off_t length, const char *type)
 {
   char tmp_buff[IO_SIZE],*buff;
@@ -2213,7 +2216,7 @@ err:
     <>0	Error
 */
 
-int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
+int mi_repair_by_sort(HA_CHECK *param, register MI_INFO *info,
 		      const char * name, int rep_quick)
 {
   int got_error;
@@ -2227,7 +2230,7 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
   HA_KEYSEG *keyseg;
   ulong   *rec_per_key_part;
   char llbuff[22];
-  SORT_INFO sort_info;
+  MI_SORT_INFO sort_info;
   ulonglong UNINIT_VAR(key_map);
   DBUG_ENTER("mi_repair_by_sort");
 
@@ -2495,7 +2498,7 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
     goto err;
   }
 
-  if (rep_quick & T_FORCE_UNIQUENESS)
+  if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
   {
     my_off_t skr=info->state->data_file_length+
       (share->options & HA_OPTION_COMPRESS_RECORD ?
@@ -2534,7 +2537,8 @@ int mi_repair_by_sort(MI_CHECK *param, register MI_INFO *info,
     memcpy( &share->state.state, info->state, sizeof(*info->state));
 
 err:
-  got_error|= flush_blocks(param, share->key_cache, share->kfile);
+  got_error|= flush_blocks(param, share->key_cache, share->kfile,
+                           &share->dirty_part_map);
   (void) end_io_cache(&info->rec_cache);
   if (!got_error)
   {
@@ -2626,7 +2630,7 @@ err:
     <>0	Error
 */
 
-int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
+int mi_repair_parallel(HA_CHECK *param, register MI_INFO *info,
 			const char * name, int rep_quick)
 {
 #ifndef THREAD
@@ -2645,7 +2649,7 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
   char llbuff[22];
   IO_CACHE new_data_cache; /* For non-quick repair. */
   IO_CACHE_SHARE io_share;
-  SORT_INFO sort_info;
+  MI_SORT_INFO sort_info;
   ulonglong UNINIT_VAR(key_map);
   pthread_attr_t thr_attr;
   ulong max_pack_reclength;
@@ -2669,14 +2673,14 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
   /*
     Quick repair (not touching data file, rebuilding indexes):
     {
-      Read  cache is (MI_CHECK *param)->read_cache using info->dfile.
+      Read  cache is (HA_CHECK *param)->read_cache using info->dfile.
     }
 
     Non-quick repair (rebuilding data file and indexes):
     {
       Master thread:
 
-        Read  cache is (MI_CHECK *param)->read_cache using info->dfile.
+        Read  cache is (HA_CHECK *param)->read_cache using info->dfile.
         Write cache is (MI_INFO   *info)->rec_cache  using new_file.
 
       Slave threads:
@@ -3017,7 +3021,7 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
     goto err;
   }
 
-  if (rep_quick & T_FORCE_UNIQUENESS)
+  if (rep_quick && (param->testflag & T_FORCE_UNIQUENESS))
   {
     my_off_t skr=info->state->data_file_length+
       (share->options & HA_OPTION_COMPRESS_RECORD ?
@@ -3055,7 +3059,8 @@ int mi_repair_parallel(MI_CHECK *param, register MI_INFO *info,
     memcpy(&share->state.state, info->state, sizeof(*info->state));
 
 err:
-  got_error|= flush_blocks(param, share->key_cache, share->kfile);
+  got_error|= flush_blocks(param, share->key_cache, share->kfile,
+                           &share->dirty_part_map);
   /*
     Destroy the write cache. The master thread did already detach from
     the share by remove_io_thread() or it was not yet started (if the
@@ -3128,7 +3133,7 @@ err:
 static int sort_key_read(MI_SORT_PARAM *sort_param, void *key)
 {
   int error;
-  SORT_INFO *sort_info=sort_param->sort_info;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
   MI_INFO *info=sort_info->info;
   DBUG_ENTER("sort_key_read");
 
@@ -3145,7 +3150,7 @@ static int sort_key_read(MI_SORT_PARAM *sort_param, void *key)
     (info->s->rec_reflength+
      _mi_make_key(info, sort_param->key, (uchar*) key,
 		  sort_param->record, sort_param->filepos));
-#ifdef HAVE_purify
+#ifdef HAVE_valgrind
   bzero(key+sort_param->real_key_length,
 	(sort_param->key_length-sort_param->real_key_length));
 #endif
@@ -3155,7 +3160,7 @@ static int sort_key_read(MI_SORT_PARAM *sort_param, void *key)
 static int sort_ft_key_read(MI_SORT_PARAM *sort_param, void *key)
 {
   int error;
-  SORT_INFO *sort_info=sort_param->sort_info;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
   MI_INFO *info=sort_info->info;
   FT_WORD *wptr=0;
   DBUG_ENTER("sort_ft_key_read");
@@ -3185,7 +3190,7 @@ static int sort_ft_key_read(MI_SORT_PARAM *sort_param, void *key)
   sort_param->real_key_length=(info->s->rec_reflength+
 			       _ft_make_key(info, sort_param->key,
 					    key, wptr++, sort_param->filepos));
-#ifdef HAVE_purify
+#ifdef HAVE_valgrind
   if (sort_param->key_length > sort_param->real_key_length)
     bzero(key+sort_param->real_key_length,
 	  (sort_param->key_length-sort_param->real_key_length));
@@ -3242,17 +3247,20 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
   my_off_t pos;
   uchar *UNINIT_VAR(to);
   MI_BLOCK_INFO block_info;
-  SORT_INFO *sort_info=sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   MI_INFO *info=sort_info->info;
   MYISAM_SHARE *share=info->s;
   char llbuff[22],llbuff2[22];
   DBUG_ENTER("sort_get_next_record");
 
-  if (*killed_ptr(param))
+  if (killed_ptr(param))
     DBUG_RETURN(1);
 
   switch (share->data_file_type) {
+  case BLOCK_RECORD:
+    DBUG_ASSERT(0);                           /* Impossible */
+    break;
   case STATIC_RECORD:
     for (;;)
     {
@@ -3277,7 +3285,9 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
       {
 	if (sort_param->calc_checksum)
 	  param->glob_crc+= (info->checksum=
-			     mi_static_checksum(info,sort_param->record));
+                             (*info->s->calc_check_checksum)(info,
+                                                             sort_param->
+                                                             record));
 	DBUG_RETURN(0);
       }
       if (!sort_param->fix_datafile && sort_param->master)
@@ -3553,7 +3563,8 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
 	if (sort_param->read_cache.error < 0)
 	  DBUG_RETURN(1);
 	if (sort_param->calc_checksum)
-	  info->checksum= mi_checksum(info, sort_param->record);
+	  info->checksum= (*info->s->calc_check_checksum)(info,
+                                                          sort_param->record);
 	if ((param->testflag & (T_EXTEND | T_REP)) || searching)
 	{
 	  if (_mi_rec_check(info, sort_param->record, sort_param->rec_buff,
@@ -3638,11 +3649,11 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param)
       info->packed_length=block_info.rec_len;
       if (sort_param->calc_checksum)
 	param->glob_crc+= (info->checksum=
-                           mi_checksum(info, sort_param->record));
+                           (*info->s->calc_check_checksum)(info,
+                                                           sort_param->
+                                                           record));
       DBUG_RETURN(0);
     }
-  case BLOCK_RECORD:
-    assert(0);                                  /* Impossible */
   }
   DBUG_RETURN(1);                               /* Impossible */
 }
@@ -3670,8 +3681,8 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
   ulong block_length,reclength;
   uchar *from;
   uchar block_buff[8];
-  SORT_INFO *sort_info=sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   MI_INFO *info=sort_info->info;
   MYISAM_SHARE *share=info->s;
   DBUG_ENTER("sort_write_record");
@@ -3679,6 +3690,9 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
   if (sort_param->fix_datafile)
   {
     switch (sort_info->new_data_file_type) {
+    case BLOCK_RECORD:
+      DBUG_ASSERT(0);                           /* Impossible */
+      break;
     case STATIC_RECORD:
       if (my_b_write(&info->rec_cache,sort_param->record,
 		     share->base.pack_reclength))
@@ -3688,7 +3702,6 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
       }
       sort_param->filepos+=share->base.pack_reclength;
       info->s->state.split++;
-      /* sort_info->param->glob_crc+=mi_static_checksum(info, sort_param->record); */
       break;
     case DYNAMIC_RECORD:
       if (! info->blobs)
@@ -3697,7 +3710,7 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
       {
 	/* must be sure that local buffer is big enough */
 	reclength=info->s->base.pack_reclength+
-	  _my_calc_total_blob_length(info,sort_param->record)+
+	  _mi_calc_total_blob_length(info,sort_param->record)+
 	  ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
 	  MI_DYN_DELETE_BLOCK_HEADER;
 	if (sort_info->buff_length < reclength)
@@ -3711,10 +3724,9 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
 	from= sort_info->buff+ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER);
       }
       /* We can use info->checksum here as only one thread calls this. */
-      info->checksum=mi_checksum(info,sort_param->record);
+      info->checksum= (*info->s->calc_check_checksum)(info,sort_param->record);
       reclength=_mi_rec_pack(info,from,sort_param->record);
       flag=0;
-      /* sort_info->param->glob_crc+=info->checksum; */
 
       do
       {
@@ -3754,8 +3766,6 @@ int sort_write_record(MI_SORT_PARAM *sort_param)
       sort_param->filepos+=reclength+length;
       info->s->state.split++;
       break;
-    case BLOCK_RECORD:
-      assert(0);                                  /* Impossible */
     }
   }
   if (sort_param->master)
@@ -3788,24 +3798,26 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
 {
   uint diff_pos[2];
   char llbuff[22],llbuff2[22];
-  SORT_INFO *sort_info=sort_param->sort_info;
-  MI_CHECK *param= sort_info->param;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param= sort_info->param;
   int cmp;
 
   if (sort_info->key_block->inited)
   {
-    cmp=ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey,
-		   (uchar*) a, USE_WHOLE_KEY,SEARCH_FIND | SEARCH_UPDATE,
+    cmp=ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
+		   (uchar*) a, USE_WHOLE_KEY,
+                   SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT,
 		   diff_pos);
     if (param->stats_method == MI_STATS_METHOD_NULLS_NOT_EQUAL)
-      ha_key_cmp(sort_param->seg,sort_info->key_block->lastkey,
+      ha_key_cmp(sort_param->seg, (uchar*) sort_info->key_block->lastkey,
                  (uchar*) a, USE_WHOLE_KEY, 
                  SEARCH_FIND | SEARCH_NULL_ARE_NOT_EQUAL, diff_pos);
     else if (param->stats_method == MI_STATS_METHOD_IGNORE_NULLS)
     {
       diff_pos[0]= mi_collect_stats_nonulls_next(sort_param->seg,
                                                  sort_param->notnull,
-                                                 sort_info->key_block->lastkey,
+                                                 (uchar*) sort_info->
+                                                 key_block->lastkey,
                                                  (uchar*)a);
     }
     sort_param->unique[diff_pos[0]-1]++;
@@ -3828,8 +3840,8 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
 			   llstr(sort_info->info->lastpos,llbuff),
 			   llstr(get_record_for_key(sort_info->info,
 						    sort_param->keyinfo,
-						    sort_info->key_block->
-						    lastkey),
+						    (uchar*) sort_info->
+                                                    key_block->lastkey),
 				 llbuff2));
     param->testflag|=T_RETRY_WITHOUT_QUICK;
     if (sort_info->param->testflag & T_VERBOSE)
@@ -3850,7 +3862,7 @@ static int sort_key_write(MI_SORT_PARAM *sort_param, const void *a)
 
 int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
 {
-  SORT_INFO *sort_info=sort_param->sort_info;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
   SORT_KEY_BLOCKS *key_block=sort_info->key_block;
   MYISAM_SHARE *share=sort_info->info->s;
   uint val_off, val_len;
@@ -3860,19 +3872,19 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
 
   val_len=share->ft2_keyinfo.keylength;
   get_key_full_length_rdonly(val_off, ft_buf->lastkey);
-  to=ft_buf->lastkey+val_off;
+  to= (uchar*) ft_buf->lastkey+val_off;
 
   if (ft_buf->buf)
   {
     /* flushing first-level tree */
-    error=sort_insert_key(sort_param,key_block,ft_buf->lastkey,
+    error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey,
 			  HA_OFFSET_ERROR);
     for (from=to+val_len;
-         !error && from < ft_buf->buf;
+         !error && from < (uchar*) ft_buf->buf;
          from+= val_len)
     {
       memcpy(to, from, val_len);
-      error=sort_insert_key(sort_param,key_block,ft_buf->lastkey,
+      error=sort_insert_key(sort_param,key_block, (uchar*) ft_buf->lastkey,
 			    HA_OFFSET_ERROR);
     }
     return error;
@@ -3881,8 +3893,8 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
   error=flush_pending_blocks(sort_param);
   /* updating lastkey with second-level tree info */
   ft_intXstore(ft_buf->lastkey+val_off, -ft_buf->count);
-  _mi_dpointer(sort_info->info, ft_buf->lastkey+val_off+HA_FT_WLEN,
-      share->state.key_root[sort_param->key]);
+  _mi_dpointer(sort_info->info, (uchar*) ft_buf->lastkey+val_off+HA_FT_WLEN,
+               share->state.key_root[sort_param->key]);
   /* restoring first level tree data in sort_info/sort_param */
   sort_info->key_block=sort_info->key_block_end- sort_info->param->sort_key_blocks;
   sort_param->keyinfo=share->keyinfo+sort_param->key;
@@ -3890,14 +3902,14 @@ int sort_ft_buf_flush(MI_SORT_PARAM *sort_param)
   /* writing lastkey in first-level tree */
   return error ? error :
                  sort_insert_key(sort_param,sort_info->key_block,
-                                 ft_buf->lastkey,HA_OFFSET_ERROR);
+                                 (uchar*) ft_buf->lastkey,HA_OFFSET_ERROR);
 }
 
 static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
 {
   uint a_len, val_off, val_len, error;
   uchar *p;
-  SORT_INFO *sort_info=sort_param->sort_info;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
   SORT_FT_BUF *ft_buf=sort_info->ft_buf;
   SORT_KEY_BLOCKS *key_block=sort_info->key_block;
 
@@ -3929,7 +3941,7 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
 
   if (ha_compare_text(sort_param->seg->charset,
                       ((uchar *)a)+1,a_len-1,
-                      ft_buf->lastkey+1,val_off-1, 0, 0)==0)
+                      (uchar*) ft_buf->lastkey+1,val_off-1, 0, 0)==0)
   {
     if (!ft_buf->buf) /* store in second-level tree */
     {
@@ -3945,16 +3957,16 @@ static int sort_ft_key_write(MI_SORT_PARAM *sort_param, const void *a)
       return 0;
 
     /* converting to two-level tree */
-    p=ft_buf->lastkey+val_off;
+    p= (uchar*) ft_buf->lastkey+val_off;
 
     while (key_block->inited)
       key_block++;
     sort_info->key_block=key_block;
     sort_param->keyinfo=& sort_info->info->s->ft2_keyinfo;
-    ft_buf->count=(uint) (ft_buf->buf - p)/val_len;
+    ft_buf->count=((uchar*) ft_buf->buf - p)/val_len;
 
     /* flushing buffer to second-level tree */
-    for (error=0; !error && p < ft_buf->buf; p+= val_len)
+    for (error=0; !error && p < (uchar*) ft_buf->buf; p+= val_len)
       error=sort_insert_key(sort_param,key_block,p,HA_OFFSET_ERROR);
     ft_buf->buf=0;
     return error;
@@ -4002,13 +4014,13 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
   MI_KEY_PARAM s_temp;
   MI_INFO *info;
   MI_KEYDEF *keyinfo=sort_param->keyinfo;
-  SORT_INFO *sort_info= sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   DBUG_ENTER("sort_insert_key");
 
-  anc_buff=key_block->buff;
+  anc_buff= (uchar*) key_block->buff;
   info=sort_info->info;
-  lastkey=key_block->lastkey;
+  lastkey= (uchar*) key_block->lastkey;
   nod_flag= (key_block == sort_info->key_block ? 0 :
 	     info->s->base.key_reflength);
 
@@ -4021,7 +4033,7 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
       DBUG_RETURN(1);
     }
     a_length=2+nod_flag;
-    key_block->end_pos=anc_buff+2;
+    key_block->end_pos= anc_buff+2;
     lastkey=0;					/* No previous key in block */
   }
   else
@@ -4029,12 +4041,12 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
 
 	/* Save pointer to previous block */
   if (nod_flag)
-    _mi_kpointer(info,key_block->end_pos,prev_block);
+    _mi_kpointer(info,(uchar*) key_block->end_pos,prev_block);
 
   t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,
 				(uchar*) 0,lastkey,lastkey,key,
 				 &s_temp);
-  (*keyinfo->store_key)(keyinfo, key_block->end_pos+nod_flag,&s_temp);
+  (*keyinfo->store_key)(keyinfo, (uchar*) key_block->end_pos+nod_flag,&s_temp);
   a_length+=t_length;
   mi_putint(anc_buff,a_length,nod_flag);
   key_block->end_pos+=t_length;
@@ -4066,7 +4078,8 @@ static int sort_insert_key(MI_SORT_PARAM *sort_param,
   DBUG_DUMP("buff",(uchar*) anc_buff,mi_getint(anc_buff));
 
 	/* Write separator-key to block in next level */
-  if (sort_insert_key(sort_param,key_block+1,key_block->lastkey,filepos))
+  if (sort_insert_key(sort_param,key_block+1,(uchar*) key_block->lastkey,
+                      filepos))
     DBUG_RETURN(1);
 
 	/* clear old block and write new key in it */
@@ -4082,8 +4095,8 @@ static int sort_delete_record(MI_SORT_PARAM *sort_param)
   uint i;
   int old_file,error;
   uchar *key;
-  SORT_INFO *sort_info=sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   MI_INFO *info=sort_info->info;
   DBUG_ENTER("sort_delete_record");
 
@@ -4139,7 +4152,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
   uint nod_flag,length;
   my_off_t filepos,key_file_length;
   SORT_KEY_BLOCKS *key_block;
-  SORT_INFO *sort_info= sort_param->sort_info;
+  MI_SORT_INFO *sort_info= sort_param->sort_info;
   myf myf_rw=sort_info->param->myf_rw;
   MI_INFO *info=sort_info->info;
   MI_KEYDEF *keyinfo=sort_param->keyinfo;
@@ -4152,7 +4165,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
     key_block->inited=0;
     length=mi_getint(key_block->buff);
     if (nod_flag)
-      _mi_kpointer(info,key_block->end_pos,filepos);
+      _mi_kpointer(info,(uchar*) key_block->end_pos,filepos);
     key_file_length=info->state->key_file_length;
     bzero((uchar*) key_block->buff+length, keyinfo->block_length-length);
     if ((filepos=_mi_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR)
@@ -4162,7 +4175,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
     if (key_file_length == info->state->key_file_length)
     {
       if (_mi_write_keypage(info, keyinfo, filepos,
-                            DFLT_INIT_HITS, key_block->buff))
+                            DFLT_INIT_HITS, (uchar*) key_block->buff))
 	DBUG_RETURN(1);
     }
     else if (mysql_file_pwrite(info->s->kfile, (uchar*) key_block->buff,
@@ -4177,7 +4190,7 @@ int flush_pending_blocks(MI_SORT_PARAM *sort_param)
 
 	/* alloc space and pointers for key_blocks */
 
-static SORT_KEY_BLOCKS *alloc_key_blocks(MI_CHECK *param, uint blocks,
+static SORT_KEY_BLOCKS *alloc_key_blocks(HA_CHECK *param, uint blocks,
                                          uint buffer_length)
 {
   reg1 uint i;
@@ -4216,7 +4229,7 @@ int test_if_almost_full(MI_INFO *info)
 
 	/* Recreate table with bigger more alloced record-data */
 
-int recreate_table(MI_CHECK *param, MI_INFO **org_info, char *filename)
+int recreate_table(HA_CHECK *param, MI_INFO **org_info, char *filename)
 {
   int error;
   MI_INFO info;
@@ -4390,7 +4403,7 @@ end:
 
 	/* write suffix to data file if neaded */
 
-int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile)
+int write_data_suffix(MI_SORT_INFO *sort_info, my_bool fix_datafile)
 {
   MI_INFO *info=sort_info->info;
 
@@ -4411,7 +4424,7 @@ int write_data_suffix(SORT_INFO *sort_info, my_bool fix_datafile)
 
 	/* Update state and myisamchk_time of indexfile */
 
-int update_state_info(MI_CHECK *param, MI_INFO *info,uint update)
+int update_state_info(HA_CHECK *param, MI_INFO *info,uint update)
 {
   MYISAM_SHARE *share=info->s;
 
@@ -4438,7 +4451,7 @@ int update_state_info(MI_CHECK *param, MI_INFO *info,uint update)
   {
     if (update & UPDATE_TIME)
     {
-      share->state.check_time= (long) time((time_t*) 0);
+      share->state.check_time= time((time_t*) 0);
       if (!share->state.create_time)
 	share->state.create_time=share->state.check_time;
     }
@@ -4483,7 +4496,7 @@ err:
 	  param->auto_increment is bigger than the biggest key.
 	*/
 
-void update_auto_increment_key(MI_CHECK *param, MI_INFO *info,
+void update_auto_increment_key(HA_CHECK *param, MI_INFO *info,
 			       my_bool repair_only)
 {
   uchar *record= 0;
@@ -4623,8 +4636,9 @@ void update_key_parts(MI_KEYDEF *keyinfo, ulong *rec_per_key_part,
       let's ensure it is not
     */
     set_if_bigger(tmp,1);
-    if (tmp >= (ulonglong) ~(ulong) 0)
-      tmp=(ulonglong) ~(ulong) 0;
+    /* Keys are stored as 32 bit ints; ensure we don't get an overflow */
+    if (tmp >= (ulonglong) ~(uint32) 0)
+      tmp=(ulonglong) ~(uint32) 0;
 
     *rec_per_key_part=(ulong) tmp;
     rec_per_key_part++;
@@ -4714,7 +4728,7 @@ my_bool mi_test_if_sort_rep(MI_INFO *info, ha_rows rows,
 
 
 static void
-set_data_file_type(SORT_INFO *sort_info, MYISAM_SHARE *share)
+set_data_file_type(MI_SORT_INFO *sort_info, MYISAM_SHARE *share)
 {
   if ((sort_info->new_data_file_type=share->data_file_type) ==
       COMPRESSED_RECORD && sort_info->param->testflag & T_UNPACK)
diff --git a/storage/myisam/mi_checksum.c b/storage/myisam/mi_checksum.c
index 1aa56e571e3..8c408ef7ff5 100644
--- a/storage/myisam/mi_checksum.c
+++ b/storage/myisam/mi_checksum.c
@@ -19,27 +19,34 @@
 
 ha_checksum mi_checksum(MI_INFO *info, const uchar *buf)
 {
-  uint i;
   ha_checksum crc=0;
-  MI_COLUMNDEF *rec=info->s->rec;
+  const uchar *record= buf;
+  MI_COLUMNDEF *column= info->s->rec;
+  MI_COLUMNDEF *column_end= column+ info->s->base.fields;
+  my_bool skip_null_bits= test(info->s->options & HA_OPTION_NULL_FIELDS);
 
-  for (i=info->s->base.fields ; i-- ; buf+=(rec++)->length)
+  for ( ; column != column_end ; buf+= column++->length)
   {
     const uchar *pos;
     ulong length;
-    switch (rec->type) {
+
+    if ((record[column->null_pos] & column->null_bit) &&
+        skip_null_bits)
+      continue;                                 /* Null field */
+
+    switch (column->type) {
     case FIELD_BLOB:
     {
-      length=_mi_calc_blob_length(rec->length-
-					portable_sizeof_char_ptr,
-					buf);
-      memcpy((char*) &pos, buf+rec->length- portable_sizeof_char_ptr,
+      length=_mi_calc_blob_length(column->length-
+                                  portable_sizeof_char_ptr,
+                                  buf);
+      memcpy((char*) &pos, buf+column->length- portable_sizeof_char_ptr,
 	     sizeof(char*));
       break;
     }
     case FIELD_VARCHAR:
     {
-      uint pack_length= HA_VARCHAR_PACKLENGTH(rec->length-1);
+      uint pack_length= HA_VARCHAR_PACKLENGTH(column->length-1);
       if (pack_length == 1)
         length= (ulong) *(uchar*) buf;
       else
@@ -48,7 +55,7 @@ ha_checksum mi_checksum(MI_INFO *info, const uchar *buf)
       break;
     }
     default:
-      length=rec->length;
+      length=column->length;
       pos=buf;
       break;
     }
diff --git a/storage/myisam/mi_close.c b/storage/myisam/mi_close.c
index 51408ab191c..fd10ef47968 100644
--- a/storage/myisam/mi_close.c
+++ b/storage/myisam/mi_close.c
@@ -64,8 +64,10 @@ int mi_close(register MI_INFO *info)
                     if (share->kfile >= 0) abort(););
     if (share->kfile >= 0 &&
 	flush_key_blocks(share->key_cache, share->kfile,
-			 share->temporary ? FLUSH_IGNORE_CHANGED :
-			 FLUSH_RELEASE))
+                         &share->dirty_part_map,
+                         ((share->temporary || share->deleting) ?
+                          FLUSH_IGNORE_CHANGED :
+                          FLUSH_RELEASE)))
       error=my_errno;
     if (share->kfile >= 0)
     {
@@ -74,6 +76,7 @@ int mi_close(register MI_INFO *info)
         not change the crashed state.
         We can NOT write the state in other cases as other threads
         may be using the file at this point
+        IF using --external-locking.
       */
       if (share->mode != O_RDONLY && mi_is_crashed(info))
 	mi_state_info_write(share->kfile, &share->state, 1);
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index 46c61eb4709..3a842b966ab 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -37,11 +37,11 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   File UNINIT_VAR(dfile), UNINIT_VAR(file);
   int errpos,save_errno, create_mode= O_RDWR | O_TRUNC;
   myf create_flag;
-  uint fields,length,max_key_length,packed,pointer,real_length_diff,
+  uint fields,length,max_key_length,packed,pack_bytes,pointer,real_length_diff,
        key_length,info_length,key_segs,options,min_key_length_skip,
        base_pos,long_varchar_count,varchar_length,
        max_key_block_length,unique_key_parts,fulltext_keys,offset;
-  uint aligned_key_start, block_length;
+  uint aligned_key_start, block_length, res;
   ulong reclength, real_reclength,min_pack_length;
   char filename[FN_REFLEN],linkname[FN_REFLEN], *linkname_ptr;
   ulong pack_reclength;
@@ -90,7 +90,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
     ci->reloc_rows=ci->max_rows;		/* Check if wrong parameter */
 
   if (!(rec_per_key_part=
-	(ulong*) my_malloc((keys + uniques)*MI_MAX_KEY_SEG*sizeof(long),
+	(ulong*) my_malloc((keys + uniques)*HA_MAX_KEY_SEG*sizeof(long),
 			   MYF(MY_WME | MY_ZEROFILL))))
     DBUG_RETURN(my_errno);
 
@@ -103,6 +103,9 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
        rec++,fields++)
   {
     reclength+=rec->length;
+    if (rec->null_bit)
+      options|= HA_OPTION_NULL_FIELDS;
+
     if ((type=(enum en_fieldtype) rec->type) != FIELD_NORMAL &&
 	type != FIELD_CHECK)
     {
@@ -137,6 +140,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
 	  long_varchar_count++;
 	  pack_reclength+= 2;			/* May be packed on 3 bytes */
 	}
+        options|= HA_OPTION_NULL_FIELDS;        /* Use of mi_checksum() */
       }
       else if (type != FIELD_SKIP_ZERO)
       {
@@ -176,23 +180,30 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   if (flags & HA_CREATE_TMP_TABLE)
   {
     options|= HA_OPTION_TMP_TABLE;
-    create_mode|= O_EXCL | O_NOFOLLOW;
+    create_mode|= O_NOFOLLOW;
   }
   if (flags & HA_CREATE_CHECKSUM || (options & HA_OPTION_CHECKSUM))
   {
     options|= HA_OPTION_CHECKSUM;
     min_pack_length++;
   }
+  /*
+    Don't set HA_OPTION_NULL_FIELDS if no checksums, as this flag makes
+    the file incompatible with MySQL.  This is ok, as this flag is only
+    used if one specifies table level checksums.
+  */
+  if (!(options & HA_OPTION_CHECKSUM))
+    options&= ~HA_OPTION_NULL_FIELDS;
   if (flags & HA_CREATE_DELAY_KEY_WRITE)
     options|= HA_OPTION_DELAY_KEY_WRITE;
   if (flags & HA_CREATE_RELIES_ON_SQL_LAYER)
     options|= HA_OPTION_RELIES_ON_SQL_LAYER;
 
-  packed=(packed+7)/8;
+  pack_bytes= (packed+7)/8;
   if (pack_reclength != INT_MAX32)
     pack_reclength+= reclength+packed +
       test(test_all_bits(options, HA_OPTION_CHECKSUM | HA_OPTION_PACK_RECORD));
-  min_pack_length+=packed;
+  min_pack_length+= pack_bytes;
 
   if (!ci->data_file_length && ci->max_rows)
   {
@@ -269,7 +280,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
             keyseg->type != HA_KEYTYPE_VARBINARY2)
         {
           my_errno=HA_WRONG_CREATE_OPTION;
-          goto err;
+          goto err_no_lock;
         }
       }
       keydef->keysegs+=sp_segs;
@@ -278,7 +289,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
       min_key_length_skip+=SPLEN*2*SPDIMS;
 #else
       my_errno= HA_ERR_UNSUPPORTED;
-      goto err;
+      goto err_no_lock;
 #endif /*HAVE_SPATIAL*/
     }
     else if (keydef->flag & HA_FULLTEXT)
@@ -294,7 +305,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
             keyseg->type != HA_KEYTYPE_VARTEXT2)
         {
           my_errno=HA_WRONG_CREATE_OPTION;
-          goto err;
+          goto err_no_lock;
         }
         if (!(keyseg->flag & HA_BLOB_PART) &&
 	    (keyseg->type == HA_KEYTYPE_VARTEXT1 ||
@@ -416,10 +427,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
       }
     } /* if HA_FULLTEXT */
     key_segs+=keydef->keysegs;
-    if (keydef->keysegs > MI_MAX_KEY_SEG)
+    if (keydef->keysegs > HA_MAX_KEY_SEG)
     {
       my_errno=HA_WRONG_CREATE_OPTION;
-      goto err;
+      goto err_no_lock;
     }
     /*
       key_segs may be 0 in the case when we only want to be able to
@@ -431,7 +442,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
       share.state.rec_per_key_part[key_segs-1]=1L;
     length+=key_length;
     /* Get block length for key, if defined by user */
-    block_length= (keydef->block_length ? 
+    block_length= (keydef->block_length ?
                    my_round_up_to_next_power(keydef->block_length) :
                    myisam_block_size);
     block_length= max(block_length, MI_MIN_KEY_BLOCK_LENGTH);
@@ -441,10 +452,10 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
                                                  pointer,MI_MAX_KEYPTR_SIZE,
                                                  block_length);
     if (keydef->block_length > MI_MAX_KEY_BLOCK_LENGTH ||
-        length >= MI_MAX_KEY_BUFF)
+        length >= HA_MAX_KEY_BUFF)
     {
       my_errno=HA_WRONG_CREATE_OPTION;
-      goto err;
+      goto err_no_lock;
     }
     set_if_bigger(max_key_block_length,keydef->block_length);
     keydef->keylength= (uint16) key_length;
@@ -487,11 +498,12 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   /* There are only 16 bits for the total header length. */
   if (info_length > 65535)
   {
-    my_printf_error(0, "MyISAM table '%s' has too many columns and/or "
+    my_printf_error(HA_WRONG_CREATE_OPTION,
+                    "MyISAM table '%s' has too many columns and/or "
                     "indexes and/or unique constraints.",
                     MYF(0), name + dirname_length(name));
     my_errno= HA_WRONG_CREATE_OPTION;
-    goto err;
+    goto err_no_lock;
   }
 
   bmove(share.state.header.file_version,(uchar*) myisam_file_magic,4);
@@ -546,7 +558,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   share.base.pack_reclength=reclength+ test(options & HA_OPTION_CHECKSUM);
   share.base.max_pack_length=pack_reclength;
   share.base.min_pack_length=min_pack_length;
-  share.base.pack_bits=packed;
+  share.base.pack_bits= pack_bytes;
   share.base.fields=fields;
   share.base.pack_fields=packed;
 
@@ -560,7 +572,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
     max(share.base.pack_reclength,MI_MIN_BLOCK_LENGTH) :
     MI_EXTEND_BLOCK_LENGTH;
   if (! (flags & HA_DONT_TOUCH_DATA))
-    share.state.create_time= (long) time((time_t*) 0);
+    share.state.create_time= time((time_t*) 0);
 
   mysql_mutex_lock(&THR_LOCK_myisam);
 
@@ -809,13 +821,16 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
   }
   errpos=0;
   mysql_mutex_unlock(&THR_LOCK_myisam);
+  res= 0;
   if (mysql_file_close(file, MYF(0)))
-    goto err;
+    res= my_errno;
   my_free(rec_per_key_part);
-  DBUG_RETURN(0);
+  DBUG_RETURN(res);
 
 err:
   mysql_mutex_unlock(&THR_LOCK_myisam);
+err_no_lock:
+
   save_errno=my_errno;
   switch (errpos) {
   case 3:
diff --git a/storage/myisam/mi_dbug.c b/storage/myisam/mi_dbug.c
index e450e81cecb..3bcacef0b69 100644
--- a/storage/myisam/mi_dbug.c
+++ b/storage/myisam/mi_dbug.c
@@ -45,6 +45,7 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
 	fprintf(stream,"NULL");
 	continue;
       }
+      end++;
     }
 
     switch (keyseg->type) {
@@ -91,7 +92,7 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
       key=end;
       break;
     case HA_KEYTYPE_ULONG_INT:
-      l_1=mi_sint4korr(key);
+      l_1=mi_uint4korr(key);
       (void) fprintf(stream,"%lu",(ulong) l_1);
       key=end;
       break;
@@ -117,7 +118,7 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
     case HA_KEYTYPE_LONGLONG:
     {
       char buff[21];
-      longlong2str(mi_sint8korr(key),buff,-10);
+      longlong10_to_str(mi_sint8korr(key),buff,-10);
       (void) fprintf(stream,"%s",buff);
       key=end;
       break;
@@ -125,11 +126,12 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
     case HA_KEYTYPE_ULONGLONG:
     {
       char buff[21];
-      longlong2str(mi_sint8korr(key),buff,10);
+      longlong10_to_str(mi_sint8korr(key),buff,10);
       (void) fprintf(stream,"%s",buff);
       key=end;
       break;
     }
+#endif
     case HA_KEYTYPE_BIT:
     {
       uint i;
@@ -139,8 +141,6 @@ void _mi_print_key(FILE *stream, register HA_KEYSEG *keyseg,
       key= end;
       break;
     }
-
-#endif
     case HA_KEYTYPE_VARTEXT1:                   /* VARCHAR and TEXT */
     case HA_KEYTYPE_VARTEXT2:                   /* VARCHAR and TEXT */
     case HA_KEYTYPE_VARBINARY1:                 /* VARBINARY and BLOB */
diff --git a/storage/myisam/mi_delete.c b/storage/myisam/mi_delete.c
index 9314148cd8c..0817d9926ca 100644
--- a/storage/myisam/mi_delete.c
+++ b/storage/myisam/mi_delete.c
@@ -159,7 +159,7 @@ static int _mi_ck_real_delete(register MI_INFO *info, MI_KEYDEF *keyinfo,
     DBUG_RETURN(my_errno=HA_ERR_CRASHED);
   }
   if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-				      MI_MAX_KEY_BUFF*2)))
+				      HA_MAX_KEY_BUFF*2)))
   {
     DBUG_PRINT("error",("Couldn't allocate memory"));
     DBUG_RETURN(my_errno=ENOMEM);
@@ -171,8 +171,9 @@ static int _mi_ck_real_delete(register MI_INFO *info, MI_KEYDEF *keyinfo,
     goto err;
   }
   if ((error=d_search(info,keyinfo,
-                      (keyinfo->flag & HA_FULLTEXT ? SEARCH_FIND | SEARCH_UPDATE
-                                                   : SEARCH_SAME),
+                      (keyinfo->flag & HA_FULLTEXT ?
+                       SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT :
+                       SEARCH_SAME),
                        key,key_length,old_root,root_buff)) >0)
   {
     if (error == 2)
@@ -221,7 +222,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
   my_bool last_key;
   uchar *leaf_buff,*keypos;
   my_off_t UNINIT_VAR(leaf_page),next_block;
-  uchar lastkey[MI_MAX_KEY_BUFF];
+  uchar lastkey[HA_MAX_KEY_BUFF];
   DBUG_ENTER("d_search");
   DBUG_DUMP("page",(uchar*) anc_buff,mi_getint(anc_buff));
 
@@ -310,7 +311,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
   {
     leaf_page=_mi_kpos(nod_flag,keypos);
     if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-					MI_MAX_KEY_BUFF*2)))
+					HA_MAX_KEY_BUFF*2)))
     {
       DBUG_PRINT("error",("Couldn't allocate memory"));
       my_errno=ENOMEM;
@@ -369,9 +370,7 @@ static int d_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
     {				/* This happens only with packed keys */
       DBUG_PRINT("test",("Enlarging of key when deleting"));
       if (!_mi_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length))
-      {
 	goto err;
-      }
       ret_value=_mi_insert(info,keyinfo,key,anc_buff,keypos,lastkey,
 			   (uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0);
     }
@@ -409,7 +408,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key,
   int ret_value,length;
   uint a_length,nod_flag,tmp;
   my_off_t next_page;
-  uchar keybuff[MI_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
+  uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key;
   MYISAM_SHARE *share=info->s;
   MI_KEY_PARAM s_temp;
   DBUG_ENTER("del");
@@ -426,7 +425,7 @@ static int del(register MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *key,
   {
     next_page= _mi_kpos(nod_flag,endpos);
     if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-					MI_MAX_KEY_BUFF*2)))
+					HA_MAX_KEY_BUFF*2)))
       DBUG_RETURN(-1);
     if (!_mi_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0))
       ret_value= -1;
@@ -513,7 +512,7 @@ static int underflow(register MI_INFO *info, register MI_KEYDEF *keyinfo,
   uint length,anc_length,buff_length,leaf_length,p_length,s_length,nod_flag,
        key_reflength,key_length;
   my_off_t next_page;
-  uchar anc_key[MI_MAX_KEY_BUFF],leaf_key[MI_MAX_KEY_BUFF],
+  uchar anc_key[HA_MAX_KEY_BUFF],leaf_key[HA_MAX_KEY_BUFF],
         *buff,*endpos,*next_keypos,*anc_pos,*half_pos,*temp_pos,*prev_key,
         *after_key;
   MI_KEY_PARAM s_temp;
diff --git a/storage/myisam/mi_delete_all.c b/storage/myisam/mi_delete_all.c
index 7a2e24189e6..5940b927d9a 100644
--- a/storage/myisam/mi_delete_all.c
+++ b/storage/myisam/mi_delete_all.c
@@ -52,7 +52,8 @@ int mi_delete_all_rows(MI_INFO *info)
     If we are using delayed keys or if the user has done changes to the tables
     since it was locked then there may be key blocks in the key cache
   */
-  flush_key_blocks(share->key_cache, share->kfile, FLUSH_IGNORE_CHANGED);
+  flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+                   FLUSH_IGNORE_CHANGED);
 #ifdef HAVE_MMAP
   if (share->file_map)
     mi_munmap_file(info);
diff --git a/storage/myisam/mi_dynrec.c b/storage/myisam/mi_dynrec.c
index f429edd2759..70a59a11346 100644
--- a/storage/myisam/mi_dynrec.c
+++ b/storage/myisam/mi_dynrec.c
@@ -119,7 +119,7 @@ int mi_munmap_file(MI_INFO *info)
 {
   int ret;
   DBUG_ENTER("mi_unmap_file");
-  if ((ret= my_munmap(info->s->file_map, info->s->mmaped_length)))
+  if ((ret= my_munmap(info->s->file_map, (size_t) info->s->mmaped_length)))
     DBUG_RETURN(ret);
   info->s->file_read= mi_nommap_pread;
   info->s->file_write= mi_nommap_pwrite;
@@ -282,7 +282,7 @@ int _mi_write_blob_record(MI_INFO *info, const uchar *record)
   extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
 	  MI_DYN_DELETE_BLOCK_HEADER+1);
   reclength= (info->s->base.pack_reclength +
-	      _my_calc_total_blob_length(info,record)+ extra);
+	      _mi_calc_total_blob_length(info,record)+ extra);
   if (!(rec_buff=(uchar*) my_alloca(reclength)))
   {
     my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
@@ -309,7 +309,7 @@ int _mi_update_blob_record(MI_INFO *info, my_off_t pos, const uchar *record)
   extra= (ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER)+MI_SPLIT_LENGTH+
 	  MI_DYN_DELETE_BLOCK_HEADER);
   reclength= (info->s->base.pack_reclength+
-	      _my_calc_total_blob_length(info,record)+ extra);
+	      _mi_calc_total_blob_length(info,record)+ extra);
   if (!(rec_buff=(uchar*) my_alloca(reclength)))
   {
     my_errno= HA_ERR_OUT_OF_MEM; /* purecov: inspected */
@@ -1353,7 +1353,7 @@ err:
 
 	/* Calc length of blob. Update info in blobs->length */
 
-ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record)
+ulong _mi_calc_total_blob_length(MI_INFO *info, const uchar *record)
 {
   ulong length;
   MI_BLOB *blob,*end;
@@ -1387,7 +1387,7 @@ ulong _mi_calc_blob_length(uint length, const uchar *pos)
 }
 
 
-void _my_store_blob_length(uchar *pos,uint pack_length,uint length)
+void _mi_store_blob_length(uchar *pos,uint pack_length,uint length)
 {
   switch (pack_length) {
   case 1:
@@ -1598,7 +1598,7 @@ int _mi_cmp_dynamic_record(register MI_INFO *info, register const uchar *record)
     if (info->s->base.blobs)
     {
       if (!(buffer=(uchar*) my_alloca(info->s->base.pack_reclength+
-				     _my_calc_total_blob_length(info,record))))
+				     _mi_calc_total_blob_length(info,record))))
 	DBUG_RETURN(-1);
     }
     reclength=_mi_rec_pack(info,buffer,record);
@@ -1856,7 +1856,7 @@ int _mi_read_rnd_dynamic_record(MI_INFO *info, uchar *buf,
         if (mysql_file_read(info->dfile, (uchar*) to, block_info.data_len,
                             MYF(MY_NABP)))
 	{
-	  if (my_errno == -1)
+	  if (my_errno == HA_ERR_FILE_TOO_SHORT)
 	    my_errno= HA_ERR_WRONG_IN_RECORD;	/* Unexpected end of file */
 	  goto err;
 	}
diff --git a/storage/myisam/mi_extra.c b/storage/myisam/mi_extra.c
index baf8cb5e240..16ec536dbc6 100644
--- a/storage/myisam/mi_extra.c
+++ b/storage/myisam/mi_extra.c
@@ -216,7 +216,7 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
     info->lock_wait=0;
     break;
   case HA_EXTRA_NO_WAIT_LOCK:
-    info->lock_wait=MY_DONT_WAIT;
+    info->lock_wait= MY_SHORT_WAIT;
     break;
   case HA_EXTRA_NO_KEYS:
     if (info->lock_type == F_UNLCK)
@@ -257,20 +257,28 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
     mysql_mutex_unlock(&THR_LOCK_myisam);
     break;
   case HA_EXTRA_PREPARE_FOR_DROP:
+    /* Signals about intent to delete this table */
+    share->deleting= TRUE;
+    share->global_changed= FALSE;     /* force writing changed flag */
+    _mi_mark_file_changed(info);
+    /* Fall through */
+  case HA_EXTRA_PREPARE_FOR_RENAME:
     mysql_mutex_lock(&THR_LOCK_myisam);
     share->last_version= 0L;			/* Impossible version */
-#ifdef __WIN__REMOVE_OBSOLETE_WORKAROUND
-    /* Close the isam and data files as Win32 can't drop an open table */
     mysql_mutex_lock(&share->intern_lock);
+    /* Flush pages that we don't need anymore */
     if (flush_key_blocks(share->key_cache, share->kfile,
-			 (function == HA_EXTRA_FORCE_REOPEN ?
-			  FLUSH_RELEASE : FLUSH_IGNORE_CHANGED)))
+                         &share->dirty_part_map,
+			 (function == HA_EXTRA_PREPARE_FOR_DROP ?
+                          FLUSH_IGNORE_CHANGED : FLUSH_RELEASE)))
     {
       error=my_errno;
       share->changed=1;
       mi_print_error(info->s, HA_ERR_CRASHED);
       mi_mark_crashed(info);			/* Fatal error found */
     }
+#ifdef __WIN__REMOVE_OBSOLETE_WORKAROUND
+    /* Close the isam and data files as Win32 can't drop an open table */
     if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
     {
       info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
@@ -284,9 +292,19 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
       info->lock_type = F_UNLCK;
     }
     if (share->kfile >= 0)
+    {
+      /*
+        We don't need to call _mi_decrement_open_count() if we are
+        dropping the table, as the files will be removed anyway. If we
+        are aborted before the files are removed, it's better to not
+        call it as in that case the automatic repair on open will add
+        the missing index entries
+      */
+      if (function != HA_EXTRA_PREPARE_FOR_DROP)
       _mi_decrement_open_count(info);
-    if (share->kfile >= 0 && mysql_file_close(share->kfile, MYF(0)))
-      error=my_errno;
+      if (mysql_file_close(share->kfile,MYF(0)))
+        error=my_errno;
+    }
     {
       LIST *list_element ;
       for (list_element=myisam_open_list ;
@@ -303,13 +321,14 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
       }
     }
     share->kfile= -1;				/* Files aren't open anymore */
-    mysql_mutex_unlock(&share->intern_lock);
 #endif
+    mysql_mutex_unlock(&share->intern_lock);
     mysql_mutex_unlock(&THR_LOCK_myisam);
     break;
   case HA_EXTRA_FLUSH:
     if (!share->temporary)
-      flush_key_blocks(share->key_cache, share->kfile, FLUSH_KEEP);
+      flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+                       FLUSH_KEEP);
 #ifdef HAVE_PWRITE
     _mi_decrement_open_count(info);
 #endif
@@ -373,6 +392,11 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
     share->is_log_table= TRUE;
     mysql_mutex_unlock(&share->intern_lock);
     break;
+  case HA_EXTRA_DETACH_CHILD: /* When used with MERGE tables */
+    info->open_flag&=     ~HA_OPEN_MERGE_TABLE;
+    info->lock.priority&= ~THR_LOCK_MERGE_PRIV;
+    break;
+    
   case HA_EXTRA_KEY_CACHE:
   case HA_EXTRA_NO_KEY_CACHE:
   default:
@@ -386,6 +410,12 @@ int mi_extra(MI_INFO *info, enum ha_extra_function function, void *extra_arg)
   DBUG_RETURN(error);
 } /* mi_extra */
 
+void mi_set_index_cond_func(MI_INFO *info, index_cond_func_t func,
+                            void *func_arg)
+{
+  info->index_cond_func= func;
+  info->index_cond_func_arg= func_arg;
+}
 
 /*
     Start/Stop Inserting Duplicates Into a Table, WL#1648.
diff --git a/storage/myisam/mi_key.c b/storage/myisam/mi_key.c
index 75038fce070..f64a602e2be 100644
--- a/storage/myisam/mi_key.c
+++ b/storage/myisam/mi_key.c
@@ -31,7 +31,8 @@
               set_if_smaller(char_length,length);                           \
             } while(0)
 
-static int _mi_put_key_in_record(MI_INFO *info,uint keynr,uchar *record);
+static int _mi_put_key_in_record(MI_INFO *info,uint keynr,
+                                 my_bool unpack_blobs, uchar *record);
 
 /*
   Make a intern key from a record
@@ -311,6 +312,9 @@ uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key, uchar *old,
     _mi_put_key_in_record()
     info		MyISAM handler
     keynr		Key number that was used
+    unpack_blobs        TRUE  <=> Unpack blob columns
+                        FALSE <=> Skip them. This is used by index condition 
+                                  pushdown check function
     record 		Store key here
 
     Last read key is in info->lastkey
@@ -324,7 +328,7 @@ uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key, uchar *old,
 */
 
 static int _mi_put_key_in_record(register MI_INFO *info, uint keynr,
-				 uchar *record)
+                                 my_bool unpack_blobs, uchar *record)
 {
   reg2 uchar *key;
   uchar *pos,*key_end;
@@ -417,16 +421,19 @@ static int _mi_put_key_in_record(register MI_INFO *info, uint keynr,
       if (length > keyseg->length || key+length > key_end)
 	goto err;
 #endif
-      memcpy(record+keyseg->start+keyseg->bit_start,
-	     (char*) &blob_ptr,sizeof(char*));
-      memcpy(blob_ptr,key,length);
-      blob_ptr+=length;
+      if (unpack_blobs)
+      {
+        memcpy(record+keyseg->start+keyseg->bit_start,
+               (char*) &blob_ptr,sizeof(char*));
+        memcpy(blob_ptr,key,length);
+        blob_ptr+=length;
 
-      /* The above changed info->lastkey2. Inform mi_rnext_same(). */
-      info->update&= ~HA_STATE_RNEXT_SAME;
+        /* The above changed info->lastkey2. Inform mi_rnext_same(). */
+        info->update&= ~HA_STATE_RNEXT_SAME;
 
-      _my_store_blob_length(record+keyseg->start,
-			    (uint) keyseg->bit_start,length);
+        _mi_store_blob_length(record+keyseg->start,
+                              (uint) keyseg->bit_start,length);
+      }
       key+=length;
     }
     else if (keyseg->flag & HA_SWAP_KEY)
@@ -470,7 +477,7 @@ int _mi_read_key_record(MI_INFO *info, my_off_t filepos, uchar *buf)
   {
     if (info->lastinx >= 0)
     {				/* Read only key */
-      if (_mi_put_key_in_record(info,(uint) info->lastinx,buf))
+      if (_mi_put_key_in_record(info,(uint) info->lastinx, TRUE, buf))
       {
         mi_print_error(info->s, HA_ERR_CRASHED);
 	my_errno=HA_ERR_CRASHED;
@@ -486,6 +493,34 @@ int _mi_read_key_record(MI_INFO *info, my_off_t filepos, uchar *buf)
 
 
 /*
+  Save current key tuple to record and call index condition check function
+
+  SYNOPSIS
+    mi_check_index_cond()
+      info    MyISAM handler
+      keynr   Index we're running a scan on
+      record  Record buffer to use (it is assumed that index check function 
+              will look for column values there)
+
+  RETURN
+    ICP_ERROR         Error 
+    ICP_NO_MATCH      Index condition is not satisfied, continue scanning
+    ICP_MATCH         Index condition is satisfied
+    ICP_OUT_OF_RANGE  Index condition is not satisfied, end the scan. 
+*/
+
+int mi_check_index_cond(register MI_INFO *info, uint keynr, uchar *record)
+{
+  if (_mi_put_key_in_record(info, keynr, FALSE, record))
+  {
+    mi_print_error(info->s, HA_ERR_CRASHED);
+    my_errno=HA_ERR_CRASHED;
+    return ICP_ERROR;
+  }
+  return info->index_cond_func(info->index_cond_func_arg);
+}
+
+/*
   Retrieve auto_increment info
 
   SYNOPSIS
diff --git a/storage/myisam/mi_keycache.c b/storage/myisam/mi_keycache.c
index cbd9c7d76ab..be7d3cc6f1e 100644
--- a/storage/myisam/mi_keycache.c
+++ b/storage/myisam/mi_keycache.c
@@ -75,7 +75,8 @@ int mi_assign_to_key_cache(MI_INFO *info,
     in the old key cache.
   */
 
-  if (flush_key_blocks(share->key_cache, share->kfile, FLUSH_RELEASE))
+  if (flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+                       FLUSH_RELEASE))
   {
     error= my_errno;
     mi_print_error(info->s, HA_ERR_CRASHED);
@@ -90,7 +91,8 @@ int mi_assign_to_key_cache(MI_INFO *info,
     (This can never fail as there is never any not written data in the
     new key cache)
   */
-  (void) flush_key_blocks(key_cache, share->kfile, FLUSH_RELEASE);
+  (void) flush_key_blocks(key_cache, share->kfile, &share->dirty_part_map,
+                          FLUSH_RELEASE);
 
   /*
     ensure that setting the key cache and changing the multi_key_cache
@@ -102,6 +104,7 @@ int mi_assign_to_key_cache(MI_INFO *info,
     This should be seen at the lastes for the next call to an myisam function.
   */
   share->key_cache= key_cache;
+  share->dirty_part_map= 0;
 
   /* store the key cache in the global hash structure for future opens */
   if (multi_key_cache_set((uchar*) share->unique_file_name,
diff --git a/storage/myisam/mi_locking.c b/storage/myisam/mi_locking.c
index 6134b4f46df..be308797286 100644
--- a/storage/myisam/mi_locking.c
+++ b/storage/myisam/mi_locking.c
@@ -22,6 +22,8 @@
 
 #include "ftdefs.h"
 
+static void mi_update_status_with_lock(MI_INFO *info);
+
 	/* lock table by F_UNLCK, F_RDLCK or F_WRLCK */
 
 int mi_lock_database(MI_INFO *info, int lock_type)
@@ -56,13 +58,21 @@ int mi_lock_database(MI_INFO *info, int lock_type)
     case F_UNLCK:
       ftparser_call_deinitializer(info);
       if (info->lock_type == F_RDLCK)
+      {
 	count= --share->r_locks;
+        mi_restore_status(info);
+      }
       else
+      {
 	count= --share->w_locks;
+        mi_update_status_with_lock(info);
+      }
       --share->tot_locks;
       if (info->lock_type == F_WRLCK && !share->w_locks &&
 	  !share->delay_key_write && flush_key_blocks(share->key_cache,
-						      share->kfile,FLUSH_KEEP))
+						      share->kfile,
+                                                      &share->dirty_part_map,
+                                                      FLUSH_KEEP))
       {
 	error=my_errno;
         mi_print_error(info->s, HA_ERR_CRASHED);
@@ -84,16 +94,16 @@ int mi_lock_database(MI_INFO *info, int lock_type)
 	if (share->changed && !share->w_locks)
 	{
 #ifdef HAVE_MMAP
-    if ((info->s->mmaped_length != info->s->state.state.data_file_length) &&
-        (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
-    {
-      if (info->s->concurrent_insert)
-        mysql_rwlock_wrlock(&info->s->mmap_lock);
-      mi_remap_file(info, info->s->state.state.data_file_length);
-      info->s->nonmmaped_inserts= 0;
-      if (info->s->concurrent_insert)
-        mysql_rwlock_unlock(&info->s->mmap_lock);
-    }
+          if ((info->s->mmaped_length != info->s->state.state.data_file_length) &&
+              (info->s->nonmmaped_inserts > MAX_NONMAPPED_INSERTS))
+          {
+            if (info->s->concurrent_insert)
+              mysql_rwlock_wrlock(&info->s->mmap_lock);
+            mi_remap_file(info, info->s->state.state.data_file_length);
+            info->s->nonmmaped_inserts= 0;
+            if (info->s->concurrent_insert)
+              mysql_rwlock_unlock(&info->s->mmap_lock);
+          }
 #endif
 	  share->state.process= share->last_process=share->this_process;
 	  share->state.unique=   info->last_unique=  info->this_unique;
@@ -242,7 +252,7 @@ int mi_lock_database(MI_INFO *info, int lock_type)
        a crash on windows if the table is renamed and 
        later on referenced by the merge table.
      */
-    if( info->owned_by_merge && (info->s)->kfile < 0 )
+    if ((info->open_flag & HA_OPEN_MERGE_TABLE) && (info->s)->kfile < 0)
     {
       error = HA_ERR_NO_SUCH_TABLE;
     }
@@ -267,13 +277,15 @@ int mi_lock_database(MI_INFO *info, int lock_type)
 			(THR_WRITE_CONCURRENT_INSERT was used)
 */
 
-void mi_get_status(void* param, int concurrent_insert)
+void mi_get_status(void* param, my_bool concurrent_insert)
 {
   MI_INFO *info=(MI_INFO*) param;
   DBUG_ENTER("mi_get_status");
-  DBUG_PRINT("info",("key_file: %ld  data_file: %ld  concurrent_insert: %d",
-		     (long) info->s->state.state.key_file_length,
-		     (long) info->s->state.state.data_file_length,
+  DBUG_PRINT("info",("name: %s  key_file: %lu  data_file: %lu  rows: %lu  concurrent_insert: %d",
+                     info->s->index_file_name,
+		     (ulong) info->s->state.state.key_file_length,
+		     (ulong) info->s->state.state.data_file_length,
+		     (ulong) info->s->state.state.records,
                      concurrent_insert));
 #ifndef DBUG_OFF
   if (info->state->key_file_length > info->s->state.state.key_file_length ||
@@ -294,6 +306,7 @@ void mi_get_status(void* param, int concurrent_insert)
 void mi_update_status(void* param)
 {
   MI_INFO *info=(MI_INFO*) param;
+  DBUG_ENTER("mi_update_status");
   /*
     Because someone may have closed the table we point at, we only
     update the state if its our own state.  This isn't a problem as
@@ -303,9 +316,11 @@ void mi_update_status(void* param)
   if (info->state == &info->save_state)
   {
 #ifndef DBUG_OFF
-    DBUG_PRINT("info",("updating status:  key_file: %ld  data_file: %ld",
-		       (long) info->state->key_file_length,
-		       (long) info->state->data_file_length));
+    DBUG_PRINT("info",
+               ("updating status:  key_file: %lu  data_file: %lu  rows: %lu",
+                (ulong) info->state->key_file_length,
+                (ulong) info->state->data_file_length,
+                (ulong) info->state->records));
     if (info->state->key_file_length < info->s->state.state.key_file_length ||
 	info->state->data_file_length < info->s->state.state.data_file_length)
       DBUG_PRINT("warning",("old info:  key_file: %ld  data_file: %ld",
@@ -313,6 +328,12 @@ void mi_update_status(void* param)
 			    (long) info->s->state.state.data_file_length));
 #endif
     info->s->state.state= *info->state;
+#ifdef HAVE_QUERY_CACHE
+    DBUG_PRINT("info", ("invalidator... '%s' (status update)",
+                        info->filename));
+    DBUG_ASSERT(info->s->chst_invalidator != NULL);
+    (*info->s->chst_invalidator)((const char *)info->filename);
+#endif
   }
   info->state= &info->s->state.state;
   info->append_insert_at_end= 0;
@@ -330,20 +351,50 @@ void mi_update_status(void* param)
     }
     info->opt_flag&= ~WRITE_CACHE_USED;
   }
+  DBUG_VOID_RETURN;
+}
+
+/*
+  Same as mi_update_status(), but holds the table lock's mutex to protect
+  against someone calling mi_get_status() from thr_lock() at the same time.
+*/
+
+static void mi_update_status_with_lock(MI_INFO *info)
+{
+  my_bool locked= 0;
+  if (info->state == &info->save_state)
+  {
+    locked= 1;
+    pthread_mutex_lock(&info->s->lock.mutex);
+  }
+  mi_update_status(info);
+  if (locked)
+    pthread_mutex_unlock(&info->s->lock.mutex);
 }
 
 
 void mi_restore_status(void *param)
 {
   MI_INFO *info= (MI_INFO*) param;
+  DBUG_ENTER("mi_restore_status");
+  DBUG_PRINT("info",("key_file: %ld  data_file: %ld",
+		     (long) info->s->state.state.key_file_length,
+		     (long) info->s->state.state.data_file_length));
   info->state= &info->s->state.state;
   info->append_insert_at_end= 0;
+  DBUG_VOID_RETURN;
 }
 
 
 void mi_copy_status(void* to,void *from)
 {
-  ((MI_INFO*) to)->state= &((MI_INFO*) from)->save_state;
+  MI_INFO *info= (MI_INFO*) to;
+  DBUG_ENTER("mi_copy_status");
+  info->state= &((MI_INFO*) from)->save_state;
+  DBUG_PRINT("info",("key_file: %ld  data_file: %ld",
+		     (long) info->state->key_file_length,
+		     (long) info->state->data_file_length));
+  DBUG_VOID_RETURN;
 }
 
 
@@ -371,17 +422,44 @@ void mi_copy_status(void* to,void *from)
 my_bool mi_check_status(void *param)
 {
   MI_INFO *info=(MI_INFO*) param;
+  DBUG_ENTER("mi_check_status");
+  DBUG_PRINT("info",("dellink: %ld  r_locks: %u  w_locks: %u",
+                     (long) info->s->state.dellink, (uint) info->s->r_locks,
+                     (uint) info->s->w_locks));
   /*
     The test for w_locks == 1 is here because this thread has already done an
     external lock (in other words: w_locks == 1 means no other threads has
     a write lock)
   */
-  DBUG_PRINT("info",("dellink: %ld  r_locks: %u  w_locks: %u",
-                     (long) info->s->state.dellink, (uint) info->s->r_locks,
-                     (uint) info->s->w_locks));
-  return (my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
+  DBUG_RETURN((my_bool) !(info->s->state.dellink == HA_OFFSET_ERROR ||
                      (myisam_concurrent_insert == 2 && info->s->r_locks &&
-                      info->s->w_locks == 1));
+                      info->s->w_locks == 1)));
+}
+
+
+/**
+  Fix status for thr_lock_merge()
+
+  @param  org_table
+  @param  new_table that should point on org_lock.  new_table is 0
+          in case this is the first occurrence of the table in the lock
+          structure.
+*/
+
+void mi_fix_status(MI_INFO *org_table, MI_INFO *new_table)
+{
+  DBUG_ENTER("mi_fix_status");
+  if (!new_table)
+  {
+    /* First in group. Set state as in mi_get_status() */
+    org_table->state= &org_table->save_state;
+  }
+  else
+  {
+    /* Set new_table to use state from org_table (first lock of this table) */
+    new_table->state= org_table->state;
+  }
+  DBUG_VOID_RETURN;
 }
 
 
@@ -403,10 +481,10 @@ int _mi_readinfo(register MI_INFO *info, int lock_type, int check_keybuffer)
 	DBUG_RETURN(1);
       if (mi_state_info_read_dsk(share->kfile, &share->state, 1))
       {
-	int error=my_errno ? my_errno : -1;
+	int error= my_errno ? my_errno : HA_ERR_FILE_TOO_SHORT;
 	(void) my_lock(share->kfile,F_UNLCK,0L,F_TO_EOF,
 		     MYF(MY_SEEK_NOT_DONE));
-	my_errno=error;
+	my_errno= error;
 	DBUG_RETURN(1);
       }
     }
@@ -479,7 +557,8 @@ int _mi_test_if_changed(register MI_INFO *info)
   {						/* Keyfile has changed */
     DBUG_PRINT("info",("index file changed"));
     if (share->state.process != share->this_process)
-      (void) flush_key_blocks(share->key_cache, share->kfile, FLUSH_RELEASE);
+      (void) flush_key_blocks(share->key_cache, share->kfile,
+                            &share->dirty_part_map, FLUSH_RELEASE);
     share->last_process=share->state.process;
     info->last_unique=	share->state.unique;
     info->last_loop=	share->state.update_count;
@@ -554,7 +633,7 @@ int _mi_decrement_open_count(MI_INFO *info)
   {
     uint old_lock=info->lock_type;
     share->global_changed=0;
-    lock_error=mi_lock_database(info,F_WRLCK);
+    lock_error= my_disable_locking ? 0 : mi_lock_database(info,F_WRLCK);
     /* Its not fatal even if we couldn't get the lock ! */
     if (share->state.open_count > 0)
     {
@@ -564,7 +643,7 @@ int _mi_decrement_open_count(MI_INFO *info)
                                      sizeof(share->state.header),
                                      MYF(MY_NABP));
     }
-    if (!lock_error)
+    if (!lock_error && !my_disable_locking)
       lock_error=mi_lock_database(info,old_lock);
   }
   return test(lock_error || write_error);
diff --git a/storage/myisam/mi_log.c b/storage/myisam/mi_log.c
index 5af4a057a95..e0c66bef996 100644
--- a/storage/myisam/mi_log.c
+++ b/storage/myisam/mi_log.c
@@ -130,7 +130,7 @@ void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info,
   if (!info->s->base.blobs)
     length=info->s->base.reclength;
   else
-    length=info->s->base.reclength+ _my_calc_total_blob_length(info,record);
+    length=info->s->base.reclength+ _mi_calc_total_blob_length(info,record);
   buff[0]=(uchar) command;
   mi_int2store(buff+1,info->dfile);
   mi_int4store(buff+3,pid);
diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c
index e3c29909067..2403bf70434 100644
--- a/storage/myisam/mi_open.c
+++ b/storage/myisam/mi_open.c
@@ -19,6 +19,7 @@
 #include "sp_defs.h"
 #include "rt_index.h"
 #include <m_ctype.h>
+#include <mysql_version.h>
 
 #ifdef __WIN__
 #include <fcntl.h>
@@ -50,6 +51,8 @@ MI_INFO *test_if_reopen(char *filename)
   {
     MI_INFO *info=(MI_INFO*) pos->data;
     MYISAM_SHARE *share=info->s;
+    DBUG_ASSERT(strcmp(share->unique_file_name,filename) ||
+                share->last_version);
     if (!strcmp(share->unique_file_name,filename) && share->last_version)
       return info;
   }
@@ -75,8 +78,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   uchar *disk_cache, *disk_pos, *end_pos;
   MI_INFO info,*m_info,*old_info;
   MYISAM_SHARE share_buff,*share;
-  ulong rec_per_key_part[HA_MAX_POSSIBLE_KEY*MI_MAX_KEY_SEG];
-  my_off_t key_root[HA_MAX_POSSIBLE_KEY],key_del[MI_MAX_KEY_BLOCK_SIZE];
+  ulong *rec_per_key_part= 0;
+  my_off_t *key_root, *key_del;
   ulonglong max_key_file_length, max_data_file_length;
   DBUG_ENTER("mi_open");
 
@@ -88,7 +91,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   bzero((uchar*) &info,sizeof(info));
 
   realpath_err= my_realpath(name_buff,
-                  fn_format(org_name,name,"",MI_NAME_IEXT,4),MYF(0));
+                            fn_format(org_name,name,"",MI_NAME_IEXT,4),MYF(0));
   if (my_is_symlink(org_name) &&
       (realpath_err || (*myisam_test_invalid_symlink)(name_buff)))
   {
@@ -101,14 +104,12 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   {
     share= &share_buff;
     bzero((uchar*) &share_buff,sizeof(share_buff));
-    share_buff.state.rec_per_key_part=rec_per_key_part;
-    share_buff.state.key_root=key_root;
-    share_buff.state.key_del=key_del;
     share_buff.key_cache= multi_key_cache_search((uchar*) name_buff,
-                                                 strlen(name_buff));
+                                                 strlen(name_buff),
+                                                 dflt_key_cache);
 
     DBUG_EXECUTE_IF("myisam_pretend_crashed_table_on_open",
-                    if (strstr(name, "/t1"))
+                    if (strstr(name, "/crashed"))
                     {
                       my_errno= HA_ERR_CRASHED;
                       goto err;
@@ -132,12 +133,11 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
       my_errno= HA_ERR_NOT_A_TABLE;
       goto err;
     }
-    if (memcmp((uchar*) share->state.header.file_version,
-	       (uchar*) myisam_file_magic, 4))
+    if (bcmp(share->state.header.file_version, myisam_file_magic, 4))
     {
       DBUG_PRINT("error",("Wrong header in %s",name_buff));
       DBUG_DUMP("error_dump", share->state.header.file_version,
-		head_length);
+		(size_t) head_length);
       my_errno=HA_ERR_NOT_A_TABLE;
       goto err;
     }
@@ -147,7 +147,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
 	  HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA |
 	  HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM |
           HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE |
-          HA_OPTION_RELIES_ON_SQL_LAYER))
+          HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS))
     {
       DBUG_PRINT("error",("wrong options: 0x%lx", share->options));
       my_errno=HA_ERR_OLD_FILE;
@@ -183,7 +183,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
     {
       if ((lock_error=my_lock(kfile,F_RDLCK,0L,F_TO_EOF,
 			      MYF(open_flags & HA_OPEN_WAIT_IF_LOCKED ?
-				  0 : MY_DONT_WAIT))) &&
+				  0 : MY_SHORT_WAIT))) &&
 	  !(open_flags & HA_OPEN_IGNORE_IF_LOCKED))
 	goto err;
     }
@@ -207,14 +207,19 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
     }
     share->state_diff_length=len-MI_STATE_INFO_SIZE;
 
-    mi_state_info_read(disk_cache, &share->state);
+    if (!mi_state_info_read(disk_cache, &share->state))
+      goto err;
+    rec_per_key_part= share->state.rec_per_key_part;
+    key_root= share->state.key_root;
+    key_del=  share->state.key_del;
+
     len= mi_uint2korr(share->state.header.base_info_length);
     if (len != MI_BASE_INFO_SIZE)
     {
       DBUG_PRINT("warning",("saved_base_info_length: %d  base_info_length: %d",
 			    len,MI_BASE_INFO_SIZE));
     }
-    disk_pos= my_n_base_info_read(disk_cache + base_pos, &share->base);
+    disk_pos= mi_n_base_info_read(disk_cache + base_pos, &share->base);
     share->state.state_length=base_pos;
 
     if (!(open_flags & HA_OPEN_FOR_REPAIR) &&
@@ -239,8 +244,8 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
     }
 
     key_parts+=fulltext_keys*FT_SEGS;
-    if (share->base.max_key_length > MI_MAX_KEY_BUFF || keys > MI_MAX_KEY ||
-	key_parts > MI_MAX_KEY * MI_MAX_KEY_SEG)
+    if (share->base.max_key_length > HA_MAX_KEY_BUFF || keys > MI_MAX_KEY ||
+	key_parts > MI_MAX_KEY * HA_MAX_KEY_SEG)
     {
       DBUG_PRINT("error",("Wrong key info:  Max_key_length: %d  keys: %d  key_parts: %d", share->base.max_key_length, keys, key_parts));
       my_errno=HA_ERR_UNSUPPORTED;
@@ -440,13 +445,20 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
       share->rec[i].pack_type=0;
       share->rec[i].huff_tree=0;
       share->rec[i].offset=offset;
-      if (share->rec[i].type == (int) FIELD_BLOB)
+      if (share->rec[i].type == FIELD_BLOB)
       {
 	share->blobs[j].pack_length=
-	  share->rec[i].length-portable_sizeof_char_ptr;
+	  share->rec[i].length - portable_sizeof_char_ptr;
 	share->blobs[j].offset=offset;
 	j++;
       }
+#if MYSQL_VERSION_ID <= 60100
+      /* This is to detect old checksum option */
+      if (share->rec[i].null_bit)
+        share->has_null_fields= 1;
+      if (share->rec[i].type == FIELD_VARCHAR)
+        share->has_varchar_fields= 1;
+#endif
       offset+=share->rec[i].length;
     }
     share->rec[i].type=(int) FIELD_LAST;	/* End marker */
@@ -526,6 +538,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
 	share->lock.update_status=mi_update_status;
         share->lock.restore_status= mi_restore_status;
 	share->lock.check_status=mi_check_status;
+        share->lock.fix_status= (void (*)(void *, void *)) mi_fix_status;
       }
     }
 #endif
@@ -576,6 +589,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   info.s=share;
   info.lastpos= HA_OFFSET_ERROR;
   info.update= (short) (HA_STATE_NEXT_FOUND+HA_STATE_PREV_FOUND);
+  info.open_flag= open_flags;
   info.opt_flag=READ_CHECK_USED;
   info.this_unique= (ulong) info.dfile; /* Uniq number in process */
   if (share->data_file_type == COMPRESSED_RECORD)
@@ -634,6 +648,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags)
   mysql_mutex_unlock(&THR_LOCK_myisam);
 
   bzero(info.buff, share->base.max_key_block_length * 2);
+  my_free(rec_per_key_part, MYF(MY_ALLOW_ZERO_PTR));
 
   if (myisam_log_file >= 0)
   {
@@ -663,6 +678,7 @@ err:
   case 3:
     if (! lock_error)
       (void) my_lock(kfile, F_UNLCK, 0L, F_TO_EOF, MYF(MY_SEEK_NOT_DONE));
+    my_free(rec_per_key_part, MYF(MY_ALLOW_ZERO_PTR));
     /* fall through */
   case 2:
     my_afree(disk_cache);
@@ -735,12 +751,14 @@ void mi_setup_functions(register MYISAM_SHARE *share)
   {
     share->read_record=_mi_read_pack_record;
     share->read_rnd=_mi_read_rnd_pack_record;
-    if (!(share->options & HA_OPTION_TEMP_COMPRESS_RECORD))
-      share->calc_checksum=0;				/* No checksum */
-    else if (share->options & HA_OPTION_PACK_RECORD)
+    if ((share->options &
+              (HA_OPTION_PACK_RECORD | HA_OPTION_NULL_FIELDS)))
       share->calc_checksum= mi_checksum;
     else
       share->calc_checksum= mi_static_checksum;
+    share->calc_check_checksum= share->calc_checksum;
+    if (!(share->options & HA_OPTION_TEMP_COMPRESS_RECORD))
+      share->calc_checksum=0;				/* No checksum */
   }
   else if (share->options & HA_OPTION_PACK_RECORD)
   {
@@ -750,6 +768,7 @@ void mi_setup_functions(register MYISAM_SHARE *share)
     share->compare_record=_mi_cmp_dynamic_record;
     share->compare_unique=_mi_cmp_dynamic_unique;
     share->calc_checksum= mi_checksum;
+    share->calc_check_checksum= share->calc_checksum;
 
     /* add bits used to pack data to pack_reclength for faster allocation */
     share->base.pack_reclength+= share->base.pack_bits;
@@ -773,7 +792,11 @@ void mi_setup_functions(register MYISAM_SHARE *share)
     share->update_record=_mi_update_static_record;
     share->write_record=_mi_write_static_record;
     share->compare_unique=_mi_cmp_static_unique;
-    share->calc_checksum= mi_static_checksum;
+    if (share->options & HA_OPTION_NULL_FIELDS)
+      share->calc_checksum= mi_checksum;
+    else
+      share->calc_checksum= mi_static_checksum;
+    share->calc_check_checksum= share->calc_checksum;
   }
   share->file_read= mi_nommap_pread;
   share->file_write= mi_nommap_pwrite;
@@ -943,6 +966,16 @@ uchar *mi_state_info_read(uchar *ptr, MI_STATE_INFO *state)
 
   ptr+= state->state_diff_length;
 
+  if (!state->rec_per_key_part)
+  {
+    if (!my_multi_malloc(MY_WME,
+			 &state->rec_per_key_part,sizeof(long)*key_parts,
+			 &state->key_root, keys*sizeof(my_off_t),
+			 &state->key_del,  key_blocks*sizeof(my_off_t),
+                         NullS))
+      return(0);
+  }
+
   for (i=0; i < keys; i++)
   {
     state->key_root[i]= mi_sizekorr(ptr);	ptr +=8;
@@ -1022,7 +1055,7 @@ uint mi_base_info_write(File file, MI_BASE_INFO *base)
 }
 
 
-uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base)
+uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base)
 {
   base->keystart = mi_sizekorr(ptr);			ptr +=8;
   base->max_data_file_length = mi_sizekorr(ptr);	ptr +=8;
diff --git a/storage/myisam/mi_packrec.c b/storage/myisam/mi_packrec.c
index d8d892a5bc9..ceb087d67c9 100644
--- a/storage/myisam/mi_packrec.c
+++ b/storage/myisam/mi_packrec.c
@@ -105,6 +105,7 @@ static void init_bit_buffer(MI_BIT_BUFF *bit_buff,uchar *buffer,uint length);
 static uint fill_and_get_bits(MI_BIT_BUFF *bit_buff,uint count);
 static void fill_buffer(MI_BIT_BUFF *bit_buff);
 static uint max_bit(uint value);
+static uint read_pack_length(uint version, const uchar *buf, ulong *length);
 #ifdef HAVE_MMAP
 static uchar *_mi_mempack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
                                          MI_BLOCK_INFO *info, uchar **rec_buff_p,
@@ -1050,7 +1051,7 @@ static void uf_blob(MI_COLUMNDEF *rec, MI_BIT_BUFF *bit_buff,
       return;
     }
     decode_bytes(rec,bit_buff,bit_buff->blob_pos,bit_buff->blob_pos+length);
-    _my_store_blob_length((uchar*) to,pack_length,length);
+    _mi_store_blob_length((uchar*) to,pack_length,length);
     memcpy((char*) to+pack_length, &bit_buff->blob_pos, sizeof(char*));
     bit_buff->blob_pos+=length;
   }
@@ -1674,7 +1675,7 @@ uint save_pack_length(uint version, uchar *block_buff, ulong length)
 }
 
 
-uint read_pack_length(uint version, const uchar *buf, ulong *length)
+static uint read_pack_length(uint version, const uchar *buf, ulong *length)
 {
   if (buf[0] < 254)
   {
diff --git a/storage/myisam/mi_page.c b/storage/myisam/mi_page.c
index 90e31e72532..82acb801c90 100644
--- a/storage/myisam/mi_page.c
+++ b/storage/myisam/mi_page.c
@@ -87,10 +87,11 @@ int _mi_write_keypage(register MI_INFO *info, register MI_KEYDEF *keyinfo,
       info->state->key_file_length != page+length)
     length= ((mi_getint(buff)+IO_SIZE-1) & (uint) ~(IO_SIZE-1));
   DBUG_RETURN((key_cache_write(info->s->key_cache,
-                         info->s->kfile,page, level, (uchar*) buff,length,
-			 (uint) keyinfo->block_length,
-			 (int) ((info->lock_type != F_UNLCK) ||
-				info->s->delay_key_write))));
+			       info->s->kfile, &info->s->dirty_part_map,
+                               page, level, (uchar*) buff, length,
+			       (uint) keyinfo->block_length,
+			       (int) ((info->lock_type != F_UNLCK) ||
+				     info->s->delay_key_write))));
 } /* mi_write_keypage */
 
 
@@ -109,7 +110,8 @@ int _mi_dispose(register MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos,
   mi_sizestore(buff,old_link);
   info->s->state.changed|= STATE_NOT_SORTED_PAGES;
   DBUG_RETURN(key_cache_write(info->s->key_cache,
-                              info->s->kfile, pos , level, buff,
+                              info->s->kfile, &info->s->dirty_part_map,
+                              pos , level, buff,
 			      sizeof(buff),
 			      (uint) keyinfo->block_length,
 			      (int) (info->lock_type != F_UNLCK)));
diff --git a/storage/myisam/mi_panic.c b/storage/myisam/mi_panic.c
index 69865cfc0bb..e6a1d54a516 100644
--- a/storage/myisam/mi_panic.c
+++ b/storage/myisam/mi_panic.c
@@ -47,7 +47,8 @@ int mi_panic(enum ha_panic_function flag)
       if (info->s->options & HA_OPTION_READ_ONLY_DATA)
 	break;
 #endif
-      if (flush_key_blocks(info->s->key_cache, info->s->kfile, FLUSH_RELEASE))
+      if (flush_key_blocks(info->s->key_cache, info->s->kfile,
+                           &info->s->dirty_part_map, FLUSH_RELEASE))
 	error=my_errno;
       if (info->opt_flag & WRITE_CACHE_USED)
 	if (flush_io_cache(&info->rec_cache))
diff --git a/storage/myisam/mi_preload.c b/storage/myisam/mi_preload.c
index 31537f7054b..79d3db83796 100644
--- a/storage/myisam/mi_preload.c
+++ b/storage/myisam/mi_preload.c
@@ -65,7 +65,7 @@ int mi_preload(MI_INFO *info, ulonglong key_map, my_bool ignore_leaves)
     }
   }
   else
-    block_length= share->key_cache->key_cache_block_size;
+    block_length= share->key_cache->param_block_size;
 
   length= info->preload_buff_size/block_length * block_length;
   set_if_bigger(length, block_length);
@@ -73,7 +73,8 @@ int mi_preload(MI_INFO *info, ulonglong key_map, my_bool ignore_leaves)
   if (!(buff= (uchar *) my_malloc(length, MYF(MY_WME))))
     DBUG_RETURN(my_errno= HA_ERR_OUT_OF_MEM);
 
-  if (flush_key_blocks(share->key_cache,share->kfile, FLUSH_RELEASE))
+  if (flush_key_blocks(share->key_cache, share->kfile, &share->dirty_part_map,
+                       FLUSH_RELEASE))
     goto err;
 
   do
diff --git a/storage/myisam/mi_range.c b/storage/myisam/mi_range.c
index 28bf5cbe033..8f598200634 100644
--- a/storage/myisam/mi_range.c
+++ b/storage/myisam/mi_range.c
@@ -152,7 +152,7 @@ static ha_rows _mi_record_pos(MI_INFO *info, const uchar *key,
     operations with a comment like "Not real duplicates", whatever this
     means. From the condition above we can see that 'skip_end_space' is
     always false for these operations. The result is that trailing space
-    counts in key comparison and hence, emtpy strings ('', string length
+    counts in key comparison and hence, empty strings ('', string length
     zero, but not NULL) compare less that strings starting with control
     characters and these in turn compare less than strings starting with
     blanks.
@@ -166,7 +166,7 @@ static ha_rows _mi_record_pos(MI_INFO *info, const uchar *key,
 
     This is the reason that we add the SEARCH_UPDATE flag here. It makes
     the key estimation compare in the same way like key write operations
-    do. Olny so we will find the keys where they have been inserted.
+    do. Only so we will find the keys where they have been inserted.
 
     Adding the flag unconditionally does not hurt as it is used in the
     above mentioned condition only. So it can safely be used together
@@ -259,7 +259,7 @@ static uint _mi_keynr(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
                       uchar *keypos, uint *ret_max_key)
 {
   uint nod_flag,keynr,max_key;
-  uchar t_buff[MI_MAX_KEY_BUFF],*end;
+  uchar t_buff[HA_MAX_KEY_BUFF],*end;
 
   end= page+mi_getint(page);
   nod_flag=mi_test_if_nod(page);
diff --git a/storage/myisam/mi_rkey.c b/storage/myisam/mi_rkey.c
index d3744c9a053..f5b3514faf6 100644
--- a/storage/myisam/mi_rkey.c
+++ b/storage/myisam/mi_rkey.c
@@ -29,6 +29,7 @@ int mi_rkey(MI_INFO *info, uchar *buf, int inx, const uchar *key,
   MI_KEYDEF *keyinfo;
   HA_KEYSEG *last_used_keyseg;
   uint pack_key_length, use_key_length, nextflag;
+  ICP_RESULT res= ICP_NO_MATCH;
   DBUG_ENTER("mi_rkey");
   DBUG_PRINT("enter", ("base: 0x%lx  buf: 0x%lx  inx: %d  search_flag: %d",
                        (long) info, (long) buf, inx, search_flag));
@@ -85,6 +86,8 @@ int mi_rkey(MI_INFO *info, uchar *buf, int inx, const uchar *key,
     {
       mi_print_error(info->s, HA_ERR_CRASHED);
       my_errno=HA_ERR_CRASHED;
+      if (share->concurrent_insert)
+        rw_unlock(&share->key_root_lock[inx]);
       goto err;
     }
     break;
@@ -103,55 +106,62 @@ int mi_rkey(MI_INFO *info, uchar *buf, int inx, const uchar *key,
         saved the current data_file_length. Concurrent inserts always go
         to the end of the file. So we can test if the found key
         references a new record.
+
+        If we are searching for a partial key (or using >, >=, < or <=) and
+        the data is outside of the data file, we need to continue searching
+        for the first key inside the data file.
+
+        We do also continue searching if an index condition check function
+        is available.
       */
-      if (info->lastpos >= info->state->data_file_length)
+      while ((info->lastpos >= info->state->data_file_length &&
+              (search_flag != HA_READ_KEY_EXACT ||
+              last_used_keyseg != keyinfo->seg + keyinfo->keysegs)) ||
+             (info->index_cond_func && 
+              (res= mi_check_index_cond(info, inx, buf)) == ICP_NO_MATCH))
       {
-        /* The key references a concurrently inserted record. */
+        uint not_used[2];
+        /*
+          Skip rows that are inserted by other threads since we got a lock.
+          Note that this can only happen if we are not searching after a
+          full length exact key, because the keys are sorted
+          according to position.
+        */
+        if  (_mi_search_next(info, keyinfo, info->lastkey,
+                             info->lastkey_length,
+                             myisam_readnext_vec[search_flag],
+                             info->s->state.key_root[inx]))
+          break;
+        /*
+          Check that the found key does still match the search.
+          _mi_search_next() delivers the next key regardless of its
+          value.
+        */
         if (search_flag == HA_READ_KEY_EXACT &&
-            last_used_keyseg == keyinfo->seg + keyinfo->keysegs)
+            ha_key_cmp(keyinfo->seg, key_buff, info->lastkey, use_key_length,
+                       SEARCH_FIND, not_used))
         {
-          /* Simply ignore the key if it matches exactly. (Bug #29838) */
           my_errno= HA_ERR_KEY_NOT_FOUND;
           info->lastpos= HA_OFFSET_ERROR;
+          break;
         }
-        else
-        {
-          /*
-            If searching for a partial key (or using >, >=, < or <=) and
-            the data is outside of the data file, we need to continue
-            searching for the first key inside the data file.
-          */
-          do
-          {
-            uint not_used[2];
-            /*
-              Skip rows that are inserted by other threads since we got
-              a lock. Note that this can only happen if we are not
-              searching after a full length exact key, because the keys
-              are sorted according to position.
-            */
-            if  (_mi_search_next(info, keyinfo, info->lastkey,
-                                 info->lastkey_length,
-                                 myisam_readnext_vec[search_flag],
-                                 info->s->state.key_root[inx]))
-              break; /* purecov: inspected */
-            /*
-              Check that the found key does still match the search.
-              _mi_search_next() delivers the next key regardless of its
-              value.
-            */
-            if (search_flag == HA_READ_KEY_EXACT &&
-                ha_key_cmp(keyinfo->seg, key_buff, info->lastkey,
-                           use_key_length, SEARCH_FIND, not_used))
-            {
-              /* purecov: begin inspected */
-              my_errno= HA_ERR_KEY_NOT_FOUND;
-              info->lastpos= HA_OFFSET_ERROR;
-              break;
-              /* purecov: end */
-            }
-          } while (info->lastpos >= info->state->data_file_length);
-        }
+      }
+      if (res == ICP_OUT_OF_RANGE)
+      {
+        info->lastpos= HA_OFFSET_ERROR;
+        if (share->concurrent_insert)
+          rw_unlock(&share->key_root_lock[inx]);
+        DBUG_RETURN((my_errno= HA_ERR_KEY_NOT_FOUND));
+      }
+      /*
+        Error if no row found within the data file. (Bug #29838)
+        Do not overwrite my_errno if already at HA_OFFSET_ERROR.
+      */
+      if (info->lastpos != HA_OFFSET_ERROR &&
+          info->lastpos >= info->state->data_file_length)
+      {
+        info->lastpos= HA_OFFSET_ERROR;
+        my_errno= HA_ERR_KEY_NOT_FOUND;
       }
     }
   }
diff --git a/storage/myisam/mi_rnext.c b/storage/myisam/mi_rnext.c
index e1a78a04e57..43b071bc464 100644
--- a/storage/myisam/mi_rnext.c
+++ b/storage/myisam/mi_rnext.c
@@ -28,6 +28,7 @@ int mi_rnext(MI_INFO *info, uchar *buf, int inx)
 {
   int error,changed;
   uint flag;
+  ICP_RESULT res= 0;
   uint update_mask= HA_STATE_NEXT_FOUND;
   DBUG_ENTER("mi_rnext");
 
@@ -96,23 +97,36 @@ int mi_rnext(MI_INFO *info, uchar *buf, int inx)
     }
   }
 
-  if (info->s->concurrent_insert)
+  if (!error)
   {
-    if (!error)
+    while ((info->s->concurrent_insert &&
+            info->lastpos >= info->state->data_file_length) ||
+           (info->index_cond_func &&
+           (res= mi_check_index_cond(info, inx, buf)) == ICP_NO_MATCH))
     {
-      while (info->lastpos >= info->state->data_file_length)
-      {
-	/* Skip rows inserted by other threads since we got a lock */
-	if  ((error=_mi_search_next(info,info->s->keyinfo+inx,
-				    info->lastkey,
-				    info->lastkey_length,
-				    SEARCH_BIGGER,
-				    info->s->state.key_root[inx])))
-	  break;
-      }
+      /* 
+         Skip rows that are either inserted by other threads since
+         we got a lock or do not match pushed index conditions
+      */
+      if  ((error=_mi_search_next(info,info->s->keyinfo+inx,
+                                  info->lastkey,
+                                  info->lastkey_length,
+                                  SEARCH_BIGGER,
+                                  info->s->state.key_root[inx])))
+        break;
+    }
+    if (!error && res == ICP_OUT_OF_RANGE)
+    {
+      if (info->s->concurrent_insert)
+        rw_unlock(&info->s->key_root_lock[inx]);
+      info->lastpos= HA_OFFSET_ERROR;
+      DBUG_RETURN(my_errno= HA_ERR_END_OF_FILE);
     }
-    mysql_rwlock_unlock(&info->s->key_root_lock[inx]);
   }
+  
+  if (info->s->concurrent_insert)
+    mysql_rwlock_unlock(&info->s->key_root_lock[inx]);
+
 	/* Don't clear if database-changed */
   info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
   info->update|= update_mask;
diff --git a/storage/myisam/mi_rnext_same.c b/storage/myisam/mi_rnext_same.c
index 6779709fc80..54de367016b 100644
--- a/storage/myisam/mi_rnext_same.c
+++ b/storage/myisam/mi_rnext_same.c
@@ -75,8 +75,13 @@ int mi_rnext_same(MI_INFO *info, uchar *buf)
           info->lastpos= HA_OFFSET_ERROR;
           break;
         }
-        /* Skip rows that are inserted by other threads since we got a lock */
-        if (info->lastpos < info->state->data_file_length)
+        /*
+          Skip
+           - rows that are inserted by other threads since we got a lock
+           - rows that don't match the index condition */
+        if (info->lastpos < info->state->data_file_length && 
+            (!info->index_cond_func || 
+              mi_check_index_cond(info, inx, buf) != ICP_NO_MATCH))
           break;
       }
   }
diff --git a/storage/myisam/mi_rprev.c b/storage/myisam/mi_rprev.c
index f7dddefb647..89612b5a661 100644
--- a/storage/myisam/mi_rprev.c
+++ b/storage/myisam/mi_rprev.c
@@ -51,22 +51,36 @@ int mi_rprev(MI_INFO *info, uchar *buf, int inx)
     error=_mi_search(info,share->keyinfo+inx,info->lastkey,
 		     USE_WHOLE_KEY, flag, share->state.key_root[inx]);
 
-  if (share->concurrent_insert)
+  if (!error)
   {
-    if (!error)
+    int res= 0;
+    while ((share->concurrent_insert && 
+            info->lastpos >= info->state->data_file_length) ||
+           (info->index_cond_func &&
+            !(res= mi_check_index_cond(info, inx, buf))))
     {
-      while (info->lastpos >= info->state->data_file_length)
-      {
-	/* Skip rows that are inserted by other threads since we got a lock */
-	if  ((error=_mi_search_next(info,share->keyinfo+inx,info->lastkey,
-				    info->lastkey_length,
-				    SEARCH_SMALLER,
-				    share->state.key_root[inx])))
-	  break;
-      }
+      /* 
+         Skip rows that are either inserted by other threads since
+         we got a lock or do not match pushed index conditions
+      */
+      if  ((error=_mi_search_next(info,share->keyinfo+inx,info->lastkey,
+                                  info->lastkey_length,
+                                  SEARCH_SMALLER,
+                                  share->state.key_root[inx])))
+        break;
+    }
+    if (!error && res == 2) 
+    {
+      if (share->concurrent_insert)
+        rw_unlock(&share->key_root_lock[inx]);
+      info->lastpos= HA_OFFSET_ERROR;
+      DBUG_RETURN(my_errno= HA_ERR_END_OF_FILE);
     }
-    mysql_rwlock_unlock(&share->key_root_lock[inx]);
   }
+
+  if (share->concurrent_insert)
+    mysql_rwlock_unlock(&share->key_root_lock[inx]);
+
   info->update&= (HA_STATE_CHANGED | HA_STATE_ROW_CHANGED);
   info->update|= HA_STATE_PREV_FOUND;
   if (error)
diff --git a/storage/myisam/mi_search.c b/storage/myisam/mi_search.c
index 61ca3c37863..89d1b801695 100644
--- a/storage/myisam/mi_search.c
+++ b/storage/myisam/mi_search.c
@@ -66,7 +66,7 @@ int _mi_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
   int error,flag;
   uint nod_flag;
   uchar *keypos,*maxpos;
-  uchar lastkey[MI_MAX_KEY_BUFF],*buff;
+  uchar lastkey[HA_MAX_KEY_BUFF],*buff;
   DBUG_ENTER("_mi_search");
   DBUG_PRINT("enter",("pos: %lu  nextflag: %u  lastpos: %lu",
                       (ulong) pos, nextflag, (ulong) info->lastpos));
@@ -248,7 +248,7 @@ int _mi_seq_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
 {
   int UNINIT_VAR(flag);
   uint nod_flag,UNINIT_VAR(length),not_used[2];
-  uchar t_buff[MI_MAX_KEY_BUFF],*end;
+  uchar t_buff[HA_MAX_KEY_BUFF],*end;
   DBUG_ENTER("_mi_seq_search");
 
   end= page+mi_getint(page);
@@ -300,8 +300,8 @@ int _mi_prefix_search(MI_INFO *info, register MI_KEYDEF *keyinfo, uchar *page,
   uint UNINIT_VAR(prefix_len), suffix_len;
   int key_len_skip, UNINIT_VAR(seg_len_pack), key_len_left;
   uchar *end, *kseg, *vseg;
-  uchar *sort_order=keyinfo->seg->charset->sort_order;
-  uchar tt_buff[MI_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
+  const uchar *sort_order= keyinfo->seg->charset->sort_order;
+  uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2;
   uchar *UNINIT_VAR(saved_from), *UNINIT_VAR(saved_to);
   uchar *UNINIT_VAR(saved_vseg);
   uint  saved_length=0, saved_prefix_len=0;
@@ -919,7 +919,7 @@ uint _mi_get_binary_pack_key(register MI_KEYDEF *keyinfo, uint nod_flag,
   DBUG_ENTER("_mi_get_binary_pack_key");
 
   page= *page_pos;
-  page_end=page+MI_MAX_KEY_BUFF+1;
+  page_end=page+HA_MAX_KEY_BUFF+1;
   start_key=key;
 
   /*
@@ -1237,7 +1237,7 @@ int _mi_search_next(register MI_INFO *info, register MI_KEYDEF *keyinfo,
 {
   int error;
   uint nod_flag;
-  uchar lastkey[MI_MAX_KEY_BUFF];
+  uchar lastkey[HA_MAX_KEY_BUFF];
   DBUG_ENTER("_mi_search_next");
   DBUG_PRINT("enter",("nextflag: %u  lastpos: %lu  int_keypos: %lu",
                       nextflag, (ulong) info->lastpos,
@@ -1467,7 +1467,8 @@ _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,uchar *next_key,
   int length;
   uint key_length,ref_length,org_key_length=0,
        length_pack,new_key_length,diff_flag,pack_marker;
-  uchar *start,*end,*key_end,*sort_order;
+  uchar *start,*end,*key_end;
+  const uchar *sort_order;
   my_bool same_length;
 
   length_pack=s_temp->ref_length=s_temp->n_ref_length=s_temp->n_length=0;
@@ -1748,7 +1749,7 @@ _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,uchar *next_key,
   uint length,key_length,ref_length;
 
   s_temp->totlength=key_length=_mi_keylength(keyinfo,key)+nod_flag;
-#ifdef HAVE_purify
+#ifdef HAVE_valgrind
   s_temp->n_length= s_temp->n_ref_length=0;	/* For valgrind */
 #endif
   s_temp->key=key;
@@ -1801,13 +1802,13 @@ _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,uchar *next_key,
     }
     /* Check how many characters are identical to next key */
     key= s_temp->key+next_length;
+    s_temp->prev_length= 0;
     while (*key++ == *next_key++) ;
     if ((ref_length= (uint) (key - s_temp->key)-1) == next_length)
     {
       s_temp->next_key_pos=0;
       return length;                            /* can't pack next key */
     }
-    s_temp->prev_length=0;
     s_temp->n_ref_length=ref_length;
     return (int) (length-(ref_length - next_length) - next_length_pack +
                   get_pack_length(ref_length));
diff --git a/storage/myisam/mi_test1.c b/storage/myisam/mi_test1.c
index f89f2a8d21d..e9dbc7c6a69 100644
--- a/storage/myisam/mi_test1.c
+++ b/storage/myisam/mi_test1.c
@@ -49,7 +49,8 @@ int main(int argc,char *argv[])
   MY_INIT(argv[0]);
   my_init();
   if (key_cacheing)
-    init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,IO_SIZE*16,0,0);
+    init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,IO_SIZE*16,0,0,
+                   DEFAULT_KEY_CACHE_PARTITIONS);
   get_options(argc,argv);
 
   exit(run_test("test1"));
@@ -79,6 +80,8 @@ static int run_test(const char *filename)
   recinfo[2].length= (extra_field == FIELD_BLOB ? 4 + portable_sizeof_char_ptr : 24);
   if (extra_field == FIELD_VARCHAR)
     recinfo[2].length+= HA_VARCHAR_PACKLENGTH(recinfo[2].length);
+  recinfo[1].null_bit= null_fields ? 2 : 0;
+
   if (opt_unique)
   {
     recinfo[3].type=FIELD_CHECK;
@@ -630,7 +633,7 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
     key_type= HA_KEYTYPE_VARTEXT1;
     break;
   case 'k':
-    if (key_length < 4 || key_length > MI_MAX_KEY_LENGTH)
+    if (key_length < 4 || key_length > HA_MAX_KEY_LENGTH)
     {
       fprintf(stderr,"Wrong key length\n");
       exit(1);
diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c
index 127d93b5433..9babf7ad4f0 100644
--- a/storage/myisam/mi_test2.c
+++ b/storage/myisam/mi_test2.c
@@ -18,9 +18,6 @@
 #ifndef USE_MY_FUNC		/* We want to be able to dbug this !! */
 #define USE_MY_FUNC
 #endif
-#ifdef DBUG_OFF
-#undef DBUG_OFF
-#endif
 #include "myisamdef.h"
 #include <m_ctype.h>
 #include <my_bit.h>
@@ -40,7 +37,7 @@ static void copy_key(struct st_myisam_info *info,uint inx,
 		     uchar *record,uchar *key);
 
 static	int verbose=0,testflag=0,
-	    first_key=0,async_io=0,key_cacheing=0,write_cacheing=0,locking=0,
+	    first_key=0,async_io=0,key_cacheing=0,write_cacheing=0,do_locking=0,
             rec_pointer_size=0,pack_fields=1,use_log=0,silent=0,
             opt_quick_mode=0;
 static int pack_seg=HA_SPACE_PACK,pack_type=HA_PACK_KEY,remove_count=-1,
@@ -218,8 +215,9 @@ int main(int argc, char *argv[])
   if (!silent)
     printf("- Writing key:s\n");
   if (key_cacheing)
-    init_key_cache(dflt_key_cache,key_cache_block_size,key_cache_size,0,0);
-  if (locking)
+    init_key_cache(dflt_key_cache,key_cache_block_size,key_cache_size,0,0,
+                   DEFAULT_KEY_CACHE_PARTITIONS);
+  if (do_locking)
     mi_lock_database(file,F_WRLCK);
   if (write_cacheing)
     mi_extra(file,HA_EXTRA_WRITE_CACHE,0);
@@ -331,9 +329,9 @@ int main(int argc, char *argv[])
       if (use_blob)
       {
 	if (i & 1)
-	  put_blob_in_record(record+blob_pos,&blob_buffer);
+	  put_blob_in_record(record2+blob_pos,&blob_buffer);
 	else
-	  bmove(record+blob_pos,read_record+blob_pos,8);
+	  bmove(record2+blob_pos,read_record+blob_pos,8);
       }
       if (mi_update(file,read_record,record2))
       {
@@ -603,7 +601,7 @@ int main(int argc, char *argv[])
     if (mi_rsame(file,read_record2,(int) i)) goto err;
     if (memcmp(read_record,read_record2,reclength) != 0)
     {
-      printf("is_rsame didn't find same record\n");
+      printf("mi_rsame didn't find same record\n");
       goto end;
     }
   }
@@ -654,10 +652,10 @@ int main(int argc, char *argv[])
       sprintf((char*) key2,"%6d",k);
 
       min_key.key= key;
-      min_key.length= USE_WHOLE_KEY;
+      min_key.keypart_map= HA_WHOLE_KEY;
       min_key.flag= HA_READ_AFTER_KEY;
       max_key.key= key2;
-      max_key.length= USE_WHOLE_KEY;
+      max_key.keypart_map= HA_WHOLE_KEY;
       max_key.flag= HA_READ_BEFORE_KEY;
       range_records= mi_records_in_range(file, 0, &min_key, &max_key);
       records=0;
@@ -710,7 +708,7 @@ int main(int argc, char *argv[])
     printf("- mi_extra(CACHE) + mi_rrnd.... + mi_extra(NO_CACHE)\n");
   if (mi_reset(file) || mi_extra(file,HA_EXTRA_CACHE,0))
   {
-    if (locking || (!use_blob && !pack_fields))
+    if (do_locking || (!use_blob && !pack_fields))
     {
       puts("got error from mi_extra(HA_EXTRA_CACHE)");
       goto end;
@@ -777,9 +775,8 @@ int main(int argc, char *argv[])
       {
 	ulong blob_length,pos;
 	uchar *ptr;
-	longget(blob_length,read_record+blob_pos+4);
-	ptr=(uchar*) blob_length;
-	longget(blob_length,read_record+blob_pos);
+	memcpy_fixed(&ptr, read_record+blob_pos+4, sizeof(ptr));
+        blob_length= uint4korr(read_record+blob_pos);
 	for (pos=0 ; pos < blob_length ; pos++)
 	{
 	  if (ptr[pos] != (uchar) (blob_length+pos))
@@ -815,6 +812,8 @@ end:
   mi_panic(HA_PANIC_CLOSE);			/* Should close log */
   if (!silent)
   {
+    KEY_CACHE_STATISTICS stats;
+    
     printf("\nFollowing test have been made:\n");
     printf("Write records: %d\nUpdate records: %d\nSame-key-read: %d\nDelete records: %d\n", write_count,update,dupp_keys,opt_delete);
     if (rec_pointer_size)
@@ -831,12 +830,13 @@ end:
       puts("Write cacheing used");
     if (write_cacheing)
       puts("quick mode");
-    if (async_io && locking)
+    if (async_io && do_locking)
       puts("Asyncron io with locking used");
-    else if (locking)
+    else if (do_locking)
       puts("Locking used");
     if (use_blob)
       puts("blobs used");
+    get_key_cache_statistics(dflt_key_cache, 0, &stats);
     printf("key cache status: \n\
 blocks used:%10lu\n\
 not flushed:%10lu\n\
@@ -844,12 +844,12 @@ w_requests: %10lu\n\
 writes:     %10lu\n\
 r_requests: %10lu\n\
 reads:      %10lu\n",
-           dflt_key_cache->blocks_used,
-           dflt_key_cache->global_blocks_changed,
-           (ulong) dflt_key_cache->global_cache_w_requests,
-           (ulong) dflt_key_cache->global_cache_write,
-           (ulong) dflt_key_cache->global_cache_r_requests,
-           (ulong) dflt_key_cache->global_cache_read);
+           (ulong) stats.blocks_used,
+           (ulong) stats.blocks_changed,
+           (ulong) stats.write_requests,
+           (ulong) stats.writes,
+           (ulong) stats.read_requests,
+           (ulong) stats.reads);
   }
   end_key_cache(dflt_key_cache,1);
   if (blob_buffer)
@@ -902,7 +902,7 @@ static void get_options(int argc, char **argv)
       use_log=1;
       break;
     case 'L':
-      locking=1;
+      do_locking=1;
       break;
     case 'A':				/* use asyncron io */
       async_io=1;
diff --git a/storage/myisam/mi_test3.c b/storage/myisam/mi_test3.c
index c03a34df227..742fd06b0e3 100644
--- a/storage/myisam/mi_test3.c
+++ b/storage/myisam/mi_test3.c
@@ -15,6 +15,8 @@
 
 /* Test av locking */
 
+#ifndef _WIN32 /*no fork() in Windows*/
+
 #include "myisam.h"
 #include <sys/types.h>
 #ifdef HAVE_SYS_WAIT_H
@@ -175,8 +177,10 @@ void start_test(int id)
     exit(1);
   }
   if (key_cacheing && rnd(2) == 0)
-    init_key_cache(dflt_key_cache, KEY_CACHE_BLOCK_SIZE, 65536L, 0, 0);
-  printf("Process %d, pid: %d\n",id,getpid()); fflush(stdout);
+    init_key_cache(dflt_key_cache, KEY_CACHE_BLOCK_SIZE, 65536L, 0, 0,
+                   DEFAULT_KEY_CACHE_PARTITIONS);
+  printf("Process %d, pid: %ld\n", id, (long) getpid());
+  fflush(stdout);
 
   for (error=i=0 ; i < tests && !error; i++)
   {
@@ -360,7 +364,7 @@ int test_write(MI_INFO *file,int id,int lock_type)
       mi_extra(file,HA_EXTRA_WRITE_CACHE,0);
   }
 
-  sprintf((char*) record.id,"%7d",getpid());
+  sprintf((char*) record.id,"%7ld",(long) getpid());
   strnmov((char*) record.text,"Testing...", sizeof(record.text));
 
   tries=(uint) rnd(100)+10;
@@ -487,3 +491,14 @@ int test_update(MI_INFO *file,int id,int lock_type)
 }
 
 #include "mi_extrafunc.h"
+#else /* _WIN32 */
+
+#include <stdio.h>
+
+int main()
+{
+	fprintf(stderr,"this test has not been ported to Windows\n");
+	return 0;
+}
+
+#endif /* _WIN32 */
diff --git a/storage/myisam/mi_test_all.sh b/storage/myisam/mi_test_all.sh
index 5989d9cfaf0..c6bc686e885 100755
--- a/storage/myisam/mi_test_all.sh
+++ b/storage/myisam/mi_test_all.sh
@@ -5,6 +5,7 @@
 
 valgrind="valgrind --alignment=8 --leak-check=yes"
 silent="-s"
+rm -f test1.TMD
 
 if test -f mi_test1$MACH ; then suffix=$MACH ; else suffix=""; fi
 ./mi_test1$suffix $silent
diff --git a/storage/myisam/mi_update.c b/storage/myisam/mi_update.c
index b538bcd0bb1..6d4150e5b79 100644
--- a/storage/myisam/mi_update.c
+++ b/storage/myisam/mi_update.c
@@ -23,7 +23,7 @@ int mi_update(register MI_INFO *info, const uchar *oldrec, uchar *newrec)
   int flag,key_changed,save_errno;
   reg3 my_off_t pos;
   uint i;
-  uchar old_key[MI_MAX_KEY_BUFF],*new_key;
+  uchar old_key[HA_MAX_KEY_BUFF],*new_key;
   my_bool auto_key_changed=0;
   ulonglong changed;
   MYISAM_SHARE *share=info->s;
diff --git a/storage/myisam/mi_write.c b/storage/myisam/mi_write.c
index bd56bb04f65..7b40a11f7c6 100644
--- a/storage/myisam/mi_write.c
+++ b/storage/myisam/mi_write.c
@@ -267,7 +267,7 @@ int _mi_ck_write_btree(register MI_INFO *info, uint keynr, uchar *key,
     comp_flag=SEARCH_BIGGER;			/* Put after same key */
   else if (keyinfo->flag & (HA_NOSAME|HA_FULLTEXT))
   {
-    comp_flag=SEARCH_FIND | SEARCH_UPDATE;	/* No duplicates */
+    comp_flag=SEARCH_FIND | SEARCH_UPDATE | SEARCH_INSERT; /* No duplicates */
     if (keyinfo->flag & HA_NULL_ARE_EQUAL)
       comp_flag|= SEARCH_NULL_ARE_EQUAL;
   }
@@ -341,7 +341,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
   int error,flag;
   uint nod_flag, search_key_length;
   uchar *temp_buff,*keypos;
-  uchar keybuff[MI_MAX_KEY_BUFF];
+  uchar keybuff[HA_MAX_KEY_BUFF];
   my_bool was_last_key;
   my_off_t next_page, dupp_key_pos;
   DBUG_ENTER("w_search");
@@ -349,7 +349,7 @@ static int w_search(register MI_INFO *info, register MI_KEYDEF *keyinfo,
 
   search_key_length= (comp_flag & SEARCH_FIND) ? key_length : USE_WHOLE_KEY;
   if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+
-				      MI_MAX_KEY_BUFF*2)))
+				      HA_MAX_KEY_BUFF*2)))
     DBUG_RETURN(-1);
   if (!_mi_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0))
     goto err;
@@ -697,21 +697,23 @@ uchar *_mi_find_half_pos(uint nod_flag, MI_KEYDEF *keyinfo, uchar *page,
 } /* _mi_find_half_pos */
 
 
-	/*
-	  Split buffer at last key
-	  Returns pointer to the start of the key before the last key
-	  key will contain the last key
-	*/
+/*
+  Split buffer at last key
+  Returns pointer to the start of the key before the last key
+  key will contain the last key
+*/
 
 static uchar *_mi_find_last_pos(MI_KEYDEF *keyinfo, uchar *page,
 				uchar *key, uint *return_key_length,
 				uchar **after_key)
 {
-  uint keys,length,UNINIT_VAR(last_length),key_ref_length;
-  uchar *end,*lastpos,*UNINIT_VAR(prevpos);
-  uchar key_buff[MI_MAX_KEY_BUFF];
+  uint keys,length,last_length,key_ref_length;
+  uchar *end,*lastpos,*prevpos;
+  uchar key_buff[HA_MAX_KEY_BUFF];
   DBUG_ENTER("_mi_find_last_pos");
 
+  LINT_INIT(last_length);
+
   key_ref_length=2;
   length=mi_getint(page)-key_ref_length;
   page+=key_ref_length;
@@ -728,10 +730,12 @@ static uchar *_mi_find_last_pos(MI_KEYDEF *keyinfo, uchar *page,
   }
 
   end=page+length-key_ref_length;
+  DBUG_ASSERT(page < end);
   *key='\0';
   length=0;
   lastpos=page;
-  while (page < end)
+
+  do
   {
     prevpos=lastpos; lastpos=page;
     last_length=length;
@@ -742,7 +746,8 @@ static uchar *_mi_find_last_pos(MI_KEYDEF *keyinfo, uchar *page,
       my_errno=HA_ERR_CRASHED;
       DBUG_RETURN(0);
     }
-  }
+  } while (page < end);
+
   *return_key_length=last_length;
   *after_key=lastpos;
   DBUG_PRINT("exit",("returns: 0x%lx  page: 0x%lx  end: 0x%lx",
@@ -764,7 +769,7 @@ static int _mi_balance_page(register MI_INFO *info, MI_KEYDEF *keyinfo,
        length,keys;
   uchar *pos,*buff,*extra_buff;
   my_off_t next_page,new_pos;
-  uchar tmp_part_key[MI_MAX_KEY_BUFF];
+  uchar tmp_part_key[HA_MAX_KEY_BUFF];
   DBUG_ENTER("_mi_balance_page");
 
   k_length=keyinfo->keylength;
@@ -930,7 +935,7 @@ static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param)
     Probably I can use info->lastkey here, but I'm not sure,
     and to be safe I'd better use local lastkey.
   */
-  uchar lastkey[MI_MAX_KEY_BUFF];
+  uchar lastkey[HA_MAX_KEY_BUFF];
   uint keylen;
   MI_KEYDEF *keyinfo;
 
diff --git a/storage/myisam/myisam_ftdump.c b/storage/myisam/myisam_ftdump.c
index 1c534fe8d02..d51e079625e 100644
--- a/storage/myisam/myisam_ftdump.c
+++ b/storage/myisam/myisam_ftdump.c
@@ -53,7 +53,7 @@ static struct my_option my_long_options[] =
 
 int main(int argc,char *argv[])
 {
-  int error=0, subkeys;
+  int error=0;
   uint keylen, keylen2=0, inx, doc_cnt=0;
   float weight= 1.0;
   double gws, min_gws=0, avg_gws=0;
@@ -83,7 +83,7 @@ int main(int argc,char *argv[])
       usage();
   }
 
-  init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0);
+  init_key_cache(dflt_key_cache,MI_KEY_BLOCK_LENGTH,USE_BUFFER_INIT, 0, 0, 0);
 
   if (!(info=mi_open(argv[0], O_RDONLY,
                      HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
@@ -109,11 +109,12 @@ int main(int argc,char *argv[])
 
   while (!(error=mi_rnext(info,NULL,inx)))
   {
+    FT_WEIGTH subkeys;
     keylen=*(info->lastkey);
 
-    subkeys=ft_sintXkorr(info->lastkey+keylen+1);
-    if (subkeys >= 0)
-      ft_floatXget(weight, info->lastkey+keylen+1);
+    subkeys.i =ft_sintXkorr(info->lastkey+keylen+1);
+    if (subkeys.i >= 0)
+      weight= subkeys.f;
 
 #ifdef HAVE_SNPRINTF
     snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey+1);
@@ -150,14 +151,14 @@ int main(int argc,char *argv[])
         keylen2=keylen;
         doc_cnt=0;
       }
-      doc_cnt+= (subkeys >= 0 ? 1 : -subkeys);
+      doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i);
     }
     if (dump)
     {
-      if (subkeys>=0)
+      if (subkeys.i >= 0)
         printf("%9lx %20.7f %s\n", (long) info->lastpos,weight,buf);
       else
-        printf("%9lx => %17d %s\n",(long) info->lastpos,-subkeys,buf);
+        printf("%9lx => %17d %s\n",(long) info->lastpos,-subkeys.i,buf);
     }
     if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
       printf("%10ld\r",total);
diff --git a/storage/myisam/myisamchk.c b/storage/myisam/myisamchk.c
index 4df76e31872..f8eb3cb1bde 100644
--- a/storage/myisam/myisamchk.c
+++ b/storage/myisam/myisamchk.c
@@ -16,7 +16,6 @@
 /* Describe, check and repair of MyISAM tables */
 
 #include "fulltext.h"
-
 #include <m_ctype.h>
 #include <stdarg.h>
 #include <my_getopt.h>
@@ -35,7 +34,6 @@ static const char *set_collation_name, *opt_tmpdir;
 static CHARSET_INFO *set_collation;
 static long opt_myisam_block_size;
 static long opt_key_cache_block_size;
-static const char *my_progname_short;
 static int stopwords_inited= 0;
 static MY_TMPDIR myisamchk_tmpdir;
 
@@ -62,9 +60,9 @@ static const char *myisam_stats_method_str="nulls_unequal";
 static void get_options(int *argc,char * * *argv);
 static void print_version(void);
 static void usage(void);
-static int myisamchk(MI_CHECK *param, char *filename);
-static void descript(MI_CHECK *param, register MI_INFO *info, char * name);
-static int mi_sort_records(MI_CHECK *param, register MI_INFO *info,
+static int myisamchk(HA_CHECK *param, char *filename);
+static void descript(HA_CHECK *param, register MI_INFO *info, char * name);
+static int mi_sort_records(HA_CHECK *param, register MI_INFO *info,
                            char * name, uint sort_key,
 			   my_bool write_info, my_bool update_index);
 static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info,
@@ -72,15 +70,16 @@ static int sort_record_index(MI_SORT_PARAM *sort_param, MI_INFO *info,
 			     my_off_t page,uchar *buff,uint sortkey,
 			     File new_file, my_bool update_index);
 
-MI_CHECK check_param;
+HA_CHECK check_param;
 
 	/* Main program */
 
 int main(int argc, char **argv)
 {
   int error;
+  uchar rc;
   MY_INIT(argv[0]);
-  my_progname_short= my_progname+dirname_length(my_progname);
+  my_progname_short= "myisamchk";
 
   myisamchk_init(&check_param);
   check_param.opt_lock_memory=1;		/* Lock memory if possible */
@@ -100,7 +99,7 @@ int main(int argc, char **argv)
 	(!(check_param.testflag & (T_REP | T_REP_BY_SORT | T_SORT_RECORDS |
 				   T_SORT_INDEX))))
     {
-      uint old_testflag=check_param.testflag;
+      ulonglong old_testflag=check_param.testflag;
       if (!(check_param.testflag & T_REP))
 	check_param.testflag|= T_REP_BY_SORT;
       check_param.testflag&= ~T_EXTEND;			/* Don't needed  */
@@ -129,7 +128,8 @@ int main(int argc, char **argv)
   free_tmpdir(&myisamchk_tmpdir);
   ft_free_stopwords();
   my_end(check_param.testflag & T_INFO ? MY_CHECK_ERROR | MY_GIVE_INFO : MY_CHECK_ERROR);
-  exit(error);
+  rc= (uchar) error;
+  exit(rc);
 #ifndef _lint
   return 0;				/* No compiler warning */
 #endif
@@ -158,7 +158,7 @@ static struct my_option my_long_options[] =
    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"character-sets-dir", OPT_CHARSETS_DIR,
    "Directory where character sets are.",
-   &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+   (char**) &charsets_dir, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"check", 'c',
    "Check table for errors.",
    0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
@@ -254,8 +254,7 @@ static struct my_option my_long_options[] =
    &check_param.opt_sort_key,
    0, GET_UINT, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"tmpdir", 't',
-   "Path for temporary files.",
-   &opt_tmpdir,
+   "Path for temporary files.", (char**) &opt_tmpdir,
    0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"update-state", 'U',
    "Mark tables as crashed if any errors were found.",
@@ -314,13 +313,13 @@ static struct my_option my_long_options[] =
     HA_FT_MAXCHARLEN, 0, 1, 0},
   { "ft_stopword_file", OPT_FT_STOPWORD_FILE,
     "Use stopwords from this file instead of built-in list.",
-    &ft_stopword_file, &ft_stopword_file, 0, GET_STR,
+    (char**) &ft_stopword_file, (char**) &ft_stopword_file, 0, GET_STR,
     REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"stats_method", OPT_STATS_METHOD,
    "Specifies how index statistics collection code should treat NULLs. "
    "Possible values of name are \"nulls_unequal\" (default behavior for 4.1/5.0), "
    "\"nulls_equal\" (emulate 4.0 behavior), and \"nulls_ignored\".",
-   &myisam_stats_method_str, &myisam_stats_method_str, 0,
+   (char**) &myisam_stats_method_str, (char**) &myisam_stats_method_str, 0,
     GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
 };
@@ -671,7 +670,7 @@ get_one_option(int optid,
   case OPT_STATS_METHOD:
   {
     int method;
-    enum_mi_stats_method method_conv;
+    enum_handler_stats_method method_conv;
     LINT_INIT(method_conv);
     myisam_stats_method_str= argument;
     if ((method=find_type(argument, &myisam_stats_method_typelib, 2)) <= 0)
@@ -772,10 +771,10 @@ static void get_options(register int *argc,register char ***argv)
 
 	/* Check table */
 
-static int myisamchk(MI_CHECK *param, char * filename)
+static int myisamchk(HA_CHECK *param, char * filename)
 {
   int error,lock_type,recreate;
-  int rep_quick= param->testflag & (T_QUICK | T_FORCE_UNIQUENESS);
+  int rep_quick= test(param->testflag & (T_QUICK | T_FORCE_UNIQUENESS));
   MI_INFO *info;
   File datafile;
   char llbuff[22],llbuff2[22];
@@ -913,7 +912,7 @@ static int myisamchk(MI_CHECK *param, char * filename)
       param->testflag|=T_REP_BY_SORT;		/* if only STATISTICS */
       if (!(param->testflag & T_SILENT))
 	printf("- '%s' has old table-format. Recreating index\n",filename);
-      rep_quick|=T_QUICK;
+      rep_quick= 1;
     }
     share=info->s;
     share->tot_locks-= share->r_locks;
@@ -1074,7 +1073,7 @@ static int myisamchk(MI_CHECK *param, char * filename)
       {
 	if (param->testflag & (T_EXTEND | T_MEDIUM))
 	  (void) init_key_cache(dflt_key_cache,opt_key_cache_block_size,
-                              param->use_buffers, 0, 0);
+                              param->use_buffers, 0, 0, 0);
 	(void) init_io_cache(&param->read_cache,datafile,
 			   (uint) param->read_buffer_length,
 			   READ_CACHE,
@@ -1087,8 +1086,9 @@ static int myisamchk(MI_CHECK *param, char * filename)
 	if ((info->s->options & (HA_OPTION_PACK_RECORD |
 				 HA_OPTION_COMPRESS_RECORD)) ||
 	    (param->testflag & (T_EXTEND | T_MEDIUM)))
-	  error|=chk_data_link(param, info, param->testflag & T_EXTEND);
-	error|=flush_blocks(param, share->key_cache, share->kfile);
+	  error|=chk_data_link(param, info, test(param->testflag & T_EXTEND));
+	error|=flush_blocks(param, share->key_cache, share->kfile,
+                            &share->dirty_part_map);
 	(void) end_io_cache(&param->read_cache);
       }
       if (!error)
@@ -1171,7 +1171,7 @@ end2:
 
 	 /* Write info about table */
 
-static void descript(MI_CHECK *param, register MI_INFO *info, char * name)
+static void descript(HA_CHECK *param, register MI_INFO *info, char * name)
 {
   uint key,keyseg_nr,field,start;
   reg3 MI_KEYDEF *keyinfo;
@@ -1276,7 +1276,7 @@ static void descript(MI_CHECK *param, register MI_INFO *info, char * name)
   printf("Recordlength:        %13d\n",(int) share->base.pack_reclength);
   if (! mi_is_all_keys_active(share->state.key_map, share->base.keys))
   {
-    longlong2str(share->state.key_map,buff,2);
+    longlong2str(share->state.key_map,buff,2,1);
     printf("Using only keys '%s' of %d possibly keys\n",
 	   buff, share->base.keys);
   }
@@ -1429,7 +1429,7 @@ static void descript(MI_CHECK *param, register MI_INFO *info, char * name)
 
 	/* Sort records according to one key */
 
-static int mi_sort_records(MI_CHECK *param,
+static int mi_sort_records(HA_CHECK *param,
 			   register MI_INFO *info, char * name,
 			   uint sort_key,
 			   my_bool write_info,
@@ -1443,7 +1443,7 @@ static int mi_sort_records(MI_CHECK *param,
   ha_rows old_record_count;
   MYISAM_SHARE *share=info->s;
   char llbuff[22],llbuff2[22];
-  SORT_INFO sort_info;
+  MI_SORT_INFO sort_info;
   MI_SORT_PARAM sort_param;
   DBUG_ENTER("sort_records");
 
@@ -1489,7 +1489,7 @@ static int mi_sort_records(MI_CHECK *param,
     DBUG_RETURN(0);				/* Nothing to do */
 
   init_key_cache(dflt_key_cache, opt_key_cache_block_size,
-                 (size_t) param->use_buffers, 0, 0);
+                 (size_t) param->use_buffers, 0, 0, 0);
   if (init_io_cache(&info->rec_cache,-1,(uint) param->write_buffer_length,
 		   WRITE_CACHE,share->pack.header_length,1,
 		   MYF(MY_WME | MY_WAIT_IF_FULL)))
@@ -1599,8 +1599,8 @@ err:
   my_free(sort_info.buff);
   sort_info.buff=0;
   share->state.sortkey=sort_key;
-  DBUG_RETURN(flush_blocks(param, share->key_cache, share->kfile) |
-	      got_error);
+  DBUG_RETURN(flush_blocks(param, share->key_cache, share->kfile,
+                           &share->dirty_part_map) | got_error);
 } /* sort_records */
 
 
@@ -1614,10 +1614,10 @@ static int sort_record_index(MI_SORT_PARAM *sort_param,MI_INFO *info,
   uint	nod_flag,used_length,key_length;
   uchar *temp_buff,*keypos,*endpos;
   my_off_t next_page,rec_pos;
-  uchar lastkey[MI_MAX_KEY_BUFF];
+  uchar lastkey[HA_MAX_KEY_BUFF];
   char llbuff[22];
-  SORT_INFO *sort_info= sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info= sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   DBUG_ENTER("sort_record_index");
 
   nod_flag=mi_test_if_nod(buff);
@@ -1701,17 +1701,15 @@ err:
   sorting
 */
 
-static int not_killed= 0;
-
-volatile int *killed_ptr(MI_CHECK *param __attribute__((unused)))
+int killed_ptr(HA_CHECK *param __attribute__((unused)))
 {
-  return &not_killed;			/* always NULL */
+  return 0;
 }
 
 	/* print warnings and errors */
 	/* VARARGS */
 
-void mi_check_print_info(MI_CHECK *param __attribute__((unused)),
+void mi_check_print_info(HA_CHECK *param __attribute__((unused)),
 			 const char *fmt,...)
 {
   va_list args;
@@ -1724,7 +1722,7 @@ void mi_check_print_info(MI_CHECK *param __attribute__((unused)),
 
 /* VARARGS */
 
-void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_warning(HA_CHECK *param, const char *fmt,...)
 {
   va_list args;
   DBUG_ENTER("mi_check_print_warning");
@@ -1749,7 +1747,7 @@ void mi_check_print_warning(MI_CHECK *param, const char *fmt,...)
 
 /* VARARGS */
 
-void mi_check_print_error(MI_CHECK *param, const char *fmt,...)
+void mi_check_print_error(HA_CHECK *param, const char *fmt,...)
 {
   va_list args;
   DBUG_ENTER("mi_check_print_error");
diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h
index c7f0cb27a40..304150601d7 100644
--- a/storage/myisam/myisamdef.h
+++ b/storage/myisam/myisamdef.h
@@ -15,8 +15,8 @@
 
 /* This file is included by all internal myisam files */
 
-#include "myisam.h"			/* Structs & some defines */
-#include "myisampack.h"			/* packing of keys */
+#include "myisam.h"                     /* Structs & some defines */
+#include "myisampack.h"                 /* packing of keys */
 #include <my_tree.h>
 #ifdef THREAD
 #include <my_pthread.h>
@@ -33,10 +33,10 @@
 
 typedef struct st_mi_status_info
 {
-  ha_rows records;			/* Rows in table */
-  ha_rows del;				/* Removed rows */
-  my_off_t empty;			/* lost space in datafile */
-  my_off_t key_empty;			/* lost space in indexfile */
+  ha_rows records;                      /* Rows in table */
+  ha_rows del;                          /* Removed rows */
+  my_off_t empty;                       /* lost space in datafile */
+  my_off_t key_empty;                   /* lost space in indexfile */
   my_off_t key_file_length;
   my_off_t data_file_length;
   ha_checksum checksum;
@@ -45,116 +45,119 @@ typedef struct st_mi_status_info
 
 typedef struct st_mi_state_info
 {
-  struct {				/* Fileheader */
+  struct
+  {                                     /* Fileheader */
     uchar file_version[4];
     uchar options[2];
     uchar header_length[2];
     uchar state_info_length[2];
     uchar base_info_length[2];
     uchar base_pos[2];
-    uchar key_parts[2];			/* Key parts */
-    uchar unique_key_parts[2];		/* Key parts + unique parts */
-    uchar keys;				/* number of keys in file */
-    uchar uniques;			/* number of UNIQUE definitions */
-    uchar language;			/* Language for indexes */
-    uchar max_block_size_index;		/* max keyblock size */
+    uchar key_parts[2];                 /* Key parts */
+    uchar unique_key_parts[2];          /* Key parts + unique parts */
+    uchar keys;                         /* number of keys in file */
+    uchar uniques;                      /* number of UNIQUE definitions */
+    uchar language;                     /* Language for indexes */
+    uchar max_block_size_index;         /* max keyblock size */
     uchar fulltext_keys;
     uchar not_used;                     /* To align to 8 */
   } header;
 
   MI_STATUS_INFO state;
-  ha_rows split;			/* number of split blocks */
-  my_off_t dellink;			/* Link to next removed block */
+  ha_rows split;                        /* number of split blocks */
+  my_off_t dellink;                     /* Link to next removed block */
   ulonglong auto_increment;
-  ulong process;			/* process that updated table last */
-  ulong unique;				/* Unique number for this process */
-  ulong update_count;			/* Updated for each write lock */
+  ulong process;                        /* process that updated table last */
+  ulong unique;                         /* Unique number for this process */
+  ulong update_count;                   /* Updated for each write lock */
   ulong status;
   ulong *rec_per_key_part;
-  my_off_t *key_root;			/* Start of key trees */
-  my_off_t *key_del;			/* delete links for trees */
-  my_off_t rec_per_key_rows;		/* Rows when calculating rec_per_key */
-
-  ulong sec_index_changed;		/* Updated when new sec_index */
-  ulong sec_index_used;			/* which extra index are in use */
-  ulonglong key_map;			/* Which keys are in use */
   ha_checksum checksum;                 /* Table checksum */
-  ulong version;			/* timestamp of create */
-  time_t create_time;			/* Time when created database */
-  time_t recover_time;			/* Time for last recover */
-  time_t check_time;			/* Time for last check */
-  uint	sortkey;			/* sorted by this key  (not used) */
+  my_off_t *key_root;                   /* Start of key trees */
+  my_off_t *key_del;                    /* delete links for trees */
+  my_off_t rec_per_key_rows;            /* Rows when calculating rec_per_key */
+
+  ulong sec_index_changed;              /* Updated when new sec_index */
+  ulong sec_index_used;                 /* which extra index are in use */
+  ulonglong key_map;                    /* Which keys are in use */
+  ulong version;                        /* timestamp of create */
+  time_t create_time;                   /* Time when created database */
+  time_t recover_time;                  /* Time for last recover */
+  time_t check_time;                    /* Time for last check */
+  uint sortkey;                         /* sorted by this key (not used) */
   uint open_count;
-  uint8 changed;			/* Changed since myisamchk */
+  uint8 changed;                        /* Changed since myisamchk */
 
   /* the following isn't saved on disk */
-  uint state_diff_length;		/* Should be 0 */
-  uint	state_length;			/* Length of state header in file */
+  uint state_diff_length;               /* Should be 0 */
+  uint state_length;                    /* Length of state header in file */
   ulong *key_info;
 } MI_STATE_INFO;
 
-#define MI_STATE_INFO_SIZE	(24+14*8+7*4+2*2+8)
-#define MI_STATE_KEY_SIZE	8
+#define MI_STATE_INFO_SIZE      (24+14*8+7*4+2*2+8)
+#define MI_STATE_KEY_SIZE       8
 #define MI_STATE_KEYBLOCK_SIZE  8
-#define MI_STATE_KEYSEG_SIZE	4
-#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*MI_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE)
-#define MI_KEYDEF_SIZE		(2+ 5*2)
-#define MI_UNIQUEDEF_SIZE	(2+1+1)
-#define HA_KEYSEG_SIZE		(6+ 2*2 + 4*2)
-#define MI_COLUMNDEF_SIZE	(2*3+1)
-#define MI_BASE_INFO_SIZE	(5*8 + 8*4 + 4 + 4*2 + 16)
-#define MI_INDEX_BLOCK_MARGIN	16	/* Safety margin for .MYI tables */
+#define MI_STATE_KEYSEG_SIZE    4
+#define MI_STATE_EXTRA_SIZE ((MI_MAX_KEY+MI_MAX_KEY_BLOCK_SIZE)*MI_STATE_KEY_SIZE + MI_MAX_KEY*HA_MAX_KEY_SEG*MI_STATE_KEYSEG_SIZE)
+#define MI_KEYDEF_SIZE          (2+ 5*2)
+#define MI_UNIQUEDEF_SIZE       (2+1+1)
+#define HA_KEYSEG_SIZE          (6+ 2*2 + 4*2)
+#define MI_COLUMNDEF_SIZE       (2*3+1)
+#define MI_BASE_INFO_SIZE       (5*8 + 8*4 + 4 + 4*2 + 16)
+#define MI_INDEX_BLOCK_MARGIN   16      /* Safety margin for .MYI tables */
 
 typedef struct st_mi_base_info
 {
-  my_off_t keystart;			/* Start of keys */
+  my_off_t keystart;                    /* Start of keys */
   my_off_t max_data_file_length;
   my_off_t max_key_file_length;
   my_off_t margin_key_file_length;
-  ha_rows records,reloc;		/* Create information */
-  ulong mean_row_length;		/* Create information */
-  ulong reclength;			/* length of unpacked record */
-  ulong pack_reclength;			/* Length of full packed rec. */
+  ha_rows records, reloc;               /* Create information */
+  ulong mean_row_length;                /* Create information */
+  ulong reclength;                      /* length of unpacked record */
+  ulong pack_reclength;                 /* Length of full packed rec. */
   ulong min_pack_length;
-  ulong max_pack_length;		/* Max possibly length of packed rec.*/
+  ulong max_pack_length;                /* Max possible length of packed rec.*/
   ulong min_block_length;
-  ulong fields,				/* fields in table */
-       pack_fields;			/* packed fields in table */
-  uint rec_reflength;			/* = 2-8 */
-  uint key_reflength;			/* = 2-8 */
-  uint keys;				/* same as in state.header */
-  uint auto_key;			/* Which key-1 is a auto key */
-  uint blobs;				/* Number of blobs */
-  uint pack_bits;			/* Length of packed bits */
-  uint max_key_block_length;		/* Max block length */
-  uint max_key_length;			/* Max key length */
+  ulong fields,                         /* fields in table */
+    pack_fields;                        /* packed fields in table */
+  uint rec_reflength;                   /* = 2-8 */
+  uint key_reflength;                   /* = 2-8 */
+  uint keys;                            /* same as in state.header */
+  uint auto_key;                        /* Which key-1 is a auto key */
+  uint blobs;                           /* Number of blobs */
+  uint pack_bits;                       /* Length of packed bits */
+  uint max_key_block_length;            /* Max block length */
+  uint max_key_length;                  /* Max key length */
   /* Extra allocation when using dynamic record format */
   uint extra_alloc_bytes;
   uint extra_alloc_procent;
   /* The following are from the header */
-  uint key_parts,all_key_parts;
+  uint key_parts, all_key_parts;
 } MI_BASE_INFO;
 
 
-	/* Structs used intern in database */
+        /* Structs used internally in the database */
 
-typedef struct st_mi_blob		/* Info of record */
+typedef struct st_mi_blob               /* Info of record */
 {
-  ulong offset;				/* Offset to blob in record */
-  uint pack_length;			/* Type of packed length */
-  ulong length;				/* Calc:ed for each record */
+  ulong offset;                         /* Offset to blob in record */
+  uint pack_length;                     /* Type of packed length */
+  ulong length;                         /* Calc:ed for each record */
 } MI_BLOB;
 
 
-typedef struct st_mi_isam_pack {
+typedef struct st_mi_isam_pack
+{
   ulong header_length;
   uint ref_length;
   uchar version;
 } MI_PACK;
 
-#define MAX_NONMAPPED_INSERTS 1000      
+#define MAX_NONMAPPED_INSERTS 1000
 
-typedef struct st_mi_isam_share {	/* Shared between opens */
+typedef struct st_mi_isam_share
+{                                       /* Shared between opens */
   MI_STATE_INFO state;
   MI_BASE_INFO base;
   MI_KEYDEF  ft2_keyinfo;		/* Second-level ft-key definition */
@@ -170,26 +173,33 @@ typedef struct st_mi_isam_share {	/* Shared between opens */
         *index_file_name;
   uchar *file_map;			/* mem-map of file if possible */
   KEY_CACHE *key_cache;			/* ref to the current key cache */
+  /* To mark the key cache partitions containing dirty pages for this file */
+  ulonglong dirty_part_map;   
   MI_DECODE_TREE *decode_trees;
   uint16 *decode_tables;
-  int (*read_record)(struct st_myisam_info*, my_off_t, uchar*);
-  int (*write_record)(struct st_myisam_info*, const uchar*);
-  int (*update_record)(struct st_myisam_info*, my_off_t, const uchar*);
-  int (*delete_record)(struct st_myisam_info*);
-  int (*read_rnd)(struct st_myisam_info*, uchar*, my_off_t, my_bool);
-  int (*compare_record)(struct st_myisam_info*, const uchar *);
   /* Function to use for a row checksum. */
-  ha_checksum (*calc_checksum)(struct st_myisam_info*, const uchar *);
-  int (*compare_unique)(struct st_myisam_info*, MI_UNIQUEDEF *,
-			const uchar *record, my_off_t pos);
-  size_t (*file_read)(MI_INFO *, uchar *, size_t, my_off_t, myf);
-  size_t (*file_write)(MI_INFO *, const uchar *, size_t, my_off_t, myf);
-  invalidator_by_filename invalidator;  /* query cache invalidator */
-  ulong this_process;			/* processid */
-  ulong last_process;			/* For table-change-check */
-  ulong last_version;			/* Version on start */
-  ulong options;			/* Options used */
-  ulong min_pack_length;		/* Theese are used by packed data */
+  int(*read_record) (struct st_myisam_info *, my_off_t, uchar*);
+  int(*write_record) (struct st_myisam_info *, const uchar*);
+  int(*update_record) (struct st_myisam_info *, my_off_t, const uchar*);
+  int(*delete_record) (struct st_myisam_info *);
+  int(*read_rnd) (struct st_myisam_info *, uchar*, my_off_t, my_bool);
+  int(*compare_record) (struct st_myisam_info *, const uchar*);
+  ha_checksum(*calc_checksum) (struct st_myisam_info *, const uchar*);
+  /* calculate checksum for a row during check table */
+  ha_checksum(*calc_check_checksum)(struct st_myisam_info *, const uchar *);
+  int(*compare_unique) (struct st_myisam_info *, MI_UNIQUEDEF *,
+                        const uchar *record, my_off_t pos);
+    size_t (*file_read) (MI_INFO *, uchar *, size_t, my_off_t, myf);
+    size_t (*file_write) (MI_INFO *, const uchar *, size_t, my_off_t, myf);
+  /* query cache invalidator for merged tables */
+  invalidator_by_filename invalidator;
+  /* query cache invalidator for changing state */
+  invalidator_by_filename chst_invalidator;
+  ulong this_process;                   /* processid */
+  ulong last_process;                   /* For table-change-check */
+  ulong last_version;                   /* Version on start */
+  ulong options;                        /* Options used */
+  ulong min_pack_length;                /* These are used by packed data */
   ulong max_pack_length;
   ulong state_diff_length;
   uint	rec_reflength;			/* rec_reflength in use now */
@@ -205,12 +215,14 @@ typedef struct st_mi_isam_share {	/* Shared between opens */
   enum data_file_type data_file_type;
   /* Below flag is needed to make log tables work with concurrent insert */
   my_bool is_log_table;
-
-  my_bool  changed,			/* If changed since lock */
-    global_changed,			/* If changed since open */
-    not_flushed,
-    temporary,delay_key_write,
-    concurrent_insert;
+  /* This is 1 if the table checksum is of old type */
+  my_bool has_null_fields;
+  my_bool has_varchar_fields;
+
+  my_bool changed,                      /* If changed since lock */
+    global_changed,                     /* If changed since open */
+    not_flushed, temporary, delay_key_write, concurrent_insert;
+  my_bool deleting;                     /* we are going to delete this table */
 #ifdef THREAD
   THR_LOCK lock;
   mysql_mutex_t intern_lock;            /* Locking for use with _locking */
@@ -222,23 +234,16 @@ typedef struct st_mi_isam_share {	/* Shared between opens */
   mysql_rwlock_t mmap_lock;
 } MYISAM_SHARE;
 
+typedef ICP_RESULT (*index_cond_func_t)(void *param);
 
-typedef uint mi_bit_type;
-
-typedef struct st_mi_bit_buff {		/* Used for packing of record */
-  mi_bit_type current_byte;
-  uint bits;
-  uchar *pos,*end,*blob_pos,*blob_end;
-  uint error;
-} MI_BIT_BUFF;
-
-struct st_myisam_info {
-  MYISAM_SHARE *s;			/* Shared between open:s */
-  MI_STATUS_INFO *state,save_state;
-  MI_BLOB     *blobs;			/* Pointer to blobs */
-  MI_BIT_BUFF  bit_buff;
+struct st_myisam_info
+{
+  MYISAM_SHARE *s;                      /* Shared between open:s */
+  MI_STATUS_INFO *state, save_state;
+  MI_BLOB *blobs;                       /* Pointer to blobs */
+  MI_BIT_BUFF bit_buff;
   /* accumulate indexfile changes between write's */
-  TREE	        *bulk_insert;
+  TREE *bulk_insert;
   DYNAMIC_ARRAY *ft1_to_ft2;            /* used only in ft1->ft2 conversion */
   MEM_ROOT      ft_memroot;             /* used by the parser               */
   MYSQL_FTPARSER_PARAM *ftparser_param; /* share info between init/deinit   */
@@ -254,137 +259,89 @@ struct st_myisam_info {
   uint32 int_keytree_version;		/*  -""-  */
   int (*read_record)(struct st_myisam_info*, my_off_t, uchar*);
   invalidator_by_filename invalidator;  /* query cache invalidator */
-  ulong this_unique;			/* uniq filenumber or thread */
-  ulong last_unique;			/* last unique number */
-  ulong this_loop;			/* counter for this open */
-  ulong last_loop;			/* last used counter */
-  my_off_t lastpos,			/* Last record position */
-	nextpos;			/* Position to next record */
+  ulong this_unique;                    /* uniq filenumber or thread */
+  ulong last_unique;                    /* last unique number */
+  ulong this_loop;                      /* counter for this open */
+  ulong last_loop;                      /* last used counter */
+  my_off_t lastpos,                     /* Last record position */
+    nextpos;                            /* Position to next record */
   my_off_t save_lastpos;
-  my_off_t pos;				/* Intern variable */
-  my_off_t last_keypage;		/* Last key page read */
-  my_off_t last_search_keypage;		/* Last keypage when searching */
+  my_off_t pos;                         /* Intern variable */
+  my_off_t last_keypage;                /* Last key page read */
+  my_off_t last_search_keypage;         /* Last keypage when searching */
   my_off_t dupp_key_pos;
   ha_checksum checksum;                 /* Temp storage for row checksum */
-  /* QQ: the folloing two xxx_length fields should be removed,
-     as they are not compatible with parallel repair */
-  ulong packed_length,blob_length;	/* Length of found, packed record */
-  int  dfile;				/* The datafile */
-  uint opt_flag;			/* Optim. for space/speed */
-  uint update;				/* If file changed since open */
-  int	lastinx;			/* Last used index */
-  uint	lastkey_length;			/* Length of key in lastkey */
-  uint	last_rkey_length;		/* Last length in mi_rkey() */
+  /*
+    QQ: the following two xxx_length fields should be removed,
+     as they are not compatible with parallel repair
+  */
+  ulong packed_length, blob_length;     /* Length of found, packed record */
+  int dfile;                            /* The datafile */
+  uint open_flag;                       /* Parameters for open */
+  uint opt_flag;                        /* Optim. for space/speed */
+  uint once_flags;                      /* For MYISAMMRG */
+  uint update;                          /* If file changed since open */
+  int lastinx;                          /* Last used index */
+  uint lastkey_length;                  /* Length of key in lastkey */
+  uint last_rkey_length;                /* Last length in mi_rkey() */
   enum ha_rkey_function last_key_func;  /* CONTAIN, OVERLAP, etc */
-  uint  save_lastkey_length;
-  uint  pack_key_length;                /* For MYISAMMRG */
+  uint save_lastkey_length;
+  uint pack_key_length;                 /* For MYISAMMRG */
   uint16 last_used_keyseg;              /* For MyISAMMRG */
-  int	errkey;				/* Got last error on this key */
-  int   lock_type;			/* How database was locked */
-  int   tmp_lock_type;			/* When locked by readinfo */
-  uint	data_changed;			/* Somebody has changed data */
-  uint	save_update;			/* When using KEY_READ */
-  int	save_lastinx;
-  LIST	open_list;
-  IO_CACHE rec_cache;			/* When cacheing records */
-  uint  preload_buff_size;              /* When preloading indexes */
-  myf lock_wait;			/* is 0 or MY_DONT_WAIT */
-  my_bool was_locked;			/* Was locked in panic */
-  my_bool append_insert_at_end;		/* Set if concurrent insert */
+  int errkey;                           /* Got last error on this key */
+  int lock_type;                        /* How database was locked */
+  int tmp_lock_type;                    /* When locked by readinfo */
+  uint data_changed;                    /* Somebody has changed data */
+  uint save_update;                     /* When using KEY_READ */
+  int save_lastinx;
+  LIST open_list;
+  IO_CACHE rec_cache;                   /* When caching records */
+  uint preload_buff_size;               /* When preloading indexes */
+  myf lock_wait;                        /* is 0 or MY_SHORT_WAIT */
+  my_bool was_locked;                   /* Was locked in panic */
+  my_bool append_insert_at_end;         /* Set if concurrent insert */
   my_bool quick_mode;
-  my_bool page_changed;		/* If info->buff can't be used for rnext */
-  my_bool buff_used;		/* If info->buff has to be reread for rnext */
-  my_bool once_flags;           /* For MYISAMMRG */
-#ifdef __WIN__
-  my_bool owned_by_merge;                       /* This MyISAM table is part of a merge union */
-#endif
+  /* If info->buff can't be used for rnext */
+  my_bool page_changed;
+  /* If info->buff has to be reread for rnext */
+  my_bool buff_used;
+  index_cond_func_t index_cond_func;   /* Index condition function */
+  void *index_cond_func_arg;           /* parameter for the func */
 #ifdef THREAD
   THR_LOCK_DATA lock;
 #endif
-  uchar  *rtree_recursion_state;	/* For RTREE */
-  int     rtree_recursion_depth;
+  uchar *rtree_recursion_state;         /* For RTREE */
+  int rtree_recursion_depth;
 };
 
-typedef struct st_buffpek {
-  my_off_t file_pos;                    /* Where we are in the sort file */
-  uchar *base,*key;                     /* Key pointers */
-  ha_rows count;                        /* Number of rows in table */
-  ulong mem_count;                      /* numbers of keys in memory */
-  ulong max_keys;                       /* Max keys in buffert */
-} BUFFPEK;
-
-typedef struct st_mi_sort_param
-{
-  pthread_t  thr;
-  IO_CACHE read_cache, tempfile, tempfile_for_exceptions;
-  DYNAMIC_ARRAY buffpek;
-  MI_BIT_BUFF   bit_buff;               /* For parallel repair of packrec. */
-
-  /*
-    The next two are used to collect statistics, see update_key_parts for
-    description.
-  */
-  ulonglong unique[MI_MAX_KEY_SEG+1];
-  ulonglong notnull[MI_MAX_KEY_SEG+1];
-
-  my_off_t pos,max_pos,filepos,start_recpos;
-  uint key, key_length,real_key_length,sortbuff_size;
-  uint maxbuffers, keys, find_length, sort_keys_length;
-  my_bool fix_datafile, master;
-  my_bool calc_checksum;                /* calculate table checksum */
-  MI_KEYDEF *keyinfo;
-  HA_KEYSEG *seg;
-  SORT_INFO *sort_info;
-  uchar **sort_keys;
-  uchar *rec_buff;
-  void *wordlist, *wordptr;
-  MEM_ROOT wordroot;
-  uchar *record;
-  MY_TMPDIR *tmpdir;
-  int (*key_cmp)(struct st_mi_sort_param *, const void *, const void *);
-  int (*key_read)(struct st_mi_sort_param *,void *);
-  int (*key_write)(struct st_mi_sort_param *, const void *);
-  void (*lock_in_memory)(MI_CHECK *);
-  int (*write_keys)(struct st_mi_sort_param *, register uchar **,
-                    uint , struct st_buffpek *, IO_CACHE *);
-  uint (*read_to_buffer)(IO_CACHE *,struct st_buffpek *, uint);
-  int (*write_key)(struct st_mi_sort_param *, IO_CACHE *,uchar *,
-                   uint, uint);
-} MI_SORT_PARAM;
-
-	/* Some defines used by isam-funktions */
-
-#define USE_WHOLE_KEY	MI_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */
-#define F_EXTRA_LCK	-1
-
-	/* bits in opt_flag */
-#define MEMMAP_USED	32
+#define USE_WHOLE_KEY   HA_MAX_KEY_BUFF*2 /* Use whole key in _mi_search() */
+#define F_EXTRA_LCK     -1
+/* bits in opt_flag */
+#define MEMMAP_USED     32
 #define REMEMBER_OLD_POS 64
 
-#define WRITEINFO_UPDATE_KEYFILE	1
-#define WRITEINFO_NO_UNLOCK		2
+#define WRITEINFO_UPDATE_KEYFILE        1
+#define WRITEINFO_NO_UNLOCK             2
 
-        /* once_flags */
+/* once_flags */
 #define USE_PACKED_KEYS         1
 #define RRND_PRESERVE_LASTINX   2
 
-	/* bits in state.changed */
-
-#define STATE_CHANGED		1
-#define STATE_CRASHED		2
+/* bits in state.changed */
+#define STATE_CHANGED           1
+#define STATE_CRASHED           2
 #define STATE_CRASHED_ON_REPAIR 4
-#define STATE_NOT_ANALYZED	8
+#define STATE_NOT_ANALYZED      8
 #define STATE_NOT_OPTIMIZED_KEYS 16
-#define STATE_NOT_SORTED_PAGES	32
+#define STATE_NOT_SORTED_PAGES  32
 
-	/* options to mi_read_cache */
+/* options to mi_read_cache */
+#define READING_NEXT    1
+#define READING_HEADER  2
 
-#define READING_NEXT	1
-#define READING_HEADER	2
-
-#define mi_getint(x)	((uint) mi_uint2korr(x) & 32767)
+#define mi_getint(x)    ((uint) mi_uint2korr(x) & 32767)
 #define mi_putint(x,y,nod) { uint16 boh=(nod ? (uint16) 32768 : 0) + (uint16) (y);\
-			  mi_int2store(x,boh); }
+                          mi_int2store(x,boh); }
 #define mi_test_if_nod(x) (x[0] & 128 ? info->s->base.key_reflength : 0)
 #define mi_report_crashed(A, B) _mi_report_crashed((A), (B), __FILE__, __LINE__)
 #define mi_mark_crashed(x) do{(x)->s->state.changed|= STATE_CRASHED; \
@@ -427,38 +384,38 @@ typedef struct st_mi_sort_param
 
 #define get_pack_length(length) ((length) >= 255 ? 3 : 1)
 
-#define MI_MIN_BLOCK_LENGTH	20	/* Because of delete-link */
-#define MI_EXTEND_BLOCK_LENGTH	20	/* Don't use to small record-blocks */
-#define MI_SPLIT_LENGTH	((MI_EXTEND_BLOCK_LENGTH+4)*2)
-#define MI_MAX_DYN_BLOCK_HEADER	20	/* Max prefix of record-block */
+#define MI_MIN_BLOCK_LENGTH     20      /* Because of delete-link */
+#define MI_EXTEND_BLOCK_LENGTH  20      /* Don't use too small record-blocks */
+#define MI_SPLIT_LENGTH ((MI_EXTEND_BLOCK_LENGTH+4)*2)
+#define MI_MAX_DYN_BLOCK_HEADER 20      /* Max prefix of record-block */
 #define MI_BLOCK_INFO_HEADER_LENGTH 20
-#define MI_DYN_DELETE_BLOCK_HEADER 20	/* length of delete-block-header */
-#define MI_DYN_MAX_BLOCK_LENGTH	((1L << 24)-4L)
-#define MI_DYN_MAX_ROW_LENGTH	(MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH)
-#define MI_DYN_ALIGN_SIZE	4	/* Align blocks on this */
-#define MI_MAX_DYN_HEADER_BYTE	13	/* max header byte for dynamic rows */
-#define MI_MAX_BLOCK_LENGTH	((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1)))
+#define MI_DYN_DELETE_BLOCK_HEADER 20   /* length of delete-block-header */
+#define MI_DYN_MAX_BLOCK_LENGTH ((1L << 24)-4L)
+#define MI_DYN_MAX_ROW_LENGTH   (MI_DYN_MAX_BLOCK_LENGTH - MI_SPLIT_LENGTH)
+#define MI_DYN_ALIGN_SIZE       4       /* Align blocks on this */
+#define MI_MAX_DYN_HEADER_BYTE  13      /* max header byte for dynamic rows */
+#define MI_MAX_BLOCK_LENGTH     ((((ulong) 1 << 24)-1) & (~ (ulong) (MI_DYN_ALIGN_SIZE-1)))
 #define MI_REC_BUFF_OFFSET      ALIGN_SIZE(MI_DYN_DELETE_BLOCK_HEADER+sizeof(uint32))
 
 
-#define PACK_TYPE_SELECTED	1	/* Bits in field->pack_type */
-#define PACK_TYPE_SPACE_FIELDS	2
-#define PACK_TYPE_ZERO_FILL	4
-#define MI_FOUND_WRONG_KEY 32738	/* Impossible value from ha_key_cmp */
+#define PACK_TYPE_SELECTED      1       /* Bits in field->pack_type */
+#define PACK_TYPE_SPACE_FIELDS  2
+#define PACK_TYPE_ZERO_FILL     4
+#define MI_FOUND_WRONG_KEY 32738        /* Impossible value from ha_key_cmp */
 
-#define MI_MAX_KEY_BLOCK_SIZE	(MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
+#define MI_MAX_KEY_BLOCK_SIZE   (MI_MAX_KEY_BLOCK_LENGTH/MI_MIN_KEY_BLOCK_LENGTH)
 #define MI_BLOCK_SIZE(key_length,data_pointer,key_pointer,block_size) (((((key_length)+(data_pointer)+(key_pointer))*4+(key_pointer)+2)/(block_size)+1)*(block_size))
-#define MI_MAX_KEYPTR_SIZE	5        /* For calculating block lengths */
-#define MI_MIN_KEYBLOCK_LENGTH	50         /* When to split delete blocks */
+#define MI_MAX_KEYPTR_SIZE      5       /* For calculating block lengths */
+#define MI_MIN_KEYBLOCK_LENGTH  50      /* When to split delete blocks */
 
-#define MI_MIN_SIZE_BULK_INSERT_TREE 16384             /* this is per key */
+#define MI_MIN_SIZE_BULK_INSERT_TREE 16384 /* this is per key */
 #define MI_MIN_ROWS_TO_USE_BULK_INSERT 100
 #define MI_MIN_ROWS_TO_DISABLE_INDEXES 100
 #define MI_MIN_ROWS_TO_USE_WRITE_CACHE 10
 
 /* The UNIQUE check is done with a hashed long key */
 
-#define MI_UNIQUE_HASH_TYPE	HA_KEYTYPE_ULONG_INT
+#define MI_UNIQUE_HASH_TYPE     HA_KEYTYPE_ULONG_INT
 #define mi_unique_store(A,B)    mi_int4store((A),(B))
 
 #ifdef THREAD
@@ -470,7 +427,7 @@ extern mysql_mutex_t THR_LOCK_myisam;
 #define mysql_rwlock_unlock(A) {}
 #endif
 
-	/* Some extern variables */
+/* Some extern variables */
 
 extern LIST *myisam_open_list;
 extern uchar myisam_file_magic[], myisam_pack_file_magic[];
@@ -479,162 +436,167 @@ extern uint myisam_quick_table_bits;
 extern File myisam_log_file;
 extern ulong myisam_pid;
 
-	/* This is used by _mi_calc_xxx_key_length och _mi_store_key */
+/* This is used by _mi_calc_xxx_key_length and _mi_store_key */
 
 typedef struct st_mi_s_param
 {
-  uint	ref_length,key_length,
-	n_ref_length,
-	n_length,
-	totlength,
-	part_of_prev_key,prev_length,pack_marker;
-  uchar *key, *prev_key,*next_key_pos;
+  uint ref_length, key_length,
+    n_ref_length,
+    n_length, totlength, part_of_prev_key, prev_length, pack_marker;
+  uchar *key, *prev_key, *next_key_pos;
   my_bool store_not_null;
 } MI_KEY_PARAM;
 
-	/* Prototypes for intern functions */
+/* Prototypes for intern functions */
 
-extern int _mi_read_dynamic_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_write_dynamic_record(MI_INFO*, const uchar*);
-extern int _mi_update_dynamic_record(MI_INFO*, my_off_t, const uchar*);
+extern int _mi_read_dynamic_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_write_dynamic_record(MI_INFO *, const uchar *);
+extern int _mi_update_dynamic_record(MI_INFO *, my_off_t, const uchar *);
 extern int _mi_delete_dynamic_record(MI_INFO *info);
-extern int _mi_cmp_dynamic_record(MI_INFO *info,const uchar *record);
-extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *,my_off_t, my_bool);
-extern int _mi_write_blob_record(MI_INFO*, const uchar*);
-extern int _mi_update_blob_record(MI_INFO*, my_off_t, const uchar*);
-extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos,uchar *buf);
-extern int _mi_write_static_record(MI_INFO*, const uchar*);
-extern int _mi_update_static_record(MI_INFO*, my_off_t, const uchar*);
+extern int _mi_cmp_dynamic_record(MI_INFO *info, const uchar *record);
+extern int _mi_read_rnd_dynamic_record(MI_INFO *, uchar *, my_off_t, my_bool);
+extern int _mi_write_blob_record(MI_INFO *, const uchar *);
+extern int _mi_update_blob_record(MI_INFO *, my_off_t, const uchar *);
+extern int _mi_read_static_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_write_static_record(MI_INFO *, const uchar *);
+extern int _mi_update_static_record(MI_INFO *, my_off_t, const uchar *);
 extern int _mi_delete_static_record(MI_INFO *info);
-extern int _mi_cmp_static_record(MI_INFO *info,const uchar *record);
-extern int _mi_read_rnd_static_record(MI_INFO*, uchar *,my_off_t, my_bool);
-extern int _mi_ck_write(MI_INFO *info,uint keynr,uchar *key,uint length);
+extern int _mi_cmp_static_record(MI_INFO *info, const uchar *record);
+extern int _mi_read_rnd_static_record(MI_INFO *, uchar *, my_off_t, my_bool);
+extern int _mi_ck_write(MI_INFO *info, uint keynr, uchar *key, uint length);
 extern int _mi_ck_real_write_btree(MI_INFO *info, MI_KEYDEF *keyinfo,
                                    uchar *key, uint key_length,
                                    my_off_t *root, uint comp_flag);
-extern int _mi_enlarge_root(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key, my_off_t *root);
-extern int _mi_insert(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
-		      uchar *anc_buff,uchar *key_pos,uchar *key_buff,
-		      uchar *father_buff, uchar *father_keypos,
-		      my_off_t father_page, my_bool insert_last);
-extern int _mi_split_page(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
-			  uchar *buff,uchar *key_buff, my_bool insert_last);
-extern uchar *_mi_find_half_pos(uint nod_flag,MI_KEYDEF *keyinfo,uchar *page,
-				uchar *key,uint *return_key_length,
-				uchar **after_key);
-extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
-				      uchar *key_pos, uchar *org_key,
-				      uchar *key_buff,
-				      uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
-				   uchar *key_pos, uchar *org_key,
-				   uchar *key_buff,
-				   uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
-					uchar *key_pos, uchar *org_key,
-					uchar *prev_key,
-					uchar *key, MI_KEY_PARAM *s_temp);
-extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo,uint nod_flag,
-					uchar *key_pos,uchar *org_key,
-					uchar *prev_key,
-					uchar *key, MI_KEY_PARAM *s_temp);
-void _mi_store_static_key(MI_KEYDEF *keyinfo,  uchar *key_pos,
-			   MI_KEY_PARAM *s_temp);
-void _mi_store_var_pack_key(MI_KEYDEF *keyinfo,  uchar *key_pos,
-			     MI_KEY_PARAM *s_temp);
-void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo,  uchar *key_pos,
-			    MI_KEY_PARAM *s_temp);
-
-extern int _mi_ck_delete(MI_INFO *info,uint keynr,uchar *key,uint key_length);
-extern int _mi_readinfo(MI_INFO *info,int lock_flag,int check_keybuffer);
-extern int _mi_writeinfo(MI_INFO *info,uint options);
+extern int _mi_enlarge_root(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+                            my_off_t *root);
+extern int _mi_insert(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+                      uchar *anc_buff, uchar *key_pos, uchar *key_buff,
+                      uchar *father_buff, uchar *father_keypos,
+                      my_off_t father_page, my_bool insert_last);
+extern int _mi_split_page(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+                          uchar *buff, uchar *key_buff, my_bool insert_last);
+extern uchar *_mi_find_half_pos(uint nod_flag, MI_KEYDEF *keyinfo,
+                                uchar *page, uchar *key,
+                                uint *return_key_length, uchar ** after_key);
+extern int _mi_calc_static_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+                                      uchar *key_pos, uchar *org_key,
+                                      uchar *key_buff, uchar *key,
+                                      MI_KEY_PARAM *s_temp);
+extern int _mi_calc_var_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+                                   uchar *key_pos, uchar *org_key,
+                                   uchar *key_buff, uchar *key,
+                                   MI_KEY_PARAM *s_temp);
+extern int _mi_calc_var_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+                                        uchar *key_pos, uchar *org_key,
+                                        uchar *prev_key, uchar *key,
+                                        MI_KEY_PARAM *s_temp);
+extern int _mi_calc_bin_pack_key_length(MI_KEYDEF *keyinfo, uint nod_flag,
+                                        uchar *key_pos, uchar *org_key,
+                                        uchar *prev_key, uchar *key,
+                                        MI_KEY_PARAM *s_temp);
+void _mi_store_static_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+                          MI_KEY_PARAM *s_temp);
+void _mi_store_var_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+                            MI_KEY_PARAM *s_temp);
+void _mi_store_bin_pack_key(MI_KEYDEF *keyinfo, uchar *key_pos,
+                            MI_KEY_PARAM *s_temp);
+
+extern int _mi_ck_delete(MI_INFO *info, uint keynr, uchar *key,
+                         uint key_length);
+extern int _mi_readinfo(MI_INFO *info, int lock_flag, int check_keybuffer);
+extern int _mi_writeinfo(MI_INFO *info, uint options);
 extern int _mi_test_if_changed(MI_INFO *info);
 extern int _mi_mark_file_changed(MI_INFO *info);
 extern int _mi_decrement_open_count(MI_INFO *info);
-extern int _mi_check_index(MI_INFO *info,int inx);
-extern int _mi_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,uint key_len,
-		      uint nextflag,my_off_t pos);
-extern int _mi_bin_search(struct st_myisam_info *info,MI_KEYDEF *keyinfo,
-			  uchar *page,uchar *key,uint key_len,uint comp_flag,
-			  uchar * *ret_pos,uchar *buff, my_bool *was_last_key);
-extern int _mi_seq_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page,
-			  uchar *key,uint key_len,uint comp_flag,
-			  uchar **ret_pos,uchar *buff, my_bool *was_last_key);
-extern int _mi_prefix_search(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *page,
-			  uchar *key,uint key_len,uint comp_flag,
-			  uchar **ret_pos,uchar *buff, my_bool *was_last_key);
-extern my_off_t _mi_kpos(uint nod_flag,uchar *after_key);
-extern void _mi_kpointer(MI_INFO *info,uchar *buff,my_off_t pos);
-extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag,uchar *after_key);
+extern int _mi_check_index(MI_INFO *info, int inx);
+extern int _mi_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+                      uint key_len, uint nextflag, my_off_t pos);
+extern int _mi_bin_search(struct st_myisam_info *info, MI_KEYDEF *keyinfo,
+                          uchar *page, uchar *key, uint key_len,
+                          uint comp_flag, uchar **ret_pos, uchar *buff,
+                          my_bool *was_last_key);
+extern int _mi_seq_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
+                          uchar *key, uint key_len, uint comp_flag,
+                          uchar ** ret_pos, uchar *buff,
+                          my_bool *was_last_key);
+extern int _mi_prefix_search(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
+                             uchar *key, uint key_len, uint comp_flag,
+                             uchar ** ret_pos, uchar *buff,
+                             my_bool *was_last_key);
+extern my_off_t _mi_kpos(uint nod_flag, uchar *after_key);
+extern void _mi_kpointer(MI_INFO *info, uchar *buff, my_off_t pos);
+extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag, uchar *after_key);
 extern my_off_t _mi_rec_pos(MYISAM_SHARE *info, uchar *ptr);
-extern void _mi_dpointer(MI_INFO *info, uchar *buff,my_off_t pos);
-extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a,uchar *b,
-		       uint key_length,uint nextflag,uint *diff_length);
-extern uint _mi_get_static_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page,
-			       uchar *key);
-extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo,uint nod_flag,uchar * *page,
-			     uchar *key);
+extern void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos);
+extern uint _mi_get_static_key(MI_KEYDEF *keyinfo, uint nod_flag,
+                               uchar **page, uchar *key);
+extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo, uint nod_flag, uchar **page,
+                             uchar *key);
 extern uint _mi_get_binary_pack_key(MI_KEYDEF *keyinfo, uint nod_flag,
-				    uchar **page_pos, uchar *key);
-extern uchar *_mi_get_last_key(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *keypos,
-			       uchar *lastkey,uchar *endpos,
-			       uint *return_key_length);
+                                    uchar ** page_pos, uchar *key);
+extern uchar *_mi_get_last_key(MI_INFO *info, MI_KEYDEF *keyinfo,
+                               uchar *keypos, uchar *lastkey, uchar *endpos,
+                               uint *return_key_length);
 extern uchar *_mi_get_key(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *page,
-			  uchar *key, uchar *keypos, uint *return_key_length);
-extern uint _mi_keylength(MI_KEYDEF *keyinfo,uchar *key);
+                          uchar *key, uchar *keypos,
+                          uint *return_key_length);
+extern uint _mi_keylength(MI_KEYDEF *keyinfo, uchar *key);
 extern uint _mi_keylength_part(MI_KEYDEF *keyinfo, register uchar *key,
-			       HA_KEYSEG *end);
-extern uchar *_mi_move_key(MI_KEYDEF *keyinfo,uchar *to,uchar *from);
-extern int _mi_search_next(MI_INFO *info,MI_KEYDEF *keyinfo,uchar *key,
-			   uint key_length,uint nextflag,my_off_t pos);
-extern int _mi_search_first(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos);
-extern int _mi_search_last(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos);
-extern uchar *_mi_fetch_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t page,
-				int level,uchar *buff,int return_buffer);
-extern int _mi_write_keypage(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t page,
-			     int level, uchar *buff);
-extern int _mi_dispose(MI_INFO *info,MI_KEYDEF *keyinfo,my_off_t pos,
-                      int level);
-extern my_off_t _mi_new(MI_INFO *info,MI_KEYDEF *keyinfo,int level);
-extern uint _mi_make_key(MI_INFO *info,uint keynr,uchar *key,
-			 const uchar *record,my_off_t filepos);
-extern uint _mi_pack_key(register MI_INFO *info, uint keynr, uchar *key,
+                               HA_KEYSEG *end);
+extern uchar *_mi_move_key(MI_KEYDEF *keyinfo, uchar *to, uchar *from);
+extern int _mi_search_next(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
+                           uint key_length, uint nextflag, my_off_t pos);
+extern int _mi_search_first(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos);
+extern int _mi_search_last(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos);
+extern uchar *_mi_fetch_keypage(MI_INFO *info, MI_KEYDEF *keyinfo,
+                                my_off_t page, int level, uchar *buff,
+                                int return_buffer);
+extern int _mi_write_keypage(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t page,
+                             int level, uchar *buff);
+extern int _mi_dispose(MI_INFO *info, MI_KEYDEF *keyinfo, my_off_t pos,
+                       int level);
+extern my_off_t _mi_new(MI_INFO *info, MI_KEYDEF *keyinfo, int level);
+extern uint _mi_make_key(MI_INFO *info, uint keynr, uchar *key,
+                         const uchar *record, my_off_t filepos);
+extern uint _mi_pack_key(MI_INFO *info, uint keynr, uchar *key,
                          uchar *old, key_part_map keypart_map,
-                         HA_KEYSEG **last_used_keyseg);
-extern int _mi_read_key_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_read_cache(IO_CACHE *info,uchar *buff,my_off_t pos,
-			  uint length,int re_read_if_possibly);
-extern ulonglong retrieve_auto_increment(MI_INFO *info,const uchar *record);
+                         HA_KEYSEG ** last_used_keyseg);
+extern int _mi_read_key_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_read_cache(IO_CACHE *info, uchar *buff, my_off_t pos,
+                          uint length, int re_read_if_possibly);
+extern ulonglong retrieve_auto_increment(MI_INFO *info, const uchar *record);
 
-extern uchar *mi_alloc_rec_buff(MI_INFO *,ulong, uchar**);
+extern uchar *mi_alloc_rec_buff(MI_INFO *, ulong, uchar **);
 #define mi_get_rec_buff_ptr(info,buf)                              \
         ((((info)->s->options & HA_OPTION_PACK_RECORD) && (buf)) ? \
         (buf) - MI_REC_BUFF_OFFSET : (buf))
 #define mi_get_rec_buff_len(info,buf)                              \
         (*((uint32 *)(mi_get_rec_buff_ptr(info,buf))))
 
-extern ulong _mi_rec_unpack(MI_INFO *info,uchar *to,uchar *from,
-			    ulong reclength);
+extern ulong _mi_rec_unpack(MI_INFO *info, uchar *to, uchar *from,
+                            ulong reclength);
 extern my_bool _mi_rec_check(MI_INFO *info,const uchar *record, uchar *packpos,
                              ulong packed_length, my_bool with_checkum);
-extern int _mi_write_part_record(MI_INFO *info,my_off_t filepos,ulong length,
-				 my_off_t next_filepos,uchar **record,
-				 ulong *reclength,int *flag);
-extern void _mi_print_key(FILE *stream,HA_KEYSEG *keyseg,const uchar *key,
-			  uint length);
-extern my_bool _mi_read_pack_info(MI_INFO *info,pbool fix_keys);
-extern int _mi_read_pack_record(MI_INFO *info,my_off_t filepos,uchar *buf);
-extern int _mi_read_rnd_pack_record(MI_INFO*, uchar *,my_off_t, my_bool);
+extern int _mi_write_part_record(MI_INFO *info, my_off_t filepos, ulong length,
+                                 my_off_t next_filepos, uchar ** record,
+                                 ulong *reclength, int *flag);
+extern void _mi_print_key(FILE *stream, HA_KEYSEG *keyseg, const uchar *key,
+                          uint length);
+extern my_bool _mi_read_pack_info(MI_INFO *info, pbool fix_keys);
+extern int _mi_read_pack_record(MI_INFO *info, my_off_t filepos, uchar *buf);
+extern int _mi_read_rnd_pack_record(MI_INFO *, uchar *, my_off_t, my_bool);
 extern int _mi_pack_rec_unpack(MI_INFO *info, MI_BIT_BUFF *bit_buff,
                                uchar *to, uchar *from, ulong reclength);
-extern ulonglong mi_safe_mul(ulonglong a,ulonglong b);
+extern ulonglong mi_safe_mul(ulonglong a, ulonglong b);
 extern int _mi_ft_update(MI_INFO *info, uint keynr, uchar *keybuf,
-			 const uchar *oldrec, const uchar *newrec, my_off_t pos);
+                         const uchar *oldrec, const uchar *newrec,
+                         my_off_t pos);
 
 struct st_sort_info;
 
 
-typedef struct st_mi_block_info {	/* Parameter to _mi_get_block_info */
+typedef struct st_mi_block_info         /* Parameter to _mi_get_block_info */
+{
   uchar header[MI_BLOCK_INFO_HEADER_LENGTH];
   ulong rec_len;
   ulong data_len;
@@ -647,35 +609,37 @@ typedef struct st_mi_block_info {	/* Parameter to _mi_get_block_info */
   uint offset;
 } MI_BLOCK_INFO;
 
-	/* bits in return from _mi_get_block_info */
-
-#define BLOCK_FIRST	1
-#define BLOCK_LAST	2
-#define BLOCK_DELETED	4
-#define BLOCK_ERROR	8	/* Wrong data */
-#define BLOCK_SYNC_ERROR 16	/* Right data at wrong place */
-#define BLOCK_FATAL_ERROR 32	/* hardware-error */
-
-#define NEED_MEM	((uint) 10*4*(IO_SIZE+32)+32) /* Nead for recursion */
-#define MAXERR			20
-#define BUFFERS_WHEN_SORTING	16		/* Alloc for sort-key-tree */
-#define WRITE_COUNT		MY_HOW_OFTEN_TO_WRITE
-#define INDEX_TMP_EXT		".TMM"
-#define DATA_TMP_EXT		".TMD"
-
-#define UPDATE_TIME		1
-#define UPDATE_STAT		2
-#define UPDATE_SORT		4
-#define UPDATE_AUTO_INC		8
-#define UPDATE_OPEN_COUNT	16
-
-#define USE_BUFFER_INIT		(((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE)
-#define READ_BUFFER_INIT	(1024L*256L-MALLOC_OVERHEAD)
-#define SORT_BUFFER_INIT	(2048L*1024L-MALLOC_OVERHEAD)
-#define MIN_SORT_BUFFER		(4096-MALLOC_OVERHEAD)
-
-enum myisam_log_commands {
-  MI_LOG_OPEN,MI_LOG_WRITE,MI_LOG_UPDATE,MI_LOG_DELETE,MI_LOG_CLOSE,MI_LOG_EXTRA,MI_LOG_LOCK,MI_LOG_DELETE_ALL
+        /* bits in return from _mi_get_block_info */
+
+#define BLOCK_FIRST     1
+#define BLOCK_LAST      2
+#define BLOCK_DELETED   4
+#define BLOCK_ERROR     8               /* Wrong data */
+#define BLOCK_SYNC_ERROR 16             /* Right data at wrong place */
+#define BLOCK_FATAL_ERROR 32            /* hardware-error */
+
+#define NEED_MEM        ((uint) 10*4*(IO_SIZE+32)+32) /* Needed for recursion */
+#define MAXERR                  20
+#define BUFFERS_WHEN_SORTING    16      /* Alloc for sort-key-tree */
+#define WRITE_COUNT             MY_HOW_OFTEN_TO_WRITE
+#define INDEX_TMP_EXT           ".TMM"
+#define DATA_TMP_EXT            ".TMD"
+
+#define UPDATE_TIME             1
+#define UPDATE_STAT             2
+#define UPDATE_SORT             4
+#define UPDATE_AUTO_INC         8
+#define UPDATE_OPEN_COUNT       16
+
+#define USE_BUFFER_INIT         (((1024L*512L-MALLOC_OVERHEAD)/IO_SIZE)*IO_SIZE)
+#define READ_BUFFER_INIT        (1024L*256L-MALLOC_OVERHEAD)
+#define SORT_BUFFER_INIT        (2048L*1024L-MALLOC_OVERHEAD)
+#define MIN_SORT_BUFFER         (4096-MALLOC_OVERHEAD)
+
+enum myisam_log_commands
+{
+  MI_LOG_OPEN, MI_LOG_WRITE, MI_LOG_UPDATE, MI_LOG_DELETE, MI_LOG_CLOSE,
+    MI_LOG_EXTRA, MI_LOG_LOCK, MI_LOG_DELETE_ALL
 };
 
 #define myisam_log(a,b,c,d) if (myisam_log_file >= 0) _myisam_log(a,b,c,d)
@@ -685,29 +649,25 @@ enum myisam_log_commands {
 #define fast_mi_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _mi_writeinfo((INFO),0)
 #define fast_mi_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _mi_readinfo((INFO),F_RDLCK,1)
 
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-extern uint _mi_get_block_info(MI_BLOCK_INFO *,File, my_off_t);
-extern uint _mi_rec_pack(MI_INFO *info,uchar *to,const uchar *from);
+C_MODE_START
+extern uint _mi_get_block_info(MI_BLOCK_INFO *, File, my_off_t);
+extern uint _mi_rec_pack(MI_INFO *info, uchar *to, const uchar *from);
 extern uint _mi_pack_get_block_info(MI_INFO *myisam, MI_BIT_BUFF *bit_buff,
                                     MI_BLOCK_INFO *info, uchar **rec_buff_p,
                                     File file, my_off_t filepos);
-extern void _my_store_blob_length(uchar *pos,uint pack_length,uint length);
-extern void _myisam_log(enum myisam_log_commands command,MI_INFO *info,
-		       const uchar *buffert,uint length);
+extern void _mi_store_blob_length(uchar *pos, uint pack_length, uint length);
+extern void _myisam_log(enum myisam_log_commands command, MI_INFO *info,
+                        const uchar *buffert, uint length);
 extern void _myisam_log_command(enum myisam_log_commands command,
-			       MI_INFO *info, const uchar *buffert,
-			       uint length, int result);
-extern void _myisam_log_record(enum myisam_log_commands command,MI_INFO *info,
-			      const uchar *record,my_off_t filepos,
-			      int result);
+                                MI_INFO *info, const uchar *buffert,
+                                uint length, int result);
+extern void _myisam_log_record(enum myisam_log_commands command, MI_INFO *info,
+                               const uchar *record, my_off_t filepos,
+                               int result);
 extern void mi_report_error(int errcode, const char *file_name);
 extern my_bool _mi_memmap_file(MI_INFO *info);
 extern void _mi_unmap_file(MI_INFO *info);
 extern uint save_pack_length(uint version, uchar *block_buff, ulong length);
-extern uint read_pack_length(uint version, const uchar *buf, ulong *length);
 extern uint calc_pack_length(uint version, ulong length);
 extern size_t mi_mmap_pread(MI_INFO *info, uchar *Buffer,
                             size_t Count, my_off_t offset, myf MyFlags);
@@ -722,7 +682,7 @@ uint mi_state_info_write(File file, MI_STATE_INFO *state, uint pWrite);
 uchar *mi_state_info_read(uchar *ptr, MI_STATE_INFO *state);
 uint mi_state_info_read_dsk(File file, MI_STATE_INFO *state, my_bool pRead);
 uint mi_base_info_write(File file, MI_BASE_INFO *base);
-uchar *my_n_base_info_read(uchar *ptr, MI_BASE_INFO *base);
+uchar *mi_n_base_info_read(uchar *ptr, MI_BASE_INFO *base);
 int mi_keyseg_write(File file, const HA_KEYSEG *keyseg);
 uchar *mi_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg);
 uint mi_keydef_write(File file, MI_KEYDEF *keydef);
@@ -734,23 +694,24 @@ uchar *mi_recinfo_read(uchar *ptr, MI_COLUMNDEF *recinfo);
 extern int mi_disable_indexes(MI_INFO *info);
 extern int mi_enable_indexes(MI_INFO *info);
 extern int mi_indexes_are_disabled(MI_INFO *info);
-ulong _my_calc_total_blob_length(MI_INFO *info, const uchar *record);
+ulong _mi_calc_total_blob_length(MI_INFO *info, const uchar *record);
 ha_checksum mi_checksum(MI_INFO *info, const uchar *buf);
 ha_checksum mi_static_checksum(MI_INFO *info, const uchar *buf);
 my_bool mi_check_unique(MI_INFO *info, MI_UNIQUEDEF *def, uchar *record,
-		     ha_checksum unique_hash, my_off_t pos);
+                        ha_checksum unique_hash, my_off_t pos);
 ha_checksum mi_unique_hash(MI_UNIQUEDEF *def, const uchar *buf);
 int _mi_cmp_static_unique(MI_INFO *info, MI_UNIQUEDEF *def,
-			   const uchar *record, my_off_t pos);
+                          const uchar *record, my_off_t pos);
 int _mi_cmp_dynamic_unique(MI_INFO *info, MI_UNIQUEDEF *def,
-			   const uchar *record, my_off_t pos);
+                           const uchar *record, my_off_t pos);
 int mi_unique_comp(MI_UNIQUEDEF *def, const uchar *a, const uchar *b,
-		   my_bool null_are_equal);
-void mi_get_status(void* param, int concurrent_insert);
-void mi_update_status(void* param);
-void mi_restore_status(void* param);
-void mi_copy_status(void* to,void *from);
-my_bool mi_check_status(void* param);
+                   my_bool null_are_equal);
+void mi_get_status(void *param, my_bool concurrent_insert);
+void mi_update_status(void *param);
+void mi_restore_status(void *param);
+void mi_copy_status(void *to, void *from);
+my_bool mi_check_status(void *param);
+void mi_fix_status(MI_INFO *org_table, MI_INFO *new_table);
 void mi_disable_non_unique_index(MI_INFO *info, ha_rows rows);
 
 extern MI_INFO *test_if_reopen(char *filename);
@@ -766,28 +727,21 @@ void mi_remap_file(MI_INFO *info, my_off_t size);
 void _mi_report_crashed(MI_INFO *file, const char *message,
                         const char *sfile, uint sline);
 
+int mi_check_index_cond(register MI_INFO *info, uint keynr, uchar *record);
     /* Functions needed by mi_check */
-volatile int *killed_ptr(MI_CHECK *param);
-void mi_check_print_error(MI_CHECK *param, const char *fmt,...);
-void mi_check_print_warning(MI_CHECK *param, const char *fmt,...);
-void mi_check_print_info(MI_CHECK *param, const char *fmt,...);
-int flush_pending_blocks(MI_SORT_PARAM *param);
-int sort_ft_buf_flush(MI_SORT_PARAM *sort_param);
-int thr_write_keys(MI_SORT_PARAM *sort_param);
+int killed_ptr(HA_CHECK *param);
+void mi_check_print_error(HA_CHECK *param, const char *fmt, ...);
+void mi_check_print_warning(HA_CHECK *param, const char *fmt, ...);
+void mi_check_print_info(HA_CHECK *param, const char *fmt, ...);
 #ifdef THREAD
 pthread_handler_t thr_find_all_keys(void *arg);
 #endif
-int flush_blocks(MI_CHECK *param, KEY_CACHE *key_cache, File file);
-
-int sort_write_record(MI_SORT_PARAM *sort_param);
-int _create_index_by_sort(MI_SORT_PARAM *info,my_bool no_messages, ulong);
-
-#ifdef __cplusplus
-}
-#endif
+extern void mi_set_index_cond_func(MI_INFO *info, index_cond_func_t func,
+                                   void *func_arg);
+int flush_blocks(HA_CHECK *param, KEY_CACHE *key_cache, File file,
+                 ulonglong *dirty_part_map);
 
 #ifdef HAVE_PSI_INTERFACE
-C_MODE_START
 extern PSI_mutex_key mi_key_mutex_MYISAM_SHARE_intern_lock,
   mi_key_mutex_MI_SORT_INFO_mutex, mi_key_mutex_MI_CHECK_print_msg;
 
@@ -802,6 +756,6 @@ extern PSI_file_key mi_key_file_datatmp, mi_key_file_dfile, mi_key_file_kfile,
 extern PSI_thread_key mi_key_thread_find_all_keys;
 
 void init_myisam_psi_keys();
-C_MODE_END
 #endif /* HAVE_PSI_INTERFACE */
 
+C_MODE_END
diff --git a/storage/myisam/myisamlog.c b/storage/myisam/myisamlog.c
index 84743b8da51..a17823a45b5 100644
--- a/storage/myisam/myisamlog.c
+++ b/storage/myisam/myisamlog.c
@@ -331,7 +331,7 @@ static int examine_log(char * file_name, char **table_names)
   init_tree(&tree,0,0,sizeof(file_info),(qsort_cmp2) file_info_compare,1,
 	    (tree_element_free) file_info_free, NULL);
   (void) init_key_cache(dflt_key_cache,KEY_CACHE_BLOCK_SIZE,KEY_CACHE_SIZE,
-                      0, 0);
+                      0, 0, 0);
 
   files_open=0; access_time=0;
   while (access_time++ != number_of_commands &&
@@ -806,7 +806,7 @@ static int find_record_with_key(struct file_info *file_info, uchar *record)
 {
   uint key;
   MI_INFO *info=file_info->isam;
-  uchar tmp_key[MI_MAX_KEY_BUFF];
+  uchar tmp_key[HA_MAX_KEY_BUFF];
 
   for (key=0 ; key < info->s->base.keys ; key++)
   {
diff --git a/storage/myisam/myisampack.c b/storage/myisam/myisampack.c
index 84a7f2a1ba9..017f7d160b6 100644
--- a/storage/myisam/myisampack.c
+++ b/storage/myisam/myisampack.c
@@ -260,8 +260,8 @@ static struct my_option my_long_options[] =
   {"backup", 'b', "Make a backup of the table as table_name.OLD.",
    &backup, &backup, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
   {"character-sets-dir", OPT_CHARSETS_DIR_MP,
-   "Directory where character sets are.", &charsets_dir,
-   &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+   "Directory where character sets are.", (char**) &charsets_dir,
+   (char**) &charsets_dir, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
   {"debug", '#', "Output debug log. Often this is 'd:t:o,filename'.",
    0, 0, 0, GET_STR, OPT_ARG, 0, 0, 0, 0, 0, 0},
   {"force", 'f',
@@ -304,7 +304,7 @@ static void usage(void)
   puts("and you are welcome to modify and redistribute it under the GPL license\n");
 
   puts("Pack a MyISAM-table to take much less space.");
-  puts("Keys are not updated, you must run myisamchk -rq on the datafile");
+  puts("Keys are not updated, you must run myisamchk -rq on the index (.MYI) file");
   puts("afterwards to update the keys.");
   puts("You should give the .MYI file as the filename argument.");
 
@@ -568,7 +568,7 @@ static int compress(PACK_MRG_INFO *mrg,char *result_table)
     Create a global priority queue in preparation for making 
     temporary Huffman trees.
   */
-  if (init_queue(&queue,256,0,0,compare_huff_elements,0))
+  if (init_queue(&queue, 256, 0, 0, compare_huff_elements, 0, 0, 0))
     goto err;
 
   /*
@@ -1541,7 +1541,7 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
   if (queue.max_elements < found)
   {
     delete_queue(&queue);
-    if (init_queue(&queue,found,0,0,compare_huff_elements,0))
+    if (init_queue(&queue,found, 0, 0, compare_huff_elements, 0, 0, 0))
       return -1;
   }
 
@@ -1645,8 +1645,7 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
     Make a priority queue from the queue. Construct its index so that we
     have a partially ordered tree.
   */
-  for (i=found/2 ; i > 0 ; i--)
-    _downheap(&queue,i);
+  queue_fix(&queue);
 
   /* The Huffman algorithm. */
   bytes_packed=0; bits_packed=0;
@@ -1657,12 +1656,9 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
       Popping from a priority queue includes a re-ordering of the queue,
       to get the next least incidence element to the top.
     */
-    a=(HUFF_ELEMENT*) queue_remove(&queue,0);
-    /*
-      Copy the next least incidence element. The queue implementation
-      reserves root[0] for temporary purposes. root[1] is the top.
-    */
-    b=(HUFF_ELEMENT*) queue.root[1];
+    a=(HUFF_ELEMENT*) queue_remove_top(&queue);
+    /* Copy the next least incidence element */
+    b=(HUFF_ELEMENT*) queue_top(&queue);
     /* Get a new element from the element buffer. */
     new_huff_el=huff_tree->element_buffer+found+i;
     /* The new element gets the sum of the two least incidence elements. */
@@ -1684,8 +1680,8 @@ static int make_huff_tree(HUFF_TREE *huff_tree, HUFF_COUNTS *huff_counts)
       Replace the copied top element by the new element and re-order the
       queue.
     */
-    queue.root[1]=(uchar*) new_huff_el;
-    queue_replaced(&queue);
+    queue_top(&queue)= (uchar*) new_huff_el;
+    queue_replace_top(&queue);
   }
   huff_tree->root=(HUFF_ELEMENT*) queue.root[1];
   huff_tree->bytes_packed=bytes_packed+(bits_packed+7)/8;
@@ -1816,8 +1812,7 @@ static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
     Make a priority queue from the queue. Construct its index so that we
     have a partially ordered tree.
   */
-  for (i=(found+1)/2 ; i > 0 ; i--)
-    _downheap(&queue,i);
+  queue_fix(&queue);
 
   /* The Huffman algorithm. */
   for (i=0 ; i < found-1 ; i++)
@@ -1831,12 +1826,9 @@ static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
       incidence). Popping from a priority queue includes a re-ordering
       of the queue, to get the next least incidence element to the top.
     */
-    a= (my_off_t*) queue_remove(&queue, 0);
-    /*
-      Copy the next least incidence element. The queue implementation
-      reserves root[0] for temporary purposes. root[1] is the top.
-    */
-    b= (my_off_t*) queue.root[1];
+    a= (my_off_t*) queue_remove_top(&queue);
+    /* Copy the next least incidence element. */
+    b= (my_off_t*) queue_top(&queue);
     /* Create a new element in a local (automatic) buffer. */
     new_huff_el= element_buffer + i;
     /* The new element gets the sum of the two least incidence elements. */
@@ -1856,8 +1848,8 @@ static my_off_t calc_packed_length(HUFF_COUNTS *huff_counts,
       queue. This successively replaces the references to counts by
       references to HUFF_ELEMENTs.
     */
-    queue.root[1]=(uchar*) new_huff_el;
-    queue_replaced(&queue);
+    queue_top(&queue)= (uchar*) new_huff_el;
+    queue_replace_top(&queue);
   }
   DBUG_RETURN(bytes_packed+(bits_packed+7)/8);
 }
diff --git a/storage/myisam/plug.in b/storage/myisam/plug.in
index 051ec2d54aa..e92b5e56d7f 100644
--- a/storage/myisam/plug.in
+++ b/storage/myisam/plug.in
@@ -1,7 +1,7 @@
-MYSQL_STORAGE_ENGINE(myisam,no, [MyISAM Storage Engine],
-        [Traditional non-transactional MySQL tables])
-MYSQL_PLUGIN_DIRECTORY(myisam,  [storage/myisam])
-MYSQL_PLUGIN_STATIC(myisam,     [libmyisam.a])
-MYSQL_PLUGIN_MANDATORY(myisam)  dnl Default
-MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(myisam, [ha_myisam.cc])
+dnl MYSQL_STORAGE_ENGINE(myisam,no, [MyISAM Storage Engine],
+dnl         [Traditional non-transactional MySQL tables])
+dnl MYSQL_PLUGIN_DIRECTORY(myisam,  [storage/myisam])
+dnl MYSQL_PLUGIN_STATIC(myisam,     [libmyisam.a])
+dnl MYSQL_PLUGIN_MANDATORY(myisam)  dnl Default
+dnl MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(myisam, [ha_myisam.cc])
 
diff --git a/storage/myisam/rt_index.c b/storage/myisam/rt_index.c
index 37a06606b3c..48eb48cc5e8 100644
--- a/storage/myisam/rt_index.c
+++ b/storage/myisam/rt_index.c
@@ -528,7 +528,7 @@ static int rtree_insert_req(MI_INFO *info, MI_KEYDEF *keyinfo, uchar *key,
   DBUG_ENTER("rtree_insert_req");
 
   if (!(page_buf = (uchar*)my_alloca((uint)keyinfo->block_length + 
-                                     MI_MAX_KEY_BUFF)))
+                                     HA_MAX_KEY_BUFF)))
   {
     my_errno = HA_ERR_OUT_OF_MEM;
     DBUG_RETURN(-1); /* purecov: inspected */
diff --git a/storage/myisam/rt_test.c b/storage/myisam/rt_test.c
index 7233300c539..50cdc538668 100644
--- a/storage/myisam/rt_test.c
+++ b/storage/myisam/rt_test.c
@@ -105,14 +105,20 @@ static int run_test(const char *filename)
   int nrecords=sizeof(rt_data)/(sizeof(double)*4);/* 3000;*/
   int rec_length=0;
   int uniques=0;
-  int i;
+  int i, max_i;
   int error;
   int row_count=0;
   uchar record[MAX_REC_LENGTH];
   uchar read_record[MAX_REC_LENGTH];
   int upd= 10;
   ha_rows hrows;
-  
+
+  bzero(&uniquedef, sizeof(uniquedef));
+  bzero(&create_info, sizeof(create_info));
+  bzero(recinfo, sizeof(recinfo));
+  bzero(keyinfo, sizeof(keyinfo));
+  bzero(keyseg, sizeof(keyseg));
+
   /* Define a column for NULLs and DEL markers*/
   
   recinfo[0].type=FIELD_NORMAL;
@@ -147,7 +153,6 @@ static int run_test(const char *filename)
   if (!silent)
     printf("- Creating isam-file\n");
   
-  bzero((char*) &create_info,sizeof(create_info));
   create_info.max_rows=10000000;
   
   if (mi_create(filename,
@@ -194,7 +199,7 @@ static int run_test(const char *filename)
     create_record(record,i);
     
     bzero((char*) read_record,MAX_REC_LENGTH);
-    error=mi_rkey(file,read_record,0,record+1,0,HA_READ_MBR_EQUAL);
+    error=mi_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,HA_READ_MBR_EQUAL);
     
     if (error && error!=HA_ERR_KEY_NOT_FOUND)
     {
@@ -233,7 +238,8 @@ static int run_test(const char *filename)
 
   if (!silent)
     printf("- Updating rows with position\n");
-  for (i=0; i < (nrecords - nrecords/4) ; i++)
+  /* We are looking for nrecords-nrecords/2 non-deleted records */
+  for (i=0, max_i= nrecords - nrecords/2; i < max_i ; i++)
   {
     my_errno=0;
     bzero((char*) read_record,MAX_REC_LENGTH);
@@ -241,7 +247,11 @@ static int run_test(const char *filename)
     if (error)
     {
       if (error==HA_ERR_RECORD_DELETED)
+      {
+        printf("found deleted record\n");
+        max_i++; /* don't count such record */
         continue;
+      }
       printf("pos: %2d  mi_rrnd: %3d  errno: %3d\n",i,error,my_errno);
       goto err;
     }
@@ -266,7 +276,8 @@ static int run_test(const char *filename)
   create_record(record, nrecords*4/5);
   print_record(record,0,"  search for\n");
   
-  if ((error=mi_rkey(file,read_record,0,record+1,0,HA_READ_MBR_INTERSECT)))
+  if ((error=mi_rkey(file,read_record,0,record+1,HA_WHOLE_KEY,
+                     HA_READ_MBR_INTERSECT)))
   {
     printf("mi_rkey: %3d  errno: %3d\n",error,my_errno);
     goto err;
diff --git a/storage/myisam/sort.c b/storage/myisam/sort.c
index 9532b9f0474..903a893e779 100644
--- a/storage/myisam/sort.c
+++ b/storage/myisam/sort.c
@@ -15,7 +15,7 @@
 
 /*
   Creates a index for a database by reading keys, sorting them and outputing
-  them in sorted order through SORT_INFO functions.
+  them in sorted order through MI_SORT_INFO functions.
 */
 
 #include "fulltext.h"
@@ -487,8 +487,8 @@ ok:
 
 int thr_write_keys(MI_SORT_PARAM *sort_param)
 {
-  SORT_INFO *sort_info=sort_param->sort_info;
-  MI_CHECK *param=sort_info->param;
+  MI_SORT_INFO *sort_info=sort_param->sort_info;
+  HA_CHECK *param=sort_info->param;
   ulong UNINIT_VAR(length), keys;
   ulong *rec_per_key_part=param->rec_per_key_part;
   int got_error=sort_info->got_error;
@@ -903,7 +903,6 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
   uchar *strpos;
   BUFFPEK *buffpek,**refpek;
   QUEUE queue;
-  volatile int *killed= killed_ptr(info->sort_info->param);
   DBUG_ENTER("merge_buffers");
 
   count=error=0;
@@ -917,13 +916,13 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
 
   if (init_queue(&queue,(uint) (Tb-Fb)+1,offsetof(BUFFPEK,key),0,
                  (int (*)(void*, uchar *,uchar*)) info->key_cmp,
-                 (void*) info))
+                 (void*) info, 0, 0))
     DBUG_RETURN(1); /* purecov: inspected */
 
   for (buffpek= Fb ; buffpek <= Tb ; buffpek++)
   {
     count+= buffpek->count;
-    buffpek->base= strpos;
+    buffpek->base= (uchar*) strpos;
     buffpek->max_keys=maxcount;
     strpos+= (uint) (error=(int) info->read_to_buffer(from_file,buffpek,
                                                       sort_length));
@@ -936,10 +935,6 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
   {
     for (;;)
     {
-      if (*killed)
-      {
-        error=1; goto err;
-      }
       buffpek=(BUFFPEK*) queue_top(&queue);
       if (to_file)
       {
@@ -959,12 +954,18 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
       buffpek->key+=sort_length;
       if (! --buffpek->mem_count)
       {
+        /* It's enough to check killed_ptr() before a slow operation */
+        if (killed_ptr(info->sort_info->param))
+        {
+          error=1;
+          goto err;
+        }
         if (!(error=(int) info->read_to_buffer(from_file,buffpek,sort_length)))
         {
-          uchar *base=buffpek->base;
+          uchar *base= buffpek->base;
           uint max_keys=buffpek->max_keys;
 
-          (void) queue_remove(&queue,0);
+          queue_remove_top(&queue);
 
           /* Put room used by buffer to use in other buffer */
           for (refpek= (BUFFPEK**) &queue_top(&queue);
@@ -989,11 +990,11 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
       }
       else if (error == -1)
         goto err;               /* purecov: inspected */
-      queue_replaced(&queue);   /* Top element has been replaced */
+      queue_replace_top(&queue);   /* Top element has been replaced */
     }
   }
   buffpek=(BUFFPEK*) queue_top(&queue);
-  buffpek->base=(uchar *) sort_keys;
+  buffpek->base= (uchar*) sort_keys;
   buffpek->max_keys=keys;
   do
   {
@@ -1008,7 +1009,7 @@ merge_buffers(MI_SORT_PARAM *info, uint keys, IO_CACHE *from_file,
     else
     {
       register uchar *end;
-      strpos= buffpek->key;
+      strpos= (uchar*) buffpek->key;
       for (end=strpos+buffpek->mem_count*sort_length;
            strpos != end ;
            strpos+=sort_length)
diff --git a/storage/myisammrg/ha_myisammrg.cc b/storage/myisammrg/ha_myisammrg.cc
index f62aff4e383..6e0271088d3 100644
--- a/storage/myisammrg/ha_myisammrg.cc
+++ b/storage/myisammrg/ha_myisammrg.cc
@@ -141,9 +141,11 @@ static const char *ha_myisammrg_exts[] = {
 };
 extern int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
                         MI_COLUMNDEF **recinfo_out, uint *records_out);
-extern int check_definition(MI_KEYDEF *t1_keyinfo, MI_COLUMNDEF *t1_recinfo,
+extern int check_definition(MI_KEYDEF *t1_keyinfo,
+                            MI_COLUMNDEF *t1_recinfo,
                             uint t1_keys, uint t1_recs,
-                            MI_KEYDEF *t2_keyinfo, MI_COLUMNDEF *t2_recinfo,
+                            MI_KEYDEF *t2_keyinfo,
+                            MI_COLUMNDEF *t2_recinfo,
                             uint t2_keys, uint t2_recs, bool strict,
                             TABLE *table_arg);
 static void split_file_name(const char *file_name,
@@ -650,7 +652,6 @@ extern "C" MI_INFO *myisammrg_attach_children_callback(void *callback_param)
 
 CPP_UNNAMED_NS_END
 
-
 /**
    Returns a cloned instance of the current handler.
 
@@ -1261,7 +1262,7 @@ int ha_myisammrg::info(uint flag)
   {
     if (table->s->key_parts && mrg_info.rec_per_key)
     {
-#ifdef HAVE_purify
+#ifdef HAVE_valgrind
       /*
         valgrind may be unhappy about it, because optimizer may access values
         between file->keys and table->key_parts, that will be uninitialized.
@@ -1318,7 +1319,8 @@ int ha_myisammrg::extra(enum ha_extra_function operation)
   /* As this is just a mapping, we don't have to force the underlying
      tables to be closed */
   if (operation == HA_EXTRA_FORCE_REOPEN ||
-      operation == HA_EXTRA_PREPARE_FOR_DROP)
+      operation == HA_EXTRA_PREPARE_FOR_DROP ||
+      operation == HA_EXTRA_PREPARE_FOR_RENAME)
     return 0;
   if (operation == HA_EXTRA_MMAP && !opt_myisam_use_mmap)
     return 0;
@@ -1368,6 +1370,33 @@ THR_LOCK_DATA **ha_myisammrg::store_lock(THD *thd,
 					 THR_LOCK_DATA **to,
 					 enum thr_lock_type lock_type)
 {
+  MYRG_TABLE *open_table;
+
+  /*
+    This method can be called while another thread is attaching the
+    children. If the processor reorders instructions or writes to memory,
+    'children_attached' could be set before 'open_tables' has all the
+    pointers to the children. Use of a mutex here and in
+    myrg_attach_children() forces consistent data.
+  */
+  pthread_mutex_lock(&this->file->mutex);
+
+  /*
+    When MERGE table is open, but not yet attached, other threads
+    could flush it, which means call mysql_lock_abort_for_thread()
+    on this thread's TABLE. 'children_attached' is FALSE in this
+    situation. Since the table is not locked, return no lock data.
+  */
+  if (!this->file->children_attached)
+    goto end; /* purecov: tested */
+
+  for (open_table=file->open_tables ;
+       open_table != file->end_table ;
+       open_table++)
+    open_table->table->lock.priority|= THR_LOCK_MERGE_PRIV;
+
+ end:
+  pthread_mutex_unlock(&this->file->mutex);
   return to;
 }
 
@@ -1621,3 +1650,20 @@ mysql_declare_plugin(myisammrg)
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
+maria_declare_plugin(myisammrg)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &myisammrg_storage_engine,
+  "MRG_MYISAM",
+  "MySQL AB",
+  "Collection of identical MyISAM tables",
+  PLUGIN_LICENSE_GPL,
+  myisammrg_init, /* Plugin Init */
+  NULL, /* Plugin Deinit */
+  0x0100, /* 1.0 */
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  "1.0",                      /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+}
+maria_declare_plugin_end;
diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h
index 4ff24c69071..c434dc28426 100644
--- a/storage/myisammrg/ha_myisammrg.h
+++ b/storage/myisammrg/ha_myisammrg.h
@@ -101,8 +101,8 @@ public:
             HA_READ_ORDER | HA_KEYREAD_ONLY);
   }
   uint max_supported_keys()          const { return MI_MAX_KEY; }
-  uint max_supported_key_length()    const { return MI_MAX_KEY_LENGTH; }
-  uint max_supported_key_part_length() const { return MI_MAX_KEY_LENGTH; }
+  uint max_supported_key_length()    const { return HA_MAX_KEY_LENGTH; }
+  uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; }
   double scan_time()
   { return ulonglong2double(stats.data_file_length) / IO_SIZE + file->tables; }
 
diff --git a/storage/myisammrg/myrg_locking.c b/storage/myisammrg/myrg_locking.c
index 4f1e3f844a1..a414cee7bb8 100644
--- a/storage/myisammrg/myrg_locking.c
+++ b/storage/myisammrg/myrg_locking.c
@@ -27,15 +27,8 @@ int myrg_lock_database(MYRG_INFO *info, int lock_type)
   error=0;
   for (file=info->open_tables ; file != info->end_table ; file++) 
   {
-#ifdef __WIN__
-    /*
-      Make sure this table is marked as owned by a merge table.
-      The semaphore is never released as long as table remains
-      in memory. This should be refactored into a more generic
-      approach (observer pattern)
-     */
-    (file->table)->owned_by_merge = TRUE;
-#endif
+    DBUG_ASSERT(file->table->open_flag & HA_OPEN_MERGE_TABLE);
+
     if ((new_error=mi_lock_database(file->table,lock_type)))
     {
       error=new_error;
diff --git a/storage/myisammrg/myrg_open.c b/storage/myisammrg/myrg_open.c
index b8e86b89181..156660d00c0 100644
--- a/storage/myisammrg/myrg_open.c
+++ b/storage/myisammrg/myrg_open.c
@@ -24,8 +24,9 @@
 	if handle_locking is 0 then exit with error if some table is locked
 	if handle_locking is 1 then wait if table is locked
 
-        NOTE: This function is not used in the MySQL server. It is for
-        MERGE use independent from MySQL. Currently there is some code
+        NOTE: This function is only used in the MySQL server when a
+        table is cloned. It is also used when MERGE is used
+        independently of MySQL. Currently there is some code
         duplication between myrg_open() and myrg_parent_open() +
         myrg_attach_children(). Please duplicate changes in these
         functions or make common sub-functions.
@@ -91,7 +92,8 @@ MYRG_INFO *myrg_open(const char *name, int mode, int handle_locking)
     }
     else
       fn_format(buff, buff, "", "", 0);
-    if (!(isam=mi_open(buff,mode,(handle_locking?HA_OPEN_WAIT_IF_LOCKED:0))))
+    if (!(isam=mi_open(buff,mode,(handle_locking?HA_OPEN_WAIT_IF_LOCKED:0) |
+                       HA_OPEN_MERGE_TABLE)))
     {
       if (handle_locking & HA_OPEN_FOR_REPAIR)
       {
@@ -236,6 +238,7 @@ MYRG_INFO *myrg_parent_open(const char *parent_name,
   rc= 1;
   errpos= 0;
   bzero((char*) &file_cache, sizeof(file_cache));
+  LINT_INIT(m_info);
 
   /* Open MERGE meta file. */
   if ((fd= mysql_file_open(rg_key_file_MRG,
@@ -431,6 +434,8 @@ int myrg_attach_children(MYRG_INFO *m_info, int handle_locking,
     m_info->open_tables[child_nr].table= myisam;
     m_info->open_tables[child_nr].file_offset= (my_off_t) file_offset;
     file_offset+= myisam->state->data_file_length;
+    /* Mark as MERGE table */
+    myisam->open_flag|= HA_OPEN_MERGE_TABLE;
 
     /* Check table definition match. */
     if (m_info->reclength != myisam->s->base.reclength)
diff --git a/storage/myisammrg/myrg_queue.c b/storage/myisammrg/myrg_queue.c
index 2c447083558..b13eee3ed75 100644
--- a/storage/myisammrg/myrg_queue.c
+++ b/storage/myisammrg/myrg_queue.c
@@ -52,7 +52,7 @@ int _myrg_init_queue(MYRG_INFO *info,int inx,enum ha_rkey_function search_flag)
       if (init_queue(q,info->tables, 0,
 		     (myisam_readnext_vec[search_flag] == SEARCH_SMALLER),
 		     queue_key_cmp,
-		     info->open_tables->table->s->keyinfo[inx].seg))
+		     info->open_tables->table->s->keyinfo[inx].seg, 0, 0))
 	error=my_errno;
     }
     else
@@ -60,7 +60,7 @@ int _myrg_init_queue(MYRG_INFO *info,int inx,enum ha_rkey_function search_flag)
       if (reinit_queue(q,info->tables, 0,
 		       (myisam_readnext_vec[search_flag] == SEARCH_SMALLER),
 		       queue_key_cmp,
-		       info->open_tables->table->s->keyinfo[inx].seg))
+		       info->open_tables->table->s->keyinfo[inx].seg, 0, 0))
 	error=my_errno;
     }
   }
diff --git a/storage/myisammrg/myrg_rnext.c b/storage/myisammrg/myrg_rnext.c
index 82d5cbf38b1..1442ee08dd4 100644
--- a/storage/myisammrg/myrg_rnext.c
+++ b/storage/myisammrg/myrg_rnext.c
@@ -32,7 +32,7 @@ int myrg_rnext(MYRG_INFO *info, uchar *buf, int inx)
   {
     if (err == HA_ERR_END_OF_FILE)
     {
-      queue_remove(&(info->by_key),0);
+      queue_remove_top(&(info->by_key));
       if (!info->by_key.elements)
         return HA_ERR_END_OF_FILE;
     }
@@ -43,7 +43,7 @@ int myrg_rnext(MYRG_INFO *info, uchar *buf, int inx)
   {
     /* Found here, adding to queue */
     queue_top(&(info->by_key))=(uchar *)(info->current_table);
-    queue_replaced(&(info->by_key));
+    queue_replace_top(&(info->by_key));
   }
 
   /* now, mymerge's read_next is as simple as one queue_top */
diff --git a/storage/myisammrg/myrg_rnext_same.c b/storage/myisammrg/myrg_rnext_same.c
index ad7bbfb0f6e..14b41dbe756 100644
--- a/storage/myisammrg/myrg_rnext_same.c
+++ b/storage/myisammrg/myrg_rnext_same.c
@@ -29,7 +29,7 @@ int myrg_rnext_same(MYRG_INFO *info, uchar *buf)
   {
     if (err == HA_ERR_END_OF_FILE)
     {
-      queue_remove(&(info->by_key),0);
+      queue_remove_top(&(info->by_key));
       if (!info->by_key.elements)
         return HA_ERR_END_OF_FILE;
     }
@@ -40,7 +40,7 @@ int myrg_rnext_same(MYRG_INFO *info, uchar *buf)
   {
     /* Found here, adding to queue */
     queue_top(&(info->by_key))=(uchar *)(info->current_table);
-    queue_replaced(&(info->by_key));
+    queue_replace_top(&(info->by_key));
   }
 
   /* now, mymerge's read_next is as simple as one queue_top */
diff --git a/storage/myisammrg/myrg_rprev.c b/storage/myisammrg/myrg_rprev.c
index 66c94974940..0c560a0b73d 100644
--- a/storage/myisammrg/myrg_rprev.c
+++ b/storage/myisammrg/myrg_rprev.c
@@ -32,7 +32,7 @@ int myrg_rprev(MYRG_INFO *info, uchar *buf, int inx)
   {
     if (err == HA_ERR_END_OF_FILE)
     {
-      queue_remove(&(info->by_key),0);
+      queue_remove_top(&(info->by_key));
       if (!info->by_key.elements)
         return HA_ERR_END_OF_FILE;
     }
@@ -43,7 +43,7 @@ int myrg_rprev(MYRG_INFO *info, uchar *buf, int inx)
   {
     /* Found here, adding to queue */
     queue_top(&(info->by_key))=(uchar *)(info->current_table);
-    queue_replaced(&(info->by_key));
+    queue_replace_top(&(info->by_key));
   }
 
   /* now, mymerge's read_prev is as simple as one queue_top */
diff --git a/storage/ndb/plug.in b/storage/ndb/plug.in
index a7e351417b1..3d3349f7a8b 100644
--- a/storage/ndb/plug.in
+++ b/storage/ndb/plug.in
@@ -1,5 +1,5 @@
 MYSQL_STORAGE_ENGINE(ndbcluster, ndbcluster, [Cluster Storage Engine],
-        [High Availability Clustered tables], [max])
+        [High Availability Clustered tables],)
 MYSQL_PLUGIN_DIRECTORY(ndbcluster,[storage/ndb])
 MYSQL_PLUGIN_STATIC(ndbcluster, [[\$(ndbcluster_libs) \$(ndbcluster_system_libs) \$(NDB_SCI_LIBS)]])
 MYSQL_PLUGIN_ACTIONS(ndbcluster,[MYSQL_SETUP_NDBCLUSTER])
diff --git a/storage/ndb/src/ndbapi/Ndb.cpp b/storage/ndb/src/ndbapi/Ndb.cpp
index e6a1c2cfcfd..70e0dc7e2f2 100644
--- a/storage/ndb/src/ndbapi/Ndb.cpp
+++ b/storage/ndb/src/ndbapi/Ndb.cpp
@@ -1569,9 +1569,11 @@ Ndb::externalizeTableName(const char * internalTableName, bool fullyQualifiedNam
     register const char *ptr = internalTableName;
    
     // Skip database name
-    while (*ptr && *ptr++ != table_name_separator);
+    while (*ptr && *ptr++ != table_name_separator)
+      ;
     // Skip schema name
-    while (*ptr && *ptr++ != table_name_separator);
+    while (*ptr && *ptr++ != table_name_separator)
+      ;
     return ptr;
   }
   else
@@ -1591,7 +1593,9 @@ Ndb::externalizeIndexName(const char * internalIndexName, bool fullyQualifiedNam
     register const char *ptr = internalIndexName;
    
     // Scan name from the end
-    while (*ptr++); ptr--; // strend
+    while (*ptr++)
+      ;
+    ptr--; // strend
     while (ptr >= internalIndexName && *ptr != table_name_separator)
       ptr--;
      
diff --git a/storage/ndb/src/ndbapi/NdbRecAttr.cpp b/storage/ndb/src/ndbapi/NdbRecAttr.cpp
index 38ca14085f0..3b0329dd7ac 100644
--- a/storage/ndb/src/ndbapi/NdbRecAttr.cpp
+++ b/storage/ndb/src/ndbapi/NdbRecAttr.cpp
@@ -248,7 +248,8 @@ ndbrecattr_print_formatted(NdbOut& out, const NdbRecAttr &r,
       {
         const Uint32 *buf = (Uint32 *)r.aRef();
         int k = (length+31)/32;
-        while (k > 0 && (buf[--k] == 0));
+        while (k > 0 && (buf[--k] == 0))
+          ;
         out.print("%X", buf[k]);
         while (k > 0)
           out.print("%.8X", buf[--k]);
diff --git a/storage/oqgraph/CMakeLists.txt b/storage/oqgraph/CMakeLists.txt
new file mode 100644
index 00000000000..01723f04e16
--- /dev/null
+++ b/storage/oqgraph/CMakeLists.txt
@@ -0,0 +1,20 @@
+CHECK_CXX_SOURCE_COMPILES(
+"#include <boost/version.hpp>
+#if BOOST_VERSION >= 104000
+#else
+#error oops
+#endif
+int main() { return 0; }" BOOST_OK)
+
+IF(BOOST_OK)
+  ADD_DEFINITIONS(-DHAVE_OQGRAPH)
+  IF(MSVC)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /EHsc")
+  ENDIF(MSVC)
+
+  SET(OQGRAPH_PLUGIN_STATIC  "oqgraph")
+  SET(OQGRAPH_PLUGIN_DYNAMIC "ha_oqgraph")
+  SET(OQGRAPH_SOURCES ha_oqgraph.cc graphcore.cc)
+  MYSQL_ADD_PLUGIN(oqgraph ${OQGRAPH_SOURCES} STORAGE_ENGINE)
+ENDIF(BOOST_OK)
+#error same fix here as in plugin.m4
diff --git a/storage/oqgraph/Makefile.am b/storage/oqgraph/Makefile.am
new file mode 100644
index 00000000000..e99e134db02
--- /dev/null
+++ b/storage/oqgraph/Makefile.am
@@ -0,0 +1,98 @@
+# Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+# ======================================================================
+# Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+# Mk.II implementation by Antony Curtis & Arjen Lentz
+# For more information, documentation, support, enhancement engineering,
+# and non-GPL licensing, see http://openquery.com/graph
+# or contact graph@openquery.com
+# For packaged binaries, see http://ourdelta.org
+# ======================================================================
+
+mysqlplugindir=		$(pkglibdir)/plugin
+
+BOOST_CXXFLAGS =	-frtti -fexceptions -fimplicit-templates
+#BOOST_CXXFLAGS+=	-g
+#original flags before 2009-11-10
+#BOOST_CXXFLAGS+=	-O3 -fomit-frame-pointer -fstrict-aliasing
+#BOOST_CXXFLAGS+=	-momit-leaf-frame-pointer -falign-loops
+#modified flags:
+# - remove omit-frame-pointer, x86 specific (fails on PPC) + hinders debugging
+#   Option details from gcc man:
+#   Don't keep the frame pointer in a register for functions that don't need one.
+#   This avoids the instructions to save, set up and restore frame pointers;
+#   it also makes an extra register available in many functions.
+#   It also makes debugging impossible on some machines.
+#   (automatically gets enabled anyway by -O* on some architectures)
+BOOST_CXXFLAGS+=	-O3 -fstrict-aliasing
+BOOST_CXXFLAGS+=	-falign-loops
+if HAVE_FVISIBILITY_INLINES_HIDDEN
+BOOST_CXXFLAGS+=	-fvisibility-inlines-hidden
+endif
+BOOST_CXXFLAGS+=	-funroll-loops -fno-trapping-math
+
+EXTRA_DIST =	ha_oqgraph.h ha_oqgraph.cc graphcore.cc \
+		graphcore-graph.h graphcore-types.h graphcore.h \
+		CMakeLists.txt plug.in oqgraph_probes.d
+
+# DTRACE =                @DTRACE@
+# DTRACEFLAGS =           @DTRACEFLAGS@
+# DTRACEFILES =           .libs/libha_oqgraph_la-ha_oqgraph.o
+
+ORIG_CXXFLAGS = @CXXFLAGS@
+CXXFLAGS=
+noinst_HEADERS = ha_oqgraph.h \
+		 graphcore-graph.h graphcore-types.h graphcore.h
+#		 oqgraph_probes.h
+
+noinst_LTLIBRARIES = libgraphcore.la
+libgraphcore_la_SOURCES = graphcore.cc
+libgraphcore_la_CXXFLAGS = $(ORIG_CXXFLAGS) $(BOOST_CXXFLAGS)
+
+if BUILD_OQGRAPH_FOR_MYSQL
+
+if BUILD_OQGRAPH_STANDALONE
+INCLUDES = -DDBUG_ON -DSAFE_MUTEX -DUNIV_MUST_NOT_INLINE -DEXTRA_DEBUG -DFORCE_INIT_OF_VARS -DSAFEMALLOC -DPEDANTIC_SAFEMALLOC -DSAFE_MUTEX -DHAVE_OQGRAPH $(MYSQL_INC) 
+else
+INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_srcdir)/regex -I$(top_srcdir)/sql -I$(srcdir) -DHAVE_OQGRAPH 
+endif !BUILD_OQGRAPH_STANDALONE
+
+EXTRA_LTLIBRARIES = ha_oqgraph.la
+mysqlplugin_LTLIBRARIES = @plugin_oqgraph_shared_target@
+ha_oqgraph_la_SOURCES = ha_oqgraph.cc
+ha_oqgraph_la_LIBADD = libgraphcore.la
+
+# if HAVE_DTRACE
+#   ha_oqgraph_la_LIBADD += oqgraph_probes.o
+# endif
+
+ha_oqgraph_la_LDFLAGS =	-shared -module -rpath $(mysqlplugindir)
+ha_oqgraph_la_CFLAGS = $(ORIG_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_oqgraph_la_CXXFLAGS = $(ORIG_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+
+# oqgraph_probes.h: oqgraph_probes.d
+# 	$(DTRACE) $(DTRACEFLAGS) -h -s oqgraph_probes.d
+# 	mv oqgraph_probes.h oqgraph_probes.h.bak
+# 	sed "s/#include <unistd.h>//g" oqgraph_probes.h.bak > oqgraph_probes.h
+# 	rm oqgraph_probes.h.bak
+
+# oqgraph_probes.o:
+# 	$(DTRACE) $(DTRACEFLAGS) -G -s oqgraph_probes.d $(DTRACEFILES)
+
+endif BUILD_OQGRAPH_FOR_MYSQL
+
+# End
diff --git a/storage/oqgraph/README b/storage/oqgraph/README
new file mode 100644
index 00000000000..cb4fba7295b
--- /dev/null
+++ b/storage/oqgraph/README
@@ -0,0 +1,16 @@
+OQGraph storage engine
+Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+
+The Open Query GRAPH engine (OQGRAPH) is a computation engine allowing
+hierarchies and more complex graph structures to be handled in a
+relational fashion. In a nutshell, tree structures and
+friend-of-a-friend style searches can now be done using standard SQL
+syntax, and results joined onto other tables.
+
+See http://openquery.com/graph for more information.
+
+
+INSTALLATION
+
+OQGraph requires at least version 1.40.0 of the Boost library. To
+obtain a copy of the Boost library, see http://www.boost.org/
diff --git a/storage/oqgraph/graphcore-graph.h b/storage/oqgraph/graphcore-graph.h
new file mode 100644
index 00000000000..46ddfb5335b
--- /dev/null
+++ b/storage/oqgraph/graphcore-graph.h
@@ -0,0 +1,48 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#ifndef oq_graphcore_graph_h_
+#define oq_graphcore_graph_h_
+
+typedef adjacency_list
+<
+  vecS,
+  vecS,
+  bidirectionalS,
+  VertexInfo,
+  EdgeInfo
+> Graph;
+
+#define GRAPH_WEIGHTMAP(G) get(&EdgeInfo::weight, G)
+typedef property_map<Graph, EdgeWeight EdgeInfo::*>::type weightmap_type;
+
+#define GRAPH_INDEXMAP(G)  get(vertex_index, G)
+typedef property_map<Graph, vertex_index_t>::type indexmap_type;
+
+#define GRAPH_IDMAP(G)     get(&VertexInfo::id, G)
+typedef property_map<Graph, VertexID VertexInfo::*>::type idmap_type;
+
+#endif
diff --git a/storage/oqgraph/graphcore-types.h b/storage/oqgraph/graphcore-types.h
new file mode 100644
index 00000000000..7a7e4c62729
--- /dev/null
+++ b/storage/oqgraph/graphcore-types.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#ifndef oq_graphcore_types_h_
+#define oq_graphcore_types_h_
+namespace open_query
+{
+
+  /* Identifier by which a vertex is known outside the graph core. */
+  typedef unsigned long long VertexID;
+  /* Weight attached to an edge. */
+  typedef double EdgeWeight;
+
+}
+#endif
diff --git a/storage/oqgraph/graphcore.cc b/storage/oqgraph/graphcore.cc
new file mode 100644
index 00000000000..0b856ac253f
--- /dev/null
+++ b/storage/oqgraph/graphcore.cc
@@ -0,0 +1,1101 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#include <string.h>
+
+#define BOOST_ALL_NO_LIB 1
+
+#include <boost/config.hpp>
+
+#include <set>
+#include <stack>
+
+#include <boost/property_map/property_map.hpp>
+
+#include <boost/graph/graph_concepts.hpp>
+#include <boost/graph/graph_archetypes.hpp>
+#include <boost/graph/adjacency_list.hpp>
+#include <boost/graph/breadth_first_search.hpp>
+#include <boost/graph/dijkstra_shortest_paths.hpp>
+#include <boost/graph/iteration_macros.hpp>
+#include <boost/graph/reverse_graph.hpp>
+#include <boost/graph/graph_utility.hpp>
+
+#include "graphcore.h"
+
+using namespace open_query;
+using namespace boost;
+
+/* All-zero row (one initializer per field of struct row); assigned to row_info to reset it. */
+static const row empty_row = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+namespace open_query
+{
+  /* Tag enum for the "vertex id" property registered with Boost below. */
+  enum vertex_id_t { vertex_id };
+
+  /* Bundled per-vertex property: the externally visible identifier. */
+  struct VertexInfo
+  {
+    VertexID id;
+
+    /* Default construction leaves id unset. */
+    VertexInfo() { }
+
+    /* Converting constructor (deliberately not marked explicit). */
+    VertexInfo(VertexID value)
+      : id(value) { }
+  };
+
+  /* Bundled per-edge property: the weight of the edge. */
+  struct EdgeInfo
+  {
+    EdgeWeight weight;
+  };
+}
+
+/*
+  Boost customisation points for the graph's vertex bundle.  These have to
+  live in namespace boost because they specialise Boost templates.
+*/
+namespace boost
+{
+  /* Register vertex_id_t / vertex_id as a vertex property tag. */
+  BOOST_INSTALL_PROPERTY(vertex, id);
+
+  namespace graph
+  {
+    /*
+      Name vertices by VertexInfo::id.  This is what allows the call to
+      boost::graph::find_vertex(id, g) made by oqgraph_share::find_vertex().
+    */
+    template<>
+    struct internal_vertex_name<VertexInfo>
+    {
+      typedef multi_index::member<VertexInfo, VertexID, &VertexInfo::id> type;
+    };
+
+    /*
+      Selects vertex_from_name as the way a VertexInfo is produced from a
+      vertex name.
+      NOTE(review): presumably this ends up using the VertexInfo(VertexID)
+      converting constructor -- confirm against the Boost named-graph docs.
+    */
+    template<>
+    struct internal_vertex_constructor<VertexInfo>
+    {
+      typedef vertex_from_name<VertexInfo> type;
+    };
+  }
+}
+
+namespace open_query
+{
+
+  #include "graphcore-graph.h"
+
+  /* Descriptor types used to address vertices and edges of Graph. */
+  typedef graph_traits<Graph>::vertex_descriptor Vertex;
+  typedef graph_traits<Graph>::edge_descriptor Edge;
+
+  /*
+    List of (vertex, optional weight) pairs and its iterator.
+    NOTE(review): neither typedef is referenced in the visible part of
+    this file -- check whether they are still needed.
+  */
+  typedef std::list<std::pair<Vertex,optional<EdgeWeight> > > shortest_path_list;
+  typedef shortest_path_list::iterator shortest_path_iterator;
+
+  /*
+    Unary predicate: true for any key u whose entry in the id map equals
+    the id captured at construction time.
+  */
+  template<typename ID, typename IDMap>
+  class id_equals_t
+  {
+    ID m_wanted;
+    IDMap m_ids;
+
+  public:
+    id_equals_t(ID id, IDMap map)
+      : m_wanted(id), m_ids(map)
+    { }
+
+    template<typename V>
+    bool operator()(V candidate) const
+    {
+      const bool matches= (m_ids[candidate] == m_wanted);
+      return matches;
+    }
+  };
+
+  /* Factory that deduces the template arguments of id_equals_t. */
+  template<typename ID, typename IDMap>
+  inline id_equals_t<ID,IDMap>
+  id_equals(ID id, IDMap idmap)
+  {
+    id_equals_t<ID,IDMap> predicate(id, idmap);
+    return predicate;
+  }
+
+  /*
+    Unary predicate over edges: true when the edge's target in the
+    captured graph equals the vertex captured at construction time.
+    The graph is held by reference.
+  */
+  template<typename T, typename GraphT>
+  class target_equals_t
+  {
+    T m_wanted;
+    GraphT &m_graph;
+
+  public:
+    target_equals_t(T target, GraphT &g)
+      : m_wanted(target), m_graph(g)
+    { }
+
+    template<typename V>
+    bool operator()(V edge) const
+    {
+      const bool matches= (target(edge, m_graph) == m_wanted);
+      return matches;
+    }
+  };
+
+  /* Factory that deduces the template arguments of target_equals_t. */
+  template<typename T, typename GraphT>
+  inline target_equals_t<T,GraphT>
+  target_equals(T target, GraphT &g)
+  {
+    target_equals_t<T,GraphT> predicate(target, g);
+    return predicate;
+  }
+
+  /*
+    Unary predicate over edges: true when the edge's source in the
+    captured graph equals the vertex captured at construction time.
+    The graph is held by reference.
+  */
+  template<typename T, typename GraphT>
+  class source_equals_t
+  {
+    T m_wanted;
+    GraphT &m_graph;
+
+  public:
+    source_equals_t(T source, GraphT &g)
+      : m_wanted(source), m_graph(g)
+    { }
+
+    template<typename V>
+    bool operator()(V edge) const
+    {
+      const bool matches= (source(edge, m_graph) == m_wanted);
+      return matches;
+    }
+  };
+
+  /* Factory that deduces the template arguments of source_equals_t. */
+  template<typename T, typename GraphT>
+  inline source_equals_t<T,GraphT>
+  source_equals(T source, GraphT &g)
+  {
+    source_equals_t<T,GraphT> predicate(source, g);
+    return predicate;
+  }
+
+  /*
+    One result/position record produced by a cursor.
+
+    m_flags records which of sequence, weight and edge were supplied; the
+    vertex is considered present whenever it differs from null_vertex().
+    The accessors return an empty optional for anything that is absent.
+
+    oqgraph::sizeof_ref is sizeof(reference) and oqgraph::row_ref() /
+    oqgraph::fetch_row() move these objects through a void pointer, so
+    every constructor initialises every member: no indeterminate value is
+    ever copied around.
+  */
+  struct reference
+  {
+    int m_flags;
+    int m_sequence;
+    Vertex m_vertex;
+    Edge m_edge;
+    EdgeWeight m_weight;
+
+    /* Bits of m_flags. */
+    enum
+    {
+      HAVE_SEQUENCE = 1,
+      HAVE_WEIGHT = 2,
+      HAVE_EDGE = 4,
+    };
+
+    /* Empty reference: no sequence, vertex, edge or weight. */
+    inline reference()
+      : m_flags(0), m_sequence(0),
+        m_vertex(graph_traits<Graph>::null_vertex()),
+        m_edge(), m_weight(0)
+    { }
+
+    /* Sequence number and edge, but no vertex and no weight. */
+    inline reference(int s, Edge e)
+      : m_flags(HAVE_SEQUENCE | HAVE_EDGE), m_sequence(s),
+        m_vertex(graph_traits<Graph>::null_vertex()),
+        m_edge(e), m_weight(0)
+    { }
+
+    /*
+      Sequence number and vertex, with edge and weight each present only
+      if the corresponding optional holds a value.  Absent members are
+      still given a defined value (default edge, zero weight), matching
+      the other constructors.
+    */
+    inline reference(int s, Vertex v, const optional<Edge> &e,
+                     const optional<EdgeWeight> &w)
+      : m_flags(HAVE_SEQUENCE | (w ? HAVE_WEIGHT : 0) | (e ? HAVE_EDGE : 0)),
+        m_sequence(s), m_vertex(v), m_edge(), m_weight(0)
+    {
+      if (w) m_weight= *w;
+      if (e) m_edge= *e;
+    }
+
+    /* Sequence number, vertex, edge and weight all present. */
+    inline reference(int s, Vertex v, Edge e, EdgeWeight w)
+      : m_flags(HAVE_SEQUENCE | HAVE_WEIGHT | HAVE_EDGE),
+        m_sequence(s), m_vertex(v), m_edge(e), m_weight(w)
+    { }
+
+    /* Sequence number, vertex and weight, but no edge. */
+    inline reference(int s, Vertex v, EdgeWeight w)
+      : m_flags(HAVE_SEQUENCE | HAVE_WEIGHT),
+        m_sequence(s), m_vertex(v), m_edge(), m_weight(w)
+    { }
+
+    /* Sequence number and vertex only. */
+    inline reference(int s, Vertex v)
+      : m_flags(HAVE_SEQUENCE), m_sequence(s), m_vertex(v), m_edge(),
+        m_weight(0)
+    { }
+
+    /* The sequence number, if HAVE_SEQUENCE is set. */
+    optional<int> sequence() const
+    {
+      if (m_flags & HAVE_SEQUENCE)
+      {
+        return m_sequence;
+      }
+      return optional<int>();
+    }
+
+    /* The vertex, unless it is null_vertex(). */
+    optional<Vertex> vertex() const
+    {
+      if (m_vertex != graph_traits<Graph>::null_vertex())
+        return m_vertex;
+      return optional<Vertex>();
+    }
+
+    /* The edge, if HAVE_EDGE is set. */
+    optional<Edge> edge() const
+    {
+      if (m_flags & HAVE_EDGE)
+        return m_edge;
+      return optional<Edge>();
+    }
+
+    /* The weight, if HAVE_WEIGHT is set. */
+    optional<EdgeWeight> weight() const
+    {
+      if (m_flags & HAVE_WEIGHT)
+        return m_weight;
+      return optional<EdgeWeight>();
+    }
+  };
+}
+
+namespace open_query {
+  /*
+    The graph itself plus the three property maps used to read it
+    (edge weight, vertex id, vertex index).  oqgraph and the cursors hold
+    a pointer to one of these.
+  */
+  class GRAPHCORE_INTERNAL oqgraph_share
+  {
+  public:
+    Graph g;
+
+    /* Property maps obtained from g in the constructor. */
+    weightmap_type weightmap;
+    idmap_type idmap;
+    indexmap_type indexmap;
+
+    /* Look up a vertex by its external id; empty if there is none. */
+    optional<Vertex> find_vertex(VertexID id) const;
+    /* Look up the edge from the first vertex to the second; empty if none. */
+    optional<Edge> find_edge(Vertex, Vertex) const;
+
+    /* g is declared first, so it is constructed before the maps read it. */
+    inline oqgraph_share() throw()
+      : g(),
+        weightmap(GRAPH_WEIGHTMAP(g)),
+        idmap(GRAPH_IDMAP(g)),
+        indexmap(GRAPH_INDEXMAP(g))
+    { }
+    inline ~oqgraph_share()
+    { }
+  };
+
+  /*
+    Abstract base of all result cursors.  A cursor keeps a pointer to the
+    shared graph and produces rows one at a time.
+  */
+  class GRAPHCORE_INTERNAL oqgraph_cursor
+  {
+  public:
+    oqgraph_share *const share;
+
+    inline oqgraph_cursor(oqgraph_share *arg)
+      : share(arg)
+    { }
+    virtual ~oqgraph_cursor()
+    { }
+
+    /* Produce the next row; first argument is the template row to copy. */
+    virtual int fetch_row(const row &, row&) = 0;
+    /* Produce the row described by the given reference. */
+    virtual int fetch_row(const row &, row&, const reference&) = 0;
+    /* Store the reference of the most recently fetched row into ref. */
+    virtual void current(reference& ref) const = 0;
+  };
+}
+
+namespace open_query {
+  /*
+    Cursor over a pre-computed stack of references.  The search code
+    pushes results onto `results`; fetch_row() hands out the top entry
+    and pops it.
+  */
+  class GRAPHCORE_INTERNAL stack_cursor : public oqgraph_cursor
+  {
+  private:
+    /* NOTE(review): not referenced in the visible part of this file. */
+    optional<EdgeWeight> no_weight;
+  public:
+    /* NOTE(review): only initialised here in the visible code. */
+    int sequence;
+    /* Pending results; the top of the stack is returned next. */
+    std::stack<reference> results;
+    /* Reference of the most recently fetched row. */
+    reference last;
+
+    inline stack_cursor(oqgraph_share *arg)
+      : oqgraph_cursor(arg), no_weight(), sequence(0), results(), last()
+    { }
+
+    int fetch_row(const row &, row&);
+    int fetch_row(const row &, row&, const reference&);
+
+    void current(reference& ref) const
+    {
+      ref= last;
+    }
+  };
+
+  /* Cursor that walks the vertices of the shared graph one by one. */
+  class GRAPHCORE_INTERNAL vertices_cursor : public oqgraph_cursor
+  {
+    typedef graph_traits<Graph>::vertex_iterator vertex_iterator;
+
+    size_t position;    /* count of rows successfully fetched so far */
+    reference last;     /* reference of the most recently fetched row */
+
+  public:
+    vertices_cursor(oqgraph_share *arg)
+      : oqgraph_cursor(arg), position(0), last()
+    { }
+
+    int fetch_row(const row &, row&);
+    int fetch_row(const row &, row&, const reference&);
+
+    void current(reference& out) const { out= last; }
+  };
+
+  /* Cursor that walks the edges of the shared graph one by one. */
+  class GRAPHCORE_INTERNAL edges_cursor : public oqgraph_cursor
+  {
+    typedef graph_traits<Graph>::edge_iterator edge_iterator;
+    typedef edge_iterator::difference_type edge_difference;
+
+    edge_difference position;   /* count of rows successfully fetched */
+    reference last;             /* reference of the last fetched row */
+
+  public:
+    edges_cursor(oqgraph_share *arg)
+      : oqgraph_cursor(arg), position(0), last()
+    { }
+
+    int fetch_row(const row &, row&);
+    int fetch_row(const row &, row&, const reference&);
+
+    void current(reference& out) const { out= last; }
+  };
+
+  /*
+    Visitor fired on on_finish_vertex: pushes a (sequence, vertex,
+    distance) reference onto the cursor's result stack, reading the
+    distance from the distance storage through the graph's index map.
+  */
+  struct GRAPHCORE_INTERNAL oqgraph_visit_dist
+    : public base_visitor<oqgraph_visit_dist>
+  {
+    typedef on_finish_vertex event_filter;
+
+    oqgraph_visit_dist(std::vector<Vertex>::iterator p,
+                       std::vector<EdgeWeight>::iterator d,
+                       stack_cursor *cursor)
+      : seq(0), m_cursor(*cursor), m_p(p), m_d(d)
+    {
+      assert(cursor);
+    }
+
+    template<class T, class Graph>
+    void operator()(T u, Graph &g)
+    {
+      const EdgeWeight dist= m_d[GRAPH_INDEXMAP(g)[u]];
+      const int order= ++seq;
+      m_cursor.results.push(reference(order, u, dist));
+    }
+
+  private:
+    int seq;                                  /* last sequence handed out */
+    stack_cursor &m_cursor;                   /* receives the results */
+    std::vector<Vertex>::iterator m_p;        /* predecessor storage */
+    std::vector<EdgeWeight>::iterator m_d;    /* distance storage */
+  };
+
+  /*
+    Visitor that stops a search as soon as the goal vertex is reached.
+
+    goal_filter selects the visitor event it reacts to.  When the visited
+    vertex is the goal, the path is reconstructed from the predecessor
+    storage m_p, one reference per path vertex is pushed onto the cursor's
+    result stack, and the search is aborted by throwing.
+
+    record_weight == true  : each step carries the edge found between
+                             predecessor and vertex, and that edge's weight.
+    record_weight == false : each step carries weight 1 and no edge.
+    The path's first vertex (its own predecessor) carries neither.
+  */
+  template<bool record_weight, typename goal_filter>
+  struct GRAPHCORE_INTERNAL oqgraph_goal
+    : public base_visitor<oqgraph_goal<record_weight,goal_filter> >
+  {
+    typedef goal_filter event_filter;
+
+    oqgraph_goal(Vertex goal, std::vector<Vertex>::iterator p,
+                 stack_cursor *cursor)
+      : m_goal(goal), m_cursor(*cursor), m_p(p)
+    { assert(cursor); }
+
+    template<class T, class Graph>
+    void operator()(T u, Graph &g)
+    {
+      if (u == m_goal)
+      {
+        int seq= 0;
+        indexmap_type indexmap= GRAPH_INDEXMAP(g);
+
+        /*
+          First pass: count the steps from the goal back to the vertex
+          that is its own predecessor, so seq ends as the goal's position.
+        */
+        for (Vertex q, v= u;; v = q, seq++)
+          if ((q= m_p[ indexmap[v] ]) == v)
+            break;
+
+        /*
+          Second pass: walk the same chain again, pushing the goal first
+          with the highest sequence number and the start vertex last with
+          sequence 0, so the start vertex ends up on top of the stack.
+        */
+        for (Vertex v= u;; u= v)
+        {
+          optional<Edge> edge;
+          optional<EdgeWeight> weight;
+          v= m_p[ indexmap[u] ];
+          if (record_weight && u != v)
+          {
+            /* Find the edge v -> u to report it and its weight. */
+            typename graph_traits<Graph>::out_edge_iterator ei, ei_end;
+            for (tie(ei, ei_end)= out_edges(v, g); ei != ei_end; ++ei)
+            {
+              if (target(*ei, g) == u)
+              {
+                edge= *ei;
+                weight= GRAPH_WEIGHTMAP(g)[*ei];
+                break;
+              }
+            }
+          }
+          else if (u != v)
+            weight= 1;
+          m_cursor.results.push(reference(seq--, u, edge, weight));
+          if (u == v)
+            break;
+        }
+        /* Abort the running search; oqgraph::search() catches this. */
+        throw this;
+      }
+    }
+
+  private:
+    Vertex m_goal;
+    stack_cursor &m_cursor;
+    std::vector<Vertex>::iterator m_p;
+  };
+}
+
+namespace open_query
+{
+  /* Bind to an existing share; no cursor is open initially. */
+  inline oqgraph::oqgraph(oqgraph_share *arg) throw()
+    : share(arg), cursor(0)
+  { }
+
+  /* Destroys the cursor, if any.  The share is not deleted here. */
+  inline oqgraph::~oqgraph() throw()
+  {
+    delete cursor;
+  }
+
+  /* Number of edges currently in the shared graph. */
+  unsigned oqgraph::edges_count() const throw()
+  {
+    return num_edges(share->g);
+  }
+
+  /* Number of vertices currently in the shared graph. */
+  unsigned oqgraph::vertices_count() const throw()
+  {
+    return num_vertices(share->g);
+  }
+
+  /*
+    Allocate an oqgraph bound to `share` (which must not be NULL).
+    Uses nothrow new, so the result is NULL if allocation fails.
+  */
+  oqgraph* oqgraph::create(oqgraph_share *share) throw()
+  {
+    assert(share != NULL);
+    return new (std::nothrow) oqgraph(share);
+  }
+
+  /*
+    Allocate a new, empty share.  Uses nothrow new, so the result is NULL
+    if allocation fails.
+  */
+  oqgraph_share* oqgraph::create() throw()
+  {
+    return new (std::nothrow) oqgraph_share();
+  }
+
+  /*
+    Find the edge orig -> dest, or return an empty optional.
+    Scans whichever is shorter: the out-edges of orig or the in-edges
+    of dest.
+  */
+  optional<Edge>
+  oqgraph_share::find_edge(Vertex orig, Vertex dest) const
+  {
+    if (in_degree(dest, g) >= out_degree(orig, g))
+    {
+      /* orig has no more out-edges than dest has in-edges. */
+      graph_traits<Graph>::out_edge_iterator it, last;
+      for (tie(it, last)= out_edges(orig, g); it != last; ++it)
+      {
+        if (target(*it, g) == dest)
+          return *it;
+      }
+    }
+    else
+    {
+      /* dest has fewer in-edges than orig has out-edges. */
+      graph_traits<Graph>::in_edge_iterator it, last;
+      for (tie(it, last)= in_edges(dest, g); it != last; ++it)
+      {
+        if (source(*it, g) == orig)
+          return *it;
+      }
+    }
+    return optional<Edge>();
+  }
+
+  /*
+    Look up a vertex by its external id using Boost's named-vertex lookup
+    (enabled by the internal_vertex_name<VertexInfo> specialisation).
+    Returns an empty optional if no vertex has that id.
+  */
+  optional<Vertex>
+  oqgraph_share::find_vertex(VertexID id) const
+  {
+    return boost::graph::find_vertex(id, g);
+  }
+
+  /*
+    Remove everything from the shared graph.  Always returns 0.
+    NOTE(review): an open cursor is left untouched here; confirm callers
+    do not fetch from a cursor created before the graph was cleared.
+  */
+  int oqgraph::delete_all() throw()
+  {
+    share->g.clear();
+    return 0;
+  }
+
+  /*
+    Insert the edge orig_id -> dest_id with the given weight, creating
+    either endpoint vertex if it does not exist yet.
+
+    Returns INVALID_WEIGHT for a negative weight, CANNOT_ADD_VERTEX or
+    CANNOT_ADD_EDGE if the graph refuses (or throws during) the insertion,
+    DUPLICATE_EDGE if the edge already exists and `replace` is false, and
+    OK otherwise.  With `replace` set, an existing edge just gets the new
+    weight.
+  */
+  int oqgraph::insert_edge(
+      VertexID orig_id, VertexID dest_id, EdgeWeight weight, bool replace) throw()
+  {
+    if (weight < 0)
+      return INVALID_WEIGHT;
+
+    /* Origin vertex: look it up, create it on a miss. */
+    optional<Vertex> orig= share->find_vertex(orig_id);
+    if (!orig)
+    {
+      try
+      {
+        orig= add_vertex(VertexInfo(orig_id), share->g);
+        if (orig == graph_traits<Graph>::null_vertex())
+          return CANNOT_ADD_VERTEX;
+      }
+      catch (...)
+      {
+        return CANNOT_ADD_VERTEX;
+      }
+    }
+
+    /* Destination vertex: same treatment. */
+    optional<Vertex> dest= share->find_vertex(dest_id);
+    if (!dest)
+    {
+      try
+      {
+        dest= add_vertex(VertexInfo(dest_id), share->g);
+        if (dest == graph_traits<Graph>::null_vertex())
+          return CANNOT_ADD_VERTEX;
+      }
+      catch (...)
+      {
+        return CANNOT_ADD_VERTEX;
+      }
+    }
+
+    /* The edge itself: reuse an existing one only when replacing. */
+    optional<Edge> edge= share->find_edge(*orig, *dest);
+    if (edge)
+    {
+      if (!replace)
+        return DUPLICATE_EDGE;
+    }
+    else
+    {
+      try
+      {
+        bool inserted= false;
+        tie(edge, inserted)= add_edge(*orig, *dest, share->g);
+        if (!inserted)
+          return CANNOT_ADD_EDGE;
+      }
+      catch (...)
+      {
+        return CANNOT_ADD_EDGE;
+      }
+    }
+
+    share->weightmap[*edge]= weight;
+    return OK;
+  }
+
+  /*
+    Delete the edge the open cursor is currently positioned on, then drop
+    either endpoint that is left without any edge.
+
+    Returns EDGE_NOT_FOUND if no cursor is open or the cursor's current
+    reference carries no edge, OK otherwise.
+  */
+  int oqgraph::delete_edge(current_row_st) throw()
+  {
+    /* No cursor means there is no current row; never dereference NULL. */
+    if (!cursor)
+      return EDGE_NOT_FOUND;
+    reference ref;
+    cursor->current(ref);
+    optional<Edge> edge;
+    if (!(edge= ref.edge()))
+      return EDGE_NOT_FOUND;
+    Vertex orig= source(*edge, share->g);
+    Vertex dest= target(*edge, share->g);
+    remove_edge(*edge, share->g);
+    /*
+      Graph uses vecS for its vertex list, so remove_vertex() renumbers
+      every vertex above the removed one.  Remove the higher-numbered
+      endpoint first so the other descriptor is still valid, and handle a
+      self-loop (orig == dest) only once.
+      NOTE(review): confirm that the id lookup behind find_vertex() stays
+      consistent after remove_vertex() on a vecS vertex list.
+    */
+    const Vertex hi= orig < dest ? dest : orig;
+    const Vertex lo= orig < dest ? orig : dest;
+    if (!degree(hi, share->g))
+      remove_vertex(hi, share->g);
+    if (lo != hi && !degree(lo, share->g))
+      remove_vertex(lo, share->g);
+    return OK;
+  }
+
+  /*
+    Modify the edge the open cursor is currently positioned on.
+
+    orig_id, dest_id and weight are optional (NULL means "leave as is").
+    If either endpoint id differs from the current one, the edge is moved:
+    missing vertices are created, an edge between the new endpoints is
+    found or added, the old weight is copied over and the old edge is
+    removed.  A non-NULL weight is applied last.
+
+    Returns EDGE_NOT_FOUND if there is no cursor or no current edge,
+    INVALID_WEIGHT for a negative weight, CANNOT_ADD_VERTEX /
+    CANNOT_ADD_EDGE if an insertion fails or throws, DUPLICATE_EDGE if the
+    target edge already exists and `replace` is false, OK otherwise.
+  */
+  int oqgraph::modify_edge(current_row_st,
+      VertexID *orig_id, VertexID *dest_id, EdgeWeight *weight,
+      bool replace) throw()
+  {
+    if (!cursor)
+      return EDGE_NOT_FOUND;
+    reference ref;
+    cursor->current(ref);
+    optional<Edge> edge;
+    if (!(edge= ref.edge()))
+      return EDGE_NOT_FOUND;
+    if (weight && *weight < 0)
+      return INVALID_WEIGHT;
+
+    /* Start from the edge's present endpoints. */
+    optional<Vertex> orig= source(*edge, share->g),
+                     dest= target(*edge, share->g);
+
+    /* Does the caller ask for an endpoint different from the present one? */
+    bool orig_neq= orig_id ? share->idmap[*orig] != *orig_id : 0;
+    bool dest_neq= dest_id ? share->idmap[*dest] != *dest_id : 0;
+    if (orig_neq || dest_neq)
+    {
+      optional<Edge> new_edge;
+      /* Resolve the new origin, creating the vertex if necessary. */
+      if (orig_neq && !(orig= share->find_vertex(*orig_id)))
+      {
+        try
+        {
+          orig= add_vertex(VertexInfo(*orig_id), share->g);
+          if (orig == graph_traits<Graph>::null_vertex())
+            return CANNOT_ADD_VERTEX;
+        }
+        catch (...)
+        {
+          return CANNOT_ADD_VERTEX;
+        }
+      }
+      /* Resolve the new destination, creating the vertex if necessary. */
+      if (dest_neq && !(dest= share->find_vertex(*dest_id)))
+      {
+        try
+        {
+          dest= add_vertex(VertexInfo(*dest_id), share->g);
+          if (dest == graph_traits<Graph>::null_vertex())
+            return CANNOT_ADD_VERTEX;
+        }
+        catch (...)
+        {
+          return CANNOT_ADD_VERTEX;
+        }
+      }
+      /* Find or create the edge between the new endpoints. */
+      if (!(new_edge= share->find_edge(*orig, *dest)))
+      {
+        try
+        {
+          bool inserted;
+          tie(new_edge, inserted)= add_edge(*orig, *dest, share->g);
+          if (!inserted)
+            return CANNOT_ADD_EDGE;
+        }
+        catch (...)
+        {
+          return CANNOT_ADD_EDGE;
+        }
+      }
+      else
+      {
+        if (!replace)
+          return DUPLICATE_EDGE;
+      }
+      /* Carry the old weight over, then drop the old edge. */
+      share->weightmap[*new_edge]= share->weightmap[*edge];
+      remove_edge(*edge, share->g);
+      edge= new_edge;
+    }
+    if (weight)
+      share->weightmap[*edge]= *weight;
+    return OK;
+  }
+
+  /*
+    Set the weight of the existing edge orig_id -> dest_id.
+
+    Returns INVALID_WEIGHT for a negative weight, EDGE_NOT_FOUND if either
+    vertex or the edge between them does not exist, OK otherwise.
+  */
+  int oqgraph::modify_edge(
+      VertexID orig_id, VertexID dest_id, EdgeWeight weight) throw()
+  {
+    if (weight < 0)
+      return INVALID_WEIGHT;
+
+    optional<Vertex> orig= share->find_vertex(orig_id);
+    if (!orig)
+      return EDGE_NOT_FOUND;
+
+    optional<Vertex> dest= share->find_vertex(dest_id);
+    if (!dest)
+      return EDGE_NOT_FOUND;
+
+    optional<Edge> edge= share->find_edge(*orig, *dest);
+    if (!edge)
+      return EDGE_NOT_FOUND;
+
+    share->weightmap[*edge]= weight;
+    return OK;
+  }
+
+
+  /*
+    Delete the edge orig_id -> dest_id, then drop either endpoint that is
+    left without any edge.
+
+    Returns EDGE_NOT_FOUND if either vertex or the edge between them does
+    not exist, OK otherwise.
+  */
+  int oqgraph::delete_edge(VertexID orig_id, VertexID dest_id) throw()
+  {
+    optional<Vertex> orig, dest;
+    optional<Edge> edge;
+
+    if (!(orig= share->find_vertex(orig_id)))
+      return EDGE_NOT_FOUND;
+    if (!(dest= share->find_vertex(dest_id)))
+      return EDGE_NOT_FOUND;
+    if (!(edge= share->find_edge(*orig, *dest)))
+      return EDGE_NOT_FOUND;
+    remove_edge(*edge, share->g);
+    /*
+      Graph uses vecS for its vertex list, so remove_vertex() renumbers
+      every vertex above the removed one.  Remove the higher-numbered
+      endpoint first so the other descriptor is still valid, and handle a
+      self-loop (orig == dest) only once.
+      NOTE(review): confirm that the id lookup behind find_vertex() stays
+      consistent after remove_vertex() on a vecS vertex list.
+    */
+    const Vertex hi= *orig < *dest ? *dest : *orig;
+    const Vertex lo= *orig < *dest ? *orig : *dest;
+    if (!degree(hi, share->g))
+      remove_vertex(hi, share->g);
+    if (lo != hi && !degree(lo, share->g))
+      remove_vertex(lo, share->g);
+    return OK;
+  }
+
+
+  /*
+    Start a new search and leave its results in a freshly created cursor.
+
+    latch, orig_id and dest_id are optional (NULL means "not given").
+    *latch selects the algorithm (its low 16 bits); each non-NULL id adds
+    the matching HAVE_ORIG / HAVE_DEST flag and is looked up in the graph.
+    The values that were given are also remembered in row_info.
+
+    Any previously open cursor is destroyed first.  The function always
+    returns 0; when nothing matches, the cursor is simply empty or absent.
+  */
+  int oqgraph::search(int *latch, VertexID *orig_id, VertexID *dest_id) throw()
+  {
+      optional<Vertex> orig, dest;
+      int op= 0, seq= 0;
+      enum {
+        NO_SEARCH = 0,
+        DIJKSTRAS = 1,
+        BREADTH_FIRST = 2,
+
+	ALGORITHM = 0x0ffff,
+        HAVE_ORIG = 0x10000,
+        HAVE_DEST = 0x20000,
+      };
+
+      delete cursor; cursor= 0;
+      row_info= empty_row;
+      /* Build op from the arguments that were actually supplied. */
+      if ((row_info.latch_indicator= latch))
+        op= ALGORITHM & (row_info.latch= *latch);
+      if ((row_info.orig_indicator= orig_id) && (op|= HAVE_ORIG))
+        orig= share->find_vertex((row_info.orig= *orig_id));
+      if ((row_info.dest_indicator= dest_id) && (op|= HAVE_DEST))
+        dest= share->find_vertex((row_info.dest= *dest_id));
+    //try
+    //{
+      switch (op)
+      {
+      /* No algorithm, origin given: list the out-edges of the origin. */
+      case NO_SEARCH | HAVE_ORIG | HAVE_DEST:
+      case NO_SEARCH | HAVE_ORIG:
+        if ((cursor= new (std::nothrow) stack_cursor(share)) && orig)
+        {
+          graph_traits<Graph>::out_edge_iterator ei, ei_end;
+          for (tie(ei, ei_end)= out_edges(*orig, share->g); ei != ei_end; ++ei)
+          {
+            Vertex v= target(*ei, share->g);
+            static_cast<stack_cursor*>(cursor)->
+                results.push(reference(++seq, v, *ei, share->weightmap[*ei]));
+          }
+        }
+        /* fall through */
+      /*
+        No algorithm, destination given: list the in-edges of the
+        destination.  Reached by fall-through as well, in which case the
+        cursor created above is reused and seq keeps counting.
+      */
+      case NO_SEARCH | HAVE_DEST:
+        if ((op & HAVE_DEST) &&
+            (cursor || (cursor= new (std::nothrow) stack_cursor(share))) &&
+	    dest)
+        {
+          graph_traits<Graph>::in_edge_iterator ei, ei_end;
+          for (tie(ei, ei_end)= in_edges(*dest, share->g); ei != ei_end; ++ei)
+          {
+            Vertex v= source(*ei, share->g);
+            static_cast<stack_cursor*>(cursor)->
+                results.push(reference(++seq, v, *ei, share->weightmap[*ei]));
+          }
+        }
+        break;
+
+      /* Nothing given at all: iterate over every vertex. */
+      case NO_SEARCH:
+        cursor= new (std::nothrow) vertices_cursor(share);
+        break;
+
+      /*
+        Dijkstra from origin to destination.  The visitor throws once the
+        destination is finished, after pushing the path onto the cursor;
+        the catch below swallows that.
+      */
+      case DIJKSTRAS | HAVE_ORIG | HAVE_DEST:
+        if ((cursor= new (std::nothrow) stack_cursor(share)) && orig && dest)
+        {
+          std::vector<Vertex> p(num_vertices(share->g));
+          std::vector<EdgeWeight> d(num_vertices(share->g));
+          oqgraph_goal<true, on_finish_vertex>
+              vis(*dest, p.begin(), static_cast<stack_cursor*>(cursor));
+          /* The origin is its own predecessor: marks the start of a path. */
+          p[share->indexmap[*orig]]= *orig;
+          try
+          {
+            dijkstra_shortest_paths(share->g, *orig,
+                weight_map(
+                  share->weightmap
+                ).
+                distance_map(
+                    make_iterator_property_map(d.begin(), share->indexmap)
+                ).
+                predecessor_map(
+                    make_iterator_property_map(p.begin(), share->indexmap)
+                ).
+                visitor(
+                    make_dijkstra_visitor(vis)
+                )
+            );
+          }
+          catch (...)
+          { /* printf("found\n"); */ }
+        }
+        break;
+
+      /*
+        Breadth-first search from origin to destination.  Predecessors are
+        recorded on tree edges; the visitor throws when the destination is
+        discovered, after pushing the path onto the cursor.
+      */
+      case BREADTH_FIRST | HAVE_ORIG | HAVE_DEST:
+        if ((cursor= new (std::nothrow) stack_cursor(share)) && orig && dest)
+        {
+          std::vector<Vertex> p(num_vertices(share->g));
+          oqgraph_goal<false, on_discover_vertex>
+              vis(*dest, p.begin(), static_cast<stack_cursor*>(cursor));
+          p[share->indexmap[*orig]]= *orig;
+          try
+          {
+            breadth_first_search(share->g, *orig,
+                visitor(make_bfs_visitor(
+                    std::make_pair(
+                        record_predecessors(
+                            make_iterator_property_map(p.begin(), share->indexmap),
+                            on_tree_edge()
+                        ),
+                        vis)
+                    )
+                )
+            );
+          }
+          catch (...)
+          { /* printf("found\n"); */ }
+        }
+        break;
+
+      /*
+        Origin only: run the chosen algorithm over everything reachable
+        from the origin; the visitor pushes one (sequence, vertex,
+        distance) reference per finished vertex.
+        NOTE(review): the guard is (orig || dest) but *orig is
+        dereferenced; dest is never set in these cases, so the guard is
+        effectively just `orig`.
+        NOTE(review): unlike the two-endpoint cases, these calls are not
+        wrapped in try/catch although the function is declared throw().
+      */
+      case DIJKSTRAS | HAVE_ORIG:
+      case BREADTH_FIRST | HAVE_ORIG:
+        if ((cursor= new (std::nothrow) stack_cursor(share)) && (orig || dest))
+        {
+          std::vector<Vertex> p(num_vertices(share->g));
+          std::vector<EdgeWeight> d(num_vertices(share->g));
+          oqgraph_visit_dist vis(p.begin(), d.begin(),
+                                 static_cast<stack_cursor*>(cursor));
+          p[share->indexmap[*orig]]= *orig;
+          switch (ALGORITHM & op)
+          {
+          case DIJKSTRAS:
+            dijkstra_shortest_paths(share->g, *orig,
+                weight_map(
+                  share->weightmap
+                ).
+                distance_map(
+                    make_iterator_property_map(d.begin(), share->indexmap)
+                ).
+                predecessor_map(
+                    make_iterator_property_map(p.begin(), share->indexmap)
+                ).
+                visitor(
+                    make_dijkstra_visitor(vis)
+                )
+            );
+            break;
+          case BREADTH_FIRST:
+            breadth_first_search(share->g, *orig,
+                visitor(make_bfs_visitor(
+                    std::make_pair(
+                        record_predecessors(
+                            make_iterator_property_map(p.begin(),
+                                                       share->indexmap),
+                            on_tree_edge()
+                        ),
+                    std::make_pair(
+                        record_distances(
+                            make_iterator_property_map(d.begin(),
+                                                       share->indexmap),
+                            on_tree_edge()
+                        ),
+                        vis
+                    ))
+                ))
+            );
+            break;
+          default:
+            abort();
+          }
+        }
+        break;
+
+      /*
+        Destination only: same as the origin-only cases, but run on the
+        reversed graph starting from the destination.
+        NOTE(review): the guard is (orig || dest) but *dest is
+        dereferenced; orig is never set in these cases, so the guard is
+        effectively just `dest`.
+      */
+      case BREADTH_FIRST | HAVE_DEST:
+      case DIJKSTRAS | HAVE_DEST:
+        if ((cursor= new (std::nothrow) stack_cursor(share)) && (orig || dest))
+        {
+          std::vector<Vertex> p(num_vertices(share->g));
+          std::vector<EdgeWeight> d(num_vertices(share->g));
+          oqgraph_visit_dist vis(p.begin(), d.begin(),
+                                 static_cast<stack_cursor*>(cursor));
+          reverse_graph<Graph> r(share->g);
+          p[share->indexmap[*dest]]= *dest;
+          switch (ALGORITHM & op)
+          {
+          case DIJKSTRAS:
+            dijkstra_shortest_paths(r, *dest,
+                weight_map(
+                  share->weightmap
+                ).
+                distance_map(
+                    make_iterator_property_map(d.begin(), share->indexmap)
+                ).
+                predecessor_map(
+                    make_iterator_property_map(p.begin(), share->indexmap)
+                ).
+                visitor(
+                    make_dijkstra_visitor(vis)
+                )
+            );
+            break;
+          case BREADTH_FIRST:
+            breadth_first_search(r, *dest,
+                visitor(make_bfs_visitor(
+                    std::make_pair(
+                        record_predecessors(
+                            make_iterator_property_map(p.begin(),
+                                                       share->indexmap),
+                            on_tree_edge()
+                        ),
+                    std::make_pair(
+                        record_distances(
+                            make_iterator_property_map(d.begin(),
+                                                       share->indexmap),
+                            on_tree_edge()
+                        ),
+                        vis
+                    ))
+                ))
+            );
+            break;
+          default:
+            abort();
+          }
+        }
+        break;
+
+      /* Any other combination: no cursor is created. */
+      default:
+        break;
+      }
+      return 0;
+    //}
+    //catch (...)
+    //{
+    //  return MISC_FAIL;
+    //}
+  }
+
+  /*
+    Fetch the next row from the open cursor into `result`.
+    Returns NO_MORE_DATA if no cursor is open, otherwise whatever the
+    cursor returns.
+  */
+  int oqgraph::fetch_row(row& result) throw()
+  {
+    if (!cursor)
+      return NO_MORE_DATA;
+    return cursor->fetch_row(row_info, result);
+  }
+
+  /*
+    Fetch the row identified by a reference previously stored by
+    row_ref().  ref_ptr is interpreted as a pointer to a `reference`.
+    Returns NO_MORE_DATA if no cursor is open, otherwise whatever the
+    cursor returns.
+  */
+  int oqgraph::fetch_row(row& result, const void* ref_ptr) throw()
+  {
+    const reference &ref= *(const reference*) ref_ptr;
+    if (!cursor)
+      return NO_MORE_DATA;
+    return cursor->fetch_row(row_info, result, ref);
+  }
+
+  /*
+    Store the reference of the current row into the buffer at ref_ptr,
+    which is treated as a `reference` object.  Without an open cursor an
+    empty reference is stored instead.
+  */
+  void oqgraph::row_ref(void *ref_ptr) throw()
+  {
+    reference *const out= (reference*) ref_ptr;
+    if (!cursor)
+    {
+      *out= reference();
+      return;
+    }
+    cursor->current(*out);
+  }
+
+  /*
+    Prepare for row access through an edges cursor.  A fresh edges_cursor
+    replaces the current one when `scan` is set or when no cursor is open;
+    row_info is reset in every case.
+
+    Returns MISC_FAIL if the cursor cannot be allocated, OK otherwise.
+  */
+  int oqgraph::random(bool scan) throw()
+  {
+    const bool need_new_cursor= scan || !cursor;
+    if (need_new_cursor)
+    {
+      delete cursor;
+      cursor= 0;
+      cursor= new (std::nothrow) edges_cursor(share);
+      if (!cursor)
+        return MISC_FAIL;
+    }
+    row_info= empty_row;
+    return OK;
+  }
+
+  /* Destroy an oqgraph obtained from create(oqgraph_share*). */
+  void oqgraph::free(oqgraph *graph) throw()
+  {
+    delete graph;
+  }
+
+  /* Destroy a share obtained from create(). */
+  void oqgraph::free(oqgraph_share *graph) throw()
+  {
+    delete graph;
+  }
+
+  /* Size of the buffer row_ref() writes to and fetch_row() reads from. */
+  const size_t oqgraph::sizeof_ref= sizeof(reference);
+}
+
+/*
+  Return the row for the reference on top of the result stack and pop it.
+  An empty stack resets `last` and yields NO_MORE_DATA.  If building the
+  row fails, the entry stays on the stack and the error is returned.
+*/
+int stack_cursor::fetch_row(const row &row_info, row &result)
+{
+  if (results.empty())
+  {
+    last= reference();
+    return oqgraph::NO_MORE_DATA;
+  }
+
+  const int res= fetch_row(row_info, result, results.top());
+  if (res)
+    return res;
+
+  results.pop();
+  return oqgraph::OK;
+}
+
+/*
+  Build the row for `ref`, which also becomes the cursor's current
+  reference.  A reference without a vertex yields NO_MORE_DATA and leaves
+  `result` untouched.  Otherwise `result` starts as a copy of row_info and
+  receives the sequence, link (vertex id) and weight carried by `ref`,
+  each with its indicator set according to whether it is present.
+*/
+int stack_cursor::fetch_row(const row &row_info, row &result,
+                            const reference &ref)
+{
+  last= ref;
+
+  optional<Vertex> v= last.vertex();
+  if (!v)
+    return oqgraph::NO_MORE_DATA;
+
+  result= row_info;
+
+  optional<int> seq= last.sequence();
+  result.seq_indicator= seq ? true : false;
+  if (seq)
+    result.seq= *seq;
+
+  /* The vertex is known to be present at this point. */
+  result.link_indicator= true;
+  result.link= share->idmap[*v];
+
+  optional<EdgeWeight> w= last.weight();
+  result.weight_indicator= w ? true : false;
+  if (w)
+    result.weight= *w;
+
+  return oqgraph::OK;
+}
+
+
+/*
+  Return the row for the vertex at the cursor position and advance.
+  The vertex is found by stepping `position` times from the first vertex;
+  running off the end produces an empty reference, which the three-argument
+  overload reports as NO_MORE_DATA.
+*/
+int vertices_cursor::fetch_row(const row &row_info, row &result)
+{
+  vertex_iterator cur, stop;
+  tie(cur, stop)= vertices(share->g);
+  for (size_t skip= position; skip && cur != stop; --skip)
+    ++cur;
+
+  reference ref;
+  if (cur != stop)
+    ref= reference(position+1, *cur);
+
+  const int res= fetch_row(row_info, result, ref);
+  if (res)
+    return res;
+
+  position++;
+  return oqgraph::OK;
+}
+
+/*
+  Build the row for `ref`, which also becomes the cursor's current
+  reference.  `result` is always overwritten with a copy of row_info; if
+  `ref` carries a vertex, its id is stored as the link and OK is returned,
+  otherwise NO_MORE_DATA.  With DISPLAY_VERTEX_INFO defined, seq receives
+  the vertex degree and weight the mean weight of its edges.
+*/
+int vertices_cursor::fetch_row(const row &row_info, row &result,
+                               const reference &ref)
+{
+  last= ref;
+  optional<Vertex> v= last.vertex();
+  result= row_info;
+  if (!v)
+    return oqgraph::NO_MORE_DATA;
+
+  result.link_indicator= 1;
+  result.link= share->idmap[*v];
+#ifdef DISPLAY_VERTEX_INFO
+  result.seq_indicator= 1;
+  if ((result.seq= degree(*v, share->g)))
+  {
+    EdgeWeight weight= 0;
+    graph_traits<Graph>::in_edge_iterator iei, iei_end;
+    for (tie(iei, iei_end)= in_edges(*v, share->g); iei != iei_end; ++iei)
+      weight+= share->weightmap[*iei];
+    graph_traits<Graph>::out_edge_iterator oei, oei_end;
+    for (tie(oei, oei_end)= out_edges(*v, share->g); oei != oei_end; ++oei)
+      weight+= share->weightmap[*oei];
+    result.weight_indicator= 1;
+    result.weight= weight / result.seq;
+  }
+#endif
+  return oqgraph::OK;
+}
+
+/*
+  Return the row for the edge at the cursor position and advance.
+  The edge is found by stepping `position` times from the first edge;
+  running off the end produces an empty reference, which the three-argument
+  overload reports as NO_MORE_DATA.
+*/
+int edges_cursor::fetch_row(const row &row_info, row &result)
+{
+  edge_iterator cur, stop;
+  tie(cur, stop)= edges(share->g);
+  for (size_t skip= position; skip && cur != stop; --skip)
+    ++cur;
+
+  reference ref;
+  if (cur != stop)
+    ref= reference(position+1, *cur);
+
+  const int res= fetch_row(row_info, result, ref);
+  if (res)
+    return res;
+
+  ++position;
+  return oqgraph::OK;
+}
+
+/*
+  Build the row for `ref`, which also becomes the cursor's current
+  reference.  A reference without an edge yields NO_MORE_DATA and leaves
+  `result` untouched.  Otherwise `result` starts as a copy of row_info and
+  receives the ids of the edge's two endpoints and the edge's weight.
+*/
+int edges_cursor::fetch_row(const row &row_info, row &result,
+                            const reference &ref)
+{
+  last= ref;
+  optional<Edge> edge= last.edge();
+  if (!edge)
+    return oqgraph::NO_MORE_DATA;
+
+  result= row_info;
+
+  result.weight_indicator= 1;
+  result.dest_indicator= 1;
+  result.orig_indicator= 1;
+
+  result.orig= share->idmap[ source( *edge, share->g ) ];
+  result.dest= share->idmap[ target( *edge, share->g ) ];
+  result.weight= share->weightmap[ *edge ];
+  return oqgraph::OK;
+}
+
+/*
+  Definition of boost::throw_exception: aborts the process instead of
+  throwing.
+  NOTE(review): Boost expects the user to supply this function when
+  BOOST_NO_EXCEPTIONS is defined -- presumably that is how this file is
+  built; confirm against the build flags.
+*/
+namespace boost {
+  GRAPHCORE_INTERNAL void throw_exception(std::exception const&)
+  {
+    abort();
+  }
+}
diff --git a/storage/oqgraph/graphcore.h b/storage/oqgraph/graphcore.h
new file mode 100644
index 00000000000..4aaddb2796f
--- /dev/null
+++ b/storage/oqgraph/graphcore.h
@@ -0,0 +1,116 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#ifndef oq_graphcore_h_
+#define oq_graphcore_h_
+
+/* #define GRAPHCORE_INTERNAL __attribute__((visibility("hidden"))) */
+#define GRAPHCORE_INTERNAL
+
+#include "graphcore-types.h"
+
+namespace open_query
+{
+  class oqgraph_share;
+  class oqgraph_cursor;
+
+  /*
+    One row as exchanged with the storage-engine handler.  Every value
+    field has a same-named *_indicator flag; the cursors set the flag to 1
+    together with the value they fill in.
+    NOTE(review): a clear flag presumably means SQL NULL -- confirm in
+    ha_oqgraph::fill_record().
+  */
+  struct row
+  {
+    bool latch_indicator;
+    bool orig_indicator;
+    bool dest_indicator;
+    bool weight_indicator;
+    bool seq_indicator;
+    bool link_indicator;
+
+    int latch;
+    VertexID orig;
+    VertexID dest;
+    EdgeWeight weight;
+    unsigned seq;
+    VertexID link;
+  };
+
+  /*
+    Per-handler view onto a shared graph (oqgraph_share) plus the cursor of
+    the query currently running on it.  Constructor and destructor are
+    private: objects are obtained with create() and released with free().
+    Every member is declared throw(), so failures are reported through
+    return values only.
+  */
+  class oqgraph
+  {
+    oqgraph_share *const share;
+    oqgraph_cursor *cursor;
+    row row_info;
+
+    inline oqgraph(oqgraph_share*) throw();
+    inline ~oqgraph() throw();
+  public:
+
+    /* Result codes returned as int by the operations below; OK is 0. */
+    enum error_code
+    {
+      OK= 0,
+      NO_MORE_DATA,
+      EDGE_NOT_FOUND,
+      INVALID_WEIGHT,
+      DUPLICATE_EDGE,
+      CANNOT_ADD_VERTEX,
+      CANNOT_ADD_EDGE,
+      MISC_FAIL
+    };
+
+    /*
+      Empty tag type: passing current_row() selects the modify_edge /
+      delete_edge overloads that act on the row the cursor is positioned on
+      rather than on an explicit (orig, dest) pair.
+    */
+    struct current_row_st {};
+    static inline current_row_st current_row()
+    { return current_row_st(); }
+
+    unsigned vertices_count() const throw();
+    unsigned edges_count() const throw();
+
+    int delete_all(void) throw();
+
+    /* Edge operations addressed by (orig, dest[, weight]). */
+    int insert_edge(VertexID, VertexID, EdgeWeight, bool=0) throw();
+    int modify_edge(VertexID, VertexID, EdgeWeight) throw();
+    int delete_edge(VertexID, VertexID) throw();
+
+    /*
+      Edge operations on the cursor's current row.  The handler passes a
+      null pointer for every attribute it does not want changed.
+    */
+    int modify_edge(current_row_st,
+                    VertexID*, VertexID*, EdgeWeight*, bool=0) throw();
+    int delete_edge(current_row_st) throw();
+
+    /* insert_edge() with its fourth (bool) argument set to true. */
+    int replace_edge(VertexID orig, VertexID dest, EdgeWeight weight) throw()
+    { return insert_edge(orig, dest, weight, true); }
+
+    int search(int*, VertexID*, VertexID*) throw();
+    int random(bool) throw();
+
+    /* Row retrieval; row_ref() / fetch_row(row&, const void*) work on an
+       opaque position buffer of sizeof_ref bytes. */
+    int fetch_row(row&) throw();
+    int fetch_row(row&, const void*) throw();
+    void row_ref(void*) throw();
+
+    /* Factories: create() without arguments makes a new shared graph,
+       create(share) makes a handle onto an existing one. */
+    static oqgraph* create(oqgraph_share*) throw();
+    static oqgraph_share *create() throw();
+
+    static void free(oqgraph*) throw();
+    static void free(oqgraph_share*) throw();
+
+    /* The handler uses this as its ref_length. */
+    static const size_t sizeof_ref;
+  };
+
+}
+#endif
diff --git a/storage/oqgraph/graphstore.c b/storage/oqgraph/graphstore.c
new file mode 100644
index 00000000000..c5478b56ca5
--- /dev/null
+++ b/storage/oqgraph/graphstore.c
@@ -0,0 +1,356 @@
+/*
+ * Graph Engine - Copyright (C) 2007 by Arjen Lentz (arjen@openquery.com.au)
+ * graphstore.c internal storage system
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <my_global.h>
+#include <my_sys.h>
+#include "graphstore.h"
+
+
+/*
+	make sure a vertex with the given id is in the list
+	NOTE! gspp is ptr to base ptr (the list head may be created here)
+
+	an id that is already present is left alone and counts as success;
+	a new vertex is appended at the end of the list. id 0 is rejected.
+
+	returns 1 for ok, 0 for error
+*/
+static int _add_vertex (GRAPHSTORE **gspp, GRAPH_VERTEXID id)
+{
+	GRAPHSTORE **linkp;
+	GRAPHSTORE *node;
+
+	/* need somewhere to hang the list, and 0 is not a valid id */
+	if (gspp == NULL || !id)
+		return 0;
+
+	/*
+		one walk does both jobs: look for the id, and leave linkp on the
+		terminating NULL link (the head itself when the list is empty)
+	*/
+	for (linkp = gspp; *linkp != NULL; linkp = &(*linkp)->next) {
+		if ((*linkp)->vertex->id == id)
+			return 1;	/* already there, nothing to do */
+	}
+
+	/* zero-filled list node plus its vertex */
+	node = my_malloc(sizeof (GRAPHSTORE),MYF(MY_ZEROFILL));
+	if (node == NULL)
+		return 0;
+
+	node->vertex = my_malloc(sizeof (GRAPH_VERTEX),MYF(MY_ZEROFILL));
+	if (node->vertex == NULL) {
+		my_free(node,MYF(0));
+		return 0;
+	}
+	node->vertex->id = id;
+
+	/* hook it in at the end (or as the new head) */
+	*linkp = node;
+	return 1;
+}
+
+
+/*
+	linear search of the vertex list for the given id
+
+	returns ptr to the vertex, or NULL when the id is not in the list
+*/
+static GRAPH_VERTEX *_find_vertex (GRAPHSTORE *gsp, GRAPH_VERTEXID id)
+{
+	GRAPHSTORE *cur;
+
+	for (cur = gsp; cur != NULL; cur = cur->next) {
+		if (cur->vertex->id == id)
+			return cur->vertex;
+	}
+
+	/* ran off the end: no such vertex */
+	return NULL;
+}
+
+
+/*
+	add a directed edge origid -> destid with the given weight
+	both vertices must already exist; graphstore_insert() sees to that
+
+	return 1 for ok, 0 for error (unknown vertex, edge already exists,
+	alloc error)
+*/
+static int _add_edge (GRAPHSTORE *gsp, GRAPH_VERTEXID origid, GRAPH_VERTEXID destid, GRAPH_WEIGHT weight)
+{
+	GRAPH_VERTEX *from, *to;
+	GRAPH_EDGE *scan, *edge;
+
+	/* look up both end points, origin first */
+	from = _find_vertex(gsp,origid);
+	if (from == NULL)
+		return 0;
+	to = _find_vertex(gsp,destid);
+	if (to == NULL)
+		return 0;
+
+	/* refuse a second edge to the same destination */
+	scan = from->forward_edge;
+	while (scan != NULL) {
+		if (scan->vertex->id == destid)
+			return 0;
+		scan = scan->next_edge;
+	}
+
+	edge = my_malloc(sizeof (GRAPH_EDGE),MYF(MY_ZEROFILL));
+	if (edge == NULL)
+		return 0;
+
+	edge->vertex = to;
+	edge->weight = weight;
+
+	/* push onto the front of the origin's edge chain */
+	edge->next_edge = from->forward_edge;
+	from->forward_edge = edge;
+
+	return 1;
+}
+
+
+/*
+	append a copy of the tuple as a new row at the end of the graph set
+	NOTE! gsetpp is ptr to base ptr (an empty set gets its first row here)
+
+	returns 1 for ok, 0 for error (NULL argument or alloc error)
+*/
+static int _add_graph_set (GRAPH_SET **gsetpp, GRAPH_TUPLE *gtp)
+{
+	GRAPH_SET **tailp;
+	GRAPH_SET *row;
+
+	if (gsetpp == NULL)
+		return 0;
+	if (gtp == NULL)
+		return 0;
+
+	row = my_malloc(sizeof (GRAPH_SET),MYF(MY_ZEROFILL));
+	if (row == NULL)
+		return 0;
+
+	/* the set keeps its own copy of the data */
+	memcpy(&row->tuple,gtp,sizeof (GRAPH_TUPLE));
+
+	/* find the NULL link that ends the set and hang the row there */
+	tailp = gsetpp;
+	while (*tailp != NULL)
+		tailp = &(*tailp)->next;
+	*tailp = row;
+
+	return 1;
+}
+
+
+/*
+	release every row of a graph set
+
+	returns 1 when rows were freed, 0 when called with NULL
+*/
+int free_graph_set (GRAPH_SET *gsetp)
+{
+	GRAPH_SET *doomed;
+
+	if (gsetp == NULL)
+		return 0;
+
+	do {
+		/* step forward first: the link is gone once the row is freed */
+		doomed = gsetp;
+		gsetp = gsetp->next;
+		my_free(doomed,MYF(0));
+	} while (gsetp != NULL);
+
+	return 1;
+}
+
+
+/*
+	insert new data into graphstore
+	a tuple with only origid adds a vertex; one with origid and destid
+	adds an edge (creating either vertex on the fly when missing)
+	NOTE! gspp is ptr to base ptr
+
+	returns 1 for ok, 0 for error
+*/
+int graphstore_insert (GRAPHSTORE **gspp, GRAPH_TUPLE *gtp)
+{
+	/* nothing can be done without a store, a tuple and an orig vertex */
+	if (gspp == NULL || gtp == NULL)
+		return 0;
+	if (!gtp->origid)
+		return 0;
+
+	if (gtp->destid) {
+		/*
+			edge: make sure both end points exist first.
+			results are deliberately ignored here; if a vertex could not
+			be created, _add_edge() fails to find it and reports the error.
+		*/
+		_add_vertex(gspp,gtp->origid);
+		_add_vertex(gspp,gtp->destid);
+		return _add_edge(*gspp,gtp->origid,gtp->destid,gtp->weight);
+	}
+
+	/* no destid, so this is a plain vertex */
+	return _add_vertex(gspp,gtp->origid);
+}
+
+
+/*
+	this is an internal function used by graphstore_query()
+
+	find any path from originating vertex to destid
+	if found, add to the result set on the way back
+	(rows are therefore appended destination first, origin last)
+	NOTE: recursive function!
+
+	A vertex that is already on the path being explored is never entered
+	again, so a self-loop or a cycle in the graph cannot cause unbounded
+	recursion. A path that revisits a vertex could never have been
+	reported anyway, so results for searches that used to finish are
+	unchanged.
+
+	gvp is the vertex to search from, depth its distance from the origin.
+
+	returns 1 for hit, 0 for nothing, -1 for error
+*/
+
+/* one vertex of the path being explored; frames live on the C stack */
+struct gs_path_frame {
+	const GRAPH_VERTEX			*vertex;
+	const struct gs_path_frame	*parent;
+};
+
+/* append one result row; returns 1 for ok, -1 for error */
+static int gs_add_path_row (GRAPH_SET **gsetpp, GRAPH_VERTEXID origid, GRAPH_VERTEXID destid, GRAPH_WEIGHT weight, GRAPH_SEQ seq, GRAPH_VERTEXID linkid)
+{
+	GRAPH_TUPLE tup;
+
+	bzero(&tup,sizeof (GRAPH_TUPLE));
+	tup.origid	= origid;
+	tup.destid	= destid;
+	tup.weight	= weight;
+	tup.seq		= seq;
+	tup.linkid	= linkid;
+	return (_add_graph_set(gsetpp,&tup) ? 1 : -1);
+}
+
+/* depth-first search that carries the chain of vertices above gvp */
+static int gs_find_path_guarded (GRAPH_SET **gsetpp, GRAPH_VERTEXID origid, GRAPH_VERTEXID destid, GRAPH_VERTEX *gvp, GRAPH_SEQ depth, const struct gs_path_frame *parent)
+{
+	struct gs_path_frame here;
+	const struct gs_path_frame *fp;
+	GRAPH_EDGE *gep;
+	int res;
+
+	if (gvp->id == destid) {
+		/* found target! the final row carries no edge weight */
+		return gs_add_path_row(gsetpp,origid,destid,0,depth,gvp->id);
+	}
+
+	here.vertex = gvp;
+	here.parent = parent;
+
+	/* walk through all edges for this vertex */
+	for (gep = gvp->forward_edge; gep; gep = gep->next_edge) {
+		/* an edge leading back onto the current path can only loop */
+		for (fp = &here; fp != NULL && fp->vertex != gep->vertex; fp = fp->parent)
+			;
+		if (fp != NULL)
+			continue;
+
+		/* recurse */
+		res = gs_find_path_guarded(gsetpp,origid,destid,gep->vertex,depth+1,&here);
+		if (res < 0)
+			return res;
+		if (res > 0) {
+			/* found somewhere below this one, insert ourselves and return */
+			return gs_add_path_row(gsetpp,origid,destid,gep->weight,depth,gvp->id);
+		}
+	}
+
+	/* nothing found but no error */
+	return 0;
+}
+
+int _find_any_path(GRAPH_SET **gsetpp, GRAPH_VERTEXID origid, GRAPH_VERTEXID destid, GRAPH_VERTEX *gvp, GRAPH_SEQ depth)
+{
+	/* without a start vertex there is nothing to search */
+	if (gvp == NULL)
+		return -1;
+
+	return gs_find_path_guarded(gsetpp,origid,destid,gvp,depth,NULL);
+}
+
+
+/*
+	query graphstore
+	latch specifies what operation to perform:
+		0	return all vertices/edges (optionally only those of gtp->origid)
+		1	find a path between gtp->origid and gtp->destid
+		any other value gives an empty set
+
+	we need to feed the conditions in... (through engine condition pushdown)
+	for now we just presume one condition per field so we just feed in a tuple
+	this also means we can just find constants, not ranges
+
+	return ptr to GRAPH_SET, or NULL for an empty set, bad arguments or
+	an error (a partially built set is released completely before
+	returning NULL)
+	caller must free with free_graph_set()
+*/
+GRAPH_SET *graphstore_query (GRAPHSTORE *gsp, GRAPH_TUPLE *gtp)
+{
+	GRAPH_SET *gsetp = NULL;
+	GRAPH_SET *gsetcurp;
+
+	if (gsp == NULL || gtp == NULL)
+		return (NULL);
+
+	switch (gtp->latch) {
+		case 0: /* return all vertices/edges */
+			{
+				GRAPHSTORE *gscurp;
+				GRAPH_EDGE *gep;
+				GRAPH_TUPLE tup;
+
+				/* walk through all vertices */
+				for (gscurp = gsp; gscurp != NULL; gscurp = gscurp->next) {
+					/* check for condition */
+					if (gtp->origid && gscurp->vertex->id != gtp->origid)
+						continue;
+
+					bzero(&tup,sizeof (GRAPH_TUPLE));
+					tup.origid = gscurp->vertex->id;
+
+					/* no edges? */
+					if (gscurp->vertex->forward_edge == NULL) {
+						/* just add vertex to set */
+						if (!_add_graph_set(&gsetp,&tup)) {
+							/* clean up: release every row built so far */
+							free_graph_set(gsetp);
+							return (NULL);
+						}
+					}
+					else {
+						/* walk through all edges */
+						for (gep = gscurp->vertex->forward_edge; gep; gep = gep->next_edge) {
+							tup.destid	= gep->vertex->id;
+							tup.weight	= gep->weight;
+
+							/* add one row per edge */
+							if (!_add_graph_set(&gsetp,&tup)) {
+								/* clean up: release every row built so far */
+								free_graph_set(gsetp);
+								return (NULL);
+							}
+						}
+					}
+				}
+			}
+			break;
+
+		case 1:	/* find a path between origid and destid */
+				/* yes it'll just go with the first path it finds! */
+			{
+				GRAPH_VERTEX *origvp;
+
+				if (!gtp->origid || !gtp->destid)
+					return NULL;
+
+				/* find both vertices */
+				if ((origvp = _find_vertex(gsp,gtp->origid)) == NULL ||
+					_find_vertex(gsp,gtp->destid) == NULL)
+					return NULL;
+
+				if (_find_any_path(&gsetp,gtp->origid,gtp->destid,origvp,0) < 0) {	/* error? */
+					/* clean up: the path may already be partly recorded */
+					free_graph_set(gsetp);
+					return NULL;
+				}
+			}
+			break;
+
+		default:
+			/* this ends up being an empty set */
+			break;
+	}
+
+	/* Fix up latch column with the proper value - to be relationally correct */
+	for (gsetcurp = gsetp; gsetcurp != NULL; gsetcurp = gsetcurp->next)
+		gsetcurp->tuple.latch = gtp->latch;
+
+	return gsetp;
+}
+
+
+
+/* end of graphstore.c */
+\ No newline at end of file
diff --git a/storage/oqgraph/graphstore.h b/storage/oqgraph/graphstore.h
new file mode 100644
index 00000000000..61862221455
--- /dev/null
+++ b/storage/oqgraph/graphstore.h
@@ -0,0 +1,90 @@
+/*
+ * Graph Engine - Copyright (C) 2007 by Arjen Lentz (arjen@openquery.com.au)
+ * graphstore.h internal storage system
+ *
+ * This header does not define uint16 / uint64 itself; the including file
+ * must provide them first (graphstore.c includes my_global.h before it).
+ */
+#ifndef oq_graphstore_h_
+#define oq_graphstore_h_
+
+//typedef unsigned short uint16;
+//typedef unsigned long long uint64;
+
+
+/*
+	This is essentially what a GRAPH engine table looks like on the MySQL end:
+	CREATE TABLE foo (
+		latch	SMALLINT	UNSIGNED NULL,
+		origid	BIGINT		UNSIGNED NULL,
+		destid	BIGINT		UNSIGNED NULL,
+		weight	BIGINT		UNSIGNED NULL,
+		seq		BIGINT		UNSIGNED NULL,
+		linkid	BIGINT		UNSIGNED NULL
+ 	) ENGINE=OQGRAPH
+*/
+
+
+/*
+	We represent the above in C in the following way:
+*/
+typedef uint16	GRAPH_LATCH;
+typedef uint64	GRAPH_VERTEXID;
+typedef uint64	GRAPH_WEIGHT;
+typedef uint64	GRAPH_SEQ;
+
+typedef struct graph_tuple {
+	GRAPH_LATCH		latch;		/* function 							*/
+	GRAPH_VERTEXID	origid;		/* vertex (should be != 0)				*/
+	GRAPH_VERTEXID	destid;		/* edge									*/
+	GRAPH_WEIGHT	weight;		/* weight								*/
+	GRAPH_SEQ		seq;		/* seq# within (origid)					*/
+	GRAPH_VERTEXID	linkid;		/* current step between origid/destid	*/
+} GRAPH_TUPLE;
+
+/* singly linked list of result rows; release with free_graph_set() */
+typedef struct graph_set {
+	GRAPH_TUPLE			tuple;
+	struct graph_set	*next;
+} GRAPH_SET;
+
+
+/*
+	Internally, sets look nothing like the above
+
+	- We have vertices, connected by edges.
+	- Each vertex' edges are maintained in a linked list.
+	- Edges can be weighted.
+
+	There are some issues with this structure, it'd be a pest to do a delete
+	So for now, let's just not support deletes!
+*/
+/*
+	struct graph_vertex is declared at file scope rather than nested inside
+	struct graph_edge: C treats both spellings alike, but in C++ a nested
+	struct is a member of the enclosing one, which would leave the
+	GRAPH_VERTEX typedef naming a different, incomplete type.
+*/
+struct graph_edge;
+
+typedef struct graph_vertex {
+	GRAPH_VERTEXID		 id;
+	struct graph_edge	*forward_edge;	/* head of this vertex' outgoing edges */
+} GRAPH_VERTEX;
+
+typedef struct graph_edge {
+	GRAPH_VERTEX		*vertex;		/* vertex this edge leads to */
+	GRAPH_WEIGHT	 	 weight;
+	struct graph_edge	*next_edge;		/* next edge of the same origin */
+} GRAPH_EDGE;
+
+
+/*
+	A rough internal storage system for a set
+*/
+/* this below is fully gross and will definitely change */
+typedef struct graphstore {
+	GRAPH_VERTEX		*vertex;	/* changed to ptr when integrating into MySQL */
+	struct graphstore	*next;
+} GRAPHSTORE;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* public function declarations */
+int graphstore_insert (GRAPHSTORE **gspp, GRAPH_TUPLE *gtp);
+GRAPH_SET *graphstore_query (GRAPHSTORE *gsp, GRAPH_TUPLE *gtp);
+int free_graph_set (GRAPH_SET *gsetp);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* oq_graphstore_h_ */
+
+/* end of graphstore.h */
+\ No newline at end of file
diff --git a/storage/oqgraph/ha_oqgraph.cc b/storage/oqgraph/ha_oqgraph.cc
new file mode 100644
index 00000000000..e0c66134858
--- /dev/null
+++ b/storage/oqgraph/ha_oqgraph.cc
@@ -0,0 +1,1041 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+   Portions of this file copyright (C) 2000-2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#define MYSQL_SERVER	// to have THD
+#include "mysql_priv.h"
+#if MYSQL_VERSION_ID >= 50100
+#include <mysql/plugin.h>
+#endif
+
+#ifdef HAVE_OQGRAPH
+
+#include "ha_oqgraph.h"
+#include "graphcore.h"
+
+#define OQGRAPH_STATS_UPDATE_THRESHOLD 10
+
+using namespace open_query;
+
+
+/*
+  State shared by all handler instances that have the same table open.
+  Entries live in oqgraph_open_tables, keyed by `name`, and are created
+  and released by get_share() / free_share() under LOCK_oqgraph.
+*/
+struct oqgraph_info_st
+{
+  THR_LOCK lock;              /* table lock; initialised in get_share() */
+  oqgraph_share *graph;       /* the graph data, from oqgraph::create() */
+  uint use_count;             /* +1 per get_share(), -1 per free_share() */
+  uint key_stat_version;      /* bumped by writers to request a stats refresh */
+  uint records;               /* edges added minus edges deleted via the handler */
+  bool dropped;               /* set by free_share(share, true) */
+  char name[FN_REFLEN+1];     /* hash key, see get_key() */
+};
+
+/* Engine description text; referenced by the pre-5.1 handlerton below. */
+static const char oqgraph_description[]=
+  "Open Query Graph Computation Engine, stored in memory "
+  "(http://openquery.com/graph)";
+
+/*
+  Server-version compatibility layer.
+
+  Before 5.1 the engine is described by a statically initialised
+  handlerton; from 5.1 on the handlerton is filled in by oqgraph_init().
+  The three macros hide API differences between the two generations:
+    STATISTIC_INCREMENT(X)  bump handler status counter X
+    MOVE(X)                 Field method that shifts the field pointer by X
+    RECORDS                 name of the handler's row-count member
+*/
+#if MYSQL_VERSION_ID < 50100
+static bool oqgraph_init();
+
+handlerton oqgraph_hton= {
+  "OQGRAPH",
+  SHOW_OPTION_YES,
+  oqgraph_description,
+  DB_TYPE_OQGRAPH,
+  oqgraph_init,
+  0,       /* slot */
+  0,       /* savepoint size. */
+  NULL,    /* close_connection */
+  NULL,    /* savepoint */
+  NULL,    /* rollback to savepoint */
+  NULL,    /* release savepoint */
+  NULL,    /* commit */
+  NULL,    /* rollback */
+  NULL,    /* prepare */
+  NULL,    /* recover */
+  NULL,    /* commit_by_xid */
+  NULL,    /* rollback_by_xid */
+  NULL,    /* create_cursor_read_view */
+  NULL,    /* set_cursor_read_view */
+  NULL,    /* close_cursor_read_view */
+  HTON_NO_FLAGS
+};
+
+/* pre-5.1 spellings */
+#define STATISTIC_INCREMENT(X) \
+statistic_increment(table->in_use->status_var.X, &LOCK_status)
+#define MOVE(X) move_field(X)
+#define RECORDS records
+#else
+/* 5.1 and later spellings */
+#define STATISTIC_INCREMENT(X) ha_statistic_increment(&SSV::X)
+#define MOVE(X) move_field_offset(X)
+#define RECORDS stats.records
+#endif
+
+/* Registry of OQGRAPH_INFO shares, keyed by table name (see get_key()). */
+static HASH oqgraph_open_tables;
+/* Must be held when calling get_share() / free_share() (both assert it). */
+static pthread_mutex_t LOCK_oqgraph;
+/* Set to TRUE at the end of a successful oqgraph_init(). */
+static bool oqgraph_init_done= 0;
+
+/* Type of the length out-parameter of the HASH get-key callback. */
+#if MYSQL_VERSION_ID >= 50130
+#define HASH_KEY_LENGTH size_t
+#else
+#define HASH_KEY_LENGTH uint
+#endif
+
+/*
+  HASH callback for oqgraph_open_tables: the key of a share is its
+  NUL-terminated table name.  Stores the name length in *length and
+  returns a pointer to the name.
+*/
+static uchar* get_key(const uchar *ptr, HASH_KEY_LENGTH *length,
+                      my_bool)
+{
+  const char *table_name= ((const OQGRAPH_INFO*) ptr)->name;
+
+  *length= strlen(table_name);
+  return (uchar*) table_name;
+}
+
+#if MYSQL_VERSION_ID >= 50100
+/* handlerton::create hook: allocate a new handler object on mem_root. */
+static handler* oqgraph_create_handler(handlerton *hton, TABLE_SHARE *table,
+                                       MEM_ROOT *mem_root)
+{
+  return new (mem_root) ha_oqgraph(hton, table);
+}
+
+/*
+  Engine initialisation.  The two signatures below (5.1+ plugin form and
+  the pre-5.1 form) share one function body.
+
+  Creates LOCK_oqgraph and the open-tables hash; on 5.1+ it also fills in
+  the handlerton.  Returns 0 on success and 1 on failure.
+*/
+static int oqgraph_init(handlerton *hton)
+{
+#else
+static bool oqgraph_init()
+{
+  /* pre-5.1 only: nothing to set up when the engine is disabled */
+  if (have_oqgraph == SHOW_OPTION_DISABLED)
+    return 1;
+#endif
+  if (pthread_mutex_init(&LOCK_oqgraph, MY_MUTEX_INIT_FAST))
+    goto error;
+  if (hash_init(&oqgraph_open_tables, &my_charset_bin, 32, 0, 0,
+                get_key, 0, 0))
+  {
+    /* undo the mutex creation before reporting the failure */
+    pthread_mutex_destroy(&LOCK_oqgraph);
+    goto error;
+  }
+#if MYSQL_VERSION_ID >= 50100
+  hton->state= SHOW_OPTION_YES;
+  hton->db_type= DB_TYPE_AUTOASSIGN;
+  hton->create= oqgraph_create_handler;
+  hton->flags= HTON_NO_FLAGS;
+#endif
+  oqgraph_init_done= TRUE;
+  return 0;
+error:
+#if MYSQL_VERSION_ID < 50100
+  have_oqgraph= SHOW_OPTION_DISABLED;
+#endif
+  return 1;
+}
+
+#if MYSQL_VERSION_ID >= 50100
+/*
+  Engine shutdown (5.1+ only): releases what oqgraph_init() created -- the
+  open-tables hash and LOCK_oqgraph -- and clears oqgraph_init_done.
+  Always returns 0.
+*/
+static int oqgraph_fini(void *)
+{
+  hash_free(&oqgraph_open_tables);
+  pthread_mutex_destroy(&LOCK_oqgraph);
+  oqgraph_init_done= FALSE;
+  return 0;
+}
+#endif
+
+/**
+  Find the share registered for table `name`, creating it on first use.
+
+  The caller must hold LOCK_oqgraph (asserted).
+
+  @param name   table name; it is the key in oqgraph_open_tables
+  @param table  when 0, only an already registered share is returned; a
+                missing share is created only when this is non-zero
+
+  @return the share with use_count incremented, or 0 when the share does
+          not exist and was not (or could not be) created.  A name longer
+          than FN_REFLEN cannot be stored in the share and is refused.
+*/
+static OQGRAPH_INFO *get_share(const char *name, TABLE *table=0)
+{
+  OQGRAPH_INFO *share;
+  uint length= strlen(name);
+
+  safe_mutex_assert_owner(&LOCK_oqgraph);
+  if (!(share= (OQGRAPH_INFO*) hash_search(&oqgraph_open_tables,
+                                           (byte*) name, length)))
+  {
+    /* share->name is char[FN_REFLEN+1]; do not let strmov() overrun it */
+    if (!table || length > FN_REFLEN ||
+        !(share= new OQGRAPH_INFO))
+      return 0;
+    share->use_count= share->key_stat_version= share->records= 0;
+    share->dropped= 0;
+    strmov(share->name, name);
+    if (!(share->graph= oqgraph::create()))
+    {
+      delete share;
+      return 0;
+    }
+    if (my_hash_insert(&oqgraph_open_tables, (byte*) share))
+    {
+      /* could not register the share: undo everything built so far */
+      oqgraph::free(share->graph);
+      delete share;
+      return 0;
+    }
+    thr_lock_init(&share->lock);
+  }
+  share->use_count++;
+  return share;
+}
+
+/*
+  Give back one reference obtained from get_share().
+
+  The caller must hold LOCK_oqgraph (asserted).  With drop set, the share
+  is marked dropped and removed from oqgraph_open_tables.  The share and
+  its graph are destroyed only when the last reference goes away AND the
+  share has been dropped; otherwise it stays registered.
+
+  A null share is accepted.  Always returns 0.
+*/
+static int free_share(OQGRAPH_INFO *share, bool drop=0)
+{
+  safe_mutex_assert_owner(&LOCK_oqgraph);
+  if (!share)
+    return 0;
+
+  if (drop)
+  {
+    share->dropped= true;
+    hash_delete(&oqgraph_open_tables, (byte*) share);
+  }
+
+  /* still referenced elsewhere? */
+  if (--share->use_count)
+    return 0;
+
+  /* unreferenced but not dropped: keep it for the next open */
+  if (!share->dropped)
+    return 0;
+
+  thr_lock_delete(&share->lock);
+  oqgraph::free(share->graph);
+  delete share;
+  return 0;
+}
+
+/*
+  Translate an oqgraph::error_code into the handler error code returned
+  to the server.  oqgraph::MISC_FAIL and any unrecognised value map to
+  HA_ERR_CRASHED_ON_USAGE.
+*/
+static int error_code(int res)
+{
+  if (res == oqgraph::OK)
+    return 0;
+  if (res == oqgraph::NO_MORE_DATA)
+    return HA_ERR_END_OF_FILE;
+  if (res == oqgraph::EDGE_NOT_FOUND)
+    return HA_ERR_KEY_NOT_FOUND;
+  if (res == oqgraph::INVALID_WEIGHT)
+    return HA_ERR_AUTOINC_ERANGE;
+  if (res == oqgraph::DUPLICATE_EDGE)
+    return HA_ERR_FOUND_DUPP_KEY;
+  if (res == oqgraph::CANNOT_ADD_VERTEX || res == oqgraph::CANNOT_ADD_EDGE)
+    return HA_ERR_RECORD_FILE_FULL;
+
+  return HA_ERR_CRASHED_ON_USAGE;
+}
+
+/**
+ * Check if table complies with our designated structure
+ *
+ *    ColName    Type      Attributes
+ *    =======    ========  =============
+ *    latch     SMALLINT  UNSIGNED NULL
+ *    origid    BIGINT    UNSIGNED NULL
+ *    destid    BIGINT    UNSIGNED NULL
+ *    weight    DOUBLE    NULL
+ *    seq       BIGINT    UNSIGNED NULL
+ *    linkid    BIGINT    UNSIGNED NULL
+ *    =================================
+ *
+  CREATE TABLE foo (
+    latch   SMALLINT  UNSIGNED NULL,
+    origid  BIGINT    UNSIGNED NULL,
+    destid  BIGINT    UNSIGNED NULL,
+    weight  DOUBLE    NULL,
+    seq     BIGINT    UNSIGNED NULL,
+    linkid  BIGINT    UNSIGNED NULL,
+    KEY (latch, origid, destid) USING HASH,
+    KEY (latch, destid, origid) USING HASH
+  ) ENGINE=OQGRAPH
+
+ */
+static int oqgraph_check_table_structure (TABLE *table_arg)
+{
+  /* expected columns in order; the NULL name terminates the list */
+  struct { const char *colname; int coltype; } skel[] = {
+    { "latch" , MYSQL_TYPE_SHORT },
+    { "origid", MYSQL_TYPE_LONGLONG },
+    { "destid", MYSQL_TYPE_LONGLONG },
+    { "weight", MYSQL_TYPE_DOUBLE },
+    { "seq"   , MYSQL_TYPE_LONGLONG },
+    { "linkid", MYSQL_TYPE_LONGLONG },
+  { NULL    , 0}
+  };
+  int i;
+
+  DBUG_ENTER("ha_oqgraph::table_structure_ok");
+
+  /*
+    Columns: right type, UNSIGNED unless it is the DOUBLE column,
+    nullable, and the expected name.
+  */
+  Field **field= table_arg->field;
+  for (i= 0; *field && skel[i].colname; i++, field++) {
+    Field *col= *field;
+    if (col->type() != skel[i].coltype ||
+        (skel[i].coltype != MYSQL_TYPE_DOUBLE &&
+         !(col->flags & UNSIGNED_FLAG)) ||
+        (col->flags & NOT_NULL_FLAG) ||
+        strcmp(skel[i].colname, col->field_name))
+      DBUG_RETURN(-1);
+  }
+
+  /* too few or too many columns, or no key at all */
+  if (skel[i].colname || *field || !table_arg->key_info || !table_arg->s->keys)
+    DBUG_RETURN(-1);
+
+  /* Keys: every one must be a three-part HASH key starting with latch. */
+  Field **cols= table_arg->field;
+  KEY *key= table_arg->key_info;
+  for (uint k= 0; k < table_arg->s->keys; ++k, ++key)
+  {
+    if (cols[0] != key->key_part[0].field ||
+        HA_KEY_ALG_HASH != key->algorithm)
+      DBUG_RETURN(-1);
+    if (key->key_parts != 3)
+      DBUG_RETURN(-1);
+
+    /* KEY (latch, origid, destid) USING HASH */
+    const bool orig_then_dest= cols[1] == key->key_part[1].field &&
+                               cols[2] == key->key_part[2].field;
+    /* KEY (latch, destid, origid) USING HASH */
+    const bool dest_then_orig= cols[1] == key->key_part[2].field &&
+                               cols[2] == key->key_part[1].field;
+    if (!orig_then_dest && !dest_then_orig)
+      DBUG_RETURN(-1);
+  }
+
+  DBUG_RETURN(0);
+}
+
+/*****************************************************************************
+** OQGRAPH tables
+*****************************************************************************/
+
+/*
+  Constructor.  The signature and the base-class initialiser depend on the
+  server version; the member initialisers are shared.  The handler starts
+  with no share and no graph attached -- open() sets both.
+*/
+#if MYSQL_VERSION_ID >= 50100
+ha_oqgraph::ha_oqgraph(handlerton *hton, TABLE_SHARE *table_arg)
+  : handler(hton, table_arg),
+#else
+ha_oqgraph::ha_oqgraph(TABLE *table_arg)
+  : handler(&oqgraph_hton, table_arg),
+#endif
+    share(0), graph(0), records_changed(0), key_stat_version(0)
+{ }
+
+
+/* File extension list: holds only the NullS terminator, i.e. it is empty. */
+static const char *ha_oqgraph_exts[] =
+{
+  NullS
+};
+
+/* Returns the (empty) extension list above. */
+const char **ha_oqgraph::bas_ext() const
+{
+  return ha_oqgraph_exts;
+}
+
+/*
+  Capability flags reported for the table as a whole.  Only the return
+  type differs between server versions.
+*/
+#if MYSQL_VERSION_ID >= 50100
+ulonglong ha_oqgraph::table_flags() const
+#else
+ulong ha_oqgraph::table_flags() const
+#endif
+{
+  return (HA_NO_BLOBS | HA_NULL_IN_KEY |
+          HA_REC_NOT_IN_SEQ | HA_CAN_INSERT_DELAYED |
+          HA_BINLOG_STMT_CAPABLE | HA_BINLOG_ROW_CAPABLE);
+}
+
+/*
+  Capability flags for an index.  The arguments are ignored: every index
+  reports the same two flags.
+*/
+ulong ha_oqgraph::index_flags(uint inx, uint part, bool all_parts) const
+{
+  return HA_ONLY_WHOLE_INDEX | HA_KEY_SCAN_NOT_ROR;
+}
+
+/**
+  Attach this handler to the shared state of table `name`.
+
+  Takes a reference on the share (creating it on first open) and creates
+  this handler's own graph handle onto it.  If the graph handle cannot be
+  created, the share reference is given back and the open fails, so the
+  handler is never left with a share but no graph.
+
+  @param name            table name, key of the share
+  @param mode            not used
+  @param test_if_locked  not used
+
+  @return 0 on success, 1 on failure
+*/
+int ha_oqgraph::open(const char *name, int mode, uint test_if_locked)
+{
+  pthread_mutex_lock(&LOCK_oqgraph);
+  if ((share = get_share(name, table)))
+  {
+    /* oqgraph::create() is declared throw(): failure shows up as null */
+    if (!(graph= oqgraph::create(share->graph)))
+    {
+      free_share(share);
+      share= 0;
+    }
+    else
+    {
+      ref_length= oqgraph::sizeof_ref;
+
+      /* Initialize variables for the opened table */
+      thr_lock_data_init(&share->lock, &lock, NULL);
+
+      /*
+        We cannot run update_key_stats() here because we do not have a
+        lock on the table. The 'records' count might just be changed
+        temporarily at this moment and we might get wrong statistics (Bug
+        #10178). Instead we request for update. This will be done in
+        ha_oqgraph::info(), which is always called before key statistics are
+        used.
+      */
+      key_stat_version= share->key_stat_version-1;
+    }
+  }
+  pthread_mutex_unlock(&LOCK_oqgraph);
+
+  return (share ? 0 : 1);
+}
+
+/*
+  Detach from the table: release this handler's graph handle and give
+  back its reference on the share, both under LOCK_oqgraph.
+  Returns the handler error code for free_share()'s result.
+*/
+int ha_oqgraph::close(void)
+{
+  int res;
+
+  pthread_mutex_lock(&LOCK_oqgraph);
+  oqgraph::free(graph);
+  graph= 0;
+  res= free_share(share);
+  pthread_mutex_unlock(&LOCK_oqgraph);
+
+  return error_code(res);
+}
+
+/*
+  Recompute rec_per_key for the last key part of every non-BTREE key that
+  has a rec_per_key array: 1 for unique keys, otherwise
+  2 * (edges + vertices) / vertices with a floor of 2 (and 2 for an empty
+  graph).  Afterwards the handler's records_changed counter is reset and
+  its key_stat_version is synchronised with the share's.
+*/
+void ha_oqgraph::update_key_stats()
+{
+  KEY *key= table->key_info;
+  for (uint i= 0; i < table->s->keys; i++, key++)
+  {
+    if (!key->rec_per_key)
+      continue;
+    if (key->algorithm == HA_KEY_ALG_BTREE)
+      continue;
+
+    const uint last_part= key->key_parts-1;
+    if (key->flags & HA_NOSAME)
+    {
+      key->rec_per_key[last_part]= 1;
+      continue;
+    }
+
+    unsigned vertices= graph->vertices_count();
+    unsigned edges= graph->edges_count();
+    uint no_records= 2;
+    if (vertices)
+    {
+      no_records= 2 * (edges + vertices) / vertices;
+      if (no_records < 2)
+        no_records= 2;
+    }
+    key->rec_per_key[last_part]= no_records;
+  }
+
+  records_changed= 0;
+  /* The statistics now match the share's current version. */
+  key_stat_version= share->key_stat_version;
+}
+
+
+/*
+  INSERT: add the edge described by the row image in buf.
+
+  origid (column 1) and destid (column 2) must both be non-NULL, otherwise
+  nothing is inserted and the call fails with the handler code for
+  oqgraph::MISC_FAIL.  A NULL weight (column 3) defaults to 1.
+  A duplicate edge is reported as success when ignore_dups is set and
+  insert_dups is not.
+*/
+int ha_oqgraph::write_row(byte * buf)
+{
+  Field ** const field= table->field;
+  int res= oqgraph::MISC_FAIL;
+  STATISTIC_INCREMENT(ha_write_count);
+
+#if MYSQL_VERSION_ID >= 50100
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
+#endif
+  const my_ptrdiff_t ptrdiff= buf - table->record[0];
+
+  /* make origid/destid/weight read from buf instead of record[0] */
+  if (ptrdiff)
+  {
+    for (uint col= 1; col <= 3; ++col)
+      field[col]->MOVE(ptrdiff);
+  }
+
+  if (!field[1]->is_null() && !field[2]->is_null())
+  {
+    const VertexID orig_id= (VertexID) field[1]->val_int();
+    const VertexID dest_id= (VertexID) field[2]->val_int();
+    const EdgeWeight weight= field[3]->is_null()
+                             ? (EdgeWeight) 1
+                             : (EdgeWeight) field[3]->val_real();
+
+    res= graph->insert_edge(orig_id, dest_id, weight, replace_dups);
+    if (res == oqgraph::OK)
+    {
+      ++records_changed;
+      share->records++;
+    }
+    else if (res == oqgraph::DUPLICATE_EDGE && ignore_dups && !insert_dups)
+      res= oqgraph::OK;
+  }
+
+  /* put the field pointers back where they were */
+  if (ptrdiff)
+  {
+    for (uint col= 1; col <= 3; ++col)
+      field[col]->MOVE(-ptrdiff);
+  }
+#if MYSQL_VERSION_ID >= 50100
+  dbug_tmp_restore_column_map(table->read_set, old_map);
+#endif
+
+  if (!res && records_changed*OQGRAPH_STATS_UPDATE_THRESHOLD > share->records)
+  {
+    /*
+       We can perform this safely since only one writer at the time is
+       allowed on the table.
+    */
+    share->key_stat_version++;
+  }
+
+  return error_code(res);
+}
+
+/**
+  UPDATE: change the edge the active scan is positioned on.
+
+  The new origid / destid / weight are read from `buf`, then compared
+  with the values in `old`.  Every attribute whose new value is NULL or
+  equal to its old value is passed to oqgraph::modify_edge() as a null
+  pointer; the remaining attributes are passed by pointer.
+
+  The update is only attempted while an index or table scan is active
+  and when the old row's latch column is NULL; otherwise the call fails
+  with the handler code for oqgraph::MISC_FAIL.  With ignore_dups set a
+  duplicate edge is reported as success.
+
+  @param old  row image before the update
+  @param buf  row image after the update
+
+  @return handler error code, 0 on success
+*/
+int ha_oqgraph::update_row(const byte * old, byte * buf)
+{
+  int res= oqgraph::MISC_FAIL;
+  VertexID orig_id, dest_id;
+  EdgeWeight weight= 1;
+  Field **field= table->field;
+  STATISTIC_INCREMENT(ha_update_count);
+
+#if MYSQL_VERSION_ID >= 50100
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
+#endif
+  my_ptrdiff_t ptrdiff= buf - table->record[0];
+
+  /* point the fields at the new row image */
+  if (ptrdiff)
+  {
+    field[0]->MOVE(ptrdiff);
+    field[1]->MOVE(ptrdiff);
+    field[2]->MOVE(ptrdiff);
+    field[3]->MOVE(ptrdiff);
+  }
+
+  if (inited == INDEX || inited == RND)
+  {
+    /* a null pointer means "leave this attribute alone" */
+    VertexID *origp= 0, *destp= 0;
+    EdgeWeight *weightp= 0;
+    if (!field[1]->is_null())
+      *(origp= &orig_id)= (VertexID) field[1]->val_int();
+    if (!field[2]->is_null())
+      *(destp= &dest_id)= (VertexID) field[2]->val_int();
+    if (!field[3]->is_null())
+      *(weightp= &weight)= (EdgeWeight) field[3]->val_real();
+
+    /* now point the fields at the old row image */
+    my_ptrdiff_t ptrdiff2= old - buf;
+
+    field[0]->MOVE(ptrdiff2);
+    field[1]->MOVE(ptrdiff2);
+    field[2]->MOVE(ptrdiff2);
+    field[3]->MOVE(ptrdiff2);
+
+    if (field[0]->is_null())
+    {
+      /*
+        Drop every attribute that did not actually change.  A pointer is
+        only dereferenced after checking it is set, and each test clears
+        its own pointer.
+      */
+      if (origp && !field[1]->is_null() &&
+          *origp == (VertexID) field[1]->val_int())
+        origp= 0;
+      if (destp && !field[2]->is_null() &&
+          *destp == (VertexID) field[2]->val_int())
+        destp= 0;
+      /* compare weights as EdgeWeight so the old value is not truncated */
+      if (weightp && !field[3]->is_null() &&
+          *weightp == (EdgeWeight) field[3]->val_real())
+        weightp= 0;
+
+      if (!(res= graph->modify_edge(oqgraph::current_row(),
+                                    origp, destp, weightp, replace_dups)))
+        ++records_changed;
+      else if (ignore_dups && res == oqgraph::DUPLICATE_EDGE)
+        res= oqgraph::OK;
+    }
+
+    /* back to the new row image */
+    field[0]->MOVE(-ptrdiff2);
+    field[1]->MOVE(-ptrdiff2);
+    field[2]->MOVE(-ptrdiff2);
+    field[3]->MOVE(-ptrdiff2);
+  }
+
+  /* and back to record[0] */
+  if (ptrdiff)
+  {
+    field[0]->MOVE(-ptrdiff);
+    field[1]->MOVE(-ptrdiff);
+    field[2]->MOVE(-ptrdiff);
+    field[3]->MOVE(-ptrdiff);
+  }
+#if MYSQL_VERSION_ID >= 50100
+  dbug_tmp_restore_column_map(table->read_set, old_map);
+#endif
+
+  if (!res && records_changed*OQGRAPH_STATS_UPDATE_THRESHOLD > share->records)
+  {
+    /*
+       We can perform this safely since only one writer at the time is
+       allowed on the table.
+    */
+    share->key_stat_version++;
+  }
+  return error_code(res);
+}
+
+/*
+  DELETE: remove one edge.
+
+  While an index or table scan is active, the edge the scan is positioned
+  on is deleted.  If that is not possible or fails, and the row image in
+  buf has a NULL latch with non-NULL origid and destid, the edge
+  (origid, destid) is deleted instead.  Returns the handler error code,
+  0 on success.
+*/
+int ha_oqgraph::delete_row(const byte * buf)
+{
+  Field **field= table->field;
+  int res= oqgraph::EDGE_NOT_FOUND;
+  STATISTIC_INCREMENT(ha_delete_count);
+
+  /* first choice: the row under the cursor */
+  if (inited == INDEX || inited == RND)
+  {
+    res= graph->delete_edge(oqgraph::current_row());
+    if (res == oqgraph::OK)
+    {
+      ++records_changed;
+      share->records--;
+    }
+  }
+
+  /* fallback: identify the edge from the values stored in buf */
+  if (res != oqgraph::OK)
+  {
+#if MYSQL_VERSION_ID >= 50100
+    my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
+#endif
+    const my_ptrdiff_t ptrdiff= buf - table->record[0];
+
+    if (ptrdiff)
+    {
+      for (uint col= 0; col <= 2; ++col)
+        field[col]->MOVE(ptrdiff);
+    }
+
+    if (field[0]->is_null() && !field[1]->is_null() && !field[2]->is_null())
+    {
+      const VertexID orig_id= (VertexID) field[1]->val_int();
+      const VertexID dest_id= (VertexID) field[2]->val_int();
+
+      res= graph->delete_edge(orig_id, dest_id);
+      if (res == oqgraph::OK)
+      {
+        ++records_changed;
+        share->records--;
+      }
+    }
+
+    if (ptrdiff)
+    {
+      for (uint col= 0; col <= 2; ++col)
+        field[col]->MOVE(-ptrdiff);
+    }
+#if MYSQL_VERSION_ID >= 50100
+    dbug_tmp_restore_column_map(table->read_set, old_map);
+#endif
+  }
+
+  if (!res && table->s->tmp_table == NO_TMP_TABLE &&
+      records_changed*OQGRAPH_STATS_UPDATE_THRESHOLD > share->records)
+  {
+    /*
+       We can perform this safely since only one writer at the time is
+       allowed on the table.
+    */
+    share->key_stat_version++;
+  }
+  return error_code(res);
+}
+
+int ha_oqgraph::index_read(byte * buf, const byte * key, uint key_len,
+			enum ha_rkey_function find_flag)
+{
+  DBUG_ASSERT(inited==INDEX);
+  return index_read_idx(buf, active_index, key, key_len, find_flag);  // same lookup, on active_index
+}
+
+int ha_oqgraph::index_next_same(byte *buf, const byte *key, uint key_len)
+{
+  open_query::row next_row;
+  DBUG_ASSERT(inited==INDEX);
+  STATISTIC_INCREMENT(ha_read_key_count);
+  int rc= graph->fetch_row(next_row);   // key and key_len are not consulted
+  if (!rc)
+    rc= fill_record(buf, next_row);     // copy the fetched row into buf
+  table->status= rc ? STATUS_NOT_FOUND : 0;
+  return error_code(rc);
+}
+
+int ha_oqgraph::index_read_idx(byte * buf, uint index, const byte * key,
+			    uint key_len, enum ha_rkey_function find_flag)
+{
+  Field **field= table->field;
+  KEY *key_info= table->key_info + index;
+  int res;
+  VertexID orig_id, dest_id;
+  int latch;
+  VertexID *orig_idp=0, *dest_idp=0;  // stay null when the matching key part is NULL
+  int *latchp=0;
+  open_query::row row;
+  STATISTIC_INCREMENT(ha_read_key_count);
+  /* Build a record image in buf (defaults + key) so the key parts can be read as fields. */
+  bmove_align(buf, table->s->default_values, table->s->reclength);
+  key_restore(buf, (byte*) key, key_info, key_len);
+
+#if MYSQL_VERSION_ID >= 50100
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->read_set);
+#endif
+  my_ptrdiff_t ptrdiff= buf - table->record[0];  // byte offset of buf from record[0]
+
+  if (ptrdiff)  // buf is not record[0]: MOVE presumably re-points the fields at buf
+  {
+    field[0]->MOVE(ptrdiff);
+    field[1]->MOVE(ptrdiff);
+    field[2]->MOVE(ptrdiff);
+  }
+  /* Collect the non-NULL search terms: field[0] = latch, field[1] = origin, field[2] = destination. */
+  if (!field[0]->is_null())
+  {
+    latch= (int) field[0]->val_int();
+    latchp= &latch;
+  }
+
+  if (!field[1]->is_null())
+  {
+    orig_id= (VertexID) field[1]->val_int();
+    orig_idp= &orig_id;
+  }
+
+  if (!field[2]->is_null())
+  {
+    dest_id= (VertexID) field[2]->val_int();
+    dest_idp= &dest_id;
+  }
+
+  if (ptrdiff)  // undo the field shift applied above
+  {
+    field[0]->MOVE(-ptrdiff);
+    field[1]->MOVE(-ptrdiff);
+    field[2]->MOVE(-ptrdiff);
+  }
+#if MYSQL_VERSION_ID >= 50100
+  dbug_tmp_restore_column_map(table->read_set, old_map);
+#endif
+
+  res= graph->search(latchp, orig_idp, dest_idp);
+  /* On a successful search, fetch a result row and copy it into buf. */
+  if (!res && !(res= graph->fetch_row(row)))
+    res= fill_record(buf, row);
+  table->status = res ? STATUS_NOT_FOUND : 0;
+  return error_code(res);
+}
+
+int ha_oqgraph::fill_record(byte *record, const open_query::row &row)
+{
+  Field **field= table->field;
+  /* Start from the table's default row image, then store the columns flagged in row. */
+  bmove_align(record, table->s->default_values, table->s->reclength);
+
+#if MYSQL_VERSION_ID >= 50100
+  my_bitmap_map *old_map= dbug_tmp_use_all_columns(table, table->write_set);
+#endif
+  my_ptrdiff_t ptrdiff= record - table->record[0];  // byte offset of record from record[0]
+
+  if (ptrdiff)  // record is not record[0]: MOVE presumably re-points the fields at record
+  {
+    field[0]->MOVE(ptrdiff);
+    field[1]->MOVE(ptrdiff);
+    field[2]->MOVE(ptrdiff);
+    field[3]->MOVE(ptrdiff);
+    field[4]->MOVE(ptrdiff);
+    field[5]->MOVE(ptrdiff);
+  }
+
+  // Handle each field individually; there is no sense in iterating.
+  if (row.latch_indicator)  // a column whose indicator is unset keeps its default value
+  {
+    field[0]->set_notnull();
+    field[0]->store((longlong) row.latch);
+  }
+
+  if (row.orig_indicator)
+  {
+    field[1]->set_notnull();
+    field[1]->store((longlong) row.orig);
+  }
+
+  if (row.dest_indicator)
+  {
+    field[2]->set_notnull();
+    field[2]->store((longlong) row.dest);
+  }
+
+  if (row.weight_indicator)
+  {
+    field[3]->set_notnull();
+    field[3]->store((double) row.weight);
+  }
+
+  if (row.seq_indicator)
+  {
+    field[4]->set_notnull();
+    field[4]->store((longlong) row.seq);
+  }
+
+  if (row.link_indicator)
+  {
+    field[5]->set_notnull();
+    field[5]->store((longlong) row.link);
+  }
+
+  if (ptrdiff)  // undo the field shift applied above
+  {
+    field[0]->MOVE(-ptrdiff);
+    field[1]->MOVE(-ptrdiff);
+    field[2]->MOVE(-ptrdiff);
+    field[3]->MOVE(-ptrdiff);
+    field[4]->MOVE(-ptrdiff);
+    field[5]->MOVE(-ptrdiff);
+  }
+#if MYSQL_VERSION_ID >= 50100
+  dbug_tmp_restore_column_map(table->write_set, old_map);
+#endif
+
+  return 0;  // this function has no failure path
+}
+
+int ha_oqgraph::rnd_init(bool scan)
+{
+  return error_code(graph->random(scan));  // forward 'scan' to the graph; result goes through error_code()
+}
+
+int ha_oqgraph::rnd_next(byte *buf)
+{
+  open_query::row fetched;
+  STATISTIC_INCREMENT(ha_read_rnd_next_count);
+  int rc= graph->fetch_row(fetched);    // ask the graph for another row
+  if (!rc)
+    rc= fill_record(buf, fetched);      // copy it into the caller's buffer
+  table->status= rc ? STATUS_NOT_FOUND: 0;
+  return error_code(rc);
+}
+
+int ha_oqgraph::rnd_pos(byte * buf, byte *pos)
+{
+  open_query::row fetched;
+  STATISTIC_INCREMENT(ha_read_rnd_count);
+  int rc= graph->fetch_row(fetched, pos);   // fetch the row that 'pos' refers to
+  if (!rc)
+    rc= fill_record(buf, fetched);          // copy it into the caller's buffer
+  table->status= rc ? STATUS_NOT_FOUND: 0;
+  return error_code(rc);
+}
+
+void ha_oqgraph::position(const byte *record)
+{
+  graph->row_ref((void*) ref);	// Ref is aligned; 'record' is not used
+}
+
+int ha_oqgraph::cmp_ref(const byte *ref1, const byte *ref2)
+{
+  return memcmp(ref1, ref2, oqgraph::sizeof_ref);  // bytewise compare over oqgraph::sizeof_ref bytes
+}
+
+int ha_oqgraph::info(uint flag)
+{
+  RECORDS= graph->vertices_count() + graph->edges_count();  // vertices + edges; 'flag' is not consulted
+#if 0
+  records= hp_info.records;
+  deleted= hp_info.deleted;
+  errkey=  hp_info.errkey;
+  mean_rec_length= hp_info.reclength;
+  data_file_length= hp_info.data_length;
+  index_file_length= hp_info.index_length;
+  max_data_file_length= hp_info.max_records* hp_info.reclength;
+  delete_length= hp_info.deleted * hp_info.reclength;
+#endif
+  /*
+    If info() is called for the first time after open(), we will still
+    have to update the key statistics. Hoping that a table lock is now
+    in place.
+  */
+  if (key_stat_version != share->key_stat_version)  // this handler's statistics are out of date
+    update_key_stats();
+  return 0;  // this function has no failure path
+}
+
+int ha_oqgraph::extra(enum ha_extra_function operation)
+{
+  switch (operation)  // record duplicate-handling hints in member flags
+  {
+  case HA_EXTRA_IGNORE_DUP_KEY:
+    ignore_dups= true;
+    break;
+  case HA_EXTRA_NO_IGNORE_DUP_KEY:
+    ignore_dups= false;
+    insert_dups= false;  // this hint also clears insert_dups
+    break;
+  case HA_EXTRA_WRITE_CAN_REPLACE:
+    replace_dups= true;
+    break;
+  case HA_EXTRA_WRITE_CANNOT_REPLACE:
+    replace_dups= false;
+    break;
+  case HA_EXTRA_INSERT_WITH_UPDATE:
+    insert_dups= true;
+    break;
+  default:  // every other operation is accepted and ignored
+    break;
+  }
+  return 0;
+}
+
+int ha_oqgraph::delete_all_rows()
+{
+  /* Empty the whole graph and reset the shared row count. */
+  const int rc= graph->delete_all();
+  if (rc)
+    return error_code(rc);
+
+  share->records= 0;
+  if (table->s->tmp_table == NO_TMP_TABLE)
+  {
+    /*
+       Bumping the statistics version is safe here because only one
+       writer at a time is allowed on the table.
+    */
+    share->key_stat_version++;
+  }
+  return error_code(rc);
+}
+
+int ha_oqgraph::external_lock(THD *thd, int lock_type)
+{
+  return 0;					// No external locking; thd and lock_type are ignored
+}
+
+
+THR_LOCK_DATA **ha_oqgraph::store_lock(THD *thd,
+				       THR_LOCK_DATA **to,
+				       enum thr_lock_type lock_type)
+{
+  const bool adopt_type= lock_type != TL_IGNORE && lock.type == TL_UNLOCK;
+  if (adopt_type) lock.type= lock_type;   // take the requested type only while unlocked
+  to[0]= &lock;                           // append this handler's lock to the caller's array
+  return to + 1;
+}
+
+/*
+  We have to ignore ENOENT entries as the HEAP table is created on open and
+  not when doing a CREATE on the table.
+*/
+
+int ha_oqgraph::delete_table(const char *name)
+{
+  // Release the share registered under this table name, if there is one.
+  int rc= 0;
+  pthread_mutex_lock(&LOCK_oqgraph);
+  OQGRAPH_INFO *found= get_share(name);
+  if (found)
+    rc= free_share(found, true);
+  pthread_mutex_unlock(&LOCK_oqgraph);
+  // When no share exists, rc is still 0 at this point.
+  return error_code(rc);
+}
+
+int ha_oqgraph::rename_table(const char * from, const char * to)
+{
+  pthread_mutex_lock(&LOCK_oqgraph);
+  if (OQGRAPH_INFO *share= get_share(from))  // local 'share' shadows the member of the same name
+  {
+    strmov(share->name, to);  // NOTE(review): unbounded copy - confirm share->name can hold 'to'
+    hash_update(&oqgraph_open_tables, (byte*) share,
+                (byte*) from, strlen(from));  // presumably re-hashes the share; 'from' is the previous key
+  }
+  pthread_mutex_unlock(&LOCK_oqgraph);
+  return 0;  // returns 0 even when no share named 'from' exists
+}
+
+
+ha_rows ha_oqgraph::records_in_range(uint inx, key_range *min_key,
+                                  key_range *max_key)
+{
+  KEY *key=table->key_info+inx;
+  // Row estimate for a key range on index inx. Either bound may be NULL.
+  // Only exact-key ranges, or a lookup with latch = 0, can be estimated.
+
+  if (!min_key || !max_key ||
+      min_key->length != max_key->length ||
+      min_key->length < key->key_length - key->key_part[2].store_length ||
+      min_key->flag != HA_READ_KEY_EXACT ||
+      max_key->flag != HA_READ_AFTER_KEY)
+  {
+    if (min_key && min_key->length == key->key_part[0].store_length)  // min_key may be NULL on this path
+    {
+      // If latch is not null and equals 0, return # nodes
+      DBUG_ASSERT(key->key_part[0].store_length == 3);
+      if (key->key_part[0].null_bit && !min_key->key[0] &&
+          !min_key->key[1] && !min_key->key[2])
+        return graph->vertices_count();
+    }
+    return HA_POS_ERROR;			// Can only use exact keys
+  }
+
+  if (RECORDS <= 1)
+    return RECORDS;
+
+  /* Assert that info() did run. We need current statistics here. */
+  DBUG_ASSERT(key_stat_version == share->key_stat_version);
+  ha_rows result= key->rec_per_key[key->key_parts-1];
+
+  return result;
+}
+
+
+int ha_oqgraph::create(const char *name, TABLE *table_arg,
+		    HA_CREATE_INFO *create_info)
+{
+  int res = -1;          // stays -1 if a share with this name already exists
+  OQGRAPH_INFO *share;   // local; shadows the member, hence this->share below
+
+  pthread_mutex_lock(&LOCK_oqgraph);
+  if ((share= get_share(name)))
+  {
+    free_share(share);
+  }
+  else
+  {
+    if (!oqgraph_check_table_structure(table_arg))  // 0 from the check is treated as success
+      res= 0;
+  }
+  pthread_mutex_unlock(&LOCK_oqgraph);
+
+  if (this->share)  // only when this handler already has a share attached
+    info(HA_STATUS_NO_LOCK | HA_STATUS_CONST | HA_STATUS_VARIABLE);
+  return error_code(res);
+}
+
+
+void ha_oqgraph::update_create_info(HA_CREATE_INFO *create_info)
+{
+  table->file->info(HA_STATUS_AUTO);  // create_info itself is not modified; the code below is disabled
+  //if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
+  //  create_info->auto_increment_value= auto_increment_value;
+}
+
+#if MYSQL_VERSION_ID >= 50100
+struct st_mysql_storage_engine oqgraph_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+mysql_declare_plugin(oqgraph)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &oqgraph_storage_engine,
+  "OQGRAPH",
+  "Arjen Lentz & Antony T Curtis, Open Query",
+  oqgraph_description,
+  PLUGIN_LICENSE_GPL,
+  (int (*)(void*)) oqgraph_init, /* Plugin Init                  */
+  oqgraph_fini,               /* Plugin Deinit                   */
+  0x0200,                     /* Version: 2.0                    */
+  NULL,                       /* status variables                */
+  NULL,                       /* system variables                */
+  NULL                        /* config options                  */
+}
+mysql_declare_plugin_end;
+#endif
+
+#endif
diff --git a/storage/oqgraph/ha_oqgraph.h b/storage/oqgraph/ha_oqgraph.h
new file mode 100644
index 00000000000..dcc14b074da
--- /dev/null
+++ b/storage/oqgraph/ha_oqgraph.h
@@ -0,0 +1,114 @@
+/* Copyright (C) 2007-2009 Arjen G Lentz & Antony T Curtis for Open Query
+   Portions of this file copyright (C) 2000-2006 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/* ======================================================================
+   Open Query Graph Computation Engine, based on a concept by Arjen Lentz
+   Mk.II implementation by Antony Curtis & Arjen Lentz
+   For more information, documentation, support, enhancement engineering,
+   and non-GPL licensing, see http://openquery.com/graph
+   or contact graph@openquery.com
+   For packaged binaries, see http://ourdelta.org
+   ======================================================================
+*/
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+
+typedef struct oqgraph_info_st OQGRAPH_INFO;
+
+#if MYSQL_VERSION_ID >= 50120
+typedef uchar byte;
+#endif
+
+namespace open_query
+{
+  struct row;
+  class oqgraph;
+}
+
+/* Class for the Open Query Graph handler */
+
+class ha_oqgraph: public handler
+{
+  OQGRAPH_INFO *share;
+  open_query::oqgraph *graph;
+  THR_LOCK_DATA lock;
+  /* number of records changed since last statistics update */
+  uint records_changed;
+  uint key_stat_version;
+  bool replace_dups, ignore_dups, insert_dups;
+
+  int fill_record(byte*, const open_query::row&);
+
+public:
+#if MYSQL_VERSION_ID >= 50100
+  ha_oqgraph(handlerton *hton, TABLE_SHARE *table);
+  ulonglong table_flags() const;
+#else
+  ha_oqgraph(TABLE *table);
+  ulong table_flags() const;
+#endif
+  ~ha_oqgraph() {}
+  const char *table_type() const
+  {
+    return "OQGRAPH";
+  }
+  const char *index_type(uint inx)
+  {
+    return "HASH";
+  }
+  /* Rows also use a fixed-size format */
+  enum row_type get_row_type() const { return ROW_TYPE_FIXED; }
+  const char **bas_ext() const;
+  ulong index_flags(uint inx, uint part, bool all_parts) const;
+  uint max_supported_keys()          const { return MAX_KEY; }
+  uint max_supported_key_part_length() const { return MAX_KEY_LENGTH; }
+  double scan_time() { return (double) 1000000000; }
+  double read_time(uint index, uint ranges, ha_rows rows)
+  { return 1; }
+
+  int open(const char *name, int mode, uint test_if_locked);
+  int close(void);
+  int write_row(byte * buf);
+  int update_row(const byte * old_data, byte * new_data);
+  int delete_row(const byte * buf);
+  int index_read(byte * buf, const byte * key,
+		 uint key_len, enum ha_rkey_function find_flag);
+  int index_read_idx(byte * buf, uint idx, const byte * key,
+		     uint key_len, enum ha_rkey_function find_flag);
+  int index_next_same(byte * buf, const byte * key, uint key_len);
+  int rnd_init(bool scan);
+  int rnd_next(byte *buf);
+  int rnd_pos(byte * buf, byte *pos);
+  void position(const byte *record);
+  int info(uint);
+  int extra(enum ha_extra_function operation);
+  int external_lock(THD *thd, int lock_type);
+  int delete_all_rows(void);
+  ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key);
+  int delete_table(const char *from);
+  int rename_table(const char * from, const char * to);
+  int create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
+  void update_create_info(HA_CREATE_INFO *create_info);
+
+  THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
+			     enum thr_lock_type lock_type);
+  int cmp_ref(const byte *ref1, const byte *ref2);
+private:
+  void update_key_stats();
+};
diff --git a/storage/oqgraph/oqgraph_config.h.in b/storage/oqgraph/oqgraph_config.h.in
new file mode 100644
index 00000000000..18dad70a75d
--- /dev/null
+++ b/storage/oqgraph/oqgraph_config.h.in
@@ -0,0 +1,73 @@
+/* src/oqgraph_config.h.in.  Generated from configure.in by autoheader.  */
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#undef HAVE_DLFCN_H
+
+/* Enables DTRACE Support */
+#undef HAVE_DTRACE
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#undef HAVE_INTTYPES_H
+
+/* Define to 1 if you have the <limits.h> header file. */
+#undef HAVE_LIMITS_H
+
+/* Define to 1 if you have the <memory.h> header file. */
+#undef HAVE_MEMORY_H
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#undef HAVE_STDINT_H
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#undef HAVE_STDLIB_H
+
+/* Define to 1 if you have the <strings.h> header file. */
+#undef HAVE_STRINGS_H
+
+/* Define to 1 if you have the <string.h> header file. */
+#undef HAVE_STRING_H
+
+/* Define to 1 if you have the <syslimits.h> header file. */
+#undef HAVE_SYSLIMITS_H
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#undef HAVE_SYS_STAT_H
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#undef HAVE_SYS_TYPES_H
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#undef HAVE_UNISTD_H
+
+/* Source directory for MySQL */
+#undef MYSQL_SRC
+
+/* Name of package */
+#undef PACKAGE
+
+/* Define to the address where bug reports for this package should be sent. */
+#undef PACKAGE_BUGREPORT
+
+/* Define to the full name of this package. */
+#undef PACKAGE_NAME
+
+/* Define to the full name and version of this package. */
+#undef PACKAGE_STRING
+
+/* Define to the one symbol short name of this package. */
+#undef PACKAGE_TARNAME
+
+/* Define to the version of this package. */
+#undef PACKAGE_VERSION
+
+/* Define to 1 if you have the ANSI C header files. */
+#undef STDC_HEADERS
+
+/* Version number of package */
+#undef VERSION
+
+/* Define to empty if `const' does not conform to ANSI C. */
+#undef const
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+#undef size_t
diff --git a/storage/oqgraph/oqgraph_probes.d b/storage/oqgraph/oqgraph_probes.d
new file mode 100644
index 00000000000..bfdee29ba6e
--- /dev/null
+++ b/storage/oqgraph/oqgraph_probes.d
@@ -0,0 +1,19 @@
+/* Copyright (C) 2004-2005 MySQL AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA */
+
+provider oqgraph {
+	probe open();
+	probe close();
+};
diff --git a/storage/oqgraph/plug.in b/storage/oqgraph/plug.in
new file mode 100644
index 00000000000..38c8310a915
--- /dev/null
+++ b/storage/oqgraph/plug.in
@@ -0,0 +1,40 @@
+MYSQL_STORAGE_ENGINE(oqgraph,,[Graph Storage Engine],
+        [Open Query Graph Computation Engine], [])
+MYSQL_PLUGIN_DYNAMIC(oqgraph,   [ha_oqgraph.la])
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(oqgraph, [ha_oqgraph.cc])
+MYSQL_PLUGIN_ACTIONS(oqgraph,[
+  AC_LANG_PUSH([C++])
+  AC_MSG_CHECKING([whether compiler supports  -fvisibility-inlines-hidden])
+  OLD_CXXFLAGS=$CXXFLAGS
+  CXXFLAGS="$CXXFLAGS -fvisibility-inlines-hidden"
+  AC_TRY_COMPILE([],[],[
+    AM_CONDITIONAL(HAVE_FVISIBILITY_INLINES_HIDDEN, true)
+    AC_MSG_RESULT([yes])
+  ], [
+    AC_MSG_RESULT([no])
+  ])
+  CXXFLAGS=$OLD_CXXFLAGS
+  AC_LANG_POP()
+])
+
+AM_CONDITIONAL([BUILD_OQGRAPH_FOR_MYSQL], true)
+AM_CONDITIONAL([BUILD_OQGRAPH_STANDALONE], false)
+AM_CONDITIONAL([HAVE_DTRACE], false)
+AM_CONDITIONAL(HAVE_FVISIBILITY_INLINES_HIDDEN, false)
+
+AC_LANG_PUSH([C++])
+
+AC_MSG_CHECKING([for Boost usable by OQGraph engine])
+AC_PREPROC_IFELSE(
+   [
+#include <boost/version.hpp>
+#if BOOST_VERSION >= 104000
+#else
+#error oops
+#endif
+   ],
+   [AC_MSG_RESULT([yes])],
+   [AC_MSG_RESULT([no])
+   MYSQL_PLUGIN_WITHOUT(oqgraph)])
+
+AC_LANG_POP()
diff --git a/storage/pbxt/AUTHORS b/storage/pbxt/AUTHORS
new file mode 100644
index 00000000000..3c5c3db6db8
--- /dev/null
+++ b/storage/pbxt/AUTHORS
@@ -0,0 +1,4 @@
+Paul McCullagh
+paul.mccullagh@primebase.org
+http://www.primebase.org
+http://pbxt.blogspot.com
diff --git a/storage/pbxt/CMakeLists.txt b/storage/pbxt/CMakeLists.txt
new file mode 100644
index 00000000000..b87c0810c29
--- /dev/null
+++ b/storage/pbxt/CMakeLists.txt
@@ -0,0 +1,95 @@
+# Copyright (c) 2008 PrimeBase Technologies GmbH
+# 
+# PrimeBase XT
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# 2006-03-22	Paul McCullagh
+#
+# H&G2JCtL
+#
+# This file is used to make the Windows version
+
+SET(PBXT_SOURCES
+src/bsearch_xt.cc
+src/bsearch_xt.h
+src/cache_xt.cc
+src/cache_xt.h
+src/ccutils_xt.cc
+src/ccutils_xt.h
+src/database_xt.cc
+src/database_xt.h
+src/datadic_xt.cc
+src/datadic_xt.h
+src/datalog_xt.cc
+src/datalog_xt.h
+src/discover_xt.cc
+src/discover_xt.h
+src/filesys_xt.cc
+src/filesys_xt.h
+src/hashtab_xt.cc
+src/hashtab_xt.h
+src/ha_pbxt.cc
+src/ha_pbxt.h
+src/ha_xtsys.cc
+src/ha_xtsys.h
+src/heap_xt.cc
+src/heap_xt.h
+src/index_xt.cc
+src/index_xt.h
+src/linklist_xt.cc
+src/linklist_xt.h
+src/locklist_xt.cc
+src/locklist_xt.h
+src/lock_xt.cc
+src/lock_xt.h
+src/memory_xt.cc
+src/memory_xt.h
+src/myxt_xt.cc
+src/myxt_xt.h
+src/pbms.h
+src/pbms_enabled.cc
+src/pbms_enabled.h
+src/pthread_xt.cc
+src/pthread_xt.h
+src/restart_xt.cc
+src/restart_xt.h
+src/sortedlist_xt.cc
+src/sortedlist_xt.h
+src/strutil_xt.cc
+src/strutil_xt.h
+src/systab_xt.cc
+src/systab_xt.h
+src/tabcache_xt.cc
+src/tabcache_xt.h
+src/table_xt.cc
+src/table_xt.h
+src/thread_xt.cc
+src/thread_xt.h
+src/trace_xt.cc
+src/trace_xt.h
+src/util_xt.cc
+src/util_xt.h
+src/xaction_xt.cc
+src/xaction_xt.h
+src/xactlog_xt.cc
+src/xactlog_xt.h
+src/xt_config.h
+src/xt_defs.h
+src/xt_errno.h)
+
+SET(PBXT_PLUGIN_STATIC  "pbxt")
+MYSQL_ADD_PLUGIN(pbxt ${PBXT_SOURCES} STORAGE_ENGINE)
+
diff --git a/storage/pbxt/ChangeLog b/storage/pbxt/ChangeLog
new file mode 100644
index 00000000000..f0f9864d0d5
--- /dev/null
+++ b/storage/pbxt/ChangeLog
@@ -0,0 +1,964 @@
+PBXT Release Notes
+==================
+
+------- 1.0.11-7 Pre-GA - 2010-09-09
+
+RN336: Compiled and tested with MySQL 5.1.50.
+
+RN335: Fixed bug #523994: Deleting all records does not update table statistics. 
+
+RN334: Made a change to reduce the time that only temporary tables exist during the ALTER TABLE and REPAIR TABLE statements. This increases the chance of recovery if a crash occurs during these operations.
+
+RN333: Log name of table when PBXT recovers an index on startup. If an error occurs during index recovery, the index is set to "repair pending".
+
+RN332: Fixed an infinite loop when a record in a row is corrupt. Added logging and set the table to "repair pending" in this case.
+
+RN331: Fixed bug #626890: Crash on truncate table operation.
+
+RN330: Added additional checks for corruption of the index free list.
+
+------- 1.0.11-6 Pre-GA - 2010-07-08
+
+RN329: Fixed bug #601245: make fails. PBXT did not compile if the partition engine was disabled in the MySQL build.
+
+------- 1.0.11-5 Pre-GA - 2010-06-18
+
+RN328: Fixed bug #595478: Compile fails (1.0.11-4).
+
+------- 1.0.11-4 Pre-GA - 2010-06-15
+
+RN327: Fixed a bug that caused a crash during delete on the index. The crash occurred due to memory overwrite when a long key is promoted after a shorter key is deleted, and the difference causes a node size overflow.
+
+------- 1.0.11-3 Pre-GA - 2010-06-11
+
+RN326: Fixed bug #587740: pbxt-1.0.11-pre2-ga first time create partition table error. This was not a new bug. The problem was that the PBXT system tables' .frm files were corrupted when the first PBXT table created was a partition table.
+
+RN325: Fixed the "to-sweep" column output in xtstat.
+
+------- 1.0.11-2 Pre-GA - 2010-05-26
+
+RN324: Fixed bug #584070:pbxt-1.0.11-pre-ga does not work with mysql 5.1.47. This bug fix removes a hack which was done to avoid running into the LOCK_plugin lock.
+
+------- 1.0.11-1 Pre-GA - 2010-05-19
+
+RN323: Detect corruption of a key length in an index page. This bug fix avoids a possible crash due to index page corruption. 
+
+------- 1.0.11 Pre-GA - 2010-05-11
+
+RN322: Creating a table that references a non-existing table can now only be done if you set: foreign_key_checks = 0. Also fixed a failure when creating tables with recursive foreign key declarations.
+
+RN321: Added "Extended record count" to the CHECK TABLE output. This indicates the number of records that have a data log component.
+
+RN320: All tests now run with MySQL 5.1.46.
+
+------- 1.0.10n RC4 - 2010-04-28
+
+RN319: Fix RN1/3 and RN1/4 back-ported from 1.1: Fixed a deadlock that could occur during low index cache situations and added some checks for index corruption, and added the try lock variation for R/W locks.
+
+RN318: Fixed a bug in the atomic R/W lock. This bug occurred on multi-core Linux when under extreme load. The effect was that an index lookup could fail. The index was not corrupted.
+
+------- 1.0.10m RC4 - 2010-03-29
+
+RN317: This change prevents an unscheduled checkpoint from occurring when the sweeper has work to do. Checkpoints required because the checkpoint threshold has been reached are done as usual.
+
+------- 1.0.10k RC4 - 2010-03-29
+
+RN316: Set the maximum delay, while waiting for previous transactions to commit to 1/100s. This situation occurs when cleanup begins of a long running transaction.
+
+RN315: Fixed a bug that could lead to a data log error, for example: Data log not found: '.../dlog-129602.xt'. This error occurred after a duplicate key error, depending on the table structure, because the row buffer was not restored after writing an extended record.
+
+RN314: Server startup time could be very long when data logs become large because the log size was not saved in the header when a data log is full.
+
+------- 1.0.10j RC4 - 2010-03-24
+
+RN313: Fixed an error in the calculation of the handle data record (.xtd files) size when AVG_ROW_LENGTH is set explicitly to a value less than 12. For example:
+
+CREATE TABLE objs (
+  id int(10)  unsigned   NOT NULL,
+  objdata     mediumblob NOT NULL,
+  PRIMARY KEY (id)
+) ENGINE=PBXT AVG_ROW_LENGTH=10
+
+This table definition previously led to corruption of the table because the handle data record was set to 24 (14+10), which is less than the minimum (for variable length records) handle data record size of 26.
+
+This minimum consists of 14 byte record header and 12 bytes reference to the extended record data (the part of the record in the data log).
+
+Tip: when setting AVG_ROW_LENGTH you should normally add 12 to the average row length estimate to ensure that the average length part of the record is always in the handle data file. This is important, for example if you wish to make sure that the rows used to build indexes are in the handle data file. CHECK TABLE tells you how many rows are in the "fixed length part" of the record (output in MySQL error log). In the example above, this would be AVG_ROW_LENGTH=17.
+
+The maximum size of a field can be calculated adding the maximum byte size as described here: http://dev.mysql.com/doc/refman/5.1/en/storage-requirements.html, and then add the following values, depending on the byte size:
+
+byte size <= 240, add 1
+byte size < 2^16 (65536), add 3
+byte size < 2^24 (16777216), add 4
+byte size > 2^24, add 5
+
+------- 1.0.10i RC4 - 2010-03-17
+
+RN312: Fixed bug #534361: Valgrind error: write of uninitialised bytes in xt_flush_indices()
+
+RN311: Fixed ilog corruption when running out of disk space during an index flush operation, which led to corruption of the index.
+
+------- 1.0.10h RC4 - 2010-02-25
+
+RN310: Fixed Windows atomic INC/DEC operations, which led to the atomic R/W lock not working correctly. The result was that some index entries were not found.
+
+RN309: Fixed a bug that caused a crash when the index was corrupted. The crash occurs if the index page is not completely written, and an item in the index has a bad length.
+
+RN308: Fixed bug #509803: can't run tpcc (cannot compare FKs that rely on indexes of different length).
+
+------- 1.0.10g RC4 - 2010-02-11
+
+RN307:  2010-02-15: Set the internal version number 1.0.10g.
+
+RN306: All tests now run with MySQL 5.1.42.
+
+RN305: Fixed a bug that could cause a crash in filesort. The problem was that the return row estimate was incorrect, which caused the result of estimate_rows_upper_bound() to overflow to zero. Row estimate has been changed, and no longer takes into account deleted rows (so the row estimate is now a maximum).
+
+RN304: Fixed bug #513012: On a table with a trigger the same record is updated more than once in one statement
+
+------- 1.0.10f RC4 - 2010-01-29
+
+RN303: Fix RN1/10 back-ported from 1.1: Fixed a bug in the record cache that caused PBXT to think it had run out of cache memory. The effect was that PBXT used less and less cache over time. The bug occurs during heavy concurrent access on the record cache. The effect is that PBXT gets slower and slower.
+
+RN302: Fix RN1/11 back-ported from 1.1: Corrected a problem that sometimes caused a pause in activity when the record cache was full.
+
+------- 1.0.10e RC4 - 2010-01-25
+
+RN301: Fixed index statistics calculation. This bug led to the wrong indices being selected by the optimizer because all indices returned the same cost.
+
+RN300: Fixed bug #509968: START TRANSACTION WITH CONSISTENT SNAPSHOT breaks transactional flow.
+
+RN299: Fixed bug #509218: Server asserts with Assertion `mutex->__data.__owner == 0' failed on high concurrency OLTP test.
+
+------- 1.0.10d RC4 - 2010-01-11
+
+RN298: Fixed a bug that caused huge amounts of transaction log to be written when pbxt_flush_log_at_trx_commit = 2.
+
+------- 1.0.10c RC4 - 2009-12-29
+
+RN297: Updated "LOCK TABLES ... READ LOCAL" behavior to be more restrictive and compatible with InnoDB 
+
+RN296: Fixed bug #499026: START TRANSACTION WITH CONSISTENT SNAPSHOT does not work for PBXT
+
+------- 1.0.10 RC4 - 2009-12-18
+
+RN295: PBXT tests now all run with MySQL 5.1.41.
+
+RN294: Fixed bug #483714: a broken table can prevent other tables from opening
+
+RN293: Added system variable pbxt_flush_log_at_trx_commit. The value of this variable determines whether the transaction log is written and/or flushed when a transaction is ended. A value of 0 means don't write or flush the transaction log, 1 means write and flush and 2 means write, but do not flush. No matter which setting is chosen, the transaction log is written and flushed at least once per second.
+
+------- 1.0.09g RC3 - 2009-12-16
+
+RN292: Fixed a bug that resulted in 2-phase commit not being used between PBXT and the binlog. This bug was a result of a hack which was added to solve a problem in a pre-release version of MySQL 5.1. The hack was removed.
+
+------- 1.0.09f RC3 - 2009-11-30
+
+RN291: Fixed bug #489088: On shutdown MySQL reports: [Warning] Plugin 'PBXT' will be forced to shutdown.
+
+RN290: Fixed bug #345524: pbxt does not compile on 64 bit windows. Currently atomic operations are not supported on this platform.
+
+RN286: Fixed a bug introduced in RN281, which could cause an index scan to hang. The original change was to prevent a warning in Valgrind.
+
+RN285: Merged changes required to compile with Drizzle.
+
+RN284: Fixed a bug that caused the error "[ERROR] Invalid (old?) table or database name 'mysqld.1'", when running temp_table.test under MariaDB (thanks to Monty for his initial bug fix). Added a fix for partition table names as well.
+
+RN283: Added win_inttypes.h to the distribution. This file is only required for the Windows build.
+
+RN282: Fixed bug #451101: jump or move depends on uninitialised value in myxt_get_key_length
+
+RN281: Fixed bug #451080: Uninitialised memory write in XTDatabaseLog::xlog_append
+
+RN280: Fixed bug #451085: jump or move depends on uninitialised value in my_type_to_string
+
+RN279: Fixed bug #441000: xtstat crashes with segmentation fault on startup if max_pbxt_threads exceeded.
+
+------- 1.0.09e RC3 - 2009-11-20
+
+RN278: Fixed compile error with MySQL 5.1.41.
+
+------- 1.0.09d RC3 - 2009-09-30
+
+RN277: Added r/o flag to pbxt_max_threads server variable (this fix is related to bug #430637)
+
+RN276: Added test case for replication on tables w/o PKs (see bug #430716)
+
+RN275: Fixed bug #430600: 'Failed to read auto-increment value from storage engine' error.
+
+RN274: Fixed bug #431240: xtstat fails if no PBXT table has been created. xtstat now accepts --database=information_schema or --database=pbxt. Depending on this setting PBXT will either use the information_schema.pbxt_statistics or the pbxt.statistics table. If information_schema is used, then the statistics are displayed even when no PBXT table exists. Recovery activity is also displayed, unless pbxt_support_xa=1, in which case MySQL will wait for PBXT recovery to complete before allowing connections.
+
+RN273: Fixed bug #430633: XA_RBDEADLOCK is not returned on XA END after the transaction ended with a deadlock.
+
+RN272: Fixed bug #430596: Backup/restore does not work well even on a basic PBXT table with auto-increment.
+
+------- 1.0.09c RC3 - 2009-09-16
+
+RN271: Windows build update: now you can simply put the pbxt directory under <mysql-root>/storage and build the PBXT engine as a part of the source tree. The engine will be linked statically. Be sure to specify the WITH_PBXT_STORAGE_ENGINE option when running win\configure.js
+
+RN270: Correctly disabled PBMS so that this version now compiles under Windows. If PBMS_ENABLED is defined, PBXT will not compile under Windows because of a getpid() call in pbms.h.
+
+------- 1.0.09 RC3 - 2009-09-09
+
+RN269: Implemented online backup. A native online backup driver now performs BACKUP and RESTORE DATABASE operations for PBXT. NOTE: This feature is only supported by MySQL 6.0.9 or later.
+
+RN268: Implemented XA support. PBXT now supports all XA related MySQL statements. The variable pbxt_support_xa determines if XA support is enabled. Note: due to MySQL bug #47134, enabling XA support could lead to a crash. 
+
+------- 1.0.08d RC2 - 2009-09-02
+
+RN267: Fixed a bug that caused MySQL to crash on shutdown, after an incorrect command line parameter was given. The crash occurred because the background recovery task was not cleaned up before the PBXT engine was de-initialized.
+
+------- 1.0.08c RC2 - 2009-08-18
+
+RN266: Updated BLOB streaming glue, used with the PBMS engine. The glue code is now identical to the version of "1.0.08-rc-pbms" version of PBXT available from http://blobstreaming.org/download.
+
+RN265: Changes the sequential reading of data log files to skip gaps, instead of returning EOF. This ensures that extended data records are preserved even when something goes wrong with the way the file is written.
+
+RN264: Fixed a bug that caused a "Data log not found" error after an out of disk space error on a log file. This bug is similar to RN262 in that it allows "gaps" to appear in the data logs.
+
+RN263: Updated xtstat to compile on Windows/MS Visual C++.
+
+RN262: Merged changes for PBMS version 0.5.09.
+
+RN261: Concerning bug #377788: Cannot find index for FK. Fixed buffer overflow which occurred when the error was reported.
+
+RN260: Fixed bug #377788: Cannot find index for FK. PBXT now correctly uses prefix of an index to support FK references (e.g. if key = (c1, c2) then an index on (c1, c2, c3) will work). Also fixed buffer overflow, which occurred when reporting the error.
+
+RN259: Fixed bug #309424: xtstat doesn't use my.cnf. You can now add an [xtstat] section to my.cnf, for use with xtstat.
+
+RN258: updated xt_p_join implementation for Windows to check if a thread has already exited or has not yet started
+
+RN257: Removed false assertion that could fail during restore if a transaction log page was zero-filled
+
+RN256: Update datalog eof pointer only if write operations were successful
+
+RN255: Added re-allocation of the filemap if allocating the new map failed. This often happens if there's not enough space on disk.
+
+RN254: When a table with a corrupted index is detected, PBXT creates a file called 'repair-pending' in the pbxt directory, with the name of the table in it. Each table in the file is listed on a line by itself (the last line has no trailing \n). When the table is repaired (using the REPAIR TABLE command), this entry is removed from the file.
+
+RN253: Use fcntl(F_FULLFSYNC) instead of fsync on platforms that support it. Improper fsync operation was presumably the reason of index corruption on Mac OS X.
+
+RN252: Fixed bug #368692: PBXT not reporting data size correctly in information_schema.
+
+------- 1.0.08 RC2 - 2009-06-30
+
+RN251: A Windows-specific test update, also removed false assertion that failed on Windows.
+
+RN250: Fixed a bug that caused recovery to fail when the transaction log ID exceeded 255. The problem was a checksum failed in the log record.
+
+RN249: Fixed bug #313176: Test case timeout. This happened because record cache pages were not properly freed and as soon as the cache filled up the performance degraded.
+
+RN248: PBXT now compiles and runs with MySQL 5.1.35. All tests pass.
+
+RN247: Fixed bug #369086: Inconsistent/Incorrect Truncate behavior
+
+RN246: Fixed bug #378222: Drop sakila causes error: Cannot delete or update a parent row: a foreign key constraint fails
+
+RN245: Fixed bug #379315: Inconsistent behavior of DELETE IGNORE and FK constraint.
+
+RN244: Fixed a recovery problem: during the recovery of "record modified" action the table was updated before the old index entries were removed; then the xres_remove_index_entries was supplied the new record which led to incorrect index update.
+
+RN243: Fixed a bug that caused a recovery failure if partitioned pbxt tables were present. This happened because the recovery used a MySQL function to open tables and the PBXT handler was not yet registered
+
+RN242: Fixed a bug that caused a deadlock if pbxt initialization failed. This happened because pbxt cleanup was done from pbxt_init() with PLUGIN_lock being held by MySQL which led to a deadlock in the free-er thread
+
+RN241: Fixed a heap corruption bug (writing to a freed memory location). It happened only when memory mapped files were used leading to heap inconsistency and program crash or termination by heap checker. Likely to happen right after or during DROP TABLE but possible in other cases too.
+
+RN240: Load the record cache on read when not using memory mapped files.
+
+RN239: Added PBXT variable pbxt_max_threads. This is the maximum number of threads that can be created by PBXT. By default this value is set to 0 which means the number of threads is derived from the MySQL variable max_connections. The value used is max_connections+7. Under Drizzle the default value is 500.
+
+RN238: Added an option to wait for the sweeper to clean up old transactions on a particular connection. This prevents the sweeper from getting too far behind.
+
+RN237: Added an option to lazy delete fixed length index entries. This means the index entries are just marked for deletion, instead of removing the items from the index page. This has the advantage that an exclusive lock is not always required for deletion.
+
+RN236: Fixed bug #349177: a bug in configure.in script.
+
+RN235: Fixed bug 349176: a compiler warning.
+
+RN234: Completed Drizzle integration. All Drizzle tests now run with PBXT.
+
+RN233: Fixed bugs which occur when PBXT is used together with PBMS (BLOB Streaming engine).
+
+RN232: Merged Drizzle-specific changes into the main tree.
+
+RN231: Fixed a bug that caused bad performance as the number of threads increased. This occurred when the number of open table handles exceeded 'table_open_cache', and MySQL started closing open table handlers. PBXT was flushing a table when all table handlers were closed. PBXT will now only do this when the FLUSH TABLES statement is used.
+
+RN230: Improved efficiency of conflict resolution: Implemented a queue for threads waiting for a lock. Threads no longer poll to take a lock. If a temp lock is granted because of an update, then the thread granted the temp lock will also wait for the transaction that did the update to quit.
+
+RN229: Fixed bug #313391: LOAD DATA ... REPLACE broken.
+
+RN228: Fixed bug #341115: 'Out of memory' error (a bug in key comparison algorithm).
+
+RN227: Changed conflict handling to use spin locks and improve efficiency.
+
+RN226: Fixed bug #340316: Issue with bigint unsigned auto-increment field.
+
+RN225: Fixed bug #308557: UPDATE fails to match all rows in a transactional scenario.
+
+RN224: Fixed a deadlock which could occur during table scans.
+
+RN223: Index scans now use handles to cache buffers instead of making a copy of the index page. The handles are "copy-on-write".
+
+RN222: Fixed a bug that caused the server to hang on startup if PBXT ran out of record cache while waiting for the sweeper to complete.
+
+RN221: Fixed an index recovery bug. This occurred if the server crashed after operating in low index cache situations.
+
+RN220: Improved index selectivity estimation: added scanning from the end of index backwards.
+
+RN219: Fixed a problem: during intersected range scan not all fields were returned by engine to MySQL.
+
+RN218: Changed the way row locking (used by SELECT FOR UPDATE) works. Previously we locked a group of rows at once (although there were many groups). However, this caused conflicts even when the same rows were not locked. We now lock individual rows.
+
+RN217: Fixed bug #315564: Rollbacked inserts remain permanently in table.
+
+RN216: Added lock tracing. In DEBUG mode, each thread has a list of locks (semaphores, mutexes, r/w locks that it holds).
+
+RN215: Fixed a bug that caused a crash during restart if an index file was flushed during recovery.
+
+RN214: Fixed bug #310184: Deadlock when trying to wake up transactions
+
+RN213: Fixed an index corruption bug on SPARC Solaris. Note this error will occur on any machine that does not use the x86 (little endian) byte order.
+
+------- 1.0.07 RC - 2008-12-15
+
+RN212: Fixed build problems on NetBSD.
+
+RN211: Fixed build problems on FreeBSD.
+
+RN210: Fixed build problems on OpenSolaris.
+
+RN209: Added handling of the foreign_key_checks system flag.
+
+RN208: xtstat will now automatically reconnect if the connection to server is lost. 
+
+RN207: Foreign key references are now checked on CREATE TABLE.
+
+RN206: Fixed a crash if inserting into a table that has an FK that references a column that has no index on it.
+
+RN205: Added processing of foreign key action SET DEFAULT.
+
+RN204: Fixed an index recovery problem: unswept index entries were not recovered correctly 
+
+RN203: Fixed foreign key bug: REPLACE fails with 'on delete cascade'
+
+RN202: Fixes and updates to tests, now all tests pass on windows and linux.
+
+RN201: Fixed ref-counting for mmapped files.
+
+RN200: Fixed an index recovery problem: unswept index entries were not recovered correctly .
+
+RN199: Recovery now takes place on plug-in startup. Previously recovery occurred when the first PBXT table was accessed.
+
+RN198: Fixed a recovery bug that caused index entries to get out of sync with the data file.
+
+RN197: Improved the efficiency of group commit.
+
+RN196: Changed checkpointing so that it now works during idle time. Every record, row or index file flush now also contributes to the checkpoint (fuzzy checkpointing). Checkpointing is forced to complete after about 50% of the checkpoint threshold in order to ensure the correct maximum for log reading on recovery.
+
+RN195: Fixed scheduling bug that caused sweeper to get behind with the cleanup, which caused performance problems in high conflict situations. Foreground threads will now wait if the sweeper gets too far behind.
+
+RN194: Created the xtstat program which monitors the internal performance of PBXT. Run xtstat --help for more detailed information on the output.
+
+RN193: Implemented the pbxt.statistics virtual table. The statistics table returns information about the internal activity of the engine. This includes I/O byte counts, cache hit counts and usage, commit count, etc. 
+
+RN192: Due to timing issues in the engine API it could happen that the client received an OK for a committed transaction before the transaction was actually committed. This problem has been fixed.
+
+RN191: Fixed a bug that caused a hang when conflicts occurred while reading a covering index.
+
+RN190: Previously the sweeper delayed deletion of transaction structures until all transactions that were running during sweeping have quit. This is now handled by the same code that fixed the bug in RN189.
+
+RN189: Fixed a bug that could cause a row to go missing due to a visibility issue.
+
+RN188: Fixed a bug which occurred when using CREATE TABLE ... AVG_ROW_LENGTH=x, and the table contained BLOBs. In this case, alter table corrupted the table.
+
+RN187: Windows now stores paths in the location file in UNIX format by converting all '\' characters to '/'. Note that the location file is only cross-platform if the paths are relative (which is the default).
+
+RN186: Set version number to 1.0.07.
+
+------- 1.0.06 Beta 2 - 2008-11-06
+
+RN185: Disabled support for INSERT DELAYED because of MySQL bug #40505
+
+RN184: Implemented info(flag == HA_STATUS_AUTO) engine API call. This call returns the next value that will be assigned as auto-increment value on the table.
+
+RN183: Turned off streaming on Windows (see XT_STREAMING macro in sources)
+
+RN182: Switch code base to the latest version of BLOB streaming engine (PBMS): www.blobstreaming.org.
+
+RN181: Updated pbxt-test-run default parameters (--force is on, --default-storage-engine is pbxt, --base-dir is set according to config)
+
+RN180: PBXT can now cope with a missing .xti file (the file that contains the table indexes). This file can be regenerated using REPAIR TABLE.
+
+RN179: On recovery PBXT now creates a file called 'recovery-progress' in the pbxt database. The recovery percentage complete is written to this file as recovery progresses. Note that this file will not be created if no recovery is necessary or if PBXT estimates that it will read less than 10MB to do recovery.
+
+RN178: Fixed a problem in CHECK TABLE that caused memory corruption for fixed-size records
+
+RN177: Added "crash debugging". When enabled, crash debugging does the following:
+  - Create a core dump on Windows if the server crashes.
+  - Make a backup copy of the datadir directory before recovery if the server crashes.
+  - Keep at least 5 of the previous transaction logs.
+Currently crash debugging is disabled by default. To disable, create a file called 'no-debug' in the pbxt database folder, and restart the server. When crash debugging is disabled by default, it can be enabled by creating a file called 'crash-debug' in the pbxt database folder.
+
+RN176: Fixed a bug: a lock was not released appropriately
+
+RN175: Fixed some debug assertions
+
+RN174: Fixed some of test/mysql-test tests
+
+RN173: Fixed a RENAME TABLE bug, that prevented index files from being properly recreated
+
+RN172: Added the file ./pbxt/lock-pid. This file is locked while the server is running, and contains the process ID of the server. PBXT will return an error on startup if the file is locked or the process is still running in order to prevent a second server from being started.
+
+RN171: Implemented the AVG_ROW_LENGTH table attribute. When set, this value determines the size of the fixed length data component of a record. Normally this size is estimated depending on the column definitions. The command CHECK TABLE dumps the current average row length to the log. This can be used to find a suitable value for AVG_ROW_LENGTH.
+
+RN170: Changed configure so that debug/optimize flags set for building the engine override the flags set for MySQL. If --with-debug is not specified, then the engine will use the flags set when building MySQL. If MySQL was built with --with-debug=full, the DEBUG will be defined for the engine. When building the engine, the following flags can be set:
+  yes  - Debug symbols enabled, no optimization, DEBUG not defined.
+  full - Debug symbols enabled, no optimization, DEBUG defined.
+  only - Debug symbols enabled, MySQL flags used, DEBUG not defined.
+  prof - Profile code enabled, optimization on, DEBUG not defined.
+  no   - No debug symbols, optimization on, DEBUG not defined.
+
+RN169: Used MySQL root Makefile instead of config.status in order to extract settings (such as CFLAGS and CXXFLAGS) for the PBXT build.
+
+RN168: Fixed Windows build after merging changes for Drizzle.
+
+RN167: Fixed "This table requires primary key" error in sql-bench.
+
+RN166: Fixed threading problems that caused crashes in sql-bench.
+
+RN165: Added sql-bench to pbxt source tree.
+
+RN164: Ported PBXT to Drizzle. To compile for Drizzle DRIZZLED must be defined on the command line. The -drz.am and -drz.in files must be used when PBXT is embedded in Drizzle.
+
+RN163: Added "make test" build step. Running "make test" from the root of pbxt source tree will launch test/mysql-test/pbxt-test-run.pl with appropriate options to execute the pbxt functional test suite. On Windows where 
+pbxt is statically linked into mysql server binary pbxt testing works by going to test/mysql-test directory and running ./pbxt-test-run.pl with --base-dir argument pointing to a mysql source tree (mysql binaries are taken 
+from there) and passing the rest of usual arguments (--force --mysqld=--default-storage-engine=pbxt) 
+
+RN162: The 'pbxt' database must now be dropped explicitly. It is automatically created when the first PBXT table is created. After that, the pbxt database can be dropped once all PBXT tables have been dropped. Dropping the pbxt database will also cause all transaction (pbxt/system directory) and data logs (pbxt/data directory) to also be deleted.
+
+RN161: Added pbxt.location system table. This table can only be dropped when all PBXT tables have been deleted. Dropping the system table will cause all transaction (pbxt/system directory) and data logs (pbxt/data directory) to also be deleted.
+
+RN160: Made changes to run with MySQL 6.0.6.
+
+RN159: Changes to configure: added --with-plugindir=<path>, which should be used to specify the plugin directory. This means that --libdir should no longer be used. For backwards compatibility configure will still recognize this option if the path ends with 'plugin'.
+
+Also updated --help, to include all options, and better descriptions of the options.
+
+The configure options are now as follows:
+
+--with-mysql=<path> - (Required) It specifies the path to the MySQL source tree. The source should already be built. All other options will be taken from the MySQL build by default.
+--with-debug=yes/no - (Optional) Specify if the engine should be built with different debug options to the MySQL source tree.
+--with-plugindir=<path> - (Optional) Specify an alternative installation directory for the plugin. By default it will be installed in the plugin directory of the MySQL installation.
+
+
+RN158: Added support for core dumps on Windows. This can be enabled by defining XT_COREDUMP. On by default at the moment. If the server crashes a file called PBXTCore00000001.dmp will be created in the data directory. This file can be opened using MS VS.
+
+RN157: Fixed a compile problem with tv_nsec which is not supported on all platforms.
+
+RN156: Updated tests to run with MySQL 5.1.28.
+
+RN155: Errors during cascade update of VARCHAR values with trailing spaces
+
+RN154: Fixed a bug: impossible to create a foreign key that referenced an ENUM or SET column
+
+RN153: Fixed a bug that caused the following problems: #1. Foreign keys: crash if update cascade and autocommit=0 #2. Foreign keys: crash if update cascade and multi-level recursion
+
+RN152: Fixed missing information about foreign keys in I_S.table_constraints and I_S.referential_constraints
+
+------- 1.0.05 Beta - 2008-08-30
+
+RN151: "Quick config": It is now possible to configure the engine by just specifying the mysql source code tree (the --with-mysql option). The --libdir and --with-debug setting will be deduced automatically. 
+
+RN150: Added system variable pbxt_sweeper_priority, 0 = low (default), 1 = normal (same as user threads), 2 = high. The sweeper cleans up deleted records (deleted records also result from an update). If allowed to accumulate, these records can slow searches. Higher priority for the sweeper is recommended on systems with 4 or more cores.  
+
+RN149: Record cleanup is now initiated if a deleted record is found, and the transaction that deleted the record has ended. Since waking up the sweeper is an expensive operation, normally the sweeper will run every 1/10th of a second.
+
+RN148: Fixed a bug which caused transaction starvation (one transaction was constantly locked out) during high conflict updates. This led to cleanup of records not being done, which led to a general slow down.
+
+RN147: Fixed a problem with TRUNCATE TABLE: a failed TRUNCATE TABLE could put the engine into an invalid state that later caused a crash
+
+RN146: Fixed a bug that caused the error: "-49: Record format unknown, either corrupted or upgrade required".
+
+RN145: Added pbxt_db_offline_log_function system variable, 0 = recycle logs (default), 1 = delete logs (default on Mac OS X), 2 = keep logs.
+
+------- 1.0.04 Alpha - 2008-08-02
+
+RN144: Completed port and testing of Windows version.
+
+RN143: Fixed a bug which caused the free-er thread to hang. This was a result of an invalid operation ID, which was the result of the checkpointer flushing the table at the same time as a foreground thread.
+
+RN142: The fast RW/mutex lock can now handle nested calls. This is possible during a sequential scan.
+
+RN141: The normal behavior in MySQL is that an auto-increment value will be re-issued if you delete the row containing the current maximum auto-increment value and then restart the server. To prevent this you can use ALTER TABLE my_table AUTO_INCREMENT = <current-max-auto-increment> + 1, before deleting the current maximum auto-increment value.
+
+A new system variable, pbxt_auto_increment_mode, has been added so that this workaround is not necessary. When set to 0 (the default), auto-increment works as described above. When set to 1, the AUTO_INCREMENT value of the table is automatically incremented to prevent previously issued auto-increment values being returned.
+
+However, if the server crashes, a gap of up to 100 unique values can result, because the table AUTO_INCREMENT value is incremented in steps of 100.
+
+RN140: Index statistics are now automatically recalculated when the table row count exceeds 200. 
+
+RN139: Fixed a bug that caused index corruption, error: "int idx_push(index_xt.cc:172) -2: Core B-tree too deep".
+
+RN138: Handle startup and recovery when an index is corrupted.
+
+RN137: Fixed a bug in the zero wait R/W lock that caused the lock to fail (the state is extremely volatile, and must be written to memory after increment).
+
+RN136: Fixed a bug that caused the error "int xt_pwrite_file(filesys_xt.cc:789) errno (14): Bad address".
+
+RN135: Fixed TRUNCATE TABLE that did not work correctly when the table contained BLOBs stored in the BLOB streaming engine (www.blobstreaming.org).
+
+RN134: Fixed a bug that caused duplicate rows to be returned from an index scan (using a SELECT FOR UPDATE) if a concurrent update was done.
+
+RN133: Optimised PBXT for multi-processor scale-up. This mostly involved using different types of locks instead of the standard pthread mutex and reader/writer locks [TODO: 0038].
+
+------- 1.0.03 Alpha - 2008-05-30
+
+RN132: Fixed bug when using PBXT in conjunction with the BLOB streaming engine (www.blobstreaming.org). Uploaded BLOBs could not be inserted into a table.
+
+RN131: Fixed wait for background processes on shutdown. Shutdown will wait a maximum of 16 seconds for each process.
+
+RN130: Fixed calculation of bytes to be read for recovery.
+
+RN129: Fixed bug in cleanup of unterminated transactions.
+
+RN128: The writer will now start working when one of the following is true:
+- it is time for a checkpoint,
+- the log cache is almost full,
+- the free'er is waiting for the writer,
+- there is no other activity.
+
+RN127: Fixed checkpoint frequency. Checkpointing is now done correctly after 'pbxt_checkpoint_frequency' bytes.
+
+RN126: Implemented index consistent write [TODO: 0050].
+
+RN125: Implemented memory mapping for row pointer (.xtr) and handle data files (.xtd).
+
+RN124: Index files now use direct I/O.
+
+------- 1.0.02 Alpha - 2008-04-25
+
+RN123: Fixed compile errors with MySQL 5.1.24.
+
+------- 1.0.01 Alpha - 2008-03-28
+
+RN122: ++++ NOTE: This version is not compatible with older versions of PBXT ++++.
+
+RN121: Transaction logs are now global so that multi-database statements are now possible. This also makes it possible to work with PBXT temporary tables.
+
+RN120: Transaction logs pre-allocated and recycled.
+
+RN119: Transaction log writes on 512 byte boundaries only.
+
+------- 1.0.00 Alpha - 2008-03-10
+
+This version has alpha status because of the large number of changes done for full durability.
+
+RN118: ++++ NOTE: This version is incompatible to older versions of PBXT ++++.
+
+RN117: Documentation now available at http://www.primebase.org/documentation.
+
+RN116: Corrected the plug.in file so that PBXT compiles when dropped into the storage directory in the MySQL source tree.
+
+RN115: Compiled and tested with MySQL 5.1.23.
+
+RN114: Increased index block size. Minimum is now 4K. Default is 16K.
+
+RN113: Calculate index selectivity to return a more accurate value from records_in_range(). NOTE: FLUSH TABLES will update the index statistics, after data has been inserted or updated.
+
+RN112: Optimized table storage, saving 8 bytes per row.
+
+RN111: Optimized search on keys containing 2 or 3 not null integer values.
+
+RN110: Optimization: store the row ID in the index so that an index entry can be verified as current without loading the record. This is necessary to optimize an access with index coverage.
+
+RN109: Optimization: only load the record extended data if required.
+
+RN108: Implemented SHOW ENGINE PBXT STATUS;
+
+RN107: Added the following system variables:
+
+pbxt_index_cache_size - The amount of memory allocated to the index cache, used only to cache index data
+pbxt_record_cache_size - The amount of memory allocated to the record cache used to cache table data
+pbxt_log_cache_size - The amount of memory allocated to the transaction log cache used to cache transaction log data
+pbxt_log_file_threshold - The size of a transaction log before rollover, and a new log is created
+pbxt_transaction_buffer_size - The size of the global transaction log buffer (the engine allocates 2 buffers of this size)
+pbxt_log_buffer_size - The size of the buffer used to cache data from transaction and data logs during sequential scans, or when writing a data log
+pbxt_checkpoint_frequency - The amount of data written to the transaction log before a checkpoint is performed
+pbxt_data_log_threshold - The maximum size of a data log file
+pbxt_garbage_threshold - The percentage of garbage in a data log file before it is compacted
+
+RN106: PBXT now compiles for MySQL 6.0.3.
+
+RN104: Updates now lock a record temporarily. This prevents most "record changed" errors, however, it makes UPDATE statements a type of "committed read". This means that you may update a different value to that which you selected in repeatable read mode. To avoid this, use SELECT FOR UPDATE if you plan to UPDATE records after reading.
+
+RN103: Implemented SELECT FOR UPDATE. This is implemented by turning SELECT FOR UPDATE into a type of "committed read". This means that, if you do a SELECT followed by a SELECT FOR UPDATE you can get different results, even in repeatable read mode.
+
+RN102: Implemented recovery of index entries. Note: indexes are not yet fully consistent. This means that an index can become corrupted due to a crash. Data, however, cannot be lost. The indices can be rebuilt using REPAIR TABLE.
+
+RN101: Writing and flushing of a single transaction write-ahead log.
+
+RN100: Automatic rollover of transaction logs as they become full.
+
+RN99: Implementation of the transaction log cache.
+
+RN98: Group commit.
+
+RN97: Implementation of the writer thread that applies changes in the transaction log to the database.
+
+RN96: Implementation of the checkpointer thread that periodically flushes the database and writes a checkpoint which determines the recovery start point.
+
+RN95: Implementation of the free'er thread that is responsible for keeping the record cache at a preset level.
+
+RN94: Modifications to the record cache so that rows are stored in pages, in order to speed up sequence access.
+
+RN93: Implemented the recovery process which applies changes written to the log that are not in the database, on startup.
+
+RN92: Modification of the sweeper thread which cleans up rolled-back transactions and deleted data, to use the new transaction log format.
+
+RN91: Modifications to the data logs so that they use the same record structure as the transaction logs.
+
+RN90: The data logs are now managed "per database" in order to minimize the work done to flush and commit a transaction.
+
+RN89: Implementation of a file handle pool for the data logs.
+
+------- 0.9.91 Beta - 2007-10-30
+
+RN88: The format of the URL generated by MyBS has been changed. The format of the BLOB URLs is now as follows:
+
+'~*' <db-name> '/' <type-char> <table-id> '-' <blob-id> '-' <access-code> '-' <server-id>
+
+Where <type-char> is '_' or '~'.
+
+Examples: ~*test/_11-128-fbd590b-0, ~*test/~1-524-3dc45b09-0
+
+In other words, the character '>' has been replaced by '*', '^' has been replaced by '_' and ':' has been replaced by '~'. The reason for this is that the characters '>' and '^' are not allowed in URLs, and must be URL-encoded. The character ':' is reserved, but allowed.
+
+NOTE: This change makes this version incompatible with previous versions of MyBS. If you have a table with BLOB URLs, you can upgrade the URLs as follows:
+
+UPDATE blob_table SET blob_col = REPLACE(REPLACE(blob_col, '~>', '~*'), '/:', '/~');
+
+Replacing '^' is not necessary because BLOB URLs with '^' should not appear in tables. 
+
+------- 0.9.90 Beta - 2007-10-17
+
+RN87: Corrected stack trace of errors passed through the BLOB streaming API.
+
+RN86: Added new engine API accessor functions that appeared in 5.1.21 (thanks Stewart).
+
+RN85: Added plug.in file. PBXT now compiles when dropped into the storage directory of the MySQL build tree. However, you have to rebuild configure. For example:
+
+rm -rf autom4te.cache/
+aclocal
+autoconf
+autoheader
+automake -a
+./configure --help
+./configure --with-plugins=max --without-innodb --prefix=/usr/local/mysql --with-debug=full
+
+NOTE: ./configure --help should show that the PBXT has been included.
+
+RN84: Fixed several problems with shutdown of PBXT in combination with MyBS.
+
+------- 0.9.89 Beta - 2007-08-17
+
+RN83 (2007-08-21): Fixed a crash due to a compiler bug that does not like the construct *((xtWordPS *) &(v)) = (xtWordPS) (x) (macro allocr_() and alloczr_()).
+
+RN82: It is now possible to insert non-URL values into a LONGBLOB field; in the previous version this generated an "Invalid URL" error. Such values can be retrieved as a stream using a field reference.
+
+RN81: Fixed a bug that caused PBXT to crash during certain operations when MyBS was not installed.
+
+RN80: Set engine as capable of row-level replication, but not as statement replication. Statement replication does not work because MVCC is not serializable.
+
+------- 0.9.88 Beta - 2007-07-25
+
+RN79: Made some corrections in order to compile with MySQL 5.1.20.
+
+RN78: Support for the features of the MyBS BLOB Streaming engine, version 0.5 Alpha.
+
+RN77: Bugfix: The server crashes during BLOB data handling. The reason is the table field structure is shared, and may not be changed.
+
+------- 0.9.87 Beta - 2007-06-19
+
+RN76: The major feature of this release is support for the BLOB Streaming Engine. The current version enables the download of specific BLOB columns via the Streaming Engine. For example:
+
+use test;
+CREATE TABLE notes_tab (
+  n_id        INTEGER PRIMARY KEY,
+  n_text      BLOB
+) ENGINE=pbxt;
+INSERT notes_tab VALUES (1, "This is a BLOB streaming test!");
+
+The URL:
+
+http://localhost:8080/test/notes_tab/n_text/n_id=1
+
+will return the value "This is a BLOB streaming test!"
+
+RN75: Bugfix: MySQL prints error: "Plugin 'PBXT' will be forced to shutdown". This error was caused by the plug-in having a reference to itself.
+
+RN74: Added system variables pbxt_index_cache_size and pbxt_record_cache_size. These variables can now be set on the mysqld command line (for example: --pbxt_record_cache_size=50MB). The values are also displayed by SHOW VARIABLES.
+
+------- 0.9.86 Beta - 2007-04-07
+
+RN74: ++++ NOTE: This version is incompatible to older versions of PBXT ++++.
+
+In order to upgrade, install the older version of PBXT. Convert all tables to MyISAM using ALTER TABLE t1 ENGINE=MyISAM. Then install the new version of PBXT and convert back using ALTER TABLE t1 ENGINE=PBXT.
+
+RN73: Each table will now use a maximum of 4 data log files. This means a maximum of 7 files per table. The minimum is 3 for tables that do not have a variable field that exceeds about 40 bytes in size. This means that under Linux PBXT requires a maximum of 7 file handles per table used. Windows' lack of pread/pwrite (atomic seek and read/write) functions means it requires a file handle per file per open table handler. [TODO: 0044]
+
+RN72: All threads now write to the same data log file. Recovery and compaction take this fact into account. Each thread still writes its own transaction log.
+
+RN71: Removed all directory scans when creating and dropping table. Increased the table limit to 10000.
+
+RN70: Changed locking to avoid a deadlock when TRUNCATE TABLE is used together with other DML.
+
+RN69: procedures and functions are now considered atomic, and execute in a single transaction.
+
+RN68: Bug fixed: all files are now correctly flushed before commit.
+
+------- 0.9.85 Beta - 2007-03-15
+
+RN67: Changed the implementation of the pushsr_ and allocr_ macros because "*((void **) &(v) = " caused a crash due to a compiler error on some platforms (thanks Luciano for your help on this one and RN66).
+
+RN66: Fixed a bug that caused PBXT to corrupt the index file when the size exceeded 4GB. [TODO: 0031]
+
+RN65: PBXT now runs under Windows. This source tree must be placed in the MySQL source storage directory in order to compile. Further details of how to build are in the windows-readme.txt file. [TODO: 0027]
+
+RN64: Improved speed of table lookup by ID after a table has been deleted. The sweeper needs to ignore these records. Scanning the directory each time was too slow.
+
+RN63: Added checking for repeat update of a record in a statement.
+
+RN62: Committed read no longer blocks due to a change made by another transaction (the XT_REPEATABLE_READ_BLOCKS define, turns blocking on).
+
+RN61: Avoid checking for duplicates if an index is not modified by an update.
+
+RN60: Records updated repeatedly by a transaction are now updated in place. [TODO: 0040]
+
+------- 0.9.8 Beta - 2007-01-30
+
+RN59: Reduced the number of file handles used to a maximum of one per file. This assumes that pread() and pwrite() allows multiple threads to use the same file handle (according to my tests, this is the case).
+
+RN58: Added the configure flag --with-debug=only which compiles a version of the plug-in with debug symbols that will link to a non-debug MySQL server.
+
+RN57: Changed error number returned on lock from 1205 (lock timeout) to 1020 (optimistic lock failure).
+
+RN56: Added UNIX environment variable for PBXT system parameters. These must be set before starting mysqld, for example:
+
+setenv pbxt_index_cache_size 400MB
+setenv pbxt_record_cache_size "1 GB"
+
+Values are in bytes unless one of the following units is specified: GB, MB, Kb
+
+RN55: Fixed a bug which prevented VARCHAR values from being compressed correctly when stored in variable length rows.
+
+RN54: Fixed a bug which caused a crash when PBXT was used with MySQL 5.1.14. This bug also caused data to be corrupted on insert.
+
+RN53: Set query caching mode to transactional. [TODO: 0027]
+
+RN52: Added conditions so that the engine compiles with MySQL 5.1.14 and 5.1.13.
+
+------- 0.9.74 Beta - 2006-12-14
+
+RN51: DELETE FROM <table>; is no longer implemented by  re-creating the table. This statement now works by deleting all rows. TRUNCATE is implemented as before, by re-creating the table.
+
+RN50: The test scripts innodb.test and innodb-mysql.test have been modified to run with PBXT.
+
+RN49: [TODO: 0020] Implemented foreign keys. Functionality is identical to InnoDB with 2 exceptions:
+
+* Data types of referenced columns must be an exact match (e.g. you cannot mix VARCHAR and CHAR values).
+* Currently an exact matching index is required on referenced columns (i.e. the index may not have more columns than the columns used in the foreign key definition).
+
+Also note the following:
+
+* It is possible to create foreign keys that reference non-existent tables or columns. An error will occur when updating a table with an incorrect foreign key declaration.
+* If you alter the data-type of a column referenced by a foreign key, you need to set foreign_key_checks=0; or an error will occur.
+
+RN48: Fixed a bug in the implementation of indexes on ENUM and SET types.
+
+RN47: Fixed a bug that caused a crash when an index was placed on a BLOB column, and data was retrieved from the index directly.
+
+------- 0.9.73 Beta - 2006-10-31
+
+RN46: Updated test scripts to run with MySQL 5.1.13.
+
+------- 0.9.72 Beta - 2006-10-19
+
+RN45: Corrected compilation errors that occurred due to a change to struct st_mysql_plugin.
+
+------- 0.9.71 Beta - 2006-10-04
+
+RN44: Corrected compilation errors that occurred due to changes in the storage engine API.
+
+------- 0.9.7 Beta - 2006-09-20
+
+RN43: This is the first Beta release of PrimeBase XT. It has been integrated into MySQL 4.1.21 and is available as a plug-in for MySQL 5.1.12, or later. This version has been extensively tested using mysql-test-run, on various Linux and Mac OS X platforms.
+
+RN42: ++++ NOTE: This version is incompatible to older versions of PBXT ++++. Files created by older versions cannot be opened by version 0.9.7.
+
+RN41: Renaming or deleting a table while using a name with different case to the original created name did not work.
+
+RN40: Fixed a bug when grouping and searching on indexed columns that contain a null.
+
+RN39: Fixed bugs related to trailing spaces on VARCHAR values. Values that only vary by the number of trailing spaces (for example "aa" and "aa "), are now correctly handled as identical.
+
+RN38: The default AUTO_INCREMENT value was not correctly preserved during ALTER TABLE.
+
+RN37: Created a MySQL 5.1 Plugin version of PBXT. [TODO: 0017]
+
+RN36: Fixed a race condition in the row cache which had the effect that inserted rows disappeared after cleanup because the cache was out of date. I was only able to reproduce this error on multi-processor machines.
+
+------- 0.9.6 - 2006-08-05
+
+RN35: ++++ NOTE: This version is incompatible to older versions of PBXT ++++.
+
+The disk format of tables and log files has changed slightly in this version. As a result, files created by older versions cannot be opened by version 0.9.6. An error will be generated. If you have data you wish to preserve, first start the older version of XT and convert all tables to MyISAM. Then stop the server and remove all transaction log files (files of the form xtlog-*.xt). Then start the new version and convert tables back to XT.
+
+RN34: Implemented READ COMMITTED transaction mode. XT now supports READ COMMITTED and SERIALIZABLE transaction modes. NOTE: if the mode is set to REPEATABLE READ, SERIALIZABLE is used. If the mode is set to READ UNCOMMITTED, READ COMMITTED is used.
+
+RN33: The implementation of AUTO_INCREMENT on a partial index is non-standard. A unique value is generated without regard to the value of the index prefix. For example, assume we have the following table: CREATE TABLE t1 (c1 CHAR(10) not null, c2 INT not null AUTO_INCREMENT, PRIMARY KEY(c1, c2));
+
+With the following contents:   c1 c2
+                               A  8
+                               B  1
+
+After executing the following statement: insert into t1 (c1) values ('B');
+
+This is the result using PBXT: c1 c2
+                               A  8
+                               B  1
+                               B  9
+
+The standard result would be:  c1 c2
+                               A  8
+                               B  1
+                               B  2
+
+RN32: PBXT does not permit access to multiple databases within a single transaction. For example:
+
+begin;
+update database_1.t1 set a=10;
+update database_2.t2 set d=10;
+commit;
+
+In this case the following error is returned: 1015: Can't lock file (errno: -1)
+
+RN31: The implementation of COUNT(*) has changed. For efficiency, rows are not counted. The information is taken from the header of the record (.xtr) files. This information is only 100% accurate after transaction cleanup has completed, which basically means only when PBXT is idle. ANALYZE TABLE waits for all background activity to stop, so the statement may be executed before a COUNT(*) to ensure an accurate result. NOTE: Other than waiting for background processes, ANALYZE TABLE is not implemented.
+
+RN30: Two concurrency bugs have been fixed: a shared lock was used instead of an exclusive lock when deleting from a transaction list, and the transaction segment semaphore was not initialized. XT now runs correctly in a multi-processor environment. The test used was sysbench on a dual-processor, dual-core, AMD 64-bit machine running SUSE Linux 10.0.
+
+RN29: PBXT compiles and runs under 64-bit Linux. [TODO: 0009]
+
+RN28: ./mysql-test-run --force --mysqld=--default-storage-engine=pbxt will now execute most tests successfully. Changes to the tests and the result have been documented in http://www.primebase.com/xt/download/pbxt-test-run-changes.txt. [TODO: 0004, 0019]
+
+RN27: Fixed a bug that caused the server to crash when using table locks and transactions. For example: LOCK TABLES, BEGIN, COMMIT, SELECT. This sequence now returns an error. The correct sequence is:
+
+LOCK TABLES, BEGIN, COMMIT, UNLOCK TABLES, SELECT
+or
+LOCK TABLES, BEGIN, COMMIT, BEGIN, SELECT, COMMIT, UNLOCK TABLES
+
+RN26: Fixed a concurrency problem which caused a number of threads to hang during the sysbench test - see RN30 above (bug reported by Vadim).
+
+RN25: Fixed a bug that caused the server to hang when ha_pbxt::create() and ha_pbxt::ha_open() were given different, but equivalent paths for a particular table.
+
+RN24: Fixed bug in the indexing of blob columns, for example: create table t1(name_id int, name blob, INDEX name_idx (name(5)));
+
+RN23: When a duplicate key error occurs in auto-commit mode, the transaction is now rolled back.
+
+RN22: Fixed incorrect duplicate key error. In the case of a unique key which allows NULLs, duplicates are allowed if the inserted key contains a NULL. For example:
+
+create table t1 (id int not null, str char(10), unique(str));
+insert into t1 values (1, null),(2, null),(3, "foo"),(4, "bar");
+
+RN21: PBXT now returns the correct error code on duplicate key: 1062 instead of 1022.
+
+RN19: Implemented AUTO_INCREMENT on partial keys. However, the XT implementation is non-standard. Increment of partial index works, but the ID generated is incremented like a non-partial index. For example:
+
+create table t1 (c1 char(10) not null, c2 int not null auto_increment, primary key(c1, c2));
+select * from t1;
+c1 c2
+A  8
+B  1
+
+insert into t1 (c1) values ('B');
+select * from t1;
+c1 c2
+A  8
+B  1
+B  9
+
+The standard result would be:
+c1 c2
+A  8
+B  1
+B  2
+
+RN18: Implemented TRUNCATE TABLE and DELETE FROM <table>; (i.e. a DELETE without WHERE clause). Previously DELETE FROM <table>; did not cause an error, but no rows were deleted (TRUNCATE TABLE returned an error). [TODO: 0012, 0022]
+
+RN17: Implemented CREATE TABLE (...) auto_increment=<value>;
+
+------- 0.9.51 - 2006-07-06
+
+RN16: Fixed crash which could occur when creating the first table in a database (bug reported by Hakan). 
+
+------- 0.9.5 - 2006-07-03
+
+RN15: This version concludes the re-structuring of the PBXT implementation. I have made a number of major changes, including:
+
+- All files except the transaction logs are now associated with a particular table. All table related files begin with the name of the table. The extension indicates the function. 
+
+- I have merged the handle and the fixed length row data for performance reasons.
+
+- Only the variable size component of a row is stored in the data log files. As a result the data logs can now be considered as a type of "overflow" area.
+
+- Memory mapped files are no longer used because it is not possible to flush changes to the disk.
+
+RN14: File names have the following forms:
+
+[table-name]-[table-id].xtr - These files contain the table row pointers. Each row pointer occupies 8 bytes and refers to a list of records. The file name also contains the table ID. This is a unique number which is used internally by XT to identify the table.
+
+[table-name].xtd - This file contains the fixed length data of a table. Each data item includes a handle and a record. The handle references a record in the data log file if the table contains variable length records.
+
+[table-name].xti - This file contains the index data of the table.
+
+[table-name]-[log-id].xtl - This is a data log file. It contains the variable length data of the table. A table may have any number of data log files, each with a unique ID.
+
+xtlog-[log-id].xt - These files are the transaction logs. Log entries that specify updates reference a data file record. Each active thread has its own transaction log in order to avoid contention.
+
+RN13: Fixed the bug "Hang on DROP DATABASE". [TODO: 0016] 
+
+RN12: PBXT currently only supports the "Serializable" transaction isolation level. This is the highest isolation level possible and includes the "repeatable-read" functionality [TODO: 0015]. This is implemented by giving every transaction a snapshot of the database at the point when the transaction is started.
+
+If the transaction tries to update a record that was updated by some other transaction after the snapshot was taken, a locked error is returned. A deadlock can occur if 2 transactions update the same record in a different order. PBXT can detect all deadlocks.
+
+RN11: I have implemented write buffering on the table data files. [TODO: 0013]
+
+RN10: The unique constraint (UNIQUE INDEX/PRIMARY KEY) is now checked correctly. [TODO: 0008]
+
+RN9: I have implemented a conventional B-tree algorithm for the indices (instead of the Lehman and Yao B*-link tree). Although this reduces concurrency it improves the performance of queries significantly because of the simplicity of the algorithm. Deletion is also implemented in a very simple manner. [TODO: 0007]
+
+RN8: PBXT now has only 2 caches [TODO: 0006]:
+
+The Index Cache (pbxt_index_cache_size): This is the amount of memory the PBXT storage engine uses to cache index data and row pointers. This is all the data in the files with the extensions '.xti' and '.xtr'. This cache is managed in blocks of 2K.
+
+The Record Cache (pbxt_record_cache_size): This is the amount of memory the PBXT storage engine uses to cache table row data (handles and records). This is all the data in the files with the extension '.xtd'.
+
+The sizes of the caches are determined by the values of the system variables pbxt_index_cache_size and pbxt_record_cache_size. By default these values are set to 32MB.
+
+RN7: Auto-increment is now implemented in memory. This is done by doing a MAX() select when a table is first opened to get the high value. After that, the high value is incremented in memory on INSERT. On UPDATE (or INSERT) the value in memory is adjusted if necessary. This method also makes it possible for rows to be inserted simultaneously on the same table. [TODO: 0005, 0014]
+
+RN6: ./run-all-tests --create-options=TYPE=PBXT succeeds. [TODO: 0004]
+
+RN5: Using sql-bench and my own Java based test I have confirmed that PBXT behaves correctly during multi-threaded access. [PARTIALLY TODO: 0002]
+
+RN4: Load/Stability test. Using sql-bench I have tested PBXT under load over a long period of time. [PARTIALLY TODO: 0001]
+
+------- 0.9.2 - 2006-04-01
+
+RN3: Fixed a bug that caused the error "-6: Handle is out of range: [0:0]".
+
+RN2: Implemented SET, ENUM and YEAR data types.
+
+RN1: Fixed a bug in the error reporting when a table is created with a datatype that is not supported. [TODO: 0011]
+
+
diff --git a/storage/pbxt/INSTALL b/storage/pbxt/INSTALL
new file mode 100644
index 00000000000..23e5f25d0e5
--- /dev/null
+++ b/storage/pbxt/INSTALL
@@ -0,0 +1,236 @@
+Installation Instructions
+*************************
+
+Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005 Free
+Software Foundation, Inc.
+
+This file is free documentation; the Free Software Foundation gives
+unlimited permission to copy, distribute and modify it.
+
+Basic Installation
+==================
+
+These are generic installation instructions.
+
+   The `configure' shell script attempts to guess correct values for
+various system-dependent variables used during compilation.  It uses
+those values to create a `Makefile' in each directory of the package.
+It may also create one or more `.h' files containing system-dependent
+definitions.  Finally, it creates a shell script `config.status' that
+you can run in the future to recreate the current configuration, and a
+file `config.log' containing compiler output (useful mainly for
+debugging `configure').
+
+   It can also use an optional file (typically called `config.cache'
+and enabled with `--cache-file=config.cache' or simply `-C') that saves
+the results of its tests to speed up reconfiguring.  (Caching is
+disabled by default to prevent problems with accidental use of stale
+cache files.)
+
+   If you need to do unusual things to compile the package, please try
+to figure out how `configure' could check whether to do them, and mail
+diffs or instructions to the address given in the `README' so they can
+be considered for the next release.  If you are using the cache, and at
+some point `config.cache' contains results you don't want to keep, you
+may remove or edit it.
+
+   The file `configure.ac' (or `configure.in') is used to create
+`configure' by a program called `autoconf'.  You only need
+`configure.ac' if you want to change it or regenerate `configure' using
+a newer version of `autoconf'.
+
+The simplest way to compile this package is:
+
+  1. `cd' to the directory containing the package's source code and type
+     `./configure' to configure the package for your system.  If you're
+     using `csh' on an old version of System V, you might need to type
+     `sh ./configure' instead to prevent `csh' from trying to execute
+     `configure' itself.
+
+     Running `configure' takes awhile.  While running, it prints some
+     messages telling which features it is checking for.
+
+  2. Type `make' to compile the package.
+
+  3. Optionally, type `make check' to run any self-tests that come with
+     the package.
+
+  4. Type `make install' to install the programs and any data files and
+     documentation.
+
+  5. You can remove the program binaries and object files from the
+     source code directory by typing `make clean'.  To also remove the
+     files that `configure' created (so you can compile the package for
+     a different kind of computer), type `make distclean'.  There is
+     also a `make maintainer-clean' target, but that is intended mainly
+     for the package's developers.  If you use it, you may have to get
+     all sorts of other programs in order to regenerate files that came
+     with the distribution.
+
+Compilers and Options
+=====================
+
+Some systems require unusual options for compilation or linking that the
+`configure' script does not know about.  Run `./configure --help' for
+details on some of the pertinent environment variables.
+
+   You can give `configure' initial values for configuration parameters
+by setting variables in the command line or in the environment.  Here
+is an example:
+
+     ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix
+
+   *Note Defining Variables::, for more details.
+
+Compiling For Multiple Architectures
+====================================
+
+You can compile the package for more than one kind of computer at the
+same time, by placing the object files for each architecture in their
+own directory.  To do this, you must use a version of `make' that
+supports the `VPATH' variable, such as GNU `make'.  `cd' to the
+directory where you want the object files and executables to go and run
+the `configure' script.  `configure' automatically checks for the
+source code in the directory that `configure' is in and in `..'.
+
+   If you have to use a `make' that does not support the `VPATH'
+variable, you have to compile the package for one architecture at a
+time in the source code directory.  After you have installed the
+package for one architecture, use `make distclean' before reconfiguring
+for another architecture.
+
+Installation Names
+==================
+
+By default, `make install' installs the package's commands under
+`/usr/local/bin', include files under `/usr/local/include', etc.  You
+can specify an installation prefix other than `/usr/local' by giving
+`configure' the option `--prefix=PREFIX'.
+
+   You can specify separate installation prefixes for
+architecture-specific files and architecture-independent files.  If you
+pass the option `--exec-prefix=PREFIX' to `configure', the package uses
+PREFIX as the prefix for installing programs and libraries.
+Documentation and other data files still use the regular prefix.
+
+   In addition, if you use an unusual directory layout you can give
+options like `--bindir=DIR' to specify different values for particular
+kinds of files.  Run `configure --help' for a list of the directories
+you can set and what kinds of files go in them.
+
+   If the package supports it, you can cause programs to be installed
+with an extra prefix or suffix on their names by giving `configure' the
+option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
+
+Optional Features
+=================
+
+Some packages pay attention to `--enable-FEATURE' options to
+`configure', where FEATURE indicates an optional part of the package.
+They may also pay attention to `--with-PACKAGE' options, where PACKAGE
+is something like `gnu-as' or `x' (for the X Window System).  The
+`README' should mention any `--enable-' and `--with-' options that the
+package recognizes.
+
+   For packages that use the X Window System, `configure' can usually
+find the X include and library files automatically, but if it doesn't,
+you can use the `configure' options `--x-includes=DIR' and
+`--x-libraries=DIR' to specify their locations.
+
+Specifying the System Type
+==========================
+
+There may be some features `configure' cannot figure out automatically,
+but needs to determine by the type of machine the package will run on.
+Usually, assuming the package is built to be run on the _same_
+architectures, `configure' can figure that out, but if it prints a
+message saying it cannot guess the machine type, give it the
+`--build=TYPE' option.  TYPE can either be a short name for the system
+type, such as `sun4', or a canonical name which has the form:
+
+     CPU-COMPANY-SYSTEM
+
+where SYSTEM can have one of these forms:
+
+     OS KERNEL-OS
+
+   See the file `config.sub' for the possible values of each field.  If
+`config.sub' isn't included in this package, then this package doesn't
+need to know the machine type.
+
+   If you are _building_ compiler tools for cross-compiling, you should
+use the option `--target=TYPE' to select the type of system they will
+produce code for.
+
+   If you want to _use_ a cross compiler, that generates code for a
+platform different from the build platform, you should specify the
+"host" platform (i.e., that on which the generated programs will
+eventually be run) with `--host=TYPE'.
+
+Sharing Defaults
+================
+
+If you want to set default values for `configure' scripts to share, you
+can create a site shell script called `config.site' that gives default
+values for variables like `CC', `cache_file', and `prefix'.
+`configure' looks for `PREFIX/share/config.site' if it exists, then
+`PREFIX/etc/config.site' if it exists.  Or, you can set the
+`CONFIG_SITE' environment variable to the location of the site script.
+A warning: not all `configure' scripts look for a site script.
+
+Defining Variables
+==================
+
+Variables not defined in a site shell script can be set in the
+environment passed to `configure'.  However, some packages may run
+configure again during the build, and the customized values of these
+variables may be lost.  In order to avoid this problem, you should set
+them in the `configure' command line, using `VAR=value'.  For example:
+
+     ./configure CC=/usr/local2/bin/gcc
+
+causes the specified `gcc' to be used as the C compiler (unless it is
+overridden in the site shell script).  Here is another example:
+
+     /bin/bash ./configure CONFIG_SHELL=/bin/bash
+
+Here the `CONFIG_SHELL=/bin/bash' operand causes subsequent
+configuration-related scripts to be executed by `/bin/bash'.
+
+`configure' Invocation
+======================
+
+`configure' recognizes the following options to control how it operates.
+
+`--help'
+`-h'
+     Print a summary of the options to `configure', and exit.
+
+`--version'
+`-V'
+     Print the version of Autoconf used to generate the `configure'
+     script, and exit.
+
+`--cache-file=FILE'
+     Enable the cache: use and save the results of the tests in FILE,
+     traditionally `config.cache'.  FILE defaults to `/dev/null' to
+     disable caching.
+
+`--config-cache'
+`-C'
+     Alias for `--cache-file=config.cache'.
+
+`--quiet'
+`--silent'
+`-q'
+     Do not print messages saying which checks are being made.  To
+     suppress all normal output, redirect it to `/dev/null' (any error
+     messages will still be shown).
+
+`--srcdir=DIR'
+     Look for the package's source code in directory DIR.  Usually
+     `configure' can determine that directory automatically.
+
+`configure' also accepts some other, not widely useful, options.  Run
+`configure --help' for more details.
+
diff --git a/storage/pbxt/Makefile.am b/storage/pbxt/Makefile.am
new file mode 100644
index 00000000000..371756c84be
--- /dev/null
+++ b/storage/pbxt/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = src bin
+
+EXTRA_DIST = CMakeLists.txt plug.in
diff --git a/storage/pbxt/NEWS b/storage/pbxt/NEWS
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/storage/pbxt/NEWS
diff --git a/storage/pbxt/README b/storage/pbxt/README
new file mode 100644
index 00000000000..52d7cf6c44e
--- /dev/null
+++ b/storage/pbxt/README
@@ -0,0 +1,19 @@
+PrimeBase XT for MySQL 5.1
+==========================
+
+This is the PrimeBase XT (PBXT) transactional storage engine for MySQL. PBXT is "pluggable", which means that it can be loaded dynamically by MySQL at runtime. It uses a unique "write-once" update strategy and MVCC (multi-version concurrency control) to provide optimal performance over a wide range of tasks.
+
+This package includes the complete source code for the engine. Although this is a standalone project it must be built against a compiled version of the MySQL 5.1 source tree, because it references header files used internally by the server.
+
+Details about how to build PBXT under UNIX or Windows, as a standalone plug-in or as part of the MySQL source code, are described in the documentation, which is available online at:
+
+http://www.primebase.org/documentation
+
+Bug reports, questions and comments can be sent directly to me.
+
+Thanks for your support!
+
+Paul McCullagh
+SNAP Innovation GmbH
+paul.mccullagh@primebase.org
+
diff --git a/storage/pbxt/TODO b/storage/pbxt/TODO
new file mode 100644
index 00000000000..b5782defb61
--- /dev/null
+++ b/storage/pbxt/TODO
@@ -0,0 +1,195 @@
+PBXT To-Do List
+===============
+
+My thanks to all who have downloaded and tested PBXT. If an issue you reported before the date below is not on this list, please e-mail me again. 
+
+------- 2008-12-09
+
+0063: The option for not using memory mapped files must be fixed.
+
+0062: Dynamic option for using memory mapping on a table (Dimitri).
+
+------- 2008-09-12
+
+0061: Add records per key result to ha_pbxt:info() call (Mark).
+
+------- 2008-08-31
+
+0060: Add table option to determine if a table should be memory mapped or not (also requested by Dimitri).
+
+0059: Add table options:
+  AVG_ROW_LENGTH [=] value
+  DATA DIRECTORY [=] 'absolute path to directory'
+  INDEX DIRECTORY [=] 'absolute path to directory'
+  MAX_ROWS [=] value
+
+------- 2008-03-28
+
+0058: Consolidate writes when changes in the log are applied to the database.
+
+------- 2008-03-07
+
+0057: Cluster updates onto a single page.
+
+0056: Add checksum to index and data pages.
+
+0055: When no index cache is available, the complete index must be flushed (not just single pages).
+
+0054: Optimize indexes by not creating indexes that are a complete sub-set of some other index. In this case we must be able to identify part of an index as unique. For example: primary key (a, b), index (a, b, c). Here we would just create index (a, b, c), and specify that the part (a, b) must be unique. Operations on (a, b) will be directed to index (a, b, c).
+
+0053: Check and test lock tables.
+
+0052: Cache data log data in the handle data cache. Must be purged when a handle data record is written.
+
+0051: Write data log data alternatively to the transaction log. The compactor must then compact transaction logs.
+
+0050: [RESOLVED: RN126] Implement consistent write for indexes.
+
+0049: [RESOLVED: RN114] Set the index block size to 4K, or 16K as used by InnoDB.
+
+0048: [RESOLVED: RN110] Add row ID to indexes. This should only be set once the row is cleaned by the sweeper. Then the row ID can be used to make a quick check if the row is the most recent version.
+
+------- 2007-06-19
+
+0047: Test build with ./configure --with-innodb under Linux (Vadim).
+
+0046: [RESOLVED: RN85] Add plug.in file to enable drop in compile under Linux.
+
+0045: Provide libstdc++.so.6 binaries (Vadim).
+
+0044: [RESOLVED: RN73] Limit number of file handles used per table (Brian).
+
+0043: XA (two-phase commit) support (Peter).
+
+------- 2007-03-13
+
+0042: [RESOLVED: RN108] Implement STATUS commands.
+
+0041: Implement index prefix compression.
+
+------- 2007-03-07
+
+0040: [RESOLVED: RN60] Update in-place when a transaction updates the same record more than once.
+
+0039: Set the number and size of the segments dynamically according to the amount of memory in the cache (and the number of CPUs?) (as discussed with: Peter & Vadim).
+
+0038: [RESOLVED: RN133] Improve the efficiency of the locks by using atomic compare and swap (Peter & Vadim).
+
+0037: [RESOLVED: RN133] Instead of a global LRU list, use a LRU list for segment of the cache (Peter & Vadim). [ Note: a global list using a TAS lock and change time (so that LRU is not always updated) is most efficient].
+
+0036: Add support for deferred foreign key checking (requested by: Mark).
+
+0035: [RESOLVED: RN71] Remove the 2000 table limit (reported by: Hakan).
+
+------- 2007-02-28
+
+0035: [RESOLVED: RN74, RN107] Build in the PBXT system parameters (currently they must be set using environment variables).
+
+0034: [RESOLVED: RN117] Initial documentation (yes, it must be done!)
+
+0033: Make the error code returned on lock error configurable.
+
+0032: [RESOLVED: RN65] Create a source code pluggable version for Windows.
+
+0031: [RESOLVED: RN66] PBXT corrupts the index file when the size exceeds 4 GB (reported by: Luciano)
+
+0030: [RESOLVED: RN102] Implement pbxt_index_flush_delay. Postpones index writing in order to speed up imports. [Resolution uses the fact that index entries that are missing are added during recovery. As a result, index flushing can be delayed.]
+
+0029: [RESOLVED: RN103] Implement SELECT ... FOR UPDATE (recommended by: Robin).
+
+------- 2007-02-14
+
+0028: Implement CREATE TABLE ... DATA/INDEX DIRECTORY (suggested by: Robin).
+
+------- 2006-12-06
+
+0027: [RESOLVED: RN53] Bug in pbxt with query caching (reported by: Giuseppe) caused violation of transaction isolation.
+
+------- 2006-08-05
+
+0026: Implement BACKUP and RESTORE table (planned for the first post release version).
+
+0025: Implement DISABLE/ENABLE KEYS. Works for FOREIGN KEYs, currently no plans to implement for disabling indexes.
+
+0024: Implement ANALYZE TABLE (planned for the first post release version).
+
+0023: Implement CHECK TABLE (planned for the first release candidate).
+
+0022: [RESOLVED: RN18] Implement TRUNCATE TABLE and DELETE FROM <table>; (i.e. a DELETE without WHERE clause). Currently this function does not cause an error, but no rows are deleted.
+
+------- 2006-07-06
+
+0021: [RESOLVED: RN28] .../mysql-test/mysql-test-run --force --mysqld=--default-storage-engine=pbxt produces a number of errors (reported by: Hakan): As far as I can tell some failures are unnecessary but others are bugs. All need to be checked.
+
+------- 2006-07-03
+
+0020: [RESOLVED: RN49] Implement referential integrity (planned for the first release candidate).
+
+------- 2006-04-01
+
+0019: [RESOLVED: RN28] mysql-test-run hangs on alter table (reported by: Hakan): Running a test like ./mysql-test-run.pl --mysqld=--default-storage-engine=pbxt, hangs on ALTER TABLE.
+
+0018: Implement GEOMETRY data type. Note: There are currently no plans to implement this feature.
+
+------- 2006-03-31
+
+0017: [RESOLVED: RN37] MySQL 5.x Version (reported by: Ronald, Giuseppe).
+
+0016: [RESOLVED: RN13] Hang on "DROP DATABASE" (reported by: Giuseppe). Load the world database (http://downloads.mysql.com/docs/world.sql) and convert all tables into PBXT. Then, the drop database command hangs.
+
+0015: [RESOLVED: RN12] Implement isolation level "repeatable read" (reported by: Giuseppe). Current PBXT only supports isolation level "committed read". This means committed data can be seen no matter when it was committed. Use SELECT ... FOR UPDATE to guarantee repeatable read, on data already read.
+
+0014: [RESOLVED: RN7] Two transactions cannot insert simultaneously if they use auto_increment (reported by: Giuseppe). See also 0005.
+
+0013: [RESOLVED: RN11] Implement buffered write (reported by: Giuseppe): Lack of buffered write leads to bad performance in operations such as ALTER TABLE ENGINE = PBXT and INSERT ... SELECT.
+
+0012: [RESOLVED: RN18] TRUNCATE does not work (reported by: Giuseppe)
+
+0011: [RESOLVED: RN2] Load Sakila Sample Database (reported by: Ronald): ALTER TABLE film ENGINE=PBXT; fails
+
+0010: [RESOLVED: RN6] sql-bench (reported by: Dmitry): ./run-all-tests --create-options=TYPE=PBXT fails.
+
+0009: [RESOLVED: RN29] 64-bit Linux (reported by: Hakan): PBXT currently does not compile under 64-bit Linux.
+
+------- 2006-03-16 
+
+0008: [RESOLVED: RN10] Enforcing the unique index constraint:
+
+An index declared as "unique" must return a "duplicate unique key" error when inserting a duplicate value. The difficult part of implementing this in PBXT is that we may encounter a duplicate value that has not yet been committed. The index reading thread must then wait for the transaction to commit or abort.
+
+0007: [RESOLVED: RN9] Cleaning up empty index nodes:
+
+The Lehman and Yao algorithm used for indexing does not describe a way of cleaning up empty index nodes on-the-fly. A search of the relevant literature for an algorithm also turns up empty-handed (periodic "reorg" is mostly suggested). I have subsequently devised an algorithm that will do the job. This needs to be implemented.
+
+0006: [RESOLVED: RN8] Cache Balancing:
+
+PBXT uses a number of small caches in order to improve concurrency (rather than one large cache). A process is required to manage the amount of cache memory used as a whole. The process must distribute the overall amount of memory available for caching over the small caches, according to demand.
+
+0005: [RESOLVED: RN7] Implement a faster auto-increment method
+
+Currently the auto-increment is handled by the default method used in MySQL. This is done by performing a "fetch-last" on the index for each insert to find the highest key value. This works well unless there are a large number of empty index nodes due to the problem described in (2) above.
+
+PBXT Testing To-Do List
+
+This is my first take on what still must be tested. My thanks to Ronald Bradford who is working on a generic testing framework that can be used to test PBXT.
+
+0004: [RESOLVED: RN6, RN28] MySQL Tests:
+
+Several tests (for mysql-test-run) written for other engines can be adapted and used to test PBXT.
+
+0003: [RESOLVED: RN30] Multi-processor Test:
+
+There is a difference between preemptive multitasking and true multitasking, which you have on a multi-processor (or dual core) machine. I don't expect any fundamental problems here, but it must be tested.
+
+0002: [RESOLVED: RN5, RN30, RN43] Multi-user/locking Test:
+
+How does the engine perform with a number of concurrent users running various transactions on a number of different tables?
+This is a difficult test to write because it needs to simulate a production situation. At least 2 or 3 machines are required for the test. The idea is not to use too much data so that a lot of conflicts may occur.
+
+0001: [RESOLVED: RN4, RN43] Load/Stability Test:
+
+How does the engine perform under heavy load over a long period of time? How stable is the engine on power outage, etc?
+
+The test could use a variation of the test program written for test (3) above. At least 3 test machines would be required. The test must be modified to cause as much activity as possible. The test should monitor the performance under load.
+
+
diff --git a/storage/pbxt/bin/Makefile.am b/storage/pbxt/bin/Makefile.am
new file mode 100644
index 00000000000..ab7b711a6f1
--- /dev/null
+++ b/storage/pbxt/bin/Makefile.am
@@ -0,0 +1,14 @@
+# Used to build Makefile.in
+
+INCLUDES =	-I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(top_srcdir)/storage/innobase/include \
+			-I$(top_srcdir)/sql \
+			-I$(srcdir) \
+			-I$(srcdir)/../src
+
+bin_PROGRAMS =		xtstat
+
+xtstat_SOURCES =	xtstat_xt.cc ../src/strutil_xt.cc
+
+xtstat_LDADD =		$(top_builddir)/libmysql/libmysqlclient.la
diff --git a/storage/pbxt/bin/xtstat_xt.cc b/storage/pbxt/bin/xtstat_xt.cc
new file mode 100644
index 00000000000..93b3d42e3f6
--- /dev/null
+++ b/storage/pbxt/bin/xtstat_xt.cc
@@ -0,0 +1,819 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2008-11-19	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <mysql.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+
+#include "strutil_xt.h"
+#include "util_xt.h"
+
+//#define DEBUG_INTERRUPT
+
+#define OPT_NONE		-1
+#define OPT_HELP		0
+#define OPT_HOST		1
+#define OPT_USER		2
+#define OPT_PASSWORD	3
+#define OPT_DATABASE	4
+#define OPT_PORT		5
+#define OPT_SOCKET		6
+#define OPT_DELAY		7
+#define OPT_PROTOCOL	8
+#define OPT_DISPLAY		9
+
+#define OPT_HAS_VALUE	1
+#define OPT_OPTIONAL	2
+#define OPT_INTEGER		4
+
+/* Cache sizes. Not set or read in this part of the file -- presumably filled in from server variables (TODO confirm). */
+llong		record_cache_size;
+llong		index_cache_size;
+llong		log_cache_size;
+
+/* One slot per statistic. NOTE(review): presumably running totals kept between polls -- confirm against the polling code. */
+llong		accumulative_values[XT_STAT_CURRENT_MAX];
+/* Number of entries of display_order[] currently in use (reset by determine_display_order(), incremented by add_statistic()). */
+int			columns_used;
+/* NOTE(review): looks like an "information_schema in use" flag -- confirm where it is set. */
+int			use_i_s = 0;
+
+/* The statistics columns to display, in output order. */
+struct DisplayOrder {
+	int			do_statistic;	/* Statistic ID, as passed to xt_get_stat_meta_data(). */
+	bool		do_combo;		/* Set by determine_display_order() when this column is paired with the column that follows it. */
+} display_order[XT_STAT_CURRENT_MAX];
+
+/*
+ * Command line option table.
+ *
+ * The entries must stay in OPT_xxx order: other code indexes the table
+ * directly with the option ID (e.g. options[OPT_DISPLAY]). The table is
+ * terminated by an entry with opt_id == OPT_NONE.
+ */
+struct Options {
+	int			opt_id;			/* One of the OPT_xxx IDs. */
+	const char	opt_char;		/* Single character form (-x), or 0 if there is none. */
+	const char	*opt_name;		/* Long form (--name). */
+	int			opt_flags;		/* Combination of OPT_HAS_VALUE, OPT_OPTIONAL, OPT_INTEGER. */
+	const char	*opt_desc;		/* Help text, printed by print_help(). */
+	const char	*opt_value_str;	/* String value (initialised to the default). */
+	int			opt_value_int;	/* Integer value (initialised to the default). */
+	bool		opt_value_bool;
+} options[] = {
+	{ OPT_HELP,		'?', "help",		0,
+		"Prints help text", NULL, 0, false },
+	{ OPT_HOST,		'h', "host",		OPT_HAS_VALUE,
+		"Connect to host", NULL, 0, false },
+	{ OPT_USER,		'u', "user",		OPT_HAS_VALUE,
+		"User for login if not current user", NULL, 0, false },
+	{ OPT_PASSWORD, 'p', "password",	OPT_HAS_VALUE | OPT_OPTIONAL,
+		"Password to use when connecting to server. If password is not given it's asked from the tty", NULL, 0, false },
+	{ OPT_DATABASE, 'd', "database",	OPT_HAS_VALUE,
+		"Database to be used (pbxt or information_schema required), default is information_schema", "information_schema", 0, false },
+	{ OPT_PORT,		'P', "port",		OPT_HAS_VALUE | OPT_INTEGER,
+		"Port number to use for connection", NULL, 3306, false },
+	{ OPT_SOCKET,	'S', "socket",		OPT_HAS_VALUE,
+		"Socket file to use for connection", NULL, 0, false },
+	{ OPT_DELAY,	'D', "delay",		OPT_HAS_VALUE | OPT_INTEGER,
+		"Delay in seconds between polls of the database", NULL, 1, false },
+	{ OPT_PROTOCOL,	0, "protocol",		OPT_HAS_VALUE,
+		"Connection protocol to use: default/tcp/socket/pipe/memory", "default", MYSQL_PROTOCOL_DEFAULT, false },
+	/* The display list is parsed with ',' as the separator (see determine_display_order()),
+	 * so the help text must say so too.
+	 */
+	{ OPT_DISPLAY,	0, "display",		OPT_HAS_VALUE,
+		"Columns to display: use short names separated by commas, partial match allowed", "time-msec,commt,row-ins,rec,ind,ilog,xlog,data,to,dirty", 0, false },
+	{ OPT_NONE,		0, NULL, 0, NULL, NULL, 0, false }
+};
+
+#ifdef XT_WIN
+#define atoll _atoi64
+#endif
+
+/*
+ * Append a statistic to the list of displayed columns.
+ *
+ * The request is ignored if the statistic is already in the list, or if
+ * the display_order[] array is full.
+ */
+void add_statistic(int stat)
+{
+	const int	max_columns = (int) (sizeof(display_order) / sizeof(display_order[0]));
+
+	/* Check if column has already been added: */
+	for (int i=0; i<columns_used; i++) {
+		if (display_order[i].do_statistic == stat)
+			return;
+	}
+
+	/* display_order[] has a fixed size, while the caller may pass more
+	 * distinct statistic IDs than it has slots. Never write past the end.
+	 */
+	if (columns_used >= max_columns)
+		return;
+
+	display_order[columns_used].do_statistic = stat;
+	display_order[columns_used].do_combo = false;
+	columns_used++;
+}
+
+/*
+ * Build display_order[] from the value of the "display" option.
+ *
+ * The option is a comma separated list of column specifications. Each
+ * specification is either "name" or "name1-name2", where the names are
+ * compared with the short header lines of each statistic. The special
+ * value "all" is replaced by a fixed list of column groups.
+ *
+ * If a specification matches no statistic, an error is printed to stderr
+ * and the program exits with status 1.
+ */
+void determine_display_order()
+{
+	const char			*cols = options[OPT_DISPLAY].opt_value_str;
+	char				column_1[21], column_2[21];
+	int					i;
+	bool				add, added, add_combo;
+	XTStatMetaDataPtr	meta, meta2;
+
+	if (strcmp(cols, "all") == 0)
+		cols = "time,xact,stat,rec,ind,ilog,xlog,data,to,sweep,scan,row";
+	columns_used = 0;
+	while (*cols) {
+		/* Copy the first name (up to '-' or ','), silently truncated to 20 characters: */
+		i = 0;
+		while (*cols && *cols != '-' && *cols != ',') {
+			if (i < 20) {
+				column_1[i] = *cols;
+				i++;
+			}
+			cols++;
+		}
+		column_1[i] = 0;
+		
+		/* Copy the optional second name, which follows a '-': */
+		i = 0;
+		if (*cols == '-') {
+			cols++;
+			while (*cols && *cols != '-' && *cols != ',') {
+				if (i < 20) {
+					column_2[i] = *cols;
+					i++;
+				}
+				cols++;
+			}
+		}
+		column_2[i] = 0;
+
+		if (*cols == ',')
+			cols++;
+
+		/* "ms" is accepted as an alias for "msec": */
+		if (strcmp(column_1, "ms") == 0)
+			strcpy(column_1, "msec");
+		if (strcmp(column_2, "ms") == 0)
+			strcpy(column_2, "msec");
+		/* "syncs/ms" selects the "syncs" statistic together with the statistic that follows it: */
+		add_combo = false;
+		if (strcmp(column_1, "syncs/ms") == 0) {
+			strcpy(column_1, "syncs");
+			add_combo = true;
+		}
+		if (strcmp(column_2, "syncs/ms") == 0) {
+			strcpy(column_2, "syncs");
+			add_combo = true;
+		}
+
+		/* Add every statistic that matches the specification: */
+		added = false;
+		for (i=0; i<XT_STAT_MAXIMUM; i++) {
+			meta = xt_get_stat_meta_data(i);
+			add = false;
+			if (strcmp(meta->sm_short_line_1, column_1) == 0) {
+				if (column_2[0]) {
+					/* "name1-name2": both header lines must match. */
+					if (strcmp(meta->sm_short_line_2, column_2) == 0)
+						add = true;
+				}
+				else {
+					/* "name" matching header line 1 selects the whole group: */
+					if (i != XT_STAT_XLOG_CACHE_USAGE)
+						add = true;
+				}
+			}
+			else if (!column_2[0]) {
+				/* A single name may also match header line 2: */
+				if (strcmp(meta->sm_short_line_2, column_1) == 0) {
+					/* XT_STAT_XLOG_CACHE_USAGE is ignored, unless explicitly listed! */
+					if (i != XT_STAT_XLOG_CACHE_USAGE)
+						add = true;
+				}
+			}
+			if (add) {
+				added = true;
+				add_statistic(i);
+				/* NOTE(review): assumes statistic i+1 is the partner of i -- confirm against the statistics table. */
+				if (add_combo)
+					add_statistic(i+1);
+			}
+		}
+		if (!added) {
+			if (column_2[0])
+				fprintf(stderr, "ERROR: No statistic matches display option: '%s-%s'\n", column_1, column_2);
+			else
+				fprintf(stderr, "ERROR: No statistic matches display option: '%s'\n", column_1);
+			fprintf(stderr, "Display options: %s\n", options[OPT_DISPLAY].opt_value_str);
+			exit(1);
+		}
+	}
+
+	/* Setup "combo" fields: a column flagged XT_STAT_COMBO_FIELD is marked as a
+	 * combo when the column directly after it is flagged XT_STAT_COMBO_FIELD_2
+	 * and has the same first header line.
+	 */
+	for (i=0; i<columns_used; i++) {
+		meta = xt_get_stat_meta_data(display_order[i].do_statistic);
+		if (meta->sm_flags & XT_STAT_COMBO_FIELD) {
+			if (i+1 < columns_used) {
+				meta2 = xt_get_stat_meta_data(display_order[i+1].do_statistic);
+				if (meta2->sm_flags & XT_STAT_COMBO_FIELD_2) {
+					if (strcmp(meta->sm_short_line_1, meta2->sm_short_line_1) == 0)
+						display_order[i].do_combo = true;
+				}
+			}
+		}
+	}
+}
+
+/*
+ * Format 'value' as a percentage of 'perc', using at most 4 characters.
+ *
+ * Percentages below 100 are printed with one decimal place ("25.0"),
+ * larger values without decimals ("100"). The text is cut off after
+ * 4 characters, and a trailing '.' left by the cut is removed.
+ *
+ * 'buffer' must have room for at least 5 bytes.
+ */
+void format_percent_value(char *buffer, double value, double perc)
+{
+	/* Format into a local buffer that is large enough for any double.
+	 * Formatting directly into the caller's buffer would write the complete
+	 * untruncated number (possibly hundreds of digits) before it is cut down.
+	 */
+	char	text[512];
+
+	value = value * (double) 100 / (double) perc;
+	if (value >= 100)
+		sprintf(text, "%.0f", value);
+	else
+		sprintf(text, "%.1f", value);
+	text[4] = 0;
+	if (text[3] == '.')
+		text[3] = 0;
+	strcpy(buffer, text);
+}
+
+#define XT_1_K				((double) 1024)
+#define XT_1_M				((double) 1024 * (double) 1024)
+#define XT_1_G				((double) 1024 * (double) 1024 * (double) 1024)
+#define XT_1_T				((double) 1024 * (double) 1024 * (double) 1024 * (double) 1024)
+#define XT_10000_K			((double) 10000 * XT_1_K)
+#define XT_10000_M			((double) 10000 * XT_1_M)
+#define XT_10000_G			((double) 10000 * XT_1_G)
+
+/*
+ * Format a byte count in a compact form.
+ *
+ * Values below 100000 are printed as a plain number. Larger values are
+ * scaled to K, M, G or T (powers of 1024) and printed with at most
+ * 4 characters followed by the unit, for example "97.7K" or "5120K".
+ * Values of 10000T or more are shown as "9999T".
+ */
+void format_byte_value(char *buffer, double value)
+{
+	double	dval;
+	char	string[100];
+	char	ch;
+
+	if (value < (double) 100000) {
+		/* byte value from 0 to 99999: */
+		sprintf(buffer, "%.0f", value);
+		return;
+	}
+
+	if (value < XT_10000_K) {
+		dval = value / XT_1_K;
+		ch = 'K';
+	}
+	else if (value < XT_10000_M) {
+		dval = value / XT_1_M;
+		ch = 'M';
+	}
+	else if (value < XT_10000_G) {
+		dval = value / XT_1_G;
+		ch = 'G';
+	}
+	else {
+		dval = value / XT_1_T;
+		ch = 'T';
+	}
+
+	if (dval < (double) 10.0)
+		sprintf(string, "%.2f", dval);
+	else if (dval < (double) 100.0)
+		sprintf(string, "%.1f", dval);
+	else {
+		/* "%.0f" rounds, so 9999.5 and above would print as "10000", which the
+		 * cut to 4 characters below would turn into "1000". Limit the number to
+		 * the largest value that fits into 4 digits instead.
+		 */
+		if (dval > (double) 9999)
+			dval = (double) 9999;
+		sprintf(string, "%.0f", dval);
+	}
+	/* Keep at most 4 characters, and do not end on a decimal point: */
+	if (string[3] == '.')
+		string[3] = 0;
+	else
+		string[4] = 0;
+	sprintf(buffer, "%s%c", string, ch);
+}
+
+/*
+ * Format a count using at most 2 characters.
+ *
+ * Uses:
+ * t = thousands
+ * m = millions
+ * b = billions
+ *
+ * Values below 100 are printed as they are. Otherwise the result is a
+ * single digit followed by the unit ("2t" = about 2 thousand), or a
+ * "less than" marker when the value has too many digits for the smaller
+ * unit ("<m" = 10 thousand or more, but less than a million).
+ * Values of 10 billion or more are shown as "9b".
+ */
+void format_mini_count_value(char *buffer, double value)
+{
+	double	dval;
+	char	string[100];
+	char	ch;
+
+	if (value < (double) 100) {
+		/* Value from 0 to 99: */
+		sprintf(buffer, "%.0f", value);
+		return;
+	}
+
+	if (value < (double) 1000) {
+		sprintf(buffer, "<t");
+		return;
+	}
+
+	if (value < (double) 10000) {
+		/* Value is from 1 thousand to less than 10 thousand */
+		dval = value / (double) 1000.0;
+		ch = 't';
+	}
+	else if (value < (double) 1000000) {
+		sprintf(buffer, "<m");
+		return;
+	}
+	else if (value < (double) 10000000) {
+		/* Value is from 1 million to less than 10 million */
+		dval = value / (double) 1000000.0;
+		ch = 'm';
+	}
+	else if (value < (double) 1000000000) {
+		sprintf(buffer, "<b");
+		return;
+	}
+	else {
+		/* Value is 1 billion or more */
+		dval = value / (double) 1000000000.0;
+		ch = 'b';
+	}
+
+	/* Only one digit is kept. Without this limit 9.5 and above is printed
+	 * as "10" (or more), and cutting that to 1 character would show "1".
+	 */
+	if (dval > (double) 9)
+		dval = (double) 9;
+	sprintf(string, "%1.0f", dval);
+	string[1] = 0;
+	sprintf(buffer, "%s%c", string, ch);
+}
+
+#define XT_1_THOUSAND		((double) 1000)
+#define XT_1_MILLION		((double) 1000 * (double) 1000)
+#define XT_1_BILLION		((double) 1000 * (double) 1000 * (double) 1000)
+#define XT_1_TRILLION		((double) 1000 * (double) 1000 * (double) 1000 * (double) 1000)
+#define XT_10_THOUSAND		((double) 10 * (double) 1000)
+#define XT_10_MILLION		((double) 10 * (double) 1000 * (double) 1000)
+#define XT_10_BILLION		((double) 10 * (double) 1000 * (double) 1000 * (double) 1000)
+#define XT_10_TRILLION		((double) 10 * (double) 1000 * (double) 1000 * (double) 1000 * (double) 1000)
+
+/*
+ * Format a count in a compact form.
+ *
+ * Negative values are printed as "0", values below 10 thousand as a plain
+ * number. Larger values are scaled to thousands (t), millions (m) or
+ * billions (b) and printed with at most 4 characters followed by the unit,
+ * for example "10.0t" or "2500m".
+ *
+ * NOTE(review): values of 10 trillion or more are scaled to trillions but
+ * also use the letter 't', which cannot be told apart from thousands.
+ */
+void format_count_value(char *buffer, double value)
+{
+	double	dval;
+	char	string[100];
+	char	ch;
+
+	if (value < (double) 0) {
+		strcpy(buffer, "0");
+		return;
+	}
+
+	if (value < XT_10_THOUSAND) {
+		/* Count from 0 to 9999: */
+		sprintf(buffer, "%.0f", value);
+		return;
+	}
+
+	if (value < XT_10_MILLION) {
+		/* Value is less than 10 million */
+		dval = value / XT_1_THOUSAND;
+		ch = 't';
+	}
+	else if (value < XT_10_BILLION) {
+		/* Value is less than 10 billion */
+		dval = value / XT_1_MILLION;
+		ch = 'm';
+	}
+	else if (value < XT_10_TRILLION) {
+		/* Value is less than 10 trillion */
+		dval = value / XT_1_BILLION;
+		ch = 'b';
+	}
+	else {
+		dval = value / XT_1_TRILLION;
+		ch = 't';
+	}
+
+	if (dval < (double) 10.0)
+		sprintf(string, "%.2f", dval);
+	else if (dval < (double) 100.0)
+		sprintf(string, "%.1f", dval);
+	else {
+		/* "%.0f" rounds, so 9999.5 and above would print as "10000", which the
+		 * cut to 4 characters below would turn into "1000". Limit the number to
+		 * the largest value that fits into 4 digits instead.
+		 */
+		if (dval > (double) 9999)
+			dval = (double) 9999;
+		sprintf(string, "%.0f", dval);
+	}
+	/* Keep at most 4 characters, and do not end on a decimal point: */
+	if (string[3] == '.')
+		string[3] = 0;
+	else
+		string[4] = 0;
+	sprintf(buffer, "%s%c", string, ch);
+}
+
+void print_help()	/* Print the usage text followed by one line per entry of the options table. */
+{
+	struct Options	*opt;
+	char			command[100];	/* option name plus the "=value" / "[=value]" hint */
+
+	printf("Usage: xtstat [ options ]\n");
+	printf("e.g. xtstat -D10 : Poll every 10 seconds\n");
+	opt = options;
+	printf("Options :-\n");
+	while (opt->opt_id != OPT_NONE) {	/* the table is terminated by an OPT_NONE entry */
+		strcpy(command, opt->opt_name);
+		if (opt->opt_flags & OPT_HAS_VALUE) {
+			if (opt->opt_flags & OPT_OPTIONAL)
+				strcat(command, "[=value]");
+			else
+				strcat(command, "=value");
+		}
+		if (opt->opt_char)	/* option also has a single-letter form */
+			printf("-%c, --%-16s %s.\n", opt->opt_char, command, opt->opt_desc);
+		else
+			printf("    --%-16s %s.\n", command, opt->opt_desc);
+		opt++;
+	}
+}
+
+void print_stat_key()	/* Print the legend for the unit suffixes used in the statistics output. */
+{
+	printf("Key :-\n");
+	printf("K = Kilobytes (1,024 bytes)\n");
+	printf("M = Megabytes (1,048,576 bytes)\n");
+	printf("G = Gigabytes (1,073,741,824 bytes)\n");	/* 2^30 */
+	printf("T = Terabytes (1,099,511,627,776 bytes)\n");
+	printf("t = thousands (1,000s)\n");
+	printf("m = millions  (1,000,000s)\n");
+	printf("b = billions  (1,000,000,000s)\n");
+}
+
+void print_stat_info()	/* Print one line per statistic: column heading, name and description. */
+{
+	XTStatMetaDataPtr	meta;
+	char				buffer[40];	/* heading in the form "line1-line2" */
+	char				desc[400];
+
+	printf("Statistics :-\n");
+	for (int i=0; i<XT_STAT_CURRENT_MAX; i++) {
+		meta = xt_get_stat_meta_data(i);
+		sprintf(desc, meta->sm_description, "milli");	/* the description is used as a format string with "milli" as its argument */
+		sprintf(buffer, "%s-%s", meta->sm_short_line_1, meta->sm_short_line_2);
+		if (meta->sm_flags & XT_STAT_COMBO_FIELD) {
+			/* Combine next 2 fields: the following entry is skipped and "/ms" is appended. */
+			i++;
+			strcat(buffer, "/ms");
+			strcat(desc, "/time taken in milliseconds");
+		}
+		printf("%-13s %-21s - %s.\n", buffer, meta->sm_name, desc);
+	}
+}
+
+bool match_arg(char *what, const char *opt, char **value)	/* true if what is "opt" or "opt=value"; on success *value is the text after '=' or NULL */
+{
+	while (*what && *opt && isalpha((unsigned char) *what)) {	/* cast: isalpha() on a negative char is undefined */
+		if (*what != *opt)
+			return false;
+		what++;
+		opt++;
+	}
+	if (*opt)	/* the argument ended before the whole option name was seen */
+		return false;
+	if (*what == '=')
+		*value = what + 1;
+	else if (*what)	/* trailing characters that are not "=value" */
+		return false;
+	else
+		*value = NULL;
+	return true;
+}
+
+void parse_args(int argc, char **argv)	/* Store command line values in the options table; exits on bad input or --help. */
+{
+	char			*ptr;
+	char			*value;
+	int				i = 1;
+	struct Options	*opt;
+	bool			found;
+
+	while (i < argc) {
+		ptr = argv[i];
+		found = false;
+		if (*ptr == '-') {
+			ptr++;
+			if (*ptr == '-') {
+				ptr++;	/* long form: --name or --name=value */
+				opt = options;
+				while (opt->opt_id != OPT_NONE) {
+					if (match_arg(ptr, opt->opt_name, &value)) {
+						found = true;
+						opt->opt_value_str = value;
+						opt->opt_value_bool = true;
+						break;
+					}
+					opt++;
+				}
+			}
+			else {
+				opt = options;	/* short form: -Xvalue or -X value */
+				while (opt->opt_id != OPT_NONE) {
+					if (*ptr == opt->opt_char) {
+						ptr++;
+						if (*ptr)
+							opt->opt_value_str = ptr;
+						else {
+							opt->opt_value_str = NULL;
+							if (i+1 < argc) {
+								ptr = argv[i+1];
+								if (*ptr != '-') {	/* the next argument is this option's value */
+									opt->opt_value_str = ptr;
+									i++;
+								}
+							}
+						}
+						found = true;
+						opt->opt_value_bool = true;
+						break;
+					}
+					opt++;
+				}
+			}
+		}
+		
+		if (!found) {
+			fprintf(stderr, "Unknown option: %s\n", argv[i]);
+			print_help();
+			exit(1);
+		}
+
+		if (opt->opt_flags & OPT_HAS_VALUE) {
+			if (!(opt->opt_flags & OPT_OPTIONAL)) {
+				if (!opt->opt_value_str) {
+					fprintf(stderr, "Option requires a value: %s\n", argv[i]);
+					fprintf(stderr, "Use --help for help on commands and usage\n");	/* same stream as the error itself */
+					exit(1);
+				}
+			}
+		}
+		else {
+			if (opt->opt_value_str) {
+				fprintf(stderr, "Option does not accept a value: %s\n", argv[i]);
+				fprintf(stderr, "Use --help for help on commands and usage\n");	/* same stream as the error itself */
+				exit(1);
+			}
+		}
+
+		if (opt->opt_value_str && (opt->opt_flags & OPT_INTEGER))
+			opt->opt_value_int = atoi(opt->opt_value_str);
+
+		if (opt->opt_id == OPT_HELP) {
+			print_help();
+			print_stat_key();
+			print_stat_info();
+			exit(1);
+		}
+
+		i++;
+	}
+}
+
+#ifdef DEBUG_INTERRUPT
+void interrupt_pbxt(MYSQL *conn)	/* Debug aid: run "show engine pbxt status" and discard the result. */
+{
+	MYSQL_RES *res;
+
+	if (mysql_query(conn, "show engine pbxt status")) {
+		fprintf(stderr, "%s\n", mysql_error(conn));
+		exit(1);
+	}
+
+	res = mysql_use_result(conn);	/* the result set is only fetched so that it can be released */
+	mysql_free_result(res);
+}
+#endif
+
+static bool display_parameters(MYSQL *conn)	/* Print the pbxt_% server variables; false if the query fails. */
+{
+	MYSQL_RES		*res;
+	MYSQL_ROW		row;
+
+	/* send SQL query */
+	if (mysql_query(conn, "show variables like 'pbxt_%'"))
+		return false;
+
+	if (!(res = mysql_use_result(conn)))
+		return false;
+
+	/* Print every variable and remember the three cache sizes: */
+	printf("-- PBXT System Variables --\n");
+	while ((row = mysql_fetch_row(res)) != NULL) {
+		if (strcmp(row[0], "pbxt_index_cache_size") == 0)
+			index_cache_size = xt_byte_size_to_int8(row[1]);
+		else if (strcmp(row[0], "pbxt_record_cache_size") == 0)
+			record_cache_size = xt_byte_size_to_int8(row[1]);
+		else if (strcmp(row[0], "pbxt_log_cache_size") == 0)
+			log_cache_size = xt_byte_size_to_int8(row[1]);
+		printf("%-29s= %s\n", row[0], row[1]);
+	}
+
+	mysql_free_result(res);
+
+	for (int i=0; i<XT_STAT_CURRENT_MAX; i++)	/* reset the baseline used for accumulative statistics */
+		accumulative_values[i] = 0;
+
+	printf("Display options: %s\n", options[OPT_DISPLAY].opt_value_str);
+	return true;
+}
+
+static bool connect(MYSQL *conn)	/* Apply the protocol/database options and connect; false on any failure. */
+{
+	unsigned int	type;
+
+	if (strcasecmp(options[OPT_PROTOCOL].opt_value_str, "tcp") == 0)
+		type = MYSQL_PROTOCOL_TCP;
+	else if (strcasecmp(options[OPT_PROTOCOL].opt_value_str, "socket") == 0)
+		type = MYSQL_PROTOCOL_SOCKET;
+	else if (strcasecmp(options[OPT_PROTOCOL].opt_value_str, "pipe") == 0)
+		type = MYSQL_PROTOCOL_PIPE;
+	else if (strcasecmp(options[OPT_PROTOCOL].opt_value_str, "memory") == 0)
+		type = MYSQL_PROTOCOL_MEMORY;
+	else
+		type = MYSQL_PROTOCOL_DEFAULT;	/* any other protocol name */
+
+	if (mysql_options(conn, MYSQL_OPT_PROTOCOL, (char *) &type))
+		return false;
+
+	if (mysql_options(conn, MYSQL_READ_DEFAULT_GROUP, "xtstat"))
+		return false;
+
+	if (strcasecmp(options[OPT_DATABASE].opt_value_str, "pbxt") == 0)
+		use_i_s = FALSE;	/* main() then selects from pbxt.statistics */
+	else if (strcasecmp(options[OPT_DATABASE].opt_value_str, "information_schema") == 0)
+		use_i_s = TRUE;
+	else
+		use_i_s = TRUE;	/* NOTE(review): same result as the branch above; every name except "pbxt" selects information_schema */
+
+	/* Connect to database */
+	if (!mysql_real_connect(conn,
+			options[OPT_HOST].opt_value_str,
+			options[OPT_USER].opt_value_str,
+			options[OPT_PASSWORD].opt_value_str,
+			options[OPT_DATABASE].opt_value_str,
+			options[OPT_PORT].opt_value_int,
+			options[OPT_SOCKET].opt_value_str,
+			0))
+		return false;
+
+	return true;
+}
+
+int main(int argc, char **argv)	/* Poll the PBXT statistics table forever and print one line per sample. */
+{
+	MYSQL				*conn;
+	MYSQL_RES			*res;
+	MYSQL_ROW			row;
+	llong				current_values[XT_STAT_CURRENT_MAX] = { 0 };	/* zeroed: the server may not return every statistic */
+	double				value;
+	char				str_value[100];
+	XTStatMetaDataPtr	meta;
+	int					len;
+	int					stat;
+	int					err;
+	bool				select_worked = true;
+
+	xt_set_time_unit("msec");
+	parse_args(argc, argv);
+
+	determine_display_order();
+
+	if (!(conn = mysql_init(NULL))) {
+		fprintf(stderr, "Insufficient memory\n");
+		exit(1);
+	}
+
+	if (!connect(conn) || !display_parameters(conn)) {
+		fprintf(stderr, "%s\n", mysql_error(conn));
+		exit(1);
+	}
+
+	retry:
+	for (int loop = 0; ; loop++) {
+		if (use_i_s)
+			err = mysql_query(conn, "select id, Value from information_schema.pbxt_statistics order by ID");
+		else
+			err = mysql_query(conn, "select id, Value from pbxt.statistics order by ID");
+		if (err)
+			goto reconnect;
+
+		if (!(res = mysql_use_result(conn)))
+			goto reconnect;
+		select_worked = true;
+
+		while ((row = mysql_fetch_row(res)) != NULL) {
+			if (row[1] && (stat = atoi(row[0])-1) >= 0 && stat < XT_STAT_CURRENT_MAX)	/* IDs are 1-based; ignore IDs this build does not know */
+				current_values[stat] = atoll(row[1]);
+		}
+		mysql_free_result(res);
+
+#ifdef DEBUG_INTERRUPT
+		if (current_values[XT_STAT_STAT_WRITES] - accumulative_values[XT_STAT_STAT_WRITES] == 0 &&
+			current_values[XT_STAT_REC_SYNC_TIME] - accumulative_values[XT_STAT_REC_SYNC_TIME] == 0 &&
+			current_values[XT_STAT_IND_SYNC_TIME] - accumulative_values[XT_STAT_IND_SYNC_TIME] == 0)
+			interrupt_pbxt(conn);
+#endif
+
+		if ((loop % 25) == 0) {	/* repeat the two-line column heading every 25 samples */
+			for (int column=0; column<columns_used; column++) {
+				len = 5;
+				meta = xt_get_stat_meta_data(display_order[column].do_statistic);
+				strcpy(str_value, meta->sm_short_line_1);
+				if (display_order[column].do_combo) {
+					/* Combine next 2 fields: */
+					len = 8;
+					column++;
+				}
+				else if (meta->sm_flags & XT_STAT_PERCENTAGE)
+					len = 4;
+				else if (meta->sm_flags & XT_STAT_DATE)
+					len = 15;
+				printf("%*s ", len, str_value);
+			}
+			printf("\n");
+			for (int column=0; column<columns_used; column++) {
+				len = 5;
+				meta = xt_get_stat_meta_data(display_order[column].do_statistic);
+				strcpy(str_value, meta->sm_short_line_2);
+				if (display_order[column].do_combo) {
+					/* Combine next 2 fields: */
+					len = 8;
+					column++;
+					strcat(str_value, "/ms");
+				}
+				else if (meta->sm_flags & XT_STAT_PERCENTAGE)
+					len = 4;
+				else if (meta->sm_flags & XT_STAT_DATE)
+					len = 15;
+				printf("%*s ", len, str_value);
+			}
+			printf("\n");
+		}
+
+		for (int column=0; column<columns_used; column++) {
+			len = 5;
+			stat = display_order[column].do_statistic;
+			meta = xt_get_stat_meta_data(stat);
+			if (meta->sm_flags & XT_STAT_ACCUMULATIVE) {
+				/* Take care of overflow! */
+				if (current_values[stat] < accumulative_values[stat])
+					value = (double) (0xFFFFFFFF - (accumulative_values[stat] - current_values[stat]));
+				else
+					value = (double) (current_values[stat] - accumulative_values[stat]);
+			}
+			else
+				value = (double) current_values[stat];
+			accumulative_values[stat] = current_values[stat];	/* becomes the baseline for the next sample */
+			if (meta->sm_flags & XT_STAT_TIME_VALUE)
+				value = value / (double) 1000;
+			if (display_order[column].do_combo) {
+				format_mini_count_value(str_value, value);
+				strcat(str_value, "/");
+				column++;	/* the second half of the combo is the next display column */
+				stat = display_order[column].do_statistic;
+				value = (double) (current_values[stat] - accumulative_values[stat]);
+				accumulative_values[stat] = current_values[stat];
+				value = value / (double) 1000;
+				format_count_value(&str_value[strlen(str_value)], value);
+				len = 8;
+			}
+			else if (meta->sm_flags & XT_STAT_PERCENTAGE) {
+				double perc = 100;
+				switch (stat) {
+					case XT_STAT_REC_CACHE_USAGE:	perc = (double)record_cache_size; break;
+					case XT_STAT_IND_CACHE_USAGE:	perc = (double)index_cache_size; break;
+					case XT_STAT_XLOG_CACHE_USAGE:	perc = (double)log_cache_size; break;
+				}
+				format_percent_value(str_value, value, perc);
+				len = 4;
+			}
+			else if (meta->sm_flags & XT_STAT_DATE) {
+				time_t ticks = (time_t) value;
+				const struct tm *ltime = localtime(&ticks);
+				strftime(str_value, 99, "%y%m%d %H:%M:%S", ltime);
+				len = 15;
+			}
+			else if (meta->sm_flags & XT_STAT_BYTE_COUNT)
+				format_byte_value(str_value, value);
+			else
+				format_count_value(str_value, value);
+			if (column == columns_used-1)
+				printf("%*s\n", len, str_value);
+			else
+				printf("%*s ", len, str_value);
+		}
+
+		sleep(options[OPT_DELAY].opt_value_int);
+	}
+
+	/* close connection (not reached: the loop above only leaves via goto reconnect) */
+	mysql_close(conn);
+	return 0;
+
+	reconnect:
+	/* Reconnect... */
+	if (select_worked) {
+		/* Only print message if the SELECT worked.
+		 * or we will get a screen full of messages:
+		 */
+		fprintf(stderr, "%s\n", mysql_error(conn));
+		printf("Reconnecting...\n");
+	}
+	mysql_close(conn);
+	if (!(conn = mysql_init(NULL))) {
+		fprintf(stderr, "Insufficient memory\n");
+		exit(1);
+	}
+	do {
+		sleep(2);
+	} while (!connect(conn));
+	select_worked = false;
+	goto retry;
+}
diff --git a/storage/pbxt/plug.in b/storage/pbxt/plug.in
new file mode 100644
index 00000000000..02c5d8adcbe
--- /dev/null
+++ b/storage/pbxt/plug.in
@@ -0,0 +1,8 @@
+MYSQL_STORAGE_ENGINE(pbxt,no,  [PBXT Storage Engine],
+        [MVCC-based transactional engine], [max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(pbxt, [storage/pbxt])
+MYSQL_PLUGIN_STATIC(pbxt,    [src/libpbxt.a])
+MYSQL_PLUGIN_ACTIONS(pbxt,  [
+#               AC_CONFIG_FILES(storage/pbxt/src/Makefile)
+               ])
+MYSQL_PLUGIN_DEPENDS_ON_MYSQL_INTERNALS(pbxt, [[src/ha_pbxt.cc],[src/myxt_xt.cc],[src/discover_xt.cc]])
diff --git a/storage/pbxt/src/Makefile.am b/storage/pbxt/src/Makefile.am
new file mode 100644
index 00000000000..fc4c4ef8f1e
--- /dev/null
+++ b/storage/pbxt/src/Makefile.am
@@ -0,0 +1,50 @@
+# Used to build Makefile.in
+
+MYSQLDATAdir =          $(localstatedir)
+MYSQLSHAREdir =         $(pkgdatadir)
+MYSQLBASEdir=           $(prefix)
+MYSQLLIBdir=            $(pkglibdir)
+pkgplugindir =          $(pkglibdir)/plugin
+INCLUDES=		-I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(top_srcdir)/storage/innobase/include \
+			-I$(top_srcdir)/sql \
+			-I$(srcdir)
+
+LIBS =
+
+LDADD =
+
+noinst_HEADERS =		bsearch_xt.h cache_xt.h ccutils_xt.h database_xt.h \
+						datadic_xt.h datalog_xt.h filesys_xt.h hashtab_xt.h \
+						ha_pbxt.h heap_xt.h index_xt.h linklist_xt.h \
+						memory_xt.h myxt_xt.h pthread_xt.h restart_xt.h \
+						pbms_enabled.h sortedlist_xt.h strutil_xt.h \
+						tabcache_xt.h table_xt.h trace_xt.h thread_xt.h \
+						util_xt.h xaction_xt.h xactlog_xt.h lock_xt.h \
+						systab_xt.h ha_xtsys.h discover_xt.h backup_xt.h \
+						pbms.h xt_config.h xt_defs.h xt_errno.h locklist_xt.h
+EXTRA_LTLIBRARIES =	libpbxt.la
+
+libpbxt_la_SOURCES =	bsearch_xt.cc cache_xt.cc ccutils_xt.cc database_xt.cc \
+						datadic_xt.cc datalog_xt.cc filesys_xt.cc hashtab_xt.cc \
+						ha_pbxt.cc heap_xt.cc index_xt.cc linklist_xt.cc \
+						memory_xt.cc myxt_xt.cc pthread_xt.cc restart_xt.cc \
+						sortedlist_xt.cc strutil_xt.cc \
+						tabcache_xt.cc table_xt.cc trace_xt.cc thread_xt.cc \
+						systab_xt.cc ha_xtsys.cc discover_xt.cc backup_xt.cc \
+						util_xt.cc xaction_xt.cc xactlog_xt.cc lock_xt.cc locklist_xt.cc
+
+libpbxt_la_LDFLAGS =	-module
+
+# These are the warnings Drizzle uses:
+# DRIZZLE_WARNINGS =		-W -Wall -Wextra -pedantic -Wundef -Wredundant-decls -Wno-strict-aliasing -Wno-long-long -Wno-unused-parameter
+
+libpbxt_la_CXXFLAGS =	$(AM_CXXFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+libpbxt_la_CFLAGS =		$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN -std=c99
+
+EXTRA_LIBRARIES =		libpbxt.a
+noinst_LIBRARIES = 		libpbxt.a
+libpbxt_a_SOURCES =		$(libpbxt_la_SOURCES)
+
+EXTRA_DIST =			pbms_enabled.cc win_inttypes.h
diff --git a/storage/pbxt/src/backup_xt.cc b/storage/pbxt/src/backup_xt.cc
new file mode 100644
index 00000000000..b9631f2cfd5
--- /dev/null
+++ b/storage/pbxt/src/backup_xt.cc
@@ -0,0 +1,802 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2009-09-07	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef MYSQL_SUPPORTS_BACKUP
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <ctype.h>
+
+#include "mysql_priv.h"
+#include <backup/api_types.h>
+#include <backup/backup_engine.h>
+#include <backup/backup_aux.h>         // for build_table_list()
+#include <hash.h>
+
+#include "ha_pbxt.h"
+
+#include "backup_xt.h"
+#include "pthread_xt.h"
+#include "filesys_xt.h"
+#include "database_xt.h"
+#include "strutil_xt.h"
+#include "memory_xt.h"
+#include "trace_xt.h"
+#include "myxt_xt.h"
+
+#ifdef OK
+#undef OK
+#endif
+
+#ifdef byte
+#undef byte
+#endif
+
+#ifdef DEBUG
+//#define TRACE_BACKUP_CALLS
+//#define TEST_SMALL_BLOCK			100000
+#endif
+
+using backup::byte;
+using backup::result_t;
+using backup::version_t;
+using backup::Table_list;
+using backup::Table_ref;
+using backup::Buffer;
+
+#ifdef TRACE_BACKUP_CALLS
+#define XT_TRACE_CALL()				ha_trace_function(__FUNC__, NULL)
+#else
+#define XT_TRACE_CALL()
+#endif
+
+#define XT_RESTORE_BATCH_SIZE		10000
+
+#define BUP_STATE_BEFORE_LOCK		0
+#define BUP_STATE_AFTER_LOCK		1
+
+#define BUP_STANDARD_VAR_RECORD		1
+#define BUP_RECORD_BLOCK_4_START	2			// Part of a record, with a 4 byte total length, and 4 byte data length
+#define BUP_RECORD_BLOCK_4			3			// Part of a record, with a 4 byte length
+#define BUP_RECORD_BLOCK_4_END		4			// Last part of a record with a 4 byte length
+
+/*
+ * -----------------------------------------------------------------------
+ * UTILITIES
+ */
+
+#ifdef TRACE_BACKUP_CALLS
+static void ha_trace_function(const char *function, char *table)	/* Print "<thread> <function> [(table)]" for call tracing. */
+{
+	char		func_buf[50], *ptr;
+	XTThreadPtr	thread = xt_get_self(); 
+	
+	if ((ptr = strchr(function, '('))) {
+		ptr--;	/* walk back from '(' over the identifier characters */
+		while (ptr > function) {
+			if (!(isalnum(*ptr) || *ptr == '_'))
+				break;
+			ptr--;
+		}
+		ptr++;
+		xt_strcpy(50, func_buf, ptr);
+		if ((ptr = strchr(func_buf, '(')))	/* cut the copy at the parameter list */
+			*ptr = 0;
+	}
+	else
+		xt_strcpy(50, func_buf, function);
+	if (table)
+		printf("%s %s (%s)\n", thread ? thread->t_name : "-unknown-", func_buf, table);
+	else
+		printf("%s %s\n", thread ? thread->t_name : "-unknown-", func_buf);
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * BACKUP DRIVER
+ */
+
+class PBXTBackupDriver: public Backup_driver	/* Backup driver that streams PBXT table rows via get_data(). */
+{
+	public:
+	PBXTBackupDriver(const Table_list &);
+	virtual ~PBXTBackupDriver();
+
+	virtual size_t		size();
+	virtual size_t		init_size();
+	virtual result_t	begin(const size_t);
+	virtual result_t	end();
+	virtual result_t	get_data(Buffer &);
+	virtual result_t	prelock();
+	virtual result_t	lock();
+	virtual result_t	unlock();
+	virtual result_t	cancel();
+	virtual void		free();
+	void				lock_tables_TL_READ_NO_INSERT();
+
+	private:
+	XTThreadPtr		bd_thread;		/* set by begin() */
+	int				bd_state;		/* BUP_STATE_BEFORE_LOCK or BUP_STATE_AFTER_LOCK */
+	u_int			bd_table_no;	/* number of tables opened so far */
+	XTOpenTablePtr	bd_ot;			/* table currently being scanned, or NULL */
+	xtWord1			*bd_row_buf;	/* row buffer filled by the sequential scan */
+
+	/* Write one block (type byte, optional lengths, row data) to the
+	 * output buffer; the second form writes part of a split row.
+	 */
+	xtWord1			*db_write_block(xtWord1 *buffer, xtWord1 bup_type, size_t *size, xtWord4 row_len);
+	xtWord1			*db_write_block(xtWord1 *buffer, xtWord1 bup_type, size_t *size, xtWord4 total_len, xtWord4 row_len);
+
+	xtWord4			bd_row_offset;	/* bytes of the pending row already written */
+	xtWord4			bd_row_size;	/* bytes of the pending row still to write; non-zero if we last returned only part of a row */
+};
+
+
+PBXTBackupDriver::PBXTBackupDriver(const Table_list &tables):
+Backup_driver(tables),
+bd_thread(NULL),	/* assigned by begin(); NULL until then so free() can test it */
+bd_state(BUP_STATE_BEFORE_LOCK),
+bd_table_no(0),
+bd_ot(NULL),
+bd_row_buf(NULL),
+bd_row_offset(0), bd_row_size(0)
+{
+}
+
+PBXTBackupDriver::~PBXTBackupDriver()	/* Nothing to do here: resources are released in free(). */
+{
+}
+
+/** Estimates total size of backup; currently always UNKNOWN_SIZE. @todo improve it */
+size_t PBXTBackupDriver::size()
+{
+	XT_TRACE_CALL();
+	return UNKNOWN_SIZE;
+}
+
+/** Estimates size of backup before lock; currently always 0. @todo improve it */
+size_t PBXTBackupDriver::init_size()
+{
+	XT_TRACE_CALL();
+	return 0;
+}
+
+result_t PBXTBackupDriver::begin(const size_t)	/* Bind the driver to the current THD's PBXT thread. */
+{
+	THD				*thd = current_thd;
+	XTExceptionRec	e;
+
+	XT_TRACE_CALL();
+	
+	if (!(bd_thread = xt_ha_set_current_thread(thd, &e))) {	/* note: bd_thread is NULL after a failure */
+		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+		return backup::ERROR;
+	}
+	
+	return backup::OK;
+}
+
+result_t PBXTBackupDriver::end()	/* Release the open table (if any) and commit the backup transaction. */
+{
+	XT_TRACE_CALL();
+	if (bd_ot) {
+		xt_tab_seq_exit(bd_ot);
+		xt_db_return_table_to_pool_ns(bd_ot);
+		bd_ot = NULL;
+	}
+	if (bd_thread->st_xact_data) {	/* a transaction is still open (started in lock()) */
+		if (!xt_xn_commit(bd_thread))
+			return backup::ERROR;
+	}
+	return backup::OK;
+}
+
+xtWord1 *PBXTBackupDriver::db_write_block(xtWord1 *buffer, xtWord1 bup_type, size_t *ret_size, xtWord4 row_len)
+{
+	register size_t size = *ret_size;	/* space left in the output buffer */
+
+	*buffer = bup_type;	// Record type identifier.
+	buffer++;
+	size--;
+	memcpy(buffer, bd_ot->ot_row_wbuffer, row_len);	/* whole row: type byte + row_len bytes in total */
+	buffer += row_len;
+	size -= row_len;
+	*ret_size = size;	/* report the remaining space to the caller */
+	return buffer;	/* first byte after the block just written */
+}
+
+xtWord1 *PBXTBackupDriver::db_write_block(xtWord1 *buffer, xtWord1 bup_type, size_t *ret_size, xtWord4 total_len, xtWord4 row_len)
+{
+	register size_t size = *ret_size;	/* space left in the output buffer */
+
+	*buffer = bup_type;	// Record type identifier.
+	buffer++;
+	size--;
+	if (bup_type == BUP_RECORD_BLOCK_4_START) {	/* only the first part carries the total row length */
+		XT_SET_DISK_4(buffer, total_len);
+		buffer += 4;
+		size -= 4;
+	}
+	XT_SET_DISK_4(buffer, row_len);	/* length of the data in this part */
+	buffer += 4;
+	size -= 4;
+	memcpy(buffer, bd_ot->ot_row_wbuffer+bd_row_offset, row_len);
+	buffer += row_len;
+	size -= row_len;
+	bd_row_size -= row_len;		/* bytes of the row still to be written */
+	bd_row_offset += row_len;	/* where the next part starts */
+	*ret_size = size;
+	return buffer;
+}
+
+result_t PBXTBackupDriver::get_data(Buffer &buf)	/* Fill buf with as many rows of the current table as fit. */
+{
+	xtBool	eof = FALSE;
+	size_t	size;
+	xtWord4	row_len;
+	xtWord1	*buffer;
+
+	XT_TRACE_CALL();
+
+	if (bd_state == BUP_STATE_BEFORE_LOCK) {	/* no data is produced before lock() */
+		buf.table_num = 0;
+		buf.size = 0;
+		buf.last = FALSE;
+		return backup::READY;
+	}
+
+	/* Open the backup table: */
+	if (!bd_ot) {
+		XTThreadPtr		self = bd_thread;
+		XTTableHPtr		tab;
+		char			path[PATH_MAX];
+	
+		if (bd_table_no == m_tables.count()) {	/* all tables have been sent */
+			buf.size = 0;
+			buf.table_num = 0;
+			buf.last = TRUE;
+			return backup::DONE;
+		}
+		
+		m_tables[bd_table_no].internal_name(path, sizeof(path));
+		bd_table_no++;	/* from here on bd_table_no is the 1-based number of the open table */
+		try_(a)	{
+			xt_ha_open_database_of_table(self, (XTPathStrPtr) path);
+			tab = xt_use_table(self, (XTPathStrPtr) path, FALSE, FALSE);
+			pushr_(xt_heap_release, tab);
+			if (!(bd_ot = xt_db_open_table_using_tab(tab, bd_thread)))
+				xt_throw(self);
+			freer_(); // xt_heap_release(tab)
+
+			/* Prepare the sequential scan: */
+			xt_tab_seq_exit(bd_ot);
+			if (!xt_tab_seq_init(bd_ot))
+				xt_throw(self);
+			
+			if (bd_row_buf) {
+				xt_free(self, bd_row_buf);
+				bd_row_buf = NULL;
+			}
+			bd_row_buf = (xtWord1 *) xt_malloc(self, bd_ot->ot_table->tab_dic.dic_mysql_buf_size);
+			bd_ot->ot_cols_req = bd_ot->ot_table->tab_dic.dic_no_of_cols;
+		}
+		catch_(a) {
+			;
+		}
+		cont_(a);
+
+		if (!bd_ot)	/* NOTE(review): a throw after bd_ot was assigned is not detected here - confirm */
+			goto failed;
+	}
+
+	buf.table_num = bd_table_no;
+#ifdef TEST_SMALL_BLOCK
+	buf.size = TEST_SMALL_BLOCK;
+#endif
+	size = buf.size;
+	buffer = (xtWord1 *) buf.data;
+	ASSERT_NS(size > 9);	/* a START block has a 9 byte header: type + total length + part length */
+
+	/* First check if a record was partially written
+	 * last time.
+	 */
+	write_row:
+	if (bd_row_size > 0) {
+		row_len = bd_row_size;
+		if (bd_row_offset == 0) {	/* nothing of this row has been written yet */
+			if (row_len+1 > size) {
+				ASSERT_NS(size > 9);
+				row_len = size - 9;
+				buffer = db_write_block(buffer, BUP_RECORD_BLOCK_4_START, &size, bd_row_size, row_len);
+				goto done;
+			}
+			buffer = db_write_block(buffer, BUP_STANDARD_VAR_RECORD, &size, row_len);
+			bd_row_size = 0;
+		}
+		else {
+			if (row_len+5 > size) {	/* the rest still does not fit: write a middle part */
+				row_len = size - 5;
+				buffer = db_write_block(buffer, BUP_RECORD_BLOCK_4, &size, 0, row_len);
+				goto done;
+			}
+			buffer = db_write_block(buffer, BUP_RECORD_BLOCK_4_END, &size, 0, row_len);
+		}
+	}
+
+	/* Now continue with the sequential scan. */
+	while (size > 1) {
+		if (!xt_tab_seq_next(bd_ot, bd_row_buf, &eof))
+			goto failed;
+		if (eof) {
+			/* We will go to the next table on the next call. */
+			xt_tab_seq_exit(bd_ot);
+			xt_db_return_table_to_pool_ns(bd_ot);
+			bd_ot = NULL;
+			break;
+		}
+		if (!(row_len = myxt_store_row_data(bd_ot, 0, (char *) bd_row_buf)))
+			goto failed;
+		if (row_len+1 > size) {
+			/* Does not fit: */
+			bd_row_offset = 0;
+			bd_row_size = row_len;
+			/* Only add part of the row, if there is still
+			 * quite a bit of space left:
+			 */
+			if (size >= (32 * 1024))
+				goto write_row;
+			break;
+		}
+		buffer = db_write_block(buffer, BUP_STANDARD_VAR_RECORD, &size, row_len);
+	}
+
+	done:
+	buf.size = buf.size - size;	/* number of bytes actually written */
+	/* This indicates end of data for a table! */
+    buf.last = eof;
+
+	return backup::OK;
+
+	failed:
+	xt_log_and_clear_exception(bd_thread);
+	return backup::ERROR;
+}
+
+result_t PBXTBackupDriver::prelock()	/* No preparation is done here; always reports READY. */
+{
+	XT_TRACE_CALL();
+	return backup::READY;
+}
+
+result_t PBXTBackupDriver::lock()	/* Reset the thread's transaction flags and begin the backup transaction. */
+{
+	XT_TRACE_CALL();
+	bd_thread->st_xact_mode = XT_XACT_COMMITTED_READ;
+	bd_thread->st_ignore_fkeys = FALSE;
+	bd_thread->st_auto_commit = FALSE;
+	bd_thread->st_table_trans = FALSE;
+	bd_thread->st_abort_trans = FALSE;
+	bd_thread->st_stat_ended = FALSE;
+	bd_thread->st_stat_trans = FALSE;
+	bd_thread->st_is_update = NULL;
+	if (!xt_xn_begin(bd_thread))
+		return backup::ERROR;
+	bd_state = BUP_STATE_AFTER_LOCK;	/* from now on get_data() returns rows */
+	return backup::OK;
+}
+
+result_t PBXTBackupDriver::unlock()	/* Nothing to do: the transaction begun in lock() stays open. */
+{
+	XT_TRACE_CALL();
+	return backup::OK;
+}
+
+result_t PBXTBackupDriver::cancel()	/* Does no cleanup itself. */
+{
+	XT_TRACE_CALL();
+	return backup::OK; // free() will be called and suffice
+}
+
+void PBXTBackupDriver::free()	/* Release all resources, roll back any open transaction and delete the driver. */
+{
+	XT_TRACE_CALL();
+	if (bd_ot) {
+		xt_tab_seq_exit(bd_ot);
+		xt_db_return_table_to_pool_ns(bd_ot);
+		bd_ot = NULL;
+	}
+	if (bd_row_buf) {
+		xt_free_ns(bd_row_buf);
+		bd_row_buf = NULL;
+	}
+	if (bd_thread && bd_thread->st_xact_data)	/* bd_thread is NULL if begin() failed */
+		xt_xn_rollback(bd_thread);
+	delete this;
+}
+
+void PBXTBackupDriver::lock_tables_TL_READ_NO_INSERT()	/* Intentionally empty apart from call tracing. */
+{
+	XT_TRACE_CALL();
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * RESTORE DRIVER
+ */
+
+class PBXTRestoreDriver: public Restore_driver	/* Restore driver that inserts the rows received by send_data(). */
+{
+	public:
+	PBXTRestoreDriver(const Table_list &tables);
+	virtual ~PBXTRestoreDriver();
+
+	virtual result_t  begin(const size_t);
+	virtual result_t  end();
+	virtual result_t  send_data(Buffer &buf);
+	virtual result_t  cancel();
+	virtual void      free();
+	
+	private:
+	XTThreadPtr		rd_thread;			/* set by begin() */
+	u_int			rd_table_no;		/* 1-based number of the table being restored, 0 = none yet */
+	XTOpenTablePtr	rd_ot;				/* table currently being restored, or NULL */
+	STRUCT_TABLE	*rd_my_table;		/* MySQL table structure of rd_ot */
+	xtWord1			*rb_row_buf;		/* points at rd_my_table->record[0]; not owned */
+	u_int			rb_col_cnt;			/* number of columns of the table */
+	u_int			rb_insert_count;	/* rows inserted since the last commit */
+
+	/* Long rows are accumulated here: */
+	xtWord4			rb_row_len;			/* bytes accumulated so far */
+	xtWord4			rb_data_size;		/* allocated size of rb_row_data */
+	xtWord1			*rb_row_data;
+};
+
+PBXTRestoreDriver::PBXTRestoreDriver(const Table_list &tables):
+Restore_driver(tables),
+rd_thread(NULL),	/* assigned by begin() */
+rd_table_no(0),		/* no table opened yet */
+rd_ot(NULL),
+rb_row_buf(NULL),
+rb_row_len(0),
+rb_data_size(0),
+rb_row_data(NULL)	/* allocated on demand by send_data() */
+{
+}
+
+PBXTRestoreDriver::~PBXTRestoreDriver()	/* Nothing to do here: resources are released in end() and free(). */
+{
+}
+
+result_t PBXTRestoreDriver::begin(const size_t)	/* Bind the driver to the current THD's PBXT thread. */
+{
+	THD				*thd = current_thd;
+	XTExceptionRec	e;
+	
+	XT_TRACE_CALL();
+	
+	if (!(rd_thread = xt_ha_set_current_thread(thd, &e))) {	/* note: rd_thread is NULL after a failure */
+		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+		return backup::ERROR;
+	}
+	
+	return backup::OK;
+}
+
+result_t PBXTRestoreDriver::end()	/* Release the open table and long-row buffer, then commit the last batch. */
+{
+	XT_TRACE_CALL();
+	if (rd_ot) {
+		xt_db_return_table_to_pool_ns(rd_ot);
+		rd_ot = NULL;
+	}
+	//if (rb_row_buf) {
+	//	xt_free_ns(rb_row_buf);
+	//	rb_row_buf = NULL;
+	//}
+	if (rb_row_data) {
+		xt_free_ns(rb_row_data);
+		rb_row_data = NULL;
+	}
+	if (rd_thread->st_xact_data) {	/* a transaction is still open */
+		if (!xt_xn_commit(rd_thread))
+			return backup::ERROR;
+	}
+	return backup::OK;
+}
+
+
+result_t PBXTRestoreDriver::send_data(Buffer &buf)	/* Insert every row contained in buf into the table buf.table_num. */
+{
+	size_t	size;
+	xtWord1	type;
+	xtWord1	*buffer;
+	xtWord4	row_len;
+	xtWord1 *rec_data;
+
+	XT_TRACE_CALL();
+
+	if (buf.table_num != rd_table_no) {	/* data for a different table: commit and switch */
+		XTThreadPtr		self = rd_thread;
+		XTTableHPtr		tab;
+		char			path[PATH_MAX];
+		
+		if (rd_ot) {
+			xt_db_return_table_to_pool_ns(rd_ot);
+			rd_ot = NULL;
+		}
+
+		if (rd_thread->st_xact_data) {
+			if (!xt_xn_commit(rd_thread))
+				goto failed;
+		}
+		if (!xt_xn_begin(rd_thread))
+			goto failed;
+		rb_insert_count = 0;
+		
+		rd_table_no = buf.table_num;
+		m_tables[rd_table_no-1].internal_name(path, sizeof(path));	/* table numbers are 1-based */
+		try_(a)	{
+			xt_ha_open_database_of_table(self, (XTPathStrPtr) path);
+			tab = xt_use_table(self, (XTPathStrPtr) path, FALSE, FALSE);
+			pushr_(xt_heap_release, tab);
+			if (!(rd_ot = xt_db_open_table_using_tab(tab, rd_thread)))
+				xt_throw(self);
+			freer_(); // xt_heap_release(tab)
+
+			rd_my_table = rd_ot->ot_table->tab_dic.dic_my_table;
+			if (rd_my_table->found_next_number_field) {
+				rd_my_table->in_use = current_thd;
+				rd_my_table->next_number_field = rd_my_table->found_next_number_field;
+				rd_my_table->mark_columns_used_by_index_no_reset(rd_my_table->s->next_number_index, rd_my_table->read_set);
+			}
+
+			/* This is safe because only one thread can restore a table at 
+			 * a time!
+			 */
+			rb_row_buf = (xtWord1 *) rd_my_table->record[0];
+			//if (rb_row_buf) {
+			//	xt_free(self, rb_row_buf);
+			//	rb_row_buf = NULL;
+			//}
+			//rb_row_buf = (xtWord1 *) xt_malloc(self, rd_ot->ot_table->tab_dic.dic_mysql_buf_size);
+	
+			rb_col_cnt = rd_ot->ot_table->tab_dic.dic_no_of_cols;
+
+		}
+		catch_(a) {
+			;
+		}
+		cont_(a);
+		
+		if (!rd_ot)
+			goto failed;
+	}
+
+	buffer = (xtWord1 *) buf.data;
+	size = buf.size;
+
+	while (size > 0) {
+		type = *buffer;	/* every block starts with its type byte */
+		switch (type) {
+			case BUP_STANDARD_VAR_RECORD:
+				rec_data = buffer + 1;
+				break;
+			case BUP_RECORD_BLOCK_4_START:
+				buffer++;
+				row_len = XT_GET_DISK_4(buffer);	/* total length of the split row */
+				buffer += 4;
+				if (rb_data_size < row_len) {
+					if (!xt_realloc_ns((void **) &rb_row_data, row_len))
+						goto failed;
+					rb_data_size = row_len;
+				}
+				row_len = XT_GET_DISK_4(buffer);	/* length of this first part */
+				buffer += 4;
+				ASSERT_NS(row_len <= rb_data_size);
+				if (row_len > rb_data_size) {
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				if (row_len + 9 > size) {	/* validate before copying out of buf */
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				memcpy(rb_row_data, buffer, row_len);
+				rb_row_len = row_len;
+				buffer += row_len;
+				size -= row_len + 9;
+				continue;
+			case BUP_RECORD_BLOCK_4:
+				buffer++;
+				row_len = XT_GET_DISK_4(buffer);
+				buffer += 4;
+				ASSERT_NS(rb_row_len + row_len <= rb_data_size);
+				if (rb_row_len + row_len > rb_data_size) {
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				if (row_len + 5 > size) {	/* validate before copying out of buf */
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				memcpy(rb_row_data + rb_row_len, buffer, row_len);
+				rb_row_len += row_len;
+				buffer += row_len;
+				size -= row_len + 5;
+				continue;
+			case BUP_RECORD_BLOCK_4_END:
+				buffer++;
+				row_len = XT_GET_DISK_4(buffer);
+				buffer += 4;
+				ASSERT_NS(rb_row_len + row_len <= rb_data_size);
+				if (rb_row_len + row_len > rb_data_size) {
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				if (row_len + 5 > size) {	/* validate before copying out of buf */
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+					goto failed;
+				}
+				memcpy(rb_row_data + rb_row_len, buffer, row_len);
+				buffer += row_len;
+				size -= row_len + 5;
+				rec_data = rb_row_data;	/* the row is complete: load it from the accumulation buffer */
+				break;
+			default:
+				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+				goto failed;
+		}
+		
+		if (!(row_len = myxt_load_row_data(rd_ot, rec_data, rb_row_buf, rb_col_cnt)))
+			goto failed;
+
+		if (rd_ot->ot_table->tab_dic.dic_my_table->found_next_number_field)
+			ha_set_auto_increment(rd_ot, rd_ot->ot_table->tab_dic.dic_my_table->found_next_number_field);
+
+		if (!xt_tab_new_record(rd_ot, rb_row_buf))
+			goto failed;
+
+		if (type == BUP_STANDARD_VAR_RECORD) {	/* the split-row cases have already advanced buffer and size */
+			buffer += row_len+1;
+			if (row_len + 1 > size) {
+				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_BACKUP_FORMAT);
+				goto failed;
+			}
+			size -= row_len + 1;
+		}
+
+		rb_insert_count++;
+		if (rb_insert_count == XT_RESTORE_BATCH_SIZE) {	/* commit in batches */
+			if (!xt_xn_commit(rd_thread))
+				goto failed;
+			if (!xt_xn_begin(rd_thread))
+				goto failed;
+			rb_insert_count = 0;
+		}
+	}
+
+	return backup::OK;
+	
+	failed:
+	xt_log_and_clear_exception(rd_thread);
+	return backup::ERROR;
+}
+
+
+result_t PBXTRestoreDriver::cancel()	/* Does no cleanup itself. */
+{
+	XT_TRACE_CALL();
+	/* Nothing to do in cancel(); free() will suffice */
+	return backup::OK;
+}
+
+void PBXTRestoreDriver::free()	/* Release all resources, roll back any open transaction and delete the driver. */
+{
+	XT_TRACE_CALL();
+	if (rd_ot) {
+		xt_db_return_table_to_pool_ns(rd_ot);
+		rd_ot = NULL;
+	}
+	//if (rb_row_buf) {
+	//	xt_free_ns(rb_row_buf);
+	//	rb_row_buf = NULL;
+	//}
+	if (rb_row_data) {
+		xt_free_ns(rb_row_data);
+		rb_row_data = NULL;
+	}
+	if (rd_thread && rd_thread->st_xact_data)	/* rd_thread is NULL if begin() was not called or failed */
+		xt_xn_rollback(rd_thread);
+	delete this;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * BACKUP ENGINE FACTORY
+ */
+
+#define PBXT_BACKUP_VERSION 1
+
+
+class PBXTBackupEngine: public Backup_engine	/* Factory for the PBXT backup and restore drivers. */
+{
+	public:
+	PBXTBackupEngine() { };
+
+	virtual version_t version() const {
+		return PBXT_BACKUP_VERSION;	/* get_restore() rejects images with a higher version */
+	};
+
+	virtual result_t get_backup(const uint32, const Table_list &, Backup_driver* &);
+
+	virtual result_t get_restore(const version_t, const uint32, const Table_list &,Restore_driver* &);
+
+	virtual void free()
+	{
+		delete this;
+	}
+};
+
+result_t PBXTBackupEngine::get_backup(const u_int count, const Table_list &tables, Backup_driver* &drv)
+{
+	PBXTBackupDriver *ptr = new PBXTBackupDriver(tables);	/* count is not used */
+
+	if (!ptr)
+		return backup::ERROR;
+	drv = ptr;	/* the driver deletes itself in its free() */
+	return backup::OK;
+}
+
+result_t PBXTBackupEngine::get_restore(const version_t ver, const uint32,
+                             const Table_list &tables, Restore_driver* &drv)
+{
+	if (ver > PBXT_BACKUP_VERSION)	/* image written by a newer format version */
+	{
+		return backup::ERROR;    
+	}
+	
+	PBXTRestoreDriver *ptr = new PBXTRestoreDriver(tables);
+
+	if (!ptr)
+		return backup::ERROR;
+	drv = (Restore_driver *) ptr;	/* the driver deletes itself in its free() */
+	return backup::OK;
+}
+
+
+Backup_result_t pbxt_backup_engine(handlerton *self, Backup_engine* &be)	/* Create the PBXT backup engine object; self is not used. */
+{
+	be = new PBXTBackupEngine();
+	
+	if (!be)
+		return backup::ERROR;
+	
+	return backup::OK;	/* the engine deletes itself in its free() */
+}
+
+#endif
diff --git a/storage/pbxt/src/backup_xt.h b/storage/pbxt/src/backup_xt.h
new file mode 100644
index 00000000000..58171f6c31a
--- /dev/null
+++ b/storage/pbxt/src/backup_xt.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2009-09-07	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#ifndef __backup_xt_h__
+#define __backup_xt_h__
+
+#include "xt_defs.h"
+
+#ifdef MYSQL_SUPPORTS_BACKUP
+
+Backup_result_t pbxt_backup_engine(handlerton *self, Backup_engine* &be);	/* creates the PBXT backup engine and returns it in 'be' */
+
+#endif
+#endif
diff --git a/storage/pbxt/src/bsearch_xt.cc b/storage/pbxt/src/bsearch_xt.cc
new file mode 100644
index 00000000000..539de1ae74d
--- /dev/null
+++ b/storage/pbxt/src/bsearch_xt.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2004-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+
+#include "bsearch_xt.h"
+#include "pthread_xt.h"
+#include "thread_xt.h"
+
+/**
+ * Binary search an array of 'count' items, each 'size' bytes long,
+ * starting at 'base'. Items are compared with 'compar', which receives
+ * 'thread', 'thunk', 'key' and a pointer to the candidate item.
+ *
+ * On a match, a pointer to the item is returned and '*idx' is its index.
+ * Otherwise NULL is returned and '*idx' is the insert point of the key
+ * (0 <= *idx <= count).
+ * 'compar' may throw an exception; the error details are then stored in 'thread'.
+ */
+void *xt_bsearch(XTThreadPtr thread, const void *key, register const void *base, size_t count, size_t size, size_t *idx, const void *thunk, XTCompareFunc compar)
+{
+	register size_t		lo = 0;
+	register size_t		mid;
+	register char		*item;
+	register int		cmp;
+
+	while (lo < count) {
+		mid = (lo + count - 1) >> 1;
+		item = ((char *) base) + mid * size;
+		cmp = (compar)(thread, thunk, key, item);
+		if (cmp < 0)
+			count = mid;
+		else if (cmp > 0)
+			lo = mid + 1;
+		else {
+			*idx = mid;
+			return item;
+		}
+	}
+
+	*idx = lo;
+	return NULL;
+}
+
diff --git a/storage/pbxt/src/bsearch_xt.h b/storage/pbxt/src/bsearch_xt.h
new file mode 100644
index 00000000000..f15e28009fb
--- /dev/null
+++ b/storage/pbxt/src/bsearch_xt.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2004-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_bsearch_h__
+#define __xt_bsearch_h__
+
+#include "xt_defs.h"
+
+struct XTThread;
+
+void *xt_bsearch(struct XTThread *self, const void *key, register const void *base, size_t count, size_t size, size_t *idx, const void *thunk, XTCompareFunc compar);	/* binary search; '*idx' receives the match index or the insert point */
+
+#endif
diff --git a/storage/pbxt/src/cache_xt.cc b/storage/pbxt/src/cache_xt.cc
new file mode 100644
index 00000000000..24e42d9e984
--- /dev/null
+++ b/storage/pbxt/src/cache_xt.cc
@@ -0,0 +1,1679 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-05-24	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#ifndef XT_WIN
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <time.h>
+
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "filesys_xt.h"
+#include "cache_xt.h"
+#include "table_xt.h"
+#include "trace_xt.h"
+#include "util_xt.h"
+
+/* Ticks elapsed from 'start' to 'now' on an xtWord4 counter that may wrap around
+ * (xtWord4 is assumed to be a 32-bit unsigned type - TODO confirm in xt_defs.h).
+ * Unsigned subtraction is modular, so it is exact even when now < start. */
+#define XT_TIME_DIFF(start, now) ((xtWord4) ((xtWord4) (now) - (xtWord4) (start)))
+
+/*
+ * -----------------------------------------------------------------------
+ * D I S K   C A C H E
+ */
+
+#define IDX_CAC_SEGMENT_COUNT		((off_t) 1 << XT_INDEX_CACHE_SEGMENT_SHIFTS)
+#define IDX_CAC_SEGMENT_MASK		(IDX_CAC_SEGMENT_COUNT - 1)
+
+#ifdef XT_NO_ATOMICS	/* Select the cache segment lock: exactly one IDX_CAC_USE_* name is defined. */
+#define IDX_CAC_USE_PTHREAD_RW
+#else
+//#define IDX_CAC_USE_RWMUTEX
+//#define IDX_CAC_USE_PTHREAD_RW
+//#define IDX_CAC_USE_SPINXSLOCK
+#define IDX_CAC_USE_XSMUTEX
+#endif
+
+#ifdef IDX_CAC_USE_XSMUTEX	/* s = calling thread, i = cache segment, o = object whose t_id identifies the locker */
+#define IDX_CAC_LOCK_TYPE				XTXSMutexRec
+#define IDX_CAC_INIT_LOCK(s, i)			xt_xsmutex_init_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_xsmutex_free(s, &(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, o)			xt_xsmutex_slock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_WRITE_LOCK(i, o)		xt_xsmutex_xlock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_UNLOCK(i, o)			xt_xsmutex_unlock(&(i)->cs_lock, (o)->t_id)
+#elif defined(IDX_CAC_USE_PTHREAD_RW)	/* this variant ignores 's' when freeing and 'o' when locking */
+#define IDX_CAC_LOCK_TYPE				xt_rwlock_type
+#define IDX_CAC_INIT_LOCK(s, i)			xt_init_rwlock_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_free_rwlock(&(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, o)			xt_slock_rwlock_ns(&(i)->cs_lock)
+#define IDX_CAC_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(&(i)->cs_lock)
+#define IDX_CAC_UNLOCK(i, o)			xt_unlock_rwlock_ns(&(i)->cs_lock)
+#elif defined(IDX_CAC_USE_RWMUTEX)
+#define IDX_CAC_LOCK_TYPE				XTRWMutexRec
+#define IDX_CAC_INIT_LOCK(s, i)			xt_rwmutex_init_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_rwmutex_free(s, &(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, o)			xt_rwmutex_slock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_WRITE_LOCK(i, o)		xt_rwmutex_xlock(&(i)->cs_lock, (o)->t_id)
+#define IDX_CAC_UNLOCK(i, o)			xt_rwmutex_unlock(&(i)->cs_lock, (o)->t_id)
+#elif defined(IDX_CAC_USE_SPINXSLOCK)	/* here the second macro parameter is named 's' but plays the same role as 'o' above */
+#define IDX_CAC_LOCK_TYPE				XTSpinXSLockRec
+#define IDX_CAC_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, &(i)->cs_lock)
+#define IDX_CAC_FREE_LOCK(s, i)			xt_spinxslock_free(s, &(i)->cs_lock)	
+#define IDX_CAC_READ_LOCK(i, s)			xt_spinxslock_slock(&(i)->cs_lock, (s)->t_id)
+#define IDX_CAC_WRITE_LOCK(i, s)		xt_spinxslock_xlock(&(i)->cs_lock, FALSE, (s)->t_id)
+#define IDX_CAC_UNLOCK(i, s)			xt_spinxslock_unlock(&(i)->cs_lock, (s)->t_id)
+#endif
+
+#ifdef XT_NO_ATOMICS	/* Select the lock that protects each handle slot's lists. */
+#define ID_HANDLE_USE_PTHREAD_RW
+#else
+#define ID_HANDLE_USE_SPINLOCK
+//#define ID_HANDLE_USE_PTHREAD_RW
+#endif
+
+#if defined(ID_HANDLE_USE_PTHREAD_RW)	/* despite the name, this variant maps to a plain mutex (xt_mutex_type) */
+#define ID_HANDLE_LOCK_TYPE				xt_mutex_type
+#define ID_HANDLE_INIT_LOCK(s, i)		xt_init_mutex_with_autoname(s, i)
+#define ID_HANDLE_FREE_LOCK(s, i)		xt_free_mutex(i)	
+#define ID_HANDLE_LOCK(i)				xt_lock_mutex_ns(i)
+#define ID_HANDLE_UNLOCK(i)				xt_unlock_mutex_ns(i)
+#elif defined(ID_HANDLE_USE_SPINLOCK)
+#define ID_HANDLE_LOCK_TYPE				XTSpinLockRec
+#define ID_HANDLE_INIT_LOCK(s, i)		xt_spinlock_init_with_autoname(s, i)
+#define ID_HANDLE_FREE_LOCK(s, i)		xt_spinlock_free(s, i)	
+#define ID_HANDLE_LOCK(i)				xt_spinlock_lock(i)
+#define ID_HANDLE_UNLOCK(i)				xt_spinlock_unlock(i)
+#endif
+
+#define XT_HANDLE_SLOTS					37	/* number of handle slots; a handle's slot is (block address % XT_HANDLE_SLOTS) */
+
+/*
+#ifdef DEBUG
+#define XT_INIT_HANDLE_COUNT			0
+#define XT_INIT_HANDLE_BLOCKS			0
+#else
+#define XT_INIT_HANDLE_COUNT			40
+#define XT_INIT_HANDLE_BLOCKS			10
+#endif
+*/
+
+/* A disk cache segment. The cache is divided into a number of segments
+ * to improve concurrency.
+ */
+typedef struct DcSegment {
+	IDX_CAC_LOCK_TYPE	cs_lock;						/* The cache segment lock. */
+	XTIndBlockPtr		*cs_hash_table;	/* cg_hash_size bucket heads; the blocks of a bucket are chained through cb_next */
+} DcSegmentRec, *DcSegmentPtr;
+
+typedef struct DcHandleSlot {	/* One of XT_HANDLE_SLOTS independent sets of handle lists. */
+	ID_HANDLE_LOCK_TYPE	hs_handles_lock;	/* the "list lock": guards the three lists below and updates of cb_handle_count */
+	XTIndHandleBlockPtr	hs_free_blocks;	/* unused handle blocks (copies of index branches), chained through hb_next */
+	XTIndHandlePtr		hs_free_handles;	/* recycled handles, chained through ih_next */
+	XTIndHandlePtr		hs_used_handles;	/* handles currently held by readers, doubly linked through ih_next/ih_prev */
+} DcHandleSlotRec, *DcHandleSlotPtr;
+
+typedef struct DcGlobals {	/* All state of the index cache; a single static instance exists. */
+	xt_mutex_type		cg_lock;						/* The public cache lock. */
+	DcSegmentRec		cg_segment[IDX_CAC_SEGMENT_COUNT];	/* the hash segments, each with its own lock */
+	XTIndBlockPtr		cg_blocks;	/* array of cg_block_count block records, allocated in xt_ind_init() */
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+	xtWord1				*cg_buffer;	/* page memory as returned by the allocator (kept unaligned so that it can be freed) */
+#endif
+	XTIndBlockPtr		cg_free_list;	/* free blocks, chained through cb_next */
+	xtWord4				cg_free_count;	/* length of cg_free_list */
+	xtWord4				cg_ru_now;						/* A counter as described by Jim Starkey (my thanks) */
+	XTIndBlockPtr		cg_lru_block;	/* least recently used end of the LRU list (its cb_lr_used is NULL) */
+	XTIndBlockPtr		cg_mru_block;	/* most recently used end of the LRU list (its cb_mr_used is NULL) */
+	xtWord4				cg_hash_size;	/* number of hash buckets in each segment */
+	xtWord4				cg_block_count;	/* total number of cache blocks: cache size / XT_INDEX_PAGE_SIZE */
+	xtWord4				cg_max_free;	/* cg_block_count / 10, clamped to the range 8..128 */
+#ifdef DEBUG_CHECK_IND_CACHE
+	u_int				cg_reserved_by_ots;				/* Number of blocks reserved by open tables. */
+	u_int				cg_read_count;					/* Number of blocks being read. */
+#endif
+
+	/* Index cache handles: */
+	DcHandleSlotRec		cg_handle_slot[XT_HANDLE_SLOTS];
+} DcGlobalsRec;
+
+static DcGlobalsRec	ind_cac_globals;	/* the one and only index cache */
+
+#ifdef XT_USE_MYSYS
+#ifdef xtPublic
+#undef xtPublic
+#endif
+#include "my_global.h"
+#include "my_sys.h"
+#include "keycache.h"
+KEY_CACHE my_cache;
+#undef	pthread_rwlock_rdlock
+#undef	pthread_rwlock_wrlock
+#undef	pthread_rwlock_try_wrlock
+#undef	pthread_rwlock_unlock
+#undef	pthread_mutex_lock
+#undef	pthread_mutex_unlock
+#undef	pthread_cond_wait
+#undef	pthread_cond_broadcast
+#undef	xt_mutex_type
+#define xtPublic
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * INDEX CACHE HANDLES
+ */
+
+/* Allocate an index handle with xt_calloc_ns() and set up its spinlock; NULL if the allocation fails. */
+static XTIndHandlePtr ind_alloc_handle()
+{
+	XTIndHandlePtr handle = (XTIndHandlePtr) xt_calloc_ns(sizeof(XTIndHandleRec));
+
+	if (handle)
+		xt_spinlock_init_with_autoname(NULL, &handle->ih_lock);
+	return handle;
+}
+
+static void ind_free_handle(XTIndHandlePtr handle)
+{	/* Counterpart of ind_alloc_handle(): destroy the handle's spinlock, then free its memory. */
+	xt_spinlock_free(NULL, &handle->ih_lock);
+	xt_free_ns(handle);
+}
+
+static void ind_handle_exit(XTThreadPtr self)
+{	/* Tear down every handle slot: release handles still in use, free the cached blocks and handles, free the slot lock. */
+	DcHandleSlotPtr		hs;
+	XTIndHandlePtr		handle;
+	XTIndHandleBlockPtr	hptr;
+
+	for (int i=0; i<XT_HANDLE_SLOTS; i++) {
+		hs = &ind_cac_globals.cg_handle_slot[i];
+
+		while (hs->hs_used_handles) {	/* the loop ends because each release unlinks the list head */
+			handle = hs->hs_used_handles;
+			xt_ind_release_handle(handle, FALSE, self);	/* moves the handle onto hs_free_handles */
+		}
+
+		while (hs->hs_free_blocks) {
+			hptr = hs->hs_free_blocks;
+			hs->hs_free_blocks = hptr->hb_next;
+			xt_free(self, hptr);
+		}
+
+		while (hs->hs_free_handles) {
+			handle = hs->hs_free_handles;
+			hs->hs_free_handles = handle->ih_next;
+			ind_free_handle(handle);
+		}
+
+		ID_HANDLE_FREE_LOCK(self, &hs->hs_handles_lock);
+	}
+}
+
+/* Reset every handle slot to the empty state and create its list lock. */
+static void ind_handle_init(XTThreadPtr self)
+{
+	DcHandleSlotPtr		hs = ind_cac_globals.cg_handle_slot;
+
+	for (int left = XT_HANDLE_SLOTS; left > 0; left--, hs++) {
+		memset(hs, 0, sizeof(DcHandleSlotRec));
+		ID_HANDLE_INIT_LOCK(self, &hs->hs_handles_lock);
+	}
+}
+
+//#define CHECK_HANDLE_STRUCTS
+
+#ifdef CHECK_HANDLE_STRUCTS
+static int gdummy = 0;	/* counts the consistency failures reported so far */
+
+static void ic_stop_here()
+{	/* Called on every failed handle-structure check; a convenient place for a breakpoint. */
+	gdummy = gdummy + 1;
+	printf("Nooo %d!\n", gdummy);
+}
+
+static void ic_check_handle_structs()
+{	/* Debug-only sanity check of the handle lists (compiled only when CHECK_HANDLE_STRUCTS is defined). */
+	XTIndHandlePtr		handle, phandle;
+	XTIndHandleBlockPtr	hptr, phptr;	/* NOTE(review): phptr is never initialised before its first comparison below */
+	int					count = 0;
+	int					ctest;
+
+	phandle = NULL;
+	handle = ind_cac_globals.cg_used_handles;	/* NOTE(review): stale - DcGlobals has no cg_used_handles; the lists now live in cg_handle_slot[] */
+	while (handle) {
+		if (handle == phandle)
+			ic_stop_here();
+		if (handle->ih_prev != phandle)
+			ic_stop_here();
+		if (handle->ih_cache_reference) {
+			ctest = handle->x.ih_cache_block->cb_handle_count;
+			if (ctest == 0 || ctest > 100)
+				ic_stop_here();
+		}
+		else {
+			ctest = handle->x.ih_handle_block->hb_ref_count;
+			if (ctest == 0 || ctest > 100)
+				ic_stop_here();
+		}
+		phandle = handle;
+		handle = handle->ih_next;
+		count++;
+		if (count > 1000)
+			ic_stop_here();
+	}
+
+	count = 0;
+	hptr = ind_cac_globals.cg_free_blocks;	/* NOTE(review): stale - see hs_free_blocks in DcHandleSlot */
+	while (hptr) {
+		if (hptr == phptr)
+			ic_stop_here();
+		phptr = hptr;
+		hptr = hptr->hb_next;
+		count++;
+		if (count > 1000)
+			ic_stop_here();
+	}
+
+	count = 0;
+	handle = ind_cac_globals.cg_free_handles;	/* NOTE(review): stale - see hs_free_handles; phandle still holds the last used handle here */
+	while (handle) {
+		if (handle == phandle)
+			ic_stop_here();
+		phandle = handle;
+		handle = handle->ih_next;
+		count++;
+		if (count > 1000)
+			ic_stop_here();
+	}
+}
+#endif
+
+/*
+ * Get a handle to the index block referenced by 'iref'. This function is called by index scanners (readers).
+ * The reference is always released (XT_UNLOCK_READ) before returning; NULL is returned if no handle can be allocated.
+ */
+xtPublic XTIndHandlePtr xt_ind_get_handle(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref)
+{
+	DcHandleSlotPtr	hs;
+	XTIndHandlePtr	handle;
+
+	hs = &ind_cac_globals.cg_handle_slot[iref->ir_block->cb_address % XT_HANDLE_SLOTS];
+
+	ASSERT_NS(iref->ir_xlock == FALSE);
+	ASSERT_NS(iref->ir_updated == FALSE);
+	ID_HANDLE_LOCK(&hs->hs_handles_lock);
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+	if ((handle = hs->hs_free_handles))	/* reuse a recycled handle if the slot has one */
+		hs->hs_free_handles = handle->ih_next;
+	else {
+		if (!(handle = ind_alloc_handle())) {
+			ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+			xt_ind_release(ot, ind, XT_UNLOCK_READ, iref);
+			return NULL;
+		}
+	}
+	if (hs->hs_used_handles)	/* push the handle onto the front of the used list */
+		hs->hs_used_handles->ih_prev = handle;
+	handle->ih_next = hs->hs_used_handles;
+	handle->ih_prev = NULL;
+	handle->ih_address = iref->ir_block->cb_address;
+	handle->ih_cache_reference = TRUE;	/* the handle points directly into the cache block, not at a copy */
+	handle->x.ih_cache_block = iref->ir_block;
+	handle->ih_branch = iref->ir_branch;
+	/* {HANDLE-COUNT-USAGE}
+	 * This is safe because:
+	 *
+	 * I have an Slock on the cache block, and I have
+	 * at least an Slock on the index.
+	 * So this excludes anyone who is reading
+	 * cb_handle_count in the index.
+	 * (all cache block writers, and the freer).
+	 *
+	 * The increment is safe because I have the list
+	 * lock (hs_handles_lock), which is required by anyone else
+	 * who increments or decrements this value.
+	 */
+	iref->ir_block->cb_handle_count++;
+	hs->hs_used_handles = handle;
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+	ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, iref);
+	return handle;
+}
+
+xtPublic void xt_ind_release_handle(XTIndHandlePtr handle, xtBool have_lock, XTThreadPtr thread)
+{	/* Give back a handle: drop its reference, unlink it from the slot's used list and put it on the slot's free list. */
+	DcHandleSlotPtr	hs;
+	XTIndBlockPtr	block = NULL;
+	u_int			hash_idx = 0;
+	DcSegmentPtr	seg = NULL;
+	XTIndBlockPtr	xblock;
+
+	/* The lock order is:
+	 * 1. Cache segment (cs_lock) - This is only by ind_free_block()!
+	 * 1. S/Slock cache block (cb_lock)
+	 * 2. List lock (hs_handles_lock).
+	 * 3. Handle lock (ih_lock)
+	 */
+	if (!have_lock)	/* if have_lock is set the caller already holds ih_lock; either way it is released below */
+		xt_spinlock_lock(&handle->ih_lock);
+
+	/* Get the lock on the cache page if required: */
+	if (handle->ih_cache_reference) {
+		u_int			file_id;
+		xtIndexNodeID	address;
+
+		block = handle->x.ih_cache_block;
+
+		file_id = block->cb_file_id;
+		address = block->cb_address;
+		hash_idx = XT_NODE_ID(address) + (file_id * 223);	/* same hash as used by ind_free_block() */
+		seg = &ind_cac_globals.cg_segment[hash_idx & IDX_CAC_SEGMENT_MASK];
+		hash_idx = (hash_idx >> XT_INDEX_CACHE_SEGMENT_SHIFTS) % ind_cac_globals.cg_hash_size;
+	}
+
+	xt_spinlock_unlock(&handle->ih_lock);
+
+	/* Because of the lock order, I have to release the
+	 * handle before I get a lock on the cache block.
+	 *
+	 * But, by doing this, this cache block may be gone!
+	 */
+	if (block) {
+		IDX_CAC_READ_LOCK(seg, thread);
+		xblock = seg->cs_hash_table[hash_idx];
+		while (xblock) {
+			if (block == xblock) {
+				/* Found the block...
+				 * {HANDLE-COUNT-SLOCK}
+				 * 04.05.2009, changed to slock.
+				 * The xlock causes too much contention
+				 * on the cache block for read only loads.
+				 *
+				 * Is it safe?
+				 * See below...
+				 */
+				XT_IPAGE_READ_LOCK(&block->cb_lock);
+				goto block_found;
+			}
+			xblock = xblock->cb_next;
+		}
+		block = NULL;	/* the block is no longer in its hash chain */
+		block_found:
+		IDX_CAC_UNLOCK(seg, thread);
+	}
+
+	hs = &ind_cac_globals.cg_handle_slot[handle->ih_address % XT_HANDLE_SLOTS];
+
+	ID_HANDLE_LOCK(&hs->hs_handles_lock);
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+
+	/* I don't need to lock the handle because I have locked
+	 * the list, and no other thread can change the
+	 * handle without first getting a lock on the list.
+	 *
+	 * In addition, the caller is the only owner of the
+	 * handle, and the only thread with an independent
+	 * reference to the handle.
+	 * All other accesses occur over the list.
+	 */
+
+	/* Remove the reference to the cache or a handle block: */
+	if (handle->ih_cache_reference) {	/* re-read under the list lock: xt_ind_copy_on_write() may have cleared it meanwhile */
+		ASSERT_NS(block == handle->x.ih_cache_block);
+		ASSERT_NS(block && block->cb_handle_count > 0);
+		/* {HANDLE-COUNT-USAGE}
+		 * This is safe here because I have excluded
+		 * all readers by taking an Xlock on the
+		 * cache block (CHANGED - see below).
+		 *
+		 * {HANDLE-COUNT-SLOCK}
+		 * 04.05.2009, changed to slock.
+		 * Should be OK, because:
+		 * I have a lock on the list (hs_handles_lock),
+		 * which prevents concurrent updates to cb_handle_count.
+		 *
+		 * I also have a read lock on the cache block
+		 * but not a lock on the index. As a result, we cannot
+		 * exclude all index writers (and readers of
+		 * cb_handle_count).
+		 */
+		block->cb_handle_count--;
+	}
+	else {
+		XTIndHandleBlockPtr	hptr = handle->x.ih_handle_block;
+
+		ASSERT_NS(!handle->ih_cache_reference);
+		ASSERT_NS(hptr->hb_ref_count > 0);
+		hptr->hb_ref_count--;
+		if (!hptr->hb_ref_count) {
+			/* Put it back on the free list: */
+			hptr->hb_next = hs->hs_free_blocks;
+			hs->hs_free_blocks = hptr;
+		}
+	}
+
+	/* Unlink the handle: */
+	if (handle->ih_next)
+		handle->ih_next->ih_prev = handle->ih_prev;
+	if (handle->ih_prev)
+		handle->ih_prev->ih_next = handle->ih_next;
+	if (hs->hs_used_handles == handle)
+		hs->hs_used_handles = handle->ih_next;
+
+	/* Put it on the free list: */
+	handle->ih_next = hs->hs_free_handles;
+	hs->hs_free_handles = handle;
+
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+	ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+
+	if (block)	/* drop the read lock taken on the cache block above */
+		XT_IPAGE_UNLOCK(&block->cb_lock, FALSE);
+}
+
+/* Call this function before a referenced cache block is modified! It is called by index updaters.
+ * If handles reference the block, the branch is copied into a handle block and those handles are switched
+ * to the copy. Returns OK, or FAILED if memory for the copy cannot be allocated. */
+xtPublic xtBool xt_ind_copy_on_write(XTIndReferencePtr iref)
+{
+	DcHandleSlotPtr		hs;
+	XTIndHandleBlockPtr	hptr;
+	u_int				branch_size;
+	XTIndHandlePtr		handle;
+	u_int				i = 0;	/* number of handles switched to the copy so far */
+
+	hs = &ind_cac_globals.cg_handle_slot[iref->ir_block->cb_address % XT_HANDLE_SLOTS];
+
+	ID_HANDLE_LOCK(&hs->hs_handles_lock);
+
+	/* {HANDLE-COUNT-USAGE}
+	 * This is only called by updaters of this index block, or
+	 * the free which holds an Xlock on the index block.
+	 * These are all mutually exclusive for the index block.
+	 *
+	 * {HANDLE-COUNT-SLOCK}
+	 * Do this check again, after we have the list lock (hs_handles_lock).
+	 * There is a small chance that the count has changed, since we last
+	 * checked because xt_ind_release_handle() only holds
+	 * an slock on the index page.
+	 *
+	 * An updater can sometimes have a XLOCK on the index and an slock
+	 * on the cache block. In this case xt_ind_release_handle()
+	 * could have run through.
+	 */
+	if (!iref->ir_block->cb_handle_count) {	/* nobody references the block: nothing to copy */
+		ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+		return OK;
+	}
+
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+	if ((hptr = hs->hs_free_blocks))	/* reuse a handle block from the slot's free list if possible */
+		hs->hs_free_blocks = hptr->hb_next;
+	else {
+		if (!(hptr = (XTIndHandleBlockPtr) xt_malloc_ns(sizeof(XTIndHandleBlockRec)))) {
+			ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+			return FAILED;
+		}
+	}
+
+	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref->ir_branch->tb_size_2));
+	memcpy(&hptr->hb_branch, iref->ir_branch, branch_size);
+	hptr->hb_ref_count = iref->ir_block->cb_handle_count;	/* the copy inherits every reference the cache block had */
+
+	handle = hs->hs_used_handles;
+	while (handle) {
+		if (handle->ih_branch == iref->ir_branch) {
+			i++;
+			xt_spinlock_lock(&handle->ih_lock);
+			ASSERT_NS(handle->ih_cache_reference);
+			handle->ih_cache_reference = FALSE;
+			handle->x.ih_handle_block = hptr;
+			handle->ih_branch = &hptr->hb_branch;
+			xt_spinlock_unlock(&handle->ih_lock);
+#ifndef DEBUG
+			if (i == hptr->hb_ref_count)	/* all referencing handles found: stop early (DEBUG builds scan the whole list) */
+				break;
+#endif
+		}
+		handle = handle->ih_next;
+	}
+#ifdef DEBUG
+	ASSERT_NS(hptr->hb_ref_count == i);
+#endif
+	/* {HANDLE-COUNT-USAGE}
+	 * It is safe to modify cb_handle_count when I have the
+	 * list lock, and I have excluded all readers!
+	 */
+	iref->ir_block->cb_handle_count = 0;
+#ifdef CHECK_HANDLE_STRUCTS
+	ic_check_handle_structs();
+#endif
+	ID_HANDLE_UNLOCK(&hs->hs_handles_lock);
+
+	return OK;
+}
+
+xtPublic void xt_ind_lock_handle(XTIndHandlePtr handle)
+{	/* Take the handle's own spinlock (ih_lock). */
+	xt_spinlock_lock(&handle->ih_lock);
+}
+
+xtPublic void xt_ind_unlock_handle(XTIndHandlePtr handle)
+{	/* Release the spinlock taken by xt_ind_lock_handle(). */
+	xt_spinlock_unlock(&handle->ih_lock);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INIT/EXIT
+ */
+
+/*
+ * Initialize the disk cache for 'cache_size' bytes of index pages: segments, block records, free list and
+ * handle slots. On any exception the partly built cache is torn down with xt_ind_exit() and the error re-thrown.
+ */
+xtPublic void xt_ind_init(XTThreadPtr self, size_t cache_size)
+{
+	XTIndBlockPtr	block;
+
+#ifdef XT_USE_MYSYS
+	init_key_cache(&my_cache, 1024, cache_size, 100, 300);
+#endif
+	/* Memory is devoted to the page data alone; the directory and the page overhead are not counted: */
+	ind_cac_globals.cg_block_count = cache_size / XT_INDEX_PAGE_SIZE;
+	ind_cac_globals.cg_hash_size = ind_cac_globals.cg_block_count / (IDX_CAC_SEGMENT_COUNT >> 1);
+	if (!ind_cac_globals.cg_hash_size) ind_cac_globals.cg_hash_size = 1;	/* tiny cache: keep '% cg_hash_size' defined */
+	ind_cac_globals.cg_max_free = ind_cac_globals.cg_block_count / 10;
+	if (ind_cac_globals.cg_max_free < 8)
+		ind_cac_globals.cg_max_free = 8;
+	if (ind_cac_globals.cg_max_free > 128)
+		ind_cac_globals.cg_max_free = 128;
+
+	try_(a) {
+		for (u_int i=0; i<IDX_CAC_SEGMENT_COUNT; i++) {
+			ind_cac_globals.cg_segment[i].cs_hash_table = (XTIndBlockPtr *) xt_calloc(self, ind_cac_globals.cg_hash_size * sizeof(XTIndBlockPtr));
+			IDX_CAC_INIT_LOCK(self, &ind_cac_globals.cg_segment[i]);
+		}
+
+		block = (XTIndBlockPtr) xt_malloc(self, ind_cac_globals.cg_block_count * sizeof(XTIndBlockRec));
+		ind_cac_globals.cg_blocks = block;
+		xt_init_mutex_with_autoname(self, &ind_cac_globals.cg_lock);
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+		xtWord1 *buffer;
+#ifdef XT_WIN
+		size_t	psize = 512;
+#else
+		size_t	psize = getpagesize();
+#endif
+		size_t	diff;
+
+		/* The page memory must start on a 'psize' boundary; if the first attempt is not aligned, over-allocate and skip 'diff' bytes. */
+		buffer = (xtWord1 *) xt_malloc(self, (ind_cac_globals.cg_block_count * XT_INDEX_PAGE_SIZE));
+		diff = (size_t) buffer % psize;
+		if (diff != 0) {
+			xt_free(self, buffer);
+			buffer = (xtWord1 *) xt_malloc(self, (ind_cac_globals.cg_block_count * XT_INDEX_PAGE_SIZE) + psize);
+			diff = (size_t) buffer % psize;
+			if (diff != 0)
+				diff = psize - diff;
+		}
+		ind_cac_globals.cg_buffer = buffer;
+		buffer += diff;
+#endif
+		/* Put every block on the free list: */
+		for (u_int i=0; i<ind_cac_globals.cg_block_count; i++) {
+			XT_IPAGE_INIT_LOCK(self, &block->cb_lock);
+			block->cb_state = IDX_CAC_BLOCK_FREE;
+			block->cb_next = ind_cac_globals.cg_free_list;
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+			block->cb_data = buffer;
+			buffer += XT_INDEX_PAGE_SIZE;
+#endif
+#ifdef CHECK_BLOCK_TRAILERS
+			XT_SET_DISK_4(block->cp_check, 0xDEADBEEF);
+#endif
+			ind_cac_globals.cg_free_list = block;
+			block++;
+		}
+		ind_cac_globals.cg_free_count = ind_cac_globals.cg_block_count;
+#ifdef DEBUG_CHECK_IND_CACHE
+		ind_cac_globals.cg_reserved_by_ots = 0;
+#endif
+		ind_handle_init(self);
+	}
+	catch_(a) {
+		xt_ind_exit(self);
+		throw_();
+	}
+	cont_(a);
+}
+
+#ifdef CHECK_BLOCK_TRAILERS
+/* Debug aid: assert that the 0xDEADBEEF trailer of every cache block is intact. */
+xtPublic void check_block_trailers()
+{
+	XTIndBlockPtr	block = ind_cac_globals.cg_blocks;
+	XTIndBlockPtr	last = block + ind_cac_globals.cg_block_count;
+
+	for (; block != last; block++) {
+		ASSERT_NS(XT_GET_DISK_4(block->cp_check) == 0xDEADBEEF);
+	}
+}
+#endif
+
+xtPublic void xt_ind_exit(XTThreadPtr self)
+{
+#ifdef XT_USE_MYSYS
+	end_key_cache(&my_cache, 1);
+#endif
+	/* Release the handles first: xt_ind_release_handle() still takes the segment locks,
+	 * walks the segment hash tables and touches the cache blocks, so all must still exist. */
+	ind_handle_exit(self);
+	for (u_int i=0; i<IDX_CAC_SEGMENT_COUNT; i++) {
+		if (ind_cac_globals.cg_segment[i].cs_hash_table) {
+			xt_free(self, ind_cac_globals.cg_segment[i].cs_hash_table);
+			ind_cac_globals.cg_segment[i].cs_hash_table = NULL;
+			IDX_CAC_FREE_LOCK(self, &ind_cac_globals.cg_segment[i]);
+		}
+	}
+
+	if (ind_cac_globals.cg_blocks) {
+		xt_free(self, ind_cac_globals.cg_blocks);
+		ind_cac_globals.cg_blocks = NULL;
+		xt_free_mutex(&ind_cac_globals.cg_lock);
+	}
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+	if (ind_cac_globals.cg_buffer) {
+		xt_free(self, ind_cac_globals.cg_buffer);
+		ind_cac_globals.cg_buffer = NULL;
+	}
+#endif
+
+	memset(&ind_cac_globals, 0, sizeof(ind_cac_globals));	/* leave the globals in the never-initialised state */
+}
+
+/* Bytes of index cache currently in use: (total blocks - free blocks) * page size. */
+xtPublic xtInt8 xt_ind_get_usage()
+{
+	xtInt8 used_blocks = (xtInt8) (ind_cac_globals.cg_block_count - ind_cac_globals.cg_free_count);
+
+	return used_blocks * (xtInt8) XT_INDEX_PAGE_SIZE;
+}
+
+/* Total capacity of the index cache in bytes: block count * page size. */
+xtPublic xtInt8 xt_ind_get_size()
+{
+	xtInt8 total_blocks = (xtInt8) ind_cac_globals.cg_block_count;
+
+	return total_blocks * (xtInt8) XT_INDEX_PAGE_SIZE;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INDEX CHECKING
+ */
+
+xtPublic void xt_ind_check_cache(XTIndexPtr ind)
+{	/* Consistency check of an index's dirty list (if 'ind' is given), the global free list and the LRU list. */
+	XTIndBlockPtr	block;
+	u_int			free_count, inuse_count, clean_count;
+	xtBool			check_count = FALSE;
+
+	if (ind == (XTIndex *) 1) {	/* the magic value 1 means: no index, but report a nearly exhausted cache at the end */
+		ind = NULL;
+		check_count = TRUE;
+	}
+
+	// Check the dirty list:
+	if (ind) {
+		u_int cnt = 0;
+
+		block = ind->mi_dirty_list;
+		while (block) {
+			cnt++;
+			ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_DIRTY);
+			block = block->cb_dirty_next;
+		}
+		ASSERT_NS(ind->mi_dirty_blocks == cnt);
+	}
+
+	xt_lock_mutex_ns(&ind_cac_globals.cg_lock);
+
+	// Check the free list:
+	free_count = 0;
+	block = ind_cac_globals.cg_free_list;
+	while (block) {
+		free_count++;
+		ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_FREE);
+		block = block->cb_next;
+	}
+	ASSERT_NS(ind_cac_globals.cg_free_count == free_count);
+
+	/* Check the LRU list: */
+	XTIndBlockPtr list_block, plist_block;
+	
+	plist_block = NULL;
+	list_block = ind_cac_globals.cg_lru_block;
+	if (list_block) {
+		ASSERT_NS(ind_cac_globals.cg_mru_block != NULL);
+		ASSERT_NS(ind_cac_globals.cg_mru_block->cb_mr_used == NULL);
+		ASSERT_NS(list_block->cb_lr_used == NULL);
+		inuse_count = 0;
+		clean_count = 0;
+		while (list_block) {	/* walk from the LRU end towards the MRU end */
+			inuse_count++;
+			ASSERT_NS(list_block->cb_state == IDX_CAC_BLOCK_DIRTY || list_block->cb_state == IDX_CAC_BLOCK_CLEAN);
+			if (list_block->cb_state == IDX_CAC_BLOCK_CLEAN)
+				clean_count++;
+			ASSERT_NS(block != list_block);	/* NOTE(review): 'block' is always NULL here (the free-list walk above ran to its end), so this cannot fail */
+			ASSERT_NS(list_block->cb_lr_used == plist_block);
+			plist_block = list_block;
+			list_block = list_block->cb_mr_used;
+		}
+		ASSERT_NS(ind_cac_globals.cg_mru_block == plist_block);
+	}
+	else {
+		inuse_count = 0;
+		clean_count = 0;
+		ASSERT_NS(ind_cac_globals.cg_mru_block == NULL);
+	}
+
+#ifdef DEBUG_CHECK_IND_CACHE
+	ASSERT_NS(free_count + inuse_count + ind_cac_globals.cg_reserved_by_ots + ind_cac_globals.cg_read_count == ind_cac_globals.cg_block_count);
+#endif
+	xt_unlock_mutex_ns(&ind_cac_globals.cg_lock);
+	if (check_count) {
+		/* We have just flushed, check how much is now free/clean. */
+		if (free_count + clean_count < 10) {
+			/* This could be a problem: */
+			printf("Cache very low!\n");
+		}
+	}
+}
+
+#ifdef XXXXDEBUG
+static void ind_cac_check_on_dirty_list(DcSegmentPtr seg, XTIndBlockPtr block)
+{	/* Disabled debug check (XXXXDEBUG): assert that 'block' is on the dirty list selected by its file id. */
+	XTIndBlockPtr	list_block, plist_block;
+	xtBool		found = FALSE;
+	
+	plist_block = NULL;
+	list_block = seg->cs_dirty_list[block->cb_file_id % XT_INDEX_CACHE_FILE_SLOTS];	/* NOTE(review): DcSegment as declared in this file has no cs_dirty_list member */
+	while (list_block) {
+		ASSERT_NS(list_block->cb_state == IDX_CAC_BLOCK_DIRTY);
+		ASSERT_NS(list_block->cb_dirty_prev == plist_block);
+		if (list_block == block)
+			found = TRUE;
+		plist_block = list_block;
+		list_block = list_block->cb_dirty_next;
+	}
+	ASSERT_NS(found);
+}
+
+static void ind_cac_check_dirty_list(DcSegmentPtr seg, XTIndBlockPtr block)
+{	/* Disabled debug check (XXXXDEBUG): assert that 'block' is on none of the segment's dirty lists. */
+	XTIndBlockPtr list_block, plist_block;
+	
+	for (u_int j=0; j<XT_INDEX_CACHE_FILE_SLOTS; j++) {
+		plist_block = NULL;
+		list_block = seg->cs_dirty_list[j];	/* NOTE(review): DcSegment as declared in this file has no cs_dirty_list member */
+		while (list_block) {
+			ASSERT_NS(list_block->cb_state == IDX_CAC_BLOCK_DIRTY);
+			ASSERT_NS(block != list_block);
+			ASSERT_NS(list_block->cb_dirty_prev == plist_block);
+			plist_block = list_block;
+			list_block = list_block->cb_dirty_next;
+		}
+	}
+}
+
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * FREEING INDEX CACHE
+ */
+
+/*
+ * This function return TRUE if the block is freed. 
+ * This function returns FALSE if the block cannot be found, or the
+ * block is not clean.
+ *
+ * We also return FALSE if we cannot copy the block to the handle
+ * (if this is required). This will be due to out-of-memory!
+ */
+static xtBool ind_free_block(XTOpenTablePtr ot, XTIndBlockPtr block)
+{
+	XTIndBlockPtr	xblock, pxblock;	/* Hash chain cursor and its predecessor. */
+	u_int			hash_idx;
+	u_int			file_id;
+	xtIndexNodeID	address;
+	DcSegmentPtr	seg;
+
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	file_id = block->cb_file_id;
+	address = block->cb_address;
+
+	hash_idx = XT_NODE_ID(address) + (file_id * 223);
+	seg = &ind_cac_globals.cg_segment[hash_idx & IDX_CAC_SEGMENT_MASK];
+	hash_idx = (hash_idx >> XT_INDEX_CACHE_SEGMENT_SHIFTS) % ind_cac_globals.cg_hash_size;
+
+	IDX_CAC_WRITE_LOCK(seg, ot->ot_thread);
+
+	pxblock = NULL;
+	xblock = seg->cs_hash_table[hash_idx];
+	while (xblock) {
+		if (block == xblock) {
+			/* Found the block... */
+			/* It is possible that a thread enters this code holding a
+			 * lock on a page. This can cause a deadlock:
+			 *
+			 * #0	0x91faa2ce in semaphore_wait_signal_trap
+			 * #1	0x91fb1da5 in pthread_mutex_lock
+			 * #2	0x00e2ec13 in xt_p_mutex_lock at pthread_xt.cc:544
+			 * #3	0x00e6c30a in xt_xsmutex_xlock at lock_xt.cc:1547
+			 * #4	0x00dee402 in ind_free_block at cache_xt.cc:879
+			 * #5	0x00dee76a in ind_cac_free_lru_blocks at cache_xt.cc:1033
+			 * #6	0x00def8d1 in xt_ind_reserve at cache_xt.cc:1513
+			 * #7	0x00e22118 in xt_idx_insert at index_xt.cc:2047
+			 * #8	0x00e4d7ee in xt_tab_new_record at table_xt.cc:4702
+			 * #9	0x00e0ff0b in ha_pbxt::write_row at ha_pbxt.cc:2340
+			 * #10	0x0023a00f in handler::ha_write_row at handler.cc:4570
+			 * #11	0x001a32c8 in write_record at sql_insert.cc:1568
+			 * #12	0x001ab635 in mysql_insert at sql_insert.cc:812
+			 * #13	0x0010e068 in mysql_execute_command at sql_parse.cc:3066
+			 * #14	0x0011480d in mysql_parse at sql_parse.cc:5787
+			 * #15	0x00115afb in dispatch_command at sql_parse.cc:1200
+			 * #16	0x00116de2 in do_command at sql_parse.cc:857
+			 * #17	0x00101ee4 in handle_one_connection at sql_connect.cc:1115
+			 * #18	0x91fdb155 in _pthread_start
+			 * #19	0x91fdb012 in thread_start
+			 * 
+			 * #0	0x91fb146e in __semwait_signal
+			 * #1	0x91fb12ef in nanosleep$UNIX2003
+			 * #2	0x91fb1236 in usleep$UNIX2003
+			 * #3	0x00e52112 in xt_yield at thread_xt.cc:1274
+			 * #4	0x00e6c0eb in xt_spinxslock_xlock at lock_xt.cc:1456
+			 * #5	0x00dee444 in ind_free_block at cache_xt.cc:886
+			 * #6	0x00dee76a in ind_cac_free_lru_blocks at cache_xt.cc:1033
+			 * #7	0x00deeaf0 in ind_cac_fetch at cache_xt.cc:1130
+			 * #8	0x00def604 in xt_ind_fetch at cache_xt.cc:1386
+			 * #9	0x00e2159a in xt_idx_update_row_id at index_xt.cc:2489
+			 * #10	0x00e603c8 in xn_sw_clean_indices at xaction_xt.cc:1932
+			 * #11	0x00e606d4 in xn_sw_cleanup_variation at xaction_xt.cc:2056
+			 * #12	0x00e60e29 in xn_sw_cleanup_xact at xaction_xt.cc:2276
+			 * #13	0x00e615ed in xn_sw_main at xaction_xt.cc:2433
+			 * #14	0x00e61919 in xn_sw_run_thread at xaction_xt.cc:2564
+			 * #15	0x00e53f80 in thr_main at thread_xt.cc:1017
+			 * #16	0x91fdb155 in _pthread_start
+			 * #17	0x91fdb012 in thread_start
+			 *
+			 * So we back off if a lock is held!
+			 */
+			if (!XT_IPAGE_WRITE_TRY_LOCK(&block->cb_lock, ot->ot_thread->t_id)) {
+				IDX_CAC_UNLOCK(seg, ot->ot_thread);
+#ifdef DEBUG_CHECK_IND_CACHE
+				xt_ind_check_cache(NULL);
+#endif
+				return FALSE;
+			}
+			if (block->cb_state != IDX_CAC_BLOCK_CLEAN) {
+				/* This block cannot be freed: */
+				XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
+				IDX_CAC_UNLOCK(seg, ot->ot_thread);
+#ifdef DEBUG_CHECK_IND_CACHE
+				xt_ind_check_cache(NULL);
+#endif
+				return FALSE;
+			}
+
+			goto free_the_block;
+		}
+		pxblock = xblock;
+		xblock = xblock->cb_next;
+	}
+
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);
+
+	/* Not found (this can happen, if block was freed by another thread) */
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	return FALSE;
+
+	free_the_block:
+
+	/* If the block is referenced by a handle, then we have to copy the
+	 * data to the handle before we free the page. If the copy fails, BOTH
+	 * the page lock and the segment lock must be released before returning.
+	 */
+	/* {HANDLE-COUNT-USAGE}
+	 * This access is safe because:
+	 *
+	 * We have an Xlock on the cache block, which excludes
+	 * all other writers that want to change the cache block
+	 * and also all readers of the cache block, because
+	 * they all have at least an Slock on the cache block.
+	 */
+	if (block->cb_handle_count) {
+		XTIndReferenceRec	iref;
+		iref.ir_xlock = TRUE;
+		iref.ir_updated = FALSE;
+		iref.ir_block = block;
+		iref.ir_branch = (XTIdxBranchDPtr) block->cb_data;
+		if (!xt_ind_copy_on_write(&iref)) {
+			XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
+			IDX_CAC_UNLOCK(seg, ot->ot_thread);
+			return FALSE;
+		}
+	}
+
+	/* Block is clean, remove from the hash table: */
+	if (pxblock)
+		pxblock->cb_next = block->cb_next;
+	else
+		seg->cs_hash_table[hash_idx] = block->cb_next;
+
+	xt_lock_mutex_ns(&ind_cac_globals.cg_lock);
+
+	/* Remove from the MRU list: */
+	if (ind_cac_globals.cg_lru_block == block)
+		ind_cac_globals.cg_lru_block = block->cb_mr_used;
+	if (ind_cac_globals.cg_mru_block == block)
+		ind_cac_globals.cg_mru_block = block->cb_lr_used;
+
+	/* Note, I am updating blocks for which I have no lock
+	 * here. But I think this is OK because I have a lock
+	 * for the MRU list.
+	 */
+	if (block->cb_lr_used)
+		block->cb_lr_used->cb_mr_used = block->cb_mr_used;
+	if (block->cb_mr_used)
+		block->cb_mr_used->cb_lr_used = block->cb_lr_used;
+
+	/* The block is now free: */
+	block->cb_next = ind_cac_globals.cg_free_list;
+	ind_cac_globals.cg_free_list = block;
+	ind_cac_globals.cg_free_count++;
+	block->cb_state = IDX_CAC_BLOCK_FREE;
+	IDX_TRACE("%d- f%x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(block->cb_data));
+
+	/* Unlock BEFORE the block is reused! */
+	XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
+
+	xt_unlock_mutex_ns(&ind_cac_globals.cg_lock);
+
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);
+
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	return TRUE;
+}
+
+#define IND_CACHE_MAX_BLOCKS_TO_FREE		100
+
+/*
+ * Return the number of blocks freed.
+ *
+ * The idea is to grab a list of blocks to free. The list
+ * consists of the LRU blocks that are clean.
+ *
+ * Free as many as possible from the list, even if
+ * the LRU position has changed (or we have a race
+ * if there are too few blocks). However, if the
+ * block cannot be found, is dirty or is locked,
+ * ind_free_block() skips it.
+ *
+ * Repeat until we find no blocks for the list, or we
+ * have freed 'blocks_required' and the free list holds
+ * at least cg_max_free + 'blocks_required' blocks.
+ *
+ * 'not_this' (may be NULL) is the data of a block that must
+ * not be freed because it is locked by the calling thread!
+ */
+static u_int ind_cac_free_lru_blocks(XTOpenTablePtr ot, u_int blocks_required, XTIdxBranchDPtr not_this)
+{
+	register DcGlobalsRec	*dcg = &ind_cac_globals;
+	XTIndBlockPtr			to_free[IND_CACHE_MAX_BLOCKS_TO_FREE];
+	int						count;
+	XTIndBlockPtr			block;
+	u_int					blocks_freed = 0;
+	XTIndBlockPtr			locked_block;	/* Block that owns 'not_this', or NULL. */
+
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+#error This will not work!
+#endif
+	locked_block = not_this ? (XTIndBlockPtr) ((xtWord1 *) not_this - offsetof(XTIndBlockRec, cb_data)) : NULL;
+
+	retry:
+	xt_lock_mutex_ns(&dcg->cg_lock);
+	block = dcg->cg_lru_block;
+	count = 0;
+	while (block && count < IND_CACHE_MAX_BLOCKS_TO_FREE) {
+		if (block != locked_block && block->cb_state == IDX_CAC_BLOCK_CLEAN) {
+			to_free[count] = block;
+			count++;
+		}
+		block = block->cb_mr_used;
+	}
+	xt_unlock_mutex_ns(&dcg->cg_lock);
+
+	if (!count)
+		return blocks_freed;
+
+	for (int i=0; i<count; i++) {
+		if (ind_free_block(ot, to_free[i]))
+			blocks_freed++;
+		if (blocks_freed >= blocks_required &&
+			dcg->cg_free_count >= dcg->cg_max_free + blocks_required)
+			return blocks_freed;
+	}
+
+	goto retry;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MAIN CACHE FUNCTIONS
+ */
+
+/*
+ * Fetch the block, reading it from disk only if 'read_data' is set (a block that is about to be
+ * overwritten need not be read). Returns with the segment lock held (*ret_seg), or NULL with no lock held.
+ */
+static XTIndBlockPtr ind_cac_fetch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, DcSegmentPtr *ret_seg, xtBool read_data)
+{
+	register XTOpenFilePtr	file = ot->ot_ind_file;
+	register XTIndBlockPtr	block, new_block;
+	register DcSegmentPtr	seg;
+	register u_int			hash_idx;
+	register DcGlobalsRec	*dcg = &ind_cac_globals;
+	size_t					red_size;
+
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	/* Address, plus file ID multiplied by my favorite prime number! */
+	hash_idx = XT_NODE_ID(address) + (file->fr_id * 223);
+	seg = &dcg->cg_segment[hash_idx & IDX_CAC_SEGMENT_MASK];
+	hash_idx = (hash_idx >> XT_INDEX_CACHE_SEGMENT_SHIFTS) % dcg->cg_hash_size;
+
+	IDX_CAC_READ_LOCK(seg, ot->ot_thread);
+	block = seg->cs_hash_table[hash_idx];
+	while (block) {
+		if (XT_NODE_ID(block->cb_address) == XT_NODE_ID(address) && block->cb_file_id == file->fr_id) {
+			ASSERT_NS(block->cb_state != IDX_CAC_BLOCK_FREE);
+
+			/* Check how recently this page has been used: */
+			if (XT_TIME_DIFF(block->cb_ru_time, dcg->cg_ru_now) > (dcg->cg_block_count >> 1)) {
+				xt_lock_mutex_ns(&dcg->cg_lock);
+
+				/* Move to the front of the MRU list: */
+				block->cb_ru_time = ++dcg->cg_ru_now;
+				if (dcg->cg_mru_block != block) {
+					/* Remove from the MRU list: */
+					if (dcg->cg_lru_block == block)
+						dcg->cg_lru_block = block->cb_mr_used;
+					if (block->cb_lr_used)
+						block->cb_lr_used->cb_mr_used = block->cb_mr_used;
+					if (block->cb_mr_used)
+						block->cb_mr_used->cb_lr_used = block->cb_lr_used;
+
+					/* Make the block the most recently used: */
+					if ((block->cb_lr_used = dcg->cg_mru_block))
+						dcg->cg_mru_block->cb_mr_used = block;
+					block->cb_mr_used = NULL;
+					dcg->cg_mru_block = block;
+					if (!dcg->cg_lru_block)
+						dcg->cg_lru_block = block;
+				}
+
+				xt_unlock_mutex_ns(&dcg->cg_lock);
+			}
+			/* Cache hit: return with the segment read lock still held. */
+			*ret_seg = seg;
+#ifdef DEBUG_CHECK_IND_CACHE
+			xt_ind_check_cache(NULL);
+#endif
+			ot->ot_thread->st_statistics.st_ind_cache_hit++;
+			return block;
+		}
+		block = block->cb_next;
+	}
+
+	/* Block not found... */
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);
+
+	/* Check the open table reserve list first: */
+	if ((new_block = ot->ot_ind_res_bufs)) {
+		ot->ot_ind_res_bufs = new_block->cb_next;
+		ot->ot_ind_res_count--;
+#ifdef DEBUG_CHECK_IND_CACHE
+		xt_lock_mutex_ns(&dcg->cg_lock);
+		dcg->cg_reserved_by_ots--;
+		dcg->cg_read_count++;
+		xt_unlock_mutex_ns(&dcg->cg_lock);
+#endif
+		goto use_free_block;
+	}
+
+	free_some_blocks:
+	if (!dcg->cg_free_list) {	/* Unlocked peek; re-checked under cg_lock below. */
+		if (!ind_cac_free_lru_blocks(ot, 1, NULL)) {
+			if (!dcg->cg_free_list) {
+				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_INDEX_CACHE);
+#ifdef DEBUG_CHECK_IND_CACHE
+				xt_ind_check_cache(NULL);
+#endif
+				return NULL;
+			}
+		}
+	}
+
+	/* Get a free block: */
+	xt_lock_mutex_ns(&dcg->cg_lock);
+	if (!(new_block = dcg->cg_free_list)) {
+		xt_unlock_mutex_ns(&dcg->cg_lock);
+		goto free_some_blocks;
+	}
+	ASSERT_NS(new_block->cb_state == IDX_CAC_BLOCK_FREE);
+	dcg->cg_free_list = new_block->cb_next;
+	dcg->cg_free_count--;
+#ifdef DEBUG_CHECK_IND_CACHE
+	dcg->cg_read_count++;
+#endif
+	xt_unlock_mutex_ns(&dcg->cg_lock);
+
+	use_free_block:
+	new_block->cb_address = address;
+	new_block->cb_file_id = file->fr_id;
+	new_block->cb_state = IDX_CAC_BLOCK_CLEAN;
+	new_block->cb_handle_count = 0;
+	new_block->cp_flush_seq = 0;
+	new_block->cp_del_count = 0;
+	new_block->cb_dirty_next = NULL;
+	new_block->cb_dirty_prev = NULL;
+
+	if (read_data) {
+		if (!xt_pread_file(file, xt_ind_node_to_offset(ot->ot_table, address), XT_INDEX_PAGE_SIZE, 0, new_block->cb_data, &red_size, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread)) {
+			xt_lock_mutex_ns(&dcg->cg_lock);	/* Read failed: put the block back on the free list. */
+			new_block->cb_next = dcg->cg_free_list;
+			dcg->cg_free_list = new_block;
+			dcg->cg_free_count++;
+#ifdef DEBUG_CHECK_IND_CACHE
+			dcg->cg_read_count--;
+#endif
+			new_block->cb_state = IDX_CAC_BLOCK_FREE;
+			IDX_TRACE("%d- F%x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(new_block->cb_data));
+			xt_unlock_mutex_ns(&dcg->cg_lock);
+#ifdef DEBUG_CHECK_IND_CACHE
+			xt_ind_check_cache(NULL);
+#endif
+			return NULL;
+		}
+		IDX_TRACE("%d- R%x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(new_block->cb_data));
+		ot->ot_thread->st_statistics.st_ind_cache_miss++;
+	}
+	else
+		red_size = 0;
+	// PMC - I don't think this is required! memset(new_block->cb_data + red_size, 0, XT_INDEX_PAGE_SIZE - red_size);
+	/* Re-check under the write lock: the segment was unlocked while the block was prepared. */
+	IDX_CAC_WRITE_LOCK(seg, ot->ot_thread);
+	block = seg->cs_hash_table[hash_idx];
+	while (block) {
+		if (XT_NODE_ID(block->cb_address) == XT_NODE_ID(address) && block->cb_file_id == file->fr_id) {
+			/* Oops, someone else was faster! */
+			xt_lock_mutex_ns(&dcg->cg_lock);
+			new_block->cb_next = dcg->cg_free_list;
+			dcg->cg_free_list = new_block;
+			dcg->cg_free_count++;
+#ifdef DEBUG_CHECK_IND_CACHE
+			dcg->cg_read_count--;
+#endif
+			new_block->cb_state = IDX_CAC_BLOCK_FREE;
+			IDX_TRACE("%d- F%x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(new_block->cb_data));
+			xt_unlock_mutex_ns(&dcg->cg_lock);
+			goto done_ok;	/* 'block' is the copy already in the hash table. */
+		}
+		block = block->cb_next;
+	}
+	block = new_block;
+
+	/* Make the block the most recently used: */
+	xt_lock_mutex_ns(&dcg->cg_lock);
+	block->cb_ru_time = ++dcg->cg_ru_now;
+	if ((block->cb_lr_used = dcg->cg_mru_block))
+		dcg->cg_mru_block->cb_mr_used = block;
+	block->cb_mr_used = NULL;
+	dcg->cg_mru_block = block;
+	if (!dcg->cg_lru_block)
+		dcg->cg_lru_block = block;
+#ifdef DEBUG_CHECK_IND_CACHE
+	dcg->cg_read_count--;
+#endif
+	xt_unlock_mutex_ns(&dcg->cg_lock);
+
+	/* {LAZY-DEL-INDEX-ITEMS}
+	 * Conditionally count the number of deleted entries in the index:
+	 * We do this before other threads can read the block.
+	 */
+	if (ind && ind->mi_lazy_delete && read_data)
+		xt_ind_count_deleted_items(ot->ot_table, ind, block);
+
+	/* Add to the hash table: */
+	block->cb_next = seg->cs_hash_table[hash_idx];
+	seg->cs_hash_table[hash_idx] = block;
+
+	done_ok:
+	*ret_seg = seg;	/* Returned with the segment write lock still held. */
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	return block;
+}
+
+static xtBool ind_cac_get(XTOpenTablePtr ot, xtIndexNodeID address, DcSegmentPtr *ret_seg, XTIndBlockPtr *ret_block)
+{
+	/* Look up the cache block for 'address' without loading it. On a hit the segment read
+	 * lock stays held for the caller; on a miss it is dropped and both outputs are NULL.
+	 */
+	register DcGlobalsRec	*globals = &ind_cac_globals;
+	register XTOpenFilePtr	ind_file = ot->ot_ind_file;
+	register DcSegmentPtr	segment;
+	register XTIndBlockPtr	cur;
+	register u_int			bucket;
+
+	/* The masked bits pick the segment, the shifted value picks the hash bucket: */
+	bucket = XT_NODE_ID(address) + (ind_file->fr_id * 223);
+	segment = &globals->cg_segment[bucket & IDX_CAC_SEGMENT_MASK];
+	bucket = (bucket >> XT_INDEX_CACHE_SEGMENT_SHIFTS) % globals->cg_hash_size;
+
+	IDX_CAC_READ_LOCK(segment, ot->ot_thread);
+	for (cur = segment->cs_hash_table[bucket]; cur; cur = cur->cb_next) {
+		if (XT_NODE_ID(cur->cb_address) != XT_NODE_ID(address) || cur->cb_file_id != ind_file->fr_id)
+			continue;
+		ASSERT_NS(cur->cb_state != IDX_CAC_BLOCK_FREE);
+		*ret_seg = segment;
+		*ret_block = cur;
+		return OK;
+	}
+	IDX_CAC_UNLOCK(segment, ot->ot_thread);
+
+	*ret_seg = NULL;
+	*ret_block = NULL;
+	return OK;
+}
+
+xtPublic xtBool xt_ind_write(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, size_t size, xtWord1 *data)
+{
+	XTIndBlockPtr	block;
+	DcSegmentPtr	seg;
+	/* Copy 'size' bytes of 'data' into the cache block for 'address' (not read from disk first) and mark it dirty. */
+	if (!(block = ind_cac_fetch(ot, ind, address, &seg, FALSE)))
+		return FAILED;
+
+	XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
+	ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
+	memcpy(block->cb_data, data, size);	/* NOTE(review): 'size' is not checked against XT_INDEX_PAGE_SIZE here. */
+	block->cp_flush_seq = ot->ot_table->tab_ind_flush_seq;
+	if (block->cb_state != IDX_CAC_BLOCK_DIRTY) {
+		TRACK_BLOCK_WRITE(offset);	/* NOTE(review): no 'offset' is declared here; presumably the macro expands to nothing - confirm. */
+		xt_spinlock_lock(&ind->mi_dirty_lock);	/* Push the block onto the head of the index's dirty list. */
+		if ((block->cb_dirty_next = ind->mi_dirty_list))
+			ind->mi_dirty_list->cb_dirty_prev = block;
+		block->cb_dirty_prev = NULL;
+		ind->mi_dirty_list = block;
+		ind->mi_dirty_blocks++;
+		xt_spinlock_unlock(&ind->mi_dirty_lock);
+		block->cb_state = IDX_CAC_BLOCK_DIRTY;
+	}
+	XT_IPAGE_UNLOCK(&block->cb_lock, TRUE);
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);	/* Release the segment lock that ind_cac_fetch() left held. */
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed++;
+#endif
+#ifdef CHECK_BLOCK_TRAILERS
+	check_block_trailers();
+#endif
+	return OK;
+}
+
+/*
+ * Overwrite the cached copy of a page, but only if the page is currently in RAM.
+ * The block's clean/dirty state is left untouched; a cache miss is not an error.
+ */
+xtPublic xtBool xt_ind_write_cache(XTOpenTablePtr ot, xtIndexNodeID address, size_t size, xtWord1 *data)
+{
+	DcSegmentPtr	segment;
+	XTIndBlockPtr	cached;
+
+	if (!ind_cac_get(ot, address, &segment, &cached))
+		return FAILED;
+	if (!cached)
+		return OK;
+
+	XT_IPAGE_WRITE_LOCK(&cached->cb_lock, ot->ot_thread->t_id);
+	ASSERT_NS(cached->cb_state == IDX_CAC_BLOCK_CLEAN || cached->cb_state == IDX_CAC_BLOCK_DIRTY);
+	memcpy(cached->cb_data, data, size);
+	XT_IPAGE_UNLOCK(&cached->cb_lock, TRUE);
+	IDX_CAC_UNLOCK(segment, ot->ot_thread);
+	return OK;
+}
+
+xtPublic xtBool xt_ind_clean(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address)
+{
+	/* Mark the cached page at 'address' clean and take it off the dirty list of 'ind'.
+	 * A page that is not in the cache is ignored. */
+	DcSegmentPtr	segment;
+	XTIndBlockPtr	cached;
+
+	if (!ind_cac_get(ot, address, &segment, &cached))
+		return FAILED;
+	if (!cached)
+		return OK;
+
+	XT_IPAGE_WRITE_LOCK(&cached->cb_lock, ot->ot_thread->t_id);
+	ASSERT_NS(cached->cb_state == IDX_CAC_BLOCK_CLEAN || cached->cb_state == IDX_CAC_BLOCK_DIRTY);
+	if (cached->cb_state == IDX_CAC_BLOCK_DIRTY) {
+		/* Unlink from the doubly linked dirty list of the index: */
+		xt_spinlock_lock(&ind->mi_dirty_lock);
+		if (cached->cb_dirty_next)
+			cached->cb_dirty_next->cb_dirty_prev = cached->cb_dirty_prev;
+		if (cached->cb_dirty_prev)
+			cached->cb_dirty_prev->cb_dirty_next = cached->cb_dirty_next;
+		if (ind->mi_dirty_list == cached)
+			ind->mi_dirty_list = cached->cb_dirty_next;
+		ind->mi_dirty_blocks--;
+		xt_spinlock_unlock(&ind->mi_dirty_lock);
+		cached->cb_state = IDX_CAC_BLOCK_CLEAN;
+	}
+	XT_IPAGE_UNLOCK(&cached->cb_lock, TRUE);
+	IDX_CAC_UNLOCK(segment, ot->ot_thread);
+	return OK;
+}
+
+xtPublic xtBool xt_ind_read_bytes(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, size_t size, xtWord1 *data)
+{
+	/* Copy the first 'size' bytes of the index page at 'address' into 'data'. */
+	DcSegmentPtr	segment;
+	XTIndBlockPtr	page = ind_cac_fetch(ot, ind, address, &segment, TRUE);
+
+	if (!page)
+		return FAILED;
+	XT_IPAGE_READ_LOCK(&page->cb_lock);
+	memcpy(data, page->cb_data, size);
+	XT_IPAGE_UNLOCK(&page->cb_lock, FALSE);
+	IDX_CAC_UNLOCK(segment, ot->ot_thread);
+	return OK;
+}
+
+xtPublic xtBool xt_ind_fetch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID address, XTPageLockType ltype, XTIndReferencePtr iref)
+{
+	/* Fetch an index page and lock it as 'ltype' requires. On success 'iref' describes the
+	 * block, its data and the lock taken; FAILED means the page could not be loaded or is corrupt.
+	 */
+	register XTIndBlockPtr	block;
+	DcSegmentPtr			seg;
+	xtWord2					branch_size;
+	u_int					rec_size;
+	xtBool					xlock = FALSE;
+
+#ifdef DEBUG
+	ASSERT_NS(iref->ir_xlock == 2);		/* In DEBUG builds xt_ind_release() resets both fields to 2. */
+	ASSERT_NS(iref->ir_updated == 2);
+#endif
+	if (!(block = ind_cac_fetch(ot, ind, address, &seg, TRUE)))
+		return FAILED;
+
+	branch_size = XT_GET_DISK_2(((XTIdxBranchDPtr) block->cb_data)->tb_size_2);
+	rec_size = XT_GET_INDEX_BLOCK_LEN(branch_size);
+	if (rec_size < 2 || rec_size > XT_INDEX_PAGE_SIZE)
+		goto failed_corrupt;
+	if (ind->mi_fix_key) {
+		rec_size -= 2;	/* Drop the 2 bytes checked above; the rest must be whole fixed-size entries. */
+		if (XT_IS_NODE(branch_size)) {
+			if (rec_size != 0) {
+				if (rec_size < XT_NODE_REF_SIZE)
+					goto failed_corrupt;
+				rec_size -= XT_NODE_REF_SIZE;
+				if ((rec_size % (ind->mi_key_size + XT_RECORD_REF_SIZE + XT_NODE_REF_SIZE)) != 0)
+					goto failed_corrupt;
+			}
+		}
+		else {
+			if ((rec_size % (ind->mi_key_size + XT_RECORD_REF_SIZE)) != 0)
+				goto failed_corrupt;
+		}
+	}
+
+	switch (ltype) {
+		case XT_LOCK_READ:
+			break;
+		case XT_LOCK_WRITE:
+			xlock = TRUE;
+			break;
+		case XT_XLOCK_LEAF:
+			if (!XT_IS_NODE(branch_size))
+				xlock = TRUE;
+			break;
+		case XT_XLOCK_DEL_LEAF:
+			if (!XT_IS_NODE(branch_size)) {
+				if (ot->ot_table->tab_dic.dic_no_lazy_delete)
+					xlock = TRUE;
+				else {
+					/*
+					 * {LAZY-DEL-INDEX-ITEMS}
+					 *
+					 * We are fetching a page in order to delete from it. We decide
+					 * here whether we plan to do a lazy delete, or to compact the
+					 * node. A lazy delete just requires a shared lock.
+					 */
+					if (ind->mi_lazy_delete) {
+						/* If the number of deleted items is greater than
+						 * half of the number of items that can fit in the
+						 * page, then we will compact the node.
+						 */
+						if (!xt_idx_lazy_delete_on_leaf(ind, block, XT_GET_INDEX_BLOCK_LEN(branch_size)))
+							xlock = TRUE;
+					}
+					else
+						xlock = TRUE;
+				}
+			}
+			break;
+	}
+
+	if ((iref->ir_xlock = xlock))
+		XT_IPAGE_WRITE_LOCK(&block->cb_lock, ot->ot_thread->t_id);
+	else
+		XT_IPAGE_READ_LOCK(&block->cb_lock);
+
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);
+
+	/* {DIRECT-IO}
+	 * Direct I/O requires that the buffer is 512 byte aligned.
+	 * To do this, cb_data is turned into a pointer, instead
+	 * of an array.
+	 * As a result, we need to pass a pointer to both the
+	 * cache block and the cache block data:
+	 */
+	iref->ir_updated = FALSE;
+	iref->ir_block = block;
+	iref->ir_branch = (XTIdxBranchDPtr) block->cb_data;
+	return OK;
+
+	failed_corrupt:
+	IDX_CAC_UNLOCK(seg, ot->ot_thread);
+	xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+	return FAILED;
+}
+
+xtPublic xtBool xt_ind_release(XTOpenTablePtr ot, XTIndexPtr ind, XTPageUnlockType XT_NDEBUG_UNUSED(utype), XTIndReferencePtr iref)
+{
+	register XTIndBlockPtr	block;
+	/* Release the page lock taken by xt_ind_fetch(); a page flagged ir_updated is first put on the dirty list of 'ind'. */
+	block = iref->ir_block;
+
+#ifdef DEBUG
+	ASSERT_NS(iref->ir_xlock != 2);		/* 2 marks a reference that is not currently held (set below). */
+	ASSERT_NS(iref->ir_updated != 2);
+	if (iref->ir_updated)				/* 'utype' is only checked for consistency with the reference. */
+		ASSERT_NS(utype == XT_UNLOCK_R_UPDATE || utype == XT_UNLOCK_W_UPDATE);
+	else
+		ASSERT_NS(utype == XT_UNLOCK_READ || utype == XT_UNLOCK_WRITE);
+	if (iref->ir_xlock)
+		ASSERT_NS(utype == XT_UNLOCK_WRITE || utype == XT_UNLOCK_W_UPDATE);
+	else
+		ASSERT_NS(utype == XT_UNLOCK_READ || utype == XT_UNLOCK_R_UPDATE);
+#endif
+	if (iref->ir_updated) {
+		/* The page was updated: */
+		ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_CLEAN || block->cb_state == IDX_CAC_BLOCK_DIRTY);
+		block->cp_flush_seq = ot->ot_table->tab_ind_flush_seq;
+		if (block->cb_state != IDX_CAC_BLOCK_DIRTY) {
+			TRACK_BLOCK_WRITE(offset);	/* NOTE(review): no 'offset' is declared here; presumably the macro expands to nothing - confirm. */
+			xt_spinlock_lock(&ind->mi_dirty_lock);	/* Push the block onto the head of the index's dirty list. */
+			if ((block->cb_dirty_next = ind->mi_dirty_list))
+				ind->mi_dirty_list->cb_dirty_prev = block;
+			block->cb_dirty_prev = NULL;
+			ind->mi_dirty_list = block;
+			ind->mi_dirty_blocks++;
+			xt_spinlock_unlock(&ind->mi_dirty_lock);
+			block->cb_state = IDX_CAC_BLOCK_DIRTY;
+		}
+	}
+
+	XT_IPAGE_UNLOCK(&block->cb_lock, iref->ir_xlock);
+#ifdef DEBUG
+	iref->ir_xlock = 2;		/* Mark the reference as released; xt_ind_fetch() asserts this value. */
+	iref->ir_updated = 2;
+#endif
+	return OK;
+}
+
+xtPublic xtBool xt_ind_reserve(XTOpenTablePtr ot, u_int count, XTIdxBranchDPtr not_this)
+{
+	register XTIndBlockPtr	block;
+	register DcGlobalsRec	*dcg = &ind_cac_globals;
+	/* Move free blocks onto ot->ot_ind_res_bufs until 'count' are reserved; 'not_this' is never evicted to make room. */
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_reserved = count;
+	ot->ot_ind_reads = 0;
+#endif
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	while (ot->ot_ind_res_count < count) {
+		if (!dcg->cg_free_list) {	/* Unlocked peek; the list is re-read under cg_lock below. */
+			if (!ind_cac_free_lru_blocks(ot, count - ot->ot_ind_res_count, not_this)) {
+				if (!dcg->cg_free_list) {
+					xt_ind_free_reserved(ot);	/* Give back what was reserved so far before failing. */
+					xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_INDEX_CACHE);
+#ifdef DEBUG_CHECK_IND_CACHE
+					xt_ind_check_cache(NULL);
+#endif
+					return FAILED;
+				}
+			}
+		}
+
+		/* Get a free block: */
+		xt_lock_mutex_ns(&dcg->cg_lock);
+		while (ot->ot_ind_res_count < count && (block = dcg->cg_free_list)) {
+			ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_FREE);
+			dcg->cg_free_list = block->cb_next;
+			dcg->cg_free_count--;
+			block->cb_next = ot->ot_ind_res_bufs;	/* Reserved blocks are chained through cb_next. */
+			ot->ot_ind_res_bufs = block;
+			ot->ot_ind_res_count++;
+#ifdef DEBUG_CHECK_IND_CACHE
+			dcg->cg_reserved_by_ots++;
+#endif
+		}
+		xt_unlock_mutex_ns(&dcg->cg_lock);
+	}
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	return OK;
+}
+
+xtPublic void xt_ind_free_reserved(XTOpenTablePtr ot)
+{
+	/* Return every block on this open table's reserve list to the global
+	 * free list and reset the reserve. Does nothing if the list is empty. */
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	if (ot->ot_ind_res_bufs) {
+		register DcGlobalsRec	*globals = &ind_cac_globals;
+		register XTIndBlockPtr	cur, next;
+
+		/* Push the reserved blocks, one by one, onto the global free list: */
+		xt_lock_mutex_ns(&globals->cg_lock);
+		for (cur = ot->ot_ind_res_bufs; cur; cur = next) {
+			next = cur->cb_next;
+			cur->cb_next = globals->cg_free_list;
+			globals->cg_free_list = cur;
+#ifdef DEBUG_CHECK_IND_CACHE
+			globals->cg_reserved_by_ots--;
+#endif
+			globals->cg_free_count++;
+		}
+		xt_unlock_mutex_ns(&globals->cg_lock);
+		ot->ot_ind_res_bufs = NULL;
+		ot->ot_ind_res_count = 0;
+	}
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+}
+
+xtPublic void xt_ind_unreserve(XTOpenTablePtr ot)
+{
+	if (ind_cac_globals.cg_free_list != NULL)
+		return;	/* Free blocks remain: keep this table's reserve. */
+	xt_ind_free_reserved(ot);
+}
diff --git a/storage/pbxt/src/cache_xt.h b/storage/pbxt/src/cache_xt.h
new file mode 100644
index 00000000000..63a5164e466
--- /dev/null
+++ b/storage/pbxt/src/cache_xt.h
@@ -0,0 +1,200 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-05-24	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_cache_h__
+#define __xt_cache_h__
+
+//#define XT_USE_MYSYS
+
+#include "filesys_xt.h"
+#include "index_xt.h"
+
+struct XTOpenTable;
+struct XTIdxReadBuffer;
+
+#ifdef DEBUG
+//#define XT_USE_CACHE_DEBUG_SIZES
+//#define CHECK_BLOCK_TRAILERS
+#endif
+
+#ifdef XT_USE_CACHE_DEBUG_SIZES
+#define XT_INDEX_CACHE_SEGMENT_SHIFTS	1
+#else
+#define XT_INDEX_CACHE_SEGMENT_SHIFTS	3
+#endif
+
+#define IDX_CAC_BLOCK_FREE				0
+#define IDX_CAC_BLOCK_CLEAN				1
+#define IDX_CAC_BLOCK_DIRTY				2
+
+#ifdef XT_NO_ATOMICS
+#define XT_IPAGE_USE_PTHREAD_RW
+#else
+//#define XT_IPAGE_USE_ATOMIC_RW
+#define XT_IPAGE_USE_SPINXSLOCK
+//#define XT_IPAGE_USE_SKEW_RW
+#endif
+/* Each branch below maps the XT_IPAGE_* page lock operations onto one lock implementation. */
+#ifdef XT_IPAGE_USE_ATOMIC_RW
+#define XT_IPAGE_LOCK_TYPE				XTAtomicRWLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_atomicrwlock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_atomicrwlock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_atomicrwlock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_atomicrwlock_xlock(i, FALSE, o)
+#define XT_IPAGE_WRITE_TRY_LOCK(i, o)	xt_atomicrwlock_xlock(i, TRUE, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_atomicrwlock_unlock(i, x)
+#elif defined(XT_IPAGE_USE_PTHREAD_RW)	/* This variant ignores the owner ('s') and 'x' arguments. */
+#define XT_IPAGE_LOCK_TYPE				xt_rwlock_type
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_init_rwlock_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_free_rwlock(i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_slock_rwlock_ns(i)
+#define XT_IPAGE_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
+#define XT_IPAGE_WRITE_TRY_LOCK(i, s)	xt_xlock_try_rwlock_ns(i)
+#define XT_IPAGE_UNLOCK(i, x)			xt_unlock_rwlock_ns(i)
+#elif defined(XT_IPAGE_USE_SPINXSLOCK)	/* Selected above whenever XT_NO_ATOMICS is not defined. */
+#define XT_IPAGE_LOCK_TYPE				XTSpinXSLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_spinxslock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, FALSE, o)
+#define XT_IPAGE_WRITE_TRY_LOCK(i, o)	xt_spinxslock_xlock(i, TRUE, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_spinxslock_unlock(i, x)
+#else // XT_IPAGE_USE_SKEW_RW
+#define XT_IPAGE_LOCK_TYPE				XTSkewRWLockRec
+#define XT_IPAGE_INIT_LOCK(s, i)		xt_skewrwlock_init_with_autoname(s, i)
+#define XT_IPAGE_FREE_LOCK(s, i)		xt_skewrwlock_free(s, i)	
+#define XT_IPAGE_READ_LOCK(i)			xt_skewrwlock_slock(i)
+#define XT_IPAGE_WRITE_LOCK(i, o)		xt_skewrwlock_xlock(i, FALSE, o)
+#define XT_IPAGE_WRITE_TRY_LOCK(i, o)	xt_skewrwlock_xlock(i, TRUE, o)
+#define XT_IPAGE_UNLOCK(i, x)			xt_skewrwlock_unlock(i, x)
+#endif
+
+enum XTPageLockType { XT_LOCK_READ, XT_LOCK_WRITE, XT_XLOCK_LEAF, XT_XLOCK_DEL_LEAF };	/* Lock mode requested from xt_ind_fetch(). */
+enum XTPageUnlockType { XT_UNLOCK_NONE, XT_UNLOCK_READ, XT_UNLOCK_WRITE, XT_UNLOCK_R_UPDATE, XT_UNLOCK_W_UPDATE };	/* Passed to xt_ind_release(), which only asserts it in DEBUG builds. */
+
+/* One index cache block. A block is X locked if it is being changed or freed.
+ * A block is S locked if it is being read.
+ */
+typedef struct XTIndBlock {
+	xtIndexNodeID		cb_address;						/* The block address. */
+	u_int				cb_file_id;						/* The file id of the block. */
+	/* This is protected by cs_lock */
+	struct XTIndBlock	*cb_next;						/* Pointer to next block on hash list, or next free block on free list (also chains a table's reserved blocks). */
+	/* This is protected by mi_dirty_lock */
+	struct XTIndBlock	*cb_dirty_next;					/* Double link for dirty blocks, next pointer. */
+	struct XTIndBlock	*cb_dirty_prev;					/* Double link for dirty blocks, previous pointer. */
+	/* This is protected by cg_lock */
+	xtWord4				cb_ru_time;						/* Value of cg_ru_now when last moved to the MRU end. NOTE(review): ind_cac_fetch() only repositions a block older than cg_block_count / 2. */
+	struct XTIndBlock	*cb_mr_used;					/* More recently used blocks. */
+	struct XTIndBlock	*cb_lr_used;					/* Less recently used blocks. */
+	/* Protected by cb_lock: */
+	XT_IPAGE_LOCK_TYPE	cb_lock;
+	xtWord4				cp_flush_seq;					/* Set from tab_ind_flush_seq whenever the block is written. */
+	xtWord2				cb_handle_count;				/* Non-zero if this page is referenced by a handle. */
+	xtWord2				cp_del_count;					/* Number of deleted entries. */
+	xtWord1				cb_state;						/* Block status (one of IDX_CAC_BLOCK_FREE, _CLEAN or _DIRTY). */
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+	xtWord1				*cb_data;
+#else
+	xtWord1				cb_data[XT_INDEX_PAGE_SIZE];
+#endif
+#ifdef CHECK_BLOCK_TRAILERS
+	xtWord1				cp_check[4];
+#endif
+} XTIndBlockRec, *XTIndBlockPtr;
+
+typedef struct XTIndReference {							/* Filled in by xt_ind_fetch() and handed back to xt_ind_release(). */
+	xtBool					ir_xlock;					/* Set to TRUE if the cache block is X locked (DEBUG builds store 2 once released). */
+	xtBool					ir_updated;					/* Set to TRUE if the cache block is updated (DEBUG builds store 2 once released). */
+	XTIndBlockPtr			ir_block;					/* The locked cache block. */
+	XTIdxBranchDPtr			ir_branch;					/* The block's cb_data, viewed as a branch. */
+} XTIndReferenceRec, *XTIndReferencePtr;
+
+typedef struct XTIndFreeBlock {							/* NOTE(review): presumably the on-disk layout of a free index block - confirm. */
+	XTDiskValue1			if_zero1_1;					/* Must be set to zero. */
+	XTDiskValue1			if_zero2_1;					/* Must be set to zero. */
+	XTDiskValue1			if_status_1;
+	XTDiskValue1			if_unused1_1;
+	XTDiskValue4			if_unused2_4;
+	XTDiskValue8			if_next_block_8;			/* NOTE(review): presumably the address of the next free block - confirm. */
+} XTIndFreeBlockRec, *XTIndFreeBlockPtr;
+
+typedef struct XTIndHandleBlock {						/* NOTE(review): presumably a private copy of a branch held for handles - confirm. */
+	xtWord4					hb_ref_count;				/* NOTE(review): presumably the number of handles sharing this copy - confirm. */
+	struct XTIndHandleBlock	*hb_next;
+	XTIdxBranchDRec			hb_branch;					/* The branch data itself, stored by value. */
+} XTIndHandleBlockRec, *XTIndHandleBlockPtr;
+
+typedef struct XTIndHandle {
+	struct XTIndHandle		*ih_next;				/* Handles are kept on a doubly linked list. */
+	struct XTIndHandle		*ih_prev;
+	XTSpinLockRec			ih_lock;
+	xtIndexNodeID			ih_address;
+	xtBool					ih_cache_reference;		/* True if this handle references the cache. */
+	union {											/* NOTE(review): presumably ih_cache_reference selects the valid member - confirm. */
+		XTIndBlockPtr		ih_cache_block;
+		XTIndHandleBlockPtr	ih_handle_block;
+	} x;
+	XTIdxBranchDPtr			ih_branch;
+} XTIndHandleRec, *XTIndHandlePtr;
+
+void			xt_ind_init(XTThreadPtr self, size_t cache_size);
+void			xt_ind_exit(XTThreadPtr self);
+
+xtInt8			xt_ind_get_usage();
+xtInt8			xt_ind_get_size();
+xtBool			xt_ind_write(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset, size_t size, xtWord1 *data);
+xtBool			xt_ind_write_cache(struct XTOpenTable *ot, xtIndexNodeID offset, size_t size, xtWord1 *data);
+xtBool			xt_ind_clean(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset);
+xtBool			xt_ind_read_bytes(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID offset, size_t size, xtWord1 *data);
+void			xt_ind_check_cache(XTIndexPtr ind);
+xtBool			xt_ind_reserve(struct XTOpenTable *ot, u_int count, XTIdxBranchDPtr not_this);
+void			xt_ind_free_reserved(struct XTOpenTable *ot);
+void			xt_ind_unreserve(struct XTOpenTable *ot);
+
+xtBool			xt_ind_fetch(struct XTOpenTable *ot, XTIndexPtr ind, xtIndexNodeID node, XTPageLockType ltype, XTIndReferencePtr iref);
+xtBool			xt_ind_release(struct XTOpenTable *ot, XTIndexPtr ind, XTPageUnlockType utype, XTIndReferencePtr iref);
+
+void			xt_ind_lock_handle(XTIndHandlePtr handle);
+void			xt_ind_unlock_handle(XTIndHandlePtr handle);
+xtBool			xt_ind_copy_on_write(XTIndReferencePtr iref);
+
+XTIndHandlePtr	xt_ind_get_handle(struct XTOpenTable *ot, XTIndexPtr ind, XTIndReferencePtr iref);
+void			xt_ind_release_handle(XTIndHandlePtr handle, xtBool have_lock, XTThreadPtr thread);
+
+#ifdef CHECK_BLOCK_TRAILERS
+extern void check_block_trailers();
+#endif
+
+#ifdef DEBUG
+//#define DEBUG_CHECK_IND_CACHE
+#endif
+
+//#define XT_TRACE_INDEX
+
+#ifdef XT_TRACE_INDEX
+#define IDX_TRACE(x, y, z)		xt_trace(x, y, z)
+#else
+#define IDX_TRACE(x, y, z)
+#endif
+
+#endif
diff --git a/storage/pbxt/src/ccutils_xt.cc b/storage/pbxt/src/ccutils_xt.cc
new file mode 100644
index 00000000000..2f31061ac21
--- /dev/null
+++ b/storage/pbxt/src/ccutils_xt.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-05-16	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * C++ Utilities
+ */
+
+#include "xt_config.h"
+
+#include "pthread_xt.h"
+#include "ccutils_xt.h"
+#include "bsearch_xt.h"
+
+/*
+ * Comparison callback handed to xt_bsearch() by XTListImp::append().
+ *
+ * 'a' is the search key. 'b' is the address of the array slot being
+ * probed: xt_bsearch() is given the array base and the slot size
+ * (sizeof(void *)), so it can only pass the callback the address of a
+ * slot, not the slot's content. The slots hold XTObject pointers, so
+ * 'b' has to be dereferenced once to reach the object.
+ *
+ * Returns whatever the object's compare() returns for the key.
+ */
+static int ccu_compare_object(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	XTObject *obj_ptr = *((XTObject * const *) b);
+
+	return obj_ptr->compare(a);
+}
+
+/*
+ * Sorted insert: places 'info' at the position that 'key' selects among
+ * the existing items.
+ *
+ * An empty list takes the item at index 0. With exactly one item, a
+ * positive result from that item's compare(key) puts the new item after
+ * it, any other result puts it in front. Longer lists are searched with
+ * xt_bsearch(), which reports the insert position.
+ *
+ * If the array cannot be grown, 'info' is released (when the list is
+ * referenced) and XT_ENOMEM is thrown.
+ */
+void XTListImp::append(XTThreadPtr self, XTObject *info, void *key) {
+	size_t pos = 0;
+
+	switch (li_item_count) {
+		case 0:
+			break;
+		case 1:
+			if (li_items[0]->compare(key) > 0)
+				pos = 1;
+			break;
+		default:
+			xt_bsearch(self, key, li_items, li_item_count, sizeof(void *), &pos, NULL, ccu_compare_object);
+			break;
+	}
+
+	if (!xt_realloc(NULL, (void **) &li_items, (li_item_count + 1) * sizeof(void *))) {
+		if (li_referenced)
+			info->release(self);
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return;
+	}
+
+	/* Shift the tail up by one slot and drop the new item into the gap. */
+	XTObject **slot = li_items + pos;
+
+	memmove(slot + 1, slot, (li_item_count - pos) * sizeof(void *));
+	*slot = info;
+	li_item_count++;
+}
+
+
diff --git a/storage/pbxt/src/ccutils_xt.h b/storage/pbxt/src/ccutils_xt.h
new file mode 100644
index 00000000000..a800073869d
--- /dev/null
+++ b/storage/pbxt/src/ccutils_xt.h
@@ -0,0 +1,220 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-05-16	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * C++ Utilities
+ */
+
+#ifndef __ccutils_xt_h__
+#define __ccutils_xt_h__
+
+#include <errno.h>
+
+#include "xt_defs.h"
+#include "thread_xt.h"
+
+/*
+ * Reference-counted base class for the objects stored in XTListImp /
+ * XTList containers.
+ *
+ * An object is created with one reference. release() drops a reference
+ * and, when the count reaches zero, calls finalize() and deletes the
+ * object. Sub-classes override factory(), init(), finalize() and
+ * compare() as required.
+ */
+class XTObject
+{
+	private:
+	u_int			o_refcnt;
+
+	public:
+	/* A new object starts with a single reference. */
+	inline XTObject() : o_refcnt(1) { }
+
+	virtual ~XTObject() { }
+
+	/* Add a reference. */
+	inline void reference() {
+		++o_refcnt;
+	}
+
+	/* Drop a reference; the last one finalizes and deletes the object. */
+	inline void release(XTThreadPtr self) {
+		ASSERT(o_refcnt > 0);
+		if (--o_refcnt != 0)
+			return;
+		finalize(self);
+		delete this;
+	}
+
+	/* Create a new, uninitialized object; throws XT_ENOMEM if 'new' returns NULL. */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *created = new XTObject();
+
+		if (!created)
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return created;
+	}
+
+	/* Build a copy: factory() makes the object, init(self, this) fills it in. */
+	virtual XTObject *clone(XTThreadPtr self) {
+		XTObject *copy = factory(self);
+
+		copy->init(self, this);
+		return copy;
+	}
+
+	/* Default initialization: nothing to do. */
+	virtual void init(XTThreadPtr self) {
+		(void) self;
+	}
+
+	/* Initialization from another object: by default ignores 'obj'. */
+	virtual void init(XTThreadPtr self, XTObject *obj) {
+		(void) obj;
+		init(self);
+	}
+
+	/* Called by release() just before the object is deleted. */
+	virtual void finalize(XTThreadPtr self) {
+		(void) self;
+	}
+
+	/* Compare 'key' with this object; the base class always returns -1. */
+	virtual int compare(const void *key) {
+		(void) key;
+		return -1;
+	}
+};
+
+/*
+ * Untyped growable array of XTObject pointers; XTList<T> is the typed
+ * front end.
+ *
+ * While li_referenced is true (the default) the list owns one reference
+ * on each item: remove(self, i) releases the item, and an item that
+ * cannot be added because the array could not be grown is released
+ * before XT_ENOMEM is thrown. setNonReferenced() switches this off.
+ */
+class XTListImp
+{
+	protected:
+	bool		li_referenced;
+	u_int		li_item_count;
+	XTObject	**li_items;
+
+	public:
+	inline XTListImp() : li_referenced(true), li_item_count(0), li_items(NULL) { }
+
+	/* The list no longer releases items on removal or on a failed add. */
+	inline void setNonReferenced() { li_referenced = false; }
+
+	/* Add 'info' at the end of the list. */
+	void append(XTThreadPtr self, XTObject *info) {
+		if (!xt_realloc(NULL, (void **) &li_items, (li_item_count + 1) * sizeof(void *))) {
+			if (li_referenced)
+				info->release(self);
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			return;
+		}
+		li_items[li_item_count] = info;
+		li_item_count++;
+	}
+
+	/*
+	 * Insert 'info' in front of the item currently at index 'i'.
+	 * An index beyond the end of the list appends the item.
+	 */
+	void insert(XTThreadPtr self, XTObject *info, u_int i) {
+		/* Without this clamp (li_item_count - i) below would wrap around
+		 * (it is unsigned) and memmove() would be asked to move a huge
+		 * block past the end of the array.
+		 */
+		if (i > li_item_count)
+			i = li_item_count;
+		if (!xt_realloc(NULL, (void **) &li_items, (li_item_count + 1) * sizeof(void *))) {
+			if (li_referenced)
+				info->release(self);
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			return;
+		}
+		memmove(&li_items[i+1], &li_items[i], (li_item_count-i) * sizeof(XTObject *));
+		li_items[i] = info;
+		li_item_count++;
+	}
+
+	/* Insert 'info' at index 0. */
+	void addToFront(XTThreadPtr self, XTObject *info) {
+		insert(self, info, 0);
+	}
+
+	/* Will sort! The insert position is chosen by comparing 'key' with the items. */
+	void append(XTThreadPtr self, XTObject *info, void *key);
+
+	/*
+	 * Remove the first slot that holds 'info'. The item is NOT released.
+	 * Returns false if 'info' is not in the list.
+	 */
+	inline bool remove(XTObject *info) {
+		for (u_int i=0; i<li_item_count; i++) {
+			if (li_items[i] == info) {
+				li_item_count--;
+				memmove(&li_items[i], &li_items[i+1], (li_item_count - i) * sizeof(XTObject *));
+				return true;
+			}
+		}
+		return false;
+	}
+
+	/*
+	 * Remove the item at index 'i' and release it if the list is
+	 * referenced. Returns false if 'i' is out of range.
+	 */
+	inline bool remove(XTThreadPtr self, u_int i) {
+		XTObject *item;
+
+		if (i >= li_item_count)
+			return false;
+		item = li_items[i];
+		li_item_count--;
+		memmove(&li_items[i], &li_items[i+1], (li_item_count - i) * sizeof(void *));
+		if (li_referenced)
+			item->release(self);
+		return true;
+	}
+
+	/*
+	 * Remove the item at index 'i' and return it without releasing it.
+	 * Returns NULL if 'i' is out of range.
+	 */
+	inline XTObject *take(u_int i) {
+		XTObject *item;
+
+		if (i >= li_item_count)
+			return NULL;
+		item = li_items[i];
+		li_item_count--;
+		memmove(&li_items[i], &li_items[i+1], (li_item_count - i) * sizeof(void *));
+		return item;
+	}
+
+	inline u_int size() const { return li_item_count; }
+
+	/* Free the array itself. The items are not released here. */
+	inline void setEmpty(XTThreadPtr self) {
+		if (li_items)
+			xt_free(self, li_items);
+		li_item_count = 0;
+		li_items = NULL;
+	}
+
+	inline bool isEmpty() { return li_item_count == 0; }
+
+	/* Item at index 'i', or NULL if 'i' is out of range. */
+	inline XTObject *itemAt(u_int i) const {
+		if (i >= li_item_count)
+			return NULL;
+		return li_items[i];
+	}
+};
+
+
+/*
+ * Typed wrapper around XTListImp: the same operations, taking and
+ * returning T pointers instead of XTObject pointers.
+ */
+template <class T> class XTList : public XTListImp
+{
+	public:
+	inline XTList() : XTListImp() { }
+
+	inline void append(XTThreadPtr self, T *a) {
+		XTListImp::append(self, a);
+	}
+
+	inline void insert(XTThreadPtr self, T *a, u_int i) {
+		XTListImp::insert(self, a, i);
+	}
+
+	inline void addToFront(XTThreadPtr self, T *a) {
+		XTListImp::addToFront(self, a);
+	}
+
+	inline bool remove(T *a) {
+		return XTListImp::remove(a);
+	}
+
+	inline bool remove(XTThreadPtr self, u_int i) {
+		return XTListImp::remove(self, i);
+	}
+
+	inline T *take(u_int i) {
+		return (T *) XTListImp::take(i);
+	}
+
+	inline T *itemAt(u_int i) const {
+		return (T *) XTListImp::itemAt(i);
+	}
+
+	/* Index of the first slot holding 'a'; size() if it is not in the list. */
+	inline u_int indexOf(T *a) {
+		u_int pos = 0;
+
+		while (pos < size() && itemAt(pos) != a)
+			pos++;
+		return pos;
+	}
+
+	/* Release every item (when the list is referenced), then free the array. */
+	void deleteAll(XTThreadPtr self)
+	{
+		u_int n = 0;
+
+		while (n < size()) {
+			if (li_referenced)
+				itemAt(n)->release(self);
+			n++;
+		}
+		setEmpty(self);
+	}
+
+	/* Replace the content of this list with clones of the items in 'list'. */
+	void clone(XTThreadPtr self, XTListImp *list)
+	{
+		u_int n = 0;
+
+		deleteAll(self);
+		while (n < list->size()) {
+			XTObject *copy = list->itemAt(n)->clone(self);
+
+			XTListImp::append(self, copy);
+			n++;
+		}
+	}
+};
+
+#endif
diff --git a/storage/pbxt/src/database_xt.cc b/storage/pbxt/src/database_xt.cc
new file mode 100644
index 00000000000..8d1b4e46da9
--- /dev/null
+++ b/storage/pbxt/src/database_xt.cc
@@ -0,0 +1,1320 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-15	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <string.h>
+#include <stdio.h>
+
+#include "pthread_xt.h"
+#include "hashtab_xt.h"
+#include "filesys_xt.h"
+#include "database_xt.h"
+#include "memory_xt.h"
+#include "heap_xt.h"
+#include "datalog_xt.h"
+#include "strutil_xt.h"
+#include "util_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+//#define XT_TEST_XACT_OVERFLOW
+#endif
+
+#ifndef NAME_MAX
+#define NAME_MAX 128
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * GLOBALS
+ */
+
+xtPublic XTDatabaseHPtr		pbxt_database = NULL;		// The global open database
+
+xtPublic xtLogOffset		xt_db_log_file_threshold;
+xtPublic size_t				xt_db_log_buffer_size;
+xtPublic size_t				xt_db_transaction_buffer_size;
+xtPublic size_t				xt_db_checkpoint_frequency;
+xtPublic off_t				xt_db_data_log_threshold;
+xtPublic size_t				xt_db_data_file_grow_size;
+xtPublic size_t				xt_db_row_file_grow_size;
+xtPublic int				xt_db_garbage_threshold;
+xtPublic int				xt_db_log_file_count;
+xtPublic int				xt_db_auto_increment_mode;		/* 0 = MySQL compatible, 1 = PrimeBase Compatible. */
+xtPublic int				xt_db_offline_log_function;		/* 0 = recycle logs, 1 = delete logs, 2 = keep logs */
+xtPublic int				xt_db_sweeper_priority;			/* 0 = low (default), 1 = normal, 2 = high */
+xtPublic int				xt_db_flush_log_at_trx_commit;	/* 0 = no-write/no-flush, 1 = yes, 2 = write/no-flush */
+
+xtPublic XTSortedListPtr	xt_db_open_db_by_id = NULL;		/* Open databases ordered by db_id (see db_cmp_db_id()). */
+xtPublic XTHashTabPtr		xt_db_open_databases = NULL;	/* Open databases keyed by name; locked while a database is opened or dropped. */
+xtPublic time_t				xt_db_approximate_time = 0;		/* A "fast" alternative timer (not too accurate). */
+
+static xtDatabaseID				db_next_id = 1;				/* ID given to the next database created by xt_get_database(). */
+static volatile XTOpenFilePtr	db_lock_file = NULL;		/* Open "pbxt-lock" file while the installation is locked, else NULL. */
+
+/*
+ * -----------------------------------------------------------------------
+ * LOCK/UNLOCK INSTALLATION
+ */
+
+xtPublic void xt_lock_installation(XTThreadPtr self, char *installation_path)
+{	/* Claims the installation at 'installation_path' for this server.
+	 *
+	 * First the marker files "no-debug" and "crash-debug" in the pbxt
+	 * directory are checked and pbxt_crash_debug is set accordingly
+	 * ("crash-debug" wins if both exist). Then the file "pbxt-lock" is
+	 * opened (created if required) and locked, and the ID of this
+	 * process is written to it. If the file already had content, the
+	 * previous server is reported as not having shut down correctly.
+	 *
+	 * On any failure after the file was opened, the file is closed,
+	 * db_lock_file is reset and the exception is re-thrown.
+	 */
+	char			file_path[PATH_MAX];
+	char			buffer[101];	/* Up to 100 bytes of lock file content plus the terminator. */
+	size_t			red_size;		/* Number of bytes actually read from the lock file. */
+	llong			pid;
+	xtBool			cd = pbxt_crash_debug;	/* Value on entry, used to report a change below. */
+
+	xt_strcpy(PATH_MAX, file_path, installation_path);
+	xt_add_pbxt_file(PATH_MAX, file_path, "no-debug");
+	if (xt_fs_exists(file_path))
+		pbxt_crash_debug = FALSE;
+	xt_strcpy(PATH_MAX, file_path, installation_path);
+	xt_add_pbxt_file(PATH_MAX, file_path, "crash-debug");
+	if (xt_fs_exists(file_path))
+		pbxt_crash_debug = TRUE;
+
+	if (pbxt_crash_debug != cd) {
+		if (pbxt_crash_debug)
+			xt_logf(XT_NT_WARNING, "Crash debugging has been turned on ('crash-debug' file exists)\n");
+		else
+			xt_logf(XT_NT_WARNING, "Crash debugging has been turned off ('no-debug' file exists)\n");
+	}
+	else if (pbxt_crash_debug)
+		xt_logf(XT_NT_WARNING, "Crash debugging is enabled\n");
+
+	/* Moved the lock file out of the pbxt directory so that
+	 * it is possible to drop the pbxt database!
+	 */
+	xt_strcpy(PATH_MAX, file_path, installation_path);
+	xt_add_dir_char(PATH_MAX, file_path);
+	xt_strcat(PATH_MAX, file_path, "pbxt-lock");
+	db_lock_file = xt_open_file(self, file_path, XT_FS_CREATE | XT_FS_MAKE_PATH);
+
+	try_(a) {
+		if (!xt_lock_file(self, db_lock_file)) {
+			xt_logf(XT_NT_ERROR, "A server appears to already be running\n");
+			xt_logf(XT_NT_ERROR, "The file: %s, is locked\n", file_path);
+			xt_throw_xterr(XT_CONTEXT, XT_ERR_SERVER_RUNNING);
+		}
+		if (!xt_pread_file(db_lock_file, 0, 100, 0, buffer, &red_size, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+		if (red_size > 0) {	/* Content left behind by a previous run. */
+			buffer[red_size] = 0;
+#ifdef XT_WIN
+			pid = (llong) _atoi64(buffer);
+#else
+			pid = atoll(buffer);	/* Only referenced by the disabled process check below. */
+#endif
+			/* Problem with this code is, after a restart
+			 * the process ID's are reused.
+			 * If some system process grabs the proc id that
+			 * the server had on the last run, then
+			 * the database will not start.
+			if (xt_process_exists((xtProcID) pid)) {
+				xt_logf(XT_NT_ERROR, "A server appears to already be running, process ID: %lld\n", pid);
+				xt_logf(XT_NT_ERROR, "Remove the file: %s, if this is not the case\n", file_path);
+				xt_throw_xterr(XT_CONTEXT, XT_ERR_SERVER_RUNNING);
+			}
+			*/
+			xt_logf(XT_NT_INFO, "The server was not shutdown correctly, recovery required\n");
+#ifdef XT_BACKUP_BEFORE_RECOVERY
+			if (pbxt_crash_debug) {
+				/* The server was not shut down correctly. Make a backup before
+				 * we start recovery.
+				 */
+				char extension[100];
+
+				for (int i=1;;i++) {	/* Find the first "<installation>-recovery-<i>" that does not exist yet. */
+					xt_strcpy(PATH_MAX, file_path, installation_path);
+					xt_remove_dir_char(file_path);
+					sprintf(extension, "-recovery-%d", i);
+					xt_strcat(PATH_MAX, file_path, extension);
+					if (!xt_fs_exists(file_path))
+						break;
+				}
+				xt_logf(XT_NT_INFO, "In order to reproduce recovery errors a backup of the installation\n");
+				xt_logf(XT_NT_INFO, "will be made to:\n");
+				xt_logf(XT_NT_INFO, "%s\n", file_path);
+				xt_logf(XT_NT_INFO, "Copy in progress...\n");
+				xt_fs_copy_dir(self, installation_path, file_path);
+				xt_logf(XT_NT_INFO, "Copy OK\n");
+			}
+#endif
+		}
+
+		sprintf(buffer, "%lld", (llong) xt_getpid());
+		xt_set_eof_file(self, db_lock_file, 0);	/* Presumably truncates the file before the new ID is written - confirm. */
+		if (!xt_pwrite_file(db_lock_file, 0, strlen(buffer), buffer, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+	}
+	catch_(a) {
+		xt_close_file(self, db_lock_file);	/* Do not keep the file open when locking failed. */
+		db_lock_file = NULL;
+		xt_throw(self);
+	}
+	cont_(a);
+}
+
+/*
+ * Undo xt_lock_installation(): unlock and close the lock file, then
+ * delete "pbxt-lock" from 'installation_path'. Does nothing if no lock
+ * file is open.
+ */
+xtPublic void xt_unlock_installation(XTThreadPtr self, char *installation_path)
+{
+	char lock_path[PATH_MAX];
+
+	if (!db_lock_file)
+		return;
+
+	xt_unlock_file(NULL, db_lock_file);
+	xt_close_file_ns(db_lock_file);
+	db_lock_file = NULL;
+
+	/* Same path as the one built by xt_lock_installation(). */
+	xt_strcpy(PATH_MAX, lock_path, installation_path);
+	xt_add_dir_char(PATH_MAX, lock_path);
+	xt_strcat(PATH_MAX, lock_path, "pbxt-lock");
+	xt_fs_delete(self, lock_path);
+}
+
+/* Deliberately null: xt_crash_me() writes through this pointer. */
+int *xt_bad_pointer = 0;
+
+/*
+ * Write through the null pointer above, but only when crash debugging
+ * (pbxt_crash_debug) is enabled. Otherwise this is a no-op.
+ */
+void xt_crash_me(void)
+{
+	if (!pbxt_crash_debug)
+		return;
+	*xt_bad_pointer = 123;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INIT/EXIT DATABASE
+ */
+
+/*
+ * Hash table comparison (case-sensitive): 'key' is a database name,
+ * 'data' a database handle. Returns TRUE if the names are equal.
+ */
+static xtBool db_hash_comp(void *key, void *data)
+{
+	const char	*wanted = (char *) key;
+	const char	*actual = ((XTDatabaseHPtr) data)->db_name;
+
+	return strcmp(wanted, actual) == 0;
+}
+
+/*
+ * Hash function (case-sensitive). When 'is_key' is set 'key_data' is
+ * the database name itself, otherwise it is a database handle whose
+ * db_name is hashed.
+ */
+static xtHashValue db_hash(xtBool is_key, void *key_data)
+{
+	char *name;
+
+	if (is_key)
+		name = (char *) key_data;
+	else
+		name = ((XTDatabaseHPtr) key_data)->db_name;
+	return xt_ht_hash(name);
+}
+
+/*
+ * Hash table comparison, ignoring case: 'key' is a database name,
+ * 'data' a database handle. Returns TRUE if the names match.
+ */
+static xtBool db_hash_comp_ci(void *key, void *data)
+{
+	const char	*wanted = (char *) key;
+	const char	*actual = ((XTDatabaseHPtr) data)->db_name;
+
+	return strcasecmp(wanted, actual) == 0;
+}
+
+/*
+ * Hash function, ignoring case. When 'is_key' is set 'key_data' is the
+ * database name itself, otherwise it is a database handle whose
+ * db_name is hashed.
+ */
+static xtHashValue db_hash_ci(xtBool is_key, void *key_data)
+{
+	char *name;
+
+	if (is_key)
+		name = (char *) key_data;
+	else
+		name = ((XTDatabaseHPtr) key_data)->db_name;
+	return xt_ht_casehash(name);
+}
+
+/* Hash table free callback: drop the reference held on the database in 'data'. */
+static void db_hash_free(XTThreadPtr self, void *data)
+{
+	XTDatabaseHPtr	held = (XTDatabaseHPtr) data;
+
+	xt_heap_release(self, held);
+}
+
+/*
+ * Ordering callback of the xt_db_open_db_by_id sorted list.
+ * 'a' points to the database ID searched for, 'b' points to a list
+ * element, which is a database handle. Returns -1, 0 or 1 as the ID is
+ * less than, equal to or greater than the ID of that database.
+ */
+static int db_cmp_db_id(struct XTThread *XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtDatabaseID	wanted = *((xtDatabaseID *) a);
+	XTDatabaseHPtr	element = *((XTDatabaseHPtr *) b);
+
+	if (wanted < element->db_id)
+		return -1;
+	return wanted == element->db_id ? 0 : 1;
+}
+
+/*
+ * Create the two directories of open databases: the hash table keyed
+ * by name (case-insensitive when pbxt_ignore_case is set) and the list
+ * sorted by database ID.
+ */
+xtPublic void xt_init_databases(XTThreadPtr self)
+{
+	xt_db_open_databases = pbxt_ignore_case
+		? xt_new_hashtable(self, db_hash_comp_ci, db_hash_ci, db_hash_free, TRUE, TRUE)
+		: xt_new_hashtable(self, db_hash_comp, db_hash, db_hash_free, TRUE, TRUE);
+
+	xt_db_open_db_by_id = xt_new_sortedlist(self, sizeof(XTDatabaseHPtr), 20, 10, db_cmp_db_id, NULL, NULL, FALSE, FALSE);
+}
+
+/*
+ * Stop the background threads of every open database. With 'sync' set,
+ * the sweeper, writer and checkpointer of each database are waited for
+ * before its threads are stopped.
+ */
+xtPublic void xt_stop_database_threads(XTThreadPtr self, xtBool sync)
+{
+	u_int	db_count = 0;
+
+	if (xt_db_open_db_by_id)
+		db_count = xt_sl_get_size(xt_db_open_db_by_id);
+
+	for (u_int n = 0; n < db_count; n++) {
+		XTDatabaseHPtr	*slot = (XTDatabaseHPtr *) xt_sl_item_at(xt_db_open_db_by_id, n);
+		XTDatabaseHPtr	open_db;
+
+		if (!slot)
+			continue;
+		open_db = *slot;
+
+		if (sync) {
+			/* Let the sweeper, the writer and the checkpointer
+			 * catch up, in that order:
+			 */
+			xt_wait_for_sweeper(self, open_db, 16);
+			xt_wait_for_writer(self, open_db);
+			xt_wait_for_checkpointer(self, open_db);
+		}
+
+		xt_stop_flusher(self, open_db);
+		xt_stop_checkpointer(self, open_db);
+		xt_stop_writer(self, open_db);
+		xt_stop_sweeper(self, open_db);
+		xt_stop_compactor(self, open_db);
+	}
+}
+
+/*
+ * Free the directories created by xt_init_databases(). The hash table
+ * goes first; the sorted list is freed after it.
+ */
+xtPublic void xt_exit_databases(XTThreadPtr self)
+{
+	XTHashTabPtr	by_name = xt_db_open_databases;
+
+	if (by_name != NULL) {
+		/* The global stays set until the table has been freed. */
+		xt_free_hashtable(self, by_name);
+		xt_db_open_databases = NULL;
+	}
+
+	XTSortedListPtr	by_id = xt_db_open_db_by_id;
+
+	if (by_id != NULL) {
+		xt_free_sortedlist(self, by_id);
+		xt_db_open_db_by_id = NULL;
+	}
+}
+
+xtPublic void xt_create_database(XTThreadPtr self, char *path)
+{	/* Creating a database consists only of making the directory 'path'. */
+	xt_fs_mkdir(self, path);
+}
+
+static void db_finalize(XTThreadPtr self, void *x)
+{	/* Finalizer registered for the database handle by xt_get_database()
+	 * (passed to xt_heap_new()); presumably run when the last reference
+	 * to the handle is released - confirm in the heap implementation.
+	 *
+	 * Stops the background threads, removes the database from
+	 * xt_db_open_db_by_id, shuts down the sub-systems in the order
+	 * explained below and frees the name and path strings.
+	 */
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) x;
+
+	xt_stop_flusher(self, db);
+	xt_stop_checkpointer(self, db);
+	xt_stop_compactor(self, db);
+	xt_stop_sweeper(self, db);
+	xt_stop_writer(self, db);
+
+	xt_sl_delete(self, xt_db_open_db_by_id, &db->db_id);
+	/* 
+	 * Important is that xt_db_pool_exit() is called
+	 * before xt_xn_exit_db() because xt_xn_exit_db()
+	 * frees the checkpoint information which
+	 * may be required to shutdown the tables, which
+	 * flushes tables, and therefore does a checkpoint.
+	 */
+	/* This was the previous order of shutdown:
+	xt_xn_exit_db(self, db);
+	xt_dl_exit_db(self, db);
+	xt_db_pool_exit(self, db);
+	db->db_indlogs.ilp_exit(self);
+	*/
+
+	xt_db_pool_exit(self, db);
+	db->db_indlogs.ilp_exit(self); 
+	xt_dl_exit_db(self, db);
+	xt_xn_exit_db(self, db);
+	xt_tab_exit_db(self, db);
+	if (db->db_name) {
+		xt_free(self, db->db_name);
+		db->db_name = NULL;
+	}
+	if (db->db_main_path) {
+		xt_free(self, db->db_main_path);
+		db->db_main_path = NULL;
+	}
+}
+
+/*
+ * Release callback set on every database handle by xt_get_database():
+ * signals the table of open databases, if it still exists.
+ */
+static void db_onrelease(XTThreadPtr self, void *XT_UNUSED(x))
+{
+	/* The database may already be closed. */
+	if (!xt_db_open_databases)
+		return;
+
+	/* Signal threads waiting for exclusive use of the database: */
+	xt_ht_signal(self, xt_db_open_databases);
+}
+
+xtPublic void xt_add_pbxt_file(size_t size, char *path, const char *file)
+{	/* Extends 'path' (a buffer of 'size' bytes) in place to
+	 * "<path>/pbxt/<file>", using the directory separator supplied by
+	 * xt_add_dir_char().
+	 */
+	xt_add_dir_char(size, path);
+	xt_strcat(size, path, "pbxt");
+	xt_add_dir_char(size, path);
+	xt_strcat(size, path, file);
+}
+
+/* Extend 'path' (a buffer of 'size' bytes) in place to "<path>/pbxt/location". */
+xtPublic void xt_add_location_file(size_t size, char *path)
+{
+	xt_add_pbxt_file(size, path, "location");
+}
+
+xtPublic void xt_add_pbxt_dir(size_t size, char *path)
+{	/* Extends 'path' (a buffer of 'size' bytes) in place to "<path>/pbxt". */
+	xt_add_dir_char(size, path);
+	xt_strcat(size, path, "pbxt");
+}
+
+/* Extend 'path' (a buffer of 'size' bytes) in place to "<path>/pbxt/system". */
+xtPublic void xt_add_system_dir(size_t size, char *path)
+{
+	xt_add_pbxt_file(size, path, "system");
+}
+
+/* Extend 'path' (a buffer of 'size' bytes) in place to "<path>/pbxt/data". */
+xtPublic void xt_add_data_dir(size_t size, char *path)
+{
+	xt_add_pbxt_file(size, path, "data");
+}
+
+/*
+ * I have a problem here. I cannot rely on the path given to xt_get_database() to be
+ * consistent. When called from ha_create_table() the path is not modified.
+ * However when called from ha_open() the path is first transformed by a call to
+ * fn_format(). I have given an example from a stack trace below.
+ *
+ * In this case the odd path comes from the option:
+ * --tmpdir=/Users/build/Development/mysql/debug-mysql/mysql-test/var//tmp
+ *
+ * #3  0x001a3818 in ha_pbxt::create(char const*, st_table*, st_ha_create_information*) 
+ *     (this=0x2036898, table_path=0xf0060bd0 "/users/build/development/mysql/debug-my
+ *     sql/mysql-test/var//tmp/#sql5718_1_0.frm", table_arg=0xf00601c0,
+ *     create_info=0x2017410) at ha_pbxt.cc:2323
+ * #4  0x00140d74 in ha_create_table(char const*, st_ha_create_information*, bool) 
+ *     (name=0xf0060bd0 "/users/build/development/mysql/debug-mysql/mysql-te
+ *     st/var//tmp/#sql5718_1_0.frm", create_info=0x2017410, 
+ *     update_create_info=false) at handler.cc:1387
+ *
+ * #4  0x0013f7a4 in handler::ha_open(char const*, int, int) (this=0x203ba98, 
+ *     name=0xf005eb70 "/users/build/development/mysql/debug-mysql/mysql-te
+ *     st/var/tmp/#sql5718_1_1", mode=2, test_if_locked=2) at handler.cc:993
+ * #5  0x000cd900 in openfrm(char const*, char const*, unsigned, unsigned, 
+ *     unsigned, st_table*) (name=0xf005f260 "/users/build/development/mys
+ *     ql/debug-mysql/mysql-test/var//tmp/#sql5718_1_1.frm", 
+ *     alias=0xf005fb90 "#sql-5718_1", db_stat=7, prgflag=44, 
+ *     ha_open_flags=0, outparam=0x2039e18) at table.cc:771
+ *
+ * As a result, I no longer use the entire path as the key to find a database.
+ * Just the last component of the path (i.e. the database name) should be
+ * sufficient!?
+ */
+xtPublic XTDatabaseHPtr xt_get_database(XTThreadPtr self, char *path, xtBool multi_path)
+{	/* Returns a handle to the database named by the last component of
+	 * 'path', with a reference added for the caller (xt_open_database()
+	 * gives it back with xt_heap_release()).
+	 *
+	 * If the database is not yet in xt_db_open_databases it is created:
+	 * a new handle gets the next database ID, its sub-systems are
+	 * initialized, its background threads are started and it is entered
+	 * in both directories of open databases. All of this happens while
+	 * the lock of xt_db_open_databases is held.
+	 *
+	 * The calling thread may not have a database in use.
+	 */
+	XTDatabaseHPtr	db = NULL;
+	char			db_path[PATH_MAX];
+	char			db_name[NAME_MAX];
+	xtBool			multi_path_db = FALSE;
+
+	/* A database may not be in use when this is called. */
+	ASSERT(!self->st_database);
+	xt_ht_lock(self, xt_db_open_databases);
+	pushr_(xt_ht_unlock, xt_db_open_databases);
+
+	xt_strcpy(PATH_MAX, db_path, path);
+	xt_add_location_file(PATH_MAX, db_path);
+	if (multi_path || xt_fs_exists(db_path))	/* Requested by the caller, or a "pbxt/location" file exists. */
+		multi_path_db = TRUE;
+
+	xt_strcpy(PATH_MAX, db_path, path);
+	xt_remove_dir_char(db_path);
+	xt_strcpy(NAME_MAX, db_name, xt_last_directory_of_path(db_path));
+
+	db = (XTDatabaseHPtr) xt_ht_get(self, xt_db_open_databases, db_name);
+	if (!db) {
+		pushsr_(db, xt_heap_release, (XTDatabaseHPtr) xt_heap_new(self, sizeof(XTDatabaseRec), db_finalize));
+		xt_heap_set_release_callback(self, db, db_onrelease);
+		db->db_id = db_next_id++;
+		db->db_name = xt_dup_string(self, db_name);
+		db->db_main_path = xt_dup_string(self, db_path);
+		db->db_multi_path = multi_path_db;
+#ifdef XT_TEST_XACT_OVERFLOW
+		/* Test transaction ID overflow: */
+		db->db_xn_curr_id = 0xFFFFFFFF - 30;
+#endif
+		xt_db_pool_init(self, db);
+		xt_tab_init_db(self, db);
+		xt_dl_init_db(self, db);
+
+		/* Initialize the index logs: */
+		db->db_indlogs.ilp_init(self, db, XT_INDEX_WRITE_BUFFER_SIZE); 
+
+		xt_xn_init_db(self, db);
+		xt_sl_insert(self, xt_db_open_db_by_id, &db->db_id, &db);
+
+		xt_start_sweeper(self, db);
+		xt_start_compactor(self, db);
+		xt_start_writer(self, db);
+		xt_start_checkpointer(self, db);
+		if (xt_db_flush_log_at_trx_commit == 0 || xt_db_flush_log_at_trx_commit == 2)
+			xt_start_flusher(self, db);
+
+		popr_();	/* Discard the xt_heap_release() pushed above: the hash table now keeps that reference. */
+		xt_ht_put(self, xt_db_open_databases, db);
+
+		/* The recovery process could attach parts of the open
+		 * database to the thread!
+		 */
+		xt_unuse_database(self, self);
+
+	}
+	xt_heap_reference(self, db);	/* The reference returned to the caller. */
+	freer_();	// xt_ht_unlock(xt_db_open_databases)
+
+	/* {INDEX-RECOV_ROWID}
+	 * Wait for sweeper to finish processing possibly
+	 * unswept transactions after recovery.
+	 * This is required because during recovery for
+	 * all index entries written the row_id is set.
+	 *
+	 * When the row ID is set, this means that the row
+	 * is "clean". i.e. visible to all transactions.
+	 *
+	 * Obviously this is not necessarily the case for all
+	 * index entries recovered. For example, 
+	 * transactions that still need to be swept may be
+	 * rolled back.
+	 *
+	 * As a result, we have to wait for the sweeper
+	 * to complete. Only then can we be sure that
+	 * all index entries that are not visible have
+	 * been removed.
+	 *
+	 * REASON WHY WE SET ROWID ON RECOVERY:
+	 * The row ID is set on recovery because the
+	 * change to the index may be lost after a crash.
+	 * The change to the index is done by the sweeper, and
+	 * there is no record of this change in the log.
+	 * The sweeper will not "re-sweep" all transactions
+	 * that are recovered. As a result, this update
+	 * of the index by the sweeper may be lost.
+	 *
+	 * {OPEN-DB-SWEEPER-WAIT}
+	 * This has been moved to after the release of the open
+	 * database lock because:
+	 *
+	 * - We are waiting for the sweeper which may run out of
+	 * record cache.
+	 * - If it runs out of cache it will wait
+	 * for the freeer thread.
+	 * - For the freeer thread to be able to work it needs
+	 * to open the database.
+	 * - To open the database it needs the open database
+	 * lock.
+	 */
+	/*
+	 * This has been moved, see: {WAIT-FOR-SW-AFTER-RECOV}
+	pushr_(xt_heap_release, db);
+	xt_wait_for_sweeper(self, db, 0);
+	popr_();
+	*/
+
+	return db;
+}
+
+/*
+ * Look up an open database by its ID. Returns the handle with a
+ * reference added for the caller, or NULL if no open database has that
+ * ID. The lookup is done under the lock of xt_db_open_databases.
+ */
+xtPublic XTDatabaseHPtr xt_get_database_by_id(XTThreadPtr self, xtDatabaseID db_id)
+{
+	XTDatabaseHPtr	found = NULL;
+	XTDatabaseHPtr	*slot;
+
+	xt_ht_lock(self, xt_db_open_databases);
+	pushr_(xt_ht_unlock, xt_db_open_databases);
+
+	slot = (XTDatabaseHPtr *) xt_sl_find(self, xt_db_open_db_by_id, &db_id);
+	if (slot != NULL) {
+		found = *slot;
+		xt_heap_reference(self, found);
+	}
+
+	freer_(); // xt_ht_unlock(xt_db_open_databases)
+	return found;
+}
+
+xtPublic void xt_check_database(XTThreadPtr self)
+{	/* Currently only runs xt_check_tables(); the handle file check is disabled. */
+	xt_check_tables(self);
+	/*
+	xt_check_handlefiles(self, db);
+	*/
+}
+
+xtPublic void xt_drop_database(XTThreadPtr self, XTDatabaseHPtr	db)
+{	/* Drops the database 'db'.
+	 *
+	 * Under the lock of xt_db_open_databases the background threads are
+	 * stopped and the database is removed from the table of open
+	 * databases. After the lock is given up the transaction logs and
+	 * data logs are deleted, followed by every ".xtr", ".xtd", ".xti"
+	 * and ".xt" file in each table path. For a database that is not
+	 * multi-path the pbxt directory under the main path is removed as
+	 * well; a failure to do so is only logged.
+	 */
+	char			path[PATH_MAX];
+	char			db_name[NAME_MAX];
+	XTOpenDirPtr	od;
+	char			*file;
+	XTTablePathPtr	*tp_ptr;
+
+	xt_ht_lock(self, xt_db_open_databases);
+	pushr_(xt_ht_unlock, xt_db_open_databases);
+
+	/* Shutdown the database daemons: */
+	xt_stop_flusher(self, db);
+	xt_stop_checkpointer(self, db);
+	xt_stop_sweeper(self, db);
+	xt_stop_compactor(self, db);
+	xt_stop_writer(self, db);
+
+	/* Remove the database from the directory: */
+	xt_strcpy(NAME_MAX, db_name, db->db_name);	/* Key is copied first; the delete below is not handed db->db_name itself. */
+	xt_ht_del(self, xt_db_open_databases, db_name);
+
+	/* Release the lock on the database directory: */
+	freer_(); // xt_ht_unlock(xt_db_open_databases)
+
+	/* Delete the transaction logs: */
+	xt_xlog_delete_logs(self, db);
+
+	/* Delete the data logs: */
+	xt_dl_delete_logs(self, db);
+
+	for (u_int i=0; i<xt_sl_get_size(db->db_table_paths); i++) {
+
+		tp_ptr = (XTTablePathPtr *) xt_sl_item_at(db->db_table_paths, i);
+
+		xt_strcpy(PATH_MAX, path, (*tp_ptr)->tp_path);
+
+		/* Delete all files in the database: */
+		pushsr_(od, xt_dir_close, xt_dir_open(self, path, NULL));
+		while (xt_dir_next(self, od)) {
+			file = xt_dir_name(self, od);
+			if (xt_ends_with(file, ".xtr") ||
+				xt_ends_with(file, ".xtd") ||
+				xt_ends_with(file, ".xti") ||
+				xt_ends_with(file, ".xt"))
+			{
+				xt_add_dir_char(PATH_MAX, path);
+				xt_strcat(PATH_MAX, path, file);
+				xt_fs_delete(self, path);
+				xt_remove_last_name_of_path(path);	/* Strip the file name again for the next entry. */
+			}
+		}
+		freer_(); // xt_dir_close(od)
+		
+	}
+	if (!db->db_multi_path) {
+		xt_strcpy(PATH_MAX, path, db->db_main_path);
+		xt_add_pbxt_dir(PATH_MAX, path);
+		if (!xt_fs_rmdir(NULL, path))	/* NOTE(review): rmdir is given NULL, the logging below uses 'self' - confirm the error reaches 'self'. */
+			xt_log_and_clear_exception(self);
+	}
+}
+
+/*
+ * Open/use a database: make the database at 'path' the one in use by
+ * the calling thread, opening it first if required.
+ */
+xtPublic void xt_open_database(XTThreadPtr self, char *path, xtBool multi_path)
+{
+	XTDatabaseHPtr opened;
+
+	/* The current database has to be given up before another one
+	 * can be fetched, because the restart process will partially
+	 * set the current database!
+	 */
+	xt_unuse_database(self, self);
+
+	opened = xt_get_database(self, path, multi_path);
+	pushr_(xt_heap_release, opened);
+
+	/* xt_use_database() adds a reference of its own, so the one
+	 * returned by xt_get_database() is given back below.
+	 */
+	xt_use_database(self, opened, XT_FOR_USER);
+	freer_();	// xt_heap_release(self, opened)
+}
+
+/* This function can only be called if you do not already have a database in
+ * use. This is because to get a database pointer you are not allowed
+ * to have a database in use!
+ *
+ * Adds a reference to 'db', stores the handle in self->st_database and
+ * then calls xt_xn_init_thread(self, what_for).
+ * Throws XT_ERR_CANNOT_CHANGE_DB if the thread has a transaction in
+ * progress or already has a database in use.
+ */
+xtPublic void xt_use_database(XTThreadPtr self, XTDatabaseHPtr db, int what_for)
+{
+	/* Check if a transaction is in progress. If so,
+	 * we cannot change the database!
+	 */
+	if (self->st_xact_data || self->st_database)
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_CANNOT_CHANGE_DB);
+
+	xt_heap_reference(self, db);	/* Given back by xt_unuse_database(). */
+	self->st_database = db;
+#ifdef XT_WAIT_FOR_CLEANUP
+	self->st_last_xact = 0;
+	for (int i=0; i<XT_MAX_XACT_BEHIND; i++) {
+		self->st_prev_xact[i] = db->db_xn_curr_id;
+	}
+#endif
+	xt_xn_init_thread(self, what_for);
+}
+
+/*
+ * Detach 'other_thr' from the database it has in use, if any: the
+ * thread's transaction context is closed with xt_xn_exit_thread() and
+ * the reference held in st_database is released. Everything is done
+ * while holding other_thr->t_lock.
+ */
+xtPublic void xt_unuse_database(XTThreadPtr self, XTThreadPtr other_thr)
+{
+	XTDatabaseHPtr	in_use;
+
+	xt_lock_mutex(self, &other_thr->t_lock);
+	pushr_(xt_unlock_mutex, &other_thr->t_lock);
+
+	/* Abort the transaction if it belongs exclusively to this thread. */
+	xt_xn_exit_thread(other_thr);
+
+	in_use = other_thr->st_database;
+	if (in_use != NULL) {
+		xt_heap_release(self, in_use);
+		other_thr->st_database = NULL;
+	}
+
+	freer_();	// xt_unlock_mutex(&other_thr->t_lock)
+}
+
+xtPublic void xt_db_init_thread(XTThreadPtr XT_UNUSED(self), XTThreadPtr XT_UNUSED(new_thread))
+{	/* Does nothing unless XT_IMPLEMENT_NO_ACTION is defined; in that case
+	 * the restrict list of the new thread is zeroed and its item size set.
+	 * NOTE(review): new_thread is wrapped in XT_UNUSED() but used below -
+	 * confirm that XT_UNUSED() leaves the parameter name usable.
+	 */
+#ifdef XT_IMPLEMENT_NO_ACTION
+	memset(&new_thread->st_restrict_list, 0, sizeof(XTBasicListRec));
+	new_thread->st_restrict_list.bl_item_size = sizeof(XTRestrictItemRec);
+#endif
+}
+
+xtPublic void xt_db_exit_thread(XTThreadPtr self)
+{	/* Per-thread cleanup: frees the restrict list (only with
+	 * XT_IMPLEMENT_NO_ACTION) and gives up the database in use.
+	 */
+#ifdef XT_IMPLEMENT_NO_ACTION
+	xt_bl_free(NULL, &self->st_restrict_list);
+#endif
+	xt_unuse_database(self, self);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * OPEN TABLE POOL
+ */
+
+#ifdef UNUSED_CODE
+/*
+ * Debug check of the used-order list of pooled open tables: the entry
+ * at the most-recently-used end may have no ot_otp_mr_used link, the
+ * entry at the least-recently-used end may have no ot_otp_lr_used
+ * link, and walking from the least-recently-used end along
+ * ot_otp_mr_used must visit exactly otp_total_free entries.
+ */
+static void check_free_list(XTDatabaseHPtr db)
+{
+	XTOpenTablePtr	cur;
+	u_int			seen = 0;
+
+	cur = db->db_ot_pool.otp_mr_used;
+	if (cur)
+		ASSERT_NS(!cur->ot_otp_mr_used);
+
+	cur = db->db_ot_pool.otp_lr_used;
+	if (cur)
+		ASSERT_NS(!cur->ot_otp_lr_used);
+
+	for (; cur; cur = cur->ot_otp_mr_used)
+		seen++;
+	ASSERT_NS(seen == db->db_ot_pool.otp_total_free);
+}
+#endif
+
+xtPublic void xt_db_pool_init(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Zeroes the open table pool of 'db', then creates its mutex and
+	 * condition variable (both are freed by xt_db_pool_exit()).
+	 */
+	memset(&db->db_ot_pool, 0, sizeof(XTAllTablePoolsRec));
+	xt_init_mutex_with_autoname(self, &db->db_ot_pool.opt_lock);
+	xt_init_cond(self, &db->db_ot_pool.opt_cond);
+}
+
+/*
+ * Tear down the open table pool of 'db': free the pool's mutex and
+ * condition variable, close every open table on the free list of every
+ * table pool and free the table pools themselves.
+ *
+ * NOTE(review): the otp_hash[] slots are not reset, so they still
+ * point at the freed table pools when this function returns.
+ */
+xtPublic void xt_db_pool_exit(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xt_free_mutex(&db->db_ot_pool.opt_lock);
+	xt_free_cond(&db->db_ot_pool.opt_cond);
+
+	for (u_int bucket = 0; bucket < XT_OPEN_TABLE_POOL_HASH_SIZE; bucket++) {
+		XTOpenTablePoolPtr	pool = db->db_ot_pool.otp_hash[bucket];
+
+		while (pool) {
+			/* Both links are read before the entry they belong to is disposed of. */
+			XTOpenTablePoolPtr	next_pool = pool->opt_next_hash;
+			XTOpenTablePtr		ot = pool->opt_free_list;
+
+			while (ot) {
+				XTOpenTablePtr	next_ot = ot->ot_otp_next_free;
+
+				ot->ot_thread = self;
+				xt_close_table(ot, TRUE, FALSE);
+				ot = next_ot;
+			}
+
+			xt_free(self, pool);
+			pool = next_pool;
+		}
+	}
+}
+
+/*
+ * Return the pool of open tables for table 'tab_id' of 'db'. If there
+ * is none yet, an empty, unlocked pool is allocated and put at the head
+ * of its hash chain. Returns NULL if that allocation fails.
+ */
+static XTOpenTablePoolPtr db_get_open_table_pool(XTDatabaseHPtr db, xtTableID tab_id)
+{
+	u_int				bucket = tab_id % XT_OPEN_TABLE_POOL_HASH_SIZE;
+	XTOpenTablePoolPtr	pool;
+
+	for (pool = db->db_ot_pool.otp_hash[bucket]; pool; pool = pool->opt_next_hash) {
+		if (pool->opt_tab_id == tab_id)
+			return pool;
+	}
+
+	/* Not found: create a new pool. */
+	pool = (XTOpenTablePoolPtr) xt_malloc_ns(sizeof(XTOpenTablePoolRec));
+	if (!pool)
+		return NULL;
+
+	pool->opt_db = db;
+	pool->opt_tab_id = tab_id;
+	pool->opt_total_open = 0;
+	pool->opt_locked = FALSE;
+	pool->opt_flushing = 0;
+	pool->opt_free_list = NULL;
+
+	/* Link it in at the head of the hash chain. */
+	pool->opt_next_hash = db->db_ot_pool.otp_hash[bucket];
+	db->db_ot_pool.otp_hash[bucket] = pool;
+
+	return pool;
+}
+
+/*
+ * Free 'table_pool' if it is idle, i.e. not locked, not being flushed
+ * and without open tables. The pool is unlinked from its hash chain
+ * (if it is found there) and then freed. A pool that is still busy is
+ * left untouched.
+ */
+static void db_free_open_table_pool(XTThreadPtr self, XTOpenTablePoolPtr table_pool)
+{
+	XTOpenTablePoolPtr	*link;
+	u_int				bucket;
+
+	if (table_pool->opt_locked || table_pool->opt_flushing || table_pool->opt_total_open)
+		return;
+
+	/* Find the link (hash slot or opt_next_hash of the predecessor)
+	 * that points to this pool:
+	 */
+	bucket = table_pool->opt_tab_id % XT_OPEN_TABLE_POOL_HASH_SIZE;
+	link = &table_pool->opt_db->db_ot_pool.otp_hash[bucket];
+	while (*link && *link != table_pool)
+		link = &(*link)->opt_next_hash;
+
+	if (*link)
+		*link = table_pool->opt_next_hash;
+
+	xt_free(self, table_pool);
+}
+
+/*
+ * Take the exclusive lock on the open table pool of table 'tab_id'.
+ *
+ * Steps, all under db_ot_pool.opt_lock unless noted:
+ *  1. Get (or create) the pool record; throw if it cannot be allocated.
+ *  2. Wait until no other thread holds the pool lock. The pool is looked
+ *     up again after every wait, because the record may have been freed
+ *     and re-created in the meantime.
+ *  3. Set opt_locked.
+ *  4. If 'flush_table' is set: bump opt_flushing, release the mutex,
+ *     open a pool table as a "background" user, flush it with
+ *     xt_sync_flush_table() and return it to the pool. Then re-take the
+ *     mutex and decrement opt_flushing.
+ *  5. Close every open table on the pool's free list, removing each from
+ *     the database-wide MRU list.
+ *  6. If 'wait_for_open' is set: wait until opt_total_open drops to zero.
+ *
+ * Returns the locked pool. The lock is released by
+ * xt_db_unlock_table_pool().
+ */
+static XTOpenTablePoolPtr db_lock_table_pool(XTThreadPtr self, XTDatabaseHPtr db, xtTableID tab_id, xtBool flush_table, xtBool wait_for_open)
+{
+	XTOpenTablePoolPtr	table_pool;
+	XTOpenTablePtr		ot, tmp_ot;
+
+	xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+	pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+
+	if (!(table_pool = db_get_open_table_pool(db, tab_id)))
+		xt_throw(self);
+
+	/* Wait for the lock: */
+	while (table_pool->opt_locked) {
+		xt_timed_wait_cond(self, &db->db_ot_pool.opt_cond, &db->db_ot_pool.opt_lock, 2000);
+		/* The pool may have been freed while waiting, look it up again: */
+		if (!(table_pool = db_get_open_table_pool(db, tab_id)))
+			xt_throw(self);
+	}
+
+	/* Lock it: */
+	table_pool->opt_locked = TRUE;
+
+	if (flush_table) {
+		/* A non-zero opt_flushing lets background openers through
+		 * although the pool is locked (see xt_db_open_pool_table()).
+		 */
+		table_pool->opt_flushing++;
+		freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+
+		/* If the flush throws, the pool lock taken above is released: */
+		pushr_(xt_db_unlock_table_pool, table_pool);
+		/* During this time, background processes can use the
+		 * pool!
+		 *
+		 * May also do a flush, but this is now taken care
+		 * of here [*10*]
+		 */
+		if ((ot = xt_db_open_pool_table(self, db, tab_id, NULL, TRUE))) {
+			pushr_(xt_db_return_table_to_pool, ot);
+			xt_sync_flush_table(self, ot);
+			freer_(); // xt_db_return_table_to_pool(ot)
+		}
+
+		popr_(); // Discard xt_db_unlock_table_pool(table_pool)
+
+		xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+		pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+		table_pool->opt_flushing--;
+	}
+	
+	/* Free all open tables not in use: */
+	ot = table_pool->opt_free_list;
+	table_pool->opt_free_list = NULL;
+	while (ot) {
+		tmp_ot = ot->ot_otp_next_free;
+
+		/* Remove from MRU list: */
+		if (db->db_ot_pool.otp_lr_used == ot)
+			db->db_ot_pool.otp_lr_used = ot->ot_otp_mr_used;
+		if (db->db_ot_pool.otp_mr_used == ot)
+			db->db_ot_pool.otp_mr_used = ot->ot_otp_lr_used;
+		if (ot->ot_otp_lr_used)
+			ot->ot_otp_lr_used->ot_otp_mr_used = ot->ot_otp_mr_used;
+		if (ot->ot_otp_mr_used)
+			ot->ot_otp_mr_used->ot_otp_lr_used = ot->ot_otp_lr_used;
+
+		/* Keep the cached free time in step with the new LRU entry: */
+		if (db->db_ot_pool.otp_lr_used)
+			db->db_ot_pool.otp_free_time = db->db_ot_pool.otp_lr_used->ot_otp_free_time;
+		
+		ASSERT_NS(db->db_ot_pool.otp_total_free > 0);
+		db->db_ot_pool.otp_total_free--;
+
+		/* Close the table: */
+		ASSERT(table_pool->opt_total_open > 0);
+		table_pool->opt_total_open--;
+
+		ot->ot_thread = self;
+		/* The second argument is TRUE only for the last open table of the pool: */
+		xt_close_table(ot, table_pool->opt_total_open == 0, FALSE);
+
+		/* Go to the next: */
+		ot = tmp_ot;
+	}
+
+	/* Wait for other to close: */
+	if (wait_for_open) {
+		while (table_pool->opt_total_open > 0) {
+			/* NOTE: the result of the wait is not checked here. */
+			xt_timed_wait_cond_ns(&db->db_ot_pool.opt_cond, &db->db_ot_pool.opt_lock, 2000);
+		}
+	}
+
+	freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+	return table_pool;
+}
+
+/*
+ * Lock the open table pool of the table called 'tab_name'.
+ *
+ * The table is looked up with xt_use_table(). If that yields no table,
+ * NULL is returned. Otherwise the pool of the table is locked with
+ * db_lock_table_pool() ('flush_table' and 'wait_for_open' are passed
+ * through) and the locked pool is returned.
+ *
+ * If 'ret_tab' is not NULL the table handle is stored there and its
+ * release is discarded from the resource stack instead of being run.
+ * Without 'ret_tab' the handle is released before the pool is locked.
+ */
+xtPublic XTOpenTablePoolPtr xt_db_lock_table_pool_by_name(XTThreadPtr self, XTDatabaseHPtr db, XTPathStrPtr tab_name, xtBool no_load, xtBool flush_table, xtBool missing_ok, xtBool wait_for_open, XTTableHPtr *ret_tab)
+{
+	XTTableHPtr			tab;
+	XTOpenTablePoolPtr	locked_pool;
+	xtTableID			id;
+
+	pushsr_(tab, xt_heap_release, xt_use_table(self, tab_name, no_load, missing_ok));
+	if (!tab) {
+		freer_(); // xt_heap_release(tab)
+		return NULL;
+	}
+
+	/* Copy the ID, the handle may be released before it is needed: */
+	id = tab->tab_id;
+
+	if (!ret_tab) {
+		/* The caller does not want the table handle: */
+		freer_(); // xt_heap_release(tab)
+		return db_lock_table_pool(self, db, id, flush_table, wait_for_open);
+	}
+
+	/* Hand the table to the caller. The release stays on the
+	 * resource stack while the pool is being locked.
+	 */
+	*ret_tab = tab;
+	locked_pool = db_lock_table_pool(self, db, id, flush_table, wait_for_open);
+	popr_(); // Discard xt_heap_release(tab)
+	return locked_pool;
+}
+
+/*
+ * Block until the given table pool has no open tables left, i.e. until
+ * opt_total_open is zero.
+ *
+ * The pool mutex is held while checking the counter. Each wait on the
+ * pool condition is a timed wait, after which the counter is checked
+ * again.
+ */
+xtPublic void xt_db_wait_for_open_tables(XTThreadPtr self, XTOpenTablePoolPtr table_pool)
+{
+	XTDatabaseHPtr db = table_pool->opt_db;
+
+	xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+	pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+
+	/* Wait for other to close: */
+	while (table_pool->opt_total_open > 0) {
+		xt_timed_wait_cond(self, &db->db_ot_pool.opt_cond, &db->db_ot_pool.opt_lock, 2000);
+	}
+
+	freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+}
+
+/*
+ * Release the exclusive lock on a table pool.
+ *
+ * A NULL 'table_pool' is accepted and ignored. Otherwise, under the
+ * pool mutex, the lock flag is cleared, all threads waiting on the pool
+ * condition are woken, and the pool record is freed if nothing uses it
+ * any more.
+ */
+xtPublic void xt_db_unlock_table_pool(XTThreadPtr self, XTOpenTablePoolPtr table_pool)
+{
+	if (table_pool) {
+		XTDatabaseHPtr db = table_pool->opt_db;
+
+		xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+		pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+
+		/* Unlock, and tell everybody who is waiting for the pool: */
+		table_pool->opt_locked = FALSE;
+		xt_broadcast_cond(self, &db->db_ot_pool.opt_cond);
+
+		/* Only frees the pool if it is completely idle: */
+		db_free_open_table_pool(NULL, table_pool);
+
+		freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+	}
+}
+
+/*
+ * Get an open table for 'tab' on behalf of 'thread', using the "_ns"
+ * variants of the lock and wait calls: failure is reported by
+ * returning NULL.
+ *
+ * While the pool of the table is locked this call waits on the pool
+ * condition. After that a table is taken from the pool's free list if
+ * one is available (and removed from the database-wide MRU list),
+ * otherwise a new one is opened with xt_open_table() and counted in
+ * opt_total_open.
+ *
+ * Returns the open table, or NULL if the pool record could not be
+ * allocated, the wait failed, or the table could not be opened.
+ */
+xtPublic XTOpenTablePtr xt_db_open_table_using_tab(XTTableHPtr tab, XTThreadPtr thread)
+{
+	XTDatabaseHPtr		db = tab->tab_db;
+	XTOpenTablePoolPtr	table_pool;
+	XTOpenTablePtr		ot;
+
+	xt_lock_mutex_ns(&db->db_ot_pool.opt_lock);
+
+	if (!(table_pool = db_get_open_table_pool(db, tab->tab_id)))
+		goto failed;
+
+	while (table_pool->opt_locked) {
+		if (!xt_timed_wait_cond_ns(&db->db_ot_pool.opt_cond, &db->db_ot_pool.opt_lock, 2000))
+			goto failed_1;
+		/* The pool may have been freed while waiting, look it up again: */
+		if (!(table_pool = db_get_open_table_pool(db, tab->tab_id)))
+			goto failed;
+	}
+
+	if ((ot = table_pool->opt_free_list)) {
+		/* Remove from the free list: */
+		table_pool->opt_free_list = ot->ot_otp_next_free;
+		
+		/* Remove from MRU list: */
+		if (db->db_ot_pool.otp_lr_used == ot)
+			db->db_ot_pool.otp_lr_used = ot->ot_otp_mr_used;
+		if (db->db_ot_pool.otp_mr_used == ot)
+			db->db_ot_pool.otp_mr_used = ot->ot_otp_lr_used;
+		if (ot->ot_otp_lr_used)
+			ot->ot_otp_lr_used->ot_otp_mr_used = ot->ot_otp_mr_used;
+		if (ot->ot_otp_mr_used)
+			ot->ot_otp_mr_used->ot_otp_lr_used = ot->ot_otp_lr_used;
+
+		/* Keep the cached free time in step with the new LRU entry: */
+		if (db->db_ot_pool.otp_lr_used)
+			db->db_ot_pool.otp_free_time = db->db_ot_pool.otp_lr_used->ot_otp_free_time;
+
+		ASSERT_NS(db->db_ot_pool.otp_total_free > 0);
+		db->db_ot_pool.otp_total_free--;
+
+		ot->ot_thread = thread;
+		goto done_ok;
+	}
+
+	/* Nothing free, open a new table (ot stays NULL if that fails): */
+	if ((ot = xt_open_table(tab))) {
+		ot->ot_thread = thread;
+		table_pool->opt_total_open++;
+	}
+
+	done_ok:
+	/* Frees the pool record only if it is completely idle,
+	 * e.g. when it was created above and the open failed:
+	 */
+	db_free_open_table_pool(NULL, table_pool);
+	xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+	return ot;
+
+	failed_1:
+	db_free_open_table_pool(NULL, table_pool);
+
+	failed:
+	xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+	return NULL;
+}
+
+/*
+ * Variant of xt_db_open_pool_table() that reports an exception through
+ * its return value instead of letting it propagate.
+ *
+ * On success TRUE is returned and *ret_ot is the value returned by
+ * xt_db_open_pool_table(), which may be NULL (it is called without a
+ * 'result' pointer, so a table that was not found yields NULL).
+ *
+ * If the open throws, FALSE is returned and *ret_ot is NULL.
+ */
+xtPublic xtBool xt_db_open_pool_table_ns(XTOpenTablePtr *ret_ot, XTDatabaseHPtr db, xtTableID tab_id)
+{
+	XTThreadPtr	self = xt_get_self();
+	xtBool		ok = TRUE;
+
+	/* Make sure the caller never sees an unset pointer, the
+	 * assignment below is skipped if the open throws:
+	 */
+	*ret_ot = NULL;
+
+	try_(a) {
+		*ret_ot = xt_db_open_pool_table(self, db, tab_id, NULL, FALSE);
+	}
+	catch_(a) {
+		ok = FALSE;
+	}
+	cont_(a);
+	return ok;
+}
+
+/*
+ * Get an open table for table 'tab_id' from the open table pool.
+ *
+ * If the pool of the table is locked, the call waits until it is
+ * unlocked. The exception is a caller with 'i_am_background' set, which
+ * does not wait while the lock holder is flushing (opt_flushing != 0).
+ *
+ * A table from the pool's free list is reused if there is one.
+ * Otherwise the table is looked up with xt_use_table_by_id() and a new
+ * open table is created with xt_open_table().
+ *
+ * Handling of the lookup status:
+ *  - 'result' given: any status other than XT_TAB_OK is stored in
+ *    *result and NULL is returned. *result is not written on success.
+ *  - 'result' NULL: XT_TAB_NOT_FOUND returns NULL; XT_TAB_NO_DICTIONARY
+ *    and XT_TAB_POOL_CLOSED are thrown as errors.
+ *
+ * Returns the open table (owned by 'self'), or NULL as described above
+ * or if xt_open_table() returns NULL.
+ */
+xtPublic XTOpenTablePtr xt_db_open_pool_table(XTThreadPtr self, XTDatabaseHPtr db, xtTableID tab_id, int *result, xtBool i_am_background)
+{
+	XTOpenTablePtr		ot;
+	XTOpenTablePoolPtr	table_pool;
+	int					r;
+	XTTableHPtr			tab;
+
+	xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+	pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+
+	if (!(table_pool = db_get_open_table_pool(db, tab_id)))
+		xt_throw(self);
+
+	/* Background processes do not have to wait while flushing!
+	 *
+	 * I think I did this so that the background process would
+	 * not hang during flushing. Exact reason currently
+	 * unknown.
+	 *
+	 * This led to the situation that the checkpointer
+	 * could flush at the same time as a user process
+	 * which was flushing due to a rename.
+	 *
+	 * This led to the situation described here: [*10*],
+	 * which is now fixed.
+	 */
+	while (table_pool->opt_locked && !(i_am_background && table_pool->opt_flushing)) {
+		xt_timed_wait_cond(self, &db->db_ot_pool.opt_cond, &db->db_ot_pool.opt_lock, 2000);
+		if (!(table_pool = db_get_open_table_pool(db, tab_id)))
+			xt_throw(self);
+	}
+
+	/* Moved from above, because db_get_open_table_pool() may return a different
+	 * pool on each call!
+	*/
+	pushr_(db_free_open_table_pool, table_pool);	
+	
+	if ((ot = table_pool->opt_free_list)) {
+		/* Remove from the free list: */
+		table_pool->opt_free_list = ot->ot_otp_next_free;
+		
+		/* Remove from MRU list: */
+		if (db->db_ot_pool.otp_lr_used == ot)
+			db->db_ot_pool.otp_lr_used = ot->ot_otp_mr_used;
+		if (db->db_ot_pool.otp_mr_used == ot)
+			db->db_ot_pool.otp_mr_used = ot->ot_otp_lr_used;
+		if (ot->ot_otp_lr_used)
+			ot->ot_otp_lr_used->ot_otp_mr_used = ot->ot_otp_mr_used;
+		if (ot->ot_otp_mr_used)
+			ot->ot_otp_mr_used->ot_otp_lr_used = ot->ot_otp_lr_used;
+
+		/* Keep the cached free time in step with the new LRU entry: */
+		if (db->db_ot_pool.otp_lr_used)
+			db->db_ot_pool.otp_free_time = db->db_ot_pool.otp_lr_used->ot_otp_free_time;
+
+		ASSERT(db->db_ot_pool.otp_total_free > 0);
+		db->db_ot_pool.otp_total_free--;
+
+		freer_(); // db_free_open_table_pool(table_pool)
+		freer_(); // xt_unlock_mutex(&db->db_ot_pool.opt_lock)
+		ot->ot_thread = self;
+		return ot;
+	}
+
+	/* Nothing on the free list, a new open table is required: */
+	r = xt_use_table_by_id(self, &tab, db, tab_id);
+	if (result) {
+		/* The caller wants to handle the status itself: */
+		if (r != XT_TAB_OK) {
+			*result = r;
+			freer_(); // db_free_open_table_pool(table_pool)
+			freer_(); // xt_unlock_mutex(&db->db_ot_pool.opt_lock)
+			return NULL;
+		}
+	}
+	else {
+		switch (r) {
+			case XT_TAB_NOT_FOUND:
+				/* The table no longer exists, ignore the change: */
+				freer_(); // db_free_open_table_pool(table_pool)
+				freer_(); // xt_unlock_mutex(&db->db_ot_pool.opt_lock)
+				return NULL;
+			case XT_TAB_NO_DICTIONARY:
+				/* Throws; the resource stack runs the two cleanups pushed above. */
+				xt_throw_ulxterr(XT_CONTEXT, XT_ERR_NO_DICTIONARY, (u_long) tab_id);
+			case XT_TAB_POOL_CLOSED:
+				xt_throw_ulxterr(XT_CONTEXT, XT_ERR_TABLE_LOCKED, (u_long) tab_id);
+			default:
+				break;
+		}
+	}
+
+	/* xt_use_table_by_id returns a referenced tab! */
+	pushr_(xt_heap_release, tab);
+	if ((ot = xt_open_table(tab))) {
+		ot->ot_thread = self;
+		table_pool->opt_total_open++;
+	}
+	freer_(); // xt_heap_release(tab)
+
+	freer_(); // db_free_open_table_pool(table_pool)
+	freer_(); // xt_unlock_mutex(&db->db_ot_pool.opt_lock)
+	return ot;
+}
+
+/*
+ * Return an open table to the pool.
+ *
+ * This is xt_db_return_table_to_pool_ns() with an additional (unused)
+ * 'self' argument, which gives it the (self, data) form of a function
+ * that can be pushed on the resource stack with pushr_().
+ */
+xtPublic void xt_db_return_table_to_pool(XTThreadPtr XT_UNUSED(self), XTOpenTablePtr ot)
+{
+	xt_db_return_table_to_pool_ns(ot);
+}
+
+/*
+ * Return an open table to the pool of its table. Errors are logged and
+ * cleared, not thrown.
+ *
+ * Normally the open table is put on the free list of its pool and at
+ * the most-recently-used end of the database-wide MRU list.
+ *
+ * If the pool is locked and no flush is in progress, the open table is
+ * closed instead. The close is done without holding the pool mutex, and
+ * the table is only flushed if this is the last open table of the pool.
+ *
+ * In both cases the threads waiting on the pool condition are woken.
+ */
+xtPublic void xt_db_return_table_to_pool_ns(XTOpenTablePtr ot)
+{
+	XTOpenTablePoolPtr	table_pool;
+	XTDatabaseHPtr		db = ot->ot_table->tab_db;
+	xtBool				flush_table = TRUE;
+
+	/* No open table returned to the pool should still
+	 * have a cache handle!
+	 */
+	ASSERT_NS(!ot->ot_ind_rhandle);
+	xt_lock_mutex_ns(&db->db_ot_pool.opt_lock);
+
+	if (!(table_pool = db_get_open_table_pool(db, ot->ot_table->tab_id)))
+		goto failed;
+
+	if (table_pool->opt_locked && !table_pool->opt_flushing) {
+		/* Table will be closed below: */
+		if (table_pool->opt_total_open > 1)
+			flush_table = FALSE;
+	}
+	else {
+		/* Put it on the free list: */
+		db->db_ot_pool.otp_total_free++;
+
+		ot->ot_otp_next_free = table_pool->opt_free_list;
+		table_pool->opt_free_list = ot;
+
+		/* This is the time the table was freed: */
+		ot->ot_otp_free_time = xt_db_approximate_time;
+
+		/* Add to most recently used: */
+		if ((ot->ot_otp_lr_used = db->db_ot_pool.otp_mr_used))
+			db->db_ot_pool.otp_mr_used->ot_otp_mr_used = ot;
+		ot->ot_otp_mr_used = NULL;
+		db->db_ot_pool.otp_mr_used = ot;
+		if (!db->db_ot_pool.otp_lr_used) {
+			db->db_ot_pool.otp_lr_used = ot;
+			db->db_ot_pool.otp_free_time = ot->ot_otp_free_time;
+		}
+
+		/* The pool owns the open table now: */
+		ot = NULL;
+	}
+
+	if (ot) {
+		xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+		xt_close_table(ot, flush_table, FALSE);
+		/* The handle is gone. Clear it, so that the error exit
+		 * below cannot close it a second time:
+		 */
+		ot = NULL;
+
+		/* assume that table_pool cannot be invalidated in between as we have table_pool->opt_total_open > 0 */
+		xt_lock_mutex_ns(&db->db_ot_pool.opt_lock);
+		table_pool->opt_total_open--;
+	}
+
+	db_free_open_table_pool(NULL, table_pool);
+
+	if (!xt_broadcast_cond_ns(&db->db_ot_pool.opt_cond))
+		goto failed;
+	xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+	
+	return;
+
+	failed:
+	xt_unlock_mutex_ns(&db->db_ot_pool.opt_lock);
+	/* ot is only still set here if no pool could be obtained: */
+	if (ot)
+		xt_close_table(ot, TRUE, FALSE);
+	xt_log_and_clear_exception_ns();
+}
+
+//#define TEST_FREE_OPEN_TABLES
+
+#ifdef DEBUG
+#undef XT_OPEN_TABLE_FREE_TIME
+#define XT_OPEN_TABLE_FREE_TIME			5
+#endif
+
+/*
+ * Close open tables that have been sitting unused in the pool.
+ *
+ * Nothing is done unless the oldest free table has been free for at
+ * least XT_OPEN_TABLE_FREE_TIME, and the number of free open tables
+ * exceeds an upper limit of 3 per table (at least 20). In that case
+ * tables are closed, least recently used first, until the number of
+ * free tables is down to 2 per table (at least 10), or until the least
+ * recently used table has not yet been free for long enough.
+ *
+ * The pool mutex is released around each xt_close_table() call.
+ */
+xtPublic void xt_db_free_unused_open_tables(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTOpenTablePoolPtr	table_pool;
+	size_t				count;
+	XTOpenTablePtr		ot;
+	xtBool				flush_table = TRUE;
+	u_int				table_count;
+
+	/* A quick check of the oldest free table: */
+	/* (done without taking the pool mutex) */
+	if (xt_db_approximate_time < db->db_ot_pool.otp_free_time + XT_OPEN_TABLE_FREE_TIME)
+		return;
+
+	/* The upper limit, above which tables are freed: */
+	table_count = db->db_table_by_id ? xt_sl_get_size(db->db_table_by_id) : 0;
+	count = table_count * 3;
+	if (count < 20)
+		count = 20;
+#ifdef TEST_FREE_OPEN_TABLES
+	count = 10;
+#endif
+	if (db->db_ot_pool.otp_total_free > count) {
+		XTOpenTablePtr	ptr, pptr;
+
+		/* The lower limit, down to which tables are freed: */
+		count = table_count * 2;
+		if (count < 10)
+			count = 10;
+#ifdef TEST_FREE_OPEN_TABLES
+		count = 5;
+#endif
+		xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+		pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+
+		while (db->db_ot_pool.otp_total_free > count) {
+			ASSERT_NS(db->db_ot_pool.otp_lr_used);
+			if (!(ot = db->db_ot_pool.otp_lr_used))
+				break;
+
+			/* Check how long the open table has been free: */
+			if (xt_db_approximate_time < ot->ot_otp_free_time + XT_OPEN_TABLE_FREE_TIME)
+				break;
+
+			ot->ot_thread = self;
+
+			/* Remove from MRU list: */
+			db->db_ot_pool.otp_lr_used = ot->ot_otp_mr_used;
+			if (db->db_ot_pool.otp_mr_used == ot)
+				db->db_ot_pool.otp_mr_used = ot->ot_otp_lr_used;
+			if (ot->ot_otp_lr_used)
+				ot->ot_otp_lr_used->ot_otp_mr_used = ot->ot_otp_mr_used;
+			if (ot->ot_otp_mr_used)
+				ot->ot_otp_mr_used->ot_otp_lr_used = ot->ot_otp_lr_used;
+
+			/* Keep the cached free time in step with the new LRU entry: */
+			if (db->db_ot_pool.otp_lr_used)
+				db->db_ot_pool.otp_free_time = db->db_ot_pool.otp_lr_used->ot_otp_free_time;
+
+			ASSERT(db->db_ot_pool.otp_total_free > 0);
+			db->db_ot_pool.otp_total_free--;
+
+			if (!(table_pool = db_get_open_table_pool(db, ot->ot_table->tab_id)))
+				xt_throw(self);
+
+			/* Find the open table in the table pool,
+			 * and remove it from the list:
+			 */
+			pptr = NULL;
+			ptr = table_pool->opt_free_list;
+			while (ptr) {
+				if (ptr == ot)
+					break;
+				pptr = ptr;
+				ptr = ptr->ot_otp_next_free;
+			}
+
+			ASSERT_NS(ptr == ot);
+			if (ptr == ot) {
+				if (pptr)
+					pptr->ot_otp_next_free = ot->ot_otp_next_free;
+				else
+					table_pool->opt_free_list = ot->ot_otp_next_free;
+			}
+
+			/* Only the last open table of a pool is flushed on close: */
+			ASSERT_NS(table_pool->opt_total_open > 0);
+			table_pool->opt_total_open--;
+			if (table_pool->opt_total_open > 0)
+				flush_table = FALSE;
+			else
+				flush_table = TRUE;
+
+			/* Frees the pool record if it is completely idle now: */
+			db_free_open_table_pool(self, table_pool);
+
+			freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+
+			/* Close the table, but not
+			 * while holding the lock.
+			 */
+			xt_close_table(ot, flush_table, FALSE);
+
+			xt_lock_mutex(self, &db->db_ot_pool.opt_lock);
+			pushr_(xt_unlock_mutex, &db->db_ot_pool.opt_lock);
+		}
+
+		freer_(); // xt_unlock_mutex(db_ot_pool.opt_lock)
+	}
+}
diff --git a/storage/pbxt/src/database_xt.h b/storage/pbxt/src/database_xt.h
new file mode 100644
index 00000000000..7744aeeac31
--- /dev/null
+++ b/storage/pbxt/src/database_xt.h
@@ -0,0 +1,261 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-15	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_database_h__
+#define __xt_database_h__
+
+#include <time.h>
+
+#include "thread_xt.h"
+#include "hashtab_xt.h"
+#include "table_xt.h"
+#include "sortedlist_xt.h"
+#include "xaction_xt.h"
+#include "heap_xt.h"
+#include "xactlog_xt.h"
+#include "restart_xt.h"
+#include "index_xt.h"
+
+#ifdef DEBUG
+//#define XT_USE_XACTION_DEBUG_SIZES
+#endif
+
+#ifdef XT_USE_XACTION_DEBUG_SIZES
+#define XT_DB_TABLE_POOL_SIZE	2
+#else
+#define XT_DB_TABLE_POOL_SIZE	10		// The number of open tables maintained by the sweeper
+#endif
+
+/* Turn this switch on to enable spin lock based wait-for logic: */
+#define XT_USE_SPINLOCK_WAIT_FOR
+
+extern xtLogOffset		xt_db_log_file_threshold;
+extern size_t			xt_db_log_buffer_size;
+extern size_t			xt_db_transaction_buffer_size;
+extern size_t			xt_db_checkpoint_frequency;
+extern off_t			xt_db_data_log_threshold;
+extern size_t			xt_db_data_file_grow_size;
+extern size_t			xt_db_row_file_grow_size;
+extern int				xt_db_garbage_threshold;
+extern int				xt_db_log_file_count;
+extern int				xt_db_auto_increment_mode;
+extern int				xt_db_offline_log_function;
+extern int				xt_db_sweeper_priority;
+extern int				xt_db_flush_log_at_trx_commit;
+
+extern XTSortedListPtr	xt_db_open_db_by_id;
+extern XTHashTabPtr		xt_db_open_databases;
+extern time_t			xt_db_approximate_time;
+
+#define XT_OPEN_TABLE_POOL_HASH_SIZE	223
+
+#define XT_SW_WORK_NORMAL				0
+#define XT_SW_NO_MORE_XACT_SLOTS		1
+#define XT_SW_DIRTY_RECORD_FOUND		2
+#define XT_SW_TOO_FAR_BEHIND			3							/* The sweeper is getting too far behind, although it is working! */
+
+/*
+ * The pool of open tables (open handles) of one table. The pools of a
+ * database are kept in the hash table XTAllTablePools.otp_hash, chained
+ * through opt_next_hash.
+ */
+typedef struct XTOpenTablePool {
+	struct XTDatabase		*opt_db;								/* The database this pool belongs to. */
+	xtTableID				opt_tab_id;								/* The table ID. */
+	u_int					opt_total_open;							/* Total number of open tables. */
+	xtBool					opt_locked;								/* TRUE while the pool is locked: open tables are then closed on return to the pool (unless a flush is in progress). */
+	u_int					opt_flushing;							/* Non-zero while the holder of the pool lock is flushing the table. */
+	XTOpenTablePtr			opt_free_list;							/* A list of free, unused open tables. */
+	struct XTOpenTablePool	*opt_next_hash;							/* Next pool in the same hash chain. */
+} XTOpenTablePoolRec, *XTOpenTablePoolPtr;
+
+/*
+ * All open table pools of a database: a hash table of per-table pools,
+ * plus one list of all free open tables, ordered by the time they were
+ * returned to the pool.
+ */
+typedef struct XTAllTablePools {
+	xt_mutex_type			opt_lock;								/* This lock protects the open table pool. */
+	xt_cond_type			opt_cond;								/* Used to wait for an exclusive lock on a table. */
+
+	u_int					otp_total_free;							/* This is the total number of free open tables (not in use): */
+
+	/* All free (unused tables) are on this list: */
+	XTOpenTablePtr			otp_mr_used;							/* Most recently used end: tables are added here when returned. */
+	XTOpenTablePtr			otp_lr_used;							/* Least recently used end: tables are freed from here. */
+	time_t					otp_free_time;							/* The free time of the LRU open table. */
+	
+	XTOpenTablePoolPtr		otp_hash[XT_OPEN_TABLE_POOL_HASH_SIZE];	/* Per-table pools, hashed by table ID. */
+} XTAllTablePoolsRec, *XTAllTablePoolsPtr;
+
+/*
+ * A directory path used by the tables of a database, together with a
+ * count of the tables using it.
+ */
+typedef struct XTTablePath {
+	u_int					tp_tab_count;							/* The number of tables using this path. */
+	char					tp_path[XT_VAR_LENGTH];					/* The table path (presumably variable length, allocated with the record -- confirm). */
+} XTTablePathRec, *XTTablePathPtr;
+
+#define XT_THREAD_BUSY		0
+#define XT_THREAD_IDLE		1
+#define XT_THREAD_INERR		2
+
+#define XT_XA_HASH_TAB_SIZE	223
+
+/*
+ * The in-memory state of an open database: its tables, the open table
+ * pool, the transaction state and logs, and the handles and
+ * synchronisation objects of the background threads (sweeper, writer,
+ * compactor, checkpointer, flusher).
+ *
+ * The structure derives from XTHeap and is referenced through
+ * XTDatabaseHPtr ("heap pointer").
+ */
+typedef struct XTDatabase : public XTHeap {
+	char					*db_name;								/* The name of the database, last component of the path! */
+	char					*db_main_path;
+	xtDatabaseID			db_id;
+	xtTableID				db_curr_tab_id;							/* The ID of the last table created. */
+	XTHashTabPtr			db_tables;
+	XTSortedListPtr			db_table_by_id;
+	XTSortedListPtr			db_table_paths;							/* A list of table paths used by this database. */
+	xtBool					db_multi_path;
+	XTSortedListPtr			db_error_list;							/* A list of errors already reported. */
+
+	/* The open table pool: */
+	XTAllTablePoolsRec		db_ot_pool;
+
+	/* Transaction related stuff: */
+	XTSpinLockRec			db_xn_id_lock;							/* Lock for next transaction ID. */
+	xtXactID				db_xn_curr_id;							/* The ID of the last transaction started. */
+	xtXactID				db_xn_min_ram_id;						/* The lowest ID of the transactions in memory (RAM). */
+	xtXactID				db_xn_to_clean_id;						/* The next transaction to be cleaned (>= db_xn_min_ram_id). */
+	xtXactID				db_xn_min_run_id;						/* The lowest ID of all running transactions (not up-to-date! >= db_xn_to_clean_id) */
+	xtWord4					db_xn_end_time;							/* The time of the transaction end. */
+	XTXactSegRec			db_xn_idx[XT_XN_NO_OF_SEGMENTS];		/* Index of transactions in RAM. */
+	xtWord1					*db_xn_data;							/* Start of the block allocated to contain transaction data. */
+	xtWord1					*db_xn_data_end;						/* End of the transaction data block. */
+	u_int					db_stat_sweep_waits;					/* STATISTICS: count the sweeper waits. */
+	XTDatabaseLogRec		db_xlog;								/* The transaction log for this database. */
+	XTXactRestartRec		db_restart;								/* Database recovery stuff. */
+	xt_mutex_type			db_xn_xa_lock;
+	XTXactPreparePtr		db_xn_xa_table[XT_XA_HASH_TAB_SIZE];
+	XTSortedListPtr			db_xn_xa_list;							/* NOTE(review): the original comment was a copy of the one on db_xn_wait_for; presumably a list of XA transactions -- confirm. */
+
+	XTSortedListPtr			db_xn_wait_for;							/* The "wait-for" list, of transactions waiting for other transactions. */
+	u_int					db_xn_call_start;						/* Start of the post wait calls. */
+	XTSpinLockRec			db_xn_wait_spinlock;
+	//xt_mutex_type			db_xn_wait_lock;						/* The lock associated with the wait for list. */
+	//xt_cond_type			db_xn_wait_cond;						/* This condition is signalled when a transaction quits. */
+	//u_int					db_xn_wait_on_cond;						/* Number of threads waiting on the condition. */
+	int						db_xn_wait_count;						/* Number of waiting transactions. */
+	u_int					db_xn_total_writer_count;				/* The total number of writers. */
+	int						db_xn_writer_count;						/* The number of writer threads. */
+	int						db_xn_writer_wait_count;				/* The number of writer threads waiting. */
+	int						db_xn_long_running_count;				/* The number of long running writer threads. */
+
+	/* Sweeper stuff: */
+	struct XTThread			*db_sw_thread;							/* The sweeper thread (cleans up transactions). */
+	xt_mutex_type			db_sw_lock;								/* The lock associated with the sweeper. */
+	xt_cond_type			db_sw_cond;								/* The sweeper wakeup condition. */
+	u_int					db_sw_check_count;
+	int						db_sw_idle;								/* BUSY/IDLE/INERR depending on the state of the sweeper. */
+	int						db_sw_faster;							/* non-zero if the sweeper should work faster. */
+	xtBool					db_sw_fast;								/* TRUE if the sweeper is working faster. */
+
+	/* Writer stuff: */
+	struct XTThread			*db_wr_thread;							/* The writer thread (write log data to the database). */
+	int						db_wr_idle;								/* BUSY/IDLE/INERR depending on the state of the writer. */
+	xtBool					db_wr_faster;							/* Set to TRUE if the writer should work faster. */
+	xtBool					db_wr_fast;								/* TRUE if the writer is working faster. */
+	u_int					db_wr_thread_waiting;					/* Count the number of threads waiting for the writer. */
+	xtBool					db_wr_freeer_waiting;					/* TRUE if the freeer is waiting for the writer. */
+	xt_mutex_type			db_wr_lock;
+	xt_cond_type			db_wr_cond;								/* Writer condition when idle (must be woken by log flush!) */
+	xtLogID					db_wr_log_id;							/* Current write log ID. */
+	xtLogOffset				db_wr_log_offset;						/* Current write log offset. */
+	xtLogID					db_wr_flush_point_log_id;				/* This is the point to which the writer will write (log ID). */
+	xtLogOffset				db_wr_flush_point_log_offset;			/* This is the point to which the writer will write (log offset). */
+
+	/* Data log stuff: */
+	XTDataLogCacheRec		db_datalogs;							/* The database data log stuff. */
+	XTIndexLogPoolRec		db_indlogs;								/* Index logs used for consistent write. */
+
+	/* Compactor stuff: */
+	struct XTThread			*db_co_thread;							/* The compactor thread (compacts data logs). */
+	xt_mutex_type			db_co_ext_lock;							/* Required when extended data is moved, or removed. */
+	xtBool					db_co_busy;								/* TRUE if the compactor is busy compacting a data log. */
+	xt_mutex_type			db_co_dlog_lock;						/* This is the lock required to flush the compactor's data log. */
+
+	/* Checkpointer stuff: */
+	struct XTThread			*db_cp_thread;							/* The checkpoint thread (flushes the database data). */
+	xt_mutex_type			db_cp_lock;
+	xt_cond_type			db_cp_cond;								/* NOTE(review): the original comment was a copy of the one on db_wr_cond; presumably the checkpointer's wakeup condition -- confirm. */
+	XTCheckPointStateRec	db_cp_state;							/* The checkpoint state. */
+
+	/* The "flusher" thread (used when pbxt_flush_log_at_trx_commit = 0 or 2) */
+	struct XTThread			*db_fl_thread;							/* The flusher thread (flushes the transaction log). */
+	xt_mutex_type			db_fl_lock;
+} XTDatabaseRec, *XTDatabaseHPtr;		/* Heap pointer */
+
+#define XT_FOR_USER					0
+#define XT_FOR_COMPACTOR			1
+#define XT_FOR_SWEEPER				2
+#define XT_FOR_WRITER				3
+#define XT_FOR_CHECKPOINTER			4
+
+void				xt_create_database(XTThreadPtr th, char *path);
+XTDatabaseHPtr		xt_get_database(XTThreadPtr self, char *path, xtBool multi_path);
+XTDatabaseHPtr		xt_get_database_by_id(XTThreadPtr self, xtDatabaseID db_id);
+void				xt_drop_database(XTThreadPtr self, XTDatabaseHPtr db);
+void				xt_check_database(XTThreadPtr self);
+
+void				xt_add_pbxt_file(size_t size, char *path, const char *file);
+void				xt_add_location_file(size_t size, char *path);
+void				xt_add_pbxt_dir(size_t size, char *path);
+void				xt_add_system_dir(size_t size, char *path);
+void				xt_add_data_dir(size_t size, char *path);
+
+void				xt_use_database(XTThreadPtr self, XTDatabaseHPtr db, int what_for);
+void				xt_unuse_database(XTThreadPtr self, XTThreadPtr other_thr);
+void				xt_open_database(XTThreadPtr self, char *path, xtBool multi_path);
+
+void				xt_lock_installation(XTThreadPtr self, char *installation_path);
+void				xt_unlock_installation(XTThreadPtr self, char *installation_path);
+void				xt_crash_me(void);
+
+void				xt_init_databases(XTThreadPtr self);
+void				xt_stop_database_threads(XTThreadPtr self, xtBool sync);
+void				xt_exit_databases(XTThreadPtr self);
+
+void				xt_dump_database(XTThreadPtr self, XTDatabaseHPtr db);
+
+void				xt_db_init_thread(XTThreadPtr self, XTThreadPtr new_thread);
+void				xt_db_exit_thread(XTThreadPtr self);
+
+void				xt_db_pool_init(XTThreadPtr self, struct XTDatabase *db);
+void				xt_db_pool_exit(XTThreadPtr self, struct XTDatabase *db);
+XTOpenTablePoolPtr	xt_db_lock_table_pool_by_name(XTThreadPtr self, XTDatabaseHPtr db, XTPathStrPtr name, xtBool no_load, xtBool flush_table, xtBool missing_ok, xtBool wait_for_open, XTTableHPtr *ret_tab);
+void				xt_db_wait_for_open_tables(XTThreadPtr self, XTOpenTablePoolPtr table_pool);
+void				xt_db_unlock_table_pool(struct XTThread *self, XTOpenTablePoolPtr table_pool);
+XTOpenTablePtr		xt_db_open_pool_table(XTThreadPtr self, XTDatabaseHPtr db, xtTableID tab_id, int *result, xtBool i_am_background);
+XTOpenTablePtr		xt_db_open_table_using_tab(XTTableHPtr tab, XTThreadPtr thread);
+xtBool				xt_db_open_pool_table_ns(XTOpenTablePtr *ret_ot, XTDatabaseHPtr db, xtTableID tab_id);
+void				xt_db_return_table_to_pool(XTThreadPtr self, XTOpenTablePtr ot);
+void				xt_db_return_table_to_pool_ns(XTOpenTablePtr ot);
+void				xt_db_free_unused_open_tables(XTThreadPtr self, XTDatabaseHPtr db);
+
+#define XT_LONG_RUNNING_TIME	2
+
+/*
+ * Flag the transaction of 'thread' as "long running" once it has been
+ * a writer for more than XT_LONG_RUNNING_TIME (measured against
+ * xt_db_approximate_time).
+ *
+ * The flag is set only once per transaction, at which point the
+ * long running counter of the thread's database is incremented.
+ */
+inline void xt_xlog_check_long_writer(XTThreadPtr thread)
+{
+	if (thread->st_xact_writer &&
+		xt_db_approximate_time - thread->st_xact_write_time > XT_LONG_RUNNING_TIME &&
+		!thread->st_xact_long_running) {
+		thread->st_xact_long_running = TRUE;
+		thread->st_database->db_xn_long_running_count++;
+	}
+}
+
+extern XTDatabaseHPtr	pbxt_database;				// The global open database
+
+#endif
diff --git a/storage/pbxt/src/datadic_xt.cc b/storage/pbxt/src/datadic_xt.cc
new file mode 100644
index 00000000000..6a58d23d980
--- /dev/null
+++ b/storage/pbxt/src/datadic_xt.cc
@@ -0,0 +1,3007 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-05-16	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * Implementation of the PBXT internal data dictionary.
+ */
+
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <ctype.h>
+#include <errno.h>
+
+#ifdef DEBUG
+#ifdef DRIZZLED
+//#include <drizzled/common_includes.h>
+#else
+#include "mysql_priv.h"
+#endif
+#endif
+
+#include "pthread_xt.h"
+#include "datadic_xt.h"
+#include "util_xt.h"
+#include "database_xt.h"
+#include "table_xt.h"
+#include "heap_xt.h"
+#include "strutil_xt.h"
+#include "myxt_xt.h"
+#include "hashtab_xt.h"
+
+/*
+ * -----------------------------------------------------------------------
+ * Lexical analyser
+ */
+
+/* Basic token classes assigned by the tokenizer: */
+#define XT_TK_EOF				0
+#define XT_TK_IDENTIFIER		1
+#define XT_TK_NUMBER			2
+#define XT_TK_STRING			3
+#define XT_TK_PUNCTUATION		4
+
+/*
+ * Reserved words. XT_TK_RESERVER_WORDS is the lowest reserved word value
+ * (it is equal to XT_TK_PRIMARY): XTToken::isReservedWord() tests
+ * tk_type >= XT_TK_RESERVER_WORDS, so every value from here on must be
+ * a reserved word. XTToken::identifyReservedWord() changes the type of a
+ * matching XT_TK_IDENTIFIER token to one of these values.
+ */
+#define XT_TK_RESERVER_WORDS	5
+#define XT_TK_PRIMARY			5
+#define XT_TK_UNIQUE			6
+#define XT_TK_FULLTEXT			7
+#define XT_TK_SPATIAL			8
+#define XT_TK_INDEX				9
+#define XT_TK_KEY				10
+#define XT_TK_CHECK				11
+#define XT_TK_FOREIGN			12
+#define XT_TK_COLUMN			13
+#define XT_TK_REFERENCES		14
+#define XT_TK_NOT				15
+#define XT_TK_NULL				16
+#define XT_TK_AUTO_INCREMENT	17
+#define XT_TK_COMMENT			18
+#define XT_TK_DEFAULT			19
+#define XT_TK_COLLATE			20
+
+/*
+ * A single lexical token.
+ *
+ * The token does not own its text: tk_text points at the first character
+ * of the token and tk_length gives the number of characters, so the text
+ * is NOT NUL-terminated at the end of the token.
+ */
+class XTToken {
+	public:	
+	u_int	tk_type;		/* One of the XT_TK_* values. */
+	char	*tk_text;		/* Start of the token text (NULL for EOF tokens). */
+	size_t	tk_length;		/* Length of the token text in bytes. */
+
+	/* Set the type and make the token refer to the range [start, end). */
+	void initCString(u_int type, char *start, char *end);
+
+	/* Return the upper-cased character at index i, or 0 if i is out of range. */
+	inline char charAt(u_int i) {
+		if (i >= tk_length)
+			return 0;
+		/* The <ctype.h> functions are only defined for unsigned char
+		 * values (and EOF); a plain char may be negative.
+		 */
+		return (char) toupper((unsigned char) tk_text[i]);
+	}
+
+	/* These throw XT_ERR_A_EXPECTED_NOT_B if the token is not as expected: */
+	void expectKeyWord(XTThreadPtr self, c_char *keyword);
+	void expectIdentifier(XTThreadPtr self);
+	void expectNumber(XTThreadPtr self);
+
+	/* Case-insensitive comparison of the complete token text with keyword. */
+	bool isKeyWord(c_char *keyword);
+	/* True if tk_type is any reserved word / exactly the given reserved word. */
+	bool isReservedWord();
+	bool isReservedWord(u_int word);
+	/* Change the type of an identifier token that spells a reserved word. */
+	void identifyReservedWord();
+	bool isEOF();
+	bool isIdentifier();
+	bool isNumber();
+	/* Copy the (unquoted, unescaped) value; returns actual or required length. */
+	size_t getString(char *string, size_t len);
+	/* Copy the raw token text for messages, shortened with "..." if too long. */
+	void getTokenText(char *string, size_t len);
+	/* Allocate a new token referring to the same text. */
+	XTToken *clone(XTThreadPtr self);
+};
+
+/*
+ * Make this token refer to the characters in [start, end) and give it
+ * the specified type. The text is referenced, not copied. The length is
+ * calculated on the integer values of the pointers, so a NULL/NULL pair
+ * (as used for EOF) gives a length of zero.
+ */
+void XTToken::initCString(u_int type, char *start, char *end)
+{
+	size_t	first = (size_t) start;
+	size_t	limit = (size_t) end;
+
+	tk_length = limit - first;
+	tk_text = start;
+	tk_type = type;
+}
+
+/*
+ * Return true if the complete token text is equal to keyword, ignoring
+ * case. keyword is a NUL-terminated string; the token text is delimited
+ * by tk_length. A prefix match in either direction is not a match.
+ */
+bool XTToken::isKeyWord(c_char *keyword)
+{
+	char	*str = tk_text;
+	size_t	len = tk_length;
+	
+	while (len && *keyword) {
+		/* toupper() is only defined for unsigned char values (and EOF),
+		 * but the token text may contain bytes >= 0x80.
+		 */
+		if (toupper((unsigned char) *keyword) != toupper((unsigned char) *str))
+			return false;
+		keyword++;
+		str++;
+		len--;
+	}
+	/* Both the token and the keyword must be exhausted: */
+	return !len && !*keyword;
+}
+
+/*
+ * True if this token is any reserved word, i.e. its type is not
+ * below XT_TK_RESERVER_WORDS.
+ */
+bool XTToken::isReservedWord()
+{
+	return !(tk_type < XT_TK_RESERVER_WORDS);
+}
+
+/* True if the type of this token is exactly the given XT_TK_* value. */
+bool XTToken::isReservedWord(u_int word)
+{
+	return word == tk_type;
+}
+
+/*
+ * If this token is an identifier which spells one of the reserved
+ * words (ignoring case), change its type to the corresponding XT_TK_*
+ * value. Tokens of any other type are left untouched.
+ */
+void XTToken::identifyReservedWord()
+{
+	static const struct {
+		c_char	*rw_text;
+		u_int	rw_type;
+	} reserved_words[] = {
+		{ "AUTO_INCREMENT",	XT_TK_AUTO_INCREMENT },
+		{ "CHECK",			XT_TK_CHECK },
+		{ "COLUMN",			XT_TK_COLUMN },
+		{ "COLLATE",		XT_TK_COLLATE },
+		{ "COMMENT",		XT_TK_COMMENT },
+		{ "DEFAULT",		XT_TK_DEFAULT },
+		{ "FOREIGN",		XT_TK_FOREIGN },
+		{ "FULLTEXT",		XT_TK_FULLTEXT },
+		{ "INDEX",			XT_TK_INDEX },
+		{ "KEY",			XT_TK_KEY },
+		{ "NOT",			XT_TK_NOT },
+		{ "NULL",			XT_TK_NULL },
+		{ "PRIMARY",		XT_TK_PRIMARY },
+		{ "REFERENCES",		XT_TK_REFERENCES },
+		{ "SPATIAL",		XT_TK_SPATIAL },
+		{ "UNIQUE",			XT_TK_UNIQUE }
+	};
+	const size_t word_count = sizeof(reserved_words) / sizeof(reserved_words[0]);
+
+	if (tk_type != XT_TK_IDENTIFIER)
+		return;
+
+	/* The words are all different, so at most one entry can match. */
+	for (size_t i = 0; i < word_count; i++) {
+		if (isKeyWord(reserved_words[i].rw_text)) {
+			tk_type = reserved_words[i].rw_type;
+			return;
+		}
+	}
+}
+
+/* True if this token marks the end of the input (XT_TK_EOF). */
+bool XTToken::isEOF()
+{
+	return XT_TK_EOF == tk_type;
+}
+
+/*
+ * True if the type of this token is XT_TK_IDENTIFIER. An identifier that
+ * was re-typed as a reserved word no longer counts.
+ */
+bool XTToken::isIdentifier()
+{
+	return XT_TK_IDENTIFIER == tk_type;
+}
+
+/* True if the type of this token is XT_TK_NUMBER. */
+bool XTToken::isNumber()
+{
+	return XT_TK_NUMBER == tk_type;
+}
+
+/*
+ * Copy the value of the token to dtext, which has room for dsize bytes
+ * including the terminating NUL.
+ *
+ * If the token starts with ', " or ` the enclosing quotes are removed,
+ * backslash escapes are decoded and a doubled quote is reduced to a
+ * single quote character. Any other token is copied as it is.
+ *
+ * Return actual, or required string length (excluding the NUL). If the
+ * value returned is >= dsize then the result was truncated. The result
+ * is always NUL-terminated if dsize > 0, and nothing is ever written
+ * beyond dtext[dsize-1].
+ */
+size_t XTToken::getString(char *dtext, size_t dsize)
+{
+	int		slen;
+	size_t	dlen;
+	char	*stext;
+	char	quote;
+	char	ch;
+
+	if ((slen = (int) tk_length) == 0) {
+		if (dsize > 0)
+			*dtext = 0;
+		return 0;
+	}
+	switch (*tk_text) {
+		case '\'':
+		case '"':
+		case '`':
+			quote = *tk_text;
+			stext = tk_text+1;
+			/* Do not count the opening and the closing quote: */
+			slen -= 2;
+			dlen = 0;
+			while (slen > 0) {
+				if (*stext == '\\') {
+					stext++;
+					slen--;
+					if (slen > 0) {
+						switch (*stext) {
+							case 'b':
+								ch = '\b';
+								break;
+							case 'n':
+								ch = '\n';
+								break;
+							case 'r':
+								ch = '\r';
+								break;
+							case 't':
+								ch = '\t';
+								break;
+							case 'z':
+								ch = (char) 26;
+								break;
+							default:
+								/* Includes \0, \', \" and \\: the
+								 * character stands for itself.
+								 */
+								ch = *stext;
+								break;
+						}
+					}
+					else
+						/* A backslash with nothing after it is taken
+						 * literally (previously the output byte was
+						 * left unset).
+						 */
+						ch = '\\';
+				}
+				else if (*stext == quote) {
+					/* Doubled quote, skip the first of the pair: */
+					ch = quote;
+					stext++;
+					slen--;
+				}
+				else
+					ch = *stext;
+
+				/* Every store is bounds checked, including the escape
+				 * sequences, but we continue to count so that the
+				 * caller gets the required length.
+				 */
+				if (dlen < dsize)
+					dtext[dlen] = ch;
+				dlen++;
+				stext++;
+				slen--;
+			}
+			if (dlen < dsize)
+				dtext[dlen] = 0;
+			else if (dsize > 0)
+				dtext[dsize-1] = 0;
+			break;
+		default:
+			if (dsize > 0) {
+				dlen = dsize-1;
+				if ((int) dlen > slen)
+					dlen = slen;
+				memcpy(dtext, tk_text, dlen);
+				dtext[dlen] = 0;
+			}
+			dlen = tk_length;
+			break;
+	}
+	return dlen;
+}
+
+/* Return the token as a string with ... in it if it is too long
+ *
+ * string has room for size bytes, including the terminating NUL.
+ * An empty token, or a token without text, is returned as "EOF".
+ * A token that does not fit is returned as the first and the last
+ * (size - 4) / 2 characters of the token with "..." in between. If the
+ * buffer is too small for that, only the start of the token is returned.
+ */
+void XTToken::getTokenText(char *string, size_t size)
+{
+	if (tk_length == 0 || !tk_text) {
+		xt_strcpy(size, string, "EOF");
+		return;
+	}
+
+	/* No room for the terminator, and size-- below would wrap around. */
+	if (size == 0)
+		return;
+
+	size--;
+	if (tk_length <= size) {
+		memcpy(string, tk_text, tk_length);
+		string[tk_length] = 0;
+		return;
+	}
+
+	/* No room for the "...": (size - 3) below would wrap around and
+	 * result in a huge memcpy() length.
+	 */
+	if (size < 3) {
+		memcpy(string, tk_text, size);
+		string[size] = 0;
+		return;
+	}
+	
+	size = (size - 3) / 2;
+	memcpy(string, tk_text, size);
+	memcpy(string+size, "...", 3);
+	memcpy(string+size+3, tk_text + tk_length - size, size);
+	string[size+3+size] = 0;
+}
+
+/*
+ * Allocate a new token with the same type as this token, which refers
+ * to the same text (the text itself is not duplicated). Throws
+ * XT_ENOMEM if the allocation returns NULL.
+ */
+XTToken *XTToken::clone(XTThreadPtr self)
+{
+	XTToken *copy = new XTToken();
+
+	if (!copy)
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	copy->initCString(tk_type, tk_text, tk_text + tk_length);
+	return copy;
+}
+
+/*
+ * Throw XT_ERR_A_EXPECTED_NOT_B, naming the keyword and the (possibly
+ * shortened) text of this token, unless the token is the given keyword.
+ */
+void XTToken::expectKeyWord(XTThreadPtr self, c_char *keyword)
+{
+	if (!isKeyWord(keyword)) {
+		char	found[100];
+
+		getTokenText(found, sizeof(found));
+		xt_throw_i2xterr(XT_CONTEXT, XT_ERR_A_EXPECTED_NOT_B, keyword, found);
+	}
+}
+
+/*
+ * Throw XT_ERR_A_EXPECTED_NOT_B ("Identifier" expected) unless the
+ * type of this token is XT_TK_IDENTIFIER.
+ */
+void XTToken::expectIdentifier(XTThreadPtr self)
+{
+	if (!isIdentifier()) {
+		char	found[100];
+
+		getTokenText(found, sizeof(found));
+		xt_throw_i2xterr(XT_CONTEXT, XT_ERR_A_EXPECTED_NOT_B, "Identifier", found);
+	}
+}
+
+/*
+ * Throw XT_ERR_A_EXPECTED_NOT_B ("Value" expected) unless the type of
+ * this token is XT_TK_NUMBER.
+ */
+void XTToken::expectNumber(XTThreadPtr self)
+{
+	if (!isNumber()) {
+		char	found[100];
+
+		getTokenText(found, sizeof(found));
+		xt_throw_i2xterr(XT_CONTEXT, XT_ERR_A_EXPECTED_NOT_B, "Value", found);
+	}
+}
+
+struct charset_info_st;
+
+/*
+ * Splits a NUL-terminated SQL string into XTToken objects.
+ *
+ * The tokenizer owns a single XTToken (tkn_current) which is reused for
+ * every token returned, and deleted by the destructor. The token text
+ * points into the string given to the constructor.
+ */
+class XTTokenizer {
+	MX_CONST_CHARSET_INFO	*tkn_charset;		/* Used to classify characters. */
+	char					*tkn_cstring;		/* Start of the input string. */
+	char					*tkn_curr_pos;		/* Current scan position. */
+	XTToken					*tkn_current;		/* The reusable token, or NULL. */
+	bool					tkn_in_comment;		/* Inside a version conditional comment. */
+
+	public:
+
+	XTTokenizer(bool convert, char *cstring) :
+		tkn_charset(myxt_getcharset(convert)),
+		tkn_cstring(cstring),
+		tkn_curr_pos(cstring),
+		tkn_current(NULL),
+		tkn_in_comment(false)
+	{
+	}
+
+	virtual ~XTTokenizer(void) {
+		delete tkn_current;
+	}
+
+	/* Punctuation, other than '$' and '_', is a token on its own. */
+	inline bool isSingleChar(int ch)
+	{
+		if (ch == '$' || ch == '_')
+			return false;
+		return myxt_ispunct(tkn_charset, ch) ? true : false;
+	}
+
+	/* Anything that is not NUL, single character punctuation or space. */
+	inline bool isIdentifierChar(int ch)
+	{
+		if (!ch || isSingleChar(ch))
+			return false;
+		return myxt_isspace(tkn_charset, ch) ? false : true;
+	}
+
+	/* A digit, or a sign which is directly followed by a digit. */
+	inline bool isNumberChar(int ch, int next_ch)
+	{
+		if (myxt_isdigit(tkn_charset, ch))
+			return true;
+		if (ch != '-' && ch != '+')
+			return false;
+		return myxt_isdigit(tkn_charset, next_ch) ? true : false;
+	}
+
+	XTToken *newToken(XTThreadPtr self, u_int type, char *start, char *end);
+	XTToken *nextToken(XTThreadPtr self);
+	XTToken *nextToken(XTThreadPtr self, c_char *keyword, XTToken *tk);
+};
+
+/*
+ * Set up the token of the tokenizer to refer to [start, end) with the
+ * given type, and return it. The token object is allocated on the first
+ * call and reused after that, so a token returned previously is
+ * overwritten. Identifiers are checked against the reserved words.
+ */
+XTToken *XTTokenizer::newToken(XTThreadPtr self, u_int type, char *start, char *end)
+{
+	if (tkn_current == NULL) {
+		tkn_current = new XTToken();
+		if (tkn_current == NULL)
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	}
+
+	tkn_current->initCString(type, start, end);
+	if (XT_TK_IDENTIFIER == type)
+		tkn_current->identifyReservedWord();
+	return tkn_current;
+}
+
+/*
+ * Scan and return the next token of the input string.
+ *
+ * Space and comments (# to end of line, -- to end of line, and C style
+ * comments) are skipped. The exception is the version conditional
+ * comment (a C style comment that begins with !<digits>): its content is
+ * returned as tokens, and the end of the comment is swallowed later, see
+ * tkn_in_comment.
+ *
+ * The token returned is the single token object of the tokenizer, so
+ * it is overwritten by the next call. At the end of the input a
+ * XT_TK_EOF token without text is returned.
+ */
+XTToken *XTTokenizer::nextToken(XTThreadPtr self)
+{
+	char	*token_start;
+	u_int	token_type = XT_TK_PUNCTUATION;
+	char	quote;
+	bool	must_be_num;
+
+	restart:
+
+	/* Ignore space: */
+	while (*tkn_curr_pos && myxt_isspace(tkn_charset, *tkn_curr_pos)) tkn_curr_pos++;
+
+	token_start = tkn_curr_pos;
+	switch (*tkn_curr_pos) {
+		case '\0':
+			return newToken(self, XT_TK_EOF, NULL, NULL);
+		// Comment: # ... EOL
+		case '#':
+			tkn_curr_pos++;
+			while (*tkn_curr_pos && *tkn_curr_pos != '\n' && *tkn_curr_pos != '\r') tkn_curr_pos++;
+			goto restart;
+		case '-':
+			if (tkn_curr_pos[1] == '-') {
+				// Comment: -- ... EOL
+				while (*tkn_curr_pos && *tkn_curr_pos != '\n' && *tkn_curr_pos != '\r') tkn_curr_pos++;
+				goto restart;
+			}
+			if (myxt_isdigit(tkn_charset, tkn_curr_pos[1]))
+				goto is_number;
+			tkn_curr_pos++;
+			break;
+		case '+':
+			if (myxt_isdigit(tkn_charset, tkn_curr_pos[1]))
+				goto is_number;
+			tkn_curr_pos++;
+			break;
+		case '/':
+			tkn_curr_pos++;
+			if (*tkn_curr_pos == '*') {
+				// Comment: /* ... */
+				// Look for: /*!99999 ... */  version conditional statements
+				tkn_curr_pos++;
+				if (*tkn_curr_pos == '!') {
+					tkn_curr_pos++;
+					// isdigit() is only defined for unsigned char values (and EOF):
+					if (isdigit((unsigned char) *tkn_curr_pos)) {
+						while (isdigit((unsigned char) *tkn_curr_pos))
+							tkn_curr_pos++;
+						tkn_in_comment = true;
+						goto restart;
+					}
+				}
+
+				while (*tkn_curr_pos && !(*tkn_curr_pos == '*' && *(tkn_curr_pos+1) == '/')) tkn_curr_pos++;
+				if (*tkn_curr_pos == '*' && *(tkn_curr_pos+1) == '/')
+					tkn_curr_pos += 2;
+				goto restart;
+			}
+			break;
+		case '\'':
+			token_type = XT_TK_STRING;
+			goto is_string;
+		case '"':
+		case '`':
+			token_type = XT_TK_IDENTIFIER;
+			is_string:
+			quote = *tkn_curr_pos;
+			tkn_curr_pos++;
+			while (*tkn_curr_pos) {
+				if (*tkn_curr_pos == quote) {
+					// Doubling the quote means stay in string...
+					if (*(tkn_curr_pos + 1) != quote)
+						break;
+					tkn_curr_pos++;
+				}
+				/* TODO: Unless sql_mode == 'NO_BACKSLASH_ESCAPES'!!! */
+				if (*tkn_curr_pos == '\\') {
+					/* In a '...' or "..." string the backslash escapes the
+					 * next character, whatever it is. Skipping only an
+					 * escaped quote is not enough: in '\\' the second
+					 * backslash would be taken as escaping the closing
+					 * quote, and the string would never end.
+					 * The NUL check keeps us inside the input.
+					 */
+					if ((quote == '"' || quote == '\'') && *(tkn_curr_pos+1))
+						tkn_curr_pos++;
+				}
+				tkn_curr_pos++;
+			}
+			
+			if (*tkn_curr_pos == quote)
+				tkn_curr_pos++;
+			break;
+		case '$':
+			goto is_identifier;
+		case '*':
+			if (tkn_in_comment) {
+				// End of a version conditional comment:
+				if (tkn_curr_pos[1] == '/') {
+					tkn_in_comment = false;
+					tkn_curr_pos += 2;
+					goto restart;
+				}
+			}
+			/* No break required! */
+		default:
+			if (isNumberChar(tkn_curr_pos[0], tkn_curr_pos[1]))
+				goto is_number;
+
+			if (isSingleChar(*tkn_curr_pos)) {
+				token_type = XT_TK_PUNCTUATION;
+				// The rest are singles...
+				tkn_curr_pos++;
+				break;
+			}
+			
+			is_identifier:
+			// Identifier (any string of characters that is not punctuation or a space:
+			token_type = XT_TK_IDENTIFIER;
+			while (isIdentifierChar(*tkn_curr_pos))
+				tkn_curr_pos++;
+			break;
+
+			is_number:
+			must_be_num = false;
+			token_type = XT_TK_NUMBER;
+
+			if (*tkn_curr_pos == '-' || *tkn_curr_pos == '+') {
+				must_be_num = true;
+				tkn_curr_pos++;
+			}
+
+			// Number: 9999 [ . 9999 ] [ e/E [+/-] 9999 ]
+			// However, 9999e or 9999E is an identifier!
+			while (*tkn_curr_pos && myxt_isdigit(tkn_charset, *tkn_curr_pos)) tkn_curr_pos++;
+			
+			if (*tkn_curr_pos == '.') {
+				must_be_num = true;
+				tkn_curr_pos++;
+				while (*tkn_curr_pos && myxt_isdigit(tkn_charset, *tkn_curr_pos)) tkn_curr_pos++;
+			}
+
+			if (*tkn_curr_pos == 'e' || *tkn_curr_pos == 'E') {
+				tkn_curr_pos++;
+
+				if (isNumberChar(tkn_curr_pos[0], tkn_curr_pos[1])) {
+					must_be_num = true;
+
+					if (*tkn_curr_pos == '-' || *tkn_curr_pos == '+')
+						tkn_curr_pos++;
+					while (*tkn_curr_pos && myxt_isdigit(tkn_charset, *tkn_curr_pos))
+						tkn_curr_pos++;
+				}
+				else if (!must_be_num)
+					token_type = XT_TK_IDENTIFIER;
+			}
+
+			if (must_be_num || !isIdentifierChar(*tkn_curr_pos))
+				break;
+
+			/* Crazy, but true. An identifier can start by looking like a number! */
+			goto is_identifier;
+	}
+
+	return newToken(self, token_type, token_start, tkn_curr_pos);
+}
+
+/*
+ * Check that tk is the given keyword (throws if not), and then return
+ * the token that follows. The check must be done first, because the scan
+ * reuses the token object of the tokenizer.
+ */
+XTToken *XTTokenizer::nextToken(XTThreadPtr self, c_char *keyword, XTToken *tk)
+{
+	XTToken *following;
+
+	tk->expectKeyWord(self, keyword);
+	following = nextToken(self);
+	return following;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Parser
+ */
+
+/*
+	We must parse the following syntax. Note that the constraints
+	may be embedded in a CREATE TABLE/ALTER TABLE statement.
+
+	[CONSTRAINT symbol] FOREIGN KEY [id] (index_col_name, ...)
+    REFERENCES tbl_name (index_col_name, ...)
+    [ON DELETE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+    [ON UPDATE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+*/
+
+/*
+ * Hand-written parser for the DDL statements accepted by parseTable().
+ *
+ * The parser keeps no information about the table itself. What it finds
+ * is reported by calling the virtual "hook" methods below. The default
+ * implementations of the hooks do nothing (except setDataType(), which
+ * frees the string it is given), so a subclass overrides the hooks it
+ * is interested in.
+ */
+class XTParseTable : public XTObject {
+	public:	
+	/* Throw the error err, with the text of the token tk as parameter. */
+	void raiseError(XTThreadPtr self, XTToken *tk, int err);
+
+	private:
+	XTTokenizer			*pt_tokenizer;	/* Created by parseTable(), deleted by finalize(). */
+	XTToken				*pt_current;	/* The token currently being examined. */
+	XTStringBufferRec	pt_sbuffer;		/* Zeroed by the constructor, released by finalize(). */
+
+	void syntaxError(XTThreadPtr self, XTToken *tk);	
+
+	void parseIdentifier(XTThreadPtr self, char *name);
+	int parseKeyAction(XTThreadPtr self);	
+	void parseCreateTable(XTThreadPtr self);
+	void parseAddTableItem(XTThreadPtr self);
+	void parseQualifiedName(XTThreadPtr self, char *parent_name, char *name);
+	void parseTableName(XTThreadPtr self, bool alterTable);
+	void parseExpression(XTThreadPtr self, bool allow_reserved);
+	void parseBrackets(XTThreadPtr self);
+	void parseMoveColumn(XTThreadPtr self);
+	
+	/* If old_col_name is NULL, then this column is to be added,
+	 * if old_col_name is empty (strlen() = 0) then the column
+	 * exists, and should be modified, otherwise the column
+	 * given is to be modified.
+	 */
+	void parseColumnDefinition(XTThreadPtr self, char *old_col_name);
+	void parseDataType(XTThreadPtr self);
+	void parseReferenceDefinition(XTThreadPtr self, u_int req_cols);
+	void optionalIndexName(XTThreadPtr self);
+	void optionalIndexType(XTThreadPtr self);
+	u_int columnList(XTThreadPtr self, bool index_cols);
+	void parseAlterTable(XTThreadPtr self);	
+	void parseCreateIndex(XTThreadPtr self);
+	void parseDropIndex(XTThreadPtr self);
+
+	public:	
+	XTParseTable() {
+		pt_tokenizer = NULL;
+		pt_current = NULL;
+		memset(&pt_sbuffer, 0, sizeof(XTStringBufferRec));
+	}
+
+	/* Release the tokenizer and set the size of the string buffer to zero. */
+	virtual void finalize(XTThreadPtr XT_UNUSED(self)) {
+		if (pt_tokenizer)
+			delete pt_tokenizer;
+		xt_sb_set_size(NULL, &pt_sbuffer, 0);
+	}
+
+	// Hooks to receive output from the parser:
+
+	/* name is NULL if a CREATE TABLE statement has no table name. */
+	virtual void setTableName(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), bool XT_UNUSED(alterTable)) {
+	}
+	virtual void addColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name), char *XT_UNUSED(old_col_name)) {
+	}
+	/* The default implementation frees cstring (if not NULL). */
+	virtual void setDataType(XTThreadPtr self, char *cstring) {
+		if (cstring) 
+			xt_free(self, cstring);
+	}
+	virtual void setNull(XTThreadPtr XT_UNUSED(self), bool XT_UNUSED(nullOK)) {
+	}
+	virtual void setAutoInc(XTThreadPtr XT_UNUSED(self), bool XT_UNUSED(autoInc)) {
+	}
+	
+	/* Add a constraint. If lastColumn is TRUE then add the constraint
+	 * to the last column. If not, expect addListedColumn() to be called.
+	 */
+	virtual void addConstraint(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), u_int XT_UNUSED(type), bool XT_UNUSED(lastColumn)) {
+	}
+	
+	/* Move the last column created. If col_name is NULL then move the column to the
+	 * first position, else move it to the position just after the given column.
+	 */
+	virtual void moveColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name)) {
+	}
+
+	virtual void dropColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(col_name)) {
+	}
+
+	virtual void dropConstraint(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name), u_int XT_UNUSED(type)) {
+	}
+
+	virtual void setIndexName(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(name)) {
+	}
+	virtual void addListedColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(index_col_name)) {
+	}
+	/* ref_schema is NULL if the referenced table name is not qualified. */
+	virtual void setReferencedTable(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(ref_schema), char *XT_UNUSED(ref_table)) {
+	}
+	/* Called with NULL if REFERENCES has no column list. */
+	virtual void addReferencedColumn(XTThreadPtr XT_UNUSED(self), char *XT_UNUSED(index_col_name)) {
+	}
+	virtual void setActions(XTThreadPtr XT_UNUSED(self), int XT_UNUSED(on_delete), int XT_UNUSED(on_update)) {
+	}
+
+	/* Parse the statement in sql, calling the hooks above. */
+	virtual void parseTable(XTThreadPtr self, bool convert, char *sql);	
+};
+
+/*
+ * Throw the error err. The text of the token tk (shortened if required)
+ * is the parameter of the error message.
+ */
+void XTParseTable::raiseError(XTThreadPtr self, XTToken *tk, int err)
+{
+	char	token_text[100];
+
+	tk->getTokenText(token_text, sizeof(token_text));
+	xt_throw_ixterr(XT_CONTEXT, err, token_text);
+}
+
+/* Throw XT_ERR_SYNTAX, reporting the token tk. */
+void XTParseTable::syntaxError(XTThreadPtr self, XTToken *tk)
+{
+	this->raiseError(self, tk, XT_ERR_SYNTAX);
+}
+
+/*
+ * The current token must be an identifier. If name is not NULL, the
+ * value of the identifier is returned in name, which must have room for
+ * XT_IDENTIFIER_NAME_SIZE bytes (XT_ERR_ID_TOO_LONG is thrown if the
+ * identifier does not fit). Then step to the next token.
+ */
+void XTParseTable::parseIdentifier(XTThreadPtr self, char *name)
+{
+	pt_current->expectIdentifier(self);
+	if (name != NULL) {
+		size_t needed = pt_current->getString(name, XT_IDENTIFIER_NAME_SIZE);
+
+		if (needed >= XT_IDENTIFIER_NAME_SIZE)
+			raiseError(self, pt_current, XT_ERR_ID_TOO_LONG);
+	}
+	pt_current = pt_tokenizer->nextToken(self);
+}
+
+/*
+ * Parse: RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION
+ *
+ * The action starts at the token AFTER the current token. On return the
+ * last word of the action has been scanned, but not stepped over, and
+ * pt_current has not been assigned. Returns the XT_KEY_ACTION_* value,
+ * throws a syntax error if the action is not recognized.
+ */
+int XTParseTable::parseKeyAction(XTThreadPtr self)
+{
+	XTToken *action = pt_tokenizer->nextToken(self);
+
+	if (action->isKeyWord("RESTRICT"))
+		return XT_KEY_ACTION_RESTRICT;
+
+	if (action->isKeyWord("CASCADE"))
+		return XT_KEY_ACTION_CASCADE;
+
+	if (action->isKeyWord("SET")) {
+		action = pt_tokenizer->nextToken(self);
+		if (!action->isKeyWord("DEFAULT")) {
+			action->expectKeyWord(self, "NULL");
+			return XT_KEY_ACTION_SET_NULL;
+		}
+		return XT_KEY_ACTION_SET_DEFAULT;
+	}
+
+	if (action->isKeyWord("NO")) {
+		action = pt_tokenizer->nextToken(self);
+		action->expectKeyWord(self, "ACTION");
+		return XT_KEY_ACTION_NO_ACTION;
+	}
+
+	syntaxError(self, action);
+	return 0;
+}
+
+/*
+ * Parse the statement in sql. This is the entry point of the parser.
+ *
+ * A new tokenizer is created for sql (convert is passed on to the
+ * tokenizer), and the statement is dispatched on its first keyword:
+ *   CREATE [TEMPORARY] TABLE  -> parseCreateTable()
+ *   CREATE <anything else>    -> parseCreateIndex()
+ *   ALTER                     -> parseAlterTable()
+ *   DROP                      -> parseDropIndex()
+ *   TRUNCATE, OPTIMIZE, REPAIR: only the table name is reported.
+ * Any other statement is a syntax error.
+ */
+void XTParseTable::parseTable(XTThreadPtr self, bool convert, char *sql)
+{
+	/* Discard the tokenizer of a previous call: */
+	if (pt_tokenizer)
+		delete pt_tokenizer;
+	pt_tokenizer = new XTTokenizer(convert, sql);
+	if (!pt_tokenizer)
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	pt_current = pt_tokenizer->nextToken(self);
+
+	if (pt_current->isKeyWord("CREATE")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		if (pt_current->isKeyWord("TEMPORARY") || pt_current->isKeyWord("TABLE"))
+			parseCreateTable(self);
+		else
+			parseCreateIndex(self);
+	}
+	else if (pt_current->isKeyWord("ALTER"))
+		parseAlterTable(self);
+	else if (pt_current->isKeyWord("DROP"))
+		parseDropIndex(self);
+	else if (pt_current->isKeyWord("TRUNCATE")) {
+		/* TRUNCATE [TABLE] tbl_name */
+		pt_current = pt_tokenizer->nextToken(self);
+		if (pt_current->isKeyWord("TABLE"))
+			pt_current = pt_tokenizer->nextToken(self);
+		parseTableName(self, true);
+	}
+	else if (pt_current->isKeyWord("OPTIMIZE") || pt_current->isKeyWord("REPAIR")) {
+		/* OPTIMIZE [LOCAL | NO_WRITE_TO_BINLOG] TABLE tbl_name [, tbl_name] ...
+		 *
+		 * GOTCHA: This cannot work if more than one table is specified,
+		 * because then I cannot find the source table?!
+		 */
+		pt_current = pt_tokenizer->nextToken(self);
+		/* Skip the options, up to the keyword TABLE: */
+		while (!pt_current->isEOF() && !pt_current->isKeyWord("TABLE"))
+			pt_current = pt_tokenizer->nextToken(self);
+		pt_current = pt_tokenizer->nextToken(self);
+		parseTableName(self, true);
+	}
+	else
+		syntaxError(self, pt_current);
+}
+
+/*
+ * Parse: [TEMPORARY] TABLE [IF NOT EXISTS] [tbl_name] [( item, ... )]
+ *
+ * On entry the current token is TEMPORARY or TABLE (CREATE has already
+ * been consumed by the caller). The table name is reported with
+ * setTableName() (NULL if there is no name), and each item in the
+ * brackets is parsed by parseAddTableItem(). Whatever follows the
+ * closing bracket is not examined.
+ */
+void XTParseTable::parseCreateTable(XTThreadPtr self)
+{
+	if (pt_current->isKeyWord("TEMPORARY"))
+		pt_current = pt_tokenizer->nextToken(self);
+	pt_current = pt_tokenizer->nextToken(self, "TABLE", pt_current);
+	if (pt_current->isKeyWord("IF")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		pt_current = pt_tokenizer->nextToken(self, "NOT", pt_current);
+		pt_current = pt_tokenizer->nextToken(self, "EXISTS", pt_current);
+	}
+
+	/* Table name is optional (when loading from dictionary)! */
+	if (!pt_current->isKeyWord("("))
+		parseTableName(self, false);
+	else
+		setTableName(self, NULL, false);
+
+	/* We do not support CREATE ... SELECT! */
+	if (pt_current->isKeyWord("(")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		// Avoid this:
+		// create table t3 (select group_concat(a) as a from t1 where a = 'a') union
+		// (select group_concat(b) as a from t1 where a = 'b');
+		if (pt_current->isKeyWord("SELECT"))
+			return;
+		
+		/* Allow empty table definition for temporary table. */
+		while (!pt_current->isEOF() && !pt_current->isKeyWord(")")) {
+			parseAddTableItem(self);
+			if (!pt_current->isKeyWord(","))
+				break;
+			pt_current = pt_tokenizer->nextToken(self);
+		}
+		/* Throws if the list was not ended by a closing bracket: */
+		pt_current = pt_tokenizer->nextToken(self, ")", pt_current);
+	}
+}
+
+/*
+ * Parse one item of a CREATE TABLE definition, or of an ALTER TABLE ...
+ * ADD clause:
+ *
+ *   [CONSTRAINT [symbol]] PRIMARY KEY ...
+ *   [CONSTRAINT [symbol]] {UNIQUE | FULLTEXT | SPATIAL} [INDEX | KEY] ...
+ *   [CONSTRAINT [symbol]] {INDEX | KEY} ...
+ *   [CONSTRAINT [symbol]] CHECK ...
+ *   [CONSTRAINT [symbol]] FOREIGN KEY ...
+ *   ( column_definition, ... )
+ *   [COLUMN] column_definition [FIRST | AFTER col_name]
+ *
+ * The constraint name passed to addConstraint() is the empty string if
+ * no symbol was given. Anything that remains of the item, up to the next
+ * comma or closing bracket, is skipped.
+ */
+void XTParseTable::parseAddTableItem(XTThreadPtr self)
+{
+	char name[XT_IDENTIFIER_NAME_SIZE];
+
+	*name = 0;
+	if (pt_current->isKeyWord("CONSTRAINT")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		if (pt_current->isIdentifier())
+			parseQualifiedName(self, NULL, name);
+	}
+
+	if (pt_current->isReservedWord(XT_TK_PRIMARY)) {
+		pt_current = pt_tokenizer->nextToken(self);
+		pt_current = pt_tokenizer->nextToken(self, "KEY", pt_current);
+
+		addConstraint(self, name, XT_DD_KEY_PRIMARY, false);
+		optionalIndexType(self);
+
+		/* GOTCHA: Weird?! This syntax is used in a test:
+		 * alter table t1 add primary key aaa(tt);
+		 */
+		if (!pt_current->isKeyWord("("))
+			pt_current = pt_tokenizer->nextToken(self);
+		columnList(self, true);
+	}
+	else if (pt_current->isReservedWord(XT_TK_UNIQUE) ||
+		pt_current->isReservedWord(XT_TK_FULLTEXT) ||
+		pt_current->isReservedWord(XT_TK_SPATIAL) ||
+		pt_current->isReservedWord(XT_TK_INDEX) ||
+		pt_current->isReservedWord(XT_TK_KEY)) {
+		bool is_unique = false;
+
+		/* FULLTEXT and SPATIAL are added as normal (non-unique) indexes: */
+		if (pt_current->isReservedWord(XT_TK_FULLTEXT) || pt_current->isReservedWord(XT_TK_SPATIAL))
+			pt_current = pt_tokenizer->nextToken(self);
+		else if (pt_current->isReservedWord(XT_TK_UNIQUE)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			is_unique = true;
+		}
+		if (pt_current->isReservedWord(XT_TK_INDEX) || pt_current->isReservedWord(XT_TK_KEY))
+			pt_current = pt_tokenizer->nextToken(self);
+
+		addConstraint(self, name, is_unique ? XT_DD_INDEX_UNIQUE : XT_DD_INDEX, false);
+		optionalIndexName(self);
+		optionalIndexType(self);
+		columnList(self, true);
+	}
+	else if (pt_current->isReservedWord(XT_TK_CHECK)) {
+		/* The check expression is skipped, nothing is reported: */
+		pt_current = pt_tokenizer->nextToken(self);
+		parseExpression(self, false);
+	}
+	else if (pt_current->isReservedWord(XT_TK_FOREIGN)) {
+		u_int req_cols;
+
+		pt_current = pt_tokenizer->nextToken(self);
+		pt_current = pt_tokenizer->nextToken(self, "KEY", pt_current);
+
+		addConstraint(self, name, XT_DD_KEY_FOREIGN, false);
+		optionalIndexName(self);
+		req_cols = columnList(self, false);
+		/* GOTCHA: According the MySQL manual this is optional, but without domains,
+		 * it is required!
+		 */
+		parseReferenceDefinition(self, req_cols);
+	}
+	else if (pt_current->isKeyWord("(")) {
+		/* A list of column definitions in brackets: */
+		pt_current = pt_tokenizer->nextToken(self);
+		for (;;) {
+			parseColumnDefinition(self, NULL);
+			if (!pt_current->isKeyWord(","))
+				break;
+			pt_current = pt_tokenizer->nextToken(self);
+		}
+		pt_current = pt_tokenizer->nextToken(self, ")", pt_current);
+	}
+	else {
+		if (pt_current->isReservedWord(XT_TK_COLUMN))
+			pt_current = pt_tokenizer->nextToken(self);
+		parseColumnDefinition(self, NULL);
+		parseMoveColumn(self);
+	}
+	/* GOTCHA: Support: create table t1 (a int not null, key `a` (a) key_block_size=1024)
+	 * and any other undocumented syntax?!
+	 */
+	parseExpression(self, true);
+}
+
+/*
+ * Skip tokens up to (but not including) the next ',' or ')' or the end
+ * of the input. If allow_reserved is false, a reserved word also ends
+ * the expression. Groups in brackets are skipped as a unit, so a comma
+ * or a reserved word inside brackets does not end the expression.
+ */
+void XTParseTable::parseExpression(XTThreadPtr self, bool allow_reserved)
+{
+	for (;;) {
+		if (pt_current->isEOF() || pt_current->isKeyWord(",") || pt_current->isKeyWord(")"))
+			break;
+		if (!allow_reserved && pt_current->isReservedWord())
+			break;
+
+		if (pt_current->isKeyWord("("))
+			parseBrackets(self);
+		else
+			pt_current = pt_tokenizer->nextToken(self);
+	}
+}
+
+/*
+ * Skip a group in brackets, including nested groups. The current token
+ * must be the opening bracket. On return the current token is the token
+ * after the matching closing bracket, or EOF if the brackets are not
+ * balanced.
+ */
+void XTParseTable::parseBrackets(XTThreadPtr self)
+{
+	u_int depth = 1;
+
+	pt_current = pt_tokenizer->nextToken(self, "(", pt_current);
+	while (depth != 0 && !pt_current->isEOF()) {
+		if (pt_current->isKeyWord("("))
+			depth++;
+		else if (pt_current->isKeyWord(")"))
+			depth--;
+		pt_current = pt_tokenizer->nextToken(self);
+	}
+}
+
+/*
+ * Parse: [FIRST | AFTER col_name]
+ *
+ * moveColumn() is called with NULL for FIRST, and with the column name
+ * for AFTER. If the current token is neither, nothing is done.
+ */
+void XTParseTable::parseMoveColumn(XTThreadPtr self)
+{
+	char	after_col[XT_IDENTIFIER_NAME_SIZE];
+
+	if (pt_current->isKeyWord("FIRST")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		/* If name is NULL it means move to the front. */
+		moveColumn(self, NULL);
+		return;
+	}
+
+	if (!pt_current->isKeyWord("AFTER"))
+		return;
+
+	pt_current = pt_tokenizer->nextToken(self);
+	parseQualifiedName(self, NULL, after_col);
+	moveColumn(self, after_col);
+}
+
+/*
+ * Parse: part [. part] ...
+ *
+ * The last part is returned in name, and the part before it (or the
+ * empty string, if there is only one part) in parent_name, if
+ * parent_name is not NULL. Both buffers must have room for
+ * XT_IDENTIFIER_NAME_SIZE bytes. XT_ERR_ID_TOO_LONG is thrown if a part
+ * does not fit.
+ *
+ * The parts are not required to be identifier tokens. I have this example:
+ * CREATE TABLE t1 ( comment CHAR(32) ASCII NOT NULL, koi8_ru_f CHAR(32) CHARACTER SET koi8r NOT NULL default '' ) CHARSET=latin5;
+ *
+ * COMMENT is elsewhere used as reserved word?!
+ */
+void XTParseTable::parseQualifiedName(XTThreadPtr self, char *parent_name, char *name)
+{
+	if (parent_name)
+		parent_name[0] = '\0';
+
+	for (;;) {
+		/* Accept anything as a part of the name (also after the DOT)! */
+		if (pt_current->getString(name, XT_IDENTIFIER_NAME_SIZE) >= XT_IDENTIFIER_NAME_SIZE)
+			raiseError(self, pt_current, XT_ERR_ID_TOO_LONG);
+		pt_current = pt_tokenizer->nextToken(self);
+
+		if (!pt_current->isKeyWord("."))
+			break;
+
+		/* There is another part, so what we have is the parent: */
+		if (parent_name)
+			xt_strcpy(XT_IDENTIFIER_NAME_SIZE,parent_name, name);
+		pt_current = pt_tokenizer->nextToken(self);
+	}
+}
+
+/*
+ * Parse a (possibly qualified) table name and report the last part of
+ * the name with setTableName(). The qualifier is discarded.
+ */
+void XTParseTable::parseTableName(XTThreadPtr self, bool alterTable)
+{
+	char table_name[XT_IDENTIFIER_NAME_SIZE];
+
+	parseQualifiedName(self, NULL, table_name);
+	setTableName(self, table_name, alterTable);
+}
+
+/*
+ * Parse: col_name data_type [column attribute] ...
+ *
+ * The column is reported with addColumn() (old_col_name is passed on
+ * unchanged), followed by the hooks for the attributes found. The
+ * attributes may appear in any order, and any number of times. The
+ * first token which is not the start of a known attribute ends the
+ * column definition.
+ */
+void XTParseTable::parseColumnDefinition(XTThreadPtr self, char *old_col_name)
+{
+	char col_name[XT_IDENTIFIER_NAME_SIZE];
+
+	// column_definition
+	parseQualifiedName(self, NULL, col_name);
+	addColumn(self, col_name, old_col_name);
+	parseDataType(self);
+
+	for (;;) {
+		if (pt_current->isReservedWord(XT_TK_NOT)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			pt_current = pt_tokenizer->nextToken(self, "NULL", pt_current);
+			setNull(self, false);
+		}
+		else if (pt_current->isReservedWord(XT_TK_NULL)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			setNull(self, true);
+		}
+		else if (pt_current->isReservedWord(XT_TK_DEFAULT)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			/* Possible here [ + | - ] <value> or [ <charset> ] <string> */
+			parseExpression(self, false);
+		}
+		else if (pt_current->isReservedWord(XT_TK_AUTO_INCREMENT)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			setAutoInc(self, true);
+		}
+		else if (pt_current->isReservedWord(XT_TK_UNIQUE)) {
+			/* UNIQUE [KEY] */
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isReservedWord(XT_TK_KEY))
+				pt_current = pt_tokenizer->nextToken(self);
+			addConstraint(self, NULL, XT_DD_INDEX_UNIQUE, true);
+		}
+		else if (pt_current->isReservedWord(XT_TK_KEY)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			addConstraint(self, NULL, XT_DD_INDEX, true);
+		}
+		else if (pt_current->isReservedWord(XT_TK_PRIMARY)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			pt_current = pt_tokenizer->nextToken(self, "KEY", pt_current);
+			addConstraint(self, NULL, XT_DD_KEY_PRIMARY, true);
+		}
+		else if (pt_current->isReservedWord(XT_TK_COMMENT)) {
+			/* Skip COMMENT and the token after it: */
+			pt_current = pt_tokenizer->nextToken(self);
+			pt_current = pt_tokenizer->nextToken(self);
+		}
+		else if (pt_current->isReservedWord(XT_TK_REFERENCES)) {
+			/* REFERENCES is consumed by parseReferenceDefinition(): */
+			addConstraint(self, NULL, XT_DD_KEY_FOREIGN, true);
+			parseReferenceDefinition(self, 1);
+		}
+		else if (pt_current->isReservedWord(XT_TK_CHECK)) {
+			pt_current = pt_tokenizer->nextToken(self);
+			parseExpression(self, false);
+		}
+		/* GOTCHA: Not in the documentation:
+		 * CREATE TABLE t1 (c varchar(255) NOT NULL COLLATE utf8_general_ci, INDEX (c))
+		 */
+		else if (pt_current->isReservedWord(XT_TK_COLLATE)) {
+			/* Skip COLLATE and the token after it: */
+			pt_current = pt_tokenizer->nextToken(self);
+			pt_current = pt_tokenizer->nextToken(self);
+		}
+		else
+			break;
+	}
+}
+
+/*
+ * Skip the data type of a column, and call setDataType() with NULL.
+ *
+ * The data type is not actually parsed, because MySQL allows
+ * undocumented syntax like this:
+ * create table t1 (c national character varying(10))
+ *
+ * So we just skip everything up to the first reserved word, ',' or ')'.
+ */
+void XTParseTable::parseDataType(XTThreadPtr self)
+{
+	const bool allow_reserved = false;
+
+	parseExpression(self, allow_reserved);
+	setDataType(self, NULL);
+}
+
+/*
+ * Parse: [index_name]
+ *
+ * There is no index name if the current token is USING or an opening
+ * bracket. Otherwise the current token must be an identifier, which is
+ * reported with setIndexName().
+ */
+void XTParseTable::optionalIndexName(XTThreadPtr self)
+{
+	char index_name[XT_IDENTIFIER_NAME_SIZE];
+
+	if (pt_current->isKeyWord("USING") || pt_current->isKeyWord("("))
+		return;
+
+	parseIdentifier(self, index_name);
+	setIndexName(self, index_name);
+}
+
+/*
+ * Parse: [USING {BTREE | HASH}]
+ *
+ * The index type is not checked and not reported: USING and the token
+ * that follows are just skipped.
+ */
+void XTParseTable::optionalIndexType(XTThreadPtr self)
+{
+	if (!pt_current->isKeyWord("USING"))
+		return;
+
+	/* Step over USING, then over the type: */
+	for (int skip = 0; skip < 2; skip++)
+		pt_current = pt_tokenizer->nextToken(self);
+}
+
+/*
+ * Parse: ( col_name, ... )
+ *
+ * Each column is reported with addListedColumn(). If index_cols is true
+ * then each column may be followed by a length in brackets and ASC or
+ * DESC, which are skipped. Returns the number of columns in the list.
+ * Throws if the list is not enclosed in brackets.
+ */
+u_int XTParseTable::columnList(XTThreadPtr self, bool index_cols)
+{
+	char	col_name[XT_IDENTIFIER_NAME_SIZE];
+	u_int	count = 0;
+
+	pt_current->expectKeyWord(self, "(");
+	for (;;) {
+		/* Step over the opening bracket, or the comma: */
+		pt_current = pt_tokenizer->nextToken(self);
+		parseQualifiedName(self, NULL, col_name);
+		addListedColumn(self, col_name);
+		count++;
+
+		if (index_cols) {
+			if (pt_current->isKeyWord("(")) {
+				/* ( length ) */
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self, ")", pt_current);
+			}
+			if (pt_current->isKeyWord("ASC") || pt_current->isKeyWord("DESC"))
+				pt_current = pt_tokenizer->nextToken(self);
+		}
+
+		if (!pt_current->isKeyWord(","))
+			break;
+	}
+	pt_current = pt_tokenizer->nextToken(self, ")", pt_current);
+	return count;
+}
+
+/*
+ * Parse:
+ *   REFERENCES tbl_name [(index_col_name, ...)]
+ *   [MATCH FULL | MATCH PARTIAL | MATCH SIMPLE]
+ *   [ON DELETE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+ *   [ON UPDATE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+ *
+ * The current token must be REFERENCES. req_cols is the number of
+ * referencing columns: if a column list is given, it must contain
+ * exactly this number of columns, or XT_ERR_INCORRECT_NO_OF_COLS is
+ * thrown. If no column list is given, addReferencedColumn() is called
+ * once with NULL. Both actions are XT_KEY_ACTION_RESTRICT if not
+ * specified.
+ */
+void XTParseTable::parseReferenceDefinition(XTThreadPtr self, u_int req_cols)
+{
+	int		on_delete = XT_KEY_ACTION_RESTRICT;
+	int		on_update = XT_KEY_ACTION_RESTRICT;
+	char	name[XT_IDENTIFIER_NAME_SIZE];
+	char	parent_name[XT_IDENTIFIER_NAME_SIZE];
+	u_int	cols = 0;
+
+	// REFERENCES tbl_name
+	pt_current = pt_tokenizer->nextToken(self, "REFERENCES", pt_current);
+	parseQualifiedName(self, parent_name, name);
+	/* parent_name is empty if the table name is not qualified: */
+	setReferencedTable(self, parent_name[0] ? parent_name : NULL, name);
+
+	// [ (index_col_name,...) ]
+	if (pt_current->isKeyWord("(")) {
+		pt_current->expectKeyWord(self, "(");
+		do {
+			pt_current = pt_tokenizer->nextToken(self);
+			parseQualifiedName(self, NULL, name);
+			addReferencedColumn(self, name);
+			cols++;
+			/* Too many columns: */
+			if (cols > req_cols)
+				raiseError(self, pt_current, XT_ERR_INCORRECT_NO_OF_COLS);
+		} while (pt_current->isKeyWord(","));
+		/* Too few columns: */
+		if (cols != req_cols)
+			raiseError(self, pt_current, XT_ERR_INCORRECT_NO_OF_COLS);
+		pt_current = pt_tokenizer->nextToken(self, ")", pt_current);			
+	}
+	else
+		addReferencedColumn(self, NULL);
+
+	// [MATCH FULL | MATCH PARTIAL | MATCH SIMPLE]
+	if (pt_current->isKeyWord("MATCH")) {
+		/* Skip MATCH and the token after it: */
+		pt_current = pt_tokenizer->nextToken(self);
+		pt_current = pt_tokenizer->nextToken(self);
+	}
+
+	// [ON DELETE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+	// [ON UPDATE {RESTRICT | CASCADE | SET NULL | SET DEFAULT | NO ACTION}]
+	while (pt_current->isKeyWord("ON")) {
+		pt_current = pt_tokenizer->nextToken(self);
+		if (pt_current->isKeyWord("DELETE"))
+			on_delete = parseKeyAction(self);
+		else if (pt_current->isKeyWord("UPDATE"))
+			on_update = parseKeyAction(self);
+		else
+			syntaxError(self, pt_current);
+		/* parseKeyAction() stops on the last word of the action: */
+		pt_current = pt_tokenizer->nextToken(self);
+	}
+
+	setActions(self, on_delete, on_update);
+}
+
+/*
+ * Parse an ALTER TABLE statement:
+ *
+ *   ALTER [IGNORE] TABLE tbl_name alter_specification [, alter_specification] ...
+ *
+ * The specifications ADD, ALTER, CHANGE, MODIFY, DROP and RENAME are
+ * recognised. Any other specification is skipped up to the next ",".
+ */
+void XTParseTable::parseAlterTable(XTThreadPtr self)
+{
+	char name[XT_IDENTIFIER_NAME_SIZE];
+
+	pt_current = pt_tokenizer->nextToken(self, "ALTER", pt_current);
+	if (pt_current->isKeyWord("IGNORE"))
+		pt_current = pt_tokenizer->nextToken(self);
+	pt_current = pt_tokenizer->nextToken(self, "TABLE", pt_current);
+	parseTableName(self, true);
+	/* One iteration per comma separated alter specification: */
+	for (;;) {
+		if (pt_current->isKeyWord("ADD")) {
+			pt_current = pt_tokenizer->nextToken(self);
+			parseAddTableItem(self);
+		}
+		else if (pt_current->isKeyWord("ALTER")) {
+			/* ALTER [COLUMN] col_name {SET DEFAULT value | DROP DEFAULT}
+			 * The tokens are only consumed, nothing is reported.
+			 */
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isReservedWord(XT_TK_COLUMN))
+				pt_current = pt_tokenizer->nextToken(self);
+			pt_current->expectIdentifier(self);
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isKeyWord("SET")) {
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self, "DEFAULT", pt_current);
+				/* Skip the single token following DEFAULT. */
+				pt_current = pt_tokenizer->nextToken(self);
+			}
+			else if (pt_current->isKeyWord("DROP")) {
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self, "DEFAULT", pt_current);
+			}
+		}
+		else if (pt_current->isKeyWord("CHANGE")) {
+			/* CHANGE [COLUMN] old_col_name column_definition [FIRST | AFTER col_name] */
+			char old_col_name[XT_IDENTIFIER_NAME_SIZE];
+
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isReservedWord(XT_TK_COLUMN))
+				pt_current = pt_tokenizer->nextToken(self);
+
+			parseQualifiedName(self, NULL, old_col_name);
+			parseColumnDefinition(self, old_col_name);
+			parseMoveColumn(self);
+		}
+		else if (pt_current->isKeyWord("MODIFY")) {
+			/* MODIFY [COLUMN] column_definition: no old column name is passed on. */
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isReservedWord(XT_TK_COLUMN))
+				pt_current = pt_tokenizer->nextToken(self);
+			parseColumnDefinition(self, NULL);
+			parseMoveColumn(self);
+		}
+		else if (pt_current->isKeyWord("DROP")) {
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isReservedWord(XT_TK_PRIMARY)) {
+				/* DROP PRIMARY KEY: the constraint has no name. */
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self, "KEY", pt_current);
+				dropConstraint(self, NULL, XT_DD_KEY_PRIMARY);
+			}
+			else if (pt_current->isReservedWord(XT_TK_INDEX) || pt_current->isReservedWord(XT_TK_KEY)) {
+				/* DROP {INDEX | KEY} index_name */
+				pt_current = pt_tokenizer->nextToken(self);
+				parseIdentifier(self, name);
+				dropConstraint(self, name, XT_DD_INDEX);
+			}
+			else if (pt_current->isReservedWord(XT_TK_FOREIGN)) {
+				/* DROP FOREIGN KEY fk_name */
+				pt_current = pt_tokenizer->nextToken(self);
+				pt_current = pt_tokenizer->nextToken(self, "KEY", pt_current);
+				parseIdentifier(self, name);
+				dropConstraint(self, name, XT_DD_KEY_FOREIGN);
+			}
+			else {
+				/* DROP [COLUMN] col_name */
+				if (pt_current->isReservedWord(XT_TK_COLUMN))
+					pt_current = pt_tokenizer->nextToken(self);
+				parseQualifiedName(self, NULL, name);
+				dropColumn(self, name);
+			}
+		}
+		else if (pt_current->isKeyWord("RENAME")) {
+			/* RENAME [TO] new_tbl_name: the new name is parsed but not used here. */
+			pt_current = pt_tokenizer->nextToken(self);
+			if (pt_current->isKeyWord("TO"))
+				pt_current = pt_tokenizer->nextToken(self);
+			parseQualifiedName(self, NULL, name);
+		}
+		else
+			/* Just ignore the syntax until the next , */
+			parseExpression(self, true);
+		/* Stop when no further specification follows. */
+		if (!pt_current->isKeyWord(","))
+			break;
+		pt_current = pt_tokenizer->nextToken(self);
+	}
+}
+
+/*
+ * Parse a CREATE INDEX statement, starting at the optional index class:
+ *
+ *   [UNIQUE | FULLTEXT | SPATIAL] INDEX index_name [USING index_type]
+ *     ON tbl_name (index_col_name,...)
+ *
+ * The index is reported with addConstraint() as XT_DD_INDEX_UNIQUE or
+ * XT_DD_INDEX, its name with setIndexName(), and its columns through
+ * columnList(). FULLTEXT and SPATIAL are accepted but not recorded.
+ */
+void XTParseTable::parseCreateIndex(XTThreadPtr self)
+{
+	char name[XT_IDENTIFIER_NAME_SIZE];
+	bool is_unique = false;
+
+	if (pt_current->isReservedWord(XT_TK_UNIQUE)) {
+		pt_current = pt_tokenizer->nextToken(self);
+		is_unique = true;
+	}
+	else if (pt_current->isReservedWord(XT_TK_FULLTEXT))
+		pt_current = pt_tokenizer->nextToken(self);
+	/* The SQL keyword is SPATIAL. If it is not matched here, the
+	 * INDEX keyword expected below is not found.
+	 */
+	else if (pt_current->isKeyWord("SPATIAL"))
+		pt_current = pt_tokenizer->nextToken(self);
+	pt_current = pt_tokenizer->nextToken(self, "INDEX", pt_current);
+	parseQualifiedName(self, NULL, name);
+	optionalIndexType(self);
+	pt_current = pt_tokenizer->nextToken(self, "ON", pt_current);
+	parseTableName(self, true);
+	/* The constraint is created without a name; the index name is set separately. */
+	addConstraint(self, NULL, is_unique ? XT_DD_INDEX_UNIQUE : XT_DD_INDEX, false);
+	setIndexName(self, name);
+	columnList(self, true);
+}
+
+/*
+ * Parse "DROP INDEX index_name ON tbl_name" and report the index
+ * to dropConstraint() as an XT_DD_INDEX.
+ */
+void XTParseTable::parseDropIndex(XTThreadPtr self)
+{
+	char index_name[XT_IDENTIFIER_NAME_SIZE];
+
+	/* DROP INDEX index_name */
+	pt_current = pt_tokenizer->nextToken(self, "DROP", pt_current);
+	pt_current = pt_tokenizer->nextToken(self, "INDEX", pt_current);
+	parseQualifiedName(self, NULL, index_name);
+
+	/* ON tbl_name */
+	pt_current = pt_tokenizer->nextToken(self, "ON", pt_current);
+	parseTableName(self, true);
+
+	dropConstraint(self, index_name, XT_DD_INDEX);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Create/Alter table
+ */
+
+/*
+ * Parser specialisation that collects the results of parsing a
+ * statement into an XTDDTable (ct_curr_table). The callbacks declared
+ * below are implemented further down in this file.
+ */
+class XTCreateTable : public XTParseTable {
+	public:
+	bool					ct_convert;				/* The 'convert' flag given to the constructor. */
+	MX_CONST_CHARSET_INFO	*ct_charset;			/* Result of myxt_getcharset(convert). */
+	XTPathStrPtr			ct_tab_path;			/* Table path given to the constructor (not copied). */
+	u_int					ct_contraint_no;		/* Counter for generated constraint names; reset in parseTable(), not in the constructor. */
+	XTDDTable				*ct_curr_table;			/* Table being described; released in finalize() if still set. */
+	XTDDColumn				*ct_curr_column;		/* Column most recently passed to addColumn(). */
+	XTDDConstraint			*ct_curr_constraint;	/* Foreign key most recently added, or NULL for other constraint types. */
+
+	XTCreateTable(bool convert, XTPathStrPtr tab_path) : XTParseTable() {
+		ct_convert = convert;
+		ct_charset = myxt_getcharset(convert);
+		ct_tab_path = tab_path;
+		ct_curr_table = NULL;
+		ct_curr_column = NULL;
+		ct_curr_constraint = NULL;
+	}
+
+	/* Release the current table (if any) before finalizing the base class. */
+	virtual void finalize(XTThreadPtr self) {
+		if (ct_curr_table)
+			ct_curr_table->release(self);
+		XTParseTable::finalize(self);
+	}
+
+	virtual void setTableName(XTThreadPtr self, char *name, bool alterTable);
+	virtual void addColumn(XTThreadPtr self, char *col_name, char *old_col_name);
+	virtual void addConstraint(XTThreadPtr self, char *name, u_int type, bool lastColumn);
+	virtual void dropConstraint(XTThreadPtr self, char *name, u_int type);
+	virtual void addListedColumn(XTThreadPtr self, char *index_col_name);
+	virtual void setReferencedTable(XTThreadPtr self, char *ref_schema, char *ref_table);
+	virtual void addReferencedColumn(XTThreadPtr self, char *index_col_name);
+	virtual void setActions(XTThreadPtr self, int on_delete, int on_update);
+
+	virtual void parseTable(XTThreadPtr self, bool convert, char *sql);	
+};
+
+/*
+ * Cleanup callback: release the given XTCreateTable parser.
+ * A NULL pointer is ignored.
+ */
+static void ri_free_create_table(XTThreadPtr self, XTCreateTable *ct)
+{
+	if (!ct)
+		return;
+	ct->release(self);
+}
+
+/*
+ * Parse 'sql' with an XTCreateTable parser and return the resulting
+ * data dictionary table.
+ *
+ * start_tab is handed to the parser as the table to work on. If the
+ * parser cannot be allocated, start_tab is released and XT_ENOMEM is
+ * thrown.
+ */
+XTDDTable *xt_ri_create_table(XTThreadPtr self, bool convert, XTPathStrPtr tab_path, char *sql, XTDDTable *start_tab)
+{
+	XTCreateTable	*ct;
+	XTDDTable		*dd_tab;
+
+	if (!(ct = new XTCreateTable(convert, tab_path))) {
+		if (start_tab)
+			start_tab->release(self);
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	}
+
+	/* From here on the parser owns start_tab (see XTCreateTable::finalize). */
+	ct->ct_curr_table = start_tab;
+
+	/* Register the parser for cleanup; paired with freer_() below. */
+	pushr_(ri_free_create_table, ct);
+
+	ct->parseTable(self, convert, sql);
+	
+	/* Return the table ... */
+	dd_tab = ct->ct_curr_table;
+	/* Detach the table so that releasing the parser does not release it. */
+	ct->ct_curr_table = NULL;
+
+	freer_();
+	return dd_tab;
+}
+
+/*
+ * Parse the statement with the base class parser, then discard every
+ * index and foreign key of the current table for which
+ * attachColumns() fails.
+ */
+void XTCreateTable::parseTable(XTThreadPtr self, bool convert, char *sql)
+{
+	u_int idx;
+
+	/* Restart the numbering of generated constraint names. */
+	ct_contraint_no = 0;
+	XTParseTable::parseTable(self, convert, sql);
+
+	/* Indexes: only advance when the item is kept, because a removal
+	 * moves the following item into the current position.
+	 */
+	idx = 0;
+	while (idx < ct_curr_table->dt_indexes.size()) {
+		if (ct_curr_table->dt_indexes.itemAt(idx)->attachColumns())
+			idx++;
+		else
+			ct_curr_table->dt_indexes.remove(self, idx);
+	}
+
+	/* Foreign keys: same procedure. */
+	idx = 0;
+	while (idx < ct_curr_table->dt_fkeys.size()) {
+		if (ct_curr_table->dt_fkeys.itemAt(idx)->attachColumns())
+			idx++;
+		else
+			ct_curr_table->dt_fkeys.remove(self, idx);
+	}
+}
+
+/*
+ * Called with the name of the table in the statement. A NULL name
+ * is ignored.
+ *
+ * The path of the named table is built from the directory of
+ * ct_tab_path plus the (optionally converted) name. For ALTER TABLE
+ * the foreign keys of that existing table are cloned into the
+ * current table, replacing any it already has.
+ */
+void XTCreateTable::setTableName(XTThreadPtr self, char *name, bool alterTable)
+{
+	char path[PATH_MAX];
+
+	if (!name)
+		return;
+
+	/* Start with the directory part of the path given to the constructor. */
+	xt_strcpy(PATH_MAX, path, ct_tab_path->ps_path);
+	xt_remove_last_name_of_path(path);
+
+	if (ct_convert) {
+		char	buffer[XT_IDENTIFIER_NAME_SIZE];
+		size_t	len;
+
+		/* Convert the identifier, then write the table file name after the directory. */
+		myxt_static_convert_identifier(self, ct_charset, name, buffer, XT_IDENTIFIER_NAME_SIZE);
+		len = strlen(path);
+		myxt_static_convert_table_name(self, buffer, &path[len], PATH_MAX - len);
+	}
+	else
+		xt_strcat(PATH_MAX, path, name);
+
+	if (alterTable) {
+		XTTableHPtr	tab;
+
+		/* Find the table... */
+		pushsr_(tab, xt_heap_release, xt_use_table(self, (XTPathStrPtr) path, FALSE, TRUE));
+
+		/* Clone the foreign key definitions: */
+		if (tab && tab->tab_dic.dic_table) {
+			ct_curr_table->dt_fkeys.deleteAll(self);
+			ct_curr_table->dt_fkeys.clone(self, &tab->tab_dic.dic_table->dt_fkeys);	
+			/* The clones must refer to the new table, not the one they were copied from. */
+			for (u_int i=0; i<ct_curr_table->dt_fkeys.size(); i++)
+				ct_curr_table->dt_fkeys.itemAt(i)->co_table = ct_curr_table;
+		}
+
+		freer_(); // xt_heap_release(tab)
+	}
+}
+
+/*
+ * old_name is given if the column name was changed.
+ * NOTE that we built the table description from the current MySQL table
+ * description. This means that all changes to columns and
+ * indexes have already been applied.
+ *
+ * Our job is now to add the foreign key changes.
+ * This means we have to note the current column here. It is
+ * possible to add a FOREIGN KEY constraint directly to a column!
+ */
+void XTCreateTable::addColumn(XTThreadPtr self, char *new_name, char *old_name)
+{
+	char converted_new[XT_IDENTIFIER_NAME_SIZE];
+	char converted_old[XT_IDENTIFIER_NAME_SIZE];
+
+	myxt_static_convert_identifier(self, ct_charset, new_name, converted_new, XT_IDENTIFIER_NAME_SIZE);
+	/* Remember the column so that a constraint that follows can refer to it. */
+	ct_curr_column = ct_curr_table->findColumn(converted_new);
+
+	if (!old_name)
+		return;
+
+	/* The column was renamed: pass the rename on to the table. */
+	myxt_static_convert_identifier(self, ct_charset, old_name, converted_old, XT_IDENTIFIER_NAME_SIZE);
+	ct_curr_table->alterColumnName(self, converted_old, converted_new);
+}
+
+/*
+ * Called when the parser finds a constraint or index definition.
+ *
+ * Only XT_DD_KEY_FOREIGN is recorded: a new XTDDForeignKey is appended
+ * to the current table and becomes ct_curr_constraint. For all other
+ * types ct_curr_constraint is set to NULL.
+ *
+ * If no name is given, a name of the form FOREIGN_<n> is generated.
+ * If lastColumn is set and a current column is known, that column
+ * becomes the single column of the constraint.
+ */
+void XTCreateTable::addConstraint(XTThreadPtr self, char *name, u_int type, bool lastColumn)
+{
+	/* We are only interested in foreign keys! */
+	if (type == XT_DD_KEY_FOREIGN) {
+		char buffer[50];
+
+		if (!(ct_curr_constraint = new XTDDForeignKey()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		/* The key is appended to the table's list before it is filled in. */
+		ct_curr_table->dt_fkeys.append(self, (XTDDForeignKey *) ct_curr_constraint);
+		ct_curr_constraint->co_table = ct_curr_table;
+
+		if (name && *name)
+			ct_curr_constraint->co_name = myxt_convert_identifier(self, ct_charset, name);
+		else {
+			// Generate a default constraint name:
+			ct_contraint_no++;
+			sprintf(buffer, "FOREIGN_%d", ct_contraint_no);
+			ct_curr_constraint->co_name = xt_dup_string(self, buffer);
+		}
+
+		if (lastColumn && ct_curr_column) {
+			/* This constraint has one column, the current column. */
+			XTDDColumnRef	*cref;
+			char			*col_name = xt_dup_string(self, ct_curr_column->dc_name);
+
+			if (!(cref = new XTDDColumnRef())) {
+				/* Do not leak the duplicated name if the reference cannot be created. */
+				xt_free(self, col_name);
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			}
+			cref->cr_col_name = col_name;
+			ct_curr_constraint->co_cols.append(self, cref);
+		}
+	}
+	else
+		/* Other constraints/indexes do not interest us: */
+		ct_curr_constraint = NULL;
+}
+
+/*
+ * Called when the parser finds a DROP of a constraint or index.
+ *
+ * Only named foreign keys (XT_DD_KEY_FOREIGN) are handled: every
+ * foreign key of the current table whose name matches 'name'
+ * (case-insensitive, after identifier conversion) is removed from the
+ * table and released. All other types are ignored.
+ */
+void XTCreateTable::dropConstraint(XTThreadPtr self, char *name, u_int type)
+{
+	if (type == XT_DD_KEY_FOREIGN && name) {
+		u_int			i = 0;
+		XTDDForeignKey	*fkey;
+		char			con_name[XT_IDENTIFIER_NAME_SIZE];
+
+		myxt_static_convert_identifier(self, ct_charset, name, con_name, XT_IDENTIFIER_NAME_SIZE);
+		while (i < ct_curr_table->dt_fkeys.size()) {
+			fkey = ct_curr_table->dt_fkeys.itemAt(i);
+			if (fkey->co_name && myxt_strcasecmp(con_name, fkey->co_name) == 0) {
+				/* Removing the item moves the following item into
+				 * position i, so the index must not be advanced here.
+				 * Otherwise the item after a removed key is skipped.
+				 */
+				ct_curr_table->dt_fkeys.remove(fkey);
+				fkey->release(self);
+			}
+			else
+				i++;
+		}
+	}
+}
+
+/*
+ * Called for each column in a parsed column list. The column is
+ * appended to the current constraint, but only if that constraint
+ * is a foreign key.
+ */
+void XTCreateTable::addListedColumn(XTThreadPtr self, char *index_col_name)
+{
+	XTDDColumnRef	*col_ref;
+	char			*converted;
+
+	if (!ct_curr_constraint || ct_curr_constraint->co_type != XT_DD_KEY_FOREIGN)
+		return;
+
+	converted = myxt_convert_identifier(self, ct_charset, index_col_name);
+	if (!(col_ref = new XTDDColumnRef())) {
+		/* Free the converted name before reporting the failure. */
+		xt_free(self, converted);
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	}
+	col_ref->cr_col_name = converted;
+	ct_curr_constraint->co_cols.append(self, col_ref);
+}
+
+/*
+ * Store the path of the table referenced by the current foreign key.
+ *
+ * With a schema the path is "." / ref_schema / ref_table. Without one
+ * the referenced table is placed in the directory of ct_tab_path,
+ * and its name is converted first if ct_convert is set.
+ */
+void XTCreateTable::setReferencedTable(XTThreadPtr self, char *ref_schema, char *ref_table)
+{
+	XTDDForeignKey	*fk = (XTDDForeignKey *) ct_curr_constraint;
+	char			ref_path[PATH_MAX];
+
+	if (!ref_schema) {
+		/* Same directory as the table being parsed: */
+		xt_strcpy(PATH_MAX, ref_path, ct_tab_path->ps_path);
+		xt_remove_last_name_of_path(ref_path);
+		if (!ct_convert)
+			xt_strcat(PATH_MAX, ref_path, ref_table);
+		else {
+			char	ident[XT_IDENTIFIER_NAME_SIZE];
+			size_t	dir_len;
+
+			myxt_static_convert_identifier(self, ct_charset, ref_table, ident, XT_IDENTIFIER_NAME_SIZE);
+			dir_len = strlen(ref_path);
+			myxt_static_convert_table_name(self, ident, &ref_path[dir_len], PATH_MAX - dir_len);
+		}
+	}
+	else {
+		/* Explicit schema: build the path relative to the current directory. */
+		xt_strcpy(PATH_MAX,ref_path, ".");
+		xt_add_dir_char(PATH_MAX, ref_path);
+		xt_strcat(PATH_MAX, ref_path, ref_schema);
+		xt_add_dir_char(PATH_MAX, ref_path);
+		xt_strcat(PATH_MAX, ref_path, ref_table);
+	}
+
+	fk->fk_ref_tab_name = (XTPathStrPtr) xt_dup_string(self, ref_path);
+}
+
+/*
+ * Add a column to the referenced column list of the current
+ * foreign key.
+ *
+ * A NULL column name means: duplicate the local column list
+ * of the foreign key!
+ */
+void XTCreateTable::addReferencedColumn(XTThreadPtr self, char *index_col_name)
+{
+	XTDDForeignKey	*fk = (XTDDForeignKey *) ct_curr_constraint;
+	XTDDColumnRef	*col_ref;
+	char			*converted;
+
+	if (!index_col_name) {
+		fk->fk_ref_cols.clone(self, &fk->co_cols);
+		return;
+	}
+
+	converted = myxt_convert_identifier(self, ct_charset, index_col_name);
+	if (!(col_ref = new XTDDColumnRef())) {
+		/* Free the converted name before reporting the failure. */
+		xt_free(self, converted);
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	}
+	col_ref->cr_col_name = converted;
+	fk->fk_ref_cols.append(self, col_ref);
+}
+
+/*
+ * Store the ON DELETE and ON UPDATE actions in the current
+ * constraint, which is treated as a foreign key.
+ */
+void XTCreateTable::setActions(XTThreadPtr XT_UNUSED(self), int on_delete, int on_update)
+{
+	XTDDForeignKey	*foreign_key = (XTDDForeignKey *) ct_curr_constraint;
+
+	foreign_key->fk_on_delete = on_delete;
+	foreign_key->fk_on_update = on_update;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Dictionary methods
+ */
+
+/*
+ * Initialize this column as a copy of 'obj', which must be an
+ * XTDDColumn. The name and data type strings are duplicated.
+ */
+void XTDDColumn::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDColumn *source = (XTDDColumn *) obj;
+
+	XTObject::init(self, obj);
+
+	/* Strings are only copied when the source has them. */
+	if (source->dc_name)
+		dc_name = xt_dup_string(self, source->dc_name);
+	if (source->dc_data_type)
+		dc_data_type = xt_dup_string(self, source->dc_data_type);
+
+	dc_null_ok = source->dc_null_ok;
+	dc_auto_inc = source->dc_auto_inc;
+}
+
+/*
+ * Free the strings owned by this column.
+ */
+void XTDDColumn::finalize(XTThreadPtr self)
+{
+	if (dc_name) {
+		xt_free(self, dc_name);
+	}
+	if (dc_data_type) {
+		xt_free(self, dc_data_type);
+	}
+}
+
+/*
+ * Append the definition of this column to 'sb':
+ *
+ *   `name` [data_type {NULL | NOT NULL} [AUTO_INCREMENT]]
+ *
+ * The part after the name is only written if the data type is known.
+ */
+void XTDDColumn::loadString(XTThreadPtr self, XTStringBufferPtr sb)
+{
+	xt_sb_concat(self, sb, "`");
+	xt_sb_concat(self, sb, dc_name);
+	xt_sb_concat(self, sb, "` ");
+
+	if (!dc_data_type)
+		return;
+
+	xt_sb_concat(self, sb, dc_data_type);
+	if (!dc_null_ok)
+		xt_sb_concat(self, sb, " NOT NULL");
+	else
+		xt_sb_concat(self, sb, " NULL");
+	if (dc_auto_inc)
+		xt_sb_concat(self, sb, " AUTO_INCREMENT");
+}
+
+/*
+ * Initialize this column reference as a copy of 'obj', which must
+ * be an XTDDColumnRef. The column name is duplicated.
+ */
+void  XTDDColumnRef::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDColumnRef *source = (XTDDColumnRef *) obj;
+
+	XTObject::init(self, obj);
+	cr_col_name = xt_dup_string(self, source->cr_col_name);
+}
+
+/*
+ * Finalize the base object, then free the column name (if any) and
+ * clear the pointer.
+ */
+void XTDDColumnRef::finalize(XTThreadPtr self)
+{
+	XTObject::finalize(self);
+
+	if (!cr_col_name)
+		return;
+
+	xt_free(self, cr_col_name);
+	cr_col_name = NULL;
+}
+
+/*
+ * Initialize this constraint as a copy of 'obj', which must be an
+ * XTDDConstraint. The names are duplicated and the column list is
+ * cloned.
+ */
+void  XTDDConstraint::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDConstraint *source = (XTDDConstraint *) obj;
+
+	XTObject::init(self, obj);
+	co_type = source->co_type;
+
+	/* Names are only copied when the source has them. */
+	if (source->co_name)
+		co_name = xt_dup_string(self, source->co_name);
+	if (source->co_ind_name)
+		co_ind_name = xt_dup_string(self, source->co_ind_name);
+
+	co_cols.clone(self, &source->co_cols);
+}
+
+/*
+ * Append the definition of this constraint to 'sb':
+ *
+ *   [CONSTRAINT `name`] {INDEX | UNIQUE INDEX | PRIMARY KEY | FOREIGN KEY}
+ *     [`index_name`] (`col`, `col`, ...)
+ *
+ * The first column is written unconditionally, so the column list
+ * is expected to contain at least one column.
+ */
+void XTDDConstraint::loadString(XTThreadPtr self, XTStringBufferPtr sb)
+{
+	u_int col_no;
+
+	if (co_name) {
+		xt_sb_concat(self, sb, "CONSTRAINT `");
+		xt_sb_concat(self, sb, co_name);
+		xt_sb_concat(self, sb, "` ");
+	}
+
+	/* The keyword for the constraint type; unknown types write nothing. */
+	switch (co_type) {
+		case XT_DD_INDEX:
+			xt_sb_concat(self, sb, "INDEX ");
+			break;
+		case XT_DD_INDEX_UNIQUE:
+			xt_sb_concat(self, sb, "UNIQUE INDEX ");
+			break;
+		case XT_DD_KEY_PRIMARY:
+			xt_sb_concat(self, sb, "PRIMARY KEY ");
+			break;
+		case XT_DD_KEY_FOREIGN:
+			xt_sb_concat(self, sb, "FOREIGN KEY ");
+			break;
+	}
+
+	if (co_ind_name) {
+		xt_sb_concat(self, sb, "`");
+		xt_sb_concat(self, sb, co_ind_name);
+		xt_sb_concat(self, sb, "` ");
+	}
+
+	/* The column list; the separator is written before each further column. */
+	xt_sb_concat(self, sb, "(`");
+	xt_sb_concat(self, sb, co_cols.itemAt(0)->cr_col_name);
+	col_no = 1;
+	while (col_no < co_cols.size()) {
+		xt_sb_concat(self, sb, "`, `");
+		xt_sb_concat(self, sb, co_cols.itemAt(col_no)->cr_col_name);
+		col_no++;
+	}
+	xt_sb_concat(self, sb, "`)");
+}
+
+/*
+ * Rename the first column of this constraint whose name matches
+ * from_name (case-insensitive) to to_name.
+ */
+void XTDDConstraint::alterColumnName(XTThreadPtr self, char *from_name, char *to_name)
+{
+	for (u_int i=0; i<co_cols.size(); i++) {
+		XTDDColumnRef *col_ref = co_cols.itemAt(i);
+
+		if (myxt_strcasecmp(col_ref->cr_col_name, from_name) != 0)
+			continue;
+
+		/* Duplicate the new name first, then free and replace the old one. */
+		char *new_name = xt_dup_string(self, to_name);
+
+		xt_free(self, col_ref->cr_col_name);
+		col_ref->cr_col_name = new_name;
+		break;
+	}
+}
+
+/*
+ * Write the column list of this constraint to 'buffer', using 'size'
+ * as the bound passed to the string helpers:
+ *
+ *   [`table_path` ](`col`, `col`, ...)
+ *
+ * The table part is only written if the constraint's table is
+ * attached to a database table (dt_table is set).
+ */
+void XTDDConstraint::getColumnList(char *buffer, size_t size)
+{
+	u_int col_no = 1;
+
+	if (!co_table->dt_table)
+		xt_strcpy(size, buffer, "(`");
+	else {
+		xt_strcpy(size, buffer, "`");
+		xt_strcat(size, buffer, co_table->dt_table->tab_name->ps_path);
+		xt_strcat(size, buffer, "` (`");
+	}
+
+	/* The first column, then each further column after a separator. */
+	xt_strcat(size, buffer, co_cols.itemAt(0)->cr_col_name);
+	while (col_no < co_cols.size()) {
+		xt_strcat(size, buffer, "`, `");
+		xt_strcat(size, buffer, co_cols.itemAt(col_no)->cr_col_name);
+		col_no++;
+	}
+	xt_strcat(size, buffer, "`)");
+}
+
+/*
+ * Check whether this constraint and 'co' have the same number of
+ * columns with the same names (case-insensitive) in the same order.
+ */
+bool XTDDConstraint::sameColumns(XTDDConstraint *co)
+{
+	if (co_cols.size() != co->co_cols.size())
+		return false;
+
+	for (u_int col_no = 0; col_no < co_cols.size(); col_no++) {
+		if (myxt_strcasecmp(co_cols.itemAt(col_no)->cr_col_name, co->co_cols.itemAt(col_no)->cr_col_name) != 0)
+			return false;
+	}
+	return OK;
+}
+
+/*
+ * Check whether the columns of this constraint are a prefix of the
+ * columns of 'co': 'co' has at least as many columns, and the
+ * leading ones have the same names (case-insensitive) in order.
+ */
+bool XTDDConstraint::samePrefixColumns(XTDDConstraint *co)
+{
+	if (co_cols.size() > co->co_cols.size())
+		return false;
+
+	for (u_int col_no = 0; col_no < co_cols.size(); col_no++) {
+		if (myxt_strcasecmp(co_cols.itemAt(col_no)->cr_col_name, co->co_cols.itemAt(col_no)->cr_col_name) != 0)
+			return false;
+	}
+	return OK;
+}
+
+/*
+ * Look up every column of this constraint in its table.
+ *
+ * Returns false as soon as a column is not found, true if all
+ * columns exist. The columns of a primary key are marked NOT NULL
+ * as they are found.
+ */
+bool XTDDConstraint::attachColumns()
+{
+	for (u_int col_no = 0; col_no < co_cols.size(); col_no++) {
+		XTDDColumn *column = co_table->findColumn(co_cols.itemAt(col_no)->cr_col_name);
+
+		if (!column)
+			return false;
+		/* If this is a primary key, then the column becomes not-null! */
+		if (co_type == XT_DD_KEY_PRIMARY)
+			column->dc_null_ok = false;
+	}
+	return true;
+}
+
+/*
+ * Detach this table reference from its foreign key (if any): the
+ * foreign key's reference is removed and the database table of the
+ * foreign key is released. Then the base object is finalized.
+ */
+void XTDDTableRef::finalize(XTThreadPtr self)
+{
+	XTDDForeignKey	*foreign_key = tr_fkey;
+
+	if (foreign_key) {
+		/* Clear the pointer before the reference is removed. */
+		tr_fkey = NULL;
+		foreign_key->removeReference(self);
+		/* We referenced the database table, not the foreign key */
+		xt_heap_release(self, foreign_key->co_table->dt_table);
+	}
+	XTObject::finalize(self);
+}
+
+/*
+ * Check whether the row in before_buf is referenced by a row in the
+ * child (referencing) table of tr_fkey.
+ *
+ * Returns true if no matching child row is found, or if the key built
+ * from the row leaves 'no_null' cleared. Returns false if a matching
+ * child row exists (XT_ERR_ROW_IS_REFERENCED is registered) or if
+ * any step fails.
+ */
+bool XTDDTableRef::checkReference(xtWord1 *before_buf, XTThreadPtr thread)
+{
+	XTIndexPtr			loc_ind, ind;
+	xtBool				no_null = TRUE;
+	XTOpenTablePtr		ot;
+	XTIdxSearchKeyRec	search_key;
+	xtXactID			xn_id;
+	XTXactWaitRec		xw;
+	bool				ok = false;
+
+	/* loc_ind: index on the referenced table, ind: index on the child table. */
+	if (!(loc_ind = tr_fkey->getReferenceIndexPtr()))
+		return false;
+
+	if (!(ind = tr_fkey->getIndexPtr()))
+		return false;
+
+	/* Build the search key for the child index from the given row. */
+	search_key.sk_key_value.sv_flags = 0;
+	search_key.sk_key_value.sv_rec_id = 0;
+	search_key.sk_key_value.sv_row_id = 0;
+	search_key.sk_key_value.sv_key = search_key.sk_key_buf;
+	search_key.sk_key_value.sv_length = myxt_create_foreign_key_from_row(loc_ind, search_key.sk_key_buf, before_buf, ind, &no_null);
+	search_key.sk_on_key = FALSE;
+
+	/* Presumably the key contains a NULL value: nothing can reference it. */
+	if (!no_null)
+		return true;
+
+	/* Search for the key in the child (referencing) table: */
+	if (!(ot = xt_db_open_table_using_tab(tr_fkey->co_table->dt_table, thread)))
+		return false;
+
+	retry:
+	if (!xt_idx_search(ot, ind, &search_key))
+		goto done;
+		
+	while (ot->ot_curr_rec_id && search_key.sk_on_key) {
+		switch (xt_tab_maybe_committed(ot, ot->ot_curr_rec_id, &xn_id, &ot->ot_curr_row_id, &ot->ot_curr_updated)) {
+			case XT_MAYBE:				
+				/* Wait for the transaction xn_id, then restart the search. */
+				xw.xw_xn_id = xn_id;
+				if (!xt_xn_wait_for_xact(thread, &xw, NULL))
+					goto done;
+				goto retry;
+			case XT_ERR:
+				goto done;
+			case TRUE:
+				/* We found a matching child: */
+				xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_ROW_IS_REFERENCED, tr_fkey->co_name);
+				goto done;
+			case FALSE:
+				/* Not a match that counts: try the next index entry. */
+				if (!xt_idx_next(ot, ind, &search_key))
+					goto done;
+				break;
+		}
+	}
+
+	/* No matching children, all OK: */
+	ok = true;
+
+	done:
+	/* Common exit: release the index handle and return the open table. */
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+	xt_db_return_table_to_pool_ns(ot);
+	return ok;
+}
+
+/*
+ * A row has been deleted or updated (after_buf non-NULL), check if it is referenced by the foreign key table.
+ * If it is referenced, then we need to follow the specified action.
+ *
+ * before_buf is the row before the change, after_buf the row after an
+ * update (NULL for a delete). The action applied to each matching
+ * child row is fk_on_update for an update and fk_on_delete for a
+ * delete.
+ *
+ * ref_ot is only used if XT_IMPLEMENT_NO_ACTION is defined.
+ *
+ * Returns true on success, false if an error occurred or the action
+ * does not permit the change.
+ */
+bool XTDDTableRef::modifyRow(XTOpenTablePtr XT_UNUSED(ref_ot), xtWord1 *before_buf, xtWord1 *after_buf, XTThreadPtr thread)
+{
+	XTIndexPtr			loc_ind, ind;
+	xtBool				no_null = TRUE;
+	XTOpenTablePtr		ot;
+	XTIdxSearchKeyRec	search_key;
+	xtXactID			xn_id;
+	int					action = after_buf ? tr_fkey->fk_on_update : tr_fkey->fk_on_delete;
+	u_int				after_key_len = 0;
+	xtWord1				*after_key = NULL;
+	XTInfoBufferRec		after_info;
+	XTXactWaitRec		xw;
+
+	/* Initialized up front because after_info is passed to xt_ib_free() on every exit below. */
+	after_info.ib_free = FALSE;
+
+	/* loc_ind: index on the referenced table, ind: index on the child table. */
+	if (!(loc_ind = tr_fkey->getReferenceIndexPtr()))
+		return false;
+
+	if (!(ind = tr_fkey->getIndexPtr()))
+		return false;
+
+	/* Build the search key for the child index from the old row. */
+	search_key.sk_key_value.sv_flags = 0;
+	search_key.sk_key_value.sv_rec_id = 0;
+	search_key.sk_key_value.sv_row_id = 0;
+	search_key.sk_key_value.sv_key = search_key.sk_key_buf;
+	search_key.sk_key_value.sv_length = myxt_create_foreign_key_from_row(loc_ind, search_key.sk_key_buf, before_buf, ind, &no_null);
+	search_key.sk_on_key = FALSE;
+
+	/* Presumably the key contains a NULL value: nothing can reference it. */
+	if (!no_null)
+		return true;
+
+	if (after_buf) {
+		/* Build the key of the new row, it is needed for a cascaded update. */
+		if (!(after_key = (xtWord1 *) xt_malloc_ns(XT_INDEX_MAX_KEY_SIZE)))
+			return false;
+		after_key_len = myxt_create_foreign_key_from_row(loc_ind, after_key, after_buf, ind, NULL);
+		
+		/* Check whether the key value has changed, if not, we have nothing
+		 * to do here!
+		 */
+		if (myxt_compare_key(ind, 0, search_key.sk_key_value.sv_length,
+			search_key.sk_key_value.sv_key, after_key) == 0)
+			goto success;
+
+	}
+
+	/* Search for the key in the child (referencing) table: */
+	if (!(ot = xt_db_open_table_using_tab(tr_fkey->co_table->dt_table, thread)))
+		goto failed;
+
+	retry:
+	if (!xt_idx_search(ot, ind, &search_key))
+		goto failed_2;
+		
+	while (ot->ot_curr_rec_id && search_key.sk_on_key) {
+		switch (xt_tab_maybe_committed(ot, ot->ot_curr_rec_id, &xn_id, &ot->ot_curr_row_id, &ot->ot_curr_updated)) {
+			case XT_MAYBE:
+				/* Wait for the transaction xn_id, then restart the search. */
+				xw.xw_xn_id = xn_id;
+				if (!xt_xn_wait_for_xact(thread, &xw, NULL))
+					goto failed_2;
+				goto retry;
+			case XT_ERR:
+				goto failed_2;
+			case TRUE:
+				/* We found a matching child: */
+				switch (action) {
+					case XT_KEY_ACTION_CASCADE:
+						if (after_buf) {
+							/* Do a cascaded update: */
+							if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &after_info))
+								goto failed_2;
+
+							/* Write the new key value into the loaded child row. */
+							if (!myxt_create_row_from_key(ot, ind, after_key, after_key_len, after_info.ib_db.db_data))
+								goto failed_2;
+
+							if (!xt_tab_update_record(ot, NULL, after_info.ib_db.db_data)) {
+								// Change to duplicate foreign key
+								if (ot->ot_thread->t_exception.e_xt_err == XT_ERR_DUPLICATE_KEY)
+									xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_DUPLICATE_FKEY, tr_fkey->co_name);
+								goto failed_2;
+							}
+						}
+						else {
+							/* Do a cascaded delete: */
+							if (!xt_tab_delete_record(ot, NULL))
+								goto failed_2;
+						}
+						break;
+					case XT_KEY_ACTION_SET_NULL:
+						if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &after_info))
+							goto failed_2;
+
+						myxt_set_null_row_from_key(ot, ind, after_info.ib_db.db_data);
+
+						if (!xt_tab_update_record(ot, NULL, after_info.ib_db.db_data))
+							goto failed_2;
+						break;
+					case XT_KEY_ACTION_SET_DEFAULT:
+
+						if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &after_info))
+							goto failed_2;
+
+						myxt_set_default_row_from_key(ot, ind, after_info.ib_db.db_data);
+
+						if (!xt_tab_update_record(ot, NULL, after_info.ib_db.db_data))
+							goto failed_2;
+
+						break;
+					case XT_KEY_ACTION_NO_ACTION:
+						/* Without XT_IMPLEMENT_NO_ACTION this case falls into
+						 * 'default' and is handled as an error.
+						 */
+#ifdef XT_IMPLEMENT_NO_ACTION
+						XTRestrictItemRec	r;
+						
+						r.ri_tab_id = ref_ot->ot_table->tab_id;
+						r.ri_rec_id = ref_ot->ot_curr_rec_id;
+						if (!xt_bl_append(NULL, &thread->st_restrict_list, (void *) &r))
+							goto failed_2;
+						break;
+#endif
+					default:
+						xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_ROW_IS_REFERENCED, tr_fkey->co_name);
+						goto failed_2;
+				}
+				/* Fall through to next: */
+			case FALSE:
+				if (!xt_idx_next(ot, ind, &search_key))
+					goto failed_2;
+				break;
+		}
+	}
+
+	/* No matching children, all OK: */
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+	xt_db_return_table_to_pool_ns(ot);
+
+	/* Also reached directly when the key did not change; 'ot' is not used from here on. */
+	success:
+	xt_ib_free(NULL, &after_info);
+	if (after_key)
+		xt_free_ns(after_key);
+	return true;
+
+	/* Failure after the child table was opened: give the table back first. */
+	failed_2:
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+	xt_db_return_table_to_pool_ns(ot);
+
+	failed:
+	xt_ib_free(NULL, &after_info);
+	if (after_key)
+		xt_free_ns(after_key);
+	return false;
+}
+
+/*
+ * Called when all rows of the referenced table are to be deleted.
+ *
+ * This function does not delete anything itself: it checks that the
+ * referencing table of tr_fkey contains no rows, and throws
+ * XT_ERR_ROW_IS_REFERENCED if it does. Any failure on the way is
+ * thrown as well.
+ */
+void XTDDTableRef::deleteAllRows(XTThreadPtr self)
+{
+	XTOpenTablePtr	ot;
+	xtBool			eof;
+	xtWord1			*buffer;
+
+	/* Only the success of the lookups is checked, the indexes are not used here. */
+	if (!tr_fkey->getReferenceIndexPtr())
+		xt_throw(self);
+
+	if (!tr_fkey->getIndexPtr())
+		xt_throw(self);
+
+	if (!(ot = xt_db_open_table_using_tab(tr_fkey->co_table->dt_table, self)))
+		xt_throw(self);
+
+	/* Check if there are any rows in the referencing table: */
+	if (!xt_tab_seq_init(ot))
+		goto failed;
+
+	if (!(buffer = (xtWord1 *) xt_malloc(self, ot->ot_table->tab_dic.dic_mysql_buf_size)))
+		goto failed_1;
+
+	/* Read a single row: 'eof' tells whether the table is empty. */
+	if (!xt_tab_seq_next(ot, buffer, &eof))
+		goto failed_2;
+
+	xt_free(self, buffer);
+
+	xt_tab_seq_exit(ot);
+
+	xt_db_return_table_to_pool_ns(ot);
+
+	/* A row was read, so the referencing table is not empty. */
+	if (!eof)
+		xt_throw_ixterr(XT_CONTEXT, XT_ERR_ROW_IS_REFERENCED, tr_fkey->co_name);
+	return;
+
+	/* Error exits: undo the steps above in reverse order, then throw. */
+	failed_2:
+	xt_free(self, buffer);
+
+	failed_1:
+	xt_tab_seq_exit(ot);
+
+	failed:
+	xt_db_return_table_to_pool_ns(ot);
+	xt_throw(self);
+}
+
+/*
+ * Initialize this index as a copy of 'obj'. Only the XTDDConstraint
+ * part is copied here.
+ */
+void  XTDDIndex::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDConstraint::init(self, obj);
+}
+
+/*
+ * Return the index of the database table that belongs to this
+ * dictionary index.
+ *
+ * in_index holds the position of the index in the table's key array.
+ * If it is out of range, the position is looked up with findIndex()
+ * and stored. Returns NULL if no matching index is found.
+ */
+XTIndexPtr XTDDIndex::getIndexPtr()
+{
+	if (in_index >= co_table->dt_table->tab_dic.dic_key_count) {
+		XTDDIndex *match = co_table->findIndex(this);
+
+		if (!match)
+			return NULL;
+		in_index = match->in_index;
+	}
+	return co_table->dt_table->tab_dic.dic_keys[in_index];
+}
+
+/*
+ * Initialize this foreign key as a copy of 'obj', which must be an
+ * XTDDForeignKey. The referenced table name is duplicated and the
+ * referenced column list is cloned.
+ */
+void XTDDForeignKey::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDForeignKey *source = (XTDDForeignKey *) obj;
+
+	XTDDIndex::init(self, obj);
+
+	/* The name is only copied when the source has one. */
+	if (source->fk_ref_tab_name)
+		fk_ref_tab_name = (XTPathStrPtr) xt_dup_string(self, source->fk_ref_tab_name->ps_path);
+	fk_ref_cols.clone(self, &source->fk_ref_cols);
+
+	fk_on_delete = source->fk_on_delete;
+	fk_on_update = source->fk_on_update;
+}
+
+/*
+ * Free the referenced table name, detach from the referenced table
+ * (if attached), delete the referenced column list and finalize the
+ * constraint part.
+ */
+void XTDDForeignKey::finalize(XTThreadPtr self)
+{
+	XTDDTable *referenced;
+
+	if (fk_ref_tab_name) {
+		xt_free(self, fk_ref_tab_name);
+		fk_ref_tab_name = NULL;
+	}
+
+	referenced = fk_ref_table;
+	if (referenced) {
+		/* Clear the pointer before the reference is removed. */
+		fk_ref_table = NULL;
+		referenced->removeReference(self, this);
+		/* We referenced the table, not the index! */
+		xt_heap_release(self, referenced->dt_table);
+	}
+
+	/* The stored position of the referenced index is no longer valid. */
+	fk_ref_index = UINT_MAX;
+
+	fk_ref_cols.deleteAll(self);
+	XTDDConstraint::finalize(self);
+}
+
+/*
+ * Append the definition of this foreign key to 'sb': the constraint
+ * part, followed by
+ *
+ *   REFERENCES `schema`.`table` (`col`, `col`, ...)
+ *     [ON DELETE action] [ON UPDATE action]
+ *
+ * An ON clause is only written if the action is not RESTRICT.
+ */
+void XTDDForeignKey::loadString(XTThreadPtr self, XTStringBufferPtr sb)
+{
+	char	schema_name[XT_IDENTIFIER_NAME_SIZE];
+	u_int	col_no;
+
+	XTDDConstraint::loadString(self, sb);
+
+	/* The schema is the second last, the table the last name of the path. */
+	xt_sb_concat(self, sb, " REFERENCES `");
+	xt_2nd_last_name_of_path(XT_IDENTIFIER_NAME_SIZE, schema_name, fk_ref_tab_name->ps_path);
+	xt_sb_concat(self, sb, schema_name);
+	xt_sb_concat(self, sb, "`.`");
+	xt_sb_concat(self, sb, xt_last_name_of_path(fk_ref_tab_name->ps_path));
+	xt_sb_concat(self, sb, "` ");
+
+	/* The referenced columns; the first one is written unconditionally. */
+	xt_sb_concat(self, sb, "(`");
+	xt_sb_concat(self, sb, fk_ref_cols.itemAt(0)->cr_col_name);
+	col_no = 1;
+	while (col_no < fk_ref_cols.size()) {
+		xt_sb_concat(self, sb, "`, `");
+		xt_sb_concat(self, sb, fk_ref_cols.itemAt(col_no)->cr_col_name);
+		col_no++;
+	}
+	xt_sb_concat(self, sb, "`)");
+
+	if (fk_on_delete != XT_KEY_ACTION_RESTRICT) {
+		xt_sb_concat(self, sb, " ON DELETE ");
+		switch (fk_on_delete) {
+			case XT_KEY_ACTION_CASCADE:
+				xt_sb_concat(self, sb, "CASCADE");
+				break;
+			case XT_KEY_ACTION_SET_NULL:
+				xt_sb_concat(self, sb, "SET NULL");
+				break;
+			case XT_KEY_ACTION_SET_DEFAULT:
+				xt_sb_concat(self, sb, "SET DEFAULT");
+				break;
+			case XT_KEY_ACTION_NO_ACTION:
+				xt_sb_concat(self, sb, "NO ACTION");
+				break;
+		}
+	}
+
+	if (fk_on_update != XT_KEY_ACTION_RESTRICT) {
+		xt_sb_concat(self, sb, " ON UPDATE ");
+		switch (fk_on_update) {
+			case XT_KEY_ACTION_RESTRICT:
+				xt_sb_concat(self, sb, "RESTRICT");
+				break;
+			case XT_KEY_ACTION_CASCADE:
+				xt_sb_concat(self, sb, "CASCADE");
+				break;
+			case XT_KEY_ACTION_SET_NULL:
+				xt_sb_concat(self, sb, "SET NULL");
+				break;
+			case XT_KEY_ACTION_SET_DEFAULT:
+				xt_sb_concat(self, sb, "SET DEFAULT");
+				break;
+			case XT_KEY_ACTION_NO_ACTION:
+				xt_sb_concat(self, sb, "NO ACTION");
+				break;
+		}
+	}
+}
+
+/*
+ * Write the referenced table and column list of this foreign key to
+ * 'buffer', using 'size' as the bound passed to the string helpers:
+ *
+ *   `table` (col, col, ...)
+ *
+ * The first referenced column is written unconditionally, so the
+ * list is expected to contain at least one column.
+ */
+void XTDDForeignKey::getReferenceList(char *buffer, size_t size)
+{
+	/* Build the string with xt_strcpy()/xt_strcat() on 'buffer' itself,
+	 * as getColumnList() does. Writing buffer[0] directly and copying
+	 * to buffer + 1 with the full 'size' would let the copy assume one
+	 * byte more than is available, and would write even if size is 0.
+	 */
+	xt_strcpy(size, buffer, "`");
+	xt_strcat(size, buffer, xt_last_name_of_path(fk_ref_tab_name->ps_path));
+	xt_strcat(size, buffer, "` (");
+	xt_strcat(size, buffer, fk_ref_cols.itemAt(0)->cr_col_name);
+	for (u_int i=1; i<fk_ref_cols.size(); i++) {
+		xt_strcat(size, buffer, ", ");
+		xt_strcat(size, buffer, fk_ref_cols.itemAt(i)->cr_col_name);
+	}
+	xt_strcat(size, buffer, ")");
+}
+
+/*
+ * Return the index of the referenced table that is used by this
+ * foreign key.
+ *
+ * If the referenced table is not attached, XT_ERR_REF_TABLE_NOT_FOUND
+ * is registered and NULL is returned. If the stored index position is
+ * out of range, the index is looked up with findReferenceIndex() and
+ * the column types are checked before the position is stored; NULL
+ * is returned if either step fails.
+ */
+struct XTIndex *XTDDForeignKey::getReferenceIndexPtr()
+{
+	if (!fk_ref_table) {
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_REF_TABLE_NOT_FOUND, fk_ref_tab_name);
+		return NULL;
+	}
+
+	if (fk_ref_index >= fk_ref_table->dt_table->tab_dic.dic_key_count) {
+		XTDDIndex *ref_index = fk_ref_table->findReferenceIndex(this);
+
+		if (!ref_index)
+			return NULL;
+		if (!checkReferencedTypes(fk_ref_table))
+			return NULL;
+		fk_ref_index = ref_index->in_index;
+	}
+
+	return fk_ref_table->dt_table->tab_dic.dic_keys[fk_ref_index];
+}
+
+/*
+ * Check whether the referenced columns of this foreign key and the
+ * columns of 'co' are the same: same number of columns, same names
+ * (case-insensitive), same order.
+ */
+bool XTDDForeignKey::sameReferenceColumns(XTDDConstraint *co)
+{
+	if (fk_ref_cols.size() != co->co_cols.size())
+		return false;
+
+	for (u_int col_no = 0; col_no < fk_ref_cols.size(); col_no++) {
+		if (myxt_strcasecmp(fk_ref_cols.itemAt(col_no)->cr_col_name, co->co_cols.itemAt(col_no)->cr_col_name) != 0)
+			return false;
+	}
+	return OK;
+}
+
+/*
+ * Check whether the referenced columns of this foreign key are a
+ * prefix of the columns of 'co': 'co' has at least as many columns,
+ * and the leading ones have the same names (case-insensitive) in
+ * order.
+ */
+bool XTDDForeignKey::samePrefixReferenceColumns(XTDDConstraint *co)
+{
+	if (fk_ref_cols.size() > co->co_cols.size())
+		return false;
+
+	for (u_int col_no = 0; col_no < fk_ref_cols.size(); col_no++) {
+		if (myxt_strcasecmp(fk_ref_cols.itemAt(col_no)->cr_col_name, co->co_cols.itemAt(col_no)->cr_col_name) != 0)
+			return false;
+	}
+	return OK;
+}
+
+/*
+ * Check that the columns of this foreign key are type compatible
+ * with the corresponding referenced columns in table 'dt'.
+ *
+ * Returns false, with an error registered, if 'dt' is a temporary
+ * table (XT_ERR_FK_REF_TEMP_TABLE) or if a column pair does not match
+ * (XT_ERR_REF_TYPE_WRONG). Column pairs of which a column cannot be
+ * found are skipped.
+ *
+ * A pair matches if:
+ * - neither column is enumerable and the data types are identical, or
+ * - both are varchar and differ only in the length specification, or
+ * - both are enumerable, of the same kind and the same size.
+ */
+bool XTDDForeignKey::checkReferencedTypes(XTDDTable *dt)
+{
+	XTDDColumn *col, *ref_col;
+	XTDDEnumerableColumn *enum_col, *enum_ref_col;
+
+	if (dt->dt_table->tab_dic.dic_tab_flags & XT_TAB_FLAGS_TEMP_TAB) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_FK_REF_TEMP_TABLE);
+		return false;
+	}
+
+	for (u_int i=0; i<co_cols.size() && i<fk_ref_cols.size(); i++) {
+		col = co_table->findColumn(co_cols.itemAt(i)->cr_col_name);
+		ref_col = dt->findColumn(fk_ref_cols.itemAt(i)->cr_col_name);
+		if (!col || !ref_col)
+			continue;
+
+		enum_col = col->castToEnumerable();
+		enum_ref_col = ref_col->castToEnumerable();
+
+		if (!enum_col && !enum_ref_col && (strcmp(col->dc_data_type, ref_col->dc_data_type) == 0))
+			continue;
+
+		/* Allow match varchar(30) == varchar(40): */
+		if (strncmp(col->dc_data_type, "varchar", 7) == 0 && strncmp(ref_col->dc_data_type, "varchar", 7) == 0) {
+			char *t1, *t2;
+
+			/* Skip the "(length)" part of both types and compare what
+			 * follows. t2 must be taken from the referenced column:
+			 * taking both from 'col' compares a string with itself.
+			 * The cast keeps isdigit() defined for negative char values.
+			 */
+			t1 = col->dc_data_type + 7;
+			while (*t1 && (isdigit((unsigned char) *t1) || *t1 == '(' || *t1 == ')')) t1++;
+			t2 = ref_col->dc_data_type + 7;
+			while (*t2 && (isdigit((unsigned char) *t2) || *t2 == '(' || *t2 == ')')) t2++;
+
+			if (strcmp(t1, t2) == 0)
+				continue;
+		}
+
+		/*
+		 * MySQL stores ENUMs as integer indexes for string values. That's why
+		 * it is ok to have references between columns that are different ENUMs as long
+		 * as they contain an equal number of members, so that for example a cascaded update
+		 * will not cause an invalid value to be stored in the child table.
+		 *
+		 * The above is also true for SETs.
+		 *
+		 */
+
+		if (enum_col && enum_ref_col && 
+			(enum_col->enum_size == enum_ref_col->enum_size) && 
+			(enum_col->is_enum == enum_ref_col->is_enum))
+			continue;
+
+		xt_register_tabcolerr(XT_REG_CONTEXT, XT_ERR_REF_TYPE_WRONG, fk_ref_tab_name, ref_col->dc_name);
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Detach this foreign key from the table it references (if any) and
+ * invalidate the cached reference index. This is done while holding
+ * an exclusive lock on the dt_ref_lock of the table that owns this
+ * foreign key.
+ */
+void XTDDForeignKey::removeReference(XTThreadPtr self)
+{
+	XTDDTable *ref_tab;
+
+	xt_recurrwlock_xlock(self, &co_table->dt_ref_lock);
+	pushr_(xt_recurrwlock_unxlock, &co_table->dt_ref_lock);
+
+	if ((ref_tab = fk_ref_table)) {			
+		/* Clear our pointer first, then remove the reverse reference held by the parent: */
+		fk_ref_table = NULL;
+		ref_tab->removeReference(self, this);
+		xt_heap_release(self, ref_tab->dt_table); /* We referenced the table, not the index! */
+	}
+
+	/* UINT_MAX forces getReferenceIndexPtr() to look up the index again: */
+	fk_ref_index = UINT_MAX;
+
+	freer_(); // xt_recurrwlock_unxlock(&co_table->dt_ref_lock);
+}
+
+/*
+ * A row was inserted, check that a key exists in the referenced
+ * table.
+ *
+ * before_buf	The row image before the change if this is an update,
+ *				or NULL for a plain insert.
+ * rec_buf		The new row image.
+ * thread		The calling thread.
+ *
+ * Returns true if the reference is satisfied or need not be checked.
+ * Returns false on error or if no referenced row exists (in which case
+ * XT_ERR_NO_REFERENCED_ROW is registered).
+ *
+ * A shared lock on dt_ref_lock of the owning table is held for the
+ * duration of the call and released on every exit path.
+ */
+bool XTDDForeignKey::insertRow(xtWord1 *before_buf, xtWord1 *rec_buf, XTThreadPtr thread)
+{
+	XTIndexPtr			loc_ind, ind;
+	xtBool				no_null = TRUE;
+	XTOpenTablePtr		ot;
+	XTIdxSearchKeyRec	search_key;
+	xtXactID			xn_id;
+	XTXactWaitRec		xw;
+
+	/* This lock ensures that the foreign key references are not
+	 * changed.
+	 */
+	xt_recurrwlock_slock_ns(&co_table->dt_ref_lock);
+
+	if (!(loc_ind = getIndexPtr()))
+		goto failed;
+
+	if (!(ind = getReferenceIndexPtr()))
+		goto failed;
+
+	/* Build the key to search for in the referenced index from the new row: */
+	search_key.sk_key_value.sv_flags = 0;
+	search_key.sk_key_value.sv_rec_id = 0;
+	search_key.sk_key_value.sv_row_id = 0;
+	search_key.sk_key_value.sv_key = search_key.sk_key_buf;
+	search_key.sk_key_value.sv_length = myxt_create_foreign_key_from_row(loc_ind, search_key.sk_key_buf, rec_buf, ind, &no_null);
+	search_key.sk_on_key = FALSE;
+
+	/* NOTE(review): no_null is presumably cleared when a key column is
+	 * NULL; in that case no parent row is required.
+	 */
+	if (!no_null)
+		goto success;
+
+	if (before_buf) {
+		u_int	before_key_len;
+		xtWord1	before_key[XT_INDEX_MAX_KEY_SIZE];
+
+		/* If there is a before buffer, this insert was an update, so check
+		 * if the key value has changed. If not, we need not do anything.
+		 */
+		before_key_len = myxt_create_foreign_key_from_row(loc_ind, before_key, before_buf, ind, NULL);
+		
+		/* Check whether the key value has changed, if not, we have nothing
+		 * to do here!
+		 */
+		if (search_key.sk_key_value.sv_length == before_key_len &&
+			memcmp(search_key.sk_key_buf, before_key, before_key_len) == 0)
+			goto success;
+	}
+
+	/* Search for the key in the parent (referenced) table: */
+	if (!(ot = xt_db_open_table_using_tab(fk_ref_table->dt_table, thread)))
+		goto failed;
+
+	/* The search is restarted from here after waiting for a transaction: */
+	retry:
+	if (!xt_idx_search(ot, ind, &search_key))
+		goto failed_2;
+		
+	while (ot->ot_curr_rec_id) {
+		/* Stop as soon as the index entry no longer matches the key: */
+		if (!search_key.sk_on_key)
+			break;
+
+		switch (xt_tab_maybe_committed(ot, ot->ot_curr_rec_id, &xn_id, &ot->ot_curr_row_id, &ot->ot_curr_updated)) {
+			case XT_MAYBE:
+				/* We should not get a deadlock here because the thread
+				 * that we are waiting for, should not doing
+				 * data definition (i.e. should not be trying to
+				 * get an exclusive lock on dt_ref_lock.
+				 */
+				xw.xw_xn_id = xn_id;
+				if (!xt_xn_wait_for_xact(thread, &xw, NULL))
+					goto failed_2;
+				goto retry;			
+			case XT_ERR:
+				goto failed_2;
+			case TRUE:
+				/* We found a matching parent: */
+				if (ot->ot_ind_rhandle) {
+					xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+					ot->ot_ind_rhandle = NULL;
+				}
+				xt_db_return_table_to_pool_ns(ot);
+				goto success;
+			case FALSE:
+				/* This record does not count, try the next index entry: */
+				if (!xt_idx_next(ot, ind, &search_key))
+					goto failed_2;
+				break;
+		}
+	}
+
+	/* No matching parent row was found: */
+	xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_NO_REFERENCED_ROW, co_name);
+
+	/* Failure after the referenced table was opened: release it first. */
+	failed_2:
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+	xt_db_return_table_to_pool_ns(ot);
+
+	failed:
+	xt_recurrwlock_unslock_ns(&co_table->dt_ref_lock);
+	return false;
+
+	success:
+	xt_recurrwlock_unslock_ns(&co_table->dt_ref_lock);
+	return true;
+}
+
+/*
+ * Map an XT_KEY_ACTION_* constant to its textual form.
+ * XT_KEY_ACTION_SET_DEFAULT and any unknown value yield an empty string.
+ */
+const char *XTDDForeignKey::actionTypeToString(int action)
+{
+	const char *text = "";
+
+	switch (action) {
+		case XT_KEY_ACTION_RESTRICT:
+			text = "RESTRICT";
+			break;
+		case XT_KEY_ACTION_CASCADE:
+			text = "CASCADE";
+			break;
+		case XT_KEY_ACTION_SET_NULL:
+			text = "SET NULL";
+			break;
+		case XT_KEY_ACTION_NO_ACTION:
+			text = "NO ACTION";
+			break;
+		case XT_KEY_ACTION_SET_DEFAULT:
+		default:
+			/* No text for these, the initial empty string is returned. */
+			break;
+	}
+
+	return text;
+}
+
+/*
+ * Initialize an empty table description: set up the reference lock
+ * and start with an empty list of referencing tables.
+ */
+void XTDDTable::init(XTThreadPtr self)
+{
+	xt_recurrwlock_init_with_autoname(self, &dt_ref_lock);
+	dt_trefs = NULL;
+}
+
+/*
+ * Initialize this table description as a copy of 'obj' (which must be
+ * an XTDDTable). The columns, indexes and foreign keys are cloned. The
+ * list of referencing tables (dt_trefs) is not copied, it starts empty.
+ */
+void XTDDTable::init(XTThreadPtr self, XTObject *obj)
+{
+	XTDDTable *tab = (XTDDTable *) obj;
+	u_int		i;
+
+	init(self);
+	XTObject::init(self, obj);
+	dt_cols.clone(self, &tab->dt_cols);	
+	dt_indexes.clone(self, &tab->dt_indexes);	
+	dt_fkeys.clone(self, &tab->dt_fkeys);	
+
+	/* The cloned constraints must point back to this table, not the source: */
+	for (i=0; i<dt_indexes.size(); i++)
+		dt_indexes.itemAt(i)->co_table = this;
+	for (i=0; i<dt_fkeys.size(); i++)
+		dt_fkeys.itemAt(i)->co_table = this;
+}
+
+/*
+ * Release everything owned by this table description: detach all
+ * foreign key references, delete the column, index and foreign key
+ * lists, release any remaining table references and free the lock.
+ */
+void XTDDTable::finalize(XTThreadPtr self)
+{
+	XTDDTableRef *ptr;
+
+	removeReferences(self);
+
+	dt_cols.deleteAll(self);
+	dt_indexes.deleteAll(self);
+	dt_fkeys.deleteAll(self);
+
+	/* Release whatever is still on the list of referencing tables: */
+	while (dt_trefs) {
+		ptr = dt_trefs;
+		dt_trefs = dt_trefs->tr_next;
+		ptr->release(self);
+	}
+
+	xt_recurrwlock_free(&dt_ref_lock);
+}
+
+/*
+ * Look up a column of this table by name (names are compared with
+ * myxt_strcasecmp()). Returns NULL if there is no such column.
+ */
+XTDDColumn *XTDDTable::findColumn(char *name)
+{
+	u_int idx = 0;
+
+	while (idx < dt_cols.size()) {
+		XTDDColumn *candidate = dt_cols.itemAt(idx);
+
+		if (myxt_strcasecmp(name, candidate->dc_name) == 0)
+			return candidate;
+		idx++;
+	}
+	return NULL;
+}
+
+/*
+ * Append a "CREATE TABLE (...)" statement for this table to 'sb'.
+ * Only the foreign key definitions are written, separated by commas;
+ * the table name, columns and indexes are left out.
+ */
+void XTDDTable::loadString(XTThreadPtr self, XTStringBufferPtr sb)
+{
+	u_int i;
+
+	/* I do not specify a table name because that is known */
+	xt_sb_concat(self, sb, "CREATE TABLE (\n  ");
+
+	/* We only need to save the foreign key definitions!!
+	for (i=0; i<dt_cols.size(); i++) {
+		if (i != 0)
+			xt_sb_concat(self, sb, ",\n  ");
+		dt_cols.itemAt(i)->loadString(self, sb);
+	}
+
+	for (i=0; i<dt_indexes.size(); i++) {
+		xt_sb_concat(self, sb, ",\n  ");
+		dt_indexes.itemAt(i)->loadString(self, sb);
+	}
+	*/
+
+	for (i=0; i<dt_fkeys.size(); i++) {
+		if (i != 0)
+			xt_sb_concat(self, sb, ",\n  ");
+		dt_fkeys.itemAt(i)->loadString(self, sb);
+	}
+
+	xt_sb_concat(self, sb, "\n)\n");
+}
+
+/*
+ * Append the definition of every foreign key of this table to 'sb'.
+ * Each definition is preceded by a comma and a line break, so the
+ * result can be appended to an existing list of definitions.
+ */
+void XTDDTable::loadForeignKeyString(XTThreadPtr self, XTStringBufferPtr sb)
+{
+	u_int idx = 0;
+
+	while (idx < dt_fkeys.size()) {
+		xt_sb_concat(self, sb, ",\n  ");
+		dt_fkeys.itemAt(idx)->loadString(self, sb);
+		idx++;
+	}
+}
+
+/*
+ * Rename a column in all foreign keys of this table.
+ *
+ * Only the foreign keys are altered here (the other changes were
+ * copied from MySQL).
+ */
+void XTDDTable::alterColumnName(XTThreadPtr self, char *from_name, char *to_name)
+{
+	u_int idx = 0;
+
+	while (idx < dt_fkeys.size()) {
+		dt_fkeys.itemAt(idx)->alterColumnName(self, from_name, to_name);
+		idx++;
+	}
+}
+
+/*
+ * Register the foreign key 'fk' (of some child table) as a reference
+ * to this table by adding an entry to the dt_trefs list.
+ *
+ * Any existing entry for 'fk' is removed first. If the column types do
+ * not match, an exception is thrown unless the thread is ignoring
+ * foreign keys (st_ignore_fkeys).
+ */
+void XTDDTable::attachReference(XTThreadPtr self, XTDDForeignKey *fk)
+{
+	XTDDTableRef	*tr;
+
+	/* Remove the reference to this FK if one exists: */
+	removeReference(self, fk);
+
+	if (!fk->checkReferencedTypes(this)) {
+		if (!self->st_ignore_fkeys)
+			throw_();
+	}
+
+	xt_recurrwlock_xlock(self, &dt_ref_lock);
+	pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+
+	/* Add the new entry to the front of the list: */
+	if (!(tr = new XTDDTableRef()))
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	tr->tr_fkey = fk;
+	tr->tr_next = dt_trefs;
+	dt_trefs = tr;
+
+	/* Reference the database table of the foreign key, not the FK itself.
+	 * Just referencing the key will not guarantee that the
+	 * table remains valid because the FK does not reference the
+	 * table.
+	 */
+	xt_heap_reference(self, fk->co_table->dt_table);
+
+	freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+}
+
+/*
+ * Remove the reference to the given foreign key.
+ *
+ * The matching entry (if any) is unlinked from dt_trefs under an
+ * exclusive lock. It is released only after the lock has been given up.
+ */
+void XTDDTable::removeReference(XTThreadPtr self, XTDDForeignKey *fk)
+{
+	XTDDTableRef	*tr, *prev_tr = NULL;
+
+	xt_recurrwlock_xlock(self, &dt_ref_lock);
+	pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+
+	tr = dt_trefs;
+	while (tr) {
+		if (tr->tr_fkey == fk) {
+			/* Unlink the entry from the singly linked list: */
+			if (prev_tr)
+				prev_tr->tr_next = tr->tr_next;
+			else
+				dt_trefs = tr->tr_next;
+			break;
+		}
+		prev_tr = tr;
+		tr = tr->tr_next;
+	}
+	freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+	/* tr is NULL here if no entry for fk was found: */
+	if (tr)
+		tr->release(self);
+}
+
+/*
+ * Check that every column referenced by the foreign key 'fk' exists
+ * in this table. Throws XT_ERR_COLUMN_NOT_FOUND for the first column
+ * that is missing.
+ */
+void XTDDTable::checkForeignKeyReference(XTThreadPtr self, XTDDForeignKey *fk)
+{
+	u_int idx = 0;
+
+	while (idx < fk->fk_ref_cols.size()) {
+		XTDDColumnRef *ref = fk->fk_ref_cols.itemAt(idx);
+
+		if (findColumn(ref->cr_col_name) == NULL)
+			xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_COLUMN_NOT_FOUND, fk->fk_ref_tab_name, ref->cr_col_name);
+		idx++;
+	}
+}
+
+/*
+ * Attach every foreign key of this table that references the table
+ * 'dt' (matched by table path) to 'dt'. Any previous reference of
+ * such a key is removed first.
+ */
+void XTDDTable::attachReference(XTThreadPtr self, XTDDTable *dt)
+{
+	XTDDForeignKey	*fk;
+
+	for (u_int i=0; i<dt_fkeys.size(); i++) {
+		fk = dt_fkeys.itemAt(i);
+		if (xt_tab_compare_names(fk->fk_ref_tab_name->ps_path, dt->dt_table->tab_name->ps_path) == 0) {
+			fk->removeReference(self);
+
+			/* Add the reverse reference on the parent table: */
+			dt->attachReference(self, fk);
+
+			xt_recurrwlock_xlock(self, &dt_ref_lock);
+			pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+			/* Referenced the table, not the index!
+			 * We do this because we know that if the table is referenced, the
+			 * index will remain valid!
+			 * This is because the table references the index, and only
+			 * releases it when the table is released. The index does not
+			 * reference the table though!
+			 */
+			xt_heap_reference(self, dt->dt_table);
+			fk->fk_ref_table = dt;
+			freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+		}
+	}
+}
+
+/*
+ * This function assumes the database table list is locked!
+ *
+ * Set up the foreign key references in both directions:
+ * 1. Each foreign key of this table is attached to the table it
+ *    references.
+ * 2. Every other table in the database gets the chance to attach its
+ *    foreign keys to this table.
+ *
+ * A referenced table that cannot be found causes an exception, unless
+ * the thread is ignoring foreign keys (st_ignore_fkeys).
+ */
+void XTDDTable::attachReferences(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTDDForeignKey	*fk;
+	XTTableHPtr		tab;
+	XTDDTable		*dt;
+	XTHashEnumRec	tables;
+
+	/* Search for table referenced by this table. */
+	for (u_int i=0; i<dt_fkeys.size(); i++) {
+		fk = dt_fkeys.itemAt(i);
+		fk->removeReference(self);
+
+		// if self-reference
+		if (xt_tab_compare_names(fk->fk_ref_tab_name->ps_path, this->dt_table->tab_name->ps_path) == 0)
+			fk->fk_ref_table = this;
+		else {
+			/* get pointer to the referenced table, load it if needed
+			 * cyclic references are being handled, absent table is ignored
+			 */
+			tab = xt_use_table_no_lock(self, db, fk->fk_ref_tab_name, /*TRUE*/FALSE, /*FALSE*/TRUE, NULL);
+
+			if (tab) {
+				pushr_(xt_heap_release, tab);
+				if ((dt = tab->tab_dic.dic_table)) {
+					// Add a reverse reference:
+					dt->attachReference(self, fk);
+					xt_heap_reference(self, dt->dt_table); /* Referenced the table, not the index! */
+					fk->fk_ref_table = dt;
+				}
+				freer_(); // xt_heap_release(tab)
+			}
+			else if (!self->st_ignore_fkeys) {
+				xt_throw_taberr(XT_CONTEXT, XT_ERR_REF_TABLE_NOT_FOUND, fk->fk_ref_tab_name);
+			}
+		}
+	}
+
+	/* Search for tables that reference this table. */
+	xt_ht_enum(self, dt_table->tab_db->db_tables, &tables);
+	while ((tab = (XTTableHPtr) xt_ht_next(self, &tables))) {
+		if (tab == this->dt_table) /* no need to re-reference itself, also this fails with "native" pthreads */
+			continue;
+		/* Keep the table referenced while its foreign keys are attached: */
+		xt_heap_reference(self, tab);
+		pushr_(xt_heap_release, tab);
+		if ((dt = tab->tab_dic.dic_table))
+			dt->attachReference(self, this);
+		freer_(); // xt_heap_release(tab)
+	}
+}
+
+/*
+ * Remove all foreign key references of this table, in both directions:
+ * the references from this table's foreign keys to their parent tables,
+ * and the entries in dt_trefs for tables that reference this table.
+ *
+ * The lock on this table is given up before anything is done that may
+ * take the lock of another table, and taken again afterwards.
+ */
+void XTDDTable::removeReferences(XTThreadPtr self)
+{
+	XTDDForeignKey	*fk;
+	XTDDTableRef	*tr;
+	XTDDTable		*tab;
+
+	xt_recurrwlock_xlock(self, &dt_ref_lock);
+	pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+
+	for (u_int i=0; i<dt_fkeys.size(); i++) {
+		fk = dt_fkeys.itemAt(i);
+		if ((tab = fk->fk_ref_table)) {			
+			fk->fk_ref_table = NULL;
+			fk->fk_ref_index = UINT_MAX;
+			/* A self-reference has no separate parent table to update: */
+			if (tab != this) {
+				/* To avoid deadlock we do not hold more than
+				 * one lock at a time!
+				 */
+				freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+	
+				tab->removeReference(self, fk);
+				xt_heap_release(self, tab->dt_table); /* We referenced the table, not the index! */
+	
+				xt_recurrwlock_xlock(self, &dt_ref_lock);
+				pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+			}
+		}
+	}
+
+	/* Each entry is unlinked under the lock, but released without it: */
+	while (dt_trefs) {
+		tr = dt_trefs;
+		dt_trefs = tr->tr_next;
+		freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+		tr->release(self);
+		xt_recurrwlock_xlock(self, &dt_ref_lock);
+		pushr_(xt_recurrwlock_unxlock, &dt_ref_lock);
+	}
+
+	freer_(); // xt_recurrwlock_unxlock(&dt_ref_lock);
+}
+
+/*
+ * Validate the foreign key definitions of this table. An exception is
+ * thrown if:
+ * - the table is temporary and has foreign keys,
+ * - a key uses SET NULL on a column that may not be NULL,
+ * - the referenced table was found and the column types do not match.
+ *
+ * A referenced table that does not exist (yet) is not an error here.
+ */
+void XTDDTable::checkForeignKeys(XTThreadPtr self, bool temp_table)
+{
+	XTDDForeignKey	*fk;
+
+	if (temp_table && dt_fkeys.size()) {
+		/* Temporary tables cannot have foreign keys: */
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_FK_ON_TEMP_TABLE);
+		
+	}
+
+	/* Search for table referenced by this table. */
+	for (u_int i=0; i<dt_fkeys.size(); i++) {
+		fk = dt_fkeys.itemAt(i);
+
+		if (fk->fk_on_delete == XT_KEY_ACTION_SET_NULL || fk->fk_on_update == XT_KEY_ACTION_SET_NULL) {
+			/* Check that all the columns can be set to NULL! */
+			XTDDColumn *col;
+
+			for (u_int j=0; j<fk->co_cols.size(); j++) {
+				if ((col = findColumn(fk->co_cols.itemAt(j)->cr_col_name))) {
+					if (!col->dc_null_ok)
+						xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_COLUMN_IS_NOT_NULL, fk->fk_ref_tab_name, col->dc_name);
+				}
+			}
+		}
+
+		// TODO: dont close table immediately so it can be possibly reused in this loop
+		XTTable *ref_tab;
+
+		/* ref_tab may be NULL here, in which case the type check is skipped: */
+		pushsr_(ref_tab, xt_heap_release, xt_use_table(self, fk->fk_ref_tab_name, FALSE, TRUE));
+		if (ref_tab && !fk->checkReferencedTypes(ref_tab->tab_dic.dic_table))
+			throw_();
+		freer_();
+
+		/* Currently I allow foreign keys to be created on tables that do not yet exist!
+		pushsr_(tab, xt_heap_release, xt_use_table(self, fk->fk_ref_tab_name, FALSE FALSE));
+		if ((dt = tab->tab_dic.dic_table))
+			dt->checkForeignKeyReference(self, fk);
+		freer_(); // xt_heap_release(tab)
+		*/
+	}
+}
+
+/*
+ * Find an index of this table that can be used for the constraint
+ * 'co', as decided by co->samePrefixColumns(). If several indexes
+ * qualify, the one with the smallest key size (mi_key_size) is
+ * returned; on a tie the first one found wins.
+ *
+ * Returns NULL and registers XT_ERR_NO_MATCHING_INDEX if there is no
+ * suitable index.
+ */
+XTDDIndex *XTDDTable::findIndex(XTDDConstraint *co)
+{
+	XTDDIndex *ind = NULL;
+	XTDDIndex *cur_ind;
+	u_int index_size = UINT_MAX;
+
+	for (u_int i=0; i<dt_indexes.size(); i++) {
+		cur_ind = dt_indexes.itemAt(i);
+		u_int sz = cur_ind->getIndexPtr()->mi_key_size;
+		if (sz < index_size && co->samePrefixColumns(cur_ind)) {
+			ind = cur_ind;
+			index_size = sz;
+		}
+	}
+
+	if (ind) 
+		return ind;
+	
+	{
+		/* The column list of the constraint is used as error text: */
+		char buffer[XT_ERR_MSG_SIZE - 200];
+		co->getColumnList(buffer, XT_ERR_MSG_SIZE - 200);
+		xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_NO_MATCHING_INDEX, buffer);
+	}
+	return NULL;
+}
+
+/*
+ * Find an index of this (referenced) table that can be used by the
+ * foreign key 'fk', as decided by fk->samePrefixReferenceColumns().
+ * If several indexes qualify, the one with the smallest key size
+ * (mi_key_size) is returned; on a tie the first one found wins.
+ *
+ * Returns NULL and registers an error if there is no suitable index:
+ * XT_ERR_COLUMN_NOT_FOUND if a referenced column does not exist,
+ * XT_ERR_NO_MATCHING_INDEX otherwise.
+ */
+XTDDIndex *XTDDTable::findReferenceIndex(XTDDForeignKey *fk)
+{
+	XTDDIndex		*ind = NULL;
+	XTDDIndex		*cur_ind;
+	XTDDColumnRef	*cr;
+	u_int			i;
+	u_int			index_size = UINT_MAX;
+
+	for (i=0; i<dt_indexes.size(); i++) {
+		cur_ind = dt_indexes.itemAt(i);
+		u_int sz = cur_ind->getIndexPtr()->mi_key_size;
+		if (sz < index_size && fk->samePrefixReferenceColumns(cur_ind)) {
+			ind = cur_ind;
+			index_size = sz;
+		}
+	}
+
+	if (ind)
+		return ind;
+
+	/* If the index does not exist, maybe the columns do not exist?! */
+	for (i=0; i<fk->fk_ref_cols.size(); i++) {
+		cr = fk->fk_ref_cols.itemAt(i);
+		if (!findColumn(cr->cr_col_name)) {
+			xt_register_tabcolerr(XT_REG_CONTEXT, XT_ERR_COLUMN_NOT_FOUND, fk->fk_ref_tab_name, cr->cr_col_name);
+			return NULL;
+		}
+	}
+	
+	{
+		/* All columns exist, so report the missing index: */
+		char buffer[XT_ERR_MSG_SIZE - 200];
+
+		fk->getReferenceList(buffer, XT_ERR_MSG_SIZE - 200);
+		xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_NO_MATCHING_INDEX, buffer);
+	}
+	return NULL;
+}
+
+/*
+ * Foreign key check for a row inserted into this table: every foreign
+ * key of the table must be satisfied by the row.
+ *
+ * If rec_ptr is NULL, the current record of 'ot' is loaded and used.
+ * Returns true if all checks pass, or if the thread is ignoring
+ * foreign keys (st_ignore_fkeys).
+ */
+bool XTDDTable::insertRow(XTOpenTablePtr ot, xtWord1 *rec_ptr)
+{
+	XTInfoBufferRec	loaded;
+	bool			passed = true;
+	u_int			idx = 0;
+
+	if (ot->ot_thread->st_ignore_fkeys)
+		return true;
+
+	loaded.ib_free = FALSE;
+	if (rec_ptr == NULL) {
+		if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &loaded))
+			return false;
+		rec_ptr = loaded.ib_db.db_data;
+	}
+
+	/* Stop at the first foreign key that is not satisfied: */
+	while (idx < dt_fkeys.size()) {
+		if (!dt_fkeys.itemAt(idx)->insertRow(NULL, rec_ptr, ot->ot_thread)) {
+			passed = false;
+			break;
+		}
+		idx++;
+	}
+
+	xt_ib_free(NULL, &loaded);
+	return passed;
+}
+
+/*
+ * Load the record 'rec_id' and call checkReference() on every table
+ * reference (dt_trefs) of this table, under a shared lock on
+ * dt_ref_lock.
+ *
+ * Returns true if all checks pass, or if the thread is ignoring
+ * foreign keys (st_ignore_fkeys). Returns false if the record cannot
+ * be loaded or a check fails.
+ */
+bool XTDDTable::checkNoAction(XTOpenTablePtr ot, xtRecordID rec_id)
+{
+	XTInfoBufferRec	loaded;
+	xtWord1			*row;
+	bool			passed = true;
+
+	if (ot->ot_thread->st_ignore_fkeys)
+		return true;
+
+	loaded.ib_free = FALSE;
+	if (!xt_tab_load_record(ot, rec_id, &loaded))
+		return false;
+	row = loaded.ib_db.db_data;
+
+	xt_recurrwlock_slock_ns(&dt_ref_lock);
+	for (XTDDTableRef *ref = dt_trefs; ref; ref = ref->tr_next) {
+		if (!ref->checkReference(row, ot->ot_thread)) {
+			passed = false;
+			break;
+		}
+	}
+	xt_recurrwlock_unslock_ns(&dt_ref_lock);
+
+	xt_ib_free(NULL, &loaded);
+	return passed;
+}
+
+/*
+ * Foreign key handling for a row deleted from this table: every table
+ * reference (dt_trefs) is told about the deleted row via modifyRow()
+ * with no "after" image.
+ *
+ * If rec_ptr is NULL, the current record of 'ot' is loaded and used.
+ * Returns true if all references accept the delete, or if the thread
+ * is ignoring foreign keys (st_ignore_fkeys).
+ */
+bool XTDDTable::deleteRow(XTOpenTablePtr ot, xtWord1 *rec_ptr)
+{
+	XTInfoBufferRec	loaded;
+	bool			passed = true;
+
+	if (ot->ot_thread->st_ignore_fkeys)
+		return true;
+
+	loaded.ib_free = FALSE;
+	if (rec_ptr == NULL) {
+		if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &loaded))
+			return false;
+		rec_ptr = loaded.ib_db.db_data;
+	}
+
+	xt_recurrwlock_slock_ns(&dt_ref_lock);
+	for (XTDDTableRef *ref = dt_trefs; ref; ref = ref->tr_next) {
+		if (!ref->modifyRow(ot, rec_ptr, NULL, ot->ot_thread)) {
+			passed = false;
+			break;
+		}
+	}
+	xt_recurrwlock_unslock_ns(&dt_ref_lock);
+
+	xt_ib_free(NULL, &loaded);
+	return passed;
+}
+
+/*
+ * Call deleteAllRows() on every table reference (dt_trefs) of this
+ * table, while holding a shared lock on dt_ref_lock.
+ */
+void XTDDTable::deleteAllRows(XTThreadPtr self)
+{
+	XTDDTableRef	*tr;
+
+	xt_recurrwlock_slock(self, &dt_ref_lock);
+	pushr_(xt_recurrwlock_unslock, &dt_ref_lock);
+
+	tr = dt_trefs;
+	while (tr) {
+		tr->deleteAllRows(self);
+		tr = tr->tr_next;
+	}
+
+	freer_(); // xt_recurrwlock_unslock(&dt_ref_lock);
+}
+
+/*
+ * Foreign key handling for a row updated in this table.
+ *
+ * ot		The open table on which the update is done.
+ * before	The row image before the update, or NULL if this is a
+ *			cascaded update (the current record of 'ot' is then loaded
+ *			when it is needed).
+ * after	The row image after the update (must not be NULL).
+ *
+ * First the foreign keys of this table are checked against the new
+ * row (skipped for cascaded updates), then every table that references
+ * this table is told about the change.
+ *
+ * Returns true on success, or if the thread is ignoring foreign keys
+ * (st_ignore_fkeys). Returns false if a check fails or the record
+ * cannot be loaded.
+ */
+bool XTDDTable::updateRow(XTOpenTablePtr ot, xtWord1 *before, xtWord1 *after)
+{
+	XTDDTableRef	*tr;
+	bool			ok;
+	XTInfoBufferRec	before_buf;
+
+	ASSERT_NS(after);
+
+	if (ot->ot_thread->st_ignore_fkeys)
+		return true;
+
+	/* If before is NULL then this is a cascaded
+	 * update. In this case there is no need to check
+	 * if the column has a parent!!
+	 */
+	if (before) {
+		for (u_int i=0; i<dt_fkeys.size(); i++) {
+			if (!dt_fkeys.itemAt(i)->insertRow(before, after, ot->ot_thread))
+				return false;
+		}
+	}
+
+	ok = true;
+	before_buf.ib_free = FALSE;
+
+	xt_recurrwlock_slock_ns(&dt_ref_lock);
+	if ((tr = dt_trefs)) {
+		if (!before) {
+			if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &before_buf)) {
+				/* Do not return with the shared lock still held: */
+				xt_recurrwlock_unslock_ns(&dt_ref_lock);
+				return false;
+			}
+			before = before_buf.ib_db.db_data;
+		}
+
+		while (tr) {
+			if (!tr->modifyRow(ot, before, after, ot->ot_thread)) {
+				ok = false;
+				break;
+			}
+			tr = tr->tr_next;
+		}
+	}
+	xt_recurrwlock_unslock_ns(&dt_ref_lock);
+	
+	xt_ib_free(NULL, &before_buf);
+	return ok;
+}
+
+/*
+ * drop_db parameter is TRUE if we are dropping the schema of this table. In this case
+ * we return TRUE if the table has only refs to the tables from its own schema
+ *
+ * Otherwise TRUE is only returned if no table references this table,
+ * or if the single reference comes from the table itself.
+ */
+xtBool XTDDTable::checkCanDrop(xtBool drop_db)
+{
+	/* no refs or references only itself */
+	if ((dt_trefs == NULL) || ((dt_trefs->tr_next == NULL) && (dt_trefs->tr_fkey->co_table == this)))
+		return TRUE;
+
+	if (!drop_db) 
+		return FALSE;
+	
+	/* NOTE(review): the schema part of the path is presumably the text
+	 * between the start of the last 2 names and the start of the last
+	 * name - confirm against xt_last_2_names_of_path().
+	 */
+	const char *this_schema = xt_last_2_names_of_path(dt_table->tab_name->ps_path);
+	size_t this_schema_sz = xt_last_name_of_path(dt_table->tab_name->ps_path) - this_schema;
+	XTDDTableRef *tr = dt_trefs;
+
+	while (tr) {
+		const char *tab_path = tr->tr_fkey->co_table->dt_table->tab_name->ps_path;
+		const char *tab_schema = xt_last_2_names_of_path(tab_path);
+		size_t tab_schema_sz = xt_last_name_of_path(tab_path) - tab_schema;
+
+		/* A referencing table with a different schema part blocks the drop: */
+		if (this_schema_sz != tab_schema_sz || strncmp(this_schema, tab_schema, tab_schema_sz))
+			return FALSE;
+		
+		tr = tr->tr_next;
+	}
+
+	return TRUE;
+}
diff --git a/storage/pbxt/src/datadic_xt.h b/storage/pbxt/src/datadic_xt.h
new file mode 100644
index 00000000000..8dd6387f137
--- /dev/null
+++ b/storage/pbxt/src/datadic_xt.h
@@ -0,0 +1,300 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2004-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * Implementation of the PBXT internal data dictionary.
+ */
+
+#ifndef __datadic_xt_h__
+#define __datadic_xt_h__
+
+#include <stddef.h>
+#include <limits.h>
+
+#include "ccutils_xt.h"
+#include "util_xt.h"
+
+struct XTDatabase;
+struct XTTable;
+struct XTIndex;
+struct XTOpenTable;
+struct XTIndex;
+
+/* Constraint types: */
+#define XT_DD_UNKNOWN				((u_int) -1)
+#define XT_DD_INDEX					0
+#define XT_DD_INDEX_UNIQUE			1
+#define XT_DD_KEY_PRIMARY			2
+#define XT_DD_KEY_FOREIGN			3
+
+#define XT_KEY_ACTION_RESTRICT		1
+#define XT_KEY_ACTION_CASCADE		2
+#define XT_KEY_ACTION_SET_NULL		3
+#define XT_KEY_ACTION_SET_DEFAULT	4
+#define XT_KEY_ACTION_NO_ACTION		5		/* Like RESTRICT, but check at end of statement. */ 
+
+class XTDDEnumerableColumn;
+class XTDDColumnFactory;
+
+/*
+ * Data dictionary description of a table column.
+ * The constructor is protected; XTDDColumnFactory is a friend.
+ */
+class XTDDColumn : public XTObject {
+
+protected:
+
+	XTDDColumn() : XTObject(),
+		dc_name(NULL),
+		dc_data_type(NULL),
+		dc_null_ok(true),
+		dc_auto_inc(false) {
+	}
+
+public:
+	char	*dc_name;			/* The name of the column. */
+	char	*dc_data_type;		/* The data type as a string (e.g. "varchar(30)"). */
+	bool	dc_null_ok;			/* Defaults to true. */
+	bool	dc_auto_inc;		/* Defaults to false. */
+
+	/* Create a new, empty column object (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDColumn()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+	virtual void init(XTThreadPtr self) { 
+		XTObject::init(self);
+	}
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	virtual void finalize(XTThreadPtr self);
+	virtual void loadString(XTThreadPtr self, XTStringBufferPtr sb);
+
+	/* Returns NULL here; overridden by XTDDEnumerableColumn to return this. */
+	virtual XTDDEnumerableColumn *castToEnumerable() { 
+		return NULL;
+	}
+
+	friend class XTDDColumnFactory;
+};
+
+/*
+ * subclass for ENUMs and SETs
+ *
+ * In addition to the column information, the number of members and the
+ * kind (ENUM or SET) are stored.
+ */
+class XTDDEnumerableColumn : public XTDDColumn {
+
+protected:
+	XTDDEnumerableColumn() : XTDDColumn(), 
+		enum_size(0), is_enum(0) {
+	}
+
+public:
+	int enum_size;	/* number of elements in the ENUM or SET */
+	xtBool is_enum;	/* TRUE if this is ENUM, FALSE if SET */
+
+	/* Create a new, empty enumerable column object (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDEnumerableColumn()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+	/* Overrides XTDDColumn::castToEnumerable(), which returns NULL. */
+	virtual XTDDEnumerableColumn *castToEnumerable() { 
+		return this;
+	}
+
+	friend class XTDDColumnFactory;
+};
+
+/*
+ * A reference to a column by name, as used in the column lists of
+ * constraints and foreign keys.
+ */
+class XTDDColumnRef : public XTObject {
+	public:
+	char					*cr_col_name;							/* The name of the referenced column. */
+
+	XTDDColumnRef() : XTObject(), cr_col_name(NULL) { }
+
+	/* Create a new, empty column reference (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDColumnRef()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+	virtual void init(XTThreadPtr self) { XTObject::init(self); }
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	virtual void finalize(XTThreadPtr self);
+};
+
+/*
+ * Base class of the constraints of a table (indexes and foreign keys):
+ * a constraint type (XT_DD_*), names and a list of columns.
+ */
+class XTDDConstraint : public XTObject {
+	public:
+	class XTDDTable			*co_table;								/* The table of this constraint (non-referenced). */
+	u_int					co_type;								/* One of the XT_DD_* constraint types. */
+	char					*co_name;								/* Freed in finalize(). */
+	char					*co_ind_name;							/* Freed in finalize(). */
+	XTList<XTDDColumnRef>	co_cols;								/* The columns of the constraint. */
+
+	XTDDConstraint(u_int t) : XTObject(),
+		co_table(NULL),
+		co_type(t),
+		co_name(NULL),
+		co_ind_name(NULL) {
+	}
+
+	virtual void init(XTThreadPtr self) { XTObject::init(self); }
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	/* Free the names and the column list, then finalize the base object. */
+	virtual void finalize(XTThreadPtr self) {
+		if (co_name)
+			xt_free(self, co_name);
+		if (co_ind_name)
+			xt_free(self, co_ind_name);
+		co_cols.deleteAll(self);
+		XTObject::finalize(self);
+	}
+	virtual void loadString(XTThreadPtr self, XTStringBufferPtr sb);
+	virtual void alterColumnName(XTThreadPtr self, char *from_name, char *to_name);
+	void getColumnList(char *buffer, size_t size);
+	bool sameColumns(XTDDConstraint *co);
+	bool samePrefixColumns(XTDDConstraint *co);
+	bool attachColumns();
+};
+
+/*
+ * An entry in the list of references to a table (XTDDTable::dt_trefs).
+ * Each entry stands for one foreign key that references the table.
+ */
+class XTDDTableRef : public XTObject {
+	public:
+	class XTDDTableRef		*tr_next;								/* The next reference in the list. */
+	class XTDDForeignKey	*tr_fkey;								/* The foreign key that references this table (if not-NULL). */
+
+	XTDDTableRef() : XTObject(), tr_next(NULL), tr_fkey(NULL) { }
+	virtual void finalize(XTThreadPtr self);
+	bool modifyRow(struct XTOpenTable *tab, xtWord1 *before, xtWord1 *after, XTThreadPtr thread);
+	bool checkReference(xtWord1 *before, XTThreadPtr thread);
+	void deleteAllRows(XTThreadPtr self);
+};
+
+/*
+ * An index constraint. in_index is initialized to (u_int) -1, which
+ * marks the index number as not yet assigned.
+ */
+class XTDDIndex : public XTDDConstraint {	
+	public:
+	u_int					in_index;								/* The number of the index, (u_int) -1 if not set. */
+
+	XTDDIndex(u_int type) : XTDDConstraint(type), in_index((u_int) -1) { }
+
+	/* Create a new index object of unknown type (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDIndex(XT_DD_UNKNOWN)))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+        virtual void init(XTThreadPtr self) { XTDDConstraint::init(self); };
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	struct XTIndex *getIndexPtr();
+};
+
+/*
+ * A foreign key is based on a local index.
+ *
+ * In addition it stores the name and columns of the referenced table,
+ * a pointer to the referenced table once the reference is attached,
+ * and the ON DELETE / ON UPDATE actions (XT_KEY_ACTION_*).
+ */
+class XTDDForeignKey : public XTDDIndex {
+	public:
+	XTPathStrPtr			fk_ref_tab_name;						/* The path of the referenced table. */
+	XTDDTable				*fk_ref_table;							/* The referenced table, NULL if not attached. */
+	u_int					fk_ref_index;							/* The index on which this foreign key references. */
+	XTList<XTDDColumnRef>	fk_ref_cols;							/* The referenced columns. */
+	int						fk_on_delete;							/* XT_KEY_ACTION_* value, 0 by default. */
+	int						fk_on_update;							/* XT_KEY_ACTION_* value, 0 by default. */
+
+	XTDDForeignKey() : XTDDIndex(XT_DD_KEY_FOREIGN),
+		fk_ref_tab_name(NULL),
+		fk_ref_table(NULL),
+		fk_ref_index(UINT_MAX),
+		fk_on_delete(0),
+		fk_on_update(0) {
+	}
+
+	/* Create a new, empty foreign key object (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDForeignKey()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+        virtual void init(XTThreadPtr self) { XTDDIndex::init(self); }
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	virtual void finalize(XTThreadPtr self);
+	virtual void loadString(XTThreadPtr self, XTStringBufferPtr sb);
+	void getReferenceList(char *buffer, size_t size);
+	struct XTIndex *getReferenceIndexPtr();
+	bool sameReferenceColumns(XTDDConstraint *co);
+	bool samePrefixReferenceColumns(XTDDConstraint *co);
+	bool checkReferencedTypes(XTDDTable *dt);
+	void removeReference(XTThreadPtr self);
+	bool insertRow(xtWord1 *before, xtWord1 *after, XTThreadPtr thread);
+	bool updateRow(xtWord1 *before, xtWord1 *after, XTThreadPtr thread);
+
+	/* Convert an XT_KEY_ACTION_* constant to a string. */
+	static const char *actionTypeToString(int action);
+};
+
+/*
+ * Data dictionary description of a table: its columns, indexes and
+ * foreign keys, and the list of foreign keys of other tables that
+ * reference this table.
+ */
+class XTDDTable : public XTObject {
+	private:
+
+	public:
+	struct XTTable			*dt_table;				/* The database table that this object describes. */
+
+	XTList<XTDDColumn>		dt_cols;
+	XTList<XTDDIndex>		dt_indexes;
+
+	XTRecurRWLockRec		dt_ref_lock;			/* The lock for adding and using references. */
+	XTList<XTDDForeignKey>	dt_fkeys;				/* The foreign keys on this table. */
+	XTDDTableRef			*dt_trefs;				/* A list of tables that reference this table. */
+
+	/* Create a new, empty table description (throws XT_ENOMEM on failure). */
+	virtual XTObject *factory(XTThreadPtr self) {
+		XTObject *new_obj;
+		
+		if (!(new_obj = new XTDDTable()))
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return new_obj;
+	}
+
+	virtual void init(XTThreadPtr self);
+	virtual void init(XTThreadPtr self, XTObject *obj);
+	virtual void finalize(XTThreadPtr self);
+
+	XTDDColumn *findColumn(char *name);
+	void loadString(XTThreadPtr self, XTStringBufferPtr sb);
+	void loadForeignKeyString(XTThreadPtr self, XTStringBufferPtr sb);
+	void checkForeignKeyReference(XTThreadPtr self, XTDDForeignKey *fk);
+	void attachReferences(XTThreadPtr self, struct XTDatabase *db);
+	void attachReference(XTThreadPtr self, XTDDForeignKey *fk);
+	void alterColumnName(XTThreadPtr self, char *from_name, char *to_name);
+	void attachReference(XTThreadPtr self, XTDDTable *dt);
+	void removeReferences(XTThreadPtr self);
+	void removeReference(XTThreadPtr self, XTDDForeignKey *fk);
+	void checkForeignKeys(XTThreadPtr self, bool temp_table);
+	XTDDIndex *findIndex(XTDDConstraint *co);
+	XTDDIndex *findReferenceIndex(XTDDForeignKey *fk);
+	/* Row level foreign key checks; a return of false means the check failed. */
+	bool insertRow(struct XTOpenTable *rec_ot, xtWord1 *buffer);
+	bool checkNoAction(struct XTOpenTable *ot, xtRecordID rec_id);
+	xtBool checkCanDrop(xtBool drop_db);
+	bool deleteRow(struct XTOpenTable *rec_ot, xtWord1 *buffer);
+	void deleteAllRows(XTThreadPtr self);
+	bool updateRow(struct XTOpenTable *rec_ot, xtWord1 *before, xtWord1 *after);
+};
+
+XTDDTable *xt_ri_create_table(XTThreadPtr self, bool convert, XTPathStrPtr tab_path, char *sql, XTDDTable *my_tab);
+
+#endif
diff --git a/storage/pbxt/src/datalog_xt.cc b/storage/pbxt/src/datalog_xt.cc
new file mode 100644
index 00000000000..3238f0cbd17
--- /dev/null
+++ b/storage/pbxt/src/datalog_xt.cc
@@ -0,0 +1,2150 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-24	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+#ifndef XT_WIN
+#include <unistd.h>
+#include <signal.h>
+#endif
+#include <stdlib.h>
+
+#ifndef DRIZZLED
+#include "mysql_priv.h"
+#endif
+
+#include "ha_pbxt.h"
+
+#include "filesys_xt.h"
+#include "database_xt.h"
+#include "memory_xt.h"
+#include "strutil_xt.h"
+#include "sortedlist_xt.h"
+#include "util_xt.h"
+#include "heap_xt.h"
+#include "table_xt.h"
+#include "trace_xt.h"
+#include "myxt_xt.h"
+
+static void dl_wake_co_thread(XTDatabaseHPtr db);
+
+/*
+ * --------------------------------------------------------------------------------
+ * SEQUENTIAL READING
+ */
+
+/*
+ * Prepare the reader for use on the given database: all positions and
+ * counters are reset, no log file is open, and a read buffer of
+ * buffer_size bytes is allocated.
+ *
+ * Returns zero if the buffer could not be allocated, non-zero otherwise.
+ */
+xtBool XTDataSeqRead::sl_seq_init(struct XTDatabase *db, size_t buffer_size)
+{
+	/* Reset the record position and the gap statistics: */
+	sl_extra_garbage = 0;
+	sl_record_len = 0;
+	sl_rec_log_offset = 0;
+	sl_rec_log_id = 0;
+
+	/* No log file is open yet: */
+	sl_log_eof = 0;
+	sl_log_file = NULL;
+
+	/* Remember the database, and set up an empty read buffer: */
+	sl_db = db;
+	sl_buffer_size = buffer_size;
+	sl_buffer_len = 0;
+	sl_buf_log_offset = 0;
+	sl_buffer = (xtWord1 *) xt_malloc_ns(buffer_size);
+
+	return sl_buffer != NULL;
+}
+
+/*
+ * Release the resources held by the reader: close the log file if one
+ * is open, and free the read buffer. Both members are reset to NULL, so
+ * calling this function a second time does nothing.
+ */
+void XTDataSeqRead::sl_seq_exit()
+{
+	if (sl_log_file) {
+		xt_close_file_ns(sl_log_file);
+		sl_log_file  = NULL;
+	}
+	if (sl_buffer) {
+		xt_free_ns(sl_buffer);
+		sl_buffer = NULL;
+	}
+}
+
+/* Return the currently open log file handle (NULL if no log file is open). */
+XTOpenFilePtr XTDataSeqRead::sl_seq_open_file()
+{
+	return sl_log_file;
+}
+
+/* Return the log ID and log offset of the current record position. */
+void XTDataSeqRead::sl_seq_pos(xtLogID *log_id, xtLogOffset *log_offset)
+{
+	*log_id = sl_rec_log_id;
+	*log_offset = sl_rec_log_offset;
+}
+
+/*
+ * Position the reader at log_offset in the data log log_id.
+ *
+ * If the reader is currently on a different log, the old file is closed,
+ * the buffer is marked empty and the new log is opened. When missing_ok
+ * is set the log is opened with XT_FS_MISSING_OK; the code below allows
+ * for sl_log_file still being NULL after a successful open.
+ *
+ * Returns FAILED if the log could not be opened, otherwise OK.
+ */
+xtBool XTDataSeqRead::sl_seq_start(xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok)
+{
+	if (sl_rec_log_id != log_id) {
+		if (sl_log_file) {
+			xt_close_file_ns(sl_log_file);
+			sl_log_file  = NULL;
+		}
+
+		/* NOTE(review): the log ID is recorded before the open is attempted,
+		 * so after a failed open a second call with the same ID skips this
+		 * block and returns OK with sl_log_file == NULL - confirm this is intended.
+		 */
+		sl_rec_log_id = log_id;
+		/* This is still the offset of the previous position. The buffer
+		 * length is set to zero, so the next read refills the buffer anyway.
+		 */
+		sl_buf_log_offset = sl_rec_log_offset;
+		sl_buffer_len = 0;
+
+		if (!sl_db->db_datalogs.dlc_open_log(&sl_log_file, log_id, missing_ok ? XT_FS_MISSING_OK : XT_FS_DEFAULT))
+			return FAILED;
+		if (sl_log_file)
+			sl_log_eof = xt_seek_eof_file(NULL, sl_log_file);
+	}
+	/* sl_seq_next() adds sl_record_len to the offset, so start with zero: */
+	sl_rec_log_offset = log_offset;
+	sl_record_len = 0;
+	return OK;
+}
+
+/*
+ * Read up to size bytes at log_offset of the open log file into buffer.
+ * The number of bytes read is returned in *data_read. If no log file is
+ * open, nothing is read: *data_read is set to zero and OK is returned.
+ */
+xtBool XTDataSeqRead::sl_rnd_read(xtLogOffset log_offset, size_t size, xtWord1 *buffer, size_t *data_read, struct XTThread *thread)
+{
+	if (sl_log_file)
+		return xt_pread_file(sl_log_file, log_offset, size, 0, buffer, data_read, &thread->st_statistics.st_data, thread);
+
+	/* Without a file there is no data: */
+	*data_read = 0;
+	return OK;
+}
+
+/*
+ * Unlike the transaction log sequential reader, this function only returns
+ * the header of a record.
+ *
+ * {SKIP-GAPS}
+ * This function now skips gaps. This should not be required, because in normal
+ * operation, no gaps should be created.
+ *
+ * However, if this happens there is a danger that a valid record after the
+ * gap will be lost.
+ *
+ * So, if we find an invalid record, we scan through the log to find the next
+ * valid record. Note that there is still a danger that we will find
+ * data that looks like a valid record, but is not.
+ *
+ * In this case, this "pseudo record" may cause the function to actually skip
+ * valid records.
+ *
+ * Note, any such malfunction will eventually cause the record to be lost forever
+ * after the garbage collector has run.
+ */
+xtBool XTDataSeqRead::sl_seq_next(XTXactLogBufferDPtr *ret_entry, struct XTThread *thread)
+{
+	XTXactLogBufferDPtr	record;
+	size_t				tfer;
+	size_t				len = 0;			/* Number of bytes that must be in the buffer. */
+	size_t				rec_offset;			/* Offset of the record in the buffer. */
+	size_t				max_rec_len;		/* Bytes in the buffer from the record start. */
+	xtBool				reread_from_buffer;
+	xtWord4				size;				/* Data size of an extended record (not buffered). */
+	xtLogOffset			gap_start = 0;		/* Start of the gap being skipped, 0 = no gap. */
+
+	/* Go to the next record (sl_record_len must be initialized
+	 * to 0 for this to work).
+	 */
+	retry:
+	sl_rec_log_offset += sl_record_len;
+	sl_record_len = 0;
+
+	if (sl_rec_log_offset < sl_buf_log_offset ||
+		sl_rec_log_offset >= sl_buf_log_offset + (xtLogOffset) sl_buffer_len) {
+		/* The current position is nowhere near the buffer, read data into the
+		 * buffer:
+		 */
+		tfer = sl_buffer_size;
+		if (!sl_rnd_read(sl_rec_log_offset, tfer, sl_buffer, &tfer, thread))
+			return FAILED;
+		sl_buf_log_offset = sl_rec_log_offset;
+		sl_buffer_len = tfer;
+
+		/* Should we go to the next log? */
+		if (!tfer)
+			goto return_empty;
+	}
+
+	/* The start of the record is in the buffer: */
+	read_from_buffer:
+	rec_offset = (size_t) (sl_rec_log_offset - sl_buf_log_offset);
+	max_rec_len = sl_buffer_len - rec_offset;
+	reread_from_buffer = FALSE;
+	size = 0;
+
+	/* Check the type of record: */
+	record = (XTXactLogBufferDPtr) (sl_buffer + rec_offset);
+	switch (record->xl.xl_status_1) {
+		case XT_LOG_ENT_HEADER:
+			/* A header is only accepted at the very start of the log: */
+			if (sl_rec_log_offset != 0)
+				goto scan_to_next_record;
+			/* The size field itself must be in the buffer before it is read: */
+			if (offsetof(XTXactLogHeaderDRec, xh_size_4) + 4 > max_rec_len) {
+				reread_from_buffer = TRUE;
+				goto read_more;
+			}
+			len = XT_GET_DISK_4(record->xh.xh_size_4);
+			if (len > max_rec_len) {
+				reread_from_buffer = TRUE;
+				goto read_more;
+			}
+
+			/* A header that does not verify ends the scan: */
+			if (record->xh.xh_checksum_1 != XT_CHECKSUM_1(sl_rec_log_id))
+				goto return_empty;
+			if (XT_LOG_HEAD_MAGIC(record, len) != XT_LOG_FILE_MAGIC)
+				goto return_empty;
+			/* The log ID is only checked if the header is long enough to contain it: */
+			if (len > offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4) {
+				if (XT_GET_DISK_4(record->xh.xh_log_id_4) != sl_rec_log_id)
+					goto return_empty;
+			}
+			break;
+		case XT_LOG_ENT_EXT_REC_OK:
+		case XT_LOG_ENT_EXT_REC_DEL:
+			/* NOTE(review): the end of the gap is reported here, before the
+			 * record has been verified below - confirm this is intended.
+			 */
+			if (gap_start) {
+				xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap in data log %lu, start: %llu, size: %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start, (u_llong) (sl_rec_log_offset - gap_start));
+				gap_start = 0;
+			}
+			/* Only the record header (up to er_data) needs to be in the buffer: */
+			len = offsetof(XTactExtRecEntryDRec, er_data);
+			if (len > max_rec_len) {
+				reread_from_buffer = TRUE;
+				goto read_more;
+			}
+			size = XT_GET_DISK_4(record->er.er_data_size_4);
+			/* Verify the record as good as we can! */
+			if (!size)
+				goto scan_to_next_record;
+			if (sl_rec_log_offset + (xtLogOffset) offsetof(XTactExtRecEntryDRec, er_data) + size > sl_log_eof)
+				goto scan_to_next_record;
+			if (!XT_GET_DISK_4(record->er.er_tab_id_4))
+				goto scan_to_next_record;
+			if (!XT_GET_DISK_4(record->er.er_rec_id_4))
+				goto scan_to_next_record;
+			break;
+		default:
+			/* Note, we no longer assume EOF.
+			 * Instead, we skip to the next valid record. */
+			goto scan_to_next_record;
+	}
+
+	if (len <= max_rec_len) {
+		/* The record is completely in the buffer: */
+		/* The next call skips the header and the record data (len+size). */
+		sl_record_len = len+size;
+		*ret_entry = record;
+		return OK;
+	}
+	
+	read_more:
+	/* The record is partially in the buffer. */
+	/* Move the partial record to the start of the buffer: */
+	memmove(sl_buffer, sl_buffer + rec_offset, max_rec_len);
+	sl_buf_log_offset += rec_offset;
+	sl_buffer_len = max_rec_len;
+
+	/* Read the rest, as far as possible: */
+	tfer = sl_buffer_size - max_rec_len;
+	if (!sl_rnd_read(sl_buf_log_offset + max_rec_len, tfer, sl_buffer + max_rec_len, &tfer, thread))
+		return FAILED;
+	sl_buffer_len += tfer;
+
+	if (sl_buffer_len < len)
+		/* A partial record is in the log, must be the end of the log: */
+		goto return_empty;
+
+	/* The record type is checked again, now that more data is available: */
+	if (reread_from_buffer)
+		goto read_from_buffer;
+
+	/* The record is not completely in the buffer: */
+	sl_record_len = len;
+	*ret_entry = (XTXactLogBufferDPtr) sl_buffer;
+	return OK;
+
+	scan_to_next_record:
+	/* NOTE(review): gap_start == 0 also means "no gap", so a gap that begins
+	 * at offset 0 is logged twice and then recorded as starting at offset 1.
+	 */
+	if (!gap_start) {
+		gap_start = sl_rec_log_offset;
+		xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap found in data log %lu, starting at offset %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start);
+	}
+	/* Step forward one byte, and count the byte as garbage: */
+	sl_record_len = 1;
+	sl_extra_garbage++;
+	goto retry;
+
+	return_empty:
+	/* No more records: OK is returned with *ret_entry set to NULL. */
+	if (gap_start) {
+		xt_logf(XT_NS_CONTEXT, XT_LOG_WARNING, "Gap in data log %lu, start: %llu, size: %llu\n", (u_long) sl_rec_log_id, (u_llong) gap_start, (u_llong) (sl_rec_log_offset - gap_start));
+		gap_start = 0;
+	}
+	*ret_entry = NULL;
+	return OK;
+}
+
+/*
+ * Extend the length of the current record by size bytes, so that the
+ * next call to sl_seq_next() also steps over these bytes.
+ */
+void XTDataSeqRead::sl_seq_skip(size_t size)
+{
+	sl_record_len += size;
+}
+
+/*
+ * Make the next call to sl_seq_next() continue at log_offset, by setting
+ * the current record length accordingly. An offset before the current
+ * record position is ignored (the reader never moves backwards here).
+ */
+void XTDataSeqRead::sl_seq_skip_to(off_t log_offset)
+{
+	if (log_offset >= sl_rec_log_offset)
+		sl_record_len = (size_t) (log_offset - sl_rec_log_offset);
+}
+
+/*
+ * --------------------------------------------------------------------------------
+ * STATIC UTILITIES
+ */
+
+/*
+ * Write a complete, new header at the start of the data log file 'of',
+ * and flush the file. The header describes a log that contains nothing but
+ * the header itself: no free space, and a file length equal to the header size.
+ *
+ * Returns FAILED if the write or the flush fails, otherwise OK.
+ */
+static xtBool dl_create_log_header(XTDataLogFilePtr data_log, XTOpenFilePtr of, XTThreadPtr thread)
+{
+	XTXactLogHeaderDRec	head;
+
+	/* Start from an all-zero header, then fill in the fields: */
+	memset(&head, 0, sizeof(head));
+	head.xh_status_1 = XT_LOG_ENT_HEADER;
+	head.xh_checksum_1 = XT_CHECKSUM_1(data_log->dlf_log_id);
+	XT_SET_DISK_4(head.xh_size_4, sizeof(XTXactLogHeaderDRec));
+	XT_SET_DISK_4(head.xh_log_id_4, data_log->dlf_log_id);
+	XT_SET_DISK_2(head.xh_version_2, XT_LOG_VERSION_NO);
+	XT_SET_DISK_4(head.xh_magic_4, XT_LOG_FILE_MAGIC);
+	XT_SET_DISK_8(head.xh_free_space_8, 0);
+	XT_SET_DISK_8(head.xh_file_len_8, sizeof(XTXactLogHeaderDRec));
+
+	/* The header goes to offset zero, and must reach the disk: */
+	if (!xt_pwrite_file(of, 0, sizeof(XTXactLogHeaderDRec), &head, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	return xt_flush_file(of, &thread->st_statistics.st_data, thread) ? OK : FAILED;
+}
+
+/*
+ * Update the garbage count (xh_free_space_8) in the header of the data
+ * log file 'of' with the current value of data_log->dlf_garbage_count.
+ * The file is only flushed if 'flush' is set.
+ *
+ * Returns FAILED if the write or the flush fails, otherwise OK.
+ */
+static xtBool dl_write_garbage_level(XTDataLogFilePtr data_log, XTOpenFilePtr of, xtBool flush, XTThreadPtr thread)
+{
+	XTXactLogHeaderDRec	header;
+
+	/* Only the free space field of the local header is set, and
+	 * only these 8 bytes are written to the file:
+	 */
+	XT_SET_DISK_8(header.xh_free_space_8, data_log->dlf_garbage_count);
+	if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_free_space_8), 8, (xtWord1 *) &header.xh_free_space_8, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	if (flush && !xt_flush_file(of, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	return OK;
+}
+
+/*
+ * {SKIP-GAPS}
+ * Extra garbage is the amount of space skipped during recovery of the data
+ * log file. We assume this space has not been counted as garbage,
+ * and add it to the garbage count.
+ *
+ * This may mean that our estimate of garbage is higher than it should
+ * be, but that is better than the other way around.
+ *
+ * The fact is, there should not be any gaps in the data log files, so
+ * this is actually an exception which should not occur.
+ */
+/*
+ * Update the file length (xh_file_len_8) in the header of the data log
+ * file 'of' with data_log->dlf_log_eof, and flush the file.
+ *
+ * If extra_garbage is non-zero it is first added to
+ * data_log->dlf_garbage_count (which is limited to dlf_log_eof), and the
+ * garbage count is written to the header as well.
+ *
+ * Returns FAILED if a write or the flush fails, otherwise OK.
+ */
+static xtBool dl_write_log_header(XTDataLogFilePtr data_log, XTOpenFilePtr of, xtLogOffset extra_garbage, XTThreadPtr thread)
+{
+	XTXactLogHeaderDRec	header;
+
+	XT_SET_DISK_8(header.xh_file_len_8, data_log->dlf_log_eof);
+
+	if (extra_garbage) {
+		data_log->dlf_garbage_count += extra_garbage;
+		/* The garbage count is never allowed to exceed the size of the log: */
+		if (data_log->dlf_garbage_count > data_log->dlf_log_eof)
+			data_log->dlf_garbage_count = data_log->dlf_log_eof;
+		XT_SET_DISK_8(header.xh_free_space_8, data_log->dlf_garbage_count);
+		/* One write of 16 bytes, starting at xh_free_space_8. This presumably
+		 * relies on xh_file_len_8 directly following xh_free_space_8 in the
+		 * header structure, so that both fields are written - TODO confirm.
+		 */
+		if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_free_space_8), 16, (xtWord1 *) &header.xh_free_space_8, &thread->st_statistics.st_data, thread))
+			return FAILED;
+	}
+	else {
+		/* Only the file length has changed: */
+		if (!xt_pwrite_file(of, offsetof(XTXactLogHeaderDRec, xh_file_len_8), 8, (xtWord1 *) &header.xh_file_len_8, &thread->st_statistics.st_data, thread))
+			return FAILED;
+	}
+	if (!xt_flush_file(of, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	return OK;
+}
+
+/*
+ * Cleanup function for a sequential reader (registered with pushr_() in
+ * dl_recover_log()): releases the file and buffer of the reader.
+ */
+static void dl_free_seq_read(XTThreadPtr self __attribute__((unused)), XTDataSeqReadPtr seq_read)
+{
+	seq_read->sl_seq_exit();
+}
+
+/*
+ * Scan a data log file from the start, in order to set up the in-memory
+ * description of the log (data_log):
+ *
+ * - The garbage count and start offset are taken from the log header.
+ * - The end of the log (dlf_log_eof) is the position reached by the scan.
+ *
+ * If the log is shorter than a header, a new header is created.
+ * Otherwise the header is updated with the file length, and with
+ * any garbage skipped during the scan (see {SKIP-GAPS}).
+ *
+ * Throws an error if the reader cannot be set up, the log cannot be
+ * opened or read, or the header cannot be written.
+ */
+static void dl_recover_log(XTThreadPtr self, XTDatabaseHPtr db, XTDataLogFilePtr data_log)
+{
+	XTDataSeqReadRec	seq_read;
+	XTXactLogBufferDPtr	record;
+
+	if (!seq_read.sl_seq_init(db, xt_db_log_buffer_size))
+		xt_throw(self);
+	pushr_(dl_free_seq_read, &seq_read);
+
+	/* The log is opened with missing_ok == FALSE, so a failure here is
+	 * an error. If it is ignored, the code below will write the
+	 * header using a NULL file handle.
+	 */
+	if (!seq_read.sl_seq_start(data_log->dlf_log_id, 0, FALSE))
+		xt_throw(self);
+
+	for (;;) {
+		if (!seq_read.sl_seq_next(&record, self))
+			xt_throw(self);
+		if (!record)
+			break;
+		switch (record->xh.xh_status_1) {
+			case XT_LOG_ENT_HEADER:
+				data_log->dlf_garbage_count = XT_GET_DISK_8(record->xh.xh_free_space_8);
+				data_log->dlf_start_offset = XT_GET_DISK_8(record->xh.xh_comp_pos_8);
+				/* Continue the scan at the end of file position stored in the header: */
+				seq_read.sl_seq_skip_to((off_t) XT_GET_DISK_8(record->xh.xh_file_len_8));
+				break;
+		}
+	}
+
+	ASSERT_NS(seq_read.sl_log_eof == seq_read.sl_rec_log_offset);
+	data_log->dlf_log_eof = seq_read.sl_rec_log_offset;
+
+	if (data_log->dlf_log_eof < (off_t) sizeof(XTXactLogHeaderDRec)) {
+		/* The header was not completely written, so write a new one: */
+		data_log->dlf_log_eof = sizeof(XTXactLogHeaderDRec);
+		if (!dl_create_log_header(data_log, seq_read.sl_log_file, self))
+			xt_throw(self);
+	}
+	else {
+		if (!dl_write_log_header(data_log, seq_read.sl_log_file, seq_read.sl_extra_garbage, self))
+			xt_throw(self);
+	}
+
+	freer_(); // dl_free_seq_read(&seq_read)
+}
+
+/*
+ * --------------------------------------------------------------------------------
+ * D A T A   L O G   C A C H E
+ */
+
+/*
+ * Take the log out of the sorted list that belongs to its current state.
+ * Only four states have a list; in any other state nothing is done.
+ */
+void XTDataLogCache::dls_remove_log(XTDataLogFilePtr data_log)
+{
+	xtLogID key = data_log->dlf_log_id;
+
+	if (data_log->dlf_state == XT_DL_HAS_SPACE)
+		xt_sl_delete(NULL, dlc_has_space, &key);
+	else if (data_log->dlf_state == XT_DL_TO_COMPACT)
+		xt_sl_delete(NULL, dlc_to_compact, &key);
+	else if (data_log->dlf_state == XT_DL_TO_DELETE)
+		xt_sl_delete(NULL, dlc_to_delete, &key);
+	else if (data_log->dlf_state == XT_DL_DELETED)
+		xt_sl_delete(NULL, dlc_deleted, &key);
+}
+
+/*
+ * Work out the state of a log from its contents: a log with too much
+ * garbage must be compacted, otherwise the log either still has
+ * space for writing, or it is read-only.
+ */
+int XTDataLogCache::dls_get_log_state(XTDataLogFilePtr data_log)
+{
+	if (!data_log->dlf_to_much_garbage())
+		return data_log->dlf_space_avaliable() > 0 ? XT_DL_HAS_SPACE : XT_DL_READ_ONLY;
+	return XT_DL_TO_COMPACT;
+}
+
+/*
+ * Change the state of a data log, and move the log to the sorted list
+ * that belongs to the new state. The whole operation is done while
+ * holding dlc_lock.
+ *
+ * Two values of 'state' are requests rather than states:
+ * - XT_DL_MAY_COMPACT: set the log to XT_DL_TO_COMPACT, but only if the log
+ *   is currently unknown, has space, or is read-only. Otherwise nothing is done.
+ * - XT_DL_UNKNOWN: the state is calculated using dls_get_log_state().
+ *
+ * Returns FAILED if the log could not be added to a list, otherwise OK.
+ */
+xtBool XTDataLogCache::dls_set_log_state(XTDataLogFilePtr data_log, int state)
+{
+	xtLogID log_id = data_log->dlf_log_id;
+
+	xt_lock_mutex_ns(&dlc_lock);
+	if (state == XT_DL_MAY_COMPACT) {
+		if (data_log->dlf_state != XT_DL_UNKNOWN &&
+			data_log->dlf_state != XT_DL_HAS_SPACE &&
+			data_log->dlf_state != XT_DL_READ_ONLY)
+			goto ok;
+		state = XT_DL_TO_COMPACT;
+	}
+	if (state == XT_DL_UNKNOWN)
+		state = dls_get_log_state(data_log);
+	/* In each case: if the state changes, remove the log from its old
+	 * list, and add it to the list of the new state (if there is one).
+	 *
+	 * NOTE(review): if the insert fails, the log has already been removed
+	 * from the old list, but dlf_state is left unchanged.
+	 */
+	switch (state) {
+		case XT_DL_HAS_SPACE:
+			if (data_log->dlf_state != XT_DL_HAS_SPACE) {
+				dls_remove_log(data_log);
+				if (!xt_sl_insert(NULL, dlc_has_space, &log_id, &log_id))
+					goto failed;
+			}
+			break;
+		case XT_DL_TO_COMPACT:
+#ifdef DEBUG_LOG_DELETE
+			printf("-- set to compact: %d\n", (int) log_id);
+#endif
+			if (data_log->dlf_state != XT_DL_TO_COMPACT) {
+				dls_remove_log(data_log);
+				if (!xt_sl_insert(NULL, dlc_to_compact, &log_id, &log_id))
+					goto failed;
+			}
+			/* The wake-up is done even if the log was already on the list: */
+			dl_wake_co_thread(dlc_db);
+			break;
+		case XT_DL_COMPACTED:
+#ifdef DEBUG_LOG_DELETE
+			printf("-- set compacted: %d\n", (int) log_id);
+#endif
+			/* There is no list for this state: */
+			if (data_log->dlf_state != state)
+				dls_remove_log(data_log);
+			break;
+		case XT_DL_TO_DELETE:
+#ifdef DEBUG_LOG_DELETE
+			printf("-- set to delete log: %d\n", (int) log_id);
+#endif
+			if (data_log->dlf_state != XT_DL_TO_DELETE) {
+				dls_remove_log(data_log);
+				if (!xt_sl_insert(NULL, dlc_to_delete, &log_id, &log_id))
+					goto failed;
+			}
+			break;
+		case XT_DL_DELETED:
+#ifdef DEBUG_LOG_DELETE
+			printf("-- set DELETED log: %d\n", (int) log_id);
+#endif
+			if (data_log->dlf_state != XT_DL_DELETED) {
+				dls_remove_log(data_log);
+				if (!xt_sl_insert(NULL, dlc_deleted, &log_id, &log_id))
+					goto failed;
+			}
+			break;
+		default:
+			/* All other states have no list: */
+			if (data_log->dlf_state != state)
+				dls_remove_log(data_log);
+			break;
+	}
+	data_log->dlf_state = state;
+
+	ok:
+	xt_unlock_mutex_ns(&dlc_lock);
+	return OK;
+
+	failed:
+	xt_unlock_mutex_ns(&dlc_lock);
+	return FAILED;
+}
+
+/*
+ * Compare function for the sorted lists of log IDs: a and b point to
+ * log IDs. Returns -1, 0 or 1 if the ID at a is less than, equal to, or
+ * greater than the ID at b.
+ */
+static int dl_cmp_log_id(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	const xtLogID	lhs = *((xtLogID *) a);
+	const xtLogID	rhs = *((xtLogID *) b);
+
+	if (lhs < rhs)
+		return -1;
+	return lhs == rhs ? 0 : 1;
+}
+
+/*
+ * Initialize the data log cache of a database: the locks, conditions and
+ * state lists are created, and then every data log file found in the
+ * data directory of the database is recovered and given a state.
+ *
+ * On error, the cache is cleaned up using dlc_exit(), and the error
+ * is thrown again.
+ */
+void XTDataLogCache::dlc_init(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTOpenDirPtr		od;
+	char				log_dir[PATH_MAX];
+	char				*file;
+	xtLogID				log_id;
+	XTDataLogFilePtr	data_log= NULL;
+
+	/* Everything starts as zero/NULL: */
+	memset(this, 0, sizeof(XTDataLogCacheRec));
+	dlc_db = db;
+	try_(a) {
+		xt_init_mutex_with_autoname(self, &dlc_lock);
+		xt_init_cond(self, &dlc_cond);
+		for (u_int i=0; i<XT_DL_NO_OF_SEGMENTS; i++) {
+			xt_init_mutex_with_autoname(self, &dlc_segment[i].dls_lock);
+			xt_init_cond(self, &dlc_segment[i].dls_cond);
+		}
+		/* One list of log IDs for each state that has a list: */
+		dlc_has_space = xt_new_sortedlist(self, sizeof(xtLogID), 20, 10, dl_cmp_log_id, NULL, NULL, FALSE, FALSE);
+		dlc_to_compact = xt_new_sortedlist(self, sizeof(xtLogID), 20, 10, dl_cmp_log_id, NULL, NULL, FALSE, FALSE);
+		dlc_to_delete = xt_new_sortedlist(self, sizeof(xtLogID), 20, 10, dl_cmp_log_id, NULL, NULL, FALSE, FALSE);
+		dlc_deleted = xt_new_sortedlist(self, sizeof(xtLogID), 20, 10, dl_cmp_log_id, NULL, NULL, FALSE, FALSE);
+		xt_init_mutex_with_autoname(self, &dlc_mru_lock);
+		xt_init_mutex_with_autoname(self, &dlc_head_lock);
+
+		/* Scan the data directory for data log files: */
+		xt_strcpy(PATH_MAX, log_dir, dlc_db->db_main_path);
+		xt_add_data_dir(PATH_MAX, log_dir);
+		if (xt_fs_exists(log_dir)) {
+			pushsr_(od, xt_dir_close, xt_dir_open(self, log_dir, NULL));
+			while (xt_dir_next(self, od)) {
+				file = xt_dir_name(self, od);
+				if (xt_ends_with(file, ".xt")) {
+					/* Files for which no (non-zero) ID can be derived from the name are ignored: */
+					if ((log_id = (xtLogID) xt_file_name_to_id(file))) {
+						if (!dlc_get_data_log(&data_log, log_id, TRUE, NULL))
+							xt_throw(self);
+						dl_recover_log(self, db, data_log);
+						/* XT_DL_UNKNOWN: the state is calculated from the recovered values. */
+						if (!dls_set_log_state(data_log, XT_DL_UNKNOWN))
+							xt_throw(self);
+					}
+				}
+			}
+			freer_();
+		}
+	}
+	catch_(a) {
+		/* NOTE(review): dlc_exit() frees all mutexes and conditions, including
+		 * any that were not yet initialized when the error occurred (they
+		 * are zeroed by the memset above) - confirm this is safe.
+		 */
+		dlc_exit(self);
+		xt_throw(self);
+	}
+	cont_(a);
+}
+
+/*
+ * Free all resources of the data log cache: the state lists, all
+ * data log structures in the hash tables (including the files they
+ * hold open), and the locks and conditions.
+ */
+void XTDataLogCache::dlc_exit(XTThreadPtr self)
+{
+	XTDataLogFilePtr	data_log, tmp_data_log;
+	XTOpenLogFilePtr	open_log, tmp_open_log;
+
+	if (dlc_has_space) {
+		xt_free_sortedlist(self, dlc_has_space);
+		dlc_has_space = NULL;
+	}
+	if (dlc_to_compact) {
+		xt_free_sortedlist(self, dlc_to_compact);
+		dlc_to_compact = NULL;
+	}
+	if (dlc_to_delete) {
+		xt_free_sortedlist(self, dlc_to_delete);
+		dlc_to_delete = NULL;
+	}
+	if (dlc_deleted) {
+		xt_free_sortedlist(self, dlc_deleted);
+		dlc_deleted = NULL;
+	}
+	for (u_int i=0; i<XT_DL_NO_OF_SEGMENTS; i++) {
+		for (u_int j=0; j<XT_DL_SEG_HASH_TABLE_SIZE; j++) {
+			/* Free every log on this hash chain: */
+			data_log = dlc_segment[i].dls_hash_table[j];
+			while (data_log) {
+				/* The file opened for exclusive use: */
+				if (data_log->dlf_log_file) {
+					xt_close_file_ns(data_log->dlf_log_file);
+					data_log->dlf_log_file = NULL;
+				}
+
+				/* The pooled open files on the free list of the log.
+				 * NOTE(review): only open files on the free list are freed here;
+				 * presumably none are in use when this is called - TODO confirm.
+				 */
+				open_log = data_log->dlf_free_list;
+				while (open_log) {
+					if (open_log->odl_log_file)
+						xt_close_file(self, open_log->odl_log_file);
+					/* Get the next pointer before the structure is freed: */
+					tmp_open_log = open_log;
+					open_log = open_log->odl_next_free;
+					xt_free(self, tmp_open_log);
+				}
+				tmp_data_log = data_log;
+				data_log = data_log->dlf_next_hash;
+
+				xt_free(self, tmp_data_log);
+			}
+		}
+		xt_free_mutex(&dlc_segment[i].dls_lock);
+		xt_free_cond(&dlc_segment[i].dls_cond);
+	}
+	xt_free_mutex(&dlc_head_lock);
+	xt_free_mutex(&dlc_mru_lock);
+	xt_free_mutex(&dlc_lock);
+	xt_free_cond(&dlc_cond);
+}
+
+/*
+ * Build the path of the data log file with the given ID in 'path'
+ * (a buffer of 'size' bytes): the main path of the database, followed by
+ * the data directory, and the file name "dlog-<log_id>.xt".
+ */
+void XTDataLogCache::dlc_name(size_t size, char *path, xtLogID log_id)
+{
+	char file_name[50];
+
+	/* The directory part, with a trailing directory character: */
+	xt_strcpy(size, path, dlc_db->db_main_path);
+	xt_add_data_dir(size, path);
+	xt_add_dir_char(size, path);
+
+	/* The file name part: */
+	sprintf(file_name, "dlog-%lu.xt", (u_long) log_id);
+	xt_strcat(size, path, file_name);
+}
+
+/*
+ * Open the data log file with the given ID, using the file system
+ * open mode 'mode' (XT_FS_...). The file handle is returned in *fh.
+ * The result is the result of xt_open_file_ns().
+ */
+xtBool XTDataLogCache::dlc_open_log(XTOpenFilePtr *fh, xtLogID log_id, int mode)
+{
+	char log_path[PATH_MAX];
+
+	dlc_name(PATH_MAX, log_path, log_id);
+	return xt_open_file_ns(fh, log_path, mode);
+}
+
+/*
+ * Give up a log: the file held by the log is closed (if open),
+ * and the state of the log is calculated again (XT_DL_UNKNOWN), which
+ * places the log on the list that matches its contents.
+ *
+ * Returns the result of dls_set_log_state().
+ */
+xtBool XTDataLogCache::dlc_unlock_log(XTDataLogFilePtr data_log)
+{
+	if (data_log->dlf_log_file) {
+		xt_close_file_ns(data_log->dlf_log_file);
+		data_log->dlf_log_file = NULL;
+	}
+
+	return dls_set_log_state(data_log, XT_DL_UNKNOWN);
+}
+
+/*
+ * Return a data log, opened for exclusive use by the caller, with at
+ * least space_required bytes available.
+ *
+ * The logs on the "has space" list are tried first. If none of them has
+ * enough space, a new log is created using the next free log ID.
+ * The state of the log returned is set to XT_DL_EXCLUSIVE, and the
+ * log file is open (dlf_log_file).
+ *
+ * Returns NULL on failure. The whole operation is done while holding dlc_lock.
+ */
+XTDataLogFilePtr XTDataLogCache::dlc_get_log_for_writing(off_t space_required, struct XTThread *thread)
+{
+	xtLogID				log_id, *log_id_ptr = NULL;
+	size_t				size;
+	size_t				idx;
+	XTDataLogFilePtr	data_log = NULL;
+
+	xt_lock_mutex_ns(&dlc_lock);
+
+	/* Look for an existing log with enough space: */
+	size = xt_sl_get_size(dlc_has_space);
+	for (idx=0; idx<size; idx++) {
+		log_id_ptr = (xtLogID *) xt_sl_item_at(dlc_has_space, idx);
+		if (!dlc_get_data_log(&data_log, *log_id_ptr, FALSE, NULL))
+			goto failed;
+		if (data_log) {
+			if (data_log->dlf_space_avaliable() >= space_required)
+				break;
+			data_log = NULL;
+		}
+		else {
+			/* The list refers to a log that is not in the cache,
+			 * remove the entry, and look at the same index again:
+			 */
+			ASSERT_NS(FALSE);
+			xt_sl_delete_item_at(NULL, dlc_has_space, idx);
+			idx--;
+			size--;
+		}
+	}
+
+	if (data_log) {
+		/* Found a log: */
+		if (!dlc_open_log(&data_log->dlf_log_file, *log_id_ptr, XT_FS_DEFAULT))
+			goto failed;
+		/* log_id_ptr points into the list, so delete the item after use: */
+		xt_sl_delete_item_at(NULL, dlc_has_space, idx);
+	}
+	else {
+		/* Create a new log: */
+		/* Search for a log ID that is not in use, starting after the last ID used: */
+		log_id = dlc_next_log_id;
+		for (u_int i=0; i<XT_DL_MAX_LOG_ID; i++) {
+			log_id++;
+			if (log_id > XT_DL_MAX_LOG_ID)
+				log_id = 1;
+			if (!dlc_get_data_log(&data_log, log_id, FALSE, NULL))
+				goto failed;
+			if (!data_log)
+				break;
+		}
+		dlc_next_log_id = log_id;
+		if (data_log) {
+			/* All log IDs are in use: */
+			xt_register_ulxterr(XT_REG_CONTEXT, XT_ERR_LOG_MAX_EXCEEDED, (u_long) XT_DL_MAX_LOG_ID);
+			goto failed;
+		}
+		if (!dlc_get_data_log(&data_log, log_id, TRUE, NULL))
+			goto failed;
+		if (!dlc_open_log(&data_log->dlf_log_file, log_id, XT_FS_CREATE | XT_FS_MAKE_PATH))
+			goto failed;
+		data_log->dlf_log_eof = sizeof(XTXactLogHeaderDRec);
+		if (!dl_create_log_header(data_log, data_log->dlf_log_file, thread)) {
+			xt_close_file_ns(data_log->dlf_log_file);
+			/* The log structure remains in the cache, so it must not
+			 * keep a reference to the closed file (the file would
+			 * otherwise be closed again when the log is removed).
+			 */
+			data_log->dlf_log_file = NULL;
+			goto failed;
+		}
+		/* By setting this late we ensure that the error
+		 * will be repeated.
+		 */
+		dlc_next_log_id = log_id;
+	}
+	data_log->dlf_state = XT_DL_EXCLUSIVE;
+
+	xt_unlock_mutex_ns(&dlc_lock);
+	return data_log;
+
+	failed:
+	xt_unlock_mutex_ns(&dlc_lock);
+	return NULL;
+}
+
+/*
+ * Find the data log with the given ID in the cache. The result is
+ * returned in *lf, which is set to NULL if the log was not found
+ * (and 'create' is not set).
+ *
+ * If 'create' is set, a log that is not found is added to the cache.
+ *
+ * If ret_seg is not NULL, the segment of the log is returned in *ret_seg,
+ * and the segment remains locked: the caller must unlock it.
+ *
+ * Returns FAILED only if a new log structure could not be allocated.
+ */
+xtBool XTDataLogCache::dlc_get_data_log(XTDataLogFilePtr *lf, xtLogID log_id, xtBool create, XTDataLogSegPtr *ret_seg)
+{
+	/* The ID selects the segment, and the slot in the segment's hash table: */
+	register XTDataLogSegPtr	segment = &dlc_segment[log_id & XT_DL_SEGMENT_MASK];
+	register u_int				bucket = (log_id >> XT_DL_SEGMENT_SHIFTS) % XT_DL_SEG_HASH_TABLE_SIZE;
+	register XTDataLogFilePtr	found;
+
+	xt_lock_mutex_ns(&segment->dls_lock);
+
+	/* Search the hash chain for the log: */
+	for (found = segment->dls_hash_table[bucket]; found; found = found->dlf_next_hash) {
+		if (found->dlf_log_id == log_id)
+			break;
+	}
+
+	if (create && !found) {
+		/* Not found, add a new structure to the front of the chain: */
+		found = (XTDataLogFilePtr) xt_calloc_ns(sizeof(XTDataLogFileRec));
+		if (!found) {
+			xt_unlock_mutex_ns(&segment->dls_lock);
+			return FAILED;
+		}
+		found->dlf_log_id = log_id;
+		found->dlf_next_hash = segment->dls_hash_table[bucket];
+		segment->dls_hash_table[bucket] = found;
+	}
+
+	if (ret_seg)
+		/* This gives the caller the lock: */
+		*ret_seg = segment;
+	else
+		xt_unlock_mutex_ns(&segment->dls_lock);
+
+	*lf = found;
+	return OK;
+}
+
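+/*
+ * If just_close is FALSE, then a log is being deleted.
+ * This means that the log may still be in exclusive use by
+ * some thread. So we just close the log!
+ *
+ * (When just_close is TRUE only the files of the log are closed; the
+ * log itself is removed from the hash list and freed only when
+ * just_close is FALSE.)
+ */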
+/*
+ * Close all files of the data log with the given ID: the pooled open
+ * files on the free list of the log, and the file opened for exclusive
+ * use. If open files of the log are still in use, the function waits
+ * until they have been released.
+ *
+ * Unless just_close is set, the log is then removed from the hash
+ * list of its segment, and freed.
+ *
+ * Returns FAILED if waiting on the segment condition fails, otherwise OK
+ * (also when the log is not in the cache).
+ */
+xtBool XTDataLogCache::dlc_remove_data_log(xtLogID log_id, xtBool just_close)
+{
+	register XTDataLogSegPtr	seg;
+	register u_int				hash_idx;
+	register XTDataLogFilePtr	data_log;
+	XTOpenLogFilePtr			open_log, tmp_open_log;
+
+	/* Which segment, and hash index: */
+	seg = &dlc_segment[log_id & XT_DL_SEGMENT_MASK];
+	hash_idx = (log_id >> XT_DL_SEGMENT_SHIFTS) % XT_DL_SEG_HASH_TABLE_SIZE;
+
+	/* Lock the segment: */
+	retry:
+	xt_lock_mutex_ns(&seg->dls_lock);
+
+	/* Find the log file on the hash list: */
+	data_log = seg->dls_hash_table[hash_idx];
+	while (data_log) {
+		if (data_log->dlf_log_id == log_id)
+			break;
+		data_log = data_log->dlf_next_hash;
+	}
+
+	if (data_log) {
+		xt_lock_mutex_ns(&dlc_mru_lock);
+
+		/* Close and free all open files that are not in use: */
+		open_log = data_log->dlf_free_list;
+		while (open_log) {
+			if (open_log->odl_log_file)
+				xt_close_file_ns(open_log->odl_log_file);
+
+			/* Remove from MRU list: */
+			if (dlc_lru_open_log == open_log) {
+				dlc_lru_open_log = open_log->odl_mr_used;
+				ASSERT_NS(!open_log->odl_lr_used);
+			}
+			else if (open_log->odl_lr_used)
+				open_log->odl_lr_used->odl_mr_used = open_log->odl_mr_used;
+			if (dlc_mru_open_log == open_log) {
+				dlc_mru_open_log = open_log->odl_lr_used;
+				ASSERT_NS(!open_log->odl_mr_used);
+			}
+			else if (open_log->odl_mr_used)
+				open_log->odl_mr_used->odl_lr_used = open_log->odl_lr_used;
+
+			/* The open file is gone, so it must be removed from the count
+			 * of the log, and from the count of the pool (as is done when
+			 * the pool is trimmed in dlc_get_open_log()). Otherwise the
+			 * pool count only ever grows, and the pool appears to be full.
+			 * The pool count is changed while holding dlc_mru_lock.
+			 */
+			data_log->dlf_open_count--;
+			dlc_open_count--;
+			tmp_open_log = open_log;
+			open_log = open_log->odl_next_free;
+			xt_free_ns(tmp_open_log);
+		}
+		data_log->dlf_free_list = NULL;
+
+		xt_unlock_mutex_ns(&dlc_mru_lock);
+
+		if (data_log->dlf_open_count) {
+			/* Open files of this log are still in use. Wait for
+			 * a release (see dlc_release_open_log()), and try again:
+			 */
+			if (!xt_timed_wait_cond_ns(&seg->dls_cond, &seg->dls_lock, 2000))
+				goto failed;
+			xt_unlock_mutex_ns(&seg->dls_lock);
+			goto retry;
+		}
+
+		/* Close the exclusive file if required: */
+		if (data_log->dlf_log_file) {
+			xt_close_file_ns(data_log->dlf_log_file);
+			data_log->dlf_log_file = NULL;
+		}
+
+		if (!just_close) {
+			/* Remove the log from the hash list: */
+			XTDataLogFilePtr ptr, pptr = NULL;
+
+			ptr = seg->dls_hash_table[hash_idx];
+			while (ptr) {
+				if (ptr == data_log)
+					break;
+				pptr = ptr;
+				ptr = ptr->dlf_next_hash;
+			}
+
+			if (ptr == data_log) {
+				if (pptr)
+					pptr->dlf_next_hash = ptr->dlf_next_hash;
+				else
+					seg->dls_hash_table[hash_idx] = ptr->dlf_next_hash;
+			}
+
+			xt_free_ns(data_log);
+		}
+	}
+
+	xt_unlock_mutex_ns(&seg->dls_lock);
+	return OK;
+
+	failed:
+	xt_unlock_mutex_ns(&seg->dls_lock);
+	return FAILED;
+}
+
+/*
+ * Return an open file for the data log with the given ID in *ol.
+ * The open file is marked as in use, and must be given back
+ * using dlc_release_open_log().
+ *
+ * An open file is taken from the free list of the log if possible,
+ * otherwise the log file is opened again. A log that is not yet
+ * in the cache is added to the cache, if the file exists.
+ *
+ * Before returning, the pool of open files is reduced to 3/4 of
+ * XT_DL_LOG_POOL_SIZE if it contains more than XT_DL_LOG_POOL_SIZE files,
+ * by closing the least recently used files that are not in use.
+ *
+ * Returns FAILED if the log file does not exist, or on allocation or
+ * open failure.
+ */
+xtBool XTDataLogCache::dlc_get_open_log(XTOpenLogFilePtr *ol, xtLogID log_id)
+{
+	register XTDataLogSegPtr	seg;
+	register u_int				hash_idx;
+	register XTDataLogFilePtr	data_log;
+	register XTOpenLogFilePtr	open_log;
+	char						path[PATH_MAX];
+
+	/* Which segment, and hash index: */
+	seg = &dlc_segment[log_id & XT_DL_SEGMENT_MASK];
+	hash_idx = (log_id >> XT_DL_SEGMENT_SHIFTS) % XT_DL_SEG_HASH_TABLE_SIZE;
+
+	/* Lock the segment: */
+	xt_lock_mutex_ns(&seg->dls_lock);
+
+	/* Find the log file on the hash list: */
+	data_log = seg->dls_hash_table[hash_idx];
+	while (data_log) {
+		if (data_log->dlf_log_id == log_id)
+			break;
+		data_log = data_log->dlf_next_hash;
+	}
+
+	if (!data_log) {
+		/* Create a new log file structure: */
+		/* But only if the file of the log actually exists: */
+		dlc_name(PATH_MAX, path, log_id);
+		if (!xt_fs_exists(path)) {
+			xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_DATA_LOG_NOT_FOUND, path);
+			goto failed;
+		}
+		if (!(data_log = (XTDataLogFilePtr) xt_calloc_ns(sizeof(XTDataLogFileRec))))
+			goto failed;
+		data_log->dlf_log_id = log_id;
+		data_log->dlf_next_hash = seg->dls_hash_table[hash_idx];
+		seg->dls_hash_table[hash_idx] = data_log;
+	}
+
+	if ((open_log = data_log->dlf_free_list)) {
+		/* Remove from the free list: */
+		if ((data_log->dlf_free_list = open_log->odl_next_free))
+			data_log->dlf_free_list->odl_prev_free = NULL;
+
+		/* This file has been most recently used: */
+		/* The MRU list is only updated if the file was last used some time
+		 * ago (more than half the pool size in "uses"). The check reads
+		 * dlc_ru_now without holding dlc_mru_lock.
+		 */
+		if (XT_TIME_DIFF(open_log->odl_ru_time, dlc_ru_now) > (XT_DL_LOG_POOL_SIZE >> 1)) {
+			/* Move to the front of the MRU list: */
+			xt_lock_mutex_ns(&dlc_mru_lock);
+
+			open_log->odl_ru_time = ++dlc_ru_now;
+			if (dlc_mru_open_log != open_log) {
+				/* Remove from the MRU list: */
+				if (dlc_lru_open_log == open_log) {
+					dlc_lru_open_log = open_log->odl_mr_used;
+					ASSERT_NS(!open_log->odl_lr_used);
+				}
+				else if (open_log->odl_lr_used)
+					open_log->odl_lr_used->odl_mr_used = open_log->odl_mr_used;
+				if (open_log->odl_mr_used)
+					open_log->odl_mr_used->odl_lr_used = open_log->odl_lr_used;
+
+				/* Make the file the most recently used: */
+				if ((open_log->odl_lr_used = dlc_mru_open_log))
+					dlc_mru_open_log->odl_mr_used = open_log;
+				open_log->odl_mr_used = NULL;
+				dlc_mru_open_log = open_log;
+				if (!dlc_lru_open_log)
+					dlc_lru_open_log = open_log;
+			}
+			xt_unlock_mutex_ns(&dlc_mru_lock);
+		}
+	}
+	else {
+		/* Create a new open file: */
+		if (!(open_log = (XTOpenLogFilePtr) xt_calloc_ns(sizeof(XTOpenLogFileRec))))
+			goto failed;
+		dlc_name(PATH_MAX, path, log_id);
+		if (!xt_open_file_ns(&open_log->odl_log_file, path, XT_FS_DEFAULT)) {
+			xt_free_ns(open_log);
+			goto failed;
+		}
+		open_log->olf_log_id = log_id;
+		open_log->odl_data_log = data_log;
+		data_log->dlf_open_count++;
+
+		/* Make the new open file the most recently used: */
+		xt_lock_mutex_ns(&dlc_mru_lock);
+		open_log->odl_ru_time = ++dlc_ru_now;
+		if ((open_log->odl_lr_used = dlc_mru_open_log))
+			dlc_mru_open_log->odl_mr_used = open_log;
+		open_log->odl_mr_used = NULL;
+		dlc_mru_open_log = open_log;
+		if (!dlc_lru_open_log)
+			dlc_lru_open_log = open_log;
+		dlc_open_count++;
+		xt_unlock_mutex_ns(&dlc_mru_lock);
+	}
+
+	/* The open file now belongs to the caller: */
+	open_log->odl_in_use = TRUE;
+	xt_unlock_mutex_ns(&seg->dls_lock);
+	*ol = open_log;
+
+	/* Note: from here on no lock is held, and 'seg' is re-used for the
+	 * segment of the file to be freed. The result is OK in any case.
+	 */
+	if (dlc_open_count > XT_DL_LOG_POOL_SIZE) {
+		u_int	target = XT_DL_LOG_POOL_SIZE / 4 * 3;
+		xtLogID	free_log_id;
+
+		/* Remove some open files: */
+		while (dlc_open_count > target) {
+			XTOpenLogFilePtr to_free = dlc_lru_open_log;
+
+			/* Stop if the least recently used file is in use: */
+			if (!to_free || to_free->odl_in_use)
+				break;
+
+			/* Dirty read the file ID: */
+			free_log_id = to_free->olf_log_id;
+
+			seg = &dlc_segment[free_log_id & XT_DL_SEGMENT_MASK];
+
+			/* Lock the segment: */
+			xt_lock_mutex_ns(&seg->dls_lock);
+
+			/* Lock the MRU list: */
+			xt_lock_mutex_ns(&dlc_mru_lock);
+
+			/* Check if we have the same open file: */
+			/* (The values above were read without a lock, so check again.) */
+			if (dlc_lru_open_log == to_free && !to_free->odl_in_use) {
+				data_log = to_free->odl_data_log;
+		
+				/* Remove from the MRU list: */
+				dlc_lru_open_log = to_free->odl_mr_used;
+				ASSERT_NS(!to_free->odl_lr_used);
+
+				if (dlc_mru_open_log == to_free) {
+					dlc_mru_open_log = to_free->odl_lr_used;
+					ASSERT_NS(!to_free->odl_mr_used);
+				}
+				else if (to_free->odl_mr_used)
+					to_free->odl_mr_used->odl_lr_used = to_free->odl_lr_used;
+
+				/* Remove from the free list of the file: */
+				if (data_log->dlf_free_list == to_free) {
+					data_log->dlf_free_list = to_free->odl_next_free;
+					ASSERT_NS(!to_free->odl_prev_free);
+				}
+				else if (to_free->odl_prev_free)
+					to_free->odl_prev_free->odl_next_free = to_free->odl_next_free;
+				if (to_free->odl_next_free)
+					to_free->odl_next_free->odl_prev_free = to_free->odl_prev_free;
+				ASSERT_NS(data_log->dlf_open_count > 0);
+				data_log->dlf_open_count--;
+				dlc_open_count--;
+			}
+			else
+				/* Something changed, nothing is freed on this pass: */
+				to_free = NULL;
+
+			xt_unlock_mutex_ns(&dlc_mru_lock);
+			xt_unlock_mutex_ns(&seg->dls_lock);
+
+			/* The file is closed after the locks have been released: */
+			if (to_free) {
+				xt_close_file_ns(to_free->odl_log_file);
+				xt_free_ns(to_free);
+			}
+		}
+	}
+
+	return OK;
+
+	failed:
+	xt_unlock_mutex_ns(&seg->dls_lock);
+	return FAILED;
+}
+
+/*
+ * Give back an open file returned by dlc_get_open_log(): the open file
+ * is added to the front of the free list of its log, and is marked
+ * as no longer in use. Threads waiting on the condition of the segment
+ * are woken up.
+ */
+void XTDataLogCache::dlc_release_open_log(XTOpenLogFilePtr open_log)
+{
+	register XTDataLogSegPtr	seg;
+	register XTDataLogFilePtr	data_log = open_log->odl_data_log;
+
+	/* Which segment: */
+	seg = &dlc_segment[open_log->olf_log_id & XT_DL_SEGMENT_MASK];
+
+	xt_lock_mutex_ns(&seg->dls_lock);
+	open_log->odl_next_free = data_log->dlf_free_list;
+	open_log->odl_prev_free = NULL;
+	if (data_log->dlf_free_list)
+		data_log->dlf_free_list->odl_prev_free = open_log;
+	data_log->dlf_free_list = open_log;
+	open_log->odl_in_use = FALSE;
+
+	/* Wakeup any exclusive lockers: */
+	/* A failure to signal is logged, but otherwise ignored: */
+	if (!xt_broadcast_cond_ns(&seg->dls_cond))
+		xt_log_and_clear_exception_ns();
+
+	xt_unlock_mutex_ns(&seg->dls_lock);
+}
+
+/*
+ * --------------------------------------------------------------------------------
+ * D A T A   L O G   F I L E
+ */
+
+/*
+ * Number of bytes left before this log reaches the data log threshold
+ * (xt_db_data_log_threshold), or 0 if the EOF is at or past it.
+ */
+off_t XTDataLogFile::dlf_space_avaliable()
+{
+	return dlf_log_eof < xt_db_data_log_threshold ? xt_db_data_log_threshold - dlf_log_eof : 0;
+}
+
+/*
+ * TRUE if the garbage in this log, as a percentage of the log's EOF,
+ * has reached xt_db_garbage_threshold. An empty log (EOF of zero) never
+ * has too much garbage; testing that first also avoids dividing by zero.
+ */
+xtBool XTDataLogFile::dlf_to_much_garbage()
+{
+	return dlf_log_eof != 0 &&
+		xt_db_garbage_threshold <= dlf_garbage_count * 100 / dlf_log_eof;
+}
+
+/*
+ * --------------------------------------------------------------------------------
+ * D A T A   L O G   B U F F E R
+ */
+/*
+ * Attach this buffer object to a database and record the size the write
+ * buffer should have. No memory is allocated here (dlb_get_log_offset()
+ * allocates dlb_log_buffer on demand). The assertions check that the
+ * object is still in its unused state.
+ */
+void XTDataLogBuffer::dlb_init(XTDatabaseHPtr db, size_t buffer_size)
+{
+	ASSERT_NS(!dlb_db);
+	ASSERT_NS(!dlb_buffer_size);
+	ASSERT_NS(!dlb_data_log);
+	ASSERT_NS(!dlb_log_buffer);
+	dlb_db = db;
+	dlb_buffer_size = buffer_size;
+}
+/*
+ * Shut the buffer down: close the current write log, free the buffer
+ * memory and reset the bookkeeping fields.
+ *
+ * Note that the result of dlb_close_log() is not checked here, and that
+ * dlb_data_log is only cleared by dlb_close_log() when it succeeds.
+ */
+void XTDataLogBuffer::dlb_exit(XTThreadPtr self)
+{
+	dlb_close_log(self);
+	if (dlb_log_buffer) {
+		xt_free(self, dlb_log_buffer);
+		dlb_log_buffer = NULL;
+	}
+	dlb_db = NULL;
+	dlb_buffer_offset = 0;
+	dlb_buffer_size = 0;
+	dlb_buffer_len = 0;
+	dlb_flush_required = FALSE;
+#ifdef DEBUG
+	dlb_max_write_offset = 0;
+#endif
+}
+
+/*
+ * Detach from the current write log, if there is one: write the log
+ * header (only when the log file is open), flush and commit buffered
+ * data, then unlock the log in the data log cache.
+ *
+ * Returns OK on success or when no log was attached. Returns FAILED as
+ * soon as one of the steps fails; dlb_data_log is then left unchanged.
+ */
+xtBool XTDataLogBuffer::dlb_close_log(XTThreadPtr thread)
+{
+	/* Nothing attached, nothing to close: */
+	if (!dlb_data_log)
+		return OK;
+
+	if (dlb_data_log->dlf_log_file &&
+		!dl_write_log_header(dlb_data_log, dlb_data_log->dlf_log_file, 0, thread))
+		return FAILED;
+
+	/* Flush and commit the data in the old log: */
+	if (!dlb_flush_log(TRUE, thread))
+		return FAILED;
+
+	if (!dlb_db->db_datalogs.dlc_unlock_log(dlb_data_log))
+		return FAILED;
+
+	dlb_data_log = NULL;
+	return OK;
+}
+
+/*
+ * Report where the next record will be written: the ID of the current
+ * write log and its EOF. If there is no current log, or the current one
+ * has no space left, the old log is closed and a new one is obtained
+ * first (the write buffer is allocated on first use).
+ *
+ * The parameter is called 'thread' instead of 'self' to indicate that
+ * this function does not throw an error; it returns FAILED instead.
+ */
+xtBool XTDataLogBuffer::dlb_get_log_offset(xtLogID *log_id, xtLogOffset *out_offset, size_t XT_UNUSED(req_size), struct XTThread *thread)
+{
+	xtBool need_new_log;
+
+	/* Note, a log is allowed to grow beyond the threshold: it is only
+	 * replaced once no space at all is left. The overshoot depends on
+	 * the maximum extended record size. Without this, some logs would
+	 * never fill up, because of only having a few more bytes available.
+	 */
+	need_new_log = !dlb_data_log || dlb_data_log->dlf_space_avaliable() == 0;
+
+	if (need_new_log) {
+		/* Release the old log: */
+		if (!dlb_close_log(thread))
+			return FAILED;
+
+		if (!dlb_log_buffer) {
+			dlb_log_buffer = (xtWord1 *) xt_malloc_ns(dlb_buffer_size);
+			if (!dlb_log_buffer)
+				return FAILED;
+		}
+
+		/* req_size could be used instead of 1, but this would mean
+		 * some logs are never filled up.
+		 */
+		dlb_data_log = dlb_db->db_datalogs.dlc_get_log_for_writing(1, thread);
+		if (!dlb_data_log)
+			return FAILED;
+#ifdef DEBUG
+		dlb_max_write_offset = dlb_data_log->dlf_log_eof;
+#endif
+	}
+
+	*out_offset = dlb_data_log->dlf_log_eof;
+	*log_id = dlb_data_log->dlf_log_id;
+	return OK;
+}
+/*
+ * Write any buffered data to the current log file and, if 'commit' is
+ * TRUE and something has been written since the last flush, call
+ * xt_flush_file() on the file.
+ *
+ * Returns OK immediately when no log is attached or the log has no open
+ * file. Returns FAILED if the write or the flush fails.
+ */
+xtBool XTDataLogBuffer::dlb_flush_log(xtBool commit, XTThreadPtr thread)
+{
+	if (!dlb_data_log || !dlb_data_log->dlf_log_file)
+		return OK;
+
+	if (dlb_buffer_len) {
+		if (!xt_pwrite_file(dlb_data_log->dlf_log_file, dlb_buffer_offset, dlb_buffer_len, dlb_log_buffer, &thread->st_statistics.st_data, thread))
+			return FAILED;
+#ifdef DEBUG
+		if (dlb_buffer_offset + (xtLogOffset) dlb_buffer_len > dlb_max_write_offset)
+			dlb_max_write_offset = dlb_buffer_offset + (xtLogOffset) dlb_buffer_len;
+#endif
+		dlb_buffer_len = 0;
+		dlb_flush_required = TRUE;
+	}
+
+	if (commit && dlb_flush_required) {
+#ifdef DEBUG
+		/* This would normally be equal, however, in the case
+		 * where some other thread flushes the compactor's
+		 * data log, the eof can be greater than the
+		 * write offset.
+		 *
+		 * This occurs because the flush can come between the
+		 * dlb_get_log_offset() and dlb_write_thru_log() calls.
+		 */
+		ASSERT_NS(dlb_data_log->dlf_log_eof >= dlb_max_write_offset);
+#endif
+		if (!xt_flush_file(dlb_data_log->dlf_log_file, &thread->st_statistics.st_data, thread))
+			return FAILED;
+		dlb_flush_required = FALSE;
+	}
+	return OK;
+}
+
+/*
+ * Write 'size' bytes from 'data' directly to the current write log at
+ * 'log_offset', bypassing the buffer. Any data still sitting in the
+ * buffer is written out first.
+ *
+ * On success the log's EOF is advanced by 'size' and a flush is marked
+ * as required. Returns FAILED (without touching the EOF) if writing out
+ * the buffer or the write itself fails.
+ */
+xtBool XTDataLogBuffer::dlb_write_thru_log(xtLogID XT_NDEBUG_UNUSED(log_id), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	ASSERT_NS(log_id == dlb_data_log->dlf_log_id);
+
+	/* Empty the buffer before writing behind it. A failure here must not
+	 * be ignored (dlb_append_log() checks it as well).
+	 */
+	if (dlb_buffer_len) {
+		if (!dlb_flush_log(FALSE, thread))
+			return FAILED;
+	}
+
+	if (!xt_pwrite_file(dlb_data_log->dlf_log_file, log_offset, size, data, &thread->st_statistics.st_data, thread))
+		return FAILED;
+	/* Increment of dlb_data_log->dlf_log_eof was moved here from dlb_get_log_offset()
+	 * to ensure it is done after a successful update of the log, otherwise a
+	 * gap occurs in the log which causes eof to be detected in the middle of the log.
+	 */
+	dlb_data_log->dlf_log_eof += size;
+#ifdef DEBUG
+	if (log_offset + (xtLogOffset) size > (xtLogOffset) dlb_max_write_offset)
+		dlb_max_write_offset = log_offset + size;
+#endif
+	dlb_flush_required = TRUE;
+	return OK;
+}
+
+/*
+ * Append 'size' bytes from 'data' to the current write log at
+ * 'log_offset', going through the buffer where possible:
+ *
+ *  - the data continues the buffered range and fits: copy it in;
+ *  - otherwise write out the buffer, and if the data fits into the empty
+ *    buffer, start a new buffered range with it;
+ *  - otherwise write the data directly to the file.
+ *
+ * The log's EOF is advanced by 'size' in every successful case.
+ */
+xtBool XTDataLogBuffer::dlb_append_log(xtLogID XT_NDEBUG_UNUSED(log_id), xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	ASSERT_NS(log_id == dlb_data_log->dlf_log_id);
+
+	if (dlb_buffer_len) {
+		xtLogOffset buffered_end = dlb_buffer_offset + (xtLogOffset) dlb_buffer_len;
+
+		/* Should be the case, we only write by appending: */
+		ASSERT_NS(buffered_end == log_offset);
+
+		/* Continues the buffered data, and there is room for it? */
+		if (buffered_end == log_offset && dlb_buffer_len + size <= dlb_buffer_size) {
+			memcpy(dlb_log_buffer + dlb_buffer_len, data, size);
+			dlb_buffer_len += size;
+			dlb_data_log->dlf_log_eof += size;
+			return OK;
+		}
+
+		if (dlb_flush_log(FALSE, thread) != OK)
+			return FAILED;
+	}
+
+	ASSERT_NS(dlb_buffer_len == 0);
+
+	if (size <= dlb_buffer_size) {
+		/* Start a new buffered range: */
+		memcpy(dlb_log_buffer, data, size);
+		dlb_buffer_offset = log_offset;
+		dlb_buffer_len = size;
+		dlb_data_log->dlf_log_eof += size;
+		return OK;
+	}
+
+	/* Too big for the buffer, write directly: */
+	if (!xt_pwrite_file(dlb_data_log->dlf_log_file, log_offset, size, data, &thread->st_statistics.st_data, thread))
+		return FAILED;
+#ifdef DEBUG
+	if (log_offset + (xtLogOffset) size > (xtLogOffset) dlb_max_write_offset)
+		dlb_max_write_offset = log_offset + size;
+#endif
+	dlb_data_log->dlf_log_eof += size;
+	dlb_flush_required = TRUE;
+	return OK;
+}
+
+/*
+ * Read 'size' bytes at 'log_offset' of log 'log_id' into 'data'.
+ *
+ * If the log is some other log than the current write log, it is opened
+ * through the data log cache, read, and released again; a short read is
+ * padded with zeros. If it is the current write log, the data is taken
+ * from the write buffer when it lies in the buffered range, and from
+ * the log file otherwise.
+ */
+xtBool XTDataLogBuffer::dlb_read_log(xtLogID log_id, xtLogOffset log_offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	if (!dlb_data_log || log_id != dlb_data_log->dlf_log_id) {
+		/* Read from some other log: */
+		XTOpenLogFilePtr	other_log;
+		size_t				bytes_read;
+		xtBool				read_ok;
+
+		if (!dlb_db->db_datalogs.dlc_get_open_log(&other_log, log_id))
+			return FAILED;
+
+		read_ok = xt_pread_file(other_log->odl_log_file, log_offset, size, 0, data, &bytes_read, &thread->st_statistics.st_data, thread);
+
+		/* The handle goes back whether or not the read worked: */
+		dlb_db->db_datalogs.dlc_release_open_log(other_log);
+		if (!read_ok)
+			return FAILED;
+
+		if (bytes_read < size)
+			memset(data + bytes_read, 0, size - bytes_read);
+		return OK;
+	}
+
+	/* Reading from the write log, this can be done quicker: */
+	if (dlb_buffer_len) {
+		/* If it is in the buffer, then it is completely in the buffer. */
+		if (log_offset >= dlb_buffer_offset) {
+			if (log_offset + (xtLogOffset) size <= dlb_buffer_offset + (xtLogOffset) dlb_buffer_len) {
+				memcpy(data, dlb_log_buffer + (log_offset - dlb_buffer_offset), size);
+				return OK;
+			}
+			/* Should not happen, reading past EOF: */
+			ASSERT_NS(FALSE);
+			memset(data, 0, size);
+			return OK;
+		}
+		/* In the write log, but not in the buffer: must lie
+		 * completely before the buffer, because only whole
+		 * records are written to the log.
+		 */
+		ASSERT_NS(log_offset + (xtLogOffset) size <= dlb_buffer_offset);
+	}
+	return xt_pread_file(dlb_data_log->dlf_log_file, log_offset, size, size, data, NULL, &thread->st_statistics.st_data, thread);
+}
+
+/*
+ * Mark the extended record at (log_id, log_offset) as deleted and add
+ * its size (header plus 'size' data bytes) to the log's garbage count.
+ *
+ * We assume that the given reference may not be valid.
+ * Only valid references actually cause a delete: the record header is
+ * read back and its status, data size, table ID and record ID must
+ * match the arguments. If they do not, XT_ERR_BAD_EXT_RECORD is
+ * registered and FAILED is returned; it is up to the caller to ignore
+ * that. A record that is already marked deleted returns OK.
+ *
+ * For a log other than the current write log, the log state is set to
+ * XT_DL_MAY_COMPACT when the garbage level has become too high.
+ *
+ * Note this routine does not lock the compactor.
+ * This can lead to some incorrect calculation of the
+ * amount of garbage. But nothing serious I think.
+ */
+xtBool XTDataLogBuffer::dlb_delete_log(xtLogID log_id, xtLogOffset log_offset, size_t size, xtTableID tab_id, xtRecordID rec_id, XTThreadPtr thread)
+{
+	XTactExtRecEntryDRec	record;
+	xtWord1					status = XT_LOG_ENT_EXT_REC_DEL;
+	XTOpenLogFilePtr		open_log;
+	xtBool					to_much_garbage;
+	XTDataLogFilePtr		data_log;
+
+	if (!dlb_read_log(log_id, log_offset, offsetof(XTactExtRecEntryDRec, er_data), (xtWord1 *) &record, thread))
+		return FAILED;
+
+	/* Already deleted: */
+	if (record.er_status_1 == XT_LOG_ENT_EXT_REC_DEL)
+		return OK;
+
+	if (record.er_status_1 != XT_LOG_ENT_EXT_REC_OK ||
+		size != XT_GET_DISK_4(record.er_data_size_4) ||
+		tab_id != XT_GET_DISK_4(record.er_tab_id_4) ||
+		rec_id != XT_GET_DISK_4(record.er_rec_id_4)) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_EXT_RECORD);
+		return FAILED;
+	}
+
+	if (dlb_data_log && log_id == dlb_data_log->dlf_log_id) {
+		/* Writing to the write log, I can do this quicker: */
+		if (dlb_buffer_len) {
+			/* If it is in the buffer, then it is completely in the buffer. */
+			if (log_offset >= dlb_buffer_offset) {
+				if (log_offset + 1 <= dlb_buffer_offset + (xtLogOffset) dlb_buffer_len) {
+					*(dlb_log_buffer + (log_offset - dlb_buffer_offset)) = XT_LOG_ENT_EXT_REC_DEL;
+					goto inc_garbage_count;
+				}
+				/* Should not happen, writing past EOF: */
+				ASSERT_NS(FALSE);
+				return OK;
+			}
+			ASSERT_NS(log_offset + (xtLogOffset) size <= dlb_buffer_offset);
+		}
+
+		if (!xt_pwrite_file(dlb_data_log->dlf_log_file, log_offset, 1, &status, &thread->st_statistics.st_data, thread))
+			return FAILED;
+		
+		inc_garbage_count:
+		xt_lock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+		dlb_data_log->dlf_garbage_count += offsetof(XTactExtRecEntryDRec, er_data) + size;
+		ASSERT_NS(dlb_data_log->dlf_garbage_count < dlb_data_log->dlf_log_eof);
+		if (!dl_write_garbage_level(dlb_data_log, dlb_data_log->dlf_log_file, FALSE, thread)) {
+			xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+			return FAILED;
+		}
+		dlb_flush_required = TRUE;
+		xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+		return OK;
+	}
+
+	/* Write to some other log, open the log: */
+	if (!dlb_db->db_datalogs.dlc_get_open_log(&open_log, log_id))
+		return FAILED;
+
+	/* Write the status byte: */
+	if (!xt_pwrite_file(open_log->odl_log_file, log_offset, 1, &status, &thread->st_statistics.st_data, thread))
+		goto failed;
+
+	data_log = open_log->odl_data_log;
+
+	/* Adjust the garbage level in the header. */
+	xt_lock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+	data_log->dlf_garbage_count += offsetof(XTactExtRecEntryDRec, er_data) + size;
+	ASSERT_NS(data_log->dlf_garbage_count < data_log->dlf_log_eof);
+	if (!dl_write_garbage_level(data_log, open_log->odl_log_file, FALSE, thread)) {
+		xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+		goto failed;
+	}
+	to_much_garbage = data_log->dlf_to_much_garbage();
+	xt_unlock_mutex_ns(&dlb_db->db_datalogs.dlc_head_lock);
+
+	if (to_much_garbage &&
+		(data_log->dlf_state == XT_DL_HAS_SPACE || data_log->dlf_state == XT_DL_READ_ONLY)) {
+		/* There is too much garbage, it may be compacted. */
+		if (!dlb_db->db_datalogs.dls_set_log_state(data_log, XT_DL_MAY_COMPACT))
+			goto failed;
+	}
+
+	/* Release the open log: */
+	dlb_db->db_datalogs.dlc_release_open_log(open_log);
+	
+	return OK;
+
+	failed:
+	dlb_db->db_datalogs.dlc_release_open_log(open_log);
+	return FAILED;
+}
+
+/*
+ * Delete all the extended data belonging to a particular
+ * table.
+ *
+ * The record file of the table is read page by page. For every record
+ * that references a data log, the referenced extended record is marked
+ * as deleted with dlb_delete_log(). Failures to delete an individual
+ * record do not stop the scan: they are logged and cleared, except for
+ * XT_ERR_BAD_EXT_RECORD and XT_ERR_DATA_LOG_NOT_FOUND, which are
+ * neither logged nor cleared here.
+ *
+ * If the table cannot be opened because a file is missing, the function
+ * returns without doing anything. Any other open failure, and a failure
+ * to read a page, is thrown.
+ */
+xtPublic void xt_dl_delete_ext_data(XTThreadPtr self, XTTableHPtr tab, xtBool XT_UNUSED(missing_ok), xtBool have_table_lock)
+{
+	XTOpenTablePtr	ot;
+	xtRecordID		page_rec_id, offs_rec_id;
+	XTTabRecExtDPtr	rec_buf;
+	xtWord4			log_over_size;
+	xtLogID			log_id;
+	xtLogOffset		log_offset;
+	xtWord1			*page_data;
+
+	page_data = (xtWord1 *) xt_malloc(self, tab->tab_recs.tci_page_size);
+	pushr_(xt_free, page_data);
+
+	/* Scan the table, and remove all extended data... */
+	if (!(ot = xt_open_table(tab))) {
+		if (self->t_exception.e_xt_err == XT_SYSTEM_ERROR &&
+			XT_FILE_NOT_FOUND(self->t_exception.e_sys_err)) {
+			/* The page buffer pushed above must be released before
+			 * returning, otherwise it stays on the resource stack of
+			 * the thread and the buffer is leaked.
+			 */
+			freer_(); // xt_free(page_data)
+			return;
+		}
+		xt_throw(self);
+	}
+	ot->ot_thread = self;
+
+	/* {LOCK-EXT-REC} This lock is to stop the compactor changing records 
+	 * while we are doing the delete.
+	 */
+	xt_lock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+
+	page_rec_id = 1;
+	while (page_rec_id < tab->tab_rec_eof_id) {
+		/* NOTE: There is a good reason for using xt_tc_read_page().
+		 * A deadlock can occur if using read, which can run out of
+		 * memory, which waits for the freeer, which may need to
+		 * open a table, which requires the db->db_tables lock,
+		 * which is owned by this thread, when the function
+		 * is called from drop table.
+		 *
+		 * xt_tc_read_page() should work because no more changes
+		 * should happen to the table while we are dropping it.
+		 */
+		if (!tab->tab_recs.xt_tc_read_page(ot->ot_rec_file, page_rec_id, page_data, self))
+			goto failed;
+
+		for (offs_rec_id=0; offs_rec_id<tab->tab_recs.tci_rows_per_page && page_rec_id+offs_rec_id < tab->tab_rec_eof_id; offs_rec_id++) {
+			rec_buf = (XTTabRecExtDPtr) (page_data + (offs_rec_id * tab->tab_recs.tci_rec_size));
+			if (XT_REC_IS_EXT_DLOG(rec_buf->tr_rec_type_1)) {
+				log_over_size = XT_GET_DISK_4(rec_buf->re_log_dat_siz_4);
+				XT_GET_LOG_REF(log_id, log_offset, rec_buf);
+
+				if (!self->st_dlog_buf.dlb_delete_log(log_id, log_offset, log_over_size, tab->tab_id, page_rec_id+offs_rec_id, self)) {
+					if (self->t_exception.e_xt_err != XT_ERR_BAD_EXT_RECORD &&
+						self->t_exception.e_xt_err != XT_ERR_DATA_LOG_NOT_FOUND)
+						xt_log_and_clear_exception(self);
+				}
+			}
+		}
+
+		page_rec_id += tab->tab_recs.tci_rows_per_page;
+	}
+
+	xt_unlock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+
+	xt_close_table(ot, TRUE, have_table_lock);
+	
+	freer_(); // xt_free(page_data)
+	return;
+	
+	failed:
+	xt_unlock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+
+	xt_close_table(ot, TRUE, have_table_lock);
+	/* page_data is still on the resource stack at this point. */
+	xt_throw(self);
+}
+
+/*
+ * --------------------------------------------------------------------------------
+ * GARBAGE COLLECTOR THREAD
+ */
+/*
+ * Create the two compactor mutexes of the database: db_co_ext_lock and
+ * db_co_dlog_lock. They are freed again by xt_dl_exit_db().
+ */
+xtPublic void xt_dl_init_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xt_init_mutex_with_autoname(self, &db->db_co_ext_lock);
+	xt_init_mutex_with_autoname(self, &db->db_co_dlog_lock);
+}
+/*
+ * Counterpart of xt_dl_init_db(): stop the compactor thread (per the
+ * note below this has normally been done by the caller already), clear
+ * the thread pointer and free the two compactor mutexes.
+ */
+xtPublic void xt_dl_exit_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xt_stop_compactor(self, db);	// Already done!
+	db->db_co_thread = NULL;
+	xt_free_mutex(&db->db_co_ext_lock);
+	xt_free_mutex(&db->db_co_dlog_lock);
+}
+
+/*
+ * Set the state of data log 'log_id' to XT_DL_TO_DELETE. If the lookup
+ * succeeds but yields no log, nothing is done. A failed lookup or a
+ * failed state change is thrown.
+ */
+xtPublic void xt_dl_set_to_delete(XTThreadPtr self, XTDatabaseHPtr db, xtLogID log_id)
+{
+	XTDataLogFilePtr log_to_delete;
+
+	if (!db->db_datalogs.dlc_get_data_log(&log_to_delete, log_id, FALSE, NULL))
+		xt_throw(self);
+
+	/* No such log, nothing to mark: */
+	if (!log_to_delete)
+		return;
+
+	if (!db->db_datalogs.dls_set_log_state(log_to_delete, XT_DL_TO_DELETE))
+		xt_throw(self);
+}
+/*
+ * Append a status line for every data log of the database to 'strbuf':
+ * log ID, state, EOF, garbage count, garbage percentage and the number
+ * of open handles.
+ *
+ * The log IDs are first collected into a sorted list by walking the hash
+ * table of every segment. Each log is then looked up again by ID with
+ * dlc_get_data_log(), so that the lines appear in log ID order.
+ *
+ * NOTE(review): the hash tables are walked without taking the segment
+ * locks. The lookup is passed &seg and seg->dls_lock is unlocked after
+ * each line, so dlc_get_data_log() presumably returns with the segment
+ * locked - confirm against its definition.
+ */
+xtPublic void xt_dl_log_status(XTThreadPtr self, XTDatabaseHPtr db, XTStringBufferPtr strbuf)
+{
+	XTSortedListPtr		list;
+	XTDataLogFilePtr	data_log;
+	XTDataLogSegPtr		seg;
+	u_int				no_of_logs;
+	xtLogID				*log_id_ptr;
+
+	list = xt_new_sortedlist(self, sizeof(xtLogID), 20, 10, dl_cmp_log_id, NULL, NULL, FALSE, FALSE);
+	pushr_(xt_free_sortedlist, list);
+
+	for (u_int i=0; i<XT_DL_NO_OF_SEGMENTS; i++) {
+		for (u_int j=0; j<XT_DL_SEG_HASH_TABLE_SIZE; j++) {
+			seg = &db->db_datalogs.dlc_segment[i];
+			data_log = seg->dls_hash_table[j];
+			while (data_log) {
+				xt_sl_insert(self, list, &data_log->dlf_log_id, &data_log->dlf_log_id);
+				data_log = data_log->dlf_next_hash;
+			}
+		}
+	}
+
+	no_of_logs = xt_sl_get_size(list);
+	for (u_int i=0; i<no_of_logs; i++) {
+		log_id_ptr = (xtLogID *) xt_sl_item_at(list, i);
+		if (!db->db_datalogs.dlc_get_data_log(&data_log, *log_id_ptr, FALSE, &seg))
+			xt_throw(self);
+		if (data_log) {
+			xt_sb_concat(self, strbuf, "d-log: ");
+			xt_sb_concat_int8(self, strbuf, data_log->dlf_log_id);
+			xt_sb_concat(self, strbuf, " status=");
+			switch (data_log->dlf_state) {
+				case XT_DL_UNKNOWN:
+					xt_sb_concat(self, strbuf, "?");
+					break;
+				case XT_DL_HAS_SPACE:
+					xt_sb_concat(self, strbuf, "has-space ");
+					break;
+				case XT_DL_READ_ONLY:
+					xt_sb_concat(self, strbuf, "read-only ");
+					break;
+				case XT_DL_TO_COMPACT:
+					xt_sb_concat(self, strbuf, "to-compact");
+					break;
+				case XT_DL_COMPACTED:
+					xt_sb_concat(self, strbuf, "compacted ");
+					break;
+				case XT_DL_TO_DELETE:
+					xt_sb_concat(self, strbuf, "to-delete ");
+					break;
+				case XT_DL_DELETED:
+					xt_sb_concat(self, strbuf, "deleted   ");
+					break;
+				case XT_DL_EXCLUSIVE:
+					xt_sb_concat(self, strbuf, "x-locked  ");
+					break;
+			}
+			xt_sb_concat(self, strbuf, " eof=");
+			xt_sb_concat_int8(self, strbuf, data_log->dlf_log_eof);
+			xt_sb_concat(self, strbuf, " garbage=");
+			xt_sb_concat_int8(self, strbuf, data_log->dlf_garbage_count);
+			xt_sb_concat(self, strbuf, " g%=");
+			if (data_log->dlf_log_eof)
+				xt_sb_concat_int8(self, strbuf, data_log->dlf_garbage_count * 100 / data_log->dlf_log_eof);
+			else
+				xt_sb_concat(self, strbuf, "100");
+			xt_sb_concat(self, strbuf, " open=");
+			xt_sb_concat_int8(self, strbuf, data_log->dlf_open_count);
+			xt_sb_concat(self, strbuf, "\n");
+		}
+		xt_unlock_mutex_ns(&seg->dls_lock);
+	}
+
+	freer_(); // xt_free_sortedlist(list)
+}
+/*
+ * Remove the data logs of a database together with their directory.
+ *
+ * Does nothing if the data directory does not exist. Otherwise, for
+ * every directory entry: if the file name yields a log ID, the log is
+ * removed from the data log cache (a failure is logged and cleared);
+ * if the name ends with ".xt", the file is deleted. Finally the
+ * directory itself is removed; a failure to do so is logged and
+ * cleared, not thrown.
+ */
+xtPublic void xt_dl_delete_logs(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char			path[PATH_MAX];
+	XTOpenDirPtr	od;
+	char			*file;
+	xtLogID			log_id;
+
+	xt_strcpy(PATH_MAX, path, db->db_main_path);
+	xt_add_data_dir(PATH_MAX, path);
+	if (!xt_fs_exists(path))
+		return;
+	pushsr_(od, xt_dir_close, xt_dir_open(self, path, NULL));
+	while (xt_dir_next(self, od)) {
+		file = xt_dir_name(self, od);
+		if ((log_id = (xtLogID) xt_file_name_to_id(file))) {
+			if (!db->db_datalogs.dlc_remove_data_log(log_id, TRUE))
+				xt_log_and_clear_exception(self);
+		}
+		if (xt_ends_with(file, ".xt")) {
+			xt_add_dir_char(PATH_MAX, path);
+			xt_strcat(PATH_MAX, path, file);
+			xt_fs_delete(self, path);
+			xt_remove_last_name_of_path(path);
+		}
+	}
+	freer_(); // xt_dir_close(od)
+
+	/* I no longer attach the condition: !db->db_multi_path
+	 * to removing this directory. This is because
+	 * the pbxt directory must now be removed explicitly
+	 * by drop database, or by deleting all the PBXT
+	 * system tables.
+	 */
+	if (!xt_fs_rmdir(NULL, path))
+		xt_log_and_clear_exception(self);
+}
+
+typedef struct XTCompactorState {
+	XTSeqLogReadPtr			cs_seqread;		/* Sequential reader over the source log; created and deleted by the compactor. */
+	XTOpenTablePtr			cs_ot;			/* Pool table currently held open by the compactor, or NULL. */
+	XTDataBufferRec			cs_databuf;		/* Buffer the record being moved is read into. */
+} XTCompactorStateRec, *XTCompactorStatePtr;
+/*
+ * Release everything held by a compactor state: close and delete the
+ * sequential reader, return the open table to the pool and set the
+ * size of the data buffer to 0. Used as a resource-stack callback by
+ * dl_collect_garbage().
+ */
+static void dl_free_compactor_state(XTThreadPtr self, XTCompactorStatePtr cs)
+{
+	if (cs->cs_seqread) {
+		cs->cs_seqread->sl_seq_exit();
+		delete cs->cs_seqread;
+		cs->cs_seqread = NULL;
+	}
+	if (cs->cs_ot) {
+		xt_db_return_table_to_pool(self, cs->cs_ot);
+		cs->cs_ot = NULL;
+	}
+	xt_db_set_size(self, &cs->cs_databuf, 0);
+}
+
+/*
+ * Return an open table for 'tab_id', reusing the one cached in the
+ * compactor state when it is for the same table. A cached table for a
+ * different table is returned to the pool first. Returns NULL (and
+ * leaves cs->cs_ot NULL) if the table cannot be opened.
+ */
+static XTOpenTablePtr dl_cs_get_open_table(XTThreadPtr self, XTCompactorStatePtr cs, xtTableID tab_id)
+{
+	XTOpenTablePtr cached = cs->cs_ot;
+
+	if (cached) {
+		/* Same table as last time? */
+		if (cached->ot_table->tab_id == tab_id)
+			return cached;
+
+		xt_db_return_table_to_pool(self, cached);
+		cs->cs_ot = NULL;
+	}
+
+	/* Nothing is cached at this point, open the table requested: */
+	cs->cs_ot = xt_db_open_pool_table(self, self->st_database, tab_id, NULL, TRUE);
+	return cs->cs_ot;
+}
+/*
+ * Wait on the compactor condition (dlc_cond) with a timeout derived
+ * from 'secs' (passed on as secs * 1000, presumably milliseconds).
+ * Does not wait at all if the thread has already been asked to quit.
+ */
+static void dl_co_wait(XTThreadPtr self, XTDatabaseHPtr db, u_int secs)
+{
+	xt_lock_mutex(self, &db->db_datalogs.dlc_lock);
+	pushr_(xt_unlock_mutex, &db->db_datalogs.dlc_lock);
+	if (!self->t_quit)
+		xt_timed_wait_cond(self, &db->db_datalogs.dlc_cond, &db->db_datalogs.dlc_lock, secs * 1000);
+	freer_(); // xt_unlock_mutex(&db->db_datalogs.dlc_lock)
+}
+
+/*
+ * Collect all the garbage in a file by moving all valid records
+ * into some other data log and updating the handles.
+ *
+ * The source log is read sequentially, starting at dlf_start_offset.
+ * Every XT_LOG_ENT_EXT_REC_OK entry whose table record still refers to
+ * exactly this position (same log, offset and size) is copied to the
+ * destination log of this thread, and the table record is re-pointed
+ * with an XT_LOG_ENT_REC_MOVED operation. Once the whole log has been
+ * read, the function waits until no table has tab_co_op_seq ahead of
+ * tab_head_op_seq, sets the log state to XT_DL_COMPACTED and writes an
+ * XT_LOG_ENT_DEL_LOG entry to the transaction log.
+ *
+ * Returns OK when the log has been processed completely, and FAILED if
+ * the thread was asked to quit (self->t_quit) before that. Errors are
+ * thrown.
+ */
+static xtBool dl_collect_garbage(XTThreadPtr self, XTDatabaseHPtr db, XTDataLogFilePtr data_log)
+{
+	XTXactLogBufferDPtr	record;
+	size_t				size;
+	xtTableID			tab_id;
+	xtRecordID			rec_id;
+	XTCompactorStateRec	cs;
+	XTOpenTablePtr		ot;
+	XTTableHPtr			tab;
+	XTTabRecExtDRec		rec_buffer;
+	size_t				src_size;
+	xtLogID				src_log_id;
+	xtLogOffset			src_log_offset;
+	xtLogID				curr_log_id;
+	xtLogOffset			curr_log_offset;
+	xtLogID				dest_log_id = 0;
+	xtLogOffset			dest_log_offset = 0;
+	off_t				garbage_count = 0;
+
+	memset(&cs, 0, sizeof(XTCompactorStateRec));
+
+	if (!(cs.cs_seqread = new XTDataSeqRead()))
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+
+	if (!cs.cs_seqread->sl_seq_init(db, xt_db_log_buffer_size)) {
+		delete cs.cs_seqread;
+		xt_throw(self);
+	}
+	pushr_(dl_free_compactor_state, &cs);
+
+	if (!cs.cs_seqread->sl_seq_start(data_log->dlf_log_id, data_log->dlf_start_offset, FALSE))
+		xt_throw(self);
+
+	for (;;) {
+		if (self->t_quit) {
+			/* Flush the destination log: */
+			xt_lock_mutex(self, &db->db_co_dlog_lock);
+			pushr_(xt_unlock_mutex, &db->db_co_dlog_lock);
+			if (!self->st_dlog_buf.dlb_flush_log(TRUE, self))
+				xt_throw(self);
+			freer_(); // xt_unlock_mutex(&db->db_co_dlog_lock)
+
+			/* Flush the transaction log. */
+			if (!xt_xlog_flush_log(db, self))
+				xt_throw(self);
+
+			xt_lock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+			data_log->dlf_garbage_count += garbage_count;
+			ASSERT(data_log->dlf_garbage_count < data_log->dlf_log_eof);
+			if (!dl_write_garbage_level(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
+				xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+				xt_throw(self);
+			}
+			xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+
+			freer_(); // dl_free_compactor_state(&cs)
+			return FAILED;
+		}
+		if (!cs.cs_seqread->sl_seq_next(&record, self))
+			xt_throw(self);
+		cs.cs_seqread->sl_seq_pos(&curr_log_id, &curr_log_offset);
+		if (!record) {
+			data_log->dlf_start_offset = curr_log_offset;
+			break;
+		}
+		switch (record->xh.xh_status_1) {
+			case XT_LOG_ENT_EXT_REC_OK:
+				size = XT_GET_DISK_4(record->er.er_data_size_4);
+				tab_id = XT_GET_DISK_4(record->er.er_tab_id_4);
+				rec_id = XT_GET_DISK_4(record->er.er_rec_id_4);
+				
+				if (!(ot = dl_cs_get_open_table(self, &cs, tab_id)))
+					break;
+				tab = ot->ot_table;
+				
+				/* All this is required for a valid record address: */
+				if (!rec_id || rec_id >= tab->tab_rec_eof_id)
+					break;
+
+				/* {LOCK-EXT-REC} It is important to prevent the compactor from modifying
+				 * a record that has been freed (and maybe allocated again).
+				 *
+				 * Consider the following sequence:
+				 *
+				 * 1. Compactor reads the record.
+				 * 2. The record is freed and reallocated.
+				 * 3. The compactor updates the record.
+				 *
+				 * To prevent this, the compactor locks out the
+				 * sweeper using the db_co_ext_lock lock. The db_co_ext_lock lock
+				 * prevents an extended record from being moved and removed at the
+				 * same time.
+				 *
+				 * The compactor also checks the status of the record before
+				 * moving a record.
+				 */
+				xt_lock_mutex(self, &db->db_co_ext_lock);
+				pushr_(xt_unlock_mutex, &db->db_co_ext_lock);
+
+				/* Read the record: */
+				if (!xt_tab_get_rec_data(ot, rec_id, offsetof(XTTabRecExtDRec, re_data), (xtWord1 *) &rec_buffer)) {
+					xt_log_and_clear_warning(self);
+					freer_(); // xt_unlock_mutex(&db->db_co_ext_lock)
+					break;
+				}
+
+				/* [(7)] REMOVE is followed by FREE:
+				if (XT_REC_IS_REMOVED(rec_buffer.tr_rec_type_1) || !XT_REC_IS_EXT_DLOG(rec_buffer.tr_rec_type_1)) {
+				*/
+				if (!XT_REC_IS_EXT_DLOG(rec_buffer.tr_rec_type_1)) {
+					freer_(); // xt_unlock_mutex(&db->db_co_ext_lock)
+					break;
+				}
+
+				XT_GET_LOG_REF(src_log_id, src_log_offset, &rec_buffer);
+				src_size = (size_t) XT_GET_DISK_4(rec_buffer.re_log_dat_siz_4);
+
+				/* Does the record agree with the current position: */
+				if (curr_log_id != src_log_id ||
+					curr_log_offset != src_log_offset ||
+					size != src_size) {
+					freer_(); // xt_unlock_mutex(&db->db_co_ext_lock)
+					break;
+				}
+
+				size = offsetof(XTactExtRecEntryDRec, er_data) + size;
+
+				/* Allocate space in a destination log: */
+				xt_lock_mutex(self, &db->db_co_dlog_lock);
+				pushr_(xt_unlock_mutex, &db->db_co_dlog_lock);
+				if (!self->st_dlog_buf.dlb_get_log_offset(&dest_log_id, &dest_log_offset, size, self))
+					xt_throw(self);
+				freer_(); // xt_unlock_mutex(&db->db_co_dlog_lock)
+
+				/* This record is referenced by the data: */
+				xt_db_set_size(self, &cs.cs_databuf, size);
+				if (!cs.cs_seqread->sl_rnd_read(src_log_offset, size, cs.cs_databuf.db_data, NULL, self))
+					xt_throw(self);
+
+				/* The problem with writing to the buffer here, is that other
+				 * threads want to read the data! */
+				xt_lock_mutex(self, &db->db_co_dlog_lock);
+				pushr_(xt_unlock_mutex, &db->db_co_dlog_lock);
+				if (!self->st_dlog_buf.dlb_write_thru_log(dest_log_id, dest_log_offset, size, cs.cs_databuf.db_data, self))
+					xt_throw(self);
+				freer_(); // xt_unlock_mutex(&db->db_co_dlog_lock)
+
+				/* Make sure we flush the compactor target log, before we
+				 * flush the transaction log!!
+				 * This is done here [(8)]
+				 */
+
+				XT_SET_LOG_REF(&rec_buffer, dest_log_id, dest_log_offset);
+				xtOpSeqNo op_seq;
+				if (!xt_tab_put_log_rec_data(ot, XT_LOG_ENT_REC_MOVED, 0, rec_id, 8, (xtWord1 *) &rec_buffer.re_log_id_2, &op_seq))
+					xt_throw(self);
+				tab->tab_co_op_seq = op_seq;
+
+				/* Only records that were actually moved, count as garbage now!
+				 * This means, lost records, remain "lost" as far as the garbage
+				 * count is concerned!
+				 */
+				garbage_count += size;
+				freer_(); // xt_unlock_mutex(&db->db_co_ext_lock)
+				break;
+		}
+		data_log->dlf_start_offset = curr_log_offset;
+	}
+
+	/* Flush the destination log. */
+	xt_lock_mutex(self, &db->db_co_dlog_lock);
+	pushr_(xt_unlock_mutex, &db->db_co_dlog_lock);
+	if (!self->st_dlog_buf.dlb_flush_log(TRUE, self))
+		xt_throw(self);
+	freer_(); // xt_unlock_mutex(&db->db_co_dlog_lock)
+	
+	/* Flush the transaction log. */
+	if (!xt_xlog_flush_log(db, self))
+		xt_throw(self);
+
+	/* Save state in source log header. */
+	xt_lock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+	data_log->dlf_garbage_count += garbage_count;
+	ASSERT(data_log->dlf_garbage_count < data_log->dlf_log_eof);
+	if (!dl_write_garbage_level(data_log, cs.cs_seqread->sl_seq_open_file(), TRUE, self)) {
+		xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+		xt_throw(self);
+	}
+	xt_unlock_mutex_ns(&db->db_datalogs.dlc_head_lock);
+
+	/* Wait for the writer to write all the changes.
+	 * Then we can start the delete process for the log:
+	 *
+	 * Note, if we do not wait, then it could be some operations are held up,
+	 * by being out of sequence. This could cause the log to be deleted
+	 * before all the operations have been performed (which are on a table
+	 * basis).
+	 *
+	 */
+	for (;;) {
+		u_int			edx;
+		XTTableEntryPtr tab_ptr;
+		xtBool			wait;
+
+		if (self->t_quit) {
+			freer_(); // dl_free_compactor_state(&cs)
+			return FAILED;
+		}
+		wait = FALSE;
+		xt_ht_lock(self, db->db_tables);
+		pushr_(xt_ht_unlock, db->db_tables);
+		xt_enum_tables_init(&edx);
+		while ((tab_ptr = xt_enum_tables_next(self, db, &edx))) {
+			if (tab_ptr->te_table && tab_ptr->te_table->tab_co_op_seq > tab_ptr->te_table->tab_head_op_seq) {
+				wait = TRUE;
+				break;
+			}
+		}
+		freer_(); // xt_ht_unlock(db->db_tables)
+		
+		if (!wait)
+			break;
+
+		/* Nobody will wake me, so check again shortly! */
+		dl_co_wait(self, db, 1);		
+	}
+
+	db->db_datalogs.dls_set_log_state(data_log, XT_DL_COMPACTED);
+
+#ifdef DEBUG_LOG_DELETE
+	printf("-- MARK FOR DELETE IN LOG: %d\n", (int) data_log->dlf_log_id);
+#endif
+	/* Log that this log should be deleted on the next checkpoint: */
+	// transaction log...
+	XTXactNewLogEntryDRec	log_rec;
+	log_rec.xl_status_1 = XT_LOG_ENT_DEL_LOG;
+	log_rec.xl_checksum_1 = XT_CHECKSUM_1(data_log->dlf_log_id);
+	XT_SET_DISK_4(log_rec.xl_log_id_4, data_log->dlf_log_id);
+	if (!xt_xlog_log_data(self, sizeof(XTXactNewLogEntryDRec), (XTXactLogBufferDPtr) &log_rec, XT_XLOG_WRITE_AND_FLUSH)) {
+		db->db_datalogs.dls_set_log_state(data_log, XT_DL_TO_COMPACT);
+		xt_throw(self);
+	}
+
+	freer_(); // dl_free_compactor_state(&cs)
+	return OK;
+}
+/*
+ * Resource-stack callback (pushed by dl_co_main() around the call to
+ * dl_collect_garbage()) that clears the compactor busy flag again.
+ */
+static void dl_co_not_busy(XTThreadPtr XT_UNUSED(self), XTDatabaseHPtr db)
+{
+	db->db_co_busy = FALSE;
+}
+
+/*
+ * Main loop of the compactor. Repeatedly takes the first log ID from
+ * the to-compact list and collects the garbage of that log, until the
+ * list is empty. Then, unless 'once_off' is set, waits (up to 120
+ * seconds at a time) to be signalled and starts over. The loop ends
+ * when the thread is asked to quit.
+ *
+ * An ID in the list for which no data log is found is removed from the
+ * list.
+ */
+static void dl_co_main(XTThreadPtr self, xtBool once_off)
+{
+	XTDatabaseHPtr		db = self->st_database;
+	XTDataLogFilePtr	data_log = NULL;
+	xtLogID				*first_id_ptr;
+	xtLogID				log_id;
+
+	xt_set_low_priority(self);
+
+	while (!self->t_quit) {
+		/* Work through the list of logs to be compacted: */
+		for (;;) {
+			if (self->t_quit)
+				break;
+
+			xt_lock_mutex_ns(&db->db_datalogs.dlc_lock);
+			first_id_ptr = (xtLogID *) xt_sl_first_item(db->db_datalogs.dlc_to_compact);
+			log_id = first_id_ptr ? *first_id_ptr : 0;
+			xt_unlock_mutex_ns(&db->db_datalogs.dlc_lock);
+
+			/* The list is empty: */
+			if (!log_id)
+				break;
+
+			if (!db->db_datalogs.dlc_get_data_log(&data_log, log_id, FALSE, NULL))
+				xt_throw(self);
+			ASSERT(data_log);
+
+			if (!data_log) {
+				/* Unknown log, take the ID off the list: */
+				xt_lock_mutex_ns(&db->db_datalogs.dlc_lock);
+				xt_sl_delete(self, db->db_datalogs.dlc_to_compact, &log_id);
+				xt_unlock_mutex_ns(&db->db_datalogs.dlc_lock);
+				continue;
+			}
+
+			db->db_co_busy = TRUE;
+			pushr_(dl_co_not_busy, db);
+			dl_collect_garbage(self, db, data_log);
+			freer_(); // dl_co_not_busy(db)
+		}
+
+		if (once_off)
+			break;
+
+		/* Wait for a signal that a data log can be collected: */
+		dl_co_wait(self, db, 120);
+	}
+}
+/*
+ * Entry point of the compactor thread. Until the thread is asked to
+ * quit, it repeatedly: takes the database into use, runs dl_co_main()
+ * and, when an exception is caught, logs and clears it (unless it is
+ * the SIGTERM signal exception). After each round it pauses (10
+ * seconds in DEBUG builds, otherwise 2 minutes, checked once a second
+ * against t_quit) before trying again.
+ *
+ * Always returns NULL.
+ */
+static void *dl_run_co_thread(XTThreadPtr self)
+{
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) self->t_data;
+	int				count;
+	void			*mysql_thread;
+
+	if (!(mysql_thread = myxt_create_thread()))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		try_(a) {
+			/*
+			 * The garbage collector requires that the database
+			 * is in use.
+			 */
+			xt_use_database(self, db, XT_FOR_COMPACTOR);
+
+			/* This action is both safe and required:
+			 *
+			 * safe: releasing the database is safe because as
+			 * long as this thread is running the database
+			 * reference is valid, and this reference cannot
+			 * be the only one to the database because
+			 * otherwise this thread would not be running.
+			 *
+			 * required: releasing the database is necessary
+			 * otherwise we cannot close the database
+			 * correctly because we only shutdown this
+			 * thread when the database is closed and we
+			 * only close the database when all references
+			 * are removed.
+			 */
+			xt_heap_release(self, self->st_database);
+
+			dl_co_main(self, FALSE);
+		}
+		catch_(a) {
+			if (!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* Avoid releasing the database (done above) */
+		self->st_database = NULL;
+		xt_unuse_database(self, self);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds */
+#ifdef DEBUG
+		count = 10;
+#else
+		count = 2*60;
+#endif
+		while (!self->t_quit && count > 0) {
+			sleep(1);
+			count--;
+		}
+	}
+
+   /*
+	* {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	*/
+	return NULL;
+}
+/*
+ * Thread-data free callback, registered for the compactor thread by
+ * xt_start_compactor(). 'data' is the database; its pointer to the
+ * compactor thread is cleared while holding dlc_lock.
+ */
+static void dl_free_co_thread(XTThreadPtr self, void *data)
+{
+	XTDatabaseHPtr db = (XTDatabaseHPtr) data;
+
+	if (db->db_co_thread) {
+		xt_lock_mutex(self, &db->db_datalogs.dlc_lock);
+		pushr_(xt_unlock_mutex, &db->db_datalogs.dlc_lock);
+		db->db_co_thread = NULL;
+		freer_(); // xt_unlock_mutex(&db->db_datalogs.dlc_lock)
+	}
+}
+
+/*
+ * Create and start the compactor (garbage collector) thread of a
+ * database. The thread is a daemon named "GC-" followed by the last
+ * directory of the database path; the database is attached to it as
+ * thread data, with dl_free_co_thread() as the free callback.
+ */
+xtPublic void xt_start_compactor(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char name[PATH_MAX];
+
+	/* Build the name with the size-limited string functions: the
+	 * prefix plus a path component of up to PATH_MAX characters would
+	 * not fit into the buffer with an unbounded sprintf().
+	 */
+	xt_strcpy(PATH_MAX, name, "GC-");
+	xt_strcat(PATH_MAX, name, xt_last_directory_of_path(db->db_main_path));
+	xt_remove_dir_char(name);
+	db->db_co_thread = xt_create_daemon(self, name);
+	xt_set_thread_data(db->db_co_thread, db, dl_free_co_thread);
+	xt_run_thread(self, db->db_co_thread, dl_run_co_thread);
+}
+/*
+ * Signal the compactor condition (dlc_cond), which the compactor thread
+ * waits on in dl_co_wait(). A failure to signal is logged and cleared.
+ */
+static void dl_wake_co_thread(XTDatabaseHPtr db)
+{
+	if (!xt_signal_cond(NULL, &db->db_datalogs.dlc_cond))
+		xt_log_and_clear_exception_ns();
+}
+/*
+ * Stop the compactor thread of a database, if it is running: under
+ * dlc_lock the thread is told to terminate and woken up; the lock is
+ * then released and the caller waits for the thread (by ID) to exit,
+ * after which db_co_thread is cleared.
+ *
+ * NOTE(review): the comment below speaks of "the transaction lock",
+ * while the lock actually held here is dlc_lock - confirm which is
+ * meant.
+ */
+xtPublic void xt_stop_compactor(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTThreadPtr thr_co;
+
+	if (db->db_co_thread) {
+		xt_lock_mutex(self, &db->db_datalogs.dlc_lock);
+		pushr_(xt_unlock_mutex, &db->db_datalogs.dlc_lock);
+
+		/* This pointer is safe as long as you have the transaction lock. */
+		if ((thr_co = db->db_co_thread)) {
+			xtThreadID tid = thr_co->t_id;
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_co);
+
+			dl_wake_co_thread(db);
+	
+			freer_(); // xt_unlock_mutex(&db->db_datalogs.dlc_lock)
+
+			/*
+			 * This seems to kill the whole server sometimes!!
+			 * SIGTERM is going to a different thread??!
+			xt_kill_thread(thread);
+			 */
+			xt_wait_for_thread(tid, FALSE);
+	
+			/* PMC - This should not be necessary to set the signal here, but in the
+			 * debugger the handler is not called!!?
+			thr_co->t_delayed_signal = SIGTERM;
+			xt_kill_thread(thread);
+			 */
+			db->db_co_thread = NULL;
+		}
+		else
+			freer_(); // xt_unlock_mutex(&db->db_datalogs.dlc_lock)
+	}
+}
+
diff --git a/storage/pbxt/src/datalog_xt.h b/storage/pbxt/src/datalog_xt.h
new file mode 100644
index 00000000000..2eeba7bfab4
--- /dev/null
+++ b/storage/pbxt/src/datalog_xt.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-24	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_datalog_h__
+#define __xt_datalog_h__
+
+#include "pthread_xt.h"
+#include "filesys_xt.h"
+#include "sortedlist_xt.h"
+#include "xactlog_xt.h"
+#include "util_xt.h"
+
+struct XTThread;
+struct XTDatabase;
+struct xXTDataLog;
+struct XTTable;
+struct XTOpenTable;
+
+#define XT_SET_LOG_REF(d, l, o)			do { XT_SET_DISK_2((d)->re_log_id_2, l); \
+											 XT_SET_DISK_6((d)->re_log_offs_6, o); \
+										} while (0)
+#define XT_GET_LOG_REF(l, o, s)			do { l = XT_GET_DISK_2((s)->re_log_id_2); \
+											 o = XT_GET_DISK_6((s)->re_log_offs_6); \
+										} while (0)
+
+#ifdef DEBUG
+//#define USE_DEBUG_SIZES
+#endif
+
+#ifdef USE_DEBUG_SIZES
+#define XT_DL_MAX_LOG_ID				500
+#define XT_DL_LOG_POOL_SIZE				10
+#define XT_DL_HASH_TABLE_SIZE			5
+#define XT_DL_SEGMENT_SHIFTS			1
+#else
+#define XT_DL_MAX_LOG_ID				0x7FFF
+#define XT_DL_LOG_POOL_SIZE				1000
+#define XT_DL_HASH_TABLE_SIZE			10000
+#define XT_DL_SEGMENT_SHIFTS			3
+#endif
+
+#define XT_DL_SEG_HASH_TABLE_SIZE		(XT_DL_HASH_TABLE_SIZE / XT_DL_NO_OF_SEGMENTS)
+#define XT_DL_NO_OF_SEGMENTS			(1 << XT_DL_SEGMENT_SHIFTS)
+#define XT_DL_SEGMENT_MASK				(XT_DL_NO_OF_SEGMENTS - 1)
+
+typedef struct XTOpenLogFile {
+	xtLogID					olf_log_id;						/* The ID of the log (note the olf_ prefix; the other fields use odl_). */
+	XTOpenFilePtr			odl_log_file;					/* The open file handle. */
+	struct XTDataLogFile	*odl_data_log;					/* Presumably the data log this open file belongs to -- confirm. */
+
+	xtBool					odl_in_use;
+	struct XTOpenLogFile	*odl_next_free;					/* Pointer to the next on the free list. */
+	struct XTOpenLogFile	*odl_prev_free;					/* Pointer to the previous on the free list. */
+
+	xtWord4					odl_ru_time;					/* If this is in the top 1/4 don't change position in MRU list. */
+	struct XTOpenLogFile	*odl_mr_used;					/* More recently used open log file. */
+	struct XTOpenLogFile	*odl_lr_used;					/* Less recently used open log file. */
+} XTOpenLogFileRec, *XTOpenLogFilePtr;
+
+#define XT_DL_MAY_COMPACT	-1								/* This is an indication to set the state to XT_DL_TO_COMPACT. */
+#define XT_DL_UNKNOWN		0
+#define XT_DL_HAS_SPACE		1								/* The log is not yet full, and can be used for writing. */
+#define XT_DL_READ_ONLY		2								/* The log is full, and can only be read now. */
+#define XT_DL_TO_COMPACT	3								/* The log has too much garbage, and must be compacted. */
+#define XT_DL_COMPACTED		4								/* The state after compaction. */
+#define XT_DL_TO_DELETE		5								/* All references to this log have been removed, and it is to be deleted. */
+#define XT_DL_DELETED		6								/* After deletion, logs are locked until the next checkpoint. */
+#define XT_DL_EXCLUSIVE		7								/* The log is locked and being written by a thread. */
+
+typedef struct XTDataLogFile {
+	xtLogID					dlf_log_id;						/* The ID of the data log. */
+	int						dlf_state;						/* Presumably one of the XT_DL_* state values -- confirm. */
+	struct XTDataLogFile	*dlf_next_hash;					/* Pointer to the next on the hash list. */
+	u_int					dlf_open_count;					/* Number of open log files. */
+	XTOpenLogFilePtr		dlf_free_list;					/* The open file free list. */
+	off_t					dlf_log_eof;
+	off_t					dlf_start_offset;				/* Start offset for garbage collection. */
+	off_t					dlf_garbage_count;				/* The amount of garbage in the log file. */
+	XTOpenFilePtr			dlf_log_file;					/* The open file handle (if the log is in exclusive use!!). */
+
+	off_t					dlf_space_avaliable();			/* (sic) The misspelling is part of the method name. */
+	xtBool					dlf_to_much_garbage();			/* (sic) The misspelling is part of the method name. */
+} XTDataLogFileRec, *XTDataLogFilePtr;
+
+typedef struct XTDataLogSeg {
+	xt_mutex_type			dls_lock;						/* The cache segment lock. */
+	xt_cond_type			dls_cond;
+	XTDataLogFilePtr		dls_hash_table[XT_DL_SEG_HASH_TABLE_SIZE];	/* This segment's share of the hash table (XT_DL_HASH_TABLE_SIZE / XT_DL_NO_OF_SEGMENTS entries). */
+} XTDataLogSegRec, *XTDataLogSegPtr;
+
+typedef struct XTDataLogCache {
+	struct XTDatabase		*dlc_db;						/* Presumably the owning database (cf. dlc_init()) -- confirm. */
+
+	xt_mutex_type			dlc_lock;						/* The public cache lock. */
+	xt_cond_type			dlc_cond;						/* The public cache wait condition. */
+	XTSortedListPtr			dlc_has_space;					/* List of logs with space for more data. */
+	XTSortedListPtr			dlc_to_compact;					/* List of logs to be compacted. */
+	XTSortedListPtr			dlc_to_delete;					/* List of logs to be deleted at next checkpoint. */
+	XTSortedListPtr			dlc_deleted;					/* List of logs deleted at the previous checkpoint. */
+	XTDataLogSegRec			dlc_segment[XT_DL_NO_OF_SEGMENTS];	/* The segments, each with its own dls_lock and hash table. */
+	xtLogID					dlc_next_log_id;				/* The next log ID to be used to create a new log. */
+
+	xt_mutex_type			dlc_mru_lock;					/* The lock for the LRU list. */
+	xtWord4					dlc_ru_now;
+	XTOpenLogFilePtr		dlc_lru_open_log;				/* Presumably the least recently used end of the list -- confirm. */
+	XTOpenLogFilePtr		dlc_mru_open_log;				/* Presumably the most recently used end of the list -- confirm. */
+	u_int					dlc_open_count;					/* The total open file count. */
+
+	xt_mutex_type			dlc_head_lock;					/* The lock for changing the header of shared logs. */
+	/* Note: the first three methods carry a dls_ prefix although they are declared on the cache. */
+	void					dls_remove_log(XTDataLogFilePtr data_log);
+	int						dls_get_log_state(XTDataLogFilePtr data_log);
+	xtBool					dls_set_log_state(XTDataLogFilePtr data_log, int state);
+	void					dlc_init(struct XTThread *self, struct XTDatabase *db);
+	void					dlc_exit(struct XTThread *self);
+	void					dlc_name(size_t size, char *path, xtLogID log_id);
+	xtBool					dlc_open_log(XTOpenFilePtr *fh, xtLogID log_id, int mode);
+	xtBool					dlc_unlock_log(XTDataLogFilePtr data_log);
+	XTDataLogFilePtr		dlc_get_log_for_writing(off_t space_required, struct XTThread *thread);
+	xtBool					dlc_get_data_log(XTDataLogFilePtr *data_log, xtLogID log_id, xtBool create, XTDataLogSegPtr *ret_seg);
+	xtBool					dlc_remove_data_log(xtLogID log_id, xtBool just_close);
+	xtBool					dlc_get_open_log(XTOpenLogFilePtr *open_log, xtLogID log_id);
+	void					dlc_release_open_log(XTOpenLogFilePtr open_log);
+} XTDataLogCacheRec, *XTDataLogCachePtr;
+
+/* The data log buffer, used by a thread to write a
+ * data log file.
+ */
+typedef struct XTDataLogBuffer {
+	struct XTDatabase		*dlb_db;
+	XTDataLogFilePtr		dlb_data_log;						/* The data log file. */
+	
+	xtLogOffset				dlb_buffer_offset;					/* The offset into the log file. */
+	size_t					dlb_buffer_size;					/* The size of the buffer. */
+	size_t					dlb_buffer_len;						/* The amount of data in the buffer. */
+	xtWord1					*dlb_log_buffer;					/* Presumably the buffer described by the size/len fields above -- confirm. */
+	xtBool					dlb_flush_required;
+#ifdef DEBUG
+	off_t					dlb_max_write_offset;				/* Only present in DEBUG builds. */
+#endif
+
+	void					dlb_init(struct XTDatabase *db, size_t buffer_size);
+	void					dlb_exit(struct XTThread *self);
+	xtBool					dlb_close_log(struct XTThread *thread);
+	xtBool					dlb_get_log_offset(xtLogID *log_id, off_t *out_offset, size_t req_size, struct XTThread *thread);
+	xtBool					dlb_flush_log(xtBool commit, struct XTThread *thread);
+	xtBool					dlb_write_thru_log(xtLogID log_id, xtLogOffset log_offset, size_t size, xtWord1 *data, struct XTThread *thread);
+	xtBool					dlb_append_log(xtLogID log_id, off_t out_offset, size_t size, xtWord1 *data, struct XTThread *thread);
+	xtBool					dlb_read_log(xtLogID log_id, off_t offset, size_t size, xtWord1 *data, struct XTThread *thread);
+	xtBool					dlb_delete_log(xtLogID log_id, off_t offset, size_t size, xtTableID tab_id, xtRecordID tab_offset, struct XTThread *thread);
+} XTDataLogBufferRec, *XTDataLogBufferPtr;
+
+typedef struct XTSeqLogRead {
+	struct XTDatabase		*sl_db;							/* Set by the base sl_seq_init(). */
+	/* Every virtual below has a default body that ignores its arguments and returns OK, NULL or nothing. */
+	virtual					~XTSeqLogRead() { }
+	virtual xtBool			sl_seq_init(struct XTDatabase *db, size_t buffer_size) { (void) buffer_size; sl_db = db; return OK; };	/* Base version: stores db, ignores buffer_size. */
+	virtual void			sl_seq_exit() { };
+	virtual XTOpenFilePtr	sl_seq_open_file() { return NULL; };	/* Base version has no file: returns NULL. */
+	virtual void			sl_seq_pos(xtLogID *log_id, xtLogOffset *log_offset) { (void) log_id; (void) log_offset; };	/* Base version leaves both outputs unwritten. */
+	virtual xtBool			sl_seq_start(xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok) {
+		(void) log_id; (void) log_offset; (void) missing_ok; return OK; 
+	};
+	virtual xtBool			sl_rnd_read(xtLogOffset log_offset, size_t size, xtWord1 *data, size_t *read, struct XTThread *thread) {
+		(void) log_offset; (void) size; (void) data; (void) read; (void) thread; return OK;	/* Note: *read is not set by the base version. */
+	};
+	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, struct XTThread *thread) {
+		(void) entry; (void) thread; return OK;	/* Note: *entry is not set by the base version. */
+	};
+	virtual void			sl_seq_skip(size_t size) { (void) size; }
+} XTSeqLogReadRec, *XTSeqLogReadPtr;
+
+typedef struct XTDataSeqRead : public XTSeqLogRead {
+	XTOpenFilePtr			sl_log_file;
+	xtLogID					sl_rec_log_id;		/* The current record log ID. */
+	xtLogOffset				sl_rec_log_offset;	/* The current log read position. */
+	size_t					sl_record_len;		/* The length of the current record. */
+	xtLogOffset				sl_log_eof;
+	xtLogOffset				sl_extra_garbage;	/* Garbage found during a scan. */
+
+	size_t					sl_buffer_size;		/* Size of the buffer. */
+	xtLogOffset				sl_buf_log_offset;	/* File offset of the buffer. */
+	size_t					sl_buffer_len;		/* Amount of data in the buffer. */
+	xtWord1					*sl_buffer;
+	/* Overrides of the XTSeqLogRead virtuals; only declared here, the bodies are not part of this struct definition. */
+	virtual					~XTDataSeqRead() { }
+	virtual xtBool			sl_seq_init(struct XTDatabase *db, size_t buffer_size);
+	virtual void			sl_seq_exit();
+	virtual XTOpenFilePtr	sl_seq_open_file();
+	virtual void			sl_seq_pos(xtLogID *log_id, xtLogOffset *log_offset);
+	virtual xtBool			sl_seq_start(xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok);
+	virtual xtBool			sl_rnd_read(xtLogOffset log_offset, size_t size, xtWord1 *data, size_t *read, struct XTThread *thread);
+	virtual xtBool			sl_seq_next(XTXactLogBufferDPtr *entry, struct XTThread *thread);
+	virtual void			sl_seq_skip(size_t size);
+	virtual void			sl_seq_skip_to(off_t offset);	/* Not declared in the XTSeqLogRead base. */
+} XTDataSeqReadRec, *XTDataSeqReadPtr;
+
+void	xt_dl_delete_ext_data(struct XTThread *self, struct XTTable *tab, xtBool missing_ok, xtBool have_table_lock);
+
+void	xt_start_compactor(struct XTThread *self, struct XTDatabase *db);
+void	xt_stop_compactor(struct XTThread *self, struct XTDatabase *db);
+
+void	xt_dl_init_db(struct XTThread *self, struct XTDatabase *db);
+void	xt_dl_exit_db(struct XTThread *self, struct XTDatabase *db);
+void	xt_dl_set_to_delete(struct XTThread *self, struct XTDatabase *db, xtLogID log_id);
+void	xt_dl_log_status(struct XTThread *self, struct XTDatabase *db, XTStringBufferPtr strbuf);
+void	xt_dl_delete_logs(struct XTThread *self, struct XTDatabase *db);
+
+#endif
+
diff --git a/storage/pbxt/src/discover_xt.cc b/storage/pbxt/src/discover_xt.cc
new file mode 100644
index 00000000000..7f7281d8c30
--- /dev/null
+++ b/storage/pbxt/src/discover_xt.cc
@@ -0,0 +1,1682 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ * Derived from code Copyright (C) 2000-2004 MySQL AB
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *  Created by Leslie on 8/27/08.
+ *
+ */
+
+#include "xt_config.h"
+
+#ifndef DRIZZLED
+#include "mysql_priv.h"
+#include "item_create.h"
+#include <m_ctype.h>
+#else
+#include <drizzled/session.h>
+#include <drizzled/server_includes.h>
+#include <drizzled/sql_base.h>
+#include <drizzled/statement/alter_table.h>
+#include <algorithm>
+#include <sstream>
+#endif
+
+#include "strutil_xt.h"
+#include "ha_pbxt.h"
+#include "discover_xt.h"
+#include "ha_xtsys.h"
+
+#ifndef DRIZZLED
+#if MYSQL_VERSION_ID >= 50404
+#define DOT_STR(x)			x.str
+#else
+#define DOT_STR(x)			x
+#endif
+#endif
+
+//#ifndef DRIZZLED
+#define LOCK_OPEN_HACK_REQUIRED
+//#endif // DRIZZLED
+
+#ifdef LOCK_OPEN_HACK_REQUIRED
+#ifdef DRIZZLED
+
+using namespace drizzled;
+using namespace std;
+
+#define mysql_create_table_no_lock hacked_mysql_create_table_no_lock
+
+namespace drizzled {
+
+int rea_create_table(Session *session, const char *path,
+                     const char *db, const char *table_name,
+                     message::Table *table_proto,
+                     HA_CREATE_INFO *create_info,
+                     List<CreateField> &create_field,
+                     uint32_t key_count,KEY *key_info);
+}
+
+static uint32_t build_tmptable_filename(Session* session,
+                                        char *buff, size_t bufflen)
+{
+  /* Part after the tmpdir: "/", TMP_FILE_PREFIX, current_pid, thread id, counter. */
+  ostringstream suffix_str;
+  suffix_str << "/" << TMP_FILE_PREFIX << current_pid;
+  suffix_str << session->thread_id << session->tmp_table++;
+
+  /* Only this suffix is lower-cased, not the tmpdir itself. */
+  string suffix= suffix_str.str();
+  transform(suffix.begin(), suffix.end(), suffix.begin(), ::tolower);
+
+  ostringstream path_str;
+  path_str << drizzle_tmpdir << suffix;
+  const string full_path= path_str.str();
+
+  /* A path longer than bufflen is reported as length 0; buff is not written. */
+  if (bufflen < full_path.length())
+    return 0;
+
+  /* Otherwise the result of unpack_filename() is returned as the length. */
+  return unpack_filename(buff, full_path.c_str());
+}
+
+/* Drizzle variant of mysql_create_table_no_lock() (renamed by the #define
+   above to hacked_mysql_create_table_no_lock). Returns false on success, or
+   when the table already exists and IF NOT EXISTS was given; true on error. */
+static bool mysql_create_table_no_lock(Session *session,
+                                const char *db, const char *table_name,
+                                HA_CREATE_INFO *create_info,
+                                message::Table *table_proto,
+                                AlterInfo *alter_info,
+                                bool internal_tmp_table,
+                                uint32_t select_field_count)
+{
+  char          path[FN_REFLEN];
+  uint32_t          path_length;
+  uint          db_options, key_count;
+  KEY           *key_info_buffer;
+  Cursor        *file;
+  bool          error= true;
+  /* Check for duplicate fields and check type of table to create */
+  if (!alter_info->create_list.elements)
+  {
+    my_message(ER_TABLE_MUST_HAVE_COLUMNS, ER(ER_TABLE_MUST_HAVE_COLUMNS),
+               MYF(0));
+    return true;
+  }
+  assert(strcmp(table_name,table_proto->name().c_str())==0);
+  if (check_engine(session, table_name, create_info))
+    return true;
+  db_options= create_info->table_options;
+  if (create_info->row_type == ROW_TYPE_DYNAMIC)
+    db_options|=HA_OPTION_PACK_RECORD;
+  
+  /*if (!(file= create_info->db_type->getCursor((TableShare*) 0, session->mem_root)))
+  {
+    my_error(ER_OUTOFMEMORY, MYF(0), sizeof(Cursor));
+    return true;
+  }*/
+
+  /* PMC - Done to avoid getting the partition handler by mistake! */
+  if (!(file= new (session->mem_root) ha_xtsys(pbxt_hton, NULL)))
+  {
+    my_error(ER_OUTOFMEMORY, MYF(0), sizeof(Cursor));
+    return true;
+  }
+
+  set_table_default_charset(create_info, (char*) db);
+
+  if (mysql_prepare_create_table(session, 
+                                 create_info,
+                                 table_proto,
+                                 alter_info,
+                                 internal_tmp_table,
+                                 &db_options, file,
+                                 &key_info_buffer, &key_count,
+                                 select_field_count))
+    goto err;
+
+  /* Build the path of the table (temporary or regular) */
+  if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
+  {
+    path_length= build_tmptable_filename(session, path, sizeof(path));
+  }
+  else
+  {
+ #ifdef FN_DEVCHAR
+    /* check if the table name contains FN_DEVCHAR when defined */
+    if (strchr(table_name, FN_DEVCHAR))
+    {
+      my_error(ER_WRONG_TABLE_NAME, MYF(0), table_name);
+      goto err;                 /* 'file' is allocated here: leave via the common cleanup */
+    }
+#endif
+    path_length= build_table_filename(path, sizeof(path), db, table_name, internal_tmp_table);
+  }
+
+  /* Check if table already exists */
+  if ((create_info->options & HA_LEX_CREATE_TMP_TABLE) &&
+      session->find_temporary_table(db, table_name))
+  {
+    if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
+    {
+      create_info->table_existed= 1;            // Mark that table existed
+      push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_NOTE,
+                          ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
+                          table_name);
+      error= false;
+      goto err;
+    }
+    my_error(ER_TABLE_EXISTS_ERROR, MYF(0), table_name);
+    goto err;
+  }
+
+  //pthread_mutex_lock(&LOCK_open); /* CREATE TABLE (some confusion on naming, double check) */
+  if (!internal_tmp_table && !(create_info->options & HA_LEX_CREATE_TMP_TABLE))
+  {
+    if (plugin::StorageEngine::getTableDefinition(*session,
+                                                  path, 
+                                                  db,
+                                                  table_name,
+                                                  internal_tmp_table) == EEXIST)
+    {
+      if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
+      {
+        error= false;
+        push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_NOTE,
+                            ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
+                            table_name);
+        create_info->table_existed= 1;          // Mark that table existed
+      }
+      else
+        my_error(ER_TABLE_EXISTS_ERROR,MYF(0),table_name);
+
+      goto unlock_and_end;
+    }
+    /*
+      We don't assert here, but check the result, because the table could be in
+      the table definition cache while the .frm is missing from disk (e.g. after
+      it was deleted manually). The user has to use FLUSH TABLES; to clear the
+      cache. Then she could create the table. This case is pretty obscure and
+      therefore we don't introduce a new error message only for it.
+    */
+    if (TableShare::getShare(db, table_name))
+    {
+      my_error(ER_TABLE_EXISTS_ERROR, MYF(0), table_name);
+      goto unlock_and_end;
+    }
+  }
+  /*
+    Check that a table with the given name does not already exist in any
+    storage engine. In such a case it should be discovered and the error
+    ER_TABLE_EXISTS_ERROR be returned, unless the user specified CREATE TABLE
+    IF NOT EXISTS. The original comment says LOCK_open is held here so that no
+    one else attempts to discover the table (in this copy the lock calls are
+    commented out). Since it's not on disk as a frm file, no one could be using it!
+  */
+  if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE))
+  {
+    bool create_if_not_exists =
+      create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS;
+
+    char table_path[FN_REFLEN];
+    uint32_t          table_path_length;
+
+    table_path_length= build_table_filename(table_path, sizeof(table_path),
+                                            db, table_name, false);
+
+    int retcode= plugin::StorageEngine::getTableDefinition(*session,
+                                                           table_path, 
+                                                           db,
+                                                           table_name,
+                                                           false);
+    switch (retcode)
+    {
+      case ENOENT:
+        /* Normal case, no table exists. we can go and create it */
+        break;
+      case EEXIST:
+        if (create_if_not_exists)
+        {
+          error= false;
+          push_warning_printf(session, DRIZZLE_ERROR::WARN_LEVEL_NOTE,
+                              ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
+                              table_name);
+          create_info->table_existed= 1;                // Mark that table existed
+          goto unlock_and_end;
+        }
+        my_error(ER_TABLE_EXISTS_ERROR,MYF(0),table_name);
+        goto unlock_and_end;
+      default:
+        my_error(retcode, MYF(0),table_name);
+        goto unlock_and_end;
+    }
+  }
+
+  session->set_proc_info("creating table");
+  create_info->table_existed= 0;                // Mark that table is created
+
+  create_info->table_options=db_options;
+
+  if (rea_create_table(session, path, db, table_name,
+                       table_proto,
+                       create_info, alter_info->create_list,
+                       key_count, key_info_buffer))
+    goto unlock_and_end;
+
+  if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
+  {
+    /* Open table and put in temporary table list */
+    if (!(session->open_temporary_table(path, db, table_name, 1, OTM_OPEN)))
+    {
+      (void) session->rm_temporary_table(create_info->db_type, path);
+      goto unlock_and_end;
+    }
+  }
+
+  /*
+    Don't write statement if:
+    - It is an internal temporary table,
+    - Row-based logging is used and we are creating a temporary table, or
+    - The binary log is not open.
+    Otherwise, the statement shall be binlogged.
+  */
+  if (!internal_tmp_table && ((!(create_info->options & HA_LEX_CREATE_TMP_TABLE))))
+    write_bin_log(session, session->query, session->query_length);
+  error= false;
+unlock_and_end:
+  //pthread_mutex_unlock(&LOCK_open);
+
+err:
+  session->set_proc_info("After create");
+  delete file;
+  return(error);
+}
+
+#else // MySQL case
+///////////////////////////////
+/*
+ * Unfortunately I cannot use the standard mysql_create_table_no_lock() because it will lock "LOCK_open"
+ * which has already been locked while the server is performing table discovery. So I have added this hack 
+ * in here to create my own version. The following macros will make the changes I need to get it to work.
+ * The actual function code has been copied here without changes.
+ *
+ * It's almost enough to make you want to cry. :(
+*/
+//-----------------------------
+
+#ifdef pthread_mutex_lock
+#undef pthread_mutex_lock
+#endif
+
+#ifdef pthread_mutex_unlock
+#undef pthread_mutex_unlock
+#endif
+
+#define mysql_create_table_no_lock hacked_mysql_create_table_no_lock
+#define pthread_mutex_lock(l)
+#define pthread_mutex_unlock(l)
+
+#define check_engine(t, n, c) (0)
+#define set_table_default_charset(t, c, d)
+
+void calculate_interval_lengths(CHARSET_INFO *cs, TYPELIB *interval,
+                                uint32 *max_length, uint32 *tot_length);
+
+uint build_tmptable_filename(THD* thd, char *buff, size_t bufflen);
+uint build_table_filename(char *buff, size_t bufflen, const char *db,
+                          const char *table_name, const char *ext, uint flags);
+
+//////////////////////////////////////////////////////////
+////// START OF CUT AND PASTES FROM  sql_table.cc ////////
+//////////////////////////////////////////////////////////
+
+// sort_keys() cut and pasted directly from sql_table.cc. 
+static int sort_keys(KEY *a, KEY *b)
+{
+  ulong a_flags= a->flags, b_flags= b->flags;
+  /* Comparator: a negative result puts a first, a positive result puts b first. */
+  if (a_flags & HA_NOSAME)
+  {
+    if (!(b_flags & HA_NOSAME))
+      return -1;				// Only a has HA_NOSAME: prefer a
+    if ((a_flags ^ b_flags) & HA_NULL_PART_KEY)
+    {
+      /* Sort NOT NULL keys before other keys */
+      return (a_flags & HA_NULL_PART_KEY) ? 1 : -1;
+    }
+    if (a->name == primary_key_name)		// Address comparison, not a string compare
+      return -1;
+    if (b->name == primary_key_name)
+      return 1;
+    /* Sort keys not containing partial segments before others */
+    if ((a_flags ^ b_flags) & HA_KEY_HAS_PART_KEY_SEG)
+      return (a_flags & HA_KEY_HAS_PART_KEY_SEG) ? 1 : -1;
+  }
+  else if (b_flags & HA_NOSAME)
+    return 1;					// Prefer b
+
+  if ((a_flags ^ b_flags) & HA_FULLTEXT)
+  {
+    return (a_flags & HA_FULLTEXT) ? 1 : -1;	// The key with HA_FULLTEXT sorts last
+  }
+  /*
+    Prefer original key order.	usable_key_parts contains here
+    the original key position.
+  */
+  return ((a->usable_key_parts < b->usable_key_parts) ? -1 :
+	  (a->usable_key_parts > b->usable_key_parts) ? 1 :
+	  0);
+}
+
+// check_if_keyname_exists() cut and pasted directly from sql_table.cc. 
+static bool
+check_if_keyname_exists(const char *name, KEY *start, KEY *end)
+{
+  for (KEY *key=start ; key != end ; key++)		// Scans [start, end); end itself is not examined
+    if (!my_strcasecmp(system_charset_info,name,key->name))
+      return 1;						// A key with this name (per my_strcasecmp) exists
+  return 0;						// No key in the range has this name
+}
+
+// make_unique_key_name() cut and pasted directly from sql_table.cc. 
+static char *
+make_unique_key_name(const char *field_name,KEY *start,KEY *end)
+{
+  char buff[MAX_FIELD_NAME],*buff_end;
+
+  if (!check_if_keyname_exists(field_name,start,end) &&
+      my_strcasecmp(system_charset_info,field_name,primary_key_name))
+    return (char*) field_name;			// Use fieldname
+  buff_end=strmake(buff,field_name, sizeof(buff)-4);	// Truncate so that a suffix still fits
+
+  /*
+    Only 3 chars + '\0' left, so need to limit to 2 digit
+    This is ok as we can't have more than 100 keys anyway
+  */
+  for (uint i=2 ; i< 100; i++)			// Tries <name>_2 .. <name>_99
+  {
+    *buff_end= '_';
+    int10_to_str(i, buff_end+1, 10);
+    if (!check_if_keyname_exists(buff,start,end))
+      return sql_strdup(buff);			// Return a copy: buff is a local array
+  }
+  return (char*) "not_specified";		// Should never happen
+}
+
+
+// prepare_blob_field() cut and pasted directly from sql_table.cc. 
+static bool prepare_blob_field(THD *thd, Create_field *sql_field)
+{
+  DBUG_ENTER("prepare_blob_field");
+
+  if (sql_field->length > MAX_FIELD_VARCHARLENGTH &&
+      !(sql_field->flags & BLOB_FLAG))
+  {
+    /* Convert long VARCHAR columns to TEXT or BLOB */
+    char warn_buff[MYSQL_ERRMSG_SIZE];
+
+    if (sql_field->def || (thd->variables.sql_mode & (MODE_STRICT_TRANS_TABLES |
+                                                      MODE_STRICT_ALL_TABLES)))
+    {
+      my_error(ER_TOO_BIG_FIELDLENGTH, MYF(0), sql_field->field_name,
+               MAX_FIELD_VARCHARLENGTH / sql_field->charset->mbmaxlen);
+      DBUG_RETURN(1);	// No conversion when a default is given or a strict mode is set
+    }
+    sql_field->sql_type= MYSQL_TYPE_BLOB;
+    sql_field->flags|= BLOB_FLAG;
+    sprintf(warn_buff, ER(ER_AUTO_CONVERT), sql_field->field_name,
+            (sql_field->charset == &my_charset_bin) ? "VARBINARY" : "VARCHAR",
+            (sql_field->charset == &my_charset_bin) ? "BLOB" : "TEXT");
+    push_warning(thd, MYSQL_ERROR::WARN_LEVEL_NOTE, ER_AUTO_CONVERT,
+                 warn_buff);	// The conversion is reported as a note, not an error
+  }
+    
+  if ((sql_field->flags & BLOB_FLAG) && sql_field->length)
+  {
+    if (sql_field->sql_type == MYSQL_TYPE_BLOB)
+    {
+      /* The user has given a length to the blob column */
+      sql_field->sql_type= get_blob_type_from_length(sql_field->length);
+      sql_field->pack_length= calc_pack_length(sql_field->sql_type, 0);
+    }
+    sql_field->length= 0;	// Length is reset for every BLOB_FLAG column that had one
+  }
+  DBUG_RETURN(0);
+}
+
+//////////////////////////////
+// mysql_prepare_create_table() cut and pasted directly from sql_table.cc.
+static int
+mysql_prepare_create_table(THD *thd, HA_CREATE_INFO *create_info,
+                           Alter_info *alter_info,
+                           bool tmp_table,
+                           uint *db_options,
+                           handler *file, KEY **key_info_buffer,
+                           uint *key_count, int select_field_count)
+{
+  const char	*key_name;
+  Create_field	*sql_field,*dup_field;
+  uint		field,null_fields,blob_columns,max_key_length;
+  ulong		record_offset= 0;
+  KEY		*key_info;
+  KEY_PART_INFO *key_part_info;
+  int		timestamps= 0, timestamps_with_niladic= 0;
+  int		field_no,dup_no;
+  int		select_field_pos,auto_increment=0;
+  List_iterator<Create_field> it(alter_info->create_list);
+  List_iterator<Create_field> it2(alter_info->create_list);
+  uint total_uneven_bit_length= 0;
+  DBUG_ENTER("mysql_prepare_create_table");
+
+  select_field_pos= alter_info->create_list.elements - select_field_count;
+  null_fields=blob_columns=0;
+  create_info->varchar= 0;
+  max_key_length= file->max_key_length();
+
+  for (field_no=0; (sql_field=it++) ; field_no++)
+  {
+    CHARSET_INFO *save_cs;
+
+    /*
+      Initialize length from its original value (number of characters),
+      which was set in the parser. This is necessary if we're
+      executing a prepared statement for the second time.
+    */
+    sql_field->length= sql_field->char_length;
+    if (!sql_field->charset)
+      sql_field->charset= create_info->default_table_charset;
+    /*
+      table_charset is set in ALTER TABLE if we want change character set
+      for all varchar/char columns.
+      But the table charset must not affect the BLOB fields, so don't
+      allow to change my_charset_bin to somethig else.
+    */
+    if (create_info->table_charset && sql_field->charset != &my_charset_bin)
+      sql_field->charset= create_info->table_charset;
+
+    save_cs= sql_field->charset;
+    if ((sql_field->flags & BINCMP_FLAG) &&
+	!(sql_field->charset= get_charset_by_csname(sql_field->charset->csname,
+						    MY_CS_BINSORT,MYF(0))))
+    {
+      char tmp[64];
+      strmake(strmake(tmp, save_cs->csname, sizeof(tmp)-4),
+              STRING_WITH_LEN("_bin"));
+      my_error(ER_UNKNOWN_COLLATION, MYF(0), tmp);
+      DBUG_RETURN(TRUE);
+    }
+
+    /*
+      Convert the default value from client character
+      set into the column character set if necessary.
+    */
+    if (sql_field->def && 
+        save_cs != sql_field->def->collation.collation &&
+        (sql_field->sql_type == MYSQL_TYPE_VAR_STRING ||
+         sql_field->sql_type == MYSQL_TYPE_STRING ||
+         sql_field->sql_type == MYSQL_TYPE_SET ||
+         sql_field->sql_type == MYSQL_TYPE_ENUM))
+    {
+      /*
+        Starting from 5.1 we work here with a copy of Create_field
+        created by the caller, not with the instance that was
+        originally created during parsing. It's OK to create
+        a temporary item and initialize with it a member of the
+        copy -- this item will be thrown away along with the copy
+        at the end of execution, and thus not introduce a dangling
+        pointer in the parsed tree of a prepared statement or a
+        stored procedure statement.
+      */
+      sql_field->def= sql_field->def->safe_charset_converter(save_cs);
+
+      if (sql_field->def == NULL)
+      {
+        /* Could not convert */
+        my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+        DBUG_RETURN(TRUE);
+      }
+    }
+
+    if (sql_field->sql_type == MYSQL_TYPE_SET ||
+        sql_field->sql_type == MYSQL_TYPE_ENUM)
+    {
+      uint32 dummy;
+      CHARSET_INFO *cs= sql_field->charset;
+      TYPELIB *interval= sql_field->interval;
+
+      /*
+        Create typelib from interval_list, and if necessary
+        convert strings from client character set to the
+        column character set.
+      */
+      if (!interval)
+      {
+        /*
+          Create the typelib in runtime memory - we will free the
+          occupied memory at the same time when we free this
+          sql_field -- at the end of execution.
+        */
+        interval= sql_field->interval= typelib(thd->mem_root,
+                                               sql_field->interval_list);
+        List_iterator<String> int_it(sql_field->interval_list);
+        String conv, *tmp;
+        char comma_buf[2];
+        int comma_length= cs->cset->wc_mb(cs, ',', (uchar*) comma_buf,
+                                          (uchar*) comma_buf + 
+                                          sizeof(comma_buf));
+        DBUG_ASSERT(comma_length > 0);
+        for (uint i= 0; (tmp= int_it++); i++)
+        {
+          uint lengthsp;
+          if (String::needs_conversion(tmp->length(), tmp->charset(),
+                                       cs, &dummy))
+          {
+            uint cnv_errs;
+            conv.copy(tmp->ptr(), tmp->length(), tmp->charset(), cs, &cnv_errs);
+            interval->type_names[i]= strmake_root(thd->mem_root, conv.ptr(),
+                                                  conv.length());
+            interval->type_lengths[i]= conv.length();
+          }
+
+          // Strip trailing spaces.
+          lengthsp= cs->cset->lengthsp(cs, interval->type_names[i],
+                                       interval->type_lengths[i]);
+          interval->type_lengths[i]= lengthsp;
+          ((uchar *)interval->type_names[i])[lengthsp]= '\0';
+          if (sql_field->sql_type == MYSQL_TYPE_SET)
+          {
+            if (cs->coll->instr(cs, interval->type_names[i], 
+                                interval->type_lengths[i], 
+                                comma_buf, comma_length, NULL, 0))
+            {
+              my_error(ER_ILLEGAL_VALUE_FOR_TYPE, MYF(0), "set", tmp->ptr());
+              DBUG_RETURN(TRUE);
+            }
+          }
+        }
+        sql_field->interval_list.empty(); // Don't need interval_list anymore
+      }
+
+      if (sql_field->sql_type == MYSQL_TYPE_SET)
+      {
+        uint32 field_length;
+        if (sql_field->def != NULL)
+        {
+          char *not_used;
+          uint not_used2;
+          bool not_found= 0;
+          String str, *def= sql_field->def->val_str(&str);
+          if (def == NULL) /* SQL "NULL" maps to NULL */
+          {
+            if ((sql_field->flags & NOT_NULL_FLAG) != 0)
+            {
+              my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+              DBUG_RETURN(TRUE);
+            }
+
+            /* else, NULL is an allowed value */
+            (void) find_set(interval, NULL, 0,
+                            cs, &not_used, &not_used2, &not_found);
+          }
+          else /* not NULL */
+          {
+            (void) find_set(interval, def->ptr(), def->length(),
+                            cs, &not_used, &not_used2, &not_found);
+          }
+
+          if (not_found)
+          {
+            my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+            DBUG_RETURN(TRUE);
+          }
+        }
+        calculate_interval_lengths(cs, interval, &dummy, &field_length);
+        sql_field->length= field_length + (interval->count - 1);
+      }
+      else  /* MYSQL_TYPE_ENUM */
+      {
+        uint32 field_length;
+        DBUG_ASSERT(sql_field->sql_type == MYSQL_TYPE_ENUM);
+        if (sql_field->def != NULL)
+        {
+          String str, *def= sql_field->def->val_str(&str);
+          if (def == NULL) /* SQL "NULL" maps to NULL */
+          {
+            if ((sql_field->flags & NOT_NULL_FLAG) != 0)
+            {
+              my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+              DBUG_RETURN(TRUE);
+            }
+
+            /* else, the defaults yield the correct length for NULLs. */
+          } 
+          else /* not NULL */
+          {
+            def->length(cs->cset->lengthsp(cs, def->ptr(), def->length()));
+            if (find_type2(interval, def->ptr(), def->length(), cs) == 0) /* not found */
+            {
+              my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+              DBUG_RETURN(TRUE);
+            }
+          }
+        }
+        calculate_interval_lengths(cs, interval, &field_length, &dummy);
+        sql_field->length= field_length;
+      }
+      set_if_smaller(sql_field->length, MAX_FIELD_WIDTH-1);
+    }
+
+    if (sql_field->sql_type == MYSQL_TYPE_BIT)
+    { 
+      sql_field->pack_flag= FIELDFLAG_NUMBER;
+      if (file->ha_table_flags() & HA_CAN_BIT_FIELD)
+        total_uneven_bit_length+= sql_field->length & 7;
+      else
+        sql_field->pack_flag|= FIELDFLAG_TREAT_BIT_AS_CHAR;
+    }
+
+    sql_field->create_length_to_internal_length();
+    if (prepare_blob_field(thd, sql_field))
+      DBUG_RETURN(TRUE);
+
+    if (!(sql_field->flags & NOT_NULL_FLAG))
+      null_fields++;
+
+    if (check_column_name(sql_field->field_name))
+    {
+      my_error(ER_WRONG_COLUMN_NAME, MYF(0), sql_field->field_name);
+      DBUG_RETURN(TRUE);
+    }
+
+    /* Check if we have used the same field name before */
+    for (dup_no=0; (dup_field=it2++) != sql_field; dup_no++)
+    {
+      if (my_strcasecmp(system_charset_info,
+			sql_field->field_name,
+			dup_field->field_name) == 0)
+      {
+	/*
+	  If this was a CREATE ... SELECT statement, accept a field
+	  redefinition if we are changing a field in the SELECT part
+	*/
+	if (field_no < select_field_pos || dup_no >= select_field_pos)
+	{
+	  my_error(ER_DUP_FIELDNAME, MYF(0), sql_field->field_name);
+	  DBUG_RETURN(TRUE);
+	}
+	else
+	{
+	  /* Field redefined */
+	  sql_field->def=		dup_field->def;
+	  sql_field->sql_type=		dup_field->sql_type;
+	  sql_field->charset=		(dup_field->charset ?
+					 dup_field->charset :
+					 create_info->default_table_charset);
+	  sql_field->length=		dup_field->char_length;
+          sql_field->pack_length=	dup_field->pack_length;
+          sql_field->key_length=	dup_field->key_length;
+	  sql_field->decimals=		dup_field->decimals;
+	  sql_field->create_length_to_internal_length();
+	  sql_field->unireg_check=	dup_field->unireg_check;
+          /* 
+            We're making one field from two, the result field will have
+            dup_field->flags as flags. If we've incremented null_fields
+            because of sql_field->flags, decrement it back.
+          */
+          if (!(sql_field->flags & NOT_NULL_FLAG))
+            null_fields--;
+	  sql_field->flags=		dup_field->flags;
+          sql_field->interval=          dup_field->interval;
+	  it2.remove();			// Remove first (create) definition
+	  select_field_pos--;
+	  break;
+	}
+      }
+    }
+    /* Don't pack rows in old tables if the user has requested this */
+    if ((sql_field->flags & BLOB_FLAG) ||
+	(sql_field->sql_type == MYSQL_TYPE_VARCHAR &&
+         create_info->row_type != ROW_TYPE_FIXED))
+      (*db_options)|= HA_OPTION_PACK_RECORD;
+    it2.rewind();
+  }
+
+  /* record_offset will be increased with 'length-of-null-bits' later */
+  record_offset= 0;
+  null_fields+= total_uneven_bit_length;
+
+  it.rewind();
+  while ((sql_field=it++))
+  {
+    DBUG_ASSERT(sql_field->charset != 0);
+
+    if (prepare_create_field(sql_field, &blob_columns, 
+			     &timestamps, &timestamps_with_niladic,
+			     file->ha_table_flags()))
+      DBUG_RETURN(TRUE);
+    if (sql_field->sql_type == MYSQL_TYPE_VARCHAR)
+      create_info->varchar= TRUE;
+    sql_field->offset= record_offset;
+    if (MTYP_TYPENR(sql_field->unireg_check) == Field::NEXT_NUMBER)
+      auto_increment++;
+    record_offset+= sql_field->pack_length;
+  }
+  if (timestamps_with_niladic > 1)
+  {
+    my_message(ER_TOO_MUCH_AUTO_TIMESTAMP_COLS,
+               ER(ER_TOO_MUCH_AUTO_TIMESTAMP_COLS), MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+  if (auto_increment > 1)
+  {
+    my_message(ER_WRONG_AUTO_KEY, ER(ER_WRONG_AUTO_KEY), MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+  if (auto_increment &&
+      (file->ha_table_flags() & HA_NO_AUTO_INCREMENT))
+  {
+    my_message(ER_TABLE_CANT_HANDLE_AUTO_INCREMENT,
+               ER(ER_TABLE_CANT_HANDLE_AUTO_INCREMENT), MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+
+  if (blob_columns && (file->ha_table_flags() & HA_NO_BLOBS))
+  {
+    my_message(ER_TABLE_CANT_HANDLE_BLOB, ER(ER_TABLE_CANT_HANDLE_BLOB),
+               MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+
+  /* Create keys */
+
+  List_iterator<Key> key_iterator(alter_info->key_list);
+  List_iterator<Key> key_iterator2(alter_info->key_list);
+  uint key_parts=0, fk_key_count=0;
+  bool primary_key=0,unique_key=0;
+  Key *key, *key2;
+  uint tmp, key_number;
+  /* special marker for keys to be ignored */
+  static char ignore_key[1];
+
+  /* Calculate number of key segements */
+  *key_count= 0;
+  
+  while ((key=key_iterator++))
+  {
+    DBUG_PRINT("info", ("key name: '%s'  type: %d", key->DOT_STR(name) ? key->DOT_STR(name) :
+                        "(none)" , key->type));
+    LEX_STRING key_name_str;
+    if (key->type == Key::FOREIGN_KEY)
+    {
+      fk_key_count++;
+      Foreign_key *fk_key= (Foreign_key*) key;
+      if (fk_key->ref_columns.elements &&
+	  fk_key->ref_columns.elements != fk_key->columns.elements)
+      {
+        my_error(ER_WRONG_FK_DEF, MYF(0),
+                 (fk_key->DOT_STR(name) ?  fk_key->DOT_STR(name) : "foreign key without name"),
+                 ER(ER_KEY_REF_DO_NOT_MATCH_TABLE_REF));
+	DBUG_RETURN(TRUE);
+      }
+      continue;
+    }
+    (*key_count)++;
+    tmp=file->max_key_parts();
+    if (key->columns.elements > tmp)
+    {
+      my_error(ER_TOO_MANY_KEY_PARTS,MYF(0),tmp);
+      DBUG_RETURN(TRUE);
+    }
+    key_name_str.str= (char*) key->DOT_STR(name);
+    key_name_str.length= key->DOT_STR(name) ? strlen(key->DOT_STR(name)) : 0;
+    if (check_string_char_length(&key_name_str, "", NAME_CHAR_LEN,
+                                 system_charset_info, 1))
+    {
+      my_error(ER_TOO_LONG_IDENT, MYF(0), key->DOT_STR(name));
+      DBUG_RETURN(TRUE);
+    }
+    key_iterator2.rewind ();
+    if (key->type != Key::FOREIGN_KEY)
+    {
+      while ((key2 = key_iterator2++) != key)
+      {
+	/*
+          foreign_key_prefix(key, key2) returns 0 if key or key2, or both, is
+          'generated', and a generated key is a prefix of the other key.
+          Then we do not need the generated shorter key.
+        */
+        if ((key2->type != Key::FOREIGN_KEY &&
+             key2->DOT_STR(name) != ignore_key &&
+             !foreign_key_prefix(key, key2)))
+        {
+          /* TODO: issue warning message */
+          /* mark that the generated key should be ignored */
+          if (!key2->generated ||
+              (key->generated && key->columns.elements <
+               key2->columns.elements))
+            key->DOT_STR(name)= ignore_key;
+          else
+          {
+            key2->DOT_STR(name)= ignore_key;
+            key_parts-= key2->columns.elements;
+            (*key_count)--;
+          }
+          break;
+        }
+      }
+    }
+    if (key->DOT_STR(name) != ignore_key)
+      key_parts+=key->columns.elements;
+    else
+      (*key_count)--;
+    if (key->DOT_STR(name) && !tmp_table && (key->type != Key::PRIMARY) &&
+	!my_strcasecmp(system_charset_info,key->DOT_STR(name),primary_key_name))
+    {
+      my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key->DOT_STR(name));
+      DBUG_RETURN(TRUE);
+    }
+  }
+  tmp=file->max_keys();
+  if (*key_count > tmp)
+  {
+    my_error(ER_TOO_MANY_KEYS,MYF(0),tmp);
+    DBUG_RETURN(TRUE);
+  }
+
+  (*key_info_buffer)= key_info= (KEY*) sql_calloc(sizeof(KEY) * (*key_count));
+  key_part_info=(KEY_PART_INFO*) sql_calloc(sizeof(KEY_PART_INFO)*key_parts);
+  if (!*key_info_buffer || ! key_part_info)
+    DBUG_RETURN(TRUE);				// Out of memory
+
+  key_iterator.rewind();
+  key_number=0;
+  for (; (key=key_iterator++) ; key_number++)
+  {
+    uint key_length=0;
+    Key_part_spec *column;
+
+    if (key->DOT_STR(name) == ignore_key)
+    {
+      /* ignore redundant keys */
+      do
+	key=key_iterator++;
+      while (key && key->DOT_STR(name) == ignore_key);
+      if (!key)
+	break;
+    }
+
+    switch (key->type) {
+    case Key::MULTIPLE:
+	key_info->flags= 0;
+	break;
+    case Key::FULLTEXT:
+	key_info->flags= HA_FULLTEXT;
+	if ((key_info->parser_name= &key->key_create_info.parser_name)->str)
+          key_info->flags|= HA_USES_PARSER;
+        else
+          key_info->parser_name= 0;
+	break;
+    case Key::SPATIAL:
+#ifdef HAVE_SPATIAL
+	key_info->flags= HA_SPATIAL;
+	break;
+#else
+	my_error(ER_FEATURE_DISABLED, MYF(0),
+                 sym_group_geom.name, sym_group_geom.needed_define);
+	DBUG_RETURN(TRUE);
+#endif
+    case Key::FOREIGN_KEY:
+      key_number--;				// Skip this key
+      continue;
+    default:
+      key_info->flags = HA_NOSAME;
+      break;
+    }
+    if (key->generated)
+      key_info->flags|= HA_GENERATED_KEY;
+
+    key_info->key_parts=(uint8) key->columns.elements;
+    key_info->key_part=key_part_info;
+    key_info->usable_key_parts= key_number;
+    key_info->algorithm= key->key_create_info.algorithm;
+
+    if (key->type == Key::FULLTEXT)
+    {
+      if (!(file->ha_table_flags() & HA_CAN_FULLTEXT))
+      {
+	my_message(ER_TABLE_CANT_HANDLE_FT, ER(ER_TABLE_CANT_HANDLE_FT),
+                   MYF(0));
+	DBUG_RETURN(TRUE);
+      }
+    }
+    /*
+       Make SPATIAL to be RTREE by default
+       SPATIAL only on BLOB or at least BINARY, this
+       actually should be replaced by special GEOM type
+       in near future when new frm file is ready
+       checking for proper key parts number:
+    */
+
+    /* TODO: Add proper checks if handler supports key_type and algorithm */
+    if (key_info->flags & HA_SPATIAL)
+    {
+      if (!(file->ha_table_flags() & HA_CAN_RTREEKEYS))
+      {
+        my_message(ER_TABLE_CANT_HANDLE_SPKEYS, ER(ER_TABLE_CANT_HANDLE_SPKEYS),
+                   MYF(0));
+        DBUG_RETURN(TRUE);
+      }
+      if (key_info->key_parts != 1)
+      {
+	my_error(ER_WRONG_ARGUMENTS, MYF(0), "SPATIAL INDEX");
+	DBUG_RETURN(TRUE);
+      }
+    }
+    else if (key_info->algorithm == HA_KEY_ALG_RTREE)
+    {
+#ifdef HAVE_RTREE_KEYS
+      if ((key_info->key_parts & 1) == 1)
+      {
+	my_error(ER_WRONG_ARGUMENTS, MYF(0), "RTREE INDEX");
+	DBUG_RETURN(TRUE);
+      }
+      /* TODO: To be deleted */
+      my_error(ER_NOT_SUPPORTED_YET, MYF(0), "RTREE INDEX");
+      DBUG_RETURN(TRUE);
+#else
+      my_error(ER_FEATURE_DISABLED, MYF(0),
+               sym_group_rtree.name, sym_group_rtree.needed_define);
+      DBUG_RETURN(TRUE);
+#endif
+    }
+
+    /* Take block size from key part or table part */
+    /*
+      TODO: Add warning if block size changes. We can't do it here, as
+      this may depend on the size of the key
+    */
+    key_info->block_size= (key->key_create_info.block_size ?
+                           key->key_create_info.block_size :
+                           create_info->key_block_size);
+
+    if (key_info->block_size)
+      key_info->flags|= HA_USES_BLOCK_SIZE;
+
+    List_iterator<Key_part_spec> cols(key->columns), cols2(key->columns);
+    CHARSET_INFO *ft_key_charset=0;  // for FULLTEXT
+    for (uint column_nr=0 ; (column=cols++) ; column_nr++)
+    {
+      uint length;
+      Key_part_spec *dup_column;
+
+      it.rewind();
+      field=0;
+      while ((sql_field=it++) &&
+	     my_strcasecmp(system_charset_info,
+			   column->DOT_STR(field_name),
+			   sql_field->field_name))
+	field++;
+      if (!sql_field)
+      {
+	my_error(ER_KEY_COLUMN_DOES_NOT_EXITS, MYF(0), column->field_name);
+	DBUG_RETURN(TRUE);
+      }
+      while ((dup_column= cols2++) != column)
+      {
+        if (!my_strcasecmp(system_charset_info,
+	     	           column->DOT_STR(field_name), dup_column->DOT_STR(field_name)))
+	{
+	  my_printf_error(ER_DUP_FIELDNAME,
+			  ER(ER_DUP_FIELDNAME),MYF(0),
+			  column->field_name);
+	  DBUG_RETURN(TRUE);
+	}
+      }
+      cols2.rewind();
+      if (key->type == Key::FULLTEXT)
+      {
+	if ((sql_field->sql_type != MYSQL_TYPE_STRING &&
+	     sql_field->sql_type != MYSQL_TYPE_VARCHAR &&
+	     !f_is_blob(sql_field->pack_flag)) ||
+	    sql_field->charset == &my_charset_bin ||
+	    sql_field->charset->mbminlen > 1 || // ucs2 doesn't work yet
+	    (ft_key_charset && sql_field->charset != ft_key_charset))
+	{
+	    my_error(ER_BAD_FT_COLUMN, MYF(0), column->field_name);
+	    DBUG_RETURN(-1);
+	}
+	ft_key_charset=sql_field->charset;
+	/*
+	  for fulltext keys keyseg length is 1 for blobs (it's ignored in ft
+	  code anyway, and 0 (set to column width later) for char's. it has
+	  to be correct col width for char's, as char data are not prefixed
+	  with length (unlike blobs, where ft code takes data length from a
+	  data prefix, ignoring column->length).
+	*/
+	column->length=test(f_is_blob(sql_field->pack_flag));
+      }
+      else
+      {
+	column->length*= sql_field->charset->mbmaxlen;
+
+        if (key->type == Key::SPATIAL && column->length)
+        {
+          my_error(ER_WRONG_SUB_KEY, MYF(0));
+	  DBUG_RETURN(TRUE);
+	}
+
+	if (f_is_blob(sql_field->pack_flag) ||
+            (f_is_geom(sql_field->pack_flag) && key->type != Key::SPATIAL))
+	{
+	  if (!(file->ha_table_flags() & HA_CAN_INDEX_BLOBS))
+	  {
+	    my_error(ER_BLOB_USED_AS_KEY, MYF(0), column->field_name);
+	    DBUG_RETURN(TRUE);
+	  }
+          if (f_is_geom(sql_field->pack_flag) && sql_field->geom_type ==
+              Field::GEOM_POINT)
+            column->length= 25;
+	  if (!column->length)
+	  {
+	    my_error(ER_BLOB_KEY_WITHOUT_LENGTH, MYF(0), column->field_name);
+	    DBUG_RETURN(TRUE);
+	  }
+	}
+#ifdef HAVE_SPATIAL
+	if (key->type == Key::SPATIAL)
+	{
+	  if (!column->length)
+	  {
+	    /*
+              4 is: (Xmin,Xmax,Ymin,Ymax), this is for 2D case
+              Lately we'll extend this code to support more dimensions
+	    */
+	    column->length= 4*sizeof(double);
+	  }
+	}
+#endif
+	if (!(sql_field->flags & NOT_NULL_FLAG))
+	{
+	  if (key->type == Key::PRIMARY)
+	  {
+	    /* Implicitly set primary key fields to NOT NULL for ISO conf. */
+	    sql_field->flags|= NOT_NULL_FLAG;
+	    sql_field->pack_flag&= ~FIELDFLAG_MAYBE_NULL;
+            null_fields--;
+	  }
+	  else
+          {
+            key_info->flags|= HA_NULL_PART_KEY;
+            if (!(file->ha_table_flags() & HA_NULL_IN_KEY))
+            {
+              my_error(ER_NULL_COLUMN_IN_INDEX, MYF(0), column->field_name);
+              DBUG_RETURN(TRUE);
+            }
+            if (key->type == Key::SPATIAL)
+            {
+              my_message(ER_SPATIAL_CANT_HAVE_NULL,
+                         ER(ER_SPATIAL_CANT_HAVE_NULL), MYF(0));
+              DBUG_RETURN(TRUE);
+            }
+          }
+	}
+	if (MTYP_TYPENR(sql_field->unireg_check) == Field::NEXT_NUMBER)
+	{
+	  if (column_nr == 0 || (file->ha_table_flags() & HA_AUTO_PART_KEY))
+	    auto_increment--;			// Field is used
+	}
+      }
+
+      key_part_info->fieldnr= field;
+      key_part_info->offset=  (uint16) sql_field->offset;
+      key_part_info->key_type=sql_field->pack_flag;
+      length= sql_field->key_length;
+
+      if (column->length)
+      {
+	if (f_is_blob(sql_field->pack_flag))
+	{
+	  if ((length=column->length) > max_key_length ||
+	      length > file->max_key_part_length())
+	  {
+	    length=min(max_key_length, file->max_key_part_length());
+	    if (key->type == Key::MULTIPLE)
+	    {
+	      /* not a critical problem */
+	      char warn_buff[MYSQL_ERRMSG_SIZE];
+	      my_snprintf(warn_buff, sizeof(warn_buff), ER(ER_TOO_LONG_KEY),
+			  length);
+	      push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			   ER_TOO_LONG_KEY, warn_buff);
+              /* Align key length to multibyte char boundary */
+              length-= length % sql_field->charset->mbmaxlen;
+	    }
+	    else
+	    {
+	      my_error(ER_TOO_LONG_KEY,MYF(0),length);
+	      DBUG_RETURN(TRUE);
+	    }
+	  }
+	}
+	else if (!f_is_geom(sql_field->pack_flag) &&
+		  (column->length > length ||
+                   !Field::type_can_have_key_part (sql_field->sql_type) ||
+		   ((f_is_packed(sql_field->pack_flag) ||
+		     ((file->ha_table_flags() & HA_NO_PREFIX_CHAR_KEYS) &&
+		      (key_info->flags & HA_NOSAME))) &&
+		    column->length != length)))
+	{
+	  my_message(ER_WRONG_SUB_KEY, ER(ER_WRONG_SUB_KEY), MYF(0));
+	  DBUG_RETURN(TRUE);
+	}
+	else if (!(file->ha_table_flags() & HA_NO_PREFIX_CHAR_KEYS))
+	  length=column->length;
+      }
+      else if (length == 0)
+      {
+	my_error(ER_WRONG_KEY_COLUMN, MYF(0), column->field_name);
+	  DBUG_RETURN(TRUE);
+      }
+      if (length > file->max_key_part_length() && key->type != Key::FULLTEXT)
+      {
+        length= file->max_key_part_length();
+	if (key->type == Key::MULTIPLE)
+	{
+	  /* not a critical problem */
+	  char warn_buff[MYSQL_ERRMSG_SIZE];
+	  my_snprintf(warn_buff, sizeof(warn_buff), ER(ER_TOO_LONG_KEY),
+		      length);
+	  push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+		       ER_TOO_LONG_KEY, warn_buff);
+          /* Align key length to multibyte char boundary */
+          length-= length % sql_field->charset->mbmaxlen;
+	}
+	else
+	{
+	  my_error(ER_TOO_LONG_KEY,MYF(0),length);
+	  DBUG_RETURN(TRUE);
+	}
+      }
+      key_part_info->length=(uint16) length;
+      /* Use packed keys for long strings on the first column */
+      if (!((*db_options) & HA_OPTION_NO_PACK_KEYS) &&
+	  (length >= KEY_DEFAULT_PACK_LENGTH &&
+	   (sql_field->sql_type == MYSQL_TYPE_STRING ||
+	    sql_field->sql_type == MYSQL_TYPE_VARCHAR ||
+	    sql_field->pack_flag & FIELDFLAG_BLOB)))
+      {
+	if ((column_nr == 0 && (sql_field->pack_flag & FIELDFLAG_BLOB)) ||
+            sql_field->sql_type == MYSQL_TYPE_VARCHAR)
+	  key_info->flags|= HA_BINARY_PACK_KEY | HA_VAR_LENGTH_KEY;
+	else
+	  key_info->flags|= HA_PACK_KEY;
+      }
+      /* Check if the key segment is partial, set the key flag accordingly */
+      if (length != sql_field->key_length)
+        key_info->flags|= HA_KEY_HAS_PART_KEY_SEG;
+
+      key_length+=length;
+      key_part_info++;
+
+      /* Create the key name based on the first column (if not given) */
+      if (column_nr == 0)
+      {
+	if (key->type == Key::PRIMARY)
+	{
+	  if (primary_key)
+	  {
+	    my_message(ER_MULTIPLE_PRI_KEY, ER(ER_MULTIPLE_PRI_KEY),
+                       MYF(0));
+	    DBUG_RETURN(TRUE);
+	  }
+	  key_name=primary_key_name;
+	  primary_key=1;
+	}
+	else if (!(key_name = key->DOT_STR(name)))
+	  key_name=make_unique_key_name(sql_field->field_name,
+					*key_info_buffer, key_info);
+	if (check_if_keyname_exists(key_name, *key_info_buffer, key_info))
+	{
+	  my_error(ER_DUP_KEYNAME, MYF(0), key_name);
+	  DBUG_RETURN(TRUE);
+	}
+	key_info->name=(char*) key_name;
+      }
+    }
+    if (!key_info->name || check_column_name(key_info->name))
+    {
+      my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0), key_info->name);
+      DBUG_RETURN(TRUE);
+    }
+    if (!(key_info->flags & HA_NULL_PART_KEY))
+      unique_key=1;
+    key_info->key_length=(uint16) key_length;
+    if (key_length > max_key_length && key->type != Key::FULLTEXT)
+    {
+      my_error(ER_TOO_LONG_KEY,MYF(0),max_key_length);
+      DBUG_RETURN(TRUE);
+    }
+    key_info++;
+  }
+  if (!unique_key && !primary_key &&
+      (file->ha_table_flags() & HA_REQUIRE_PRIMARY_KEY))
+  {
+    my_message(ER_REQUIRES_PRIMARY_KEY, ER(ER_REQUIRES_PRIMARY_KEY), MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+  if (auto_increment > 0)
+  {
+    my_message(ER_WRONG_AUTO_KEY, ER(ER_WRONG_AUTO_KEY), MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+  /* Sort keys in optimized order */
+  my_qsort((uchar*) *key_info_buffer, *key_count, sizeof(KEY),
+	   (qsort_cmp) sort_keys);
+  create_info->null_bits= null_fields;
+
+  /* Check fields. */
+  it.rewind();
+  while ((sql_field=it++))
+  {
+    Field::utype type= (Field::utype) MTYP_TYPENR(sql_field->unireg_check);
+
+    if (thd->variables.sql_mode & MODE_NO_ZERO_DATE &&
+        !sql_field->def &&
+        sql_field->sql_type == MYSQL_TYPE_TIMESTAMP &&
+        (sql_field->flags & NOT_NULL_FLAG) &&
+        (type == Field::NONE || type == Field::TIMESTAMP_UN_FIELD))
+    {
+      /*
+        An error should be reported if:
+          - NO_ZERO_DATE SQL mode is active;
+          - there is no explicit DEFAULT clause (default column value);
+          - this is a TIMESTAMP column;
+          - the column is not NULL;
+          - this is not the DEFAULT CURRENT_TIMESTAMP column.
+
+        In other words, an error should be reported if
+          - NO_ZERO_DATE SQL mode is active;
+          - the column definition is equivalent to
+            'column_name TIMESTAMP DEFAULT 0'.
+      */
+
+      my_error(ER_INVALID_DEFAULT, MYF(0), sql_field->field_name);
+      DBUG_RETURN(TRUE);
+    }
+  }
+
+  DBUG_RETURN(FALSE);
+}
+
+//////////////////////////////
+// mysql_create_table_no_lock() cut and pasted directly from sql_table.cc. (I did make it static after copying it.)
+
+/*
+  Create a table through an ha_xtsys handler.
+
+  SYNOPSIS
+    mysql_create_table_no_lock()
+    thd                 Thread handle
+    db                  Database name
+    table_name          Table name
+    create_info         Create information; table_options and table_existed
+                        are updated here
+    alter_info          Column and key lists describing the table
+    internal_tmp_table  Passed on to mysql_prepare_create_table() and used
+                        to select FN_IS_TMP when the file name is built
+    select_field_count  Passed on to mysql_prepare_create_table()
+
+  DESCRIPTION
+    The handler used is always a freshly allocated ha_xtsys (not the one
+    the server would pick for create_info). Once the handler has been
+    allocated every exit goes through the 'err' label so that it is
+    deleted again. LOCK_open is held from the on-disk existence check
+    until 'unlock_and_end'.
+
+    The statement is not written to the binary log (see the note near the
+    end of the function).
+
+  RETURN VALUES
+    FALSE  Table was created, or it already existed and
+           HA_LEX_CREATE_IF_NOT_EXISTS was given (a note is pushed and
+           create_info->table_existed is set)
+    TRUE   Error
+*/
+static bool mysql_create_table_no_lock(THD *thd,
+                                const char *db, const char *table_name,
+                                HA_CREATE_INFO *create_info,
+                                Alter_info *alter_info,
+                                bool internal_tmp_table,
+                                uint select_field_count)
+{
+  char			path[FN_REFLEN];
+  uint          path_length;
+  const char	*alias;
+  uint			db_options, key_count;
+  KEY			*key_info_buffer;
+  handler		*file;
+  bool			error= TRUE;
+  DBUG_ENTER("mysql_create_table_no_lock");
+  DBUG_PRINT("enter", ("db: '%s'  table: '%s'  tmp: %d",
+                       db, table_name, internal_tmp_table));
+
+
+  /* Check for duplicate fields and check type of table to create */
+  if (!alter_info->create_list.elements)
+  {
+    my_message(ER_TABLE_MUST_HAVE_COLUMNS, ER(ER_TABLE_MUST_HAVE_COLUMNS),
+               MYF(0));
+    DBUG_RETURN(TRUE);
+  }
+  if (check_engine(thd, table_name, create_info))
+    DBUG_RETURN(TRUE);
+  db_options= create_info->table_options;
+  if (create_info->row_type == ROW_TYPE_DYNAMIC)
+    db_options|=HA_OPTION_PACK_RECORD;
+  alias= table_case_name(create_info, table_name);
+
+  /* PMC - Done to avoid getting the partition handler by mistake! */
+  if (!(file= new (thd->mem_root) ha_xtsys(pbxt_hton, NULL)))
+  {
+    mem_alloc_error(sizeof(handler));
+    DBUG_RETURN(TRUE);
+  }
+
+  /*
+    From here on 'file' exists: never use DBUG_RETURN directly, always
+    leave through 'err' (or 'unlock_and_end' once LOCK_open is held).
+  */
+  set_table_default_charset(thd, create_info, (char*) db);
+
+  if (mysql_prepare_create_table(thd, create_info, alter_info,
+                                 internal_tmp_table,
+                                 &db_options, file,
+                                 &key_info_buffer, &key_count,
+                                 select_field_count))
+    goto err;
+
+  /* Build the path of the table definition file */
+  if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
+  {
+    path_length= build_tmptable_filename(thd, path, sizeof(path));
+    create_info->table_options|=HA_CREATE_DELAY_KEY_WRITE;
+  }
+  else  
+  {
+#ifdef FN_DEVCHAR
+    /* check if the table name contains FN_DEVCHAR when defined */
+    if (strchr(alias, FN_DEVCHAR))
+    {
+      my_error(ER_WRONG_TABLE_NAME, MYF(0), alias);
+      /* error is still TRUE here; go through err so 'file' is deleted */
+      goto err;
+    }
+#endif
+    path_length= build_table_filename(path, sizeof(path), db, alias, reg_ext,
+                                      internal_tmp_table ? FN_IS_TMP : 0);
+  }
+
+  /* Check if a temporary table with this name already exists */
+  if ((create_info->options & HA_LEX_CREATE_TMP_TABLE) &&
+      find_temporary_table(thd, db, table_name))
+  {
+    if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
+    {
+      create_info->table_existed= 1;		// Mark that table existed
+      push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+                          ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
+                          alias);
+      error= 0;
+      goto err;
+    }
+    my_error(ER_TABLE_EXISTS_ERROR, MYF(0), alias);
+    goto err;
+  }
+
+  pthread_mutex_lock(&LOCK_open);
+  if (!internal_tmp_table && !(create_info->options & HA_LEX_CREATE_TMP_TABLE))
+  {
+    if (!access(path,F_OK))
+    {
+      if (create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS)
+        goto warn;
+      my_error(ER_TABLE_EXISTS_ERROR,MYF(0),table_name);
+      goto unlock_and_end;
+    }
+    /*
+      We don't assert here, but check the result, because the table could be
+      in the table definition cache and in the same time the .frm could be
+      missing from the disk, in case of manual intervention which deletes
+      the .frm file. The user has to use FLUSH TABLES; to clear the cache.
+      Then she could create the table. This case is pretty obscure and
+      therefore we don't introduce a new error message only for it.
+    */
+    if (get_cached_table_share(db, alias))
+    {
+      my_error(ER_TABLE_EXISTS_ERROR, MYF(0), table_name);
+      goto unlock_and_end;
+    }
+  }
+
+  /*
+    Check that table with given name does not already
+    exist in any storage engine. In such a case it should
+    be discovered and the error ER_TABLE_EXISTS_ERROR be returned
+    unless user specified CREATE TABLE IF NOT EXISTS
+    The LOCK_open mutex has been locked to make sure no
+    one else is attempting to discover the table. Since
+    it's not on disk as a frm file, no one could be using it!
+  */
+  if (!(create_info->options & HA_LEX_CREATE_TMP_TABLE))
+  {
+    bool create_if_not_exists =
+      create_info->options & HA_LEX_CREATE_IF_NOT_EXISTS;
+    int retcode = ha_table_exists_in_engine(thd, db, table_name);
+    DBUG_PRINT("info", ("exists_in_engine: %d",retcode));
+    switch (retcode)
+    {
+      case HA_ERR_NO_SUCH_TABLE:
+        /* Normal case, no table exists. we can go and create it */
+        break;
+      case HA_ERR_TABLE_EXIST:
+        DBUG_PRINT("info", ("Table existed in handler"));
+
+        if (create_if_not_exists)
+          goto warn;
+        my_error(ER_TABLE_EXISTS_ERROR,MYF(0),table_name);
+        goto unlock_and_end;
+        break;
+      default:
+        DBUG_PRINT("info", ("error: %d from storage engine", retcode));
+        my_error(retcode, MYF(0),table_name);
+        goto unlock_and_end;
+    }
+  }
+
+  thd_proc_info(thd, "creating table");
+  create_info->table_existed= 0;		// Mark that table is created
+
+  create_info->table_options=db_options;
+
+  path[path_length - reg_ext_length]= '\0'; // Remove .frm extension
+  if (rea_create_table(thd, path, db, table_name,
+                       create_info, alter_info->create_list,
+                       key_count, key_info_buffer, file))
+    goto unlock_and_end;
+
+  if (create_info->options & HA_LEX_CREATE_TMP_TABLE)
+  {
+    /* Open table and put in temporary table list */
+#if MYSQL_VERSION_ID >= 50404
+    if (!(open_temporary_table(thd, path, db, table_name, 1, OTM_OPEN)))
+#else
+    if (!(open_temporary_table(thd, path, db, table_name, 1)))
+#endif
+    {
+      /* Could not open it: remove what rea_create_table() just created */
+#if MYSQL_VERSION_ID >= 50404
+      (void) rm_temporary_table(create_info->db_type, path, false);
+#else
+      (void) rm_temporary_table(create_info->db_type, path);
+#endif
+      goto unlock_and_end;
+    }
+    thd->thread_specific_used= TRUE;
+  }
+
+  /*
+    Don't write statement if:
+    - It is an internal temporary table,
+    - Row-based logging is used and it we are creating a temporary table, or
+    - The binary log is not open.
+    Otherwise, the statement shall be binlogged.
+   */
+  /* PBXT 1.0.09e
+   * Firstly we had a compile problem with MySQL 5.1.42 and
+   * the write_bin_log() call below:
+   * discover_xt.cc:1259: error: argument of type 'char* (Statement::)()' does not match 'const char*'
+   * 
+   * And secondly, we should not write the BINLOG anyway because this is
+   * an internal PBXT system table.
+   *
+   * So I am just commenting out the code altogether.
+  if (!internal_tmp_table &&
+      (!thd->current_stmt_binlog_row_based ||
+       (thd->current_stmt_binlog_row_based &&
+        !(create_info->options & HA_LEX_CREATE_TMP_TABLE))))
+    write_bin_log(thd, TRUE, thd->query, thd->query_length);
+   */
+  error= FALSE;
+unlock_and_end:
+  pthread_mutex_unlock(&LOCK_open);
+
+err:
+  /* Common exit: entered with and without LOCK_open having been taken */
+  thd_proc_info(thd, "After create");
+  delete file;
+  DBUG_RETURN(error);
+
+warn:
+  /* Table exists and IF NOT EXISTS was given: report a note, not an error */
+  error= FALSE;
+  push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE,
+                      ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR),
+                      alias);
+  create_info->table_existed= 1;		// Mark that table existed
+  goto unlock_and_end;
+}
+
+////////////////////////////////////////////////////////
+////// END OF CUT AND PASTES FROM  sql_table.cc ////////
+////////////////////////////////////////////////////////
+
+#endif // DRIZZLED
+#endif // LOCK_OPEN_HACK_REQUIRED
+
+//------------------------------
+/*
+ * Create the table definition file (".frm", or ".dfe" under Drizzle) for
+ * the table db/name, using the columns described by 'info'.
+ *
+ * hton           Storage engine handlerton stored in the create info.
+ * thd            Current thread; its LEX is temporarily replaced by a
+ *                private one and restored before returning.
+ * db, name       Database and table name.
+ * info           Array of column descriptions, terminated by an entry
+ *                whose field_name is NULL.
+ * keys           Currently unused.
+ * skip_existing  If set and the file "<db><XT_DIR_CHAR><name><ext>" already
+ *                exists, nothing is created and success is returned.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+int xt_create_table_frm(handlerton *hton, THD* thd, const char *db, const char *name, DT_FIELD_INFO *info, DT_KEY_INFO *XT_UNUSED(keys), xtBool skip_existing)
+{
+#ifdef DRIZZLED
+#define MYLEX_CREATE_INFO create_info
+#else
+#define MYLEX_CREATE_INFO mylex.create_info 
+#endif
+
+#ifdef DRIZZLED
+	drizzled::statement::AlterTable *stmt = new drizzled::statement::AlterTable(thd);
+	HA_CREATE_INFO create_info;
+	//AlterInfo alter_info;
+	drizzled::message::Table table_proto;
+
+	static const char *ext = ".dfe";
+	static const int ext_len = 4;
+
+	table_proto.mutable_engine()->mutable_name()->assign("PBXT");
+#else
+	static const char *ext = ".frm";
+	static const int ext_len = 4;
+#endif
+	int err = 1;
+	/* 12 bytes hold any 32-bit unsigned value (10 digits) plus the terminator. */
+	char field_length_buffer[12], *field_length_ptr;
+	LEX  *save_lex= thd->lex, mylex;
+
+	memset(&MYLEX_CREATE_INFO, 0, sizeof(HA_CREATE_INFO));
+
+	/* Work on a private LEX so that the statement being executed is not disturbed. */
+	thd->lex = &mylex;
+	lex_start(thd);
+#ifdef DRIZZLED
+        mylex.statement = stmt;
+#endif
+	
+	/* setup the create info */
+	MYLEX_CREATE_INFO.db_type = hton;
+
+#ifndef DRIZZLED 
+	mylex.create_info.frm_only = 1;
+#endif
+ 	MYLEX_CREATE_INFO.default_table_charset = system_charset_info;
+	
+	/* setup the column info. */
+	while (info->field_name) {		
+		 LEX_STRING field_name, comment;		 
+		 field_name.str = (char*)(info->field_name);
+		 field_name.length = strlen(info->field_name);
+		 
+		 comment.str = (char*)(info->comment);
+		 comment.length = strlen(info->comment);
+		 			
+		 /* The length is passed on as a decimal string; 0 means "no length". */
+		 if (info->field_length) {
+			/* field_length is unsigned, so it must be formatted with %u. */
+			sprintf(field_length_buffer, "%u", (unsigned int) info->field_length);
+			field_length_ptr = field_length_buffer;
+		 } else 
+			field_length_ptr = NULL;
+
+#ifdef DRIZZLED
+		if (add_field_to_list(thd, &field_name, info->field_type, field_length_ptr, info->field_decimal_length,
+			info->field_flags,
+            COLUMN_FORMAT_TYPE_FIXED,
+		    NULL /*default_value*/, NULL /*on_update_value*/, &comment, NULL /*change*/,
+            NULL /*interval_list*/, info->field_charset))
+#else
+		if (add_field_to_list(thd, &field_name, info->field_type, field_length_ptr, info->field_decimal_length,
+			info->field_flags,
+#if MYSQL_VERSION_ID >= 50404
+				HA_SM_DISK,
+				COLUMN_FORMAT_TYPE_FIXED,
+#endif
+		       NULL /*default_value*/, NULL /*on_update_value*/, &comment, NULL /*change*/, 
+		       NULL /*interval_list*/, info->field_charset, 0 /*uint_geom_type*/
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID > 50200
+		       , NULL /*vcol_info*/, NULL /* create options */
+#endif
+		       )) 
+#endif
+			goto error;
+
+
+		info++;
+	}
+
+	if (skip_existing) {
+		/* Build "<db><dir-char><name><ext>" and test whether it exists. */
+		size_t db_len = strlen(db);
+		size_t name_len = strlen(name);
+		size_t len = db_len + 1 + name_len + ext_len + 1;
+		char *path = (char *)xt_malloc_ns(len);
+		xtBool exists;
+
+		/* Allocation failed: report failure instead of writing through NULL. */
+		if (!path)
+			goto error;
+		memcpy(path, db, db_len);
+		memcpy(path + db_len + 1, name, name_len);
+		memcpy(path + db_len + 1 + name_len, ext, ext_len);
+		path[db_len] = XT_DIR_CHAR;
+		path[len - 1] = '\0';
+		exists = xt_fs_exists(path);
+		xt_free_ns(path);
+		if (exists)
+			goto noerror;
+	}
+	
+	/* Create an internal temp table */
+#ifdef DRIZZLED
+    table_proto.set_name(name);
+    table_proto.set_type(drizzled::message::Table::STANDARD);
+
+	if (mysql_create_table_no_lock(thd, db, name, &create_info, &table_proto, &stmt->alter_info, 1, 0)) 
+		goto error;
+#else
+	bool create_failed;
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+	partition_info *part_info;
+
+	/* Hide the partition info of the running statement during the create. */
+	part_info = thd->work_part_info;
+	thd->work_part_info = NULL;
+#endif
+	create_failed = mysql_create_table_no_lock(thd, db, name, &mylex.create_info, &mylex.alter_info, 1, 0);
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+	/* Restore on success AND on failure, so that the caller's state is never lost. */
+	thd->work_part_info = part_info;
+#endif
+	if (create_failed)
+		goto error;
+#endif
+
+	noerror:
+	err = 0;
+
+	error:
+	/* Common exit: dispose of the private LEX and reinstate the original one. */
+	lex_end(&mylex);
+	thd->lex = save_lex;
+	return err;
+}
+
diff --git a/storage/pbxt/src/discover_xt.h b/storage/pbxt/src/discover_xt.h
new file mode 100644
index 00000000000..733974ad59f
--- /dev/null
+++ b/storage/pbxt/src/discover_xt.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *  Created by Leslie on 8/27/08.
+ *
+ */
+
+#ifndef __DISCOVER_XT_H__
+#define __DISCOVER_XT_H__
+
+#ifdef DRIZZLED
+#include <drizzled/common.h>
+#else
+#include "mysql_priv.h"
+#endif
+
+/*
+ * ---------------------------------------------------------------
+ * TABLE DISCOVERY HANDLER
+ */
+
+/* Description of one column of a table created by xt_create_table_frm(). */
+typedef struct dt_field_info {
+	/** 
+	The column name. A NULL name terminates an array of these records.
+	*/
+	const char* field_name;
+	/**
+	For string-type columns, this is the maximum number of
+	characters. Set to 0 when no length applies (e.g. numeric data).
+	*/
+	uint field_length;
+
+	/**
+	For decimal columns, this is the maximum number of
+	digits after the decimal point, given as a string. For other
+	data this can be NULL.
+	*/
+	char* field_decimal_length;
+	/**
+	This denotes the data type of the column. For the most part, there seems to
+	be one entry in the enum for each SQL data type, although there seem to
+	be a number of additional entries in the enum.
+	*/
+	enum enum_field_types field_type;
+
+	/**
+	This is the character set for non-numeric data types, including blob data.
+	*/
+	CHARSET_INFO *field_charset;
+
+	uint field_flags;        // Field attributes (maybe_null, signed, unsigned etc.)
+	const char* comment;     // Column comment; xt_create_table_frm() calls strlen() on it, so it must not be NULL.
+} DT_FIELD_INFO;
+
+/*
+ * Description of one key (index) of a discovered table.
+ * NOTE(review): xt_create_table_frm() declares its 'keys' parameter as
+ * unused, so these records are presumably consumed elsewhere -- confirm.
+ */
+typedef struct dt_key_info
+{
+	const char*	key_name;		/* Name of the key. */
+	uint		key_type; /* PRI_KEY_FLAG, UNIQUE_KEY_FLAG, MULTIPLE_KEY_FLAG */
+	const char*	key_columns[8]; // The size of this can be set to whatever you need.
+} DT_KEY_INFO;
+
+int xt_create_table_frm(handlerton *hton, THD* thd, const char *db, const char *name, DT_FIELD_INFO *info, DT_KEY_INFO *keys, xtBool skip_existing);
+
+#endif
+
diff --git a/storage/pbxt/src/filesys_xt.cc b/storage/pbxt/src/filesys_xt.cc
new file mode 100644
index 00000000000..31e2cf961b6
--- /dev/null
+++ b/storage/pbxt/src/filesys_xt.cc
@@ -0,0 +1,1793 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-12	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#ifndef XT_WIN
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/mman.h>
+#endif
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <ctype.h>
+#include <string.h>
+#include <errno.h>
+
+#include "strutil_xt.h"
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "filesys_xt.h"
+#include "memory_xt.h"
+#include "cache_xt.h"
+#include "sortedlist_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+//#define DEBUG_PRINT_IO
+//#define DEBUG_TRACE_IO
+//#define DEBUG_TRACE_MAP_IO
+//#define DEBUG_TRACE_FILES
+//#define INJECT_WRITE_REMAP_ERROR
+/* This is required to make testing on the Mac faster: */
+/* It turns off full file sync. */
+#define DEBUG_FAST_MAC
+#endif
+
+#ifdef DEBUG_TRACE_FILES
+//#define PRINTF		xt_ftracef
+#define PRINTF		xt_trace
+#endif
+
+#ifdef INJECT_WRITE_REMAP_ERROR
+#define INJECT_REMAP_FILE_SIZE			1000000
+#define INJECT_REMAP_FILE_TYPE			"xtd"
+#endif
+
+/* ----------------------------------------------------------------------
+ * Globals
+ */
+
+/* Module-wide state of the file system layer (single instance: fs_globals). */
+typedef struct FsGlobals {
+	xt_mutex_type		*fsg_lock;						/* Set by xt_fs_init() to the lock of fsg_open_files. */
+	u_int				fsg_current_id;					/* Next file ID to hand out; 0 is never assigned. */
+	XTSortedListPtr		fsg_open_files;					/* Sorted list of XTFilePtr, keyed by file path. */
+} FsGlobalsRec;
+
+static FsGlobalsRec	fs_globals;
+
+#ifdef XT_WIN
+static int fs_get_win_error()
+{
+	return (int) GetLastError();
+}
+
+/*
+ * Copy the system message text for the Windows error code 'err' into
+ * 'buffer', which has room for 'size' characters.
+ *
+ * If the message cannot be retrieved, 'buffer' is set to the empty string
+ * so that callers never read undefined data. Nothing is done if 'buffer'
+ * is NULL or 'size' is 0.
+ */
+xtPublic void xt_get_win_message(char *buffer, size_t size, int err)
+{
+	if (!buffer || !size)
+		return;
+
+	/* FormatMessage() returns 0 on failure, leaving the buffer unspecified. */
+	if (!FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err,
+		MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+        buffer,
+        (DWORD) size, NULL))
+		*buffer = 0;
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Open file list
+ */
+
+/*
+ * Allocate and initialise a new file record for the path 'file'.
+ * The record gets a private copy of the path, the next file ID, no
+ * file descriptor and a handle count of 0.
+ * The allocation is registered on the thread's cleanup stack while the
+ * path is duplicated, so the record is freed if xt_dup_string() throws.
+ */
+static XTFilePtr fs_new_file(XTThreadPtr self, char *file)
+{
+	XTFilePtr file_ptr;
+
+	pushsr_(file_ptr, xt_free, (XTFilePtr) xt_calloc(self, sizeof(XTFileRec)));
+
+	file_ptr->fil_path = xt_dup_string(self, file);
+	file_ptr->fil_id = fs_globals.fsg_current_id++;
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: allocated file: (%d) %s\n", self->t_name, (int) file_ptr->fil_id, xt_last_2_names_of_path(file_ptr->fil_path));
+#endif
+	/* The counter wrapped around to 0: skip it, so that ID 0 is never handed out. */
+	if (!fs_globals.fsg_current_id)
+		fs_globals.fsg_current_id++;
+	file_ptr->fil_filedes = XT_NULL_FD;
+	file_ptr->fil_handle_count = 0;
+
+	popr_(); // Discard xt_free(file_ptr)
+	return file_ptr;
+}
+
+/*
+ * Tear down a file memory map: flush and unmap the mapped region (if any),
+ * release the map's lock and free the map structure itself.
+ * The results of the flush/unmap calls are not checked.
+ */
+static void fs_close_fmap(XTThreadPtr self, XTFileMemMapPtr mm)
+{
+#ifdef XT_WIN
+	if (mm->mm_start) {
+		FlushViewOfFile(mm->mm_start, 0);
+		UnmapViewOfFile(mm->mm_start);
+		mm->mm_start = NULL;
+	}
+	/* On Windows the mapping object has its own handle, which must also be closed. */
+	if (mm->mm_mapdes != NULL) {
+		CloseHandle(mm->mm_mapdes);
+		mm->mm_mapdes = NULL;
+	}
+#else
+	if (mm->mm_start) {
+		/* Synchronously write the mapped pages back before unmapping. */
+		msync( (char *)mm->mm_start, (size_t) mm->mm_length, MS_SYNC);
+		munmap((caddr_t) mm->mm_start, (size_t) mm->mm_length);
+		mm->mm_start = NULL;
+	}
+#endif
+	FILE_MAP_FREE_LOCK(self, &mm->mm_lock);
+	xt_free(self, mm);
+}
+
+/*
+ * Free callback of the open-file list (installed by xt_fs_init()).
+ * 'item' points to the list slot, which holds an XTFilePtr.
+ * The shared file descriptor is closed if open. The record and its path
+ * are freed only if the reference count is zero.
+ */
+static void fs_free_file(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
+{
+	XTFilePtr	file_ptr = *((XTFilePtr *) item);
+
+	if (file_ptr->fil_filedes != XT_NULL_FD) {
+#ifdef DEBUG_TRACE_FILES
+		PRINTF("%s: close file: (%d) %s\n", self->t_name, (int) file_ptr->fil_id, xt_last_2_names_of_path(file_ptr->fil_path));
+#endif
+#ifdef XT_WIN
+		CloseHandle(file_ptr->fil_filedes);
+#else
+		close(file_ptr->fil_filedes);
+#endif
+		//PRINTF("close (FILE) %d %s\n", file_ptr->fil_filedes, file_ptr->fil_path);
+		file_ptr->fil_filedes = XT_NULL_FD;
+	}
+
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: free file: (%d) %s\n", self->t_name, (int) file_ptr->fil_id, 
+		file_ptr->fil_path ? xt_last_2_names_of_path(file_ptr->fil_path) : "?");
+#endif
+
+	if (!file_ptr->fil_ref_count) {
+		ASSERT_NS(!file_ptr->fil_handle_count);
+		/* NOTE(review): the comment here used to mention flushing a cache,
+		 * but no flush is done; only the path and the record are freed.
+		 */
+		if (file_ptr->fil_path) {
+			xt_free(self, file_ptr->fil_path);
+			file_ptr->fil_path = NULL;
+		}
+
+		xt_free(self, file_ptr);
+	}
+}
+
+/*
+ * Case-sensitive comparator for the open-file list: 'a' is the path being
+ * searched for, 'b' points to a list slot holding an XTFilePtr.
+ */
+static int fs_comp_file(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	XTFilePtr	entry = *((XTFilePtr *) b);
+
+	return strcmp((char *) a, entry->fil_path);
+}
+
+/*
+ * Case-insensitive comparator for the open-file list: 'a' is the path being
+ * searched for, 'b' points to a list slot holding an XTFilePtr.
+ */
+static int fs_comp_file_ci(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	XTFilePtr	entry = *((XTFilePtr *) b);
+
+	return strcasecmp((char *) a, entry->fil_path);
+}
+
+/* ----------------------------------------------------------------------
+ * init & exit
+ */
+
+/*
+ * Initialise the file system layer: create the list of open files (ordered
+ * by path, case-insensitively if pbxt_ignore_case is set), remember its
+ * lock and start handing out file IDs at 1.
+ */
+xtPublic void xt_fs_init(XTThreadPtr self)
+{
+	XTSortedListPtr open_files;
+
+	open_files = xt_new_sortedlist(self,
+		sizeof(XTFilePtr), 20, 20,
+		pbxt_ignore_case ? fs_comp_file_ci : fs_comp_file,
+		NULL, fs_free_file, TRUE, FALSE);
+
+	fs_globals.fsg_open_files = open_files;
+	fs_globals.fsg_lock = open_files->sl_lock;
+	fs_globals.fsg_current_id = 1;
+}
+
+/*
+ * Shut down the file system layer: free the list of open files (if it was
+ * created) and reset the globals.
+ */
+xtPublic void xt_fs_exit(XTThreadPtr self)
+{
+	XTSortedListPtr open_files = fs_globals.fsg_open_files;
+
+	if (open_files) {
+		xt_free_sortedlist(self, open_files);
+		fs_globals.fsg_open_files = NULL;
+	}
+
+	fs_globals.fsg_lock = NULL;
+	fs_globals.fsg_current_id = 0;
+}
+
+/* ----------------------------------------------------------------------
+ * File operations
+ */
+
+/*
+ * Give 'path' the same permission bits as its parent directory.
+ * The parent is the path with its last name removed, or "." if the path
+ * consists of a single name. Throws if stat() or chmod() fails.
+ * Ownership is not copied.
+ */
+static void fs_set_stats(XTThreadPtr self, char *path)
+{
+	char		parent_dir[PATH_MAX];
+	struct stat	parent_info;
+	char		*leaf;
+
+	if (xt_last_name_of_path(path) != path) {
+		/* Cut the last name off a copy of the path to get the parent. */
+		xt_strcpy(PATH_MAX, parent_dir, path);
+
+		leaf = xt_last_name_of_path(parent_dir);
+		if (leaf)
+			*leaf = 0;
+	}
+	else
+		strcpy(parent_dir, ".");
+
+	if (stat(parent_dir, &parent_info) == -1)
+		xt_throw_ferrno(XT_CONTEXT, errno, parent_dir);
+
+	if (chmod(path, parent_info.st_mode) == -1)
+		xt_throw_ferrno(XT_CONTEXT, errno, path);
+}
+
+/* Return the path stored in the file record that 'of' refers to. */
+xtPublic char *xt_file_path(struct XTFileRef *of)
+{
+	XTFilePtr file_ptr = of->fr_file;
+
+	return file_ptr->fil_path;
+}
+
+/*
+ * Return TRUE if access(path, F_OK) succeeds, FALSE otherwise
+ * (whatever the reason for the failure).
+ */
+xtBool xt_fs_exists(char *path)
+{
+	return access(path, F_OK) == -1 ? FALSE : TRUE;
+}
+
+/*
+ * Delete the file 'name'.
+ * No error is generated if the file does not exist.
+ * Any other failure is thrown.
+ */
+xtPublic xtBool xt_fs_delete(XTThreadPtr self, char *name)
+{
+	int err;
+
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: DELETE FILE: %s\n", xt_get_self()->t_name, xt_last_2_names_of_path(name));
+#endif
+#ifdef XT_WIN
+	//PRINTF("delete %s\n", name);
+	if (DeleteFile(name))
+		return OK;
+
+	err = fs_get_win_error();
+	if (XT_FILE_NOT_FOUND(err))
+		return OK;
+#else
+	if (unlink(name) != -1)
+		return OK;
+
+	err = errno;
+	if (err == ENOENT)
+		return OK;
+#endif
+	/* A real failure: anything other than "file not found". */
+	xt_throw_ferrno(XT_CONTEXT, err, name);
+	return FAILED;
+}
+
+xtPublic xtBool xt_fs_file_not_found(int err)
+{
+#ifdef XT_WIN
+	return XT_FILE_NOT_FOUND(err);
+#else
+	return err == ENOENT;
+#endif
+}
+
+/*
+ * Move the file 'from_path' to 'to_path'. Throws on failure.
+ * On Windows MoveFile() is used. Otherwise a hard link with the new name
+ * is created and the old name is then removed.
+ */
+xtPublic void xt_fs_move(struct XTThread *self, char *from_path, char *to_path)
+{
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: MOVE FILE: %s --> %s\n", xt_get_self()->t_name, xt_last_2_names_of_path(from_path), xt_last_2_names_of_path(to_path));
+#endif
+#ifdef XT_WIN
+	if (!MoveFile(from_path, to_path))
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), from_path);
+#else
+	int err;
+
+	if (link(from_path, to_path) == -1) {
+		err = errno;
+		xt_throw_ferrno(XT_CONTEXT, err, from_path);
+	}
+
+	if (unlink(from_path) == -1) {
+		/* Save errno first: the rollback unlink() below may overwrite it. */
+		err = errno;
+		/* Undo the link so that the file does not end up with two names. */
+		unlink(to_path);
+		xt_throw_ferrno(XT_CONTEXT, err, from_path);
+	}
+#endif
+}
+
+/*
+ * Rename 'from_path' to 'to_path' using rename().
+ * On failure the error is thrown, reported against 'from_path'.
+ */
+xtPublic xtBool xt_fs_rename(struct XTThread *self, char *from_path, char *to_path)
+{
+	int rename_errno;
+
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: RENAME FILE: %s --> %s\n", xt_get_self()->t_name, xt_last_2_names_of_path(from_path), xt_last_2_names_of_path(to_path));
+#endif
+	if (rename(from_path, to_path) != -1)
+		return OK;
+
+	rename_errno = errno;
+	xt_throw_ferrno(XT_CONTEXT, rename_errno, from_path);
+	return FAILED;
+}
+
+/*
+ * Retrieve the size and/or the modification time of the file 'path'.
+ * 'size' and 'mod_time' are optional outputs; pass NULL for the ones that
+ * are not required. Throws if the file cannot be examined.
+ */
+xtPublic xtBool xt_fs_stat(XTThreadPtr self, char *path, off_t *size, struct timespec *mod_time)
+{
+#ifdef XT_WIN
+	HANDLE						fh;
+	BY_HANDLE_FILE_INFORMATION	info;
+	SECURITY_ATTRIBUTES			sa = { sizeof(SECURITY_ATTRIBUTES), 0, 0 };
+
+	/* The file is opened just to query its information, then closed again. */
+	fh = CreateFile(
+		path,
+		GENERIC_READ,
+		FILE_SHARE_READ,
+		&sa,
+		OPEN_EXISTING,
+		FILE_ATTRIBUTE_NORMAL,
+		NULL);
+	if (fh == INVALID_HANDLE_VALUE) {
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), path);
+		return FAILED;
+	}
+
+	if (!GetFileInformationByHandle(fh, &info)) {
+		CloseHandle(fh);
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), path);
+		return FAILED;
+	}
+
+	CloseHandle(fh);
+	/* The size is delivered as two 32-bit halves. */
+	if (size)
+		*size = (off_t) info.nFileSizeLow | (((off_t) info.nFileSizeHigh) << 32);
+	if (mod_time)
+		mod_time->tv.ft = info.ftLastWriteTime;
+#else
+	struct stat sb;
+
+	if (stat(path, &sb) == -1) {
+		xt_throw_ferrno(XT_CONTEXT, errno, path);
+		return FAILED;
+	}
+	if (size)
+		*size = sb.st_size;
+	if (mod_time) {
+		mod_time->tv_sec = sb.st_mtime;
+		/* The name of the nanosecond field of struct stat differs per platform. */
+#ifdef XT_MAC
+		/* This is the Mac OS X version: */
+		mod_time->tv_nsec = sb.st_mtimespec.tv_nsec;
+#else
+#ifdef __USE_MISC
+		/* This is the Linux version: */
+		mod_time->tv_nsec = sb.st_mtim.tv_nsec;
+#else
+		/* Not supported? */
+		mod_time->tv_nsec = 0;
+#endif
+#endif
+	}
+#endif
+	return OK;
+}
+
+/*
+ * Create the directory 'name' (a trailing directory separator is removed
+ * first). Throws on failure.
+ * On non-Windows systems the new directory is then given the permissions
+ * of its parent (fs_set_stats()); if that fails, the directory is removed
+ * again and the error is re-thrown.
+ */
+void xt_fs_mkdir(XTThreadPtr self, char *name)
+{
+	char path[PATH_MAX];
+
+	xt_strcpy(PATH_MAX, path, name);
+	xt_remove_dir_char(path);
+
+#ifdef XT_WIN
+	{
+		SECURITY_ATTRIBUTES	sa = { sizeof(SECURITY_ATTRIBUTES), 0, 0 };
+
+		if (!CreateDirectory(path, &sa))
+			xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), path);
+	}
+#else
+	if (mkdir(path, S_IRWXU | S_IRWXG | S_IRWXO) == -1)
+		xt_throw_ferrno(XT_CONTEXT, errno, path);
+
+	try_(a) {
+		fs_set_stats(self, path);
+	}
+	catch_(a) {
+		/* NOTE(review): NULL is passed as 'self', presumably so that a failure
+		 * of the rollback does not replace the exception being re-thrown -- confirm.
+		 */
+		xt_fs_rmdir(NULL, name);
+		throw_();
+	}
+	cont_(a);
+#endif
+}
+
+/*
+ * Create the directory 'path', including any missing parent directories.
+ * Returns immediately if the path already exists.
+ * The buffer 'path' is modified temporarily during the call: a separator
+ * is overwritten with a terminator while the parent path is processed,
+ * and restored afterwards.
+ * NOTE(review): the separator is not restored if the recursive call
+ * throws -- callers should pass a scratch copy.
+ */
+void xt_fs_mkpath(XTThreadPtr self, char *path)
+{
+	char *ptr;
+
+	if (xt_fs_exists(path))
+		return;
+
+	if (!(ptr = (char *) xt_last_directory_of_path((c_char *) path)))
+		return;
+	if (ptr == path)
+		return;
+	/* Step back to the character before the last directory name. */
+	ptr--;
+	if (XT_IS_DIR_CHAR(*ptr)) {
+		/* Terminate the string at the separator to obtain the parent path. */
+		*ptr = 0;
+		xt_fs_mkpath(self, path);
+		*ptr = XT_DIR_CHAR;
+		xt_fs_mkdir(self, path);
+	}
+}
+
+/*
+ * Remove the directory 'name' (a trailing directory separator is removed
+ * first). It is not an error if the directory does not exist; any other
+ * failure is thrown.
+ */
+xtBool xt_fs_rmdir(XTThreadPtr self, char *name)
+{
+	char	path[PATH_MAX];
+	int		err;
+
+	xt_strcpy(PATH_MAX, path, name);
+	xt_remove_dir_char(path);
+
+#ifdef XT_WIN
+	if (RemoveDirectory(path))
+		return OK;
+
+	err = fs_get_win_error();
+	if (XT_FILE_NOT_FOUND(err))
+		return OK;
+#else
+	if (rmdir(path) != -1)
+		return OK;
+
+	err = errno;
+	if (err == ENOENT)
+		return OK;
+#endif
+	/* A real failure: anything other than "not found". */
+	xt_throw_ferrno(XT_CONTEXT, err, path);
+	return FAILED;
+}
+
+/* ----------------------------------------------------------------------
+ * Open & Close operations
+ */
+
+/*
+ * Return the shared file record for 'file_name', creating and inserting it
+ * into the open-file list if it is not yet there. The reference count of
+ * the record is incremented; release it with xt_fs_release_file().
+ * The open-file list is locked for the duration of the call.
+ */
+xtPublic XTFilePtr xt_fs_get_file(XTThreadPtr self, char *file_name)
+{
+	XTFilePtr	file_ptr, *file_pptr;
+
+	xt_sl_lock(self, fs_globals.fsg_open_files);
+	pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+
+	/* The list stores XTFilePtr values, so a find returns a pointer to a pointer. */
+	if ((file_pptr = (XTFilePtr *) xt_sl_find(self, fs_globals.fsg_open_files, file_name)))
+		file_ptr = *file_pptr;
+	else {
+		file_ptr = fs_new_file(self, file_name);
+		xt_sl_insert(self, fs_globals.fsg_open_files, file_name, &file_ptr);
+	}
+	file_ptr->fil_ref_count++;
+	freer_(); // xt_sl_unlock(fs_globals.fsg_open_files)
+	return file_ptr;
+}
+
+/*
+ * Drop one reference to a shared file record. When the last reference is
+ * gone, the record is deleted from the open-file list, whose free callback
+ * (fs_free_file()) closes the descriptor and frees the record.
+ */
+xtPublic void xt_fs_release_file(XTThreadPtr self, XTFilePtr file_ptr)
+{
+	xt_sl_lock(self, fs_globals.fsg_open_files);
+	pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+
+	file_ptr->fil_ref_count--;
+	if (!file_ptr->fil_ref_count) {
+		xt_sl_delete(self, fs_globals.fsg_open_files, file_ptr->fil_path);
+	}
+
+	freer_(); // xt_sl_unlock(fs_globals.fsg_open_files)
+}
+
+/*
+ * Open the file described by 'file' and store the descriptor in '*fd'.
+ *
+ * 'mode' is a combination of XT_FS_* flags:
+ *   XT_FS_READONLY    open for reading only (default is read/write)
+ *   XT_FS_CREATE      create the file if it does not exist
+ *   XT_FS_EXCLUSIVE   fail if the file already exists
+ *   XT_FS_DIRECT_IO   use O_DIRECT, where the platform defines it
+ *   XT_FS_MISSING_OK  a missing file is not an error
+ *   XT_FS_MAKE_PATH   create the directory path and retry once if the
+ *                     open fails because something is missing
+ *
+ * Returns OK on success. Returns FAILED without throwing if the file is
+ * missing and XT_FS_MISSING_OK is set. All other failures are thrown.
+ */
+static xtBool fs_open_file(XTThreadPtr self, XT_FD *fd, XTFilePtr file, int mode)
+{
+	int retried = FALSE;
+
+#ifdef DEBUG_TRACE_FILES
+	PRINTF("%s: OPEN FILE: (%d) %s\n", self->t_name, (int) file->fil_id, xt_last_2_names_of_path(file->fil_path));
+#endif
+	retry:
+#ifdef XT_WIN
+	SECURITY_ATTRIBUTES	sa = { sizeof(SECURITY_ATTRIBUTES), 0, 0 };
+	DWORD				flags;
+
+	/* Translate the mode into a Windows creation disposition. */
+	if (mode & XT_FS_EXCLUSIVE)
+		flags = CREATE_NEW;
+	else if (mode & XT_FS_CREATE)
+		flags = OPEN_ALWAYS;
+	else
+		flags = OPEN_EXISTING;
+
+	*fd = CreateFile(
+		file->fil_path,
+		mode & XT_FS_READONLY ? GENERIC_READ : (GENERIC_READ | GENERIC_WRITE),
+		FILE_SHARE_READ | FILE_SHARE_WRITE,
+		&sa,
+		flags,
+		FILE_FLAG_RANDOM_ACCESS,
+		NULL);
+	if (*fd == INVALID_HANDLE_VALUE) {
+		int err = fs_get_win_error();
+
+		if (!(mode & XT_FS_MISSING_OK) || !XT_FILE_NOT_FOUND(err)) {
+			/* Create the missing directories and try again, but only once. */
+			if (!retried && (mode & XT_FS_MAKE_PATH) && XT_FILE_NOT_FOUND(err)) {
+				char path[PATH_MAX];
+
+				xt_strcpy(PATH_MAX, path, file->fil_path);
+				xt_remove_last_name_of_path(path);
+				xt_fs_mkpath(self, path);
+				retried = TRUE;
+				goto retry;
+			}
+
+			xt_throw_ferrno(XT_CONTEXT, err, file->fil_path);
+		}
+
+		/* File is missing, but don't throw an error. */
+		return FAILED;
+	}
+	//PRINTF("open %d %s\n", *fd, file->fil_path);
+	return OK;
+#else
+	int flags = 0;
+
+	/* Translate the mode into open() flags. */
+	if (mode & XT_FS_READONLY)
+		flags = O_RDONLY;
+	else
+		flags = O_RDWR;
+	if (mode & XT_FS_CREATE)
+		flags |= O_CREAT;
+	if (mode & XT_FS_EXCLUSIVE)
+		flags |= O_EXCL;
+#ifdef O_DIRECT
+	if (mode & XT_FS_DIRECT_IO)
+		flags |= O_DIRECT;
+#endif
+
+	*fd = open(file->fil_path, flags, XT_MASK);
+	if (*fd == -1) {
+		int err = errno;
+
+		if (!(mode & XT_FS_MISSING_OK) || err != ENOENT) {
+			/* Create the missing directories and try again, but only once. */
+			if (!retried && (mode & XT_FS_MAKE_PATH) && err == ENOENT) {
+				char path[PATH_MAX];
+
+				xt_strcpy(PATH_MAX, path, file->fil_path);
+				xt_remove_last_name_of_path(path);
+				xt_fs_mkpath(self, path);
+				retried = TRUE;
+				goto retry;
+			}
+
+			xt_throw_ferrno(XT_CONTEXT, err, file->fil_path);
+		}
+
+		/* File is missing, but don't throw an error. */
+		return FAILED;
+	}
+	///PRINTF("open %d %s\n", *fd, file->fil_path);
+	return OK;
+#endif
+}
+
+/*
+ * Open the file 'file' and return a new open-file handle for it.
+ * 'mode' is a combination of XT_FS_* flags (see fs_open_file()).
+ *
+ * Returns NULL, without throwing, if fs_open_file() reports that the file
+ * could not be opened but this is acceptable (missing file with
+ * XT_FS_MISSING_OK). All other errors are thrown.
+ *
+ * On Windows every handle gets its own descriptor. On other systems the
+ * descriptor is opened once per file record and shared by all handles.
+ */
+xtPublic XTOpenFilePtr xt_open_file(XTThreadPtr self, char *file, int mode)
+{
+	XTOpenFilePtr	of;
+
+	pushsr_(of, xt_close_file, (XTOpenFilePtr) xt_calloc(self, sizeof(XTOpenFileRec)));
+	/* The record was zero-filled, and 0 is a valid descriptor. Mark the
+	 * handle as "not open" BEFORE calling anything that may throw, otherwise
+	 * the xt_close_file() cleanup would close descriptor 0.
+	 */
+	of->of_filedes = XT_NULL_FD;
+	of->fr_file = xt_fs_get_file(self, file);
+	of->fr_id = of->fr_file->fil_id;
+
+#ifdef XT_WIN
+	if (!fs_open_file(self, &of->of_filedes, of->fr_file, mode)) {
+		xt_close_file(self, of);
+		of = NULL;
+	}
+#else
+	xtBool failed = FALSE;
+
+	if (of->fr_file->fil_filedes == -1) {
+		xt_sl_lock(self, fs_globals.fsg_open_files);
+		pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+		/* Check again under the lock: another thread may have opened the file meanwhile. */
+		if (of->fr_file->fil_filedes == -1) {
+			if (!fs_open_file(self, &of->fr_file->fil_filedes, of->fr_file, mode))
+				failed = TRUE;
+		}
+		freer_(); // xt_sl_unlock(fs_globals.fsg_open_files)
+	}
+
+	if (failed) {
+		/* Close, but only after we have released the fsg_open_files lock! */
+		xt_close_file(self, of);
+		of = NULL;
+	}
+	else
+		of->of_filedes = of->fr_file->fil_filedes;
+#endif
+
+	popr_(); // Discard xt_close_file(of)
+	return of;
+}
+
+/*
+ * Non-throwing variant of xt_open_file(): returns the open-file handle,
+ * or NULL if the open failed for any reason (an exception raised by
+ * xt_open_file() is caught here).
+ */
+xtPublic XTOpenFilePtr xt_open_file_ns(char *file, int mode)
+{
+	XTThreadPtr		self = xt_get_self();
+	XTOpenFilePtr	of;
+
+	try_(a) {
+		of = xt_open_file(self, file, mode);
+	}
+	catch_(a) {
+		of = NULL;
+	}
+	cont_(a);
+	return of;
+}
+
+/*
+ * Non-throwing variant of xt_open_file() that separates errors from a
+ * missing file: returns FALSE if xt_open_file() threw, TRUE otherwise.
+ * On TRUE, '*fh' holds the result of xt_open_file(), which may be NULL
+ * (e.g. missing file with XT_FS_MISSING_OK).
+ * On FALSE, '*fh' is not assigned.
+ */
+xtPublic xtBool xt_open_file_ns(XTOpenFilePtr *fh, char *file, int mode)
+{
+	XTThreadPtr		self = xt_get_self();
+	xtBool			ok = TRUE;
+
+	try_(a) {
+		*fh = xt_open_file(self, file, mode);
+	}
+	catch_(a) {
+		ok = FALSE;
+	}
+	cont_(a);
+	return ok;
+}
+
+/*
+ * Close an open-file handle and free it.
+ * On Windows the handle's own descriptor is closed. On other systems the
+ * descriptor is closed here only if it is not the shared descriptor of the
+ * file record (that one is closed when the record itself is freed).
+ * The reference to the file record, if any, is released.
+ * This function is also used as cleanup handler by xt_open_file(), so it
+ * must cope with a partially initialised handle (of->fr_file == NULL).
+ */
+xtPublic void xt_close_file(XTThreadPtr self, XTOpenFilePtr of)
+{
+	if (of->of_filedes != XT_NULL_FD) {
+#ifdef XT_WIN
+		CloseHandle(of->of_filedes);
+#ifdef DEBUG_TRACE_FILES
+		/* The file record may be missing: do not dereference it blindly. */
+		if (of->fr_file)
+			PRINTF("%s: close file: (%d) %s\n", self->t_name, (int) of->fr_file->fil_id, xt_last_2_names_of_path(of->fr_file->fil_path));
+#endif
+#else
+		if (!of->fr_file || of->of_filedes != of->fr_file->fil_filedes) {
+			close(of->of_filedes);
+#ifdef DEBUG_TRACE_FILES
+			/* This branch is also taken when fr_file is NULL. */
+			if (of->fr_file)
+				PRINTF("%s: close file: (%d) %s\n", self->t_name, (int) of->fr_file->fil_id, xt_last_2_names_of_path(of->fr_file->fil_path));
+#endif
+		}
+#endif
+
+		of->of_filedes = XT_NULL_FD;
+	}
+
+	if (of->fr_file) {
+		xt_fs_release_file(self, of->fr_file);
+		of->fr_file = NULL;
+	}
+	xt_free(self, of);
+}
+
+/*
+ * Non-throwing variant of xt_close_file().
+ * NOTE: the return value is the 'failed' flag, i.e. TRUE if closing threw
+ * an exception and FALSE on success. This is the reverse of the OK/FAILED
+ * convention used by most other functions in this file.
+ */
+xtPublic xtBool xt_close_file_ns(XTOpenFilePtr of)
+{
+	XTThreadPtr self = xt_get_self();
+	xtBool		failed = FALSE;
+
+	try_(a) {
+		xt_close_file(self, of);
+	}
+	catch_(a) {
+		failed = TRUE;
+	}
+	cont_(a);
+	return failed;
+}
+
+/* ----------------------------------------------------------------------
+ * I/O operations
+ */
+
+/*
+ * Try to lock the file without blocking.
+ * Returns OK if the lock was obtained and FAILED, without throwing, if the
+ * lock is held by someone else. Any other error is thrown.
+ */
+xtPublic xtBool xt_lock_file(struct XTThread *self, XTOpenFilePtr of)
+{
+#ifdef XT_WIN
+	int err;
+
+	if (LockFile(of->of_filedes, 0, 0, 512, 0))
+		return OK;
+
+	err = fs_get_win_error();
+	/* Only a lock conflict is reported silently. */
+	if (err != ERROR_LOCK_VIOLATION && err != ERROR_LOCK_FAILED)
+		xt_throw_ferrno(XT_CONTEXT, err, xt_file_path(of));
+	return FAILED;
+#else
+	if (lockf(of->of_filedes, F_TLOCK, 0) != 0) {
+		/* Only a lock conflict (EAGAIN) is reported silently. */
+		if (errno != EAGAIN)
+			xt_throw_ferrno(XT_CONTEXT, errno, xt_file_path(of));
+		return FAILED;
+	}
+	return OK;
+#endif
+}
+
+/*
+ * Release the lock taken by xt_lock_file(). Throws on failure, except that
+ * on Windows unlocking a file that is not locked is ignored.
+ */
+xtPublic void xt_unlock_file(struct XTThread *self, XTOpenFilePtr of)
+{
+#ifdef XT_WIN
+	if (UnlockFile(of->of_filedes, 0, 0, 512, 0))
+		return;
+
+	int err = fs_get_win_error();
+
+	if (err != ERROR_NOT_LOCKED)
+		xt_throw_ferrno(XT_CONTEXT, err, xt_file_path(of));
+#else
+	int unlock_result = lockf(of->of_filedes, F_ULOCK, 0);
+
+	if (unlock_result == -1)
+		xt_throw_ferrno(XT_CONTEXT, errno, xt_file_path(of));
+#endif
+}
+
+/*
+ * Position descriptor 'fd' at the end of the file and return the end
+ * offset (i.e. the file size). 'file' is used only for the path in error
+ * reports. Throws on failure.
+ */
+static off_t fs_seek_eof(XTThreadPtr self, XT_FD fd, XTFilePtr file)
+{
+#ifdef XT_WIN
+	DWORD			result;
+	LARGE_INTEGER	lpFileSize;
+
+	result = SetFilePointer(fd, 0, NULL, FILE_END);
+	if (result == 0xFFFFFFFF) {
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), file->fil_path);
+		return (off_t) -1;
+	}
+
+	/* The 64-bit size is queried separately. */
+	if (!GetFileSizeEx(fd, &lpFileSize)) {
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), file->fil_path);
+		return (off_t) -1;
+	}
+
+	return lpFileSize.QuadPart;
+#else
+	off_t off;
+
+	off = lseek(fd, 0, SEEK_END);
+	if (off == -1) {
+		xt_throw_ferrno(XT_CONTEXT, errno, file->fil_path);
+		return -1;
+	}
+
+     return off;
+#endif
+}
+
+/* Seek to the end of the open file 'of' and return the end offset. */
+xtPublic off_t xt_seek_eof_file(XTThreadPtr self, XTOpenFilePtr of)
+{
+	off_t eof_offset = fs_seek_eof(self, of->of_filedes, of->fr_file);
+
+	return eof_offset;
+}
+
+/*
+ * Set the size of the open file 'of' to 'offset' bytes.
+ * Throws on failure.
+ */
+xtPublic xtBool xt_set_eof_file(XTThreadPtr self, XTOpenFilePtr of, off_t offset)
+{
+#ifdef XT_WIN
+	LARGE_INTEGER liDistanceToMove;
+	
+	/* Windows sets the end of file at the current position: seek there first. */
+	liDistanceToMove.QuadPart = offset;
+	if (!SetFilePointerEx(of->of_filedes, liDistanceToMove, NULL, FILE_BEGIN)) {
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), xt_file_path(of));
+		return FAILED;
+	}
+
+	if (!SetEndOfFile(of->of_filedes)) {
+		xt_throw_ferrno(XT_CONTEXT, fs_get_win_error(), xt_file_path(of));
+		return FAILED;
+	}
+#else
+	if (ftruncate(of->of_filedes, offset) == -1) {
+		xt_throw_ferrno(XT_CONTEXT, errno, xt_file_path(of));
+		return FAILED;
+	}
+#endif
+	return OK;
+}
+
+/*
+ * Write 'size' bytes from 'data' to the file at 'offset'.
+ * Does not throw: on failure the error is registered with
+ * xt_register_ferrno() and its result is returned. A write that transfers
+ * fewer than 'size' bytes is treated as an error.
+ * On success the byte count is added to stat->ts_write and OK is returned.
+ */
+xtPublic xtBool xt_pwrite_file(XTOpenFilePtr of, off_t offset, size_t size, void *data, XTIOStatsPtr stat, XTThreadPtr XT_UNUSED(thread))
+{
+#ifdef DEBUG_PRINT_IO
+	PRINTF("PBXT WRITE %s offs=%d size=%d\n", of->fr_file->fil_path, (int) offset, (int) size);
+#endif
+#ifdef DEBUG_TRACE_IO
+	char	timef[50];
+	xtWord8	start = xt_trace_clock();
+#endif
+#ifdef XT_WIN
+	LARGE_INTEGER	liDistanceToMove;
+	DWORD			result;
+	
+	/* No positional write is used here: seek first, then write. */
+	liDistanceToMove.QuadPart = offset;
+	if (!SetFilePointerEx(of->of_filedes, liDistanceToMove, NULL, FILE_BEGIN))
+		return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(of));
+
+	if (!WriteFile(of->of_filedes, data, size, &result, NULL))
+		return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(of));
+
+	if (result != size)
+		return xt_register_ferrno(XT_REG_CONTEXT, ERROR_HANDLE_EOF, xt_file_path(of));
+#else
+	ssize_t write_size;
+
+	write_size = pwrite(of->of_filedes, data, size, offset);
+	if (write_size == -1)
+		return xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(of));
+
+	/* A short write does not set errno, so ESPIPE is used as a stand-in error code. */
+	if ((size_t) write_size != size)
+		return xt_register_ferrno(XT_REG_CONTEXT, ESPIPE, xt_file_path(of));
+
+#endif
+	stat->ts_write += (u_int) size;
+
+#ifdef DEBUG_TRACE_IO
+	xt_trace("/* %s */ pbxt_file_writ(\"%s\", %lu, %lu);\n", xt_trace_clock_diff(timef, start), of->fr_file->fil_path, (u_long) offset, (u_long) size);
+#endif
+	return OK;
+}
+
+/*
+ * Flush the file to disk.
+ * Does not throw: on failure the error is registered and FAILED is
+ * returned. The time spent is added to stat->ts_flush_time in both cases;
+ * stat->ts_flush is incremented only for successful flushes. While the
+ * flush is running, stat->ts_flush_start holds its start time.
+ */
+xtPublic xtBool xt_flush_file(XTOpenFilePtr of, XTIOStatsPtr stat, XTThreadPtr XT_UNUSED(thread))
+{
+	xtBool	flushed = OK;
+	xtWord8	began;
+
+#ifdef DEBUG_PRINT_IO
+	PRINTF("PBXT FLUSH %s\n", of->fr_file->fil_path);
+#endif
+#ifdef DEBUG_TRACE_IO
+	char	timef[50];
+	xtWord8	start = xt_trace_clock();
+#endif
+	stat->ts_flush_start = xt_trace_clock();
+#ifdef XT_WIN
+	if (!FlushFileBuffers(of->of_filedes)) {
+		xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(of));
+		flushed = FAILED;
+	}
+#else
+	/* Mac OS X has problems with fsync. We had several cases of index corruption presumably because
+	 * fsync didn't really flush index pages to disk. fcntl(F_FULLFSYNC) is considered more effective 
+	 * in such case.
+	 */
+#if defined(F_FULLFSYNC) && !defined(DEBUG_FAST_MAC)
+	if (fcntl(of->of_filedes, F_FULLFSYNC, 0) == -1)
+#else
+	if (fsync(of->of_filedes) == -1)
+#endif
+	{
+		xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(of));
+		flushed = FAILED;
+	}
+#endif
+#ifdef DEBUG_TRACE_IO
+	if (flushed == OK)
+		xt_trace("/* %s */ pbxt_file_sync(\"%s\");\n", xt_trace_clock_diff(timef, start), of->fr_file->fil_path);
+#endif
+	/* Account for the elapsed time whether or not the flush succeeded. */
+	began = stat->ts_flush_start;
+	stat->ts_flush_start = 0;
+	stat->ts_flush_time += xt_trace_clock() - began;
+	if (flushed == OK)
+		stat->ts_flush++;
+	return flushed;
+}
+
+/*
+ * Read up to 'size' bytes from the file at 'offset' into 'data'.
+ * Does not throw: on failure the error is registered with
+ * xt_register_ferrno() and its result is returned. Reading fewer than
+ * 'min_size' bytes is treated as an error.
+ * On success the number of bytes actually read is stored in '*red_size'
+ * (if 'red_size' is not NULL) and added to stat->ts_read.
+ */
+xtBool xt_pread_file(XTOpenFilePtr of, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, XTIOStatsPtr stat, XTThreadPtr XT_UNUSED(thread))
+{
+#ifdef DEBUG_PRINT_IO
+	PRINTF("PBXT READ %s offset=%d size=%d\n", of->fr_file->fil_path, (int) offset, (int) size);
+#endif
+#ifdef DEBUG_TRACE_IO
+	char	timef[50];
+	xtWord8	start = xt_trace_clock();
+#endif
+#ifdef XT_WIN
+	LARGE_INTEGER	liDistanceToMove;
+	DWORD			result;
+
+	/* No positional read is used here: seek first, then read. */
+	liDistanceToMove.QuadPart = offset;
+	if (!SetFilePointerEx(of->of_filedes, liDistanceToMove, NULL, FILE_BEGIN))
+		return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(of));
+
+	if (!ReadFile(of->of_filedes, data, size, &result, NULL))
+		return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(of));
+
+	if ((size_t) result < min_size)
+		return xt_register_ferrno(XT_REG_CONTEXT, ERROR_HANDLE_EOF, xt_file_path(of));
+
+	if (red_size)
+		*red_size = (size_t) result;
+	stat->ts_read += (u_int) result;
+#else
+	ssize_t read_size;
+
+	read_size = pread(of->of_filedes, data, size, offset);
+	if (read_size == -1)
+		return xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(of));
+
+	/* Register an error if read less than the minimum (ESPIPE is a stand-in error code): */
+	if ((size_t) read_size < min_size) {
+//PRINTF("PMC PBXT <-- offset:%llu, count:%lu \n", (u_llong) offset, (u_long) size);
+		return xt_register_ferrno(XT_REG_CONTEXT, ESPIPE, xt_file_path(of));
+	}
+
+	if (red_size)
+		*red_size = (size_t) read_size;
+	stat->ts_read += (u_int) read_size;
+#endif
+#ifdef DEBUG_TRACE_IO
+	xt_trace("/* %s */ pbxt_file_read(\"%s\", %lu, %lu);\n", xt_trace_clock_diff(timef, start), of->fr_file->fil_path, (u_long) offset, (u_long) size);
+#endif
+	return OK;
+}
+
+/*
+ * Provide a buffer holding 'size' bytes of the file, starting at 'offset'.
+ * If '*data' is NULL a buffer of 'size' bytes is allocated and returned in
+ * '*data'; otherwise the caller's buffer is filled. Release the buffer
+ * with xt_unlock_file_ptr().
+ *
+ * A read that ends before 'size' bytes (e.g. at the end of the file) is
+ * not an error; the remainder of the buffer is zero-filled so the caller
+ * never sees uninitialised memory.
+ *
+ * Returns OK on success, FAILED if the allocation or the read failed.
+ */
+xtPublic xtBool xt_lock_file_ptr(XTOpenFilePtr of, xtWord1 **data, off_t offset, size_t size, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	size_t red_size;
+
+	if (!*data) {
+		if (!(*data = (xtWord1 *) xt_malloc_ns(size)))
+			return FAILED;
+	}
+
+	/* A minimum size of 0 means that a short read is accepted. */
+	if (!xt_pread_file(of, offset, size, 0, *data, &red_size, stat, thread))
+		return FAILED;
+	
+	/* Clear the part of the buffer that the read did not reach. */
+	if (red_size < size)
+		memset(*data + red_size, 0, size - red_size);
+	return OK;
+}
+
+/* Free a buffer obtained from xt_lock_file_ptr(); NULL is ignored. */
+xtPublic void xt_unlock_file_ptr(XTOpenFilePtr XT_UNUSED(of), xtWord1 *data, XTThreadPtr XT_UNUSED(thread))
+{
+	if (!data)
+		return;
+
+	xt_free_ns(data);
+}
+
+/* ----------------------------------------------------------------------
+ * Directory operations
+ */
+
+/*
+ * Open the directory 'path' for iteration with xt_dir_next().
+ * The filter is optional (may be NULL) and may contain one '*' as wildcard.
+ * Close the returned handle with xt_dir_close().
+ * Throws if memory cannot be allocated or, on non-Windows systems, if the
+ * directory cannot be opened. On Windows the directory is not accessed
+ * here: only the search pattern is prepared.
+ */
+XTOpenDirPtr xt_dir_open(XTThreadPtr self, c_char *path, c_char *filter)
+{
+	XTOpenDirPtr	od;
+
+#ifdef XT_SOLARIS
+	/* see the comment in filesys_xt.h */
+	size_t sz = pathconf(path, _PC_NAME_MAX) + sizeof(XTOpenDirRec) + 1;
+#else
+	size_t sz = sizeof(XTOpenDirRec);
+#endif
+	/* If anything below throws, xt_dir_close() cleans up the partly built handle. */
+	pushsr_(od, xt_dir_close, (XTOpenDirPtr) xt_calloc(self, sz));
+
+#ifdef XT_WIN
+	size_t			len;
+
+	od->od_handle = XT_NULL_FD;
+
+	// path = path\(filter | *)
+	len = strlen(path) + 1 + (filter ? strlen(filter) : 1) + 1;
+	od->od_path = (char *) xt_malloc(self, len);
+
+	strcpy(od->od_path, path);
+	xt_add_dir_char(len, od->od_path);
+	if (filter)
+		strcat(od->od_path, filter);
+	else
+		strcat(od->od_path, "*");
+#else
+	od->od_path = xt_dup_string(self, path);
+
+	if (filter)
+		od->od_filter = xt_dup_string(self, filter);
+
+	od->od_dir = opendir(path);
+	if (!od->od_dir)
+		xt_throw_ferrno(XT_CONTEXT, errno, path);
+#endif
+	popr_(); // Discard xt_dir_close(od)
+	return od;
+}
+
+/*
+ * Dispose of a directory iterator created by xt_dir_open().
+ * Closes the platform search/directory handle, frees the strings owned by
+ * the iterator and finally the iterator itself. A NULL iterator is ignored.
+ */
+void xt_dir_close(XTThreadPtr self, XTOpenDirPtr od)
+{
+	if (!od)
+		return;
+
+#ifdef XT_WIN
+	if (od->od_handle != XT_NULL_FD) {
+		FindClose(od->od_handle);
+		od->od_handle = XT_NULL_FD;
+	}
+#else
+	if (od->od_dir) {
+		closedir(od->od_dir);
+		od->od_dir = NULL;
+	}
+	if (od->od_filter) {
+		xt_free(self, od->od_filter);
+		od->od_filter = NULL;
+	}
+#endif
+
+	if (od->od_path) {
+		xt_free(self, od->od_path);
+		od->od_path = NULL;
+	}
+
+	xt_free(self, od);
+}
+
+#ifdef XT_WIN
+/*
+ * Windows: advance the iterator to the next matching entry.
+ *
+ * The first call starts the search with FindFirstFile(), later calls use
+ * FindNextFile(). Returns OK when an entry is available and FAILED when
+ * there are no (more) entries. "File not found" is only thrown if the
+ * directory itself does not exist; any other error is thrown.
+ */
+xtBool xt_dir_next(XTThreadPtr self, XTOpenDirPtr od)
+{
+	int err = 0;
+
+	if (od->od_handle != INVALID_HANDLE_VALUE) {
+		if (!FindNextFile(od->od_handle, &od->od_data))
+			err = fs_get_win_error();
+	}
+	else {
+		od->od_handle = FindFirstFile(od->od_path, &od->od_data);
+		if (od->od_handle == INVALID_HANDLE_VALUE)
+			err = fs_get_win_error();
+	}
+
+	if (!err)
+		return OK;
+
+	if (err == ERROR_FILE_NOT_FOUND) {
+		/* Nothing matched the pattern: only an error if the directory is missing. */
+		char dir_path[PATH_MAX];
+
+		xt_strcpy(PATH_MAX, dir_path, od->od_path);
+		xt_remove_last_name_of_path(dir_path);
+		if (!xt_fs_exists(dir_path))
+			xt_throw_ferrno(XT_CONTEXT, err, dir_path);
+	}
+	else if (err != ERROR_NO_MORE_FILES)
+		xt_throw_ferrno(XT_CONTEXT, err, od->od_path);
+
+	return FAILED;
+}
+#else
+/*
+ * Match a file name against a filter in which '*' stands for any sequence
+ * of characters (including none). All other characters must match exactly.
+ *
+ * The wildcard backtracks: if the text following '*' matches too early in
+ * the name and the rest then fails, the match is retried one character
+ * further on. So "a.b.xt" matches "*.xt".
+ *
+ * Returns TRUE if the whole name matches the whole filter, otherwise FALSE.
+ */
+static xtBool fs_match_filter(c_char *name, c_char *filter)
+{
+	c_char *star = NULL;	/* Position of the most recent '*' in the filter. */
+	c_char *retry = NULL;	/* Position in the name from which that '*' is retried. */
+
+	while (*name) {
+		if (*filter == '*') {
+			/* Start by letting the wildcard match nothing. */
+			star = filter++;
+			retry = name;
+		}
+		else if (*filter == *name) {
+			filter++;
+			name++;
+		}
+		else if (star) {
+			/* Mismatch after a wildcard: let the wildcard swallow one more character. */
+			filter = star + 1;
+			name = ++retry;
+		}
+		else
+			return FALSE;
+	}
+
+	/* The name is used up: only trailing wildcards may remain in the filter. */
+	while (*filter == '*')
+		filter++;
+	return *filter ? FALSE : TRUE;
+}
+
+/*
+ * POSIX: advance the iterator to the next entry that is neither "." nor ".."
+ * and that matches the optional filter.
+ *
+ * Returns TRUE when od->od_entry holds such an entry, FALSE at the end of
+ * the directory. A readdir_r() error is thrown.
+ */
+xtBool xt_dir_next(XTThreadPtr self, XTOpenDirPtr od)
+{
+	struct dirent	*entry;
+	int				rc;
+	char			*name;
+
+	for (;;) {
+		rc = readdir_r(od->od_dir, &od->od_entry, &entry);
+		if (rc) {
+			xt_throw_ferrno(XT_CONTEXT, rc, od->od_path);
+			return FAILED;
+		}
+
+		/* End of directory: */
+		if (!entry)
+			return FALSE;
+
+		/* Filter out '.' and '..': */
+		name = od->od_entry.d_name;
+		if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')))
+			continue;
+
+		if (!od->od_filter || fs_match_filter(name, od->od_filter))
+			return TRUE;
+	}
+}
+#endif
+
+/*
+ * Return the name of the entry the iterator is currently positioned on.
+ * The string lives inside the iterator; it is not a copy.
+ */
+char *xt_dir_name(XTThreadPtr XT_UNUSED(self), XTOpenDirPtr od)
+{
+	char *entry_name;
+
+#ifdef XT_WIN
+	entry_name = od->od_data.cFileName;
+#else
+	entry_name = od->od_entry.d_name;
+#endif
+	return entry_name;
+}
+
+/*
+ * Return FALSE if the current entry of the iterator is a directory,
+ * otherwise TRUE.
+ *
+ * Windows:	decided by the FILE_ATTRIBUTE_DIRECTORY attribute.
+ * Solaris:	struct dirent has no d_type here, so the entry is stat()'ed.
+ * Others:	decided by d_type. File systems that do not fill in d_type
+ *			report DT_UNKNOWN; in that case the entry is stat()'ed as well.
+ *
+ * d_type and the S_IFMT part of st_mode are enumerations, not bit masks,
+ * so they are compared for equality (DT_DIR / S_ISDIR()). A bitwise test
+ * would also classify e.g. block devices and sockets as directories.
+ *
+ * A stat() failure is thrown.
+ */
+xtBool xt_dir_is_file(XTThreadPtr self, XTOpenDirPtr od)
+{
+	(void) self;
+#ifdef XT_WIN
+	if (od->od_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
+		return FALSE;
+#else
+#ifndef XT_SOLARIS
+	/* Fast path: the file system told us the type of the entry. */
+	if (od->od_entry.d_type != DT_UNKNOWN)
+		return od->od_entry.d_type == DT_DIR ? FALSE : TRUE;
+#endif
+	{
+		char		path[PATH_MAX];
+		struct stat	sb;
+
+		xt_strcpy(PATH_MAX, path, od->od_path);
+		xt_add_dir_char(PATH_MAX, path);
+		xt_strcat(PATH_MAX, path, od->od_entry.d_name);
+
+		if (stat(path, &sb) == -1) {
+			xt_throw_ferrno(XT_CONTEXT, errno, path);
+			return FAILED;
+		}
+
+		if (S_ISDIR(sb.st_mode))
+			return FALSE;
+	}
+#endif
+	return TRUE;
+}
+
+/*
+ * Return the size in bytes of the entry the iterator is positioned on.
+ *
+ * Windows:	taken from the find data of the current entry.
+ * Others:	the full path is built from the directory path and the entry
+ *			name and passed to xt_fs_stat(); -1 is returned if that fails.
+ */
+off_t xt_dir_file_size(XTThreadPtr self, XTOpenDirPtr od)
+{
+#ifdef XT_WIN
+	off_t low_part = (off_t) od->od_data.nFileSizeLow;
+	off_t high_part = (off_t) od->od_data.nFileSizeHigh;
+
+	return low_part | (high_part << 32);
+#else
+	char	full_path[PATH_MAX];
+	off_t	file_size;
+
+	xt_strcpy(PATH_MAX, full_path, od->od_path);
+	xt_add_dir_char(PATH_MAX, full_path);
+	xt_strcat(PATH_MAX, full_path, od->od_entry.d_name);
+
+	if (xt_fs_stat(self, full_path, &file_size, NULL))
+		return file_size;
+	return -1;
+#endif
+}
+
+/* ----------------------------------------------------------------------
+ * File mapping operations
+ */
+
+/*
+ * Map mm->mm_length bytes of the file, starting at offset 0, into memory
+ * and store the address in mm->mm_start.
+ *
+ * mm		The map descriptor. mm_start must be NULL on entry (asserted).
+ * file		The file to map; file->fil_filedes is used.
+ * grow		Unix only: if TRUE the file is first extended to mm_length bytes.
+ *			On Windows CreateFileMapping() grows the file itself.
+ *
+ * Returns OK on success. On failure an error is registered, mm_start is
+ * NULL (and on Windows mm_mapdes is NULL) and FAILED is returned.
+ */
+static xtBool fs_map_file(XTFileMemMapPtr mm, XTFilePtr file, xtBool grow)
+{
+#ifdef INJECT_WRITE_REMAP_ERROR
+	if (xt_is_extension(file->fil_path, INJECT_REMAP_FILE_TYPE)) {
+		if (mm->mm_length > INJECT_REMAP_FILE_SIZE) {
+			xt_register_ferrno(XT_REG_CONTEXT, 30, file->fil_path);
+			return FAILED;
+		}
+	}
+#endif
+
+	ASSERT_NS(!mm->mm_start);
+#ifdef XT_WIN
+	/* This will grow the file to the given size: */
+	mm->mm_mapdes = CreateFileMapping(file->fil_filedes, NULL, PAGE_READWRITE, (DWORD) (mm->mm_length >> 32), (DWORD) mm->mm_length, NULL);
+	if (mm->mm_mapdes == NULL) {
+		xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), file->fil_path);
+		return FAILED;
+	}
+
+	mm->mm_start = (xtWord1 *) MapViewOfFile(mm->mm_mapdes, FILE_MAP_WRITE, 0, 0, 0);
+	if (!mm->mm_start) {
+		CloseHandle(mm->mm_mapdes);
+		mm->mm_mapdes = NULL;
+		xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), file->fil_path);
+		return FAILED;
+	}
+#else
+	if (grow) {
+		/* Extend the file by writing one byte at the new last position.
+		 * The byte must have a defined value: write zero, not whatever
+		 * happens to be on the stack.
+		 */
+		char zero = 0;
+
+		if (pwrite(file->fil_filedes, &zero, 1, mm->mm_length - 1) == -1) {
+			xt_register_ferrno(XT_REG_CONTEXT, errno, file->fil_path);
+			return FAILED;
+		}
+	}
+
+	/* Remap: */
+	mm->mm_start = (xtWord1 *) mmap(0, (size_t) mm->mm_length, PROT_READ | PROT_WRITE, MAP_SHARED, file->fil_filedes, 0);
+	if (mm->mm_start == MAP_FAILED) {
+		mm->mm_start = NULL;
+		xt_register_ferrno(XT_REG_CONTEXT, errno, file->fil_path);
+		return FAILED;
+	}
+#endif
+	return OK;
+}
+
+/*
+ * Open a handle on a memory mapped file.
+ *
+ * file			Path of the file to map.
+ * grow_size	Minimum length of the map. It is also stored as mm_grow_size,
+ *				the increment fs_remap_file() uses when it extends the map.
+ *
+ * The shared file descriptor and the shared map (fil_memmap) are created by
+ * the first handle opened on a file; later handles reuse them. Returns the
+ * new handle, to be closed with xt_close_fmap(). Errors are thrown.
+ */
+xtPublic XTMapFilePtr xt_open_fmap(XTThreadPtr self, char *file, size_t grow_size)
+{
+	XTMapFilePtr	map;
+
+	pushsr_(map, xt_close_fmap, (XTMapFilePtr) xt_calloc(self, sizeof(XTMapFileRec)));
+	map->fr_file = xt_fs_get_file(self, file);
+	map->fr_id = map->fr_file->fil_id;
+
+	xt_sl_lock(self, fs_globals.fsg_open_files);
+	pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+
+	/* Open the shared descriptor if this is the first handle on the file. */
+	if (map->fr_file->fil_filedes == XT_NULL_FD) {
+		/* NOTE(review): if fs_open_file() returns FALSE, map is set to NULL here
+		 * and then dereferenced by the fil_handle_count++ below. Presumably
+		 * fs_open_file() throws instead of returning FALSE in this mode -- confirm.
+		 */
+		if (!fs_open_file(self, &map->fr_file->fil_filedes, map->fr_file, XT_FS_DEFAULT)) {
+			xt_close_fmap(self, map);
+			map = NULL;
+		}
+	}
+
+	map->fr_file->fil_handle_count++;
+
+	freer_(); // xt_sl_unlock(fs_globals.fsg_open_files)
+
+	/* Checked once without the lock and again with it held. */
+	if (!map->fr_file->fil_memmap) {
+		xt_sl_lock(self, fs_globals.fsg_open_files);
+		pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+		if (!map->fr_file->fil_memmap) {
+			XTFileMemMapPtr mm;
+
+			mm = (XTFileMemMapPtr) xt_calloc(self, sizeof(XTFileMemMapRec));
+			pushr_(fs_close_fmap, mm);
+
+#ifdef XT_WIN
+			/* NULL is the value returned on error! */
+			mm->mm_mapdes = NULL;
+#endif
+			FILE_MAP_INIT_LOCK(self, &mm->mm_lock);
+			mm->mm_length = fs_seek_eof(self, map->fr_file->fil_filedes, map->fr_file);
+			/* With a 32-bit size_t the whole file cannot be mapped if it is 4GB or more. */
+			if (sizeof(size_t) == 4 && mm->mm_length >= (off_t) 0xFFFFFFFF)
+				xt_throw_ixterr(XT_CONTEXT, XT_ERR_FILE_TOO_LONG, map->fr_file->fil_path);
+			mm->mm_grow_size = grow_size;
+
+			/* A file shorter than grow_size is extended to grow_size when it is mapped. */
+			if (mm->mm_length < (off_t) grow_size) {
+				mm->mm_length = (off_t) grow_size;
+				if (!fs_map_file(mm, map->fr_file, TRUE))
+					xt_throw(self);
+			}
+			else {
+				if (!fs_map_file(mm, map->fr_file, FALSE))
+					xt_throw(self);
+			}
+
+			popr_(); // Discard fs_close_fmap(mm)
+			map->fr_file->fil_memmap = mm;
+		}
+		freer_(); // xt_sl_unlock(fs_globals.fsg_open_files)
+	}
+	map->mf_memmap = map->fr_file->fil_memmap;
+
+	popr_(); // Discard xt_close_fmap(map)
+	return map;
+}
+
+/*
+ * Close a handle returned by xt_open_fmap() and free it.
+ *
+ * If the handle references a file, the handle count of the file is
+ * decremented under the fsg_open_files lock; when it reaches zero the
+ * shared map is closed as well. The file reference is then released.
+ * The handle must not hold a pointer lock (asserted).
+ */
+xtPublic void xt_close_fmap(XTThreadPtr self, XTMapFilePtr map)
+{
+	XTFilePtr shared_file = map->fr_file;
+
+	ASSERT_NS(!map->mf_slock_count);
+	if (shared_file) {
+		xt_sl_lock(self, fs_globals.fsg_open_files);
+		pushr_(xt_sl_unlock, fs_globals.fsg_open_files);
+		shared_file->fil_handle_count--;
+		if (shared_file->fil_handle_count == 0) {
+			/* Last handle gone: drop the shared memory map. */
+			fs_close_fmap(self, shared_file->fil_memmap);
+			shared_file->fil_memmap = NULL;
+		}
+		freer_();
+
+		xt_fs_release_file(self, shared_file);
+		map->fr_file = NULL;
+	}
+	map->mf_memmap = NULL;
+	xt_free(self, map);
+}
+
+/*
+ * Non-throwing variant of xt_close_fmap(): uses the current thread
+ * (xt_get_self()) and catches anything xt_close_fmap() throws.
+ *
+ * Returns TRUE if the close threw, FALSE if it completed.
+ * NOTE(review): this is TRUE-on-failure, unlike the OK/FAILED results
+ * returned by the other functions in this file -- confirm that callers
+ * expect this.
+ */
+xtPublic xtBool xt_close_fmap_ns(XTMapFilePtr map)
+{
+	XTThreadPtr self = xt_get_self();
+	xtBool		failed = FALSE;
+
+	try_(a) {
+		xt_close_fmap(self, map);
+	}
+	catch_(a) {
+		failed = TRUE;
+	}
+	cont_(a);
+	return failed;
+}
+
+/*
+ * Make sure the map is established and covers [offset, offset + size).
+ *
+ * Two cases lead to a (re)map:
+ * - the requested range ends beyond mm_length: the map is extended to the
+ *   next multiple of mm_grow_size that covers the range,
+ * - the map is currently not established (mm_start is NULL, e.g. after a
+ *   failed remap): it is re-established with its current length.
+ *
+ * An existing mapping is flushed and unmapped first; the time this takes is
+ * added to the flush statistics in 'stat'.
+ *
+ * The callers in this file hold the map lock in write mode when they call
+ * this function.
+ *
+ * Returns OK if nothing had to be done or the new mapping is in place.
+ * On failure an error is registered and FAILED is returned; mm_start may
+ * then be NULL.
+ */
+static xtBool fs_remap_file(XTMapFilePtr map, off_t offset, size_t size, XTIOStatsPtr stat)
+{
+	off_t			new_size = 0;
+	XTFileMemMapPtr	mm = map->mf_memmap;
+	xtWord8			s;
+
+	if (offset + (off_t) size > mm->mm_length) {
+		/* Expand the file: */
+		new_size = (mm->mm_length + (off_t) mm->mm_grow_size) / (off_t) mm->mm_grow_size;
+		new_size *= mm->mm_grow_size;
+		while (new_size < offset + (off_t) size)
+			new_size += mm->mm_grow_size;
+
+		if (sizeof(size_t) == 4 && new_size >= (off_t) 0xFFFFFFFF) {
+			xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_FILE_TOO_LONG, xt_file_path(map));
+			return FAILED;
+		}
+	}
+	else if (!mm->mm_start)
+		new_size = mm->mm_length;
+
+	if (new_size) {
+		if (mm->mm_start) {
+			/* Flush & unmap: */
+			stat->ts_flush_start = xt_trace_clock();
+#ifdef XT_WIN
+			if (!FlushViewOfFile(mm->mm_start, 0)) {
+				xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(map));
+				goto failed;
+			}
+
+			if (!UnmapViewOfFile(mm->mm_start)) {
+				xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(map));
+				goto failed;
+			}
+#else
+			if (msync( (char *)mm->mm_start, (size_t) mm->mm_length, MS_SYNC) == -1) {
+				xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(map));
+				goto failed;
+			}
+
+			/* Unmap: */
+			if (munmap((caddr_t) mm->mm_start, (size_t) mm->mm_length) == -1) {
+				xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(map));
+				goto failed;
+			}
+#endif
+			s = stat->ts_flush_start;
+			stat->ts_flush_start = 0;
+			stat->ts_flush_time += xt_trace_clock() - s;
+			stat->ts_flush++;
+		}
+		mm->mm_start = NULL;
+#ifdef XT_WIN
+		/* It is possible that a previous remap attempt has failed: the map was closed
+		 * but the new map was not allocated (e.g. because of insufficient disk space).
+		 * In this case mm->mm_mapdes will be NULL.
+		 */
+		if (mm->mm_mapdes && !CloseHandle(mm->mm_mapdes))
+			return xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(map));
+		mm->mm_mapdes = NULL;
+#endif
+		off_t old_size = mm->mm_length;
+		mm->mm_length = new_size;
+
+		/* Only ask fs_map_file() to grow the file if the map really gets
+		 * longer. When a lost mapping is re-established with unchanged
+		 * length, growing would write a byte over the last byte of the
+		 * existing file data.
+		 */
+		if (!fs_map_file(mm, map->fr_file, new_size > old_size ? TRUE : FALSE)) {
+			/* Try to restore old mapping */
+			mm->mm_length = old_size;
+			fs_map_file(mm, map->fr_file, FALSE);
+			return FAILED;
+		}
+	}
+	return OK;
+
+	failed:
+	/* The flush/unmap failed: close the timing interval without counting a flush. */
+	s = stat->ts_flush_start;
+	stat->ts_flush_start = 0;
+	stat->ts_flush_time += xt_trace_clock() - s;
+	return FAILED;
+}
+
+/*
+ * Copy 'size' bytes from 'data' into the memory map at 'offset'.
+ *
+ * If the map is not established, or the range ends beyond the current map
+ * length, the read lock is exchanged for a write lock and the map is
+ * (re)mapped/extended with fs_remap_file() before the copy.
+ *
+ * The handle must not hold a pointer lock from xt_lock_fmap_ptr() (asserted).
+ *
+ * Returns OK and adds 'size' to stat->ts_write on success; returns FAILED
+ * with an error registered otherwise.
+ */
+xtPublic xtBool xt_pwrite_fmap(XTMapFilePtr map, off_t offset, size_t size, void *data, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	XTFileMemMapPtr mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
+	xtThreadID		thd_id = thread->t_id;
+#endif
+
+#ifdef DEBUG_TRACE_MAP_IO
+	xt_trace("/* %s */ pbxt_fmap_writ(\"%s\", %lu, %lu);\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path, (u_long) offset, (u_long) size);
+#endif
+	ASSERT_NS(!map->mf_slock_count);
+	FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	if (!mm->mm_start || offset + (off_t) size > mm->mm_length) {
+		/* The lock is released before the write lock is taken, so the map may
+		 * change in between; fs_remap_file() checks the condition again itself.
+		 */
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
+		if (!fs_remap_file(map, offset, size, stat))
+			goto failed;
+	}
+
+#ifdef XT_WIN
+	/* On Windows an exception raised while touching the mapped memory is
+	 * caught and reported as an error.
+	 */
+	__try
+	{
+		memcpy(mm->mm_start + offset, data, size);
+	}
+	// GetExceptionCode()== EXCEPTION_IN_PAGE_ERROR ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH
+	__except(EXCEPTION_EXECUTE_HANDLER)
+	{
+		xt_register_ferrno(XT_REG_CONTEXT, GetExceptionCode(), xt_file_path(map));
+		goto failed;
+	}
+#else
+	memcpy(mm->mm_start + offset, data, size);
+#endif
+
+	FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	stat->ts_write += size;
+	return OK;
+
+	failed:
+	FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	return FAILED;
+}
+
+/*
+ * Read the 4-byte disk value at 'offset' in the memory map into *value.
+ * If 'offset' lies at or beyond the end of the map, *value is set to 0.
+ *
+ * The map lock is only taken and released here if this handle does not
+ * already hold it through xt_lock_fmap_ptr() (mf_slock_count != 0). This
+ * holds on every exit path, so a lock owned by the enclosing
+ * xt_lock_fmap_ptr() call is never released from here.
+ *
+ * Returns OK and adds 4 to stat->ts_read on success; returns FAILED with
+ * an error registered otherwise.
+ *
+ * NOTE(review): only 'offset' is checked against mm_length. A value that
+ * starts within the last 3 bytes of the map would be read past mm_length --
+ * confirm callers never request this.
+ */
+xtPublic xtBool xt_pread_fmap_4(XTMapFilePtr map, off_t offset, xtWord4 *value, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
+	xtThreadID		thd_id = thread->t_id;
+#endif
+
+#ifdef DEBUG_TRACE_MAP_IO
+	xt_trace("/* %s */ pbxt_fmap_read_4(\"%s\", %lu, 4);\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path, (u_long) offset);
+#endif
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	if (!mm->mm_start) {
+		/* Exchange the read lock for a write lock in order to remap.
+		 * As in xt_pread_fmap(), this is not expected while a pointer
+		 * lock is held.
+		 */
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		ASSERT_NS(!map->mf_slock_count);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
+		if (!fs_remap_file(map, 0, 0, stat)) {
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+			return FAILED;
+		}
+	}
+	if (offset >= mm->mm_length)
+		*value = 0;
+	else {
+		xtWord1 *data;
+
+		data = mm->mm_start + offset;
+#ifdef XT_WIN
+		__try
+		{
+			*value = XT_GET_DISK_4(data);
+			// GetExceptionCode()== EXCEPTION_IN_PAGE_ERROR ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH
+		}
+		__except(EXCEPTION_EXECUTE_HANDLER)
+		{
+			/* Same rule as on the normal exit path below. */
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+			return xt_register_ferrno(XT_REG_CONTEXT, GetExceptionCode(), xt_file_path(map));
+		}
+#else
+		*value = XT_GET_DISK_4(data);
+#endif
+	}
+
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	stat->ts_read += 4;
+	return OK;
+}
+
+/*
+ * Copy up to 'size' bytes from the memory map at 'offset' into 'data'.
+ *
+ * The transfer is cut short at the end of the map; nothing is copied if
+ * 'offset' lies at or beyond the end. If fewer than 'min_size' bytes could
+ * be transferred an error (ESPIPE) is registered and FAILED is returned.
+ *
+ * red_size	Optional (may be NULL): receives the number of bytes copied.
+ *
+ * The map lock is only taken and released here if this handle does not
+ * already hold it through xt_lock_fmap_ptr() (mf_slock_count != 0).
+ *
+ * Returns OK and adds the bytes copied to stat->ts_read on success.
+ */
+xtPublic xtBool xt_pread_fmap(XTMapFilePtr map, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
+	xtThreadID		thd_id = thread->t_id;
+#endif
+	size_t			tfer;
+
+#ifdef DEBUG_TRACE_MAP_IO
+	xt_trace("/* %s */ pbxt_fmap_read(\"%s\", %lu, %lu);\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path, (u_long) offset, (u_long) size);
+#endif
+	/* NOTE!! The file map may already be locked,
+	 * by a call to xt_lock_fmap_ptr()!
+	 *
+	 * 20.05.2009: This problem should be fixed now with mf_slock_count!
+	 *
+	 * This can occur during a sequential scan:
+	 * xt_pread_fmap()  Line 1330
+	 * XTTabCache::tc_read_direct()  Line 361
+	 * XTTabCache::xt_tc_read()  Line 220
+	 * xt_tab_get_rec_data()
+	 * tab_visible()  Line 2412
+	 * xt_tab_seq_next()  Line 4068
+	 *
+	 * And occurs during the following test:
+	 * create table t1 ( a int not null, b int not null) ;
+	 * --disable_query_log
+	 * insert into t1 values (1,1),(2,2),(3,3),(4,4);
+	 * let $1=19;
+	 * set @d=4;
+	 * while ($1)
+	 * {
+	 *   eval insert into t1 select a+@d,b+@d from t1;
+	 *   eval set @d=@d*2;
+	 *   dec $1;
+	 * }
+	 * 
+	 * --enable_query_log
+	 * alter table t1 add index i1(a);
+	 * delete from t1 where a > 2000000;
+	 * create table t2 like t1;
+	 * insert into t2 select * from t1;
+	 *
+	 * As a result, the slock must be able to handle
+	 * nested calls to lock/unlock.
+	 */
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	tfer = size;
+	if (!mm->mm_start) {
+		/* The map is not established: exchange the read lock for a write
+		 * lock and remap. This is not expected while a pointer lock is held.
+		 */
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		ASSERT_NS(!map->mf_slock_count);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
+		if (!fs_remap_file(map, 0, 0, stat)) {
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+			return FAILED;
+		}
+	}
+	if (offset >= mm->mm_length)
+		tfer = 0;
+	else {
+		/* Cut the transfer short at the end of the map: */
+		if (mm->mm_length - offset < (off_t) tfer)
+			tfer = (size_t) (mm->mm_length - offset);
+#ifdef XT_WIN
+		__try
+		{
+			memcpy(data, mm->mm_start + offset, tfer);
+			// GetExceptionCode()== EXCEPTION_IN_PAGE_ERROR ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH
+		}
+		__except(EXCEPTION_EXECUTE_HANDLER)
+		{
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+			return xt_register_ferrno(XT_REG_CONTEXT, GetExceptionCode(), xt_file_path(map));
+		}
+#else
+		memcpy(data, mm->mm_start + offset, tfer);
+#endif
+	}
+
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	/* Throw an error if read less than the minimum: */
+	if (tfer < min_size)
+		return xt_register_ferrno(XT_REG_CONTEXT, ESPIPE, xt_file_path(map));
+
+	if (red_size)
+		*red_size = tfer;
+	stat->ts_read += tfer;
+	return OK;
+}
+
+/*
+ * Synchronously write the contents of the memory map back to the file.
+ *
+ * The map is established first if necessary. The time spent flushing is
+ * added to stat->ts_flush_time whether or not the flush succeeds;
+ * stat->ts_flush is only incremented on success.
+ *
+ * The map lock is only taken and released here if this handle does not
+ * already hold it through xt_lock_fmap_ptr() (mf_slock_count != 0).
+ *
+ * Returns OK on success, FAILED with an error registered otherwise.
+ */
+xtPublic xtBool xt_flush_fmap(XTMapFilePtr map, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
+	xtThreadID		thd_id = thread->t_id;
+#endif
+	xtWord8			started;
+	xtBool			flushed = OK;
+
+#ifdef DEBUG_TRACE_MAP_IO
+	xt_trace("/* %s */ pbxt_fmap_sync(\"%s\");\n", xt_trace_clock_diff(NULL), map->fr_file->fil_path);
+#endif
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	if (!mm->mm_start) {
+		/* Not mapped: exchange the read lock for a write lock and remap. */
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		ASSERT_NS(!map->mf_slock_count);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
+		if (!fs_remap_file(map, 0, 0, stat)) {
+			if (!map->mf_slock_count)
+				FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+			return FAILED;
+		}
+	}
+
+	stat->ts_flush_start = xt_trace_clock();
+#ifdef XT_WIN
+	if (!FlushViewOfFile(mm->mm_start, 0)) {
+		xt_register_ferrno(XT_REG_CONTEXT, fs_get_win_error(), xt_file_path(map));
+		flushed = FAILED;
+	}
+#else
+	if (msync( (char *)mm->mm_start, (size_t) mm->mm_length, MS_SYNC) == -1) {
+		xt_register_ferrno(XT_REG_CONTEXT, errno, xt_file_path(map));
+		flushed = FAILED;
+	}
+#endif
+
+	/* Common exit: release the lock, then close the timing interval. */
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	started = stat->ts_flush_start;
+	stat->ts_flush_start = 0;
+	stat->ts_flush_time += xt_trace_clock() - started;
+	if (flushed != FAILED)
+		stat->ts_flush++;
+	return flushed;
+}
+
+/*
+ * Lock the memory map in shared mode and return a pointer to 'offset'
+ * within the mapped memory.
+ *
+ * Calls may be nested on the same handle: the map lock is only taken by the
+ * outermost call and mf_slock_count records the nesting depth. Every
+ * successful call must be paired with xt_unlock_fmap_ptr().
+ *
+ * stat->ts_read is increased by the number of requested bytes that actually
+ * lie within the map: 'size', or the bytes from 'offset' to the end of the
+ * map if the range runs past the end.
+ *
+ * Returns NULL if the map could not be established or if 'offset' lies at
+ * or beyond the end of the map; in that case the lock is not held by this
+ * call.
+ */
+xtPublic xtWord1 *xt_lock_fmap_ptr(XTMapFilePtr map, off_t offset, size_t size, XTIOStatsPtr stat, XTThreadPtr thread)
+{
+	XTFileMemMapPtr	mm = map->mf_memmap;
+#ifndef FILE_MAP_USE_PTHREAD_RW
+	xtThreadID		thd_id = thread->t_id;
+#endif
+
+	if (!map->mf_slock_count)
+		FILE_MAP_READ_LOCK(&mm->mm_lock, thd_id);
+	map->mf_slock_count++;
+	if (!mm->mm_start) {
+		/* Not mapped: exchange the read lock for a write lock and remap. */
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+		FILE_MAP_WRITE_LOCK(&mm->mm_lock, thd_id);
+		if (!fs_remap_file(map, 0, 0, stat))
+			goto failed;
+	}
+	if (offset >= mm->mm_length)
+		goto failed;
+
+	if (offset + (off_t) size > mm->mm_length)
+		/* Only the part of the range inside the map is readable. */
+		stat->ts_read += (u_int) (mm->mm_length - offset);
+	else
+		stat->ts_read += size;
+	return mm->mm_start + offset;
+
+	failed:
+	map->mf_slock_count--;
+	if (!map->mf_slock_count)
+		FILE_MAP_UNLOCK(&mm->mm_lock, thd_id);
+	return NULL;
+}
+
+/*
+ * Undo one successful xt_lock_fmap_ptr() call on this handle.
+ * The map lock itself is released when the nesting count drops to zero.
+ */
+xtPublic void xt_unlock_fmap_ptr(XTMapFilePtr map, XTThreadPtr thread)
+{
+	map->mf_slock_count--;
+	if (map->mf_slock_count)
+		return;
+	FILE_MAP_UNLOCK(&map->mf_memmap->mm_lock, thread->t_id);
+}
+
+/* ----------------------------------------------------------------------
+ * Copy files/directories
+ */
+
+/*
+ * Copy the contents of one file to another, 16K at a time.
+ *
+ * from_path	The file to read.
+ * to_path		The file to write; it is created, together with its path,
+ *				if it does not exist.
+ * copy_buf		Transfer buffer supplied by the caller; the callers in this
+ *				file pass a 16K buffer.
+ *
+ * Read and write errors are thrown. Both files are closed on return.
+ */
+static void fs_copy_file(XTThreadPtr self, char *from_path, char *to_path, void *copy_buf)
+{
+	XTOpenFilePtr	src_file;
+	XTOpenFilePtr	dst_file;
+	off_t			pos = 0;
+	size_t			chunk = 0;
+
+	src_file = xt_open_file(self, from_path, XT_FS_READONLY);
+	pushr_(xt_close_file, src_file);
+	dst_file = xt_open_file(self, to_path, XT_FS_CREATE | XT_FS_MAKE_PATH);
+	pushr_(xt_close_file, dst_file);
+
+	do {
+		if (!xt_pread_file(src_file, pos, 16*1024, 0, copy_buf, &chunk, &self->st_statistics.st_x, self))
+			xt_throw(self);
+		/* A read of zero bytes means the end of the source file. */
+		if (chunk) {
+			if (!xt_pwrite_file(dst_file, pos, chunk, copy_buf, &self->st_statistics.st_x, self))
+				xt_throw(self);
+			pos += (off_t) chunk;
+		}
+	} while (chunk);
+
+	freer_(); // xt_close_file(dst_file)
+	freer_(); // xt_close_file(src_file)
+}
+
+/*
+ * Copy the file 'from_path' to 'to_path'.
+ * Allocates the 16K transfer buffer for fs_copy_file() and frees it again
+ * afterwards. Errors are thrown.
+ */
+xtPublic void xt_fs_copy_file(XTThreadPtr self, char *from_path, char *to_path)
+{
+	void *copy_buf = xt_malloc(self, 16*1024);
+
+	pushr_(xt_free, copy_buf);
+	fs_copy_file(self, from_path, to_path, copy_buf);
+	freer_(); // xt_free(copy_buf)
+}
+
+/*
+ * Recursively copy the contents of a directory.
+ *
+ * from_path, to_path
+ *				Source and destination directory. Both are buffers of
+ *				PATH_MAX bytes which are modified in place while the entries
+ *				are visited, and restored before the function returns.
+ * copy_buf		Transfer buffer passed on to fs_copy_file().
+ *
+ * Entries whose name starts with '.' are skipped, on Windows also the
+ * "pbxt-lock" file. Errors are thrown.
+ */
+static void fs_copy_dir(XTThreadPtr self, char *from_path, char *to_path, void *copy_buf)
+{
+	XTOpenDirPtr	dir;
+	char			*entry;
+
+	xt_add_dir_char(PATH_MAX, from_path);
+	xt_add_dir_char(PATH_MAX, to_path);
+
+	pushsr_(dir, xt_dir_close, xt_dir_open(self, from_path, NULL));
+	while (xt_dir_next(self, dir)) {
+		entry = xt_dir_name(self, dir);
+		if (*entry == '.')
+			continue;
+#ifdef XT_WIN
+		if (strcmp(entry, "pbxt-lock") == 0)
+			continue;
+#endif
+		/* Extend both paths by the entry name ... */
+		xt_strcat(PATH_MAX, from_path, entry);
+		xt_strcat(PATH_MAX, to_path, entry);
+
+		if (!xt_dir_is_file(self, dir))
+			fs_copy_dir(self, from_path, to_path, copy_buf);
+		else
+			fs_copy_file(self, from_path, to_path, copy_buf);
+
+		/* ... and cut it off again for the next entry. */
+		xt_remove_last_name_of_path(from_path);
+		xt_remove_last_name_of_path(to_path);
+	}
+	freer_(); // xt_dir_close(dir)
+
+	xt_remove_dir_char(from_path);
+	xt_remove_dir_char(to_path);
+}
+
+/*
+ * Recursively copy the directory 'from' to 'to'.
+ *
+ * The paths are copied into local PATH_MAX buffers because fs_copy_dir()
+ * modifies them in place; the 16K transfer buffer is allocated here and
+ * freed again afterwards. Errors are thrown.
+ */
+xtPublic void xt_fs_copy_dir(XTThreadPtr self, const char *from, const char *to)
+{
+	char	src_path[PATH_MAX];
+	char	dst_path[PATH_MAX];
+	void	*copy_buf;
+
+	xt_strcpy(PATH_MAX, src_path, from);
+	xt_strcpy(PATH_MAX, dst_path, to);
+
+	copy_buf = xt_malloc(self, 16*1024);
+	pushr_(xt_free, copy_buf);
+	fs_copy_dir(self, src_path, dst_path, copy_buf);
+	freer_(); // xt_free(copy_buf)
+}
+
diff --git a/storage/pbxt/src/filesys_xt.h b/storage/pbxt/src/filesys_xt.h
new file mode 100644
index 00000000000..6d8dd280e5e
--- /dev/null
+++ b/storage/pbxt/src/filesys_xt.h
@@ -0,0 +1,224 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-12	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_filesys_h__
+#define __xt_filesys_h__
+
+#ifdef XT_WIN
+#include <time.h>
+#else
+#include <sys/time.h>
+#include <dirent.h>
+#endif
+#include <sys/stat.h>
+
+#include "xt_defs.h"
+#include "lock_xt.h"
+
+/* Platform independent classification of file system error codes:
+ * Windows error codes under XT_WIN, errno values otherwise.
+ */
+#ifdef XT_WIN
+#define XT_FILE_IN_USE(x)			((x) == ERROR_SHARING_VIOLATION)
+#define XT_FILE_ACCESS_DENIED(x)	((x) == ERROR_ACCESS_DENIED || (x) == ERROR_NETWORK_ACCESS_DENIED)
+#define XT_FILE_TOO_MANY_OPEN(x)	((x) == ERROR_TOO_MANY_OPEN_FILES)
+#define XT_FILE_NOT_FOUND(x)		((x) == ERROR_FILE_NOT_FOUND || (x) == ERROR_PATH_NOT_FOUND)
+#else
+#define XT_FILE_IN_USE(x)			((x) == ETXTBSY)
+#define XT_FILE_ACCESS_DENIED(x)	((x) == EACCES)
+#define XT_FILE_TOO_MANY_OPEN(x)	((x) == EMFILE)
+#define XT_FILE_NOT_FOUND(x)		((x) == ENOENT)
+#endif
+
+struct XTOpenFile;
+
+#define XT_MASK				((S_IRUSR | S_IWUSR) | (S_IRGRP | S_IWGRP) | (S_IROTH))
+
+#define XT_FS_DEFAULT		0		/* Open for read/write, error if does not exist. */
+#define XT_FS_READONLY		1		/* Open for read only (otherwise read/write). */
+#define XT_FS_CREATE		2		/* Create if the file does not exist. */
+#define XT_FS_EXCLUSIVE		4		/* Create, and generate an error if it already exists. */
+#define XT_FS_MISSING_OK	8		/* Set this flag if you don't want to throw an error if the file does not exist! */
+#define XT_FS_MAKE_PATH		16		/* Create the path if it does not exist. */
+#define XT_FS_DIRECT_IO		32		/* Use direct I/O on this file if possible (O_DIRECT). */
+
+xtBool			xt_fs_exists(char *path);
+xtBool			xt_fs_delete(struct XTThread *self, char *path);
+xtBool			xt_fs_file_not_found(int err);
+void			xt_fs_mkdir(struct XTThread *self, char *path);
+void			xt_fs_mkpath(struct XTThread *self, char *path);
+xtBool			xt_fs_rmdir(struct XTThread *self, char *path);
+xtBool			xt_fs_stat(struct XTThread *self, char *path, off_t *size, struct timespec *mod_time);
+void			xt_fs_move(struct XTThread *self, char *from_path, char *to_path);
+xtBool			xt_fs_rename(struct XTThread *self, char *from_path, char *to_path);
+
+/* XT_FD is the native file descriptor type, XT_NULL_FD the value that
+ * stands for "no file open".
+ */
+#ifdef XT_WIN
+#define XT_FD		HANDLE
+#define XT_NULL_FD	INVALID_HANDLE_VALUE
+#else
+#define XT_FD		int
+#define XT_NULL_FD	(-1)
+#endif
+
+/* Note: this lock originally had to be re-entrant.
+ * The only lock that satisfies this is
+ * FILE_MAP_USE_RWMUTEX!
+ *
+ * 20.05.2009: This problem should be fixed now with mf_slock_count!
+ *
+ * The lock need no longer be re-entrant.
+ */
+/* Select the lock implementation used for the file map lock by defining one
+ * of the FILE_MAP_USE_* names tested below. Builds with XT_NO_ATOMICS use
+ * the pthread read/write lock.
+ */
+#ifdef XT_NO_ATOMICS
+#define FILE_MAP_USE_PTHREAD_RW
+#else
+//#define FILE_MAP_USE_RWMUTEX
+//#define FILE_MAP_USE_PTHREAD_RW
+//#define FILE_MAP_USE_SPINXSLOCK
+#define FILE_MAP_USE_XSMUTEX
+#endif
+
+/* The file map lock interface, mapped onto the selected lock implementation:
+ *
+ * FILE_MAP_LOCK_TYPE			the type of the lock
+ * FILE_MAP_INIT_LOCK(s, i)		initialize lock i (s is the calling thread)
+ * FILE_MAP_FREE_LOCK(s, i)		free lock i
+ * FILE_MAP_READ_LOCK(i, o)		lock i in shared mode
+ * FILE_MAP_WRITE_LOCK(i, o)	lock i in exclusive mode
+ * FILE_MAP_UNLOCK(i, o)		unlock i
+ *
+ * 'o' is the ID of the calling thread. The pthread variant ignores it, which
+ * is why callers only declare their thd_id variable if
+ * FILE_MAP_USE_PTHREAD_RW is not defined.
+ */
+#ifdef FILE_MAP_USE_XSMUTEX
+#define FILE_MAP_LOCK_TYPE				XTXSMutexRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_xsmutex_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_xsmutex_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_xsmutex_xlock(i, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_xsmutex_unlock(i, o)
+#elif defined(FILE_MAP_USE_PTHREAD_RW)
+#define FILE_MAP_LOCK_TYPE				xt_rwlock_type
+#define FILE_MAP_INIT_LOCK(s, i)		xt_init_rwlock_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_free_rwlock(i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_slock_rwlock_ns(i)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(i)
+#define FILE_MAP_UNLOCK(i, o)			xt_unlock_rwlock_ns(i)
+#elif defined(FILE_MAP_USE_RWMUTEX)
+#define FILE_MAP_LOCK_TYPE				XTRWMutexRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_rwmutex_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_rwmutex_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_rwmutex_xlock(i, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_rwmutex_unlock(i, o)
+#elif defined(FILE_MAP_USE_SPINXSLOCK)
+#define FILE_MAP_LOCK_TYPE				XTSpinXSLockRec
+#define FILE_MAP_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define FILE_MAP_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define FILE_MAP_READ_LOCK(i, o)		xt_spinxslock_slock(i, o)
+#define FILE_MAP_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, FALSE, o)
+#define FILE_MAP_UNLOCK(i, o)			xt_spinxslock_unlock(i, o)
+#endif
+
+/* The memory map of a file. There is one per mapped file (XTFile.fil_memmap),
+ * shared by all map handles (XTMapFile) opened on that file.
+ */
+typedef struct XTFileMemMap {
+	xtWord1				*mm_start;			/* The in-memory start of the map. NULL if the file is currently not mapped. */
+#ifdef XT_WIN
+	HANDLE				mm_mapdes;			/* The file mapping object (CreateFileMapping()), NULL if there is none. */
+#endif
+	off_t				mm_length;			/* The length of the file map. */
+	FILE_MAP_LOCK_TYPE	mm_lock;			/* The file map R/W lock. */
+	size_t				mm_grow_size;		/* The amount by which the map file is increased. */
+} XTFileMemMapRec, *XTFileMemMapPtr;
+
+/* A file known to the file system layer. It is shared by all open file and
+ * map handles that reference it through XTFileRef.fr_file.
+ */
+typedef struct XTFile {
+	u_int				fil_ref_count;		/* The number of open file structure referencing this file. */
+	char				*fil_path;			/* The path of the file. */
+	u_int				fil_id;				/* This is used by the disk cache to identify a file in the hash index. */
+	XT_FD				fil_filedes;		/* The shared file descriptor (pread and pwrite allow this), on Windows this is used only for mmapped files */
+	u_int				fil_handle_count;	/* Number of references in the case of mmapped fil_filedes, both Windows and Unix */
+	XTFileMemMapPtr		fil_memmap;			/* Non-null if this file is memory mapped. */
+} XTFileRec, *XTFilePtr;
+
+/* Common base of the handle types XTOpenFile and XTMapFile: a reference to
+ * the shared XTFile.
+ */
+typedef struct XTFileRef {
+	XTFilePtr			fr_file;			/* The file this handle references. */
+	u_int				fr_id;				/* Copied from above (small optimisation). */
+} XTFileRefRec, *XTFileRefPtr;
+
+/* A handle on an open file, as used by xt_pread_file(), xt_pwrite_file(), etc. */
+typedef struct XTOpenFile : public XTFileRef {
+	XT_FD				of_filedes;			/* The descriptor used for I/O through this handle. */
+} XTOpenFileRec, *XTOpenFilePtr;
+
+void			xt_fs_init(struct XTThread *self);
+void			xt_fs_exit(struct XTThread *self);
+
+XTFilePtr		xt_fs_get_file(struct XTThread *self, char *file_name);
+void			xt_fs_release_file(struct XTThread *self, XTFilePtr file_ptr);
+
+XTOpenFilePtr	xt_open_file(struct XTThread *self, char *file, int mode);
+XTOpenFilePtr	xt_open_file_ns(char *file, int mode);
+xtBool			xt_open_file_ns(XTOpenFilePtr *fh, char *file, int mode);
+void			xt_close_file(struct XTThread *self, XTOpenFilePtr f);
+xtBool			xt_close_file_ns(XTOpenFilePtr f);
+char			*xt_file_path(struct XTFileRef *of);
+
+xtBool			xt_lock_file(struct XTThread *self, XTOpenFilePtr of);
+void			xt_unlock_file(struct XTThread *self, XTOpenFilePtr of);
+
+off_t			xt_seek_eof_file(struct XTThread *self, XTOpenFilePtr of);
+xtBool			xt_set_eof_file(struct XTThread *self, XTOpenFilePtr of, off_t offset);
+
+xtBool			xt_pwrite_file(XTOpenFilePtr of, off_t offset, size_t size, void *data, struct XTIOStats *timer, struct XTThread *thread);
+xtBool			xt_pread_file(XTOpenFilePtr of, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, struct XTIOStats *timer, struct XTThread *thread);
+xtBool			xt_flush_file(XTOpenFilePtr of, struct XTIOStats *timer, struct XTThread *thread);
+
+xtBool			xt_lock_file_ptr(XTOpenFilePtr of, xtWord1 **data, off_t offset, size_t size, struct XTIOStats *timer, struct XTThread *thread);
+void			xt_unlock_file_ptr(XTOpenFilePtr of, xtWord1 *data, struct XTThread *thread);
+
+/* A directory iterator, created by xt_dir_open(), advanced by xt_dir_next()
+ * and released by xt_dir_close().
+ */
+typedef struct XTOpenDir {
+	char				*od_path;			/* Windows: the search pattern "path\filter". Otherwise: the path of the directory. */
+#ifdef XT_WIN
+	HANDLE				od_handle;			/* The search handle, XT_NULL_FD until the first call to xt_dir_next(). */
+	WIN32_FIND_DATA		od_data;			/* The current entry. */
+#else
+	char				*od_filter;			/* The name filter, NULL if all entries are to be returned. */
+	DIR					*od_dir;			/* The directory stream returned by opendir(). */
+	/* WARNING: Solaris requires od_entry.d_name member to have size at least as returned
+	 * by pathconf() function on per-directory basis. This makes it impossible to statically
+	 * pre-set the size. So xt_dir_open on Solaris dynamically allocates space as needed. 
+	 *
+	 * This also means that the od_entry member should always be last in the XTOpenDir structure.
+	 */
+	struct dirent		od_entry;			/* The current entry, filled in by readdir_r(). */
+#endif
+} XTOpenDirRec, *XTOpenDirPtr;
+
+XTOpenDirPtr	xt_dir_open(struct XTThread *self, c_char *path, c_char *filter);
+void			xt_dir_close(struct XTThread *self, XTOpenDirPtr od);
+xtBool			xt_dir_next(struct XTThread *self, XTOpenDirPtr od);
+char			*xt_dir_name(struct XTThread *self, XTOpenDirPtr od);
+xtBool			xt_dir_is_file(struct XTThread *self, XTOpenDirPtr od);
+off_t			xt_dir_file_size(struct XTThread *self, XTOpenDirPtr od);
+
+/* A handle on a memory mapped file, created by xt_open_fmap(). */
+typedef struct XTMapFile : public XTFileRef {
+	u_int				mf_slock_count;		/* Nesting depth of xt_lock_fmap_ptr() calls on this handle. */
+	XTFileMemMapPtr		mf_memmap;			/* The shared map of the file (copy of fr_file->fil_memmap). */
+} XTMapFileRec, *XTMapFilePtr;
+
+XTMapFilePtr	xt_open_fmap(struct XTThread *self, char *file, size_t grow_size);
+void			xt_close_fmap(struct XTThread *self, XTMapFilePtr map);
+xtBool			xt_close_fmap_ns(XTMapFilePtr map);
+xtBool			xt_pwrite_fmap(XTMapFilePtr map, off_t offset, size_t size, void *data, struct XTIOStats *timer, struct XTThread *thread);
+xtBool			xt_pread_fmap(XTMapFilePtr map, off_t offset, size_t size, size_t min_size, void *data, size_t *red_size, struct XTIOStats *timer, struct XTThread *thread);
+xtBool			xt_pread_fmap_4(XTMapFilePtr map, off_t offset, xtWord4 *value, struct XTIOStats *timer, struct XTThread *thread);
+xtBool			xt_flush_fmap(XTMapFilePtr map, struct XTIOStats *stat, struct XTThread *thread);
+xtWord1			*xt_lock_fmap_ptr(XTMapFilePtr map, off_t offset, size_t size, struct XTIOStats *timer, struct XTThread *thread);
+void			xt_unlock_fmap_ptr(XTMapFilePtr map, struct XTThread *thread);
+
+void			xt_fs_copy_file(struct XTThread *self, char *from_path, char *to_path);
+void			xt_fs_copy_dir(struct XTThread *self, const char *from, const char *to);
+
+#endif
+
diff --git a/storage/pbxt/src/ha_pbxt.cc b/storage/pbxt/src/ha_pbxt.cc
new file mode 100644
index 00000000000..ef0ae582c07
--- /dev/null
+++ b/storage/pbxt/src/ha_pbxt.cc
@@ -0,0 +1,6323 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * Derived from ha_example.h
+ * Copyright (C) 2003 MySQL AB
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA	02111-1307	USA
+ *
+ * 2005-11-10	Paul McCullagh
+ *
+ */
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#include "xt_config.h"
+
+#if defined(XT_WIN)
+#include <windows.h>
+#endif
+
+#include <stdlib.h>
+#include <time.h>
+#include <ctype.h>
+
+#ifdef DRIZZLED
+#include <drizzled/common.h>
+#include <drizzled/plugin.h>
+#include <mysys/my_alloc.h>
+#include <mysys/hash.h>
+#include <drizzled/field.h>
+#include <drizzled/session.h>
+#include <drizzled/data_home.h>
+#include <drizzled/error.h>
+#include <drizzled/table.h>
+#include <drizzled/field/timestamp.h>
+#include <drizzled/server_includes.h>
+#include <drizzled/plugin/info_schema_table.h>
+extern "C" char **session_query(Session *session);
+#define my_strdup(a,b) strdup(a)
+
+using drizzled::plugin::Registry;
+using drizzled::plugin::ColumnInfo;
+using drizzled::plugin::InfoSchemaTable;
+using drizzled::plugin::InfoSchemaMethods;
+
+#else
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#endif
+
+#include "ha_pbxt.h"
+#include "ha_xtsys.h"
+
+#include "strutil_xt.h"
+#include "database_xt.h"
+#include "cache_xt.h"
+#include "trace_xt.h"
+#include "heap_xt.h"
+#include "myxt_xt.h"
+#include "datadic_xt.h"
+#ifdef PBMS_ENABLED
+#include "pbms_enabled.h"
+#endif
+#include "tabcache_xt.h"
+#include "systab_xt.h"
+#include "xaction_xt.h"
+#include "backup_xt.h"
+#include "heap_xt.h"
+
+#ifdef DEBUG
+//#define XT_USE_SYS_PAR_DEBUG_SIZES
+//#define PBXT_HANDLER_TRACE
+//#define PBXT_TRACE_RETURN
+//#define XT_PRINT_INDEX_OPT
+//#define XT_SHOW_DUMPS_TRACE
+//#define XT_UNIT_TEST
+//#define LOAD_TABLE_ON_OPEN
+//#define CHECK_TABLE_LOADS
+
+/* Enable to trace the statements executed by the engine: */
+//#define TRACE_STATEMENTS
+
+/* Enable to print the trace to the stdout, instead of
+ * to the trace log.
+ */
+//#define PRINT_STATEMENTS
+#endif
+
+#ifndef DRIZZLED
+static handler	*pbxt_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root);
+static int		pbxt_init(void *p);
+static int		pbxt_end(void *p);
+static int		pbxt_panic(handlerton *hton, enum ha_panic_function flag);
+static void		pbxt_drop_database(handlerton *hton, char *path);
+static int		pbxt_close_connection(handlerton *hton, THD* thd);
+static int		pbxt_commit(handlerton *hton, THD *thd, bool all);
+static int		pbxt_rollback(handlerton *hton, THD *thd, bool all);
+static int		pbxt_prepare(handlerton *hton, THD *thd, bool all);
+static int		pbxt_recover(handlerton *hton, XID *xid_list, uint len);
+static int		pbxt_commit_by_xid(handlerton *hton, XID *xid);
+static int		pbxt_rollback_by_xid(handlerton *hton, XID *xid);
+static int		pbxt_start_consistent_snapshot(handlerton *hton, THD *thd);
+#endif
+static void		ha_aquire_exclusive_use(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine);
+static void		ha_release_exclusive_use(XTThreadPtr self, XTSharePtr share);
+static void		ha_close_open_tables(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine);
+
+#ifdef TRACE_STATEMENTS
+
+#ifdef PRINT_STATEMENTS
+#define STAT_TRACE(y, x)		printf("%s: %s\n", y ? y->t_name : "-unknown-", x)
+#else
+#define STAT_TRACE(y, x)		xt_ttraceq(y, x)
+#endif
+
+#else
+
+#define STAT_TRACE(y, x)
+
+#endif
+
+#ifdef PBXT_HANDLER_TRACE
+#define PBXT_ALLOW_PRINTING
+
+#define XT_TRACE_CALL()				ha_trace_function(__FUNC__, NULL)
+#define XT_TRACE_METHOD()			ha_trace_function(__FUNC__, pb_share->sh_table_path->ps_path)
+
+#ifdef PBXT_TRACE_RETURN
+#define XT_RETURN(x)				do { printf("%d\n", (int) (x)); return (x); } while (0)
+#define XT_RETURN_VOID				do { printf("out\n"); return; } while (0)
+#else
+#define XT_RETURN(x)				return (x)
+#define XT_RETURN_VOID				return
+#endif
+
+#else
+
+#define XT_TRACE_CALL()
+#define XT_TRACE_METHOD()
+#define XT_RETURN(x)				return (x)
+#define XT_RETURN_VOID				return
+
+#endif
+
+#ifdef PBXT_ALLOW_PRINTING
+#define XT_PRINT0(y, x)				do { XTThreadPtr s = (y); printf("%s " x, s ? s->t_name : "-unknown-"); } while (0)
+#define XT_PRINT1(y, x, a)			do { XTThreadPtr s = (y); printf("%s " x, s ? s->t_name : "-unknown-", a); } while (0)
+#define XT_PRINT2(y, x, a, b)		do { XTThreadPtr s = (y); printf("%s " x, s ? s->t_name : "-unknown-", a, b); } while (0)
+#define XT_PRINT3(y, x, a, b, c)	do { XTThreadPtr s = (y); printf("%s " x, s ? s->t_name : "-unknown-", a, b, c); } while (0)
+#else
+#define XT_PRINT0(y, x)
+#define XT_PRINT1(y, x, a)
+#define XT_PRINT2(y, x, a, b)
+#define XT_PRINT3(y, x, a, b, c)
+#endif
+
+
+#define TS(x)					(x)->s
+
+handlerton				*pbxt_hton;
+bool					pbxt_inited = false;		// Variable for checking the init state of hash
+xtBool					pbxt_ignore_case = true;
+const char				*pbxt_extensions[]= { ".xtr", ".xtd", ".xtl", ".xti", ".xt", "", NULL };
+#ifdef XT_CRASH_DEBUG
+xtBool					pbxt_crash_debug = TRUE;
+#else
+xtBool					pbxt_crash_debug = FALSE;
+#endif
+
+
+/* Variables for pbxt share methods */
+static xt_mutex_type	pbxt_database_mutex;		// Prevent a database from being opened while it is being dropped
+static XTHashTabPtr		pbxt_share_tables;			// Hash used to track open tables
+static char				*pbxt_index_cache_size;
+static char				*pbxt_record_cache_size;
+static char				*pbxt_log_cache_size;
+static char				*pbxt_log_file_threshold;
+static char				*pbxt_transaction_buffer_size;
+static char				*pbxt_log_buffer_size;
+static char				*pbxt_checkpoint_frequency;
+static char				*pbxt_data_log_threshold;
+static char				*pbxt_data_file_grow_size;
+static char				*pbxt_row_file_grow_size;
+static int				pbxt_max_threads;
+static my_bool			pbxt_support_xa;
+
+#ifndef DRIZZLED
+// drizzle complains it's not used
+static XTXactEnumXARec	pbxt_xa_enum;
+#endif
+
+#ifdef DEBUG
+#define XT_SHARE_LOCK_WAIT		5000
+#else
+#define XT_SHARE_LOCK_WAIT		500
+#endif
+
+/* 
+ * Lock timeout in 1/1000ths of a second
+ */
+#define XT_SHARE_LOCK_TIMEOUT	30000
+
+/*
+ * -----------------------------------------------------------------------
+ * SYSTEM VARIABLES
+ *
+ */
+ 
+//#define XT_FOR_TEAMDRIVE
+
+typedef struct HAVarParams {
+	const char		*vp_var;						/* Variable name. */
+	const char		*vp_def;						/* Default value. */
+	const char		*vp_min;						/* Minimum allowed value. */
+	const char		*vp_max4;						/* Maximum allowed value on 32-bit processors. */
+	const char		*vp_max8;						/* Maximum allowed value on 64-bit processors. */
+} HAVarParamsRec, *HAVarParamsPtr;
+
+#ifdef XT_USE_SYS_PAR_DEBUG_SIZES
+static HAVarParamsRec vp_index_cache_size = { "pbxt_index_cache_size", "32MB", "8MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_record_cache_size = { "pbxt_record_cache_size", "32MB", "8MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_log_cache_size = { "pbxt_log_cache_size", "16MB", "4MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_checkpoint_frequency = { "pbxt_checkpoint_frequency", "28MB", "512K", "1GB", "24GB" };
+static HAVarParamsRec vp_log_file_threshold = { "pbxt_log_file_threshold", "32MB", "1MB", "2GB", "256TB" };
+static HAVarParamsRec vp_transaction_buffer_size = { "pbxt_transaction_buffer_size", "1MB", "128K", "1GB", "24GB" };
+static HAVarParamsRec vp_log_buffer_size = { "pbxt_log_buffer_size", "256K", "128K", "1GB", "24GB" };
+static HAVarParamsRec vp_data_log_threshold = { "pbxt_data_log_threshold", "400K", "400K", "2GB", "256TB" };
+static HAVarParamsRec vp_data_file_grow_size = { "pbxt_data_file_grow_size", "2MB", "128K", "1GB", "2GB" };
+static HAVarParamsRec vp_row_file_grow_size = { "pbxt_row_file_grow_size", "256K", "32K", "1GB", "2GB" };
+#define XT_DL_DEFAULT_XLOG_COUNT		3
+#define XT_DL_DEFAULT_GARBAGE_LEVEL		10
+#else
+static HAVarParamsRec vp_index_cache_size = { "pbxt_index_cache_size", "32MB", "8MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_record_cache_size = { "pbxt_record_cache_size", "32MB", "8MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_log_cache_size = { "pbxt_log_cache_size", "16MB", "4MB", "2GB", "2000GB" };
+static HAVarParamsRec vp_checkpoint_frequency = { "pbxt_checkpoint_frequency", "28MB", "512K", "1GB", "24GB" };
+static HAVarParamsRec vp_log_file_threshold = { "pbxt_log_file_threshold", "32MB", "1MB", "2GB", "256TB" };
+static HAVarParamsRec vp_transaction_buffer_size = { "pbxt_transaction_buffer_size", "1MB", "128K", "1GB", "24GB" };
+static HAVarParamsRec vp_log_buffer_size = { "pbxt_log_buffer_size", "256K", "128K", "1GB", "24GB" };
+static HAVarParamsRec vp_data_log_threshold = { "pbxt_data_log_threshold", "64MB", "1MB", "2GB", "256TB" };
+static HAVarParamsRec vp_data_file_grow_size = { "pbxt_data_file_grow_size", "2MB", "128K", "1GB", "2GB" };
+static HAVarParamsRec vp_row_file_grow_size = { "pbxt_row_file_grow_size", "256K", "32K", "1GB", "2GB" };
+#define XT_DL_DEFAULT_XLOG_COUNT		3
+#define XT_DL_DEFAULT_GARBAGE_LEVEL		50
+#endif
+
+#define XT_AUTO_INCREMENT_DEF			0
+
+#ifdef XT_MAC
+#ifdef DEBUG
+/* For debugging on the Mac, we check the re-use logs: */
+#define XT_OFFLINE_LOG_FUNCTION_DEF		XT_RECYCLE_LOGS
+#else
+#define XT_OFFLINE_LOG_FUNCTION_DEF		XT_DELETE_LOGS
+#endif
+#else
+#define XT_OFFLINE_LOG_FUNCTION_DEF		XT_RECYCLE_LOGS
+#endif
+
+/* TeamDrive, uses special auto-increment, and
+ * we keep the logs for the moment:
+ */
+#ifdef XT_FOR_TEAMDRIVE
+#undef XT_OFFLINE_LOG_FUNCTION_DEF
+#define XT_OFFLINE_LOG_FUNCTION_DEF		XT_KEEP_LOGS
+//#undef XT_AUTO_INCREMENT_DEF
+//#define XT_AUTO_INCREMENT_DEF			1
+#endif
+
+#ifdef PBXT_HANDLER_TRACE
+static void ha_trace_function(const char *function, char *table)	// Debug trace: print the thread name, the bare function name and, if given, the table.
+{
+	char		func_buf[50], *ptr;
+	XTThreadPtr	thread = xt_get_self();
+
+	if ((ptr = const_cast<char *>(strchr(function, '(')))) {
+		/* Walk back from the '(' over the identifier characters that precede it. */
+		while (ptr > function) {
+			if (!(isalnum((unsigned char) ptr[-1]) || ptr[-1] == '_'))
+				break;
+			ptr--;
+		}
+		/* ptr is now the first character of the identifier (or the '(' itself). */
+		xt_strcpy(50, func_buf, ptr);
+		if ((ptr = strchr(func_buf, '(')))
+			*ptr = 0;
+	}
+	else
+		xt_strcpy(50, func_buf, function);
+	if (table)
+		printf("%s %s (%s)\n", thread ? thread->t_name : "-unknown-", func_buf, table);
+	else
+		printf("%s %s\n", thread ? thread->t_name : "-unknown-", func_buf);
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * SHARED TABLE DATA
+ *
+ */
+
+static xtBool ha_hash_comp(void *key, void *data)
+{
+	/* Case-sensitive match of a lookup key (a path string) against the path stored in a share. */
+	const char	*share_path = ((XTSharePtr) data)->sh_table_path->ps_path;
+	return strcmp((char *) key, share_path) == 0;
+}
+
+static xtHashValue ha_hash(xtBool is_key, void *key_data)
+{
+	/* key_data is a path string when is_key is set, otherwise it is an XTSharePtr. */
+	if (!is_key)
+		return xt_ht_hash(((XTSharePtr) key_data)->sh_table_path->ps_path);
+
+	return xt_ht_hash((char *) key_data);
+}
+
+static xtBool ha_hash_comp_ci(void *key, void *data)
+{
+	/* Case-insensitive match of a lookup key (a path string) against the path stored in a share. */
+	const char	*share_path = ((XTSharePtr) data)->sh_table_path->ps_path;
+	return strcasecmp((char *) key, share_path) == 0;
+}
+
+static xtHashValue ha_hash_ci(xtBool is_key, void *key_data)
+{
+	/* key_data is a path string when is_key is set, otherwise it is an XTSharePtr. */
+	if (!is_key)
+		return xt_ht_casehash(((XTSharePtr) key_data)->sh_table_path->ps_path);
+
+	return xt_ht_casehash((char *) key_data);
+}
+
+static void ha_open_share(XTThreadPtr self, XTShareRec *share)	// Attach the table to the share if it is not attached yet; done under sh_ex_mutex.
+{
+	xt_lock_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+	pushr_(xt_unlock_mutex, share->sh_ex_mutex);
+
+	if (!share->sh_table) {
+		share->sh_table = xt_use_table(self, share->sh_table_path, FALSE, FALSE);
+		share->sh_dic_key_count = share->sh_table->tab_dic.dic_key_count;	// The share references the key count and keys of the table's dictionary.
+		share->sh_dic_keys = share->sh_table->tab_dic.dic_keys;
+		share->sh_recalc_selectivity = FALSE;
+	}
+
+	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+}
+
+static void ha_close_share(XTThreadPtr self, XTShareRec *share)	// Detach the table from the share, remembering its auto-increment value.
+{
+	XTTableHPtr tab;
+
+	if ((tab = share->sh_table)) {
+		/* Save this, in case the share is re-opened. */
+		share->sh_min_auto_inc = tab->tab_auto_inc;
+
+		xt_heap_release(self, tab);
+		share->sh_table = NULL;
+	}
+
+	/* These are only references: */
+	share->sh_dic_key_count = 0;
+	share->sh_dic_keys = NULL;
+}
+
+static void ha_cleanup_share(XTThreadPtr self, XTSharePtr share)	// Destroy a share: release what it holds, then free the record itself.
+{
+	ha_close_share(self, share);
+
+	if (share->sh_table_path) {
+		xt_free(self, share->sh_table_path);
+		share->sh_table_path = NULL;
+	}
+
+	if (share->sh_ex_cond) {	// NOTE(review): sh_lock is initialized right after sh_ex_cond in ha_get_share(), presumably why it is deleted under this test.
+		thr_lock_delete(&share->sh_lock);
+		xt_delete_cond(self, (xt_cond_type *) share->sh_ex_cond);
+		share->sh_ex_cond = NULL;
+	}
+
+	if (share->sh_ex_mutex) {
+		xt_delete_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+		share->sh_ex_mutex = NULL;
+	}
+
+	xt_free(self, share);	// The share pointer is invalid after this call.
+}
+
+static void ha_hash_free(XTThreadPtr self, void *data)
+{
+	/* Free callback given to xt_new_hashtable(): the entry is a share, so destroy it. */
+
+	ha_cleanup_share(self, (XTSharePtr) data);
+}
+
+/*
+ * Return the share for table_path with its use count incremented, creating and registering it if needed.
+ * A share contains information that is common to all handles (i.e. it is table specific).
+ */
+static XTSharePtr ha_get_share(XTThreadPtr self, const char *table_path, bool open_table)
+{
+	XTShareRec	*share;
+
+	enter_();
+	xt_ht_lock(self, pbxt_share_tables);
+	pushr_(xt_ht_unlock, pbxt_share_tables);
+
+	// Check if the table exists...
+	if (!(share = (XTSharePtr) xt_ht_get(self, pbxt_share_tables, (void *) table_path))) {
+		share = (XTSharePtr) xt_calloc(self, sizeof(XTShareRec));		
+		pushr_(ha_cleanup_share, share);	// Presumably destroys the half-built share if an exception unwinds before popr_().
+
+		share->sh_ex_mutex = (xt_mutex_type *) xt_new_mutex(self);
+		share->sh_ex_cond = (xt_cond_type *) xt_new_cond(self);
+
+		thr_lock_init(&share->sh_lock);
+
+		share->sh_use_count = 0;
+		share->sh_table_path = (XTPathStrPtr) xt_dup_string(self, table_path);
+
+		if (open_table)	// Otherwise sh_table stays NULL until ha_open_share() is called.
+			ha_open_share(self, share);
+
+		popr_(); // Discard ha_cleanup_share(share);
+
+		xt_ht_put(self, pbxt_share_tables, share);
+	}
+
+	share->sh_use_count++;	// Dropped again by ha_unget_share() / ha_unget_share_removed().
+	freer_(); // xt_ht_unlock(pbxt_share_tables)
+
+	return_(share);
+}
+
+/*
+ * Drop one reference to a share; when the count reaches zero it is deleted from pbxt_share_tables.
+ */
+static void ha_unget_share(XTThreadPtr self, XTSharePtr share)
+{
+	xt_ht_lock(self, pbxt_share_tables);
+	pushr_(xt_ht_unlock, pbxt_share_tables);
+
+	if (!--share->sh_use_count)
+		xt_ht_del(self, pbxt_share_tables, share->sh_table_path);
+
+	freer_(); // xt_ht_unlock(pbxt_share_tables)
+}
+
+static xtBool ha_unget_share_removed(XTThreadPtr self, XTSharePtr share)	// As ha_unget_share(), but returns TRUE if this was the last reference.
+{
+	xtBool removed = FALSE;
+
+	xt_ht_lock(self, pbxt_share_tables);
+	pushr_(xt_ht_unlock, pbxt_share_tables);
+
+	if (!--share->sh_use_count) {
+		removed = TRUE;
+		xt_ht_del(self, pbxt_share_tables, share->sh_table_path);
+	}
+
+	freer_(); // xt_ht_unlock(pbxt_share_tables)
+	return removed;
+}
+
+static inline void thd_init_xact(THD *thd, XTThreadPtr self, bool set_table_trans)	// Copy the session's transaction settings into the PBXT thread and reset its per-transaction flags.
+{
+	self->st_xact_mode = thd_tx_isolation(thd) <= ISO_READ_COMMITTED ? XT_XACT_COMMITTED_READ : XT_XACT_REPEATABLE_READ;
+	self->st_ignore_fkeys = (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) != 0;
+	self->st_auto_commit = (thd_test_options(thd,(OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) == 0;	// Auto-commit only if neither option is set.
+	if (set_table_trans) {
+#ifdef DRIZZLED
+		self->st_table_trans = FALSE;
+#else
+		self->st_table_trans = thd_sql_command(thd) == SQLCOM_LOCK_TABLES;
+#endif
+	}
+	self->st_abort_trans = FALSE;
+	self->st_stat_ended = FALSE;
+	self->st_stat_trans = FALSE;
+	XT_PRINT0(self, "xt_xn_begin\n");
+	xt_xres_wait_for_recovery(self, XT_RECOVER_SWEPT);	// Presumably blocks until recovery has reached the "swept" stage -- confirm.
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * PUBLIC FUNCTIONS
+ *
+ */
+
+xtPublic void xt_ha_unlock_table(XTThreadPtr self, void *share)	// share is an XTSharePtr passed as an opaque pointer.
+{
+	ha_release_exclusive_use(self, (XTSharePtr) share);	// Give up exclusive use first...
+	ha_unget_share(self, (XTSharePtr) share);	// ...then drop the reference to the share.
+}
+
+xtPublic void xt_ha_close_global_database(XTThreadPtr self)
+{
+	if (!pbxt_database)
+		return;		// No global database is referenced, nothing to release.
+	xt_heap_release(self, pbxt_database);
+	pbxt_database = NULL;
+}
+
+/*
+ * Open a PBXT database given the path of a table.
+ * NOTE(review): an earlier comment said the table name is also returned, but the function is void.
+ *
+ * We use the pbxt_database_mutex to lock this
+ * operation to make sure it does not occur while
+ * some other thread is doing a "closeall".
+ */
+xtPublic void xt_ha_open_database_of_table(XTThreadPtr self, XTPathStrPtr XT_UNUSED(table_path))
+{
+#ifdef XT_USE_GLOBAL_DB
+	if (!self->st_database) {	// In this configuration table_path is ignored and no mutex is taken.
+		if (!pbxt_database) {
+			xt_open_database(self, mysql_real_data_home, TRUE);
+			/* {GLOBAL-DB}
+			 * This can be done at the same time as the recovery thread,
+			 * strictly speaking I need a lock.
+			 */
+			if (!pbxt_database) {
+				pbxt_database = self->st_database;
+				xt_heap_reference(self, pbxt_database);
+			}
+		}
+		else
+			xt_use_database(self, pbxt_database, XT_FOR_USER);
+	}
+#else
+	char db_path[PATH_MAX];
+
+	xt_strcpy(PATH_MAX, db_path, (char *) table_path);	// Derive the database directory from the table path.
+	xt_remove_last_name_of_path(db_path);
+	xt_remove_dir_char(db_path);
+
+	if (self->st_database && xt_tab_compare_paths(self->st_database->db_name, xt_last_name_of_path(db_path)) == 0)
+		/* This thread already has this database open! */
+		return;
+
+	/* Auto commit before changing the database: */
+	if (self->st_xact_data) {
+		/* PMC - This probably indicates something strange is happening:
+		 *
+		 * This sequence generates this error:
+		 *
+		 * delimiter |
+		 * 
+		 * create temporary table t3 (id int)|
+		 * 
+		 * create function f10() returns int
+		 * begin
+		 *   drop temporary table if exists t3;
+		 *   create temporary table t3 (id int) engine=myisam;
+		 *   insert into t3 select id from t4;
+		 *   return (select count(*) from t3);
+		 * end|
+		 * 
+		 * select f10()|
+		 *
+		 * An error is generated because the same thread is used
+		 * to open table t4 (at the start of the functions), and
+		 * then to drop table t3. To drop t3 we need to
+		 * switch the database, so we land up here!
+		 */
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_CANNOT_CHANGE_DB);	// The auto commit below is disabled: an open transaction is reported as an error instead.
+		/*
+		 if (!xt_xn_commit(self))
+		 	throw_();
+		 */
+	}
+
+	xt_lock_mutex(self, &pbxt_database_mutex);
+	pushr_(xt_unlock_mutex, &pbxt_database_mutex);
+	xt_open_database(self, db_path, FALSE);
+	freer_(); // xt_unlock_mutex(&pbxt_database_mutex);
+#endif
+}
+
+xtPublic XTThreadPtr xt_ha_set_current_thread(THD *thd, XTExceptionPtr e)	// Return the PBXT thread attached to thd, creating and attaching one on first use (NULL if creation fails).
+{
+	XTThreadPtr	self;
+	static int	ha_thread_count = 0, ha_id;	// NOTE(review): function-static counters are updated without any visible locking -- confirm callers are serialized.
+
+	if (!(self = (XTThreadPtr) *thd_ha_data(thd, pbxt_hton))) {
+//		const			Security_context *sctx;
+		char			name[120];
+		char			ha_id_str[50];
+
+		ha_id = ++ha_thread_count;
+		sprintf(ha_id_str, "_%d", ha_id);
+		xt_strcpy(120,name,"user"); // TODO: Fix this hack
+/*
+		sctx = &thd->main_security_ctx;
+
+		if (sctx->user) {
+			xt_strcpy(120, name, sctx->user);
+			xt_strcat(120, name, "@");
+		}
+		else
+			*name = 0;
+		if (sctx->host)
+			xt_strcat(120, name, sctx->host);
+		else if (sctx->ip)
+			xt_strcat(120, name, sctx->ip);
+		else if (thd->proc_info)
+			xt_strcat(120, name, (char *) thd->proc_info);
+		else
+			xt_strcat(120, name, "system");
+*/
+		xt_strcat(120, name, ha_id_str);	// The thread name is therefore "user_<n>".
+		if (!(self = xt_create_thread(name, FALSE, TRUE, e)))
+			return NULL;
+
+		self->st_xact_mode = XT_XACT_REPEATABLE_READ;
+
+		*thd_ha_data(thd, pbxt_hton) = (void *) self;	// Remember the thread in the session's per-engine slot.
+	}
+	return self;
+}
+
+xtPublic void xt_ha_close_connection(THD* thd)
+{
+	XTThreadPtr		attached = (XTThreadPtr) *thd_ha_data(thd, pbxt_hton);
+
+	if (attached) {
+		*thd_ha_data(thd, pbxt_hton) = NULL;	// Clear the session's slot before the thread is freed.
+		xt_free_thread(attached);
+	}
+}
+
+xtPublic XTThreadPtr xt_ha_thd_to_self(THD *thd)	// Return the PBXT thread stored in the session's slot for this engine; nothing is created here.
+{
+	return (XTThreadPtr) *thd_ha_data(thd, pbxt_hton);
+}
+
+/* Return the 1-based position of the highest set bit in map, or 0 if no bit is set (the first bit is 1). */
+static u_int ha_get_max_bit(MX_BITMAP *map)
+{
+#ifdef DRIZZLED
+	return map->getFirstSet();	// NOTE(review): the name suggests the first set bit, unlike the highest-bit scan below -- confirm.
+#else
+	my_bitmap_map	*data_ptr = map->bitmap;
+	my_bitmap_map	*end_ptr = map->last_word_ptr;
+	my_bitmap_map	b;
+	u_int			cnt = map->n_bits;
+
+	for (; end_ptr >= data_ptr; end_ptr--) {	// Scan the words from the last one backwards.
+		if ((b = *end_ptr)) {
+			my_bitmap_map mask;
+			
+			if (end_ptr == map->last_word_ptr && map->last_word_mask)
+				mask = map->last_word_mask >> 1;
+			else
+				mask = 0x80000000;	// Top bit of a 32-bit word; assumes my_bitmap_map is 32 bits wide -- TODO confirm.
+			while (!(b & mask)) {
+				b = b << 1;
+				/* Should not happen, but without this check we would loop forever. */
+				if (!b)
+					return map->n_bits;
+				cnt--;
+			}
+			return cnt;
+		}
+		if (end_ptr == map->last_word_ptr)
+			cnt = ((cnt-1) / 32) * 32;	// Round down to the number of bits held in the preceding full words.
+		else
+			cnt -= 32;
+	}
+	return 0;
+#endif
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * SUPPORT FUNCTIONS
+ *
+ */
+
+/*
+ * In PBXT, as in MySQL: thread == connection.
+ *
+ * So we simply attach a PBXT thread to a MySQL thread.
+ */
+static XTThreadPtr ha_set_current_thread(THD *thd, int *err)	// As xt_ha_set_current_thread(), but logs a failure and stores its error code in *err.
+{
+	XTThreadPtr		self;
+	XTExceptionRec	e;
+
+	if (!(self = xt_ha_set_current_thread(thd, &e))) {
+		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+		*err = e.e_xt_err;	// *err is left untouched on success.
+		return NULL;
+	}
+	return self;
+}
+
+xtPublic int xt_ha_pbxt_to_mysql_error(int xt_err)	// Map a PBXT error code to the handler error code returned to MySQL; -1 if there is no mapping.
+{
+	switch (xt_err) {
+		case XT_NO_ERR:
+			return(0);
+		case XT_ERR_DUPLICATE_KEY:
+				return HA_ERR_FOUND_DUPP_KEY;
+		case XT_ERR_DEADLOCK:
+				return HA_ERR_LOCK_DEADLOCK;
+		case XT_ERR_RECORD_CHANGED:
+			/* If we generate HA_ERR_RECORD_CHANGED instead of HA_ERR_LOCK_WAIT_TIMEOUT
+			 * then sysbench does not work because it does not handle this error.
+			 */
+			//return HA_ERR_LOCK_WAIT_TIMEOUT; // but HA_ERR_RECORD_CHANGED is the correct error for a optimistic lock failure.
+			return HA_ERR_RECORD_CHANGED;
+		case XT_ERR_LOCK_TIMEOUT:
+			return HA_ERR_LOCK_WAIT_TIMEOUT;
+		case XT_ERR_TABLE_IN_USE:
+				return HA_ERR_WRONG_COMMAND;
+		case XT_ERR_TABLE_NOT_FOUND:
+			return HA_ERR_NO_SUCH_TABLE;
+		case XT_ERR_TABLE_EXISTS:
+			return HA_ERR_TABLE_EXIST;
+		case XT_ERR_CANNOT_CHANGE_DB:
+			return ER_TRG_IN_WRONG_SCHEMA;
+		case XT_ERR_COLUMN_NOT_FOUND:
+			return HA_ERR_CANNOT_ADD_FOREIGN;
+		case XT_ERR_NO_REFERENCED_ROW:
+		case XT_ERR_REF_TABLE_NOT_FOUND:
+		case XT_ERR_REF_TYPE_WRONG:
+			return HA_ERR_NO_REFERENCED_ROW;
+		case XT_ERR_ROW_IS_REFERENCED:
+			return HA_ERR_ROW_IS_REFERENCED;
+		case XT_ERR_COLUMN_IS_NOT_NULL:
+		case XT_ERR_INCORRECT_NO_OF_COLS:
+		case XT_ERR_FK_ON_TEMP_TABLE:
+		case XT_ERR_FK_REF_TEMP_TABLE:
+			return HA_ERR_CANNOT_ADD_FOREIGN;
+		case XT_ERR_DUPLICATE_FKEY:
+			return HA_ERR_FOREIGN_DUPLICATE_KEY;
+		case XT_ERR_RECORD_DELETED:
+			return HA_ERR_RECORD_DELETED;
+	}
+	return(-1);			// Unknown error (any code not listed above)
+}
+
+xtPublic int xt_ha_pbxt_thread_error_for_mysql(THD *thd, const XTThreadPtr self, int ignore_dup_key)	// Act on the thread's pending exception (rollback or abort as required) and return the matching MySQL error code.
+{
+	int		xt_err = self->t_exception.e_xt_err;
+	xtBool	dup_key = FALSE;
+
+	XT_PRINT2(self, "xt_ha_pbxt_thread_error_for_mysql xt_err=%d auto commit=%d\n", (int) xt_err, (int) self->st_auto_commit);
+	switch (xt_err) {
+		case XT_NO_ERR:
+			break;
+		case XT_ERR_DUPLICATE_KEY:
+		case XT_ERR_DUPLICATE_FKEY:
+			/* Let MySQL call rollback as and when it wants to for duplicate
+			 * key.
+			 *
+			 * In addition, we are not allowed to do an auto-rollback
+			 * inside a sub-statement (function() or procedure())
+			 * For example:
+			 * 
+			 * delimiter |
+			 *
+			 * create table t3 (c1 char(1) primary key not null)|
+			 * 
+			 * create function bug12379()
+			 *   returns integer
+			 * begin
+			 *    insert into t3 values('X');
+			 *    insert into t3 values('X');
+			 *    return 0;
+			 * end|
+			 * 
+			 * --error 1062
+			 * select bug12379()|
+			 *
+			 *
+			 * Not doing an auto-rollback should solve this problem in the
+			 * case of duplicate key (but not in others - like deadlock)!
+			 * I don't think this situation is handled correctly by MySQL.
+			 */
+
+			/* If we are in auto-commit mode (and we are not ignoring
+			 * duplicate keys) then rollback the transaction automatically.
+			 */
+			dup_key = TRUE;
+			if (!ignore_dup_key && self->st_auto_commit)
+				goto abort_transaction;
+			break;
+		case XT_ERR_DEADLOCK:
+		case XT_ERR_NO_REFERENCED_ROW:
+		case XT_ERR_ROW_IS_REFERENCED:
+			goto abort_transaction;
+		case XT_ERR_RECORD_CHANGED:
+			/* MySQL also handles the locked error. NOTE: There is no automatic
+			 * rollback!
+			 */
+			break;
+		default:
+			xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);	// Only errors not listed above are logged; the goto cases enter below this line.
+			abort_transaction:
+			/* PMC 2006-08-30: It should be that this is not necessary!
+			 *
+			 * It is only necessary to call ha_rollback() if the engine
+			 * aborts the transaction.
+			 *
+			 * On the other hand, I shouldn't need to rollback the
+			 * transaction because, if I return an error, MySQL
+			 * should do it for me.
+			 *
+			 * Unfortunately, when auto-commit is off, MySQL does not
+			 * rollback automatically (for example when a deadlock
+			 * is provoked).
+			 *
+			 * And when we have a multi update we cannot rely on this
+			 * either (see comment above).
+			 */
+			if (self->st_xact_data) {
+				/*
+				 * GOTCHA:
+				 * A result of the "st_abort_trans = TRUE" below is that
+				 * the following code results in an empty set.
+				 * The reason is "ignore_dup_key" is not set so
+				 * the duplicate key leads to an error which causes
+				 * the transaction to be aborted.
+				 * The delayed inserts are all executed in one transaction.
+				 * 
+				 * CREATE TABLE t1 (
+				 * c1 INT(11) NOT NULL AUTO_INCREMENT,
+				 * c2 INT(11) DEFAULT NULL,
+				 * PRIMARY KEY (c1)
+				 * );
+				 * SET insert_id= 14;
+				 * INSERT DELAYED INTO t1 VALUES(NULL, 11), (NULL, 12);
+				 * INSERT DELAYED INTO t1 VALUES(14, 91);
+				 * INSERT DELAYED INTO t1 VALUES (NULL, 92), (NULL, 93);
+				 * FLUSH TABLE t1;
+				 * SELECT * FROM t1;
+				 */
+				if (self->st_lock_count == 0) {
+					/* No table locks, must rollback immediately
+					 * (there will be no possibility later)!
+					 */
+					XT_PRINT1(self, "xt_xn_rollback xt_err=%d\n", xt_err);
+					if (!xt_xn_rollback(self))
+						xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);
+				}
+				else {
+					/* Locks are held on tables.
+					 * Only rollback after locks are released.
+					 */
+					/* I do not think this is required, because
+					 * I tell mysql to rollback below, 
+					 * besides it is a hack!
+					 self->st_auto_commit = TRUE;
+					 */
+					self->st_abort_trans = TRUE;
+				}
+				/* Only tell MySQL to rollback if we automatically rollback.
+				 * Note: calling this with (thd, FALSE) causes sp.test to fail.
+				 */
+				if (!dup_key) {
+					if (thd)
+						thd_mark_transaction_to_rollback(thd, TRUE);
+				}
+			}
+			break;
+	}
+	return xt_ha_pbxt_to_mysql_error(xt_err);	// Uses the code captured on entry, even if the rollback above raised another exception.
+}
+
+static void ha_conditional_close_database(XTThreadPtr self, XTThreadPtr other_thr, void *db)	// Callback for xt_do_to_all_threads(): db is an XTDatabaseHPtr.
+{
+	if (other_thr->st_database == (XTDatabaseHPtr) db)	// Only threads currently using this database are affected.
+		xt_unuse_database(self, other_thr);
+}
+
+/*
+ * This is only called from drop database, so we know that
+ * no thread is actually using the database. This means that it
+ * must be safe to close the database.
+ */
+xtPublic void xt_ha_all_threads_close_database(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xt_lock_mutex(self, &pbxt_database_mutex);	// The same mutex guards xt_open_database() in xt_ha_open_database_of_table().
+	pushr_(xt_unlock_mutex, &pbxt_database_mutex);
+	xt_do_to_all_threads(self, ha_conditional_close_database, db);	// Detaches db from every thread that still has it as st_database.
+	freer_(); // xt_unlock_mutex(&pbxt_database_mutex);
+}
+
+static int ha_log_pbxt_thread_error_for_mysql(int ignore_dup_key)	// Shorthand for xt_ha_pbxt_thread_error_for_mysql() using current_thd and myxt_get_self().
+{
+	return xt_ha_pbxt_thread_error_for_mysql(current_thd, myxt_get_self(), ignore_dup_key);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * STATIC HOOKS
+ *
+ */
+static xtWord8 ha_set_variable(char **value, HAVarParamsPtr vp)	// Resolve a size variable: *value, else the environment, else the default; the result is clamped to [min, max].
+{
+	xtWord8	result;
+	xtWord8	mi, ma;
+	char	*mm;
+
+	if (!*value)
+		*value = getenv(vp->vp_var);	// Fall back to an environment variable with the same name as the system variable.
+	if (!*value)
+		*value = (char *) vp->vp_def;
+	result = xt_byte_size_to_int8(*value);
+	mi = (xtWord8) xt_byte_size_to_int8(vp->vp_min);
+	if (result < mi) {
+		result = mi;
+		*value = (char *) vp->vp_min;	// When clamped, *value is pointed at the limit string as well.
+	}
+	if (sizeof(size_t) == 8)	// 64-bit build: use the larger maximum.
+		mm = (char *) vp->vp_max8;
+	else
+		mm = (char *) vp->vp_max4;
+	ma = (xtWord8) xt_byte_size_to_int8(mm);
+	if (result > ma) {
+		result = ma;
+		*value = mm;
+	}
+	return result;
+}
+
+static void pbxt_call_init(XTThreadPtr self)	// Resolve the size variables, create the share hash table and initialize the engine sub-systems.
+{
+	xtInt8	index_cache_size;
+	xtInt8	record_cache_size;
+	xtInt8	log_cache_size;
+	xtInt8	log_file_threshold;
+	xtInt8	transaction_buffer_size;
+	xtInt8	log_buffer_size;
+	xtInt8	checkpoint_frequency;
+	xtInt8	data_log_threshold;
+	xtInt8	data_file_grow_size;
+	xtInt8	row_file_grow_size;
+
+	xt_logf(XT_NT_INFO, "PrimeBase XT (PBXT) Engine %s loaded...\n", xt_get_version());
+	xt_logf(XT_NT_INFO, "Paul McCullagh, PrimeBase Technologies GmbH, http://www.primebase.org\n");
+
+	index_cache_size = ha_set_variable(&pbxt_index_cache_size, &vp_index_cache_size);	// Each call may also replace the variable's string with the default or a limit.
+	record_cache_size = ha_set_variable(&pbxt_record_cache_size, &vp_record_cache_size);
+	log_cache_size = ha_set_variable(&pbxt_log_cache_size, &vp_log_cache_size);
+	log_file_threshold = ha_set_variable(&pbxt_log_file_threshold, &vp_log_file_threshold);
+	transaction_buffer_size = ha_set_variable(&pbxt_transaction_buffer_size, &vp_transaction_buffer_size);
+	log_buffer_size = ha_set_variable(&pbxt_log_buffer_size, &vp_log_buffer_size);
+	checkpoint_frequency = ha_set_variable(&pbxt_checkpoint_frequency, &vp_checkpoint_frequency);
+	data_log_threshold = ha_set_variable(&pbxt_data_log_threshold, &vp_data_log_threshold);
+	data_file_grow_size = ha_set_variable(&pbxt_data_file_grow_size, &vp_data_file_grow_size);
+	row_file_grow_size = ha_set_variable(&pbxt_row_file_grow_size, &vp_row_file_grow_size);
+
+	xt_db_log_file_threshold = (xtLogOffset) log_file_threshold;
+	xt_db_log_buffer_size = (size_t) xt_align_offset(log_buffer_size, 512);	// Presumably rounds to a multiple of 512 bytes -- confirm.
+	xt_db_transaction_buffer_size = (size_t) xt_align_offset(transaction_buffer_size, 512);
+	xt_db_checkpoint_frequency = (size_t) checkpoint_frequency;
+	xt_db_data_log_threshold = (off_t) data_log_threshold;
+	xt_db_data_file_grow_size = (size_t) data_file_grow_size;
+	xt_db_row_file_grow_size = (size_t) row_file_grow_size;
+
+#ifdef DRIZZLED
+	pbxt_ignore_case = TRUE;
+#else
+	pbxt_ignore_case = lower_case_table_names != 0;
+#endif
+	if (pbxt_ignore_case)	// Share lookups by table path use the case-insensitive compare/hash pair in this case.
+		pbxt_share_tables = xt_new_hashtable(self, ha_hash_comp_ci, ha_hash_ci, ha_hash_free, TRUE, FALSE);
+	else
+		pbxt_share_tables = xt_new_hashtable(self, ha_hash_comp, ha_hash, ha_hash_free, TRUE, FALSE);
+
+	xt_thread_wait_init(self);
+	xt_fs_init(self);
+	xt_lock_installation(self, mysql_real_data_home);
+	XTSystemTableShare::startUp(self);
+	xt_init_databases(self);
+	xt_ind_init(self, (size_t) index_cache_size);
+	xt_tc_init(self, (size_t) record_cache_size);
+	xt_xlog_init(self, (size_t) log_cache_size);
+}
+
+static void pbxt_call_exit(XTThreadPtr self)	// Shut down what pbxt_call_init() set up and free the share hash table.
+{
+	xt_logf(XT_NT_INFO, "PrimeBase XT Engine shutdown...\n");
+
+#ifdef TRACE_STATEMENTS
+	xt_dump_trace();
+#endif
+#ifdef XT_USE_GLOBAL_DB
+	xt_ha_close_global_database(self);
+#endif
+#ifdef DEBUG
+	//xt_stop_database_threads(self, FALSE);
+	xt_stop_database_threads(self, TRUE);	// NOTE: both branches currently make the same call.
+#else
+	xt_stop_database_threads(self, TRUE);
+#endif
+	/* This will tell the freeer to quit ASAP: */
+	xt_quit_freeer(self);
+	/* We conditionally stop the freeer here, because if we are
+	 * in startup, then the freeer will be hanging.
+	 * {FREEER-HANG}
+	 *
+	 * This problem has been solved by MySQL!
+	 */
+	xt_stop_freeer(self);
+	xt_exit_databases(self);
+	XTSystemTableShare::shutDown(self);
+	xt_xlog_exit(self);
+	xt_tc_exit(self);
+	xt_ind_exit(self);
+	xt_unlock_installation(self, mysql_real_data_home);
+	xt_fs_exit(self);
+	xt_thread_wait_exit(self);
+	if (pbxt_share_tables) {
+		xt_free_hashtable(self, pbxt_share_tables);
+		pbxt_share_tables = NULL;
+	}
+}
+
+/*
+ * Shut down the PBXT sub-system and clear pbxt_inited.
+ */
+static void ha_exit(XTThreadPtr self)
+{
+	xt_xres_terminate_recovery(self);
+
+	/* Wrap things up... */
+	xt_unuse_database(self, self);	/* Just in case the main thread has a database in use (for testing)? */
+	/* This may cause the streaming engine to cleanup connections and 
+	 * tables belonging to this engine. This in turn may require some of
+	 * the stuff below (like xt_create_thread() called from pbxt_close_table()! */
+#ifdef PBMS_ENABLED
+	pbms_finalize();
+#endif
+	pbxt_call_exit(self);
+	xt_exit_threading(self);	// NOTE(review): self is not used after this call -- presumably it is no longer valid.
+	xt_exit_memory();
+	xt_exit_logging();
+	xt_p_mutex_destroy(&pbxt_database_mutex);		
+	pbxt_inited = false;
+}
+
+/*
+ * Output the PBXT status.
+ *
+ * The status text is collected into a string buffer by myxt_get_status()
+ * and handed to stat_print() under the name "PBXT".
+ *
+ * Returns TRUE (not_ok) if myxt_get_status() throws or stat_print()
+ * returns non-zero, otherwise FALSE.
+ *
+ * NOTE(review): when ha_set_current_thread() fails, the function returns
+ * FALSE as well, i.e. the same value as on success, and 'err' is not
+ * reported. Confirm whether this is intended.
+ */
+#ifdef DRIZZLED
+bool PBXTStorageEngine::show_status(Session *thd, stat_print_fn *stat_print, enum ha_stat_type)
+#else
+static bool pbxt_show_status(handlerton *XT_UNUSED(hton), THD* thd, 
+                          stat_print_fn* stat_print,
+                          enum ha_stat_type XT_UNUSED(stat_type))
+#endif
+{
+	XTThreadPtr			self;	
+	int					err = 0;
+	XTStringBufferRec	strbuf = { 0, 0, 0 };
+	bool				not_ok = FALSE;
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return FALSE;
+
+#ifdef XT_SHOW_DUMPS_TRACE
+	//if (pbxt_database)
+	//	xt_dump_xlogs(pbxt_database, 0);
+	xt_trace("// %s - dump\n", xt_trace_clock_diff(NULL));
+	xt_dump_trace();
+#endif
+#ifdef XT_TRACK_CONNECTIONS
+	xt_dump_conn_tracking();
+#endif
+
+	/* Collect the status text; an exception only sets not_ok. */
+	try_(a) {
+		myxt_get_status(self, &strbuf);
+	}
+	catch_(a) {
+		not_ok = TRUE;
+	}
+	cont_(a);
+
+	if (!not_ok) {
+		if (stat_print(thd, "PBXT", 4, "", 0, strbuf.sb_cstring, (uint) strbuf.sb_len))
+			not_ok = TRUE;
+	}
+	/* Resize the buffer to 0 on every path (presumably this releases it - TODO confirm). */
+	xt_sb_set_size(self, &strbuf, 0);
+
+	return not_ok;
+}
+
+/*
+ * Initialize the PBXT sub-system.
+ *
+ * In the non-Drizzle build 'p' is the handlerton to fill in with the
+ * PBXT callbacks. If pbxt_inited is already set, nothing is done.
+ *
+ * return 1 on error, else 0.
+ *
+ * NOTE(review): when initialization fails after pbxt_inited has been
+ * set (init_err != 0), only 'self' is released here and the remaining
+ * cleanup is deferred (see {FREEER-HANG-ON-INIT-ERROR} below).
+ */
+#ifdef DRIZZLED
+static int pbxt_init(Registry &registry)
+#else
+static int pbxt_init(void *p)
+#endif
+{
+	int init_err = 0;
+
+	XT_PRINT0(NULL, "pbxt_init\n");
+
+	if (sizeof(xtWordPS) != sizeof(void *)) {
+		printf("PBXT: This won't work, I require that sizeof(xtWordPS) == sizeof(void *)!\n");
+		XT_RETURN(1);
+	}
+
+	/* GOTCHA: This will "detect" if we are loading the plug-in
+	 * with a different --with-debug option to MySQL.
+	 *
+	 * In this case, you will get an error when loading the
+	 * library that some symbol was not found.
+	 */
+	void *dummy = my_malloc(100, MYF(0));
+	my_free((byte *) dummy, MYF(0));
+
+ 	if (!pbxt_inited) {
+		XTThreadPtr self = NULL;
+
+ 		xt_p_mutex_init_with_autoname(&pbxt_database_mutex, NULL);
+
+#ifdef DRIZZLED
+		pbxt_hton= new PBXTStorageEngine(std::string("PBXT"));
+		registry.add(pbxt_hton);
+#else
+		pbxt_hton = (handlerton *) p;
+		pbxt_hton->state = SHOW_OPTION_YES;
+		pbxt_hton->db_type = DB_TYPE_PBXT; // Wow! I have my own!
+		pbxt_hton->close_connection = pbxt_close_connection; /* close_connection, cleanup thread related data. */
+		pbxt_hton->commit = pbxt_commit; /* commit */
+		pbxt_hton->rollback = pbxt_rollback; /* rollback */
+		/* The 2-phase commit hooks are only installed when XA support is enabled: */
+		if (pbxt_support_xa) {
+			pbxt_hton->prepare = pbxt_prepare;
+			pbxt_hton->recover = pbxt_recover;
+			pbxt_hton->commit_by_xid = pbxt_commit_by_xid;
+			pbxt_hton->rollback_by_xid = pbxt_rollback_by_xid;
+		}
+		else {
+			pbxt_hton->prepare = NULL;
+			pbxt_hton->recover = NULL;
+			pbxt_hton->commit_by_xid = NULL;
+			pbxt_hton->rollback_by_xid = NULL;
+		}
+		pbxt_hton->create = pbxt_create_handler; /* Create a new handler */
+		pbxt_hton->drop_database = pbxt_drop_database; /* Drop a database */
+		pbxt_hton->panic = pbxt_panic; /* Panic call */
+		pbxt_hton->show_status = pbxt_show_status;
+		pbxt_hton->flags = HTON_NO_FLAGS; /* HTON_CAN_RECREATE - Without this flag TRUNCATE uses delete_all_rows() */
+		pbxt_hton->slot = (uint)-1; /* assign invalid value, so we know when it's inited later */
+		pbxt_hton->start_consistent_snapshot = pbxt_start_consistent_snapshot;
+#if defined(MYSQL_SUPPORTS_BACKUP) && defined(XT_ENABLE_ONLINE_BACKUP)
+		pbxt_hton->get_backup_engine = pbxt_backup_engine;
+#endif
+#endif
+		if (!xt_init_logging())					/* Initialize logging */
+			goto error_1;
+
+#ifdef PBMS_ENABLED
+		PBMSResultRec result;
+		if (!pbms_initialize("PBXT", false, &result)) {
+			xt_logf(XT_NT_ERROR, "pbms_initialize() Error: %s", result.mr_message);
+			goto error_2;
+		}
+#endif
+
+		if (!xt_init_memory())					/* Initialize memory */
+			goto error_3;
+
+		/* +7 assumes:
+		 * We are not using multiple databases, and:
+		 * +1 Main thread.
+		 * +1 Compactor thread
+		 * +1 Writer thread
+		 * +1 Checkpointer thread
+		 * +1 Sweeper thread
+		 * +1 Free'er thread
+		 * +1 Temporary thread (e.g. TempForClose, TempForEnd)
+		 */
+#ifndef DRIZZLED
+		if (pbxt_max_threads == 0)
+			pbxt_max_threads = max_connections + 7;
+#endif
+		self = xt_init_threading(pbxt_max_threads);				/* Create the main self: */
+		if (!self)
+			goto error_3;
+
+ 		pbxt_inited = true;
+
+		try_(a) {
+			/* Initialize all systems */
+			pbxt_call_init(self);
+
+			/* Conditional unit test: */
+#ifdef XT_UNIT_TEST
+			//xt_unit_test_create_threads(self);
+			xt_unit_test_read_write_locks(self);
+			//xt_unit_test_mutex_locks(self);
+#endif
+
+			/* {OPEN-DB-SWEEPER-WAIT}
+			 * I have to start the freeer before I open and recover the database
+			 * because if we run out of cache while waiting for the sweeper
+			 * we will hang!
+			 */
+			xt_start_freeer(self);
+
+#ifdef XT_USE_GLOBAL_DB
+			/* Open the global database. */
+			ASSERT(!pbxt_database);
+			{
+				THD *curr_thd = current_thd;
+				THD *thd = NULL;
+
+#ifndef DRIZZLED
+#if MYSQL_VERSION_ID < 50147
+				/* A hack which is no longer required after 5.1.46 */
+				extern myxt_mutex_t LOCK_plugin;
+#endif
+
+				/* {MYSQL QUIRK}
+				 * I have to release this lock for PBXT recovery to
+				 * work, because it needs to open .frm files.
+				 * So, I unlock, but during INSTALL PLUGIN this is
+				 * risky, because we are in multi-threaded
+				 * mode!
+				 *
+				 * Although, as far as I can tell from the MySQL code,
+				 * INSTALL PLUGIN should still work ok, during
+				 * concurrent access, because we are not
+				 * relying on pointer/memory that may be changed by
+				 * other users.
+				 *
+				 * Only real problem, 2 threads try to load the same
+				 * plugin at the same time.
+				 */
+#if MYSQL_VERSION_ID < 50147
+				myxt_mutex_unlock(&LOCK_plugin);
+#endif
+#endif
+
+				/* Can't do this here yet, because I need a THD! */
+				try_(b) {
+					/* {MYSQL QUIRK}
+					 * Sometimes we have a THD,
+					 * sometimes we don't.
+					 * So far, I have noticed that during INSTALL PLUGIN,
+					 * we have one, otherwise not.
+					 */
+					if (!curr_thd) {
+						if (!(thd = (THD *) myxt_create_thread()))
+							xt_throw(self);
+					}
+
+					xt_xres_start_database_recovery(self);
+				}
+				catch_(b) {
+					/* It is possible that the error was reset by cleanup code.
+					 * Set a generic error code in that case.
+					 */
+					/* PMC - This is not necessary because exceptions are
+					 * now preserved, in exception handler cleanup.
+					*/
+					if (!self->t_exception.e_xt_err)
+						xt_register_error(XT_REG_CONTEXT, XT_SYSTEM_ERROR, 0, "Initialization failed"); 
+					xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);
+					init_err = 1;
+				}
+				cont_(b);
+
+				/* Only destroy the THD if it was created above: */
+				if (thd)
+					myxt_destroy_thread(thd, FALSE);
+#ifndef DRIZZLED
+#if MYSQL_VERSION_ID < 50147
+				myxt_mutex_lock(&LOCK_plugin);
+#endif
+#endif
+			}
+#endif
+		}
+		catch_(a) {
+			xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);
+			init_err = 1;
+		}
+		cont_(a);
+
+		if (init_err) {
+			/* {FREEER-HANG} The free-er will be hung in:
+				#0	0x91fc6a2e in semaphore_wait_signal_trap
+				#1	0x91fce505 in pthread_mutex_lock
+				#2	0x00489633 in safe_mutex_lock at thr_mutex.c:149
+				#3	0x002dfca9 in plugin_thdvar_init at sql_plugin.cc:2398
+				#4	0x000d6a12 in THD::init at sql_class.cc:715
+				#5	0x000de9d3 in THD::THD at sql_class.cc:597
+				#6	0x000debe1 in THD::THD at sql_class.cc:631
+				#7	0x00e207a4 in myxt_create_thread at myxt_xt.cc:2666
+				#8	0x00e3134b in tabc_fr_run_thread at tabcache_xt.cc:982
+				#9	0x00e422ca in xt_thread_main at thread_xt.cc:1006
+				#10	0x91ff7c55 in _pthread_start
+				#11	0x91ff7b12 in thread_start
+			 *
+			 * so it is not good trying to stop it here!
+			 *
+			 * With regard to this problem, see {OPEN-DB-SWEEPER-WAIT}
+			 * Due to this problem, I will probably have to hack
+			 * the mutex so that the freeer can get started...
+			 *
+			 * NOPE! problem has gone in 6.0.9. Also not a problem in
+			 * 5.1.29.
+			 */
+			
+			/* {OPEN-DB-SWEEPER-WAIT} 
+			 * I have to stop the freeer here because it was
+			 * started before opening the database.
+			 */
+
+			/* {FREEER-HANG-ON-INIT-ERROR}
+			 * pbxt_init is called with LOCK_plugin and if it fails and tries to exit
+			 * the freeer here it hangs because the freeer calls THD::~THD which tries
+			 * to acquire the same lock and hangs. OTOH MySQL calls pbxt_end() after
+			 * an unsuccessful call to pbxt_init, so we defer cleanup, except
+			 * releasing 'self'
+			 */
+			xt_free_thread(self);
+			goto error_3;
+		}
+		xt_free_thread(self);
+ 	}
+	XT_RETURN(init_err);
+
+	/* NOTE(review): error_3 is reached when xt_init_memory() or
+	 * xt_init_threading() fails, and also on init_err. Nothing visible
+	 * on these paths undoes xt_init_logging(), xt_init_memory() or the
+	 * initialization of pbxt_database_mutex - confirm whether that is
+	 * handled elsewhere.
+	 */
+	error_3:
+#ifdef PBMS_ENABLED
+	pbms_finalize();
+
+	error_2:
+#endif
+
+	error_1:
+	XT_RETURN(1);
+}
+
+/*
+ * Plug-in de-initialization.
+ *
+ * If the engine is marked as initialized, the flag is cleared first and
+ * ha_exit() is run on a temporary PBXT thread ("TempForEnd") that is
+ * marked as the main thread.
+ *
+ * Returns 'err', which is never modified here, i.e. always 0.
+ * NOTE(review): if xt_create_thread() fails, the exception in 'e' is not
+ * inspected and ha_exit() is skipped without any report.
+ */
+#ifdef DRIZZLED
+static int pbxt_end(Registry &registry)
+#else
+static int pbxt_end(void *)
+#endif
+{
+	XTThreadPtr		self;
+	int				err = 0;
+
+	XT_TRACE_CALL();
+
+	if (pbxt_inited) {
+		XTExceptionRec	e;
+
+		/* This flag also means "shutting down". */
+		pbxt_inited = FALSE; 
+		self = xt_create_thread("TempForEnd", FALSE, TRUE, &e);
+		if (self) {
+			self->t_main = TRUE;
+			ha_exit(self);
+		}
+	}
+
+#ifdef DRIZZLED
+	registry.remove(pbxt_hton);
+#endif
+	XT_RETURN(err);
+}
+
+#ifndef DRIZZLED
+/*
+ * Handlerton panic hook: whatever 'flag' is, this simply shuts the
+ * engine down via pbxt_end() and passes on its result.
+ */
+static int pbxt_panic(handlerton *hton, enum ha_panic_function flag)
+{
+	int	end_result = pbxt_end(hton);
+
+	return end_result;
+}
+#endif
+
+/*
+ * Free the PBXT thread attached to the given MySQL thread (if any) and
+ * clear the reference stored in the MySQL thread. Always returns 0.
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::close_connection(Session *thd)
+{
+	PBXTStorageEngine * const hton = this;
+#else
+static int pbxt_close_connection(handlerton *hton, THD* thd)
+{
+#endif
+	XTThreadPtr		xt_thread;
+
+	XT_TRACE_CALL();
+
+	xt_thread = (XTThreadPtr) *thd_ha_data(thd, hton);
+	if (!xt_thread)
+		return 0;
+
+	/* Detach from the MySQL thread before freeing. */
+	*thd_ha_data(thd, hton) = NULL;
+	/* The PBXT thread must be made current first, because freeing it
+	 * could free the database, which in turn could call
+	 * xt_close_file_ns()!
+	 */
+	xt_set_self(xt_thread);
+	xt_free_thread(xt_thread);
+	return 0;
+}
+
+/*
+ * Drop-database hook.
+ *
+ * Currently does nothing (apart from the call trace) because, per the
+ * original author, all the work was already done when the last PBXT
+ * table was removed from the database.
+ */
+#ifdef DRIZZLED
+void PBXTStorageEngine::drop_database(char *)
+#else
+static void pbxt_drop_database(handlerton *XT_UNUSED(hton), char *XT_UNUSED(path))
+#endif
+{
+	XT_TRACE_CALL();
+}
+
+/*
+ * NOTES ON TRANSACTIONS:
+ *
+ * 1. If self->st_lock_count == 0 then the transaction can be ended immediately.
+ *    If not, we must wait until the last lock is released on the last handler
+ *    to ensure that the tables are flushed before the transaction is
+ *    committed or aborted.
+ *
+ * 2. 'all' (below) indicates, within a BEGIN/END (i.e. auto_commit off), whether
+ *    the statement or the entire transaction is being terminated.
+ *    We currently ignore statement termination.
+ * 
+ * 3. If in BEGIN/END we must call ha_rollback() if we abort the transaction
+ *    internally.
+ *
+ * NOTE ON CONSISTENT SNAPSHOTS:
+ * 
+ * PBXT itself doesn't need this function, as its transaction mechanism provides
+ * consistent snapshots for all transactions by default. This function is needed
+ * only for multi-engine cases like this:
+ *
+ * CREATE TABLE t1 ... ENGINE=INNODB
+ * CREATE TABLE t2 ... ENGINE=PBXT
+ * START TRANSACTION WITH CONSISTENT SNAPSHOT
+ * SELECT * FROM t1 <-- at this point we need to know about the snapshot
+ */
+
+/*
+ * Handlerton hook that starts a PBXT transaction for the MySQL thread
+ * and registers it with the server.
+ *
+ * Returns 0 on success, otherwise a MySQL error code.
+ */
+static int pbxt_start_consistent_snapshot(handlerton *hton, THD *thd)
+{
+	int err          = 0;
+	XTThreadPtr self = ha_set_current_thread(thd, &err);
+
+	/* ha_set_current_thread() returns NULL on failure (all other callers
+	 * in this file check for it); 'self' must not be dereferenced then.
+	 */
+	if (!self)
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	/* Make sure the thread has an open database, if the global
+	 * database is available:
+	 */
+	if (!self->st_database && pbxt_database) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) NULL);
+	}
+
+	thd_init_xact(thd, self, true);
+
+	if (xt_xn_begin(self)) {
+		trans_register_ha(thd, TRUE, hton);	
+	} else {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+
+	/*
+	 * As of MySQL 5.1.41 the return value is not checked, so the server might assume 
+	 * everything is fine even if it isn't. InnoDB returns 0 on success.
+	 */
+	return err;
+}
+
+/*
+ * Commit the PBXT transaction of the given MySQL thread.
+ *
+ * thd is the MySQL thread structure; the PBXT thread is looked up in
+ * its per-engine data. 'all' is true when the whole transaction (and
+ * not just a statement) is ending.
+ *
+ * Returns 0 on success (or if there is nothing to do), else a MySQL
+ * error code.
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::commit(Session *thd, bool all)
+{
+	PBXTStorageEngine * const hton = this;
+#else
+static int pbxt_commit(handlerton *hton, THD *thd, bool all)
+{
+#endif
+	int			err = 0;
+	XTThreadPtr	self;
+
+	self = (XTThreadPtr) *thd_ha_data(thd, hton);
+	if (!self)
+		return err;
+
+	XT_PRINT2(self, "%s pbxt_commit all=%d\n", all ? "END CONN XACT" : "END STAT", all);
+
+	/* There are no table locks, so commit immediately in all cases
+	 * except a statement commit inside an explicit transaction
+	 * (!all && !self->st_auto_commit).
+	 */
+	if (self->st_xact_data && (all || self->st_auto_commit)) {
+		XT_PRINT0(self, "xt_xn_commit in pbxt_commit\n");
+
+		if (!xt_xn_commit(self))
+			err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+
+	/* A statement has ended: */
+	if (!all)
+		self->st_stat_trans = FALSE;
+
+	return err;
+}
+
+/*
+ * Roll back the PBXT transaction of the given MySQL thread.
+ *
+ * 'all' is true when the whole transaction (and not just a statement)
+ * is ending.
+ *
+ * Returns 0 on success (or if there is nothing to do), else the MySQL
+ * error code of a failed rollback.
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::rollback(Session *thd, bool all)
+{
+	PBXTStorageEngine * const hton = this;
+#else
+static int pbxt_rollback(handlerton *hton, THD *thd, bool all)
+{
+#endif
+	int			err = 0;
+	XTThreadPtr	self;
+
+	if ((self = (XTThreadPtr) *thd_ha_data(thd, hton))) {
+		XT_PRINT2(self, "%s pbxt_rollback all=%d\n", all ? "CONN END XACT" : "STAT END", all);
+
+		if (self->st_xact_data) {
+			/* There are no table locks, rollback immediately in all cases
+			 * except when this is a statement rollback within an explicit
+			 * transaction (!all && !self->st_auto_commit).
+			 *
+			 * Note, the only reason for a rollback of an operation is
+			 * due to an error. In this case PBXT has already
+			 * undone the effects of the operation.
+			 *
+			 * However, this is not the same as statement rollback
+			 * which can involve a number of operations.
+			 *
+			 * TODO: Implement statement rollback.
+			 */
+			if (all || self->st_auto_commit) {
+				XT_PRINT0(self, "xt_xn_rollback\n");
+				if (!xt_xn_rollback(self))
+					err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+			}
+		}
+		if (!all)
+			self->st_stat_trans = FALSE;
+	}
+	/* Report a failed rollback to the server (as pbxt_commit does),
+	 * instead of discarding the error.
+	 */
+	return err;
+}
+
+/*
+ * Create a new handler object in mem_root: a system-table handler
+ * (ha_xtsys) if the share's path names a PBXT system table, otherwise
+ * a regular PBXT handler (ha_pbxt).
+ */
+#ifdef DRIZZLED
+Cursor *PBXTStorageEngine::create(TABLE_SHARE *table, MEM_ROOT *mem_root)
+{
+	PBXTStorageEngine * const hton = this;
+#else
+static handler *pbxt_create_handler(handlerton *hton, TABLE_SHARE *table, MEM_ROOT *mem_root)
+{
+#endif
+	/* No share, or not a system table: the regular handler. */
+	if (!table || !XTSystemTableShare::isSystemTable(table->path.str))
+		return new (mem_root) ha_pbxt(hton, table);
+
+	return new (mem_root) ha_xtsys(hton, table);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * 2-PHASE COMMIT
+ *
+ */
+
+#ifndef DRIZZLED
+
+/*
+ * First phase of a 2-phase commit: prepare the PBXT transaction of the
+ * given MySQL thread, using the XID obtained from the server.
+ *
+ * Returns 0 on success (or if there is nothing to do), else a MySQL
+ * error code.
+ */
+static int pbxt_prepare(handlerton *hton, THD *thd, bool all)
+{
+	int			err = 0;
+	XTThreadPtr	self;
+
+	XT_TRACE_CALL();
+	if ((self = (XTThreadPtr) *thd_ha_data(thd, hton))) {
+		XT_PRINT1(self, "pbxt_prepare all=%d\n", all);
+
+		if (self->st_xact_data) {
+			/* There are no table locks, prepare immediately in all cases
+			 * except when this is a statement end within an explicit
+			 * transaction (!all && !self->st_auto_commit).
+			 */
+			if (all || self->st_auto_commit) {
+				XID xid;
+
+				XT_PRINT0(self, "xt_xn_prepare in pbxt_prepare\n");
+				thd_get_xid(thd, (MYSQL_XID*) &xid);
+
+				if (!xt_xn_prepare(xid.length(), (xtWord1 *) &xid, self))
+					err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+			}
+		}
+	}
+	return err;
+}
+
+/*
+ * Get a PBXT thread with the global database open, for use by the XA
+ * recovery hooks.
+ *
+ * If the current MySQL thread has a PBXT thread attached, it is used.
+ * Otherwise a temporary PBXT thread called 'thread_name' is created and
+ * bit 1 is set in *temp_thread. Bit 2 (a temporary THD) is never set by
+ * the visible code, because the code that would create the THD is
+ * commented out.
+ *
+ * On success the thread is returned and *ret_thd is set to current_thd
+ * (which may be NULL). On failure NULL is returned, *err is set and any
+ * temporary thread is freed; *ret_thd is not set in that case.
+ *
+ * The caller releases the result with ha_temp_close_database().
+ */
+static XTThreadPtr ha_temp_open_global_database(handlerton *hton, THD **ret_thd, int *temp_thread, const char *thread_name, int *err)
+{
+	THD			*thd;
+	XTThreadPtr	self = NULL;
+
+	*temp_thread = 0;
+	if ((thd = current_thd))
+		self = (XTThreadPtr) *thd_ha_data(thd, hton);
+	else {
+		//thd = (THD *) myxt_create_thread();
+		//*temp_thread |= 2;
+	}
+
+	if (!self) {
+		XTExceptionRec e;
+
+		if (!(self = xt_create_thread(thread_name, FALSE, TRUE, &e))) {
+			*err = xt_ha_pbxt_to_mysql_error(e.e_xt_err);
+			xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+			return NULL;
+		}
+		*temp_thread |= 1;
+	}
+
+	/* Do not touch the database before recovery has completed: */
+	xt_xres_wait_for_recovery(self, XT_RECOVER_DONE);
+
+	try_(a) {
+		xt_open_database(self, mysql_real_data_home, TRUE);
+	}
+	catch_(a) {
+		/* NOTE(review): thd may be NULL here (no current_thd) - confirm
+		 * that xt_ha_pbxt_thread_error_for_mysql() accepts that.
+		 */
+		*err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+		if ((*temp_thread & 1))
+			xt_free_thread(self);
+		if (*temp_thread & 2)
+			myxt_destroy_thread(thd, FALSE);
+		self = NULL;
+	}
+	cont_(a);
+
+	*ret_thd = thd;
+	return self;
+}
+
+/*
+ * Counterpart of ha_temp_open_global_database(): release the database
+ * used by 'self', then free whatever was created temporarily, as
+ * recorded in the temp_thread bits (1 = PBXT thread, 2 = MySQL THD).
+ */
+static void ha_temp_close_database(XTThreadPtr self, THD *thd, int temp_thread)
+{
+	xt_unuse_database(self, self);
+	if (temp_thread & 1)
+		xt_free_thread(self);
+	if (temp_thread & 2)
+		myxt_destroy_thread(thd, TRUE);
+}
+
+/* Return all prepared transactions, found during recovery.
+ * This function returns a count. If len is returned, the
+ * function will be called again.
+ */
+/*
+ * Copy up to 'len' prepared-transaction XIDs, as enumerated by
+ * xt_xn_enum_xa_data(), into xid_list and return how many were copied.
+ * Returns 0 if the global database could not be opened.
+ */
+static int pbxt_recover(handlerton *hton, XID *xid_list, uint len)
+{
+	/* Bit mask filled in by ha_temp_open_global_database() (declared
+	 * as int there, and in pbxt_rollback_by_xid()):
+	 */
+	int					temp_thread;
+	XTThreadPtr			self;
+	XTDatabaseHPtr		db;
+	uint				count;
+	XTXactPreparePtr	xap;
+	int					err = 0;
+	THD					*thd;
+
+	if (!(self = ha_temp_open_global_database(hton, &thd, &temp_thread, "TempForRecover", &err)))
+		return 0;
+
+	db = self->st_database;
+
+	for (count=0; count<len; count++) {
+		xap = xt_xn_enum_xa_data(db, &pbxt_xa_enum);
+		if (!xap)
+			break;
+		/* NOTE(review): assumes xp_data_len never exceeds sizeof(XID) - confirm. */
+		memcpy(&xid_list[count], xap->xp_xa_data, xap->xp_data_len);
+	}
+
+	ha_temp_close_database(self, thd, temp_thread);
+	return (int) count;
+}
+
+/*
+ * Commit the prepared transaction identified by 'xid'.
+ *
+ * Returns 0 on success or if no such prepared transaction is found,
+ * else a MySQL error code (failure to open the global database, or a
+ * failed commit).
+ */
+static int pbxt_commit_by_xid(handlerton *hton, XID *xid)
+{
+	/* Bit mask filled in by ha_temp_open_global_database() (declared
+	 * as int there, and in pbxt_rollback_by_xid()):
+	 */
+	int					temp_thread;
+	XTThreadPtr			self;
+	XTDatabaseHPtr		db;
+	int					err = 0;
+	XTXactPreparePtr	xap;
+	THD					*thd;
+
+	XT_TRACE_CALL();
+
+	if (!(self = ha_temp_open_global_database(hton, &thd, &temp_thread, "TempForCommitXA", &err)))
+		return err;
+	db = self->st_database;
+
+	if ((xap = xt_xn_find_xa_data(db, xid->length(), (xtWord1 *) xid, TRUE, self))) {
+		if ((self->st_xact_data = xt_xn_get_xact(db, xap->xp_xact_id, self))) {
+			self->st_xact_data->xd_flags &= ~XT_XN_XAC_PREPARED;  // Prepared transactions cannot be swept!
+			if (!xt_xn_commit(self))
+				err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+		}
+		xt_xn_delete_xa_data(db, xap, TRUE, self);
+	}
+
+	ha_temp_close_database(self, thd, temp_thread);
+	/* Report a failed commit instead of discarding the error: */
+	return err;
+}
+
+/*
+ * Roll back the prepared transaction identified by 'xid'.
+ *
+ * Returns 0 on success or if no such prepared transaction is found,
+ * else a MySQL error code (failure to open the global database, or a
+ * failed rollback).
+ */
+static int pbxt_rollback_by_xid(handlerton *hton, XID *xid)
+{
+	int					temp_thread;
+	XTThreadPtr			self;
+	XTDatabaseHPtr		db;
+	int					err = 0;
+	XTXactPreparePtr	xap;
+	THD					*thd;
+
+	XT_TRACE_CALL();
+
+	if (!(self = ha_temp_open_global_database(hton, &thd, &temp_thread, "TempForRollbackXA", &err)))
+		return err;
+	db = self->st_database;
+
+	if ((xap = xt_xn_find_xa_data(db, xid->length(), (xtWord1 *) xid, TRUE, self))) {
+		if ((self->st_xact_data = xt_xn_get_xact(db, xap->xp_xact_id, self))) {
+			self->st_xact_data->xd_flags &= ~XT_XN_XAC_PREPARED;  // Prepared transactions cannot be swept!
+			if (!xt_xn_rollback(self))
+				err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+		}
+		xt_xn_delete_xa_data(db, xap, TRUE, self);
+	}
+
+	ha_temp_close_database(self, thd, temp_thread);
+	/* Report a failed rollback instead of discarding the error: */
+	return err;
+}
+
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * HANDLER LOCKING FUNCTIONS
+ *
+ * These functions are used to get a lock on all handlers of a particular table.
+ *
+ */
+
+/*
+ * Insert 'handler' at the front of the share's doubly-linked list of
+ * handlers, while holding the share's sh_ex_mutex.
+ */
+static void ha_add_to_handler_list(XTThreadPtr self, XTSharePtr share, ha_pbxt *handler)
+{
+	ha_pbxt *old_head;
+
+	xt_lock_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+	pushr_(xt_unlock_mutex, share->sh_ex_mutex);
+
+	old_head = share->sh_handlers;
+
+	/* The new handler becomes the head, in front of the old one: */
+	handler->pb_ex_prev = NULL;
+	handler->pb_ex_next = old_head;
+	if (old_head)
+		old_head->pb_ex_prev = handler;
+	share->sh_handlers = handler;
+
+	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+}
+
+/*
+ * Unlink 'handler' from the share's doubly-linked list of handlers,
+ * while holding the share's sh_ex_mutex. The handler's own link
+ * pointers are left untouched.
+ */
+static void ha_remove_from_handler_list(XTThreadPtr self, XTSharePtr share, ha_pbxt *handler)
+{
+	ha_pbxt *before;
+	ha_pbxt *after;
+
+	xt_lock_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+	pushr_(xt_unlock_mutex, share->sh_ex_mutex);
+
+	before = handler->pb_ex_prev;
+	after = handler->pb_ex_next;
+
+	/* If the handler is the head, the head moves to its successor: */
+	if (share->sh_handlers == handler)
+		share->sh_handlers = after;
+
+	/* Bridge the neighbours over the removed handler: */
+	if (before)
+		before->pb_ex_next = after;
+	if (after)
+		after->pb_ex_prev = before;
+
+	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+}
+
+/*
+ * Acquire exclusive use of a table, by waiting for all
+ * threads to complete use of all handlers of the table.
+ * At the same time we hold up all threads
+ * that want to use handlers belonging to the table.
+ *
+ * But we do not hold up threads that close the handlers.
+ *
+ * 'mine' is the caller's own handler (may be NULL); it is not waited
+ * for. Throws XT_ERR_LOCK_TIMEOUT if exclusive use cannot be obtained
+ * within XT_SHARE_LOCK_TIMEOUT; in that case the exclusive lock flag is
+ * not left set. On success the caller must eventually call
+ * ha_release_exclusive_use().
+ */
+static void ha_aquire_exclusive_use(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine)
+{
+	ha_pbxt	*handler;
+	time_t	end_time = time(NULL) + XT_SHARE_LOCK_TIMEOUT / 1000;
+
+	XT_PRINT1(self, "ha_aquire_exclusive_use (%s) PBXT X lock\n", share->sh_table_path->ps_path);
+	/* GOTCHA: It is possible to hang here, if you hold
+	 * onto the sh_ex_mutex lock, before we really
+	 * have the exclusive lock (i.e. before all
+	 * handlers are no longer in use).
+	 * The reason is that reopen() is not possible
+	 * when some other thread holds sh_ex_mutex.
+	 * So this can prevent a thread from completing its
+	 * use of a handler, which prevents exclusive use
+	 * here.
+	 */
+	xt_lock_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+	pushr_(xt_unlock_mutex, share->sh_ex_mutex);
+
+	/* Wait until we can get an exclusive lock: */
+	while (share->sh_table_lock) {
+		xt_timed_wait_cond(self, (xt_cond_type *) share->sh_ex_cond, (xt_mutex_type *) share->sh_ex_mutex, XT_SHARE_LOCK_WAIT);
+		if (time(NULL) > end_time) {
+			freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+			xt_throw_taberr(XT_CONTEXT, XT_ERR_LOCK_TIMEOUT, share->sh_table_path);
+		}
+	}
+
+	/* This tells readers (and other exclusive lockers) that someone has an exclusive lock. */
+	share->sh_table_lock = TRUE;
+	
+	/* Wait for all open handlers use count to go to 0 */	
+	retry:
+	handler = share->sh_handlers;
+	while (handler) {
+		if (handler == mine || !handler->pb_ex_in_use)
+			handler = handler->pb_ex_next;
+		else {
+			/* Wait a bit, and try again: */
+			xt_timed_wait_cond(self, (xt_cond_type *) share->sh_ex_cond, (xt_mutex_type *) share->sh_ex_mutex, XT_SHARE_LOCK_WAIT);
+			if (time(NULL) > end_time) {
+				/* We are giving up, so the lock flag we set above
+				 * must be cleared again (and waiters woken), otherwise
+				 * the table would stay exclusively locked although
+				 * nobody owns the lock.
+				 * NOTE(review): an exception thrown by
+				 * xt_timed_wait_cond() itself would still leave the
+				 * flag set.
+				 */
+				share->sh_table_lock = FALSE;
+				xt_broadcast_cond_ns((xt_cond_type *) share->sh_ex_cond);
+				freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+				xt_throw_taberr(XT_CONTEXT, XT_ERR_LOCK_TIMEOUT, share->sh_table_path);
+			}
+			/* Handler may have been freed, check from the beginning again: */
+			goto retry;
+		}
+	}
+
+	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+}
+
+/*
+ * If you have exclusively locked the table, you can close all handler
+ * open tables.
+ *
+ * Call ha_aquire_exclusive_use() first to get the exclusive lock.
+ *
+ * Every handler of the share except 'mine' has its open table returned
+ * to the pool and its pb_open_tab pointer cleared.
+ */
+static void ha_close_open_tables(XTThreadPtr self, XTSharePtr share, ha_pbxt *mine)
+{
+	ha_pbxt *handler;
+
+	xt_lock_mutex(self, (xt_mutex_type *) share->sh_ex_mutex);
+	pushr_(xt_unlock_mutex, share->sh_ex_mutex);
+
+	/* Now that we know no handler is in use, we can close all the
+	 * open tables...
+	 */
+	handler = share->sh_handlers;
+	while (handler) {
+		if (handler != mine && handler->pb_open_tab) {
+			xt_db_return_table_to_pool_ns(handler->pb_open_tab);
+			handler->pb_open_tab = NULL;
+		}
+		handler = handler->pb_ex_next;
+	}
+
+	freer_(); // xt_unlock_mutex(share->sh_ex_mutex)
+}
+
+/*
+ * Release the exclusive lock on a table: clear sh_table_lock under the
+ * share's mutex and wake up all threads waiting on sh_ex_cond.
+ *
+ * 'self' is only used for the trace output, hence it is declared
+ * unused when printing is not compiled in.
+ */
+#ifdef PBXT_ALLOW_PRINTING
+static void ha_release_exclusive_use(XTThreadPtr self, XTSharePtr share)
+#else
+static void ha_release_exclusive_use(XTThreadPtr XT_UNUSED(self), XTSharePtr share)
+#endif
+{
+	XT_PRINT1(self, "ha_release_exclusive_use (%s) PBXT X UNLOCK\n", share->sh_table_path->ps_path);
+	xt_lock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+	share->sh_table_lock = FALSE;
+	xt_broadcast_cond_ns((xt_cond_type *) share->sh_ex_cond);
+	xt_unlock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+}
+
+/*
+ * Wait until the table is no longer exclusively locked, then mark the
+ * handler 'mine' as in use again.
+ *
+ * While waiting, the handler is marked as not in use, so that the
+ * exclusive locker is not held up by it.
+ *
+ * Returns OK on success. Returns FAILED if the timed wait fails, or if
+ * XT_SHARE_LOCK_TIMEOUT expires (a lock timeout error is registered in
+ * that case); the handler then stays marked as not in use.
+ */
+static xtBool ha_wait_for_shared_use(ha_pbxt *mine, XTSharePtr share)
+{
+	time_t	end_time = time(NULL) + XT_SHARE_LOCK_TIMEOUT / 1000;
+
+	XT_PRINT1(xt_get_self(), "ha_wait_for_shared_use (%s) share lock wait...\n", share->sh_table_path->ps_path);
+	mine->pb_ex_in_use = 0;
+	xt_lock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+	while (share->sh_table_lock) {
+		/* Wake up the exclusive locker (may be waiting). He can try to continue: */
+		xt_broadcast_cond_ns((xt_cond_type *) share->sh_ex_cond);
+
+		/* Called with a NULL thread, so failure is reported by the return value: */
+		if (!xt_timed_wait_cond(NULL, (xt_cond_type *) share->sh_ex_cond, (xt_mutex_type *) share->sh_ex_mutex, XT_SHARE_LOCK_WAIT)) {
+			xt_unlock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+			return FAILED;
+		}
+
+		if (time(NULL) > end_time) {
+			xt_unlock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_LOCK_TIMEOUT, share->sh_table_path);
+			return FAILED;
+		}
+	}
+	mine->pb_ex_in_use = 1;
+	xt_unlock_mutex_ns((xt_mutex_type *) share->sh_ex_mutex);
+	return OK;
+}
+
+/*
+ * Re-open the table of this handler: open the database of the table,
+ * open the share and get a new open table (pb_open_tab) owned by the
+ * current thread.
+ *
+ * Returns 0 on success, else a MySQL error code.
+ */
+xtPublic int ha_pbxt::reopen()
+{
+	THD				*thd = current_thd;
+	int				err = 0;
+	XTThreadPtr		self;	
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	try_(a) {
+		xt_ha_open_database_of_table(self, pb_share->sh_table_path);
+
+		ha_open_share(self, pb_share);
+
+		if (!(pb_open_tab = xt_db_open_table_using_tab(pb_share->sh_table, self)))
+			xt_throw(self);
+		pb_open_tab->ot_thread = self;
+
+		/* {TABLE-STATS}
+		 * We no longer use the information that a table
+		 * was opened in order to know when to calculate
+		 * statistics.
+		 * Instead, this is done once, as long as no calculation
+		 * time has been recorded for the table:
+		 */
+		if (!pb_open_tab->ot_table->tab_ind_stat_calc_time) {
+#ifdef LOAD_TABLE_ON_OPEN
+			xt_tab_load_table(self, pb_open_tab);
+#else
+			xt_tab_load_row_pointers(self, pb_open_tab);
+#endif
+			xt_ind_set_index_selectivity(pb_open_tab, self);
+			/* If the number of rows is less than 150 we will recalculate the
+			 * selectivity of the indices, as soon as the number of rows
+			 * exceeds 200 (see [**])
+			 */
+#ifdef XT_ROW_COUNT_CORRECTED
+			/* {CORRECTED-ROW-COUNT} */
+			pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 - pb_share->sh_table->tab_row_fnum) < 150;
+#else
+			/* {FREE-ROWS-BAD} */
+			pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 /* - pb_share->sh_table->tab_row_fnum */) < 150;
+#endif
+		}
+
+		/* I am not doing this anymore because it was only required
+		 * for DELETE FROM table;, which is now implemented
+		 * by deleting each row.
+		 * TRUNCATE TABLE does not preserve the counter value.
+		 */
+		//init_auto_increment(pb_share->sh_min_auto_inc);
+		init_auto_increment(0);
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+	}
+	cont_(a);
+	
+	return err;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INFORMATION SCHEMA FUNCTIONS
+ *
+ */
+
+/*
+ * Fill the PBXT_STATISTICS information schema table.
+ *
+ * Returns 0 on success, and also (with an empty result set, after
+ * logging an error) if the PBXT engine is not loaded. Otherwise returns
+ * a MySQL error code.
+ */
+static int pbxt_statistics_fill_table(THD *thd, TABLE_LIST *tables, COND *cond)
+{
+	XTThreadPtr		self = NULL;	
+	int				err = 0;
+
+	if (!pbxt_hton) {
+		/* Can't do if PBXT is not loaded! */
+		XTExceptionRec	e;
+
+		xt_exception_xterr(&e, XT_CONTEXT, XT_ERR_PBXT_NOT_INSTALLED);
+		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+		/* Just return an empty set: */
+		return 0;
+	}
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+
+	try_(a) {
+		/* If the thread has no open database, and the global
+		 * database is already open, then open
+		 * the database. Otherwise the statement will be
+		 * executed without an open database, which means
+		 * that the related statistics will be missing.
+		 *
+		 * This includes all background threads.
+		 */
+		if (!self->st_database && pbxt_database) {
+			xt_ha_open_database_of_table(self, (XTPathStrPtr) NULL);
+		}
+
+		err = myxt_statistics_fill_table(self, thd, tables, cond, (void*) system_charset_info);
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+	cont_(a);
+	return err;
+}
+
+/*
+ * Column definitions of the PBXT_STATISTICS information schema table:
+ * ID, Name and Value, followed by a terminating entry.
+ * The Drizzle build additionally wraps pbxt_statistics_fill_table() in
+ * an InfoSchemaMethods class.
+ */
+#ifdef DRIZZLED
+ColumnInfo pbxt_statistics_fields_info[]=
+{
+	ColumnInfo("ID", 4, MYSQL_TYPE_LONG,  0, 0, "The ID of the statistic", SKIP_OPEN_TABLE),
+        ColumnInfo("Name", 40, MYSQL_TYPE_STRING, 0, 0, "The name of the statistic", SKIP_OPEN_TABLE),
+        ColumnInfo("Value", 8, MYSQL_TYPE_LONGLONG, 0, 0, "The accumulated value", SKIP_OPEN_TABLE),
+	ColumnInfo()
+};
+
+/* Forwards fillTable() to pbxt_statistics_fill_table(). */
+class PBXTStatisticsMethods : public InfoSchemaMethods
+{
+public:
+  int fillTable(Session *session, TableList *tables, COND *cond)
+  {
+        return pbxt_statistics_fill_table(session, tables, cond);
+  }
+};
+#else
+ST_FIELD_INFO pbxt_statistics_fields_info[]=
+{
+	{ "ID",		4,	MYSQL_TYPE_LONG,		0, 0, "The ID of the statistic", SKIP_OPEN_TABLE},
+	{ "Name",	40, MYSQL_TYPE_STRING,		0, 0, "The name of the statistic", SKIP_OPEN_TABLE},
+	{ "Value",	8,	MYSQL_TYPE_LONGLONG,	0, 0, "The accumulated value", SKIP_OPEN_TABLE},
+	/* Terminating entry (no field name): */
+	{ 0,		0,	MYSQL_TYPE_STRING,		0, 0, 0, SKIP_OPEN_TABLE}
+};
+#endif
+
+/*
+ * Initialize the PBXT_STATISTICS information schema table.
+ *
+ * In the non-Drizzle build 'p' is the ST_SCHEMA_TABLE to set up with
+ * the field definitions and the fill function; in the Drizzle build a
+ * new InfoSchemaTable is created and added to the registry.
+ * On Windows builds with XT_COREDUMP, the crash filter is registered
+ * as well if pbxt_crash_debug is set.
+ *
+ * Always returns 0.
+ */
+#ifdef DRIZZLED
+static InfoSchemaTable	*pbxt_statistics_table;
+static PBXTStatisticsMethods pbxt_statistics_methods;
+static int pbxt_init_statistics(Registry &registry)
+#else
+static int pbxt_init_statistics(void *p)
+#endif
+{
+#ifdef DRIZZLED
+	//pbxt_statistics_table = (InfoSchemaTable *)xt_calloc_ns(sizeof(InfoSchemaTable));
+	//pbxt_statistics_table->table_name= "PBXT_STATISTICS";
+	pbxt_statistics_table = new InfoSchemaTable("PBXT_STATISTICS");
+	pbxt_statistics_table->setColumnInfo(pbxt_statistics_fields_info);
+	pbxt_statistics_table->setInfoSchemaMethods(&pbxt_statistics_methods);
+	registry.add(pbxt_statistics_table);
+#else
+	ST_SCHEMA_TABLE *pbxt_statistics_table = (ST_SCHEMA_TABLE *) p;
+	pbxt_statistics_table->fields_info = pbxt_statistics_fields_info;
+	pbxt_statistics_table->fill_table = pbxt_statistics_fill_table;
+#endif
+
+#if defined(XT_WIN) && defined(XT_COREDUMP)
+	/* Local declaration of a function defined elsewhere: */
+	void register_crash_filter();
+
+	if (pbxt_crash_debug)
+		register_crash_filter();
+#endif
+
+	return 0;
+}
+
+/*
+ * De-initialize the PBXT_STATISTICS information schema table.
+ * Only the Drizzle build has work to do: the table is removed from the
+ * registry and deleted. Always returns 0.
+ */
+#ifdef DRIZZLED
+static int pbxt_exit_statistics(Registry &registry)
+#else
+static int pbxt_exit_statistics(void *XT_UNUSED(p))
+#endif
+{
+#ifdef DRIZZLED
+	registry.remove(pbxt_statistics_table);
+	delete pbxt_statistics_table;
+#endif
+	return(0);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * DYNAMIC HOOKS
+ *
+ */
+
+/*
+ * Construct a handler that is not yet attached to any share or open
+ * table: all pointers are NULL and all flags/counters are cleared.
+ */
+ha_pbxt::ha_pbxt(handlerton *hton, TABLE_SHARE *table_arg) : handler(hton, table_arg)
+{
+	/* No share and no open table yet: */
+	pb_share = NULL;
+	pb_open_tab = NULL;
+
+	/* Not linked into any share's handler list, and not in use: */
+	pb_ex_next = NULL;
+	pb_ex_prev = NULL;
+	pb_ex_in_use = 0;
+
+	/* Statement and locking state: */
+	pb_key_read = FALSE;
+	pb_ignore_dup_key = 0;
+	pb_lock_table = FALSE;
+	pb_table_locked = 0;
+	pb_in_stat = FALSE;
+}
+
+/*
+ * If frm_error() is called then we will use this to find out what file extensions
+ * exist for the storage engine. This is also used by the default rename_table and
+ * delete_table method in handler.cc.
+ *
+ * Returns the pbxt_extensions array.
+ */
+#ifdef DRIZZLED
+const char **PBXTStorageEngine::bas_ext() const
+#else
+const char **ha_pbxt::bas_ext() const
+#endif
+{
+	return pbxt_extensions;
+}
+
+/*
+ * Specify the caching type: HA_CACHE_TBL_NONTRANSACT, HA_CACHE_TBL_NOCACHE,
+ * HA_CACHE_TBL_ASKTRANSACT or HA_CACHE_TBL_TRANSACT.
+ * PBXT always reports HA_CACHE_TBL_TRANSACT.
+ */
+MX_UINT8_T ha_pbxt::table_cache_type()
+{
+	return HA_CACHE_TBL_TRANSACT; /* Use transactional query cache */
+}
+
+/*
+ * Return the capability flags of the PBXT handler. The reason for
+ * each flag is given next to it below.
+ */
+MX_TABLE_TYPES_T ha_pbxt::table_flags() const
+{
+	return (
+		/* We need this flag because records are not packed
+		 * into a table which means #ROWID != offset
+		 */
+		HA_REC_NOT_IN_SEQ |
+		/* Since PBXT caches read records itself, I believe
+		 * this to be the case.
+		 */
+		HA_FAST_KEY_READ |
+		/*
+		 * I am assuming a "key" means a unique index.
+		 * Of course a primary key does not allow nulls.
+		 */
+		HA_NULL_IN_KEY |
+		/*
+		 * This is necessary because a MySQL blob can be
+		 * fairly small.
+		 */
+		HA_CAN_INDEX_BLOBS |
+		/*
+		 * Due to transactional influences, this will be
+		 * the case.
+		 * Although the count is good enough for practical
+		 * purposes! (Flag deliberately left disabled.)
+		HA_NOT_EXACT_COUNT |
+		 */
+#ifndef DRIZZLED
+		/*
+		 * This basically means we have a file with the name of
+		 * database table (which we do).
+		 */
+		HA_FILE_BASED |
+#endif
+		/*
+		 * Not sure what this does (but MyISAM and InnoDB have it)?!
+		 * Could it mean that we support the handler functions?
+		 */
+		HA_CAN_SQL_HANDLER |
+		/*
+		 * This is not true, we cannot insert delayed, but I
+		 * really cannot see what's wrong with inserting normally
+		 * when asked to insert delayed!
+		 * And the functionality is required to pass the alter_table
+		 * test.
+		 *
+		 * Disabled because of MySQL bug #40505
+		 */
+		/*HA_CAN_INSERT_DELAYED |*/
+#if MYSQL_VERSION_ID > 50119
+		/* We can do row logging, but not statement, because
+		 * MVCC is not serializable!
+		 */
+		HA_BINLOG_ROW_CAPABLE |
+#endif
+		/*
+		 * Auto-increment is allowed on a partial key.
+		 */
+		HA_AUTO_PART_KEY);
+}
+
+/*
+ * The following query from the DBT1 test is VERY slow
+ * if we do not set HA_READ_ORDER.
+ * The reason is that it must scan all duplicates, then
+ * sort.
+ *
+ * SELECT o_id, o_carrier_id, o_entry_d, o_ol_cnt
+ * FROM orders FORCE INDEX (o_w_id)
+ * WHERE o_w_id = 2
+   * AND o_d_id = 1
+   * AND o_c_id = 500
+ * ORDER BY o_id DESC limit 1;
+ *
+ */
+#define FLAGS_ARE_READ_DYNAMICALLY
+
+/*
+ * Return the capabilities of an index (the same for every index, key
+ * part and all_parts setting).
+ *
+ * It would be nice if a dynamic version of this function worked, but
+ * MySQL loads this information when the table is opened, and after
+ * that it is fixed.
+ *
+ * HA_READ_ORDER applies to PBXT in _almost_ all cases: entries are
+ * returned in index order during an index scan. A number of
+ * exceptions are demonstrated here: [(11)]. They involve:
+ * - a SELECT FOR UPDATE, UPDATE or DELETE statement
+ * - an ORDER BY, or join that requires the sort order
+ * - another transaction which updates the index while it is being
+ *   scanned.
+ *
+ * In this "obscure" case update reads wait for transactions to commit,
+ * index entries may change position during the scan, and the scan may
+ * return entries in the wrong order.
+ */
+MX_ULONG_T ha_pbxt::index_flags(uint XT_UNUSED(inx), uint XT_UNUSED(part), bool XT_UNUSED(all_parts)) const
+{
+	/* HA_KEYREAD_ONLY is understood to mean that the record
+	 * associated with an index key need not be fetched.
+	 */
+	MX_ULONG_T	flags = (HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | HA_KEYREAD_ONLY);
+
+#ifdef FLAGS_ARE_READ_DYNAMICALLY
+	/* Only claim index order if we are not in an update
+	 * (SELECT FOR UPDATE, UPDATE or DELETE):
+	 */
+	if (!pb_open_tab || !pb_open_tab->ot_for_update)
+		flags |= HA_READ_ORDER;
+#endif
+	return flags;
+}
+
+/*
+ * Detach this handler from its share and give back its open table.
+ *
+ * Nothing is done if no share is attached. Otherwise the open table (if
+ * any) is taken from the handler while sh_ex_mutex is held, the handler is
+ * removed from the handler list of the share, waiters on sh_ex_cond are
+ * woken up, and the share is released. If an open table was taken and the
+ * release reports 'removed', the table is flushed, but only for the
+ * commands selected below.
+ *
+ * Exceptions thrown within the block are logged and cleared here, and
+ * pb_share is reset to NULL in any case.
+ *
+ * thd  - the MySQL thread, may be NULL (the table is then flushed).
+ * self - the PBXT thread performing the close.
+ */
+void ha_pbxt::internal_close(THD *thd, struct XTThread *self)
+{
+	if (pb_share) {
+		xtBool			removed;
+		XTOpenTablePtr	ot;
+
+		try_(a) {
+			/* This lock must be held when we remove the handler's
+			 * open table because ha_close_open_tables() can run
+			 * concurrently.
+			 */
+			xt_lock_mutex_ns(pb_share->sh_ex_mutex);
+			if ((ot = pb_open_tab)) {
+				/* The closing thread takes over the open table: */
+				pb_open_tab->ot_thread = self;
+				/* NOTE(review): if this call throws, sh_ex_mutex appears
+				 * to remain locked - confirm.
+				 */
+				if (self->st_database != pb_open_tab->ot_table->tab_db)
+					xt_ha_open_database_of_table(self, pb_share->sh_table_path);
+				pb_open_tab = NULL;
+				/* Released by the freer_() call at the end of the block: */
+				pushr_(xt_db_return_table_to_pool, ot);
+			}
+			xt_unlock_mutex_ns(pb_share->sh_ex_mutex);
+
+			ha_remove_from_handler_list(self, pb_share, this);
+
+			/* Someone may be waiting for me to complete: */
+			xt_broadcast_cond_ns((xt_cond_type *) pb_share->sh_ex_cond);
+
+			removed = ha_unget_share_removed(self, pb_share);
+
+			if (ot) {
+				/* Flush the table if this was the last handler: */
+				/* This is not necessary but has the effect that
+				 * FLUSH TABLES; does a checkpoint!
+				 */
+				if (removed) {
+					/* GOTCHA:
+					 * This was killing performance as the number of threads increased!
+					 *
+					 * When MySQL runs out of table handlers because the table
+					 * handler cache is too small, it starts to close handlers.
+					 * (open_cache.records > table_cache_size)
+					 *
+					 * Which can lead to closing all handlers for a particular table.
+					 *
+					 * It does this while holding lock_OPEN!
+					 * So this code below leads to a sync operation while lock_OPEN
+					 * is held. The result is that the whole server comes to a stop.
+					 */
+					if (!thd || thd_sql_command(thd) == SQLCOM_FLUSH) // FLUSH TABLES
+						xt_sync_flush_table(self, ot);
+					else {
+						/* This change is a result of a problem mentioned by Arjen.
+						 * REPAIR and ALTER lead to the following sequence:
+						 * 1. tab  -- copy --> tmp1
+						 * 2. tab  -- rename --> tmp2
+						 * 3. tmp1 -- rename --> tab
+						 * 4. delete tmp2
+						 *
+						 * PBXT flushes a table before rename.
+						 * The sequence above results in a table flush in step 3, which can
+						 * take a very long time.
+						 *
+						 * The problem is, during this time frame we have only temp tables.
+						 * A crash in this state leaves the database in a bad state.
+						 *
+						 * To reduce the time in this state, the flush needs to be done
+						 * elsewhere. The code below causes the flush to occur after
+						 * step 1:
+						 */ 
+						switch (thd_sql_command(thd)) {
+							case SQLCOM_REPAIR:
+							case SQLCOM_RENAME_TABLE:
+							case SQLCOM_OPTIMIZE:
+							case SQLCOM_ANALYZE:
+							case SQLCOM_ALTER_TABLE:
+							case SQLCOM_CREATE_INDEX:
+								xt_sync_flush_table(self, ot);
+								break;
+						}
+					}
+				}
+				freer_(); // xt_db_return_table_to_pool(ot);
+			}
+		}
+		catch_(a) {
+			/* Errors during close are only logged: */
+			xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* The handler is detached, whether or not an error occurred: */
+		pb_share = NULL;
+	}
+}
+
+/*
+ * Used for opening tables. The name will be the name of the file.
+ * A table is opened when it needs to be opened. For instance
+ * when a request comes in for a select on the table (tables are not
+ * opened and closed for each request, they are cached).
+ *
+ * Called from handler.cc by handler::ha_open(). The server opens all tables by
+ * calling ha_open() which then calls the handler specific open().
+ *
+ * The handler gets the share for table_path, adds itself to the handler
+ * list of the share, opens the share and gets an open table for the
+ * current thread. If the table has no index statistics calculation time
+ * yet, row pointers (or the whole table) are loaded and the index
+ * selectivity is set. Finally the auto-increment value is initialised.
+ *
+ * Returns 0 on success, otherwise the error code obtained from
+ * xt_ha_pbxt_to_mysql_error() or xt_ha_pbxt_thread_error_for_mysql().
+ * On failure inside the try block, internal_close() undoes the open.
+ */
+int ha_pbxt::open(const char *table_path, int XT_UNUSED(mode), uint XT_UNUSED(test_if_locked))
+{
+	THD			*thd = current_thd;
+	int			err = 0;
+	XTThreadPtr	self;
+
+	ref_length = XT_RECORD_OFFS_SIZE;
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	XT_PRINT1(self, "open (%s)\n", table_path);
+
+	/* Mark the handler as in use for the duration of the open: */
+	pb_ex_in_use = 1;
+	try_(a) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
+
+		pb_share = ha_get_share(self, table_path, false);
+		ha_add_to_handler_list(self, pb_share, this);
+		/* If the share has a table lock set, wait until shared use is possible: */
+		if (pb_share->sh_table_lock) {
+			if (!ha_wait_for_shared_use(this, pb_share))
+				xt_throw(self);
+		}
+
+		ha_open_share(self, pb_share);
+
+		thr_lock_data_init(&pb_share->sh_lock, &pb_lock, NULL);
+		if (!(pb_open_tab = xt_db_open_table_using_tab(pb_share->sh_table, self)))
+			xt_throw(self);
+		pb_open_tab->ot_thread = self;
+
+		/* {TABLE-STATS} */
+		/* Only done when no statistics calculation time has been recorded: */
+		if (!pb_open_tab->ot_table->tab_ind_stat_calc_time) {
+#ifdef LOAD_TABLE_ON_OPEN
+			xt_tab_load_table(self, pb_open_tab);
+#else
+			xt_tab_load_row_pointers(self, pb_open_tab);
+#endif
+
+			xt_ind_set_index_selectivity(pb_open_tab, self);
+			/* Request a recalculation of the selectivity if the
+			 * value computed below is less than 150:
+			 */
+#ifdef XT_ROW_COUNT_CORRECTED
+			/* {CORRECTED-ROW-COUNT} */
+			pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 - pb_share->sh_table->tab_row_fnum) < 150;
+#else
+			/* {FREE-ROWS-BAD} */
+			pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 /* - pb_share->sh_table->tab_row_fnum */) < 150;
+#endif
+		}
+
+		init_auto_increment(0);
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+		/* Undo whatever was set up; this also resets pb_share to NULL: */
+		internal_close(thd, self);
+	}
+	cont_(a);
+
+	if (!err)
+		info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+
+	pb_ex_in_use = 0;
+	if (pb_share) {
+		/* Someone may be waiting for me to complete: */
+		if (pb_share->sh_table_lock)
+			xt_broadcast_cond_ns((xt_cond_type *) pb_share->sh_ex_cond);
+	}
+	return err;
+}
+
+
+/*
+	Closes a table. We call the free_share() function to free any resources
+	that we have allocated in the "shared" structure.
+
+	Called from sql_base.cc, sql_select.cc, and table.cc.
+	In sql_select.cc it is only used to close up temporary tables or during
+	the process where a temporary table is converted over to being a
+	myisam table.
+	For sql_base.cc look at close_data_tables().
+
+	If there is no current THD, a temporary PBXT thread ("TempForClose")
+	is created for the duration of the close and freed afterwards.
+	Returns 0, or the error code set while obtaining the thread or
+	while closing. If the temporary thread cannot be created, the
+	error is logged and 0 is returned without closing.
+*/
+int ha_pbxt::close(void)
+{
+	THD						*thd = current_thd;
+	volatile int			err = 0;
+	volatile XTThreadPtr	self;
+
+	if (thd)
+		self = ha_set_current_thread(thd, (int *) &err);
+	else {
+		XTExceptionRec e;
+
+		/* No MySQL thread: use a temporary PBXT thread for the close. */
+		if (!(self = xt_create_thread("TempForClose", FALSE, TRUE, &e))) {
+			xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+			return 0;
+		}
+	}
+
+	XT_PRINT1(self, "close (%s)\n", pb_share && pb_share->sh_table_path->ps_path ? pb_share->sh_table_path->ps_path : "unknown");
+
+	if (self) {
+		try_(a) {
+			internal_close(thd, self);
+		}
+		catch_(a) {
+			err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+		}
+		cont_(a);
+
+		/* Free the temporary thread created above: */
+		if (!thd)
+			xt_free_thread(self);
+	}
+	else
+		/* ha_set_current_thread() failed, so the close cannot be done: */
+		xt_log(XT_NS_CONTEXT, XT_LOG_WARNING, "Unable to release table reference\n");
+		
+	return err;
+}
+
+/*
+ * Initialise the auto-increment counter of the table (tab_auto_inc) if the
+ * table has an auto-increment field and the counter is still zero.
+ *
+ * The highest value currently stored is read from the auto-increment index,
+ * using the transaction of the thread or, if it has none, a transaction
+ * started (and committed) just for this purpose. The counter is then raised
+ * to one less than the minimum in the dictionary (dic_min_auto_inc), and to
+ * one less than min_auto_inc, where the value read is below these minimums.
+ *
+ * Nothing is done if the indexes of the table are disabled. The work is
+ * done while holding tab_ainc_lock. Throws if the transaction cannot be
+ * started.
+ */
+void ha_pbxt::init_auto_increment(xtWord8 min_auto_inc)
+{
+	XTTableHPtr	tab;
+	xtWord8		nr = 0;
+	int			err;
+
+	/* Get the value of the auto-increment value by
+	 * loading the highest value from the index...
+	 */
+	tab = pb_open_tab->ot_table;
+
+	/* Cannot do this if the index version is bad! */
+	if (tab->tab_dic.dic_disable_index)
+		return;
+
+	xt_spinlock_lock(&tab->tab_ainc_lock);
+	if (table->found_next_number_field && !tab->tab_auto_inc) {
+		/* Saved here, restored after the index has been read: */
+		Field		*tmp_fie = table->next_number_field;
+		THD			*tmp_thd = table->in_use;
+		xtBool		xn_started = FALSE;
+		XTThreadPtr	self = pb_open_tab->ot_thread;
+
+		/*
+		 * A table may be opened by a thread with a running
+		 * transaction!
+		 * Since get_auto_increment() does not do an update,
+		 * it should be OK to use the transaction we already
+		 * have to get the next auto-increment value.
+		 */
+		if (!self->st_xact_data) {
+			self->st_xact_mode = XT_XACT_REPEATABLE_READ;
+			self->st_ignore_fkeys = FALSE;
+			self->st_auto_commit = TRUE;
+			self->st_table_trans = FALSE;
+			self->st_abort_trans = FALSE;
+			self->st_stat_ended = FALSE;
+			self->st_stat_trans = FALSE;
+			self->st_is_update = NULL;
+			if (!xt_xn_begin(self)) {
+				/* Release the lock before throwing: */
+				xt_spinlock_unlock(&tab->tab_ainc_lock);
+				xt_throw(self);
+			}
+			xn_started = TRUE;
+		}
+
+		/* Setup the conditions for the next call! */
+		table->in_use = current_thd;
+		table->next_number_field = table->found_next_number_field;
+
+		/* Read from the index only, using the auto-increment index: */
+		extra(HA_EXTRA_KEYREAD);
+		table->mark_columns_used_by_index_no_reset(TS(table)->next_number_index, table->read_set);
+		column_bitmaps_signal();
+ 		index_init(TS(table)->next_number_index, 0);
+		if (!TS(table)->next_number_key_offset) {
+			// Autoincrement at key-start
+			/* The last index entry holds the highest value: */
+			err = index_last(table->record[1]);
+			if (!err && !table->next_number_field->is_null(TS(table)->rec_buff_length)) {
+				/* {PRE-INC} */
+				nr = (xtWord8) table->next_number_field->val_int_offset(TS(table)->rec_buff_length);
+			}
+		}
+		else {
+			/* Do an index scan to find the largest value! */
+			/* The standard method will not work because it forces
+			 * us to lock that table!
+			 */
+			xtWord8 val;
+
+			err = index_first(table->record[1]);
+			while (!err) {
+				/* {PRE-INC} */
+				val = (xtWord8) table->next_number_field->val_int_offset(TS(table)->rec_buff_length);
+				if (val > nr)
+					nr = val;
+				err = index_next(table->record[1]);
+			}
+		}
+
+		index_end();
+		extra(HA_EXTRA_NO_KEYREAD);
+
+		/* {PRE-INC}
+		 * I have changed this from post increment to pre-increment!
+		 * The reason is:
+		 * When using post increment we are not able to return
+		 * the last valid value in the range.
+		 *
+		 * Here the test example:
+		 *
+		 * drop table if exists t1;
+		 * create table t1 (i tinyint unsigned not null auto_increment primary key) engine=pbxt;
+		 * insert into t1 set i = 254;
+		 * insert into t1 set i = null;
+		 *
+		 * With post-increment, this last insert fails because on post increment
+		 * the value overflows!
+		 *
+		 * Pre-increment means we store the current max, and increment
+		 * before returning the next value.
+		 *
+		 * This will work in this situation.
+		 */
+		tab->tab_auto_inc = nr;
+		/* One less than the minimum, because of the pre-increment: */
+		if (tab->tab_auto_inc < tab->tab_dic.dic_min_auto_inc)
+			tab->tab_auto_inc = tab->tab_dic.dic_min_auto_inc-1;
+		if (tab->tab_auto_inc < min_auto_inc)
+			tab->tab_auto_inc = min_auto_inc-1;
+
+		/* Restore the changed values: */
+		table->next_number_field = tmp_fie;
+		table->in_use = tmp_thd;
+
+		/* Only end the transaction if it was started here: */
+		if (xn_started) {
+			XT_PRINT0(self, "xt_xn_commit in init_auto_increment\n");
+			xt_xn_commit(self);
+		}
+	}
+	xt_spinlock_unlock(&tab->tab_ainc_lock);
+}
+
+/*
+ * Reserve the next auto-increment value of the table.
+ *
+ * {PRE-INC}
+ * tab_auto_inc is assumed to contain the last value returned. The next
+ * value is calculated from it using 'offset' and 'increment', stored back
+ * in tab_auto_inc and returned in *first_value. Exactly one value is
+ * reserved per call (*nb_reserved_values is set to 1).
+ *
+ * If the comparison done by the auto-increment field does not place the
+ * previous value before the new value, then tab_auto_inc is left unchanged
+ * and *first_value is set to ~0 to indicate the error to the caller.
+ */
+void ha_pbxt::get_auto_increment(MX_ULONGLONG_T offset, MX_ULONGLONG_T increment,
+                                 MX_ULONGLONG_T XT_UNUSED(nb_desired_values),
+                                 MX_ULONGLONG_T *first_value,
+                                 MX_ULONGLONG_T *nb_reserved_values)
+{
+	XTTableHPtr		tab;
+	MX_ULONGLONG_T	last_val;
+	MX_ULONGLONG_T	next_val;
+
+	ASSERT_NS(pb_ex_in_use);
+
+	tab = pb_open_tab->ot_table;
+
+	xt_spinlock_lock(&tab->tab_ainc_lock);
+	last_val = (MX_ULONGLONG_T) tab->tab_auto_inc;
+	next_val = last_val;
+	if (next_val < offset)
+		next_val = offset;
+	else {
+		MX_ULONGLONG_T step = increment;
+
+		/* Move up to the next value of the form offset + N * increment: */
+		if (increment > 1) {
+			MX_ULONGLONG_T rest = (next_val - offset) % increment;
+
+			if (rest != 0)
+				step = increment - rest;
+		}
+		next_val += step;
+	}
+
+	if (table->next_number_field->cmp((const unsigned char *)&last_val, (const unsigned char *)&next_val) < 0)
+		tab->tab_auto_inc = (xtWord8) (next_val);
+	else
+		next_val = ~0;	/* indicate error to the caller */
+	xt_spinlock_unlock(&tab->tab_ainc_lock);
+
+	*nb_reserved_values = 1;
+	*first_value = next_val;
+}
+
+/*
+ * Record the value of the auto-increment field 'nr' in the table of 'ot'.
+ *
+ * If the field compares greater than the current counter (tab_auto_inc),
+ * the counter is set to the value of the field. The comparison is done
+ * once without the lock, and repeated after taking tab_ainc_lock.
+ *
+ * If xt_db_auto_increment_mode is 1, and the value exceeds the minimum
+ * stored in the dictionary (dic_min_auto_inc), a new minimum above the
+ * value is set and written using xt_tab_write_min_auto_inc(). A failure
+ * to write is logged and cleared.
+ *
+ * GOTCHA: We need to use signed value here because of the test
+ * (from auto_increment.test):
+ * create table t1 (a int not null auto_increment primary key);
+ * insert into t1 values (NULL);
+ * insert into t1 values (-1);
+ * insert into t1 values (NULL);
+ */
+xtPublic void ha_set_auto_increment(XTOpenTablePtr ot, Field *nr)
+{
+	register XTTableHPtr	tab;
+	MX_ULONGLONG_T			nr_int_val;
+	
+	nr_int_val = nr->val_int();
+	tab = ot->ot_table;
+
+	/* First check without the lock, so the lock is only taken when
+	 * an update looks necessary:
+	 */
+	if (nr->cmp((const unsigned char *)&tab->tab_auto_inc) > 0) {
+		xt_spinlock_lock(&tab->tab_ainc_lock);
+
+		/* Check again, now that the lock is held: */
+		if (nr->cmp((const unsigned char *)&tab->tab_auto_inc) > 0) {
+			/* {PRE-INC}
+			 * We increment later, so just set the value!
+			MX_ULONGLONG_T nr_int_val_plus_one = nr_int_val + 1;
+			if (nr->cmp((const unsigned char *)&nr_int_val_plus_one) < 0)
+				tab->tab_auto_inc = nr_int_val_plus_one;
+			else
+			 */
+			tab->tab_auto_inc = nr_int_val;
+		}
+		xt_spinlock_unlock(&tab->tab_ainc_lock);
+	}
+
+	if (xt_db_auto_increment_mode == 1) {
+		if (nr_int_val > (MX_ULONGLONG_T) tab->tab_dic.dic_min_auto_inc) {
+			/* The minimum is set ahead of the value (by 100, or 5 in
+			 * the debug build), so that it is only written again when
+			 * the value has passed the new minimum:
+			 */
+#ifdef DEBUG
+			tab->tab_dic.dic_min_auto_inc = nr_int_val + 5;
+#else
+			tab->tab_dic.dic_min_auto_inc = nr_int_val + 100;
+#endif
+			ot->ot_thread = xt_get_self();
+			if (!xt_tab_write_min_auto_inc(ot))
+				xt_log_and_clear_exception(ot->ot_thread);
+		}
+	}
+}
+
+/*
+static void dump_buf(unsigned char *buf, int len)
+{
+	int i;
+	
+	for (i=0; i<len; i++) printf("%2c", buf[i] <= 127 ? buf[i] : '.');
+	printf("\n");
+	for (i=0; i<len; i++) printf("%02x", buf[i]);
+	printf("\n");
+}
+*/
+
+/*
+ * write_row() inserts a row. No extra() hint is given currently if a bulk load
+ * is happening. buf is a byte array of data. You can use the field
+ * information to extract the data from the native byte array type.
+ * Example of this would be:
+ * for (Field **field=table->field ; *field ; field++)
+ * {
+ *		...
+ * }
+ *
+ * See ha_tina.cc for an example of extracting all of the data as strings.
+ * ha_berkeley.cc has an example of how to store it intact by "packing" it
+ * for ha_berkeley's own native storage type.
+ *
+ * See the note for update_row() on auto_increments and timestamps. This
+ * case also applies to write_row().
+ *
+ * Called from item_sum.cc, item_sum.cc, sql_acl.cc, sql_insert.cc,
+ * sql_insert.cc, sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc.
+ *
+ * Returns 0 on success, otherwise the error returned by
+ * pbms_write_row_blobs(), update_auto_increment() or
+ * ha_log_pbxt_thread_error_for_mysql().
+ */
+int ha_pbxt::write_row(byte *buf)
+{
+	int err = 0;
+
+	ASSERT_NS(pb_ex_in_use);
+
+	XT_PRINT1(pb_open_tab->ot_thread, "write_row (%s)\n", pb_share->sh_table_path->ps_path);
+	XT_DISABLED_TRACE(("INSERT tx=%d val=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(&buf[1])));
+	//statistic_increment(ha_write_count,&LOCK_status);
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+	err = pbms_write_row_blobs(table, buf, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_write_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
+
+	/* {START-STAT-HACK} previously position of start statement hack. */
+
+	xt_xlog_check_long_writer(pb_open_tab->ot_thread);
+
+	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+		table->timestamp_field->set_time();
+
+	/* Only handle the auto-increment field if the row is in record[0]: */
+	if (table->next_number_field && buf == table->record[0]) {
+		int update_err = update_auto_increment();
+		if (update_err) {
+			ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+			err = update_err;
+			goto done;
+		}
+		ha_set_auto_increment(pb_open_tab, table->next_number_field);
+	}
+
+	if (!xt_tab_new_record(pb_open_tab, (xtWord1 *) buf)) {
+		err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		/*
+		 * This is needed to allow the same row to be updated multiple times in case of bulk REPLACE.
+		 * This happens during execution of LOAD DATA...REPLACE. MySQL first tries to INSERT the row
+		 * and if it gets a dup-key error it tries UPDATE, so the same row can be overwritten multiple
+		 * times within the same statement.
+		 */
+		if (err == HA_ERR_FOUND_DUPP_KEY && pb_open_tab->ot_thread->st_is_update) {
+			/* Pop the update stack: */
+			//pb_open_tab->ot_thread->st_update_id++;
+			XTOpenTablePtr curr = pb_open_tab->ot_thread->st_is_update;
+
+			pb_open_tab->ot_thread->st_is_update = curr->ot_prev_update;
+			curr->ot_prev_update = NULL;
+		}
+	}
+
+	done:
+#ifdef PBMS_ENABLED
+	/* Tell PBMS whether the insert succeeded: */
+	pbms_completed(table, (err == 0));
+#endif
+	return err;
+}
+
+#ifdef UNUSED_CODE
+/*
+ * Compare a and b byte for byte, until a zero byte is reached in
+ * either of them. Returns 0 if a difference was found, otherwise 1.
+ */
+static int equ_bin(const byte *a, const char *b)
+{
+	for (; *a && *b; a++, b++) {
+		if (*a != *b)
+			return 0;
+	}
+	return 1;
+}
+/*
+ * Trace len_in bytes of a_in, starting at offset: first as hex values,
+ * then, after "==", as characters ('.' is used for values that are
+ * not greater than 8 or not less than 127).
+ */
+static void dump_bin(const byte *a_in, int offset, int len_in)
+{
+	const byte	*start = a_in + offset;
+	int			i;
+
+	for (i = 0; i < len_in; i++)
+		xt_trace("%02X", (int) start[i]);
+	xt_trace("==");
+	for (i = 0; i < len_in; i++)
+		xt_trace("%c", (start[i] > 8 && start[i] < 127) ? start[i] : '.');
+	xt_trace("\n");
+}
+#endif
+
+/*
+ * Yes, update_row() does what you expect, it updates a row. old_data will have
+ * the previous row record in it, while new_data will have the newest data in
+ * it. Keep in mind that the server can do updates based on ordering if an ORDER BY
+ * clause was used. Consecutive ordering is not guaranteed.
+ *
+ * Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc.
+ *
+ * Returns 0 on success, otherwise the error returned by the PBMS blob
+ * functions or by ha_log_pbxt_thread_error_for_mysql().
+ */
+int ha_pbxt::update_row(const byte * old_data, byte * new_data)
+{
+	int						err = 0;
+	register XTThreadPtr	self = pb_open_tab->ot_thread;
+
+	ASSERT_NS(pb_ex_in_use);
+
+	XT_PRINT1(self, "update_row (%s)\n", pb_share->sh_table_path->ps_path);
+	XT_DISABLED_TRACE(("UPDATE tx=%d val=%d\n", (int) self->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(&new_data[1])));
+	//statistic_increment(ha_update_count,&LOCK_status);
+
+	/* {START-STAT-HACK} previously position of start statement hack. */
+
+	xt_xlog_check_long_writer(self);
+
+	/* {UPDATE-STACK} */
+	if (self->st_is_update != pb_open_tab) {
+		/* Push the update stack: */
+		pb_open_tab->ot_prev_update = self->st_is_update;
+		self->st_is_update = pb_open_tab;
+		pb_open_tab->ot_update_id++;
+	}
+
+	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+		table->timestamp_field->set_time();
+
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	err = pbms_delete_row_blobs(table, old_data, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "update_row:pbms_delete_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+	err = pbms_write_row_blobs(table, new_data, &result);
+	if (err) { 
+		xt_logf(XT_NT_ERROR, "update_row:pbms_write_row_blobs() Error: %s", result.mr_message);
+		goto pbms_done;
+	}
+#endif
+
+	/* GOTCHA: We need to check the auto-increment value on update
+	 * because of the following test (which fails for InnoDB) -
+	 * auto_increment.test:
+	 * create table t1 (a int not null auto_increment primary key, val int);
+	 * insert into t1 (val) values (1);
+	 * update t1 set a=2 where a=1;
+	 * insert into t1 (val) values (1);
+	 */
+	if (table->found_next_number_field && new_data == table->record[0]) {
+		my_bitmap_map	*old_map;
+
+		/* The field is read by ha_set_auto_increment(), so all
+		 * columns are made readable for the duration of the call:
+		 */
+		old_map = mx_tmp_use_all_columns(table, table->read_set);
+		ha_set_auto_increment(pb_open_tab, table->found_next_number_field);
+		mx_tmp_restore_column_map(table, old_map);
+	}
+
+	if (!xt_tab_update_record(pb_open_tab, (xtWord1 *) old_data, (xtWord1 *) new_data))
+		err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+	pb_open_tab->ot_table->tab_locks.xt_remove_temp_lock(pb_open_tab, TRUE);
+	
+#ifdef PBMS_ENABLED
+	pbms_done:
+	/* Tell PBMS whether the update succeeded: */
+	pbms_completed(table, (err == 0));
+#endif
+
+	return err;
+}
+
+/*
+ * This will delete a row. buf will contain a copy of the row to be deleted.
+ * The server will call this right after the current row has been called (from
+ * either a previous rnd_next() or index call).
+ *
+ * Called in sql_acl.cc and sql_udf.cc to manage internal table information.
+ * Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select it is
+ * used for removing duplicates while in insert it is used for REPLACE calls.
+ *
+ * Returns 0 on success, otherwise the error returned by
+ * pbms_delete_row_blobs() or ha_log_pbxt_thread_error_for_mysql().
+*/
+int ha_pbxt::delete_row(const byte * buf)
+{
+	int err = 0;
+
+	ASSERT_NS(pb_ex_in_use);
+
+	XT_PRINT1(pb_open_tab->ot_thread, "delete_row (%s)\n", pb_share->sh_table_path->ps_path);
+	XT_DISABLED_TRACE(("DELETE tx=%d val=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(&buf[1])));
+	//statistic_increment(ha_delete_count,&LOCK_status);
+
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	err = pbms_delete_row_blobs(table, buf, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_delete_row_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
+
+	/* {START-STAT-HACK} previously position of start statement hack. */
+
+	xt_xlog_check_long_writer(pb_open_tab->ot_thread);
+
+	if (!xt_tab_delete_record(pb_open_tab, (xtWord1 *) buf))
+		err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+	/* Done whether or not the delete succeeded: */
+	pb_open_tab->ot_table->tab_locks.xt_remove_temp_lock(pb_open_tab, TRUE);
+
+#ifdef PBMS_ENABLED
+	/* Tell PBMS whether the delete succeeded: */
+	pbms_completed(table, (err == 0));
+#endif
+	return err;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INDEX METHODS
+ */
+
+/*
+ * This looks like a hack, but actually, it is OK.
+ * It depends on the setup done by the super-class. It involves an extra
+ * range check that we need to do if a "new" record is returned during
+ * an index scan.
+ *
+ * A new record is returned if a row is updated (by another transaction)
+ * during the index scan. If an update is detected, then the scan stops
+ * and waits for the transaction to end.
+ *
+ * If the transaction commits, then the updated row is returned instead
+ * of the row it would have returned when doing a consistent read
+ * (repeatable read).
+ *
+ * These new records can appear out of index order, and may not even
+ * belong to the index range that we are concerned with.
+ *
+ * Notice that there is no check for the start of the range. It appears
+ * that this is not necessary; MySQL seems to have no problem ignoring
+ * such values.
+ *
+ * A number of tests are given below which demonstrate the use
+ * of the function.
+ *
+ * They also demonstrate the ORDER BY problem described here: [(11)].
+ *
+ * DROP TABLE IF EXISTS test_tab, test_tab_1, test_tab_2;
+ * CREATE TABLE test_tab (ID int primary key, Value int, Name varchar(20), index(Value, Name)) ENGINE=pbxt;
+ * INSERT test_tab values(1, 1, 'A');
+ * INSERT test_tab values(2, 1, 'B');
+ * INSERT test_tab values(3, 1, 'C');
+ * INSERT test_tab values(4, 2, 'D');
+ * INSERT test_tab values(5, 2, 'E');
+ * INSERT test_tab values(6, 2, 'F');
+ * INSERT test_tab values(7, 2, 'G');
+ * 
+ * select * from test_tab where value = 1 order by value, name for update;
+ * 
+ * -- Test: 1
+ * -- C1
+ * begin;
+ * select * from test_tab where id = 5 for update;
+ * 
+ * -- C2
+ * begin;
+ * select * from test_tab where value = 2 order by value, name for update;
+ * 
+ * -- C1
+ * update test_tab set value = 3 where id = 6;
+ * commit;
+ * 
+ * -- Test: 2
+ * -- C1
+ * begin;
+ * select * from test_tab where id = 5 for update;
+ * 
+ * -- C2
+ * begin;
+ * select * from test_tab where value >= 2 order by value, name for update;
+ * 
+ * -- C1
+ * update test_tab set value = 3 where id = 6;
+ * commit;
+ * 
+ * -- Test: 3
+ * -- C1
+ * begin;
+ * select * from test_tab where id = 5 for update;
+ * 
+ * -- C2
+ * begin;
+ * select * from test_tab where value = 2 order by value, name for update;
+ * 
+ * -- C1
+ * update test_tab set value = 1 where id = 6;
+ * commit;
+ */
+
+/*
+ * Check whether the row in buf belongs to the current index scan.
+ *
+ * With a search key the row must match the key exactly; the result is
+ * also stored in search_key->sk_on_key. Without a search key only the
+ * end of the range (end_range, if set) is checked.
+ *
+ * Returns non-zero if the row is in range, otherwise 0.
+ */
+int ha_pbxt::xt_index_in_range(register XTOpenTablePtr XT_UNUSED(ot), register XTIndexPtr ind,
+	register XTIdxSearchKeyPtr search_key, xtWord1 *buf)
+{
+	if (!search_key) {
+		/* No exact match required, check the end of the range: */
+		if (!end_range)
+			return 1;
+		return compare_key(end_range) <= 0;
+	}
+
+	/* Build the key of the row, and compare it to the search key: */
+	xtWord1 row_key[XT_INDEX_MAX_KEY_SIZE];
+
+	myxt_create_key_from_row(ind, row_key, buf, NULL);
+	search_key->sk_on_key = (myxt_compare_key(ind,
+		search_key->sk_key_value.sv_flags,
+		search_key->sk_key_value.sv_length,
+		search_key->sk_key_value.sv_key,
+		row_key) == 0);
+	return search_key->sk_on_key;
+}
+
+/*
+ * Starting at the current index position, read forward until a row is
+ * found that can be returned.
+ *
+ * ot         - the open table, holding the current position (ot_curr_rec_id).
+ * ind        - the index being scanned.
+ * key_only   - if TRUE the row is built from the index entry, after a
+ *              visibility check, otherwise the record itself is read.
+ * search_key - optional; the scan stops when sk_on_key is no longer set.
+ * buf        - receives the row.
+ *
+ * Returns 0 if a row was placed in buf, HA_ERR_END_OF_FILE if the scan
+ * ended, or the error returned by ha_log_pbxt_thread_error_for_mysql().
+ */
+int ha_pbxt::xt_index_next_read(register XTOpenTablePtr ot, register XTIndexPtr ind, xtBool key_only,
+	register XTIdxSearchKeyPtr search_key, byte *buf)
+{
+	xt_xlog_check_long_writer(ot->ot_thread);
+
+	if (key_only) {
+		/* We only need to read the data from the key: */
+		while (ot->ot_curr_rec_id) {
+			if (search_key && !search_key->sk_on_key)
+				break;
+
+			switch (xt_tab_visible(ot)) {
+				case FALSE:
+					/* Not visible, try the next index entry: */
+					if (xt_idx_next(ot, ind, search_key))
+						break;
+					/* xt_idx_next() failed: fall through to the error exit. */
+				case XT_ERR:
+					goto failed;
+				case XT_NEW:
+					/* A "new" record must pass the extra range check: */
+					if (!xt_idx_read(ot, ind, (xtWord1 *) buf))
+						goto failed;
+					if (xt_index_in_range(ot, ind, search_key, buf)) {
+						return 0;
+					}
+					if (!xt_idx_next(ot, ind, search_key))
+						goto failed;
+					break;
+				case XT_RETRY:
+					/* We cannot start from the beginning again, if we have
+					 * already output rows!
+					 * And we need the original search key.
+					 *
+					 * The case in which this occurs is:
+					 *
+					 * T1: UPDATE tbl_file SET GlobalID = 'DBCD5C4514210200825501089884844_6M' WHERE ID = 39
+					 * Locks a particular row.
+					 *
+					 * T2: SELECT ID,Flags FROM tbl_file WHERE SpaceID = 1 AND Path = '/zi/America/' AND 
+					 * Name = 'Cuiaba' AND Flags IN ( 0,1,4,5 ) FOR UPDATE
+					 * scans the index and stops on the lock (of the before image) above.
+					 *
+					 * T1 quits, the sweeper deletes the record updated by T1?!
+					 * BUG: Cleanup should wait until T2 is complete!
+					 *
+					 * T2 continues, and returns XT_RETRY.
+					 *
+					 * At this stage T2 has already returned some rows, so it may not retry from the
+					 * start. Instead it tries to locate the last record it tried to lock.
+					 * This record is gone (or not visible), so it finds the next one.
+					 *
+					 * POTENTIAL BUG: If cleanup does not wait until T2 is complete, then
+					 * I may miss the update record, if it is moved before the index scan
+					 * position.
+					 */
+					if (!pb_ind_row_count && search_key) {
+						/* No rows returned yet, so search again from the start: */
+						if (!xt_idx_search(pb_open_tab, ind, search_key))
+							return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+					}
+					else {
+						if (!xt_idx_research(pb_open_tab, ind))
+							goto failed;
+					}
+					break;
+				default:
+					/* Visible, build the row from the index entry: */
+					if (!xt_idx_read(ot, ind, (xtWord1 *) buf))
+						goto failed;
+					return 0;
+			}
+		}
+	}
+	else {
+		/* We need to read the entire record: */
+		while (ot->ot_curr_rec_id) {
+			if (search_key && !search_key->sk_on_key)
+				break;
+
+			switch (xt_tab_read_record(ot, (xtWord1 *) buf)) {
+				case FALSE:
+					XT_DISABLED_TRACE(("not visi tx=%d rec=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) ot->ot_curr_rec_id));
+					if (xt_idx_next(ot, ind, search_key))
+						break;
+					/* xt_idx_next() failed: fall through to the error exit. */
+				case XT_ERR:
+					goto failed;
+				case XT_NEW:
+					/* A "new" record must pass the extra range check: */
+					if (xt_index_in_range(ot, ind, search_key, buf))
+						return 0;
+					if (!xt_idx_next(ot, ind, search_key))
+						goto failed;
+					break;
+				case XT_RETRY:
+					/* See the comment on XT_RETRY in the key-only loop of this function. */
+					if (!pb_ind_row_count && search_key) {
+						if (!xt_idx_search(pb_open_tab, ind, search_key))
+							return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+					}
+					else {
+						if (!xt_idx_research(pb_open_tab, ind))
+							goto failed;
+					}
+					break;
+				default:
+					XT_DISABLED_TRACE(("visible tx=%d rec=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) ot->ot_curr_rec_id));
+					return 0;
+			}
+		}
+	}
+	return HA_ERR_END_OF_FILE;
+
+	failed:
+	return ha_log_pbxt_thread_error_for_mysql(FALSE);
+}
+
+/*
+ * Starting at the current index position, read backward until a row is
+ * found that can be returned.
+ *
+ * ot         - the open table, holding the current position (ot_curr_rec_id).
+ * ind        - the index being scanned.
+ * key_only   - if TRUE the row is built from the index entry, after a
+ *              visibility check, otherwise the record itself is read.
+ * search_key - optional; the scan stops when sk_on_key is no longer set.
+ * buf        - receives the row.
+ *
+ * Returns 0 if a row was placed in buf, HA_ERR_END_OF_FILE if the scan
+ * ended, or the error returned by ha_log_pbxt_thread_error_for_mysql().
+ */
+int ha_pbxt::xt_index_prev_read(XTOpenTablePtr ot, XTIndexPtr ind, xtBool key_only,
+	register XTIdxSearchKeyPtr search_key, byte *buf)
+{
+	if (key_only) {
+		/* We only need to read the data from the key: */
+		while (ot->ot_curr_rec_id) {
+			if (search_key && !search_key->sk_on_key)
+				break;
+
+			switch (xt_tab_visible(ot)) {
+				case FALSE:
+					/* Not visible, try the previous index entry: */
+					if (xt_idx_prev(ot, ind, search_key))
+						break;
+					/* xt_idx_prev() failed: fall through to the error exit. */
+				case XT_ERR:
+					goto failed;
+				case XT_NEW:
+					/* A "new" record must pass the extra range check: */
+					if (!xt_idx_read(ot, ind, (xtWord1 *) buf))
+						goto failed;
+					if (xt_index_in_range(ot, ind, search_key, buf))
+						return 0;
+					/* Out of range: continue in the direction of the scan.
+					 * Stepping forward here would return to the entry the
+					 * scan has just come from.
+					 */
+					if (!xt_idx_prev(ot, ind, search_key))
+						goto failed;
+					break;
+				case XT_RETRY:
+					if (!pb_ind_row_count && search_key) {
+						/* No rows returned yet, so search again from the start: */
+						if (!xt_idx_search_prev(pb_open_tab, ind, search_key))
+							return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+					}
+					else {
+						if (!xt_idx_research(pb_open_tab, ind))
+							goto failed;
+					}
+					break;
+				default:
+					/* Visible, build the row from the index entry: */
+					if (!xt_idx_read(ot, ind, (xtWord1 *) buf))
+						goto failed;
+					return 0;
+			}
+		}
+	}
+	else {
+		/* We need to read the entire record: */
+		while (ot->ot_curr_rec_id) {
+			if (search_key && !search_key->sk_on_key)
+				break;
+
+			switch (xt_tab_read_record(ot, (xtWord1 *) buf)) {
+				case FALSE:
+					/* Not visible, try the previous index entry: */
+					if (xt_idx_prev(ot, ind, search_key))
+						break;
+					/* xt_idx_prev() failed: fall through to the error exit. */
+				case XT_ERR:
+					goto failed;
+				case XT_NEW:
+					/* A "new" record must pass the extra range check: */
+					if (xt_index_in_range(ot, ind, search_key, buf))
+						return 0;
+					/* Out of range: continue in the direction of the scan. */
+					if (!xt_idx_prev(ot, ind, search_key))
+						goto failed;
+					break;
+				case XT_RETRY:
+					if (!pb_ind_row_count && search_key) {
+						/* No rows returned yet, so search again from the start: */
+						if (!xt_idx_search_prev(pb_open_tab, ind, search_key))
+							return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+					}
+					else {
+						if (!xt_idx_research(pb_open_tab, ind))
+							goto failed;
+					}
+					break;
+				default:
+					return 0;
+			}
+		}
+	}
+	return HA_ERR_END_OF_FILE;
+
+	failed:
+	return ha_log_pbxt_thread_error_for_mysql(FALSE);
+}
+
+/*
+ * Prepare the handler for a scan of index number 'idx'.
+ *
+ * Records the active index, works out how many columns have to be loaded
+ * (ot_cols_req) and, for read-only access, switches to key-only reads when
+ * the index covers every column in the read set.
+ *
+ * Returns 0 on success. If the table's indexes are disabled, active_index
+ * is set to MAX_KEY and the logged PBXT error is returned.
+ */
+int ha_pbxt::index_init(uint idx, bool XT_UNUSED(sorted))
+{
+	XTIndexPtr	ind;
+	XTThreadPtr	thread = pb_open_tab->ot_thread;
+
+	/* "select count(*) from smalltab_PBXT;" ignores the error returned
+	 * below and goes on to call index_first(), so active_index is
+	 * assigned before anything can fail.
+	 */
+	active_index = idx;
+
+	if (pb_open_tab->ot_table->tab_dic.dic_disable_index) {
+		active_index = MAX_KEY;
+		xt_tab_set_index_error(pb_open_tab->ot_table);
+		return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+	}
+
+	/* The number of columns required: */
+	if (!pb_open_tab->ot_is_modify) {
+		pb_open_tab->ot_cols_req = ha_get_max_bit(table->read_set);
+
+		/* Check for index coverage!
+		 *
+		 * Example: a table `customer` with (among other columns)
+		 *
+		 *   PRIMARY KEY (`c_w_id`,`c_d_id`,`c_id`),
+		 *   KEY `c_w_id` (`c_w_id`,`c_d_id`,`c_last`,`c_first`,`c_id`)
+		 *
+		 * MySQL does not recognize index coverage on the following select:
+		 *
+		 *   SELECT c_id FROM customer WHERE c_w_id = 3 AND c_d_id = 8 AND
+		 *   c_last = 'EINGATIONANTI' ORDER BY c_first ASC LIMIT 1;
+		 *
+		 * so coverage is detected here by testing whether the read set is
+		 * a subset of the index column map.
+		 *
+		 * TODO: Find out why this is necessary, MyISAM does not
+		 * seem to have this problem!
+		 */
+		ind = (XTIndexPtr) pb_share->sh_dic_keys[idx];
+		if (MX_BIT_IS_SUBSET(table->read_set, &ind->mi_col_map))
+			pb_key_read = TRUE;
+#ifdef XT_PRINT_INDEX_OPT
+		printf("index_init %s index %d cols req=%d/%d read_bits=%X write_bits=%X index_bits=%X converage=%d\n", pb_open_tab->ot_table->tab_name->ps_path, (int) idx, pb_open_tab->ot_cols_req, table->read_set->MX_BIT_SIZE(), (int) *table->read_set->bitmap, (int) *table->write_set->bitmap, (int) *ind->mi_col_map.bitmap, (int) (MX_BIT_IS_SUBSET(table->read_set, &ind->mi_col_map) != 0));
+#endif
+	}
+	else {
+		/* Modifying statement: request the full width of the read set. */
+		pb_open_tab->ot_cols_req = table->read_set->MX_BIT_SIZE();
+#ifdef XT_PRINT_INDEX_OPT
+		ind = (XTIndexPtr) pb_share->sh_dic_keys[idx];
+
+		printf("index_init %s index %d cols req=%d/%d read_bits=%X write_bits=%X index_bits=%X\n", pb_open_tab->ot_table->tab_name->ps_path, (int) idx, pb_open_tab->ot_cols_req, pb_open_tab->ot_cols_req, (int) *table->read_set->bitmap, (int) *table->write_set->bitmap, (int) *ind->mi_col_map.bitmap);
+#endif
+		/* {START-STAT-HACK} This used to be the position of the start
+		 * statement hack. The comment that belonged to it:
+		 *
+		 * Start a statement based transaction as soon
+		 * as a read is done for a modify type statement!
+		 * Previously, this was done too late!
+		 */
+	}
+
+	xt_xlog_check_long_writer(thread);
+
+	/* 'thread' is pb_open_tab->ot_thread, read at the top. */
+	thread->st_statistics.st_scan_index++;
+	return 0;
+}
+
+/*
+ * Finish an index scan.
+ *
+ * Releases the index read handle (if one is held), makes the lock on the
+ * last scanned row permanent and resets active_index to MAX_KEY.
+ * Always returns 0.
+ *
+ * The original code tested pb_open_tab for NULL only after it had already
+ * been dereferenced twice; the guard now covers every use, so the call is
+ * harmless when no table is open.
+ */
+int ha_pbxt::index_end()
+{
+	int err = 0;
+
+	XT_TRACE_METHOD();
+
+	/*
+	 * The assertion below does not always hold, because sometimes the
+	 * handler is unlocked before this function is called.
+	 */
+	/*ASSERT_NS(pb_ex_in_use);*/
+
+	if (pb_open_tab) {
+		XTThreadPtr thread = pb_open_tab->ot_thread;
+
+		if (pb_open_tab->ot_ind_rhandle) {
+			xt_ind_release_handle(pb_open_tab->ot_ind_rhandle, FALSE, thread);
+			pb_open_tab->ot_ind_rhandle = NULL;
+		}
+
+		/*
+		 * make permanent the lock for the last scanned row
+		 */
+		pb_open_tab->ot_table->tab_locks.xt_make_lock_permanent(pb_open_tab, &thread->st_lock_list);
+
+		xt_xlog_check_long_writer(thread);
+	}
+
+	active_index = MAX_KEY;
+	XT_RETURN(err);
+}
+
+#ifdef XT_TRACK_RETURNED_ROWS
+/*
+ * Debug helper (XT_TRACK_RETURNED_ROWS builds only): trace the start of a
+ * scan on 'index' and clear the bookkeeping of rows returned so far.
+ */
+void ha_start_scan(XTOpenTablePtr ot, u_int index)
+{
+	u_int slot = 0;
+
+	xt_ttracef(ot->ot_thread, "SCAN %d:%d\n", (int) ot->ot_table->tab_id, (int) index);
+	ot->ot_rows_ret_curr = 0;
+
+	/* Forget every row recorded by the previous scan. */
+	while (slot < ot->ot_rows_ret_max) {
+		ot->ot_rows_returned[slot] = 0;
+		slot++;
+	}
+}
+
+/*
+ * Debug helper (XT_TRACK_RETURNED_ROWS builds only): record that the
+ * current row of 'ot' is being returned by a scan on 'index', and report
+ * the row if it was already returned during this scan (or if its row or
+ * record ID is zero).
+ */
+void ha_return_row(XTOpenTablePtr ot, u_int index)
+{
+	xt_ttracef(ot->ot_thread, "%d:%d ROW=%d:%d\n",
+		(int) ot->ot_table->tab_id, (int) index, (int) ot->ot_curr_row_id, (int) ot->ot_curr_rec_id);
+	ot->ot_rows_ret_curr++;
+
+	/* Grow the tracking array so that ot_curr_row_id is a valid slot;
+	 * new slots are zeroed.
+	 */
+	if (ot->ot_curr_row_id >= ot->ot_rows_ret_max) {
+		if (!xt_realloc_ns((void **) &ot->ot_rows_returned, (ot->ot_curr_row_id+1) * sizeof(xtRecordID)))
+			ASSERT_NS(FALSE);
+		memset(&ot->ot_rows_returned[ot->ot_rows_ret_max], 0, (ot->ot_curr_row_id+1 - ot->ot_rows_ret_max) * sizeof(xtRecordID));
+		ot->ot_rows_ret_max = ot->ot_curr_row_id+1;
+	}
+
+	if (ot->ot_curr_row_id && ot->ot_curr_rec_id && !ot->ot_rows_returned[ot->ot_curr_row_id]) {
+		/* First time this row is seen in the scan: remember it. */
+		ot->ot_rows_returned[ot->ot_curr_row_id] = ot->ot_curr_rec_id;
+	}
+	else {
+		char *sql = *thd_query(current_thd);
+
+		xt_ttracef(ot->ot_thread, "DUP %d:%d %s\n",
+			(int) ot->ot_table->tab_id, (int) index, *thd_query(current_thd));
+		xt_dump_trace();
+		printf("ERROR: row=%d rec=%d newr=%d, already returned!\n", (int) ot->ot_curr_row_id, (int) ot->ot_rows_returned[ot->ot_curr_row_id], (int) ot->ot_curr_rec_id);
+		printf("ERROR: %s\n", sql);
+#ifdef XT_WIN
+		FatalAppExit(0, "Debug Me!");
+#endif
+	}
+}
+#endif
+
+/*
+ * Common implementation of the keyed index reads.
+ *
+ * Positions the cursor of index 'idx' according to 'key'/'key_len' and
+ * 'find_flag', then reads the first qualifying row into 'buf'. The
+ * "previous"-style flags search and read backwards; all other flags
+ * search and read forwards.
+ *
+ * Returns 0 on success (table->status = 0) or an error code
+ * (table->status = STATUS_NOT_FOUND). HA_ERR_WRONG_INDEX is returned when
+ * 'idx' is MAX_KEY.
+ */
+int ha_pbxt::index_read_xt(byte * buf, uint idx, const byte *key, uint key_len, enum ha_rkey_function find_flag)
+{
+	int					result = 0;
+	int					prefix_flag = 0;
+	int					search_flags;
+	XTIndexPtr			index;
+	XTIdxSearchKeyRec	skey;
+	XTIdxSearchKeyPtr	bound;
+
+	if (idx == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+#ifdef XT_TRACK_RETURNED_ROWS
+		ha_start_scan(pb_open_tab, idx);
+#endif
+
+		/* This call starts a search on this handler! */
+		pb_ind_row_count = 0;
+
+		ASSERT_NS(pb_ex_in_use);
+
+		XT_PRINT1(pb_open_tab->ot_thread, "index_read_xt (%s)\n", pb_share->sh_table_path->ps_path);
+		XT_DISABLED_TRACE(("search tx=%d val=%d update=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(key), pb_modified));
+		index = (XTIndexPtr) pb_share->sh_dic_keys[idx];
+
+		switch (find_flag) {
+			case HA_READ_PREFIX_LAST:
+			case HA_READ_PREFIX_LAST_OR_PREV:
+				prefix_flag = SEARCH_PREFIX;
+				/* fall through */
+			case HA_READ_BEFORE_KEY:
+			case HA_READ_KEY_OR_PREV:
+				/* I assume you want to be positioned on the last entry
+				 * in the key duplicate list!!
+				 */
+				search_flags = (find_flag == HA_READ_BEFORE_KEY) ? 0 : XT_SEARCH_AFTER_KEY;
+				xt_idx_prep_key(index, &skey, search_flags | prefix_flag, (xtWord1 *) key, (size_t) key_len);
+				if (xt_idx_search_prev(pb_open_tab, index, &skey)) {
+					/* Only HA_READ_PREFIX_LAST keeps the scan on the key. */
+					bound = (find_flag == HA_READ_PREFIX_LAST) ? &skey : NULL;
+					result = xt_index_prev_read(pb_open_tab, index, pb_key_read, bound, buf);
+				}
+				else
+					result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+				break;
+			case HA_READ_PREFIX:
+				prefix_flag = SEARCH_PREFIX;
+				/* fall through */
+			case HA_READ_KEY_EXACT:
+			case HA_READ_KEY_OR_NEXT:
+			case HA_READ_AFTER_KEY:
+			default:
+				search_flags = (find_flag == HA_READ_AFTER_KEY) ? XT_SEARCH_AFTER_KEY : 0;
+				xt_idx_prep_key(index, &skey, search_flags | prefix_flag, (xtWord1 *) key, key_len);
+				if (xt_idx_search(pb_open_tab, index, &skey)) {
+					/* Exact and prefix reads keep the scan on the key. */
+					bound = (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX) ? &skey : NULL;
+					result = xt_index_next_read(pb_open_tab, index, pb_key_read, bound, buf);
+					if (result == HA_ERR_END_OF_FILE && find_flag == HA_READ_AFTER_KEY)
+						result = HA_ERR_KEY_NOT_FOUND;
+				}
+				else
+					result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+				break;
+		}
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, idx);
+#endif
+		XT_DISABLED_TRACE(("search tx=%d val=%d err=%d\n", (int) pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id, (int) XT_GET_DISK_4(key), result));
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	return result;
+}
+
+/*
+ * Positions an index cursor to the index specified in the handle. Fetches the
+ * row if available. If the key value is null, begin at the first key of the
+ * index.
+ */
+/* Keyed read on the currently active index; forwards to index_read_xt(). */
+int ha_pbxt::index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag)
+{
+	int result;
+
+	/* Statistics counting is disabled:
+	 * statistic_increment(ha_read_key_count,&LOCK_status);
+	 */
+	result = index_read_xt(buf, active_index, key, key_len, find_flag);
+	return result;
+}
+
+/* Keyed read on an explicitly given index 'idx'; forwards to index_read_xt(). */
+int ha_pbxt::index_read_idx(byte * buf, uint idx, const byte *key, uint key_len, enum ha_rkey_function find_flag)
+{
+	int result;
+
+	/* Statistics counting is disabled:
+	 * statistic_increment(ha_read_key_count,&LOCK_status);
+	 */
+	result = index_read_xt(buf, idx, key, key_len, find_flag);
+	return result;
+}
+
+/*
+ * Keyed read on the active index using HA_READ_PREFIX_LAST; forwards to
+ * index_read_xt().
+ */
+int ha_pbxt::index_read_last(byte * buf, const byte * key, uint key_len)
+{
+	int result;
+
+	/* Statistics counting is disabled:
+	 * statistic_increment(ha_read_key_count,&LOCK_status);
+	 */
+	result = index_read_xt(buf, active_index, key, key_len, HA_READ_PREFIX_LAST);
+	return result;
+}
+
+/*
+ * Used to read forward through the index.
+ */
+/*
+ * Advance the cursor of the active index by one entry and read the next
+ * qualifying row into 'buf'.
+ *
+ * Returns 0 on success (table->status = 0), otherwise an error code
+ * (table->status = STATUS_NOT_FOUND); HA_ERR_WRONG_INDEX if no index is
+ * active.
+ */
+int ha_pbxt::index_next(byte * buf)
+{
+	int			result = 0;
+	XTIndexPtr	index;
+
+	XT_TRACE_METHOD();
+	//statistic_increment(ha_read_next_count,&LOCK_status);
+	ASSERT_NS(pb_ex_in_use);
+
+	if (active_index == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+		index = (XTIndexPtr) pb_share->sh_dic_keys[active_index];
+
+		if (xt_idx_next(pb_open_tab, index, NULL))
+			result = xt_index_next_read(pb_open_tab, index, pb_key_read, NULL, buf);
+		else
+			result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, active_index);
+#endif
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * I have implemented this because there is currently a
+ * bug in handler::index_next_same().
+ *
+ * drop table if exists t1;
+ * CREATE TABLE t1 (a int, b int, primary key(a,b))
+ * PARTITION BY KEY(b,a) PARTITIONS 2;
+ * insert into t1 values (0,0),(1,1),(2,2),(3,3),(4,4),(5,5),(6,6);
+ * select * from t1 where a = 4;
+ * 
+ */
+/*
+ * Read the next row of the active index that still matches 'key'.
+ *
+ * A search key is built by hand from 'key'/'length' and marked as being
+ * "on key", then the cursor is advanced and the next qualifying row is
+ * read into 'buf'.
+ *
+ * Returns 0 on success (table->status = 0), otherwise an error code
+ * (table->status = STATUS_NOT_FOUND); HA_ERR_WRONG_INDEX if no index is
+ * active.
+ */
+int ha_pbxt::index_next_same(byte * buf, const byte *key, uint length)
+{
+	int					result = 0;
+	XTIndexPtr			index;
+	XTIdxSearchKeyRec	skey;
+
+	XT_TRACE_METHOD();
+	//statistic_increment(ha_read_next_count,&LOCK_status);
+	ASSERT_NS(pb_ex_in_use);
+
+	if (active_index == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+		index = (XTIndexPtr) pb_share->sh_dic_keys[active_index];
+
+		/* Build the search key in place.
+		 * NOTE(review): sv_flags is given a MySQL ha_rkey_function value
+		 * here, while the other callers in this file pass XT_SEARCH_*
+		 * flags through xt_idx_prep_key() - confirm this is intended.
+		 */
+		skey.sk_key_value.sv_flags = HA_READ_KEY_EXACT;
+		skey.sk_key_value.sv_rec_id = 0;
+		skey.sk_key_value.sv_row_id = 0;
+		skey.sk_key_value.sv_key = skey.sk_key_buf;
+		skey.sk_key_value.sv_length = myxt_create_key_from_key(index, skey.sk_key_buf, (xtWord1 *) key, (u_int) length);
+		skey.sk_on_key = TRUE;
+
+		if (xt_idx_next(pb_open_tab, index, &skey))
+			result = xt_index_next_read(pb_open_tab, index, pb_key_read, &skey, buf);
+		else
+			result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, active_index);
+#endif
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * Used to read backwards through the index.
+ */
+/*
+ * Move the cursor of the active index back by one entry and read the
+ * previous qualifying row into 'buf'.
+ *
+ * Returns 0 on success (table->status = 0), otherwise an error code
+ * (table->status = STATUS_NOT_FOUND); HA_ERR_WRONG_INDEX if no index is
+ * active.
+ */
+int ha_pbxt::index_prev(byte * buf)
+{
+	int			result = 0;
+	XTIndexPtr	index;
+
+	XT_TRACE_METHOD();
+	//statistic_increment(ha_read_prev_count,&LOCK_status);
+	ASSERT_NS(pb_ex_in_use);
+
+	if (active_index == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+		index = (XTIndexPtr) pb_share->sh_dic_keys[active_index];
+
+		if (xt_idx_prev(pb_open_tab, index, NULL))
+			result = xt_index_prev_read(pb_open_tab, index, pb_key_read, NULL, buf);
+		else
+			result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, active_index);
+#endif
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * index_first() asks for the first key in the index.
+ */
+/*
+ * Position the active index on its first entry and read the first
+ * qualifying row into 'buf'. Starts a new scan (pb_ind_row_count is reset).
+ *
+ * Returns 0 on success (table->status = 0), otherwise an error code
+ * (table->status = STATUS_NOT_FOUND); HA_ERR_WRONG_INDEX if no index is
+ * active.
+ */
+int ha_pbxt::index_first(byte * buf)
+{
+	int					result = 0;
+	XTIndexPtr			index;
+	XTIdxSearchKeyRec	skey;
+
+	XT_TRACE_METHOD();
+	//statistic_increment(ha_read_first_count,&LOCK_status);
+	ASSERT_NS(pb_ex_in_use);
+
+	/* The MAX_KEY test is required because MySQL sometimes ignores the
+	 * error returned by index_init, for example:
+	 *
+	 *   if (!table->file->inited)
+	 *     table->file->ha_index_init(tab->index, tab->sorted);
+	 *   if ((error=tab->table->file->index_first(tab->table->record[0])))
+	 */
+	if (active_index == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+#ifdef XT_TRACK_RETURNED_ROWS
+		ha_start_scan(pb_open_tab, active_index);
+#endif
+		pb_ind_row_count = 0;
+
+		index = (XTIndexPtr) pb_share->sh_dic_keys[active_index];
+
+		xt_idx_prep_key(index, &skey, XT_SEARCH_FIRST_FLAG, NULL, 0);
+		if (xt_idx_search(pb_open_tab, index, &skey))
+			result = xt_index_next_read(pb_open_tab, index, pb_key_read, NULL, buf);
+		else
+			result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, active_index);
+#endif
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * index_last() asks for the last key in the index.
+ */
+/*
+ * Position the active index after its last entry and read backwards to
+ * the last qualifying row, which is placed in 'buf'. Starts a new scan
+ * (pb_ind_row_count is reset).
+ *
+ * Returns 0 on success (table->status = 0), otherwise an error code
+ * (table->status = STATUS_NOT_FOUND); HA_ERR_WRONG_INDEX if no index is
+ * active.
+ */
+int ha_pbxt::index_last(byte * buf)
+{
+	int					result = 0;
+	XTIndexPtr			index;
+	XTIdxSearchKeyRec	skey;
+
+	XT_TRACE_METHOD();
+	//statistic_increment(ha_read_last_count,&LOCK_status);
+	ASSERT_NS(pb_ex_in_use);
+
+	if (active_index == MAX_KEY)
+		result = HA_ERR_WRONG_INDEX;
+	else {
+#ifdef XT_TRACK_RETURNED_ROWS
+		ha_start_scan(pb_open_tab, active_index);
+#endif
+		pb_ind_row_count = 0;
+
+		index = (XTIndexPtr) pb_share->sh_dic_keys[active_index];
+
+		xt_idx_prep_key(index, &skey, XT_SEARCH_AFTER_LAST_FLAG, NULL, 0);
+		if (xt_idx_search_prev(pb_open_tab, index, &skey))
+			result = xt_index_prev_read(pb_open_tab, index, pb_key_read, NULL, buf);
+		else
+			result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+		pb_ind_row_count++;
+#ifdef XT_TRACK_RETURNED_ROWS
+		if (!result)
+			ha_return_row(pb_open_tab, active_index);
+#endif
+	}
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * RANDOM/SEQUENTIAL READ METHODS
+ */
+ 
+/*
+ * rnd_init() is called when the system wants the storage engine to do a table
+ * scan.
+ * See the example in the introduction at the top of this file to see when
+ * rnd_init() is called.
+ *
+ * Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, sql_table.cc,
+ * and sql_update.cc.
+ */
+/*
+ * Prepare the handler for a table scan (scan == true) or for positioned
+ * reads (scan == false).
+ *
+ * Any previous sequential scan state is released first, then the number
+ * of required columns (ot_cols_req) is computed.
+ *
+ * Returns 0 on success, or the logged PBXT error if the sequential scan
+ * cannot be initialised.
+ */
+int ha_pbxt::rnd_init(bool scan)
+{
+	int			result = 0;
+	XTThreadPtr	thread = pb_open_tab->ot_thread;
+
+	XT_PRINT1(thread, "rnd_init (%s)\n", pb_share->sh_table_path->ps_path);
+	XT_DISABLED_TRACE(("seq scan tx=%d\n", (int) thread->st_xact_data->xd_start_xn_id));
+
+	/* Call xt_tab_seq_exit() to make sure the resources used by the
+	 * previous scan are freed. In particular make sure the cache page ref
+	 * count is decremented. This is needed because rnd_init() can be
+	 * called multiple times without matching calls to rnd_end(). Our
+	 * experience is that this currently happens in queries like:
+	 *
+	 * SELECT t1.c1,t2.c1 FROM t1 LEFT JOIN t2 USING (c1);
+	 * UPDATE t1 LEFT JOIN t2 USING (c1) SET t1.c1 = t2.c1 WHERE t1.c1 = t2.c1;
+	 *
+	 * when scanning inner tables. It is important to understand that in
+	 * such a case multiple calls to rnd_init() are not semantically equal
+	 * to a new query. For example, we cannot make row locks permanent as
+	 * we do in rnd_end(), because ha_pbxt::unlock_row can still be called.
+	 */
+	xt_tab_seq_exit(pb_open_tab);
+
+	/* The number of columns required: */
+	if (!pb_open_tab->ot_is_modify) {
+		pb_open_tab->ot_cols_req = ha_get_max_bit(table->read_set);
+
+		/*
+		 * For queries like SELECT COUNT(*) FROM t, table->read_set is
+		 * empty. On the other hand, ot_cols_req == 0 can be treated as
+		 * "all columns" by some internal code (see e.g. myxt_load_row),
+		 * which makes such queries very inefficient for records with an
+		 * extended part. Setting the column count to 1 makes sure that
+		 * the extended part will not be accessed in most cases.
+		 */
+		if (pb_open_tab->ot_cols_req == 0)
+			pb_open_tab->ot_cols_req = 1;
+	}
+	else {
+		pb_open_tab->ot_cols_req = table->read_set->MX_BIT_SIZE();
+		/* {START-STAT-HACK} This used to be the position of the start
+		 * statement hack. The comment that belonged to it:
+		 *
+		 * Start a statement based transaction as soon
+		 * as a read is done for a modify type statement!
+		 * Previously, this was done too late!
+		 */
+	}
+
+	ASSERT_NS(pb_ex_in_use);
+	if (!scan)
+		xt_tab_seq_reset(pb_open_tab);
+	else if (!xt_tab_seq_init(pb_open_tab))
+		result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+	xt_xlog_check_long_writer(thread);
+
+	return result;
+}
+
+/*
+ * Finish a table scan.
+ *
+ * Makes the lock on the last scanned row permanent and releases the
+ * sequential scan state. Always returns 0.
+ *
+ * The original code read pb_open_tab->ot_thread before testing
+ * pb_open_tab for NULL; the guard now covers every use, so the call is
+ * harmless when no table is open.
+ */
+int ha_pbxt::rnd_end()
+{
+	XT_TRACE_METHOD();
+
+	if (pb_open_tab) {
+		XTThreadPtr thread = pb_open_tab->ot_thread;
+
+		/*
+		 * make permanent the lock for the last scanned row
+		 */
+		pb_open_tab->ot_table->tab_locks.xt_make_lock_permanent(pb_open_tab, &thread->st_lock_list);
+
+		xt_xlog_check_long_writer(thread);
+
+		xt_tab_seq_exit(pb_open_tab);
+	}
+	XT_RETURN(0);
+}
+
+/*
+ * This is called for each row of the table scan. When you run out of records
+ * you should return HA_ERR_END_OF_FILE. Fill buff up with the row information.
+ * The Field structure for the table is the key to getting data into buf
+ * in a manner that will allow the server to understand it.
+ *
+ * Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, sql_table.cc,
+ * and sql_update.cc.
+ */
+/*
+ * Fetch the next row of a table scan into 'buf'.
+ *
+ * Returns 0 on success (table->status = 0), HA_ERR_END_OF_FILE when the
+ * scan reports end-of-file, or the logged PBXT error; in both failure
+ * cases table->status is set to STATUS_NOT_FOUND.
+ */
+int ha_pbxt::rnd_next(byte *buf)
+{
+	int		result = 0;
+	xtBool	at_eof;
+
+	XT_TRACE_METHOD();
+	ASSERT_NS(pb_ex_in_use);
+	//statistic_increment(ha_read_rnd_next_count, &LOCK_status);
+	xt_xlog_check_long_writer(pb_open_tab->ot_thread);
+
+	if (xt_tab_seq_next(pb_open_tab, (xtWord1 *) buf, &at_eof)) {
+		if (at_eof)
+			result = HA_ERR_END_OF_FILE;
+	}
+	else
+		result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * position() is called after each call to rnd_next() if the data needs
+ * to be ordered. You can do something like the following to store
+ * the position:
+ * ha_store_ptr(ref, ref_length, current_position);
+ *
+ * The server uses ref to store data. ref_length in the above case is
+ * the size needed to store current_position. ref is just a byte array
+ * that the server will maintain. If you are using offsets to mark rows, then
+ * current_position should be the offset. If it is a primary key like in
+ * BDB, then it needs to be a primary key.
+ *
+ * Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc.
+ */
+/*
+ * Store the position of the current row in 'ref': the current record ID
+ * (ot_curr_rec_id) written as a 4-byte value with mi_int4store().
+ * The 'record' argument is not used.
+ */
+void ha_pbxt::position(const byte *XT_UNUSED(record))
+{
+	xtWord1 *dest;
+
+	XT_TRACE_METHOD();
+	ASSERT_NS(pb_ex_in_use);
+	/*
+	 * The reference was changed from little endian to big endian.
+	 *
+	 * The reason is that the references are sometimes sorted, and when
+	 * they are sorted a binary compare is used. A binary compare sorts
+	 * big endian values correctly!
+	 *
+	 * Take the following example:
+	 *
+	 * create table t1 (a int, b text);
+	 * insert into t1 values (1, 'aa'), (1, 'bb'), (1, 'cc');
+	 * select group_concat(b) from t1 group by a;
+	 *
+	 * The original comment listed the results as:
+	 *   little endian pointers: aa,bb,cc
+	 *   big endian pointers:    aa,cc,bb
+	 *
+	 * NOTE(review): these two labels look swapped relative to the
+	 * explanation above - confirm which ordering belongs to which format.
+	 */
+	(void) ASSERT_NS(XT_RECORD_OFFS_SIZE == 4);
+	dest = (xtWord1 *) ref;
+	mi_int4store(dest, pb_open_tab->ot_curr_rec_id);
+	XT_RETURN_VOID;
+}
+
+/*
+ * Given the #ROWID retrieve the record.
+ *
+ * Called from filesort.cc records.cc sql_insert.cc sql_select.cc sql_update.cc.
+ */
+/*
+ * Read the row whose position was saved earlier by position().
+ *
+ * 'pos' holds the 4-byte record ID; it becomes the current record ID and
+ * the record is loaded into 'buf' with xt_tab_dirty_read_record().
+ *
+ * Returns 0 on success (table->status = 0), otherwise the logged PBXT
+ * error (table->status = STATUS_NOT_FOUND).
+ */
+int ha_pbxt::rnd_pos(byte * buf, byte *pos)
+{
+	int result = 0;
+
+	XT_TRACE_METHOD();
+	ASSERT_NS(pb_ex_in_use);
+	//statistic_increment(ha_read_rnd_count, &LOCK_status);
+	XT_PRINT1(pb_open_tab->ot_thread, "rnd_pos (%s)\n", pb_share->sh_table_path->ps_path);
+
+	pb_open_tab->ot_curr_rec_id = mi_uint4korr((xtWord1 *) pos);
+
+	/* Only a FALSE result is an error; every other value is accepted. */
+	if (xt_tab_dirty_read_record(pb_open_tab, (xtWord1 *) buf) == FALSE)
+		result = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+
+	if (!result) {
+		pb_open_tab->ot_thread->st_statistics.st_row_select++;
+		table->status = 0;
+	}
+	else
+		table->status = STATUS_NOT_FOUND;
+	XT_RETURN(result);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INFO METHODS
+ */
+ 
+/*
+	::info() is used to return information to the optimizer.
+	Currently this table handler doesn't implement most of the fields
+	really needed. SHOW also makes use of this data
+	Another note, you will probably want to have the following in your
+	code:
+	if (records < 2)
+		records = 2;
+	The reason is that the server will optimize for cases of only a single
+	record. If in a table scan you don't know the number of records
+	it will probably be better to set records to two so you can return
+	as many records as you need.
+	Along with records a few more variables you may wish to set are:
+		records
+		deleted
+		data_file_length
+		index_file_length
+		delete_length
+		check_time
+	Take a look at the public variables in handler.h for more information.
+
+	Called in:
+		filesort.cc
+		ha_heap.cc
+		item_sum.cc
+		opt_sum.cc
+		sql_delete.cc
+		sql_delete.cc
+		sql_derived.cc
+		sql_select.cc
+		sql_select.cc
+		sql_select.cc
+		sql_select.cc
+		sql_select.cc
+		sql_show.cc
+		sql_show.cc
+		sql_show.cc
+		sql_show.cc
+		sql_table.cc
+		sql_union.cc
+		sql_update.cc
+
+*/
+/*
+ * Fill in the handler statistics requested by 'flag':
+ *
+ *   HA_STATUS_VARIABLE  row counts, file lengths and mean record length
+ *   HA_STATUS_CONST     size limits, ref_length, block size and the
+ *                       rec_per_key estimates of every key
+ *   HA_STATUS_ERRKEY    errkey, taken from the open table
+ *   HA_STATUS_AUTO      the next auto-increment value
+ *
+ * If the handler is not already marked as in use (pb_ex_in_use), it is
+ * marked for the duration of the call and unmarked again at the end.
+ *
+ * The return type depends on the MySQL version: void before 5.1.14,
+ * int (0 on success) from 5.1.14 on.
+ */
+#if MYSQL_VERSION_ID < 50114
+void ha_pbxt::info(uint flag)
+#else
+int ha_pbxt::info(uint flag)
+#endif
+{
+	XTOpenTablePtr	ot;
+	int				in_use;
+
+	XT_TRACE_METHOD();
+	
+	/* Remember whether the caller already held the handler, so that the
+	 * in-use mark is only cleared below if it was set here.
+	 */
+	if (!(in_use = pb_ex_in_use)) {
+		pb_ex_in_use = 1;
+		if (pb_share && pb_share->sh_table_lock) {
+			/* If some thread has an exclusive lock, then
+			 * we wait for the lock to be removed:
+			 */
+#if MYSQL_VERSION_ID < 50114
+			ha_wait_for_shared_use(this, pb_share);
+			pb_ex_in_use = 1;
+#else
+			/* NOTE(review): this early return skips the code at the end
+			 * of the function that clears pb_ex_in_use - confirm that
+			 * ha_wait_for_shared_use() leaves the flag cleared on failure.
+			 */
+			if (!ha_wait_for_shared_use(this, pb_share))
+				return ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+#endif
+		}
+	}
+
+	if ((ot = pb_open_tab)) {
+		if (flag & HA_STATUS_VARIABLE) {
+				register XTTableHPtr tab = ot->ot_table;
+
+			/* {FREE-ROWS-BAD}
+			 * Free row count is not reliable, so ignore it.
+			 * The problem is if tab_row_fnum > tab_row_eof_id - 1 then
+			 * we have a very bad result.
+			 *
+			 * If stats.records+EXTRA_RECORDS == 0 as returned by 
+			 * estimate_rows_upper_bound(), then filesort will crash here:
+			 *
+			 * make_sortkey(param,sort_keys[idx++],ref_pos);
+			 * 
+			 * #0	0x000bf69c in Field_long::sort_string at field.cc:3766
+			 * #1	0x0022e1f1 in make_sortkey at filesort.cc:769
+			 * #2	0x0022f1cf in find_all_keys at filesort.cc:619
+			 * #3	0x00230eec in filesort at filesort.cc:243
+			 * #4	0x001b9d89 in mysql_update at sql_update.cc:415
+			 * #5	0x0010db12 in mysql_execute_command at sql_parse.cc:2959
+			 * #6	0x0011480d in mysql_parse at sql_parse.cc:5787
+			 * #7	0x00115afb in dispatch_command at sql_parse.cc:1200
+			 * #8	0x00116de2 in do_command at sql_parse.cc:857
+			 * #9	0x00101ee4 in handle_one_connection at sql_connect.cc:1115
+			 *
+			 * The problem is that sort_keys is allocated to handle just 1 vector.
+			 * Sorting one vector crashes. Although I could not find a check for
+			 * the actual number of vectors. But it must assume that it has at
+			 * least EXTRA_RECORDS vectors.
+			 */
+#ifdef XT_ROW_COUNT_CORRECTED
+			if (tab->tab_row_eof_id <= tab->tab_row_fnum ||
+				(!tab->tab_row_free_id && tab->tab_row_fnum))
+				xt_tab_check_free_lists(NULL, ot, false, true);
+			stats.records = (ha_rows) tab->tab_row_eof_id - 1;
+			if (stats.records >= tab->tab_row_fnum) {
+				stats.deleted = tab->tab_row_fnum;
+				stats.records -= stats.deleted;
+			}
+			else {
+				stats.deleted = 0;
+				stats.records = 2;
+			}
+#else
+			/* Free rows are ignored: every row ID below the EOF ID is
+			 * counted as a record.
+			 */
+			stats.deleted = /* tab->tab_row_fnum */ 0;
+			stats.records = (ha_rows) (tab->tab_row_eof_id - 1 /* - stats.deleted */);
+#endif
+			stats.data_file_length = xt_rec_id_to_rec_offset(tab, tab->tab_rec_eof_id);
+			stats.index_file_length = xt_ind_node_to_offset(tab, tab->tab_ind_eof);
+			stats.delete_length = tab->tab_rec_fnum * ot->ot_rec_size;
+			//check_time = info.check_time;
+			stats.mean_rec_length = (ulong) ot->ot_rec_size;
+		}
+
+		if (flag & HA_STATUS_CONST) {
+			ha_rows		rec_per_key;
+			XTIndexPtr	ind;
+			TABLE_SHARE	*share= TS(table);
+
+			stats.max_data_file_length = 0x00FFFFFF;
+			stats.max_index_file_length = 0x00FFFFFF;
+			//stats.create_time = info.create_time;
+			ref_length = XT_RECORD_OFFS_SIZE;
+			//share->db_options_in_use = info.options;
+			stats.block_size = XT_INDEX_PAGE_SIZE;
+
+			/* The table share is only locked for non-temporary tables.
+			 * The 'if' below controls exactly one statement: whichever
+			 * lock call the preprocessor selects.
+			 */
+			if (share->tmp_table == NO_TMP_TABLE)
+#ifdef DRIZZLED
+#define WHICH_MUTEX			mutex
+#elif MYSQL_VERSION_ID >= 50404
+#define WHICH_MUTEX			LOCK_ha_data
+#else
+#define WHICH_MUTEX			mutex
+#endif
+
+#ifdef SAFE_MUTEX
+
+#if MYSQL_VERSION_ID < 50404
+#if MYSQL_VERSION_ID < 50123
+				safe_mutex_lock(&share->mutex,__FILE__,__LINE__);
+#else
+				safe_mutex_lock(&share->mutex,0,__FILE__,__LINE__);
+#endif
+#else
+				safe_mutex_lock(&share->WHICH_MUTEX,0,__FILE__,__LINE__);
+#endif
+
+#else // SAFE_MUTEX
+
+#ifdef MY_PTHREAD_FASTMUTEX
+				my_pthread_fastmutex_lock(&share->WHICH_MUTEX);
+#else
+				pthread_mutex_lock(&share->WHICH_MUTEX);
+#endif
+
+#endif // SAFE_MUTEX
+#ifdef DRIZZLED
+			set_prefix(share->keys_in_use, share->keys);
+			share->keys_for_keyread&= share->keys_in_use;
+#else
+			share->keys_in_use.set_prefix(share->keys);
+			//share->keys_in_use.intersect_extended(info.key_map);
+			share->keys_for_keyread.intersect(share->keys_in_use);
+			//share->db_record_offset = info.record_offset;
+#endif
+			/* Every key part of every key gets a rec_per_key estimate
+			 * of 1: both branches below assign the same value.
+			 */
+			for (u_int i = 0; i < share->keys; i++) {
+				ind = pb_share->sh_dic_keys[i];
+
+				rec_per_key = 0;
+				if (ind->mi_seg_count == 1 && (ind->mi_flags & HA_NOSAME))
+					rec_per_key = 1;
+				else {
+					rec_per_key = 1;	
+				}
+				for (u_int j = 0; j < table->key_info[i].key_parts; j++)
+	 				table->key_info[i].rec_per_key[j] = (ulong) rec_per_key;
+			}
+			/* Matching unlock; again exactly one statement is selected. */
+			if (share->tmp_table == NO_TMP_TABLE)
+#ifdef SAFE_MUTEX
+				safe_mutex_unlock(&share->WHICH_MUTEX,__FILE__,__LINE__);
+#else
+#ifdef MY_PTHREAD_FASTMUTEX
+				pthread_mutex_unlock(&share->WHICH_MUTEX.mutex);
+#else
+				pthread_mutex_unlock(&share->WHICH_MUTEX);
+#endif
+#endif
+	  		/*
+			 Set data_file_name and index_file_name to point at the symlink value
+			 if table is symlinked (Ie;  Real name is not same as generated name)
+	   		*/
+	   		/*
+			data_file_name = index_file_name = 0;
+			fn_format(name_buff, file->filename, "", MI_NAME_DEXT, 2);
+			if (strcmp(name_buff, info.data_file_name))
+				data_file_name = info.data_file_name;
+			strmov(fn_ext(name_buff), MI_NAME_IEXT);
+			if (strcmp(name_buff, info.index_file_name))
+				index_file_name = info.index_file_name;
+			*/
+		}
+
+ 		if (flag & HA_STATUS_ERRKEY)
+	 		errkey = ot->ot_err_index_no;
+
+		/* {PRE-INC}
+		 * We assume they want the next value to be returned!
+		 *
+		 * At least, this is what works for the following code:
+		 *
+		 * create table t1 (a int auto_increment primary key)
+		 * auto_increment=100
+		 * engine=pbxt
+		 * partition by list (a)
+		 * (partition p0 values in (1, 98,99, 100, 101));
+		 * create index inx on t1 (a);
+		 * insert into t1 values (null);
+		 * select * from t1;
+		 */
+		if (flag & HA_STATUS_AUTO)
+			stats.auto_increment_value = (ulonglong) ot->ot_table->tab_auto_inc+1;
+	}
+	else
+		/* No open table: report "no error key". */
+		errkey = (uint) -1;
+
+	/* Clear the in-use mark only if it was set by this call. */
+	if (!in_use) {
+		pb_ex_in_use = 0;
+		if (pb_share) {
+			/* Someone may be waiting for me to complete: */
+			if (pb_share->sh_table_lock)
+				xt_broadcast_cond_ns((xt_cond_type *) pb_share->sh_ex_cond);
+		}
+	}
+#if MYSQL_VERSION_ID < 50114
+	XT_RETURN_VOID;
+#else
+	XT_RETURN(0);
+#endif
+}
+
+/*
+ * extra() is called whenever the server wishes to send a hint to
+ * the storage engine. The myisam engine implements the most hints.
+ * ha_innodb.cc has the most exhaustive list of these hints.
+ */
+/*
+ * Handle a hint from the server.
+ *
+ * The hints that are acted on switch key-only reads on and off, count
+ * nested "ignore duplicate key" requests, and reset the per-statement
+ * state of the handler. All other hints are ignored.
+ *
+ * Returns 0, or a MySQL error code if the current PBXT thread cannot be
+ * set while resetting the statement state.
+ */
+int ha_pbxt::extra(enum ha_extra_function operation)
+{
+	int result = 0;
+
+	XT_PRINT2(xt_get_self(), "ha_pbxt::extra (%s) operation=%d\n", pb_share->sh_table_path->ps_path, operation);
+
+	switch (operation) {
+		case HA_EXTRA_KEYREAD:
+			/* This means we do not need to read the entire record. */
+			pb_key_read = TRUE;
+			break;
+		case HA_EXTRA_NO_KEYREAD:
+		case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+			/* Either key reads are switched off, or MySQL needs all
+			 * fields: read complete records.
+			 */
+			pb_key_read = FALSE;
+			break;
+		case HA_EXTRA_IGNORE_DUP_KEY:
+			/* NOTE!!! Calls to extra(HA_EXTRA_IGNORE_DUP_KEY) can be nested!
+			 * In fact, the calls are from different threads, so
+			 * strictly speaking I should protect this variable!!
+			 * Here is the sequence that produces the duplicate call:
+			 *
+			 * drop table if exists t1;
+			 * CREATE TABLE t1 (x int not null, y int, primary key (x)) engine=pbxt;
+			 * insert into t1 values (1, 3), (4, 1);
+			 * replace DELAYED into t1 (x, y) VALUES (4, 2);
+			 * select * from t1 order by x;
+			 */
+			pb_ignore_dup_key++;
+			break;
+		case HA_EXTRA_NO_IGNORE_DUP_KEY:
+			pb_ignore_dup_key--;
+			break;
+		case HA_EXTRA_RESET_STATE:
+			pb_key_read = FALSE;
+			pb_ignore_dup_key = 0;
+			/* As far as I can tell, this function is called for
+			 * every table at the end of a statement.
+			 *
+			 * So, during a LOCK TABLES ... UNLOCK TABLES, I use
+			 * this to find the end of a statement.
+			 * start_stmt() indicates the start of a statement,
+			 * and is also called once for each table in the
+			 * statement.
+			 *
+			 * So the statement boundary is indicated by
+			 * self->st_stat_count == 0
+			 *
+			 * GOTCHA: I cannot end the transaction here!
+			 * I must end it in start_stmt().
+			 * The reason is that there are situations
+			 * where this would end a transaction that
+			 * was begun by external_lock().
+			 *
+			 * An example of this is when a function
+			 * is called when doing CREATE TABLE SELECT.
+			 */
+			if (pb_in_stat) {
+				/* NOTE: pb_in_stat is just used to avoid getting
+				 * self, if it is not necessary!!
+				 */
+				XTThreadPtr self;
+
+				pb_in_stat = FALSE;
+
+				self = ha_set_current_thread(pb_mysql_thd, &result);
+				if (!self)
+					return xt_ha_pbxt_to_mysql_error(result);
+
+				if (self->st_stat_count > 0) {
+					self->st_stat_count--;
+					if (self->st_stat_count == 0)
+						self->st_stat_ended = TRUE;
+				}
+
+				/* This is the end of a statement, I can turn any locks into permanent locks now: */
+				if (pb_open_tab)
+					pb_open_tab->ot_table->tab_locks.xt_make_lock_permanent(pb_open_tab, &self->st_lock_list);
+			}
+			if (pb_open_tab)
+				pb_open_tab->ot_for_update = 0;
+			break;
+		default:
+			break;
+	}
+
+	return result;
+}
+
+
+/*
+ * Deprecated and likely to be removed in the future. Storage engines normally
+ * just make a call like:
+ * ha_pbxt::extra(HA_EXTRA_RESET_STATE);
+ * to handle it. The value returned by extra() is not used: XT_RETURN(0) follows regardless.
+ */
+int ha_pbxt::reset(void)
+{
+	XT_TRACE_METHOD();
+	extra(HA_EXTRA_RESET_STATE);
+	XT_RETURN(0);
+}
+
+void ha_pbxt::unlock_row()	/* Calls xt_remove_temp_lock() on the lock table of this handler's open table. */
+{
+	XT_TRACE_METHOD();
+	if (pb_open_tab)	/* nothing to do when the handler has no open table */
+		pb_open_tab->ot_table->tab_locks.xt_remove_temp_lock(pb_open_tab, FALSE);
+}
+
+/*
+ * Used to delete all rows in a table. Only TRUNCATE TABLE is handled here (the
+ * table is re-created from a copy of its dictionary); for any other statement
+ * HA_ERR_WRONG_COMMAND is returned, because rows must then be deleted one by one.
+ *
+ * Called from item_sum.cc by Item_func_group_concat::clear(),
+ * Item_sum_count_distinct::clear(), and Item_func_group_concat::clear().
+ * Called from sql_delete.cc by mysql_delete().
+ * Called from sql_select.cc by JOIN::reinit().
+ * Called from sql_union.cc by st_select_lex_unit::exec().
+ */
+int ha_pbxt::delete_all_rows()
+{
+	THD				*thd = current_thd;
+	int				err = 0;
+	XTThreadPtr		self;
+	XTDDTable		*tab_def = NULL;
+	char			path[PATH_MAX];
+
+	XT_TRACE_METHOD();
+
+	if (thd_sql_command(thd) != SQLCOM_TRUNCATE) {
+		/* Just like InnoDB we only handle TRUNCATE TABLE
+		 * by recreating the table.
+		 * DELETE FROM t must be handled by deleting
+		 * each row because it may be part of a transaction,
+		 * and there may be foreign key actions.
+		 */
+		XT_RETURN (my_errno = HA_ERR_WRONG_COMMAND);
+	}
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	try_(a) {
+		XTDictionaryRec dic;
+
+		memset(&dic, 0, sizeof(dic));
+		/* Work on a local copy of the dictionary and path; the share is closed further down: */
+		dic = pb_share->sh_table->tab_dic;
+		xt_strcpy(PATH_MAX, path, pb_share->sh_table->tab_name->ps_path);
+
+		if ((tab_def = dic.dic_table))
+			tab_def->reference();
+		/* dic_table may be NULL (see the test above), so this call is guarded as well: */
+		if (tab_def && !(thd_test_options(thd,OPTION_NO_FOREIGN_KEY_CHECKS)))
+			tab_def->deleteAllRows(self);
+
+		/* We should have a table lock! */
+		//ASSERT(pb_lock_table);
+		if (!pb_table_locked) {
+			ha_aquire_exclusive_use(self, pb_share, this);
+			pushr_(ha_release_exclusive_use, pb_share);
+		}
+		ha_close_open_tables(self, pb_share, NULL);
+
+		/* This is required in the case of delete_all_rows, because we must
+		 * ensure that the handlers no longer reference the old
+		 * table, so that it will not be used again. The table
+		 * must be re-opened, because the ID has changed!
+		 *
+		 * 0.9.86+ Must check if this is still necessary.
+		 *
+		 * the ha_close_share(self, pb_share) call was moved from above
+		 * (before tab_def = dic.dic_table), because of a crash.
+		 * Test case:
+		 *
+		 * set storage_engine = pbxt;
+		 * create table t1 (s1 int primary key);
+		 * insert into t1 values (1);
+		 * create table t2 (s1 int, foreign key (s1) references t1 (s1));
+		 * insert into t2 values (1); 
+		 * truncate table t1; -- this should fail because of FK constraint
+		 * alter table t1 engine = myisam; -- this caused crash
+		 *
+		 */
+		ha_close_share(self, pb_share);
+
+		/* MySQL documentation requires us to reset auto increment value to 1
+		 * on truncate even if the table was created with a different value. 
+		 * This is also consistent with other engines.
+		 */
+		dic.dic_min_auto_inc = 1;
+
+		xt_create_table(self, (XTPathStrPtr) path, &dic);
+		if (!pb_table_locked)
+			freer_(); // ha_release_exclusive_use(pb_share)
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+	}
+	cont_(a);
+
+	if (tab_def)
+		tab_def->release(self);
+
+	XT_RETURN(err);
+}
+
+/*
+ * TODO: Implement!
+ * Assuming a key (a,b,c)
+ * 
+ * rec_per_key[0] = SELECT COUNT(*)/COUNT(DISTINCT a) FROM t;
+ * rec_per_key[1] = SELECT COUNT(*)/COUNT(DISTINCT a,b) FROM t;
+ * rec_per_key[2] = SELECT COUNT(*)/COUNT(DISTINCT a,b,c) FROM t;
+ *
+ * After this is implemented, the selectivity can serve as
+ * a quick estimate of records_in_range().
+ *
+ * After you have done this, you need to redo the index_merge*
+ * tests. Restore the standard result to check if we
+ * now agree with the MyISAM strategy.
+ * 
+ */
+int ha_pbxt::analyze(THD *thd, HA_CHECK_OPT *XT_UNUSED(check_opt))
+{
+	int				err = 0;
+	XTDatabaseHPtr	db;
+	xtXactID		my_xn_id;
+	xtXactID		clean_xn_id = 0;
+	uint			cnt = 10;
+
+	XT_TRACE_METHOD();
+
+	if (!pb_open_tab) {
+		if ((err = reopen()))
+			XT_RETURN(err);
+	}
+
+	/* Wait until the sweeper is no longer busy!
+	 * If you want an accurate count(*) value, then call
+	 * ANALYZE TABLE first. This function waits until the
+	 * sweeper has completed.
+	 */
+	db = pb_open_tab->ot_table->tab_db;
+	
+	/*
+	 * Wait until everything is cleaned up before this transaction.
+	 * But this will only work if we quit our transaction!
+	 *
+	 * GOTCHA: When a PBXT table is partitioned, then analyze() is
+	 * called for each component. The first calls xt_xn_commit().
+	 * All following calls have no transaction!:
+	 *
+	 * CREATE TABLE t1 (a int)
+	 * PARTITION BY LIST (a)
+	 * (PARTITION x1 VALUES IN (10), PARTITION x2 VALUES IN (20));
+	 * 
+	 * analyze table t1;
+	 * 
+	 */
+	if (pb_open_tab->ot_thread && pb_open_tab->ot_thread->st_xact_data) {
+		my_xn_id = pb_open_tab->ot_thread->st_xact_data->xd_start_xn_id;
+		XT_PRINT0(xt_get_self(), "xt_xn_commit\n");
+		xt_xn_commit(pb_open_tab->ot_thread);	/* NOTE(review): result is not checked here, unlike in external_lock() - confirm */
+	}
+	else
+		my_xn_id = db->db_xn_to_clean_id;
+
+	while ((!db->db_sw_idle || xt_xn_is_before(db->db_xn_to_clean_id, my_xn_id)) && !thd_killed(thd)) {
+		xt_busy_wait();
+
+		/*
+		 * It is possible that the sweeper gets stuck because
+		 * it has no dictionary information!
+		 * As in the example below.
+		 *
+		 * create table t4 (
+		 *   pk_col int auto_increment primary key, a1 char(64), a2 char(64), b char(16), c char(16) not null, d char(16), dummy char(64) default ' '
+		 * ) engine=pbxt;
+		 *
+		 * insert into t4 (a1, a2, b, c, d, dummy) select * from t1;
+		 * 
+		 * create index idx12672_0 on t4 (a1);
+		 * create index idx12672_1 on t4 (a1,a2,b,c);
+		 * create index idx12672_2 on t4 (a1,a2,b);
+		 * analyze table t1;
+		 */
+		if (db->db_sw_idle) {
+			/* This will make sure we don't wait forever: give up after 10 idle passes without progress in db_xn_to_clean_id. */
+			if (clean_xn_id != db->db_xn_to_clean_id) {
+				clean_xn_id = db->db_xn_to_clean_id;
+				cnt = 10;
+			}
+			else {
+				cnt--;
+				if (!cnt)
+					break;
+			}
+			xt_wakeup_sweeper(db);
+		}
+	}
+
+	XT_RETURN(err);
+}
+
+int ha_pbxt::repair(THD *XT_UNUSED(thd), HA_CHECK_OPT *XT_UNUSED(check_opt))
+{
+	return HA_ADMIN_TRY_ALTER;	/* returned unconditionally; neither argument is consulted */
+}
+
+/*
+ * OPTIMIZE is answered with HA_ADMIN_TRY_ALTER, i.e. it is mapped to
+ * "ALTER TABLE tablename ENGINE=PBXT", which rebuilds the table in MySQL.
+ */
+int ha_pbxt::optimize(THD *XT_UNUSED(thd), HA_CHECK_OPT *XT_UNUSED(check_opt))
+{
+	return HA_ADMIN_TRY_ALTER;	/* neither argument is consulted */
+}
+
+#ifdef DEBUG
+extern int pbxt_mysql_trace_on;
+#endif
+
+int ha_pbxt::check(THD* thd, HA_CHECK_OPT* XT_UNUSED(check_opt))	/* Runs xt_check_table() on the open table under exclusive use. */
+{
+	int				err = 0;
+	XTThreadPtr		self;
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+	if (self->st_lock_count)
+		ASSERT(self->st_xact_data);
+
+	if (!pb_table_locked) {	/* acquire exclusive use only if external_lock() has not already done so for this handler */
+		ha_aquire_exclusive_use(self, pb_share, this);
+		pushr_(ha_release_exclusive_use, pb_share);	/* NOTE(review): used outside a try_ block, unlike in delete_all_rows() - confirm */
+	}
+
+#ifdef CHECK_TABLE_LOADS
+	xt_tab_load_table(self, pb_open_tab);
+#endif
+	xt_check_table(self, pb_open_tab);	/* NOTE(review): no reopen() fallback as in analyze() - confirm pb_open_tab cannot be NULL here */
+
+	if (!pb_table_locked)
+		freer_(); // ha_release_exclusive_use(pb_share)
+
+	//pbxt_mysql_trace_on = TRUE;
+	return 0;	/* always 0: the outcome of xt_check_table() is not reflected in the return value */
+}
+
+/*
+ * This function is called:
+ * For each table in LOCK TABLES,
+ * OR
+ * For each table in a statement.
+ *
+ * It is called with F_UNLCK:
+ * in UNLOCK TABLES
+ * OR
+ * at the end of a statement.
+ *
+ * Returns 0, or the error code produced by the step that failed.
+ */
+xtPublic int ha_pbxt::external_lock(THD *thd, int lock_type)
+{
+	int				err = 0;
+	XTThreadPtr		self;
+	
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	/* F_UNLCK is set when this function is called at end
+	 * of statement or UNLOCK TABLES
+	 */
+	if (lock_type == F_UNLCK) {
+		/* This is not TRUE if external_lock() FAILED!
+		 * Can we rely on external_unlock being called when
+		 * external_lock() fails? Currently yes, but it does
+		 * not make sense!
+		ASSERT_NS(pb_ex_in_use);
+		*/
+
+		XT_PRINT1(self, "EXTERNAL_LOCK (%s) lock_type=UNLOCK\n", pb_share->sh_table_path->ps_path);
+
+		/* Make any temporary locks on this table permanent.
+		 *
+		 * This is required here because of the following example:
+		 * create table t1 (a int NOT NULL, b int, primary key (a));
+		 * create table t2 (a int NOT NULL, b int, primary key (a));
+		 * insert into t1 values (0, 10),(1, 11),(2, 12);
+		 * insert into t2 values (1, 21),(2, 22),(3, 23);
+		 * update t1 set b= (select b from t2 where t1.a = t2.a);
+		 * update t1 set b= (select b from t2 where t1.a = t2.a);
+		 * select * from t1;
+		 * drop table t1, t2;
+		 *
+		 */
+
+		/* GOTCHA! It's weird, but, if this function returns an error
+		 * on lock, then UNLOCK is called?!
+		 * This should not be done, because if lock fails, it should be
+		 * assumed that no UNLOCK is required.
+		 * Basically, I have to assume that some code will presume this,
+		 * although the function lock_external() calls unlock, even
+		 * when lock fails.
+		 * The result is, that my lock count can go wrong. So I could
+		 * change the lock method, and increment the lock count, even
+		 * if it fails. However, the consequences are more serious,
+		 * if some code decides not to call UNLOCK after lock fails.
+		 * The result is that I would have a permanently too high lock
+		 * count, and nothing will work.
+		 * So instead, I handle the fact that I might get too many unlocks
+		 * here.
+		 */
+		if (self->st_lock_count > 0)
+			self->st_lock_count--;
+		if (!self->st_lock_count) {
+			/* This section handles "auto-commit"... */
+
+#ifdef XT_IMPLEMENT_NO_ACTION
+			/* {NO-ACTION-BUG}
+			 * This is required here because it marks the end of a statement.
+			 * If we are in a non-auto-commit mode, then we cannot
+			 * wait for st_is_update to be set by the beginning of a new transaction.
+			 */
+			if (self->st_restrict_list.bl_count) {
+				if (!xt_tab_restrict_rows(&self->st_restrict_list, self))
+					err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+			}
+#endif
+
+			if (self->st_xact_data) {
+				if (self->st_auto_commit) {
+					/*
+					 * Normally I could assume that if the transaction
+					 * has not been aborted by now, then it should be committed.
+					 *
+					 * Unfortunately, this is not the case!
+					 *
+					 * create table t1 (id int primary key) engine = pbxt;
+					 * create table t2 (id int) engine = pbxt;
+					 * 
+					 * insert into t1 values ( 1 ) ;
+					 * insert into t1 values ( 2 ) ;
+					 * insert into t2 values ( 1 ) ;
+					 * insert into t2 values ( 2 ) ;
+					 * 
+					 * --This statement returns an error and calls ha_autocommit_or_rollback():
+					 * update t1 set t1.id=1 where t1.id=2;
+					 * 
+					 * --This statement returns no error and calls ha_autocommit_or_rollback():
+					 * update t1,t2 set t1.id=3, t2.id=3 where t1.id=2 and t2.id = t1.id;
+					 * 
+					 * --But this statement returns an error and does not call ha_autocommit_or_rollback():
+					 * update t1,t2 set t1.id=1, t2.id=1 where t1.id=3 and t2.id = t1.id;
+					 * 
+					 * The result is, I cannot rely on ha_autocommit_or_rollback() being called :(
+					 * So I have to abort myself here...
+					 */
+					if (pb_open_tab)
+						pb_open_tab->ot_table->tab_locks.xt_make_lock_permanent(pb_open_tab, &self->st_lock_list);
+
+					if (self->st_abort_trans) {
+						XT_PRINT0(self, "xt_xn_rollback in unlock\n");
+						if (!xt_xn_rollback(self))
+							err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+					}
+					else {
+						XT_PRINT0(self, "xt_xn_commit in unlock\n");
+						if (!xt_xn_commit(self))
+							err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+					}
+				}
+			}
+
+			/* If the previous statement was "for update", then set the visibility
+			 * so that non- for update SELECTs will see what the for update select
+			 * (or update statement) just saw.
+			 */
+			if (pb_open_tab) {
+				if (pb_open_tab->ot_for_update) {
+					self->st_visible_time = self->st_database->db_xn_end_time;
+					pb_open_tab->ot_for_update = 0;
+				}
+
+				if (pb_share->sh_recalc_selectivity) {
+#ifdef XT_ROW_COUNT_CORRECTED
+					/* {CORRECTED-ROW-COUNT} */
+					if ((pb_share->sh_table->tab_row_eof_id - 1 - pb_share->sh_table->tab_row_fnum) >= 200)
+#else
+					/* {FREE-ROWS-BAD} */
+					if ((pb_share->sh_table->tab_row_eof_id - 1 /* - pb_share->sh_table->tab_row_fnum */) >= 200)
+#endif
+					{
+						/* [**] */
+						pb_share->sh_recalc_selectivity = FALSE;
+						xt_ind_set_index_selectivity(pb_open_tab, self);
+#ifdef XT_ROW_COUNT_CORRECTED
+						/* {CORRECTED-ROW-COUNT} */
+						pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 - pb_share->sh_table->tab_row_fnum) < 150;
+#else
+						/* {FREE-ROWS-BAD} */
+						pb_share->sh_recalc_selectivity = (pb_share->sh_table->tab_row_eof_id - 1 /* - pb_share->sh_table->tab_row_fnum */) < 150;
+#endif
+					}
+				}
+			}
+
+			if (self->st_stat_modify)
+				self->st_statistics.st_stat_write++;
+			else
+				self->st_statistics.st_stat_read++;
+			self->st_stat_modify = FALSE;
+		}
+
+		if (pb_table_locked) {
+			pb_table_locked--;
+			if (!pb_table_locked)
+				ha_release_exclusive_use(self, pb_share);
+		}
+
+		/* No longer in use: */
+		pb_ex_in_use = 0;
+		/* Someone may be waiting for me to complete: */
+		if (pb_share->sh_table_lock)
+			xt_broadcast_cond_ns((xt_cond_type *) pb_share->sh_ex_cond);
+	}
+	else {
+		XT_PRINT2(self, "ha_pbxt::EXTERNAL_LOCK (%s) lock_type=%d\n", pb_share->sh_table_path->ps_path, lock_type);
+		
+		if (pb_lock_table) {
+			pb_ex_in_use = 1;
+			try_(a) {
+				if (!pb_table_locked)
+					ha_aquire_exclusive_use(self, pb_share, this);
+				pb_table_locked++;
+
+				ha_close_open_tables(self, pb_share, this);
+
+				if (!pb_share->sh_table) {
+					xt_ha_open_database_of_table(self, pb_share->sh_table_path);
+
+					ha_open_share(self, pb_share);
+				}
+			}
+			catch_(a) {
+				err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+				pb_ex_in_use = 0;
+				goto complete;
+			}
+			cont_(a);
+
+			/* Occurs if you do:
+			 * truncate table t1;
+			 * truncate table t1;
+			 */
+			if (!pb_open_tab) {
+				if ((err = reopen())) {
+					pb_ex_in_use = 0;
+					goto complete;
+				}
+			}
+		}
+		else {
+			pb_ex_in_use = 1;
+			if (pb_share->sh_table_lock && !pb_table_locked) {
+				/* If some thread has an exclusive lock, then
+				 * we wait for the lock to be removed:
+				 */
+				if (!ha_wait_for_shared_use(this, pb_share)) {
+					err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+					goto complete;	/* NOTE(review): pb_ex_in_use is not reset on this failure path, unlike the others - confirm intended */
+				}
+			}
+
+			if (!pb_open_tab) {
+				if ((err = reopen())) {
+					pb_ex_in_use = 0;
+					goto complete;
+				}
+			}
+
+			/* Set the current thread for this open table: */
+			pb_open_tab->ot_thread = self;
+
+			/* If this is a set, then it is in UPDATE/DELETE TABLE ...
+			 * or SELECT ... FOR UPDATE
+			 */	
+			pb_open_tab->ot_is_modify = FALSE;
+			if ((pb_open_tab->ot_for_update = (lock_type == F_WRLCK))) {
+				switch ((int) thd_sql_command(thd)) {
+					case SQLCOM_DELETE:
+#ifndef DRIZZLED
+					case SQLCOM_DELETE_MULTI:
+#endif
+						/* turn DELETE IGNORE into normal DELETE. The IGNORE option causes problems because 
+						 * when a record is deleted we add an xlog record which we cannot "rollback" later
+						 * when we find that an FK-constraint has failed. 
+						 */
+						thd->lex->ignore = false;	/* falls through: DELETE is also handled by the modify cases below */
+					case SQLCOM_UPDATE:
+#ifndef DRIZZLED
+					case SQLCOM_UPDATE_MULTI:
+#endif
+					case SQLCOM_REPLACE:
+					case SQLCOM_REPLACE_SELECT:
+					case SQLCOM_INSERT:
+					case SQLCOM_INSERT_SELECT:
+						pb_open_tab->ot_is_modify = TRUE;
+						self->st_stat_modify = TRUE;
+						break;
+					case SQLCOM_CREATE_TABLE:
+					case SQLCOM_CREATE_INDEX:
+					case SQLCOM_ALTER_TABLE:
+					case SQLCOM_TRUNCATE:
+					case SQLCOM_DROP_TABLE:
+					case SQLCOM_DROP_INDEX:
+					case SQLCOM_LOAD:
+#ifndef DRIZZLED
+					case SQLCOM_REPAIR:
+#endif
+					case SQLCOM_OPTIMIZE:
+						self->st_stat_modify = TRUE;
+						break;
+				}
+			}
+
+			if (pb_open_tab->ot_is_modify && pb_open_tab->ot_table->tab_dic.dic_disable_index) {
+				xt_tab_set_index_error(pb_open_tab->ot_table);
+				err = ha_log_pbxt_thread_error_for_mysql(pb_ignore_dup_key);
+				goto complete;	/* NOTE(review): pb_ex_in_use is not reset on this failure path either - confirm intended */
+			}
+		}
+
+		/* Record the associated MySQL thread: */
+		pb_mysql_thd = thd;
+
+		if (self->st_database != pb_share->sh_table->tab_db) {				
+			try_(b) {
+				/* PBXT does not permit multiple databases in one statement,
+				 * or in a single transaction!
+				 *
+				 * Example query:
+				 *
+				 * update mysqltest_1.t1, mysqltest_2.t2 set a=10,d=10;
+				 */
+				if (self->st_lock_count > 0)
+					xt_throw_xterr(XT_CONTEXT, XT_ERR_MULTIPLE_DATABASES);
+
+				xt_ha_open_database_of_table(self, pb_share->sh_table_path);
+			}
+			catch_(b) {
+				err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+				pb_ex_in_use = 0;
+				goto complete;
+			}
+			cont_(b);
+		}
+
+		/* See {IS-UPDATE-STAT} and {UPDATE-STACK} */
+		self->st_is_update = NULL;
+
+		/* Auto begin a transaction (if one is not already running): */
+		if (!self->st_xact_data) {
+			/* Transaction mode numbers must be identical! */
+			(void) ASSERT_NS(ISO_READ_UNCOMMITTED == XT_XACT_UNCOMMITTED_READ);
+			(void) ASSERT_NS(ISO_SERIALIZABLE == XT_XACT_SERIALIZABLE);
+
+			thd_init_xact(thd, self, true);
+
+			if (!xt_xn_begin(self)) {
+				err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+				pb_ex_in_use = 0;
+				goto complete;
+			}
+
+			/*
+			 * {START-TRANS} GOTCHA: trans_register_ha() is not mentioned in the documentation.
+			 * It must be called to inform MySQL that we have a transaction (see start_stmt).
+			 *
+			 * Here are some tests that confirm whether things are done correctly:
+			 *
+			 * drop table if exists t1, t2;
+			 * create table t1 (c1 int);
+			 * insert t1 values (1);
+			 * select * from t1;
+			 * rename table t1 to t2;
+			 *
+			 * rename will generate an error if MySQL thinks a transaction is
+			 * still running.
+			 *
+			 * create table t1 (a text character set utf8, b text character set latin1);
+			 * insert t1 values (0x4F736E616272C3BC636B, 0x4BF66C6E);
+			 * select * from t1;
+			 * --exec $MYSQL_DUMP --tab=$MYSQLTEST_VARDIR/tmp/ test
+			 * --exec $MYSQL test < $MYSQLTEST_VARDIR/tmp/t1.sql
+			 * --exec $MYSQL_IMPORT test $MYSQLTEST_VARDIR/tmp/t1.txt
+			 * select * from t1;
+			 *
+			 * This test forces a begin transaction in start_stmt()
+			 *
+			 * drop tables if exists t1;
+			 * create table t1 (c1 int);
+			 * lock tables t1 write;
+			 * insert t1 values (1);
+			 * insert t1 values (2);
+			 * unlock tables;
+			 *
+			 * The second select will return an empty result if
+			 * MySQL is not informed that a transaction is running (auto-commit 
+			 * in external_lock comes too late)!
+			 *
+			 */
+			if (!self->st_auto_commit) {
+				trans_register_ha(thd, TRUE, pbxt_hton);
+				XT_PRINT0(self, "CONN START XACT - ha_pbxt::external_lock --> trans_register_ha\n");
+			}
+		}
+
+		/* Start a statement transaction: */
+		/* {START-STAT-HACK} The problem that ha_commit_trans() is not
+		 * called by MySQL seems to be fixed (tests confirm this).
+		 * Here is the previous comment when this code was executed 
+		 * here {START-STAT-HACK}
+		 *
+		 * GOTCHA: I have a huge problem with the transaction statement.
+		 * It is not ALWAYS committed (I mean ha_commit_trans() is
+		 * not always called - for example in SELECT).
+		 *
+		 * If I call trans_register_ha() but ha_commit_trans() is not called
+		 * then MySQL thinks a transaction is still running (while
+		 * I have committed the auto-transaction in ha_pbxt::external_lock()).
+		 *
+		 * This causes all kinds of problems, like transactions
+		 * are killed when they should not be.
+		 *
+		 * To prevent this, I only inform MySQL that a transaction
+		 * has been started when an update is performed. I have determined that
+		 * ha_commit_trans() is only guaranteed to be called if an update is done.
+		 * --------
+		 *
+		 * So, this is the correct place to start a statement transaction.
+		 *
+		 * Note: if trans_register_ha() is not called before ha_write_row(), then 
+		 * PBXT is not registered correctly as a modification transaction.
+		 * (mark_trx_read_write call in ha_write_row).
+		 * This leads to 2-phase commit not being called as it should when
+		 * binary logging is enabled.
+		 */
+		if (!pb_open_tab->ot_thread->st_stat_trans) {	/* NOTE(review): ot_thread is assigned above only in the non-pb_lock_table branch - confirm it is valid on the TRUNCATE path */
+			trans_register_ha(pb_mysql_thd, FALSE, pbxt_hton);
+			XT_PRINT0(pb_open_tab->ot_thread, "STAT START - ha_pbxt::external_lock --> trans_register_ha\n");
+			pb_open_tab->ot_thread->st_stat_trans = TRUE;
+		}
+
+		if (lock_type == F_WRLCK || self->st_xact_mode < XT_XACT_REPEATABLE_READ)
+			self->st_visible_time = self->st_database->db_xn_end_time;
+
+#ifdef TRACE_STATEMENTS
+		if (self->st_lock_count == 0)
+			STAT_TRACE(self, *thd_query(thd));
+#endif
+		self->st_lock_count++;
+	}
+
+	complete:
+	return err;
+}
+
+/*
+ * This function is called for each table in a statement
+ * after LOCK TABLES has been used.
+ *
+ * It closes off the previous statement if extra(HA_EXTRA_RESET_STATE)
+ * flagged its end (st_stat_ended), then begins a new statement. 
+ *
+ * GOTCHA: The prototype of start_stmt() has changed
+ * from version 4.1 to 5.1!
+ */
+int ha_pbxt::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+	int				err = 0;
+	XTThreadPtr		self;
+
+	ASSERT_NS(pb_ex_in_use);
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	XT_PRINT2(self, "ha_pbxt::start_stmt (%s) lock_type=%d\n", pb_share->sh_table_path->ps_path, (int) lock_type);
+
+	if (!pb_open_tab) {
+		if ((err = reopen()))
+			goto complete;
+	}
+
+	ASSERT_NS(pb_open_tab->ot_thread == self);
+	ASSERT_NS(thd == pb_mysql_thd);
+	ASSERT_NS(self->st_database == pb_open_tab->ot_table->tab_db);
+
+	if (self->st_stat_ended) {
+		self->st_stat_ended = FALSE;
+		self->st_stat_trans = FALSE;
+
+#ifdef XT_IMPLEMENT_NO_ACTION
+		if (self->st_restrict_list.bl_count) {
+			if (!xt_tab_restrict_rows(&self->st_restrict_list, self)) {
+				err = xt_ha_pbxt_thread_error_for_mysql(pb_mysql_thd, self, pb_ignore_dup_key);
+			}
+		}
+#endif
+
+		/* This section handles "auto-commit"... */
+		if (self->st_xact_data && self->st_auto_commit && self->st_table_trans) {
+			if (self->st_abort_trans) {
+				XT_PRINT0(self, "xt_xn_rollback in start_stmt\n");
+				if (!xt_xn_rollback(self))
+					err = xt_ha_pbxt_thread_error_for_mysql(pb_mysql_thd, self, pb_ignore_dup_key);
+			}
+			else {
+				XT_PRINT0(self, "xt_xn_commit in start_stmt\n");
+				if (!xt_xn_commit(self))
+					err = xt_ha_pbxt_thread_error_for_mysql(pb_mysql_thd, self, pb_ignore_dup_key);
+			}
+		}
+
+		if (self->st_stat_modify)
+			self->st_statistics.st_stat_write++;
+		else
+			self->st_statistics.st_stat_read++;
+		self->st_stat_modify = FALSE;
+
+		/* If the previous statement was "for update", then set the visibility
+		 * so that non- for update SELECTs will see what the for update select
+		 * (or update statement) just saw.
+		 */
+		if (pb_open_tab->ot_for_update)
+			self->st_visible_time = self->st_database->db_xn_end_time;
+	}
+
+	pb_open_tab->ot_for_update =
+		(lock_type != TL_READ && 
+		 lock_type != TL_READ_WITH_SHARED_LOCKS &&
+#ifndef DRIZZLED
+		 lock_type != TL_READ_HIGH_PRIORITY && 
+#endif
+		 lock_type != TL_READ_NO_INSERT);
+	pb_open_tab->ot_is_modify = FALSE;
+	if (pb_open_tab->ot_for_update) {
+		switch ((int) thd_sql_command(thd)) {
+			case SQLCOM_UPDATE:
+			case SQLCOM_DELETE:
+#ifndef DRIZZLED
+			case SQLCOM_UPDATE_MULTI:
+			case SQLCOM_DELETE_MULTI:
+#endif
+			case SQLCOM_REPLACE:
+			case SQLCOM_REPLACE_SELECT:
+			case SQLCOM_INSERT:
+			case SQLCOM_INSERT_SELECT:
+				pb_open_tab->ot_is_modify = TRUE;
+				self->st_stat_modify = TRUE;
+				break;
+			case SQLCOM_CREATE_TABLE:
+			case SQLCOM_CREATE_INDEX:
+			case SQLCOM_ALTER_TABLE:
+			case SQLCOM_TRUNCATE:
+			case SQLCOM_DROP_TABLE:
+			case SQLCOM_DROP_INDEX:
+			case SQLCOM_LOAD:
+#ifndef DRIZZLED
+			case SQLCOM_REPAIR:
+#endif
+			case SQLCOM_OPTIMIZE:
+				self->st_stat_modify = TRUE;
+				break;
+		}
+	}
+
+	/* {IS-UPDATE-STAT} This is required at this level!
+	 * No matter how often it is called, it is still the start of a
+	 * statement. We need to make sure that statements are NOT mistaken
+	 * for a different type of statement.
+	 *
+	 * Here is an example:
+	 * select * from t1 where data = getcount("bar")
+	 *
+	 * If the procedure getcount() addresses another table.
+	 * then open and close of the statements in getcount()
+	 * are nested within an open close of the select t1
+	 * statement.
+	 */
+	/* {UPDATE-STACK}
+	 * Add to this I add the following:
+	 * A trigger in the middle of an update also causes nested
+	 * statements. If I reset st_is_update, then
+	 * when the trigger returns the system thinks we
+	 * are in a different update statement, and may
+	 * update the same row again.
+	 */
+	if (self->st_is_update == pb_open_tab) {
+		/* Pop the update stack: */
+		XTOpenTablePtr curr = pb_open_tab->ot_thread->st_is_update;
+
+		pb_open_tab->ot_thread->st_is_update = curr->ot_prev_update;
+		curr->ot_prev_update = NULL;
+	}
+
+	/* See comment {START-TRANS} */
+	if (!self->st_xact_data) {
+
+		thd_init_xact(thd, self, false);
+
+		if (!xt_xn_begin(self)) {
+			err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+			goto complete;
+		}
+		if (!self->st_auto_commit) {
+			trans_register_ha(thd, TRUE, pbxt_hton);
+			XT_PRINT0(self, "START CONN XACT - ha_pbxt::start_stmt --> trans_register_ha\n");
+		}
+	}
+
+	/* Start a statement (see {START-STAT-HACK}): */
+	if (!pb_open_tab->ot_thread->st_stat_trans) {
+		trans_register_ha(pb_mysql_thd, FALSE, pbxt_hton);
+		XT_PRINT0(pb_open_tab->ot_thread, "START STAT - ha_pbxt::start_stmt --> trans_register_ha\n");
+		pb_open_tab->ot_thread->st_stat_trans = TRUE;
+	}
+
+	if (pb_open_tab->ot_for_update || self->st_xact_mode < XT_XACT_REPEATABLE_READ)
+		self->st_visible_time = self->st_database->db_xn_end_time;
+
+	pb_in_stat = TRUE;	/* tells extra(HA_EXTRA_RESET_STATE) that it has statement bookkeeping to undo */
+
+	self->st_stat_count++;
+
+	complete:
+	return err;
+}
+
+/*
+ * The idea with handler::store_lock() is the following:
+ *
+ * The statement decided which locks we should need for the table
+ * for updates/deletes/inserts we get WRITE locks, for SELECT... we get
+ * read locks.
+ *
+ * Before adding the lock into the table lock handler (see thr_lock.c)
+ * mysqld calls store lock with the requested locks. Store lock can now
+ * modify a write lock to a read lock (or some other lock), ignore the
+ * lock (if we don't want to use MySQL table locks at all) or add locks
+ * for many tables (like we do when we are using a MERGE handler).
+ *
+ * When releasing locks, store_lock() is also called. In this case one
+ * usually doesn't have to do anything.
+ *
+ * In some exceptional cases MySQL may send a request for a TL_IGNORE;
+ * This means that we are requesting the same lock as last time and this
+ * should also be ignored. (This may happen when someone does a flush
+ * table when we have opened a part of the tables, in which case mysqld
+ * closes and reopens the tables and tries to get the same locks as last
+ * time). In the future we will probably try to remove this.
+ *
+ * Called from lock.cc by get_lock_data().
+ */
+THR_LOCK_DATA **ha_pbxt::store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type)
+{
+	/*
+	 * TL_READ means concurrent INSERTs are allowed. This is a problem as in this mode
+	 * PBXT is not compatible with MyISAM which allows INSERTs but isolates them from
+	 * current "transaction" (started by LOCK TABLES, ended by UNLOCK TABLES). PBXT 
+	 * used to allow INSERTs and made them visible to the locker (on commit). 
+	 * While MySQL manual doesn't state anything regarding row visibility limitations 
+	 * we choose to convert local locks into normal read locks for better compatibility 
+	 * with MyISAM.
+	 */
+	if (lock_type == TL_READ)
+		lock_type = TL_READ_NO_INSERT;
+
+	if (lock_type != TL_IGNORE && pb_lock.type == TL_UNLOCK) {
+		/* Set to TRUE for operations that require a table lock: */
+		switch (thd_sql_command(thd)) {
+			case SQLCOM_TRUNCATE:
+				/* GOTCHA:
+				 * The problem is, if I do not do this, then
+				 * TRUNCATE TABLE deadlocks with a normal update of the table!
+				 * The reason is:
+				 *
+				 * external_lock() is called before MySQL actually locks the
+				 * table. In external_lock(), the table is shared locked,
+				 * by indicating that the handler is in use.
+				 *
+				 * Then later, in delete_all_rows(), an exclusive lock must be
+				 * obtained. If an UPDATE or INSERT has also gained a shared
+				 * lock in the meantime, then TRUNCATE TABLE hangs.
+				 *
+				 * By setting pb_lock_table we indicate that an exclusive lock
+				 * should be gained in external_lock().
+				 *
+				 * This is the locking behaviour:
+				 *
+				 * TRUNCATE TABLE:
+				 * XT SHARE LOCK (mysql_lock_tables calls external_lock)
+				 * MySQL WRITE LOCK (mysql_lock_tables)
+				 * ...
+				 * XT EXCLUSIVE LOCK (delete_all_rows)
+				 *
+				 * INSERT:
+				 * XT SHARED LOCK (mysql_lock_tables calls external_lock)
+				 * MySQL WRITE_ALLOW_WRITE LOCK (mysql_lock_tables)
+				 *
+				 * If the locking for INSERT is done in the ... phase
+				 * above, then we have a deadlock because 
+				 * WRITE_ALLOW_WRITE conflicts with WRITE.
+				 *
+				 * Making TRUNCATE TABLE take a WRITE_ALLOW_WRITE LOCK, will
+				 * not solve the problem because then 2 TRUNCATE TABLES
+				 * can deadlock due to lock escalation.
+				 *
+				 * What may work is if MySQL were to lock BEFORE calling
+				 * external_lock()!
+				 *
+				 * However, using this method, TRUNCATE TABLE does deadlock
+				 * with other operations such as ALTER TABLE!
+				 *
+				 * This is handled with a lock timeout. Assuming 
+				 * TRUNCATE TABLE will be mixed with DML this is the
+				 * best solution!
+				 */
+				pb_lock_table = TRUE;
+				break;
+			default:
+				pb_lock_table = FALSE;
+				break;
+		}
+
+#ifdef PBXT_HANDLER_TRACE
+		pb_lock.type = lock_type;
+#endif
+		/* GOTCHA: Before it was OK to weaken the lock after just checking
+		 * that !thd->in_lock_tables. However, when starting a procedure, MySQL
+		 * simulates a LOCK TABLES statement.
+		 *
+		 * So we need to be more specific here, and check the actual statement
+		 * type. Before doing this I got a deadlock (undetected) on the following test.
+		 * However, now we get a failed assertion in ha_rollback_trans():
+		 * TODO: Check this with InnoDB!
+		 *
+		 * DBUG_ASSERT(0);
+		 * my_error(ER_COMMIT_NOT_ALLOWED_IN_SF_OR_TRG, MYF(0));
+		 *
+		 * drop table if exists t3;
+		 * create table t3 (a smallint primary key) engine=pbxt;
+		 * insert into t3 (a) values (40);
+		 * insert into t3 (a) values (50);
+		 * 
+		 * delimiter |
+		 * 
+		 * drop function if exists t3_update|
+		 * 
+		 * create function t3_update() returns int
+		 * begin
+		 *   insert into t3 values (10);
+		 *   return 100;
+		 * end|
+		 * 
+		 * delimiter ;
+		 * 
+		 * CONN 1:
+		 * 
+		 * begin;
+		 * update t3 set a = 5 where a = 50;
+		 * 
+		 * CONN 2:
+		 * 
+		 * begin;
+		 * update t3 set a = 4 where a = 40;
+		 * 
+		 * CONN 1:
+		 * 
+		 * update t3 set a = 4 where a = 40; // Hangs waiting CONN 2.
+		 * 
+		 * CONN 2:
+		 * 
+		 * select t3_update(); // Hangs waiting for table lock.
+		 * 
+		 */
+		if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) && 
+#ifndef DRIZZLED
+			!(thd_in_lock_tables(thd) && thd_sql_command(thd) == SQLCOM_LOCK_TABLES) &&
+#endif
+			!thd_tablespace_op(thd) &&
+			thd_sql_command(thd) != SQLCOM_TRUNCATE &&
+			thd_sql_command(thd) != SQLCOM_OPTIMIZE &&
+			thd_sql_command(thd) != SQLCOM_CREATE_TABLE) {
+			lock_type = TL_WRITE_ALLOW_WRITE;
+		}
+
+		/* In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
+		 * MySQL would use the lock TL_READ_NO_INSERT on t2, and that
+		 * would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
+		 * to t2. Convert the lock to a normal read lock to allow
+		 * concurrent inserts to t2.
+		 * 
+		 * (This one from InnoDB)
+		 *
+		 * Stewart: removed SQLCOM_CALL, not sure of implications.
+		 */
+		if (lock_type == TL_READ_NO_INSERT
+#ifndef DRIZZLED
+			&& (!thd_in_lock_tables(thd)
+			 || thd_sql_command(thd) == SQLCOM_CALL
+			)
+#endif
+			)
+		{
+			lock_type = TL_READ;
+		}
+
+		XT_PRINT3(xt_get_self(), "store_lock (%s) %d->%d\n", pb_share->sh_table_path->ps_path, pb_lock.type, lock_type);
+		pb_lock.type = lock_type;
+	}
+#ifdef PBXT_HANDLER_TRACE
+	else {
+		XT_PRINT3(xt_get_self(), "store_lock (%s) %d->%d (ignore/unlock)\n", pb_share->sh_table_path->ps_path, lock_type, lock_type);
+	}
+#endif
+	*to++= &pb_lock;	/* hand this handler's single THR_LOCK_DATA to the caller and return the advanced slot pointer */
+	return to;
+}
+
+/*
+ * Used to delete a table. By the time delete_table() has been called all
+ * opened references to this table will have been closed (and your globally
+ * shared references released). The variable name will just be the name of
+ * the table. You will need to remove any files you have created at this point.
+ *
+ * Called from handler.cc by delete_table and ha_create_table(). Only used
+ * during create if the table_flag HA_DROP_BEFORE_CREATE was specified for
+ * the storage engine.
+ *
+ * Paths recognised by XTSystemTableShare::isSystemTable() are passed on to
+ * delete_system_table(). For every other table the share is taken for
+ * exclusive use, open handles are closed and the table is removed with
+ * xt_drop_table().
+ *
+ * Returns 0 on success, otherwise the value produced by
+ * xt_ha_pbxt_to_mysql_error() or xt_ha_pbxt_thread_error_for_mysql().
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::doDropTable(Session &, std::string table_path_str)
+#else
+int ha_pbxt::delete_table(const char *table_path)
+#endif
+{
+	THD				*thd = current_thd;
+	int				err = 0;
+	XTThreadPtr		self = NULL;
+	XTSharePtr		share;
+
+#ifdef DRIZZLED
+	const char *table_path = table_path_str.c_str();
+#endif
+
+	/* NOTE(review): 'self' is still NULL at this point (it is only assigned
+	 * by ha_set_current_thread() below). Confirm that STAT_TRACE and
+	 * XT_PRINT1 accept a NULL thread.
+	 */
+	STAT_TRACE(self, *thd_query(thd));
+	XT_PRINT1(self, "delete_table (%s)\n", table_path);
+
+	/* System tables have their own delete path. */
+	if (XTSystemTableShare::isSystemTable(table_path))
+		return delete_system_table(table_path);
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	/* Copy the session's OPTION_NO_FOREIGN_KEY_CHECKS setting to the thread. */
+	self->st_ignore_fkeys = (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) != 0;
+
+	try_(a) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
+
+		ASSERT(xt_get_self() == self);
+		try_(b) {
+			/* NOTE: MySQL does not drop a table by first locking it!
+			 * We also cannot use pb_share because the handler used
+			 * to delete a table is not opened correctly.
+			 */
+			share = ha_get_share(self, table_path, false);
+			pushr_(ha_unget_share, share);
+			ha_aquire_exclusive_use(self, share, NULL);
+			pushr_(ha_release_exclusive_use, share);
+			ha_close_open_tables(self, share, NULL);
+
+			/* The last argument is TRUE when the statement is DROP DATABASE. */
+			xt_drop_table(self, (XTPathStrPtr) table_path, thd_sql_command(thd) == SQLCOM_DROP_DB);
+
+			freer_(); // ha_release_exclusive_use(share)
+			freer_(); // ha_unget_share(share)
+		}
+		catch_(b) {
+			/* In MySQL if the table does not exist, just log the error and continue. This is
+			 * needed to delete table in the case when CREATE TABLE fails and no PBXT disk
+			 * structures were created.
+			 * Drizzle unlike MySQL iterates over all handlers and tries to delete table. It
+			 * stops after when a handler returns TRUE, so in Drizzle we need to report error.
+			 */
+#ifndef DRIZZLED
+			if (self->t_exception.e_xt_err == XT_ERR_TABLE_NOT_FOUND)
+				xt_log_and_clear_exception(self);
+			else
+#endif
+				throw_();
+		}
+		cont_(b);
+
+		/*
+		 * If there are no more PBXT tables in the database, we
+		 * "drop the database", which deletes all PBXT resources
+		 * in the database.
+		 */
+		/* We now only drop the pbxt system data,
+		 * when the PBXT database is dropped.
+		 */
+#ifndef XT_USE_GLOBAL_DB
+		if (!xt_table_exists(self->st_database)) {
+			xt_ha_all_threads_close_database(self, self->st_database);
+			xt_drop_database(self, self->st_database);
+			xt_unuse_database(self, self);
+			xt_ha_close_global_database(self);
+		}
+#endif
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+#ifdef DRIZZLED
+		/* Drizzle is given ENOENT instead of HA_ERR_NO_SUCH_TABLE. */
+		if (err == HA_ERR_NO_SUCH_TABLE)
+			err = ENOENT;
+#endif
+	}
+	cont_(a);
+	
+#ifdef PBMS_ENABLED
+	/* Call pbms_delete_table_with_blobs() last because it cannot be undone. */
+	if (!err) {
+		PBMSResultRec result;
+
+		/* A PBMS failure is only logged as a warning; 'err' is left at 0. */
+		if (pbms_delete_table_with_blobs(table_path, &result)) {
+			xt_logf(XT_NT_WARNING, "pbms_delete_table_with_blobs() Error: %s", result.mr_message);
+		}
+		
+		pbms_completed(NULL, true);
+	}
+#endif
+
+	return err;
+}
+
+/*
+ * Delete path for PBXT system tables (delete_table() routes here when
+ * XTSystemTableShare::isSystemTable() matches the path).
+ *
+ * Throws XT_ERR_PBXT_TABLE_EXISTS while xt_table_exists() still reports
+ * tables in the database. Otherwise the system table is marked as deleted
+ * and, once XTSystemTableShare::doesSystemTableExist() returns false, the
+ * database is closed and dropped.
+ *
+ * Returns 0 on success, otherwise the value produced by
+ * xt_ha_pbxt_to_mysql_error() or xt_ha_pbxt_thread_error_for_mysql().
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::delete_system_table(const char *table_path)
+#else
+int ha_pbxt::delete_system_table(const char *table_path)
+#endif
+{
+	THD				*thd = current_thd;
+	XTExceptionRec	e;
+	int				err = 0;
+	XTThreadPtr		self;
+
+	if (!(self = xt_ha_set_current_thread(thd, &e)))
+		return xt_ha_pbxt_to_mysql_error(e.e_xt_err);
+
+	try_(a) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
+
+		/* Refuse to remove a system table while tables still exist. */
+		if (xt_table_exists(self->st_database))
+			xt_throw_xterr(XT_CONTEXT, XT_ERR_PBXT_TABLE_EXISTS);
+
+		XTSystemTableShare::setSystemTableDeleted(table_path);
+
+		/* No system table left: close and drop the database. */
+		if (!XTSystemTableShare::doesSystemTableExist()) {
+			xt_ha_all_threads_close_database(self, self->st_database);
+			xt_drop_database(self, self->st_database);
+			xt_unuse_database(self, self);
+			xt_ha_close_global_database(self);
+		}
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+	cont_(a);
+
+	return err;
+}
+
+/*
+ * Renames a table from one name to another from alter table call.
+ *
+ * Both paths must resolve to the same PBXT database: if opening the
+ * database of 'to' and of 'from' yields different databases,
+ * XT_ERR_CANNOT_CHANGE_DB is thrown.
+ *
+ * System tables are passed on to rename_system_table().
+ *
+ * Returns 0 on success, otherwise an error code (see the PBMS note below).
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::doRenameTable(Session *,
+                                     const char *from,
+                                     const char *to)
+#else
+int ha_pbxt::rename_table(const char *from, const char *to)
+#endif
+{
+	THD				*thd = current_thd;
+	int				err = 0;
+	XTThreadPtr		self;
+	XTSharePtr		share;
+	XTDatabaseHPtr	to_db;
+
+	if (XTSystemTableShare::isSystemTable(from))
+		return rename_system_table(from, to);
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+
+	XT_PRINT2(self, "rename_table (%s -> %s)\n", from, to);
+
+#ifdef PBMS_ENABLED
+	PBMSResultRec result;
+
+	/* The PBMS rename is done first; its outcome is confirmed or cancelled
+	 * by pbms_completed() at the end of this function. On failure the PBMS
+	 * error code is returned as is.
+	 */
+	err = pbms_rename_table_with_blobs(from, to, &result);
+	if (err) {
+		xt_logf(XT_NT_ERROR, "pbms_rename_table_with_blobs() Error: %s", result.mr_message);
+		return err;
+	}
+#endif
+
+	try_(a) {
+		/* Open the target database first so that it can be compared with
+		 * the source database, which is left as the current database.
+		 */
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) to);
+		to_db = self->st_database;
+
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) from);
+
+		if (self->st_database != to_db)
+			xt_throw_xterr(XT_CONTEXT, XT_ERR_CANNOT_CHANGE_DB);
+
+		/*
+		 * NOTE: MySQL does not lock before calling rename table!
+		 *
+		 * We cannot use pb_share because rename_table() is
+		 * called without correctly initializing
+		 * the handler!
+		 */
+		share = ha_get_share(self, from, true);
+		pushr_(ha_unget_share, share);
+		ha_aquire_exclusive_use(self, share, NULL);
+		pushr_(ha_release_exclusive_use, share);
+		ha_close_open_tables(self, share, NULL);
+
+		self->st_ignore_fkeys = (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) != 0;
+		xt_rename_table(self, (XTPathStrPtr) from, (XTPathStrPtr) to);
+
+		freer_(); // ha_release_exclusive_use(share)
+		freer_(); // ha_unget_share(share)
+
+		/*
+		 * If there are no more PBXT tables in the database, we
+		 * "drop the database", which deletes all PBXT resources
+		 * in the database.
+		 */
+		/* NOTE(review): delete_table() guards the equivalent cleanup with
+		 * #ifndef XT_USE_GLOBAL_DB, and the comment inside this block says
+		 * the system data is only dropped together with the PBXT database.
+		 * Confirm that #ifdef (rather than #ifndef) is intended here.
+		 */
+#ifdef XT_USE_GLOBAL_DB
+		/* We now only drop the pbxt system data,
+		 * when the PBXT database is dropped.
+		 */
+		if (!xt_table_exists(self->st_database)) {
+			xt_ha_all_threads_close_database(self, self->st_database);
+			xt_drop_database(self, self->st_database);
+		}
+#endif
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+	cont_(a);
+	
+#ifdef PBMS_ENABLED
+	/* Tell PBMS whether the rename succeeded. */
+	pbms_completed(NULL, (err == 0));
+#endif
+
+	XT_RETURN(err);
+}
+
+/*
+ * Renaming a PBXT system table is not implemented: both arguments are
+ * ignored and ER_NOT_SUPPORTED_YET is always returned.
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::rename_system_table(const char *XT_UNUSED(from), const char *XT_UNUSED(to))
+#else
+int ha_pbxt::rename_system_table(const char *XT_UNUSED(from), const char *XT_UNUSED(to))
+#endif
+{
+	return ER_NOT_SUPPORTED_YET;
+}
+
+/*
+ * Largest total key length reported to the server: XT_INDEX_MAX_KEY_SIZE.
+ * create() rejects any key whose key_length exceeds the same constant.
+ */
+uint ha_pbxt::max_supported_key_length() const
+{
+	return XT_INDEX_MAX_KEY_SIZE;
+}
+
+/*
+ * Largest length of a single key part reported to the server: 4 bytes less
+ * than XT_INDEX_MAX_KEY_SIZE.
+ */
+uint ha_pbxt::max_supported_key_part_length() const
+{
+	/* There is a little overhead in order to fit! */
+	return XT_INDEX_MAX_KEY_SIZE-4;
+}
+
+/*
+ * Called in test_quick_select to determine if indexes should be used.
+ *
+ * As far as I can tell, time is measured in "disk reads". The estimate
+ * below charges one read per 38 rows (live plus deleted), plus a constant
+ * of 2.
+ *
+ * For example a sequential scan uses a read buffer which reads a
+ * number of rows at once, or a sequential scan can make use
+ * of the cache (so it needs to read less).
+ */
+double ha_pbxt::scan_time()
+{
+	const double row_count = (double) (stats.records + stats.deleted);
+
+	return row_count / 38.0 + 2;
+}
+
+/*
+ * Cost of reading 'rows' rows through 'ranges' index ranges: simply the sum
+ * of both, converted to double. The index number is not used.
+ *
+ * This method is never called if the engine does not implement indexes.
+ */
+double ha_pbxt::read_time(uint XT_UNUSED(index), uint ranges, ha_rows rows)
+{
+	return rows2double(ranges+rows);
+}
+
+/*
+ * Estimate the number of rows between a start key and an end key.
+ *
+ * The key-part map of min_key is used when given, otherwise that of
+ * max_key; with neither key the estimate is 1. The number of leading key
+ * parts in use (consecutive low-order bits set in the map) selects the
+ * is_recs_in_range value stored for that index segment. If no part is in
+ * use, or more parts than the index has segments, the estimate is 1.
+ *
+ * Called from opt_range.cc by check_quick_keys().
+ */
+ha_rows ha_pbxt::records_in_range(uint inx, key_range *min_key, key_range *max_key)
+{
+	key_part_map	parts;
+	u_int			used_segs;
+	ha_rows			estimate;
+	XTIndexPtr		index;
+
+	if (min_key)
+		parts = min_key->keypart_map;
+	else if (max_key)
+		parts = max_key->keypart_map;
+	else
+		return 1;
+	index = (XTIndexPtr) pb_share->sh_dic_keys[inx];
+
+	/* Count the consecutive low-order bits that are set. */
+	for (used_segs = 0; parts & 1; parts = parts >> 1)
+		used_segs++;
+
+	if (used_segs >= 1 && used_segs <= index->mi_seg_count)
+		estimate = index->mi_seg[used_segs-1].is_recs_in_range;
+	else
+		estimate = 1;
+#ifdef XT_PRINT_INDEX_OPT
+	printf("records_in_range %s index %d cols req=%d/%d read_bits=%X write_bits=%X index_bits=%X --> %d\n", pb_open_tab->ot_table->tab_name->ps_path, (int) inx, used_segs, index->mi_seg_count, (int) *table->read_set->bitmap, (int) *table->write_set->bitmap, (int) *index->mi_col_map.bitmap, (int) estimate);
+#endif
+	return estimate;
+}
+
+/*
+ * create() is called to create a table/database. The variable name will have the name
+ * of the table. When create() is called you do not need to worry about opening
+ * the table. Also, the FRM file will have already been created so adjusting
+ * create_info will not do you any good. You can overwrite the frm file at this
+ * point if you wish to change the table definition, but there are no methods
+ * currently provided for doing that.
+ *
+ * The paths "./pbxt/location" and "./pbxt/statistics" are accepted without
+ * creating anything (0 is returned immediately).
+ *
+ * Returns 0 on success, otherwise the value produced by
+ * xt_ha_pbxt_to_mysql_error() or xt_ha_pbxt_thread_error_for_mysql().
+ *
+ * Called from handler.cc by ha_create_table().
+ */
+#ifdef DRIZZLED
+int PBXTStorageEngine::doCreateTable(Session *, 
+                                     const char *table_path, 
+                                     Table &table_arg, 
+                                     HA_CREATE_INFO &create_info, 
+                                     drizzled::message::Table &XT_UNUSED(proto))
+#else
+int ha_pbxt::create(const char *table_path, TABLE *table_arg, HA_CREATE_INFO *create_info)
+#endif
+{
+	THD				*thd = current_thd;
+	int				err = 0;
+	XTThreadPtr		self;
+	XTDDTable		*tab_def = NULL;
+	XTDictionaryRec	dic;
+
+	/* These two paths are not created by this function. */
+	if ((strcmp(table_path, "./pbxt/location") == 0) || (strcmp(table_path, "./pbxt/statistics") == 0))
+		return 0;
+
+	/* Start from an empty dictionary so that the final
+	 * myxt_free_dictionary() call only sees what was set up here.
+	 */
+	memset(&dic, 0, sizeof(dic));
+
+	if (!(self = ha_set_current_thread(thd, &err)))
+		return xt_ha_pbxt_to_mysql_error(err);
+#ifdef DRIZZLED
+	XT_PRINT2(self, "create (%s) %s\n", table_path, (create_info.options & HA_LEX_CREATE_TMP_TABLE) ? "temporary" : "");
+#else
+	XT_PRINT2(self, "create (%s) %s\n", table_path, (create_info->options & HA_LEX_CREATE_TMP_TABLE) ? "temporary" : "");
+#endif
+
+	STAT_TRACE(self, *thd_query(thd));
+
+	try_(a) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
+
+		/* Reject any key longer than XT_INDEX_MAX_KEY_SIZE. */
+#ifdef DRIZZLED
+		for (uint i=0; i<TS(&table_arg)->keys; i++) {
+			if (table_arg.key_info[i].key_length > XT_INDEX_MAX_KEY_SIZE)
+				xt_throw_sulxterr(XT_CONTEXT, XT_ERR_KEY_TOO_LARGE, table_arg.key_info[i].name, (u_long) XT_INDEX_MAX_KEY_SIZE);
+		}
+#else
+		for (uint i=0; i<TS(table_arg)->keys; i++) {
+			if (table_arg->key_info[i].key_length > XT_INDEX_MAX_KEY_SIZE)
+				xt_throw_sulxterr(XT_CONTEXT, XT_ERR_KEY_TOO_LARGE, table_arg->key_info[i].name, (u_long) XT_INDEX_MAX_KEY_SIZE);
+		}
+#endif
+
+		/* ($) auto_increment_value will be zero if 
+		 * AUTO_INCREMENT is not used. Otherwise
+		 * Query was ALTER TABLE ... AUTO_INCREMENT = x; or 
+		 * CREATE TABLE ... AUTO_INCREMENT = x;
+		 */
+		/* Build the table definition from the statement text and the server's
+		 * table object, then check its foreign keys.
+		 */
+#ifdef DRIZZLED
+		tab_def = xt_ri_create_table(self, true, (XTPathStrPtr) table_path, *thd_query(thd), myxt_create_table_from_table(self, &table_arg));
+		tab_def->checkForeignKeys(self, create_info.options & HA_LEX_CREATE_TMP_TABLE);
+#else
+		tab_def = xt_ri_create_table(self, true, (XTPathStrPtr) table_path, *thd_query(thd), myxt_create_table_from_table(self, table_arg));
+		tab_def->checkForeignKeys(self, create_info->options & HA_LEX_CREATE_TMP_TABLE);
+#endif
+
+		dic.dic_table = tab_def;
+#ifdef DRIZZLED
+		dic.dic_my_table = &table_arg;
+		dic.dic_tab_flags = (create_info.options & HA_LEX_CREATE_TMP_TABLE) ? XT_TAB_FLAGS_TEMP_TAB : 0;
+		dic.dic_min_auto_inc = (xtWord8) create_info.auto_increment_value; /* ($) */
+		dic.dic_def_ave_row_size = table_arg.s->getAvgRowLength();
+#else
+		dic.dic_my_table = table_arg;
+		dic.dic_tab_flags = (create_info->options & HA_LEX_CREATE_TMP_TABLE) ? XT_TAB_FLAGS_TEMP_TAB : 0;
+		dic.dic_min_auto_inc = (xtWord8) create_info->auto_increment_value; /* ($) */
+		dic.dic_def_ave_row_size = (xtWord8) table_arg->s->avg_row_length;
+#endif
+		myxt_setup_dictionary(self, &dic);
+
+		/*
+		 * We used to ignore the value of foreign_key_checks flag and allowed creation
+		 * of tables with "hanging" references. Now we validate FKs if foreign_key_checks != 0
+		 */
+		self->st_ignore_fkeys = (thd_test_options(thd, OPTION_NO_FOREIGN_KEY_CHECKS)) != 0;
+
+		/*
+		 * Previously I set delete_if_exists=TRUE because
+		 * CREATE TABLE was being used to TRUNCATE.
+		 * This was due to the flag HTON_CAN_RECREATE.
+		 * Now I could set delete_if_exists=FALSE, but
+		 * leaving it TRUE should not cause any problems.
+		 *
+		 * NOTE(review): the call below passes no delete_if_exists
+		 * argument, so this comment looks stale - confirm against
+		 * the declaration of xt_create_table().
+		 */
+		xt_create_table(self, (XTPathStrPtr) table_path, &dic);
+	}
+	catch_(a) {
+		/* Release the table definition here and detach it from the dictionary
+		 * so that myxt_free_dictionary() below does not see it again.
+		 */
+		if (tab_def)
+			tab_def->finalize(self);
+		dic.dic_table = NULL;
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+	}
+	cont_(a);
+
+	/* Free the dictionary, but not 'table_arg'! */
+	dic.dic_my_table = NULL;
+	myxt_free_dictionary(self, &dic);
+
+	XT_RETURN(err);
+}
+
+/*
+ * Fill in create_info->auto_increment_value from the open table's dictionary
+ * (dic_min_auto_inc), unless the HA_CREATE_USED_AUTO bit is already set in
+ * create_info->used_fields. Does nothing when the handler has no open table.
+ */
+void ha_pbxt::update_create_info(HA_CREATE_INFO *create_info)
+{
+	XTOpenTablePtr	open_tab = pb_open_tab;
+
+	if (!open_tab)
+		return;
+
+	/* An explicitly given AUTO_INCREMENT value is left alone. */
+	if (create_info->used_fields & HA_CREATE_USED_AUTO)
+		return;
+
+	/* Fill in the minimum auto-increment value! */
+	create_info->auto_increment_value = open_tab->ot_table->tab_dic.dic_min_auto_inc;
+}
+
+/*
+ * Return the foreign key definitions of the table as a string, built by
+ * XTDDTable::loadForeignKeyString() into a string buffer.
+ *
+ * Returns NULL if the current thread cannot be set up, if the table cannot
+ * be re-opened, or if the table has no dictionary table definition.
+ * The returned buffer is presumably meant to be released with
+ * free_foreign_key_create_info() - confirm against the caller.
+ */
+char *ha_pbxt::get_foreign_key_create_info()
+{
+	THD					*thd = current_thd;
+	int					err = 0;
+	XTThreadPtr			self;
+	XTStringBufferRec	tab_def = { 0, 0, 0 };
+
+	if (!(self = ha_set_current_thread(thd, &err))) {
+		/* The converted error code is not used; NULL signals the failure. */
+		xt_ha_pbxt_to_mysql_error(err);
+		return NULL;
+	}
+
+	/* The handler may be called without an open table. */
+	if (!pb_open_tab) {
+		if ((err = reopen()))
+			return NULL;
+	}
+
+	if (!pb_open_tab->ot_table->tab_dic.dic_table)
+		return NULL;
+
+	try_(a) {
+		pb_open_tab->ot_table->tab_dic.dic_table->loadForeignKeyString(self, &tab_def);
+	}
+	catch_(a) {
+		/* Reset the buffer to size 0 (presumably releasing any partial
+		 * result - confirm in xt_sb_set_size()); the error itself is not
+		 * reported to the caller.
+		 */
+		xt_sb_set_size(self, &tab_def, 0);
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+	}
+	cont_(a);
+
+	return tab_def.sb_cstring;
+}
+
+/*
+ * Release a string with xt_free(), passing NULL as the thread argument.
+ * Presumably 'str' is a value returned by get_foreign_key_create_info() -
+ * confirm against the caller.
+ */
+void ha_pbxt::free_foreign_key_create_info(char* str)
+{
+	xt_free(NULL, str);
+}
+
+/*
+ * Copy the message of the current thread's pending PBXT exception into 'buf'.
+ *
+ * The 'error' argument is not used. Returns TRUE when a message was copied,
+ * FALSE when the thread could not be set up or no exception is recorded
+ * (e_xt_err is zero).
+ */
+bool ha_pbxt::get_error_message(int XT_UNUSED(error), String *buf)
+{
+	int				rc = 0;
+	XTThreadPtr		thread = ha_set_current_thread(current_thd, &rc);
+
+	if (!thread || !thread->t_exception.e_xt_err)
+		return FALSE;
+
+	const char *msg = thread->t_exception.e_err_msg;
+
+	buf->copy(msg, (uint32) strlen(msg), system_charset_info);
+	return TRUE;
+}
+
+/* 
+ * get info about FKs of the currently open table
+ * used in 
+ * 1. REPLACE; is > 0 if table is referred by a FOREIGN KEY 
+ * 2. INFORMATION_SCHEMA tables: TABLE_CONSTRAINTS, REFERENTIAL_CONSTRAINTS
+ *
+ * One FOREIGN_KEY_INFO is appended to 'f_key_list' per foreign key of the
+ * table. All strings and the FOREIGN_KEY_INFO records themselves are
+ * allocated through thd_alloc()/thd_make_lex_string(). Foreign keys whose
+ * reference index is currently unavailable are skipped.
+ *
+ * Return value: as of 5.1.24 it's ignored. 0 on success, otherwise an
+ * error code.
+ */
+
+int ha_pbxt::get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list)
+{
+	int err = 0;
+	XTThreadPtr	self;
+	const char *action;
+
+	if (!(self = ha_set_current_thread(thd, &err))) {
+		return xt_ha_pbxt_to_mysql_error(err);
+	}
+
+	/* The handler may be called without an open table. Re-open it the same
+	 * way get_foreign_key_create_info() does instead of dereferencing a
+	 * NULL pb_open_tab below. reopen()'s result is passed through unchanged
+	 * (presumably already a MySQL error code - confirm).
+	 */
+	if (!pb_open_tab) {
+		if ((err = reopen()))
+			return err;
+	}
+
+	try_(a) {
+		XTDDTable *table_dic = pb_open_tab->ot_table->tab_dic.dic_table;
+
+		/* XT_ERR_NO_DICTIONARY is a PBXT error code, not a system errno, so
+		 * it is raised with xt_throw_xterr() like the other XT_ERR_* codes
+		 * in this file.
+		 */
+		if (table_dic == NULL)
+			xt_throw_xterr(XT_CONTEXT, XT_ERR_NO_DICTIONARY);
+
+		for (int i = 0, sz = table_dic->dt_fkeys.size(); i < sz; i++) {
+			FOREIGN_KEY_INFO *fk_info= new	// assumed that C++ exceptions are disabled
+				(thd_alloc(thd, sizeof(FOREIGN_KEY_INFO))) FOREIGN_KEY_INFO;
+
+			if (fk_info == NULL)
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+
+			XTDDForeignKey *fk = table_dic->dt_fkeys.itemAt(i);
+
+			/* Split the referenced table's path into its last two
+			 * components: ".../<database>/<table>". Both pointers are
+			 * first moved back to the directory separator in front of
+			 * their component and then advanced past it.
+			 */
+			const char *path = fk->fk_ref_tab_name->ps_path;
+			const char *ref_tbl_name = path + strlen(path);
+
+			while (ref_tbl_name != path && !XT_IS_DIR_CHAR(*ref_tbl_name)) 
+				ref_tbl_name--;
+
+			const char * ref_db_name = ref_tbl_name - 1;
+
+			while (ref_db_name != path && !XT_IS_DIR_CHAR(*ref_db_name)) 
+				ref_db_name--;
+
+			ref_tbl_name++;
+			ref_db_name++;
+
+			fk_info->forein_id = thd_make_lex_string(thd, 0,
+				fk->co_name, (uint) strlen(fk->co_name), 1);
+
+			/* The database name ends at the separator before the table name. */
+			fk_info->referenced_db = thd_make_lex_string(thd, 0,
+				ref_db_name, (uint) (ref_tbl_name - ref_db_name - 1), 1);
+
+			fk_info->referenced_table = thd_make_lex_string(thd, 0,
+				ref_tbl_name, (uint) strlen(ref_tbl_name), 1);
+
+			fk_info->referenced_key_name = NULL;			
+
+			XTIndex *ix = fk->getReferenceIndexPtr();
+			if (ix == NULL) /* can be NULL if another thread changes referenced table at the moment */
+				continue;
+			
+			XTDDTable *ref_table = fk->fk_ref_table;
+
+			// might be a self-reference
+			if ((ref_table == NULL) 
+				&& (xt_tab_compare_names(path, table_dic->dt_table->tab_name->ps_path) == 0)) {
+				ref_table = table_dic;
+			}
+
+			/* Look up the name of the referenced index by its index number. */
+			if (ref_table != NULL) {
+				const XTList<XTDDIndex>& ix_list = ref_table->dt_indexes;
+				for (int j = 0, sz2 = ix_list.size(); j < sz2; j++) {
+					XTDDIndex *ddix = ix_list.itemAt(j);
+					if (ddix->in_index ==  ix->mi_index_no) {
+						const char *ix_name = 
+							ddix->co_name ? ddix->co_name : ddix->co_ind_name;
+						fk_info->referenced_key_name = thd_make_lex_string(thd, 0,
+							ix_name, (uint) strlen(ix_name), 1);
+						break;
+					}
+				}
+			}
+
+			action = XTDDForeignKey::actionTypeToString(fk->fk_on_delete);
+			fk_info->delete_method = thd_make_lex_string(thd, 0,
+				action, (uint) strlen(action), 1);
+			action = XTDDForeignKey::actionTypeToString(fk->fk_on_update);
+			fk_info->update_method = thd_make_lex_string(thd, 0,
+				action, (uint) strlen(action), 1);
+
+			/* Columns of this table that make up the foreign key. */
+			const XTList<XTDDColumnRef>& cols = fk->co_cols;
+			for (int j = 0, sz2 = cols.size(); j < sz2; j++) {
+				XTDDColumnRef *col_ref= cols.itemAt(j);
+				fk_info->foreign_fields.push_back(thd_make_lex_string(thd, 0,
+					col_ref->cr_col_name, (uint) strlen(col_ref->cr_col_name), 1));
+			}
+
+			/* Columns of the referenced table. */
+			const XTList<XTDDColumnRef>& ref_cols = fk->fk_ref_cols;
+			for (int j = 0, sz2 = ref_cols.size(); j < sz2; j++) {
+				XTDDColumnRef *col_ref= ref_cols.itemAt(j);
+				fk_info->referenced_fields.push_back(thd_make_lex_string(thd, 0,
+					col_ref->cr_col_name, (uint) strlen(col_ref->cr_col_name), 1));
+			}
+
+			f_key_list->push_back(fk_info);
+		}
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, pb_ignore_dup_key);
+	}
+	cont_(a);
+
+	return err; 
+}
+
+/*
+ * Report whether other tables reference this one through a foreign key:
+ * 1 if the table's dictionary definition has a list of referencing tables
+ * (dt_trefs), 0 otherwise or when the table has no dictionary definition.
+ */
+uint ha_pbxt::referenced_by_foreign_key()
+{
+	XTDDTable *dic_table = pb_open_tab->ot_table->tab_dic.dic_table;
+
+	/* Check the list of referencing tables: */
+	if (dic_table && dic_table->dt_trefs)
+		return 1;
+	return 0;
+}
+
+
+/*
+ * Local definition of the system variable structure, consisting only of
+ * the common plugin variable header. pbxt_record_cache_size_func() reads
+ * var->flags through it (presumably a member supplied by
+ * MYSQL_PLUGIN_VAR_HEADER - confirm in plugin.h).
+ */
+struct st_mysql_sys_var
+{
+	MYSQL_PLUGIN_VAR_HEADER;
+};
+
+/*
+ * USE_CONST_SAVE selects the signature of the variable update callback:
+ * it is defined for 5.1.24 and later 5.x servers, and for 6.0.5 and later,
+ * where the 'save' argument is a const pointer.
+ */
+#if MYSQL_VERSION_ID < 60000
+#if MYSQL_VERSION_ID >= 50124
+#define USE_CONST_SAVE
+#endif
+#else
+#if MYSQL_VERSION_ID >= 60005
+#define USE_CONST_SAVE
+#endif
+#endif
+
+/*
+ * Update callback of the record_cache_size system variable.
+ *
+ * Stores the new string value in the variable's target (duplicating it and
+ * freeing the previous value when the variable is PLUGIN_VAR_MEMALLOC),
+ * re-evaluates the setting with ha_set_variable() and applies the result
+ * with xt_tc_set_cache_size().
+ *
+ * thd  - unused
+ * var  - the variable descriptor; only its flags are examined
+ * tgt  - address of the char * that holds the variable's value
+ * save - address of the char * that holds the new value
+ */
+#ifdef USE_CONST_SAVE
+static void pbxt_record_cache_size_func(THD *XT_UNUSED(thd), struct st_mysql_sys_var *var, void *tgt, const void *save)
+#else
+static void pbxt_record_cache_size_func(THD *XT_UNUSED(thd), struct st_mysql_sys_var *var, void *tgt, void *save)
+#endif
+{
+	xtInt8	record_cache_size;
+
+	char *old= *(char **) tgt;
+	*(char **)tgt= *(char **) save;
+	if (var->flags & PLUGIN_VAR_MEMALLOC)
+	{
+		/* The variable owns its value: keep a private copy and release
+		 * the previous one.
+		 */
+		*(char **)tgt= my_strdup(*(char **) save, MYF(0));
+		my_free(old, MYF(0));
+	}
+	record_cache_size = ha_set_variable(&pbxt_record_cache_size, &vp_record_cache_size);
+	xt_tc_set_cache_size((size_t) record_cache_size);
+#ifdef DEBUG
+	/* Log with a literal format string: no intermediate fixed-size buffer,
+	 * and no formatted text is ever interpreted as a format.
+	 */
+	xt_logf(XT_NT_INFO, "pbxt_record_cache_size=%llu\n", (u_llong) record_cache_size);
+#endif
+}
+
+#ifndef DRIZZLED
+/* Plugin info records referenced by the plugin declarations below: one for
+ * the storage engine, one for the PBXT_STATISTICS information schema
+ * plugin. Each carries only the corresponding interface version.
+ */
+struct st_mysql_storage_engine pbxt_storage_engine = {
+	MYSQL_HANDLERTON_INTERFACE_VERSION
+};
+static st_mysql_information_schema pbxt_statitics = {
+	MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION
+};
+#endif
+
+/*
+ * System variables of the PBXT plugin (only declared for servers from
+ * version 5.1.18 on). The size and threshold settings are read-only string
+ * variables; the remaining ones are integer or boolean variables with the
+ * default, minimum, maximum and block size given in their declaration.
+ */
+#if MYSQL_VERSION_ID >= 50118
+static MYSQL_SYSVAR_STR(index_cache_size, pbxt_index_cache_size,
+  PLUGIN_VAR_READONLY,
+  "The amount of memory allocated to the index cache, used only to cache index data.",
+  NULL, NULL, NULL);
+
+/* The only variable with an update callback (pbxt_record_cache_size_func),
+ * although it is currently declared read-only.
+ */
+static MYSQL_SYSVAR_STR(record_cache_size, pbxt_record_cache_size,
+  PLUGIN_VAR_READONLY, // PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
+  "The amount of memory allocated to the record cache used to cache table data.",
+  NULL, pbxt_record_cache_size_func, NULL);
+
+static MYSQL_SYSVAR_STR(log_cache_size, pbxt_log_cache_size,
+  PLUGIN_VAR_READONLY,
+  "The amount of memory allocated to the transaction log cache used to cache transaction log data.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(log_file_threshold, pbxt_log_file_threshold,
+  PLUGIN_VAR_READONLY,
+  "The size of a transaction log before rollover, and a new log is created.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(transaction_buffer_size, pbxt_transaction_buffer_size,
+  PLUGIN_VAR_READONLY,
+  "The size of the global transaction log buffer (the engine allocates 2 buffers of this size).",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(log_buffer_size, pbxt_log_buffer_size,
+  PLUGIN_VAR_READONLY,
+  "The size of the buffer used to cache data from transaction and data logs during sequential scans, or when writing a data log.",
+  NULL, NULL, NULL);
+
+/* The help text used to be a copy of a per-thread buffer description that
+ * has nothing to do with checkpoints.
+ */
+static MYSQL_SYSVAR_STR(checkpoint_frequency, pbxt_checkpoint_frequency,
+  PLUGIN_VAR_READONLY,
+  "The amount of data written to the transaction log before a checkpoint is performed.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(data_log_threshold, pbxt_data_log_threshold,
+  PLUGIN_VAR_READONLY,
+  "The maximum size of a data log file.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(data_file_grow_size, pbxt_data_file_grow_size,
+  PLUGIN_VAR_READONLY,
+  "The amount by which the handle data files (.xtd) grow.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(row_file_grow_size, pbxt_row_file_grow_size,
+  PLUGIN_VAR_READONLY,
+  "The amount by which the row pointer files (.xtr) grow.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_INT(garbage_threshold, xt_db_garbage_threshold,
+	PLUGIN_VAR_OPCMDARG,
+	"The percentage of garbage in a repository file before it is compacted.",
+	NULL, NULL, XT_DL_DEFAULT_GARBAGE_LEVEL, 0, 100, 1);
+
+static MYSQL_SYSVAR_INT(log_file_count, xt_db_log_file_count,
+	PLUGIN_VAR_OPCMDARG,
+	"The minimum number of transaction logs used.",
+	NULL, NULL, XT_DL_DEFAULT_XLOG_COUNT, 1, 20000, 1);
+
+static MYSQL_SYSVAR_INT(auto_increment_mode, xt_db_auto_increment_mode,
+	PLUGIN_VAR_OPCMDARG,
+	"The auto-increment mode, 0 = MySQL standard (default), 1 = previous ID's never reused.",
+	NULL, NULL, XT_AUTO_INCREMENT_DEF, 0, 1, 1);
+
+/* {RN145} */
+static MYSQL_SYSVAR_INT(offline_log_function, xt_db_offline_log_function,
+	PLUGIN_VAR_OPCMDARG,
+	"Determines what happens to transaction logs when the are moved offline, 0 = recycle logs (default), 1 = delete logs (default on Mac OS X), 2 = keep logs.",
+	NULL, NULL, XT_OFFLINE_LOG_FUNCTION_DEF, 0, 2, 1);
+
+/* {RN150} */
+static MYSQL_SYSVAR_INT(sweeper_priority, xt_db_sweeper_priority,
+	PLUGIN_VAR_OPCMDARG,
+	"Determines the priority of the background sweeper process, 0 = low (default), 1 = normal (same as user threads), 2 = high.",
+	NULL, NULL, XT_PRIORITY_LOW, XT_PRIORITY_LOW, XT_PRIORITY_HIGH, 1);
+
+/* Drizzle uses a fixed default of 500 threads (minimum 20); for MySQL the
+ * default of 0 means the value is derived from max_connections.
+ */
+#ifdef DRIZZLED
+static MYSQL_SYSVAR_INT(max_threads, pbxt_max_threads,
+	PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+	"The maximum number of threads used by PBXT",
+	NULL, NULL, 500, 20, 20000, 1);
+#else
+static MYSQL_SYSVAR_INT(max_threads, pbxt_max_threads,
+	PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+	"The maximum number of threads used by PBXT, 0 = set according to MySQL max_connections.",
+	NULL, NULL, 0, 0, 20000, 1);
+#endif
+
+/* XA support defaults to enabled, except in DEBUG builds. */
+#ifndef DEBUG
+static MYSQL_SYSVAR_BOOL(support_xa, pbxt_support_xa,
+	PLUGIN_VAR_OPCMDARG,
+	"Enable PBXT support for the XA two-phase commit, default is enabled",
+	NULL, NULL, TRUE);
+#else
+static MYSQL_SYSVAR_BOOL(support_xa, pbxt_support_xa,
+	PLUGIN_VAR_OPCMDARG,
+	"Enable PBXT support for the XA two-phase commit, default is disabled (due to assertion failure in MySQL)",
+	/* The problem is, in MySQL an assertion fails in debug mode: 
+	 * Assertion failed: (total_ha_2pc == (ulong) opt_bin_log+1), function ha_recover, file handler.cc, line 1557.
+     */
+	NULL, NULL, FALSE);
+#endif
+
+static MYSQL_SYSVAR_INT(flush_log_at_trx_commit, xt_db_flush_log_at_trx_commit,
+	PLUGIN_VAR_OPCMDARG,
+	"Determines whether the transaction log is written and/or flushed when a transaction is committed (no matter what the setting the log is written and flushed once per second), 0 = no write & no flush, 1 = write & flush (default), 2 = write & no flush.",
+	NULL, NULL, 1, 0, 2, 1);
+
+/* NULL-terminated list of all variables declared above; referenced by the
+ * plugin declaration.
+ */
+static struct st_mysql_sys_var* pbxt_system_variables[] = {
+  MYSQL_SYSVAR(index_cache_size),
+  MYSQL_SYSVAR(record_cache_size),
+  MYSQL_SYSVAR(log_cache_size),
+  MYSQL_SYSVAR(log_file_threshold),
+  MYSQL_SYSVAR(transaction_buffer_size),
+  MYSQL_SYSVAR(log_buffer_size),
+  MYSQL_SYSVAR(checkpoint_frequency),
+  MYSQL_SYSVAR(data_log_threshold),
+  MYSQL_SYSVAR(data_file_grow_size),
+  MYSQL_SYSVAR(row_file_grow_size),
+  MYSQL_SYSVAR(garbage_threshold),
+  MYSQL_SYSVAR(log_file_count),
+  MYSQL_SYSVAR(auto_increment_mode),
+  MYSQL_SYSVAR(offline_log_function),
+  MYSQL_SYSVAR(sweeper_priority),
+  MYSQL_SYSVAR(max_threads),
+  MYSQL_SYSVAR(support_xa),
+  MYSQL_SYSVAR(flush_log_at_trx_commit),
+  NULL
+};
+#endif
+
+/*
+ * Plugin descriptors. Two plugins are declared: the PBXT storage engine and
+ * the PBXT_STATISTICS information schema plugin. The first declaration
+ * serves Drizzle or MySQL, depending on DRIZZLED; for MariaDB (version
+ * above 5.2) a second declaration follows which additionally carries a
+ * version string and a maturity level.
+ */
+#ifdef DRIZZLED
+drizzle_declare_plugin(pbxt)
+#else
+mysql_declare_plugin(pbxt)
+#endif
+{
+#ifndef DRIZZLED
+	MYSQL_STORAGE_ENGINE_PLUGIN,
+	&pbxt_storage_engine,
+#endif
+	"PBXT",
+#ifdef DRIZZLED
+	"1.0",
+#endif
+	"Paul McCullagh, PrimeBase Technologies GmbH",
+	"High performance, multi-versioning transactional engine",
+	PLUGIN_LICENSE_GPL,
+	pbxt_init, /* Plugin Init */
+	pbxt_end, /* Plugin Deinit */
+#ifndef DRIZZLED
+	0x0001 /* 0.1 */,
+#endif
+	NULL,                       /* status variables                */
+#if MYSQL_VERSION_ID >= 50118
+	pbxt_system_variables,		/* system variables                */
+#else
+	NULL,
+#endif
+	NULL						/* config options                  */
+},
+{
+#ifndef DRIZZLED
+	MYSQL_INFORMATION_SCHEMA_PLUGIN,
+	&pbxt_statitics,
+#endif
+	"PBXT_STATISTICS",
+#ifdef DRIZZLED
+	"1.0",
+#endif
+	"Paul McCullagh, PrimeBase Technologies GmbH",
+	"PBXT internal system statitics",
+	PLUGIN_LICENSE_GPL,
+	pbxt_init_statistics,						/* plugin init */
+	pbxt_exit_statistics,						/* plugin deinit */
+#ifndef DRIZZLED
+	0x0005,
+#endif
+	NULL,										/* status variables */
+	NULL,										/* system variables */
+	NULL										/* config options */
+}
+#ifdef DRIZZLED
+drizzle_declare_plugin_end;
+#else
+mysql_declare_plugin_end;
+/* MariaDB specific declaration of the same two plugins. */
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID > 50200
+maria_declare_plugin(pbxt)
+{ /* PBXT */
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &pbxt_storage_engine,
+  "PBXT",
+  "Paul McCullagh, PrimeBase Technologies GmbH",
+  "High performance, multi-versioning transactional engine",
+  PLUGIN_LICENSE_GPL,
+  pbxt_init, /* Plugin Init */
+  pbxt_end, /* Plugin Deinit */
+  0x0001 /* 0.1 */,
+  NULL,                       /* status variables */
+  pbxt_system_variables,      /* system variables */
+  "1.0.11-7 Pre-GA",              /* string version */
+  MariaDB_PLUGIN_MATURITY_GAMMA /* maturity */
+},
+{ /* PBXT_STATISTICS */
+  MYSQL_INFORMATION_SCHEMA_PLUGIN,
+  &pbxt_statitics,
+  "PBXT_STATISTICS",
+  "Paul McCullagh, PrimeBase Technologies GmbH",
+  "PBXT internal system statitics",
+  PLUGIN_LICENSE_GPL,
+  pbxt_init_statistics,       /* plugin init */
+  pbxt_exit_statistics,       /* plugin deinit */
+  0x0005,
+  NULL,                       /* status variables */
+  NULL,                       /* system variables */
+  "1.0.11-7 Pre-GA",          /* string version */
+  MariaDB_PLUGIN_MATURITY_GAMMA /* maturity */
+}
+maria_declare_plugin_end;
+#endif
+#endif
+
+#if defined(XT_WIN) && defined(XT_COREDUMP)
+
+/*
+ * WINDOWS CORE DUMP SUPPORT
+ *
+ * MySQL supports core dumping on Windows with --core-file command line option. 
+ * However it creates dumps with the MiniDumpNormal option which saves only stack traces.
+ *
+ * We instead (or in addition) create dumps with the MiniDumpWithPrivateReadWriteMemory
+ * option (this is the dump type core_dump() passes to MiniDumpWriteDump), which saves
+ * more information. To enable core dumping, define XT_COREDUMP at compile time.
+ * In addition, pbxt_crash_debug must be set to TRUE which is the case if XT_CRASH_DEBUG
+ * is defined.
+ * This switch is also controlled by creating a file called "no-debug" or "crash-debug"
+ * in the pbxt database directory.
+ */
+
+/*
+ * Local declarations of the types needed to call MiniDumpWriteDump(), which
+ * core_dump() looks up in DBGHELP.DLL at run time with GetProcAddress().
+ */
+
+/* Dump type flags; core_dump() uses MiniDumpWithPrivateReadWriteMemory. */
+typedef enum _MINIDUMP_TYPE {
+    MiniDumpNormal                         = 0x0000,
+    MiniDumpWithDataSegs                   = 0x0001,
+    MiniDumpWithFullMemory                 = 0x0002,
+    MiniDumpWithHandleData                 = 0x0004,
+    MiniDumpFilterMemory                   = 0x0008,
+    MiniDumpScanMemory                     = 0x0010,
+    MiniDumpWithUnloadedModules            = 0x0020,
+    MiniDumpWithIndirectlyReferencedMemory = 0x0040,
+    MiniDumpFilterModulePaths              = 0x0080,
+    MiniDumpWithProcessThreadData          = 0x0100,
+    MiniDumpWithPrivateReadWriteMemory     = 0x0200,
+} MINIDUMP_TYPE;
+
+/* Exception details handed to MiniDumpWriteDump() when a dump is written
+ * from the crash filter.
+ */
+typedef struct _MINIDUMP_EXCEPTION_INFORMATION {
+    DWORD ThreadId;
+    PEXCEPTION_POINTERS ExceptionPointers;
+    BOOL ClientPointers;
+} MINIDUMP_EXCEPTION_INFORMATION, *PMINIDUMP_EXCEPTION_INFORMATION;
+
+/* Function pointer type for the MiniDumpWriteDump() entry point. */
+typedef BOOL (WINAPI *MINIDUMPWRITEDUMP)(
+	HANDLE hProcess, 
+	DWORD dwPid, 
+	HANDLE hFile, 
+	MINIDUMP_TYPE DumpType,
+	void *ExceptionParam,
+	void *UserStreamParam,
+	void *CallbackParam
+	);
+
+/* base_path: prefix used for both the DBGHELP.DLL lookup and the dump file
+ * name. dump_path: receives the name of the dump file being written.
+ */
+char base_path[_MAX_PATH] = {0};
+char dump_path[_MAX_PATH] = {0};
+
+/*
+ * Write a minidump of the current process to the first unused file named
+ * "<base_path>PBXTCore<NNNNNNNN>.dmp" (the full name is left in dump_path).
+ *
+ * pExceptionInfo - exception details to include in the dump, or NULL to
+ *                  write a dump without exception information.
+ *
+ * DBGHELP.DLL is loaded from base_path if possible, otherwise through the
+ * default library search. The function returns silently if the library or
+ * MiniDumpWriteDump() cannot be found, if a path would not fit into its
+ * buffer, or if no dump file can be created.
+ */
+void core_dump(struct _EXCEPTION_POINTERS *pExceptionInfo)
+{
+	char				dll_path[_MAX_PATH];
+	int					len;
+	int					i;
+	HMODULE				hDll = NULL;
+	HANDLE				hFile = INVALID_HANDLE_VALUE;
+	MINIDUMPWRITEDUMP	pDump;
+
+	MINIDUMP_EXCEPTION_INFORMATION ExInfo, *ExInfoPtr = NULL;
+
+	if (pExceptionInfo) {
+		ExInfo.ThreadId = GetCurrentThreadId();
+		ExInfo.ExceptionPointers = pExceptionInfo;
+		ExInfo.ClientPointers = FALSE;
+		ExInfoPtr = &ExInfo;
+	}
+
+	/* Build the library path in a local buffer: the global base_path is
+	 * left untouched and a long base_path cannot overflow it. _snprintf()
+	 * does not terminate the buffer when the text does not fit, so the
+	 * result is only used when it is shorter than the buffer.
+	 */
+	len = _snprintf(dll_path, sizeof(dll_path), "%sDBGHELP.DLL", base_path);
+	if (len > 0 && (size_t) len < sizeof(dll_path))
+		hDll = LoadLibrary(dll_path);
+	if (hDll == NULL)
+		hDll = LoadLibrary("DBGHELP.DLL");
+	if (hDll == NULL)
+		return;
+
+	pDump = (MINIDUMPWRITEDUMP) GetProcAddress(hDll, "MiniDumpWriteDump");
+	if (!pDump)
+		return;
+
+	/* Find the first dump file name that is not in use yet: CREATE_NEW
+	 * fails with ERROR_FILE_EXISTS for names that are already taken.
+	 */
+	for (i = 1; i < INT_MAX; i++) {
+		len = _snprintf(dump_path, sizeof(dump_path), "%sPBXTCore%08d.dmp", base_path, i);
+		if (len < 0 || (size_t) len >= sizeof(dump_path))
+			return;
+
+		hFile = CreateFile( dump_path, GENERIC_WRITE, FILE_SHARE_WRITE, NULL, CREATE_NEW,
+							FILE_ATTRIBUTE_NORMAL, NULL );
+
+		if ( hFile != INVALID_HANDLE_VALUE )
+			break;
+
+		if (HRESULT_CODE(GetLastError()) == ERROR_FILE_EXISTS )
+			continue;
+
+		return;
+	}
+
+	/* Every candidate name was taken. */
+	if (hFile == INVALID_HANDLE_VALUE)
+		return;
+
+	// write the dump (best effort: there is nobody to report a failure to)
+	pDump( GetCurrentProcess(), GetCurrentProcessId(), hFile, 
+		MiniDumpWithPrivateReadWriteMemory, ExInfoPtr, NULL, NULL );
+
+	CloseHandle(hFile);
+}
+
+/*
+ * Unhandled exception filter: writes a core dump for the exception and
+ * returns EXCEPTION_EXECUTE_HANDLER.
+ *
+ * NOTE(review): the function is declared without WINAPI but is cast to
+ * LPTOP_LEVEL_EXCEPTION_FILTER in register_crash_filter() - confirm that
+ * the calling convention matches on 32-bit builds.
+ */
+LONG crash_filter( struct _EXCEPTION_POINTERS *pExceptionInfo )
+{
+	core_dump(pExceptionInfo);
+	return EXCEPTION_EXECUTE_HANDLER;
+}
+
+/*
+ * Install crash_filter() as the process-wide unhandled exception filter.
+ * The previously installed filter returned by SetUnhandledExceptionFilter()
+ * is discarded.
+ */
+void register_crash_filter()
+{
+	SetUnhandledExceptionFilter( (LPTOP_LEVEL_EXCEPTION_FILTER) crash_filter );
+}
+
+#endif // XT_WIN && XT_COREDUMP
diff --git a/storage/pbxt/src/ha_pbxt.h b/storage/pbxt/src/ha_pbxt.h
new file mode 100644
index 00000000000..a7548d6fa86
--- /dev/null
+++ b/storage/pbxt/src/ha_pbxt.h
@@ -0,0 +1,354 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * Derived from ha_example.h
+ * Copyright (C) 2003 MySQL AB
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA	02111-1307	USA
+ *
+ * 2005-11-10	Paul McCullagh
+ *
+ */
+#ifndef __ha_pbxt_h__
+#define __ha_pbxt_h__
+
+#ifdef DRIZZLED
+#include <drizzled/common.h>
+#include <mysys/thr_lock.h>
+#include <drizzled/cursor.h>
+
+#else
+#include "mysql_priv.h"
+#endif
+
+#include "xt_defs.h"
+#include "table_xt.h"
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+#if MYSQL_VERSION_ID <= 50120
+#define thd_killed(t)		(t)->killed		/* plain field access under the thd_killed() name */
+#endif
+
+#if MYSQL_VERSION_ID >= 50120				/* NOTE(review): 50120 itself satisfies this test and the one above */
+#define byte uchar							/* 'byte' in the declarations below then means uchar */
+#endif
+
+class ha_pbxt;
+
+#ifdef DRIZZLED
+
+class PBXTStorageEngine : public drizzled::plugin::StorageEngine
+{
+	/* Private; ha_pbxt declares the same two helpers when DRIZZLED is not defined. */
+	int delete_system_table(const char *table_path);
+	int rename_system_table(const char * from, const char * to);
+
+public:
+	PBXTStorageEngine(std::string name_arg)
+	: drizzled::plugin::StorageEngine(name_arg, HTON_NO_FLAGS) {}
+
+	/* Empty bodies: 'delete' on an engine object does not release its memory. */
+	void operator delete(void *) {}
+	void operator delete[] (void *) {}
+
+	/* override */ int close_connection(Session *);
+	/* override */ int commit(Session *, bool);
+	/* override */ int rollback(Session *, bool);
+	/* override */ Cursor *create(TABLE_SHARE *, MEM_ROOT *);
+	/* override */ void drop_database(char *);
+	/* override */ bool show_status(Session *, stat_print_fn *, enum ha_stat_type);
+	/* override */ const char **bas_ext() const;
+	/* override */ int doCreateTable(Session *session, const char *table_name, Table &table_arg,
+				HA_CREATE_INFO &create_info, drizzled::message::Table &proto);
+	/* override */ int doRenameTable(Session *, const char *from, const char *to);
+	/* override */ int doDropTable(Session &session, std::string table_path);
+};
+
+typedef PBXTStorageEngine handlerton;
+
+#endif
+
+extern handlerton *pbxt_hton;
+
+/*
+ * XTShareRec is a structure that is shared among all open handlers of a table.
+ */
+typedef struct XTShare {
+	XTPathStrPtr		sh_table_path;
+	uint				sh_use_count;
+
+	XTTableHPtr			sh_table;				/* This is a XTTableHPtr, a reference to the XT internal table handle. */
+
+	uint				sh_dic_key_count;
+	XTIndexPtr			*sh_dic_keys;			/* A reference to the XT internal index list. */
+	xtBool				sh_recalc_selectivity;	/* This is set to TRUE if we have < 100 rows when the table is opened. */
+
+	/* We use a trick here to get an exclusive lock
+	 * on a table. The trick avoids having to use a
+	 * semaphore if a thread does not want
+	 * exclusive use.
+	 */
+	xt_mutex_type		*sh_ex_mutex;
+	xt_cond_type		*sh_ex_cond;
+	xtBool				sh_table_lock;			/* Set to TRUE if a lock on the table is held. */
+	ha_pbxt				*sh_handlers;			/* Doubly linked list of handlers for a particular table. */
+	xtWord8				sh_min_auto_inc;		/* Used to propagate the current auto-inc over a DELETE FROM
+												 * (does not work if the server shuts down in between!).
+												 */
+
+	THR_LOCK			sh_lock;				/* MySQL lock */
+} XTShareRec, *XTSharePtr;
+
+/*
+ * Class definition for the storage engine
+ */
+class ha_pbxt: public handler
+{
+	public:
+	XTSharePtr			pb_share;				/* Shared table info */
+
+	XTOpenTablePtr		pb_open_tab;			/* This is a XTOpenTablePtr (a reference to the XT internal table handle)! */
+
+	xtBool				pb_key_read;			/* No need to retrieve the entire row, index values are sufficient. */
+	int					pb_ignore_dup_key;
+	u_int				pb_ind_row_count;
+
+	THR_LOCK_DATA		pb_lock;				/* MySQL lock */
+
+	ha_pbxt				*pb_ex_next;			/* Doubly linked list of handlers for a particular table (next/prev). */
+	ha_pbxt				*pb_ex_prev;
+
+	xtBool				pb_lock_table;			/* The operation requires a table lock. */
+	int					pb_table_locked;		/* TRUE if this handler holds the table lock. */
+	int					pb_ex_in_use;			/* Set to 1 while the handler is in use. */
+
+	THD					*pb_mysql_thd;			/* A pointer to the MySQL thread. */
+	xtBool				pb_in_stat;				/* TRUE if start_stmt() was issued */
+
+	ha_pbxt(handlerton *hton, TABLE_SHARE *table_arg);
+
+	virtual ~ha_pbxt() { }
+
+	/* The name that will be used for display purposes */
+	const char *table_type() const { return "PBXT"; }
+
+	/*
+	 * The name of the index type that will be used for display.
+	 * Don't implement this method unless you really have indexes.
+	 */
+	const char *index_type(uint inx) { (void) inx; return "BTREE"; }
+#ifndef DRIZZLED
+	const char **bas_ext() const;
+#endif
+	MX_UINT8_T table_cache_type();
+
+	/*
+	 * This is a list of flags that says what the storage engine
+	 * implements. The current table flags are documented in
+	 * handler.h
+	 */
+	MX_TABLE_TYPES_T table_flags() const;
+
+	/*
+	 * part is the key part to check. First key part is 0.
+	 * If all_parts is set, MySQL wants to know the flags for the combined
+	 * index up to and including 'part'.
+	 */
+	MX_ULONG_T index_flags(uint inx, uint part, bool all_parts) const;
+
+	/*
+	 * unireg.cc will call the following to make sure that the storage engine can
+	 * handle the data it is about to send.
+	 * 
+	 * Return *real* limits of your storage engine here. MySQL will do
+	 * min(your_limits, MySQL_limits) automatically
+	 * 
+	 * Theoretically PBXT supports any number of key parts, etc.
+	 * Practically this is not true of course.
+	 */
+	uint	max_supported_record_length()	const { return UINT_MAX; }
+	uint	max_supported_keys()			const { return 512; }
+	uint	max_supported_key_parts()		const { return 128; }
+	uint	max_supported_key_length()		const;
+	uint	max_supported_key_part_length() const;
+
+	double	scan_time();
+
+	double	read_time(uint index, uint ranges, ha_rows rows);
+
+  	bool	has_transactions()  { return 1; }
+
+	/*
+	 * Everything below are methods that we implement in ha_pbxt.cc.
+	 */
+	void	internal_close(THD *thd, struct XTThread *self);
+	int		open(const char *name, int mode, uint test_if_locked);		// required
+	int		reopen(void);
+	int		close(void);												// required
+
+	void	init_auto_increment(xtWord8 min_auto_inc);
+	void	get_auto_increment(MX_ULONGLONG_T offset, MX_ULONGLONG_T increment,
+                                 MX_ULONGLONG_T nb_desired_values,
+                                 MX_ULONGLONG_T *first_value,
+                                 MX_ULONGLONG_T *nb_reserved_values);
+	void	set_auto_increment(Field *nr);
+
+	int		write_row(byte * buf);
+	int		update_row(const byte * old_data, byte * new_data);
+	int		delete_row(const byte * buf);
+
+	/* Index access functions: */
+	int		xt_index_in_range(register XTOpenTablePtr ot, register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, byte *buf);
+	int		xt_index_next_read(register XTOpenTablePtr ot, register XTIndexPtr ind, xtBool key_only, register XTIdxSearchKeyPtr search_key, byte *buf);
+	int		xt_index_prev_read(XTOpenTablePtr ot, XTIndexPtr ind, xtBool key_only, register XTIdxSearchKeyPtr search_key, byte *buf);
+	int		index_init(uint idx, bool sorted);
+	int		index_end();
+	int		index_read(byte * buf, const byte * key,
+								 uint key_len, enum ha_rkey_function find_flag);
+	int		index_read_idx(byte * buf, uint idx, const byte * key,
+										 uint key_len, enum ha_rkey_function find_flag);
+	int		index_read_xt(byte * buf, uint idx, const byte * key,
+										 uint key_len, enum ha_rkey_function find_flag);
+	int		index_next(byte * buf);
+	int		index_next_same(byte * buf, const byte *key, uint length);
+	int		index_prev(byte * buf);
+	int		index_first(byte * buf);
+	int		index_last(byte * buf);
+	int		index_read_last(byte * buf, const byte * key, uint key_len);
+
+	/* Sequential scan functions: */
+	int		rnd_init(bool scan);								//required
+	int		rnd_end();
+	int		rnd_next(byte *buf);								//required
+	int		rnd_pos(byte * buf, byte *pos);													 //required
+	void	position(const byte *record);			//required
+#if MYSQL_VERSION_ID < 50114
+	void	info(uint);
+#else
+	int		info(uint);
+#endif
+
+	int		extra(enum ha_extra_function operation);
+	int		reset(void);
+	int		external_lock(THD *thd, int lock_type);									 //required
+	int		start_stmt(THD *thd, thr_lock_type lock_type);
+	void	unlock_row();
+	int		delete_all_rows(void);
+	int		repair(THD* thd, HA_CHECK_OPT* check_opt);
+	int		analyze(THD* thd, HA_CHECK_OPT* check_opt);
+	int		optimize(THD* thd, HA_CHECK_OPT* check_opt);
+	int		check(THD* thd, HA_CHECK_OPT* check_opt);
+	ha_rows	records_in_range(uint inx, key_range *min_key, key_range *max_key);
+#ifndef DRIZZLED
+	int		delete_system_table(const char *table_path);
+	int		delete_table(const char *from);
+	int		rename_system_table(const char * from, const char * to);
+	int		rename_table(const char * from, const char * to);
+	int		create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);				//required
+#endif
+	void	update_create_info(HA_CREATE_INFO *create_info);
+
+	THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type);		 //required
+
+	/* Foreign key support: */
+	//bool is_fk_defined_on_table_or_index(uint index);
+	char* get_foreign_key_create_info();
+	int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list);
+	//bool can_switch_engines();
+	uint referenced_by_foreign_key();
+	void free_foreign_key_create_info(char* str);
+
+	virtual bool get_error_message(int error, String *buf);
+};
+
+/* From ha_pbxt.cc: */
+#define XT_TAB_NAME_WITH_EXT_SIZE	(XT_TABLE_NAME_SIZE+4)	/* parenthesized so the sum stays intact inside larger expressions */
+
+class THD;
+struct XTThread;
+struct XTDatabase;
+
+void			xt_ha_unlock_table(struct XTThread	*self, void *share);
+void			xt_ha_close_global_database(XTThreadPtr self);
+void			xt_ha_open_database_of_table(struct XTThread *self, XTPathStrPtr table_path);
+struct XTThread	*xt_ha_set_current_thread(THD *thd, XTExceptionPtr e);
+void			xt_ha_close_connection(THD* thd);
+struct XTThread	*xt_ha_thd_to_self(THD* thd);
+int				xt_ha_pbxt_to_mysql_error(int xt_err);
+int				xt_ha_pbxt_thread_error_for_mysql(THD *thd, const XTThreadPtr self, int ignore_dup_key);
+void			xt_ha_all_threads_close_database(XTThreadPtr self, XTDatabase *db);
+void			ha_set_auto_increment(XTOpenTablePtr ot, Field *nr);
+
+/*
+ * These hooks are supposed to only be used by InnoDB:
+ */
+#ifndef DRIZZLED
+#ifdef INNODB_COMPATIBILITY_HOOKS
+extern "C" struct charset_info_st *thd_charset(MYSQL_THD thd);
+extern "C" char **thd_query(MYSQL_THD thd);
+extern "C" int thd_slave_thread(const MYSQL_THD thd);
+extern "C" int thd_non_transactional_update(const MYSQL_THD thd);
+extern "C" int thd_binlog_format(const MYSQL_THD thd);
+extern "C" void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
+#else
+#define thd_charset(t)						(t)->charset()
+#define thd_query(t)						&(t)->query
+#define thd_slave_thread(t)					(t)->slave_thread
+#define thd_non_transactional_update(t)		(t)->transaction.all.modified_non_trans_table
+#define thd_binlog_format(t)				(t)->variables.binlog_format
+#define thd_mark_transaction_to_rollback(t)	mark_transaction_to_rollback(t, all)	/* NOTE(review): 'all' is not a macro parameter, so it must be a name visible at the call site, while the extern "C" variant takes it as a second argument — confirm this is intended */
+#endif // INNODB_COMPATIBILITY_HOOKS
+#endif /* !DRIZZLED */
+
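+/* myxt_mutex_lock(), myxt_mutex_unlock() and myxt_mutex_t map onto the mutex flavour selected by SAFE_MUTEX and MY_PTHREAD_FASTMUTEX. */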
+#ifdef SAFE_MUTEX
+
+#if MYSQL_VERSION_ID < 60000
+#if MYSQL_VERSION_ID < 50123
+#define myxt_mutex_lock(x)		safe_mutex_lock(x,__FILE__,__LINE__)
+#else
+#define myxt_mutex_lock(x)		safe_mutex_lock(x,0,__FILE__,__LINE__)
+#endif
+#else
+#if MYSQL_VERSION_ID < 60004
+#define myxt_mutex_lock(x)		safe_mutex_lock(x,__FILE__,__LINE__)
+#else
+#define myxt_mutex_lock(x)		safe_mutex_lock(x,0,__FILE__,__LINE__)
+#endif
+#endif
+
+#define myxt_mutex_t			safe_mutex_t
+#define myxt_mutex_unlock(x)	safe_mutex_unlock(x,__FILE__,__LINE__)
+
+#else // SAFE_MUTEX
+
+#ifdef MY_PTHREAD_FASTMUTEX
+#define myxt_mutex_lock(x)		my_pthread_fastmutex_lock(x)
+#define myxt_mutex_t			my_pthread_fastmutex_t
+#define myxt_mutex_unlock(x)	pthread_mutex_unlock(&(x)->mutex)
+#else
+#define myxt_mutex_lock(x)		pthread_mutex_lock(x)
+#define myxt_mutex_t			pthread_mutex_t
+#define myxt_mutex_unlock(x)	pthread_mutex_unlock(x)
+#endif
+
+#endif // SAFE_MUTEX
+
+#endif
+
diff --git a/storage/pbxt/src/ha_xtsys.cc b/storage/pbxt/src/ha_xtsys.cc
new file mode 100644
index 00000000000..c76f60267be
--- /dev/null
+++ b/storage/pbxt/src/ha_xtsys.cc
@@ -0,0 +1,252 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Paul McCullagh
+ *
+ * 2007-05-20
+ *
+ * H&G2JCtL
+ *
+ * Table handler.
+ *
+ */
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#include "xt_config.h"
+
+#include <stdlib.h>
+#include <time.h>
+
+#ifdef DRIZZLED
+#include <drizzled/server_includes.h>
+#endif
+
+#include "ha_xtsys.h"
+#include "ha_pbxt.h"
+
+#include "strutil_xt.h"
+#include "database_xt.h"
+#include "discover_xt.h"
+#include "systab_xt.h"
+#include "xt_defs.h"
+
+/* Note: mysql_priv.h messes with new, which caused a crash. */
+#ifdef new
+#undef new
+#endif
+
+/*
+ * ---------------------------------------------------------------
+ * HANDLER INTERFACE
+ */
+
+ha_xtsys::ha_xtsys(handlerton *hton, TABLE_SHARE *table_arg):
+handler(hton, table_arg),
+ha_open_tab(NULL)			/* no system table is open yet; open() sets this pointer */
+{
+	init();
+}
+
+/* The only extension is the empty string; NullS ends the list. */
+static const char *ha_xtsys_exts[] = { "", NullS };
+
+/* Return the extension list defined above. */
+const char **ha_xtsys::bas_ext() const
+{
+	const char **exts = ha_xtsys_exts;
+	return exts;
+}
+
+int ha_xtsys::open(const char *table_path, int XT_UNUSED(mode), uint XT_UNUSED(test_if_locked))
+{
+	THD				*thd = current_thd;
+	XTExceptionRec	e;
+	XTThreadPtr		self;
+	int				err = 0;
+	/* Open the system table at table_path; returns 0 or a MySQL error code. */
+	if (!(self = xt_ha_set_current_thread(thd, &e)))
+		return xt_ha_pbxt_to_mysql_error(e.e_xt_err);	/* no PBXT thread: translate the error left in 'e' */
+
+	try_(a) {
+		xt_ha_open_database_of_table(self, (XTPathStrPtr) table_path);
+
+		ha_open_tab = XTSystemTableShare::openSystemTable(self, table_path, table);
+		thr_lock_data_init(ha_open_tab->ost_share->sts_my_lock, &ha_lock, NULL);	/* ha_lock is what store_lock() hands out */
+		ref_length = ha_open_tab->getRefLen();
+	}
+	catch_(a) {
+		err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+		if (ha_open_tab) {						/* undo a partially completed open */
+			ha_open_tab->release(self);
+			ha_open_tab = NULL;
+		}
+	}
+	cont_(a);
+
+	return err;
+}
+
+int ha_xtsys::close(void)
+{
+	THD						*thd = current_thd;
+	XTExceptionRec			e;
+	volatile XTThreadPtr	self = NULL;
+	int						err = 0;
+	/* Release the open system table; without a current THD a temporary PBXT thread is created for the job. */
+	if (thd)
+		self = xt_ha_set_current_thread(thd, &e);
+	else {
+		if (!(self = xt_create_thread("TempForClose", FALSE, TRUE, &e))) {
+			xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+			return 0;							/* success is reported although the table was not released */
+		}
+	}
+
+	if (self) {
+		try_(a) {
+			if (ha_open_tab) {
+				ha_open_tab->release(self);
+				ha_open_tab = NULL;
+			}
+		}
+		catch_(a) {
+			err = xt_ha_pbxt_thread_error_for_mysql(thd, self, FALSE);
+		}
+		cont_(a);
+
+		if (!thd)
+			xt_free_thread(self);				/* the thread was created above just for this call */
+	}
+	else
+		xt_log(XT_NS_CONTEXT, XT_LOG_WARNING, "Unable to release table reference\n");
+
+	return err;
+}
+
+int ha_xtsys::rnd_init(bool XT_UNUSED(scan))
+{
+	/* Start a scan with seqScanInit(); when that call fails, the error is
+	 * converted to a MySQL error code for the current thread.
+	 */
+	if (ha_open_tab->seqScanInit())
+		return 0;
+	return xt_ha_pbxt_thread_error_for_mysql(current_thd, xt_get_self(), FALSE);
+}
+
+int ha_xtsys::rnd_next(byte *buf)
+{
+	bool at_end;
+
+	/* Let seqScanNext() place the next row of the scan into buf. */
+	if (ha_open_tab->seqScanNext((char *) buf, &at_end))
+		return 0;
+
+	/* The call failed: either the scan is simply over, or there is an error. */
+	if (at_end)
+		return HA_ERR_END_OF_FILE;
+
+	return xt_ha_pbxt_thread_error_for_mysql(current_thd, xt_get_self(), FALSE);
+}
+
+void ha_xtsys::position(const byte *record)
+{
+	/* Store the 4-byte value seqScanPos() returns for 'record' in 'ref'. */
+	xtWord4 pos_id = ha_open_tab->seqScanPos((xtWord1 *) record);
+	mi_int4store((xtWord1 *) ref, pos_id);
+}
+
+int ha_xtsys::rnd_pos(byte * buf, byte *pos)
+{
+	/* 'pos' carries a 4-byte record id in the format position() writes. */
+	xtWord4 rec_id = mi_uint4korr((xtWord1 *) pos);
+
+	/* Read that record into buf. */
+	if (ha_open_tab->seqScanRead(rec_id, (char *) buf))
+		return 0;
+
+	return xt_ha_pbxt_thread_error_for_mysql(current_thd, xt_get_self(), FALSE);
+}
+
+int ha_xtsys::info(uint XT_UNUSED(flag))
+{
+	return 0;	/* the flag is ignored and nothing is filled in; success is always reported */
+}
+
+int ha_xtsys::external_lock(THD *thd, int lock_type)
+{
+	XTExceptionRec	e;
+	bool			in_order;
+
+	/* The calling THD needs a PBXT thread before the table is touched. */
+	if (!xt_ha_set_current_thread(thd, &e))
+		return xt_ha_pbxt_to_mysql_error(e.e_xt_err);
+
+	/* F_UNLCK ends the use of the open table, every other lock type
+	 * starts it.
+	 */
+	in_order = (lock_type == F_UNLCK) ? ha_open_tab->unuse() : ha_open_tab->use();
+	if (in_order)
+		return 0;
+
+	/* Note: the error is looked up for current_thd and xt_get_self(),
+	 * not for the 'thd' argument. */
+	return xt_ha_pbxt_thread_error_for_mysql(current_thd, xt_get_self(), FALSE);
+}
+
+THR_LOCK_DATA **ha_xtsys::store_lock(THD *XT_UNUSED(thd), THR_LOCK_DATA **to, enum thr_lock_type lock_type)
+{
+	if (ha_lock.type == TL_UNLOCK && lock_type != TL_IGNORE)
+		ha_lock.type = lock_type;	/* taken over only while no lock type is set */
+	to[0] = &ha_lock;
+	return to + 1;					/* the slot behind the one just filled */
+}
+
+/* Note: ha_pbxt::delete_system_table is called instead. */
+int ha_xtsys::delete_table(const char *XT_UNUSED(table_path))
+{
+	/* Should never be called */
+	return 0;	/* if it is called anyway, nothing is deleted and success is reported */
+}
+
+int ha_xtsys::create(const char *XT_UNUSED(name), TABLE *XT_UNUSED(table_arg), HA_CREATE_INFO *XT_UNUSED(create_info))
+{
+	/* Allow the table to be created.
+	 * This is required after a dump is restored.
+	 */
+	return 0;	/* all arguments are ignored: the call only reports success */
+}
+
+bool ha_xtsys::get_error_message(int XT_UNUSED(error), String *buf)
+{
+	XTExceptionRec	e;
+	XTThreadPtr		self = xt_ha_set_current_thread(current_thd, &e);
+	const char		*msg;
+
+	/* No message without a PBXT thread, or when its exception record holds no error. */
+	if (!self || !self->t_exception.e_xt_err)
+		return FALSE;
+
+	/* Copy the message text of the exception record into buf. */
+	msg = self->t_exception.e_err_msg;
+	buf->copy(msg, strlen(msg), system_charset_info);
+	return TRUE;
+}
+
diff --git a/storage/pbxt/src/ha_xtsys.h b/storage/pbxt/src/ha_xtsys.h
new file mode 100644
index 00000000000..16fb4a140ae
--- /dev/null
+++ b/storage/pbxt/src/ha_xtsys.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Paul McCullagh
+ *
+ * 2007-05-20
+ *
+ * H&G2JCtL
+ *
+ * PBXT System Table handler.
+ *
+ */
+#ifndef __HA_XTSYS_H__
+#define __HA_XTSYS_H__
+
+#ifdef DRIZZLED
+#include <drizzled/common.h>
+#include <drizzled/handler_structs.h>
+#include <drizzled/current_session.h>
+#include <drizzled/cursor.h>
+#else
+#include "mysql_priv.h"
+#endif
+
+#include "xt_defs.h"
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+#if MYSQL_VERSION_ID >= 50120
+#define byte uchar
+#endif
+
+class XTOpenSystemTable;
+
+class ha_xtsys: public handler
+{
+	THR_LOCK_DATA		ha_lock;			///< MySQL lock
+	XTOpenSystemTable	*ha_open_tab;		///< Open system table: set by open(), released and reset to NULL by close()
+
+public:
+	ha_xtsys(handlerton *hton, TABLE_SHARE *table_arg);
+	~ha_xtsys() { }
+
+	const char *table_type() const { return "PBXT"; }
+
+	const char *index_type(uint XT_UNUSED(inx)) {
+		return "NONE";
+	}
+
+	const char **bas_ext() const;
+
+	MX_TABLE_TYPES_T table_flags() const {
+		return HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE;
+	}
+
+	MX_ULONG_T index_flags(uint XT_UNUSED(inx), uint XT_UNUSED(part), bool XT_UNUSED(all_parts)) const {
+		return (HA_READ_NEXT | HA_READ_PREV | HA_READ_RANGE | HA_KEYREAD_ONLY);
+	}
+	uint	max_supported_keys()			const { return 512; }
+	uint	max_supported_key_part_length() const { return 1024; }
+
+	int		open(const char *name, int mode, uint test_if_locked);
+	int		close(void);
+	int		rnd_init(bool scan);
+	int		rnd_next(byte *buf);			// returns HA_ERR_END_OF_FILE when the scan is over
+	int		rnd_pos(byte * buf, byte *pos);	// 'pos' is a position in the format written by position()
+	void	position(const byte *record);	// stores a 4-byte position in 'ref'
+	int		info(uint);
+
+	int		external_lock(THD *thd, int lock_type);
+	int		delete_table(const char *from);
+	int		create(const char *name, TABLE *form, HA_CREATE_INFO *create_info);
+
+	THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type);
+	bool get_error_message(int error, String *buf);
+};
+
+#endif
+
diff --git a/storage/pbxt/src/hashtab_xt.cc b/storage/pbxt/src/hashtab_xt.cc
new file mode 100644
index 00000000000..80ba86a5248
--- /dev/null
+++ b/storage/pbxt/src/hashtab_xt.cc
@@ -0,0 +1,264 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-15	Paul McCullagh
+ *
+ */
+
+#include "xt_config.h"
+
+#include <ctype.h>
+
+#include "pthread_xt.h"
+#include "heap_xt.h"
+#include "thread_xt.h"
+#include "hashtab_xt.h"
+
+XTHashTabPtr xt_new_hashtable(XTThreadPtr self, XTHTCompareFunc comp_func, XTHTHashFunc hash_func, XTHTFreeFunc free_func, xtBool with_lock, xtBool with_cond)
+{
+	XTHashTabPtr	ht;
+	xtHashValue		tab_size = 223;	/* fixed number of slots */
+	/* Create an empty table. NOTE(review): the xt_calloc() results below are used unchecked — confirm it cannot return NULL. */
+	ht = (XTHashTabPtr) xt_calloc(self, offsetof(XTHashTabRec, ht_items) + (sizeof(XTHashItemPtr) * tab_size));
+	ht->ht_comp_func = comp_func;
+	ht->ht_hash_func = hash_func;
+	ht->ht_free_func = free_func;
+	ht->ht_tab_size = tab_size;
+
+	if (with_lock || with_cond) {	/* asking for a condition also creates the lock */
+		ht->ht_lock = (xt_mutex_type *) xt_calloc(self, sizeof(xt_mutex_type));
+		try_(a) {
+			xt_init_mutex_with_autoname(self, ht->ht_lock);
+		}
+		catch_(a) {
+			xt_free(self, ht->ht_lock);
+			xt_free(self, ht);
+			throw_();
+		}
+		cont_(a);
+	}
+
+	if (with_cond) {
+		ht->ht_cond = (xt_cond_type *) xt_calloc(self, sizeof(xt_cond_type));
+		try_(b) {
+			xt_init_cond(self, ht->ht_cond);
+		}
+		catch_(b) {
+			xt_free(self, ht->ht_cond);
+			ht->ht_cond = NULL;	/* so that xt_free_hashtable() skips the condition that was never initialized */
+			xt_free_hashtable(self, ht);
+			throw_();
+		}
+		cont_(b);
+	}
+
+	return ht;
+}
+
+void xt_free_hashtable(XTThreadPtr self, XTHashTabPtr ht)
+{
+	xtHashValue		slot;
+	XTHashItemPtr	cur, next;
+
+	/* Release every item (and its data) while holding the table lock, if any. */
+	if (ht->ht_lock)
+		xt_lock_mutex(self, ht->ht_lock);
+	for (slot = 0; slot < ht->ht_tab_size; slot++) {
+		for (cur = ht->ht_items[slot]; cur; cur = next) {
+			if (ht->ht_free_func)
+				(*ht->ht_free_func)(self, cur->hi_data);
+			next = cur->hi_next;
+			xt_free(self, cur);
+		}
+	}
+
+	/* Then destroy the lock, the condition and the table itself. */
+	if (ht->ht_lock) {
+		xt_unlock_mutex(self, ht->ht_lock);
+		xt_free_mutex(ht->ht_lock);
+		xt_free(self, ht->ht_lock);
+	}
+	if (ht->ht_cond) {
+		xt_free_cond(ht->ht_cond);
+		xt_free(self, ht->ht_cond);
+	}
+	xt_free(self, ht);
+}
+
+xtPublic void xt_ht_put(XTThreadPtr self, XTHashTabPtr ht, void *data)
+{
+	XTHashItemPtr	item = NULL;
+	xtHashValue		h;
+	/* Insert 'data' at the head of its hash chain; an equal item already in the table is not looked for. */
+	pushr_(ht->ht_free_func, data);	/* NOTE(review): presumably arranges for 'data' to be freed if the allocation below fails — confirm */
+	h = (*ht->ht_hash_func)(FALSE, data);	/* FALSE: the argument is a stored item, not a key */
+	item = (XTHashItemPtr) xt_malloc(self, sizeof(XTHashItemRec));
+	item->hi_data = data;
+	item->hi_hash = h;
+	item->hi_next = ht->ht_items[h % ht->ht_tab_size];
+	ht->ht_items[h % ht->ht_tab_size] = item;
+	popr_();
+}
+
+xtPublic void *xt_ht_get(XTThreadPtr XT_UNUSED(self), XTHashTabPtr ht, void *key)
+{
+	XTHashItemPtr	cur;
+	xtHashValue		hash_val;
+
+	/*
+	 * Return the data of the first item in the chain of 'key' whose stored
+	 * hash value equals the hash of the key and for which the compare
+	 * function reports a match; NULL if there is no such item.
+	 */
+	hash_val = (*ht->ht_hash_func)(TRUE, key);
+	for (cur = ht->ht_items[hash_val % ht->ht_tab_size]; cur; cur = cur->hi_next) {
+		if (cur->hi_hash == hash_val && (*ht->ht_comp_func)(key, cur->hi_data))
+			return cur->hi_data;
+	}
+
+	/* The chain is exhausted. */
+	return NULL;
+}
+
+xtPublic xtBool xt_ht_del(XTThreadPtr self, XTHashTabPtr ht, void *key)
+{
+	XTHashItemPtr	*link;
+	XTHashItemPtr	cur;
+	xtHashValue		hash_val;
+	void			*data;
+
+	/*
+	 * Remove the first item matching 'key' from the table.
+	 *
+	 * Returns TRUE if an item was removed, FALSE if none matched.
+	 * The data of the removed item is passed to the free function
+	 * of the table, if one is set.
+	 */
+	hash_val = (*ht->ht_hash_func)(TRUE, key);
+
+	/* 'link' always addresses the pointer that leads to 'cur': first the
+	 * slot of the table, later the hi_next field of the previous item.
+	 */
+	link = &ht->ht_items[hash_val % ht->ht_tab_size];
+	while ((cur = *link)) {
+		if (cur->hi_hash == hash_val && (*ht->ht_comp_func)(key, cur->hi_data)) {
+			data = cur->hi_data;
+
+			/* Unlink and free the item before its data is released. */
+			*link = cur->hi_next;
+			xt_free(self, cur);
+			if (ht->ht_free_func)
+				(*ht->ht_free_func)(self, data);
+			return TRUE;
+		}
+		link = &cur->hi_next;
+	}
+
+	return FALSE;
+}
+
+xtPublic xtHashValue xt_ht_hash(char *s)
+{
+	xtHashValue	hash = 0;
+	xtHashValue	top;
+	char		*ch;
+
+	/* Shift-and-add string hash; the top four bits are folded back in. */
+	for (ch = s; *ch; ch++) {
+		hash = (hash << 4) + *ch;
+		top = hash & 0xF0000000;
+		if (top) {
+			hash = hash ^ (top >> 24);
+			hash = hash ^ top;
+		}
+	}
+	return hash;
+}
+
+/*
+ * The case-insensitive version of the hash: every character goes through
+ * tolower() first, so strings that differ only in case hash alike.
+ */
+xtPublic xtHashValue xt_ht_casehash(char *s)
+{
+	register char *p;
+	register xtHashValue h = 0, g;
+
+	for (p = s; *p; p++) {
+		/* tolower() is only defined for EOF and unsigned char values, and
+		 * plain char may be negative: convert before the call.
+		 */
+		h = (h << 4) + tolower((unsigned char) *p);
+		/* Assignment intended here! */
+		if ((g = h & 0xF0000000))
+			h = (h ^ (g >> 24)) ^ g;
+	}
+	return h;
+}
+
+xtPublic xtBool xt_ht_lock(XTThreadPtr self, XTHashTabPtr ht)
+{
+	if (!ht->ht_lock)
+		return TRUE;	/* the table has no lock: nothing to take */
+	return xt_lock_mutex(self, ht->ht_lock);
+}
+
+xtPublic void xt_ht_unlock(XTThreadPtr self, XTHashTabPtr ht)
+{
+	if (ht->ht_lock)	/* a table without a lock is left alone, mirroring xt_ht_lock() */
+		xt_unlock_mutex(self, ht->ht_lock);
+}
+
+xtPublic void xt_ht_wait(XTThreadPtr self, XTHashTabPtr ht)
+{
+	xt_wait_cond(self, ht->ht_cond, ht->ht_lock);	/* no NULL checks: presumably only valid for tables created with with_cond — confirm */
+}
+
+xtPublic void xt_ht_timed_wait(XTThreadPtr self, XTHashTabPtr ht, u_long milli_sec)
+{
+	xt_timed_wait_cond(self, ht->ht_cond, ht->ht_lock, milli_sec);	/* as xt_ht_wait(), with milli_sec passed on as the timeout */
+}
+
+xtPublic void xt_ht_signal(XTThreadPtr self, XTHashTabPtr ht)
+{
+	xt_signal_cond(self, ht->ht_cond);	/* signals the condition that xt_ht_wait() and xt_ht_timed_wait() wait on */
+}
+
+xtPublic void xt_ht_enum(struct XTThread *XT_UNUSED(self), XTHashTabPtr ht, XTHashEnumPtr en)
+{
+	en->he_i = 0;			/* start at the first slot ... */
+	en->he_item = NULL;		/* ... with no item returned yet */
+	en->he_ht = ht;			/* the table xt_ht_next() will walk */
+}
+
+xtPublic void *xt_ht_next(struct XTThread *XT_UNUSED(self), XTHashEnumPtr en)
+{
+	XTHashTabPtr	tab = en->he_ht;
+	/* First try the rest of the current chain, then the following slots. */
+	if (en->he_item) {
+		if ((en->he_item = en->he_item->hi_next))
+			return en->he_item->hi_data;
+		en->he_i++;
+	}
+	for (; en->he_i < tab->ht_tab_size; en->he_i++) {
+		if ((en->he_item = tab->ht_items[en->he_i]))
+			return en->he_item->hi_data;
+	}
+	return NULL;	/* every slot has been visited */
+}
+
diff --git a/storage/pbxt/src/hashtab_xt.h b/storage/pbxt/src/hashtab_xt.h
new file mode 100644
index 00000000000..d6085c4288d
--- /dev/null
+++ b/storage/pbxt/src/hashtab_xt.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-15	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_hashtab_h__
+#define __xt_hashtab_h__
+
+#include "xt_defs.h"
+
+struct XTThread;
+
+#define xtHashValue			u_int
+
+typedef xtBool (*XTHTCompareFunc)(void *key, void *data);				/* returns non-zero if 'key' matches the stored item 'data' */
+typedef xtHashValue (*XTHTHashFunc)(xtBool is_key, void *key_data);	/* is_key is TRUE when key_data is a lookup key, FALSE when it is a stored item */
+typedef void (*XTHTFreeFunc)(struct XTThread *self, void *item);		/* called with the data of an item that leaves the table */
+
+typedef struct XTHashItem {
+	struct XTHashItem		*hi_next;	/* next item in the same slot, NULL at the end of the chain */
+	xtHashValue				hi_hash;	/* full hash value of hi_data (not reduced to a slot number) */
+	void					*hi_data;	/* the stored item */
+} XTHashItemRec, *XTHashItemPtr;
+
+typedef struct XTHashTab {
+	XTHTCompareFunc			ht_comp_func;
+	XTHTHashFunc			ht_hash_func;
+	XTHTFreeFunc			ht_free_func;	/* may be NULL: item data is then never freed by the table */
+	xt_mutex_type			*ht_lock;		/* NULL unless the table was created with with_lock or with_cond */
+	xt_cond_type			*ht_cond;		/* NULL unless the table was created with with_cond */
+
+	xtHashValue				ht_tab_size;	/* number of slots in ht_items */
+	XTHashItemPtr			ht_items[XT_VAR_LENGTH];	/* the slots, allocated together with the structure */
+} XTHashTabRec, *XTHashTabPtr;
+
+typedef struct XTHashEnum {
+	u_int					he_i;		/* slot the enumeration has reached */
+	XTHashItemPtr			he_item;	/* item returned last, NULL before the first xt_ht_next() */
+	XTHashTabPtr			he_ht;		/* the table being enumerated */
+} XTHashEnumRec, *XTHashEnumPtr;
+
+XTHashTabPtr	xt_new_hashtable(struct XTThread *self, XTHTCompareFunc comp_func, XTHTHashFunc hash_func, XTHTFreeFunc free_func, xtBool with_lock, xtBool with_cond);
+void			xt_free_hashtable(struct XTThread *self, XTHashTabPtr ht);
+
+void			xt_ht_put(struct XTThread *self, XTHashTabPtr ht, void *data);
+void			*xt_ht_get(struct XTThread *self, XTHashTabPtr ht, void *key);
+xtBool			xt_ht_del(struct XTThread *self, XTHashTabPtr ht, void *key);
+
+xtHashValue		xt_ht_hash(char *s);
+xtHashValue		xt_ht_casehash(char *s);
+
+xtBool			xt_ht_lock(struct XTThread *self, XTHashTabPtr ht);
+void			xt_ht_unlock(struct XTThread *self, XTHashTabPtr ht);
+void			xt_ht_wait(struct XTThread *self, XTHashTabPtr ht);
+void			xt_ht_timed_wait(struct XTThread *self, XTHashTabPtr ht, u_long milli_sec);
+void			xt_ht_signal(struct XTThread *self, XTHashTabPtr ht);
+
+void			xt_ht_enum(struct XTThread *self, XTHashTabPtr ht, XTHashEnumPtr en);
+void			*xt_ht_next(struct XTThread *self, XTHashEnumPtr en);
+
+#endif
diff --git a/storage/pbxt/src/heap_xt.cc b/storage/pbxt/src/heap_xt.cc
new file mode 100644
index 00000000000..a4e3fec1611
--- /dev/null
+++ b/storage/pbxt/src/heap_xt.cc
@@ -0,0 +1,129 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-10	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include "pthread_xt.h"
+#include "heap_xt.h"
+#include "thread_xt.h"
+
+#ifdef xt_heap_new
+#undef xt_heap_new
+#endif
+
+#ifdef DEBUG_MEMORY
+xtPublic XTHeapPtr xt_mm_heap_new(XTThreadPtr self, size_t size, XTFinalizeFunc finalize, u_int line, c_char *file, xtBool track)
+#else
+xtPublic XTHeapPtr xt_heap_new(XTThreadPtr self, size_t size, XTFinalizeFunc finalize)
+#endif
+{	/* Allocate a heap block of 'size' bytes via xt_calloc()/xt_mm_calloc(); returns it with reference count 1, or NULL if the allocator returned NULL. */
+	volatile XTHeapPtr	hp;
+#ifdef DEBUG_MEMORY
+	hp = (XTHeapPtr) xt_mm_calloc(self, size, line, file);
+#else
+	hp = (XTHeapPtr) xt_calloc(self, size);
+#endif
+	if (!hp)
+		return NULL;
+#ifdef DEBUG_MEMORY
+	hp->h_track = track;	/* Touch the block only after the NULL check above. */
+	if (track)
+		printf("HEAP: +1  1 %s:%d\n", file, (int) line);
+#endif
+	try_(a) {
+		xt_spinlock_init_with_autoname(self, &hp->h_lock);
+	}
+	catch_(a) {
+		xt_free(self, hp);	/* Lock setup failed: give the block back before re-throwing. */
+		throw_();
+	}
+	cont_(a);
+
+	hp->h_ref_count = 1;
+	hp->h_finalize = finalize;
+	hp->h_onrelease = NULL;
+	return hp;
+}
+
+xtPublic void xt_check_heap(XTThreadPtr XT_NDEBUG_UNUSED(self), XTHeapPtr XT_NDEBUG_UNUSED(hp))
+{	/* Debug aid: with DEBUG_MEMORY it calls xt_mm_malloc_size() on the block; otherwise the body is empty. */
+#ifdef DEBUG_MEMORY
+	xt_mm_malloc_size(self, hp);	/* Result is ignored; presumably the call itself validates the block -- confirm in memory_xt. */
+#endif
+}
+
+#ifdef DEBUG_MEMORY
+xtPublic void xt_mm_heap_reference(XTThreadPtr XT_UNUSED(self), XTHeapPtr hp, u_int line, c_char *file)
+#else
+xtPublic void xt_heap_reference(XTThreadPtr, XTHeapPtr hp)
+#endif
+{	/* Add one reference to 'hp'; the counter is incremented while holding the block's spinlock. 'hp' is dereferenced unconditionally. */
+	xt_spinlock_lock(&hp->h_lock);
+#ifdef DEBUG_MEMORY
+	if (hp->h_track)	/* Tracked blocks print the count transition with the caller's file and line. */
+		printf("HEAP: +1 %d->%d %s:%d\n", (int) hp->h_ref_count, (int) hp->h_ref_count+1, file, (int) line);
+#endif
+	hp->h_ref_count++;
+	xt_spinlock_unlock(&hp->h_lock);
+}
+
+xtPublic void xt_heap_release(XTThreadPtr self, XTHeapPtr hp)
+{	/* Drop one reference to 'hp'; when the count reaches zero the block is finalized and freed. */
+	if (!hp)	/* Releasing NULL is a no-op. */
+		return;
+#ifdef DEBUG_MEMORY
+	xt_spinlock_lock(&hp->h_lock);
+	ASSERT(hp->h_ref_count != 0);	/* Catches a release of a block whose count is already zero. */
+	xt_spinlock_unlock(&hp->h_lock);
+#endif
+	xt_spinlock_lock(&hp->h_lock);
+	if (hp->h_onrelease)	/* The release callback runs on every release, before the decrement and with the spinlock held. */
+		(*hp->h_onrelease)(self, hp);
+	if (hp->h_ref_count > 0) {
+#ifdef DEBUG_MEMORY
+	if (hp->h_track)
+		printf("HEAP: -1 %d->%d\n", (int) hp->h_ref_count, (int) hp->h_ref_count-1);
+#endif
+		hp->h_ref_count--;
+		if (hp->h_ref_count == 0) {
+			if (hp->h_finalize)	/* Last reference gone: finalize while still locked, then unlock and free. */
+				(*hp->h_finalize)(self, hp);
+			xt_spinlock_unlock(&hp->h_lock);
+			xt_free(self, hp);
+			return;
+		}
+	}
+	xt_spinlock_unlock(&hp->h_lock);
+}
+
+xtPublic void xt_heap_set_release_callback(XTThreadPtr XT_UNUSED(self), XTHeapPtr hp, XTFinalizeFunc onrelease)
+{	/* Install (or clear, with NULL) the callback that xt_heap_release() invokes on each release. */
+	hp->h_onrelease = onrelease;	/* Stored without taking h_lock. */
+}
+
+xtPublic u_int xt_heap_get_ref_count(struct XTThread *XT_UNUSED(self), XTHeapPtr hp)
+{	/* Return the current reference count of 'hp'. */
+	return hp->h_ref_count;	/* Read without taking h_lock, so the value is only a snapshot. */
+}
+
+
diff --git a/storage/pbxt/src/heap_xt.h b/storage/pbxt/src/heap_xt.h
new file mode 100644
index 00000000000..db7a6909f05
--- /dev/null
+++ b/storage/pbxt/src/heap_xt.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-10	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_heap_h__
+#define __xt_heap_h__
+
+#include "xt_defs.h"
+#include "lock_xt.h"
+#include "memory_xt.h"
+
+struct XTThread;
+
+/*
+ * Heap memory has a reference count, and a lock for shared access.
+ * It also has a finalize routine which is called before the memory is
+ * freed.
+ */
+typedef void (*XTFinalizeFunc)(struct XTThread *self, void *heap_ptr);
+
+typedef struct XTHeap {
+	XTSpinLockRec			h_lock;					/* Prevent concurrent access to the heap memory: */
+	u_int					h_ref_count;			/* So we know when to free (EVERY pointer reference MUST be counted). */
+	XTFinalizeFunc			h_finalize;				/* If non-NULL, call before freeing. */
+	XTFinalizeFunc			h_onrelease;			/* If non-NULL, call on release. */
+#ifdef DEBUG
+	xtBool					h_track;				/* If set, reference count changes are printed. NOTE(review): heap_xt.cc reads this field under DEBUG_MEMORY while it is declared under DEBUG -- confirm DEBUG_MEMORY implies DEBUG. */
+#endif
+} XTHeapRec, *XTHeapPtr;
+
+/* Returns with reference count = 1 */
+XTHeapPtr	xt_heap_new(struct XTThread *self, size_t size, XTFinalizeFunc finalize);
+XTHeapPtr	xt_mm_heap_new(struct XTThread *self, size_t size, XTFinalizeFunc finalize, u_int line, c_char *file, xtBool track);
+
+void		xt_heap_set_release_callback(struct XTThread *self, XTHeapPtr mem, XTFinalizeFunc onrelease);
+
+void		xt_heap_reference(struct XTThread *self, XTHeapPtr mem);
+void		xt_mm_heap_reference(struct XTThread *self, XTHeapPtr hp, u_int line, c_char *file);
+
+void		xt_heap_release(struct XTThread *self, XTHeapPtr mem);
+u_int		xt_heap_get_ref_count(struct XTThread *self, XTHeapPtr mem);
+
+void		xt_check_heap(struct XTThread *self, XTHeapPtr mem);
+
+#ifdef DEBUG_MEMORY
+#define xt_heap_new(t, s, f)		xt_mm_heap_new(t, s, f, __LINE__, __FILE__, FALSE)
+#define xt_heap_new_track(t, s, f)	xt_mm_heap_new(t, s, f, __LINE__, __FILE__, TRUE)
+#define xt_heap_reference(t, s)		xt_mm_heap_reference(t, s, __LINE__, __FILE__)
+#endif
+
+#endif
diff --git a/storage/pbxt/src/index_xt.cc b/storage/pbxt/src/index_xt.cc
new file mode 100644
index 00000000000..f6c4b4d8aa3
--- /dev/null
+++ b/storage/pbxt/src/index_xt.cc
@@ -0,0 +1,4694 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-09-30	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <string.h>
+#include <stdio.h>
+#include <stddef.h>
+#ifndef XT_WIN
+#include <strings.h>
+#endif
+
+#ifdef DRIZZLED
+#include <drizzled/base.h>
+#else
+#include "mysql_priv.h"
+#endif
+
+#include "pthread_xt.h"
+#include "memory_xt.h"
+#include "index_xt.h"
+#include "heap_xt.h"
+#include "database_xt.h"
+#include "strutil_xt.h"
+#include "cache_xt.h"
+#include "myxt_xt.h"
+#include "trace_xt.h"
+#include "table_xt.h"
+
+#ifdef DEBUG
+#define MAX_SEARCH_DEPTH			32
+//#define CHECK_AND_PRINT
+//#define CHECK_NODE_REFERENCE
+//#define TRACE_FLUSH
+#define CHECK_PRINTS_RECORD_REFERENCES
+#else
+#define MAX_SEARCH_DEPTH			100
+#endif
+
+#define IND_FLUSH_BUFFER_SIZE		200
+
+typedef struct IdxStackItem {
+	XTIdxItemRec			i_pos;	/* Item position saved with the branch; idx_push() only fills it when a position is supplied. */
+	xtIndexNodeID			i_branch;	/* Node ID of the branch pushed on the stack. */
+} IdxStackItemRec, *IdxStackItemPtr;
+
+typedef struct IdxBranchStack {
+	int						s_top;	/* Number of elements in use; 0 means the stack is empty. */
+	IdxStackItemRec			s_elements[MAX_SEARCH_DEPTH];	/* Fixed-capacity storage; idx_push() fails once s_top reaches MAX_SEARCH_DEPTH. */
+} IdxBranchStackRec, *IdxBranchStackPtr;
+
+#ifdef DEBUG
+#ifdef TEST_CODE
+static void idx_check_on_key(XTOpenTablePtr ot);
+#endif
+static u_int idx_check_index(XTOpenTablePtr ot, XTIndexPtr ind, xtBool with_lock);
+#endif
+
+static xtBool idx_insert_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIdxKeyValuePtr key_value, xtIndexNodeID branch);
+static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value);
+
+#ifdef XT_TRACK_INDEX_UPDATES
+
+static xtBool ind_track_write(struct XTOpenTable *ot, struct XTIndex *ind, xtIndexNodeID offset, size_t size, xtWord1 *data)
+{	/* Counting wrapper used as XT_IND_WRITE when XT_TRACK_INDEX_UPDATES is defined: bumps a counter, then forwards to xt_ind_write(). */
+	ot->ot_ind_reads++;	/* NOTE(review): a *read* counter is incremented on the write path -- confirm this is intended. */
+	return xt_ind_write(ot, ind, offset, size, data);
+}
+
+#define XT_IND_WRITE					ind_track_write
+
+#else
+
+#define XT_IND_WRITE					xt_ind_write
+
+#endif
+
+
+#ifdef CHECK_NODE_REFERENCE
+#define IDX_GET_NODE_REF(t, x, o)		idx_get_node_ref(t, x, o)
+#else
+#define IDX_GET_NODE_REF(t, x, o)		XT_GET_NODE_REF(t, (x) - (o))
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * DEBUG ACTIVITY
+ */
+
+//#define TRACK_ACTIVITY
+
+#ifdef TRACK_ACTIVITY
+#define TRACK_MAX_BLOCKS			2000
+
+typedef struct TrackBlock {
+	xtWord1				exists;	/* Set to TRUE by track_block_exists(), cleared by track_reset_missing(). */
+	char				*activity;	/* Accumulated activity text appended by track_work(); NULL until first use. */
+} TrackBlockRec, *TrackBlockPtr;
+
+TrackBlockRec		blocks[TRACK_MAX_BLOCKS];
+
+xtPublic void track_work(u_int block, char *what)
+{	/* Append the text 'what' to the activity log of the 1-based block number 'block'. */
+	int len = 0, len2;
+	ASSERT_NS(block > 0 && block <= TRACK_MAX_BLOCKS);
+	block--;
+	if (blocks[block].activity)
+		len = strlen(blocks[block].activity);
+	len2 = strlen(what);
+	if (!xt_realloc_ns((void **) &blocks[block].activity, len + len2 + 1))
+		return;	/* Could not grow the buffer: skip the append instead of copying through a bad pointer. */
+	memcpy(blocks[block].activity + len, what, len2 + 1);
+}
+
+static void track_block_exists(xtIndexNodeID block)
+{	/* Mark the given block as existing; IDs outside 1..TRACK_MAX_BLOCKS are silently ignored. */
+	if (XT_NODE_ID(block) > 0 && XT_NODE_ID(block) <= TRACK_MAX_BLOCKS)
+		blocks[XT_NODE_ID(block)-1].exists = TRUE;	/* Block IDs are 1-based, the array is 0-based. */
+}
+
+static void track_reset_missing()
+{	/* Clear the 'exists' flag of every tracked block; the activity text is left untouched. */
+	for (u_int i=0; i<TRACK_MAX_BLOCKS; i++)
+		blocks[i].exists = FALSE;
+}
+
+static void track_dump_missing(xtIndexNodeID eof_block)
+{	/* Print every tracked block below eof_block that is not marked as existing. */
+	for (u_int i=0; i<TRACK_MAX_BLOCKS && i+1<XT_NODE_ID(eof_block); i++) {	/* Never index past blocks[]; i+1 avoids wrap-around when the EOF id is 0. */
+		if (!blocks[i].exists)
+			printf("block missing = %04d %s\n", i+1, blocks[i].activity);
+	}
+}
+
+static void track_dump_all(u_int max_block)
+{	/* Print the activity text of blocks 1..max_block; blocks not marked as existing are prefixed with '-'. */
+	for (u_int i=0; i<max_block; i++) {	/* NOTE(review): max_block is not checked against TRACK_MAX_BLOCKS here -- callers must keep it in range. */
+		if (blocks[i].exists)
+			printf(" %04d %s\n", i+1, blocks[i].activity);
+		else
+			printf("-%04d %s\n", i+1, blocks[i].activity);
+	}
+}
+
+#endif
+
+xtPublic void xt_ind_track_dump_block(XTTableHPtr XT_UNUSED(tab), xtIndexNodeID XT_UNUSED(address))
+{	/* Print the recorded activity of one block; the body is empty unless TRACK_ACTIVITY is defined. */
+#ifdef TRACK_ACTIVITY
+	u_int i = XT_NODE_ID(address)-1;	/* Convert the 1-based node ID to a 0-based index (no range check is made). */
+
+	printf("BLOCK %04d %s\n", i+1, blocks[i].activity);
+#endif
+}
+
+#ifdef CHECK_NODE_REFERENCE
+static xtIndexNodeID idx_get_node_ref(XTTableHPtr tab, xtWord1 *ref, u_int node_ref_size)
+{	/* Checked replacement for XT_GET_NODE_REF (used when CHECK_NODE_REFERENCE is defined): reads the node reference stored just before 'ref'. */
+	xtIndexNodeID node;
+
+	/* Node is invalid by default: */
+	XT_NODE_ID(node) = 0xFFFFEEEE;
+	if (node_ref_size) {
+		ref -= node_ref_size;	/* The reference sits immediately to the left of the item. */
+		node = XT_RET_NODE_ID(XT_GET_DISK_4(ref));
+		if (node >= tab->tab_ind_eof) {	/* A reference at or beyond the index EOF is reported as corruption... */
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
+		}
+	}
+	return node;	/* ...but the value read is still returned to the caller. */
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Stack functions
+ */
+
+static void idx_newstack(IdxBranchStackPtr stack)
+{	/* Reset the branch stack to empty; the element storage itself is not cleared. */
+	stack->s_top = 0;
+}
+
+static xtBool idx_push(IdxBranchStackPtr stack, xtIndexNodeID n, XTIdxItemPtr pos)
+{	/* Push branch 'n' (and, if given, its item position) onto the stack; FAILED when the stack is full. */
+	IdxStackItemPtr slot;
+	if (stack->s_top == MAX_SEARCH_DEPTH) {
+		xt_register_error(XT_REG_CONTEXT, XT_ERR_STACK_OVERFLOW, 0, "Index node stack overflow");
+		return FAILED;
+	}
+	slot = &stack->s_elements[stack->s_top++];	/* Claim the next free slot. */
+	slot->i_branch = n;
+	if (pos) slot->i_pos = *pos;	/* Position is optional; the slot keeps its old i_pos when none is given. */
+	return OK;
+}
+
+static IdxStackItemPtr idx_pop(IdxBranchStackPtr stack)
+{
+	/* Empty stack: nothing to hand back. */
+	if (!stack->s_top)
+		return NULL;
+	return &stack->s_elements[--stack->s_top];	/* Shrink first, then return the slot that was on top. */
+}
+
+static IdxStackItemPtr idx_top(IdxBranchStackPtr stack)
+{
+	/* Peek at the top element without removing it; NULL when the stack is empty. */
+	int depth = stack->s_top;
+	return depth == 0 ? NULL : &stack->s_elements[depth - 1];
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Allocation of nodes
+ */
+
+static xtBool idx_new_branch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID *address)
+{	/* Obtain a block ID for a new branch and store it in *address. Sources are tried in order: the index's private free list, the table's cached free lists, the on-disk free chain, then the end of the index file. */
+	register XTTableHPtr	tab;
+	xtIndexNodeID			wrote_pos;
+	XTIndFreeBlockRec		free_block;
+	XTIndFreeListPtr		list_ptr;
+
+	tab = ot->ot_table;
+
+	//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
+	if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {	/* 1. Private list of the index: taken without tab_ind_lock. */
+		ind->mi_free_list->fl_free_count--;
+		*address = ind->mi_free_list->fl_page_id[ind->mi_free_list->fl_free_count];	/* Take the last entry of the list. */
+		TRACK_BLOCK_ALLOC(*address);
+		return OK;
+	}
+
+	xt_lock_mutex_ns(&tab->tab_ind_lock);	/* Everything below works on table-wide state and holds tab_ind_lock. */
+
+	/* Check the cached free list: */
+	while ((list_ptr = tab->tab_ind_free_list)) {
+		if (list_ptr->fl_start < list_ptr->fl_free_count) {
+			wrote_pos = list_ptr->fl_page_id[list_ptr->fl_start];
+			list_ptr->fl_start++;
+			xt_unlock_mutex_ns(&tab->tab_ind_lock);
+			*address = wrote_pos;
+			TRACK_BLOCK_ALLOC(wrote_pos);
+			return OK;
+		}
+		tab->tab_ind_free_list = list_ptr->fl_next_list;	/* This list is used up: unlink it and free it. */
+		xt_free_ns(list_ptr);
+	}
+
+	if ((XT_NODE_ID(wrote_pos) = XT_NODE_ID(tab->tab_ind_free))) {	/* Assignment intended: non-zero means the on-disk free chain has a head block. */
+		xtIndexNodeID next_node;
+
+		/* Use the block on the free list: */
+		if (!xt_ind_read_bytes(ot, NULL, wrote_pos, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block))
+			goto failed;
+		XT_NODE_ID(next_node) = (xtIndexNodeID) XT_GET_DISK_8(free_block.if_next_block_8);
+		if (XT_NODE_ID(next_node) >= XT_NODE_ID(tab->tab_ind_eof)) {	/* A next pointer at or past EOF means the chain is corrupt. */
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
+			goto failed;
+		}
+		XT_NODE_ID(tab->tab_ind_free) = XT_NODE_ID(next_node);
+		xt_unlock_mutex_ns(&tab->tab_ind_lock);
+		*address = wrote_pos;
+		TRACK_BLOCK_ALLOC(wrote_pos);
+		return OK;
+	}
+
+	/* PMC - Don't allow overflow! */
+	if (XT_NODE_ID(tab->tab_ind_eof) >= 0xFFFFFFF) {
+		xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_FILE_TO_LARGE, xt_file_path(ot->ot_ind_file));
+		goto failed;
+	}
+	*address = tab->tab_ind_eof;	/* 4. Nothing reusable: hand out the block at EOF and advance EOF by one. */
+	XT_NODE_ID(tab->tab_ind_eof)++;
+	xt_unlock_mutex_ns(&tab->tab_ind_lock);
+	TRACK_BLOCK_ALLOC(*address);
+	return OK;
+
+	failed:
+	xt_unlock_mutex_ns(&tab->tab_ind_lock);	/* Common error exit: only reached with tab_ind_lock held. */
+	return FAILED;
+}
+
+/* Add the block to the private free list of the index.
+ * On flush, this list will be transferred to the global list.
+ */
+static xtBool idx_free_branch(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID node_id)
+{	/* Insert node_id into the index's private free list, keeping fl_page_id sorted by node ID, then mark the cache page clean. */
+	register u_int		count;
+	register u_int		i;
+	register u_int		guess;
+
+	TRACK_BLOCK_FREE(node_id);
+	//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
+	if (!ind->mi_free_list) {
+		count = 0;
+		if (!(ind->mi_free_list = (XTIndFreeListPtr) xt_calloc_ns(offsetof(XTIndFreeListRec, fl_page_id) + 10 * sizeof(xtIndexNodeID))))	/* First use: room for 10 page IDs. */
+			return FAILED;
+	}
+	else {
+		count = ind->mi_free_list->fl_free_count;
+		if (!xt_realloc_ns((void **) &ind->mi_free_list, offsetof(XTIndFreeListRec, fl_page_id) + (count + 1) * sizeof(xtIndexNodeID)))	/* Resize to hold exactly one more ID than is stored now. */
+			return FAILED;
+	}
+ 
+	i = 0;
+	while (i < count) {	/* Binary search over [i, count) for the insert position. */
+		guess = (i + count - 1) >> 1;
+		if (XT_NODE_ID(node_id) == XT_NODE_ID(ind->mi_free_list->fl_page_id[guess])) {
+			// Should not happen...
+			ASSERT_NS(FALSE);
+			return OK;	/* Already on the list: nothing is inserted. */
+		}
+		if (XT_NODE_ID(node_id) < XT_NODE_ID(ind->mi_free_list->fl_page_id[guess]))
+			count = guess;
+		else
+			i = guess + 1;
+	}
+
+	/* Insert at position i */
+	memmove(ind->mi_free_list->fl_page_id + i + 1, ind->mi_free_list->fl_page_id + i, (ind->mi_free_list->fl_free_count - i) * sizeof(xtIndexNodeID));	/* 'count' was consumed by the search, so re-read fl_free_count here. */
+	ind->mi_free_list->fl_page_id[i] = node_id;
+	ind->mi_free_list->fl_free_count++;
+
+	/* Set the cache page to clean: */
+	return xt_ind_clean(ot, ind, node_id);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Simple compare functions
+ */
+
+xtPublic int xt_compare_2_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
+{	/* Compare keys of one or two 4-byte signed integers; returns -1, 0 or 1. Explicit comparison avoids the signed overflow a subtraction can produce. */
+	xtInt4 a = (xtInt4) XT_GET_DISK_4(key_value);
+	xtInt4 b = (xtInt4) XT_GET_DISK_4(b_value);
+	ASSERT_NS(key_length == 4 || key_length == 8);
+	if (a == b && key_length > 4) {	/* First component ties: the second one decides. */
+		key_value += 4;
+		b_value += 4;
+		a = (xtInt4) XT_GET_DISK_4(key_value);
+		b = (xtInt4) XT_GET_DISK_4(b_value);
+	}
+	return (a < b) ? -1 : (a == b ? 0 : 1);
+}
+
+xtPublic int xt_compare_3_int4(XTIndexPtr XT_UNUSED(ind), uint key_length, xtWord1 *key_value, xtWord1 *b_value)
+{
+	/* Compare keys made of up to three 4-byte signed integers, component by
+	 * component. Returns -1, 0 or 1. Components are compared explicitly
+	 * rather than subtracted, because the difference of two signed 32-bit
+	 * values can overflow and yield the wrong sign. */
+	xtInt4	a, b;
+	uint	compared = 4;	/* Bytes of the key covered once the current component has been compared. */
+
+	ASSERT_NS(key_length == 4 || key_length == 8 || key_length == 12);
+	for (;; compared += 4, key_value += 4, b_value += 4) {
+		a = (xtInt4) XT_GET_DISK_4(key_value);
+		b = (xtInt4) XT_GET_DISK_4(b_value);
+		if (a != b || key_length <= compared || compared == 12)
+			break;
+	}
+	return (a < b) ? -1 : (a == b ? 0 : 1);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Tree branch scanning (searching nodes and leaves)
+ */
+
+xtPublic void xt_scan_branch_single(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+{	/* Binary-search one branch whose key is a single 4-byte integer (mi_single_type LONG_INT or ULONG_INT) and describe the resulting position in 'result'. */
+	XT_NODE_TEMP;
+	u_int				branch_size;
+	u_int				node_ref_size;
+	u_int				full_item_size;
+	int					search_flags;
+	register xtWord1	*base;
+	register u_int		i;
+	register xtWord1	*bitem;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;	/* Only nodes carry child references; leaves have none. */
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
+	full_item_size = result->sr_item.i_item_size + node_ref_size;	/* Stride from one item to the next. */
+	result->sr_item.i_node_ref_size = node_ref_size;
+
+	search_flags = value->sv_flags;
+	base = branch->tb_data + node_ref_size;	/* First item starts after the leading node reference (if any). */
+	if (search_flags & XT_SEARCH_FIRST_FLAG)
+		i = 0;
+	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
+		i = (result->sr_item.i_total_size - node_ref_size) / full_item_size;	/* Index one past the last item. */
+	else {
+		register u_int		guess;
+		register u_int		count;
+		register xtInt4		r;
+		xtRecordID			key_record;
+
+		key_record = value->sv_rec_id;
+		count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
+
+		ASSERT_NS(ind);
+		i = 0;
+		while (i < count) {
+			guess = (i + count - 1) >> 1;	/* Probe the middle of the remaining range [i, count). */
+
+			bitem = base + guess * full_item_size;
+
+			switch (ind->mi_single_type) {
+				case HA_KEYTYPE_LONG_INT: {
+					register xtInt4 a, b;
+					
+					a = XT_GET_DISK_4(value->sv_key);
+					b = XT_GET_DISK_4(bitem);
+					r = (a < b) ? -1 : (a == b ? 0 : 1);
+					break;
+				}
+				case HA_KEYTYPE_ULONG_INT: {
+					register xtWord4 a, b;
+					
+					a = XT_GET_DISK_4(value->sv_key);
+					b = XT_GET_DISK_4(bitem);
+					r = (a < b) ? -1 : (a == b ? 0 : 1);
+					break;
+				}
+				default:
+					/* Should not happen: */
+					r = 1;
+					break;
+			}
+			if (r == 0) {
+				if (search_flags & XT_SEARCH_WHOLE_KEY) {
+					xtRecordID	item_record;
+					xtRowID		row_id;
+					
+					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);
+
+					/* This should not happen because we should never
+					 * try to insert the same record twice into the 
+					 * index!
+					 */
+					result->sr_duplicate = TRUE;
+					if (key_record == item_record) {
+						result->sr_found = TRUE;
+						result->sr_rec_id = item_record;
+						result->sr_row_id = row_id;
+						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
+						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
+						return;
+					}
+					if (key_record < item_record)	/* Equal key values: order by record ID instead. */
+						r = -1;
+					else
+						r = 1;
+				}
+				else {
+					result->sr_found = TRUE;
+					/* -1 causes a search to the beginning of the duplicate list of keys.
+					 * 1 causes a search to just after the key.
+				 	*/
+					if (search_flags & XT_SEARCH_AFTER_KEY)
+						r = 1;
+					else
+						r = -1;
+				}
+			}
+
+			if (r < 0)
+				count = guess;
+			else
+				i = guess + 1;
+		}
+	}
+
+	bitem = base + i * full_item_size;
+	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
+	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
+	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
+}
+
+/*
+ * We use a special binary search here. It basically assumes that the values
+ * in the index are not unique.
+ *
+ * Even if they are unique, when we search for part of a key, then it is
+ * effectively the case.
+ *
+ * So in the situation where we find duplicates in the index we usually
+ * want to position ourselves at the beginning of the duplicate list.
+ *
+ * Alternatively a search can find the position just after a given key.
+ *
+ * To achieve this we make the following modifications:
+ * - The comparison always returns 1 or -1. We only stop
+ *   the search early in the case of an exact match when inserting (but this
+ *   should not happen anyway).
+ * - The search never actually fails, but sets 'found' to TRUE if it
+ *   sees the search key in the index.
+ *
+ * If the search value exists in the index we know that
+ * this method will take us to the first occurrence of the key in the
+ * index (in the case of -1) or to the first value after
+ * the search key in the case of 1.
+ */
+xtPublic void xt_scan_branch_fix(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+{	/* Binary-search one branch with fixed-size keys, comparing with myxt_compare_key(), and describe the resulting position in 'result'. */
+	XT_NODE_TEMP;
+	u_int				branch_size;
+	u_int				node_ref_size;
+	u_int				full_item_size;
+	int					search_flags;
+	xtWord1				*base;
+	register u_int		i;
+	xtWord1				*bitem;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;	/* Only nodes carry child references; leaves have none. */
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
+	full_item_size = result->sr_item.i_item_size + node_ref_size;	/* Stride from one item to the next. */
+	result->sr_item.i_node_ref_size = node_ref_size;
+
+	search_flags = value->sv_flags;
+	base = branch->tb_data + node_ref_size;	/* First item starts after the leading node reference (if any). */
+	if (search_flags & XT_SEARCH_FIRST_FLAG)
+		i = 0;
+	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
+		i = (result->sr_item.i_total_size - node_ref_size) / full_item_size;	/* Index one past the last item. */
+	else {
+		register u_int		guess;
+		register u_int		count;
+		xtRecordID			key_record;
+		int					r;
+
+		key_record = value->sv_rec_id;
+		count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
+
+		ASSERT_NS(ind);
+		i = 0;
+		while (i < count) {
+			guess = (i + count - 1) >> 1;	/* Probe the middle of the remaining range [i, count). */
+
+			bitem = base + guess * full_item_size;
+
+			r = myxt_compare_key(ind, search_flags, value->sv_length, value->sv_key, bitem);
+
+			if (r == 0) {
+				if (search_flags & XT_SEARCH_WHOLE_KEY) {
+					xtRecordID	item_record;
+					xtRowID		row_id;
+
+					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);
+
+					/* This should not happen because we should never
+					 * try to insert the same record twice into the 
+					 * index!
+					 */
+					result->sr_duplicate = TRUE;
+					if (key_record == item_record) {
+						result->sr_found = TRUE;
+						result->sr_rec_id = item_record;
+						result->sr_row_id = row_id;
+						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
+						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
+						return;
+					}
+					if (key_record < item_record)	/* Equal key values: order by record ID instead. */
+						r = -1;
+					else
+						r = 1;
+				}
+				else {
+					result->sr_found = TRUE;
+					/* -1 causes a search to the beginning of the duplicate list of keys.
+					 * 1 causes a search to just after the key.
+				 	*/
+					if (search_flags & XT_SEARCH_AFTER_KEY)
+						r = 1;
+					else
+						r = -1;
+				}
+			}
+
+			if (r < 0)
+				count = guess;
+			else
+				i = guess + 1;
+		}
+	}
+
+	bitem = base + i * full_item_size;
+	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
+	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
+	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
+}
+
+xtPublic void xt_scan_branch_fix_simple(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+{	/* Same search as xt_scan_branch_fix(), but keys are compared through the index's mi_simple_comp_key callback. */
+	XT_NODE_TEMP;
+	u_int				branch_size;
+	u_int				node_ref_size;
+	u_int				full_item_size;
+	int					search_flags;
+	xtWord1				*base;
+	register u_int		i;
+	xtWord1				*bitem;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;	/* Only nodes carry child references; leaves have none. */
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	result->sr_item.i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
+	full_item_size = result->sr_item.i_item_size + node_ref_size;	/* Stride from one item to the next. */
+	result->sr_item.i_node_ref_size = node_ref_size;
+
+	search_flags = value->sv_flags;
+	base = branch->tb_data + node_ref_size;	/* First item starts after the leading node reference (if any). */
+	if (search_flags & XT_SEARCH_FIRST_FLAG)
+		i = 0;
+	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG)
+		i = (result->sr_item.i_total_size - node_ref_size) / full_item_size;	/* Index one past the last item. */
+	else {
+		register u_int		guess;
+		register u_int		count;
+		xtRecordID			key_record;
+		int					r;
+
+		key_record = value->sv_rec_id;
+		count = (result->sr_item.i_total_size - node_ref_size) / full_item_size;
+
+		ASSERT_NS(ind);
+		i = 0;
+		while (i < count) {
+			guess = (i + count - 1) >> 1;	/* Probe the middle of the remaining range [i, count). */
+
+			bitem = base + guess * full_item_size;
+
+			r = ind->mi_simple_comp_key(ind, value->sv_length, value->sv_key, bitem);	/* Only the sign of r (and r == 0) is used below. */
+
+			if (r == 0) {
+				if (search_flags & XT_SEARCH_WHOLE_KEY) {
+					xtRecordID	item_record;
+					xtRowID		row_id;
+
+					xt_get_record_ref(bitem + ind->mi_key_size, &item_record, &row_id);
+
+					/* This should not happen because we should never
+					 * try to insert the same record twice into the 
+					 * index!
+					 */
+					result->sr_duplicate = TRUE;
+					if (key_record == item_record) {
+						result->sr_found = TRUE;
+						result->sr_rec_id = item_record;
+						result->sr_row_id = row_id;
+						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
+						result->sr_item.i_item_offset = node_ref_size + guess * full_item_size;
+						return;
+					}
+					if (key_record < item_record)	/* Equal key values: order by record ID instead. */
+						r = -1;
+					else
+						r = 1;
+				}
+				else {
+					result->sr_found = TRUE;
+					/* -1 causes a search to the beginning of the duplicate list of keys.
+					 * 1 causes a search to just after the key.
+				 	*/
+					if (search_flags & XT_SEARCH_AFTER_KEY)
+						r = 1;
+					else
+						r = -1;
+				}
+			}
+
+			if (r < 0)
+				count = guess;
+			else
+				i = guess + 1;
+		}
+	}
+
+	bitem = base + i * full_item_size;
+	xt_get_res_record_ref(bitem + ind->mi_key_size, result);
+	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
+	result->sr_item.i_item_offset = node_ref_size + i * full_item_size;
+}
+
+/*
+ * Variable length key values are stored as a sorted list. Since each list item has a variable length, we
+ * must scan the list sequentially in order to find a key.
+ */
+xtPublic void xt_scan_branch_var(struct XTTable *XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result)
+{	/* Sequentially scan one branch with variable-length keys and describe the resulting position in 'result'. */
+	XT_NODE_TEMP;
+	u_int			branch_size;
+	u_int			node_ref_size;
+	int				search_flags;
+	xtWord1			*base;
+	xtWord1			*bitem;
+	u_int			ilen;
+	xtWord1			*bend;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	node_ref_size = XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0;	/* Only nodes carry child references; leaves have none. */
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	result->sr_item.i_node_ref_size = node_ref_size;
+
+	search_flags = value->sv_flags;
+	base = branch->tb_data + node_ref_size;
+	bitem = base;
+	bend = &branch->tb_data[result->sr_item.i_total_size];	/* One past the last byte of branch data. */
+	ilen = 0;
+	if (bitem >= bend)	/* Branch holds no items: report the position with a key length of 0. */
+		goto done_ok;
+
+	if (search_flags & XT_SEARCH_FIRST_FLAG)
+		ilen = myxt_get_key_length(ind, bitem);
+	else if (search_flags & XT_SEARCH_AFTER_LAST_FLAG) {
+		bitem = bend;
+		ilen = 0;
+	}
+	else {
+		xtRecordID	key_record;
+		int			r;
+
+		key_record = value->sv_rec_id;
+
+		ASSERT_NS(ind);
+		while (bitem < bend) {
+			ilen = myxt_get_key_length(ind, bitem);	/* Key length must be read per item because it varies. */
+			r = myxt_compare_key(ind, search_flags, value->sv_length, value->sv_key, bitem);
+			if (r == 0) {
+				if (search_flags & XT_SEARCH_WHOLE_KEY) {
+					xtRecordID	item_record;
+					xtRowID		row_id;
+
+					xt_get_record_ref(bitem + ilen, &item_record, &row_id);
+
+					/* This should not happen because we should never
+					 * try to insert the same record twice into the 
+					 * index!
+					 */
+					result->sr_duplicate = TRUE;
+					if (key_record == item_record) {
+						result->sr_found = TRUE;
+						result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
+						result->sr_rec_id = item_record;
+						result->sr_row_id = row_id;
+						result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);
+						result->sr_item.i_item_offset = bitem - branch->tb_data;
+						return;
+					}
+					if (key_record < item_record)	/* Equal key values: order by record ID instead. */
+						r = -1;
+					else
+						r = 1;
+				}
+				else {
+					result->sr_found = TRUE;
+					/* -1 causes a search to the beginning of the duplicate list of keys.
+					 * 1 causes a search to just after the key.
+				 	*/
+					if (search_flags & XT_SEARCH_AFTER_KEY)
+						r = 1;
+					else
+						r = -1;
+				}
+			}
+			if (r <= 0)	/* Stop at the first item that is not smaller than the search key. */
+				break;
+			bitem += ilen + XT_RECORD_REF_SIZE + node_ref_size;
+		}
+	}
+
+	done_ok:
+	result->sr_item.i_item_size = ilen + XT_RECORD_REF_SIZE;
+	xt_get_res_record_ref(bitem + ilen, result);
+	result->sr_branch = IDX_GET_NODE_REF(tab, bitem, node_ref_size);			/* Only valid if this is a node. */
+	result->sr_item.i_item_offset = bitem - branch->tb_data;
+}
+
+/* Go to the next item in the node. */
+static void idx_next_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+{	/* Advance 'result' from its current item to the following one within the same branch. */
+	XT_NODE_TEMP;
+	xtWord1	*bitem;
+	u_int	ilen;
+
+	result->sr_item.i_item_offset += result->sr_item.i_item_size + result->sr_item.i_node_ref_size;	/* Skip the current item and the node reference that follows it. */
+	bitem = branch->tb_data + result->sr_item.i_item_offset;
+	if (result->sr_item.i_item_offset < result->sr_item.i_total_size) {
+		if (ind->mi_fix_key)
+			ilen = result->sr_item.i_item_size;	/* Fixed keys: every item has the same size. */
+		else {
+			ilen = myxt_get_key_length(ind, bitem) + XT_RECORD_REF_SIZE;
+			result->sr_item.i_item_size = ilen;
+		}
+		xt_get_res_record_ref(bitem + ilen - XT_RECORD_REF_SIZE, result); /* (Only valid if i_item_offset < i_total_size) */
+	}
+	else {	/* Moved past the last item: clear the item size and record references. */
+		result->sr_item.i_item_size = 0;
+		result->sr_rec_id = 0;
+		result->sr_row_id = 0;
+	}
+	if (result->sr_item.i_node_ref_size)
+		/* IDX_GET_NODE_REF() loads the branch reference to the LEFT of the item. */
+		result->sr_branch = IDX_GET_NODE_REF(tab, bitem, result->sr_item.i_node_ref_size);
+	else
+		result->sr_branch = 0;
+}
+
+xtPublic void xt_prev_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+{	/* Step 'result' back by one item in a branch with fixed-size items. */
+	XT_NODE_TEMP;
+	ASSERT_NS(result->sr_item.i_item_offset >= result->sr_item.i_item_size + result->sr_item.i_node_ref_size + result->sr_item.i_node_ref_size);	/* Node ref size counted twice: the stride plus the reference in front of the first item. */
+	result->sr_item.i_item_offset -= (result->sr_item.i_item_size + result->sr_item.i_node_ref_size);
+	xt_get_res_record_ref(branch->tb_data + result->sr_item.i_item_offset + result->sr_item.i_item_size - XT_RECORD_REF_SIZE, result); /* (Only valid if i_item_offset < i_total_size) */
+	result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + result->sr_item.i_item_offset, result->sr_item.i_node_ref_size);
+}
+
+/*
+ * Move the cursor in 'result' back by one item in a branch with variable
+ * length keys. Items can only be walked forwards, so the branch is scanned
+ * from the first item until the item that ends at the current position.
+ */
+xtPublic void xt_prev_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result)
+{
+	XT_NODE_TEMP;
+	XTIdxItemPtr	pos = &result->sr_item;
+	xtWord1			*cur;
+	xtWord1			*limit;
+	u_int			klen;
+
+	cur = &branch->tb_data[pos->i_node_ref_size];
+	limit = branch->tb_data + pos->i_item_offset;
+
+	/* Walk forward while the following item still starts before the limit: */
+	klen = myxt_get_key_length(ind, cur);
+	while (cur + klen + XT_RECORD_REF_SIZE + pos->i_node_ref_size < limit) {
+		cur += klen + XT_RECORD_REF_SIZE + pos->i_node_ref_size;
+		klen = myxt_get_key_length(ind, cur);
+	}
+
+	pos->i_item_size = klen + XT_RECORD_REF_SIZE;
+	/* (Only valid if i_item_offset < i_total_size) */
+	xt_get_res_record_ref(cur + klen, result);
+	result->sr_branch = IDX_GET_NODE_REF(tab, cur, pos->i_node_ref_size);
+	pos->i_item_offset = cur - branch->tb_data;
+}
+
+/*
+ * Refresh the cursor in 'result' from the current content of 'branch'
+ * (fixed length keys): the total size is re-read from the branch header,
+ * the item offset is clamped to it, and the record reference at the
+ * cursor is reloaded.
+ */
+static void idx_reload_item_fix(XTIndexPtr XT_NDEBUG_UNUSED(ind), XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+{
+	XTIdxItemPtr	pos = &result->sr_item;
+	u_int			branch_size = XT_GET_DISK_2(branch->tb_size_2);
+
+	ASSERT_NS(pos->i_node_ref_size == (XT_IS_NODE(branch_size) ? XT_NODE_REF_SIZE : 0));
+	ASSERT_NS(pos->i_item_size == ind->mi_key_size + XT_RECORD_REF_SIZE);
+
+	pos->i_total_size = XT_GET_BRANCH_DATA_SIZE(branch_size);
+	/* The cursor may not point beyond the end of the data: */
+	if (pos->i_item_offset > pos->i_total_size)
+		pos->i_item_offset = pos->i_total_size;
+
+	xt_get_res_record_ref(branch->tb_data + pos->i_item_offset + pos->i_item_size - XT_RECORD_REF_SIZE, result);
+}
+
+/*
+ * Position the cursor in 'result' on the first item of 'branch' and
+ * initialise the remaining result fields from the branch header.
+ */
+static void idx_first_branch_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+{
+	XT_NODE_TEMP;
+	u_int	header = XT_GET_DISK_2(branch->tb_size_2);
+	u_int	ref_size = XT_IS_NODE(header) ? XT_NODE_REF_SIZE : 0;
+	u_int	key_len;
+	xtWord1	*first_item = branch->tb_data + ref_size;
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	result->sr_item.i_total_size = XT_GET_BRANCH_DATA_SIZE(header);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	if (ind->mi_fix_key)
+		key_len = ind->mi_key_size;
+	else if (first_item < &branch->tb_data[result->sr_item.i_total_size])
+		/* Variable length key: read the length from the first item. */
+		key_len = myxt_get_key_length(ind, first_item);
+	else
+		/* The branch holds no item. */
+		key_len = 0;
+
+	result->sr_item.i_item_size = key_len + XT_RECORD_REF_SIZE;
+	result->sr_item.i_node_ref_size = ref_size;
+
+	xt_get_res_record_ref(first_item + key_len, result);
+	/* Only valid if this is a node. */
+	result->sr_branch = IDX_GET_NODE_REF(tab, first_item, ref_size);
+	result->sr_item.i_item_offset = ref_size;
+}
+
+/*
+ * Position the cursor in 'result' at the end of a branch with fixed
+ * length keys.
+ *
+ * "Last" means different things for a leaf and a node: on a node the
+ * cursor is placed at the end of the data, so that the node reference
+ * loaded is the one to the left of that position. On a leaf the cursor
+ * is placed on the last item (or at offset 0 if the leaf is empty).
+ */
+xtPublic void xt_last_branch_item_fix(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+{
+	XT_NODE_TEMP;
+	XTIdxItemPtr	pos = &result->sr_item;
+	u_int			header = XT_GET_DISK_2(branch->tb_size_2);
+	u_int			ref_size = XT_IS_NODE(header) ? XT_NODE_REF_SIZE : 0;
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	pos->i_total_size = XT_GET_BRANCH_DATA_SIZE(header);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	pos->i_item_size = ind->mi_key_size + XT_RECORD_REF_SIZE;
+	pos->i_node_ref_size = ref_size;
+
+	if (ref_size) {
+		/* Node: */
+		pos->i_item_offset = pos->i_total_size;
+		result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + pos->i_item_offset, ref_size);
+		return;
+	}
+
+	if (!pos->i_total_size) {
+		/* Leaf is empty: */
+		pos->i_item_offset = 0;
+		return;
+	}
+
+	/* Leaf with at least one item: */
+	pos->i_item_offset = pos->i_total_size - pos->i_item_size;
+	xt_get_res_record_ref(branch->tb_data + pos->i_item_offset + ind->mi_key_size, result);
+}
+
+/*
+ * Position the cursor in 'result' at the end of a branch with variable
+ * length keys. On a node the cursor is placed at the end of the data; on
+ * a leaf it is placed on the last item, which is found by walking the
+ * items from the start of the leaf.
+ */
+xtPublic void xt_last_branch_item_var(XTTableHPtr XT_UNUSED(tab), XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result)
+{
+	XT_NODE_TEMP;
+	XTIdxItemPtr	pos = &result->sr_item;
+	u_int			header = XT_GET_DISK_2(branch->tb_size_2);
+	u_int			node_ref_size = XT_IS_NODE(header) ? XT_NODE_REF_SIZE : 0;
+	xtWord1			*cur;
+	xtWord1			*limit;
+	u_int			klen;
+
+	result->sr_found = FALSE;
+	result->sr_duplicate = FALSE;
+	pos->i_total_size = XT_GET_BRANCH_DATA_SIZE(header);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE-2);
+
+	pos->i_node_ref_size = node_ref_size;
+
+	if (node_ref_size) {
+		/* Node: */
+		pos->i_item_offset = pos->i_total_size;
+		result->sr_branch = IDX_GET_NODE_REF(tab, branch->tb_data + pos->i_item_offset, node_ref_size);
+		pos->i_item_size = 0;
+		return;
+	}
+
+	if (!pos->i_total_size) {
+		/* Leaf is empty: */
+		pos->i_item_offset = 0;
+		pos->i_item_size = 0;
+		return;
+	}
+
+	/* Leaf: walk forward until the next item would start at or after the end. */
+	cur = branch->tb_data + node_ref_size;
+	limit = &branch->tb_data[pos->i_total_size];
+	klen = 0;
+	if (cur < limit) {
+		klen = myxt_get_key_length(ind, cur);
+		while (cur + klen + XT_RECORD_REF_SIZE + node_ref_size < limit) {
+			cur += klen + XT_RECORD_REF_SIZE + node_ref_size;
+			klen = myxt_get_key_length(ind, cur);
+		}
+	}
+
+	pos->i_item_offset = cur - branch->tb_data;
+	xt_get_res_record_ref(cur + klen, result);
+	pos->i_item_size = klen + XT_RECORD_REF_SIZE;
+}
+
+/*
+ * Decide whether an item on a leaf (fixed length keys) may be deleted
+ * lazily. Returns FALSE when the leaf should be compacted instead, and
+ * OK otherwise.
+ */
+xtPublic xtBool xt_idx_lazy_delete_on_leaf(XTIndexPtr ind, XTIndBlockPtr block, xtWord2 branch_size)
+{
+	ASSERT_NS(ind->mi_fix_key);
+
+	if (
+		/* Compact the leaf if more than half the items that fit on the page
+		 * are deleted: */
+		block->cp_del_count >= ind->mi_max_items/2 ||
+		/* Compact the page if there is only 1 (or less) valid item left: */
+		(u_int) block->cp_del_count+1 >= ((u_int) branch_size - 2)/(ind->mi_key_size + XT_RECORD_REF_SIZE))
+		return FALSE;
+
+	return OK;
+}
+
+/*
+ * Decide whether an item on a node (fixed length keys) may be deleted
+ * lazily. Returns FALSE when the node should be compacted instead, and
+ * OK otherwise.
+ */
+static xtBool idx_lazy_delete_on_node(XTIndexPtr ind, XTIndBlockPtr block, register XTIdxItemPtr item)
+{
+	ASSERT_NS(ind->mi_fix_key);
+
+	if (
+		/* Compact the node if more than 1/4 of the items that fit on the page
+		 * are deleted: */
+		block->cp_del_count >= ind->mi_max_items/4 ||
+		/* Compact the page if there is only 1 (or less) valid item left: */
+		(u_int) block->cp_del_count+1 >= (item->i_total_size - item->i_node_ref_size)/(item->i_item_size + item->i_node_ref_size))
+		return FALSE;
+
+	return OK;
+}
+
+/*
+ * Return non-zero if the first value->sv_length bytes of the item at the
+ * cursor are byte-for-byte equal to the key in 'value'.
+ */
+inline static xtBool idx_cmp_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
+{
+	return memcmp(iref->ir_branch->tb_data + item->i_item_offset, value->sv_key, value->sv_length) == 0;
+}
+
+/*
+ * Overwrite the item at the cursor with the key and the record reference
+ * held in 'value', and flag the page reference as updated.
+ */
+inline static void idx_set_item_key_fix(XTIndReferencePtr iref, register XTIdxItemPtr item, XTIdxKeyValuePtr value)
+{
+	xtWord1 *dest = iref->ir_branch->tb_data + item->i_item_offset;
+
+	memcpy(dest, value->sv_key, value->sv_length);
+	/* The record reference directly follows the key: */
+	xt_set_val_record_ref(dest + value->sv_length, value);
+	iref->ir_updated = TRUE;
+}
+
+/*
+ * Store a new record reference (rec_id, row_id) in the item at the
+ * cursor, and flag the page reference as updated.
+ */
+inline static void idx_set_item_reference(XTIndReferencePtr iref, register XTIdxItemPtr item, xtRowID rec_id, xtRowID row_id)
+{
+	/* The reference occupies the last XT_RECORD_REF_SIZE bytes of the item: */
+	size_t	ref_offset = item->i_item_offset +item->i_item_size - XT_RECORD_REF_SIZE;
+
+	xt_set_record_ref(&iref->ir_branch->tb_data[ref_offset], rec_id, row_id);
+	iref->ir_updated = TRUE;
+}
+
+/*
+ * Overwrite only the row ID part of the record reference of the item at
+ * the cursor, and flag the page reference as updated.
+ */
+inline static void idx_set_item_row_id(XTIndReferencePtr iref, register XTIdxItemPtr item, xtRowID row_id)
+{
+	/* Offset of the record reference within the branch data: */
+	size_t	row_id_offset = item->i_item_offset +item->i_item_size - XT_RECORD_REF_SIZE +
+		/* The row ID follows the record ID inside the reference: */
+		XT_RECORD_ID_SIZE;
+	xtWord1	*row_id_ptr = &iref->ir_branch->tb_data[row_id_offset];
+
+	/* This update does not change the structure of page, so we do it without
+	 * copying the page before we write.
+	 */
+	XT_SET_DISK_4(row_id_ptr, row_id);
+	iref->ir_updated = TRUE;
+}
+
+/*
+ * Return non-zero if the item at the cursor carries the lazy delete
+ * indicator, i.e. the row ID in its record reference is -1.
+ */
+inline static xtBool idx_is_item_deleted(register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
+{
+	xtWord1	*row_id_ptr;
+
+	/* The row ID follows the record ID in the reference at the end of the item: */
+	row_id_ptr = &branch->tb_data[item->i_item_offset + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE];
+	if (XT_GET_DISK_4(row_id_ptr) == (xtRowID) -1)
+		return TRUE;
+	return FALSE;
+}
+
+/*
+ * Mark the item at the cursor as lazily deleted and count the deletion
+ * on the cache block.
+ */
+inline static void idx_set_item_deleted(XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	/* Row ID -1 is the delete indicator: */
+	idx_set_item_row_id(iref, item, (xtRowID) -1);
+
+	/* Incrementing without further locking should be safe because
+	 * there is only one thread, the sweeper, that does this!
+	 *
+	 * Threads that decrement this value have an xlock on
+	 * the page, or the index.
+	 */
+	iref->ir_block->cp_del_count++;
+}
+
+/*
+ * {LAZY-DEL-INDEX-ITEMS}
+ * Lazily delete the item at the cursor: the item stays in the page, only
+ * its row ID is set to the delete indicator (row ID -1). The page is then
+ * released as updated, using the unlock mode that matches the lock held.
+ */
+static void idx_lazy_delete_branch_item(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	idx_set_item_deleted(iref, item);
+
+	if (iref->ir_xlock)
+		xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, iref);
+	else
+		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, iref);
+}
+
+/*
+ * Compact a leaf with fixed length keys by squeezing out all lazily
+ * deleted items (row ID -1). The cursor in 'item' is adjusted so that it
+ * keeps its position relative to the surviving items.
+ *
+ * On success the page is left locked and flagged as updated, and OK is
+ * returned. If the copy-on-write of the page fails, the page is released
+ * and FAILED is returned.
+ */
+static xtBool idx_compact_leaf(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	int		cursor_idx, total, n, kept;
+	u_int	size;
+	xtWord1	*src;
+	xtWord1	*dst;
+	xtWord1	*row_id_ptr;
+	xtRowID	row_id;
+
+	if (iref->ir_block->cb_handle_count && !xt_ind_copy_on_write(iref)) {
+		xt_ind_release(ot, ind, iref->ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, iref);
+		return FAILED;
+	}
+
+	ASSERT_NS(!item->i_node_ref_size);
+	ASSERT_NS(ind->mi_fix_key);
+	size = item->i_item_size;
+	total = item->i_total_size / size;
+	cursor_idx = item->i_item_offset / size;
+	src = dst = branch->tb_data;
+	kept = 0;
+	for (n = 0; n < total; n++, src += size) {
+		row_id_ptr = src + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
+		row_id = XT_GET_DISK_4(row_id_ptr);
+		if (row_id == (xtRowID) -1) {
+			/* A deleted item in front of the cursor moves the cursor down: */
+			if (kept < cursor_idx)
+				cursor_idx--;
+			continue;
+		}
+		/* Keep this item, moving it down over any gap left so far: */
+		if (dst != src)
+			memcpy(dst, src, size);
+		dst += size;
+		kept++;
+	}
+
+	iref->ir_block->cp_del_count = 0;
+	item->i_total_size = dst - branch->tb_data;
+	ASSERT_NS(kept * size == item->i_total_size);
+	item->i_item_offset = cursor_idx * size;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
+	iref->ir_updated = TRUE;
+	return OK;
+}
+
+/*
+ * Remove the item at the cursor from a leaf with fixed length keys and,
+ * in the same pass, squeeze out all lazily deleted items (row ID -1).
+ * The cursor offset in 'item' is set to the position the removed item
+ * would have had in the compacted leaf.
+ *
+ * The page is released in all cases. Returns OK on success, or FAILED if
+ * the copy-on-write of the page fails.
+ */
+static xtBool idx_lazy_remove_leaf_item_right(XTOpenTablePtr ot, XTIndexPtr ind, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	int		remove_idx, total, n;
+	u_int	size;
+	xtWord1	*src;
+	xtWord1	*dst;
+	xtWord1	*row_id_ptr;
+	xtRowID	row_id;
+
+	ASSERT_NS(!item->i_node_ref_size);
+
+	if (iref->ir_block->cb_handle_count && !xt_ind_copy_on_write(iref)) {
+		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
+		return FAILED;
+	}
+
+	ASSERT_NS(ind->mi_fix_key);
+	size = item->i_item_size;
+	total = item->i_total_size / size;
+	remove_idx = item->i_item_offset / size;
+	src = dst = branch->tb_data;
+	for (n = 0; n < total; n++, src += size) {
+		if (n == remove_idx) {
+			/* This is the item to remove, so remember where it would have landed: */
+			item->i_item_offset = dst - branch->tb_data;
+			continue;
+		}
+		row_id_ptr = src + item->i_item_size - XT_RECORD_REF_SIZE + XT_RECORD_ID_SIZE;
+		row_id = XT_GET_DISK_4(row_id_ptr);
+		if (row_id == (xtRowID) -1)
+			/* Lazily deleted item, drop it as well. */
+			continue;
+		if (dst != src)
+			memcpy(dst, src, size);
+		dst += size;
+	}
+
+	iref->ir_block->cp_del_count = 0;
+	item->i_total_size = dst - branch->tb_data;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, 0));
+	iref->ir_updated = TRUE;
+	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, iref);
+	return OK;
+}
+
+/*
+ * Remove the item at the cursor, together with the node reference that
+ * follows it (to the RIGHT of the item, if this is a node), and release
+ * the page as updated.
+ *
+ * The page is released in all cases. Returns OK on success, or FAILED if
+ * the copy-on-write of the page fails.
+ */
+static xtBool idx_remove_branch_item_right(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	u_int	removed_size = item->i_item_size + item->i_node_ref_size;
+	xtWord1	*hole;
+
+	/* {HANDLE-COUNT-USAGE}
+	 * Reading the handle count here is safe because we have the right
+	 * to update the page, so no other thread can modify the page.
+	 *
+	 * This means:
+	 * We either have an Xlock on the index, or we have
+	 * an Xlock on the cache block.
+	 */
+	if (iref->ir_block->cb_handle_count && !xt_ind_copy_on_write(iref)) {
+		xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
+		return FAILED;
+	}
+
+	/* A lazily deleted item no longer counts once it is really gone: */
+	if (ind->mi_lazy_delete && idx_is_item_deleted(branch, item))
+		iref->ir_block->cp_del_count--;
+
+	/* Close the gap left by the item and the node reference to its right: */
+	hole = &branch->tb_data[item->i_item_offset];
+	memmove(hole, hole + removed_size, item->i_total_size - item->i_item_offset - removed_size);
+	item->i_total_size -= removed_size;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
+	iref->ir_updated = TRUE;
+	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
+	return OK;
+}
+
+/*
+ * Remove the item at the cursor from a node, together with the node
+ * reference to the LEFT of the item, and release the page as updated.
+ *
+ * If the index uses lazy deletion and 'lazy_delete_cleanup_required' is
+ * not NULL, it receives the result of idx_lazy_delete_on_node() for the
+ * page, evaluated before the item is removed.
+ *
+ * The page is released in all cases. Returns OK on success, or FAILED if
+ * the copy-on-write of the page fails.
+ */
+static xtBool idx_remove_branch_item_left(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID, XTIndReferencePtr iref, register XTIdxItemPtr item, xtBool *lazy_delete_cleanup_required)
+{
+	register XTIdxBranchDPtr branch = iref->ir_branch;
+	u_int	removed_size = item->i_item_size + item->i_node_ref_size;
+	xtWord1	*item_end;
+
+	ASSERT_NS(item->i_node_ref_size);
+	if (iref->ir_block->cb_handle_count && !xt_ind_copy_on_write(iref)) {
+		xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
+		return FAILED;
+	}
+
+	if (ind->mi_lazy_delete) {
+		/* A lazily deleted item no longer counts once it is really gone: */
+		if (idx_is_item_deleted(branch, item))
+			iref->ir_block->cp_del_count--;
+		if (lazy_delete_cleanup_required)
+			*lazy_delete_cleanup_required = idx_lazy_delete_on_node(ind, iref->ir_block, item);
+	}
+
+	/* Move everything after the item down over the node reference to the
+	 * left of the item and the item itself: */
+	item_end = &branch->tb_data[item->i_item_offset + item->i_item_size];
+	memmove(&branch->tb_data[item->i_item_offset - item->i_node_ref_size],
+		item_end,
+		item->i_total_size - item->i_item_offset - item->i_item_size);
+	item->i_total_size -= removed_size;
+	XT_SET_DISK_2(branch->tb_size_2, XT_MAKE_BRANCH_SIZE(item->i_total_size, item->i_node_ref_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(address), (int) XT_GET_DISK_2(branch->tb_size_2));
+	iref->ir_updated = TRUE;
+	xt_ind_release(ot, ind, item->i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE, iref);
+	return OK;
+}
+
+/*
+ * Insert the key and record reference in 'value' into 'leaf' at the
+ * cursor position in 'result'. The items from the cursor onwards are
+ * shifted up, and the total size in 'result' and in the leaf header is
+ * increased accordingly. No check is made that the result fits in a page.
+ */
+static void idx_insert_leaf_item(XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result)
+{
+	XTIdxItemPtr	pos = &result->sr_item;
+	xtWord1			*slot = &leaf->tb_data[pos->i_item_offset];
+
+	/* This will ensure we do not overwrite the end of the buffer: */
+	ASSERT_NS(value->sv_length <= XT_INDEX_MAX_KEY_SIZE);
+
+	/* Open a gap for the new item: */
+	memmove(slot + value->sv_length + XT_RECORD_REF_SIZE, slot, pos->i_total_size - pos->i_item_offset);
+
+	/* Key first, then the record reference: */
+	memcpy(slot, value->sv_key, value->sv_length);
+	xt_set_val_record_ref(slot + value->sv_length, value);
+
+	pos->i_total_size += value->sv_length + XT_RECORD_REF_SIZE;
+	XT_SET_DISK_2(leaf->tb_size_2, XT_MAKE_LEAF_SIZE(pos->i_total_size));
+}
+
+/*
+ * Insert the key and record reference in 'value', followed by a node
+ * reference to 'branch', into the node 'leaf' at the cursor position in
+ * 'result'. The data from the cursor onwards is shifted up, and the total
+ * size in 'result' and in the node header is increased accordingly. No
+ * check is made that the result fits in a page.
+ */
+static void idx_insert_node_item(XTTableHPtr XT_UNUSED(tab), XTIndexPtr XT_UNUSED(ind), XTIdxBranchDPtr leaf, XTIdxKeyValuePtr value, XTIdxResultPtr result, xtIndexNodeID branch)
+{
+	XTIdxItemPtr	pos = &result->sr_item;
+	xtWord1			*slot = &leaf->tb_data[pos->i_item_offset];
+
+	/* This will ensure we do not overwrite the end of the buffer: */
+	ASSERT_NS(value->sv_length <= XT_INDEX_MAX_KEY_SIZE);
+
+	/* Open a gap for the new item and its node reference: */
+	memmove(slot + value->sv_length + XT_RECORD_REF_SIZE + pos->i_node_ref_size, slot, pos->i_total_size - pos->i_item_offset);
+
+	/* Key first, then the record reference, then the node reference: */
+	memcpy(slot, value->sv_key, value->sv_length);
+	xt_set_val_record_ref(slot + value->sv_length, value);
+	XT_SET_NODE_REF(tab, slot + value->sv_length + XT_RECORD_REF_SIZE, branch);
+
+	pos->i_total_size += value->sv_length + XT_RECORD_REF_SIZE + pos->i_node_ref_size;
+	XT_SET_DISK_2(leaf->tb_size_2, XT_MAKE_NODE_SIZE(pos->i_total_size));
+}
+
+/*
+ * Locate the item in the middle of 'branch', which is where the branch
+ * is split. On entry 'result' must hold the total size and the node
+ * reference size of the branch (and the item size if keys are fixed
+ * length). On return the cursor in 'result' is on the middle item, and
+ * its key and record reference have been copied to 'value'.
+ *
+ * Returns OK on success. For variable length keys FAILED is returned,
+ * with XT_ERR_INDEX_CORRUPTED registered, if the lengths read from the
+ * branch are not plausible.
+ */
+static xtBool idx_get_middle_branch_item(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxBranchDPtr branch, XTIdxKeyValuePtr value, XTIdxResultPtr result)
+{
+	XTIdxItemPtr	pos = &result->sr_item;
+	xtWord1			*mid;
+	u_int			klen;
+
+	ASSERT_NS(result->sr_item.i_node_ref_size == 0 || result->sr_item.i_node_ref_size == XT_NODE_REF_SIZE);
+	ASSERT_NS((int) result->sr_item.i_total_size >= 0 && result->sr_item.i_total_size <= XT_INDEX_PAGE_SIZE*2);
+
+	if (ind->mi_fix_key) {
+		u_int full_item_size = pos->i_item_size + pos->i_node_ref_size;
+
+		/* Half the number of items, rounded down, gives the middle item: */
+		pos->i_item_offset = ((pos->i_total_size - pos->i_node_ref_size)
+			/ full_item_size / 2 * full_item_size) + pos->i_node_ref_size;
+
+		mid = &branch->tb_data[pos->i_item_offset];
+		klen = pos->i_item_size - XT_RECORD_REF_SIZE;
+	}
+	else {
+		u_int	node_ref_size = pos->i_node_ref_size;
+		u_int	walked;
+		xtWord1	*half;
+
+		mid = branch->tb_data + node_ref_size;
+		half = &branch->tb_data[(pos->i_total_size - node_ref_size) / 2 + node_ref_size];
+		klen = 0;
+		if (mid < half) {
+			walked = 0;
+			for (;;) {
+				klen = myxt_get_key_length(ind, mid);
+				walked += klen + XT_RECORD_REF_SIZE + node_ref_size;
+				if (mid + klen + XT_RECORD_REF_SIZE + node_ref_size < half) {
+					/* The next item still starts before the half way point. */
+					mid += klen + XT_RECORD_REF_SIZE + node_ref_size;
+					continue;
+				}
+				/* This is the middle item, so check the lengths read so far: */
+				if (klen > XT_INDEX_PAGE_SIZE || walked > pos->i_total_size) {
+					xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+					return FAILED;
+				}
+				break;
+			}
+		}
+
+		pos->i_item_offset = mid - branch->tb_data;
+		pos->i_item_size = klen + XT_RECORD_REF_SIZE;
+	}
+
+	/* Hand the middle key and its record reference back to the caller: */
+	value->sv_flags = XT_SEARCH_WHOLE_KEY;
+	value->sv_length = klen;
+	xt_get_record_ref(mid + value->sv_length, &value->sv_rec_id, &value->sv_row_id);
+	memcpy(value->sv_key, mid, value->sv_length);
+	return OK;
+}
+
+/*
+ * Write the key in 'value', followed by its record reference, to 'item'.
+ * Returns the number of bytes written.
+ */
+static size_t idx_write_branch_item(XTIndexPtr XT_UNUSED(ind), xtWord1 *item, XTIdxKeyValuePtr value)
+{
+	xtWord1 *ref_pos = item + value->sv_length;
+
+	memcpy(item, value->sv_key, value->sv_length);
+	xt_set_val_record_ref(ref_pos, value);
+	return value->sv_length + XT_RECORD_REF_SIZE;
+}
+
+/*
+ * Replace the item at the stack position 'item' in its node with the
+ * 'item_size' bytes in 'item_buf'.
+ *
+ * If the node still fits in a page after the replacement, it is updated
+ * in place. Otherwise the enlarged node is assembled in ot->ot_ind_wbuf
+ * and split at its middle item: the upper part is written to a new
+ * branch, the lower part replaces the old page, and the middle key is
+ * inserted into the parent node with idx_insert_node().
+ *
+ * 'stack' is popped up to and including 'item' when the node is split.
+ *
+ * Returns OK on success, FAILED on error. The page fetched here is
+ * released on all paths.
+ */
+static xtBool idx_replace_node_key(XTOpenTablePtr ot, XTIndexPtr ind, IdxStackItemPtr item, IdxBranchStackPtr stack, u_int item_size, xtWord1 *item_buf)
+{
+	XTIndReferenceRec	iref;
+	xtIndexNodeID		new_branch;
+	XTIdxResultRec		result;
+	xtIndexNodeID		current = item->i_branch;
+	u_int				new_size;
+	XTIdxBranchDPtr		new_branch_ptr;
+	XTIdxKeyValueRec	key_value;
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	if (!xt_ind_fetch(ot, ind, current, XT_LOCK_WRITE, &iref))
+		return FAILED;
+	if (iref.ir_block->cb_handle_count) {
+		if (!xt_ind_copy_on_write(&iref))
+			goto failed_1;
+	}
+	if (ind->mi_lazy_delete) {
+		ASSERT_NS(item_size == item->i_pos.i_item_size);
+		/* The old item goes away, so it no longer counts as deleted: */
+		if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
+			iref.ir_block->cp_del_count--;
+	}
+
+	if (item->i_pos.i_total_size + item_size - item->i_pos.i_item_size <= XT_INDEX_PAGE_DATA_SIZE) {
+		/* The node still fits in a page after the replacement (even if
+		 * the new item is larger than the old), so update it in place:
+		 */
+		memmove(&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item_size],
+			&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item->i_pos.i_item_size],
+			item->i_pos.i_total_size - item->i_pos.i_item_offset - item->i_pos.i_item_size);
+		memcpy(&iref.ir_branch->tb_data[item->i_pos.i_item_offset],
+			item_buf, item_size);
+		if (ind->mi_lazy_delete) {
+			/* The new item may itself carry the delete indicator: */
+			if (idx_is_item_deleted(iref.ir_branch, &item->i_pos))
+				iref.ir_block->cp_del_count++;
+		}
+		item->i_pos.i_total_size = item->i_pos.i_total_size + item_size - item->i_pos.i_item_size;
+		XT_SET_DISK_2(iref.ir_branch->tb_size_2, XT_MAKE_NODE_SIZE(item->i_pos.i_total_size));
+		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+		iref.ir_updated = TRUE;
+
+#ifdef DEBUG
+		ASSERT_NS(item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
+#endif
+		return xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+	}
+
+	/* The node has overflowed!!
+	 * Build the enlarged node in the write buffer:
+	 */
+	memcpy(ot->ot_ind_wbuf.tb_data, iref.ir_branch->tb_data, item->i_pos.i_item_offset);	// First part of the buffer
+	memcpy(&ot->ot_ind_wbuf.tb_data[item->i_pos.i_item_offset], item_buf, item_size);		// The new item
+	memcpy(&ot->ot_ind_wbuf.tb_data[item->i_pos.i_item_offset + item_size],
+		&iref.ir_branch->tb_data[item->i_pos.i_item_offset + item->i_pos.i_item_size],
+		item->i_pos.i_total_size - item->i_pos.i_item_offset - item->i_pos.i_item_size);
+	item->i_pos.i_total_size += item_size - item->i_pos.i_item_size;
+	item->i_pos.i_item_size = item_size;
+	/* This is a node, so build a node header (it is only temporary, the
+	 * final size is set after the split below):
+	 */
+	XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(item->i_pos.i_total_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+	ASSERT_NS(item->i_pos.i_total_size > XT_INDEX_PAGE_DATA_SIZE && item->i_pos.i_total_size <= XT_INDEX_PAGE_DATA_SIZE*2);
+
+	/* Take the position only now, so that the split below works with the
+	 * sizes of the enlarged node in the write buffer, and not with the
+	 * sizes of the old page:
+	 */
+	result.sr_item = item->i_pos;
+
+	/* Adjust the stack (we want the parents of the delete node): */
+	for (;;) {
+		if (idx_pop(stack) == item)
+			break;
+	}		
+
+	/* The middle key is returned in a local buffer: */
+	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
+	key_value.sv_key = key_buf;
+	if (!idx_get_middle_branch_item(ot, ind, &ot->ot_ind_wbuf, &key_value, &result))
+		goto failed_1;
+
+	if (!idx_new_branch(ot, ind, &new_branch))
+		goto failed_1;
+
+	/* Split the node: everything after the middle item goes to the new
+	 * branch. The source is the write buffer, which holds the replaced
+	 * item (the cached page still has the old layout). The regions may
+	 * overlap, hence memmove():
+	 */
+	new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
+	new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
+	memmove(new_branch_ptr->tb_data, &ot->ot_ind_wbuf.tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);
+
+	XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_NODE_SIZE(new_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
+	if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
+		goto failed_2;
+
+	/* Change the size of the old branch, which keeps everything before
+	 * the middle item:
+	 */
+	XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(result.sr_item.i_item_offset));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
+	iref.ir_updated = TRUE;
+	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+
+	/* Insert the new branch into the parent node, using the new middle key value: */
+	if (!idx_insert_node(ot, ind, stack, &key_value, new_branch)) {
+		/* 
+		 * TODO: Mark the index as corrupt.
+		 * This should not fail because everything has been
+		 * preallocated.
+		 * However, if it does fail the index
+		 * will be corrupt.
+		 * I could modify and release the branch above,
+		 * after this point.
+		 * But that would mean holding the lock longer,
+		 * and also may not help because idx_insert_node()
+		 * is recursive.
+		 */
+		idx_free_branch(ot, ind, new_branch);
+		return FAILED;
+	}
+
+	return OK;
+
+	failed_2:
+	idx_free_branch(ot, ind, new_branch);
+
+	failed_1:
+	xt_ind_release(ot, ind, XT_UNLOCK_WRITE, &iref);
+
+	return FAILED;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Standard b-tree insert
+ */
+
+/*
+ * Insert the given branch into the node on the top of the stack. If the stack
+ * is empty we need to add a new root.
+ *
+ * 'key_value' is the key that separates the existing data from 'branch'.
+ * If the node overflows it is split in ot->ot_ind_wbuf, and the function
+ * calls itself to insert the new branch into the next node on the stack.
+ * In that case 'key_value' is overwritten with the middle key of the split.
+ *
+ * Returns OK on success, FAILED on error. Any page fetched here is
+ * released on all paths.
+ */
+static xtBool idx_insert_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIdxKeyValuePtr key_value, xtIndexNodeID branch)
+{
+	IdxStackItemPtr		stack_item;
+	xtIndexNodeID		new_branch;
+	size_t				size;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+	u_int				new_size;
+	XTIdxBranchDPtr		new_branch_ptr;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	/* Insert a new branch (key, data)... */
+	if (!(stack_item = idx_pop(stack))) {
+		xtWord1 *ditem;
+
+		/* New root */
+		if (!idx_new_branch(ot, ind, &new_branch))
+			goto failed;
+
+		/* The new root is: old root reference, key, new branch reference. */
+		ditem = ot->ot_ind_wbuf.tb_data;
+		XT_SET_NODE_REF(ot->ot_table, ditem, ind->mi_root);
+		ditem += XT_NODE_REF_SIZE;
+		ditem += idx_write_branch_item(ind, ditem, key_value);
+		XT_SET_NODE_REF(ot->ot_table, ditem, branch);
+		ditem += XT_NODE_REF_SIZE;
+		size = ditem - ot->ot_ind_wbuf.tb_data;
+		XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(size));
+		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+		if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + size, (xtWord1 *) &ot->ot_ind_wbuf)) {
+			/* No page has been fetched on this path, so there is no
+			 * reference to release, only the new branch to free:
+			 */
+			idx_free_branch(ot, ind, new_branch);
+			goto failed;
+		}
+		ind->mi_root = new_branch;
+		goto done_ok;
+	}
+
+	current = stack_item->i_branch;
+	/* This read does not count (towards ot_ind_reads), because we are only
+	 * counting each loaded page once. We assume that the page is in
+	 * cache, and will remain in cache when we read again below for the
+	 * purpose of update.
+	 */
+	if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+		goto failed;
+	ASSERT_NS(XT_IS_NODE(XT_GET_DISK_2(iref.ir_branch->tb_size_2)));
+	/* Find the insert position for the key in this node: */
+	ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
+
+	if (result.sr_item.i_total_size + key_value->sv_length + XT_RECORD_REF_SIZE + result.sr_item.i_node_ref_size <= XT_INDEX_PAGE_DATA_SIZE) {
+		/* The new item fits, insert it directly into the page: */
+		if (iref.ir_block->cb_handle_count) {
+			if (!xt_ind_copy_on_write(&iref))
+				goto failed_1;
+		}
+		idx_insert_node_item(ot->ot_table, ind, iref.ir_branch, key_value, &result, branch);
+		/* Trace the size of the page that was updated: */
+		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+		iref.ir_updated = TRUE;
+		ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
+		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
+		goto done_ok;
+	}
+
+	/* The node overflows: do the insert on a copy in the write buffer. */
+	memcpy(&ot->ot_ind_wbuf, iref.ir_branch, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_total_size);
+	idx_insert_node_item(ot->ot_table, ind, &ot->ot_ind_wbuf, key_value, &result, branch);
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+	ASSERT_NS(result.sr_item.i_total_size > XT_INDEX_PAGE_DATA_SIZE);
+
+	/* We assume that value can be overwritten (which is the case) */
+	if (!idx_get_middle_branch_item(ot, ind, &ot->ot_ind_wbuf, key_value, &result))
+		goto failed_1;
+
+	if (!idx_new_branch(ot, ind, &new_branch))
+		goto failed_1;
+
+	/* Split the node: everything after the middle item goes to the new
+	 * branch. The regions may overlap, hence memmove():
+	 */
+	new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
+	new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
+	memmove(new_branch_ptr->tb_data, &ot->ot_ind_wbuf.tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);
+
+	XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_NODE_SIZE(new_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
+	if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
+		goto failed_2;
+
+	/* Change the size of the old branch, which keeps everything before
+	 * the middle item:
+	 */
+	XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_NODE_SIZE(result.sr_item.i_item_offset));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+	if (iref.ir_block->cb_handle_count) {
+		if (!xt_ind_copy_on_write(&iref))
+			goto failed_2;
+	}
+	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
+	iref.ir_updated = TRUE;
+	xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
+
+	/* Insert the new branch into the parent node, using the new middle key value: */
+	if (!idx_insert_node(ot, ind, stack, key_value, new_branch)) {
+		// Index may be inconsistent now...
+		idx_free_branch(ot, ind, new_branch);
+		goto failed;
+	}
+
+	done_ok:
+	return OK;
+
+	/* The error exits below are only for paths on which iref has been fetched: */
+	failed_2:
+	idx_free_branch(ot, ind, new_branch);
+
+	failed_1:
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+
+	failed:
+	return FAILED;
+}
+
+/*
+ * Called after an index operation has failed. If the error registered on
+ * the thread is XT_ERR_NO_INDEX_CACHE, the exception is cleared and the
+ * indices are flushed; TRUE is then returned if the flush succeeded (the
+ * caller can retry), and FAILED if it did not. For any other error FALSE
+ * is returned and the exception is left untouched.
+ */
+static xtBool idx_out_of_memory_failure(XTOpenTablePtr ot)
+{
+#ifdef XT_TRACK_INDEX_UPDATES
+	/* If the index has been changed when we run out of memory, we
+	 * will corrupt the index!
+	 */
+	ASSERT_NS(ot->ot_ind_changed == 0);
+#endif
+	if (ot->ot_thread->t_exception.e_xt_err != XT_ERR_NO_INDEX_CACHE)
+		return FALSE;
+
+	/* Flush index and retry! */
+	xt_clear_exception(ot->ot_thread);
+	if (!xt_flush_indices(ot, NULL, FALSE))
+		return FAILED;
+	return TRUE;
+}
+
+/*
+ * Check all the duplicate variations of a key in an index.
+ * If one of them is committed (or belongs to this transaction), a
+ * duplicate key error is registered and FAILED is returned.
+ * May release the index lock to wait for a transaction (re-taken as a write lock).
+ * GOTCHA: This routine must use the write index buffer!
+ */
+static xtBool idx_check_duplicates(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxKeyValuePtr key_value)
+{
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+	xtBool				on_key = FALSE;
+	xtXactID			xn_id;
+	int					save_flags;				
+	XTXactWaitRec		xw;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	retry:
+	idx_newstack(&stack);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
+		return OK;
+
+	save_flags = key_value->sv_flags;
+	key_value->sv_flags = 0;
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref)) {
+			key_value->sv_flags = save_flags;
+			return FAILED;
+		}
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
+		if (result.sr_found)
+			/* If we have found the key in a node: */
+			on_key = TRUE;
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		if (!idx_push(&stack, current, &result.sr_item)) {
+			key_value->sv_flags = save_flags;
+			return FAILED;
+		}
+		current = result.sr_branch;
+	}
+
+	key_value->sv_flags = save_flags;
+
+	if (!on_key) {
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		return OK;
+	}
+
+	for (;;) {
+		if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
+			IdxStackItemPtr node;
+
+			/* We are at the end of a leaf node.
+			 * Go up the stack to find the start position of the next key.
+			 * If we find none, then we are at the end of the index.
+			 */
+			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+			while ((node = idx_pop(&stack))) {
+				if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
+					current = node->i_branch;
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+						return FAILED;
+					xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);
+					result.sr_item = node->i_pos;
+					goto check_value;
+				}
+			}
+			break;
+		}
+
+		check_value:
+		/* Quit the loop if the key is no longer matched! */
+		if (myxt_compare_key(ind, 0, key_value->sv_length, key_value->sv_key, &iref.ir_branch->tb_data[result.sr_item.i_item_offset]) != 0) {
+			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+			break;
+		}
+
+		if (ind->mi_lazy_delete) {
+			if (result.sr_row_id == (xtRowID) -1)
+				goto next_item;	/* Lazy deleted item, skip it. */
+		}
+
+		switch (xt_tab_maybe_committed(ot, result.sr_rec_id, &xn_id, NULL, NULL)) {
+			case XT_MAYBE:
+				/* Record is not committed, wait for the transaction, then rescan from the root. */
+				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+				XT_INDEX_UNLOCK(ind, ot);				
+				xw.xw_xn_id = xn_id;
+				if (!xt_xn_wait_for_xact(ot->ot_thread, &xw, NULL)) {
+					XT_INDEX_WRITE_LOCK(ind, ot);
+					return FAILED;
+				}
+				XT_INDEX_WRITE_LOCK(ind, ot);
+				goto retry;			
+			case XT_ERR:
+				/* Error while reading... */
+				goto failed;
+			case TRUE:
+				/* Record is committed or belongs to me, duplicate key: */
+				XT_DEBUG_TRACE(("DUPLICATE KEY tx=%d rec=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) result.sr_rec_id));
+				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_DUPLICATE_KEY);
+				goto failed;
+			case FALSE:
+				/* Record is deleted or rolled-back: */
+				break;
+		}
+
+		next_item:
+		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+		if (result.sr_item.i_node_ref_size) {
+			/* Go down to the bottom: */
+			while (XT_NODE_ID(current)) {
+				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+				if (!idx_push(&stack, current, &result.sr_item))
+					return FAILED;
+				current = result.sr_branch;
+				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+					return FAILED;
+				idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+				if (!result.sr_item.i_node_ref_size)
+					break;
+			}
+		}
+	}
+
+	return OK;
+	
+	failed:
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+	return FAILED;
+}
+
+inline static void idx_still_on_key(XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, register XTIdxBranchDPtr branch, register XTIdxItemPtr item)
+{
+	/* Clear sk_on_key once the item at the given position no longer compares equal to the search key. */
+	if (!search_key || !search_key->sk_on_key)
+		return;
+	search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length, search_key->sk_key_value.sv_key, &branch->tb_data[item->i_item_offset]) == 0;
+}
+
+/* Insert the key built from rec_buf (referring to rec_id/row_id) into the given index.
+ * Returns OK, or FAILED if an error occurs (the whole insert is retried first if idx_out_of_memory_failure() allows it).
+ */
+xtPublic xtBool xt_idx_insert(XTOpenTablePtr ot, XTIndexPtr ind, xtRowID row_id, xtRecordID rec_id, xtWord1 *rec_buf, xtWord1 *bef_buf, xtBool allow_dups)
+{
+	XTIdxKeyValueRec	key_value;
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	xtIndexNodeID		new_branch;
+	XTIdxBranchDPtr		new_branch_ptr;
+	size_t				size;
+	XTIdxResultRec		result;
+	size_t				new_size;
+	xtBool				check_for_dups = ind->mi_flags & (HA_UNIQUE_CHECK | HA_NOSAME) && !allow_dups;
+	xtBool				lock_structure = FALSE;
+	xtBool				updated = FALSE;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+#ifdef CHECK_AND_PRINT
+	//idx_check_index(ot, ind, TRUE);
+#endif
+
+	retry_after_oom:
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
+	key_value.sv_rec_id = rec_id;
+	key_value.sv_row_id = row_id;		/* Should always be zero on insert (will be updated by the sweeper later).
+										 * Non-zero only during recovery, assuming that the sweeper will process such records right after recovery.
+										 */
+	key_value.sv_key = key_buf;
+	key_value.sv_length = myxt_create_key_from_row(ind, key_buf, rec_buf, &check_for_dups);
+
+	if (bef_buf && check_for_dups) {
+		/* If we have a before image, and we are required to check for duplicates.
+		 * then compare the before image key with the after image key.
+		 */
+		xtWord1	bef_key_buf[XT_INDEX_MAX_KEY_SIZE];
+		u_int	len;
+		xtBool	has_no_null = TRUE;
+
+		len = myxt_create_key_from_row(ind, bef_key_buf, bef_buf, &has_no_null);
+		if (has_no_null) {
+			/* If the before key has no null values, then compare with the after key value.
+			 * We only have to check for duplicates if the key has changed!
+			 */
+			check_for_dups = myxt_compare_key(ind, 0, len, bef_key_buf, key_buf) != 0;
+		}
+	}
+
+	/* The index appears to have no root: */
+	if (!XT_NODE_ID(ind->mi_root))
+		lock_structure = TRUE;
+
+	lock_and_retry:
+	idx_newstack(&stack);
+
+	/* A write lock is only required if we are going to change the
+	 * structure of the index!
+	 */
+	if (lock_structure)
+		XT_INDEX_WRITE_LOCK(ind, ot);
+	else
+		XT_INDEX_READ_LOCK(ind, ot);
+
+	retry:
+	/* Create a root node if required: */
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {
+		/* Index is empty, create a new one: */
+		ASSERT_NS(lock_structure);
+		if (!xt_ind_reserve(ot, 1, NULL))
+			goto failed;
+		if (!idx_new_branch(ot, ind, &new_branch))
+			goto failed;
+		size = idx_write_branch_item(ind, ot->ot_ind_wbuf.tb_data, &key_value);
+		XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_LEAF_SIZE(size));
+		IDX_TRACE("%d-> %x\n", (int) new_branch, (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+		if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + size, (xtWord1 *) &ot->ot_ind_wbuf))
+			goto failed_2;
+		ind->mi_root = new_branch;
+		goto done_ok;
+	}
+
+	/* Search down the tree for the insertion point. */
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
+		if (result.sr_duplicate) {
+			if (check_for_dups) {
+				/* Duplicates are not allowed, at least one has been
+				 * found...
+				 */
+
+				/* Leaf nodes (i_node_ref_size == 0) are write locked,
+				 * non-leaf nodes are read locked.
+				 */
+				xt_ind_release(ot, ind, result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, &iref);
+
+				if (!idx_check_duplicates(ot, ind, &key_value))
+					goto failed;
+				/* We have checked all the "duplicate" variations. None of them are
+				 * relevant. So this will cause a correct insert.
+				 */
+				check_for_dups = FALSE;
+				idx_newstack(&stack);
+				goto retry;
+			}
+		}
+		if (result.sr_found) {
+			/* Node found, can happen during recovery of indexes! 
+			 * We have found an exact match of both key and record.
+			 */
+			XTPageUnlockType	utype;
+			xtBool				overwrite = FALSE;
+
+			/* {LAZY-DEL-INDEX-ITEMS}
+			 * If the item has been lazy deleted, then just overwrite!
+			 */ 
+			if (result.sr_row_id == (xtRowID) -1) {
+				xtWord2 del_count;
+	
+				/* This is safe because we have an xlock on the leaf. */
+				if ((del_count = iref.ir_block->cp_del_count))
+					iref.ir_block->cp_del_count = del_count-1;
+				overwrite = TRUE;
+			}
+
+			if (!result.sr_row_id && row_id) {
+				/* {INDEX-RECOV_ROWID} Set the row-id
+				 * during recovery, even if the index entry
+				 * is not committed.
+				 * It will be removed later by the sweeper.
+				 */
+				overwrite = TRUE;
+			}
+
+			if (overwrite) {
+				idx_set_item_row_id(&iref, &result.sr_item, row_id);
+				utype = result.sr_item.i_node_ref_size ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_W_UPDATE;
+			}
+			else
+				utype = result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE;
+			xt_ind_release(ot, ind, utype, &iref);
+			goto done_ok;
+		}
+		/* Stop when we get to a leaf: */
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, result.sr_item.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, &iref);
+		if (!idx_push(&stack, current, NULL))
+			goto failed;
+		current = result.sr_branch;
+	}
+	ASSERT_NS(XT_NODE_ID(current));
+	
+	/* Must be a leaf!: */
+	ASSERT_NS(!result.sr_item.i_node_ref_size);
+
+	updated = FALSE;
+	if (ind->mi_lazy_delete && iref.ir_block->cp_del_count) {
+		/* There are a number of possibilities:
+		 * - We could just replace a lazy deleted slot.
+		 * - We could compact and insert.
+		 * - We could just insert
+		 */
+
+		if (result.sr_item.i_item_offset > 0) {
+			/* Check if it can go into the previous node: */
+			XTIdxResultRec	t_res;
+
+			t_res.sr_item = result.sr_item;
+			xt_prev_branch_item_fix(ot->ot_table, ind, iref.ir_branch, &t_res);
+			if (t_res.sr_row_id != (xtRowID) -1)
+				goto try_current;
+
+			/* Yup, it can, but first check to see if it would be 
+			 * better to put it in the current node.
+			 * This is the case if the previous node key is not the
+			 * same as the key we are adding...
+			 */
+			if (result.sr_item.i_item_offset < result.sr_item.i_total_size &&
+				result.sr_row_id == (xtRowID) -1) {
+				if (!idx_cmp_item_key_fix(&iref, &t_res.sr_item, &key_value))
+					goto try_current;
+			}
+
+			idx_set_item_key_fix(&iref, &t_res.sr_item, &key_value);
+			iref.ir_block->cp_del_count--;
+			xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+			goto done_ok;
+		}
+
+		try_current:
+		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id == (xtRowID) -1) {
+				idx_set_item_key_fix(&iref, &result.sr_item, &key_value);
+				iref.ir_block->cp_del_count--;
+				xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+				goto done_ok;
+			}
+		}
+
+		/* Check if we must compact... 
+		 * It makes no sense to split as long as there are lazy deleted items
+		 * in the page. So, delete them if a split would otherwise be required!
+		 */
+		ASSERT_NS(key_value.sv_length + XT_RECORD_REF_SIZE == result.sr_item.i_item_size);
+		if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE > XT_INDEX_PAGE_DATA_SIZE) {
+			if (!idx_compact_leaf(ot, ind, &iref, &result.sr_item))
+				goto failed;
+			updated = TRUE;
+		}
+		
+		/* Fall through to the insert code... */
+		/* NOTE: if there were no lazy deleted items in the leaf, then
+		 * idx_compact_leaf is a NOP. This is the only case in which it may not
+		 * fall through and do the insert below.
+		 *
+		 * Normally, if the cp_del_count is correct then the insert
+		 * will work below, and the assertion here will not fail.
+		 *
+		 * In this case, the xt_ind_release() will correctly indicate an update.
+		 */
+		ASSERT_NS(result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE);
+	}
+
+	if (result.sr_item.i_total_size + key_value.sv_length + XT_RECORD_REF_SIZE <= XT_INDEX_PAGE_DATA_SIZE) {
+		if (iref.ir_block->cb_handle_count) {
+			if (!xt_ind_copy_on_write(&iref))
+				goto failed_1;
+		}
+
+		idx_insert_leaf_item(ind, iref.ir_branch, &key_value, &result);
+		IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+		ASSERT_NS(result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE);
+		iref.ir_updated = TRUE;
+		xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+		goto done_ok;
+	}
+
+	/* Key does not fit. Must split the node.
+	 * Make sure we have a structural lock:
+	 */
+	if (!lock_structure) {
+		xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);
+		XT_INDEX_UNLOCK(ind, ot);
+		lock_structure = TRUE;
+		goto lock_and_retry;
+	}
+
+	memcpy(&ot->ot_ind_wbuf, iref.ir_branch, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_total_size);
+	idx_insert_leaf_item(ind, &ot->ot_ind_wbuf, &key_value, &result);
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+	ASSERT_NS(result.sr_item.i_total_size > XT_INDEX_PAGE_DATA_SIZE && result.sr_item.i_total_size <= XT_INDEX_PAGE_DATA_SIZE*2);
+
+	/* This is the number of potential writes. In other words, the total number
+	 * of blocks that may be accessed.
+	 *
+	 * Note that this assumes that if a block is read and written soon after, the block
+	 * will not be freed in-between (a safe assumption?)
+	 */
+	if (!xt_ind_reserve(ot, stack.s_top * 2 + 3, iref.ir_branch))
+		goto failed_1;
+
+	/* Key does not fit, must split... */
+	if (!idx_get_middle_branch_item(ot, ind, &ot->ot_ind_wbuf, &key_value, &result))
+		goto failed_1;
+
+	if (!idx_new_branch(ot, ind, &new_branch))
+		goto failed_1;
+
+	if (XT_NODE_ID(current) == XT_NODE_ID(new_branch)) {
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		goto failed_1;
+	}
+
+	/* Copy and write the rest of the data to the new node: */
+	new_size = result.sr_item.i_total_size - result.sr_item.i_item_offset - result.sr_item.i_item_size;
+	new_branch_ptr = (XTIdxBranchDPtr) &ot->ot_ind_wbuf.tb_data[XT_INDEX_PAGE_DATA_SIZE];
+	memmove(new_branch_ptr->tb_data, &ot->ot_ind_wbuf.tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size], new_size);
+
+	XT_SET_DISK_2(new_branch_ptr->tb_size_2, XT_MAKE_LEAF_SIZE(new_size));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(new_branch), (int) XT_GET_DISK_2(new_branch_ptr->tb_size_2));
+	if (!xt_ind_write(ot, ind, new_branch, offsetof(XTIdxBranchDRec, tb_data) + new_size, (xtWord1 *) new_branch_ptr))
+		goto failed_2;
+
+	/* Modify the first node: */
+	XT_SET_DISK_2(ot->ot_ind_wbuf.tb_size_2, XT_MAKE_LEAF_SIZE(result.sr_item.i_item_offset));
+	IDX_TRACE("%d-> %x\n", (int) XT_NODE_ID(current), (int) XT_GET_DISK_2(ot->ot_ind_wbuf.tb_size_2));
+
+	if (iref.ir_block->cb_handle_count) {
+		if (!xt_ind_copy_on_write(&iref))
+			goto failed_2;
+	}
+	memcpy(iref.ir_branch, &ot->ot_ind_wbuf, offsetof(XTIdxBranchDRec, tb_data) + result.sr_item.i_item_offset);
+	iref.ir_updated = TRUE;
+	xt_ind_release(ot, ind, XT_UNLOCK_W_UPDATE, &iref);
+
+	/* Insert the new branch into the parent node, using the new middle key value: */
+	if (!idx_insert_node(ot, ind, &stack, &key_value, new_branch)) {
+		// Index may be inconsistent now...
+		idx_free_branch(ot, ind, new_branch);
+		goto failed;
+	}
+
+#ifdef XT_TRACK_INDEX_UPDATES
+	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
+#endif
+
+	done_ok:
+	XT_INDEX_UNLOCK(ind, ot);
+
+#ifdef DEBUG
+	//printf("INSERT OK\n");
+	//idx_check_index(ot, ind, TRUE);
+#endif
+	xt_ind_unreserve(ot);
+	return OK;
+
+	failed_2:
+	idx_free_branch(ot, ind, new_branch);
+
+	failed_1:
+	xt_ind_release(ot, ind, updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, &iref);
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+
+#ifdef DEBUG
+	//printf("INSERT FAILED\n");
+	//idx_check_index(ot, ind, TRUE);
+#endif
+	xt_ind_unreserve(ot);
+	return FAILED;
+}
+
+
+/* Remove the item on top of the stack from its node.
+ * This is done by going down the tree to find a replacement
+ * for the deleted item! Empty branches found on the way are freed.
+ */
+static xtBool idx_remove_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, IdxBranchStackPtr stack, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
+{
+	IdxStackItemPtr		delete_node;
+	XTIdxResultRec		result;
+	xtIndexNodeID		current;
+	xtBool				lazy_delete_cleanup_required = FALSE;
+	IdxStackItemPtr		current_top;
+
+	delete_node = idx_top(stack);
+	current = delete_node->i_branch;
+	result.sr_item = delete_node->i_pos;
+
+	/* Follow the branch after this item: */
+	idx_next_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+
+	/* Go down the left-hand side until we reach a leaf: */
+	while (XT_NODE_ID(current)) {
+		current = result.sr_branch;
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
+			return FAILED;
+		idx_first_branch_item(ot->ot_table, ind, iref->ir_branch, &result);
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, iref);
+		if (!idx_push(stack, current, &result.sr_item))
+			return FAILED;
+	}
+
+	ASSERT_NS(XT_NODE_ID(current));
+	ASSERT_NS(!result.sr_item.i_node_ref_size);
+
+	if (!xt_ind_reserve(ot, stack->s_top + 2, iref->ir_branch)) {
+		xt_ind_release(ot, ind, XT_UNLOCK_WRITE, iref);
+		return FAILED;
+	}
+	
+	/* This code removes lazy deleted items from the leaf,
+	 * before we promote an item to a leaf.
+	 * This is not essential, but prevents lazy deleted
+	 * items from being propagated up the tree.
+	 */
+	if (ind->mi_lazy_delete) {
+		if (iref->ir_block->cp_del_count) {
+			if (!idx_compact_leaf(ot, ind, iref, &result.sr_item))
+				return FAILED;
+		}
+	}
+
+	/* Crawl back up the stack trace, looking for a key
+	 * that can be used to replace the deleted key.
+	 *
+	 * Any empty nodes on the way up can be removed!
+	 */
+	if (result.sr_item.i_total_size > 0) {
+		/* There is a key in the leaf, extract it, and put it in the node: */
+		memcpy(key_value->sv_key, &iref->ir_branch->tb_data[result.sr_item.i_item_offset], result.sr_item.i_item_size);
+		/* This call also frees the iref.ir_branch page! */
+		if (!idx_remove_branch_item_right(ot, ind, current, iref, &result.sr_item))
+			return FAILED;
+		if (!idx_replace_node_key(ot, ind, delete_node, stack, result.sr_item.i_item_size, key_value->sv_key))
+			return FAILED;
+		goto done_ok;
+	}
+
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_W_UPDATE : XT_UNLOCK_WRITE, iref);
+
+	for (;;) {
+		/* The current node/leaf is empty, remove it: */
+		idx_free_branch(ot, ind, current);
+
+		current_top = idx_pop(stack);
+		current = current_top->i_branch;
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_LEAF, iref))
+			return FAILED;
+		
+		if (current_top == delete_node) {
+			/* All children have been removed. Delete the key and done: */
+			if (!idx_remove_branch_item_right(ot, ind, current, iref, &current_top->i_pos))
+				return FAILED;
+			goto done_ok;
+		}
+
+		if (current_top->i_pos.i_total_size > current_top->i_pos.i_node_ref_size) {
+			/* Save the key: */
+			memcpy(key_value->sv_key, &iref->ir_branch->tb_data[current_top->i_pos.i_item_offset], current_top->i_pos.i_item_size);
+			/* This function also frees the cache page: */
+			if (!idx_remove_branch_item_left(ot, ind, current, iref, &current_top->i_pos, &lazy_delete_cleanup_required))
+				return FAILED;
+			if (!idx_replace_node_key(ot, ind, delete_node, stack, current_top->i_pos.i_item_size, key_value->sv_key))
+				return FAILED;
+			/* If idx_remove_branch_item_left() asked for it, remove the lazy deleted items from this node: */
+			if (lazy_delete_cleanup_required) {
+				if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
+					return FAILED;
+				if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, iref, key_value))
+					return FAILED;
+			}
+			goto done_ok;
+		}
+		xt_ind_release(ot, ind, current_top->i_pos.i_node_ref_size ? XT_UNLOCK_READ : XT_UNLOCK_WRITE, iref);
+	}
+
+	done_ok:
+#ifdef XT_TRACK_INDEX_UPDATES
+	ASSERT_NS(ot->ot_ind_reserved >= ot->ot_ind_reads);
+#endif
+	return OK;
+}
+
+/* Remove every lazy deleted item (row ID -1) from the node 'current', one at a time.
+ * This function assumes we have a lock on the structure of the index.
+ */
+static xtBool idx_remove_lazy_deleted_item_in_node(XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID current, XTIndReferencePtr iref, XTIdxKeyValuePtr key_value)
+{
+	IdxBranchStackRec	stack;
+	XTIdxResultRec		result;
+
+	/* Now remove all lazy deleted items in this node.... */
+	idx_first_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);
+
+	for (;;) {
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id == (xtRowID) -1)
+				goto remove_item;
+			idx_next_branch_item(ot->ot_table, ind, (XTIdxBranchDPtr) iref->ir_block->cb_data, &result);
+		}
+		break;
+
+		remove_item:
+
+		idx_newstack(&stack);
+		if (!idx_push(&stack, current, &result.sr_item)) {
+			xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+			return FAILED;
+		}
+
+		if (!idx_remove_item_in_node(ot, ind, &stack, iref, key_value))
+			return FAILED;
+
+		/* Go back up to the node we are trying to
+		 * free of lazy deleted items.
+		 */
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, iref))
+			return FAILED;
+		/* Load the data again: */
+		idx_reload_item_fix(ind, iref->ir_branch, &result);
+	}
+
+	xt_ind_release(ot, ind, iref->ir_updated ? XT_UNLOCK_R_UPDATE : XT_UNLOCK_READ, iref);
+	return OK;
+}
+/* Delete the entry matching key_value from the index. Starts with a read lock on the index and retries with a write lock when a non-leaf node must be changed. */
+static xtBool idx_delete(XTOpenTablePtr ot, XTIndexPtr ind, XTIdxKeyValuePtr key_value)
+{
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+	xtBool				lock_structure = FALSE;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	/* The index appears to have no root: */
+	if (!XT_NODE_ID(ind->mi_root))
+		lock_structure = TRUE;
+
+	lock_and_retry:
+	idx_newstack(&stack);
+
+	if (lock_structure)
+		XT_INDEX_WRITE_LOCK(ind, ot);
+	else
+		XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
+		goto done_ok;
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_XLOCK_DEL_LEAF, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, key_value, &result);
+		if (!result.sr_item.i_node_ref_size) {
+			/* A leaf... */
+			if (result.sr_found) {
+				if (ind->mi_lazy_delete) {
+					/* If we have a W lock, then fetch decided that we
+					 * need to compact the page.
+					 * The decision is made by xt_idx_lazy_delete_on_leaf() 
+					 */
+					if (!iref.ir_xlock)
+						idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item);
+					else {
+						if (!iref.ir_block->cp_del_count) {
+							if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
+								goto failed;
+						}
+						else {
+							if (!idx_lazy_remove_leaf_item_right(ot, ind, &iref, &result.sr_item))
+								goto failed;
+						}
+					}
+				}
+				else {
+					if (!idx_remove_branch_item_right(ot, ind, current, &iref, &result.sr_item))
+						goto failed;
+				}
+			}
+			else
+				xt_ind_release(ot, ind, iref.ir_xlock ? XT_UNLOCK_WRITE : XT_UNLOCK_READ, &iref);
+			goto done_ok;
+		}
+		if (!idx_push(&stack, current, &result.sr_item)) {
+			xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+			goto failed;
+		}
+		if (result.sr_found)
+			/* If we have found the key in a node: */
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		current = result.sr_branch;
+	}
+
+	/* Must be a non-leaf!: */
+	ASSERT_NS(result.sr_item.i_node_ref_size);
+
+	if (ind->mi_lazy_delete) {
+		if (!idx_lazy_delete_on_node(ind, iref.ir_block, &result.sr_item)) {
+			/* We need to remove some items from this node: */
+
+			if (!lock_structure) {
+				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+				XT_INDEX_UNLOCK(ind, ot);
+				lock_structure = TRUE;
+				goto lock_and_retry;
+			}
+
+			idx_set_item_deleted(&iref, &result.sr_item);
+			if (!idx_remove_lazy_deleted_item_in_node(ot, ind, current, &iref, key_value))
+				goto failed;
+			goto done_ok;
+		}
+
+		if (!ot->ot_table->tab_dic.dic_no_lazy_delete) {
+			/* {LAZY-DEL-INDEX-ITEMS}
+			 * We just set item to deleted, this is a significant time
+			 * saver.
+			 * But this item can only be cleaned up when all
+			 * items on the node below are deleted.
+			 */
+			idx_lazy_delete_branch_item(ot, ind, &iref, &result.sr_item);
+			goto done_ok;
+		}
+	}
+
+	/* We will have to remove the key from a non-leaf node,
+	 * which means we are changing the structure of the index.
+	 * Make sure we have a structural lock:
+	 */
+	if (!lock_structure) {
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		XT_INDEX_UNLOCK(ind, ot);
+		lock_structure = TRUE;
+		goto lock_and_retry;
+	}
+
+	/* This is the item we will have to replace: */
+	if (!idx_remove_item_in_node(ot, ind, &stack, &iref, key_value))
+		goto failed;
+
+	done_ok:
+	XT_INDEX_UNLOCK(ind, ot);
+
+#ifdef DEBUG
+	//printf("DELETE OK\n");
+	//idx_check_index(ot, ind, TRUE);
+#endif
+	xt_ind_unreserve(ot);
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	xt_ind_unreserve(ot);
+	return FAILED;
+}
+
+xtPublic xtBool xt_idx_delete(XTOpenTablePtr ot, XTIndexPtr ind, xtRecordID rec_id, xtWord1 *rec_buf)
+{
+	XTIdxKeyValueRec	del_key;
+	xtWord1				del_key_buf[XT_INDEX_MAX_KEY_SIZE + XT_MAX_RECORD_REF_SIZE];
+
+	/* Build the key from the row and delete it; repeat the whole attempt
+	 * while idx_out_of_memory_failure() reports the failure as retryable. */
+	for (;;) {
+#ifdef XT_TRACK_INDEX_UPDATES
+		ot->ot_ind_changed = 0;
+#endif
+		del_key.sv_key = del_key_buf;
+		del_key.sv_flags = XT_SEARCH_WHOLE_KEY;
+		del_key.sv_row_id = 0;
+		del_key.sv_rec_id = rec_id;
+		del_key.sv_length = myxt_create_key_from_row(ind, del_key_buf, rec_buf, NULL);
+
+		if (idx_delete(ot, ind, &del_key))
+			return OK;
+		if (!idx_out_of_memory_failure(ot))
+			return FAILED;
+	}
+}
+/* Set the row ID of the index entry matching the key built from rec_buf and rec_id to row_id. Returns OK even if no such entry is found. */
+xtPublic xtBool xt_idx_update_row_id(XTOpenTablePtr ot, XTIndexPtr ind, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_buf)
+{
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+	XTIdxKeyValueRec	key_value;
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE + XT_MAX_RECORD_REF_SIZE];
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+#ifdef CHECK_AND_PRINT
+	idx_check_index(ot, ind, TRUE);
+#endif
+	retry_after_oom:
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
+	key_value.sv_rec_id = rec_id;
+	key_value.sv_row_id = 0;
+	key_value.sv_key = key_buf;
+	key_value.sv_length = myxt_create_key_from_row(ind, key_buf, rec_buf, NULL);
+
+	/* NOTE: Only a read lock is required for this!!
+	 *
+	 * 09.05.2008 - This has changed because the dirty list now
+	 * hangs on the index. And the dirty list may be updated
+	 * by any change of the index.
+	 * However, the advantage is that I should be able to read
+	 * lock in the first phase of the flush.
+	 *
+	 * 18.02.2009 - This has changed again.
+	 * I am now using a read lock, because this update does not
+	 * require a structural change. In fact, it does not even
+	 * need a WRITE LOCK on the page affected, because there
+	 * is only ONE thread that can do this (the sweeper).
+	 *
+	 * This has the advantage that the sweeper (which uses this
+	 * function) causes fewer conflicts.
+	 *
+	 * However, it does mean that the dirty list must be otherwise
+	 * protected (which it now is, by a spin lock - mi_dirty_lock).
+	 *
+	 * It also has the disadvantage that I am going to have to
+	 * take an xlock in the first phase of the flush.
+	 */
+	XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
+		goto done_ok;
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
+		if (result.sr_found || !result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		current = result.sr_branch;
+	}
+
+	if (result.sr_found) {
+		/* TODO: Check that concurrent reads can handle this!
+		 * assuming the write is not atomic.
+		 */
+		idx_set_item_row_id(&iref, &result.sr_item, row_id);
+		xt_ind_release(ot, ind, XT_UNLOCK_R_UPDATE, &iref);
+	}
+	else
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+
+	done_ok:
+	XT_INDEX_UNLOCK(ind, ot);
+
+#ifdef DEBUG
+	//idx_check_index(ot, ind, TRUE);
+	//idx_check_on_key(ot);
+#endif
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+	return FAILED;
+}
+
+xtPublic void xt_idx_prep_key(XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, int flags, xtWord1 *in_key_buf, size_t in_key_length)
+{
+	XTIdxKeyValuePtr	kv = &search_key->sk_key_value;	/* The key value being set up; its key lives in sk_key_buf. */
+	kv->sv_key = search_key->sk_key_buf;
+	kv->sv_length = myxt_create_key_from_key(ind, search_key->sk_key_buf, in_key_buf, in_key_length);
+	kv->sv_flags = flags;
+	kv->sv_rec_id = kv->sv_row_id = 0;
+	search_key->sk_on_key = FALSE;
+}
+
+xtPublic xtBool xt_idx_research(XTOpenTablePtr ot, XTIndexPtr ind)
+{
+	XTIdxSearchKeyRec	research_key;
+	XTIdxKeyValuePtr	kv = &research_key.sk_key_value;
+	/* Rebuild a whole-key search key from the item at the saved index position, then search for it again: */
+	research_key.sk_on_key = FALSE;
+	kv->sv_flags = XT_SEARCH_WHOLE_KEY;
+	kv->sv_key = research_key.sk_key_buf;
+	xt_ind_lock_handle(ot->ot_ind_rhandle);
+	kv->sv_length = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+	memcpy(research_key.sk_key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset], kv->sv_length);
+	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset + ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE], &kv->sv_rec_id, &kv->sv_row_id);
+	xt_ind_unlock_handle(ot->ot_ind_rhandle);
+	return xt_idx_search(ot, ind, &research_key);
+}
+
+/*
+ * Search for a given key and position the current pointer on the first
+ * key in the list of duplicates. If the key is not found the current
+ * pointer is placed at the first position after the key.
+ */
+xtPublic xtBool xt_idx_search(XTOpenTablePtr ot, XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
+{
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+#ifdef DEBUG
+	//idx_check_index(ot, ind, TRUE);
+#endif
+
+	/* Calling from recovery, this is not the case.
+	 * But the index read does not require a transaction!
+	 * Only insert requires this to check for duplicates.
+	if (!ot->ot_thread->st_xact_data) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
+		return FAILED;
+	}
+	*/
+
+	retry_after_oom:
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	idx_newstack(&stack);
+
+	ot->ot_curr_rec_id = 0;
+	ot->ot_curr_row_id = 0;
+
+	XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))
+		goto done_ok;
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
+		if (result.sr_found)
+			/* If we have found the key in a node: */
+			search_key->sk_on_key = TRUE;
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		if (!idx_push(&stack, current, &result.sr_item))
+			goto failed;
+		current = result.sr_branch;
+	}
+
+	if (ind->mi_lazy_delete) {
+		ignore_lazy_deleted_items:
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id != (xtRowID) -1) {
+				idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+				break;
+			}
+			idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+		}
+	}
+
+	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
+		IdxStackItemPtr node;
+
+		/* We are at the end of a leaf node.
+		 * Go up the stack to find the start position of the next key.
+		 * If we find none, then we are the end of the index.
+		 */
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		while ((node = idx_pop(&stack))) {
+			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
+					goto failed;
+				xt_get_res_record_ref(&iref.ir_branch->tb_data[node->i_pos.i_item_offset + node->i_pos.i_item_size - XT_RECORD_REF_SIZE], &result);
+
+				if (ind->mi_lazy_delete) {
+					result.sr_item = node->i_pos;
+					if (result.sr_row_id == (xtRowID) -1) {
+						/* If this node pointer is lazy deleted, then
+						 * go down the next branch...
+						 */
+						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+						/* Go down to the bottom: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						goto ignore_lazy_deleted_items;
+					}
+					idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+				}
+
+				ot->ot_curr_rec_id = result.sr_rec_id;
+				ot->ot_curr_row_id = result.sr_row_id;
+				ot->ot_ind_state = node->i_pos;
+
+				/* Convert the pointer to a handle which can be used in later operations: */
+				ASSERT_NS(!ot->ot_ind_rhandle);
+				if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
+					goto failed;
+				/* Keep the node for next operations: */
+				/*
+				branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+				memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
+				xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+				*/
+				break;
+			}
+		}
+	}
+	else {
+		ot->ot_curr_rec_id = result.sr_rec_id;
+		ot->ot_curr_row_id = result.sr_row_id;
+		ot->ot_ind_state = result.sr_item;
+
+		/* Convert the pointer to a handle which can be used in later operations: */
+		ASSERT_NS(!ot->ot_ind_rhandle);
+		if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
+			goto failed;
+		/* Keep the node for next operations: */
+		/*
+		branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+		memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		*/
+	}
+
+	done_ok:
+	XT_INDEX_UNLOCK(ind, ot);
+
+#ifdef DEBUG
+	//idx_check_index(ot, ind, TRUE);
+	//idx_check_on_key(ot);
+#endif
+	ASSERT_NS(iref.ir_xlock == 2);
+	ASSERT_NS(iref.ir_updated == 2);
+	if (ind->mi_key_corrupted) {
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+	ASSERT_NS(iref.ir_xlock == 2);
+	ASSERT_NS(iref.ir_updated == 2);
+	return FAILED;
+}
+
+xtPublic xtBool xt_idx_search_prev(XTOpenTablePtr ot, XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
+{	/* Position ot on the entry immediately left of where search_key falls in ind (start of a backward scan). ot_curr_rec_id/ot_curr_row_id stay 0 if no such entry exists. Returns OK or FAILED. */
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	XTIdxResultRec		result;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	if (ot->ot_ind_rhandle) {	/* release a handle that is still held on entry */
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+#ifdef DEBUG
+	//idx_check_index(ot, ind, TRUE);
+#endif
+
+	/* see the comment above in xt_idx_search */
+	/*
+	if (!ot->ot_thread->st_xact_data) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
+		return FAILED;
+	}
+	*/
+
+	retry_after_oom:	/* re-entered from 'failed' when idx_out_of_memory_failure() returns true */
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	idx_newstack(&stack);
+
+	ot->ot_curr_rec_id = 0;
+	ot->ot_curr_row_id = 0;
+
+	XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root)))	/* root node ID is 0: nothing to search */
+		goto done_ok;
+
+	while (XT_NODE_ID(current)) {	/* descend to a leaf, pushing each node and the position taken onto the stack */
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &search_key->sk_key_value, &result);
+		if (result.sr_found)
+			/* If we have found the key in a node: */
+			search_key->sk_on_key = TRUE;
+		if (!result.sr_item.i_node_ref_size)
+			break;
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		if (!idx_push(&stack, current, &result.sr_item))
+			goto failed;
+		current = result.sr_branch;
+	}
+
+	if (result.sr_item.i_item_offset == 0) {
+		IdxStackItemPtr node;
+
+		search_up_stack:
+		/* We are at the start of a leaf node.
+		 * Go up the stack to find the position of the previous key.
+		 * If we find none, then we are at the start of the index.
+		 */
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		while ((node = idx_pop(&stack))) {
+			if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {	/* presumably: an item exists to the left of this position in the parent - TODO confirm */
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
+					goto failed;
+				result.sr_item = node->i_pos;
+				ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+				if (ind->mi_lazy_delete) {
+					if (result.sr_row_id == (xtRowID) -1) {	/* row ID -1: this item is lazy deleted */
+						/* Go down to the bottom, in order to scan the leaf backwards: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						/* If the leaf is empty we have to go up the stack again... */
+						if (result.sr_item.i_total_size == 0)
+							goto search_up_stack;
+
+						goto scan_back_in_leaf;
+					}
+				}
+
+				goto record_found;
+			}
+		}
+		goto done_ok;	/* stack exhausted: no previous entry, current rec/row IDs remain 0 */
+	}
+
+	/* We must just step once to the left in this leaf node... */
+	ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+	if (ind->mi_lazy_delete) {
+		scan_back_in_leaf:
+		while (result.sr_row_id == (xtRowID) -1) {	/* skip lazy-deleted items, moving left */
+			if (result.sr_item.i_item_offset == 0)
+				goto search_up_stack;
+			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+		}
+		idx_still_on_key(ind, search_key, iref.ir_branch, &result.sr_item);
+	}
+
+	record_found:
+	ot->ot_curr_rec_id = result.sr_rec_id;
+	ot->ot_curr_row_id = result.sr_row_id;
+	ot->ot_ind_state = result.sr_item;
+
+	/* Convert to handle for later operations: */
+	ASSERT_NS(!ot->ot_ind_rhandle);
+	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))
+		goto failed;
+	/* Keep a copy of the node for previous operations... */
+	/*
+	u_int branch_size;
+
+	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+	*/
+
+	done_ok:
+	XT_INDEX_UNLOCK(ind, ot);
+
+#ifdef DEBUG
+	//idx_check_index(ot, ind, TRUE);
+	//idx_check_on_key(ot);
+#endif
+	if (ind->mi_key_corrupted) {	/* a corrupted index is reported as FAILED even though the scan completed */
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+	return FAILED;
+}
+
+/*
+ * Fill rec_buf from the key of the current index item (ot->ot_ind_state),
+ * reading the branch through ot->ot_ind_rhandle while the handle is locked.
+ */
+xtPublic xtBool xt_idx_read(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *rec_buf)
+{
+	xtWord1	*key_start;
+#ifdef DEBUG
+	//idx_check_on_key(ot);
+#endif
+	xt_ind_lock_handle(ot->ot_ind_rhandle);
+	key_start = &ot->ot_ind_rhandle->ih_branch->tb_data[ot->ot_ind_state.i_item_offset];
+	myxt_create_row_from_key(ot, ind, key_start, ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE, rec_buf);
+	xt_ind_unlock_handle(ot->ot_ind_rhandle);
+	return OK;
+}
+
+xtPublic xtBool xt_idx_next(register XTOpenTablePtr ot, register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
+{	/* Step ot forward to the entry after the current position (ot->ot_ind_state). Requires ot->ot_ind_rhandle (asserted). At the end of the index ot_curr_rec_id/ot_curr_row_id are set to 0. Returns OK or FAILED. */
+	XTIdxKeyValueRec	key_value;
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
+	XTIdxResultRec		result;
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	ASSERT_NS(ot->ot_ind_rhandle);
+	xt_ind_lock_handle(ot->ot_ind_rhandle);
+	result.sr_item = ot->ot_ind_state;
+	if (!result.sr_item.i_node_ref_size && 
+		result.sr_item.i_item_offset < result.sr_item.i_total_size && 
+		ot->ot_ind_rhandle->ih_cache_reference) {
+		XTIdxItemRec prev_item;	/* fast path: step inside the leaf reachable through the handle; remembers the last item visited */
+
+		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
+		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+
+		prev_item = result.sr_item;
+		idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+
+		if (ind->mi_lazy_delete) {
+			while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+				if (result.sr_row_id != (xtRowID) -1)	/* row ID -1 marks a lazy-deleted item, which is skipped */
+					break;
+				prev_item = result.sr_item;
+				idx_next_branch_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+			}
+		}
+
+		if (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			/* Still on key? */
+			idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);
+			xt_ind_unlock_handle(ot->ot_ind_rhandle);
+			goto checked_on_key;
+		}
+
+		result.sr_item = prev_item;	/* ran off the end of the leaf: re-search from the root using the last item visited */
+	}
+
+	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
+	xt_get_record_ref(&ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &key_value.sv_rec_id, &key_value.sv_row_id);
+	key_value.sv_key = key_buf;
+	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);	/* private copy of the key: the handle is released on the next line */
+	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
+	ot->ot_ind_rhandle = NULL;
+
+	retry_after_oom:	/* re-entered from 'failed' when idx_out_of_memory_failure() returns true */
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	idx_newstack(&stack);
+
+	XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {	/* root node ID is 0: nothing to search */
+		XT_INDEX_UNLOCK(ind, ot);
+		if (ind->mi_key_corrupted) {
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+			return FAILED;
+		}
+		return OK;
+	}
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
+		if (result.sr_item.i_node_ref_size) {
+			if (result.sr_found) {
+				/* If we have found the key in a node: */
+				idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+				/* Go down to the bottom: */
+				while (XT_NODE_ID(current)) {
+					xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+					if (!idx_push(&stack, current, &result.sr_item))
+						goto failed;
+					current = result.sr_branch;
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+						goto failed;
+					idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+					if (!result.sr_item.i_node_ref_size)
+						break;
+				}
+
+				/* Stop descending here; an exhausted (or empty) leaf is handled by the end-of-leaf check below. */
+				break;
+			}
+		}
+		else {
+			/* We have reached the leaf. */
+			if (result.sr_found)
+				/* If we have found the key in a leaf: */
+				idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+			/* If we did not find the key (although we should have). Our
+			 * position is automatically the next one.
+			 */
+			break;
+		}
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		if (!idx_push(&stack, current, &result.sr_item))
+			goto failed;
+		current = result.sr_branch;
+	}
+
+	if (ind->mi_lazy_delete) {
+		ignore_lazy_deleted_items:
+		while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {
+			if (result.sr_row_id != (xtRowID) -1)
+				break;
+			idx_next_branch_item(NULL, ind, iref.ir_branch, &result);	/* NOTE(review): NULL is passed as the table here; every other call in this function passes ot->ot_table - confirm the argument is unused */
+		}
+	}
+
+	/* Check the current position in a leaf: */
+	if (result.sr_item.i_item_offset == result.sr_item.i_total_size) {
+		/* At the end: */
+		IdxStackItemPtr node;
+
+		/* We are at the end of a leaf node.
+		 * Go up the stack to find the start position of the next key.
+		 * If we find none, then we are at the end of the index.
+		 */
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		while ((node = idx_pop(&stack))) {
+			if (node->i_pos.i_item_offset < node->i_pos.i_total_size) {
+				if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
+					goto failed;
+				result.sr_item = node->i_pos;
+				xt_get_res_record_ref(&iref.ir_branch->tb_data[result.sr_item.i_item_offset + result.sr_item.i_item_size - XT_RECORD_REF_SIZE], &result);
+
+				if (ind->mi_lazy_delete) {
+					if (result.sr_row_id == (xtRowID) -1) {
+						/* If this node pointer is lazy deleted, then
+						 * go down the next branch...
+						 */
+						idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+						/* Go down to the bottom: */
+						current = node->i_branch;
+						while (XT_NODE_ID(current)) {
+							xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+							if (!idx_push(&stack, current, &result.sr_item))
+								goto failed;
+							current = result.sr_branch;
+							if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+								goto failed;
+							idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+							if (!result.sr_item.i_node_ref_size)
+								break;
+						}
+
+						/* And scan the leaf... */
+						goto ignore_lazy_deleted_items;
+					}
+				}
+
+				goto unlock_check_on_key;
+			}
+		}
+
+		/* No more keys: */
+		if (search_key)
+			search_key->sk_on_key = FALSE;
+		ot->ot_curr_rec_id = 0;
+		ot->ot_curr_row_id = 0;
+		XT_INDEX_UNLOCK(ind, ot);
+		if (ind->mi_key_corrupted) {
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+			return FAILED;
+		}
+		return OK;
+	}
+
+	unlock_check_on_key:
+
+	ASSERT_NS(!ot->ot_ind_rhandle);
+	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))	/* turn the fetched branch reference into a handle kept for the next operation */
+		goto failed;
+	/*
+	u_int branch_size;
+
+	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+	*/
+
+	XT_INDEX_UNLOCK(ind, ot);
+
+	/* Still on key? */
+	if (search_key && search_key->sk_on_key) {
+		/* GOTCHA: As a short-cut I was using a length compare
+		 * and a memcmp() here to check whether we are still on
+		 * the original search key.
+		 * This does not work because it does not take into account
+		 * trailing spaces (which are ignored in comparison).
+		 * So lengths can be different, but values still equal.
+		 * 
+		 * NOTE: We have to use the original search flags for
+		 * this compare.
+		 */
+		xt_ind_lock_handle(ot->ot_ind_rhandle);
+		search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
+			search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
+		xt_ind_unlock_handle(ot->ot_ind_rhandle);
+	}
+
+	checked_on_key:
+	ot->ot_curr_rec_id = result.sr_rec_id;
+	ot->ot_curr_row_id = result.sr_row_id;
+	ot->ot_ind_state = result.sr_item;
+
+	if (ind->mi_key_corrupted) {	/* a corrupted index is reported as FAILED even though a position was found */
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+	return FAILED;
+}
+
+xtPublic xtBool xt_idx_prev(register XTOpenTablePtr ot, register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key)
+{	/* Step ot backward to the entry before the current position (ot->ot_ind_state). Requires ot->ot_ind_rhandle (asserted). At the start of the index ot_curr_rec_id/ot_curr_row_id are set to 0. Returns OK or FAILED. */
+	XTIdxKeyValueRec	key_value;
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
+	XTIdxResultRec		result;
+	IdxBranchStackRec	stack;
+	xtIndexNodeID		current;
+	XTIndReferenceRec	iref;
+	IdxStackItemPtr		node;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	ASSERT_NS(ot->ot_ind_rhandle);
+	xt_ind_lock_handle(ot->ot_ind_rhandle);
+	result.sr_item = ot->ot_ind_state;
+	if (!result.sr_item.i_node_ref_size && result.sr_item.i_item_offset > 0) {	/* fast path: current item is in a leaf and is not its first item */
+		key_value.sv_key = &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset];
+		key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+
+		ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+
+		if (ind->mi_lazy_delete) {
+			while (result.sr_row_id == (xtRowID) -1) {	/* skip lazy-deleted items, moving left */
+				if (result.sr_item.i_item_offset == 0)
+					goto research;
+				ind->mi_prev_item(ot->ot_table, ind, ot->ot_ind_rhandle->ih_branch, &result);
+			}
+		}
+
+		idx_still_on_key(ind, search_key, ot->ot_ind_rhandle->ih_branch, &result.sr_item);
+
+		xt_ind_unlock_handle(ot->ot_ind_rhandle);
+		goto checked_on_key;
+	}
+
+	research:	/* NOTE(review): when reached via 'goto research', result.sr_item is the lazy-deleted item at offset 0, so the key copied below is that item's while sv_rec_id is the current record's - confirm this is intended */
+	key_value.sv_flags = XT_SEARCH_WHOLE_KEY;
+	key_value.sv_rec_id = ot->ot_curr_rec_id;
+	key_value.sv_row_id = 0;
+	key_value.sv_key = key_buf;
+	key_value.sv_length = result.sr_item.i_item_size - XT_RECORD_REF_SIZE;
+	memcpy(key_buf, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset], key_value.sv_length);	/* private copy of the key: the handle is released on the next line */
+	xt_ind_release_handle(ot->ot_ind_rhandle, TRUE, ot->ot_thread);
+	ot->ot_ind_rhandle = NULL;
+
+	retry_after_oom:	/* re-entered from 'failed' when idx_out_of_memory_failure() returns true */
+#ifdef XT_TRACK_INDEX_UPDATES
+	ot->ot_ind_changed = 0;
+#endif
+	idx_newstack(&stack);
+
+	XT_INDEX_READ_LOCK(ind, ot);
+
+	if (!(XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {	/* root node ID is 0: nothing to search */
+		XT_INDEX_UNLOCK(ind, ot);
+		if (ind->mi_key_corrupted) {
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+			return FAILED;
+		}
+		return OK;
+	}
+
+	while (XT_NODE_ID(current)) {
+		if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+			goto failed;
+		ind->mi_scan_branch(ot->ot_table, ind, iref.ir_branch, &key_value, &result);
+		if (result.sr_item.i_node_ref_size) {
+			if (result.sr_found) {
+				/* If we have found the key in a node: */
+
+				search_down_stack:
+				/* Go down to the bottom: */
+				while (XT_NODE_ID(current)) {
+					xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+					if (!idx_push(&stack, current, &result.sr_item))
+						goto failed;
+					current = result.sr_branch;
+					if (!xt_ind_fetch(ot, ind, current, XT_LOCK_READ, &iref))
+						goto failed;
+					ind->mi_last_item(ot->ot_table, ind, iref.ir_branch, &result);
+					if (!result.sr_item.i_node_ref_size)
+						break;
+				}
+
+				/* If the leaf is empty we have to go up the stack again... */
+				if (result.sr_item.i_total_size == 0)
+					break;
+
+				if (ind->mi_lazy_delete) {
+					while (result.sr_row_id == (xtRowID) -1) {
+						if (result.sr_item.i_item_offset == 0)
+							goto search_up_stack;
+						ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+					}
+				}
+
+				goto unlock_check_on_key;
+			}
+		}
+		else {
+			/* We have reached the leaf.
+			 * Whether we found the key or not, we have
+			 * to move one to the left.
+			 */
+			if (result.sr_item.i_item_offset == 0)
+				break;
+			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+			if (ind->mi_lazy_delete) {
+				while (result.sr_row_id == (xtRowID) -1) {
+					if (result.sr_item.i_item_offset == 0)
+						goto search_up_stack;
+					ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+				}
+			}
+
+			goto unlock_check_on_key;
+		}
+		xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+		if (!idx_push(&stack, current, &result.sr_item))
+			goto failed;
+		current = result.sr_branch;
+	}
+
+	search_up_stack:
+	/* We are at the start of a leaf node.
+	 * Go up the stack to find the position of the previous key.
+	 * If we find none, then we are at the start of the index.
+	 */
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+	while ((node = idx_pop(&stack))) {
+		if (node->i_pos.i_item_offset > node->i_pos.i_node_ref_size) {	/* presumably: an item exists to the left of this position in the parent - TODO confirm */
+			if (!xt_ind_fetch(ot, ind, node->i_branch, XT_LOCK_READ, &iref))
+				goto failed;
+			result.sr_item = node->i_pos;
+			ind->mi_prev_item(ot->ot_table, ind, iref.ir_branch, &result);
+
+			if (ind->mi_lazy_delete) {
+				if (result.sr_row_id == (xtRowID) -1) {	/* lazy-deleted node item: descend its branch and scan that leaf backwards */
+					current = node->i_branch;
+					goto search_down_stack;
+				}
+			}
+
+			goto unlock_check_on_key;
+		}
+	}
+
+	/* No more keys: */
+	if (search_key)
+		search_key->sk_on_key = FALSE;
+	ot->ot_curr_rec_id = 0;
+	ot->ot_curr_row_id = 0;
+
+	XT_INDEX_UNLOCK(ind, ot);
+	if (ind->mi_key_corrupted) {
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	return OK;
+
+	unlock_check_on_key:
+	ASSERT_NS(!ot->ot_ind_rhandle);
+	if (!(ot->ot_ind_rhandle = xt_ind_get_handle(ot, ind, &iref)))	/* turn the fetched branch reference into a handle kept for the next operation */
+		goto failed;
+	/*
+	u_int branch_size;
+
+	branch_size = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(iref.ir_branch->tb_size_2));
+	memcpy(&ot->ot_ind_rbuf, iref.ir_branch, branch_size);
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);
+	*/
+
+	XT_INDEX_UNLOCK(ind, ot);
+
+	/* Still on key? */
+	if (search_key && search_key->sk_on_key) {	/* re-compare with the original search flags, as xt_idx_next does */
+		xt_ind_lock_handle(ot->ot_ind_rhandle);
+		search_key->sk_on_key = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length,
+			search_key->sk_key_value.sv_key, &ot->ot_ind_rhandle->ih_branch->tb_data[result.sr_item.i_item_offset]) == 0;
+		xt_ind_unlock_handle(ot->ot_ind_rhandle);
+	}
+
+	checked_on_key:
+	ot->ot_curr_rec_id = result.sr_rec_id;
+	ot->ot_curr_row_id = result.sr_row_id;
+	ot->ot_ind_state = result.sr_item;
+	if (ind->mi_key_corrupted) {	/* a corrupted index is reported as FAILED even though a position was found */
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	return OK;
+
+	failed:
+	XT_INDEX_UNLOCK(ind, ot);
+	if (idx_out_of_memory_failure(ot))
+		goto retry_after_oom;
+	return FAILED;
+}
+
+/* TRUE when search_key compared with the key built from row 'buf' satisfies 'mode' (MATCH: == 0, NEXT: <= 0, PREV: >= 0). */
+xtPublic xtBool xt_idx_match_search(register XTOpenTablePtr XT_UNUSED(ot), register XTIndexPtr ind, register XTIdxSearchKeyPtr search_key, xtWord1 *buf, int mode)
+{
+	xtWord1	row_key[XT_INDEX_MAX_KEY_SIZE];
+	int		cmp;
+
+	myxt_create_key_from_row(ind, row_key, (xtWord1 *) buf, NULL);
+	cmp = myxt_compare_key(ind, search_key->sk_key_value.sv_flags, search_key->sk_key_value.sv_length, search_key->sk_key_value.sv_key, row_key);
+
+	if (mode == XT_S_MODE_MATCH)
+		return cmp == 0;
+	if (mode == XT_S_MODE_NEXT)
+		return cmp <= 0;
+	if (mode == XT_S_MODE_PREV)
+		return cmp >= 0;
+	/* Any other mode never matches. */
+	return FALSE;
+}
+
+static void idx_set_index_selectivity(XTOpenTablePtr ot, XTIndexPtr ind, XTThreadPtr thread)
+{	/* Estimate per-segment selectivity of ind by sampling up to MAX_RECORDS entries forward from the first key and backward from the last; on any scan failure the table's indexes are disabled as corrupted. */
+	static const xtRecordID MAX_RECORDS = 100;
+
+	XTIdxSearchKeyRec	search_key;
+	XTIndexSegPtr		key_seg;
+	u_int				select_count[2] = {0, 0};
+	xtWord1				key_buf[XT_INDEX_MAX_KEY_SIZE];
+	u_int				key_len;
+	xtWord1				*next_key_buf;
+	u_int				next_key_len;
+	u_int				curr_len;
+	u_int				diff;
+	u_int				j, i;
+	/* These 2 vars stop the backward pass when it reaches entries already sampled by the forward pass (fewer than 2 * MAX_RECORDS entries): */
+	xtRecordID			last_rec = 0;		/* last record accounted in this iteration */
+	xtRecordID			last_iter_rec = 0;	/* last record accounted in the previous iteration */
+
+	xtBool	(* xt_idx_iterator[2])(
+		register struct XTOpenTable *ot, register struct XTIndex *ind, register XTIdxSearchKeyPtr search_key) = {
+
+		xt_idx_next,
+		xt_idx_prev
+	};
+
+	xtBool	(* xt_idx_begin[2])(
+		struct XTOpenTable *ot, struct XTIndex *ind, register XTIdxSearchKeyPtr search_key) = {
+	
+		xt_idx_search,
+		xt_idx_search_prev
+	};
+
+	ind->mi_select_total = 0;
+	key_seg = ind->mi_seg;
+	for (i=0; i < ind->mi_seg_count; key_seg++, i++) {	/* reset every segment to the minimum before counting */
+		key_seg->is_selectivity = 1;
+		key_seg->is_recs_in_range = 1;
+	}
+
+	for (j=0; j < 2; j++) {	/* j == 0: forward pass from the first key, j == 1: backward pass from after the last key */
+		xt_idx_prep_key(ind, &search_key, j == 0 ? XT_SEARCH_FIRST_FLAG : XT_SEARCH_AFTER_LAST_FLAG, NULL, 0);
+		if (!(xt_idx_begin[j])(ot, ind, &search_key))
+			goto failed;
+
+		/* Initialize the buffer with the first valid index entry: */
+		while (!select_count[j] && ot->ot_curr_rec_id != last_iter_rec) {
+			if (ot->ot_curr_row_id) {
+				select_count[j]++;
+				last_rec = ot->ot_curr_rec_id;
+
+				key_len = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+				xt_ind_lock_handle(ot->ot_ind_rhandle);
+				memcpy(key_buf, ot->ot_ind_rhandle->ih_branch->tb_data + ot->ot_ind_state.i_item_offset, key_len);
+				xt_ind_unlock_handle(ot->ot_ind_rhandle);
+			}
+			if (!(xt_idx_iterator[j])(ot, ind, &search_key))
+				goto failed_1;
+		}
+
+		while (select_count[j] < MAX_RECORDS && ot->ot_curr_rec_id != last_iter_rec) {
+			/* Check if the index entry is committed: */
+			if (ot->ot_curr_row_id) {
+				xt_ind_lock_handle(ot->ot_ind_rhandle);
+				select_count[j]++;
+				last_rec = ot->ot_curr_rec_id;
+
+				next_key_len = ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+				next_key_buf = ot->ot_ind_rhandle->ih_branch->tb_data + ot->ot_ind_state.i_item_offset;
+			
+				curr_len = 0;
+				diff = FALSE;	/* 0 while no segment differs; otherwise the 1-based number of the first differing segment */
+				key_seg = ind->mi_seg;
+				for (i=0; i < ind->mi_seg_count; key_seg++, i++) {
+					curr_len += myxt_key_seg_length(key_seg, curr_len, key_buf);
+					if (!diff && myxt_compare_key(ind, 0, curr_len, key_buf, next_key_buf) != 0)	/* compare the key prefix up to and including segment i */
+						diff = i+1;
+					if (diff)	/* this segment and every later one count the new entry as a distinct value */
+						key_seg->is_selectivity++;
+				}
+
+				/* Store the key for the next comparison: */
+				key_len = next_key_len;
+				memcpy(key_buf, next_key_buf, key_len);
+				xt_ind_unlock_handle(ot->ot_ind_rhandle);
+			}
+
+			if (!(xt_idx_iterator[j])(ot, ind, &search_key))	/* NOTE(review): both iterators dereference ot->ot_ind_rhandle; confirm it cannot be NULL here when a previous step ran off the index while ot_curr_rec_id (0) != last_iter_rec */
+				goto failed_1;
+		}
+
+		last_iter_rec = last_rec;
+
+		if (ot->ot_ind_rhandle) {
+			xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+			ot->ot_ind_rhandle = NULL;
+		}
+	}
+
+	u_int select_total;
+
+	select_total = select_count[0] + select_count[1];
+	if (select_total) {
+		u_int recs;
+
+		ind->mi_select_total = select_total;
+		key_seg = ind->mi_seg;
+		for (i=0; i < ind->mi_seg_count; key_seg++, i++) {
+			recs = (u_int) ((double) select_total / (double) key_seg->is_selectivity + (double) 0.5);	/* sampled entries per distinct value, rounded to nearest */
+			key_seg->is_recs_in_range = recs ? recs : 1;
+		}
+	}
+	return;
+
+	failed_1:	/* an iterator failed: drop the handle (if any) before the common failure handling */
+	if (ot->ot_ind_rhandle) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+
+	failed:
+	xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
+	xt_log_and_clear_exception_ns();
+	return;
+}
+
+xtPublic void xt_ind_set_index_selectivity(XTOpenTablePtr ot, XTThreadPtr thread)
+{
+	/* Refresh the selectivity statistics of every index of ot's table while holding tab_ind_stat_lock. */
+	XTTableHPtr		table = ot->ot_table;
+	time_t			started = time(NULL);
+	u_int			key_no;
+
+	xt_lock_mutex_ns(&table->tab_ind_stat_lock);
+	if (started > table->tab_ind_stat_calc_time) {
+		/* With indexes disabled nothing is scanned, but the timestamp still advances. */
+		if (!table->tab_dic.dic_disable_index) {
+			for (key_no = 0; key_no < table->tab_dic.dic_key_count; key_no++)
+				idx_set_index_selectivity(ot, table->tab_dic.dic_keys[key_no], thread);
+		}
+		table->tab_ind_stat_calc_time = time(NULL);
+	}
+	xt_unlock_mutex_ns(&table->tab_ind_stat_lock);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Print a b-tree
+ */
+
+#ifdef TEST_CODE
+/* Debug check: the record reference stored at the cursor position in ot_ind_rbuf must equal ot_curr_rec_id. */
+static void idx_check_on_key(XTOpenTablePtr ot)
+{
+	xtRecordID	stored_rec_id;
+	xtRowID		stored_row_id;
+	u_int		ref_offset;
+	if (!ot->ot_curr_rec_id || ot->ot_ind_state.i_item_offset >= ot->ot_ind_state.i_total_size)
+		return;
+	ref_offset = ot->ot_ind_state.i_item_offset + ot->ot_ind_state.i_item_size - XT_RECORD_REF_SIZE;
+	xt_get_record_ref(&ot->ot_ind_rbuf.tb_data[ref_offset], &stored_rec_id, &stored_row_id);
+	ASSERT_NS(stored_rec_id == ot->ot_curr_rec_id);
+}
+#endif
+
+static void idx_check_space(int depth)
+{	/* Indent the dump: print ". " once per nesting level (nothing when depth <= 0). */
+	for (; depth > 0; depth--)
+		printf(". ");
+}
+
+static u_int idx_check_node(XTOpenTablePtr ot, XTIndexPtr ind, int depth, xtIndexNodeID node)
+{	/* Recursively dump the sub-tree rooted at node, indenting by depth; returns the number of blocks visited, or 0 when this node cannot be fetched. */
+	XTIdxResultRec		result;
+	u_int				block_count = 1;	/* counts this node itself */
+	XTIndReferenceRec	iref;
+
+#ifdef DEBUG
+	iref.ir_xlock = 2;
+	iref.ir_updated = 2;
+#endif
+	ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
+	if (!xt_ind_fetch(ot, ind, node, XT_LOCK_READ, &iref))
+		return 0;	/* fetch failure is not reported further: the caller just adds 0 blocks */
+
+	idx_first_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+	ASSERT_NS(result.sr_item.i_total_size + offsetof(XTIdxBranchDRec, tb_data) <= XT_INDEX_PAGE_SIZE);
+	if (result.sr_item.i_node_ref_size) {	/* non-leaf node: follow the child reference that precedes the first item */
+		idx_check_space(depth);
+		printf("%04d -->\n", (int) XT_NODE_ID(result.sr_branch));
+#ifdef TRACK_ACTIVITY
+		track_block_exists(result.sr_branch);
+#endif
+		block_count += idx_check_node(ot, ind, depth+1, result.sr_branch);
+	}
+
+	while (result.sr_item.i_item_offset < result.sr_item.i_total_size) {	/* walk every item in this node */
+#ifdef CHECK_PRINTS_RECORD_REFERENCES
+		idx_check_space(depth);
+		if (result.sr_item.i_item_size == 12) {
+			/* Assume this is a NOT-NULL INT!: */
+			xtWord4 val = XT_GET_DISK_4(&iref.ir_branch->tb_data[result.sr_item.i_item_offset]);
+			printf("(%6d) ", (int) val);
+		}
+		printf("rec=%d row=%d ", (int) result.sr_rec_id, (int) result.sr_row_id);
+		printf("\n");
+#endif
+		idx_next_branch_item(ot->ot_table, ind, iref.ir_branch, &result);
+		if (result.sr_item.i_node_ref_size) {	/* non-leaf node: follow the child reference for the position just advanced to */
+			idx_check_space(depth);
+			printf("%04d -->\n", (int) XT_NODE_ID(result.sr_branch));
+#ifdef TRACK_ACTIVITY
+			track_block_exists(result.sr_branch);
+#endif
+			block_count += idx_check_node(ot, ind, depth+1, result.sr_branch);
+		}
+	}
+
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &iref);	/* the read reference is held across the recursive calls above */
+	return block_count;
+}
+
+static u_int idx_check_index(XTOpenTablePtr ot, XTIndexPtr ind, xtBool with_lock)
+{	/* Print the tree of ind and its per-index free list; returns tree blocks plus free-list blocks. Takes the index write lock itself when with_lock is set. */
+	xtIndexNodeID			current;
+	u_int					block_count = 0;
+	u_int					i;
+
+	if (with_lock)
+		XT_INDEX_WRITE_LOCK(ind, ot);
+
+	printf("INDEX (%d) %04d ---------------------------------------\n", (int) ind->mi_index_no, (int) XT_NODE_ID(ind->mi_root));
+	if ((XT_NODE_ID(current) = XT_NODE_ID(ind->mi_root))) {	/* only walk the tree when a root node exists */
+#ifdef TRACK_ACTIVITY
+		track_block_exists(ind->mi_root);
+#endif
+		block_count = idx_check_node(ot, ind, 0, current);
+	}
+
+	if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {
+		printf("INDEX (%d) FREE ---------------------------------------", (int) ind->mi_index_no);
+		ASSERT_NS(ind->mi_free_list->fl_start == 0);
+		for (i=0; i<ind->mi_free_list->fl_free_count; i++) {
+			if ((i % 40) == 0)	/* 40 page IDs per output line */
+				printf("\n");
+			block_count++;
+#ifdef TRACK_ACTIVITY
+			track_block_exists(ind->mi_free_list->fl_page_id[i]);
+#endif
+			printf("%2d ", (int) XT_NODE_ID(ind->mi_free_list->fl_page_id[i]));
+		}
+		if ((i % 40) != 0)	/* terminate a partially filled last line */
+			printf("\n");
+	}
+
+	if (with_lock)
+		XT_INDEX_UNLOCK(ind, ot);
+	return block_count;
+
+}
+
+xtPublic void xt_check_indices(XTOpenTablePtr ot)
+{	/* Debug dump of all indexes of ot's table plus the in-memory and on-disk free lists, followed by a used/free block summary. Holds tab_ind_flush_lock for the whole dump. */
+	register XTTableHPtr	tab = ot->ot_table;
+	XTIndexPtr				*ind;
+	xtIndexNodeID			current;
+	XTIndFreeBlockRec		free_block;
+	u_int					ind_count, block_count = 0;
+	u_int					free_count = 0;
+	u_int					i, j;
+
+	xt_lock_mutex_ns(&tab->tab_ind_flush_lock);
+	printf("CHECK INDICES %s ==============================\n", tab->tab_name->ps_path);
+#ifdef TRACK_ACTIVITY
+	track_reset_missing();
+#endif
+
+	ind = tab->tab_dic.dic_keys;
+	for (u_int k=0; k<tab->tab_dic.dic_key_count; k++, ind++) {	/* dump each index under its own write lock and total the blocks */
+		ind_count = idx_check_index(ot, *ind, TRUE);
+		block_count += ind_count;
+	}
+
+	xt_lock_mutex_ns(&tab->tab_ind_lock);
+	printf("\nFREE: ---------------------------------------\n");
+	if (tab->tab_ind_free_list) {
+		XTIndFreeListPtr	ptr;
+
+		ptr = tab->tab_ind_free_list;
+		while (ptr) {	/* walk the chain of in-memory free lists */
+			printf("Memory List:");
+			i = 0;
+			for (j=ptr->fl_start; j<ptr->fl_free_count; j++, i++) {
+				if ((i % 40) == 0)	/* 40 page IDs per output line */
+					printf("\n");
+				free_count++;
+#ifdef TRACK_ACTIVITY
+				track_block_exists(ptr->fl_page_id[j]);
+#endif
+				printf("%2d ", (int) XT_NODE_ID(ptr->fl_page_id[j]));
+			}
+			if ((i % 40) != 0)
+				printf("\n");
+			ptr = ptr->fl_next_list;
+		}
+	}
+
+	current = tab->tab_ind_free;
+	if (XT_NODE_ID(current)) {
+		u_int k = 0;
+		printf("Disk List:");
+		while (XT_NODE_ID(current)) {	/* follow the on-disk chain through each block's if_next_block_8 link */
+			if ((k % 40) == 0)
+				printf("\n");
+			free_count++;
+#ifdef TRACK_ACTIVITY
+			track_block_exists(current);
+#endif
+			printf("%d ", (int) XT_NODE_ID(current));
+			if (!xt_ind_read_bytes(ot, NULL, current, sizeof(XTIndFreeBlockRec), (xtWord1 *) &free_block)) {
+				xt_log_and_clear_exception_ns();	/* read error: log it and stop following the chain */
+				break;
+			}
+			XT_NODE_ID(current) = (xtIndexNodeID) XT_GET_DISK_8(free_block.if_next_block_8);
+			k++;
+		}
+		if ((k % 40) != 0)
+			printf("\n");
+	}
+	printf("\n-----------------------------\n");
+	printf("used blocks %u + free blocks %u = %u\n", block_count, free_count, block_count + free_count);	/* the counters are u_int, so print them with %u */
+	printf("EOF = %" PRIu64 ", total blocks = %d\n", (xtWord8) xt_ind_node_to_offset(tab, tab->tab_ind_eof), (int) (XT_NODE_ID(tab->tab_ind_eof) - 1));	/* spaces around PRIu64: C++11 would otherwise lex it as a literal suffix */
+	printf("-----------------------------\n");
+	xt_unlock_mutex_ns(&tab->tab_ind_lock);
+#ifdef TRACK_ACTIVITY
+	track_dump_missing(tab->tab_ind_eof);
+	printf("===================================================\n");
+	track_dump_all((u_int) (XT_NODE_ID(tab->tab_ind_eof) - 1));
+#endif
+	printf("===================================================\n");
+	xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Load index
+ */
+
+/*
+ * Fetch the index node 'node' (read-locked) and recursively do the same
+ * for every item in it that carries a node reference, then release the
+ * node again. xt_throw() is called on 'self' if the fetch fails.
+ */
+static void idx_load_node(XTThreadPtr self, XTOpenTablePtr ot, XTIndexPtr ind, xtIndexNodeID node)
+{
+	XTIndReferenceRec	page_ref;
+	XTIdxResultRec		item;
+
+	ASSERT_NS(XT_NODE_ID(node) <= XT_NODE_ID(ot->ot_table->tab_ind_eof));
+	if (!xt_ind_fetch(ot, ind, node, XT_LOCK_READ, &page_ref))
+		xt_throw(self);
+
+	idx_first_branch_item(ot->ot_table, ind, page_ref.ir_branch, &item);
+	for (;;) {
+		/* Descend into the referenced node, if this item has a reference. */
+		if (item.sr_item.i_node_ref_size)
+			idx_load_node(self, ot, ind, item.sr_branch);
+
+		/* Stop once the item offset has reached the total size. */
+		if (item.sr_item.i_item_offset >= item.sr_item.i_total_size)
+			break;
+		idx_next_branch_item(ot->ot_table, ind, page_ref.ir_branch, &item);
+	}
+
+	xt_ind_release(ot, ind, XT_UNLOCK_READ, &page_ref);
+}
+
+/*
+ * Call idx_load_node() on the root of every index of the table that has
+ * a root node. The table's index flush lock is held for the duration and
+ * each index is write-locked while it is being walked.
+ */
+xtPublic void xt_load_indices(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTIndexPtr				*keys;
+	XTIndexPtr				index;
+	xtIndexNodeID			root;
+	u_int					idx_no;
+
+	xt_lock_mutex(self, &tab->tab_ind_flush_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_ind_flush_lock);
+
+	keys = tab->tab_dic.dic_keys;
+	for (idx_no = 0; idx_no < tab->tab_dic.dic_key_count; idx_no++) {
+		index = keys[idx_no];
+		XT_INDEX_WRITE_LOCK(index, ot);
+		XT_NODE_ID(root) = XT_NODE_ID(index->mi_root);
+		/* A root ID of zero means there is nothing to load for this index. */
+		if (XT_NODE_ID(root))
+			idx_load_node(self, ot, index, root);
+		XT_INDEX_UNLOCK(index, ot);
+	}
+
+	freer_(); // xt_unlock_mutex(&tab->tab_ind_flush_lock)
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Count the number of deleted entries in a node:
+ */
+
+/*
+ * {LAZY-DEL-INDEX-ITEMS}
+ *
+ * Count the items in an index block whose row ID is (xtRowID) -1 and
+ * store the result in block->cp_del_count. Intended to be used when a
+ * node is loaded.
+ *
+ * The count helps us decide if the node should be "packed".
+ */
+xtPublic void xt_ind_count_deleted_items(XTTableHPtr tab, XTIndexPtr ind, XTIndBlockPtr block)
+{
+	XTIdxBranchDPtr		branch = (XTIdxBranchDPtr) block->cb_data;
+	XTIdxResultRec		item;
+	xtWord2				branch_size;
+	u_int				block_len;
+	int					deleted = 0;
+
+	branch_size = XT_GET_DISK_2(branch->tb_size_2);
+	block_len = XT_GET_INDEX_BLOCK_LEN(branch_size);
+
+	/* An out-of-range length is possible when reading free pages:
+	 * leave the block untouched in that case.
+	 */
+	if (block_len < 2 || block_len > XT_INDEX_PAGE_SIZE)
+		return;
+
+	for (idx_first_branch_item(tab, ind, branch, &item);
+		 item.sr_item.i_item_offset < item.sr_item.i_total_size;
+		 idx_next_branch_item(tab, ind, branch, &item)) {
+		if (item.sr_row_id == (xtRowID) -1)
+			deleted++;
+	}
+	block->cp_del_count = deleted;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Index consistent flush
+ */
+
+/*
+ * Write every block collected in 'flush_list' to the index log 'il', in
+ * list order, and reset *flush_count to zero.
+ * Returns FAILED as soon as a block write fails (the count is then left
+ * unchanged), OK otherwise.
+ */
+static xtBool idx_flush_dirty_list(XTIndexLogPtr il, XTOpenTablePtr ot, u_int *flush_count, XTIndBlockPtr *flush_list)
+{
+	u_int	pos = 0;
+
+	while (pos < *flush_count) {
+		if (!il->il_write_block(ot, flush_list[pos]))
+			return FAILED;
+		pos++;
+	}
+	*flush_count = 0;
+	return OK;
+}
+
+/*
+ * Insert 'block' into 'flush_list', which is kept sorted by ascending
+ * node ID of cb_address. If the list already holds
+ * IND_FLUSH_BUFFER_SIZE entries it is first written out with
+ * idx_flush_dirty_list(). A block whose address is already in the list
+ * is not inserted again (this is asserted never to happen).
+ * Returns FAILED only if flushing the full list fails.
+ */
+static xtBool ind_add_to_dirty_list(XTIndexLogPtr il, XTOpenTablePtr ot, u_int *flush_count, XTIndBlockPtr *flush_list, XTIndBlockPtr block)
+{
+	register u_int		lo;
+	register u_int		hi;
+	register u_int		mid;
+
+	/* Make room first when the list is full. */
+	if (*flush_count == IND_FLUSH_BUFFER_SIZE) {
+		if (!idx_flush_dirty_list(il, ot, flush_count, flush_list))
+			return FAILED;
+	}
+
+	/* Binary search for the insert position in [lo, hi). */
+	lo = 0;
+	hi = *flush_count;
+	while (lo < hi) {
+		mid = (lo + hi - 1) >> 1;
+		if (XT_NODE_ID(block->cb_address) == XT_NODE_ID(flush_list[mid]->cb_address)) {
+			// Should not happen...
+			ASSERT_NS(FALSE);
+			return OK;
+		}
+		if (XT_NODE_ID(block->cb_address) < XT_NODE_ID(flush_list[mid]->cb_address))
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	/* Shift the tail up by one slot and store the block at 'lo'. */
+	memmove(&flush_list[lo + 1], &flush_list[lo], (*flush_count - lo) * sizeof(XTIndBlockPtr));
+	flush_list[lo] = block;
+	(*flush_count)++;
+	return OK;
+}
+
+/*
+ * Flush the dirty index pages and pending free-list changes of a table.
+ *
+ * The changes are first collected in an index log (dirty pages sorted by
+ * node ID, the free-list chain, and the index header). If anything was
+ * written, the transaction log and the index log are flushed (skipped
+ * for temporary tables), the index log is applied to the index file, and
+ * the flushed blocks are taken off the dirty lists and marked clean.
+ * Finally the table is marked as index-flushed in the checkpoint state.
+ *
+ * ot              - open table whose indices are flushed.
+ * bytes_flushed   - if not NULL, incremented by the number of dirty
+ *                   blocks times XT_INDEX_PAGE_SIZE.
+ * have_table_lock - passed on to xt_begin_checkpoint().
+ *
+ * Returns OK or FAILED. All locks taken by this function are released
+ * again on every failure path.
+ *
+ * NOTE(review): on the failure paths xt_end_checkpoint() is not called
+ * although xt_begin_checkpoint() succeeded - confirm that this is
+ * intended.
+ */
+xtPublic xtBool xt_flush_indices(XTOpenTablePtr ot, off_t *bytes_flushed, xtBool have_table_lock)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTIndexLogPtr			il;
+	XTIndexPtr				*indp;
+	XTIndexPtr				ind;
+	u_int					i, j;
+	xtBool					wrote_something = FALSE;
+	u_int					flush_count = 0;
+	XTIndBlockPtr			flush_list[IND_FLUSH_BUFFER_SIZE];
+	XTIndBlockPtr			block, fblock;
+	xtWord1					*data;
+	xtIndexNodeID			ind_free;
+	xtBool					something_to_free = FALSE;
+	xtIndexNodeID			last_address, next_address;
+	xtWord4					curr_flush_seq;
+	XTIndFreeListPtr		list_ptr;
+	u_int					dirty_blocks;
+	XTCheckPointTablePtr	cp_tab;
+	XTCheckPointStatePtr	cp = NULL;
+
+	if (!xt_begin_checkpoint(tab->tab_db, have_table_lock, ot->ot_thread))
+		return FAILED;
+
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	xt_lock_mutex_ns(&tab->tab_ind_flush_lock);
+
+	if (!tab->tab_db->db_indlogs.ilp_get_log(&il, ot->ot_thread))
+		goto failed_3;
+
+	if (!il->il_reset(ot))
+		goto failed_2;
+	if (!il->il_write_byte(ot, XT_DT_LOG_HEAD))
+		goto failed_2;
+	if (!il->il_write_word4(ot, tab->tab_id))
+		goto failed_2;
+	if (!il->il_write_word4(ot, 0))
+		goto failed_2;
+
+	/* Lock all: */
+	dirty_blocks = 0;
+	indp = tab->tab_dic.dic_keys;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+		ind = *indp;
+		XT_INDEX_WRITE_LOCK(ind, ot);
+		if (ind->mi_free_list && ind->mi_free_list->fl_free_count)
+			something_to_free = TRUE;
+		dirty_blocks += ind->mi_dirty_blocks;
+	}
+	// 128 dirty blocks == 2MB
+#ifdef TRACE_FLUSH
+	printf("FLUSH index   %d %s\n", (int) dirty_blocks * XT_INDEX_PAGE_SIZE, tab->tab_name->ps_path);
+	fflush(stdout);
+#endif
+	if (bytes_flushed)
+		*bytes_flushed += (dirty_blocks * XT_INDEX_PAGE_SIZE);
+
+	curr_flush_seq = tab->tab_ind_flush_seq;
+	tab->tab_ind_flush_seq++;
+
+	/* Write the dirty pages: */
+	indp = tab->tab_dic.dic_keys;
+	data = tab->tab_index_head->tp_data;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+		ind = *indp;
+		xt_spinlock_lock(&ind->mi_dirty_lock);
+		if ((block = ind->mi_dirty_list)) {
+			wrote_something = TRUE;
+			while (block) {
+				ASSERT_NS(block->cb_state == IDX_CAC_BLOCK_DIRTY);
+				ASSERT_NS((block->cp_flush_seq == curr_flush_seq) || xt_xn_is_before(block->cp_flush_seq, curr_flush_seq));
+				if (!ind_add_to_dirty_list(il, ot, &flush_count, flush_list, block)) {
+					/* Do not leave the dirty list spinlock held on the error path. */
+					xt_spinlock_unlock(&ind->mi_dirty_lock);
+					goto failed;
+				}
+				block = block->cb_dirty_next;
+			}
+		}
+		xt_spinlock_unlock(&ind->mi_dirty_lock);
+		/* Record the current root of this index in the index header. */
+		XT_SET_NODE_REF(tab, data, ind->mi_root);
+		data += XT_NODE_REF_SIZE;
+	}
+
+	/* Flush the dirty blocks: */
+	if (!idx_flush_dirty_list(il, ot, &flush_count, flush_list))
+		goto failed;
+
+	/* From here until the matching unlock below, errors must leave
+	 * via failed_ind_lock so that tab_ind_lock is released.
+	 */
+	xt_lock_mutex_ns(&tab->tab_ind_lock);
+
+	/* Write the free list: */
+	if (something_to_free) {
+		union {
+			xtWord1				buffer[XT_BLOCK_SIZE_FOR_DIRECT_IO];
+			XTIndFreeBlockRec	free_block;
+		} x;
+		memset(x.buffer, 0, sizeof(XTIndFreeBlockRec));
+
+		/* The old start of the free list: */
+		XT_NODE_ID(ind_free) = 0;
+		while ((list_ptr = tab->tab_ind_free_list)) {
+			if (list_ptr->fl_start < list_ptr->fl_free_count) {
+				ind_free = list_ptr->fl_page_id[list_ptr->fl_start];
+				break;
+			}
+			/* This in-memory list is used up: drop it. */
+			tab->tab_ind_free_list = list_ptr->fl_next_list;
+			xt_free_ns(list_ptr);
+		}
+		if (!XT_NODE_ID(ind_free))
+			ind_free = tab->tab_ind_free;
+
+		if (!il->il_write_byte(ot, XT_DT_FREE_LIST))
+			goto failed_ind_lock;
+		indp = tab->tab_dic.dic_keys;
+		XT_NODE_ID(last_address) = 0;
+		for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+			ind = *indp;
+			//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
+			if (ind->mi_free_list && ind->mi_free_list->fl_free_count) {
+				for (j=0; j<ind->mi_free_list->fl_free_count; j++) {
+					next_address = ind->mi_free_list->fl_page_id[j];
+					if (!il->il_write_word4(ot, XT_NODE_ID(ind->mi_free_list->fl_page_id[j])))
+						goto failed_ind_lock;
+					/* Chain the previously written free page to this one. */
+					if (XT_NODE_ID(last_address)) {
+						XT_SET_DISK_8(x.free_block.if_next_block_8, XT_NODE_ID(next_address));
+						if (!xt_ind_write_cache(ot, last_address, 8, x.buffer))
+							goto failed_ind_lock;
+					}
+					last_address = next_address;
+				}
+			}
+		}
+		if (!il->il_write_word4(ot, XT_NODE_ID(ind_free)))
+			goto failed_ind_lock;
+		if (XT_NODE_ID(last_address)) {
+			XT_SET_DISK_8(x.free_block.if_next_block_8, XT_NODE_ID(tab->tab_ind_free));
+			if (!xt_ind_write_cache(ot, last_address, 8, x.buffer))
+				goto failed_ind_lock;
+		}
+		/* 0xFFFFFFFF terminates the free list record in the log. */
+		if (!il->il_write_word4(ot, 0xFFFFFFFF))
+			goto failed_ind_lock;
+	}
+
+	/*
+	 * Add the free list caches to the global free list cache.
+	 * Added backwards to match the write order.
+	 */
+	indp = tab->tab_dic.dic_keys + tab->tab_dic.dic_key_count-1;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, indp--) {
+		ind = *indp;
+		//ASSERT_NS(XT_INDEX_HAVE_XLOCK(ind, ot));
+		if (ind->mi_free_list) {
+			wrote_something = TRUE;
+			ind->mi_free_list->fl_next_list = tab->tab_ind_free_list;
+			tab->tab_ind_free_list = ind->mi_free_list;
+		}
+		ind->mi_free_list = NULL;
+	}
+
+	/*
+	 * The new start of the free list is the first
+	 * item on the table free list:
+	 */
+	XT_NODE_ID(ind_free) = 0;
+	while ((list_ptr = tab->tab_ind_free_list)) {
+		if (list_ptr->fl_start < list_ptr->fl_free_count) {
+			ind_free = list_ptr->fl_page_id[list_ptr->fl_start];
+			break;
+		}
+		tab->tab_ind_free_list = list_ptr->fl_next_list;
+		xt_free_ns(list_ptr);
+	}
+	if (!XT_NODE_ID(ind_free))
+		ind_free = tab->tab_ind_free;
+	xt_unlock_mutex_ns(&tab->tab_ind_lock);
+
+	XT_SET_DISK_6(tab->tab_index_head->tp_ind_eof_6, XT_NODE_ID(tab->tab_ind_eof));
+	XT_SET_DISK_6(tab->tab_index_head->tp_ind_free_6, XT_NODE_ID(ind_free));
+
+	if (!il->il_write_header(ot, XT_INDEX_HEAD_SIZE, (xtWord1 *) tab->tab_index_head))
+		goto failed;
+
+	indp = tab->tab_dic.dic_keys;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+		ind = *indp;
+		XT_INDEX_UNLOCK(ind, ot);
+	}
+
+	/* The index locks are released now: errors below go to failed_2. */
+	if (wrote_something) {
+		/* Flush the log before we flush the index.
+		 *
+		 * The reason is, we must make sure that changes that
+		 * will be in the index are already in the transaction
+		 * log.
+		 *
+		 * Only then are we able to undo those changes on
+		 * recovery.
+		 *
+		 * Simple example:
+		 * CREATE TABLE t1 (s1 INT PRIMARY KEY);
+		 * INSERT INTO t1 VALUES (1);
+		 *
+		 * BEGIN;
+		 * INSERT INTO t1 VALUES (2);
+		 *
+		 * --- INDEX IS FLUSHED HERE ---
+		 *
+		 * --- SERVER CRASH HERE ---
+		 *
+		 *
+		 * The INSERT VALUES (2) has been written
+		 * to the log, but not flushed.
+		 * But the index has been updated.
+		 * If the index is flushed it will contain
+		 * the entry for record with s1=2.
+		 *
+		 * This entry must be removed on recovery.
+		 *
+		 * To prevent this situation I flush the log
+		 * here.
+		 */
+		if (!(tab->tab_dic.dic_tab_flags & XT_TAB_FLAGS_TEMP_TAB)) {
+			if (!xt_xlog_flush_log(tab->tab_db, ot->ot_thread))
+				goto failed_2;
+			if (!il->il_flush(ot))
+				goto failed_2;
+		}
+
+		if (!il->il_apply_log(ot))
+			goto failed_2;
+
+		indp = tab->tab_dic.dic_keys;
+		for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+			ind = *indp;
+			XT_INDEX_WRITE_LOCK(ind, ot);
+		}
+
+		/* Free up flushed pages: */
+		indp = tab->tab_dic.dic_keys;
+		for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+			ind = *indp;
+			xt_spinlock_lock(&ind->mi_dirty_lock);
+			if ((block = ind->mi_dirty_list)) {
+				while (block) {
+					fblock = block;
+					block = block->cb_dirty_next;
+					ASSERT_NS(fblock->cb_state == IDX_CAC_BLOCK_DIRTY);
+					/* Only blocks from this flush run (or earlier) are cleaned. */
+					if (fblock->cp_flush_seq == curr_flush_seq || xt_xn_is_before(fblock->cp_flush_seq, curr_flush_seq)) {
+						/* Take the block off the dirty list: */
+						if (fblock->cb_dirty_next)
+							fblock->cb_dirty_next->cb_dirty_prev = fblock->cb_dirty_prev;
+						if (fblock->cb_dirty_prev)
+							fblock->cb_dirty_prev->cb_dirty_next = fblock->cb_dirty_next;
+						if (ind->mi_dirty_list == fblock)
+							ind->mi_dirty_list = fblock->cb_dirty_next;
+						ind->mi_dirty_blocks--;
+						fblock->cb_state = IDX_CAC_BLOCK_CLEAN;
+					}
+				}
+			}
+			xt_spinlock_unlock(&ind->mi_dirty_lock);
+		}
+
+		indp = tab->tab_dic.dic_keys;
+		for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+			ind = *indp;
+			XT_INDEX_UNLOCK(ind, ot);
+		}
+	}
+
+	il->il_release();
+
+	/* Mark this table as index flushed: */
+	cp = &tab->tab_db->db_cp_state;
+	xt_lock_mutex_ns(&cp->cp_state_lock);
+	if (cp->cp_running) {
+		cp_tab = (XTCheckPointTablePtr) xt_sl_find(NULL, cp->cp_table_ids, &tab->tab_id);
+		if (cp_tab && (cp_tab->cpt_flushed & XT_CPT_ALL_FLUSHED) != XT_CPT_ALL_FLUSHED) {
+			cp_tab->cpt_flushed |= XT_CPT_INDEX_FLUSHED;
+			if ((cp_tab->cpt_flushed & XT_CPT_ALL_FLUSHED) == XT_CPT_ALL_FLUSHED) {
+				ASSERT_NS(cp->cp_flush_count < xt_sl_get_size(cp->cp_table_ids));
+				cp->cp_flush_count++;
+			}
+		}
+	}
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+
+	xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache((XTIndex *) 1);
+#endif
+#ifdef TRACE_FLUSH
+	printf("FLUSH --end-- %s\n", tab->tab_name->ps_path);
+	fflush(stdout);
+#endif
+	if (!xt_end_checkpoint(tab->tab_db, ot->ot_thread, NULL))
+		return FAILED;
+	return OK;
+
+	failed_ind_lock:
+	/* Failure while tab_ind_lock was held (the index locks are held too). */
+	xt_unlock_mutex_ns(&tab->tab_ind_lock);
+
+	failed:
+	/* Failure while all indices were write-locked. */
+	indp = tab->tab_dic.dic_keys;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, indp++) {
+		ind = *indp;
+		XT_INDEX_UNLOCK(ind, ot);
+	}
+
+	failed_2:
+	il->il_release();
+
+	failed_3:
+	xt_unlock_mutex_ns(&tab->tab_ind_flush_lock);
+#ifdef DEBUG_CHECK_IND_CACHE
+	xt_ind_check_cache(NULL);
+#endif
+	return FAILED;
+}
+
+/*
+ * Initialise the index log pool of a database and recover from any index
+ * logs found in the database's system directory.
+ *
+ * Every file in that directory whose name starts with "ilog" and yields
+ * a non-zero log ID is opened. If its header names a table and a
+ * non-zero EOF, the log is applied to that table's index; if applying
+ * fails the table's index is disabled as corrupted. The log file is
+ * deleted afterwards in either case.
+ *
+ * self            - calling thread; xt_throw() is called on it on failure.
+ * db              - database that owns this pool.
+ * log_buffer_size - buffer size used for logs opened by this pool.
+ */
+void XTIndexLogPool::ilp_init(struct XTThread *self, struct XTDatabase *db, size_t log_buffer_size)
+{
+	char			path[PATH_MAX];
+	XTOpenDirPtr	od;
+	xtLogID			log_id;
+	char			*file;
+	XTIndexLogPtr	il = NULL;
+	XTOpenTablePtr	ot = NULL;
+
+	ilp_db = db;
+	ilp_log_buffer_size = log_buffer_size;
+	xt_init_mutex_with_autoname(self, &ilp_lock);
+
+	xt_strcpy(PATH_MAX, path, db->db_main_path);
+	xt_add_system_dir(PATH_MAX, path);
+	if (xt_fs_exists(path)) {
+		pushsr_(od, xt_dir_close, xt_dir_open(self, path, NULL));
+		while (xt_dir_next(self, od)) {
+			file = xt_dir_name(self, od);
+			if (xt_starts_with(file, "ilog")) {
+				if ((log_id = (xtLogID) xt_file_name_to_id(file))) {
+					/* On failure ilp_open_log() has already closed its
+					 * own log object and leaves 'il' untouched (NULL).
+					 */
+					if (!ilp_open_log(&il, log_id, FALSE, self))
+						goto failed;
+					if (il->il_tab_id && il->il_log_eof) {
+						char table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+						if (!il->il_open_table(&ot))
+							goto failed;
+						if (ot) {
+							xt_tab_make_table_name(ot->ot_table, table_name, sizeof(table_name));
+							xt_logf(XT_NT_INFO, "PBXT: Recovering index, table: %s, bytes to read: %llu\n", table_name, (u_llong) il->il_log_eof);
+							if (!il->il_apply_log(ot)) {
+								/* If recovery of an index fails, then it is corrupt! */
+								xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
+								xt_log_and_clear_exception_ns();
+							}
+							ot->ot_thread = self;
+							il->il_close_table(ot);
+							/* The table has been handed back: make sure the
+							 * failure path cannot return it a second time.
+							 */
+							ot = NULL;
+						}
+					}
+					il->il_close(TRUE);
+					/* il_close() frees the log object. Clear the pointer so
+					 * that a failure on a later file does not close (and
+					 * free) it again.
+					 */
+					il = NULL;
+				}
+			}
+		}
+		freer_(); // xt_dir_close(od)
+	}
+	return;
+
+	failed:
+	if (ot && il)
+		il->il_close_table(ot);
+	if (il)
+		il->il_close(FALSE);
+	xt_throw(self);
+}
+
+/*
+ * Close and delete every index log currently held in the pool.
+ * The pool mutex is taken around the operation only if 'lock' is set.
+ */
+void XTIndexLogPool::ilp_close(struct XTThread *XT_UNUSED(self), xtBool lock)
+{
+	XTIndexLogPtr	pooled;
+
+	if (lock)
+		xt_lock_mutex_ns(&ilp_lock);
+
+	/* Pop logs off the head of the pool list until it is empty. */
+	for (pooled = ilp_log_pool; pooled; pooled = ilp_log_pool) {
+		ilp_log_pool = pooled->il_next_in_pool;
+		il_pool_count--;
+		pooled->il_close(TRUE);
+	}
+
+	if (lock)
+		xt_unlock_mutex_ns(&ilp_lock);
+}
+
+/*
+ * Shut down the pool: close all pooled logs (without taking the pool
+ * mutex), check that the pool is empty and free the pool mutex.
+ */
+void XTIndexLogPool::ilp_exit(struct XTThread *self)
+{
+	ilp_close(self, FALSE);
+	ASSERT_NS(il_pool_count == 0);
+	xt_free_mutex(&ilp_lock);
+}
+
+/*
+ * Build the path of the index log file with the given ID in 'path'
+ * (capacity 'size'): the database main path, the system directory,
+ * and the file name "ilog-<id>.xt".
+ */
+void XTIndexLogPool::ilp_name(size_t size, char *path, xtLogID log_id)
+{
+	char file_name[50];
+
+	/* Directory part first... */
+	xt_strcpy(size, path, ilp_db->db_main_path);
+	xt_add_system_dir(size, path);
+	xt_add_dir_char(size, path);
+
+	/* ...then the file name. */
+	sprintf(file_name, "ilog-%lu.xt", (u_long) log_id);
+	xt_strcat(size, path, file_name);
+}
+
+/*
+ * Allocate an index log object for 'log_id', open (creating if needed)
+ * its file and read the log header.
+ *
+ * If a complete header could be read, il_tab_id and il_log_eof are taken
+ * from it; otherwise both are set to zero. On success the new log is
+ * returned in *ret_il. On failure the partially built log is closed
+ * (without deleting the file), *ret_il is left untouched and FAILED is
+ * returned.
+ */
+xtBool XTIndexLogPool::ilp_open_log(XTIndexLogPtr *ret_il, xtLogID log_id, xtBool excl, XTThreadPtr thread)
+{
+	XTIndexLogPtr		log;
+	char				path[PATH_MAX];
+	XTIndLogHeadDRec	head;
+	size_t				bytes_read;
+
+	log = (XTIndexLogPtr) xt_calloc_ns(sizeof(XTIndexLogRec));
+	if (!log)
+		return FAILED;
+	log->il_log_id = log_id;
+	log->il_pool = this;
+
+	ilp_name(PATH_MAX, path, log_id);
+
+	/* Writes will be rounded up to the nearest direct write block size (see [+]),
+	 * so make sure we have space in the buffer for that:
+	 */
+	log->il_buffer = (xtWord1 *) xt_malloc_ns(ilp_log_buffer_size + XT_BLOCK_SIZE_FOR_DIRECT_IO);
+	if (!log->il_buffer)
+		goto failed;
+	log->il_buffer_size = ilp_log_buffer_size;
+
+	log->il_of = xt_open_file_ns(path, (excl ? XT_FS_EXCLUSIVE : 0) | XT_FS_CREATE | XT_FS_MAKE_PATH);
+	if (!log->il_of)
+		goto failed;
+
+	if (!xt_pread_file(log->il_of, 0, sizeof(XTIndLogHeadDRec), 0, &head, &bytes_read, &thread->st_statistics.st_ilog, thread))
+		goto failed;
+
+	if (bytes_read != sizeof(XTIndLogHeadDRec)) {
+		/* No complete header in the file: treat the log as empty. */
+		log->il_tab_id = 0;
+		log->il_log_eof = 0;
+	}
+	else {
+		log->il_tab_id = XT_GET_DISK_4(head.ilh_tab_id_4);
+		log->il_log_eof = XT_GET_DISK_4(head.ilh_log_eof_4);
+	}
+
+	*ret_il = log;
+	return OK;
+
+	failed:
+	log->il_close(FALSE);
+	return FAILED;
+}
+
+/*
+ * Hand out an index log: take one from the pool if available, otherwise
+ * reserve the next log ID and open a new log exclusively.
+ * Returns the log in *ret_il, or FAILED if a new log could not be opened.
+ */
+xtBool XTIndexLogPool::ilp_get_log(XTIndexLogPtr *ret_il, XTThreadPtr thread)
+{
+	XTIndexLogPtr	log;
+	xtLogID			new_id = 0;
+
+	xt_lock_mutex_ns(&ilp_lock);
+	log = ilp_log_pool;
+	if (log) {
+		/* Unlink the head of the pool list. */
+		ilp_log_pool = log->il_next_in_pool;
+		il_pool_count--;
+	}
+	else
+		new_id = ++ilp_next_log_id;
+	xt_unlock_mutex_ns(&ilp_lock);
+
+	/* The file is opened outside of the pool lock. */
+	if (!log && !ilp_open_log(&log, new_id, TRUE, thread))
+		return FAILED;
+
+	*ret_il= log;
+	return OK;
+}
+
+/*
+ * Give an index log back to the pool. The log is pushed onto the pool
+ * list, unless the pool already holds 5 logs, in which case the log is
+ * closed and its file deleted.
+ */
+void XTIndexLogPool::ilp_release_log(XTIndexLogPtr il)
+{
+	xt_lock_mutex_ns(&ilp_lock);
+	if (il_pool_count != 5) {
+		il->il_next_in_pool = ilp_log_pool;
+		ilp_log_pool = il;
+		il_pool_count++;
+	}
+	else
+		il->il_close(TRUE);
+	xt_unlock_mutex_ns(&ilp_lock);
+}
+
+/*
+ * Prepare the log for a new flush run of the table behind 'ot': reset
+ * the in-memory state and write a header with an EOF of zero to the
+ * file, followed by a file flush.
+ * Returns FAILED if the write or the flush fails.
+ */
+xtBool XTIndexLog::il_reset(XTOpenTable *ot)
+{
+	XTIndLogHeadDRec	head;
+	XTThreadPtr			thread = ot->ot_thread;
+
+	il_tab_id = ot->ot_table->tab_id;
+	il_log_eof = 0;
+	il_buffer_len = 0;
+	il_buffer_offset = 0;
+
+	/* The header must be written and flushed here, otherwise the
+	 * "previous" status (from the last flush run) could remain. If the
+	 * file were then only partly rewritten, the old header would still
+	 * be in place while other parts of the file have changed, which
+	 * would lead to index corruption.
+	 */
+	head.ilh_data_type = XT_DT_LOG_HEAD;
+	XT_SET_DISK_4(head.ilh_tab_id_4, il_tab_id);
+	XT_SET_DISK_4(head.ilh_log_eof_4, 0);
+
+	if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &head, &thread->st_statistics.st_ilog, thread) ||
+		!xt_flush_file(il_of, &thread->st_statistics.st_ilog, thread))
+		return FAILED;
+
+	return OK;
+}
+
+/*
+ * Destroy this log object: close the file if it is open, optionally
+ * delete the log file (only if the log has a non-zero ID), free the
+ * buffer and finally free the object itself. The object must not be
+ * used after this call.
+ */
+void XTIndexLog::il_close(xtBool delete_it)
+{
+	char	path[PATH_MAX];
+
+	if (il_of) {
+		xt_close_file_ns(il_of);
+		il_of = NULL;
+	}
+
+	/* The file is removed only after it has been closed. */
+	if (delete_it && il_log_id) {
+		il_pool->ilp_name(PATH_MAX, path, il_log_id);
+		xt_fs_delete(NULL, path);
+	}
+
+	if (il_buffer) {
+		xt_free_ns(il_buffer);
+		il_buffer = NULL;
+	}
+
+	xt_free_ns(this);
+}
+
+
+/*
+ * Hand this log back to the index log pool of the database that the
+ * log's pool belongs to (see ilp_release_log()).
+ */
+void XTIndexLog::il_release()
+{
+	il_pool->ilp_db->db_indlogs.ilp_release_log(this);
+}
+
+/*
+ * Make sure 'bytes' more bytes fit into the log buffer. If they do not,
+ * the current buffer content is written to the file at il_buffer_offset
+ * and the buffer is emptied.
+ * Returns FAILED if that write fails.
+ */
+xtBool XTIndexLog::il_require_space(size_t bytes, XTThreadPtr thread)
+{
+	/* Enough room left: nothing to do. */
+	if (il_buffer_len + bytes <= il_buffer_size)
+		return OK;
+
+	if (!xt_pwrite_file(il_of, il_buffer_offset, il_buffer_len, il_buffer, &thread->st_statistics.st_ilog, thread))
+		return FAILED;
+
+	/* The buffer now starts where the written data ended. */
+	il_buffer_offset += il_buffer_len;
+	il_buffer_len = 0;
+	return OK;
+}
+
+/*
+ * Append a single byte to the log buffer.
+ * Returns FAILED if space for it could not be made.
+ */
+xtBool XTIndexLog::il_write_byte(struct XTOpenTable *ot, xtWord1 byte)
+{
+	if (!il_require_space(1, ot->ot_thread))
+		return FAILED;
+
+	il_buffer[il_buffer_len++] = byte;
+	return OK;
+}
+
+/*
+ * Append a 4-byte value, stored with XT_SET_DISK_4(), to the log buffer.
+ * Returns FAILED if space for it could not be made.
+ */
+xtBool XTIndexLog::il_write_word4(struct XTOpenTable *ot, xtWord4 value)
+{
+	xtWord1 *dest;
+
+	if (!il_require_space(4, ot->ot_thread))
+		return FAILED;
+
+	dest = &il_buffer[il_buffer_len];
+	XT_SET_DISK_4(dest, value);
+	il_buffer_len += 4;
+	return OK;
+}
+
+/*
+ * Append an index page record to the log buffer. The record consists of
+ * the XT_DT_INDEX_PAGE type byte, the 4-byte node ID of the block and
+ * the block data; only the used length of the block, as stored in the
+ * block's size field, is copied.
+ * Returns FAILED if space for the record could not be made.
+ */
+xtBool XTIndexLog::il_write_block(struct XTOpenTable *ot, XTIndBlockPtr block)
+{
+	XTIdxBranchDPtr		branch = (XTIdxBranchDPtr) block->cb_data;
+	xtIndexNodeID		node_id = block->cb_address;
+	XTIndPageDataDPtr	rec;
+	u_int				block_len;
+	size_t				rec_size;
+
+	block_len = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(branch->tb_size_2));
+	rec_size = offsetof(XTIndPageDataDRec, ild_data) + block_len;
+
+	if (!il_require_space(rec_size, ot->ot_thread))
+		return FAILED;
+
+	/* Even a full page plus record head must fit into the buffer. */
+	ASSERT_NS(offsetof(XTIndPageDataDRec, ild_data) + XT_INDEX_PAGE_SIZE <= il_buffer_size);
+
+	rec = (XTIndPageDataDPtr) (il_buffer + il_buffer_len);
+	TRACK_BLOCK_TO_FLUSH(node_id);
+	rec->ild_data_type = XT_DT_INDEX_PAGE;
+	XT_SET_DISK_4(rec->ild_page_id_4, XT_NODE_ID(node_id));
+	memcpy(rec->ild_data, block->cb_data, block_len);
+
+	il_buffer_len += rec_size;
+	return OK;
+}
+
+/*
+ * Append an index header record to the log buffer: the XT_DT_HEADER type
+ * byte, the 2-byte header size and 'head_size' bytes copied from
+ * 'head_buf'.
+ * Returns FAILED if space for the record could not be made.
+ */
+xtBool XTIndexLog::il_write_header(struct XTOpenTable *ot, size_t head_size, xtWord1 *head_buf)
+{
+	XTIndHeadDataDPtr	rec;
+	size_t				rec_size = offsetof(XTIndHeadDataDRec, ilh_data) + head_size;
+
+	if (!il_require_space(rec_size, ot->ot_thread))
+		return FAILED;
+
+	rec = (XTIndHeadDataDPtr) (il_buffer + il_buffer_len);
+	rec->ilh_data_type = XT_DT_HEADER;
+	XT_SET_DISK_2(rec->ilh_head_size_2, head_size);
+	memcpy(rec->ilh_data, head_buf, head_size);
+
+	il_buffer_len += rec_size;
+	return OK;
+}
+
+/*
+ * Write out what is left in the log buffer and, if the log has grown
+ * since the header was last written, make the log durable: flush the
+ * file, write a header carrying the table ID and the new EOF, and flush
+ * the file again.
+ * Returns FAILED if any write or flush fails.
+ */
+xtBool XTIndexLog::il_flush(struct XTOpenTable *ot)
+{
+	XTIndLogHeadDRec	head;
+	XTThreadPtr			thread = ot->ot_thread;
+	xtTableID			tab_id = ot->ot_table->tab_id;
+
+	if (il_buffer_len) {
+		if (!xt_pwrite_file(il_of, il_buffer_offset, il_buffer_len, il_buffer, &thread->st_statistics.st_ilog, thread))
+			return FAILED;
+		il_buffer_offset += il_buffer_len;
+		il_buffer_len = 0;
+	}
+
+	/* The header on disk already states this EOF: nothing more to do. */
+	if (il_log_eof == il_buffer_offset)
+		return OK;
+
+	head.ilh_data_type = XT_DT_LOG_HEAD;
+	XT_SET_DISK_4(head.ilh_tab_id_4, tab_id);
+	XT_SET_DISK_4(head.ilh_log_eof_4, il_buffer_offset);
+
+	/* Log data first, then the header that refers to it. */
+	if (!xt_flush_file(il_of, &thread->st_statistics.st_ilog, thread))
+		return FAILED;
+
+	if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &head, &thread->st_statistics.st_ilog, thread))
+		return FAILED;
+
+	if (!xt_flush_file(il_of, &thread->st_statistics.st_ilog, thread))
+		return FAILED;
+
+	il_tab_id = tab_id;
+	il_log_eof = il_buffer_offset;
+	return OK;
+}
+
+/*
+ * Apply the contents of this index log to the index file of 'ot'.
+ *
+ * The log is read from offset 0 up to il_log_eof through il_buffer.
+ * Each record starts with a type byte:
+ *   XT_DT_LOG_HEAD   - the log header, skipped.
+ *   XT_DT_INDEX_PAGE - an index page, written to the page's offset in
+ *                      the index file.
+ *   XT_DT_FREE_LIST  - a list of 4-byte block IDs terminated by
+ *                      0xFFFFFFFF; each block gets the ID that follows
+ *                      it written as its "next free block".
+ *   XT_DT_HEADER     - the index header, written at offset 0 of the
+ *                      index file.
+ * Any other type byte is reported as a corrupt index log.
+ *
+ * Afterwards the index file is flushed and the log header is rewritten
+ * with an EOF of zero (and flushed, unless the table is temporary).
+ *
+ * Returns OK, or FAILED after an I/O error or when corruption is found.
+ */
+xtBool XTIndexLog::il_apply_log(struct XTOpenTable *ot)
+{
+	XT_NODE_TEMP;
+	register XTTableHPtr	tab = ot->ot_table;
+	off_t					offset;
+	size_t					pos;
+	xtWord1					*buffer;
+	off_t					address;
+	xtIndexNodeID			node_id;
+	size_t					req_size = 0;
+	XTIndLogHeadDRec		log_head;
+
+	offset = 0;
+	while (offset < il_log_eof) {
+		/* (Re)fill the buffer if 'offset' lies outside of the part of
+		 * the log that is currently buffered.
+		 */
+		if (offset < il_buffer_offset ||
+			offset >= il_buffer_offset + (off_t) il_buffer_len) {
+			il_buffer_len = il_buffer_size;
+			if (il_log_eof - offset < (off_t) il_buffer_len)
+				il_buffer_len = (size_t) (il_log_eof - offset);
+
+			/* Corrupt log?! */
+			/* req_size is non-zero only when a record did not fit into
+			 * the previous buffer content and a refill was requested.
+			 */
+			if (il_buffer_len < req_size) {
+				xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
+				return FAILED;
+			}
+			if (!xt_pread_file(il_of, offset, il_buffer_len, il_buffer_len, il_buffer, NULL, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
+				return FAILED;
+			il_buffer_offset = offset;
+		}
+		pos = (size_t) (offset - il_buffer_offset);
+		ASSERT_NS(pos < il_buffer_len);
+		buffer = il_buffer + pos;
+		/* In the cases below, setting il_buffer_len to zero followed by
+		 * 'continue' forces the buffer to be refilled starting at
+		 * 'offset' on the next iteration.
+		 */
+		switch (*buffer) {
+			case XT_DT_LOG_HEAD:
+				req_size = sizeof(XTIndLogHeadDRec);
+				if (il_buffer_len - pos < req_size) {
+					il_buffer_len = 0;
+					continue;
+				}
+				offset += req_size;
+				req_size = 0;
+				break;
+			case XT_DT_INDEX_PAGE:
+				XTIndPageDataDPtr	page_data;
+				XTIdxBranchDPtr		node;
+				u_int				block_len;
+				size_t				size;
+
+				/* First make sure the record head and the 2-byte size
+				 * field of the page are in the buffer.
+				 */
+				req_size = offsetof(XTIndPageDataDRec, ild_data) + 2;
+				if (il_buffer_len - pos < req_size) {
+					il_buffer_len = 0;
+					continue;
+				}
+				page_data = (XTIndPageDataDPtr) buffer;
+				node_id = XT_RET_NODE_ID(XT_GET_DISK_4(page_data->ild_page_id_4));
+				node = (XTIdxBranchDPtr) page_data->ild_data;
+				block_len = XT_GET_INDEX_BLOCK_LEN(XT_GET_DISK_2(node->tb_size_2));
+				if (block_len < 2 || block_len > XT_INDEX_PAGE_SIZE) {
+					xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
+					return FAILED;
+				}
+
+				/* Now the complete page data is required. */
+				req_size = offsetof(XTIndPageDataDRec, ild_data) + block_len;
+				if (il_buffer_len - pos < req_size) {
+					il_buffer_len = 0;
+					continue;
+				}
+
+				TRACK_BLOCK_FLUSH_N(node_id);
+				address = xt_ind_node_to_offset(tab, node_id);
+				/* [+] Round up the block size. Space has been provided. */
+				size = (((block_len - 1) / XT_BLOCK_SIZE_FOR_DIRECT_IO) + 1) * XT_BLOCK_SIZE_FOR_DIRECT_IO;
+				IDX_TRACE("%d- W%x\n", (int) XT_NODE_ID(node_id), (int) XT_GET_DISK_2(page_data->ild_data));
+				ASSERT_NS(size > 0 && size <= XT_INDEX_PAGE_SIZE);
+				if (!xt_pwrite_file(ot->ot_ind_file, address, size, page_data->ild_data, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread))
+					return FAILED;
+
+				offset += req_size;
+				req_size = 0;
+				break;
+			case XT_DT_FREE_LIST:
+				xtWord4	block, nblock;
+				union {
+					xtWord1				buffer[XT_BLOCK_SIZE_FOR_DIRECT_IO];
+					XTIndFreeBlockRec	free_block;
+				} x;
+				off_t	aoff;
+
+				memset(x.buffer, 0, sizeof(XTIndFreeBlockRec));
+
+				/* Step over the type byte. */
+				pos++;
+				offset++;
+				
+				/* Each step looks at two consecutive IDs (block, nblock)
+				 * and then advances by one ID.
+				 */
+				for (;;) {
+					req_size = 8;
+					if (il_buffer_len - pos < req_size) {
+						/* NOTE(review): this refill reads from 'offset'
+						 * but does not update il_buffer_offset - confirm
+						 * that the outer loop always refills afterwards.
+						 */
+						il_buffer_len = il_buffer_size;
+						if (il_log_eof - offset < (off_t) il_buffer_len)
+							il_buffer_len = (size_t) (il_log_eof - offset);
+						/* Corrupt log?! */
+						if (il_buffer_len < req_size) {
+							xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
+							return FAILED;
+						}
+						if (!xt_pread_file(il_of, offset, il_buffer_len, il_buffer_len, il_buffer, NULL, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
+							return FAILED;
+						pos = 0;
+					}
+					block = XT_GET_DISK_4(il_buffer + pos);
+					nblock = XT_GET_DISK_4(il_buffer + pos + 4);
+					/* The terminator follows the last ID of the list. */
+					if (nblock == 0xFFFFFFFF)
+						break;
+					aoff = xt_ind_node_to_offset(tab, XT_RET_NODE_ID(block));
+					XT_SET_DISK_8(x.free_block.if_next_block_8, nblock);
+					IDX_TRACE("%d- *%x\n", (int) block, (int) XT_GET_DISK_2(x.buffer));
+					if (!xt_pwrite_file(ot->ot_ind_file, aoff, XT_BLOCK_SIZE_FOR_DIRECT_IO, x.buffer, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread))
+						return FAILED;
+					pos += 4;
+					offset += 4;
+				}
+
+				/* Skip the last ID and the terminator. */
+				offset += 8;
+				req_size = 0;
+				break;
+			case XT_DT_HEADER:
+				XTIndHeadDataDPtr	head_data;
+				size_t				len;
+
+				/* The record head is needed to learn the header size. */
+				req_size = offsetof(XTIndHeadDataDRec, ilh_data);
+				if (il_buffer_len - pos < req_size) {
+					il_buffer_len = 0;
+					continue;
+				}
+				head_data = (XTIndHeadDataDPtr) buffer;
+				len = XT_GET_DISK_2(head_data->ilh_head_size_2);
+
+				req_size = offsetof(XTIndHeadDataDRec, ilh_data) + len;
+				if (il_buffer_len - pos < req_size) {
+					il_buffer_len = 0;
+					continue;
+				}
+
+				if (!xt_pwrite_file(ot->ot_ind_file, 0, len, head_data->ilh_data, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread))
+					return FAILED;
+
+				offset += req_size;
+				req_size = 0;
+				break;
+			default:
+				xt_register_ixterr(XT_REG_CONTEXT, XT_ERR_INDEX_LOG_CORRUPT, xt_file_path(il_of));
+				return FAILED;
+		}
+	}
+
+	if (!xt_flush_file(ot->ot_ind_file, &ot->ot_thread->st_statistics.st_ind, ot->ot_thread))
+		return FAILED;
+
+	/* The index file is up to date: mark the log as empty by writing a
+	 * header with an EOF of zero.
+	 */
+	log_head.ilh_data_type = XT_DT_LOG_HEAD;
+	XT_SET_DISK_4(log_head.ilh_tab_id_4, il_tab_id);
+	XT_SET_DISK_4(log_head.ilh_log_eof_4, 0);
+
+	if (!xt_pwrite_file(il_of, 0, sizeof(XTIndLogHeadDRec), (xtWord1 *) &log_head, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
+		return FAILED;
+
+	if (!(tab->tab_dic.dic_tab_flags & XT_TAB_FLAGS_TEMP_TAB)) {
+		if (!xt_flush_file(il_of, &ot->ot_thread->st_statistics.st_ilog, ot->ot_thread))
+			return FAILED;
+	}
+	return OK;
+}
+
+/*
+ * Open the table this log refers to (il_tab_id) from the open table
+ * pool of the log's database; the result is passed back in *ot.
+ * Returns the result of xt_db_open_pool_table_ns().
+ */
+xtBool XTIndexLog::il_open_table(struct XTOpenTable **ot)
+{
+	return xt_db_open_pool_table_ns(ot, il_pool->ilp_db, il_tab_id);
+}
+
+/*
+ * Return an open table to the open table pool
+ * (counterpart of il_open_table()).
+ */
+void XTIndexLog::il_close_table(struct XTOpenTable *ot)
+{
+	xt_db_return_table_to_pool_ns(ot);
+}
+
+
diff --git a/storage/pbxt/src/index_xt.h b/storage/pbxt/src/index_xt.h
new file mode 100644
index 00000000000..a56e7b3cdb7
--- /dev/null
+++ b/storage/pbxt/src/index_xt.h
@@ -0,0 +1,530 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-09-30	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_index_h__
+#define __xt_index_h__
+
+#ifdef DRIZZLED
+#include <drizzled/definitions.h>
+#include <mysys/my_bitmap.h>
+#else
+#include <mysql_version.h>
+#include <my_bitmap.h>
+#endif
+#include <time.h>
+
+#include "thread_xt.h"
+#include "linklist_xt.h"
+#include "datalog_xt.h"
+#include "datadic_xt.h"
+
+#ifndef MYSQL_VERSION_ID
+#error MYSQL_VERSION_ID must be defined!
+#endif
+
+struct XTDictionary;
+STRUCT_TABLE;
+struct XTTable;
+struct XTOpenTable;
+struct XTIndex;
+struct XTIndBlock;
+struct XTTable;
+class Field;
+
+/*
+ * INDEX ROLLBACK
+ *
+ * When a transaction is rolled back, the index entries are not
+ * garbage collected!! Instead, the index entries are deleted
+ * when the data record is garbage collected.
+ *
+ * When an index record is written, and this record replaces
+ * some other record (i.e. a node is updated). The new record
+ * references its predecessor.
+ *
+ * On cleanup (rollback or commit), the predecessor records
+ * are garbage collected.
+ *
+ * NOTE: It is possible to lose memory if a crash occurs during
+ * index modification. This can occur if a node is split and
+ * we crash between writing the 2 new records.
+ *
+ */ 
+
+/*
+ * These flags influence the way the compare and search
+ * routines function.
+ *
+ * The low-order 16 bits are reserved for the caller
+ * (i.e. MySQL specific stuff).
+ */
+#define XT_SEARCH_WHOLE_KEY			0x10000000		/* This flag is used to search for an insertion point, or to find
+													 * a particular slot that has already been inserted into the
+													 * index. The compare includes the handle of the variation.
+													 */
+#define XT_SEARCH_AFTER_KEY			0x20000000		/* This flag searches for the position just after the given key.
+													 * Even if the key is not found, success is possible if there
+													 * is a value in the index that would be after the search key.
+													 *
+													 * If this flag is not set then we search for the first
+													 * occurrence of the key in the index. If not found we
+													 * take the position just after the search key.
+													 */
+#define XT_SEARCH_FIRST_FLAG		0x40000000		/* Use this flag to find the first position in the index.
+													 * When set, the actual key value is ignored.
+													 */
+#define XT_SEARCH_AFTER_LAST_FLAG	0x80000000		/* Search out the position after the last in the index.
+													 * When set, the actual key value is ignored.
+													 */
+
+#define XT_INDEX_MAX_KEY_SIZE_MAX	2048			/* These are allocated on the stack, so this is the maximum! */
+
+#define XT_INDEX_MAX_KEY_SIZE		((XT_INDEX_PAGE_SIZE >> 1) > XT_INDEX_MAX_KEY_SIZE_MAX ? XT_INDEX_MAX_KEY_SIZE_MAX : (XT_INDEX_PAGE_SIZE >> 1))
+
+#define XT_IS_NODE_BIT				0x8000
+
+#define XT_IS_NODE(x)				((x) & XT_IS_NODE_BIT)
+
+#define XT_NODE_REF_SIZE			4
+#define XT_GET_NODE_REF(t, x)		XT_RET_NODE_ID(XT_GET_DISK_4(x))
+#define XT_SET_NODE_REF(t, x, y)	XT_SET_DISK_4((x), XT_NODE_ID(y))
+
+#define XT_MAX_RECORD_REF_SIZE		8
+
+#define XT_INDEX_PAGE_DATA_SIZE		(XT_INDEX_PAGE_SIZE - 2)			/* NOTE: 2 == offsetof(XTIdxBranchDRec, tb_data) */
+
+#define XT_MAKE_LEAF_SIZE(x)		((x) + offsetof(XTIdxBranchDRec, tb_data))
+
+#define XT_MAKE_NODE_SIZE(x)		(((x) + offsetof(XTIdxBranchDRec, tb_data)) | XT_IS_NODE_BIT)
+
+#define XT_MAKE_BRANCH_SIZE(x, y)	(((x) + offsetof(XTIdxBranchDRec, tb_data)) | ((y) ? XT_IS_NODE_BIT : 0))
+
+#define XT_GET_INDEX_BLOCK_LEN(x)	((x) & 0x7FFF)
+
+#define XT_GET_BRANCH_DATA_SIZE(x)	(XT_GET_INDEX_BLOCK_LEN(x) - offsetof(XTIdxBranchDRec, tb_data))
+
+/*
+ * Index header record. All fixed members are XTDiskValue types and the
+ * record ends in a variable length area (tp_data) holding the index roots.
+ * NOTE(review): presumably this mirrors the on-disk layout byte for byte,
+ * so members must not be reordered or resized -- confirm before changing.
+ */
+typedef struct XTIndexHead {
+	XTDiskValue4		tp_format_offset_4;	/* The offset of the format part of the header. */
+
+	XTDiskValue4		tp_header_size_4;	/* The size of the header. */
+	XTDiskValue6		tp_not_used_6;
+
+	XTDiskValue6		tp_ind_eof_6;
+	XTDiskValue6		tp_ind_free_6;
+
+	/* The index roots follow. Each is if_node_ref_size_1 size. */
+	xtWord1				tp_data[XT_VAR_LENGTH];
+} XTIndexHeadDRec, *XTIndexHeadDPtr;
+
+/*
+ * Format description of an index: versions, reference sizes and page size.
+ * According to XTIndexHead, this is the "format part" of the header that
+ * tp_format_offset_4 points to.
+ */
+typedef struct XTIndexFormat {
+	XTDiskValue4		if_format_size_4;	/* The size of this structure (index format). */
+	XTDiskValue2		if_tab_version_2;	/* The table version number. */
+	XTDiskValue2		if_ind_version_2;	/* The index version number. */
+	XTDiskValue1		if_node_ref_size_1;	/* The size of index node references in indexes (default 4 bytes). */
+	XTDiskValue1		if_rec_ref_size_1;	/* The size of record references in the indexes (default 4 bytes). */
+	XTDiskValue4		if_page_size_4;
+} XTIndexFormatDRec, *XTIndexFormatDPtr;
+
+/*
+ * One index page (branch): a 2-byte size word followed by the page data.
+ * The XT_IS_NODE_BIT / XT_GET_INDEX_BLOCK_LEN macros above show that the
+ * size word carries the node flag in bit 0x8000 and the block length in
+ * the low 15 bits.
+ */
+typedef struct XTIdxBranch {
+	XTDiskValue2		tb_size_2;			/* No of bytes used below. */
+	
+	/* NOTE(review): the original comment read "We enough space for 2 buffers
+	 * when splitting!" -- presumably "we need enough space ..."; the array
+	 * itself is sized to exactly XT_INDEX_PAGE_DATA_SIZE. Confirm intent.
+	 */
+	xtWord1				tb_data[XT_INDEX_PAGE_DATA_SIZE];
+} XTIdxBranchDRec, *XTIdxBranchDPtr;
+
+/* Describes the position and size of one item within a searched branch. */
+typedef struct XTIdxItem {
+	u_int				i_total_size;		/* Size of the data in the searched branch (excludes 2 byte header). */
+	u_int				i_item_size;		/* Size of the item at this position. */
+	u_int				i_node_ref_size;
+	u_int				i_item_offset;		/* Item offset. */
+} XTIdxItemRec, *XTIdxItemPtr;
+
+/*
+ * Result of scanning a branch for a key: whether the key was found, the
+ * record reference at the resulting position, the branch to descend into
+ * and the item position (sr_item).
+ */
+typedef struct XTIdxResult {
+	xtBool				sr_found;			/* TRUE if the key was found. */
+	xtBool				sr_duplicate;		/* TRUE if the duplicate was found. */
+	xtRecordID			sr_rec_id;			/* Reference to the record of the found key. */
+	xtRowID				sr_row_id;
+	xtIndexNodeID		sr_branch;			/* Branch to follow when searching a node. */
+	XTIdxItemRec		sr_item;
+} XTIdxResultRec, *XTIdxResultPtr;
+
+/*
+ * A key value as handed to the scan functions: key bytes and length plus
+ * the record reference (record ID and row ID) that goes with it.
+ * NOTE(review): sv_flags presumably holds the XT_SEARCH_* flags defined
+ * above -- confirm against the callers.
+ */
+typedef struct XTIdxKeyValue {
+	int					sv_flags;
+	xtRecordID			sv_rec_id;
+	xtRowID				sv_row_id;
+	u_int				sv_length;
+	xtWord1				*sv_key;
+} XTIdxKeyValueRec, *XTIdxKeyValuePtr;
+
+/*
+ * A search key together with its own key buffer. The buffer is embedded
+ * (XT_INDEX_MAX_KEY_SIZE bytes); the comment on XT_INDEX_MAX_KEY_SIZE_MAX
+ * notes that these are allocated on the stack.
+ */
+typedef struct XTIdxSearchKey {
+	xtBool				sk_on_key;			/* TRUE if we are positioned on the search key. */
+	XTIdxKeyValueRec	sk_key_value;		/* The value of the search key. */
+	xtWord1				sk_key_buf[XT_INDEX_MAX_KEY_SIZE];
+} XTIdxSearchKeyRec, *XTIdxSearchKeyPtr;
+
+typedef void (*XTScanBranchFunc)(struct XTTable *tab, struct XTIndex *ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result);
+typedef void (*XTPrevItemFunc)(struct XTTable *tab, struct XTIndex *ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result);
+typedef void (*XTLastItemFunc)(struct XTTable *tab, struct XTIndex *ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result);
+
+typedef int (*XTSimpleCompFunc)(struct XTIndex *ind, u_int key_length, xtWord1 *key_value, xtWord1 *b_value);
+
+struct charset_info_st;
+
+/*
+ * Describes one segment (key portion) of an index: which table column it
+ * covers, where the value and its NULL indicator are located in the record,
+ * and how the value is compared (type, charset).
+ */
+typedef struct XTIndexSeg		/* Key-portion */
+{
+	u_int				col_idx;			/* The table column index of this component. */
+	u_int				is_recs_in_range;	/* Value returned by records_in_range(). */
+	u_int				is_selectivity;		/* The number of unique values per mi_select_total. */
+	xtWord1				type;				/* Type of key (for sort) */
+	xtWord1				language;
+	xtWord1				null_bit;			/* bitmask to test for NULL */
+	xtWord1				bit_start,bit_end;	/* if bit field */
+	xtWord1				bit_pos,bit_length;	/* (not used in 4.1) */
+	xtWord2				flag;
+	xtWord2				length;				/* Keylength */
+	xtWord4				start;				/* Start of key in record */
+	xtWord4				null_pos;			/* position to NULL indicator */
+	MX_CONST_CHARSET_INFO	*charset;
+} XTIndexSegRec, *XTIndexSegPtr;
+
+/*
+ * A chunk of free index page IDs. Chunks are chained through fl_next_list;
+ * the page IDs are stored in the variable length array at the end.
+ */
+typedef struct XTIndFreeList {
+	struct XTIndFreeList	*fl_next_list;				/* List of free pages for this index. */
+	u_int					fl_start;					/* Start for allocating from the front of the list. */
+	u_int					fl_free_count;				/* Total items in the free list. */
+	xtIndexNodeID			fl_page_id[XT_VAR_LENGTH];	/* List of page ID's of the free pages. */
+} XTIndFreeListRec, *XTIndFreeListPtr;
+
+/*
+ * XT_INDEX_USE_PTHREAD_RW:
+ * The standard pthread RW lock is currently the fastest for INSERTs
+ * in 32 threads on smalltab: runTest(SMALL_INSERT_TEST, 32, dbUrl)
+ */
+/*
+ * XT_INDEX_USE_RWMUTEX:
+ * But the RW mutex is a close second, if not just as fast.
+ * If it is at least as fast, then it is better because read lock
+ * overhead is then zero.
+ *
+ * If definitely does get in the way of the 
+ */ 
+/* XT_INDEX_USE_PTHREAD_RW:
+ * But this is clearly better on Linux. 216682 instead of 169259
+ * payment transactions (DBT2 in non-conflict transactions,
+ * using only the customer table).
+ *
+ * 27.2.2009:
+ * The story continues. I have now fixed a bug in RW MUTEX that
+ * may have been slowing things down (see {RACE-WR_MUTEX}).
+ *
+ * So we will need to test "customer payment" again.
+ *
+ * 3.3.2009
+ * Latest test show that RW mutex is slightly faster:
+ * 127460 to 123574 payment transactions.
+ */
+
+#ifdef XT_NO_ATOMICS
+#define XT_INDEX_USE_PTHREAD_RW
+#else
+//#define XT_INDEX_USE_RWMUTEX
+//#define XT_INDEX_USE_PTHREAD_RW
+//#define XT_INDEX_SPINXSLOCK
+#define XT_TAB_ROW_USE_XSMUTEX
+#endif
+
+#ifdef XT_TAB_ROW_USE_XSMUTEX
+#define XT_INDEX_LOCK_TYPE				XTXSMutexRec
+#define XT_INDEX_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_xsmutex_free(s, &(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_xsmutex_slock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_xsmutex_xlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_UNLOCK(i, o)			xt_xsmutex_unlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+/* NOTE(review): the XT_INDEX_SPINXSLOCK and default (RW mutex) variants below
+ * test a field of (i)->mi_rwlock, but this variant reads sxs_xlocker directly
+ * from the index, and XTIndex declares no such member in this header.
+ * Confirm whether the x-locker field of (i)->mi_rwlock was intended.
+ */
+#define XT_INDEX_HAVE_XLOCK(i, o)		((i)->sxs_xlocker == (o)->ot_thread->t_id)
+#elif defined(XT_INDEX_USE_PTHREAD_RW)
+#define XT_INDEX_LOCK_TYPE				xt_rwlock_type
+#define XT_INDEX_INIT_LOCK(s, i)		xt_init_rwlock_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_free_rwlock(&(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_slock_rwlock_ns(&(i)->mi_rwlock)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(&(i)->mi_rwlock)
+#define XT_INDEX_UNLOCK(i, o)			xt_unlock_rwlock_ns(&(i)->mi_rwlock)
+#define XT_INDEX_HAVE_XLOCK(i, o)		TRUE
+#elif defined(XT_INDEX_SPINXSLOCK)
+#define XT_INDEX_LOCK_TYPE				XTSpinXSLockRec
+#define XT_INDEX_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_spinxslock_free(s, &(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_spinxslock_slock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_spinxslock_xlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_UNLOCK(i, o)			xt_spinxslock_unlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_HAVE_XLOCK(i, o)		((i)->mi_rwlock.nrw_xlocker == (o)->ot_thread->t_id)
+#else // XT_INDEX_USE_RWMUTEX
+#define XT_INDEX_LOCK_TYPE				XTRWMutexRec
+#define XT_INDEX_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, &(i)->mi_rwlock)
+#define XT_INDEX_FREE_LOCK(s, i)		xt_rwmutex_free(s, &(i)->mi_rwlock)	
+#define XT_INDEX_READ_LOCK(i, o)		xt_rwmutex_slock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_WRITE_LOCK(i, o)		xt_rwmutex_xlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_UNLOCK(i, o)			xt_rwmutex_unlock(&(i)->mi_rwlock, (o)->ot_thread->t_id)
+#define XT_INDEX_HAVE_XLOCK(i, o)		((i)->mi_rwlock.xs_xlocker == (o)->ot_thread->t_id)
+#endif
+
+/* The R/W lock on the index is used as follows:
+ * Read Lock - used for operations on the index that are not of a structural nature.
+ * This includes any read operation and update operations that change an index
+ * node.
+ * Write lock - used to change the structure of the index. This includes adding
+ * and deleting pages.
+ */
+/*
+ * In-memory description of one index: its lock, root node, free and dirty
+ * page lists, the constants that describe the key format, the functions
+ * used to scan branches and the key segments.
+ */
+typedef struct XTIndex {
+	u_int				mi_index_no;				/* The index number (used by MySQL). */
+
+	/* Protected by the mi_rwlock lock: */
+	XT_INDEX_LOCK_TYPE	mi_rwlock;					/* This lock protects the structure of the index.
+													 * Read lock - structure may not change, but pages may change.
+													 * Write lock - structure of index may be changed.
+													 */
+	xtIndexNodeID		mi_root;					/* The index root node. */
+	/* NOTE(review): listed here under mi_rwlock, while the comment on
+	 * mi_dirty_lock below says it protects "the dirty & free lists".
+	 * Confirm which lock actually guards mi_free_list.
+	 */
+	XTIndFreeListPtr	mi_free_list;				/* List of free pages for this index. */
+	
+	/* Protected by the mi_dirty_lock: */
+	XTSpinLockRec		mi_dirty_lock;				/* Spin lock protecting the dirty & free lists. */
+	struct XTIndBlock	*mi_dirty_list;				/* List of dirty pages for this index. */
+	u_int				mi_dirty_blocks;			/* Count of the dirty blocks. */
+
+	/* Index constants: */
+	u_int				mi_flags;
+	u_int				mi_key_size;
+	u_int				mi_max_items;				/* The maximum number of items that can fit in a leaf node. */
+	xtBool				mi_key_corrupted;			/* Set to TRUE if a corrupted index key is detected. */
+	xtBool				mi_fix_key;
+	xtBool				mi_lazy_delete;				/* TRUE if index entries are "lazy deleted". */
+	u_int				mi_single_type;				/* Used when the index contains a single field. */
+	u_int				mi_select_total;
+	XTScanBranchFunc	mi_scan_branch;
+	XTPrevItemFunc		mi_prev_item;
+	XTLastItemFunc		mi_last_item;
+	XTSimpleCompFunc	mi_simple_comp_key;
+	MX_BITMAP			mi_col_map;					/* Bit-map of columns in the index. */
+	u_int				mi_subset_of;				/* Indicates if this index is a complete subset of some other index. */
+	u_int				mi_seg_count;
+	/* Fixed capacity of 200 segments; mi_seg_count gives the number in use. */
+	XTIndexSegRec		mi_seg[200];
+} XTIndexRec, *XTIndexPtr;
+
+#define XT_INDEX_OK				0
+#define XT_INDEX_TOO_OLD		1
+#define XT_INDEX_TOO_NEW		2
+#define XT_INDEX_BAD_BLOCK		3
+#define XT_INDEX_CORRUPTED		4
+#define XT_INDEX_MISSING		5
+
+typedef void (*XTFreeDicFunc)(struct XTThread *self, struct XTDictionary *dic);
+
+/*
+ * Per-table dictionary information: record layout and size statistics,
+ * the BLOB columns, and the index (key) descriptions together with the
+ * MySQL table they belong to.
+ */
+typedef struct XTDictionary {
+	XTDDTable			*dic_table;					/* XT table information. */
+
+	/* Table binary information. */
+	u_int				dic_mysql_buf_size;			/* This is the size of the MySQL buffer (row size + null bytes). */
+	u_int				dic_mysql_rec_size;			/* This is the size of the fixed length MySQL row. */
+	u_int				dic_rec_size;				/* This is the size of the handle data file record. */
+	xtBool				dic_rec_fixed;				/* TRUE if the record has a fixed length size. */
+	u_int				dic_tab_flags;				/* Table flags XT_TAB_FLAGS_* */
+	xtWord8				dic_min_auto_inc;			/* The minimum auto-increment value. */
+	xtWord8				dic_min_row_size;
+	xtWord8				dic_max_row_size;
+	xtWord8				dic_ave_row_size;
+	xtWord8				dic_def_ave_row_size;		/* Defined row size set by the user. */
+	u_int				dic_no_of_cols;				/* Number of columns. */
+	u_int				dic_fix_col_count;			/* The number of columns always in the fixed part of a extended record. */
+	u_int				dic_ind_cols_req;			/* The number of columns required to build all indexes. */
+	xtWord8				dic_ind_rec_len;			/* Length of the record part that is needed for all index columns! */
+
+	/* BLOB columns: */
+	u_int				dic_blob_cols_req;			/* The number of the columns required to load all LONGBLOB columns. */
+	u_int				dic_blob_count;
+	Field				**dic_blob_cols;
+
+	/* MySQL related information. NULL when no tables are open from MySQL side! */
+	xtBool				dic_no_lazy_delete;			/* FALSE if lazy delete is OK. */
+	u_int				dic_disable_index;			/* Non-zero if the index cannot be used. */
+	u_int				dic_index_ver;				/* The version of the index. */
+	u_int				dic_key_count;
+	XTIndexPtr			*dic_keys;					/* MySQL/PBXT key description */
+	STRUCT_TABLE		*dic_my_table;				/* MySQL table */
+} XTDictionaryRec, *XTDictionaryPtr;
+
+#define XT_DT_LOG_HEAD		0
+#define XT_DT_INDEX_PAGE	1
+#define XT_DT_FREE_LIST		2
+#define XT_DT_HEADER		3
+
+/*
+ * Header record of an index log. il_apply_log() rewrites this record at
+ * offset 0 of the log file with ilh_log_eof_4 set to 0 once the log has
+ * been applied, which marks the log as invalid.
+ */
+typedef struct XTIndLogHead {
+	xtWord1					ilh_data_type;			/* XT_DT_LOG_HEAD */
+	XTDiskValue4			ilh_tab_id_4;
+	XTDiskValue4			ilh_log_eof_4;		/* The entire size of the log (0 if invalid!) */
+} XTIndLogHeadDRec, *XTIndLogHeadDPtr;
+
+/*
+ * Index log record that carries data for one index page: a type byte, the
+ * page ID and the variable length page data.
+ * NOTE(review): presumably ild_data_type is XT_DT_INDEX_PAGE for these
+ * records -- confirm against the log writer.
+ */
+typedef struct XTIndPageData {
+	xtWord1					ild_data_type;
+	XTDiskValue4			ild_page_id_4;
+	xtWord1					ild_data[XT_VAR_LENGTH];
+} XTIndPageDataDRec, *XTIndPageDataDPtr;
+
+/*
+ * Index log record that carries index header data (handled in the
+ * XT_DT_HEADER case of il_apply_log()). ilh_head_size_2 is the number of
+ * bytes in ilh_data; il_apply_log() writes that many bytes to offset 0 of
+ * the index file.
+ */
+typedef struct XTIndHeadData {
+	xtWord1					ilh_data_type;
+	XTDiskValue2			ilh_head_size_2;
+	xtWord1					ilh_data[XT_VAR_LENGTH];
+} XTIndHeadDataDRec, *XTIndHeadDataDPtr;
+
+/*
+ * One index log: the log file, its write buffer and the table it currently
+ * belongs to, plus the operations used to fill, flush and apply the log.
+ * Logs are owned by an XTIndexLogPool and chained through il_next_in_pool.
+ */
+typedef struct XTIndexLog {
+	struct XTIndexLogPool	*il_pool;
+	struct XTIndexLog		*il_next_in_pool;
+
+	xtLogID					il_log_id;						/* The ID of the data log. */
+	XTOpenFilePtr			il_of;
+	size_t					il_buffer_size;
+	xtWord1					*il_buffer;
+
+	xtTableID				il_tab_id;
+	off_t					il_log_eof;	
+	size_t					il_buffer_len;
+	off_t					il_buffer_offset;
+
+
+	xtBool					il_reset(XTOpenTable *ot);
+	void					il_close(xtBool delete_it);
+	void					il_release();
+
+	/* Writers that append data to the log: */
+	xtBool					il_write_byte(struct XTOpenTable *ot, xtWord1 val);
+	xtBool					il_write_word4(struct XTOpenTable *ot, xtWord4 value);
+	xtBool					il_write_block(struct XTOpenTable *ot, struct XTIndBlock *block);
+	xtBool					il_write_free_list(struct XTOpenTable *ot, u_int free_count, XTIndFreeListPtr free_list);
+	xtBool					il_require_space(size_t bytes, XTThreadPtr thread);
+	xtBool					il_write_header(struct XTOpenTable *ot, size_t head_size, xtWord1 *head_data);
+	xtBool					il_flush(struct XTOpenTable *ot);
+	xtBool					il_apply_log(struct XTOpenTable *ot);
+	
+	/* Open/return the table identified by il_tab_id via the database table pool: */
+	xtBool					il_open_table(struct XTOpenTable **ot);
+	void					il_close_table(struct XTOpenTable *ot);
+} XTIndexLogRec, *XTIndexLogPtr;
+
+/*
+ * Pool of index logs belonging to one database. Logs are handed out with
+ * ilp_get_log() and given back with ilp_release_log().
+ */
+typedef struct XTIndexLogPool {
+	struct XTDatabase		*ilp_db;
+	size_t					ilp_log_buffer_size;
+	u_int					il_pool_count;					/* NOTE: named il_, not ilp_, unlike the other members. */
+	XTIndexLogPtr			ilp_log_pool;
+	xt_mutex_type			ilp_lock;						/* The public pool lock. */
+	xtLogID					ilp_next_log_id;
+
+	void					ilp_init(struct XTThread *self, struct XTDatabase *db, size_t log_buffer_size);
+	void					ilp_close(struct XTThread *self, xtBool lock);
+	void					ilp_exit(struct XTThread *self);
+	void					ilp_name(size_t size, char *path, xtLogID log_id);
+
+	xtBool					ilp_open_log(XTIndexLogPtr *il, xtLogID log_id, xtBool excl, XTThreadPtr thread);
+
+	xtBool					ilp_get_log(XTIndexLogPtr *il, XTThreadPtr thread);
+	void					ilp_release_log(XTIndexLogPtr il);
+} XTIndexLogPoolRec, *XTIndexLogPoolPtr;
+
+/*
+ * A record reference consists of a record ID and a row ID.
+ *
+ * Decode the reference stored at 'item': the record ID is the 4-byte disk
+ * value at the start, the row ID is the 4-byte disk value that follows it.
+ */
+inline void xt_get_record_ref(register xtWord1 *item, xtRecordID *rec_id, xtRowID *row_id) {
+	xtWord1 *row_part = item + 4;
+
+	*rec_id = XT_GET_DISK_4(item);
+	*row_id = XT_GET_DISK_4(row_part);
+}
+
+/*
+ * Decode the record reference stored at 'item' into a search result:
+ * sr_rec_id is taken from the first 4-byte disk value, sr_row_id from the
+ * 4-byte disk value that follows it.
+ */
+inline void xt_get_res_record_ref(register xtWord1 *item, register XTIdxResultRec *result) {
+	xtWord1 *row_part = item + 4;
+
+	result->sr_rec_id = XT_GET_DISK_4(item);
+	result->sr_row_id = XT_GET_DISK_4(row_part);
+}
+
+/*
+ * Store a record reference at 'item': the record ID as a 4-byte disk value,
+ * immediately followed by the row ID as a 4-byte disk value.
+ */
+inline void xt_set_record_ref(register xtWord1 *item, xtRecordID rec_id, xtRowID row_id) {
+	xtWord1 *row_part = item + 4;
+
+	XT_SET_DISK_4(item, rec_id);
+	XT_SET_DISK_4(row_part, row_id);
+}
+
+/*
+ * Store the record reference of a key value at 'item': sv_rec_id as a
+ * 4-byte disk value, immediately followed by sv_row_id as a 4-byte disk
+ * value.
+ */
+inline void xt_set_val_record_ref(register xtWord1 *item, register XTIdxKeyValuePtr value) {
+	xtWord1 *row_part = item + 4;
+
+	XT_SET_DISK_4(item, value->sv_rec_id);
+	XT_SET_DISK_4(row_part, value->sv_row_id);
+}
+
+xtBool	xt_idx_insert(struct XTOpenTable *ot, struct XTIndex *ind, xtRowID row_id, xtRecordID rec_id, xtWord1 *rec_buf, xtWord1 *bef_buf, xtBool allow_dups);
+xtBool	xt_idx_delete(struct XTOpenTable *ot, struct XTIndex *ind, xtRecordID rec_id, xtWord1 *rec_buf);
+xtBool	xt_idx_update_row_id(struct XTOpenTable *ot, struct XTIndex *ind, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_buf);
+void	xt_idx_prep_key(struct XTIndex *ind, register XTIdxSearchKeyPtr search_key, int flags, xtWord1 *in_key_buf, size_t in_key_length);
+xtBool	xt_idx_research(struct XTOpenTable *ot, struct XTIndex *ind);
+xtBool	xt_idx_search(struct XTOpenTable *ot, struct XTIndex *ind, register XTIdxSearchKeyPtr search_key);
+xtBool	xt_idx_search_prev(struct XTOpenTable *ot, struct XTIndex *ind, register XTIdxSearchKeyPtr search_key);
+xtBool	xt_idx_next(register struct XTOpenTable *ot, register struct XTIndex *ind, register XTIdxSearchKeyPtr search_key);
+xtBool	xt_idx_prev(register struct XTOpenTable *ot, register struct XTIndex *ind, register XTIdxSearchKeyPtr search_key);
+xtBool	xt_idx_read(struct XTOpenTable *ot, struct XTIndex *ind, xtWord1 *rec_buf);
+void	xt_ind_set_index_selectivity(struct XTOpenTable *ot, XTThreadPtr thread);
+void	xt_check_indices(struct XTOpenTable *ot);
+void	xt_load_indices(XTThreadPtr self, struct XTOpenTable *ot);
+void	xt_ind_count_deleted_items(struct XTTable *ot, struct XTIndex *ind, struct XTIndBlock *block);
+xtBool	xt_flush_indices(struct XTOpenTable *ot, off_t *bytes_flushed, xtBool have_table_lock);
+void	xt_ind_track_dump_block(struct XTTable *tab, xtIndexNodeID address);
+
+#define XT_S_MODE_MATCH		0
+#define XT_S_MODE_NEXT		1
+#define XT_S_MODE_PREV		2
+xtBool	xt_idx_match_search(struct XTOpenTable *ot, struct XTIndex *ind, register XTIdxSearchKeyPtr search_key, xtWord1 *buf, int mode);
+
+int		xt_compare_2_int4(XTIndexPtr ind, uint key_length, xtWord1 *key_value, xtWord1 *b_value);
+int		xt_compare_3_int4(XTIndexPtr ind, uint key_length, xtWord1 *key_value, xtWord1 *b_value);
+void	xt_scan_branch_single(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result);
+void	xt_scan_branch_fix(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result);
+void	xt_scan_branch_fix_simple(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result);
+void	xt_scan_branch_var(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxKeyValuePtr value, register XTIdxResultRec *result);
+
+void	xt_prev_branch_item_fix(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result);
+void	xt_prev_branch_item_var(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultRec *result);
+
+void	xt_last_branch_item_fix(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result);
+void	xt_last_branch_item_var(struct XTTable *tab, XTIndexPtr ind, XTIdxBranchDPtr branch, register XTIdxResultPtr result);
+xtBool	xt_idx_lazy_delete_on_leaf(XTIndexPtr ind, struct XTIndBlock *block, xtWord2 branch_size);
+
+//#define TRACK_ACTIVITY
+#ifdef TRACK_ACTIVITY
+
+#define TRACK_BLOCK_ALLOC(x)	track_work(xt_ind_offset_to_node(tab, x), "A")
+#define TRACK_BLOCK_FREE(x)		track_work(xt_ind_offset_to_node(ot->ot_table, x), "-")
+#define TRACK_BLOCK_SPLIT(x)	track_work(xt_ind_offset_to_node(ot->ot_table, x), "/")
+#define TRACK_BLOCK_WRITE(x)	track_work(xt_ind_offset_to_node(ot->ot_table, x), "w")
+#define TRACK_BLOCK_FLUSH_N(x)	track_work(x, "F")
+#define TRACK_BLOCK_TO_FLUSH(x)	track_work(x, "f")
+
+xtPublic void track_work(u_int block, char *what);
+#else
+
+#define TRACK_BLOCK_ALLOC(x)
+#define TRACK_BLOCK_FREE(x)
+#define TRACK_BLOCK_SPLIT(x)
+#define TRACK_BLOCK_WRITE(x)
+#define TRACK_BLOCK_FLUSH_N(x)
+#define TRACK_BLOCK_TO_FLUSH(x)
+
+#endif
+
+#endif
+
diff --git a/storage/pbxt/src/linklist_xt.cc b/storage/pbxt/src/linklist_xt.cc
new file mode 100644
index 00000000000..de5fc6170ce
--- /dev/null
+++ b/storage/pbxt/src/linklist_xt.cc
@@ -0,0 +1,224 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include "pthread_xt.h"
+#include "linklist_xt.h"
+#include "thread_xt.h"
+#include "memory_xt.h"
+
+/*
+ * Allocate and set up a new linked list.
+ *
+ * thunk      Opaque value stored in the list and passed to free_func.
+ * free_func  Called by xt_ll_remove() for every item that is removed
+ *            (may be NULL).
+ * with_lock  If TRUE, a mutex and a condition are allocated for the list;
+ *            otherwise ll_lock and ll_cond are left as xt_calloc() set them
+ *            (presumably NULL -- xt_calloc is assumed to zero the memory).
+ *
+ * Errors are reported by throwing (throw_()); on failure everything that
+ * was allocated here is released again before the error is passed on.
+ */
+xtPublic XTLinkedListPtr xt_new_linkedlist(struct XTThread *self, void *thunk, XTFreeFunc free_func, xtBool with_lock)
+{
+	XTLinkedListPtr	ll;
+
+	ll = (XTLinkedListPtr) xt_calloc(self, sizeof(XTLinkedListRec));
+	try_(a) {
+		if (with_lock) {
+			ll->ll_lock = (xt_mutex_type *) xt_calloc(self, sizeof(xt_mutex_type));
+			try_(b) {
+				xt_init_mutex_with_autoname(self, ll->ll_lock);
+			}
+			catch_(b) {
+				/* The mutex was never initialized: free the memory and clear
+				 * the pointer so that xt_free_linkedlist() (called from the
+				 * outer handler) does not try to use or free it.
+				 */
+				xt_free(self, ll->ll_lock);
+				ll->ll_lock = NULL;
+				throw_();
+			}
+			cont_(b);
+			ll->ll_cond = (xt_cond_type *) xt_calloc(self, sizeof(xt_cond_type));
+			try_(c) {
+				xt_init_cond(self, ll->ll_cond);
+			}
+			catch_(c) {
+				/* Same as above, for the condition. */
+				xt_free(self, ll->ll_cond);
+				ll->ll_cond = NULL;
+				throw_();
+			}
+			cont_(c);
+		}
+		ll->ll_thunk = thunk;
+		ll->ll_free_func = free_func;
+	}
+	catch_(a) {
+		/* Release whatever was set up successfully, then pass the error on. */
+		xt_free_linkedlist(self, ll);
+		throw_();
+	}
+	cont_(a);
+	return ll;
+}
+
+/*
+ * Destroy a linked list: remove every item (which calls the list's free
+ * function for each one, if set), then release the mutex, the condition
+ * and the list structure itself.
+ */
+xtPublic void xt_free_linkedlist(XTThreadPtr self, XTLinkedListPtr ll)
+{
+	if (ll->ll_lock)
+		xt_lock_mutex(self, ll->ll_lock);
+
+	/* Always remove the current head until nothing is left. The list
+	 * mutex (if any) is already held, so the remove must not lock again.
+	 */
+	for (;;) {
+		XTLinkedItemPtr head = ll->ll_items;
+
+		if (!head)
+			break;
+		xt_ll_remove(self, ll, head, FALSE);
+	}
+
+	if (ll->ll_lock) {
+		xt_unlock_mutex(self, ll->ll_lock);
+		xt_free_mutex(ll->ll_lock);
+		xt_free(self, ll->ll_lock);
+	}
+
+	if (ll->ll_cond) {
+		xt_free_cond(ll->ll_cond);
+		xt_free(self, ll->ll_cond);
+	}
+
+	xt_free(self, ll);
+}
+
+/*
+ * Insert 'li' at the front of the list and count it. The list mutex is
+ * taken only when 'lock' is TRUE and the list was created with a lock.
+ */
+xtPublic void xt_ll_add(XTThreadPtr self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock)
+{
+	xt_mutex_type	*mtx = lock ? ll->ll_lock : NULL;
+	XTLinkedItemPtr	old_head;
+
+	if (mtx)
+		xt_lock_mutex(self, mtx);
+
+	old_head = ll->ll_items;
+	li->li_prev = NULL;
+	li->li_next = old_head;
+	if (old_head)
+		old_head->li_prev = li;
+	ll->ll_items = li;
+	ll->ll_item_count++;
+
+	if (mtx)
+		xt_unlock_mutex(self, mtx);
+}
+
+/*
+ * Return the first item of the list, or NULL if the list is empty or
+ * 'll' itself is NULL. No locking is done here.
+ */
+xtPublic XTLinkedItemPtr xt_ll_first_item(XTThreadPtr XT_UNUSED(self), XTLinkedListPtr ll)
+{
+	if (!ll)
+		return NULL;
+	return ll->ll_items;
+}
+
+/*
+ * Return the item that follows 'item' in its list (NULL at the end).
+ * 'item' must not be NULL. No locking is done here.
+ */
+xtPublic XTLinkedItemPtr xt_ll_next_item(XTThreadPtr XT_UNUSED(self), XTLinkedItemPtr item)
+{
+	XTLinkedItemPtr following = item->li_next;
+
+	return following;
+}
+
+/*
+ * Check whether 'li' is a member of the list by walking it from the front.
+ * The list mutex is taken only when 'lock' is TRUE and the list has one.
+ *
+ * Note that the walk ends on NULL when the item is not found, so the
+ * result is also TRUE when 'li' is NULL.
+ */
+xtPublic xtBool xt_ll_exists(XTThreadPtr self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock)
+{
+	XTLinkedItemPtr cur;
+
+	if (lock && ll->ll_lock)
+		xt_lock_mutex(self, ll->ll_lock);
+
+	cur = ll->ll_items;
+	while (cur && (cur != li))
+		cur = cur->li_next;
+
+	if (lock && ll->ll_lock)
+		xt_unlock_mutex(self, ll->ll_lock);
+
+	return (cur == li);
+}
+
+/*
+ * Unlink 'li' from the list, decrement the item count and pass the item to
+ * the list's free function, if one was registered. Afterwards the list
+ * condition (if present) is signalled. The list mutex is taken only when
+ * 'lock' is TRUE and the list has one.
+ *
+ * 'li' is assumed to be a member of 'll'; this is not checked.
+ */
+xtPublic void xt_ll_remove(XTThreadPtr self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock)
+{
+	xt_mutex_type	*mtx = lock ? ll->ll_lock : NULL;
+	XTLinkedItemPtr	before;
+	XTLinkedItemPtr	after;
+
+	if (mtx)
+		xt_lock_mutex(self, mtx);
+
+	before = li->li_prev;
+	after = li->li_next;
+
+	/* The item is the head of the list: advance the head. */
+	if (ll->ll_items == li)
+		ll->ll_items = after;
+
+	/* Bridge the neighbours over the removed item: */
+	if (before)
+		before->li_next = after;
+	if (after)
+		after->li_prev = before;
+
+	ll->ll_item_count--;
+	if (ll->ll_free_func)
+		(*ll->ll_free_func)(self, ll->ll_thunk, li);
+
+	/* Tell a waiter that there is one item less: */
+	if (ll->ll_cond)
+		xt_signal_cond(self, ll->ll_cond);
+
+	if (mtx)
+		xt_unlock_mutex(self, mtx);
+}
+
+/* Lock the list mutex. Does nothing for a list created without a lock. */
+xtPublic void xt_ll_lock(XTThreadPtr self, XTLinkedListPtr ll)
+{
+	if (!ll->ll_lock)
+		return;
+	xt_lock_mutex(self, ll->ll_lock);
+}
+
+/* Unlock the list mutex. Does nothing for a list created without a lock. */
+xtPublic void xt_ll_unlock(XTThreadPtr self, XTLinkedListPtr ll)
+{
+	if (!ll->ll_lock)
+		return;
+	xt_unlock_mutex(self, ll->ll_lock);
+}
+
+/*
+ * Block until the list contains no more items. The wait is woken by
+ * xt_ll_remove(), which signals ll_cond after every removal.
+ *
+ * Unlike xt_ll_lock(), this function uses ll_lock and ll_cond without
+ * checking them for NULL, so it may only be called on a list that was
+ * created with with_lock == TRUE.
+ */
+xtPublic void xt_ll_wait_till_empty(XTThreadPtr self, XTLinkedListPtr ll)
+{
+	xt_lock_mutex(self, ll->ll_lock);
+	/* Register the unlock as cleanup; presumably it is then also run if
+	 * the wait below throws -- confirm against the pushr_ definition.
+	 */
+	pushr_(xt_unlock_mutex, ll->ll_lock);
+	for (;;) {
+		/* Re-check the count after every wake-up. */
+		if (ll->ll_item_count == 0)
+			break;
+		xt_wait_cond(self, ll->ll_cond, ll->ll_lock);
+	}
+	freer_(); // xt_unlock_mutex(ll->ll_lock)
+}
+
+/* Return the number of items currently in the list (read without locking). */
+xtPublic u_int xt_ll_get_size(XTLinkedListPtr ll)
+{
+	u_int item_count = ll->ll_item_count;
+
+	return item_count;
+}
+
+/* Put a queue into the empty state: no front, no back, count of zero. */
+xtPublic void xt_init_linkedqueue(XTThreadPtr XT_UNUSED(self), XTLinkedQueuePtr lq)
+{
+	lq->lq_front = NULL;
+	lq->lq_back = NULL;
+	lq->lq_count = 0;
+}
+
+/*
+ * Reset a queue to the empty state. Items still linked into the queue are
+ * simply dropped from it; they are not freed here.
+ */
+xtPublic void xt_exit_linkedqueue(XTThreadPtr XT_UNUSED(self), XTLinkedQueuePtr lq)
+{
+	lq->lq_front = NULL;
+	lq->lq_back = NULL;
+	lq->lq_count = 0;
+}
+
+/*
+ * Append 'qi' to the back of the queue and count it. If the queue was
+ * empty, the item also becomes the front.
+ */
+xtPublic void xt_lq_add(XTThreadPtr XT_UNUSED(self), XTLinkedQueuePtr lq, XTLinkedQItemPtr qi)
+{
+	XTLinkedQItemPtr tail = lq->lq_back;
+
+	qi->qi_next = NULL;
+	if (tail)
+		tail->qi_next = qi;
+	if (!lq->lq_front)
+		lq->lq_front = qi;
+	lq->lq_back = qi;
+	lq->lq_count++;
+}
+
+/*
+ * Remove the item at the front of the queue and return it, or return NULL
+ * if the queue is empty. The returned item is fully detached (qi_next is
+ * NULL) and lq_count is reduced to match what xt_lq_add() counted.
+ *
+ * The previous version tested "!lq->lq_front": it dereferenced NULL on an
+ * empty queue and never removed anything from a non-empty one.
+ */
+xtPublic XTLinkedQItemPtr xt_lq_remove(XTThreadPtr XT_UNUSED(self), XTLinkedQueuePtr lq)
+{
+	XTLinkedQItemPtr qi = lq->lq_front;
+
+	if (qi) {
+		lq->lq_front = qi->qi_next;
+		/* The last item was taken: the queue has no back any more. */
+		if (!lq->lq_front)
+			lq->lq_back = NULL;
+		qi->qi_next = NULL;
+		lq->lq_count--;
+	}
+	return qi;
+}
+
diff --git a/storage/pbxt/src/linklist_xt.h b/storage/pbxt/src/linklist_xt.h
new file mode 100644
index 00000000000..1e33f71a421
--- /dev/null
+++ b/storage/pbxt/src/linklist_xt.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_linklist_h__
+#define __xt_linklist_h__
+
+#include "xt_defs.h"
+
+struct XTThread;
+
+/*
+ * Link header of an item in a doubly linked list (XTLinkedList).
+ * li_prev is NULL for the first item, li_next is NULL for the last.
+ */
+typedef struct XTLinkedItem {
+	struct XTLinkedItem		*li_prev;
+	struct XTLinkedItem		*li_next;
+} XTLinkedItemRec, *XTLinkedItemPtr;
+
+/*
+ * A doubly linked list with an optional lock. ll_lock and ll_cond are only
+ * allocated when the list is created with with_lock == TRUE (see
+ * xt_new_linkedlist()).
+ */
+typedef struct XTLinkedList {
+	xt_mutex_type			*ll_lock;			/* Optional list mutex (NULL if created without lock). */
+	xt_cond_type			*ll_cond;			/* Condition for wait for empty. */
+	void					*ll_thunk;			/* Passed to ll_free_func with every removed item. */
+	XTFreeFunc				ll_free_func;		/* Called for every removed item (may be NULL). */
+	u_int					ll_item_count;		/* Number of items in the list. */
+	XTLinkedItemPtr			ll_items;			/* First item in the list (new items are added here). */
+} XTLinkedListRec, *XTLinkedListPtr;
+
+XTLinkedListPtr		xt_new_linkedlist(struct XTThread *self, void *thunk, XTFreeFunc free_func, xtBool with_lock);
+void				xt_free_linkedlist(struct XTThread *self, XTLinkedListPtr ll);
+
+void				xt_ll_add(struct XTThread *self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock);
+void				xt_ll_remove(struct XTThread *self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock);
+xtBool 				xt_ll_exists(struct XTThread *self, XTLinkedListPtr ll, XTLinkedItemPtr li, xtBool lock);
+
+void				xt_ll_lock(struct XTThread *self, XTLinkedListPtr ll);
+void				xt_ll_unlock(struct XTThread *self, XTLinkedListPtr ll);
+
+void				xt_ll_wait_till_empty(struct XTThread *self, XTLinkedListPtr ll);
+
+XTLinkedItemPtr		xt_ll_first_item(struct XTThread *self, XTLinkedListPtr ll);
+XTLinkedItemPtr		xt_ll_next_item(struct XTThread *self, XTLinkedItemPtr item);
+u_int				xt_ll_get_size(XTLinkedListPtr ll);
+
+/* Link header of an item in a singly linked queue (XTLinkedQueue). */
+typedef struct XTLinkedQItem {
+	struct XTLinkedQItem	*qi_next;
+} XTLinkedQItemRec, *XTLinkedQItemPtr;
+
+/*
+ * A singly linked queue: items are appended at lq_back by xt_lq_add() and
+ * taken from lq_front by xt_lq_remove(). The queue has no lock of its own.
+ */
+typedef struct XTLinkedQueue {
+	size_t					lq_count;			/* Incremented by xt_lq_add() for every item added. */
+	XTLinkedQItemPtr		lq_front;			/* NULL when the queue is empty. */
+	XTLinkedQItemPtr		lq_back;			/* NULL when the queue is empty. */
+} XTLinkedQueueRec, *XTLinkedQueuePtr;
+
+void				xt_init_linkedqueue(struct XTThread *self, XTLinkedQueuePtr lq);
+void				xt_exit_linkedqueue(struct XTThread *self, XTLinkedQueuePtr lq);
+
+void				xt_lq_add(struct XTThread *self, XTLinkedQueuePtr lq, XTLinkedQItemPtr qi);
+XTLinkedQItemPtr	xt_lq_remove(struct XTThread *self, XTLinkedQueuePtr lq);
+
+#endif
+
diff --git a/storage/pbxt/src/lock_xt.cc b/storage/pbxt/src/lock_xt.cc
new file mode 100644
index 00000000000..0e9af277c7b
--- /dev/null
+++ b/storage/pbxt/src/lock_xt.cc
@@ -0,0 +1,2729 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2008-01-24	Paul McCullagh
+ *
+ * Row lock functions.
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <stdio.h>
+
+#include "lock_xt.h"
+#include "thread_xt.h"
+#include "table_xt.h"
+#include "xaction_xt.h"
+#include "database_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+//#define XT_TRACE_LOCKS
+//#define CHECK_ROWLOCK_GROUP_CONSISTENCY
+#endif
+
+/*
+ * This function should never be called. It indicates a link
+ * error!
+ */
+xtPublic void xt_log_atomic_error_and_abort(c_char *func, c_char *file, u_int line)
+{
+	/* Text reported for the call site given by func/file/line. */
+	const char *message = "Atomic operations not supported\n";
+
+	xt_logf(NULL, func, file, line, XT_LOG_ERROR, "%s", message);
+
+	/* There is no way to continue from here. */
+	abort();
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * ROW LOCKS, LIST BASED
+ */
+#ifdef XT_USE_LIST_BASED_ROW_LOCKS
+
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+/* 
+ * Requires a spin-lock on group->lg_lock!
+ */
+static void check_rowlock_group(XTLockGroupPtr group)
+{
+	/* Every violated invariant is reported by writing through this NULL pointer. */
+	char		*crash = NULL;
+	xtRowID		last_row = 0;
+
+	/* The caller must own the group spin-lock: */
+	if (group->lg_lock.spl_locker != xt_get_self())
+		*crash = 1;
+
+	/* The used part of the list may not exceed the allocated part: */
+	if (group->lg_list_in_use > group->lg_list_size)
+		*crash = 1;
+
+	for (int i = 0; i < group->lg_list_in_use; i++) {
+		XTLockItemPtr entry = &group->lg_list[i];
+
+		/* Each entry needs an owner with a running transaction,
+		 * a valid count, and a row ID beyond the rows covered by
+		 * the previous entry.
+		 */
+		if (!entry->li_thread_id ||
+			!xt_thr_array[entry->li_thread_id]->st_xact_data ||
+			entry->li_count > XT_TEMP_LOCK_BYTES ||
+			last_row >= entry->li_row_id)
+			*crash = 1;
+
+		/* Last row covered by this entry (a temp lock covers one row): */
+		if (entry->li_count >= XT_TEMP_LOCK_BYTES)
+			last_row = entry->li_row_id;
+		else
+			last_row = entry->li_row_id + (entry->li_count - 1) * XT_ROW_LOCK_GROUP_COUNT;
+	}
+}
+#endif
+
+/*
+ * Comparison callback passed to xt_bsearch() when searching a group's
+ * lock list.
+ *
+ * 'a' points to the row ID being searched for (the callers in this file
+ * pass the address of an xtRowID), 'b' points to the XTLockItemRec it is
+ * compared with. 'self' and 'thunk' are not used.
+ *
+ * Returns -1, 0 or 1 if the row ID is less than, equal to or greater than
+ * the li_row_id of the item.
+ */
+static int xlock_cmp_row_ids(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	/* The key is a row ID, so it is read as xtRowID (not xtTableID). */
+	xtRowID			row_id = *((const xtRowID *) a);
+	XTLockItemPtr	item = (XTLockItemPtr) b;
+
+	if (row_id < item->li_row_id)
+		return -1;
+	if (row_id > item->li_row_id)
+		return 1;
+	return 0;
+}
+
+/*
+ * Remove all row locks held by 'thread' from every table recorded in this
+ * list, then empty the list (bl_count is set to 0).
+ *
+ * For each list entry, every lock group flagged in pr_group[] is compacted
+ * under the group's spin-lock: items belonging to other threads are kept,
+ * items of 'thread' are dropped. If the group has waiters they are offered
+ * the freed locks through rl_grant_locks(), and after the spin-lock is
+ * released xt_xn_wakeup_thread_list(thread) is called.
+ *
+ * 'db' is only used when XT_USE_TABLE_REF is not defined, in which case the
+ * table is opened from the pool by ID. The parameter was unnamed although
+ * that branch refers to it; it is named here so that the branch compiles.
+ */
+void XTRowLockList::xt_remove_all_locks(struct XTDatabase *db, XTThreadPtr thread)
+{
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "remove all locks\n");
+#endif
+	if (!bl_count)
+		return;
+
+	xtThreadID			thd_id;
+	XTPermRowLockPtr	plock;
+#ifndef XT_USE_TABLE_REF
+	XTOpenTablePtr		pot = NULL;
+#else
+	/* The database is only needed to look tables up by ID. */
+	(void) db;
+#endif
+
+	thd_id = thread->t_id;
+	plock = (XTPermRowLockPtr) bl_data;
+	for (u_int i=0; i<bl_count; i++) {
+#ifdef XT_USE_TABLE_REF
+		XTTableHPtr		tab = plock->pr_table;
+#else
+		if (!xt_db_open_pool_table_ns(&pot, db, plock->pr_tab_id)) {
+			/* Should not happen, but just in case, we just don't
+			 * remove the lock. We will probably end up with a deadlock
+			 * somewhere.
+			 */
+			xt_log_and_clear_exception_ns();
+		}
+		else {
+#endif
+			for (int j=0; j<XT_ROW_LOCK_GROUP_COUNT; j++) {
+				if (plock->pr_group[j]) {
+					/* Go through group j and compact. */
+#ifndef XT_USE_TABLE_REF
+					XTTableHPtr		tab = pot->ot_table;
+#endif
+					XTLockGroupPtr	group;
+					XTLockItemPtr	copy;
+					XTLockItemPtr	item;
+					int				new_count;
+
+					group = &tab->tab_locks.rl_groups[j];
+					xt_spinlock_lock(&group->lg_lock);
+					/* 'item' reads every entry, 'copy' is the write
+					 * position for the entries that are kept.
+					 */
+					copy = group->lg_list;
+					item = group->lg_list;
+					new_count = 0;
+					for (size_t k=0; k<group->lg_list_in_use; k++) {
+						if (item->li_thread_id != thd_id) {
+							if (copy != item) {
+								copy->li_row_id = item->li_row_id;
+								copy->li_count = item->li_count;
+								copy->li_thread_id = item->li_thread_id;
+							}
+							new_count++;
+							copy++;
+						}
+#ifdef XT_TRACE_LOCKS
+						else {
+							if (item->li_count == XT_TEMP_LOCK_BYTES)
+								xt_ttracef(xt_get_self(), "remove group %d lock row_id=%d TEMP\n", j, (int) item->li_row_id);
+							else
+								xt_ttracef(xt_get_self(), "remove group %d locks row_id=%d (%d)\n", j, (int) item->li_row_id, (int) item->li_count);
+						}
+#endif
+						item++;
+					}
+					group->lg_list_in_use = new_count;
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+					check_rowlock_group(group);
+#endif
+					/* Offer the freed locks to the waiting threads: */
+					if (group->lg_wait_queue)
+						tab->tab_locks.rl_grant_locks(group, thread);
+
+					xt_spinlock_unlock(&group->lg_lock);
+
+					xt_xn_wakeup_thread_list(thread);
+				}
+			}
+#ifdef XT_USE_TABLE_REF
+			/* Drop the table reference held by this list entry. */
+			xt_heap_release(NULL, plock->pr_table);
+#else
+			xt_db_return_table_to_pool_ns(pot);
+		}
+#endif
+		plock++;
+	}
+	bl_count = 0;
+}
+
+#ifdef DEBUG_LOCK_QUEUE
+int *dummy_ptr = 0;
+
+/*
+ * Debug check of all wait queues: writes through the NULL dummy_ptr
+ * if 'no_lw' is found on a queue, or if a back link of a queue is wrong.
+ */
+void XTRowLocks::rl_check(XTLockWaitPtr no_lw)
+{
+	for (int i=0; i<XT_ROW_LOCK_GROUP_COUNT; i++) {
+		XTLockGroupPtr	grp = &rl_groups[i];
+		XTLockWaitPtr	expected_prev = NULL;
+
+		xt_spinlock_lock(&grp->lg_lock);
+		for (XTLockWaitPtr waiter = grp->lg_wait_queue; waiter; waiter = waiter->lw_next) {
+			/* This waiter is not supposed to be queued: */
+			if (waiter == no_lw)
+				*dummy_ptr = 1;
+			/* The back link must point to the entry just visited: */
+			if (waiter->lw_prev != expected_prev)
+				*dummy_ptr = 2;
+			expected_prev = waiter;
+		}
+		xt_spinlock_unlock(&grp->lg_lock);
+	}
+}
+#endif
+
+/*
+ * Try to take a temporary lock on row lw->lw_row_id in 'group'.
+ * The callers in this file hold group->lg_lock while calling this.
+ *
+ * Returns FAILED if the lock list of the group could not be enlarged.
+ * Otherwise returns OK, with *result and lw->lw_curr_lock set to:
+ * XT_NO_LOCK   - the lock was added for lw->lw_thread (the row ID is then
+ *                also stored in lw->lw_ot->ot_temp_row_lock), or the row
+ *                is already covered by a lock of the same thread.
+ * XT_TEMP_LOCK - another thread holds a temporary lock on the row.
+ * XT_PERM_LOCK - another thread holds a permanent lock on the row.
+ * In the last two cases lw->lw_xn_id is set to the start transaction ID
+ * of the thread holding the lock.
+ *
+ * The row lock list parameter is not used.
+ */
+xtBool XTRowLocks::rl_lock_row(XTLockGroupPtr group, XTLockWaitPtr lw, XTRowLockListPtr, int *result)
+{
+	XTLockItemPtr	item;
+	size_t			index;
+	xtRowID			row_id = lw->lw_row_id;
+
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+	/* Make sure there is room for one more item before searching: */
+	if (group->lg_list_size == group->lg_list_in_use) {
+		if (!xt_realloc_ns((void **) &group->lg_list, (group->lg_list_size + 2) * sizeof(XTLockItemRec)))
+			return FAILED;
+		group->lg_list_size += 2;
+	}
+	item = (XTLockItemPtr) xt_bsearch(NULL, &row_id, group->lg_list, group->lg_list_in_use, sizeof(XTLockItemRec), &index, NULL, xlock_cmp_row_ids);
+
+	/* There's no item with this ID, but there could be an item with a range that covers this row */
+	if (!item && group->lg_list_in_use) {
+		if (index > 0) {
+			int count;
+
+			item = group->lg_list + index - 1;
+
+			/* A temporary lock covers exactly one row: */
+			count = item->li_count;
+			if (item->li_count == XT_TEMP_LOCK_BYTES)
+				count = 1;
+
+			if (row_id >= item->li_row_id + count * XT_ROW_LOCK_GROUP_COUNT)
+				item = NULL;
+		}
+	}
+
+	if (item) {
+		/* Item already exists. */
+		if (item->li_thread_id == lw->lw_thread->t_id) {
+			/* Already have a permanent lock: */
+			*result = XT_NO_LOCK;
+			lw->lw_curr_lock = XT_NO_LOCK;
+			return OK;
+		}
+		/* {REMOVE-LOCKS}
+		 * This must be valid because a thread must remove
+		 * the locks before it frees its st_xact_data structure,
+		 * xt_thr_array entry must also be valid, because
+		 * transaction must be ended before the thread is
+		 * killed.
+		 */
+		*result = item->li_count == XT_TEMP_LOCK_BYTES ? XT_TEMP_LOCK : XT_PERM_LOCK;
+		lw->lw_xn_id = xt_thr_array[item->li_thread_id]->st_xact_data->xd_start_xn_id;
+		lw->lw_curr_lock = *result;
+		return OK;
+	}
+
+	/* Add the lock: */
+	XT_MEMMOVE(group->lg_list, &group->lg_list[index+1], 
+		&group->lg_list[index], (group->lg_list_in_use - index) * sizeof(XTLockItemRec));
+	group->lg_list[index].li_row_id = row_id;
+	group->lg_list[index].li_count = XT_TEMP_LOCK_BYTES;
+	group->lg_list[index].li_thread_id = lw->lw_thread->t_id;
+	group->lg_list_in_use++;
+
+#ifdef XT_TRACE_LOCKS
+	/* There is no 'ot' in this function, the open table is reached through lw. */
+	xt_ttracef(lw->lw_ot->ot_thread, "set temp lock row=%d setby=%s\n", (int) row_id, xt_get_self()->t_name);
+#endif
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+	*result = XT_NO_LOCK;
+	lw->lw_ot->ot_temp_row_lock = row_id;
+	lw->lw_curr_lock = XT_NO_LOCK;
+	return OK;
+}
+
+/*
+ * Go through the wait queue of 'group' and retry rl_lock_row() for
+ * every waiter.
+ *
+ * A waiter whose retry ends in XT_NO_LOCK or XT_LOCK_ERR is removed from
+ * the queue. The IDs of the threads that got their lock (XT_NO_LOCK) are
+ * collected in thread->st_thread_list, which is reset on entry.
+ * If rl_lock_row() fails, the exception of the calling thread is copied
+ * to the waiting thread and its lw_curr_lock is set to XT_LOCK_ERR.
+ *
+ * The caller in this file (xt_remove_all_locks) holds group->lg_lock
+ * during the call and calls xt_xn_wakeup_thread_list(thread) afterwards.
+ */
+void XTRowLocks::rl_grant_locks(XTLockGroupPtr group, XTThreadPtr thread)
+{
+	XTLockWaitPtr	lw, lw_next, lw_prev;
+	int				result;
+	xtThreadID		lw_thd_id;
+
+	thread->st_thread_list_count = 0;
+	lw = group->lg_wait_queue;
+	while (lw) {
+		/* Save everything needed from lw before the lock is retried: */
+		lw_next = lw->lw_next;
+		lw_prev = lw->lw_prev;
+		lw_thd_id = lw->lw_thread->t_id;
+		/* NOTE: after lw_curr_lock is changed, lw may no longer be referenced
+		 * by this function!!!
+		 */
+		if (!rl_lock_row(group, lw, &lw->lw_thread->st_lock_list, &result)) {
+			/* We transfer the error to the other thread! */
+			XTThreadPtr self = xt_get_self();
+
+			result = XT_LOCK_ERR;
+			memcpy(&lw->lw_thread->t_exception, &self->t_exception, sizeof(XTExceptionRec));
+			lw->lw_curr_lock = XT_LOCK_ERR;
+		}
+		if (result == XT_NO_LOCK || result == XT_LOCK_ERR) {
+			/* Remove from the wait queue: */
+			if (lw_next)
+				lw_next->lw_prev = lw_prev;
+			if (lw_prev)
+				lw_prev->lw_next = lw_next;
+			if (group->lg_wait_queue == lw)
+				group->lg_wait_queue = lw_next;
+			if (group->lg_wait_queue_end == lw)
+				group->lg_wait_queue_end = lw_prev;
+			if (result == XT_NO_LOCK) {
+				/* Add to the thread list: */
+				if (thread->st_thread_list_count == thread->st_thread_list_size) {
+					if (!xt_realloc_ns((void **) &thread->st_thread_list, (thread->st_thread_list_size+1) * sizeof(xtThreadID))) {
+						/* The list cannot grow, so wake this thread directly: */
+						xt_xn_wakeup_thread(lw_thd_id);
+						goto done;
+					}
+					thread->st_thread_list_size++;
+				}
+				thread->st_thread_list[thread->st_thread_list_count] = lw_thd_id;
+				thread->st_thread_list_count++;
+				done:;
+			}
+		}
+		lw = lw_next;
+	}
+}
+
+/*
+ * Take 'lw' off the wait queue of its lock group, if it is still waiting
+ * (lw_curr_lock is XT_TEMP_LOCK or XT_PERM_LOCK).
+ * The group is selected by lw->lw_row_id and is protected by its spin-lock
+ * for the duration of the call.
+ */
+void XTRowLocks::xt_cancel_temp_lock(XTLockWaitPtr lw)
+{
+	XTLockGroupPtr	group;
+
+	group = &rl_groups[lw->lw_row_id % XT_ROW_LOCK_GROUP_COUNT];
+	xt_spinlock_lock(&group->lg_lock);
+	if (lw->lw_curr_lock == XT_TEMP_LOCK || lw->lw_curr_lock == XT_PERM_LOCK) {
+		/* In case of XT_LOCK_ERR or XT_NO_LOCK, the lw structure will
+		 * no longer be on the wait queue.
+		 */
+		XTLockWaitPtr	lw_next, lw_prev;
+
+		lw_next = lw->lw_next;
+		lw_prev = lw->lw_prev;
+
+		/* Remove from the wait queue: */
+		if (lw_next)
+			lw_next->lw_prev = lw_prev;
+		if (lw_prev)
+			lw_prev->lw_next = lw_next;
+		if (group->lg_wait_queue == lw)
+			group->lg_wait_queue = lw_next;
+		if (group->lg_wait_queue_end == lw)
+			group->lg_wait_queue_end = lw_prev;
+	}
+	xt_spinlock_unlock(&group->lg_lock);
+}
+
+//#define QUEUE_ORDER_FIFO
+
+/* Try to lock a row.
+ * This function returns:
+ * XT_NO_LOCK on success.
+ * XT_TEMP_LOCK if there is a temporary lock on the row.
+ * XT_PERM_LOCK if there is a permanent lock on the row.
+ * FAILED if an error occurred.
+ *
+ * If there is a lock on this row, the transaction ID of the
+ * locker is also returned.
+ *
+ * The caller must wait if the row is locked. If the lock is
+ * permanent, then the caller must wait for the transaction to
+ * terminate. If the lock is temporary, then the caller must
+ * wait for the transaction to signal that the lock has been
+ * released.
+ */
+/*
+ * Returns OK with the outcome in lw->lw_curr_lock, or FAILED if
+ * rl_lock_row() failed.
+ *
+ * A temporary lock already held by 'ot' on a different row is first made
+ * permanent. If the row is locked by another thread, 'lw' is inserted into
+ * the wait queue of the group: at the end if QUEUE_ORDER_FIFO is defined,
+ * otherwise ordered by the start transaction ID of the waiting threads.
+ */
+xtBool XTRowLocks::xt_set_temp_lock(XTOpenTablePtr ot, XTLockWaitPtr lw, XTRowLockListPtr lock_list)
+{
+	XTLockGroupPtr	group;
+	int				result;
+
+	if (ot->ot_temp_row_lock) {
+		/* Check if we don't already have this temp lock: */
+		if (ot->ot_temp_row_lock == lw->lw_row_id) {
+			lw->lw_curr_lock = XT_NO_LOCK;
+			return OK;
+		}
+
+		/* NOTE(review): the result of this call is ignored -- confirm
+		 * that a failure here may be silently dropped.
+		 */
+		xt_make_lock_permanent(ot, lock_list);
+	}
+
+	/* Add a temporary lock. */
+	group = &rl_groups[lw->lw_row_id % XT_ROW_LOCK_GROUP_COUNT];
+	xt_spinlock_lock(&group->lg_lock);
+
+	if (!rl_lock_row(group, lw, lock_list, &result)) {
+		xt_spinlock_unlock(&group->lg_lock);
+		return FAILED;
+	}
+
+	if (result != XT_NO_LOCK) {
+		/* Add the thread to the end of the thread queue: */
+#ifdef QUEUE_ORDER_FIFO
+		if (group->lg_wait_queue_end) {
+			group->lg_wait_queue_end->lw_next = lw;
+			lw->lw_prev = group->lg_wait_queue_end;
+		}
+		else {
+			group->lg_wait_queue = lw;
+			lw->lw_prev = NULL;
+		}
+		lw->lw_next = NULL;
+		group->lg_wait_queue_end = lw;
+#else
+		/* Search backwards from the end of the queue for the last
+		 * waiter whose transaction started before ours, and insert
+		 * lw after it.
+		 */
+		XTLockWaitPtr	pos = group->lg_wait_queue_end;
+		xtXactID		xn_id = ot->ot_thread->st_xact_data->xd_start_xn_id;
+		
+		while (pos) {
+			if (pos->lw_thread->st_xact_data->xd_start_xn_id < xn_id)
+				break;
+			pos = pos->lw_prev;
+		}
+		if (pos) {
+			lw->lw_prev = pos;
+			lw->lw_next = pos->lw_next;
+			if (pos->lw_next)
+				pos->lw_next->lw_prev = lw;
+			else
+				group->lg_wait_queue_end = lw;
+			pos->lw_next = lw;
+		}
+		else {
+			/* Front of the queue: */
+			lw->lw_prev = NULL;
+			lw->lw_next = group->lg_wait_queue;
+			if (group->lg_wait_queue)
+				group->lg_wait_queue->lw_prev = lw;
+			else
+				group->lg_wait_queue_end = lw;
+			group->lg_wait_queue = lw;
+		}
+#endif
+	}
+
+	xt_spinlock_unlock(&group->lg_lock);
+	return OK;
+}
+
+/*
+ * Remove a temporary lock.
+ * 
+ * If updated is set to TRUE this means that the row was updated.
+ * This means that any thread waiting on the temporary lock will
+ * also have to wait for the transaction to quit before
+ * continuing.
+ *
+ * If the thread were to continue it would just hang again because
+ * it will discover that the transaction has updated the row.
+ *
+ * So the 'updated' flag is an optimisation which prevents the
+ * thread from making an unnecessary retry.
+ */
+/*
+ * Does nothing if 'ot' holds no temporary lock (ot_temp_row_lock is 0).
+ * Otherwise, under the spin-lock of the group, the lock is either handed
+ * over to the first waiter on the same row or removed from the lock list.
+ * Finally ot_temp_row_lock is cleared and, if the lock was handed over,
+ * the new owner is woken up.
+ */
+void XTRowLocks::xt_remove_temp_lock(XTOpenTablePtr ot, xtBool updated)
+{
+	xtRowID			row_id;
+	XTLockGroupPtr	group;
+	XTLockItemPtr	item;
+	size_t			index;
+	xtBool			lock_granted = FALSE;
+	xtThreadID		locking_thread_id = 0;
+
+	if (!(row_id = ot->ot_temp_row_lock))
+		return;
+
+	group = &rl_groups[row_id % XT_ROW_LOCK_GROUP_COUNT];
+	xt_spinlock_lock(&group->lg_lock);
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "remove temp lock %d\n", (int) row_id);
+#endif
+	item = (XTLockItemPtr) xt_bsearch(NULL, &row_id, group->lg_list, group->lg_list_in_use, sizeof(XTLockItemRec), &index, NULL, xlock_cmp_row_ids);
+	if (item) {
+		/* Item exists. */
+		/* Only a temporary lock owned by this thread is touched: */
+		if (item->li_thread_id == ot->ot_thread->t_id &&
+			item->li_count == XT_TEMP_LOCK_BYTES) {
+			XTLockWaitPtr	lw;
+
+			/* First check if there is some thread waiting to take over this lock: */
+			lw = group->lg_wait_queue;
+			while (lw) {
+				if (lw->lw_row_id == row_id) {
+					lock_granted = TRUE;
+					break;
+				}
+				lw = lw->lw_next;
+			}
+
+			if (lock_granted) {
+				/* Grant the lock just released... */
+				XTLockWaitPtr	lw_next, lw_prev;
+				xtXactID		locking_xact_id;
+
+				/* Store this info, lw will soon be untouchable! */
+				lw_next = lw->lw_next;
+				lw_prev = lw->lw_prev;
+				locking_xact_id = lw->lw_thread->st_xact_data->xd_start_xn_id;
+				locking_thread_id = lw->lw_thread->t_id;
+
+				/* Lock has moved from one thread to the next.
+				 * change the thread holding this lock:
+				 */
+				item->li_thread_id = locking_thread_id;
+
+				/* Remove from the wait queue: */
+				if (lw_next)
+					lw_next->lw_prev = lw_prev;
+				if (lw_prev)
+					lw_prev->lw_next = lw_next;
+				if (group->lg_wait_queue == lw)
+					group->lg_wait_queue = lw_next;
+				if (group->lg_wait_queue_end == lw)
+					group->lg_wait_queue_end = lw_prev;
+
+				/* If the thread that released the lock updated the
+				 * row then we will have to wait for the transaction
+				 * to terminate:
+				 */
+				if (updated) {
+					lw->lw_row_updated = TRUE;
+					lw->lw_updating_xn_id = ot->ot_thread->st_xact_data->xd_start_xn_id;
+				}
+
+				/* The thread has the lock now: */
+				lw->lw_ot->ot_temp_row_lock = row_id;
+				lw->lw_curr_lock = XT_NO_LOCK;
+
+				/* Everyone after this that is waiting for the same lock is
+				 * now waiting for a different transaction:
+				 */
+				lw = lw_next;
+				while (lw) {
+					if (lw->lw_row_id == row_id) {
+						lw->lw_xn_id = locking_xact_id;
+						lw->lw_curr_lock = XT_TEMP_LOCK;
+					}
+					lw = lw->lw_next;
+				}
+			}
+			else {
+				/* Remove the lock: */
+				XT_MEMMOVE(group->lg_list, &group->lg_list[index], 
+					&group->lg_list[index+1], (group->lg_list_in_use - index - 1) * sizeof(XTLockItemRec));
+				group->lg_list_in_use--;
+			}
+		}
+	}
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+	xt_spinlock_unlock(&group->lg_lock);
+
+	ot->ot_temp_row_lock = 0;
+	/* The wake-up is done after the spin-lock has been released: */
+	if (lock_granted)
+		xt_xn_wakeup_thread(locking_thread_id);
+}
+
+/*
+ * Turn the temporary row lock held by 'ot' (ot->ot_temp_row_lock) into a
+ * permanent lock.
+ *
+ * The lock group of the row is flagged in the entry of the table in
+ * 'lock_list'; a new entry is appended if the table has none yet. Then,
+ * under the spin-lock of the group, the lock item is marked permanent and,
+ * where possible, merged with an adjacent range of permanent locks of the
+ * same thread.
+ *
+ * Returns OK on success, or if there is no temporary lock. Returns FAILED
+ * if the entry could not be appended to 'lock_list'; in that case the
+ * temporary lock has been removed.
+ */
+xtBool XTRowLocks::xt_make_lock_permanent(XTOpenTablePtr ot, XTRowLockListPtr lock_list)
+{
+	xtRowID			row_id;
+	XTLockGroupPtr	group;
+	XTLockItemPtr	item;
+	size_t			index;
+
+	if (!(row_id = ot->ot_temp_row_lock))
+		return OK;
+
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "make lock perm %d\n", (int) ot->ot_temp_row_lock);
+#endif
+
+	/* Add to the lock list: */
+	XTPermRowLockPtr locks = (XTPermRowLockPtr) lock_list->bl_data;
+	for (unsigned i=0; i<lock_list->bl_count; i++) {
+#ifdef XT_USE_TABLE_REF
+		if (locks->pr_table == ot->ot_table) {
+#else
+		if (locks->pr_tab_id == ot->ot_table->tab_id) {
+#endif
+			locks->pr_group[row_id % XT_ROW_LOCK_GROUP_COUNT] = 1;
+			goto done;
+		}
+		locks++;
+	}
+
+	/* Add new to lock list: */
+	{
+		XTPermRowLockRec perm_lock;
+		
+#ifdef XT_USE_TABLE_REF
+		perm_lock.pr_table = ot->ot_table;
+		xt_heap_reference(NULL, perm_lock.pr_table);
+#else
+		perm_lock.pr_tab_id = ot->ot_table->tab_id;
+#endif
+		memset(perm_lock.pr_group, 0, XT_ROW_LOCK_GROUP_COUNT);
+		perm_lock.pr_group[row_id % XT_ROW_LOCK_GROUP_COUNT] = 1;
+		if (!xt_bl_append(NULL, lock_list, &perm_lock)) {
+#ifdef XT_USE_TABLE_REF
+			/* The entry did not make it into the list, so nothing
+			 * will release the reference taken above: drop it here.
+			 */
+			xt_heap_release(NULL, perm_lock.pr_table);
+#endif
+			xt_remove_temp_lock(ot, FALSE);
+			return FAILED;
+		}
+	}
+
+	done:
+	group = &rl_groups[row_id % XT_ROW_LOCK_GROUP_COUNT];
+	xt_spinlock_lock(&group->lg_lock);
+
+	item = (XTLockItemPtr) xt_bsearch(NULL, &row_id, group->lg_list, group->lg_list_in_use, sizeof(XTLockItemRec), &index, NULL, xlock_cmp_row_ids);
+	ASSERT_NS(item);
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+	if (item) {
+		/* Lock exists (it should!). */
+		if (item->li_thread_id == ot->ot_thread->t_id &&
+			item->li_count == XT_TEMP_LOCK_BYTES) {
+			/* Is the item to the left a range of this thread that
+			 * ends directly before this row and still has room?
+			 */
+			if (index > 0 &&
+				group->lg_list[index-1].li_thread_id == ot->ot_thread->t_id &&
+				group->lg_list[index-1].li_count < XT_TEMP_LOCK_BYTES-2 &&
+				group->lg_list[index-1].li_row_id == row_id - (XT_ROW_LOCK_GROUP_COUNT * group->lg_list[index-1].li_count)) {
+				group->lg_list[index-1].li_count++;
+				/* Combine with the left: */
+				if (index + 1 < group->lg_list_in_use &&
+					group->lg_list[index+1].li_thread_id == ot->ot_thread->t_id &&
+					group->lg_list[index+1].li_count != XT_TEMP_LOCK_BYTES &&
+					group->lg_list[index+1].li_row_id == row_id + XT_ROW_LOCK_GROUP_COUNT) {
+					/* And combine with the right */
+					u_int left = group->lg_list[index-1].li_count + group->lg_list[index+1].li_count;
+					u_int right;
+
+					/* One item holds at most XT_TEMP_LOCK_BYTES-1 rows,
+					 * the rest stays in the item on the right:
+					 */
+					if (left > XT_TEMP_LOCK_BYTES-1) {
+						right = left - (XT_TEMP_LOCK_BYTES-1);
+						left = XT_TEMP_LOCK_BYTES-1;
+					}
+					else
+						right = 0;
+
+					group->lg_list[index-1].li_count = left;
+					if (right) {
+						/* There is something left over on the right: */
+						group->lg_list[index+1].li_count = right;
+						group->lg_list[index+1].li_row_id = group->lg_list[index-1].li_row_id + left * XT_ROW_LOCK_GROUP_COUNT;
+						XT_MEMMOVE(group->lg_list, &group->lg_list[index], 
+							&group->lg_list[index+1], (group->lg_list_in_use - index - 1) * sizeof(XTLockItemRec));
+						group->lg_list_in_use--;
+					}
+					else {
+						/* Both this item and the one on the right are gone: */
+						XT_MEMMOVE(group->lg_list, &group->lg_list[index], 
+							&group->lg_list[index+2], (group->lg_list_in_use - index - 2) * sizeof(XTLockItemRec));
+						group->lg_list_in_use -= 2;
+					}
+				}
+				else {
+					XT_MEMMOVE(group->lg_list, &group->lg_list[index], 
+						&group->lg_list[index+1], (group->lg_list_in_use - index - 1) * sizeof(XTLockItemRec));
+					group->lg_list_in_use--;
+				}
+			}
+			else if (index + 1 < group->lg_list_in_use &&
+					group->lg_list[index+1].li_thread_id == ot->ot_thread->t_id &&
+					group->lg_list[index+1].li_count < XT_TEMP_LOCK_BYTES-2 &&
+					group->lg_list[index+1].li_row_id == row_id + XT_ROW_LOCK_GROUP_COUNT) {
+				/* Combine with the right: */
+				group->lg_list[index+1].li_count++;
+				group->lg_list[index+1].li_row_id = row_id;
+				XT_MEMMOVE(group->lg_list, &group->lg_list[index], 
+					&group->lg_list[index+1], (group->lg_list_in_use - index - 1) * sizeof(XTLockItemRec));
+				group->lg_list_in_use--;
+			}
+			else
+				/* Nothing to combine with: a permanent lock on one row. */
+				group->lg_list[index].li_count = 1;
+		}
+	}
+#ifdef CHECK_ROWLOCK_GROUP_CONSISTENCY
+	check_rowlock_group(group);
+#endif
+	xt_spinlock_unlock(&group->lg_lock);
+
+	ot->ot_temp_row_lock = 0;
+	return OK;
+}
+
+/*
+ * Initialise all lock groups of 'rl': the spin-lock is set up, the wait
+ * queue and the lock list are set to empty. Always returns OK.
+ */
+xtBool xt_init_row_locks(XTRowLocksPtr rl)
+{
+	for (int i=0; i<XT_ROW_LOCK_GROUP_COUNT; i++) {
+		xt_spinlock_init_with_autoname(NULL, &rl->rl_groups[i].lg_lock);
+		rl->rl_groups[i].lg_wait_queue = NULL;
+		/* The end of the queue is read when a waiter is queued,
+		 * so it must be initialised along with the start.
+		 */
+		rl->rl_groups[i].lg_wait_queue_end = NULL;
+		rl->rl_groups[i].lg_list_size = 0;
+		rl->rl_groups[i].lg_list_in_use = 0;
+		rl->rl_groups[i].lg_list = NULL;
+	}
+	return OK;
+}
+
+/*
+ * Tear down all lock groups of 'rl': the spin-lock is freed, the
+ * bookkeeping is reset and the lock list (if any) is released.
+ */
+void xt_exit_row_locks(XTRowLocksPtr rl)
+{
+	XTLockGroupPtr grp = rl->rl_groups;
+
+	for (int i=0; i<XT_ROW_LOCK_GROUP_COUNT; i++, grp++) {
+		xt_spinlock_free(NULL, &grp->lg_lock);
+		grp->lg_wait_queue = NULL;
+		grp->lg_list_size = 0;
+		grp->lg_list_in_use = 0;
+
+		/* Nothing allocated for this group: */
+		if (!grp->lg_list)
+			continue;
+
+		xt_free_ns(grp->lg_list);
+		grp->lg_list = NULL;
+	}
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * ROW LOCKS, HASH BASED
+ */
+#else // XT_USE_LIST_BASED_ROW_LOCKS
+
+/*
+ * Hash based variant: clear all locks recorded in this list, then empty
+ * the list (bl_count is set to 0).
+ *
+ * The entries are processed from the last to the first. The open table is
+ * kept from one entry to the next as long as the table ID stays the same,
+ * and returned to the pool otherwise (and at the end).
+ */
+void XTRowLockList::old_xt_remove_all_locks(struct XTDatabase *db, xtThreadID thd_id)
+{
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "remove all locks\n");
+#endif
+	if (!bl_count)
+		return;
+
+	int					pgroup;
+	xtTableID			ptab_id;
+	XTPermRowLockPtr	plock;
+	XTOpenTablePtr		pot = NULL;
+
+	/* Start just behind the last entry and step backwards: */
+	plock = (XTPermRowLockPtr) &bl_data[bl_count * bl_item_size];
+	for (u_int i=0; i<bl_count; i++) {
+		plock--;
+		pgroup = plock->pr_group;
+		ptab_id = plock->pr_tab_id;
+		if (pot) {
+			/* Still the same table, no need to open it again: */
+			if (pot->ot_table->tab_id == ptab_id)
+				goto remove_lock;
+			xt_db_return_table_to_pool_ns(pot);
+			pot = NULL;
+		}
+
+		if (!xt_db_open_pool_table_ns(&pot, db, ptab_id)) {
+			/* Should not happen, but just in case, we just don't
+			 * remove the lock. We will probably end up with a deadlock
+			 * somewhere.
+			 */
+			xt_log_and_clear_exception_ns();
+			goto skip_remove_lock;
+		}
+		if (!pot)
+			/* Can happen if the table has been dropped: */
+			goto skip_remove_lock;
+
+		remove_lock:
+#ifdef XT_TRACE_LOCKS
+		xt_ttracef(xt_get_self(), "remove lock group=%d\n", pgroup);
+#endif
+		pot->ot_table->tab_locks.tab_row_locks[pgroup] = NULL;
+		pot->ot_table->tab_locks.tab_lock_perm[pgroup] = 0;
+		skip_remove_lock:;
+	}
+	bl_count = 0;
+
+	if (pot)
+		xt_db_return_table_to_pool_ns(pot);
+}
+
+/* Try to lock a row.
+ * This function returns:
+ * XT_NO_LOCK on success.
+ * XT_TEMP_LOCK if there is a temporary lock on the row.
+ * XT_PERM_LOCK if there is a permanent lock in the row.
+ *
+ * If there is a lock on this row, the transaction ID of the
+ * locker is also returned.
+ *
+ * The caller must wait if the row is locked. If the lock is
+ * permanent, then the caller must wait for the transaction to
+ * terminate. If the lock is temporary, then the caller must
+ * wait for the transaction to signal that the lock has been
+ * released.
+ */
+/*
+ * Hash based variant of setting a temporary lock on 'row'.
+ *
+ * Returns XT_NO_LOCK if the lock slot of the row was free (it is then taken
+ * for the transaction of 'ot' and the row is remembered in
+ * ot->ot_temp_row_lock), or if the slot or the temporary lock is already
+ * held by the caller. Otherwise returns XT_PERM_LOCK or XT_TEMP_LOCK and
+ * stores the start transaction ID of the holder in *xn_id.
+ *
+ * A temporary lock already held on a different row is first made permanent.
+ */
+int XTRowLocks::old_xt_set_temp_lock(XTOpenTablePtr ot, xtRowID row, xtXactID *xn_id, XTRowLockListPtr lock_list)
+{
+	int				group;
+	XTXactDataPtr	xact, my_xact;
+
+	if (ot->ot_temp_row_lock) {
+		/* Check if we don't already have this temp lock: */
+		/* (The result is reported by the return value only; this
+		 * function has no lock wait structure to update.)
+		 */
+		if (ot->ot_temp_row_lock == row)
+			return XT_NO_LOCK;
+
+		xt_make_lock_permanent(ot, lock_list);
+	}
+
+	my_xact = ot->ot_thread->st_xact_data;
+	group = row % XT_ROW_LOCK_COUNT;
+	if ((xact = tab_row_locks[group])) {
+		if (xact == my_xact)
+			return XT_NO_LOCK;
+		*xn_id = xact->xd_start_xn_id;
+		return tab_lock_perm[group] ? XT_PERM_LOCK : XT_TEMP_LOCK;
+	}
+
+	tab_row_locks[row % XT_ROW_LOCK_COUNT] = my_xact;
+
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "set temp lock %d group=%d for %s\n", (int) row, (int) row % XT_ROW_LOCK_COUNT, ot->ot_thread->t_name);
+#endif
+	ot->ot_temp_row_lock = row;
+	return XT_NO_LOCK;
+}
+
+/* Just check if there is a lock on the row.
+ * This function returns:
+ * XT_NO_LOCK if there is no lock.
+ * XT_TEMP_LOCK if there is a temporary lock on the row.
+ * XT_PERM_LOCK if a lock is a permanent lock in the row.
+ */
+int XTRowLocks::old_xt_is_locked(struct XTOpenTable *ot, xtRowID row, xtXactID *xn_id)
+{
+	int				slot = row % XT_ROW_LOCK_COUNT;
+	XTXactDataPtr	holder = tab_row_locks[slot];
+
+	/* The slot is free, or it is held by the caller's own transaction: */
+	if (!holder || holder == ot->ot_thread->st_xact_data)
+		return XT_NO_LOCK;
+
+	/* Held by somebody else, report who and how: */
+	*xn_id = holder->xd_start_xn_id;
+	return tab_lock_perm[slot] ? XT_PERM_LOCK : XT_TEMP_LOCK;
+}
+
+/*
+ * Hash based variant: give up the temporary lock of 'ot' (if any) and
+ * wake up waiting transactions when the database has any.
+ */
+void XTRowLocks::old_xt_remove_temp_lock(XTOpenTablePtr ot)
+{
+	if (!ot->ot_temp_row_lock)
+		return;
+
+	XTXactDataPtr	my_xact = ot->ot_thread->st_xact_data;
+	int				group = ot->ot_temp_row_lock % XT_ROW_LOCK_COUNT;
+
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "remove temp lock %d group=%d\n", (int) ot->ot_temp_row_lock, (int) ot->ot_temp_row_lock % XT_ROW_LOCK_COUNT);
+#endif
+	ot->ot_temp_row_lock = 0;
+
+	/* Only free the slot if it really belongs to our transaction: */
+	XTXactDataPtr	holder = tab_row_locks[group];
+
+	if (holder && holder == my_xact)
+		tab_row_locks[group] = NULL;
+
+	if (ot->ot_table->tab_db->db_xn_wait_count)
+		xt_xn_wakeup_transactions(ot->ot_table->tab_db, ot->ot_thread);
+}
+
+/*
+ * Hash based variant: make the temporary lock of 'ot' permanent.
+ *
+ * If the lock group is not yet marked permanent, an entry (table ID and
+ * group) is appended to 'lock_list' and the group is marked permanent.
+ * Returns OK on success or if there is no temporary lock, FAILED if the
+ * entry could not be appended.
+ */
+xtBool XTRowLocks::old_xt_make_lock_permanent(XTOpenTablePtr ot, XTRowLockListPtr lock_list)
+{
+	int group;
+
+	if (!ot->ot_temp_row_lock)
+		return OK;
+
+#ifdef XT_TRACE_LOCKS
+	xt_ttracef(xt_get_self(), "make lock perm %d group=%d\n", (int) ot->ot_temp_row_lock, (int) ot->ot_temp_row_lock % XT_ROW_LOCK_COUNT);
+#endif
+	/* Check if the lock is already permanent: */
+	group = ot->ot_temp_row_lock % XT_ROW_LOCK_COUNT;
+	if (!tab_lock_perm[group]) {
+		XTPermRowLockRec plock;
+
+		plock.pr_tab_id = ot->ot_table->tab_id;
+		plock.pr_group = group;
+		if (!xt_bl_append(NULL, lock_list, &plock)) {
+			/* NOTE(review): one-argument call, while the list based
+			 * xt_remove_temp_lock() takes (ot, updated); presumably
+			 * old_xt_remove_temp_lock() is meant -- confirm.
+			 */
+			xt_remove_temp_lock(ot);
+			return FAILED;
+		}
+		tab_lock_perm[group] = 1;
+	}
+
+	ot->ot_temp_row_lock = 0;
+	return OK;
+}
+
+/* Release this lock, and all locks gained after this lock
+ * on this table.
+ *
+ * The locks are only released temporarily. They will be regained
+ * below using regain locks.
+ *
+ * Returns:
+ * XT_NO_LOCK if no lock is released.
+ * XT_PERM_LOCK if a lock is released.
+ *
+ * Note that only permanent locks can be released in this way.
+ * So if the thread has a temporary lock, it will first be made
+ * permanent.
+ *
+ * {RELEASING-LOCKS}
+ * The idea of the releasing locks comes from the fact that each
+ * lock, locks a group of records.
+ * So if T1 has a lock (e.g. when doing SELECT FOR UPDATE),
+ * and then encounters an updated record x
+ * from T2, and it must wait for T2, it first releases the
+ * lock, just in case T2 tries to gain a lock on another
+ * record y in the same group, which will cause it to
+ * wait on T1.
+ *
+ * However, there are several problems with releasing
+ * locks.
+ * - It can cause a "live-lock", where another transaction
+ * keeps getting in before.
+ * - It may not solve the problem in all cases because
+ * the SELECT FOR UPDATE has locked other record groups
+ * before it encountered record x.
+ * - Further problems occur when locks are granted by
+ * callback:
+ * T1 waits for T2, because it has a lock on record x
+ * T2 releases the lock because it must wait for T3
+ * T1 is granted the lock (but does not know about this yet)
+ * T2 tries to regain lock (after T3 quits) and
+ * must wait for T1 - DEADLOCK
+ *
+ * In general, it does not make sense to release locks
+ * when it can be granted again by a callback.
+ *
+ * TODO: 2 possible solutions:
+ * - Do not lock groups, lock rows.
+ *   UPDATE INTENSION ROW LOCK
+ * - Use multiple lock types:
+ *   UPDATE INTENSION LOCK (required first)
+ *   SHARED UPDATE LOCK (used by INSERT or DELETE)
+ *   EXCLUSIVE UPDATE LOCK (used by SELECT FOR UPDATE)
+ *
+ * Temporary solution. Do not release any locks.
+int XTRowLocks::xt_release_locks(struct XTOpenTable *ot, xtRowID row, XTRowLockListPtr lock_list)
+ */ 
+
+/*
+ * Regain a lock previously held. This function regains locks
+ * released by xt_release_locks().
+ *
+ * It will return lock_type and xn_id if the row is locked, and therefore
+ * regain cannot continue. In this case, the caller must wait.
+ * It returns XT_NO_LOCK if there are no more locks to be regained.
+ *
+ * Locks are always regained in the order in which they were originally
+ * taken.
+xtBool XTRowLocks::xt_regain_locks(struct XTOpenTable *ot, int *lock_type, xtXactID *xn_id, XTRowLockListPtr lock_list)
+ */
+
+/*
+ * Hash based variant: clear the permanent flags and the lock slots of 'rl'.
+ * Always returns OK.
+ */
+xtBool old_xt_init_row_locks(XTRowLocksPtr rl)
+{
+	/* NOTE(review): the size given is a count, so this assumes one byte per tab_lock_perm entry -- confirm. */
+	memset(rl->tab_lock_perm, 0, XT_ROW_LOCK_COUNT);
+	memset(rl->tab_row_locks, 0, XT_ROW_LOCK_COUNT * sizeof(XTXactDataPtr));
+	return OK;
+}
+
+/* Hash based variant: there is nothing to release, 'rl' is not used. */
+void old_xt_exit_row_locks(XTRowLocksPtr XT_UNUSED(rl))
+{
+}
+
+#endif // XT_USE_LIST_BASED_ROW_LOCKS
+
+/*
+ * Set up 'lock_list' as an empty list of XTPermRowLockRec entries.
+ * Always returns OK.
+ */
+xtPublic xtBool xt_init_row_lock_list(XTRowLockListPtr lock_list)
+{
+	/* No storage and no entries yet: */
+	lock_list->bl_data = NULL;
+	lock_list->bl_count = 0;
+	lock_list->bl_size = 0;
+
+	/* Every entry is one permanent row lock record: */
+	lock_list->bl_item_size = sizeof(XTPermRowLockRec);
+	return OK;
+}
+
+/*
+ * Shrink 'lock_list' to size 0.
+ * NOTE(review): presumably xt_bl_set_size() with 0 frees the storage of the list -- confirm.
+ */
+xtPublic void xt_exit_row_lock_list(XTRowLockListPtr lock_list)
+{
+	xt_bl_set_size(NULL, lock_list, 0);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * SPECIAL EXCLUSIVE/SHARED (XS) LOCK
+ */
+
+/*
+ * Initialise the exclusive/shared lock 'xsl': mutex, condition, state
+ * (set to 0), no exclusive locker, and a zeroed array with one byte per
+ * possible thread (xt_thr_maximum_threads) for the read locks.
+ * With XT_THREAD_LOCK_INFO the lock also gets the name 'n'.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl, const char *n)
+#else
+xtPublic void xt_rwmutex_init(XTThreadPtr self, XTRWMutexPtr xsl)
+#endif
+{
+#ifdef DEBUG
+	xsl->xs_lock_thread = 0;
+	/* Marker checked by the other functions to detect an uninitialised lock. */
+	xsl->xs_inited = 12345;
+#endif
+	xt_init_mutex_with_autoname(self, &xsl->xs_lock);
+	xt_init_cond(self, &xsl->xs_cond);
+	xt_atomic_set4(&xsl->xs_state, 0);
+	xsl->xs_xlocker = 0;
+	/* Must be aligned! */
+	ASSERT(xt_thr_maximum_threads == xt_align_size(xt_thr_maximum_threads, XT_XS_LOCK_ALIGN));
+	xsl->x.xs_rlock = (xtWord1 *) xt_calloc(self, xt_thr_maximum_threads);
+#ifdef XT_THREAD_LOCK_INFO
+	xsl->xs_name = n;
+	xt_thread_lock_info_init(&xsl->xs_lock_info, xsl);
+#endif
+}
+
+/*
+ * Release the resources of 'xsl': the read lock array (if allocated),
+ * the mutex and the condition. In DEBUG builds it is asserted that the
+ * lock was initialised and is not recorded as held.
+ */
+xtPublic void xt_rwmutex_free(XTThreadPtr self, XTRWMutexPtr xsl)
+{
+#ifdef DEBUG
+	ASSERT(!xsl->xs_lock_thread);
+	ASSERT(xsl->xs_inited == 12345);
+	xsl->xs_inited = 0;
+#endif
+	if (xsl->x.xs_rlock)
+		xt_free(self, (void *) xsl->x.xs_rlock);
+	xt_free_mutex(&xsl->xs_lock);
+	xt_free_cond(&xsl->xs_cond);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&xsl->xs_lock_info);
+#endif
+}
+
+/*
+ * Take the exclusive lock on 'xsl' for the calling thread 'thd_id'.
+ *
+ * With xs_lock held, the function waits until there is no exclusive
+ * locker, registers itself in xs_xlocker, and then goes through the read
+ * lock slots 0 .. xt_thr_current_max_threads-1, waiting for each slot to
+ * become zero. xs_state is the index of the slot being checked; when all
+ * slots have been checked it is set to xt_thr_maximum_threads.
+ *
+ * Returns OK with xs_lock still held.
+ * NOTE(review): presumably xt_rwmutex_unlock() releases it -- confirm.
+ * Returns FAILED if a timed wait fails; xs_lock is then released, and if
+ * the exclusive lock was already registered it is reset.
+ */
+xtPublic xtBool xt_rwmutex_xlock(XTRWMutexPtr xsl, xtThreadID thd_id)
+{
+#ifdef DEBUG
+	ASSERT_NS(xsl->xs_inited == 12345);
+#endif
+	ASSERT_NS(xt_get_self()->t_id == thd_id);
+	xt_lock_mutex_ns(&xsl->xs_lock);
+	ASSERT_NS(xsl->x.xs_rlock[thd_id] == XT_NO_LOCK);
+	
+	/* Wait for exclusive locker: */
+	while (xsl->xs_xlocker) {
+		if (!xt_timed_wait_cond_ns(&xsl->xs_cond, &xsl->xs_lock, 10000)) {
+			xt_unlock_mutex_ns(&xsl->xs_lock);
+			return FAILED;
+		}
+	}
+
+	/* I am the locker (set state before locker!): */
+	xt_atomic_set4(&xsl->xs_state, 0);
+	xsl->xs_xlocker = thd_id;
+
+	/* Wait for all the read lockers: */
+	while (xsl->xs_state < xt_thr_current_max_threads) {
+		while (xsl->x.xs_rlock[xsl->xs_state]) {
+			/* {RACE-WR_MUTEX}
+			 * Just in case of this, we keep the wait time down!
+			 */
+			if (!xt_timed_wait_cond_ns(&xsl->xs_cond, &xsl->xs_lock, 10)) {
+				/* Give up the exclusive lock again: */
+				xt_atomic_set4(&xsl->xs_state, 0);
+				xsl->xs_xlocker = 0;
+				xt_unlock_mutex_ns(&xsl->xs_lock);
+				return FAILED;
+			}
+		}
+		/* State can be incremented in parallel by a reader
+		 * thread!
+		 */
+		xt_atomic_set4(&xsl->xs_state, xsl->xs_state + 1);
+	}
+
+	/* I have waited for all: */
+	xt_atomic_set4(&xsl->xs_state, xt_thr_maximum_threads);
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&xsl->xs_lock_info);
+#endif
+
+	return OK;
+}
+
+/*
+ * Take a shared lock on behalf of thread 'thd_id', which must be the
+ * calling thread.
+ *
+ * Each thread owns one counter in xs_rlock[]; incrementing it announces
+ * the read lock. Shared locks nest: if the counter is greater than 1 after
+ * the increment, the call returns at once.
+ *
+ * Returns OK when the shared lock is held, or FAILED (with the counter
+ * decremented again) if the timed wait reports failure.
+ */
+xtPublic xtBool xt_rwmutex_slock(XTRWMutexPtr xsl, xtThreadID thd_id)
+{
+#ifdef DEBUG
+	ASSERT_NS(xsl->xs_inited == 12345);
+#endif
+	ASSERT_NS(xt_get_self()->t_id == thd_id);
+
+	xt_atomic_inc1(&xsl->x.xs_rlock[thd_id]);
+
+	/* Nested shared lock: nothing more to do. */
+	if (xsl->x.xs_rlock[thd_id] > 1)
+		return OK;
+
+	/* Check if there could be an X locker: */
+	if (xsl->xs_xlocker) {
+		/* There is an X locker.
+		 * If xs_state < thd_id the X locker has not yet checked my
+		 * slot; it will find my counter set and wait for me.
+		 * So I should not wait!
+		 */
+		if (xsl->xs_state >= thd_id) {
+			/* xs_state >= thd_id: the X locker may already have
+			 * passed my slot. I take the mutex to find out exactly
+			 * how far he is with the checking:
+			 *
+			 * - xs_state > thd_id: my slot has been checked, so I
+			 *   have to wait until the X locker is gone.
+			 * - xs_state == thd_id: the X locker is at my slot and
+			 *   will wait for me (or is already waiting for me),
+			 *   so I can continue.
+			 */
+			xt_lock_mutex_ns(&xsl->xs_lock);
+			while (xsl->xs_state > thd_id && xsl->xs_xlocker) {
+				if (!xt_timed_wait_cond_ns(&xsl->xs_cond, &xsl->xs_lock, 10000)) {
+					xt_unlock_mutex_ns(&xsl->xs_lock);
+					/* Take back the announcement made above. */
+					xsl->x.xs_rlock[thd_id]--;
+					return FAILED;
+				}
+			}
+			xt_unlock_mutex_ns(&xsl->xs_lock);
+		}
+	}
+
+	/* There is no exclusive locker, so we have the read lock: */
+	ASSERT_NS(xsl->xs_state != xt_thr_maximum_threads);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&xsl->xs_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release the lock held by thread 'thd_id', which must be the calling
+ * thread.
+ *
+ * If thd_id is the registered X locker the exclusive lock is dropped: the
+ * state and the X locker ID are cleared, the mutex taken in
+ * xt_rwmutex_xlock() is released and all waiters are woken. Otherwise one
+ * level of the caller's shared lock is released.
+ *
+ * Returns OK, or FAILED if xt_broadcast_cond_ns() fails.
+ * NOTE(review): on the shared-lock FAILED path the read-lock counter has
+ * not been decremented - confirm that this is intended.
+ */
+xtPublic xtBool xt_rwmutex_unlock(XTRWMutexPtr xsl, xtThreadID thd_id)
+{
+#ifdef DEBUG
+	ASSERT_NS(xsl->xs_inited == 12345);
+#endif
+	ASSERT_NS(xt_get_self()->t_id == thd_id);
+	if (xsl->xs_xlocker == thd_id) {
+		/* I have an X lock. */
+		ASSERT_NS(xsl->x.xs_rlock[thd_id] == XT_NO_LOCK);
+		ASSERT_NS(xsl->xs_state == xt_thr_maximum_threads);
+		xt_atomic_set4(&xsl->xs_state, 0);
+		xsl->xs_xlocker = 0;
+		xt_unlock_mutex_ns(&xsl->xs_lock);
+		/* Wake up any other X or shared lockers: */
+		if (!xt_broadcast_cond_ns(&xsl->xs_cond))
+			return FAILED;
+	}
+	else {
+		/* I have a shared lock: */
+		ASSERT_NS(xsl->x.xs_rlock[thd_id] > 0);
+		ASSERT_NS(xsl->xs_state != xt_thr_maximum_threads); /* TODO: PMC - HOW can this fail?! - but it does? */
+		if (xsl->x.xs_rlock[thd_id] > 1)
+			/* Nested shared lock: just drop one level. */
+			xsl->x.xs_rlock[thd_id]--;
+		else {
+			/* {RACE-WR_MUTEX}.
+			 * A BUG FIX:
+			 *
+			 * Previously I was checking "xsl->xs_xlocker" after
+			 * decrementing the READ lock.
+			 *
+			 * This resulted in a race condition that could cause the
+			 * unlocking reader to hang in xt_lock_mutex_ns().
+			 * This was because the X locker grabbed the mutex (xs_lock)
+			 * but did not wait for the reader.
+			 *
+			 * The result was that the reader had to wait in UNLOCK
+			 * until the X locker did an unlock!
+			 *
+			 * This only became obvious when it caused a deadlock (because
+			 * the reader was waiting for the locker, which it should not
+			 * have been, of course).
+			 */
+			if (xsl->xs_xlocker) {
+				xt_lock_mutex_ns(&xsl->xs_lock);
+				if (xsl->xs_xlocker && xsl->xs_state == thd_id) {
+					/* If the X locker is waiting for me,
+					 * then allow him to continue.
+					 */
+					if (!xt_broadcast_cond_ns(&xsl->xs_cond)) {
+						xt_unlock_mutex_ns(&xsl->xs_lock);
+						return FAILED;
+					}
+				}
+				/* The counter is dropped while the mutex is held. */
+				xt_atomic_dec1(&xsl->x.xs_rlock[thd_id]);
+				xt_unlock_mutex_ns(&xsl->xs_lock);
+			}
+			else
+				/* {RACE-WR_MUTEX}
+				 * There is a race condition between the check above and
+				 * the decrement here.
+				 *
+				 * However, if I check xsl->xs_xlocker afterwards, and then
+				 * try to get the lock xs_lock, I could hang for the duration
+				 * of the X lock.
+				 */
+				xt_atomic_dec1(&xsl->x.xs_rlock[thd_id]);
+		}
+	}
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&xsl->xs_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * SPIN LOCK
+ */
+
+/*
+ * Put a spin lock into its unlocked state.
+ *
+ * With XT_NO_ATOMICS defined a mutex is initialised as well. With
+ * XT_THREAD_LOCK_INFO defined, 'n' is recorded as the name of the lock.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_spinlock_init(XTThreadPtr self, XTSpinLockPtr spl, const char *n)
+#else
+xtPublic void xt_spinlock_init(XTThreadPtr self, XTSpinLockPtr spl)
+#endif
+{
+	/* 'self' is only used by the XT_NO_ATOMICS variant below. */
+	(void) self;
+
+#ifdef DEBUG
+	spl->spl_locker = 0;
+#endif
+	spl->spl_lock = 0;
+
+#ifdef XT_NO_ATOMICS
+	xt_init_mutex_with_autoname(self, &spl->spl_mutex);
+#endif
+
+#ifdef XT_THREAD_LOCK_INFO
+	spl->spl_name = n;
+	xt_thread_lock_info_init(&spl->spl_lock_info, spl);
+#endif
+}
+
+/*
+ * Release the resources of a spin lock: the mutex when XT_NO_ATOMICS is
+ * defined, and the lock info when XT_THREAD_LOCK_INFO is defined.
+ * With neither macro defined there is nothing to free.
+ */
+xtPublic void xt_spinlock_free(XTThreadPtr XT_UNUSED(self), XTSpinLockPtr spl)
+{
+	/* Keeps 'spl' referenced when both conditional blocks compile away. */
+	(void) spl;
+#ifdef XT_NO_ATOMICS
+	xt_free_mutex(&spl->spl_mutex);
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&spl->spl_lock_info);
+#endif
+}
+
+/*
+ * Spin until the lock has been acquired.
+ *
+ * The lock word is polled in bursts of 10; between bursts the thread
+ * calls xt_critical_wait(). The function only ever returns OK.
+ */
+xtPublic xtBool xt_spinlock_spin(XTSpinLockPtr spl)
+{
+	volatile xtWord4	*lock_word = &spl->spl_lock;
+
+	for (;;) {
+		for (int attempt=0; attempt<10; attempt++) {
+			/* Only try to set the lock when it looks free: */
+			if (*lock_word)
+				continue;
+			if (!xt_spinlock_set(spl))
+				return OK;
+		}
+
+		/* Go to "sleep" */
+		xt_critical_wait();
+	}
+}
+
+#ifdef DEBUG
+/* DEBUG builds only: record the calling thread as the holder of 'spl'. */
+xtPublic void xt_spinlock_set_thread(XTSpinLockPtr spl)
+{
+	spl->spl_locker = xt_get_self();
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * FAST LOCK
+ */
+
+/*
+ * Initialise a fast lock: its two spin locks (the lock itself and the one
+ * protecting the wait list), an empty wait list and the wait list
+ * bookkeeping. With XT_THREAD_LOCK_INFO defined, 'n' is recorded as the
+ * name of the lock.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_fastlock_init(XTThreadPtr self, XTFastLockPtr fal, const char *n)
+#else
+xtPublic void xt_fastlock_init(XTThreadPtr self, XTFastLockPtr fal)
+#endif
+{
+	u_int slot = 0;
+
+	xt_spinlock_init_with_autoname(self, &fal->fal_spinlock);
+	xt_spinlock_init_with_autoname(self, &fal->fal_wait_lock);
+
+	/* Nobody is waiting yet: */
+	while (slot < XT_FAST_LOCK_MAX_WAIT) {
+		fal->fal_wait_list[slot] = NULL;
+		slot++;
+	}
+	fal->fal_wait_alloc = 0;
+	fal->fal_wait_wakeup = 0;
+	fal->fal_wait_count = 0;
+
+#ifdef XT_THREAD_LOCK_INFO
+	fal->fal_name = n;
+	xt_thread_lock_info_init(&fal->fal_lock_info, fal);
+#endif
+}
+
+/*
+ * Free a fast lock: both embedded spin locks and, when
+ * XT_THREAD_LOCK_INFO is defined, the lock info.
+ */
+xtPublic void xt_fastlock_free(XTThreadPtr self, XTFastLockPtr fal)
+{
+	xt_spinlock_free(self, &fal->fal_spinlock);
+	xt_spinlock_free(self, &fal->fal_wait_lock);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&fal->fal_lock_info);
+#endif
+}
+
+/*
+ * Acquire a fast lock on behalf of 'thread', spinning first and queueing
+ * afterwards.
+ *
+ * Step 1: poll the lock word 10 times.
+ * Step 2: poll another 10 times, calling xt_critical_wait() before each try.
+ * Step 3: enter 'thread' in the wait list and block in xt_wait_thread()
+ *         until xt_fastlock_wakeup() hands the lock over by setting
+ *         fal_locker to this thread. If the thread wakes up without
+ *         being the locker, all steps are repeated.
+ *
+ * Returns OK when 'thread' owns the lock. Returns FAILED if the wait list
+ * already holds XT_FAST_LOCK_MAX_WAIT threads, or if xt_wait_thread() fails.
+ */
+xtPublic xtBool xt_fastlock_spin(XTFastLockPtr fal, XTThreadPtr thread)
+{
+	volatile xtWord4	*lck = &fal->fal_spinlock.spl_lock;
+
+	do {
+		for (int i=0; i<10; i++) {
+			/* Check the lock variable: */
+			if (!*lck) {
+				/* Try to get the lock: */
+				if (!xt_spinlock_set(&fal->fal_spinlock)) {
+					fal->fal_locker = thread;
+					return OK;
+				}
+			}
+		}
+
+		for (int i=0; i<10; i++) {
+			xt_critical_wait();
+			if (!*lck) {
+				/* Try to get the lock: */
+				if (!xt_spinlock_set(&fal->fal_spinlock)) {
+					fal->fal_locker = thread;
+					return OK;
+				}
+			}
+		}
+
+		/* Wait for a wakeup */
+		xt_spinlock_lock(&fal->fal_wait_lock);
+		if (fal->fal_wait_count == XT_FAST_LOCK_MAX_WAIT) {
+			xt_register_ulxterr(XT_REG_CONTEXT, XT_ERR_TOO_MANY_WAITERS, (u_long) XT_FAST_LOCK_MAX_WAIT+1);
+			xt_spinlock_unlock(&fal->fal_wait_lock);
+			return FAILED;
+		}
+		/* Find a free slot; one exists because the list is not full. */
+		while (fal->fal_wait_list[fal->fal_wait_alloc])
+			fal->fal_wait_alloc = (fal->fal_wait_alloc + 1) % XT_FAST_LOCK_MAX_WAIT;
+		fal->fal_wait_list[fal->fal_wait_alloc] = thread;
+		fal->fal_wait_alloc = (fal->fal_wait_alloc + 1) % XT_FAST_LOCK_MAX_WAIT;
+		fal->fal_wait_count++;
+		/* The thread is locked before the wait list lock is dropped;
+		 * xt_fastlock_wakeup() locks the thread before signalling it.
+		 */
+		xt_lock_thread(thread);
+		xt_spinlock_unlock(&fal->fal_wait_lock);
+		if (!xt_wait_thread(thread)) {
+			xt_unlock_thread(thread);
+			/* The lock may have been handed to me in the meantime: */
+			if (fal->fal_locker == thread)
+				xt_fastlock_unlock(fal, thread);
+			/* NOTE(review): if the lock was not handed over, this
+			 * thread's entry appears to stay in fal_wait_list - confirm.
+			 */
+			return FAILED;
+		}
+		xt_unlock_thread(thread);
+	} while (fal->fal_locker != thread);
+	return OK;
+}
+
+/* Wake up one of the waiters.
+ *
+ * If a thread is queued in the wait list, it is removed from the list,
+ * made the new locker (the spin lock itself is not reset) and signalled.
+ * If nobody is waiting, the locker is cleared and the spin lock is reset.
+ */
+xtPublic void xt_fastlock_wakeup(XTFastLockPtr fal)
+{
+	xt_spinlock_lock(&fal->fal_wait_lock);
+	if (fal->fal_wait_count) {
+		XTThreadPtr thread;
+
+		/* Find a waiting thread, and give it the exclusive lock: */
+		while (!fal->fal_wait_list[fal->fal_wait_wakeup])
+			fal->fal_wait_wakeup = (fal->fal_wait_wakeup + 1) % XT_FAST_LOCK_MAX_WAIT;
+		thread = fal->fal_wait_list[fal->fal_wait_wakeup];
+		fal->fal_wait_list[fal->fal_wait_wakeup] = NULL;
+		fal->fal_wait_wakeup = (fal->fal_wait_wakeup + 1) % XT_FAST_LOCK_MAX_WAIT;
+		fal->fal_wait_count--;
+		fal->fal_locker = thread;
+
+		/* The thread is locked before the wait list lock is released,
+		 * and signalled while it is locked.
+		 */
+		xt_lock_thread(thread);
+		xt_spinlock_unlock(&fal->fal_wait_lock);
+		xt_signal_thread(thread);
+		xt_unlock_thread(thread);
+	}
+	else {
+		xt_spinlock_unlock(&fal->fal_wait_lock);
+		fal->fal_locker = NULL;
+		xt_spinlock_reset(&fal->fal_spinlock);
+	}
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * READ/WRITE SPIN LOCK
+ *
+ * An extremely genius very fast read/write lock based on atomics!
+ */
+
+/*
+ * Initialise a read/write spin lock to the unlocked state: no X locker,
+ * no X locker waiting for readers, no readers and no waiting readers.
+ * With XT_THREAD_LOCK_INFO defined, 'name' is recorded as the lock's name.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_spinxslock_init(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs, const char *name)
+#else
+xtPublic void xt_spinxslock_init(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs)
+#endif
+{
+	sxs->sxs_xlocked = 0;
+	/* xt_spinxslock_slock() compares sxs_xwaiter with sxs_xlocked, and
+	 * xt_spinxslock_xlock() only writes it while it waits for readers.
+	 * It must therefore start at zero: a stale non-zero value would let
+	 * readers march past an X lock.
+	 */
+	sxs->sxs_xwaiter = 0;
+	sxs->sxs_rlock_count = 0;
+	sxs->sxs_wait_count = 0;
+#ifdef DEBUG
+	sxs->sxs_locker = 0;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	sxs->sxs_name = name;
+	xt_thread_lock_info_init(&sxs->sxs_lock_info, sxs);
+#endif
+}
+
+/*
+ * Free a read/write spin lock. Only the lock info (XT_THREAD_LOCK_INFO
+ * builds) needs releasing; otherwise this is a no-op.
+ */
+xtPublic void xt_spinxslock_free(struct XTThread *XT_UNUSED(self), XTSpinXSLockPtr sxs)
+{
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&sxs->sxs_lock_info);
+#else
+	/* Nothing to free; keep the parameter referenced. */
+	(void) sxs;
+#endif
+}
+
+/*
+ * Take the exclusive side of a read/write spin lock.
+ *
+ * First the sxs_xlocked flag is claimed with an atomic test-and-set; with
+ * 'try_lock' set the function gives up (returns FALSE) as soon as the
+ * flag is found taken. Afterwards the X locker loops until the number of
+ * waiting readers is no longer below the number of registered readers.
+ *
+ * 'thd_id' is only used in DEBUG builds, to record the locker.
+ * Returns OK once the exclusive lock is held.
+ */
+xtPublic xtBool xt_spinxslock_xlock(XTSpinXSLockPtr sxs, xtBool try_lock, xtThreadID XT_NDEBUG_UNUSED(thd_id))
+{
+	register xtWord2 set;
+
+	/* Wait for exclusive locker: */
+	for (;;) {
+		set = xt_atomic_tas2(&sxs->sxs_xlocked, 1);
+		if (!set)
+			break;
+		if (try_lock)
+			return FALSE;
+		xt_yield();
+	}
+
+#ifdef DEBUG
+	sxs->sxs_locker = thd_id;
+#endif
+
+	/* Wait for all the readers to wait! */
+	while (sxs->sxs_wait_count < sxs->sxs_rlock_count) {
+		/* While sxs_xwaiter is 1, readers are let through the barrier
+		 * in xt_spinxslock_slock().
+		 */
+		sxs->sxs_xwaiter = 1;
+		xt_yield(); //*
+		/* This should not be required, because there is only one thread
+		 * accessing this value. However, the lock fails if this
+		 * is not done with an atomic op.
+		 *
+		 * This is because threads on other processors have the
+		 * value in processor cache. So they do not
+		 * notice that the value has been set to zero.
+		 * They think it is still 1 and march through
+		 * the barrier (sxs->sxs_xwaiter < sxs->sxs_xlocked) below.
+		 *
+		 * In the meantime, this X locker has gone on thinking
+		 * all is OK.
+		 */
+		xt_atomic_tas2(&sxs->sxs_xwaiter, 0);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&sxs->sxs_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Take the shared side of a read/write spin lock.
+ *
+ * The reader registers itself in sxs_rlock_count first. While
+ * sxs_xwaiter < sxs_xlocked (the lock is X locked and the X locker is not
+ * waiting for readers) the reader counts itself as waiting and yields.
+ *
+ * Always returns OK.
+ */
+xtPublic xtBool xt_spinxslock_slock(XTSpinXSLockPtr sxs)
+{
+	xt_atomic_inc2(&sxs->sxs_rlock_count);
+
+	/* Wait as long as the locker is not waiting: */
+	while (sxs->sxs_xwaiter < sxs->sxs_xlocked) {
+		/* sxs_wait_count is what the X locker compares with
+		 * sxs_rlock_count.
+		 */
+		xt_atomic_inc2(&sxs->sxs_wait_count);
+		while (sxs->sxs_xwaiter < sxs->sxs_xlocked) {
+			xt_yield();
+		}
+		xt_atomic_dec2(&sxs->sxs_wait_count);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&sxs->sxs_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release a read/write spin lock.
+ *
+ * 'xlocked' tells which side the caller holds: TRUE clears the exclusive
+ * flag, FALSE removes the caller from the reader count.
+ * Always returns OK.
+ */
+xtPublic xtBool xt_spinxslock_unlock(XTSpinXSLockPtr sxs, xtBool xlocked)
+{
+	if (!xlocked) {
+		/* Shared lock: one reader less. */
+#ifdef DEBUG
+		ASSERT_NS(sxs->sxs_rlock_count > 0);
+#endif
+		xt_atomic_dec2(&sxs->sxs_rlock_count);
+	}
+	else {
+		/* Exclusive lock: give up the flag. */
+#ifdef DEBUG
+		ASSERT_NS(sxs->sxs_locker && sxs->sxs_xlocked);
+		sxs->sxs_locker = 0;
+#endif
+		sxs->sxs_xlocked = 0;
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&sxs->sxs_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * FAST READ/WRITE LOCK (BASED ON FAST MUTEX)
+ */
+
+/*
+ * Initialise a mutex based read/write lock: the mutex, both condition
+ * variables and the X locker / reader / waiter bookkeeping.
+ * With XT_THREAD_LOCK_INFO defined, 'name' is recorded as the lock's name.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm, const char *name)
+#else
+xtPublic void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm)
+#endif
+{
+	xt_init_mutex_with_autoname(self, &xsm->xsm_lock);
+	xt_init_cond(self, &xsm->xsm_cond);
+	xt_init_cond(self, &xsm->xsm_cond_2);
+
+	/* No readers, no waiters, no exclusive locker: */
+	xsm->xsm_rlock_count = 0;
+	xsm->xsm_wait_count = 0;
+	xsm->xsm_xlocker = 0;
+
+#ifdef DEBUG
+	xsm->xsm_locker = 0;
+#endif
+
+#ifdef XT_THREAD_LOCK_INFO
+	xsm->xsm_name = name;
+	xt_thread_lock_info_init(&xsm->xsm_lock_info, xsm);
+#endif
+}
+
+/*
+ * Free a mutex based read/write lock: the mutex, both condition variables
+ * and, when XT_THREAD_LOCK_INFO is defined, the lock info.
+ */
+xtPublic void xt_xsmutex_free(struct XTThread *XT_UNUSED(self), XTXSMutexLockPtr xsm)
+{
+	xt_free_mutex(&xsm->xsm_lock);
+	xt_free_cond(&xsm->xsm_cond);
+	xt_free_cond(&xsm->xsm_cond_2);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&xsm->xsm_lock_info);
+#endif
+}
+
+/*
+ * Take the exclusive lock on behalf of thread 'thd_id'.
+ *
+ * The caller waits until there is no other X locker, registers itself in
+ * xsm_xlocker and then waits until every registered reader is counted as
+ * waiting (xsm_wait_count has caught up with xsm_rlock_count).
+ *
+ * Returns OK with the mutex xsm_lock still held; xt_xsmutex_unlock()
+ * releases it. Returns FAILED, with the mutex released, if a timed wait
+ * reports failure.
+ */
+xtPublic xtBool xt_xsmutex_xlock(XTXSMutexLockPtr xsm, xtThreadID thd_id)
+{
+	xt_lock_mutex_ns(&xsm->xsm_lock);
+
+	/* Wait for exclusive locker: */
+	while (xsm->xsm_xlocker) {
+		if (!xt_timed_wait_cond_ns(&xsm->xsm_cond, &xsm->xsm_lock, 10000)) {
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
+			return FAILED;
+		}
+	}
+
+	/* GOTCHA: You would think this is not necessary...
+	 * But it does not always work if a normal assignment is used.
+	 * The reason is, I guess, that on multi-processor machines the
+	 * assignment is not always immediately visible to other processors,
+	 * because they have old versions of this variable in their cache.
+	 *
+	 * But this is required, because the locking mechanism is based
+	 * on:
+	 * Locker: sets xlocker, tests rlock_count
+	 * Reader: incs rlock_count, tests xlocker
+	 *
+	 * The test, in both cases, may not read stale values.
+	 * volatile does not help, because this just turns compiler
+	 * optimisations off.
+	 */
+	xt_atomic_set4(&xsm->xsm_xlocker, thd_id);
+
+	/* Wait for all the readers to wait! */
+	while (xsm->xsm_wait_count < xsm->xsm_rlock_count) {
+		/* {RACE-WR_MUTEX} Here as well: */
+		if (!xt_timed_wait_cond_ns(&xsm->xsm_cond, &xsm->xsm_lock, 100)) {
+			/* Withdraw the registration before giving up. */
+			xsm->xsm_xlocker = 0;
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
+			return FAILED;
+		}
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&xsm->xsm_lock_info);
+#endif
+	/* NOTE: xsm_lock stays locked while the exclusive lock is held. */
+	return OK;
+}
+
+/*
+ * Take a shared lock. 'thd_id' is not used.
+ *
+ * The reader registers itself in xsm_rlock_count. If an X locker is
+ * registered, the reader counts itself as waiting, wakes the X locker if
+ * all registered readers are now waiting, and sleeps on xsm_cond_2 until
+ * the X locker is gone.
+ *
+ * Returns OK when the shared lock is held. Returns FAILED if the broadcast
+ * or the timed wait fails.
+ * NOTE(review): on the FAILED paths xsm_rlock_count is not decremented
+ * again - confirm that callers compensate for this.
+ */
+xtPublic xtBool xt_xsmutex_slock(XTXSMutexLockPtr xsm, xtThreadID XT_UNUSED(thd_id))
+{
+	xt_atomic_inc2(&xsm->xsm_rlock_count);
+
+	/* Check if there could be an X locker: */
+	if (xsm->xsm_xlocker) {
+		/* I am waiting... */
+		xt_lock_mutex_ns(&xsm->xsm_lock);
+		xsm->xsm_wait_count++;
+		/* Wake up the xlocker: */
+		if (xsm->xsm_xlocker && xsm->xsm_wait_count == xsm->xsm_rlock_count) {
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+				xsm->xsm_wait_count--;
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		/* Sleep until the X locker has released the lock: */
+		while (xsm->xsm_xlocker) {
+			if (!xt_timed_wait_cond_ns(&xsm->xsm_cond_2, &xsm->xsm_lock, 10000)) {
+				xsm->xsm_wait_count--;
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		xsm->xsm_wait_count--;
+		xt_unlock_mutex_ns(&xsm->xsm_lock);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&xsm->xsm_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release the lock held by thread 'thd_id'.
+ *
+ * If thd_id is the registered X locker, the exclusive lock is dropped:
+ * waiting readers (xsm_cond_2) are woken if there are any, otherwise the
+ * waiters on xsm_cond; then the mutex taken in xt_xsmutex_xlock() is
+ * released. Otherwise the caller is treated as a reader and removed from
+ * xsm_rlock_count.
+ *
+ * Returns OK, or FAILED if xt_broadcast_cond_ns() fails.
+ */
+xtPublic xtBool xt_xsmutex_unlock(XTXSMutexLockPtr xsm, xtThreadID thd_id)
+{
+	if (xsm->xsm_xlocker == thd_id) {
+		/* I have the X lock, and with it the mutex. */
+		xsm->xsm_xlocker = 0;
+		if (xsm->xsm_wait_count) {
+			/* Readers are waiting: let them go first. */
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond_2)) {
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		else {
+			/* Wake up any other X or shared lockers: */
+			if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+				xt_unlock_mutex_ns(&xsm->xsm_lock);
+				return FAILED;
+			}
+		}
+		xt_unlock_mutex_ns(&xsm->xsm_lock);
+	}
+	else {
+		/* Taking the advice from {RACE-WR_MUTEX} I do the decrement
+		 * after I have a lock!
+		 */
+		if (xsm->xsm_xlocker) {
+			xt_lock_mutex_ns(&xsm->xsm_lock);
+			xt_atomic_dec2(&xsm->xsm_rlock_count);
+			if (xsm->xsm_xlocker && xsm->xsm_wait_count == xsm->xsm_rlock_count) {
+				/* If the X locker is waiting for me,
+				 * then allow him to continue.
+				 */
+				if (!xt_broadcast_cond_ns(&xsm->xsm_cond)) {
+					xt_unlock_mutex_ns(&xsm->xsm_lock);
+					return FAILED;
+				}
+			}
+			xt_unlock_mutex_ns(&xsm->xsm_lock);
+		}
+		else
+			/* No X locker in sight: no need to take the mutex. */
+			xt_atomic_dec2(&xsm->xsm_rlock_count);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&xsm->xsm_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * ATOMIC READ/WRITE LOCK (BASED ON ATOMIC OPERATIONS)
+ */
+
+/*
+ * Initialise an atomic read/write lock: exclusive flag clear, no readers.
+ * With XT_THREAD_LOCK_INFO defined, 'n' is recorded as the lock's name.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_atomicrwlock_init(struct XTThread *XT_UNUSED(self), XTAtomicRWLockPtr arw, const char *n)
+#else
+xtPublic void xt_atomicrwlock_init(struct XTThread *XT_UNUSED(self), XTAtomicRWLockPtr arw)
+#endif
+{
+	arw->arw_xlock_set = 0;
+	arw->arw_reader_count = 0;
+
+#ifdef XT_THREAD_LOCK_INFO
+	arw->arw_name = n;
+	xt_thread_lock_info_init(&arw->arw_lock_info, arw);
+#endif
+}
+
+/*
+ * Free an atomic read/write lock. Only the lock info (XT_THREAD_LOCK_INFO
+ * builds) needs releasing; otherwise the body is empty and 'arw' is
+ * marked as unused.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_atomicrwlock_free(struct XTThread *, XTAtomicRWLockPtr arw)
+#else
+xtPublic void xt_atomicrwlock_free(struct XTThread *, XTAtomicRWLockPtr XT_UNUSED(arw))
+#endif
+{
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&arw->arw_lock_info);
+#endif
+}
+
+/*
+ * Take the exclusive side of an atomic read/write lock.
+ *
+ * The exclusive flag is claimed with an atomic test-and-set, then the
+ * caller yields until the reader count has dropped to zero. With
+ * 'try_lock' set, FALSE is returned as soon as the flag is found taken.
+ * 'thr_id' is only used in DEBUG builds, to record the locker.
+ * Returns TRUE once the lock is held.
+ */
+xtPublic xtBool xt_atomicrwlock_xlock(XTAtomicRWLockPtr arw, xtBool try_lock, xtThreadID XT_NDEBUG_UNUSED(thr_id))
+{
+	xtWord2 was_set;
+
+	/* First get an exclusive lock: */
+	while ((was_set = xt_atomic_tas2(&arw->arw_xlock_set, 1)) != 0) {
+		if (try_lock)
+			return FALSE;
+		xt_yield();
+	}
+
+	/* Wait for the remaining readers: */
+	while (arw->arw_reader_count) {
+		xt_yield();
+	}
+
+#ifdef DEBUG
+	arw->arw_locker = thr_id;
+#endif
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&arw->arw_lock_info);
+#endif
+	return TRUE;
+}
+
+/*
+ * Take the shared side of an atomic read/write lock.
+ *
+ * The reader briefly claims the exclusive flag, registers itself in the
+ * reader count and gives the flag back. Always returns OK.
+ */
+xtPublic xtBool xt_atomicrwlock_slock(XTAtomicRWLockPtr arw)
+{
+	xtWord2 was_set;
+
+	/* Hold the exclusive flag while registering: */
+	while ((was_set = xt_atomic_tas2(&arw->arw_xlock_set, 1)) != 0) {
+		xt_yield();
+	}
+
+	/* Add a reader: */
+	xt_atomic_inc2(&arw->arw_reader_count);
+
+	/* Release the xlock: */
+	arw->arw_xlock_set = 0;
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&arw->arw_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release an atomic read/write lock.
+ *
+ * 'xlocked' tells which side the caller holds: TRUE clears the exclusive
+ * flag, FALSE removes the caller from the reader count.
+ * Always returns OK.
+ */
+xtPublic xtBool xt_atomicrwlock_unlock(XTAtomicRWLockPtr arw, xtBool xlocked)
+{
+	if (!xlocked) {
+		xt_atomic_dec2(&arw->arw_reader_count);
+	}
+	else {
+#ifdef DEBUG
+		arw->arw_locker = 0;
+#endif
+		arw->arw_xlock_set = 0;
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&arw->arw_lock_info);
+#endif
+
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * "SKEW" ATOMIC READ/WRITE LOCK (BASED ON ATOMIC OPERATIONS)
+ *
+ * This lock type favors writers. It only works if the proportion of readers
+ * to writer is high.
+ */
+
+/*
+ * Initialise a "skew" read/write lock: exclusive flag clear, no readers.
+ * With XT_THREAD_LOCK_INFO defined, 'n' is recorded as the lock's name.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_skewrwlock_init(struct XTThread *XT_UNUSED(self), XTSkewRWLockPtr srw, const char *n)
+#else
+xtPublic void xt_skewrwlock_init(struct XTThread *XT_UNUSED(self), XTSkewRWLockPtr srw)
+#endif
+{
+	srw->srw_xlock_set = 0;
+	srw->srw_reader_count = 0;
+
+#ifdef XT_THREAD_LOCK_INFO
+	srw->srw_name = n;
+	xt_thread_lock_info_init(&srw->srw_lock_info, srw);
+#endif
+}
+
+/*
+ * Free a "skew" read/write lock. Only the lock info (XT_THREAD_LOCK_INFO
+ * builds) needs releasing; otherwise the body is empty and 'srw' is
+ * marked as unused.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic void xt_skewrwlock_free(struct XTThread *, XTSkewRWLockPtr srw)
+#else
+xtPublic void xt_skewrwlock_free(struct XTThread *, XTSkewRWLockPtr XT_UNUSED(srw))
+#endif
+{
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&srw->srw_lock_info);
+#endif
+}
+
+/*
+ * Take the exclusive side of a "skew" read/write lock.
+ *
+ * The exclusive flag is claimed with an atomic test-and-set, then the
+ * caller yields until the reader count has dropped to zero. With
+ * 'try_lock' set, FALSE is returned as soon as the flag is found taken.
+ * 'thr_id' is only used in DEBUG builds, to record the locker.
+ * Returns TRUE once the lock is held.
+ */
+xtPublic xtBool xt_skewrwlock_xlock(XTSkewRWLockPtr srw, xtBool try_lock, xtThreadID XT_NDEBUG_UNUSED(thr_id))
+{
+	xtWord2 was_set;
+
+	/* First get an exclusive lock: */
+	while ((was_set = xt_atomic_tas2(&srw->srw_xlock_set, 1)) != 0) {
+		if (try_lock)
+			return FALSE;
+		xt_yield();
+	}
+
+	/* Wait for the remaining readers: */
+	while (srw->srw_reader_count) {
+		xt_yield();
+	}
+
+#ifdef DEBUG
+	srw->srw_locker = thr_id;
+#endif
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+#endif
+	return TRUE;
+}
+
+/*
+ * Take the shared side of a "skew" read/write lock.
+ *
+ * The reader waits until the exclusive flag is clear, registers itself
+ * and then checks the flag once more. If a writer slipped in between,
+ * the registration is withdrawn and the whole sequence starts again.
+ * Always returns OK.
+ */
+xtPublic xtBool xt_skewrwlock_slock(XTSkewRWLockPtr srw)
+{
+	for (;;) {
+		/* Wait for an exclusive lock: */
+		while (srw->srw_xlock_set) {
+			xt_yield();
+		}
+
+		/* Add a reader: */
+		xt_atomic_inc2(&srw->srw_reader_count);
+
+		/* Check for xlock again: */
+		if (!srw->srw_xlock_set)
+			break;
+
+		/* A writer got in first: back off and retry. */
+		xt_atomic_dec2(&srw->srw_reader_count);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&srw->srw_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release a "skew" read/write lock.
+ *
+ * 'xlocked' tells which side the caller holds: TRUE clears the exclusive
+ * flag, FALSE removes the caller from the reader count.
+ * Always returns OK.
+ */
+xtPublic xtBool xt_skewrwlock_unlock(XTSkewRWLockPtr srw, xtBool xlocked)
+{
+	if (xlocked) {
+#ifdef DEBUG
+		/* Clear the recorded locker BEFORE the lock is given up, and
+		 * only when an X lock is released. Once srw_xlock_set is zero
+		 * another thread may take the X lock and store its own ID in
+		 * srw_locker; clearing afterwards (or from a reader) could
+		 * wipe out that ID. This matches xt_atomicrwlock_unlock().
+		 */
+		srw->srw_locker = 0;
+#endif
+		srw->srw_xlock_set = 0;
+	}
+	else
+		xt_atomic_dec2(&srw->srw_reader_count);
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&srw->srw_lock_info);
+#endif
+
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * RECURSIVE MUTEX (allows lockers to lock again)
+ */
+
+/*
+ * Initialise a recursive mutex: no owner, lock depth zero, and the
+ * underlying mutex. With XT_THREAD_LOCK_INFO defined, 'name' is passed on
+ * to xt_init_mutex().
+ */
+#ifdef XT_THREAD_LOCK_INFO
+void xt_recursivemutex_init(XTThreadPtr self, XTRecursiveMutexPtr rm, const char *name)
+#else
+xtPublic void xt_recursivemutex_init(XTThreadPtr self, XTRecursiveMutexPtr rm)
+#endif
+{
+	rm->rm_locker = NULL;
+	rm->rm_lock_count = 0;
+#ifdef XT_THREAD_LOCK_INFO
+	xt_init_mutex(self, &rm->rm_mutex, name);
+#else
+	xt_init_mutex(self, &rm->rm_mutex);
+#endif
+}
+
+/*
+ * Free a recursive mutex: the underlying mutex and, when
+ * XT_THREAD_LOCK_INFO is defined, the lock info.
+ * NOTE(review): xt_recursivemutex_init() does not initialise rm_lock_info
+ * itself - presumably xt_init_mutex() or the caller does; confirm.
+ */
+xtPublic void xt_recursivemutex_free(XTRecursiveMutexPtr rm)
+{
+	xt_free_mutex(&rm->rm_mutex);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&rm->rm_lock_info);
+#endif
+}
+
+/*
+ * Lock a recursive mutex. The underlying mutex is only taken when 'self'
+ * is not already recorded as the owner; every call adds one level to the
+ * lock depth.
+ */
+xtPublic void xt_recursivemutex_lock(XTThreadPtr self, XTRecursiveMutexPtr rm)
+{
+	if (rm->rm_locker != self) {
+		/* First lock by this thread: really take the mutex. */
+		xt_lock_mutex(self, &rm->rm_mutex);
+		rm->rm_locker = self;
+	}
+
+	rm->rm_lock_count += 1;
+}
+
+/*
+ * Undo one xt_recursivemutex_lock() call. The underlying mutex is
+ * released, and the owner cleared, when the lock depth returns to zero.
+ */
+xtPublic void xt_recursivemutex_unlock(XTThreadPtr self, XTRecursiveMutexPtr rm)
+{
+	ASSERT(self == rm->rm_locker);
+	ASSERT(rm->rm_lock_count > 0);
+
+	if (--rm->rm_lock_count == 0) {
+		/* Outermost unlock: */
+		rm->rm_locker = NULL;
+		xt_unlock_mutex(self, &rm->rm_mutex);
+	}
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * RECURSIVE R/W LOCK (allows X lockers to lock again)
+ */
+
+/*
+ * Initialise a recursive read/write lock: no X owner, X lock depth zero,
+ * and the underlying read/write lock. With XT_THREAD_LOCK_INFO defined,
+ * 'name' is passed on to xt_init_rwlock().
+ */
+#ifdef XT_THREAD_LOCK_INFO
+void xt_recurrwlock_init(struct XTThread *self, XTRecurRWLockPtr rrw, const char *name)
+#else
+void xt_recurrwlock_init(struct XTThread *self, XTRecurRWLockPtr rrw)
+#endif
+{
+	rrw->rrw_locker = NULL;
+	rrw->rrw_lock_count = 0;
+#ifdef XT_THREAD_LOCK_INFO
+	xt_init_rwlock(self, &rrw->rrw_lock, name);
+#else
+	xt_init_rwlock(self, &rrw->rrw_lock);
+#endif
+}
+
+/*
+ * Free a recursive read/write lock: the underlying read/write lock and,
+ * when XT_THREAD_LOCK_INFO is defined, the lock info.
+ * NOTE(review): xt_recurrwlock_init() does not initialise rrw_lock_info
+ * itself - presumably xt_init_rwlock() or the caller does; confirm.
+ */
+void xt_recurrwlock_free(XTRecurRWLockPtr rrw)
+{
+	xt_free_rwlock(&rrw->rrw_lock);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&rrw->rrw_lock_info);
+#endif
+}
+
+/*
+ * Take the exclusive lock, recursively. The underlying read/write lock is
+ * only X locked when 'self' is not already recorded as the owner; every
+ * call adds one level to the lock depth.
+ */
+void xt_recurrwlock_xlock(struct XTThread *self, XTRecurRWLockPtr rrw)
+{
+	if (rrw->rrw_locker != self) {
+		/* First X lock by this thread: really take the lock. */
+		xt_xlock_rwlock(self, &rrw->rrw_lock);
+		rrw->rrw_locker = self;
+	}
+
+	rrw->rrw_lock_count += 1;
+}
+
+/* Take a shared lock on the underlying read/write lock (no recursion bookkeeping). */
+void xt_recurrwlock_slock(struct XTThread *self, XTRecurRWLockPtr rrw)
+{
+	xt_slock_rwlock(self, &rrw->rrw_lock);
+}
+
+/* As xt_recurrwlock_slock(), but through the "_ns" call that takes no thread argument. */
+void xt_recurrwlock_slock_ns(XTRecurRWLockPtr rrw)
+{
+	xt_slock_rwlock_ns(&rrw->rrw_lock);
+}
+
+/*
+ * Undo one xt_recurrwlock_xlock() call. The underlying lock is released,
+ * and the owner cleared, when the lock depth returns to zero.
+ */
+void xt_recurrwlock_unxlock(struct XTThread *self, XTRecurRWLockPtr rrw)
+{
+	ASSERT(self == rrw->rrw_locker);
+	ASSERT(rrw->rrw_lock_count > 0);
+
+	if (--rrw->rrw_lock_count == 0) {
+		/* Outermost unlock: */
+		rrw->rrw_locker = NULL;
+		xt_unlock_rwlock(self, &rrw->rrw_lock);
+	}
+}
+
+/* Release a shared lock taken with xt_recurrwlock_slock(). */
+void xt_recurrwlock_unslock(struct XTThread *self, XTRecurRWLockPtr rrw)
+{
+	xt_unlock_rwlock(self, &rrw->rrw_lock);
+}
+
+/* Release a shared lock through the "_ns" call that takes no thread argument. */
+void xt_recurrwlock_unslock_ns(XTRecurRWLockPtr rrw)
+{
+	xt_unlock_rwlock_ns(&rrw->rrw_lock);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * UNIT TESTS
+ */
+
+/* Work done while the lock under test is held (see lck_do_job()): */
+#define JOB_MEMCPY			1	/* copy a 2K buffer */
+#define JOB_SLEEP			2	/* xt_sleep_milli_second(1) */
+#define JOB_PRINT			3	/* print a message, then xt_sleep_milli_second(10) */
+#define JOB_INCREMENT		4	/* increment the shared counter only */
+#define JOB_SNOOZE			5	/* xt_sleep_milli_second(10) */
+#define JOB_DOUBLE_INC		6	/* writers increment twice, readers check for an odd value */
+
+/* Lock implementation under test (stored in XSLockTest.xs_which_lock): */
+#define LOCK_PTHREAD_RW		1
+#define LOCK_PTHREAD_MUTEX	2
+#define LOCK_RWMUTEX		3
+#define LOCK_SPINLOCK		4
+#define LOCK_FASTLOCK		5
+#define LOCK_SPINXSLOCK		6
+#define LOCK_XSMUTEX		7
+#define LOCK_ATOMICRWLOCK	8
+#define LOCK_SKEWRWLOCK		9
+
+/*
+ * Shared state of one lock test run. It holds one instance of every lock
+ * type; xs_which_lock selects the one the test threads actually use.
+ * NOTE(review): xs_which_lock and xs_which_job are declared xtBool but
+ * hold LOCK_* and JOB_* codes - confirm that xtBool is an integer type.
+ */
+typedef struct XSLockTest {
+	u_int			xs_interations;		/* number of lock/unlock rounds per thread */
+	xtBool			xs_which_lock;		/* one of the LOCK_* codes */
+	xtBool			xs_which_job;		/* one of the JOB_* codes */
+	xtBool			xs_debug_print;		/* print start/stop messages per thread */
+	XTRWMutexRec	xs_lock;			/* used for LOCK_RWMUTEX */
+	xt_rwlock_type	xs_plock;			/* used for LOCK_PTHREAD_RW */
+	XTSpinLockRec	xs_spinlock;		/* used for LOCK_SPINLOCK */
+	xt_mutex_type	xs_mutex;			/* used for LOCK_PTHREAD_MUTEX */
+	XTFastLockRec	xs_fastlock;		/* used for LOCK_FASTLOCK */
+	XTSpinXSLockRec	xs_spinrwlock;		/* used for LOCK_SPINXSLOCK */
+	XTXSMutexRec	xs_fastrwlock;		/* used for LOCK_XSMUTEX */
+	XTAtomicRWLockRec xs_atomicrwlock;	/* used for LOCK_ATOMICRWLOCK */
+	XTSkewRWLockRec xs_skewrwlock;		/* used for LOCK_SKEWRWLOCK */
+	int				xs_progress;		/* print progress every xs_progress rounds; 0 = never */
+	xtWord4			xs_inc;				/* counter modified by the jobs */
+} XSLockTestRec, *XSLockTestPtr;
+
+/*
+ * Thread data "free" callback passed to xt_set_thread_data() by the test
+ * drivers. Intentionally empty: the test record is shared by all threads,
+ * so there is nothing to free per thread.
+ */
+static void lck_free_thread_data(XTThreadPtr XT_UNUSED(self), void *XT_UNUSED(data))
+{
+}
+
+/*
+ * Perform the work of one test round while the lock under test is held.
+ *
+ * 'job' is one of the JOB_* codes. All jobs except JOB_DOUBLE_INC finish
+ * by incrementing data->xs_inc once. For JOB_DOUBLE_INC writers increment
+ * the counter twice, and readers ('reader' is TRUE) complain when they
+ * observe an odd value. Unknown job codes do nothing.
+ */
+static void lck_do_job(XTThreadPtr self, int job, XSLockTestPtr data, xtBool reader)
+{
+	char dst[2048], src[2048];
+
+	switch (job) {
+		case JOB_MEMCPY:
+			memcpy(dst, src, 2048);
+			break;
+		case JOB_SLEEP:
+			xt_sleep_milli_second(1);
+			break;
+		case JOB_PRINT:
+			printf("- %s got lock\n", self->t_name);
+			xt_sleep_milli_second(10);
+			break;
+		case JOB_INCREMENT:
+			break;
+		case JOB_SNOOZE:
+			xt_sleep_milli_second(10);
+			break;
+		case JOB_DOUBLE_INC:
+			if (!reader) {
+				/* Two separate steps on purpose: the odd value in
+				 * between is what readers must never see.
+				 */
+				data->xs_inc++;
+				data->xs_inc++;
+			}
+			else if ((data->xs_inc & 1) != 0)
+				printf("Noooo!\n");
+			return;
+		default:
+			return;
+	}
+
+	/* Common tail of all the simple jobs: */
+	data->xs_inc++;
+}
+
+#if 0
+/*
+ * Disabled debugging helper: a thread that polls 'state' once a second
+ * and calls xt_dump_trace() when it is 2.
+ * NOTE(review): nothing in this function sets 'state' to 1 or 2 (it looks
+ * like it is meant to be changed from a debugger), and the function has
+ * no return statement - fix both before enabling this code.
+ */
+static void *lck_run_dumper(XTThreadPtr self)
+{
+	int state = 0;
+
+	while (state != 1) {
+		sleep(1);
+		if (state == 2) {
+			xt_dump_trace();
+			state = 0;
+		}
+	}
+}
+#endif
+
+/*
+ * Thread body of a test reader.
+ *
+ * Runs xs_interations rounds; in each round the selected lock is taken
+ * in shared mode, the configured job is executed and the lock is
+ * released again. Only the read/write lock types are valid here.
+ * Returns NULL.
+ */
+static void *lck_run_reader(XTThreadPtr self)
+{
+	XSLockTestRec	*test = (XSLockTestRec *) self->t_data;
+
+	if (test->xs_debug_print)
+		printf("- %s start\n", self->t_name);
+
+	for (u_int round=0; round<test->xs_interations; round++) {
+		if (test->xs_progress && ((round+1) % test->xs_progress) == 0)
+			printf("- %s %d\n", self->t_name, round+1);
+
+		switch ((int) test->xs_which_lock) {
+			case LOCK_PTHREAD_RW:
+				xt_slock_rwlock_ns(&test->xs_plock);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_unlock_rwlock_ns(&test->xs_plock);
+				break;
+			case LOCK_RWMUTEX:
+				xt_rwmutex_slock(&test->xs_lock, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_rwmutex_unlock(&test->xs_lock, self->t_id);
+				break;
+			case LOCK_SPINXSLOCK:
+				xt_spinxslock_slock(&test->xs_spinrwlock);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_spinxslock_unlock(&test->xs_spinrwlock, FALSE);
+				break;
+			case LOCK_XSMUTEX:
+				xt_xsmutex_slock(&test->xs_fastrwlock, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_xsmutex_unlock(&test->xs_fastrwlock, self->t_id);
+				break;
+			case LOCK_ATOMICRWLOCK:
+				xt_atomicrwlock_slock(&test->xs_atomicrwlock);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_atomicrwlock_unlock(&test->xs_atomicrwlock, FALSE);
+				break;
+			case LOCK_SKEWRWLOCK:
+				xt_skewrwlock_slock(&test->xs_skewrwlock);
+				lck_do_job(self, test->xs_which_job, test, TRUE);
+				xt_skewrwlock_unlock(&test->xs_skewrwlock, FALSE);
+				break;
+			default:
+				/* Not a read/write lock type. */
+				ASSERT(FALSE);
+				break;
+		}
+	}
+
+	if (test->xs_debug_print)
+		printf("- %s stop\n", self->t_name);
+	return NULL;
+}
+
+/*
+ * Thread body of a test writer.
+ *
+ * Runs xs_interations rounds; in each round the selected lock is taken
+ * in exclusive mode, the configured job is executed and the lock is
+ * released again. Only the read/write lock types are valid here.
+ * Returns NULL.
+ */
+static void *lck_run_writer(XTThreadPtr self)
+{
+	XSLockTestRec	*test = (XSLockTestRec *) self->t_data;
+
+	if (test->xs_debug_print)
+		printf("- %s start\n", self->t_name);
+
+	for (u_int round=0; round<test->xs_interations; round++) {
+		if (test->xs_progress && ((round+1) % test->xs_progress) == 0)
+			printf("- %s %d\n", self->t_name, round+1);
+
+		switch ((int) test->xs_which_lock) {
+			case LOCK_PTHREAD_RW:
+				xt_xlock_rwlock_ns(&test->xs_plock);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_unlock_rwlock_ns(&test->xs_plock);
+				break;
+			case LOCK_RWMUTEX:
+				xt_rwmutex_xlock(&test->xs_lock, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_rwmutex_unlock(&test->xs_lock, self->t_id);
+				break;
+			case LOCK_SPINXSLOCK:
+				xt_spinxslock_xlock(&test->xs_spinrwlock, FALSE, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_spinxslock_unlock(&test->xs_spinrwlock, TRUE);
+				break;
+			case LOCK_XSMUTEX:
+				xt_xsmutex_xlock(&test->xs_fastrwlock, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_xsmutex_unlock(&test->xs_fastrwlock, self->t_id);
+				break;
+			case LOCK_ATOMICRWLOCK:
+				xt_atomicrwlock_xlock(&test->xs_atomicrwlock, FALSE, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_atomicrwlock_unlock(&test->xs_atomicrwlock, TRUE);
+				break;
+			case LOCK_SKEWRWLOCK:
+				xt_skewrwlock_xlock(&test->xs_skewrwlock, FALSE, self->t_id);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_skewrwlock_unlock(&test->xs_skewrwlock, TRUE);
+				break;
+			default:
+				/* Not a read/write lock type. */
+				ASSERT(FALSE);
+				break;
+		}
+	}
+
+	if (test->xs_debug_print)
+		printf("- %s stop\n", self->t_name);
+	return NULL;
+}
+
+/*
+ * Print a description of the test configuration to stdout: the lock type,
+ * the job type and the iteration count. No trailing newline is written;
+ * callers append their own details. Unknown lock or job codes print
+ * nothing for that part.
+ */
+static void lck_print_test(XSLockTestRec *data)
+{
+	switch (data->xs_which_lock) {
+		case LOCK_PTHREAD_RW:
+			printf("pthread read/write");
+			break;
+		case LOCK_PTHREAD_MUTEX:
+			printf("pthread mutex");
+			break;
+		case LOCK_RWMUTEX:
+			printf("fast read/write mutex");
+			break;
+		case LOCK_SPINLOCK:
+			printf("spin mutex");
+			break;
+		case LOCK_FASTLOCK:
+			printf("fast mutex");
+			break;
+		case LOCK_SPINXSLOCK:
+			printf("spin read/write lock");
+			break;
+		case LOCK_XSMUTEX:
+			printf("fast x/s mutex");
+			break;
+		case LOCK_ATOMICRWLOCK:
+			printf("atomic read/write lock");
+			break;
+		case LOCK_SKEWRWLOCK:
+			printf("skew read/write lock");
+			break;
+	}
+
+	switch (data->xs_which_job) {
+		case JOB_MEMCPY:
+			printf(" MEMCPY 2K");
+			break;
+		case JOB_SLEEP:
+			printf(" SLEEP 1/1000s");
+			break;
+		case JOB_PRINT:
+			printf(" PRINT DEBUG");
+			break;
+		case JOB_INCREMENT:
+			printf(" INCREMENT");
+			break;
+		case JOB_SNOOZE:
+			printf(" SLEEP 1/100s");
+			break;
+		case JOB_DOUBLE_INC:
+			/* This job had no label, so the output did not say
+			 * which job was run.
+			 */
+			printf(" DOUBLE INCREMENT");
+			break;
+	}
+
+	/* xs_interations is a u_int, so it is printed as unsigned. */
+	printf(" %u interations", data->xs_interations);
+}
+
+/*
+ * Thread body of the mutex tests.
+ *
+ * Runs xs_interations rounds; in each round the selected mutex type is
+ * locked, the configured job is executed and the mutex is unlocked again.
+ * Only the mutex lock types are valid here. Returns NULL.
+ */
+static void *lck_run_mutex_locker(XTThreadPtr self)
+{
+	XSLockTestRec *test = (XSLockTestRec *) self->t_data;
+
+	if (test->xs_debug_print)
+		printf("- %s start\n", self->t_name);
+
+	for (u_int round=0; round<test->xs_interations; round++) {
+		if (test->xs_progress && ((round+1) % test->xs_progress) == 0)
+			printf("- %s %d\n", self->t_name, round+1);
+
+		switch ((int) test->xs_which_lock) {
+			case LOCK_PTHREAD_MUTEX:
+				xt_lock_mutex_ns(&test->xs_mutex);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_unlock_mutex_ns(&test->xs_mutex);
+				break;
+			case LOCK_SPINLOCK:
+				xt_spinlock_lock(&test->xs_spinlock);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_spinlock_unlock(&test->xs_spinlock);
+				break;
+			case LOCK_FASTLOCK:
+				xt_fastlock_lock(&test->xs_fastlock, self);
+				lck_do_job(self, test->xs_which_job, test, FALSE);
+				xt_fastlock_unlock(&test->xs_fastlock, self);
+				break;
+			default:
+				/* Not a mutex lock type. */
+				ASSERT(FALSE);
+				break;
+		}
+	}
+
+	if (test->xs_debug_print)
+		printf("- %s stop\n", self->t_name);
+	return NULL;
+}
+
+/*
+ * One test thread as tracked by the test drivers: the thread and its ID.
+ * The ID is copied out when the thread is created, and is what the
+ * drivers pass to xt_wait_for_thread().
+ */
+typedef struct LockThread {
+	xtThreadID		id;		/* copy of ptr->t_id */
+	XTThreadPtr		ptr;	/* thread returned by xt_create_daemon() */
+} LockThreadRec, *LockThreadPtr;
+
+/*
+ * Run one reader/writer test.
+ *
+ * Creates 'reader_cnt' reader threads and 'writer_cnt' writer threads that
+ * all work on 'data', starts them, waits for all of them to finish and
+ * prints the elapsed time and the final value of data->xs_inc.
+ */
+static void lck_reader_writer_test(XTThreadPtr self, XSLockTestRec *data, int reader_cnt, int writer_cnt)
+{
+	int				total = reader_cnt + writer_cnt;
+	char			text[40];
+	LockThreadPtr	workers;
+	xtWord8			began;
+
+	printf("READ/WRITE TEST: ");
+	lck_print_test(data);
+	printf(", %d readers, %d writers\n", reader_cnt, writer_cnt);
+	workers = (LockThreadPtr) xt_malloc(self, total * sizeof(LockThreadRec));
+
+	/* Readers occupy the first reader_cnt slots, writers the rest. */
+	for (int t=0; t<total; t++) {
+		const char *role = (t < reader_cnt) ? "READER-" : "WRITER-";
+
+		sprintf(text, "%s%d", role, t+1);
+		workers[t].ptr = xt_create_daemon(self, text);
+		workers[t].id = workers[t].ptr->t_id;
+		xt_set_thread_data(workers[t].ptr, data, lck_free_thread_data);
+	}
+
+	began = xt_trace_clock();
+	for (int t=0; t<total; t++) {
+		if (t < reader_cnt)
+			xt_run_thread(self, workers[t].ptr, lck_run_reader);
+		else
+			xt_run_thread(self, workers[t].ptr, lck_run_writer);
+	}
+
+	for (int t=0; t<total; t++)
+		xt_wait_for_thread(workers[t].id, TRUE);
+	printf("----- %d reader, %d writer time=%s\n", reader_cnt, writer_cnt, xt_trace_clock_diff(text, began));
+
+	xt_free(self, workers);
+	printf("TEST RESULT = %d\n", data->xs_inc);
+}
+
+/*
+ * Run one mutex lock test: start thread_cnt daemon threads that all
+ * execute lck_run_mutex_locker() against the shared test record, wait
+ * for them to finish, then print the elapsed time and data->xs_inc.
+ */
+static void lck_mutex_lock_test(XTThreadPtr self, XSLockTestRec *data, int thread_cnt)
+{
+	char			text[40];
+	LockThreadPtr	workers;
+	xtWord8			began;
+	int				n;
+
+	printf("LOCK MUTEX TEST: ");
+	lck_print_test(data);
+	printf(", %d threads\n", thread_cnt);
+
+	workers = (LockThreadPtr) xt_malloc(self, thread_cnt * sizeof(LockThreadRec));
+	for (n=0; n<thread_cnt; n++) {
+		LockThreadPtr slot = &workers[n];
+
+		sprintf(text, "THREAD%d", n+1);
+		slot->ptr = xt_create_daemon(self, text);
+		slot->id = slot->ptr->t_id;
+		xt_set_thread_data(slot->ptr, data, lck_free_thread_data);
+	}
+
+	began = xt_trace_clock();
+	for (n=0; n<thread_cnt; n++)
+		xt_run_thread(self, workers[n].ptr, lck_run_mutex_locker);
+
+	for (n=0; n<thread_cnt; n++)
+		xt_wait_for_thread(workers[n].id, TRUE);
+	printf("----- %d threads time=%s\n", thread_cnt, xt_trace_clock_diff(text, began));
+
+	xt_free(self, workers);
+	printf("TEST RESULT = %d\n", data->xs_inc);
+}
+
+/*
+ * Driver for the read/write lock tests.
+ *
+ * Zeroes a test record, initialises one lock of each read/write lock
+ * implementation in it, and then runs lck_reader_writer_test() with 10
+ * readers and 0 writers for three jobs in turn: JOB_DOUBLE_INC (100000
+ * iterations), JOB_MEMCPY (10000) and JOB_SLEEP (1000). As written, each
+ * job is run twice and both runs select LOCK_XSMUTEX. The sections wrapped
+ * in a pair of slash-star-star markers are disabled variants.
+ * All locks are freed again before returning.
+ */
+xtPublic void xt_unit_test_read_write_locks(XTThreadPtr self)
+{
+	XSLockTestRec	data;
+
+	memset(&data, 0, sizeof(data));
+
+	printf("TEST: xt_unit_test_read_write_locks\n");
+	printf("size of XTXSMutexRec = %d\n", (int) sizeof(XTXSMutexRec));
+	printf("size of pthread_cond_t = %d\n", (int) sizeof(pthread_cond_t));
+	printf("size of pthread_mutex_t = %d\n", (int) sizeof(pthread_mutex_t));
+	xt_rwmutex_init_with_autoname(self, &data.xs_lock);
+	xt_init_rwlock_with_autoname(self, &data.xs_plock);
+	xt_spinxslock_init_with_autoname(self, &data.xs_spinrwlock);
+	xt_xsmutex_init_with_autoname(self, &data.xs_fastrwlock);
+	xt_atomicrwlock_init_with_autoname(self, &data.xs_atomicrwlock);
+	xt_skewrwlock_init_with_autoname(self, &data.xs_skewrwlock);
+
+	/**
+	data.xs_interations = 10;
+	data.xs_which_lock = LOCK_RWMUTEX; // LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX
+	data.xs_which_job = JOB_PRINT;
+	data.xs_debug_print = TRUE;
+	data.xs_progress = 0;
+	lck_reader_writer_test(self, &data, 4, 0);
+	lck_reader_writer_test(self, &data, 0, 2);
+	lck_reader_writer_test(self, &data, 1, 1);
+	lck_reader_writer_test(self, &data, 4, 2);
+	**/
+
+	/**
+	data.xs_interations = 4000;
+	data.xs_which_lock = LOCK_RWMUTEX; // LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX
+	data.xs_which_job = JOB_SLEEP;
+	data.xs_debug_print = TRUE;
+	data.xs_progress = 200;
+	lck_reader_writer_test(self, &data, 4, 0);
+	lck_reader_writer_test(self, &data, 0, 2);
+	lck_reader_writer_test(self, &data, 1, 1);
+	lck_reader_writer_test(self, &data, 4, 2);
+	**/
+
+	// LOCK_PTHREAD_RW, LOCK_RWMUTEX, LOCK_SPINXSLOCK, LOCK_XSMUTEX, LOCK_ATOMICRWLOCK, LOCK_SKEWRWLOCK
+	/**/
+	data.xs_interations = 100000;
+	data.xs_which_lock = LOCK_XSMUTEX;
+	data.xs_which_job = JOB_DOUBLE_INC; // JOB_INCREMENT, JOB_DOUBLE_INC
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 0, 5);
+	//lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 10, 5);
+	/**/
+
+	/**/
+	data.xs_interations = 10000;
+	data.xs_which_lock = LOCK_XSMUTEX;
+	data.xs_which_job = JOB_MEMCPY;
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 0, 5);
+	//lck_reader_writer_test(self, &data, 10, 0);
+	//lck_reader_writer_test(self, &data, 10, 5);
+	/**/
+
+	/**/
+	data.xs_interations = 1000;
+	data.xs_which_lock = LOCK_XSMUTEX;
+	data.xs_which_job = JOB_SLEEP; // JOB_SLEEP, JOB_SNOOZE
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	lck_reader_writer_test(self, &data, 10, 0);
+	data.xs_which_lock = LOCK_XSMUTEX;
+	lck_reader_writer_test(self, &data, 10, 0);
+	/**/
+
+	xt_rwmutex_free(self, &data.xs_lock);
+	xt_free_rwlock(&data.xs_plock);
+	xt_spinxslock_free(self, &data.xs_spinrwlock);
+	xt_xsmutex_free(self, &data.xs_fastrwlock);
+	xt_atomicrwlock_free(self, &data.xs_atomicrwlock);
+	xt_skewrwlock_free(self, &data.xs_skewrwlock);
+}
+
+/*
+ * Driver for the mutex lock tests.
+ *
+ * Zeroes a test record, initialises a spinlock, a fast lock and a mutex
+ * in it, and runs lck_mutex_lock_test() five times:
+ *   - LOCK_SPINLOCK, JOB_PRINT,     10 iterations,     2 threads (debug output on)
+ *   - LOCK_SPINLOCK, JOB_INCREMENT, 100000 iterations, 10 threads
+ *   - LOCK_SPINLOCK, JOB_MEMCPY,    10000 iterations,  10 threads
+ *   - LOCK_FASTLOCK, JOB_SLEEP,     1000 iterations,   10 threads
+ *   - LOCK_FASTLOCK, JOB_SNOOZE,    100 iterations,    10 threads
+ * xs_inc is reset to 0 before every run. The locks are freed at the end.
+ */
+xtPublic void xt_unit_test_mutex_locks(XTThreadPtr self)
+{
+	XSLockTestRec	data;
+
+	memset(&data, 0, sizeof(data));
+
+	printf("TEST: xt_unit_test_mutex_locks\n");
+	xt_spinlock_init_with_autoname(self, &data.xs_spinlock);
+	xt_fastlock_init_with_autoname(self, &data.xs_fastlock);
+	xt_init_mutex_with_autoname(self, &data.xs_mutex);
+
+	/**/
+	data.xs_interations = 10;
+	data.xs_which_lock = LOCK_SPINLOCK; // LOCK_SPINLOCK, LOCK_PTHREAD_MUTEX, LOCK_FASTLOCK
+	data.xs_which_job = JOB_PRINT;
+	data.xs_debug_print = TRUE;
+	data.xs_progress = 0;
+	data.xs_inc = 0;
+	lck_mutex_lock_test(self, &data, 2);
+	/**/
+
+	/**/
+	data.xs_interations = 100000;
+	data.xs_which_lock = LOCK_SPINLOCK; // LOCK_SPINLOCK, LOCK_PTHREAD_MUTEX, LOCK_FASTLOCK
+	data.xs_which_job = JOB_INCREMENT;
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	data.xs_inc = 0;
+	lck_mutex_lock_test(self, &data, 10);
+	/**/
+
+	/**/
+	data.xs_interations = 10000;
+	data.xs_which_lock = LOCK_SPINLOCK; // LOCK_SPINLOCK, LOCK_PTHREAD_MUTEX, LOCK_FASTLOCK
+	data.xs_which_job = JOB_MEMCPY;
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	data.xs_inc = 0;
+	lck_mutex_lock_test(self, &data, 10);
+	/**/
+
+	/**/
+	data.xs_interations = 1000;
+	data.xs_which_lock = LOCK_FASTLOCK; // LOCK_SPINLOCK, LOCK_PTHREAD_MUTEX, LOCK_FASTLOCK
+	data.xs_which_job = JOB_SLEEP;
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	data.xs_inc = 0;
+	lck_mutex_lock_test(self, &data, 10);
+	/**/
+
+	/**/
+	data.xs_interations = 100;
+	data.xs_which_lock = LOCK_FASTLOCK; // LOCK_SPINLOCK, LOCK_PTHREAD_MUTEX, LOCK_FASTLOCK
+	data.xs_which_job = JOB_SNOOZE;
+	data.xs_debug_print = FALSE;
+	data.xs_progress = 0;
+	data.xs_inc = 0;
+	lck_mutex_lock_test(self, &data, 10);
+	/**/
+
+	xt_spinlock_free(self, &data.xs_spinlock);
+	xt_fastlock_free(self, &data.xs_fastlock);
+	xt_free_mutex(&data.xs_mutex);
+}
+
+/*
+ * Exercise thread slot allocation.
+ *
+ * Creates and frees daemon threads ("test0" .. "test5") in a fixed
+ * sequence, printing each new thread's t_id and, after every step, the
+ * globals xt_thr_current_max_threads and xt_thr_current_thread_count.
+ * The results are only printed, not checked; the inline comments state
+ * what each step is expected to show. Every thread created here has been
+ * freed again by the time the function returns.
+ */
+xtPublic void xt_unit_test_create_threads(XTThreadPtr self)
+{
+	XTThreadPtr		threads[10];
+
+	printf("TEST: xt_unit_test_create_threads\n");
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Create some threads: */
+	threads[0] = xt_create_daemon(self, "test0");
+	printf("thread = %d\n", threads[0]->t_id);
+	threads[1] = xt_create_daemon(self, "test1");
+	printf("thread = %d\n", threads[1]->t_id);
+	threads[2] = xt_create_daemon(self, "test2");
+	printf("thread = %d\n", threads[2]->t_id);
+	threads[3] = xt_create_daemon(self, "test3");
+	printf("thread = %d\n", threads[3]->t_id);
+	threads[4] = xt_create_daemon(self, "test4");
+	printf("thread = %d\n", threads[4]->t_id);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Max stays the same: */
+	xt_free_thread(threads[3]);
+	xt_free_thread(threads[2]);
+	xt_free_thread(threads[1]);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Fill in the gaps: */
+	threads[1] = xt_create_daemon(self, "test1");
+	printf("thread = %d\n", threads[1]->t_id);
+	threads[2] = xt_create_daemon(self, "test2");
+	printf("thread = %d\n", threads[2]->t_id);
+	threads[3] = xt_create_daemon(self, "test3");
+	printf("thread = %d\n", threads[3]->t_id);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* And add one: */
+	threads[5] = xt_create_daemon(self, "test5");
+	printf("thread = %d\n", threads[5]->t_id);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Max stays the same: */
+	xt_free_thread(threads[3]);
+	xt_free_thread(threads[2]);
+	xt_free_thread(threads[1]);
+	xt_free_thread(threads[4]);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Recalculate the max: */
+	xt_free_thread(threads[5]);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	/* Fill in the gaps: */
+	threads[1] = xt_create_daemon(self, "test1");
+	printf("thread = %d\n", threads[1]->t_id);
+	threads[2] = xt_create_daemon(self, "test2");
+	printf("thread = %d\n", threads[2]->t_id);
+	threads[3] = xt_create_daemon(self, "test3");
+	printf("thread = %d\n", threads[3]->t_id);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+
+	xt_free_thread(threads[3]);
+	xt_free_thread(threads[2]);
+	xt_free_thread(threads[1]);
+	xt_free_thread(threads[0]);
+	printf("current max threads = %d, in use = %d\n", xt_thr_current_max_threads, xt_thr_current_thread_count);
+}
+
+#ifdef UNUSED_CODE
+/*
+ * Release the permanent row locks recorded on lock_list, newest first,
+ * back to and including the lock on the group of 'row' in this table.
+ *
+ * A temporary lock held by 'ot' is first made permanent. Returns
+ * XT_NO_LOCK if the list is empty, or if the group of 'row' is not locked
+ * by the calling thread's transaction; otherwise returns XT_PERM_LOCK.
+ *
+ * On return lock_list->rll_release_point is the index of the oldest list
+ * entry that was processed; bl_count itself is not changed.
+ *
+ * Locks that belong to another table are cleared in that table's lock
+ * structure, which is reached by opening the table named by the lock
+ * entry (ptab_id) from the pool.
+ */
+int XTRowLocks::xt_release_locks(struct XTOpenTable *ot, xtRowID row, XTRowLockListPtr lock_list)
+{
+	if (ot->ot_temp_row_lock)
+		xt_make_lock_permanent(ot, lock_list);
+
+	if (!lock_list->bl_count)
+		return XT_NO_LOCK;
+
+	int					group, pgroup;
+	XTXactDataPtr		xact;
+	xtTableID			tab_id, ptab_id;
+	XTPermRowLockPtr	plock;
+	XTOpenTablePtr		pot = NULL;
+	XTRowLocksPtr		row_locks;
+
+	/* Do I have the lock? */
+	group = row % XT_ROW_LOCK_COUNT;
+	if (!(xact = tab_row_locks[group]))
+		/* There is no lock: */
+		return XT_NO_LOCK;
+
+	if (xact != ot->ot_thread->st_xact_data)
+		/* There is a lock but it does not belong to me! */
+		return XT_NO_LOCK;
+
+	tab_id = ot->ot_table->tab_id;
+	/* Start one past the last entry and walk backwards: */
+	plock = (XTPermRowLockPtr) &lock_list->bl_data[lock_list->bl_count * lock_list->bl_item_size];
+	lock_list->rll_release_point = lock_list->bl_count;
+	for (u_int i=0; i<lock_list->bl_count; i++) {
+		plock--;
+
+		pgroup = plock->pr_group;
+		ptab_id = plock->pr_tab_id;
+
+		if (ptab_id == tab_id)
+			row_locks = this;
+		else {
+			if (pot) {
+				/* Reuse the pool table if it is the one we need: */
+				if (pot->ot_table->tab_id == ptab_id)
+					goto remove_lock;
+				xt_db_return_table_to_pool_ns(pot);
+				pot = NULL;
+			}
+
+			/* Open the table the lock belongs to (ptab_id), not the
+			 * current table: this branch is only reached when the
+			 * two differ.
+			 */
+			if (!xt_db_open_pool_table_ns(&pot, ot->ot_table->tab_db, ptab_id)) {
+				/* Should not happen, but just in case, we just don't
+				 * remove the lock. We will probably end up with a deadlock
+				 * somewhere.
+				 */
+				xt_log_and_clear_exception_ns();
+				goto skip_remove_lock;
+			}
+			if (!pot)
+				/* Can happen if the table has been dropped: */
+				goto skip_remove_lock;
+
+			remove_lock:
+			row_locks = &pot->ot_table->tab_locks;
+		}
+
+#ifdef XT_TRACE_LOCKS
+		xt_ttracef(xt_get_self(), "release lock group=%d\n", pgroup);
+#endif
+		row_locks->tab_row_locks[pgroup] = NULL;
+		row_locks->tab_lock_perm[pgroup] = 0;
+		skip_remove_lock:;
+
+		lock_list->rll_release_point--;
+		/* Stop once the lock on the requested row's group is gone: */
+		if (tab_id == ptab_id && group == pgroup)
+			break;
+	}
+
+	if (pot)
+		xt_db_return_table_to_pool_ns(pot);
+	return XT_PERM_LOCK;
+}
+
+/*
+ * Regain, in list order, the permanent locks from
+ * lock_list->rll_release_point up to the end of the list.
+ *
+ * Returns FAILED if a table could not be opened from the pool. Otherwise
+ * returns OK with *lock_type set to XT_NO_LOCK when every lock was
+ * regained, or with *lock_type / *xn_id describing the conflicting lock
+ * and the transaction that holds it when a group is locked by another
+ * transaction. In that case rll_release_point is left at the entry that
+ * could not be regained.
+ *
+ * Any table opened from the pool is returned to the pool before the
+ * function returns.
+ */
+xtBool XTRowLocks::xt_regain_locks(struct XTOpenTable *ot, int *lock_type, xtXactID *xn_id, XTRowLockListPtr lock_list)
+{
+	int					group;
+	XTXactDataPtr		xact, my_xact;
+	XTPermRowLockPtr	plock;
+	xtTableID			tab_id;
+	XTOpenTablePtr		pot = NULL;
+	XTRowLocksPtr		row_locks = NULL;
+	XTTableHPtr			tab = NULL;
+
+	for (u_int i=lock_list->rll_release_point; i<lock_list->bl_count; i++) {
+		plock = (XTPermRowLockPtr) &lock_list->bl_data[i * lock_list->bl_item_size];
+
+		my_xact = ot->ot_thread->st_xact_data;
+		group = plock->pr_group;
+		tab_id = plock->pr_tab_id;
+
+		if (tab_id == ot->ot_table->tab_id) {
+			row_locks = this;
+			tab = ot->ot_table;
+		}
+		else {
+			/* Reuse the pool table if it is the one we need: */
+			if (!pot || tab_id != pot->ot_table->tab_id) {
+				if (pot) {
+					xt_db_return_table_to_pool_ns(pot);
+					pot = NULL;
+				}
+
+				if (!xt_db_open_pool_table_ns(&pot, ot->ot_table->tab_db, tab_id))
+					return FAILED;
+				if (!pot) {
+					/* No table was returned (xt_release_locks() treats
+					 * this as "table dropped"). There is nothing to
+					 * lock, and tab/row_locks would be NULL or stale,
+					 * so skip this entry.
+					 * NOTE(review): skipping is assumed to be the
+					 * intent of the original no_gain_lock label,
+					 * please confirm.
+					 */
+					lock_list->rll_release_point++;
+					continue;
+				}
+			}
+
+			tab = pot->ot_table;
+			row_locks = &tab->tab_locks;
+		}
+
+#ifdef XT_TRACE_LOCKS
+		xt_ttracef(xt_get_self(), "regain lock group=%d\n", group);
+#endif
+		XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[group % XT_ROW_RWLOCKS], ot->ot_thread);
+		if ((xact = row_locks->tab_row_locks[group])) {
+			if (xact != my_xact) {
+				/* Somebody else holds the group, report who: */
+				*xn_id = xact->xd_start_xn_id;
+				*lock_type = row_locks->tab_lock_perm[group] ? XT_PERM_LOCK : XT_TEMP_LOCK;
+				goto done;
+			}
+		}
+		else
+			row_locks->tab_row_locks[group] = my_xact;
+		row_locks->tab_lock_perm[group] = 1;
+		XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[group % XT_ROW_RWLOCKS], ot->ot_thread);
+		lock_list->rll_release_point++;
+	}
+	*lock_type = XT_NO_LOCK;
+	if (pot)
+		xt_db_return_table_to_pool_ns(pot);
+	return OK;
+
+	done:
+	/* Unlock first: tab may belong to the pool table returned below. */
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[group % XT_ROW_RWLOCKS], ot->ot_thread);
+	if (pot)
+		xt_db_return_table_to_pool_ns(pot);
+	return OK;
+}
+
+#endif
diff --git a/storage/pbxt/src/lock_xt.h b/storage/pbxt/src/lock_xt.h
new file mode 100644
index 00000000000..4e5af648c37
--- /dev/null
+++ b/storage/pbxt/src/lock_xt.h
@@ -0,0 +1,772 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2008-01-24	Paul McCullagh
+ *
+ * Row lock functions.
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_lock_h__
+#define __xt_lock_h__
+
+#include "xt_defs.h"
+#include "util_xt.h"
+#include "locklist_xt.h"
+#include "pthread_xt.h"
+
+struct XTThread;
+struct XTDatabase;
+struct XTOpenTable;
+struct XTXactData;
+struct XTTable;
+
+#ifdef XT_ATOMIC_SOLARIS_LIB
+#include <atomic.h>
+#endif
+
+void xt_log_atomic_error_and_abort(c_char *func, c_char *file, u_int line);
+
+/*
+ * -----------------------------------------------------------------------
+ * ATOMIC OPERATIONS
+ */
+
+/*
+ * This macro is to remind me where it was safe
+ * to use a read lock!
+ */
+#define xt_lck_slock		xt_spinlock_lock
+
+/* I call these operations flushed because the result
+ * is written atomically.
+ * But the operations themselves are not atomic!
+ */
+/*
+ * Increment the byte at mptr.
+ *
+ * On the x86 branches the value is loaded, incremented in a register and
+ * written back with XCHG, so the write is atomic but the
+ * read-modify-write as a whole is not. The Solaris branch uses
+ * atomic_inc_8(). Without any atomic support the byte is incremented
+ * with a plain store and xt_log_atomic_error_and_abort() is called.
+ */
+inline void xt_atomic_inc1(volatile xtWord1 *mptr)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  DL, BYTE PTR [ECX]
+	__asm INC  DL
+	__asm XCHG DL, BYTE PTR [ECX]
+#elif defined(XT_ATOMIC_GNUC_X86)
+	xtWord1 val;
+
+	asm volatile ("movb %1,%0" : "=r" (val) : "m" (*mptr) : "memory");
+	val++;
+	asm volatile ("xchgb %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	atomic_inc_8(mptr);
+#else
+	/* Parenthesised: "*mptr++" would advance the pointer, not the value. */
+	(*mptr)++;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+}
+
+/*
+ * Decrement the byte at mptr and return the decremented value.
+ *
+ * On the x86 branches the value is loaded, decremented in a register and
+ * written back with XCHG; the value swapped out of memory is discarded.
+ * The Solaris branch uses atomic_dec_8_nv(). Without any atomic support
+ * the byte is decremented in place and xt_log_atomic_error_and_abort()
+ * is called.
+ */
+inline xtWord1 xt_atomic_dec1(volatile xtWord1 *mptr)
+{
+	xtWord1 val;
+
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  DL, BYTE PTR [ECX]
+	__asm DEC  DL
+	__asm MOV  val, DL
+	__asm XCHG DL, BYTE PTR [ECX]
+#elif defined(XT_ATOMIC_GNUC_X86)
+	xtWord1 val2;
+
+	asm volatile ("movb %1, %0" : "=r" (val) : "m" (*mptr) : "memory");
+	val--;
+	/* val2 only receives the old memory contents, it is not used: */
+	asm volatile ("xchgb %1,%0" : "=r" (val2) : "m" (*mptr), "0" (val) : "memory");
+	/* Should work, but compiler makes a mistake?
+	 * asm volatile ("xchgb %1, %0" : : "r" (val), "m" (*mptr) : "memory");
+	 */
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	val = atomic_dec_8_nv(mptr);
+#else
+	val = --(*mptr);
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+	return val;
+}
+
+/*
+ * Increment the 2-byte word at mptr.
+ *
+ * Uses a LOCK-prefixed INC on x86, __sync_fetch_and_add() with the GCC
+ * builtins, or atomic_inc_16_nv() on Solaris. Without any of these the
+ * word is incremented with a plain store and
+ * xt_log_atomic_error_and_abort() is called.
+ */
+inline void xt_atomic_inc2(volatile xtWord2 *mptr)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm LOCK INC	WORD PTR [ECX]
+#elif defined(XT_ATOMIC_GNUC_X86)
+	asm volatile ("lock; incw %0" : : "m" (*mptr) : "memory");
+#elif defined(XT_ATOMIC_GCC_OPS)
+	__sync_fetch_and_add(mptr, 1);
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	atomic_inc_16_nv(mptr);
+#else
+	(*mptr)++;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+}
+
+/*
+ * Decrement the 2-byte word at mptr.
+ *
+ * Uses a LOCK-prefixed DEC on x86, __sync_fetch_and_sub() with the GCC
+ * builtins, or atomic_dec_16_nv() on Solaris. Without any of these the
+ * word is decremented with a plain store and
+ * xt_log_atomic_error_and_abort() is called.
+ */
+inline void xt_atomic_dec2(volatile xtWord2 *mptr)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm LOCK DEC	WORD PTR [ECX]
+#elif defined(XT_ATOMIC_GNUC_X86)
+	asm volatile ("lock; decw %0" : : "m" (*mptr) : "memory");
+#elif defined(XT_ATOMIC_GCC_OPS)
+	__sync_fetch_and_sub(mptr, 1);
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	atomic_dec_16_nv(mptr);
+#else
+	--(*mptr);
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+}
+
+/* Atomic test and set 2 byte word!
+ *
+ * Stores val in the word at mptr and returns the value the word held
+ * before. The x86 branches use XCHG, the Solaris branch uses
+ * atomic_swap_16(). Without atomic support the swap is done with two
+ * plain accesses and xt_log_atomic_error_and_abort() is called.
+ */
+inline xtWord2 xt_atomic_tas2(volatile xtWord2 *mptr, xtWord2 val)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  DX, val
+	__asm XCHG DX, WORD PTR [ECX]
+	__asm MOV  val, DX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	asm volatile ("xchgw %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	val = atomic_swap_16(mptr, val);
+#else
+	/* Yikes! */
+	xtWord2 nval = val;
+
+	val = *mptr;
+	*mptr = nval;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+	return val;
+}
+
+/*
+ * Store val in the 4-byte word at mptr.
+ *
+ * The x86 branches use XCHG rather than a plain MOV (the MOV variants
+ * are kept as comments); the previous contents are discarded. The
+ * Solaris branch uses atomic_swap_32(). Without atomic support a plain
+ * store is done and xt_log_atomic_error_and_abort() is called.
+ */
+inline void xt_atomic_set4(volatile xtWord4 *mptr, xtWord4 val)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  EDX, val
+	__asm XCHG EDX, DWORD PTR [ECX]
+	//__asm MOV  DWORD PTR [ECX], EDX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	asm volatile ("xchgl %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
+	//asm volatile ("movl %0,%1" : "=r" (val) : "m" (*mptr) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	atomic_swap_32(mptr, val);
+#else
+	*mptr = val;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+}
+
+/*
+ * Test and set 4 byte word.
+ *
+ * Stores val in the word at mptr and returns the value the word held
+ * before. The x86 branches use XCHG, the Solaris branch uses
+ * atomic_swap_32(). Without atomic support the swap is done with two
+ * plain accesses (as in xt_atomic_tas2()) and
+ * xt_log_atomic_error_and_abort() is called.
+ */
+inline xtWord4 xt_atomic_tas4(volatile xtWord4 *mptr, xtWord4 val)
+{
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, mptr
+	__asm MOV  EDX, val
+	__asm XCHG EDX, DWORD PTR [ECX]
+	__asm MOV  val, EDX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	val = val;
+	asm volatile ("xchgl %1,%0" : "=r" (val) : "m" (*mptr), "0" (val) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	val = atomic_swap_32(mptr, val);
+#else
+	/* Yikes! Return the previous contents, like every other branch. */
+	xtWord4 nval = val;
+
+	val = *mptr;
+	*mptr = nval;
+	xt_log_atomic_error_and_abort(__FUNC__, __FILE__, __LINE__);
+#endif
+	return val;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * DIFFERENT TYPES OF LOCKS
+ */
+
+/*
+ * A spin lock. xt_spinlock_set() swaps 1 into spl_lock and
+ * xt_spinlock_reset() swaps 0 back in.
+ */
+typedef struct XTSpinLock {
+	/* 1 while the lock is held, 0 when it is free: */
+	volatile xtWord4			spl_lock;
+#ifdef XT_NO_ATOMICS
+	/* Taken by xt_spinlock_set()/released by xt_spinlock_reset() in the
+	 * branch that has no atomic operations: */
+	xt_mutex_type				spl_mutex;
+#endif
+#ifdef DEBUG
+	/* Cleared by xt_spinlock_reset(); presumably set to the owning thread
+	 * by xt_spinlock_set_thread() -- TODO confirm: */
+	struct XTThread				*spl_locker;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			spl_lock_info;
+	const char				    *spl_name;
+#endif
+} XTSpinLockRec, *XTSpinLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp, const char *name);
+#else
+#define xt_spinlock_init_with_autoname(a,b) xt_spinlock_init(a,b)
+void	xt_spinlock_init(struct XTThread *self, XTSpinLockPtr sp);
+#endif
+void	xt_spinlock_free(struct XTThread *self, XTSpinLockPtr sp);
+xtBool	xt_spinlock_spin(XTSpinLockPtr spl);
+#ifdef DEBUG
+void	xt_spinlock_set_thread(XTSpinLockPtr spl);
+#endif
+
+/* Code for test and set is derived from code by Larry Zhou and
+ * Google: http://code.google.com/p/google-perftools
+ */
+/*
+ * Make one attempt to take the spin lock: swap 1 into spl->spl_lock and
+ * return the value that was there before. A return value of 0 means the
+ * caller now holds the lock; non-zero means it was already held.
+ *
+ * In the branch without atomic operations the call blocks on spl_mutex
+ * instead and always returns 0. In DEBUG builds xt_spinlock_set_thread()
+ * is called when the lock was obtained.
+ */
+inline xtWord4 xt_spinlock_set(XTSpinLockPtr spl)
+{
+	xtWord4				prv;
+	volatile xtWord4	*lck;
+				
+	lck = &spl->spl_lock;
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, lck
+	__asm MOV  EDX, 1
+	__asm XCHG EDX, DWORD PTR [ECX]
+	__asm MOV  prv, EDX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	prv = 1;
+	asm volatile ("xchgl %1,%0" : "=r" (prv) : "m" (*lck), "0" (prv) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	prv = atomic_swap_32(lck, 1);
+#else
+	/* The default implementation just uses a mutex, and
+	 * does not spin! */
+	xt_lock_mutex_ns(&spl->spl_mutex);
+	/* We have the lock */
+	*lck = 1;
+	prv = 0;
+#endif
+#ifdef DEBUG
+	if (!prv)
+		xt_spinlock_set_thread(spl);
+#endif
+	return prv;
+}
+
+/*
+ * Release the spin lock: swap 0 into spl->spl_lock and return the value
+ * that was there before. In DEBUG builds spl_locker is cleared first.
+ *
+ * In the branch without atomic operations spl_lock is cleared, spl_mutex
+ * is unlocked and 1 is returned.
+ */
+inline xtWord4 xt_spinlock_reset(XTSpinLockPtr spl)
+{
+	xtWord4				prv;
+	volatile xtWord4	*lck;
+				
+#ifdef DEBUG
+	spl->spl_locker = NULL;
+#endif
+	lck = &spl->spl_lock;
+#ifdef XT_ATOMIC_WIN32_X86
+	__asm MOV  ECX, lck
+	__asm MOV  EDX, 0
+	__asm XCHG EDX, DWORD PTR [ECX]
+	__asm MOV  prv, EDX
+#elif defined(XT_ATOMIC_GNUC_X86)
+	prv = 0;
+	asm volatile ("xchgl %1,%0" : "=r" (prv) : "m" (*lck), "0" (prv) : "memory");
+#elif defined(XT_ATOMIC_SOLARIS_LIB)
+	prv = atomic_swap_32(lck, 0);
+#else
+	*lck = 0;
+	xt_unlock_mutex_ns(&spl->spl_mutex);
+	prv = 1;
+#endif
+	return prv;
+}
+
+/*
+ * Return FALSE, and register an error on failure.
+ */
+/*
+ * Take the spin lock. One attempt is made with xt_spinlock_set(); if the
+ * lock is already held the call falls back to xt_spinlock_spin() and
+ * returns its result. When lock info is compiled in, the caller is
+ * registered as owner whenever the lock was obtained.
+ */
+inline xtBool xt_spinlock_lock(XTSpinLockPtr spl)
+{
+	if (xt_spinlock_set(spl)) {
+		/* Contended: */
+		xtBool spun = xt_spinlock_spin(spl);
+
+#ifdef XT_THREAD_LOCK_INFO
+		if (spun)
+			xt_thread_lock_info_add_owner(&spl->spl_lock_info);
+#endif
+		return spun;
+	}
+
+	/* Got it at the first attempt: */
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&spl->spl_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release the spin lock with xt_spinlock_reset() and, when lock info is
+ * compiled in, remove the caller as registered owner.
+ */
+inline void xt_spinlock_unlock(XTSpinLockPtr spl)
+{
+	xt_spinlock_reset(spl);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&spl->spl_lock_info);
+#endif
+}
+
+/* Possibilities are 2 = align 4 or 3 = align 8 */
+#define XT_XS_LOCK_SHIFT		2
+#define XT_XS_LOCK_ALIGN		(1 << XT_XS_LOCK_SHIFT)
+
+/* This lock is fast for reads but slow for writes.
+ * Use this lock in situations where you have 99% reads,
+ * and then some potentially long writes.
+ */
+/*
+ * State of a read/write mutex; operated on by xt_rwmutex_xlock(),
+ * xt_rwmutex_slock() and xt_rwmutex_unlock().
+ */
+typedef struct XTRWMutex {
+#ifdef DEBUG
+	struct XTThread				*xs_lock_thread;
+	u_int						xs_inited;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			xs_lock_info;
+	const char				    *xs_name;
+#endif
+	xt_mutex_type				xs_lock;
+	xt_cond_type				xs_cond;
+	volatile xtWord4			xs_state;
+	volatile xtThreadID			xs_xlocker;
+	/* The same pointer viewed as bytes (xs_rlock) or as words whose size
+	 * matches XT_XS_LOCK_ALIGN (xs_rlock_align): */
+	union {
+#if XT_XS_LOCK_ALIGN == 4
+		volatile xtWord4		*xs_rlock_align;
+#else
+		volatile  xtWord8		*xs_rlock_align;
+#endif
+		volatile  xtWord1		*xs_rlock;
+	}							x;
+} XTRWMutexRec, *XTRWMutexPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl, const char *name);
+#else
+#define xt_rwmutex_init_with_autoname(a,b) xt_rwmutex_init(a,b)
+void xt_rwmutex_init(struct XTThread *self, XTRWMutexPtr xsl);
+#endif
+void xt_rwmutex_free(struct XTThread *self, XTRWMutexPtr xsl);
+xtBool xt_rwmutex_xlock(XTRWMutexPtr xsl, xtThreadID thd_id);
+xtBool xt_rwmutex_slock(XTRWMutexPtr xsl, xtThreadID thd_id);
+xtBool xt_rwmutex_unlock(XTRWMutexPtr xsl, xtThreadID thd_id);
+
+#define XT_FAST_LOCK_MAX_WAIT	100
+
+/*
+ * A "fast lock": a spin lock (fal_spinlock) plus a list of waiting
+ * threads with room for XT_FAST_LOCK_MAX_WAIT entries.
+ */
+typedef struct XTFastLock {
+	XTSpinLockRec				fal_spinlock;
+	/* Set by xt_fastlock_lock() when the lock is obtained at the first
+	 * attempt, cleared by xt_fastlock_unlock() when nobody is waiting: */
+	struct XTThread				*fal_locker;
+
+	XTSpinLockRec				fal_wait_lock;
+	/* Non-zero makes xt_fastlock_unlock() call xt_fastlock_wakeup(): */
+	u_int						fal_wait_count;
+	u_int						fal_wait_wakeup;
+	u_int						fal_wait_alloc;
+	struct XTThread				*fal_wait_list[XT_FAST_LOCK_MAX_WAIT];
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			fal_lock_info;
+	const char				    *fal_name;
+#endif
+} XTFastLockRec, *XTFastLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_fastlock_init_with_autoname(a,b) xt_fastlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void	xt_fastlock_init(struct XTThread *self, XTFastLockPtr spl, const char *name);
+#else
+#define xt_fastlock_init_with_autoname(a,b) xt_fastlock_init(a,b)
+void	xt_fastlock_init(struct XTThread *self, XTFastLockPtr spl);
+#endif
+void	xt_fastlock_free(struct XTThread *self, XTFastLockPtr spl);
+void	xt_fastlock_wakeup(XTFastLockPtr spl);
+xtBool	xt_fastlock_spin(XTFastLockPtr spl, struct XTThread *thread);
+
+/*
+ * Take the fast lock for 'thread'. One attempt is made on the embedded
+ * spin lock; on success 'thread' is recorded in fal_locker. Otherwise the
+ * call falls back to xt_fastlock_spin() and returns its result. When lock
+ * info is compiled in, the caller is registered as owner whenever the
+ * lock was obtained.
+ */
+inline xtBool xt_fastlock_lock(XTFastLockPtr fal, struct XTThread *thread)
+{
+	if (xt_spinlock_set(&fal->fal_spinlock)) {
+		/* Contended: */
+		xtBool spun = xt_fastlock_spin(fal, thread);
+
+#ifdef XT_THREAD_LOCK_INFO
+		if (spun)
+			xt_thread_lock_info_add_owner(&fal->fal_lock_info);
+#endif
+		return spun;
+	}
+
+	/* Got it at the first attempt: */
+	fal->fal_locker = thread;
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&fal->fal_lock_info);
+#endif
+	return OK;
+}
+
+/*
+ * Release the fast lock. With nobody waiting, fal_locker is cleared and
+ * the embedded spin lock is reset; otherwise xt_fastlock_wakeup() is
+ * called instead. When lock info is compiled in, the caller is removed
+ * as registered owner.
+ */
+inline void xt_fastlock_unlock(XTFastLockPtr fal, struct XTThread *XT_UNUSED(thread))
+{
+	if (!fal->fal_wait_count) {
+		fal->fal_locker = NULL;
+		xt_spinlock_reset(&fal->fal_spinlock);
+	}
+	else
+		xt_fastlock_wakeup(fal);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&fal->fal_lock_info);
+#endif
+}
+
+#define XT_SXS_SLOCK_COUNT		2
+
+/*
+ * State of the lock operated on by xt_spinxslock_xlock(),
+ * xt_spinxslock_slock() and xt_spinxslock_unlock().
+ * NOTE(review): the names suggest a spinning exclusive/shared lock;
+ * the field semantics are not visible here, confirm in lock_xt.cc.
+ */
+typedef struct XTSpinXSLock {
+	volatile xtWord2			sxs_xlocked;
+	volatile xtWord2			sxs_xwaiter;
+	volatile xtWord2			sxs_rlock_count;
+	volatile xtWord2			sxs_wait_count;			/* The number of readers waiting for the xlocker. */
+#ifdef DEBUG
+	xtThreadID					sxs_locker;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			sxs_lock_info;
+	const char				    *sxs_name;
+#endif
+} XTSpinXSLockRec, *XTSpinXSLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_spinxslock_init_with_autoname(a,b) xt_spinxslock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_spinxslock_init(struct XTThread *self, XTSpinXSLockPtr sxs, const char *name);
+#else
+#define xt_spinxslock_init_with_autoname(a,b) xt_spinxslock_init(a,b)
+void xt_spinxslock_init(struct XTThread *self, XTSpinXSLockPtr sxs);
+#endif
+void xt_spinxslock_free(struct XTThread *self, XTSpinXSLockPtr sxs);
+xtBool xt_spinxslock_xlock(XTSpinXSLockPtr sxs, xtBool try_lock, xtThreadID thd_id);
+xtBool xt_spinxslock_slock(XTSpinXSLockPtr sxs);
+xtBool xt_spinxslock_unlock(XTSpinXSLockPtr sxs, xtBool xlocked);
+
+/*
+ * State of the lock operated on by xt_xsmutex_xlock(),
+ * xt_xsmutex_slock() and xt_xsmutex_unlock(): one mutex, two condition
+ * variables, the ID of the exclusive locker and two counters.
+ */
+typedef struct XTXSMutexLock {
+	xt_mutex_type				xsm_lock;
+	xt_cond_type				xsm_cond;
+	xt_cond_type				xsm_cond_2;
+	volatile xtThreadID			xsm_xlocker;
+	volatile xtWord2			xsm_rlock_count;
+	volatile xtWord2			xsm_wait_count;			/* The number of readers waiting for the xlocker. */
+#ifdef DEBUG
+	xtThreadID					xsm_locker;
+#endif
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			xsm_lock_info;
+	const char				    *xsm_name;
+#endif
+} XTXSMutexRec, *XTXSMutexLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_xsmutex_init_with_autoname(a,b) xt_xsmutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm, const char *name);
+#else
+#define xt_xsmutex_init_with_autoname(a,b) xt_xsmutex_init(a,b)
+void xt_xsmutex_init(struct XTThread *self, XTXSMutexLockPtr xsm);
+#endif
+
+void xt_xsmutex_free(struct XTThread *self, XTXSMutexLockPtr xsm);
+xtBool xt_xsmutex_xlock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
+xtBool xt_xsmutex_slock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
+xtBool xt_xsmutex_unlock(XTXSMutexLockPtr xsm, xtThreadID thd_id);
+
+/*
+ * State of the lock operated on by xt_atomicrwlock_xlock(),
+ * xt_atomicrwlock_slock() and xt_atomicrwlock_unlock().
+ * NOTE(review): presumably a reader count plus an exclusive-lock flag,
+ * as the field names say; confirm in lock_xt.cc.
+ */
+typedef struct XTAtomicRWLock {
+	volatile xtWord2			arw_reader_count;
+	volatile xtWord2			arw_xlock_set;
+
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			arw_lock_info;
+	const char				    *arw_name;
+#endif
+#ifdef DEBUG
+	xtThreadID					arw_locker;
+#endif
+} XTAtomicRWLockRec, *XTAtomicRWLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_atomicrwlock_init_with_autoname(a,b) xt_atomicrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_atomicrwlock_init(struct XTThread *self, XTAtomicRWLockPtr xsl, const char *name);
+#else
+#define xt_atomicrwlock_init_with_autoname(a,b) xt_atomicrwlock_init(a,b)
+void xt_atomicrwlock_init(struct XTThread *self, XTAtomicRWLockPtr xsl);
+#endif
+void xt_atomicrwlock_free(struct XTThread *self, XTAtomicRWLockPtr xsl);
+xtBool xt_atomicrwlock_xlock(XTAtomicRWLockPtr xsl, xtBool try_lock, xtThreadID thr_id);
+xtBool xt_atomicrwlock_slock(XTAtomicRWLockPtr xsl);
+xtBool xt_atomicrwlock_unlock(XTAtomicRWLockPtr xsl, xtBool xlocked);
+
+/*
+ * State of the lock operated on by xt_skewrwlock_xlock(),
+ * xt_skewrwlock_slock() and xt_skewrwlock_unlock(). The layout is the
+ * same as that of XTAtomicRWLock.
+ * NOTE(review): how the "skew" variant differs in behaviour is not
+ * visible here; confirm in lock_xt.cc.
+ */
+typedef struct XTSkewRWLock {
+	volatile xtWord2			srw_reader_count;
+	volatile xtWord2			srw_xlock_set;
+
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			srw_lock_info;
+	const char				    *srw_name;
+#endif
+#ifdef DEBUG
+	xtThreadID					srw_locker;
+#endif
+} XTSkewRWLockRec, *XTSkewRWLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_skewrwlock_init_with_autoname(a,b) xt_skewrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_skewrwlock_init(struct XTThread *self, XTSkewRWLockPtr xsl, const char *name);
+#else
+#define xt_skewrwlock_init_with_autoname(a,b) xt_skewrwlock_init(a,b)
+void xt_skewrwlock_init(struct XTThread *self, XTSkewRWLockPtr xsl);
+#endif
+void xt_skewrwlock_free(struct XTThread *self, XTSkewRWLockPtr xsl);
+xtBool xt_skewrwlock_xlock(XTSkewRWLockPtr xsl, xtBool try_lock, xtThreadID thr_id);
+xtBool xt_skewrwlock_slock(XTSkewRWLockPtr xsl);
+xtBool xt_skewrwlock_unlock(XTSkewRWLockPtr xsl, xtBool xlocked);
+
+void xt_unit_test_read_write_locks(struct XTThread *self);
+void xt_unit_test_mutex_locks(struct XTThread *self);
+void xt_unit_test_create_threads(struct XTThread *self);
+
+/*
+ * -----------------------------------------------------------------------
+ * ROW LOCKS
+ */
+
+/*
+ * [(9)]
+ *
+ * These are permanent row locks. They are set on rows for 2 reasons:
+ *
+ * 1. To lock a row that is being updated. The row is locked
+ *    when it is read, until the point that it is updated. If the row
+ *    is not updated, the lock is removed.
+ *    This prevents an update coming between which will cause an error
+ *    on the first thread.
+ *
+ * 2. The locks are used to implement SELECT FOR UPDATE.
+ */
+
+/*
+ * A lock that is set in order to perform an update is a temporary lock.
+ * This lock will be removed once the update of the record is done.
+ * The objective is to prevent some other thread from changing the
+ * record between the time the record is read and updated. This is to
+ * prevent unnecessary "Record was updated" errors.
+ *
+ * A permanent lock is set by a SELECT FOR UPDATE. These locks are
+ * held until the end of the transaction.
+ *
+ * However, a SELECT FOR UPDATE will pop its lock stack before
+ * waiting for a transaction that has updated a record.
+ * This is to prevent the deadlock that can occur because a
+ * SELECT FOR UPDATE locks groups of records (I mean in general the
+ * locks used are group locks).
+ *
+ * This means a SELECT FOR UPDATE can get ahead of an UPDATE as far as
+ * locking is concerned. Example:
+ *
+ * Record 1,2 and 3 are in group A.
+ *
+ * T1: UPDATES record 2.
+ * T2: SELECT FOR UPDATE record 1, which locks group A.
+ * T2: SELECT FOR UPDATE record 2, which must wait for T1.
+ * T1: UPDATES record 3, which must wait because of group lock A.
+ *
+ * To avoid deadlock, T2 releases its group lock A before waiting for
+ * record 2. It then regains the lock after waiting for record 2.
+ *
+ * (NOTE: Locks are no longer released. Please check this comment:
+ * {RELEASING-LOCKS} in lock_xt.cc. )
+ *
+ * However, releasing the group A lock means first releasing all locks
+ * gained after the group A lock.
+ *
+ * For example: a thread locks groups: A, B and C. To release group B
+ * lock the thread must release C as well. Afterwards, it must gain
+ * B and C again, in that order. This is to ensure that the lock
+ * order is NOT changed!
+ *
+ */
+#define XT_LOCK_ERR					-1
+#define XT_NO_LOCK					0
+#define XT_TEMP_LOCK				1								/* A temporary lock */
+#define XT_PERM_LOCK				2								/* A permanent lock */
+
+typedef struct XTRowLockList : public XTBasicList {
+	void	xt_remove_all_locks(struct XTDatabase *db, struct XTThread *thread);
+} XTRowLockListRec, *XTRowLockListPtr;
+
+#define XT_USE_LIST_BASED_ROW_LOCKS
+
+#ifdef XT_USE_LIST_BASED_ROW_LOCKS
+/*
+ * This method stores each lock, and avoids conflicts.
+ * But it is a bit more expensive in time.
+ */
+
+#ifdef DEBUG
+#define XT_TEMP_LOCK_BYTES				10
+#define XT_ROW_LOCK_GROUP_COUNT			5
+#else
+#define XT_TEMP_LOCK_BYTES				0xFFFF
+#define XT_ROW_LOCK_GROUP_COUNT			23
+#endif
+
+typedef struct XTLockWait {
+	/* Information about the lock to be acquired: */
+	struct XTThread			*lw_thread;
+	struct XTOpenTable		*lw_ot;
+	xtRowID					lw_row_id;
+
+	/* This is the lock currently held, and the transaction ID: */
+	int						lw_curr_lock;			/* NOTE(review): presumably one of the XT_*_LOCK values - confirm. */
+	xtXactID				lw_xn_id;
+
+	/* This is information about the updating transaction: */
+	xtBool					lw_row_updated;
+	xtXactID				lw_updating_xn_id;
+
+	/* Pointers for the lock list (XTLockGroup keeps its wait queue as XTLockWait pointers): */
+	struct XTLockWait		*lw_next;
+	struct XTLockWait		*lw_prev;
+} XTLockWaitRec, *XTLockWaitPtr;
+
+typedef struct XTLockItem {
+	xtRowID					li_row_id;				/* The row list is sorted in this value. */
+	xtWord2					li_count;				/* The number of consecutive rows locked. FFFF means a temporary lock. */
+	xtWord2					li_thread_id;			/* The thread that holds this lock. */
+} XTLockItemRec, *XTLockItemPtr;
+
+typedef struct XTLockGroup {
+	XTSpinLockRec			lg_lock;				/* A lock for the list. */
+	XTLockWaitPtr			lg_wait_queue;			/* A queue of threads waiting for a lock in this group. */
+	XTLockWaitPtr			lg_wait_queue_end;		/* The end of the thread queue. */
+	size_t						lg_list_size;			/* The size of the list. */
+	size_t						lg_list_in_use;			/* Number of slots on the list in use. */
+	XTLockItemPtr			lg_list;				/* List of locks. */
+} XTLockGroupRec, *XTLockGroupPtr;
+
+struct XTLockWait;
+
+typedef struct XTRowLocks {
+	XTLockGroupRec			rl_groups[XT_ROW_LOCK_GROUP_COUNT];
+
+	void	xt_cancel_temp_lock(XTLockWaitPtr lw);
+	xtBool	xt_set_temp_lock(struct XTOpenTable *ot, XTLockWaitPtr lw, XTRowLockListPtr lock_list);
+	void	xt_remove_temp_lock(struct XTOpenTable *ot, xtBool updated);
+	xtBool	xt_make_lock_permanent(struct XTOpenTable *ot, XTRowLockListPtr lock_list);
+
+	xtBool	rl_lock_row(XTLockGroupPtr group, XTLockWaitPtr lw, XTRowLockListPtr lock_list, int *result);
+	void	rl_grant_locks(XTLockGroupPtr group, struct XTThread *thread);
+#ifdef DEBUG_LOCK_QUEUE
+	void	rl_check(XTLockWaitPtr lw);
+#endif
+} XTRowLocksRec, *XTRowLocksPtr;
+
+#define XT_USE_TABLE_REF
+
+typedef struct XTPermRowLock {
+#ifdef XT_USE_TABLE_REF
+	struct XTTable			*pr_table;
+#else
+	xtTableID				pr_tab_id;
+#endif
+	xtWord1					pr_group[XT_ROW_LOCK_GROUP_COUNT];
+} XTPermRowLockRec, *XTPermRowLockPtr;
+
+#else // XT_USE_LIST_BASED_ROW_LOCKS
+
+/* Hash based row locking. This method allows conflicts, even
+ * when there are none.
+ */
+typedef struct XTRowLocks {
+	xtWord1					tab_lock_perm[XT_ROW_LOCK_COUNT];		/* Byte set to 1 for permanent locks. */
+	struct XTXactData		*tab_row_locks[XT_ROW_LOCK_COUNT];		/* The transactions that have locked the specific rows. */
+
+	int		xt_set_temp_lock(struct XTOpenTable *ot, xtRowID row, xtXactID *xn_id, XTRowLockListPtr lock_list);
+	void	xt_remove_temp_lock(struct XTOpenTable *ot);
+	xtBool	xt_make_lock_permanent(struct XTOpenTable *ot, XTRowLockListPtr lock_list);
+	int		xt_is_locked(struct XTOpenTable *ot, xtRowID row, xtXactID *xn_id);
+} XTRowLocksRec, *XTRowLocksPtr;
+
+typedef struct XTPermRowLock {
+	xtTableID				pr_tab_id;
+	xtWord4					pr_group;
+} XTPermRowLockRec, *XTPermRowLockPtr;
+
+#endif // XT_USE_LIST_BASED_ROW_LOCKS
+
+xtBool			xt_init_row_locks(XTRowLocksPtr rl);
+void			xt_exit_row_locks(XTRowLocksPtr rl);
+
+xtBool			xt_init_row_lock_list(XTRowLockListPtr rl);
+void			xt_exit_row_lock_list(XTRowLockListPtr rl);
+
+#define XT_NO_LOCK				0
+#define XT_WANT_LOCK			1
+#define XT_HAVE_LOCK			2
+#define XT_WAITING				3
+
+/*
+ * -----------------------------------------------------------------------
+ * RECURSIVE MUTEX (allows lockers to lock again)
+ */
+
+typedef struct XTRecursiveMutex {
+	struct XTThread				*rm_locker;
+	u_int						rm_lock_count;
+	xt_mutex_type				rm_mutex;
+
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			rm_lock_info;
+	const char				    *rm_name;
+#endif
+} XTRecursiveMutexRec, *XTRecursiveMutexPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_recursivemutex_init_with_autoname(a,b) xt_recursivemutex_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_recursivemutex_init(struct XTThread *self, XTRecursiveMutexPtr rm, const char *name);
+#else
+#define xt_recursivemutex_init_with_autoname(a,b) xt_recursivemutex_init(a,b)
+void xt_recursivemutex_init(struct XTThread *self, XTRecursiveMutexPtr rm);
+#endif
+void xt_recursivemutex_free(XTRecursiveMutexPtr rm);
+void xt_recursivemutex_lock(struct XTThread *self, XTRecursiveMutexPtr rm);
+void xt_recursivemutex_unlock(struct XTThread *self, XTRecursiveMutexPtr rm);
+
+typedef struct XTRecurRWLock {
+	struct XTThread				*rrw_locker;
+	u_int						rrw_lock_count;
+	xt_rwlock_type				rrw_lock;
+
+#ifdef XT_THREAD_LOCK_INFO
+	XTThreadLockInfoRec			rrw_lock_info;
+	const char				    *rrw_name;
+#endif
+} XTRecurRWLockRec, *XTRecurRWLockPtr;
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_recurrwlock_init_with_autoname(a,b) xt_recurrwlock_init(a,b,LOCKLIST_ARG_SUFFIX(b))
+void xt_recurrwlock_init(struct XTThread *self, XTRecurRWLockPtr rrw, const char *name);
+#else
+#define xt_recurrwlock_init_with_autoname(a,b) xt_recurrwlock_init(a,b)
+void xt_recurrwlock_init(struct XTThread *self, XTRecurRWLockPtr rrw);
+#endif
+void xt_recurrwlock_free(XTRecurRWLockPtr rrw);
+void xt_recurrwlock_xlock(struct XTThread *self, XTRecurRWLockPtr rrw);
+void xt_recurrwlock_slock(struct XTThread *self, XTRecurRWLockPtr rrw);
+void xt_recurrwlock_slock_ns(XTRecurRWLockPtr rrw);
+void xt_recurrwlock_unxlock(struct XTThread *self, XTRecurRWLockPtr rrw);
+void xt_recurrwlock_unslock(struct XTThread *self, XTRecurRWLockPtr rrw);
+void xt_recurrwlock_unslock_ns(XTRecurRWLockPtr rrw);
+
+#endif
diff --git a/storage/pbxt/src/locklist_xt.cc b/storage/pbxt/src/locklist_xt.cc
new file mode 100644
index 00000000000..9f79442560b
--- /dev/null
+++ b/storage/pbxt/src/locklist_xt.cc
@@ -0,0 +1,199 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2009-01-20	Vladimir Kolesnikov
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+#include "locklist_xt.h"
+
+#ifdef XT_THREAD_LOCK_INFO
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "trace_xt.h"
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinLock *lock)
+{
+	ptr->li_spin_lock = lock;
+	ptr->li_lock_type = XTThreadLockInfo::SPIN_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTRWMutex *lock)
+{
+	ptr->li_rw_mutex  = lock;
+	ptr->li_lock_type = XTThreadLockInfo::RW_MUTEX;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTFastLock *lock)
+{
+	ptr->li_fast_lock  = lock;
+	ptr->li_lock_type = XTThreadLockInfo::FAST_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_mutex_struct *lock)
+{
+	ptr->li_mutex     = lock;
+	ptr->li_lock_type = XTThreadLockInfo::MUTEX;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_rwlock_struct *lock)
+{
+	ptr->li_rwlock    = lock;
+	ptr->li_lock_type = XTThreadLockInfo::RW_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTXSMutexLock *lock)
+{
+	ptr->li_fast_rwlock = lock;
+	ptr->li_lock_type   = XTThreadLockInfo::FAST_RW_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinXSLock *lock)
+{
+	ptr->li_spin_rwlock = lock;
+	ptr->li_lock_type   = XTThreadLockInfo::SPIN_RW_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTAtomicRWLock *lock)
+{
+	ptr->li_atomic_rwlock = lock;
+	ptr->li_lock_type   = XTThreadLockInfo::ATOMIC_RW_LOCK;
+}
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSkewRWLock *lock)
+{
+	ptr->li_skew_rwlock = lock;
+	ptr->li_lock_type   = XTThreadLockInfo::SKEW_RW_LOCK;
+}
+
+void xt_thread_lock_info_free(XTThreadLockInfoPtr ptr)
+{
+	/* TODO: check to see if it's present in a thread's list */
+}
+
+void xt_thread_lock_info_add_owner (XTThreadLockInfoPtr ptr)
+{
+	XTThread *self = xt_get_self();
+
+	if (!self)
+		return;
+
+	if (self->st_thread_lock_count < XT_THREAD_LOCK_INFO_MAX_COUNT) {	/* When the list is full, 'ptr' is silently not recorded. */
+		self->st_thread_lock_list[self->st_thread_lock_count] = ptr;	/* Append at the end of the list. */
+		self->st_thread_lock_count++;
+	}
+}
+
+void xt_thread_lock_info_release_owner (XTThreadLockInfoPtr ptr)
+{
+	/* Drop the last-added entry for 'ptr' from the lock list of the thread returned by xt_get_self(). */
+	XTThread *self = xt_get_self();
+	if (!self)
+		return;
+
+	for (int i = self->st_thread_lock_count - 1; i >= 0; i--) {
+		if (self->st_thread_lock_list[i] == ptr) {
+			self->st_thread_lock_count--;
+			/* Close the gap. The two ranges overlap, so memmove() is required; memcpy() is undefined here. */
+			memmove(self->st_thread_lock_list + i, self->st_thread_lock_list + i + 1,
+				(self->st_thread_lock_count - i)*sizeof(XTThreadLockInfoPtr));
+			self->st_thread_lock_list[self->st_thread_lock_count] = NULL;
+			break;
+		}
+	}
+}
+
+void xt_trace_thread_locks(XTThread *self)
+{
+	/* Trace every entry of the lock list of 'self', in the order the entries were added. */
+	if (!self)
+		return;
+	xt_ttracef(self, "thread lock list (first in list added first): ");
+
+	if (!self->st_thread_lock_count) {
+		xt_trace(" <empty>\n");
+		return;
+	}
+
+	xt_trace("\n");
+
+	int count = min(self->st_thread_lock_count, XT_THREAD_LOCK_INFO_MAX_COUNT);
+
+	for(int i = 0; i < count; i++) {
+		/* Fallbacks keep the "%s" arguments valid if the lock type matches no case below. */
+		const char *lock_type = "?";
+		const char *lock_name = "?";
+
+		XTThreadLockInfoPtr li = self->st_thread_lock_list[i];
+
+		switch(li->li_lock_type) {
+			case XTThreadLockInfo::SPIN_LOCK:
+				lock_type = "XTSpinLock";
+				lock_name = li->li_spin_lock->spl_name;
+				break;
+			case XTThreadLockInfo::RW_MUTEX:
+				lock_type = "XTRWMutex";
+				lock_name = li->li_rw_mutex->xs_name;
+				break;
+			case XTThreadLockInfo::MUTEX:
+				lock_type = "xt_mutex_struct";
+#ifdef XT_WIN
+				lock_name = li->li_mutex->mt_name;
+#else
+				lock_name = li->li_mutex->mu_name;
+#endif
+				break;
+			case XTThreadLockInfo::RW_LOCK:
+				lock_type = "xt_rwlock_struct";
+				lock_name = li->li_rwlock->rw_name;
+				break;
+			case XTThreadLockInfo::FAST_LOCK:
+				lock_type = "XTFastLock";
+				lock_name = li->li_fast_lock->fal_name;
+				break;
+			case XTThreadLockInfo::FAST_RW_LOCK:
+				lock_type = "XTXSMutexLock";
+				lock_name = li->li_fast_rwlock->xsm_name;
+				break;
+			case XTThreadLockInfo::SPIN_RW_LOCK:
+				lock_type = "XTSpinRWLock";
+				lock_name = li->li_spin_rwlock->sxs_name;
+				break;
+			case XTThreadLockInfo::ATOMIC_RW_LOCK:
+				lock_type = "XTAtomicRWLock";
+				lock_name = li->li_atomic_rwlock->arw_name;
+				break;
+			case XTThreadLockInfo::SKEW_RW_LOCK:
+				lock_type = "XTSkewRWLock";
+				lock_name = li->li_skew_rwlock->srw_name;
+				break;
+		}
+		/* Number each line with the entry's own index, not with the total entry count. */
+		xt_ttracef(self, "  #lock#%d: type: %s name: %s \n", i, lock_type, lock_name);
+	}
+}
+
+#elif defined(__WIN__)
+
+// Remove linker warning 4221 about empty file
+namespace { char dummy; };
+
+#endif
+
diff --git a/storage/pbxt/src/locklist_xt.h b/storage/pbxt/src/locklist_xt.h
new file mode 100644
index 00000000000..17852b66d0b
--- /dev/null
+++ b/storage/pbxt/src/locklist_xt.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2009-01-20	Vladimir Kolesnikov
+ *
+ * H&G2JCtL
+ */
+
+#ifndef __xt_locklist_h__
+#define __xt_locklist_h__
+
+/*
+ * XT_THREAD_LOCK_INFO and DEBUG_LOCKING code must be updated to avoid calls to xt_get_self() as it can be called before hton->slot is
+ * assigned by MySQL which is used by xt_get_self()
+ */
+
+#ifdef DEBUG
+//#define XT_THREAD_LOCK_INFO
+#ifndef XT_WIN
+/* We need DEBUG_LOCKING in order to enable pthread function wrappers */
+//#define DEBUG_LOCKING
+#endif
+#endif
+
+#include "xt_defs.h"
+
+struct XTThread;
+struct XTSpinLock;
+struct XTRWMutex;
+struct xt_mutex_struct;
+struct xt_rwlock_struct;
+struct XTFastLock;
+struct XTXSMutexLock;
+struct XTSpinXSLock;
+struct XTAtomicRWLock;
+struct XTSkewRWLock;
+
+#ifdef XT_THREAD_LOCK_INFO
+
+#define XT_THREAD_LOCK_INFO_MAX_COUNT 50
+
+#ifdef XT_WIN
+#define LOCKLIST_ARG_SUFFIX(name) #name " in " __FUNCTION__ "() at " __FILE__ ":" QUOTE(__LINE__)
+#else
+#define LOCKLIST_ARG_SUFFIX(name) #name " in " QUOTE(__PRETTY_FUNCTION__) "() at " QUOTE(__FILE__) ":" QUOTE(__LINE__)
+#endif
+
+/*
+ * An instance of XTThreadLockInfo class keeps information about a lock kept by a thread.
+ * There's a list of XTThreadLockInfo instances per thread. An instance can be included
+ * into several thread lists in case of shared locks.
+ */
+typedef struct XTThreadLockInfo {
+
+	enum LockType { SPIN_LOCK, RW_MUTEX, MUTEX, RW_LOCK, FAST_LOCK, FAST_RW_LOCK, SPIN_RW_LOCK, ATOMIC_RW_LOCK, SKEW_RW_LOCK };
+
+	LockType		  li_lock_type;
+
+	union {
+		XTSpinLock       *li_spin_lock;	  // SPIN_LOCK
+		XTRWMutex        *li_rw_mutex;	  // RW_MUTEX
+		XTFastLock		 *li_fast_lock;   // FAST_LOCK
+		XTXSMutexLock	 *li_fast_rwlock; // FAST_RW_LOCK
+		XTSpinXSLock	 *li_spin_rwlock; // SPIN_RW_LOCK
+		XTAtomicRWLock	 *li_atomic_rwlock; // ATOMIC_RW_LOCK
+		xt_mutex_struct  *li_mutex;		  // MUTEX
+		xt_rwlock_struct *li_rwlock;	  // RW_LOCK
+		XTSkewRWLock	 *li_skew_rwlock;	// SKEW_RW_LOCK
+	};
+} 
+XTThreadLockInfoRec, *XTThreadLockInfoPtr;
+
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTRWMutex *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTFastLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTXSMutexLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSpinXSLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTAtomicRWLock *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_mutex_struct *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, xt_rwlock_struct *lock);
+void xt_thread_lock_info_init(XTThreadLockInfoPtr ptr, XTSkewRWLock *lock);
+void xt_thread_lock_info_free(XTThreadLockInfoPtr ptr);
+
+void xt_thread_lock_info_add_owner (XTThreadLockInfoPtr ptr);
+void xt_thread_lock_info_release_owner (XTThreadLockInfoPtr ptr);
+
+void xt_trace_thread_locks(XTThread *self);
+
+#endif // XT_THREAD_LOCK_INFO
+#endif // __xt_locklist_h__
diff --git a/storage/pbxt/src/memory_xt.cc b/storage/pbxt/src/memory_xt.cc
new file mode 100644
index 00000000000..b2f6c248b3c
--- /dev/null
+++ b/storage/pbxt/src/memory_xt.cc
@@ -0,0 +1,1141 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-04	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "strutil_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+#define RECORD_MM
+#endif
+
+#ifdef DEBUG
+
+#undef	xt_malloc
+#undef	xt_calloc
+#undef	xt_realloc
+#undef	xt_free
+#undef	xt_pfree
+
+#undef	xt_malloc_ns
+#undef	xt_calloc_ns
+#undef	xt_realloc_ns
+#undef	xt_free_ns
+
+void	*xt_malloc(XTThreadPtr self, size_t size);
+void	*xt_calloc(XTThreadPtr self, size_t size);
+xtBool	xt_realloc(XTThreadPtr self, void **ptr, size_t size);
+void	xt_free(XTThreadPtr self, void *ptr);
+void	xt_pfree(XTThreadPtr self, void **ptr);
+
+void	*xt_malloc_ns(size_t size);
+void	*xt_calloc_ns(size_t size);
+xtBool	xt_realloc_ns(void **ptr, size_t size);
+void	xt_free_ns(void *ptr);
+
+#define ADD_TOTAL_ALLOCS			4000
+
+#define SHIFT_RIGHT(ptr, n)			memmove(((char *) (ptr)) + sizeof(MissingMemoryRec), (ptr), (long) (n) * sizeof(MissingMemoryRec))
+#define SHIFT_LEFT(ptr, n)			memmove((ptr), ((char *) (ptr)) + sizeof(MissingMemoryRec), (long) (n) * sizeof(MissingMemoryRec))
+
+#define STACK_TRACE_DEPTH			4
+
+typedef struct MissingMemory {
+	void			*mm_ptr;
+	xtWord4			id;
+	xtWord2			line_nr;
+	xtWord2			trace_count;
+	c_char			*mm_file;
+	c_char			*mm_func[STACK_TRACE_DEPTH];
+} MissingMemoryRec, *MissingMemoryPtr;
+
+static MissingMemoryRec	*mm_addresses = NULL;
+static long				mm_nr_in_use = 0L;
+static long				mm_total_allocated = 0L;
+static xtWord4			mm_alloc_count = 0;
+static xt_mutex_type	mm_mutex;
+
+#ifdef RECORD_MM
+static long mm_find_pointer(void *ptr);
+#endif
+
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * STANDARD SYSTEM BASED MEMORY ALLOCATION
+ */
+
+xtPublic void *xt_malloc(XTThreadPtr self, size_t size)
+{
+	/* Allocate 'size' bytes with malloc(); on failure XT_ENOMEM is thrown and NULL is returned. */
+	void *mem = malloc(size);
+	if (mem == NULL) {
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	return mem;
+}
+
+xtPublic xtBool	xt_realloc(XTThreadPtr self, void **ptr, size_t size)
+{
+	/* Resize *ptr to 'size' bytes. On failure *ptr is left as it was, XT_ENOMEM is thrown and FAILED is returned. */
+	void *resized = realloc(*ptr, size);
+	if (resized == NULL) {
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return FAILED;
+	}
+	*ptr = resized;
+	return OK;
+}
+
+xtPublic void xt_free(XTThreadPtr XT_UNUSED(self), void *ptr)
+{
+	free(ptr);
+}
+
+xtPublic void *xt_calloc(XTThreadPtr self, size_t size)
+{
+	/* Allocate 'size' bytes through xt_malloc() and zero them; a NULL result is passed through. */
+	void *mem = xt_malloc(self, size);
+	if (mem != NULL)
+		memset(mem, 0, size);
+	return mem;
+}
+
+#undef	xt_pfree
+
+xtPublic void xt_pfree(XTThreadPtr self, void **ptr)
+{
+	/* Free *ptr if it is set; the caller's pointer is cleared before the memory is released. */
+	void *doomed = *ptr;
+	if (doomed == NULL)
+		return;
+	*ptr = NULL;
+	xt_free(self, doomed);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * SYSTEM MEMORY ALLOCATION WITHOUT A THREAD
+ */
+
+xtPublic void *xt_malloc_ns(size_t size)
+{
+	/* Allocate 'size' bytes with malloc(); a failure is reported through xt_register_errno() and yields NULL. */
+	void *mem = malloc(size);
+	if (mem == NULL) {
+		xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	return mem;
+}
+
+xtPublic void *xt_calloc_ns(size_t size)
+{
+	/* Allocate 'size' zero-filled bytes; a failure is reported through xt_register_errno() and yields NULL. */
+	void *mem = malloc(size);
+	if (mem == NULL) {
+		xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	memset(mem, 0, size);
+	return mem;
+}
+
+xtPublic xtBool	xt_realloc_ns(void **ptr, size_t size)
+{
+	/* Resize *ptr to 'size' bytes. On failure *ptr is left as it was and the result of xt_register_errno() is returned. */
+	void *resized = realloc(*ptr, size);
+	if (resized == NULL)
+		return xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+	*ptr = resized;
+	return OK;
+}
+
+xtPublic void xt_free_ns(void *ptr)
+{
+	free(ptr);
+}
+
+#ifdef DEBUG_MEMORY
+
+/*
+ * -----------------------------------------------------------------------
+ * MEMORY SEARCHING CODE
+ */
+
+#define MM_THROW_ASSERTION(str) mm_throw_assertion(self, __FUNC__, __FILE__, __LINE__, str)
+
+static void mm_throw_assertion(XTThreadPtr self, c_char *func, c_char *file, u_int line, c_char *str)
+{
+	printf("***** MM:FATAL %s\n", str);
+	xt_throw_assertion(self, func, file, line, str);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MEMORY SEARCHING CODE
+ */
+
+static int mm_debug_ik_inc;
+static int mm_debug_ik_dec;
+static int mm_debug_ik_no;
+
+/*
+ * Call this function where the missing memory
+ * is referenced.
+ */
+xtPublic void mm_trace_inc(XTThreadPtr self, XTMMTraceRefPtr tr)
+{
+	int i;
+
+#ifdef RECORD_MM
+	if (xt_lock_mutex(self, &mm_mutex)) {
+		long mm;
+
+		mm = mm_find_pointer(tr);
+		if (mm >= 0)
+			mm_addresses[mm].trace_count = 1;	/* Set trace_count on the table entry found for 'tr'. */
+		xt_unlock_mutex(self, &mm_mutex);
+	}
+#endif
+	mm_debug_ik_inc++;	/* Running total, printed by mm_debug_trace_sum(). */
+	if (tr->mm_pos < XT_MM_STACK_TRACE-1) {
+		tr->mm_trace[tr->mm_pos++] = self->t_name[0] == 'S' ? XT_MM_TRACE_SW_INC : XT_MM_TRACE_INC;	/* Marker; SW variant when the thread name starts with 'S'. */
+		for (i=1; i<=XT_MM_TRACE_DEPTH; i++) {
+			if (self->t_call_top-i < 0)
+				break;	/* No further call-stack entries. */
+			if (tr->mm_pos < XT_MM_STACK_TRACE-1) {
+				tr->mm_line[tr->mm_pos] = self->t_call_stack[self->t_call_top-i].cs_line;
+				tr->mm_trace[tr->mm_pos++] = self->t_call_stack[self->t_call_top-i].cs_func;
+			}
+			else if (tr->mm_pos < XT_MM_STACK_TRACE)
+				tr->mm_trace[tr->mm_pos++] = XT_MM_TRACE_ERROR;	/* Last slot: record that the trace buffer ran out. */
+		}
+	}
+	else if (tr->mm_pos < XT_MM_STACK_TRACE)
+		tr->mm_trace[tr->mm_pos++] = XT_MM_TRACE_ERROR;	/* Last slot: record that the trace buffer ran out. */
+}
+
+xtPublic void mm_trace_dec(XTThreadPtr self, XTMMTraceRefPtr tr)
+{
+	int i;
+
+#ifdef RECORD_MM
+	if (xt_lock_mutex(self, &mm_mutex)) {
+		long mm;
+
+		mm = mm_find_pointer(tr);
+		if (mm >= 0)
+			mm_addresses[mm].trace_count = 1;	/* Set trace_count on the table entry found for 'tr'. */
+		xt_unlock_mutex(self, &mm_mutex);
+	}
+#endif
+	mm_debug_ik_dec++;	/* Running total, printed by mm_debug_trace_sum(). */
+	if (tr->mm_pos < XT_MM_STACK_TRACE-1) {
+		tr->mm_trace[tr->mm_pos++] = self->t_name[0] == 'S' ? XT_MM_TRACE_SW_DEC : XT_MM_TRACE_DEC;	/* Marker; SW variant when the thread name starts with 'S'. */
+		for (i=1; i<=XT_MM_TRACE_DEPTH; i++) {
+			if (self->t_call_top-i < 0)
+				break;	/* No further call-stack entries. */
+			if (tr->mm_pos < XT_MM_STACK_TRACE-1) {
+				tr->mm_line[tr->mm_pos] = self->t_call_stack[self->t_call_top-i].cs_line;
+				tr->mm_trace[tr->mm_pos++] = self->t_call_stack[self->t_call_top-i].cs_func;
+			}
+			else if (tr->mm_pos < XT_MM_STACK_TRACE)
+				tr->mm_trace[tr->mm_pos++] = XT_MM_TRACE_ERROR;	/* Last slot: record that the trace buffer ran out. */
+		}
+	}
+	else if (tr->mm_pos < XT_MM_STACK_TRACE)
+		tr->mm_trace[tr->mm_pos++] = XT_MM_TRACE_ERROR;	/* Last slot: record that the trace buffer ran out. */
+}
+
+xtPublic void mm_trace_init(XTThreadPtr self, XTMMTraceRefPtr tr)
+{
+	mm_debug_ik_no++;
+	tr->mm_id = (u_int) mm_debug_ik_no;
+	tr->mm_pos = 0;
+	mm_trace_inc(self, tr);
+}
+
+xtPublic void mm_trace_print(XTMMTraceRefPtr tr)
+{
+	int i, cnt = 0;
+	/* Print the trace in 'tr': each marker starts a new line and shows the running count, other entries print as "func(line)". */
+	for (i=0; i<tr->mm_pos; i++) {
+		if (tr->mm_trace[i] == XT_MM_TRACE_INC) {
+			if (i > 0)
+				printf("\n");
+			cnt++;
+			printf("INC (%d) ", cnt);
+		}
+		else if (tr->mm_trace[i] == XT_MM_TRACE_SW_INC) {
+			if (i > 0)
+				printf("\n");
+			cnt++;	/* An increment, handled like XT_MM_TRACE_INC: count first, then print. */
+			printf("SW-INC (%d) ", cnt);
+		}
+		else if (tr->mm_trace[i] == XT_MM_TRACE_DEC) {
+			if (i > 0)
+				printf("\n");
+			printf("DEC (%d) ", cnt);
+			cnt--;
+		}
+		else if (tr->mm_trace[i] == XT_MM_TRACE_SW_DEC) {
+			if (i > 0)
+				printf("\n");
+			printf("SW-DEC (%d) ", cnt);
+			cnt--;
+		}
+		else if (tr->mm_trace[i] == XT_MM_TRACE_ERROR) {
+			if (i > 0)
+				printf("\n");
+			printf("ERROR: Space out");
+		}
+		else
+			printf("%s(%d) ", tr->mm_trace[i], (int) tr->mm_line[i]);
+	}
+	printf("\n");
+}
+
+/* Call this function on exit, when you know the memory is missing. */
+static void mm_debug_trace_count(XTMMTraceRefPtr tr)
+{
+	printf("MM Trace ID: %d\n", tr->mm_id);
+	mm_trace_print(tr);
+}
+
+/* Print the sum of allocations, etc. */
+static void mm_debug_trace_sum(void)
+{
+	if (mm_debug_ik_no) {
+		printf("MM Trace INC: %d\n", mm_debug_ik_inc);
+		printf("MM Trace DEC: %d\n", mm_debug_ik_dec);
+		printf("MM Trace ALL: %d\n", mm_debug_ik_no);
+	}
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * DEBUG MEMORY ALLOCATION AND HEAP CHECKING
+ */
+
+#ifdef RECORD_MM
+static long mm_find_pointer(void *ptr)
+{
+	register long	i, n, guess;
+
+	i = 0;
+	n = mm_nr_in_use;
+	while (i < n) {
+		guess = (i + n - 1) >> 1;
+		if (ptr == mm_addresses[guess].mm_ptr)
+			return(guess);
+		if (ptr < mm_addresses[guess].mm_ptr)
+			n = guess;
+		else
+			i = guess + 1;
+	}
+	return(-1);
+}
+
+static long mm_add_pointer(void *ptr, u_int XT_UNUSED(id))
+{
+	long	i, n, guess;	/* long, like mm_nr_in_use and the return type */
+	/* Insert 'ptr' into the sorted table. Returns its index, or -1 if the table cannot be enlarged. */
+	if (mm_nr_in_use == mm_total_allocated) {
+		/* Not enough space, add more: */
+		MissingMemoryRec *new_addresses;
+
+		new_addresses = (MissingMemoryRec *) xt_calloc_ns(sizeof(MissingMemoryRec) * (mm_total_allocated + ADD_TOTAL_ALLOCS));
+		if (!new_addresses)
+			return(-1);
+
+		if (mm_addresses) {
+			memcpy(new_addresses, mm_addresses, sizeof(MissingMemoryRec) * mm_total_allocated);
+			free(mm_addresses);
+		}
+
+		mm_addresses = new_addresses;
+		mm_total_allocated += ADD_TOTAL_ALLOCS;
+	}
+	/* Binary search for the insertion point: the first slot whose pointer is greater than 'ptr'. */
+	i = 0;
+	n = mm_nr_in_use;
+	while (i < n) {
+		guess = (i + n - 1) >> 1;
+		if (ptr < mm_addresses[guess].mm_ptr)
+			n = guess;
+		else
+			i = guess + 1;
+	}
+	/* Open a gap at slot 'i' by moving the tail of the table up by one record. */
+	SHIFT_RIGHT(&mm_addresses[i], mm_nr_in_use - i);
+	mm_nr_in_use++;
+	mm_addresses[i].mm_ptr = ptr;
+	return(i);
+}
+
+xtPublic char *mm_watch_point = 0;
+
+static long mm_remove_pointer(void *ptr)
+{
+	long	i, n, guess;	/* long, like mm_nr_in_use and the return type */
+	/* Remove 'ptr' from the sorted table. Returns the index it occupied, or -1 if it is not in the table. */
+	if (mm_watch_point == ptr)
+		printf("Hit watch point!\n");
+
+	i = 0;
+	n = mm_nr_in_use;
+	while (i < n) {
+		guess = (i + n - 1) >> 1;
+		if (ptr == mm_addresses[guess].mm_ptr)
+			goto remove;
+		if (ptr < mm_addresses[guess].mm_ptr)
+			n = guess;
+		else
+			i = guess + 1;
+	}
+	return(-1);
+
+	remove:
+	/* Decrease the number of sets, and shift left: */
+	mm_nr_in_use--;
+	SHIFT_LEFT(&mm_addresses[guess], mm_nr_in_use - guess);
+	return(guess);
+}
+
+static void mm_add_core_ptr(XTThreadPtr self, void *ptr, u_int id, u_int line, c_char *file_name)
+{
+	/* Enter 'ptr' into the table of recorded allocations and note where it came from. */
+	MissingMemoryPtr	rec;
+	long				mm = mm_add_pointer(ptr, id);
+	if (mm < 0) {
+		MM_THROW_ASSERTION("MM ERROR: Cannot allocate table big enough!");
+		return;
+	}
+
+	/*
+	 * Fill in the table entry:
+	 *   id       the caller's 'id' when it is non-zero, otherwise the
+	 *            current value of mm_alloc_count, which is then incremented;
+	 *   line_nr  'line', cast to ushort;
+	 *   mm_file  'file_name', or "?" when no name was passed in.
+	 */
+	rec = &mm_addresses[mm];
+	rec->id = id ? id : mm_alloc_count++;
+	rec->mm_ptr = ptr;
+	rec->line_nr = (ushort) line;
+	if (file_name)
+		rec->mm_file = file_name;
+	else
+		rec->mm_file = "?";
+
+	/*
+	 * Copy the function names of call-stack entries t_call_top-1,
+	 * t_call_top-2, ... into mm_func[0], mm_func[1], ... for at most
+	 * STACK_TRACE_DEPTH entries.
+	 * Slots with no matching stack entry are set to NULL, as is every
+	 * slot when 'self' is NULL.
+	 */
+	for (int slot = 0; slot < STACK_TRACE_DEPTH; slot++) {
+		int back = slot + 1;	/* distance below t_call_top */
+
+		if (self && self->t_call_top - back >= 0)
+			rec->mm_func[slot] = self->t_call_stack[self->t_call_top - back].cs_func;
+		else
+			rec->mm_func[slot] = NULL;
+	}
+}
+
+static void mm_remove_core_ptr(void *ptr)
+{
+	XTThreadPtr	self = NULL;
+	long		mm;
+
+	mm = mm_remove_pointer(ptr);
+	if (mm < 0) {
+		MM_THROW_ASSERTION("Pointer not allocated");
+		return;
+	}
+}
+
+static void mm_throw_assertion(MissingMemoryPtr mm_ptr, void *p, c_char *message);
+
+static long mm_find_core_ptr(void *ptr)
+{
+	long mm;
+
+	mm = mm_find_pointer(ptr);
+	if (mm < 0)
+		mm_throw_assertion(NULL, ptr, "Pointer not allocated");
+	return(mm);
+}
+
+static void mm_replace_core_ptr(long i, void *ptr)
+{
+	XTThreadPtr			self = NULL;
+	MissingMemoryRec	tmp = mm_addresses[i];
+	long				mm;
+	/* Re-key record 'i' to 'ptr'. Use the saved copy: removal shifts the table, so slot 'i' no longer holds this record. */
+	mm_remove_pointer(tmp.mm_ptr);
+	mm = mm_add_pointer(ptr, tmp.id);
+	if (mm < 0) {
+		MM_THROW_ASSERTION("Cannot allocate table big enough!");
+		return;
+	}
+	mm_addresses[mm] = tmp;
+	mm_addresses[mm].mm_ptr = ptr;
+}
+#endif
+
+static void mm_throw_assertion(MissingMemoryPtr mm_ptr, void *p, c_char *message)
+{
+	XTThreadPtr	self = NULL;
+	char		str[200];
+
+	if (mm_ptr) {
+		sprintf(str, "MM: %08lX (#%ld) %s:%d %s",
+					   (unsigned long) mm_ptr->mm_ptr,
+					   (long) mm_ptr->id,
+					   xt_last_name_of_path(mm_ptr->mm_file),
+					   (int) mm_ptr->line_nr,
+					   message);
+	}
+	else
+		sprintf(str, "MM: %08lX %s", (unsigned long) p, message);
+	MM_THROW_ASSERTION(str);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MISSING MEMORY PUBLIC ROUTINES
+ */
+
+#define MEM_DEBUG_HDR_SIZE		offsetof(MemoryDebugRec, data)
+#define MEM_TRAILER_SIZE		2
+#define MEM_HEADER				0x01010101
+#define MEM_FREED				0x03030303
+#define MEM_TRAILER_BYTE		0x02
+#define MEM_FREED_BYTE			0x03
+
+typedef struct MemoryDebug {
+	xtWord4		check;
+	xtWord4		size;
+	char		data[200];
+} MemoryDebugRec, *MemoryDebugPtr;
+
+static size_t mm_checkmem(XTThreadPtr self, MissingMemoryPtr mm_ptr, void *p, xtBool freeme)
+{
+	unsigned char	*ptr	= (unsigned char *) p - MEM_DEBUG_HDR_SIZE;
+	MemoryDebugPtr	debug_ptr = (MemoryDebugPtr) ptr;
+	size_t			size;     /* Only read once the header has been validated. */
+	long			a_value;  /* Added to simplify debugging. */
+	/* Verify the header and trailer of block 'p'; if 'freeme', poison and free it. Returns the user size, or 0 on error. */
+	if (!ASSERT(p))
+		return(0);
+	if (!ASSERT(((long) p & 1L) == 0))
+		return(0);
+	a_value = MEM_FREED;
+	if (debug_ptr->check == MEM_FREED) {
+		mm_throw_assertion(mm_ptr, p, "Pointer already freed 'debug_ptr->check != MEM_FREED'");
+		return(0);
+	}
+	a_value = MEM_HEADER;
+	if (debug_ptr->check != MEM_HEADER) {
+		mm_throw_assertion(mm_ptr, p, "Header not valid 'debug_ptr->check != MEM_HEADER'");
+		return(0);
+	}
+	size = debug_ptr->size;	/* Deferred to here so that a NULL 'p' hits the assertion above instead of being dereferenced. */
+	a_value = MEM_TRAILER_BYTE;
+	if (!(*((unsigned char *) ptr + size + MEM_DEBUG_HDR_SIZE) == MEM_TRAILER_BYTE &&
+			*((unsigned char *) ptr + size + MEM_DEBUG_HDR_SIZE + 1L) == MEM_TRAILER_BYTE)) {
+		mm_throw_assertion(mm_ptr, p, "Trailer overwritten");
+		return(0);
+	}
+	if (freeme) {
+		debug_ptr->check = MEM_FREED;
+		*((unsigned char *) ptr + size + MEM_DEBUG_HDR_SIZE) = MEM_FREED_BYTE;
+		*((unsigned char *) ptr + size + MEM_DEBUG_HDR_SIZE + 1L) = MEM_FREED_BYTE;
+
+		memset(((unsigned char *) ptr) + MEM_DEBUG_HDR_SIZE, 0xF5, size);
+		xt_free(self, ptr);
+	}
+
+	return size;
+}
+
+xtBool xt_mm_scan_core(void)
+{
+	long mm;
+
+	if (!mm_addresses)
+		return TRUE;
+
+	if (!xt_lock_mutex(NULL, &mm_mutex))
+		return TRUE;
+
+	for (mm=0; mm<mm_nr_in_use; mm++)	{
+		mm_checkmem(NULL, &mm_addresses[mm], mm_addresses[mm].mm_ptr, FALSE);
+	}
+	
+	xt_unlock_mutex(NULL, &mm_mutex);
+	return TRUE;
+}
+
+void xt_mm_memmove(void *block, void *dest, void *source, size_t size)
+{
+	if (block) {
+		MemoryDebugPtr	debug_ptr = (MemoryDebugPtr) ((char *) block - MEM_DEBUG_HDR_SIZE);
+
+#ifdef RECORD_MM
+		if (xt_lock_mutex(NULL, &mm_mutex)) {
+			mm_find_core_ptr(block);
+			xt_unlock_mutex(NULL, &mm_mutex);
+		}
+#endif
+		mm_checkmem(NULL, NULL, block, FALSE);
+
+		if (dest < block || (char *) dest > (char *) block + debug_ptr->size)
+			mm_throw_assertion(NULL, block, "Destination not in block");
+		if ((char *) dest + size > (char *) block + debug_ptr->size)
+			mm_throw_assertion(NULL, block, "Copy will overwrite memory");
+	}
+
+	memmove(dest, source, size);
+}
+
+void xt_mm_memcpy(void *block, void *dest, void *source, size_t size)
+{
+	if (block != NULL) {
+		MemoryDebugPtr	head = (MemoryDebugPtr) ((char *) block - MEM_DEBUG_HDR_SIZE);
+		char			*first = (char *) block, *target = (char *) dest;
+#ifdef RECORD_MM
+		if (xt_lock_mutex(NULL, &mm_mutex)) {
+			mm_find_core_ptr(block);
+			xt_unlock_mutex(NULL, &mm_mutex);
+		}
+#endif
+		mm_checkmem(NULL, NULL, block, FALSE);
+		/* Assert that the copy starts inside the block and ends within it. */
+		if (target < first || target > first + head->size)
+			mm_throw_assertion(NULL, block, "Destination not in block");
+		if (target + size > first + head->size)
+			mm_throw_assertion(NULL, block, "Copy will overwrite memory");
+	}
+	/* Without a block the call is an unchecked memcpy(). */
+	memcpy(dest, source, size);
+}
+
+void xt_mm_memset(void *block, void *dest, int value, size_t size)
+{
+	if (block != NULL) {
+		MemoryDebugPtr	info = (MemoryDebugPtr) ((char *) block - MEM_DEBUG_HDR_SIZE);
+		char			*lo = (char *) block, *at = (char *) dest;
+#ifdef RECORD_MM
+		if (xt_lock_mutex(NULL, &mm_mutex)) {
+			mm_find_core_ptr(block);
+			xt_unlock_mutex(NULL, &mm_mutex);
+		}
+#endif
+		mm_checkmem(NULL, NULL, block, FALSE);
+		/* The filled range [dest, dest + size) has to stay within the block. */
+		if (at < lo || at > lo + info->size)
+			mm_throw_assertion(NULL, block, "Destination not in block");
+		if (at + size > lo + info->size)
+			mm_throw_assertion(NULL, block, "Copy will overwrite memory");
+	}
+	/* A NULL block skips all checking. */
+	memset(dest, value, size);
+}
+
+void *xt_mm_malloc(XTThreadPtr self, size_t size, u_int line, c_char *file)
+{
+	size_t			total = size + MEM_DEBUG_HDR_SIZE + MEM_TRAILER_SIZE;
+	unsigned char	*raw, *user, *tail;
+
+	if (size > (600*1024*1024))
+		mm_throw_assertion(NULL, NULL, "Very large block allocated - meaybe error");
+	if (!(raw = (unsigned char *) xt_malloc(self, total)))
+		return NULL;
+	user = raw + MEM_DEBUG_HDR_SIZE;
+	tail = user + size;
+	/* Pre-fill with 0x55, then stamp the header and the two trailer guard bytes. */
+	memset(raw, 0x55, total);
+	((MemoryDebugPtr) raw)->check = MEM_HEADER;
+	((MemoryDebugPtr) raw)->size  = size;
+	tail[0] = MEM_TRAILER_BYTE;
+	tail[1] = MEM_TRAILER_BYTE;
+
+	(void) line;
+	(void) file;
+#ifdef RECORD_MM
+	xt_lock_mutex(self, &mm_mutex);
+	mm_add_core_ptr(self, user, 0, line, file);
+	xt_unlock_mutex(self, &mm_mutex);
+#endif
+	return user;		/* callers only ever see the area after the header */
+}
+
+void *xt_mm_calloc(XTThreadPtr self, size_t size, u_int line, c_char *file)
+{
+	unsigned char	*raw, *user;
+
+	if (size > (500*1024*1024))
+		mm_throw_assertion(NULL, NULL, "Very large block allocated - meaybe error");
+	raw = (unsigned char *) xt_calloc(self, size + MEM_DEBUG_HDR_SIZE + MEM_TRAILER_SIZE);
+	if (raw == NULL)
+		return NULL;
+	user = raw + MEM_DEBUG_HDR_SIZE;
+	/* No fill pattern is written here; only the header and trailer guards are stamped. */
+	((MemoryDebugPtr) raw)->check = MEM_HEADER;
+	((MemoryDebugPtr) raw)->size  = size;
+	user[size] = MEM_TRAILER_BYTE;
+	user[size + 1L] = MEM_TRAILER_BYTE;
+
+	(void) line;
+	(void) file;
+#ifdef RECORD_MM
+	xt_lock_mutex(self, &mm_mutex);
+	mm_add_core_ptr(self, user, 0, line, file);
+	xt_unlock_mutex(self, &mm_mutex);
+#endif
+	return user;		/* callers only ever see the area after the header */
+}
+
+xtBool xt_mm_sys_realloc(XTThreadPtr self, void **ptr, size_t newsize, u_int line, c_char *file)
+{	/* Forwards all arguments unchanged to xt_mm_realloc() and returns its result. */
+	return xt_mm_realloc(self, ptr, newsize, line, file);
+}
+
+xtBool xt_mm_realloc(XTThreadPtr self, void **ptr, size_t newsize, u_int line, c_char *file)
+{
+	unsigned char	*oldptr = (unsigned char *) *ptr;
+	size_t			size;
+#ifdef RECORD_MM
+	long			mm;
+#endif
+	unsigned char	*pnew;
+	/* Resize the debug block *ptr to newsize bytes; a NULL *ptr acts like a malloc. */
+	if (!oldptr) {
+		*ptr = xt_mm_malloc(self, newsize, line, file);
+		return *ptr ? TRUE : FALSE;
+	}
+	/* Every FAILED return below leaves *ptr pointing at the untouched old block. */
+#ifdef RECORD_MM
+	xt_lock_mutex(self, &mm_mutex);
+	if ((mm = mm_find_core_ptr(oldptr)) < 0) {
+		xt_unlock_mutex(self, &mm_mutex);
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return FAILED;
+	}
+	xt_unlock_mutex(self, &mm_mutex);
+#endif
+
+	oldptr = oldptr - MEM_DEBUG_HDR_SIZE;
+	size = ((MemoryDebugPtr) oldptr)->size;
+
+	ASSERT(((MemoryDebugPtr) oldptr)->check == MEM_HEADER);
+	ASSERT(*((unsigned char *) oldptr + size + MEM_DEBUG_HDR_SIZE) == MEM_TRAILER_BYTE && 
+			*((unsigned char *) oldptr + size + MEM_DEBUG_HDR_SIZE + 1L) == MEM_TRAILER_BYTE);
+
+	/* Realloc always moves! */
+	pnew = (unsigned char *) xt_malloc(self, newsize + MEM_DEBUG_HDR_SIZE + MEM_TRAILER_SIZE);
+	if (!pnew) {
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return FAILED;
+	}
+
+	if (newsize > size) {
+		memcpy(((MemoryDebugPtr) pnew)->data, ((MemoryDebugPtr) oldptr)->data, size);
+		memset(((MemoryDebugPtr) pnew)->data + size, 0x55, newsize - size);
+	}
+	else
+		memcpy(((MemoryDebugPtr) pnew)->data, ((MemoryDebugPtr) oldptr)->data, newsize);
+
+	((MemoryDebugPtr) pnew)->check = MEM_HEADER;
+	((MemoryDebugPtr) pnew)->size = newsize;
+	*(pnew + newsize + MEM_DEBUG_HDR_SIZE) = MEM_TRAILER_BYTE;
+	*(pnew + newsize + MEM_DEBUG_HDR_SIZE + 1L)	= MEM_TRAILER_BYTE;
+
+#ifdef RECORD_MM
+	xt_lock_mutex(self, &mm_mutex);
+	if ((mm = mm_find_core_ptr(oldptr + MEM_DEBUG_HDR_SIZE)) < 0) {
+		xt_unlock_mutex(self, &mm_mutex);
+		xt_free(self, pnew);	/* release the replacement before reporting, or it leaks */
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return FAILED;
+	}
+	mm_replace_core_ptr(mm, pnew + MEM_DEBUG_HDR_SIZE);
+	xt_unlock_mutex(self, &mm_mutex);
+#endif
+
+	memset(oldptr, 0x55, size + MEM_DEBUG_HDR_SIZE + MEM_TRAILER_SIZE);
+	xt_free(self, oldptr);
+	*ptr = pnew + MEM_DEBUG_HDR_SIZE;
+	return OK;
+}
+
+void xt_mm_free(XTThreadPtr self, void *ptr)
+{
+#ifdef RECORD_MM
+	if (xt_lock_mutex(self, &mm_mutex)) {	/* drop the table entry first, if the lock is obtained */
+		mm_remove_core_ptr(ptr);
+		xt_unlock_mutex(self, &mm_mutex);
+	}
+#endif
+	mm_checkmem(self, NULL, ptr, TRUE);	/* TRUE: validate the guards, then release the block */
+}
+
+void xt_mm_pfree(XTThreadPtr self, void **ptr)
+{
+	void *victim = *ptr;
+	/* The caller's pointer is cleared before the block is freed; NULL is ignored. */
+	if (victim != NULL) {
+		*ptr = NULL;
+		xt_mm_free(self, victim);
+	}
+}
+
+size_t xt_mm_malloc_size(XTThreadPtr self, void *ptr)
+{
+	/* mm_checkmem() validates the block (FALSE: without freeing it) and
+	 * returns its size, or 0 when one of its checks fails.
+	 */
+#ifdef RECORD_MM
+	if (xt_lock_mutex(self, &mm_mutex)) {
+		mm_find_core_ptr(ptr);
+		xt_unlock_mutex(self, &mm_mutex);
+	}
+#endif
+	return mm_checkmem(self, NULL, ptr, FALSE);
+}
+
+void xt_mm_check_ptr(XTThreadPtr self, void *ptr)
+{
+	mm_checkmem(self, NULL, ptr, FALSE);	/* FALSE: validate only, the block is not freed */
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * INIT/EXIT MEMORY
+ */
+
+xtPublic xtBool xt_init_memory(void)
+{
+#ifdef DEBUG_MEMORY
+	XTThreadPtr	self = NULL;
+
+	if (!xt_init_mutex_with_autoname(NULL, &mm_mutex))
+		return FALSE;
+
+	/* calloc() hands back the tracking table already zero-filled. */
+	mm_addresses = (MissingMemoryRec *) calloc(ADD_TOTAL_ALLOCS, sizeof(MissingMemoryRec));
+	if (mm_addresses == NULL) {
+		MM_THROW_ASSERTION("MM ERROR: Insuffient memory to allocate MM table");
+		xt_free_mutex(&mm_mutex);
+		return FALSE;
+	}
+
+	mm_total_allocated = ADD_TOTAL_ALLOCS;
+	mm_nr_in_use = 0L;
+	mm_alloc_count = 0L;
+#endif
+	return TRUE;
+}
+
+xtPublic void debug_ik_count(void *value);
+xtPublic void debug_ik_sum(void);
+
+xtPublic void xt_exit_memory(void)
+{
+#ifdef DEBUG_MEMORY
+	long	mm;
+	int		i;
+	/* Log every block still recorded as allocated, then release the table. */
+	if (!mm_addresses)
+		return;
+
+	xt_lock_mutex(NULL, &mm_mutex);
+	for (mm=0; mm<mm_nr_in_use; mm++) {
+		MissingMemoryPtr mm_ptr = &mm_addresses[mm];
+
+		xt_logf(XT_NS_CONTEXT, XT_LOG_FATAL, "MM: %p (#%ld) %s:%d Not freed\n",
+			mm_ptr->mm_ptr,
+			(long) mm_ptr->id,
+			xt_last_name_of_path(mm_ptr->mm_file),
+			(int) mm_ptr->line_nr);
+		for (i=0; i<STACK_TRACE_DEPTH; i++) {
+			if (mm_ptr->mm_func[i])
+				xt_logf(XT_NS_CONTEXT, XT_LOG_FATAL, "MM: %s\n", mm_ptr->mm_func[i]);
+		}
+		/*
+		 * Assumes we place our tracing function in the first
+		 * position!!
+		 */
+		if (mm_ptr->trace_count)
+			mm_debug_trace_count((XTMMTraceRefPtr) mm_ptr->mm_ptr);
+	}
+	mm_debug_trace_sum();
+	free(mm_addresses);
+	mm_addresses = NULL;
+	mm_nr_in_use = 0L;
+	mm_total_allocated = 0L;
+	mm_alloc_count = 0L;
+	xt_unlock_mutex(NULL, &mm_mutex);
+	/* The mutex is destroyed only after it has been released. */
+	xt_free_mutex(&mm_mutex);
+#endif
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MEMORY ALLOCATION UTILITIES
+ */
+
+#ifdef DEBUG_MEMORY
+char	*xt_mm_dup_string(XTThreadPtr self, c_char *str, u_int line, c_char *file)
+#else
+char	*xt_dup_string(XTThreadPtr self, c_char *str)
+#endif
+{
+	size_t	bytes;
+	char	*copy;
+	/* A NULL input is passed straight through as NULL. */
+	if (str == NULL)
+		return NULL;
+	bytes = strlen(str) + 1;		/* includes the terminating NUL */
+#ifdef DEBUG_MEMORY
+	copy = (char *) xt_mm_malloc(self, bytes, line, file);
+#else
+	copy = (char *) xt_malloc(self, bytes);
+#endif
+	if (copy != NULL)
+		memcpy(copy, str, bytes);
+	return copy;
+}
+
+xtPublic char *xt_long_to_str(XTThreadPtr self, long v)
+{
+	char str[50];	/* ample room for the decimal form of any long */
+	/* v is signed, so "%ld" is required: "%lu" printed negatives as huge values. */
+	sprintf(str, "%ld", v);
+	return xt_dup_string(self, str);	/* result is a copy made by xt_dup_string() */
+}
+
+char *xt_dup_nstr(XTThreadPtr self, c_char *str, int start, size_t len)
+{
+	char *copy = (char *) xt_malloc(self, len + 1);
+	/* Copies len bytes starting at str[start] and NUL-terminates the result. */
+	if (copy == NULL)
+		return NULL;
+	memcpy(copy, str + start, len);
+	copy[len] = 0;
+	return copy;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * LIGHT WEIGHT CHECK FUNCTIONS
+ * Timing related memory management problems may not like the memset
+ * or other heavy checking. Try this...
+ */
+ 
+#ifdef LIGHT_WEIGHT_CHECKS
+xtPublic void *xt_malloc(XTThreadPtr self, size_t size)
+{
+	/* Layout: [4-byte size][size bytes of user data][4-byte 0x7E7EFEFE guard]. */
+	char *raw = (char *) malloc(size+8);
+	if (raw == NULL) {
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	((xtWord4 *) raw)[0] = size;
+	*((xtWord4 *) (raw + 4 + size)) = 0x7E7EFEFE;
+	return raw + 4;
+}
+
+xtPublic void xt_check_ptr(void *ptr)
+{
+	char	*raw = (char *) ptr - 4;
+	xtWord4	stored = *((xtWord4 *) raw);
+
+	/* A freed marker in the size slot, or a damaged trailing guard, is fatal:
+	 * dump a trace and then crash on purpose by writing through NULL.
+	 */
+	if (stored == 0xDEADBEAF || *((xtWord4 *) (raw + stored + 4)) != 0x7E7EFEFE) {
+		char *dummy = NULL;
+
+		xt_dump_trace();
+		*dummy = 40;
+	}
+}
+
+xtPublic xtBool	xt_realloc(XTThreadPtr self, void **ptr, size_t size)
+{
+	char *old_ptr;
+	char *new_ptr;
+	/* Resize a guarded block through realloc(); a NULL *ptr simply allocates. */
+	if ((old_ptr = (char *) *ptr)) {
+		void check_for_file(char *my_ptr, xtWord4 len);
+		/* The old block's guards are verified before it is handed to realloc(). */
+		xt_check_ptr(old_ptr);
+		check_for_file((char *) old_ptr, *((xtWord4 *) (old_ptr - 4)));
+		if (!(new_ptr = (char *) realloc(old_ptr - 4, size+8))) {
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			return FAILED;
+		}
+		*((xtWord4 *) new_ptr) = size;
+		*((xtWord4 *) (new_ptr + size + 4)) = 0x7E7EFEFE;
+		*ptr = new_ptr+4;
+		return OK;
+	}
+	*ptr = xt_malloc(self, size);
+	return *ptr != NULL;
+}
+
+xtPublic void xt_free(XTThreadPtr XT_UNUSED(self), void *ptr)
+{
+	char	*old_ptr;
+	xtWord4 size;
+	void	check_for_file(char *my_ptr, xtWord4 len);
+	/* Block layout: [4-byte size][user data][4-byte 0x7E7EFEFE guard]. */
+	old_ptr = (char *) ptr - 4;
+	size = *((xtWord4 *) old_ptr);
+	if (size == 0xDEADBEAF || *((xtWord4 *) (old_ptr + size + 4)) != 0x7E7EFEFE) {
+		char *dummy = NULL;
+
+		xt_dump_trace();
+		*dummy = 41;		/* deliberate crash: block already freed or guard damaged */
+	}
+	check_for_file((char *) ptr, size);
+	*((xtWord4 *) old_ptr) = 0xDEADBEAF;
+	/* Scribble the data tail only if it cannot overlap the marker above. */
+	if (size >= 4)
+		*((xtWord4 *) (old_ptr + size)) = 0xEFEFDFDF;
+	*((xtWord4 *) (old_ptr + size + 4)) = 0x1F1F1F1F;
+	free(old_ptr);
+}
+
+xtPublic void *xt_calloc(XTThreadPtr self, size_t size)
+{
+	void *block = xt_malloc(self, size);
+	/* Zero the user area on success; a failed xt_malloc() is passed on as NULL. */
+	if (block != NULL)
+		memset(block, 0, size);
+	return block;
+}
+
+#undef	xt_pfree
+
+xtPublic void xt_pfree(XTThreadPtr self, void **ptr)
+{
+	void *victim = *ptr;
+	/* The caller's pointer is cleared before the block is freed; NULL is ignored. */
+	if (victim != NULL) {
+		*ptr = NULL;
+		xt_free(self, victim);
+	}
+}
+
+xtPublic void *xt_malloc_ns(size_t size)
+{
+	/* Layout: [4-byte size][size bytes of user data][4-byte 0x7E7EFEFE guard]. */
+	char *raw = (char *) malloc(size+8);
+	if (raw == NULL) {
+		xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	((xtWord4 *) raw)[0] = size;
+	*((xtWord4 *) (raw + 4 + size)) = 0x7E7EFEFE;
+	return raw + 4;
+}
+
+xtPublic void *xt_calloc_ns(size_t size)
+{
+	char *raw, *user;
+	/* Guarded block ([size][data][0x7E7EFEFE]) whose user area is zero-filled. */
+	if ((raw = (char *) malloc(size+8)) == NULL) {
+		xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+		return NULL;
+	}
+	user = raw + 4;
+	*((xtWord4 *) raw) = size;
+	*((xtWord4 *) (user + size)) = 0x7E7EFEFE;
+	return memset(user, 0, size);		/* memset() returns its first argument */
+}
+
+xtPublic xtBool	xt_realloc_ns(void **ptr, size_t size)
+{
+	char *old_ptr;
+	char *new_ptr;
+	/* Resize a guarded block through realloc(); a NULL *ptr simply allocates. */
+	if ((old_ptr = (char *) *ptr)) {
+		void check_for_file(char *my_ptr, xtWord4 len);
+		/* The old block's guards are verified before it is handed to realloc(). */
+		xt_check_ptr(old_ptr);
+		check_for_file((char *) old_ptr, *((xtWord4 *) (old_ptr - 4)));
+		if (!(new_ptr = (char *) realloc(old_ptr - 4, size+8)))
+			return xt_register_errno(XT_REG_CONTEXT, XT_ENOMEM);
+		*((xtWord4 *) new_ptr) = size;
+		*((xtWord4 *) (new_ptr + size + 4)) = 0x7E7EFEFE;
+		*ptr = new_ptr+4;
+		return OK;
+	}
+	*ptr = xt_malloc_ns(size);
+	return *ptr != NULL;
+}
+
+xtPublic void xt_free_ns(void *ptr)
+{
+	char	*old_ptr;
+	xtWord4	size;
+	void	check_for_file(char *my_ptr, xtWord4 len);
+	/* Block layout: [4-byte size][user data][4-byte 0x7E7EFEFE guard]. */
+	old_ptr = (char *) ptr - 4;
+	size = *((xtWord4 *) old_ptr);
+	if (size == 0xDEADBEAF || *((xtWord4 *) (old_ptr + size + 4)) != 0x7E7EFEFE) {
+		char *dummy = NULL;
+
+		xt_dump_trace();
+		*dummy = 42;		/* deliberate crash: block already freed or guard damaged */
+	}
+	check_for_file((char *) ptr, size);
+	*((xtWord4 *) old_ptr) = 0xDEADBEAF;
+	/* Scribble the data tail only if it cannot overlap the marker above. */
+	if (size >= 4)
+		*((xtWord4 *) (old_ptr + size)) = 0xEFEFDFDF;
+	*((xtWord4 *) (old_ptr + size + 4)) = 0x1F1F1F1F;
+	free(old_ptr);
+}
+#endif
+
diff --git a/storage/pbxt/src/memory_xt.h b/storage/pbxt/src/memory_xt.h
new file mode 100644
index 00000000000..1785cd0bd51
--- /dev/null
+++ b/storage/pbxt/src/memory_xt.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-04	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_memory_h__
+#define __xt_memory_h__
+
+#include <string.h>
+
+#include "xt_defs.h"
+
+struct XTThread;
+
+#ifdef DEBUG
+#define DEBUG_MEMORY
+#endif
+
+#ifdef DEBUG_MEMORY
+
+#define XT_MM_STACK_TRACE	200
+#define XT_MM_TRACE_DEPTH	4
+#define XT_MM_TRACE_INC		((char *) 1)
+#define XT_MM_TRACE_DEC		((char *) 2)
+#define XT_MM_TRACE_SW_INC	((char *) 1)
+#define XT_MM_TRACE_SW_DEC	((char *) 2)
+#define XT_MM_TRACE_ERROR	((char *) 3)
+
+typedef struct XTMMTraceRef {
+	int						mm_pos;
+	u_int					mm_id;
+	u_int					mm_line[XT_MM_STACK_TRACE];
+	c_char					*mm_trace[XT_MM_STACK_TRACE];
+} XTMMTraceRefRec, *XTMMTraceRefPtr;
+
+#define XT_MM_TRACE_INIT(x)	(x)->mm_pos = 0
+
+extern char *mm_watch_point;
+
+#define XT_MEMMOVE(b, d, s, l)	xt_mm_memmove(b, d, s, l)
+#define XT_MEMCPY(b, d, s, l)	xt_mm_memcpy(b, d, s, l)
+#define XT_MEMSET(b, d, v, l)	xt_mm_memset(b, d, v, l)
+
+#define xt_malloc(t, s)			xt_mm_malloc(t, s, __LINE__, __FILE__)
+#define xt_calloc(t, s)			xt_mm_calloc(t, s, __LINE__, __FILE__)
+#define xt_realloc(t, p, s)		xt_mm_realloc(t, p, s, __LINE__, __FILE__)
+#define xt_free					xt_mm_free
+#define xt_pfree				xt_mm_pfree
+
+#define xt_malloc_ns(s)			xt_mm_malloc(NULL, s, __LINE__, __FILE__)
+#define xt_calloc_ns(s)			xt_mm_calloc(NULL, s, __LINE__, __FILE__)
+#define xt_realloc_ns(p, s)		xt_mm_sys_realloc(NULL, p, s, __LINE__, __FILE__)
+#define xt_free_ns(p)			xt_mm_free(NULL, p)
+
+void	xt_mm_memmove(void *block, void *dest, void *source, size_t size);
+void	xt_mm_memcpy(void *block, void *dest, void *source, size_t size);
+void	xt_mm_memset(void *block, void *dest, int value, size_t size);
+
+void	*xt_mm_malloc(struct XTThread *self, size_t size, u_int line, const char *file);
+void	*xt_mm_calloc(struct XTThread *self, size_t size, u_int line, const char *file);
+xtBool	xt_mm_realloc(struct XTThread *self, void **ptr, size_t size, u_int line, const char *file);
+void	xt_mm_free(struct XTThread *self, void *ptr);
+void	xt_mm_pfree(struct XTThread *self, void **ptr);
+size_t	xt_mm_malloc_size(struct XTThread *self, void *ptr);
+void	xt_mm_check_ptr(struct XTThread *self, void *ptr);
+xtBool	xt_mm_sys_realloc(struct XTThread *self, void **ptr, size_t newsize, u_int line, const char *file);
+
+#ifndef XT_SCAN_CORE_DEFINED
+#define XT_SCAN_CORE_DEFINED
+xtBool	xt_mm_scan_core(void);
+#endif
+
+void	mm_trace_inc(struct XTThread *self, XTMMTraceRefPtr tr);
+void	mm_trace_dec(struct XTThread *self, XTMMTraceRefPtr tr);
+void	mm_trace_init(struct XTThread *self, XTMMTraceRefPtr tr);
+void	mm_trace_print(XTMMTraceRefPtr tr);
+
+#else
+
+#define XT_MEMMOVE(b, d, s, l)	memmove(d, s, l)
+#define XT_MEMCPY(b, d, s, l)	memcpy(d, s, l)
+#define XT_MEMSET(b, d, v, l)	memset(d, v, l)
+
+void	*xt_malloc(struct XTThread *self, size_t size);
+void	*xt_calloc(struct XTThread *self, size_t size);
+xtBool	xt_realloc(struct XTThread *self, void **ptr, size_t size);
+void	xt_free(struct XTThread *self, void *ptr);
+void	xt_pfree(struct XTThread *self, void **ptr);
+
+void	*xt_malloc_ns(size_t size);
+void	*xt_calloc_ns(size_t size);
+xtBool	xt_realloc_ns(void **ptr, size_t size);
+void	xt_free_ns(void *ptr);
+
+#define xt_pfree(t, p)			xt_pfree(t, (void **) (p))	/* (p): cast must cover the whole argument */
+
+#endif
+
+#ifdef DEBUG_MEMORY
+#define xt_dup_string(t, s)		xt_mm_dup_string(t, s, __LINE__, __FILE__)
+
+char	*xt_mm_dup_string(struct XTThread *self, const char *path, u_int line, const char *file);
+#else
+char	*xt_dup_string(struct XTThread *self, const char *path);
+#endif
+
+char	*xt_long_to_str(struct XTThread *self, long v);
+char	*xt_dup_nstr(struct XTThread *self, const char *str, int start, size_t len);
+
+xtBool	xt_init_memory(void);
+void	xt_exit_memory(void);
+
+#endif
diff --git a/storage/pbxt/src/myxt_xt.cc b/storage/pbxt/src/myxt_xt.cc
new file mode 100644
index 00000000000..410bf2d2f3c
--- /dev/null
+++ b/storage/pbxt/src/myxt_xt.cc
@@ -0,0 +1,3434 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-05-16	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * These functions implement the parts of PBXT which must conform to the
+ * key and row format used by MySQL. 
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <drizzled/server_includes.h>
+#include <drizzled/plugin.h>
+#include <drizzled/show.h>
+#include <drizzled/field/blob.h>
+#include <drizzled/field/enum.h>
+#include <drizzled/field/varstring.h>
+#include <drizzled/current_session.h>
+#include <drizzled/sql_lex.h>
+#include <drizzled/session.h>
+//extern "C" struct charset_info_st *session_charset(Session *session);
+extern pthread_key_t THR_Session;
+#else
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#endif
+
+#ifdef HAVE_ISNAN
+#include <math.h>
+#endif
+
+#include "ha_pbxt.h"
+
+#include "myxt_xt.h"
+#include "strutil_xt.h"
+#include "database_xt.h"
+#include "cache_xt.h"
+#include "datalog_xt.h"
+#include "memory_xt.h"
+
+static void		myxt_bitmap_init(XTThreadPtr self, MX_BITMAP *map, u_int n_bits);
+static void		myxt_bitmap_free(XTThreadPtr self, MX_BITMAP *map);
+
+#ifdef DRIZZLED
+#define swap_variables(TYPE, a, b) \
+  do {                             \
+    TYPE dummy;                    \
+    dummy= a;                      \
+    a= b;                          \
+    b= dummy;                      \
+  } while (0)
+
+
+#define CMP_NUM(a,b) (((a) < (b)) ? -1 : ((a) == (b)) ? 0 : 1)
+#else
+#define get_rec_bits(bit_ptr, bit_ofs, bit_len) \
+	(((((uint16) (bit_ptr)[1] << 8) | (uint16) (bit_ptr)[0]) >> (bit_ofs)) & \
+   ((1 << (bit_len)) - 1))
+#endif
+
+#define FIX_LENGTH(cs, pos, length, char_length) \
+						do { \
+							if ((length) > char_length) \
+								char_length= my_charpos(cs, pos, pos+length, char_length); \
+							set_if_smaller(char_length,length); \
+						} while(0)
+
+#ifdef store_key_length_inc
+#undef store_key_length_inc
+#endif
+#define store_key_length_inc(key,length) \
+{ if ((length) < 255) \
+	{ *(key)++=(length); } \
+	else \
+	{ *(key)=255; mi_int2store((key)+1,(length)); (key)+=3; } \
+}
+
+#define set_rec_bits(bits, bit_ptr, bit_ofs, bit_len) \
+{ \
+	(bit_ptr)[0]= ((bit_ptr)[0] & ~(((1 << (bit_len)) - 1) << (bit_ofs))) | \
+                ((bits) << (bit_ofs)); \
+	if ((bit_ofs) + (bit_len) > 8) \
+    (bit_ptr)[1]= ((bit_ptr)[1] & ~((1 << ((bit_len) - 8 + (bit_ofs))) - 1)) | \
+                  ((bits) >> (8 - (bit_ofs))); \
+}
+
+#define clr_rec_bits(bit_ptr, bit_ofs, bit_len) \
+	set_rec_bits(0, bit_ptr, bit_ofs, bit_len)
+
+static ulong my_calc_blob_length(uint length, xtWord1 *pos)
+{
+	/* 'length' is the width in bytes (1..4) of the length prefix stored at
+	 * 'pos'; the decoded prefix value is returned.
+	 */
+	if (length == 1)
+		return (uint) (uchar) *pos;
+	if (length == 2)
+		return (uint) uint2korr(pos);
+	if (length == 3)
+		return uint3korr(pos);
+	if (length == 4)
+		return uint4korr(pos);
+
+	return 0; /* Impossible */
+}
+
+static void my_store_blob_length(byte *pos,uint pack_length,uint length)
+{
+	/* Write 'length' at 'pos' as a prefix that is pack_length bytes wide.
+	 * Widths other than 1, 2, 3 or 4 store nothing.
+	 */
+	if (pack_length == 1) {
+		*pos= (uchar) length;
+	}
+	else if (pack_length == 2) {
+		int2store(pos,length);
+	}
+	else if (pack_length == 3) {
+		int3store(pos,length);
+	}
+	else if (pack_length == 4) {
+		int4store(pos,length);
+	}
+	return;
+}
+
+static int my_compare_text(MX_CONST_CHARSET_INFO *charset_info, uchar *a, uint a_length,
+				uchar *b, uint b_length, my_bool part_key,
+				my_bool XT_UNUSED(skip_end_space))
+{
+	if (part_key)
+		return charset_info->coll->strnncoll(charset_info, a, a_length,
+				b, b_length, part_key);
+
+	/* Full-key comparison. The last strnncollsp() argument is
+	 * diff_if_only_endspace_difference; 0 is always passed because end
+	 * spaces must always be ignored here, whatever skip_end_space says.
+	 */
+	return charset_info->coll->strnncollsp(charset_info, a, a_length,
+			b, b_length, 0);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Create a key
+ */
+
+/*
+ * Derived from _mi_pack_key()
+ */
+xtPublic u_int myxt_create_key_from_key(XTIndexPtr ind, xtWord1 *key, xtWord1 *old, u_int k_length)
+{
+	xtWord1			*start_key = key;
+	XTIndexSegRec	*keyseg = ind->mi_seg;
+
+	for (u_int i=0; i<ind->mi_seg_count && (int) k_length > 0; i++, old += keyseg->length, keyseg++)
+	{
+#ifndef DRIZZLED
+		enum ha_base_keytype	type = (enum ha_base_keytype) keyseg->type;
+#endif
+		u_int					length = keyseg->length < k_length ? keyseg->length : k_length;
+		u_int					char_length;
+		xtWord1					*pos;
+		MX_CONST_CHARSET_INFO	*cs = keyseg->charset;
+
+		if (keyseg->null_bit) {
+			k_length--;
+			if (!(*key++ = (xtWord1) 1 - *old++)) {					/* Copy null marker */
+				k_length -= length;
+				if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) {
+					k_length -= 2;									/* Skip length */
+ 					old += 2;
+				}
+				continue;											/* Found NULL */
+			}
+		}
+		char_length= (cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen : length;
+		pos = old;
+		if (keyseg->flag & HA_SPACE_PACK) {
+			uchar *end = pos + length;
+#ifndef DRIZZLED
+			if (type != HA_KEYTYPE_NUM) {
+#endif
+				while (end > pos && end[-1] == ' ')
+					end--;
+#ifndef DRIZZLED
+			}
+			else {
+				while (pos < end && pos[0] == ' ')
+					pos++;
+			}
+#endif
+			k_length -= length;
+			length = (u_int) (end-pos);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key, char_length);
+			memcpy((byte*) key,pos,(size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART)) {
+			/* Length of key-part used with mi_rkey() always 2 */
+			u_int tmp_length = uint2korr(pos);
+			k_length -= 2 + length;
+			pos += 2;
+			set_if_smaller(length, tmp_length);	/* Safety */
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			old +=2;					/* Skip length */
+			memcpy((char *) key, pos, (size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_SWAP_KEY)
+		{						/* Numerical column */
+			pos+=length;
+			k_length-=length;
+			while (length--) {
+				*key++ = *--pos;
+			}
+			continue;
+		}
+		FIX_LENGTH(cs, pos, length, char_length);
+		memcpy((byte*) key, pos, char_length);
+		if (length > char_length)
+			cs->cset->fill(cs, (char *) (key + char_length), length - char_length, ' ');
+		key += length;
+		k_length -= length;
+	}
+
+	return (u_int) (key - start_key);
+}
+
+/* Derived from _mi_make_key */
+xtPublic u_int myxt_create_key_from_row(XTIndexPtr ind, xtWord1 *key, xtWord1 *record, xtBool *no_duplicate)
+{
+	register XTIndexSegRec	*keyseg = ind->mi_seg;
+	xtWord1					*pos;
+	xtWord1					*end;
+	xtWord1					*start;
+
+#ifdef HAVE_valgrind
+       if (ind->mi_fix_key)
+               memset((byte*) key, 0,(size_t) (ind->mi_key_size) );
+#endif
+
+	start = key;
+ 	for (u_int i=0; i<ind->mi_seg_count; i++, keyseg++)
+	{
+		enum ha_base_keytype	type = (enum ha_base_keytype) keyseg->type;
+ 		u_int					length = keyseg->length;
+ 		u_int					char_length;
+ 		MX_CONST_CHARSET_INFO	*cs = keyseg->charset;
+
+		if (keyseg->null_bit) {
+			if (record[keyseg->null_pos] & keyseg->null_bit) {
+				*key++ = 0;				/* NULL in key */
+				
+				/* The point is, if a key contains a NULL value
+				 * the duplicate checking must be disabled.
+				 * This is because a NULL value is not considered
+				 * equal to any other value.
+				 */ 
+				if (no_duplicate)
+					*no_duplicate = FALSE;
+				continue;
+			}
+			*key++ = 1;					/* Not NULL */
+		}
+
+		char_length= ((cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen : length);
+
+		pos = record + keyseg->start;
+#ifndef DRIZZLED
+		if (type == HA_KEYTYPE_BIT)
+		{
+			if (keyseg->bit_length)
+			{
+				uchar bits = get_rec_bits((uchar*) record + keyseg->bit_pos,
+																 keyseg->bit_start, keyseg->bit_length);
+				*key++ = bits;
+				length--;
+			}
+			memcpy((byte*) key, pos, length);
+			key+= length;
+			continue;
+		}
+#endif
+		if (keyseg->flag & HA_SPACE_PACK)
+		{
+			end = pos + length;
+#ifndef DRIZZLED
+			if (type != HA_KEYTYPE_NUM) {
+#endif
+				while (end > pos && end[-1] == ' ')
+					end--;
+#ifndef DRIZZLED
+			}
+			else {
+				while (pos < end && pos[0] == ' ')
+					pos++;
+			}
+#endif
+			length = (u_int) (end-pos);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_VAR_LENGTH_PART) {
+			uint pack_length= (keyseg->bit_start == 1 ? 1 : 2);
+			uint tmp_length= (pack_length == 1 ? (uint) *(uchar*) pos :
+												uint2korr(pos));
+			pos += pack_length;			/* Skip VARCHAR length */
+			set_if_smaller(length,tmp_length);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_BLOB_PART)
+		{
+			u_int tmp_length = my_calc_blob_length(keyseg->bit_start, pos);
+			memcpy((byte*) &pos,pos+keyseg->bit_start,sizeof(char*));
+			set_if_smaller(length,tmp_length);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key+= char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_SWAP_KEY)
+		{						/* Numerical column */
+#ifdef HAVE_ISNAN
+#ifndef DRIZZLED
+			if (type == HA_KEYTYPE_FLOAT)
+			{
+				float nr;
+				float4get(nr,pos);
+				if (isnan(nr))
+				{
+					/* Replace NAN with zero */
+					bzero(key,length);
+					key+=length;
+					continue;
+				}
+			}
+			else 
+#endif			
+			if (type == HA_KEYTYPE_DOUBLE) {
+				double nr;
+
+				float8get(nr,pos);
+				if (isnan(nr)) {
+					bzero(key,length);
+					key+=length;
+					continue;
+				}
+			}
+#endif
+			pos+=length;
+			while (length--) {
+				*key++ = *--pos;
+			}
+			continue;
+		}
+ 		FIX_LENGTH(cs, pos, length, char_length);
+		memcpy((byte*) key, pos, char_length);
+		if (length > char_length)
+			cs->cset->fill(cs, (char *) key + char_length, length - char_length, ' ');
+		key += length;
+	}
+
+	return ind->mi_fix_key ? ind->mi_key_size : (u_int) (key - start);		/* Return keylength */
+}
+
+xtPublic u_int myxt_create_foreign_key_from_row(XTIndexPtr ind, xtWord1 *key, xtWord1 *record, XTIndexPtr fkey_ind, xtBool *no_null)
+{
+	register XTIndexSegRec	*keyseg = ind->mi_seg;
+	register XTIndexSegRec	*fkey_keyseg = fkey_ind->mi_seg;
+	xtWord1					*pos;
+	xtWord1					*end;
+	xtWord1					*start;
+
+	start = key;
+ 	for (u_int i=0; i<ind->mi_seg_count; i++, keyseg++, fkey_keyseg++)
+	{
+		enum ha_base_keytype	type = (enum ha_base_keytype) keyseg->type;
+ 		u_int					length = keyseg->length;
+ 		u_int					char_length;
+ 		MX_CONST_CHARSET_INFO	*cs = keyseg->charset;
+		xtBool					is_null = FALSE;
+
+		if (keyseg->null_bit) {
+			if (record[keyseg->null_pos] & keyseg->null_bit) {
+				is_null = TRUE;
+				if (no_null)
+					*no_null = FALSE;
+			}
+		}
+
+		if (fkey_keyseg->null_bit) {
+			if (is_null) {
+				*key++ = 0;				/* NULL in key */
+				
+				/* The point is, if a key contains a NULL value
+				 * the duplicate checking must be disabled.
+				 * This is because a NULL value is not considered
+				 * equal to any other value.
+				 */ 
+				continue;
+			}
+			*key++ = 1;					/* Not NULL */
+		}
+
+		char_length= ((cs && cs->mbmaxlen > 1) ? length/cs->mbmaxlen : length);
+
+		pos = record + keyseg->start;
+#ifndef DRIZZLED
+		if (type == HA_KEYTYPE_BIT)
+		{
+			if (keyseg->bit_length)
+			{
+				uchar bits = get_rec_bits((uchar*) record + keyseg->bit_pos,
+																 keyseg->bit_start, keyseg->bit_length);
+				*key++ = bits;
+				length--;
+			}
+			memcpy((byte*) key, pos, length);
+			key+= length;
+			continue;
+		}
+#endif
+		if (keyseg->flag & HA_SPACE_PACK)
+		{
+			end = pos + length;
+#ifndef DRIZZLED
+			if (type != HA_KEYTYPE_NUM) {
+#endif
+				while (end > pos && end[-1] == ' ')
+					end--;
+#ifndef DRIZZLED
+			}
+			else {
+				while (pos < end && pos[0] == ' ')
+					pos++;
+			}
+#endif
+			length = (u_int) (end-pos);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_VAR_LENGTH_PART) {
+			uint pack_length= (keyseg->bit_start == 1 ? 1 : 2);
+			uint tmp_length= (pack_length == 1 ? (uint) *(uchar*) pos :
+												uint2korr(pos));
+			pos += pack_length;			/* Skip VARCHAR length */
+			set_if_smaller(length,tmp_length);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key += char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_BLOB_PART)
+		{
+			u_int tmp_length = my_calc_blob_length(keyseg->bit_start, pos);
+			memcpy((byte*) &pos,pos+keyseg->bit_start,sizeof(char*));
+			set_if_smaller(length,tmp_length);
+			FIX_LENGTH(cs, pos, length, char_length);
+			store_key_length_inc(key,char_length);
+			memcpy((byte*) key,(byte*) pos,(size_t) char_length);
+			key+= char_length;
+			continue;
+		}
+		if (keyseg->flag & HA_SWAP_KEY)
+		{						/* Numerical column */
+#ifdef HAVE_ISNAN
+#ifndef DRIZZLED
+			if (type == HA_KEYTYPE_FLOAT)
+			{
+				float nr;
+				float4get(nr,pos);
+				if (isnan(nr))
+				{
+					/* Replace NAN with zero */
+					bzero(key,length);
+					key+=length;
+					continue;
+				}
+			}
+			else 
+#endif
+			if (type == HA_KEYTYPE_DOUBLE) {
+				double nr;
+
+				float8get(nr,pos);
+				if (isnan(nr)) {
+					bzero(key,length);
+					key+=length;
+					continue;
+				}
+			}
+#endif
+			pos+=length;
+			while (length--) {
+				*key++ = *--pos;
+			}
+			continue;
+		}
+ 		FIX_LENGTH(cs, pos, length, char_length);
+		memcpy((byte*) key, pos, char_length);
+		if (length > char_length)
+			cs->cset->fill(cs, (char *) key + char_length, length - char_length, ' ');
+		key += length;
+	}
+
+	return (u_int) (key - start);
+}
+
+/*
+ * Clear the NULL flag of 'field' in the row buffer 'record'.
+ *
+ * Field::set_notnull() is deliberately not used: it is not certain that
+ * field->null_ptr refers to the caller's buffer. Instead the offset of
+ * null_ptr from the start of table->record[0] is applied to 'record'.
+ * Fields that have no null_ptr are left untouched.
+ */
+static void mx_set_notnull_in_record(Field *field, char *record)
+{
+	uint null_offset;
+
+	if (!field->null_ptr)
+		return;
+
+	null_offset = (uint) (field->null_ptr - (uchar *) field->table->record[0]);
+	record[null_offset] &= (uchar) ~field->null_bit;
+}
+
+/*
+ * Test the NULL flag of 'field' in the row buffer 'record'.
+ * The flag byte is located by applying the offset of field->null_ptr
+ * (relative to table->record[0]) to 'record'.
+ * Returns FALSE for fields that have no null_ptr.
+ */
+static xtBool mx_is_null_in_record(Field *field, char *record)
+{
+	uint null_offset;
+
+	if (!field->null_ptr)
+		return FALSE;
+
+	null_offset = (uint) (field->null_ptr - (uchar *) field->table->record[0]);
+	if (record[null_offset] & (uchar) field->null_bit)
+		return TRUE;
+	return FALSE;
+}
+
+/*
+ * PBXT stores rows in its own disk format, so all it needs from a
+ * MySQL-format row is, per column, a pointer to the raw bytes and
+ * their length.
+ *
+ * 'dest' is the start of the row buffer. The byte length is stored in
+ * *len and the pointer to the data is returned. For blobs the returned
+ * pointer is the one stored in the row, for all other types it points
+ * into 'dest'.
+ */
+static char *mx_get_length_and_data(Field *field, char *dest, xtWord4 *len)
+{
+	char *col_ptr;
+
+	/* Locate the column inside the caller's row buffer: */
+#if MYSQL_VERSION_ID < 50114
+	col_ptr = dest + field->offset();
+#else
+	col_ptr = dest + field->offset(field->table->record[0]);
+#endif
+	switch (field->real_type()) {
+#ifndef DRIZZLED
+		case MYSQL_TYPE_TINY_BLOB:
+		case MYSQL_TYPE_MEDIUM_BLOB:
+		case MYSQL_TYPE_LONG_BLOB:
+#endif
+		case MYSQL_TYPE_BLOB: {
+			/* TODO - Check: the original comment said that *data must
+			 * be set to a non-NULL value, because *data == 0 means
+			 * SQL NULL.
+			 *
+			 * GOTCHA: Temporarily redirecting field->ptr to the row and
+			 * calling Field_blob::get_ptr() cannot work, because
+			 * 'field' is shared between threads. The length bytes and
+			 * the pointer are therefore read directly from the row.
+			 */
+			Field_blob	*blob = (Field_blob *) field;
+			xtWord4		len_bytes = blob->pack_length() - field->table->s->blob_ptr_size;
+			char		*blob_data;
+
+			/* The data pointer follows the length bytes: */
+			memcpy(&blob_data, col_ptr + len_bytes, sizeof(char*));
+
+			*len = blob->get_length((byte *) col_ptr);
+			return blob_data;
+		}
+#ifndef DRIZZLED
+		case MYSQL_TYPE_STRING:
+			/* Field_string::pack looks like a model for this, but its
+			 * test from[length-1] == ' ' assumes 1-byte characters.
+			 * lengthsp() of the column's charset is used instead to
+			 * get the length without trailing spaces.
+			 */
+			*len = field->charset()->cset->lengthsp(field->charset(), col_ptr, field->field_length);
+			return col_ptr;
+		case MYSQL_TYPE_VAR_STRING:
+			/* 2 length bytes, followed by the data: */
+			*len = (uint) uint2korr(col_ptr);
+			return col_ptr + HA_KEY_BLOB_LENGTH;
+#endif
+		case MYSQL_TYPE_VARCHAR: {
+			Field_varstring	*varchar = (Field_varstring *) field;
+			uint			data_len;
+
+			/* 1 or 2 length bytes, followed by the data: */
+			if (varchar->length_bytes != 1)
+				data_len = uint2korr(col_ptr);
+			else
+				data_len = *((unsigned char *) col_ptr);
+
+			*len = data_len;
+			return col_ptr + varchar->length_bytes;
+		}
+		/* All types below are fixed length, and are handled
+		 * after the switch:
+		 */
+#ifndef DRIZZLED
+		case MYSQL_TYPE_DECIMAL:
+		case MYSQL_TYPE_TINY:
+		case MYSQL_TYPE_SHORT:
+		case MYSQL_TYPE_LONG:
+		case MYSQL_TYPE_FLOAT:
+		case MYSQL_TYPE_DOUBLE:
+		case MYSQL_TYPE_NULL:
+		case MYSQL_TYPE_TIMESTAMP:
+		case MYSQL_TYPE_LONGLONG:
+		case MYSQL_TYPE_INT24:
+		case MYSQL_TYPE_DATE:
+		case MYSQL_TYPE_TIME:
+		case MYSQL_TYPE_DATETIME:
+		case MYSQL_TYPE_YEAR:
+		case MYSQL_TYPE_NEWDATE:
+		case MYSQL_TYPE_BIT:
+		case MYSQL_TYPE_NEWDECIMAL:
+		case MYSQL_TYPE_ENUM:
+		case MYSQL_TYPE_SET:
+		case MYSQL_TYPE_GEOMETRY:
+#else
+		case DRIZZLE_TYPE_LONG:
+		case DRIZZLE_TYPE_DOUBLE:
+		case DRIZZLE_TYPE_NULL:
+		case DRIZZLE_TYPE_TIMESTAMP:
+		case DRIZZLE_TYPE_LONGLONG:
+		case DRIZZLE_TYPE_DATETIME:
+		case DRIZZLE_TYPE_DATE:
+		case DRIZZLE_TYPE_NEWDECIMAL:
+		case DRIZZLE_TYPE_ENUM:
+#endif
+			break;
+	}
+
+	/* Fixed length column: the data is the packed field itself. */
+	*len = field->pack_length();
+	return col_ptr;
+}
+
+/*
+ * Store the value (len, data) of 'field' into the MySQL-format row
+ * that starts at 'dest'.
+ *
+ * A NULL 'data' pointer means the value is SQL NULL. In that case the
+ * null bit in the row is not touched (it is assumed to be set already)
+ * and the column is written as empty/zero, because according to the
+ * InnoDB implementation:
+ * "MySQL seems to assume the field for an SQL NULL
+ * value is set to zero or space. Not taking this into
+ * account caused seg faults with NULL BLOB fields, and
+ * bug number 154 in the MySQL bug database: GROUP BY
+ * and DISTINCT could treat NULL values inequal".
+ *
+ * When 'data' is not NULL the null bit of the column is cleared.
+ */
+static void mx_set_length_and_data(Field *field, char *dest, xtWord4 len, char *data)
+{
+	char *col_ptr;
+
+	/* Locate the column inside the caller's row buffer: */
+#if MYSQL_VERSION_ID < 50114
+	col_ptr = dest + field->offset();
+#else
+	col_ptr = dest + field->offset(field->table->record[0]);
+#endif
+	switch (field->real_type()) {
+#ifndef DRIZZLED
+		case MYSQL_TYPE_TINY_BLOB:
+		case MYSQL_TYPE_MEDIUM_BLOB:
+		case MYSQL_TYPE_LONG_BLOB:
+#endif
+		case MYSQL_TYPE_BLOB: {
+			/* GOTCHA: Temporarily redirecting field->ptr to the row
+			 * and calling Field_blob::set_ptr() cannot work: 'field'
+			 * is shared, because the table is shared! The length
+			 * bytes and the pointer are written directly instead.
+			 */
+			Field_blob	*blob = (Field_blob *) field;
+			xtWord4		len_bytes = blob->pack_length() - field->table->s->blob_ptr_size;
+
+			blob->store_length((byte *) col_ptr, len_bytes, len);
+			/* The data pointer follows the length bytes: */
+			memcpy_fixed(col_ptr + len_bytes, &data, sizeof(char*));
+
+			if (data)
+				mx_set_notnull_in_record(field, dest);
+			return;
+		}
+#ifndef DRIZZLED
+		case MYSQL_TYPE_STRING:
+			if (!data)
+				len = 0;
+			else {
+				mx_set_notnull_in_record(field, dest);
+				memcpy(col_ptr, data, len);
+			}
+
+			/* Pad the rest of the fixed length column using
+			 * the fill function of the column's charset:
+			 */
+			field->charset()->cset->fill(field->charset(), col_ptr + len, field->field_length - len, ' ');
+			return;
+		case MYSQL_TYPE_VAR_STRING:
+			/* 2 length bytes, followed by the data: */
+			int2store(col_ptr, len);
+			if (data) {
+				mx_set_notnull_in_record(field, dest);
+				memcpy(col_ptr + HA_KEY_BLOB_LENGTH, data, len);
+			}
+			return;
+#endif
+		case MYSQL_TYPE_VARCHAR: {
+			Field_varstring *varchar = (Field_varstring *) field;
+
+			/* 1 or 2 length bytes, followed by the data: */
+			if (varchar->length_bytes != 1)
+				int2store(col_ptr, len);
+			else
+				*((unsigned char *) col_ptr) = (unsigned char) len;
+			if (data) {
+				mx_set_notnull_in_record(field, dest);
+				memcpy(col_ptr + varchar->length_bytes, data, len);
+			}
+			return;
+		}
+		/* All types below are fixed length, and are handled
+		 * after the switch:
+		 */
+#ifndef DRIZZLED
+		case MYSQL_TYPE_DECIMAL:
+		case MYSQL_TYPE_TINY:
+		case MYSQL_TYPE_SHORT:
+		case MYSQL_TYPE_LONG:
+		case MYSQL_TYPE_FLOAT:
+		case MYSQL_TYPE_DOUBLE:
+		case MYSQL_TYPE_NULL:
+		case MYSQL_TYPE_TIMESTAMP:
+		case MYSQL_TYPE_LONGLONG:
+		case MYSQL_TYPE_INT24:
+		case MYSQL_TYPE_DATE:
+		case MYSQL_TYPE_TIME:
+		case MYSQL_TYPE_DATETIME:
+		case MYSQL_TYPE_YEAR:
+		case MYSQL_TYPE_NEWDATE:
+		case MYSQL_TYPE_BIT:
+		case MYSQL_TYPE_NEWDECIMAL:
+		case MYSQL_TYPE_ENUM:
+		case MYSQL_TYPE_SET:
+		case MYSQL_TYPE_GEOMETRY:
+#else
+		case DRIZZLE_TYPE_LONG:
+		case DRIZZLE_TYPE_DOUBLE:
+		case DRIZZLE_TYPE_NULL:
+		case DRIZZLE_TYPE_TIMESTAMP:
+		case DRIZZLE_TYPE_LONGLONG:
+		case DRIZZLE_TYPE_DATETIME:
+		case DRIZZLE_TYPE_DATE:
+		case DRIZZLE_TYPE_NEWDECIMAL:
+		case DRIZZLE_TYPE_ENUM:
+#endif
+			break;
+	}
+
+	if (!data) {
+		/* SQL NULL: zero out the whole packed field. */
+		bzero(col_ptr, field->pack_length());
+		return;
+	}
+
+	mx_set_notnull_in_record(field, dest);
+	memcpy(col_ptr, data, len);
+}
+
+/*
+ * Set the null bit in 'record' for every segment of the index 'ind'.
+ * Every segment is asserted to have a null bit.
+ */
+xtPublic void myxt_set_null_row_from_key(XTOpenTablePtr XT_UNUSED(ot), XTIndexPtr ind, xtWord1 *record)
+{
+	XTIndexSegRec	*seg = ind->mi_seg;
+	u_int			seg_no = 0;
+
+	while (seg_no < ind->mi_seg_count) {
+		ASSERT_NS(seg->null_bit);
+		record[seg->null_pos] |= seg->null_bit;
+		seg_no++;
+		seg++;
+	}
+}
+
+/*
+ * For every segment of the index 'ind', copy the column's default
+ * value and its default null bit from table->s->default_values into
+ * 'record'.
+ *
+ * The Field object's ptr is temporarily pointed at the default value
+ * while copying, so the whole loop runs while holding
+ * tab_dic_field_lock.
+ */
+xtPublic void myxt_set_default_row_from_key(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *record)
+{
+	XTTableHPtr		tab = ot->ot_table;
+	TABLE			*table = tab->tab_dic.dic_my_table;
+	XTIndexSegRec	*seg;
+	u_int			seg_no;
+
+	xt_lock_mutex_ns(&tab->tab_dic_field_lock);
+
+	for (seg_no = 0, seg = ind->mi_seg; seg_no < ind->mi_seg_count; seg_no++, seg++) {
+		Field	*column = table->field[seg->col_idx];
+		byte	*saved_ptr = column->ptr;
+
+		/* Copy the default value of the column: */
+		column->ptr = table->s->default_values + seg->start;
+		memcpy(record + seg->start, column->ptr, column->pack_length());
+
+		/* Replace the null bit with the null bit of the default row: */
+		record[seg->null_pos] &= ~seg->null_bit;
+		record[seg->null_pos] |= table->s->default_values[seg->null_pos] & seg->null_bit;
+
+		column->ptr = saved_ptr;
+	}
+
+	xt_unlock_mutex_ns(&tab->tab_dic_field_lock);
+}
+
+/* Derived from _mi_put_key_in_record */
+/*
+ * Unpack the index key in 'b_value' (key_len bytes) into the MySQL-format
+ * row buffer 'dest_buff', one index segment of 'ind' at a time.
+ *
+ * Only the columns (and null bits) that belong to segments of the index
+ * are written; the rest of the row buffer is left as it is.
+ *
+ * Returns OK. FAILED is only returned when compiled with CHECK_KEYS and
+ * a segment length does not fit the segment or runs past the end of
+ * the key.
+ */
+xtPublic xtBool myxt_create_row_from_key(XTOpenTablePtr XT_UNUSED(ot), XTIndexPtr ind, xtWord1 *b_value, u_int key_len, xtWord1 *dest_buff)
+{
+	byte					*record = (byte *) dest_buff;
+	register byte			*key;
+	byte					*pos,*key_end;
+	register XTIndexSegRec	*keyseg = ind->mi_seg;
+
+	/* GOTCHA: When selecting from multiple
+	 * indexes the key values are "merged" into the
+	 * same buffer!!
+	 * This means that this function must not affect
+	 * the value of any other fields.
+	 *
+	 * I was setting all to NULL:
+	memset(dest_buff, 0xFF, table->s->null_bytes);
+	*/
+	key = (byte *) b_value;
+	/* key_end is only checked when CHECK_KEYS is defined: */
+	key_end = key + key_len;
+	for (u_int i=0; i<ind->mi_seg_count; i++, keyseg++) {
+		if (keyseg->null_bit) {
+			/* Nullable segment: the first key byte is 0 if the value
+			 * is NULL. A NULL value has no further data in the key.
+			 */
+			if (!*key++)
+			{
+				record[keyseg->null_pos] |= keyseg->null_bit;
+				continue;
+			}
+			record[keyseg->null_pos] &= ~keyseg->null_bit;
+		}
+#ifndef DRIZZLED
+		if (keyseg->type == HA_KEYTYPE_BIT)
+		{
+			uint length = keyseg->length;
+
+			if (keyseg->bit_length)
+			{
+				/* The first key byte holds the bits that are stored
+				 * outside the column's bytes (at bit_pos/bit_start):
+				 */
+				uchar bits= *key++;
+				set_rec_bits(bits, record + keyseg->bit_pos, keyseg->bit_start,
+										 keyseg->bit_length);
+				length--;
+			}
+			else
+			{
+				clr_rec_bits(record + keyseg->bit_pos, keyseg->bit_start,
+										 keyseg->bit_length);
+			}
+			memcpy(record + keyseg->start, (byte*) key, length);
+			key+= length;
+			continue;
+		}
+#endif
+		if (keyseg->flag & HA_SPACE_PACK)
+		{
+			/* Space packed: the key holds a length followed by the
+			 * data, the column is padded with spaces to keyseg->length.
+			 */
+			uint length;
+			get_key_length(length,key);
+#ifdef CHECK_KEYS
+			if (length > keyseg->length || key+length > key_end)
+				goto err;
+#endif
+			pos = record+keyseg->start;
+#ifndef DRIZZLED
+			if (keyseg->type != (int) HA_KEYTYPE_NUM)
+			{
+#endif
+				/* Data first, then trailing spaces: */
+				memcpy(pos,key,(size_t) length);
+				bfill(pos+length,keyseg->length-length,' ');
+#ifndef DRIZZLED
+			}
+			else
+			{
+				/* HA_KEYTYPE_NUM: leading spaces, then the data: */
+				bfill(pos,keyseg->length-length,' ');
+				memcpy(pos+keyseg->length-length,key,(size_t) length);
+			}
+#endif
+			key+=length;
+			continue;
+		}
+
+		if (keyseg->flag & HA_VAR_LENGTH_PART)
+		{
+			uint length;
+			get_key_length(length,key);
+#ifdef CHECK_KEYS
+			if (length > keyseg->length || key+length > key_end)
+	goto err;
+#endif
+			/* Store key length */
+			/* (bit_start is the number of length bytes in the row) */
+			if (keyseg->bit_start == 1)
+				*(uchar*) (record+keyseg->start)= (uchar) length;
+			else
+				int2store(record+keyseg->start, length);
+			/* And key data */
+			memcpy(record+keyseg->start + keyseg->bit_start, (byte*) key, length);
+			key+= length;
+		}
+		else if (keyseg->flag & HA_BLOB_PART)
+		{
+			uint length;
+			get_key_length(length,key);
+#ifdef CHECK_KEYS
+			if (length > keyseg->length || key+length > key_end)
+				goto err;
+#endif
+			/* key is a pointer into ot_ind_rbuf, which should be
+			 * safe until we move to the next index item!
+			 */
+			/* The blob data is not copied: the row gets a pointer
+			 * into the key buffer, stored after the bit_start
+			 * length bytes.
+			 */
+			byte *key_ptr = key; // Cannot take the address of a register variable!
+			memcpy(record+keyseg->start+keyseg->bit_start,
+			 (char*) &key_ptr,sizeof(char*));
+
+			my_store_blob_length(record+keyseg->start,
+					(uint) keyseg->bit_start,length);
+			key+=length;
+		}
+		else if (keyseg->flag & HA_SWAP_KEY)
+		{
+			/* The key bytes are in the reverse order of the
+			 * row bytes, so copy backwards:
+			 */
+			byte *to=	record+keyseg->start+keyseg->length;
+			byte *end= key+keyseg->length;
+#ifdef CHECK_KEYS
+			if (end > key_end)
+				goto err;
+#endif
+			do {
+				*--to= *key++;
+			} while (key != end);
+			continue;
+		}
+		else
+		{
+			/* Fixed length segment, stored as is: */
+#ifdef CHECK_KEYS
+			if (key+keyseg->length > key_end)
+				goto err;
+#endif
+			memcpy(record+keyseg->start,(byte*) key,
+			 (size_t) keyseg->length);
+			key+= keyseg->length;
+		}
+	
+	}
+	return OK;
+
+#ifdef CHECK_KEYS
+	err:
+	return FAILED;				/* Crashed row */
+#endif
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Compare keys
+ */
+
+/*
+ * Binary comparison of the byte strings a and b.
+ *
+ * The common prefix is compared byte by byte; the first difference
+ * determines the result. If the common prefix is equal then:
+ * - with part_key set, a longer 'a' is considered equal to 'b',
+ * - with skip_end_space set, the extra bytes of the longer string are
+ *   compared against an implicit run of spaces,
+ * - otherwise the difference in length is returned.
+ */
+static int my_compare_bin(uchar *a, uint a_length, uchar *b, uint b_length,
+											 my_bool part_key, my_bool skip_end_space)
+{
+	uint	common = (b_length <= a_length) ? b_length : a_length;
+	uint	idx;
+	uchar	*tail;
+	uint	tail_len;
+	int		sign;
+
+	for (idx = 0; idx < common; idx++) {
+		int diff = (int) a[idx] - (int) b[idx];
+
+		if (diff)
+			return diff;
+	}
+
+	if (part_key && a_length > b_length)
+		return 0;
+
+	if (!skip_end_space || a_length == b_length)
+		return (int) (a_length-b_length);
+
+	/*
+		We are using space compression. We have to check if the longer key
+		has a next character < ' ', in which case it is less than the
+		shorter key, which has an implicit space afterwards.
+
+		This logic is identical to the one in
+		strings/ctype-simple.c:my_strnncollsp_simple
+	*/
+	if (a_length < b_length) {
+		/* b is the longer key: check its tail, and swap the sign of the result */
+		tail = b + common;
+		tail_len = b_length - common;
+		sign = -1;
+	}
+	else {
+		tail = a + common;
+		tail_len = a_length - common;
+		sign = 1;
+	}
+
+	for (idx = 0; idx < tail_len; idx++) {
+		if (tail[idx] != ' ')
+			return (tail[idx] < ' ') ? -sign : sign;
+	}
+	return 0;
+}
+
+/*
+ * Calculate the length in bytes of the key stored in 'key_buf' by
+ * stepping over each segment of the index 'ind'.
+ *
+ * If the result is larger than XT_INDEX_MAX_KEY_SIZE the index is
+ * flagged with mi_key_corrupted. The length is returned in any case.
+ */
+xtPublic u_int myxt_get_key_length(XTIndexPtr ind, xtWord1 *key_buf)
+{
+	XTIndexSegRec	*seg = ind->mi_seg;
+	uchar			*cursor = (uchar *) key_buf;
+	uint			data_len;
+	uint			prefix_len;
+	u_int			total;
+
+	for (u_int seg_no = 0; seg_no < ind->mi_seg_count; seg_no++, seg++) {
+		/* A nullable segment starts with a flag byte,
+		 * which is zero if the value is NULL. A NULL value
+		 * has no further data.
+		 */
+		if (seg->null_bit && !*cursor++)
+			continue;
+
+		switch ((enum ha_base_keytype) seg->type) {
+			case HA_KEYTYPE_TEXT:
+			case HA_KEYTYPE_BINARY:
+				/* Length prefixed if space packed, else fixed length: */
+				if (seg->flag & HA_SPACE_PACK) {
+					get_key_pack_length(data_len, prefix_len, cursor);
+					cursor += data_len;
+				}
+				else
+					cursor += seg->length;
+				break;
+			case HA_KEYTYPE_VARTEXT1:
+			case HA_KEYTYPE_VARTEXT2:
+			case HA_KEYTYPE_VARBINARY1:
+			case HA_KEYTYPE_VARBINARY2:
+				/* Always length prefixed: */
+				get_key_pack_length(data_len, prefix_len, cursor);
+				cursor += data_len;
+				break;
+#ifndef DRIZZLED
+			case HA_KEYTYPE_NUM:
+				/* Numeric key, with a 1 byte length if space packed: */
+				if (seg->flag & HA_SPACE_PACK) {
+					data_len = *cursor++;
+					cursor += data_len;
+				}
+				else
+					cursor += seg->length;
+				break;
+			case HA_KEYTYPE_INT8:
+			case HA_KEYTYPE_SHORT_INT:
+			case HA_KEYTYPE_USHORT_INT:
+			case HA_KEYTYPE_INT24:
+			case HA_KEYTYPE_FLOAT:
+			case HA_KEYTYPE_BIT:
+#endif
+			case HA_KEYTYPE_LONG_INT:
+			case HA_KEYTYPE_ULONG_INT:
+			case HA_KEYTYPE_UINT24:
+			case HA_KEYTYPE_DOUBLE:
+			case HA_KEYTYPE_LONGLONG:
+			case HA_KEYTYPE_ULONGLONG:
+				/* Fixed length types: */
+				cursor += seg->length;
+				break;
+			case HA_KEYTYPE_END:
+				goto done;
+		}
+	}
+
+	done:
+	total = (xtWord1 *) cursor - key_buf;
+	if (total > XT_INDEX_MAX_KEY_SIZE)
+		ind->mi_key_corrupted = TRUE;
+	return total;
+}
+
+/* Derived from ha_key_cmp */
+/*
+ * Compare the key 'key_value' (called a below) with the key 'b_value'
+ * (called b below), segment by segment, as described by ind->mi_seg.
+ *
+ * key_length is the number of bytes of a that remain to be compared.
+ * The loop ends when it reaches zero (or less), or when all
+ * ind->mi_seg_count segments have been handled.
+ *
+ * Returns 0 if all compared segments are equal. Otherwise the result
+ * of the first segment that differs is returned. For all types other
+ * than HA_KEYTYPE_NUM the sign is inverted if the segment has
+ * HA_REVERSE_SORT set (HA_KEYTYPE_NUM swaps the operands instead).
+ * Segments with HA_NO_SORT set are stepped over without comparing.
+ *
+ * search_flags (SEARCH_PREFIX, SEARCH_FIND, SEARCH_UPDATE,
+ * SEARCH_NULL_ARE_NOT_EQUAL) influence the handling of NULLs and
+ * of trailing spaces and prefixes in text/binary segments.
+ */
+xtPublic int myxt_compare_key(XTIndexPtr ind, int search_flags, uint key_length, xtWord1 *key_value, xtWord1 *b_value)
+{
+	register XTIndexSegRec	*keyseg = ind->mi_seg;
+	int						flag;
+	register uchar			*a = (uchar *) key_value;
+	uint					a_length;
+	register uchar			*b = (uchar *) b_value;
+	uint					b_length;
+	uint					next_key_length;
+	uchar					*end;
+	uint					piks;
+	uint					pack_len;
+
+	for (uint i=0; i < ind->mi_seg_count && (int) key_length > 0; key_length = next_key_length, keyseg++, i++) {
+		/* piks is zero if this segment must not be compared: */
+		piks = !(keyseg->flag & HA_NO_SORT);
+
+		/* Handle NULL part */
+		if (keyseg->null_bit) {
+			/* 1 is not null, 0 is null */
+			int b_not_null = (int) *b++;
+
+			key_length--;
+			if ((int) *a != b_not_null && piks)
+			{
+				flag = (int) *a - b_not_null;
+				return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+			}
+			if (!*a++) {		
+				/* If key was NULL */
+				if (search_flags == (SEARCH_FIND | SEARCH_UPDATE))
+					search_flags = SEARCH_SAME;								 /* Allow duplicate keys */
+				else if (search_flags & SEARCH_NULL_ARE_NOT_EQUAL)
+				{
+					/*
+					 * This is only used from mi_check() to calculate cardinality.
+					 * It can't be used when searching for a key as this would cause
+					 * compare of (a,b) and (b,a) to return the same value.
+					 */
+					return -1;
+				}
+				/* PMC - I don't know why I had next_key_length = key_length - keyseg->length;
+				 * This was my comment: even when null we have the complete length
+				 *
+				 * The truth is, a NULL only takes up one byte in the key, and this has already
+				 * been subtracted.
+				 */
+				next_key_length = key_length;
+				continue;															 /* To next key part */
+			}
+		}
+		
+		/* Both components are not null... */
+		/* Default for fixed length segments: end is the end of this
+		 * segment in a. Length prefixed segments recalculate
+		 * next_key_length below.
+		 */
+		if (keyseg->length < key_length) {
+			end = a + keyseg->length;
+			next_key_length = key_length - keyseg->length;
+		}
+		else {
+			end = a + key_length;
+			next_key_length = 0;
+		}
+
+		switch ((enum ha_base_keytype) keyseg->type) {
+			case HA_KEYTYPE_TEXT:											 /* Ascii; Key is converted */
+				if (keyseg->flag & HA_SPACE_PACK) {
+					get_key_pack_length(a_length, pack_len, a);
+					next_key_length = key_length - a_length - pack_len;
+					get_key_pack_length(b_length, pack_len, b);
+
+					if (piks && (flag = my_compare_text(keyseg->charset, a, a_length, b, b_length,
+									(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0),
+									(my_bool)!(search_flags & SEARCH_PREFIX))))
+						return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+					a += a_length;
+				}
+				else {
+					a_length = (uint) (end - a);
+					b_length = keyseg->length;
+					if (piks && (flag = my_compare_text(keyseg->charset, a, a_length, b, b_length,
+									(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0),
+									(my_bool)!(search_flags & SEARCH_PREFIX))))
+						return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+					a = end;
+				}
+				b += b_length;
+				break;
+			case HA_KEYTYPE_BINARY:
+				if (keyseg->flag & HA_SPACE_PACK) {
+					get_key_pack_length(a_length, pack_len, a);
+					next_key_length = key_length - a_length - pack_len;
+					get_key_pack_length(b_length, pack_len, b);
+
+					if (piks && (flag = my_compare_bin(a, a_length, b, b_length,
+								(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0), 1)))
+						return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				}
+				else {
+					a_length = keyseg->length;
+					b_length = keyseg->length;
+					if (piks && (flag = my_compare_bin(a, a_length, b, b_length,
+									(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0), 0)))
+						return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				}
+				a += a_length;
+				b += b_length;
+				break;
+			case HA_KEYTYPE_VARTEXT1:
+			case HA_KEYTYPE_VARTEXT2:
+			{
+				get_key_pack_length(a_length, pack_len, a);
+				next_key_length = key_length - a_length - pack_len;
+				get_key_pack_length(b_length, pack_len, b);
+
+				if (piks && (flag = my_compare_text(keyseg->charset, a, a_length, b, b_length,
+								(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0),
+								(my_bool) ((search_flags & (SEARCH_FIND | SEARCH_UPDATE)) == SEARCH_FIND))))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a += a_length;
+				b += b_length;
+				break;
+			}
+			case HA_KEYTYPE_VARBINARY1:
+			case HA_KEYTYPE_VARBINARY2:
+			{				
+				get_key_pack_length(a_length, pack_len, a);
+				next_key_length = key_length - a_length - pack_len;
+				get_key_pack_length(b_length, pack_len, b);
+
+				if (piks && (flag=my_compare_bin(a, a_length, b, b_length,
+						(my_bool) ((search_flags & SEARCH_PREFIX) && next_key_length <= 0), 0)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a += a_length;
+				b += b_length;
+				break;
+			}
+#ifndef DRIZZLED
+			case HA_KEYTYPE_INT8:
+			{
+				int i_1 = (int) *((signed char *) a);
+				int i_2 = (int) *((signed char *) b);
+				if (piks && (flag = CMP_NUM(i_1,i_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+			case HA_KEYTYPE_SHORT_INT: {
+				int16 s_1 = sint2korr(a);
+				int16 s_2 = sint2korr(b);
+				if (piks && (flag = CMP_NUM(s_1, s_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+			case HA_KEYTYPE_USHORT_INT: {
+				/* Read with the signed macro, but the values are
+				 * stored in, and compared as, unsigned 16-bit:
+				 */
+				uint16 us_1= sint2korr(a);
+				uint16 us_2= sint2korr(b);
+				if (piks && (flag = CMP_NUM(us_1, us_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a =	end;
+				b += keyseg->length;
+				break;
+			}
+#endif
+			case HA_KEYTYPE_LONG_INT: {
+				int32 l_1 = sint4korr(a);
+				int32 l_2 = sint4korr(b);
+				if (piks && (flag = CMP_NUM(l_1, l_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+			case HA_KEYTYPE_ULONG_INT: {
+				/* Read with the signed macro, but the values are
+				 * stored in, and compared as, unsigned 32-bit:
+				 */
+				uint32 u_1 = sint4korr(a);
+				uint32 u_2 = sint4korr(b);
+				if (piks && (flag = CMP_NUM(u_1, u_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#ifndef DRIZZLED
+			case HA_KEYTYPE_INT24: {
+				int32 l_1 = sint3korr(a);
+				int32 l_2 = sint3korr(b);
+				if (piks && (flag = CMP_NUM(l_1, l_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#endif
+			case HA_KEYTYPE_UINT24: {
+				int32 l_1 = uint3korr(a);
+				int32 l_2 = uint3korr(b);
+				if (piks && (flag = CMP_NUM(l_1, l_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#ifndef DRIZZLED
+			case HA_KEYTYPE_FLOAT: {
+				float f_1, f_2;
+
+				float4get(f_1, a);
+				float4get(f_2, b);
+				/*
+				 * The following may give a compiler warning about floating point
+				 * comparison not being safe, but this is ok in this context as
+				 * we are basically doing sorting
+				 */
+				if (piks && (flag = CMP_NUM(f_1, f_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#endif
+			case HA_KEYTYPE_DOUBLE: {
+				double d_1, d_2;
+
+				float8get(d_1, a);
+				float8get(d_2, b);
+				/*
+				 * The following may give a compiler warning about floating point
+				 * comparison not being safe, but this is ok in this context as
+				 * we are basically doing sorting
+				 */
+				if (piks && (flag = CMP_NUM(d_1, d_2)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#ifndef DRIZZLED
+			case HA_KEYTYPE_NUM: {
+				/* Numeric key */
+				/* The number is stored as text: optional leading spaces,
+				 * an optional sign and then digits. With HA_SPACE_PACK
+				 * the text is preceded by a 1 byte length.
+				 */
+				if (keyseg->flag & HA_SPACE_PACK) {
+					a_length = *a++;
+					end = a + a_length;
+					next_key_length = key_length - a_length - 1;
+					b_length = *b++;
+				}
+				else {
+					a_length = (int) (end - a);
+					b_length = keyseg->length;
+				}
+
+				/* remove pre space from keys */
+				for ( ; a_length && *a == ' ' ; a++, a_length--) ;
+				for ( ; b_length && *b == ' ' ; b++, b_length--) ;
+
+				/* Reverse sort is done by swapping the operands: */
+				if (keyseg->flag & HA_REVERSE_SORT) {
+					swap_variables(uchar *, a, b);
+					swap_variables(uint, a_length, b_length);
+				}
+				
+				if (piks) {
+					if (*a == '-') {
+						if (*b != '-')
+							return -1;
+						/* Both negative: skip the sign and swap the
+						 * operands to invert the result.
+						 */
+						a++; b++;
+						swap_variables(uchar *, a, b);
+						swap_variables(uint, a_length, b_length);
+						a_length--; b_length--;
+					}
+					else if (*b == '-')
+						return 1;
+					/* Skip leading '+' and '0' characters: */
+					while (a_length && (*a == '+' || *a == '0')) {
+						a++; a_length--;
+					}
+					
+					while (b_length && (*b == '+' || *b == '0')) {
+						b++; b_length--;
+					}
+				
+					/* More remaining digits means a larger magnitude: */
+					if (a_length != b_length)
+						return (a_length < b_length) ? -1 : 1;
+					while (b_length) {
+						if (*a++ !=	*b++)
+							return ((int) a[-1] - (int) b[-1]);
+						b_length--;
+					}
+				}
+				/* NOTE(review): a and b are swapped above (for reverse
+				 * sort and for negative values) but are never swapped
+				 * back. a is reset from end here, but after an odd
+				 * number of swaps b appears to point into the other
+				 * key's buffer - confirm for multi-segment keys.
+				 */
+				a = end;
+				b += b_length;
+				break;
+			}
+#endif
+#ifdef HAVE_LONG_LONG
+			case HA_KEYTYPE_LONGLONG: {
+				longlong ll_a = sint8korr(a);
+				longlong ll_b = sint8korr(b);
+				if (piks && (flag = CMP_NUM(ll_a,ll_b)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+			case HA_KEYTYPE_ULONGLONG: {					
+				ulonglong ll_a = uint8korr(a);
+				ulonglong ll_b = uint8korr(b);
+				if (piks && (flag = CMP_NUM(ll_a,ll_b)))
+					return ((keyseg->flag & HA_REVERSE_SORT) ? -flag : flag);
+				a = end;
+				b += keyseg->length;
+				break;
+			}
+#endif
+#ifndef DRIZZLED
+			case HA_KEYTYPE_BIT:
+				/* TODO: What here? */
+				/* NOTE(review): nothing is compared here, and neither
+				 * a nor b is advanced, although next_key_length has
+				 * been reduced by the segment length above. Segments
+				 * that follow a BIT segment would presumably be read
+				 * from the wrong offset - confirm.
+				 */
+				break;
+#endif
+			case HA_KEYTYPE_END:												/* Ready */
+				goto end;
+		}
+	}
+
+	end:
+	return 0;
+}
+
+/*
+ * Return the number of bytes used by the index segment 'keyseg' in the
+ * key 'key_value', where the segment begins at 'key_offset'.
+ *
+ * The result includes the null indicator byte of a nullable segment
+ * and the length prefix of a variable length segment. A NULL value
+ * only uses the null indicator byte.
+ */
+xtPublic u_int myxt_key_seg_length(XTIndexSegRec *keyseg, u_int key_offset, xtWord1 *key_value)
+{
+	xtWord1	*seg_ptr = (xtWord1 *) key_value + key_offset;
+	u_int	null_bytes = 0;
+	u_int	data_len;
+	u_int	prefix_len;
+
+	if (keyseg->null_bit) {
+		null_bytes = 1;
+		/* The indicator byte is zero if the value is NULL: */
+		if (!*seg_ptr++)
+			return null_bytes;
+	}
+
+	switch ((enum ha_base_keytype) keyseg->type) {
+		case HA_KEYTYPE_TEXT:
+		case HA_KEYTYPE_BINARY:
+			/* Only length prefixed if space packed: */
+			if (keyseg->flag & HA_SPACE_PACK) {
+				get_key_pack_length(data_len, prefix_len, seg_ptr);
+				return null_bytes + data_len + prefix_len;
+			}
+			break;
+		case HA_KEYTYPE_VARTEXT1:
+		case HA_KEYTYPE_VARTEXT2:
+		case HA_KEYTYPE_VARBINARY1:
+		case HA_KEYTYPE_VARBINARY2: {
+			/* Always length prefixed: */
+			get_key_pack_length(data_len, prefix_len, seg_ptr);
+			return null_bytes + data_len + prefix_len;
+		}
+#ifndef DRIZZLED
+		case HA_KEYTYPE_NUM:
+			/* Numeric key, with a 1 byte length if space packed: */
+			if (keyseg->flag & HA_SPACE_PACK) {
+				data_len = *seg_ptr;
+				return null_bytes + data_len + 1;
+			}
+			break;
+#endif
+		default:
+			/* All other types (including BIT, for which nothing
+			 * special is done) have the fixed segment length.
+			 */
+			break;
+	}
+
+	return null_bytes + keyseg->length;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Load and store rows
+ */
+
+/*
+ * Calculate the number of bytes that the MySQL-format row 'rec_buff'
+ * requires in the PBXT variable length format.
+ *
+ * Per column: a NULL value uses 1 byte, any other value uses its data
+ * length plus a length prefix of 1 byte (up to 240 bytes of data),
+ * 3 bytes (up to 0xFFFF), 4 bytes (up to 0xFFFFFF) or 5 bytes.
+ */
+xtPublic xtWord4 myxt_store_row_length(XTOpenTablePtr ot, char *rec_buff)
+{
+	TABLE	*table = ot->ot_table->tab_dic.dic_my_table;
+	xtWord4	total = 0;
+
+	for (Field **col = table->field; *col; col++) {
+		xtWord4 data_len;
+		xtWord4 prefix_len;
+
+		if ((*col)->is_null_in_record((const uchar *) rec_buff)) {
+			/* Just the NULL marker: */
+			total += 1;
+			continue;
+		}
+
+		/* Only the length is of interest here: */
+		(void) mx_get_length_and_data(*col, rec_buff, &data_len);
+		if (data_len > 0xFFFFFF)
+			prefix_len = 5;
+		else if (data_len > 0xFFFF)
+			prefix_len = 4;
+		else if (data_len > 240)
+			prefix_len = 3;
+		else
+			prefix_len = 1;
+
+		total += prefix_len + data_len;
+	}
+	return total;
+}
+
+/*
+ * Convert the MySQL-format row 'rec_buff' to the PBXT variable length
+ * format, appending the result to ot->ot_row_wbuffer starting at
+ * offset 'row_size'. The buffer is grown as required.
+ *
+ * Each column is written as a marker byte, followed by the data:
+ *   255         NULL value, no data
+ *   0..240      the marker is the data length
+ *   254/253/252 followed by a 2/3/4 byte data length
+ *
+ * Returns the offset after the last byte written, or 0 if the
+ * buffer could not be reallocated.
+ */
+xtPublic xtWord4 myxt_store_row_data(XTOpenTablePtr ot, xtWord4 row_size, char *rec_buff)
+{
+	TABLE	*table = ot->ot_table->tab_dic.dic_my_table;
+
+	for (Field **col = table->field; *col; col++) {
+		char	*value = NULL;
+		xtWord4	value_len = 0;
+		xtWord4	prefix_len = 1;
+		xtWord4	needed;
+
+		if (!(*col)->is_null_in_record((const uchar *) rec_buff)) {
+			value = mx_get_length_and_data(*col, rec_buff, &value_len);
+			if (!value_len) {
+				/* Empty, but not null (blobs may return NULL, when
+				 * the length is 0). Any valid pointer will do.
+				 */
+				value = rec_buff;
+			}
+			else if (value_len > 0xFFFFFF)
+				prefix_len = 5;
+			else if (value_len > 0xFFFF)
+				prefix_len = 4;
+			else if (value_len > 240)
+				prefix_len = 3;
+		}
+
+		/* Make sure the column fits into the write buffer: */
+		needed = row_size + prefix_len + value_len;
+		if (needed > ot->ot_row_wbuf_size) {
+			if (!xt_realloc_ns((void **) &ot->ot_row_wbuffer, needed))
+				return 0;
+			ot->ot_row_wbuf_size = needed;
+		}
+
+		if (!value)
+			ot->ot_row_wbuffer[row_size] = 255;
+		else {
+			switch (prefix_len) {
+				case 1:
+					ot->ot_row_wbuffer[row_size] = (unsigned char) value_len;
+					break;
+				case 3:
+					ot->ot_row_wbuffer[row_size] = 254;
+					XT_SET_DISK_2(&ot->ot_row_wbuffer[row_size+1], value_len);
+					break;
+				case 4:
+					ot->ot_row_wbuffer[row_size] = 253;
+					XT_SET_DISK_3(&ot->ot_row_wbuffer[row_size+1], value_len);
+					break;
+				default:
+					ot->ot_row_wbuffer[row_size] = 252;
+					XT_SET_DISK_4(&ot->ot_row_wbuffer[row_size+1], value_len);
+					break;
+			}
+			memcpy(&ot->ot_row_wbuffer[row_size+prefix_len], value, value_len);
+		}
+
+		row_size = needed;
+	}
+	return row_size;
+}
+
+/*
+ * Count the number and the total size of the columns that are
+ * completely contained in the first 'buffer_size' bytes of
+ * 'source_buf' (a row in the PBXT variable length format).
+ *
+ * If ret_col_cnt is given, it holds the maximum number of columns to
+ * check on entry, and the number of whole columns found on return.
+ * Otherwise up to dic_no_of_cols columns are checked.
+ *
+ * Returns the size in bytes of the whole columns found.
+ */
+xtPublic size_t myxt_load_row_length(XTOpenTablePtr ot, size_t buffer_size, xtWord1 *source_buf, u_int *ret_col_cnt)
+{
+	u_int	max_cols = ot->ot_table->tab_dic.dic_no_of_cols;
+	size_t	used = 0;
+	u_int	col_no;
+
+	if (ret_col_cnt)
+		max_cols = *ret_col_cnt;
+
+	for (col_no = 0; col_no < max_cols; col_no++) {
+		xtWord4	head;		/* Size of the marker and length bytes */
+		xtWord4	data_len;
+
+		/* The marker byte must be in the buffer: */
+		if (used + 1 > buffer_size)
+			goto done;
+
+		if (*source_buf == 255) {
+			/* NULL value, no data */
+			head = 1;
+			data_len = 0;
+		}
+		else if (*source_buf == 254) {
+			/* 2 bytes length */
+			head = 3;
+			if (used + head > buffer_size)
+				goto done;
+			data_len = XT_GET_DISK_2(source_buf + 1);
+		}
+		else if (*source_buf == 253) {
+			/* 3 bytes length */
+			head = 4;
+			if (used + head > buffer_size)
+				goto done;
+			data_len = XT_GET_DISK_3(source_buf + 1);
+		}
+		else if (*source_buf == 252) {
+			/* 4 bytes length */
+			head = 5;
+			if (used + head > buffer_size)
+				goto done;
+			data_len = XT_GET_DISK_4(source_buf + 1);
+		}
+		else {
+			/* The marker is the length */
+			head = 1;
+			data_len = *source_buf;
+		}
+
+		/* The data must be completely in the buffer: */
+		if (used + head + data_len > buffer_size)
+			goto done;
+		used += head + data_len;
+		source_buf += head + data_len;
+	}
+
+	done:
+	if (ret_col_cnt)
+		*ret_col_cnt = col_no;
+	return used;
+}
+
+/*
+ * Convert a row from the PBXT variable length format ('source_buf')
+ * to the MySQL row format ('dest_buff').
+ *
+ * If col_cnt is not zero, then at most col_cnt columns are converted.
+ *
+ * Returns the number of bytes of 'source_buf' consumed. Zero is
+ * returned (and an error registered) if the table has no dictionary,
+ * or if an invalid marker byte (241 to 251) is found.
+ */
+xtPublic xtWord4 myxt_load_row_data(XTOpenTablePtr ot, xtWord1 *source_buf, xtWord1 *dest_buff, u_int col_cnt)
+{
+	xtWord1	*cursor = source_buf;
+	TABLE	*table = ot->ot_table->tab_dic.dic_my_table;
+	xtWord4	data_len;
+
+	if (!table) {
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_NO_DICTIONARY, ot->ot_table->tab_name);
+		return 0;
+	}
+
+	/* According to the InnoDB implementation:
+	 * "MySQL assumes that all columns
+	 * have the SQL NULL bit set unless it
+	 * is a nullable column with a non-NULL value".
+	 */
+	memset(dest_buff, 0xFF, table->s->null_bytes);
+
+	for (u_int col_no = 0; table->field[col_no] && (!col_cnt || col_no < col_cnt); col_no++) {
+		Field	*column = table->field[col_no];
+		xtWord1	marker = *cursor;
+
+		if (marker == 255) {
+			/* NULL value, no data follows */
+			cursor++;
+			mx_set_length_and_data(column, (char *) dest_buff, 0, NULL);
+			continue;
+		}
+
+		if (marker == 254) {
+			/* 2 bytes length */
+			data_len = XT_GET_DISK_2(cursor + 1);
+			cursor += 3;
+		}
+		else if (marker == 253) {
+			/* 3 bytes length */
+			data_len = XT_GET_DISK_3(cursor + 1);
+			cursor += 4;
+		}
+		else if (marker == 252) {
+			/* 4 bytes length */
+			data_len = XT_GET_DISK_4(cursor + 1);
+			cursor += 5;
+		}
+		else {
+			/* The marker is the length, which may be 240 at most */
+			if (marker > 240) {
+				xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_RECORD_FORMAT);
+				return 0;
+			}
+			data_len = marker;
+			cursor++;
+		}
+
+		mx_set_length_and_data(column, (char *) dest_buff, data_len, (char *) cursor);
+		cursor += data_len;
+	}
+	return (xtWord4) (cursor - source_buf);
+}
+
+xtPublic xtBool myxt_load_row(XTOpenTablePtr ot, xtWord1 *source_buf, xtWord1 *dest_buff, u_int col_cnt)
+{	/* Boolean wrapper: myxt_load_row_data() returns 0 on its error paths. */
+	return myxt_load_row_data(ot, source_buf, dest_buff, col_cnt) ? TRUE : FALSE;
+}
+
+xtPublic xtBool myxt_find_column(XTOpenTablePtr ot, u_int *col_idx, const char *col_name)
+{	/* Case-insensitive search for column col_name: on a match its index is stored in *col_idx and OK is returned. */
+	TABLE	*table = ot->ot_table->tab_dic.dic_my_table;
+	Field	**field;
+	if (!table) {	/* No MySQL dictionary: report it as myxt_load_row_data() does, instead of crashing. */
+		xt_register_taberr(XT_REG_CONTEXT, XT_ERR_NO_DICTIONARY, ot->ot_table->tab_name);
+		return FAILED;
+	}
+	for (field=table->field; *field && my_strcasecmp(system_charset_info, (*field)->field_name, col_name); field++) ;
+	if (*field)	/* Stopped on a matching column, not on the terminating NULL entry. */
+		*col_idx = (u_int) (field - table->field);
+	return *field ? OK : FAILED;
+}
+
+xtPublic void myxt_get_column_name(XTOpenTablePtr ot, u_int col_idx, u_int len, char *col_name)
+{
+	/* Copy the name of column col_idx to col_name. len is handed to
+	 * xt_strcpy() as its size argument.
+	 */
+	Field	**columns = ot->ot_table->tab_dic.dic_my_table->field;
+	xt_strcpy(len, col_name, columns[col_idx]->field_name);
+}
+
+xtPublic void myxt_get_column_as_string(XTOpenTablePtr ot, char *buffer, u_int col_idx, u_int len, char *value)
+{	/* Write column col_idx of the MySQL format row in buffer to value as text ("NULL" if the column is NULL). */
+	XTTableHPtr	tab = ot->ot_table;
+	XTThreadPtr self = ot->ot_thread;
+	TABLE		*table = tab->tab_dic.dic_my_table;
+	Field		*field = table->field[col_idx];
+	char		buf_val[MAX_FIELD_WIDTH];
+	String		val(buf_val, sizeof(buf_val), &my_charset_bin);
+
+	if (mx_is_null_in_record(field, buffer))
+		xt_strcpy(len, value, "NULL");
+	else {
+		byte	*save;
+
+		/* Required by store() - or an assertion will fail: */
+		if (table->read_set)
+			MX_BIT_SET(table->read_set, col_idx);
+		/* field->ptr is redirected under tab_dic_field_lock, so the value to restore must be read under the lock too: */
+		xt_lock_mutex(self, &tab->tab_dic_field_lock);
+		pushr_(xt_unlock_mutex, &tab->tab_dic_field_lock);
+		save = field->ptr;
+#if MYSQL_VERSION_ID < 50114
+		field->ptr = (byte *) buffer + field->offset();
+#else
+		field->ptr = (byte *) buffer + field->offset(field->table->record[0]);
+#endif
+		field->val_str(&val);
+		field->ptr = save;					// Restore org row pointer
+		freer_(); // xt_unlock_mutex(&tab->tab_dic_field_lock)
+		xt_strcpy(len, value, val.c_ptr());
+	}
+}
+
+xtPublic xtBool myxt_set_column(XTOpenTablePtr ot, char *buffer, u_int col_idx, const char *value, u_int len)
+{	/* Store the len bytes at value into column col_idx of the MySQL format row in buffer; FAILED if store() reports an error. */
+	XTTableHPtr	tab = ot->ot_table;
+	XTThreadPtr self = ot->ot_thread;
+	TABLE		*table = tab->tab_dic.dic_my_table;
+	Field		*field = table->field[col_idx];
+	byte		*save;
+	int			error;
+
+	/* Required by store() - or an assertion will fail: */
+	if (table->write_set)
+		MX_BIT_SET(table->write_set, col_idx);
+
+	mx_set_notnull_in_record(field, buffer);
+	/* field->ptr is redirected under tab_dic_field_lock, so the value to restore must be read under the lock too: */
+	xt_lock_mutex(self, &tab->tab_dic_field_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_dic_field_lock);
+	save = field->ptr;
+#if MYSQL_VERSION_ID < 50114
+	field->ptr = (byte *) buffer + field->offset();
+#else
+	field->ptr = (byte *) buffer + field->offset(field->table->record[0]);
+#endif
+	error = field->store(value, len, &my_charset_utf8_general_ci);
+	field->ptr = save;					// Restore org row pointer
+	freer_(); // xt_unlock_mutex(&tab->tab_dic_field_lock)
+	return error ? FAILED : OK;
+}
+
+xtPublic void myxt_get_column_data(XTOpenTablePtr ot, char *buffer, u_int col_idx, char **value, size_t *len)
+{
+	/* Return, in *value and *len, the data pointer and length that
+	 * mx_get_length_and_data() reports for column col_idx of the row in buffer.
+	 */
+	Field	*column = ot->ot_table->tab_dic.dic_my_table->field[col_idx];
+	xtWord4	data_len;
+
+	*value = mx_get_length_and_data(column, buffer, &data_len);
+	*len = data_len;
+}
+
+xtPublic xtBool myxt_store_row(XTOpenTablePtr ot, XTTabRecInfoPtr rec_info, char *rec_buff)
+{	/* Set up rec_info, based on the write buffer of ot, for the MySQL format row in rec_buff. */
+	xtWord4 data_size;
+
+	if (ot->ot_rec_fixed) {
+		/* Fixed length record: the MySQL row is copied into rf_data as it is. */
+		rec_info->ri_ext_rec = NULL;
+		rec_info->ri_rec_buf_size = ot->ot_rec_size;
+		rec_info->ri_fix_rec_buf = (XTTabRecFixDPtr) ot->ot_row_wbuffer;
+		rec_info->ri_fix_rec_buf->tr_rec_type_1 = XT_TAB_STATUS_FIXED;
+		memcpy(rec_info->ri_fix_rec_buf->rf_data, rec_buff, ot->ot_rec_size - XT_REC_FIX_HEADER_SIZE);
+		return OK;
+	}
+
+	if (!(data_size = myxt_store_row_data(ot, XT_REC_EXT_HEADER_SIZE, rec_buff)))
+		return FAILED;
+
+	if (data_size - XT_REC_FIX_EXT_HEADER_DIFF > ot->ot_rec_size) {
+		/* Extended record: ri_log_data_size is the part of the row beyond ot_rec_size. */
+		rec_info->ri_ext_rec = (XTTabRecExtDPtr) ot->ot_row_wbuffer;
+		rec_info->ri_fix_rec_buf = (XTTabRecFixDPtr) ot->ot_row_wbuffer;
+		rec_info->ri_rec_buf_size = ot->ot_rec_size;
+		rec_info->ri_log_data_size = data_size - ot->ot_rec_size;
+		rec_info->ri_log_buf = (XTactExtRecEntryDPtr) &ot->ot_row_wbuffer[ot->ot_rec_size - offsetof(XTactExtRecEntryDRec, er_data)];
+		rec_info->ri_ext_rec->tr_rec_type_1 = XT_TAB_STATUS_EXT_DLOG;
+		XT_SET_DISK_4(rec_info->ri_ext_rec->re_log_dat_siz_4, rec_info->ri_log_data_size);
+		return OK;
+	}
+	/* Variable length record that fits within ot_rec_size: */
+	rec_info->ri_ext_rec = NULL;
+	rec_info->ri_rec_buf_size = data_size - XT_REC_FIX_EXT_HEADER_DIFF;
+	rec_info->ri_fix_rec_buf = (XTTabRecFixDPtr) &ot->ot_row_wbuffer[XT_REC_FIX_EXT_HEADER_DIFF];
+	rec_info->ri_fix_rec_buf->tr_rec_type_1 = XT_TAB_STATUS_VARIABLE;
+	return OK;
+}
+
+static void mx_print_string(uchar *s, uint count)
+{
+	/* Print the first count bytes of s in double quotes, without trailing spaces. */
+	uint	used = count;
+
+	while (used > 0 && s[used - 1] == ' ')
+		used--;
+	printf("\"");
+	for (uint pos = 0; pos < used; pos++)
+		printf("%c", s[pos]);
+	printf("\"");
+}
+
+xtPublic void myxt_print_key(XTIndexPtr ind, xtWord1 *key_value)
+{	/* Print the segments of the key in key_value with printf(), separated by single spaces. */
+	register XTIndexSegRec	*keyseg = ind->mi_seg;
+	register uchar			*b = (uchar *) key_value;
+	uint					b_length;
+	uint					pack_len;
+
+	for (u_int i = 0; i < ind->mi_seg_count; i++, keyseg++) {
+		if (i!=0)
+			printf(" ");
+		if (keyseg->null_bit) {
+			if (!*b++) {	/* Nullable segments start with a marker byte, zero means NULL. */
+				printf("NULL");
+				continue;
+			}
+		}
+		switch ((enum ha_base_keytype) keyseg->type) {
+			case HA_KEYTYPE_TEXT:											 /* Ascii; Key is converted */
+				if (keyseg->flag & HA_SPACE_PACK) {
+					get_key_pack_length(b_length, pack_len, b);	/* NOTE(review): presumably also moves b past the length prefix - confirm */
+				}
+				else
+					b_length = keyseg->length;
+				mx_print_string(b, b_length);
+				b += b_length;
+				break;
+			case HA_KEYTYPE_LONG_INT: {
+				int32 l_2 = sint4korr(b);
+				b += keyseg->length;
+				printf("%ld", (long) l_2);
+				break;
+			}
+			case HA_KEYTYPE_ULONG_INT: {
+				xtWord4 u_2 = sint4korr(b);
+				b += keyseg->length;
+				printf("%lu", (u_long) u_2);
+				break;
+			}
+			default:	/* Other key types are not printed, and b is not advanced past them. */
+				break;
+		}
+	}
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MySQL Data Dictionary
+ */
+
+#define TS(x)					(x)->s	/* Shorthand for the s member of x, e.g. TS(table_arg)->fields */
+
+static void my_close_table(TABLE *table)
+{	/* Close a TABLE created by my_open_table() and free the memory block that holds it. */
+#ifdef DRIZZLED
+	TABLE_SHARE	*share;
+
+	share = (TABLE_SHARE *) ((char *) table + sizeof(TABLE));	/* my_open_table() places the share directly behind the TABLE. */
+	share->free_table_share();
+#else
+	closefrm(table, 1);  // TODO: Q, why did Stewart remove this?
+#endif
+	xt_free_ns(table);
+}
+
+/*
+ * Open the MySQL definition of the table at tab_path. Returns NULL if the
+ * table cannot be opened because this is not a MySQL thread.
+ */ 
+static TABLE *my_open_table(XTThreadPtr self, XTDatabaseHPtr XT_UNUSED(db), XTPathStrPtr tab_path)
+{
+	THD			*thd = current_thd;
+	char		path_buffer[PATH_MAX];
+	char		*table_name;
+	char		database_name[XT_IDENTIFIER_NAME_SIZE];
+	char		*ptr;
+	size_t		size;
+	char		*buffer, *path, *db_name, *name;
+	TABLE_SHARE	*share;
+	int			error;
+	TABLE		*table;
+
+	/* If we have no MySQL thread, then we cannot open this table!
+	 * What this means is the thread is probably the sweeper or the
+	 * compactor.
+	 */
+	if (!thd)
+		return NULL;
+
+	/* GOTCHA: Check if the table name is a partition,
+	 * if so we need to remove the partition
+	 * extension, in order for this to work!
+	 *
+	 * Reason: the parts of a partition table do not
+	 * have .frm files!!
+	 */
+	xt_strcpy(PATH_MAX, path_buffer, tab_path->ps_path);
+	table_name = xt_last_name_of_path(path_buffer);
+	if ((ptr = strstr(table_name, "#P#")))
+		*ptr = 0;
+
+	xt_2nd_last_name_of_path(XT_IDENTIFIER_NAME_SIZE, database_name, path_buffer);
+	/* One memory block holds the TABLE, the TABLE_SHARE and copies of the 3 strings: */
+	size = sizeof(TABLE) + sizeof(TABLE_SHARE) + 
+		strlen(path_buffer) + 1 +
+		strlen(database_name) + 1 + strlen(table_name) + 1;
+	if (!(buffer = (char *) xt_malloc(self, size)))
+		return NULL;
+	table = (TABLE *) buffer;
+	buffer += sizeof(TABLE);
+	share = (TABLE_SHARE *) buffer;
+	buffer += sizeof(TABLE_SHARE);
+
+	path = buffer;
+	strcpy(path, path_buffer);
+	buffer += strlen(path_buffer) + 1;
+	db_name = buffer;
+	strcpy(db_name, database_name);
+	buffer += strlen(database_name) + 1;
+	name = buffer;
+	strcpy(name, table_name);
+
+	/* Required to call 'open_table_from_share'! */
+	LEX *old_lex, new_lex;
+
+	old_lex = thd->lex;	/* Put back on every exit path below. */
+	thd->lex = &new_lex;
+	new_lex.current_select= NULL;
+	lex_start(thd);
+
+#ifdef DRIZZLED
+	share->init(db_name, 0, name, path);
+	if ((error = open_table_def(*thd, share)) ||
+		(error = open_table_from_share(thd, share, "", 0, (uint32_t) READ_ALL, 0, table, OTM_OPEN)))
+	{
+		xt_free(self, table);
+		lex_end(&new_lex);
+		thd->lex = old_lex;
+		xt_throw_sulxterr(XT_CONTEXT, XT_ERR_LOADING_MYSQL_DIC, tab_path->ps_path, (u_long) error);
+		return NULL;
+	}
+#else
+#if MYSQL_VERSION_ID < 60000
+#if MYSQL_VERSION_ID < 50123
+	init_tmp_table_share(share, db_name, 0, name, path);
+#else
+	init_tmp_table_share(thd, share, db_name, 0, name, path);
+#endif
+#else
+#if MYSQL_VERSION_ID < 60004
+	init_tmp_table_share(share, db_name, 0, name, path);
+#else
+	init_tmp_table_share(thd, share, db_name, 0, name, path);
+#endif
+#endif
+
+	/* If MySQL shuts down while we are just starting up, then
+	 * they kill the plugin sub-system before calling
+	 * shutdown for the engine!
+	 */
+	if (!ha_resolve_by_legacy_type(thd, DB_TYPE_PBXT)) {
+		xt_free(self, table);
+		lex_end(&new_lex);
+		thd->lex = old_lex;
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_MYSQL_SHUTDOWN);
+		return NULL;
+	}
+
+	if ((error = open_table_def(thd, share, 0))) {
+		xt_free(self, table);
+		lex_end(&new_lex);
+		thd->lex = old_lex;
+		xt_throw_sulxterr(XT_CONTEXT, XT_ERR_LOADING_MYSQL_DIC, tab_path->ps_path, (u_long) error);
+		return NULL;
+	}
+
+#if MYSQL_VERSION_ID >= 50404
+	if ((error = open_table_from_share(thd, share, "", 0, (uint) READ_ALL, 0, table, OTM_OPEN)))
+#else
+	if ((error = open_table_from_share(thd, share, "", 0, (uint) READ_ALL, 0, table, FALSE)))
+#endif
+	{
+		xt_free(self, table);	/* NOTE(review): the share filled in by open_table_def() is not explicitly released here - confirm nothing leaks */
+		lex_end(&new_lex);
+		thd->lex = old_lex;
+		xt_throw_sulxterr(XT_CONTEXT, XT_ERR_LOADING_MYSQL_DIC, tab_path->ps_path, (u_long) error);
+		return NULL;
+	}
+#endif
+
+	lex_end(&new_lex);
+	thd->lex = old_lex;
+
+	/* GOTCHA: I am the plug-in!!! Therefore, I should not hold 
+	 * a reference to myself. By holding this reference I prevent
+	 * plugin_shutdown() and reap_plugins() in sql_plugin.cc
+	 * from doing their job on shutdown!
+	 */
+#ifndef DRIZZLED
+	plugin_unlock(NULL, table->s->db_plugin);
+	table->s->db_plugin = NULL;
+#endif
+	return table;
+}
+
+/*
+static bool my_match_index(XTDDIndex *ind, KEY *index)
+{
+	KEY_PART_INFO	*key_part;
+	KEY_PART_INFO	*key_part_end;
+	u_int			j;
+	XTDDColumnRef	*cref;
+
+	if (index->key_parts != ind->co_cols.size())
+		return false;
+
+	j=0;
+	key_part_end = index->key_part + index->key_parts;
+	for (key_part = index->key_part; key_part != key_part_end; key_part++, j++) {
+		if (!(cref = ind->co_cols.itemAt(j)))
+			return false;
+		if (myxt_strcasecmp(cref->cr_col_name, (char *) key_part->field->field_name) != 0)
+			return false;
+	}
+
+	if (ind->co_type == XT_DD_KEY_PRIMARY) {
+		if (!(index->flags & HA_NOSAME))
+			return false;
+	}
+	else {
+		if (ind->co_type == XT_DD_INDEX_UNIQUE) {
+			if (!(index->flags & HA_NOSAME))
+				return false;
+		}
+		if (ind->co_ind_name) {
+			if (myxt_strcasecmp(ind->co_ind_name, index->name) != 0)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+static XTDDIndex *my_find_index(XTDDTable *dd_tab, KEY *index)
+{
+	XTDDIndex *ind;
+
+	for (u_int i=0; i<dd_tab->dt_indexes.size(); i++)
+	{
+		ind = dd_tab->dt_indexes.itemAt(i);
+		if (my_match_index(ind, index))
+			return ind;
+	}
+	return NULL;
+}
+*/
+
+static void my_deref_index_data(struct XTThread *self, XTIndexPtr mi)
+{	/* Free the locks, the column bitmap and the memory of the index structure mi. */
+	enter_();
+	/* The dirty list of cache pages should be empty here! */
+	/* This is not the case if we were not able to flush data, e.g. when running out of disk space. */
+	//ASSERT(!mi->mi_dirty_list);
+	ASSERT(!mi->mi_free_list);
+
+	xt_spinlock_free(self, &mi->mi_dirty_lock);
+	XT_INDEX_FREE_LOCK(self, mi);
+	myxt_bitmap_free(self, &mi->mi_col_map);
+	if (mi->mi_free_list)	/* Expected to be NULL (see the ASSERT above), but freed if it is not. */
+		xt_free(self, mi->mi_free_list);
+
+	xt_free(self, mi);
+	exit_();
+}
+
+static xtBool my_is_not_null_int4(XTIndexSegPtr seg)
+{	/* True only for a HA_KEYTYPE_LONG_INT segment that does not have the HA_NULL_PART flag. */
+	return (!(seg->flag & HA_NULL_PART) && seg->type == HA_KEYTYPE_LONG_INT);
+}
+
+/* Offset of member y in type x, computed from address 8: the MY_BITMAP definition
+ * in Drizzle does not like a NULL pointer being used to calculate the offset!?
+ */
+#define MX_OFFSETOF(x, y)		((size_t)(&((x *) 8)->y) - 8)
+
+/* Build the index structure for key number idx of table_arg from the MySQL KEY index. Derived from ha_myisam::create and mi_create */
+static XTIndexPtr my_create_index(XTThreadPtr self, TABLE *table_arg, u_int idx, KEY *index)
+{
+	XTIndexPtr				ind;
+	KEY_PART_INFO			*key_part;
+	KEY_PART_INFO			*key_part_end;
+	XTIndexSegRec			*seg;
+	Field					*field;
+	enum ha_base_keytype	type;
+	uint					options = 0;	/* Never changed below, so the HA_OPTION_PACK_KEYS test is always false. */
+	u_int					key_length = 0;
+	xtBool					partial_field;
+
+	enter_();
+
+	pushsr_(ind, my_deref_index_data, (XTIndexPtr) xt_calloc(self, MX_OFFSETOF(XTIndexRec, mi_seg) + sizeof(XTIndexSegRec) * index->key_parts));
+
+	XT_INDEX_INIT_LOCK(self, ind);
+	xt_spinlock_init_with_autoname(self, &ind->mi_dirty_lock);
+	ind->mi_index_no = idx;
+	ind->mi_flags = (index->flags & (HA_NOSAME | HA_NULL_ARE_EQUAL | HA_UNIQUE_CHECK));
+	//ind->mi_low_byte_first = TS(table_arg)->db_low_byte_first;
+	ind->mi_key_corrupted = FALSE;
+	ind->mi_fix_key = TRUE;
+	ind->mi_select_total = 0;
+	ind->mi_subset_of = 0;
+	myxt_bitmap_init(self, &ind->mi_col_map, TS(table_arg)->fields);
+	
+	ind->mi_seg_count = (uint) index->key_parts;
+	key_part_end = index->key_part + index->key_parts;
+	seg = ind->mi_seg;
+	for (key_part = index->key_part; key_part != key_part_end; key_part++, seg++) {
+		partial_field = FALSE;
+		field = key_part->field;
+
+		type = field->key_type();
+		seg->flag = key_part->key_part_flag;
+
+		if (options & HA_OPTION_PACK_KEYS ||
+			(index->flags & (HA_PACK_KEY | HA_BINARY_PACK_KEY | HA_SPACE_PACK_USED)))
+		{
+			if (key_part->length > 8 && (type == HA_KEYTYPE_TEXT || 
+#ifndef DRIZZLED
+				type == HA_KEYTYPE_NUM ||
+#endif
+				(type == HA_KEYTYPE_BINARY && !field->zero_pack())))
+			{
+				/* No blobs here */
+				if (key_part == index->key_part)
+					ind->mi_flags |= HA_PACK_KEY;
+#ifndef DRIZZLED
+				if (!(field->flags & ZEROFILL_FLAG) &&
+					(field->type() == MYSQL_TYPE_STRING ||
+					field->type() == MYSQL_TYPE_VAR_STRING ||
+					((int) (key_part->length - field->decimals())) >= 4))
+	    			seg->flag |= HA_SPACE_PACK;
+#endif
+			}
+		}
+
+		seg->col_idx = field->field_index;
+		seg->is_recs_in_range = 1;
+		seg->is_selectivity = 1;
+		seg->type = (int) type;
+		seg->start = key_part->offset;
+		seg->length = key_part->length;
+		seg->bit_start = seg->bit_end = 0;
+		seg->bit_length = seg->bit_pos = 0;
+		seg->charset = field->charset();
+
+		if (field->null_ptr) {
+			key_length++;
+			seg->flag |= HA_NULL_PART;
+			seg->null_bit = field->null_bit;
+			seg->null_pos = (uint) (field->null_ptr - (uchar*) table_arg->record[0]);
+		}
+		else {
+			seg->null_bit = 0;
+			seg->null_pos = 0;
+		}
+
+		if (field->real_type() == MYSQL_TYPE_ENUM
+#ifndef DRIZZLED
+			|| field->real_type() == MYSQL_TYPE_SET
+#endif
+			) {
+			/* These values are not indexed as strings!!
+			 * The index will not be built correctly if this value is non-NULL.
+			 */
+			seg->charset = NULL;
+		}
+
+		if (field->type() == MYSQL_TYPE_BLOB
+#ifndef DRIZZLED
+			|| field->type() == MYSQL_TYPE_GEOMETRY
+#endif
+			) {
+			seg->flag |= HA_BLOB_PART;
+			/* save number of bytes used to pack length */
+			seg->bit_start = (uint) (field->pack_length() - TS(table_arg)->blob_ptr_size);
+		}
+#ifndef DRIZZLED
+		else if (field->type() == MYSQL_TYPE_BIT) {
+			seg->bit_length = ((Field_bit *) field)->bit_len;
+			seg->bit_start = ((Field_bit *) field)->bit_ofs;
+			seg->bit_pos = (uint) (((Field_bit *) field)->bit_ptr - (uchar*) table_arg->record[0]);
+		}
+#else
+		/* Drizzle uses HA_KEYTYPE_ULONG_INT keys for enums > 1 byte, which is not consistent with MySQL, so we fix it here  */
+		else if (field->type() == MYSQL_TYPE_ENUM) {
+			switch (seg->length) {
+				case 2: 
+#ifdef DRIZZLED
+					ASSERT_NS(FALSE);	/* NOTE(review): no break in this branch, so control continues into case 3 - confirm intended */
+#else
+					seg->type = HA_KEYTYPE_USHORT_INT;
+					break;
+#endif
+				case 3:
+					seg->type = HA_KEYTYPE_UINT24;
+					break;
+			}
+		}
+#endif
+
+		switch (seg->type) {
+			case HA_KEYTYPE_VARTEXT1:
+			case HA_KEYTYPE_VARTEXT2:
+			case HA_KEYTYPE_VARBINARY1:
+			case HA_KEYTYPE_VARBINARY2:
+				if (!(seg->flag & HA_BLOB_PART)) {
+					/* Make a flag that this is a VARCHAR */
+					seg->flag |= HA_VAR_LENGTH_PART;
+					/* Store in bit_start number of bytes used to pack the length */
+					seg->bit_start = ((seg->type == HA_KEYTYPE_VARTEXT1 || seg->type == HA_KEYTYPE_VARBINARY1) ? 1 : 2);
+				}
+				break;
+		}
+
+		/* All packed fields start with a length (1 or 3 bytes): */
+		if (seg->flag & (HA_VAR_LENGTH_PART | HA_BLOB_PART | HA_SPACE_PACK)) {
+			key_length++;				/* At least one length byte */
+			if (seg->length >= 255)	/* prefix may be 3 bytes */
+	    		key_length +=2;
+		}
+
+		key_length += seg->length;
+		if (seg->length > 40)
+			ind->mi_fix_key = FALSE;
+
+		/* Determine if only part of the field is in the key:
+		 * This is important for index coverage!
+		 * Note, BLOB fields are never retrieved from
+		 * an index!
+		 */
+		if (field->type() == MYSQL_TYPE_BLOB)
+			partial_field = TRUE;
+		else if (field->real_type() == MYSQL_TYPE_VARCHAR		// For varbinary type
+#ifndef DRIZZLED
+			|| field->real_type() == MYSQL_TYPE_VAR_STRING		// For varbinary type
+			|| field->real_type() == MYSQL_TYPE_STRING			// For binary type
+#endif
+			)
+		{
+			Field	*tab_field = table_arg->field[key_part->fieldnr-1];
+			u_int	field_len = tab_field->key_length();
+
+			if (key_part->length != field_len)
+				partial_field = TRUE;
+		}
+
+		/* NOTE: do not set if the field is only partially in the index!!! */
+		if (!partial_field)
+			MX_BIT_FAST_TEST_AND_SET(&ind->mi_col_map, field->field_index);
+	}
+
+	if (key_length > XT_INDEX_MAX_KEY_SIZE)
+		xt_throw_sulxterr(XT_CONTEXT, XT_ERR_KEY_TOO_LARGE, index->name, (u_long) XT_INDEX_MAX_KEY_SIZE);
+
+	/* This is the maximum size of the index on disk: */
+	ind->mi_key_size = key_length;
+	ind->mi_max_items = (XT_INDEX_PAGE_SIZE-2) / (key_length+XT_RECORD_REF_SIZE);
+
+	if (ind->mi_fix_key) {
+		/* Special case for not-NULL 4 byte int value: */
+		switch (ind->mi_seg_count) {
+			case 1:
+				ind->mi_single_type = ind->mi_seg[0].type;
+				if (ind->mi_seg[0].type == HA_KEYTYPE_LONG_INT ||
+					ind->mi_seg[0].type == HA_KEYTYPE_ULONG_INT) {
+					if (!(ind->mi_seg[0].flag & HA_NULL_PART))
+						ind->mi_scan_branch = xt_scan_branch_single;
+				}
+				break;
+			case 2:
+				if (my_is_not_null_int4(&ind->mi_seg[0]) &&
+					my_is_not_null_int4(&ind->mi_seg[1])) {
+					ind->mi_scan_branch = xt_scan_branch_fix_simple;
+					ind->mi_simple_comp_key = xt_compare_2_int4;
+				}
+				break;
+			case 3:
+				if (my_is_not_null_int4(&ind->mi_seg[0]) &&
+					my_is_not_null_int4(&ind->mi_seg[1]) &&
+					my_is_not_null_int4(&ind->mi_seg[2])) {
+					ind->mi_scan_branch = xt_scan_branch_fix_simple;
+					ind->mi_simple_comp_key = xt_compare_3_int4;
+				}
+				break;
+		}
+		if (!ind->mi_scan_branch)	/* No special case applied (the structure was allocated with xt_calloc). */
+			ind->mi_scan_branch = xt_scan_branch_fix;
+		ind->mi_prev_item = xt_prev_branch_item_fix;
+		ind->mi_last_item = xt_last_branch_item_fix;
+	}
+	else {
+		ind->mi_scan_branch = xt_scan_branch_var;
+		ind->mi_prev_item = xt_prev_branch_item_var;
+		ind->mi_last_item = xt_last_branch_item_var;
+	}
+	ind->mi_lazy_delete = ind->mi_fix_key && ind->mi_max_items >= 4;
+
+	XT_NODE_ID(ind->mi_root) = 0;
+
+	popr_(); // Discard my_deref_index_data(ind)
+
+	return_(ind);
+}
+
+/* We estimate the size of BLOBs depending on the number
+ * of BLOBs in the table.
+ */
+static u_int mx_blob_field_size_total[] = {	/* Upper limit for the average size of the Nth large BLOB column */
+	500,	// 1
+	400,	// 2
+	350,	// 3
+	320,	// 4
+	300,	// 5
+	280,	// 6
+	260,	// 7
+	240,	// 8
+	220,	// 9
+	210		// 10
+};
+
+static u_int mxvarchar_field_min_ave[] = {	/* Lower limit for the average size of the Nth VARCHAR column */
+	120,	// 1
+	105,	// 2
+	90,		// 3
+	65,		// 4
+	50,		// 5
+	40,		// 6
+	40,		// 7
+	40,		// 8
+	40,		// 9
+	40		// 10
+};
+
+xtPublic void myxt_setup_dictionary(XTThreadPtr self, XTDictionaryPtr dic)
+{	/* Derive the row size estimates and the record layout settings of dic from dic->dic_my_table. */
+	TABLE	*my_tab = dic->dic_my_table;
+	u_int	field_count;
+	u_int	var_field_count = 0;
+	u_int	varchar_field_count = 0;
+	u_int	blob_field_count = 0;
+	u_int	large_blob_field_count = 0;
+	xtWord8 min_data_size = 0;
+	xtWord8 max_data_size = 0;
+	xtWord8 ave_data_size = 0;
+	xtWord8 min_row_size = 0;
+	xtWord8 max_row_size = 0;
+	xtWord8 ave_row_size = 0;
+	xtWord8 min_ave_row_size = 0;
+	xtWord8 max_ave_row_size = 0;
+	u_int	dic_rec_size;
+	xtBool	dic_rec_fixed;
+	Field	*curr_field;
+	Field	**field;
+
+	/* How many columns are required for all indexes. */
+	KEY				*index;
+	KEY_PART_INFO	*key_part;
+	KEY_PART_INFO	*key_part_end;
+
+#ifndef XT_USE_LAZY_DELETE
+	dic->dic_no_lazy_delete = TRUE;
+#endif
+
+	dic->dic_ind_cols_req = 0;
+	for (uint i=0; i<TS(my_tab)->keys; i++) {
+		index = &my_tab->key_info[i];
+
+		key_part_end = index->key_part + index->key_parts;
+		for (key_part = index->key_part; key_part != key_part_end; key_part++) {
+			curr_field = key_part->field;
+
+			if ((u_int) curr_field->field_index+1 > dic->dic_ind_cols_req)
+				dic->dic_ind_cols_req = curr_field->field_index+1;
+		}
+	}
+
+	/* We will work out how many columns are required for all blobs: */
+	dic->dic_blob_cols_req = 0;	
+	field_count = 0;
+ 	for (field=my_tab->field; (curr_field = *field); field++) {
+ 		field_count++;
+ 		min_data_size = curr_field->key_length();
+ 		max_data_size = curr_field->key_length();
+		enum_field_types tno = curr_field->type();
+
+		min_ave_row_size = 40;
+		max_ave_row_size = 128;
+ 		if (tno == MYSQL_TYPE_BLOB) {
+			blob_field_count++;
+			min_data_size = 0;
+			max_data_size = ((Field_blob *) curr_field)->max_data_length();
+			/* Set the average length higher for BLOBs: */
+			if (max_data_size == 0xFFFF ||
+				max_data_size == 0xFFFFFF) {
+				if (large_blob_field_count < 10)
+					max_ave_row_size = mx_blob_field_size_total[large_blob_field_count];
+				else
+					max_ave_row_size = 200;
+				large_blob_field_count++;
+			}
+			else if (max_data_size == 0xFFFFFFFF) {
+				/* Scale the estimated size of the blob depending on how many BLOBs
+				 * are in the table!
+				 */
+				if (large_blob_field_count < 10)
+					max_ave_row_size = mx_blob_field_size_total[large_blob_field_count];
+				else
+					max_ave_row_size = 200;
+				large_blob_field_count++;
+				if ((u_int) curr_field->field_index+1 > dic->dic_blob_cols_req)
+					dic->dic_blob_cols_req = curr_field->field_index+1;
+				dic->dic_blob_count++;
+				xt_realloc(self, (void **) &dic->dic_blob_cols, sizeof(Field *) * dic->dic_blob_count);
+				dic->dic_blob_cols[dic->dic_blob_count-1] = curr_field;
+			}
+		}
+		else if (tno == MYSQL_TYPE_VARCHAR
+#ifndef DRIZZLED
+			|| tno == MYSQL_TYPE_VAR_STRING
+#endif
+			) {
+			/* GOTCHA: MYSQL_TYPE_VAR_STRING does not exist as MYSQL_TYPE_VARCHAR define, but
+			 * is used when creating a table with
+			 * VARCHAR()
+			 */
+			min_data_size = 0;
+			if (varchar_field_count < 10)
+				min_ave_row_size = mxvarchar_field_min_ave[varchar_field_count];
+			else
+				min_ave_row_size = 40;
+			varchar_field_count++;
+		}
+
+ 		if (max_data_size == min_data_size)
+ 			ave_data_size = max_data_size;
+ 		else {
+ 			var_field_count++;
+			/* Take the average as 25% of the maximum: */
+ 			ave_data_size = max_data_size / 4;
+
+			/* Set the average based on min and max parameters: */
+ 			if (ave_data_size < min_ave_row_size)
+ 				ave_data_size = min_ave_row_size;
+ 			else if (ave_data_size > max_ave_row_size)
+ 				ave_data_size = max_ave_row_size;
+
+ 			if (ave_data_size > max_data_size)
+ 				ave_data_size = max_data_size;
+		}
+
+		/* Add space for the length indicators: */
+		if (min_data_size <= 240)
+			min_row_size += 1 + min_data_size;
+		else if (min_data_size <= 0xFFFF)
+			min_row_size += 3 + min_data_size;
+		else if (min_data_size <= 0xFFFFFF)
+			min_row_size += 4 + min_data_size;
+		else
+			min_row_size += 5 + min_data_size;
+
+		if (max_data_size <= 240)
+			max_row_size += 1 + max_data_size;
+		else if (max_data_size <= 0xFFFF)
+			max_row_size += 3 + max_data_size;
+		else if (max_data_size <= 0xFFFFFF)
+			max_row_size += 4 + max_data_size;
+		else
+			max_row_size += 5 + max_data_size;
+
+		if (ave_data_size <= 240)
+			ave_row_size += 1 + ave_data_size;
+		else /* Should not be more than this! */
+			ave_row_size += 3 + ave_data_size;
+
+		/* This is the length of the record required for all indexes: */
+		/* This was calculated incorrectly. Not a serious bug because it
+		 * is only used in the case of fixed length row, and in this
+		 * case the dic_ind_rec_len is set correctly below.
+		 */
+		if (field_count == dic->dic_ind_cols_req)
+			dic->dic_ind_rec_len = max_row_size;
+ 	}
+
+	dic->dic_min_row_size = min_row_size;
+	dic->dic_max_row_size = max_row_size;
+	dic->dic_ave_row_size = ave_row_size;
+	dic->dic_no_of_cols = field_count;
+
+	if (dic->dic_def_ave_row_size) {
+		/* The average row size has been set: */
+		dic_rec_size = offsetof(XTTabRecFix, rf_data) + TS(my_tab)->reclength;
+
+		/* The conditions for a fixed record are: */
+		if (dic->dic_def_ave_row_size >= (xtWord8) TS(my_tab)->reclength &&
+			dic_rec_size <= XT_TAB_MAX_FIX_REC_LENGTH &&
+			!blob_field_count) {
+			dic_rec_fixed = TRUE;
+		}
+		else {
+			xtWord8 new_rec_size;
+
+			dic_rec_fixed = FALSE;
+			if (dic->dic_def_ave_row_size > max_row_size)
+				new_rec_size = offsetof(XTTabRecFix, rf_data) + max_row_size;
+			else
+				new_rec_size = offsetof(XTTabRecFix, rf_data) + dic->dic_def_ave_row_size;
+
+			/* The maximum record size 64K for explicit AVG_ROW_LENGTH! */
+			if (new_rec_size > XT_TAB_MAX_FIX_REC_LENGTH_SPEC)
+				new_rec_size = XT_TAB_MAX_FIX_REC_LENGTH_SPEC;
+
+			dic_rec_size = (u_int) new_rec_size;
+		}
+	}
+	else {
+		/* If the average size is within 20% of the maximum size, then
+		 * we handle these rows as fixed size rows.
+		 * Fixed size rows use the internal MySQL format.
+		 */
+		dic_rec_size = offsetof(XTTabRecFix, rf_data) + TS(my_tab)->reclength;
+		/* Fixed length records must be less than 16K in size,
+		 * have an average size which is very close (20%) to the maximum size or
+		 * be less than a minimum size,
+		 * and not contain any BLOBs:
+		 */
+		if (dic_rec_size <= XT_TAB_MAX_FIX_REC_LENGTH &&
+			(ave_row_size + ave_row_size / 4 >= max_row_size ||
+			dic_rec_size < XT_TAB_MIN_VAR_REC_LENGTH) &&
+			!blob_field_count) {
+			dic_rec_fixed = TRUE;
+		}
+		else {
+			dic_rec_fixed = FALSE;
+			/* Note I add offsetof(XTTabRecFix, rf_data) instead of
+			 * offsetof(XTTabRecExt, re_data) here!
+			 * The reason is that, we want to include the average size
+			 * record in the fixed data part. To do this we only need to
+			 * calculate a fixed header size, because in the cases in which
+			 * it fits, we will only be using a fixed header!
+			 */
+			dic_rec_size = (u_int) (offsetof(XTTabRecFix, rf_data) + ave_row_size);
+			/* The maximum record size (16K for autorow sizing)! */
+			if (dic_rec_size > XT_TAB_MAX_FIX_REC_LENGTH)
+				dic_rec_size = XT_TAB_MAX_FIX_REC_LENGTH;
+		}
+	}
+
+	/* Ensure that handle data record size is big enough to
+	 * include the extended record reference, in the case of
+	 * variable length rows
+	 */
+	if (!dic_rec_fixed) {
+		if (dic_rec_size < offsetof(XTTabRecExtDRec, re_data))
+			dic_rec_size = offsetof(XTTabRecExtDRec, re_data);
+	}
+#ifdef DEBUG
+	else {
+		ASSERT_NS(dic_rec_size > offsetof(XTTabRecFix, rf_data));
+	}
+#endif
+
+	if (!dic->dic_rec_size) {
+		dic->dic_rec_size = dic_rec_size;
+		dic->dic_rec_fixed = dic_rec_fixed;
+	}
+	else {
+		/* This just confirms that our original calculation on
+		 * create table agrees with the current calculation.
+		 * (i.e. if non-zero values were loaded from the table).
+		 *
+		 * It may be the criteria for calculating the data record size
+		 * and whether to used a fixed or variable record has changed,
+		 * but we need to stick to the current physical layout of the
+		 * table.
+		 *
+		 * Note that this can occur in rename table when the
+		 * method of calculation has changed.
+		 *
+		 * On rename, the format of the table does not change, so we
+		 * will not take the calculated values.
+		 */
+		//ASSERT(dic->dic_rec_size == dic_rec_size);
+		//ASSERT(dic->dic_rec_fixed == dic_rec_fixed);
+	}
+
+	if (dic_rec_fixed) {
+		/* Recalculate the length of the record required to address all
+		 * index columns!
+		 */		 
+		if (field_count == dic->dic_ind_cols_req)
+			dic->dic_ind_rec_len = TS(my_tab)->reclength;
+		else {
+			field=my_tab->field;
+			
+			curr_field = field[dic->dic_ind_cols_req];
+#if MYSQL_VERSION_ID < 50114
+			dic->dic_ind_rec_len = curr_field->offset();
+#else
+			dic->dic_ind_rec_len = curr_field->offset(curr_field->table->record[0]);
+#endif
+		}
+	}
+
+	/* We now calculate how many of the first columns in the row
+	 * will definitely fit into the buffer, when the record is
+	 * of type extended.
+	 *
+	 * In this way we can figure out if we need to load the extended
+	 * record at all.
+	 */
+	dic->dic_fix_col_count = 0;
+	if (!dic_rec_fixed) {
+		xtWord8 max_rec_size = offsetof(XTTabRecExt, re_data);
+
+		for (Field **f=my_tab->field; (curr_field = *f); f++) {
+			max_data_size = curr_field->key_length();
+			enum_field_types tno = curr_field->type();
+			if (tno == MYSQL_TYPE_BLOB)
+				max_data_size = ((Field_blob *) curr_field)->max_data_length();
+			if (max_data_size <= 240)
+				max_rec_size += 1 + max_data_size;
+			else if (max_data_size <= 0xFFFF)
+				max_rec_size += 3 + max_data_size;
+			else if (max_data_size <= 0xFFFFFF)
+				max_rec_size += 4 + max_data_size;
+			else
+				max_rec_size += 5 + max_data_size;
+			if (max_rec_size > (xtWord8) dic_rec_size)
+				break;
+			dic->dic_fix_col_count++;
+		}		
+		ASSERT(dic->dic_fix_col_count < dic->dic_no_of_cols);
+	}
+
+ 	dic->dic_key_count = TS(my_tab)->keys;
+	dic->dic_mysql_buf_size = TS(my_tab)->rec_buff_length;
+	dic->dic_mysql_rec_size = TS(my_tab)->reclength;
+}
+
+static u_int my_get_best_superset(XTThreadPtr XT_UNUSED(self), XTDictionaryPtr dic, XTIndexPtr ind)
+{	/* Returns 1 + the position in dic_keys of the best superset of ind, or 0 if there is none. */
+	u_int	best = 0;
+	u_int	best_seg_count = ind->mi_seg_count;
+
+	for (u_int k=0; k<dic->dic_key_count; k++) {
+		XTIndexPtr	cand = dic->dic_keys[k];
+		u_int		matched = 0;
+		/* A superset is another index, with more segments than the best so far ... */
+		if (cand->mi_index_no == ind->mi_index_no || cand->mi_seg_count <= best_seg_count)
+			continue;
+		/* ... whose leading segments are on the same columns as all the segments of ind: */
+		while (matched < ind->mi_seg_count && ind->mi_seg[matched].col_idx == cand->mi_seg[matched].col_idx)
+			matched++;
+		if (matched == ind->mi_seg_count) {
+			best_seg_count = cand->mi_seg_count;
+			best = k+1;
+		}
+	}
+	return best;
+}
+
+/* Open the MySQL table at tab_path and fill in dic, including one index
+ * structure per key. Return FAILED if the MySQL dictionary is not available.
+ */
+xtPublic xtBool myxt_load_dictionary(XTThreadPtr self, XTDictionaryPtr dic, XTDatabaseHPtr db, XTPathStrPtr tab_path)
+{
+	TABLE *my_tab;
+
+	if (!(my_tab = my_open_table(self, db, tab_path)))
+		return FAILED;
+	dic->dic_my_table = my_tab;
+#ifdef DRIZZLED
+	dic->dic_def_ave_row_size = (xtWord8) my_tab->s->getAvgRowLength();
+#else
+	dic->dic_def_ave_row_size = (xtWord8) my_tab->s->avg_row_length;
+#endif
+	myxt_setup_dictionary(self, dic);	/* Also sets dic->dic_key_count, which the second loop below relies on. */
+	dic->dic_keys = (XTIndexPtr *) xt_calloc(self, sizeof(XTIndexPtr) * TS(my_tab)->keys);
+	for (uint i=0; i<TS(my_tab)->keys; i++)
+		dic->dic_keys[i] = my_create_index(self, my_tab, i, &my_tab->key_info[i]);
+
+	/* Check if any key is a subset of another: */
+	for (u_int i=0; i<dic->dic_key_count; i++)
+		dic->dic_keys[i]->mi_subset_of = my_get_best_superset(self, dic, dic->dic_keys[i]);	/* 0 = none, else position in dic_keys + 1 */
+
+	return OK;
+}
+
+/* Release what the dictionary holds (dic_table reference, MySQL table, BLOB column
+ * array, index array), setting each released pointer to NULL. */
+xtPublic void myxt_free_dictionary(XTThreadPtr self, XTDictionaryPtr dic)
+{
+	if (dic->dic_table != NULL) {
+		dic->dic_table->release(self);
+		dic->dic_table = NULL;
+	}
+	if (dic->dic_my_table != NULL) {
+		my_close_table(dic->dic_my_table);
+		dic->dic_my_table = NULL;
+	}
+	if (dic->dic_blob_cols != NULL) {
+		xt_free(self, dic->dic_blob_cols);
+		dic->dic_blob_cols = NULL;
+	}
+	dic->dic_blob_count = 0;
+
+	/* The index data exists only if a table was opened; it is freed with the dictionary: */
+	if (dic->dic_keys == NULL)
+		return;
+	for (uint n=0; n<dic->dic_key_count; n++) {
+		if (dic->dic_keys[n] != NULL)
+			my_deref_index_data(self, (XTIndexPtr) dic->dic_keys[n]);
+	}
+	xt_free(self, dic->dic_keys);
+	dic->dic_key_count = 0;
+	dic->dic_keys = NULL;
+}
+
+/* Move the MySQL-derived dictionary state from source_dic to dic. The MySQL table, the BLOB
+ * column array and the key array are handed over: the source pointers are set to NULL. */
+xtPublic void myxt_move_dictionary(XTDictionaryPtr dic, XTDictionaryPtr source_dic)
+{
+	dic->dic_my_table = source_dic->dic_my_table;
+	source_dic->dic_my_table = NULL;
+
+	if (!dic->dic_rec_size) {
+		dic->dic_rec_size = source_dic->dic_rec_size;
+		dic->dic_rec_fixed = source_dic->dic_rec_fixed;
+	}
+	else {
+		/* This just confirms that our original calculation on create table
+		 * agrees with the current calculation (i.e. if non-zero values were
+		 * loaded from the table).
+		 *
+		 * The criteria for calculating the data record size, and whether to use
+		 * a fixed or variable record, may have changed, but we need to stick to
+		 * the current physical layout of the table. */
+		ASSERT_NS(dic->dic_rec_size == source_dic->dic_rec_size);
+		ASSERT_NS(dic->dic_rec_fixed == source_dic->dic_rec_fixed);
+	}
+
+	dic->dic_tab_flags = source_dic->dic_tab_flags;
+	dic->dic_blob_cols_req = source_dic->dic_blob_cols_req;
+	dic->dic_blob_count = source_dic->dic_blob_count;
+	dic->dic_blob_cols = source_dic->dic_blob_cols;
+	source_dic->dic_blob_cols = NULL;
+
+	dic->dic_mysql_buf_size = source_dic->dic_mysql_buf_size;
+	dic->dic_mysql_rec_size = source_dic->dic_mysql_rec_size;
+ 	dic->dic_key_count = source_dic->dic_key_count;
+	dic->dic_keys = source_dic->dic_keys;
+
+	/* Set this to zero, because later xt_flush_tables() may be called.
+	 * This can occur when using the BLOB streaming engine,
+	 * in command ALTER TABLE x ENGINE = PBXT;
+	 */
+	source_dic->dic_key_count = 0;
+	source_dic->dic_keys = NULL;
+
+	dic->dic_min_row_size = source_dic->dic_min_row_size;
+	dic->dic_max_row_size = source_dic->dic_max_row_size;
+	dic->dic_ave_row_size = source_dic->dic_ave_row_size;
+	dic->dic_def_ave_row_size = source_dic->dic_def_ave_row_size;
+
+	dic->dic_no_of_cols = source_dic->dic_no_of_cols;
+ 	dic->dic_fix_col_count = source_dic->dic_fix_col_count;
+ 	dic->dic_ind_cols_req = source_dic->dic_ind_cols_req;
+ 	dic->dic_ind_rec_len = source_dic->dic_ind_rec_len;
+}
+
+static void my_free_dd_table(XTThreadPtr self, XTDDTable *dd_tab)
+{	/* Release dd_tab (no-op for NULL); pushed with pushr_() by myxt_create_table_from_table(). */
+	if (dd_tab)
+		dd_tab->release(self);
+}
+
+/* Fill the data dictionary index 'ind' from a MySQL key: set the index type, store a
+ * copy of the key name and append one column reference, named after the field, for
+ * each key part. */
+static void ha_create_dd_index(XTThreadPtr self, XTDDIndex *ind, KEY *key)
+{
+	if (strcmp(key->name, "PRIMARY") == 0) {
+		/* The primary key stores its name in co_name: */
+		ind->co_type = XT_DD_KEY_PRIMARY;
+		ind->co_name = xt_dup_string(self, key->name);
+	}
+	else {
+		/* Other indexes use co_ind_name, and are unique if HA_NOSAME is set: */
+		ind->co_type = (key->flags & HA_NOSAME) ? XT_DD_INDEX_UNIQUE : XT_DD_INDEX;
+		ind->co_ind_name = xt_dup_string(self, key->name);
+	}
+
+	for (u_int part = 0; part < key->key_parts; part++) {
+		KEY_PART_INFO	*info = &key->key_part[part];
+		XTDDColumnRef	*col_ref = new XTDDColumnRef();
+
+		if (!col_ref)
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		col_ref->init(self);
+		ind->co_cols.append(self, col_ref);
+		col_ref->cr_col_name = xt_dup_string(self, (char *) info->field->field_name);
+	}
+}
+
+/* Return a newly allocated (xt_dup_string) string with the SQL type of the field,
+ * followed by " CHARACTER SET <name>" for fields that have a charset, and by
+ * " COLLATE <name>" when that collation is not the primary one of its charset. */
+static char *my_type_to_string(XTThreadPtr self, Field *field, TABLE *XT_UNUSED(my_tab))
+{
+	char		buffer[MAX_FIELD_WIDTH + 400];
+	const char 	*ptr;
+	String		type((char *) buffer, sizeof(buffer), system_charset_info);
+	xtWord4		len;
+
+	/* GOTCHA:
+	 * - Above sets the string length to the same as the buffer,
+	 *   so we must set the length to zero.
+	 * - The result is not necessarily zero terminated.
+	 * - We cannot assume that the input buffer is the one
+	 *   we get back (for example text field).
+	 */
+	type.length(0);
+	field->sql_type(type);
+	ptr = type.ptr();
+	len = type.length();
+
+	if (len >= sizeof(buffer))
+		len = sizeof(buffer)-1;
+	/* Copy exactly len bytes: a string copy would read past an unterminated result. */
+	if (ptr != buffer)
+		memcpy(buffer, ptr, len);
+	buffer[len] = 0;
+
+	if (field->has_charset()) {
+		/* Always include the charset so that we can compare types for FK/PK relations. */
+		xt_strcat(sizeof(buffer), buffer, " CHARACTER SET ");
+		xt_strcat(sizeof(buffer), buffer, (char *) field->charset()->csname);
+
+		/* For string types dump collation name only if
+		 * collation is not primary for the given charset
+		 */
+		if (!(field->charset()->state & MY_CS_PRIMARY)) {
+			xt_strcat(sizeof(buffer), buffer, " COLLATE ");
+			xt_strcat(sizeof(buffer), buffer, (char *) field->charset()->name);
+		}
+	}
+
+	return xt_dup_string(self, buffer); // type.length()
+}
+
+/* Build a new XTDDTable from a MySQL table: one column per MySQL field and one index
+ * per MySQL key. While it is being filled the table is registered with pushr_() and
+ * my_free_dd_table(); the caller receives the finished table. */
+xtPublic XTDDTable *myxt_create_table_from_table(XTThreadPtr self, TABLE *my_tab)
+{
+	XTDDTable	*dd_tab = new XTDDTable();
+
+	if (!dd_tab)
+		xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+	dd_tab->init(self);
+	pushr_(my_free_dd_table, dd_tab);
+
+	for (Field **fld_ptr = my_tab->field; *fld_ptr; fld_ptr++)
+		dd_tab->dt_cols.append(self, XTDDColumnFactory::createFromMySQLField(self, my_tab, *fld_ptr));
+
+	for (uint key_no = 0; key_no < TS(my_tab)->keys; key_no++) {
+		XTDDIndex *dd_ind = new XTDDIndex(XT_DD_UNKNOWN);
+
+		if (!dd_ind)
+			xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+		dd_tab->dt_indexes.append(self, dd_ind);
+		dd_ind->co_table = dd_tab;
+		dd_ind->in_index = key_no;
+		ha_create_dd_index(self, dd_ind, &my_tab->key_info[key_no]);
+	}
+
+	popr_(); // my_free_dd_table(dd_tab)
+	return dd_tab;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MySQL CHARACTER UTILITIES
+ */
+
+/* Copy the identifier 'from' to 'to' (to_len is passed on as the destination length), converting it from
+ * charset cs to my_charset_utf8_general_ci unless cs is already that charset or my_charset_bin. Drizzle: plain copy. */
+xtPublic void myxt_static_convert_identifier(XTThreadPtr XT_UNUSED(self), MX_CHARSET_INFO *cs, char *from, char *to, size_t to_len)
+{
+#ifdef DRIZZLED
+	((void *)cs);
+	 xt_strcpy(to_len, to, from);
+#else
+	uint errors;
+
+	/* Bug#4417: Check that identifiers and strings are not converted
+	 * when the client character set is binary.
+	 */
+	if (cs == &my_charset_utf8_general_ci || cs == &my_charset_bin)
+		xt_strcpy(to_len, to, from);
+	else
+		strconvert(cs, from, &my_charset_utf8_general_ci, to, to_len, &errors);
+#endif
+}
+
+// Return a newly allocated copy of 'from', converted from cs (cs == current_thd->charset()) to my_charset_utf8_general_ci.
+xtPublic char *myxt_convert_identifier(XTThreadPtr self, MX_CHARSET_INFO *cs, char *from)
+{
+#ifdef DRIZZLED
+	char *to = xt_dup_string(self, from);	/* Drizzle: no conversion, plain duplicate. */
+	((void *)cs);
+#else
+	uint	errors;
+	u_int	len;
+	char	*to;
+
+	if (cs == &my_charset_utf8_general_ci || cs == &my_charset_bin)
+		to = xt_dup_string(self, from);	/* Already utf8, or binary: duplicate unchanged. */
+	else {
+		len = strlen(from) * 3 + 1;	/* NOTE(review): presumably 3 bytes is the worst-case utf8 growth per input byte - confirm */
+		to = (char *) xt_malloc(self, len);
+		strconvert(cs, from, &my_charset_utf8_general_ci, to, len, &errors);	/* 'errors' is not checked. */
+	}
+#endif
+	return to;
+}
+
+xtPublic char *myxt_convert_table_name(XTThreadPtr self, char *from)
+{	/* Return a newly allocated buffer holding the result of tablename_to_filename(from). */
+	u_int	len;
+	char	*to;
+
+	len = strlen(from) * 5 + 1;	/* NOTE(review): presumably the worst-case growth of the file name encoding - confirm */
+	to = (char *) xt_malloc(self, len);
+	tablename_to_filename(from, to, len);
+	return to;
+}
+
+xtPublic void myxt_static_convert_table_name(XTThreadPtr XT_UNUSED(self), char *from, char *to, size_t to_len)
+{	/* Convert with tablename_to_filename() into the caller's buffer 'to'; to_len is passed on as its length. */
+	tablename_to_filename(from, to, to_len);
+}
+
+xtPublic void myxt_static_convert_file_name(char *from, char *to, size_t to_len)
+{	/* Convert with filename_to_tablename() into the caller's buffer 'to'; to_len is passed on as its length. */
+	filename_to_tablename(from, to, to_len);
+}
+
+xtPublic int myxt_strcasecmp(char * a, char *b)
+{	/* Compare a and b with my_strcasecmp(), always using my_charset_utf8_general_ci. */
+	return my_strcasecmp(&my_charset_utf8_general_ci, a, b);
+}
+
+xtPublic int myxt_isspace(MX_CHARSET_INFO *cs, char a)
+{	/* Thin wrapper: my_isspace() for character a in charset cs. */
+	return my_isspace(cs, a);
+}
+
+xtPublic int myxt_ispunct(MX_CHARSET_INFO *cs, char a)
+{	/* Thin wrapper: my_ispunct() for character a in charset cs. */
+	return my_ispunct(cs, a);
+}
+
+xtPublic int myxt_isdigit(MX_CHARSET_INFO *cs, char a)
+{	/* Thin wrapper: my_isdigit() for character a in charset cs. */
+	return my_isdigit(cs, a);
+}
+
+xtPublic MX_CHARSET_INFO *myxt_getcharset(bool convert)
+{	/* Return thd_charset() of the current THD if 'convert' is set and there is a THD, else utf8. */
+	if (convert) {
+		THD *thd = current_thd;
+
+		if (thd)
+			return (MX_CHARSET_INFO *)thd_charset(thd);
+	}
+	return (MX_CHARSET_INFO *)&my_charset_utf8_general_ci;	/* The charset the convert functions treat as "no conversion". */
+}
+
+/* Create a THD for a thread started by PBXT; store_globals() and lex_start() are called on it.
+ * Returns the THD as a void *, or NULL after registering an error. Drizzle: always returns (void *) 1. */
+xtPublic void *myxt_create_thread()
+{
+#ifdef DRIZZLED
+	return (void *) 1;
+#else
+	THD *new_thd;
+
+	if (my_thread_init()) {
+		xt_register_error(XT_REG_CONTEXT, XT_ERR_MYSQL_ERROR, 0, "Unable to initialize MySQL threading");
+		return NULL;
+	}
+
+	/* Unfortunately, if PBXT is the default engine, and we are shutting down
+	 * then global_system_variables.table_plugin may be NULL, which will cause
+	 * a crash if we try to create a thread!
+	 *
+	 * The following call in plugin_shutdown() sets the global reference to NULL:
+	 *
+	 * unlock_variables(NULL, &global_system_variables);
+	 *
+	 * Later plugin_deinitialize() is called.
+	 *
+	 * The following stack is an example crash which occurs when I call
+	 * myxt_create_thread() in ha_exit(), to force the error.
+	 *
+	 *   if (pi->state & (PLUGIN_IS_READY | PLUGIN_IS_UNINITIALIZED))
+	 *   pi is NULL!
+	 * #0	0x002ff684 in intern_plugin_lock at sql_plugin.cc:617
+	 * #1	0x0030296d in plugin_thdvar_init at sql_plugin.cc:2432
+	 * #2	0x000db4a4 in THD::init at sql_class.cc:756
+	 * #3	0x000e02ed in THD::THD at sql_class.cc:638
+	 * #4	0x00e2678d in myxt_create_thread at myxt_xt.cc:2990
+	 * #5	0x00e05d43 in ha_exit at ha_pbxt.cc:1011
+	 * #6	0x00e065c2 in pbxt_end at ha_pbxt.cc:1330
+	 * #7	0x00e065df in pbxt_panic at ha_pbxt.cc:1343
+	 * #8	0x0023e57d in ha_finalize_handlerton at handler.cc:392
+	 * #9	0x002ffc8b in plugin_deinitialize at sql_plugin.cc:816
+	 * #10	0x003037d9 in plugin_shutdown at sql_plugin.cc:1572
+	 * #11	0x000f7b2b in clean_up at mysqld.cc:1266
+	 * #12	0x000f7fca in unireg_end at mysqld.cc:1192
+	 * #13	0x000fa021 in kill_server at mysqld.cc:1134
+	 * #14	0x000fa6df in kill_server_thread at mysqld.cc:1155
+	 * #15	0x91fdb155 in _pthread_start
+	 * #16	0x91fdb012 in thread_start
+	 */
+	if (!global_system_variables.table_plugin) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_MYSQL_NO_THREAD);
+		return NULL;	/* NOTE(review): unlike the THD failure path below, my_thread_end() is not called here - confirm intended */
+	}
+
+	if (!(new_thd = new THD)) {
+		my_thread_end();
+		xt_register_error(XT_REG_CONTEXT, XT_ERR_MYSQL_ERROR, 0, "Unable to create MySQL thread (THD)");
+		return NULL;
+	}
+
+	/*
+	 * If PBXT is the default storage engine, then creating any THD objects will add extra
+	 * references to the PBXT plugin object. Because the threads are created by PBXT,
+	 * this creates a self reference, and the reference count does not go to zero
+	 * on shutdown.
+	 *
+	 * The server then issues a message that it is forcing shutdown of the plugin.
+	 *
+	 * However, the engine reference is not required by the THDs used by PBXT, so
+	 * I just remove them here.
+	 */
+	plugin_unlock(NULL, new_thd->variables.table_plugin);
+	new_thd->variables.table_plugin = NULL;
+
+	new_thd->thread_stack = (char *) &new_thd;
+	new_thd->store_globals();
+	lex_start(new_thd);
+
+	return (void *) new_thd;
+#endif
+}
+
+#ifdef DRIZZLED
+xtPublic void myxt_destroy_thread(void *, xtBool)
+{	/* Drizzle build: nothing to destroy. */
+}
+
+xtPublic void myxt_delete_remaining_thread()
+{	/* Drizzle build: nothing to delete. */
+}
+#else
+xtPublic void myxt_destroy_thread(void *thread, xtBool end_threads)
+{	/* Close the tables of the given THD, delete it, and call my_thread_end() if end_threads is set. */
+	THD *thd = (THD *) thread;
+
+#if MYSQL_VERSION_ID > 60005
+	/* PMC - This is a HACK! It is required because
+	 * MySQL shuts down MDL before shutting down the
+	 * plug-ins, so MDL is set up again just for this call.
+	 */
+	if (!pbxt_inited)
+		mdl_init();
+	close_thread_tables(thd);
+	if (!pbxt_inited)
+		mdl_destroy();
+#else
+	close_thread_tables(thd);
+#endif
+	
+	delete thd;
+
+	/* Remember that we don't have a THD */
+	my_pthread_setspecific_ptr(THR_THD, 0);
+
+	if (end_threads)
+		my_thread_end();
+}
+
+xtPublic void myxt_delete_remaining_thread()
+{	/* Destroy the current THD, if there is one, and end MySQL threading for this thread. */
+	THD *thd;
+
+	if ((thd = current_thd))
+		myxt_destroy_thread((void *) thd, TRUE);
+}
+#endif
+
+xtPublic XTThreadPtr myxt_get_self()
+{	/* Return xt_ha_thd_to_self() for the current THD, or NULL if there is no current THD. */
+	THD *thd;
+	
+	if ((thd = current_thd))
+		return xt_ha_thd_to_self(thd);
+	return NULL;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * INFORMATION SCHEMA FUNCTIONS
+ *
+ */
+
+static int mx_put_record(THD *thd, TABLE *table)
+{	/* Store the current row of 'table' with schema_table_store_record() and return its result. */
+	return schema_table_store_record(thd, table);
+}
+
+#ifdef UNUSED_CODE	/* Field store helpers that are compiled only if UNUSED_CODE is defined. */
+static void mx_put_int(TABLE *table, int column, int value)
+{	/* Store an int in the given column of the table's current row. */
+	table->field[column]->store(value, false);
+}
+
+static void mx_put_real8(TABLE *table, int column, xtReal8 value)
+{	/* Store an xtReal8 in the given column of the table's current row. */
+	table->field[column]->store(value);
+}
+
+static void mx_put_string(TABLE *table, int column, const char *string, u_int len, charset_info_st *charset)
+{	/* Store the first len bytes of 'string' in the given column, using 'charset'. */
+	table->field[column]->store(string, len, charset);
+}
+#endif
+
+static void mx_put_u_llong(TABLE *table, int column, u_llong value)
+{	/* Store value in the given column. NOTE(review): 'false' is presumably Field::store()'s unsigned flag - confirm it is intended for u_llong */
+	table->field[column]->store(value, false);
+}
+
+static void mx_put_string(TABLE *table, int column, const char *string, charset_info_st *charset)
+{	/* Store the whole zero-terminated 'string' (length from strlen) in the given column, using 'charset'. */
+	table->field[column]->store(string, strlen(string), charset);
+}
+
+/* Fill the statistics information schema table: one row per statistic with the columns
+ * (statistic number starting at 1, name, value). Stops at the first non-zero result of
+ * mx_put_record() and returns it; returns 0 if every row was stored. */
+xtPublic int myxt_statistics_fill_table(XTThreadPtr self, void *th, void *ta, void *, MX_CONST void *ch)
+{
+	THD				*thd = (THD *) th;
+	TABLE			*table = (TABLE *) ((TABLE_LIST *) ta)->table;
+	charset_info_st	*charset = (charset_info_st *) ch;
+	XTDatabaseHPtr	db = self->st_database;
+	XTStatisticsRec	statistics;
+
+	xt_gather_statistics(&statistics);
+	for (u_int stat_id=0; stat_id<XT_STAT_CURRENT_MAX; stat_id++) {
+		const char	*name = xt_get_stat_meta_data(stat_id)->sm_name;
+		u_llong		value = xt_get_statistic(&statistics, db, stat_id);
+		int			err;
+
+		/* Columns 0, 1 and 2 of the row: */
+		mx_put_u_llong(table, 0, stat_id+1);
+		mx_put_string(table, 1, name, charset);
+		mx_put_u_llong(table, 2, value);
+		if ((err = mx_put_record(thd, table)))
+			return err;
+	}
+
+	return 0;
+}
+
+xtPublic void myxt_get_status(XTThreadPtr self, XTStringBufferPtr strbuf)
+{	/* Append a PBXT status report to strbuf: time, version, cache figures and the data log status of each open database. */
+	char string[200];
+
+	xt_sb_concat(self, strbuf, "\n");
+	xt_get_now(string, 200);	/* 200 == sizeof(string) */
+	xt_sb_concat(self, strbuf, string);
+	xt_sb_concat(self, strbuf, " PBXT ");
+	xt_sb_concat(self, strbuf, xt_get_version());
+	xt_sb_concat(self, strbuf, " STATUS OUTPUT");
+	xt_sb_concat(self, strbuf, "\n");
+
+	xt_sb_concat(self, strbuf, "Record cache usage: ");
+	xt_sb_concat_int8(self, strbuf, xt_tc_get_usage());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Record cache size:  ");
+	xt_sb_concat_int8(self, strbuf, xt_tc_get_size());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Record cache high:  ");
+	xt_sb_concat_int8(self, strbuf, xt_tc_get_high());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Index cache usage:  ");
+	xt_sb_concat_int8(self, strbuf, xt_ind_get_usage());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Index cache size:   ");
+	xt_sb_concat_int8(self, strbuf, xt_ind_get_size());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Log cache usage:    ");
+	xt_sb_concat_int8(self, strbuf, xt_xlog_get_usage());
+	xt_sb_concat(self, strbuf, "\n");
+	xt_sb_concat(self, strbuf, "Log cache size:     ");
+	xt_sb_concat_int8(self, strbuf, xt_xlog_get_size());
+	xt_sb_concat(self, strbuf, "\n");
+
+	xt_ht_lock(self, xt_db_open_databases);	/* Held while the list of open databases is walked. */
+	pushr_(xt_ht_unlock, xt_db_open_databases);
+
+	XTDatabaseHPtr	*dbptr;
+	size_t len = xt_sl_get_size(xt_db_open_db_by_id);
+
+	if (len > 0) {
+		xt_sb_concat(self, strbuf, "Data log files:\n");
+		for (u_int i=0; i<len; i++) {
+			dbptr = (XTDatabaseHPtr *) xt_sl_item_at(xt_db_open_db_by_id, i);
+			
+#ifndef XT_USE_GLOBAL_DB
+			xt_sb_concat(self, strbuf, "Database: ");
+			xt_sb_concat(self, strbuf, (*dbptr)->db_name);
+			xt_sb_concat(self, strbuf, "\n");
+#endif
+			xt_dl_log_status(self, *dbptr, strbuf);
+		}
+	}
+	else
+		xt_sb_concat(self, strbuf, "No data logs in use\n");
+
+	freer_(); // xt_ht_unlock(xt_db_open_databases)
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * MySQL Bit Maps
+ */
+
+static void myxt_bitmap_init(XTThreadPtr self, MX_BITMAP *map, u_int n_bits)
+{	/* Allocate a buffer for n_bits bits with xt_malloc() and attach it to 'map'. */
+	my_bitmap_map	*buf;
+    uint			size_in_bytes = (((n_bits) + 31) / 32) * 4;	/* n_bits rounded up to a multiple of 32, in bytes. */
+
+	buf = (my_bitmap_map *) xt_malloc(self, size_in_bytes);
+
+#ifdef DRIZZLED
+	map->init(buf, n_bits);
+#else
+	map->bitmap= buf;
+	map->n_bits= n_bits;
+	create_last_word_mask(map);
+	bitmap_clear_all(map);	/* xt_malloc() is not assumed to zero the buffer. */
+#endif
+}
+
+static void myxt_bitmap_free(XTThreadPtr self, MX_BITMAP *map)
+{	/* Free the buffer attached to 'map', if any, and clear the map's pointer to it. */
+#ifdef DRIZZLED
+	my_bitmap_map *buf = map->getBitmap();
+	if (buf)
+		xt_free(self, buf);
+	map->setBitmap(NULL);
+#else
+	if (map->bitmap) {
+		xt_free(self, map->bitmap);
+		map->bitmap = NULL;
+	}
+#endif
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * XTDDColumnFactory methods
+ */
+
+XTDDColumn *XTDDColumnFactory::createFromMySQLField(XTThread *self, TABLE *my_tab, Field *field)
+{	/* Create an XTDDColumn (XTDDEnumerableColumn for ENUM and, outside Drizzle, SET) with the field's name, type string and nullability. */
+	XTDDEnumerableColumn *en_col;
+	XTDDColumn *col;
+	xtBool is_enum = FALSE;
+
+	switch(field->real_type()) {
+		case MYSQL_TYPE_ENUM:
+			is_enum = TRUE;
+			/* fallthrough */
+
+#ifndef DRIZZLED
+		case MYSQL_TYPE_SET:
+#endif
+			col = en_col = new XTDDEnumerableColumn();
+		    if (!col)
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM); 
+			col->init(self);
+			en_col->enum_size = ((Field_enum *)field)->typelib->count;	/* NOTE(review): also used for SET; presumably Field_set is a Field_enum - confirm */
+			en_col->is_enum = is_enum;	/* FALSE when we got here via MYSQL_TYPE_SET. */
+			break;
+
+		default:
+			col = new XTDDColumn();
+			if (!col)
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM); 
+			col->init(self);
+	}
+
+	col->dc_name = xt_dup_string(self, (char *) field->field_name);
+	col->dc_data_type = my_type_to_string(self, field, my_tab);
+	col->dc_null_ok = field->null_ptr != NULL;	/* The column is nullable if the field has a null_ptr. */
+
+	return col;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * utilities
+ */
+
+/*
+ * MySQL (not sure about Drizzle) first calls hton->init and then assigns the plugin a thread slot
+ * which is used by xt_get_self(). This is a problem as pbxt_init() starts a number of daemon threads
+ * which could try to use the slot before it is assigned. This code waits till slot is inited.
+ * We cannot directly check hton->slot as in some versions of MySQL it can be 0 before init which is a 
+ * valid value.
+ */
+extern ulong total_ha;
+
+xtPublic void myxt_wait_pbxt_plugin_slot_assigned(XTThread *self)
+{	/* Sleep in 1 ms steps until the plugin is registered (Drizzle) or its slot is below total_ha (MySQL), or self->t_quit is set. */
+#ifdef DRIZZLED
+	static LEX_STRING plugin_name = { C_STRING_WITH_LEN("PBXT") };
+
+	while (!self->t_quit && !Registry::singleton().find(&plugin_name))
+		xt_sleep_milli_second(1);
+#else
+	while(!self->t_quit && (pbxt_hton->slot >= total_ha))	/* A slot >= total_ha is taken to mean "not assigned yet". */
+		xt_sleep_milli_second(1);
+#endif
+}
diff --git a/storage/pbxt/src/myxt_xt.h b/storage/pbxt/src/myxt_xt.h
new file mode 100644
index 00000000000..3898c8e30c6
--- /dev/null
+++ b/storage/pbxt/src/myxt_xt.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-05-16	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * These functions implement the parts of PBXT which must conform to the
+ * key and row format used by MySQL. 
+ */
+
+#ifndef __xt_myxt_h__
+#define __xt_myxt_h__
+
+#include "xt_defs.h"
+#include "table_xt.h"
+#include "datadic_xt.h"
+
+#ifndef MYSQL_VERSION_ID
+#error MYSQL_VERSION_ID must be defined!
+#endif
+
+struct XTDictionary;
+struct XTDatabase;
+STRUCT_TABLE;
+struct charset_info_st;
+
+u_int		myxt_create_key_from_key(XTIndexPtr ind, xtWord1 *key, xtWord1 *old, u_int k_length);
+u_int		myxt_create_key_from_row(XTIndexPtr ind, xtWord1 *key, xtWord1 *record, xtBool *no_duplicate);
+u_int		myxt_create_foreign_key_from_row(XTIndexPtr ind, xtWord1 *key, xtWord1 *record, XTIndexPtr fkey_ind, xtBool *no_null);
+u_int		myxt_get_key_length(XTIndexPtr ind, xtWord1 *b_value);
+int			myxt_compare_key(XTIndexPtr ind, int search_flags, uint key_length, xtWord1 *key_value, xtWord1 *b_value);
+u_int		myxt_key_seg_length(XTIndexSegRec *keyseg, u_int key_offset, xtWord1 *key_value);
+xtBool		myxt_create_row_from_key(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *key, u_int key_len, xtWord1 *record);
+void		myxt_set_null_row_from_key(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *record);
+void		myxt_set_default_row_from_key(XTOpenTablePtr ot, XTIndexPtr ind, xtWord1 *record);
+void		myxt_print_key(XTIndexPtr ind, xtWord1 *key_value);
+
+xtWord4		myxt_store_row_length(XTOpenTablePtr ot, char *rec_buff);
+xtWord4		myxt_store_row_data(XTOpenTablePtr ot, xtWord4 row_size, char *rec_buff);
+xtBool		myxt_store_row(XTOpenTablePtr ot, XTTabRecInfoPtr rec_info, char *rec_buff);
+size_t		myxt_load_row_length(XTOpenTablePtr ot, size_t buffer_size, xtWord1 *source_buf, u_int *ret_col_cnt);
+xtWord4		myxt_load_row_data(XTOpenTablePtr ot, xtWord1 *source_buf, xtWord1 *dest_buff, u_int col_cnt);
+xtBool		myxt_load_row(XTOpenTablePtr ot, xtWord1 *source_buf, xtWord1 *dest_buff, u_int col_cnt);
+xtBool		myxt_find_column(XTOpenTablePtr ot, u_int *col_idx, const char *col_name);
+void		myxt_get_column_name(XTOpenTablePtr ot, u_int col_idx, u_int len, char *col_name);
+void		myxt_get_column_as_string(XTOpenTablePtr ot, char *buffer, u_int col_idx, u_int len, char *value);
+xtBool		myxt_set_column(XTOpenTablePtr ot, char *buffer, u_int col_idx, const char *value, u_int len);
+void		myxt_get_column_data(XTOpenTablePtr ot, char *buffer, u_int col_idx, char **value, size_t *len);
+
+void		myxt_setup_dictionary(XTThreadPtr self, XTDictionary *dic);
+xtBool		myxt_load_dictionary(XTThreadPtr self, struct XTDictionary *dic, struct XTDatabase *db, XTPathStrPtr tab_path);
+void		myxt_free_dictionary(XTThreadPtr self, XTDictionary *dic);
+void		myxt_move_dictionary(XTDictionaryPtr dic, XTDictionaryPtr source_dic);
+XTDDTable	*myxt_create_table_from_table(XTThreadPtr self, STRUCT_TABLE *my_tab);
+
+void		myxt_static_convert_identifier(XTThreadPtr self, MX_CONST_CHARSET_INFO *cs, char *from, char *to, size_t to_len);
+char		*myxt_convert_identifier(XTThreadPtr self, MX_CONST_CHARSET_INFO *cs, char *from);
+void		myxt_static_convert_table_name(XTThreadPtr self, char *from, char *to, size_t to_len);
+void		myxt_static_convert_file_name(char *from, char *to, size_t to_len);
+char		*myxt_convert_table_name(XTThreadPtr self, char *from);
+int			myxt_strcasecmp(char * a, char *b);
+int			myxt_isspace(MX_CONST_CHARSET_INFO *cs, char a);
+int			myxt_ispunct(MX_CONST_CHARSET_INFO *cs, char a);
+int			myxt_isdigit(MX_CONST_CHARSET_INFO *cs, char a);
+
+MX_CONST_CHARSET_INFO *myxt_getcharset(bool convert);
+
+void		*myxt_create_thread();
+void		myxt_destroy_thread(void *thread, xtBool end_threads);
+void		myxt_delete_remaining_thread();
+XTThreadPtr	myxt_get_self();
+
+int			myxt_statistics_fill_table(XTThreadPtr self, void *th, void *ta, void *co, MX_CONST void *ch);
+void		myxt_get_status(XTThreadPtr self, XTStringBufferPtr strbuf);
+
+class XTDDColumnFactory	/* Static factory for XTDDColumn objects. */
+{
+public:
+	static XTDDColumn *createFromMySQLField(XTThread *self, STRUCT_TABLE *, Field *);	/* Create a column from a MySQL field. */
+};
+
+void myxt_wait_pbxt_plugin_slot_assigned(XTThread *self);
+
+#endif
diff --git a/storage/pbxt/src/pbms.h b/storage/pbxt/src/pbms.h
new file mode 100644
index 00000000000..26753ce6581
--- /dev/null
+++ b/storage/pbxt/src/pbms.h
@@ -0,0 +1,745 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Original author: Paul McCullagh
+ * Continued development: Barry Leslie
+ * H&G2JCtL
+ *
+ * 2007-06-01
+ *
+ * This file contains the BLOB streaming interface engines that
+ * are streaming enabled.
+ *
+ */
+#ifndef __streaming_unx_h__
+#define __streaming_unx_h__
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <string.h>
+#include <dirent.h>
+#include <signal.h>
+#include <ctype.h>
+#include <errno.h>
+
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+/*			2	10		1			10			20			10				10			20				20
+ * Format: "~*"<db_id><'~' || '_'><tab_id>"-"<blob_id>"-"<auth_code>"-"<server_id>"-"<blob_ref_id>"-"<blob_size>
+ */
+//If URL_FMT changes do not forget to update couldBeURL() in this file.
+ 
+#define URL_FMT "~*%lu%c%lu-%llu-%lx-%lu-%llu-%llu"
+
+#define MS_SHARED_MEMORY_MAGIC			0x7E9A120C
+#define MS_ENGINE_VERSION				1
+#define MS_CALLBACK_VERSION				4
+#define MS_SHARED_MEMORY_VERSION		2
+#define MS_ENGINE_LIST_SIZE				10
+#define MS_TEMP_FILE_PREFIX				"pbms_temp_"
+
+#define MS_BLOB_HANDLE_SIZE				300
+
+#define SH_MASK							((S_IRUSR | S_IWUSR) | (S_IRGRP | S_IWGRP) | (S_IROTH))
+
+#define MS_OK							0
+#define MS_ERR_ENGINE					1							/* Internal engine error. */
+#define MS_ERR_UNKNOWN_TABLE			2							/* Returned if the engine cannot open the given table. */
+#define MS_ERR_NOT_FOUND				3							/* The BLOB cannot be found. */
+#define MS_ERR_TABLE_LOCKED				4							/* Table is currently locked. */
+#define MS_ERR_INCORRECT_URL			5
+#define MS_ERR_AUTH_FAILED				6
+#define MS_ERR_NOT_IMPLEMENTED			7
+#define MS_ERR_UNKNOWN_DB				8
+#define MS_ERR_REMOVING_REPO			9
+#define MS_ERR_DATABASE_DELETED			10
+#define MS_ERR_DUPLICATE				11						/* Attempt to insert a duplicate key into a system table. */
+#define MS_ERR_INVALID_RECORD			12
+#define MS_ERR_RECOVERY_IN_PROGRESS		13
+#define MS_ERR_DUPLICATE_DB				14
+#define MS_ERR_DUPLICATE_DB_ID			15
+#define MS_ERR_INVALID_OPERATION		16
+
+#define MS_LOCK_NONE					0
+#define MS_LOCK_READONLY				1
+#define MS_LOCK_READ_WRITE				2
+
+#define PBMS_BLOB_URL_SIZE				120
+
+#define PBMS_FIELD_COL_SIZE				128
+#define PBMS_FIELD_COND_SIZE			300
+
+#define MS_RESULT_MESSAGE_SIZE			300
+#define MS_RESULT_STACK_SIZE			200
+
+typedef struct PBMSResultRec {
+	int						mr_code;								/* Engine specific error code. */ 
+	char					mr_message[MS_RESULT_MESSAGE_SIZE];		/* Error message, required if non-zero return code. */
+	char					mr_stack[MS_RESULT_STACK_SIZE];			/* Trace information about where the error occurred. */
+} PBMSResultRec, *PBMSResultPtr;
+
+
+
+typedef struct PBMSBlobID {
+	u_int32_t				bi_db_id;	
+	u_int64_t				bi_blob_size;	
+	u_int64_t				bi_blob_id;				// or repo file offset if type = REPO
+	u_int64_t				bi_blob_ref_id;			
+	u_int32_t				bi_tab_id;				// or repo ID if type = REPO
+	u_int32_t				bi_auth_code;
+	u_int32_t				bi_blob_type;
+} PBMSBlobIDRec, *PBMSBlobIDPtr;
+
+typedef struct PBMSBlobURL {
+	char					bu_data[PBMS_BLOB_URL_SIZE];
+} PBMSBlobURLRec, *PBMSBlobURLPtr;
+
+typedef struct PBMSEngineRec {
+	int						ms_version;							/* MS_ENGINE_VERSION */
+	int						ms_index;							/* The index into the engine list. */
+	int						ms_removing;						/* TRUE (1) if the engine is being removed. */
+	int						ms_internal;						/* TRUE (1) if the engine is supported directly in the MySQL/Drizzle handler code. */
+	char					ms_engine_name[32];
+} PBMSEngineRec, *PBMSEnginePtr;
+
+/*
+ * This function should never be called directly, it is called
+ * by deregisterEngine() below.
+ */
+typedef void (*ECRegisterdFunc)(PBMSEnginePtr engine);
+
+typedef void (*ECDeregisterdFunc)(PBMSEnginePtr engine);
+
+/*
+ * Call this function to store a BLOB in the repository; the BLOB's
+ * URL will be returned. The returned URL buffer is expected to be at least
+ * PBMS_BLOB_URL_SIZE long.
+ *
+ * The BLOB URL must still be retained or it will automatically be deleted after a timeout expires.
+ */
+typedef int (*ECCreateBlobsFunc)(bool built_in, const char *db_name, const char *tab_name, char *blob, size_t blob_len, char *blob_url, unsigned short col_index, PBMSResultPtr result);
+
+/*
+ * Call this function for each BLOB to be retained. When a BLOB is used, the
+ * URL may be changed. The returned URL buffer is expected to be at least
+ * PBMS_BLOB_URL_SIZE long.
+ *
+ * The returned URL must be inserted into the row in place of the given
+ * URL.
+ */
+typedef int (*ECRetainBlobsFunc)(bool built_in, const char *db_name, const char *tab_name, char *ret_blob_url, char *blob_url, unsigned short col_index, PBMSResultPtr result);
+
+/*
+ * If a row containing a BLOB is deleted, then the BLOBs in the
+ * row must be released.
+ *
+ * Note: if a table is dropped, all the BLOBs referenced by the
+ * table are automatically released.
+ */
+typedef int (*ECReleaseBlobFunc)(bool built_in, const char *db_name, const char *tab_name, char *blob_url, PBMSResultPtr result);
+
+typedef int (*ECDropTable)(bool built_in, const char *db_name, const char *tab_name, PBMSResultPtr result);
+
+typedef int (*ECRenameTable)(bool built_in, const char *db_name, const char *from_table, const char *to_table, PBMSResultPtr result);
+
+typedef void (*ECCallCompleted)(bool built_in, bool ok);
+
+typedef struct PBMSCallbacksRec {
+	int						cb_version;							/* MS_CALLBACK_VERSION */
+	ECRegisterdFunc			cb_register;
+	ECDeregisterdFunc		cb_deregister;
+	ECCreateBlobsFunc		cb_create_blob;
+	ECRetainBlobsFunc		cb_retain_blob;
+	ECReleaseBlobFunc		cb_release_blob;
+	ECDropTable				cb_drop_table;
+	ECRenameTable			cb_rename_table;
+	ECCallCompleted			cb_completed;
+} PBMSCallbacksRec, *PBMSCallbacksPtr;
+
+typedef struct PBMSSharedMemoryRec {
+	int						sm_magic;							/* MS_SHARED_MEMORY_MAGIC */
+	int						sm_version;							/* MS_SHARED_MEMORY_VERSION */
+	volatile int			sm_shutdown_lock;					/* "Cheap" lock for shutdown! */
+	PBMSCallbacksPtr		sm_callbacks;
+	int						sm_reserved1[20];
+	void					*sm_reserved2[20];
+	int						sm_list_size;
+	int						sm_list_len;
+	PBMSEnginePtr			sm_engine_list[MS_ENGINE_LIST_SIZE];
+} PBMSSharedMemoryRec, *PBMSSharedMemoryPtr;
+
+#ifdef PBMS_API
+
+class PBMS_API
+{
+private:
+	const char *temp_prefix[3];
+	bool built_in;
+
+public:
+	PBMS_API(): sharedMemory(NULL) { 
+		int i = 0;
+		temp_prefix[i++] = MS_TEMP_FILE_PREFIX;
+		temp_prefix[i++] = NULL;
+		
+	}
+
+	~PBMS_API() { }
+
+	/*
+	 * This method is called by the PBMS engine during startup.
+	 */
+	int PBMSStartup(PBMSCallbacksPtr callbacks, PBMSResultPtr result) {
+		// Remove the temporary files left behind by processes that are gone.
+		deleteTempFiles();
+
+		// Attach to the shared memory, creating it if necessary.
+		int rc = getSharedMemory(true, result);
+		if (rc)
+			return rc;
+
+		// Publish the callback table for the enabled engines.
+		sharedMemory->sm_callbacks = callbacks;
+		return rc;
+	}
+
+	/*
+	 * This method is called by the PBMS engine during shutdown.
+	 *
+	 * It withdraws the callback table and, if no enabled engine is still
+	 * registered, removes the shared memory.
+	 */
+	void PBMSShutdown() {
+		if (!sharedMemory)
+			return;
+
+		lock();
+		sharedMemory->sm_callbacks = NULL;
+
+		// Stop at the first slot that still holds an engine.
+		bool in_use = false;
+		for (int slot = 0; !in_use && slot < sharedMemory->sm_list_len; slot++)
+			in_use = (sharedMemory->sm_engine_list[slot] != NULL);
+
+		unlock();
+
+		if (!in_use)
+			removeSharedMemory();
+	}
+
+	/*
+	 * Register the engine with the Stream Engine.
+	 */
+	int registerEngine(PBMSEnginePtr engine, PBMSResultPtr result) {
+		deleteTempFiles();
+
+		// The first engine to register creates the shared memory.
+		int rc = getSharedMemory(true, result);
+		if (rc)
+			return rc;
+
+		// Find the first free slot in the engine list.
+		int slot = 0;
+		while (slot < sharedMemory->sm_list_size && sharedMemory->sm_engine_list[slot])
+			slot++;
+
+		if (slot >= sharedMemory->sm_list_size) {
+			result->mr_code = 15010;
+			strcpy(MS_RESULT_MESSAGE_SIZE, result->mr_message, "Too many BLOB streaming engines already registered");
+			*result->mr_stack = 0;
+			return MS_ERR_ENGINE;
+		}
+
+		sharedMemory->sm_engine_list[slot] = engine;
+		engine->ms_index = slot;
+
+		// Keep sm_list_len one past the highest slot in use.
+		if (slot >= sharedMemory->sm_list_len)
+			sharedMemory->sm_list_len = slot + 1;
+
+		// Tell the PBMS engine about the new engine if it is already running.
+		if (sharedMemory->sm_callbacks)
+			sharedMemory->sm_callbacks->cb_register(engine);
+
+		built_in = (engine->ms_internal == 1);
+		return MS_OK;
+	}
+
+	/*
+	 * Acquire the "cheap" shutdown lock kept in the shared memory.
+	 *
+	 * The lock is a counter: the caller waits until it reads zero, increments
+	 * it and owns the lock only if the counter is then exactly 1. Otherwise it
+	 * backs off (decrement, sleep, increment) and checks again.
+	 *
+	 * NOTE(review): the counter is modified with plain ++/-- on a volatile
+	 * int; nothing visible here makes these updates atomic.
+	 */
+	void lock() {
+		// Wait, in 10ms steps, for the current holder to release the lock.
+		while (sharedMemory->sm_shutdown_lock)
+			usleep(10000);
+		sharedMemory->sm_shutdown_lock++;
+		// Any value other than 1 means somebody else incremented as well.
+		while (sharedMemory->sm_shutdown_lock != 1) {
+			// Random delay before backing off.
+			usleep(random() % 10000);
+			sharedMemory->sm_shutdown_lock--;
+			usleep(10000);
+			sharedMemory->sm_shutdown_lock++;
+		}
+	}
+
+	/*
+	 * Release the lock taken by lock() by undoing its increment.
+	 */
+	void unlock() {
+		sharedMemory->sm_shutdown_lock--;
+	}
+
+	/*
+	 * Remove 'engine' from the list of registered engines, notifying the
+	 * PBMS engine if it is running. When the list becomes empty the shared
+	 * memory is removed as well.
+	 */
+	void deregisterEngine(PBMSEnginePtr engine) {
+		PBMSResultRec result;
+
+		if (getSharedMemory(false, &result))
+			return;
+
+		// getSharedMemory(false, ...) reports success even when it found no
+		// shared memory; in that case nothing was ever registered.
+		if (!sharedMemory)
+			return;
+
+		lock();
+
+		bool empty = true;
+		for (int i=0; i<sharedMemory->sm_list_len; i++) {
+			if (sharedMemory->sm_engine_list[i]) {
+				if (sharedMemory->sm_engine_list[i] == engine) {
+					if (sharedMemory->sm_callbacks)
+						sharedMemory->sm_callbacks->cb_deregister(engine);
+					sharedMemory->sm_engine_list[i] = NULL;
+				}
+				else
+					empty = false;
+			}
+		}
+
+		unlock();
+
+		if (empty)
+			removeSharedMemory();
+	}
+
+	/*
+	 * Free the shared memory and delete the temporary files that publish
+	 * its address. Does nothing if no shared memory is attached or if the
+	 * PBMS engine is still running.
+	 */
+	void removeSharedMemory()
+	{
+		const char **prefix = temp_prefix;
+		char	temp_file[100];
+
+		// This method is public: tolerate being called when nothing is attached.
+		if (!sharedMemory)
+			return;
+
+		// Do not remove the shared memory until after
+		// the PBMS engine has shutdown.
+		if (sharedMemory->sm_callbacks)
+			return;
+
+		// Invalidate the magic number first so that the block no longer
+		// passes the check in getSharedMemory().
+		sharedMemory->sm_magic = 0;
+		free(sharedMemory);
+		sharedMemory = NULL;
+
+		while (*prefix) {
+			getTempFileName(temp_file, *prefix, getpid());
+			unlink(temp_file);
+			prefix++;
+		}
+	}
+	
+	/*
+	 * Check whether the first 'size' bytes at 'blob_url' look like a PBMS
+	 * BLOB URL (they parse completely with URL_FMT and carry a valid type
+	 * character).
+	 *
+	 * Returns 1 if they do, 0 otherwise (including a NULL pointer, a negative
+	 * size, or a size that does not fit a BLOB URL).
+	 */
+	int couldBeURL(char *blob_url, int size)
+	{
+		char				buffer[PBMS_BLOB_URL_SIZE+1];
+		unsigned long		db_id = 0;
+		unsigned long		tab_id = 0;
+		unsigned long long	blob_id = 0;
+		unsigned long long	blob_ref_id = 0;
+		unsigned long long	blob_size = 0;
+		unsigned long		auth_code = 0;
+		unsigned long		server_id = 0;
+		char				type = 0, junk[5] = {0};
+		int					scanned;
+
+		if (!blob_url || size < 0 || size >= PBMS_BLOB_URL_SIZE)
+			return 0;
+
+		// There is no guarantee that the URL is null terminated, and the byte
+		// following the data may not belong to the BLOB at all. So never look
+		// at blob_url[size]: always scan a terminated copy.
+		memcpy(buffer, blob_url, size);
+		buffer[size] = 0;
+
+		scanned = sscanf(buffer, URL_FMT"%4s", &db_id, &type, &tab_id, &blob_id, &auth_code, &server_id, &blob_ref_id, &blob_size, junk);
+
+		// Exactly 8 conversions are expected: junk at the end of the URL
+		// produces a 9th and so also results in an invalid URL.
+		if (scanned != 8 || junk[0] || (type != '~' && type != '_')) {
+			printf("Bad URL \"%s\": scanned = %d, junk: %d, %d, %d, %d\n", buffer, scanned, junk[0], junk[1], junk[2], junk[3]);
+			return 0;
+		}
+
+		return 1;
+	}
+	
+	/*
+	 * Record a reference to a BLOB for column 'col_index' of the given table.
+	 *
+	 * If the data is not a BLOB URL it is first handed to the PBMS engine to
+	 * be stored as a new BLOB. The URL to put in the row is returned in
+	 * 'ret_blob_url'; it is set to an empty string if the data is not a URL
+	 * and the PBMS engine is not running.
+	 *
+	 * Returns MS_OK on success, otherwise an error code with the details in
+	 * 'result'.
+	 */
+	int  retainBlob(const char *db_name, const char *tab_name, char *ret_blob_url, char *blob_url, size_t blob_size, unsigned short col_index, PBMSResultPtr result)
+	{
+		int err;
+		char safe_url[PBMS_BLOB_URL_SIZE+1];
+
+		if ((err = getSharedMemory(false, result)))
+			return err;
+
+		// Test the size before narrowing it to int for couldBeURL(): data
+		// that is too large for a URL must never be copied into safe_url.
+		bool is_url = (blob_size < (size_t) PBMS_BLOB_URL_SIZE) && couldBeURL(blob_url, (int) blob_size);
+
+		if (!is_url) {
+			// getSharedMemory(false, ...) may succeed without attaching any
+			// shared memory; that is handled like "PBMS not running".
+			if (!sharedMemory || !sharedMemory->sm_callbacks)  {
+				*ret_blob_url = 0;
+				return MS_OK;
+			}
+			err = sharedMemory->sm_callbacks->cb_create_blob(built_in, db_name, tab_name, blob_url, blob_size, ret_blob_url, col_index, result);
+			if (err)
+				return err;
+
+			blob_url = ret_blob_url;
+		} else {
+			// Make sure the url is a C string. The byte after the data is not
+			// guaranteed to belong to the BLOB, so it is never examined: the
+			// URL is always copied and terminated.
+			memcpy(safe_url, blob_url, blob_size);
+			safe_url[blob_size] = 0;
+			blob_url = safe_url;
+		}
+
+		if (!sharedMemory || !sharedMemory->sm_callbacks) {
+			result->mr_code = MS_ERR_INCORRECT_URL;
+			strcpy(MS_RESULT_MESSAGE_SIZE, result->mr_message, "BLOB streaming engine (PBMS) not installed");
+			*result->mr_stack = 0;
+			return MS_ERR_INCORRECT_URL;
+		}
+
+		return sharedMemory->sm_callbacks->cb_retain_blob(built_in, db_name, tab_name, ret_blob_url, blob_url, col_index, result);
+	}
+
+	/*
+	 * Release the reference held by a row to the BLOB named by 'blob_url'.
+	 *
+	 * Nothing is done, and MS_OK is returned, if the PBMS engine is not
+	 * running or the data is not a BLOB URL.
+	 */
+	int releaseBlob(const char *db_name, const char *tab_name, char *blob_url, size_t blob_size, PBMSResultPtr result)
+	{
+		int err;
+		char safe_url[PBMS_BLOB_URL_SIZE+1];
+
+		if ((err = getSharedMemory(false, result)))
+			return err;
+
+		// getSharedMemory(false, ...) may succeed without attaching any
+		// shared memory; that is handled like "PBMS not running".
+		if (!sharedMemory || !sharedMemory->sm_callbacks)
+			return MS_OK;
+
+		// Test the size before narrowing it to int for couldBeURL(): data
+		// that is too large for a URL must never be copied into safe_url.
+		if (blob_size >= (size_t) PBMS_BLOB_URL_SIZE || !couldBeURL(blob_url, (int) blob_size))
+			return MS_OK;
+
+		// The byte after the data is not guaranteed to belong to the BLOB,
+		// so the URL is always copied and terminated.
+		memcpy(safe_url, blob_url, blob_size);
+		safe_url[blob_size] = 0;
+
+		return sharedMemory->sm_callbacks->cb_release_blob(built_in, db_name, tab_name, safe_url, result);
+	}
+
+	/*
+	 * Tell the PBMS engine that a table has been dropped. Returns MS_OK
+	 * without doing anything if the PBMS engine is not running.
+	 */
+	int dropTable(const char *db_name, const char *tab_name, PBMSResultPtr result)
+	{
+		int err;
+
+		if ((err = getSharedMemory(false, result)))
+			return err;
+
+		// getSharedMemory(false, ...) may succeed without attaching any
+		// shared memory; that is handled like "PBMS not running".
+		if (!sharedMemory || !sharedMemory->sm_callbacks)
+			return MS_OK;
+
+		return sharedMemory->sm_callbacks->cb_drop_table(built_in, db_name, tab_name, result);
+	}
+
+	/*
+	 * Tell the PBMS engine that a table has been renamed. Returns MS_OK
+	 * without doing anything if the PBMS engine is not running.
+	 */
+	int renameTable(const char *db_name, const char *from_table, const char *to_table, PBMSResultPtr result)
+	{
+		int err;
+
+		if ((err = getSharedMemory(false, result)))
+			return err;
+
+		// getSharedMemory(false, ...) may succeed without attaching any
+		// shared memory; that is handled like "PBMS not running".
+		if (!sharedMemory || !sharedMemory->sm_callbacks)
+			return MS_OK;
+
+		return sharedMemory->sm_callbacks->cb_rename_table(built_in, db_name, from_table, to_table, result);
+	}
+
+	/*
+	 * Pass the outcome of the preceding operation ('ok' non-zero on success)
+	 * on to the PBMS engine, if it is running.
+	 */
+	void completed(int ok)
+	{
+		PBMSResultRec result;
+
+		if (getSharedMemory(false, &result))
+			return;
+
+		// getSharedMemory(false, ...) may succeed without attaching any
+		// shared memory; that is handled like "PBMS not running".
+		if (!sharedMemory || !sharedMemory->sm_callbacks)
+			return;
+
+		sharedMemory->sm_callbacks->cb_completed(built_in, ok);
+	}
+	
+	volatile PBMSSharedMemoryPtr sharedMemory;
+
+private:
+	// Close 'fd' and report a failed call. 'os_err' is passed by value so the
+	// caller's errno is captured before close() gets a chance to change it.
+	int closeAndFail(int fd, int os_err, const char *func, char *file, PBMSResultPtr result)
+	{
+		close(fd);
+		return setOSResult(os_err, func, file, result);
+	}
+
+	/*
+	 * Attach to the shared memory of this process.
+	 *
+	 * The address of the shared memory is kept, printed with "%p", in a
+	 * temporary file named after the process id. If 'create' is true a
+	 * missing file and missing shared memory are created; otherwise the
+	 * call only looks for existing shared memory.
+	 *
+	 * Returns MS_OK on success, otherwise an error code with the details in
+	 * 'result'. NOTE: with 'create' false MS_OK is also returned when the
+	 * file holds no valid address; 'sharedMemory' is NULL in that case.
+	 */
+	int getSharedMemory(bool create, PBMSResultPtr result)
+	{
+		int		tmp_f;
+		int		r;
+		char	temp_file[100];
+		const char	**prefix = temp_prefix;
+
+		if (sharedMemory)
+			return MS_OK;
+
+		while (*prefix) {
+			getTempFileName(temp_file, *prefix, getpid());
+			tmp_f = open(temp_file, O_RDWR | (create ? O_CREAT : 0), SH_MASK);
+			if (tmp_f == -1)
+				return setOSResult(errno, "open", temp_file, result);
+
+			r = lseek(tmp_f, 0, SEEK_SET);
+			if (r == -1)
+				return closeAndFail(tmp_f, errno, "lseek", temp_file, result);
+
+			ssize_t tfer;
+			char buffer[100];
+
+			// Read one byte less than the buffer holds: the terminator
+			// written below needs a place of its own.
+			tfer = read(tmp_f, buffer, sizeof(buffer) - 1);
+			if (tfer == -1)
+				return closeAndFail(tmp_f, errno, "read", temp_file, result);
+
+			buffer[tfer] = 0;
+
+			// "%p" stores a void pointer. Start from the current value so
+			// that it is kept if the file holds no address.
+			void *address = (void *) sharedMemory;
+			sscanf(buffer, "%p", &address);
+			sharedMemory = (PBMSSharedMemoryPtr) address;
+
+			if (!sharedMemory || sharedMemory->sm_magic != MS_SHARED_MEMORY_MAGIC) {
+				if (!create) {
+					// Do not keep an address that failed the check: later
+					// calls would treat it as valid shared memory.
+					sharedMemory = NULL;
+					close(tmp_f);
+					return MS_OK;
+				}
+
+				sharedMemory = (PBMSSharedMemoryPtr) calloc(1, sizeof(PBMSSharedMemoryRec));
+				if (!sharedMemory)
+					return closeAndFail(tmp_f, ENOMEM, "calloc", temp_file, result);
+
+				sharedMemory->sm_magic = MS_SHARED_MEMORY_MAGIC;
+				sharedMemory->sm_version = MS_SHARED_MEMORY_VERSION;
+				sharedMemory->sm_list_size = MS_ENGINE_LIST_SIZE;
+
+				r = lseek(tmp_f, 0, SEEK_SET);
+				if (r == -1)
+					return closeAndFail(tmp_f, errno, "lseek", temp_file, result);
+
+				sprintf(buffer, "%p", (void *) sharedMemory);
+				size_t len = strlen(buffer);
+				tfer = write(tmp_f, buffer, len);
+				if (tfer == -1 || (size_t) tfer != len)
+					return closeAndFail(tmp_f, errno, "write", temp_file, result);
+
+				r = fsync(tmp_f);
+				if (r == -1)
+					return closeAndFail(tmp_f, errno, "fsync", temp_file, result);
+			}
+			else if (sharedMemory->sm_version != MS_SHARED_MEMORY_VERSION) {
+				int found_version = sharedMemory->sm_version;
+
+				// The layout is not the one this code was built for: do not
+				// stay attached to it.
+				sharedMemory = NULL;
+				close(tmp_f);
+				result->mr_code = -1000;
+				*result->mr_stack = 0;
+				strcpy(MS_RESULT_MESSAGE_SIZE, result->mr_message, "Shared memory version: ");
+				strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, found_version);
+				strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, ", does not match engine shared memory version: ");
+				strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, MS_SHARED_MEMORY_VERSION);
+				strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, ".");
+				return MS_ERR_ENGINE;
+			}
+			close(tmp_f);
+
+			// For backward compatibility we need to create the old versions but we only need to read the current version.
+			if (create)
+				prefix++;
+			else
+				break;
+		}
+		return MS_OK;
+	}
+
+	/*
+	 * Bounded copy: stores at most size-1 characters of 'from' in 'to' and
+	 * always terminates the result. Does nothing if 'size' is zero.
+	 */
+	void strcpy(size_t size, char *to, const char *from)
+	{
+		if (size == 0)
+			return;
+
+		// One byte of the buffer is reserved for the terminator.
+		for (size_t room = size - 1; room > 0 && *from; room--)
+			*to++ = *from++;
+		*to = 0;
+	}
+
+	/*
+	 * Bounded append: adds as much of 'from' to the C string in the buffer
+	 * 'to' of 'size' bytes as fits, always leaving it terminated. Nothing is
+	 * appended if the buffer holds no terminator within 'size' bytes.
+	 */
+	void strcat(size_t size, char *to, const char *from)
+	{
+		// Find the end of the string, counting down the space that is left.
+		// The size is tested before it is decremented so that it can never
+		// wrap around below zero.
+		while (size > 0 && *to) {
+			to++;
+			size--;
+		}
+		strcpy(size, to, from);
+	}
+
+	/*
+	 * Bounded append of the decimal representation of 'val'.
+	 */
+	void strcat(size_t size, char *to, int val)
+	{
+		char digits[100];
+
+		sprintf(digits, "%d", val);
+		strcat(size, to, digits);
+	}
+
+	/*
+	 * Fill 'result' with a description of a failed system call: 'err' is
+	 * stored as the result code and the message names the call ('func'), the
+	 * file it was applied to and the text of the OS error.
+	 *
+	 * Always returns MS_ERR_ENGINE.
+	 */
+	int setOSResult(int err, const char *func, char *file, PBMSResultPtr result) {
+		char *msg;
+
+		result->mr_code = err;
+		*result->mr_stack = 0;
+		strcpy(MS_RESULT_MESSAGE_SIZE, result->mr_message, "System call ");		
+		strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, func);		
+		strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, "() failed on ");		
+		strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, file);		
+		strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, ": ");		
+
+#ifdef XT_WIN
+		// NOTE(review): 'iMessage' and 'err_msg' are declared neither in this
+		// method nor in this class; confirm that this branch compiles.
+		if (FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL, err, 0, iMessage + strlen(iMessage), MS_RESULT_MESSAGE_SIZE - strlen(iMessage), NULL)) {
+			char *ptr;
+
+			// Cut off trailing line ends and periods.
+			ptr = &iMessage[strlen(iMessage)];
+			while (ptr-1 > err_msg) {
+				if (*(ptr-1) != '\n' && *(ptr-1) != '\r' && *(ptr-1) != '.')
+					break;
+				ptr--;
+			}
+			*ptr = 0;
+
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, " (");
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, err);
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, ")");
+			return MS_ERR_ENGINE;
+		}
+#endif
+
+		// Append "<error text> (<code>)", or just the code if there is no text.
+		msg = strerror(err);
+		if (msg) {
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, msg);
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, " (");
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, err);
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, ")");
+		}
+		else {
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, "Unknown OS error code ");
+			strcat(MS_RESULT_MESSAGE_SIZE, result->mr_message, err);
+		}
+
+		return MS_ERR_ENGINE;
+	}
+
+	/*
+	 * Build the name of the temporary file of process 'pid':
+	 * "/tmp/<prefix><pid>". The result is written to 'temp_file' without any
+	 * length check; the callers in this class pass a buffer of 100 bytes.
+	 */
+	void getTempFileName(char *temp_file, const char * prefix, int pid)
+	{
+		sprintf(temp_file, "/tmp/%s%d", prefix,  pid);
+	}
+
+	/*
+	 * Returns true if the string 'cstr' begins with the string 'w_cstr'.
+	 * An empty 'w_cstr' matches every string.
+	 */
+	bool startsWith(const char *cstr, const char *w_cstr)
+	{
+		// Compare until the prefix is used up. A 'cstr' that ends early
+		// fails the comparison against the remaining prefix character.
+		for (; *w_cstr; cstr++, w_cstr++) {
+			if (*cstr != *w_cstr)
+				return false;
+		}
+		return true;
+	}
+
+	/*
+	 * Remove the temporary files in /tmp that belong to processes which no
+	 * longer exist. The process id is taken from the digits that follow the
+	 * prefix in the file name.
+	 */
+	void deleteTempFiles()
+	{
+		struct dirent	*entry;
+		struct dirent	*result;
+		DIR				*odir;
+		int				err;
+		size_t			sz;
+		char			temp_file[100];
+
+#ifdef __sun
+		sz = sizeof(struct dirent) + pathconf("/tmp/", _PC_NAME_MAX); // Solaris, see readdir(3C)
+#else
+		sz = sizeof(struct dirent);
+#endif
+		if (!(entry = (struct dirent *) malloc(sz)))
+			return;
+		if (!(odir = opendir("/tmp/"))) {
+			// Do not leak the entry buffer when the directory cannot be read.
+			free(entry);
+			return;
+		}
+		err = readdir_r(odir, entry, &result);
+		while (!err && result) {
+			const char **prefix = temp_prefix;
+
+			while (*prefix) {
+				if (startsWith(entry->d_name, *prefix)) {
+					int pid = atoi(entry->d_name + strlen(*prefix));
+
+					/* If the process does not exist: */
+					if (kill(pid, 0) == -1 && errno == ESRCH) {
+						getTempFileName(temp_file, *prefix, pid);
+						unlink(temp_file);
+					}
+				}
+				prefix++;
+			}
+
+			err = readdir_r(odir, entry, &result);
+		}
+		closedir(odir);
+		free(entry);
+	}
+};
+#endif // PBMS_API
+
+/*
+ * The following is a low level API for accessing blobs directly.
+ */
+ 
+
+/*
+ * Any threads using the direct blob access API must first register themselves with the
+ * blob streaming engine before using the blob access functions. This is done by calling
+ * PBMSInitBlobStreamingThread(). Call PBMSDeinitBlobStreamingThread() after the thread is
+ * done using the direct blob access API
+ */
+ 
+/* 
+* PBMSInitBlobStreamingThread(): Returns a pointer to a blob streaming thread.
+*/
+extern void *PBMSInitBlobStreamingThread(char *thread_name, PBMSResultPtr result);
+extern void PBMSDeinitBlobStreamingThread(void *v_bs_thread);
+
+/* 
+* PBMSGetError():Gets the last error reported by a blob streaming thread.
+*/
+extern void PBMSGetError(void *v_bs_thread, PBMSResultPtr result);
+
+/* 
+* PBMSCreateBlob():Creates a new blob in the database of the given size.
+*/
+extern bool PBMSCreateBlob(PBMSBlobIDPtr blob_id, char *database_name, u_int64_t size);
+
+/* 
+* PBMSWriteBlob():Write the data to the blob in one or more chunks. The total size of all the chunks of 
+* data written to the blob must match the size specified when the blob was created.
+*/
+extern bool PBMSWriteBlob(PBMSBlobIDPtr blob_id, char *data, size_t size, size_t offset);
+
+/* 
+* PBMSReadBlob():Read the blob data out of the blob in one or more chunks.
+*/
+extern bool PBMSReadBlob(PBMSBlobIDPtr blob_id, char *buffer, size_t *size, size_t offset);
+
+/*
+* PBMSIDToURL():Convert a blob id to a blob URL. The 'url' buffer must be at least PBMS_BLOB_URL_SIZE bytes in size.
+*/
+extern bool PBMSIDToURL(PBMSBlobIDPtr blob_id, char *url);
+
+/*
+* PBMSURLToID():Convert a blob URL to a blob ID.
+*/
+extern bool PBMSURLToID(char *url, PBMSBlobIDPtr blob_id);
+
+#endif
diff --git a/storage/pbxt/src/pbms_enabled.cc b/storage/pbxt/src/pbms_enabled.cc
new file mode 100644
index 00000000000..7a67d89a4bb
--- /dev/null
+++ b/storage/pbxt/src/pbms_enabled.cc
@@ -0,0 +1,249 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Barry Leslie
+ *
+ * 2009-07-16
+ *
+ * H&G2JCtL
+ *
+ * PBMS interface used to enable engines for use with the PBMS engine.
+ *
+ * For an example on how to build this into an engine have a look at the PBXT engine
+ * in file ha_pbxt.cc. Search for 'PBMS_ENABLED'.
+ *
+ */
+
+#include "xt_config.h"
+
+#ifdef PBMS_ENABLED
+
+#ifdef DRIZZLED
+#include <sys/stat.h>
+#include <drizzled/common_includes.h>
+#include <drizzled/plugin.h>
+#else
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+// No trailing ';' in the expansion: the macro must be usable as an expression.
+#define session_alloc(sess, size) thd_alloc(sess, size)
+#define current_session current_thd
+#endif
+
+// Accessors for the BLOB fields of a table. The arguments and the expansions
+// are parenthesized so that the macros can be used with any expression.
+#define GET_BLOB_FIELD(t, i)	((Field_blob *)((t)->field[(t)->s->blob_field[(i)]]))
+#define DB_NAME(f)				((f)->table->s->db.str)
+#define TAB_NAME(f)				(*((f)->table_name))
+
+#define PBMS_API	pbms_enabled_api
+
+#include "pbms_enabled.h"
+
+// The PBMS interface object through which all the functions in this file
+// talk to the PBMS engine.
+static PBMS_API pbms_api;
+
+// Describes this engine to PBMS. The name and the 'internal' flag are filled
+// in by pbms_initialize().
+PBMSEngineRec enabled_engine = {
+	MS_ENGINE_VERSION
+};
+
+//====================
+bool pbms_initialize(const char *engine_name, bool isServer, PBMSResultPtr result)
+{
+	// Take over at most 32 bytes of the name and make sure that it is
+	// terminated within them.
+	strncpy(enabled_engine.ms_engine_name, engine_name, 32);
+	enabled_engine.ms_engine_name[31] = 0;
+	enabled_engine.ms_internal = isServer;
+
+	// A zero result from registerEngine() means success.
+	return pbms_api.registerEngine(&enabled_engine, result) == 0;
+}
+
+
+//====================
+// Deregister the engine that was registered by pbms_initialize().
+void pbms_finalize()
+{
+	pbms_api.deregisterEngine(&enabled_engine);
+}
+
+//====================
+// For every LONGBLOB column of the row in 'row_buffer': have PBMS record a
+// reference to the BLOB and, if PBMS returns a BLOB URL that differs from the
+// column data, put that URL into the row in place of the data.
+//
+// Returns 0 on success, otherwise the error returned by PBMS with the details
+// in 'result'.
+int pbms_write_row_blobs(TABLE *table, uchar *row_buffer, PBMSResultPtr result)
+{
+	Field_blob *field;
+	char *blob_rec, *blob;
+	size_t packlength, i, org_length, length;
+	char blob_url_buffer[PBMS_BLOB_URL_SIZE];
+	int err;
+	String type_name;
+
+	if (table->s->blob_fields == 0)
+		return 0;
+
+	for (i= 0; i < table->s->blob_fields; i++) {
+		field = GET_BLOB_FIELD(table, i);
+
+		// Note: field->type() always returns MYSQL_TYPE_BLOB regardless of the type of BLOB
+		field->sql_type(type_name);
+		if (strcasecmp(type_name.c_ptr(), "LongBlob"))
+			continue;
+
+		// Get the blob record:
+		blob_rec = (char *)row_buffer + field->offset(field->table->record[0]);
+		packlength = field->pack_length() - field->table->s->blob_ptr_size;
+
+		memcpy(&blob, blob_rec +packlength, sizeof(char*));
+		org_length = field->get_length((uchar *)blob_rec);
+
+		// Start each column with an empty URL so that a URL left over from
+		// the previous column can never be mistaken for a new one.
+		blob_url_buffer[0] = 0;
+
+		// Signal PBMS to record a new reference to the BLOB.
+		// If 'blob' is not a BLOB URL then it will be stored in the repository as a new BLOB
+		// and a reference to it will be created.
+		err = pbms_api.retainBlob(DB_NAME(field), TAB_NAME(field), blob_url_buffer, blob, org_length, field->field_index, result);
+		if (err)
+			return err;
+
+		// retainBlob() succeeds with an empty URL when the PBMS engine is not
+		// running. No reference exists then and the column must be left as it
+		// is: replacing it would destroy the BLOB data.
+		if (!blob_url_buffer[0])
+			continue;
+
+		// If the BLOB length changed reset it.
+		// This will happen if the BLOB data was replaced with a BLOB reference.
+		length = strlen(blob_url_buffer)  +1;
+		if ((length != org_length) || memcmp(blob_url_buffer, blob, length)) {
+			if (length != org_length) {
+				field->store_length((uchar *)blob_rec, packlength, length);
+			}
+
+			if (length > org_length) {
+				// This can only happen if the BLOB URL is actually larger than the BLOB itself.
+				blob = (char *) session_alloc(current_session, length);
+				memcpy(blob_rec+packlength, &blob, sizeof(char*));
+			}
+			memcpy(blob, blob_url_buffer, length);
+		}
+	}
+
+	return 0;
+}
+
+//====================
+// For every LONGBLOB column of the row in 'row_buffer': have PBMS release the
+// reference to the BLOB held by the row.
+//
+// Returns 0 on success, otherwise the error returned by PBMS with the details
+// in 'result'.
+int pbms_delete_row_blobs(TABLE *table, const uchar *row_buffer, PBMSResultPtr result)
+{
+	String type_name;
+
+	for (size_t idx = 0; idx < table->s->blob_fields; idx++) {
+		Field_blob *blob_field = GET_BLOB_FIELD(table, idx);
+
+		// Note: type() always returns MYSQL_TYPE_BLOB regardless of the type
+		// of BLOB, so the SQL type name is used to pick out the LONGBLOBs.
+		blob_field->sql_type(type_name);
+		if (strcasecmp(type_name.c_ptr(), "LongBlob") != 0)
+			continue;
+
+		// The column is stored as a length followed by a pointer to the data.
+		const char *rec = (char *)row_buffer + blob_field->offset(blob_field->table->record[0]);
+		size_t len_bytes = blob_field->pack_length() - blob_field->table->s->blob_ptr_size;
+		size_t data_len = blob_field->get_length((uchar *)rec);
+
+		char *data;
+		memcpy(&data, rec + len_bytes, sizeof(char*));
+
+		// Signal PBMS to delete the reference to the BLOB.
+		int rc = pbms_api.releaseBlob(DB_NAME(blob_field), TAB_NAME(blob_field), data, data_len, result);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+#define MAX_NAME_SIZE 64
+// Split a table path of the form ".../<database>/<table>" into the database
+// and the table name. Both buffers must hold MAX_NAME_SIZE bytes; longer
+// names are truncated.
+//
+// 'tab_name' is left empty if the path contains no '/'. 'db_name' is left
+// empty if there is no second '/' in front of the database name.
+static void parse_table_path(const char *path, char *db_name, char *tab_name)
+{
+	const char *tab_sep, *db_start;
+	int len;
+
+	*db_name = *tab_name = 0;
+
+	// An empty path has no last character from which to start the search.
+	if (!path || !*path)
+		return;
+
+	// The table name is everything after the last '/'.
+	tab_sep = path + strlen(path) -1;
+	while ((tab_sep > path) && (*tab_sep != '/'))
+		tab_sep--;
+	if (*tab_sep != '/')
+		return;
+
+	strncpy(tab_name, tab_sep+1, MAX_NAME_SIZE);
+	tab_name[MAX_NAME_SIZE-1] = 0;
+
+	// If the '/' is the first character of the path then there is no database
+	// name, and nothing in front of the '/' that may be examined.
+	if (tab_sep == path)
+		return;
+
+	// The database name is found between the last '/' and the one before it.
+	db_start = tab_sep - 1;
+	while ((db_start > path) && (*db_start != '/'))
+		db_start--;
+	if (*db_start != '/')
+		return;
+	db_start++;
+
+	len = tab_sep - db_start;
+	if (len >= MAX_NAME_SIZE)
+		len = MAX_NAME_SIZE-1;
+
+	memcpy(db_name, db_start, len);
+	db_name[len] = 0;
+}
+
+//====================
+// Tell PBMS that a table has been renamed. Both paths must refer to the same
+// database, otherwise MS_ERR_INVALID_OPERATION is returned.
+int pbms_rename_table_with_blobs(const char *old_table_path, const char *new_table_path, PBMSResultPtr result)
+{
+	char old_db[MAX_NAME_SIZE], old_tab[MAX_NAME_SIZE];
+	char new_db[MAX_NAME_SIZE], new_tab[MAX_NAME_SIZE];
+
+	parse_table_path(old_table_path, old_db, old_tab);
+	parse_table_path(new_table_path, new_db, new_tab);
+
+	if (strcmp(old_db, new_db) == 0)
+		return pbms_api.renameTable(old_db, old_tab, new_tab, result);
+
+	result->mr_code = MS_ERR_INVALID_OPERATION;
+	strcpy(result->mr_message, "PBMS does not support renaming tables across databases.");
+	strcpy(result->mr_stack, "pbms_rename_table_with_blobs()");
+	return MS_ERR_INVALID_OPERATION;
+}
+
+//====================
+// Tell PBMS that the table at 'table_path' has been dropped.
+int pbms_delete_table_with_blobs(const char *table_path, PBMSResultPtr result)
+{
+	char database[MAX_NAME_SIZE];
+	char table[MAX_NAME_SIZE];
+
+	parse_table_path(table_path, database, table);
+	return pbms_api.dropTable(database, table, result);
+}
+
+//====================
+// Report the outcome of an operation to PBMS. Nothing is reported for a
+// table without BLOB columns; without a table the outcome is always reported.
+void pbms_completed(TABLE *table, bool ok)
+{
+	if (table && table->s->blob_fields == 0)
+		return;
+
+	pbms_api.completed(ok);
+}
+
+#elif defined(__WIN__)
+
+// Remove linker warning 4221 about empty file
+namespace { char dummy; };
+
+#endif // PBMS_ENABLED
diff --git a/storage/pbxt/src/pbms_enabled.h b/storage/pbxt/src/pbms_enabled.h
new file mode 100644
index 00000000000..7e62c32f34f
--- /dev/null
+++ b/storage/pbxt/src/pbms_enabled.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2009 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Barry Leslie
+ *
+ * 2009-07-16
+ *
+ * H&G2JCtL
+ *
+ * PBMS interface used to enable engines for use with the PBMS engine.
+ *
+ * For an example on how to build this into an engine have a look at the PBXT engine
+ * in file ha_pbxt.cc. Search for 'PBMS_ENABLED'.
+ *
+ */
+
+
+#ifndef __PBMS_ENABLED_H__
+#define __PBMS_ENABLED_H__
+
+#include "pbms.h"
+
+/*
+ * pbms_initialize() should be called from the engine plugin's 'init()' function.
+ * The engine_name is the name of your engine, "PBXT" or "InnoDB" for example.
+ *
+ * The isServer flag indicates if this entire server is being enabled. This is only
+ * true if this is being built into the server's handler code above the engine level
+ * calls. 
+ */
+extern bool pbms_initialize(const char *engine_name, bool isServer, PBMSResultPtr result);
+
+/*
+ * pbms_finalize() should be called from the engine plugin's 'deinit()' function.
+ */
+extern void pbms_finalize();
+
+/*
+ * pbms_write_row_blobs() should be called from the engine's 'write_row' function.
+ * It can alter the row data so it must be called before any other function using the row data.
+ * It should also be called from engine's 'update_row' function for the new row.
+ *
+ * pbms_completed() must be called after calling pbms_write_row_blobs() and just before
+ * returning from write_row() to indicate if the operation completed successfully.
+ */
+extern int pbms_write_row_blobs(TABLE *table, uchar *buf, PBMSResultPtr result);
+
+/*
+ * pbms_delete_row_blobs() should be called from the engine's 'delete_row' function.
+ * It should also be called from engine's 'update_row' function for the old row.
+ *
+ * pbms_completed() must be called after calling pbms_delete_row_blobs() and just before
+ * returning from delete_row() to indicate if the operation completed successfully.
+ */
+extern int pbms_delete_row_blobs(TABLE *table, const uchar *buf, PBMSResultPtr result);
+
+/*
+ * pbms_rename_table_with_blobs() should be called from the engine's 'rename_table' function.
+ *
+ * NOTE: Renaming tables across databases is not supported.
+ *
+ * pbms_completed() must be called after calling pbms_rename_table_with_blobs() and just before
+ * returning from rename_table() to indicate if the operation completed successfully.
+ */
+extern int pbms_rename_table_with_blobs(const char *old_table_path, const char *new_table_path, PBMSResultPtr result);
+
+/*
+ * pbms_delete_table_with_blobs() should be called from the engine's 'delete_table' function.
+ *
+ * NOTE: Currently pbms_delete_table_with_blobs() cannot be undone so it should only
+ * be called after the host engine has successfully dropped its table.
+ *
+ * pbms_completed() must be called after calling pbms_delete_table_with_blobs() and just before
+ * returning from delete_table() to indicate if the operation completed successfully.
+ */
+extern int pbms_delete_table_with_blobs(const char *table_path, PBMSResultPtr result);
+
+/*
+ * pbms_completed() must be called to indicate success or failure of an operation after having
+ * called  pbms_write_row_blobs(), pbms_delete_row_blobs(), pbms_rename_table_with_blobs(), or
+ * pbms_delete_table_with_blobs().
+ *
+ * pbms_completed() has the effect of committing or rolling back the changes made if the session
+ * is in 'autocommit' mode.
+ */
+extern void pbms_completed(TABLE *table, bool ok);
+
+#endif
diff --git a/storage/pbxt/src/pthread_xt.cc b/storage/pbxt/src/pthread_xt.cc
new file mode 100755
index 00000000000..e7f0632e9ae
--- /dev/null
+++ b/storage/pbxt/src/pthread_xt.cc
@@ -0,0 +1,787 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-03-22	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * This file contains windows specific code
+ */
+
+#include "xt_config.h"
+
+#ifdef XT_WIN
+#include <my_pthread.h>
+#else
+#include <sys/resource.h>
+#endif
+#include <errno.h>
+#include <limits.h>
+#include <string.h>
+
+#include "pthread_xt.h"
+#include "thread_xt.h"
+
+#ifdef XT_WIN
+
+/* Nothing needs to be set up in the Windows build. */
+xtPublic void xt_p_init_threading(void)
+{
+}
+
+xtPublic int xt_p_set_normal_priority(pthread_t thr)
+{
+	if (!SetThreadPriority (thr, THREAD_PRIORITY_NORMAL))
+		return GetLastError();
+	return 0;
+}
+
+xtPublic int xt_p_set_low_priority(pthread_t thr)
+{
+	if (!SetThreadPriority (thr, THREAD_PRIORITY_LOWEST))
+		return GetLastError();
+	return 0;
+}
+
+xtPublic int xt_p_set_high_priority(pthread_t thr)
+{
+	if (!SetThreadPriority (thr, THREAD_PRIORITY_HIGHEST))
+		return GetLastError();
+	return 0;
+}
+
+#define XT_RWLOCK_MAGIC 0x78AC390E
+
+/*
+ * Initialize a mutex, which is implemented as a Windows critical section.
+ * 'attr' is not used. With XT_THREAD_LOCK_INFO the mutex also gets lock-info
+ * data and the name 'n'. Always returns 0.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr, const char *n)
+#else
+xtPublic int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr)
+#endif
+{
+	InitializeCriticalSection(&mutex->mt_cs);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_init(&mutex->mt_lock_info, mutex);
+	mutex->mt_name = n;
+#endif
+	return 0;
+}
+
+/*
+ * Destroy a mutex: delete its critical section and, with
+ * XT_THREAD_LOCK_INFO, free its lock-info data. Always returns 0.
+ */
+xtPublic int xt_p_mutex_destroy(xt_mutex_type *mutex)
+{
+	DeleteCriticalSection(&mutex->mt_cs);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&mutex->mt_lock_info);
+#endif
+	return 0;
+}
+
+/*
+ * Lock a mutex by entering its critical section. With XT_THREAD_LOCK_INFO
+ * the lock-info data is updated once the lock is held. Always returns 0.
+ */
+xtPublic int xt_p_mutex_lock(xt_mutex_type *mx)
+{
+	EnterCriticalSection(&mx->mt_cs);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&mx->mt_lock_info);
+#endif
+	return 0;
+}
+
+/*
+ * Unlock a mutex by leaving its critical section. Always returns 0.
+ *
+ * NOTE(review): with XT_THREAD_LOCK_INFO the lock-info data is updated only
+ * after the critical section has been left; confirm that this order is
+ * intended.
+ */
+xtPublic int xt_p_mutex_unlock(xt_mutex_type *mx)
+{
+	LeaveCriticalSection(&mx->mt_cs);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&mx->mt_lock_info);
+#endif
+	return 0;
+}
+
+/*
+ * Try to lock a mutex without waiting.
+ *
+ * Returns 0 if the lock was taken and WAIT_TIMEOUT if it was not. If
+ * _WIN32_WINNT is below 0x0400 the function falls back to a blocking
+ * lock and always returns 0.
+ */
+xtPublic int xt_p_mutex_trylock(xt_mutex_type *mutex)
+{
+#if(_WIN32_WINNT >= 0x0400)
+	/* NOTE: MySQL bug! was using?!
+	 * pthread_mutex_trylock(A) (WaitForSingleObject((A), 0) == WAIT_TIMEOUT)
+	 */
+	if (TryEnterCriticalSection(&mutex->mt_cs)) {
+#ifdef XT_THREAD_LOCK_INFO
+		xt_thread_lock_info_add_owner(&mutex->mt_lock_info);
+#endif
+		return 0;
+	}
+	return WAIT_TIMEOUT;
+#else
+	/* No try-lock is used in this configuration: wait for the lock. */
+	EnterCriticalSection(&mutex->mt_cs);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&mutex->mt_lock_info);
+#endif
+	return 0;
+#endif
+}
+
+/*
+ * Initialize a read/write lock. The lock consists of two mutexes
+ * (rw_ex_lock and rw_sh_lock), a condition (rw_sh_cond) and three counters.
+ * 'attr' is not used.
+ *
+ * Returns 0 on success, ERROR_BAD_ARGUMENTS if 'rwl' is NULL, or the error
+ * of the initialization that failed. Whatever was initialized before the
+ * failure is destroyed again.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic int xt_p_rwlock_init(xt_rwlock_type *rwl, const pthread_condattr_t *attr, const char *n)
+#else
+xtPublic int xt_p_rwlock_init(xt_rwlock_type *rwl, const pthread_condattr_t *attr)
+#endif
+{
+	int result;
+
+	if (rwl == NULL)
+		return ERROR_BAD_ARGUMENTS;
+
+	rwl->rw_sh_count = 0;
+	rwl->rw_ex_count = 0;
+	rwl->rw_sh_complete_count = 0;
+
+	result = xt_p_mutex_init_with_autoname(&rwl->rw_ex_lock, NULL);
+	if (result != 0)
+		goto failed;
+
+	result = xt_p_mutex_init_with_autoname(&rwl->rw_sh_lock, NULL);
+	if (result != 0)
+		goto failed_2;
+
+	result = pthread_cond_init(&rwl->rw_sh_cond, NULL);
+	if (result != 0)
+		goto failed_3;
+
+	/* The magic number marks the lock as initialized; the other rwlock
+	 * functions check it before they touch the lock. */
+	rwl->rw_magic = XT_RWLOCK_MAGIC;
+#ifdef XT_THREAD_LOCK_INFO
+	rwl->rw_name = n;
+	xt_thread_lock_info_init(&rwl->rw_lock_info, rwl);
+#endif
+	return 0;
+
+	/* Undo, in reverse order, what was initialized before the failure. */
+	failed_3:
+	(void) xt_p_mutex_destroy(&rwl->rw_sh_lock);
+
+	failed_2:
+	(void) xt_p_mutex_destroy(&rwl->rw_ex_lock);
+
+	failed:
+	return result;
+}
+
+/*
+ * Destroy a read/write lock.
+ *
+ * Returns 0 on success, ERROR_BAD_ARGUMENTS if 'rwl' is NULL or was not
+ * initialized, ERROR_BUSY if the counters show that the lock is in use, or
+ * the first error returned by one of the mutex/condition calls.
+ */
+xtPublic int xt_p_rwlock_destroy(xt_rwlock_type *rwl)
+{
+	int result = 0, result1 = 0, result2 = 0;
+
+	if (rwl == NULL)
+		return ERROR_BAD_ARGUMENTS;
+
+	if (rwl->rw_magic != XT_RWLOCK_MAGIC)
+		return ERROR_BAD_ARGUMENTS;
+
+	/* Both mutexes are held while the counters are examined. */
+	if ((result = xt_p_mutex_lock(&rwl->rw_ex_lock)) != 0)
+		return result;
+
+	if ((result = xt_p_mutex_lock(&rwl->rw_sh_lock)) != 0) {
+		(void) xt_p_mutex_unlock(&rwl->rw_ex_lock);
+		return result;
+	}
+
+	/*
+	 * Check whether any threads own/wait for the lock (wait for ex.access);
+	 * report "BUSY" if so.
+	 */
+	if (rwl->rw_ex_count > 0 || rwl->rw_sh_count > rwl->rw_sh_complete_count) {
+		result = xt_p_mutex_unlock(&rwl->rw_sh_lock);
+		result1 = xt_p_mutex_unlock(&rwl->rw_ex_lock);
+		result2 = ERROR_BUSY;
+	}
+	else {
+		/* Clear the magic number first so that the other rwlock functions
+		 * reject the lock from now on. */
+		rwl->rw_magic = 0;
+
+		if ((result = xt_p_mutex_unlock(&rwl->rw_sh_lock)) != 0)
+		{
+			xt_p_mutex_unlock(&rwl->rw_ex_lock);
+			return result;
+		}
+
+		if ((result = xt_p_mutex_unlock(&rwl->rw_ex_lock)) != 0)
+			return result;
+
+		result = pthread_cond_destroy(&rwl->rw_sh_cond);
+		result1 = xt_p_mutex_destroy(&rwl->rw_sh_lock);
+		result2 = xt_p_mutex_destroy(&rwl->rw_ex_lock);
+	}
+
+	/* NOTE(review): this is also reached on the ERROR_BUSY path, where the
+	 * lock itself was not destroyed; confirm that freeing the lock-info data
+	 * is intended in that case. */
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&rwl->rw_lock_info);
+#endif
+
+	/* Report the first error that occurred. */
+	return (result != 0) ? result : ((result1 != 0) ? result1 : result2);
+}
+
+
+/*
+ * Acquire shared (read) access to the lock.
+ *
+ * The reader registers itself by incrementing rw_sh_count while holding
+ * rw_ex_lock; the mutex is released again before returning, so a shared
+ * lock holds no mutex.
+ * Returns ERROR_BAD_ARGUMENTS for a NULL or uninitialized lock, otherwise
+ * zero or the error of the failing mutex call.
+ */
+xtPublic int xt_p_rwlock_rdlock(xt_rwlock_type *rwl)
+{
+	int result;
+
+	if (rwl == NULL)
+		return ERROR_BAD_ARGUMENTS;
+
+	if (rwl->rw_magic != XT_RWLOCK_MAGIC)
+		return ERROR_BAD_ARGUMENTS;
+
+	if ((result = xt_p_mutex_lock(&rwl->rw_ex_lock)) != 0)
+		return result;
+
+	/* The counter reached its limit: fold the number of completed readers
+	 * back into rw_sh_count (under rw_sh_lock) so that it can keep counting.
+	 */
+	if (++rwl->rw_sh_count == INT_MAX) {
+		if ((result = xt_p_mutex_lock(&rwl->rw_sh_lock)) != 0)
+		{
+			(void) xt_p_mutex_unlock(&rwl->rw_ex_lock);
+			return result;
+		}
+
+		rwl->rw_sh_count -= rwl->rw_sh_complete_count;
+		rwl->rw_sh_complete_count = 0;
+
+		if ((result = xt_p_mutex_unlock(&rwl->rw_sh_lock)) != 0)
+		{
+			(void) xt_p_mutex_unlock(&rwl->rw_ex_lock);
+			return result;
+		}
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&rwl->rw_lock_info);
+#endif
+
+	return (xt_p_mutex_unlock (&(rwl->rw_ex_lock)));
+}
+
+/*
+ * Acquire exclusive (write) access to the lock, waiting for all active
+ * readers to finish.
+ *
+ * On success (return value zero) both rw_ex_lock and rw_sh_lock remain held
+ * until xt_p_rwlock_unlock() is called.
+ * On failure nothing is held: ERROR_BAD_ARGUMENTS is returned for a NULL or
+ * uninitialized lock, otherwise the error of the failing call.
+ */
+xtPublic int xt_p_rwlock_wrlock(xt_rwlock_type *rwl)
+{
+	int result;
+
+	if (rwl == NULL)
+		return ERROR_BAD_ARGUMENTS;
+
+	if (rwl->rw_magic != XT_RWLOCK_MAGIC)
+		return ERROR_BAD_ARGUMENTS;
+
+	if ((result = xt_p_mutex_lock (&rwl->rw_ex_lock)) != 0)
+		return result;
+
+	if ((result = xt_p_mutex_lock (&rwl->rw_sh_lock)) != 0) {
+		(void) xt_p_mutex_unlock (&rwl->rw_ex_lock);
+		return result;
+	}
+
+	if (rwl->rw_ex_count == 0) {
+		/* Discount the readers that have already unlocked. */
+		if (rwl->rw_sh_complete_count > 0) {
+			rwl->rw_sh_count -= rwl->rw_sh_complete_count;
+			rwl->rw_sh_complete_count = 0;
+		}
+
+		if (rwl->rw_sh_count > 0) {
+			/* A negative completed count tells the remaining readers that
+			 * a writer is waiting; the reader that brings it back to zero
+			 * signals rw_sh_cond.
+			 */
+			rwl->rw_sh_complete_count = -rwl->rw_sh_count;
+
+			do {
+				result = pthread_cond_wait (&rwl->rw_sh_cond, &rwl->rw_sh_lock.mt_cs);
+			}
+			while (result == 0 && rwl->rw_sh_complete_count < 0);
+
+			if (result == 0)
+				rwl->rw_sh_count = 0;
+		}
+	}
+
+	if (result != 0) {
+		/* The wait failed, so the write lock was not obtained. Turn the
+		 * "writer waiting" marker back into the count of readers still
+		 * active and release both mutexes, otherwise the lock would stay
+		 * blocked forever.
+		 */
+		rwl->rw_sh_count = -rwl->rw_sh_complete_count;
+		rwl->rw_sh_complete_count = 0;
+		(void) xt_p_mutex_unlock (&rwl->rw_sh_lock);
+		(void) xt_p_mutex_unlock (&rwl->rw_ex_lock);
+		return result;
+	}
+
+	rwl->rw_ex_count++;
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&rwl->rw_lock_info);
+#endif
+
+	return 0;
+}
+
+/*
+ * Try to acquire exclusive (write) access to the lock.
+ *
+ * Returns TRUE if the write lock was obtained; both rw_ex_lock and
+ * rw_sh_lock then remain held until xt_p_rwlock_unlock() is called.
+ * Returns FALSE, holding nothing, if the lock is NULL or uninitialized,
+ * if rw_ex_lock could not be taken without waiting, or if any other call
+ * failed.
+ *
+ * NOTE(review): once rw_ex_lock has been taken this still waits for active
+ * readers to finish, so the call can block - confirm callers expect that.
+ */
+xtPublic xtBool xt_p_rwlock_try_wrlock(xt_rwlock_type *rwl)
+{
+	int result;
+
+	if (rwl == NULL)
+		return FALSE;
+
+	if (rwl->rw_magic != XT_RWLOCK_MAGIC)
+		return FALSE;
+
+	if ((result = xt_p_mutex_trylock(&rwl->rw_ex_lock)) != 0)
+		return FALSE;
+
+	if ((result = xt_p_mutex_lock(&rwl->rw_sh_lock)) != 0) {
+		(void) xt_p_mutex_unlock(&rwl->rw_ex_lock);
+		return FALSE;
+	}
+
+	if (rwl->rw_ex_count == 0) {
+		/* Discount the readers that have already unlocked. */
+		if (rwl->rw_sh_complete_count > 0) {
+			rwl->rw_sh_count -= rwl->rw_sh_complete_count;
+			rwl->rw_sh_complete_count = 0;
+		}
+
+		if (rwl->rw_sh_count > 0) {
+			/* A negative completed count tells the remaining readers that
+			 * a writer is waiting; the reader that brings it back to zero
+			 * signals rw_sh_cond.
+			 */
+			rwl->rw_sh_complete_count = -rwl->rw_sh_count;
+
+			do {
+				result = pthread_cond_wait (&rwl->rw_sh_cond, &rwl->rw_sh_lock.mt_cs);
+			}
+			while (result == 0 && rwl->rw_sh_complete_count < 0);
+
+			if (result == 0)
+				rwl->rw_sh_count = 0;
+		}
+	}
+
+	if (result != 0) {
+		/* The wait failed, so the write lock was NOT obtained: restore
+		 * the reader count, release both mutexes and report the failure
+		 * instead of claiming success.
+		 */
+		rwl->rw_sh_count = -rwl->rw_sh_complete_count;
+		rwl->rw_sh_complete_count = 0;
+		(void) xt_p_mutex_unlock(&rwl->rw_sh_lock);
+		(void) xt_p_mutex_unlock(&rwl->rw_ex_lock);
+		return FALSE;
+	}
+
+	rwl->rw_ex_count++;
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_add_owner(&rwl->rw_lock_info);
+#endif
+
+	return TRUE;
+}
+
+/*
+ * Release a lock taken with xt_p_rwlock_rdlock(), xt_p_rwlock_wrlock() or
+ * xt_p_rwlock_try_wrlock().
+ *
+ * rw_ex_count decides which kind of lock is released: zero means a shared
+ * lock, anything else an exclusive lock.
+ * Returns ERROR_BAD_ARGUMENTS for a NULL or uninitialized lock, otherwise
+ * zero or the first error from the underlying calls.
+ */
+xtPublic int xt_p_rwlock_unlock(xt_rwlock_type *rwl)
+{
+	int result, result1;
+
+	if (rwl == NULL)
+		return (ERROR_BAD_ARGUMENTS);
+
+	if (rwl->rw_magic != XT_RWLOCK_MAGIC)
+		return ERROR_BAD_ARGUMENTS;
+
+	if (rwl->rw_ex_count == 0) {
+		/* Shared lock: count this reader as completed. */
+		if ((result = xt_p_mutex_lock(&rwl->rw_sh_lock)) != 0)
+			return result;
+
+		/* The count only reaches zero from below while a writer is
+		 * waiting in xt_p_rwlock_wrlock(); wake it up.
+		 */
+		if (++rwl->rw_sh_complete_count == 0)
+			result = pthread_cond_signal(&rwl->rw_sh_cond);
+
+		result1 = xt_p_mutex_unlock(&rwl->rw_sh_lock);
+	}
+	else {
+		/* Exclusive lock: both mutexes are still held from the write
+		 * lock call, release them now.
+		 */
+		rwl->rw_ex_count--;
+
+		result = xt_p_mutex_unlock(&rwl->rw_sh_lock);
+		result1 = xt_p_mutex_unlock(&rwl->rw_ex_lock);
+	}
+
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&rwl->rw_lock_info);
+#endif
+
+	return ((result != 0) ? result : result1);
+}
+
+/* Wait on a condition without a timeout: xt_p_cond_timedwait() with a NULL abstime. */
+xtPublic int xt_p_cond_wait(xt_cond_type *cond, xt_mutex_type *mutex)
+{
+	return xt_p_cond_timedwait(cond, mutex, NULL);
+}
+
+/*
+ * Wait on a condition until one of its events is set or, when abstime is
+ * not NULL, until the timeout expires.
+ *
+ * The critical section of mt is left while waiting and entered again before
+ * returning. Returns ETIMEDOUT if the wait timed out and zero for every
+ * other wait result.
+ */
+xtPublic int xt_p_cond_timedwait(xt_cond_type *cond, xt_mutex_type *mt, struct timespec *abstime)
+{
+	pthread_mutex_t	*mutex = &mt->mt_cs;
+	int				result;
+	long			timeout; 
+	union ft64		now;
+
+	if (abstime != NULL) {
+		GetSystemTimeAsFileTime(&now.ft);
+
+		/* File times count 100ns units: divide by 10000 to get milliseconds.
+		 * The result is limited to the range 0..max_timeout_msec.
+		 */
+		timeout = (long)((abstime->tv.i64 - now.i64) / 10000);
+		if (timeout < 0)
+			timeout = 0L;
+		if (timeout > abstime->max_timeout_msec)
+			timeout = abstime->max_timeout_msec;
+	}
+	else
+		timeout= INFINITE;
+
+	/* NOTE(review): presumably this blocks new waiters while a broadcast is
+	 * still being delivered - confirm against the broadcast implementation.
+	 */
+	WaitForSingleObject(cond->broadcast_block_event, INFINITE);
+
+	EnterCriticalSection(&cond->lock_waiting);
+	cond->waiting++;
+	LeaveCriticalSection(&cond->lock_waiting);
+
+	LeaveCriticalSection(mutex);
+
+	result= WaitForMultipleObjects(2, cond->events, FALSE, timeout);
+
+	EnterCriticalSection(&cond->lock_waiting);
+	cond->waiting--;
+	
+	if (cond->waiting == 0) {
+		/* The last waiter must reset the broadcast
+		 * state (whether there was a broadcast or not)!
+		 */
+		ResetEvent(cond->events[xt_cond_type::BROADCAST]);
+		SetEvent(cond->broadcast_block_event);
+	}
+	LeaveCriticalSection(&cond->lock_waiting);
+	
+	EnterCriticalSection(mutex);
+
+	return result == WAIT_TIMEOUT ? ETIMEDOUT : 0;
+}
+
+/*
+ * Wait for a thread to terminate.
+ *
+ * The thread handle is polled with a 10 second wait. Returns zero once the
+ * handle is signalled, or when a wait timed out and the thread is no longer
+ * reported as STILL_ACTIVE; returns GetLastError() if the wait failed.
+ * The value parameter is not used: no thread result is passed back.
+ */
+xtPublic int xt_p_join(pthread_t thread, void **value)
+{
+	DWORD exitcode;
+
+	while(1) {
+		switch (WaitForSingleObject(thread, 10000)) {
+			case WAIT_OBJECT_0:
+				return 0;
+			case WAIT_TIMEOUT:
+				/* Do not call CloseHandle(thread) here! According to the Win docs:
+				 * _endthread automatically closes the thread handle
+				 * (whereas _endthreadex does not). Therefore, when using
+				 * _beginthread and _endthread, do not explicitly close the
+				 * thread handle by calling the Win32 CloseHandle API.
+				 */
+				/* This is done so that if the thread was not [yet] in the running
+				 * state when this function was called we won't deadlock here.
+				 */
+				if (GetExitCodeThread(thread, &exitcode) && (exitcode == STILL_ACTIVE))
+					break;
+				return 0;
+			case WAIT_FAILED:
+				return GetLastError();
+		}
+	}
+
+	return 0;
+}
+
+#else // XT_WIN
+
+/* Scheduling policy that pth_set_priority() passes to pthread_setschedparam(). */
+#ifdef __darwin__
+#define POLICY			SCHED_RR
+#else
+#define POLICY			pth_policy
+#endif
+
+/* All of the following are filled in by pth_get_priority_limits(): */
+static int pth_policy;				/* Policy reported by pthread_getschedparam() */
+static int pth_normal_priority;		/* Priority of the calling thread at that time */
+static int pth_min_priority;		/* Lowest priority found */
+static int pth_max_priority;		/* Highest priority found */
+
+/* Apply the given priority to a thread using the POLICY scheduling
+ * policy. The result of pthread_setschedparam() is passed through:
+ * zero on success, else an error number.
+ */
+static int pth_set_priority(pthread_t thread, int priority)
+{
+	struct sched_param	param;
+
+	memset(&param, 0, sizeof(param));
+	param.sched_priority = priority;
+
+	return pthread_setschedparam(thread, POLICY, &param);
+}
+
+/*
+ * Determine the scheduling policy, the normal priority and the priority
+ * range of the calling thread, and store them in pth_policy,
+ * pth_normal_priority, pth_min_priority and pth_max_priority.
+ *
+ * On XT_FREEBSD the range is queried from the scheduler. Elsewhere it is
+ * found by probing: the priority of the calling thread is lowered and then
+ * raised one step at a time until setting it fails, after which the
+ * original scheduling parameters are restored.
+ */
+static void pth_get_priority_limits(void)
+{
+	/* NOTE(review): self is presumably referenced by XT_CONTEXT - confirm. */
+	XTThreadPtr			self = NULL;
+	struct sched_param	sp;
+	int					err;
+	int					start;
+
+	/* Save original priority: */
+	err = pthread_getschedparam(pthread_self(), &pth_policy, &sp);
+	if (err) {
+		xt_throw_errno(XT_CONTEXT, err);
+		return;
+	}
+	pth_normal_priority = sp.sched_priority;
+
+	start = sp.sched_priority;
+
+#ifdef XT_FREEBSD 
+	pth_min_priority = sched_get_priority_min(sched_getscheduler(0));
+	pth_max_priority = sched_get_priority_max(sched_getscheduler(0));
+#else
+	/* Search for the minimum priority: */
+	pth_min_priority = start;
+	for (;;) {
+		/* 2007-03-01: Corrected, pth_set_priority returns the error code
+		 * (thanks to Hakan for pointing out this bug!)
+		 */
+		if (pth_set_priority(pthread_self(), pth_min_priority-1) != 0)
+			break;
+		pth_min_priority--;
+	}
+
+	/* Search for the maximum priority: */
+	pth_max_priority = start;
+	for (;;) {
+		if (pth_set_priority(pthread_self(), pth_max_priority+1) != 0)
+			break;
+		pth_max_priority++;
+	}
+
+	/* Restore original priority: */
+	pthread_setschedparam(pthread_self(), pth_policy, &sp);
+#endif
+}
+
+/* Initialize the threading layer: determines the usable thread priority range. */
+xtPublic void xt_p_init_threading(void)
+{
+	pth_get_priority_limits();
+}
+
+/* Give a thread the lowest available priority.
+ * Returns zero on success, else an error number.
+ */
+xtPublic int xt_p_set_low_priority(pthread_t thr)
+{
+	int rc;
+
+	/* A real priority range is available: use the thread scheduler. */
+	if (pth_min_priority != pth_max_priority)
+		return pth_set_priority(thr, pth_min_priority);
+
+	/* Under Linux the priority of normal (non-runtime)
+	 * threads is set using the standard methods
+	 * for setting process priority.
+	 *
+	 * We could set who == 0 because it should have the same effect
+	 * as using the PID.
+	 *
+	 * -20 = highest, 20 = lowest
+	 */
+	rc = setpriority(PRIO_PROCESS, getpid(), 20);
+	return (rc == -1) ? errno : 0;
+}
+
+/* Give a thread back its normal priority.
+ * Returns zero on success, else an error number.
+ */
+xtPublic int xt_p_set_normal_priority(pthread_t thr)
+{
+	int rc;
+
+	/* A real priority range is available: use the thread scheduler. */
+	if (pth_min_priority != pth_max_priority)
+		return pth_set_priority(thr, pth_normal_priority);
+
+	/* No range to choose from: fall back to the process priority (0). */
+	rc = setpriority(PRIO_PROCESS, getpid(), 0);
+	return (rc == -1) ? errno : 0;
+}
+
+/* Give a thread the highest available priority.
+ * Returns zero on success, else an error number.
+ */
+xtPublic int xt_p_set_high_priority(pthread_t thr)
+{
+	int rc;
+
+	/* A real priority range is available: use the thread scheduler. */
+	if (pth_min_priority != pth_max_priority)
+		return pth_set_priority(thr, pth_max_priority);
+
+	/* No range to choose from: fall back to the process priority (-20). */
+	rc = setpriority(PRIO_PROCESS, getpid(), -20);
+	return (rc == -1) ? errno : 0;
+}
+
+#ifdef DEBUG_LOCKING
+
+/*
+ * Debug version of mutex lock: locks the underlying pthread mutex and, on
+ * success, records the locking thread and the source location (line, file)
+ * of the call. Prints a trace line if tracing is enabled for the mutex.
+ *
+ * Returns the result of pthread_mutex_lock().
+ */
+xtPublic int xt_p_mutex_lock(xt_mutex_type *mutex, u_int line, const char *file)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(mutex->mu_init == 12345);
+	r = pthread_mutex_lock(&mutex->mu_plock);
+	if (r == 0) {
+		if (mutex->mu_trace)
+			printf("==LOCK mutex %d %s:%d\n", (int) mutex->mu_trace, file, (int) line);
+		ASSERT_NS(!mutex->mu_locker);
+		mutex->mu_locker = self;
+		mutex->mu_line = line;
+		mutex->mu_file = file;
+#ifdef XT_THREAD_LOCK_INFO
+		/* Only a successful lock makes this thread an owner
+		 * (as in xt_p_mutex_trylock()).
+		 */
+		xt_thread_lock_info_add_owner(&mutex->mu_lock_info);
+#endif
+	}
+	return r;
+}
+
+/*
+ * Debug version of mutex unlock: asserts that the calling thread is the
+ * recorded locker, clears the locker and unlocks the underlying pthread
+ * mutex. Prints a trace line if tracing is enabled for the mutex.
+ *
+ * Returns the result of pthread_mutex_unlock().
+ */
+xtPublic int xt_p_mutex_unlock(xt_mutex_type *mutex)
+{
+	XTThreadPtr self = xt_get_self();
+
+	ASSERT_NS(mutex->mu_init == 12345);
+	ASSERT_NS(mutex->mu_locker == self);
+	/* Clear the locker while the mutex is still held. */
+	mutex->mu_locker = NULL;
+	if (mutex->mu_trace)
+		printf("UNLOCK mutex %d\n", (int) mutex->mu_trace);
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&mutex->mu_lock_info);
+#endif
+	return pthread_mutex_unlock(&mutex->mu_plock);
+}
+
+/*
+ * Debug version of mutex destroy: marks the mutex as destroyed (mu_init is
+ * changed from 12345 to 11111) and destroys the underlying pthread mutex.
+ *
+ * Returns the result of pthread_mutex_destroy().
+ */
+xtPublic int xt_p_mutex_destroy(xt_mutex_type *mutex)
+{
+	//ASSERT_NS(mutex->mu_init == 12345);
+	mutex->mu_init = 11111;
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&mutex->mu_lock_info);
+#endif
+	return pthread_mutex_destroy(&mutex->mu_plock);
+}
+
+/*
+ * Debug version of mutex trylock: on success the calling thread is recorded
+ * as the locker. Unlike xt_p_mutex_lock() no source location is recorded.
+ *
+ * Returns the result of pthread_mutex_trylock().
+ */
+xtPublic int xt_p_mutex_trylock(xt_mutex_type *mutex)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(mutex->mu_init == 12345);
+	r = pthread_mutex_trylock(&mutex->mu_plock);
+	if (r == 0) {
+		ASSERT_NS(!mutex->mu_locker);
+		mutex->mu_locker = self;
+#ifdef XT_THREAD_LOCK_INFO
+		xt_thread_lock_info_add_owner(&mutex->mu_lock_info);
+#endif
+	}
+	return r;
+}
+
+/*
+ * Debug version of mutex init: marks the mutex as initialized (mu_init =
+ * 12345), switches tracing off, clears the locker and initializes the
+ * underlying pthread mutex with attr. With XT_THREAD_LOCK_INFO the name n
+ * is stored in the mutex as well.
+ *
+ * Returns the result of pthread_mutex_init().
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr, const char *n)
+#else
+xtPublic int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr)
+#endif
+{
+	mutex->mu_init = 12345;
+	mutex->mu_trace = FALSE;
+	mutex->mu_locker = NULL;
+#ifdef XT_THREAD_LOCK_INFO
+	mutex->mu_name = n;
+	xt_thread_lock_info_init(&mutex->mu_lock_info, mutex);
+#endif
+	return pthread_mutex_init(&mutex->mu_plock, attr);
+}
+
+/*
+ * Debug version of condition wait: the calling thread must be the recorded
+ * locker of the mutex. The locker is cleared for the duration of the wait
+ * and set again afterwards.
+ *
+ * Returns the result of pthread_cond_wait().
+ */
+xtPublic int xt_p_cond_wait(xt_cond_type *cond, xt_mutex_type *mutex)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(mutex->mu_init == 12345);
+	ASSERT_NS(mutex->mu_locker == self);
+	/* The wait releases the mutex, so nobody is the locker meanwhile. */
+	mutex->mu_locker = NULL;
+	r = pthread_cond_wait(cond, &mutex->mu_plock);
+	ASSERT_NS(!mutex->mu_locker);
+	mutex->mu_locker = self;
+	return r;
+}
+
+/*
+ * Debug version of timed condition wait: the calling thread must be the
+ * recorded locker of the mutex. The locker is cleared for the duration of
+ * the wait and set again afterwards.
+ *
+ * Returns the result of pthread_cond_timedwait().
+ */
+xtPublic int xt_p_cond_timedwait(xt_cond_type *cond, xt_mutex_type *mutex, const struct timespec *abstime)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(mutex->mu_init == 12345);
+	ASSERT_NS(mutex->mu_locker == self);
+	/* The wait releases the mutex, so nobody is the locker meanwhile. */
+	mutex->mu_locker = NULL;
+	r = pthread_cond_timedwait(cond, &mutex->mu_plock, abstime);
+	ASSERT_NS(!mutex->mu_locker);
+	mutex->mu_locker = self;
+	return r;
+}
+
+/*
+ * Debug version of read lock: takes a shared lock on the underlying
+ * pthread rwlock.
+ *
+ * Returns the result of pthread_rwlock_rdlock().
+ */
+xtPublic int xt_p_rwlock_rdlock(xt_rwlock_type *rwlock)
+{
+	int r;
+
+	ASSERT_NS(rwlock->rw_init == 67890);
+	r = pthread_rwlock_rdlock(&rwlock->rw_plock);
+#ifdef XT_THREAD_LOCK_INFO
+	/* Only a successful lock makes this thread an owner. */
+	if (r == 0)
+		xt_thread_lock_info_add_owner(&rwlock->rw_lock_info);
+#endif
+	return r;
+}
+
+/*
+ * Debug version of write lock: takes an exclusive lock on the underlying
+ * pthread rwlock and, on success, records the calling thread as the locker.
+ *
+ * Returns the result of pthread_rwlock_wrlock().
+ */
+xtPublic int xt_p_rwlock_wrlock(xt_rwlock_type *rwlock)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(rwlock->rw_init == 67890);
+	r = pthread_rwlock_wrlock(&rwlock->rw_plock);
+	if (r == 0) {
+		ASSERT_NS(!rwlock->rw_locker);
+		rwlock->rw_locker = self;
+#ifdef XT_THREAD_LOCK_INFO
+		/* Only a successful lock makes this thread an owner
+		 * (as in xt_p_rwlock_try_wrlock()).
+		 */
+		xt_thread_lock_info_add_owner(&rwlock->rw_lock_info);
+#endif
+	}
+	return r;
+}
+
+/*
+ * Debug version of try write lock: attempts to take an exclusive lock on
+ * the underlying pthread rwlock and, on success, records the calling thread
+ * as the locker.
+ *
+ * Returns TRUE if the lock was obtained (pthread_rwlock_trywrlock()
+ * returned zero), otherwise FALSE.
+ */
+xtPublic xtBool xt_p_rwlock_try_wrlock(xt_rwlock_type *rwlock)
+{
+	XTThreadPtr self = xt_get_self();
+	int			r;
+
+	ASSERT_NS(rwlock->rw_init == 67890);
+	r = pthread_rwlock_trywrlock(&rwlock->rw_plock);
+	if (r == 0) {
+		ASSERT_NS(!rwlock->rw_locker);
+		rwlock->rw_locker = self;
+#ifdef XT_THREAD_LOCK_INFO
+		xt_thread_lock_info_add_owner(&rwlock->rw_lock_info);
+#endif
+	}
+	return r == 0;
+}
+
+/*
+ * Debug version of rwlock unlock. If a write locker is recorded it must be
+ * the calling thread, and it is cleared; read locks have no recorded locker.
+ *
+ * Returns the result of pthread_rwlock_unlock().
+ */
+xtPublic int xt_p_rwlock_unlock(xt_rwlock_type *rwlock)
+{
+	XTThreadPtr self = xt_get_self();
+
+	ASSERT_NS(rwlock->rw_init == 67890);
+	/* rw_locker is only set by the write lock functions. */
+	if (rwlock->rw_locker) {
+		ASSERT_NS(rwlock->rw_locker == self);
+		rwlock->rw_locker = NULL;
+	}
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_release_owner(&rwlock->rw_lock_info);
+#endif
+	return pthread_rwlock_unlock(&rwlock->rw_plock);
+}
+
+/*
+ * Debug version of rwlock destroy: marks the lock as no longer initialized
+ * (rw_init = 0) and destroys the underlying pthread rwlock.
+ *
+ * Returns the result of pthread_rwlock_destroy().
+ */
+xtPublic int xt_p_rwlock_destroy(xt_rwlock_type *rwlock)
+{
+	ASSERT_NS(rwlock->rw_init == 67890);
+	rwlock->rw_init = 0;
+#ifdef XT_THREAD_LOCK_INFO
+	xt_thread_lock_info_free(&rwlock->rw_lock_info);
+#endif
+	return pthread_rwlock_destroy(&rwlock->rw_plock);
+}
+
+/*
+ * Debug version of rwlock init: marks the lock as initialized (rw_init =
+ * 67890), resets the reader count and the locker, and initializes the
+ * underlying pthread rwlock with attr. With XT_THREAD_LOCK_INFO the name n
+ * is stored in the lock as well.
+ *
+ * Returns the result of pthread_rwlock_init().
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_rwlockattr_t *attr, const char *n)
+#else
+xtPublic int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_rwlockattr_t *attr)
+#endif
+{
+	rwlock->rw_init = 67890;
+	rwlock->rw_readers = 0;
+	rwlock->rw_locker = NULL;
+#ifdef XT_THREAD_LOCK_INFO
+	rwlock->rw_name = n;
+	xt_thread_lock_info_init(&rwlock->rw_lock_info, rwlock);
+#endif
+	return pthread_rwlock_init(&rwlock->rw_plock, attr);
+}
+
+#endif // DEBUG_LOCKING
+
+#endif // XT_WIN
+
diff --git a/storage/pbxt/src/pthread_xt.h b/storage/pbxt/src/pthread_xt.h
new file mode 100755
index 00000000000..dccc5779aad
--- /dev/null
+++ b/storage/pbxt/src/pthread_xt.h
@@ -0,0 +1,297 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-03-22	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * This file contains windows specific code.
+ */
+
+#ifndef __win_xt_h__
+#define __win_xt_h__
+
+#ifdef XT_WIN
+#include <windef.h>
+#include <my_pthread.h>
+#else
+#include <pthread.h>
+#endif
+
+#include "locklist_xt.h"
+
+#ifdef DEBUG
+//#define DEBUG_LOCKING
+#endif
+
+#define xt_cond_struct			_opaque_pthread_cond_t
+#define xt_cond_type			pthread_cond_t
+
+#define xt_cond_wait			pthread_cond_wait
+#define xt_cond_wakeall			pthread_cond_broadcast
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+void	xt_p_init_threading(void);
+int		xt_p_set_normal_priority(pthread_t thr);
+int		xt_p_set_low_priority(pthread_t thr);
+int		xt_p_set_high_priority(pthread_t thr);
+#ifdef	__cplusplus
+}
+#endif
+
+#ifdef XT_WIN
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef LPVOID pthread_key_t;
+
+/* Windows mutex: a critical section, plus lock tracking info when
+ * XT_THREAD_LOCK_INFO is defined.
+ */
+typedef struct xt_mutex_struct {
+	CRITICAL_SECTION	mt_cs;			/* The underlying critical section */
+#ifdef XT_THREAD_LOCK_INFO
+	const char		   *mt_name;		/* NOTE(review): presumably the name given at init - confirm */
+	XTThreadLockInfoRec mt_lock_info;
+#endif
+} xt_mutex_type;
+
+/* Windows read/write lock, built from two mutexes, a condition and counters. */
+typedef struct xt_rwlock_struct {
+  /* Held by a writer for as long as it owns the lock; taken briefly by readers to register. */
+  xt_mutex_type			rw_ex_lock;
+  /* Protects rw_sh_complete_count; also held by a writer while it owns the lock. */
+  xt_mutex_type			rw_sh_lock;
+  /* Signalled by the last reader to complete while a writer is waiting. */
+  pthread_cond_t		rw_sh_cond;
+  /* Number of read locks granted. */
+  int					rw_sh_count;
+  /* Number of write locks currently held. */
+  int					rw_ex_count;
+  /* Number of read locks released; negative while a writer waits for readers. */
+  int					rw_sh_complete_count;
+  /* XT_RWLOCK_MAGIC while the lock is initialized, zero after destroy. */
+  int					rw_magic;
+#ifdef XT_THREAD_LOCK_INFO
+	const char		   *rw_name;		/* Name given at init */
+	XTThreadLockInfoRec rw_lock_info;
+#endif
+} xt_rwlock_type;
+
+#ifdef XT_THREAD_LOCK_INFO
+int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr, const char *name);
+#else
+int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr);
+#endif
+int xt_p_mutex_destroy(xt_mutex_type *mutex);
+int xt_p_mutex_lock(xt_mutex_type *mx);
+int xt_p_mutex_unlock(xt_mutex_type *mx);
+int xt_p_mutex_trylock(xt_mutex_type *mutex);
+
+#ifdef XT_THREAD_LOCK_INFO
+int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_condattr_t *attr, const char *name);
+#else
+int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_condattr_t *attr);
+#endif
+int		xt_p_rwlock_destroy(xt_rwlock_type *rwlock);
+int		xt_p_rwlock_rdlock(xt_rwlock_type *mx);
+int		xt_p_rwlock_wrlock(xt_rwlock_type *mx);
+xtBool	xt_p_rwlock_try_wrlock(xt_rwlock_type *rwl);
+int		xt_p_rwlock_unlock(xt_rwlock_type *mx);
+
+int		xt_p_cond_wait(xt_cond_type *cond, xt_mutex_type *mutex);
+int		xt_p_cond_timedwait(xt_cond_type *cond, xt_mutex_type *mutex, struct timespec *abstime);
+
+int xt_p_join(pthread_t thread, void **value);
+
+#ifdef	__cplusplus
+}
+#endif
+
+/* Initialize a rwlock with an explicit name, or with a name derived from
+ * the lock argument. Without XT_THREAD_LOCK_INFO xt_p_rwlock_init() has no
+ * name parameter, so the name is dropped.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_p_rwlock_init_with_name(a,b,c)   xt_p_rwlock_init(a,b,c)
+#define xt_p_rwlock_init_with_autoname(a,b) xt_p_rwlock_init_with_name(a,b,LOCKLIST_ARG_SUFFIX(a))
+#else
+#define xt_p_rwlock_init_with_name(a,b,c)   xt_p_rwlock_init(a,b)
+#define xt_p_rwlock_init_with_autoname(a,b) xt_p_rwlock_init(a,b)
+#endif
+
+#define xt_slock_rwlock_ns		xt_p_rwlock_rdlock
+#define xt_xlock_rwlock_ns		xt_p_rwlock_wrlock
+#define xt_xlock_try_rwlock_ns	xt_p_rwlock_try_wrlock
+#define xt_unlock_rwlock_ns		xt_p_rwlock_unlock
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_p_mutex_init_with_name(a,b,c)   xt_p_mutex_init(a,b,c)
+#define xt_p_mutex_init_with_autoname(a,b) xt_p_mutex_init_with_name(a,b,LOCKLIST_ARG_SUFFIX(a))
+#else
+#define xt_p_mutex_init_with_name(a,b,c)   xt_p_mutex_init(a,b)
+#define xt_p_mutex_init_with_autoname(a,b) xt_p_mutex_init(a,b)
+#endif
+#define xt_lock_mutex_ns		xt_p_mutex_lock
+#define xt_unlock_mutex_ns		xt_p_mutex_unlock
+#define xt_mutex_trylock		xt_p_mutex_trylock
+
+#else // XT_WIN
+
+/* Hands off! Remove any macro redefinitions of the following pthread names. */
+#ifdef pthread_mutex_t
+#undef pthread_mutex_t
+#endif
+#ifdef pthread_rwlock_t
+#undef pthread_rwlock_t
+#endif
+#ifdef pthread_mutex_init
+#undef pthread_mutex_init
+#endif
+#ifdef pthread_mutex_destroy
+#undef pthread_mutex_destroy
+#endif
+#ifdef pthread_mutex_lock
+#undef pthread_mutex_lock
+#endif
+#ifdef pthread_mutex_unlock
+#undef pthread_mutex_unlock
+#endif
+#ifdef pthread_cond_wait
+#undef pthread_cond_wait
+#endif
+#ifdef pthread_cond_broadcast
+#undef pthread_cond_broadcast
+#endif
+#ifdef pthread_mutex_trylock
+#undef pthread_mutex_trylock
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Redefinition of pthread locking, for debugging
+ */
+
+struct XTThread;
+
+
+/* Initialize a mutex or rwlock with an explicit name, or with a name
+ * derived from the lock argument. Without XT_THREAD_LOCK_INFO the init
+ * functions have no name parameter, so the name is dropped.
+ */
+#ifdef XT_THREAD_LOCK_INFO
+
+#define xt_p_mutex_init_with_name(a,b,c)   xt_p_mutex_init(a,b,c)
+#define xt_p_mutex_init_with_autoname(a,b) xt_p_mutex_init_with_name(a,b,LOCKLIST_ARG_SUFFIX(a))
+
+#define xt_p_rwlock_init_with_name(a,b,c)   xt_p_rwlock_init(a,b,c)
+#define xt_p_rwlock_init_with_autoname(a,b) xt_p_rwlock_init_with_name(a,b,LOCKLIST_ARG_SUFFIX(a))
+
+#else
+
+#define xt_p_mutex_init_with_name(a,b,c)   xt_p_mutex_init(a,b)
+#define xt_p_mutex_init_with_autoname(a,b) xt_p_mutex_init(a,b)
+
+#define xt_p_rwlock_init_with_name(a,b,c)   xt_p_rwlock_init(a,b)
+#define xt_p_rwlock_init_with_autoname(a,b) xt_p_rwlock_init(a,b)
+
+#endif
+
+#ifdef DEBUG_LOCKING
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/* Debug mutex: a pthread mutex plus information about who locked it. */
+typedef struct xt_mutex_struct {
+	/* 12345 while initialized, 11111 after destroy */
+	unsigned short				mu_init;
+	/* Non-zero: lock/unlock calls print a trace line */
+	unsigned short				mu_trace;
+	/* Source line of the last successful xt_p_mutex_lock() */
+	unsigned int				mu_line;
+	/* Source file of the last successful xt_p_mutex_lock() */
+	const char					*mu_file;
+	/* Thread currently holding the mutex, or NULL */
+	struct XTThread				*mu_locker;
+	/* The underlying pthread mutex */
+	pthread_mutex_t				mu_plock;
+#ifdef XT_THREAD_LOCK_INFO
+	/* Name given at init */
+	const char		   			*mu_name;
+	XTThreadLockInfoRec 		mu_lock_info;
+#endif
+} xt_mutex_type;
+
+/* Debug rwlock: a pthread rwlock plus information about the write locker. */
+typedef struct xt_rwlock_struct {
+	/* 67890 while initialized, zero after destroy */
+	u_int						rw_init;
+	/* Set to zero at init */
+	volatile u_int				rw_readers;
+	/* Thread currently holding the write lock, or NULL */
+	struct XTThread				*rw_locker;
+	/* The underlying pthread rwlock */
+	pthread_rwlock_t			rw_plock;
+#ifdef XT_THREAD_LOCK_INFO
+	/* Name given at init */
+	const char		   			*rw_name;
+	XTThreadLockInfoRec 		rw_lock_info;
+#endif
+} xt_rwlock_type;
+
+int		xt_p_rwlock_rdlock(xt_rwlock_type *mx);
+int		xt_p_rwlock_wrlock(xt_rwlock_type *mx);
+xtBool	xt_p_rwlock_try_wrlock(xt_rwlock_type *mx);
+int		xt_p_rwlock_unlock(xt_rwlock_type *mx);
+
+int xt_p_mutex_lock(xt_mutex_type *mx, u_int line, const char *file);
+int xt_p_mutex_unlock(xt_mutex_type *mx);
+int xt_p_mutex_trylock(xt_mutex_type *mutex);
+int xt_p_mutex_destroy(xt_mutex_type *mutex);
+#ifdef XT_THREAD_LOCK_INFO
+int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr, const char *name);
+#else
+int xt_p_mutex_init(xt_mutex_type *mutex, const pthread_mutexattr_t *attr);
+#endif
+int xt_p_rwlock_destroy(xt_rwlock_type * rwlock);
+#ifdef XT_THREAD_LOCK_INFO
+int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_rwlockattr_t *attr, const char *name);
+#else
+int xt_p_rwlock_init(xt_rwlock_type *rwlock, const pthread_rwlockattr_t *attr);
+#endif
+int xt_p_cond_wait(xt_cond_type *cond, xt_mutex_type *mutex);
+int xt_p_cond_timedwait(xt_cond_type *cond, xt_mutex_type *mutex, const struct timespec *abstime);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#define xt_slock_rwlock_ns			xt_p_rwlock_rdlock
+#define xt_xlock_rwlock_ns			xt_p_rwlock_wrlock
+#define xt_xlock_try_rwlock_ns		xt_p_rwlock_try_wrlock
+#define xt_unlock_rwlock_ns			xt_p_rwlock_unlock
+
+#define xt_lock_mutex_ns(x)			xt_p_mutex_lock(x, __LINE__, __FILE__)
+#define xt_unlock_mutex_ns			xt_p_mutex_unlock
+#define xt_mutex_trylock			xt_p_mutex_trylock
+
+#else // DEBUG_LOCKING
+
+#define xt_rwlock_struct			_opaque_pthread_rwlock_t
+#define xt_mutex_struct				_opaque_pthread_mutex_t
+
+#define xt_rwlock_type				pthread_rwlock_t
+#define xt_mutex_type				pthread_mutex_t
+
+#define xt_slock_rwlock_ns			pthread_rwlock_rdlock
+#define xt_xlock_rwlock_ns			pthread_rwlock_wrlock
+#define xt_xlock_try_rwlock_ns(x)	(pthread_rwlock_trywrlock(x) == 0)
+#define xt_unlock_rwlock_ns			pthread_rwlock_unlock
+
+#define xt_lock_mutex_ns			pthread_mutex_lock
+#define xt_unlock_mutex_ns			pthread_mutex_unlock
+#define xt_mutex_trylock			pthread_mutex_trylock
+
+#define xt_p_mutex_trylock			pthread_mutex_trylock
+#define xt_p_mutex_destroy			pthread_mutex_destroy
+#define xt_p_mutex_init				pthread_mutex_init
+#define xt_p_rwlock_destroy			pthread_rwlock_destroy
+#define xt_p_rwlock_init			pthread_rwlock_init
+#define xt_p_cond_wait				pthread_cond_wait
+#define xt_p_cond_timedwait			pthread_cond_timedwait
+
+#endif // DEBUG_LOCKING
+
+#define xt_p_join				pthread_join
+
+#endif // XT_WIN
+
+#endif
diff --git a/storage/pbxt/src/restart_xt.cc b/storage/pbxt/src/restart_xt.cc
new file mode 100644
index 00000000000..93720f2b113
--- /dev/null
+++ b/storage/pbxt/src/restart_xt.cc
@@ -0,0 +1,3542 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-11-12	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * Restart and write data to the database.
+ */
+
+#include "xt_config.h"
+
+#include <signal.h>
+#include <time.h>
+
+#ifndef DRIZZLED
+#include "mysql_priv.h"
+#endif
+
+#include "ha_pbxt.h"
+
+#ifdef DRIZZLED
+#include <drizzled/data_home.h>
+using drizzled::plugin::Registry;
+#endif
+
+#include "xactlog_xt.h"
+#include "database_xt.h"
+#include "util_xt.h"
+#include "strutil_xt.h"
+#include "filesys_xt.h"
+#include "myxt_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+//#define DEBUG_PRINT
+//#define DEBUG_KEEP_LOGS
+//#define PRINT_LOG_ON_RECOVERY
+//#define TRACE_RECORD_DATA
+//#define SKIP_STARTUP_CHECKPOINT
+//#define NEVER_CHECKPOINT
+//#define TRACE_CHECKPOINT
+#endif
+
+#define PRINTF		printf
+//#define PRINTF		xt_ftracef
+//#define PRINTF		xt_trace
+
+/*
+ * -----------------------------------------------------------------------
+ * GLOBALS
+ */
+
+xtPublic int				pbxt_recovery_state;
+
+/*
+ * -----------------------------------------------------------------------
+ * UTILITIES
+ */
+
+#ifdef TRACE_RECORD_DATA
+/* Print len bytes of buf as two-digit hex values, each followed by a space. */
+static void xt_print_bytes(xtWord1 *buf, u_int len)
+{
+	u_int idx;
+
+	for (idx = 0; idx != len; ++idx)
+		PRINTF("%02x ", (u_int) buf[idx]);
+}
+#endif
+
+void xt_print_log_record(xtLogID log, xtLogOffset offset, XTXactLogBufferDPtr record)
+{
+	const char		*type = NULL;
+	const char		*rec_type = NULL;
+	xtOpSeqNo		op_no = 0;
+	xtTableID		tab_id = 0;
+	xtRowID			row_id = 0;
+	xtRecordID		rec_id = 0;
+	xtBool			xn_set = FALSE;
+	xtXactID		xn_id = 0;
+	char			buffer[200];
+	XTTabRecExtDPtr	rec_buf;
+	XTTabRecExtDPtr	ext_rec;
+	XTTabRecFixDPtr	fix_rec;
+	u_int			rec_len;
+	xtLogID			log_id = 0;
+	xtLogOffset		log_offset = 0;
+
+	rec_buf = NULL;
+	ext_rec = NULL;
+	fix_rec = NULL;
+	rec_len = 0;
+	switch (record->xl.xl_status_1) {
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_DELETE_BG:
+			op_no = XT_GET_DISK_4(record->xu.xu_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xu.xu_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xu.xu_rec_id_4);
+			xn_id = XT_GET_DISK_4(record->xu.xu_xact_id_4);
+			row_id = XT_GET_DISK_4(record->xu.xu_row_id_4);
+			rec_len = XT_GET_DISK_2(record->xu.xu_size_2);
+			xn_set = TRUE;
+			type="rec";
+			rec_buf = (XTTabRecExtDPtr) &record->xu.xu_rec_type_1;
+			ext_rec = (XTTabRecExtDPtr) &record->xu.xu_rec_type_1;
+			if (XT_REC_IS_EXT_DLOG(ext_rec->tr_rec_type_1)) {
+				log_id = XT_GET_DISK_2(ext_rec->re_log_id_2);
+				log_offset = XT_GET_DISK_6(ext_rec->re_log_offs_6);
+			}
+			else {
+				ext_rec = NULL;
+				fix_rec = (XTTabRecFixDPtr) &record->xu.xu_rec_type_1;
+			}
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			op_no = XT_GET_DISK_4(record->xf.xf_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xf.xf_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xf.xf_rec_id_4);
+			xn_id = XT_GET_DISK_4(record->xf.xf_xact_id_4);
+			row_id = XT_GET_DISK_4(record->xf.xf_row_id_4);
+			rec_len = XT_GET_DISK_2(record->xf.xf_size_2);
+			xn_set = TRUE;
+			type="rec";
+			rec_buf = (XTTabRecExtDPtr) &record->xf.xf_rec_type_1;
+			ext_rec = (XTTabRecExtDPtr) &record->xf.xf_rec_type_1;
+			if (XT_REC_IS_EXT_DLOG(ext_rec->tr_rec_type_1)) {
+				log_id = XT_GET_DISK_2(ext_rec->re_log_id_2);
+				log_offset = XT_GET_DISK_6(ext_rec->re_log_offs_6);
+			}
+			else {
+				ext_rec = NULL;
+				fix_rec = (XTTabRecFixDPtr) &record->xf.xf_rec_type_1;
+			}
+			break;
+		case XT_LOG_ENT_REC_FREED:
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			op_no = XT_GET_DISK_4(record->fr.fr_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->fr.fr_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->fr.fr_rec_id_4);
+			xn_id = XT_GET_DISK_4(record->fr.fr_xact_id_4);
+			xn_set = TRUE;
+			type="rec";
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			op_no = XT_GET_DISK_4(record->rb.rb_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->rb.rb_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->rb.rb_rec_id_4);
+			xn_id = XT_GET_DISK_4(record->rb.rb_xact_id_4);
+			row_id = XT_GET_DISK_4(record->rb.rb_row_id_4);
+			rec_len = XT_GET_DISK_2(record->rb.rb_size_2);
+			xn_set = TRUE;
+			type="rec";
+			rec_buf = (XTTabRecExtDPtr) &record->rb.rb_rec_type_1;
+			ext_rec = (XTTabRecExtDPtr) &record->rb.rb_rec_type_1;
+			if (XT_REC_IS_EXT_DLOG(record->rb.rb_rec_type_1)) {
+				log_id = XT_GET_DISK_2(ext_rec->re_log_id_2);
+				log_offset = XT_GET_DISK_6(ext_rec->re_log_offs_6);
+			}
+			else {
+				ext_rec = NULL;
+				fix_rec = (XTTabRecFixDPtr) &record->rb.rb_rec_type_1;
+			}
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			op_no = XT_GET_DISK_4(record->xw.xw_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xw.xw_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xw.xw_rec_id_4);
+			log_id = XT_GET_DISK_2(&record->xw.xw_rec_type_1);			// This is actually correct
+			log_offset = XT_GET_DISK_6(record->xw.xw_next_rec_id_4);	// This is actually correct!
+			type="rec";
+			break;
+		case XT_LOG_ENT_REC_CLEANED:
+		case XT_LOG_ENT_REC_CLEANED_1:
+		case XT_LOG_ENT_REC_UNLINKED:
+			op_no = XT_GET_DISK_4(record->xw.xw_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xw.xw_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xw.xw_rec_id_4);
+			type="rec";
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+		case XT_LOG_ENT_ROW_NEW_FL:
+		case XT_LOG_ENT_ROW_ADD_REC:
+		case XT_LOG_ENT_ROW_SET:
+		case XT_LOG_ENT_ROW_FREED:
+			op_no = XT_GET_DISK_4(record->xa.xa_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xa.xa_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xa.xa_row_id_4);
+			type="row";
+			break;
+		case XT_LOG_ENT_NO_OP:
+			op_no = XT_GET_DISK_4(record->no.no_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->no.no_tab_id_4);
+			type="-";
+			break;
+		case XT_LOG_ENT_END_OF_LOG:
+			break;
+	}
+
+	switch (record->xl.xl_status_1) {
+		case XT_LOG_ENT_HEADER:
+			rec_type = "HEADER";
+			break;
+		case XT_LOG_ENT_NEW_LOG:
+			rec_type = "NEW LOG";
+			break;
+		case XT_LOG_ENT_DEL_LOG:
+			sprintf(buffer, "DEL LOG log=%d ", (int) XT_GET_DISK_4(record->xl.xl_log_id_4));
+			rec_type = buffer;
+			break;
+		case XT_LOG_ENT_NEW_TAB:
+			rec_type = "NEW TABLE";
+			break;
+		case XT_LOG_ENT_COMMIT:
+			rec_type = "COMMIT";
+			xn_id = XT_GET_DISK_4(record->xe.xe_xact_id_4);
+			xn_set = TRUE;
+			break;
+		case XT_LOG_ENT_ABORT:
+			rec_type = "ABORT";
+			xn_id = XT_GET_DISK_4(record->xe.xe_xact_id_4);
+			xn_set = TRUE;
+			break;
+		case XT_LOG_ENT_CLEANUP:
+			rec_type = "CLEANUP";
+			xn_id = XT_GET_DISK_4(record->xc.xc_xact_id_4);
+			xn_set = TRUE;
+			break;
+		case XT_LOG_ENT_REC_MODIFIED:
+			rec_type = "MODIFIED";
+			break;
+		case XT_LOG_ENT_UPDATE:
+			rec_type = "UPDATE";
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+			rec_type = "UPDATE-FL";
+			break;
+		case XT_LOG_ENT_INSERT:
+			rec_type = "INSERT";
+			break;
+		case XT_LOG_ENT_INSERT_FL:
+			rec_type = "INSERT-FL";
+			break;
+		case XT_LOG_ENT_DELETE:
+			rec_type = "DELETE";
+			break;
+		case XT_LOG_ENT_DELETE_FL:
+			rec_type = "DELETE-FL";
+			break;
+		case XT_LOG_ENT_UPDATE_BG:
+			rec_type = "UPDATE-BG";
+			break;
+		case XT_LOG_ENT_UPDATE_FL_BG:
+			rec_type = "UPDATE-FL-BG";
+			break;
+		case XT_LOG_ENT_INSERT_BG:
+			rec_type = "INSERT-BG";
+			break;
+		case XT_LOG_ENT_INSERT_FL_BG:
+			rec_type = "INSERT-FL-BG";
+			break;
+		case XT_LOG_ENT_DELETE_BG:
+			rec_type = "DELETE-BG";
+			break;
+		case XT_LOG_ENT_DELETE_FL_BG:
+			rec_type = "DELETE-FL-BG";
+			break;
+		case XT_LOG_ENT_REC_FREED:
+			rec_type = "FREE REC";
+			break;
+		case XT_LOG_ENT_REC_REMOVED:
+			rec_type = "REMOVED REC";
+			break;
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			rec_type = "REMOVED-X REC";
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			rec_type = "REMOVED-BI REC";
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			rec_type = "MOVED REC";
+			break;
+		case XT_LOG_ENT_REC_CLEANED:
+			rec_type = "CLEAN REC";
+			break;
+		case XT_LOG_ENT_REC_CLEANED_1:
+			rec_type = "CLEAN REC-1";
+			break;
+		case XT_LOG_ENT_REC_UNLINKED:
+			rec_type = "UNLINK REC";
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+			rec_type = "NEW ROW";
+			break;
+		case XT_LOG_ENT_ROW_NEW_FL:
+			rec_type = "NEW ROW-FL";
+			break;
+		case XT_LOG_ENT_ROW_ADD_REC:
+			rec_type = "REC ADD ROW";
+			break;
+		case XT_LOG_ENT_ROW_SET:
+			rec_type = "SET ROW";
+			break;
+		case XT_LOG_ENT_ROW_FREED:
+			rec_type = "FREE ROW";
+			break;
+		case XT_LOG_ENT_OP_SYNC:
+			rec_type = "OP SYNC";
+			break;
+		case XT_LOG_ENT_NO_OP:
+			rec_type = "NO OP";
+			break;
+		case XT_LOG_ENT_END_OF_LOG:
+			rec_type = "END OF LOG";
+			break;
+		case XT_LOG_ENT_PREPARE:
+			rec_type = "PREPARE";
+			xn_id = XT_GET_DISK_4(record->xp.xp_xact_id_4);
+			xn_set = TRUE;
+			break;
+	}
+
+	if (log)
+		PRINTF("log=%d offset=%d ", (int) log, (int) offset);
+	PRINTF("%s ", rec_type);
+	if (type)
+		PRINTF("op=%lu tab=%lu %s=%lu ", (u_long) op_no, (u_long) tab_id, type, (u_long) rec_id);
+	else if (tab_id)
+		PRINTF("tab=%lu ", (u_long) tab_id);
+	if (row_id)
+		PRINTF("row=%lu ", (u_long) row_id);
+	if (log_id)
+		PRINTF("log=%lu offset=%lu ", (u_long) log_id, (u_long) log_offset);
+	if (xn_set)
+		PRINTF("xact=%lu ", (u_long) xn_id);
+
+#ifdef TRACE_RECORD_DATA
+	if (rec_buf) {
+		switch (rec_buf->tr_rec_type_1 & XT_TAB_STATUS_MASK) {
+			case XT_TAB_STATUS_FREED:
+				PRINTF("FREE");
+				break;
+			case XT_TAB_STATUS_DELETE:
+				PRINTF("DELE");
+				break;
+			case XT_TAB_STATUS_FIXED:
+				PRINTF("FIX-");
+				break;
+			case XT_TAB_STATUS_VARIABLE:
+				PRINTF("VAR-");
+				break;
+			case XT_TAB_STATUS_EXT_DLOG:
+				PRINTF("EXT-");
+				break;
+		}
+		if (rec_buf->tr_rec_type_1 & XT_TAB_STATUS_CLEANED_BIT)
+			PRINTF("C");
+		else
+			PRINTF(" ");
+	}
+	if (ext_rec) {
+		rec_len -= offsetof(XTTabRecExtDRec, re_data);
+		xt_print_bytes((xtWord1 *) ext_rec, offsetof(XTTabRecExtDRec, re_data));
+		PRINTF("| ");
+		if (rec_len > 20)
+			rec_len = 20;
+		xt_print_bytes(ext_rec->re_data, rec_len);
+	}
+	if (fix_rec) {
+		rec_len -= offsetof(XTTabRecFixDRec, rf_data);
+		xt_print_bytes((xtWord1 *) fix_rec, offsetof(XTTabRecFixDRec, rf_data));
+		PRINTF("| ");
+		if (rec_len > 20)
+			rec_len = 20;
+		xt_print_bytes(fix_rec->rf_data, rec_len);
+	}
+#endif
+
+	PRINTF("\n");
+}
+
+#ifdef DEBUG_PRINT
+/*
+ * Debug helper: read the file "./test/test_tab-1.xtr" into memory and
+ * print the index of every 8-byte word in it that is zero.
+ *
+ * The file handle is opened on the first call and kept in a static
+ * for later calls. The scan buffer is allocated on each call and is
+ * released again before returning.
+ */
+void check_rows(void)
+{
+	static XTOpenFilePtr of = NULL;
+	size_t		size;
+	xtWord8		*buffer;
+
+	if (!of)
+		of = xt_open_file_ns("./test/test_tab-1.xtr", XT_FS_DEFAULT);
+	if (!of)
+		return;
+
+	size = (size_t) xt_seek_eof_file(NULL, of);
+	/* Nothing can be scanned without a buffer: */
+	if (!(buffer = (xtWord8 *) xt_malloc_ns(size)))
+		return;
+
+	/* Only scan if the read is reported as successful, otherwise
+	 * the buffer contents are not reliable.
+	 */
+	if (xt_pread_file(of, 0, size, size, buffer, NULL)) {
+		for (size_t i=0; i<size/8; i++) {
+			if (!buffer[i])
+				printf("%d is NULL\n", (int) i);
+		}
+	}
+
+	/* The buffer is only needed for the duration of the scan: */
+	xt_free_ns(buffer);
+}
+
+#endif
+
+/* ----------------------------------------------------------------------
+ * APPLYING CHANGES IN SEQUENCE
+ */
+
+/* Item type of a table's sorted operation list (tab_op_list is created
+ * with sizeof(XTOperationRec) items in xt_xres_init_tab()). Items are
+ * compared on or_op_seq by xres_cmp_op_seq().
+ */
+typedef struct XTOperation {
+	xtOpSeqNo				or_op_seq;		/* Operation sequence number, the sort key of the list. */
+	xtWord4					or_op_len;		/* NOTE(review): presumably the length of the logged operation - confirm. */
+	xtLogID					or_log_id;		/* NOTE(review): presumably the ID of the log containing the operation - confirm. */
+	xtLogOffset				or_log_offset;	/* NOTE(review): presumably the offset of the operation in that log - confirm. */
+} XTOperationRec, *XTOperationPtr;
+
+/*
+ * Comparison callback for the sorted operation list.
+ *
+ * 'a' points to the xtOpSeqNo being searched for, 'b' points to an
+ * XTOperationRec item of the list.
+ *
+ * Returns 0 if both sequence numbers are equal, -1 if the key is
+ * before the item according to XTTableSeq::xt_op_is_before(), and
+ * 1 otherwise.
+ */
+static int xres_cmp_op_seq(struct XTThread *XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtOpSeqNo	key_seq = *((xtOpSeqNo *) a);
+	xtOpSeqNo	item_seq = ((XTOperationPtr) b)->or_op_seq;
+
+	if (key_seq == item_seq)
+		return 0;
+	return XTTableSeq::xt_op_is_before(key_seq, item_seq) ? -1 : 1;
+}
+
+/*
+ * Create the operation list of a table: a sorted list of
+ * XTOperationRec items, ordered using xres_cmp_op_seq().
+ */
+xtPublic void xt_xres_init_tab(XTThreadPtr self, XTTableHPtr tab)
+{
+	tab->tab_op_list = xt_new_sortedlist(
+		self,
+		sizeof(XTOperationRec),
+		20,
+		1000,
+		xres_cmp_op_seq,
+		NULL,
+		NULL,
+		TRUE,
+		FALSE);
+}
+
+/*
+ * Free the operation list of a table, if it has one, and clear
+ * the reference so that a repeated call does nothing.
+ */
+xtPublic void xt_xres_exit_tab(XTThreadPtr self, XTTableHPtr tab)
+{
+	if (!tab->tab_op_list)
+		return;
+
+	xt_free_sortedlist(self, tab->tab_op_list);
+	tab->tab_op_list = NULL;
+}
+
+/*
+ * Make sure that ws->ws_ot refers to an open table with the ID 'tab_id'.
+ *
+ * If a different table is currently open, it is returned to the pool
+ * first. A table ID for which the open did not return a table is
+ * remembered in ws->ws_tab_gone, so that the next request for the same
+ * ID fails immediately without trying to open it again.
+ *
+ * Returns OK if ws->ws_ot is set to the requested table, otherwise FAILED.
+ */
+static xtBool xres_open_table(XTThreadPtr self, XTWriterStatePtr ws, xtTableID tab_id)
+{
+	XTOpenTablePtr	cur_ot = ws->ws_ot;
+	XTTableHPtr		opened_tab;
+
+	if (cur_ot) {
+		/* The required table is already open: */
+		if (cur_ot->ot_table->tab_id == tab_id)
+			return OK;
+
+		/* Some other table is open, give it back: */
+		xt_db_return_table_to_pool(self, cur_ot);
+		ws->ws_ot = NULL;
+	}
+
+	/* An earlier attempt to open this table did not succeed: */
+	if (ws->ws_tab_gone == tab_id)
+		return FAILED;
+
+	ws->ws_ot = xt_db_open_pool_table(self, ws->ws_db, tab_id, NULL, TRUE);
+	if (!ws->ws_ot) {
+		ws->ws_tab_gone = tab_id;
+		return FAILED;
+	}
+
+	opened_tab = ws->ws_ot->ot_table;
+	if (!opened_tab->tab_ind_rec_log_id) {
+		/* Should not happen... */
+		opened_tab->tab_ind_rec_log_id = ws->ws_ind_rec_log_id;
+		opened_tab->tab_ind_rec_log_offset = ws->ws_ind_rec_log_offset;
+	}
+	return OK;
+}
+
+/* {INDEX-RECOV_ROWID}
+ * Add missing index entries during recovery.
+ * Set the row ID even if the index entry
+ * is not committed. It will be removed later by
+ * the sweeper.
+ */
+/*
+ * Insert an entry for the given record into every index of the table.
+ * Nothing is done if the indexes of the table are disabled.
+ *
+ * Returns FAILED only if an insert fails with one of the system errors
+ * checked below (file in use, access denied, too many open files or
+ * out of memory); in that case ot->ot_err_index_no is set to the number
+ * of the index concerned. On any other insert failure the indexes of
+ * the table are disabled as corrupted, the exception is logged and
+ * cleared, and OK is returned without processing the remaining indexes.
+ */
+static xtBool xres_add_index_entries(XTOpenTablePtr ot, xtRowID row_id, xtRecordID rec_id, xtWord1 *rec_data)
+{
+	XTTableHPtr		tab = ot->ot_table;
+	XTIndexPtr		*ind_ptr;
+	XTThreadPtr		thread;
+	u_int			i;
+
+	if (tab->tab_dic.dic_disable_index)
+		return OK;
+
+	ind_ptr = tab->tab_dic.dic_keys;
+	for (i=0; i<tab->tab_dic.dic_key_count; i++, ind_ptr++) {
+		if (xt_idx_insert(ot, *ind_ptr, row_id, rec_id, rec_data, NULL, TRUE))
+			continue;
+
+		/* Check the error, certain errors are recoverable! */
+		thread = xt_get_self();
+		if (thread->t_exception.e_xt_err == XT_SYSTEM_ERROR) {
+			if (XT_FILE_IN_USE(thread->t_exception.e_sys_err) ||
+				XT_FILE_ACCESS_DENIED(thread->t_exception.e_sys_err) ||
+				XT_FILE_TOO_MANY_OPEN(thread->t_exception.e_sys_err) ||
+				thread->t_exception.e_sys_err == XT_ENOMEM) {
+				ot->ot_err_index_no = (*ind_ptr)->mi_index_no;
+				return FAILED;
+			}
+		}
+
+		/* TODO: Write something to the index header to indicate that
+		 * it is corrupted.
+		 */
+		xt_tab_disable_index(ot->ot_table, XT_INDEX_CORRUPTED);
+		xt_log_and_clear_exception_ns();
+		return OK;
+	}
+	return OK;
+}
+
+/*
+ * Delete the entry for the given record from every index of the table.
+ * Nothing is done if the indexes of the table are disabled.
+ *
+ * A failed delete is logged and cleared, it does not stop the
+ * processing of the remaining indexes.
+ */
+static void xres_remove_index_entries(XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *rec_data)
+{
+	XTTableHPtr	tab = ot->ot_table;
+	XTIndexPtr	*ind_ptr;
+	u_int		i = 0;
+
+	if (tab->tab_dic.dic_disable_index)
+		return;
+
+	ind_ptr = tab->tab_dic.dic_keys;
+	while (i < tab->tab_dic.dic_key_count) {
+		if (!xt_idx_delete(ot, *ind_ptr, rec_id, rec_data))
+			xt_log_and_clear_exception_ns();
+		i++;
+		ind_ptr++;
+	}
+}
+
+/*
+ * Load a record into the row buffer of 'ot' and return a pointer to the
+ * record data.
+ *
+ * data      If not NULL, the record image ('red_size' bytes). It is copied
+ *           to ot->ot_row_rbuffer, unless it already is that buffer.
+ *           If NULL, the record is read from the record file at 'rec_id'.
+ * red_size  The size of 'data'. Overwritten by the number of bytes read
+ *           when the record is read from the file.
+ * rec_buf   Buffer that receives the loaded row when the record is not
+ *           of the fixed type.
+ * cols_req  Passed on to myxt_load_row() and xt_tab_load_ext_data().
+ *           NOTE(review): presumably the number of columns required - confirm.
+ *
+ * Returns a pointer into ot->ot_row_rbuffer (after the fixed header) for
+ * fixed type records, otherwise rec_buf->ib_db.db_data.
+ * Returns NULL if the record is too short, is of none of the types
+ * handled here, or if an error other than out of memory occurred (the
+ * error is logged and cleared). Out of memory is thrown.
+ */
+static xtWord1 *xres_load_record(XTThreadPtr self, XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *data, size_t red_size, XTInfoBufferPtr rec_buf, u_int cols_req)
+{
+	XTTableHPtr	tab = ot->ot_table;
+	xtWord1		*rec_data;
+
+	rec_data = ot->ot_row_rbuffer;
+
+	ASSERT(red_size <= ot->ot_row_rbuf_size);
+	ASSERT(tab->tab_dic.dic_rec_size <= ot->ot_row_rbuf_size);
+	if (data) {
+		/* The caller may already have placed the record in the row buffer: */
+		if (rec_data != data)
+			memcpy(rec_data, data, red_size);
+	}
+	else {
+		/* It can be that less than 'dic_rec_size' was written for
+		 * variable length type records.
+		 * If this is the last record in the file, then we will read
+		 * less than actual record size.
+		 */
+		if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), tab->tab_dic.dic_rec_size, 0, rec_data, &red_size, &self->st_statistics.st_rec, self))
+			goto failed;
+		
+		/* Not even a complete record header was read: */
+		if (red_size < sizeof(XTTabRecHeadDRec))
+			return NULL;
+	}
+	
+	/* The first byte of the record determines the record type: */
+	if (XT_REC_IS_FIXED(rec_data[0]))
+		rec_data = ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE;
+	else {
+		/* NULL is passed instead of 'self', the result is checked here: */
+		if (!xt_ib_alloc(NULL, rec_buf, tab->tab_dic.dic_mysql_buf_size))
+			goto failed;
+		if (XT_REC_IS_VARIABLE(rec_data[0])) {
+			if (!myxt_load_row(ot, rec_data + XT_REC_FIX_HEADER_SIZE, rec_buf->ib_db.db_data, cols_req))
+				goto failed;
+		}
+		else if (XT_REC_IS_EXT_DLOG(rec_data[0])) {
+			if (red_size < XT_REC_EXT_HEADER_SIZE)
+				return NULL;
+
+			ASSERT(cols_req);
+			/* NOTE(review): presumably the extended data need not be loaded
+			 * if all required columns are in the fixed part - confirm.
+			 */
+			if (cols_req && cols_req <= tab->tab_dic.dic_fix_col_count) {
+				if (!myxt_load_row(ot, rec_data + XT_REC_EXT_HEADER_SIZE, rec_buf->ib_db.db_data, cols_req))
+					goto failed;
+			}
+			else {
+				if (!xt_tab_load_ext_data(ot, rec_id, rec_buf->ib_db.db_data, cols_req))
+					goto failed;
+			}
+		}
+		else
+			/* This is possible, the record has already been cleaned up. */
+			return NULL;
+		rec_data = rec_buf->ib_db.db_data;
+	}
+
+	return rec_data;
+
+	failed:
+	/* Running out of memory should not be ignored. */
+	if (self->t_exception.e_xt_err == XT_SYSTEM_ERROR &&
+		self->t_exception.e_sys_err == XT_ENOMEM)
+		xt_throw(self);
+	/* All other errors are logged, and the record is reported as not loaded: */
+	xt_log_and_clear_exception_ns();
+	return NULL;
+}
+
+/*
+ * Apply a change from the log.
+ *
+ * This function is basically very straight forward, were it not
+ * for the option to apply operations out of sequence.
+ * (i.e. in_sequence == FALSE)
+ *
+ * If operations are applied in sequence, then they can be
+ * applied blindly. The update operation is just executed as
+ * it was logged.
+ *
+ * If the changes are not in sequence, then some operations are missing;
+ * however, the operations that are present are in the correct order.
+ *
+ * This can only happen at the end of recovery!!!
+ * After we have applied all operations in the log we may be
+ * left with some operations that have not been applied
+ * because operations were logged out of sequence.
+ *
+ * The application of these operations therefore has to take into
+ * account the current state of the database.
+ * They are then applied in a manner that maintains the
+ * database consistency.
+ *
+ * For example, a record that is freed is freed by placing it
+ * on the current free list. Part of the data logged for the
+ * operation is ignored, namely the "next block" pointer
+ * that was originally written into the freed record.
+ */
+static void xres_apply_change(XTThreadPtr self, XTOpenTablePtr ot, XTXactLogBufferDPtr record, xtBool in_sequence, xtBool check_index, XTInfoBufferPtr rec_buf)
+{
+	XTTableHPtr			tab = ot->ot_table;
+	size_t				len;
+	xtRecordID			rec_id;
+	xtRefID				free_ref_id;
+	XTTabRecFreeDRec	free_rec;
+	xtRowID				row_id;
+	XTTabRowRefDRec		row_buf;
+	XTTabRecHeadDRec	rec_head;
+	size_t				tfer;
+	xtRecordID			link_rec_id, prev_link_rec_id;
+	xtWord1				*rec_data = NULL;
+	XTTabRecFreeDPtr	free_data;
+
+	ASSERT(ot->ot_thread == self);
+	if (tab->tab_dic.dic_key_count == 0)
+		check_index = FALSE;
+
+	switch (record->xl.xl_status_1) {
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_DELETE_BG:
+			rec_id = XT_GET_DISK_4(record->xu.xu_rec_id_4);
+
+			/* This should be done before we apply change to table, as otherwise we lose
+			 * the key value that we need to remove from index
+			 */
+			if (check_index && record->xl.xl_status_1 == XT_LOG_ENT_REC_MODIFIED) {
+				if ((rec_data = xres_load_record(self, ot, rec_id, NULL, 0, rec_buf, tab->tab_dic.dic_ind_cols_req)))
+					xres_remove_index_entries(ot, rec_id, rec_data);			
+			}
+
+			len = (size_t) XT_GET_DISK_2(record->xu.xu_size_2);
+			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), len, (xtWord1 *) &record->xu.xu_rec_type_1, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += len;
+
+			if (check_index) {
+				switch (record->xl.xl_status_1) {
+					case XT_LOG_ENT_DELETE:
+					case XT_LOG_ENT_DELETE_BG:
+						break;
+					default:
+						if ((rec_data = xres_load_record(self, ot, rec_id, &record->xu.xu_rec_type_1, len, rec_buf, tab->tab_dic.dic_ind_cols_req))) {
+							row_id = XT_GET_DISK_4(record->xu.xu_row_id_4);
+							if (!xres_add_index_entries(ot, row_id, rec_id, rec_data))
+								xt_throw(self);
+						}
+						break;
+				}
+			}
+
+			if (!in_sequence) {
+				/* A record has been allocated from the EOF, but out of sequence.
+				 * This could leave a gap where other records were allocated
+				 * from the EOF, but those operations have been lost!
+				 * We compensate for this by adding all blocks between
+				 * to the free list.
+				 */
+				free_rec.rf_rec_type_1 = XT_TAB_STATUS_FREED;
+				free_rec.rf_not_used_1 = 0;
+				while (tab->tab_head_rec_eof_id < rec_id) {
+					XT_SET_DISK_4(free_rec.rf_next_rec_id_4, tab->tab_head_rec_free_id);
+					if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, tab->tab_head_rec_eof_id, sizeof(XTTabRecFreeDRec), (xtWord1 *) &free_rec, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+						xt_throw(self);
+					tab->tab_bytes_to_flush += sizeof(XTTabRecFreeDRec);
+					tab->tab_head_rec_free_id = tab->tab_head_rec_eof_id;
+					tab->tab_head_rec_eof_id++;
+				}
+			}
+			if (tab->tab_head_rec_eof_id < rec_id + 1)
+				tab->tab_head_rec_eof_id = rec_id + 1;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			rec_id = XT_GET_DISK_4(record->xf.xf_rec_id_4);
+			len = (size_t) XT_GET_DISK_2(record->xf.xf_size_2);
+			free_ref_id = XT_GET_DISK_4(record->xf.xf_free_rec_id_4);
+
+			if (check_index &&
+				record->xf.xf_status_1 != XT_LOG_ENT_DELETE_FL &&
+				record->xf.xf_status_1 != XT_LOG_ENT_DELETE_FL_BG) {
+				if ((rec_data = xres_load_record(self, ot, rec_id, &record->xf.xf_rec_type_1, len, rec_buf, tab->tab_dic.dic_ind_cols_req))) {
+					row_id = XT_GET_DISK_4(record->xf.xf_row_id_4);
+					if (!xres_add_index_entries(ot, row_id, rec_id, rec_data))
+						xt_throw(self);
+				}
+			}
+
+			if (!in_sequence) {
+				/* This record was allocated from the free list.
+				 * Because this operation is out of sequence, there
+				 * could have been other allocations from the
+				 * free list before this, that have gone missing.
+				 * For this reason we have to search the current
+				 * free list and remove the record.
+				 */
+				link_rec_id = tab->tab_head_rec_free_id;
+				prev_link_rec_id = 0;
+				while (link_rec_id) {
+					if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, link_rec_id), sizeof(XTTabRecFreeDRec), sizeof(XTTabRecFreeDRec), (xtWord1 *) &free_rec, NULL, &self->st_statistics.st_rec, self))
+						xt_throw(self);
+					if (link_rec_id == rec_id)
+						break;
+					prev_link_rec_id = link_rec_id;
+					link_rec_id = XT_GET_DISK_4(free_rec.rf_next_rec_id_4);
+				}
+				if (link_rec_id == rec_id) {
+					/* The block was found on the free list.
+					 * remove it: */
+					if (prev_link_rec_id) {
+						/* We write the record from position 'link_rec_id' into
+						 * position 'prev_link_rec_id'. This unlinks 'link_rec_id'!
+						 */
+						if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, prev_link_rec_id), sizeof(XTTabRecFreeDRec), (xtWord1 *) &free_rec, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+							xt_throw(self);
+						tab->tab_bytes_to_flush += sizeof(XTTabRecFreeDRec);
+						free_ref_id = tab->tab_head_rec_free_id;
+					}
+					else
+						/* The block is at the front of the list: */
+						free_ref_id = XT_GET_DISK_4(free_rec.rf_next_rec_id_4);
+				}
+				else {
+					/* Not found on the free list? */
+					if (tab->tab_head_rec_eof_id < rec_id + 1)
+						tab->tab_head_rec_eof_id = rec_id + 1;
+					goto write_mod_data;
+				}
+			}
+			if (tab->tab_head_rec_eof_id < rec_id + 1)
+				tab->tab_head_rec_eof_id = rec_id + 1;
+			tab->tab_head_rec_free_id = free_ref_id;
+			tab->tab_head_rec_fnum--;
+			write_mod_data:
+			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), len, (xtWord1 *) &record->xf.xf_rec_type_1, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += len;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT: {
+			xtBool			record_loaded;
+			XTTabRecExtDPtr	ext_rec;
+			size_t			red_size;
+			xtWord4			log_over_size = 0;
+			xtLogID			data_log_id = 0;
+			xtLogOffset		data_log_offset = 0;
+			u_int			cols_required = 0;
+
+			rec_id = XT_GET_DISK_4(record->fr.fr_rec_id_4);
+			free_data = (XTTabRecFreeDPtr) &record->fr.fr_rec_type_1;
+
+			/* This is a short-cut, it does not require loading the record: */
+			if (!check_index && !tab->tab_dic.dic_blob_count && record->fr.fr_status_1 != XT_LOG_ENT_REC_REMOVED_EXT)
+				goto do_rec_freed;
+
+			ext_rec = (XTTabRecExtDPtr) ot->ot_row_rbuffer;
+
+			if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), tab->tab_dic.dic_rec_size, 0, ext_rec, &red_size, &self->st_statistics.st_rec, self)) {
+				xt_log_and_clear_exception_ns();
+				goto do_rec_freed;
+			}
+
+			if (red_size < sizeof(XTTabRecHeadDRec))
+				goto do_rec_freed;
+
+			/* Check that the record is the same as the one originally removed.
+			 * This can be different if recovery is repeated.
+			 * For example:
+			 * 
+			 * log=21 offset=6304472 REMOVED-X REC op=360616 tab=7 rec=25874 
+			 * log=21 offset=6309230 UPDATE-FL op=360618 tab=7 rec=25874 row=26667 log=1 offset=26503077 xact=209 
+			 * log=21 offset=6317500 CLEAN REC op=360631 tab=7 rec=25874 
+			 * 
+			 * If this recovery sequence is repeated, then the REMOVED-X will free the
+			 * extended record belonging to the update that came afterwards!
+			 *
+			 * Additional situation to consider:
+			 *
+			 * - A record "x" is created, and index entries created.
+			 * - A checkpoint is made done.
+			 * - Record "x" is deleted due to UPDATE.
+			 * - The index entries are removed, but the index is not
+			 *   flushed.
+			 * - This deletion is written to disk by the writer.
+			 * So we have the situation that the remove is on disk,
+			 * but the index changes have not been made.
+			 *
+			 * In this case, skipping to "do_rec_freed" is incorrect.
+			 */
+			if (record->fr.fr_stat_id_1 != ext_rec->tr_stat_id_1 ||
+				XT_GET_DISK_4(record->fr.fr_xact_id_4) != XT_GET_DISK_4(ext_rec->tr_xact_id_4))
+				goto dont_remove_x_record;
+
+			if (record->xl.xl_status_1 == XT_LOG_ENT_REC_REMOVED_EXT) {
+				if (!XT_REC_IS_EXT_DLOG(ext_rec->tr_rec_type_1))
+					goto dont_remove_x_record;
+				if (red_size < offsetof(XTTabRecExtDRec, re_data))
+					goto dont_remove_x_record;
+
+				/* Save this for later (can be overwritten by xres_load_record(): */
+				data_log_id = XT_GET_DISK_2(ext_rec->re_log_id_2);
+				data_log_offset = XT_GET_DISK_6(ext_rec->re_log_offs_6);
+				log_over_size = XT_GET_DISK_4(ext_rec->re_log_dat_siz_4);
+			}
+			dont_remove_x_record:
+
+			record_loaded = FALSE;
+
+			if (check_index) {
+				cols_required = tab->tab_dic.dic_ind_cols_req;
+				if (tab->tab_dic.dic_blob_cols_req > cols_required)
+					cols_required = tab->tab_dic.dic_blob_cols_req;
+				if (!(rec_data = xres_load_record(self, ot, rec_id, ot->ot_row_rbuffer, red_size, rec_buf, cols_required)))
+					goto do_rec_freed;
+				record_loaded = TRUE;
+				xres_remove_index_entries(ot, rec_id, rec_data);
+			}
+
+			if (tab->tab_dic.dic_blob_count) {
+				if (!record_loaded) {
+					if (tab->tab_dic.dic_blob_cols_req > cols_required)
+						cols_required = tab->tab_dic.dic_blob_cols_req;
+					if (!(rec_data = xres_load_record(self, ot, rec_id, ot->ot_row_rbuffer, red_size, rec_buf, cols_required)))
+						/* [(7)] REMOVE is followed by FREE:
+						goto get_rec_offset;
+						*/
+						goto do_rec_freed;
+					record_loaded = TRUE;
+				}
+			}
+
+			if (record->xl.xl_status_1 == XT_LOG_ENT_REC_REMOVED_EXT) {
+				/* Note: dlb_delete_log() may be repeated, but should handle this:
+				 * 
+				 * Example:
+				 * log=5 offset=213334 CLEAN REC op=28175 tab=1 rec=317428 
+				 * ...
+				 * log=6 offset=321063 REMOVED-X REC op=33878 tab=1 rec=317428 
+				 *
+				 * When this sequence is repeated during recovery, then CLEAN REC
+				 * will reset the status byte of the record so that it
+				 * comes back to here!
+				 *
+				 * The check for zero is probably not required here.
+				 */
+				if (data_log_id && data_log_offset && log_over_size) {
+					if (!ot->ot_thread->st_dlog_buf.dlb_delete_log(data_log_id, data_log_offset, log_over_size, tab->tab_id, rec_id, self)) {
+						if (ot->ot_thread->t_exception.e_xt_err != XT_ERR_BAD_EXT_RECORD &&
+							ot->ot_thread->t_exception.e_xt_err != XT_ERR_DATA_LOG_NOT_FOUND)
+							xt_log_and_clear_exception_ns();
+					}
+				}
+			}
+
+			goto do_rec_freed;
+		}
+		case XT_LOG_ENT_REC_REMOVED_BI: {
+			/*
+			 * For deletion we need the complete before image because of the following problem.
+			 *
+			 * DROP TABLE IF EXISTS t1;
+			 * CREATE TABLE t1 (ID int primary key auto_increment, value int, index (value)) engine=pbxt;
+			 * 
+			 * insert t1(value) values(50);
+			 * 
+			 * -- CHECKPOINT --
+			 * 
+			 * update t1 set value = 60;
+			 * 
+			 * -- PAUSE --
+			 * 
+			 * update t1 set value = 70;
+			 * 
+			 * -- CRASH --
+			 * 
+			 * select value from t1;
+			 * select * from t1;
+			 * 
+			 * 081203 12:11:46 [Note] PBXT: Recovering from 1-148, bytes to read: 33554284
+			 * log=1 offset=148 UPDATE-BG op=5 tab=1 rec=2 row=1 xact=3 
+			 * log=1 offset=188 REC ADD ROW op=6 tab=1 row=1 
+			 * log=1 offset=206 COMMIT xact=3 
+			 * log=1 offset=216 REMOVED REC op=7 tab=1 rec=1 xact=2 
+			 * log=1 offset=241 CLEAN REC op=8 tab=1 rec=2 
+			 * log=1 offset=261 CLEANUP xact=3 
+			 * log=1 offset=267 UPDATE-FL-BG op=9 tab=1 rec=1 row=1 xact=4 
+			 * log=1 offset=311 REC ADD ROW op=10 tab=1 row=1 
+			 * log=1 offset=329 COMMIT xact=4 
+			 * log=1 offset=339 REMOVED REC op=11 tab=1 rec=2 xact=3 
+			 * log=1 offset=364 CLEAN REC op=12 tab=1 rec=1 
+			 * log=1 offset=384 CLEANUP xact=4 
+			 * 081203 12:12:15 [Note] PBXT: Recovering complete at 1-390, bytes read: 33554284
+			 * 
+			 * mysql> select value from t1;
+			 * +-------+
+			 * | value |
+			 * +-------+
+			 * |    50 | 
+			 * |    70 | 
+			 * +-------+
+			 * 2 rows in set (55.99 sec)
+			 * 
+			 * mysql> select * from t1;
+			 * +----+-------+
+			 * | ID | value |
+			 * +----+-------+
+			 * |  1 |    70 | 
+			 * +----+-------+
+			 * 1 row in set (0.00 sec)
+			 */
+			XTTabRecExtDPtr	ext_rec;
+			xtWord4			log_over_size = 0;
+			xtLogID			data_log_id = 0;
+			xtLogOffset		data_log_offset = 0;
+			u_int			cols_required = 0;
+			xtBool			record_loaded;
+			size_t			rec_size;		
+
+			rec_id = XT_GET_DISK_4(record->rb.rb_rec_id_4);
+			rec_size = XT_GET_DISK_2(record->rb.rb_size_2);
+
+			ext_rec = (XTTabRecExtDPtr) &record->rb.rb_rec_type_1;
+
+			if (XT_REC_IS_EXT_DLOG(record->rb.rb_rec_type_1)) {
+				/* Save this for later (can be overwritten by xres_load_record(): */
+				data_log_id = XT_GET_DISK_2(ext_rec->re_log_id_2);
+				data_log_offset = XT_GET_DISK_6(ext_rec->re_log_offs_6);
+				log_over_size = XT_GET_DISK_4(ext_rec->re_log_dat_siz_4);
+			}
+
+			record_loaded = FALSE;
+
+			if (check_index) {
+				cols_required = tab->tab_dic.dic_ind_cols_req;
+				if (!(rec_data = xres_load_record(self, ot, rec_id, &record->rb.rb_rec_type_1, rec_size, rec_buf, cols_required)))
+					goto go_on_to_free;
+				record_loaded = TRUE;
+				xres_remove_index_entries(ot, rec_id, rec_data);
+			}
+
+			if (data_log_id && data_log_offset && log_over_size) {
+				if (!ot->ot_thread->st_dlog_buf.dlb_delete_log(data_log_id, data_log_offset, log_over_size, tab->tab_id, rec_id, self)) {
+					if (ot->ot_thread->t_exception.e_xt_err != XT_ERR_BAD_EXT_RECORD &&
+						ot->ot_thread->t_exception.e_xt_err != XT_ERR_DATA_LOG_NOT_FOUND)
+						xt_log_and_clear_exception_ns();
+				}
+			}
+
+			go_on_to_free:
+			/* Use the new record type: */
+			record->rb.rb_rec_type_1 = record->rb.rb_new_rec_type_1;
+			free_data = (XTTabRecFreeDPtr) &record->rb.rb_rec_type_1;
+			goto do_rec_freed;
+		}
+		case XT_LOG_ENT_REC_FREED:
+			rec_id = XT_GET_DISK_4(record->fr.fr_rec_id_4);
+			free_data = (XTTabRecFreeDPtr) &record->fr.fr_rec_type_1;
+			do_rec_freed:
+			if (!in_sequence) {
+				size_t	red_size;
+
+				/* Free the record.
+				 * We place the record on front of the current
+				 * free list.
+				 *
+				 * However, before we do this, we remove the record
+				 * from its row list, if the record is on a row list.
+				 *
+				 * We do this here, because in the normal removal
+				 * from the row list uses the operations:
+				 *
+				 * XT_LOG_ENT_REC_UNLINKED, XT_LOG_ENT_ROW_SET and
+				 * XT_LOG_ENT_ROW_FREED.
+				 *
+				 * When operations are performed out of sequence,
+				 * these operations are ignored for the purpose
+				 * of removing the record from the row.
+				 */
+				if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), sizeof(XTTabRecHeadDRec), sizeof(XTTabRecHeadDRec), (xtWord1 *) &rec_head, NULL, &self->st_statistics.st_rec, self))
+					xt_throw(self);
+				/* The record is already free: */
+				if (XT_REC_IS_FREE(rec_head.tr_rec_type_1))
+					goto free_done;
+				row_id = XT_GET_DISK_4(rec_head.tr_row_id_4);
+
+				/* Search the row for this record: */
+				if (!XT_PREAD_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, row_id), sizeof(XTTabRowRefDRec), sizeof(XTTabRowRefDRec), (xtWord1 *) &row_buf, NULL, &self->st_statistics.st_rec, self))
+					xt_throw(self);
+				link_rec_id = XT_GET_DISK_4(row_buf.rr_ref_id_4);
+				prev_link_rec_id = 0;
+				while (link_rec_id) {
+					if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, link_rec_id), sizeof(XTTabRecHeadDRec), 0, (xtWord1 *) &rec_head, &red_size, &self->st_statistics.st_rec, self)) {
+						xt_log_and_clear_exception(self);
+						break;
+					}
+					if (red_size < sizeof(XTTabRecHeadDRec))
+						break;
+					if (link_rec_id == rec_id)
+						break;
+					if (XT_GET_DISK_4(rec_head.tr_row_id_4) != row_id)
+						break;
+					switch (rec_head.tr_rec_type_1 & XT_TAB_STATUS_MASK) {
+						case XT_TAB_STATUS_FREED:
+							break;
+						case XT_TAB_STATUS_DELETE:
+						case XT_TAB_STATUS_FIXED:
+						case XT_TAB_STATUS_VARIABLE:
+						case XT_TAB_STATUS_EXT_DLOG:
+							break;
+						default:
+							ASSERT(FALSE);
+							goto exit_loop;
+					}
+					if (rec_head.tr_rec_type_1 & ~(XT_TAB_STATUS_CLEANED_BIT | XT_TAB_STATUS_MASK)) {
+						ASSERT(FALSE);
+						break;
+					}
+					prev_link_rec_id = link_rec_id;
+					link_rec_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+				}
+
+				exit_loop:
+				if (link_rec_id == rec_id) {
+					/* The record was found on the row list, remove it: */
+					if (prev_link_rec_id) {
+						/* We write the previous variation pointer from position 'link_rec_id' into
+						 * variation pointer of the 'prev_link_rec_id' record. This unlinks 'link_rec_id'!
+						 */
+						if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, prev_link_rec_id) + offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4), XT_RECORD_ID_SIZE, (xtWord1 *) &rec_head.tr_prev_rec_id_4, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+							xt_throw(self);
+						tab->tab_bytes_to_flush += XT_RECORD_ID_SIZE;
+					}
+					else {
+						/* The record is at the front of the row list: */
+						xtRefID ref_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+						XT_SET_DISK_4(row_buf.rr_ref_id_4, ref_id);
+						if (!XT_PWRITE_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, row_id), sizeof(XTTabRowRefDRec), (xtWord1 *) &row_buf, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+							xt_throw(self);
+						tab->tab_bytes_to_flush += sizeof(XTTabRowRefDRec);
+					}
+				}				
+
+				/* Now we free the record, by placing it at the front of
+				 * the free list:
+				 */
+				XT_SET_DISK_4(free_data->rf_next_rec_id_4, tab->tab_head_rec_free_id);				
+			}
+			tab->tab_head_rec_free_id = rec_id;
+			tab->tab_head_rec_fnum++;
+			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), sizeof(XTTabRecFreeDRec), (xtWord1 *) free_data, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += sizeof(XTTabRecFreeDRec);
+			tab->tab_flush_pending = TRUE;
+			free_done:
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			len = 8;
+			rec_id = XT_GET_DISK_4(record->xw.xw_rec_id_4);
+			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id) + offsetof(XTTabRecExtDRec, re_log_id_2), len, (xtWord1 *) &record->xw.xw_rec_type_1, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += len;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_REC_CLEANED:
+			len = offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			goto get_rec_offset;
+		case XT_LOG_ENT_REC_CLEANED_1:
+			len = 1;
+			goto get_rec_offset;
+		case XT_LOG_ENT_REC_UNLINKED:
+			if (!in_sequence) {
+				/* Unlink the record.
+				 * This is done when the record is freed.
+				 */
+				break;
+			}
+			len = offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			get_rec_offset:
+			rec_id = XT_GET_DISK_4(record->xw.xw_rec_id_4);
+			if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), len, (xtWord1 *) &record->xw.xw_rec_type_1, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += len;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+			len = offsetof(XTactRowAddedEntryDRec, xa_free_list_4);
+			row_id = XT_GET_DISK_4(record->xa.xa_row_id_4);
+			if (!in_sequence) {
+				/* A row was allocated from the EOF. Because operations are missing.
+				 * The blocks between the current EOF and the new EOF need to be
+				 * place on the free list!
+				 */				
+				while (tab->tab_head_row_eof_id < row_id) {
+					XT_SET_DISK_4(row_buf.rr_ref_id_4, tab->tab_head_row_free_id);
+					if (!XT_PWRITE_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, tab->tab_head_row_eof_id), sizeof(XTTabRowRefDRec), (xtWord1 *) &row_buf, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+						xt_throw(self);
+					tab->tab_bytes_to_flush += sizeof(XTTabRowRefDRec);
+					tab->tab_head_row_free_id = tab->tab_head_row_eof_id;
+					tab->tab_head_row_eof_id++;
+				}
+			}
+			if (tab->tab_head_row_eof_id < row_id + 1)
+				tab->tab_head_row_eof_id = row_id + 1;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_ROW_NEW_FL:
+			len = sizeof(XTactRowAddedEntryDRec);
+			row_id = XT_GET_DISK_4(record->xa.xa_row_id_4);
+			free_ref_id = XT_GET_DISK_4(record->xa.xa_free_list_4);
+			if (!in_sequence) {
+				size_t red_size;
+				/* The record was taken from the free list.
+				 * If the operations were in sequence, then this would be
+				 * the front of the free list now.
+				 * However, because operations are missing, it may no
+				 * longer be the front of the free list!
+				 * Search and remove:
+				 */
+				link_rec_id = tab->tab_head_row_free_id;
+				prev_link_rec_id = 0;
+				while (link_rec_id) {
+					if (!XT_PREAD_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, link_rec_id), sizeof(XTTabRowRefDRec), 0, (xtWord1 *) &row_buf, &red_size, &self->st_statistics.st_rec, self)) {
+						xt_log_and_clear_exception(self);
+						break;
+					}
+					if (red_size < sizeof(XTTabRowRefDRec))
+						break;
+					if (link_rec_id == row_id)
+						break;
+					prev_link_rec_id = link_rec_id;
+					link_rec_id = XT_GET_DISK_4(row_buf.rr_ref_id_4);
+				}
+				if (link_rec_id == row_id) {
+					/* The block was found on the free list, remove it: */
+					if (prev_link_rec_id) {
+						/* We write the record from position 'link_rec_id' into
+						 * position 'prev_link_rec_id'. This unlinks 'link_rec_id'!
+						 */
+						if (!XT_PWRITE_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, prev_link_rec_id), sizeof(XTTabRowRefDRec), (xtWord1 *) &row_buf, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+							xt_throw(self);
+						tab->tab_bytes_to_flush += sizeof(XTTabRowRefDRec);
+						free_ref_id = tab->tab_head_row_free_id;
+					}
+					else
+						/* The block is at the front of the free list: */
+						free_ref_id = XT_GET_DISK_4(row_buf.rr_ref_id_4);
+				}
+				else {
+					/* Not found? */
+					if (tab->tab_head_row_eof_id < row_id + 1)
+						tab->tab_head_row_eof_id = row_id + 1;
+					break;
+				}
+					
+			}
+			if (tab->tab_head_row_eof_id < row_id + 1)
+				tab->tab_head_row_eof_id = row_id + 1;
+			tab->tab_head_row_free_id = free_ref_id;
+			tab->tab_head_row_fnum--;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_ROW_FREED:
+			row_id = XT_GET_DISK_4(record->wr.wr_row_id_4);
+			if (!in_sequence) {
+				/* Free the row.
+				 * Since this operation is being performed out of sequence, we
+				 * must assume that some other free and allocation operations
+				 * must be missing.
+				 * For this reason, we add the row to the front of the
+				 * existing free list.
+				 */
+				XT_SET_DISK_4(record->wr.wr_ref_id_4, tab->tab_head_row_free_id);
+			}
+			tab->tab_head_row_free_id = row_id;
+			tab->tab_head_row_fnum++;
+			goto write_row_data;
+		case XT_LOG_ENT_ROW_ADD_REC:
+			row_id = XT_GET_DISK_4(record->wr.wr_row_id_4);
+			if (!in_sequence) {
+				if (!XT_PREAD_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, row_id), sizeof(XTTabRowRefDRec), 0, (xtWord1 *) &row_buf, &tfer, &self->st_statistics.st_rec, self))
+					xt_throw(self);
+				if (tfer == sizeof(XTTabRowRefDRec)) {
+					/* Add a record to the front of the row.
+					 * This is easy, but we have to make sure that the next
+					 * pointer in the record is correct.
+					 */
+					rec_id = XT_GET_DISK_4(record->wr.wr_ref_id_4);
+					if (!XT_PREAD_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), sizeof(XTTabRecHeadDRec), 0, (xtWord1 *) &rec_head, &tfer, &self->st_statistics.st_rec, self))
+						xt_throw(self);
+					if (tfer == sizeof(XTTabRecHeadDRec) && XT_GET_DISK_4(rec_head.tr_row_id_4) == row_id) {
+						/* This is now the correct next pointer: */
+						xtRecordID next_ref_id = XT_GET_DISK_4(row_buf.rr_ref_id_4);
+						if (XT_GET_DISK_4(rec_head.tr_prev_rec_id_4) != next_ref_id &&
+							rec_id != next_ref_id) {
+							XT_SET_DISK_4(rec_head.tr_prev_rec_id_4, next_ref_id);
+							if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, rec_id), sizeof(XTTabRecHeadDRec), (xtWord1 *) &rec_head, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+								xt_throw(self);
+							tab->tab_bytes_to_flush += sizeof(XTTabRecHeadDRec);
+						}
+					}
+				}
+
+			}
+			goto write_row_data;
+		case XT_LOG_ENT_ROW_SET:
+			if (!in_sequence)
+				/* This operation is ignored when out of sequence!
+				 * The operation is used to remove a record from a row.
+				 * This is done automatically when the record is freed.
+				 */
+				break;
+			row_id = XT_GET_DISK_4(record->wr.wr_row_id_4);
+			write_row_data:
+			ASSERT_NS(XT_GET_DISK_4(record->wr.wr_ref_id_4) < tab->tab_head_rec_eof_id);
+			if (!XT_PWRITE_RR_FILE(ot->ot_row_file, xt_row_id_to_row_offset(tab, row_id), sizeof(XTTabRowRefDRec), (xtWord1 *) &record->wr.wr_ref_id_4, &ot->ot_thread->st_statistics.st_rec, self))
+				xt_throw(self);
+			tab->tab_bytes_to_flush += sizeof(XTTabRowRefDRec);
+			if (tab->tab_head_row_eof_id < row_id + 1)
+				tab->tab_head_row_eof_id = row_id + 1;
+			tab->tab_flush_pending = TRUE;
+			break;
+		case XT_LOG_ENT_NO_OP:
+		case XT_LOG_ENT_END_OF_LOG:
+			break;
+	}
+}
+
+/*
+ * Replay the operations queued on the table that the writer
+ * state currently has open (ws->ws_ot).
+ *
+ * Operations are queued on the table's tab_op_list when they are
+ * read from the log out of sequence; they wait there for the
+ * missing operations to arrive. When the server is running, this
+ * will always be the case, because a delay occurs while a
+ * transaction fills its private log buffer.
+ *
+ * in_sequence == TRUE:  queued operations are applied only as long
+ *                       as each one directly follows tab_head_op_seq;
+ *                       the first gap ends the run.
+ * in_sequence == FALSE: every queued operation is applied.
+ *
+ * The applied operations are removed from the front of the list.
+ */
+static void xres_apply_operations(XTThreadPtr self, XTWriterStatePtr ws, xtBool in_sequence)
+{
+	XTTableHPtr		tab = ws->ws_ot->ot_table;
+	XTOperationPtr	queued;
+	u_int			applied = 0;
+
+	xt_sl_lock(self, tab->tab_op_list);
+	while ((queued = (XTOperationPtr) xt_sl_item_at(tab->tab_op_list, applied)) != NULL) {
+		xtBool	check_index;
+
+		/* A gap in the sequence ends an in-sequence run: */
+		if (in_sequence && queued->or_op_seq != tab->tab_head_op_seq+1)
+			break;
+
+		/* Only the log position and length are queued, so the
+		 * record itself is read back from the log:
+		 */
+		xt_db_set_size(self, &ws->ws_databuf, (size_t) queued->or_op_len);
+		if (!ws->ws_db->db_xlog.xlog_rnd_read(&ws->ws_seqread, queued->or_log_id, queued->or_log_offset, (size_t) queued->or_op_len, ws->ws_databuf.db_data, NULL, self))
+			xt_throw(self);
+
+		check_index = ws->ws_in_recover &&
+			xt_comp_log_pos(queued->or_log_id, queued->or_log_offset, ws->ws_ind_rec_log_id, ws->ws_ind_rec_log_offset) >= 0;
+		xres_apply_change(self, ws->ws_ot, (XTXactLogBufferDPtr) ws->ws_databuf.db_data, in_sequence, check_index, &ws->ws_rec_buf);
+		tab->tab_head_op_seq = queued->or_op_seq;
+
+		if (tab->tab_wr_wake_freeer && !XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, tab->tab_wake_freeer_op))
+			xt_wr_wake_freeer(self, ws->ws_db);
+
+		applied++;
+	}
+	xt_sl_remove_from_front(self, tab->tab_op_list, applied);
+	xt_sl_unlock(self, tab->tab_op_list);
+}
+
+/*
+ * Apply the operations that are still queued on any table of 'db'.
+ * These operations are applied even though operations that
+ * precede them in sequence are missing.
+ *
+ * For every table entry that has a table handle (te_table), the
+ * cached sequence number and the cached row/record EOF and free
+ * list values are then refreshed from the tab_head_* values.
+ *
+ * Returns TRUE if at least one table had queued operations.
+ */
+static xtBool xres_sync_operations(XTThreadPtr self, XTDatabaseHPtr db, XTWriterStatePtr ws)
+{
+	XTTableEntryPtr	entry;
+	XTTableHPtr		tab;
+	u_int			enum_pos;
+	xtBool			found_queued = FALSE;
+
+	xt_enum_tables_init(&enum_pos);
+	for (;;) {
+		entry = xt_enum_tables_next(self, db, &enum_pos);
+		if (!entry)
+			break;
+		tab = entry->te_table;
+		if (!tab)
+			continue;
+
+		/* A dirty read of tab_op_list is OK here, because this is
+		 * the only thread that updates the list!
+		 */
+		if (xt_sl_get_size(tab->tab_op_list) != 0) {
+			found_queued = TRUE;
+			if (xres_open_table(self, ws, entry->te_tab_id))
+				xres_apply_operations(self, ws, FALSE);
+		}
+
+		/* Update the pointer cache: */
+		tab->tab_seq.xt_op_seq_set(self, tab->tab_head_op_seq+1);
+		tab->tab_row_eof_id = tab->tab_head_row_eof_id;
+		tab->tab_row_free_id = tab->tab_head_row_free_id;
+		tab->tab_row_fnum = tab->tab_head_row_fnum;
+		tab->tab_rec_eof_id = tab->tab_head_rec_eof_id;
+		tab->tab_rec_free_id = tab->tab_head_rec_free_id;
+		tab->tab_rec_fnum = tab->tab_head_rec_fnum;
+	}
+	return found_queued;
+}
+
+/* Arguments handed to xt_tab_check_free_lists() after recovery.
+ * Both are selected at compile time.
+ */
+#if defined(XT_CORRECT_TABLE_FREE_COUNT)
+#define CORRECT_COUNT		TRUE
+#else
+#define CORRECT_COUNT		FALSE
+#endif
+
+#if defined(XT_CHECK_RECORD_FREE_COUNT)
+#define CHECK_RECS			TRUE
+#else
+#define CHECK_RECS			FALSE
+#endif
+
+/* The free count recovery code is only compiled if one of the
+ * checks has been requested:
+ */
+#if defined(XT_CHECK_RECORD_FREE_COUNT) || defined(XT_CHECK_ROW_FREE_COUNT)
+#define RECOVER_FREE_COUNTS
+#endif
+
+#ifdef RECOVER_FREE_COUNTS
+/* {CORRECTED-ROW-COUNT}
+ * The error handled here can be reproduced by crashing the server
+ * during high activity, after flush table has written the table
+ * header.
+ *
+ * On recovery, the free count "from the future" is then used as
+ * the starting point for subsequent allocations and frees, and
+ * the count is wrong from that point on.
+ *
+ * Recovery of the count only works correctly if a checkpoint
+ * completes successfully after the table header is flushed.
+ * Basically, the writing of the table header should be
+ * synchronized with the writing of the end of the checkpoint.
+ *
+ * Another solution would be to log the count, along with
+ * the allocate and free commands.
+ *
+ * The 3rd solution is the one used here: the count is corrected
+ * after recovery, by running xt_tab_check_free_lists() on every
+ * table of 'db' that has a table handle and can be opened.
+ */
+static void xres_recover_table_free_counts(XTThreadPtr self, XTDatabaseHPtr db, XTWriterStatePtr ws)
+{
+	XTTableEntryPtr	entry;
+	u_int			enum_pos;
+
+	xt_enum_tables_init(&enum_pos);
+	for (;;) {
+		entry = xt_enum_tables_next(self, db, &enum_pos);
+		if (!entry)
+			break;
+		if (!entry->te_table)
+			continue;
+		if (!xres_open_table(self, ws, entry->te_tab_id))
+			continue;
+		xt_tab_check_free_lists(self, ws->ws_ot, CHECK_RECS, CORRECT_COUNT);
+	}
+}
+#endif
+
+/*
+ * Operations from the log are applied in sequence order.
+ * If the operations are out of sequence, they are buffered
+ * until the missing operations appear.
+ *
+ * The status byte of 'record' selects where the operation
+ * sequence number, the table ID and the record length are read
+ * from. Record types that are not listed in the switch below are
+ * ignored.
+ *
+ * If the sequence number directly follows the tab_head_op_seq of
+ * the table, the change is applied at once, followed by any queued
+ * operations that now follow on. A sequence number that is not
+ * after tab_head_op_seq is dropped. Anything else is queued on the
+ * tab_op_list of the table, by log position (log_id/log_offset)
+ * and length.
+ *
+ * NOTE: No lock is required because there should only be
+ * one thread that does this!
+ */
+xtPublic void xt_xres_apply_in_order(XTThreadPtr self, XTWriterStatePtr ws, xtLogID log_id, xtLogOffset log_offset, XTXactLogBufferDPtr record)
+{
+	xtOpSeqNo		op_seq;
+	xtTableID		tab_id;
+	size_t			len;
+	xtBool			check_index;
+
+// XTDatabaseHPtr db, XTOpenTablePtr *ot, XTXactSeqReadPtr sr, XTDataBufferPtr databuf
+	switch (record->xl.xl_status_1) {
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_DELETE_BG:
+			len = offsetof(XTactUpdateEntryDRec, xu_rec_type_1) + (size_t) XT_GET_DISK_2(record->xu.xu_size_2); /* variable length: header + size stored in the record */
+			op_seq = XT_GET_DISK_4(record->xu.xu_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xu.xu_tab_id_4);
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			len = offsetof(XTactUpdateFLEntryDRec, xf_rec_type_1) + (size_t) XT_GET_DISK_2(record->xf.xf_size_2);
+			op_seq = XT_GET_DISK_4(record->xf.xf_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xf.xf_tab_id_4);
+			break;
+		case XT_LOG_ENT_REC_FREED:
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			/* [(7)] REMOVE is now an extended version of FREE! */
+			len = offsetof(XTactFreeRecEntryDRec, fr_rec_type_1) + sizeof(XTTabRecFreeDRec);
+			goto fixed_len_data;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			len = offsetof(XTactRemoveBIEntryDRec, rb_rec_type_1) + (size_t) XT_GET_DISK_2(record->rb.rb_size_2);
+			op_seq = XT_GET_DISK_4(record->rb.rb_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->rb.rb_tab_id_4);
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + 8;
+			goto fixed_len_data;
+		case XT_LOG_ENT_REC_CLEANED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			goto fixed_len_data;
+		case XT_LOG_ENT_REC_CLEANED_1:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + 1;
+			goto fixed_len_data;
+		case XT_LOG_ENT_REC_UNLINKED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			fixed_len_data: /* shared tail: all fixed length records are read through the 'xw' layout */
+			op_seq = XT_GET_DISK_4(record->xw.xw_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xw.xw_tab_id_4);
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+			len = sizeof(XTactRowAddedEntryDRec) - 4;
+			goto new_row;
+		case XT_LOG_ENT_ROW_NEW_FL:
+			len = sizeof(XTactRowAddedEntryDRec);
+			new_row:
+			op_seq = XT_GET_DISK_4(record->xa.xa_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xa.xa_tab_id_4);
+			break;
+		case XT_LOG_ENT_ROW_ADD_REC:
+		case XT_LOG_ENT_ROW_SET:
+		case XT_LOG_ENT_ROW_FREED:
+			len = offsetof(XTactWriteRowEntryDRec, wr_ref_id_4) + sizeof(XTTabRowRefDRec);
+			op_seq = XT_GET_DISK_4(record->wr.wr_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->wr.wr_tab_id_4);
+			break;
+		case XT_LOG_ENT_NO_OP:
+		case XT_LOG_ENT_END_OF_LOG:
+			return; /* nothing to apply */
+		default:
+			return; /* all other record types are ignored here */
+	}
+
+	if (!xres_open_table(self, ws, tab_id))
+		return; /* no open table: the record is skipped */
+
+	XTTableHPtr tab = ws->ws_ot->ot_table;
+
+	/* NOTE:
+	 *
+	 * During normal operation this is actually given.
+	 *
+	 * During recovery, it only applies to the record/row files
+	 * The index file is flushed independently, and changes may
+	 * have been applied to the index (due to a call to flush index,
+	 * which comes as a result of out of memory) that have not been
+	 * applied to the record/row files.
+	 *
+	 * As a result we need to do the index checks that apply to this
+	 * change.
+	 *
+	 * At the moment, I will just do everything, which should not
+	 * hurt!
+	 *
+	 * This error can be repeated by running the test
+	 * runTest(OUT_OF_CACHE_UPDATE_TEST, 32, OUT_OF_CACHE_UPDATE_TEST_UPDATE_COUNT, OUT_OF_CACHE_UPDATE_TEST_SET_SIZE)
+	 * and crashing after a while.
+	 *
+	 * Do this by setting not_this to NULL. This will cause the test to
+	 * hang after a while. After a restart the indexes are corrupt if the
+	 * ws->ws_in_recover condition is not present here.
+	 */
+	if (ws->ws_in_recover) {
+		if (!tab->tab_recovery_done) { /* done once per table: on its first record seen during recovery */
+			/* op_seq <= tab_head_op_seq + 1: */
+			ASSERT(XTTableSeq::xt_op_is_before(op_seq, tab->tab_head_op_seq+2));
+			if (XTTableSeq::xt_op_is_before(op_seq-1, tab->tab_head_op_seq))
+				/* Adjust the operation sequence number: */
+				tab->tab_head_op_seq = op_seq-1;
+			tab->tab_recovery_done = TRUE;
+		}
+	}
+
+	if (!XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, op_seq))
+		return; /* op_seq is not after the head: the record is dropped */
+
+ 	if (tab->tab_head_op_seq+1 == op_seq) {
+		/* I could use tab_ind_rec_log_id, but this may be a problem, if
+		 * recovery does not recover up to the last committed transaction.
+		 */ 
+		check_index = ws->ws_in_recover && xt_comp_log_pos(log_id, log_offset, ws->ws_ind_rec_log_id, ws->ws_ind_rec_log_offset) >= 0;
+		xres_apply_change(self, ws->ws_ot, record, TRUE, check_index, &ws->ws_rec_buf);
+		tab->tab_head_op_seq = op_seq;
+		if (tab->tab_wr_wake_freeer) {
+			if (!XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, tab->tab_wake_freeer_op))
+				xt_wr_wake_freeer(self, ws->ws_db);
+		}
+
+		/* Apply any operations in the list that now follow on...
+		 * NOTE: the tab_op_list only has to be locked for modification.
+		 * This is because only one thread ever changes the list
+		 * (on startup and the writer), but the checkpoint thread
+		 * reads it.
+		 */		
+		XTOperationPtr	op;
+		if ((op = (XTOperationPtr) xt_sl_first_item(tab->tab_op_list))) {
+			if (tab->tab_head_op_seq+1 == op->or_op_seq) {
+				xres_apply_operations(self, ws, TRUE);
+			}
+		}
+	}
+	else {
+		/* Add the operation to the list: */
+		XTOperationRec op;
+
+		op.or_op_seq = op_seq;
+		op.or_op_len = len;
+		op.or_log_id = log_id;
+		op.or_log_offset = log_offset;
+		xt_sl_lock(self, tab->tab_op_list);
+		xt_sl_insert(self, tab->tab_op_list, &op_seq, &op); /* the list is keyed by the operation sequence number */
+		ASSERT(tab->tab_op_list->sl_usage_count < 1000000);
+		xt_sl_unlock(self, tab->tab_op_list);
+	}
+}
+
+/* ----------------------------------------------------------------------
+ * CHECKPOINTING FUNCTIONALITY
+ */
+
+/*
+ * Delete the data log 'log_id' of 'db': the log is removed from
+ * the data log cache, its file is deleted if it exists, and the
+ * log (if it can still be retrieved) is set to the state
+ * XT_DL_DELETED.
+ *
+ * Returns OK on success, FAILED if any step fails.
+ */
+static xtBool xres_delete_data_log(XTDatabaseHPtr db, xtLogID log_id)
+{
+	char				log_path[PATH_MAX];
+	XTDataLogFilePtr	log_file;
+
+	db->db_datalogs.dlc_name(PATH_MAX, log_path, log_id);
+
+	if (!db->db_datalogs.dlc_remove_data_log(log_id, TRUE))
+		return FAILED;
+
+	if (xt_fs_exists(log_path)) {
+#ifdef DEBUG_LOG_DELETE
+		printf("-- delete log: %s\n", log_path);
+#endif
+		if (!xt_fs_delete(NULL, log_path))
+			return FAILED;
+	}
+
+	/* The log was deleted: */
+	if (!db->db_datalogs.dlc_get_data_log(&log_file, log_id, TRUE, NULL))
+		return FAILED;
+	if (log_file && !db->db_datalogs.dls_set_log_state(log_file, XT_DL_DELETED))
+		return FAILED;
+
+	return OK;
+}
+
+/*
+ * Comparison callback: 'a' points to a table ID (the key), 'b'
+ * to a checkpoint table entry. Returns -1, 0 or 1 if the key is
+ * less than, equal to or greater than the cpt_tab_id of the entry.
+ */
+static int xres_comp_flush_tabs(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtTableID				key_id = *((xtTableID *) a);
+	XTCheckPointTablePtr	entry = (XTCheckPointTablePtr) b;
+
+	if (key_id == entry->cpt_tab_id)
+		return 0;
+	return (key_id < entry->cpt_tab_id) ? -1 : 1;
+}
+
+static void xres_init_checkpoint_state(XTThreadPtr self, XTCheckPointStatePtr cp)
+{	/* Prepare the checkpoint state 'cp' for use: create its lock and flag it as initialized. */
+	xt_init_mutex_with_autoname(self, &cp->cp_state_lock);
+	cp->cp_inited = TRUE; /* set only after the mutex has been initialized */
+}
+
+/*
+ * Release the checkpoint state 'cp': the state is flagged as no
+ * longer initialized, its lock is freed, and the sorted list of
+ * table IDs is freed, if there is one.
+ */
+static void xres_free_checkpoint_state(XTThreadPtr self, XTCheckPointStatePtr cp)
+{
+	cp->cp_inited = FALSE;
+	xt_free_mutex(&cp->cp_state_lock);
+
+	if (!cp->cp_table_ids)
+		return;
+	xt_free_sortedlist(self, cp->cp_table_ids);
+	cp->cp_table_ids = NULL;
+}
+
+/*
+ * Remove the logs on the deleted list (dlc_deleted) so that they
+ * can be re-used, and then empty the list.
+ *
+ * This is only possible after a checkpoint has been written that
+ * does _not_ include these logs as logs to be deleted!
+ *
+ * Returns OK on success. Returns FAILED as soon as one log cannot
+ * be removed; the deleted list is left unchanged in that case.
+ */
+static xtBool xres_remove_data_logs(XTDatabaseHPtr db)
+{
+	u_int	total = xt_sl_get_size(db->db_datalogs.dlc_deleted);
+	u_int	idx = 0;
+
+	while (idx < total) {
+		xtLogID *id_ptr = (xtLogID *) xt_sl_item_at(db->db_datalogs.dlc_deleted, idx);
+
+		if (!db->db_datalogs.dlc_remove_data_log(*id_ptr, FALSE))
+			return FAILED;
+		idx++;
+	}
+
+	xt_sl_set_size(db->db_datalogs.dlc_deleted, 0);
+	return OK;
+}
+
+/* ----------------------------------------------------------------------
+ * INIT & EXIT
+ */
+
+xtPublic void xt_xres_init(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Create the checkpoint locks and state of 'db', call db->db_restart.xres_init() and set the log write position. */
+	xtLogID	max_log_id;
+
+	xt_init_mutex_with_autoname(self, &db->db_cp_lock);
+	xt_init_cond(self, &db->db_cp_cond);
+	xt_init_mutex_with_autoname(self, &db->db_fl_lock);
+	
+	xres_init_checkpoint_state(self, &db->db_cp_state);
+	db->db_restart.xres_init(self, db, &db->db_wr_log_id, &db->db_wr_log_offset, &max_log_id); /* the 3 pointers receive the results */
+
+	/* The position in db_wr_log_id/db_wr_log_offset is also the
+	 * position where transactions will start writing the log:
+	 */
+	if (!db->db_xlog.xlog_set_write_offset(db->db_wr_log_id, db->db_wr_log_offset, max_log_id, self))
+		xt_throw(self);
+}
+
+xtPublic void xt_xres_exit(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Release everything that xt_xres_init() set up for 'db'. */
+	db->db_restart.xres_exit(self);
+	xres_free_checkpoint_state(self, &db->db_cp_state);
+	xt_free_mutex(&db->db_cp_lock);
+	xt_free_cond(&db->db_cp_cond);
+	xt_free_mutex(&db->db_fl_lock);
+}
+
+/* ----------------------------------------------------------------------
+ * RESTART FUNCTIONALITY
+ */
+
+/*
+ * Restart the database. This function loads the restart position, and
+ * applies all changes in the logs, until the end of the log, or
+ * a corrupted record is found.
+ *
+ * The restart position is the position in the log where we know that
+ * all the changes up to that point have been flushed to the
+ * database.
+ *
+ * This is called the checkpoint position. The checkpoint position
+ * is written alternately to 2 restart files.
+ *
+ * To make a checkpoint:
+ * Get the current log writer log offset.
+ * For each table:
+ *    Get the log offset of the next operation on the table, if an
+ *    operation is queued for the table.
+ *    Flush that table, and the operation sequence to the table.
+ * For each unclean transaction:
+ *    Get the log offset of the begin of the transaction.
+ * Write the lowest of all log offsets to the restart file!
+ *
+ * Parameters:
+ *    self        The calling thread. It may not have a database
+ *                open (st_database must be NULL). st_database is
+ *                set to 'db' for the duration of the call.
+ *    db          The database to restart.
+ *    log_id,
+ *    log_offset,
+ *    max_log_id  Passed through to xres_restart(), which stores
+ *                its results through these pointers.
+ *
+ * On failure the restart buffers and any open restart file are
+ * released, and the exception is re-thrown.
+ */
+
+void XTXactRestart::xres_init(XTThreadPtr self, XTDatabaseHPtr db, xtLogID *log_id, xtLogOffset *log_offset, xtLogID *max_log_id)
+{
+	char					path[PATH_MAX];
+	XTOpenFilePtr			of = NULL;
+	XTXlogCheckpointDPtr	res_1_buffer = NULL;
+	XTXlogCheckpointDPtr	res_2_buffer = NULL;
+	XTXlogCheckpointDPtr	use_buffer;
+	xtLogID					ind_rec_log_id = 0;
+	xtLogOffset				ind_rec_log_offset = 0;
+
+	enter_();
+	xres_db = db;
+
+	ASSERT(!self->st_database);
+	/* The following call stack:
+	 * XTDatabaseLog::xlog_flush_pending()
+	 * XTDatabaseLog::xlog_flush()
+	 * xt_xlog_flush_log()
+	 * xt_flush_indices()
+	 * idx_out_of_memory_failure()
+	 * xt_idx_delete()
+	 * xres_remove_index_entries()
+	 * xres_apply_change()
+	 * xt_xres_apply_in_order()
+	 * XTXactRestart::xres_restart()
+	 * XTXactRestart::xres_init()
+	 * Leads to st_database being used!
+	 */
+	self->st_database = db;
+
+#ifdef SKIP_STARTUP_CHECKPOINT
+	/* When debugging, we do not checkpoint immediately, just in case
+	 * we detect a problem during recovery.
+	 */
+	xres_cp_required = FALSE;
+#else
+	xres_cp_required = TRUE;
+#endif
+	xres_cp_number = 0;
+	try_(a) {
+
+		/* Figure out which restart file to use.
+		 * Each of the 2 files is loaded completely. A file that
+		 * does not pass the checksum test is discarded.
+		 */
+		xres_name(PATH_MAX, path, 1);
+		if ((of = xt_open_file(self, path, XT_FS_MISSING_OK))) {
+			size_t res_1_size;
+
+			res_1_size = (size_t) xt_seek_eof_file(self, of);
+			res_1_buffer = (XTXlogCheckpointDPtr) xt_malloc(self, res_1_size);
+			if (!xt_pread_file(of, 0, res_1_size, res_1_size, res_1_buffer, NULL, &self->st_statistics.st_x, self))
+				xt_throw(self);
+			xt_close_file(self, of);
+			of = NULL;
+			if (!xres_check_checksum(res_1_buffer, res_1_size)) {
+				xt_free(self, res_1_buffer);
+				res_1_buffer = NULL;
+			}
+		}
+
+		xres_name(PATH_MAX, path, 2);
+		if ((of = xt_open_file(self, path, XT_FS_MISSING_OK))) {
+			size_t res_2_size;
+
+			res_2_size = (size_t) xt_seek_eof_file(self, of);
+			res_2_buffer = (XTXlogCheckpointDPtr) xt_malloc(self, res_2_size);
+			if (!xt_pread_file(of, 0, res_2_size, res_2_size, res_2_buffer, NULL, &self->st_statistics.st_x, self))
+				xt_throw(self);
+			xt_close_file(self, of);
+			of = NULL;
+			if (!xres_check_checksum(res_2_buffer, res_2_size)) {
+				xt_free(self, res_2_buffer);
+				res_2_buffer = NULL;
+			}
+		}
+
+		/* If both files are valid, keep the one with the later
+		 * log position. If the first is not later, the checkpoint
+		 * number decides.
+		 */
+		if (res_1_buffer && res_2_buffer) {
+			if (xt_comp_log_pos(
+				XT_GET_DISK_4(res_1_buffer->xcp_log_id_4),
+				XT_GET_DISK_6(res_1_buffer->xcp_log_offs_6),
+				XT_GET_DISK_4(res_2_buffer->xcp_log_id_4),
+				XT_GET_DISK_6(res_2_buffer->xcp_log_offs_6)) > 0) {
+				/* The first log is further along than the second: */
+				xt_free(self, res_2_buffer);
+				res_2_buffer = NULL;
+			}
+			else {
+				if (XT_GET_DISK_6(res_1_buffer->xcp_chkpnt_no_6) >
+					XT_GET_DISK_6(res_2_buffer->xcp_chkpnt_no_6)) {
+					xt_free(self, res_2_buffer);
+					res_2_buffer = NULL;
+				}
+				else {
+					xt_free(self, res_1_buffer);
+					res_1_buffer = NULL;
+				}
+			}
+		}
+
+		/* The next checkpoint goes to the file that is not in use: */
+		if (res_1_buffer) {
+			use_buffer = res_1_buffer;
+			xres_next_res_no = 2;
+		}
+		else {
+			use_buffer = res_2_buffer;
+			xres_next_res_no = 1;
+		}
+
+		/* Read the checkpoint data: */
+		if (use_buffer) {
+			u_int		no_of_logs;
+			xtLogID		xt_log_id;
+			xtTableID	xt_tab_id;
+
+			xres_cp_number = XT_GET_DISK_6(use_buffer->xcp_chkpnt_no_6);
+			xres_cp_log_id = XT_GET_DISK_4(use_buffer->xcp_log_id_4);
+			xres_cp_log_offset = XT_GET_DISK_6(use_buffer->xcp_log_offs_6);
+			xt_tab_id = XT_GET_DISK_4(use_buffer->xcp_tab_id_4);
+			if (xt_tab_id > db->db_curr_tab_id)
+				db->db_curr_tab_id = xt_tab_id;
+			db->db_xn_curr_id = XT_GET_DISK_4(use_buffer->xcp_xact_id_4);
+			ind_rec_log_id = XT_GET_DISK_4(use_buffer->xcp_ind_rec_log_id_4);
+			ind_rec_log_offset = XT_GET_DISK_6(use_buffer->xcp_ind_rec_log_offs_6);
+			no_of_logs = XT_GET_DISK_2(use_buffer->xcp_log_count_2);
+
+#ifdef DEBUG_PRINT
+			printf("CHECKPOINT log=%d offset=%d ", (int) xres_cp_log_id, (int) xres_cp_log_offset);
+			if (no_of_logs)
+				printf("DELETED LOGS: ");
+#endif
+
+			/* Logs that are deleted are locked until _after_ the next
+			 * checkpoint.
+			 *
+			 * To prevent the following problem from occurring:
+			 * - Recovery is performed, and log X is deleted
+			 * - After delete a log is free for re-use.
+			 *   New data is written to log X.
+			 * - Server crashes.
+			 * - Recovery is performed from previous checkpoint,
+			 *   and log X is deleted again.
+			 *
+			 * To lock the logs they are placed on the deleted list.
+			 * After the next checkpoint, all logs on this list
+			 * will be removed.
+			 */
+			for (u_int i=0; i<no_of_logs; i++) {
+				xt_log_id = (xtLogID) XT_GET_DISK_2(use_buffer->xcp_del_log[i]);
+#ifdef DEBUG_PRINT
+				if (i != 0)
+					printf(", ");
+				printf("%d", (int) xt_log_id);
+#endif
+#ifdef DEBUG_KEEP_LOGS
+				xt_dl_set_to_delete(self, db, xt_log_id);
+#else
+				if (!xres_delete_data_log(db, xt_log_id))
+					xt_throw(self);
+#endif
+			}
+
+#ifdef DEBUG_PRINT
+			printf("\n");
+#endif
+		}
+		else {
+			/* No usable restart file.
+			 * Try to determine the correct start point.
+			 */
+			xres_cp_number = 0;
+			xres_cp_log_id = xt_xlog_get_min_log(self, db);
+			xres_cp_log_offset = 0;
+			ind_rec_log_id = xres_cp_log_id;
+			ind_rec_log_offset = xres_cp_log_offset;
+
+#ifdef DEBUG_PRINT
+			/* Print the position actually chosen above (the
+			 * minimum log is not necessarily log 1):
+			 */
+			printf("CHECKPOINT log=%d offset=%d\n", (int) xres_cp_log_id, (int) xres_cp_log_offset);
+#endif
+		}
+
+		/* The restart data is no longer required: */
+		if (res_1_buffer) {
+			xt_free(self, res_1_buffer);
+			res_1_buffer = NULL;
+		}
+		if (res_2_buffer) {
+			xt_free(self, res_2_buffer);
+			res_2_buffer = NULL;
+		}
+
+		if (!xres_restart(self, log_id, log_offset, ind_rec_log_id, ind_rec_log_offset, max_log_id))
+			xt_throw(self);
+	}
+	catch_(a) {
+		self->st_database = NULL;
+		if (of)
+			xt_close_file(self, of);
+		if (res_1_buffer)
+			xt_free(self, res_1_buffer);
+		if (res_2_buffer)
+			xt_free(self, res_2_buffer);
+		xres_exit(self);
+		throw_();
+	}
+	cont_(a);
+	self->st_database = NULL;
+
+	exit_();
+}
+
+void XTXactRestart::xres_exit(XTThreadPtr XT_UNUSED(self))
+{	/* Intentionally empty: this function performs no work. */
+}
+
+/*
+ * Build the path of a restart file in 'path' (a buffer of 'size'
+ * bytes): the main path of the database, followed by the system
+ * directory, a directory separator and the file name
+ * "restart-<log_id>.xt".
+ */
+void XTXactRestart::xres_name(size_t size, char *path, xtLogID log_id)
+{
+	char file_name[50];
+
+	/* The directory part: */
+	xt_strcpy(size, path, xres_db->db_main_path);
+	xt_add_system_dir(size, path);
+	xt_add_dir_char(size, path);
+
+	/* The file name: */
+	sprintf(file_name, "restart-%lu.xt", (u_long) log_id);
+	xt_strcat(size, path, file_name);
+}
+
+/*
+ * Validate the restart file data in 'buffer' ('size' bytes).
+ *
+ * The data is rejected (FAILED) if:
+ * - it is too short to contain the header size field,
+ * - it is shorter than the header size it states,
+ * - the stored checksum differs from the checksum calculated
+ *   over the bytes from offset 2 to the end of the data, or
+ * - the version is not XT_CHECKPOINT_VERSION.
+ *
+ * Otherwise OK is returned.
+ */
+xtBool XTXactRestart::xres_check_checksum(XTXlogCheckpointDPtr buffer, size_t size)
+{
+	/* The minimum size: */
+	if (size < offsetof(XTXlogCheckpointDRec, xcp_head_size_4) + 4)
+		return FAILED;
+
+	/* The data must be at least as long as the header claims: */
+	if (size < (size_t) XT_GET_DISK_4(buffer->xcp_head_size_4))
+		return FAILED;
+
+	if (xt_get_checksum(((xtWord1 *) buffer) + 2, size - 2, 1) != XT_GET_DISK_2(buffer->xcp_checksum_2))
+		return FAILED;
+
+	return (XT_GET_DISK_2(buffer->xcp_version_2) == XT_CHECKPOINT_VERSION) ? OK : FAILED;
+}
+
+/*
+ * Record the progress of recovery in the "recovery-progress" file
+ * (only if XT_USE_GLOBAL_DB is defined, otherwise this function
+ * does nothing).
+ *
+ * perc <= 100: the file is opened (created and truncated) if '*of'
+ *              is not yet set, then 'perc' is written as decimal
+ *              text at offset 0 and the file is flushed.
+ * perc > 100:  '*of' is closed, if open, and the file is deleted,
+ *              if it exists.
+ */
+void XTXactRestart::xres_recover_progress(XTThreadPtr self, XTOpenFilePtr *of, int perc)
+{
+#ifdef XT_USE_GLOBAL_DB
+	char progress_path[PATH_MAX];
+
+	if (perc > 100) {
+		/* Recovery is done, remove the progress file: */
+		if (*of) {
+			xt_close_file(self, *of);
+			*of = NULL;
+		}
+		xt_strcpy(PATH_MAX, progress_path, xres_db->db_main_path);
+		xt_add_pbxt_file(PATH_MAX, progress_path, "recovery-progress");
+		if (xt_fs_exists(progress_path))
+			xt_fs_delete(self, progress_path);
+		return;
+	}
+
+	if (!*of) {
+		/* First call, create an empty progress file: */
+		xt_strcpy(PATH_MAX, progress_path, xres_db->db_main_path);
+		xt_add_pbxt_file(PATH_MAX, progress_path, "recovery-progress");
+		*of = xt_open_file(self, progress_path, XT_FS_CREATE | XT_FS_MAKE_PATH);
+		xt_set_eof_file(self, *of, 0);
+	}
+
+	char digits[40];
+
+	sprintf(digits, "%d", perc);
+	if (!xt_pwrite_file(*of, 0, strlen(digits), digits, &self->st_statistics.st_x, self))
+		xt_throw(self);
+	if (!xt_flush_file(*of, &self->st_statistics.st_x, self))
+		xt_throw(self);
+#endif
+}
+
+xtBool XTXactRestart::xres_restart(XTThreadPtr self, xtLogID *log_id, xtLogOffset *log_offset, xtLogID ind_rec_log_id, xtLogOffset ind_rec_log_offset, xtLogID *max_log_id)
+{
+	xtBool					ok = TRUE;
+	XTDatabaseHPtr			db = xres_db;
+	XTXactLogBufferDPtr		record;
+	xtXactID				xn_id;
+	XTXactDataPtr			xact;
+	xtTableID				tab_id;
+	XTWriterStateRec		ws;
+	off_t					bytes_read = 0;
+	off_t					bytes_to_read;
+	volatile xtBool			print_progress = FALSE;
+	volatile off_t			perc_size = 0, next_goal = 0;
+	int						perc_complete = 1, perc_to_write = 1;
+	XTOpenFilePtr			progress_file = NULL;
+	xtBool					min_ram_xn_id_set = FALSE;
+	u_int					log_count;
+	time_t					start_time;
+
+	memset(&ws, 0, sizeof(ws));
+
+	ws.ws_db = db;
+	ws.ws_in_recover = TRUE;
+	ws.ws_ind_rec_log_id = ind_rec_log_id;
+	ws.ws_ind_rec_log_offset = ind_rec_log_offset;
+
+	/* Initialize the data log buffer (required if extended data is
+	 * referenced).
+	 * Note: this buffer is freed later. It is part of the thread
+	 * "open database" state, and this means that a thread
+	 * may not have another database open (in use) when
+	 * it calls this functions.
+	 */
+	self->st_dlog_buf.dlb_init(db, xt_db_log_buffer_size);
+
+	if (!db->db_xlog.xlog_seq_init(&ws.ws_seqread, xt_db_log_buffer_size, TRUE))
+		return FAILED;
+
+	bytes_to_read = xres_bytes_to_read(self, db, &log_count, max_log_id);
+	/* Don't print anything about recovering an empty database: */
+	if (bytes_to_read != 0)
+		xt_logf(XT_NT_INFO, "PBXT: Recovering from %lu-%llu, bytes to read: %llu\n", (u_long) xres_cp_log_id, (u_llong) xres_cp_log_offset, (u_llong) bytes_to_read);
+
+	print_progress = FALSE;
+	start_time = time(NULL);
+	perc_size = bytes_to_read / 100;
+	next_goal = perc_size;
+
+	if (!db->db_xlog.xlog_seq_start(&ws.ws_seqread, xres_cp_log_id, xres_cp_log_offset, FALSE)) {
+		ok = FALSE;
+		goto failed;
+	}
+
+	try_(a) {
+		for (;;) {
+			if (!db->db_xlog.xlog_seq_next(&ws.ws_seqread, &record, TRUE, self)) {
+				ok = FALSE;
+				break;
+			}
+			/* Increment before. If record is NULL then xseq_record_len will be zero,
+			 * UNLESS the last record was of type XT_LOG_ENT_END_OF_LOG 
+			 * which fills the log to align to block of size 512.
+			 */
+			bytes_read += ws.ws_seqread.xseq_record_len;
+			if (!record)
+				break;
+#ifdef PRINT_LOG_ON_RECOVERY
+			xt_print_log_record(ws.ws_seqread.xseq_rec_log_id, ws.ws_seqread.xseq_rec_log_offset, record);
+#endif
+			if (bytes_read >= next_goal) {
+				while (bytes_read >= next_goal) {
+					next_goal += perc_size;
+					perc_complete++;
+				}
+				if (!print_progress) {
+					if (time(NULL) - start_time > 2)
+						print_progress = TRUE;
+				}
+				if (print_progress) {
+					while (perc_to_write < perc_complete) {
+						if (((perc_to_write - 1) % 25) == 0)
+							xt_logf(XT_NT_INFO, "PBXT: ");
+						if ((perc_to_write % 25) == 0)
+							xt_logf(XT_NT_INFO, "%2d\n", (int) perc_to_write);
+						else
+							xt_logf(XT_NT_INFO, "%2d ", (int) perc_to_write);
+						xt_log_flush(self);
+						xres_recover_progress(self, &progress_file, perc_to_write);
+						perc_to_write++;
+					}
+				}
+			}
+			switch (record->xl.xl_status_1) {
+				case XT_LOG_ENT_HEADER:
+					break;
+				case XT_LOG_ENT_NEW_LOG: {
+					/* Adjust the bytes read for the fact that logs are written
+					 * on 512 byte boundaries.
+					 */
+					off_t offs, eof = ws.ws_seqread.xseq_log_eof;
+
+					offs = ws.ws_seqread.xseq_rec_log_offset + ws.ws_seqread.xseq_record_len;
+					if (eof > offs)
+						bytes_read += eof - offs;
+					if (!db->db_xlog.xlog_seq_start(&ws.ws_seqread, XT_GET_DISK_4(record->xl.xl_log_id_4), 0, TRUE))
+						xt_throw(self);
+					break;
+				}
+				case XT_LOG_ENT_NEW_TAB:
+					tab_id = XT_GET_DISK_4(record->xt.xt_tab_id_4);
+					if (tab_id > db->db_curr_tab_id)
+						db->db_curr_tab_id = tab_id;
+					break;
+				case XT_LOG_ENT_UPDATE_BG:
+				case XT_LOG_ENT_INSERT_BG:
+				case XT_LOG_ENT_DELETE_BG:
+					xn_id = XT_GET_DISK_4(record->xu.xu_xact_id_4);
+					goto start_xact;
+				case XT_LOG_ENT_UPDATE_FL_BG:
+				case XT_LOG_ENT_INSERT_FL_BG:
+				case XT_LOG_ENT_DELETE_FL_BG:
+					xn_id = XT_GET_DISK_4(record->xf.xf_xact_id_4);
+					start_xact:
+					if (xt_xn_is_before(db->db_xn_curr_id, xn_id))
+						db->db_xn_curr_id = xn_id;
+
+					if (!(xact = xt_xn_add_old_xact(db, xn_id, self)))
+						xt_throw(self);
+
+					xact->xd_begin_log = ws.ws_seqread.xseq_rec_log_id;
+					xact->xd_begin_offset = ws.ws_seqread.xseq_rec_log_offset;
+
+					xact->xd_end_xn_id = xn_id;
+					xact->xd_end_time = db->db_xn_end_time;
+					xact->xd_flags = (XT_XN_XAC_LOGGED | XT_XN_XAC_ENDED | XT_XN_XAC_RECOVERED | XT_XN_XAC_SWEEP);
+
+					/* This may affect the "minimum RAM transaction": */
+					if (!min_ram_xn_id_set || xt_xn_is_before(xn_id, db->db_xn_min_ram_id)) {
+						min_ram_xn_id_set = TRUE;
+						db->db_xn_min_ram_id = xn_id;
+					}
+					xt_xres_apply_in_order(self, &ws, ws.ws_seqread.xseq_rec_log_id, ws.ws_seqread.xseq_rec_log_offset, record);
+					break;
+				case XT_LOG_ENT_COMMIT:
+				case XT_LOG_ENT_ABORT:
+					xn_id = XT_GET_DISK_4(record->xe.xe_xact_id_4);
+					if ((xact = xt_xn_get_xact(db, xn_id, self))) {
+						xact->xd_end_xn_id = xn_id;
+						xact->xd_flags |= XT_XN_XAC_ENDED | XT_XN_XAC_SWEEP;
+						xact->xd_flags &= ~XT_XN_XAC_RECOVERED; // We can expect an end record on cleanup!
+						xact->xd_flags &= ~XT_XN_XAC_PREPARED;  // Prepared transactions cannot be swept!
+						if (record->xl.xl_status_1 == XT_LOG_ENT_COMMIT)
+							xact->xd_flags |= XT_XN_XAC_COMMITTED;
+						if (xt_sl_get_size(db->db_xn_xa_list) > 0)
+							xt_xn_delete_xa_data_by_xact(db, xn_id, self);
+					}
+					break;
+				case XT_LOG_ENT_CLEANUP:
+					/* The transaction was cleaned up: */
+					xn_id = XT_GET_DISK_4(record->xc.xc_xact_id_4);
+					xt_xn_delete_xact(db, xn_id, self);
+					break;
+				case XT_LOG_ENT_OP_SYNC:
+					xres_sync_operations(self, db, &ws);
+					break;
+				case XT_LOG_ENT_DEL_LOG:
+					xtLogID rec_log_id;
+
+					rec_log_id = XT_GET_DISK_4(record->xl.xl_log_id_4);
+					xt_dl_set_to_delete(self, db, rec_log_id);
+					break;
+				case XT_LOG_ENT_PREPARE:
+					xn_id = XT_GET_DISK_4(record->xp.xp_xact_id_4);
+					if ((xact = xt_xn_get_xact(db, xn_id, self))) {
+						xact->xd_flags |= XT_XN_XAC_PREPARED;
+						if (!xt_xn_store_xa_data(db, xn_id, record->xp.xp_xa_len_1, record->xp.xp_xa_data, self))
+							xt_throw(self);
+					}
+					break;
+				default:
+					xt_xres_apply_in_order(self, &ws, ws.ws_seqread.xseq_rec_log_id, ws.ws_seqread.xseq_rec_log_offset, record);
+					break;
+			}
+		}
+
+		if (xres_sync_operations(self, db, &ws)) {
+			XTactOpSyncEntryDRec	op_sync;
+			time_t					now = time(NULL);
+
+			op_sync.os_status_1 = XT_LOG_ENT_OP_SYNC;
+			op_sync.os_checksum_1 = XT_CHECKSUM_1(now) ^ XT_CHECKSUM_1(ws.ws_seqread.xseq_rec_log_id);
+			XT_SET_DISK_4(op_sync.os_time_4, (xtWord4) now);
+			/* TODO: If this is done, check to see that
+			 * the bytes written here are read back by the writer.
+			 * This is in order to be in sync with 'xl_log_bytes_written'.
+			 * i.e. xl_log_bytes_written == xl_log_bytes_read
+			 */
+			if (!db->db_xlog.xlog_write_thru(&ws.ws_seqread, sizeof(XTactOpSyncEntryDRec), (xtWord1 *) &op_sync, self))
+				xt_throw(self);
+		}
+	}
+	catch_(a) {
+		ok = FALSE;
+	}
+	cont_(a);
+
+	if (ok) {
+		if (print_progress) {
+			while (perc_complete <= 100) {
+				if (((perc_complete - 1) % 25) == 0)
+					xt_logf(XT_NT_INFO, "PBXT: ");
+				if ((perc_complete % 25) == 0)
+					xt_logf(XT_NT_INFO, "%2d\n", (int) perc_complete);
+				else
+					xt_logf(XT_NT_INFO, "%2d ", (int) perc_complete);
+				xt_log_flush(self);
+				xres_recover_progress(self, &progress_file, perc_complete);
+				perc_complete++;
+			}
+		}
+		if (bytes_to_read != 0)
+			xt_logf(XT_NT_INFO, "PBXT: Recovering complete at %lu-%llu, bytes read: %llu\n", (u_long) ws.ws_seqread.xseq_rec_log_id, (u_llong) ws.ws_seqread.xseq_rec_log_offset, (u_llong) bytes_read);
+
+		*log_id = ws.ws_seqread.xseq_rec_log_id;
+		*log_offset = ws.ws_seqread.xseq_rec_log_offset;
+
+		if (!min_ram_xn_id_set)
+			/* This is true because if no transaction was placed in RAM then
+			 * the next transaction in RAM will have the next ID: */
+			db->db_xn_min_ram_id = db->db_xn_curr_id + 1;
+
+#ifdef RECOVER_FREE_COUNTS
+		if (xres_cp_log_id != *log_id || xres_cp_log_offset != *log_offset) {
+			/* Recovery took place, correct the row count! */
+			xres_recover_table_free_counts(self, db, &ws);
+		}
+#endif
+	}
+
+	failed:
+	xt_free_writer_state(self, &ws);
+	self->st_dlog_buf.dlb_exit(self);
+	xres_recover_progress(self, &progress_file, 101);
+	return ok;
+}
+
+xtBool XTXactRestart::xres_is_checkpoint_pending(xtLogID curr_log_id, xtLogOffset curr_log_offset)
+{	/* TRUE once the estimated log bytes between the last checkpoint and the given position reach xt_db_checkpoint_frequency. */
+	return xt_bytes_since_last_checkpoint(xres_db, curr_log_id, curr_log_offset) >= xt_db_checkpoint_frequency;
+}
+
+/*
+ * Estimate the number of log bytes that recovery will have to read, starting
+ * at the checkpoint position (xres_cp_log_id, xres_cp_log_offset).
+ * On return *log_count is the number of logs counted and *max_log_id the last
+ * log counted (or the checkpoint log if none was). This is only an estimate. */
+off_t XTXactRestart::xres_bytes_to_read(XTThreadPtr self, XTDatabaseHPtr db, u_int *log_count, xtLogID *max_log_id)
+{
+	off_t				to_read = 0, eof;
+	xtLogID				log_id = xres_cp_log_id;
+	char				log_path[PATH_MAX];
+	XTOpenFilePtr		of;
+	XTXactLogHeaderDRec	log_head;
+	size_t				head_size;
+	size_t				red_size;
+
+	*max_log_id = log_id;
+	*log_count = 0;
+	for (;;) { /* Walk the logs in ascending ID order until one is missing or fails validation. */
+		db->db_xlog.xlog_name(PATH_MAX, log_path, log_id);
+		of = NULL;
+		if (!xt_open_file_ns(&of, log_path, XT_FS_MISSING_OK))
+			xt_throw(self);
+		if (!of) /* Nothing was opened (the open used XT_FS_MISSING_OK): stop counting. */
+			break;
+		pushr_(xt_close_file, of);
+
+		/* Check the first record of the log, to see if it is valid. */
+		if (!xt_pread_file(of, 0, sizeof(XTXactLogHeaderDRec), 0, (xtWord1 *) &log_head, &red_size, &self->st_statistics.st_xlog, self))
+			xt_throw(self);
+		/* The minimum size (old log size): */
+		if (red_size < XT_MIN_LOG_HEAD_SIZE)
+			goto done;
+		head_size = XT_GET_DISK_4(log_head.xh_size_4);
+		if (log_head.xh_status_1 != XT_LOG_ENT_HEADER)
+			goto done;
+		if (log_head.xh_checksum_1 != XT_CHECKSUM_1(log_id))
+			goto done;
+		if (XT_LOG_HEAD_MAGIC(&log_head, head_size) != XT_LOG_FILE_MAGIC)
+			goto done;
+		if (head_size > offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4) { /* Header extends past xh_log_id_4: the stored ID must match. */
+			if (XT_GET_DISK_4(log_head.xh_log_id_4) != log_id)
+				goto done;
+		}
+		if (head_size > offsetof(XTXactLogHeaderDRec, xh_version_2) + 4) { /* NOTE(review): xh_version_2 is read as 2 bytes but the bound adds 4 -- confirm. */
+			if (XT_GET_DISK_2(log_head.xh_version_2) > XT_LOG_VERSION_NO) 				
+				xt_throw_ulxterr(XT_CONTEXT, XT_ERR_NEW_TYPE_OF_XLOG, (u_long) log_id);
+		}
+
+		eof = xt_seek_eof_file(self, of);
+		freer_(); // xt_close_file(of)
+		if (log_id == xres_cp_log_id) /* The checkpoint log is only counted from the checkpoint offset onwards. */
+			to_read += (eof - xres_cp_log_offset);
+		else
+			to_read += eof;
+		(*log_count)++;
+		*max_log_id = log_id;
+		log_id++;
+	}
+	return to_read;
+
+	done: /* Reached with the log file still on the resource stack: close it, then return what was counted so far. */
+	freer_(); // xt_close_file(of)
+	return to_read;
+}
+
+
+/* ----------------------------------------------------------------------
+ * C H E C K P O I N T    P R O C E S S
+ */
+
+typedef enum XTFileType {
+	XT_FT_RECROW_FILE,	/* flushed with xt_flush_record_row() */
+	XT_FT_INDEX_FILE	/* flushed with xt_flush_indices() */
+} XTFileType;
+
+typedef struct XTDirtyFile {
+	xtTableID				df_tab_id;		/* table the file belongs to */
+	XTFileType				df_file_type;	/* which of the table's files to flush */
+} XTDirtyFileRec, *XTDirtyFilePtr;
+
+#define XT_MAX_FLUSH_FILES			200						/* capacity of the flush list in xres_cp_flush_files() */
+#define XT_FLUSH_THRESHOLD			(2 * 1024 * 1024)		/* pending bytes (2 MB) before a file is put on the flush list */
+
+/* Collect the files with at least XT_FLUSH_THRESHOLD bytes pending, then flush them one at a time (compiled only with USE_LATER). */
+#ifdef USE_LATER
+static void xres_cp_flush_files(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	u_int			edx;
+	XTTableEntryPtr	te;
+	XTDirtyFileRec	flush_list[XT_MAX_FLUSH_FILES];
+	u_int			file_count = 0;
+	XTIndexPtr		*iptr;
+	u_int			dirty_blocks;
+	XTOpenTablePtr	ot;
+	XTTableHPtr		tab;
+	/* NOTE(review): 'retry' is taken when file_count == 100, but the list holds XT_MAX_FLUSH_FILES (200) and file_count is never reset -- confirm. */
+	retry:
+	xt_enum_tables_init(&edx);
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+	while (file_count < XT_MAX_FLUSH_FILES &&
+		(te = xt_enum_tables_next(self, db, &edx))) {
+		if ((tab = te->te_table)) {
+			if (tab->tab_bytes_to_flush >= XT_FLUSH_THRESHOLD) { /* enough pending record/row data */
+				flush_list[file_count].df_tab_id = te->te_tab_id;
+				flush_list[file_count].df_file_type = XT_FT_RECROW_FILE;
+				file_count++;
+			}
+			if (file_count == XT_MAX_FLUSH_FILES) /* list is full: no room left for this table's index entry */
+				break;
+			iptr = tab->tab_dic.dic_keys;
+			dirty_blocks = 0;
+			for (u_int i=0;i<tab->tab_dic.dic_key_count; i++) { /* total the dirty blocks over all of the table's indices */
+				dirty_blocks += (*iptr)->mi_dirty_blocks;
+				iptr++;
+			}
+			if ((dirty_blocks * XT_INDEX_PAGE_SIZE) >= XT_FLUSH_THRESHOLD) {
+				flush_list[file_count].df_tab_id = te->te_tab_id;
+				flush_list[file_count].df_file_type = XT_FT_INDEX_FILE;
+				file_count++;
+			}
+		}
+	}
+	freer_(); // xt_ht_unlock(db->db_tables)
+
+	for (u_int i=0;i<file_count && !self->t_quit; i++) {
+		/* We want to flush about once a second: */ 
+		xt_sleep_milli_second(400);
+		if ((ot = xt_db_open_pool_table(self, db, flush_list[i].df_tab_id, NULL, TRUE))) {
+			pushr_(xt_db_return_table_to_pool, ot);
+
+			if (flush_list[i].df_file_type == XT_FT_RECROW_FILE) {
+				if (!xt_flush_record_row(ot, NULL)) /* NOTE(review): xres_cp_checkpoint() calls this with three arguments -- confirm. */
+					xt_throw(self);
+			}
+			else {
+				if (!xt_flush_indices(ot, NULL)) /* NOTE(review): xres_cp_checkpoint() calls this with three arguments -- confirm. */
+					xt_throw(self);
+			}
+
+			freer_(); // xt_db_return_table_to_pool(ot)
+		}
+	}
+	
+	if (file_count == 100)
+		goto retry;
+}
+#endif
+
+#ifdef xxx
+void XTXactRestart::xres_checkpoint_pending(xtLogID log_id, xtLogOffset log_offset)
+{
+#ifdef TRACE_CHECKPOINT_ACTIVITY
+	xtBool tmp = xres_cp_pending; /* previous value, so that only changes are traced */
+#endif
+	xres_cp_pending = xres_is_checkpoint_pending(log_id, log_offset);
+#ifdef TRACE_CHECKPOINT_ACTIVITY
+	if (tmp) {
+		if (!xres_cp_pending)
+			printf("%s xres_cp_pending = FALSE\n", xt_get_self()->t_name);
+	}
+	else {
+		if (xres_cp_pending)
+			printf("%s xres_cp_pending = TRUE\n", xt_get_self()->t_name);
+	}
+#endif
+}
+/* NOTE(review): the statements below sit outside any function body; the whole region is disabled by '#ifdef xxx'. */
+
+	xres_checkpoint_pending();
+
+	if (!xres_cp_required &&
+		!xres_cp_pending &&
+		xt_sl_get_size(db->db_datalogs.dlc_to_delete) == 0 &&
+		xt_sl_get_size(db->db_datalogs.dlc_deleted) == 0)
+		return FALSE;
+#endif
+
+#ifdef NEVER_CHECKPOINT
+xtBool no_checkpoint = TRUE;	/* while TRUE, xres_cp_checkpoint() returns FALSE at once */
+#endif
+
+#define XT_CHECKPOINT_IF_NO_ACTIVITY		0	/* give up once data was flushed and the writer count has changed */
+#define XT_CHECKPOINT_PAUSE_IF_ACTIVITY		1	/* sleep 400 ms after more than 2 MB was flushed while the writer count changed */
+#define XT_CHECKPOINT_NO_PAUSE				2	/* flush without checking for activity */
+
+/*
+ * Begin a checkpoint (if none is running), flush the listed tables one by one, then try to end it.
+ * Returns TRUE if xt_end_checkpoint() reported the checkpoint as done. Throws on flush/begin/end failure. */
+static xtBool xres_cp_checkpoint(XTThreadPtr self, XTDatabaseHPtr db, u_int curr_writer_total, xtBool force_checkpoint)
+{
+	XTCheckPointStatePtr	cp = &db->db_cp_state;
+	XTOpenTablePtr			ot;
+	XTCheckPointTablePtr	to_flush_ptr;
+	XTCheckPointTableRec	to_flush;
+	u_int					table_count = 0;
+	xtBool					checkpoint_done = FALSE; /* xt_end_checkpoint() can return OK without writing it */
+	off_t					bytes_flushed = 0;
+	int						check_type;
+
+#ifdef NEVER_CHECKPOINT
+	if (no_checkpoint)
+		return FALSE;
+#endif
+	if (force_checkpoint) { /* Forced: never abandon the checkpoint, at most pause between flushes. */
+		if (db->db_restart.xres_cp_required)
+			check_type = XT_CHECKPOINT_NO_PAUSE;
+		else
+			check_type = XT_CHECKPOINT_PAUSE_IF_ACTIVITY;
+	}
+	else
+		check_type = XT_CHECKPOINT_IF_NO_ACTIVITY;	
+
+	to_flush.cpt_tab_id = 0;
+	to_flush.cpt_flushed = 0;
+
+	/* Start a checkpoint: */
+	if (!xt_begin_checkpoint(db, FALSE, self))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		xt_lock_mutex_ns(&cp->cp_state_lock);
+		table_count = 0;
+		if (cp->cp_table_ids)
+			table_count = xt_sl_get_size(cp->cp_table_ids);
+		if (!cp->cp_running || cp->cp_flush_count >= table_count) { /* nothing running, or every table counted as flushed */
+			xt_unlock_mutex_ns(&cp->cp_state_lock);
+			break;
+		}
+		if (cp->cp_next_to_flush > table_count) /* wrap around to revisit tables that are not fully flushed */
+			cp->cp_next_to_flush = 0;
+
+		to_flush_ptr = (XTCheckPointTablePtr) xt_sl_item_at(cp->cp_table_ids, cp->cp_next_to_flush);
+		if (to_flush_ptr)
+			to_flush = *to_flush_ptr; /* work on a copy: the list may be freed once the lock is dropped */
+		xt_unlock_mutex_ns(&cp->cp_state_lock);
+
+		if (to_flush_ptr) {
+			if ((ot = xt_db_open_pool_table(self, db, to_flush.cpt_tab_id, NULL, TRUE))) {
+				pushr_(xt_db_return_table_to_pool, ot);
+
+				if (!(to_flush.cpt_flushed & XT_CPT_REC_ROW_FLUSHED)) {
+					if (!xt_flush_record_row(ot, &bytes_flushed, FALSE))
+						xt_throw(self);
+				}
+
+				xt_lock_mutex_ns(&cp->cp_state_lock); /* re-read the entry: its flush state may have changed meanwhile */
+				to_flush_ptr = NULL;
+				if (cp->cp_running)
+					to_flush_ptr = (XTCheckPointTablePtr) xt_sl_item_at(cp->cp_table_ids, cp->cp_next_to_flush);
+				if (to_flush_ptr)
+					to_flush = *to_flush_ptr;
+				xt_unlock_mutex_ns(&cp->cp_state_lock);
+
+				if (to_flush_ptr && !self->t_quit) {
+					if (!(to_flush.cpt_flushed & XT_CPT_INDEX_FLUSHED)) {
+						switch (check_type) {
+							case XT_CHECKPOINT_IF_NO_ACTIVITY:
+								if (bytes_flushed > 0 && curr_writer_total != db->db_xn_total_writer_count) {
+									freer_(); // xt_db_return_table_to_pool(ot)
+									goto end_checkpoint;
+								}
+								break;
+							case XT_CHECKPOINT_PAUSE_IF_ACTIVITY:
+								if (bytes_flushed > 2 * 1024 * 1024 && curr_writer_total != db->db_xn_total_writer_count) {
+									curr_writer_total = db->db_xn_total_writer_count;
+									bytes_flushed = 0;
+									xt_sleep_milli_second(400);
+								}
+								break;
+							case XT_CHECKPOINT_NO_PAUSE:
+								break;
+						}
+
+						if (!self->t_quit) {
+							if (!xt_flush_indices(ot, &bytes_flushed, FALSE))
+								xt_throw(self);
+							to_flush.cpt_flushed |= XT_CPT_INDEX_FLUSHED;
+						}
+					}
+				}
+
+				freer_(); // xt_db_return_table_to_pool(ot)
+			}
+			/* NOTE(review): cp_next_to_flush is advanced here without holding cp_state_lock -- confirm this is safe. */
+			if ((to_flush.cpt_flushed & XT_CPT_ALL_FLUSHED) == XT_CPT_ALL_FLUSHED)
+				cp->cp_next_to_flush++;
+		}
+		else
+			cp->cp_next_to_flush++;
+
+		if (self->t_quit)
+			break;
+
+		switch (check_type) { /* same activity check as above, applied between tables */
+			case XT_CHECKPOINT_IF_NO_ACTIVITY:
+				if (bytes_flushed > 0 && curr_writer_total != db->db_xn_total_writer_count)
+					goto end_checkpoint;
+				break;
+			case XT_CHECKPOINT_PAUSE_IF_ACTIVITY:
+				if (bytes_flushed > 2 * 1024 * 1024 && curr_writer_total != db->db_xn_total_writer_count) {
+					curr_writer_total = db->db_xn_total_writer_count;
+					bytes_flushed = 0;
+					xt_sleep_milli_second(400);
+				}
+				break;
+			case XT_CHECKPOINT_NO_PAUSE:
+				break;
+		}
+	}
+
+	end_checkpoint:
+	if (!xt_end_checkpoint(db, self, &checkpoint_done))
+		xt_throw(self);
+	return checkpoint_done;
+}
+
+
+/* Wait on db_cp_cond (under db_cp_lock) for at most milli_secs milliseconds,
+ * unless this thread has already been asked to quit. */
+static void xres_cp_wait_for_log_writer(XTThreadPtr self, XTDatabaseHPtr db, u_long milli_secs)
+{
+	xt_lock_mutex(self, &db->db_cp_lock);
+	pushr_(xt_unlock_mutex, &db->db_cp_lock);
+	if (!self->t_quit)
+		xt_timed_wait_cond(self, &db->db_cp_cond, &db->db_cp_lock, milli_secs);
+	freer_(); // xt_unlock_mutex(&db->db_cp_lock)
+}
+
+/*
+ * This is the way checkpoint works:
+ *
+ * To write a checkpoint we need to flush all tables in
+ * the database.
+ *
+ * Before flushing the first table we get the checkpoint
+ * log position.
+ *
+ * After flushing all files we write out the checkpoint
+ * log position.
+ */
+static void xres_cp_main(XTThreadPtr self)
+{
+	XTDatabaseHPtr		db = self->st_database;
+	u_int				curr_writer_total;
+	time_t				now;
+	xtXactID			sweep_count;
+
+	xt_set_low_priority(self);
+
+	while (!self->t_quit) {
+		/* Wait 2 seconds: */
+		curr_writer_total = db->db_xn_total_writer_count;
+		xt_db_approximate_time = time(NULL);
+		now = xt_db_approximate_time;
+		while (!self->t_quit && xt_db_approximate_time < now + 2 && !db->db_restart.xres_cp_required) { /* ends early if a checkpoint is required */
+			xres_cp_wait_for_log_writer(self, db, 400);
+			xt_db_approximate_time = time(NULL);
+			xt_db_free_unused_open_tables(self, db);
+		}
+		
+		if (self->t_quit)
+			break;
+
+		sweep_count = db->db_xn_curr_id + 1 - db->db_xn_to_clean_id; /* zero when db_xn_to_clean_id is one past the current transaction ID */
+		if (curr_writer_total == db->db_xn_total_writer_count &&
+			!sweep_count &&
+			db->db_wr_idle == XT_THREAD_IDLE) {
+			/* No activity in 2 seconds: */
+			xres_cp_checkpoint(self, db, curr_writer_total, FALSE);
+		}
+		else {
+			/* The server is busy, check if we need to
+			 * write a checkpoint anyway...
+			 */
+			if (db->db_restart.xres_cp_required ||
+				db->db_restart.xres_is_checkpoint_pending(db->db_xlog.xl_write_log_id, db->db_xlog.xl_write_log_offset)) {
+				/* Flush tables, until the checkpoint is complete. */
+				xres_cp_checkpoint(self, db, curr_writer_total, TRUE);
+			}
+		}
+
+		if (curr_writer_total == db->db_xn_total_writer_count) {
+			/* We did a checkpoint, and still, nothing has
+			 * happened....
+			 *
+			 * Wait for something to happen:
+			 */
+			xtLogID		log_id;
+			xtLogOffset	log_offset;
+
+			while (!self->t_quit && curr_writer_total == db->db_xn_total_writer_count) {
+				/* The writer position: */
+				xt_lock_mutex(self, &db->db_wr_lock);
+				pushr_(xt_unlock_mutex, &db->db_wr_lock);
+				log_id = db->db_wr_log_id;
+				log_offset = db->db_wr_log_offset;
+				freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+				/* This condition means we could checkpoint: there is something to checkpoint and no XA data is outstanding. */
+				if (!(xt_sl_get_size(db->db_datalogs.dlc_to_delete) == 0 &&
+					xt_sl_get_size(db->db_datalogs.dlc_deleted) == 0 &&
+					xt_comp_log_pos(log_id, log_offset, db->db_restart.xres_cp_log_id, db->db_restart.xres_cp_log_offset) <= 0) &&
+					xt_sl_get_size(db->db_xn_xa_list) == 0)
+					break;
+
+				xres_cp_wait_for_log_writer(self, db, 400);
+				xt_db_approximate_time = time(NULL);
+				xt_db_free_unused_open_tables(self, db);
+			}
+		}
+	}
+}
+
+static void *xres_cp_run_thread(XTThreadPtr self)
+{	/* Thread entry point of the checkpointer: runs xres_cp_main() until asked to quit, restarting it after an exception. */
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) self->t_data;
+	int				count;
+	void			*mysql_thread;
+
+	if (!(mysql_thread = myxt_create_thread()))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		try_(a) {
+			/*
+			 * NOTE(review): this comment read "The garbage collector requires that the
+			 * database is in use because." -- unfinished, and it names a different thread.
+			 */
+			xt_use_database(self, db, XT_FOR_CHECKPOINTER);
+
+			/* This action is both safe and required (see details elsewhere) */
+			xt_heap_release(self, self->st_database);
+
+			xres_cp_main(self);
+		}
+		catch_(a) {
+			/* This error is "normal"! (no dictionary, or a caught SIGTERM: neither is logged) */
+			if (self->t_exception.e_xt_err != XT_ERR_NO_DICTIONARY &&
+				!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* Avoid releasing the database (done above) */
+		self->st_database = NULL;
+		xt_unuse_database(self, self);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds (polled once a second so that a quit request is noticed) */
+		count = 60;
+		while (!self->t_quit && count > 0) {
+			sleep(1);
+			count--;
+		}
+	}
+
+   /*
+	* {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	*/
+	return NULL;
+}
+
+static void xres_cp_free_thread(XTThreadPtr self, void *data)
+{	/* Thread-data free callback (registered in xt_start_checkpointer): clears db_cp_thread under db_cp_lock. */
+	XTDatabaseHPtr db = (XTDatabaseHPtr) data;
+
+	if (db->db_cp_thread) {
+		xt_lock_mutex(self, &db->db_cp_lock);
+		pushr_(xt_unlock_mutex, &db->db_cp_lock);
+		db->db_cp_thread = NULL;
+		freer_(); // xt_unlock_mutex(&db->db_cp_lock)
+	}
+}
+
+/* Start a checkpoint, if none has been started. Returns FAILED if the checkpointer is not inited or a list operation fails, else OK. */
+xtPublic xtBool xt_begin_checkpoint(XTDatabaseHPtr db, xtBool have_table_lock, XTThreadPtr thread)
+{
+	XTCheckPointStatePtr	cp = &db->db_cp_state;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtLogID					ind_rec_log_id;
+	xtLogOffset				ind_rec_log_offset;
+	u_int					edx;
+	XTTableEntryPtr			te_ptr;
+	XTTableHPtr				tab;
+	XTOperationPtr			op;
+	XTCheckPointTableRec	cpt;
+	XTSortedListPtr			tables = NULL;
+	
+	/* during startup we can get an error before the checkpointer is inited */
+	if (!cp->cp_inited)
+		return FAILED;
+
+	/* First check if a checkpoint is already running: */
+	xt_lock_mutex_ns(&cp->cp_state_lock);
+	if (cp->cp_running) {
+		xt_unlock_mutex_ns(&cp->cp_state_lock);
+		return OK;
+	}
+	if (cp->cp_table_ids) { /* discard a table list left over while no checkpoint is running */
+		xt_free_sortedlist(NULL, cp->cp_table_ids);
+		cp->cp_table_ids = NULL;
+	}
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+	
+	/* Flush the log before we continue. This is to ensure that
+	 * before we write a checkpoint, that the changes
+	 * done by the sweeper and the compactor, have been
+	 * applied.
+	 *
+	 * Note, the sweeper does not flush the log, so this is
+	 * necessary!
+	 *
+	 * --- I have removed this flush. It is actually just a
+	 * minor optimisation, which pushes the flush position
+	 * below ahead.
+	 *
+	 * Note that the writer position used for the checkpoint
+	 * _will_ be behind the current log flush position.
+	 *
+	 * This is because the writer cannot apply log changes
+	 * until they are flushed.
+	 */
+	/* This is an alternative to the above.
+	if (!xt_xlog_flush_log(db, self))
+		xt_throw(self);
+	*/
+	xt_lock_mutex_ns(&db->db_wr_lock);
+
+	/* The theoretical maximum restart log position, is the
+	 * position of the writer thread:
+	 */
+	log_id = db->db_wr_log_id;
+	log_offset = db->db_wr_log_offset;
+
+	ind_rec_log_id = db->db_xlog.xl_flush_log_id;
+	ind_rec_log_offset = db->db_xlog.xl_flush_log_offset;
+
+	xt_unlock_mutex_ns(&db->db_wr_lock);
+
+	/* Go through all the transactions, and find
+	 * the lowest log start position of all the transactions.
+	 */
+	for (u_int i=0; i<XT_XN_NO_OF_SEGMENTS; i++) {
+		XTXactSegPtr 	seg;
+
+		seg = &db->db_xn_idx[i];
+		XT_XACT_READ_LOCK(&seg->xs_tab_lock, self);
+		for (u_int j=0; j<XT_XN_HASH_TABLE_SIZE; j++) {
+			XTXactDataPtr	xact;
+			
+			xact = seg->xs_table[j];
+			while (xact) { /* walk the chain of transactions in this hash bucket */
+				/* If the transaction is logged, but not cleaned: */
+				if ((xact->xd_flags & (XT_XN_XAC_LOGGED | XT_XN_XAC_CLEANED)) == XT_XN_XAC_LOGGED) {
+					if (xt_comp_log_pos(log_id, log_offset, xact->xd_begin_log, xact->xd_begin_offset) > 0) {
+						log_id = xact->xd_begin_log;
+						log_offset = xact->xd_begin_offset;
+					}
+				}
+				xact = xact->xd_next_xact;
+			}
+		}
+		XT_XACT_UNLOCK(&seg->xs_tab_lock, self, FALSE);
+	}
+
+#ifdef TRACE_CHECKPOINT
+	printf("BEGIN CHECKPOINT %d-%llu\n", (int) log_id, (u_llong) log_offset);
+#endif
+	/* Go through all tables, and find the lowest log position.
+	 * The log position stored by each table shows the position of
+	 * the next operation that still needs to be applied.
+	 *
+	 * This comes from the list of operations which are
+	 * queued for the table.
+	 *
+	 * This function also builds a list of tables!
+	 */
+
+	if (!(tables = xt_new_sortedlist_ns(sizeof(XTCheckPointTableRec), 20, xres_comp_flush_tabs, NULL, NULL)))
+		return FAILED;
+
+	xt_enum_tables_init(&edx);
+	if (!have_table_lock) /* the caller says it does not hold the db_tables lock: take it here */
+		xt_ht_lock(NULL, db->db_tables);
+	while ((te_ptr = xt_enum_tables_next(NULL, db, &edx))) {
+		if ((tab = te_ptr->te_table)) {
+			xt_sl_lock_ns(tab->tab_op_list, thread);
+			if ((op = (XTOperationPtr) xt_sl_first_item(tab->tab_op_list))) {
+				if (xt_comp_log_pos(log_id, log_offset, op->or_log_id, op->or_log_offset) > 0) {
+					log_id = op->or_log_id;
+					log_offset = op->or_log_offset;
+				}
+			}
+			xt_sl_unlock(NULL, tab->tab_op_list);
+			cpt.cpt_flushed = 0;
+			cpt.cpt_tab_id = tab->tab_id;
+#ifdef TRACE_CHECKPOINT
+			printf("to flush: %d %s\n", (int) tab->tab_id, tab->tab_name->ps_path);
+#endif
+			if (!xt_sl_insert(NULL, tables, &tab->tab_id, &cpt)) { /* insert failed: undo the lock and the list, report failure */
+				if (!have_table_lock)
+					xt_ht_unlock(NULL, db->db_tables);
+				xt_free_sortedlist(NULL, tables);
+				return FAILED;
+			}
+		}
+	}
+	if (!have_table_lock)
+		xt_ht_unlock(NULL, db->db_tables);
+
+	xt_lock_mutex_ns(&cp->cp_state_lock);
+	/* If a checkpoint is already running, then someone was faster than me! A zero log ID or offset also prevents the start. */
+	if (!cp->cp_running && log_id && log_offset) {
+		cp->cp_running = TRUE;
+		cp->cp_log_id = log_id;
+		cp->cp_log_offset = log_offset;
+
+		cp->cp_ind_rec_log_id = ind_rec_log_id;
+		cp->cp_ind_rec_log_offset = ind_rec_log_offset;
+
+		cp->cp_flush_count = 0;
+		cp->cp_next_to_flush = 0;
+		cp->cp_table_ids = tables; /* ownership of the list passes to the checkpoint state */
+	}
+	else
+		xt_free_sortedlist(NULL, tables);
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+
+	/* At this point, log flushing can begin... */
+	return OK;
+}
+
+/* End a checkpoint, if a checkpoint has been started, and all checkpoint tables have been flushed.
+ * *checkpoint_done (if not NULL) is TRUE only when the checkpoint state was reset. Returns OK or FAILED.
+ */
+xtPublic xtBool xt_end_checkpoint(XTDatabaseHPtr db, XTThreadPtr thread, xtBool *checkpoint_done)
+{
+	XTCheckPointStatePtr	cp = &db->db_cp_state;
+	XTXlogCheckpointDPtr	cp_buf = NULL;
+	char					path[PATH_MAX];
+	XTOpenFilePtr			of;
+	u_int					table_count;
+	size_t					chk_size = 0; 
+	u_int					no_of_logs = 0; 
+	if (checkpoint_done) *checkpoint_done = FALSE; /* Default, so the early returns below never leave it unset. */
+	/* As long as we have outstanding XA transactions, we may not checkpoint! */
+	if (xt_sl_get_size(db->db_xn_xa_list) > 0) {
+#ifdef DEBUG
+		printf("Checkpoint must wait\n");
+#endif
+		return OK;
+	}
+
+#ifdef NEVER_CHECKPOINT
+	return OK;
+#endif
+	/* Lock the checkpoint state so that only one thread can do this! */
+	xt_lock_mutex_ns(&cp->cp_state_lock);
+	if (!cp->cp_running)
+		goto checkpoint_done;
+
+	table_count = 0;
+	if (cp->cp_table_ids)
+		table_count = xt_sl_get_size(cp->cp_table_ids);
+	if (cp->cp_flush_count < table_count) {
+		/* Checkpoint is not done, yet! */
+		xt_unlock_mutex_ns(&cp->cp_state_lock);
+		if (checkpoint_done)
+			*checkpoint_done = FALSE;
+		return OK;
+	}
+
+	/* Check if anything has changed since the last checkpoint,
+	 * if not, there is no need to write a new checkpoint!
+	 */
+	if (xt_sl_get_size(db->db_datalogs.dlc_to_delete) == 0 &&
+		xt_sl_get_size(db->db_datalogs.dlc_deleted) == 0 &&
+		xt_comp_log_pos(cp->cp_log_id, cp->cp_log_offset, db->db_restart.xres_cp_log_id, db->db_restart.xres_cp_log_offset) <= 0) {
+		/* A checkpoint is required if the size of the deleted
+		 * list is not zero. The reason is, I cannot remove the
+		 * logs from the deleted list BEFORE a checkpoint has been
+		 * done which does NOT include these logs.
+		 *
+		 * Even though the logs have already been deleted. They
+		 * remain on the deleted list to ensure that they are NOT
+		 * reused during this time, until the next checkpoint.
+		 *
+		 * This is done because if they are used, then on restart
+		 * they would be deleted!
+		 */
+#ifdef TRACE_CHECKPOINT
+		printf("--- END CHECKPOINT - no write\n");
+#endif
+		goto checkpoint_done;
+	}
+
+#ifdef TRACE_CHECKPOINT
+	printf("--- END CHECKPOINT - write start point\n");
+#endif
+	xt_lock_mutex_ns(&db->db_datalogs.dlc_lock);
+
+	no_of_logs = xt_sl_get_size(db->db_datalogs.dlc_to_delete);
+	chk_size = offsetof(XTXlogCheckpointDRec, xcp_del_log) + no_of_logs * 2; /* fixed header plus one 2-byte ID per data log to delete */
+	xtLogID	*log_id_ptr;
+
+	if (!(cp_buf = (XTXlogCheckpointDPtr) xt_malloc_ns(chk_size))) {
+		xt_unlock_mutex_ns(&db->db_datalogs.dlc_lock);
+		goto failed_0;
+	}
+
+	/* Increment the checkpoint number. This value is used if 2 checkpoint have the
+	 * same log number. In this case checkpoints may differ in the log files
+	 * that should be deleted. Here it is important to use the most recent
+	 * log file!
+	 */
+	db->db_restart.xres_cp_number++;
+
+	/* Create the checkpoint record: */
+	XT_SET_DISK_4(cp_buf->xcp_head_size_4, chk_size);
+	XT_SET_DISK_2(cp_buf->xcp_version_2, XT_CHECKPOINT_VERSION);
+	XT_SET_DISK_6(cp_buf->xcp_chkpnt_no_6, db->db_restart.xres_cp_number);
+	XT_SET_DISK_4(cp_buf->xcp_log_id_4, cp->cp_log_id);
+	XT_SET_DISK_6(cp_buf->xcp_log_offs_6, cp->cp_log_offset);
+	XT_SET_DISK_4(cp_buf->xcp_tab_id_4, db->db_curr_tab_id);
+	XT_SET_DISK_4(cp_buf->xcp_xact_id_4, db->db_xn_curr_id);
+	XT_SET_DISK_4(cp_buf->xcp_ind_rec_log_id_4, cp->cp_ind_rec_log_id);
+	XT_SET_DISK_6(cp_buf->xcp_ind_rec_log_offs_6, cp->cp_ind_rec_log_offset);
+	XT_SET_DISK_2(cp_buf->xcp_log_count_2, no_of_logs);
+
+	for (u_int i=0; i<no_of_logs; i++) {
+		log_id_ptr = (xtLogID *) xt_sl_item_at(db->db_datalogs.dlc_to_delete, i);
+		XT_SET_DISK_2(cp_buf->xcp_del_log[i], (xtWord2) *log_id_ptr);
+	}
+
+	XT_SET_DISK_2(cp_buf->xcp_checksum_2, xt_get_checksum(((xtWord1 *) cp_buf) + 2, chk_size - 2, 1)); /* checksum covers everything after the first 2 bytes */
+
+	xt_unlock_mutex_ns(&db->db_datalogs.dlc_lock);
+
+	/* Write the checkpoint: */
+	db->db_restart.xres_name(PATH_MAX, path, db->db_restart.xres_next_res_no);
+	if (!(of = xt_open_file_ns(path, XT_FS_CREATE | XT_FS_MAKE_PATH)))
+		goto failed_1;
+
+	if (!xt_set_eof_file(NULL, of, 0))
+		goto failed_2;
+	if (!xt_pwrite_file(of, 0, chk_size, (xtWord1 *) cp_buf, &thread->st_statistics.st_x, thread))
+		goto failed_2;
+	if (!xt_flush_file(of, &thread->st_statistics.st_x, thread))
+		goto failed_2;
+
+	xt_close_file_ns(of);
+
+	/* Next time write the other restart file: */
+	db->db_restart.xres_next_res_no = (db->db_restart.xres_next_res_no % 2) + 1;
+	db->db_restart.xres_cp_log_id = cp->cp_log_id;
+	db->db_restart.xres_cp_log_offset = cp->cp_log_offset;
+	db->db_restart.xres_cp_required = FALSE;
+
+	/*
+	 * Remove all the data logs that were deleted on the
+	 * last checkpoint:
+	 */
+	if (!xres_remove_data_logs(db))
+		goto failed_0;
+
+#ifndef DEBUG_KEEP_LOGS
+	/* After checkpoint, we can delete transaction logs that will no longer be required
+	 * for recovery...
+	 */
+	if (cp->cp_log_id > 1) {
+		xtLogID	current_log_id = cp->cp_log_id;
+		xtLogID	del_log_id;
+
+#ifdef XT_NUMBER_OF_LOGS_TO_SAVE
+		if (pbxt_crash_debug) {
+			/* To save the logs, we just consider them in use: */
+			if (current_log_id > XT_NUMBER_OF_LOGS_TO_SAVE)
+				current_log_id -= XT_NUMBER_OF_LOGS_TO_SAVE;
+			else
+				current_log_id = 1;
+		}
+#endif
+
+		del_log_id = current_log_id - 1;
+
+		while (del_log_id > 0) { /* scan downwards for the first log that no longer exists */
+			db->db_xlog.xlog_name(PATH_MAX, path, del_log_id);
+			if (!xt_fs_exists(path))
+				break;
+			del_log_id--;
+		}
+
+		/* This was the lowest log ID that existed: */
+		del_log_id++;
+
+		/* Delete all logs that still exist, that come before
+		 * the current log:
+		 *
+		 * Do this from least to greatest to ensure no "holes" appear.
+		 */
+		while (del_log_id < current_log_id) {
+			switch (db->db_xlog.xlog_delete_log(del_log_id, thread)) {
+				case OK:
+					break;
+				case FAILED:
+					goto exit_loop;
+				case XT_ERR:
+					goto failed_0;
+			}
+			del_log_id++;
+		}
+		exit_loop:;
+	}
+
+	/* And we can delete data logs in the list, and place them
+	 * on the deleted list.
+	 */
+	xtLogID log_id;
+	for (u_int i=0; i<no_of_logs; i++) {
+		log_id = (xtLogID) XT_GET_DISK_2(cp_buf->xcp_del_log[i]);
+		if (!xres_delete_data_log(db, log_id))
+			goto failed_0;
+	}
+#endif
+
+	xt_free_ns(cp_buf);
+	cp_buf = NULL;
+
+	checkpoint_done:
+	cp->cp_running = FALSE;
+	if (cp->cp_table_ids) {
+		xt_free_sortedlist(NULL, cp->cp_table_ids);
+		cp->cp_table_ids = NULL;
+	}
+	cp->cp_flush_count = 0;
+	cp->cp_next_to_flush = 0;
+	db->db_restart.xres_cp_required = FALSE;
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+	if (checkpoint_done)
+		*checkpoint_done = TRUE;
+	return OK;
+
+	failed_2:
+	xt_close_file_ns(of);
+
+	failed_1:
+	/* cp_buf is released exactly once, at failed_0 below (freeing it here too was a double free). */
+
+	failed_0:
+	if (cp_buf)
+		xt_free_ns(cp_buf);
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+	return FAILED;
+}
+
+xtPublic xtWord8 xt_bytes_since_last_checkpoint(XTDatabaseHPtr db, xtLogID curr_log_id, xtLogOffset curr_log_offset)
+{	/* Estimate the log bytes between the last checkpoint position and (curr_log_id, curr_log_offset). */
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtWord8					byte_count = 0; /* same width as the return type: no truncation where size_t is 32 bits */
+
+	log_id = db->db_restart.xres_cp_log_id;
+	log_offset = db->db_restart.xres_cp_log_offset;
+
+	/* Assume the logs have the threshold: */
+	if (log_id < curr_log_id) { /* the rest of the checkpoint log, from the checkpoint offset up to the threshold */
+		if (log_offset < xt_db_log_file_threshold)
+			byte_count = (xtWord8) (xt_db_log_file_threshold - log_offset);
+		log_offset = 0;
+		log_id++;
+	}
+	while (log_id < curr_log_id) { /* every log in between counts as one full threshold */
+		byte_count += (xtWord8) xt_db_log_file_threshold;
+		log_id++;
+	}
+	if (log_offset < curr_log_offset) /* the part of the current log up to the given offset */
+		byte_count += (xtWord8) (curr_log_offset - log_offset);
+
+	return byte_count;
+}
+
+xtPublic void xt_start_checkpointer(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Create and run the checkpointer daemon for db; the thread is named "CP-" plus the last directory of db_main_path. */
+	char name[PATH_MAX];
+
+	sprintf(name, "CP-%s", xt_last_directory_of_path(db->db_main_path)); /* NOTE(review): unbounded sprintf -- assumes the directory name fits in PATH_MAX - 4; confirm. */
+	xt_remove_dir_char(name);
+	db->db_cp_thread = xt_create_daemon(self, name);
+	xt_set_thread_data(db->db_cp_thread, db, xres_cp_free_thread); /* xres_cp_free_thread clears db_cp_thread again */
+	xt_run_thread(self, db->db_cp_thread, xres_cp_run_thread);
+}
+
+xtPublic void xt_wait_for_checkpointer(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Request a final checkpoint and poll until it has caught up with the writer, giving up after 16 seconds. */
+	time_t		then, now;
+	xtBool		message = FALSE;
+	xtLogID		log_id;
+	xtLogOffset	log_offset;
+
+	if (db->db_cp_thread) {
+		then = time(NULL);
+		for (;;) {
+			xt_lock_mutex(self, &db->db_wr_lock);
+			pushr_(xt_unlock_mutex, &db->db_wr_lock);
+			log_id = db->db_wr_log_id;
+			log_offset = db->db_wr_log_offset;
+			freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+			if (xt_sl_get_size(db->db_datalogs.dlc_to_delete) == 0 &&
+				xt_sl_get_size(db->db_datalogs.dlc_deleted) == 0 &&
+				xt_comp_log_pos(log_id, log_offset, db->db_restart.xres_cp_log_id, db->db_restart.xres_cp_log_offset) <= 0)
+				break;
+
+			/* Do a final checkpoint before shutdown: */
+			db->db_restart.xres_cp_required = TRUE;
+
+			xt_lock_mutex(self, &db->db_cp_lock);
+			pushr_(xt_unlock_mutex, &db->db_cp_lock);
+			if (!xt_broadcast_cond_ns(&db->db_cp_cond)) {
+				xt_log_and_clear_exception_ns();
+				freer_(); // xt_unlock_mutex(&db->db_cp_lock) -- must not leave the loop with the lock held
+				break;
+			}
+			freer_(); // xt_unlock_mutex(&db->db_cp_lock)
+			xt_sleep_milli_second(10);
+
+			now = time(NULL);
+			if (now >= then + 16) {
+				xt_logf(XT_NT_INFO, "Aborting wait for '%s' checkpointer\n", db->db_name);
+				message = FALSE; /* suppress the "done" message below */
+				break;
+			}
+			if (now >= then + 2) { /* announce the wait once, after 2 seconds */
+				if (!message) {
+					message = TRUE;
+					xt_logf(XT_NT_INFO, "Waiting for '%s' checkpointer...\n", db->db_name);
+				}
+			}
+		}
+
+		if (message)
+			xt_logf(XT_NT_INFO, "Checkpointer '%s' done.\n", db->db_name);
+	}
+}
+
+xtPublic void xt_stop_checkpointer(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Ask the checkpointer thread to terminate, wake it, and wait for it to exit. */
+	XTThreadPtr thr_wr;
+
+	if (db->db_cp_thread) {
+		xt_lock_mutex(self, &db->db_cp_lock);
+		pushr_(xt_unlock_mutex, &db->db_cp_lock);
+
+		/* This pointer is safe as long as you have the transaction lock. NOTE(review): the lock held here is db_cp_lock -- confirm wording. */
+		if ((thr_wr = db->db_cp_thread)) {
+			xtThreadID tid = thr_wr->t_id; /* taken under the lock; used for the wait after the lock is released */
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_wr);
+
+			xt_wake_checkpointer(self, db);
+
+			freer_(); // xt_unlock_mutex(&db->db_cp_lock)
+
+			/*
+			 * GOTCHA: This is a weird thing but the SIGTERM directed
+			 * at a particular thread (in this case the sweeper) was
+			 * being caught by a different thread and killing the server
+			 * sometimes. Disconcerting.
+			 * (this may only be a problem on Mac OS X)
+			xt_kill_thread(thread);
+			 */
+			xt_wait_for_thread(tid, FALSE);
+
+			/* PMC - This should not be necessary to set the signal here, but in the
+			 * debugger the handler is not called!!?
+			thr_wr->t_delayed_signal = SIGTERM;
+			xt_kill_thread(thread);
+			 */
+			db->db_cp_thread = NULL;
+		}
+		else
+			freer_(); // xt_unlock_mutex(&db->db_cp_lock)
+	}
+}
+
+xtPublic void xt_wake_checkpointer(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Broadcast on the checkpointer condition; a failed broadcast is logged and cleared. */
+	if (xt_broadcast_cond_ns(&db->db_cp_cond)) return;
+	xt_log_and_clear_exception(self);
+}
+
+/*
+ * Release everything held by a writer state: the sequential log reader
+ * (only when ws_db is set), the data buffer, the record buffer and the
+ * open table, which is handed back to the table pool.
+ */
+xtPublic void xt_free_writer_state(struct XTThread *self, XTWriterStatePtr ws)
+{
+	if (ws->ws_db != NULL)
+		ws->ws_db->db_xlog.xlog_seq_exit(&ws->ws_seqread);
+
+	xt_db_set_size(self, &ws->ws_databuf, 0);
+	xt_ib_free(self, &ws->ws_rec_buf);
+
+	if (ws->ws_ot == NULL)
+		return;
+
+	xt_db_return_table_to_pool(self, ws->ws_ot);
+	ws->ws_ot = NULL;
+}
+
+/*
+ * Print the records of the transaction logs of 'db' using PRINTF and
+ * xt_print_log_record().
+ *
+ * The dump starts at the oldest log file that still exists at or below the
+ * last checkpoint log, or at 'start_log' if that is higher, and continues
+ * with consecutive log IDs until a log file is missing.
+ */
+xtPublic void xt_dump_xlogs(XTDatabaseHPtr db, xtLogID start_log)
+{
+	XTXactSeqReadRec	seq;
+	XTXactLogBufferDPtr	record;
+	xtLogID				log_id = db->db_restart.xres_cp_log_id;
+	char				log_path[PATH_MAX];
+	XTThreadPtr			thread = xt_get_self();
+
+	/* Find the first log that still exists: step backwards from the
+	 * checkpoint log until a file is missing, then step forward again.
+	 */
+	for (;;) {
+		log_id--;
+		db->db_xlog.xlog_name(PATH_MAX, log_path, log_id);
+		if (!xt_fs_exists(log_path))
+			break;
+	}
+	log_id++;
+
+	/* Nothing has been set up yet, so a plain return is enough here. */
+	if (!db->db_xlog.xlog_seq_init(&seq, xt_db_log_buffer_size, FALSE))
+		return;
+
+	if (log_id < start_log)
+		log_id = start_log;
+
+	/* One pass per log file. */
+	for (;;) {
+		db->db_xlog.xlog_name(PATH_MAX, log_path, log_id);
+		if (!xt_fs_exists(log_path))
+			break;
+
+		if (!db->db_xlog.xlog_seq_start(&seq, log_id, 0, FALSE))
+			goto done;
+
+		PRINTF("---------- DUMP LOG %d\n", (int) log_id);
+		/* One pass per record; a read error ends this log only, the
+		 * outer loop then continues with the next log ID.
+		 */
+		for (;;) {
+			if (!db->db_xlog.xlog_seq_next(&seq, &record, TRUE, thread)) {
+				PRINTF("---------- DUMP LOG %d ERROR\n", (int) log_id);
+				xt_log_and_clear_exception_ns();
+				break;
+			}
+			/* A NULL record without an error marks the end of the log. */
+			if (!record) {
+				PRINTF("---------- DUMP LOG %d DONE\n", (int) log_id);
+				break;
+			}
+			xt_print_log_record(seq.xseq_rec_log_id, seq.xseq_rec_log_offset, record);
+		}
+
+		log_id++;
+	}
+
+	done:
+	/* Common exit: release the sequential reader set up by xlog_seq_init(). */
+	db->db_xlog.xlog_seq_exit(&seq);
+}
+
+/* ----------------------------------------------------------------------
+ * D A T A B A S E   R E C O V E R Y   T H R E A D
+ */
+
+
+/* The recovery daemon; set by xt_xres_start_database_recovery() and reset
+ * to NULL by the daemon itself just before it returns.
+ */
+static XTThreadPtr		xres_recovery_thread;
+
+/*
+ * Main function of the recovery daemon.
+ *
+ * Creates a MySQL thread context, waits for the PBXT plugin slot to be
+ * assigned and then, unless the thread was asked to quit, opens the
+ * database in mysql_real_data_home (or attaches to pbxt_database when it
+ * is already set), waits for the sweeper and advances pbxt_recovery_state
+ * to XT_RECOVER_DONE and then XT_RECOVER_SWEPT. Exceptions raised inside
+ * the try block are logged and cleared.
+ */
+static void *xn_xres_run_recovery_thread(XTThreadPtr self)
+{
+	THD *mysql_thread;
+
+	/* NOTE: mysql_thread is not used again below because the matching
+	 * myxt_destroy_thread() call is commented out (see {MYSQL-THREAD-KILL}).
+	 */
+	if (!(mysql_thread = (THD *) myxt_create_thread()))
+		xt_throw(self);
+
+	myxt_wait_pbxt_plugin_slot_assigned(self);
+
+	if (!xres_recovery_thread->t_quit) {
+		try_(a) {
+			/* {GLOBAL-DB}
+			 * It can happen that something will just get in before this
+			 * thread and open/recover the database!
+			 */
+			if (!pbxt_database) {
+				xt_open_database(self, mysql_real_data_home, TRUE);
+				/* {GLOBAL-DB}
+				 * This can be done at the same time as the recovery thread,
+				 * strictly speaking I need a lock.
+				 */
+				if (!pbxt_database) {
+					pbxt_database = self->st_database;
+					xt_heap_reference(self, pbxt_database);
+				}
+			}
+			else
+				xt_use_database(self, pbxt_database, XT_FOR_USER);
+
+			pbxt_recovery_state = XT_RECOVER_DONE;
+
+			/* {WAIT-FOR-SW-AFTER-RECOV}
+			 * Moved to here...
+			 */
+			xt_wait_for_sweeper(self, self->st_database, 0);
+
+			pbxt_recovery_state = XT_RECOVER_SWEPT;
+		}
+		catch_(a) {
+			xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+	}
+
+	/*
+	 * {MYSQL-THREAD-KILL}
+	 * Here is the problem with destroying the thread at this
+	 * point. If an error occurred during startup, then it can lead
+	 * to a callback into pbxt: pbxt_panic().
+	 *
+	 * This will shut things down, making it impossible to quit the
+	 * thread and do a cleanup. Solution:
+	 *
+	 * Move the MySQL thread destruction to a later point!
+	 *
+	 * sql/mysqld --no-defaults --basedir=~/maria/trunk
+	 * --character-sets-dir=~/maria/trunk/sql/share/charsets
+	 * --language=~/maria/trunk/sql/share/english
+	 * --skip-networking --datadir=/tmp/x --skip-grant-tables --nonexistentoption
+	 *
+	 * #0	0x003893f9 in xt_exit_databases at database_xt.cc:304
+	 * #1	0x0039dc7e in pbxt_end at ha_pbxt.cc:947
+	 * #2	0x0039dd27 in pbxt_panic at ha_pbxt.cc:1289
+	 * #3	0x001d619e in ha_finalize_handlerton at handler.cc:391
+	 * #4	0x00279d22 in plugin_deinitialize at sql_plugin.cc:816
+	 * #5	0x0027bcf5 in reap_plugins at sql_plugin.cc:904
+	 * #6	0x0027c38c in plugin_thdvar_cleanup at sql_plugin.cc:2513
+	 * #7	0x000c0db2 in THD::~THD at sql_class.cc:934
+	 * #8	0x003b025b in myxt_destroy_thread at myxt_xt.cc:2999
+	 * #9	0x003b66b5 in xn_xres_run_recovery_thread at restart_xt.cc:3196
+	 * #10	0x003cbfbb in xt_thread_main at thread_xt.cc:1020
+	 *
+	myxt_destroy_thread(mysql_thread, TRUE);
+	 */
+
+	xres_recovery_thread = NULL;
+	return NULL;
+}
+
+/*
+ * Create and start the database recovery daemon.
+ *
+ * The daemon is named "DB-RECOVERY-<last directory of mysql_real_data_home>"
+ * and runs xn_xres_run_recovery_thread(). pbxt_recovery_state is reset to
+ * XT_RECOVER_PENDING before the thread is created.
+ */
+xtPublic void xt_xres_start_database_recovery(XTThreadPtr self)
+{
+	char name[PATH_MAX];
+
+	/* Build the name with the size-bounded helpers: an over-long
+	 * directory component is truncated instead of overrunning name[],
+	 * which an unchecked sprintf() would do.
+	 */
+	xt_strcpy(PATH_MAX, name, "DB-RECOVERY-");
+	xt_strcat(PATH_MAX, name, xt_last_directory_of_path(mysql_real_data_home));
+	xt_remove_dir_char(name);
+
+	pbxt_recovery_state = XT_RECOVER_PENDING;
+	xres_recovery_thread = xt_create_daemon(self, name);
+	xt_run_thread(self, xres_recovery_thread, xn_xres_run_recovery_thread);
+}
+
+/*
+ * Ask the recovery daemon to terminate and wait until it has exited.
+ * Nothing is done when no recovery daemon is registered or when the
+ * caller is the recovery daemon itself.
+ */
+xtPublic void xt_xres_terminate_recovery(XTThreadPtr self)
+{
+	XTThreadPtr	rec_thread = xres_recovery_thread;
+	xtThreadID	rec_tid;
+
+	/* {MYSQL-THREAD-KILL}
+	 * The stack trace recorded in xn_xres_run_recovery_thread() shows
+	 * that shutdown can be entered from the recovery thread itself,
+	 * hence the test against 'self'.
+	 */
+	if (!rec_thread || self == xres_recovery_thread)
+		return;
+
+	/* Take the ID first; it is all that is needed to wait afterwards. */
+	rec_tid = rec_thread->t_id;
+
+	xt_terminate_thread(self, rec_thread);
+	xt_wait_for_thread(rec_tid, TRUE);
+}
+
+/* ----------------------------------------------------------------------
+ * L O G   F L U S H    P R O C E S S
+ */
+
+/*
+ * Main function of the log flusher daemon.
+ *
+ * The database is taken from self->t_data. Until the thread is asked to
+ * quit, it calls db->db_xlog.xlog_flush() each time xt_trace_clock()
+ * passes the next deadline; deadlines are XT_XLOG_FLUSH_FREQ * 1000 clock
+ * units apart. A failed flush is thrown, handled in the catch block, and
+ * followed by a pause of up to 60 seconds before the outer loop retries.
+ */
+static void *xres_fl_run_thread(XTThreadPtr self)
+{
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) self->t_data;
+	int				count;
+	void			*mysql_thread;
+	xtWord8			to_flush;
+
+	/* NOTE: mysql_thread is only tested here; the matching
+	 * myxt_destroy_thread() call at the end is commented out.
+	 */
+	if (!(mysql_thread = myxt_create_thread()))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		try_(a) {
+			/*
+			 * Mark the database as in use by this thread.
+			 * NOTE(review): the original comment here referred to the
+			 * garbage collector and was left unfinished; it looks like
+			 * it was copied from another daemon.
+			 */
+			xt_use_database(self, db, XT_FOR_CHECKPOINTER);
+
+			/* This action is both safe and required (see details elsewhere) */
+			xt_heap_release(self, self->st_database);
+
+			xt_set_low_priority(self);
+
+			to_flush = xt_trace_clock() + XT_XLOG_FLUSH_FREQ * 1000;
+			for (;;) {
+				/* Poll in 10 ms steps until the next flush is due or
+				 * the thread is asked to quit.
+				 */
+				while (!self->t_quit && xt_trace_clock() < to_flush)
+					xt_sleep_milli_second(10);
+
+				if (self->t_quit)
+					break;
+
+				if (!db->db_xlog.xlog_flush(self))
+					xt_throw(self);
+
+				/* Advance from the previous deadline, not from "now". */
+				to_flush += XT_XLOG_FLUSH_FREQ * 1000;
+			}
+		}
+		catch_(a) {
+			/* This error is "normal"! XT_ERR_NO_DICTIONARY and a caught
+			 * SIGTERM are not logged.
+			 */
+			if (self->t_exception.e_xt_err != XT_ERR_NO_DICTIONARY &&
+				!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* Avoid releasing the database (done above) */
+		self->st_database = NULL;
+		xt_unuse_database(self, self);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds */
+		count = 60;
+		while (!self->t_quit && count > 0) {
+			sleep(1);
+			count--;
+		}
+	}
+
+	/*
+	 * {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	 */
+	return NULL;
+}
+
+/*
+ * Thread-data free callback registered by xt_start_flusher(): clears
+ * db_fl_thread of the database passed in 'data' while holding db_fl_lock.
+ */
+static void xres_fl_free_thread(XTThreadPtr self, void *data)
+{
+	XTDatabaseHPtr flusher_db = (XTDatabaseHPtr) data;
+
+	/* Nothing registered, nothing to clear. */
+	if (!flusher_db->db_fl_thread)
+		return;
+
+	xt_lock_mutex(self, &flusher_db->db_fl_lock);
+	pushr_(xt_unlock_mutex, &flusher_db->db_fl_lock);
+	flusher_db->db_fl_thread = NULL;
+	freer_(); // xt_unlock_mutex(&db->db_fl_lock)
+}
+
+/*
+ * Create and start the log flusher daemon of 'db'.
+ *
+ * The daemon is named "FL-<last directory of db->db_main_path>", gets 'db'
+ * as its thread data (released through xres_fl_free_thread()) and runs
+ * xres_fl_run_thread(). The thread is stored in db->db_fl_thread.
+ */
+xtPublic void xt_start_flusher(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char name[PATH_MAX];
+
+	/* Build the name with the size-bounded helpers: an over-long
+	 * directory component is truncated instead of overrunning name[],
+	 * which an unchecked sprintf() would do.
+	 */
+	xt_strcpy(PATH_MAX, name, "FL-");
+	xt_strcat(PATH_MAX, name, xt_last_directory_of_path(db->db_main_path));
+	xt_remove_dir_char(name);
+
+	db->db_fl_thread = xt_create_daemon(self, name);
+	xt_set_thread_data(db->db_fl_thread, db, xres_fl_free_thread);
+	xt_run_thread(self, db->db_fl_thread, xres_fl_run_thread);
+}
+
+/*
+ * Ask the log flusher thread of 'db' to terminate and wait for it to exit.
+ * Does nothing when db->db_fl_thread is not set.
+ */
+xtPublic void xt_stop_flusher(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTThreadPtr thr_fl;
+
+	if (db->db_fl_thread) {
+		xt_lock_mutex(self, &db->db_fl_lock);
+		pushr_(xt_unlock_mutex, &db->db_fl_lock);
+
+		/* The pointer is read again while db_fl_lock is held;
+		 * xres_fl_free_thread() clears it under the same lock.
+		 */
+		if ((thr_fl = db->db_fl_thread)) {
+			/* Remember the ID before releasing the lock; only the ID is
+			 * used to wait for the thread below.
+			 */
+			xtThreadID tid = thr_fl->t_id;
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_fl);
+
+			freer_(); // xt_unlock_mutex(&db->db_fl_lock)
+
+			xt_wait_for_thread(tid, FALSE);
+			db->db_fl_thread = NULL;
+		}
+		else
+			freer_(); // xt_unlock_mutex(&db->db_fl_lock)
+	}
+}
+
diff --git a/storage/pbxt/src/restart_xt.h b/storage/pbxt/src/restart_xt.h
new file mode 100644
index 00000000000..614fd74a39d
--- /dev/null
+++ b/storage/pbxt/src/restart_xt.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-11-12	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * Restart and write data to the database.
+ */
+
+#ifndef __restart_xt_h__
+#define __restart_xt_h__
+
+#include "pthread_xt.h"
+#include "filesys_xt.h"
+#include "sortedlist_xt.h"
+#include "util_xt.h"
+#include "xactlog_xt.h"
+
+struct XTThread;
+struct XTOpenTable;
+struct XTDatabase;
+struct XTTable;
+
+extern int				pbxt_recovery_state;
+
+/*
+ * State carried while applying transaction log records (see
+ * xt_xres_apply_in_order()). The log reader, both buffers and the open
+ * table are released by xt_free_writer_state().
+ */
+typedef struct XTWriterState {
+	struct XTDatabase		*ws_db;							/* Database whose log is read; may be NULL. */
+	xtBool					ws_in_recover;
+	xtLogID					ws_ind_rec_log_id;
+	xtLogOffset				ws_ind_rec_log_offset;
+	XTXactSeqReadRec		ws_seqread;						/* Sequential log reader. */
+	XTDataBufferRec			ws_databuf;
+	XTInfoBufferRec			ws_rec_buf;
+	xtTableID				ws_tab_gone;					/* Cache the ID of the last table that does not exist. */
+	struct XTOpenTable		*ws_ot;							/* Open table taken from the pool, or NULL. */
+} XTWriterStateRec, *XTWriterStatePtr;
+
+/* Value stored in xcp_version_2 -- presumably; confirm against the writer. */
+#define XT_CHECKPOINT_VERSION	1
+
+/*
+ * Checkpoint record. The XTDiskValueN field types and the _N name suffixes
+ * give the size of each field in bytes; the field order is part of the
+ * format and must not be changed.
+ */
+typedef struct XTXlogCheckpoint {
+	XTDiskValue2			xcp_checksum_2;					/* The checksum of all the checkpoint data. */
+	XTDiskValue4			xcp_head_size_4;
+	XTDiskValue2			xcp_version_2;					/* The version of the checkpoint record. */
+	XTDiskValue6			xcp_chkpnt_no_6;				/* Incremented for each checkpoint. */
+	XTDiskValue4			xcp_log_id_4;					/* The restart log ID. */
+	XTDiskValue6			xcp_log_offs_6;					/* The restart log offset. */
+	XTDiskValue4			xcp_tab_id_4;					/* The current high table ID. */
+	XTDiskValue4			xcp_xact_id_4;					/* The current high transaction ID. */
+	XTDiskValue4			xcp_ind_rec_log_id_4;			/* The index recovery log ID. */
+	XTDiskValue6			xcp_ind_rec_log_offs_6;		/* The index recovery log offset. */
+	XTDiskValue2			xcp_log_count_2;				/* Number of logs to be deleted in the area below. */
+	XTDiskValue2			xcp_del_log[XT_VAR_LENGTH];		/* The xcp_log_count_2 log entries to be deleted. */
+} XTXlogCheckpointDRec, *XTXlogCheckpointDPtr;
+
+/*
+ * Per-database restart (checkpoint) bookkeeping. This struct has member
+ * functions and access specifiers, so the header is C++ only.
+ */
+typedef struct XTXactRestart {
+	struct XTDatabase		*xres_db;
+	int						xres_next_res_no;				/* The next restart file to be written. */
+	xtLogID					xres_cp_log_id;					/* Log number of the last checkpoint. */
+	xtLogOffset				xres_cp_log_offset;				/* Log offset of the last checkpoint */
+	xtBool					xres_cp_required;				/* Checkpoint required (startup and shutdown). */
+	xtWord8					xres_cp_number;					/* The checkpoint number (used to decide which is the latest checkpoint). */
+
+public:
+	/* Interface used by the rest of the engine. */
+	void					xres_init(struct XTThread *self, struct XTDatabase *db, xtLogID *log_id, xtLogOffset *log_offset, xtLogID	*max_log_id);
+	void					xres_exit(struct XTThread *self);
+	xtBool					xres_is_checkpoint_pending(xtLogID log_id, xtLogOffset log_offset);
+	void					xres_checkpoint_pending(xtLogID log_id, xtLogOffset log_offset);
+	xtBool					xres_checkpoint(struct XTThread *self);
+	void					xres_name(size_t size, char *path, xtLogID log_id);
+
+private:
+	/* Helpers; their definitions are not part of this header. */
+	xtBool					xres_check_checksum(XTXlogCheckpointDPtr buffer, size_t size);
+	void					xres_recover_progress(XTThreadPtr self, XTOpenFilePtr *of, int perc);
+	xtBool					xres_restart(struct XTThread *self, xtLogID *log_id, xtLogOffset *log_offset, xtLogID ind_rec_log_id, off_t ind_rec_log_offset, xtLogID *max_log_id);
+	off_t					xres_bytes_to_read(struct XTThread *self, struct XTDatabase *db, u_int *log_count, xtLogID *max_log_id);
+} XTXactRestartRec, *XTXactRestartPtr;
+
+/* Progress of the checkpoint that is currently being taken, if any. */
+typedef struct XTCheckPointState {
+	xtBool					cp_inited;						/* TRUE if structure was inited */
+	xt_mutex_type			cp_state_lock;					/* Lock for the entire checkpoint state. */
+	xtBool					cp_running;						/* TRUE if a checkpoint is running. */
+	xtLogID					cp_log_id;
+	xtLogOffset				cp_log_offset;
+	xtLogID					cp_ind_rec_log_id;
+	xtLogOffset				cp_ind_rec_log_offset;
+	XTSortedListPtr			cp_table_ids;					/* List of tables to be flushed for the checkpoint. */
+	u_int					cp_flush_count;					/* The number of tables flushed. */
+	u_int					cp_next_to_flush;				/* The next table to be flushed. */
+} XTCheckPointStateRec, *XTCheckPointStatePtr;
+
+/* Flush flags; XT_CPT_ALL_FLUSHED is the OR of the two individual bits. */
+#define XT_CPT_NONE_FLUSHED			0
+#define XT_CPT_REC_ROW_FLUSHED		1
+#define XT_CPT_INDEX_FLUSHED		2
+#define XT_CPT_ALL_FLUSHED			(XT_CPT_REC_ROW_FLUSHED | XT_CPT_INDEX_FLUSHED)
+
+/* Per-table flush status. */
+typedef struct XTCheckPointTable {
+	u_int					cpt_flushed;					/* Presumably a combination of the XT_CPT_* flags -- confirm. */
+	xtTableID				cpt_tab_id;
+} XTCheckPointTableRec, *XTCheckPointTablePtr;
+
+/* Per-database setup and teardown. */
+void xt_xres_init(struct XTThread *self, struct XTDatabase *db);
+void xt_xres_exit(struct XTThread *self, struct XTDatabase *db);
+
+/* Per-table setup and teardown. */
+void xt_xres_init_tab(struct XTThread *self, struct XTTable *tab);
+void xt_xres_exit_tab(struct XTThread *self, struct XTTable *tab);
+
+void xt_xres_apply_in_order(struct XTThread *self, XTWriterStatePtr ws, xtLogID log_id, xtLogOffset log_offset, XTXactLogBufferDPtr record);
+
+/* Checkpointing and the checkpointer thread. */
+xtBool	xt_begin_checkpoint(struct XTDatabase *db, xtBool have_table_lock, struct XTThread *thread);
+xtBool	xt_end_checkpoint(struct XTDatabase *db, struct XTThread *thread, xtBool *checkpoint_done);
+void	xt_start_checkpointer(struct XTThread *self, struct XTDatabase *db);
+void	xt_wait_for_checkpointer(struct XTThread *self, struct XTDatabase *db);
+void	xt_stop_checkpointer(struct XTThread *self, struct XTDatabase *db);
+void	xt_wake_checkpointer(struct XTThread *self, struct XTDatabase *db);
+void	xt_free_writer_state(struct XTThread *self, XTWriterStatePtr ws);
+xtWord8	xt_bytes_since_last_checkpoint(struct XTDatabase *db, xtLogID curr_log_id, xtLogOffset curr_log_offset);
+
+/* Debugging aids: print log records. */
+void xt_print_log_record(xtLogID log, off_t offset, XTXactLogBufferDPtr record);
+void xt_dump_xlogs(struct XTDatabase *db, xtLogID start_log);
+
+/* Recovery daemon. */
+void xt_xres_start_database_recovery(XTThreadPtr self);
+void xt_xres_terminate_recovery(XTThreadPtr self);
+
+/* Log flusher daemon. */
+void xt_start_flusher(struct XTThread *self, struct XTDatabase *db);
+void xt_stop_flusher(struct XTThread *self, struct XTDatabase *db);
+
+/* Values of pbxt_recovery_state, in the order in which they are reached. */
+#define XT_RECOVER_PENDING			0
+#define XT_RECOVER_DONE				1
+#define XT_RECOVER_SWEPT			2
+
+/*
+ * Block until pbxt_recovery_state has reached at least 'state', checking
+ * every 100 milliseconds. There is no timeout.
+ */
+inline void xt_xres_wait_for_recovery(XTThreadPtr XT_UNUSED(self), int state)
+{
+	for (;;) {
+		if (pbxt_recovery_state >= state)
+			break;
+		xt_sleep_milli_second(100);
+	}
+}
+
+#endif
diff --git a/storage/pbxt/src/sortedlist_xt.cc b/storage/pbxt/src/sortedlist_xt.cc
new file mode 100644
index 00000000000..f1742b64330
--- /dev/null
+++ b/storage/pbxt/src/sortedlist_xt.cc
@@ -0,0 +1,352 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-04	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "sortedlist_xt.h"
+
+/*
+ * Allocate an empty sorted list without a lock, a condition or an initial
+ * item array. Returns NULL when the allocation fails.
+ */
+XTSortedListPtr xt_new_sortedlist_ns(u_int item_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func)
+{
+	XTSortedListPtr list = (XTSortedListPtr) xt_calloc_ns(sizeof(XTSortedListRec));
+
+	if (list) {
+		list->sl_current_size = 0;
+		list->sl_item_size = item_size;
+		list->sl_grow_size = grow_size;
+		list->sl_comp_func = comp_func;
+		list->sl_thunk = thunk;
+		list->sl_free_func = free_func;
+	}
+	return list;
+}
+
+/*
+ * Allocate a sorted list and set it up with xt_init_sortedlist(); all
+ * arguments after 'self' are passed through unchanged.
+ */
+XTSortedListPtr xt_new_sortedlist(XTThreadPtr self, u_int item_size, u_int initial_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func, xtBool with_lock, xtBool with_cond)
+{
+	XTSortedListPtr list = (XTSortedListPtr) xt_calloc(self, sizeof(XTSortedListRec));
+
+	xt_init_sortedlist(self, list, item_size, initial_size, grow_size, comp_func, thunk, free_func, with_lock, with_cond);
+	return list;
+}
+
+/*
+ * Fill in the fields of 'sl' and allocate what was asked for:
+ *  - an item array of initial_size * item_size bytes if initial_size != 0,
+ *  - a mutex if with_lock or with_cond is set,
+ *  - a condition variable if with_cond is set.
+ *
+ * NOTE: each catch block below frees 'sl' itself before re-throwing
+ * (directly with xt_free(), or through xt_free_sortedlist()), so after a
+ * failure inside one of the try blocks the caller must not use 'sl' again.
+ */
+xtPublic void xt_init_sortedlist(XTThreadPtr self, XTSortedListPtr sl, u_int item_size, u_int initial_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func, xtBool with_lock, xtBool with_cond)
+{
+	sl->sl_item_size = item_size;
+	sl->sl_grow_size = grow_size;
+	sl->sl_comp_func = comp_func;
+	sl->sl_thunk = thunk;
+	sl->sl_free_func = free_func;
+	sl->sl_current_size = initial_size;
+
+	if (initial_size) {
+		try_(a) {
+			sl->sl_data = (char *) xt_malloc(self, initial_size * item_size);
+		}
+		catch_(a) {
+			xt_free(self, sl);
+			throw_();
+		}
+		cont_(a);
+	}
+
+	/* A condition variable needs a mutex, so with_cond implies the lock. */
+	if (with_lock || with_cond) {
+		sl->sl_lock = (xt_mutex_type *) xt_calloc(self, sizeof(xt_mutex_type));
+		try_(b) {
+			xt_init_mutex_with_autoname(self, sl->sl_lock);
+		}
+		catch_(b) {
+			/* The mutex was never initialised: free the memory and clear
+			 * the pointer so that xt_free_sortedlist() skips it.
+			 */
+			xt_free(self, sl->sl_lock);
+			sl->sl_lock = NULL;
+			xt_free_sortedlist(self, sl);
+			throw_();
+		}
+		cont_(b);
+	}
+
+	if (with_cond) {
+		sl->sl_cond = (xt_cond_type *) xt_calloc(self, sizeof(xt_cond_type));
+		try_(c) {
+			xt_init_cond(self, sl->sl_cond);
+		}
+		catch_(c) {
+			/* Same as for the mutex: do not let the cleanup touch a
+			 * condition that was never initialised.
+			 */
+			xt_free(self, sl->sl_cond);
+			sl->sl_cond = NULL;
+			xt_free_sortedlist(self, sl);
+			throw_();
+		}
+		cont_(c);
+	}
+}
+
+/*
+ * Remove all items from the list, last item first. The usage count is
+ * lowered before the free callback (if any) is called for an item. The
+ * item array itself is kept. The list mutex is held for the duration when
+ * the list has one.
+ */
+xtPublic void xt_empty_sortedlist(XTThreadPtr self, XTSortedListPtr sl)
+{
+	if (sl->sl_lock)
+		xt_lock_mutex(self, sl->sl_lock);
+
+	if (sl->sl_data) {
+		while (sl->sl_usage_count > 0) {
+			char *item;
+
+			sl->sl_usage_count--;
+			if (!sl->sl_free_func)
+				continue;
+			item = sl->sl_data + sl->sl_usage_count * sl->sl_item_size;
+			(*sl->sl_free_func)(self, sl->sl_thunk, item);
+		}
+	}
+
+	if (sl->sl_lock)
+		xt_unlock_mutex(self, sl->sl_lock);
+}
+
+/*
+ * Destroy a sorted list: empty it, then free the item array, the mutex,
+ * the condition variable and finally the list structure itself.
+ */
+xtPublic void xt_free_sortedlist(XTThreadPtr self, XTSortedListPtr sl)
+{
+	/* Runs the free callback on whatever is still stored. */
+	xt_empty_sortedlist(self, sl);
+
+	if (sl->sl_data != NULL) {
+		xt_free(self, sl->sl_data);
+		sl->sl_data = NULL;
+	}
+
+	if (sl->sl_lock != NULL) {
+		xt_free_mutex(sl->sl_lock);
+		xt_free(self, sl->sl_lock);
+	}
+
+	if (sl->sl_cond != NULL) {
+		xt_free_cond(sl->sl_cond);
+		xt_free(self, sl->sl_cond);
+	}
+
+	xt_free(self, sl);
+}
+
+/*
+ * Look up 'key' and return a pointer to the matching item inside the
+ * list's item array, or NULL if there is none. Lists with zero or one
+ * item are handled without calling xt_bsearch().
+ */
+xtPublic void *xt_sl_find(XTThreadPtr self, XTSortedListPtr sl, void *key)
+{
+	size_t pos;
+
+	switch (sl->sl_usage_count) {
+		case 0:
+			return NULL;
+		case 1:
+			if ((*sl->sl_comp_func)(self, sl->sl_thunk, key, sl->sl_data) != 0)
+				return NULL;
+			return sl->sl_data;
+		default:
+			break;
+	}
+
+	/* 'pos' is required by xt_bsearch() but not needed here. */
+	return xt_bsearch(self, key, sl->sl_data, sl->sl_usage_count, sl->sl_item_size, &pos, sl->sl_thunk, sl->sl_comp_func);
+}
+
+/*
+ * Insert a copy of the item at 'data' (sl_item_size bytes) at the position
+ * that 'key' has in the sort order.
+ *
+ * Whenever the item is not inserted, 'data' is passed to the list's free
+ * callback (if one is set) before returning.
+ *
+ * Returns:
+ * 1 = Value inserted.
+ * 2 = Value not inserted, already in the list.
+ * 0 = An error occurred (the item array could not be enlarged). When
+ *     'self' is not NULL, xt_throw(self) is called before this return
+ *     statement is reached.
+ */
+xtPublic int xt_sl_insert(XTThreadPtr self, XTSortedListPtr sl, void *key, void *data)
+{
+	size_t idx;
+
+	/* Find the insert position; zero and one item are special-cased. */
+	if (sl->sl_usage_count == 0)
+		idx = 0;
+	else if (sl->sl_usage_count == 1) {
+		int r;
+
+		if ((r = (*sl->sl_comp_func)(self, sl->sl_thunk, key, sl->sl_data)) == 0) {
+			if (sl->sl_free_func)
+				(*sl->sl_free_func)(self, sl->sl_thunk, data);
+			return 2;
+		}
+		if (r < 0)
+			idx = 0;
+		else
+			idx = 1;
+	}
+	else {
+		/* A non-NULL result means the key is already present; otherwise
+		 * idx is used below as the insert position.
+		 */
+		if (xt_bsearch(self, key, sl->sl_data, sl->sl_usage_count, sl->sl_item_size, &idx, sl->sl_thunk, sl->sl_comp_func)) {
+			if (sl->sl_free_func)
+				(*sl->sl_free_func)(self, sl->sl_thunk, data);
+			return 2;
+		}
+	}
+	/* Array full: enlarge it by sl_grow_size items. */
+	if (sl->sl_usage_count == sl->sl_current_size) {		
+		if (!xt_realloc_ns((void **) &sl->sl_data, (sl->sl_current_size + sl->sl_grow_size) * sl->sl_item_size)) {
+			if (sl->sl_free_func)
+				(*sl->sl_free_func)(self, sl->sl_thunk, data);
+			if (self)
+				xt_throw(self);
+			return 0;
+		}
+		sl->sl_current_size = sl->sl_current_size + sl->sl_grow_size;
+	}
+	/* Shift the items from idx onwards up by one slot, then copy the new
+	 * item into the gap.
+	 */
+	XT_MEMMOVE(sl->sl_data, &sl->sl_data[(idx+1) * sl->sl_item_size], &sl->sl_data[idx * sl->sl_item_size], (sl->sl_usage_count-idx) * sl->sl_item_size);
+	XT_MEMCPY(sl->sl_data, &sl->sl_data[idx * sl->sl_item_size], data, sl->sl_item_size);
+	sl->sl_usage_count++;
+	return 1;
+}
+
+/*
+ * Remove the item matching 'key'. The free callback (if any) is called
+ * for the item, then the items behind it are moved down by one slot.
+ * Returns TRUE if an item was removed, FALSE if the key was not found.
+ */
+xtPublic xtBool xt_sl_delete(XTThreadPtr self, XTSortedListPtr sl, void *key)
+{
+	void	*found;
+	size_t	idx;
+
+	switch (sl->sl_usage_count) {
+		case 0:
+			return FALSE;
+		case 1:
+			/* Single item: one direct comparison is enough. */
+			if ((*sl->sl_comp_func)(self, sl->sl_thunk, key, sl->sl_data) != 0)
+				return FALSE;
+			idx = 0;
+			found = sl->sl_data;
+			break;
+		default:
+			found = xt_bsearch(self, key, sl->sl_data, sl->sl_usage_count, sl->sl_item_size, &idx, sl->sl_thunk, sl->sl_comp_func);
+			if (!found)
+				return FALSE;
+			break;
+	}
+
+	if (sl->sl_free_func)
+		(*sl->sl_free_func)(self, sl->sl_thunk, found);
+
+	sl->sl_usage_count--;
+	XT_MEMMOVE(sl->sl_data, &sl->sl_data[idx * sl->sl_item_size], &sl->sl_data[(idx+1) * sl->sl_item_size], (sl->sl_usage_count-idx) * sl->sl_item_size);
+	return TRUE;
+}
+
+/*
+ * Remove the item at position 'idx'. An index outside the used part of
+ * the list is ignored. The free callback (if any) is called for the item
+ * before the items behind it are moved down by one slot.
+ */
+xtPublic void xt_sl_delete_item_at(struct XTThread *self, XTSortedListPtr sl, size_t idx)
+{
+	void *item;
+
+	if (idx >= sl->sl_usage_count)
+		return;
+
+	item = sl->sl_data + idx * sl->sl_item_size;
+	if (sl->sl_free_func)
+		(*sl->sl_free_func)(self, sl->sl_thunk, item);
+
+	sl->sl_usage_count--;
+	XT_MEMMOVE(sl->sl_data, &sl->sl_data[idx * sl->sl_item_size], &sl->sl_data[(idx+1) * sl->sl_item_size], (sl->sl_usage_count-idx) * sl->sl_item_size);
+}
+
+/*
+ * Drop the first 'items' items. If that is all of them (or more), the
+ * list size is set to zero through xt_sl_set_size(); otherwise the
+ * remaining items are moved to the front. The free callback is not
+ * called for the dropped items.
+ */
+xtPublic void xt_sl_remove_from_front(struct XTThread *XT_UNUSED(self), XTSortedListPtr sl, size_t items)
+{
+	if (sl->sl_usage_count <= items) {
+		xt_sl_set_size(sl, 0);
+		return;
+	}
+
+	XT_MEMMOVE(sl->sl_data, sl->sl_data, &sl->sl_data[items * sl->sl_item_size], (sl->sl_usage_count-items) * sl->sl_item_size);
+	sl->sl_usage_count -= items;
+}
+
+/*
+ * Delete the key li_key from the list li_sl, both taken from 'li_undo'.
+ * The result of xt_sl_delete() is deliberately ignored.
+ */
+xtPublic void xt_sl_delete_from_info(XTThreadPtr self, XTSortedListInfoPtr li_undo)
+{
+	(void) xt_sl_delete(self, li_undo->li_sl, li_undo->li_key);
+}
+
+/* Return the number of items currently stored in the list. */
+xtPublic size_t xt_sl_get_size(XTSortedListPtr sl)
+{
+	size_t in_use = sl->sl_usage_count;
+
+	return in_use;
+}
+
+/*
+ * Set the number of used items to 'new_size'. If at least sl_grow_size
+ * slots are then unused, the item array is shrunk to the used size, but
+ * never below sl_grow_size items. A failed reallocation leaves the
+ * recorded capacity unchanged.
+ */
+xtPublic void xt_sl_set_size(XTSortedListPtr sl, size_t new_size)
+{
+	size_t wanted;
+
+	sl->sl_usage_count = new_size;
+
+	/* Not enough slack to be worth shrinking. */
+	if (sl->sl_usage_count + sl->sl_grow_size > sl->sl_current_size)
+		return;
+
+	wanted = sl->sl_usage_count;
+	if (wanted < sl->sl_grow_size)
+		wanted = sl->sl_grow_size;
+
+	if (!xt_realloc(NULL, (void **) &sl->sl_data, wanted * sl->sl_item_size))
+		return;
+	sl->sl_current_size = wanted;
+}
+
+/*
+ * Return a pointer to the item at position 'idx', or NULL when 'idx' is
+ * not below the number of items in use.
+ */
+xtPublic void *xt_sl_item_at(XTSortedListPtr sl, size_t idx)
+{
+	if (idx >= sl->sl_usage_count)
+		return NULL;
+
+	return sl->sl_data + idx * sl->sl_item_size;
+}
+
+/* Return a pointer to the last item, or NULL when the list is empty. */
+xtPublic void *xt_sl_last_item(XTSortedListPtr sl)
+{
+	if (sl->sl_usage_count == 0)
+		return NULL;
+
+	return xt_sl_item_at(sl, sl->sl_usage_count - 1);
+}
+
+/* Return a pointer to the first item, or NULL when the list is empty. */
+xtPublic void *xt_sl_first_item(XTSortedListPtr sl)
+{
+	if (sl->sl_usage_count == 0)
+		return NULL;
+
+	return xt_sl_item_at(sl, 0);
+}
+
+/*
+ * Lock the list on behalf of 'self'. The mutex is only taken when 'self'
+ * is not already recorded as the locker; every successful call increments
+ * the lock count, so each one must be paired with xt_sl_unlock().
+ * Returns the result of xt_lock_mutex(), or OK when no mutex call was
+ * needed.
+ */
+xtPublic xtBool xt_sl_lock(XTThreadPtr self, XTSortedListPtr sl)
+{
+	xtBool locked = OK;
+
+	if (sl->sl_locker != self)
+		locked = xt_lock_mutex(self, sl->sl_lock);
+
+	if (!locked)
+		return locked;
+
+	sl->sl_locker = self;
+	sl->sl_lock_count++;
+	return locked;
+}
+
+/*
+ * Undo one xt_sl_lock() call. The mutex is released and the locker
+ * cleared only when the lock count drops to zero.
+ */
+xtPublic void xt_sl_unlock(XTThreadPtr self, XTSortedListPtr sl)
+{
+	ASSERT(!self || sl->sl_locker == self);
+	ASSERT(sl->sl_lock_count > 0);
+
+	sl->sl_lock_count--;
+	if (sl->sl_lock_count)
+		return;
+
+	/* Outermost unlock. */
+	sl->sl_locker = NULL;
+	xt_unlock_mutex(self, sl->sl_lock);
+}
+
+/*
+ * Variant of xt_sl_lock() that uses xt_lock_mutex_ns() and returns
+ * nothing: 'thread' is recorded as the locker and the lock count is
+ * incremented regardless of what the mutex call reports.
+ */
+xtPublic void xt_sl_lock_ns(XTSortedListPtr sl, XTThreadPtr thread)
+{
+	if (sl->sl_locker != thread)
+		xt_lock_mutex_ns(sl->sl_lock);
+
+	sl->sl_lock_count++;
+	sl->sl_locker = thread;
+}
+
+/*
+ * Undo one xt_sl_lock_ns() call. The mutex is released and the locker
+ * cleared only when the lock count drops to zero.
+ */
+xtPublic void xt_sl_unlock_ns(XTSortedListPtr sl)
+{
+	ASSERT_NS(!sl->sl_locker || sl->sl_locker == xt_get_self());
+	ASSERT_NS(sl->sl_lock_count > 0);
+
+	sl->sl_lock_count--;
+	if (sl->sl_lock_count)
+		return;
+
+	/* Outermost unlock. */
+	sl->sl_locker = NULL;
+	xt_unlock_mutex_ns(sl->sl_lock);
+}
+
+/*
+ * Wait on the list's condition variable together with the list's mutex.
+ * Both pointers are passed on as they are, so the list must have been
+ * created with a condition (with_cond).
+ */
+xtPublic void xt_sl_wait(XTThreadPtr self, XTSortedListPtr sl)
+{
+	xt_wait_cond(self, sl->sl_cond, sl->sl_lock);
+}
+
+/*
+ * Signal the list's condition variable and return the result of
+ * xt_signal_cond().
+ */
+xtPublic xtBool xt_sl_signal(XTThreadPtr self, XTSortedListPtr sl)
+{
+	return xt_signal_cond(self, sl->sl_cond);
+}
+
+/*
+ * Broadcast on the list's condition variable. The result of
+ * xt_broadcast_cond(), if any, is not passed back to the caller.
+ */
+xtPublic void xt_sl_broadcast(XTThreadPtr self, XTSortedListPtr sl)
+{
+	xt_broadcast_cond(self, sl->sl_cond);
+}
+
diff --git a/storage/pbxt/src/sortedlist_xt.h b/storage/pbxt/src/sortedlist_xt.h
new file mode 100644
index 00000000000..cf3066981fe
--- /dev/null
+++ b/storage/pbxt/src/sortedlist_xt.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-04	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_sortedlist_h__
+#define __xt_sortedlist_h__
+
+#include "pthread_xt.h"
+#include "bsearch_xt.h"
+
+struct XTThread;
+
+/*
+ * A sorted array of fixed-size items with an optional mutex and
+ * condition variable.
+ */
+typedef struct XTSortedList {
+	u_int			sl_item_size;		/* Size of one item in bytes. */
+	u_int			sl_grow_size;		/* Number of items added when the array is enlarged. */
+	XTCompareFunc	sl_comp_func;		/* Compares a key with an item. */
+	void			*sl_thunk;			/* Passed to the compare and free callbacks. */
+	XTFreeFunc		sl_free_func;		/* Optional; called for items that are removed. */
+	xt_mutex_type	*sl_lock;			/* Optional mutex (NULL if none). */
+	struct XTThread *sl_locker;			/* Thread recorded by xt_sl_lock()/xt_sl_lock_ns(). */
+	u_int			sl_lock_count;		/* Number of outstanding lock calls. */
+	xt_cond_type	*sl_cond;			/* Optional condition variable (NULL if none). */
+
+	u_int			sl_current_size;	/* Allocated size of sl_data, in items. */
+	u_int			sl_usage_count;		/* Number of items in use. */
+	char			*sl_data;			/* The item array. */
+} XTSortedListRec, *XTSortedListPtr;
+
+/* A (list, key) pair; the argument of xt_sl_delete_from_info(). */
+typedef struct XTSortedListInfo {
+	XTSortedListPtr	li_sl;
+	void			*li_key;
+} XTSortedListInfoRec, *XTSortedListInfoPtr;
+
+/* Creation and destruction. */
+XTSortedListPtr		xt_new_sortedlist(struct XTThread *self, u_int item_size, u_int initial_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func, xtBool with_lock, xtBool with_cond);
+void				xt_init_sortedlist(struct XTThread *self, XTSortedListPtr sl, u_int item_size, u_int initial_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func, xtBool with_lock, xtBool with_cond);
+void				xt_free_sortedlist(struct XTThread *self, XTSortedListPtr ld);
+void				xt_empty_sortedlist(struct XTThread *self, XTSortedListPtr sl);
+XTSortedListPtr		xt_new_sortedlist_ns(u_int item_size, u_int grow_size, XTCompareFunc comp_func, void *thunk, XTFreeFunc free_func);
+
+/* Item access and modification.
+ * NOTE(review): xt_sl_insert() is declared as returning xtBool here, but
+ * its definition in sortedlist_xt.cc is written with an int return type
+ * and returns 0, 1 or 2.
+ */
+xtBool				xt_sl_insert(struct XTThread *self, XTSortedListPtr sl, void *key, void *data);
+void				*xt_sl_find(struct XTThread *self, XTSortedListPtr sl, void *key);
+xtBool				xt_sl_delete(struct XTThread *self, XTSortedListPtr sl, void *key);
+void				xt_sl_delete_item_at(struct XTThread *self, XTSortedListPtr sl, size_t i);
+void				xt_sl_remove_from_front(struct XTThread *self, XTSortedListPtr sl, size_t items);
+void				xt_sl_delete_from_info(struct XTThread *self, XTSortedListInfoPtr li);
+size_t				xt_sl_get_size(XTSortedListPtr sl);
+void				xt_sl_set_size(XTSortedListPtr sl, size_t new_size);
+void				*xt_sl_item_at(XTSortedListPtr sl, size_t i);
+void				*xt_sl_last_item(XTSortedListPtr sl);
+void				*xt_sl_first_item(XTSortedListPtr sl);
+
+/* Locking; calls are counted per locker. */
+xtBool				xt_sl_lock(struct XTThread *self, XTSortedListPtr sl);
+void				xt_sl_unlock(struct XTThread *self, XTSortedListPtr sl);
+void				xt_sl_lock_ns(XTSortedListPtr sl, struct XTThread *thread);
+void				xt_sl_unlock_ns(XTSortedListPtr sl);
+
+/* Condition variable operations. */
+void				xt_sl_wait(struct XTThread *self, XTSortedListPtr sl);
+xtBool				xt_sl_signal(struct XTThread *self, XTSortedListPtr sl);
+void				xt_sl_broadcast(struct XTThread *self, XTSortedListPtr sl);
+
+#endif
diff --git a/storage/pbxt/src/strutil_xt.cc b/storage/pbxt/src/strutil_xt.cc
new file mode 100644
index 00000000000..8183034a204
--- /dev/null
+++ b/storage/pbxt/src/strutil_xt.cc
@@ -0,0 +1,580 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "strutil_xt.h"
+
+/* Copy 'from' into 'to' (capacity 'size' bytes), truncating if needed; 'to' is terminated unless size is 0. */
+xtPublic void xt_strcpy(size_t size, char *to, c_char *from)
+{
+	size_t room = size;
+
+	if (room == 0) return;
+	for (room--; *from && room > 0; room--) *to++ = *from++;
+	*to = 0;
+}
+
+/* Copy at most 'len_from' bytes of 'from' into 'to' (capacity 'size'); 'to' is terminated unless size is 0. */
+xtPublic void xt_strncpy(size_t size, char *to, c_char *from, size_t len_from)
+{
+	size_t i, room = size;
+
+	if (room == 0) return;
+	for (i = 0, room--; i < len_from && i < room; i++) to[i] = from[i];
+	to[i] = 0;
+}
+
+/* Copy 'from' up to (not including) 'term' into 'to' (capacity 'size'); 'to' is terminated unless size is 0. */
+xtPublic void xt_strcpy_term(size_t size, char *to, c_char *from, char term)
+{
+	size_t room = size;
+
+	if (room == 0) return;
+	for (room--; *from && *from != term && room > 0; room--) *to++ = *from++;
+	*to = 0;
+}
+
+/* Append 'from', up to (not including) 'term', to the string in 'to'; 'size' is the total capacity of 'to'. */
+xtPublic void xt_strcat_term(size_t size, char *to, c_char *from, char term)
+{
+	/* Test 'size' before decrementing it: the old "*to && size--" wrapped 'size' around to a
+	 * huge value when 'to' held no terminator within 'size' bytes, and the copy then overran. */
+	while (size > 0 && *to) { to++; size--; }
+	if (size == 0) return;
+	for (size--; *from && *from != term && size > 0; size--) *to++ = *from++;
+	*to = 0;
+}
+
+xtPublic void xt_strcat(size_t size, char *to, c_char *from)
+{	/* Append 'from' to the string in 'to' (capacity 'size' bytes), truncating as xt_strcpy() does. */
+	while (size > 0 && *to) { to++; size--; }	/* old "*to && size--" let size wrap below zero */
+	xt_strcpy(size, to, from);	/* 'size' is now the room left; 0 means nothing is written */
+}
+
+xtPublic void xt_strcati(size_t size, char *to, int i)
+{
+	char digits[50];	/* scratch buffer for the decimal text of 'i' */
+
+	sprintf(digits, "%d", i);
+	xt_strcat(size, to, digits);	/* append that text to 'to' (capacity 'size') */
+}
+
+xtPublic xtBool xt_ends_with(c_char *str, c_char *sub)
+{	/* TRUE when the last strlen(sub) characters of 'str' equal 'sub' (so any string ends with ""). */
+	size_t str_len = strlen(str);
+	size_t sub_len = strlen(sub);
+	if (sub_len > str_len)
+		return FALSE;
+	return strcmp(str + (str_len - sub_len), sub) == 0;
+}
+
+xtPublic xtBool xt_starts_with(c_char *str, c_char *sub)
+{	/* TRUE when 'str' begins with 'sub'; strncmp() stops at the first mismatch instead of searching all of 'str'. */
+	return strncmp(str, sub, strlen(sub)) == 0;
+}
+
+/* Copy the component just before the last dir char of 'path' into 'dest' (capacity 'size'); "" if there is none. */
+xtPublic void xt_2nd_last_name_of_path(size_t size, char *dest, c_char *path)
+{
+	size_t	len;
+	c_char	*ptr, *pend;
+
+	if (!size)		/* no room for even the terminator; "size-1" below would wrap */
+		return;
+	*dest = 0;
+	if (!(len = strlen(path)))
+		return;
+
+	/* {INVALID-OLD-TABLE-FIX}
+	 * I have changed the implementation of
+	 * this bug fix (see {INVALID-OLD-TABLE-FIX}).
+       if (!is_prefix(path, mysql_data_home) &&
+            !is_prefix(path, mysql_real_data_home))
+        {
+          *dest= 0;
+          return;
+        }
+	 */
+
+	ptr = path + len - 1;
+	while (ptr != path && !XT_IS_DIR_CHAR(*ptr))
+		ptr--;
+	/* No dir char at all, or only a leading one (e.g. "/name"): there is no 2nd last
+	 * name, and the old code stepped back from 'path' itself, leaving the string. */
+	if (ptr == path)
+		return;
+	pend = ptr;
+	ptr--;
+	while (ptr != path && !XT_IS_DIR_CHAR(*ptr))
+		ptr--;
+	if (XT_IS_DIR_CHAR(*ptr))
+		ptr++;
+	len = (size_t) (pend - ptr);
+	if (len > size-1)
+		len = size-1;
+	memcpy(dest, ptr, len);
+	dest[len] = 0;
+}
+
+/* Return a pointer into 'path' just past its last dir char ("" if the path ends with one, 'path' if it has none). */
+xtPublic char *xt_last_name_of_path(c_char *path)
+{
+	size_t	path_len = strlen(path);
+	c_char	*scan;
+
+	if (path_len == 0)
+		return (char *) path;
+	/* Walk back from the final character to the nearest dir char or the start: */
+	for (scan = path + path_len - 1; scan != path; scan--)
+		if (XT_IS_DIR_CHAR(*scan)) break;
+	if (XT_IS_DIR_CHAR(*scan)) scan++;
+	return (char *) scan;
+}
+
+/* Return a pointer into 'path' where its last two components begin (fewer if the path has fewer). */
+xtPublic char *xt_last_2_names_of_path(c_char *path)
+{
+	size_t	length = strlen(path);
+	c_char	*ptr;
+
+	if (!length)
+		return (char *) path;
+	ptr = path + length - 1;
+	while (ptr != path && !XT_IS_DIR_CHAR(*ptr)) ptr--;
+	if (XT_IS_DIR_CHAR(*ptr)) {
+		/* For "/name" the old unconditional ptr-- stepped to before the start of 'path'. */
+		if (ptr != path) ptr--;
+		while (ptr != path && !XT_IS_DIR_CHAR(*ptr)) ptr--;
+		if (XT_IS_DIR_CHAR(*ptr)) ptr++;
+	}
+	return (char *) ptr;
+}
+
+xtPublic c_char *xt_last_directory_of_path(c_char *path)
+/* Return a pointer to the last name component of 'path', even when dir chars follow it. */
+{
+	size_t	path_len = strlen(path);
+	c_char	*scan;
+
+	if (path_len == 0)
+		return(path);
+	scan = path + path_len - 1;
+	/* First skip the run of dir chars at the end, then the name in front of it: */
+	for (; scan != path; scan--)
+		if (!XT_IS_DIR_CHAR(*scan)) break;
+	for (; scan != path; scan--)
+		if (XT_IS_DIR_CHAR(*scan)) break;
+	if (XT_IS_DIR_CHAR(*scan))
+		scan++;
+	return(scan);
+}
+
+/* Return a pointer just past the last '.' in the final path component of 'file_name', or NULL if it has none. */
+xtPublic char *xt_find_extension(c_char *file_name)
+{
+	c_char	*ptr;	/* one past the character examined, so it never points before file_name */
+
+	for (ptr = file_name + strlen(file_name); ptr > file_name; ptr--) {
+		if (XT_IS_DIR_CHAR(ptr[-1])) break;
+		if (ptr[-1] == '.')
+			return (char *) ptr;
+	}
+	return NULL;
+}
+
+xtPublic void xt_remove_extension(char *file_name)
+{
+	char *ext = xt_find_extension(file_name);
+
+	if (ext != NULL)
+		ext[-1] = 0;	/* 'ext' is just past the dot: cut the name at the dot itself */
+}
+
+xtPublic xtBool xt_is_extension(c_char *file_name, c_char *ext)
+{	/* TRUE when 'file_name' has an extension and it equals 'ext' exactly ('ext' is given without the dot). */
+	char *found = xt_find_extension(file_name);
+
+	if (found == NULL)
+		return FALSE;
+	return strcmp(found, ext) == 0;
+}
+
+/*
+ * Strip every trailing directory delimiter from 'dir_name', but never shorten the
+ * name below one character. Returns TRUE if at least one delimiter was removed.
+ */
+xtPublic xtBool xt_remove_dir_char(char *dir_name)
+{
+	size_t	end = strlen(dir_name);
+	xtBool	stripped = FALSE;
+
+	for (; end > 1; end--) {
+		if (!XT_IS_DIR_CHAR(dir_name[end - 1]))
+			break;
+		dir_name[end - 1] = '\0';
+		stripped = TRUE;
+	}
+	return stripped;
+}
+
+xtPublic void xt_remove_last_name_of_path(char *path)
+{
+	char *last_name = xt_last_name_of_path(path);
+
+	if (last_name)
+		*last_name = 0;	/* cut 'path' where its last name starts; the dir char before it stays */
+}
+
+/* Append XT_DIR_CHAR to 'path' (capacity 'max' bytes) if it does not end with one; TRUE only if appended. */
+xtBool xt_add_dir_char(size_t max, char *path)
+{
+	size_t slen = strlen(path);
+
+	/* The delimiter is stored at path[slen] and the terminator at path[slen + 1], so both
+	 * must be below 'max'; the old "slen >= max" test let slen == max - 1 write past the end. */
+	if (slen + 1 >= max)
+		return FALSE;
+
+	/* If no path is given we will be at the current working directory, under UNIX we must
+	 * NOT add a directory delimiter character:
+	 */
+	if (slen == 0)
+		return FALSE;
+	if (XT_IS_DIR_CHAR(path[slen - 1]))	/* already delimited */
+		return FALSE;
+	path[slen] = XT_DIR_CHAR;
+	path[slen + 1] = '\0';
+	return TRUE;
+}
+
+/* Parse decimal text. *overflow (if given) becomes TRUE when text remains after the leading zeros yet the result is 0. */
+xtPublic xtInt8 xt_str_to_int8(c_char *ptr, xtBool *overflow)
+{
+	xtInt8 result = 0;
+
+	if (overflow != NULL)
+		*overflow = FALSE;
+	for (; *ptr == '0'; ptr++)
+		;	/* skip leading zeros */
+	if (*ptr == 0)
+		return result;	/* empty, or nothing but zeros */
+	sscanf(ptr, "%"PRId64, &result);
+	if (result == 0 && overflow != NULL)
+		*overflow = TRUE;
+	return result;
+}
+
+xtPublic void xt_int8_to_str(xtInt8 value, char *string)
+{	/* Write 'value' in decimal into 'string'; no length is passed, so the caller must supply a large enough buffer. */
+	sprintf(string, "%"PRId64, value);
+}
+
+xtPublic void xt_double_to_str(double value, int scale, char *string)
+{	/* Format 'value' with 'scale' decimals into 'string', then trim trailing zeros (and a then-bare '.'). */
+	char *ptr;
+
+	sprintf(string, "%.*f", scale, value);
+	ptr = string + strlen(string) - 1;	/* last character written */
+	
+	if (strchr(string, '.') && (*ptr == '0' || *ptr == '.')) {
+		while (ptr-1 > string && *(ptr-1) == '0') ptr--;	/* back over the run of trailing zeros */
+		if (ptr-1 > string && *(ptr-1) == '.') ptr--;		/* no decimals left: drop the point too */
+		*ptr = 0;
+	}
+}
+
+/*
+ * Parse a size such as "64MB": decimal digits, then an optional unit whose first letter (K, M, G, T or P, any case) selects a power of 1024.
+ */
+xtPublic xtInt8 xt_byte_size_to_int8(c_char *ptr)
+{
+	char	number[101], *num_ptr;
+	xtInt8	size;
+
+	while (*ptr && isspace((unsigned char) *ptr))	/* <ctype.h> needs an unsigned char value */
+		ptr++;
+
+	num_ptr = number;
+	while (*ptr && isdigit((unsigned char) *ptr)) {
+		if (num_ptr < number+100) {	/* digits beyond the first 100 are dropped */
+			*num_ptr = *ptr;
+			num_ptr++;
+		}
+		ptr++;
+	}
+	*num_ptr = 0;
+	size = xt_str_to_int8(number, NULL);
+
+	while (*ptr && isspace((unsigned char) *ptr))
+		ptr++;
+
+	switch (toupper((unsigned char) *ptr)) {	/* any other character: no scaling */
+		case 'P':
+			size *= 1024;	/* fall through */
+		case 'T':
+			size *= 1024;	/* fall through */
+		case 'G':
+			size *= 1024;	/* fall through */
+		case 'M':
+			size *= 1024;	/* fall through */
+		case 'K':
+			size *= 1024;
+			break;
+	}
+
+	return size;
+}
+
+xtPublic void xt_int8_to_byte_size(xtInt8 value, char *string)
+{	/* Write "<scaled value> <unit> (<value> bytes)" into 'string', using the largest unit not above 'value'. */
+	double	divisor;
+	c_char	*unit_name;
+	char	scaled[100];
+
+	if (value >= (xtInt8) (1024 * 1024 * 1024)) {
+		divisor = (double) (1024 * 1024 * 1024);
+		unit_name = "GB";
+	}
+	else if (value >= (xtInt8) (1024 * 1024)) {
+		divisor = (double) (1024 * 1024);
+		unit_name = "MB";
+	}
+	else if (value >= (xtInt8) 1024) {
+		divisor = (double) (1024);
+		unit_name = "Kb";
+	}
+	else {
+		divisor = 1.0;	/* plain bytes: dividing by one leaves the value as it is */
+		unit_name = "bytes";
+	}
+
+	xt_double_to_str((double) value / divisor, 2, scaled);
+	sprintf(string, "%s %s (%"PRId64" bytes)", scaled, unit_name, value);
+}
+
+/* Return the PBXT version string. The version number must also be set in configure.in! */
+xtPublic c_char *xt_get_version(void)
+{
+	return "1.0.11-7 Pre-GA";	/* a string literal: the caller must not modify or free it */
+}
+
+/* Copy 'from' into 'to' (capacity 'size'), turning each %XX escape into one byte; truncates like xt_strcpy(). */
+xtPublic void xt_strcpy_url(size_t size, char *to, c_char *from)
+{
+	if (size > 0) {
+		size--;
+		while (*from && size--) {
+			/* isxdigit() needs an unsigned char value: a plain char of 0x80 or
+			 * above may be negative, which is undefined for the <ctype.h> calls. */
+			if (*from == '%' && isxdigit((unsigned char) from[1]) && isxdigit((unsigned char) from[2])) {
+				*to++ = (char) (xt_hex_digit(from[1]) << 4 | xt_hex_digit(from[2]));
+				from += 3;
+			}
+			else
+				*to++ = *from++;
+		}
+		*to = 0;
+	}
+}
+
+/* Copy at most 'len_from' bytes of 'from' into 'to' (capacity 'size'), turning each %XX escape into one byte. */
+xtPublic void xt_strncpy_url(size_t size, char *to, c_char *from, size_t len_from)
+{
+	if (size > 0) {
+		size--;
+		while (len_from-- && size--) {
+			/* len_from now counts the bytes after *from, so ">= 2" means both hex digits are in range. */
+			if (*from == '%' && len_from >= 2 && isxdigit((unsigned char) from[1]) && isxdigit((unsigned char) from[2])) {
+				*to++ = (char) (xt_hex_digit(from[1]) << 4 | xt_hex_digit(from[2]));
+				from += 3;
+				len_from -= 2;	/* the escape used three input bytes; the loop counted only one */
+			}
+			else
+				*to++ = *from++;
+		}
+		*to = 0;
+	}
+}
+
+/* Like strchr(), except that a miss yields a pointer to the terminating NUL instead of NULL. */
+const char *xt_strchr(const char *str, char ch)
+{
+	while (*str != ch && *str != 0) str++;
+	return str;
+}
+
+unsigned char xt_hex_digit(char ch)
+{	/* Value (0..15) of the hexadecimal digit 'ch'; any other character yields 0. */
+	/* Plain range tests: isdigit()/toupper() are undefined for a negative char value. */
+	if (ch >= '0' && ch <= '9')
+		return (unsigned char) (ch - '0');
+	if (ch >= 'a' && ch <= 'f')
+		return (unsigned char) (ch - 'a' + 10);
+	if (ch >= 'A' && ch <= 'F')
+		return (unsigned char) (ch - 'A' + 10);
+	return (unsigned char) 0;
+}
+
+#ifdef XT_WIN
+xtPublic void xt_win_dialog(char *message)
+{	/* Compiled only under XT_WIN: show 'message' in a message box titled "Debug Me!" with a warning icon and an OK button. */
+	MessageBoxA(NULL, message, "Debug Me!", MB_ICONWARNING | MB_OK);
+}
+#endif
+
+/*
+ * --------------- SYSTEM STATISTICS ------------------
+ */
+
+static char					su_t_unit[10] = "usec";	/* shared by every time entry below; changed by xt_set_time_unit() */
+/*
+ * One entry per statistic, in XT_STAT_* order. Note: times are returned in microseconds, but
+ * xtstat currently displays them in milliseconds. (The %s in the time descriptions is presumably filled in by the caller.)
+ */
+static XTStatMetaDataRec	pbxt_stat_meta_data[XT_STAT_MAXIMUM] = {
+	{ XT_STAT_TIME_CURRENT,	"Current Time",				"time",	"curr",		XT_STAT_DATE,
+		"The current time in seconds" },
+	{ XT_STAT_TIME_PASSED,	"Time Since Last Call",		"time",	su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE,
+		"Time passed in %sseconds since last statistics call" },
+
+	{ XT_STAT_COMMITS,			"Commit Count",			"xact", "commt",	XT_STAT_ACCUMULATIVE,
+		"Number of transactions committed" },
+	{ XT_STAT_ROLLBACKS,		"Rollback Count",		"xact", "rollb",	XT_STAT_ACCUMULATIVE,
+		"Number of transactions rolled back" },
+	{ XT_STAT_WAIT_FOR_XACT,	"Wait for Xact Count",	"xact", "waits",	XT_STAT_ACCUMULATIVE,
+		"Number of times waited for another transaction" },
+	{ XT_STAT_XACT_TO_CLEAN,	"Dirty Xact Count",		"xact", "dirty",	0,
+		"Number of transactions still to be cleaned up" },
+
+	{ XT_STAT_STAT_READS,		"Read Statements",		"stat", "read",		XT_STAT_ACCUMULATIVE,
+		"Number of SELECT statements" },
+	{ XT_STAT_STAT_WRITES,		"Write Statements",		"stat", "write",	XT_STAT_ACCUMULATIVE,
+		"Number of UPDATE/INSERT/DELETE statements" },
+
+	{ XT_STAT_REC_BYTES_IN,		"Record Bytes Read",	"rec", "in",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes read from the record/row files" },
+	{ XT_STAT_REC_BYTES_OUT,	"Record Bytes Written",	"rec", "out",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes written to the record/row files" },
+	{ XT_STAT_REC_SYNC_COUNT,	"Record File Flushes",	"rec", "syncs",		XT_STAT_ACCUMULATIVE | XT_STAT_COMBO_FIELD,
+		"Number of flushes to record/row files" },
+	{ XT_STAT_REC_SYNC_TIME,	"Record Flush Time",	"rec", su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE | XT_STAT_COMBO_FIELD_2,
+		"The time in %sseconds to flush record/row files" },
+	{ XT_STAT_REC_CACHE_HIT,	"Record Cache Hits",	"rec", "hits",		XT_STAT_ACCUMULATIVE,
+		"Hits when accessing the record cache" },
+	{ XT_STAT_REC_CACHE_MISS,	"Record Cache Misses",	"rec", "miss",		XT_STAT_ACCUMULATIVE,
+		"Misses when accessing the record cache" },
+	{ XT_STAT_REC_CACHE_FREES,	"Record Cache Frees",	"rec", "frees",		XT_STAT_ACCUMULATIVE,
+		"Number of record cache pages freed" },
+	{ XT_STAT_REC_CACHE_USAGE,	"Record Cache Usage",	"rec", "%use",		XT_STAT_PERCENTAGE,
+		"Percentage of record cache in use" },
+
+	{ XT_STAT_IND_BYTES_IN,		"Index Bytes Read",		"ind", "in",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes read from the index files" },
+	{ XT_STAT_IND_BYTES_OUT,	"Index Bytes Written",	"ind", "out",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes written to the index files" },
+	{ XT_STAT_IND_SYNC_COUNT,	"Index File Flushes",	"ind", "syncs",		XT_STAT_ACCUMULATIVE | XT_STAT_COMBO_FIELD,
+		"Number of flushes to index files" },
+	{ XT_STAT_IND_SYNC_TIME,	"Index Flush Time",		"ind", su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE | XT_STAT_COMBO_FIELD_2,
+		"The time in %sseconds to flush index files" },
+	{ XT_STAT_IND_CACHE_HIT,	"Index Cache Hits",		"ind", "hits",		XT_STAT_ACCUMULATIVE,
+		"Hits when accessing the index cache" },
+	{ XT_STAT_IND_CACHE_MISS,	"Index Cache Misses",	"ind", "miss",		XT_STAT_ACCUMULATIVE,
+		"Misses when accessing the index cache" },
+	{ XT_STAT_IND_CACHE_USAGE,	"Index Cache Usage",	"ind", "%use",		XT_STAT_PERCENTAGE,
+		"Percentage of index cache used" },
+	{ XT_STAT_ILOG_BYTES_IN,	"Index Log Bytes In",	"ilog", "in",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes read from the index log files" },
+	{ XT_STAT_ILOG_BYTES_OUT,	"Index Log Bytes Out",	"ilog", "out",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes written to the index log files" },
+	{ XT_STAT_ILOG_SYNC_COUNT,	"Index Log File Syncs",	"ilog", "syncs",	XT_STAT_ACCUMULATIVE | XT_STAT_COMBO_FIELD,
+		"Number of flushes to index log files" },
+	{ XT_STAT_ILOG_SYNC_TIME,	"Index Log Sync Time",	"ilog", su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE | XT_STAT_COMBO_FIELD_2,
+		"The time in %sseconds to flush index log files" },
+
+	{ XT_STAT_XLOG_BYTES_IN,	"Xact Log Bytes In",	"xlog", "in",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes read from the transaction log files" },
+	{ XT_STAT_XLOG_BYTES_OUT,	"Xact Log Bytes Out",	"xlog", "out",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes written to the transaction log files" },
+	{ XT_STAT_XLOG_SYNC_COUNT,	"Xact Log File Syncs",	"xlog", "syncs",	XT_STAT_ACCUMULATIVE,
+		"Number of flushes to transaction log files" },
+	{ XT_STAT_XLOG_SYNC_TIME,	"Xact Log Sync Time",	"xlog", su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE,
+		"The time in %sseconds to flush transaction log files" },
+	{ XT_STAT_XLOG_CACHE_HIT,	"Xact Log Cache Hits",	"xlog", "hits",		XT_STAT_ACCUMULATIVE,
+		"Hits when accessing the transaction log cache" },
+	{ XT_STAT_XLOG_CACHE_MISS,	"Xact Log Cache Misses","xlog", "miss",		XT_STAT_ACCUMULATIVE,
+		"Misses when accessing the transaction log cache" },
+	{ XT_STAT_XLOG_CACHE_USAGE,	"Xact Log Cache Usage",	"xlog", "%use",		XT_STAT_PERCENTAGE,
+		"Percentage of transaction log cache used" },
+
+	{ XT_STAT_DATA_BYTES_IN,	"Data Log Bytes In",	"data", "in",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes read from the data log files" },
+	{ XT_STAT_DATA_BYTES_OUT,	"Data Log Bytes Out",	"data", "out",		XT_STAT_ACCUMULATIVE | XT_STAT_BYTE_COUNT,
+		"Bytes written to the data log files" },
+	{ XT_STAT_DATA_SYNC_COUNT,	"Data Log File Syncs",	"data", "syncs",	XT_STAT_ACCUMULATIVE,
+		"Number of flushes to data log files" },
+	{ XT_STAT_DATA_SYNC_TIME,	"Data Log Sync Time",	"data", su_t_unit,	XT_STAT_ACCUMULATIVE | XT_STAT_TIME_VALUE,
+		"The time in %sseconds to flush data log files" },
+
+	{ XT_STAT_BYTES_TO_CHKPNT,	"Bytes to Checkpoint",	"to", "chkpt",		XT_STAT_BYTE_COUNT,
+		"Bytes written to the log since the last checkpoint" },
+	{ XT_STAT_LOG_BYTES_TO_WRITE, "Log Bytes to Write",	"to", "write",		XT_STAT_BYTE_COUNT,
+		"Bytes written to the log, still to be written to the database" },
+	{ XT_STAT_BYTES_TO_SWEEP,	"Log Bytes to Sweep",	"to", "sweep",		XT_STAT_BYTE_COUNT,
+		"Bytes written to the log, still to be read by the sweeper" },
+	{ XT_STAT_SWEEPER_WAITS,	"Sweeper Wait on Xact",	"sweep", "waits",	XT_STAT_ACCUMULATIVE,
+		"Attempts to cleanup a transaction" },
+
+	{ XT_STAT_SCAN_INDEX,		"Index Scan Count",		"scan", "index",	XT_STAT_ACCUMULATIVE,
+		"Number of index scans" },
+	{ XT_STAT_SCAN_TABLE,		"Table Scan Count",		"scan", "table",	XT_STAT_ACCUMULATIVE,
+		"Number of table scans" },
+	{ XT_STAT_ROW_SELECT,		"Select Row Count",		"row", "sel",		XT_STAT_ACCUMULATIVE,
+		"Number of rows selected" },
+	{ XT_STAT_ROW_INSERT,		"Insert Row Count",		"row", "ins",		XT_STAT_ACCUMULATIVE,
+		"Number of rows inserted" },
+	{ XT_STAT_ROW_UPDATE,		"Update Row Count",		"row", "upd",		XT_STAT_ACCUMULATIVE,
+		"Number of rows updated" },
+	{ XT_STAT_ROW_DELETE,		"Delete Row Count",		"row", "del",		XT_STAT_ACCUMULATIVE,
+		"Number of rows deleted" },
+
+	{ XT_STAT_RETRY_INDEX_SCAN,	"Index Scan Retries",	"retry", "iscan",	XT_STAT_ACCUMULATIVE,
+		"Index scans restarted because of locked record" },
+	{ XT_STAT_REREAD_REC_LIST,	"Record List Rereads",	"retry", "rlist",	XT_STAT_ACCUMULATIVE,
+		"Record list rescanned due to lock" }
+};
+
+xtPublic XTStatMetaDataPtr xt_get_stat_meta_data(int i)
+{	/* Return entry 'i' of pbxt_stat_meta_data; 'i' is not range-checked, so it must be 0 .. XT_STAT_MAXIMUM-1. */
+	return &pbxt_stat_meta_data[i];
+}
+
+xtPublic void xt_set_time_unit(const char *u)
+{	/* Replace the unit label shared by all time statistics; 'u' is truncated to fit su_t_unit. */
+	xt_strcpy(sizeof(su_t_unit), su_t_unit, u);
+}
+
diff --git a/storage/pbxt/src/strutil_xt.h b/storage/pbxt/src/strutil_xt.h
new file mode 100644
index 00000000000..62067e0b671
--- /dev/null
+++ b/storage/pbxt/src/strutil_xt.h
@@ -0,0 +1,164 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#ifndef __xt_strutil_h__
+#define __xt_strutil_h__
+
+#include <string.h>
+
+#include "xt_defs.h"
+
+#ifdef XT_WIN
+#define XT_DIR_CHAR					'\\'	/* the delimiter xt_add_dir_char() appends */
+#define XT_IS_DIR_CHAR(c)			((c) == '/' || (c) == '\\')	/* evaluates 'c' twice: pass no side effects */
+#else
+#define XT_DIR_CHAR					'/'
+#define XT_IS_DIR_CHAR(c)			((c) == '/')
+#endif
+
+#define MAX_INT8_STRING_SIZE		100
+
+void	xt_strcpy(size_t size, char *to, c_char *from);
+void	xt_strncpy(size_t size, char *to, c_char *from, size_t len_from);
+void	xt_strcat(size_t size, char *to, c_char *from);
+void	xt_strcati(size_t size, char *to, int i);
+void	xt_strcpy_term(size_t size, char *to, c_char *from, char term);
+void	xt_strcat_term(size_t size, char *to, c_char *from, char term);
+
+xtBool	xt_ends_with(c_char *str, c_char *sub);
+xtBool	xt_starts_with(c_char *str, c_char *sub);
+
+char	*xt_last_2_names_of_path(c_char *path);
+char	*xt_last_name_of_path(c_char *path);
+void	xt_2nd_last_name_of_path(size_t size, char *dest, c_char *path);
+c_char	*xt_last_directory_of_path(c_char *path);
+xtBool	xt_remove_dir_char(char *dir_name);
+xtBool	xt_add_dir_char(size_t max, char *path);
+void	xt_remove_last_name_of_path(char *path);
+char	*xt_find_extension(c_char *file_name);
+void	xt_remove_extension(char *file_name);
+xtBool	xt_is_extension(c_char *file_name, c_char *ext);
+
+xtInt8	xt_str_to_int8(c_char *ptr, xtBool *overflow);
+void	xt_int8_to_str(xtInt8 value, char *string);
+void	xt_double_to_str(double value, int scale, char *string);
+
+xtInt8	xt_byte_size_to_int8(c_char *ptr);
+void	xt_int8_to_byte_size(xtInt8 value, char *string);
+
+c_char	*xt_get_version(void);
+
+void xt_strcpy_url(size_t size, char *to, c_char *from);
+void xt_strncpy_url(size_t size, char *to, c_char *from, size_t len_from);
+
+const char		*xt_strchr(const char *str, char ch);
+unsigned char	xt_hex_digit(char ch);
+
+#define XT_STAT_TIME_CURRENT		0
+#define XT_STAT_TIME_PASSED			1
+
+#define XT_STAT_COMMITS				2
+#define XT_STAT_ROLLBACKS			3
+#define XT_STAT_WAIT_FOR_XACT		4
+#define XT_STAT_XACT_TO_CLEAN		5
+
+#define XT_STAT_STAT_READS			6
+#define XT_STAT_STAT_WRITES			7
+
+#define XT_STAT_REC_BYTES_IN		8
+#define XT_STAT_REC_BYTES_OUT		9
+#define XT_STAT_REC_SYNC_COUNT		10
+#define XT_STAT_REC_SYNC_TIME		11
+#define XT_STAT_REC_CACHE_HIT		12
+#define XT_STAT_REC_CACHE_MISS		13
+#define XT_STAT_REC_CACHE_FREES		14
+#define XT_STAT_REC_CACHE_USAGE		15
+
+#define XT_STAT_IND_BYTES_IN		16
+#define XT_STAT_IND_BYTES_OUT		17
+#define XT_STAT_IND_SYNC_COUNT		18
+#define XT_STAT_IND_SYNC_TIME		19
+#define XT_STAT_IND_CACHE_HIT		20
+#define XT_STAT_IND_CACHE_MISS		21
+#define XT_STAT_IND_CACHE_USAGE		22
+#define XT_STAT_ILOG_BYTES_IN		23
+#define XT_STAT_ILOG_BYTES_OUT		24
+#define XT_STAT_ILOG_SYNC_COUNT		25
+#define XT_STAT_ILOG_SYNC_TIME		26
+
+#define XT_STAT_XLOG_BYTES_IN		27
+#define XT_STAT_XLOG_BYTES_OUT		28
+#define XT_STAT_XLOG_SYNC_COUNT		29
+#define XT_STAT_XLOG_SYNC_TIME		30
+#define XT_STAT_XLOG_CACHE_HIT		31
+#define XT_STAT_XLOG_CACHE_MISS		32
+#define XT_STAT_XLOG_CACHE_USAGE	33
+
+#define XT_STAT_DATA_BYTES_IN		34
+#define XT_STAT_DATA_BYTES_OUT		35
+#define XT_STAT_DATA_SYNC_COUNT		36
+#define XT_STAT_DATA_SYNC_TIME		37
+
+#define XT_STAT_BYTES_TO_CHKPNT		38
+#define XT_STAT_LOG_BYTES_TO_WRITE	39
+#define XT_STAT_BYTES_TO_SWEEP		40
+#define XT_STAT_SWEEPER_WAITS		41
+
+#define XT_STAT_SCAN_INDEX			42
+#define XT_STAT_SCAN_TABLE			43
+#define XT_STAT_ROW_SELECT			44
+#define XT_STAT_ROW_INSERT			45
+#define XT_STAT_ROW_UPDATE			46
+#define XT_STAT_ROW_DELETE			47
+
+#define XT_STAT_CURRENT_MAX			48
+
+#define XT_STAT_RETRY_INDEX_SCAN	48
+#define XT_STAT_REREAD_REC_LIST		49
+#define XT_STAT_MAXIMUM				50
+
+#define XT_STAT_ACCUMULATIVE		1
+#define XT_STAT_BYTE_COUNT			2
+#define XT_STAT_PERCENTAGE			4
+#define XT_STAT_COMBO_FIELD			8				/* Field is short, 2 chars instead of 5. */
+#define XT_STAT_COMBO_FIELD_2		16				/* Field is short, 2 chars instead of 5. */
+#define XT_STAT_TIME_VALUE			32
+#define XT_STAT_DATE				64
+
+typedef struct XTStatMetaData {	/* describes one statistic; see pbxt_stat_meta_data in strutil_xt.cc */
+	int				sm_id;				/* one of the XT_STAT_* identifiers above */
+	const char		*sm_name;			/* long name, e.g. "Commit Count" */
+	const char		*sm_short_line_1;	/* short group label, e.g. "xact" (presumably the 1st heading line in xtstat) */
+	const char		*sm_short_line_2;	/* short item label, e.g. "commt"; time entries point at the settable unit */
+	int				sm_flags;			/* OR of the XT_STAT_ACCUMULATIVE .. XT_STAT_DATE bits, or 0 */
+	const char		*sm_description;	/* one-sentence description of the statistic */
+} XTStatMetaDataRec, *XTStatMetaDataPtr;
+
+XTStatMetaDataPtr	xt_get_stat_meta_data(int i);
+void				xt_set_time_unit(const char *u);
+
+#ifdef XT_WIN
+void	xt_win_dialog(char *message);
+#endif
+
+#endif
diff --git a/storage/pbxt/src/systab_xt.cc b/storage/pbxt/src/systab_xt.cc
new file mode 100644
index 00000000000..783debca115
--- /dev/null
+++ b/storage/pbxt/src/systab_xt.cc
@@ -0,0 +1,655 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase Media Stream for MySQL
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Paul McCullagh
+ *
+ * 2007-07-18
+ *
+ * H&G2JCtL
+ *
+ * System tables.
+ *
+ */
+
+#include "xt_config.h"
+
+#include <stdlib.h>
+#include <time.h>
+#ifdef DRIZZLED
+#include <drizzled/server_includes.h>
+#include <drizzled/current_session.h>
+#endif
+
+#include "ha_pbxt.h"
+#include "systab_xt.h"
+#include "discover_xt.h"
+#include "table_xt.h"
+#include "strutil_xt.h"
+#include "database_xt.h"
+#include "trace_xt.h"
+
+#if MYSQL_VERSION_ID >= 50120
+#define byte uchar
+#endif
+
+/*
+ * -------------------------------------------------------------------------
+ * SYSTEM TABLE DEFINITIONS
+ */
+
+/* Columns of the "pbxt.location" system table (see xt_internal_tables); the NULL-named entry presumably ends the list. */
+static DT_FIELD_INFO xt_location_info[] =
+{
+	{ "Path",				128,	NULL, MYSQL_TYPE_VARCHAR,	(CHARSET_INFO *) system_charset_info,	0,	"The location of PBXT tables"},
+	{ "Table_count",		0,		NULL, MYSQL_TYPE_LONGLONG,	NULL,					NOT_NULL_FLAG,		"The number of PBXT table in this location"},
+	{ NULL,					0,		NULL, MYSQL_TYPE_STRING,	NULL,					0, NULL}
+};
+
+static DT_FIELD_INFO xt_statistics_info[] =
+{	/* Columns of the "pbxt.statistics" system table (see xt_internal_tables); the NULL-named entry presumably ends the list. */
+	{ "ID",					0,	NULL, MYSQL_TYPE_LONG,			NULL,					NOT_NULL_FLAG,		"The ID of the statistic"},
+	{ "Name",				40,	NULL, MYSQL_TYPE_VARCHAR,		(CHARSET_INFO *) system_charset_info,	0,	"The name of the statistic"},
+	{ "Value",				0,	NULL,	MYSQL_TYPE_LONGLONG,	NULL,					NOT_NULL_FLAG,		"The accumulated value"},
+	{ NULL,					0,	NULL, MYSQL_TYPE_STRING,		NULL,					0, NULL}
+};
+
+/*
+static DT_FIELD_INFO xt_reference_info[] =
+{
+	{"Table_name",		128,					NULL, MYSQL_TYPE_STRING,	system_charset_info,	NOT_NULL_FLAG,	"The name of the referencing table"},
+	{"Blob_id",			NULL,					NULL, MYSQL_TYPE_LONGLONG,	NULL,					NOT_NULL_FLAG,	"The BLOB reference number - part of the BLOB URL"},
+	{"Column_name",		50,						NULL, MYSQL_TYPE_STRING,	system_charset_info,	NOT_NULL_FLAG,	"The column name of the referencing field"},
+	{"Row_condition",	50,						NULL, MYSQL_TYPE_VARCHAR,	system_charset_info,	0,				"This condition identifies the row in the table"},
+	{"Blob_url",		50,						NULL, MYSQL_TYPE_VARCHAR,	system_charset_info,	NOT_NULL_FLAG,	"The BLOB URL for HTTP GET access"},
+	{"Repository_id",	NULL,					NULL, MYSQL_TYPE_LONG,		NULL,					NOT_NULL_FLAG,	"The repository file number of the BLOB"},
+	{"Repo_blob_offset",NULL,					NULL, MYSQL_TYPE_LONGLONG,	NULL,					NOT_NULL_FLAG,	"The offset in the repository file"},
+	{"Blob_size",		NULL,					NULL, MYSQL_TYPE_LONGLONG,	NULL,					NOT_NULL_FLAG,	"The size of the BLOB in bytes"},
+	{"Deletion_time",	NULL,					NULL, MYSQL_TYPE_TIMESTAMP,	NULL,					0,				"The time the BLOB was deleted"},
+	{"Remove_in",		NULL,					NULL, MYSQL_TYPE_LONG,		NULL,					0,				"The number of seconds before the reference/BLOB is removed perminently"},
+	{"Temp_log_id",		NULL,					NULL, MYSQL_TYPE_LONG,		NULL,					0,				"Temporary log number of the referencing deletion entry"},
+	{"Temp_log_offset",	NULL,					NULL, MYSQL_TYPE_LONGLONG,	NULL,					0,				"Temporary log offset of the referencing deletion entry"},
+	{NULL,				NULL,					NULL, MYSQL_TYPE_STRING,	NULL, 0,											NULL}
+};
+*/
+
+#define XT_SYSTAB_INVALID			0
+#define XT_SYSTAB_LOCATION_ID		1
+#define XT_SYSTAB_STATISTICS_ID		2
+
+static THR_LOCK sys_location_lock;
+static THR_LOCK sys_statistics_lock;
+static xtBool	sys_lock_inited = FALSE;
+
+static XTSystemTableShareRec xt_internal_tables[] =
+{	/* One entry per system table: id, "db.table" name, its THR_LOCK, its column list; the last two fields are not explained here. */
+	{ XT_SYSTAB_LOCATION_ID,	"pbxt.location", &sys_location_lock, xt_location_info, NULL, FALSE},
+	{ XT_SYSTAB_STATISTICS_ID,	"pbxt.statistics", &sys_statistics_lock, xt_statistics_info, NULL, FALSE},
+	{ XT_SYSTAB_INVALID,		NULL, NULL, NULL, NULL, FALSE}	/* end marker: invalid id and NULL name */
+};
+
+
+/*
+static int pbms_discover_handler(handlerton *hton, THD* thd, const char *db, const char *name, uchar **frmblob, size_t *frmlen)
+{
+	int err = 1, i = 0;
+	MY_STAT stat_info;
+
+	// Check that the database exists!
+	if ((!db) || ! my_stat(db,&stat_info,MYF(0)))
+		return err;
+		
+	while (pbms_internal_tables[i].name) {
+		if (!strcasecmp(name, pbms_internal_tables[i].name)) {
+			err = ms_create_table_frm(hton, thd, db, name, pbms_internal_tables[i].info, pbms_internal_tables[i].keys, frmblob, frmlen);
+			break;
+		}
+		i++;
+	}
+	
+	return err;
+}
+*/
+
+/*
+ * -------------------------------------------------------------------------
+ * MYSQL UTILITIES
+ */
+
+static void xt_my_set_notnull_in_record(Field *field, char *record)
+{	/* Clear the field's null bit in 'record', at the same byte offset the bit has within table->record[0]. */
+	if (field->null_ptr)	/* a field without a null_ptr has no null bit to clear */
+		record[(uint) (field->null_ptr - (uchar *) field->table->record[0])] &= (uchar) ~field->null_bit;
+}
+
+/*
+ * -------------------------------------------------------------------------
+ * OPEN SYSTEM TABLES
+ */
+
+XTOpenSystemTable::XTOpenSystemTable(XTThreadPtr self, XTDatabaseHPtr db, XTSystemTableShare *share, TABLE *table):
+XTObject()
+{
+	ost_db = db;				/* the database this open table belongs to */
+	ost_my_table = table;		/* the MySQL TABLE being served */
+	ost_share = share;			/* the shared system-table definition */
+	xt_heap_reference(self, db);	/* presumably takes a reference on 'db' for ost_db */
+}
+
+XTOpenSystemTable::~XTOpenSystemTable()
+{	/* Hand this open table back to its share; releaseSystemTable() is defined outside this view. */
+	XTSystemTableShare::releaseSystemTable(this);
+}
+
+/*
+ * -------------------------------------------------------------------------
+ * LOCATION TABLE
+ */
+
+XTLocationTable::XTLocationTable(XTThreadPtr self, XTDatabaseHPtr db, XTSystemTableShare *share, TABLE *table):
+XTOpenSystemTable(self, db, share, table)
+{	/* Nothing beyond the base-class construction is needed. */
+}
+
+XTLocationTable::~XTLocationTable()
+{	/* Calls unuse(), which currently does nothing but return true. */
+	unuse();
+}
+
+bool XTLocationTable::use()
+{	/* No preparation is done here; always reports success. */
+	return true;
+}
+
+bool XTLocationTable::unuse()
+{	/* Counterpart of use(): nothing to release; always reports success. */
+	return true;
+}
+
+
+bool XTLocationTable::seqScanInit()
+{	/* Start a sequential scan: seqScanNext() reads the entry at lt_index and then advances it. */
+	lt_index = 0;
+	return true;
+}
+
+bool XTLocationTable::seqScanNext(char *buf, bool *eof)
+{
+	bool ok = true;
+
+	*eof = false;
+
+	xt_ht_lock(NULL, ost_db->db_tables);
+	if (lt_index >= xt_sl_get_size(ost_db->db_table_paths)) {
+		ok = false;
+		*eof = true;
+		goto done;
+	}
+	loadRow(buf, lt_index);
+	lt_index++;
+
+	done:
+	xt_ht_unlock(NULL, ost_db->db_tables);
+	return ok;
+#ifdef xxx
+	csWord4		last_access;
+	csWord4		last_ref;
+	csWord4		creation_time;
+	csWord4		access_code;
+	csWord2		cont_type;
+	size_t		ref_size;
+	csWord2		head_size;
+	csWord8		blob_size;
+	uint32		len;
+	Field		*curr_field;
+	byte		*save;
+	MX_BITMAP	*save_write_set;
+
+	last_access = CS_GET_DISK_4(blob->rb_last_access_4);
+	last_ref = CS_GET_DISK_4(blob->rb_last_ref_4);
+	creation_time = CS_GET_DISK_4(blob->rb_create_time_4);
+	cont_type = CS_GET_DISK_2(blob->rb_cont_type_2);
+	ref_size = CS_GET_DISK_1(blob->rb_ref_size_1);
+	head_size = CS_GET_DISK_2(blob->rb_head_size_2);
+	blob_size = CS_GET_DISK_6(blob->rb_blob_size_6);
+	access_code = CS_GET_DISK_4(blob->rb_auth_code_4);
+
+	/* ASSERT_COLUMN_MARKED_FOR_WRITE is failing when
+	 * I use store()!??
+	 * But I want to use it! :(
+	 */
+	save_write_set = table->write_set;
+	table->write_set = NULL;
+
+	memset(buf, 0xFF, table->s->null_bytes);
+ 	for (Field **field=table->field ; *field ; field++) {
+ 		curr_field = *field;
+
+		save = curr_field->ptr;
+#if MYSQL_VERSION_ID < 50114
+		curr_field->ptr = (byte *) buf + curr_field->offset();
+#else
+		curr_field->ptr = (byte *) buf + curr_field->offset(curr_field->table->record[0]);
+#endif
+		switch (curr_field->field_name[0]) {
+			case 'A':
+				ASSERT(strcmp(curr_field->field_name, "Access_code") == 0);
+				curr_field->store(access_code, true);
+				xt_my_set_notnull_in_record(curr_field, buf);
+				break;
+			case 'R':
+				switch (curr_field->field_name[6]) {
+					case 't':
+						// Repository_id     INT
+						ASSERT(strcmp(curr_field->field_name, "Repository_id") == 0);
+						curr_field->store(iRepoFile->myRepo->getRepoID(), true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+					case 'l':
+						// Repo_blob_offset  BIGINT
+						ASSERT(strcmp(curr_field->field_name, "Repo_blob_offset") == 0);
+						curr_field->store(iRepoOffset, true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+				}
+				break;
+			case 'B':
+				switch (curr_field->field_name[5]) {
+					case 's':
+						// Blob_size         BIGINT
+						ASSERT(strcmp(curr_field->field_name, "Blob_size") == 0);
+						curr_field->store(blob_size, true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+					case 'd':
+						// Blob_data         LONGBLOB
+						ASSERT(strcmp(curr_field->field_name, "Blob_data") == 0);
+						if (blob_size <= 0xFFFFFFF) {
+							iBlobBuffer->setLength((u_int) blob_size);
+							len = iRepoFile->read(iBlobBuffer->getBuffer(0), iRepoOffset + head_size, (size_t) blob_size, 0);
+							((Field_blob *) curr_field)->set_ptr(len, (byte *) iBlobBuffer->getBuffer(0));
+							xt_my_set_notnull_in_record(curr_field, buf);
+						}
+						break;
+				}
+				break;
+			case 'H':
+				// Head_size         SMALLINT UNSIGNED
+				ASSERT(strcmp(curr_field->field_name, "Head_size") == 0);
+				curr_field->store(head_size, true);
+				xt_my_set_notnull_in_record(curr_field, buf);
+				break;
+			case 'C':
+				switch (curr_field->field_name[1]) {
+					case 'r':
+						// Creation_time     TIMESTAMP
+						ASSERT(strcmp(curr_field->field_name, "Creation_time") == 0);
+						curr_field->store(ms_my_1970_to_mysql_time(creation_time), true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+					case 'o':
+						// Content_type      CHAR(128)
+						ASSERT(strcmp(curr_field->field_name, "Content_type") == 0);
+						CSString *cont_type_str = ost_share->mySysDatabase->getContentType(cont_type);
+						if (cont_type_str) {
+							curr_field->store(cont_type_str->getCString(), cont_type_str->length(), &my_charset_utf8_general_ci);
+							cont_type_str->release();
+							xt_my_set_notnull_in_record(curr_field, buf);
+						}
+						break;
+				}
+				break;
+			case 'L':
+				switch (curr_field->field_name[5]) {
+					case 'r':
+						// Last_ref_time     TIMESTAMP
+						ASSERT(strcmp(curr_field->field_name, "Last_ref_time") == 0);
+						curr_field->store(ms_my_1970_to_mysql_time(last_ref), true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+					case 'a':
+						// Last_access_time  TIMESTAMP
+						ASSERT(strcmp(curr_field->field_name, "Last_access_time") == 0);
+						curr_field->store(ms_my_1970_to_mysql_time(last_access), true);
+						xt_my_set_notnull_in_record(curr_field, buf);
+						break;
+				}
+				break;
+		}
+		curr_field->ptr = save;
+	}
+
+	table->write_set = save_write_set;
+	return true;
+#endif
+}
+
+void XTLocationTable::loadRow(char *buf, xtWord4 row_id)
+{
+	/* Build, in buf, the row for entry row_id of ost_db->db_table_paths.
+	 *
+	 * The original author found that ASSERT_COLUMN_MARKED_FOR_WRITE fails
+	 * when store() is used here, so the table's write set is detached for
+	 * the duration of the call and re-attached at the end.
+	 */
+	TABLE			*my_table = ost_my_table;
+	MX_BITMAP		*old_write_set = my_table->write_set;
+	XTTablePathPtr	path_rec;
+
+	my_table->write_set = NULL;
+
+	/* Fill the null-flag bytes with 0xFF; stored columns are flagged not-null below. */
+	memset(buf, 0xFF, my_table->s->null_bytes);
+
+	path_rec = *((XTTablePathPtr *) xt_sl_item_at(ost_db->db_table_paths, row_id));
+
+	for (Field **fld_ptr = my_table->field; *fld_ptr; fld_ptr++) {
+		Field	*fld = *fld_ptr;
+		byte	*old_ptr = fld->ptr;
+
+		/* Point the field at buf while storing; its own pointer is put back afterwards. */
+#if MYSQL_VERSION_ID < 50114
+		fld->ptr = (byte *) buf + fld->offset();
+#else
+		fld->ptr = (byte *) buf + fld->offset(fld->table->record[0]);
+#endif
+		/* Columns are recognised by the first letter of their name; others are left NULL. */
+		if (fld->field_name[0] == 'P') {
+			// Path			VARCHAR(128)
+			ASSERT_NS(strcmp(fld->field_name, "Path") == 0);
+			fld->store(path_rec->tp_path, strlen(path_rec->tp_path), &my_charset_utf8_general_ci);
+			xt_my_set_notnull_in_record(fld, buf);
+		}
+		else if (fld->field_name[0] == 'T') {
+			// Table_count   INT
+			ASSERT_NS(strcmp(fld->field_name, "Table_count") == 0);
+			fld->store(path_rec->tp_tab_count, true);
+			xt_my_set_notnull_in_record(fld, buf);
+		}
+		fld->ptr = old_ptr;
+	}
+	my_table->write_set = old_write_set;
+}
+
+xtWord4 XTLocationTable::seqScanPos(xtWord1 *XT_UNUSED(buf))	/* Position of the current scan row; buf is not used. */
+{
+	return lt_index-1;	/* NOTE(review): presumably lt_index was already advanced past the row just read - confirm in seqScanNext() */
+}
+
+bool XTLocationTable::seqScanRead(xtWord4 rec_id, char *buf)	/* Load the row identified by rec_id into buf. */
+{
+	loadRow(buf, rec_id);
+	return true;	/* always reports success */
+}
+
+/*
+ * -------------------------------------------------------------------------
+ * STATISTICS TABLE
+ */
+
+XTStatisticsTable::XTStatisticsTable(XTThreadPtr self, XTDatabaseHPtr db, XTSystemTableShare *share, TABLE *table):
+XTOpenSystemTable(self, db, share, table)	/* all arguments are forwarded to the base class */
+{
+}
+
+XTStatisticsTable::~XTStatisticsTable()
+{
+	unuse();	/* XTStatisticsTable::unuse() currently does nothing but return true */
+}
+
+bool XTStatisticsTable::use()	/* No per-use setup is done; always returns true. */
+{
+	return true;
+}
+
+bool XTStatisticsTable::unuse()	/* Nothing to release; always returns true. */
+{
+	return true;
+}
+
+
+bool XTStatisticsTable::seqScanInit()	/* Begin a sequential scan of the statistics table. */
+{
+	tt_index = 0;	/* the scan restarts at statistic number 0 */
+	xt_gather_statistics(&tt_statistics);	/* presumably snapshots the statistics that loadRow() later reads from tt_statistics - confirm */
+	return true;
+}
+
+bool XTStatisticsTable::seqScanNext(char *buf, bool *eof)
+{
+	/* Produce the next statistics row in buf.
+	 * Once tt_index has reached XT_STAT_CURRENT_MAX, *eof is set to
+	 * true and the function returns false; otherwise *eof is false,
+	 * the row is loaded, tt_index advances and true is returned.
+	 */
+	if (tt_index >= XT_STAT_CURRENT_MAX) {
+		*eof = true;
+		return false;
+	}
+
+	*eof = false;
+	loadRow(buf, tt_index);
+	tt_index++;
+	return true;
+}
+
+void XTStatisticsTable::loadRow(char *buf, xtWord4 rec_id)
+{
+	TABLE			*table = ost_my_table;
+	MX_BITMAP		*save_write_set;
+	Field			*curr_field;
+	byte			*save;
+	const char		*stat_name;
+	u_llong			stat_value;
+
+	/* The write set is removed while the row is built and restored at
+	 * the end of the function: the original author found that
+	 * ASSERT_COLUMN_MARKED_FOR_WRITE fails when store() is called here.
+	 */
+	save_write_set = table->write_set;
+	table->write_set = NULL;
+
+	memset(buf, 0xFF, table->s->null_bytes);	/* null-flag bytes all 0xFF; stored columns are flagged not-null below */
+
+	stat_name = xt_get_stat_meta_data(rec_id)->sm_name;
+	stat_value = xt_get_statistic(&tt_statistics, ost_db, rec_id);
+
+ 	for (Field **field=table->field ; *field ; field++) {
+ 		curr_field = *field;
+
+		save = curr_field->ptr;	/* the field's own pointer is restored after the switch */
+#if MYSQL_VERSION_ID < 50114
+		curr_field->ptr = (byte *) buf + curr_field->offset();
+#else
+		curr_field->ptr = (byte *) buf + curr_field->offset(curr_field->table->record[0]);
+#endif
+		switch (curr_field->field_name[0]) {	/* columns are recognised by the first letter of their name */
+			case 'I':
+				// ID: the statistic number plus one (rec_id+1)
+				ASSERT_NS(strcmp(curr_field->field_name, "ID") == 0);
+				curr_field->store(rec_id+1, true);
+				xt_my_set_notnull_in_record(curr_field, buf);
+				break;
+			case 'N':
+				// Name VARCHAR(40)
+				ASSERT_NS(strcmp(curr_field->field_name, "Name") == 0);
+				curr_field->store(stat_name, strlen(stat_name), &my_charset_utf8_general_ci);
+				xt_my_set_notnull_in_record(curr_field, buf);
+				break;
+			case 'V':
+				// Value BIGINT
+				ASSERT_NS(strcmp(curr_field->field_name, "Value") == 0);
+				curr_field->store(stat_value, true);
+				xt_my_set_notnull_in_record(curr_field, buf);
+				break;
+		}
+		curr_field->ptr = save;
+	}
+	table->write_set = save_write_set;
+}
+
+xtWord4 XTStatisticsTable::seqScanPos(xtWord1 *XT_UNUSED(buf))	/* Position of the current scan row; buf is not used. */
+{
+	return tt_index-1;	/* seqScanNext() increments tt_index after loading a row, so this is the row just returned */
+}
+
+bool XTStatisticsTable::seqScanRead(xtWord4 rec_id, char *buf)	/* Load the statistics row numbered rec_id into buf. */
+{
+	loadRow(buf, rec_id);
+	return true;	/* always reports success */
+}
+
+/*
+ * -------------------------------------------------------------------------
+ * SYSTEM TABLE SHARES
+ */
+
+static void st_path_to_table_name(size_t size, char *buffer, const char *path)
+{
+	/* buffer receives the last two names of path, without extension, with the first '\\' and the first '/' each turned into '.'. */
+	const char	separators[2] = { '\\', '/' };
+	xt_strcpy(size, buffer, xt_last_2_names_of_path(path));
+	xt_remove_extension(buffer);
+	for (int s = 0; s < 2; s++) {
+		char *hit = strchr(buffer, separators[s]);
+		if (hit) *hit = '.';
+	}
+}
+
+void XTSystemTableShare::startUp(XTThreadPtr XT_UNUSED(self))	/* Initialise the locks of the location and statistics system tables. */
+{
+	thr_lock_init(&sys_location_lock);
+	thr_lock_init(&sys_statistics_lock);
+	sys_lock_inited = TRUE;	/* shutDown() deletes the locks only when this flag is set */
+}
+
+void XTSystemTableShare::shutDown(XTThreadPtr XT_UNUSED(self))	/* Delete the system-table locks if startUp() created them. */
+{
+	if (sys_lock_inited) {
+		thr_lock_delete(&sys_location_lock);
+		thr_lock_delete(&sys_statistics_lock);
+		sys_lock_inited = FALSE;	/* a second shutDown() call then does nothing */
+	}
+}
+
+bool XTSystemTableShare::isSystemTable(const char *table_path)
+{
+	/* True when table_path, reduced by st_path_to_table_name(), matches
+	 * the sts_path of an xt_internal_tables entry (case-insensitive).
+	 */
+	char	name[100];
+
+	st_path_to_table_name(sizeof(name), name, table_path);
+	for (int idx = 0; xt_internal_tables[idx].sts_path; idx++) {
+		if (!strcasecmp(name, xt_internal_tables[idx].sts_path)) return true;
+	}
+	return false;
+}
+
+void XTSystemTableShare::setSystemTableDeleted(const char *table_path)
+{
+	/* Clear sts_exists on the first xt_internal_tables entry matching
+	 * table_path (case-insensitive); unknown paths are ignored. */
+	char	name[100];
+
+	st_path_to_table_name(sizeof(name), name, table_path);
+	for (int idx = 0; xt_internal_tables[idx].sts_path; idx++) {
+		if (strcasecmp(name, xt_internal_tables[idx].sts_path) != 0)
+			continue;
+		xt_internal_tables[idx].sts_exists = FALSE;
+		break;
+	}
+}
+
+bool XTSystemTableShare::doesSystemTableExist()
+{
+	/* True when at least one entry of xt_internal_tables has sts_exists set. */
+	bool found = false;
+
+	for (int idx = 0; !found && xt_internal_tables[idx].sts_path; idx++) {
+		if (xt_internal_tables[idx].sts_exists)
+			found = true;
+	}
+	return found;
+}
+
+void XTSystemTableShare::createSystemTables(XTThreadPtr XT_UNUSED(self), XTDatabaseHPtr XT_UNUSED(db))	/* Calls xt_create_table_frm() for every entry of xt_internal_tables. */
+{
+	int		i = 0;
+
+	while (xt_internal_tables[i].sts_path) {	/* the list ends at the first NULL sts_path */
+		if (!xt_create_table_frm(pbxt_hton,
+			current_thd, "pbxt",
+			strchr(xt_internal_tables[i].sts_path, '.') + 1,	/* text after the first '.'; NOTE(review): assumes every sts_path contains a '.' */
+			xt_internal_tables[i].sts_info,
+			xt_internal_tables[i].sts_keys,
+			TRUE /*do not recreate*/))
+			xt_internal_tables[i].sts_exists = TRUE;	/* flagged only when xt_create_table_frm() returned zero */
+		i++;
+	}
+}
+
+XTOpenSystemTable *XTSystemTableShare::openSystemTable(XTThreadPtr self, const char *table_path, TABLE *table)	/* Returns a new open-table object for table_path, or NULL if it names no system table. */
+{
+	XTSystemTableShare	*share;
+	XTOpenSystemTable	*otab = NULL;
+	int					i = 0;
+	char				tab_name[100];
+
+	st_path_to_table_name(100, tab_name, table_path);
+	while (xt_internal_tables[i].sts_path) {	/* case-insensitive search of the table list */
+		if (strcasecmp(tab_name, xt_internal_tables[i].sts_path) == 0) {
+			share = &xt_internal_tables[i];
+			goto found;
+		}
+		i++;
+	}
+	return NULL;	/* no entry matched */
+
+	found:
+	share->sts_exists = TRUE;	/* set before the table object is created */
+	switch (share->sts_id) {	/* the share's id selects the concrete table class */
+		case XT_SYSTAB_LOCATION_ID:
+			if (!(otab = new XTLocationTable(self, self->st_database, share, table)))
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			break;
+		case XT_SYSTAB_STATISTICS_ID:
+			if (!(otab = new XTStatisticsTable(self, self->st_database, share, table)))
+				xt_throw_errno(XT_CONTEXT, XT_ENOMEM);
+			break;
+		default:
+			xt_throw_taberr(XT_CONTEXT, XT_ERR_TABLE_NOT_FOUND, (XTPathStrPtr) table_path);	/* a listed share with an id not handled above */
+			break;
+	}	
+
+	return otab;
+}
+
+void XTSystemTableShare::releaseSystemTable(XTOpenSystemTable *tab)	/* Drop tab's reference to its database, if it still holds one. */
+{
+	if (tab->ost_db) {
+		XTThreadPtr self = xt_get_self();
+
+		try_(a) {
+			xt_heap_release(self, tab->ost_db);
+		}
+		catch_(a) {	/* empty handler: an error raised by the release is swallowed */
+		}
+		cont_(a);
+		tab->ost_db = NULL;	/* cleared whether or not the release raised an error */
+	}
+}
diff --git a/storage/pbxt/src/systab_xt.h b/storage/pbxt/src/systab_xt.h
new file mode 100644
index 00000000000..408a8749dd0
--- /dev/null
+++ b/storage/pbxt/src/systab_xt.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2008 PrimeBase Technologies GmbH, Germany
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Paul McCullagh
+ *
+ * 2007-07-18
+ *
+ * H&G2JCtL
+ *
+ * PBXT System tables.
+ *
+ */
+
+/*
+
+DROP TABLE IF EXISTS pbms_repository;
+CREATE TABLE pbms_repository (
+	Repository_id     INT COMMENT 'The repository file number',
+	Repo_blob_offset  BIGINT COMMENT 'The offset of the BLOB in the repository file',
+	Blob_size         BIGINT COMMENT 'The size of the BLOB in bytes',
+	Head_size         SMALLINT UNSIGNED COMMENT 'The size of the BLOB header - precedes the BLOB data',
+	Access_code       INT COMMENT 'The 4-byte authorisation code required to access the BLOB - part of the BLOB URL',
+	Creation_time     TIMESTAMP COMMENT 'The time the BLOB was created',
+	Last_ref_time     TIMESTAMP COMMENT 'The last time the BLOB was referenced',
+	Last_access_time  TIMESTAMP COMMENT 'The last time the BLOB was accessed (read)',
+	Content_type      CHAR(128) COMMENT 'The content type of the BLOB - returned by HTTP GET calls',
+	Blob_data         LONGBLOB COMMENT 'The data of this BLOB'
+) ENGINE=PBMS;
+
+	PRIMARY KEY (Repository_id, Repo_blob_offset)
+
+DROP TABLE IF EXISTS pbms_reference;
+CREATE TABLE pbms_reference (
+	Table_name        CHAR(64) COMMENT 'The name of the referencing table',
+	Blob_id           BIGINT COMMENT 'The BLOB reference number - part of the BLOB URL',
+	Column_name       CHAR(64) COMMENT 'The column name of the referencing field',
+	Row_condition     VARCHAR(255) COMMENT 'This condition identifies the row in the table',
+	Blob_url          VARCHAR(200) COMMENT 'The BLOB URL for HTTP GET access',
+	Repository_id     INT COMMENT 'The repository file number of the BLOB',
+	Repo_blob_offset  BIGINT COMMENT 'The offset in the repository file',
+	Blob_size         BIGINT COMMENT 'The size of the BLOB in bytes',
+	Deletion_time     TIMESTAMP COMMENT 'The time the BLOB was deleted',
+	Remove_in         INT COMMENT 'The number of seconds before the reference/BLOB is removed permanently',
+	Temp_log_id       INT COMMENT 'Temporary log number of the referencing deletion entry',
+	Temp_log_offset   BIGINT COMMENT 'Temporary log offset of the referencing deletion entry'
+) ENGINE=PBMS;
+
+	PRIMARY KEY (Table_name, Blob_id, Column_name, Row_condition)
+*/
+
+#ifndef __SYSTAB_XT_H__
+#define __SYSTAB_XT_H__
+
+#include "ccutils_xt.h"
+#include "discover_xt.h"
+#include "thread_xt.h"
+
+struct XTSystemTableShare;
+struct XTDatabase;
+
+class XTOpenSystemTable : public XTObject {	/* Base class of an open system table; the virtuals below supply defaults. */
+public:
+	XTSystemTableShare		*ost_share;
+	TABLE					*ost_my_table;	/* table whose field list the loadRow() implementations walk */
+	struct XTDatabase		*ost_db;	/* set to NULL by XTSystemTableShare::releaseSystemTable() after release */
+
+	XTOpenSystemTable(XTThreadPtr self, struct XTDatabase *db, XTSystemTableShare *share, TABLE *table);
+	virtual ~XTOpenSystemTable();
+
+	virtual bool use() { return true; }
+	virtual bool unuse() { return true; }
+	virtual bool seqScanInit() { return true; }
+	virtual bool seqScanNext(char *XT_UNUSED(buf), bool *eof) {	/* default scan is empty: reports end-of-file at once */
+		*eof = true;
+		return false;
+	}
+	virtual int	getRefLen() { return 4; }	/* NOTE(review): presumably the byte length of a row reference (xtWord4 positions) - confirm */
+	virtual xtWord4 seqScanPos(xtWord1 *XT_UNUSED(buf)) {
+		return 0;
+	}
+	virtual bool seqScanRead(xtWord4 XT_UNUSED(rec_id), char *XT_UNUSED(buf)) {	/* default reads nothing and reports success */
+		return true;
+	}
+
+private:
+};
+
+class XTLocationTable : public XTOpenSystemTable {	/* System table with columns Path and Table_count, one row per entry of db_table_paths. */
+	u_int	lt_index;	/* scan index; seqScanPos() returns lt_index-1 */
+
+public:
+	XTLocationTable(XTThreadPtr self, struct XTDatabase *db, XTSystemTableShare *share, TABLE *table);
+	virtual ~XTLocationTable();
+
+	virtual bool use();
+	virtual bool unuse();
+	virtual bool seqScanInit();
+	virtual bool seqScanNext(char *buf, bool *eof);
+	virtual void loadRow(char *buf, xtWord4 row_id);	/* fills buf with the row for table-path entry row_id */
+	virtual xtWord4 seqScanPos(xtWord1 *buf);
+	virtual bool seqScanRead(xtWord4 rec_id, char *buf);	/* loads row rec_id via loadRow() */
+};
+
+class XTStatisticsTable : public XTOpenSystemTable {	/* System table with columns ID, Name and Value, one row per statistic. */
+	u_int				tt_index;	/* number of the next statistic the scan will return */
+	XTStatisticsRec		tt_statistics;	/* passed to xt_gather_statistics() by seqScanInit() and read by loadRow() */
+
+public:
+	XTStatisticsTable(XTThreadPtr self, struct XTDatabase *db, XTSystemTableShare *share, TABLE *table);
+	virtual ~XTStatisticsTable();
+
+	virtual bool use();
+	virtual bool unuse();
+	virtual bool seqScanInit();
+	virtual bool seqScanNext(char *buf, bool *eof);	/* returns false with *eof set once XT_STAT_CURRENT_MAX rows were returned */
+	virtual void loadRow(char *buf, xtWord4 row_id);	/* fills buf with the row for statistic number row_id */
+	virtual xtWord4 seqScanPos(xtWord1 *buf);
+	virtual bool seqScanRead(xtWord4 rec_id, char *buf);
+};
+
+typedef struct XTSystemTableShare {	/* One entry of the xt_internal_tables list, plus static helpers working on that list. */
+	u_int						sts_id;	/* selects the table class in openSystemTable() (XT_SYSTAB_..._ID) */
+	const char					*sts_path;	/* name compared against the reduced table path; NULL ends the list */
+	THR_LOCK					*sts_my_lock;
+	DT_FIELD_INFO				*sts_info;	/* passed to xt_create_table_frm() */
+	DT_KEY_INFO					*sts_keys;	/* passed to xt_create_table_frm() */
+	xtBool						sts_exists;	/* set on create/open, cleared by setSystemTableDeleted() */
+
+	static void					startUp(XTThreadPtr self);
+	static void					shutDown(XTThreadPtr self);
+	
+	static bool					isSystemTable(const char *table_path);
+	static void					setSystemTableDeleted(const char *table_path);
+	static bool					doesSystemTableExist();	/* true if any entry has sts_exists set */
+	static void					createSystemTables(XTThreadPtr self, struct XTDatabase *db);
+	static XTOpenSystemTable	*openSystemTable(XTThreadPtr self, const char *table_path, TABLE *table);	/* NULL when table_path is not a system table */
+	static void					releaseSystemTable(XTOpenSystemTable *tab);
+} XTSystemTableShareRec, *XTSystemTableSharePtr;
+
+#endif
diff --git a/storage/pbxt/src/tabcache_xt.cc b/storage/pbxt/src/tabcache_xt.cc
new file mode 100644
index 00000000000..92958f2da49
--- /dev/null
+++ b/storage/pbxt/src/tabcache_xt.cc
@@ -0,0 +1,1339 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-10-30	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * The new table cache. Caches all non-index data. This includes the data
+ * files and the row pointer files.
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <signal.h>
+
+#include "pthread_xt.h"
+#include "tabcache_xt.h"
+#include "table_xt.h"
+#include "database_xt.h"
+#include "trace_xt.h"
+#include "myxt_xt.h"
+
+xtPublic XTTabCacheMemRec	xt_tab_cache;
+
+static void tabc_fr_wait_for_cache(XTThreadPtr self, u_int msecs);
+
+xtPublic void xt_tc_set_cache_size(size_t cache_size)
+{
+	/* Record the cache size and derive the low/mid/high levels from it. */
+	const xtWord8 total = (xtWord8) cache_size;	// 8-byte arithmetic: size * percent can overflow a 4 byte value
+	xt_tab_cache.tcm_cache_size = cache_size;
+	xt_tab_cache.tcm_low_level = (size_t) (total * (xtWord8) 70 / (xtWord8) 100);	// low level: 70%
+	xt_tab_cache.tcm_high_level = (size_t) (total * (xtWord8) 95 / (xtWord8) 100);	// high level: 95%
+	xt_tab_cache.tcm_mid_level = (size_t) (total * (xtWord8) 85 / (xtWord8) 100);	// mid level: 85%
+}
+
+/*
+ * Initialize the table cache: size levels, per-segment hash tables, locks and condition variables.
+ */
+xtPublic void xt_tc_init(XTThreadPtr self, size_t cache_size)
+{
+	xt_tc_set_cache_size(cache_size);
+
+	xt_tab_cache.tcm_approx_page_count = cache_size / sizeof(XTTabCachePageRec);
+	/* Determine the size of each segment's hash table: twice the
+	 * approximate page count, divided over XT_TC_SEGMENT_COUNT segments.
+	 */
+	xt_tab_cache.tcm_hash_size = (xt_tab_cache.tcm_approx_page_count * 2) / XT_TC_SEGMENT_COUNT;	/* NOTE(review): 0 for a very small cache_size, and lookups take % tcm_hash_size - confirm callers enforce a minimum */
+
+	try_(a) {
+		for (u_int i=0; i<XT_TC_SEGMENT_COUNT; i++) {
+			xt_tab_cache.tcm_segment[i].tcs_cache_in_use = 0;
+			xt_tab_cache.tcm_segment[i].tcs_hash_table = (XTTabCachePagePtr *) xt_calloc(self, xt_tab_cache.tcm_hash_size * sizeof(XTTabCachePagePtr));
+			TAB_CAC_INIT_LOCK(self, &xt_tab_cache.tcm_segment[i].tcs_lock);
+		}
+
+		xt_init_mutex_with_autoname(self, &xt_tab_cache.tcm_lock);
+		xt_init_cond(self, &xt_tab_cache.tcm_cond);
+		xt_init_mutex_with_autoname(self, &xt_tab_cache.tcm_freeer_lock);
+		xt_init_cond(self, &xt_tab_cache.tcm_freeer_cond);
+	}
+	catch_(a) {
+		xt_tc_exit(self);	/* NOTE(review): xt_tc_exit() frees all four mutexes/conds even if the failure came before they were initialised - confirm that is safe */
+		throw_();	/* the error is passed on to the caller after cleanup */
+	}
+	cont_(a);
+}
+
+xtPublic void xt_tc_exit(XTThreadPtr self)	/* Free every cached page, the segment hash tables and the cache locks/conditions. */
+{
+	XTTabCacheSegPtr seg;
+
+	for (u_int i=0; i<XT_TC_SEGMENT_COUNT; i++) {
+		seg = &xt_tab_cache.tcm_segment[i];
+		if (seg->tcs_hash_table) {	/* segments whose table is NULL are skipped */
+			XTTabCachePagePtr page, tmp_page;
+
+			for (size_t j=0; j<xt_tab_cache.tcm_hash_size; j++) {
+				page = seg->tcs_hash_table[j];
+				while (page) {	/* walk the chain of pages hanging off this slot */
+					tmp_page = page;
+					page = page->tcp_next;	/* step on before the current page is freed */
+					ASSERT_NS(seg->tcs_cache_in_use >= offsetof(XTTabCachePageRec, tcp_data) + tmp_page->tcp_data_size);
+					seg->tcs_cache_in_use -= (offsetof(XTTabCachePageRec, tcp_data) + tmp_page->tcp_data_size);	/* page header plus its data bytes */
+					ASSERT_NS(seg->tcs_cache_in_use == 0 || seg->tcs_cache_in_use >= 25000);	/* NOTE(review): the 25000 bound is not explained in this file */
+					xt_free(self, tmp_page);
+				}
+			}
+
+			xt_free(self, seg->tcs_hash_table);
+			seg->tcs_hash_table = NULL;	/* a repeated call then skips this segment */
+			TAB_CAC_FREE_LOCK(self, &seg->tcs_lock);
+		}
+		ASSERT_NS(seg->tcs_cache_in_use == 0);
+	}
+
+	xt_free_mutex(&xt_tab_cache.tcm_lock);
+	xt_free_cond(&xt_tab_cache.tcm_cond);
+	xt_free_mutex(&xt_tab_cache.tcm_freeer_lock);
+	xt_free_cond(&xt_tab_cache.tcm_freeer_cond);
+}
+
+xtPublic xtInt8 xt_tc_get_usage()
+{
+	/* Sum of tcs_cache_in_use over all XT_TC_SEGMENT_COUNT segments; no segment lock is taken here. */
+	xtInt8	total = 0;
+	u_int	seg_no = 0;
+	while (seg_no < XT_TC_SEGMENT_COUNT)
+		total += xt_tab_cache.tcm_segment[seg_no++].tcs_cache_in_use;
+	return total;
+}
+
+xtPublic xtInt8 xt_tc_get_size()	/* The configured cache size, as stored by xt_tc_set_cache_size(). */
+{
+	return (xtInt8) xt_tab_cache.tcm_cache_size;
+}
+
+xtPublic xtInt8 xt_tc_get_high()	/* Returns tcm_cache_high; NOTE(review): presumably a high-water mark of cache usage - it is not written in this part of the file */
+{
+	return (xtInt8) xt_tab_cache.tcm_cache_high;
+}
+
+#ifdef DEBUG
+xtPublic void xt_check_table_cache(XTTableHPtr tab)	/* Debug-only consistency check of the LRU list; tab may be NULL. */
+{
+	XTTabCachePagePtr page, ppage;
+
+	xt_lock_mutex_ns(&xt_tab_cache.tcm_lock);	/* tcm_lock is held for the whole walk */
+	ppage = NULL;
+	page = xt_tab_cache.tcm_lru_page;	/* walk from the least towards the most recently used page */
+	while (page) {
+		if (tab) {
+			if (page->tcp_db_id == tab->tab_db->db_id && page->tcp_tab_id == tab->tab_id) {
+				ASSERT_NS(!XTTableSeq::xt_op_is_before(tab->tab_seq.ts_next_seq, page->tcp_op_seq));	/* checked only for pages belonging to tab */
+			}
+		}
+		ASSERT_NS(page->tcp_lr_used == ppage);	/* the back link must name the page visited just before */
+		ppage = page;
+		page = page->tcp_mr_used;
+	}
+	ASSERT_NS(xt_tab_cache.tcm_mru_page == ppage);	/* the walk must end on the recorded MRU page */
+	xt_unlock_mutex_ns(&xt_tab_cache.tcm_lock);
+}
+#endif
+
+void XTTabCache::xt_tc_setup(XTTableHPtr tab, size_t head_size, size_t rec_size)	/* Record the table, header size and record size, and derive the page geometry. */
+{
+	tci_table = tab;
+	tci_header_size = head_size;
+	tci_rec_size = rec_size;
+	tci_rows_per_page = (XT_TC_PAGE_SIZE / rec_size) + 1;	/* NOTE(review): divides by rec_size, so rec_size must be non-zero */
+	if (tci_rows_per_page < 2)
+		tci_rows_per_page = 2;	/* never fewer than two rows per page */
+	tci_page_size = tci_rows_per_page * rec_size;	/* a page always holds a whole number of records */
+}
+
+/*
+ * Copy size bytes of data into the cached page of ref_id, inc bytes into the
+ * record, mark the page dirty and set *op_seq from ts_set_op_seq(). Returns FAILED if tc_fetch() fails, else OK.
+ * Assumes a write never crosses a page boundary (writes are at most one row, and pages hold whole rows).
+ */
+xtBool XTTabCache::xt_tc_write(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t inc, size_t size, xtWord1 *data, xtOpSeqNo *op_seq, xtBool read, XTThreadPtr thread)
+{
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+
+	/*
+	retry:
+	*/
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, read, thread))
+		return FAILED;
+	/* Don't write while there is a read lock on the page,
+	 * which can happen during a sequential scan...
+	 *
+	 * This will have to be OK.
+	 * I cannot wait for the lock because a thread locks
+	 * itself out when updating during a sequential scan.
+	 *
+	 * However, I don't think this is a problem, because
+	 * the only records that are changed, are records
+	 * containing uncommitted data. Such records should
+	 * be ignored by a sequential scan. As long as
+	 * we don't crash due to reading half written
+	 * data!
+	 *
+	if (page->tcp_lock_count) {
+		if (!xt_timed_wait_cond_ns(&seg->tcs_cond, &seg->tcs_lock, 100)) {
+			xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+			return FAILED;
+		}
+		xt_rwmutex_unlock(&seg->tcs_lock, thread->t_id);
+		// The page may have dissappeared from the cache, while we were sleeping!
+		goto retry;
+	}
+	*/
+	/* The whole write (size bytes), not only its first 4 bytes, must fit on the page: */
+	ASSERT_NS(offset + inc + size <= tci_page_size);
+	memcpy(page->tcp_data + offset + inc, data, size);
+	/* GOTCHA, this was "op_seq > page->tcp_op_seq", however
+	 * this does not handle overflow!
+	if (XTTableSeq::xt_op_is_before(page->tcp_op_seq, op_seq))
+		page->tcp_op_seq = op_seq;
+	 */
+
+	page->tcp_dirty = TRUE;
+	ASSERT_NS(page->tcp_db_id == tci_table->tab_db->db_id && page->tcp_tab_id == tci_table->tab_id);
+	*op_seq = tci_table->tab_seq.ts_set_op_seq(page);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);	/* releases the segment lock that tc_fetch() returned holding */
+	return OK;
+}
+
+/*
+ * This is a special version of write which is used to set the "clean" bit.
+ * The alternative would be to read the record first, but this
+ * is much quicker!
+ *
+ * This function also checks if xn_id, row_id and other data match (the checks 
+ * are similar to xn_sw_cleanup_done) before modifying the record, otherwise it 
+ * assumes that the record was already updated earlier and we must not set it to 
+ * clean.
+ *
+ * If the record was not modified the function returns FALSE.
+ *
+ * The function has a self pointer and can throw an exception.
+ */
+xtBool XTTabCache::xt_tc_write_cond(XTThreadPtr self, XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 new_type, xtOpSeqNo *op_seq, 
+	xtXactID xn_id, xtRowID row_id, u_int stat_id, u_int rec_type)
+{
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+	XTTabRecHeadDPtr	rec_head;
+
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, TRUE, self))
+		xt_throw(self);	/* a failed fetch is reported by throwing, not by the return value */
+
+	ASSERT(offset + 1 <= tci_page_size);	/* NOTE(review): only one byte is checked, although the header fields read below span more */
+
+	rec_head = (XTTabRecHeadDPtr)(page->tcp_data + offset);
+
+	/* Transaction must match: */
+	if (XT_GET_DISK_4(rec_head->tr_xact_id_4) != xn_id)
+		goto no_change;
+
+	/* Record header must match expected value from
+	 * log or clean has been done, or is not required.
+	 *
+	 * For example, it is not required if a record
+	 * has been overwritten in a transaction.
+	 */
+	if (rec_head->tr_rec_type_1 != rec_type ||
+		rec_head->tr_stat_id_1 != stat_id)
+		goto no_change;
+
+	/* Row must match: */
+	if (XT_GET_DISK_4(rec_head->tr_row_id_4) != row_id)
+		goto no_change;
+
+	*(page->tcp_data + offset) = new_type;	/* overwrite the first byte of the record with new_type */
+
+	page->tcp_dirty = TRUE;
+	ASSERT(page->tcp_db_id == tci_table->tab_db->db_id && page->tcp_tab_id == tci_table->tab_id);
+	*op_seq = tci_table->tab_seq.ts_set_op_seq(page);	/* *op_seq is written only when the record was changed */
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+	return TRUE;
+
+	no_change:
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);	/* the segment lock is released on this path too */
+	return FALSE;
+}
+
+xtBool XTTabCache::xt_tc_read(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread)	/* Copy size bytes of record ref_id into data; FAILED or OK. */
+{
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+	return tc_read_direct(file, ref_id, size, data, thread);
+#else
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, TRUE, thread))
+		return FAILED;
+	/* A read must be completely on a page: */
+	ASSERT_NS(offset + size <= tci_page_size);
+	memcpy(data, page->tcp_data + offset, size);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);	/* releases the segment lock that tc_fetch() returned holding */
+	return OK;
+#endif
+}
+
+xtBool XTTabCache::xt_tc_read_4(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord4 *value, XTThreadPtr thread)	/* Read the 4-byte disk value at the start of record ref_id into *value. */
+{
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+	register u_int				page_idx;
+	register XTTabCachePagePtr	page;
+	register XTTabCacheSegPtr	seg;
+	register u_int				hash_idx;
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+	off_t						address;
+
+	ASSERT_NS(ref_id);
+	ref_id--;	/* reference IDs start at 1; make it zero-based */
+	page_idx = ref_id / this->tci_rows_per_page;
+	address = (off_t) ref_id * (off_t) this->tci_rec_size + (off_t) this->tci_header_size;	/* file offset of the record */
+
+	hash_idx = page_idx + (file->fr_id * 223);
+	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];	/* low bits choose the segment */
+	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;	/* remaining bits choose the slot in that segment */
+
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
+	page = seg->tcs_hash_table[hash_idx];
+	while (page) {
+		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
+			size_t	offset;
+			xtWord1	*buffer;
+
+			offset = (ref_id % this->tci_rows_per_page) * this->tci_rec_size;
+			ASSERT_NS(offset + 4 <= this->tci_page_size);
+			buffer = page->tcp_data + offset;
+			*value = XT_GET_DISK_4(buffer);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+			return OK;	/* served from the cached page */
+		}
+		page = page->tcp_next;
+	}
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+
+	return xt_pread_fmap_4(file, address, value, &thread->st_statistics.st_rec, thread);	/* page not cached: read from the file instead */
+#else
+	size_t				offset;
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+	xtWord1				*data;
+
+	if (!tc_fetch(file, ref_id, &seg, &page, &offset, TRUE, thread))
+		return FAILED;
+	/* A read must be completely on a page: */
+	ASSERT_NS(offset + 4 <= tci_page_size);
+	data = page->tcp_data + offset;
+	*value = XT_GET_DISK_4(data);
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+	return OK;
+#endif
+}
+
+xtBool XTTabCache::xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtBool load, XTTabCachePagePtr *ret_page, size_t *offset, XTThreadPtr thread)	/* Return the page of ref_id with its tcp_lock_count raised; pair with xt_tc_release_page(). */
+{
+	XTTabCachePagePtr	page;
+	XTTabCacheSegPtr	seg;
+
+	if (load) {
+		if (!tc_fetch(file, ref_id, &seg, &page, offset, TRUE, thread))
+			return FAILED;
+	}
+	else {
+		if (!tc_fetch_direct(file, ref_id, &seg, &page, offset, thread))
+			return FAILED;
+		if (!seg) {	/* tc_fetch_direct() found no cached page and holds no lock */
+			*ret_page = NULL;
+			return OK;
+		}
+	}
+	page->tcp_lock_count++;	/* NOTE(review): tc_fetch() is documented as possibly returning a shared lock, so this increment may not be exclusive - confirm */
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+	*ret_page = page;
+	return OK;
+}
+
+void XTTabCache::xt_tc_release_page(XT_ROW_REC_FILE_PTR XT_UNUSED(file), XTTabCachePagePtr page, XTThreadPtr thread)	/* Undo the tcp_lock_count increment done by xt_tc_get_page(). */
+{
+	XTTabCacheSegPtr	seg;
+
+	seg = &xt_tab_cache.tcm_segment[page->tcp_seg];	/* the page records which segment it belongs to */
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
+
+#ifdef DEBUG
+	XTTabCachePagePtr lpage, ppage;
+
+	ppage = NULL;
+	lpage = seg->tcs_hash_table[page->tcp_hash_idx];	/* debug check: the page must still be in its hash chain */
+	while (lpage) {
+		if (lpage->tcp_page_idx == page->tcp_page_idx &&
+			lpage->tcp_file_id == page->tcp_file_id)
+			break;
+		ppage = lpage;
+		lpage = lpage->tcp_next;
+	}
+
+	ASSERT_NS(page == lpage);
+	ASSERT_NS(page->tcp_lock_count > 0);
+#endif
+
+	if (page->tcp_lock_count > 0)	/* never decrement below zero */
+		page->tcp_lock_count--;
+
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+}
+
+xtBool XTTabCache::xt_tc_read_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 *data, XTThreadPtr thread)	/* Read tci_page_size bytes starting at record ref_id into data. */
+{
+	return tc_read_direct(file, ref_id, this->tci_page_size, data, thread);	/* NOTE(review): presumably ref_id is the first record of a page, as tc_read_direct() asserts the read stays on one page - confirm */
+}
+
+/* Read row and record files directly.
+ * If the page is already cached the data is copied from it;
+ * otherwise this bypasses the cache and reads from the file,
+ * which means we rely on the OS for caching.
+ * This probably only makes sense when these files are memory mapped.
+ */
+xtBool XTTabCache::tc_read_direct(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	register u_int				page_idx;
+	register XTTabCachePagePtr	page;
+	register XTTabCacheSegPtr	seg;
+	register u_int				hash_idx;
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+	size_t						red_size;
+	off_t						address;
+
+	ASSERT_NS(ref_id);
+	ref_id--;	/* reference IDs start at 1; make it zero-based */
+	page_idx = ref_id / this->tci_rows_per_page;
+	address = (off_t) ref_id * (off_t) this->tci_rec_size + (off_t) this->tci_header_size;	/* file offset of the record */
+
+	hash_idx = page_idx + (file->fr_id * 223);
+	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];	/* low bits choose the segment */
+	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;	/* remaining bits choose the slot in that segment */
+
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
+	page = seg->tcs_hash_table[hash_idx];
+	while (page) {
+		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
+			size_t offset;
+
+			offset = (ref_id % this->tci_rows_per_page) * this->tci_rec_size;
+			ASSERT_NS(offset + size <= this->tci_page_size);
+			memcpy(data, page->tcp_data + offset, size);
+			TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+			return OK;	/* served from the cached page */
+		}
+		page = page->tcp_next;
+	}
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);	/* the file read below is done without the segment lock */
+	if (!XT_PREAD_RR_FILE(file, address, size, 0, data, &red_size, &thread->st_statistics.st_rec, thread))
+		return FAILED;
+	memset(data + red_size, 0, size - red_size);	/* zero-fill whatever the read did not supply */
+	return OK;
+}
+
+xtBool XTTabCache::tc_fetch_direct(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCacheSegPtr *ret_seg, XTTabCachePagePtr *ret_page, size_t *offset, XTThreadPtr thread)	/* Look up the page of ref_id in the cache only; never loads it. Always returns OK. */
+{
+	register u_int				page_idx;
+	register XTTabCachePagePtr	page;
+	register XTTabCacheSegPtr	seg;
+	register u_int				hash_idx;
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+
+	ASSERT_NS(ref_id);
+	ref_id--;	/* reference IDs start at 1; make it zero-based */
+	page_idx = ref_id / this->tci_rows_per_page;
+	*offset = (ref_id % this->tci_rows_per_page) * this->tci_rec_size;	/* byte offset of the record within its page */
+
+	hash_idx = page_idx + (file->fr_id * 223);
+	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];	/* low bits choose the segment */
+	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;	/* remaining bits choose the slot in that segment */
+
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
+	page = seg->tcs_hash_table[hash_idx];
+	while (page) {
+		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
+			*ret_seg = seg;
+			*ret_page = page;
+			return OK;	/* found: returns with the segment write lock still held */
+		}
+		page = page->tcp_next;
+	}
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+	*ret_seg = NULL;	/* not cached: both outputs are NULL and no lock is held */
+	*ret_page = NULL;
+	return OK;
+}
+
+/*
+ * Return the cache page for ref_id (loaded first if needed) with the segment
+ * lock held. Note, the lock may be shared or exclusive: shared if the page
+ * was found on the first lookup, exclusive if the cache had to be modified
+ * or re-checked under the write lock. Returns FAILED with no lock held.
+ */
+xtBool XTTabCache::tc_fetch(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCacheSegPtr *ret_seg, XTTabCachePagePtr *ret_page, size_t *offset, xtBool read, XTThreadPtr thread)
+{
+	register u_int				page_idx;
+	register XTTabCachePagePtr	page, new_page;
+	register XTTabCacheSegPtr	seg;
+	register u_int				hash_idx;
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+	size_t						red_size;
+	off_t						address;
+
+	ASSERT_NS(ref_id);
+	ref_id--;
+	page_idx = ref_id / this->tci_rows_per_page;
+	address = (off_t) page_idx * (off_t) this->tci_page_size + (off_t) this->tci_header_size;
+	*offset = (ref_id % this->tci_rows_per_page) * this->tci_rec_size;
+
+	hash_idx = page_idx + (file->fr_id * 223);
+	seg = &dcg->tcm_segment[hash_idx & XT_TC_SEGMENT_MASK];
+	hash_idx = (hash_idx >> XT_TC_SEGMENT_SHIFTS) % dcg->tcm_hash_size;
+
+	TAB_CAC_READ_LOCK(&seg->tcs_lock, thread->t_id);
+	page = seg->tcs_hash_table[hash_idx];
+	while (page) {
+		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
+			/* Only re-position pages that have not been touched for a while: */
+			if (XT_TIME_DIFF(page->tcp_ru_time, dcg->tcm_ru_now) > (dcg->tcm_approx_page_count >> 1)) {
+				/* Move to the front of the MRU list: */
+				xt_lock_mutex_ns(&dcg->tcm_lock);
+
+				page->tcp_ru_time = ++dcg->tcm_ru_now;
+				if (dcg->tcm_mru_page != page) {
+					/* Remove from the MRU list: */
+					if (dcg->tcm_lru_page == page)
+						dcg->tcm_lru_page = page->tcp_mr_used;
+					if (page->tcp_lr_used)
+						page->tcp_lr_used->tcp_mr_used = page->tcp_mr_used;
+					if (page->tcp_mr_used)
+						page->tcp_mr_used->tcp_lr_used = page->tcp_lr_used;
+	
+					/* Make the page the most recently used: */
+					if ((page->tcp_lr_used = dcg->tcm_mru_page))
+						dcg->tcm_mru_page->tcp_mr_used = page;
+					page->tcp_mr_used = NULL;
+					dcg->tcm_mru_page = page;
+					if (!dcg->tcm_lru_page)
+						dcg->tcm_lru_page = page;
+				}
+				xt_unlock_mutex_ns(&dcg->tcm_lock);
+			}
+			*ret_seg = seg;
+			*ret_page = page;
+			thread->st_statistics.st_rec_cache_hit++;
+			return OK;
+		}
+		page = page->tcp_next;
+	}
+	
+	size_t page_size = offsetof(XTTabCachePageRec, tcp_data) + this->tci_page_size;
+
+	TAB_CAC_UNLOCK(&seg->tcs_lock, thread->t_id);
+	
+	/* Page not found, allocate a new page: */
+	if (!(new_page = (XTTabCachePagePtr) xt_malloc_ns(page_size)))
+		return FAILED;
+
+	/* Check the level of the cache: */
+	size_t cache_used = 0;
+	for (int i=0; i<XT_TC_SEGMENT_COUNT; i++)
+		cache_used += dcg->tcm_segment[i].tcs_cache_in_use;
+	/* High water mark, counting the page just allocated (must never go down): */
+	if (cache_used + page_size > dcg->tcm_cache_high)
+		dcg->tcm_cache_high = cache_used + page_size;
+
+	if (cache_used + page_size > dcg->tcm_cache_size) {
+		XTThreadPtr self;
+		time_t		now;
+
+		/* Wait for the cache level to go down.
+		 * If this happens, then the freeer is not working fast
+		 * enough!
+		 */
+
+		/* But before I do this, I must flush my own log because:
+		 * - The freeer might be waiting for a page to be cleaned.
+		 * - The page can only be cleaned once it has been written to
+		 *   the database.
+		 * - The writer cannot write the page data until it has been
+		 *   flushed to the log.
+		 * - The log won't be flushed, unless this thread does it.
+		 * So there could be a deadlock if I don't flush the log!
+		 */
+		if ((self = xt_get_self())) {
+			if (!xt_xlog_flush_log(tci_table->tab_db, self))
+				goto failed;
+		}
+
+		/* Wait for the free'er thread: */
+		xt_lock_mutex_ns(&dcg->tcm_freeer_lock);
+		now = time(NULL);
+		do {
+			/* I have set the timeout to 2 here because of the following situation:
+			 * 1. Transaction allocates an op seq
+			 * 2. Transaction goes to update cache, but must wait for
+			 *    cache to be freed (after this, the op would be written to
+			 *    the log).
+			 * 3. The free'er wants to free cache, but is waiting for the writer.
+			 * 4. The writer cannot continue because an op seq is missing!
+			 *    So the writer is waiting for the transaction thread to write
+			 *    the op seq.
+			 * - So we have a deadlock situation.
+			 * - However, this situation can only occur if there is not enough
+			 *   cache.
+			 * The timeout helps, but will not solve the problem, unless we
+			 * ignore cache level here, after a while, and just continue.
+			 */
+
+			/* Wake freeer before we go to sleep: */
+			if (!dcg->tcm_freeer_busy) {
+				if (!xt_broadcast_cond_ns(&dcg->tcm_freeer_cond))
+					xt_log_and_clear_exception_ns();
+			}
+
+			dcg->tcm_threads_waiting++;
+#ifdef DEBUG
+			if (!xt_timed_wait_cond_ns(&dcg->tcm_freeer_cond, &dcg->tcm_freeer_lock, 30000)) {
+				dcg->tcm_threads_waiting--;
+				break;
+			}
+#else
+			if (!xt_timed_wait_cond_ns(&dcg->tcm_freeer_cond, &dcg->tcm_freeer_lock, 1000)) {
+				dcg->tcm_threads_waiting--;
+				break;
+			}
+#endif
+			dcg->tcm_threads_waiting--;
+
+			cache_used = 0;
+			for (int i=0; i<XT_TC_SEGMENT_COUNT; i++)
+				cache_used += dcg->tcm_segment[i].tcs_cache_in_use;
+
+			if (cache_used + page_size <= dcg->tcm_high_level)
+				break;
+			/*
+			 * If there is too little cache we can get stuck here.
+			 * The problem is that seq numbers are allocated before fetching a
+			 * record to be updated.
+			 *
+			 * It can happen that we end up waiting for that seq number
+			 * to be written to the log before we can continue here.
+			 *
+			 * This happens as follows:
+			 * 1. This thread waits for the freeer.
+			 * 2. The freeer cannot free a page because it has not been
+			 *    written by the writer.
+			 * 3. The writer cannot continue because it is waiting
+			 *    for a missing sequence number.
+			 * 4. The missing sequence number is the one allocated
+			 *    before we entered this function!
+			 * 
+			 * So don't wait for more than 5 seconds here!
+			 */
+		}
+		while (time(NULL) < now + 5);
+		xt_unlock_mutex_ns(&dcg->tcm_freeer_lock);
+	}
+	else if (cache_used + page_size > dcg->tcm_high_level) {
+		/* Wake up the freeer because the cache level,
+		 * is higher than the high level.
+		 */
+		if (!dcg->tcm_freeer_busy) {
+			xt_lock_mutex_ns(&xt_tab_cache.tcm_freeer_lock);
+			if (!xt_broadcast_cond_ns(&xt_tab_cache.tcm_freeer_cond))
+				xt_log_and_clear_exception_ns();
+			xt_unlock_mutex_ns(&xt_tab_cache.tcm_freeer_lock);
+		}
+	}
+
+	/* Read the page into memory.... */
+	new_page->tcp_dirty = FALSE;
+	new_page->tcp_seg = (xtWord1) ((page_idx + (file->fr_id * 223)) & XT_TC_SEGMENT_MASK);
+	new_page->tcp_lock_count = 0;
+	new_page->tcp_hash_idx = hash_idx;
+	new_page->tcp_page_idx = page_idx;
+	new_page->tcp_file_id = file->fr_id;
+	new_page->tcp_db_id = this->tci_table->tab_db->db_id;
+	new_page->tcp_tab_id = this->tci_table->tab_id;
+	new_page->tcp_data_size = this->tci_page_size;
+	new_page->tcp_op_seq = 0; // Value not used because not dirty
+
+	if (read) {
+		if (!XT_PREAD_RR_FILE(file, address, this->tci_page_size, 0, new_page->tcp_data, &red_size, &thread->st_statistics.st_rec, thread))
+			goto failed;
+	}
+	
+#ifdef XT_MEMSET_UNUSED_SPACE
+	else
+		red_size = 0;
+
+	/* Removing this is an optimization. It should not be required
+	 * to clear the unused space in the page.
+	 */
+	memset(new_page->tcp_data + red_size, 0, this->tci_page_size - red_size);
+#endif
+
+	/* Add the page to the cache! */
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, thread->t_id);
+	page = seg->tcs_hash_table[hash_idx];
+	while (page) {
+		if (page->tcp_page_idx == page_idx && page->tcp_file_id == file->fr_id) {
+			/* Oops, someone else was faster! */
+			xt_free_ns(new_page);
+			goto done_ok;
+		}
+		page = page->tcp_next;
+	}
+	page = new_page;
+
+	/* Make the page the most recently used: */
+	xt_lock_mutex_ns(&dcg->tcm_lock);
+	page->tcp_ru_time = ++dcg->tcm_ru_now;
+	if ((page->tcp_lr_used = dcg->tcm_mru_page))
+		dcg->tcm_mru_page->tcp_mr_used = page;
+	page->tcp_mr_used = NULL;
+	dcg->tcm_mru_page = page;
+	if (!dcg->tcm_lru_page)
+		dcg->tcm_lru_page = page;
+	xt_unlock_mutex_ns(&dcg->tcm_lock);
+
+	/* Add the page to the hash table: */
+	page->tcp_next = seg->tcs_hash_table[hash_idx];
+	seg->tcs_hash_table[hash_idx] = page;
+
+	/* GOTCHA! This increment was done just after the malloc!
+	 * So it was not protected by the segment lock!
+	 * The result was that this count was no longer reliable,
+	 * This resulted in the amount of cache being used becoming less, and
+	 * less, because increments were lost over time!
+	 */
+	/* Increment cache used. */
+	seg->tcs_cache_in_use += page_size;
+
+	done_ok:
+	*ret_seg = seg;
+	*ret_page = page;
+#ifdef DEBUG_CHECK_CACHE
+	//XT_TC_check_cache();
+#endif
+	thread->st_statistics.st_rec_cache_miss++;
+	return OK;
+
+	failed:
+	xt_free_ns(new_page);
+	return FAILED;
+}
+
+
+/* ----------------------------------------------------------------------
+ * OPERATION SEQUENCE
+ */
+
+xtBool XTTableSeq::ts_log_no_op(XTThreadPtr thread, xtTableID tab_id, xtOpSeqNo op_seq)
+{
+	/* Build a no-op log entry for (tab_id, op_seq) and hand it to the log. */
+	XTactNoOpEntryDRec	no_op;
+	xtWord4				xor_sum;
+
+	xor_sum = (xtWord4) tab_id ^ (xtWord4) op_seq;
+	no_op.no_status_1 = XT_LOG_ENT_NO_OP;
+	no_op.no_checksum_1 = XT_CHECKSUM_1(xor_sum);
+	XT_SET_DISK_4(no_op.no_tab_id_4, tab_id);
+	XT_SET_DISK_4(no_op.no_op_seq_4, op_seq);
+	/* TODO - If this also fails we have a problem: from here on no more
+	 * op IDs should really be generated, because some will be missing
+	 * from the log and the writer will not be able to continue.
+	 */
+	return xt_xlog_log_data(thread, sizeof(no_op), (XTXactLogBufferDPtr) &no_op, XT_XLOG_NO_WRITE_NO_FLUSH);
+}
+
+#ifdef XT_NOT_INLINE
+xtOpSeqNo XTTableSeq::ts_set_op_seq(XTTabCachePagePtr page)
+{
+	/* Take the next sequence number and stamp it on the page, under ts_ns_lock. */
+	xt_lock_mutex_ns(&ts_ns_lock);
+	xtOpSeqNo allocated = ts_next_seq++;
+	page->tcp_op_seq = allocated;
+	xt_unlock_mutex_ns(&ts_ns_lock);
+	return allocated;
+}
+
+xtOpSeqNo XTTableSeq::ts_get_op_seq()
+{
+	/* Take the next sequence number, under ts_ns_lock. */
+	xtOpSeqNo allocated;
+	xt_lock_mutex_ns(&ts_ns_lock);
+	allocated = ts_next_seq++;
+	xt_unlock_mutex_ns(&ts_ns_lock);
+	return allocated;
+}
+#endif
+
+#ifdef XT_NOT_INLINE
+/*
+ * Return TRUE if sequence number 'now' comes before the target
+ * sequence number 'then', allowing for wrap-around of the counter.
+ *
+ * Wrap-around is detected from the distance between the two values:
+ * a gap of more than half of the 32-bit range is taken to mean that
+ * one of the two values has wrapped, which inverts the plain comparison.
+ */
+xtBool XTTableSeq::xt_op_is_before(register xtOpSeqNo now, register xtOpSeqNo then)
+{
+	xtOpSeqNo gap;
+
+	ASSERT_NS(sizeof(xtOpSeqNo) == 4);
+
+	if (now < then) {
+		/* 'now' looks earlier. This stands unless the gap is
+		 * implausibly large, in which case 'then' is assumed to be
+		 * a value from before the counter wrapped around.
+		 */
+		gap = then - now;
+		if (gap > (xtOpSeqNo) 0xFFFFFFFF/2)
+			return FALSE;
+		return TRUE;
+	}
+
+	/* 'now' is at or past 'then', so it is not before it, unless
+	 * the gap is so large that 'now' must be the pre-wrap value.
+	 */
+	gap = now - then;
+	return gap > (xtOpSeqNo) 0xFFFFFFFF/2 ? TRUE : FALSE;
+}
+#endif
+
+
+/* ----------------------------------------------------------------------
+ * F R E E E R    P R O C E S S
+ */
+
+/*
+ * Used by the writer to wake the freeer: broadcasts on db->db_wr_cond.
+ */
+xtPublic void xt_wr_wake_freeer(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	/* BUG FIX: Was using tcm_freeer_cond.
+	 * This is incorrect. When the freeer waits for the
+	 * writer, it uses the writer's condition!
+	 */
+	xt_lock_mutex_ns(&db->db_wr_lock);
+	if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+		xt_log_and_clear_exception_ns();
+	xt_unlock_mutex_ns(&db->db_wr_lock);
+/* Previous version, kept for reference (it signalled the freeer's own condition):
+	xt_lock_mutex(self, &xt_tab_cache.tcm_freeer_lock);
+	pushr_(xt_unlock_mutex, &xt_tab_cache.tcm_freeer_lock);
+	if (!xt_broadcast_cond_ns(&xt_tab_cache.tcm_freeer_cond))
+		xt_log_and_clear_exception_ns();
+	freer_(); // xt_unlock_mutex(&xt_tab_cache.tcm_freeer_lock)
+*/
+}
+
+/* Timed wait (msecs) on the freeer's condition, skipped if the thread is quitting: */
+static void tabc_fr_wait_for_cache(XTThreadPtr self, u_int msecs)
+{
+	if (!self->t_quit)
+		xt_timed_wait_cond(NULL, &xt_tab_cache.tcm_freeer_cond, &xt_tab_cache.tcm_freeer_lock, msecs);
+}
+
+typedef struct TCResource {
+	XTOpenTablePtr		tc_ot;	/* Pool table currently held on behalf of the freeer, or NULL. */
+} TCResourceRec, *TCResourcePtr;
+
+static void tabc_free_fr_resources(XTThreadPtr self, TCResourcePtr tc)
+{
+	/* Nothing held: nothing to give back to the pool. */
+	if (!tc->tc_ot) return;
+	xt_db_return_table_to_pool(self, tc->tc_ot);
+	tc->tc_ot = NULL;
+}
+
+static XTTableHPtr tabc_get_table(XTThreadPtr self, TCResourcePtr tc, xtDatabaseID db_id, xtTableID tab_id)
+{
+	XTTableHPtr	tab;
+	XTDatabaseHPtr	db;
+	/* Return the table (db_id, tab_id), reusing the pool table held in tc if it matches. */
+	if (tc->tc_ot) {
+		tab = tc->tc_ot->ot_table;
+		if (tab->tab_id == tab_id && tab->tab_db->db_id == db_id)
+			return tab;
+		/* A different table is held: give it back before opening the new one. */
+		xt_db_return_table_to_pool(self, tc->tc_ot);
+		tc->tc_ot = NULL;
+	}
+	/* Nothing is held at this point; NULL is returned if the lookup or open fails. */
+	if (!tc->tc_ot) {
+		if (!(db = xt_get_database_by_id(self, db_id)))
+			return NULL;
+		/* The database reference is released again once the table has been opened: */
+		pushr_(xt_heap_release, db);
+		tc->tc_ot = xt_db_open_pool_table(self, db, tab_id, NULL, TRUE);
+		freer_(); // xt_heap_release(db);
+		if (!tc->tc_ot)
+			return NULL;
+	}
+
+	return tc->tc_ot->ot_table;
+}
+
+/*
+ * Free one page, starting the search at the least recently used page.
+ * Return the amount of bytes freed (0 if there is no page in the LRU list).
+ */
+static size_t tabc_free_page(XTThreadPtr self, TCResourcePtr tc)
+{
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+	XTTableHPtr					tab = NULL;
+	XTTabCachePagePtr			page, lpage, ppage;
+	XTTabCacheSegPtr			seg;
+	u_int						page_cnt;
+	xtBool						was_dirty;
+
+#ifdef DEBUG_CHECK_CACHE
+	//XT_TC_check_cache();
+#endif
+	dcg->tcm_free_try_count = 0;
+
+	retry:
+	/* Note, handling the page is safe because
+	 * there is only one free'er thread which
+	 * can remove pages from the cache!
+	 */
+	page_cnt = 0;
+	if (!(page = dcg->tcm_lru_page)) {
+		dcg->tcm_free_try_count = 0;
+		return 0;
+	}
+
+	retry_2:
+	if ((was_dirty = page->tcp_dirty)) {
+		/* Do all this stuff without a lock, because to
+		 * have a lock while doing this is too expensive!
+		 */
+	
+		/* Get the page's table; it is needed below to wait for the page to be cleaned. */
+		tab = tabc_get_table(self, tc, page->tcp_db_id, page->tcp_tab_id);
+	}
+
+	seg = &dcg->tcm_segment[page->tcp_seg];
+	TAB_CAC_WRITE_LOCK(&seg->tcs_lock, self->t_id);
+
+	if (page->tcp_dirty) {
+		if (!was_dirty) {
+			TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+			goto retry_2;
+		}
+
+		if (tab) {
+			ASSERT(!XTTableSeq::xt_op_is_before(tab->tab_seq.ts_next_seq, page->tcp_op_seq+1));
+			/* This should never happen. However, it has been occurring,
+			 * during multi_update test on Windows.
+			 * In particular it occurs after rename of a table, during ALTER.
+			 * As if the table was not flushed before the rename!?
+			 * To guard against an infinite loop below, I will just continue here.
+			 */
+			if (XTTableSeq::xt_op_is_before(tab->tab_seq.ts_next_seq, page->tcp_op_seq+1))
+				goto go_on;
+			/* OK, we have the table, now we check where the current
+			 * sequence number is.
+			 */
+			if (XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, page->tcp_op_seq)) {
+				XTDatabaseHPtr db = tab->tab_db;
+
+				rewait:
+				TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+
+				/* Flush the log, in case this is holding up the
+				 * writer!
+				 */
+				if (!db->db_xlog.xlog_flush(self)) {
+					dcg->tcm_free_try_count = 0;
+					xt_throw(self);
+				}
+
+				xt_lock_mutex(self, &db->db_wr_lock);
+				pushr_(xt_unlock_mutex, &db->db_wr_lock);
+
+				/* The freeer is now waiting: */
+				db->db_wr_freeer_waiting = TRUE;
+
+				/* If the writer is idle, wake it up. 
+				 * The writer will commit the changes to the database
+				 * which will allow the freeer to free up the cache.
+				 */
+				if (db->db_wr_idle) {
+					if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+						xt_log_and_clear_exception_ns();
+				}
+
+				/* Go to sleep on the writer's condition.
+				 * The writer will wake the free'er before it goes to
+				 * sleep!
+				 */
+				tab->tab_wake_freeer_op = page->tcp_op_seq;
+				tab->tab_wr_wake_freeer = TRUE;
+				if (!xt_timed_wait_cond_ns(&db->db_wr_cond, &db->db_wr_lock, 30000)) {
+					tab->tab_wr_wake_freeer = FALSE;
+					db->db_wr_freeer_waiting = FALSE;
+					xt_throw(self);
+				}
+				tab->tab_wr_wake_freeer = FALSE;
+				db->db_wr_freeer_waiting = FALSE;
+				freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+				TAB_CAC_WRITE_LOCK(&seg->tcs_lock, self->t_id);
+				if (XTTableSeq::xt_op_is_before(tab->tab_head_op_seq, page->tcp_op_seq))
+					goto rewait;
+			}
+			go_on:;
+		}
+	}
+
+	/* Skip the page if it is being read or locked. */
+	if (page->tcp_lock_count) {
+		/* (1) If the page is being read, then we should not free
+		 *     it immediately.
+		 * (2) If a page is locked, the locker may be waiting
+		 *     for the freeer to free some cache - this
+		 *     causes a deadlock.
+		 *
+		 * Therefore, we move on, and try to free another page...
+		 */
+		if (page_cnt < (dcg->tcm_approx_page_count >> 1)) {
+			/* Page has not changed MRU position, and we
+			 * have looked at less than half of the pages.
+			 * Go to the next page...
+			 */
+			if ((page = page->tcp_mr_used)) {
+				page_cnt++;
+				TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+				goto retry_2;
+			}
+		}
+		TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+		dcg->tcm_free_try_count++;				
+
+		/* Starting to spin, free the threads: */
+		if (dcg->tcm_threads_waiting) {
+			if (!xt_broadcast_cond_ns(&dcg->tcm_freeer_cond))
+				xt_log_and_clear_exception_ns();
+		}
+		goto retry;
+	}
+
+	/* Page is clean, remove from the hash table: */
+
+	/* Find the page on the list: */
+	u_int page_idx = page->tcp_page_idx;
+	u_int file_id = page->tcp_file_id;
+
+	ppage = NULL;
+	lpage = seg->tcs_hash_table[page->tcp_hash_idx];
+	while (lpage) {
+		if (lpage->tcp_page_idx == page_idx && lpage->tcp_file_id == file_id)
+			break;
+		ppage = lpage;
+		lpage = lpage->tcp_next;
+	}
+
+	if (page == lpage) {
+		/* Should be the case! */
+		if (ppage)
+			ppage->tcp_next = page->tcp_next;
+		else
+			seg->tcs_hash_table[page->tcp_hash_idx] = page->tcp_next;
+	}
+#ifdef DEBUG
+	else
+		ASSERT_NS(FALSE);
+#endif
+
+	/* Remove from the MRU list: */
+	xt_lock_mutex_ns(&dcg->tcm_lock);
+	if (dcg->tcm_lru_page == page)
+		dcg->tcm_lru_page = page->tcp_mr_used;
+	if (dcg->tcm_mru_page == page)
+		dcg->tcm_mru_page = page->tcp_lr_used;
+	if (page->tcp_lr_used)
+		page->tcp_lr_used->tcp_mr_used = page->tcp_mr_used;
+	if (page->tcp_mr_used)
+		page->tcp_mr_used->tcp_lr_used = page->tcp_lr_used;
+	xt_unlock_mutex_ns(&dcg->tcm_lock);
+
+	/* Free the page (same size formula as used when the page was accounted in tc_fetch): */
+	size_t freed_space = offsetof(XTTabCachePageRec, tcp_data) + page->tcp_data_size;
+	ASSERT_NS(seg->tcs_cache_in_use >= freed_space);
+	seg->tcs_cache_in_use -= freed_space;
+	ASSERT_NS(seg->tcs_cache_in_use == 0 || seg->tcs_cache_in_use >= 25000);
+	xt_free_ns(page);
+
+	TAB_CAC_UNLOCK(&seg->tcs_lock, self->t_id);
+	self->st_statistics.st_rec_cache_frees++;
+	dcg->tcm_free_try_count = 0;
+	return freed_space;
+}
+
+static void tabc_fr_main(XTThreadPtr self)
+{
+	register XTTabCacheMemPtr	dcg = &xt_tab_cache;
+	TCResourceRec				tc = { 0 };
+	int							i;
+	/* Main loop of the freeer: free pages while the cache is too full, then sleep. */
+	xt_set_low_priority(self);
+	dcg->tcm_freeer_busy = TRUE;
+
+	while (!self->t_quit) {		
+		size_t cache_used, freed;
+
+		pushr_(tabc_free_fr_resources, &tc);
+
+		while (!self->t_quit) {
+			/* Total up the cache memory used: */
+			cache_used = 0;
+			for (i=0; i<XT_TC_SEGMENT_COUNT; i++)
+				cache_used += dcg->tcm_segment[i].tcs_cache_in_use;
+
+			if (cache_used > dcg->tcm_cache_high)
+				dcg->tcm_cache_high = cache_used;
+
+			/* Stop right away if the thread was asked to quit: */
+			if (self->t_quit)
+				break;
+
+			/* Nothing to do below the high level. If threads are waiting then we
+			 * are more aggressive about freeing cache, and start at the mid level.
+			 */ 
+			if (cache_used < (dcg->tcm_threads_waiting ? dcg->tcm_mid_level : dcg->tcm_high_level))
+				break;
+
+			/* Reduce cache to the low level: */
+			while (!self->t_quit && cache_used > dcg->tcm_low_level) {
+				freed = tabc_free_page(self, &tc);
+				cache_used -= freed;
+				if (cache_used <= dcg->tcm_high_level) {
+					/* Wakeup any threads that are waiting for some cache to be
+					 * freed.
+					 */
+					if (dcg->tcm_threads_waiting) {
+						if (!xt_broadcast_cond_ns(&dcg->tcm_freeer_cond))
+							xt_log_and_clear_exception_ns();
+					}
+				}
+			}
+		}
+
+		freer_(); // tabc_free_fr_resources(&tc)
+
+		xt_lock_mutex(self, &dcg->tcm_freeer_lock);
+		pushr_(xt_unlock_mutex, &dcg->tcm_freeer_lock);
+
+		if (dcg->tcm_threads_waiting) {
+			/* Wake threads before we go to sleep: */
+			if (!xt_broadcast_cond_ns(&dcg->tcm_freeer_cond))
+				xt_log_and_clear_exception_ns();
+		}
+			
+		/* Wait for a thread that allocates data to signal
+		 * that the cache level has exceeded the upper limit:
+		 */
+		xt_db_approximate_time = time(NULL);
+		dcg->tcm_freeer_busy = FALSE;
+		/* No idea, why, but I am getting an unnecessary pause here.
+		 * I run DBT2 with low record cache.
+		 *
+		 * Every now and then there is a pause where the freeer is here,
+		 * and all user threads are waiting for the freeer.
+		 *
+		 * So adding the tcm_threads_waiting condition.
+		 */
+		if (dcg->tcm_threads_waiting) {
+			cache_used = 0;
+			for (i=0; i<XT_TC_SEGMENT_COUNT; i++)
+				cache_used += dcg->tcm_segment[i].tcs_cache_in_use;
+			if (cache_used < dcg->tcm_mid_level)
+				tabc_fr_wait_for_cache(self, 500);
+		}
+		else
+			tabc_fr_wait_for_cache(self, 500);
+		//tabc_fr_wait_for_cache(self, 30*1000);
+		dcg->tcm_freeer_busy = TRUE;
+		xt_db_approximate_time = time(NULL);
+		freer_(); // xt_unlock_mutex(&dcg->tcm_freeer_lock)
+	}
+}
+
+static void *tabc_fr_run_thread(XTThreadPtr self)
+{
+	int		count;
+	void	*mysql_thread;	/* Only referenced by the disabled myxt_destroy_thread() call below. */
+
+	myxt_wait_pbxt_plugin_slot_assigned(self);
+
+	mysql_thread = myxt_create_thread();
+	/* Thread entry point: run tabc_fr_main() until told to quit, restarting it after an exception. */
+	while (!self->t_quit) {
+		try_(a) {
+			tabc_fr_main(self);
+		}
+		catch_(a) {
+			/* A caught SIGTERM is "normal" and is not logged; anything else is: */
+			if (!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds */
+#ifdef DEBUG
+		count = 10;
+#else
+		count = 2*60;
+#endif
+		while (!self->t_quit && count > 0) {
+			xt_db_approximate_time = time(NULL);
+			sleep(1);
+			count--;
+		}
+	}
+
+   /*
+	* {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	*/
+	return NULL;
+}
+
+static void tabc_fr_free_thread(XTThreadPtr self, void *XT_UNUSED(data))
+{
+	if (xt_tab_cache.tcm_freeer_thread) {	/* Registered in xt_start_freeer() as the thread-data free callback. */
+		xt_lock_mutex(self, &xt_tab_cache.tcm_freeer_lock);
+		pushr_(xt_unlock_mutex, &xt_tab_cache.tcm_freeer_lock);
+		xt_tab_cache.tcm_freeer_thread = NULL;	/* Cleared under the freeer lock. */
+		freer_(); // xt_unlock_mutex(&xt_tab_cache.tcm_freeer_lock)
+	}
+}
+
+xtPublic void xt_start_freeer(XTThreadPtr self)
+{
+	xt_tab_cache.tcm_freeer_thread = xt_create_daemon(self, "free-er");	/* Create the freeer daemon thread... */
+	xt_set_thread_data(xt_tab_cache.tcm_freeer_thread, NULL, tabc_fr_free_thread);	/* ...register its cleanup callback... */
+	xt_run_thread(self, xt_tab_cache.tcm_freeer_thread, tabc_fr_run_thread);	/* ...and start it. */
+}
+
+xtPublic void xt_quit_freeer(XTThreadPtr self)
+{
+	if (xt_tab_cache.tcm_freeer_thread) {	/* Nothing to do if no freeer thread is registered. */
+		xt_lock_mutex(self, &xt_tab_cache.tcm_freeer_lock);
+		pushr_(xt_unlock_mutex, &xt_tab_cache.tcm_freeer_lock);
+		xt_terminate_thread(self, xt_tab_cache.tcm_freeer_thread);	/* Unlike xt_stop_freeer(), does not wait for the thread. */
+		freer_(); // xt_unlock_mutex(&xt_tab_cache.tcm_freeer_lock)
+	}
+}
+
+xtPublic void xt_stop_freeer(XTThreadPtr self)
+{
+	XTThreadPtr thr_fr;
+	/* Terminate the freeer thread, wake it up, and wait for it to exit. */
+	if (xt_tab_cache.tcm_freeer_thread) {
+		xt_lock_mutex(self, &xt_tab_cache.tcm_freeer_lock);
+		pushr_(xt_unlock_mutex, &xt_tab_cache.tcm_freeer_lock);
+
+		/* This pointer is safe as long as you have the freeer lock (tabc_fr_free_thread() clears it under that lock). */
+		if ((thr_fr = xt_tab_cache.tcm_freeer_thread)) {
+			xtThreadID tid = thr_fr->t_id;
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_fr);
+
+			/* Wake the freeer to get it to quit: */
+			if (!xt_broadcast_cond_ns(&xt_tab_cache.tcm_freeer_cond))
+				xt_log_and_clear_exception_ns();
+	
+			freer_(); // xt_unlock_mutex(&xt_tab_cache.tcm_freeer_lock)
+
+			/*
+			 * GOTCHA: This is a weird thing but the SIGTERM directed
+			 * at a particular thread (in this case the sweeper) was
+			 * being caught by a different thread and killing the server
+			 * sometimes. Disconcerting.
+			 * (this may only be a problem on Mac OS X)
+			xt_kill_thread(thread);
+			 */
+			xt_wait_for_thread(tid, FALSE);
+	
+			/* PMC - This should not be necessary to set the signal here, but in the
+			 * debugger the handler is not called!!?
+			thr_fr->t_delayed_signal = SIGTERM;
+			xt_kill_thread(thread);
+			 */
+			xt_tab_cache.tcm_freeer_thread = NULL;
+		}
+		else
+			freer_(); // xt_unlock_mutex(&xt_tab_cache.tcm_freeer_lock)
+	}
+}
+
+xtPublic void xt_load_pages(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	XTTableHPtr			tab = ot->ot_table;
+	XTTabCacheSegPtr	held_seg;
+	XTTabCachePagePtr	ignored_page;
+	size_t				ignored_offset;
+	xtRecordID			id;
+
+	/* Fetch one entry from every page of the tab_rows cache ... */
+	for (id = 1; id < tab->tab_row_eof_id; id += tab->tab_rows.tci_rows_per_page) {
+		if (!tab->tab_rows.tc_fetch(ot->ot_row_file, id, &held_seg, &ignored_page, &ignored_offset, TRUE, self))
+			xt_throw(self);
+		/* A successful fetch leaves the segment locked, so release it. */
+		TAB_CAC_UNLOCK(&held_seg->tcs_lock, self->t_id);
+	}
+
+	/* ... and then from every page of the tab_recs cache. */
+	for (id = 1; id < tab->tab_rec_eof_id; id += tab->tab_recs.tci_rows_per_page) {
+		if (!tab->tab_recs.tc_fetch(ot->ot_rec_file, id, &held_seg, &ignored_page, &ignored_offset, TRUE, self))
+			xt_throw(self);
+		/* Only the side effect of caching the page is wanted here. */
+		TAB_CAC_UNLOCK(&held_seg->tcs_lock, self->t_id);
+	}
+}
+
+
diff --git a/storage/pbxt/src/tabcache_xt.h b/storage/pbxt/src/tabcache_xt.h
new file mode 100644
index 00000000000..5dcd39050d4
--- /dev/null
+++ b/storage/pbxt/src/tabcache_xt.h
@@ -0,0 +1,290 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-10-31	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * The new table cache. Caches all non-index data. This includes the data
+ * files and the row pointer files.
+ */
+#ifndef __tabcache_h__
+#define __tabcache_h__
+
+struct XTTable;
+struct XTOpenTable;
+struct XTTabCache;
+struct XTDatabase;
+
+#include "thread_xt.h"
+#include "filesys_xt.h"
+#include "lock_xt.h"
+
+#ifdef DEBUG
+//#define XT_USE_CACHE_DEBUG_SIZES
+//#define XT_NOT_INLINE
+#endif
+
+#ifdef XT_USE_CACHE_DEBUG_SIZES
+
+#define XT_TC_PAGE_SIZE				(4*1024)
+#define XT_TC_SEGMENT_SHIFTS		1
+
+#else
+
+#define XT_TC_PAGE_SIZE				(32*1024)
+#define XT_TC_SEGMENT_SHIFTS		3
+
+#endif
+
+/* Ticks elapsed from 'start' to 'now' on the wrapping MRU clock. Assumes xtWord4 is an
+ * unsigned 32-bit type: the subtraction is then modulo 2^32, so no wrap-around case is needed.
+ */
+#define XT_TIME_DIFF(start, now) ((xtWord4) ((xtWord4) (now) - (xtWord4) (start)))
+
+#define XT_TC_SEGMENT_COUNT			((off_t) 1 << XT_TC_SEGMENT_SHIFTS)
+#define XT_TC_SEGMENT_MASK			(XT_TC_SEGMENT_COUNT - 1)
+
+typedef struct XTTabCachePage {
+	xtWord1					tcp_dirty;						/* TRUE if the page is dirty. */
+	xtWord1					tcp_seg;						/* Segment number of the page. */
+	u_int					tcp_lock_count;					/* Number of read locks on this page. */
+	u_int					tcp_hash_idx;					/* The hash index of the page. */
+	u_int					tcp_page_idx;					/* The page address. */
+	u_int					tcp_file_id;					/* The file id of the page. */
+	xtDatabaseID			tcp_db_id;						/* The ID of the database. */
+	xtTableID				tcp_tab_id;						/* The ID of the table of this cache page. */
+	xtWord4					tcp_data_size;					/* Size of the data on this page. */
+	xtOpSeqNo				tcp_op_seq;						/* The operation sequence number (dirty pages have an operation sequence) */
+	xtWord4					tcp_ru_time;					/* Value of tcm_ru_now when the page was last made most recently used. */
+	struct XTTabCachePage	*tcp_next;						/* Pointer to next page on hash list, or next free page on free list. */
+	struct XTTabCachePage	*tcp_mr_used;					/* More recently used pages. */
+	struct XTTabCachePage	*tcp_lr_used;					/* Less recently used pages. */
+	xtWord1					tcp_data[XT_TC_PAGE_SIZE];		/* This is actually tci_page_size! */
+} XTTabCachePageRec, *XTTabCachePagePtr;
+
+/*
+ * Each table has a "table operation sequence". This sequence is incremented by
+ * each operation on the table. Each operation in the log is tagged by a
+ * sequence number.
+ *
+ * The writer threads re-order operations in the log, and write the operations
+ * to the database in sequence.
+ *
+ * It is safe to free a cache page when the sequence number of the cache page,
+ * is less than or equal to the written sequence number.
+ */
+typedef struct XTTableSeq {
+	xtOpSeqNo				ts_next_seq;					/* The sequence number the next operation on the table will get. */
+	xt_mutex_type			ts_ns_lock;						/* Held while a sequence number is allocated. */
+
+	xtBool ts_log_no_op(XTThreadPtr thread, xtTableID tab_id, xtOpSeqNo op_seq);
+
+	/* Allocate the next operation sequence number. */
+#ifdef XT_NOT_INLINE
+	xtOpSeqNo ts_set_op_seq(XTTabCachePagePtr page);
+
+	xtOpSeqNo ts_get_op_seq();
+#else
+	xtOpSeqNo ts_set_op_seq(XTTabCachePagePtr page)
+	{
+		/* As ts_get_op_seq(), but the number is also stamped on the page. */
+		xt_lock_mutex_ns(&ts_ns_lock);
+		xtOpSeqNo allocated = ts_next_seq++;
+		page->tcp_op_seq = allocated;
+		xt_unlock_mutex_ns(&ts_ns_lock);
+		return allocated;
+	}
+
+	xtOpSeqNo ts_get_op_seq()
+	{
+		/* Take the lock, grab-and-bump the counter, drop the lock. */
+		xt_lock_mutex_ns(&ts_ns_lock);
+		xtOpSeqNo allocated = ts_next_seq++;
+		xt_unlock_mutex_ns(&ts_ns_lock);
+
+		return allocated;
+	}
+#endif
+
+	/* Lifecycle helper: create ts_ns_lock. */
+	void xt_op_seq_init(XTThreadPtr self)
+	{ xt_init_mutex_with_autoname(self, &ts_ns_lock); }
+
+	/* Overwrite the next sequence number (ts_ns_lock is not taken here). */
+	void xt_op_seq_set(XTThreadPtr XT_UNUSED(self), xtOpSeqNo n)
+	{ ts_next_seq = n; }
+
+	/* Lifecycle helper: destroy ts_ns_lock. */
+	void xt_op_seq_exit(XTThreadPtr XT_UNUSED(self))
+	{ xt_free_mutex(&ts_ns_lock); }
+
+#ifdef XT_NOT_INLINE
+	static xtBool xt_op_is_before(register xtOpSeqNo now, register xtOpSeqNo then);
+#else
+	static inline xtBool xt_op_is_before(register xtOpSeqNo now, register xtOpSeqNo then)
+	{
+		/* A gap of more than half of the 32-bit range is taken to mean that
+		 * one of the two values has wrapped, which inverts the comparison.
+		 */
+		if (now < then)
+			return (then - now) > (xtOpSeqNo) 0xFFFFFFFF/2 ? FALSE : TRUE;
+		if ((now - then) > (xtOpSeqNo) 0xFFFFFFFF/2)
+			return TRUE;
+		return FALSE;
+	}
+#endif
+} XTTableSeqRec, *XTTableSeqPtr;
+
+#ifdef XT_NO_ATOMICS
+#define TAB_CAC_USE_PTHREAD_RW
+#else
+//#define TAB_CAC_USE_RWMUTEX
+//#define TAB_CAC_USE_PTHREAD_RW
+//#define TAB_CAC_USE_SPINXSLOCK
+#define TAB_CAC_USE_XSMUTEX
+#endif
+/* Map the TAB_CAC_* lock operations onto the lock implementation selected above: */
+#ifdef TAB_CAC_USE_XSMUTEX
+#define TAB_CAC_LOCK_TYPE				XTXSMutexRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_xsmutex_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_xsmutex_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_xsmutex_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_xsmutex_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_xsmutex_unlock(i, o)
+#elif defined(TAB_CAC_USE_PTHREAD_RW)
+#define TAB_CAC_LOCK_TYPE				xt_rwlock_type
+#define TAB_CAC_INIT_LOCK(s, i)			xt_init_rwlock_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_free_rwlock(i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_slock_rwlock_ns(i)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_xlock_rwlock_ns(i)
+#define TAB_CAC_UNLOCK(i, o)			xt_unlock_rwlock_ns(i)
+#elif defined(TAB_CAC_USE_RWMUTEX)
+#define TAB_CAC_LOCK_TYPE				XTRWMutexRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_rwmutex_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_rwmutex_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_rwmutex_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_rwmutex_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_rwmutex_unlock(i, o)
+#elif defined(TAB_CAC_USE_SPINXSLOCK)
+#define TAB_CAC_LOCK_TYPE				XTSpinXSLockRec
+#define TAB_CAC_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, i)
+#define TAB_CAC_FREE_LOCK(s, i)			xt_spinxslock_free(s, i)	
+#define TAB_CAC_READ_LOCK(i, o)			xt_spinxslock_slock(i, o)
+#define TAB_CAC_WRITE_LOCK(i, o)		xt_spinxslock_xlock(i, o)
+#define TAB_CAC_UNLOCK(i, o)			xt_spinxslock_unlock(i, o)
+#endif
+
+/* A disk cache segment. The cache is divided into a number of segments
+ * to improve concurrency.
+ */
+typedef struct XTTabCacheSeg {
+	TAB_CAC_LOCK_TYPE		tcs_lock;						/* The cache segment read/write lock. */
+	XTTabCachePagePtr		*tcs_hash_table;				/* Hash buckets; each bucket is a chain of pages linked by tcp_next. */
+	size_t					tcs_cache_in_use;				/* Bytes accounted to the pages of this segment. */
+} XTTabCacheSegRec, *XTTabCacheSegPtr;
+
+/*
+ * The free'er thread has a list of tables to be purged from the cache.
+ * While a table is in the list, fetching a cache page from that table
+ * is not allowed.
+ * The free'er thread goes through the entire cache, and removes
+ * all cache pages that belong to any table in the purge list.
+ * When a table has been purged, the free'er signals any threads waiting for
+ * the purge to complete (this is usually due to a DROP TABLE).
+ */
+typedef struct XTTabCachePurge {
+	int						tcp_state;						/* The state of the purge. */
+	XTTableSeqPtr			tcp_tab_seq;					/* Identifies the table to be purged from cache. */
+} XTTabCachePurgeRec, *XTTabCachePurgePtr;
+
+/* The global state of the table cache: the cache segments, the usage levels
+ * that control the free'er thread, and the state of the free'er thread itself.
+ */
+typedef struct XTTabCacheMem {
+	xt_mutex_type			tcm_lock;						/* The public cache lock. */
+	xt_cond_type			tcm_cond;						/* The public cache wait condition. */
+	XTTabCacheSegRec		tcm_segment[XT_TC_SEGMENT_COUNT];	/* The cache segments. */
+	XTTabCachePagePtr		tcm_lru_page;					/* NOTE(review): presumably the least recently used page -- confirm. */
+	XTTabCachePagePtr		tcm_mru_page;					/* NOTE(review): presumably the most recently used page -- confirm. */
+	xtWord4					tcm_ru_now;
+	size_t					tcm_approx_page_count;
+	size_t					tcm_hash_size;
+	u_int					tcm_writer_thread_count;
+	size_t					tcm_cache_size;
+	size_t					tcm_cache_high;					/* The high water level of cache allocation. */
+	size_t					tcm_low_level;					/* This is the level to which the free'er will free, once it starts working. */
+	size_t					tcm_high_level;					/* This is the level at which the free'er will start to work (to avoid waiting)! */
+	size_t					tcm_mid_level;					/* At this level the free'er will not sleep if there are threads waiting. */
+
+	/* The free'er thread: */
+	struct XTThread			*tcm_freeer_thread;				/* The free'er thread. */
+	xt_mutex_type			tcm_freeer_lock;				/* The free'er lock (presumably protects the free'er state below -- confirm). */
+	xt_cond_type			tcm_freeer_cond;				/* The free'er wait condition. */
+	u_int					tcm_purge_list_len;				/* The length of the purge list. */
+	XTTabCachePurgePtr		tcm_purge_list;					/* Non-NULL if a table is to be purged. */
+	u_int					tcm_threads_waiting;			/* Count of the number of threads waiting for the free'er. */
+	xtBool					tcm_freeer_busy;
+	u_int					tcm_free_try_count;
+} XTTabCacheMemRec, *XTTabCacheMemPtr;
+
+/*
+ * This structure contains the information about a particular table
+ * for the cache. Each table has its own header size, page size,
+ * record size and number of rows per page.
+ * NOTE(review): the original comment ended with the unfinished sentence
+ * "Tables also have" -- complete it if the missing information is known.
+ */
+typedef struct XTTabCache {
+	struct XTTable			*tci_table;
+	size_t					tci_header_size;
+	size_t					tci_page_size;
+	size_t					tci_rec_size;
+	size_t					tci_rows_per_page;
+
+public:
+	void					xt_tc_setup(struct XTTable *tab, size_t head_size, size_t row_size);
+	xtBool					xt_tc_write(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t offset, size_t size, xtWord1 *data, xtOpSeqNo *op_seq, xtBool read, XTThreadPtr thread);
+	xtBool					xt_tc_write_cond(XTThreadPtr self, XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 new_type, xtOpSeqNo *op_seq, xtXactID xn_id, xtRowID row_id, u_int stat_id, u_int rec_type);
+	xtBool					xt_tc_read(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread);
+	xtBool					xt_tc_read_4(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord4 *data, XTThreadPtr thread);
+	xtBool					xt_tc_read_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtWord1 *data, XTThreadPtr thread);
+	xtBool					xt_tc_get_page(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, xtBool load, XTTabCachePagePtr *page, size_t *offset, XTThreadPtr thread);
+	void					xt_tc_release_page(XT_ROW_REC_FILE_PTR file, XTTabCachePagePtr page, XTThreadPtr thread);
+	xtBool					tc_fetch(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCacheSegPtr *ret_seg, XTTabCachePagePtr *ret_page, size_t *offset, xtBool read, XTThreadPtr thread);
+
+private:
+	xtBool					tc_read_direct(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, size_t size, xtWord1 *data, XTThreadPtr thread);
+	xtBool					tc_fetch_direct(XT_ROW_REC_FILE_PTR file, xtRefID ref_id, XTTabCacheSegPtr *ret_seg, XTTabCachePagePtr *ret_page, size_t *offset, XTThreadPtr thread);
+} XTTabCacheRec, *XTTabCachePtr;
+
+extern XTTabCacheMemRec xt_tab_cache;
+
+void	xt_tc_init(XTThreadPtr self, size_t cache_size);
+void	xt_tc_exit(XTThreadPtr self);
+void	xt_tc_set_cache_size(size_t cache_size);
+xtInt8	xt_tc_get_usage();
+xtInt8	xt_tc_get_size();
+xtInt8	xt_tc_get_high();
+void	xt_load_pages(XTThreadPtr self, struct XTOpenTable *ot);
+#ifdef DEBUG
+void	xt_check_table_cache(struct XTTable *tab);
+#endif
+
+void	xt_quit_freeer(XTThreadPtr self);
+void	xt_stop_freeer(XTThreadPtr self);
+void	xt_start_freeer(XTThreadPtr self);
+void	xt_wr_wake_freeer(XTThreadPtr self, struct XTDatabase *db);
+
+#endif
diff --git a/storage/pbxt/src/table_xt.cc b/storage/pbxt/src/table_xt.cc
new file mode 100644
index 00000000000..2d93f161ac9
--- /dev/null
+++ b/storage/pbxt/src/table_xt.cc
@@ -0,0 +1,5570 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-08	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <string.h>
+#include <stdio.h>
+#ifndef XT_WIN
+#include <strings.h>
+#endif
+#include <ctype.h>
+#include <time.h>
+
+#ifdef DRIZZLED
+#include <drizzled/common.h>
+#include <mysys/thr_lock.h>
+#include <drizzled/dtcollation.h>
+#else
+#include "mysql_priv.h"
+#endif
+
+#include "table_xt.h"
+#include "database_xt.h"
+#include "heap_xt.h"
+#include "strutil_xt.h"
+#include "myxt_xt.h"
+#include "cache_xt.h"
+#include "trace_xt.h"
+#include "index_xt.h"
+#include "systab_xt.h"
+
+#ifdef DEBUG
+//#define TRACE_VARIATIONS
+//#define TRACE_VARIATIONS_IN_DUP_CHECK
+//#define DUMP_CHECK_TABLE
+//#define CHECK_INDEX_ON_CHECK_TABLE
+//#define TRACE_TABLE_IDS
+//#define TRACE_FLUSH
+//#define TRACE_CREATE_TABLES
+#endif
+
+#define CHECK_TABLE_STATS
+
+/* The problem is that this can take a long time
+ * if the table is very large!
+ */
+//#define CHECK_TABLE_READ_DATA_LOG
+
+#ifdef TRACE_TABLE_IDS
+//#define PRINTF		xt_ftracef
+#define PRINTF		xt_trace
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Internal structures
+ */
+
+#define XT_MAX_TABLE_FILE_NAME_SIZE		(XT_TABLE_NAME_SIZE+6+40)
+
+/*
+ * -----------------------------------------------------------------------
+ * Handle Error Detected in a Table
+ */
+
+/* An item of the per-database error list (db_error_list): identifies,
+ * by table ID and record ID, a record that has been reported as corrupt.
+ */
+struct XTTableError {
+	xtTableID		ter_tab_id;		/* ID of the table that contains the record. */
+	xtRecordID		ter_rec_id;		/* ID of the record. */
+};
+
+/* Comparison function for the error list: orders XTTableError items by
+ * table ID first, and by record ID second. Returns -1, 0 or 1.
+ */
+static int tab_comp_tab_error(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	const XTTableError	*lhs = (const XTTableError *) a;
+	const XTTableError	*rhs = (const XTTableError *) b;
+
+	if (lhs->ter_tab_id != rhs->ter_tab_id)
+		return lhs->ter_tab_id < rhs->ter_tab_id ? -1 : 1;
+	if (lhs->ter_rec_id != rhs->ter_rec_id)
+		return lhs->ter_rec_id < rhs->ter_rec_id ? -1 : 1;
+	return 0;
+}
+
+/* Note that a corrupt record has been found in a table.
+ * The (table ID, record ID) pair is added to the error list of the database.
+ * The first time a pair is added, the table is marked as "repair pending"
+ * and an error is written to the log. If the pair is already in the list
+ * nothing else is done.
+ * Returns FAILED if the insert into the error list fails, otherwise OK.
+ */
+static xtBool tab_record_corrupt(XTOpenTablePtr ot, xtRowID row_id, xtRecordID rec_id, bool not_valid, int where)
+{
+	XTTableHPtr		tab = ot->ot_table;
+	XTDatabaseHPtr	db = tab->tab_db;
+	XTTableError	key;
+	xtBool			inserted;
+	char			table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+	key.ter_tab_id = tab->tab_id;
+	key.ter_rec_id = rec_id;
+
+	xt_sl_lock_ns(db->db_error_list, ot->ot_thread);
+	if (xt_sl_find(NULL, db->db_error_list, &key)) {
+		/* This record has already been reported. */
+		xt_sl_unlock_ns(db->db_error_list);
+		return OK;
+	}
+
+	inserted = xt_sl_insert(NULL, db->db_error_list, &key, &key);
+	xt_sl_unlock_ns(db->db_error_list);
+	if (!inserted)
+		return FAILED;
+
+	/* First report for this record (done outside of the list lock): */
+	xt_tab_set_table_repair_pending(tab);
+	xt_tab_make_table_name(tab, table_name, sizeof(table_name));
+	xt_logf(XT_NT_ERROR, "#%d Table %s: row %llu, record %llu, is %s, REPAIR TABLE required.\n", where,
+		table_name, 
+		(u_llong) row_id,
+		(u_llong) rec_id,
+		not_valid ? "not valid" : "free");
+	return OK;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Compare paths:
+ */
+
+/* GOTCHA! The problem:
+ *
+ * The server uses names like: "./test/my_tab",
+ * the BLOB streaming engine uses: "test/my_tab"
+ * which leads to the same table being loaded twice.
+ *
+ * For this reason only the last 2 components of each path are compared.
+ * The comparison ignores case if pbxt_ignore_case is set.
+ */
+xtPublic int xt_tab_compare_paths(char *n1, char *n2)
+{
+	const char	*tail1 = xt_last_2_names_of_path(n1);
+	const char	*tail2 = xt_last_2_names_of_path(n2);
+
+	return pbxt_ignore_case ? strcasecmp(tail1, tail2) : strcmp(tail1, tail2);
+}
+
+/*
+ * This function compares only the last 2 components of
+ * the path because table names must differ in this area.
+ * The comparison ignores case if pbxt_ignore_case is set.
+ */
+xtPublic int xt_tab_compare_names(const char *n1, const char *n2)
+{
+	const char	*tail1 = xt_last_2_names_of_path(n1);
+	const char	*tail2 = xt_last_2_names_of_path(n2);
+
+	return pbxt_ignore_case ? strcasecmp(tail1, tail2) : strcmp(tail1, tail2);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Private utilities
+ */
+
+/* Hash table comparison (case sensitive): the key is a path string, the data
+ * is a table. They match if the last 2 names of both paths are equal.
+ */
+static xtBool tab_list_comp(void *key, void *data)
+{
+	const char	*key_name = xt_last_2_names_of_path((char *) key);
+	const char	*tab_name = xt_last_2_names_of_path(((XTTableHPtr) data)->tab_name->ps_path);
+
+	return strcmp(key_name, tab_name) == 0;
+}
+
+/* Hash function (case sensitive) for the table list. If is_key is set then
+ * key_data is a path string, otherwise it is a table. In both cases
+ * the last 2 names of the path are hashed.
+ */
+static xtHashValue tab_list_hash(xtBool is_key, void *key_data)
+{
+	if (!is_key) {
+		XTTableHPtr	tab = (XTTableHPtr) key_data;
+
+		return xt_ht_hash(xt_last_2_names_of_path(tab->tab_name->ps_path));
+	}
+	return xt_ht_hash(xt_last_2_names_of_path((char *) key_data));
+}
+
+/* Hash table comparison (case insensitive): the key is a path string, the data
+ * is a table. They match if the last 2 names of both paths are equal, ignoring case.
+ */
+static xtBool tab_list_comp_ci(void *key, void *data)
+{
+	const char	*key_name = xt_last_2_names_of_path((char *) key);
+	const char	*tab_name = xt_last_2_names_of_path(((XTTableHPtr) data)->tab_name->ps_path);
+
+	return strcasecmp(key_name, tab_name) == 0;
+}
+
+/* Hash function (case insensitive) for the table list. If is_key is set then
+ * key_data is a path string, otherwise it is a table. In both cases
+ * the last 2 names of the path are hashed.
+ */
+static xtHashValue tab_list_hash_ci(xtBool is_key, void *key_data)
+{
+	if (!is_key) {
+		XTTableHPtr	tab = (XTTableHPtr) key_data;
+
+		return xt_ht_casehash(xt_last_2_names_of_path(tab->tab_name->ps_path));
+	}
+	return xt_ht_casehash(xt_last_2_names_of_path((char *) key_data));
+}
+
+/* Free function of the table list (the name list): called with the table
+ * that is being removed from the list.
+ */
+static void tab_list_free(XTThreadPtr self, void *data)
+{
+	XTTableHPtr		tab = (XTTableHPtr) data;
+	XTTableEntryPtr	entry;
+
+	/* When the table is removed from the name list, the reference
+	 * to the table in the ID list must be removed as well:
+	 */
+	entry = (XTTableEntryPtr) xt_sl_find(self, tab->tab_db->db_table_by_id, &tab->tab_id);
+	if (entry)
+		entry->te_table = NULL;
+
+	if (tab->tab_dic.dic_table)
+		tab->tab_dic.dic_table->removeReferences(self);
+
+	/* Release the reference held by the list: */
+	xt_heap_release(self, tab);
+}
+
+/* Release the record file and the row file of the table (if set),
+ * and clear the references to them.
+ */
+static void tab_close_mapped_files(XTThreadPtr self, XTTableHPtr tab)
+{
+	/* The record file: */
+	if (tab->tab_rec_file != NULL) {
+		xt_fs_release_file(self, tab->tab_rec_file);
+		tab->tab_rec_file = NULL;
+	}
+
+	/* The row file: */
+	if (tab->tab_row_file != NULL) {
+		xt_fs_release_file(self, tab->tab_row_file);
+		tab->tab_row_file = NULL;
+	}
+}
+
+/* Free all resources that belong to a table: the row locks, the index
+ * free list, the index, record and row files, the index header, the table
+ * name, the dictionary and (if tab_free_locks is set) the locks of the table.
+ * 'x' is the table.
+ */
+static void tab_finalize(XTThreadPtr self, void *x)
+{
+	XTTableHPtr	tab = (XTTableHPtr) x;
+
+	xt_exit_row_locks(&tab->tab_locks);
+
+	xt_xres_exit_tab(self, tab);
+
+	if (tab->tab_ind_free_list) {
+		XTIndFreeListPtr list, flist;
+		
+		/* Free each list in the chain, fetching the next pointer before the free: */
+		list = tab->tab_ind_free_list;
+		while (list) {
+			flist = list;
+			list = list->fl_next_list;
+			xt_free(self, flist);
+		}
+		tab->tab_ind_free_list = NULL;
+	}
+
+	if (tab->tab_ind_file) {
+		xt_fs_release_file(self, tab->tab_ind_file);
+		tab->tab_ind_file = NULL;
+	}
+	tab_close_mapped_files(self, tab);
+
+	if (tab->tab_index_head) {
+		xt_free(self, tab->tab_index_head);
+		tab->tab_index_head = NULL;
+	}
+
+#ifdef TRACE_TABLE_IDS
+	/* The cast must be applied to the ID, not to the pointer being tested
+	 * (a cast binds tighter than the conditional operator):
+	 */
+	PRINTF("%s: free TABLE: db=%d tab=%d %s\n", self->t_name, tab->tab_db ? (int) tab->tab_db->db_id : 0, (int) tab->tab_id, 
+		tab->tab_name ? xt_last_2_names_of_path(tab->tab_name->ps_path) : "?");
+#endif
+	if (tab->tab_name) {
+		xt_free(self, tab->tab_name);
+		tab->tab_name = NULL;
+	}
+	myxt_free_dictionary(self, &tab->tab_dic);
+	if (tab->tab_free_locks) {
+		tab->tab_seq.xt_op_seq_exit(self);
+		xt_spinlock_free(self, &tab->tab_ainc_lock);
+		xt_free_mutex(&tab->tab_rec_flush_lock);
+		xt_free_mutex(&tab->tab_ind_flush_lock);
+		xt_free_mutex(&tab->tab_ind_stat_lock);
+		xt_free_mutex(&tab->tab_dic_field_lock);
+		xt_free_mutex(&tab->tab_row_lock);
+		xt_free_mutex(&tab->tab_ind_lock);
+		xt_free_mutex(&tab->tab_rec_lock);
+		for (u_int i=0; i<XT_ROW_RWLOCKS; i++)
+			XT_TAB_ROW_FREE_LOCK(self, &tab->tab_row_rwlock[i]);
+	}
+}
+
+/* Called when a reference to a table is released ('x' is the table).
+ * Threads waiting for exclusive use of the table wait on the table list
+ * of the database, so that list is signalled (if it exists).
+ */
+static void tab_onrelease(XTThreadPtr self, void *x)
+{
+	XTDatabaseHPtr	db = ((XTTableHPtr) x)->tab_db;
+
+	if (db->db_tables)
+		xt_ht_signal(self, db->db_tables);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * PUBLIC METHODS
+ */
+
+/*
+ * Extract the table name from the name of a table file.
+ *
+ * Only the last component of file_name is used. For ".xtl" and ".xtr" files
+ * the extension, the digits in front of the extension, and the character
+ * in front of those digits are removed. For the other extensions listed in
+ * pbxt_extensions, only the extension is removed.
+ *
+ * This function sets the table name to "", if the file
+ * does not belong to XT (unknown or no extension).
+ *
+ * size is the size of the tab_name buffer. The name is truncated to
+ * size-1 characters if required, and always zero terminated.
+ *
+ * Returns a pointer (into file_name) to the part of the name that was
+ * not copied.
+ */
+xtPublic char *xt_tab_file_to_name(size_t size, char *tab_name, char *file_name)
+{
+	char	*cptr;
+	size_t	len;
+
+	file_name = xt_last_name_of_path(file_name);
+	len = strlen(file_name);
+	if (len == 0) {
+		/* An empty name: without this check cptr would point in front of
+		 * the string, and the length calculated below would wrap around,
+		 * so that memcpy() reads past the end of the string.
+		 */
+		if (size > 0)
+			tab_name[0] = 0;
+		return file_name;
+	}
+
+	/* Search backwards for the start of the extension: */
+	cptr = file_name + len - 1;
+	while (cptr > file_name && *cptr != '.')
+		cptr--;
+	if (cptr > file_name && *cptr == '.') {
+		if (strcmp(cptr, ".xtl") == 0 || strcmp(cptr, ".xtr") == 0) {
+			/* Skip the number in front of the extension.
+			 * The cast is required because isdigit() is only defined
+			 * for values in the range of unsigned char (and EOF).
+			 */
+			cptr--;
+			while (cptr > file_name && isdigit((unsigned char) *cptr))
+				cptr--;
+		}
+		else {
+			const char **ext = pbxt_extensions;
+			
+			while (*ext) {
+				if (strcmp(cptr, *ext) == 0)
+					goto ret_name;
+				ext++;
+			}
+			/* Not an XT extension, return an empty name: */
+			cptr = file_name;
+		}
+	}
+
+	ret_name:
+	len = cptr - file_name;
+	if (len > size-1)
+		len = size-1;
+
+	memcpy(tab_name, file_name, len);
+	tab_name[len] = 0;
+
+	/* Return a pointer to what was removed! */
+	return file_name + len;
+}
+
+/* Build the name of the row file of a table: "<name>-<tab_id>.xtr".
+ * The result is written to table_name.
+ * NOTE(review): sprintf() does not check the size of the output buffer,
+ * the caller must make sure that table_name is large enough.
+ */
+static void tab_get_row_file_name(char *table_name, char *name, xtTableID tab_id)
+{
+	sprintf(table_name, "%s-%lu.xtr", name, (u_long) tab_id);
+}
+
+/* Build the name of the data file of a table: "<name>.xtd".
+ * The result is written to table_name; the table ID is not used.
+ * NOTE(review): sprintf() does not check the size of the output buffer,
+ * the caller must make sure that table_name is large enough.
+ */
+static void tab_get_data_file_name(char *table_name, char *name, xtTableID XT_UNUSED(tab_id))
+{
+	sprintf(table_name, "%s.xtd", name);
+}
+
+/* Build the name of the index file of a table: "<name>.xti".
+ * The result is written to table_name; the table ID is not used.
+ * NOTE(review): sprintf() does not check the size of the output buffer,
+ * the caller must make sure that table_name is large enough.
+ */
+static void tab_get_index_file_name(char *table_name, char *name, xtTableID XT_UNUSED(tab_id))
+{
+	sprintf(table_name, "%s.xti", name);
+}
+
+/* Free function of the "table by ID" list: frees the table name
+ * of the entry, and clears the rest of the entry.
+ */
+static void tab_free_by_id(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
+{
+	XTTableEntryPtr	entry = (XTTableEntryPtr) item;
+	char			*name = entry->te_tab_name;
+
+	if (name) {
+		xt_free(self, name);
+		entry->te_tab_name = NULL;
+	}
+	entry->te_tab_id = 0;
+	entry->te_table = NULL;
+}
+
+/* Comparison function of the "table by ID" list: 'a' points to the
+ * table ID searched for, 'b' is an entry of the list. Returns -1, 0 or 1.
+ */
+static int tab_comp_by_id(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtTableID		key_id = *((xtTableID *) a);
+	XTTableEntryPtr	entry = (XTTableEntryPtr) b;
+
+	if (key_id == entry->te_tab_id)
+		return 0;
+	return key_id < entry->te_tab_id ? -1 : 1;
+}
+
+/* Free function of the table path list. The items of the list are
+ * pointers to the path structures, so the pointer stored in the item is freed.
+ */
+static void tab_free_path(XTThreadPtr self, void *XT_UNUSED(thunk), void *item)
+{
+	xt_free(self, *((XTTablePathPtr *) item));
+}
+
+/* Comparison function of the table path list: 'a' is the path string
+ * searched for, 'b' is an item of the list (a pointer to a path structure).
+ */
+static int tab_comp_path(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	XTTablePathPtr	item_path = *((XTTablePathPtr *) b);
+
+	return xt_tab_compare_paths((char *) a, item_path->tp_path);
+}
+
+/* Start an enumeration of the tables of a database. The enumeration
+ * scans the "*.xtr" files in each directory of the table path list,
+ * starting with the first path. If the path list is empty then no
+ * directory is opened.
+ */
+xtPublic void xt_describe_tables_init(XTThreadPtr self, XTDatabaseHPtr db, XTTableDescPtr td)
+{
+	td->td_db = db;
+	td->td_path_idx = 0;
+
+	if (xt_sl_get_size(db->db_table_paths) == 0) {
+		td->td_open_dir = NULL;
+		return;
+	}
+
+	/* Open the directory of the first path: */
+	td->td_tab_path = *((XTTablePathPtr *) xt_sl_item_at(db->db_table_paths, td->td_path_idx));
+	td->td_open_dir = xt_dir_open(self, td->td_tab_path->tp_path, "*.xtr");
+}
+
+/* Move to the next table of the enumeration started by xt_describe_tables_init().
+ *
+ * Returns TRUE if a table file was found. In this case td_file_name,
+ * td_tab_id and td_tab_name of the descriptor are set.
+ * Returns FALSE if no directory is open, or if the last directory of the
+ * table path list has been scanned completely. Note that in the second case
+ * the directory remains open until xt_describe_tables_exit() is called.
+ *
+ * If reading the directory throws, the descriptor is closed using
+ * xt_describe_tables_exit() before the exception is thrown again.
+ */
+xtPublic xtBool xt_describe_tables_next(XTThreadPtr self, XTTableDescPtr td)
+{
+	char	*tab_name;
+	xtBool	r = FALSE;
+
+	enter_();
+	retry:
+	if (!td->td_open_dir)
+		return_(FALSE);
+	try_(a) {
+		r = xt_dir_next(self, td->td_open_dir);
+	}
+	catch_(a) {
+		xt_describe_tables_exit(self, td);
+		throw_();
+	}
+	cont_(a);
+	if (!r) {
+		/* No more files in this directory, continue with the next path (if any): */
+		XTTablePathPtr *tp_ptr;
+
+		if (td->td_path_idx+1 >= xt_sl_get_size(td->td_db->db_table_paths))
+			return_(FALSE);
+
+		if (td->td_open_dir)
+			xt_dir_close(NULL, td->td_open_dir);
+		td->td_open_dir = NULL;
+
+		td->td_path_idx++;
+		tp_ptr = (XTTablePathPtr *) xt_sl_item_at(td->td_db->db_table_paths, td->td_path_idx);
+		td->td_tab_path = *tp_ptr;
+		td->td_open_dir = xt_dir_open(self, td->td_tab_path->tp_path, "*.xtr");
+		goto retry;
+	}
+
+	/* The table ID and the table name are both derived from the file name: */
+	tab_name = xt_dir_name(self, td->td_open_dir);
+	td->td_file_name = tab_name;
+	td->td_tab_id = (xtTableID) xt_file_name_to_id(tab_name);
+	xt_tab_file_to_name(XT_TABLE_NAME_SIZE, td->td_tab_name, tab_name);
+	return_(TRUE);
+}
+
+/* End a table enumeration: close the directory (if one is open), and
+ * clear the reference to the current table path.
+ */
+xtPublic void xt_describe_tables_exit(XTThreadPtr XT_UNUSED(self), XTTableDescPtr td)
+{
+	if (td->td_open_dir) {
+		xt_dir_close(NULL, td->td_open_dir);
+		td->td_open_dir = NULL;
+	}
+	td->td_tab_path = NULL;
+}
+
+/* Initialize the table information of a database:
+ * - create the table list (by name), the "table by ID" list, the table
+ *   path list and the error list,
+ * - load the table paths (from the location file if db_multi_path is set,
+ *   otherwise the main path of the database is the only path),
+ * - add an entry to the "table by ID" list for each table file found,
+ * - open all tables (see the comment below).
+ *
+ * If an exception is thrown, xt_tab_exit_db() is called for the database.
+ */
+xtPublic void xt_tab_init_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTTableDescRec		desc;
+	XTTableEntryRec		te_tab;
+	XTTableEntryPtr		te_ptr;
+	XTTablePathPtr		db_path;
+	char				pbuf[PATH_MAX];
+	int					len;
+	u_int				edx;
+
+	enter_();
+	pushr_(xt_tab_exit_db, db);
+	if (pbxt_ignore_case)
+		db->db_tables = xt_new_hashtable(self, tab_list_comp_ci, tab_list_hash_ci, tab_list_free, TRUE, TRUE);
+	else
+		db->db_tables = xt_new_hashtable(self, tab_list_comp, tab_list_hash, tab_list_free, TRUE, TRUE);
+	db->db_table_by_id = xt_new_sortedlist(self, sizeof(XTTableEntryRec), 20, 20, tab_comp_by_id, db, tab_free_by_id, FALSE, FALSE);
+	db->db_table_paths = xt_new_sortedlist(self, sizeof(XTTablePathPtr), 20, 20, tab_comp_path, db, tab_free_path, FALSE, FALSE);
+	db->db_error_list = xt_new_sortedlist(self, sizeof(XTTableError), 20, 20, tab_comp_tab_error, db, NULL, TRUE, FALSE);
+
+	if (db->db_multi_path) {
+		XTOpenFilePtr	of;
+		char			*buffer, *ptr, *path;
+
+		xt_strcpy(PATH_MAX, pbuf, db->db_main_path);
+		xt_add_location_file(PATH_MAX, pbuf);
+		if (xt_fs_exists(pbuf)) {
+			/* Read the complete location file, which contains one path per line: */
+			of = xt_open_file(self, pbuf, XT_FS_DEFAULT);
+			pushr_(xt_close_file, of);
+			len = (int) xt_seek_eof_file(self, of);
+			buffer = (char *) xt_malloc(self, len + 1);
+			pushr_(xt_free, buffer);
+			if (!xt_pread_file(of, 0, len, len, buffer, NULL, &self->st_statistics.st_x, self))
+				xt_throw(self);
+			buffer[len] = 0;
+			ptr = buffer;
+			while (*ptr) {
+				/* Ignore preceding space (the cast is required because
+				 * isspace() is only defined for unsigned char values):
+				 */
+				while (*ptr && isspace((unsigned char) *ptr))
+					ptr++;
+				path = ptr;
+				while (*ptr && *ptr != '\n' && *ptr != '\r') {
+#ifdef XT_WIN
+					/* Undo the conversion below: */
+					if (*ptr == '/')
+						*ptr = '\\';
+#endif
+					ptr++;
+				}
+				/* Lines that start with '#' are ignored: */
+				if (*path != '#' && ptr > path) {
+					len = (int) (ptr - path);
+					db_path = (XTTablePathPtr) xt_malloc(self, offsetof(XTTablePathRec, tp_path) + len + 1);
+					db_path->tp_tab_count = 0;
+					memcpy(db_path->tp_path, path, len);
+					db_path->tp_path[len] = 0;
+					xt_sl_insert(self, db->db_table_paths, db_path->tp_path, &db_path);
+				}
+				/* Skip the end-of-line character, but never step over the
+				 * terminator (the last line may not end with a newline, or
+				 * the file may end with white space):
+				 */
+				if (*ptr)
+					ptr++;
+			}
+			freer_(); // xt_free(buffer)
+			freer_(); // xt_close_file(of)
+		}
+	}
+	else {
+		len = (int) strlen(db->db_main_path);
+		db_path = (XTTablePathPtr) xt_malloc(self, offsetof(XTTablePathRec, tp_path) + len + 1);
+		db_path->tp_tab_count = 0;
+		strcpy(db_path->tp_path, db->db_main_path);
+		xt_sl_insert(self, db->db_table_paths, db_path->tp_path, &db_path);
+	}
+
+	/* Add an entry to the "table by ID" list for every table found,
+	 * and determine the highest table ID in use:
+	 */
+	xt_describe_tables_init(self, db, &desc);
+	pushr_(xt_describe_tables_exit, &desc);
+	while (xt_describe_tables_next(self, &desc)) {
+		te_tab.te_tab_id = desc.td_tab_id;
+
+		if (te_tab.te_tab_id > db->db_curr_tab_id)
+			db->db_curr_tab_id = te_tab.te_tab_id;
+
+		te_tab.te_tab_name = xt_dup_string(self, desc.td_tab_name);
+		te_tab.te_tab_path = desc.td_tab_path;
+		desc.td_tab_path->tp_tab_count++;
+		te_tab.te_table = NULL;
+		xt_sl_insert(self, db->db_table_by_id, &desc.td_tab_id, &te_tab);
+	}
+	freer_(); // xt_describe_tables_exit(&desc)
+
+	/*
+	 * When we open all tables, we ignore problems with foreign keys.
+	 * This must be done or we will not be able to load tables that
+	 * were created with foreign key checks off.
+	 */
+	self->st_ignore_fkeys = 1;
+	/* 
+	 * The purpose of this code is to ensure that all tables are opened and cached,
+	 * which is actually only required if tables have foreign key references.
+	 *
+	 * In other words, a side effect of this code is that FK references between tables
+	 * are registered, and checked.
+	 *
+	 * Unfortunately we don't know if a table is referenced by a FK, so we have to open
+	 * all tables.
+	 * 
+	 * Cannot open tables in the loop above because db->db_table_by_id which is built 
+	 * above is used by xt_use_table_no_lock() 
+	 *
+	 * {TABLE-STATS}
+	 * NOTE: The code also lead to the statistics failing to work because 
+	 * the tables were already open when the handler was opened.
+	 * Previously we only calculated statistics when a handler was opened
+	 * and the underlying table was also opened.
+	 */
+	xt_enum_tables_init(&edx);
+	while ((te_ptr = xt_enum_tables_next(self, db, &edx))) {
+		xt_strcpy(PATH_MAX, pbuf, te_ptr->te_tab_path->tp_path);
+		xt_add_dir_char(PATH_MAX, pbuf);
+		xt_strcat(PATH_MAX, pbuf, te_ptr->te_tab_name);
+		try_(a) {
+			xt_heap_release(self, xt_use_table_no_lock(self, db, (XTPathStrPtr) pbuf, FALSE, FALSE, NULL));
+		}
+		catch_(a) {
+			/* ignore errors, because we are just loading all
+			 * the tables that we can...
+			 */
+			xt_log_and_clear_warning(self);
+		}
+		cont_(a);
+	}
+	self->st_ignore_fkeys = 0;
+
+	popr_(); // Discard xt_tab_exit_db(db)
+	exit_();
+}
+
+/* Write the table path list of the database to the location file
+ * (one path per line). If the path list is empty, the location file
+ * is deleted instead.
+ */
+static void tab_save_table_paths(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTStringBufferRec	buffer;
+	XTOpenFilePtr		of;
+	char				path[PATH_MAX];
+	size_t				text_len;
+
+	memset(&buffer, 0, sizeof(buffer));
+
+	xt_strcpy(PATH_MAX, path, db->db_main_path);
+	xt_add_location_file(PATH_MAX, path);
+
+	if (!xt_sl_get_size(db->db_table_paths)) {
+		/* No paths, no location file: */
+		xt_fs_delete(NULL, path);
+		return;
+	}
+
+	pushr_(xt_sb_free, &buffer);
+	for (u_int i=0; i<xt_sl_get_size(db->db_table_paths); i++) {
+		XTTablePathPtr	entry = *((XTTablePathPtr *) xt_sl_item_at(db->db_table_paths, i));
+
+		xt_sb_concat(self, &buffer, entry->tp_path);
+		xt_sb_concat(self, &buffer, "\n");
+	}
+
+#ifdef XT_WIN
+	/* To make the location file cross-platform (at least
+	 * as long as relative paths are used) we replace all '\' 
+	 * with '/': */
+	for (char *ptr = buffer.sb_cstring; *ptr; ptr++) {
+		if (*ptr == '\\')
+			*ptr = '/';
+	}
+#endif
+
+	of = xt_open_file(self, path, XT_FS_CREATE | XT_FS_MAKE_PATH);
+	pushr_(xt_close_file, of);
+	/* Write the text, and cut the file off at the end of the text: */
+	text_len = strlen(buffer.sb_cstring);
+	if (!xt_pwrite_file(of, 0, text_len, buffer.sb_cstring, &self->st_statistics.st_x, self))
+		xt_throw(self);
+	xt_set_eof_file(self, of, text_len);
+	freer_(); // xt_close_file(of)
+
+	freer_(); // xt_sb_free(&buffer);
+}
+
+/* Return the table path entry for the directory of the given table,
+ * and increment the table count of the entry.
+ * If the directory is not yet in the path list of the database, a new
+ * entry is added. In this case, if save_it is set, the path list is saved,
+ * and the system tables are created if this is the first path in the list.
+ */
+static XTTablePathPtr tab_get_table_path(XTThreadPtr self, XTDatabaseHPtr db, XTPathStrPtr tab_name, xtBool save_it)
+{
+	XTTablePathPtr	*found, tab_path;
+	char			path[PATH_MAX];
+	size_t			path_len;
+
+	/* The directory is the table path without the table name: */
+	xt_strcpy(PATH_MAX, path, tab_name->ps_path);
+	xt_remove_last_name_of_path(path);
+	xt_remove_dir_char(path);
+
+	found = (XTTablePathPtr *) xt_sl_find(self, db->db_table_paths, path);
+	if (found) {
+		tab_path = *found;
+		tab_path->tp_tab_count++;
+		return tab_path;
+	}
+
+	/* A new path (the copy includes the terminator): */
+	path_len = strlen(path);
+	tab_path = (XTTablePathPtr) xt_malloc(self, offsetof(XTTablePathRec, tp_path) + path_len + 1);
+	tab_path->tp_tab_count = 0;
+	memcpy(tab_path->tp_path, path, path_len + 1);
+	xt_sl_insert(self, db->db_table_paths, tab_path->tp_path, &tab_path);
+	if (save_it) {
+		tab_save_table_paths(self, db);
+		if (xt_sl_get_size(db->db_table_paths) == 1)
+			XTSystemTableShare::createSystemTables(self, db);
+	}
+
+	tab_path->tp_tab_count++;
+	return tab_path;
+}
+
+/* Decrement the table count of a table path. When the count reaches zero
+ * the path is removed from the path list of the database, and the path
+ * list is saved. Nothing is done if the count is already zero.
+ */
+static void tab_remove_table_path(XTThreadPtr self, XTDatabaseHPtr db, XTTablePathPtr tab_path)
+{
+	if (!(tab_path->tp_tab_count > 0))
+		return;
+
+	tab_path->tp_tab_count--;
+	if (tab_path->tp_tab_count != 0)
+		return;
+
+	/* That was the last table that used this path: */
+	xt_sl_delete(self, db->db_table_paths, tab_path->tp_path);
+	tab_save_table_paths(self, db);
+}
+
+/* As tab_remove_table_path(), but for the current database of the thread. */
+static void tab_free_table_path(XTThreadPtr self, XTTablePathPtr tab_path)
+{
+	tab_remove_table_path(self, self->st_database, tab_path);
+}
+
+/* Free the table information of a database: the table list, the
+ * "table by ID" list, the table path list and the error list.
+ * Each reference is cleared after the list has been freed, so the
+ * function may be called more than once.
+ */
+xtPublic void xt_tab_exit_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	/* The tables, by name: */
+	if (db->db_tables != NULL) {
+		xt_free_hashtable(self, db->db_tables);
+		db->db_tables = NULL;
+	}
+
+	/* The tables, by ID: */
+	if (db->db_table_by_id != NULL) {
+		xt_free_sortedlist(self, db->db_table_by_id);
+		db->db_table_by_id = NULL;
+	}
+
+	/* The table paths: */
+	if (db->db_table_paths != NULL) {
+		xt_free_sortedlist(self, db->db_table_paths);
+		db->db_table_paths = NULL;
+	}
+
+	/* The corrupt records reported: */
+	if (db->db_error_list != NULL) {
+		xt_free_sortedlist(self, db->db_error_list);
+		db->db_error_list = NULL;
+	}
+}
+
+/* Check a table. This is currently a placeholder: the function does
+ * nothing, and the table is not used.
+ */
+static void tab_check_table(XTThreadPtr self, XTTableHPtr XT_UNUSED(tab))
+{
+	(void) self;
+	enter_();
+	exit_();
+}
+
+/* Check all tables of the current database of the thread.
+ * Each table in the "table by ID" list is opened, checked using
+ * tab_check_table(), and released again.
+ * If an exception is thrown, the table currently in use (if any) is released
+ * before the exception is thrown again.
+ */
+xtPublic void xt_check_tables(XTThreadPtr self)
+{
+	u_int					edx;
+	XTTableEntryPtr			te_ptr;
+	/* Must be initialized: the catch block below tests this pointer, and an
+	 * exception may be thrown before the first table has been opened.
+	 */
+	volatile XTTableHPtr	tab = NULL;
+	char					path[PATH_MAX];
+
+	enter_();
+	xt_logf(XT_INFO, "Check %s: Table...\n", self->st_database->db_main_path);
+	xt_enum_tables_init(&edx);
+	try_(a) {
+		for (;;) {
+			/* The table list is locked while the next entry is fetched: */
+			xt_ht_lock(self, self->st_database->db_tables);
+			pushr_(xt_ht_unlock, self->st_database->db_tables);
+			te_ptr = xt_enum_tables_next(self, self->st_database, &edx);
+			freer_(); // xt_ht_unlock(db->db_tables)
+			if (!te_ptr)
+				break;
+			xt_strcpy(PATH_MAX, path, te_ptr->te_tab_path->tp_path);
+			xt_add_dir_char(PATH_MAX, path);
+			xt_strcat(PATH_MAX, path, te_ptr->te_tab_name);
+			tab = xt_use_table(self, (XTPathStrPtr) path, FALSE, FALSE);
+			tab_check_table(self, tab);
+			xt_heap_release(self, tab);
+			tab = NULL;
+		}
+	}
+	catch_(a) {
+		if (tab)
+			xt_heap_release(self, tab);
+		throw_();
+	}
+	cont_(a);
+	exit_();
+}
+
+/* Returns TRUE if the "table by ID" list of the database contains
+ * at least one table.
+ */
+xtPublic xtBool xt_table_exists(XTDatabaseHPtr db)
+{
+	return xt_sl_get_size(db->db_table_by_id) > 0;
+}
+
+/*
+ * Enumerate all tables in the current database.
+ */
+
+/* Start an enumeration of the tables: edx is the enumeration index,
+ * which is set to the first entry.
+ */
+xtPublic void xt_enum_tables_init(u_int *edx)
+{
+	*edx = 0;
+}
+
+/* Return the entry of the "table by ID" list at the enumeration index,
+ * and increment the index. Returns NULL (and leaves the index unchanged)
+ * if there are no more entries.
+ */
+xtPublic XTTableEntryPtr xt_enum_tables_next(XTThreadPtr XT_UNUSED(self), XTDatabaseHPtr db, u_int *edx)
+{
+	u_int	idx = *edx;
+
+	if (idx >= xt_sl_get_size(db->db_table_by_id))
+		return NULL;
+	*edx = idx + 1;
+	return (XTTableEntryPtr) xt_sl_item_at(db->db_table_by_id, idx);
+}
+
+/* Start an enumeration of the files of a table. The table is identified
+ * by its path and ID. Note that the path is referenced by the enumeration
+ * structure (it is not copied).
+ */
+xtPublic void xt_enum_files_of_tables_init(XTPathStrPtr tab_name, xtTableID tab_id, XTFilesOfTablePtr ft)
+{
+	ft->ft_state = 0;
+	ft->ft_tab_name = tab_name;
+	ft->ft_tab_id = tab_id;
+}
+
+/* Move to the next existing file of the table. The candidates are, in
+ * this order, the row file, the data file and the index file. Files
+ * that do not exist are skipped.
+ * Returns TRUE if a file was found, in which case ft_file_path is the path
+ * of the file. Returns FAILED if there are no more files.
+ */
+xtPublic xtBool xt_enum_files_of_tables_next(XTFilesOfTablePtr ft)
+{
+	char file_name[XT_MAX_TABLE_FILE_NAME_SIZE];
+
+	do {
+		/* The state determines the next candidate: */
+		switch (ft->ft_state) {
+			case 0:
+				tab_get_row_file_name(file_name, xt_last_name_of_path(ft->ft_tab_name->ps_path), ft->ft_tab_id);
+				break;
+			case 1:
+				tab_get_data_file_name(file_name, xt_last_name_of_path(ft->ft_tab_name->ps_path), ft->ft_tab_id);
+				break;
+			case 2:
+				tab_get_index_file_name(file_name, xt_last_name_of_path(ft->ft_tab_name->ps_path), ft->ft_tab_id);
+				break;
+			default:
+				return FAILED;
+		}
+		ft->ft_state++;
+
+		/* The file is in the same directory as the table: */
+		xt_strcpy(PATH_MAX, ft->ft_file_path, ft->ft_tab_name->ps_path);
+		xt_remove_last_name_of_path(ft->ft_file_path);
+		xt_strcat(PATH_MAX, ft->ft_file_path, file_name);
+	} while (!xt_fs_exists(ft->ft_file_path));
+
+	return TRUE;
+}
+
+/* Search the "table by ID" list of the database for a table with the
+ * given name (the names are compared using xt_tab_compare_names()).
+ * Returns TRUE and sets *tab_id if the table was found, otherwise FALSE.
+ */
+static xtBool tab_find_table(XTThreadPtr self, XTDatabaseHPtr db, XTPathStrPtr name, xtTableID *tab_id)
+{
+	u_int			edx;
+	XTTableEntryPtr	entry;
+	char			entry_path[PATH_MAX];
+
+	xt_enum_tables_init(&edx);
+	for (entry = xt_enum_tables_next(self, db, &edx); entry; entry = xt_enum_tables_next(self, db, &edx)) {
+		/* Build the full path of the table of this entry: */
+		xt_strcpy(PATH_MAX, entry_path, entry->te_tab_path->tp_path);
+		xt_add_dir_char(PATH_MAX, entry_path);
+		xt_strcat(PATH_MAX, entry_path, entry->te_tab_name);
+		if (xt_tab_compare_names(entry_path, name->ps_path) != 0)
+			continue;
+		*tab_id = entry->te_tab_id;
+		return TRUE;
+	}
+	return FALSE;
+}
+
+/* Disable the indexes of a table: ind_error (one of the XT_INDEX_* values)
+ * is stored as the reason, and the table is marked as "repair pending".
+ */
+xtPublic void xt_tab_disable_index(XTTableHPtr tab, u_int ind_error)
+{
+	tab->tab_dic.dic_disable_index = ind_error;
+	xt_tab_set_table_repair_pending(tab);
+}
+
+/* Register the error that corresponds to the reason why the indexes
+ * of the table are disabled. Nothing is done if the indexes are OK.
+ */
+xtPublic void xt_tab_set_index_error(XTTableHPtr tab)
+{
+	switch (tab->tab_dic.dic_disable_index) {
+		case XT_INDEX_OK:
+			break;
+		case XT_INDEX_TOO_OLD:
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_OLD_VERSION, tab->tab_name);
+			break;
+		case XT_INDEX_TOO_NEW:
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_NEW_VERSION, tab->tab_name);
+			break;
+		case XT_INDEX_BAD_BLOCK: {
+			/* The error includes the index page size of the table: */
+			char page_size[40];
+
+			sprintf(page_size, "%d", (int) tab->tab_index_page_size);
+			xt_register_i2xterr(XT_REG_CONTEXT, XT_ERR_BAD_IND_BLOCK_SIZE, xt_last_name_of_path(tab->tab_name->ps_path), page_size);
+			break;
+		}
+		case XT_INDEX_CORRUPTED:
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_CORRUPTED, tab->tab_name);
+			break;
+		case XT_INDEX_MISSING:
+			xt_register_taberr(XT_REG_CONTEXT, XT_ERR_INDEX_MISSING, tab->tab_name);
+			break;
+	}
+}
+
+/* Load the header of the index file of a table.
+ *
+ * A new index header buffer is allocated for the table. If 'file' is given,
+ * the header is read from the file, and the table version, the index
+ * version, the header size and the page size are determined. An exception
+ * is thrown if the table version is not supported. If the index version or
+ * the index page size cannot be handled, the indexes are disabled.
+ * If 'file' is NULL, the indexes are disabled with XT_INDEX_MISSING.
+ *
+ * If the indexes are disabled, the error is logged and the index
+ * information is reset to that of empty indexes. Otherwise the index
+ * EOF, the free list and the root node of each index are loaded from the header.
+ *
+ * NOTE(review): the format offset read from the file is used without
+ * checking that it lies within the header buffer.
+ */
+static void tab_load_index_header(XTThreadPtr self, XTTableHPtr tab, XTOpenFilePtr file, XTPathStrPtr table_name)
+{
+	XT_NODE_TEMP;
+	XTIndexPtr			*ind;
+	xtWord1				*data;
+	XTIndexFormatDPtr	index_fmt;
+
+	/* Load the pointers: */
+	if (tab->tab_index_head) {
+		xt_free_ns(tab->tab_index_head);
+		/* Clear the reference before allocating: if the allocation below
+		 * throws, the table must not be left with a pointer to the freed
+		 * buffer (tab_finalize() frees tab_index_head if it is not NULL).
+		 */
+		tab->tab_index_head = NULL;
+	}
+	tab->tab_index_head = (XTIndexHeadDPtr) xt_calloc(self, XT_INDEX_HEAD_SIZE);
+
+	if (file) {
+		if (!xt_pread_file(file, 0, XT_INDEX_HEAD_SIZE, 0, tab->tab_index_head, NULL, &self->st_statistics.st_ind, self))
+			xt_throw(self);
+
+		tab->tab_index_format_offset = XT_GET_DISK_4(tab->tab_index_head->tp_format_offset_4);
+		index_fmt = (XTIndexFormatDPtr) (((xtWord1 *) tab->tab_index_head) + tab->tab_index_format_offset);
+
+		/* If the table version is less than or equal to an incompatible (unsupported
+		 * version), or greater than the current version, then we cannot open this table
+		 */
+		if (XT_GET_DISK_2(index_fmt->if_tab_version_2) <= XT_TAB_INCOMPATIBLE_VERSION ||
+			XT_GET_DISK_2(index_fmt->if_tab_version_2) > XT_TAB_CURRENT_VERSION) {
+			switch (XT_GET_DISK_2(index_fmt->if_tab_version_2)) {
+				case 4: 
+					xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_UPGRADE_TABLE, table_name, "0.9.91 Beta");
+					break;
+				case 3: 
+					xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_UPGRADE_TABLE, table_name, "0.9.85 Beta");
+					break;
+				default:
+					xt_throw_taberr(XT_CONTEXT, XT_ERR_BAD_TABLE_VERSION, table_name);
+					break;
+			}
+			return;
+		}
+
+		tab->tab_dic.dic_index_ver = XT_GET_DISK_2(index_fmt->if_ind_version_2);
+		tab->tab_dic.dic_disable_index = XT_INDEX_OK;
+
+		/* Index version 1 has a fixed header and page size of 16K: */
+		if (tab->tab_dic.dic_index_ver == 1) {
+			tab->tab_index_header_size = 1024 * 16;
+			tab->tab_index_page_size = 1024 * 16;
+		}
+		else {
+			tab->tab_index_header_size = XT_GET_DISK_4(tab->tab_index_head->tp_header_size_4);
+			tab->tab_index_page_size = XT_GET_DISK_4(index_fmt->if_page_size_4);
+		}	
+
+#ifdef XT_USE_LAZY_DELETE
+		if (tab->tab_dic.dic_index_ver <= XT_IND_NO_LAZY_DELETE)
+			tab->tab_dic.dic_no_lazy_delete = TRUE;
+		else
+			tab->tab_dic.dic_no_lazy_delete = FALSE;
+#else
+		tab->tab_dic.dic_no_lazy_delete = TRUE;
+#endif
+
+		/* Incorrect version of index is handled by allowing a sequential scan, but no index access.
+		 * Recovery with the wrong index type will not recover the indexes, a REPAIR TABLE
+		 * will be required!
+		 */
+		if (tab->tab_dic.dic_index_ver != XT_IND_CURRENT_VERSION) {
+			switch (tab->tab_dic.dic_index_ver) {
+				case XT_IND_NO_LAZY_DELETE:
+				case XT_IND_LAZY_DELETE_OK:
+					/* I can handle this type of index. */
+					break;
+				default:
+					if (tab->tab_dic.dic_index_ver < XT_IND_CURRENT_VERSION)
+						xt_tab_disable_index(tab, XT_INDEX_TOO_OLD);
+					else
+						xt_tab_disable_index(tab, XT_INDEX_TOO_NEW);
+					break;
+			}
+		}
+		else if (tab->tab_index_page_size != XT_INDEX_PAGE_SIZE)
+			xt_tab_disable_index(tab, XT_INDEX_BAD_BLOCK);
+	}
+	else {
+		/* No index file: */
+		memset(tab->tab_index_head, 0, XT_INDEX_HEAD_SIZE);
+		xt_tab_disable_index(tab, XT_INDEX_MISSING);
+		tab->tab_index_header_size = XT_INDEX_HEAD_SIZE;
+		tab->tab_index_page_size = XT_INDEX_PAGE_SIZE;
+		tab->tab_dic.dic_index_ver = 0;
+		tab->tab_index_format_offset = 0;
+	}
+
+	
+	/* Log the reason why the indexes have been disabled: */
+	if (tab->tab_dic.dic_disable_index) {
+		xt_tab_set_index_error(tab);
+		xt_log_and_clear_exception_ns();
+	}
+
+	if (tab->tab_dic.dic_disable_index) {
+		/* Reset, as if we have empty indexes.
+		 * Flush will wipe things out, of course.
+		 * REPAIR TABLE will be required...
+		 */
+		XT_NODE_ID(tab->tab_ind_eof) = 1;
+		XT_NODE_ID(tab->tab_ind_free) = 0;
+
+		ind = tab->tab_dic.dic_keys;
+		for (u_int i=0; i<tab->tab_dic.dic_key_count; i++, ind++)
+			XT_NODE_ID((*ind)->mi_root) = 0;
+	}
+	else {
+		XT_NODE_ID(tab->tab_ind_eof) = (xtIndexNodeID) XT_GET_DISK_6(tab->tab_index_head->tp_ind_eof_6);
+		XT_NODE_ID(tab->tab_ind_free) = (xtIndexNodeID) XT_GET_DISK_6(tab->tab_index_head->tp_ind_free_6);
+
+		/* The root node references of the indexes are stored one after the other: */
+		data = tab->tab_index_head->tp_data;
+		ind = tab->tab_dic.dic_keys;
+		for (u_int i=0; i<tab->tab_dic.dic_key_count; i++, ind++) {
+			(*ind)->mi_root = XT_GET_NODE_REF(tab, data);
+			data += XT_NODE_REF_SIZE;
+		}
+	}
+}
+
+/*
+ * Load the table format record from the given (data) file.
+ *
+ * The first 4 bytes of the file are read as the offset of the format
+ * record, which is returned in *ret_format_offset. The header size stored
+ * in the format record is returned in *ret_head_size, and the record size,
+ * the fixed-length flag, the table flags and the minimum auto-increment
+ * value are copied into 'dic'.
+ *
+ * If the format record is followed by a definition string, the string is
+ * passed to xt_ri_create_table() to build dic->dic_table. Otherwise
+ * dic->dic_table is built from dic->dic_my_table alone.
+ *
+ * Throws on a read error, or if the table version is not supported.
+ */
+static void tab_load_table_format(XTThreadPtr self, XTOpenFilePtr file, XTPathStrPtr table_name, size_t *ret_format_offset, size_t *ret_head_size, XTDictionaryPtr dic)
+{
+	XTDiskValue4		size_buf;
+	size_t				head_size;
+	XTTableFormatDRec	tab_fmt;
+	size_t				fmt_size;
+	u_int				tab_version;
+
+	if (!xt_pread_file(file, 0, 4, 4, &size_buf, NULL, &self->st_statistics.st_rec, self))
+		xt_throw(self);
+
+	head_size = XT_GET_DISK_4(size_buf);
+	*ret_format_offset = head_size;
+
+	/* The read below is allowed to return fewer bytes than requested (only
+	 * the bytes up to and including the version are required), so clear
+	 * the buffer first. Fields that were not read then decode as zero
+	 * instead of as uninitialised stack data.
+	 */
+	memset(&tab_fmt, 0, sizeof(tab_fmt));
+
+	/* Load the table format information: */
+	if (!xt_pread_file(file, head_size, offsetof(XTTableFormatDRec, tf_definition), offsetof(XTTableFormatDRec, tf_tab_version_2) + 2, &tab_fmt, NULL, &self->st_statistics.st_rec, self))
+		xt_throw(self);
+
+	/* If the table version is less than or equal to an incompatible (unsupported
+	 * version), or greater than the current version, then we cannot open this table
+	 */
+	tab_version = XT_GET_DISK_2(tab_fmt.tf_tab_version_2);
+	if (tab_version <= XT_TAB_INCOMPATIBLE_VERSION ||
+		tab_version > XT_TAB_CURRENT_VERSION) {
+		switch (tab_version) {
+			case 4:
+				xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_UPGRADE_TABLE, table_name, "0.9.91 Beta");
+				break;
+			case 3:
+				xt_throw_tabcolerr(XT_CONTEXT, XT_ERR_UPGRADE_TABLE, table_name, "0.9.85 Beta");
+				break;
+			default:
+				xt_throw_taberr(XT_CONTEXT, XT_ERR_BAD_TABLE_VERSION, table_name);
+				break;
+		}
+		return;
+	}
+
+	fmt_size = XT_GET_DISK_4(tab_fmt.tf_format_size_4);
+	*ret_head_size = XT_GET_DISK_4(tab_fmt.tf_tab_head_size_4);
+	dic->dic_rec_size = XT_GET_DISK_4(tab_fmt.tf_rec_size_4);
+	dic->dic_rec_fixed = XT_GET_DISK_1(tab_fmt.tf_rec_fixed_1);
+	dic->dic_tab_flags = XT_GET_DISK_2(tab_fmt.tf_tab_flags_2);
+	dic->dic_min_auto_inc = XT_GET_DISK_8(tab_fmt.tf_min_auto_inc_8);
+	if (fmt_size > offsetof(XTTableFormatDRec, tf_definition)) {
+		/* A definition string follows the fixed part of the format record. */
+		size_t	def_size = fmt_size - offsetof(XTTableFormatDRec, tf_definition);
+		char	*def_sql;
+
+		pushsr_(def_sql, xt_free, (char *) xt_malloc(self, def_size));
+		if (!xt_pread_file(file, head_size+offsetof(XTTableFormatDRec, tf_definition), def_size, def_size, def_sql, NULL, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+		/* The string is handed on without a length, so make sure it is
+		 * terminated even if the file content is damaged. def_size is at
+		 * least 1 here.
+		 */
+		def_sql[def_size - 1] = 0;
+		dic->dic_table = xt_ri_create_table(self, false, table_name, def_sql, myxt_create_table_from_table(self, dic->dic_my_table));
+		freer_(); // xt_free(def_sql)
+	}
+	else
+		dic->dic_table = myxt_create_table_from_table(self, dic->dic_my_table);
+}
+
+/*
+ * Read the fixed-size table header from the start of the given file and
+ * copy the operation sequence number, and the row and record free-list
+ * head, EOF and free-count values, into the tab_head_* fields of 'tab'.
+ * Throws if the header cannot be read.
+ */
+static void tab_load_table_header(XTThreadPtr self, XTTableHPtr tab, XTOpenFilePtr file)
+{
+	XTTableHeadDRec	disk_head;
+
+	if (!xt_pread_file(file, 0, sizeof(XTTableHeadDRec), sizeof(XTTableHeadDRec), (xtWord1 *) &disk_head, NULL, &self->st_statistics.st_rec, self))
+		xt_throw(self);
+
+	tab->tab_head_op_seq = XT_GET_DISK_4(disk_head.th_op_seq_4);
+
+	/* Row values: */
+	tab->tab_head_row_free_id = (xtRowID) XT_GET_DISK_6(disk_head.th_row_free_6);
+	tab->tab_head_row_eof_id = (xtRowID) XT_GET_DISK_6(disk_head.th_row_eof_6);
+	tab->tab_head_row_fnum = (xtWord4) XT_GET_DISK_6(disk_head.th_row_fnum_6);
+
+	/* Record values: */
+	tab->tab_head_rec_free_id = (xtRecordID) XT_GET_DISK_6(disk_head.th_rec_free_6);
+	tab->tab_head_rec_eof_id = (xtRecordID) XT_GET_DISK_6(disk_head.th_rec_eof_6);
+	tab->tab_head_rec_fnum = (xtWord4) XT_GET_DISK_6(disk_head.th_rec_fnum_6);
+}
+
+/*
+ * Encode the tab_head_* values of the table opened by 'ot' into the
+ * on-disk header record 'rec_head'. Nothing is written to a file here.
+ */
+xtPublic void xt_tab_store_header(XTOpenTablePtr ot, XTTableHeadDPtr rec_head)
+{
+	XTTableHPtr src = ot->ot_table;
+
+	XT_SET_DISK_4(rec_head->th_op_seq_4, src->tab_head_op_seq);
+
+	/* Row values: */
+	XT_SET_DISK_6(rec_head->th_row_free_6, src->tab_head_row_free_id);
+	XT_SET_DISK_6(rec_head->th_row_eof_6, src->tab_head_row_eof_id);
+	XT_SET_DISK_6(rec_head->th_row_fnum_6, src->tab_head_row_fnum);
+
+	/* Record values: */
+	XT_SET_DISK_6(rec_head->th_rec_free_6, src->tab_head_rec_free_id);
+	XT_SET_DISK_6(rec_head->th_rec_eof_6, src->tab_head_rec_eof_id);
+	XT_SET_DISK_6(rec_head->th_rec_fnum_6, src->tab_head_rec_fnum);
+}
+
+/*
+ * Write 40 bytes of 'rec_head', starting at the th_op_seq_4 field, to the
+ * same offset in the record file of 'ot', and then flush that file.
+ * Returns OK on success, or FAILED if the write or the flush fails.
+ */
+xtPublic xtBool xt_tab_write_header(XTOpenTablePtr ot, XTTableHeadDPtr rec_head, struct XTThread *thread)
+{
+	if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, offsetof(XTTableHeadDRec, th_op_seq_4), 40, (xtWord1 *) rec_head->th_op_seq_4, &thread->st_statistics.st_rec, thread) ||
+		!XT_FLUSH_RR_FILE(ot->ot_rec_file, &thread->st_statistics.st_rec, thread))
+		return FAILED;
+
+	return OK;
+}
+
+/*
+ * Write the dictionary's minimum auto-increment value into the
+ * tf_min_auto_inc_8 field of the table format record in the record file of
+ * 'ot', and then flush that file.
+ * Returns OK on success, or FAILED if the write or the flush fails.
+ */
+xtPublic xtBool xt_tab_write_min_auto_inc(XTOpenTablePtr ot)
+{
+	xtWord1		disk_val[8];
+	off_t		field_pos;
+
+	field_pos = ot->ot_table->tab_table_format_offset + offsetof(XTTableFormatDRec, tf_min_auto_inc_8);
+	XT_SET_DISK_8(disk_val, ot->ot_table->tab_dic.dic_min_auto_inc);
+
+	if (!XT_PWRITE_RR_FILE(ot->ot_rec_file, field_pos, 8, disk_val, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread) ||
+		!XT_FLUSH_RR_FILE(ot->ot_rec_file, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread))
+		return FAILED;
+
+	return OK;
+}
+
+/* Helper that removes a table from the open tables hash on an exception.
+ * It was intended for tab_new_handle() below, and is only compiled when
+ * NO_LONGER_REQ is defined.
+ */
+ #ifdef NO_LONGER_REQ
+static void xt_del_from_db_tables_ht(XTThreadPtr self, XTTableHPtr tab)
+{
+	XTDatabaseHPtr	owner_db = tab->tab_db;
+	xtTableID		id = tab->tab_id;
+	XTTableEntryPtr	entry;
+
+	/* The delete takes the table name, not the table handle: */
+	xt_ht_del(self, owner_db->db_tables, tab->tab_name);
+
+	/* A table removed from the name list must also be unhooked
+	 * from its entry in the ID list:
+	 */
+	entry = (XTTableEntryPtr) xt_sl_find(self, owner_db->db_table_by_id, &id);
+	if (entry)
+		entry->te_table = NULL;
+}
+#endif
+
+/*
+ * Create a new table handle (i.e. open a table).
+ *
+ * The handle is allocated on the heap, its dictionary is either taken over
+ * from 'dic' or loaded from 'tab_path', the row, data and index file names
+ * are derived from the table path and 'tab_id', and the index header, the
+ * table format and the table header are loaded from disk. The finished
+ * handle is added to db->db_tables, and hooked into the matching entry of
+ * db->db_table_by_id (if there is one).
+ *
+ * Returns:
+ *   XT_TAB_OK            - the table was opened, *r_tab is set.
+ *   XT_TAB_NO_DICTIONARY - no 'dic' was given, and myxt_load_dictionary()
+ *                          failed. *r_tab is not set.
+ *   XT_TAB_NOT_FOUND     - the data file could not be opened (it is only
+ *                          opened with XT_FS_MISSING_OK if 'missing_ok'
+ *                          is set). *r_tab is not set.
+ */
+static int tab_new_handle(XTThreadPtr self, XTTableHPtr *r_tab, XTDatabaseHPtr db, xtTableID tab_id, XTPathStrPtr tab_path, xtBool missing_ok, XTDictionaryPtr dic)
+{
+	char			path[PATH_MAX];
+	XTTableHPtr		tab;
+	char			file_name[XT_MAX_TABLE_FILE_NAME_SIZE];
+	XTOpenFilePtr	of_rec, of_ind;
+	XTTableEntryPtr	te_ptr;
+	size_t			tab_format_offset;
+	size_t			tab_head_size;
+
+	enter_();
+
+	tab = (XTTableHPtr) xt_heap_new(self, sizeof(XTTableHRec), tab_finalize);
+	pushr_(xt_heap_release, tab); // Released again on every early exit below
+
+	tab->tab_name = (XTPathStrPtr) xt_dup_string(self, tab_path->ps_path);
+	tab->tab_db = db;
+	tab->tab_id = tab_id;
+#ifdef TRACE_TABLE_IDS
+	PRINTF("%s: allocated TABLE: db=%d tab=%d %s\n", self->t_name, (int) db->db_id, (int) tab->tab_id, xt_last_2_names_of_path(tab->tab_name->ps_path));
+#endif
+
+	if (dic) {
+		myxt_move_dictionary(&tab->tab_dic, dic); // Use the dictionary supplied by the caller
+		myxt_setup_dictionary(self, &tab->tab_dic);
+	}
+	else {
+		if (!myxt_load_dictionary(self, &tab->tab_dic, db, tab_path)) {
+			freer_(); // xt_heap_release(tab)
+			return_(XT_TAB_NO_DICTIONARY);
+		}
+	}
+
+	tab->tab_seq.xt_op_seq_init(self);
+	xt_spinlock_init_with_autoname(self, &tab->tab_ainc_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_rec_flush_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_ind_flush_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_ind_stat_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_dic_field_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_row_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_ind_lock);
+	xt_init_mutex_with_autoname(self, &tab->tab_rec_lock);
+	for (u_int i=0; i<XT_ROW_RWLOCKS; i++)
+		XT_TAB_ROW_INIT_LOCK(self, &tab->tab_row_rwlock[i]);
+	tab->tab_free_locks = TRUE; // NOTE(review): presumably tells tab_finalize that the locks above must be freed - confirm
+
+	xt_strcpy(PATH_MAX, path, tab_path->ps_path); // Build the row file path: table directory + row file name
+	xt_remove_last_name_of_path(path);
+	tab_get_row_file_name(file_name, xt_last_name_of_path(tab_path->ps_path), tab_id);
+	xt_strcat(PATH_MAX, path, file_name);
+	tab->tab_row_file = xt_fs_get_file(self, path);
+
+	xt_remove_last_name_of_path(path); // Same directory, now the data file name
+	tab_get_data_file_name(file_name, xt_last_name_of_path(tab_path->ps_path), tab_id);
+	xt_strcat(PATH_MAX, path, file_name);
+	tab->tab_rec_file = xt_fs_get_file(self, path);
+
+	xt_remove_last_name_of_path(path); // Same directory, now the index file name
+	tab_get_index_file_name(file_name, xt_last_name_of_path(tab_path->ps_path), tab_id);
+	xt_strcat(PATH_MAX, path, file_name);
+	tab->tab_ind_file = xt_fs_get_file(self, path);
+
+	of_ind = xt_open_file(self, tab->tab_ind_file->fil_path, XT_FS_MISSING_OK); // The index file is always allowed to be missing
+	if (of_ind) {
+		pushr_(xt_close_file, of_ind);
+		tab_load_index_header(self, tab, of_ind, tab_path);
+		freer_(); // xt_close_file(of_ind)
+	}
+	else
+		tab_load_index_header(self, tab, of_ind, tab_path); // of_ind is NULL here: the header is loaded without a file
+
+	of_rec = xt_open_file(self, tab->tab_rec_file->fil_path, missing_ok ? XT_FS_MISSING_OK : XT_FS_DEFAULT);
+	if (!of_rec) {
+		freer_(); // xt_heap_release(tab)
+		return_(XT_TAB_NOT_FOUND);
+	}
+	pushr_(xt_close_file, of_rec);
+	tab_load_table_format(self, of_rec, tab_path, &tab_format_offset, &tab_head_size, &tab->tab_dic);
+	tab->tab_table_format_offset = tab_format_offset;
+	tab->tab_table_head_size = tab_head_size;
+	tab->tab_dic.dic_table->dt_table = tab; // Back-pointer from the loaded table definition to this handle
+	tab_load_table_header(self, tab, of_rec);
+	freer_(); // xt_close_file(of_rec)
+
+	tab->tab_seq.xt_op_seq_set(self, tab->tab_head_op_seq+1); // Continue after the sequence number stored in the header
+	tab->tab_row_eof_id = tab->tab_head_row_eof_id; // The current values start out equal to the header values
+	tab->tab_row_free_id = tab->tab_head_row_free_id;
+	tab->tab_row_fnum = tab->tab_head_row_fnum;
+	tab->tab_rec_eof_id = tab->tab_head_rec_eof_id;
+	tab->tab_rec_free_id = tab->tab_head_rec_free_id;
+	tab->tab_rec_fnum = tab->tab_head_rec_fnum;
+
+	tab->tab_rows.xt_tc_setup(tab, sizeof(XTTabRowHeadDRec), sizeof(XTTabRowRefDRec));
+	tab->tab_recs.xt_tc_setup(tab, tab_head_size, tab->tab_dic.dic_rec_size);
+
+	xt_xres_init_tab(self, tab);
+
+	if (!xt_init_row_locks(&tab->tab_locks))
+		xt_throw(self);
+
+	xt_heap_set_release_callback(self, tab, tab_onrelease);
+
+	tab->tab_repair_pending = xt_tab_is_table_repair_pending(tab);
+
+	popr_(); // Discard xt_heap_release(tab)
+
+	xt_ht_put(self, db->db_tables, tab); // NOTE(review): the hash table presumably takes over the reference discarded above - confirm
+
+	/* Add a reference to the ID list, when a table is
+	 * added to the table name list:
+	 */
+	if ((te_ptr = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &tab->tab_id)))
+		te_ptr->te_table = tab;
+
+    /* Moved from after xt_init_row_locks() above, so that calling
+     * xt_use_table_no_lock() with no_load == FALSE from attachReferences()
+     * will work if we have cyclic foreign key references.
+     */ 
+	if (tab->tab_dic.dic_table) {
+		try_(a) {
+			tab->tab_dic.dic_table->attachReferences(self, db);
+		}
+		catch_(a) {
+			/* Errors are thrown when: set foreign_key_checks = 1 */
+			/* Undo everything done above: */
+			xt_ht_del(self, db->db_tables, tab->tab_name);
+			xt_throw(self); // Re-throw the error from attachReferences()
+		}
+		cont_(a);
+	}
+
+	*r_tab = tab;
+	return_(XT_TAB_OK);
+}
+
+
+/*
+ * Return a referenced handle to the table 'name' in database 'db'. The
+ * table reference is valid, as long as the thread is using the database!!!
+ *
+ * The open table list (db->db_tables) is searched first. If the table is
+ * not there, and 'no_load' is not set, the table is opened with
+ * tab_new_handle(), passing on 'dic'.
+ *
+ * Returns NULL if no handle is available: the table is not open and
+ * 'no_load' is set, or the table does not exist and 'missing_ok' is set.
+ * Throws if 'db' is NULL, if the table does not exist and 'missing_ok' is
+ * not set, or if the table has no dictionary.
+ */
+xtPublic XTTableHPtr xt_use_table_no_lock(XTThreadPtr self, XTDatabaseHPtr db, XTPathStrPtr name, xtBool no_load, xtBool missing_ok, XTDictionaryPtr dic)
+{
+	XTTableHPtr	handle;
+
+	if (!db)
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_NO_DATABASE_IN_USE);
+
+	handle = (XTTableHPtr) xt_ht_get(self, db->db_tables, name);
+	if (!handle) {
+		xtTableID	found_id = 0;
+
+		/* Not open yet, and we may not load it: */
+		if (no_load)
+			return NULL;
+
+		if (!tab_find_table(self, db, name, &found_id)) {
+			if (missing_ok)
+				return NULL;
+			xt_throw_taberr(XT_CONTEXT, XT_ERR_TABLE_NOT_FOUND, name);
+		}
+
+		if (tab_new_handle(self, &handle, db, found_id, name, FALSE, dic) == XT_TAB_NO_DICTIONARY)
+			xt_throw_taberr(XT_CONTEXT, XT_ERR_NO_DICTIONARY, name);
+
+		/* The open may have left the handle unset: */
+		if (!handle)
+			return NULL;
+	}
+
+	/* The caller gets its own reference: */
+	xt_heap_reference(self, handle);
+	return handle;
+}
+
+/*
+ * Release everything attached to an open table handle, and then free the
+ * handle itself. Each member is only released if it is set, and is reset
+ * after it has been released.
+ */
+static void tab_close_table(XTOpenTablePtr ot)
+{
+	xt_ind_free_reserved(ot);
+
+	/* The open files: */
+	if (ot->ot_rec_file != NULL) {
+		XT_CLOSE_RR_FILE_NS(ot->ot_rec_file);
+		ot->ot_rec_file = NULL;
+	}
+	if (ot->ot_ind_file != NULL) {
+		xt_close_file_ns(ot->ot_ind_file);
+		ot->ot_ind_file = NULL;
+	}
+	if (ot->ot_row_file != NULL) {
+		XT_CLOSE_RR_FILE_NS(ot->ot_row_file);
+		ot->ot_row_file = NULL;
+	}
+
+	/* The reference to the table: */
+	if (ot->ot_table != NULL) {
+		xt_heap_release(xt_get_self(), ot->ot_table);
+		ot->ot_table = NULL;
+	}
+
+	/* The index handle: */
+	if (ot->ot_ind_rhandle != NULL) {
+		xt_ind_release_handle(ot->ot_ind_rhandle, FALSE, ot->ot_thread);
+		ot->ot_ind_rhandle = NULL;
+	}
+
+	/* The row read and write buffers: */
+	if (ot->ot_row_rbuffer != NULL) {
+		xt_free_ns(ot->ot_row_rbuffer);
+		ot->ot_row_rbuf_size = 0;
+		ot->ot_row_rbuffer = NULL;
+	}
+	if (ot->ot_row_wbuffer != NULL) {
+		xt_free_ns(ot->ot_row_wbuffer);
+		ot->ot_row_wbuf_size = 0;
+		ot->ot_row_wbuffer = NULL;
+	}
+
+#ifdef XT_TRACK_RETURNED_ROWS
+	if (ot->ot_rows_returned != NULL) {
+		xt_free_ns(ot->ot_rows_returned);
+		ot->ot_rows_returned = NULL;
+	}
+	ot->ot_rows_ret_curr = 0;
+	ot->ot_rows_ret_max = 0;
+#endif
+
+	/* Finally, the handle itself: */
+	xt_free(NULL, ot);
+}
+
+/*
+ * Lock a particular table, by locking the table directory and then
+ * waiting for all open table handles to close.
+ *
+ * Things are a bit complicated because the sweeper must be turned off
+ * before the table directory is locked.
+ *
+ * Returns the locked table pool, or NULL if no pool was returned by
+ * xt_db_lock_table_pool_by_name(). The 'tab' pointer is passed on to that
+ * function.
+ */
+static XTOpenTablePoolPtr tab_lock_table(XTThreadPtr self, XTPathStrPtr name, xtBool no_load, xtBool flush_table, xtBool missing_ok, XTTableHPtr *tab)
+{
+	XTDatabaseHPtr		cur_db = self->st_database;
+	XTOpenTablePoolPtr	pool;
+
+	enter_();
+	/* Take the lock, which also closes all references: */
+	pushsr_(pool, xt_db_unlock_table_pool, xt_db_lock_table_pool_by_name(self, cur_db, name, no_load, flush_table, missing_ok, FALSE, tab));
+	if (pool) {
+		/* Block until all open tables of the pool are closed: */
+		xt_db_wait_for_open_tables(self, pool);
+
+		popr_(); // Discard xt_db_unlock_table_pool(pool)
+		return_(pool);
+	}
+
+	freer_(); // xt_db_unlock_table_pool(pool)
+	return_(NULL);
+}
+
+/*
+ * Delete all files enumerated for the table with the given name and ID.
+ * A file that cannot be deleted does not stop the loop: the error is
+ * logged and cleared, and the next file is tried.
+ */
+static void tab_delete_table_files(XTThreadPtr self, XTPathStrPtr tab_name, xtTableID tab_id)
+{
+	XTFilesOfTableRec	files;
+
+	for (xt_enum_files_of_tables_init(tab_name, tab_id, &files); xt_enum_files_of_tables_next(&files); ) {
+		if (xt_fs_delete(NULL, files.ft_file_path))
+			continue;
+		xt_log_and_clear_exception(self);
+	}
+}
+
+/*
+ * Create the files of a new table 'name' in the current database of
+ * 'self', using the values in 'dic'.
+ *
+ * The table is locked by name first. If a table with this name already
+ * exists, its extended data and files are deleted, and it is removed from
+ * the open table list; the new table then replaces it under a new table
+ * ID (according to the comments below, this is the TRUNCATE case).
+ *
+ * The new table gets the ID db->db_curr_tab_id + 1 and an entry in
+ * db->db_table_by_id. Then the row file, the data file (table header,
+ * table format and optional definition string) and the index file (index
+ * header and index format) are created and written, and the new table ID
+ * is logged. Finally the table is opened once, so that a table which
+ * cannot be opened makes the create fail as well (see {LOAD-FOR-FKS}).
+ *
+ * Throws if the name is too long, if no database is in use, if the
+ * maximum number of tables has been reached, or if any step of the
+ * creation fails. In the last case the files of the new table and its
+ * table path are removed again before the error is re-thrown.
+ */
+xtPublic void xt_create_table(XTThreadPtr self, XTPathStrPtr name, XTDictionaryPtr dic)
+{
+	char				table_name[XT_MAX_TABLE_FILE_NAME_SIZE];
+	char				path[PATH_MAX];
+	XTDatabaseHPtr		db = self->st_database;
+	XTOpenTablePoolPtr	table_pool;
+	XTTableHPtr			tab;
+	XTTableHPtr			old_tab = NULL;
+	xtTableID			old_tab_id = 0;
+	xtTableID			tab_id = 0;
+	XTTabRowHeadDRec	row_head;
+	XTTableHeadDRec		rec_head;
+	XTTableFormatDRec	table_fmt;
+	XTIndexFormatDPtr	index_fmt;
+	XTStringBufferRec	tab_def = { 0, 0, 0 };
+	XTTableEntryRec		te_tab;
+	XTSortedListInfoRec	li_undo;
+
+#ifdef TRACE_CREATE_TABLES
+	printf("CREATE %s\n", name->ps_path);
+#endif
+	enter_();
+	if (strlen(xt_last_name_of_path(name->ps_path)) > XT_TABLE_NAME_SIZE-1)
+		xt_throw_taberr(XT_CONTEXT, XT_ERR_NAME_TOO_LONG, name);
+	if (!db)
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_NO_DATABASE_IN_USE);
+
+	/* Lock to prevent table list change during creation. */
+	table_pool = tab_lock_table(self, name, FALSE, TRUE, TRUE, &old_tab);
+	pushr_(xt_db_unlock_table_pool, table_pool);
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+	pushr_(xt_heap_release, old_tab);
+
+	/* This must be done before we remove the old table
+	 * from the directory, or we will not be able
+	 * to find the table, which may be required
+	 * for TRUNCATE!
+	 */
+	if (xt_sl_get_size(db->db_table_by_id) >= XT_MAX_TABLES)
+		xt_throw_ulxterr(XT_CONTEXT, XT_ERR_TOO_MANY_TABLES, (u_long) XT_MAX_TABLES);
+
+	tab_id = db->db_curr_tab_id + 1;
+
+	if (old_tab) {
+		old_tab_id = old_tab->tab_id;
+		xt_dl_delete_ext_data(self, old_tab, FALSE, TRUE);
+		freer_(); // xt_heap_release(self, old_tab)
+
+		/* For the Windows version this must be done before we
+		 * start to delete the underlying files!
+		 */
+		tab_close_mapped_files(self, old_tab);
+
+		tab_delete_table_files(self, name, old_tab_id);
+
+		/* Remove the PBMS table: */
+		ASSERT(xt_get_self() == self);
+
+		/* Remove the table from the directory. It will get a new
+		 * ID so the handle in the directory will no longer be valid.
+		 */
+		xt_ht_del(self, db->db_tables, name);
+	}
+	else {
+		freer_(); // xt_heap_release(self, old_tab)
+	}
+
+	/* Add the table to the directory, we'll remove it on error! */
+	li_undo.li_sl = db->db_table_by_id;
+	li_undo.li_key = &tab_id;
+	te_tab.te_tab_id = tab_id;
+	te_tab.te_tab_name = xt_dup_string(self, xt_last_name_of_path(name->ps_path));
+	te_tab.te_tab_path = tab_get_table_path(self, db, name, TRUE);
+	te_tab.te_table = NULL;
+	xt_sl_insert(self, db->db_table_by_id, &tab_id, &te_tab);
+	pushr_(xt_sl_delete_from_info, &li_undo);
+
+	/* An empty path means that no file has been created yet: */
+	*path = 0;
+	try_(a) {
+		XTOpenFilePtr	of_row, of_rec, of_ind;
+		off_t			eof;
+		size_t			def_len = 0;
+
+		tab = (XTTableHPtr) xt_heap_new(self, sizeof(XTTableHRec), tab_finalize);
+		pushr_(xt_heap_release, tab);
+
+		/* The length of the foreign key definition: */
+		if (dic->dic_table) {
+			dic->dic_table->loadString(self, &tab_def);
+			def_len = tab_def.sb_len + 1;
+		}
+
+		tab->tab_head_op_seq = 0;
+#ifdef DEBUG
+		//tab->tab_head_op_seq = 0xFFFFFFFF - 12;
+#endif
+
+		/* ------- ROW FILE: */
+		xt_strcpy(PATH_MAX, path, name->ps_path);
+		xt_remove_last_name_of_path(path);
+		tab_get_row_file_name(table_name, xt_last_name_of_path(name->ps_path), tab_id);
+		xt_strcat(PATH_MAX, path, table_name);
+
+		of_row = xt_open_file(self, path, XT_FS_CREATE | XT_FS_EXCLUSIVE);
+		pushr_(xt_close_file, of_row);
+		/* The whole structure is written to disk, but only the magic
+		 * number is set below, so clear it first. Otherwise whatever
+		 * happens to be on the stack ends up in the file.
+		 */
+		memset(&row_head, 0, sizeof(row_head));
+		XT_SET_DISK_4(row_head.rh_magic_4, XT_TAB_ROW_MAGIC);
+		if (!xt_pwrite_file(of_row, 0, sizeof(row_head), &row_head, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+		freer_(); // xt_close_file(of_row)
+
+		(void) ASSERT(sizeof(XTTabRowHeadDRec) == sizeof(XTTabRowRefDRec));
+		(void) ASSERT(sizeof(XTTabRowRefDRec) == 1 << XT_TAB_ROW_SHIFTS);
+
+		tab->tab_row_eof_id = 1;
+		tab->tab_row_free_id = 0;
+		tab->tab_row_fnum = 0;
+
+		tab->tab_head_row_eof_id = 1;
+		tab->tab_head_row_free_id = 0;
+		tab->tab_head_row_fnum  = 0;
+
+		/* ------------ DATA FILE: */
+		xt_remove_last_name_of_path(path);
+		tab_get_data_file_name(table_name, xt_last_name_of_path(name->ps_path), tab_id);
+		xt_strcat(PATH_MAX, path, table_name);
+		of_rec = xt_open_file(self, path, XT_FS_CREATE | XT_FS_EXCLUSIVE);
+		pushr_(xt_close_file, of_rec);
+
+		/* Calculate the offset of the first record in the data handle file. */
+		eof = sizeof(XTTableHeadDRec) + offsetof(XTTableFormatDRec, tf_definition) + def_len + XT_FORMAT_DEF_SPACE;
+		eof = (eof + 1024 - 1) / 1024 * 1024;		// Round to a value divisible by 1024
+
+		tab->tab_table_format_offset = sizeof(XTTableHeadDRec);
+		tab->tab_table_head_size = (size_t) eof;
+
+		tab->tab_rec_eof_id = 1;						// This is the first record ID!
+		tab->tab_rec_free_id = 0;
+		tab->tab_rec_fnum = 0;
+		
+		tab->tab_head_rec_eof_id = 1;					// The first record ID
+		tab->tab_head_rec_free_id = 0;
+		tab->tab_head_rec_fnum = 0;
+
+		tab->tab_dic.dic_rec_size = dic->dic_rec_size;
+		tab->tab_dic.dic_rec_fixed = dic->dic_rec_fixed;
+		tab->tab_dic.dic_tab_flags = dic->dic_tab_flags;
+		tab->tab_dic.dic_min_auto_inc = dic->dic_min_auto_inc;
+		tab->tab_dic.dic_def_ave_row_size = dic->dic_def_ave_row_size;
+
+		/* As for the row file header: never write uninitialised bytes. */
+		memset(&rec_head, 0, sizeof(rec_head));
+		XT_SET_DISK_4(rec_head.th_head_size_4, sizeof(XTTableHeadDRec));
+		XT_SET_DISK_4(rec_head.th_op_seq_4, tab->tab_head_op_seq);
+		XT_SET_DISK_6(rec_head.th_row_free_6, tab->tab_head_row_free_id);
+		XT_SET_DISK_6(rec_head.th_row_eof_6, tab->tab_head_row_eof_id);
+		XT_SET_DISK_6(rec_head.th_row_fnum_6, tab->tab_head_row_fnum);
+		XT_SET_DISK_6(rec_head.th_rec_free_6, tab->tab_head_rec_free_id);
+		XT_SET_DISK_6(rec_head.th_rec_eof_6, tab->tab_head_rec_eof_id);
+		XT_SET_DISK_6(rec_head.th_rec_fnum_6, tab->tab_head_rec_fnum);
+
+		if (!xt_pwrite_file(of_rec, 0, sizeof(XTTableHeadDRec), &rec_head, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+
+		/* Store the table format: */
+		memset(&table_fmt, 0, offsetof(XTTableFormatDRec, tf_definition));
+		XT_SET_DISK_4(table_fmt.tf_format_size_4, offsetof(XTTableFormatDRec, tf_definition) + def_len);
+		XT_SET_DISK_4(table_fmt.tf_tab_head_size_4, eof);
+		XT_SET_DISK_2(table_fmt.tf_tab_version_2, XT_TAB_CURRENT_VERSION);
+		XT_SET_DISK_4(table_fmt.tf_rec_size_4, tab->tab_dic.dic_rec_size);
+		XT_SET_DISK_1(table_fmt.tf_rec_fixed_1, tab->tab_dic.dic_rec_fixed);
+		XT_SET_DISK_2(table_fmt.tf_tab_flags_2, tab->tab_dic.dic_tab_flags);
+		XT_SET_DISK_8(table_fmt.tf_min_auto_inc_8, tab->tab_dic.dic_min_auto_inc);
+
+		if (!xt_pwrite_file(of_rec, sizeof(XTTableHeadDRec), offsetof(XTTableFormatDRec, tf_definition), &table_fmt, &self->st_statistics.st_rec, self))
+			xt_throw(self);
+		if (def_len) {
+			if (!xt_pwrite_file(of_rec, sizeof(XTTableHeadDRec) + offsetof(XTTableFormatDRec, tf_definition), def_len, tab_def.sb_cstring, &self->st_statistics.st_rec, self))
+				xt_throw(self);
+		}
+
+		freer_(); // xt_close_file(of_rec)
+
+		/* ----------- INDEX FILE: */
+		xt_remove_last_name_of_path(path);
+		tab_get_index_file_name(table_name, xt_last_name_of_path(name->ps_path), tab_id);
+		xt_strcat(PATH_MAX, path, table_name);
+		of_ind = xt_open_file(self, path, XT_FS_CREATE | XT_FS_EXCLUSIVE);
+		pushr_(xt_close_file, of_ind);
+
+		/* This is the size of the index header: */
+		tab->tab_index_format_offset = offsetof(XTIndexHeadDRec, tp_data) + dic->dic_key_count * XT_NODE_REF_SIZE;
+		if (!(tab->tab_index_head = (XTIndexHeadDPtr) xt_calloc_ns(XT_INDEX_HEAD_SIZE)))
+			xt_throw(self);
+
+		XT_NODE_ID(tab->tab_ind_eof) = 1;
+		XT_NODE_ID(tab->tab_ind_free) = 0;
+
+		XT_SET_DISK_4(tab->tab_index_head->tp_header_size_4, XT_INDEX_HEAD_SIZE);
+		XT_SET_DISK_4(tab->tab_index_head->tp_format_offset_4, tab->tab_index_format_offset);
+		XT_SET_DISK_6(tab->tab_index_head->tp_ind_eof_6, XT_NODE_ID(tab->tab_ind_eof));
+		XT_SET_DISK_6(tab->tab_index_head->tp_ind_free_6, XT_NODE_ID(tab->tab_ind_free));
+
+		/* Store the index format: */
+		index_fmt = (XTIndexFormatDPtr) (((xtWord1 *) tab->tab_index_head) + tab->tab_index_format_offset);
+		XT_SET_DISK_4(index_fmt->if_format_size_4, sizeof(XTIndexFormatDRec));
+		XT_SET_DISK_2(index_fmt->if_tab_version_2, XT_TAB_CURRENT_VERSION);
+		XT_SET_DISK_2(index_fmt->if_ind_version_2, XT_IND_CURRENT_VERSION);
+		XT_SET_DISK_1(index_fmt->if_node_ref_size_1, XT_NODE_REF_SIZE);
+		XT_SET_DISK_1(index_fmt->if_rec_ref_size_1, XT_RECORD_REF_SIZE);
+		XT_SET_DISK_4(index_fmt->if_page_size_4, XT_INDEX_PAGE_SIZE);
+
+		/* Save the header: */
+		if (!xt_pwrite_file(of_ind, 0, XT_INDEX_HEAD_SIZE, tab->tab_index_head, &self->st_statistics.st_ind, self))
+			xt_throw(self);
+
+		freer_(); // xt_close_file(of_ind)
+
+		/* ------------ */
+		/* Log the new table ID! */
+		db->db_curr_tab_id = tab_id;
+		if (!xt_xn_log_tab_id(self, tab_id)) {
+			db->db_curr_tab_id = tab_id - 1;
+			xt_throw(self);
+		}
+
+		freer_(); // xt_heap_release(tab)
+
+		/* {LOAD-FOR-FKS}
+		 * 2008-12-10: Note, there is another problem, example:
+		 * set storage_engine = pbxt;
+		 * 
+		 * CREATE TABLE t1 (s1 INT PRIMARY KEY, s2 INT);
+		 * CREATE TABLE t2 (s1 INT PRIMARY KEY, FOREIGN KEY (s1) REFERENCES t1 (s1) ON UPDATE CASCADE);
+		 * CREATE TABLE t3 (s1 INT PRIMARY KEY, FOREIGN KEY (s1) REFERENCES t2 (s1) ON UPDATE CASCADE);
+		 * 
+		 * DROP TABLE IF EXISTS t2,t1;
+		 * CREATE TABLE t1 (s1 ENUM('a','b') PRIMARY KEY);
+		 * CREATE TABLE t2 (s1 ENUM('A','B'), FOREIGN KEY (s1) REFERENCES t1 (s1));
+		 * 
+		 * DROP TABLE IF EXISTS t2,t1;
+		 * 
+		 * In the example above. The second create t2 does not fail, although t3 references it,
+		 * and the data types do not match.
+		 * 
+		 * The main problem is that this error comes on DROP TABLE IF EXISTS t2! Which prevents
+		 * the table from being dropped - not good.
+		 *
+		 * So my idea here is to open the table, and if it fails, then the create table fails
+		 * as well.
+		 */
+		if (!old_tab_id) {
+			tab = xt_use_table_no_lock(self, db, name, FALSE, FALSE, NULL);
+			xt_heap_release(self, tab);
+		}
+	}
+	catch_(a) {
+		/* Creation failed, delete the table files: */
+		if (*path)
+			tab_delete_table_files(self, name, tab_id);
+		tab_remove_table_path(self, db, te_tab.te_tab_path);
+		xt_sb_set_size(self, &tab_def, 0);
+		throw_();
+	}
+	cont_(a);
+
+	xt_sb_set_size(self, &tab_def, 0);
+
+	if (old_tab_id) {
+		try_(b) {
+			XTTableEntryPtr	te_ptr;
+
+			if ((te_ptr = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &old_tab_id))) {
+				tab_remove_table_path(self, db, te_ptr->te_tab_path);
+				xt_sl_delete(self, db->db_table_by_id, &old_tab_id);
+			}
+
+			/* Same purpose as above {LOAD-FOR-FKS} (although this should work,
+			 * because this is a TRUNCATE TABLE).
+			 */
+			tab = xt_use_table_no_lock(self, db, name, FALSE, FALSE, NULL);
+			xt_heap_release(self, tab);
+		}
+		catch_(b) {
+			/* Log this error, but do not return it, because
+			 * it just involves the cleanup of the old table,
+			 * the new table has been successfully created.
+			 */
+			xt_log_and_clear_exception(self);
+		}
+		cont_(b);
+	}
+
+	popr_(); // Discard xt_sl_delete_from_info(&li_undo)
+
+	freer_(); // xt_ht_unlock(db->db_tables)
+	freer_(); // xt_db_unlock_table_pool(table_pool)
+
+	/* I open the table here, because I cannot rely on MySQL to do
+	 * it after a create. This is normally OK, but with foreign keys
+	 * tables can be referenced and then they are not opened
+	 * before use. In this example, the INSERT opens t2, but t1 is
+	 * not opened by the create. As a result the foreign key
+	 * reference is not resolved.
+	 *
+	 * drop table t1, t2;
+	 * CREATE TABLE t1
+	 * (
+	 *  id INT PRIMARY KEY
+	 * ) ENGINE=pbxt;
+	 * 
+	 * CREATE TABLE t2
+	 * (
+	 *  v INT,
+	 *  CONSTRAINT c1 FOREIGN KEY (v) REFERENCES t1(id)
+	 * ) ENGINE=pbxt;
+	 * 
+	 * --error 1452
+	 * INSERT INTO t2 VALUES(2);
+	 */
+	/* this code is not needed anymore as we open tables referred by FKs as necessary during checks
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+	tab = xt_use_table_no_lock(self, db, name, FALSE, FALSE, NULL);
+	freer_(); // xt_ht_unlock(db->db_tables)
+	xt_heap_release(self, tab);
+	* CHANGED see {LOAD-FOR-FKS} above.
+	*/
+
+	exit_();
+}
+
+/*
+ * Drop the table 'tab_name' from the current database of 'self'.
+ *
+ * The table is locked by name first. If it was found, and foreign keys
+ * are not ignored by this thread, checkCanDrop(drop_db) decides whether
+ * the table may be dropped; if not, XT_ERR_ROW_IS_REFERENCED is thrown.
+ * Otherwise the extended data and the files of the table are deleted, its
+ * entry in the ID list is removed, and the name is removed from the open
+ * table list.
+ */
+xtPublic void xt_drop_table(XTThreadPtr self, XTPathStrPtr tab_name, xtBool drop_db)
+{
+	XTDatabaseHPtr		db = self->st_database;
+	XTOpenTablePoolPtr	pool;
+	XTTableHPtr			victim = NULL;
+	xtTableID			victim_id = 0;
+	xtBool				can_drop = TRUE;
+
+	enter_();
+
+#ifdef TRACE_CREATE_TABLES
+	printf("DROP %s\n", tab_name->ps_path);
+#endif
+
+	pool = tab_lock_table(self, tab_name, FALSE, TRUE, TRUE, &victim);
+	pushr_(xt_db_unlock_table_pool, pool);
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+	pushr_(xt_heap_release, victim);
+
+	if (pool) {
+		/* A pool is only returned together with the table: */
+		victim_id = victim->tab_id;
+		/* Do other tables refer to this one? */
+		if (!self->st_ignore_fkeys) 
+			can_drop = victim->tab_dic.dic_table->checkCanDrop(drop_db);
+	}
+#ifdef DRIZZLED 
+	/* See the comment in ha_pbxt::delete_table regarding the different
+	 * implementation of DROP TABLE in MySQL and Drizzle
+	 */
+	else {
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_TABLE_NOT_FOUND);
+	}
+#endif
+
+	if (!can_drop) {
+		/* FK dependencies prevent the drop: */
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_ROW_IS_REFERENCED);
+	}
+	else {
+		if (!victim_id) {
+			freer_(); // xt_heap_release(self, victim)
+		}
+		else {
+			XTTableEntryPtr	entry;
+
+			xt_dl_delete_ext_data(self, victim, FALSE, TRUE);
+			freer_(); // xt_heap_release(self, victim)
+
+			/* On Windows the mapped files have to be closed
+			 * before the underlying files can be deleted!
+			 */
+			tab_close_mapped_files(self, victim);
+
+			tab_delete_table_files(self, tab_name, victim_id);
+
+			ASSERT(xt_get_self() == self);
+			entry = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &victim_id);
+			if (entry) {
+				tab_remove_table_path(self, db, entry->te_tab_path);
+				xt_sl_delete(self, db->db_table_by_id, &victim_id);
+			}
+		}
+
+		xt_ht_del(self, db->db_tables, tab_name);
+	}
+
+	freer_(); // xt_ht_unlock(db->db_tables)
+	freer_(); // xt_db_unlock_table_pool(pool)
+	exit_();
+}
+
+/*
+ * Check the free lists of the table opened by 'ot'.
+ *
+ * The row free list is always walked, the record free list only if
+ * 'check_recs' is set. A list reference at or beyond the EOF ID is logged,
+ * and the table is marked as requiring repair. The number of items found
+ * on each list is compared with the free count of the table. If the two
+ * differ this is logged, and if 'correct_count' is set both the current
+ * and the header free count are set to the number found, and a flush is
+ * marked as pending.
+ *
+ * If an item cannot be read, the error is thrown when 'self' is not NULL.
+ * Otherwise it is logged as a warning, and the walk of that list stops.
+ */
+xtPublic void xt_tab_check_free_lists(XTThreadPtr self, XTOpenTablePtr ot, bool check_recs, bool correct_count)
+{
+	char					table_name[XT_IDENTIFIER_NAME_SIZE*3+3];
+	register XTTableHPtr	tab = ot->ot_table;
+	xtRowID					prev_row_id;
+	xtRowID					row_id;
+	xtRefID					next_row_id;
+	u_llong					free_count;
+	u_llong					old_count;
+
+	xt_tab_make_table_name(tab, table_name, sizeof(table_name));
+	if (check_recs) {
+		xtRecordID		prev_rec_id;
+		xtRecordID		rec_id;
+		XTTabRecExtDRec	rec_buf;
+
+		xt_lock_mutex_ns(&tab->tab_rec_lock);
+		/* Checking the free list: */
+		prev_rec_id = 0;
+		free_count = 0;
+		rec_id = tab->tab_rec_free_id;
+		while (rec_id) {
+			if (rec_id >= tab->tab_rec_eof_id) {
+				xt_logf(XT_NT_ERROR, "Table %s: invalid reference on free list: %llu, ", table_name, (u_llong) rec_id);
+				if (prev_rec_id)
+					xt_logf(XT_NT_ERROR, "reference by: %llu\n", (u_llong) prev_rec_id);
+				else
+					xt_logf(XT_NT_ERROR, "reference by list head pointer\n");
+				xt_tab_set_table_repair_pending(tab);
+				break;
+			}
+			if (!xt_tab_get_rec_data(ot, rec_id, XT_REC_FIX_HEADER_SIZE, (xtWord1 *) &rec_buf)) {
+				if (self) {
+					/* The lock was taken with the _ns call, so nothing
+					 * will release it when the exception is thrown.
+					 */
+					xt_unlock_mutex_ns(&tab->tab_rec_lock);
+					xt_throw(self);
+				}
+				else
+					xt_log_and_clear_warning(ot->ot_thread);
+				break;
+			}
+			if ((rec_buf.tr_rec_type_1 & XT_TAB_STATUS_MASK) != XT_TAB_STATUS_FREED)
+				xt_logf(XT_NT_INFO, "Table %s: record, %llu, on free list is not free\n", table_name, (u_llong) rec_id);
+			free_count++;
+			prev_rec_id = rec_id;
+			rec_id = XT_GET_DISK_4(rec_buf.tr_prev_rec_id_4);
+		}
+		if (free_count != tab->tab_rec_fnum) {
+			if (correct_count) {
+				/* Remember the old value, so that the message shows
+				 * what the count has been changed from.
+				 */
+				old_count = (u_llong) tab->tab_rec_fnum;
+				tab->tab_rec_fnum = free_count;
+				tab->tab_head_rec_fnum = free_count;
+				tab->tab_flush_pending = TRUE;
+				xt_logf(XT_NT_INFO, "Table %s: free record count (%llu) has been set to the number of records on the list: %llu\n", table_name, old_count, (u_llong) free_count);
+			}
+			else
+				xt_logf(XT_NT_INFO, "Table %s: free record count (%llu) differs from the number of records on the list: %llu\n", table_name, (u_llong) tab->tab_rec_fnum, (u_llong) free_count);
+		}
+		xt_unlock_mutex_ns(&tab->tab_rec_lock);
+	}
+
+	/* Check the row free list: */
+	xt_lock_mutex_ns(&tab->tab_row_lock);
+
+	prev_row_id = 0;
+	free_count = 0;
+	row_id = tab->tab_row_free_id;
+	while (row_id) {
+		if (row_id >= tab->tab_row_eof_id) {
+			xt_logf(XT_NT_ERROR, "Table %s: invalid reference on free row: %llu, ", table_name, (u_llong) row_id);
+			if (prev_row_id)
+				xt_logf(XT_NT_ERROR, "reference by: %llu\n", (u_llong) prev_row_id);
+			else
+				xt_logf(XT_NT_ERROR, "reference by list head pointer\n");
+			xt_tab_set_table_repair_pending(tab);
+			break;
+		}
+		if (!tab->tab_rows.xt_tc_read_4(ot->ot_row_file, row_id, &next_row_id, ot->ot_thread)) {
+			if (self) {
+				/* As above: release the lock before throwing. */
+				xt_unlock_mutex_ns(&tab->tab_row_lock);
+				xt_throw(self);
+			}
+			else
+				xt_log_and_clear_warning(ot->ot_thread);
+			break;
+		}
+		free_count++;
+		prev_row_id = row_id;
+		row_id = next_row_id;
+	}
+	if (free_count != tab->tab_row_fnum) {
+		if (correct_count) {
+			/* tab_row_fnum is the current value, and tab_head_row_fnum is the value on
+			 * disk. tab_head_row_fnum is set by the writer as the changes are applied
+			 * to the database.
+			 *
+			 * This is the value then stored in the header of the file. This value
+			 * is in sync with other changes to the file.
+			 *
+			 * So the fact that I am setting both values means this will not work at
+			 * runtime, unless all changes have been applied by the writer.
+			 *
+			 * The correct way to do this at run time would be to add the change to the
+			 * transaction log, so that it is applied by the writer.
+			 */
+			old_count = (u_llong) tab->tab_row_fnum;
+			tab->tab_row_fnum = free_count;
+			tab->tab_head_row_fnum = free_count;
+			tab->tab_flush_pending = TRUE;
+			xt_logf(XT_NT_INFO, "Table %s: free row count (%llu) has been set to the number of rows on the list: %llu\n", table_name, old_count, (u_llong) free_count);
+		}
+		else
+			xt_logf(XT_NT_INFO, "Table %s: free row count (%llu) differs from the number of rows on the list: %llu\n", table_name, (u_llong) tab->tab_row_fnum, (u_llong) free_count);
+	}
+
+	xt_unlock_mutex_ns(&tab->tab_row_lock);
+}
+
+/*
+ * Record buffer size:
+ * -------------------
+ * The size of the record buffer used to hold the row
+ * in memory. This buffer size does not include the BLOB data.
+ * About 8 bytes (a pointer and a size) is reserved for each BLOB
+ * in this buffer.
+ *
+ * The buffer size includes a number of "NULL" bytes followed by
+ * the data area. The NULL bytes contain 1 bit for every column,
+ * to indicate whether the column is NULL or not.
+ *
+ * The size of the buffer is 4/8-byte aligned, so it may be padded
+ * at the end.
+ *
+ * Fixed length rec. len.:
+ * -----------------------
+ * If the record does not include any BLOBs then this is the size of the
+ * fixed length record. The size of the data in the data handle record
+ * need never be bigger than this length, if the record does not
+ * contain BLOBs. So this should be the maximum size set for
+ * AVG_ROW_LENGTH in this case.
+ *
+ * Handle data record size:
+ * ------------------------
+ * This is the size of the handle data record. It is the data size
+ * plus the "max header size".
+ *
+ * Min/max header size:
+ * The min and max header size of the header in the data handle file.
+ * The larger header is used if a record has an extended data (data log
+ * file) component.
+ *
+ * Min/avg/max record size:
+ * ------------------------
+ * These are variable length records sizes. That is, the size of records
+ * when stored in the variable length format. Variable length records
+ * do not have fixed fields sizes, instead the fields are packed one
+ * after the other, prefixed by a number of size indicator bytes.
+ *
+ * The average is an estimate of the average record size. This estimate
+ * is used if no AVG_ROW_LENGTH is specifically given.
+ *
+ * If the average estimate is within 20% of the maximum size of the record,
+ * then the record will be handled as a fixed length record.
+ *
+ * Avg row len set for tab:
+ * ------------------------
+ * This is the value set using AVG_ROW_LENGTH when the table is declared.
+ *
+ * Rows fixed length:
+ * ------------------
+ * YES if the records of this table are handled as fixed length records.
+ * In this case the table records will never have an extended record
+ * component.
+ *
+ * The size of the data area in the handle data record is set to the
+ * size of the MySQL data record ("Fixed length rec. len.").
+ *
+ * It also means that the record format used is identical to the MySQL
+ * record format.
+ *
+ * If the records are not fixed, then the variable length record format
+ * is used. Record sizes are then in the range specified by
+ * "Min/avg/max record size".
+ *
+ * Maximum fixed size:
+ * -------------------
+ * This is the maximum size of a data log record.
+ *
+ * Minimum variable size:
+ * ------------------------
+ * Records below this size are handled as a fixed length record size, unless
+ * the AVG_ROW_LENGTH is specifically set.
+ */
+/*
+ * Check the record and row files of an open table and report any
+ * inconsistencies via xt_logf(). Nothing is repaired here.
+ *
+ * Steps (in order):
+ *  1. Scan every record from ID 1 up to tab_rec_eof_id, counting free,
+ *     deleted and allocated records and accumulating size statistics.
+ *  2. Compare the number of free records found with tab_rec_fnum.
+ *  3. Walk the record free list, verifying that each reference lies
+ *     before EOF and that each listed record is marked as freed.
+ *  4. Scan every row from ID 1 up to tab_row_eof_id (this only produces
+ *     output if DUMP_CHECK_TABLE is defined).
+ *  5. Walk the row free list and compare its length with tab_row_fnum.
+ *  6. Optionally check the indices (CHECK_INDEX_ON_CHECK_TABLE).
+ *
+ * Locking: db_co_ext_lock is held for the whole function, tab_rec_lock
+ * during steps 1-3 and tab_row_lock during steps 4-5.
+ *
+ * self - the calling thread, used for locking and exception handling.
+ * ot   - the open table; ot->ot_row_rbuffer is used as the read buffer.
+ *
+ * Throws (xt_throw) if a record or row cannot be read during the full
+ * scans (steps 1 and 4). Read failures while walking the free lists are
+ * logged and cleared instead.
+ */
+xtPublic void xt_check_table(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	XTTableHPtr				tab = ot->ot_table;
+	xtRecordID				prec_id;
+	XTTabRecExtDPtr			rec_buf = (XTTabRecExtDPtr) ot->ot_row_rbuffer;
+#ifdef CHECK_TABLE_READ_DATA_LOG
+	XTactExtRecEntryDRec	ext_rec;
+	size_t					log_size;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+#endif
+	xtRecordID				rec_id;
+	xtRecordID				prev_rec_id;
+	xtXactID				xn_id;
+	xtRowID					row_id;
+	u_llong					free_rec_count = 0, free_count2 = 0;
+	u_llong					delete_rec_count = 0;
+	u_llong					alloc_rec_count = 0;
+	u_llong					alloc_rec_bytes = 0;
+	u_llong					min_comp_rec_len = 0;
+	u_llong					max_comp_rec_len = 0;
+	size_t					row_size;
+	u_llong					ext_data_len = 0;
+	u_llong					ext_rec_count = 0;
+
+#if defined(DUMP_CHECK_TABLE) || defined(CHECK_TABLE_STATS)
+	printf("\nCHECK TABLE: %s\n", tab->tab_name->ps_path);
+#endif
+
+	xt_lock_mutex(self, &tab->tab_db->db_co_ext_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_db->db_co_ext_lock);
+
+	xt_lock_mutex(self, &tab->tab_rec_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_rec_lock);
+
+#ifdef CHECK_TABLE_STATS
+	printf("Record buffer size      = %lu\n", (u_long) tab->tab_dic.dic_mysql_buf_size);
+	printf("Fixed length rec. len.  = %lu\n", (u_long) tab->tab_dic.dic_mysql_rec_size);
+	printf("Handle data record size = %lu\n", (u_long) tab->tab_dic.dic_rec_size);
+	printf("Min/max header size     = %d/%d\n", (int) offsetof(XTTabRecFix, rf_data), tab->tab_dic.dic_rec_fixed ? (int) offsetof(XTTabRecFix, rf_data) : (int) offsetof(XTTabRecExtDRec, re_data));
+	printf("Min/avg/max record size = %llu/%llu/%llu\n", (u_llong) tab->tab_dic.dic_min_row_size, (u_llong) tab->tab_dic.dic_ave_row_size, (u_llong) tab->tab_dic.dic_max_row_size);
+	if (tab->tab_dic.dic_def_ave_row_size)
+		printf("Avg row len set for tab = %lu\n", (u_long) tab->tab_dic.dic_def_ave_row_size);
+	else
+		printf("Avg row len set for tab = not specified\n");
+	printf("Rows fixed length       = %s\n", tab->tab_dic.dic_rec_fixed ? "YES" : "NO");
+	if (tab->tab_dic.dic_tab_flags & XT_TAB_FLAGS_TEMP_TAB)
+		printf("Table type              = TEMP\n");
+	if (tab->tab_dic.dic_def_ave_row_size)
+		printf("Maximum fixed size      = %lu\n", (u_long) XT_TAB_MAX_FIX_REC_LENGTH_SPEC);
+	else
+		printf("Maximum fixed size      = %lu\n", (u_long) XT_TAB_MAX_FIX_REC_LENGTH);
+	printf("Minimum variable size   = %lu\n", (u_long) XT_TAB_MIN_VAR_REC_LENGTH);
+	printf("Minimum auto-increment  = %llu\n", (u_llong) tab->tab_dic.dic_min_auto_inc);
+	printf("Number of columns       = %lu\n", (u_long) tab->tab_dic.dic_no_of_cols);
+	printf("Number of fixed columns = %lu\n", (u_long) tab->tab_dic.dic_fix_col_count);
+	printf("Columns req. for index  = %lu\n", (u_long) tab->tab_dic.dic_ind_cols_req);
+	if (tab->tab_dic.dic_ind_rec_len)
+		printf("Rec len req. for index  = %llu\n", (u_llong) tab->tab_dic.dic_ind_rec_len);
+	printf("Columns req. for blobs  = %lu\n", (u_long) tab->tab_dic.dic_blob_cols_req);
+	printf("Number of blob columns  = %lu\n", (u_long) tab->tab_dic.dic_blob_count);
+	printf("Number of indices       = %lu\n", (u_long) tab->tab_dic.dic_key_count);
+#endif
+
+#ifdef DUMP_CHECK_TABLE
+	printf("Records:-\n");
+	printf("Free list: %llu (%llu)\n", (u_llong) tab->tab_rec_free_id, (u_llong) tab->tab_rec_fnum);
+	printf("EOF:       %llu\n", (u_llong) tab->tab_rec_eof_id);
+#endif
+
+	/* Step 1: scan all records, record IDs start at 1: */
+	rec_id = 1;
+	while (rec_id < tab->tab_rec_eof_id) {
+		if (!xt_tab_get_rec_data(ot, rec_id, tab->tab_dic.dic_rec_size, ot->ot_row_rbuffer))
+			xt_throw(self);
+
+#ifdef DUMP_CHECK_TABLE
+		printf("%-4llu ", (u_llong) rec_id);
+#endif
+		/* Classify the record and gather the size statistics: */
+		switch (rec_buf->tr_rec_type_1 & XT_TAB_STATUS_MASK) {
+			case XT_TAB_STATUS_FREED:
+#ifdef DUMP_CHECK_TABLE
+				printf("======== ");
+#endif
+				free_rec_count++;
+				break;
+			case XT_TAB_STATUS_DELETE:
+#ifdef DUMP_CHECK_TABLE
+				printf("delete   ");
+#endif
+				delete_rec_count++;
+				break;
+			case XT_TAB_STATUS_FIXED:
+#ifdef DUMP_CHECK_TABLE
+				printf("record-F ");
+#endif
+				alloc_rec_count++;
+				row_size = myxt_store_row_length(ot, (char *) ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE);
+				alloc_rec_bytes += row_size;
+				if (!min_comp_rec_len || row_size < min_comp_rec_len)
+					min_comp_rec_len = row_size;
+				if (row_size > max_comp_rec_len)
+					max_comp_rec_len = row_size;
+				break;
+			case XT_TAB_STATUS_VARIABLE:
+#ifdef DUMP_CHECK_TABLE
+				printf("record-V ");
+#endif
+				alloc_rec_count++;
+				row_size = myxt_load_row_length(ot, tab->tab_dic.dic_rec_size, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, NULL);
+				alloc_rec_bytes += row_size;
+				if (!min_comp_rec_len || row_size < min_comp_rec_len)
+					min_comp_rec_len = row_size;
+				if (row_size > max_comp_rec_len)
+					max_comp_rec_len = row_size;
+				break;
+			case XT_TAB_STATUS_EXT_DLOG:
+#ifdef DUMP_CHECK_TABLE
+				printf("record-X ");
+#endif
+				alloc_rec_count++;
+				ext_rec_count++;
+				ext_data_len += XT_GET_DISK_4(rec_buf->re_log_dat_siz_4);
+				/* Size in the data log plus the data area of the handle record: */
+				row_size = XT_GET_DISK_4(rec_buf->re_log_dat_siz_4) + ot->ot_rec_size - XT_REC_EXT_HEADER_SIZE;
+				alloc_rec_bytes += row_size;
+				if (!min_comp_rec_len || row_size < min_comp_rec_len)
+					min_comp_rec_len = row_size;
+				if (row_size > max_comp_rec_len)
+					max_comp_rec_len = row_size;
+				break;
+		}
+#ifdef DUMP_CHECK_TABLE
+		if (rec_buf->tr_rec_type_1 & XT_TAB_STATUS_CLEANED_BIT)
+			printf("C");
+		else
+			printf(" ");
+#endif
+		prev_rec_id = XT_GET_DISK_4(rec_buf->tr_prev_rec_id_4);
+		xn_id = XT_GET_DISK_4(rec_buf->tr_xact_id_4);
+		row_id = XT_GET_DISK_4(rec_buf->tr_row_id_4);
+		switch (rec_buf->tr_rec_type_1 & XT_TAB_STATUS_MASK) {
+			case XT_TAB_STATUS_FREED:
+#ifdef DUMP_CHECK_TABLE
+				printf(" prev=%-3llu (xact=%-3llu row=%lu)\n", (u_llong) prev_rec_id, (u_llong) xn_id, (u_long) row_id);
+#endif
+				break;
+			case XT_TAB_STATUS_EXT_DLOG:
+#ifdef DUMP_CHECK_TABLE
+				printf(" prev=%-3llu  xact=%-3llu row=%lu  Xlog=%lu Xoff=%llu Xsiz=%lu\n", (u_llong) prev_rec_id, (u_llong) xn_id, (u_long) row_id, (u_long) XT_GET_DISK_2(rec_buf->re_log_id_2), (u_llong) XT_GET_DISK_6(rec_buf->re_log_offs_6), (u_long) XT_GET_DISK_4(rec_buf->re_log_dat_siz_4));
+#endif
+
+#ifdef CHECK_TABLE_READ_DATA_LOG
+				/* Read the header of the extended record from the data log
+				 * and verify that it refers back to this record:
+				 */
+				log_size = XT_GET_DISK_4(rec_buf->re_log_dat_siz_4);
+				XT_GET_LOG_REF(log_id, log_offset, rec_buf);
+				if (!self->st_dlog_buf.dlb_read_log(log_id, log_offset, offsetof(XTactExtRecEntryDRec, er_data), (xtWord1 *) &ext_rec, self))
+					xt_log_and_clear_exception(self);
+				else {
+					size_t		log_size2;
+					xtTableID	curr_tab_id;
+					xtRecordID	curr_rec_id;
+
+					log_size2 = XT_GET_DISK_4(ext_rec.er_data_size_4);
+					curr_tab_id = XT_GET_DISK_4(ext_rec.er_tab_id_4);
+					curr_rec_id = XT_GET_DISK_4(ext_rec.er_rec_id_4);
+					if (log_size2 != log_size || curr_tab_id != tab->tab_id || curr_rec_id != rec_id) {
+						xt_logf(XT_INFO, "Table %s: record %llu, extended record %lu:%llu not valid\n", tab->tab_name->ps_path, (u_llong) rec_id, (u_long) log_id, (u_llong) log_offset);
+					}
+				}
+#endif
+				break;
+			default:
+#ifdef DUMP_CHECK_TABLE
+				printf(" prev=%-3llu  xact=%-3llu row=%lu\n", (u_llong) prev_rec_id, (u_llong) xn_id, (u_long) row_id);
+#endif
+				break;
+		}
+		rec_id++;
+	}
+	
+#ifdef CHECK_TABLE_STATS
+	if (!tab->tab_dic.dic_rec_fixed) {
+		printf("Extended data length    = %llu\n", ext_data_len);
+		printf("Extended record count   = %llu\n", ext_rec_count);
+	}
+	
+	if (alloc_rec_count) {
+		printf("Minumum comp. rec. len. = %llu\n", (u_llong) min_comp_rec_len);
+		printf("Average comp. rec. len. = %llu\n", (u_llong) ((double) alloc_rec_bytes / (double) alloc_rec_count + (double) 0.5));
+		printf("Maximum comp. rec. len. = %llu\n", (u_llong) max_comp_rec_len);
+	}
+	printf("Free record count       = %llu\n", (u_llong) free_rec_count);
+	printf("Deleted record count    = %llu\n", (u_llong) delete_rec_count);
+	printf("Allocated record count  = %llu\n", (u_llong) alloc_rec_count);
+#endif
+	/* Step 2: the number of freed records found must match the table's count.
+	 * The "%s" arguments below use ps_path (a string), as the printf() calls
+	 * in this file do, rather than the XTPathStrPtr itself.
+	 */
+	if (tab->tab_rec_fnum != free_rec_count)
+		xt_logf(XT_INFO, "Table %s: incorrect number of free blocks, %llu, should be: %llu\n", tab->tab_name->ps_path, (u_llong) free_rec_count, (u_llong) tab->tab_rec_fnum);
+
+	/* Step 3: checking the record free list: */
+	prec_id = 0;
+	rec_id = tab->tab_rec_free_id;
+	while (rec_id) {
+		if (rec_id >= tab->tab_rec_eof_id) {
+			xt_logf(XT_INFO, "Table %s: invalid reference on free list: %llu, ", tab->tab_name->ps_path, (u_llong) rec_id);
+			if (prec_id)
+				xt_logf(XT_INFO, "reference by: %llu\n", (u_llong) prec_id);
+			else
+				xt_logf(XT_INFO, "reference by list head pointer\n");
+			break;
+		}
+		if (!xt_tab_get_rec_data(ot, rec_id, XT_REC_FIX_HEADER_SIZE, (xtWord1 *) rec_buf)) {
+			xt_log_and_clear_exception(self);
+			break;
+		}
+		if ((rec_buf->tr_rec_type_1 & XT_TAB_STATUS_MASK) != XT_TAB_STATUS_FREED)
+			xt_logf(XT_INFO, "Table %s: record, %llu, on free list is not free\n", tab->tab_name->ps_path, (u_llong) rec_id);
+		free_count2++;
+		prec_id = rec_id;
+		/* In a freed record this field links to the next entry on the list: */
+		rec_id = XT_GET_DISK_4(rec_buf->tr_prev_rec_id_4);
+	}
+	if (free_count2 != free_rec_count)
+		xt_logf(XT_INFO, "Table %s: not all free blocks (%llu) on free list: %llu\n", tab->tab_name->ps_path, (u_llong) free_rec_count, (u_llong) free_count2);
+
+	freer_(); // xt_unlock_mutex(&tab->tab_rec_lock);
+
+	xtRefID ref_id;
+
+	xt_lock_mutex(self, &tab->tab_row_lock);
+	pushr_(xt_unlock_mutex, &tab->tab_row_lock);
+
+#ifdef DUMP_CHECK_TABLE
+	printf("Rows:-\n");
+	printf("Free list: %llu (%llu)\n", (u_llong) tab->tab_row_free_id, (u_llong) tab->tab_row_fnum);
+	printf("EOF:       %llu\n", (u_llong) tab->tab_row_eof_id);
+#endif
+
+	/* Step 4: scan all rows (rec_id is reused here as the row ID): */
+	rec_id = 1;
+	while (rec_id < tab->tab_row_eof_id) {
+		if (!tab->tab_rows.xt_tc_read_4(ot->ot_row_file, rec_id, &ref_id, self))
+			xt_throw(self);
+#ifdef DUMP_CHECK_TABLE
+		printf("%-3llu ", (u_llong) rec_id);
+#endif
+#ifdef DUMP_CHECK_TABLE
+		if (ref_id == 0)
+			printf("====== 0\n");
+		else
+			printf("in use %llu\n", (u_llong) ref_id);
+#endif
+		rec_id++;
+	}
+
+	/* Step 5: checking the row free list: */
+	prec_id = 0;
+	free_count2 = 0;
+	row_id = tab->tab_row_free_id;
+	while (row_id) {
+		if (row_id >= tab->tab_row_eof_id) {
+			xt_logf(XT_INFO, "Table %s: invalid reference on free row: %llu, ", tab->tab_name->ps_path, (u_llong) row_id);
+			if (prec_id)
+				xt_logf(XT_INFO, "reference by: %llu\n", (u_llong) prec_id);
+			else
+				xt_logf(XT_INFO, "reference by list head pointer\n");
+			break;
+		}
+		if (!tab->tab_rows.xt_tc_read_4(ot->ot_row_file, row_id, &ref_id, self)) {
+			xt_log_and_clear_exception(self);
+			break;
+		}
+		free_count2++;
+		prec_id = row_id;
+		row_id = ref_id;
+	}
+	if (free_count2 != tab->tab_row_fnum)
+		xt_logf(XT_INFO, "Table %s: free row count (%llu) differs from the number of row on the list: %llu\n", tab->tab_name->ps_path, (u_llong) tab->tab_row_fnum, (u_llong) free_count2);
+
+	freer_(); // xt_unlock_mutex(&tab->tab_row_lock);
+
+#ifdef CHECK_INDEX_ON_CHECK_TABLE
+	xt_check_indices(ot);
+#endif
+	freer_(); // xt_unlock_mutex(&tab->tab_db->db_co_ext_lock);
+}
+
+/*
+ * Rename a table from old_name to new_name within the database
+ * currently used by the calling thread (self->st_database).
+ *
+ * The function renames every file belonging to the table, replaces the
+ * name and path stored in the table's entry in db->db_table_by_id, and
+ * re-opens the table under the new name, carrying over the dictionary
+ * of the old table handle.
+ *
+ * Throws XT_ERR_NAME_TOO_LONG if the last component of new_name does
+ * not fit in XT_TABLE_NAME_SIZE-1 characters. A failure to rename an
+ * individual file is logged and cleared, it does not abort the rename.
+ */
+xtPublic void xt_rename_table(XTThreadPtr self, XTPathStrPtr old_name, XTPathStrPtr new_name)
+{
+	XTDatabaseHPtr		db = self->st_database;
+	XTOpenTablePoolPtr	table_pool;
+	XTTableHPtr			tab = NULL;
+	char				table_name[XT_MAX_TABLE_FILE_NAME_SIZE];
+	char				*postfix;
+	XTFilesOfTableRec	ft;
+	XTDictionaryRec		dic;
+	xtTableID			tab_id;
+	XTTableEntryPtr		te_ptr;
+	char				*te_new_name;
+	XTTablePathPtr		te_new_path;
+	XTTablePathPtr		te_old_path;
+	char				to_path[PATH_MAX];
+
+	memset(&dic, 0, sizeof(dic));
+
+#ifdef TRACE_CREATE_TABLES
+	printf("RENAME %s --> %s\n", old_name->ps_path, new_name->ps_path);
+#endif
+	if (strlen(xt_last_name_of_path(new_name->ps_path)) > XT_TABLE_NAME_SIZE-1)
+		xt_throw_taberr(XT_CONTEXT, XT_ERR_NAME_TOO_LONG, new_name);
+
+	/* MySQL renames the table while it is in use. Here is
+	 * the sequence:
+	 *
+	 * OPEN tab1
+	 * CREATE tmp_tab
+	 * OPEN tmp_tab
+	 * COPY tab1 -> tmp_tab
+	 * CLOSE tmp_tab
+	 * RENAME tab1 -> tmp2_tab
+	 * RENAME tmp_tab -> tab1
+	 * CLOSE tab1 (tmp2_tab)
+	 * DELETE tmp2_tab
+	 * OPEN tab1
+	 *
+	 * Since the table is open when it is renamed, I cannot
+	 * get exclusive use of the table for this operation.
+	 *
+	 * So instead we just make sure that the sweeper is not
+	 * using the table.
+	 */
+	table_pool = tab_lock_table(self, old_name, FALSE, TRUE, FALSE, &tab);
+	pushr_(xt_db_unlock_table_pool, table_pool);
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+	tab_id = tab->tab_id;
+	/* Take over the dictionary of the old handle, it is passed to
+	 * xt_use_table_no_lock() when the table is re-opened below:
+	 */
+	myxt_move_dictionary(&dic, &tab->tab_dic);
+	pushr_(myxt_free_dictionary, &dic);
+	pushr_(xt_heap_release, tab);
+
+	/* Unmap the memory mapped table files: 
+	 * For windows this must be done before we
+	 * can rename the files.
+	 */
+	tab_close_mapped_files(self, tab);
+
+	freer_(); // xt_heap_release(self, tab)
+
+	/* Create the new name and path: */
+	te_new_name = xt_dup_string(self, xt_last_name_of_path(new_name->ps_path));
+	pushr_(xt_free, te_new_name);
+	te_new_path = tab_get_table_path(self, db, new_name, FALSE);
+	pushr_(tab_free_table_path, te_new_path);
+
+	/* NOTE(review): te_ptr is dereferenced below without a NULL check,
+	 * presumably the entry always exists for a table that was just
+	 * locked - confirm.
+	 */
+	te_ptr = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &tab_id);
+
+	/* Remove the table from the Database directory: */
+	xt_ht_del(self, db->db_tables, old_name);
+
+	/* Rename each file of the table: the new file name is the new
+	 * table path followed by the postfix of the existing file.
+	 */
+	xt_enum_files_of_tables_init(old_name, tab_id, &ft);
+	while (xt_enum_files_of_tables_next(&ft)) {
+		postfix = xt_tab_file_to_name(XT_MAX_TABLE_FILE_NAME_SIZE, table_name, ft.ft_file_path);
+
+		xt_strcpy(PATH_MAX, to_path, new_name->ps_path);
+		xt_strcat(PATH_MAX, to_path, postfix);
+
+		/* A failed rename is only logged, the loop continues: */
+		if (!xt_fs_rename(NULL, ft.ft_file_path, to_path))
+			xt_log_and_clear_exception(self);
+	}
+
+	/* Switch the table name and path: */
+	xt_free(self, te_ptr->te_tab_name);
+	te_ptr->te_tab_name = te_new_name;
+	te_old_path = te_ptr->te_tab_path;
+	te_ptr->te_tab_path = te_new_path;
+	tab_remove_table_path(self, db, te_old_path);
+
+	/* The new name and path now belong to the table entry, so they
+	 * are removed from the cleanup stack without being freed:
+	 */
+	popr_(); // Discard tab_free_table_path(te_new_path);
+	popr_(); // Discard xt_free(te_new_name);
+
+	tab = xt_use_table_no_lock(self, db, new_name, FALSE, FALSE, &dic);
+	/* All renamed tables are considered repaired! */
+	xt_tab_table_repaired(tab);
+	xt_heap_release(self, tab);
+
+	freer_(); // myxt_free_dictionary(&dic)
+	freer_(); // xt_ht_unlock(db->db_tables)
+	freer_(); // xt_db_unlock_table_pool(table_pool)
+}
+
+/*
+ * Locking wrapper around xt_use_table_no_lock(): looks up the table
+ * called 'name' in the database currently used by the calling thread,
+ * holding the lock on the database's table directory (db_tables) for
+ * the duration of the call.
+ *
+ * no_load and missing_ok are passed through unchanged. The result is
+ * whatever xt_use_table_no_lock() returns.
+ */
+xtPublic XTTableHPtr xt_use_table(XTThreadPtr self, XTPathStrPtr name, xtBool no_load, xtBool missing_ok)
+{
+	XTDatabaseHPtr	cur_db;
+	XTTableHPtr		found;
+
+	cur_db = self->st_database;
+
+	xt_ht_lock(self, cur_db->db_tables);
+	pushr_(xt_ht_unlock, cur_db->db_tables);
+
+	found = xt_use_table_no_lock(self, cur_db, name, no_load, missing_ok, NULL);
+
+	freer_(); // xt_ht_unlock(cur_db->db_tables)
+	return found;
+}
+
+/*
+ * Flush a table after giving the background threads the chance to
+ * catch up:
+ *
+ *  1. If the sweeper is idle, wake it and poll until it has left the
+ *     idle state, its check count has changed, or it no longer exists.
+ *  2. Wait until the sweeper is idle again (or no longer exists).
+ *  3. Wait until the writer has applied all operations on the table,
+ *     flushing the transaction log on every iteration.
+ *  4. Call xt_flush_table().
+ *
+ * Throws if the transaction log flush fails, and whatever
+ * xt_flush_table() throws.
+ */
+xtPublic void xt_sync_flush_table(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	XTTableHPtr		tab = ot->ot_table;
+	XTDatabaseHPtr	db = tab->tab_db;
+
+	/* Wakeup the sweeper:
+	 * We want the sweeper to check if there is anything to do,
+	 * so we must wake it up.
+	 * Once it has done all it can, it will go back to sleep.
+	 * This should be good enough.
+	 *
+	 * NOTE: In all cases, we do not wait if the sweeper is in
+	 * error state.
+	 */
+	if (db->db_sw_idle) {
+		/* Remember the check count, a change shows that the sweeper
+		 * has run since we started polling:
+		 */
+		u_int check_count = db->db_sw_check_count;
+
+		for (;;) {
+			xt_wakeup_sweeper(db);
+			if (!db->db_sw_thread || db->db_sw_idle != XT_THREAD_IDLE || check_count != db->db_sw_check_count)
+				break;
+			xt_sleep_milli_second(10);
+		}
+	}
+
+	/* Wait for the sweeper to become idle: */
+	xt_lock_mutex(self, &db->db_sw_lock);
+	pushr_(xt_unlock_mutex, &db->db_sw_lock);
+	while (db->db_sw_thread && !db->db_sw_idle) {
+		xt_timed_wait_cond(self, &db->db_sw_cond, &db->db_sw_lock, 10);
+	}
+	freer_(); // xt_unlock_mutex(&db->db_sw_lock)
+
+	/* Wait for the writer to write out all operations on the table:
+	 * We also do not wait for the writer if it is in
+	 * error state.
+	 */
+	while (db->db_wr_thread && 
+		db->db_wr_idle != XT_THREAD_INERR &&
+		XTTableSeq::xt_op_is_before(tab->tab_head_op_seq+1, tab->tab_seq.ts_next_seq)) {
+		/* Flush the log, in case this is holding up the
+		 * writer!
+		 */
+		if (!db->db_xlog.xlog_flush(self))
+			xt_throw(self);
+
+		/* Register as a waiting thread while we sleep below: */
+		xt_lock_mutex(self, &db->db_wr_lock);
+		pushr_(xt_unlock_mutex, &db->db_wr_lock);
+		db->db_wr_thread_waiting++;
+		/*
+		 * Wake the writer if it is sleeping. In order to
+		 * flush a table we must wait for the writer to complete
+		 * committing all the changes in the table to the database.
+		 */
+		if (db->db_wr_idle) {
+			if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+				xt_log_and_clear_exception_ns();
+		}
+
+		freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+		xt_sleep_milli_second(10);
+
+		/* Done sleeping, deregister again: */
+		xt_lock_mutex(self, &db->db_wr_lock);
+		pushr_(xt_unlock_mutex, &db->db_wr_lock);
+		db->db_wr_thread_waiting--;
+		freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+	}
+
+	xt_flush_table(self, ot);
+}
+
+/*
+ * Flush the record and row files of a table, then write the table
+ * header.
+ *
+ * ot              - the open table, ot->ot_thread must be the calling
+ *                   thread (asserted).
+ * bytes_flushed   - if not NULL, the value of tab_bytes_to_flush at the
+ *                   time of the flush is added to *bytes_flushed.
+ * have_table_lock - passed on to xt_begin_checkpoint().
+ *
+ * The work is done while holding tab_rec_flush_lock, between a call to
+ * xt_begin_checkpoint() and a call to xt_end_checkpoint().
+ *
+ * Returns OK on success, or FAILED if any step failed. If the file
+ * flush or the header write fails, tab_flush_pending is set back to
+ * TRUE.
+ */
+xtPublic xtBool xt_flush_record_row(XTOpenTablePtr ot, off_t *bytes_flushed, xtBool have_table_lock)
+{
+	XTTableHeadDRec			rec_head;
+	XTTableHPtr				tab = ot->ot_table;
+	off_t					to_flush;
+	XTCheckPointTablePtr	cp_tab;
+	XTCheckPointStatePtr	cp = NULL;
+
+	if (!xt_begin_checkpoint(tab->tab_db, have_table_lock, ot->ot_thread))
+		return FAILED;
+
+	xt_lock_mutex_ns(&tab->tab_rec_flush_lock);
+
+	ASSERT_NS(ot->ot_thread == xt_get_self());
+	/* Make sure that the table recovery point, in
+	 * particular the operation ID is recorded
+	 * before all other flush activity!
+	 *
+	 * This is because only operations after the
+	 * recovery point in the header are applied
+	 * to the table on recovery.
+	 *
+	 * So the operation ID is recorded before the
+	 * flush activity, and written after all is done.
+	 */
+	xt_tab_store_header(ot, &rec_head);
+
+#ifdef TRACE_FLUSH
+	printf("FLUSH rec/row %d %s\n", (int) tab->tab_bytes_to_flush, tab->tab_name->ps_path);
+	fflush(stdout);
+#endif
+	/* Write the table header: */
+	if (tab->tab_flush_pending) {
+		/* The flag is cleared before flushing, and set again below
+		 * if the flush fails:
+		 */
+		tab->tab_flush_pending = FALSE;
+		// Want to see how much was to be flushed in the debugger:
+		to_flush = tab->tab_bytes_to_flush;
+		tab->tab_bytes_to_flush = 0;
+		if (bytes_flushed)
+			*bytes_flushed += to_flush;
+		/* Flush the table data (this is skipped for temporary tables): */
+		if (!(tab->tab_dic.dic_tab_flags & XT_TAB_FLAGS_TEMP_TAB)) {
+			if (!XT_FLUSH_RR_FILE(ot->ot_rec_file, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread) ||
+				!XT_FLUSH_RR_FILE(ot->ot_row_file, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread)) {
+				tab->tab_flush_pending = TRUE;
+				goto failed;
+			}
+		}
+
+		/* The header includes the operation number which
+		 * must be written AFTER all other data,
+		 * because operations will not be applied again.
+		 */
+		if (!xt_tab_write_header(ot, &rec_head, ot->ot_thread)) {
+			tab->tab_flush_pending = TRUE;
+			goto failed;
+		}
+	}
+
+	/* Flush the auto-increment: */
+	if (xt_db_auto_increment_mode == 1) {
+		if (tab->tab_auto_inc != tab->tab_dic.dic_min_auto_inc) {
+			tab->tab_dic.dic_min_auto_inc = tab->tab_auto_inc;
+			if (!xt_tab_write_min_auto_inc(ot))
+				goto failed;
+		}
+	}
+
+	/* Mark this table as record/row flushed: */
+	cp = &tab->tab_db->db_cp_state;
+	xt_lock_mutex_ns(&cp->cp_state_lock);
+	if (cp->cp_running) {
+		cp_tab = (XTCheckPointTablePtr) xt_sl_find(NULL, cp->cp_table_ids, &tab->tab_id);
+		if (cp_tab && (cp_tab->cpt_flushed & XT_CPT_ALL_FLUSHED) != XT_CPT_ALL_FLUSHED) {
+			cp_tab->cpt_flushed |= XT_CPT_REC_ROW_FLUSHED;
+			/* Count the table once, when the last of its flush bits is set: */
+			if ((cp_tab->cpt_flushed & XT_CPT_ALL_FLUSHED) == XT_CPT_ALL_FLUSHED) {
+				ASSERT_NS(cp->cp_flush_count < xt_sl_get_size(cp->cp_table_ids));
+				cp->cp_flush_count++;
+			}
+		}
+	}
+	xt_unlock_mutex_ns(&cp->cp_state_lock);
+
+#ifdef TRACE_FLUSH
+	printf("FLUSH --end-- %s\n", tab->tab_name->ps_path);
+	fflush(stdout);
+#endif
+	xt_unlock_mutex_ns(&tab->tab_rec_flush_lock);
+
+	if (!xt_end_checkpoint(tab->tab_db, ot->ot_thread, NULL))
+		return FAILED;
+	return OK;
+	
+	/* NOTE(review): unlike the success path, this path returns without
+	 * calling xt_end_checkpoint() - confirm whether the call to
+	 * xt_begin_checkpoint() above has to be balanced here.
+	 */
+	failed:
+	xt_unlock_mutex_ns(&tab->tab_rec_flush_lock);
+	return FAILED;
+}
+
+/*
+ * Flush a table completely: first the record and row files
+ * (xt_flush_record_row), then the indices (xt_flush_indices).
+ * Throws if either of the two steps reports a failure.
+ */
+xtPublic void xt_flush_table(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	xtBool	flushed;
+
+	/* GOTCHA [*10*]: This bug was difficult to find.
+	 * It occurred, sometimes, on Windows in the
+	 * multi_update test.
+	 *
+	 * The sequence of events was as follows:
+	 *
+	 * - The checkpointer starts to flush the table,
+	 *   and gets as far as the XT_FLUSH_RR_FILE part.
+	 *
+	 * - A rename occurs. The user thread flushes the
+	 *   table, and writes the table header with the
+	 *   most recent table operation (the last operation
+	 *   that occurred).
+	 *
+	 * - The checkpointer then completes, and also writes
+	 *   the header, but with the old values (as read in
+	 *   xt_tab_store_header()).
+	 *
+	 * - The user thread then continues, and reopens the
+	 *   table after the rename. On reopen, it reads the
+	 *   old value from the header, and sets the current
+	 *   operation number accordingly.
+	 *
+	 * Now there is a problem in the table cache, because
+	 * some cache pages have operation numbers that are
+	 * greater than the current operation number!
+	 *
+	 * This later led to the free-er hanging while it
+	 * waited for an operation to be written to the disk
+	 * that never would be. This is because a page can
+	 * only be freed when the head operation number has
+	 * passed the page operation number, which indicates
+	 * that the page has been written to disk.
+	 */
+	flushed = xt_flush_record_row(ot, NULL, FALSE);
+	if (!flushed)
+		xt_throw(self);
+
+	/* The index flush used to be done before the table
+	 * data flush (after xt_tab_store_header()). I don't
+	 * think the position makes any difference, because
+	 * the checkpointer did it at this position as well.
+	 */
+	flushed = xt_flush_indices(ot, NULL, FALSE);
+	if (!flushed)
+		xt_throw(self);
+}
+
+/*
+ * Allocate and initialise an open table structure for 'tab'.
+ *
+ * The function takes a reference to the table, opens the row, record
+ * and index files of the table, and allocates the row read and write
+ * buffers (each tab_dic.dic_rec_size bytes).
+ *
+ * Returns the new open table, or NULL on failure. On failure after the
+ * initial allocation, the partly initialised structure is passed to
+ * tab_close_table().
+ */
+static XTOpenTablePtr tab_open_table(XTTableHPtr tab)
+{
+	/* NOTE(review): 'volatile' is presumably required because ot is
+	 * used after the try_/catch_ block below - confirm.
+	 */
+	volatile XTOpenTablePtr	ot;
+	XTThreadPtr				self;
+
+	if (!(ot = (XTOpenTablePtr) xt_malloc_ns(sizeof(XTOpenTableRec))))
+		return NULL;
+	/* Only the part of the structure before ot_ind_wbuf is zeroed: */
+	memset(ot, 0, offsetof(XTOpenTableRec, ot_ind_wbuf));
+
+	ot->ot_seq_page = NULL;
+	ot->ot_seq_data = NULL;
+
+	self = xt_get_self();
+	try_(a) {
+		xt_heap_reference(self, tab);
+		ot->ot_table = tab;
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+		ot->ot_row_file = xt_open_fmap(self, ot->ot_table->tab_row_file->fil_path, xt_db_row_file_grow_size);
+		ot->ot_rec_file = xt_open_fmap(self, ot->ot_table->tab_rec_file->fil_path, xt_db_data_file_grow_size);
+#else
+		ot->ot_row_file = xt_open_file(self, ot->ot_table->tab_row_file->fil_path, XT_FS_DEFAULT);
+		ot->ot_rec_file = xt_open_file(self, ot->ot_table->tab_rec_file->fil_path, XT_FS_DEFAULT);
+#endif
+#ifdef XT_USE_DIRECT_IO_ON_INDEX
+		ot->ot_ind_file = xt_open_file(self, ot->ot_table->tab_ind_file->fil_path, XT_FS_MISSING_OK | XT_FS_DIRECT_IO);
+#else
+		ot->ot_ind_file = xt_open_file(self, ot->ot_table->tab_ind_file->fil_path, XT_FS_MISSING_OK);
+#endif
+	}
+	catch_(a) {
+		/* Nothing is done here, a failure is detected by the
+		 * NULL checks that follow.
+		 */
+		;
+	}
+	cont_(a);
+
+	/* The index file is opened with XT_FS_MISSING_OK, and is not
+	 * checked here:
+	 */
+	if (!ot->ot_table || !ot->ot_row_file || !ot->ot_rec_file)
+		goto failed;
+
+	if (!(ot->ot_row_rbuffer = (xtWord1 *) xt_malloc_ns(ot->ot_table->tab_dic.dic_rec_size)))
+		goto failed;
+	ot->ot_row_rbuf_size = ot->ot_table->tab_dic.dic_rec_size;
+	if (!(ot->ot_row_wbuffer = (xtWord1 *) xt_malloc_ns(ot->ot_table->tab_dic.dic_rec_size)))
+		goto failed;
+	ot->ot_row_wbuf_size = ot->ot_table->tab_dic.dic_rec_size;
+
+	/* Cache this stuff to speed access a bit: */
+	ot->ot_rec_fixed = ot->ot_table->tab_dic.dic_rec_fixed;
+	ot->ot_rec_size = ot->ot_table->tab_dic.dic_rec_size;
+
+	return ot;
+
+	failed:
+	tab_close_table(ot);
+	return NULL;
+}
+
+/*
+ * Public entry point for opening a table: returns the result of
+ * tab_open_table(), which is NULL if the table could not be opened.
+ */
+xtPublic XTOpenTablePtr xt_open_table(XTTableHPtr tab)
+{
+	XTOpenTablePtr	opened;
+
+	opened = tab_open_table(tab);
+	return opened;
+}
+
+/*
+ * Close an open table. If 'flush' is set, the record/row files and
+ * then the indices are flushed first (have_table_lock is passed on to
+ * both flush calls). A flush failure is logged and cleared, the table
+ * is closed in any case.
+ */
+xtPublic void xt_close_table(XTOpenTablePtr ot, xtBool flush, xtBool have_table_lock)
+{
+	if (!flush) {
+		tab_close_table(ot);
+		return;
+	}
+
+	/* Both flushes are always attempted, each failure is only logged: */
+	xtBool	flushed;
+
+	flushed = xt_flush_record_row(ot, NULL, have_table_lock);
+	if (!flushed)
+		xt_log_and_clear_exception_ns();
+
+	flushed = xt_flush_indices(ot, NULL, have_table_lock);
+	if (!flushed)
+		xt_log_and_clear_exception_ns();
+
+	tab_close_table(ot);
+}
+
+/*
+ * Find a table by ID in the given database and take a reference to it.
+ *
+ * self   - the calling thread.
+ * r_tab  - receives the referenced table handle, or NULL.
+ * db     - the database to search, throws XT_ERR_NO_DATABASE_IN_USE
+ *          if this is NULL.
+ * tab_id - the ID of the table required.
+ *
+ * If the table entry exists but has no handle yet, a new handle is
+ * created with tab_new_handle(), using the path and name stored in the
+ * entry.
+ *
+ * Returns XT_TAB_NOT_FOUND if there is no entry with this ID, the
+ * result of tab_new_handle() if a handle had to be created, and
+ * XT_TAB_OK otherwise.
+ */
+xtPublic int xt_use_table_by_id(XTThreadPtr self, XTTableHPtr *r_tab, XTDatabaseHPtr db, xtTableID tab_id)
+{
+	char			tab_path[PATH_MAX];
+	XTTableEntryPtr	entry;
+	XTTableHPtr		handle = NULL;
+	int				result = XT_TAB_OK;
+
+	if (!db)
+		xt_throw_xterr(XT_CONTEXT, XT_ERR_NO_DATABASE_IN_USE);
+
+	xt_ht_lock(self, db->db_tables);
+	pushr_(xt_ht_unlock, db->db_tables);
+
+	entry = (XTTableEntryPtr) xt_sl_find(self, db->db_table_by_id, &tab_id);
+	if (!entry)
+		result = XT_TAB_NOT_FOUND;
+	else {
+		handle = entry->te_table;
+		if (!handle) {
+			/* No handle yet, build "<path><dir char><name>" and open the table: */
+			xt_strcpy(PATH_MAX, tab_path, entry->te_tab_path->tp_path);
+			xt_add_dir_char(PATH_MAX, tab_path);
+			xt_strcat(PATH_MAX, tab_path, entry->te_tab_name);
+			result = tab_new_handle(self, &handle, db, tab_id, (XTPathStrPtr) tab_path, TRUE, NULL);
+		}
+	}
+
+	/* The caller receives its own reference to the table: */
+	if (handle)
+		xt_heap_reference(self, handle);
+	*r_tab = handle;
+
+	freer_(); // xt_ht_unlock(db->db_tables)
+	return result;
+}
+
+/* The fixed part of the record is already in the row buffer.
+ * This function loads the extended part, expanding the row
+ * buffer if necessary.
+ *
+ * ot          - the open table, ot->ot_row_rbuffer holds the handle
+ *               record, which contains the reference to the data log.
+ * load_rec_id - the ID of the record being loaded, this is checked
+ *               against the record ID stored with the extended data.
+ * buffer      - the destination passed on to myxt_load_row().
+ * cols_req    - passed on to myxt_load_row().
+ *
+ * The extended data is read, together with its log header, directly
+ * behind the fixed part in the row buffer. The header overwrites the
+ * end of the fixed part, so those bytes are saved before the read and
+ * restored after a successful read.
+ *
+ * If the read fails, or the log header does not match this record, the
+ * log reference is read again from the record file while holding
+ * db_co_ext_lock, and the read is repeated once.
+ *
+ * Returns the result of myxt_load_row() on success, otherwise FAILED.
+ * On failure the saved bytes are not restored to the row buffer.
+ */
+xtPublic xtBool xt_tab_load_ext_data(XTOpenTablePtr ot, xtRecordID load_rec_id, xtWord1 *buffer, u_int cols_req)
+{
+	size_t					log_size;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtWord1					save_buffer[offsetof(XTactExtRecEntryDRec, er_data)];
+	xtBool					retried = FALSE;
+	XTactExtRecEntryDPtr	ext_data_ptr;
+	size_t					log_size2;
+	xtTableID				curr_tab_id;
+	xtRecordID				curr_rec_id;
+
+	log_size = XT_GET_DISK_4(((XTTabRecExtDPtr) ot->ot_row_rbuffer)->re_log_dat_siz_4);
+	XT_GET_LOG_REF(log_id, log_offset, (XTTabRecExtDPtr) ot->ot_row_rbuffer);
+
+	/* Grow the row buffer if the complete record does not fit: */
+	if (ot->ot_rec_size + log_size > ot->ot_row_rbuf_size) {
+		if (!xt_realloc_ns((void **) &ot->ot_row_rbuffer, ot->ot_rec_size + log_size))
+			return FAILED;
+		ot->ot_row_rbuf_size = ot->ot_rec_size + log_size;
+	}
+
+	/* Read the extended part first: */
+	ext_data_ptr = (XTactExtRecEntryDPtr) (ot->ot_row_rbuffer + ot->ot_rec_size - offsetof(XTactExtRecEntryDRec, er_data));
+
+	/* Save the data which the header will overwrite: */
+	memcpy(save_buffer, ext_data_ptr, offsetof(XTactExtRecEntryDRec, er_data));
+	
+	reread:
+	if (!ot->ot_thread->st_dlog_buf.dlb_read_log(log_id, log_offset, offsetof(XTactExtRecEntryDRec, er_data) + log_size, (xtWord1 *) ext_data_ptr, ot->ot_thread))
+		goto retry_read;
+
+	log_size2 = XT_GET_DISK_4(ext_data_ptr->er_data_size_4);
+	curr_tab_id = XT_GET_DISK_4(ext_data_ptr->er_tab_id_4);
+	curr_rec_id = XT_GET_DISK_4(ext_data_ptr->er_rec_id_4);
+
+	if (log_size2 != log_size || curr_tab_id != ot->ot_table->tab_id || curr_rec_id != load_rec_id) {
+		/* [(3)] This can happen in the following circumstances:
+		 * - A new record is created, but the data log is not
+		 * flushed.
+		 * - The server quits.
+		 * - On restart the transaction is rolled back, but the data record
+		 *   was not written, so later a new record could be written at this
+		 *   location.
+		 * - Later the sweeper tries to cleanup this record, and finds
+		 *   that a different record has been written at this position.
+		 *
+		 * NOTE: Index entries can only be written to disk for records
+		 *       that have been committed to the disk, because uncommitted
+		 *       records may not exist in order to remove the index entry
+		 *       on cleanup.
+		 */
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_BAD_EXT_RECORD);
+		goto retry_read;
+	}
+
+	/* Restore the saved area: */
+	memcpy(ext_data_ptr, save_buffer, offsetof(XTactExtRecEntryDRec, er_data));
+
+	/* db_co_ext_lock is only held if the read was retried: */
+	if (retried)
+		xt_unlock_mutex_ns(&ot->ot_table->tab_db->db_co_ext_lock);
+	return myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, buffer, cols_req);
+
+	retry_read:
+	if (!retried) {
+		/* (1) It may be that reading the log fails because the garbage collector
+		 * has moved the record since we determined the location.
+		 * We handle this here, by re-reading the data the garbage collector
+		 * would have updated.
+		 *
+		 * (2) It may also happen that a new record is just being updated or
+		 * inserted. It is possible that the handle part of the record
+		 * has been written, but not yet the overflow.
+		 * This means that repeating the read attempt could work.
+		 *
+		 * (3) The extended data has been written by another handler and not yet
+		 * flushed. This should not happen because only committed extended
+		 * records are read, and all data should be flushed before
+		 * commit!
+		 *
+		 * NOTE: (2) above is not a problem when versioning is working
+		 * correctly. In this case, we should never try to read the extended
+		 * part of an uncommitted record (belonging to some other thread/
+		 * transaction).
+		 */
+		XTTabRecExtDRec	rec_buf;
+
+		xt_lock_mutex_ns(&ot->ot_table->tab_db->db_co_ext_lock);
+		retried = TRUE;
+
+		if (!xt_tab_get_rec_data(ot, load_rec_id, XT_REC_EXT_HEADER_SIZE, (xtWord1 *) &rec_buf))
+			goto failed;
+
+		/* NOTE(review): only the log ID and offset are refreshed here,
+		 * log_size keeps the value read before the retry - confirm that
+		 * the data size cannot change together with the location.
+		 */
+		XT_GET_LOG_REF(log_id, log_offset, &rec_buf);
+		goto reread;
+	}
+
+	/* A second failure falls through to here: */
+	failed:
+	if (retried)
+		xt_unlock_mutex_ns(&ot->ot_table->tab_db->db_co_ext_lock);
+	return FAILED;
+}
+
+xtPublic xtBool xt_tab_put_rec_data(XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq)
+{
+	/* Write 'size' bytes from 'buffer' to record 'rec_id' (offset argument is 0). */
+	/* Record ID zero is caught by the assertion; the result of xt_tc_write() is returned as is. */
+	ASSERT_NS(rec_id);
+	return ot->ot_table->tab_recs.xt_tc_write(ot->ot_rec_file, rec_id, 0,
+		size, buffer, op_seq, TRUE, ot->ot_thread);
+}
+
+xtPublic xtBool xt_tab_put_log_op_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer)
+{
+	/* Write 'buffer' to record 'rec_id', then log the change using the
+	 * op_seq filled in by xt_tc_write().
+	 * Returns FAILED if the write fails, else the result of xt_xlog_modify_table().
+	 */
+	register XTTableHPtr	tab = ot->ot_table;
+	xtOpSeqNo				op_seq;
+	size_t					rec_offset = 0;
+
+	ASSERT_NS(rec_id);
+#ifdef DEBUG
+	/* Shouldn't be used anymore. Checked up front so that a DEBUG build
+	 * still does the write below and never logs an unset op_seq. */
+	ASSERT_NS(status != XT_LOG_ENT_REC_CLEANED_1);
+#endif
+	/* For a moved record the write starts at the re_log_id_2 field: */
+	if (status == XT_LOG_ENT_REC_MOVED)
+		rec_offset = offsetof(XTTabRecExtDRec, re_log_id_2);
+	if (!tab->tab_recs.xt_tc_write(ot->ot_rec_file, rec_id, rec_offset, size, buffer, &op_seq, TRUE, ot->ot_thread))
+		return FAILED;
+	return xt_xlog_modify_table(tab->tab_id, status, op_seq, free_rec_id, rec_id, size, buffer, ot->ot_thread);
+}
+
+xtPublic xtBool xt_tab_put_log_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq)
+{
+	/* Write the record, then log the change. The op_seq set by the
+	 * write is passed to the log and stays available in *op_seq.
+	 */
+	register XTTableHPtr	table = ot->ot_table;
+	xtBool					written;
+
+	ASSERT_NS(rec_id);
+	/* For XT_LOG_ENT_REC_MOVED the write starts at re_log_id_2, otherwise at offset 0: */
+	written = (status == XT_LOG_ENT_REC_MOVED)
+		? table->tab_recs.xt_tc_write(ot->ot_rec_file, rec_id, offsetof(XTTabRecExtDRec, re_log_id_2), size, buffer, op_seq, TRUE, ot->ot_thread)
+		: table->tab_recs.xt_tc_write(ot->ot_rec_file, rec_id, 0, size, buffer, op_seq, TRUE, ot->ot_thread);
+	if (!written)
+		return FAILED;
+	return xt_xlog_modify_table(table->tab_id, status, *op_seq, free_rec_id, rec_id, size, buffer, ot->ot_thread);
+}
+
+xtPublic xtBool xt_tab_get_rec_data(XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer)
+{
+	/* Read 'size' bytes of record 'rec_id' into 'buffer'. */
+	/* Record ID zero is caught by the assertion; the result of xt_tc_read() is returned as is. */
+	ASSERT_NS(rec_id);
+	return ot->ot_table->tab_recs.xt_tc_read(ot->ot_rec_file, rec_id, size,
+		buffer, ot->ot_thread);
+}
+
+/*
+ * Note: this function grants locks even to transactions that
+ * are not specifically waiting for this transaction.
+ * This is required, because all threads waiting for 
+ * a lock should be considered "equal". In other words,
+ * they should not have to wait for the "right" transaction
+ * before they get the lock, or it will turn into a
+ * race to wait for the correct transaction.
+ *
+ * A transaction T1 can end up waiting for the wrong transaction
+ * T2, because T2 has released the lock, and given it to T3.
+ * Of course, T1 will wake up soon and realize this, but
+ * it is a matter of timing.
+ *
+ * The main point is that T2 has released the lock because
+ * it has ended (see {RELEASING-LOCKS} for more details)
+ * and therefore there is no danger of it claiming the
+ * lock again. Otherwise a deadlock could occur if T1 were
+ * given the lock instead of T3 in the example above,
+ * and T2 then tried to regain the lock before T1
+ * realized that it had the lock.
+ */
+//static xtBool tab_get_lock_after_wait(XTThreadPtr thread, XTLockWaitPtr lw)
+//{
+//	register XTTableHPtr	tab = lw->lw_ot->ot_table;
+
+	/* {ROW-LIST-LOCK}
+	 * I don't believe this lock is required. If it is, please explain why!!
+	 * XT_TAB_ROW_READ_LOCK(&tab->tab_row_rwlock[gl->lw_row_id % XT_ROW_RWLOCKS], thread);
+	 *
+	 * With the old row lock implementation a XT_TAB_ROW_WRITE_LOCK was required because
+	 * the row locking did not have its own locks.
+	 * The new list locking has its own locks. I was using XT_TAB_ROW_READ_LOCK,
+	 * but i don't think this is required.
+	 */
+//	return tab->tab_locks.xt_set_temp_lock(lw->lw_ot, lw, &lw->lw_thread->st_lock_list);
+//}
+
+/*
+ * NOTE: Previously this function did not gain the row lock.
+ * If this change is a problem, please document why!
+ * The previous implementation waited until there was no lock on the
+ * row.
+ *
+ * I am thinking that it is simply a good idea to grab the lock,
+ * instead of waiting for no lock, before the retry. But it could
+ * result in locking more than required!
+ */
+static xtBool tab_wait_for_update(register XTOpenTablePtr ot, xtRowID row_id, xtXactID xn_id, XTThreadPtr thread)
+{
+	XTXactWaitRec	xact_wait;
+	XTLockWaitRec	lock_wait;
+	XTLockWaitRec	*pending_lock;
+	xtBool			result;
+
+	/* The transaction we have to wait for: */
+	xact_wait.xw_xn_id = xn_id;
+
+	/* The temporary row lock we ask for: */
+	lock_wait.lw_row_updated = FALSE;
+	lock_wait.lw_row_id = row_id;
+	lock_wait.lw_ot = ot;
+	lock_wait.lw_thread = thread;
+
+	if (!ot->ot_table->tab_locks.xt_set_temp_lock(ot, &lock_wait, &thread->st_lock_list))
+		return FAILED;
+
+	/* lw_curr_lock other than XT_NO_LOCK: wait for the lock, then the
+	 * transaction. Otherwise just wait for the transaction. */
+	pending_lock = (lock_wait.lw_curr_lock != XT_NO_LOCK) ? &lock_wait : NULL;
+	result = xt_xn_wait_for_xact(thread, &xact_wait, pending_lock);
+#ifdef DEBUG_LOCK_QUEUE
+	ot->ot_table->tab_locks.rl_check(&lock_wait);
+#endif
+	return result;
+}
+
+/* {WAIT-FOR}
+ * XT_OLD - The record is old. No longer visible because there is
+ * newer committed record before it in the record list.
+ * This is a special case of FALSE (the record is not visible).
+ * (see {WAIT-FOR} for details).
+ * It is significant because if we find too many of these when
+ * searching for records, then we have reason to believe the
+ * sweeper is far behind. This can happen in a test like this:
+ * runTest(INCREMENT_TEST, 2, INCREMENT_TEST_UPDATE_COUNT);
+ * What happens is T1 detects an updated row by T2,
+ * but T2 has not committed yet.
+ * It waits for T2. T2 commits and updates again before T1
+ * can update.
+ *
+ * Of course if we got a lock on the row when T2 quits, then
+ * this would not happen!
+ */
+
+/*
+ * Is a record visible?
+ * Returns TRUE, FALSE, XT_ERR, XT_NEW, XT_REREAD or XT_RETRY.
+ *
+ * TRUE - The record is visible.
+ * FALSE - The record is not visible.
+ * XT_ERR - An exception (error) occurred.
+ * XT_NEW - The most recent variation of this row has been returned (in *new_rec_id) and is to be used instead of the input!
+ * XT_REREAD - Re-read the record, and try again.
+ * XT_RETRY - (FOR UPDATE only) We waited for another transaction; the index scan must be retried.
+ *
+ * Basically, a record is visible if it was committed on or before
+ * the transactions "visible time" (st_visible_time), and there
+ * are no other visible records before this record in the
+ * variation chain for the record.
+ *
+ * This holds in general, but you don't always get to see the
+ * visible record (as defined in this sense).
+ *
+ * On any kind of update (SELECT FOR UPDATE, UPDATE or DELETE), you
+ * get to see the most recent variation of the row!
+ *
+ * So on update, this function will wait if necessary for a recent
+ * update to be committed.
+ *
+ * So an update is a kind of "committed read" with a wait for
+ * uncommitted records.
+ *
+ * The result:
+ * - INSERTS may not be seen by the update read, depending on when
+ *   they occur.
+ * - Records may be returned in non-index order.
+ * - New records returned must be checked again by an index scan
+ *   to make sure they conform to the condition!
+ * 
+ * CREATE TABLE test_tab (ID int primary key, Value int, Name varchar(20), 
+ * index(Value, Name)) ENGINE=pbxt;
+ * INSERT test_tab values(4, 2, 'D');
+ * INSERT test_tab values(5, 2, 'E');
+ * INSERT test_tab values(6, 2, 'F');
+ * INSERT test_tab values(7, 2, 'G');
+ * 
+ * -- C1
+ * begin;
+ * select * from test_tab where id = 6 for update;
+ * -- C2
+ * begin;
+ * select * from test_tab where value = 2 order by value, name for update;
+ * -- C1
+ * update test_tab set Name = 'A' where id = 7;
+ * commit;
+ * -- C2
+ * Result order D, E, F, A.
+ *
+ * But Jim does it like this, so it should be OK.
+ */
+static int tab_visible(register XTOpenTablePtr ot, XTTabRecHeadDPtr rec_head, xtRecordID *new_rec_id)
+{
+	XTThreadPtr				thread = ot->ot_thread;
+	xtXactID				xn_id;
+	XTTabRecHeadDRec		var_head;
+	xtRowID					row_id;
+	xtRecordID				var_rec_id;
+	register XTTableHPtr	tab;
+	xtBool					wait = FALSE;
+	xtXactID				wait_xn_id = 0;
+#ifdef TRACE_VARIATIONS
+	char					t_buf[500];
+	int						len;
+#endif
+	int						result = TRUE;
+	xtBool					rec_clean;
+	xtRecordID				invalid_rec;
+	/* NOTE: XT_RETRY can also be returned (FOR UPDATE, after waiting for another transaction). */
+	retry:
+	/* It can be that between the time that I read the index,
+	 * and the time that I try to access the
+	 * record, that the record is removed by
+	 * the sweeper!
+	 */
+	if (XT_REC_NOT_VALID(rec_head->tr_rec_type_1))
+		return FALSE;
+
+	row_id = XT_GET_DISK_4(rec_head->tr_row_id_4);
+
+	/* This can happen if the row has been removed, and
+	 * reused:
+	 */
+	if (ot->ot_curr_row_id && row_id != ot->ot_curr_row_id)
+		return FALSE;
+
+#ifdef TRACE_VARIATIONS
+	len = sprintf(t_buf, "row=%d rec=%d ", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+	if (!(rec_clean = XT_REC_IS_CLEAN(rec_head->tr_rec_type_1))) {
+		/* The record is not clean, which means it has not been swept.
+		 * So we have to check if it is visible.
+		 */
+		xn_id = XT_GET_DISK_4(rec_head->tr_xact_id_4);
+		switch (xt_xn_status(ot, xn_id, ot->ot_curr_rec_id)) {
+			case XT_XN_VISIBLE:
+				break;
+			case XT_XN_NOT_VISIBLE:
+				if (ot->ot_for_update) {
+					/* It is visible, only if it is an insert,
+					 * which means it has no previous variation.
+					 * Note, if an insert is updated, the record
+					 * should be overwritten (TODO - check this).
+					 */
+					var_rec_id = XT_GET_DISK_4(rec_head->tr_prev_rec_id_4);
+					if (!var_rec_id)
+						break;
+#ifdef TRACE_VARIATIONS
+					if (len <= 450)
+						len += sprintf(t_buf+len, "OTHER COMMIT (OVERWRITTEN) T%d\n", (int) xn_id);
+					xt_ttracef(thread, "%s", t_buf);
+#endif
+				}
+#ifdef TRACE_VARIATIONS
+				else {
+					if (len <= 450)
+						len += sprintf(t_buf+len, "OTHER COMMIT T%d\n", (int) xn_id);
+					xt_ttracef(thread, "%s", t_buf);
+				}
+#endif
+				/* {WAKE-SW}
+				 * The record is not visible, although it has been committed.
+				 * Clean the transaction ASAP.
+				 */
+				ot->ot_table->tab_db->db_sw_faster |= XT_SW_DIRTY_RECORD_FOUND;
+				return FALSE;
+			case XT_XN_ABORTED:
+				/* {WAKE-SW}
+				 * Reading an aborted record, this transaction
+				 * must be cleaned up ASAP!
+				 */
+				ot->ot_table->tab_db->db_sw_faster |= XT_SW_DIRTY_RECORD_FOUND;
+#ifdef TRACE_VARIATIONS
+				if (len <= 450)
+					len += sprintf(t_buf+len, "ABORTED T%d\n", (int) xn_id);
+				xt_ttracef(thread, "%s", t_buf);
+#endif
+				return FALSE;
+			case XT_XN_MY_UPDATE:
+				/* This is a record written by this transaction. */
+				if (thread->st_is_update) {
+					/* Check that it was not written by the current update statement: */
+					if (XT_STAT_ID_MASK(ot->ot_update_id) == rec_head->tr_stat_id_1) {
+#ifdef TRACE_VARIATIONS
+						if (len <= 450)
+							len += sprintf(t_buf+len, "MY UPDATE IN THIS STATEMENT T%d\n", (int) xn_id);
+						xt_ttracef(thread, "%s", t_buf);
+#endif
+						return FALSE;
+					}
+				}
+				ot->ot_curr_row_id = row_id;
+				ot->ot_curr_updated = TRUE;
+				if (!(xt_tab_get_row(ot, row_id, &var_rec_id)))
+					return XT_ERR;
+				/* It is visible if it is at the front of the list.
+				 * An update can end up not being at the front of the list
+				 * if it is deleted afterwards!
+				 */
+#ifdef TRACE_VARIATIONS
+				if (len <= 450) {
+					if (var_rec_id == ot->ot_curr_rec_id)
+						len += sprintf(t_buf+len, "MY UPDATE T%d\n", (int) xn_id);
+					else
+						len += sprintf(t_buf+len, "MY UPDATE (OVERWRITTEN) T%d\n", (int) xn_id);
+				}
+				xt_ttracef(thread, "%s", t_buf);
+#endif
+				return var_rec_id == ot->ot_curr_rec_id;
+			case XT_XN_OTHER_UPDATE:
+				if (ot->ot_for_update) {
+					/* If this is an insert, we are interested!
+					 * Updated values are handled below. This is because
+					 * the changed (new) records returned below are always
+					 * followed (in the version chain) by the record
+					 * we would have returned (if nothing had changed).
+					 *
+					 * As a result, we only return records here which have
+					 * no "history".
+					 */
+					var_rec_id = XT_GET_DISK_4(rec_head->tr_prev_rec_id_4);
+					if (!var_rec_id) {
+#ifdef TRACE_VARIATIONS
+						if (len <= 450)
+							len += sprintf(t_buf+len, "OTHER INSERT (WAIT FOR) T%d\n", (int) xn_id);
+						xt_ttracef(thread, "%s", t_buf);
+#endif
+						if (!tab_wait_for_update(ot, row_id, xn_id, thread))
+							return XT_ERR;
+						if (!xt_tab_get_rec_data(ot, ot->ot_curr_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &var_head))
+							return XT_ERR;
+						rec_head = &var_head;
+						goto retry;
+					}
+				}
+#ifdef TRACE_VARIATIONS
+				if (len <= 450)
+					len += sprintf(t_buf+len, "OTHER UPDATE T%d\n", (int) xn_id);
+				xt_ttracef(thread, "%s", t_buf);
+#endif
+				return FALSE;
+			case XT_XN_REREAD:
+#ifdef TRACE_VARIATIONS
+				if (len <= 450)
+					len += sprintf(t_buf+len, "REREAD?! T%d\n", (int) xn_id);
+				xt_ttracef(thread, "%s", t_buf);
+#endif
+				return XT_REREAD;
+		}
+	}
+
+	/* Follow the variation chain until we come to this record.
+	 * If it is not the first visible variation then
+	 * it is not visible at all. If it is not found on the
+	 * variation chain, it is also not visible.
+	 */
+	tab = ot->ot_table;
+
+	retry_2:
+
+#ifdef XT_USE_LIST_BASED_ROW_LOCKS
+	/* The list based row locks use their own locks, so
+	 * it is not necessary to get a write lock here.
+	 */
+	XT_TAB_ROW_READ_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+#else
+	if (ot->ot_for_update)
+		XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+	else
+		XT_TAB_ROW_READ_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+#endif
+
+	invalid_rec = 0;
+	retry_3:
+	if (!(xt_tab_get_row(ot, row_id, &var_rec_id)))
+		goto failed;
+#ifdef TRACE_VARIATIONS
+	len += sprintf(t_buf+len, "ROW=%d", (int) row_id);	/* NOTE(review): unlike the other trace appends, there is no "len <= 450" guard here */
+#endif
+	while (var_rec_id != ot->ot_curr_rec_id) {
+		if (!var_rec_id) {
+#ifdef TRACE_VARIATIONS
+			xt_ttracef(thread, "row=%d rec=%d NOT VISI not found in list\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+			goto not_found;
+		}
+		if (!xt_tab_get_rec_data(ot, var_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &var_head))
+			goto failed;
+#ifdef TRACE_VARIATIONS
+		if (len <= 450)
+			len += sprintf(t_buf+len, " -> %d(%d)", (int) var_rec_id, (int) var_head.tr_rec_type_1);
+#endif
+		/* All clean records are visible, by all transactions: */
+		if (XT_REC_IS_CLEAN(var_head.tr_rec_type_1)) {
+#ifdef TRACE_VARIATIONS
+			xt_ttracef(thread, "row=%d rec=%d NOT VISI clean rec found\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+			goto not_found;
+		}
+		if (XT_REC_IS_FREE(var_head.tr_rec_type_1)) {
+#ifdef TRACE_VARIATIONS
+			xt_ttracef(thread, "row=%d rec=%d NOT VISI free rec found?!\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+			/*
+			 * After an analysis we came to the conclusion that this situation is
+			 * possible and valid. It can happen if index scan and row deletion
+			 * go in parallel:
+			 *
+			 *      Client Thread                                Sweeper
+			 *      -------------                                -------
+			 *   1. start index scan, lock the index file.
+			 *                                                2. start row deletion, wait for index lock
+			 *   3. unlock the index file, start search for
+			 *      the valid version of the record
+			 *                                                4. delete the row, mark record as freed,
+			 *                                                   but not yet cleaned by sweeper
+			 *   5. observe the record being freed
+			 *
+			 * after these steps we can get here, if the record was marked as free after
+			 * the tab_visible was entered by the scanning thread.
+			 *
+			 */
+			if (invalid_rec != var_rec_id) {
+				/* This was "var_rec_id = invalid_rec", caused an infinite loop (bug #310184!) */
+				invalid_rec = var_rec_id;
+				goto retry_3;
+			}
+			/* Assume end of list. */
+			goto not_found;
+		}
+
+		/* This can happen if the row has been removed, and
+		 * reused:
+		 */
+		if (row_id != XT_GET_DISK_4(var_head.tr_row_id_4))
+			goto not_found;
+
+		xn_id = XT_GET_DISK_4(var_head.tr_xact_id_4);
+		/* This variation is visible if committed before this
+		 * transaction started, or updated by this transaction.
+		 *
+		 * We now know that this is the valid variation for
+		 * this record (for this table) for this transaction!
+		 * This will not change, unless the transaction
+		 * updates the record (again).
+		 *
+		 * So we can store this information as a hint, if
+		 * we see other variations belonging to this record,
+		 * then we can ignore them immediately!
+		 */
+		switch (xt_xn_status(ot, xn_id, var_rec_id)) {
+			case XT_XN_VISIBLE:
+				/* {WAKE-SW}
+				 * We have encountered a record that has been overwritten, if the
+				 * record has not been cleaned, then the sweeper is too far
+				 * behind!
+				 */
+				if (!rec_clean)
+					ot->ot_table->tab_db->db_sw_faster |= XT_SW_DIRTY_RECORD_FOUND;
+#ifdef TRACE_VARIATIONS
+				xt_ttracef(thread, "row=%d rec=%d NOT VISI committed rec found\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+				goto not_found;
+			case XT_XN_NOT_VISIBLE:
+				if (ot->ot_for_update) {
+					/* Substitute this record for the one we
+					 * are reading!!
+					 */
+					if (result == TRUE) {
+						if (XT_REC_IS_DELETE(var_head.tr_rec_type_1))
+							result = FALSE;
+						else {
+							*new_rec_id = var_rec_id;
+							result = XT_NEW;
+						}
+					}
+				}
+				break;
+			case XT_XN_ABORTED:
+				/* Ignore the record, it will be removed. */
+				break;
+			case XT_XN_MY_UPDATE:
+#ifdef TRACE_VARIATIONS
+				xt_ttracef(thread, "row=%d rec=%d NOT VISI my update found\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+				goto not_found;
+			case XT_XN_OTHER_UPDATE:
+				/* Wait for this update to commit or abort: */
+				if (!wait) {
+					wait = TRUE;
+					wait_xn_id = xn_id;
+				}
+#ifdef TRACE_VARIATIONS
+				if (len <= 450)
+					len += sprintf(t_buf+len, "-T%d", (int) wait_xn_id);
+#endif
+				break;
+			case XT_XN_REREAD:
+				/* {RETRY-READ}
+				 * TODO: This is not as "correct" as it could be.
+				 * Such records should be considered to be aborted,
+				 * and removed from the list.
+				 */
+				if (invalid_rec != var_rec_id) {
+					invalid_rec = var_rec_id;
+					goto retry_3;
+				}
+				if (!tab_record_corrupt(ot, row_id, var_rec_id, true, 1))
+					goto failed;
+
+				/* Assume end of list. */
+#ifdef XT_CRASH_DEBUG
+				/* Should not happen! */
+				xt_crash_me();
+#endif
+				goto not_found;
+		}
+		var_rec_id = XT_GET_DISK_4(var_head.tr_prev_rec_id_4);
+	}
+#ifdef TRACE_VARIATIONS
+	if (len <= 450)
+		sprintf(t_buf+len, " -> %d(%d)\n", (int) var_rec_id, (int) rec_head->tr_rec_type_1);
+	else
+		sprintf(t_buf+len, " ...\n");
+	//xt_ttracef(thread, "%s", t_buf);
+#endif
+
+	if (ot->ot_for_update) {
+		xtBool			ok;
+		XTLockWaitRec	lw;
+
+		if (wait) {
+			XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+#ifdef TRACE_VARIATIONS
+			xt_ttracef(thread, "T%d WAIT FOR T%d (will retry)\n", (int) thread->st_xact_data->xd_start_xn_id, (int) wait_xn_id);
+#endif
+			if (!tab_wait_for_update(ot, row_id, wait_xn_id, thread))
+				return XT_ERR;
+			wait = FALSE;
+			wait_xn_id = 0;
+			/*
+			 * Retry in order to try to avoid missing
+			 * any records that we should see in FOR UPDATE
+			 * mode.
+			 *
+			 * We also want to take another look at the record
+			 * we just tried to read.
+			 *
+			 * If it has been updated, then a new record has
+			 * been created. This will be detected when we
+			 * try to read it again, and XT_NEW will be returned.
+			 */
+			thread->st_statistics.st_retry_index_scan++;
+			return XT_RETRY;
+		}
+
+		/* {ROW-LIST-LOCK} */
+		lw.lw_thread = thread;
+		lw.lw_ot = ot;
+		lw.lw_row_id = row_id;
+		lw.lw_row_updated = FALSE;
+		ok = tab->tab_locks.xt_set_temp_lock(ot, &lw, &thread->st_lock_list);
+		XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+		if (!ok) {
+#ifdef DEBUG_LOCK_QUEUE
+			ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+			return XT_ERR;
+		}
+		if (lw.lw_curr_lock != XT_NO_LOCK) {
+#ifdef TRACE_VARIATIONS
+			xt_ttracef(thread, "T%d WAIT FOR LOCK(%s) T%d\n", (int) thread->st_xact_data->xd_start_xn_id, (int) lw.lw_curr_lock == XT_TEMP_LOCK ? "temp" : "perm", (int) xn_id);
+#endif
+			if (!xt_xn_wait_for_xact(thread, NULL, &lw)) {
+#ifdef DEBUG_LOCK_QUEUE
+				ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+				return XT_ERR;
+			}
+#ifdef DEBUG_LOCK_QUEUE
+			ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+#ifdef TRACE_VARIATIONS
+			len = sprintf(t_buf, "(retry): row=%d rec=%d ", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+			/* GOTCHA!
+			 * Reset the result before we go down the list again, to make sure we
+			 * get the latest record!!
+			 */
+			result = TRUE;
+			thread->st_statistics.st_reread_record_list++;
+			goto retry_2;
+		}
+#ifdef DEBUG_LOCK_QUEUE
+		ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+	}
+	else {
+		XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+	}
+
+#ifdef TRACE_VARIATIONS
+	if (result == XT_NEW)
+		xt_ttracef(thread, "row=%d rec=%d RETURN NEW %d\n", (int) row_id, (int) ot->ot_curr_rec_id, (int) *new_rec_id);
+	else if (result)
+		xt_ttracef(thread, "row=%d rec=%d VISIBLE\n", (int) row_id, (int) ot->ot_curr_rec_id);
+	else
+		xt_ttracef(thread, "row=%d rec=%d RETURN NOT VISIBLE (NEW)\n", (int) row_id, (int) ot->ot_curr_rec_id);
+#endif
+
+	ot->ot_curr_row_id = row_id;
+	ot->ot_curr_updated = FALSE;
+	return result;
+
+	not_found:
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+	return FALSE;
+
+	failed:
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], thread);
+	return XT_ERR;
+}
+
+/*
+ * Return TRUE if the record ot->ot_curr_rec_id is visible, FALSE if it is not,
+ * and XT_ERR if an error occurs. Other tab_visible() results (XT_NEW, XT_RETRY)
+ * are passed back as is; on XT_NEW ot->ot_curr_rec_id is set to the new record.
+ */
+xtPublic int xt_tab_visible(XTOpenTablePtr ot)
+{
+	xtRowID				row_id;
+	XTTabRecHeadDRec	rec_head;
+	xtRecordID			new_rec_id;
+	xtBool				read_again = FALSE;
+	int					r;
+
+	if ((row_id = ot->ot_curr_row_id)) {
+		/* Fast track, do a quick check.
+		 * Row ID is only set if this record has been committed,
+		 * (and swept).
+		 * Check if it is the first on the list!
+		 */
+		xtRecordID var_rec_id;
+
+		retry:
+		if (!(xt_tab_get_row(ot, row_id, &var_rec_id)))
+			return XT_ERR;
+		if (ot->ot_curr_rec_id == var_rec_id) {
+			/* Looks good.. */
+			if (ot->ot_for_update) {
+				XTThreadPtr		thread = ot->ot_thread;
+				XTTableHPtr		tab = ot->ot_table;
+				XTLockWaitRec	lw;
+
+				/* {ROW-LIST-LOCK} */
+				lw.lw_thread = thread;
+				lw.lw_ot = ot;
+				lw.lw_row_id = row_id;
+				lw.lw_row_updated = FALSE;
+				if (!tab->tab_locks.xt_set_temp_lock(ot, &lw, &thread->st_lock_list)) {
+#ifdef DEBUG_LOCK_QUEUE
+					ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+					return XT_ERR;
+				}
+				if (lw.lw_curr_lock != XT_NO_LOCK) {
+					if (!xt_xn_wait_for_xact(thread, NULL, &lw)) {
+#ifdef DEBUG_LOCK_QUEUE
+						ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+						return XT_ERR;
+					}
+#ifdef DEBUG_LOCK_QUEUE
+					ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+					goto retry;	/* We had to wait: check the front of the row's list again. */
+				}
+#ifdef DEBUG_LOCK_QUEUE
+				ot->ot_table->tab_locks.rl_check(&lw);
+#endif
+			}
+			return TRUE;
+		}
+	}
+	/* Slow track: read the record header and run the full visibility check. */
+	reread:
+	if (!xt_tab_get_rec_data(ot, ot->ot_curr_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &rec_head))
+		return XT_ERR;
+
+	switch ((r = tab_visible(ot, &rec_head, &new_rec_id))) {
+		case XT_NEW:
+			ot->ot_curr_rec_id = new_rec_id;
+			break;
+		case XT_REREAD:
+			/* Avoid infinite loop: only one re-read is attempted. */
+			if (read_again) {
+				/* Should not happen! NOTE(review): row_id is 0 here if ot_curr_row_id was not set on entry. */
+				if (!tab_record_corrupt(ot, row_id, ot->ot_curr_rec_id, true, 2))
+					return XT_ERR;
+#ifdef XT_CRASH_DEBUG
+				/* Generate a core dump! */
+				xt_crash_me();
+#endif
+				return FALSE;
+			}
+			read_again = TRUE;
+			goto reread;
+		default:
+			break;
+	}
+	return r;
+}
+
+/*
+ * Read the record ot->ot_curr_rec_id into 'buffer', and return one of:
+ * OK - the record has been read, and is visible.
+ * FALSE - the record is not visible. XT_ERR - an error occurred.
+ * XT_RETRY - passed on from tab_visible(); nothing was loaded into 'buffer'.
+ * XT_NEW - Means the expected record has been changed (the new one was read).
+ * When doing an index scan, the conditions must be checked again!
+ */
+xtPublic int xt_tab_read_record(register XTOpenTablePtr ot, xtWord1 *buffer)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	size_t					rec_size = tab->tab_dic.dic_rec_size;
+	xtRecordID				new_rec_id;
+	int						result;
+	xtBool					read_again = FALSE;
+
+	if (!(ot->ot_thread->st_xact_data)) {
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
+		return XT_ERR;
+	}
+
+	reread:
+	if (!xt_tab_get_rec_data(ot, ot->ot_curr_rec_id, rec_size, ot->ot_row_rbuffer))
+		return XT_ERR;
+
+	switch (tab_visible(ot, (XTTabRecHeadDPtr) ot->ot_row_rbuffer, &new_rec_id)) {
+		case FALSE:
+			return FALSE;
+		case XT_ERR:
+			return XT_ERR;
+		case XT_NEW:
+			if (!xt_tab_get_rec_data(ot, new_rec_id, rec_size, ot->ot_row_rbuffer))
+				return XT_ERR;
+			ot->ot_curr_rec_id = new_rec_id;
+			result = XT_NEW;
+			break;
+		case XT_RETRY:
+			return XT_RETRY;
+		case XT_REREAD:
+			/* Avoid infinite loop: only one re-read is attempted. */
+			if (read_again) {
+				/* Should not happen! */
+				if (!tab_record_corrupt(ot, XT_GET_DISK_4(((XTTabRecHeadDPtr) ot->ot_row_rbuffer)->tr_row_id_4), ot->ot_curr_rec_id, true, 3))
+					return XT_ERR;
+#ifdef XT_CRASH_DEBUG
+				/* Generate a core dump! */
+				xt_crash_me();
+#endif
+				return FALSE;
+			}
+			read_again = TRUE;
+			goto reread;
+		default:
+			result = OK;
+			break;
+	}
+	/* Unpack the row from ot_row_rbuffer into 'buffer', depending on the record format: */
+	if (ot->ot_rec_fixed)
+		memcpy(buffer, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, rec_size - XT_REC_FIX_HEADER_SIZE);
+	else if (ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VARIABLE || ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VAR_CLEAN) {
+		if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, buffer, ot->ot_cols_req))
+			return XT_ERR;
+	}
+	else {
+		u_int cols_req = ot->ot_cols_req;
+
+		ASSERT_NS(cols_req);
+		if (cols_req && cols_req <= tab->tab_dic.dic_fix_col_count) {
+			if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, buffer, cols_req))
+				return XT_ERR;
+		}
+		else {
+			if (!xt_tab_load_ext_data(ot, ot->ot_curr_rec_id, buffer, cols_req))
+				return XT_ERR;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * Read the record ot->ot_curr_rec_id without any visibility check.
+ *
+ * Returns OK if the row was loaded into 'buffer'.
+ * Returns FAILED if an error occurred (an invalid record counts as an error).
+ */
+xtPublic int xt_tab_dirty_read_record(register XTOpenTablePtr ot, xtWord1 *buffer)
+{
+	register XTTableHPtr	table = ot->ot_table;
+	size_t					rec_len = table->tab_dic.dic_rec_size;
+	XTTabRecHeadDPtr		head;
+	u_int					cols_req;
+
+	if (!xt_tab_get_rec_data(ot, ot->ot_curr_rec_id, rec_len, ot->ot_row_rbuffer))
+		return FAILED;
+	if (XT_REC_NOT_VALID(ot->ot_row_rbuffer[0])) {
+		/* Should not happen! */
+		xt_register_xterr(XT_REG_CONTEXT, XT_ERR_RECORD_DELETED);
+		return FAILED;
+	}
+
+	/* Note the row, and whether the record carries this transaction's start ID: */
+	head = (XTTabRecHeadDPtr) ot->ot_row_rbuffer;
+	ot->ot_curr_row_id = XT_GET_DISK_4(head->tr_row_id_4);
+	ot->ot_curr_updated = (XT_GET_DISK_4(head->tr_xact_id_4) == ot->ot_thread->st_xact_data->xd_start_xn_id);
+
+	if (ot->ot_rec_fixed) {
+		memcpy(buffer, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, rec_len - XT_REC_FIX_HEADER_SIZE);
+		return OK;
+	}
+	if (ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VARIABLE || ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VAR_CLEAN)
+		return myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, buffer, ot->ot_cols_req) ? OK : FAILED;
+
+	/* Any other record status: the row data follows the extended header. */
+	cols_req = ot->ot_cols_req;
+	ASSERT_NS(cols_req);
+	if (cols_req && cols_req <= table->tab_dic.dic_fix_col_count) {
+		/* cols_req does not exceed dic_fix_col_count: the record buffer alone is used. */
+		if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, buffer, cols_req))
+			return FAILED;
+	}
+	else if (!xt_tab_load_ext_data(ot, ot->ot_curr_rec_id, buffer, cols_req))
+		return FAILED;
+
+	return OK;
+}
+
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+/* TAB_ROW_LOAD_CACHE is the flag passed to xt_tc_get_page() in xt_tab_load_row_pointers().
+ * With memory mapped files, loading into cache is not required.
+ * Instead we copy the memory map to load the data.
+ */
+#define TAB_ROW_LOAD_CACHE		FALSE
+#else
+#define TAB_ROW_LOAD_CACHE		TRUE
+#endif
+
+/* Pull the entire row pointer file into memory (only if the cache has room for it).
+ * If a page cannot be fetched, the scratch buffer is freed and xt_throw() is called.
+ */
+xtPublic void xt_tab_load_row_pointers(XTThreadPtr self, XTOpenTablePtr ot)
+{
+	XTTableHPtr	tab = ot->ot_table;
+	xtRecordID	eof_rec_id = tab->tab_row_eof_id;
+	xtInt8		usage;
+	xtWord1		*buffer = NULL;
+
+	/* Check if there is enough cache: */
+	usage = xt_tc_get_usage();
+	if (xt_tc_get_high() > usage)
+		usage = xt_tc_get_high();
+	if (usage + ((xtInt8) eof_rec_id * (xtInt8) tab->tab_rows.tci_rec_size) < xt_tc_get_size()) {
+		xtRecordID			rec_id;
+		size_t				poffset, tfer;
+		off_t				offset, end_offset;
+		XTTabCachePagePtr	page;
+		xtWord1				*buff_ptr;
+
+		end_offset = xt_row_id_to_row_offset(tab, eof_rec_id);
+		for (rec_id = 1; rec_id < eof_rec_id; rec_id += tab->tab_rows.tci_rows_per_page) {
+			if (!tab->tab_rows.xt_tc_get_page(ot->ot_row_file, rec_id, TAB_ROW_LOAD_CACHE, &page, &poffset, self)) {
+				if (buffer)
+					xt_free(self, buffer);	/* Allocated in an earlier pass: do not leak it on the error path. */
+				xt_throw(self);
+			}
+			if (page)
+				tab->tab_rows.xt_tc_release_page(ot->ot_row_file, page, self);
+			else {
+				if (!buffer)
+					buffer = (xtWord1 *) xt_malloc(self, tab->tab_rows.tci_page_size);
+				offset = xt_row_id_to_row_offset(tab, rec_id);
+				tfer = tab->tab_rows.tci_page_size;
+				if (offset + (off_t) tfer > end_offset)
+					tfer = (size_t) (end_offset - offset);
+				XT_LOCK_MEMORY_PTR(buff_ptr, ot->ot_row_file, offset, tfer, &self->st_statistics.st_rec, self);
+				if (buff_ptr) {
+					memcpy(buffer, buff_ptr, tfer);
+					XT_UNLOCK_MEMORY_PTR(ot->ot_row_file, buff_ptr, TRUE, self);
+				}
+			}
+		}
+		if (buffer)
+			xt_free(self, buffer);
+	}
+}
+
+xtPublic void xt_tab_load_table(XTThreadPtr self, XTOpenTablePtr ot)
+{	/* Calls xt_load_pages() and then xt_load_indices() for the open table 'ot'; both are defined elsewhere. */
+	xt_load_pages(self, ot);
+	xt_load_indices(self, ot);
+}
+
+xtPublic xtBool xt_tab_load_record(register XTOpenTablePtr ot, xtRecordID rec_id, XTInfoBufferPtr rec_buf)
+{
+	/* Read record 'rec_id' and load its row into 'rec_buf' (sized here via xt_ib_alloc).
+	 * Returns FAILED on error, else OK. OK is also returned when the record is found to be
+	 * invalid: then only a warning is logged and 'rec_buf' is not touched. */
+	register XTTableHPtr	tab = ot->ot_table;
+	size_t					rec_size = tab->tab_dic.dic_rec_size;
+
+	if (!xt_tab_get_rec_data(ot, rec_id, rec_size, ot->ot_row_rbuffer))
+		return FAILED;
+	if (XT_REC_NOT_VALID(ot->ot_row_rbuffer[0])) {
+		/* Should not happen! */
+		XTThreadPtr self = ot->ot_thread;	/* presumably needed by the XT_WARNING log context - confirm */
+		xt_log(XT_WARNING, "Recently updated record invalid\n");
+		return OK;
+	}
+
+	ot->ot_curr_row_id = XT_GET_DISK_4(((XTTabRecHeadDPtr) ot->ot_row_rbuffer)->tr_row_id_4);
+	ot->ot_curr_updated =
+		(XT_GET_DISK_4(((XTTabRecHeadDPtr) ot->ot_row_rbuffer)->tr_xact_id_4) == ot->ot_thread->st_xact_data->xd_start_xn_id);
+
+	if (ot->ot_rec_fixed) {
+		size_t size = rec_size - XT_REC_FIX_HEADER_SIZE;
+		if (!xt_ib_alloc(NULL, rec_buf, size))
+			return FAILED;
+		memcpy(rec_buf->ib_db.db_data, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, size);
+	}
+	else {
+		if (!xt_ib_alloc(NULL, rec_buf, tab->tab_dic.dic_mysql_buf_size))
+			return FAILED;
+		if (ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VARIABLE || ot->ot_row_rbuffer[0] == XT_TAB_STATUS_VAR_CLEAN) {
+			if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, rec_buf->ib_db.db_data, ot->ot_cols_req))
+				return FAILED;
+		}
+		else {
+			u_int cols_req = ot->ot_cols_req;
+			ASSERT_NS(cols_req);
+			if (cols_req && cols_req <= tab->tab_dic.dic_fix_col_count) {
+				if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, rec_buf->ib_db.db_data, cols_req))
+					return FAILED;
+			}
+			else {
+				/* The record now in ot_row_rbuffer is 'rec_id', so that is the ID the ext data must match: */
+				if (!xt_tab_load_ext_data(ot, rec_id, rec_buf->ib_db.db_data, cols_req))
+					return FAILED;
+			}
+		}
+	}
+	return OK;
+}
+
+/*
+ * Put row_id at the head of the table's free-row list.
+ *
+ * Under tab_row_lock the row slot is overwritten with the ID of the
+ * previous list head, after which the head and the free count are
+ * updated. The change is then written to the transaction log as
+ * XT_LOG_ENT_ROW_FREED.
+ *
+ * Returns OK, or FAILED if the write or the log entry fails.
+ */
+xtPublic xtBool xt_tab_free_row(XTOpenTablePtr ot, XTTableHPtr tab, xtRowID row_id)
+{
+	XTTabRowRefDRec link;
+	xtRowID			old_head;
+	xtOpSeqNo		seq_no;
+	xtBool			linked = FALSE;
+
+	ASSERT_NS(row_id); // Cannot free the header!
+
+	xt_lock_mutex_ns(&tab->tab_row_lock);
+	old_head = tab->tab_row_free_id;
+	XT_SET_DISK_4(link.rr_ref_id_4, old_head);
+	if (tab->tab_rows.xt_tc_write(ot->ot_row_file, row_id, 0, sizeof(XTTabRowRefDRec), (xtWord1 *) &link, &seq_no, TRUE, ot->ot_thread)) {
+		/* The slot now points at the old head; make it the new head. */
+		tab->tab_row_free_id = row_id;
+		tab->tab_row_fnum++;
+		ASSERT_NS(tab->tab_row_fnum < tab->tab_row_eof_id);
+		linked = TRUE;
+	}
+	xt_unlock_mutex_ns(&tab->tab_row_lock);
+
+	if (!linked)
+		return FAILED;
+
+	if (!xt_xlog_modify_table(tab->tab_id, XT_LOG_ENT_ROW_FREED, seq_no, 0, row_id, sizeof(XTTabRowRefDRec), (xtWord1 *) &link, ot->ot_thread))
+		return FAILED;
+
+	return OK;
+}
+
+/*
+ * Delete the data log entry referenced by the extended record ext_rec
+ * (belonging to record rec_id of the open table).
+ *
+ * If the delete fails and log_err is TRUE the exception is logged and
+ * cleared; if log_err is FALSE the failure is left as is.
+ */
+static void tab_free_ext_record_on_fail(XTOpenTablePtr ot, xtRecordID rec_id, XTTabRecExtDPtr ext_rec, xtBool log_err)
+{
+	xtLogID		log_id;
+	xtLogOffset	log_offset;
+	xtWord4		data_size = XT_GET_DISK_4(ext_rec->re_log_dat_siz_4);
+	xtBool		deleted;
+
+	XT_GET_LOG_REF(log_id, log_offset, ext_rec);
+
+	deleted = ot->ot_thread->st_dlog_buf.dlb_delete_log(log_id, log_offset, data_size, ot->ot_table->tab_id, rec_id, ot->ot_thread);
+	if (!deleted && log_err)
+		xt_log_and_clear_exception_ns();
+}
+
+/* Copy the calling thread's current exception record into *e. */
+static void tab_save_exception(XTExceptionPtr e)
+{
+	*e = xt_get_self()->t_exception;
+}
+
+/* Overwrite the calling thread's current exception record with *e. */
+static void tab_restore_exception(XTExceptionPtr e)
+{
+	xt_get_self()->t_exception = *e;
+}
+
+/*
+ * This function assumes that a record may be partially written.
+ * It removes all associated data and references to the record.
+ *
+ * ot            The open table. Its read buffer (ot_row_rbuffer) is
+ *               overwritten by this call.
+ * rec_id        The record to remove; 0 is accepted and means
+ *               "nothing to do".
+ * rec_data      Caller-supplied buffer into which the row is loaded
+ *               so that its index entries can be deleted.
+ * prev_var_id   Out: set to 0 on entry, then to the record's
+ *               "previous record" ID once the record has been
+ *               checked to be in use and to belong to row_id.
+ * clean_delete  If TRUE the freed record is also marked with
+ *               XT_TAB_STATUS_CLEANED_BIT (see the comment below).
+ * row_id        The row the record is expected to belong to.
+ * xn_id         Not used.
+ *
+ * This function returns XT_ERR if an error occurs,
+ * TRUE if the record has been removed, and may be freed,
+ * FALSE if the record has already been freed (or rec_id is 0, or the
+ * record does not belong to row_id).
+ *
+ */
+xtPublic int xt_tab_remove_record(XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *rec_data, xtRecordID *prev_var_id, xtBool clean_delete, xtRowID row_id, xtXactID XT_UNUSED(xn_id))
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	size_t					rec_size;
+	xtWord1					old_rec_type;
+	u_int					cols_req;
+	u_int					cols_in_buffer;
+
+	*prev_var_id = 0;
+
+	if (!rec_id)
+		return FALSE;
+
+	/*
+	 * NOTE: This function uses the read buffer. This should be OK because
+	 * the function is only called by the sweeper. The read buffer
+	 * is REQUIRED because of the call to xt_tab_load_ext_data()!!!
+	 */
+	rec_size = tab->tab_dic.dic_rec_size;
+	if (!xt_tab_get_rec_data(ot, rec_id, rec_size, ot->ot_row_rbuffer))
+		return XT_ERR;
+	/* Remember the on-disk type; it is restored in the buffer before
+	 * the log entry is written at the end of the function.
+	 */
+	old_rec_type = ot->ot_row_rbuffer[0];
+
+	/* Check if the record has not already been freed: */
+	if (XT_REC_IS_FREE(old_rec_type))
+		return FALSE;
+
+	/* This record must belong to the given row: */
+	if (XT_GET_DISK_4(((XTTabRecExtDPtr) ot->ot_row_rbuffer)->tr_row_id_4) != row_id)
+		return FALSE;
+
+	/* The transaction ID of the record must be BEFORE or equal to the given
+	 * transaction ID.
+	 *
+	 * No, this does not always hold. Because we wait for updates now,
+	 * a "younger" transaction can update before an older
+	 * transaction.
+	 * Commit order determined the actual order in which the transactions
+	 * should be replicated. This is determined by the log number of
+	 * the commit record!
+	if (db->db_xn_curr_id(xn_id, XT_GET_DISK_4(((XTTabRecExtDPtr) ot->ot_row_rbuffer)->tr_xact_id_4)))
+		return FALSE;
+	 */
+
+	*prev_var_id = XT_GET_DISK_4(((XTTabRecExtDPtr) ot->ot_row_rbuffer)->tr_prev_rec_id_4);
+
+	if (tab->tab_dic.dic_key_count) {
+		XTIndexPtr	*ind;
+
+		/* The table has indices: work out how many bytes of the record
+		 * go into the log entry (rec_size) and get the row data needed
+		 * to delete the index entries into rec_data. Any failure to
+		 * load the row skips the index deletion (goto set_removed).
+		 */
+		switch (old_rec_type) {
+			case XT_TAB_STATUS_DELETE:
+			case XT_TAB_STATUS_DEL_CLEAN:
+				/* A delete record has no index entries of its own. */
+				rec_size = sizeof(XTTabRecHeadDRec);
+				goto set_removed;
+			case XT_TAB_STATUS_FIXED:
+			case XT_TAB_STATUS_FIX_CLEAN:
+				/* We know that for a fixed length record,
+				 * dic_ind_rec_len <= dic_rec_size! */
+				rec_size = (size_t) tab->tab_dic.dic_ind_rec_len + XT_REC_FIX_HEADER_SIZE;
+				/* The caller's buffer is not used here; the row data is
+				 * taken directly from the read buffer.
+				 */
+				rec_data = ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE;
+				break;
+			case XT_TAB_STATUS_VARIABLE:
+			case XT_TAB_STATUS_VAR_CLEAN:
+				cols_req = tab->tab_dic.dic_ind_cols_req;
+
+				cols_in_buffer = cols_req;
+				rec_size = myxt_load_row_length(ot, rec_size - XT_REC_FIX_HEADER_SIZE, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, &cols_in_buffer);
+				/* Fall back to the full record size if fewer columns
+				 * than required were found.
+				 */
+				if (cols_in_buffer < cols_req)
+					rec_size = tab->tab_dic.dic_rec_size;
+				else 
+					rec_size += XT_REC_FIX_HEADER_SIZE;
+				if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_FIX_HEADER_SIZE, rec_data, cols_req)) {
+					xt_log_and_clear_exception_ns();
+					goto set_removed;
+				}
+				break;
+			case XT_TAB_STATUS_EXT_DLOG:
+			case XT_TAB_STATUS_EXT_CLEAN:
+				cols_req = tab->tab_dic.dic_ind_cols_req;
+
+				ASSERT_NS(cols_req);
+				cols_in_buffer = cols_req;
+				rec_size = myxt_load_row_length(ot, rec_size - XT_REC_EXT_HEADER_SIZE, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, &cols_in_buffer);
+				if (cols_in_buffer < cols_req) {
+					/* Some required columns are not in the buffer:
+					 * load the extended data as well.
+					 */
+					rec_size = tab->tab_dic.dic_rec_size;
+					if (!xt_tab_load_ext_data(ot, rec_id, rec_data, cols_req)) {
+						/* This is actually quite possible after recovery, see [(3)] */
+						if (ot->ot_thread->t_exception.e_xt_err != XT_ERR_BAD_EXT_RECORD &&
+							ot->ot_thread->t_exception.e_xt_err != XT_ERR_DATA_LOG_NOT_FOUND)
+							xt_log_and_clear_exception_ns();
+						goto set_removed;
+					}
+				}
+				else {
+					/* All the records we require are in the buffer... */
+					rec_size += XT_REC_EXT_HEADER_SIZE;
+					if (!myxt_load_row(ot, ot->ot_row_rbuffer + XT_REC_EXT_HEADER_SIZE, rec_data, cols_req)) {
+						xt_log_and_clear_exception_ns();
+						goto set_removed;
+					}
+				}
+				break;
+			default:
+				break;
+		}
+
+		/* Could this be the case?: This change may only be flushed after the
+		 * operation below has been flushed to the log.
+		 *
+		 * No, remove records are never "undone". The sweeper will delete
+		 * the record again if it does not land in the log.
+		 *
+		 * The fact that the index entries have already been removed is not
+		 * a problem.
+		 */
+		if (!tab->tab_dic.dic_disable_index) {
+			ind = tab->tab_dic.dic_keys;
+			/* Failures are logged and cleared; the remaining indices
+			 * are still processed.
+			 */
+			for (u_int i=0; i<tab->tab_dic.dic_key_count; i++, ind++) {
+				if (!xt_idx_delete(ot, *ind, rec_id, rec_data))
+					xt_log_and_clear_exception_ns();
+			}
+		}
+	}
+	else {
+		/* No indices: */
+		/* Only the record header goes into the log entry. For a type
+		 * not listed here rec_size keeps the full record size.
+		 */
+		switch (old_rec_type) {
+			case XT_TAB_STATUS_DELETE:
+			case XT_TAB_STATUS_DEL_CLEAN:
+				rec_size = XT_REC_FIX_HEADER_SIZE;
+				break;
+			case XT_TAB_STATUS_FIXED:
+			case XT_TAB_STATUS_FIX_CLEAN:
+			case XT_TAB_STATUS_VARIABLE:
+			case XT_TAB_STATUS_VAR_CLEAN:
+				rec_size = XT_REC_FIX_HEADER_SIZE;
+				break;
+			case XT_TAB_STATUS_EXT_DLOG:
+			case XT_TAB_STATUS_EXT_CLEAN:
+				rec_size = XT_REC_EXT_HEADER_SIZE;
+				break;
+		}
+	}
+
+	set_removed:
+	if (XT_REC_IS_EXT_DLOG(old_rec_type)) {
+		/* {LOCK-EXT-REC} Lock, and read again to make sure that the
+		 * compactor does not change this record, while
+		 * we are removing it! */
+		xt_lock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+		if (!xt_tab_get_rec_data(ot, rec_id, XT_REC_EXT_HEADER_SIZE, ot->ot_row_rbuffer)) {
+			xt_unlock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+			return XT_ERR;
+		}
+		xt_unlock_mutex_ns(&tab->tab_db->db_co_ext_lock);
+
+	}
+
+	xtOpSeqNo			op_seq;
+	/* The free-record image is built in place, at the start of the
+	 * read buffer.
+	 */
+	XTTabRecFreeDPtr	free_rec = (XTTabRecFreeDPtr) ot->ot_row_rbuffer;
+	xtRecordID			prev_rec_id;
+
+	/* A record is "clean" deleted if the record was
+	 * XT_TAB_STATUS_DELETE which was committed.
+	 * This makes sure that the record will still invalidate
+	 * following records in a row.
+	 *
+	 * Example:
+	 *
+	 * 1. INSERT A ROW, then DELETE it, assume the sweeper is delayed.
+	 *
+	 * We now have the sequence row X --> del rec A --> valid rec B.
+	 *
+	 * 2. A SELECT can still find B. Assume it now goes to check
+	 *    if the record is valid, it reads row X, and gets A.
+	 *
+	 * 3. Now the sweeper gets control and removes X, A and B.
+	 *    It frees A with the clean bit.
+	 *
+	 * 4. Now the SELECT gets control and reads A. Normally a freed record
+	 *    would be ignored, and it would go onto B, which would then
+	 *    be considered valid (note, even after the free, the next
+	 *    pointer is not affected).
+	 *
+	 * However, because the clean bit has been set, it will stop at A
+	 * and consider B invalid (which is the desired result).
+	 *
+	 * NOTE: We assume it is not possible for A to be allocated and refer
+	 * to B, because B is freed before A. This means that B may refer to
+	 * A after the next allocation.
+	 */
+
+	xtWord1 new_rec_type = XT_TAB_STATUS_FREED | (clean_delete ? XT_TAB_STATUS_CLEANED_BIT : 0);
+
+	/* Link the record in at the head of the free-record list. The
+	 * list head and free count are only updated once the write of the
+	 * free-record image has succeeded.
+	 */
+	xt_lock_mutex_ns(&tab->tab_rec_lock);
+	free_rec->rf_rec_type_1 = new_rec_type;
+	prev_rec_id = tab->tab_rec_free_id;
+	XT_SET_DISK_4(free_rec->rf_next_rec_id_4, prev_rec_id);
+	if (!xt_tab_put_rec_data(ot, rec_id, sizeof(XTTabRecFreeDRec), ot->ot_row_rbuffer, &op_seq)) {
+		xt_unlock_mutex_ns(&tab->tab_rec_lock);
+		return XT_ERR;
+	}
+	tab->tab_rec_free_id = rec_id;
+	ASSERT_NS(tab->tab_rec_free_id < tab->tab_rec_eof_id);
+	tab->tab_rec_fnum++;
+	xt_unlock_mutex_ns(&tab->tab_rec_lock);
+
+	/* The log entry carries the buffer with the ORIGINAL record type
+	 * put back; the new type is passed separately as an argument.
+	 */
+	free_rec->rf_rec_type_1 = old_rec_type;
+	if (!xt_xlog_modify_table(tab->tab_id, XT_LOG_ENT_REC_REMOVED_BI, op_seq, (xtRecordID) new_rec_type, rec_id, rec_size, ot->ot_row_rbuffer, ot->ot_thread))
+		return XT_ERR;
+	return OK;
+}
+
+/*
+ * Allocate a row ID for a new row.
+ *
+ * A row is taken from the head of the free-row list if there is one;
+ * otherwise the row file is extended by one row at tab_row_eof_id.
+ * The allocation is then recorded in the transaction log
+ * (XT_LOG_ENT_ROW_NEW_FL when taken from the free list,
+ * XT_LOG_ENT_ROW_NEW when taken from the end of the file).
+ *
+ * Returns the new row ID, or 0 on failure. When the row count limit
+ * is reached XT_ERR_MAX_ROW_COUNT is registered.
+ */
+static xtRowID tab_new_row(XTOpenTablePtr ot, XTTableHPtr tab)
+{
+	xtRowID			row_id;
+	xtOpSeqNo		op_seq;
+	xtRowID			next_row_id = 0;
+	u_int			status;
+
+	xt_lock_mutex_ns(&tab->tab_row_lock);
+	if ((row_id = tab->tab_row_free_id)) {
+		status = XT_LOG_ENT_ROW_NEW_FL;
+
+		/* The slot of a free row holds the ID of the next free row;
+		 * that becomes the new head of the list.
+		 */
+		if (!tab->tab_rows.xt_tc_read_4(ot->ot_row_file, row_id, &next_row_id, ot->ot_thread)) {
+			xt_unlock_mutex_ns(&tab->tab_row_lock);
+			return 0;
+		}
+		tab->tab_row_free_id = next_row_id;
+		ASSERT_NS(tab->tab_row_fnum > 0);
+		tab->tab_row_fnum--;
+	}
+	else {
+		status = XT_LOG_ENT_ROW_NEW;
+		row_id = tab->tab_row_eof_id;
+		/* 0xFFFFFFFF is treated as "no more row IDs available". */
+		if (row_id == 0xFFFFFFFF) {
+			xt_unlock_mutex_ns(&tab->tab_row_lock);
+			xt_register_xterr(XT_REG_CONTEXT, XT_ERR_MAX_ROW_COUNT);
+			return 0;
+		}
+		/* True when row_id is the first row of a cache page. */
+		if (((row_id - 1) % tab->tab_rows.tci_rows_per_page) == 0) {
+			/* By fetching the page now, we avoid reading it later... */
+			XTTabCachePagePtr	page;
+			XTTabCacheSegPtr	seg;
+			size_t				poffset;
+
+			if (!tab->tab_rows.tc_fetch(ot->ot_row_file, row_id, &seg, &page, &poffset, FALSE, ot->ot_thread)) {
+				xt_unlock_mutex_ns(&tab->tab_row_lock);
+				return 0;
+			}
+			/* NOTE(review): tc_fetch() presumably returns with the
+			 * segment lock held, hence the unlock here -- confirm.
+			 */
+			TAB_CAC_UNLOCK(&seg->tcs_lock, ot->ot_thread->t_id);
+		}
+		tab->tab_row_eof_id++;
+	}
+	/* The operation sequence number is taken while still holding
+	 * tab_row_lock.
+	 */
+	op_seq = tab->tab_seq.ts_get_op_seq();
+	xt_unlock_mutex_ns(&tab->tab_row_lock);
+
+	/* next_row_id is 0 unless the row came from the free list. */
+	if (!xt_xlog_modify_table(tab->tab_id, status, op_seq, next_row_id, row_id, 0, NULL, ot->ot_thread))
+		return 0;
+
+	XT_DISABLED_TRACE(("new row tx=%d row=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) row_id));
+	ASSERT_NS(row_id);
+	return row_id;
+}
+
+/*
+ * Read the 4-byte reference stored in the slot of row row_id and
+ * return it in *var_rec_id.
+ *
+ * Returns OK, or FAILED if the read fails.
+ */
+xtPublic xtBool xt_tab_get_row(register XTOpenTablePtr ot, xtRowID row_id, xtRecordID *var_rec_id)
+{
+	/* The read below relies on a row slot being exactly 4 bytes. */
+	(void) ASSERT_NS(sizeof(XTTabRowRefDRec) == 4);
+
+	if (ot->ot_table->tab_rows.xt_tc_read_4(ot->ot_row_file, row_id, var_rec_id, ot->ot_thread))
+		return OK;
+	return FAILED;
+}
+
+/*
+ * Store var_rec_id in the slot of row row_id and write a
+ * transaction log entry of the given status for the change.
+ *
+ * Returns FAILED if the write fails, otherwise the result of
+ * xt_xlog_modify_table().
+ */
+xtPublic xtBool xt_tab_set_row(XTOpenTablePtr ot, u_int status, xtRowID row_id, xtRecordID var_rec_id)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTTabRowRefDRec			ref;
+	xtWord1					*ref_bytes = (xtWord1 *) &ref;
+	xtOpSeqNo				seq_no;
+
+	ASSERT_NS(var_rec_id < tab->tab_rec_eof_id);
+	XT_SET_DISK_4(ref.rr_ref_id_4, var_rec_id);
+
+	if (!tab->tab_rows.xt_tc_write(ot->ot_row_file, row_id, 0, sizeof(XTTabRowRefDRec), ref_bytes, &seq_no, TRUE, ot->ot_thread))
+		return FAILED;
+
+	/* The same bytes that were written to the slot go into the log. */
+	return xt_xlog_modify_table(tab->tab_id, status, seq_no, 0, row_id, sizeof(XTTabRowRefDRec), ref_bytes, ot->ot_thread);
+}
+
+/*
+ * Free record rec_id, unless it is already free.
+ *
+ * The record is overwritten with a "freed" image that links it in at
+ * the head of the table's free-record list, and a transaction log
+ * entry of the given status is written.
+ *
+ * clean_delete  If TRUE, XT_TAB_STATUS_CLEANED_BIT is set in the type
+ *               of the freed record (see the comment in the body).
+ *
+ * Returns OK (also when the record was already free), or FAILED.
+ */
+xtPublic xtBool xt_tab_free_record(XTOpenTablePtr ot, u_int status, xtRecordID rec_id, xtBool clean_delete)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTTabRecHeadDRec		head;
+	XTactFreeRecEntryDRec	entry;
+	xtRecordID				old_free_id;
+	xtOpSeqNo				seq_no;
+
+	if (!xt_tab_get_rec_data(ot, rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &head))
+		return FAILED;
+
+	/* Don't free the record if it is already free! */
+	if (XT_REC_IS_FREE(head.tr_rec_type_1))
+		return OK;
+
+	/* This information will be used to determine if the resources of the record
+	 * should be removed.
+	 */
+	entry.fr_stat_id_1 = head.tr_stat_id_1;
+	XT_COPY_DISK_4(entry.fr_xact_id_4, head.tr_xact_id_4);
+
+	/* A record is "clean" deleted if the record was
+	 * XT_TAB_STATUS_DELETE which was committed.
+	 * This makes sure that the record will still invalidate
+	 * following records in a row.
+	 *
+	 * Example:
+	 *
+	 * 1. INSERT A ROW, then DELETE it, assume the sweeper is delayed.
+	 *
+	 * We now have the sequence row X --> del rec A --> valid rec B.
+	 *
+	 * 2. A SELECT can still find B. Assume it now goes to check
+	 *    if the record is valid, it reads row X, and gets A.
+	 *
+	 * 3. Now the sweeper gets control and removes X, A and B.
+	 *    It frees A with the clean bit.
+	 *
+	 * 4. Now the SELECT gets control and reads A. Normally a freed record
+	 *    would be ignored, and it would go onto B, which would then
+	 *    be considered valid (note, even after the free, the next
+	 *    pointer is not affected).
+	 *
+	 * However, because the clean bit has been set, it will stop at A
+	 * and consider B invalid (which is the desired result).
+	 *
+	 * NOTE: We assume it is not possible for A to be allocated and refer
+	 * to B, because B is freed before A. This means that B may refer to
+	 * A after the next allocation.
+	 */
+
+	/* The tail of the log entry, from fr_rec_type_1 on, doubles as the
+	 * on-disk free-record image, so the two sizes must agree.
+	 */
+	(void) ASSERT_NS(sizeof(XTTabRecFreeDRec) == sizeof(XTactFreeRecEntryDRec) - offsetof(XTactFreeRecEntryDRec, fr_rec_type_1));
+	entry.fr_rec_type_1 = XT_TAB_STATUS_FREED | (clean_delete ? XT_TAB_STATUS_CLEANED_BIT : 0);
+	entry.fr_not_used_1 = 0;
+
+	xt_lock_mutex_ns(&tab->tab_rec_lock);
+	old_free_id = tab->tab_rec_free_id;
+	XT_SET_DISK_4(entry.fr_next_rec_id_4, old_free_id);
+	if (!xt_tab_put_rec_data(ot, rec_id, sizeof(XTTabRecFreeDRec), &entry.fr_rec_type_1, &seq_no)) {
+		xt_unlock_mutex_ns(&tab->tab_rec_lock);
+		return FAILED;
+	}
+	/* The record is on disk as free; now make it the list head. */
+	tab->tab_rec_free_id = rec_id;
+	ASSERT_NS(tab->tab_rec_free_id < tab->tab_rec_eof_id);
+	tab->tab_rec_fnum++;
+	xt_unlock_mutex_ns(&tab->tab_rec_lock);
+
+	/* The log entry starts at fr_stat_id_1, so it also carries the
+	 * statement and transaction ID copied from the old header.
+	 */
+	if (!xt_xlog_modify_table(tab->tab_id, status, seq_no, rec_id, rec_id, sizeof(XTactFreeRecEntryDRec) - offsetof(XTactFreeRecEntryDRec, fr_stat_id_1), &entry.fr_stat_id_1, ot->ot_thread))
+		return FAILED;
+
+	return OK;
+}
+
+/*
+ * Free row_id as part of error cleanup. The thread's exception is
+ * saved before and restored after the call to xt_tab_free_row(), so
+ * the caller still sees the error that triggered the cleanup; the
+ * result of xt_tab_free_row() itself is ignored.
+ */
+static void tab_free_row_on_fail(XTOpenTablePtr ot, XTTableHPtr tab, xtRowID row_id)
+{
+	XTExceptionRec saved;
+
+	tab_save_exception(&saved);
+	xt_tab_free_row(ot, tab, row_id);
+	tab_restore_exception(&saved);
+}
+
+/*
+ * Append the extended (overflow) part of a record to the data log at
+ * log_id/log_offset.
+ *
+ * The entry header is written into the start of rec_info->ri_log_buf
+ * for the duration of the append; the bytes that were there before
+ * are saved first and put back afterwards, whether or not the append
+ * succeeds.
+ *
+ * Returns the result of dlb_append_log().
+ */
+static xtBool tab_write_ext_record(XTOpenTablePtr ot, XTTabRecInfoPtr rec_info, xtTableID tab_id, xtRecordID rec_id, xtLogID log_id, xtLogOffset log_offset, XTThreadPtr thread)
+{
+	xtWord1 saved_head[offsetof(XTactExtRecEntryDRec, er_data)];
+	xtBool	appended;
+
+	/* Save whatever currently occupies the header area. */
+	memcpy(saved_head, rec_info->ri_log_buf, sizeof(saved_head));
+
+	/* Fill in the log entry header in place. */
+	rec_info->ri_log_buf->er_status_1 = XT_LOG_ENT_EXT_REC_OK;
+	XT_SET_DISK_4(rec_info->ri_log_buf->er_data_size_4, rec_info->ri_log_data_size);
+	XT_SET_DISK_4(rec_info->ri_log_buf->er_tab_id_4, tab_id);
+	XT_SET_DISK_4(rec_info->ri_log_buf->er_rec_id_4, rec_id);
+
+	appended = thread->st_dlog_buf.dlb_append_log(log_id, log_offset, offsetof(XTactExtRecEntryDRec, er_data) + rec_info->ri_log_data_size, (xtWord1 *) rec_info->ri_log_buf, thread);
+
+	/* Put the original bytes back. */
+	memcpy(rec_info->ri_log_buf, saved_head, sizeof(saved_head));
+	return appended;
+}
+
+/*
+ * Write a new record to the table and log the operation.
+ *
+ * The record image is taken from rec_info->ri_fix_rec_buf
+ * (ri_rec_buf_size bytes). It is written to the first record on the
+ * free list if there is one, otherwise to a new record at the end of
+ * the record file. If rec_info->ri_ext_rec is set, space for the
+ * overflow data is reserved in the data log first, and the overflow
+ * is appended to the data log after the record has been logged.
+ *
+ * status  One of the log entry types named below; when the record
+ *         is taken from the free list, 2 is added to it to get the
+ *         corresponding "_FL" variant.
+ *
+ * On success rec_info->ri_rec_id is set to the ID of the new record.
+ * Returns OK or FAILED.
+ */
+static xtBool tab_add_record(XTOpenTablePtr ot, XTTabRecInfoPtr rec_info, u_int status)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTThreadPtr				thread = ot->ot_thread;
+	xtRecordID				rec_id;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtOpSeqNo				op_seq;
+	xtRecordID				next_rec_id = 0;
+
+	if (rec_info->ri_ext_rec) {
+		/* Determine where the overflow will go... */
+		if (!thread->st_dlog_buf.dlb_get_log_offset(&log_id, &log_offset, rec_info->ri_log_data_size + offsetof(XTactExtRecEntryDRec, er_data), ot->ot_thread))
+			return FAILED;
+		/* The record image must reference the overflow before it is
+		 * written below.
+		 */
+		XT_SET_LOG_REF(rec_info->ri_ext_rec, log_id, log_offset);
+	}
+
+	/* Write the record to disk: */
+	xt_lock_mutex_ns(&tab->tab_rec_lock);
+	if ((rec_id = tab->tab_rec_free_id)) {
+		XTTabRecFreeDRec free_block;
+
+		ASSERT_NS(rec_id < tab->tab_rec_eof_id);
+		/* A free record holds the ID of the next free record. */
+		if (!xt_tab_get_rec_data(ot, rec_id, sizeof(XTTabRecFreeDRec), (xtWord1 *) &free_block)) {
+			xt_unlock_mutex_ns(&tab->tab_rec_lock);
+			return FAILED;
+		}
+		next_rec_id = XT_GET_DISK_4(free_block.rf_next_rec_id_4);
+		tab->tab_rec_free_id = next_rec_id;
+			
+		tab->tab_rec_fnum--;
+		
+		/* XT_LOG_ENT_UPDATE --> XT_LOG_ENT_UPDATE_FL */
+		/* XT_LOG_ENT_INSERT --> XT_LOG_ENT_INSERT_FL */
+		/* XT_LOG_ENT_DELETE --> XT_LOG_ENT_DELETE_FL */
+		status += 2;
+
+		/* NOTE(review): if this write fails the free list head and
+		 * count updated above are not put back, so the record appears
+		 * to drop off the free list -- confirm whether that is
+		 * intended.
+		 */
+		if (!xt_tab_put_rec_data(ot, rec_id, rec_info->ri_rec_buf_size, (xtWord1 *) rec_info->ri_fix_rec_buf, &op_seq)) {
+			xt_unlock_mutex_ns(&tab->tab_rec_lock);
+			return FAILED;
+		}
+	}
+	else {
+		xtBool read;
+
+		rec_id = tab->tab_rec_eof_id;
+		tab->tab_rec_eof_id++;
+
+		/* If we are writing to a new page (at the EOF)
+		 * then we do not need to read the page from the
+		 * file because it is new.
+		 *
+		 * Note that this only works because we are holding
+		 * a lock on the record file.
+		 */
+		read = ((rec_id - 1) % tab->tab_recs.tci_rows_per_page) != 0;
+
+		if (!tab->tab_recs.xt_tc_write(ot->ot_rec_file, rec_id, 0, rec_info->ri_rec_buf_size, (xtWord1 *) rec_info->ri_fix_rec_buf, &op_seq, read, thread)) {
+			xt_unlock_mutex_ns(&tab->tab_rec_lock);
+			return FAILED;
+		}
+	}
+	xt_unlock_mutex_ns(&tab->tab_rec_lock);
+
+	/* next_rec_id is 0 unless the record came from the free list. */
+	if (!xt_xlog_modify_table(tab->tab_id, status, op_seq, next_rec_id, rec_id,  rec_info->ri_rec_buf_size, (xtWord1 *) rec_info->ri_fix_rec_buf, thread))
+		return FAILED;
+
+	if (rec_info->ri_ext_rec) {
+		/* Write the overflow to the location reserved above. */
+		if (!tab_write_ext_record(ot, rec_info, tab->tab_id, rec_id, log_id, log_offset, thread))
+			return FAILED;
+	}
+
+	XT_DISABLED_TRACE(("new rec tx=%d val=%d\n", (int) thread->st_xact_data->xd_start_xn_id, (int) rec_id));
+	rec_info->ri_rec_id = rec_id;
+	return OK;
+}
+
+/*
+ * Undo a record that has already been attached to its row, after a
+ * later step of the operation failed.
+ *
+ * The index entries added so far (the first key_count indices) are
+ * deleted, and then a delete record is added to the row on top of
+ * rec_id.
+ *
+ * The exception that caused the failure is saved on entry and is
+ * ALWAYS restored before returning, so that the caller reports the
+ * original error and never an error from the undo itself (nor an
+ * empty exception). Errors that occur during the undo are logged and
+ * cleared, except when the original error is a duplicate key error,
+ * in which case they are silently dropped.
+ *
+ * row_ptr is only referenced by the disabled code below.
+ */
+static void tab_delete_record_on_fail(XTOpenTablePtr ot, xtRowID row_id, xtRecordID rec_id, XTTabRecHeadDPtr row_ptr, xtWord1 *rec_data, u_int key_count)
+{
+	XTExceptionRec	e;
+	xtBool			log_err = TRUE;
+	XTTabRecInfoRec	rec_info;
+
+	tab_save_exception(&e);
+
+	if (e.e_xt_err == XT_ERR_DUPLICATE_KEY ||
+		e.e_xt_err == XT_ERR_DUPLICATE_FKEY) {
+		/* If the error does not cause rollback, then we will ignore the
+		 * error if an error occurs in the UNDO!
+		 */
+		log_err = FALSE;
+	}
+	if (key_count) {
+		XTIndexPtr	*ind;
+
+		ind = ot->ot_table->tab_dic.dic_keys;
+		for (u_int i=0; i<key_count; i++, ind++) {
+			if (!xt_idx_delete(ot, *ind, rec_id, rec_data)) {
+				if (log_err)
+					xt_log_and_clear_exception_ns();
+			}
+		}
+	}
+
+	/* This is not required because the extended record will be free
+	 * later when the record is freed!
+	if (row_ptr->tr_rec_type_1 == XT_TAB_STATUS_EXT_DLOG || row_ptr->tr_rec_type_1 == XT_TAB_STATUS_EXT_CLEAN)
+		tab_free_ext_record_on_fail(ot, rec_id, (XTTabRecExtDPtr) row_ptr, log_err);
+	 */
+
+	/* Build a header-only delete record in the write buffer: */
+	rec_info.ri_fix_rec_buf = (XTTabRecFixDPtr) ot->ot_row_wbuffer;
+	rec_info.ri_rec_buf_size = offsetof(XTTabRecFixDRec, rf_data);
+	rec_info.ri_ext_rec = NULL;
+	rec_info.ri_fix_rec_buf->tr_rec_type_1 = XT_TAB_STATUS_DELETE;
+	rec_info.ri_fix_rec_buf->tr_stat_id_1 = 0;
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_row_id_4, row_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_prev_rec_id_4, rec_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_xact_id_4, ot->ot_thread->st_xact_data->xd_start_xn_id);
+
+	/* Add the delete record, then make it the row's first record.
+	 * ri_rec_id is only valid (and only read) if the add succeeded.
+	 */
+	if (!tab_add_record(ot, &rec_info, XT_LOG_ENT_DELETE) ||
+		!xt_tab_set_row(ot, XT_LOG_ENT_ROW_ADD_REC, row_id, rec_info.ri_rec_id)) {
+		if (log_err)
+			xt_log_and_clear_exception_ns();
+	}
+
+	/* Whatever happened above, hand the original error back to the
+	 * caller. Previously it was lost when the undo failed with
+	 * log_err set, and could be overwritten by an index delete error
+	 * when log_err was not set.
+	 */
+	tab_restore_exception(&e);
+}
+
+/*
+ * Wait until all the variations between the start of the chain, and
+ * the given record have been rolled-back.
+ * If any is committed, register a locked error, and return FAILED.
+ *
+ * row_id         The row whose variation chain is walked.
+ * commit_rec_id  The record at which the walk stops successfully.
+ *
+ * NOTE(review): the function releases and re-takes the WRITE lock on
+ * the row's rwlock slot around the wait, so the caller presumably
+ * holds that write lock on entry -- confirm. It is held again on
+ * every return path.
+ *
+ * Returns OK, or FAILED (XT_ERR_RECORD_CHANGED is registered when the
+ * chain contains a clean or committed variation, one of the caller's
+ * own updates, or ends before commit_rec_id is reached).
+ */
+static xtBool tab_wait_for_rollback(XTOpenTablePtr ot, xtRowID row_id, xtRecordID commit_rec_id)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	xtRecordID				var_rec_id;
+	XTTabRecHeadDRec		var_head;
+	xtXactID				xn_id;
+	/* The last record found to be invalid; used to retry only once. */
+	xtRecordID				invalid_rec = 0;
+	XTXactWaitRec			xw;
+
+	retry:
+	/* (Re)start from the head of the row's variation chain. */
+	if (!xt_tab_get_row(ot, row_id, &var_rec_id))
+		return FAILED;
+
+	while (var_rec_id != commit_rec_id) {
+		/* End of chain reached without finding commit_rec_id. */
+		if (!var_rec_id)
+			goto locked;
+		if (!xt_tab_get_rec_data(ot, var_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &var_head))
+			return FAILED;
+		if (XT_REC_IS_CLEAN(var_head.tr_rec_type_1))
+			goto locked;
+		if (XT_REC_IS_FREE(var_head.tr_rec_type_1)) {
+			/* Should not happen: */
+			if (!tab_record_corrupt(ot, row_id, var_rec_id, false, 4))
+				return FAILED;
+			goto record_invalid;
+		}
+		xn_id = XT_GET_DISK_4(var_head.tr_xact_id_4);
+		switch (xt_xn_status(ot, xn_id, var_rec_id)) {
+			case XT_XN_VISIBLE:
+			case XT_XN_NOT_VISIBLE:
+				goto locked;
+			case XT_XN_ABORTED:
+				/* Ignore the record, it will be removed. */
+				break;
+			case XT_XN_MY_UPDATE:
+				/* Should not happen: */
+				goto locked;
+			case XT_XN_OTHER_UPDATE:
+				/* Wait for the transaction to commit or rollback: */
+				/* The row lock is dropped while waiting and taken
+				 * again (as a write lock) whether or not the wait
+				 * succeeds.
+				 */
+				XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+				xw.xw_xn_id = xn_id;
+				if (!xt_xn_wait_for_xact(ot->ot_thread, &xw, NULL)) {
+					XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+					return FAILED;
+				}
+				XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+				/* The chain may have changed while unlocked. */
+				goto retry;
+			case XT_XN_REREAD:
+				if (!tab_record_corrupt(ot, row_id, var_rec_id, true, 5))
+					return FAILED;
+				goto record_invalid;
+		}
+		/* Step to the previous variation in the chain. */
+		var_rec_id = XT_GET_DISK_4(var_head.tr_prev_rec_id_4);
+	}
+	return OK;
+
+	locked:
+	xt_register_xterr(XT_REG_CONTEXT, XT_ERR_RECORD_CHANGED);
+	return FAILED;
+	
+	record_invalid:
+	/* {RETRY-READ} */
+	/* Prevent an infinite loop due to a bad record: */
+	/* The walk is restarted once; if the same record is found to be
+	 * invalid again, give up and report OK.
+	 */
+	if (invalid_rec != var_rec_id) {
+		invalid_rec = var_rec_id;
+		goto retry;
+	}
+	/* The record is invalid, it will be "overwritten"... */
+#ifdef XT_CRASH_DEBUG
+	/* Should not happen! */
+	xt_crash_me();
+#endif
+	return OK;
+}
+
+/* Check if a record may be visible:
+ * Return TRUE if the record may be visible now.
+ * Return XT_MAYBE if the record may be visible in the future (set out_xn_id).
+ * Return FALSE if the record is not valid (freed or is a delete record).
+ * Return XT_ERR if an error occurred.
+ *
+ * ot           The open table.
+ * rec_id       The record to check.
+ * out_xn_id    Out: set, when XT_MAYBE is returned, to the ID of the
+ *              transaction to wait for.
+ * out_rowid    Out, may be NULL: set to the record's row ID when TRUE
+ *              is returned.
+ * out_updated  Out: only written when out_rowid is not NULL and TRUE
+ *              is returned; TRUE if the record was written by the
+ *              calling thread's current transaction.
+ *
+ * FALSE is also returned when the record's transaction was aborted,
+ * or when the record is not the first variation on its row's chain
+ * that could become visible.
+ *
+ * The read lock on the row's rwlock slot is taken only for the walk
+ * of the variation chain and is released on every path that leaves
+ * the function after that point.
+ */
+xtPublic int xt_tab_maybe_committed(XTOpenTablePtr ot, xtRecordID rec_id, xtXactID *out_xn_id, xtRowID *out_rowid, xtBool *out_updated)
+{
+	XTTabRecHeadDRec		rec_head;
+	xtXactID				rec_xn_id = 0;
+	xtBool					wait = FALSE;
+	xtXactID				wait_xn_id = 0;
+	xtRowID					row_id;
+	xtRecordID				var_rec_id;
+	xtXactID				xn_id;
+	register XTTableHPtr	tab;
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+	char					t_buf[500];
+	int						len;
+	char					*t_type = "C";
+#endif
+	/* The last record found to be invalid; used to retry only once. */
+	xtRecordID				invalid_rec = 0;
+
+	reread:
+	if (!xt_tab_get_rec_data(ot, rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &rec_head))
+		return XT_ERR;
+
+	if (XT_REC_NOT_VALID(rec_head.tr_rec_type_1))
+		return FALSE;
+
+	if (!XT_REC_IS_CLEAN(rec_head.tr_rec_type_1)) {
+		rec_xn_id = XT_GET_DISK_4(rec_head.tr_xact_id_4);
+		switch (xt_xn_status(ot, rec_xn_id, rec_id)) {
+			case XT_XN_VISIBLE:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				t_type="V";
+#endif
+				break;
+			case XT_XN_NOT_VISIBLE:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				t_type="NV";
+#endif
+				break;
+			case XT_XN_ABORTED:
+				return FALSE;
+			case XT_XN_MY_UPDATE:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				t_type="My-Upd";
+#endif
+				break;
+			case XT_XN_OTHER_UPDATE:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				t_type="Wait";
+#endif
+				wait = TRUE;
+				wait_xn_id = rec_xn_id;
+				break;
+			case XT_XN_REREAD:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				t_type="Re-read";
+#endif
+				/* {RETRY-READ} */
+				/* Avoid infinite loop: */
+				if (invalid_rec == rec_id) {
+					/* Should not happen! */
+					/* The row lock has not been taken yet, and neither
+					 * `tab` nor `row_id` has been set, so we must NOT
+					 * go through the `failed` label (which unlocks the
+					 * row); return the error directly.
+					 */
+					if (!tab_record_corrupt(ot, XT_GET_DISK_4(rec_head.tr_row_id_4), rec_id, true, 6))
+						return XT_ERR;
+#ifdef XT_CRASH_DEBUG
+					/* Generate a core dump! */
+					xt_crash_me();
+#endif
+					return FALSE;
+				}
+				invalid_rec = rec_id;
+				goto reread;
+		}
+	}
+
+	/* Follow the variation chain until we come to this record.
+	 * If it is not the first visible variation then
+	 * it is not visible at all. If it is not found on the
+	 * variation chain, it is also not visible.
+	 */
+	row_id = XT_GET_DISK_4(rec_head.tr_row_id_4);
+
+	tab = ot->ot_table;
+	/* From here on every exit must release this lock (see the
+	 * not_found and failed labels).
+	 */
+	XT_TAB_ROW_READ_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+
+	invalid_rec = 0;
+	retry:
+	if (!(xt_tab_get_row(ot, row_id, &var_rec_id)))
+		goto failed;
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+	len = sprintf(t_buf, "dup row=%d", (int) row_id);
+#endif
+	while (var_rec_id != rec_id) {
+		/* End of chain: the record is not on it. */
+		if (!var_rec_id)
+			goto not_found;
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+		if (len <= 450)
+			len += sprintf(t_buf+len, " -> %d", (int) var_rec_id);
+#endif
+		/* Note: rec_head is reused for the records on the chain. */
+		if (!xt_tab_get_rec_data(ot, var_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &rec_head))
+			goto failed;
+		/* All clean records are visible, by all transactions: */
+		if (XT_REC_IS_CLEAN(rec_head.tr_rec_type_1))
+			goto not_found;
+
+		if (XT_REC_IS_FREE(rec_head.tr_rec_type_1)) {
+			/* Should not happen: */
+			if (invalid_rec != var_rec_id) {
+				invalid_rec = var_rec_id;
+				goto retry;
+			}
+			/* Assume end of list. */
+#ifdef XT_CRASH_DEBUG
+			/* Should not happen! */
+			xt_crash_me();
+#endif
+			goto not_found;
+		}
+
+		xn_id = XT_GET_DISK_4(rec_head.tr_xact_id_4);
+		switch (xt_xn_status(ot, xn_id, var_rec_id)) {
+			case XT_XN_VISIBLE:
+			case XT_XN_NOT_VISIBLE:
+				goto not_found;
+			case XT_XN_ABORTED:
+				/* Ignore the record, it will be removed. */
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				if (len <= 450)
+					len += sprintf(t_buf+len, "(T%d-A)", (int) xn_id);
+#endif
+				break;
+			case XT_XN_MY_UPDATE:
+				goto not_found;
+			case XT_XN_OTHER_UPDATE:
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+				if (len <= 450)
+					len += sprintf(t_buf+len, "(T%d-wait)", (int) xn_id);
+#endif
+				/* Wait for this update to commit or abort: */
+				/* Only the first transaction to wait for is kept. */
+				if (!wait) {
+					wait = TRUE;
+					wait_xn_id = xn_id;
+				}
+				break;
+			case XT_XN_REREAD:
+				/* {RETRY-READ} */
+				if (invalid_rec != var_rec_id) {
+					invalid_rec = var_rec_id;
+					goto retry;
+				}
+				/* Assume end of list. */
+				if (!tab_record_corrupt(ot, row_id, invalid_rec, true, 7))
+					goto failed;
+#ifdef XT_CRASH_DEBUG
+				/* Should not happen! */
+				xt_crash_me();
+#endif
+				goto not_found;
+		}
+		var_rec_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+	}
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+	if (len <= 450)
+		sprintf(t_buf+len, " -> %d(T%d-%s)\n", (int) var_rec_id, (int) rec_xn_id, t_type);
+	else
+		sprintf(t_buf+len, " ...(T%d-%s)\n", (int) rec_xn_id, t_type);
+#endif
+
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+	if (wait) {
+		*out_xn_id = wait_xn_id;
+		return XT_MAYBE;
+	}
+#ifdef TRACE_VARIATIONS_IN_DUP_CHECK
+	/* No `thread` variable exists in this function. */
+	xt_ttracef(ot->ot_thread, "%s", t_buf);
+#endif
+	if (out_rowid) {
+		*out_rowid = row_id;
+		*out_updated = (rec_xn_id == ot->ot_thread->st_xact_data->xd_start_xn_id);
+	}
+	return TRUE;
+
+	not_found:
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+	return FALSE;
+
+	failed:
+	/* Only reachable with the row read lock held. */
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+	return XT_ERR;
+}
+
+/*
+ * Insert a new row built from the row buffer rec_buf.
+ *
+ * Steps, in order: convert rec_buf to a record image, allocate a row
+ * ID, write the record, point the row at the record, add the index
+ * entries and finally run the foreign key checks (if the table has
+ * foreign keys).
+ *
+ * Error handling depends on how far the insert got:
+ *  - failure before the row points at the record: the row ID is
+ *    freed again (failed_1);
+ *  - failure after that: the record can no longer be removed, so a
+ *    delete record is added and the index entries inserted so far
+ *    are removed (failed_2). ot->ot_err_index_no is set to the index
+ *    whose insert failed.
+ *
+ * Returns OK or FAILED. On success the thread's row insert
+ * statistic is incremented.
+ */
+xtPublic xtBool xt_tab_new_record(XTOpenTablePtr ot, xtWord1 *rec_buf)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	register XTThreadPtr	self = ot->ot_thread;
+	XTTabRecInfoRec			rec_info;
+	xtRowID					row_id;
+	/* Number of indices successfully inserted into so far. */
+	u_int					idx_cnt = 0;
+	XTIndexPtr				*ind;
+
+	if (!myxt_store_row(ot, &rec_info, (char *) rec_buf))
+		goto failed_0;
+
+	/* Get a new row ID: */
+	if (!(row_id = tab_new_row(ot, tab)))
+		goto failed_0;
+
+	/* Fill in the record header; a new row has no previous record. */
+	rec_info.ri_fix_rec_buf->tr_stat_id_1 = ot->ot_update_id;
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_row_id_4, row_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_prev_rec_id_4, 0);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_xact_id_4, self->st_xact_data->xd_start_xn_id);
+
+	/* Note, it is important that this record is written BEFORE the row
+	 * due to the problem described under [(5)]
+	 */
+	if (!tab_add_record(ot, &rec_info, XT_LOG_ENT_INSERT))
+		goto failed_1;
+
+#ifdef TRACE_VARIATIONS
+	xt_ttracef(self, "insert: row=%d rec=%d T%d\n", (int) row_id, (int) rec_info.ri_rec_id, (int) self->st_xact_data->xd_start_xn_id);
+#endif
+	if (!xt_tab_set_row(ot, XT_LOG_ENT_ROW_ADD_REC, row_id, rec_info.ri_rec_id))
+		goto failed_1;
+	XT_DISABLED_TRACE(("set new tx=%d row=%d rec=%d\n", (int) self->st_xact_data->xd_start_xn_id, (int) row_id, (int) rec_info.ri_rec_id));
+
+	/* Add the index references: */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_insert(ot, *ind, 0, rec_info.ri_rec_id, rec_buf, NULL, FALSE)) {
+			ot->ot_err_index_no = (*ind)->mi_index_no;
+			goto failed_2;
+		}
+	}
+
+	/* Do the foreign key stuff: */
+	if (ot->ot_table->tab_dic.dic_table->dt_fkeys.size() > 0) {
+		if (!ot->ot_table->tab_dic.dic_table->insertRow(ot, rec_buf))
+			goto failed_2;
+	}
+
+	self->st_statistics.st_row_insert++;
+	return OK;	
+
+	failed_2:
+	/* Once the row has been inserted, it is too late to remove it!
+	 * Now all we can do is delete it!
+	 */
+	tab_delete_record_on_fail(ot, row_id, rec_info.ri_rec_id, (XTTabRecHeadDPtr) rec_info.ri_fix_rec_buf, rec_buf, idx_cnt);
+	goto failed_0;
+
+	failed_1:
+	/* The row does not reference the record yet: give the row ID back. */
+	tab_free_row_on_fail(ot, tab, row_id);
+
+	failed_0:
+	return FAILED;
+}
+
+/* We cannot remove a change we have made to a row while a transaction
+ * is running, so we have to undo what we have done by
+ * overwriting the record we just created with
+ * the before image!
+ *
+ * ot         - the open table; ot->ot_thread is used for logging and
+ *              error registration.
+ * rec_info   - describes the record that was written. It is re-used
+ *              (re-filled by myxt_store_row()) to build the restored record.
+ * before_buf - the row as it was before the change. May be NULL, in which
+ *              case nothing can be restored and FAILED is returned.
+ * after_buf  - the row that was written. May be NULL, in which case no
+ *              index entries are removed.
+ * idx_cnt    - number of indexes, counted from the first key, whose entry
+ *              for the new record must be removed.
+ *
+ * Returns OK if the before image and all its index entries were put back,
+ * FAILED otherwise.
+ */
+static xtBool tab_overwrite_record_on_fail(XTOpenTablePtr ot, XTTabRecInfoPtr rec_info, xtWord1 *before_buf, xtWord1 *after_buf, u_int idx_cnt)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	XTTabRecHeadDRec		prev_rec_head;
+	u_int					i;
+	XTIndexPtr				*ind;
+	XTThreadPtr				thread = ot->ot_thread;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtRecordID				rec_id = rec_info->ri_rec_id;
+
+	/* Remove the new extended record: */
+	if (rec_info->ri_ext_rec)
+		tab_free_ext_record_on_fail(ot, rec_id, (XTTabRecExtDPtr) rec_info->ri_fix_rec_buf, TRUE);
+
+	/* Undo index entries of the new record: */
+	if (after_buf) {
+		for (i=0, ind=tab->tab_dic.dic_keys; i<idx_cnt; i++, ind++) {
+			if (!xt_idx_delete(ot, *ind, rec_id, after_buf))
+				return FAILED;
+		}
+	}
+
+	/* Save the record header now: myxt_store_row() below rebuilds the
+	 * buffer, and the saved header is copied back afterwards.
+	 */
+	memcpy(&prev_rec_head, rec_info->ri_fix_rec_buf, sizeof(XTTabRecHeadDRec));
+
+	if (!before_buf) {
+		/* Can happen if the delete was called from some cascaded action.
+		 * And this is better than a crash...
+		 *
+		 * TODO: to make sure the change will not be applied in case the
+		 * transaction will be committed, we'd need to add a log entry to
+		 * restore the record like it's done for top-level operation. In
+		 * order to do this we'd need to read the before-image of the
+		 * record before modifying it.
+		 */
+		/* Only register the error if the thread has none pending: */
+		if (!thread->t_exception.e_xt_err)
+			xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_BEFORE_IMAGE);
+		return FAILED;
+	}
+
+	/* Restore the previous record! */
+	if (!myxt_store_row(ot, rec_info, (char *) before_buf))
+		return FAILED;
+
+	memcpy(rec_info->ri_fix_rec_buf, &prev_rec_head, sizeof(XTTabRecHeadDRec));
+
+	/* From here on ri_ext_rec refers to the restored (before) image: */
+	if (rec_info->ri_ext_rec) {
+		/* Determine where the overflow will go... */
+		if (!thread->st_dlog_buf.dlb_get_log_offset(&log_id, &log_offset, rec_info->ri_log_data_size + offsetof(XTactExtRecEntryDRec, er_data), thread))
+			return FAILED;
+		XT_SET_LOG_REF(rec_info->ri_ext_rec, log_id, log_offset);
+	}
+
+	/* Overwrite the record in place with the restored image: */
+	if (!xt_tab_put_log_op_rec_data(ot, XT_LOG_ENT_REC_MODIFIED, 0, rec_id, rec_info->ri_rec_buf_size, (xtWord1 *) rec_info->ri_fix_rec_buf))
+		return FAILED;
+
+	if (rec_info->ri_ext_rec) {
+		/* Write the log buffer overflow: */
+		if (!tab_write_ext_record(ot, rec_info, tab->tab_id, rec_id, log_id, log_offset, thread))
+			return FAILED;
+	}
+
+	/* Put the index entries back (the idx_cnt parameter is re-used as the
+	 * loop counter here, and all indexes of the table are visited):
+	 */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_insert(ot, *ind, 0, rec_id, before_buf, after_buf, TRUE))
+			/* Incomplete restore, there will be a rollback... */
+			return FAILED;
+	}
+
+	return OK;
+}
+
+/*
+ * GOTCHA:
+ * If a transaction updates the same record over again, we should update
+ * in place. This prevents producing unnecessary variations!
+ *
+ * Overwrites the current record (ot->ot_curr_rec_id) of the current row
+ * (ot->ot_curr_row_id) with the image built from after_buf.
+ *
+ * ot         - the open table, positioned on the record to overwrite.
+ * before_buf - the row as it currently is; used to remove the existing
+ *              index entries and to restore the record on failure.
+ * after_buf  - the new row.
+ *
+ * Returns OK on success, FAILED otherwise. On failure after the record has
+ * been overwritten, an attempt is made to write the before image back.
+ */
+static xtBool tab_overwrite_record(XTOpenTablePtr ot, xtWord1 *before_buf, xtWord1 *after_buf)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	xtRowID					row_id = ot->ot_curr_row_id;
+	register XTThreadPtr	self = ot->ot_thread;
+	xtRecordID				rec_id = ot->ot_curr_rec_id;
+	XTTabRecExtDRec			prev_rec_head;
+	XTTabRecInfoRec			rec_info;
+	u_int					idx_cnt = 0, i;
+	XTIndexPtr				*ind;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtBool					prev_ext_rec;
+
+	/* Build the new record image from the after image: */
+	if (!myxt_store_row(ot, &rec_info, (char *) after_buf))
+		goto failed_0;
+
+	/* Read before we overwrite! */
+	if (!xt_tab_get_rec_data(ot, rec_id, XT_REC_EXT_HEADER_SIZE, (xtWord1 *) &prev_rec_head))
+		goto failed_0;
+
+	/* Remember whether the record being replaced has an overflow area: */
+	prev_ext_rec = prev_rec_head.tr_rec_type_1 & XT_TAB_STATUS_EXT_DLOG;
+
+	if (rec_info.ri_ext_rec) {
+		/* Determine where the overflow will go... */
+		if (!self->st_dlog_buf.dlb_get_log_offset(&log_id, &log_offset, offsetof(XTactExtRecEntryDRec, er_data) + rec_info.ri_log_data_size, self))
+			goto failed_0;
+		XT_SET_LOG_REF(rec_info.ri_ext_rec, log_id, log_offset);
+	}
+
+	/* Fill in the record header. The previous-record link is copied from
+	 * the record being overwritten.
+	 */
+	rec_info.ri_fix_rec_buf->tr_stat_id_1 = ot->ot_update_id;
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_row_id_4, row_id);
+	XT_COPY_DISK_4(rec_info.ri_fix_rec_buf->tr_prev_rec_id_4, prev_rec_head.tr_prev_rec_id_4);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_xact_id_4, self->st_xact_data->xd_start_xn_id);
+
+	/* Remove the index references, that have changed: */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_delete(ot, *ind, rec_id, before_buf)) {
+			goto failed_0;
+		}
+	}
+
+#ifdef TRACE_VARIATIONS
+	xt_ttracef(self, "overwrite: row=%d rec=%d T%d\n", (int) row_id, (int) rec_id, (int) self->st_xact_data->xd_start_xn_id);
+#endif
+	/* Overwrite the record: */
+	if (!xt_tab_put_log_op_rec_data(ot, XT_LOG_ENT_REC_MODIFIED, 0, rec_id, rec_info.ri_rec_buf_size, (xtWord1 *) rec_info.ri_fix_rec_buf))
+		goto failed_0;
+
+	if (rec_info.ri_ext_rec) {
+		/* Write the log buffer overflow: */
+		if (!tab_write_ext_record(ot, &rec_info, tab->tab_id, rec_id, log_id, log_offset, self))
+			goto failed_1;
+	}
+
+	/* Add the index references that have changed: */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_insert(ot, *ind, 0, rec_id, after_buf, before_buf, FALSE)) {
+			/* Record which index failed; idx_cnt tells failed_2 how
+			 * many entries have to be removed again.
+			 */
+			ot->ot_err_index_no = (*ind)->mi_index_no;
+			goto failed_2;
+		}
+	}
+
+	/* Do the foreign key stuff: */
+	if (ot->ot_table->tab_dic.dic_table->dt_trefs || ot->ot_table->tab_dic.dic_table->dt_fkeys.size() > 0) {
+		if (!ot->ot_table->tab_dic.dic_table->updateRow(ot, before_buf, after_buf))
+			goto failed_2;
+	}
+	
+	/* Delete the previous overflow area: */
+	if (prev_ext_rec)
+		tab_free_ext_record_on_fail(ot, rec_id, &prev_rec_head, TRUE);
+
+	return OK;
+
+	failed_2:
+	/* The record has been overwritten and idx_cnt index entries exist
+	 * for the new image: try to put the before image back.
+	 */
+	/* Remove the new extended record: */
+	if (rec_info.ri_ext_rec)
+		tab_free_ext_record_on_fail(ot, rec_id, (XTTabRecExtDPtr) rec_info.ri_fix_rec_buf, TRUE);
+
+	/* Restore the previous record! */
+	/* Undo index entries: */
+	for (i=0, ind=tab->tab_dic.dic_keys; i<idx_cnt; i++, ind++) {
+		if (!xt_idx_delete(ot, *ind, rec_id, after_buf))
+			goto failed_1;
+	}
+
+	/* Restore the record: */
+	if (!myxt_store_row(ot, &rec_info, (char *) before_buf))
+		goto failed_1;
+
+	/* Copy back the header that was read before the overwrite; for an
+	 * extended record the larger extended header is copied.
+	 */
+	if (rec_info.ri_ext_rec)
+		memcpy(rec_info.ri_fix_rec_buf, &prev_rec_head, XT_REC_EXT_HEADER_SIZE);
+	else
+		memcpy(rec_info.ri_fix_rec_buf, &prev_rec_head, sizeof(XTTabRecHeadDRec));
+
+	if (!xt_tab_put_log_op_rec_data(ot, XT_LOG_ENT_REC_MODIFIED, 0, rec_id, rec_info.ri_rec_buf_size, (xtWord1 *) rec_info.ri_fix_rec_buf))
+		goto failed_1;
+
+	/* Put the index entries back: */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_insert(ot, *ind, 0, rec_id, before_buf, after_buf, TRUE))
+			/* Incomplete restore, there will be a rollback... */
+			goto failed_0;
+	}
+
+	/* The previous record has now been restored. */
+	goto failed_0;
+
+	failed_1:
+	/* The old record is overwritten, I must free the previous extended record: */
+	if (prev_ext_rec)
+		tab_free_ext_record_on_fail(ot, rec_id, &prev_rec_head, TRUE);
+
+	failed_0:
+	return FAILED;
+}
+
+/* Update the current row of an open table.
+ *
+ * ot         - the open table, positioned on the record to update
+ *              (ot->ot_curr_row_id / ot->ot_curr_rec_id).
+ * before_buf - the row before the update; may be NULL (see the comment
+ *              in the function body).
+ * after_buf  - the row after the update.
+ *
+ * If the record was already updated by this transaction and a before image
+ * is available, the record is overwritten in place. Otherwise a new record
+ * (variation) is created and made the first record of the row.
+ *
+ * Returns OK on success, FAILED otherwise.
+ */
+xtPublic xtBool xt_tab_update_record(XTOpenTablePtr ot, xtWord1 *before_buf, xtWord1 *after_buf)
+{
+	register XTTableHPtr	tab;
+	xtRowID					row_id;
+	register XTThreadPtr	self;
+	xtRecordID				curr_var_rec_id;
+	XTTabRecInfoRec			rec_info;
+	u_int					idx_cnt = 0;
+	XTIndexPtr				*ind;
+
+	/*
+	 * Originally only the flag ot->ot_curr_updated was checked, and if it was on, then
+	 * tab_overwrite_record() was called, but this caused crashes in some cases like:
+	 *
+	 * set @@autocommit = 0;
+	 * create table t1 (s1 int primary key);
+	 * create table t2 (s1 int primary key, foreign key (s1) references t1 (s1) on update cascade);
+	 * insert into t1 values (1);
+	 * insert into t2 values (1);
+	 * update t1 set s1 = 1;
+	 *
+	 * the last update led to a crash on t2 cascade update because before_buf argument is NULL
+	 * in the call below. It is NULL only during cascade update of child table. In that case we
+	 * cannot pass before_buf value from XTDDTableRef::modifyRow as the before_buf is the original
+	 * row for the parent (t1) table and it would be used to update any existing indexes
+	 * in the child table which would be wrong of course.
+	 *
+	 * Alternative solution would be to copy the after_info in the XTDDTableRef::modifyRow():
+	 *
+	 * ...
+	 * if (!xt_tab_load_record(ot, ot->ot_curr_rec_id, &after_info))
+	 *     goto failed_2;
+	 * ...
+	 *
+	 * here the xt_tab_load_record() loads the original row, so we can copy it from there, but in
+	 * that case we'd need to allocate a new (possibly up to 65536 bytes long) buffer, which makes
+	 * the optimization questionable
+	 *
+	 */
+	if (ot->ot_curr_updated && before_buf)
+		/* This record has already been updated by this transaction.
+		 * Do the update in place!
+		 */
+		return tab_overwrite_record(ot, before_buf, after_buf);
+
+	tab = ot->ot_table;
+	row_id = ot->ot_curr_row_id;
+	self = ot->ot_thread;
+
+	/* Build the new record image from the after image: */
+	if (!myxt_store_row(ot, &rec_info, (char *) after_buf))
+		goto failed_0;
+
+	/* Fill in the header; the new record points back at the record
+	 * that is being updated.
+	 */
+	rec_info.ri_fix_rec_buf->tr_stat_id_1 = ot->ot_update_id;
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_row_id_4, row_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_prev_rec_id_4, ot->ot_curr_rec_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_xact_id_4, self->st_xact_data->xd_start_xn_id);
+
+	/* Create the new record: */
+	if (!tab_add_record(ot, &rec_info, XT_LOG_ENT_UPDATE))
+		goto failed_0;
+
+	/* Link the new variation into the list: */
+	XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+
+	if (!xt_tab_get_row(ot, row_id, &curr_var_rec_id))
+		goto failed_1;
+
+	/* The row no longer starts with the record we are updating: */
+	if (curr_var_rec_id != ot->ot_curr_rec_id) {
+		/* If the transaction does not rollback, I will get an
+		 * exception here:
+		 */
+		if (!tab_wait_for_rollback(ot, row_id, ot->ot_curr_rec_id))
+			goto failed_1;
+		/* [(4)] This is the situation when we overwrite the
+		 * reference to curr_var_rec_id!
+		 * When curr_var_rec_id is cleaned up by the sweeper, the
+		 * sweeper will notice that the record is no longer in
+		 * the row list.
+		 */
+	}
+
+#ifdef TRACE_VARIATIONS
+	xt_ttracef(self, "update: row=%d rec=%d T%d\n", (int) row_id, (int) rec_info.ri_rec_id, (int) self->st_xact_data->xd_start_xn_id);
+#endif
+	if (!xt_tab_set_row(ot, XT_LOG_ENT_ROW_ADD_REC, row_id, rec_info.ri_rec_id))
+		goto failed_1;
+	XT_DISABLED_TRACE(("set upd tx=%d row=%d rec=%d\n", (int) self->st_xact_data->xd_start_xn_id, (int) row_id, (int) rec_info.ri_rec_id));
+
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+
+	/* Add the index references: */
+	for (idx_cnt=0, ind=tab->tab_dic.dic_keys; idx_cnt<tab->tab_dic.dic_key_count; idx_cnt++, ind++) {
+		if (!xt_idx_insert(ot, *ind, 0, rec_info.ri_rec_id, after_buf, before_buf, FALSE)) {
+			/* Record which index failed; idx_cnt tells the undo
+			 * code how many entries were added.
+			 */
+			ot->ot_err_index_no = (*ind)->mi_index_no;
+			goto failed_2;
+		}
+	}
+
+	/* Do the foreign key stuff: */
+	if (ot->ot_table->tab_dic.dic_table->dt_trefs || ot->ot_table->tab_dic.dic_table->dt_fkeys.size() > 0) {
+		if (!ot->ot_table->tab_dic.dic_table->updateRow(ot, before_buf, after_buf))
+			goto failed_2;
+	}
+
+	self->st_statistics.st_row_update++;
+	return OK;
+
+	failed_2:
+	/* The new record is already linked into the row (the row lock has
+	 * been released): undo by overwriting it with the before image.
+	 * The result of the undo is not checked.
+	 */
+	tab_overwrite_record_on_fail(ot, &rec_info, before_buf, after_buf, idx_cnt);
+	goto failed_0;
+
+	failed_1:
+	/* Failure while the row lock is still held: */
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+
+	failed_0:
+	return FAILED;
+}
+
+/* Delete the current row of an open table by adding a "delete" record
+ * and making it the first record of the row.
+ *
+ * ot      - the open table, positioned on the record to delete
+ *           (ot->ot_curr_row_id / ot->ot_curr_rec_id).
+ * rec_buf - the row being deleted; passed to the foreign key check and
+ *           used as the before image if the delete has to be undone.
+ *
+ * Returns OK on success, FAILED otherwise.
+ */
+xtPublic xtBool xt_tab_delete_record(XTOpenTablePtr ot, xtWord1 *rec_buf)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	xtRowID					row_id = ot->ot_curr_row_id;
+	xtRecordID				curr_var_rec_id;
+	XTTabRecInfoRec			rec_info;
+
+	/* Setup a delete record (header only, no row data): */
+	rec_info.ri_fix_rec_buf = (XTTabRecFixDPtr) ot->ot_row_wbuffer;
+	rec_info.ri_rec_buf_size = offsetof(XTTabRecFixDRec, rf_data);
+	rec_info.ri_ext_rec = NULL;
+	rec_info.ri_fix_rec_buf->tr_rec_type_1 = XT_TAB_STATUS_DELETE;
+	rec_info.ri_fix_rec_buf->tr_stat_id_1 = 0;
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_row_id_4, row_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_prev_rec_id_4, ot->ot_curr_rec_id);
+	XT_SET_DISK_4(rec_info.ri_fix_rec_buf->tr_xact_id_4, ot->ot_thread->st_xact_data->xd_start_xn_id);
+
+	if (!tab_add_record(ot, &rec_info, XT_LOG_ENT_DELETE))
+		return FAILED;
+
+	/* Link the delete record into the row: */
+	XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+
+	if (!xt_tab_get_row(ot, row_id, &curr_var_rec_id))
+		goto failed_1;
+
+	/* The row no longer starts with the record we are deleting: */
+	if (curr_var_rec_id != ot->ot_curr_rec_id) {
+		if (!tab_wait_for_rollback(ot, row_id, ot->ot_curr_rec_id))
+			goto failed_1;
+	}
+
+#ifdef TRACE_VARIATIONS
+	/* Labelled "delete" so that traces can be told apart from
+	 * those written by xt_tab_update_record().
+	 */
+	xt_ttracef(ot->ot_thread, "delete: row=%d rec=%d T%d\n", (int) row_id, (int) rec_info.ri_rec_id, (int) ot->ot_thread->st_xact_data->xd_start_xn_id);
+#endif
+	if (!xt_tab_set_row(ot, XT_LOG_ENT_ROW_ADD_REC, row_id, rec_info.ri_rec_id))
+		goto failed_1;
+	XT_DISABLED_TRACE(("del row tx=%d row=%d rec=%d\n", (int) ot->ot_thread->st_xact_data->xd_start_xn_id, (int) row_id, (int) rec_info.ri_rec_id));
+
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+
+	/* Do the foreign key stuff (only tables that are referenced
+	 * by others need a check on delete):
+	 */
+	if (ot->ot_table->tab_dic.dic_table->dt_trefs) {
+		if (!ot->ot_table->tab_dic.dic_table->deleteRow(ot, rec_buf))
+			goto failed_2;
+	}
+
+	ot->ot_thread->st_statistics.st_row_delete++;
+	return OK;
+
+	failed_2:
+	/* The delete record is already linked into the row: undo by
+	 * overwriting it with the before image. No index entries were
+	 * added, so after_buf is NULL and idx_cnt is 0. The result of
+	 * the undo is not checked.
+	 */
+	tab_overwrite_record_on_fail(ot, &rec_info, rec_buf, NULL, 0);
+	return FAILED;
+
+	failed_1:
+	/* Failure while the row lock is still held: */
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], ot->ot_thread);
+	return FAILED;
+}
+
+/* Run the "no action" foreign key check for every item in the list.
+ *
+ * list   - list of XTRestrictItem entries (table ID and record ID). The
+ *          list is freed before returning, in all cases.
+ * thread - the calling thread; its database is used to open the tables.
+ *
+ * Returns TRUE if no check failed, FALSE as soon as one check fails.
+ * Items whose table cannot be opened, or no longer exists, are skipped.
+ */
+xtPublic xtBool xt_tab_restrict_rows(XTBasicListPtr list, XTThreadPtr thread)
+{
+	u_int				i;
+	XTRestrictItemPtr	item;
+	XTOpenTablePtr		pot = NULL;
+	XTDatabaseHPtr		db = thread->st_database;
+	xtBool				ok = TRUE;
+
+	for (i=0; i<list->bl_count; i++) {
+		item = (XTRestrictItemPtr) xt_bl_item_at(list, i);
+		/* Without this guard a NULL item would be dereferenced below
+		 * (the previous unbraced "if (item)" only covered the
+		 * "if (pot)" block).
+		 */
+		if (!item)
+			continue;
+
+		if (pot) {
+			/* Re-use the open table if the item belongs to it: */
+			if (pot->ot_table->tab_id == item->ri_tab_id)
+				goto check_action;
+			xt_db_return_table_to_pool_ns(pot);
+			pot = NULL;
+		}
+
+		if (!xt_db_open_pool_table_ns(&pot, db, item->ri_tab_id)) {
+			/* Should not happen, but just in case, we just don't
+			 * remove the lock. We will probably end up with a deadlock
+			 * somewhere.
+			 */
+			xt_log_and_clear_exception_ns();
+			goto skip_check_action;
+		}
+		if (!pot)
+			/* Can happen if the table has been dropped: */
+			goto skip_check_action;
+
+		check_action:
+		if (!pot->ot_table->tab_dic.dic_table->checkNoAction(pot, item->ri_rec_id)) {
+			ok = FALSE;
+			break;
+		}
+		skip_check_action:;
+	}
+
+	if (pot)
+		xt_db_return_table_to_pool_ns(pot);
+	xt_bl_free(NULL, list);
+	return ok;
+}
+
+
+/* Prepare an open table for a sequential scan.
+ *
+ * The scan and current-record state of the open table is cleared, and the
+ * table's record EOF is noted in ot_seq_eof_id.
+ *
+ * Historical notes kept from the original comment:
+ *  - Noting the EOF before the scan starts reduces the chance of seeing a
+ *    record again that was created by an update during the scan.
+ *  - NOTE (2008-01-29) There is no longer a problem with updating a
+ *    record twice because records are marked by an update.
+ *  - [(10)] Committed read must be able to see the changes that occur
+ *    during the table scan.
+ *
+ * Returns OK if the scan is ready to start at record 1. Returns FAILED,
+ * with XT_ERR_NO_TRANSACTION registered, if the thread has no transaction.
+ */
+xtPublic xtBool xt_tab_seq_init(XTOpenTablePtr ot)
+{
+	/* No current record yet: */
+	ot->ot_curr_updated = FALSE;
+	ot->ot_curr_row_id = 0;			// 0 is an invalid row ID!
+	ot->ot_curr_rec_id = 0;			// 0 is an invalid position!
+
+	/* No page is loaded for the scan: */
+	ot->ot_seq_offset = 0;
+	ot->ot_on_page = FALSE;
+	ot->ot_seq_data = NULL;
+	ot->ot_seq_page = NULL;
+
+	ot->ot_seq_eof_id = ot->ot_table->tab_rec_eof_id;
+
+	if (ot->ot_thread->st_xact_data) {
+		ot->ot_seq_rec_id = 1;
+		ot->ot_thread->st_statistics.st_scan_table++;
+		return OK;
+	}
+
+	/* MySQL ignores this error, so we
+	 * setup the sequential scan so that it will
+	 * deliver nothing!
+	 */
+	ot->ot_seq_rec_id = ot->ot_seq_eof_id;
+	xt_register_xterr(XT_REG_CONTEXT, XT_ERR_NO_TRANSACTION);
+	return FAILED;
+}
+
+/* Clear the sequential scan state of an open table: position, EOF,
+ * and the page/data references. Nothing is released here.
+ */
+xtPublic void xt_tab_seq_reset(XTOpenTablePtr ot)
+{
+	/* Page state: */
+	ot->ot_on_page = FALSE;
+	ot->ot_seq_page = NULL;
+	ot->ot_seq_data = NULL;
+	ot->ot_seq_offset = 0;
+
+	/* Scan range: */
+	ot->ot_seq_eof_id = 0;
+	ot->ot_seq_rec_id = 0;
+}
+
+/* End a sequential scan: give back the record page, or the locked
+ * memory pointer, that the scan is still holding.
+ */
+xtPublic void xt_tab_seq_exit(XTOpenTablePtr ot)
+{
+	if (ot->ot_seq_page) {
+		ot->ot_table->tab_recs.xt_tc_release_page(ot->ot_rec_file, ot->ot_seq_page, ot->ot_thread);
+		ot->ot_seq_page = NULL;
+	}
+
+	if (ot->ot_seq_data)
+		XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_seq_data, TRUE, ot->ot_thread);
+
+	ot->ot_on_page = FALSE;
+}
+
+/* Value passed to xt_tc_get_page() by the sequential scan. It is TRUE
+ * only when XT_SEQ_SCAN_LOADS_CACHE is defined and memory mapped
+ * row/record files (XT_USE_ROW_REC_MMAP_FILES) are not in use.
+ */
+#if !defined(XT_USE_ROW_REC_MMAP_FILES) && defined(XT_SEQ_SCAN_LOADS_CACHE)
+#define TAB_SEQ_LOAD_CACHE		TRUE
+#else
+#define TAB_SEQ_LOAD_CACHE		FALSE
+#endif
+
+/* Step the sequential scan back by one record, so that the next call
+ * to xt_tab_seq_next() looks at the same record position again.
+ */
+xtPublic void xt_tab_seq_repeat(XTOpenTablePtr ot)
+{
+	ot->ot_seq_offset -= ot->ot_table->tab_dic.dic_rec_size;
+	ot->ot_seq_rec_id--;
+}
+
+/* Fetch the next visible record of a sequential scan.
+ *
+ * ot     - the open table; the scan state (ot_seq_*) must have been set
+ *          up by xt_tab_seq_init().
+ * buffer - receives the row data of the record found.
+ * eof    - set to TRUE when the scan has reached ot_seq_eof_id, and to
+ *          FALSE when a row was returned. Not modified on failure.
+ *
+ * On success ot_curr_rec_id identifies the record returned.
+ *
+ * Returns OK on success (including end of scan), FAILED on error.
+ * All error paths return FAILED: the tab_visible() result codes such
+ * as XT_ERR are non-zero, so returning one of them from this xtBool
+ * function would be taken as success by a caller that tests the
+ * result as a boolean.
+ */
+xtPublic xtBool xt_tab_seq_next(XTOpenTablePtr ot, xtWord1 *buffer, xtBool *eof)
+{
+	register XTTableHPtr	tab = ot->ot_table;
+	register size_t			rec_size = tab->tab_dic.dic_rec_size;
+	xtWord1					*buff_ptr;
+	xtRecordID				new_rec_id;
+	xtRecordID				invalid_rec = 0;
+
+	next_page:
+	if (!ot->ot_on_page) {
+		/* Get the page that holds ot_seq_rec_id. If no cache page is
+		 * returned, the record data is accessed via a locked memory
+		 * pointer instead.
+		 */
+		if (!(ot->ot_on_page = tab->tab_recs.xt_tc_get_page(ot->ot_rec_file, ot->ot_seq_rec_id, TAB_SEQ_LOAD_CACHE, &ot->ot_seq_page, &ot->ot_seq_offset, ot->ot_thread)))
+			return FAILED;
+		if (!ot->ot_seq_page) {
+			XT_LOCK_MEMORY_PTR(ot->ot_seq_data, ot->ot_rec_file, xt_rec_id_to_rec_offset(tab, ot->ot_seq_rec_id), tab->tab_rows.tci_page_size, &ot->ot_thread->st_statistics.st_rec, ot->ot_thread);
+			if (!ot->ot_seq_data)
+				return FAILED;
+			ot->ot_on_page = TRUE;
+			ot->ot_seq_offset = 0;
+		}
+	}
+
+	next_record:
+	/* [(10)] The current EOF is used: */
+	if (ot->ot_seq_rec_id >= ot->ot_seq_eof_id) {
+		*eof = TRUE;
+		return OK;
+	}
+
+	/* Past the end of the current page, move on to the next one: */
+	if (ot->ot_seq_offset >= tab->tab_recs.tci_page_size) {
+		if (ot->ot_seq_page) {
+			tab->tab_recs.xt_tc_release_page(ot->ot_rec_file, ot->ot_seq_page, ot->ot_thread);
+			ot->ot_seq_page = NULL;
+		}
+		if (ot->ot_seq_data)
+			/* NULL here means that in the case of non-memory mapped
+			 * files we "keep" the lock.
+			 */
+			XT_UNLOCK_MEMORY_PTR(ot->ot_rec_file, ot->ot_seq_data, FALSE, ot->ot_thread);
+		ot->ot_on_page = FALSE;
+		goto next_page;
+	}
+
+	if (ot->ot_seq_page)
+		buff_ptr = ot->ot_seq_page->tcp_data + ot->ot_seq_offset;
+	else
+		buff_ptr = ot->ot_seq_data + ot->ot_seq_offset;
+
+	/* This is the current record: */
+	ot->ot_curr_rec_id = ot->ot_seq_rec_id;
+	ot->ot_curr_row_id = 0;
+
+	/* Move to the next record: */
+	ot->ot_seq_rec_id++;
+	ot->ot_seq_offset += rec_size;
+
+	retry:
+	switch (tab_visible(ot, (XTTabRecHeadDPtr) buff_ptr, &new_rec_id)) {
+		case FALSE:
+			goto next_record;
+		case XT_ERR:
+			goto failed;
+		case XT_NEW:
+			/* A different record of the row is the visible one,
+			 * read it into the row buffer:
+			 */
+			buff_ptr = ot->ot_row_rbuffer;
+			if (!xt_tab_get_rec_data(ot, new_rec_id, rec_size, ot->ot_row_rbuffer))
+				goto failed;
+			ot->ot_curr_rec_id = new_rec_id;
+			break;
+		case XT_RETRY:
+			goto retry;
+		case XT_REREAD:
+			if (invalid_rec != ot->ot_curr_rec_id) {
+				/* Don't re-read for the same record twice: */
+				invalid_rec = ot->ot_curr_rec_id;
+
+				/* Undo move to next: */
+				ot->ot_seq_rec_id--;
+				ot->ot_seq_offset -= rec_size;
+
+				/* Prepare to reread the page: */
+				if (ot->ot_seq_page) {
+					tab->tab_recs.xt_tc_release_page(ot->ot_rec_file, ot->ot_seq_page, ot->ot_thread);
+					ot->ot_seq_page = NULL;
+				}
+				ot->ot_on_page = FALSE;
+				goto next_page;
+			}
+			/* The re-read did not help, report the record: */
+			if (!tab_record_corrupt(ot, XT_GET_DISK_4(((XTTabRecHeadDPtr) buff_ptr)->tr_row_id_4), invalid_rec, true, 8))
+				goto failed;
+#ifdef XT_CRASH_DEBUG
+			/* Should not happen! */
+			xt_crash_me();
+#endif
+			/* Continue, and skip the record... */
+			invalid_rec = 0;
+			goto next_record;
+		default:
+			break;
+	}
+
+	/* The first byte of the record is its type/status: */
+	switch (*buff_ptr) {
+		case XT_TAB_STATUS_FIXED:
+		case XT_TAB_STATUS_FIX_CLEAN:
+			memcpy(buffer, buff_ptr + XT_REC_FIX_HEADER_SIZE, rec_size - XT_REC_FIX_HEADER_SIZE);
+			break;
+		case XT_TAB_STATUS_VARIABLE:
+		case XT_TAB_STATUS_VAR_CLEAN:
+			if (!myxt_load_row(ot, buff_ptr + XT_REC_FIX_HEADER_SIZE, buffer, ot->ot_cols_req))
+				goto failed_1;
+			break;
+		case XT_TAB_STATUS_EXT_DLOG:
+		case XT_TAB_STATUS_EXT_CLEAN: {
+			u_int cols_req = ot->ot_cols_req;
+
+			ASSERT_NS(cols_req);
+			if (cols_req && cols_req <= tab->tab_dic.dic_fix_col_count) {
+				/* All required columns are in the fixed part: */
+				if (!myxt_load_row(ot, buff_ptr + XT_REC_EXT_HEADER_SIZE, buffer, cols_req))
+					goto failed_1;
+			}
+			else {
+				/* xt_tab_load_ext_data() is given no record pointer, so
+				 * make sure the record is in the row buffer first:
+				 */
+				if (buff_ptr != ot->ot_row_rbuffer)
+					memcpy(ot->ot_row_rbuffer, buff_ptr, rec_size);
+				if (!xt_tab_load_ext_data(ot, ot->ot_curr_rec_id, buffer, cols_req))
+					goto failed_1;
+			}
+			break;
+		}
+	}
+
+	*eof = FALSE;
+	return OK;
+
+	failed_1:
+
+	failed:
+	return FAILED;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * REPAIR TABLE
+ */
+
+/* Operations of tab_exec_repair_pending(): */
+#define REP_FIND		0	/* Only check if the table is listed. */
+#define REP_ADD			1	/* Add the table to the list. */
+#define REP_DEL			2	/* Remove the table from the list. */
+
+/* Look up, add or remove a table name in the "repair-pending" file of
+ * a database. The file contains one table name per line.
+ *
+ * db         - the database; the file path is built from db->db_main_path.
+ * what       - REP_FIND: only look for the name.
+ *              REP_ADD:  append the name if it is not in the file (the
+ *                        file is created if required).
+ *              REP_DEL:  remove the name if it is in the file.
+ * table_name - the name to look for, compared using xt_tab_compare_names().
+ *
+ * If the file ends up with no content, it is deleted.
+ *
+ * Returns TRUE if the name was found in the file (before any change was
+ * made). Returns FALSE if it was not found, if the file could not be
+ * opened or does not exist, or if an error occurred.
+ */
+static xtBool tab_exec_repair_pending(XTDatabaseHPtr db, int what, char *table_name)
+{
+	XTThreadPtr			thread = xt_get_self();
+	char				file_path[PATH_MAX];
+	XTOpenFilePtr		of = NULL;
+	int					len;
+	char				*buffer = NULL, *ptr, *name;
+	char				ch;
+	xtBool				found = FALSE;
+
+	xt_strcpy(PATH_MAX, file_path, db->db_main_path);
+	xt_add_pbxt_file(PATH_MAX, file_path, "repair-pending");
+	
+	if (what == REP_ADD) {
+		if (!xt_open_file_ns(&of, file_path, XT_FS_CREATE | XT_FS_MAKE_PATH))
+			return FALSE;
+	}
+	else {
+		/* A missing file is OK, it means nothing is pending: */
+		if (!xt_open_file_ns(&of, file_path, XT_FS_DEFAULT | XT_FS_MISSING_OK))
+			return FALSE;
+	}
+	if (!of)
+		return FALSE;
+
+	/* Read the entire file into memory, leaving room for a terminator: */
+	len = (int) xt_seek_eof_file(NULL, of);
+	
+	if (!(buffer = (char *) xt_malloc_ns(len + 1)))
+		goto failed;
+
+	if (!xt_pread_file(of, 0, len, len, buffer, NULL, &thread->st_statistics.st_x, thread))
+		goto failed;
+
+	buffer[len] = 0;
+	ptr = buffer;
+	/* Search line by line. When the name is found, 'name' points to the
+	 * start of its line and 'ptr' to the character that ends the line.
+	 */
+	for(;;) {
+		name = ptr;
+		while (*ptr && *ptr != '\n' && *ptr != '\r')
+			ptr++;
+		if (ptr > name) {
+			/* Terminate the line temporarily for the comparison: */
+			ch = *ptr;
+			*ptr = 0;
+			if (xt_tab_compare_names(name, table_name) == 0) {
+				*ptr = ch;
+				found = TRUE;
+				break;
+			}	
+			*ptr = ch;
+		}
+		if (!*ptr)
+			break;
+		ptr++;
+	}
+
+	switch (what) {
+		case REP_ADD:
+			if (!found) {
+				/* Remove any trailing empty lines: */
+				while (len > 0) {
+					if (buffer[len-1] != '\n' && buffer[len-1] != '\r')
+						break;
+					len--;
+				}
+				/* Separate the new name from the existing content: */
+				if (len > 0) {
+					if (!xt_pwrite_file(of, len, 1, (void *) "\n", &thread->st_statistics.st_x, thread))
+						goto failed;
+					len++;
+				}
+				if (!xt_pwrite_file(of, len, strlen(table_name), table_name, &thread->st_statistics.st_x, thread))
+					goto failed;
+				len += strlen(table_name);
+				if (!xt_set_eof_file(NULL, of, len))
+					goto failed;
+			}
+			break;
+		case REP_DEL:
+			if (found) {
+				/* Also remove the character that ends the line,
+				 * then close the gap in the buffer:
+				 */
+				if (*ptr != '\0')
+					ptr++;
+				memmove(name, ptr, len - (ptr - buffer));
+				len = len - (ptr - name);
+
+				/* Remove trailing empty lines: */
+				while (len > 0) {
+					if (buffer[len-1] != '\n' && buffer[len-1] != '\r')
+						break;
+					len--;
+				}
+
+				/* If nothing remains, the file is deleted below
+				 * instead of being rewritten.
+				 */
+				if (len > 0) {
+					if (!xt_pwrite_file(of, 0, len, buffer, &thread->st_statistics.st_x, thread))
+						goto failed;
+					if (!xt_set_eof_file(NULL, of, len))
+						goto failed;
+				}
+			}
+			break;
+	}
+
+	xt_close_file_ns(of);
+	xt_free_ns(buffer);
+
+	/* No content, so the file is no longer required: */
+	if (len == 0)
+		xt_fs_delete(NULL, file_path);
+	return found;
+
+	failed:
+	if (of)
+		xt_close_file_ns(of);
+	if (buffer)
+		xt_free_ns(buffer);
+	xt_log_and_clear_exception(thread);
+	return FALSE;
+}
+
+/* Build a printable "database.table" name from the path of a table.
+ *
+ * tab        - the table; the name is derived from tab->tab_name->ps_path.
+ * table_name - receives the result.
+ * size       - size of the table_name buffer.
+ *
+ * The last two components of the path are used as the database and
+ * table names. A "#P#" partition extension, and a "#SP#" sub-partition
+ * extension, are appended as " (partition)" or
+ * " (partition - sub-partition)".
+ */
+xtPublic void xt_tab_make_table_name(XTTableHPtr tab, char *table_name, size_t size)
+{
+	char	conv_buf[XT_TABLE_NAME_SIZE*3+3];
+	char	*last_name;
+	char	*partition;
+	char	*sub_partition;
+	size_t	used;
+
+	last_name = xt_last_name_of_path(tab->tab_name->ps_path);
+
+	if (xt_starts_with(last_name, "#sql")) {
+		/* {INVALID-OLD-TABLE-FIX}
+		 * Temporary files can have strange paths, for example
+		 * ..../var/tmp/mysqld.1/#sqldaec_1_6
+		 * This occurs, for example, when the temp_table.test is
+		 * run using the PBXT suite in MariaDB:
+		 * ./mtr --suite=pbxt --do-test=temp_table
+		 *
+		 * Calling myxt_static_convert_file_name, with a '.', in the name
+		 * causes the error:
+		 * [ERROR] Invalid (old?) table or database name 'mysqld.1'
+		 * To prevent this, we do not convert the temporary
+		 * table names using the mysql functions.
+		 *
+		 * Note, this bug was found by Monty, and fixed by modifying
+		 * xt_2nd_last_name_of_path(), see {INVALID-OLD-TABLE-FIX}.
+		 */
+		xt_2nd_last_name_of_path(size, table_name, tab->tab_name->ps_path);
+		xt_strcat(size, table_name, ".");
+		xt_strcat(size, table_name, last_name);
+		return;
+	}
+
+	/* The database name, converted, followed by a dot: */
+	xt_2nd_last_name_of_path(sizeof(conv_buf), conv_buf, tab->tab_name->ps_path);
+	myxt_static_convert_file_name(conv_buf, table_name, size);
+	xt_strcat(size, table_name, ".");
+
+	/* The table name, without any partition extension: */
+	partition = strstr(last_name, "#P#");
+	if (partition)
+		xt_strncpy(sizeof(conv_buf), conv_buf, last_name, partition - last_name);
+	else
+		xt_strcpy(sizeof(conv_buf), conv_buf, last_name);
+
+	used = strlen(table_name);
+	myxt_static_convert_file_name(conv_buf, table_name + used, size - used);
+
+	if (!partition)
+		return;
+
+	/* Add the partition extension (which is relevant to the engine). */
+	partition += 3;
+	sub_partition = strstr(partition, "#SP#");
+	if (sub_partition)
+		xt_strncpy(sizeof(conv_buf), conv_buf, partition, sub_partition - partition);
+	else
+		xt_strcpy(sizeof(conv_buf), conv_buf, partition);
+
+	xt_strcat(size, table_name, " (");
+	used = strlen(table_name);
+	myxt_static_convert_file_name(conv_buf, table_name + used, size - used);
+
+	if (sub_partition) {
+		sub_partition += 4;
+		xt_strcat(size, table_name, " - ");
+		used = strlen(table_name);
+		myxt_static_convert_file_name(sub_partition, table_name + used, size - used);
+	}
+
+	xt_strcat(size, table_name, ")");
+}
+
+/* Returns TRUE if the table is listed in the repair-pending file
+ * of its database.
+ */
+xtPublic xtBool xt_tab_is_table_repair_pending(XTTableHPtr tab)
+{
+	char	name[XT_IDENTIFIER_NAME_SIZE*3+3];
+	xtBool	pending;
+
+	xt_tab_make_table_name(tab, name, sizeof(name));
+	pending = tab_exec_repair_pending(tab->tab_db, REP_FIND, name);
+	return pending;
+}
+
+/* Called when a table has been repaired: clears the repair-pending
+ * flag of the table and removes the table from the repair-pending
+ * file. Does nothing if the flag is not set.
+ */
+xtPublic void xt_tab_table_repaired(XTTableHPtr tab)
+{
+	char	name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+	if (!tab->tab_repair_pending)
+		return;
+
+	tab->tab_repair_pending = FALSE;
+	xt_tab_make_table_name(tab, name, sizeof(name));
+	tab_exec_repair_pending(tab->tab_db, REP_DEL, name);
+}
+
+/* Mark a table as requiring repair: sets the repair-pending flag of
+ * the table and adds the table to the repair-pending file. Does
+ * nothing if the flag is already set.
+ */
+xtPublic void xt_tab_set_table_repair_pending(XTTableHPtr tab)
+{
+	char	name[XT_IDENTIFIER_NAME_SIZE*3+3];
+
+	if (tab->tab_repair_pending)
+		return;
+
+	tab->tab_repair_pending = TRUE;
+	xt_tab_make_table_name(tab, name, sizeof(name));
+	tab_exec_repair_pending(tab->tab_db, REP_ADD, name);
+}
diff --git a/storage/pbxt/src/table_xt.h b/storage/pbxt/src/table_xt.h
new file mode 100644
index 00000000000..f6c32587419
--- /dev/null
+++ b/storage/pbxt/src/table_xt.h
@@ -0,0 +1,648 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-08	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_table_h__
+#define __xt_table_h__
+
+#include <time.h>
+
+#include "datalog_xt.h"
+#include "filesys_xt.h"
+#include "hashtab_xt.h"
+#include "index_xt.h"
+#include "cache_xt.h"
+#include "util_xt.h"
+#include "heap_xt.h"
+#include "tabcache_xt.h"
+#include "xactlog_xt.h"
+#include "lock_xt.h"
+
+struct XTDatabase;
+struct XTThread;
+struct XTCache;
+struct XTOpenTable;
+struct XTTablePath;
+
+#define XT_TAB_INCOMPATIBLE_VERSION	4
+#define XT_TAB_CURRENT_VERSION		5
+
+/* This version of the index does not have lazy
+ * delete. The new version is compatible with
+ * this and maintains the old format.
+ */
+#define XT_IND_NO_LAZY_DELETE		3
+#define XT_IND_LAZY_DELETE_OK		4
+#ifdef XT_USE_LAZY_DELETE
+#define XT_IND_CURRENT_VERSION		XT_IND_LAZY_DELETE_OK
+#else
+#define XT_IND_CURRENT_VERSION		XT_IND_NO_LAZY_DELETE
+#endif
+
+#define XT_HEAD_BUFFER_SIZE			1024
+
+#ifdef DEBUG
+//#define XT_TRACK_INDEX_UPDATES
+//#define XT_TRACK_RETURNED_ROWS
+#endif
+
+/*
+ * NOTE: Records may only be freed (placed on the free list), after
+ * all currently running transactions have ended.
+ * The reason is, running transactions may have references in memory
+ * to these records (a sequential scan has a large buffer).
+ * If the records are freed they may be re-used. This will
+ * cause problems because the references will then refer to
+ * new data.
+ *
+ * As a result, deleted records are first placed in the
+ * REMOVED state. Later, when transactions have quit, they
+ * are freed.
+ */
+#define XT_TAB_STATUS_FREED			0x00			/* On the free list. */
+#define XT_TAB_STATUS_DELETE		0x01			/* A transactional delete record (an "update" that indicates a delete). */
+#define XT_TAB_STATUS_FIXED			0x02			/* Record in the fixed format (see XTTabRecFixDRec). */
+#define XT_TAB_STATUS_VARIABLE		0x03			/* Uses one block, but has the variable format. */
+#define XT_TAB_STATUS_EXT_DLOG		0x04			/* Variable format, and the trailing part of the record in the data log. */
+#define XT_TAB_STATUS_EXT_HDATA		0x05			/* Variable format, and the trailing part of the record in the handle data file. */
+#define XT_TAB_STATUS_DATA			0x06			/* A block of data with a next pointer (5 bytes overhead). */
+#define XT_TAB_STATUS_END_DATA		0x07			/* A block of data without a next pointer (1 byte overhead). */
+#define XT_TAB_STATUS_MASK			0x0F			/* Masks the status value out of the record type byte (see XT_REC_IS_*). */
+
+#define XT_TAB_STATUS_DEL_CLEAN		(XT_TAB_STATUS_DELETE | XT_TAB_STATUS_CLEANED_BIT)
+#define XT_TAB_STATUS_FIX_CLEAN		(XT_TAB_STATUS_FIXED | XT_TAB_STATUS_CLEANED_BIT)
+#define XT_TAB_STATUS_VAR_CLEAN		(XT_TAB_STATUS_VARIABLE | XT_TAB_STATUS_CLEANED_BIT)
+#define XT_TAB_STATUS_EXT_CLEAN		(XT_TAB_STATUS_EXT_DLOG | XT_TAB_STATUS_CLEANED_BIT)
+
+#define XT_TAB_STATUS_CLEANED_BIT	0x80			/* This bit is set when the record is cleaned and committed. */
+
+#define XT_REC_IS_CLEAN(x)			((x) & XT_TAB_STATUS_CLEANED_BIT)
+#define XT_REC_IS_FREE(x)			(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_FREED)
+#define XT_REC_IS_DELETE(x)			(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_DELETE)
+#define XT_REC_IS_FIXED(x)			(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_FIXED)
+#define XT_REC_IS_VARIABLE(x)		(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_VARIABLE)
+#define XT_REC_IS_EXT_DLOG(x)		(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_EXT_DLOG)
+#define XT_REC_IS_EXT_HDATA(x)		(((x) & XT_TAB_STATUS_MASK) == XT_TAB_STATUS_EXT_HDATA)
+#define XT_REC_NOT_VALID(x)			(XT_REC_IS_FREE(x) || XT_REC_IS_DELETE(x))
+
+/* Results for xt_use_table_by_id(): */
+#define XT_TAB_OK					0
+#define XT_TAB_NOT_FOUND			1
+#define XT_TAB_NO_DICTIONARY		2
+#define XT_TAB_POOL_CLOSED			3				/* Cannot open table at the moment, the pool is closed. */
+#define XT_TAB_FAILED				4
+
+#ifdef XT_NO_ATOMICS
+#define XT_TAB_ROW_USE_PTHREAD_RW
+#else
+//#define XT_TAB_ROW_USE_RWMUTEX
+//#define XT_TAB_ROW_USE_SPINXSLOCK
+#define XT_TAB_ROW_USE_XSMUTEX
+#endif
+
+#ifdef XT_TAB_ROW_USE_XSMUTEX
+#define XT_TAB_ROW_LOCK_TYPE			XTXSMutexRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_xsmutex_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_xsmutex_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_xsmutex_slock(i, (s)->t_id)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_xsmutex_xlock(i, (s)->t_id)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_xsmutex_unlock(i, (s)->t_id)
+#elif defined(XT_TAB_ROW_USE_PTHREAD_RW)
+#define XT_TAB_ROW_LOCK_TYPE			xt_rwlock_type
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_init_rwlock_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_free_rwlock(i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_slock_rwlock_ns(i)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_unlock_rwlock_ns(i)
+#elif defined(XT_TAB_ROW_USE_RWMUTEX)
+#define XT_TAB_ROW_LOCK_TYPE			XTRWMutexRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_rwmutex_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_rwmutex_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_rwmutex_slock(i, (s)->t_id)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_rwmutex_xlock(i, (s)->t_id)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_rwmutex_unlock(i, (s)->t_id)
+#elif defined(XT_TAB_ROW_USE_SPINXSLOCK)
+#define XT_TAB_ROW_LOCK_TYPE			XTSpinXSLockRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_spinxslock_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_spinxslock_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_spinxslock_slock(i, (s)->t_id)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_spinxslock_xlock(i, (s)->t_id)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_spinxslock_unlock(i, (s)->t_id)
+#else
+#define XT_TAB_ROW_LOCK_TYPE			XTSpinLockRec
+#define XT_TAB_ROW_INIT_LOCK(s, i)		xt_spinlock_init_with_autoname(s, i)
+#define XT_TAB_ROW_FREE_LOCK(s, i)		xt_spinlock_free(s, i)	
+#define XT_TAB_ROW_READ_LOCK(i, s)		xt_spinlock_lock(i)
+#define XT_TAB_ROW_WRITE_LOCK(i, s)		xt_spinlock_lock(i)
+#define XT_TAB_ROW_UNLOCK(i, s)			xt_spinlock_unlock(i)
+#endif
+
+/* ------- TABLE DATA FILE ------- */
+
+#define XT_TAB_DATA_MAGIC		0x1234ABCD
+
+#define XT_FORMAT_DEF_SPACE		512
+
+#define XT_TAB_FLAGS_TEMP_TAB	1
+
+/*
+ * This header ensures that no record in the data file has the offset 0.
+ */
+typedef struct XTTableHead {
+	XTDiskValue4			th_head_size_4;							/* The size of the table header. */
+	XTDiskValue4			th_op_seq_4;
+	XTDiskValue6			th_row_free_6;
+	XTDiskValue6			th_row_eof_6;
+	XTDiskValue6			th_row_fnum_6;
+	XTDiskValue6			th_rec_free_6;
+	XTDiskValue6			th_rec_eof_6;
+	XTDiskValue6			th_rec_fnum_6;
+} XTTableHeadDRec, *XTTableHeadDPtr;
+
+typedef struct XTTableFormat {
+	XTDiskValue4			tf_format_size_4;						/* The size of this structure (table format). */
+	XTDiskValue4			tf_tab_head_size_4;						/* The offset of the first record in the data handle file. */
+	XTDiskValue2			tf_tab_version_2;						/* The table version number. */
+	XTDiskValue2			tf_tab_flags_2;							/* Table flags XT_TAB_FLAGS_* */
+	XTDiskValue4			tf_rec_size_4;							/* The maximum size of records in the table. */
+	XTDiskValue1			tf_rec_fixed_1;							/* Set to 1 if this table contains fixed length records. */
+	XTDiskValue1			tf_reserved_1;							/* - */
+	XTDiskValue8			tf_min_auto_inc_8;						/* This is the minimum auto-increment value. */
+	xtWord1					tf_reserved[64];						/* Reserved, set to 0. */
+	char					tf_definition[XT_VAR_LENGTH];			/* A cstring, currently it only contains the foreign key information. */
+} XTTableFormatDRec, *XTTableFormatDPtr;
+
+#define XT_STAT_ID_MASK(x)	((x) & (u_int) 0x000000FF)
+
+/* A record that fits completely in the data file record */
+typedef struct XTTabRecHead {
+	xtWord1					tr_rec_type_1;
+	xtWord1					tr_stat_id_1;
+	xtDiskRecordID4			tr_prev_rec_id_4;		/* The previous variation of this record. */
+	XTDiskValue4			tr_xact_id_4;			/* The transaction ID. */
+	XTDiskValue4			tr_row_id_4;			/* The row ID of this record. */
+} XTTabRecHeadDRec, *XTTabRecHeadDPtr;
+
+typedef struct XTTabRecFix {
+	xtWord1					tr_rec_type_1;			/* XT_TAB_STATUS_FREED, XT_TAB_STATUS_DELETE,
+													 * XT_TAB_STATUS_FIXED, XT_TAB_STATUS_VARIABLE */
+	xtWord1					tr_stat_id_1;
+	xtDiskRecordID4			tr_prev_rec_id_4;		/* The previous variation of this record. */
+	XTDiskValue4			tr_xact_id_4;			/* The transaction ID. */
+	XTDiskValue4			tr_row_id_4;			/* The row ID of this record. */
+	xtWord1					rf_data[XT_VAR_LENGTH];	/* NOTE: This data is in RAW MySQL format. */
+} XTTabRecFixDRec, *XTTabRecFixDPtr;
+
+/* An extended record that overflows into the log file: */
+typedef struct XTTabRecExt {
+	xtWord1					tr_rec_type_1;			/* XT_TAB_STATUS_EXT_DLOG */
+	xtWord1					tr_stat_id_1;
+	xtDiskRecordID4			tr_prev_rec_id_4;		/* The previous variation of this record. */
+	XTDiskValue4			tr_xact_id_4;			/* The transaction ID. */
+	XTDiskValue4			tr_row_id_4;			/* The row ID of this record. */
+	XTDiskValue2			re_log_id_2;			/* Reference to overflow area, log ID */
+	XTDiskValue6			re_log_offs_6;			/* Reference to the overflow area, log offset */
+	XTDiskValue4			re_log_dat_siz_4;		/* Size of the overflow data. */
+	xtWord1					re_data[XT_VAR_LENGTH];	/* This data is in packed PBXT format. */
+} XTTabRecExtDRec, *XTTabRecExtDPtr;
+
+typedef struct XTTabRecExtHdat {
+	xtWord1					tr_rec_type_1;			/* XT_TAB_STATUS_EXT_HDATA */
+	xtWord1					tr_stat_id_1;
+	xtDiskRecordID4			tr_prev_rec_id_4;		/* The previous variation of this record. */
+	XTDiskValue4			tr_xact_id_4;			/* The transaction ID. */
+	XTDiskValue4			tr_row_id_4;			/* The row ID of this record. */
+	XTDiskValue4			eh_blk_rec_id_4;		/* The record ID of the next block. */
+	XTDiskValue2			eh_blk_siz_2;			/* The total size of the data in the trailing blocks */
+	xtWord1					eh_data[XT_VAR_LENGTH];	/* This data is in packed PBXT format. */
+} XTTabRecExtHdatDRec, *XTTabRecExtHdatDPtr;
+
+typedef struct XTTabRecData {
+	xtWord1					tr_rec_type_1;			/* XT_TAB_STATUS_DATA */
+	XTDiskValue4			rd_blk_rec_id_4;		/* The record ID of the next block. */
+	xtWord1					rd_data[XT_VAR_LENGTH];	/* This data is in packed PBXT format. */
+} XTTabRecDataDRec, *XTTabRecDataDPtr;
+
+typedef struct XTTabRecEndDat {
+	xtWord1					tr_rec_type_1;			/* XT_TAB_STATUS_END_DATA */
+	xtWord1					ed_data[XT_VAR_LENGTH];	/* This data is in packed PBXT format. */
+} XTTabRecEndDatDRec, *XTTabRecEndDatDPtr;
+
+#define XT_REC_FIX_HEADER_SIZE		sizeof(XTTabRecHeadDRec)
+#define XT_REC_EXT_HEADER_SIZE		offsetof(XTTabRecExtDRec, re_data)
+#define XT_REC_FIX_EXT_HEADER_DIFF	(XT_REC_EXT_HEADER_SIZE - XT_REC_FIX_HEADER_SIZE)
+
+typedef struct XTTabRecFree {
+	xtWord1					rf_rec_type_1;
+	xtWord1					rf_not_used_1;
+	xtDiskRecordID4			rf_next_rec_id_4;		/* The next block on the free list. */
+} XTTabRecFreeDRec, *XTTabRecFreeDPtr;
+
+typedef struct XTTabRecInfo {
+	XTTabRecFixDPtr			ri_fix_rec_buf;			/* This references the start of the buffer (set for all types of records) */
+	XTTabRecExtDPtr			ri_ext_rec;				/* This is only set for extended records. */
+	xtWord4					ri_rec_buf_size;
+	XTactExtRecEntryDPtr	ri_log_buf;
+	xtWord4					ri_log_data_size;		/* This size of the data in the log record. */
+	xtRecordID				ri_rec_id;				/* The record ID. */
+} XTTabRecInfoRec, *XTTabRecInfoPtr;
+
+/* ------- TABLE ROW FILE ------- */
+
+#define XT_TAB_ROW_SHIFTS		2
+#define XT_TAB_ROW_MAGIC		0x4567CDEF
+//#define XT_TAB_ROW_FREE			0
+//#define XT_TAB_ROW_IN_USE		1
+
+/*
+ * NOTE: The shift count (XT_TAB_ROW_SHIFTS) assumes the size of a
+ * table row reference is 4 bytes (1 << XT_TAB_ROW_SHIFTS).
+ */
+typedef struct XTTabRowRef {
+	XTDiskValue4			rr_ref_id_4;			/* 4-byte reference, could be a RowID or a RecordID
+													 * If this row is free, then it is a RowID, which
+													 * references the next free row.
+													 * If it is in use, then it is a RecordID which
+													 * points to the first record in the variation
+													 * list for the row.
+													 */
+} XTTabRowRefDRec, *XTTabRowRefDPtr;
+
+/*
+ * This is the header for the row file. Its size MUST be
+ * the same as sizeof(XTTabRowRefDRec).
+ */
+typedef struct XTTabRowHead {
+	XTDiskValue4			rh_magic_4;
+} XTTabRowHeadDRec, *XTTabRowHeadDPtr;
+
+/* ------- TABLE & OPEN TABLES & TABLE LISTING ------- */
+
+/* {TEMP-TABLES}
+ * Temporary tables do not need to be flushed,
+ * and they also do not need to be recovered!
+ * Currently this is determined by the name of the
+ * table!
+ */
+typedef struct XTTable : public XTHeap {
+	struct XTDatabase		*tab_db;			/* Heap pointer */
+	XTPathStrPtr			tab_name;
+	xtBool					tab_free_locks;
+	xtTableID				tab_id;
+
+	xtWord8					tab_auto_inc;							/* The last value returned as an auto-increment value {PRE-INC}. */
+	XTSpinLockRec			tab_ainc_lock;							/* Lock for the auto-increment counter. */
+
+	size_t					tab_index_format_offset;
+	size_t					tab_index_header_size;
+	size_t					tab_index_page_size;
+	u_int					tab_index_block_shifts;
+	XTIndexHeadDPtr			tab_index_head;
+	size_t					tab_table_format_offset;
+	size_t					tab_table_head_size;
+	XTDictionaryRec			tab_dic;
+	xt_mutex_type			tab_dic_field_lock;						/* Lock for setting field->ptr!. */
+
+	XTRowLocksRec			tab_locks;								/* The locks held on this table. */
+
+	XTTableSeqRec			tab_seq;								/* The table operation sequence. */
+	XTTabCacheRec			tab_rows;
+	XTTabCacheRec			tab_recs;
+
+	/* Used to apply operations to the database in order. */
+	XTSortedListPtr			tab_op_list;							/* The operation list. Operations to be applied. */
+
+	/* Values that belong in the header when flushed! */
+	xtBool					tab_flush_pending;						/* TRUE if the table needs to be flushed */
+	xtBool					tab_recovery_done;						/* TRUE if the table has been recovered */
+	xtBool					tab_repair_pending;						/* TRUE if the table has been marked for repair */
+	xtBool					tab_temporary;							/* TRUE if this is a temporary table {TEMP-TABLES}. */
+	off_t					tab_bytes_to_flush;						/* Number of bytes of the record/row files to flush. */
+
+	xtOpSeqNo				tab_head_op_seq;						/* The number of the operation last applied to the database. */
+	xtRowID					tab_head_row_free_id;
+	xtRowID					tab_head_row_eof_id;
+	xtWord4					tab_head_row_fnum;
+	xtRecordID				tab_head_rec_free_id;
+	xtRecordID				tab_head_rec_eof_id;
+	xtWord4					tab_head_rec_fnum;
+
+	xtOpSeqNo				tab_co_op_seq;							/* The operation last applied by the compactor. */
+
+	xtBool					tab_wr_wake_freeer;						/* Set to TRUE if the writer must wake the freeer. */
+	xtOpSeqNo				tab_wake_freeer_op;						/* Set to the sequence number the freeer is waiting for. */
+
+	XTFilePtr				tab_row_file;
+	xtRowID					tab_row_eof_id;							/* Indicates the EOF of the table row file. */
+	xtRowID					tab_row_free_id;						/* The start of the free list in the table row file. */
+	xtWord4					tab_row_fnum;							/* The count of the number of free rows on the free list. */
+	xt_mutex_type			tab_row_lock;							/* Lock for updating the EOF and free list. */
+	XT_TAB_ROW_LOCK_TYPE	tab_row_rwlock[XT_ROW_RWLOCKS];			/* Used to lock a row during update. */
+
+	xt_mutex_type			tab_rec_flush_lock;						/* Required while the record/row files are being flushed. */
+	XTFilePtr				tab_rec_file;
+	xtRecordID				tab_rec_eof_id;							/* This value can only grow. */
+	xtRecordID				tab_rec_free_id;
+	xtWord4					tab_rec_fnum;							/* The count of the number of free records on the free list. */
+	xt_mutex_type			tab_rec_lock;							/* Lock for the free list. */
+
+	xt_mutex_type			tab_ind_stat_lock;						/* Acquired when calculating index statistics. */
+	time_t					tab_ind_stat_calc_time;					/* Zero means the index stats have not been calculated, otherwise this is a time. */
+
+	xt_mutex_type			tab_ind_flush_lock;						/* Required while the index file is being flushed. */
+	xtLogID					tab_ind_rec_log_id;						/* The point before which index entries have been written. */
+	xtLogOffset				tab_ind_rec_log_offset;					/* The log offset of the write point. */
+	XTFilePtr				tab_ind_file;
+	xtIndexNodeID			tab_ind_eof;							/* This value can only grow. */
+	xtIndexNodeID			tab_ind_free;							/* The start of the free page list of the index. */
+	XTIndFreeListPtr		tab_ind_free_list;						/* A cache of the free list (if exists, don't go to disk!) */
+	xt_mutex_type			tab_ind_lock;							/* Lock for reading and writing the index free list. */
+	xtWord4					tab_ind_flush_seq;
+} XTTableHRec, *XTTableHPtr;		/* Heap pointer */
+
+/* Used for an in-memory list of the tables, ordered by ID. */
+typedef struct XTTableEntry {
+	xtTableID				te_tab_id;
+	char					*te_tab_name;
+	struct XTTablePath		*te_tab_path;
+	XTTableHPtr				te_table;
+} XTTableEntryRec, *XTTableEntryPtr;
+
+typedef struct XTOpenTable {
+	struct XTThread			*ot_thread;								/* The thread currently using this open table. */
+	XTTableHPtr				ot_table;								/* PBXT table information. */
+
+	struct XTOpenTable		*ot_otp_next_free;						/* Next free open table in the open table pool. */
+	struct XTOpenTable		*ot_otp_mr_used;
+	struct XTOpenTable		*ot_otp_lr_used;
+	time_t					ot_otp_free_time;						/* The time this table was placed on the free list. */
+
+	//struct XTOpenTable	*ot_pool_next;							/* Next pointer for open table pool. */
+
+	XT_ROW_REC_FILE_PTR		ot_rec_file;
+	XT_ROW_REC_FILE_PTR		ot_row_file;
+	XTOpenFilePtr			ot_ind_file;
+	u_int					ot_err_index_no;						/* The number of the index on which the last error occurred */
+
+	xtBool					ot_rec_fixed;							/* Cached from table for quick access. */
+	size_t					ot_rec_size;							/* Cached from table for quick access. */
+	
+	char					ot_error_key[XT_IDENTIFIER_NAME_SIZE];
+	struct XTOpenTable		*ot_prev_update;						/* The UPDATE statement stack! {UPDATE-STACK} */
+	u_int					ot_update_id;							/* The update statement ID. */	
+	xtBool					ot_for_update;							/* True if reading FOR UPDATE. */
+	xtBool					ot_is_modify;							/* True if UPDATE or DELETE. */
+	xtRowID					ot_temp_row_lock;						/* The temporary row lock set on this table. */
+	u_int					ot_cols_req;							/* The number of columns required from the table. */
+
+	/* GOTCHA: Separate buffers for reading and writing rows because
+	 * of blob references, to this buffer, as in this test:
+	 *
+	 * drop table if exists t1;
+	 * CREATE TABLE t1 (id MEDIUMINT NOT NULL, b1 BIT(8), vc TEXT, 
+	 *                  bc CHAR(255), d DECIMAL(10,4) DEFAULT 0, 
+	 *                  f FLOAT DEFAULT 0, total BIGINT UNSIGNED, 
+	 *                  y YEAR, t DATE)
+	 *                  PARTITION BY RANGE (YEAR(t)) 
+	 *                 (PARTITION p1 VALUES LESS THAN (2005), 
+	 *                  PARTITION p2 VALUES LESS THAN MAXVALUE);
+	 *                
+	 * INSERT INTO t1 VALUES(412,1,'eTesting MySQL databases is a cool ',
+	 *                       'EEEMust make it bug free for the customer',
+	 *                        654321.4321,15.21,0,1965,"2005-11-14");
+	 * 
+	 * UPDATE t1 SET b1 = 0, t="2006-02-22" WHERE id = 412;
+	 * 
+	 */
+	size_t					ot_row_rbuf_size;						/* The current size of the read row buffer (resized dynamically). */
+	xtWord1					*ot_row_rbuffer;						/* The row buffer for reading rows. */
+	size_t					ot_row_wbuf_size;						/* The current size of the write row buffer (resized dynamically). */
+	xtWord1					*ot_row_wbuffer;						/* The row buffer for writing rows. */
+
+	/* Details of the current record: */
+	xtRecordID				ot_curr_rec_id;							/* The offset of the current record. */
+	xtRowID					ot_curr_row_id;							/* The row ID of the current record. */
+	xtBool					ot_curr_updated;						/* TRUE if the current record was updated by the current transaction. */
+
+	XTIndBlockPtr			ot_ind_res_bufs;						/* A list of reserved index buffers. */
+	u_int					ot_ind_res_count;						/* The number of reserved buffers. */
+#ifdef XT_TRACK_INDEX_UPDATES
+	u_int					ot_ind_changed;
+	u_int					ot_ind_reserved;
+	u_int					ot_ind_reads;
+#endif
+#ifdef XT_TRACK_RETURNED_ROWS
+	u_int					ot_rows_ret_max;
+	u_int					ot_rows_ret_curr;
+	xtRecordID				*ot_rows_returned;
+#endif
+	/* GOTCHA: Separate buffers for reading and writing the index are required
+	 * because MySQL sometimes scans and updates an index with the same
+	 * table handler.
+	 */
+	XTIdxItemRec			ot_ind_state;							/* Describes the state of the index buffer. */
+	XTIndHandlePtr			ot_ind_rhandle;							/* This handle references a block which is being used in a sequential scan. */
+	//XTIdxBranchDRec			ot_ind_rbuf;							/* The index read buffer. */
+	XTIdxBranchDRec			ot_ind_wbuf;							/* Buffer for the current index node for writing. */
+	xtWord1					ot_ind_wbuf2[XT_INDEX_PAGE_SIZE];		/* Overflow for the write buffer when a node is too big. */
+
+	/* Note: the fields below ot_ind_rbuf are not zero'ed out on creation
+	 * of this structure!
+	 */
+	xtRecordID				ot_seq_rec_id;							/* Current position of a sequential scan. */
+	xtRecordID				ot_seq_eof_id;							/* The EOF at the start of the sequential scan. */
+	XTTabCachePagePtr		ot_seq_page;							/* If ot_seq_buffer is non-NULL, then a page has been locked! */
+	xtWord1					*ot_seq_data;							/* Non-NULL if the data references memory mapped memory, or if it was
+																	 * allocated if no memory mapping is being used.
+																	 */
+	xtBool					ot_on_page;
+	size_t					ot_seq_offset;							/* Offset on the current page. */
+} XTOpenTableRec, *XTOpenTablePtr;
+
+#define XT_DATABASE_NAME_SIZE		XT_IDENTIFIER_NAME_SIZE
+
+typedef struct XTTableDesc {
+	char					td_tab_name[XT_TABLE_NAME_SIZE+4];	// 4 extra for DEL# (tables being deleted)
+	xtTableID				td_tab_id;
+	char					*td_file_name;
+
+	struct XTDatabase		*td_db;
+	struct XTTablePath		*td_tab_path;						// The path of the table.
+	u_int					td_path_idx;
+	XTOpenDirPtr			td_open_dir;
+} XTTableDescRec, *XTTableDescPtr;
+
+
+typedef struct XTFilesOfTable {
+	int						ft_state;
+	XTPathStrPtr			ft_tab_name;
+	xtTableID				ft_tab_id;
+	char					ft_file_path[PATH_MAX];
+} XTFilesOfTableRec, *XTFilesOfTablePtr;
+
+typedef struct XTRestrictItem {
+	xtTableID				ri_tab_id;
+	xtRecordID				ri_rec_id;
+} XTRestrictItemRec, *XTRestrictItemPtr;
+
+int					xt_tab_compare_names(const char *n1, const char *n2);
+int					xt_tab_compare_paths(char *n1, char *n2);
+void				xt_tab_init_db(struct XTThread *self, struct XTDatabase *db);
+void				xt_tab_exit_db(struct XTThread *self, struct XTDatabase *db);
+void				xt_tab_check_free_lists(struct XTThread *self, XTOpenTablePtr ot, bool check_recs, bool correct_count);
+void				xt_check_tables(struct XTThread *self);
+
+char				*xt_tab_file_to_name(size_t size, char *tab_name, char *file_name);
+
+void				xt_create_table(struct XTThread *self, XTPathStrPtr name, XTDictionaryPtr dic);
+XTTableHPtr			xt_use_table(struct XTThread *self, XTPathStrPtr name, xtBool no_load, xtBool missing_ok);
+void				xt_sync_flush_table(struct XTThread *self, XTOpenTablePtr ot);
+xtBool				xt_flush_record_row(XTOpenTablePtr ot, off_t *bytes_flushed, xtBool have_table_loc);
+void				xt_flush_table(struct XTThread *self, XTOpenTablePtr ot);
+XTTableHPtr			xt_use_table_no_lock(XTThreadPtr self, struct XTDatabase *db, XTPathStrPtr name, xtBool no_load, xtBool missing_ok, XTDictionaryPtr dic);
+int					xt_use_table_by_id(struct XTThread *self, XTTableHPtr *tab, struct XTDatabase *db, xtTableID tab_id);
+XTOpenTablePtr		xt_open_table(XTTableHPtr tab);
+void				xt_close_table(XTOpenTablePtr ot, xtBool flush, xtBool have_table_lock);
+void				xt_drop_table(struct XTThread *self, XTPathStrPtr name, xtBool drop_db);
+void				xt_check_table(XTThreadPtr self, XTOpenTablePtr tab);
+void				xt_rename_table(struct XTThread *self, XTPathStrPtr old_name, XTPathStrPtr new_name);
+
+void				xt_describe_tables_init(struct XTThread *self, struct XTDatabase *db, XTTableDescPtr td);
+xtBool				xt_describe_tables_next(struct XTThread *self, XTTableDescPtr td);
+void				xt_describe_tables_exit(struct XTThread *self, XTTableDescPtr td);
+
+xtBool				xt_table_exists(struct XTDatabase *db);
+
+void				xt_enum_tables_init(u_int *edx);
+XTTableEntryPtr		xt_enum_tables_next(struct XTThread *self, struct XTDatabase *db, u_int *edx);
+
+void				xt_enum_files_of_tables_init(XTPathStrPtr tab_name, xtTableID tab_id, XTFilesOfTablePtr ft);
+xtBool				xt_enum_files_of_tables_next(XTFilesOfTablePtr ft);
+
+xtBool				xt_tab_seq_init(XTOpenTablePtr ot);
+void				xt_tab_seq_reset(XTOpenTablePtr ot);
+void				xt_tab_seq_exit(XTOpenTablePtr ot);
+xtBool				xt_tab_seq_next(XTOpenTablePtr ot, xtWord1 *buffer, xtBool *eof);
+void				xt_tab_seq_repeat(XTOpenTablePtr ot);
+
+xtBool				xt_tab_new_record(XTOpenTablePtr ot, xtWord1 *buffer);
+xtBool				xt_tab_delete_record(XTOpenTablePtr ot, xtWord1 *buffer);
+xtBool				xt_tab_restrict_rows(XTBasicListPtr list, struct XTThread *thread);
+xtBool				xt_tab_update_record(XTOpenTablePtr ot, xtWord1 *before_buf, xtWord1 *after_buf);
+int					xt_tab_visible(XTOpenTablePtr ot);
+int					xt_tab_read_record(register XTOpenTablePtr ot, xtWord1 *buffer);
+int					xt_tab_dirty_read_record(register XTOpenTablePtr ot, xtWord1 *buffer);
+void				xt_tab_load_row_pointers(XTThreadPtr self, XTOpenTablePtr ot);
+void				xt_tab_load_table(struct XTThread *self, XTOpenTablePtr ot);
+xtBool				xt_tab_load_record(register XTOpenTablePtr ot, xtRecordID rec_id, XTInfoBufferPtr rec_buf);
+int					xt_tab_remove_record(XTOpenTablePtr ot, xtRecordID rec_id, xtWord1 *rec_data, xtRecordID *prev_var_rec_id, xtBool clean_delete, xtRowID row_id, xtXactID xn_id);
+int					xt_tab_maybe_committed(XTOpenTablePtr ot, xtRecordID rec_id, xtXactID *xn_id, xtRowID *out_rowid, xtBool *out_updated);
+xtBool				xt_tab_free_record(XTOpenTablePtr ot, u_int status, xtRecordID rec_id, xtBool clean_delete);
+void				xt_tab_store_header(XTOpenTablePtr ot, XTTableHeadDPtr rec_head);
+xtBool				xt_tab_write_header(XTOpenTablePtr ot, XTTableHeadDPtr rec_head, struct XTThread *thread);
+xtBool				xt_tab_write_min_auto_inc(XTOpenTablePtr ot);
+
+xtBool				xt_tab_get_row(register XTOpenTablePtr ot, xtRowID row_id, xtRecordID *var_rec_id);
+xtBool				xt_tab_set_row(XTOpenTablePtr ot, u_int status, xtRowID row_id, xtRecordID var_rec_id);
+xtBool				xt_tab_free_row(XTOpenTablePtr ot, XTTableHPtr tab, xtRowID row_id);
+
+xtBool				xt_tab_load_ext_data(XTOpenTablePtr ot, xtRecordID load_rec_id, xtWord1 *buffer, u_int cols_req);
+xtBool				xt_tab_put_rec_data(XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq);
+xtBool				xt_tab_put_eof_rec_data(XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq);
+xtBool				xt_tab_put_log_op_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer);
+xtBool				xt_tab_put_log_rec_data(XTOpenTablePtr ot, u_int status, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *buffer, xtOpSeqNo *op_seq);
+xtBool				xt_tab_get_rec_data(register XTOpenTablePtr ot, xtRecordID rec_id, size_t size, xtWord1 *buffer);
+void				xt_tab_disable_index(XTTableHPtr tab, u_int ind_error);
+void				xt_tab_set_index_error(XTTableHPtr tab);
+
+void				xt_tab_make_table_name(XTTableHPtr tab, char *table_name, size_t size);
+xtBool				xt_tab_is_table_repair_pending(XTTableHPtr tab);
+void				xt_tab_table_repaired(XTTableHPtr tab);
+void				xt_tab_set_table_repair_pending(XTTableHPtr tab);
+
+inline off_t		xt_row_id_to_row_offset(register XTTableHPtr tab, xtRowID row_id)
+{	/* Row IDs are 1-based: row ID 1 maps to offset tab_rows.tci_header_size. */
+	return (off_t) tab->tab_rows.tci_header_size + (off_t) (row_id - 1) * (off_t) tab->tab_rows.tci_rec_size;
+}
+
+inline  xtRowID		xt_row_offset_row_id(register XTTableHPtr tab, off_t rec_offs)
+{	/* Converts a row offset back to its 1-based row ID; DEBUG builds print a message if the offset is not record-aligned. */
+#ifdef DEBUG
+	if (((rec_offs - (off_t) tab->tab_rows.tci_header_size) % (off_t) tab->tab_rows.tci_rec_size) != 0) {
+		printf("ERROR! Not a valid record offset!\n");
+	}
+#endif
+	return (xtRowID) ((rec_offs - (off_t) tab->tab_rows.tci_header_size) / (off_t) tab->tab_rows.tci_rec_size) + 1;
+}
+
+inline off_t		xt_rec_id_to_rec_offset(register XTTableHPtr tab, xtRefID ref_id)
+{	/* Record ID 0 maps to offset 0; any other ID is 1-based, with ID 1 at tab_recs.tci_header_size. */
+	if (!ref_id)
+		return (off_t) 0;
+	return (off_t) tab->tab_recs.tci_header_size + (off_t) (ref_id-1) * (off_t) tab->tab_recs.tci_rec_size;
+}
+
+inline  xtRefID		xt_rec_offset_rec_id(register XTTableHPtr tab, off_t ref_offs)
+{	/* Converts a record offset back to its 1-based record ID; offset 0 maps to ID 0. */
+	if (!ref_offs)
+		return (xtRefID) 0;
+#ifdef DEBUG
+	if (((ref_offs - (off_t) tab->tab_recs.tci_header_size) % (off_t) tab->tab_recs.tci_rec_size) != 0) {
+		printf("ERROR! Not a valid record offset!\n");
+	}
+#endif
+		
+	return (xtRefID) ((ref_offs - (off_t) tab->tab_recs.tci_header_size) / (off_t) tab->tab_recs.tci_rec_size)+1;
+}
+
+inline off_t		xt_ind_node_to_offset(register XTTableHPtr tab, xtIndexNodeID node_id)
+{	/* Node ID 0 maps to offset 0; any other ID is 1-based, with node 1 at tab_index_header_size. */
+	if (!XT_NODE_ID(node_id))
+		return (off_t) 0;
+	return (off_t) tab->tab_index_header_size + (off_t) (XT_NODE_ID(node_id)-1) * (off_t) tab->tab_index_page_size;
+}
+
+inline xtIndexNodeID xt_ind_offset_to_node(register XTTableHPtr tab, off_t ind_offs)
+{	/* Converts an index file offset back to its 1-based node ID; offset 0 maps to node ID 0. */
+	XT_NODE_TEMP;
+
+	if (!ind_offs)
+		return XT_RET_NODE_ID(0);
+#ifdef DEBUG
+	if (((ind_offs - (off_t) tab->tab_index_header_size) % (off_t) tab->tab_index_page_size) != 0) {
+		printf("ERROR! Not a valid index offset!\n");
+	}
+#endif
+		
+	return XT_RET_NODE_ID(((ind_offs - (off_t) tab->tab_index_header_size) / (off_t) tab->tab_index_page_size)+1);
+}
+
+#define XT_RESIZE_ROW_BUFFER(thr, rb, size) \
+	do { /* Grow-only resize. NOTE: rb and size are each evaluated more than once. */ \
+		if ((rb)->rb_size < (size)) { \
+			xt_realloc((thr), (void **) &(rb)->x.rb_buffer, (size)); \
+			(rb)->rb_size = (size); \
+		} \
+	} \
+	while (0)
+
+#endif
+
diff --git a/storage/pbxt/src/thread_xt.cc b/storage/pbxt/src/thread_xt.cc
new file mode 100644
index 00000000000..52c2c6c29c5
--- /dev/null
+++ b/storage/pbxt/src/thread_xt.cc
@@ -0,0 +1,2349 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#ifndef XT_WIN
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
+#include <time.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <errno.h>
+
+#include "xt_defs.h"
+#include "strutil_xt.h"
+#include "pthread_xt.h"
+#include "thread_xt.h"
+#include "memory_xt.h"
+#include "sortedlist_xt.h"
+#include "trace_xt.h"
+#include "myxt_xt.h"
+#include "database_xt.h"
+
+void xt_db_init_thread(XTThreadPtr self, XTThreadPtr new_thread);
+void xt_db_exit_thread(XTThreadPtr self);
+
+static void thr_accumulate_statistics(XTThreadPtr self);
+
+/*
+ * -----------------------------------------------------------------------
+ * THREAD GLOBALS
+ */
+
+xtPublic u_int			xt_thr_maximum_threads;
+xtPublic u_int			xt_thr_current_thread_count;
+xtPublic u_int			xt_thr_current_max_threads;
+
+/* This structure is a doubly linked list of threads, with a wait
+ * condition on it.
+ */
+static XTLinkedListPtr	thr_list;
+
+/* This structure maps thread ID's to thread pointers. */
+xtPublic XTThreadPtr	*xt_thr_array;
+static xt_mutex_type	thr_array_lock;
+
+/* Global accumulated statistics: */
+static XTStatisticsRec	thr_statistics;
+
+#ifdef DEBUG
+static void break_in_assertion(c_char *expr, c_char *func, c_char *file, u_int line)
+{	/* Prints "func(file:line) expr" to stdout; called by xt_assert() in DEBUG builds (presumably the place to set a breakpoint). */
+	printf("%s(%s:%d) %s\n", func, file, (int) line, expr);
+}
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Error logging
+ */
+
+static xt_mutex_type	log_mutex;
+static int				log_level = 0;
+static FILE				*log_file = NULL;
+static xtBool			log_newline = TRUE;
+
+xtPublic xtBool xt_init_logging(void)
+{
+	/* Logs to stdout at trace level; returns FALSE if the log mutex or the trace facility cannot be set up. */
+	int mutex_err;
+	log_file = stdout;
+	log_level = XT_LOG_TRACE;
+	if ((mutex_err = xt_p_mutex_init_with_autoname(&log_mutex, NULL)) != 0) {
+		/* NOTE(review): this reports through the logger, which locks log_mutex although its init just failed -- confirm. */
+		xt_log_errno(XT_NS_CONTEXT, mutex_err);
+		log_file = NULL;
+		log_level = 0;
+		return FALSE;
+	}
+	if (xt_init_trace())
+		return TRUE;
+	/* Trace setup failed: undo the logging setup done above. */
+	xt_exit_logging();
+	return FALSE;
+}
+
+xtPublic void xt_exit_logging(void)
+{
+	if (log_file != NULL) {	/* a non-NULL log_file marks the mutex as still to be freed */
+		xt_free_mutex(&log_mutex);
+		log_file = NULL;
+	}
+	xt_exit_trace();	/* the trace facility is shut down in every case */
+}
+
+xtPublic void xt_get_now(char *buffer, size_t len)
+{
+	/* Writes the local time as "yymmdd HH:MM:SS" into buffer (len bytes), or an error text if time() fails. */
+	time_t		ticks = time(NULL);
+	struct tm	ltime;
+	if (ticks == (time_t) -1) {
+#ifdef XT_WIN
+		/* Was printf(buffer, ...): that used the unset buffer as the format string and never filled it. */
+		if (len > 0) { _snprintf(buffer, len - 1, "** error %d getting time **", errno); buffer[len - 1] = 0; }
+#else
+		snprintf(buffer, len, "** error %d getting time **", errno);
+#endif
+		return;
+	}
+	localtime_r(&ticks, &ltime);
+	strftime(buffer, len, "%y%m%d %H:%M:%S", &ltime);
+}
+
+static void thr_log_newline(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level)
+{
+	/* Writes the header that starts a log line: time, level tag, thread name and, if given, the source location. */
+	c_char	*level_str;
+	char	time_str[200];
+	char	thr_name[XT_THR_NAME_SIZE+3];
+
+	xt_get_now(time_str, sizeof(time_str));
+	if (self && *self->t_name) {
+		xt_strcpy(XT_THR_NAME_SIZE+3, thr_name, " ");
+		xt_strcat(XT_THR_NAME_SIZE+3, thr_name, self->t_name);
+	}
+	else
+		thr_name[0] = 0;
+	switch (level) {
+		case XT_LOG_FATAL: level_str = " [Fatal]"; break;
+		case XT_LOG_ERROR: level_str = " [Error]"; break;
+		case XT_LOG_WARNING: level_str = " [Warning]"; break;
+		case XT_LOG_INFO: level_str = " [Note]"; break;
+		case XT_LOG_TRACE: level_str = " [Trace]"; break;
+		default: level_str = " "; break;
+	}
+	if (func && *func && *func != '-') {	/* a name starting with '-' counts as "no function" */
+		char func_name[XT_MAX_FUNC_NAME_SIZE];
+		xt_strcpy_term(XT_MAX_FUNC_NAME_SIZE, func_name, func, '(');
+		if (file && *file)	/* line is a u_int, so it is printed with %u (was %d) */
+			fprintf(log_file, "%s%s%s %s(%s:%u) ", time_str, level_str, thr_name, func_name, xt_last_name_of_path(file), line);
+		else
+			fprintf(log_file, "%s%s%s %s() ", time_str, level_str, thr_name, func_name);
+	}
+	else {
+		if (file && *file)
+			fprintf(log_file, "%s%s%s [%s:%u] ", time_str, level_str, thr_name, xt_last_name_of_path(file), line);
+		else
+			fprintf(log_file, "%s%s%s ", time_str, level_str, thr_name);
+	}
+}
+
+#ifdef XT_WIN
+/* Windows uses printf()!! */
+#define DEFAULT_LOG_BUFFER_SIZE			2000
+#else
+#ifdef DEBUG
+#define DEFAULT_LOG_BUFFER_SIZE			10
+#else
+#define DEFAULT_LOG_BUFFER_SIZE			2000
+#endif
+#endif
+
+void xt_log_flush(XTThreadPtr XT_UNUSED(self))
+{	/* Flushes buffered output on the log stream; self is unused and log_mutex is not taken here. */
+	fflush(log_file);
+}
+
+/*
+ * Log the given formatted string information to the log file.
+ * Before each new line, this function writes the
+ * log header, which includes the time, log level,
+ * and source file and line number (optional).
+ */
+static void thr_log_va(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, c_char *fmt, va_list ap)
+{
+	char buffer[DEFAULT_LOG_BUFFER_SIZE];
+	char *log_string = NULL;
+
+	if (level > log_level)
+		return;
+
+	xt_lock_mutex_ns(&log_mutex);
+
+#ifdef XT_WIN
+	/* Bounded formatting: the unchecked vsprintf() used here before could overrun buffer. */
+	_vsnprintf(buffer, DEFAULT_LOG_BUFFER_SIZE-1, fmt, ap);
+	buffer[DEFAULT_LOG_BUFFER_SIZE-1] = 0;
+	log_string = buffer;
+#else
+#if !defined(va_copy) || defined(XT_SOLARIS)
+	int len = vsnprintf(buffer, DEFAULT_LOG_BUFFER_SIZE-1, fmt, ap);
+	if (len < 0)	/* formatting failed: log nothing rather than write to buffer[-1] */
+		len = 0;
+	else if (len > DEFAULT_LOG_BUFFER_SIZE-1)
+		len = DEFAULT_LOG_BUFFER_SIZE-1;
+	buffer[len] = 0;
+	log_string = buffer;
+#else
+	/* Use the buffer, unless it is too small */
+	va_list ap2;
+	int bufsize;
+
+	va_copy(ap2, ap);
+	bufsize = vsnprintf(buffer, DEFAULT_LOG_BUFFER_SIZE, fmt, ap);
+	if (bufsize >= DEFAULT_LOG_BUFFER_SIZE) {
+		/* Too long for the stack buffer: format again into a heap buffer (message is dropped if malloc fails). */
+		log_string = (char *) malloc(bufsize + 1);
+		if (log_string && vsnprintf(log_string, bufsize + 1, fmt, ap2) > bufsize) {
+			free(log_string);
+			log_string = NULL;
+		}
+	}
+	else if (bufsize >= 0)	/* a negative result means buffer holds nothing usable */
+		log_string = buffer;
+	va_end(ap2);	/* every va_copy() needs a matching va_end() */
+#endif
+#endif
+
+	if (log_string) {
+		char *str = log_string, *str_end, tmp_ch;
+
+		while (*str) {
+			if (log_newline) {
+				thr_log_newline(self, func, file, line, level);
+				log_newline = FALSE;
+			}
+			/* Cut the text just after the next '\n' so that it is written one line at a time. */
+			if ((str_end = strchr(str, '\n'))) {
+				str_end++;
+				log_newline = TRUE;
+			}
+			else
+				str_end = str + strlen(str);
+			tmp_ch = *str_end;
+			*str_end = 0;
+			fprintf(log_file, "%s", str);
+			fflush(log_file);
+			*str_end = tmp_ch;
+			str = str_end;
+		}
+		if (log_string != buffer)
+			free(log_string);
+	}
+
+	xt_unlock_mutex_ns(&log_mutex);
+}
+
+xtPublic void xt_logf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, c_char *fmt, ...)
+{
+	/* printf-style front end: bundles the variable arguments and hands them to thr_log_va(). */
+	va_list	args;
+	va_start(args, fmt);
+	thr_log_va(self, func, file, line, level, fmt, args);
+	va_end(args);
+}
+
+xtPublic void xt_log(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, c_char *string)
+{	/* Logs string as is: it is passed as a "%s" argument, so '%' characters in it are not interpreted. */
+	xt_logf(self, func, file, line, level, "%s", string);
+}
+
+static int thr_log_error_va(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, int xt_err, int sys_err, c_char *fmt, va_list ap)
+{
+	/* Logs "<label>[ (sys_err)]: <formatted message>\n" and returns the log level actually used. */
+	int		fallback_level = XT_LOG_ERROR;
+	char	err_label[50];
+
+	err_label[0] = 0;
+	switch (xt_err) {
+		case XT_ASSERTION_FAILURE:
+			strcpy(err_label, "Assertion");
+			fallback_level = XT_LOG_FATAL;
+			break;
+		case XT_SYSTEM_ERROR:
+			strcpy(err_label, "errno");
+			break;
+		case XT_SIGNAL_CAUGHT:
+			strcpy(err_label, "Signal");
+			break;
+		default:
+			sprintf(err_label, "%d", xt_err);
+			break;
+	}
+	/* XT_LOG_DEFAULT selects the level that belongs to the kind of error. */
+	if (level == XT_LOG_DEFAULT)
+		level = fallback_level;
+
+	if (err_label[0] != 0) {
+		if (!sys_err)
+			xt_logf(self, func, file, line, level, "%s: ", err_label);
+		else
+			xt_logf(self, func, file, line, level, "%s (%d): ", err_label, sys_err);
+	}
+	/* The message itself, followed by the newline that ends the entry. */
+	thr_log_va(self, func, file, line, level, fmt, ap);
+	xt_logf(self, func, file, line, level, "\n");
+	return level;
+}
+
+/* Logs an error with a printf-style message. The function returns the actual log level used. */
+xtPublic int xt_log_errorf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, int xt_err, int sys_err, c_char *fmt, ...)
+{
+	va_list	args;
+	int		used_level;
+	va_start(args, fmt);
+	used_level = thr_log_error_va(self, func, file, line, level, xt_err, sys_err, fmt, args);
+	va_end(args);
+	return used_level;
+}
+
+/* Logs an error whose message is a plain string (passed as a "%s" argument). The function returns the actual log level used. */
+xtPublic int xt_log_error(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, int xt_err, int sys_err, c_char *string)
+{
+	return xt_log_errorf(self, func, file, line, level, xt_err, sys_err, "%s", string);
+}
+
+xtPublic void xt_log_exception(XTThreadPtr self, XTExceptionPtr e, int level)
+{
+	int used_level;
+
+	/* Log the error stored in e; the level this returns is reused for the trace below. */
+	used_level = xt_log_error(self,
+		e->e_func_name, e->e_source_file, e->e_source_line,
+		level,
+		e->e_xt_err, e->e_sys_err,
+		e->e_err_msg);
+
+	/* Dump the catch trace (if there is one) at the same level: */
+	if (e->e_catch_trace[0] != 0)
+		xt_logf(self, NULL, NULL, 0, used_level, "%s", e->e_catch_trace);
+}
+
+xtPublic void xt_log_and_clear_exception(XTThreadPtr self)
+{	/* Logs the thread's current exception at its default level (XT_LOG_DEFAULT), then clears it. */
+	xt_log_exception(self, &self->t_exception, XT_LOG_DEFAULT);
+	xt_clear_exception(self);
+}
+
+xtPublic void xt_log_and_clear_exception_ns(void)
+{	/* Same as xt_log_and_clear_exception(), for the thread returned by xt_get_self(). */
+	xt_log_and_clear_exception(xt_get_self());
+}
+
+xtPublic void xt_log_and_clear_warning(XTThreadPtr self)
+{	/* Logs the thread's current exception at warning level (XT_LOG_WARNING), then clears it. */
+	xt_log_exception(self, &self->t_exception, XT_LOG_WARNING);
+	xt_clear_exception(self);
+}
+
+xtPublic void xt_log_and_clear_warning_ns(void)
+{	/* Same as xt_log_and_clear_warning(), for the thread returned by xt_get_self(). */
+	xt_log_and_clear_warning(xt_get_self());
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Exceptions
+ */
+
+static void thr_add_catch_trace(XTExceptionPtr e, c_char *func, c_char *file, u_int line)
+{	/* Appends one "func(file:line)\n" entry to e->e_catch_trace; parts that are not given are left out. */
+	xtBool	named = func && *func && *func != '-';
+	char	line_str[40];
+	if (named) {
+		xt_strcat_term(XT_CATCH_TRACE_SIZE, e->e_catch_trace, func, '(');
+		xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, "(");
+	}
+	if (file && *file) {
+		xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, xt_last_name_of_path(file));
+		if (line != 0) {
+			sprintf(line_str, "%u", line);
+			xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, ":");
+			xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, line_str);
+		}
+	}
+	if (named)
+		xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, ")");
+	xt_strcat(XT_CATCH_TRACE_SIZE, e->e_catch_trace, "\n");
+}
+
+static void thr_save_error_va(XTExceptionPtr e, XTThreadPtr self, xtBool throw_it, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, va_list ap)
+{	/* Fills *e with the error codes, formatted message, source location and catch trace; throws on self if throw_it is set. */
+	int i;
+
+	if (!e)
+		return;
+
+	e->e_xt_err = xt_err;
+	e->e_sys_err = sys_err;
+	vsnprintf(e->e_err_msg, XT_ERR_MSG_SIZE, fmt, ap);
+
+	/* Make the first character of the message upper case: */
+	/* Only 'a'..'z' is converted; this did not work for foreign languages! */
+	if (e->e_err_msg[0] >= 'a' && e->e_err_msg[0] <= 'z')
+		e->e_err_msg[0] = (char) toupper(e->e_err_msg[0]);
+
+	if (func && *func && *func != '-')
+		xt_strcpy_term(XT_MAX_FUNC_NAME_SIZE, e->e_func_name, func, '(');
+	else
+		*e->e_func_name = 0;
+	if (file && *file) {
+		xt_strcpy(XT_SOURCE_FILE_NAME_SIZE, e->e_source_file, xt_last_name_of_path(file));
+		e->e_source_line = line;
+	}
+	else {
+		*e->e_source_file = 0;
+		e->e_source_line = 0;
+	}
+	*e->e_catch_trace = 0;
+
+	if (!self)	/* without a thread there is no call stack to trace and nothing to throw on */
+		return;
+
+	/* Create a stack trace for this exception: the error location first, then the call stack from the top down. */
+	thr_add_catch_trace(e, func, file, line);
+	for (i=self->t_call_top-1; i>=0; i--)
+		thr_add_catch_trace(e, self->t_call_stack[i].cs_func, self->t_call_stack[i].cs_file, self->t_call_stack[i].cs_line);
+
+	if (throw_it)	/* xt_throw() does a longjmp if a jump environment is set up, otherwise it returns */
+		xt_throw(self);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * THROWING EXCEPTIONS
+ */
+
+/* If we have to allocate resources and then hold them temporarily, during which
+ * time an exception could occur, then these functions provide a holding
+ * place for the data, which will be freed in the case of an exception.
+ *
+ * Note: the free functions could themselves allocate resources.
+ * To make sure all things work out we only remove the resource from
+ * the stack when it is freed.
+ */
+static void thr_free_resources(XTThreadPtr self, XTResourcePtr top)
+{
+	XTResourcePtr		rp;
+	XTThreadFreeFunc	free_func;
+
+	if (!top)
+		return;
+	while (self->t_res_top > top) {
+		/* Pop the top resource: it lies r_prev_size bytes below the current top. */
+		rp = (XTResourcePtr) (((char *) self->t_res_top) - self->t_res_top->r_prev_size);
+
+		/* Free the resource: the function pointer is cleared first, so it is called only once. */
+		if (rp->r_free_func) {
+			free_func = rp->r_free_func;
+			rp->r_free_func = NULL;
+			free_func(self, rp->r_data);
+		}
+
+		self->t_res_top = rp;
+	}
+}
+
+xtPublic void xt_bug(XTThreadPtr XT_UNUSED(self))
+{	/* Only resets a static pointer to NULL, nothing else (presumably a hook for a debugger -- TODO confirm). */
+	static int *bug_ptr = NULL;
+	
+	bug_ptr = NULL;
+}
+
+/*
+ * This function is called when an exception is caught.
+ * It restores the function call top and frees
+ * any resource allocated by lower levels.
+ */
+xtPublic void xt_caught(XTThreadPtr self)
+{
+	/* Restore the call top: */
+	self->t_call_top = self->t_jmp_env[self->t_jmp_depth].jb_call_top;
+
+	/* Free the temporary data that would otherwise be lost.
+	 * This should do nothing, because we actually free things on throw
+	 * (see xt_throw() below).
+	 */
+	thr_free_resources(self, self->t_jmp_env[self->t_jmp_depth].jb_res_top);
+}
+
+/* Throw an already registered error: jumps to the innermost jump environment, or returns if there is none. */
+xtPublic void xt_throw(XTThreadPtr self)
+{
+	if (self) {
+		ASSERT_NS(self->t_exception.e_xt_err);
+		if (self->t_jmp_depth > 0 && self->t_jmp_depth <= XT_MAX_JMP) {
+			/* As recommended by Barry: free the resources before the stack is invalid! */
+			thr_free_resources(self, self->t_jmp_env[self->t_jmp_depth-1].jb_res_top);
+
+			/* Then do the longjmp: */
+			longjmp(self->t_jmp_env[self->t_jmp_depth-1].jb_buffer, 1);
+		}
+	}
+
+	/*
+	 * We cannot throw an error, because it will not be caught.
+	 * This means there is no try ... catch block above.
+	 * In this case, we just return.
+	 * The calling functions must handle errors...
+	xt_caught(self);
+	xt_log(XT_CONTEXT, XT_LOG_FATAL, "Uncaught exception\n");
+	xt_exit_thread(self, NULL);
+	*/
+}
+
+xtPublic void xt_throwf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, ...)
+{
+	/* With self == NULL the error is only saved on the thread from xt_get_self() and not thrown. */
+	va_list		args;
+	XTThreadPtr	target = self ? self : xt_get_self();
+	va_start(args, fmt);
+	thr_save_error_va(target ? &target->t_exception : NULL, target, self ? TRUE : FALSE, func, file, line, xt_err, sys_err, fmt, args);
+	va_end(args);
+}
+
+xtPublic void xt_throw_error(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *msg)
+{	/* Throws with a ready-made message: msg is passed as a "%s" argument and is not interpreted as a format. */
+	xt_throwf(self, func, file, line, xt_err, sys_err, "%s", msg);
+}
+
+#define XT_SYS_ERR_SIZE		300
+
+#ifdef XT_WIN
+static c_char *thr_get_sys_error(int err, char *err_msg)
+#else
+static c_char *thr_get_sys_error(int err, char *XT_UNUSED(err_msg))
+#endif
+{	/* Returns the text for system error err; only the Windows build writes it into err_msg (XT_SYS_ERR_SIZE bytes). */
+#ifdef XT_WIN
+	char *ptr;
+
+	if (!FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM, NULL,
+		err, 0, err_msg, XT_SYS_ERR_SIZE, NULL)) {
+		return strerror(err);	/* no system message available: fall back to the C library text */
+	}
+
+	ptr = &err_msg[strlen(err_msg)];
+	while (ptr-1 > err_msg) {	/* strip trailing '\n', '\r' and '.' characters from the message */
+		if (*(ptr-1) != '\n' && *(ptr-1) != '\r' && *(ptr-1) != '.')
+			break;
+		ptr--;
+	}
+	*ptr = 0;
+return err_msg;
+#else
+	return strerror(err);
+#endif
+}
+
+static c_char *thr_get_err_string(int xt_err)
+{
+	/* Maps an XT error code to its message text. Most texts are printf formats with */
+	/* one or two %s place holders, which the callers fill with their item arguments. */
+	switch (xt_err) {
+		case XT_ERR_STACK_OVERFLOW: return "Stack overflow";
+		case XT_ERR_JUMP_OVERFLOW: return "Jump overflow";
+		case XT_ERR_TABLE_EXISTS: return "Table `%s` already exists";
+		case XT_ERR_NAME_TOO_LONG: return "Name '%s' is too long";
+		case XT_ERR_TABLE_NOT_FOUND: return "Table `%s` not found";
+		case XT_ERR_SESSION_NOT_FOUND: return "Session %s not found";
+		case XT_ERR_BAD_ADDRESS: return "Incorrect address '%s'";
+		case XT_ERR_UNKNOWN_SERVICE: return "Unknown service '%s'";
+		case XT_ERR_UNKNOWN_HOST: return "Host '%s' not found";
+		case XT_ERR_TOKEN_EXPECTED: return "%s expected in place of %s";
+		case XT_ERR_PROPERTY_REQUIRED: return "Property '%s' required";
+		case XT_ERR_DEADLOCK: return "Deadlock, transaction aborted";
+		case XT_ERR_CANNOT_CHANGE_DB: return "Cannot change database while transaction is in progress";
+		case XT_ERR_ILLEGAL_CHAR: return "Illegal character: '%s'";
+		case XT_ERR_UNTERMINATED_STRING: return "Unterminated string: %s";
+		case XT_ERR_SYNTAX: return "Syntax error near %s";
+		case XT_ERR_ILLEGAL_INSTRUCTION: return "Illegal instruction";
+		case XT_ERR_OUT_OF_BOUNDS: return "Memory reference out of bounds";
+		case XT_ERR_STACK_UNDERFLOW: return "Stack underflow";
+		case XT_ERR_TYPE_MISMATCH: return "Type mismatch";
+		case XT_ERR_ILLEGAL_TYPE: return "Illegal type for operator";
+		case XT_ERR_ID_TOO_LONG: return "Identifier too long: %s";
+		case XT_ERR_TYPE_OVERFLOW: return "Type overflow: %s";
+		case XT_ERR_TABLE_IN_USE: return "Table `%s` in use";
+		case XT_ERR_NO_DATABASE_IN_USE: return "No database in use";
+		case XT_ERR_CANNOT_RESOLVE_TYPE: return "Cannot resolve type with ID: %s";
+		case XT_ERR_BAD_INDEX_DESC: return "Unsupported index description: %s";
+		case XT_ERR_WRONG_NO_OF_VALUES: return "Incorrect number of values";
+		case XT_ERR_CANNOT_OUTPUT_VALUE: return "Cannot output given type";
+		case XT_ERR_COLUMN_NOT_FOUND: return "Column `%s.%s` not found";
+		case XT_ERR_NOT_IMPLEMENTED: return "Not implemented";
+		case XT_ERR_UNEXPECTED_EOS: return "Connection unexpectedly lost";
+		case XT_ERR_BAD_TOKEN: return "Incorrect binary token";
+		case XT_ERR_RES_STACK_OVERFLOW: return "Internal error: resource stack overflow";
+		case XT_ERR_BAD_INDEX_TYPE: return "Unsupported index type: %s";
+		case XT_ERR_INDEX_EXISTS: return "Index '%s' already exists";
+		case XT_ERR_INDEX_STRUC_EXISTS: return "Index '%s' has an identical structure";
+		case XT_ERR_INDEX_NOT_FOUND: return "Index '%s' not found";
+		case XT_ERR_INDEX_CORRUPT: return "Cannot read index '%s'";
+		case XT_ERR_TYPE_NOT_SUPPORTED: return "Data type %s not supported";
+		case XT_ERR_BAD_TABLE_VERSION: return "Table `%s` version not supported, upgrade required";
+		case XT_ERR_BAD_RECORD_FORMAT: return "Record format unknown, either corrupted or upgrade required";
+		case XT_ERR_BAD_EXT_RECORD: return "Extended record part does not match reference";
+		case XT_ERR_RECORD_CHANGED: return "Record already updated, transaction aborted";
+		case XT_ERR_XLOG_WAS_CORRUPTED: return "Corrupted transaction log has been truncated";
+		case XT_ERR_DUPLICATE_KEY: return "Duplicate unique key";
+		case XT_ERR_NO_DICTIONARY: return "Table `%s` has not yet been opened by MySQL";
+		case XT_ERR_TOO_MANY_TABLES: return "Limit of %s tables per database exceeded";
+		case XT_ERR_KEY_TOO_LARGE: return "Index '%s' exceeds the key size limit of %s";
+		case XT_ERR_MULTIPLE_DATABASES: return "Multiple database in a single transaction is not permitted";
+		case XT_ERR_NO_TRANSACTION: return "Internal error: no transaction running";
+		case XT_ERR_A_EXPECTED_NOT_B: return "%s expected in place of %s";
+		case XT_ERR_NO_MATCHING_INDEX: return "Matching index required for '%s'";
+		case XT_ERR_TABLE_LOCKED: return "Table `%s` locked";
+		case XT_ERR_NO_REFERENCED_ROW: return "Constraint: `%s`";  // "Foreign key '%s', referenced row does not exist"
+		case XT_ERR_ROW_IS_REFERENCED: return "Constraint: `%s`";  // "Foreign key '%s', has a referencing row"
+		case XT_ERR_BAD_DICTIONARY: return "Internal dictionary does not match MySQL dictionary";
+		case XT_ERR_LOADING_MYSQL_DIC: return "Error loading %s.frm file, MySQL error: %s";
+		case XT_ERR_COLUMN_IS_NOT_NULL: return "Column `%s` is NOT NULL";
+		case XT_ERR_INCORRECT_NO_OF_COLS: return "Incorrect number of columns near %s";
+		case XT_ERR_FK_ON_TEMP_TABLE: return "Cannot create foreign key on temporary table";
+		case XT_ERR_REF_TABLE_NOT_FOUND: return "Referenced table `%s` not found";
+		case XT_ERR_REF_TYPE_WRONG: return "Incorrect data type on referenced column `%s`";
+		case XT_ERR_DUPLICATE_FKEY: return "Duplicate unique foreign key, contraint: %s";
+		case XT_ERR_INDEX_FILE_TO_LARGE: return "Index file has grown too large: %s";
+		case XT_ERR_UPGRADE_TABLE: return "Table `%s` must be upgraded from PBXT version %s";
+		case XT_ERR_INDEX_NEW_VERSION: return "Table `%s` index created by a newer version, upgrade required";
+		case XT_ERR_LOCK_TIMEOUT: return "Lock timeout on table `%s`";
+		case XT_ERR_CONVERSION: return "Error converting value for column `%s.%s`";
+		case XT_ERR_NO_ROWS: return "No matching row found in table `%s`";
+		case XT_ERR_DATA_LOG_NOT_FOUND: return "Data log not found: '%s'";
+		case XT_ERR_LOG_MAX_EXCEEDED: return "Maximum log count, %s, exceeded";
+		case XT_ERR_MAX_ROW_COUNT: return "Maximum row count reached";
+		case XT_ERR_FILE_TOO_LONG: return "File cannot be mapped, too large: '%s'";
+		case XT_ERR_BAD_IND_BLOCK_SIZE: return "Table `%s`, incorrect index block size: %s";
+		case XT_ERR_INDEX_CORRUPTED: return "Table `%s` index is corrupted, REPAIR TABLE required";
+		case XT_ERR_NO_INDEX_CACHE: return "Not enough index cache memory to handle concurrent updates";
+		case XT_ERR_INDEX_LOG_CORRUPT: return "Index log corrupt: '%s'";
+		case XT_ERR_TOO_MANY_THREADS: return "Too many threads: %s, increase pbxt_max_threads";
+		case XT_ERR_TOO_MANY_WAITERS: return "Too many waiting threads: %s";
+		case XT_ERR_INDEX_OLD_VERSION: return "Table `%s` index created by an older version, REPAIR TABLE required";
+		case XT_ERR_PBXT_TABLE_EXISTS: return "System table cannot be dropped because PBXT table still exists";
+		case XT_ERR_SERVER_RUNNING: return "A server is possibly already running";
+		case XT_ERR_INDEX_MISSING: return "Index file of table '%s' is missing";
+		case XT_ERR_RECORD_DELETED: return "Record was deleted";
+		case XT_ERR_NEW_TYPE_OF_XLOG: return "Transaction log %s, is using a newer format, upgrade required";
+		case XT_ERR_NO_BEFORE_IMAGE: return "Internal error: no before image";
+		case XT_ERR_FK_REF_TEMP_TABLE: return "Foreign key may not reference temporary table";
+		case XT_ERR_MYSQL_SHUTDOWN: return "Cannot open table, MySQL has shutdown";
+		case XT_ERR_MYSQL_NO_THREAD: return "Cannot create thread, MySQL has shutdown";
+		case XT_ERR_BUFFER_TOO_SMALL: return "System backup buffer too small";
+		case XT_ERR_BAD_BACKUP_FORMAT: return "Unknown or corrupt backup format, restore aborted";
+		case XT_ERR_PBXT_NOT_INSTALLED: return "PBXT plugin is not installed";
+		default: break;
+	}
+	return "Unknown XT error";
+}
+
+xtPublic void xt_throw_i2xterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item, c_char *item2)
+{	/* Throws XT error xt_err; its message text is used as the format, with item and item2 as the arguments. */
+	xt_throwf(self, func, file, line, xt_err, 0, thr_get_err_string(xt_err), item, item2);
+}
+
+xtPublic void xt_throw_ixterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item)
+{	/* One-item variant of xt_throw_i2xterr(): the second item is passed as NULL. */
+	xt_throw_i2xterr(self, func, file, line, xt_err, item, NULL);
+}
+
+xtPublic void xt_throw_tabcolerr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item, c_char *item2)
+{
+	/* The first message item is "<2nd last name>.<last name>" of the table path; item2 is passed through. */
+	char tab_name[XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + 3];
+
+	xt_2nd_last_name_of_path(sizeof(tab_name), tab_name, tab_item->ps_path);
+	xt_strcat(sizeof(tab_name), tab_name, ".");
+	xt_strcat(sizeof(tab_name), tab_name, xt_last_name_of_path(tab_item->ps_path));
+	xt_throw_i2xterr(self, func, file, line, xt_err, tab_name, item2);
+}
+
+xtPublic void xt_throw_taberr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item)
+{
+	/* The message item is "<2nd last name>.<last name>" of the table path. */
+	char tab_name[XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + 3];
+
+	xt_2nd_last_name_of_path(sizeof(tab_name), tab_name, tab_item->ps_path);
+	xt_strcat(sizeof(tab_name), tab_name, ".");
+	xt_strcat(sizeof(tab_name), tab_name, xt_last_name_of_path(tab_item->ps_path));
+	xt_throw_ixterr(self, func, file, line, xt_err, tab_name);
+}
+
+xtPublic void xt_throw_ulxterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, u_long value)
+{
+	/* The message item is the decimal text of value. */
+	char value_str[100];
+	sprintf(value_str, "%lu", value);
+	xt_throw_ixterr(self, func, file, line, xt_err, value_str);
+}
+
+xtPublic void xt_throw_sulxterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item, u_long value)
+{
+	/* The first message item is item, the second is the decimal text of value. */
+	char value_str[100];
+	sprintf(value_str, "%lu", value);
+	xt_throw_i2xterr(self, func, file, line, xt_err, item, value_str);
+}
+
+xtPublic void xt_throw_xterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err)
+{	/* Throws XT error xt_err without message items (the item is passed as NULL). */
+	xt_throw_ixterr(self, func, file, line, xt_err, NULL);
+}
+
+xtPublic void xt_throw_errno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err)
+{
+	/* Throws XT_SYSTEM_ERROR with err as the system error code and its system text as the message. */
+	char sys_msg[XT_SYS_ERR_SIZE];
+	xt_throw_error(self, func, file, line, XT_SYSTEM_ERROR, err, thr_get_sys_error(err, sys_msg));
+}
+
+xtPublic void xt_throw_ferrno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err, c_char *path)
+{
+	/* Like xt_throw_errno(), but the message also names the file: "<system text>: '<path>'". */
+	char sys_msg[XT_SYS_ERR_SIZE];
+	xt_throwf(self, func, file, line, XT_SYSTEM_ERROR, err, "%s: '%s'", thr_get_sys_error(err, sys_msg), path);
+}
+
+xtPublic void xt_throw_assertion(XTThreadPtr self, c_char *func, c_char *file, u_int line, c_char *str)
+{	/* Throws XT_ASSERTION_FAILURE with str as the message. */
+	xt_throw_error(self, func, file, line, XT_ASSERTION_FAILURE, 0, str);
+}
+
+static void xt_log_assertion(XTThreadPtr self, c_char *func, c_char *file, u_int line, c_char *str)
+{	/* Logs str as an XT_ASSERTION_FAILURE at that error's default level; nothing is thrown. */
+	xt_log_error(self, func, file, line, XT_LOG_DEFAULT, XT_ASSERTION_FAILURE, 0, str);
+}
+
+xtPublic void xt_throw_signal(XTThreadPtr self, c_char *func, c_char *file, u_int line, int sig)
+{
+#ifdef XT_WIN
+	/* The Windows build formats the signal number itself instead of calling strsignal(). */
+	char sig_name[100];
+	sprintf(sig_name, "Signal #%d", sig);
+	xt_throw_error(self, func, file, line, XT_SIGNAL_CAUGHT, sig, sig_name);
+#else
+	xt_throw_error(self, func, file, line, XT_SIGNAL_CAUGHT, sig, strsignal(sig));
+#endif
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * REGISTERING EXCEPTIONS
+ */
+
+xtPublic void xt_registerf(c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, ...)
+{
+	/* Saves the error on the thread from xt_get_self() without throwing; does nothing if there is no such thread. */
+	va_list		args;
+	XTThreadPtr	current = xt_get_self();
+	va_start(args, fmt);
+	thr_save_error_va(current ? &current->t_exception : NULL, current, FALSE, func, file, line, xt_err, sys_err, fmt, args);
+	va_end(args);
+}
+
+xtPublic void xt_register_i2xterr(c_char *func, c_char *file, u_int line, int xt_err, c_char *item, c_char *item2)
+{	/* Registers XT error xt_err; its message text is used as the format, with item and item2 as the arguments. */
+	xt_registerf(func, file, line, xt_err, 0, thr_get_err_string(xt_err), item, item2);
+}
+
+xtPublic void xt_register_ixterr(c_char *func, c_char *file, u_int line, int xt_err, c_char *item)
+{	/* One-item variant of xt_register_i2xterr(): the second item is passed as NULL. */
+	xt_register_i2xterr(func, file, line, xt_err, item, NULL);
+}
+
+xtPublic void xt_register_tabcolerr(c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item, c_char *item2)
+{
+	/* The first message item is "<2nd last name>.<last name>" of the table path; item2 is passed through. */
+	char tab_name[XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + 3];
+
+	xt_2nd_last_name_of_path(sizeof(tab_name), tab_name, tab_item->ps_path);
+	xt_strcat(sizeof(tab_name), tab_name, ".");
+	xt_strcat(sizeof(tab_name), tab_name, xt_last_name_of_path(tab_item->ps_path));
+	xt_register_i2xterr(func, file, line, xt_err, tab_name, item2);
+}
+
+xtPublic void xt_register_taberr(c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item)
+{
+	/* The message item is "<2nd last name>.<last name>" of the table path. */
+	char tab_name[XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + XT_IDENTIFIER_NAME_SIZE + 3];
+
+	xt_2nd_last_name_of_path(sizeof(tab_name), tab_name, tab_item->ps_path);
+	xt_strcat(sizeof(tab_name), tab_name, ".");
+	xt_strcat(sizeof(tab_name), tab_name, xt_last_name_of_path(tab_item->ps_path));
+	xt_register_ixterr(func, file, line, xt_err, tab_name);
+}
+
+xtPublic void xt_register_ulxterr(c_char *func, c_char *file, u_int line, int xt_err, u_long value)
+{
+	/* The message item is the decimal text of value. */
+	char value_str[100];
+	sprintf(value_str, "%lu", value);
+	xt_register_ixterr(func, file, line, xt_err, value_str);
+}
+
+xtPublic xtBool xt_register_ferrno(c_char *func, c_char *file, u_int line, int err, c_char *path)
+{
+	/* Registers system error err with the message "<system text>: '<path>'"; always returns FAILED. */
+	char sys_msg[XT_SYS_ERR_SIZE];
+	xt_registerf(func, file, line, XT_SYSTEM_ERROR, err, "%s: '%s'", thr_get_sys_error(err, sys_msg), path);
+	return FAILED;
+}
+
+xtPublic void xt_register_error(c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *msg)
+{	/* Registers an error with a ready-made message: msg is passed as a "%s" argument. */
+	xt_registerf(func, file, line, xt_err, sys_err, "%s", msg);
+}
+
+xtPublic xtBool xt_register_errno(c_char *func, c_char *file, u_int line, int err)
+{
+	/* Registers system error err with its system text as the message; always returns FAILED. */
+	char sys_msg[XT_SYS_ERR_SIZE];
+	xt_register_error(func, file, line, XT_SYSTEM_ERROR, err, thr_get_sys_error(err, sys_msg));
+	return FAILED;
+}
+
+xtPublic void xt_register_xterr(c_char *func, c_char *file, u_int line, int xt_err)
+{	/* Registers XT error xt_err; its message text is stored as is (passed as a "%s" argument, not as a format). */
+	xt_register_error(func, file, line, xt_err, 0, thr_get_err_string(xt_err));
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * CREATING EXCEPTIONS
+ */
+
+xtPublic void xt_exceptionf(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, ...)
+{
+	/* Fills the caller's exception record e with a printf-style message; nothing is thrown. */
+	va_list	args;
+	va_start(args, fmt);
+	thr_save_error_va(e, self, FALSE, func, file, line, xt_err, sys_err, fmt, args);
+	va_end(args);
+}
+
+xtPublic void xt_exception_error(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *msg)
+{	/* Fills e with a ready-made message: msg is passed as a "%s" argument. */
+	xt_exceptionf(e, self, func, file, line, xt_err, sys_err, "%s", msg);
+}
+
+xtPublic xtBool xt_exception_errno(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int err)
+{
+	/* Fills e with system error err and its system text as the message; always returns FAILED. */
+	char sys_msg[XT_SYS_ERR_SIZE];
+	xt_exception_error(e, self, func, file, line, XT_SYSTEM_ERROR, err, thr_get_sys_error(err, sys_msg));
+	return FAILED;
+}
+
+xtPublic void xt_exception_xterr(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err)
+{	/* Fills e with XT error xt_err; its message text is stored as is (passed as a "%s" argument, not as a format). */
+	xt_exception_error(e, self, func, file, line, xt_err, 0, thr_get_err_string(xt_err));
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * LOG ERRORS
+ */
+
+xtPublic void xt_log_errno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err)
+{
+	/* Builds a local exception record for system error err and logs it; the thread's own exception is not touched. */
+	XTExceptionRec sys_exc;
+	xt_exception_errno(&sys_exc, self, func, file, line, err);
+	xt_log_exception(self, &sys_exc, XT_LOG_DEFAULT);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Assertions and failures (one breakpoint for all failures)
+ */
+//#define CRASH_ON_ASSERT
+
+xtPublic xtBool xt_assert(XTThreadPtr self, c_char *expr, c_char *func, c_char *file, u_int line)
+{	/* Handles a failed assertion: DEBUG builds print it, other builds throw it as an exception. Returns FALSE. */
+	(void) self;
+#ifdef DEBUG
+	//xt_set_fflush(TRUE);
+	//xt_dump_trace();
+	break_in_assertion(expr, func, file, line);
+#ifdef CRASH_ON_ASSERT
+	abort();
+#endif
+#ifdef XT_WIN
+	FatalAppExit(0, "Assertion Failed!");
+#endif
+#else
+	xt_throw_assertion(self, func, file, line, expr);	/* returns here if nothing could be thrown */
+#endif
+	return FALSE;
+}
+
+xtPublic xtBool xt_assume(XTThreadPtr self, c_char *expr, c_char *func, c_char *file, u_int line)
+{	/* Unlike xt_assert(), only logs the failed expression; nothing is thrown. Always returns FALSE. */
+	xt_log_assertion(self, func, file, line, expr);
+	return FALSE;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Create and destroy threads
+ */
+
+typedef struct ThreadData {	/* Argument block that xt_thread_main() receives through its void *data parameter. */
+	xtBool			td_started;		/* presumably set once the new thread is running -- TODO confirm */
+	XTThreadPtr		td_thr;			/* thread structure that xt_thread_main() uses as self */
+	void			*(*td_start_routine)(XTThreadPtr self);	/* routine taking the thread pointer; presumably the thread body -- TODO confirm */
+} ThreadDataRec, *ThreadDataPtr;
+
+#ifdef XT_WIN	/* thr_key: thread-local slot holding the current thread's XTThreadPtr (see xt_get_self()). */
+pthread_key(void *, thr_key);
+#else
+static pthread_key_t thr_key;	/* Created by xt_init_threading() with pthread_key_create(). */
+#endif
+
+#ifdef HANDLE_SIGNALS
+/* Signal handler that does nothing; installed for signals to be swallowed. */
+static void thr_ignore_signal(int sig)
+{
+	(void) sig;
+}
+
+/*
+ * Signal handler that turns a signal into an XT exception on the
+ * receiving thread.
+ *
+ * The main thread first passes the signal on to all other threads in the
+ * thread list, and does nothing more for SIGTERM. In every other case the
+ * signal is either thrown at once or, while interrupts are disabled,
+ * stored in t_delayed_signal to be thrown later.
+ */
+static void thr_throw_signal(int sig)
+{
+	XTThreadPtr	thr = xt_get_self();
+
+	if (thr->t_main) {
+		/* The main thread will pass on a signal to all threads: */
+		xt_signal_all_threads(thr, sig);
+		if (sig == SIGTERM)
+			return;
+	}
+
+	if (!thr->t_disable_interrupts) {
+		thr->t_delayed_signal = 0;
+		xt_throw_signal(thr, "thr_throw_signal", NULL, 0, sig);
+		return;
+	}
+
+	/* Interrupts are disabled: remember the signal for later. */
+	thr->t_delayed_signal = sig;
+	thr->t_disable_interrupts = FALSE;	/* Prevent infinite loop */
+}
+
+/*
+ * Install the signal handlers: SIGPIPE and SIGHUP are ignored, SIGQUIT and
+ * SIGTERM (plus SIGILL, SIGBUS and SIGSEGV in non-DEBUG builds) are routed
+ * to thr_throw_signal().
+ *
+ * Returns TRUE on success. On the first sigaction() failure errno is
+ * logged and FALSE is returned.
+ */
+static xtBool thr_setup_signals(void)
+{
+	static const int	ignored_sigs[] = { SIGPIPE, SIGHUP };
+	static const int	thrown_sigs[] = {
+		SIGQUIT, SIGTERM
+#ifndef DEBUG
+		, SIGILL, SIGBUS, SIGSEGV
+#endif
+	};
+	struct sigaction	sa;
+	u_int				idx;
+
+	sigemptyset(&sa.sa_mask);
+	sa.sa_flags = 0;
+
+	sa.sa_handler = thr_ignore_signal;
+	for (idx = 0; idx < sizeof(ignored_sigs) / sizeof(ignored_sigs[0]); idx++) {
+		if (sigaction(ignored_sigs[idx], &sa, NULL) == -1)
+			goto error_occurred;
+	}
+
+	sa.sa_handler = thr_throw_signal;
+	for (idx = 0; idx < sizeof(thrown_sigs) / sizeof(thrown_sigs[0]); idx++) {
+		if (sigaction(thrown_sigs[idx], &sa, NULL) == -1)
+			goto error_occurred;
+	}
+	return TRUE;
+
+	error_occurred:
+	xt_log_errno(XT_NS_CONTEXT, errno);
+	return FALSE;
+}
+#endif
+
+typedef void *(*ThreadMainFunc)(XTThreadPtr self);
+/*
+ * Entry point handed to pthread_create() by xt_run_thread(). 'data' is a
+ * ThreadDataPtr describing the thread to run.
+ *
+ * The new thread records its pthread id, stores itself under thr_key,
+ * flags td_started and then calls the start routine. An exception that
+ * escapes the start routine is logged and cleared. Finally the thread
+ * structure is freed. Returns whatever the start routine returned, or
+ * NULL if it never ran to completion.
+ */
+extern "C" void *xt_thread_main(void *data)
+{
+	ThreadDataPtr	td = (ThreadDataPtr) data;
+	XTThreadPtr		self = td->td_thr;
+	ThreadMainFunc		start_routine;
+	void			*return_data;
+
+	enter_();
+	self->t_pthread = pthread_self();
+	start_routine = td->td_start_routine;	/* Copied now: 'td' is on the stack of xt_run_thread(), which may return once td_started is set. */
+	return_data = NULL;
+
+#ifdef HANDLE_SIGNALS
+	if (!thr_setup_signals())
+		return NULL;	/* NOTE(review): leaves without freeing 'self' or setting td_started - confirm this is intended. */
+#endif
+	/* Register 'self' for xt_get_self() before telling the creator we are running: */
+	try_(a) {
+		if (!xt_set_key((pthread_key_t)thr_key, self, &self->t_exception))
+			throw_();
+		td->td_started = TRUE;
+		return_data = (*start_routine)(self);
+	}
+	catch_(a) {
+		xt_log_and_clear_exception(self);
+	}
+	cont_(a);
+
+	outer_();
+	xt_free_thread(self);
+	
+	/* {MYSQL-THREAD-KILL}
+	 * Clean up any remaining MySQL thread!
+	 */
+	myxt_delete_remaining_thread();
+	return return_data;
+}
+
+/*
+ * Release the custom data attached to 'self' by calling the registered
+ * free function. Nothing happens when no free function is registered.
+ */
+static void thr_free_data(XTThreadPtr self)
+{
+	if (!self->t_free_data)
+		return;
+
+	(*self->t_free_data)(self, self->t_data);
+	self->t_data = NULL;
+}
+
+/*
+ * Attach custom data and its free function to 'self'. Data attached
+ * earlier is released first through the previously registered function.
+ */
+xtPublic void xt_set_thread_data(XTThreadPtr self, void *data, XTThreadFreeFunc free_func)
+{
+	/* Get rid of the old data while its own free function is still set. */
+	thr_free_data(self);
+
+	self->t_data = data;
+	self->t_free_data = free_func;
+}
+/*
+ * Undo thr_init(): free the resources, database state and custom data of
+ * 'self', give up its slot in xt_thr_array (folding its statistics into
+ * thr_statistics), and destroy its condition, mutex and thread list.
+ * The thread structure itself is not freed here.
+ */
+static void thr_exit(XTThreadPtr self)
+{
+	/* Free the thread temporary data. */
+	thr_free_resources(self, (XTResourcePtr) self->x.t_res_stack);
+	xt_db_exit_thread(self);
+	thr_free_data(self);					/* Free custom user data. */
+	/* Slot 0 of xt_thr_array is a dummy, so only ids above 0 refer to a slot: */
+	if (self->t_id > 0) {
+		ASSERT(self->t_id < xt_thr_current_max_threads);
+		xt_lock_mutex(self, &thr_array_lock);
+		pushr_(xt_unlock_mutex, &thr_array_lock);
+		thr_accumulate_statistics(self);
+		xt_thr_array[self->t_id] = NULL;
+		xt_thr_current_thread_count--;
+		if (self->t_id+1 == xt_thr_current_max_threads) {
+			/* We held the highest slot in use, so the current maximum
+			 * can be reduced; this makes operations that scan the array faster!
+			 */
+			u_int i;
+
+			i = self->t_id;
+			for(;;) {	/* Walk down to the highest slot still occupied (slot 0 always is). */
+				if (xt_thr_array[i])
+					break;
+				if (!i)
+					break;
+				i--;
+			}
+			xt_thr_current_max_threads = i+1;
+		}
+		freer_(); // xt_unlock_mutex(&thr_array_lock)
+	}
+
+	xt_free_cond(&self->t_cond);
+	xt_free_mutex(&self->t_lock);
+
+	self->st_thread_list_count = 0;
+	self->st_thread_list_size = 0;
+	if (self->st_thread_list) {
+		xt_free_ns(self->st_thread_list);
+		self->st_thread_list = NULL;
+	}
+}
+/*
+ * Initialise the per-thread state of 'new_thread' and register it in
+ * xt_thr_array, which assigns its t_id. 'self' is the calling thread; it
+ * is used for locking and for throwing.
+ *
+ * Throws XT_ERR_TOO_MANY_THREADS when all slots are taken. On any
+ * exception thr_exit(new_thread) is called and the exception is re-thrown.
+ */
+static void thr_init(XTThreadPtr self, XTThreadPtr new_thread)
+{
+	new_thread->t_res_top = (XTResourcePtr) new_thread->x.t_res_stack;
+
+	new_thread->st_thread_list_count = 0;
+	new_thread->st_thread_list_size = 0;
+	new_thread->st_thread_list = NULL;
+	try_(a) {
+		xt_init_cond(self, &new_thread->t_cond);
+		xt_init_mutex_with_autoname(self, &new_thread->t_lock);
+		/* Claim a slot in xt_thr_array under thr_array_lock: */
+		xt_lock_mutex(self, &thr_array_lock);
+		pushr_(xt_unlock_mutex, &thr_array_lock);
+		/* Both counts include the dummy slot 0 set up by xt_init_threading(). */
+		ASSERT(xt_thr_current_thread_count <= xt_thr_current_max_threads);
+		ASSERT(xt_thr_current_max_threads <= xt_thr_maximum_threads);
+		if (xt_thr_current_thread_count == xt_thr_maximum_threads)
+			xt_throw_ulxterr(XT_CONTEXT, XT_ERR_TOO_MANY_THREADS, (u_long) xt_thr_maximum_threads+1);
+		if (xt_thr_current_thread_count == xt_thr_current_max_threads) {	/* No free slot below the current maximum: append. */
+			new_thread->t_id = xt_thr_current_thread_count;
+			xt_thr_array[new_thread->t_id] = new_thread;
+			xt_thr_current_max_threads++;
+		}
+		else {
+			/* There must be a free slot: */
+			for (u_int i=0; i<xt_thr_current_max_threads; i++) {
+				if (!xt_thr_array[i]) {
+					new_thread->t_id = i;
+					xt_thr_array[i] = new_thread;
+					break;
+				}
+			}
+		}
+		xt_thr_current_thread_count++;
+		freer_(); // xt_unlock_mutex(&thr_array_lock)
+
+		xt_db_init_thread(self, new_thread);
+	}
+	catch_(a) {
+		thr_exit(new_thread);	/* Release whatever was set up before the failure. */
+		throw_();
+	}
+	cont_(a);
+	
+}
+
+/*
+ * Set up the global threading state for (at least) 'max_threads' threads:
+ * the thread key, the thread array with its lock, the main thread and the
+ * thread list.
+ *
+ * The caller of this function automatically becomes the main thread.
+ *
+ * Returns the main thread, or NULL on failure. Failures are logged, and
+ * anything set up so far is released through xt_exit_threading().
+ */
+xtPublic XTThreadPtr xt_init_threading(u_int max_threads)
+{
+	volatile XTThreadPtr	self = NULL;
+	XTExceptionRec			e;
+	int						err;
+
+	/* Align the number of threads: */
+	xt_thr_maximum_threads = xt_align_size(max_threads, XT_XS_LOCK_ALIGN);
+
+#ifdef XT_TRACK_CONNECTIONS
+	if (xt_thr_maximum_threads > XT_TRACK_MAX_CONNS) {
+		xt_log_error(XT_NS_CONTEXT, XT_LOG_FATAL, XT_ERR_TOO_MANY_THREADS, 0, 
+			"XT_TRACK_CONNECTIONS is enabled and xt_thr_maximum_threads > XT_TRACK_MAX_CONNS");
+		goto failed;
+	}
+#endif
+
+#ifdef HANDLE_SIGNALS
+	if (!thr_setup_signals())
+		return NULL;
+#endif
+
+	xt_p_init_threading();
+
+	err = pthread_key_create(&thr_key, NULL);
+	if (err) {
+		xt_log_errno(XT_NS_CONTEXT, err);
+		return NULL;
+	}
+
+	if ((err = xt_p_mutex_init_with_autoname(&thr_array_lock, NULL))) {
+		xt_log_errno(XT_NS_CONTEXT, err);
+		goto failed;
+	}
+	
+	/* The array is zero-filled so that a slot that was never assigned
+	 * reads as NULL (no thread), instead of holding whatever malloc()
+	 * happened to return.
+	 */
+	if (!(xt_thr_array = (XTThreadPtr *) calloc(xt_thr_maximum_threads, sizeof(XTThreadPtr)))) {
+		xt_log_errno(XT_NS_CONTEXT, XT_ENOMEM);
+		goto failed;
+	}
+
+	xt_thr_array[0] = (XTThreadPtr) 1; // Dummy, not used
+	xt_thr_current_thread_count = 1;
+	xt_thr_current_max_threads = 1;
+
+	/* Create the main thread: */
+	self = xt_create_thread("MainThread", TRUE, FALSE, &e);
+	if (!self) {
+		xt_log_exception(NULL, &e, XT_LOG_DEFAULT);
+		goto failed;
+	}
+
+	try_(a) {
+		XTThreadPtr	thread = self;
+		thr_list = xt_new_linkedlist(thread, NULL, NULL, TRUE);
+	}
+	catch_(a) {
+		XTThreadPtr	thread = self;
+		xt_log_and_clear_exception(thread);
+		xt_exit_threading(thread);
+		/* xt_exit_threading() has freed the main thread, so the
+		 * caller must get NULL and not a dangling pointer.
+		 */
+		self = NULL;
+	}
+	cont_(a);
+
+	return self;
+	
+	failed:
+	xt_exit_threading(NULL);
+	return NULL;
+}
+
+/*
+ * Tear down the global threading state: the thread list, the main thread
+ * (when 'self' is given), and the thread array with its lock. 'self' may
+ * be NULL when the main thread was never created.
+ */
+xtPublic void xt_exit_threading(XTThreadPtr self)
+{
+	if (thr_list != NULL) {
+		xt_free_linkedlist(self, thr_list);
+		thr_list = NULL;
+	}
+
+	if (self != NULL) {
+		/* This should be the main thread! */
+		ASSERT(self->t_main);
+		xt_free_thread(self);
+	}
+
+	if (xt_thr_array != NULL) {
+		free(xt_thr_array);
+		xt_thr_array = NULL;
+		xt_free_mutex(&thr_array_lock);
+	}
+
+	xt_thr_current_max_threads = 0;
+	xt_thr_current_thread_count = 0;
+
+	/* NOTE: 'thr_key' is deliberately not deleted here (the call used to
+	 * be pthread_key_delete(thr_key), followed by resetting thr_key to 0).
+	 * Functions that call xt_get_self() after this point would get junk
+	 * back if the key were deleted; in particular the XT_THREAD_LOCK_INFO
+	 * code fails.
+	 */
+}
+
+/*
+ * Block until the thread list is empty. Returns immediately when there is
+ * no thread list.
+ */
+xtPublic void xt_wait_for_all_threads(XTThreadPtr self)
+{
+	if (!thr_list)
+		return;
+	xt_ll_wait_till_empty(self, thr_list);
+}
+
+/*
+ * Call this function in a busy wait loop!
+ * Use it for wait loops that are not
+ * time critical.
+ */
+xtPublic void xt_busy_wait(void)
+{
+#ifndef XT_WIN
+	/* 10 micro-seconds everywhere except Windows... */
+	usleep(10);
+#else
+	/* ...where the shortest sleep we ask for is one milli-second. */
+	Sleep(1);
+#endif
+}
+
+/*
+ * Wait step for time critical loops: xt_yield(), except on Mac and
+ * Windows where xt_busy_wait() is used.
+ */
+xtPublic void xt_critical_wait(void)
+{
+#if !defined(XT_MAC) && !defined(XT_WIN)
+	xt_yield();
+#else
+	/* NOTE: On Mac xt_busy_wait() works better than xt_yield() */
+	xt_busy_wait();
+#endif
+}
+
+
+/*
+ * Use this for loops that are time critical.
+ * Time critical means we need to get going
+ * as soon as possible!
+ */
+xtPublic void xt_yield(void)
+{
+#ifdef XT_WIN
+	Sleep(0);
+#elif defined(XT_MAC) || defined(XT_SOLARIS)
+	usleep(0);
+#else
+	/* sched_yield() is the POSIX way to give up the processor.
+	 * pthread_yield() is a non-standard extension that is missing on
+	 * some platforms (NetBSD) and deprecated on others (glibc).
+	 */
+	sched_yield();
+#endif
+}
+
+/*
+ * Put the calling thread to sleep for 't' milli-seconds.
+ */
+xtPublic void xt_sleep_milli_second(u_int t)
+{
+#ifdef XT_WIN
+	Sleep(t);
+#else
+	/* POSIX only defines usleep() for values below 1000000, and
+	 * t * 1000 overflows a u_int for very large 't'. So sleep in
+	 * slices of just under a second until less than a second is left.
+	 */
+	while (t >= 1000) {
+		usleep(999 * 1000);
+		t -= 999;
+	}
+	usleep(t * 1000);
+#endif
+}
+
+/*
+ * Send signal 'sig' to every thread in the thread list except 'self'.
+ * The list is locked for the duration of the scan.
+ */
+xtPublic void xt_signal_all_threads(XTThreadPtr self, int sig)
+{
+	XTLinkedItemPtr	item;
+	XTThreadPtr		target;
+
+	xt_ll_lock(self, thr_list);
+	try_(a) {
+		for (item = thr_list->ll_items; item; item = item->li_next) {
+			target = (XTThreadPtr) item;
+			if (target == self)
+				continue;
+			pthread_kill(target->t_pthread, sig);
+		}
+	}
+	catch_(a) {
+		/* Do not leave the list locked when an exception passes through. */
+		xt_ll_unlock(self, thr_list);
+		throw_();
+	}
+	cont_(a);
+	xt_ll_unlock(self, thr_list);
+}
+
+/*
+ * Apply the given function to all threads except self!
+ */
+xtPublic void xt_do_to_all_threads(XTThreadPtr self, void (*do_func_ptr)(XTThreadPtr self, XTThreadPtr to_thr, void *thunk), void *thunk)
+{
+	XTLinkedItemPtr	item;
+	XTThreadPtr		target;
+
+	/* The list stays locked while the callback runs on each thread. */
+	xt_ll_lock(self, thr_list);
+	pushr_(xt_ll_unlock, thr_list);
+
+	for (item = thr_list->ll_items; item; item = item->li_next) {
+		target = (XTThreadPtr) item;
+		if (target == self)
+			continue;
+		(*do_func_ptr)(self, target, thunk);
+	}
+
+	freer_(); // xt_ll_unlock(thr_list)
+}
+
+/*
+ * Return the XT thread structure of the calling thread: the one known to
+ * the handler if there is one, otherwise the value stored under thr_key.
+ */
+xtPublic XTThreadPtr xt_get_self(void)
+{
+	/* First check if the handler has the data: */
+	XTThreadPtr from_handler = myxt_get_self();
+
+	if (from_handler)
+		return from_handler;
+
+	/* Then it must be a background process, and the
+	 * thread info is stored in the local key: */
+	return (XTThreadPtr) xt_get_key((pthread_key_t)thr_key);
+}
+
+/*
+ * Store 'self' under thr_key for the calling thread. A failure to set
+ * the key is not reported (no exception record is passed on).
+ */
+xtPublic void xt_set_self(XTThreadPtr self)
+{
+	pthread_key_t key = (pthread_key_t) thr_key;
+
+	xt_set_key(key, self, NULL);
+}
+
+/*
+ * Reset the exception record of 'thread': error codes and source line
+ * become 0, and all the text buffers become empty strings.
+ */
+xtPublic void xt_clear_exception(XTThreadPtr thread)
+{
+	XTExceptionPtr exc = &thread->t_exception;
+
+	exc->e_xt_err = 0;
+	exc->e_sys_err = 0;
+	exc->e_err_msg[0] = 0;
+	exc->e_func_name[0] = 0;
+	exc->e_source_file[0] = 0;
+	exc->e_source_line = 0;
+	exc->e_catch_trace[0] = 0;
+}
+
+/*
+ * Create a thread structure for the calling pthread, without requiring an
+ * existing XT thread to do it (as xt_create_daemon() does).
+ *
+ * The new structure is stored under thr_key, named 'name', and registered
+ * through thr_init(). If 'user_thread' is TRUE it is also added to the
+ * thread list.
+ *
+ * This function returns NULL on error, in which case 'e' holds the
+ * exception that describes the failure.
+ */
+xtPublic XTThreadPtr xt_create_thread(c_char *name, xtBool main_thread, xtBool user_thread, XTExceptionPtr e)
+{
+	volatile XTThreadPtr self;
+	
+	self = (XTThreadPtr) xt_calloc_ns(sizeof(XTThreadRec));
+	if (!self) {
+		xt_exception_errno(e, XT_CONTEXT, ENOMEM);
+		return NULL;
+	}
+	/* Make the new structure the current thread of this pthread: */
+	if (!xt_set_key((pthread_key_t)thr_key, self, e)) {
+		xt_free_ns(self);
+		return NULL;
+	}
+
+	xt_strcpy(XT_THR_NAME_SIZE, self->t_name, name);
+	self->t_main = main_thread;
+	self->t_daemon = FALSE;
+	/* The thread initialises itself, so it is both caller and target of thr_init(): */
+	try_(a) {
+		thr_init(self, self);
+	}
+	catch_(a) {
+		*e = self->t_exception;	/* Hand the exception to the caller before the structure is freed. */
+		xt_set_key((pthread_key_t)thr_key, NULL, NULL);
+		xt_free_ns(self);
+		self = NULL;
+	}
+	cont_(a);
+
+	if (self && user_thread) {
+		/* Add non-temporary threads to the thread list. */
+		try_(b) {
+			xt_ll_add(self, thr_list, &self->t_links, TRUE);
+		}
+		catch_(b) {
+			*e = self->t_exception;
+			xt_free_thread(self);
+			self = NULL;
+		}
+		cont_(b);
+	}
+
+	return self;
+}
+
+/*
+ * Create the thread structure for a daemon thread named 'name', on behalf
+ * of the calling thread 'self'. The structure is registered through
+ * thr_init() and returned; no pthread is started here.
+ * Failures are thrown as exceptions on 'self'.
+ */
+xtPublic XTThreadPtr xt_create_daemon(XTThreadPtr self, c_char *name)
+{
+	XTThreadPtr new_thread;
+
+	/* NOTE: thr_key will be set when this thread starts running (see xt_thread_main()). */
+
+	new_thread = (XTThreadPtr) xt_calloc(self, sizeof(XTThreadRec));
+	xt_strcpy(XT_THR_NAME_SIZE, new_thread->t_name, name);
+	new_thread->t_main = FALSE;
+	new_thread->t_daemon = TRUE;
+
+	try_(a) {
+		thr_init(self, new_thread);
+	}
+	catch_(a) {
+		xt_free(self, new_thread);	/* thr_init() has already called thr_exit() on it. */
+		throw_();
+	}
+	cont_(a);
+	return new_thread;
+}
+/*
+ * Shut down and free the thread structure 'self': run thr_exit(), remove
+ * non-daemon threads from the thread list, clear thr_key if it refers to
+ * this structure, and release the memory.
+ */
+void xt_free_thread(XTThreadPtr self)
+{
+	thr_exit(self);
+	if (!self->t_daemon && thr_list)
+		xt_ll_remove(self, thr_list, &self->t_links, TRUE);
+	/* Note, if I move this before thr_exit() then self = xt_get_self(); will fail in 
+	 * xt_close_file_ns() which is called by xt_unuse_database()!
+	 */
+
+	 /*
+	  * Do not clear the pthread's key value unless it is the same as the thread just released.
+	  * This can happen during shutdown when the engine is deregistered with the PBMS engine.
+	  *
+	  * What happens is that during deregistration the PBMS engine calls close to close all
+	  * PBXT resources on all MySQL THDs created by PBMS for its own pthreads. So the 'self' 
+	  * being freed is not the same 'self' associated with the PBXT 'thr_key'.
+	  *
+	  * NOTE(review): the 'thr_key &&' test treats a key value of 0 as "no key". POSIX does
+	  * not reserve any pthread_key_t value, so confirm that a valid key can never be 0 here.
+	  */
+	if (thr_key && (self == ((XTThreadPtr) xt_get_key((pthread_key_t)thr_key)))) {
+		xt_set_key((pthread_key_t)thr_key, NULL, NULL);
+	}
+	xt_free_ns(self);
+}
+/*
+ * Start a pthread that runs 'start_routine' as the XT thread 'child', and
+ * wait until it has started. 'self' is the calling thread.
+ *
+ * Returns the pthread id of the new thread. If the pthread cannot be
+ * created, 'child' is freed and the error is thrown.
+ */
+xtPublic pthread_t xt_run_thread(XTThreadPtr self, XTThreadPtr child, void *(*start_routine)(XTThreadPtr))
+{
+	ThreadDataRec	data;
+	int				err;
+	pthread_t		child_thread;
+
+	enter_();
+	
+	// 'data' can be on the stack because we are waiting for the thread to start
+	// (td_started set by xt_thread_main()) before exiting the function.
+	data.td_started = FALSE;
+	data.td_thr = child;
+	data.td_start_routine = start_routine;
+#ifdef XT_WIN
+	{
+		pthread_attr_t	attr = { 0, 0, 0 };
+
+		attr.priority = THREAD_PRIORITY_NORMAL;
+		err = pthread_create(&child_thread, &attr, xt_thread_main, &data);
+	}
+#else
+	err = pthread_create(&child_thread, NULL, xt_thread_main, &data);
+#endif
+	if (err) {
+		xt_free_thread(child);
+		xt_throw_errno(XT_CONTEXT, err);
+	}
+	while (!data.td_started) {
+		/* Check that the child is still alive, it may have failed before starting: */
+		if (pthread_kill(child_thread, 0))
+			break;
+		xt_busy_wait();
+	}
+	return_(child_thread);
+}
+
+/*
+ * Free the thread structure 'self' and terminate the calling pthread with
+ * exit value 'result'.
+ */
+xtPublic void xt_exit_thread(XTThreadPtr self, void *result)
+{
+	/* The structure must go first, pthread_exit() ends the calling thread. */
+	xt_free_thread(self);
+
+	pthread_exit(result);
+}
+
+/*
+ * Join the thread with XT thread id 'tid' and return its exit value.
+ *
+ * Returns NULL if no such thread is registered. A join error is logged,
+ * unless 'ignore_error' is TRUE.
+ */
+xtPublic void *xt_wait_for_thread(xtThreadID tid, xtBool ignore_error)
+{
+	int			err;
+	void		*value_ptr = NULL;
+	xtBool		ok = FALSE;
+	XTThreadPtr thread;
+	pthread_t	t1 = 0;
+
+	xt_lock_mutex_ns(&thr_array_lock);
+	/* Slot 0 holds a dummy value, not a thread, and no thread is ever
+	 * registered at or above xt_thr_current_max_threads. So only the
+	 * slots in between may be looked at.
+	 */
+	if (tid > 0 && tid < xt_thr_current_max_threads) {
+		if ((thread = xt_thr_array[tid])) {
+			/* Copy the pthread id while the lock protects the structure. */
+			t1 = thread->t_pthread;
+			ok = TRUE;
+		}
+	}
+	xt_unlock_mutex_ns(&thr_array_lock);
+	if (ok) {
+		err = xt_p_join(t1, &value_ptr);
+		if (err && !ignore_error)
+			xt_log_errno(XT_NS_CONTEXT, err);
+	}
+	return value_ptr;
+}
+
+/*
+ * Kill the given thread, and wait for it to terminate.
+ * This function just returns if the thread is already dead.
+ */
+xtPublic void xt_kill_thread(pthread_t t1)
+{
+	void	*exit_value = NULL;
+	int		rc;
+
+	/* If SIGTERM cannot be delivered there is nothing to wait for. */
+	if (pthread_kill(t1, SIGTERM))
+		return;
+
+	rc = xt_p_join(t1, &exit_value);
+	if (rc)
+		xt_log_errno(XT_NS_CONTEXT, rc);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Read/write locking
+ */
+
+/*
+ * Initialise the read/write lock 'rwlock'. When XT_THREAD_LOCK_INFO is
+ * defined the lock is also given a 'name'.
+ * Returns OK, or throws the system error (and returns FAILED).
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic xtBool xt_init_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock, const char *name)
+#else
+xtPublic xtBool xt_init_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock)
+#endif
+{
+	int rc;
+
+#ifndef XT_THREAD_LOCK_INFO
+	rc = xt_p_rwlock_init(rwlock, NULL);
+#else
+	rc = xt_p_rwlock_init_with_name(rwlock, NULL, name);
+#endif
+
+	if (!rc)
+		return OK;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return FAILED;
+}
+
+/*
+ * Destroy the read/write lock 'rwlock', retrying for as long as the
+ * destroy call reports that the lock is busy.
+ */
+xtPublic void xt_free_rwlock(xt_rwlock_type *rwlock)
+{
+	while (xt_p_rwlock_destroy(rwlock) == XT_EBUSY)
+		xt_busy_wait();
+
+	/* PMC - Any other error is deliberately not logged:
+	 * xt_xn_exit_db() is called even when xt_xn_init_db() is not fully
+	 * completed, which would generate a lot of log entries. And I have
+	 * no desire to only call free for those articles that I have init'ed!
+	 */
+}
+
+/*
+ * Take a shared (read) lock on 'rwlock', retrying while the lock call
+ * returns XT_EAGAIN. Returns 'rwlock', or throws the system error (and
+ * returns NULL).
+ */
+xtPublic xt_rwlock_type *xt_slock_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock)
+{
+	int rc;
+
+	while ((rc = xt_slock_rwlock_ns(rwlock)) == XT_EAGAIN)
+		xt_busy_wait();
+
+	if (!rc)
+		return rwlock;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return NULL;
+}
+
+/*
+ * Take an exclusive (write) lock on 'rwlock', retrying while the lock
+ * call returns XT_EAGAIN. Returns 'rwlock', or throws the system error
+ * (and returns NULL).
+ */
+xtPublic xt_rwlock_type *xt_xlock_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock)
+{
+	int rc;
+
+	while ((rc = xt_xlock_rwlock_ns(rwlock)) == XT_EAGAIN)
+		xt_busy_wait();
+
+	if (!rc)
+		return rwlock;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return NULL;
+}
+
+/*
+ * Release 'rwlock'. An error is logged, not thrown.
+ */
+xtPublic void xt_unlock_rwlock(XTThreadPtr XT_UNUSED(self), xt_rwlock_type *rwlock)
+{
+	int rc = xt_unlock_rwlock_ns(rwlock);
+
+	if (!rc)
+		return;
+	xt_log_errno(XT_NS_CONTEXT, rc);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * Mutex locking
+ */
+
+/*
+ * Allocate and initialise a mutex. Returns the new mutex, or NULL if the
+ * allocation or the initialisation did not succeed (in the latter case
+ * the memory is released again).
+ */
+xtPublic xt_mutex_type *xt_new_mutex(XTThreadPtr self)
+{
+	xt_mutex_type *mx = (xt_mutex_type *) xt_calloc(self, sizeof(xt_mutex_type));
+
+	if (!mx)
+		return NULL;
+
+	/* Make sure the memory is released if the init below throws. */
+	pushr_(xt_free, mx);
+	if (xt_init_mutex_with_autoname(self, mx)) {
+		popr_();
+		return mx;
+	}
+	freer_();
+	return NULL;
+}
+
+/*
+ * Destroy and free a mutex created by xt_new_mutex(). NULL is accepted
+ * and ignored.
+ */
+xtPublic void xt_delete_mutex(XTThreadPtr self, xt_mutex_type *mx)
+{
+	if (!mx)
+		return;
+
+	xt_free_mutex(mx);
+	xt_free(self, mx);
+}
+
+/*
+ * Initialise the mutex 'mx'. When XT_THREAD_LOCK_INFO is defined the
+ * mutex is also given a 'name'.
+ * Returns TRUE, or throws the system error (and returns FALSE).
+ */
+#ifdef XT_THREAD_LOCK_INFO
+xtPublic xtBool xt_init_mutex(XTThreadPtr self, xt_mutex_type *mx, const char *name)
+#else
+xtPublic xtBool xt_init_mutex(XTThreadPtr self, xt_mutex_type *mx)
+#endif
+{
+	int rc = xt_p_mutex_init_with_name(mx, NULL, name);
+
+	if (!rc)
+		return TRUE;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return FALSE;
+}
+
+/*
+ * Destroy the mutex 'mx', retrying for as long as the destroy call
+ * reports that the mutex is busy.
+ */
+void xt_free_mutex(xt_mutex_type *mx)
+{
+	while (xt_p_mutex_destroy(mx) == XT_EBUSY)
+		xt_busy_wait();
+
+	/* PMC - Any other error is deliberately not logged:
+	 * xt_xn_exit_db() is called even when xt_xn_init_db() is not fully completed!
+	 */
+}
+
+/*
+ * Lock the mutex 'mx', retrying while the lock call returns XT_EAGAIN.
+ * Returns TRUE, or throws the system error (and returns FALSE).
+ */
+xtPublic xtBool xt_lock_mutex(XTThreadPtr self, xt_mutex_type *mx)
+{
+	int rc;
+
+	while ((rc = xt_lock_mutex_ns(mx)) == XT_EAGAIN)
+		xt_busy_wait();
+
+	if (!rc)
+		return TRUE;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return FALSE;
+}
+
+/*
+ * Unlock the mutex 'mx'. An error is thrown as a system error.
+ */
+xtPublic void xt_unlock_mutex(XTThreadPtr self, xt_mutex_type *mx)
+{
+	int rc = xt_unlock_mutex_ns(mx);
+
+	if (!rc)
+		return;
+	xt_throw_errno(XT_CONTEXT, rc);
+}
+
+/*
+ * Store 'value' as the thread-specific value of 'key' (on Windows the
+ * thr_key slot is always the one used).
+ *
+ * Returns TRUE on success. On failure FALSE is returned and, if 'e' is
+ * not NULL, the system error is recorded in 'e'.
+ */
+xtPublic xtBool xt_set_key(pthread_key_t key, const void *value, XTExceptionPtr e)
+{
+#ifndef XT_WIN
+	int rc = pthread_setspecific(key, value);
+
+	if (rc) {
+		if (e)
+			xt_exception_errno(e, XT_NS_CONTEXT, rc);
+		return FALSE;
+	}
+#else
+	my_pthread_setspecific_ptr(thr_key, (void *) value);
+#endif
+	return TRUE;
+}
+
+/*
+ * Return the thread-specific value of 'key' (on Windows the thr_key slot
+ * is always the one read).
+ */
+xtPublic void *xt_get_key(pthread_key_t key)
+{
+#ifndef XT_WIN
+	return pthread_getspecific(key);
+#else
+	return my_pthread_getspecific_ptr(void *, thr_key);
+#endif
+}
+
+/*
+ * Allocate and initialise a condition variable. Returns the new
+ * condition, or NULL if the allocation or the initialisation did not
+ * succeed (in the latter case the memory is released again).
+ */
+xtPublic xt_cond_type *xt_new_cond(XTThreadPtr self)
+{
+	xt_cond_type *cond = (xt_cond_type *) xt_calloc(self, sizeof(xt_cond_type));
+
+	if (!cond)
+		return NULL;
+
+	/* Make sure the memory is released if the init below throws. */
+	pushr_(xt_free, cond);
+	if (xt_init_cond(self, cond)) {
+		popr_();
+		return cond;
+	}
+	freer_();
+	return NULL;
+}
+
+/*
+ * Destroy and free a condition created by xt_new_cond(). NULL is
+ * accepted and ignored.
+ */
+xtPublic void xt_delete_cond(XTThreadPtr self, xt_cond_type *cond)
+{
+	if (!cond)
+		return;
+
+	xt_free_cond(cond);
+	xt_free(self, cond);
+}
+
+/*
+ * Initialise the condition variable 'cond' with default attributes.
+ * Returns TRUE, or throws the system error (and returns FALSE).
+ */
+xtPublic xtBool xt_init_cond(XTThreadPtr self, xt_cond_type *cond)
+{
+	int rc = pthread_cond_init(cond, NULL);
+
+	if (!rc)
+		return TRUE;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return FALSE;
+}
+
+/*
+ * Destroy the condition variable 'cond', retrying for as long as the
+ * destroy call reports that it is busy.
+ */
+xtPublic void xt_free_cond(xt_cond_type *cond)
+{
+	while (pthread_cond_destroy(cond) == XT_EBUSY)
+		xt_busy_wait();
+
+	/* PMC - Any other error is deliberately not logged:
+	 * xt_xn_exit_db() is called even when xt_xn_init_db() is not fully completed!
+	 */
+}
+
+/*
+ * Throw the signal that was stored in t_delayed_signal, if there is one.
+ * The thread examined is 'self', or the calling thread when 'self' is
+ * NULL. Note that the signal is thrown with 'self' as given.
+ *
+ * Returns OK when no signal was pending, otherwise FAILED.
+ */
+xtPublic xtBool xt_throw_delayed_signal(XTThreadPtr self, c_char *func, c_char *file, u_int line)
+{
+	XTThreadPtr	thr = self;
+	int			pending;
+
+	if (!thr)
+		thr = xt_get_self();
+
+	pending = thr->t_delayed_signal;
+	if (!pending)
+		return OK;
+
+	/* Clear the signal first, so that it is only thrown once. */
+	thr->t_delayed_signal = 0;
+	xt_throw_signal(self, func, file, line, pending);
+	return FAILED;
+}
+
+/*
+ * Wait on 'cond' with 'mutex'. Interrupts are disabled on the waiting
+ * thread during the wait; a signal that arrived meanwhile is thrown after
+ * the wait.
+ *
+ * Returns TRUE on a normal wake-up, FALSE if the wait failed or a delayed
+ * signal was thrown.
+ */
+xtPublic xtBool xt_wait_cond(XTThreadPtr self, xt_cond_type *cond, xt_mutex_type *mutex)
+{
+	XTThreadPtr	thr = self;
+	int			rc;
+
+	if (!thr)
+		thr = xt_get_self();
+
+	/* PMC - In my tests, if I throw an exception from within the wait
+	 * the condition and the mutex remain locked.
+	 */
+	thr->t_disable_interrupts = TRUE;
+	rc = xt_p_cond_wait(cond, mutex);
+	thr->t_disable_interrupts = FALSE;
+
+	if (rc) {
+		xt_throw_errno(XT_CONTEXT, rc);
+		return FALSE;
+	}
+	if (!thr->t_delayed_signal)
+		return TRUE;
+
+	xt_throw_delayed_signal(XT_CONTEXT);
+	return FALSE;
+}
+
+/*
+ * Suspend the calling thread: wait on its own condition (t_cond) until
+ * woken, see xt_unsuspend(). Returns the result of the wait.
+ */
+xtPublic xtBool xt_suspend(XTThreadPtr thread)
+{
+	xtBool wait_ok;
+
+	// You can only suspend yourself. 
+	ASSERT_NS(pthread_equal(thread->t_pthread, pthread_self()));
+
+	xt_lock_mutex_ns(&thread->t_lock);
+	wait_ok = xt_wait_cond(NULL, &thread->t_cond, &thread->t_lock);
+	xt_unlock_mutex_ns(&thread->t_lock);
+
+	return wait_ok;
+}
+
+/*
+ * Wake up 'target' by broadcasting on its condition. Returns the result
+ * of the broadcast.
+ */
+xtPublic xtBool xt_unsuspend(XTThreadPtr target)
+{
+	xt_cond_type *wake_cond = &target->t_cond;
+
+	return xt_broadcast_cond_ns(wake_cond);
+}
+
+/* Lock the mutex (t_lock) that belongs to 'thread'. */
+xtPublic void xt_lock_thread(XTThreadPtr thread)
+{
+	xt_lock_mutex_ns(&(thread->t_lock));
+}
+
+/* Unlock the mutex (t_lock) that belongs to 'thread'. */
+xtPublic void xt_unlock_thread(XTThreadPtr thread)
+{
+	xt_unlock_mutex_ns(&(thread->t_lock));
+}
+
+/*
+ * Wait on the condition of 'thread', using its own mutex. Unlike
+ * xt_suspend(), the mutex is not locked or unlocked here.
+ */
+xtPublic xtBool xt_wait_thread(XTThreadPtr thread)
+{
+	xtBool wait_ok = xt_wait_cond(NULL, &thread->t_cond, &thread->t_lock);
+
+	return wait_ok;
+}
+
+/*
+ * Broadcast on the condition of 'target'. The result of the broadcast is
+ * not reported to the caller.
+ */
+xtPublic void xt_signal_thread(XTThreadPtr target)
+{
+	(void) xt_broadcast_cond_ns(&target->t_cond);
+}
+
+/*
+ * Ask 'target' to terminate: its quit flag is set and SIGTERM is stored
+ * as its delayed signal. No signal is actually sent here.
+ */
+xtPublic void xt_terminate_thread(XTThreadPtr XT_UNUSED(self), XTThreadPtr target)
+{
+	target->t_quit = TRUE;
+
+	target->t_delayed_signal = SIGTERM;
+}
+
+/* Return the id of the current process. */
+xtPublic xtProcID xt_getpid()
+{
+#ifndef XT_WIN
+	return getpid();
+#else
+	return GetCurrentProcessId();
+#endif
+}
+
+/*
+ * Check whether a process with id 'pid' exists.
+ *
+ * Windows: the process must be open-able and still active. If it cannot
+ * be opened, it counts as existing unless the error is
+ * ERROR_INVALID_PARAMETER.
+ * Elsewhere: the process counts as existing unless kill(pid, 0) fails
+ * with ESRCH.
+ */
+xtPublic xtBool xt_process_exists(xtProcID pid)
+{
+#ifdef XT_WIN
+	xtBool	alive = FALSE;
+	HANDLE	proc;
+	DWORD	exit_code;
+
+	proc = OpenProcess(PROCESS_QUERY_INFORMATION, FALSE, pid);
+	if (!proc) {
+		int open_err = HRESULT_CODE(GetLastError());
+
+		if (open_err != ERROR_INVALID_PARAMETER)
+			alive = TRUE;
+		return alive;
+	}
+
+	if (GetExitCodeProcess(proc, &exit_code) && code_is_active(exit_code))
+		alive = TRUE;
+	CloseHandle(proc);
+	return alive;
+#else
+	if (kill(pid, 0) == -1 && errno == ESRCH)
+		return FALSE;
+	return TRUE;
+#endif
+}
+
+/*
+ * Wait on 'cond' with 'mutex' for at most 'milli_sec' milli-seconds.
+ * Interrupts are disabled on the waiting thread during the wait; a signal
+ * that arrived meanwhile is thrown after the wait.
+ *
+ * Returns TRUE on a wake-up or a timeout, FALSE if the wait failed or a
+ * delayed signal was thrown.
+ */
+xtPublic xtBool xt_timed_wait_cond(XTThreadPtr self, xt_cond_type *cond, xt_mutex_type *mutex, u_long milli_sec)
+{
+	int				err;
+	struct timespec	abstime;
+	XTThreadPtr		me = self ? self : xt_get_self();
+
+#ifdef XT_WIN
+	union ft64		now;
+  
+	GetSystemTimeAsFileTime(&now.ft);
+
+	/* System time is measured in 100ns units.
+	 * This calculation will be reversed by the Windows implementation
+	 * of pthread_cond_timedwait(), in order to extract the
+	 * milli-second timeout!
+	 *
+	 * The multiplication is done in 64 bits: in a 32-bit u_long it
+	 * overflows for timeouts of more than about 429 seconds.
+	 */
+	abstime.tv.i64 = now.i64 + ((u_llong) milli_sec * 10000);
+  
+	abstime.max_timeout_msec = milli_sec;
+#else
+	struct timeval	now;
+	u_llong			micro_sec;
+
+	/* Get the current time in microseconds: */
+	gettimeofday(&now, NULL);
+	micro_sec = (u_llong) now.tv_sec * (u_llong) 1000000 + (u_llong) now.tv_usec;
+	
+	/* Add the timeout which is in milli seconds */
+	micro_sec += (u_llong) milli_sec * (u_llong) 1000;
+
+	/* Setup the end time, which is in nano-seconds. */
+	abstime.tv_sec = (long) (micro_sec / 1000000);				/* seconds */
+	abstime.tv_nsec = (long) ((micro_sec % 1000000) * 1000);	/* and nanoseconds */
+#endif
+
+	me->t_disable_interrupts = TRUE;
+	err = xt_p_cond_timedwait(cond, mutex, &abstime);
+	me->t_disable_interrupts = FALSE;
+	/* A timeout is a normal way for the wait to end, not an error. */
+	if (err && err != ETIMEDOUT) {
+		xt_throw_errno(XT_CONTEXT, err);
+		return FALSE;
+	}
+	if (me->t_delayed_signal) {
+		xt_throw_delayed_signal(XT_CONTEXT);
+		return FALSE;
+	}
+	return TRUE;
+}
+
+/*
+ * Signal the condition 'cond'. Returns OK, or throws the system error
+ * (and returns FAILED).
+ */
+xtPublic xtBool xt_signal_cond(XTThreadPtr self, xt_cond_type *cond)
+{
+	int rc = pthread_cond_signal(cond);
+
+	if (!rc)
+		return OK;
+
+	xt_throw_errno(XT_CONTEXT, rc);
+	return FAILED;
+}
+
+/*
+ * Broadcast on the condition 'cond'. An error is thrown as a system
+ * error.
+ */
+xtPublic void xt_broadcast_cond(XTThreadPtr self, xt_cond_type *cond)
+{
+	int rc = pthread_cond_broadcast(cond);
+
+	if (!rc)
+		return;
+	xt_throw_errno(XT_CONTEXT, rc);
+}
+
+/*
+ * Broadcast on the condition 'cond' without a calling thread ("no self").
+ * Returns OK; on error the system error is registered and FAILED is
+ * returned.
+ */
+xtPublic xtBool xt_broadcast_cond_ns(xt_cond_type *cond)
+{
+	int rc = pthread_cond_broadcast(cond);
+
+	if (!rc)
+		return OK;
+
+	xt_register_errno(XT_REG_CONTEXT, rc);
+	return FAILED;
+}
+
+/* Number of times prof_setjmp() has been called. */
+static int prof_setjmp_count = 0;
+
+/* Count one call; always returns 0. */
+xtPublic int prof_setjmp(void)
+{
+	++prof_setjmp_count;
+	return 0;
+}
+
+/*
+ * Give the pthread of 'self' a low priority. A failure is reported
+ * without a thread, so it is logged and no exception is thrown.
+ */
+xtPublic void xt_set_low_priority(XTThreadPtr self)
+{
+	int rc = xt_p_set_low_priority(self->t_pthread);
+
+	if (!rc)
+		return;
+
+	self = NULL; /* Will cause logging, instead of throwing exception */
+	xt_throw_errno(XT_CONTEXT, rc);
+}
+
+/*
+ * Give the pthread of 'self' the normal priority. A failure is reported
+ * without a thread, so it is logged and no exception is thrown.
+ */
+xtPublic void xt_set_normal_priority(XTThreadPtr self)
+{
+	int rc = xt_p_set_normal_priority(self->t_pthread);
+
+	if (!rc)
+		return;
+
+	self = NULL; /* Will cause logging, instead of throwing exception */
+	xt_throw_errno(XT_CONTEXT, rc);
+}
+
+/*
+ * Give the pthread of 'self' a high priority. A failure is reported
+ * without a thread, so it is logged and no exception is thrown.
+ */
+xtPublic void xt_set_high_priority(XTThreadPtr self)
+{
+	int rc = xt_p_set_high_priority(self->t_pthread);
+
+	if (!rc)
+		return;
+
+	self = NULL; /* Will cause logging, instead of throwing exception */
+	xt_throw_errno(XT_CONTEXT, rc);
+}
+
+/*
+ * Set the priority of 'self' relative to XT_PRIORITY_NORMAL: anything
+ * below it means low, anything above it means high.
+ */
+xtPublic void xt_set_priority(XTThreadPtr self, int priority)
+{
+	if (priority == XT_PRIORITY_NORMAL) {
+		xt_set_normal_priority(self);
+		return;
+	}
+
+	if (priority < XT_PRIORITY_NORMAL)
+		xt_set_low_priority(self);
+	else
+		xt_set_high_priority(self);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * STATISTICS
+ */
+
+/*
+ * Fill 'stats' with the current totals: the statistics accumulated from
+ * threads that have exited (thr_statistics) plus those of every thread
+ * registered in xt_thr_array. A flush that is in progress on a thread is
+ * counted with the time it has taken so far.
+ */
+xtPublic void xt_gather_statistics(XTStatisticsPtr stats)
+{
+	xt_lock_mutex_ns(&thr_array_lock);
+	*stats = thr_statistics;
+
+	// Ignore index 0, it is not used!
+	for (u_int slot=1; slot<xt_thr_current_max_threads; slot++) {
+		XTThreadPtr	thr = xt_thr_array[slot];
+		xtWord8		flush_start;
+
+		if (!thr)
+			continue;
+
+		stats->st_commits += thr->st_statistics.st_commits;
+		stats->st_rollbacks += thr->st_statistics.st_rollbacks;
+		stats->st_stat_read += thr->st_statistics.st_stat_read;
+		stats->st_stat_write += thr->st_statistics.st_stat_write;
+
+		/* Record files and record cache: */
+		XT_ADD_STATS(stats->st_rec, thr->st_statistics.st_rec);
+		if ((flush_start = thr->st_statistics.st_rec.ts_flush_start))
+			stats->st_rec.ts_flush_time += xt_trace_clock() - flush_start;
+		stats->st_rec_cache_hit += thr->st_statistics.st_rec_cache_hit;
+		stats->st_rec_cache_miss += thr->st_statistics.st_rec_cache_miss;
+		stats->st_rec_cache_frees += thr->st_statistics.st_rec_cache_frees;
+
+		/* Index files, index cache and index log: */
+		XT_ADD_STATS(stats->st_ind, thr->st_statistics.st_ind);
+		if ((flush_start = thr->st_statistics.st_ind.ts_flush_start))
+			stats->st_ind.ts_flush_time += xt_trace_clock() - flush_start;
+		stats->st_ind_cache_hit += thr->st_statistics.st_ind_cache_hit;
+		stats->st_ind_cache_miss += thr->st_statistics.st_ind_cache_miss;
+		XT_ADD_STATS(stats->st_ilog, thr->st_statistics.st_ilog);
+
+		/* Transaction log and its cache: */
+		XT_ADD_STATS(stats->st_xlog, thr->st_statistics.st_xlog);
+		if ((flush_start = thr->st_statistics.st_xlog.ts_flush_start))
+			stats->st_xlog.ts_flush_time += xt_trace_clock() - flush_start;
+		stats->st_xlog_cache_hit += thr->st_statistics.st_xlog_cache_hit;
+		stats->st_xlog_cache_miss += thr->st_statistics.st_xlog_cache_miss;
+
+		/* Data logs: */
+		XT_ADD_STATS(stats->st_data, thr->st_statistics.st_data);
+		if ((flush_start = thr->st_statistics.st_data.ts_flush_start))
+			stats->st_data.ts_flush_time += xt_trace_clock() - flush_start;
+
+		/* Scan and row counters: */
+		stats->st_scan_index += thr->st_statistics.st_scan_index;
+		stats->st_scan_table += thr->st_statistics.st_scan_table;
+		stats->st_row_select += thr->st_statistics.st_row_select;
+		stats->st_row_insert += thr->st_statistics.st_row_insert;
+		stats->st_row_update += thr->st_statistics.st_row_update;
+		stats->st_row_delete += thr->st_statistics.st_row_delete;
+
+		stats->st_wait_for_xact += thr->st_statistics.st_wait_for_xact;
+		stats->st_retry_index_scan += thr->st_statistics.st_retry_index_scan;
+		stats->st_reread_record_list += thr->st_statistics.st_reread_record_list;
+	}
+	xt_unlock_mutex_ns(&thr_array_lock);
+}
+/*
+ * Add the statistics of 'self' to the global totals in thr_statistics.
+ * thr_exit() calls this, while holding thr_array_lock, just before the
+ * thread gives up its slot in xt_thr_array.
+ */
+static void thr_accumulate_statistics(XTThreadPtr self)
+{
+	thr_statistics.st_commits += self->st_statistics.st_commits;
+	thr_statistics.st_rollbacks += self->st_statistics.st_rollbacks;
+	thr_statistics.st_stat_read += self->st_statistics.st_stat_read;
+	thr_statistics.st_stat_write += self->st_statistics.st_stat_write;
+	/* Record files and record cache: */
+	XT_ADD_STATS(thr_statistics.st_rec, self->st_statistics.st_rec);
+	thr_statistics.st_rec_cache_hit += self->st_statistics.st_rec_cache_hit;
+	thr_statistics.st_rec_cache_miss += self->st_statistics.st_rec_cache_miss;
+	thr_statistics.st_rec_cache_frees += self->st_statistics.st_rec_cache_frees;
+	/* Index files, index cache and index log: */
+	XT_ADD_STATS(thr_statistics.st_ind, self->st_statistics.st_ind);
+	thr_statistics.st_ind_cache_hit += self->st_statistics.st_ind_cache_hit;
+	thr_statistics.st_ind_cache_miss += self->st_statistics.st_ind_cache_miss;
+	XT_ADD_STATS(thr_statistics.st_ilog, self->st_statistics.st_ilog);
+	/* Transaction log and its cache: */
+	XT_ADD_STATS(thr_statistics.st_xlog, self->st_statistics.st_xlog);
+	thr_statistics.st_xlog_cache_hit += self->st_statistics.st_xlog_cache_hit;
+	thr_statistics.st_xlog_cache_miss += self->st_statistics.st_xlog_cache_miss;
+	/* Data logs: */
+	XT_ADD_STATS(thr_statistics.st_data, self->st_statistics.st_data);
+	/* Scan and row counters: */
+	thr_statistics.st_scan_index += self->st_statistics.st_scan_index;
+	thr_statistics.st_scan_table += self->st_statistics.st_scan_table;
+	thr_statistics.st_row_select += self->st_statistics.st_row_select;
+	thr_statistics.st_row_insert += self->st_statistics.st_row_insert;
+	thr_statistics.st_row_update += self->st_statistics.st_row_update;
+	thr_statistics.st_row_delete += self->st_statistics.st_row_delete;
+
+	thr_statistics.st_wait_for_xact += self->st_statistics.st_wait_for_xact;
+	thr_statistics.st_retry_index_scan += self->st_statistics.st_retry_index_scan;
+	thr_statistics.st_reread_record_list += self->st_statistics.st_reread_record_list;
+}
+
+/*
+ * Return the value of the statistic identified by 'rec_id'.
+ *
+ * Most values are read from 'stats'. Some are computed on the spot (the
+ * time values and the cache usage values), and some are taken from the
+ * database 'db'; the latter are 0 when 'db' is NULL. An unknown 'rec_id'
+ * gives 0.
+ */
+xtPublic u_llong xt_get_statistic(XTStatisticsPtr stats, XTDatabaseHPtr db, u_int rec_id)
+{
+	switch (rec_id) {
+		case XT_STAT_TIME_CURRENT:
+			return (u_llong) time(NULL);
+		case XT_STAT_TIME_PASSED:
+			return (u_llong) xt_trace_clock();
+		case XT_STAT_COMMITS:
+			return stats->st_commits;
+		case XT_STAT_ROLLBACKS:
+			return stats->st_rollbacks;
+		case XT_STAT_STAT_READS:
+			return stats->st_stat_read;
+		case XT_STAT_STAT_WRITES:
+			return stats->st_stat_write;
+
+		/* Record files and record cache: */
+		case XT_STAT_REC_BYTES_IN:
+			return stats->st_rec.ts_read;
+		case XT_STAT_REC_BYTES_OUT:
+			return stats->st_rec.ts_write;
+		case XT_STAT_REC_SYNC_COUNT:
+			return stats->st_rec.ts_flush;
+		case XT_STAT_REC_SYNC_TIME:
+			return stats->st_rec.ts_flush_time;
+		case XT_STAT_REC_CACHE_HIT:
+			return stats->st_rec_cache_hit;
+		case XT_STAT_REC_CACHE_MISS:
+			return stats->st_rec_cache_miss;
+		case XT_STAT_REC_CACHE_FREES:
+			return stats->st_rec_cache_frees;
+		case XT_STAT_REC_CACHE_USAGE:
+			return (u_llong) xt_tc_get_usage();
+
+		/* Index files, index cache and index log: */
+		case XT_STAT_IND_BYTES_IN:
+			return stats->st_ind.ts_read;
+		case XT_STAT_IND_BYTES_OUT:
+			return stats->st_ind.ts_write;
+		case XT_STAT_IND_SYNC_COUNT:
+			return stats->st_ind.ts_flush;
+		case XT_STAT_IND_SYNC_TIME:
+			return stats->st_ind.ts_flush_time;
+		case XT_STAT_IND_CACHE_HIT:
+			return stats->st_ind_cache_hit;
+		case XT_STAT_IND_CACHE_MISS:
+			return stats->st_ind_cache_miss;
+		case XT_STAT_IND_CACHE_USAGE:
+			return (u_llong) xt_ind_get_usage();
+		case XT_STAT_ILOG_BYTES_IN:
+			return stats->st_ilog.ts_read;
+		case XT_STAT_ILOG_BYTES_OUT:
+			return stats->st_ilog.ts_write;
+		case XT_STAT_ILOG_SYNC_COUNT:
+			return stats->st_ilog.ts_flush;
+		case XT_STAT_ILOG_SYNC_TIME:
+			return stats->st_ilog.ts_flush_time;
+
+		/* Transaction log and its cache: */
+		case XT_STAT_XLOG_BYTES_IN:
+			return stats->st_xlog.ts_read;
+		case XT_STAT_XLOG_BYTES_OUT:
+			return stats->st_xlog.ts_write;
+		case XT_STAT_XLOG_SYNC_COUNT:
+			return stats->st_xlog.ts_flush;
+		case XT_STAT_XLOG_SYNC_TIME:
+			return stats->st_xlog.ts_flush_time;
+		case XT_STAT_XLOG_CACHE_HIT:
+			return stats->st_xlog_cache_hit;
+		case XT_STAT_XLOG_CACHE_MISS:
+			return stats->st_xlog_cache_miss;
+		case XT_STAT_XLOG_CACHE_USAGE:
+			return (u_llong) xt_xlog_get_usage();
+
+		/* Data logs: */
+		case XT_STAT_DATA_BYTES_IN:
+			return stats->st_data.ts_read;
+		case XT_STAT_DATA_BYTES_OUT:
+			return stats->st_data.ts_write;
+		case XT_STAT_DATA_SYNC_COUNT:
+			return stats->st_data.ts_flush;
+		case XT_STAT_DATA_SYNC_TIME:
+			return stats->st_data.ts_flush_time;
+
+		/* Values that depend on the database: */
+		case XT_STAT_BYTES_TO_CHKPNT:
+			return db ? xt_bytes_since_last_checkpoint(db, db->db_xlog.xl_write_log_id, db->db_xlog.xl_write_log_offset) : 0;
+		case XT_STAT_LOG_BYTES_TO_WRITE:
+			return db ? db->db_xlog.xl_log_bytes_written - db->db_xlog.xl_log_bytes_read : 0;//db->db_xlog.xlog_bytes_to_write();
+		case XT_STAT_BYTES_TO_SWEEP:
+			/* This stat is potentially very expensive: */
+			return db ? xt_xn_bytes_to_sweep(db, xt_get_self()) : 0;
+		case XT_STAT_WAIT_FOR_XACT:
+			return stats->st_wait_for_xact;
+		case XT_STAT_XACT_TO_CLEAN:
+			return db ? db->db_xn_curr_id + 1 - db->db_xn_to_clean_id : 0;
+		case XT_STAT_SWEEPER_WAITS:
+			return db ? db->db_stat_sweep_waits : 0;
+
+		/* Scan and row counters: */
+		case XT_STAT_SCAN_INDEX:
+			return stats->st_scan_index;
+		case XT_STAT_SCAN_TABLE:
+			return stats->st_scan_table;
+		case XT_STAT_ROW_SELECT:
+			return stats->st_row_select;
+		case XT_STAT_ROW_INSERT:
+			return stats->st_row_insert;
+		case XT_STAT_ROW_UPDATE:
+			return stats->st_row_update;
+		case XT_STAT_ROW_DELETE:
+			return stats->st_row_delete;
+
+		case XT_STAT_RETRY_INDEX_SCAN:
+			return stats->st_retry_index_scan;
+		case XT_STAT_REREAD_REC_LIST:
+			return stats->st_reread_record_list;
+		default:
+			return 0;
+	}
+}
diff --git a/storage/pbxt/src/thread_xt.h b/storage/pbxt/src/thread_xt.h
new file mode 100644
index 00000000000..a07f7b7ae01
--- /dev/null
+++ b/storage/pbxt/src/thread_xt.h
@@ -0,0 +1,679 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#ifndef __xt_thread_h__
+#define __xt_thread_h__
+
+#include <stdio.h>
+#ifndef XT_WIN
+#include <sys/param.h>
+#endif
+#include <setjmp.h>
+
+#include "xt_defs.h"
+#include "xt_errno.h"
+#include "linklist_xt.h"
+#include "memory_xt.h"
+#include "xactlog_xt.h"
+#include "datalog_xt.h"
+#include "lock_xt.h"
+#include "locklist_xt.h"
+
+/*
+ * -----------------------------------------------------------------------
+ * Macros and defines
+ */
+
+#define XT_ERR_MSG_SIZE					(PATH_MAX + 200)
+
+#ifdef DEBUG
+#define ASSERT(expr)					((expr) ? TRUE : xt_assert(self, #expr, __FUNC__, __FILE__, __LINE__))
+#else
+#define ASSERT(expr)					((void) 0)
+#endif
+
+#ifdef DEBUG
+#define ASSUME(expr)					((expr) ? TRUE : xt_assume(self, #expr, __FUNC__, __FILE__, __LINE__))
+#else
+#define ASSUME(expr)					((void) 0)
+#endif
+
+#ifdef DEBUG
+#define ASSERT_NS(expr)					((expr) ? TRUE : xt_assert(NULL, #expr, __FUNC__, __FILE__, __LINE__))
+#else
+#define ASSERT_NS(expr)					((void) 0)
+#endif
+
+#define XT_THROW_ASSERTION(str)			xt_throw_assertion(self, __FUNC__, __FILE__, __LINE__, str)
+
+/* Log levels */
+#define XT_LOG_DEFAULT					-1
+#define XT_LOG_PROTOCOL					0
+#define XT_LOG_FATAL					1
+#define XT_LOG_ERROR					2
+#define XT_LOG_WARNING					3
+#define XT_LOG_INFO						4
+#define XT_LOG_TRACE					5
+
+#define XT_PROTOCOL						self, "", NULL, 0, XT_LOG_PROTOCOL
+#define XT_WARNING						self, "", NULL, 0, XT_LOG_WARNING
+#define XT_INFO							self, "", NULL, 0, XT_LOG_INFO
+#define XT_ERROR						self, "", NULL, 0, XT_LOG_ERROR
+#define XT_TRACE						self, "", NULL, 0, XT_LOG_TRACE
+
+#define XT_NT_PROTOCOL					NULL, "", NULL, 0, XT_LOG_PROTOCOL
+#define XT_NT_WARNING					NULL, "", NULL, 0, XT_LOG_WARNING
+#define XT_NT_INFO						NULL, "", NULL, 0, XT_LOG_INFO
+#define XT_NT_ERROR						NULL, "", NULL, 0, XT_LOG_ERROR
+#define XT_NT_TRACE						NULL, "", NULL, 0, XT_LOG_TRACE
+
+#define XT_ERROR_CONTEXT(func)			self, __FUNC__, __FILE__, __LINE__, XT_LOG_ERROR
+
+/* Thread types */
+#define XT_THREAD_MAIN					0
+#define XT_THREAD_WORKER				1
+
+/* Thread Priorities: */
+#define XT_PRIORITY_LOW					0
+#define XT_PRIORITY_NORMAL				1
+#define XT_PRIORITY_HIGH				2
+
+#define XT_CONTEXT						self, __FUNC__, __FILE__, __LINE__
+#define XT_NS_CONTEXT					NULL, __FUNC__, __FILE__, __LINE__
+#define XT_REG_CONTEXT					__FUNC__, __FILE__, __LINE__
+
+#define XT_MAX_JMP						20						/* Maximum try_() nesting depth (size of XTThread.t_jmp_env). */
+#define XT_MAX_CALL_STACK				100						/* The number of functions recorded by enter_() (size of XTThread.t_call_stack). */
+#define XT_RES_STACK_SIZE				4000					/* The size of the resource stack in bytes. */
+#define XT_MAX_RESOURCE_USAGE			5						/* The maximum number of temp slots used per routine. */
+#define XT_CATCH_TRACE_SIZE				1024					/* Size of XTException.e_catch_trace in bytes. */
+#define XT_MAX_FUNC_NAME_SIZE			120						/* Size of XTException.e_func_name in bytes. */
+#define XT_SOURCE_FILE_NAME_SIZE		40						/* Size of XTException.e_source_file in bytes. */
+#define XT_THR_NAME_SIZE				80						/* Size of XTThread.t_name in bytes. */
+
+#ifdef XT_THREAD_LOCK_INFO
+#define xt_init_rwlock_with_autoname(a,b)	xt_init_rwlock(a,b,LOCKLIST_ARG_SUFFIX(b))
+#else
+#define xt_init_rwlock_with_autoname(a,b)	xt_init_rwlock(a,b)
+#endif
+
+typedef struct XTException {
+	int						e_xt_err;									/* The XT error number (ALWAYS non-zero on error, else zero) */
+	int						e_sys_err;									/* The system error number (0 if none) */
+	char					e_err_msg[XT_ERR_MSG_SIZE];					/* The error message text (0 terminated string) */
+	char					e_func_name[XT_MAX_FUNC_NAME_SIZE];			/* The name of the function in which the exception occurred */
+	char					e_source_file[XT_SOURCE_FILE_NAME_SIZE];	/* The source file in which the exception was thrown */
+	u_int					e_source_line;								/* The source code line number on which the exception was thrown */
+	char					e_catch_trace[XT_CATCH_TRACE_SIZE];			/* A string of the catch trace. */
+} XTExceptionRec, *XTExceptionPtr;
+
+struct XTThread;
+struct XTSortedList;
+struct XTXactLog;
+struct XTXactData;
+struct XTDatabase;
+struct XTOpenTable;
+
+typedef void (*XTThreadFreeFunc)(struct XTThread *self, void *data);
+
+typedef struct XTResourceArgs {
+	void					*ra_p1;
+	xtWord4					ra_p2;
+} XTResourceArgsRec, *XTResourceArgsPtr;
+
+/* This structure represents a temporary resource on the resource stack.
+ * Resources are automatically freed if an exception occurs.
+ */
+typedef struct XTResource {
+	xtWord4					r_prev_size;					/* The size of the previous resource on the stack (must be first!) */
+	void					*r_data;						/* A pointer to the resource data (this may be on the resource stack) */
+	XTThreadFreeFunc		r_free_func;					/* The function used to free the resource. */
+} XTResourceRec, *XTResourcePtr;
+
+typedef struct XTJumpBuf {
+	XTResourcePtr			jb_res_top;
+	int						jb_call_top;
+	jmp_buf					jb_buffer;
+} XTJumpBufRec, *XTJumpBufPtr;
+
+typedef struct XTCallStack {
+	c_char					*cs_func;
+	c_char					*cs_file;
+	u_int					cs_line;
+} XTCallStackRec, *XTCallStackPtr;
+
+typedef struct XTIOStats {
+	u_int					ts_read;						/* The number of bytes read. */
+	u_int					ts_write;						/* The number of bytes written. */
+	xtWord8					ts_flush_time;					/* The accumulated flush time. */
+	xtWord8					ts_flush_start;					/* Start time, non-zero if a timer is running. */
+	u_int					ts_flush;						/* The number of flush operations. */
+} XTIOStatsRec, *XTIOStatsPtr;
+
+#define XT_ADD_STATS(x, y)	{ \
+	(x).ts_read += (y).ts_read; \
+	(x).ts_write += (y).ts_write; \
+	(x).ts_flush_time += (y).ts_flush_time; \
+	(x).ts_flush += (y).ts_flush; \
+}
+
+typedef struct XTStatistics {
+	u_int					st_commits;
+	u_int					st_rollbacks;
+	u_int					st_stat_read;
+	u_int					st_stat_write;
+
+	XTIOStatsRec			st_rec;
+	u_int					st_rec_cache_hit;
+	u_int					st_rec_cache_miss;
+	u_int					st_rec_cache_frees;
+
+	XTIOStatsRec			st_ind;
+	u_int					st_ind_cache_hit;
+	u_int					st_ind_cache_miss;
+	XTIOStatsRec			st_ilog;
+
+	XTIOStatsRec			st_xlog;
+	u_int					st_xlog_cache_hit;
+	u_int					st_xlog_cache_miss;
+
+	XTIOStatsRec			st_data;
+
+	XTIOStatsRec			st_x;
+
+	u_int					st_scan_index;
+	u_int					st_scan_table;
+	u_int					st_row_select;
+	u_int					st_row_insert;
+	u_int					st_row_update;
+	u_int					st_row_delete;
+
+	u_int					st_wait_for_xact;
+	u_int					st_retry_index_scan;
+	u_int					st_reread_record_list;
+	XTIOStatsRec			st_ind_flush_time;
+} XTStatisticsRec, *XTStatisticsPtr;
+
+/*
+ * PBXT supports COMMITTED READ and REPEATABLE READ.
+ *
+ * As Jim says, multi-versioning cannot implement SERIALIZABLE. Basically
+ * you need locking to do this. Although phantom reads do not occur with
+ * MVCC, it is still not serializable.
+ *
+ * This can be seen from the following example:
+ *
+ * T1: INSERT t1 VALUE (1, 1);
+ * T2: INSERT t1 VALUE (2, 2);
+ * T1: UPDATE t1 SET b = 3 WHERE a IN (1, 2);
+ * T2: UPDATE t1 SET b = 4 WHERE a IN (1, 2);
+ * Serialized result (T1, T2) or (T2, T1):
+ * a   b	or	a   b
+ * 1   4		1   3
+ * 2   4		2   3
+ * Non-serialized (MVCC) result:
+ * a   b
+ * 1   3
+ * 2   4
+ */
+#define XT_XACT_UNCOMMITTED_READ	0
+#define XT_XACT_COMMITTED_READ		1
+#define XT_XACT_REPEATABLE_READ		2						/* Guarantees rows already read will not change. */
+#define XT_XACT_SERIALIZABLE		3						/* Not implementable with MVCC alone; see the note above. */
+
+typedef struct XTThread {
+	XTLinkedItemRec			t_links;						/* Required to be a member of a double-linked list. */
+
+	char					t_name[XT_THR_NAME_SIZE];		/* The name of the thread. */
+	xtBool					t_main;							/* TRUE if this is the main (initial) thread */
+	xtBool					t_quit;							/* TRUE if this thread should stop running. */
+	xtBool					t_daemon;						/* TRUE if this thread is a daemon. */
+	xtThreadID				t_id;							/* The thread ID (0=main), index into thread array. */
+	pthread_t				t_pthread;						/* The pthread associated with xt thread */
+	xtBool					t_disable_interrupts;			/* TRUE if interrupts are disabled. */
+	int						t_delayed_signal;				/* Throw this signal as soon as you can! */
+
+	void					*t_data;						/* Data passed to the thread. */
+	XTThreadFreeFunc		t_free_data;					/* Routine used to free the thread data */
+
+	int						t_call_top;						/* Index of the next free entry in t_call_stack (see enter_()). */
+	XTCallStackRec			t_call_stack[XT_MAX_CALL_STACK];/* Records the function under execution (to be output on error). */
+
+	XTResourcePtr			t_res_top;						/* The top of the resource stack (reference next free space). */
+	union {
+		char				t_res_stack[XT_RES_STACK_SIZE];	/* Temporary data to be freed if an exception occurs. */
+		xtWord4				t_align_res_stack;
+	} x;
+
+	int						t_jmp_depth;					/* The current jump depth */
+	XTJumpBufRec			t_jmp_env[XT_MAX_JMP];			/* The process environment to be restored on exception */
+	XTExceptionRec			t_exception;					/* The exception details. */
+
+	xt_cond_type			t_cond;							/* The pthread condition used for suspending the thread. */
+	xt_mutex_type			t_lock;							/* Thread lock, used for operations on a thread that may be done by other threads.
+															 * for example xt_unuse_database().
+															 */
+	
+	/* Application specific data: */
+	struct XTDatabase		*st_database;					/* The database in use by the thread. */
+	u_int					st_lock_count;					/* We count the number of locks MySQL has set in order to know when they are all released. */
+	u_int					st_stat_count;					/* start statement count. */
+	struct XTXactData		*st_xact_data;					/* The transaction data, not NULL if the transaction performs an update. */
+	xtBool					st_xact_writer;					/* TRUE if the transaction has written something to the log. */
+	time_t					st_xact_write_time;				/* Approximate first write time (uses xt_db_approximate_time). */
+	xtBool					st_xact_long_running;			/* TRUE if this is a long running writer transaction. */
+	xtWord4					st_visible_time;				/* Transactions committed before this time are visible. */
+	XTDataLogBufferRec		st_dlog_buf;
+	
+	/* A list of the last 10 transactions run by this connection: */
+#ifdef XT_WAIT_FOR_CLEANUP
+	u_int					st_last_xact;
+	xtXactID				st_prev_xact[XT_MAX_XACT_BEHIND];
+#endif
+
+	int						st_xact_mode;					/* The transaction mode. */
+	xtBool					st_ignore_fkeys;				/* TRUE if we must ignore foreign keys. */
+	xtBool					st_auto_commit;					/* TRUE if this is an auto-commit transaction. */
+	xtBool					st_table_trans;					/* TRUE if the transaction is a result of LOCK TABLES. */
+	xtBool					st_abort_trans;					/* TRUE if the transaction should be aborted. */
+	xtBool					st_stat_ended;					/* TRUE if the statement was ended. */
+	xtBool					st_stat_trans;					/* TRUE if a statement transaction is running (started on UPDATE). */
+	xtBool					st_stat_modify;					/* TRUE if the statement is an INSERT/UPDATE/DELETE */
+#ifdef XT_IMPLEMENT_NO_ACTION
+	XTBasicListRec			st_restrict_list;				/* These records have been deleted and should have no reference. */
+#endif
+	/* Local thread list. */
+	u_int					st_thread_list_count;
+	u_int					st_thread_list_size;
+	xtThreadID				*st_thread_list;
+
+	/* Used to prevent a record from being updated twice in one statement. */
+	struct XTOpenTable		*st_is_update;					/* Non-NULL if this is an UPDATE statement (presumably the table being updated - confirm).  {UPDATE-STACK} */
+
+	XTRowLockListRec		st_lock_list;					/* The thread row lock list (drop locks on transaction end). */
+	XTStatisticsRec			st_statistics;					/* Accumulated statistics for this thread. */
+#ifdef XT_THREAD_LOCK_INFO
+	/* list of locks (spins, mutexes, etc) that this thread currently holds (debugging) */
+	XTThreadLockInfoPtr		st_thread_lock_list[XT_THREAD_LOCK_INFO_MAX_COUNT];
+	int						st_thread_lock_count;
+#endif
+} XTThreadRec, *XTThreadPtr;
+
+/*
+ * -----------------------------------------------------------------------
+ * Call stack
+ */
+
+#define XT_INIT_CHECK_STACK		char xt_chk_buffer[512]; memset(xt_chk_buffer, 0xFE, 512);
+#define XT_RE_CHECK_STACK		memset(xt_chk_buffer, 0xFE, 512);
+
+/*
+ * This macro must be placed at the start of every function.
+ * It records the current context so that we can
+ * dump a type of stack trace later if necessary.
+ *
+ * It also sets up the current thread pointer 'self'.
+ */
+#ifdef DEBUG
+#define XT_STACK_TRACE
+#endif
+
+/*
+ * These macros generate a stack trace which can be used
+ * to locate an error on exception.
+ */
+#ifdef XT_STACK_TRACE
+
+/*
+ * Place this call at the top of a function,
+ * after the declaration of local variable, and
+ * before the first code is executed.
+ */
+#define enter_()			int xt_frame = self->t_call_top++; \
+							do { \
+								if (xt_frame < XT_MAX_CALL_STACK) { \
+									self->t_call_stack[xt_frame].cs_func = __FUNC__; \
+									self->t_call_stack[xt_frame].cs_file = __FILE__; \
+									self->t_call_stack[xt_frame].cs_line = __LINE__; \
+								} \
+							} while (0)
+
+#define outer_()			self->t_call_top = xt_frame;
+
+/*
+ * On exit to a function, either exit_() or
+ * return_() must be called.
+ */
+#define exit_()				do { \
+								outer_(); \
+								return; \
+							} while (0)
+	
+#define return_(x)			do { \
+								outer_(); \
+								return(x); \
+							} while (0)
+
+#define returnc_(x, typ)	do { \
+								typ rv; \
+								rv = (x); \
+								outer_(); \
+								return(rv); \
+							} while (0)
+
+/*
+ * Sets the line number before a call to get a better stack trace.
+ * Frames at depth >= XT_MAX_CALL_STACK have no slot (see enter_()) and are skipped.
+ */
+#define call_(x)			do { if (xt_frame < XT_MAX_CALL_STACK) self->t_call_stack[xt_frame].cs_line = __LINE__; x; } while (0)
+
+#else
+#define enter_()
+#define outer_()
+#define exit_()				return;
+#define return_(x)			return (x)
+#define returnc_(x, typ)	return (x)
+#define call_(x)			x
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Throwing and catching
+ */
+
+int prof_setjmp(void);
+
+#define TX_CHK_JMP()		if ((self)->t_jmp_depth < 0 || (self)->t_jmp_depth >= XT_MAX_JMP) xt_throw_xterr(self, __FUNC__, __FILE__, __LINE__, XT_ERR_JUMP_OVERFLOW)
+#ifdef PROFILE
+#define profile_setjmp		prof_setjmp()
+#else
+#define profile_setjmp			
+#endif
+
+#define try_(n)				TX_CHK_JMP(); \
+							(self)->t_jmp_env[(self)->t_jmp_depth].jb_res_top = (self)->t_res_top; \
+							(self)->t_jmp_env[(self)->t_jmp_depth].jb_call_top = (self)->t_call_top; \
+							(self)->t_jmp_depth++; profile_setjmp; if (setjmp((self)->t_jmp_env[(self)->t_jmp_depth-1].jb_buffer)) goto catch_##n;
+#define catch_(n)			(self)->t_jmp_depth--; goto cont_##n; catch_##n: (self)->t_jmp_depth--; xt_caught(self);
+#define cont_(n)			cont_##n:
+#define throw_()			xt_throw(self)
+
+/*
+ * -----------------------------------------------------------------------
+ * Resource stack
+ */
+
+//#define DEBUG_RESOURCE_STACK
+
+#ifdef DEBUG_RESOURCE_STACK
+#define CHECK_RS			if ((char *) (self)->t_res_top < (self)->x.t_res_stack) xt_bug(self);
+#define CHECK_NS_RS			{ XTThreadPtr self = xt_get_self(); CHECK_RS; }
+#else
+#define CHECK_RS			remove this!
+#define CHECK_NS_RS			remove this!
+#endif
+
+/*
+ * Allocate a resource on the resource stack. The resource will be freed
+ * automatically if an exception occurs. Before exiting the current
+ * procedure you must free the resource using popr_() or freer_().
+ * v = value to be set to the resource, f = function which frees the resource,
+ * s = the size of the resource in bytes, t = the pointer type of v.
+ * Throws XT_ERR_RES_STACK_OVERFLOW unless header + s + the next r_prev_size (4 bytes) fit.
+ */
+
+/* GOTCHA: My experience is that constructs such as *((xtWordPS *) &(v)) = (xtWordPS) (x)
+ * cause optimised versions to crash?!
+ */
+#define allocr_(v, f, s, t)		do { \
+									if ((size_t) (((char *) (self)->t_res_top) - (self)->x.t_res_stack) + sizeof(XTResourceRec) + (size_t) (s) + 4 > XT_RES_STACK_SIZE) \
+										xt_throw_xterr(self, __FUNC__, __FILE__, __LINE__, XT_ERR_RES_STACK_OVERFLOW); \
+									v = (t) (((char *) (self)->t_res_top) + sizeof(XTResourceRec)); \
+									(self)->t_res_top->r_data = (v); \
+									(self)->t_res_top->r_free_func = (XTThreadFreeFunc) (f); \
+									(self)->t_res_top = (XTResourcePtr) (((char *) (self)->t_res_top) + sizeof(XTResourceRec) + (s)); \
+									(self)->t_res_top->r_prev_size = sizeof(XTResourceRec) + (s); \
+								} while (0)
+
+#define alloczr_(v, f, s, t)	do { allocr_(v, f, s, t); \
+									memset(v, 0, s); } while (0)
+
+/* Push and set a resource:
+ * v = value to be set to the resource,
+ * f = function which frees the resource,
+ * r = the resource (throws XT_ERR_RES_STACK_OVERFLOW if the stack is full),
+ * NOTE: the expression (r) must come first because it may contain
+ * calls which use the resource stack!!
+ */
+#define pushsr_(v, f, r)	do { \
+								if ((size_t) (((char *) (self)->t_res_top) - (self)->x.t_res_stack) + sizeof(XTResourceRec) + 4 > XT_RES_STACK_SIZE) \
+									xt_throw_xterr(self, __FUNC__, __FILE__, __LINE__, XT_ERR_RES_STACK_OVERFLOW); \
+								v = (r); \
+								(self)->t_res_top->r_data = (v); \
+								(self)->t_res_top->r_free_func = (XTThreadFreeFunc) (f); \
+								(self)->t_res_top = (XTResourcePtr) (((char *) (self)->t_res_top) + sizeof(XTResourceRec)); \
+								(self)->t_res_top->r_prev_size = sizeof(XTResourceRec); \
+							} while (0)
+
+/* Push a resource. In the event of an exception it will be freed
+ * by the free routine. Throws XT_ERR_RES_STACK_OVERFLOW if the stack is full.
+ * f = function which frees the resource,
+ * r = a pointer to the resource,
+ */
+#define pushr_(f, r)		do { \
+								if ((size_t) (((char *) (self)->t_res_top) - (self)->x.t_res_stack) + sizeof(XTResourceRec) + 4 > XT_RES_STACK_SIZE) \
+									xt_throw_xterr(self, __FUNC__, __FILE__, __LINE__, XT_ERR_RES_STACK_OVERFLOW); \
+								(self)->t_res_top->r_data = (r); \
+								(self)->t_res_top->r_free_func = (XTThreadFreeFunc) (f); \
+								(self)->t_res_top = (XTResourcePtr) (((char *) (self)->t_res_top) + sizeof(XTResourceRec)); \
+								(self)->t_res_top->r_prev_size = sizeof(XTResourceRec); \
+							} while (0)
+
+/* Pop a resource without freeing it: */
+#ifdef DEBUG_RESOURCE_STACK
+#define popr_()				do { \
+								(self)->t_res_top = (XTResourcePtr) (((char *) (self)->t_res_top) - (self)->t_res_top->r_prev_size); \
+								if ((char *) (self)->t_res_top < (self)->x.t_res_stack) \
+									xt_bug(self); \
+							} while (0)
+#else
+#define popr_()				do { (self)->t_res_top = (XTResourcePtr) (((char *) (self)->t_res_top) - (self)->t_res_top->r_prev_size); } while (0)
+#endif
+
+#define setr_(r)			do { ((XTResourcePtr) (((char *) (self)->t_res_top) - (self)->t_res_top->r_prev_size))->r_data = (r); } while (0)
+
+/* Pop and free a resource: */
+#ifdef DEBUG_RESOURCE_STACK
+#define freer_()			do {  \
+								register XTResourcePtr	rp; \
+								rp = (XTResourcePtr) (((char *) (self)->t_res_top) - (self)->t_res_top->r_prev_size); \
+								if ((char *) rp < (self)->x.t_res_stack) \
+									xt_bug(self); \
+								(rp->r_free_func)((self), rp->r_data); \
+								(self)->t_res_top = rp; \
+							} while (0)
+#else
+#define freer_()			do {  \
+								register XTResourcePtr	rp; \
+								rp = (XTResourcePtr) (((char *) (self)->t_res_top) - (self)->t_res_top->r_prev_size); \
+								(rp->r_free_func)((self), rp->r_data); \
+								(self)->t_res_top = rp; \
+							} while (0)
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * Thread globals
+ */
+
+extern u_int			xt_thr_maximum_threads;
+extern u_int			xt_thr_current_thread_count;
+extern u_int			xt_thr_current_max_threads;
+extern struct XTThread	**xt_thr_array;
+
+/*
+ * -----------------------------------------------------------------------
+ * Function prototypes
+ */
+
+/* OpenSolaris has thr_main in /usr/include/thread.h (name conflict)
+ * Thanks for the tip Monty!
+ */
+extern "C" void *xt_thread_main(void *data);
+
+void			xt_get_now(char *buffer, size_t len);
+xtBool			xt_init_logging(void);
+void			xt_exit_logging(void);
+void			xt_log_flush(XTThreadPtr self);
+void			xt_logf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, c_char *fmt, ...);
+void			xt_log(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, c_char *string);
+int				xt_log_errorf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, int xt_err, int sys_err, c_char *fmt, ...);
+int				xt_log_error(XTThreadPtr self, c_char *func, c_char *file, u_int line, int level, int xt_err, int sys_err, c_char *string);
+void			xt_log_exception(XTThreadPtr self, XTExceptionPtr e, int level);
+void			xt_clear_exception(XTThreadPtr self);
+void			xt_log_and_clear_exception(XTThreadPtr self);
+void			xt_log_and_clear_exception_ns(void);
+void			xt_log_and_clear_warning(XTThreadPtr self);
+void			xt_log_and_clear_warning_ns(void);
+
+void			xt_bug(XTThreadPtr self);
+void			xt_caught(XTThreadPtr self);
+void			xt_throw(XTThreadPtr self);
+void			xt_throwf(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *format, ...);
+void			xt_throw_error(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *message);
+void			xt_throw_i2xterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item, c_char *item2);
+void			xt_throw_ixterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item);
+void			xt_throw_tabcolerr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item, c_char *item2);
+void			xt_throw_taberr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item);
+void			xt_throw_ulxterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, u_long value);
+void			xt_throw_sulxterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, c_char *item, u_long value);
+void			xt_throw_xterr(XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err);
+void			xt_throw_errno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err_no);
+void			xt_throw_ferrno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err_no, c_char *path);
+void			xt_throw_assertion(XTThreadPtr self, c_char *func, c_char *file, u_int line, c_char *str);
+void			xt_throw_signal(XTThreadPtr self, c_char *func, c_char *file, u_int line, int sig);
+xtBool			xt_throw_delayed_signal(XTThreadPtr self, c_char *func, c_char *file, u_int line);
+
+void			xt_registerf(c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, ...);
+void			xt_register_i2xterr(c_char *func, c_char *file, u_int line, int xt_err, c_char *item, c_char *item2);
+void			xt_register_ixterr(c_char *func, c_char *file, u_int line, int xt_err, c_char *item);
+void			xt_register_tabcolerr(c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item, c_char *item2);
+void			xt_register_taberr(c_char *func, c_char *file, u_int line, int xt_err, XTPathStrPtr tab_item);
+void			xt_register_ulxterr(c_char *func, c_char *file, u_int line, int xt_err, u_long value);
+xtBool			xt_register_ferrno(c_char *func, c_char *file, u_int line, int err, c_char *path);
+void			xt_register_error(c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *msg);
+xtBool			xt_register_errno(c_char *func, c_char *file, u_int line, int err);
+void			xt_register_xterr(c_char *func, c_char *file, u_int line, int xt_err);
+
+void			xt_exceptionf(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *fmt, ...);
+void			xt_exception_error(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err, int sys_err, c_char *msg);
+xtBool			xt_exception_errno(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int err);
+void			xt_exception_xterr(XTExceptionPtr e, XTThreadPtr self, c_char *func, c_char *file, u_int line, int xt_err);
+
+void			xt_log_errno(XTThreadPtr self, c_char *func, c_char *file, u_int line, int err);
+
+xtBool			xt_assert(XTThreadPtr self, c_char *expr, c_char *func, c_char *file, u_int line);
+xtBool			xt_assume(XTThreadPtr self, c_char *expr, c_char *func, c_char *file, u_int line);
+
+XTThreadPtr		xt_init_threading(u_int max_threads);
+void			xt_exit_threading(XTThreadPtr self);
+
+XTThreadPtr		xt_create_thread(c_char *name, xtBool main_thread, xtBool temp_thread, XTExceptionPtr e);
+XTThreadPtr		xt_create_daemon(XTThreadPtr parent, c_char *name);
+void			xt_free_thread(XTThreadPtr self);
+void			xt_set_thread_data(XTThreadPtr self, void *data, XTThreadFreeFunc free_func);
+pthread_t		xt_run_thread(XTThreadPtr parent, XTThreadPtr child, void *(*start_routine)(XTThreadPtr));
+void			xt_exit_thread(XTThreadPtr self, void *result);
+void			*xt_wait_for_thread(xtThreadID tid, xtBool ignore_error);
+void			xt_signal_all_threads(XTThreadPtr self, int sig);
+void			xt_do_to_all_threads(XTThreadPtr self, void (*do_func_ptr)(XTThreadPtr self, XTThreadPtr to_thr, void *thunk), void *thunk);
+void			xt_kill_thread(pthread_t t1);
+XTThreadPtr		xt_get_self(void);
+void			xt_set_self(XTThreadPtr self);
+void			xt_wait_for_all_threads(XTThreadPtr self);
+void			xt_busy_wait(void);
+void			xt_critical_wait(void);
+void			xt_yield(void);
+void			xt_sleep_milli_second(u_int t);
+xtBool 			xt_suspend(XTThreadPtr self);
+xtBool			xt_unsuspend(XTThreadPtr target);
+void			xt_lock_thread(XTThreadPtr thread);
+void			xt_unlock_thread(XTThreadPtr thread);
+xtBool			xt_wait_thread(XTThreadPtr thread);
+void			xt_signal_thread(XTThreadPtr target);
+void 			xt_terminate_thread(XTThreadPtr self, XTThreadPtr target);
+xtProcID		xt_getpid();
+xtBool			xt_process_exists(xtProcID pid);
+
+#ifdef XT_THREAD_LOCK_INFO
+#define	xt_init_rwlock_with_autoname(a,b) xt_init_rwlock(a,b,LOCKLIST_ARG_SUFFIX(b))
+xtBool xt_init_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock, const char *name);
+#else
+#define	xt_init_rwlock_with_autoname(a,b) xt_init_rwlock(a,b)
+xtBool xt_init_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock);
+#endif
+
+void			xt_free_rwlock(xt_rwlock_type *rwlock);
+xt_rwlock_type	*xt_slock_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock);
+xt_rwlock_type	*xt_xlock_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock);
+void			xt_unlock_rwlock(XTThreadPtr self, xt_rwlock_type *rwlock);
+
+xt_mutex_type	*xt_new_mutex(XTThreadPtr self);
+void			xt_delete_mutex(XTThreadPtr self, xt_mutex_type *mx);
+#ifdef XT_THREAD_LOCK_INFO
+#define			xt_init_mutex_with_autoname(a,b) xt_init_mutex(a,b,LOCKLIST_ARG_SUFFIX(b))
+xtBool			xt_init_mutex(XTThreadPtr self, xt_mutex_type *mx, const char *name);
+#else
+#define			xt_init_mutex_with_autoname(a,b) xt_init_mutex(a,b)
+xtBool			xt_init_mutex(XTThreadPtr self, xt_mutex_type *mx);
+#endif
+void			xt_free_mutex(xt_mutex_type *mx);
+xtBool			xt_lock_mutex(XTThreadPtr self, xt_mutex_type *mx);
+void			xt_unlock_mutex(XTThreadPtr self, xt_mutex_type *mx);
+
+pthread_cond_t	*xt_new_cond(XTThreadPtr self);
+void			xt_delete_cond(XTThreadPtr self, pthread_cond_t *cond);
+
+xtBool			xt_init_cond(XTThreadPtr self, pthread_cond_t *cond);
+void			xt_free_cond(pthread_cond_t *cond);
+xtBool			xt_wait_cond(XTThreadPtr self, pthread_cond_t *cond, xt_mutex_type *mutex);
+xtBool			xt_timed_wait_cond(XTThreadPtr self, pthread_cond_t *cond, xt_mutex_type *mutex, u_long milli_sec);
+xtBool			xt_signal_cond(XTThreadPtr self, pthread_cond_t *cond);
+void			xt_broadcast_cond(XTThreadPtr self, pthread_cond_t *cond);
+xtBool			xt_broadcast_cond_ns(xt_cond_type *cond);
+
+xtBool			xt_set_key(pthread_key_t key, const void *value, XTExceptionPtr e);
+void			*xt_get_key(pthread_key_t key);
+
+void			xt_set_low_priority(XTThreadPtr self);
+void			xt_set_normal_priority(XTThreadPtr self);
+void			xt_set_high_priority(XTThreadPtr self);
+void			xt_set_priority(XTThreadPtr self, int priority);
+
+void			xt_gather_statistics(XTStatisticsPtr stats);
+u_llong			xt_get_statistic(XTStatisticsPtr stats, struct XTDatabase *db, u_int rec_id);
+
+#define xt_timed_wait_cond_ns(a, b, c)	xt_timed_wait_cond(NULL, a, b, c)
+
+#endif
+
diff --git a/storage/pbxt/src/trace_xt.cc b/storage/pbxt/src/trace_xt.cc
new file mode 100644
index 00000000000..709ff71addc
--- /dev/null
+++ b/storage/pbxt/src/trace_xt.cc
@@ -0,0 +1,404 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-07	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "trace_xt.h"
+#include "pthread_xt.h"
+#include "thread_xt.h"
+
+#ifdef DEBUG
+//#define PRINT_TRACE
+//#define RESET_AFTER_DUMP
+//#define DUMP_TO_STDOUT
+#endif
+
+static xtBool			trace_initialized = FALSE;
+static xt_mutex_type	trace_mutex;
+static size_t			trace_log_size;
+static size_t			trace_log_offset;
+static size_t			trace_log_end;
+static char				*trace_log_buffer;
+static u_long			trace_stat_count;
+static FILE				*trace_dump_file;
+static xtBool			trace_flush_dump = FALSE;
+
+/* Size of the in-memory trace log buffer: 40 MiB (the original read 1204, a typo for 1024). */
+#define DEFAULT_TRACE_LOG_SIZE		(40*1024*1024)
+/* Head-room that xt_trace()/xt_ttracef() require before the end of the buffer; with less they restart at offset 0. */
+#define MAX_PRINT_LEN				2000
+
+/*
+ * Prepare the trace facility: initialise the trace mutex, allocate the
+ * log buffer and reset the buffer bookkeeping.
+ * Returns TRUE on success. On failure the error is logged and FALSE
+ * is returned.
+ */
+xtPublic xtBool xt_init_trace(void)
+{
+	int mutex_err = xt_p_mutex_init_with_autoname(&trace_mutex, NULL);
+
+	if (mutex_err) {
+		xt_log_errno(XT_NS_CONTEXT, mutex_err);
+		trace_initialized = FALSE;
+		return FALSE;
+	}
+
+	/* From here on xt_exit_trace() has a mutex to release. */
+	trace_initialized = TRUE;
+
+	/* One byte more than the log size is allocated. */
+	trace_log_buffer = (char *) malloc(DEFAULT_TRACE_LOG_SIZE+1);
+	if (trace_log_buffer == NULL) {
+		xt_log_errno(XT_NS_CONTEXT, ENOMEM);
+		xt_exit_trace();
+		return FALSE;
+	}
+
+	trace_stat_count = 0;
+	trace_log_end = 0;
+	trace_log_offset = 0;
+	trace_log_size = DEFAULT_TRACE_LOG_SIZE;
+
+#ifdef XT_TRACK_CONNECTIONS
+	/* Label every connection tracking slot with its own index. */
+	for (int slot = 0; slot < XT_TRACK_MAX_CONNS; slot++)
+		xt_track_conn_info[slot].cu_t_id = slot;
+#endif
+
+	return TRUE;
+}
+
+/*
+ * Shut the trace facility down: release the mutex and the log buffer
+ * (if tracing was initialised) and close the xt_ftracef() dump file
+ * (if one is open). Debug builds dump the buffered trace first.
+ */
+xtPublic void xt_exit_trace(void)
+{
+	if (trace_initialized) {
+#ifdef DEBUG
+		xt_dump_trace();
+#endif
+		xt_free_mutex(&trace_mutex);
+		trace_initialized = FALSE;
+
+		free(trace_log_buffer);		/* free(NULL) is a no-op */
+		trace_log_buffer = NULL;
+
+		trace_log_size = 0;
+		trace_log_end = 0;
+		trace_log_offset = 0;
+		trace_stat_count = 0;
+	}
+
+	if (trace_dump_file != NULL) {
+		fclose(trace_dump_file);
+		trace_dump_file = NULL;
+	}
+}
+
+/*
+ * Print the buffered trace text to stdout and empty the buffer.
+ * Does nothing when no text has been written since the last reset.
+ */
+xtPublic void xt_print_trace(void)
+{
+	if (!trace_log_offset)
+		return;
+
+	xt_lock_mutex_ns(&trace_mutex);
+
+	/* Text beyond the current write position, left over from before the last wrap, goes out first. */
+	if (trace_log_offset+1 < trace_log_end) {
+		trace_log_buffer[trace_log_end] = 0;
+		printf("%s", trace_log_buffer + trace_log_offset + 1);
+	}
+
+	/* Then the text from the start of the buffer up to the write position. */
+	trace_log_buffer[trace_log_offset] = 0;
+	printf("%s", trace_log_buffer);
+
+	trace_log_end = 0;
+	trace_log_offset = 0;
+	xt_unlock_mutex_ns(&trace_mutex);
+}
+
+/*
+ * Write the buffered trace text to "pbxt.log" (or to stdout when
+ * DUMP_TO_STDOUT is defined), then flush and close the xt_ftracef()
+ * dump file if one is open.
+ *
+ * The trace mutex is now taken on every build variant. Previously the
+ * DUMP_TO_STDOUT variant unlocked a mutex it had never locked.
+ */
+xtPublic void xt_dump_trace(void)
+{
+	if (trace_log_offset) {
+#ifndef DUMP_TO_STDOUT
+		FILE *fp;
+
+		/* Opened before the lock is taken; if this fails the dump is skipped. */
+		fp = fopen("pbxt.log", "w");
+#endif
+
+		xt_lock_mutex_ns(&trace_mutex);
+#ifdef DUMP_TO_STDOUT
+		if (trace_log_end > trace_log_offset+1) {
+			trace_log_buffer[trace_log_end] = 0;
+			printf("%s", trace_log_buffer + trace_log_offset + 1);
+		}
+		trace_log_buffer[trace_log_offset] = 0;
+		printf("%s", trace_log_buffer);
+		printf("\n");
+#else
+		if (fp) {
+			/* Text left over from before the last wrap first, then the text from the start of the buffer. */
+			if (trace_log_end > trace_log_offset+1) {
+				trace_log_buffer[trace_log_end] = 0;
+				fprintf(fp, "%s", trace_log_buffer + trace_log_offset + 1);
+			}
+			trace_log_buffer[trace_log_offset] = 0;
+			fprintf(fp, "%s", trace_log_buffer);
+			fclose(fp);
+		}
+#endif
+
+#ifdef RESET_AFTER_DUMP
+		trace_log_offset = 0;
+		trace_log_end = 0;
+		trace_stat_count = 0;
+#endif
+		xt_unlock_mutex_ns(&trace_mutex);
+	}
+
+	if (trace_dump_file) {
+		xt_lock_mutex_ns(&trace_mutex);
+		/* Re-check under the lock: the file may have been closed in the meantime. */
+		if (trace_dump_file) {
+			fflush(trace_dump_file);
+			fclose(trace_dump_file);
+			trace_dump_file = NULL;
+		}
+		xt_unlock_mutex_ns(&trace_mutex);
+	}
+}
+
+/*
+ * Append a printf-style formatted message to the in-memory trace log.
+ * When fewer than MAX_PRINT_LEN bytes remain, writing restarts at the
+ * beginning of the buffer. A message that does not fit in the space
+ * that is left is truncated.
+ */
+xtPublic void xt_trace(const char *fmt, ...)
+{
+	va_list	ap;
+	size_t	avail;
+	size_t	len;
+	int		written;
+
+	va_start(ap, fmt);
+	xt_lock_mutex_ns(&trace_mutex);
+
+	if (trace_log_offset + MAX_PRINT_LEN > trace_log_size) {
+		/* Start at the beginning of the buffer again: */
+		trace_log_end = trace_log_offset;
+		trace_log_offset = 0;
+	}
+
+	avail = trace_log_size - trace_log_offset;
+	written = vsnprintf(trace_log_buffer + trace_log_offset, avail, fmt, ap);
+
+	/* vsnprintf() returns a negative value on error and the untruncated
+	 * length when the output was cut short. Advance only by what was
+	 * actually stored, so the offset never leaves the buffer.
+	 */
+	if (written < 0)
+		len = 0;
+	else if ((size_t) written >= avail)
+		len = avail ? avail - 1 : 0;
+	else
+		len = (size_t) written;
+	trace_log_offset += len;
+
+	xt_unlock_mutex_ns(&trace_mutex);
+	va_end(ap);
+
+#ifdef PRINT_TRACE
+	xt_print_trace();
+#endif
+}
+
+/*
+ * Append a formatted message to the in-memory trace log, prefixed with
+ * a running message number and the name of the calling thread.
+ * A NULL self is replaced by xt_get_self(), as in xt_ttraceq().
+ * Both the prefix and the message are bounded by the space left in the
+ * buffer; output that does not fit is truncated.
+ */
+xtPublic void xt_ttracef(XTThreadPtr self, char *fmt, ...)
+{
+	va_list	ap;
+	size_t	avail;
+	int		written;
+
+	if (!self)
+		self = xt_get_self();
+
+	va_start(ap, fmt);
+	xt_lock_mutex_ns(&trace_mutex);
+
+	if (trace_log_offset + MAX_PRINT_LEN > trace_log_size) {
+		/* Start at the beginning of the buffer again: */
+		trace_log_end = trace_log_offset;
+		trace_log_offset = 0;
+	}
+
+	trace_stat_count++;
+
+	/* The prefix: "<count> <thread name>: " */
+	avail = trace_log_size - trace_log_offset;
+	written = snprintf(trace_log_buffer + trace_log_offset, avail, "%lu %s: ", trace_stat_count, self->t_name);
+	if (written < 0)
+		written = 0;
+	else if ((size_t) written >= avail)
+		written = avail ? (int) (avail - 1) : 0;
+	trace_log_offset += (size_t) written;
+
+	/* The message itself. The return value is clamped the same way,
+	 * because on truncation it is the length that would have been
+	 * written, not the length that was stored.
+	 */
+	avail = trace_log_size - trace_log_offset;
+	written = vsnprintf(trace_log_buffer + trace_log_offset, avail, fmt, ap);
+	if (written < 0)
+		written = 0;
+	else if ((size_t) written >= avail)
+		written = avail ? (int) (avail - 1) : 0;
+	trace_log_offset += (size_t) written;
+
+	xt_unlock_mutex_ns(&trace_mutex);
+	va_end(ap);
+
+#ifdef PRINT_TRACE
+	xt_print_trace();
+#endif
+}
+
+/*
+ * Append a query string to the in-memory trace log on a single line,
+ * prefixed with a running message number and the thread name. Every
+ * run of spaces, '\n' and '\r' in the query is written as one space.
+ * A NULL self is replaced by xt_get_self().
+ */
+xtPublic void xt_ttraceq(XTThreadPtr self, char *query)
+{
+	size_t	query_len = strlen(query);
+	size_t	prefix_len;
+	size_t	copied = 0;
+	char	*dst;
+
+	if (self == NULL)
+		self = xt_get_self();
+
+	xt_lock_mutex_ns(&trace_mutex);
+
+	/* 100 bytes are reserved on top of the query for the prefix and terminator. */
+	if (trace_log_offset + query_len + 100 >= trace_log_size) {
+		/* Start at the beginning of the buffer again: */
+		trace_log_end = trace_log_offset;
+		trace_log_offset = 0;
+	}
+
+	trace_stat_count++;
+	prefix_len = (size_t) sprintf(trace_log_buffer + trace_log_offset, "%lu %s: ", trace_stat_count, self->t_name);
+	trace_log_offset += prefix_len;
+
+	dst = trace_log_buffer + trace_log_offset;
+	for (char *src = query; *src; copied++) {
+		if (*src == '\n' || *src == '\r' || *src == ' ') {
+			/* Emit one space, then skip the rest of the whitespace run. */
+			dst[copied] = ' ';
+			do {
+				src++;
+			} while (*src == '\n' || *src == '\r' || *src == ' ');
+		}
+		else
+			dst[copied] = *src++;
+	}
+
+	trace_log_offset += copied;
+	trace_log_buffer[trace_log_offset] = '\n';
+	trace_log_buffer[trace_log_offset + 1] = '\0';
+	trace_log_offset++;
+
+	xt_unlock_mutex_ns(&trace_mutex);
+
+#ifdef PRINT_TRACE
+	xt_print_trace();
+#endif
+}
+
+/*
+ * Returns the time in microseconds (1/1000000 of a second) that has
+ * passed since the trace clock was started. The call that starts the
+ * clock returns 0.
+ */
+xtPublic xtWord8 xt_trace_clock(void)
+{
+	static xtWord8	first_reading = 0;
+	xtWord8			reading;
+
+#ifdef XT_WIN
+	/* Tick count scaled by 1000 to give microseconds. */
+	reading = ((xtWord8) GetTickCount()) * (xtWord8) 1000;
+#else
+	struct timeval	tv;
+
+	gettimeofday(&tv, NULL);
+	reading = (xtWord8) tv.tv_sec * (xtWord8) 1000000 + tv.tv_usec;
+#endif
+	if (!first_reading) {
+		/* No start time recorded yet: this reading becomes the start. */
+		first_reading = reading;
+		return 0;
+	}
+	return reading - first_reading;
+}
+
+/*
+ * Format the current trace clock as "<seconds>.<microseconds>".
+ * The text is written to ptr, or to an internal static buffer when
+ * ptr is NULL. Returns the buffer that was written.
+ */
+xtPublic char *xt_trace_clock_str(char *ptr)
+{
+	static char	fallback[50];
+	xtWord8		elapsed = xt_trace_clock();
+	int			secs = (int) (elapsed / (xtWord8) 1000000);
+	int			usecs = (int) (elapsed % (xtWord8) 1000000);
+	char		*out = ptr ? ptr : fallback;
+
+	sprintf(out, "%d.%06d", secs, usecs);
+	return out;
+}
+
+/*
+ * Format the current trace clock as "<seconds>.<microseconds> (<delta>)",
+ * where delta is the number of microseconds since the previous call to
+ * this function. The text is written to ptr, or to an internal static
+ * buffer when ptr is NULL. Returns the buffer that was written.
+ */
+xtPublic char *xt_trace_clock_diff(char *ptr)
+{
+	static xtWord8	previous = 0;
+	static char		fallback[50];
+	xtWord8			elapsed = xt_trace_clock();
+	int				secs = (int) (elapsed / (xtWord8) 1000000);
+	int				usecs = (int) (elapsed % (xtWord8) 1000000);
+	int				delta = (int) (elapsed - previous);
+	char			*out = ptr ? ptr : fallback;
+
+	sprintf(out, "%d.%06d (%d)", secs, usecs, delta);
+	previous = elapsed;
+	return out;
+}
+
+/*
+ * Format the current trace clock as "<seconds>.<microseconds> (<delta>)",
+ * where delta is the number of microseconds since start_time.
+ * Unlike the one-argument overload there is no fallback buffer, so ptr
+ * must point to writable storage. Returns ptr.
+ */
+xtPublic char *xt_trace_clock_diff(char *ptr, xtWord8 start_time)
+{
+	xtWord8	elapsed = xt_trace_clock();
+	int		secs = (int) (elapsed / (xtWord8) 1000000);
+	int		usecs = (int) (elapsed % (xtWord8) 1000000);
+	int		delta = (int) (elapsed - start_time);
+
+	sprintf(ptr, "%d.%06d (%d)", secs, usecs, delta);
+	return ptr;
+}
+
+
+/*
+ * Select whether xt_ftracef() flushes the dump file after every
+ * message (on != FALSE) or leaves flushing to stdio.
+ */
+xtPublic void xt_set_fflush(xtBool on)
+{
+	trace_flush_dump = on;
+}
+
+/*
+ * Write a printf-style formatted message to the trace dump file.
+ * The file is created on first use as "pbxt-dump-<n>.log", using the
+ * first n (counting from 1) for which no file exists yet.
+ * If the file cannot be opened the message is dropped, and the open is
+ * attempted again on the next call.
+ */
+xtPublic void xt_ftracef(char *fmt, ...)
+{
+	va_list	ap;
+
+	va_start(ap, fmt);
+	xt_lock_mutex_ns(&trace_mutex);
+
+	if (!trace_dump_file) {
+		char buffer[100];
+
+		for (int i=1; ;i++) {
+			sprintf(buffer, "pbxt-dump-%d.log", i);
+			if (!xt_fs_exists(buffer)) {
+				trace_dump_file = fopen(buffer, "w");
+				break;
+			}
+		}
+	}
+
+	/* fopen() can fail (permissions, full disk, ...). Never hand a
+	 * NULL stream to vfprintf()/fflush().
+	 */
+	if (trace_dump_file) {
+		vfprintf(trace_dump_file, fmt, ap);
+		if (trace_flush_dump)
+			fflush(trace_dump_file);
+	}
+
+	xt_unlock_mutex_ns(&trace_mutex);
+	va_end(ap);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * CONNECTION TRACKING
+ */
+
+#ifdef XT_TRACK_CONNECTIONS
+XTConnInfoRec	xt_track_conn_info[XT_TRACK_MAX_CONNS];
+
+/* qsort() comparison callback: orders XTConnInfoRec entries by ascending ci_curr_xact_id. */
+static int trace_comp_conn_info(const void *a, const void *b)
+{
+	XTConnInfoPtr	lhs = (XTConnInfoPtr) a;
+	XTConnInfoPtr	rhs = (XTConnInfoPtr) b;
+
+	if (lhs->ci_curr_xact_id == rhs->ci_curr_xact_id)
+		return 0;
+	return lhs->ci_curr_xact_id > rhs->ci_curr_xact_id ? 1 : -1;
+}
+
+/*
+ * Print the connection tracking table to stdout, sorted by current
+ * transaction ID. Works on a private copy of the table. Slots with
+ * neither a current nor a previous transaction ID are not printed.
+ */
+xtPublic void xt_dump_conn_tracking(void)
+{
+	XTConnInfoRec	snapshot[XT_TRACK_MAX_CONNS];
+
+	memcpy(snapshot, xt_track_conn_info, sizeof(xt_track_conn_info));
+	qsort(snapshot, XT_TRACK_MAX_CONNS, sizeof(XTConnInfoRec), trace_comp_conn_info);
+
+	for (int i=0; i<XT_TRACK_MAX_CONNS; i++) {
+		XTConnInfoPtr	cur = &snapshot[i];
+
+		if (!cur->ci_curr_xact_id && !cur->ci_prev_xact_id)
+			continue;
+
+		printf("%3d curr=%d prev=%d prev-time=%ld\n", (int) cur->cu_t_id, (int) cur->ci_curr_xact_id, (int) cur->ci_prev_xact_id, (long) cur->ci_prev_xact_time);
+		/* The distance to the next entry in sort order (not available for the last slot). */
+		if (i+1<XT_TRACK_MAX_CONNS)
+			printf("    diff=%d\n", (int) snapshot[i+1].ci_curr_xact_id - (int) cur->ci_curr_xact_id);
+	}
+}
+
+#endif
+
+
diff --git a/storage/pbxt/src/trace_xt.h b/storage/pbxt/src/trace_xt.h
new file mode 100644
index 00000000000..34459a94dca
--- /dev/null
+++ b/storage/pbxt/src/trace_xt.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-02-07	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_trace_h__
+#define __xt_trace_h__
+
+#include "xt_defs.h"
+
+xtBool	xt_init_trace(void);
+void	xt_exit_trace(void);
+void	xt_dump_trace(void);
+void	xt_print_trace(void);
+
+void	xt_trace(const char *fmt, ...);
+void	xt_ttraceq(struct XTThread *self, char *query);
+void	xt_ttracef(struct XTThread *self, char *fmt, ...);
+xtWord8	xt_trace_clock(void);
+char	*xt_trace_clock_str(char *ptr);
+char	*xt_trace_clock_diff(char *ptr);
+char	*xt_trace_clock_diff(char *ptr, xtWord8 start_time);
+void	xt_set_fflush(xtBool on);
+void	xt_ftracef(char *fmt, ...);
+
+#define XT_DEBUG_TRACE(x)
+#define XT_DISABLED_TRACE(x)
+#ifdef DEBUG
+//#define PBXT_HANDLER_TRACE
+#endif
+
+/*
+ * -----------------------------------------------------------------------
+ * CONNECTION TRACKING
+ */
+
+#ifdef DEBUG
+#define XT_TRACK_CONNECTIONS
+#endif
+
+#ifdef XT_TRACK_CONNECTIONS
+#define XT_TRACK_MAX_CONNS		500
+
+/* One slot of the connection tracking table (xt_track_conn_info). */
+typedef struct XTConnInfo {
+	xtThreadID			cu_t_id;			/* Set to the slot's own index by xt_init_trace(). */
+	xtXactID			ci_curr_xact_id;	/* Sort key used by xt_dump_conn_tracking(); presumably the running transaction - confirm at the writers. */
+	xtWord8				ci_xact_start;		/* NOTE(review): looks like the start time of the current transaction - not read in trace_xt.cc. */
+
+	xtXactID			ci_prev_xact_id;	/* Printed as "prev" by xt_dump_conn_tracking(). */
+	xtWord8				ci_prev_xact_time;	/* Printed as "prev-time" by xt_dump_conn_tracking(). */
+} XTConnInfoRec, *XTConnInfoPtr;
+
+extern XTConnInfoRec xt_track_conn_info[XT_TRACK_MAX_CONNS];
+
+void	xt_dump_conn_tracking(void);
+
+#endif
+
+#endif
diff --git a/storage/pbxt/src/util_xt.cc b/storage/pbxt/src/util_xt.cc
new file mode 100644
index 00000000000..192c990c48b
--- /dev/null
+++ b/storage/pbxt/src/util_xt.cc
@@ -0,0 +1,431 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2004-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#include <stdio.h>
+#include <time.h>
+#include <ctype.h>
+#ifndef XT_WIN
+#include <sys/param.h>
+#endif
+
+#include "util_xt.h"
+#include "strutil_xt.h"
+#include "memory_xt.h"
+
+/*
+ * Compare two log positions, each given as (log ID, offset).
+ * The log ID is the major key and the offset the minor key.
+ * Returns -1, 0 or 1 when the first position is before, equal to or
+ * after the second.
+ */
+xtPublic int xt_comp_log_pos(xtLogID id1, off_t off1, xtLogID id2, off_t off2)
+{
+	if (id1 != id2)
+		return id1 < id2 ? -1 : 1;
+	if (off1 != off2)
+		return off1 < off2 ? -1 : 1;
+	return 0;
+}
+
+/*
+ * This function returns the current time in microseconds since
+ * 00:00:00 UTC, January 1, 1970.
+ * The value is derived from time(), so it is only accurate to the
+ * second.
+ */
+xtPublic xtWord8 xt_time_now(void)
+{
+	xtWord8 seconds = (xtWord8) time(NULL);
+
+	return seconds * 1000000;
+}
+
+/* Intentionally empty: a function with a (thread, pointer) signature that ignores both arguments and does nothing. */
+xtPublic void xt_free_nothing(struct XTThread *XT_UNUSED(thread), void *XT_UNUSED(x))
+{
+}
+
+/*
+ * A file name has the form:
+ * <text>-<number>[.<ext>]
+ * This function returns the number part as an xtWord4.
+ *
+ * The number is read from just after the last '-' in the name, or from
+ * the start of the name when it contains no '-'. The result is 0 when
+ * file_name is NULL or no digit is found at that position.
+ */
+xtPublic xtWord4 xt_file_name_to_id(char *file_name)
+{
+	u_long value = 0;
+
+	if (file_name) {
+		/* strrchr() also copes with an empty name, without ever forming a pointer before the start of the string. */
+		char	*dash = strrchr(file_name, '-');
+		char	*num = dash ? dash + 1 : file_name;
+
+		/* <ctype.h> functions require an unsigned char value. */
+		if (isdigit((unsigned char) *num))
+			sscanf(num, "%lu", &value);
+	}
+	return (xtWord4) value;
+}
+
+/*
+ * now is moving forward. then is a static time in the
+ * future. What is the time difference?
+ *
+ * These variables can overflow (wrap around), which is assumed to have
+ * happened whenever the two values are more than half the 32-bit range
+ * apart.
+ *
+ * NOTE(review): the comments here say a negative difference is returned
+ * once now has passed then, but the plain (non-wrapped) now >= then
+ * branch returns the non-negative value now - then. The sign convention
+ * is left exactly as it was; confirm against the callers before
+ * changing it.
+ */
+xtPublic int xt_time_difference(xtWord4 now, xtWord4 then)
+{
+	if (now >= then) {
+		/* now has gone past then. If the difference is
+		 * great, then we assume an overflow, and reverse!
+		 */
+		if ((now - then) > (xtWord4) 0xFFFFFFFF/2)
+			return (int) (0xFFFFFFFF - (now - then));
+
+		/* Subtract as unsigned and convert the result. The value is
+		 * at most 0x7FFFFFFF here, whereas (int) now - (int) then
+		 * could overflow a signed int.
+		 */
+		return (int) (now - then);
+	}
+	/* If now is before then, we check the difference.
+	 * If the difference is very large, then we assume
+	 * that now has gone past then, and overflowed.
+	 */
+	if ((then - now) > (xtWord4) 0xFFFFFFFF/2)
+		return - (int) (0xFFFFFFFF - (then - now));
+	return then - now;
+}
+
+/*
+ * Compute a 16-bit checksum over a sample of the buffer: starting at
+ * the last byte and stepping backwards by interval bytes. The byte at
+ * index 0 is never included.
+ */
+xtPublic xtWord2 xt_get_checksum(xtWord1 *data, size_t len, u_int interval)
+{
+	xtWord4	hash = 0;
+	size_t	pos = len ? len - 1 : 0;
+
+	while (pos > 0) {
+		xtWord4 top;
+
+		hash = (hash << 4) + data[pos];
+		top = hash & 0xF0000000;
+		if (top) {
+			/* Fold the top nibble back into the low bits and clear it. */
+			hash ^= top >> 24;
+			hash ^= top;
+		}
+		/* Stop once another step would reach or pass index 0. */
+		if (pos <= interval)
+			break;
+		pos -= interval;
+	}
+	return (xtWord2) (hash ^ (hash >> 16));
+}
+
+/*
+ * Compute an 8-bit checksum over the buffer, walking from the last
+ * byte down to index 1. The byte at index 0 is not included.
+ */
+xtPublic xtWord1 xt_get_checksum1(xtWord1 *data, size_t len)
+{
+	xtWord4 hash = 0;
+
+	for (size_t pos = len; pos > 1; ) {
+		xtWord4 top;
+
+		pos--;
+		hash = (hash << 4) + data[pos];
+		top = hash & 0xF0000000;
+		if (top)
+			hash ^= (top >> 24) ^ top;
+	}
+	/* Fold the four bytes of the hash into one. */
+	return (xtWord1) (hash ^ (hash >> 24) ^ (hash >> 16) ^ (hash >> 8));
+}
+
+/*
+ * Compute a 32-bit checksum over the buffer, walking from the last
+ * byte down to index 1. The byte at index 0 is not included.
+ */
+xtPublic xtWord4 xt_get_checksum4(xtWord1 *data, size_t len)
+{
+	xtWord4 hash = 0;
+
+	for (size_t pos = len; pos > 1; ) {
+		xtWord4 top;
+
+		pos--;
+		hash = (hash << 4) + data[pos];
+		top = hash & 0xF0000000;
+		if (top)
+			hash ^= (top >> 24) ^ top;
+	}
+	return hash;
+}
+
+/*
+ * --------------- Data Buffer ------------------
+ */
+
+/*
+ * Make sure the data buffer holds at least size bytes. The buffer only
+ * ever grows, except that size 0 frees it. Returns FAILED if the
+ * reallocation fails, otherwise OK.
+ */
+xtPublic xtBool xt_db_set_size(struct XTThread *self, XTDataBufferPtr dbuf, size_t size)
+{
+	if (size > dbuf->db_size) {
+		if (!xt_realloc(self, (void **) &dbuf->db_data, size))
+			return FAILED;
+		dbuf->db_size = size;
+		return OK;
+	}
+
+	if (size == 0) {
+		if (dbuf->db_data)
+			xt_free(self, dbuf->db_data);
+		dbuf->db_data = NULL;
+		dbuf->db_size = 0;
+	}
+	return OK;
+}
+
+/*
+ * --------------- Info Buffer ------------------
+ */
+
+/*
+ * Make sure the info buffer can hold size bytes. Requests of up to
+ * XT_IB_DEFAULT_SIZE bytes are served from the ib_data array inside
+ * the structure; larger requests go through xt_db_set_size().
+ * Returns OK, or FAILED if xt_db_set_size() fails.
+ */
+xtPublic xtBool xt_ib_alloc(struct XTThread *self, XTInfoBufferPtr ib, size_t size)
+{
+	/* ib_free is only ever set further down in this function, so when
+	 * it is clear the descriptor is reset before use.
+	 */
+	if (!ib->ib_free) {
+		ib->ib_db.db_size = 0;
+		ib->ib_db.db_data = NULL;
+	}
+	/* The current buffer is already big enough. */
+	if (size <= ib->ib_db.db_size)
+		return OK;
+
+	/* Small request: point the descriptor at the in-structure array. */
+	if (size <= XT_IB_DEFAULT_SIZE) {
+		ib->ib_db.db_size = XT_IB_DEFAULT_SIZE;
+		ib->ib_db.db_data = ib->ib_data;
+		return OK;
+	}
+
+	/* Never hand the in-structure array to xt_db_set_size().
+	 * NOTE(review): db_data was either reset to NULL above or set by
+	 * xt_db_set_size(), so this check looks unreachable - confirm.
+	 */
+	if (ib->ib_db.db_data == ib->ib_data) {
+		ib->ib_db.db_size = 0;
+		ib->ib_db.db_data = NULL;
+	}
+
+	/* Set before allocating, so it stays TRUE even if the allocation fails. */
+	ib->ib_free = TRUE;
+	return xt_db_set_size(self, &ib->ib_db, size);
+}
+
+/*
+ * Release the storage of an info buffer. Only acts when ib_free is set
+ * (see xt_ib_alloc()): the descriptor is freed via xt_db_set_size(0)
+ * and ib_free is cleared.
+ */
+void xt_ib_free(struct XTThread *self, XTInfoBufferPtr ib)
+{
+	if (ib->ib_free) {
+		xt_db_set_size(self, &ib->ib_db, 0);
+		ib->ib_free = FALSE;
+	}
+}
+
+/*
+ * --------------- Basic List ------------------
+ */
+
+/*
+ * Make sure the list has room for at least size items. The list only
+ * ever grows, except that size 0 frees the storage and empties the
+ * list. Returns FAILED if the reallocation fails, otherwise OK.
+ */
+xtPublic xtBool xt_bl_set_size(struct XTThread *self, XTBasicListPtr bl, size_t size)
+{
+	if (size > bl->bl_size) {
+		if (!xt_realloc(self, (void **) &bl->bl_data, size * bl->bl_item_size))
+			return FAILED;
+		bl->bl_size = size;
+		return OK;
+	}
+
+	if (size == 0) {
+		if (bl->bl_data)
+			xt_free(self, bl->bl_data);
+		bl->bl_data = NULL;
+		bl->bl_size = 0;
+		bl->bl_count = 0;
+	}
+	return OK;
+}
+
+/*
+ * Initialise to_bl as a copy of from_bl. The previous content of to_bl
+ * is overwritten, not freed. Returns FAILED if the allocation fails, in
+ * which case to_bl is left as a valid empty list; otherwise OK.
+ */
+xtPublic xtBool xt_bl_dup(struct XTThread *self, XTBasicListPtr from_bl, XTBasicListPtr to_bl)
+{
+	to_bl->bl_item_size = from_bl->bl_item_size;
+	to_bl->bl_size = 0;
+	to_bl->bl_count = 0;
+	to_bl->bl_data = NULL;
+
+	/* Nothing to copy. This also avoids calling memcpy() with NULL pointers. */
+	if (!from_bl->bl_count)
+		return OK;
+
+	if (!xt_bl_set_size(self, to_bl, from_bl->bl_count))
+		return FAILED;
+	memcpy(to_bl->bl_data, from_bl->bl_data, (size_t) from_bl->bl_count * to_bl->bl_item_size);
+	/* The count is only set once the items are really there. */
+	to_bl->bl_count = from_bl->bl_count;
+	return OK;
+}
+
+/*
+ * Append a copy of the item at value (bl_item_size bytes) to the list,
+ * growing the list by one slot when it is full.
+ * Returns FAILED if growing fails, otherwise OK.
+ */
+xtPublic xtBool xt_bl_append(struct XTThread *self, XTBasicListPtr bl, void *value)
+{
+	xtWord1 *slot;
+
+	if (bl->bl_count == bl->bl_size && !xt_bl_set_size(self, bl, bl->bl_count+1))
+		return FAILED;
+
+	slot = &bl->bl_data[bl->bl_count * bl->bl_item_size];
+	memcpy(slot, value, bl->bl_item_size);
+	bl->bl_count++;
+	return OK;
+}
+
+/* Return a pointer to the last item of the list, or NULL when the list is empty. */
+xtPublic void *xt_bl_last_item(XTBasicListPtr bl)
+{
+	if (bl->bl_count) {
+		u_int last = bl->bl_count-1;
+
+		return &bl->bl_data[last * bl->bl_item_size];
+	}
+	return NULL;
+}
+
+/* Return a pointer to item i of the list, or NULL when i is not below the item count. */
+xtPublic void *xt_bl_item_at(XTBasicListPtr bl, u_int i)
+{
+	if (i < bl->bl_count)
+		return &bl->bl_data[i * bl->bl_item_size];
+	return NULL;
+}
+
+/* Free the storage of a basic list and reset it to empty (size 0 tells xt_bl_set_size() to release everything). */
+xtPublic void xt_bl_free(struct XTThread *self, XTBasicListPtr wl)
+{
+	xt_bl_set_size(self, wl, 0);
+}
+
+/*
+ * --------------- Basic Queue ------------------
+ */
+
+/*
+ * Make sure the queue has room for at least size items. The queue only
+ * ever grows, except that size 0 frees the storage and empties the
+ * queue. Returns FAILED if the reallocation fails, otherwise OK.
+ */
+xtPublic xtBool xt_bq_set_size(struct XTThread *self, XTBasicQueuePtr bq, size_t size)
+{
+	if (size > bq->bq_size) {
+		if (!xt_realloc(self, (void **) &bq->bq_data, size * bq->bq_item_size))
+			return FAILED;
+		bq->bq_size = size;
+		return OK;
+	}
+
+	if (size == 0) {
+		if (bq->bq_data)
+			xt_free(self, bq->bq_data);
+		bq->bq_data = NULL;
+		bq->bq_size = 0;
+		bq->bq_front = 0;
+		bq->bq_back = 0;
+	}
+	return OK;
+}
+
+/*
+ * Return a pointer to the oldest item in the queue without removing
+ * it, or NULL when the queue is empty. Use xt_bq_next() to remove it.
+ */
+xtPublic void *xt_bq_get(XTBasicQueuePtr bq)
+{
+	if (bq->bq_front != bq->bq_back)
+		return &bq->bq_data[bq->bq_back * bq->bq_item_size];
+	return NULL;
+}
+
+/*
+ * Remove the oldest item from the queue. Does nothing when the queue
+ * is empty. When the last item has been removed both indexes are reset
+ * to 0, so the storage is reused from the start.
+ */
+xtPublic void xt_bq_next(XTBasicQueuePtr bq)
+{
+	if (bq->bq_back >= bq->bq_front)
+		return;
+
+	bq->bq_back++;
+	if (bq->bq_back == bq->bq_front) {
+		bq->bq_front = 0;
+		bq->bq_back = 0;
+	}
+}
+
+/*
+ * Add a copy of the item at value (bq_item_size bytes) to the queue.
+ *
+ * When the storage is used up to its end, room is made in one of two
+ * ways: if at least bq_max_waste slots at the start have already been
+ * consumed, the live items are moved down to the start; otherwise the
+ * storage is grown by bq_item_inc slots.
+ * Returns FAILED if growing fails, otherwise OK.
+ */
+xtPublic xtBool xt_bq_add(struct XTThread *self, XTBasicQueuePtr bq, void *value)
+{
+	if (bq->bq_front == bq->bq_size) {
+		/* Compacting only helps if there are consumed slots to
+		 * reclaim. With bq_back == 0 (and bq_max_waste == 0) the
+		 * original compacted nothing and then wrote past the end.
+		 */
+		if (bq->bq_back && bq->bq_back >= bq->bq_max_waste) {
+			bq->bq_front -= bq->bq_back;
+			memmove(bq->bq_data, &bq->bq_data[bq->bq_back * bq->bq_item_size], bq->bq_front * bq->bq_item_size);
+			bq->bq_back = 0;
+		}
+		else {
+			/* Always grow by at least one slot, even if bq_item_inc was left at 0. */
+			u_int inc = bq->bq_item_inc ? bq->bq_item_inc : 1;
+
+			if (!xt_bq_set_size(self, bq, bq->bq_front+inc))
+				return FAILED;
+		}
+	}
+	memcpy(&bq->bq_data[bq->bq_front * bq->bq_item_size], value, bq->bq_item_size);
+	bq->bq_front++;
+	return OK;
+}
+
+/* Free the storage of a string buffer and reset it to empty (size 0 tells xt_sb_set_size() to release everything). */
+xtPublic void xt_sb_free(struct XTThread *self, XTStringBufferPtr dbuf)
+{
+	xt_sb_set_size(self, dbuf, 0);
+}
+
+/*
+ * Make sure the string buffer holds at least size bytes. The buffer
+ * only ever grows, except that size 0 frees it and resets the length.
+ * Returns FAILED if the reallocation fails, otherwise OK.
+ */
+xtPublic xtBool xt_sb_set_size(struct XTThread *self, XTStringBufferPtr dbuf, size_t size)
+{
+	if (size > dbuf->sb_size) {
+		if (!xt_realloc(self, (void **) &dbuf->sb_cstring, size))
+			return FAILED;
+		dbuf->sb_size = size;
+		return OK;
+	}
+
+	if (size == 0) {
+		if (dbuf->sb_cstring)
+			xt_free(self, dbuf->sb_cstring);
+		dbuf->sb_cstring = NULL;
+		dbuf->sb_size = 0;
+		dbuf->sb_len = 0;
+	}
+	return OK;
+}
+
+/*
+ * Append the first len bytes of str to the string buffer and keep the
+ * result NUL-terminated. Returns FAILED if the buffer cannot be grown,
+ * otherwise OK.
+ */
+xtPublic xtBool xt_sb_concat_len(struct XTThread *self, XTStringBufferPtr dbuf, c_char *str, size_t len)
+{
+	char *tail;
+
+	/* Room for the current text, the new bytes and the terminator. */
+	if (!xt_sb_set_size(self, dbuf, dbuf->sb_len + len + 1))
+		return FAILED;
+
+	tail = dbuf->sb_cstring + dbuf->sb_len;
+	memcpy(tail, str, len);
+	tail[len] = 0;
+	dbuf->sb_len += len;
+	return OK;
+}
+
+/*
+ * Append the NUL-terminated string str to the string buffer.
+ * str must not be NULL (it is passed to strlen()).
+ * Returns the result of xt_sb_concat_len().
+ */
+xtPublic xtBool xt_sb_concat(struct XTThread *self, XTStringBufferPtr dbuf, c_char *str)
+{
+	return xt_sb_concat_len(self, dbuf, str, strlen(str));
+}
+
+/*
+ * Append the single character ch to the string buffer and keep the
+ * result NUL-terminated. Returns FAILED if the buffer cannot be grown,
+ * otherwise OK.
+ */
+xtPublic xtBool xt_sb_concat_char(struct XTThread *self, XTStringBufferPtr dbuf, int ch)
+{
+	/* Room for the current text, one more character and the terminator. */
+	if (!xt_sb_set_size(self, dbuf, dbuf->sb_len + 1 + 1))
+		return FAILED;
+
+	dbuf->sb_cstring[dbuf->sb_len++] = (char) ch;
+	dbuf->sb_cstring[dbuf->sb_len] = 0;
+	return OK;
+}
+
+/*
+ * Append the decimal representation of the signed 64-bit value val to
+ * the string buffer. Returns the result of xt_sb_concat().
+ */
+xtPublic xtBool xt_sb_concat_int8(struct XTThread *self, XTStringBufferPtr dbuf, xtInt8 val)
+{
+	char buffer[200];
+
+	/* The space before PRId64 is required: since C++11 a string literal
+	 * directly followed by an identifier is read as a user-defined
+	 * literal suffix, and the macro is never expanded.
+	 */
+	sprintf(buffer, "%" PRId64, val);
+	return xt_sb_concat(self, dbuf, buffer);
+}
+
+/*
+ * Detach the string from the buffer and return it. The buffer is left
+ * empty (no storage, length 0). The returned pointer is no longer
+ * tracked by the buffer and may be NULL if the buffer had no storage.
+ */
+xtPublic char *xt_sb_take_cstring(XTStringBufferPtr sbuf)
+{
+	char *taken = sbuf->sb_cstring;
+
+	sbuf->sb_len = 0;
+	sbuf->sb_size = 0;
+	sbuf->sb_cstring = NULL;
+	return taken;
+}
+
+/*
+ * Append the first len_from bytes of from to the string buffer,
+ * decoding URL escapes on the way: "%XX" (two hex digits) becomes the
+ * byte with that value. A '%' that is not followed by two hex digits
+ * within the input is copied unchanged. The result is kept
+ * NUL-terminated.
+ * Returns FAILED if the buffer cannot be grown, otherwise OK.
+ */
+xtPublic xtBool xt_sb_concat_url_len(struct XTThread *self, XTStringBufferPtr dbuf, c_char *from, size_t len_from)
+{
+	/* The decoded text is never longer than the input. */
+	if (!xt_sb_set_size(self, dbuf, dbuf->sb_len + len_from + 1))
+		return FAILED;
+	while (len_from--) {
+		/* len_from now counts the bytes that follow *from. */
+		if (*from == '%' && len_from >= 2 && isxdigit((unsigned char) *(from+1)) && isxdigit((unsigned char) *(from+2))) {
+			unsigned char a = xt_hex_digit(*(from+1));
+			unsigned char b = xt_hex_digit(*(from+2));
+			dbuf->sb_cstring[dbuf->sb_len] = a << 4 | b;
+			from += 3;
+			/* The two hex digits are consumed as well. Without this
+			 * the loop ran two bytes past the input for every escape.
+			 */
+			len_from -= 2;
+		}
+		else
+			dbuf->sb_cstring[dbuf->sb_len] = *from++;
+		dbuf->sb_len++;
+	}
+	dbuf->sb_cstring[dbuf->sb_len] = 0;
+	return OK;
+}
+
+
diff --git a/storage/pbxt/src/util_xt.h b/storage/pbxt/src/util_xt.h
new file mode 100644
index 00000000000..28d47544ef9
--- /dev/null
+++ b/storage/pbxt/src/util_xt.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2004-01-03	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#ifndef __xt_xtutil_h__
+#define __xt_xtutil_h__
+
+#include <stddef.h>
+
+#include "xt_defs.h"
+
+#define XT_CHECKSUM_1(sum)		((xtWord1) ((sum) ^ ((sum) >> 24) ^ ((sum) >> 16) ^ ((sum) >> 8)))
+#define XT_CHECKSUM_2(sum)		((xtWord2) ((sum) ^ ((sum) >> 16)))
+#define XT_CHECKSUM4_8(sum)		((xtWord4) (sum) ^ (xtWord4) ((sum) >> 32))
+
+/* Compare two (log ID, offset) positions; returns -1, 0 or 1. */
+int		xt_comp_log_pos(xtLogID id1, off_t off1, xtLogID id2, off_t off2);
+/* Current time in microseconds since the epoch (one-second resolution). */
+xtWord8	xt_time_now(void);
+/* Does nothing; both arguments are ignored. */
+void	xt_free_nothing(struct XTThread *self, void *x);
+/* Extract the number from a file name of the form <text>-<number>[.<ext>]. */
+xtWord4	xt_file_name_to_id(char *file_name);
+/* Signed difference between two wrapping 32-bit times. Declared int to match the definition (it was declared xtBool). */
+int		xt_time_difference(xtWord4 now, xtWord4 then);
+/* Checksums over a byte buffer; the byte at index 0 is not included. */
+xtWord2	xt_get_checksum(xtWord1 *data, size_t len, u_int interval);
+xtWord1 xt_get_checksum1(xtWord1 *data, size_t len);
+xtWord4 xt_get_checksum4(xtWord1 *data, size_t len);
+
+/* A growable byte buffer, sized with xt_db_set_size(). */
+typedef struct XTDataBuffer {
+	size_t			db_size;	/* Size of db_data in bytes (0 when db_data is NULL). */
+	xtWord1			*db_data;	/* The storage, or NULL. */
+} XTDataBufferRec, *XTDataBufferPtr;
+
+xtBool xt_db_set_size(struct XTThread *self, XTDataBufferPtr db, size_t size);
+
+#define XT_IB_DEFAULT_SIZE			512
+
+/*
+ * A data buffer with built-in storage for small requests; see
+ * xt_ib_alloc() and xt_ib_free().
+ */
+typedef struct XTInfoBuffer {
+	xtBool			ib_free;						/* Set by xt_ib_alloc() when ib_db goes through xt_db_set_size(); cleared by xt_ib_free(). */
+	XTDataBufferRec	ib_db;							/* The buffer currently in use (may point at ib_data). */
+	xtWord1			ib_data[XT_IB_DEFAULT_SIZE];	/* Built-in storage for requests of up to XT_IB_DEFAULT_SIZE bytes. */
+} XTInfoBufferRec, *XTInfoBufferPtr;
+
+xtBool	xt_ib_alloc(struct XTThread *self, XTInfoBufferPtr ib, size_t size);
+void	xt_ib_free(struct XTThread *self, XTInfoBufferPtr ib);
+
+/* A growable array of fixed-size items, managed by the xt_bl_*() functions. */
+typedef struct XTBasicList {
+	u_int			bl_item_size;	/* Size of one item in bytes. */
+	u_int			bl_size;		/* Number of item slots allocated in bl_data. */
+	u_int			bl_count;		/* Number of items in use. */
+	xtWord1			*bl_data;		/* Item storage (bl_size * bl_item_size bytes), or NULL. */
+} XTBasicListRec, *XTBasicListPtr;
+
+xtBool	xt_bl_set_size(struct XTThread *self, XTBasicListPtr wl, size_t size);
+xtBool	xt_bl_dup(struct XTThread *self, XTBasicListPtr from_bl, XTBasicListPtr to_bl);
+xtBool	xt_bl_append(struct XTThread *self, XTBasicListPtr wl, void *value);
+void	*xt_bl_last_item(XTBasicListPtr wl);
+void	*xt_bl_item_at(XTBasicListPtr wl, u_int i);
+void	xt_bl_free(struct XTThread *self, XTBasicListPtr wl);
+
+/*
+ * A FIFO queue of fixed-size items held in one array, managed by the
+ * xt_bq_*() functions. Live items occupy slots bq_back .. bq_front-1.
+ */
+typedef struct XTBasicQueue {
+	u_int			bq_item_size;	/* Size of one item in bytes. */
+	u_int			bq_max_waste;	/* Once this many consumed slots sit at the start, a full queue is compacted instead of grown. */
+	u_int			bq_item_inc;	/* Number of slots added each time the queue is grown. */
+	u_int			bq_size;		/* Number of item slots allocated in bq_data. */
+	u_int			bq_front;		/* Slot where xt_bq_add() stores the next item. */
+	u_int			bq_back;		/* Slot of the oldest item, returned by xt_bq_get(). */
+	xtWord1			*bq_data;		/* Item storage, or NULL. */
+} XTBasicQueueRec, *XTBasicQueuePtr;
+
+xtBool	xt_bq_set_size(struct XTThread *self, XTBasicQueuePtr wq, size_t size);
+void	*xt_bq_get(XTBasicQueuePtr wq);
+void	xt_bq_next(XTBasicQueuePtr wq);
+xtBool	xt_bq_add(struct XTThread *self, XTBasicQueuePtr wl, void *value);
+
+/* A growable NUL-terminated string, managed by the xt_sb_*() functions. */
+typedef struct XTStringBuffer {
+	size_t			sb_size;		/* Number of bytes allocated for sb_cstring. */
+	size_t			sb_len;			/* Length of the string, not counting the terminating NUL. */
+	char			*sb_cstring;	/* The string storage, or NULL. */
+} XTStringBufferRec, *XTStringBufferPtr;
+
+void	xt_sb_free(struct XTThread *self, XTStringBufferPtr db);
+xtBool	xt_sb_set_size(struct XTThread *self, XTStringBufferPtr db, size_t size);
+xtBool	xt_sb_concat_len(struct XTThread *self, XTStringBufferPtr dbuf, c_char *str, size_t len);
+xtBool	xt_sb_concat(struct XTThread *self, XTStringBufferPtr dbuf, c_char *str);
+xtBool	xt_sb_concat_char(struct XTThread *self, XTStringBufferPtr dbuf, int ch);
+xtBool	xt_sb_concat_int8(struct XTThread *self, XTStringBufferPtr dbuf, xtInt8 val);
+char	*xt_sb_take_cstring(XTStringBufferPtr dbuf);
+xtBool	xt_sb_concat_url_len(struct XTThread *self, XTStringBufferPtr dbuf, c_char *str, size_t len);
+
+/*
+ * Round size up to the next multiple of align. A size that is already
+ * a multiple is returned unchanged. align must not be 0.
+ */
+static inline size_t xt_align_size(size_t size, size_t align)
+{
+	/* No 'register' here: the storage class was removed in C++17. */
+	size_t diff = size % align;
+
+	if (diff)
+		return size + align - diff;
+	return size;
+}
+
+/*
+ * Round a file offset up to the next multiple of align. An offset that
+ * is already a multiple is returned unchanged. align must not be 0.
+ */
+static inline off_t xt_align_offset(off_t size, size_t align)
+{
+	/* The whole calculation is done in off_t, so signed and unsigned
+	 * operands are not mixed. No 'register': it was removed in C++17.
+	 */
+	off_t step = (off_t) align;
+	off_t diff = size % step;
+
+	if (diff)
+		return size + step - diff;
+	return size;
+}
+
+#endif
diff --git a/storage/pbxt/src/win_inttypes.h b/storage/pbxt/src/win_inttypes.h
new file mode 100644
index 00000000000..c8561939e54
--- /dev/null
+++ b/storage/pbxt/src/win_inttypes.h
@@ -0,0 +1,259 @@
+/* Copyright (C) 1997-2001, 2004, 2007 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+/*
+ *	ISO C99: 7.8 Format conversion of integer types	<inttypes.h>
+ */
+
+/*
+ * This is a reduced version of the original Linux inttypes.h file.
+ */
+
+#ifndef _INTTYPES_H
+#define _INTTYPES_H	1
+
+/* The ISO C99 standard specifies that these macros must only be
+   defined if explicitly requested.  */
+#if !defined __cplusplus || defined __STDC_FORMAT_MACROS
+
+# if __WORDSIZE == 64
+#  define __PRI64_PREFIX	"l"
+#  define __PRIPTR_PREFIX	"l"
+# else
+#  define __PRI64_PREFIX	"ll"
+#  define __PRIPTR_PREFIX
+# endif
+
+/* Macros for printing format specifiers.  */
+
+/* Decimal notation.  */
+# define PRId8		"d"
+# define PRId16		"d"
+# define PRId32		"d"
+# define PRId64		__PRI64_PREFIX "d"
+
+# define PRIdLEAST8	"d"
+# define PRIdLEAST16	"d"
+# define PRIdLEAST32	"d"
+# define PRIdLEAST64	__PRI64_PREFIX "d"
+
+# define PRIdFAST8	"d"
+# define PRIdFAST16	__PRIPTR_PREFIX "d"
+# define PRIdFAST32	__PRIPTR_PREFIX "d"
+# define PRIdFAST64	__PRI64_PREFIX "d"
+
+
+# define PRIi8		"i"
+# define PRIi16		"i"
+# define PRIi32		"i"
+# define PRIi64		__PRI64_PREFIX "i"
+
+# define PRIiLEAST8	"i"
+# define PRIiLEAST16	"i"
+# define PRIiLEAST32	"i"
+# define PRIiLEAST64	__PRI64_PREFIX "i"
+
+# define PRIiFAST8	"i"
+# define PRIiFAST16	__PRIPTR_PREFIX "i"
+# define PRIiFAST32	__PRIPTR_PREFIX "i"
+# define PRIiFAST64	__PRI64_PREFIX "i"
+
+/* Octal notation.  */
+# define PRIo8		"o"
+# define PRIo16		"o"
+# define PRIo32		"o"
+# define PRIo64		__PRI64_PREFIX "o"
+
+# define PRIoLEAST8	"o"
+# define PRIoLEAST16	"o"
+# define PRIoLEAST32	"o"
+# define PRIoLEAST64	__PRI64_PREFIX "o"
+
+# define PRIoFAST8	"o"
+# define PRIoFAST16	__PRIPTR_PREFIX "o"
+# define PRIoFAST32	__PRIPTR_PREFIX "o"
+# define PRIoFAST64	__PRI64_PREFIX "o"
+
+/* Unsigned integers.  */
+# define PRIu8		"u"
+# define PRIu16		"u"
+# define PRIu32		"u"
+# define PRIu64		__PRI64_PREFIX "u"
+
+# define PRIuLEAST8	"u"
+# define PRIuLEAST16	"u"
+# define PRIuLEAST32	"u"
+# define PRIuLEAST64	__PRI64_PREFIX "u"
+
+# define PRIuFAST8	"u"
+# define PRIuFAST16	__PRIPTR_PREFIX "u"
+# define PRIuFAST32	__PRIPTR_PREFIX "u"
+# define PRIuFAST64	__PRI64_PREFIX "u"
+
+/* lowercase hexadecimal notation.  */
+# define PRIx8		"x"
+# define PRIx16		"x"
+# define PRIx32		"x"
+# define PRIx64		__PRI64_PREFIX "x"
+
+# define PRIxLEAST8	"x"
+# define PRIxLEAST16	"x"
+# define PRIxLEAST32	"x"
+# define PRIxLEAST64	__PRI64_PREFIX "x"
+
+# define PRIxFAST8	"x"
+# define PRIxFAST16	__PRIPTR_PREFIX "x"
+# define PRIxFAST32	__PRIPTR_PREFIX "x"
+# define PRIxFAST64	__PRI64_PREFIX "x"
+
+/* UPPERCASE hexadecimal notation.  */
+# define PRIX8		"X"
+# define PRIX16		"X"
+# define PRIX32		"X"
+# define PRIX64		__PRI64_PREFIX "X"
+
+# define PRIXLEAST8	"X"
+# define PRIXLEAST16	"X"
+# define PRIXLEAST32	"X"
+# define PRIXLEAST64	__PRI64_PREFIX "X"
+
+# define PRIXFAST8	"X"
+# define PRIXFAST16	__PRIPTR_PREFIX "X"
+# define PRIXFAST32	__PRIPTR_PREFIX "X"
+# define PRIXFAST64	__PRI64_PREFIX "X"
+
+
+/* Macros for printing `intmax_t' and `uintmax_t'.  */
+# define PRIdMAX	__PRI64_PREFIX "d"
+# define PRIiMAX	__PRI64_PREFIX "i"
+# define PRIoMAX	__PRI64_PREFIX "o"
+# define PRIuMAX	__PRI64_PREFIX "u"
+# define PRIxMAX	__PRI64_PREFIX "x"
+# define PRIXMAX	__PRI64_PREFIX "X"
+
+
+/* Macros for printing `intptr_t' and `uintptr_t'.  */
+# define PRIdPTR	__PRIPTR_PREFIX "d"
+# define PRIiPTR	__PRIPTR_PREFIX "i"
+# define PRIoPTR	__PRIPTR_PREFIX "o"
+# define PRIuPTR	__PRIPTR_PREFIX "u"
+# define PRIxPTR	__PRIPTR_PREFIX "x"
+# define PRIXPTR	__PRIPTR_PREFIX "X"
+
+
+/* Macros for scanning format specifiers.  */
+
+/* Signed decimal notation.  */
+# define SCNd8		"hhd"
+# define SCNd16		"hd"
+# define SCNd32		"d"
+# define SCNd64		__PRI64_PREFIX "d"
+
+# define SCNdLEAST8	"hhd"
+# define SCNdLEAST16	"hd"
+# define SCNdLEAST32	"d"
+# define SCNdLEAST64	__PRI64_PREFIX "d"
+
+# define SCNdFAST8	"hhd"
+# define SCNdFAST16	__PRIPTR_PREFIX "d"
+# define SCNdFAST32	__PRIPTR_PREFIX "d"
+# define SCNdFAST64	__PRI64_PREFIX "d"
+
+/* Signed decimal notation.  */
+# define SCNi8		"hhi"
+# define SCNi16		"hi"
+# define SCNi32		"i"
+# define SCNi64		__PRI64_PREFIX "i"
+
+# define SCNiLEAST8	"hhi"
+# define SCNiLEAST16	"hi"
+# define SCNiLEAST32	"i"
+# define SCNiLEAST64	__PRI64_PREFIX "i"
+
+# define SCNiFAST8	"hhi"
+# define SCNiFAST16	__PRIPTR_PREFIX "i"
+# define SCNiFAST32	__PRIPTR_PREFIX "i"
+# define SCNiFAST64	__PRI64_PREFIX "i"
+
+/* Unsigned decimal notation.  */
+# define SCNu8		"hhu"
+# define SCNu16		"hu"
+# define SCNu32		"u"
+# define SCNu64		__PRI64_PREFIX "u"
+
+# define SCNuLEAST8	"hhu"
+# define SCNuLEAST16	"hu"
+# define SCNuLEAST32	"u"
+# define SCNuLEAST64	__PRI64_PREFIX "u"
+
+# define SCNuFAST8	"hhu"
+# define SCNuFAST16	__PRIPTR_PREFIX "u"
+# define SCNuFAST32	__PRIPTR_PREFIX "u"
+# define SCNuFAST64	__PRI64_PREFIX "u"
+
+/* Octal notation.  */
+# define SCNo8		"hho"
+# define SCNo16		"ho"
+# define SCNo32		"o"
+# define SCNo64		__PRI64_PREFIX "o"
+
+# define SCNoLEAST8	"hho"
+# define SCNoLEAST16	"ho"
+# define SCNoLEAST32	"o"
+# define SCNoLEAST64	__PRI64_PREFIX "o"
+
+# define SCNoFAST8	"hho"
+# define SCNoFAST16	__PRIPTR_PREFIX "o"
+# define SCNoFAST32	__PRIPTR_PREFIX "o"
+# define SCNoFAST64	__PRI64_PREFIX "o"
+
+/* Hexadecimal notation.  */
+# define SCNx8		"hhx"
+# define SCNx16		"hx"
+# define SCNx32		"x"
+# define SCNx64		__PRI64_PREFIX "x"
+
+# define SCNxLEAST8	"hhx"
+# define SCNxLEAST16	"hx"
+# define SCNxLEAST32	"x"
+# define SCNxLEAST64	__PRI64_PREFIX "x"
+
+# define SCNxFAST8	"hhx"
+# define SCNxFAST16	__PRIPTR_PREFIX "x"
+# define SCNxFAST32	__PRIPTR_PREFIX "x"
+# define SCNxFAST64	__PRI64_PREFIX "x"
+
+
+/* Macros for scanning `intmax_t' and `uintmax_t'.  */
+# define SCNdMAX	__PRI64_PREFIX "d"
+# define SCNiMAX	__PRI64_PREFIX "i"
+# define SCNoMAX	__PRI64_PREFIX "o"
+# define SCNuMAX	__PRI64_PREFIX "u"
+# define SCNxMAX	__PRI64_PREFIX "x"
+
+/* Macros for scanning `intptr_t' and `uintptr_t'.  */
+# define SCNdPTR	__PRIPTR_PREFIX "d"
+# define SCNiPTR	__PRIPTR_PREFIX "i"
+# define SCNoPTR	__PRIPTR_PREFIX "o"
+# define SCNuPTR	__PRIPTR_PREFIX "u"
+# define SCNxPTR	__PRIPTR_PREFIX "x"
+
+#endif	/* C++ && format macros */
+
+
+#endif /* inttypes.h */
diff --git a/storage/pbxt/src/xaction_xt.cc b/storage/pbxt/src/xaction_xt.cc
new file mode 100644
index 00000000000..48abc5d2b66
--- /dev/null
+++ b/storage/pbxt/src/xaction_xt.cc
@@ -0,0 +1,2904 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-04-10	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <time.h>
+#include <signal.h>
+
+#include "xaction_xt.h"
+#include "database_xt.h"
+#include "strutil_xt.h"
+#include "heap_xt.h"
+#include "trace_xt.h"
+#include "myxt_xt.h"
+#include "tabcache_xt.h"
+
+#ifdef DEBUG
+//#define TRACE_WAIT_FOR
+//#define TRACE_VARIATIONS
+//#define TRACE_SWEEPER_ACTIVITY
+
+/* Enable to trace the statements executed by the engine: */
+//#define TRACE_STATEMENTS
+#endif
+
+#if defined(TRACE_STATEMENTS) || defined(TRACE_VARIATIONS)
+#define TRACE_TRANSACTION
+#endif
+
+static void xn_sw_wait_for_xact(XTThreadPtr self, XTDatabaseHPtr db, u_int hsecs);
+static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), int *flags, xtXactID *start, xtXactID *end, xtThreadID *thd_id);
+static xtBool xn_get_xact_pointer(XTDatabaseHPtr db, xtXactID xn_id, XTXactDataPtr *xact_ptr);
+
+/* ============================================================================================== */
+
+typedef struct XNSWRecItem {	/* A (table ID, record ID) pair. */
+	xtTableID				ri_tab_id;	/* Table ID. */
+	xtRecordID				ri_rec_id;	/* Record ID. */
+} XNSWRecItemRec, *XNSWRecItemPtr;
+
+typedef struct XNSWToFreeItem {	/* Describes one resource (data record or transaction) that is to be freed. */
+	xtTableID				ri_tab_id;			/* If non-zero, then this is the table of the data record to be freed.
+												 * If zero, then the transaction below (x.ri_xn_id) must be freed.
+												 */
+	union {
+		xtRecordID			ri_rec_id;			/* The data record to be freed (ri_tab_id is non-zero). */
+		xtXactID			ri_xn_id;			/* The transaction to be freed (ri_tab_id is zero). */
+	} x;
+	xtXactID				ri_wait_xn_id;		/* Wait for this transaction to be cleaned (or being cleaned up)
+												 * before freeing this resource. */
+} XNSWToFreeItemRec, *XNSWToFreeItemPtr;
+
+/* ----------------------------------------------------------------------
+ * TRANSACTION/THREAD WAIT LIST
+ */
+
+typedef struct XNWaitThread {	/* Wait state of one thread; xn_wait_thread_array holds one entry per thread ID. */
+	/* The wait condition of the thread. */
+	xt_mutex_type			wt_lock;
+	xt_cond_type			wt_cond;
+
+	/* The list of threads waiting for this thread. */
+	XTSpinLockRec			wt_wait_list_lock;		/* Protects the three wt_wait_list* fields below. */
+	u_int					wt_wait_list_count;		/* Number of entries of wt_wait_list in use. */
+	u_int					wt_wait_list_size;		/* Number of entries allocated for wt_wait_list. */
+	xtThreadID				*wt_wait_list;			/* IDs of the waiting threads (NULL until first needed). */
+} XNWaitThreadRec, *XNWaitThreadPtr;
+
+static XNWaitThreadPtr	xn_wait_thread_array;
+
+/*
+ * Allocate the global xn_wait_thread_array with one XNWaitThreadRec per
+ * possible thread (xt_thr_maximum_threads entries) and set up the mutex,
+ * the condition and the wait-list spinlock of every entry.
+ */
+xtPublic void xt_thread_wait_init(XTThreadPtr self)
+{
+	u_int i = 0;
+
+	xn_wait_thread_array = (XNWaitThreadPtr) xt_calloc(self, xt_thr_maximum_threads * sizeof(XNWaitThreadRec));
+	while (i < xt_thr_maximum_threads) {
+		/* Every wait list starts out empty, it is grown on demand: */
+		xn_wait_thread_array[i].wt_wait_list_size = 0;
+		xn_wait_thread_array[i].wt_wait_list_count = 0;
+		xn_wait_thread_array[i].wt_wait_list = NULL;
+
+		xt_init_mutex_with_autoname(self, &xn_wait_thread_array[i].wt_lock);
+		xt_init_cond(self, &xn_wait_thread_array[i].wt_cond);
+		xt_spinlock_init_with_autoname(self, &xn_wait_thread_array[i].wt_wait_list_lock);
+		i++;
+	}
+}
+
+/*
+ * Release everything that xt_thread_wait_init() has set up: the mutex,
+ * condition, wait list and spinlock of every entry, and finally the
+ * array itself.
+ *
+ * The global pointer is cleared afterwards, so that calling this function
+ * again (or calling it without a prior init) is a harmless no-op instead
+ * of a double free.
+ */
+xtPublic void xt_thread_wait_exit(XTThreadPtr self)
+{
+	if (!xn_wait_thread_array)
+		return;
+
+	for (u_int i=0; i<xt_thr_maximum_threads; i++) {
+		xt_free_mutex(&xn_wait_thread_array[i].wt_lock);
+		xt_free_cond(&xn_wait_thread_array[i].wt_cond);
+		if (xn_wait_thread_array[i].wt_wait_list)
+			xt_free(self, xn_wait_thread_array[i].wt_wait_list);
+		xt_spinlock_free(self, &xn_wait_thread_array[i].wt_wait_list_lock);
+	}
+	xt_free(self, xn_wait_thread_array);
+	/* Do not leave a dangling pointer behind: */
+	xn_wait_thread_array = NULL;
+}
+
+/*
+ * Put the thread 'waiting_id' on the list of threads that are waiting
+ * for the thread 'wait_for_id'. A thread is only listed once.
+ *
+ * The list is protected by the wt_wait_list_lock spinlock of the target
+ * thread and is grown by one entry whenever it is full.
+ *
+ * Returns OK if the thread is on the list when the function returns,
+ * and FAILED if the list could not be enlarged.
+ */
+static xtBool xn_wait_for_thread(xtThreadID waiting_id, xtThreadID wait_for_id)
+{
+	XNWaitThreadPtr wt;
+
+	wt = &xn_wait_thread_array[wait_for_id];
+	xt_spinlock_lock(&wt->wt_wait_list_lock);
+	if (wt->wt_wait_list_count == wt->wt_wait_list_size) {
+		if (!xt_realloc_ns((void **) &wt->wt_wait_list, (wt->wt_wait_list_size+1) * sizeof(xtThreadID))) {
+			/* We must not return with the spinlock held, or every
+			 * later access to this wait list would hang:
+			 */
+			xt_spinlock_unlock(&wt->wt_wait_list_lock);
+			return FAILED;
+		}
+		wt->wt_wait_list_size++;
+	}
+	for (u_int i=0; i<wt->wt_wait_list_count; i++) {
+		if (wt->wt_wait_list[i] == waiting_id)
+			/* Already on the list: */
+			goto done;
+	}
+	wt->wt_wait_list[wt->wt_wait_list_count] = waiting_id;
+	wt->wt_wait_list_count++;
+	done:
+	xt_spinlock_unlock(&wt->wt_wait_list_lock);
+	return OK;
+}
+
+/*
+ * Wake up the thread with the given ID: broadcast on its wait condition
+ * while holding the mutex that belongs to the condition.
+ */
+xtPublic void xt_xn_wakeup_thread(xtThreadID thd_id)
+{
+	XNWaitThreadPtr	wt = &xn_wait_thread_array[thd_id];
+
+	xt_lock_mutex_ns(&wt->wt_lock);
+	xt_broadcast_cond_ns(&wt->wt_cond);
+	xt_unlock_mutex_ns(&wt->wt_lock);
+}
+
+/*
+ * Wake up every thread whose ID is stored in the private list of
+ * 'thread' (st_thread_list), then mark that list as empty.
+ */
+xtPublic void xt_xn_wakeup_thread_list(XTThreadPtr thread)
+{
+	u_int n = 0;
+
+	while (n < thread->st_thread_list_count) {
+		/* Lock, broadcast, unlock on the target's wait condition: */
+		xt_xn_wakeup_thread(thread->st_thread_list[n]);
+		n++;
+	}
+	thread->st_thread_list_count = 0;
+}
+
+/*
+ * Wake up all threads that have registered themselves as waiting for
+ * 'thread' and empty the wait list of 'thread'.
+ *
+ * Normally the wait list is copied to the private list of the thread
+ * (st_thread_list) so that the spinlock can be released before the
+ * waiters are signalled. If the private list cannot be enlarged, the
+ * waiters are signalled directly while the spinlock is still held.
+ */
+xtPublic void xt_xn_wakeup_waiting_threads(XTThreadPtr thread)
+{
+	XNWaitThreadPtr	my_wt = &xn_wait_thread_array[thread->t_id];
+	u_int			waiters;
+
+	/* Check without the lock first, usually nobody is waiting: */
+	if (my_wt->wt_wait_list_count == 0)
+		return;
+
+	xt_spinlock_lock(&my_wt->wt_wait_list_lock);
+	waiters = my_wt->wt_wait_list_count;
+
+	if (thread->st_thread_list_size < waiters) {
+		if (!xt_realloc_ns((void **) &thread->st_thread_list, waiters * sizeof(xtThreadID))) {
+			/* No room for a private copy, signal from the shared list: */
+			for (u_int n=0; n<my_wt->wt_wait_list_count; n++)
+				xt_xn_wakeup_thread(my_wt->wt_wait_list[n]);
+			my_wt->wt_wait_list_count = 0;
+			xt_spinlock_unlock(&my_wt->wt_wait_list_lock);
+			return;
+		}
+		thread->st_thread_list_size = waiters;
+	}
+
+	/* Take over the complete list and release the lock: */
+	memcpy(thread->st_thread_list, my_wt->wt_wait_list, waiters * sizeof(xtThreadID));
+	thread->st_thread_list_count = waiters;
+	my_wt->wt_wait_list_count = 0;
+	xt_spinlock_unlock(&my_wt->wt_wait_list_lock);
+
+	xt_xn_wakeup_thread_list(thread);
+}
+
+/* ----------------------------------------------------------------------
+ * WAIT FOR TRANSACTIONS
+ */
+
+typedef struct XNWaitFor {	/* One "A waits for B" entry of the wait-for list; xn_compare_wait_for() orders the entries by wf_waiting_xn_id.
+							 * NOTE(review): the code in the "#else // XT_USE_SPINLOCK_WAIT_FOR" branch also
+							 * accesses a field wf_thread_id, which is not declared here.
+							 */
+	xtXactID				wf_waiting_xn_id;		/* The transaction of the waiting thread. */
+	xtXactID				wf_for_me_xn_id;		/* The transaction we are waiting for. */
+} XNWaitForRec, *XNWaitForPtr;
+
+/*
+ * Compare callback for the wait-for list: 'a' points to the transaction ID
+ * searched for, 'b' to a list item (XNWaitForRec). The items are ordered
+ * by the ID of the waiting transaction, using xt_xn_is_before().
+ *
+ * Returns 0 if the IDs are equal, -1 if the key is before the item,
+ * otherwise 1.
+ */
+static int xn_compare_wait_for(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtXactID	key_xn_id = *((xtXactID *) a);
+	xtXactID	item_xn_id = ((XNWaitForPtr) b)->wf_waiting_xn_id;
+
+	if (key_xn_id == item_xn_id)
+		return 0;
+	return xt_xn_is_before(key_xn_id, item_xn_id) ? -1 : 1;
+}
+
+/*
+ * Free callback with nothing to do (presumably the item-free callback of
+ * the wait-for list, the registration is not visible here).
+ */
+static void xn_free_wait_for(XTThreadPtr XT_UNUSED(self), void *XT_UNUSED(thunk), void *XT_UNUSED(item))
+{
+	/* Intentionally empty. */
+}
+
+/*
+ * A deadlock occurs when a transaction ends up waiting for itself,
+ * for example: A is waiting for B, which is waiting for A.
+ *
+ * Starting with 'for_me', this function follows the chain of entries in
+ * db->db_xn_wait_for (each entry names the transaction its owner is
+ * waiting for). If the chain leads back to 'waiting', then
+ * XT_ERR_DEADLOCK is registered and TRUE is returned. FALSE is returned
+ * if the chain ends before that.
+ */
+static xtBool xn_detect_deadlock(XTDatabaseHPtr db, xtXactID waiting, xtXactID for_me)
+{
+	XNWaitForPtr link;
+
+	while (waiting != for_me) {
+		link = (XNWaitForPtr) xt_sl_find(NULL, db->db_xn_wait_for, &for_me);
+		if (!link)
+			/* End of the chain, there is no cycle: */
+			return FALSE;
+		for_me = link->wf_for_me_xn_id;
+	}
+
+	/* The chain has led us back to the waiting transaction: */
+#ifdef TRACE_WAIT_FOR
+	for (u_int i=0; i<xt_sl_get_size(db->db_xn_wait_for); i++) {
+		link = (XNWaitForPtr) xt_sl_item_at(db->db_xn_wait_for, i);
+		xt_trace("T%lu --> T%lu\n", (u_long) link->wf_waiting_xn_id, (u_long) link->wf_for_me_xn_id);
+	}
+	xt_ttracef(xt_get_self(), "DEADLOCK\n");
+	xt_dump_trace();
+#endif
+	xt_register_xterr(XT_REG_CONTEXT, XT_ERR_DEADLOCK);
+	return TRUE;
+}
+
+#ifdef XT_USE_SPINLOCK_WAIT_FOR
+
+#if defined(XT_MAC) || defined(XT_WIN)
+#define WAIT_SPIN_COUNT			10
+#else
+#define WAIT_SPIN_COUNT			50
+#endif
+
+/* Should not be required, but we wait for a second,
+ * just in case the wakeup is missed!
+ */
+#ifdef DEBUG
+#define WAIT_FOR_XACT_TIME		30000
+#else
+#define WAIT_FOR_XACT_TIME		1000
+#endif
+
+/*
+ * Record that the transaction wf->wf_waiting_xn_id (which belongs to
+ * 'thread') is waiting for the transaction wf->wf_for_me_xn_id.
+ *
+ * While holding db->db_xn_wait_spinlock, this function first checks that
+ * the wait would not cause a deadlock, and then inserts 'wf' into
+ * db->db_xn_wait_for and increments the wait counters.
+ *
+ * Returns OK on success. Returns FAILED if a deadlock was detected or
+ * if the insert failed. In both cases the wait counters have the same
+ * value as before the call.
+ */
+static xtBool xn_add_to_wait_for(XTDatabaseHPtr db, XNWaitForPtr wf, XTThreadPtr thread)
+{
+	/* If we are waiting for a transaction to end,
+	 * put this thread on the wait list...
+	 *
+	 * As long as the temporary lock is removed
+	 * or turned into a permanent lock before
+	 * a thread waits again, all should be OK!
+	 */
+	xt_spinlock_lock(&db->db_xn_wait_spinlock);
+
+#ifdef TRACE_WAIT_FOR
+	xt_ttracef(thread, "T%lu -wait-> T%lu\n", (u_long) thread->st_xact_data->xd_start_xn_id, (u_long) wf->wf_for_me_xn_id);
+#endif
+	/* Check for a deadlock: */
+	if (xn_detect_deadlock(db, wf->wf_waiting_xn_id, wf->wf_for_me_xn_id))
+		goto failed;
+
+	/* We will wait for this transaction... */
+	db->db_xn_wait_count++;
+	if (thread->st_xact_writer)
+		db->db_xn_writer_wait_count++;
+
+	if (!xt_sl_insert(NULL, db->db_xn_wait_for, &wf->wf_waiting_xn_id, wf)) {
+		/* Undo both increments above, otherwise the writer
+		 * wait count is too high forever:
+		 */
+		db->db_xn_wait_count--;
+		if (thread->st_xact_writer)
+			db->db_xn_writer_wait_count--;
+		goto failed;
+	}
+
+	xt_spinlock_unlock(&db->db_xn_wait_spinlock);
+	return OK;
+
+	failed:
+	xt_spinlock_unlock(&db->db_xn_wait_spinlock);
+	return FAILED;
+}
+
+/*
+ * Undo a successful xn_add_to_wait_for(): while holding
+ * db->db_xn_wait_spinlock, remove the entry of wf->wf_waiting_xn_id from
+ * db->db_xn_wait_for and decrement the wait counters ('thread' decides
+ * whether the writer wait count is affected).
+ */
+inline void xn_remove_from_wait_for(XTDatabaseHPtr db, XNWaitForPtr wf, XTThreadPtr thread)
+{
+	xt_spinlock_lock(&db->db_xn_wait_spinlock);
+
+	xt_sl_delete(NULL, db->db_xn_wait_for, &wf->wf_waiting_xn_id);
+	db->db_xn_wait_count--;
+	if (thread->st_xact_writer)
+		db->db_xn_writer_wait_count--;
+
+#ifdef TRACE_WAIT_FOR
+	/* The transaction waited for is taken from 'wf', there is no
+	 * variable wait_xn_id in this function:
+	 */
+	xt_ttracef(thread, "T%lu -wait-> T%lu FAILED\n", (u_long) thread->st_xact_data->xd_start_xn_id, (u_long) wf->wf_for_me_xn_id);
+#endif
+	xt_spinlock_unlock(&db->db_xn_wait_spinlock);
+}
+
+/* Wait for a transaction to terminate and/or a lock to be granted.
+ *
+ * If lw is given, the function first waits until lw->lw_curr_lock becomes
+ * XT_NO_LOCK (the lock was granted) or XT_LOCK_ERR (error). While waiting,
+ * the transaction of this thread is registered as waiting for the
+ * transaction lw->lw_xn_id, and it is registered again whenever
+ * lw->lw_xn_id changes.
+ *
+ * If xw is given, the function then waits until the transaction
+ * xw->xw_xn_id has ended, or can no longer be found. If the lock was
+ * granted and lw->lw_row_updated is set, then xw is replaced by
+ * the transaction lw->lw_updating_xn_id.
+ *
+ * In both phases the function first spins (at most WAIT_SPIN_COUNT
+ * iterations in total), and then does timed waits (WAIT_FOR_XACT_TIME)
+ * on the condition of this thread in xn_wait_thread_array.
+ *
+ * This function returns FAILED on error: if xn_add_to_wait_for() fails
+ * (for example, due to a deadlock), or if the lock state is XT_LOCK_ERR.
+ * Otherwise OK is returned.
+ */
+xtPublic xtBool xt_xn_wait_for_xact(XTThreadPtr thread, XTXactWaitPtr xw, XTLockWaitPtr lw)
+{
+	XTDatabaseHPtr		db = thread->st_database;
+	XNWaitForRec		wf;
+	int					flags = 0;
+	xtXactID			start = 0;
+	XTXactDataPtr		wait_xact_ptr;
+	xtBool				on_wait_list = FALSE;	/* TRUE while wf is registered in db->db_xn_wait_for for the xw phase. */
+	XTXactWaitRec		xw_new;
+	u_int				loop_count = 0;			/* Spin iterations used so far, shared by both phases. */
+	XNWaitThreadPtr		my_wt;
+
+	ASSERT_NS(thread->st_xact_data);
+	thread->st_statistics.st_wait_for_xact++;
+
+	wf.wf_waiting_xn_id = thread->st_xact_data->xd_start_xn_id;
+
+	if (lw) {
+		/* If we are here, then the lw structure is on the wait
+		 * queue for the given lock.
+		 */
+		xtXactID locking_xn_id;
+		
+		wait_for_locker:
+		locking_xn_id = lw->lw_xn_id;
+		wf.wf_for_me_xn_id = lw->lw_xn_id;
+		if (!xn_add_to_wait_for(db, &wf, thread)) {
+			lw->lw_ot->ot_table->tab_locks.xt_cancel_temp_lock(lw);
+			return FAILED;
+		}
+
+		while (loop_count < WAIT_SPIN_COUNT) {
+			loop_count++;
+
+			switch (lw->lw_curr_lock) {
+				case XT_LOCK_ERR:
+					xn_remove_from_wait_for(db, &wf, thread);
+					return FAILED;
+				case XT_NO_LOCK:
+					/* Got the lock: */
+					/* Check if we must also wait for the transaction: */
+					if (lw->lw_row_updated) {
+						/* This will override the xw passed in.
+						 * The reason is, because we are actually waiting
+						 * for a lock, and the lock owner may have changed
+						 * while we were waiting for the lock.
+						 */
+						xw_new.xw_xn_id = lw->lw_updating_xn_id;
+						xw = &xw_new;
+					}
+					if (xw) {
+						/* Stay registered if we go on waiting for the same transaction: */
+						if (wf.wf_for_me_xn_id == xw->xw_xn_id)
+							on_wait_list = TRUE;
+						else
+							xn_remove_from_wait_for(db, &wf, thread);
+						goto wait_for_xact;
+					}
+					xn_remove_from_wait_for(db, &wf, thread);
+					return OK;
+				case XT_TEMP_LOCK:
+				case XT_PERM_LOCK:
+					if (locking_xn_id != lw->lw_xn_id) {
+						/* Change the transaction that we are waiting for: */
+						xn_remove_from_wait_for(db, &wf, thread);
+						goto wait_for_locker;
+					}
+					break;
+			}
+
+			xt_critical_wait();
+		}
+
+
+		/* The non-spinning version... */
+		wait_for_locker_no_spin:
+		my_wt = &xn_wait_thread_array[thread->t_id];
+		xt_lock_mutex_ns(&my_wt->wt_lock);
+
+		for (;;) {
+			switch (lw->lw_curr_lock) {
+				case XT_LOCK_ERR:
+					xt_unlock_mutex_ns(&my_wt->wt_lock);
+					xn_remove_from_wait_for(db, &wf, thread);
+					return FAILED;
+				case XT_NO_LOCK:
+					xt_unlock_mutex_ns(&my_wt->wt_lock);
+					if (lw->lw_row_updated) {
+						xw_new.xw_xn_id = lw->lw_updating_xn_id;
+						xw = &xw_new;
+					}
+					if (xw) {
+						if (wf.wf_for_me_xn_id == xw->xw_xn_id)
+							on_wait_list = TRUE;
+						else
+							xn_remove_from_wait_for(db, &wf, thread);
+						goto wait_for_xact;
+					}
+					xn_remove_from_wait_for(db, &wf, thread);
+					return OK;
+				case XT_TEMP_LOCK:
+				case XT_PERM_LOCK:
+					if (locking_xn_id != lw->lw_xn_id) {
+						/* Change the transaction that we are waiting for: */
+						xt_unlock_mutex_ns(&my_wt->wt_lock);
+						xn_remove_from_wait_for(db, &wf, thread);
+						locking_xn_id = lw->lw_xn_id;
+						wf.wf_for_me_xn_id = lw->lw_xn_id;
+						if (!xn_add_to_wait_for(db, &wf, thread)) {
+							lw->lw_ot->ot_table->tab_locks.xt_cancel_temp_lock(lw);
+							return FAILED;
+						}
+						goto wait_for_locker_no_spin;
+					}
+					break;
+			}
+
+			xt_timed_wait_cond_ns(&my_wt->wt_cond, &my_wt->wt_lock, WAIT_FOR_XACT_TIME);
+		}
+
+		/* Unreachable
+		xt_unlock_mutex_ns(&my_wt->wt_lock);
+		*/
+	}
+
+	if (xw) {
+		xtThreadID		tn_thd_id;
+
+		wait_for_xact:
+		wf.wf_for_me_xn_id = xw->xw_xn_id;
+
+		if (!xn_get_xact_pointer(db, xw->xw_xn_id, &wait_xact_ptr))
+			/* The transaction was not found... */
+			goto wait_done;
+
+		if (wait_xact_ptr) {
+			/* This is a dirty read, but it should work! */
+			flags = wait_xact_ptr->xd_flags;
+			start = wait_xact_ptr->xd_start_xn_id;
+			tn_thd_id = wait_xact_ptr->xd_thread_id;
+		}
+		else {
+			/* Found, but no pointer was returned (the structure is not
+			 * permanently allocated), so we look up the details each time:
+			 */
+			tn_thd_id = 0;
+			if (!xn_get_xact_details(db, xw->xw_xn_id, thread, &flags, &start, NULL, &tn_thd_id))
+				flags = XT_XN_XAC_ENDED | XT_XN_XAC_SWEEP;
+		}
+
+		if ((flags & XT_XN_XAC_ENDED) || start != xw->xw_xn_id)
+			/* The transaction has terminated! */
+			goto wait_done;
+
+		/* Tell the thread we are waiting for it: */
+		xn_wait_for_thread(thread->t_id, tn_thd_id);	/* NOTE(review): the result (FAILED if the list cannot be enlarged) is ignored. */
+
+		if (!on_wait_list) {
+			if (!xn_add_to_wait_for(db, &wf, thread))
+				return FAILED;
+			on_wait_list = TRUE;
+		}
+
+		/* The spinning version: */
+		while (loop_count < WAIT_SPIN_COUNT) {
+			loop_count++;
+
+			xt_critical_wait();
+
+			if (wait_xact_ptr) {
+				/* This is a dirty read, but it should work! */
+				flags = wait_xact_ptr->xd_flags;
+				start = wait_xact_ptr->xd_start_xn_id;
+			}
+			else {
+				if (!xn_get_xact_details(db, xw->xw_xn_id, thread, &flags, &start, NULL, NULL))
+					flags = XT_XN_XAC_ENDED | XT_XN_XAC_SWEEP;
+			}
+
+			if ((flags & XT_XN_XAC_ENDED) || start != xw->xw_xn_id)
+				/* The transaction has terminated! */
+				goto wait_done;
+		}
+
+		/* The non-spinning version:
+		 *
+		 * I believe I can avoid missing the wakeup signal
+		 * by locking before we check if the transaction
+		 * is still running.
+		 *
+		 * Even though db->db_xn_wait_on_cond is "dirty read".
+		 *
+		 * The reason is, before the signal is sent the 
+		 * lock is also acquired. This is not possible until
+		 * this thread is safely sleeping.
+		 */
+		my_wt = &xn_wait_thread_array[thread->t_id];
+		xt_lock_mutex_ns(&my_wt->wt_lock);
+
+		for (;;) {
+			if (wait_xact_ptr) {
+				/* This is a dirty read, but it should work! */
+				flags = wait_xact_ptr->xd_flags;
+				start = wait_xact_ptr->xd_start_xn_id;
+			}
+			else {
+				if (!xn_get_xact_details(db, xw->xw_xn_id, thread, &flags, &start, NULL, NULL))
+					flags = XT_XN_XAC_ENDED | XT_XN_XAC_SWEEP;
+			}
+
+			if ((flags & XT_XN_XAC_ENDED) || start != xw->xw_xn_id)
+				/* The transaction has terminated! */
+				break;
+
+			xt_timed_wait_cond_ns(&my_wt->wt_cond, &my_wt->wt_lock, WAIT_FOR_XACT_TIME);
+		}
+
+		xt_unlock_mutex_ns(&my_wt->wt_lock);
+
+		wait_done:
+		if (on_wait_list)
+			xn_remove_from_wait_for(db, &wf, thread);
+	}
+
+	return OK;
+}
+
+#else // XT_USE_SPINLOCK_WAIT_FOR
+/*
+ * The given thread must wait for the specified transaction to terminate. This
+ * function places the transaction of the thread on a list of waiting threads.
+ *
+ * Before waiting we make a check for deadlocks. A deadlock occurs
+ * if waiting would introduce a cycle.
+ *
+ * The complete function runs under db->db_xn_wait_lock. In each round the
+ * transaction is checked, the wait is registered in db->db_xn_wait_for, the
+ * transaction is checked once more, and then the thread does a timed wait
+ * on db->db_xn_wait_cond. If will_retry is set only one round is done.
+ * pw_func and pw_data are stored in db->db_xn_post_wait[] for the time of
+ * the wait (they are used by old_xt_xn_wakeup_transactions()).
+ *
+ * Returns FAILED if a deadlock is detected, or if the insert or the timed
+ * wait fails, otherwise OK.
+ *
+ * NOTE(review): this is the variant used when XT_USE_SPINLOCK_WAIT_FOR is
+ * not defined. It sets wf.wf_thread_id, which the XNWaitForRec declared in
+ * this file does not have, and the trace call at the "failed" label uses
+ * "self", which is not declared in this function.
+ */
+xtPublic xtBool old_xt_xn_wait_for_xact(XTThreadPtr thread, xtXactID xn_id, xtBool will_retry, XTLockWaitFuncPtr pw_func, XTLockWaitPtr pw_data)
+{
+	XTDatabaseHPtr		db = thread->st_database;
+	XNWaitForRec		wf;
+	int					flags = 0;
+	xtXactID			start = 0;
+
+	ASSERT_NS(thread->st_xact_data);
+
+	thread->st_statistics.st_wait_for_xact++;
+	wf.wf_waiting_xn_id = thread->st_xact_data->xd_start_xn_id;
+	wf.wf_for_me_xn_id = xn_id;
+	wf.wf_thread_id = thread->t_id;
+
+	xt_lock_mutex_ns(&db->db_xn_wait_lock);
+
+#ifdef TRACE_WAIT_FOR
+	xt_ttracef(thread, "T%lu -wait-> T%lu\n", (u_long) thread->st_xact_data->xd_start_xn_id, (u_long) xn_id);
+#endif
+	for (;;) {
+		/* Not found means there is nothing to wait for: */
+		if (!xn_get_xact_details(db, xn_id, thread, &flags, &start, NULL, NULL))
+			break;
+
+		/* This is a dirty read, but it should work! */
+		if ((flags & XT_XN_XAC_ENDED) || start != xn_id)
+			break;
+
+		if (xn_detect_deadlock(db, wf.wf_waiting_xn_id, wf.wf_for_me_xn_id))
+			goto failed;
+
+		/* We will wait for this transaction... */
+		db->db_xn_wait_count++;
+		if (thread->st_xact_writer)
+			db->db_xn_writer_wait_count++;
+
+		if (!xt_sl_insert(NULL, db->db_xn_wait_for, &wf.wf_waiting_xn_id, &wf)) {
+			db->db_xn_wait_count--;	/* NOTE(review): db_xn_writer_wait_count is not decremented on this path. */
+			goto failed;
+		}
+
+		/* Check again, now that the wait is registered: */
+		if (!xn_get_xact_details(db, xn_id, thread, &flags, &start, NULL, NULL)) {
+			xt_sl_delete(NULL, db->db_xn_wait_for, &wf.wf_waiting_xn_id);
+			db->db_xn_wait_count--;
+			if (thread->st_xact_writer)
+				db->db_xn_writer_wait_count--;
+			break;
+		}
+
+		if ((flags & XT_XN_XAC_ENDED) || start != xn_id) {
+			xt_sl_delete(NULL, db->db_xn_wait_for, &wf.wf_waiting_xn_id);
+			db->db_xn_wait_count--;
+			if (thread->st_xact_writer)
+				db->db_xn_writer_wait_count--;
+			break;
+		}
+
+		db->db_xn_post_wait[thread->t_id].pw_call_me = pw_func;
+		db->db_xn_post_wait[thread->t_id].pw_thread = thread;
+		db->db_xn_post_wait[thread->t_id].pw_data = pw_data;
+
+		/* Timed wait because it is possible that transaction quits before
+		 * we go to sleep.
+		 */
+		if (!xt_timed_wait_cond(NULL, &db->db_xn_wait_cond, &db->db_xn_wait_lock, 2 * 1000)) {
+			xt_sl_delete(NULL, db->db_xn_wait_for, &wf.wf_waiting_xn_id);
+			db->db_xn_wait_count--;
+			if (thread->st_xact_writer)
+				db->db_xn_writer_wait_count--;
+			goto failed;
+		}
+
+		/* The wait is over, remove the registration: */
+		db->db_xn_post_wait[thread->t_id].pw_call_me = NULL;
+		xt_sl_delete(NULL, db->db_xn_wait_for, &wf.wf_waiting_xn_id);
+		db->db_xn_wait_count--;
+		if (thread->st_xact_writer)
+			db->db_xn_writer_wait_count--;
+		
+		if (will_retry)
+			break;
+	}
+
+#ifdef TRACE_WAIT_FOR
+	xt_ttracef(thread, "T%lu -wait-> T%lu DONE\n", (u_long) thread->st_xact_data->xd_start_xn_id, (u_long) xn_id);
+#endif
+	xt_unlock_mutex_ns(&db->db_xn_wait_lock);
+	return OK;
+
+	failed:
+#ifdef TRACE_WAIT_FOR
+	xt_ttracef(self, "T%lu -wait-> T%lu FAILED\n", (u_long) self->st_xact_data->xd_start_xn_id, (u_long) xn_id);
+#endif
+	xt_unlock_mutex_ns(&db->db_xn_wait_lock);
+	return FAILED;
+}
+
+xtPublic void old_xt_xn_wakeup_transactions(XTDatabaseHPtr db, XTThreadPtr thread)
+{	/* Under db->db_xn_wait_lock: call the pending pw_call_me function of every
+	 * entry in db->db_xn_wait_for (a function that returns TRUE is cleared),
+	 * and then broadcast on db->db_xn_wait_cond. Nothing is done if the list
+	 * is empty. Only compiled if XT_USE_SPINLOCK_WAIT_FOR is not defined.
+	 */
+	u_int			len;
+	XNWaitForPtr	wf;
+
+	xt_lock_mutex_ns(&db->db_xn_wait_lock);
+	/* The idea here is to release the oldest transactions
+	 * first. Although this may not be completely fair
+	 * it has the advantage that older transactions are
+	 * encouraged to complete first.
+	 *
+	 * I have found the following problem with this test:
+	 * runTest(INCREMENT_TEST, 16, INCREMENT_TEST_UPDATE_COUNT);
+	 * with a bit of bad luck a transaction can be starved.
+	 * This results in the sweeper stalling because it is
+	 * waiting for an old transaction to quit so that
+	 * it can continue.
+	 *
+	 * Because the sweeper is waiting, the number of
+	 * versions of the record to be updated
+	 * begins to increase. In the above test over
+	 * 1600 transactions remain uncleaned.
+	 *
+	 * This means that there are 1600 versions of the
+	 * row which must be scanned to find the most
+	 * recent version.
+	 */
+	if ((len = (u_int) xt_sl_get_size(db->db_xn_wait_for))) {
+		for (u_int i=0; i<len; i++) {
+			wf = (XNWaitForPtr) xt_sl_item_at(db->db_xn_wait_for, i);
+			if (db->db_xn_post_wait[wf->wf_thread_id].pw_call_me) {
+				if (db->db_xn_post_wait[wf->wf_thread_id].pw_call_me(thread, &db->db_xn_post_wait[wf->wf_thread_id]))
+					db->db_xn_post_wait[wf->wf_thread_id].pw_call_me = NULL;
+			}
+		}
+		if (!xt_broadcast_cond_ns(&db->db_xn_wait_cond))
+			xt_log_and_clear_exception_ns();
+	}
+	ASSERT_NS(db->db_xn_wait_count == len);
+	xt_unlock_mutex_ns(&db->db_xn_wait_lock);
+}
+#endif  // XT_USE_SPINLOCK_WAIT_FOR
+
+/* ----------------------------------------------------------------------
+ * Utilities
+ */
+
+//#define HIGH_X
+#ifdef HIGH_X
+u_long tot_alloced;
+u_long high_alloced;
+u_long not_clean_max;
+u_long in_ram_max;
+#endif
+
+/*
+ * Release a transaction structure that has already been unlinked from
+ * its hash chain. A structure that lies in the pre-allocated area
+ * (db_xn_data up to db_xn_data_end) is put back on the free list of the
+ * segment, any other structure is returned to the heap.
+ */
+static void xn_free_xact(XTDatabaseHPtr db, XTXactSegPtr seg, XTXactDataPtr xact)
+{
+	xtWord1 *addr = (xtWord1 *) xact;
+
+#ifdef HIGH_X
+	tot_alloced--;
+#endif
+	/* A start ID of zero marks the structure as free: */
+	xact->xd_start_xn_id = 0;
+
+	if (addr < db->db_xn_data || addr >= db->db_xn_data_end) {
+		/* Not part of the pre-allocated area: */
+		xt_free_ns(xact);
+		return;
+	}
+
+	/* Push it onto the free list of the segment: */
+	xact->xd_next_xact = seg->xs_free_list;
+	seg->xs_free_list = xact;
+}
+
+/*
+ * GOTCHA: The value db->db_xn_curr_id may be a bit larger
+ * than the ID of the last transaction actually created,
+ * because there is a gap between the issue of a transaction
+ * ID and the creation of the memory structure.
+ * (indicated here: {GAP-INC-ADD-XACT})
+ *
+ * This function returns the actual current transaction ID.
+ * This is the highest xs_last_xn_id of all segments, in other
+ * words, the ID of the last transaction actually created
+ * in memory.
+ *
+ * This means that if you call xt_xn_get_xact() with any
+ * number less than or equal to this value, not finding
+ * the transaction means it has already ended!
+ */
+xtPublic xtXactID xt_xn_get_curr_id(XTDatabaseHPtr db)
+{
+	xtXactID	highest_xn_id;
+	int			seg_no;
+
+	/* Begin with the first segment, and compare it to all others: */
+	highest_xn_id = db->db_xn_idx[0].xs_last_xn_id;
+	for (seg_no=1; seg_no<XT_XN_NO_OF_SEGMENTS; seg_no++) {
+		if (xt_xn_is_before(highest_xn_id, db->db_xn_idx[seg_no].xs_last_xn_id))
+			highest_xn_id = db->db_xn_idx[seg_no].xs_last_xn_id;
+	}
+	return highest_xn_id;
+}
+
+/*
+ * Return the in-memory structure of the transaction 'xn_id', creating it
+ * if it does not exist yet.
+ *
+ * A new structure is taken from the free list of the segment, or from the
+ * heap if the free list is empty (in which case the sweeper is told to
+ * work faster). It is linked at the head of its hash chain, and the
+ * xs_last_xn_id of the segment is advanced if 'xn_id' is later.
+ *
+ * Unlike xn_add_new_xact(), no thread owns a transaction added here, so
+ * xd_thread_id is set to zero. Without this, the field would contain the
+ * value of a previous user of the slot, or garbage from the heap, and
+ * a thread waiting for this transaction uses the value as an index
+ * into the thread wait array.
+ *
+ * Returns NULL if no memory could be allocated.
+ */
+xtPublic XTXactDataPtr xt_xn_add_old_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
+{
+	register XTXactDataPtr	xact;
+	register XTXactSegPtr 	seg;
+	register XTXactDataPtr	*hash;
+
+	(void) thread;
+	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
+	hash = &seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
+	xact = *hash;
+	while (xact) {
+		if (xact->xd_start_xn_id == xn_id)
+			/* Already in memory, nothing is created: */
+			goto done_ok;
+		xact = xact->xd_next_xact;
+	}
+
+	if ((xact = seg->xs_free_list))
+		seg->xs_free_list = xact->xd_next_xact;
+	else {
+		/* We have used up all the free transaction slots,
+		 * the sweeper should work faster to free them
+		 * up...
+		 */
+		db->db_sw_faster |= XT_SW_NO_MORE_XACT_SLOTS;
+		if (!(xact = (XTXactDataPtr) xt_malloc_ns(sizeof(XTXactDataRec)))) {
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+			return NULL;
+		}
+	}
+
+	xact->xd_next_xact = *hash;
+	*hash = xact;
+
+	xact->xd_thread_id = 0;
+	xact->xd_start_xn_id = xn_id;
+	xact->xd_end_xn_id = 0;
+	xact->xd_end_time = 0;
+	xact->xd_begin_log = 0;
+	xact->xd_flags = 0;
+
+	/* Get the largest transaction id. */
+	if (xt_xn_is_before(seg->xs_last_xn_id, xn_id))
+		seg->xs_last_xn_id = xn_id;
+
+#ifdef HIGH_X
+	/* Only count structures that were really created: */
+	tot_alloced++;
+	if (tot_alloced > high_alloced)
+		high_alloced = tot_alloced;
+#endif
+
+	done_ok:
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+	return xact;
+}
+
+/*
+ * Create the in-memory structure of the new transaction 'xn_id', which
+ * belongs to 'thread'.
+ *
+ * The structure is taken from the free list of the segment, or from the
+ * heap if the free list is empty. It is linked at the head of its hash
+ * chain, without a check for an existing entry, and 'xn_id' becomes the
+ * xs_last_xn_id of the segment.
+ *
+ * Returns NULL if no memory could be allocated.
+ */
+static XTXactDataPtr xn_add_new_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
+{
+	register XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	register XTXactDataPtr	*bucket;
+	register XTXactDataPtr	new_xact;
+
+	(void) thread;
+	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
+	bucket = &seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
+
+	new_xact = seg->xs_free_list;
+	if (new_xact)
+		seg->xs_free_list = new_xact->xd_next_xact;
+	else {
+		/* All free transaction slots are in use, so the
+		 * sweeper should work faster to free them up,
+		 * and we fall back to the heap:
+		 */
+		db->db_sw_faster |= XT_SW_NO_MORE_XACT_SLOTS;
+		new_xact = (XTXactDataPtr) xt_malloc_ns(sizeof(XTXactDataRec));
+		if (!new_xact) {
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+			return NULL;
+		}
+	}
+
+	/* Link at the head of the hash chain: */
+	new_xact->xd_next_xact = *bucket;
+	*bucket = new_xact;
+
+	new_xact->xd_thread_id = thread->t_id;
+	new_xact->xd_start_xn_id = xn_id;
+	new_xact->xd_end_xn_id = 0;
+	new_xact->xd_end_time = 0;
+	new_xact->xd_begin_log = 0;
+	new_xact->xd_flags = 0;
+
+	seg->xs_last_xn_id = xn_id;
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+#ifdef HIGH_X
+	tot_alloced++;
+	if (tot_alloced > high_alloced)
+		high_alloced = tot_alloced;
+#endif
+	return new_xact;
+}
+
+/*
+ * Look up the transaction 'xn_id' and copy selected values out of its
+ * structure while the read lock of the segment is held. Each of
+ * 'flags', 'start', 'end' (receives xd_end_time) and 'thd_id' may be
+ * NULL if the value is not required.
+ *
+ * Returns TRUE if the transaction was found. If FALSE is returned, the
+ * output parameters have not been changed.
+ */
+static xtBool xn_get_xact_details(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), int *flags, xtXactID *start, xtWord4 *end, xtThreadID *thd_id)
+{
+	register XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	register XTXactDataPtr	xact;
+
+	XT_XACT_READ_LOCK(&seg->xs_tab_lock, thread);
+	/* Walk the hash chain, xact is NULL afterwards if there is no match: */
+	for (xact = seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE]; xact; xact = xact->xd_next_xact) {
+		if (xact->xd_start_xn_id == xn_id)
+			break;
+	}
+	if (xact) {
+		if (flags)
+			*flags = xact->xd_flags;
+		if (start)
+			*start = xact->xd_start_xn_id;
+		if (end)
+			*end = xact->xd_end_time;
+		if (thd_id)
+			*thd_id = xact->xd_thread_id;
+	}
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
+	return xact ? TRUE : FALSE;
+}
+
+/*
+ * Look up the transaction 'xn_id'. Returns TRUE if it was found.
+ *
+ * *xact_ptr is only set to the structure if the structure lies in the
+ * pre-allocated area (db_xn_data up to db_xn_data_end). In all other
+ * cases, including "found, but allocated on the heap", *xact_ptr
+ * is set to NULL.
+ */
+static xtBool xn_get_xact_pointer(XTDatabaseHPtr db, xtXactID xn_id, XTXactDataPtr *xact_ptr)
+{
+	register XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	register XTXactDataPtr	xact;
+
+	*xact_ptr = NULL;
+	XT_XACT_READ_LOCK(&seg->xs_tab_lock, thread);
+	for (xact = seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE]; xact; xact = xact->xd_next_xact) {
+		if (xact->xd_start_xn_id == xn_id)
+			break;
+	}
+	/* We only return pointers to transaction structures that are
+	 * permanently allocated!
+	 */
+	if (xact && (xtWord1 *) xact >= db->db_xn_data && (xtWord1 *) xact < db->db_xn_data_end)
+		*xact_ptr = xact;
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
+	return xact ? TRUE : FALSE;
+}
+
+/*
+ * Look up the transaction 'xn_id' and return the values of xd_begin_log
+ * and xd_begin_offset in *log_id and *log_offset.
+ *
+ * Returns TRUE if the transaction was found. If FALSE is returned, the
+ * output parameters have not been changed.
+ */
+static xtBool xn_get_xact_start(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread), xtLogID *log_id, xtLogOffset *log_offset)
+{
+	register XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	register XTXactDataPtr	xact;
+
+	XT_XACT_READ_LOCK(&seg->xs_tab_lock, thread);
+	for (xact = seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE]; xact; xact = xact->xd_next_xact) {
+		if (xact->xd_start_xn_id == xn_id)
+			break;
+	}
+	if (xact) {
+		*log_id = xact->xd_begin_log;
+		*log_offset = xact->xd_begin_offset;
+	}
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
+	return xact ? TRUE : FALSE;
+}
+
+/*
+ * Return the structure of the transaction 'xn_id', or NULL if the
+ * transaction is not in memory. The read lock of the segment is only
+ * held during the search, not when the pointer is returned.
+ *
+ * NOTE: this function may only be used by the sweeper or the recovery process.
+ */
+xtPublic XTXactDataPtr xt_xn_get_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr XT_UNUSED(thread))
+{
+	register XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	register XTXactDataPtr	xact;
+
+	XT_XACT_READ_LOCK(&seg->xs_tab_lock, thread);
+	for (xact = seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE]; xact; xact = xact->xd_next_xact) {
+		if (xact->xd_start_xn_id == xn_id)
+			break;
+	}
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, FALSE);
+	return xact;
+}
+
+/*
+ * Delete a transaction: unlink the structure of 'xn_id' from its hash
+ * chain and release it with xn_free_xact(), all under the write lock
+ * of the segment.
+ *
+ * Returns TRUE if the transaction was found.
+ */
+xtPublic xtBool xt_xn_delete_xact(XTDatabaseHPtr db, xtXactID xn_id, XTThreadPtr thread)
+{
+	XTXactSegPtr 	seg = &db->db_xn_idx[xn_id & XT_XN_SEGMENT_MASK];
+	XTXactDataPtr	*link;
+	XTXactDataPtr	xact;
+
+	(void) thread;
+	XT_XACT_WRITE_LOCK(&seg->xs_tab_lock, thread);
+
+	/* 'link' always refers to the pointer that leads to 'xact'. This is
+	 * the hash table slot itself, or the next pointer of the predecessor:
+	 */
+	link = &seg->xs_table[(xn_id >> XT_XN_SEGMENT_SHIFTS) % XT_XN_HASH_TABLE_SIZE];
+	while ((xact = *link)) {
+		if (xact->xd_start_xn_id == xn_id) {
+			*link = xact->xd_next_xact;
+			xn_free_xact(db, seg, xact);
+			XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+			return TRUE;
+		}
+		link = &xact->xd_next_xact;
+	}
+
+	XT_XACT_UNLOCK(&seg->xs_tab_lock, thread, TRUE);
+	return FALSE;
+}
+
+//#define DEBUG_RAM_LIST
+#ifdef DEBUG_RAM_LIST
+
+#define DEBUG_RAM_LIST_SIZE			80
+
+int					check_ram_init_count = 0;
+xt_rwlock_type		check_ram_lock;
+xtXactID			check_ram_trns[DEBUG_RAM_LIST_SIZE];
+int					check_ram_dummy;
+
+/* Debug only: count the users of the RAM list, the first one creates the lock. */
+static void check_ram_init(void)
+{
+	if (!check_ram_init_count)
+		xt_init_rwlock(NULL, &check_ram_lock);
+	check_ram_init_count += 1;
+}
+
+/* Debug only: the counterpart of check_ram_init(), the last user frees the lock. */
+static void check_ram_free(void)
+{
+	if (--check_ram_init_count == 0)
+		xt_free_rwlock(&check_ram_lock);
+}
+
+/*
+ * Debug only: check that no transaction in the RAM list is before
+ * db->db_xn_min_ram_id. If one is found, all listed transactions are
+ * looked up and check_ram_dummy is set (a convenient place for
+ * a breakpoint).
+ *
+ * NOTE(review): xt_xn_get_xact() is called with 2 arguments here, but the
+ * definition in this file has 3 parameters.
+ */
+static void check_ram_min_id(XTDatabaseHPtr db)
+{
+	int		idx;
+	xtBool	below_min = FALSE;
+
+	xt_slock_rwlock_ns(&check_ram_lock);
+	for (idx=0; idx<DEBUG_RAM_LIST_SIZE && !below_min; idx++) {
+		if (check_ram_trns[idx] && xt_xn_is_before(check_ram_trns[idx], db->db_xn_min_ram_id))
+			below_min = TRUE;
+	}
+	if (below_min) {
+		/* This should never happen! */
+		XTXactDataPtr x_ptr;
+
+		check_ram_dummy = 0;
+		for (idx=0; idx<DEBUG_RAM_LIST_SIZE; idx++) {
+			if (check_ram_trns[idx]) {
+				x_ptr = xt_xn_get_xact(db, check_ram_trns[idx]);
+				check_ram_dummy = 1;
+			}
+		}
+	}
+	xt_unlock_rwlock_ns(&check_ram_lock);
+}
+
+/*
+ * Debug only: store 'xn_id' in the first unused (zero) slot of the RAM
+ * list. A message is printed if all slots are in use.
+ */
+static void check_ram_add(xtXactID xn_id)
+{
+	int slot = 0;
+
+	xt_xlock_rwlock_ns(&check_ram_lock);
+	while (slot < DEBUG_RAM_LIST_SIZE && check_ram_trns[slot])
+		slot++;
+	if (slot < DEBUG_RAM_LIST_SIZE)
+		check_ram_trns[slot] = xn_id;
+	xt_unlock_rwlock_ns(&check_ram_lock);
+
+	if (slot == DEBUG_RAM_LIST_SIZE)
+		printf("DEBUG --- List too small\n");
+}
+
+/*
+ * Debug only: clear the first slot of the RAM list that contains 'xn_id'.
+ * Nothing is done if the ID is not in the list.
+ */
+static void check_ram_del(xtXactID xn_id)
+{
+	int slot = 0;
+
+	xt_xlock_rwlock_ns(&check_ram_lock);
+	while (slot < DEBUG_RAM_LIST_SIZE && check_ram_trns[slot] != xn_id)
+		slot++;
+	if (slot < DEBUG_RAM_LIST_SIZE)
+		check_ram_trns[slot] = 0;
+	xt_unlock_rwlock_ns(&check_ram_lock);
+}
+#endif
+
+/* ----------------------------------------------------------------------
+ * Init and Exit
+ */
+
+/*
+ * Initialize the transaction related parts of a database handle:
+ * locks and conditions, the pre-allocated transaction records, the XA
+ * list, the data logs and the transaction log. xt_xres_init() is then
+ * called (which, according to the comment below, also does recovery),
+ * after which the transaction ID bookkeeping is set up.
+ *
+ * Errors are reported by the called functions via 'self'.
+ */
+xtPublic void xt_xn_init_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTXactDataPtr	xact;
+	XTXactSegPtr	seg;
+
+#ifdef DEBUG_RAM_LIST
+	check_ram_init();
+#endif
+	xt_spinlock_init_with_autoname(self, &db->db_xn_id_lock);
+	xt_spinlock_init_with_autoname(self, &db->db_xn_wait_spinlock);
+	xt_init_mutex_with_autoname(self, &db->db_xn_xa_lock);
+	//xt_init_mutex_with_autoname(self, &db->db_xn_wait_lock);
+	//xt_init_cond(self, &db->db_xn_wait_cond);
+	xt_init_mutex_with_autoname(self, &db->db_sw_lock);
+	xt_init_cond(self, &db->db_sw_cond);
+	xt_init_mutex_with_autoname(self, &db->db_wr_lock);
+	xt_init_cond(self, &db->db_wr_cond);
+
+	/* Pre-allocate transaction data structures: */
+	db->db_xn_data = (xtWord1 *) xt_malloc(self, sizeof(XTXactDataRec) * XT_XN_DATA_ALLOC_COUNT * XT_XN_NO_OF_SEGMENTS);
+	db->db_xn_data_end = db->db_xn_data + sizeof(XTXactDataRec) * XT_XN_DATA_ALLOC_COUNT * XT_XN_NO_OF_SEGMENTS;
+	xact = (XTXactDataPtr) db->db_xn_data;
+	for (u_int i=0; i<XT_XN_NO_OF_SEGMENTS; i++) {
+		seg = &db->db_xn_idx[i];
+		/* The segment lock is initialized here, exactly once. */
+		XT_XACT_INIT_LOCK(self, &seg->xs_tab_lock);
+		/* Put this segment's share of the records on its free list: */
+		for (u_int j=0;  j<XT_XN_DATA_ALLOC_COUNT; j++) {
+			xact->xd_next_xact = seg->xs_free_list;
+			seg->xs_free_list = xact;
+			xact++;
+		}
+	}
+
+	/* Create a sorted list for XA transactions recovered: */
+	db->db_xn_xa_list = xt_new_sortedlist(self, sizeof(XTXactXARec), 100, 50, xt_xn_xa_compare, db, NULL, FALSE, FALSE);
+
+	/* Initialize the data logs: */
+	db->db_datalogs.dlc_init(self, db);
+
+	/* Setup the transaction log: */
+	db->db_xlog.xlog_setup(self, db, (off_t) xt_db_log_file_threshold, xt_db_transaction_buffer_size, xt_db_log_file_count);
+
+	db->db_xn_end_time = 1;
+
+	/* Initializing the restart file, also does
+	 * recovery. This returns the log position after recovery.
+	 *
+	 * This is the log position where the writer thread will
+	 * begin. The writer thread writes changes to the database that
+	 * have been flushed to the log.
+	 */
+	xt_xres_init(self, db);
+
+	/* Initialize the "last transaction in memory", by default
+	 * this is the current transaction ID, which is the ID
+	 * of the last transaction.
+	 *
+	 * NOTE: The segment locks were already initialized in the
+	 * pre-allocation loop above. They must not be initialized
+	 * a second time here.
+	 */
+	for (u_int i=0; i<XT_XN_NO_OF_SEGMENTS; i++) {
+		seg = &db->db_xn_idx[i];
+		seg->xs_last_xn_id = db->db_xn_curr_id;
+	}
+
+	/*
+	 * The next transaction to clean is the lowest transaction
+	 * in memory:
+	 */
+	db->db_xn_to_clean_id = db->db_xn_min_ram_id;
+
+	/*
+	 * No transactions are running, so the minimum transaction
+	 * ID is the next one to run:
+	 */
+	db->db_xn_min_run_id = db->db_xn_curr_id + 1;
+
+	db->db_xn_wait_for = xt_new_sortedlist(self, sizeof(XNWaitForRec), 100, 50, xn_compare_wait_for, db, xn_free_wait_for, FALSE, FALSE);
+}
+
+/*
+ * Release the transaction related resources of a database handle.
+ * The sweeper and writer are stopped, the restart file, transaction
+ * log and data logs are closed, and then all transaction records,
+ * lists, locks and XA data owned by 'db' are freed.
+ */
+xtPublic void xt_xn_exit_db(XTThreadPtr self, XTDatabaseHPtr db)
+{
+#ifdef HIGH_X
+	printf("=========> MOST TXs CURR ALLOC: %lu\n", tot_alloced);
+	printf("=========> MOST TXs HIGH ALLOC: %lu\n", high_alloced);
+	printf("=========> MAX TXs NOT CLEAN: %lu\n", not_clean_max);
+	printf("=========> MAX TXs IN RAM: %lu\n", in_ram_max);
+#endif
+	XTXactPreparePtr xap, xap_next;
+
+	xt_stop_sweeper(self, db);	// Should be done already!
+	xt_stop_writer(self, db);	// Should be done already!
+
+	xt_xres_exit(self, db);
+	db->db_xlog.xlog_exit(self);
+
+	db->db_datalogs.dlc_exit(self); 
+
+	/* Free every transaction still in a segment hash table,
+	 * then free the segment lock:
+	 */
+	for (u_int i=0; i<XT_XN_NO_OF_SEGMENTS; i++) {
+		XTXactSegPtr 	seg;
+
+		seg = &db->db_xn_idx[i];
+		for (u_int j=0; j<XT_XN_HASH_TABLE_SIZE; j++) {
+			XTXactDataPtr	xact, nxact;
+			
+			xact = seg->xs_table[j];
+			while (xact) {
+				/* Read the link before the record is freed: */
+				nxact = xact->xd_next_xact;
+				xn_free_xact(db, seg, xact);
+				xact = nxact;
+			}
+		}
+		XT_XACT_FREE_LOCK(self, &seg->xs_tab_lock);
+	}
+	if (db->db_xn_wait_for) {
+		xt_free_sortedlist(self, db->db_xn_wait_for);
+		db->db_xn_wait_for = NULL;
+	}
+	/* Release the block of pre-allocated transaction records: */
+	if (db->db_xn_data) {
+		xt_free(self, db->db_xn_data);
+		db->db_xn_data = NULL;
+		db->db_xn_data_end = NULL;
+	}
+
+	xt_free_cond(&db->db_wr_cond);
+	xt_free_mutex(&db->db_wr_lock);
+	xt_free_cond(&db->db_sw_cond);
+	xt_free_mutex(&db->db_sw_lock);
+	//xt_free_cond(&db->db_xn_wait_cond);
+	//xt_free_mutex(&db->db_xn_wait_lock);
+	xt_free_mutex(&db->db_xn_xa_lock);
+	/* Free all XA prepare records remaining in the XA hash table: */
+	for (u_int i=0; i<XT_XA_HASH_TAB_SIZE; i++) {
+		xap = db->db_xn_xa_table[i];
+		while (xap) {
+			xap_next = xap->xp_next;
+			xt_free(self, xap);
+			xap = xap_next;
+		}
+	}
+	if (db->db_xn_xa_list) {
+		xt_free_sortedlist(self, db->db_xn_xa_list);
+		db->db_xn_xa_list = NULL;
+	}
+	xt_spinlock_free(self, &db->db_xn_wait_spinlock);
+	xt_spinlock_free(self, &db->db_xn_id_lock);
+#ifdef DEBUG_RAM_LIST
+	check_ram_free();
+#endif
+}
+
+/*
+ * Prepare a thread for transactional work: set up its row lock list
+ * and its data log buffer. The size of the data log buffer depends
+ * on what the thread is used for (what_for is one of XT_FOR_*).
+ */
+xtPublic void xt_xn_init_thread(XTThreadPtr self, int what_for)
+{
+	ASSERT(self->st_database);
+
+	if (!xt_init_row_lock_list(&self->st_lock_list))
+		xt_throw(self);
+
+	switch (what_for) {
+		case XT_FOR_COMPACTOR:
+		case XT_FOR_USER:
+			/* These threads get a full size log buffer: */
+			self->st_dlog_buf.dlb_init(self->st_database, xt_db_log_buffer_size);
+			break;
+		case XT_FOR_WRITER:
+		case XT_FOR_SWEEPER:
+			/* The writer and the sweeper do not need a transaction buffer. */
+			self->st_dlog_buf.dlb_init(self->st_database, 0);
+			break;
+	}
+}
+
+/*
+ * Undo xt_xn_init_thread(): roll back a transaction that is still
+ * open, then release the data log buffer and the row lock list.
+ */
+xtPublic void xt_xn_exit_thread(XTThreadPtr self)
+{
+	if (self->st_xact_data) {
+		/* The thread is exiting with an open transaction: */
+		xt_xn_rollback(self);
+	}
+
+	self->st_dlog_buf.dlb_exit(self);
+	xt_exit_row_lock_list(&self->st_lock_list);
+}
+
+/* ----------------------------------------------------------------------
+ * Begin and End Transactions
+ */
+
+/*
+ * Start a new transaction for the given thread.
+ *
+ * A new transaction ID is allocated under db_xn_id_lock, and a
+ * transaction record is created for it and attached to the thread.
+ * The thread must not already have a transaction.
+ *
+ * Returns OK on success, FAILED if the transaction record could
+ * not be created.
+ */
+xtPublic xtBool xt_xn_begin(XTThreadPtr self)
+{
+	XTDatabaseHPtr	db = self->st_database;
+	xtXactID		xn_id;
+
+	ASSERT(!self->st_xact_data);
+
+	/* Allocate the next transaction ID: */
+	xt_spinlock_lock(&db->db_xn_id_lock);
+	xn_id = ++db->db_xn_curr_id;
+	xt_spinlock_unlock(&db->db_xn_id_lock);
+
+#ifdef HIGH_X
+	if (xt_xn_is_before(not_clean_max, xn_id - db->db_xn_to_clean_id))
+		not_clean_max = xn_id - db->db_xn_to_clean_id;
+	if (xt_xn_is_before(in_ram_max, xn_id - db->db_xn_min_ram_id))
+		in_ram_max = xn_id - db->db_xn_min_ram_id;
+#endif
+	/* {GAP-INC-ADD-XACT} This is the gap between incrementing the ID,
+	 * and creating the transaction in memory.
+	 * See xt_xn_get_curr_id().
+	 */
+
+	if (!(self->st_xact_data = xn_add_new_xact(db, xn_id, self)))
+		return FAILED;
+	/* Nothing has been written by this transaction yet: */
+	self->st_xact_writer = FALSE;
+	
+	/* All transactions that committed before, or at this time,
+	 * are visible to this one: */
+	self->st_visible_time = db->db_xn_end_time;
+
+#ifdef TRACE_TRANSACTION
+	xt_ttracef(self, "BEGIN T%lu\n", (u_long) self->st_xact_data->xd_start_xn_id);
+#endif
+#ifdef XT_TRACK_CONNECTIONS
+	xt_track_conn_info[self->t_id].ci_curr_xact_id = self->st_xact_data->xd_start_xn_id;
+	xt_track_conn_info[self->t_id].ci_xact_start = xt_trace_clock();
+#endif
+	return OK;
+}
+
+/*
+ * End the current transaction of 'thread'.
+ *
+ * 'status' is XT_LOG_ENT_COMMIT or XT_LOG_ENT_ABORT. If the transaction
+ * wrote something, the data log is flushed and an end-of-transaction
+ * record is written to the transaction log. If any step on the way
+ * fails, the transaction is ended as aborted instead, and FALSE is
+ * returned.
+ *
+ * A read-only transaction is removed from memory immediately.
+ *
+ * In all cases the locks of the thread are released, waiting threads
+ * are woken, and the thread is detached from the transaction.
+ *
+ * Returns TRUE if the transaction ended as requested (also when the
+ * thread has no transaction), FALSE if an error forced an abort.
+ */
+static xtBool xn_end_xact(XTThreadPtr thread, u_int status)
+{
+	XTXactDataPtr	xact;
+	xtBool			ok = TRUE;
+
+	ASSERT_NS(thread->st_xact_data);
+	if ((xact = thread->st_xact_data)) {
+		XTDatabaseHPtr	db = thread->st_database;
+		xtXactID		xn_id = xact->xd_start_xn_id;
+		xtBool			writer;
+		
+		if ((writer = thread->st_xact_writer)) {
+			/* The transaction wrote something: */
+			XTXactEndEntryDRec	entry;
+			xtWord4				sum;
+
+			/* The checksum does not depend on the status: */
+			sum = XT_CHECKSUM4_XACT(xn_id) ^ XT_CHECKSUM4_XACT(0);
+			entry.xe_checksum_1 = XT_CHECKSUM_1(sum);
+			XT_SET_DISK_4(entry.xe_xact_id_4, xn_id);
+			XT_SET_DISK_4(entry.xe_not_used_4, 0);
+
+#ifdef XT_IMPLEMENT_NO_ACTION
+			/* This will check any restricts that have been delayed to the end of the statement. */
+			if (thread->st_restrict_list.bl_count) {
+				if (!xt_tab_restrict_rows(&thread->st_restrict_list, thread)) {
+					ok = FALSE;
+					status = XT_LOG_ENT_ABORT;
+				}
+			}
+#endif
+
+			/* Flush the data log: */
+			if (!thread->st_dlog_buf.dlb_flush_log(TRUE, thread)) {
+				ok = FALSE;
+				status = XT_LOG_ENT_ABORT;
+			}
+
+			/* Set the status only now, because the checks above
+			 * may have changed a commit into an abort. The log
+			 * must never record a commit for a transaction that
+			 * is treated as aborted in memory.
+			 */
+			entry.xe_status_1 = status;
+
+			/* Write and flush the transaction log: */
+			if (!xt_xlog_log_data(thread, sizeof(XTXactEndEntryDRec), (XTXactLogBufferDPtr) &entry, xt_db_flush_log_at_trx_commit)) {
+				ok = FALSE;
+				status = XT_LOG_ENT_ABORT;
+				/* Make sure this is done, if we failed to log
+				 * the transaction end!
+				 */
+				if (thread->st_xact_writer) {
+					/* Adjust this in case of error, but don't forget
+					 * to lock!
+					 */
+					xt_spinlock_lock(&db->db_xlog.xl_buffer_lock);
+					db->db_xn_writer_count--;
+					thread->st_xact_writer = FALSE;
+					if (thread->st_xact_long_running) {
+						db->db_xn_long_running_count--;
+						thread->st_xact_long_running = FALSE;
+					}
+					xt_spinlock_unlock(&db->db_xlog.xl_buffer_lock);
+				}
+			}
+
+			/* Setting this flag completes the transaction,
+			 * Do this before we release the locks, because
+			 * the unlocked transactions expect the
+			 * transaction they are waiting for to be
+			 * gone!
+			 */
+			xact->xd_end_time = ++db->db_xn_end_time;
+			if (status == XT_LOG_ENT_COMMIT) {
+				thread->st_statistics.st_commits++;
+				xact->xd_flags |= (XT_XN_XAC_COMMITTED | XT_XN_XAC_ENDED);
+			}
+			else {
+				thread->st_statistics.st_rollbacks++;
+				xact->xd_flags |= XT_XN_XAC_ENDED;
+			}
+
+			/* {REMOVE-LOCKS} Drop locks if you have any: */
+			thread->st_lock_list.xt_remove_all_locks(db, thread);
+
+			/* Do this afterwards to make sure the sweeper
+			 * does not start cleaning up
+			 * before any transactions that were waiting for
+			 * this transaction have completed!
+			 */
+			xact->xd_end_xn_id = db->db_xn_curr_id;
+
+			/* Now you can sweep! */
+			xact->xd_flags |= XT_XN_XAC_SWEEP;
+		}
+		else {
+			/* Read-only transaction can be removed, immediately */
+			xact->xd_end_time = ++db->db_xn_end_time;
+			xact->xd_flags |= (XT_XN_XAC_COMMITTED | XT_XN_XAC_ENDED);
+
+			/* Drop locks if you have any: */
+			thread->st_lock_list.xt_remove_all_locks(db, thread);
+
+			xact->xd_end_xn_id = db->db_xn_curr_id;
+
+			xact->xd_flags |= XT_XN_XAC_SWEEP;
+
+			if (xt_xn_delete_xact(db, xn_id, thread)) {
+				if (db->db_xn_min_ram_id == xn_id)
+					db->db_xn_min_ram_id = xn_id+1;
+			}
+		}
+
+#ifdef TRACE_TRANSACTION
+		if (status == XT_LOG_ENT_COMMIT)
+			xt_ttracef(thread, "COMMIT T%lu\n", (u_long) xn_id);
+		else
+			xt_ttracef(thread, "ABORT T%lu\n", (u_long) xn_id);
+#endif
+
+		if (db->db_xn_min_run_id == xn_id)
+			db->db_xn_min_run_id = xn_id+1;
+
+		thread->st_xact_data = NULL;
+
+#ifdef XT_TRACK_CONNECTIONS
+		xt_track_conn_info[thread->t_id].ci_prev_xact_id = xt_track_conn_info[thread->t_id].ci_curr_xact_id;
+		xt_track_conn_info[thread->t_id].ci_prev_xact_time = xt_trace_clock() - xt_track_conn_info[thread->t_id].ci_xact_start;
+		xt_track_conn_info[thread->t_id].ci_curr_xact_id = 0;
+		xt_track_conn_info[thread->t_id].ci_xact_start = 0;
+#endif
+
+		xt_xn_wakeup_waiting_threads(thread);
+
+		/* {WAKE-SW} Waking the sweeper
+		 * is no longer unconditional.
+		 * (see all comments to {WAKE-SW})
+		 *
+		 * We now wake the sweeper if it is
+		 * supposed to work faster.
+		 *
+		 * There are now 2 cases:
+		 * - We run out of transaction slots.
+		 * - We encounter old index entries.
+		 *
+		 * The following test:
+		 * runTest(INCREMENT_TEST, 16, INCREMENT_TEST_UPDATE_COUNT);
+		 * has extreme problems with sweeping every 1/10s
+		 * because a huge number of index entries accumulate
+		 * that need to be cleaned.
+		 *
+		 * New code detects this case.
+		 */
+		if (db->db_sw_faster)
+			xt_wakeup_sweeper(db);
+
+		/* Don't get too far ahead of the sweeper! */
+		if (writer) {
+#ifdef XT_WAIT_FOR_CLEANUP
+			if (db->db_sw_faster & XT_SW_TOO_FAR_BEHIND) {
+				/* Set a maximum wait time: */
+				xtWord8		then = xt_trace_clock() + (xtWord8) 100000;
+				xtXactID	wait_xn_id;
+				
+				/* Take the oldest remembered transaction of this thread,
+				 * and remember the current one in its place:
+				 */
+				wait_xn_id = thread->st_prev_xact[thread->st_last_xact];
+				thread->st_prev_xact[thread->st_last_xact] = xn_id;
+				/* This works because XT_MAX_XACT_BEHIND == 2! */
+				ASSERT_NS((thread->st_last_xact + 1) % XT_MAX_XACT_BEHIND == (thread->st_last_xact ^ 1));
+				thread->st_last_xact ^= 1;
+
+				while (xt_xn_is_before(db->db_xn_to_clean_id, wait_xn_id) && (db->db_sw_faster & XT_SW_TOO_FAR_BEHIND)) {
+					if (xt_trace_clock() >= then)
+						break;
+					xt_critical_wait();
+				}
+			}
+#else
+			if ((db->db_sw_faster & XT_SW_TOO_FAR_BEHIND) != 0) {
+				/* Set a maximum wait time: */
+				xtWord8	then = xt_trace_clock() + (xtWord8) 20000;
+
+				for (;;) {
+					xt_critical_wait();
+					/* Stop waiting as soon as the sweeper has
+					 * caught up (the flag has been cleared):
+					 */
+					if (!(db->db_sw_faster & XT_SW_TOO_FAR_BEHIND))
+						break;
+					if (xt_trace_clock() >= then)
+						break;
+				}
+			}
+#endif
+		}
+	}
+	return ok;
+}
+
+/* Commit the current transaction of the thread (see xn_end_xact()). */
+xtPublic xtBool xt_xn_commit(XTThreadPtr thread)
+{
+	xtBool ok;
+
+	ok = xn_end_xact(thread, XT_LOG_ENT_COMMIT);
+	return ok;
+}
+
+/* Abort the current transaction of the thread (see xn_end_xact()). */
+xtPublic xtBool xt_xn_rollback(XTThreadPtr thread)
+{
+	xtBool ok;
+
+	ok = xn_end_xact(thread, XT_LOG_ENT_ABORT);
+	return ok;
+}
+
+/*
+ * Write a "new table" record carrying tab_id to the transaction log,
+ * and flush the log. Returns the result of xt_xlog_log_data().
+ */
+xtPublic xtBool xt_xn_log_tab_id(XTThreadPtr self, xtTableID tab_id)
+{
+	XTXactNewTabEntryDRec	new_tab;
+	xtBool					logged;
+
+	new_tab.xt_status_1 = XT_LOG_ENT_NEW_TAB;
+	new_tab.xt_checksum_1 = XT_CHECKSUM_1(tab_id);
+	XT_SET_DISK_4(new_tab.xt_tab_id_4, tab_id);
+
+	logged = xt_xlog_log_data(self, sizeof(XTXactNewTabEntryDRec), (XTXactLogBufferDPtr) &new_tab, XT_XLOG_WRITE_AND_FLUSH);
+	return logged;
+}
+
+/*
+ * Determine how a record written by transaction xn_id is to be
+ * treated by the transaction of the thread that owns 'ot'.
+ *
+ * Returns one of:
+ * XT_XN_MY_UPDATE		xn_id is the caller's own transaction.
+ * XT_XN_REREAD			xn_id is not in RAM, the record must be re-read.
+ * XT_XN_OTHER_UPDATE	xn_id has not ended yet.
+ * XT_XN_VISIBLE		xn_id committed, at or before the caller's visible time.
+ * XT_XN_NOT_VISIBLE	xn_id committed after the caller's visible time.
+ * XT_XN_ABORTED		xn_id ended without committing.
+ *
+ * NOTE(review): self->st_xact_data is dereferenced without a check,
+ * so the thread presumably must have a running transaction - confirm.
+ */
+xtPublic int xt_xn_status(XTOpenTablePtr ot, xtXactID xn_id, xtRecordID XT_UNUSED(rec_id))
+{
+	register XTThreadPtr	self = ot->ot_thread;
+	int						flags;
+	xtWord4					end;
+
+#ifdef DRIZZLED
+	/* Conditional waste of time!
+	 * Drizzle has strict warnings.
+	 * I know this is not necessary!
+	 */
+	flags = 0;
+	end = 0;
+#endif
+	if (xn_id == self->st_xact_data->xd_start_xn_id)
+		return XT_XN_MY_UPDATE;
+	if (xt_xn_is_before(xn_id, self->st_database->db_xn_min_ram_id) ||
+		!xn_get_xact_details(self->st_database, xn_id, ot->ot_thread, &flags, NULL, &end, NULL)) {
+		/* Not in RAM, rollback done: */
+//*DBG*/xt_dump_xlogs(self->st_database, 0);
+//*DBG*/xt_check_table(self, ot);
+//*DBG*/xt_dump_trace();
+		/* {XACT-NOT-IN-RAM}
+		 * This should never happen (CHANGED see below)!
+		 *
+		 * Because if the transaction is no longer in RAM, then it has been
+		 * cleaned up. So the record should be marked as clean, or not
+		 * exist.
+		 *
+		 * After sweeping, we wait for all transactions to quit that were
+		 * running at the time of cleanup before removing the transaction record.
+		 * (see {XACT-NOT-IN-RAM})
+		 *
+		 * If this was not the case, then we could be here because:
+		 * - The user transaction (T2) reads record x and notes that the record
+		 * has not been cleaned (CLEAN bit not set).
+		 *
+		 * - The sweeper is busy sweeping the transaction (T1) that created
+		 * record x.
+		 * The SW sets the CLEAN bit on record x, and then schedules T1 for
+		 * deletion.
+		 *
+		 * Now T1 should not be deleted before T2 quits. If it does happen
+		 * then we land up here.
+		 *
+		 * THIS CAN NOW HAPPEN!
+		 *
+		 * First of all, a MYSTERY:
+		 * This did happen, despite the description above! The reason why
+		 * is left as an exercise to the reader (really, I don't know why!)
+		 *
+		 * This has forced me to add code to handle the situation. This
+		 * is done by re-reading the record that is being checked by this
+		 * function. After re-reading, the record should either be
+		 * invalid (free) or clean (CLEAN bit set).
+		 *
+		 * If this is the case, then we will not land up here
+		 * again.
+		 *
+		 * Because we are only here because the record was valid but not
+		 * clean (you can confirm this by looking at the code that
+		 * calls this function).
+		 *
+		 * See {RETRY-READ}
+		 */
+		return XT_XN_REREAD;
+	}
+	if (!(flags & XT_XN_XAC_ENDED))
+		/* Transaction not ended, may be visible. */
+		return XT_XN_OTHER_UPDATE;
+	/* Visible if the transaction was committed: */
+	if (flags & XT_XN_XAC_COMMITTED) {
+		/* The comparison is done with xt_xn_is_before() instead of
+		 * a plain >= on the two time values:
+		 */
+		if (!xt_xn_is_before(self->st_visible_time, end))  // was self->st_visible_time >= xact->xd_end_time
+			return XT_XN_VISIBLE;
+		return XT_XN_NOT_VISIBLE;
+	}
+	return XT_XN_ABORTED;
+}
+
+/* ----------------------------------------------------------------------
+ * XA Functionality
+ */
+ 
+/*
+ * Sorted list compare function for the XA list. 'a' points to the
+ * transaction ID searched for, 'b' to a list item (XTXactXARec).
+ * Returns 0 if the IDs are equal, -1 if the key is before the item,
+ * otherwise 1.
+ */
+xtPublic int xt_xn_xa_compare(XTThreadPtr XT_UNUSED(self), register const void *XT_UNUSED(thunk), register const void *a, register const void *b)
+{
+	xtXactID	key_id = *((xtXactID *) a);
+	xtXactID	item_id = ((XTXactXAPtr) b)->xx_xact_id;
+
+	if (key_id == item_id)
+		return 0;
+	return xt_xn_is_before(key_id, item_id) ? -1 : 1;
+}
+
+/*
+ * Write a PREPARE record with the given XA data to the log for the
+ * current transaction of the thread. Nothing is written if the thread
+ * has no transaction, or the transaction has not been logged yet.
+ *
+ * Returns FAILED only if writing the record failed, else OK.
+ */
+xtPublic xtBool xt_xn_prepare(int len, xtWord1 *xa_data, XTThreadPtr thread)
+{
+	XTXactDataPtr xact;
+
+	ASSERT_NS(thread->st_xact_data);
+	xact = thread->st_xact_data;
+	if (!xact)
+		return OK;
+
+	/* Only makes sense if the transaction has already been logged: */
+	if (!(xact->xd_flags & XT_XN_XAC_LOGGED))
+		return OK;
+
+	if (!xt_xlog_modify_table(0, XT_LOG_ENT_PREPARE, xact->xd_start_xn_id, 0, 0, len, xa_data, thread))
+		return FAILED;
+	return OK;
+}
+
+/*
+ * Remember the XA data of a prepared transaction.
+ *
+ * A copy of the 'len' bytes at 'xa_data' is stored in a new prepare
+ * record. The record is added to the sorted XA list (by transaction
+ * ID) and to the XA hash table (by checksum of the XA data), under
+ * db_xn_xa_lock.
+ *
+ * Returns OK on success. Returns FAILED if memory could not be
+ * allocated or the insert into the sorted list failed; in that case
+ * nothing is stored.
+ */
+xtPublic xtBool xt_xn_store_xa_data(XTDatabaseHPtr db, xtXactID xact_id, int len, xtWord1 *xa_data, XTThreadPtr XT_UNUSED(thread))
+{
+	XTXactPreparePtr	xap;
+	u_int				idx;
+	XTXactXARec			xx;
+
+	if (!(xap = (XTXactPreparePtr) xt_malloc_ns(offsetof(XTXactPrepareRec, xp_xa_data) + len)))
+		return FAILED;
+	xap->xp_xact_id = xact_id;
+	xap->xp_hash = xt_get_checksum4(xa_data, len);
+	xap->xp_data_len = len;
+	memcpy(xap->xp_xa_data, xa_data, len);
+	xx.xx_xact_id = xact_id;
+	xx.xx_xa_ptr = xap;
+
+	idx = xap->xp_hash % XT_XA_HASH_TAB_SIZE;
+	xt_lock_mutex_ns(&db->db_xn_xa_lock);
+	if (!xt_sl_insert(NULL, db->db_xn_xa_list, &xact_id, &xx)) {
+		xt_unlock_mutex_ns(&db->db_xn_xa_lock);
+		xt_free_ns(xap);
+		/* Must return here: xap has been freed and the
+		 * lock has been released!
+		 */
+		return FAILED;
+	}
+	/* Link the record in at the head of its hash chain: */
+	xap->xp_next = db->db_xn_xa_table[idx];
+	db->db_xn_xa_table[idx] = xap;
+	xt_unlock_mutex_ns(&db->db_xn_xa_lock);
+	return OK;
+}
+
+/*
+ * Remove the XA data stored for transaction xact_id, if there is any.
+ * db_xn_xa_lock is taken here, and released again on every path
+ * (by xt_xn_delete_xa_data() if the transaction was found).
+ */
+xtPublic void xt_xn_delete_xa_data_by_xact(XTDatabaseHPtr db, xtXactID xact_id, XTThreadPtr thread)
+{
+	XTXactXAPtr xx;
+
+	xt_lock_mutex_ns(&db->db_xn_xa_lock);
+	if (!(xx = (XTXactXAPtr) xt_sl_find(NULL, db->db_xn_xa_list, &xact_id))) {
+		/* Nothing stored for this transaction, don't forget to unlock: */
+		xt_unlock_mutex_ns(&db->db_xn_xa_lock);
+		return;
+	}
+	/* TRUE: the lock is released by the callee. */
+	xt_xn_delete_xa_data(db, xx->xx_xa_ptr, TRUE, thread);
+}
+
+/*
+ * Remove a prepare record from the sorted XA list and from the XA
+ * hash table. The record is freed if it was found in the hash table.
+ * db_xn_xa_lock is not taken here; it is released before returning
+ * if 'unlock' is set.
+ */
+xtPublic void xt_xn_delete_xa_data(XTDatabaseHPtr db, XTXactPreparePtr xap, xtBool unlock, XTThreadPtr XT_UNUSED(thread))
+{
+	XTXactPreparePtr	*link;
+
+	xt_sl_delete(NULL, db->db_xn_xa_list, &xap->xp_xact_id);
+
+	/* Find the pointer that refers to xap in its hash chain: */
+	link = &db->db_xn_xa_table[xap->xp_hash % XT_XA_HASH_TAB_SIZE];
+	while (*link && *link != xap)
+		link = &(*link)->xp_next;
+
+	if (*link) {
+		/* Unlink, then free: */
+		*link = xap->xp_next;
+		xt_free_ns(xap);
+	}
+
+	if (unlock)
+		xt_unlock_mutex_ns(&db->db_xn_xa_lock);
+}
+
+/*
+ * Search the XA hash table for a prepare record whose XA data is
+ * identical to the 'len' bytes at 'xa_data'. Returns the record,
+ * or NULL if there is none.
+ *
+ * If 'lock' is set, db_xn_xa_lock is taken. It is NOT released by
+ * this function, whether a record is found or not.
+ * NOTE(review): presumably the caller is expected to release the
+ * lock in both cases - confirm against the callers.
+ */
+xtPublic XTXactPreparePtr xt_xn_find_xa_data(XTDatabaseHPtr db, int len, xtWord1 *xa_data, xtBool lock, XTThreadPtr XT_UNUSED(thread))
+{
+	xtWord4				hash;
+	XTXactPreparePtr	xap;
+
+	if (lock)
+		xt_lock_mutex_ns(&db->db_xn_xa_lock);
+
+	hash = xt_get_checksum4(xa_data, len);
+	for (xap = db->db_xn_xa_table[hash % XT_XA_HASH_TAB_SIZE]; xap; xap = xap->xp_next) {
+		/* Cheap tests first, compare the data last: */
+		if (xap->xp_hash != hash)
+			continue;
+		if (xap->xp_data_len != len)
+			continue;
+		if (memcmp(xap->xp_xa_data, xa_data, len) == 0)
+			return xap;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the next prepare record of an enumeration of the XA list,
+ * or NULL when the end of the list has been reached.
+ *
+ * db_xn_xa_lock is taken on the first call (exa_locked not set), and
+ * released when NULL is returned. The lock remains held between calls.
+ */
+xtPublic XTXactPreparePtr xt_xn_enum_xa_data(XTDatabaseHPtr db, XTXactEnumXAPtr exa)
+{
+	XTXactXAPtr xx;
+
+	if (!exa->exa_locked) {
+		xt_lock_mutex_ns(&db->db_xn_xa_lock);
+		exa->exa_locked = TRUE;
+	}
+
+	xx = (XTXactXAPtr) xt_sl_item_at(db->db_xn_xa_list, exa->exa_index);
+	if (!xx) {
+		/* End of the list, the enumeration is done: */
+		if (exa->exa_locked) {
+			exa->exa_locked = FALSE;
+			xt_unlock_mutex_ns(&db->db_xn_xa_lock);
+		}
+		return NULL;
+	}
+
+	exa->exa_index++;
+	return xx->xx_xa_ptr;
+}
+
+/* ----------------------------------------------------------------------
+ * S W E E P E R    F U N C T I O N S
+ */
+
+/*
+ * Estimate the number of log bytes between the start of the earliest
+ * transaction still to be cleaned, and the current write position of
+ * the transaction log.
+ *
+ * At most 1000 transaction IDs, beginning with db_xn_to_clean_id, are
+ * examined. Returns 0 if no start position was found. Every log
+ * file before the current one is assumed to have the threshold size.
+ */
+xtPublic xtWord8 xt_xn_bytes_to_sweep(XTDatabaseHPtr db, XTThreadPtr thread)
+{
+	xtXactID				xn_id;
+	xtXactID				curr_xn_id;
+	xtLogID					xn_log_id = 0;
+	xtLogOffset				xn_log_offset = 0;
+	xtLogID					x_log_id = 0;
+	xtLogOffset				x_log_offset = 0;
+	xtLogID					log_id;
+	xtLogOffset				log_offset;
+	xtWord8					byte_count = 0;
+
+	curr_xn_id = xt_xn_get_curr_id(db);
+	// Limit the number of transactions checked!
+	xn_id = db->db_xn_to_clean_id;
+	for (int i=0; i<1000; i++, xn_id++) {
+		if (xt_xn_is_before(curr_xn_id, xn_id))
+			break;
+		if (!xn_get_xact_start(db, xn_id, thread, &x_log_id, &x_log_offset))
+			continue;
+		/* Keep the first position found, or any earlier one: */
+		if (!xn_log_id || xt_comp_log_pos(x_log_id, x_log_offset, xn_log_id, xn_log_offset) < 0) {
+			xn_log_id = x_log_id;
+			xn_log_offset = x_log_offset;
+		}
+	}
+	if (!xn_log_id)
+		return 0;
+
+	/* Assume the logs have the threshold: */
+	log_id = db->db_xlog.xl_write_log_id;
+	log_offset = db->db_xlog.xl_write_log_offset;
+
+	/* The remainder of the log in which the position lies: */
+	if (xn_log_id < log_id) {
+		if (xn_log_offset < xt_db_log_file_threshold)
+			byte_count = (size_t) (xt_db_log_file_threshold - xn_log_offset);
+		xn_log_offset = 0;
+		xn_log_id++;
+	}
+
+	/* All complete logs in between: */
+	for (; xn_log_id < log_id; xn_log_id++)
+		byte_count += (size_t) xt_db_log_file_threshold;
+
+	/* And the part of the current log: */
+	if (xn_log_offset < log_offset)
+		byte_count += (size_t) (log_offset - xn_log_offset);
+
+	return byte_count;
+}
+
+/* ----------------------------------------------------------------------
+ * S W E E P E R    P R O C E S S
+ */
+
+/* The working state of the sweeper. */
+typedef struct XNSweeperState {
+	XTDatabaseHPtr			ss_db;				/* The database, used to open pool tables. */
+	XTXactSeqReadRec		ss_seqread;			/* NOTE(review): presumably the sequential log reader - confirm. */
+	XTDataBufferRec			ss_databuf;			/* Record buffer, sized to the MySQL buffer size of the table. */
+	u_int					ss_call_cnt;		/* Call counter, limits servicing of a full "to free" queue. */
+	XTBasicQueueRec			ss_to_free;			/* The "to free" queue (see MUST_DELAY_REMOVE). */
+	xtBool					ss_flush_pending;	/* NOTE(review): usage not visible here - confirm. */
+	XTOpenTablePtr			ss_ot;				/* The table currently open, or NULL. */
+} XNSweeperStateRec, *XNSweeperStatePtr;
+
+/*
+ * Return an open table handle for tab_id. The sweeper keeps one
+ * table open at a time: if a different table is currently open, it
+ * is returned to the pool first.
+ *
+ * Returns NULL if the table could not be opened (then *r is as set
+ * by xt_db_open_pool_table()).
+ */
+static XTOpenTablePtr xn_sw_get_open_table(XTThreadPtr self, XNSweeperStatePtr ss, xtTableID tab_id, int *r)
+{
+	XTOpenTablePtr cached = ss->ss_ot;
+
+	if (cached) {
+		/* The table required is already open: */
+		if (cached->ot_table->tab_id == tab_id)
+			return cached;
+
+		xt_db_return_table_to_pool(self, cached);
+		ss->ss_ot = NULL;
+	}
+
+	ss->ss_ot = xt_db_open_pool_table(self, ss->ss_db, tab_id, r, TRUE);
+	return ss->ss_ot;
+}
+
+/* Return the table held open by the sweeper (if any) to the pool. */
+static void xn_sw_close_open_table(XTThreadPtr self, XNSweeperStatePtr ss)
+{
+	if (!ss->ss_ot)
+		return;
+
+	xt_db_return_table_to_pool(self, ss->ss_ot);
+	ss->ss_ot = NULL;
+}
+
+/*
+ * A thread can set a bit in db_sw_faster to make
+ * the sweeper go faster.
+ */
+/*
+ * Raise the priority of the sweeper by one, if some bit is set in
+ * db_sw_faster and the sweeper is not already running fast.
+ */
+static void xn_sw_could_go_faster(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	if (db->db_sw_faster && !db->db_sw_fast) {
+		xt_set_priority(self, xt_db_sweeper_priority+1);
+		db->db_sw_fast = TRUE;
+	}
+}
+
+/*
+ * Restore the normal priority of the sweeper (if it was raised),
+ * and reset db_sw_faster to XT_SW_WORK_NORMAL.
+ */
+static void xn_sw_go_slower(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xtBool was_fast = db->db_sw_fast;
+
+	if (was_fast) {
+		xt_set_priority(self, xt_db_sweeper_priority);
+		db->db_sw_fast = FALSE;
+	}
+
+	db->db_sw_faster = XT_SW_WORK_NORMAL;
+}
+
+/* Add a record to the "to free" queue. We note the current
+ * transaction at the time this is done. The record will
+ * only be freed once this transaction has terminated, together
+ * with all transactions that started before it!
+ *
+ * The reason for this is that a sequential scan or some
+ * other operation may read a committed record which is no longer
+ * valid because it is no longer the latest variation (the first
+ * variation reachable from the row pointer).
+ *
+ * In this case, the sweeper will free the variation.
+ * If the variation is re-used and committed before
+ * the sequential scan or read completes, and by some
+ * fluke is used by the same record as previously,
+ * the system will think the record is valid
+ * again.
+ *
+ * Without re-reading the record the sequential
+ * scan or other read will find it on the variation list, and
+ * return the record data as if valid!
+ *
+ * ------------ 2008-01-03
+ *
+ * An example of this is:
+ *
+ * Assume we have 3 records.
+ * The 3rd record is deleted, and committed.
+ * Before cleanup can be performed
+ * a sequential scan takes a copy of the records.
+ *
+ * Now assume a new insert is done before
+ * the sequential scan gets to the 3rd record.
+ *
+ * The insert allocates the 3rd row and 3rd record
+ * again.
+ *
+ * Now, when the sequential scan gets to the old copy of the 3rd record,
+ * this is valid because the row points to this record again.
+ *
+ * HOWEVER! I have now changed the sequential scan so that it accesses
+ * the records from the cache, without making a copy.
+ *
+ * This means that this problem cannot occur because the sequential scan
+ * always reads the current data from the cache.
+ *
+ * There is also no race condition (although no lock is taken), because
+ * the record is written before the row (see here [(5)]).
+ *
+ * This means that the row does not point to the record before the
+ * record has been modified.
+ *
+ * Once the record has been modified then the sequential scan will see
+ * that the record belongs to a new transaction.
+ *
+ * If the row pointer was set before the record updated then a race
+ * condition would exist when the sequential scan reads the record
+ * after the insert has updated the row pointer but before it has
+ * changed the record.
+ *
+ * AS A RESULT:
+ *
+ * I believe I can remove the delayed free record!
+ *
+ * This means I can combine the REMOVE and FREE operations.
+ *
+ * This is good because this takes care of the problem
+ * that records are lost when:
+ *
+ * The server crashes when the delayed free list still has items on it.
+ * AND
+ * The transaction that freed the records has been cleaned, and this
+ * fact has been committed to the log.
+ *
+ * So I have removed the delay here: [(6)]
+ *
+ * ------------ 2008-12-03
+ *
+ * This code to delay removal of records was finally removed (see above)
+ */
+
+/*
+ * As above, but instead a transaction is added to the "to free" queue.
+ *
+ * It is important that transactions remain in memory until all
+ * currently running transactions have ended. This is because
+ * sequential and index scans have copies of old data.
+ *
+ * In the old data a record may not be indicated as cleaned. Such
+ * a record is considered invalid if the transaction is not in RAM.
+ *
+ * GOTCHA:
+ *
+ * And this problem is demonstrated by the following example
+ * which was derived from flush_table.test.
+ *
+ * Each handler command below is a separate transaction.
+ * However the buffer is loaded by 'read first'.
+ * Depending on when cleanup occurs, records can disappear
+ * in some of the next commands.
+ *
+ * 2 solutions for the test. Use begin ... commit around
+ * handler open ... close. Or use analyze table t1 before
+ * open. analyze table waits for the sweeper to complete!
+ *
+ * create table dummy(table_id char(20) primary key);
+ * let $1=100;
+ * while ($1)
+ * {
+ *   drop table if exists t1;
+ *   create table t1(table_id char(20) primary key);
+ *   insert into t1 values ('Record-01');
+ *   insert into t1 values ('Record-02');
+ *   insert into t1 values ('Record-03');
+ *   insert into t1 values ('Record-04');
+ *   insert into t1 values ('Record-05');
+ *   handler t1 open;
+ *   handler t1 read first limit 1;
+ *   handler t1 read next limit 1;
+ *   handler t1 read next limit 1;
+ *   handler t1 read next limit 1;
+ *   handler t1 close;
+ *   commit;
+ *   dec $1;
+ * }
+ * 
+ */
+#ifdef MUST_DELAY_REMOVE
+/*
+ * Add transaction xn_id to the "to free" queue of the sweeper. The
+ * current transaction ID is stored with the item as the ID to wait
+ * for. If the queue is full, an attempt is made to service it first
+ * (but not on every call).
+ */
+static void xn_sw_add_xact_to_free(XTThreadPtr self, XNSweeperStatePtr ss, xtXactID xn_id)
+{
+	XNSWToFreeItemRec to_free;
+
+	if ((ss->ss_to_free.bq_front - ss->ss_to_free.bq_back) >= XT_TN_MAX_TO_FREE) {
+		/* If the queue is full, try to free some items:
+		 * We use the call count to avoid doing this every time,
+		 * when the queue overflows!
+		 */
+		if ((ss->ss_call_cnt % XT_TN_MAX_TO_FREE_CHECK) == 0) {
+			/* GOTCHA: This call was not locking the sweeper,
+			 * this could cause failure, of course:
+			 */
+			xn_sw_service_to_free(self, ss, TRUE);
+		}
+		ss->ss_call_cnt++;
+	}
+
+	to_free.ri_wait_xn_id = ss->ss_db->db_xn_curr_id;
+	to_free.ri_tab_id = 0;
+	to_free.x.ri_xn_id = xn_id;
+
+	xt_bq_add(self, &ss->ss_to_free, &to_free);
+}
+#endif
+
+/*
+ * Remove a chain of record variations, beginning with rec_id. After
+ * each removal the chain is followed using the previous variation ID
+ * returned by xt_tab_remove_record(), until the ID is zero.
+ *
+ * An exception is thrown if a removal returns XT_ERR.
+ */
+static void xt_sw_delete_variations(XTThreadPtr self, XNSweeperStatePtr ss, XTOpenTablePtr ot, xtRecordID rec_id, xtRowID row_id, xtXactID xn_id)
+{
+	xtRecordID next_rec_id;
+
+	for (; rec_id; rec_id = next_rec_id) {
+		if (xt_tab_remove_record(ot, rec_id, ss->ss_databuf.db_data, &next_rec_id, FALSE, row_id, xn_id) == XT_ERR) {
+			throw_();
+			return;
+		}
+	}
+}
+
+/*
+ * Remove a single record variation. An exception is thrown if
+ * xt_tab_remove_record() returns XT_ERR, any other result is
+ * ignored.
+ */
+static void xt_sw_delete_variation(XTThreadPtr self, XNSweeperStatePtr ss, XTOpenTablePtr ot, xtRecordID rec_id, xtBool clean_delete, xtRowID row_id, xtXactID xn_id)
+{
+	/* Required by the call, but the value is not used here: */
+	xtRecordID ignored_prev_rec_id;
+
+	if (xt_tab_remove_record(ot, rec_id, ss->ss_databuf.db_data, &ignored_prev_rec_id, clean_delete, row_id, xn_id) == XT_ERR) {
+		throw_();
+		return;
+	}
+}
+
+/* Set rec_type to this value in order to force cleanup, without
+ * a check.
+ */
+#define XN_FORCE_CLEANUP		XT_TAB_STATUS_FREED
+
+/*
+ * Read the record to be cleaned. Return TRUE if the cleanup has already been done.
+ */
+/*
+ * Read the header of record rec_id into rec_head, and decide whether
+ * the record still needs to be cleaned.
+ *
+ * Returns TRUE if there is nothing (more) to do for the record:
+ * - rec_type is XN_FORCE_CLEANUP, and the record is free, or
+ * - the transaction ID, record type, statement ID or row ID in the
+ *   header do not match the values given.
+ * Otherwise FALSE is returned.
+ *
+ * An exception is thrown if the header cannot be read.
+ */
+static xtBool xn_sw_cleanup_done(XTThreadPtr self, XTOpenTablePtr ot, xtRecordID rec_id, xtXactID xn_id, u_int rec_type, u_int stat_id, xtRowID row_id, XTTabRecHeadDPtr rec_head)
+{
+	if (!xt_tab_get_rec_data(ot, rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) rec_head))
+		throw_();
+
+	/* Forced cleanup only checks if the record is already free: */
+	if (rec_type == XN_FORCE_CLEANUP)
+		return XT_REC_IS_FREE(rec_head->tr_rec_type_1) ? TRUE : FALSE;
+
+	/* The transaction must match. The record header must match
+	 * the expected value from the log, or the clean has been done,
+	 * or is not required (for example, it is not required if a
+	 * record has been overwritten in a transaction). The row must
+	 * match as well.
+	 */
+	if (XT_GET_DISK_4(rec_head->tr_xact_id_4) != xn_id ||
+		rec_head->tr_rec_type_1 != rec_type ||
+		rec_head->tr_stat_id_1 != stat_id ||
+		XT_GET_DISK_4(rec_head->tr_row_id_4) != row_id)
+		return TRUE;
+
+	return FALSE;
+}
+
+/*
+ * Update the row ID of the index entries that belong to record rec_id,
+ * in all indices of the table.
+ *
+ * rec_data is the record as stored. rec_buffer is a buffer into which
+ * the row is loaded if the record is not of fixed type. Nothing is
+ * done if the table has no indices, or the record type is not one of
+ * those handled below.
+ *
+ * Errors are logged and cleared, they are not passed on to the caller.
+ */
+static void xn_sw_clean_indices(XTThreadPtr XT_NDEBUG_UNUSED(self), XTOpenTablePtr ot, xtRecordID rec_id, xtRowID row_id, xtWord1 *rec_data, xtWord1 *rec_buffer)
+{
+	XTTableHPtr	tab = ot->ot_table;
+	u_int		cols_req;
+	XTIndexPtr	*ind;
+
+	if (!tab->tab_dic.dic_key_count)
+		return;
+
+	cols_req = tab->tab_dic.dic_ind_cols_req;
+	if (XT_REC_IS_FIXED(rec_data[0]))
+		/* Fixed type: the data following the header is used directly. */
+		rec_buffer = rec_data + XT_REC_FIX_HEADER_SIZE;
+	else {
+		if (XT_REC_IS_VARIABLE(rec_data[0])) {
+			if (!myxt_load_row(ot, rec_data + XT_REC_FIX_HEADER_SIZE, rec_buffer, cols_req))
+				goto failed;
+		}
+		else if (XT_REC_IS_EXT_DLOG(rec_data[0])) {
+			ASSERT(cols_req);
+			if (cols_req && cols_req <= tab->tab_dic.dic_fix_col_count) {
+				/* The columns required are loaded from rec_data alone: */
+				if (!myxt_load_row(ot, rec_data + XT_REC_EXT_HEADER_SIZE, rec_buffer, cols_req))
+					goto failed;
+			}
+			else {
+				/* The record is copied to ot_row_rbuffer before the load.
+				 * NOTE(review): presumably xt_tab_load_ext_data() reads
+				 * the record from there - confirm.
+				 */
+				if (rec_data != ot->ot_row_rbuffer)
+					memcpy(ot->ot_row_rbuffer, rec_data, tab->tab_dic.dic_rec_size);
+				if (!xt_tab_load_ext_data(ot, rec_id, rec_buffer, cols_req))
+					goto failed;
+			}
+		}
+		else
+			/* This is possible, the record has already been cleaned up. */
+			return;
+	}
+
+	ind = tab->tab_dic.dic_keys;
+	for (u_int i=0; i<tab->tab_dic.dic_key_count; i++, ind++) {
+		/* A failure on one index does not stop the update of the others: */
+		if (!xt_idx_update_row_id(ot, *ind, rec_id, row_id, rec_buffer))
+			xt_log_and_clear_exception_ns();
+	}
+	return;
+	
+	failed:
+	xt_log_and_clear_exception_ns();
+}
+
+/*
+ * Clean up one variation (record rec_id of table tab_id) logged by 'xact': the committed branch marks the record clean (or, for a delete, removes it) and drops older variations; the aborted branch unlinks and deletes it.
+ * Returns OK when done (TRUE if the table was not found); FALSE if xn_sw_get_open_table() reports XT_TAB_NO_DICTIONARY or XT_TAB_POOL_CLOSED. Other errors are thrown.
+ */
+static xtBool xn_sw_cleanup_variation(XTThreadPtr self, XNSweeperStatePtr ss, XTXactDataPtr xact, xtTableID tab_id, xtRecordID rec_id, u_int status, u_int rec_type, u_int stat_id, xtRowID row_id, xtWord1 *rec_buf)
+{
+	XTOpenTablePtr		ot;
+	XTTableHPtr			tab;
+	XTTabRecHeadDRec	rec_head;
+	xtRecordID			after_rec_id;
+	xtXactID			xn_id;
+	int					r;
+
+	if (!(ot = xn_sw_get_open_table(self, ss, tab_id, &r))) {
+		/* No open table: if it was not found consider cleanup done, otherwise ask the caller to retry: */
+		switch (r) {
+			case XT_TAB_NOT_FOUND:
+				break;
+			case XT_TAB_NO_DICTIONARY:
+			case XT_TAB_POOL_CLOSED:
+				return FALSE;
+		}
+		return TRUE;
+	}
+
+	tab = ot->ot_table;
+
+	/* Make sure the buffer is large enough! (It is handed to xn_sw_clean_indices() below.) */
+	xt_db_set_size(self, &ss->ss_databuf, (size_t) tab->tab_dic.dic_mysql_buf_size);
+
+	xn_id = xact->xd_start_xn_id;
+	if (xact->xd_flags & XT_XN_XAC_COMMITTED) {
+		/* The transaction has been committed. Clean the record and
+		 * remove variations no longer in use.
+		 */
+		switch (status) {
+			case XT_LOG_ENT_REC_MODIFIED:
+			case XT_LOG_ENT_UPDATE:
+			case XT_LOG_ENT_UPDATE_FL:
+			case XT_LOG_ENT_UPDATE_BG:
+			case XT_LOG_ENT_UPDATE_FL_BG:
+				if (xn_sw_cleanup_done(self, ot, rec_id, xn_id, rec_type, stat_id, row_id, &rec_head))
+					goto done_ok;
+				after_rec_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+				xt_sw_delete_variations(self, ss, ot, after_rec_id, row_id, xn_id);
+				rec_head.tr_rec_type_1 |= XT_TAB_STATUS_CLEANED_BIT;
+				XT_SET_NULL_DISK_4(rec_head.tr_prev_rec_id_4);
+				if (!xt_tab_put_log_op_rec_data(ot, XT_LOG_ENT_REC_CLEANED, 0, rec_id, offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE, (xtWord1 *) &rec_head))
+					throw_();
+				xn_sw_clean_indices(self, ot, rec_id, row_id, rec_buf, ss->ss_databuf.db_data);
+				break;
+			case XT_LOG_ENT_INSERT:
+			case XT_LOG_ENT_INSERT_FL:
+			case XT_LOG_ENT_INSERT_BG:
+			case XT_LOG_ENT_INSERT_FL_BG: {
+				/* POTENTIAL BUG 1:
+				 *
+				 * DROP TABLE IF EXISTS t1;
+				 * CREATE TABLE t1 ( id int, name varchar(300)) engine=pbxt;
+				 *
+				 * begin;
+				 * insert t1(id, name) values(1, "aaa");
+				 * update t1 set name=REPEAT('A', 300) where id = 1;
+				 * commit;
+				 * flush tables;
+				 * select * from t1;
+				 *
+				 * Because the type of record changes, from VARIABLE to
+				 * EXTENDED, the cleanup needs to take this into account.
+				 *
+				 * The input new status value which is written here
+				 * depends on the first write to the record.
+				 * However, the second write changes the record status.
+				 *
+				 * Previously we used an OR function to write the bit and
+				 * return the byte value of the result.
+				 *
+				 * The write function now checks the record to be written
+				 * to make sure it matches the record that needs to be
+				 * cleaned. So OR'ing the bit is no longer required.
+				 *
+				 * POTENTIAL BUG 2:
+				 *
+				 * We have changed this to fix the following bug:
+				 *
+				 * T1 starts
+				 * T2 starts
+				 * T2 insert record 100 in row 50
+				 * T2 commits
+				 * T1 updates row 50 and adds record 101
+				 *
+				 * The sweeper does cleanup in order T1, T2, ...
+				 *
+				 * The sweeper cleans T1 by removing record 100 from the
+				 * row 50 variation list.
+				 * This means that record 100 is free.
+				 *
+				 * The sweeper cleans T2 by marking record 100 as clean.
+				 * !BUG! record 100 has already been freed!
+				 *
+				 * To avoid this we have to check a record before
+				 * cleaning (as we do above for update in xn_sw_cleanup_done())
+				 * We check that the record is, in fact, the exact
+				 * record that was inserted.
+				 *
+				 * This is now done by xt_tc_write_cond().
+				 */
+				xtOpSeqNo op_seq;
+
+				rec_head.tr_rec_type_1 = rec_type | XT_TAB_STATUS_CLEANED_BIT;
+				if(!tab->tab_recs.xt_tc_write_cond(self, ot->ot_rec_file, rec_id, rec_head.tr_rec_type_1, &op_seq, xn_id, row_id, stat_id, rec_type))
+					/* This means the record was not updated by xt_tc_write_cond(), and does not need to be. */
+					break;
+				if (!xt_xlog_modify_table(tab->tab_id, XT_LOG_ENT_REC_CLEANED_1, op_seq, 0, rec_id, 1, &rec_head.tr_rec_type_1, self))
+					throw_();
+				xn_sw_clean_indices(self, ot, rec_id, row_id, rec_buf, ss->ss_databuf.db_data);
+				break;
+			}
+			case XT_LOG_ENT_DELETE:
+			case XT_LOG_ENT_DELETE_FL:
+			case XT_LOG_ENT_DELETE_BG:
+			case XT_LOG_ENT_DELETE_FL_BG:
+				if (xn_sw_cleanup_done(self, ot, rec_id, xn_id, rec_type, stat_id, row_id, &rec_head))
+					goto done_ok;
+				after_rec_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+				xt_sw_delete_variations(self, ss, ot, after_rec_id, row_id, xn_id);
+				xt_sw_delete_variation(self, ss, ot, rec_id, TRUE, row_id, xn_id);
+				if (row_id) {
+					if (!xt_tab_free_row(ot, tab, row_id))
+						throw_();
+				}
+				break;
+		}
+	}
+	else {
+		/* The transaction has been aborted. Remove the variation from the
+		 * variation list. If this means the list is empty, then remove
+		 * the record as well.
+		 */
+		xtRecordID			first_rec_id, next_rec_id, prev_rec_id;
+		XTTabRecHeadDRec	prev_rec_head;
+
+		if (xn_sw_cleanup_done(self, ot, rec_id, xn_id, rec_type, stat_id, row_id, &rec_head))
+			goto done_ok;
+
+		if (!row_id)	/* No row ID in the log entry: fall back to the one in the record header. */
+			row_id = XT_GET_DISK_4(rec_head.tr_row_id_4);
+		after_rec_id = XT_GET_DISK_4(rec_head.tr_prev_rec_id_4);
+		if (!row_id)	/* Still no row: nothing to unlink (the row lock is not taken on this path). */
+			goto unlink_done;
+
+		/* Now remove the record from the variation list,
+		 * (if it is still on the list).
+		 */
+		XT_TAB_ROW_WRITE_LOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+
+		/* Find the variation before the variation we wish to remove: */
+		if (!(xt_tab_get_row(ot, row_id, &first_rec_id)))
+			goto failed;
+		prev_rec_id = 0;
+		next_rec_id = first_rec_id;
+		while (next_rec_id != rec_id) {
+			if (!next_rec_id) {
+				/* The record was not found in the list (we are done) */
+				XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+				goto unlink_done;
+			}
+			if (!xt_tab_get_rec_data(ot, next_rec_id, sizeof(XTTabRecHeadDRec), (xtWord1 *) &prev_rec_head)) {
+				xt_log_and_clear_exception(self);	/* Read failed: log it, stop walking, and skip the unlink below. */
+				break;
+			}
+
+			prev_rec_id = next_rec_id;
+			next_rec_id = XT_GET_DISK_4(prev_rec_head.tr_prev_rec_id_4);
+		}
+
+		if (next_rec_id == rec_id) {
+			/* The record was found on the list: */
+			if (prev_rec_id) {
+				/* Unlink the deleted variation:
+				 * I have found the following sequence:
+				 *
+				 * 17933 in use  1906112
+				 * 1906112 delete      xact=2901   row=17933 prev=2419240
+				 * 2419240 delete      xact=2899   row=17933 prev=2153360
+				 * 2153360 record-X C  xact=2599   row=17933 prev=0 Xlog=151 Xoff=16824 Xsiz=100
+				 *
+				 * Despite the following facts which should prevent chains from
+				 * forming:
+				 *
+				 * --- Only one transaction can modify a row
+				 * at any one time. So it is not possible for a new change
+				 * to be linked onto an uncommitted change.
+				 *
+				 * --- Transactions that modify the same row
+				 * twice do not allocate a new record for each change.
+				 *
+				 * --- A change that has been
+				 * rolled back will not be linked onto. Instead
+				 * the new transaction will link to the last
+				 * committed record.
+				 *
+				 * So if the sweeper is slow in doing its job
+				 * we can have the situation that a number of records
+				 * can refer to the last committed record of the
+				 * row.
+				 *
+				 * Only one will be referenced by the row pointer.
+				 *
+				 * The others will all have been rolled back.
+				 * This occurs over here: [(4)]
+				 */
+				XT_SET_DISK_4(prev_rec_head.tr_prev_rec_id_4, after_rec_id);
+				if (!xt_tab_put_log_op_rec_data(ot, XT_LOG_ENT_REC_UNLINKED, 0, prev_rec_id, offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE, (xtWord1 *) &prev_rec_head))
+					goto failed;
+			}
+			else {
+				/* Variation to be removed at the front of the list. */
+				ASSERT(rec_id == first_rec_id);
+				if (after_rec_id) {
+					/* Unlink the deleted variation, from the front of the list: */
+					if (!xt_tab_set_row(ot, XT_LOG_ENT_ROW_SET, row_id, after_rec_id))
+						goto failed;
+				}
+				else {
+					/* No more variations, remove the row: */
+					if (!xt_tab_free_row(ot, tab, row_id))
+						goto failed;
+				}
+			}
+		}
+
+		XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+
+		/* Note: even when not found on the row list, the record must still
+		 * be freed.
+		 *
+		 * There might be an exception to this, but there are very definite
+		 * cases where this is required, for example when an unreferenced
+		 * record is found and added to the clean up list xn_add_cu_record().
+		 */
+
+		unlink_done:
+		/* Delete the extended record and index entries:
+		 *
+		 * NOTE! This must be done after we have released the row lock. Because
+		 * a thread that does a duplicate check locks the index, and then
+		 * checks whether a row is valid, and can deadlock with
+		 * code that locks a row, then an index!
+		 *
+		 * However, this should all be OK, because the variation has been removed from the
+		 * row variation list at this stage, and now just needs to be deleted.
+		 */
+		xt_sw_delete_variation(self, ss, ot, rec_id, FALSE, row_id, xn_id);
+	}
+
+	done_ok:
+	return OK;
+
+	failed:	/* Every "goto failed" above is issued while the row write lock is held. */
+	XT_TAB_ROW_UNLOCK(&tab->tab_row_rwlock[row_id % XT_ROW_RWLOCKS], self);
+	throw_();
+	return FAILED;
+}
+
+/* Go through all updated records of a transaction and cleanup.
+ * This means, if the transaction was aborted, then all the variations written
+ * by the transaction must be removed.
+ * If the transaction was committed then we remove older variations.
+ * If a delete was committed this can lead to the row being removed.
+ *
+ * After a transaction has been cleaned it can be removed from RAM.
+ * If this was the last transaction in a log, and the log has reached
+ * threshold, and the log is no longer in exclusive use, then the log
+ * can be deleted.
+ *
+ * This function returns OK if the transaction was cleaned up, FAILED
+ * if a retry is required. Otherwise an error is thrown.
+ */
+static xtBool xn_sw_cleanup_xact(XTThreadPtr self, XNSweeperStatePtr ss, XTXactDataPtr xact)
+{
+	XTDatabaseHPtr		db = ss->ss_db;
+	XTXactLogBufferDPtr	record;
+	xtTableID			tab_id;
+	xtRecordID			rec_id;
+	xtXactID			xn_id;
+	xtRowID				row_id;
+
+	/* Position the sequential log reader at the point where this transaction began: */
+	if (!db->db_xlog.xlog_seq_start(&ss->ss_seqread, xact->xd_begin_log, xact->xd_begin_offset, FALSE))
+		xt_throw(self);
+
+	for (;;) {
+		if (self->t_quit)	/* Thread is quitting: abandon the scan (the caller treats FAILED as "retry later"). */
+			return FAILED;
+
+		xn_sw_could_go_faster(self, db);
+
+		if (!db->db_xlog.xlog_seq_next(&ss->ss_seqread, &record, FALSE, self))
+			xt_throw(self);
+		if (!record) {
+			/* Recovered transactions are considered cleaned when we
+			 * reach the end of the transaction log.
+			 * This is required, because transactions that do
+			 * not have a commit (or rollback) record, because they were
+			 * running when the server last went down, will otherwise not
+			 * have the cleanup completed!!
+			 */
+			ASSERT(xact->xd_flags & XT_XN_XAC_RECOVERED);
+			if (!(xact->xd_flags & XT_XN_XAC_RECOVERED))
+				return FAILED;
+			goto cleanup_done;
+		}
+		switch (record->xh.xh_status_1) {
+			case XT_LOG_ENT_NEW_LOG:	/* Continue reading at offset 0 of the log named by this entry. */
+				if (!db->db_xlog.xlog_seq_start(&ss->ss_seqread, XT_GET_DISK_4(record->xl.xl_log_id_4), 0, FALSE))
+					xt_throw(self);
+				break;
+			case XT_LOG_ENT_COMMIT:
+			case XT_LOG_ENT_ABORT:	/* The end record of our own transaction terminates the scan. */
+				xn_id = XT_GET_DISK_4(record->xe.xe_xact_id_4);
+				if (xn_id == xact->xd_start_xn_id)
+					goto cleanup_done;
+				break;
+			case XT_LOG_ENT_REC_MODIFIED:
+			case XT_LOG_ENT_UPDATE:
+			case XT_LOG_ENT_INSERT:
+			case XT_LOG_ENT_DELETE:
+			case XT_LOG_ENT_UPDATE_BG:
+			case XT_LOG_ENT_INSERT_BG:
+			case XT_LOG_ENT_DELETE_BG:	/* These entries are decoded through the 'xu' layout of the log buffer. */
+				xn_id = XT_GET_DISK_4(record->xu.xu_xact_id_4);
+				if (xn_id != xact->xd_start_xn_id)	/* Entry belongs to another transaction: skip it. */
+					break;
+				tab_id = XT_GET_DISK_4(record->xu.xu_tab_id_4);
+				rec_id = XT_GET_DISK_4(record->xu.xu_rec_id_4);
+				row_id = XT_GET_DISK_4(record->xu.xu_row_id_4);
+				if (!xn_sw_cleanup_variation(self, ss, xact, tab_id, rec_id, record->xu.xu_status_1, record->xu.xu_rec_type_1, record->xu.xu_stat_id_1, row_id, &record->xu.xu_rec_type_1))
+					return FAILED;
+				break;
+			case XT_LOG_ENT_UPDATE_FL:
+			case XT_LOG_ENT_INSERT_FL:
+			case XT_LOG_ENT_DELETE_FL:
+			case XT_LOG_ENT_UPDATE_FL_BG:
+			case XT_LOG_ENT_INSERT_FL_BG:
+			case XT_LOG_ENT_DELETE_FL_BG:	/* Same handling as above, but decoded through the 'xf' layout. */
+				xn_id = XT_GET_DISK_4(record->xf.xf_xact_id_4);
+				if (xn_id != xact->xd_start_xn_id)
+					break;
+				tab_id = XT_GET_DISK_4(record->xf.xf_tab_id_4);
+				rec_id = XT_GET_DISK_4(record->xf.xf_rec_id_4);
+				row_id = XT_GET_DISK_4(record->xf.xf_row_id_4);
+				if (!xn_sw_cleanup_variation(self, ss, xact, tab_id, rec_id, record->xf.xf_status_1, record->xf.xf_rec_type_1, record->xf.xf_stat_id_1, row_id, &record->xf.xf_rec_type_1))
+					return FAILED;
+				break;
+			default:	/* All other log entry types are ignored by the sweeper. */
+				break;
+		}
+	}
+
+	cleanup_done:
+	/* Write the log to indicate the transaction has been cleaned: */
+	XTXactCleanupEntryDRec cu;
+
+	cu.xc_status_1 = XT_LOG_ENT_CLEANUP;
+	cu.xc_checksum_1 = XT_CHECKSUM_1(XT_CHECKSUM4_XACT(xact->xd_start_xn_id));
+	XT_SET_DISK_4(cu.xc_xact_id_4, xact->xd_start_xn_id);
+
+	if (!xt_xlog_log_data(self, sizeof(XTXactCleanupEntryDRec), (XTXactLogBufferDPtr) &cu, XT_XLOG_NO_WRITE_NO_FLUSH))
+		return FAILED;
+
+	ss->ss_flush_pending = TRUE;	/* xn_sw_main() flushes the log later, when it sees this flag. */
+
+	xact->xd_flags |= XT_XN_XAC_CLEANED;
+	ASSERT(db->db_xn_to_clean_id == xact->xd_start_xn_id);
+#ifdef MUST_DELAY_REMOVE
+	xn_sw_add_xact_to_free(self, ss, xact->xd_start_xn_id);
+#else
+	xn_id = xact->xd_start_xn_id;
+	if (xt_xn_delete_xact(db, xn_id, self)) {
+		/* Recalculate the minimum memory transaction: */
+		ASSERT(!xt_xn_is_before(xn_id, db->db_xn_min_ram_id));
+
+		if (db->db_xn_min_ram_id == xn_id) {
+			db->db_xn_min_ram_id = xn_id+1;
+		}
+		else {
+			xtXactID xn_curr_xn_id = xt_xn_get_curr_id(db);
+
+			while (!xt_xn_is_before(xn_curr_xn_id, db->db_xn_min_ram_id)) { // was db->db_xn_min_ram_id <= xn_curr_xn_id
+				/* db_xn_min_ram_id may be changed, by some other process! */
+				xn_id = db->db_xn_min_ram_id;
+				if (xn_get_xact_details(db, xn_id, self, NULL, NULL, NULL, NULL))
+					break;
+				db->db_xn_min_ram_id = xn_id+1;
+			}
+		}
+	}
+#endif
+
+	return OK;
+}
+
+static void xn_free_sw_state(XTThreadPtr self, XNSweeperStatePtr ss)
+{
+	xn_sw_close_open_table(self, ss);			/* 1. Close whatever table the sweeper state still has open. */
+	if (ss->ss_db != NULL)						/* 2. End the sequential log read (only if a database was attached). */
+		(ss->ss_db)->db_xlog.xlog_seq_exit(&ss->ss_seqread);
+	xt_db_set_size(self, &ss->ss_databuf, 0);	/* 3. Shrink the data buffer to zero. */
+	xt_bq_set_size(self, &ss->ss_to_free, 0);	/* 4. Shrink the "to free" queue to zero. */
+}
+
+static void xn_sw_main(XTThreadPtr self)
+{
+	XTDatabaseHPtr		db = self->st_database;
+	XNSweeperStatePtr	ss;
+	XTXactDataPtr		xact, xact2;
+	time_t				idle_start = 0;
+	xtXactID			curr_id;
+	/* Sweeper main loop: clean up ended transactions in ID order (db_xn_to_clean_id) until told to quit. */
+	xt_set_priority(self, xt_db_sweeper_priority);
+
+	alloczr_(ss, xn_free_sw_state, sizeof(XNSweeperStateRec), XNSweeperStatePtr);
+	ss->ss_db = db;
+
+	if (!db->db_xlog.xlog_seq_init(&ss->ss_seqread, xt_db_log_buffer_size, FALSE))
+		xt_throw(self);
+
+	ss->ss_to_free.bq_item_size = sizeof(XNSWToFreeItemRec);
+	ss->ss_to_free.bq_max_waste = XT_TN_MAX_TO_FREE_WASTE;
+	ss->ss_to_free.bq_item_inc = XT_TN_MAX_TO_FREE_INC;
+	ss->ss_call_cnt = 0;
+	ss->ss_flush_pending = FALSE;
+
+	while (!self->t_quit) {
+		while (!self->t_quit) {
+			/* We are just about to check the condition for sleeping,
+			 * so if the condition for sleeping holds, then we will
+			 * exit the loop and sleep.
+			 *
+			 * We will then sleep if nobody sets the flag before we
+			 * actually do sleep!
+			 */
+			curr_id = xt_xn_get_curr_id(db);
+			if (xt_xn_is_before(curr_id, db->db_xn_to_clean_id)) {
+				db->db_sw_faster &= ~XT_SW_TOO_FAR_BEHIND;
+				break;
+			}
+			/* {TUNING} How far do we allow the sweeper to get behind?
+			 * The higher this is, the higher burst performance can
+			 * be. But too high and the sweeper falls out of reading the
+			 * transaction log cache, and also starts to spread
+			 * changes around in index and data blocks that are no
+			 * longer hot.
+			 */
+			if (curr_id - db->db_xn_to_clean_id > 250)
+				db->db_sw_faster |= XT_SW_TOO_FAR_BEHIND;
+			else
+				db->db_sw_faster &= ~XT_SW_TOO_FAR_BEHIND;
+			xn_sw_could_go_faster(self, db);
+			idle_start = 0;
+
+			if ((xact = xt_xn_get_xact(db, db->db_xn_to_clean_id, self))) {
+				xtXactID xn_id;
+
+				/* The sweep flag is set when the transaction is ready for sweeping.
+				 * Prepared transactions may not be swept!
+				 */
+				if (!(xact->xd_flags & XT_XN_XAC_SWEEP) || (xact->xd_flags & XT_XN_XAC_PREPARED))
+					goto sleep;
+
+				/* Check if we can cleanup the transaction.
+				 * We do this by checking to see if there is any running
+				 * transaction which started before the end of this transaction.
+				 */
+				xn_id = xact->xd_start_xn_id;
+				while (xt_xn_is_before(xn_id, xact->xd_end_xn_id)) {
+					xn_id++;
+					if ((xact2 = xt_xn_get_xact(db, xn_id, self))) {
+						if (!(xact2->xd_flags & XT_XN_XAC_ENDED)) {
+							/* A transaction was started before the end of
+							 * the transaction we wish to sweep, and this
+							 * transaction has not ended, so we have to
+							 * wait.
+							 */
+							db->db_stat_sweep_waits++;
+							goto sleep;
+						}
+					}
+				}
+				
+				/* Can cleanup the transaction, and move to the next. */
+				if (xact->xd_flags & XT_XN_XAC_LOGGED) {
+#ifdef TRACE_SWEEPER_ACTIVITY
+					printf("SWEEPER: cleanup %d\n", (int) xact->xd_start_xn_id);
+#endif
+					if (!xn_sw_cleanup_xact(self, ss, xact)) {
+						/* We failed to clean (try again later)... */
+#ifdef TRACE_SWEEPER_ACTIVITY
+						printf("SWEEPER: cleanup retry...\n");	/* No conversion in the format, so no argument. */
+#endif
+						goto sleep;
+					}
+#ifdef TRACE_SWEEPER_ACTIVITY
+					printf("SWEEPER: cleanup DONE\n");	/* 'xact' may have been deleted by the cleanup: do not touch it here. */
+#endif
+				}
+				else {
+					/* This was a read-only transaction, it is safe to
+					 * just remove the transaction structure from memory.
+					 * (should not be necessary because RO transactions
+					 * do this themselves):
+					 */
+					if (xt_xn_delete_xact(db, db->db_xn_to_clean_id, self)) {
+						if (db->db_xn_min_ram_id == db->db_xn_to_clean_id)
+							db->db_xn_min_ram_id = db->db_xn_to_clean_id+1;
+					}
+				}
+			}
+			
+			/* Move on to clean the next: */
+			db->db_xn_to_clean_id++;
+		}
+
+		sleep:			
+
+		xn_sw_close_open_table(self, ss);
+
+		xn_sw_go_slower(self, db);
+
+		/* Shrink the free list, if it is empty, and larger than
+		 * the default:
+		 */
+		if (ss->ss_to_free.bq_size > XT_TN_MAX_TO_FREE) {
+			if (ss->ss_to_free.bq_front == 0 && ss->ss_to_free.bq_back == 0)
+				xt_bq_set_size(self, &ss->ss_to_free, XT_TN_MAX_TO_FREE);
+		}
+
+		/* Windows: close the log file that we have open for reading, if we
+		 * read past the end of the log on the last transaction.
+		 * This makes sure that the log is closed when the checkpointer
+		 * tries to remove or rename it!!
+		 */
+		if (ss->ss_seqread.xseq_log_file) {
+			if (ss->ss_seqread.xseq_rec_log_id != ss->ss_seqread.xseq_log_id)
+				db->db_xlog.xlog_seq_close(&ss->ss_seqread);
+		}
+
+		if (ss->ss_flush_pending) {
+			/* Flush pending means we have written something to the log.
+			 *
+			 * If so we flush the log so that the writer will also do
+			 * its work!
+			 *
+			 * This will lead to the freeer continuing if it is waiting.
+			 */
+
+			time_t now = time(NULL);
+			if (idle_start) {
+				/* By default, we wait for 2 seconds idle time, then
+				 * we flush the log.
+				 */
+				if (now >= idle_start + 2) {
+					/* Don't do this if flusher is active! */
+					if (!db->db_fl_thread &&
+						!xt_xlog_flush_log(db, self))
+						xt_throw(self);
+					ss->ss_flush_pending = FALSE;
+				}
+			}
+			else
+				idle_start = now;
+		}
+
+		/* {WAKE-SW} Waking up the sweeper is very expensive!
+		 * Cost is 3% of execution time on the test:
+		 * runTest(SMALL_SELECT_TEST, 2, 100000)
+		 *
+		 * On the other hand, polling every 1/10 second
+		 * is cheap, because the check for transactions
+		 * ready for cleanup is very quick.
+		 *
+		 * So this is the preferred method.
+		 */
+		xn_sw_wait_for_xact(self, db, 10);
+	}
+
+	if (ss->ss_flush_pending) {	/* Quitting: one last flush attempt; its result is not checked. */
+		xt_xlog_flush_log(db, self);
+		ss->ss_flush_pending = FALSE;
+	}
+
+	freer_(); // xn_free_sw_state(ss)
+}
+
+static void *xn_sw_run_thread(XTThreadPtr self)
+{
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) self->t_data;
+	int				count;
+	void			*mysql_thread;
+	/* Sweeper thread entry point: run xn_sw_main() repeatedly, pausing after each exit, until t_quit is set. */
+	if (!(mysql_thread = myxt_create_thread()))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		try_(a) {
+			/*
+			 * The garbage collector requires that the database
+			 * is in use.
+			 */
+			xt_use_database(self, db, XT_FOR_SWEEPER);
+
+			/* This action is both safe and required:
+			 *
+			 * safe: releasing the database is safe because as
+			 * long as this thread is running the database
+			 * reference is valid, and this reference cannot
+			 * be the only one to the database because
+			 * otherwise this thread would not be running.
+			 *
+			 * required: releasing the database is necessary
+			 * otherwise we cannot close the database
+			 * correctly because we only shutdown this
+			 * thread when the database is closed and we
+			 * only close the database when all references
+			 * are removed.
+			 */
+			xt_heap_release(self, self->st_database);
+
+			xn_sw_main(self);
+		}
+		catch_(a) {
+			/* This error is "normal"! (XT_ERR_NO_DICTIONARY and a caught SIGTERM are not logged.) */
+			if (self->t_exception.e_xt_err != XT_ERR_NO_DICTIONARY &&
+				!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* Avoid releasing the database (done above) */
+		self->st_database = NULL;
+		xt_unuse_database(self, self);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds (the wait below polls t_quit once per second) */
+#ifdef DEBUG
+		count = 10;
+#else
+		count = 2*60;
+#endif
+		db->db_sw_idle = XT_THREAD_INERR;
+		while (!self->t_quit && count > 0) {
+			sleep(1);
+			count--;
+		}
+		db->db_sw_idle = XT_THREAD_BUSY;
+	}
+
+   /*
+	* {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	*/
+	return NULL;
+}
+
+static void xn_sw_free_thread(XTThreadPtr self, void *data)
+{
+	XTDatabaseHPtr	sw_db = (XTDatabaseHPtr) data;
+	/* Thread-data free callback (registered by xt_start_sweeper()): forget the sweeper thread, under db_sw_lock. */
+	if (sw_db->db_sw_thread == NULL)
+		return;
+	xt_lock_mutex(self, &sw_db->db_sw_lock);
+	pushr_(xt_unlock_mutex, &sw_db->db_sw_lock);
+	sw_db->db_sw_thread = NULL;
+	freer_(); // xt_unlock_mutex(&sw_db->db_sw_lock)
+}
+
+/* Idle the sweeper: mark it XT_THREAD_IDLE and wait on db_sw_cond with a timeout of hsecs * 10, unless quitting or asked to go faster. */
+static void xn_sw_wait_for_xact(XTThreadPtr self, XTDatabaseHPtr db, u_int hsecs)
+{
+	xt_lock_mutex(self, &db->db_sw_lock);
+	pushr_(xt_unlock_mutex, &db->db_sw_lock);
+	db->db_sw_idle = XT_THREAD_IDLE;
+	if (!(self->t_quit || db->db_sw_faster))	/* Skip the wait when there is a reason to keep running. */
+		xt_timed_wait_cond(self, &db->db_sw_cond, &db->db_sw_lock, hsecs * 10);
+	db->db_sw_idle = XT_THREAD_BUSY;
+	db->db_sw_check_count += 1;
+	freer_(); // xt_unlock_mutex(&db->db_sw_lock)
+}
+
+xtPublic void xt_start_sweeper(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	/* Create and run the sweeper daemon for 'db'. The name buffer has room for "SW-" on top of a PATH_MAX-sized component. */
+	char name[PATH_MAX + 4];
+	sprintf(name, "SW-%s", xt_last_directory_of_path(db->db_main_path));
+	xt_remove_dir_char(name);
+	db->db_sw_thread = xt_create_daemon(self, name);
+	xt_set_thread_data(db->db_sw_thread, db, xn_sw_free_thread);
+	xt_run_thread(self, db->db_sw_thread, xn_sw_run_thread);
+}
+
+xtPublic void xt_wait_for_sweeper(XTThreadPtr self, XTDatabaseHPtr db, int abort_time)
+{
+	time_t	started, current;
+	xtBool	announced = FALSE;	/* TRUE once the "Waiting for..." message has been logged. */
+
+	/* Wait until the sweeper has caught up with db_xn_curr_id, nudging
+	 * it awake every 10 ms. A non-zero abort_time (seconds) bounds the wait.
+	 */
+	if (!db->db_sw_thread)
+		return;
+	started = time(NULL);
+	/* db->db_xn_curr_id is read directly here rather than through
+	 * xt_xn_get_curr_id(db). The two only differ in which transactions
+	 * we can expect to find in memory (see {GAP-INC-ADD-XACT}), and
+	 * that difference does not matter for this test.
+	 */
+	while (!xt_xn_is_before(db->db_xn_curr_id, db->db_xn_to_clean_id)) { // was db->db_xn_to_clean_id <= xt_xn_get_curr_id(db)
+		xt_lock_mutex(self, &db->db_sw_lock);
+		pushr_(xt_unlock_mutex, &db->db_sw_lock);
+		xt_wakeup_sweeper(db);
+		freer_(); // xt_unlock_mutex(&db->db_sw_lock)
+		xt_sleep_milli_second(10);
+		current = time(NULL);
+		if (abort_time && current >= started + abort_time) {
+			xt_logf(XT_NT_INFO, "Aborting wait for '%s' sweeper\n", db->db_name);
+			announced = FALSE;	/* Suppresses the "done" message below. */
+			break;
+		}
+		if (!announced && current >= started + 2) {
+			announced = TRUE;
+			xt_logf(XT_NT_INFO, "Waiting for '%s' sweeper...\n", db->db_name);
+		}
+	}
+
+	if (announced)
+		xt_logf(XT_NT_INFO, "Sweeper '%s' done.\n", db->db_name);
+}
+
+xtPublic void xt_stop_sweeper(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	XTThreadPtr thr_sw;
+	/* Ask the sweeper thread of 'db' to terminate, wake it up, and wait for it to exit. */
+	if (db->db_sw_thread) {
+		xt_lock_mutex(self, &db->db_sw_lock);
+		pushr_(xt_unlock_mutex, &db->db_sw_lock);
+
+		/* This pointer is safe as long as you hold db_sw_lock (xn_sw_free_thread() clears it under that lock). */
+		if ((thr_sw = db->db_sw_thread)) {
+			xtThreadID tid = thr_sw->t_id;
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_sw);
+
+			xt_wakeup_sweeper(db);
+	
+			freer_(); // xt_unlock_mutex(&db->db_sw_lock)
+
+			/*
+			 * GOTCHA: This is a weird thing but the SIGTERM directed
+			 * at a particular thread (in this case the sweeper) was
+			 * being caught by a different thread and killing the server
+			 * sometimes. Disconcerting.
+			 * (this may only be a problem on Mac OS X)
+			xt_kill_thread(thread);
+			 */
+			xt_wait_for_thread(tid, FALSE);
+	
+			/* PMC - It should not be necessary to set the signal here, but in the
+			 * debugger the handler is not called!!?
+			thr_sw->t_delayed_signal = SIGTERM;
+			xt_kill_thread(thread);
+			 */
+			db->db_sw_thread = NULL;
+		}
+		else	/* The thread went away between the unlocked check and taking the lock: just unlock. */
+			freer_(); // xt_unlock_mutex(&db->db_sw_lock)
+	}
+}
+
+xtPublic void xt_wakeup_sweeper(XTDatabaseHPtr db)
+{
+	/* Deliberately lock-free: taking db_sw_lock on every
+	 * wakeup is too expensive.
+	 *
+	 * Testing db_sw_idle first keeps the window for a lost
+	 * wakeup very small, but does not close it.
+	 *
+	 * A missed wakeup is tolerated, because the sweeper's
+	 * wait is timed and will expire eventually.
+	 *
+	 * A failed broadcast is logged and otherwise ignored.
+	 */
+	if (!db->db_sw_idle)
+		return;
+	if (!xt_broadcast_cond_ns(&db->db_sw_cond))
+		xt_log_and_clear_exception_ns();
+}
diff --git a/storage/pbxt/src/xaction_xt.h b/storage/pbxt/src/xaction_xt.h
new file mode 100644
index 00000000000..e679a0f38f0
--- /dev/null
+++ b/storage/pbxt/src/xaction_xt.h
@@ -0,0 +1,230 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2005-04-10	Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_xaction_h__
+#define __xt_xaction_h__
+
+#include "filesys_xt.h"
+#include "lock_xt.h"
+
+struct XTThread;
+struct XTDatabase;
+struct XTOpenTable;
+
+#ifdef DEBUG
+//#define XT_USE_XACTION_DEBUG_SIZES
+#endif
+
+#ifdef XT_USE_XACTION_DEBUG_SIZES
+
+#define XT_TN_NUMBER_INCREMENT	20
+#define XT_TN_MAX_TO_FREE		20
+#define XT_TN_MAX_TO_FREE_WASTE	3
+#define XT_TN_MAX_TO_FREE_CHECK	3
+#define XT_TN_MAX_TO_FREE_INC	3
+
+#define XT_XN_SEGMENT_SHIFTS	1
+
+#else
+
+#define XT_TN_NUMBER_INCREMENT	100		// The increment of the transaction number on restart
+#define XT_TN_MAX_TO_FREE		800		// The maximum size of the "to free" list
+#define XT_TN_MAX_TO_FREE_WASTE	400		// Used by the sweeper as bq_max_waste of its "to free" queue
+#define XT_TN_MAX_TO_FREE_CHECK	100		// Once we have exceeded the limit, we only try in intervals
+#define XT_TN_MAX_TO_FREE_INC	100		// Used by the sweeper as bq_item_inc of its "to free" queue
+
+//#define XT_XN_SEGMENT_SHIFTS	5		// (32)
+//#define XT_XN_SEGMENT_SHIFTS	6		// (64)
+//#define XT_XN_SEGMENT_SHIFTS	7		// (128)
+#define XT_XN_SEGMENT_SHIFTS	8		// (256)
+//#define XT_XN_SEGMENT_SHIFTS	9		// (512)
+
+#endif
+
+/* The hash table size (a prime number); the value in brackets is the segment count, 1 << XT_XN_SEGMENT_SHIFTS */
+#if XT_XN_SEGMENT_SHIFTS == 1		// (2)
+#define XT_XN_HASH_TABLE_SIZE	1301
+#elif XT_XN_SEGMENT_SHIFTS == 5		// (32)
+#define XT_XN_HASH_TABLE_SIZE	1009
+#elif XT_XN_SEGMENT_SHIFTS == 6		// (64)
+#define XT_XN_HASH_TABLE_SIZE	503
+#elif XT_XN_SEGMENT_SHIFTS == 7		// (128)
+#define XT_XN_HASH_TABLE_SIZE	251
+#elif XT_XN_SEGMENT_SHIFTS == 8		// (256)
+#define XT_XN_HASH_TABLE_SIZE	127
+#elif XT_XN_SEGMENT_SHIFTS == 9		// (512)
+#define XT_XN_HASH_TABLE_SIZE	67
+#endif	// No #else: any other shift value leaves XT_XN_HASH_TABLE_SIZE undefined
+
+/* Number of pre-allocated transaction data structures per segment */
+#define XT_XN_DATA_ALLOC_COUNT	XT_XN_HASH_TABLE_SIZE
+
+#define XT_XN_NO_OF_SEGMENTS	(1 << XT_XN_SEGMENT_SHIFTS)
+#define XT_XN_SEGMENT_MASK		(XT_XN_NO_OF_SEGMENTS - 1)
+
+#define XT_XN_XAC_LOGGED		1					/* XT_XN_XAC_*: bits of XTXactData.xd_flags. Without LOGGED the sweeper treats the transaction as read-only. */
+#define XT_XN_XAC_ENDED			2					/* The transaction has ended. */
+#define XT_XN_XAC_COMMITTED		4					/* The transaction was committed. */
+#define XT_XN_XAC_CLEANED		8					/* The transaction has been cleaned. */
+#define XT_XN_XAC_RECOVERED		16					/* This transaction was detected on recovery. */
+#define XT_XN_XAC_SWEEP			32					/* End ID has been set, OK to sweep. */
+#define XT_XN_XAC_PREPARED		64					/* The transaction was prepared (used only by recovery). */
+/* NOTE(review): the values below are presumably the results of xt_xn_status() - confirm against its definition. */
+#define XT_XN_VISIBLE			0					/* The transaction is committed, and the record is visible. */
+#define XT_XN_NOT_VISIBLE		1					/* The transaction is committed, but not visible. */
+#define XT_XN_ABORTED			2					/* Transaction was aborted. */
+#define XT_XN_MY_UPDATE			3					/* The record was updated by me. */
+#define XT_XN_OTHER_UPDATE		4					/* The record was updated by someone else. */
+#define XT_XN_REREAD			5					/* The transaction is no longer in RAM, status is unknown, retry. */
+
+typedef struct XTXactPrepare {						/* NOTE(review): looks like the stored data of one prepared (XA) transaction - confirm. */
+	xtXactID					xp_xact_id;
+	xtWord4						xp_hash;
+	struct XTXactPrepare		*xp_next;			/* Next item in hash table. */
+	int							xp_data_len;		/* Presumably the number of bytes used in xp_xa_data - TODO confirm. */
+	xtWord1						xp_xa_data[XT_MAX_XA_DATA_SIZE];
+} XTXactPrepareRec, *XTXactPreparePtr;
+
+typedef struct XTXactXA {							/* Pairs a transaction ID with a pointer to an XTXactPrepareRec. */
+	xtXactID					xx_xact_id;
+	XTXactPreparePtr			xx_xa_ptr;
+} XTXactXARec, *XTXactXAPtr;
+
+typedef struct XTXactEnumXA {						/* State passed to xt_xn_enum_xa_data(). */
+	u_int						exa_index;			/* Presumably the current enumeration position - TODO confirm. */
+	xtBool						exa_locked;
+} XTXactEnumXARec, *XTXactEnumXAPtr;
+
+typedef struct XTXactData {
+	xtXactID					xd_start_xn_id;			/* Note: may be zero! */
+	xtXactID					xd_end_xn_id;			/* Note: may be zero! The sweeper checks every ID up to this one before cleaning. */
+
+	/* The begin position (the sweeper starts its log scan here): */
+	xtLogID						xd_begin_log;			/* Non-zero if begin has been logged. */
+	xtLogOffset					xd_begin_offset;
+	int							xd_flags;				/* XT_XN_XAC_* bits. */
+	xtWord4						xd_end_time;
+	xtThreadID					xd_thread_id;
+	xtWord4						xd_xa_hash;				/* 0 if no XA transaction. */
+
+	/* A transaction may be indexed twice in the hash table.
+	 * Once on the start sequence number, and once on the
+	 * end sequence number.
+	 */
+	struct XTXactData			*xd_next_xact;		/* Next pointer in the hash table, also used by the free list. */
+
+} XTXactDataRec, *XTXactDataPtr;
+
+#ifdef XT_NO_ATOMICS
+#define XT_XACT_USE_PTHREAD_RW
+#else
+//#define XT_XACT_USE_SKEWRWLOCK
+#define XT_XACT_USE_SPINXSLOCK
+#endif
+/* XT_XACT_* lock wrappers. Argument order differs: INIT/FREE take (s, i); READ/WRITE/UNLOCK take (i, s). Not every variant uses 's' or 'b'. */
+#if defined(XT_XACT_USE_PTHREAD_RW)
+#define XT_XACT_LOCK_TYPE				xt_rwlock_type
+#define XT_XACT_INIT_LOCK(s, i)			xt_init_rwlock_with_autoname(s, i)
+#define XT_XACT_FREE_LOCK(s, i)			xt_free_rwlock(i)	
+#define XT_XACT_READ_LOCK(i, s)			xt_slock_rwlock_ns(i)
+#define XT_XACT_WRITE_LOCK(i, s)		xt_xlock_rwlock_ns(i)
+#define XT_XACT_UNLOCK(i, s, b)			xt_unlock_rwlock_ns(i)
+#elif defined(XT_XACT_USE_SPINXSLOCK)
+#define XT_XACT_LOCK_TYPE				XTSpinXSLockRec
+#define XT_XACT_INIT_LOCK(s, i)			xt_spinxslock_init_with_autoname(s, i)
+#define XT_XACT_FREE_LOCK(s, i)			xt_spinxslock_free(s, i)	
+#define XT_XACT_READ_LOCK(i, s)			xt_spinxslock_slock(i)
+#define XT_XACT_WRITE_LOCK(i, s)		xt_spinxslock_xlock(i, FALSE, (s)->t_id)
+#define XT_XACT_UNLOCK(i, s, b)			xt_spinxslock_unlock(i, b)
+#else	/* Neither of the above: the XTSkewRWLockRec variant. */
+#define XT_XACT_LOCK_TYPE				XTSkewRWLockRec
+#define XT_XACT_INIT_LOCK(s, i)			xt_skewrwlock_init_with_autoname(s, i)
+#define XT_XACT_FREE_LOCK(s, i)			xt_skewrwlock_free(s, i)	
+#define XT_XACT_READ_LOCK(i, s)			xt_skewrwlock_slock(i)
+#define XT_XACT_WRITE_LOCK(i, s)		xt_skewrwlock_xlock(i, FALSE, (s)->t_id)
+#define XT_XACT_UNLOCK(i, s, b)			xt_skewrwlock_unlock(i, b)
+#endif
+
+/* We store the transactions in a number of segments (XT_XN_NO_OF_SEGMENTS),
+ * each segment has its own lock, free list and hash table.
+ */
+typedef struct XTXactSeg {
+	XT_XACT_LOCK_TYPE			xs_tab_lock;						/* Lock for hash table. */
+	xtXactID					xs_last_xn_id;						/* The last transaction ID added. */
+	XTXactDataPtr				xs_free_list;						/* List of transaction data structures. */
+	XTXactDataPtr				xs_table[XT_XN_HASH_TABLE_SIZE];	/* Hash table containing the transaction data structures. */
+} XTXactSegRec, *XTXactSegPtr;
+
+typedef struct XTXactWait {							/* Argument block of xt_xn_wait_for_xact(). */
+	xtXactID					xw_xn_id;			/* Presumably the ID of the transaction being waited for - TODO confirm. */
+} XTXactWaitRec, *XTXactWaitPtr;
+
+void			xt_thread_wait_init(struct XTThread *self);
+void			xt_thread_wait_exit(struct XTThread *self);
+
+void			xt_xn_init_db(struct XTThread *self, struct XTDatabase *db);
+void			xt_xn_exit_db(struct XTThread *self, struct XTDatabase *db);
+void			xt_start_sweeper(struct XTThread *self, struct XTDatabase *db);	/* Creates and runs the "SW-..." daemon thread. */
+void			xt_wait_for_sweeper(struct XTThread *self, struct XTDatabase *db, int abort_time);	/* abort_time: seconds, 0 = wait without limit. */
+void			xt_stop_sweeper(struct XTThread *self, struct XTDatabase *db);	/* Terminates the sweeper and waits for the thread to exit. */
+
+void			xt_xn_init_thread(struct XTThread *self, int what_for);
+void			xt_xn_exit_thread(struct XTThread *self);
+void			xt_wakeup_sweeper(struct XTDatabase *db);	/* Broadcasts db_sw_cond if the sweeper is idle; takes no lock. */
+
+xtBool			xt_xn_begin(struct XTThread *self);
+xtBool			xt_xn_commit(struct XTThread *self);
+xtBool			xt_xn_rollback(struct XTThread *self);
+xtBool			xt_xn_log_tab_id(struct XTThread *self, xtTableID tab_id);
+int				xt_xn_status(struct XTOpenTable *ot, xtXactID xn_id, xtRecordID rec_id);
+xtBool			xt_xn_wait_for_xact(struct XTThread *self, XTXactWaitPtr xw, struct XTLockWait *lw);
+void			xt_xn_wakeup_waiting_threads(struct XTThread *thread);
+void			xt_xn_wakeup_thread_list(struct XTThread *thread);
+void			xt_xn_wakeup_thread(xtThreadID thd_id);
+xtXactID		xt_xn_get_curr_id(struct XTDatabase *db);
+xtWord8			xt_xn_bytes_to_sweep(struct XTDatabase *db, struct XTThread *thread);
+
+int				xt_xn_xa_compare(struct XTThread *self, register const void *thunk, register const void *a, register const void *b);
+xtBool			xt_xn_prepare(int len, xtWord1 *xa_data, struct XTThread *thread);
+xtBool			xt_xn_store_xa_data(struct XTDatabase *db, xtXactID xn_id, int len, xtWord1 *xa_data, struct XTThread *thread);
+void			xt_xn_delete_xa_data_by_xact(struct XTDatabase *db, xtXactID xact_id, struct XTThread *thread);
+void			xt_xn_delete_xa_data(struct XTDatabase *db, XTXactPreparePtr xap, xtBool unlock, struct XTThread *thread);
+XTXactPreparePtr	xt_xn_find_xa_data(struct XTDatabase *db, int len, xtWord1 *xa_data, xtBool lock, struct XTThread *thread);
+XTXactPreparePtr	xt_xn_enum_xa_data(struct XTDatabase *db, XTXactEnumXAPtr exa);
+
+XTXactDataPtr	xt_xn_add_old_xact(struct XTDatabase *db, xtXactID xn_id, struct XTThread *thread);
+XTXactDataPtr	xt_xn_get_xact(struct XTDatabase *db, xtXactID xn_id, struct XTThread *thread);	/* NULL if no such transaction is in memory (the sweeper then skips the ID). */
+xtBool			xt_xn_delete_xact(struct XTDatabase *db, xtXactID xn_id, struct XTThread *thread);
+
+inline xtBool	xt_xn_is_before(register xtXactID now, register xtXactID then)
+{
+	/* Wrap-around aware "now < then": when the two IDs are more than
+	 * 0xFFFFFFFF/2 apart, the numerically smaller one is the later one.
+	 */
+	const xtXactID	half_range = (xtXactID) 0xFFFFFFFF/2;
+
+	if (now < then)
+		return (then - now) > half_range ? FALSE : TRUE;
+	return (now - then) > half_range ? TRUE : FALSE;
+}
+
+#endif
diff --git a/storage/pbxt/src/xactlog_xt.cc b/storage/pbxt/src/xactlog_xt.cc
new file mode 100644
index 00000000000..addc14ff5d8
--- /dev/null
+++ b/storage/pbxt/src/xactlog_xt.cc
@@ -0,0 +1,3027 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-10-30	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * The transaction log contains all operations on the data handle
+ * and row pointer files of a table.
+ *
+ * The transaction log does not contain operations on index data.
+ */
+
+#include "xt_config.h"
+
+#ifdef DRIZZLED
+#include <bitset>
+#endif
+
+#include <signal.h>
+
+#include "xactlog_xt.h"
+#include "database_xt.h"
+#include "util_xt.h"
+#include "strutil_xt.h"
+#include "filesys_xt.h"
+#include "myxt_xt.h"
+#include "trace_xt.h"
+
+#ifdef DEBUG
+//#define PRINT_TABLE_MODIFICATIONS
+//#define TRACE_WRITER_ACTIVITY
+#endif
+#ifndef XT_WIN
+#ifndef XT_MAC
+#define PREWRITE_LOG_COMPLETELY
+#endif
+#endif
+
+static void xlog_wr_log_written(XTDatabaseHPtr db);
+
+/*
+ * -----------------------------------------------------------------------
+ * T R A N S A C T I O N   L O G   C A C H E
+ */
+
+static XTXLogCacheRec	xt_xlog_cache;
+
+/*
+ * Initialize the global transaction log cache (xt_xlog_cache).
+ *
+ * self        The calling thread; passed to the allocation and lock
+ *             initialization routines.
+ * cache_size  Number of bytes to spend on cache blocks. The per-segment
+ *             hash tables are allocated in addition to this amount.
+ *
+ * If anything inside the try_ section throws, xt_xlog_exit() is called
+ * to release whatever was set up, and the exception is re-thrown.
+ */
+xtPublic void xt_xlog_init(XTThreadPtr self, size_t cache_size)
+{
+	XTXLogBlockPtr	block;
+
+	/*
+	 * This is required to ensure that the block
+	 * works!
+	 *
+	 * NOTE(review): no statement follows this comment; whatever check
+	 * it referred to is not present here.
+	 */
+
+	/* Determine the number of blocks that will fit into the given memory: */
+	/*
+	xt_xlog_cache.xlc_hash_size = (cache_size / (XLC_SEGMENT_COUNT * sizeof(XTXLogBlockPtr) + sizeof(XTXLogBlockRec))) / (XLC_SEGMENT_COUNT >> 1);
+	xt_xlog_cache.xlc_block_count = (cache_size - (XLC_SEGMENT_COUNT * xt_xlog_cache.xlc_hash_size * sizeof(XTXLogBlockPtr))) / sizeof(XTXLogBlockRec);
+	*/
+	/* Do not count the size of the cache directory towards the cache size: */
+	xt_xlog_cache.xlc_block_count = cache_size / sizeof(XTXLogBlockRec);
+	/* The upper limit is 3/4 of the data bytes held by all the blocks: */
+	xt_xlog_cache.xlc_upper_limit = ((xtWord8) xt_xlog_cache.xlc_block_count * (xtWord8) XT_XLC_BLOCK_SIZE * (xtWord8) 3) / (xtWord8) 4;
+	xt_xlog_cache.xlc_hash_size = xt_xlog_cache.xlc_block_count / (XLC_SEGMENT_COUNT >> 1);
+	/* The hash size is used as a modulus, so it may never be zero: */
+	if (!xt_xlog_cache.xlc_hash_size)
+		xt_xlog_cache.xlc_hash_size = 1;
+
+	try_(a) {
+		/* Each segment has its own zeroed hash table, mutex and condition: */
+		for (u_int i=0; i<XLC_SEGMENT_COUNT; i++) {
+			xt_xlog_cache.xlc_segment[i].lcs_hash_table = (XTXLogBlockPtr *) xt_calloc(self, xt_xlog_cache.xlc_hash_size * sizeof(XTXLogBlockPtr));
+			xt_init_mutex_with_autoname(self, &xt_xlog_cache.xlc_segment[i].lcs_lock);
+			xt_init_cond(self, &xt_xlog_cache.xlc_segment[i].lcs_cond);
+		}
+
+		/* All cache blocks live in one contiguous array: */
+		block = (XTXLogBlockPtr) xt_malloc(self, xt_xlog_cache.xlc_block_count * sizeof(XTXLogBlockRec));
+		xt_xlog_cache.xlc_blocks = block; 
+		xt_xlog_cache.xlc_blocks_end = (XTXLogBlockPtr) ((char *) block + (xt_xlog_cache.xlc_block_count * sizeof(XTXLogBlockRec))); 
+		/* Blocks are handed out starting from the first one: */
+		xt_xlog_cache.xlc_next_to_free = block; 
+		xt_init_mutex_with_autoname(self, &xt_xlog_cache.xlc_lock);
+		xt_init_cond(self, &xt_xlog_cache.xlc_cond);
+
+		/* The array comes from xt_malloc(), so set the fields that are read before a block is used: */
+		for (u_int i=0; i<xt_xlog_cache.xlc_block_count; i++) {
+			block->xlb_address = 0;
+			block->xlb_log_id = 0;
+			block->xlb_state = XLC_BLOCK_FREE;
+			block++;
+		}
+		xt_xlog_cache.xlc_free_count = xt_xlog_cache.xlc_block_count;
+	}
+	catch_(a) {
+		xt_xlog_exit(self);
+		throw_();
+	}
+	cont_(a);
+}
+
+/*
+ * Tear down the global transaction log cache.
+ *
+ * A segment's lock and condition are only released when its hash table
+ * pointer is set, and the cache-wide lock and condition only when the
+ * block array pointer is set. The whole cache structure is zeroed at
+ * the end.
+ */
+xtPublic void xt_xlog_exit(XTThreadPtr self)
+{
+	XTXLogCacheSegPtr	seg;
+
+	for (u_int idx = 0; idx < XLC_SEGMENT_COUNT; idx++) {
+		seg = &xt_xlog_cache.xlc_segment[idx];
+		if (!seg->lcs_hash_table)
+			continue;
+
+		xt_free(self, seg->lcs_hash_table);
+		seg->lcs_hash_table = NULL;
+		xt_free_mutex(&seg->lcs_lock);
+		xt_free_cond(&seg->lcs_cond);
+	}
+
+	if (xt_xlog_cache.xlc_blocks != NULL) {
+		xt_free(self, xt_xlog_cache.xlc_blocks);
+		xt_xlog_cache.xlc_blocks = NULL;
+		xt_free_mutex(&xt_xlog_cache.xlc_lock);
+		xt_free_cond(&xt_xlog_cache.xlc_cond);
+	}
+
+	/* Leave the cache in its all-zero state: */
+	memset(&xt_xlog_cache, 0, sizeof(xt_xlog_cache));
+}
+
+/*
+ * Return the number of bytes taken by cache blocks that are not free,
+ * i.e. (block count - free count) * sizeof(XTXLogBlockRec).
+ */
+xtPublic xtInt8 xt_xlog_get_usage()
+{
+	return (xtInt8) (xt_xlog_cache.xlc_block_count - xt_xlog_cache.xlc_free_count) * sizeof(XTXLogBlockRec);
+}
+
+/*
+ * Return the number of bytes taken by all cache blocks,
+ * i.e. block count * sizeof(XTXLogBlockRec).
+ */
+xtPublic xtInt8 xt_xlog_get_size()
+{
+	return (xtInt8) xt_xlog_cache.xlc_block_count * sizeof(XTXLogBlockRec);
+}
+
+/*
+ * Scan the system directory below db->db_main_path for files whose
+ * name starts with "xlog" and return the smallest non-zero ID that
+ * xt_file_name_to_id() extracts from those names.
+ *
+ * Returns 1 if the directory does not exist or no such file is found.
+ */
+xtPublic xtLogID xt_xlog_get_min_log(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char			sys_path[PATH_MAX];
+	XTOpenDirPtr	od;
+	xtLogID			lowest = 0;
+
+	xt_strcpy(PATH_MAX, sys_path, db->db_main_path);
+	xt_add_system_dir(PATH_MAX, sys_path);
+	if (xt_fs_exists(sys_path)) {
+		pushsr_(od, xt_dir_close, xt_dir_open(self, sys_path, NULL));
+		while (xt_dir_next(self, od)) {
+			char	*name = xt_dir_name(self, od);
+			xtLogID	id;
+
+			if (!xt_starts_with(name, "xlog"))
+				continue;
+
+			/* An ID of zero is not counted: */
+			id = (xtLogID) xt_file_name_to_id(name);
+			if (id && (!lowest || id < lowest))
+				lowest = id;
+		}
+		freer_(); // xt_dir_close(od)
+	}
+	return lowest ? lowest : 1;
+}
+
+/*
+ * Delete every file ending in ".xt" from the database's system
+ * directory, and then try to remove the directory itself.
+ *
+ * The index logs and the transaction log of 'db' are closed first.
+ * If the system directory does not exist, nothing further is done.
+ * A failure to remove the directory is logged and cleared rather than
+ * propagated.
+ */
+xtPublic void xt_xlog_delete_logs(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char			path[PATH_MAX];
+	XTOpenDirPtr	od;
+	char			*file;
+
+	/* Close all the index logs before we delete them: */
+	db->db_indlogs.ilp_close(self, TRUE);
+
+	/* Close the transaction logs too: */
+	db->db_xlog.xlog_close(self);
+
+	xt_strcpy(PATH_MAX, path, db->db_main_path);
+	xt_add_system_dir(PATH_MAX, path);
+	if (!xt_fs_exists(path))
+		return;
+	pushsr_(od, xt_dir_close, xt_dir_open(self, path, NULL));
+	while (xt_dir_next(self, od)) {
+		file = xt_dir_name(self, od);
+		if (xt_ends_with(file, ".xt")) {
+			/* Temporarily extend 'path' to the file, delete it, then restore the directory path: */
+			xt_add_dir_char(PATH_MAX, path);
+			xt_strcat(PATH_MAX, path, file);
+			xt_fs_delete(self, path);
+			xt_remove_last_name_of_path(path);
+		}
+	}
+	freer_(); // xt_dir_close(od)
+
+	/* I no longer attach the condition: !db->db_multi_path
+	 * to removing this directory. This is because
+	 * the pbxt directory must now be removed explicitly
+	 * by drop database, or by deleting all the PBXT
+	 * system tables.
+	 */
+	/* NOTE(review): xt_fs_rmdir() is given NULL instead of 'self', yet the
+	 * failure is logged and cleared on 'self' - confirm this is intended.
+	 */
+	if (!xt_fs_rmdir(NULL, path))
+		xt_log_and_clear_exception(self);
+}
+
+#ifdef DEBUG_CHECK_CACHE
+/*
+ * Debug consistency check of the cache's LRU list and free list.
+ *
+ * Asserts that the LRU chain is correctly back-linked, ends at the MRU
+ * block and contains no free block, that used + free equals the total
+ * block count, and that the free list holds exactly xlc_free_count
+ * blocks, all of them in the free state.
+ */
+static void xt_xlog_check_cache(void)
+{
+	XTXLogBlockPtr	block, pblock = NULL;
+	u_int			used_count = 0;
+	u_int			free_count = 0;
+
+	// Check the LRU list:
+	for (block = xt_xlog_cache.xlc_lru_block; block; block = block->xlb_mr_used) {
+		used_count++;
+		ASSERT_NS(block->xlb_state != XLC_BLOCK_FREE);
+		ASSERT_NS(block->xlb_lr_used == pblock);
+		pblock = block;
+	}
+	ASSERT_NS(xt_xlog_cache.xlc_mru_block == pblock);
+	ASSERT_NS(xt_xlog_cache.xlc_free_count + used_count == xt_xlog_cache.xlc_block_count);
+
+	// Check the free list:
+	for (block = xt_xlog_cache.xlc_free_list; block; block = block->xlb_next) {
+		free_count++;
+		ASSERT_NS(block->xlb_state == XLC_BLOCK_FREE);
+	}
+	ASSERT_NS(xt_xlog_cache.xlc_free_count == free_count);
+}
+#endif
+
+#ifdef FOR_DEBUG
+/*
+ * Debug check: assert that 'block' is NOT on the LRU list, and that
+ * the list is correctly back-linked and ends at the MRU block.
+ */
+static void xlog_check_lru_list(XTXLogBlockPtr block)
+{
+	XTXLogBlockPtr list_block, plist_block = NULL;
+
+	for (list_block = xt_xlog_cache.xlc_lru_block; list_block; list_block = list_block->xlb_mr_used) {
+		ASSERT_NS(block != list_block);
+		ASSERT_NS(list_block->xlb_lr_used == plist_block);
+		plist_block = list_block;
+	}
+	ASSERT_NS(xt_xlog_cache.xlc_mru_block == plist_block);
+}
+#endif
+
+/*
+ * Log cache blocks are used and freed on a round-robin basis.
+ * In addition, only data read by restart, and data transferred
+ * from the transaction log buffer, are stored in the log cache.
+ *
+ * This ensures that the log cache contains the most
+ * recently written log data.
+ *
+ * If the sweeper gets behind due to a long running transaction
+ * then it falls out of the log cache, and must read from
+ * the log files directly.
+ *
+ * This data read is no longer cached as it was previously.
+ * This has the advantage that it does not disturb the writer
+ * thread which would otherwise hit the cache.
+ *
+ * If transactions are not too long, it should be possible
+ * to keep the sweeper in the log cache.
+ */
+/*
+ * Make sure the cache block 'to_free' is in the free state.
+ *
+ * If the block is in use, it is unlinked from its segment's hash chain
+ * and marked XLC_BLOCK_FREE. If the block is currently being read, the
+ * function waits on the segment condition and tries again.
+ *
+ * Returns OK once the block is free (whether it was freed here or was
+ * already free), or FAILED if the timed wait on the condition fails.
+ */
+static xtBool xlog_free_block(XTXLogBlockPtr to_free)
+{
+	XTXLogBlockPtr		block, pblock;
+	xtLogID				log_id;
+	off_t				address;
+	XTXLogCacheSegPtr	seg;
+	u_int				hash_idx;
+
+	retry:
+	/* Unlocked snapshot of the block's identity; it is re-checked below once the segment is locked: */
+	log_id = to_free->xlb_log_id;
+	address = to_free->xlb_address;
+
+	/* Same segment and hash slot calculation as xlog_fetch_block(): */
+	seg = &xt_xlog_cache.xlc_segment[((u_int) address >> XT_XLC_BLOCK_SHIFTS) & XLC_SEGMENT_MASK];
+	hash_idx = (((u_int) (address >> (XT_XLC_SEGMENT_SHIFTS + XT_XLC_BLOCK_SHIFTS))) ^ (log_id << 16)) % xt_xlog_cache.xlc_hash_size;
+
+	xt_lock_mutex_ns(&seg->lcs_lock);
+	if (to_free->xlb_state == XLC_BLOCK_FREE)
+		goto done_ok;
+	/* The block was re-used for another address while we were not holding the lock, so start over: */
+	if (to_free->xlb_log_id != log_id || to_free->xlb_address != address) {
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+		goto retry;
+	}
+
+	/* Search the hash chain, remembering the predecessor for the unlink below: */
+	pblock = NULL;
+	block = seg->lcs_hash_table[hash_idx];
+	while (block) {
+		if (block->xlb_address == address && block->xlb_log_id == log_id) {
+			ASSERT_NS(block == to_free);
+			ASSERT_NS(block->xlb_state != XLC_BLOCK_FREE);
+			
+			/* Wait if the block is being read: */
+			if (block->xlb_state == XLC_BLOCK_READING) {
+				/* Wait for the block to be read, then try again. */
+				if (!xt_timed_wait_cond_ns(&seg->lcs_cond, &seg->lcs_lock, 100))
+					goto failed;
+				xt_unlock_mutex_ns(&seg->lcs_lock);
+				goto retry;
+			}
+			
+			goto free_the_block;
+		}
+		pblock = block;
+		block = block->xlb_next;
+	}
+
+	/* We did not find the block, someone else freed it... */
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+	goto retry;
+
+	free_the_block:
+	ASSERT_NS(block->xlb_state == XLC_BLOCK_CLEAN);
+
+	/* Remove from the hash table: */
+	if (pblock)
+		pblock->xlb_next = block->xlb_next;
+	else
+		seg->lcs_hash_table[hash_idx] = block->xlb_next;
+
+	/* Free the block: */
+	/* NOTE(review): the free count is changed here under the segment lock,
+	 * while xlog_fetch_block() decrements it under xlc_lock - confirm that
+	 * this is acceptable.
+	 */
+	xt_xlog_cache.xlc_free_count++;
+	block->xlb_state = XLC_BLOCK_FREE;
+
+	done_ok:
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+	return OK;
+	
+	failed:
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+	return FAILED;
+}
+
+#define XT_FETCH_READ		0
+#define XT_FETCH_BLANK		1
+#define XT_FETCH_TEST		2
+
+/*
+ * Find, or create, the cache block for 'address' of log 'log_id'.
+ *
+ * ret_block   Receives the block, or NULL on a XT_FETCH_TEST miss.
+ * file        Log file to read from (only used by XT_FETCH_READ).
+ * log_id      ID of the log the block belongs to.
+ * address     Block address in the log; callers pass a value aligned
+ *             to the cache block size.
+ * ret_seg     Receives the cache segment of the block, or NULL on a
+ *             XT_FETCH_TEST miss.
+ * fetch_type  XT_FETCH_READ:  on a miss, read the block from 'file'.
+ *             XT_FETCH_BLANK: on a miss, provide a zero-filled block.
+ *             XT_FETCH_TEST:  on a miss, return no block at all.
+ * thread      The calling thread (statistics and I/O context).
+ *
+ * Returns OK or FAILED. When OK is returned together with a block, the
+ * segment lock ((*ret_seg)->lcs_lock) is HELD and the caller must
+ * release it. On a XT_FETCH_TEST miss, and on FAILED, no lock is held.
+ *
+ * If the read from 'file' fails, the block that was reserved for the
+ * data is taken out of the hash table and returned to the free state,
+ * and all waiters are woken. The block used to stay in the hash table
+ * in the XLC_BLOCK_READING state, so that every later lookup of the
+ * same address (and xlog_free_block() on that block) kept waiting for
+ * a read that would never complete.
+ */
+static xtBool xlog_fetch_block(XTXLogBlockPtr *ret_block, XTOpenFilePtr file, xtLogID log_id, off_t address, XTXLogCacheSegPtr *ret_seg, int fetch_type, XTThreadPtr thread)
+{
+	register XTXLogBlockPtr		block;
+	register XTXLogCacheSegPtr	seg;
+	register u_int				hash_idx;
+	register XTXLogCacheRec		*dcg = &xt_xlog_cache;
+	size_t						red_size;
+
+	/* Make sure we have a free block ready (to avoid unlock below): */
+	if (fetch_type != XT_FETCH_TEST && dcg->xlc_next_to_free->xlb_state != XLC_BLOCK_FREE) {
+		if (!xlog_free_block(dcg->xlc_next_to_free))
+			return FAILED;
+	}
+
+	/* Same segment and hash slot calculation as xlog_free_block(): */
+	seg = &dcg->xlc_segment[((u_int) address >> XT_XLC_BLOCK_SHIFTS) & XLC_SEGMENT_MASK];
+	hash_idx = (((u_int) (address >> (XT_XLC_SEGMENT_SHIFTS + XT_XLC_BLOCK_SHIFTS))) ^ (log_id << 16)) % dcg->xlc_hash_size;
+
+	xt_lock_mutex_ns(&seg->lcs_lock);
+	retry:
+	block = seg->lcs_hash_table[hash_idx];
+	while (block) {
+		if (block->xlb_address == address && block->xlb_log_id == log_id) {
+			ASSERT_NS(block->xlb_state != XLC_BLOCK_FREE);
+
+			/*
+			 * Wait if the block is being read.
+			 */
+			if (block->xlb_state == XLC_BLOCK_READING) {
+				if (!xt_timed_wait_cond_ns(&seg->lcs_cond, &seg->lcs_lock, 100)) {
+					xt_unlock_mutex_ns(&seg->lcs_lock);
+					return FAILED;
+				}
+				goto retry;
+			}
+
+			/* Cache hit: return with the segment lock held. */
+			*ret_seg = seg;
+			*ret_block = block;
+			thread->st_statistics.st_xlog_cache_hit++;
+			return OK;
+		}
+		block = block->xlb_next;
+	}
+
+	if (fetch_type == XT_FETCH_TEST) {
+		/* The caller only wanted to know if the block is cached: */
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+		*ret_seg = NULL;
+		*ret_block = NULL;
+		thread->st_statistics.st_xlog_cache_miss++;
+		return OK;
+	}
+
+	/* Block not found: */
+	get_free_block:
+	if (dcg->xlc_next_to_free->xlb_state != XLC_BLOCK_FREE) {
+		/* The segment lock is released here because xlog_free_block()
+		 * takes the lock of the segment of the block being freed.
+		 */
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+		if (!xlog_free_block(dcg->xlc_next_to_free))
+			return FAILED;
+		xt_lock_mutex_ns(&seg->lcs_lock);
+	}
+
+	xt_lock_mutex_ns(&dcg->xlc_lock);
+	block = dcg->xlc_next_to_free;
+	if (block->xlb_state != XLC_BLOCK_FREE) {
+		/* Some other thread took the block first, try the next one: */
+		xt_unlock_mutex_ns(&dcg->xlc_lock);
+		goto get_free_block;
+	}
+	/* Advance the round-robin pointer, wrapping at the end of the block array: */
+	dcg->xlc_next_to_free++;
+	if (dcg->xlc_next_to_free == dcg->xlc_blocks_end)
+		dcg->xlc_next_to_free = dcg->xlc_blocks;
+	dcg->xlc_free_count--;
+
+	if (fetch_type == XT_FETCH_READ) {
+		block->xlb_address = address;
+		block->xlb_log_id = log_id;
+		block->xlb_state = XLC_BLOCK_READING;
+
+		xt_unlock_mutex_ns(&dcg->xlc_lock);
+
+		/* Add the block to the hash table: */
+		block->xlb_next = seg->lcs_hash_table[hash_idx];
+		seg->lcs_hash_table[hash_idx] = block;
+
+		/* Read the block into memory: */
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+
+		if (!xt_pread_file(file, address, XT_XLC_BLOCK_SIZE, 0, block->xlb_data, &red_size, &thread->st_statistics.st_xlog, thread)) {
+			XTXLogBlockPtr	cur, prev;
+
+			/* The read failed: undo the reservation, so that the block
+			 * is not left in the READING state forever.
+			 */
+			xt_lock_mutex_ns(&seg->lcs_lock);
+			prev = NULL;
+			for (cur = seg->lcs_hash_table[hash_idx]; cur && cur != block; cur = cur->xlb_next)
+				prev = cur;
+			if (cur) {
+				if (prev)
+					prev->xlb_next = block->xlb_next;
+				else
+					seg->lcs_hash_table[hash_idx] = block->xlb_next;
+			}
+			/* Updated under the segment lock, as xlog_free_block() does: */
+			dcg->xlc_free_count++;
+			block->xlb_state = XLC_BLOCK_FREE;
+			/* Threads waiting for this read must look again: */
+			xt_cond_wakeall(&seg->lcs_cond);
+			xt_unlock_mutex_ns(&seg->lcs_lock);
+			return FAILED;
+		}
+		/* A short read leaves the rest of the block zero-filled: */
+		memset(block->xlb_data + red_size, 0, XT_XLC_BLOCK_SIZE - red_size);
+		thread->st_statistics.st_xlog_cache_miss++;
+
+		xt_lock_mutex_ns(&seg->lcs_lock);
+		block->xlb_state = XLC_BLOCK_CLEAN;
+		xt_cond_wakeall(&seg->lcs_cond);
+	}
+	else {
+		/* XT_FETCH_BLANK: the caller will fill the block itself. */
+		block->xlb_address = address;
+		block->xlb_log_id = log_id;
+		block->xlb_state = XLC_BLOCK_CLEAN;
+		memset(block->xlb_data, 0, XT_XLC_BLOCK_SIZE);
+
+		xt_unlock_mutex_ns(&dcg->xlc_lock);
+
+		/* Add the block to the hash table: */
+		block->xlb_next = seg->lcs_hash_table[hash_idx];
+		seg->lcs_hash_table[hash_idx] = block;
+	}
+
+	/* Return with the segment lock held: */
+	*ret_seg = seg;
+	*ret_block = block;
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	return OK;
+}
+
+/*
+ * Copy 'size' bytes of 'data', which belong at 'offset' of log
+ * 'log_id', into the log cache, one cache block at a time.
+ *
+ * Only the first block is read from 'file' before being modified, and
+ * only if 'offset' is not zero; all following blocks start out blank.
+ *
+ * Returns OK, or FAILED if a cache block could not be fetched.
+ */
+static xtBool xlog_transfer_to_cache(XTOpenFilePtr file, xtLogID log_id, off_t offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	XTXLogBlockPtr		block;
+	XTXLogCacheSegPtr	seg;
+	off_t				address = offset & ~XT_XLC_BLOCK_MASK;
+	size_t				boff = (size_t) (offset - address);
+	/* We have to read the first block, if we are
+	 * not at the beginning of the file:
+	 */
+	int					fetch_type = offset ? XT_FETCH_READ : XT_FETCH_BLANK;
+
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	while (size > 0) {
+		/* Copy up to the end of this block, or the end of the data: */
+		size_t chunk = XT_XLC_BLOCK_SIZE - boff;
+
+		if (chunk > size)
+			chunk = size;
+
+		if (!xlog_fetch_block(&block, file, log_id, address, &seg, fetch_type, thread))
+			return FAILED;
+		ASSERT_NS(block && block->xlb_state == XLC_BLOCK_CLEAN);
+
+		/* xlog_fetch_block() returned with the segment locked: */
+		memcpy(block->xlb_data + boff, data, chunk);
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+
+		data += chunk;
+		size -= chunk;
+
+		/* Following block need not be read
+		 * because we always transfer to the
+		 * end of the file!
+		 */
+		fetch_type = XT_FETCH_BLANK;
+		address += XT_XLC_BLOCK_SIZE;
+		boff = 0;
+	}
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	return OK;
+}
+
+/*
+ * Read 'size' bytes at 'offset' of log 'log_id' into 'data'.
+ *
+ * Data is taken from the log cache block by block. With 'load_cache'
+ * set, missing blocks are read into the cache. Without it, the first
+ * block that is not cached ends the cache walk: everything that is
+ * still outstanding is then read straight from 'file', and whatever
+ * the file could not supply is zero-filled.
+ *
+ * Returns OK or FAILED.
+ */
+static xtBool xt_xlog_read(XTOpenFilePtr file, xtLogID log_id, off_t offset, size_t size, xtWord1 *data, xtBool load_cache, XTThreadPtr thread)
+{
+	XTXLogBlockPtr		block;
+	XTXLogCacheSegPtr	seg;
+	off_t				address = offset & ~XT_XLC_BLOCK_MASK;
+	size_t				boff = (size_t) (offset - address);
+	int					fetch_type = load_cache ? XT_FETCH_READ : XT_FETCH_TEST;
+
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	while (size > 0) {
+		/* Copy up to the end of this block, or the end of the request: */
+		size_t chunk = XT_XLC_BLOCK_SIZE - boff;
+
+		if (chunk > size)
+			chunk = size;
+
+		if (!xlog_fetch_block(&block, file, log_id, address, &seg, fetch_type, thread))
+			return FAILED;
+
+		if (!block) {
+			/* Not cached (no lock is held in this case): read the rest from the file. */
+			size_t red_size;
+
+			if (!xt_pread_file(file, address + boff, size, 0, data, &red_size, &thread->st_statistics.st_xlog, thread))
+				return FAILED;
+			memset(data + red_size, 0, size - red_size);
+			return OK;
+		}
+
+		/* xlog_fetch_block() returned with the segment locked: */
+		memcpy(data, block->xlb_data + boff, chunk);
+		xt_unlock_mutex_ns(&seg->lcs_lock);
+
+		data += chunk;
+		size -= chunk;
+		address += XT_XLC_BLOCK_SIZE;
+		boff = 0;
+	}
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	return OK;
+}
+
+/*
+ * Write 'size' bytes of 'data' at 'offset' of the log file, and then
+ * copy the same data into the log cache.
+ *
+ * Returns FAILED if the file write fails (the cache is then left
+ * untouched), otherwise the result of the transfer to the cache.
+ */
+static xtBool xt_xlog_write(XTOpenFilePtr file, xtLogID log_id, off_t offset, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	xtBool written = xt_pwrite_file(file, offset, size, data, &thread->st_statistics.st_xlog, thread);
+
+	return written ? xlog_transfer_to_cache(file, log_id, offset, size, data, thread) : FAILED;
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * D A T A B A S E   T R A N S A C T I O N   L O G S
+ */
+
+/*
+ * Set up the transaction log object of database 'db'.
+ *
+ * self                     The calling thread.
+ * db                       The database this log belongs to.
+ * inp_log_file_size        Log file size threshold; it is passed
+ *                          through xt_align_offset(..., 1024).
+ * transaction_buffer_size  Usable size of each of the two log buffers.
+ * log_count                Number of log files; values outside the
+ *                          range 1 to 1000000 are clamped to it.
+ *
+ * The object is zeroed first. If anything inside the try_ section
+ * throws, xlog_exit() is called and the exception is re-thrown.
+ */
+void XTDatabaseLog::xlog_setup(XTThreadPtr self, XTDatabaseHPtr db, off_t inp_log_file_size, size_t transaction_buffer_size, int log_count)
+{
+	/* NOTE(review): presumably volatile so that the value survives the
+	 * try_/catch_ macros - confirm.
+	 */
+	volatile off_t	log_file_size = inp_log_file_size;
+	size_t			log_size;
+
+	try_(a) {
+		memset(this, 0, sizeof(XTDatabaseLogRec));
+
+		/* Clamp the number of logs to the range 1 to 1000000: */
+		if (log_count <= 1)
+			log_count = 1;
+		else if (log_count > 1000000)
+			log_count = 1000000;
+
+		xl_db = db;
+
+		xl_log_file_threshold = xt_align_offset(log_file_size, 1024);
+		xl_log_file_count = log_count;
+		xl_size_of_buffers = transaction_buffer_size;
+	
+		xt_init_mutex_with_autoname(self, &xl_write_lock);
+		xt_init_cond(self, &xl_write_cond);
+#ifdef XT_XLOG_WAIT_SPINS
+		xt_writing = 0;
+		xt_waiting = 0;
+#else
+		xt_writing = FALSE;
+#endif
+		xl_log_id = 0;
+		xl_log_file = 0;
+	
+		xt_spinlock_init_with_autoname(self, &xl_buffer_lock);
+
+		/* Note that we allocate a little bit more for each buffer
+		 * in order to make sure that we can write a trailing record
+		 * to the log buffer.
+		 */
+		log_size = transaction_buffer_size + sizeof(XTXactNewLogEntryDRec);
+		
+		/* Round the buffer size up to a multiple of 512: */
+		if (log_size % 512)
+			log_size += (512 - (log_size % 512));
+
+		/* The write buffer starts out empty and marked as written: */
+		xl_write_log_id = 0;
+		xl_write_log_offset = 0;
+		xl_write_buf_pos = 0;
+		xl_write_buf_pos_start = 0;
+		xl_write_buffer = (xtWord1 *) xt_malloc(self, log_size);
+		xl_write_done = TRUE;
+
+		/* The append buffer has the same size as the write buffer: */
+		xl_append_log_id = 0;
+		xl_append_log_offset = 0;
+		xl_append_buf_pos = 0;
+		xl_append_buf_pos_start = 0;
+		xl_append_buffer = (xtWord1 *) xt_malloc(self, log_size);
+
+		/* Starting value for the duration of the last flush; xlog_append()
+		 * uses it as the time to wait for other transactions before flushing.
+		 */
+		xl_last_flush_time = 10;
+		xl_flush_log_id = 0;
+		xl_flush_log_offset = 0;
+	}
+	catch_(a) {
+		/* NOTE(review): xlog_exit() frees the locks unconditionally, even
+		 * if the exception occurred before they were initialized - confirm
+		 * that freeing a zeroed lock is harmless.
+		 */
+		xlog_exit(self);
+		throw_();
+	}
+	cont_(a);
+}
+
+/*
+ * Set the write, append and flush positions of the log to
+ * (log_id, log_offset), and record 'max_log_id'.
+ *
+ * At offset zero a new log file header is built in the append buffer.
+ * Otherwise the log file is opened and the part of the current
+ * 512-byte block that precedes 'log_offset' is loaded into the append
+ * buffer, so that the buffer starts on a 512-byte boundary.
+ *
+ * Returns OK, or FAILED if the log could not be opened or read. On
+ * failure the flush position is not updated.
+ */
+xtBool XTDatabaseLog::xlog_set_write_offset(xtLogID log_id, xtLogOffset log_offset, xtLogID max_log_id, XTThreadPtr thread)
+{
+	xl_max_log_id = max_log_id;
+
+	/* Nothing is waiting to be written: */
+	xl_write_log_id = log_id;
+	xl_write_log_offset = log_offset;
+	xl_write_buf_pos = 0;
+	xl_write_buf_pos_start = 0;
+	xl_write_done = TRUE;
+
+	xl_append_log_id = log_id;
+	xl_append_log_offset = log_offset;
+	if (log_offset != 0) {
+		/* Start the log buffer at a block boundary: */
+		size_t partial = (size_t) (log_offset % 512);
+
+		xl_append_buf_pos = partial;
+		xl_append_buf_pos_start = partial;
+		xl_append_log_offset = log_offset - partial;
+
+		/* Load the bytes between the block boundary and the log offset: */
+		if (!xlog_open_log(log_id, log_offset, thread) ||
+			!xt_pread_file(xl_log_file, xl_append_log_offset, partial, partial, xl_append_buffer, NULL, &thread->st_statistics.st_xlog, thread))
+			return FAILED;
+	}
+	else {
+		/* A new log: it begins with the log file header. */
+		XTXactLogHeaderDPtr header = (XTXactLogHeaderDPtr) xl_append_buffer;
+
+		memset(header, 0, sizeof(XTXactLogHeaderDRec));
+		header->xh_status_1 = XT_LOG_ENT_HEADER;
+		header->xh_checksum_1 = XT_CHECKSUM_1(log_id);
+		XT_SET_DISK_4(header->xh_size_4, sizeof(XTXactLogHeaderDRec));
+		XT_SET_DISK_4(header->xh_log_id_4, log_id);
+		XT_SET_DISK_2(header->xh_version_2, XT_LOG_VERSION_NO);
+		XT_SET_DISK_4(header->xh_magic_4, XT_LOG_FILE_MAGIC);
+
+		xl_append_buf_pos = sizeof(XTXactLogHeaderDRec);
+		xl_append_buf_pos_start = 0;
+	}
+
+	xl_flush_log_id = log_id;
+	xl_flush_log_offset = log_offset;
+	return OK;
+}
+
+/*
+ * Close the currently open log file, if there is one.
+ */
+void XTDatabaseLog::xlog_close(XTThreadPtr self)
+{
+	if (!xl_log_file)
+		return;
+
+	xt_close_file(self, xl_log_file);
+	xl_log_file = NULL;
+}
+
+/*
+ * Release the resources of the log object: the buffer spinlock, the
+ * write mutex and condition, the open log file, and the write and
+ * append buffers (in that order).
+ */
+void XTDatabaseLog::xlog_exit(XTThreadPtr self)
+{
+	xtWord1	**buffers[] = { &xl_write_buffer, &xl_append_buffer };
+
+	xt_spinlock_free(self, &xl_buffer_lock);
+	xt_free_mutex(&xl_write_lock);
+	xt_free_cond(&xl_write_cond);
+	xlog_close(self);
+
+	/* Free each buffer that was allocated, and forget the pointer: */
+	for (u_int i = 0; i < 2; i++) {
+		if (*buffers[i]) {
+			xt_free(self, *buffers[i]);
+			*buffers[i] = NULL;
+		}
+	}
+}
+
+#define WR_NO_SPACE		1			/* Write because there is no space, or some other reason */
+#define WR_FLUSH		2			/* Normal commit, write and flush */
+
+/*
+ * Flush the log if anything has been appended since the last flush.
+ *
+ * Returns OK if there was nothing to flush, otherwise the result of an
+ * xlog_append() call with no data and XT_XLOG_WRITE_AND_FLUSH.
+ */
+xtBool XTDatabaseLog::xlog_flush(XTThreadPtr thread)
+{
+	return xlog_flush_pending()
+		? xlog_append(thread, 0, NULL, 0, NULL, XT_XLOG_WRITE_AND_FLUSH, NULL, NULL)
+		: OK;
+}
+
+/*
+ * Return TRUE if the end of the appended data lies beyond the flush
+ * position of the log, and FALSE otherwise.
+ *
+ * The positions are compared while holding the buffer lock.
+ */
+xtBool XTDatabaseLog::xlog_flush_pending()
+{
+	xtLogID		req_flush_log_id;
+	xtLogOffset	req_flush_log_offset;
+	xtBool		pending;
+
+	xt_lck_slock(&xl_buffer_lock);
+	req_flush_log_id = xl_append_log_id;
+	req_flush_log_offset = xl_append_log_offset + xl_append_buf_pos;
+	pending = (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) > 0) ? TRUE : FALSE;
+	xt_spinlock_unlock(&xl_buffer_lock);
+
+	return pending;
+}
+
+/*
+ * Write data to the end of the log buffer.
+ *
+ * commit is set to true if the caller also requires
+ * the log to be flushed, after writing the data.
+ *
+ * This function returns the log ID and offset of
+ * the data write position.
+ */
+xtBool XTDatabaseLog::xlog_append(XTThreadPtr thread, size_t size1, xtWord1 *data1, size_t size2, xtWord1 *data2, int flush_log_at_trx_commit, xtLogID *log_id, xtLogOffset *log_offset)
+{
+	int			write_reason = 0;
+	xtLogID		req_flush_log_id;
+	xtLogOffset	req_flush_log_offset;
+	size_t		part_size;
+	xtWord8		flush_time;
+	xtWord2		sum;
+
+	/* The first size value must be set, of the second is set! */
+	ASSERT_NS(size1 || !size2);
+
+	if (!size1) {
+		/* Just flush the buffer... */
+		xt_lck_slock(&xl_buffer_lock);
+		write_reason = flush_log_at_trx_commit == XT_XLOG_WRITE_AND_FLUSH ? WR_FLUSH : WR_NO_SPACE;
+		req_flush_log_id = xl_append_log_id;
+		req_flush_log_offset = xl_append_log_offset + xl_append_buf_pos;
+		xt_spinlock_unlock(&xl_buffer_lock);
+		goto write_log_to_file;
+	}
+	req_flush_log_id = 0;
+	req_flush_log_offset = 0;
+
+	/*
+	 * This is a dirty read, which will send us to the
+	 * best starting position:
+	 *
+	 * If there is space, now, then there is probably
+	 * still enough space, after we have locked the
+	 * buffer for writting.
+	 */
+	if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers)
+		goto copy_to_log_buffer;
+
+	/*
+	 * There is not enough space in the append buffer.
+	 * So we need to write the log, until there is space.
+	 */
+	write_reason = WR_NO_SPACE;
+
+	write_log_to_file:
+	if (write_reason) {
+		/* We need to write for one of 2 reasons: not
+		 * enough space in the buffer, or a flush
+		 * is required.
+		 */
+		xtWord8	then;
+		 
+		/*
+		 * The objective of the following code is to
+		 * pick one writer, out of all threads.
+		 * The rest will wait for the writer.
+		 */
+
+		if (write_reason == WR_FLUSH) {
+			/* Before we flush, check if we should wait for running
+			 * transactions that may commit shortly.
+			 */
+			if (xl_db->db_xn_writer_count - xl_db->db_xn_writer_wait_count - xl_db->db_xn_long_running_count > 0 && xl_last_flush_time) {
+				/* Wait for about as long as the last flush took,
+				 * the idea is to saturate the disk with flushing...: */
+				then = xt_trace_clock() + (xtWord8) xl_last_flush_time;
+				for (;;) {
+					xt_critical_wait();
+					/* If a thread leaves this loop because times up, or
+					 * a thread manages to flush so fast that this thread
+					 * sleeps during this time, then it could be that
+					 * the required flush occurs before other conditions
+					 * of this loop are met!
+					 *
+					 * So we check here to make sure that the log has not been
+					 * flushed as we require:
+					 */
+					if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) <= 0) {
+						ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+						return OK;
+					}
+
+					if (xl_db->db_xn_writer_count - xl_db->db_xn_writer_wait_count - xl_db->db_xn_long_running_count > 0)
+						break;
+					if (xt_trace_clock() >= then)
+						break;
+				}
+			}
+		}
+
+#ifdef XT_XLOG_WAIT_SPINS
+		/* Spin for 1/1000s: */
+		then = xt_trace_clock() + (xtWord8) 1000;
+		for (;;) {
+			if (!xt_atomic_tas4(&xt_writing, 1))
+				break;
+
+			/* If I am not the writer, then I just waited for the
+			 * writer. So it may be that my requirements have now
+			 * been met!
+			 */
+			if (write_reason == WR_FLUSH) {
+				/* If the reason was to flush, then
+				 * check the last flush sequence, maybe it is passed
+				 * our required sequence.
+				 */
+				if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) <= 0) {
+					/* The required flush position of the log is before
+					 * or equal to the actual flush position. This means the condition
+					 * for this thread have been satified (via group commit).
+					 * Nothing more to do!
+					 */
+					ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+					return OK;
+				}
+			}
+			else if (size1) {
+				/* It may be that there is now space in the append buffer: */
+				if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers)
+					goto copy_to_log_buffer;
+			}
+			else {
+				/* We are just writing the buffer! */
+				ASSERT_NS(write_reason == WR_NO_SPACE);
+				if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_write_log_id, xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start)) <= 0)
+					return OK;
+			}
+
+			if (xt_trace_clock() >= then) {
+				xt_lock_mutex_ns(&xl_write_lock);
+				xt_waiting++;
+				if (!xt_timed_wait_cond_ns(&xl_write_cond, &xl_write_lock, 500)) {
+					xt_waiting--;
+					xt_unlock_mutex_ns(&xl_write_lock);
+					return FALSE;
+				}
+				xt_waiting--;
+				xt_unlock_mutex_ns(&xl_write_lock);
+			}
+			else
+				xt_critical_wait();
+		}
+#else
+		xtBool i_am_writer;
+
+		i_am_writer = FALSE;
+		xt_lock_mutex_ns(&xl_write_lock);
+		if (xt_writing) {
+			if (!xt_timed_wait_cond_ns(&xl_write_cond, &xl_write_lock, 500)) {
+				xt_unlock_mutex_ns(&xl_write_lock);
+				return FALSE;
+			}
+		}
+		else {
+			xt_writing = TRUE;
+			i_am_writer = TRUE;
+		}
+		xt_unlock_mutex_ns(&xl_write_lock);
+
+		if (!i_am_writer) {
+			/* If I am not the writer, then I just waited for the
+			 * writer. So it may be that my requirements have now
+			 * been met!
+			 */
+			if (write_reason == WR_FLUSH) {
+				/* If the reason was to flush, then
+				 * check the last flush sequence, maybe it is passed
+				 * our required sequence.
+				 */
+				if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) <= 0) {
+					/* The required flush position of the log is before
+					 * or equal to the actual flush position. This means the condition
+					 * for this thread have been satified (via group commit).
+					 * Nothing more to do!
+					 */
+					ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+					return OK;
+				}
+			}
+			else if (size1) {
+				/* It may be that there is now space in the append buffer: */
+				if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers)
+					goto copy_to_log_buffer;
+			}
+			else {
+				/* We are just writing the buffer! */
+				ASSERT_NS(write_reason == WR_NO_SPACE);
+				if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_write_log_id, xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start)) <= 0)
+					return OK;
+			}
+				
+			goto write_log_to_file;
+		}
+#endif
+
+		/* I am the writer, check the conditions, again: */
+		if (write_reason == WR_FLUSH) {
+			/* The writer wants the log to be flushed to a particular point: */
+			if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) <= 0) {
+				/* The writers required flush position is before or equal
+				 * to the actual position, so the writer is done...
+				 */
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
+				xt_writing = FALSE;
+				xt_cond_wakeall(&xl_write_cond);
+#endif
+				ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+				return OK;
+			}
+			/* Not flushed, but what about written? */
+			if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_write_log_id, xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start)) <= 0) {
+				/* The write position is after or equal to the required flush
+				 * position. This means that all we have to do is flush
+				 * to satisfy the writers condition.
+				 */
+				xtBool ok = TRUE;
+
+				if (xl_log_id != xl_write_log_id)
+					ok = xlog_open_log(xl_write_log_id, xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start), thread);
+
+				if (ok) {
+					if (xl_db->db_co_busy) {
+						/* [(8)] Flush the compactor log. */
+						xt_lock_mutex_ns(&xl_db->db_co_dlog_lock);
+						ok = xl_db->db_co_thread->st_dlog_buf.dlb_flush_log(TRUE, thread);
+						xt_unlock_mutex_ns(&xl_db->db_co_dlog_lock);
+					}
+				}
+
+				if (ok) {
+					flush_time = thread->st_statistics.st_xlog.ts_flush_time;
+					if ((ok = xt_flush_file(xl_log_file, &thread->st_statistics.st_xlog, thread))) {
+						xl_last_flush_time = (u_int) (thread->st_statistics.st_xlog.ts_flush_time - flush_time);
+						xl_log_bytes_flushed = xl_log_bytes_written;
+
+						xt_lock_mutex_ns(&xl_db->db_wr_lock);
+						xl_flush_log_id = xl_write_log_id;
+						xl_flush_log_offset = xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start);
+						/*
+						 * We have written data to the log, wake the writer to commit
+						* the data to the database.
+						*/
+						xlog_wr_log_written(xl_db);
+						xt_unlock_mutex_ns(&xl_db->db_wr_lock);
+					}
+				}
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
+				xt_writing = FALSE;
+				xt_cond_wakeall(&xl_write_cond);
+#endif
+				ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+				return ok;
+			}
+		}
+		else if (size1) {
+			/* If the amounf of data to be written is 0, then we are just required
+			 * to write the transaction buffer.
+			 *
+			 * If there is space in the buffer, then we can go on
+			 * to copy our data into the buffer:
+			 */
+			if (xl_append_buf_pos + size1 + size2 <= xl_size_of_buffers) {
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
+				xt_writing = FALSE;
+				xt_cond_wakeall(&xl_write_cond);
+#endif
+				goto copy_to_log_buffer;
+			}
+		}
+		else {
+			/* We are just writing the buffer! */
+			ASSERT_NS(write_reason == WR_NO_SPACE);
+			if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_write_log_id, xl_write_log_offset + (xl_write_done ? xl_write_buf_pos : xl_write_buf_pos_start)) <= 0) {
+#ifdef XT_XLOG_WAIT_SPINS
+				xt_writing = 0;
+				if (xt_waiting)
+					xt_cond_wakeall(&xl_write_cond);
+#else
+				xt_writing = FALSE;
+				xt_cond_wakeall(&xl_write_cond);
+#endif
+				return OK;
+			}
+		}
+
+		rewrite:
+		/* If the current write buffer has been written, then
+		 * switch the logs. Otherwise we must try to write the
+		 * existing write buffer.
+		 */
+		if (xl_write_done) {
+			/* This means that the current write buffer has been written,
+			 * i.e. it is empty!
+			 */
+			xt_spinlock_lock(&xl_buffer_lock);
+			xtWord1	*tmp_buffer = xl_write_buffer;
+
+			/* The write position is now the append position: */
+			xl_write_log_id = xl_append_log_id;
+			xl_write_log_offset = xl_append_log_offset;
+			xl_write_buf_pos = xl_append_buf_pos;
+			xl_write_buf_pos_start = xl_append_buf_pos_start;
+			xl_write_buffer = xl_append_buffer;
+			xl_write_done = FALSE;
+
+			/* We have to maintain 512 byte alignment: */
+			ASSERT_NS((xl_write_log_offset % 512) == 0);
+			part_size = xl_write_buf_pos % 512;
+			if (part_size != 0)
+				memcpy(tmp_buffer, xl_write_buffer + xl_write_buf_pos - part_size, part_size);
+
+			/* The new append position will be after the
+			 * current append position:
+			 */
+			xl_append_log_offset += xl_append_buf_pos - part_size;
+			xl_append_buf_pos = part_size;
+			xl_append_buf_pos_start = part_size;
+			xl_append_buffer = tmp_buffer; // The old write buffer (which is empty)
+
+			/*
+			 * If the append offset exceeds the log threshold, then
+			 * we set the append buffer to a new log file:
+			 *
+			 * NOTE: This algorithm will cause the log to be overwritten by a maximum
+			 * of the log buffer size!
+			 */
+			if (xl_append_log_offset >= xl_log_file_threshold) {
+				XTXactNewLogEntryDPtr	log_tail;
+				XTXactLogHeaderDPtr		log_head;
+
+				xl_append_log_id++;
+
+				/* Write the final record to the old log.
+				 * There is enough space for this because we allocate the
+				 * buffer a little bigger than required.
+				 */
+				log_tail = (XTXactNewLogEntryDPtr) (xl_write_buffer + xl_write_buf_pos);
+				log_tail->xl_status_1 = XT_LOG_ENT_NEW_LOG;
+				log_tail->xl_checksum_1 = XT_CHECKSUM_1(xl_append_log_id) ^ XT_CHECKSUM_1(xl_write_log_id);
+				XT_SET_DISK_4(log_tail->xl_log_id_4, xl_append_log_id);
+				xl_write_buf_pos += sizeof(XTXactNewLogEntryDRec);
+
+				/* We add the header to the next log. */
+				log_head = (XTXactLogHeaderDPtr) xl_append_buffer;
+				memset(log_head, 0, sizeof(XTXactLogHeaderDRec));
+				log_head->xh_status_1 = XT_LOG_ENT_HEADER;
+				log_head->xh_checksum_1 = XT_CHECKSUM_1(xl_append_log_id);
+				XT_SET_DISK_4(log_head->xh_size_4, sizeof(XTXactLogHeaderDRec));
+				XT_SET_DISK_4(log_head->xh_log_id_4, xl_append_log_id);
+				XT_SET_DISK_2(log_head->xh_version_2, XT_LOG_VERSION_NO);
+				XT_SET_DISK_4(log_head->xh_magic_4, XT_LOG_FILE_MAGIC);
+
+				xl_append_log_offset = 0;
+				xl_append_buf_pos = sizeof(XTXactLogHeaderDRec);
+				xl_append_buf_pos_start = 0;
+			}
+			xt_spinlock_unlock(&xl_buffer_lock);
+			/* We have completed the switch. The append buffer is empty, and
+			 * other threads can begin to write to it.
+			 *
+			 * Meanwhile, this thread will write the write buffer...
+			 */
+		}
+
+		/* Make sure we have the correct log open: */
+		if (xl_log_id != xl_write_log_id) {
+			if (!xlog_open_log(xl_write_log_id, xl_write_log_offset, thread))
+				goto write_failed;
+		}
+
+		/* Write the buffer. */
+		/* Always write an integral number of 512 byte blocks: */
+		ASSERT_NS((xl_write_log_offset % 512) == 0);
+		if ((part_size = xl_write_buf_pos % 512)) {
+			part_size = 512 - part_size;
+			xl_write_buffer[xl_write_buf_pos] = XT_LOG_ENT_END_OF_LOG;
+#ifdef HAVE_valgrind
+			if (part_size > 1)
+				memset(xl_write_buffer + xl_write_buf_pos + 1, 0x66, part_size - 1);
+#endif
+			if (!xt_pwrite_file(xl_log_file, xl_write_log_offset, xl_write_buf_pos+part_size, xl_write_buffer, &thread->st_statistics.st_xlog, thread))
+				goto write_failed;			
+		}
+		else {
+			if (!xt_pwrite_file(xl_log_file, xl_write_log_offset, xl_write_buf_pos, xl_write_buffer, &thread->st_statistics.st_xlog, thread))
+				goto write_failed;
+		}
+
+		/* This part has not been written: */
+		part_size = xl_write_buf_pos - xl_write_buf_pos_start;
+
+		/* We have written the data to the log, transfer
+		 * the buffer data into the cache. */
+		if (!xlog_transfer_to_cache(xl_log_file, xl_log_id, xl_write_log_offset+xl_write_buf_pos_start, part_size, xl_write_buffer+xl_write_buf_pos_start, thread))
+			goto write_failed;
+
+		xl_write_done = TRUE;
+		xl_log_bytes_written += part_size;
+
+		if (write_reason == WR_FLUSH) {
+			if (xl_db->db_co_busy) {
+				/* [(8)] Flush the compactor log. */
+				xt_lock_mutex_ns(&xl_db->db_co_dlog_lock);
+				if (!xl_db->db_co_thread->st_dlog_buf.dlb_flush_log(TRUE, thread)) {
+					xl_log_bytes_written -= part_size;
+					xt_unlock_mutex_ns(&xl_db->db_co_dlog_lock);
+					goto write_failed;
+				}
+				xt_unlock_mutex_ns(&xl_db->db_co_dlog_lock);
+			}
+
+			/* And flush if required: */
+			flush_time = thread->st_statistics.st_xlog.ts_flush_time;
+			if (!xt_flush_file(xl_log_file, &thread->st_statistics.st_xlog, thread)) {
+				xl_log_bytes_written -= part_size;
+				goto write_failed;
+			}
+			xl_last_flush_time = (u_int) (thread->st_statistics.st_xlog.ts_flush_time - flush_time);
+
+			xl_log_bytes_flushed = xl_log_bytes_written;
+
+			xt_lock_mutex_ns(&xl_db->db_wr_lock);
+			xl_flush_log_id = xl_write_log_id;
+			xl_flush_log_offset = xl_write_log_offset + xl_write_buf_pos;
+			/*
+			 * We have written data to the log, wake the writer to commit
+			 * the data to the database.
+			 */
+			xlog_wr_log_written(xl_db);
+			xt_unlock_mutex_ns(&xl_db->db_wr_lock);
+
+			/* Check that the required flush condition has arrived. */
+			if (xt_comp_log_pos(req_flush_log_id, req_flush_log_offset, xl_flush_log_id, xl_flush_log_offset) > 0)
+				/* The required position is still after the current flush
+				 * position, continue writing: */
+				goto rewrite;
+
+#ifdef XT_XLOG_WAIT_SPINS
+			xt_writing = 0;
+			if (xt_waiting)
+				xt_cond_wakeall(&xl_write_cond);
+#else
+			xt_writing = FALSE;
+			xt_cond_wakeall(&xl_write_cond);
+#endif
+			ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset) <= 0);
+			return OK;
+		}
+		else
+			xlog_wr_log_written(xl_db);
+
+		/*
+		 * Check that the buffer is now available, otherwise,
+		 * switch and write again!
+		 */
+		if (xl_append_buf_pos + size1 + size2 > xl_size_of_buffers)
+			goto rewrite;
+
+#ifdef XT_XLOG_WAIT_SPINS
+		xt_writing = 0;
+		if (xt_waiting)
+			xt_cond_wakeall(&xl_write_cond);
+#else
+		xt_writing = FALSE;
+		xt_cond_wakeall(&xl_write_cond);
+#endif
+
+		if (size1 == 0)
+			return OK;
+	}
+
+	copy_to_log_buffer:
+	ASSERT_NS(size1);
+	xt_spinlock_lock(&xl_buffer_lock);
+	/* Now we have to check again. The check above was a dirty read!
+	 */
+	if (xl_append_buf_pos + size1 + size2 > xl_size_of_buffers) {
+		xt_spinlock_unlock(&xl_buffer_lock);
+		/* Not enough space, write the buffer, and return here. */
+		write_reason = WR_NO_SPACE;
+		goto write_log_to_file;
+	}
+
+	memcpy(xl_append_buffer + xl_append_buf_pos, data1, size1);
+	if (size2)
+		memcpy(xl_append_buffer + xl_append_buf_pos + size1, data2, size2);
+	/* Add the log ID to the checksum!
+	 * This is required because log files are re-used, and we don't
+	 * want the records to be valid when the log is re-used.
+	 */
+	register XTXactLogBufferDPtr record;
+
+	/*
+	 * Adjust db_xn_writer_count here. It is protected by
+	 * xl_buffer_lock.
+	 */
+	record = (XTXactLogBufferDPtr) (xl_append_buffer + xl_append_buf_pos);
+	switch (record->xh.xh_status_1) {
+		case XT_LOG_ENT_HEADER:
+		case XT_LOG_ENT_END_OF_LOG:
+			break;
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_DELETE_BG:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			sum = XT_GET_DISK_2(record->xu.xu_checksum_2) ^ XT_CHECKSUM_2(xl_append_log_id);
+			XT_SET_DISK_2(record->xu.xu_checksum_2, sum);
+
+			if (!thread->st_xact_writer) {
+				thread->st_xact_writer = TRUE;
+				thread->st_xact_write_time = xt_db_approximate_time;
+				xl_db->db_xn_writer_count++;
+				xl_db->db_xn_total_writer_count++;
+			}
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			sum = XT_GET_DISK_2(record->xu.xu_checksum_2) ^ XT_CHECKSUM_2(xl_append_log_id);
+			XT_SET_DISK_2(record->xu.xu_checksum_2, sum);
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+		case XT_LOG_ENT_ROW_NEW_FL:
+			record->xl.xl_checksum_1 ^= XT_CHECKSUM_1(xl_append_log_id);
+
+			if (!thread->st_xact_writer) {
+				thread->st_xact_writer = TRUE;
+				thread->st_xact_write_time = xt_db_approximate_time;
+				xl_db->db_xn_writer_count++;
+				xl_db->db_xn_total_writer_count++;
+			}
+			break;
+		case XT_LOG_ENT_COMMIT:
+		case XT_LOG_ENT_ABORT:
+			ASSERT_NS(thread->st_xact_writer);
+			ASSERT_NS(xl_db->db_xn_writer_count > 0);
+			if (thread->st_xact_writer) {
+				xl_db->db_xn_writer_count--;
+				thread->st_xact_writer = FALSE;
+				if (thread->st_xact_long_running) {
+					xl_db->db_xn_long_running_count--;
+					thread->st_xact_long_running = FALSE;
+				}
+			}
+			/* No break required! */
+		default:
+			record->xl.xl_checksum_1 ^= XT_CHECKSUM_1(xl_append_log_id);
+			break;
+	}
+#ifdef DEBUG
+	ASSERT_NS(xlog_verify(record, size1 + size2, xl_append_log_id));
+#endif
+	if (log_id)
+		*log_id = xl_append_log_id;
+	if (log_offset)
+		*log_offset = xl_append_log_offset + xl_append_buf_pos;
+	xl_append_buf_pos += size1 + size2;
+	if (flush_log_at_trx_commit != XT_XLOG_NO_WRITE_NO_FLUSH) {
+		write_reason = flush_log_at_trx_commit == XT_XLOG_WRITE_AND_FLUSH ? WR_FLUSH : WR_NO_SPACE;
+		req_flush_log_id = xl_append_log_id;
+		req_flush_log_offset = xl_append_log_offset + xl_append_buf_pos;
+		xt_spinlock_unlock(&xl_buffer_lock);
+		/* We have written the data already! */
+		size1 = 0;
+		size2 = 0;
+		goto write_log_to_file;
+	}
+
+	// Failed sometime when outside the spinlock!
+	ASSERT_NS(xt_comp_log_pos(xl_write_log_id, xl_write_log_offset, xl_append_log_id, xl_append_log_offset + xl_append_buf_pos) <= 0); 
+	xt_spinlock_unlock(&xl_buffer_lock);
+
+	return OK;
+
+	write_failed:
+#ifdef XT_XLOG_WAIT_SPINS
+	xt_writing = 0;
+	if (xt_waiting)
+		xt_cond_wakeall(&xl_write_cond);
+#else
+	xt_writing = FALSE;
+	xt_cond_wakeall(&xl_write_cond);
+#endif
+	return FAILED;
+}
+
+/*
+ * Dispose of the log file with ID del_log_id.
+ *
+ * This function does not always delete the log. It may just rename a
+ * log to a new log which it will need.
+ * This speeds things up:
+ *
+ * - No need to pre-allocate the new log.
+ * - Log data is already flushed (i.e. disk blocks allocated)
+ * - Log is already in OS cache.
+ *
+ * However, it means that I need to checksum things differently
+ * on each log to make sure I do not treat an old record
+ * as valid!
+ *
+ * What happens depends on xt_db_offline_log_function:
+ * - XT_RECYCLE_LOGS: try to rename the file to the next log ID after
+ *   xl_max_log_id, as long as the log-count limits checked below allow it;
+ *   if the limits do not allow it, the file is deleted.
+ * - XT_KEEP_LOGS: the file is left untouched.
+ * - anything else: the file is deleted.
+ *
+ * Return OK on success, FAILED if the operation failed with a system
+ * error for which XT_FILE_IN_USE() is true (the caller may try again
+ * later), or XT_ERR for any other failure.
+ */
+int XTDatabaseLog::xlog_delete_log(xtLogID del_log_id, XTThreadPtr thread)
+{
+	char	path[PATH_MAX];
+
+	/* The highest known log ID can never be behind the log being written: */
+	if (xl_max_log_id < xl_write_log_id)
+		xl_max_log_id = xl_write_log_id;
+
+	xlog_name(PATH_MAX, path, del_log_id);
+
+	if (xt_db_offline_log_function == XT_RECYCLE_LOGS) {
+		char	new_path[PATH_MAX];
+		xtLogID	new_log_id;
+		xtBool	ok;
+
+		/* Make sure that the total logs is less than or equal to the log file count
+		 * (plus dynamic component).
+		 */
+		while (xl_max_log_id - del_log_id + 1 <= (xl_log_file_count + xt_log_file_dyn_count) &&
+			/* And the number of logs after the current log (including the current log)
+			 * must be less or equal to the log file count. */
+			xl_max_log_id - xl_write_log_id + 1 <= xl_log_file_count) {
+			new_log_id = xl_max_log_id+1;
+			xlog_name(PATH_MAX, new_path, new_log_id);
+			/* NOTE(review): the rename is called with NULL as its first argument,
+			 * but the error is examined in thread->t_exception below - confirm
+			 * that the failure is recorded there.
+			 */
+			ok = xt_fs_rename(NULL, path, new_path);
+			if (ok) {
+				/* The old log now serves as the new highest log: */
+				xl_max_log_id = new_log_id;
+				goto done;
+			}
+			/* The rename failed. If the target does not exist, this is
+			 * a real failure; report it to the caller:
+			 */
+			if (!xt_fs_exists(new_path)) {
+				/* Try again later: */
+				if (thread->t_exception.e_xt_err == XT_SYSTEM_ERROR &&
+					XT_FILE_IN_USE(thread->t_exception.e_sys_err))
+					return FAILED;
+
+				return XT_ERR;
+			}
+			/* The target already exists, so that ID is taken: count it
+			 * as the highest log and try the next ID (if the limits
+			 * still allow it).
+			 */
+			xl_max_log_id = new_log_id;
+		}
+	}
+
+	/* Not recycled: remove the file, unless logs are to be kept. */
+	if (xt_db_offline_log_function != XT_KEEP_LOGS) {
+		if (!xt_fs_delete(NULL, path)) {
+			if (thread->t_exception.e_xt_err == XT_SYSTEM_ERROR &&
+				XT_FILE_IN_USE(thread->t_exception.e_sys_err))
+				return FAILED;
+
+			return XT_ERR;
+		}
+	}
+
+	done:
+	return OK;
+}
+
+/* PRIVATE FUNCTIONS */
+/*
+ * Make log_id the currently open log file (xl_log_file/xl_log_id).
+ *
+ * If a different log is open it is flushed and closed first. The
+ * requested log is then opened (created if missing), and, when
+ * curr_write_pos is below xl_log_file_threshold, its size is adjusted:
+ * a file shorter than the threshold is written with zeros (completely
+ * if PREWRITE_LOG_COMPLETELY is defined, otherwise only the last block
+ * before the threshold), and a file more than 128 MB longer than the
+ * threshold is cut back to the threshold.
+ *
+ * Opening also maintains xt_log_file_dyn_count: it grows when an empty
+ * (new) file had to be used, and shrinks again, after xt_log_file_dyn_dec
+ * has counted down, when existing files are found.
+ *
+ * Returns OK on success, FAILED if a flush, open, write or truncate
+ * failed. xl_log_id is only set to log_id on success.
+ */
+xtBool XTDatabaseLog::xlog_open_log(xtLogID log_id, off_t curr_write_pos, XTThreadPtr thread)
+{
+	char	log_path[PATH_MAX];
+	off_t	eof;
+
+	/* The requested log is already open: */
+	if (xl_log_id == log_id)
+		return OK;
+
+	/* Flush and close the log that is currently open: */
+	if (xl_log_file) {
+		if (!xt_flush_file(xl_log_file, &thread->st_statistics.st_xlog, thread))
+			return FAILED;
+		xt_close_file_ns(xl_log_file);
+		xl_log_file = NULL;
+		xl_log_id = 0;
+	}
+
+	xlog_name(PATH_MAX, log_path, log_id);
+	if (!(xl_log_file = xt_open_file_ns(log_path, XT_FS_CREATE | XT_FS_MAKE_PATH)))
+		return FAILED;
+	/* Allocate space until the required size: */
+	if (curr_write_pos <  xl_log_file_threshold) {
+		eof = xt_seek_eof_file(NULL, xl_log_file);
+		if (eof == 0) {
+			/* A new file (bad), we need a greater file count: */
+			xt_log_file_dyn_count++;
+			xt_log_file_dyn_dec = 4;
+		}
+		else {
+			/* An existing file (good): */
+			if (xt_log_file_dyn_count > 0) {
+				/* Only reduce the dynamic count after the
+				 * countdown has reached zero:
+				 */
+				if (xt_log_file_dyn_dec > 0)
+					xt_log_file_dyn_dec--;
+				else
+					xt_log_file_dyn_count--;
+			}
+		}
+		if (eof < xl_log_file_threshold) {
+			/* The file is shorter than the threshold, write zeros: */
+			char	buffer[2048];
+			size_t	tfer;
+
+			memset(buffer, 0, 2048);
+
+			curr_write_pos = xt_align_offset(curr_write_pos, 512);
+#ifdef PREWRITE_LOG_COMPLETELY
+			/* Write zeros from the current position up to the threshold,
+			 * in blocks of at most 2048 bytes. A block written at offset 0
+			 * starts with an end-of-log marker.
+			 */
+			while (curr_write_pos < xl_log_file_threshold) {
+				tfer = 2048;
+				if ((off_t) tfer > xl_log_file_threshold - curr_write_pos)
+					tfer = (size_t) (xl_log_file_threshold - curr_write_pos);
+				if (curr_write_pos == 0)
+					*buffer = XT_LOG_ENT_END_OF_LOG;
+				if (!xt_pwrite_file(xl_log_file, curr_write_pos, tfer, buffer, &thread->st_statistics.st_xlog, thread))
+					return FAILED;
+				*buffer = 0;
+				curr_write_pos += tfer;
+			}
+#else
+			/* Only write the last block (at most 2048 bytes) before the
+			 * threshold - presumably this is enough to extend the file
+			 * to the threshold size; confirm against xt_pwrite_file.
+			 */
+			if (curr_write_pos < xl_log_file_threshold) {
+				tfer = 2048;
+				
+				if (curr_write_pos < xl_log_file_threshold - 2048)
+					curr_write_pos = xl_log_file_threshold - 2048;
+				if ((off_t) tfer > xl_log_file_threshold - curr_write_pos)
+					tfer = (size_t) (xl_log_file_threshold - curr_write_pos);
+				if (!xt_pwrite_file(xl_log_file, curr_write_pos, tfer, buffer, &thread->st_statistics.st_xlog, thread))
+					return FAILED;
+			}
+#endif
+		}
+		else if (eof > xl_log_file_threshold + (128 * 1024 * 1024)) {
+			/* The file is more than 128 MB over the threshold, cut it back: */
+			if (!xt_set_eof_file(NULL, xl_log_file, xl_log_file_threshold))
+				return FAILED;
+		}
+	}
+	xl_log_id = log_id;
+	return OK;
+}
+
+/*
+ * Build the path of the log file with the given ID in 'path' (a buffer
+ * of 'size' bytes): the database main path, followed by the system
+ * directory, a directory separator and the name "xlog-<id>.xt".
+ */
+void XTDatabaseLog::xlog_name(size_t size, char *path, xtLogID log_id)
+{
+	char file_name[50];
+
+	/* Start with the directory part: */
+	xt_strcpy(size, path, xl_db->db_main_path);
+	xt_add_system_dir(size, path);
+	xt_add_dir_char(size, path);
+
+	/* Then append the name of the log file itself: */
+	sprintf(file_name, "xlog-%lu.xt", (u_long) log_id);
+	xt_strcat(size, path, file_name);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * T H R E A D   T R A N S A C T I O N   B U F F E R
+ */
+
+/* Flush the transaction log of the given database on behalf of 'thread'. */
+xtPublic xtBool xt_xlog_flush_log(struct XTDatabase *db, XTThreadPtr thread)
+{
+	xtBool flushed;
+
+	flushed = db->db_xlog.xlog_flush(thread);
+	return flushed;
+}
+
+/*
+ * Append a single, already complete log entry of 'size' bytes to the
+ * transaction log of the thread's database. There is no second data
+ * part, and the position of the entry in the log is not returned.
+ */
+xtPublic xtBool xt_xlog_log_data(XTThreadPtr thread, size_t size, XTXactLogBufferDPtr log_entry, int flush_log_at_trx_commit)
+{
+	xtWord1	*entry_bytes = (xtWord1 *) log_entry;
+
+	return thread->st_database->db_xlog.xlog_append(
+		thread,
+		size, entry_bytes,
+		0, NULL,
+		flush_log_at_trx_commit,
+		NULL, NULL);
+}
+
+/*
+ * Build the log entry for a table modification and append it to the
+ * transaction log of the thread's database.
+ *
+ * 'status' selects the type of log entry. The fixed fields of the entry
+ * (operation sequence, table ID, record/row ID, ...) are written to a
+ * local header, and 'data' ('size' bytes) is appended directly after
+ * the first 'len' bytes of that header.
+ *
+ * The meaning of some arguments depends on the entry type:
+ * - free_rec_id is only used by the _FL types, XT_LOG_ENT_ROW_NEW_FL and
+ *   XT_LOG_ENT_REC_REMOVED_BI (where it carries the new record type).
+ * - for XT_LOG_ENT_PREPARE, op_seq carries the transaction ID.
+ *
+ * A checksum over op_seq, tab_id, rec_id, free_rec_id (where used) and
+ * the data bytes is stored in the entry, as 1 or 2 bytes depending on
+ * the entry type (check_size).
+ *
+ * Returns the result of xlog_append().
+ */
+xtPublic xtBool xt_xlog_modify_table(xtTableID tab_id, u_int status, xtOpSeqNo op_seq, xtRecordID free_rec_id, xtRecordID rec_id, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	XTXactLogBufferDRec	log_entry;
+	size_t				len;			/* Number of header bytes written before 'data'. */
+	xtWord4				sum = 0;
+	int					check_size = 1;	/* Size of the stored checksum in bytes (1 or 2). */
+	XTXactDataPtr		xact = NULL;	/* Set if this is the first entry logged for the transaction. */
+	int					flush_log_at_trx_commit = XT_XLOG_NO_WRITE_NO_FLUSH;
+
+	switch (status) {
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+			check_size = 2;
+			XT_SET_DISK_4(log_entry.xu.xu_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xu.xu_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xu.xu_rec_id_4, rec_id);
+			XT_SET_DISK_2(log_entry.xu.xu_size_2, size);
+			len = offsetof(XTactUpdateEntryDRec, xu_rec_type_1);
+			/* The first entry logged for a transaction gets the "begin"
+			 * variant of the status, and its log position is recorded
+			 * in the transaction (see the xlog_append() call below).
+			 * NOTE(review): status++ presumably relies on each _BG
+			 * constant being the plain constant plus one - confirm.
+			 */
+			if (!(thread->st_xact_data->xd_flags & XT_XN_XAC_LOGGED)) {
+				/* Add _BG: */
+				status++;
+				xact = thread->st_xact_data;
+				xact->xd_flags |= XT_XN_XAC_LOGGED;
+			}
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+			check_size = 2;
+			XT_SET_DISK_4(log_entry.xf.xf_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xf.xf_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xf.xf_rec_id_4, rec_id);
+			XT_SET_DISK_2(log_entry.xf.xf_size_2, size);
+			XT_SET_DISK_4(log_entry.xf.xf_free_rec_id_4, free_rec_id);
+			sum ^= XT_CHECKSUM4_REC(free_rec_id);
+			len = offsetof(XTactUpdateFLEntryDRec, xf_rec_type_1);
+			/* As above: mark the first entry of the transaction. */
+			if (!(thread->st_xact_data->xd_flags & XT_XN_XAC_LOGGED)) {
+				/* Add _BG: */
+				status++;
+				xact = thread->st_xact_data;
+				xact->xd_flags |= XT_XN_XAC_LOGGED;
+			}
+			break;
+		case XT_LOG_ENT_REC_FREED:
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			ASSERT_NS(size == 1 + XT_XACT_ID_SIZE + sizeof(XTTabRecFreeDRec));
+			XT_SET_DISK_4(log_entry.fr.fr_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.fr.fr_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.fr.fr_rec_id_4, rec_id);
+			len = offsetof(XTactFreeRecEntryDRec, fr_stat_id_1);
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			check_size = 2;
+			XT_SET_DISK_4(log_entry.rb.rb_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.rb.rb_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.rb.rb_rec_id_4, rec_id);
+			XT_SET_DISK_2(log_entry.rb.rb_size_2, size);
+			/* Here free_rec_id carries the new record type. Only the low
+			 * byte is stored, but the checksum uses the complete value.
+			 * NOTE(review): presumably callers only pass values that
+			 * fit in one byte - confirm.
+			 */
+			log_entry.rb.rb_new_rec_type_1 = (xtWord1) free_rec_id;
+			sum ^= XT_CHECKSUM4_REC(free_rec_id);
+			len = offsetof(XTactRemoveBIEntryDRec, rb_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			ASSERT_NS(size == 8);
+			XT_SET_DISK_4(log_entry.xw.xw_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xw.xw_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xw.xw_rec_id_4, rec_id);
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_CLEANED:
+			ASSERT_NS(size == offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE);
+			XT_SET_DISK_4(log_entry.xw.xw_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xw.xw_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xw.xw_rec_id_4, rec_id);
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_CLEANED_1:
+			ASSERT_NS(size == 1);
+			XT_SET_DISK_4(log_entry.xw.xw_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xw.xw_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xw.xw_rec_id_4, rec_id);
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_UNLINKED:
+			ASSERT_NS(size == offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE);
+			XT_SET_DISK_4(log_entry.xw.xw_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xw.xw_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xw.xw_rec_id_4, rec_id);
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1);
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+			/* No data part: the entry ends after the row ID. */
+			ASSERT_NS(size == 0);
+			XT_SET_DISK_4(log_entry.xa.xa_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xa.xa_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xa.xa_row_id_4, rec_id);
+			len = offsetof(XTactRowAddedEntryDRec, xa_row_id_4) + XT_ROW_ID_SIZE;
+			break;
+		case XT_LOG_ENT_ROW_NEW_FL:
+			/* No data part: the entry ends after the free list field. */
+			ASSERT_NS(size == 0);
+			XT_SET_DISK_4(log_entry.xa.xa_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.xa.xa_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.xa.xa_row_id_4, rec_id);
+			XT_SET_DISK_4(log_entry.xa.xa_free_list_4, free_rec_id);
+			sum ^= XT_CHECKSUM4_REC(free_rec_id);
+			len = offsetof(XTactRowAddedEntryDRec, xa_free_list_4) + XT_ROW_ID_SIZE;
+			break;
+		case XT_LOG_ENT_ROW_ADD_REC:
+		case XT_LOG_ENT_ROW_SET:
+		case XT_LOG_ENT_ROW_FREED:
+			ASSERT_NS(size == sizeof(XTTabRowRefDRec));
+			XT_SET_DISK_4(log_entry.wr.wr_op_seq_4, op_seq);
+			XT_SET_DISK_4(log_entry.wr.wr_tab_id_4, tab_id);
+			XT_SET_DISK_4(log_entry.wr.wr_row_id_4, rec_id);
+			len = offsetof(XTactWriteRowEntryDRec, wr_ref_id_4);
+			break;
+		case XT_LOG_ENT_PREPARE:
+			check_size = 2;
+			/* For this entry op_seq carries the transaction ID: */
+			XT_SET_DISK_4(log_entry.xp.xp_xact_id_4, op_seq);
+			log_entry.xp.xp_xa_len_1 = (xtWord1) size;
+			len = offsetof(XTXactPrepareEntryDRec, xp_xa_data);
+			/* This is the only entry type here that uses the configured
+			 * write/flush mode:
+			 */
+			flush_log_at_trx_commit = xt_db_flush_log_at_trx_commit;
+			break;
+		default:
+			/* Unknown entry type. NOTE(review): if ASSERT_NS() compiles to
+			 * nothing, xlog_append() is still called below with len == 0 -
+			 * confirm that this is acceptable.
+			 */
+			ASSERT_NS(FALSE);
+			len = 0;
+			break;
+	}
+
+	xtWord1	*dptr = data;
+	xtWord4	g;
+
+	/* Calculate the checksum: start with the header values... */
+	sum ^= op_seq ^ (tab_id << 8) ^ XT_CHECKSUM4_REC(rec_id);
+	if ((g = sum & 0xF0000000)) {
+		sum = sum ^ (g >> 24);
+		sum = sum ^ g;
+	}
+	/* ...then mix in the data, one byte at a time. The top 4 bits are
+	 * folded back into the lower bits after each step.
+	 */
+	for (u_int i=0; i<(u_int) size; i++) {
+		sum = (sum << 4) + *dptr;
+		if ((g = sum & 0xF0000000)) {
+			sum = sum ^ (g >> 24);
+			sum = sum ^ g;
+		}
+		dptr++;
+	}
+
+	log_entry.xh.xh_status_1 = status;
+	if (check_size == 1) {
+		log_entry.xh.xh_checksum_1 = XT_CHECKSUM_1(sum);
+	}
+	else {
+		xtWord2 c;
+		
+		c = XT_CHECKSUM_2(sum);
+		XT_SET_DISK_2(log_entry.xu.xu_checksum_2, c);
+	}
+#ifdef PRINT_TABLE_MODIFICATIONS
+	xt_print_log_record(0, 0, &log_entry);
+#endif
+	/* For the first entry of a transaction, the position of the entry in
+	 * the log is returned in the transaction structure:
+	 */
+	if (xact)
+		return thread->st_database->db_xlog.xlog_append(thread, len, (xtWord1 *) &log_entry, size, data, flush_log_at_trx_commit, &xact->xd_begin_log, &xact->xd_begin_offset);
+
+	return thread->st_database->db_xlog.xlog_append(thread, len, (xtWord1 *) &log_entry, size, data, flush_log_at_trx_commit, NULL, NULL);
+}
+
+/*
+ * -----------------------------------------------------------------------
+ * S E Q U E N T I A L   L O G   R E A D I N G
+ */
+
+/*
+ * Set up 'seq' for reading the log sequentially.
+ *
+ * All positions are reset, no log file is open, and a read buffer of
+ * buffer_size bytes is allocated. Returns FALSE if the buffer could
+ * not be allocated (the other fields are initialized in any case).
+ */
+xtBool XTDatabaseLog::xlog_seq_init(XTXactSeqReadPtr seq, size_t buffer_size, xtBool load_cache)
+{
+	/* The settings given by the caller: */
+	seq->xseq_load_cache = load_cache;
+	seq->xseq_buffer_size = buffer_size;
+
+	/* No log file is open yet: */
+	seq->xseq_log_file = NULL;
+	seq->xseq_log_id = 0;
+	seq->xseq_log_eof = 0;
+
+	/* No record has been read yet: */
+	seq->xseq_record_len = 0;
+	seq->xseq_rec_log_offset = 0;
+	seq->xseq_rec_log_id = 0;
+
+	/* The read buffer starts off empty: */
+	seq->xseq_buffer_len = 0;
+	seq->xseq_buf_log_offset = 0;
+	seq->xseq_buffer = (xtWord1 *) xt_malloc_ns(buffer_size);
+
+	return (seq->xseq_buffer != NULL);
+}
+
+/*
+ * Release everything held by 'seq': close the log file (if open)
+ * and free the read buffer (if allocated).
+ */
+void XTDatabaseLog::xlog_seq_exit(XTXactSeqReadPtr seq)
+{
+	xlog_seq_close(seq);
+
+	/* Nothing more to do if there is no buffer: */
+	if (!seq->xseq_buffer)
+		return;
+
+	xt_free_ns(seq->xseq_buffer);
+	seq->xseq_buffer = NULL;
+}
+
+/*
+ * Close the log file of 'seq' (if one is open), and reset the
+ * ID and the cached EOF of the open log.
+ */
+void XTDatabaseLog::xlog_seq_close(XTXactSeqReadPtr seq)
+{
+	/* These are reset whether a file is open or not: */
+	seq->xseq_log_eof = 0;
+	seq->xseq_log_id = 0;
+
+	if (!seq->xseq_log_file)
+		return;
+
+	xt_close_file_ns(seq->xseq_log_file);
+	seq->xseq_log_file = NULL;
+}
+
+/*
+ * Position 'seq' at the given log ID and offset, so that the next
+ * sequential read starts there. Always returns OK. The missing_ok
+ * argument is not used.
+ */
+xtBool XTDatabaseLog::xlog_seq_start(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, xtBool XT_UNUSED(missing_ok))
+{
+	if (seq->xseq_rec_log_id != log_id) {
+		/* A different log: the buffer contents are no longer valid.
+		 * Note that the buffer offset is set to the record offset
+		 * as it was before this call.
+		 */
+		seq->xseq_buffer_len = 0;
+		seq->xseq_buf_log_offset = seq->xseq_rec_log_offset;
+		seq->xseq_rec_log_id = log_id;
+	}
+
+	/* Windows version: closing the file here will help to
+	 * switch to the new log file.
+	 * Due to reading from the log buffers, this was
+	 * not always done!
+	 *
+	 * NOTE: xseq_log_id is not reset here, so it may still be set
+	 * although xseq_log_file is NULL.
+	 */
+	if (seq->xseq_log_id != log_id && seq->xseq_log_file) {
+		xt_close_file_ns(seq->xseq_log_file);
+		seq->xseq_log_file = NULL;
+	}
+
+	seq->xseq_record_len = 0;
+	seq->xseq_rec_log_offset = log_offset;
+	return OK;
+}
+
+/*
+ * Estimate the number of log bytes between the position of the writer
+ * (db_wr_log_id/db_wr_log_offset) and the flush position of the log
+ * (xl_flush_log_id/xl_flush_log_offset).
+ *
+ * Every log file before the flush log is assumed to be exactly
+ * xt_db_log_file_threshold bytes long.
+ */
+size_t XTDatabaseLog::xlog_bytes_to_write()
+{
+	xtLogID			from_id = xl_db->db_wr_log_id;
+	xtLogOffset		from_offset = xl_db->db_wr_log_offset;
+	xtLogID			flush_id = xl_db->db_xlog.xl_flush_log_id;
+	xtLogOffset		flush_offset = xl_db->db_xlog.xl_flush_log_offset;
+	size_t			total = 0;
+
+	/* Assume the logs have the threshold: */
+	if (from_id < flush_id) {
+		/* The rest of the log the writer is currently in: */
+		if (from_offset < xt_db_log_file_threshold)
+			total = (size_t) (xt_db_log_file_threshold - from_offset);
+		from_offset = 0;
+		from_id++;
+	}
+
+	/* The complete logs in between: */
+	for (; from_id < flush_id; from_id++)
+		total += (size_t) xt_db_log_file_threshold;
+
+	/* And the part of the last log, up to the flush position: */
+	if (from_offset < flush_offset)
+		total += (size_t) (flush_offset - from_offset);
+
+	return total;
+}
+
+/*
+ * Read up to 'size' bytes at log_offset of log log_id into 'buffer',
+ * using xt_xlog_read().
+ *
+ * The log file is (re-)opened in 'seq' if required. 'eof' limits the
+ * read; if it is zero the size of the log file is used instead (it is
+ * determined once and then remembered in seq->xseq_log_eof).
+ *
+ * If data_read is not NULL it receives the number of bytes requested
+ * from xt_xlog_read(). This is zero if the log file does not exist,
+ * or if log_offset is at or beyond the end.
+ *
+ * Returns FAILED if opening the file failed, OK if there was nothing
+ * to read, and otherwise the result of xt_xlog_read().
+ */
+xtBool XTDatabaseLog::xlog_read_from_cache(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, size_t size, off_t eof, xtWord1 *buffer, size_t *data_read, XTThreadPtr thread)
+{
+	/* The file handle must be checked as well as the ID:
+	 * xseq_log_file could be NULL because xseq_log_id is not set
+	 * to zero when xseq_log_file is set to NULL!
+	 * This bug caused a crash in TeamDrive.
+	 */
+	if (!seq->xseq_log_file || seq->xseq_log_id != log_id) {
+		char log_path[PATH_MAX];
+
+		if (seq->xseq_log_file) {
+			xt_close_file_ns(seq->xseq_log_file);
+			seq->xseq_log_file = NULL;
+		}
+
+		xlog_name(PATH_MAX, log_path, log_id);
+		if (!xt_open_file_ns(&seq->xseq_log_file, log_path, XT_FS_MISSING_OK))
+			return FAILED;
+		/* A missing file is not an error, there is just no data: */
+		if (!seq->xseq_log_file)
+			goto nothing_to_read;
+		seq->xseq_log_eof = 0;
+		seq->xseq_log_id = log_id;
+	}
+
+	if (eof == 0) {
+		/* No limit given, use the size of the file: */
+		if (seq->xseq_log_eof == 0)
+			seq->xseq_log_eof = xt_seek_eof_file(NULL, seq->xseq_log_file);
+		eof = seq->xseq_log_eof;
+	}
+
+	if (log_offset >= eof)
+		goto nothing_to_read;
+
+	/* Do not read past the end: */
+	if ((off_t) size > eof - log_offset)
+		size = (size_t) (eof - log_offset);
+
+	if (data_read)
+		*data_read = size;
+	return xt_xlog_read(seq->xseq_log_file, seq->xseq_log_id, log_offset, size, buffer, seq->xseq_load_cache, thread);
+
+	nothing_to_read:
+	if (data_read)
+		*data_read = 0;
+	return OK;
+}
+
+/*
+ * Read up to 'size' bytes at the given log position into 'buffer'.
+ *
+ * The data is taken from wherever it currently is: the log file
+ * (via xlog_read_from_cache()), the write buffer, the append buffer,
+ * or partly from the log file and partly from the write buffer.
+ *
+ * If data_read is not NULL it receives the number of bytes made
+ * available in 'buffer'. This may be less than 'size', and is zero
+ * if the position is not (yet) in the log.
+ *
+ * Returns OK, or the result of xlog_read_from_cache() if the log
+ * file was read.
+ */
+xtBool XTDatabaseLog::xlog_rnd_read(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, size_t size, xtWord1 *buffer, size_t *data_read, XTThreadPtr thread)
+{
+	/* Fast track to reading from cache (these checks are done
+	 * without the lock, and repeated below with the lock): */
+	if (log_id < xl_write_log_id)
+		return xlog_read_from_cache(seq, log_id, log_offset, size, 0, buffer, data_read, thread);
+
+	if (log_id == xl_write_log_id && log_offset + (xtLogOffset) size <= xl_write_log_offset)
+		return xlog_read_from_cache(seq, log_id, log_offset, size, xl_write_log_offset, buffer, data_read, thread);
+
+	/* May be in the log write or append buffer.
+	 * The lock is taken with xt_spinlock_lock() to match the
+	 * xt_spinlock_unlock() calls on every path below, and the
+	 * way this lock is taken when the buffers are modified.
+	 */
+	xt_spinlock_lock(&xl_buffer_lock);
+
+	if (log_id < xl_write_log_id) {
+		xt_spinlock_unlock(&xl_buffer_lock);
+		return xlog_read_from_cache(seq, log_id, log_offset, size, 0, buffer, data_read, thread);
+	}
+
+	/* Check the write buffer: */
+	if (log_id == xl_write_log_id) {
+		if (log_offset + (xtLogOffset) size <= xl_write_log_offset) {
+			/* Completely before the write buffer: */
+			xt_spinlock_unlock(&xl_buffer_lock);
+			return xlog_read_from_cache(seq, log_id, log_offset, size, xl_write_log_offset, buffer, data_read, thread);
+		}
+
+		if (log_offset < xl_write_log_offset + (xtLogOffset) xl_write_buf_pos) {
+			/* Reading partially from the write buffer: */
+			if (log_offset >= xl_write_log_offset) {
+				/* Completely in the buffer. */
+				off_t offset = log_offset - xl_write_log_offset;
+
+				if (size > xl_write_buf_pos - offset)
+					size = (size_t) (xl_write_buf_pos - offset);
+
+				memcpy(buffer, xl_write_buffer + offset, size);
+				if (data_read)
+					*data_read = size;
+				goto unlock_and_return;
+			}
+
+			/* End part in the buffer: */
+			size_t tfer;
+
+			/* The amount that will be taken from the cache: */
+			tfer = (size_t) (xl_write_log_offset - log_offset);
+
+			/* The rest comes from the start of the write buffer: */
+			size -= tfer;
+			if (size > xl_write_buf_pos)
+				size = xl_write_buf_pos;
+
+			memcpy(buffer + tfer, xl_write_buffer, size);
+
+			xt_spinlock_unlock(&xl_buffer_lock);
+
+			/* Read the first part from the cache. The total is set
+			 * here, so the cache read gets no data_read pointer: */
+			if (data_read)
+				*data_read = tfer + size;
+			return xlog_read_from_cache(seq, log_id, log_offset, tfer, log_offset + tfer, buffer, NULL, thread);
+		}
+	}
+
+	/* Check the append buffer: */
+	if (log_id == xl_append_log_id) {
+		if (log_offset >= xl_append_log_offset && log_offset < xl_append_log_offset + (xtLogOffset) xl_append_buf_pos) {
+			/* It is in the append buffer: */
+			size_t offset = (size_t) (log_offset - xl_append_log_offset);
+
+			if (size > xl_append_buf_pos - offset)
+				size = xl_append_buf_pos - offset;
+
+			memcpy(buffer, xl_append_buffer + offset, size);
+			if (data_read)
+				*data_read = size;
+			goto unlock_and_return;
+		}
+	}
+
+	if (xl_append_log_id == 0) {
+		/* This catches the case that
+		 * the log has not yet been initialized
+		 * for writing.
+		 */
+		xt_spinlock_unlock(&xl_buffer_lock);
+		return xlog_read_from_cache(seq, log_id, log_offset, size, 0, buffer, data_read, thread);
+	}
+
+	/* The position is beyond the data in the log: */
+	if (data_read)
+		*data_read = 0;
+
+	unlock_and_return:
+	xt_spinlock_unlock(&xl_buffer_lock);
+	return OK;
+}
+
+/*
+ * Write 'size' bytes of 'data' to the log of 'seq', at the current
+ * record offset, using xt_xlog_write().
+ *
+ * On success the record offset of 'seq' and the count of log bytes
+ * written are both advanced by 'size', and TRUE is returned.
+ * On failure nothing is changed, and FALSE is returned.
+ */
+xtBool XTDatabaseLog::xlog_write_thru(XTXactSeqReadPtr seq, size_t size, xtWord1 *data, XTThreadPtr thread)
+{
+	if (xt_xlog_write(seq->xseq_log_file, seq->xseq_log_id, seq->xseq_rec_log_offset, size, data, thread)) {
+		seq->xseq_rec_log_offset += size;
+		xl_log_bytes_written += size;
+		return TRUE;
+	}
+	return FALSE;
+}
+
+xtBool XTDatabaseLog::xlog_verify(XTXactLogBufferDPtr record, size_t rec_size, xtLogID log_id)
+{
+	xtWord4		sum = 0;
+	xtOpSeqNo	op_seq;
+	xtTableID	tab_id;
+	xtRecordID	rec_id, free_rec_id;
+	int			check_size = 1;
+	xtWord1		*dptr;
+	xtWord4		g;
+
+	switch (record->xh.xh_status_1) {
+		case XT_LOG_ENT_HEADER:
+			if (record->xh.xh_checksum_1 != XT_CHECKSUM_1(log_id))
+				return FALSE;
+			if (XT_LOG_HEAD_MAGIC(record, rec_size) != XT_LOG_FILE_MAGIC)
+				return FALSE;
+			if (rec_size >= offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4) {
+				if (XT_GET_DISK_4(record->xh.xh_log_id_4) != log_id)
+					return FALSE;
+			}
+			return TRUE;
+		case XT_LOG_ENT_NEW_LOG:
+		case XT_LOG_ENT_DEL_LOG:
+			return record->xl.xl_checksum_1 == (XT_CHECKSUM_1(XT_GET_DISK_4(record->xl.xl_log_id_4)) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_NEW_TAB:
+			return record->xl.xl_checksum_1 == (XT_CHECKSUM_1(XT_GET_DISK_4(record->xt.xt_tab_id_4)) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_COMMIT:
+		case XT_LOG_ENT_ABORT:
+			sum = XT_CHECKSUM4_XACT(XT_GET_DISK_4(record->xe.xe_xact_id_4)) ^ XT_CHECKSUM4_XACT(XT_GET_DISK_4(record->xe.xe_not_used_4));
+			return record->xe.xe_checksum_1 == (XT_CHECKSUM_1(sum) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_CLEANUP:
+			sum = XT_CHECKSUM4_XACT(XT_GET_DISK_4(record->xc.xc_xact_id_4));
+			return record->xc.xc_checksum_1 == (XT_CHECKSUM_1(sum) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_DELETE_BG:
+			check_size = 2;
+			op_seq = XT_GET_DISK_4(record->xu.xu_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xu.xu_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xu.xu_rec_id_4);
+			dptr = &record->xu.xu_rec_type_1;
+			rec_size -= offsetof(XTactUpdateEntryDRec, xu_rec_type_1);
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			check_size = 2;
+			op_seq = XT_GET_DISK_4(record->xf.xf_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xf.xf_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xf.xf_rec_id_4);
+			free_rec_id = XT_GET_DISK_4(record->xf.xf_free_rec_id_4);
+			sum ^= XT_CHECKSUM4_REC(free_rec_id);
+			dptr = &record->xf.xf_rec_type_1;
+			rec_size -= offsetof(XTactUpdateFLEntryDRec, xf_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_FREED:
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			op_seq = XT_GET_DISK_4(record->fr.fr_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->fr.fr_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->fr.fr_rec_id_4);
+			dptr = &record->fr.fr_stat_id_1;
+			rec_size -= offsetof(XTactFreeRecEntryDRec, fr_stat_id_1);
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			check_size = 2;
+			op_seq = XT_GET_DISK_4(record->rb.rb_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->rb.rb_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->rb.rb_rec_id_4);
+			free_rec_id = (xtWord4) record->rb.rb_new_rec_type_1;
+			sum ^= XT_CHECKSUM4_REC(free_rec_id);
+			dptr = &record->rb.rb_rec_type_1;
+			rec_size -= offsetof(XTactRemoveBIEntryDRec, rb_rec_type_1);
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+		case XT_LOG_ENT_REC_CLEANED:
+		case XT_LOG_ENT_REC_CLEANED_1:
+		case XT_LOG_ENT_REC_UNLINKED:
+			op_seq = XT_GET_DISK_4(record->xw.xw_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xw.xw_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xw.xw_rec_id_4);
+			dptr = &record->xw.xw_rec_type_1;
+			rec_size -= offsetof(XTactWriteRecEntryDRec, xw_rec_type_1);
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+		case XT_LOG_ENT_ROW_NEW_FL:
+			op_seq = XT_GET_DISK_4(record->xa.xa_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->xa.xa_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->xa.xa_row_id_4);
+			if (record->xh.xh_status_1 == XT_LOG_ENT_ROW_NEW) {
+				dptr = (xtWord1 *) record + offsetof(XTactRowAddedEntryDRec, xa_free_list_4);
+				rec_size -= offsetof(XTactRowAddedEntryDRec, xa_free_list_4);
+			}
+			else {
+				free_rec_id = XT_GET_DISK_4(record->xa.xa_free_list_4);
+				sum ^= XT_CHECKSUM4_REC(free_rec_id);
+				dptr = (xtWord1 *) record + sizeof(XTactRowAddedEntryDRec);
+				rec_size -= sizeof(XTactRowAddedEntryDRec);
+			}
+			break;
+		case XT_LOG_ENT_ROW_ADD_REC:
+		case XT_LOG_ENT_ROW_SET:
+		case XT_LOG_ENT_ROW_FREED:
+			op_seq = XT_GET_DISK_4(record->wr.wr_op_seq_4);
+			tab_id = XT_GET_DISK_4(record->wr.wr_tab_id_4);
+			rec_id = XT_GET_DISK_4(record->wr.wr_row_id_4);
+			dptr = (xtWord1 *) &record->wr.wr_ref_id_4;
+			rec_size -= offsetof(XTactWriteRowEntryDRec, wr_ref_id_4);
+			break;
+		case XT_LOG_ENT_OP_SYNC:
+			return record->xl.xl_checksum_1 == (XT_CHECKSUM_1(XT_GET_DISK_4(record->os.os_time_4)) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_NO_OP:
+			sum = XT_GET_DISK_4(record->no.no_tab_id_4) ^ XT_GET_DISK_4(record->no.no_op_seq_4);
+			return record->xe.xe_checksum_1 == (XT_CHECKSUM_1(sum) ^ XT_CHECKSUM_1(log_id));
+		case XT_LOG_ENT_END_OF_LOG:
+			return FALSE;
+		case XT_LOG_ENT_PREPARE:
+			check_size = 2;
+			op_seq = XT_GET_DISK_4(record->xp.xp_xact_id_4);
+			tab_id = 0;
+			rec_id = 0;
+			dptr = record->xp.xp_xa_data;
+			rec_size -= offsetof(XTXactPrepareEntryDRec, xp_xa_data);
+			break;
+		default:
+			ASSERT_NS(FALSE);
+			return FALSE;
+	}
+
+	sum ^= (xtWord4) op_seq ^ ((xtWord4) tab_id << 8) ^ XT_CHECKSUM4_REC(rec_id);
+
+	if ((g = sum & 0xF0000000)) {
+		sum = sum ^ (g >> 24);
+		sum = sum ^ g;
+	}
+	for (u_int i=0; i<(u_int) rec_size; i++) {
+		sum = (sum << 4) + *dptr;
+		if ((g = sum & 0xF0000000)) {
+			sum = sum ^ (g >> 24);
+			sum = sum ^ g;
+		}
+		dptr++;
+	}
+
+	if (check_size == 1) {
+		if (record->xh.xh_checksum_1 != (XT_CHECKSUM_1(sum) ^ XT_CHECKSUM_1(log_id))) {
+			return FAILED;
+		}
+	}
+	else {
+		if (XT_GET_DISK_2(record->xu.xu_checksum_2) != (XT_CHECKSUM_2(sum) ^ XT_CHECKSUM_2(log_id))) {
+			return FAILED;
+		}
+	}
+	return TRUE;
+}
+
+xtBool XTDatabaseLog::xlog_seq_next(XTXactSeqReadPtr seq, XTXactLogBufferDPtr *ret_entry, xtBool verify, XTThreadPtr thread)
+{	/* Step the sequential reader `seq` past the current record and locate the next one.
+	 *
+	 * On return *ret_entry points into seq->xseq_buffer at the next record, or is
+	 * NULL if no complete, recognised record is available (nothing could be read,
+	 * an end-of-log entry or unknown status byte was found, the record is
+	 * truncated, or xlog_verify() rejected it).
+	 *
+	 * If `verify` is set, a record that is found completely in the buffer is
+	 * checked with xlog_verify() before it is returned.
+	 *
+	 * Returns FAILED only when xlog_rnd_read() fails, otherwise OK (also when
+	 * *ret_entry is NULL).
+	 */
+	XTXactLogBufferDPtr	record;
+	size_t				tfer;
+	size_t				len;
+	size_t				rec_offset;
+	size_t				max_rec_len;
+	size_t				size;
+	u_int				check_size = 1; /* NOTE(review): assigned in the switch below but never read in this function */
+
+	/* Go to the next record (xseq_record_len must be initialized
+	 * to 0 for this to work).
+	 */
+	seq->xseq_rec_log_offset += seq->xseq_record_len;
+	seq->xseq_record_len = 0;
+
+	if (seq->xseq_rec_log_offset < seq->xseq_buf_log_offset ||
+		seq->xseq_rec_log_offset >= seq->xseq_buf_log_offset + (xtLogOffset) seq->xseq_buffer_len) {
+		/* The current position is nowhere near the buffer, read data into the
+		 * buffer:
+		 */
+		tfer = seq->xseq_buffer_size;
+		if (!xlog_rnd_read(seq, seq->xseq_rec_log_id, seq->xseq_rec_log_offset, tfer, seq->xseq_buffer, &tfer, thread))
+			return FAILED;
+		seq->xseq_buf_log_offset = seq->xseq_rec_log_offset;
+		seq->xseq_buffer_len = tfer; /* tfer now holds the number of bytes actually read */
+
+		/* Should we go to the next log? */
+		if (!tfer) {
+			goto return_empty;
+		}
+	}
+
+	/* The start of the record is in the buffer: */
+	read_from_buffer:
+	rec_offset = (size_t) (seq->xseq_rec_log_offset - seq->xseq_buf_log_offset);
+	max_rec_len = seq->xseq_buffer_len - rec_offset; /* bytes available in the buffer from the record start */
+	size = 0; /* never changed afterwards; only used in the length check after read_more */
+
+	/* Check the type of record, and work out its total length in `len`: */
+	record = (XTXactLogBufferDPtr) (seq->xseq_buffer + rec_offset);
+	switch (record->xh.xh_status_1) {
+		case XT_LOG_ENT_HEADER:
+			len = sizeof(XTXactLogHeaderDRec);
+			break;
+		case XT_LOG_ENT_NEW_LOG:
+		case XT_LOG_ENT_DEL_LOG:
+			len = sizeof(XTXactNewLogEntryDRec);
+			break;
+		case XT_LOG_ENT_NEW_TAB:
+			len = sizeof(XTXactNewTabEntryDRec);
+			break;
+		case XT_LOG_ENT_COMMIT:
+		case XT_LOG_ENT_ABORT:
+			len = sizeof(XTXactEndEntryDRec);
+			break;
+		case XT_LOG_ENT_CLEANUP:
+			len = sizeof(XTXactCleanupEntryDRec);
+			break;
+		case XT_LOG_ENT_REC_MODIFIED:
+		case XT_LOG_ENT_UPDATE:
+		case XT_LOG_ENT_INSERT:
+		case XT_LOG_ENT_DELETE:
+		case XT_LOG_ENT_UPDATE_BG:
+		case XT_LOG_ENT_INSERT_BG:
+		case XT_LOG_ENT_DELETE_BG:
+			check_size = 2;
+			len = offsetof(XTactUpdateEntryDRec, xu_rec_type_1); /* fixed part; the variable part length is stored in xu_size_2 */
+			if (len > max_rec_len)
+				/* The size is not in the buffer: */
+				goto read_more;
+			len += (size_t) XT_GET_DISK_2(record->xu.xu_size_2);
+			break;
+		case XT_LOG_ENT_UPDATE_FL:
+		case XT_LOG_ENT_INSERT_FL:
+		case XT_LOG_ENT_DELETE_FL:
+		case XT_LOG_ENT_UPDATE_FL_BG:
+		case XT_LOG_ENT_INSERT_FL_BG:
+		case XT_LOG_ENT_DELETE_FL_BG:
+			check_size = 2;
+			len = offsetof(XTactUpdateFLEntryDRec, xf_rec_type_1);
+			if (len > max_rec_len)
+				/* The size is not in the buffer: */
+				goto read_more;
+			len += (size_t) XT_GET_DISK_2(record->xf.xf_size_2);
+			break;
+		case XT_LOG_ENT_REC_FREED:
+		case XT_LOG_ENT_REC_REMOVED:
+		case XT_LOG_ENT_REC_REMOVED_EXT:
+			/* [(7)] REMOVE is now a extended version of FREE! */
+			len = offsetof(XTactFreeRecEntryDRec, fr_rec_type_1) + sizeof(XTTabRecFreeDRec);
+			break;
+		case XT_LOG_ENT_REC_REMOVED_BI:
+			check_size = 2;
+			len = offsetof(XTactRemoveBIEntryDRec, rb_rec_type_1);
+			if (len > max_rec_len)
+				/* The size is not in the buffer: */
+				goto read_more;
+			len += (size_t) XT_GET_DISK_2(record->rb.rb_size_2);
+			break;
+		case XT_LOG_ENT_REC_MOVED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + 8;
+			break;
+		case XT_LOG_ENT_REC_CLEANED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			break;
+		case XT_LOG_ENT_REC_CLEANED_1:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + 1;
+			break;
+		case XT_LOG_ENT_REC_UNLINKED:
+			len = offsetof(XTactWriteRecEntryDRec, xw_rec_type_1) + offsetof(XTTabRecHeadDRec, tr_prev_rec_id_4) + XT_RECORD_ID_SIZE;
+			break;
+		case XT_LOG_ENT_ROW_NEW:
+			len = offsetof(XTactRowAddedEntryDRec, xa_row_id_4) + XT_ROW_ID_SIZE;
+			break;
+		case XT_LOG_ENT_ROW_NEW_FL:
+			len = offsetof(XTactRowAddedEntryDRec, xa_free_list_4) + XT_ROW_ID_SIZE;
+			break;
+		case XT_LOG_ENT_ROW_ADD_REC:
+		case XT_LOG_ENT_ROW_SET:
+		case XT_LOG_ENT_ROW_FREED:
+			len = offsetof(XTactWriteRowEntryDRec, wr_ref_id_4) + XT_REF_ID_SIZE;
+			break;
+		case XT_LOG_ENT_OP_SYNC:
+			len = sizeof(XTactOpSyncEntryDRec);
+			break;
+		case XT_LOG_ENT_NO_OP:
+			len = sizeof(XTactNoOpEntryDRec);
+			break;
+		case XT_LOG_ENT_END_OF_LOG: {
+			off_t eof = seq->xseq_log_eof, adjust;
+			
+			if (eof > seq->xseq_rec_log_offset) {
+				adjust = eof - seq->xseq_rec_log_offset;
+
+				seq->xseq_record_len = (size_t) adjust; /* so the next call starts at xseq_log_eof */
+			}
+			goto return_empty;
+		}
+		case XT_LOG_ENT_PREPARE:
+			check_size = 2;
+			len = offsetof(XTXactPrepareEntryDRec, xp_xa_data);
+			if (len > max_rec_len)
+				/* The size is not in the buffer: */
+				goto read_more;
+			len += (size_t) record->xp.xp_xa_len_1;
+			break;
+		default:
+			/* It is possible to land here after a crash, if the
+			 * log was not completely written.
+			 */
+			seq->xseq_record_len = 0;
+			goto return_empty;
+	}
+
+	ASSERT_NS(len <= seq->xseq_buffer_size);
+	if (len <= max_rec_len) {
+		if (verify) {
+			if (!xlog_verify(record, len, seq->xseq_rec_log_id)) {
+				goto return_empty;
+			}
+		}
+
+		/* The record is completely in the buffer: */
+		seq->xseq_record_len = len;
+		*ret_entry = record;
+		return OK;
+	}
+	
+	/* The record is partially in the buffer. Move what we have to the
+	 * start of the buffer, so that the rest can be appended.
+	 */
+	memmove(seq->xseq_buffer, seq->xseq_buffer + rec_offset, max_rec_len);
+	seq->xseq_buf_log_offset += rec_offset;
+	seq->xseq_buffer_len = max_rec_len;
+
+	/* Read the rest, as far as possible: */
+	tfer = seq->xseq_buffer_size - max_rec_len;
+	if (!xlog_rnd_read(seq, seq->xseq_rec_log_id, seq->xseq_buf_log_offset + max_rec_len, tfer, seq->xseq_buffer + max_rec_len, &tfer, thread))
+		return FAILED;
+	seq->xseq_buffer_len += tfer;
+
+	if (seq->xseq_buffer_len < len) {
+		/* A partial record is in the log, must be the end of the log: */
+		goto return_empty;
+	}
+
+	/* The record is now completely in the buffer.
+	 * NOTE(review): xlog_verify() is not called on this path, even when
+	 * `verify` is set -- confirm that this is intended.
+	 */
+	seq->xseq_record_len = len;
+	*ret_entry = (XTXactLogBufferDPtr) seq->xseq_buffer;
+	return OK;
+
+	read_more: /* reached when even the fixed part of a variable length record (len) is not completely in the buffer */
+	ASSERT_NS(len <= seq->xseq_buffer_size);
+	memmove(seq->xseq_buffer, seq->xseq_buffer + rec_offset, max_rec_len);
+	seq->xseq_buf_log_offset += rec_offset;
+	seq->xseq_buffer_len = max_rec_len;
+
+	/* Read the rest, as far as possible: */
+	tfer = seq->xseq_buffer_size - max_rec_len;
+	if (!xlog_rnd_read(seq, seq->xseq_rec_log_id, seq->xseq_buf_log_offset + max_rec_len, tfer, seq->xseq_buffer + max_rec_len, &tfer, thread))
+		return FAILED;
+	seq->xseq_buffer_len += tfer;
+
+	if (seq->xseq_buffer_len < len + size) {
+		/* We did not get as much as we need, return an empty record: */
+		goto return_empty;
+	}
+
+	goto read_from_buffer; /* parse the record again, now that the fixed part is available */
+
+	return_empty: /* "no record" is reported as OK with a NULL entry */
+	*ret_entry = NULL;
+	return OK;
+}
+
+void XTDatabaseLog::xlog_seq_skip(XTXactSeqReadPtr seq, size_t size)
+{
+	/* Grow the length of the current record by `size` bytes. xlog_seq_next()
+	 * adds xseq_record_len to the record offset, so the next call will
+	 * step over these bytes as well.
+	 */
+	seq->xseq_record_len = seq->xseq_record_len + size;
+}
+
+/* ----------------------------------------------------------------------
+ * W R I T E R    P R O C E S S
+ */
+
+/*
+ * The log has been written. Wake the writer to commit the
+ * data to disk, if the transaction log cache is full.
+ *
+ * Data may not be written to the database until it has been
+ * flushed to the log.
+ *
+ * This is because there is no way to undo changes to the
+ * database.
+ *
+ * However, I have discovered that writing constantly in the
+ * background can disturb the I/O in the foreground.
+ *
+ * So we can delay the writing of the database. But we should
+ * not delay it longer than we have transaction log cache.
+ *
+ * If so, the data that we need will fall out of the cache
+ * and we will have to read it again.
+ */
+static void xlog_wr_log_written(XTDatabaseHPtr db)
+{
+	xtWord8 pending_bytes;
+
+	/* Nothing to do unless db_wr_idle is set (non-zero). */
+	if (!db->db_wr_idle)
+		return;
+
+	/* Log data that has been written, but not yet read by the writer.
+	 * This is the data that may be about to fall out of the cache.
+	 */
+	pending_bytes = db->db_xlog.xl_log_bytes_written - db->db_xlog.xl_log_bytes_read;
+
+	/* The limit is 75%: */
+	if (pending_bytes < xt_xlog_cache.xlc_upper_limit)
+		return;
+
+	/* Limit reached, wake the writer: */
+	if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+		xt_log_and_clear_exception_ns();
+}
+
+#define XT_MORE_TO_WRITE		1	/* Values of `reason` in xlog_wr_wait_for_log_flush(); this one is the initial value. */
+#define XT_FREER_WAITING		2	/* db_wr_freeer_waiting was set when the writer woke up. */
+#define XT_NO_ACTIVITY			3	/* No activity was detected during the wait. */
+#define XT_LOG_CACHE_FULL		4	/* Unwritten log data has reached xt_xlog_cache.xlc_upper_limit. */
+#define XT_CHECKPOINT_REQ		5	/* A checkpoint is required or pending, but the writer position is not yet far enough. */
+#define XT_THREAD_WAITING		6	/* db_wr_thread_waiting was set when the writer woke up. */
+#define XT_TIME_TO_WRITE		7	/* Only assigned by code that is currently commented out. */
+
+/*
+ * Wait for a transaction to quit, i.e. the log to be flushed.
+ *
+ * Called by the writer thread (self) when it has caught up with the
+ * flush point it last recorded in db_wr_flush_point_log_id/offset.
+ * While the log flush position has not moved past that point, the
+ * writer sleeps on db_wr_cond in 500 ms steps, until one of the
+ * XT_... reasons above is found, or the thread is asked to quit.
+ *
+ * If the reason is a full log cache, "time to write" or a required
+ * checkpoint, and less than 2 MB is waiting to be written, then the
+ * log is flushed before returning, so that the writer has something
+ * to do.
+ */
+static void xlog_wr_wait_for_log_flush(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	xtXactID	last_xn_id;
+	xtWord8		cached_bytes;
+	int			reason = XT_MORE_TO_WRITE;
+
+#ifdef TRACE_WRITER_ACTIVITY
+	printf("WRITER --- DONE\n");
+#endif
+
+	xt_lock_mutex(self, &db->db_wr_lock);
+	pushr_(xt_unlock_mutex, &db->db_wr_lock);
+
+	/*
+	 * Wake the freeer if it is waiting for this writer, before
+	 * we go to sleep!
+	 */
+	if (db->db_wr_freeer_waiting) {
+		if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+			xt_log_and_clear_exception_ns();
+	}
+
+	if (db->db_wr_flush_point_log_id == db->db_xlog.xl_flush_log_id &&
+		db->db_wr_flush_point_log_offset == db->db_xlog.xl_flush_log_offset) {
+		/* Wake the checkpointer to flush the indexes:
+		 * PMC 15.05.2008 - Not doing this anymore!
+		xt_wake_checkpointer(self, db);
+		*/
+
+		/* Sleep as long as the flush point has not changed, from the last
+		 * target flush point.
+		 */
+		while (!self->t_quit &&
+			db->db_wr_flush_point_log_id == db->db_xlog.xl_flush_log_id &&
+			db->db_wr_flush_point_log_offset == db->db_xlog.xl_flush_log_offset &&
+			reason != XT_LOG_CACHE_FULL &&
+			reason != XT_TIME_TO_WRITE &&
+			reason != XT_CHECKPOINT_REQ) {
+
+			/*
+			 * Sleep as long as there is no reason to write any more...
+			 */
+			while (!self->t_quit) {
+				last_xn_id = db->db_xn_curr_id;
+				db->db_wr_idle = XT_THREAD_IDLE;
+				xt_timed_wait_cond(self, &db->db_wr_cond, &db->db_wr_lock, 500);
+				db->db_wr_idle = XT_THREAD_BUSY;
+				/* These are the reasons for doing work: */
+				/* The free'er thread is waiting for the writer: */
+				if (db->db_wr_freeer_waiting) {
+					reason = XT_FREER_WAITING;
+					break;
+				}
+				/* Some thread is waiting for the writer: */
+				if (db->db_wr_thread_waiting) {
+					reason = XT_THREAD_WAITING;
+					break;
+				}
+				/* Check if the cache will soon overflow... */
+				ASSERT(db->db_xlog.xl_log_bytes_written >= db->db_xlog.xl_log_bytes_read);
+				ASSERT(db->db_xlog.xl_log_bytes_written >= db->db_xlog.xl_log_bytes_flushed);
+				/* Sanity check: */
+				ASSERT(db->db_xlog.xl_log_bytes_written < db->db_xlog.xl_log_bytes_read + 500000000);
+				/* This is the amount of data still to be written: */
+				cached_bytes = db->db_xlog.xl_log_bytes_written - db->db_xlog.xl_log_bytes_read;
+				/* The limit is 75%: */
+				if (cached_bytes >= xt_xlog_cache.xlc_upper_limit) {
+					reason = XT_LOG_CACHE_FULL;
+					break;
+				}
+				
+				/* TODO: Create a system variable which specifies the write frequency. *//*
+				if (cached_bytes >= (12 * 1024 * 1024)) {
+					reason = XT_TIME_TO_WRITE;
+					break;
+				}
+				*/
+				
+				/* Check if we are holding up a checkpoint: */
+				if (db->db_restart.xres_cp_required ||
+					db->db_restart.xres_is_checkpoint_pending(db->db_xlog.xl_write_log_id, db->db_xlog.xl_write_log_offset)) {
+					/* Enough data has been flushed for a checkpoint: */
+					if (!db->db_restart.xres_is_checkpoint_pending(db->db_wr_log_id, db->db_wr_log_offset)) {
+						/* But not enough data has been written for a checkpoint: */
+						reason = XT_CHECKPOINT_REQ;
+						break;
+					}
+				}
+				/* There is no activity, if the current ID has not changed during
+				 * the wait, and the sweeper has nothing to do, and the checkpointer.
+				 */
+				if (db->db_xn_curr_id == last_xn_id &&
+					/* Changed xt_xn_get_curr_id(db) to db->db_xn_curr_id,
+					 * This should work because we are not concerned about the difference
+					 * between xt_xn_get_curr_id(db) and db->db_xn_curr_id,
+					 * Which is just a matter of when transactions we can expect ot find
+					 * in memory (see {GAP-INC-ADD-XACT})
+					 */
+					xt_xn_is_before(db->db_xn_curr_id, db->db_xn_to_clean_id) && // db->db_xn_curr_id < db->db_xn_to_clean_id
+					!db->db_restart.xres_is_checkpoint_pending(db->db_xlog.xl_write_log_id, db->db_xlog.xl_write_log_offset)) {
+					/* There seems to be no activity at the moment.
+					 * this might be a good time to write the log data.
+					 */
+					reason = XT_NO_ACTIVITY;
+					break;
+				}
+			}
+		}
+	}
+	freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+	if (reason == XT_LOG_CACHE_FULL || reason == XT_TIME_TO_WRITE || reason == XT_CHECKPOINT_REQ) {
+		/* Make sure that we have something to write.
+		 * The threshold is 2 MB (this was written as 2 * 1204 * 1024,
+		 * a transposition of 1024).
+		 */
+		if (db->db_xlog.xlog_bytes_to_write() < 2 * 1024 * 1024)
+			xt_xlog_flush_log(db, self);
+	}
+
+#ifdef TRACE_WRITER_ACTIVITY
+	switch (reason) {
+		case XT_MORE_TO_WRITE:	printf("WRITER --- still more to write...\n"); break;
+		case XT_FREER_WAITING:	printf("WRITER --- free'er waiting for writer...\n"); break;
+		case XT_NO_ACTIVITY:	printf("WRITER --- no activity...\n"); break;
+		case XT_LOG_CACHE_FULL:	printf("WRITER --- running out of log cache...\n"); break;
+		case XT_CHECKPOINT_REQ:	printf("WRITER --- enough flushed for a checkpoint...\n"); break;
+		case XT_THREAD_WAITING: printf("WRITER --- thread waiting for writer...\n"); break;
+		case XT_TIME_TO_WRITE:	printf("WRITER --- limit of 12MB reached, time to write...\n"); break;
+	}
+#endif
+}
+
+static void xlog_wr_could_go_faster(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	/* Only act if a speed-up has been requested (db_wr_faster). */
+	if (!db->db_wr_faster)
+		return;
+
+	/* Switch to normal priority, unless this was already done
+	 * (db_wr_fast remembers this).
+	 */
+	if (!db->db_wr_fast) {
+		xt_set_normal_priority(self);
+		db->db_wr_fast = TRUE;
+	}
+
+	/* The request has been handled: */
+	db->db_wr_faster = FALSE;
+}
+
+static void xlog_wr_could_go_slower(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	/* Stay as we are if we are not running fast, or if a new
+	 * speed-up request is pending.
+	 */
+	if (!db->db_wr_fast || db->db_wr_faster)
+		return;
+
+	/* Go back to low priority: */
+	xt_set_low_priority(self);
+	db->db_wr_fast = FALSE;
+}
+
+static void xlog_wr_main(XTThreadPtr self)
+{	/* Main loop of the writer thread.
+	 *
+	 * Reads the transaction log sequentially, starting at
+	 * db_wr_log_id/db_wr_log_offset, and applies the records to the
+	 * database up to the current log flush point. When the flush point
+	 * has been reached it calls xlog_wr_wait_for_log_flush(), and then
+	 * starts again. The loop ends when self->t_quit is set; errors are
+	 * raised with xt_throw().
+	 */
+	XTDatabaseHPtr		db = self->st_database;
+	XTWriterStatePtr	ws;
+	XTXactLogBufferDPtr	record;
+
+	xt_set_low_priority(self);
+
+	alloczr_(ws, xt_free_writer_state, sizeof(XTWriterStateRec), XTWriterStatePtr);
+	ws->ws_db = db;
+	ws->ws_in_recover = FALSE;
+
+	if (!db->db_xlog.xlog_seq_init(&ws->ws_seqread, xt_db_log_buffer_size, FALSE))
+		xt_throw(self);
+
+	if (!db->db_xlog.xlog_seq_start(&ws->ws_seqread, db->db_wr_log_id, db->db_wr_log_offset, FALSE))
+		xt_throw(self);
+
+	while (!self->t_quit) {
+		while (!self->t_quit) {
+			/* Determine the point to which we can write.
+			 * This is the current log flush point!
+			 */
+			xt_lock_mutex_ns(&db->db_wr_lock);
+			db->db_wr_flush_point_log_id = db->db_xlog.xl_flush_log_id;
+			db->db_wr_flush_point_log_offset = db->db_xlog.xl_flush_log_offset;
+			xt_unlock_mutex_ns(&db->db_wr_lock);
+
+			if (xt_comp_log_pos(db->db_wr_log_id, db->db_wr_log_offset, db->db_wr_flush_point_log_id, db->db_wr_flush_point_log_offset) >= 0) {
+				break; /* the writer position has reached the flush point */
+			}
+
+			while (!self->t_quit) {
+				xlog_wr_could_go_faster(self, db);
+
+				/* This is the restart position: */
+				xt_lock_mutex(self, &db->db_wr_lock);
+				pushr_(xt_unlock_mutex, &db->db_wr_lock);
+				db->db_wr_log_id = ws->ws_seqread.xseq_rec_log_id;
+				db->db_wr_log_offset = ws->ws_seqread.xseq_rec_log_offset +  ws->ws_seqread.xseq_record_len;
+				freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+				if (xt_comp_log_pos(db->db_wr_log_id, db->db_wr_log_offset, db->db_wr_flush_point_log_id, db->db_wr_flush_point_log_offset) >= 0) {
+					break;
+				}
+
+				/* Apply all changes that have been flushed to the log, to the
+				 * database.
+				 */
+				if (!db->db_xlog.xlog_seq_next(&ws->ws_seqread, &record, FALSE, self))
+					xt_throw(self);
+				if (!record) {
+					break; /* no complete record available at the moment */
+				}
+				switch (record->xl.xl_status_1) {
+					case XT_LOG_ENT_HEADER:
+						break;
+					case XT_LOG_ENT_NEW_LOG:
+						/* Continue reading at the start of the log named in the record: */
+						if (!db->db_xlog.xlog_seq_start(&ws->ws_seqread, XT_GET_DISK_4(record->xl.xl_log_id_4), 0, TRUE))
+							xt_throw(self);
+						break;
+					case XT_LOG_ENT_NEW_TAB:
+					case XT_LOG_ENT_COMMIT:
+					case XT_LOG_ENT_ABORT:
+					case XT_LOG_ENT_CLEANUP:
+					case XT_LOG_ENT_OP_SYNC:
+					case XT_LOG_ENT_PREPARE:
+						break; /* the writer does nothing for these entries */
+					case XT_LOG_ENT_DEL_LOG:
+						xtLogID log_id;
+
+						log_id = XT_GET_DISK_4(record->xl.xl_log_id_4);
+						xt_dl_set_to_delete(self, db, log_id);
+						break;
+					default:
+						/* All other entries are applied to the database: */
+						xt_xres_apply_in_order(self, ws, ws->ws_seqread.xseq_rec_log_id, ws->ws_seqread.xseq_rec_log_offset, record);
+						break;
+				}
+				/* Count the number of bytes read from the log: */
+				db->db_xlog.xl_log_bytes_read += ws->ws_seqread.xseq_record_len;
+			}
+		}
+
+		if (ws->ws_ot) {
+			xt_db_return_table_to_pool(self, ws->ws_ot);
+			ws->ws_ot = NULL;
+		}
+
+		xlog_wr_could_go_slower(self, db);
+
+		/* Note, we delay writing the database for a maximum of
+		 * 2 seconds.
+		 */
+		xlog_wr_wait_for_log_flush(self, db);
+	}
+
+	freer_(); // xt_free_writer_state(ws)
+}
+
+static void *xlog_wr_run_thread(XTThreadPtr self)
+{	/* Entry point of the writer thread (passed to xt_run_thread() by
+	 * xt_start_writer()). The database is taken from self->t_data.
+	 *
+	 * Runs xlog_wr_main() until self->t_quit is set. Whenever
+	 * xlog_wr_main() ends while t_quit is not set, the thread pauses
+	 * (10 seconds with DEBUG, otherwise 2 minutes) and then starts
+	 * xlog_wr_main() again. Always returns NULL.
+	 */
+	XTDatabaseHPtr	db = (XTDatabaseHPtr) self->t_data;
+	int				count;
+	void			*mysql_thread;
+
+	mysql_thread = myxt_create_thread(); /* NOTE(review): not used again in this function, the matching destroy call below is commented out */
+
+	while (!self->t_quit) {
+		try_(a) {
+			/*
+			 * The database must be in use while the writer is running.
+			 */
+			xt_use_database(self, db, XT_FOR_WRITER);
+
+			/* This action is both safe and required (see details elsewhere) */
+			xt_heap_release(self, self->st_database);
+
+			xlog_wr_main(self);
+		}
+		catch_(a) {
+			/* This error is "normal"! */
+			if (self->t_exception.e_xt_err != XT_ERR_NO_DICTIONARY &&
+				!(self->t_exception.e_xt_err == XT_SIGNAL_CAUGHT &&
+				self->t_exception.e_sys_err == SIGTERM))
+				xt_log_and_clear_exception(self);
+		}
+		cont_(a);
+
+		/* Avoid releasing the database (done above) */
+		self->st_database = NULL;
+		xt_unuse_database(self, self);
+
+		/* After an exception, pause before trying again... */
+		/* Number of seconds */
+#ifdef DEBUG
+		count = 10;
+#else
+		count = 2*60;
+#endif
+		db->db_wr_idle = XT_THREAD_INERR;
+		while (!self->t_quit && count > 0) { /* sleep in 1 second steps, so that t_quit is noticed quickly */
+			sleep(1);
+			count--;
+		}
+		db->db_wr_idle = XT_THREAD_BUSY;
+	}
+
+   /*
+	* {MYSQL-THREAD-KILL}
+	myxt_destroy_thread(mysql_thread, TRUE);
+	*/
+	return NULL;
+}
+
+static void xlog_wr_free_thread(XTThreadPtr self, void *data)
+{
+	XTDatabaseHPtr owner_db = (XTDatabaseHPtr) data;
+
+	/* Nothing to clear if the database has no writer thread set: */
+	if (!owner_db->db_wr_thread)
+		return;
+
+	/* Clear the reference to the writer thread, under the writer lock: */
+	xt_lock_mutex(self, &owner_db->db_wr_lock);
+	pushr_(xt_unlock_mutex, &owner_db->db_wr_lock);
+	owner_db->db_wr_thread = NULL;
+	freer_(); // xt_unlock_mutex(&owner_db->db_wr_lock)
+}
+
+xtPublic void xt_start_writer(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	char thread_name[PATH_MAX];
+
+	/* The thread is called "WR-" followed by the last directory of the
+	 * database path.
+	 * NOTE(review): sprintf() does not limit the length written to
+	 * thread_name -- confirm that the directory name always fits.
+	 */
+	sprintf(thread_name, "WR-%s", xt_last_directory_of_path(db->db_main_path));
+	xt_remove_dir_char(thread_name);
+
+	/* Create the daemon, attach the database to it (xlog_wr_free_thread
+	 * is registered together with the data), and start it:
+	 */
+	db->db_wr_thread = xt_create_daemon(self, thread_name);
+	xt_set_thread_data(db->db_wr_thread, db, xlog_wr_free_thread);
+	xt_run_thread(self, db->db_wr_thread, xlog_wr_run_thread);
+}
+
+/*
+ * This function is called on database shutdown.
+ * We wait a limited amount of time (16 seconds) for the writer to
+ * reach the flush point.
+ * If it takes too long, the wait is aborted!
+ */
+xtPublic void xt_wait_for_writer(XTThreadPtr self, XTDatabaseHPtr db)
+{
+	time_t	start_time, curr_time;
+	xtBool	announced = FALSE;
+
+	if (!db->db_wr_thread)
+		return;
+
+	start_time = time(NULL);
+	while (xt_comp_log_pos(db->db_wr_log_id, db->db_wr_log_offset, db->db_wr_flush_point_log_id, db->db_wr_flush_point_log_offset) < 0) {
+
+		/* Register as a waiting thread, and wake the writer so
+		 * that it can complete its work.
+		 */
+		xt_lock_mutex(self, &db->db_wr_lock);
+		pushr_(xt_unlock_mutex, &db->db_wr_lock);
+		db->db_wr_thread_waiting++;
+		if (db->db_wr_idle) {
+			if (!xt_broadcast_cond_ns(&db->db_wr_cond))
+				xt_log_and_clear_exception_ns();
+		}
+		freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+		/* Give the writer some time: */
+		xt_sleep_milli_second(10);
+
+		xt_lock_mutex(self, &db->db_wr_lock);
+		pushr_(xt_unlock_mutex, &db->db_wr_lock);
+		db->db_wr_thread_waiting--;
+		freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+		curr_time = time(NULL);
+		if (curr_time >= start_time + 16) {
+			/* Give up, and suppress the "done" message below: */
+			xt_logf(XT_NT_INFO, "Aborting wait for '%s' writer\n", db->db_name);
+			announced = FALSE;
+			break;
+		}
+		if (curr_time >= start_time + 2 && !announced) {
+			/* After 2 seconds, report (once) that we are waiting: */
+			announced = TRUE;
+			xt_logf(XT_NT_INFO, "Waiting for '%s' writer...\n", db->db_name);
+		}
+	}
+
+	if (announced)
+		xt_logf(XT_NT_INFO, "Writer '%s' done.\n", db->db_name);
+}
+
+xtPublic void xt_stop_writer(XTThreadPtr self, XTDatabaseHPtr db)
+{	/* Stop the writer thread of `db`, if there is one: mark the thread
+	 * for termination, wake it via db_wr_cond, wait for it to end, and
+	 * then clear db->db_wr_thread.
+	 */
+	XTThreadPtr thr_wr;
+
+	if (db->db_wr_thread) {
+		xt_lock_mutex(self, &db->db_wr_lock);
+		pushr_(xt_unlock_mutex, &db->db_wr_lock);
+
+		/* This pointer is safe as long as you have the transaction lock. */
+		if ((thr_wr = db->db_wr_thread)) { /* checked again, now that the lock is held */
+			xtThreadID tid = thr_wr->t_id; /* saved while the lock is held, used for the wait below */
+
+			/* Make sure the thread quits when woken up. */
+			xt_terminate_thread(self, thr_wr);
+
+			/* Wake the writer thread so that it will quit: */
+			xt_broadcast_cond(self, &db->db_wr_cond);
+	
+			freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+
+			/*
+			 * GOTCHA: This is a weird thing but the SIGTERM directed
+			 * at a particular thread (in this case the sweeper) was
+			 * being caught by a different thread and killing the server
+			 * sometimes. Disconcerting.
+			 * (this may only be a problem on Mac OS X)
+			xt_kill_thread(thread);
+			 */
+			xt_wait_for_thread(tid, FALSE); /* the lock has been released at this point */
+	
+			/* PMC - This should not be necessary to set the signal here, but in the
+			 * debugger the handler is not called!!?
+			thr_wr->t_delayed_signal = SIGTERM;
+			xt_kill_thread(thread);
+			 */
+			db->db_wr_thread = NULL;
+		}
+		else
+			freer_(); // xt_unlock_mutex(&db->db_wr_lock)
+	}
+}
+
+#ifdef NOT_USED
+static void xlog_add_to_flush_buffer(u_int flush_count, XTXLogBlockPtr *flush_buffer, XTXLogBlockPtr block)
+{
+	register u_int		lo = 0;
+	register u_int		hi = flush_count;
+	register u_int		mid;
+	register xtInt8		diff;
+
+	/* Binary search for the insert position of the block. The entries
+	 * of flush_buffer are ordered by xlb_address.
+	 */
+	while (lo < hi) {
+		mid = (lo + hi - 1) / 2;
+		diff = (xtInt8) block->xlb_address - (xtInt8) flush_buffer[mid]->xlb_address;
+		if (diff == 0) {
+			// A block with this address is already in the buffer.
+			// Should not happen...
+			ASSERT_NS(FALSE);
+			return;
+		}
+		if (diff < (xtInt8) 0)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	/* Make space at position lo, and insert the block there: */
+	memmove(&flush_buffer[lo + 1], &flush_buffer[lo], (flush_count - lo) * sizeof(XTXLogBlockPtr));
+	flush_buffer[lo] = block;
+}
+
+static XTXLogBlockPtr xlog_find_block(XTOpenFilePtr file, xtLogID log_id, off_t address, XTXLogCacheSegPtr *ret_seg)
+{	/* Search the log cache for the block of log `log_id` at `address`.
+	 *
+	 * If the block is found it is returned, *ret_seg is set to its
+	 * segment, and the lock of this segment (lcs_lock) is still held
+	 * on return. If the block is not found, or the wait for a block
+	 * that is being read fails, the lock is released and NULL is
+	 * returned. The `file` parameter is not used.
+	 */
+	register XTXLogCacheSegPtr	seg;
+	register XTXLogBlockPtr		block;
+	register u_int				hash_idx;
+	register XTXLogCacheRec		*dcg = &xt_xlog_cache;
+
+	seg = &dcg->xlc_segment[((u_int) address >> XT_XLC_BLOCK_SHIFTS) & XLC_SEGMENT_MASK];
+	hash_idx = (((u_int) (address >> (XT_XLC_SEGMENT_SHIFTS + XT_XLC_BLOCK_SHIFTS))) ^ (log_id << 16)) % dcg->xlc_hash_size;
+
+	xt_lock_mutex_ns(&seg->lcs_lock);
+	retry: /* the hash chain is searched again from the start after each wait */
+	block = seg->lcs_hash_table[hash_idx];
+	while (block) {
+		if (block->xlb_address == address && block->xlb_log_id == log_id) {
+			ASSERT_NS(block->xlb_state != XLC_BLOCK_FREE);
+
+			/* Wait if the block is being read or written.
+			 * If we will just read the data, then we don't care
+			 * if the buffer is being written.
+			 */
+			if (block->xlb_state == XLC_BLOCK_READING) {
+				if (!xt_timed_wait_cond_ns(&seg->lcs_cond, &seg->lcs_lock, 100))
+					break; /* the wait failed, handled like "not found" */
+				goto retry;
+			}
+
+			*ret_seg = seg;
+			return block; /* seg->lcs_lock is still held here */
+		}
+		block = block->xlb_next;
+	}
+	
+	/* Block not found: */
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+	return NULL;
+}
+
+static int xlog_cmp_log_files(struct XTThread *self, register const void *thunk, register const void *a, register const void *b)
+{
+#pragma unused(self, thunk)
+	/* Compare a log ID key (a) with the ID of a log file entry (b).
+	 * Returns -1, 0 or 1 if the key is smaller, equal or larger.
+	 */
+	xtLogID				key_id = *((xtLogID *) a);
+	XTXactLogFilePtr	entry = (XTXactLogFilePtr) b;
+
+	if (key_id == entry->lf_log_id)
+		return 0;
+	return (key_id < entry->lf_log_id) ? -1 : 1;
+}
+
+#endif
+
+
+#ifdef OLD_CODE
+static xtBool xlog_free_lru_blocks()
+{	/* Move the least recently used block of the log cache to the free
+	 * list and, while xlc_free_count is below XT_XLC_MAX_FREE_COUNT,
+	 * try to free further blocks that belong to the same segment.
+	 *
+	 * Two locks are used: the lock of the segment of the block
+	 * (lcs_lock) and the global cache lock (xlc_lock), which is taken
+	 * when the first block is freed (have_global_lock).
+	 *
+	 * Returns OK if there is nothing to free or when done, and FAILED
+	 * if the wait for a block that is being read fails.
+	 */
+	XTXLogBlockPtr		block, pblock;
+	xtWord4				ru_time;
+	xtLogID				log_id;
+	off_t				address;
+	//off_t				hash;
+	XTXLogCacheSegPtr	seg;
+	u_int				hash_idx;
+	xtBool				have_global_lock = FALSE;
+
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	retry:
+	if (!(block = xt_xlog_cache.xlc_lru_block))
+		return OK;
+
+	/* Remember the identity of the block, it is checked again below, after the segment lock has been taken: */
+	ru_time = block->xlb_ru_time;
+	log_id = block->xlb_log_id;
+	address = block->xlb_address;
+
+	/*
+	hash = (address >> XT_XLC_BLOCK_SHIFTS) ^ ((off_t) log_id << 28);
+	seg = &xt_xlog_cache.xlc_segment[hash & XLC_SEGMENT_MASK];
+	hash_idx = (hash >> XT_XLC_SEGMENT_SHIFTS) % xt_xlog_cache.xlc_hash_size;
+	*/
+	seg = &xt_xlog_cache.xlc_segment[((u_int) address >> XT_XLC_BLOCK_SHIFTS) & XLC_SEGMENT_MASK];
+	hash_idx = (((u_int) (address >> (XT_XLC_SEGMENT_SHIFTS + XT_XLC_BLOCK_SHIFTS))) ^ (log_id << 16)) % xt_xlog_cache.xlc_hash_size;
+
+	xt_lock_mutex_ns(&seg->lcs_lock);
+
+	free_more: /* find the block (log_id, address) in the hash chain; pblock follows one entry behind */
+	pblock = NULL;
+	block = seg->lcs_hash_table[hash_idx];
+	while (block) {
+		if (block->xlb_address == address && block->xlb_log_id == log_id) {
+			ASSERT_NS(block->xlb_state != XLC_BLOCK_FREE);
+			
+			/* Try again if the block has been used in the meantime: */
+			if (ru_time != block->xlb_ru_time) {
+				if (have_global_lock)
+					/* Having this lock means we have already freed at least one block so
+					 * don't bother to free more if we are having trouble.
+					 */
+					goto done_ok;
+
+				/* If the recently used time has changed, then the
+				 * block is probably no longer the LR used.
+				 */
+				xt_unlock_mutex_ns(&seg->lcs_lock);
+				goto retry;
+			}
+
+			/* Wait if the block is being read: */
+			if (block->xlb_state == XLC_BLOCK_READING) {
+				if (have_global_lock)
+					goto done_ok;
+
+				/* Wait for the block to be read, then try again. */
+				if (!xt_timed_wait_cond_ns(&seg->lcs_cond, &seg->lcs_lock, 100))
+					goto failed; /* only reached without the global lock */
+				xt_unlock_mutex_ns(&seg->lcs_lock);
+				goto retry;
+			}
+			
+			goto free_the_block;
+		}
+		pblock = block;
+		block = block->xlb_next;
+	}
+
+	if (have_global_lock) {
+		xt_unlock_mutex_ns(&xt_xlog_cache.xlc_lock);
+		have_global_lock = FALSE;
+	}
+
+	/* We did not find the block, someone else freed it... */
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+	goto retry;
+
+	free_the_block:
+	ASSERT_NS(block->xlb_state == XLC_BLOCK_CLEAN);
+
+	/* Remove from the hash table: */
+	if (pblock)
+		pblock->xlb_next = block->xlb_next;
+	else
+		seg->lcs_hash_table[hash_idx] = block->xlb_next;
+
+	/* Now free the block */
+	if (!have_global_lock) {
+		xt_lock_mutex_ns(&xt_xlog_cache.xlc_lock);
+		have_global_lock = TRUE;
+	}
+
+	/* Remove from the MRU list: */
+	if (xt_xlog_cache.xlc_lru_block == block)
+		xt_xlog_cache.xlc_lru_block = block->xlb_mr_used;
+	if (xt_xlog_cache.xlc_mru_block == block)
+		xt_xlog_cache.xlc_mru_block = block->xlb_lr_used;
+	if (block->xlb_lr_used)
+		block->xlb_lr_used->xlb_mr_used = block->xlb_mr_used;
+	if (block->xlb_mr_used)
+		block->xlb_mr_used->xlb_lr_used = block->xlb_lr_used;
+
+	/* Put the block on the free list: */
+	block->xlb_next = xt_xlog_cache.xlc_free_list;
+	xt_xlog_cache.xlc_free_list = block;
+	xt_xlog_cache.xlc_free_count++;
+	block->xlb_state = XLC_BLOCK_FREE;
+
+	if (xt_xlog_cache.xlc_free_count < XT_XLC_MAX_FREE_COUNT) {
+		/* Now that we have all the locks, try to free some more in this segment: */
+		block = block->xlb_mr_used; /* xlb_mr_used of the freed block was not changed above */
+		for (u_int i=0; block && i<XLC_SEGMENT_COUNT; i++) {
+			ru_time = block->xlb_ru_time;
+			log_id = block->xlb_log_id;
+			address = block->xlb_address;
+
+			if (seg == &xt_xlog_cache.xlc_segment[((u_int) address >> XT_XLC_BLOCK_SHIFTS) & XLC_SEGMENT_MASK]) {
+				/* This block is in the segment that we have locked, free it as well: */
+				hash_idx = (((u_int) (address >> (XT_XLC_SEGMENT_SHIFTS + XT_XLC_BLOCK_SHIFTS))) ^ (log_id << 16)) % xt_xlog_cache.xlc_hash_size;
+				goto free_more;
+			}
+			block = block->xlb_mr_used;
+		}
+	}
+
+	done_ok: /* both locks are held when we get here */
+	xt_unlock_mutex_ns(&xt_xlog_cache.xlc_lock);
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	return OK;
+	
+	failed: /* only the segment lock is held when we get here */
+	xt_unlock_mutex_ns(&seg->lcs_lock);
+#ifdef DEBUG_CHECK_CACHE
+	//xt_xlog_check_cache();
+#endif
+	return FAILED;
+}
+
+#endif
diff --git a/storage/pbxt/src/xactlog_xt.h b/storage/pbxt/src/xactlog_xt.h
new file mode 100644
index 00000000000..f72b810d66b
--- /dev/null
+++ b/storage/pbxt/src/xactlog_xt.h
@@ -0,0 +1,486 @@
+/* Copyright (c) 2007 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2007-10-31	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * The transaction log: the on-disk log entry formats, the log block
+ * cache, and the per-database log (XTDatabaseLog).
+ */
+
+#ifndef __xactlog_xt_h__
+#define __xactlog_xt_h__
+
+#include "pthread_xt.h"
+#include "filesys_xt.h"
+#include "sortedlist_xt.h"
+
+struct XTThread;
+struct XTOpenTable;
+struct XTDatabase;
+
+#ifdef DEBUG
+//#define XT_USE_CACHE_DEBUG_SIZES
+#endif
+
+#ifdef XT_USE_CACHE_DEBUG_SIZES
+#define XT_XLC_BLOCK_SHIFTS			5
+#define XT_XLC_FILE_SLOTS			7
+#define XT_XLC_SEGMENT_SHIFTS		1
+#define XT_XLC_MAX_FLUSH_SEG_COUNT	10
+#define XT_XLC_MAX_FREE_COUNT		10
+#else
+/* Block size is determined by the number of shifts 1 << 15 = 32K */
+#define XT_XLC_BLOCK_SHIFTS			15
+#define XT_XLC_FILE_SLOTS			71
+/* The number of segments are determined by the segment shifts 1 << 3 = 8 */
+#define XT_XLC_SEGMENT_SHIFTS		3
+#define XT_XLC_MAX_FLUSH_SEG_COUNT	250
+#define XT_XLC_MAX_FREE_COUNT		100
+#endif
+
+#define XT_XLC_BLOCK_SIZE			(1 << XT_XLC_BLOCK_SHIFTS)
+#define XT_XLC_BLOCK_MASK			(XT_XLC_BLOCK_SIZE - 1)
+
+/* Elapsed time between two readings of a wrapping 32-bit clock. Unsigned
+ * subtraction is modulo 2^32, so a wrapped 'now' needs no special case. */
+#define XT_TIME_DIFF(start, now) \
+	((xtWord4) ((xtWord4) (now) - (xtWord4) (start)))
+
+#define XLC_SEGMENT_COUNT			((off_t) 1 << XT_XLC_SEGMENT_SHIFTS)
+#define XLC_SEGMENT_MASK			(XLC_SEGMENT_COUNT - 1)
+#define XLC_MAX_FLUSH_COUNT			(XT_XLC_MAX_FLUSH_SEG_COUNT * XLC_SEGMENT_COUNT)
+
+#define XLC_BLOCK_FREE				0
+#define XLC_BLOCK_READING			1
+#define XLC_BLOCK_CLEAN				2
+
+#define XT_RECYCLE_LOGS				0
+#define XT_DELETE_LOGS				1
+#define XT_KEEP_LOGS				2
+
+#define XT_XLOG_NO_WRITE_NO_FLUSH	0
+#define XT_XLOG_WRITE_AND_FLUSH		1
+#define XT_XLOG_WRITE_AND_NO_FLUSH	2
+
+/* LOG CACHE ---------------------------------------------------- */
+
+typedef struct XTXLogBlock {
+	off_t					xlb_address;					/* The block address. */
+	xtLogID					xlb_log_id;						/* The log id of the block. */
+	xtWord4					xlb_state;						/* Block status (presumably one of the XLC_BLOCK_* values, e.g. XLC_BLOCK_FREE). */
+	struct XTXLogBlock		*xlb_next;						/* Pointer to next block on hash list, or next free block on free list. */
+	xtWord1					xlb_data[XT_XLC_BLOCK_SIZE];	/* The block's data area: XT_XLC_BLOCK_SIZE bytes. */
+} XTXLogBlockRec, *XTXLogBlockPtr;
+
+/* A disk cache segment. The cache is divided into a number of segments
+ * to improve concurrency.
+ */
+typedef struct XTXLogCacheSeg {
+	xt_mutex_type			lcs_lock;						/* The cache segment lock. */
+	xt_cond_type			lcs_cond;						/* Segment condition variable (NOTE(review): its waiters are not visible here). */
+	XTXLogBlockPtr			*lcs_hash_table;				/* Array of hash chains of blocks; chains are linked through xlb_next. */
+} XTXLogCacheSegRec, *XTXLogCacheSegPtr;
+
+typedef struct XTXLogCache {
+	xt_mutex_type			xlc_lock;						/* The public cache lock. */
+	xt_cond_type			xlc_cond;						/* The public cache wait condition. */
+	XTXLogCacheSegRec		xlc_segment[XLC_SEGMENT_COUNT];	/* The cache segments, each with its own lock and hash table. */
+	XTXLogBlockPtr			xlc_blocks;						/* NOTE(review): presumably the start of the block array - confirm. */
+	XTXLogBlockPtr			xlc_blocks_end;					/* NOTE(review): presumably the end of the block array - confirm. */
+	XTXLogBlockPtr			xlc_next_to_free;
+	xtWord4					xlc_free_count;
+	xtWord4					xlc_hash_size;					/* Modulus used to pick a slot in a segment's lcs_hash_table. */
+	xtWord4					xlc_block_count;
+	xtWord8					xlc_upper_limit;
+} XTXLogCacheRec;
+
+/* LOG ENTRIES ---------------------------------------------------- */
+
+#define XT_LOG_ENT_EOF				0
+#define XT_LOG_ENT_HEADER			1
+#define XT_LOG_ENT_NEW_LOG			2					/* Move to the next log! NOTE!! May not appear in a group!! */
+#define XT_LOG_ENT_DEL_LOG			3					/* Delete the given transaction/data log. */
+#define XT_LOG_ENT_NEW_TAB			4					/* This record indicates a new table was created. */
+
+#define XT_LOG_ENT_COMMIT			5					/* Transaction was committed. */
+#define XT_LOG_ENT_ABORT			6					/* Transaction was aborted. */
+#define XT_LOG_ENT_CLEANUP			7					/* Written after a cleanup. */
+
+#define XT_LOG_ENT_REC_MODIFIED		8					/* This record has been modified by the transaction. */
+#define XT_LOG_ENT_UPDATE			9
+#define XT_LOG_ENT_UPDATE_BG		10
+#define XT_LOG_ENT_UPDATE_FL		11
+#define XT_LOG_ENT_UPDATE_FL_BG		12
+#define XT_LOG_ENT_INSERT			13
+#define XT_LOG_ENT_INSERT_BG		14
+#define XT_LOG_ENT_INSERT_FL		15
+#define XT_LOG_ENT_INSERT_FL_BG		16
+#define XT_LOG_ENT_DELETE			17
+#define XT_LOG_ENT_DELETE_BG		18
+#define XT_LOG_ENT_DELETE_FL		19
+#define XT_LOG_ENT_DELETE_FL_BG		20
+
+#define XT_LOG_ENT_REC_FREED		21					/* This record has been placed in the free list. */
+#define XT_LOG_ENT_REC_REMOVED		22					/* Free record and dependencies: index references, blob references. */
+#define XT_LOG_ENT_REC_REMOVED_EXT	23					/* Free record and dependencies: index references, extended data, blob references. */
+#define XT_LOG_ENT_REC_REMOVED_BI	38					/* Free record and dependencies: includes before image of record, for freeing index, etc. */
+
+#define XT_LOG_ENT_REC_MOVED		24					/* The record has been moved by the compactor. */
+#define XT_LOG_ENT_REC_CLEANED		25					/* This record has been cleaned by the sweeper. */
+#define XT_LOG_ENT_REC_CLEANED_1	26					/* This record has been cleaned by the sweeper (short form). */
+#define XT_LOG_ENT_REC_UNLINKED		27					/* The record after this record is unlinked from the variation list. */
+
+#define XT_LOG_ENT_ROW_NEW			28					/* Row allocated from the EOF. */
+#define XT_LOG_ENT_ROW_NEW_FL		29					/* Row allocated from the free list. */
+#define XT_LOG_ENT_ROW_ADD_REC		30					/* Record added to the row. */
+#define XT_LOG_ENT_ROW_SET			31
+#define XT_LOG_ENT_ROW_FREED		32
+
+#define XT_LOG_ENT_OP_SYNC			33					/* Operations synchronised. */
+#define XT_LOG_ENT_EXT_REC_OK		34					/* An extended record */
+#define XT_LOG_ENT_EXT_REC_DEL		35					/* A deleted extended record */
+
+#define XT_LOG_ENT_NO_OP			36					/* If write to the database fails, we still try to log the
+														 * op code, in an attempt to continue, if writing to log
+														 * still works.
+														 */
+#define XT_LOG_ENT_END_OF_LOG		37					/* This is a record that indicates the end of the log, and
+														 * fills to the end of a 512 byte block.
+														 */
+#define XT_LOG_ENT_PREPARE			39					/* XA prepare log entry. */
+
+#define XT_LOG_FILE_MAGIC			0xAE88FE12
+#define XT_LOG_VERSION_NO			1
+
+typedef struct XTXactLogHeader {
+	xtWord1					xh_status_1;		/* XT_LOG_ENT_HEADER */
+	xtWord1					xh_checksum_1;		
+	XTDiskValue4			xh_size_4;			/* Must be set to sizeof(XTXactLogHeaderDRec). */
+	XTDiskValue8			xh_free_space_8;	/* The accumulated free space in this file. */
+	XTDiskValue8			xh_file_len_8;		/* The last confirmed correct file length (always set on close). */
+	XTDiskValue8			xh_comp_pos_8;		/* Compaction position (XT_DL_STATUS_CO_SOURCE only). */
+	xtWord1					xh_comp_stat_1;		/* The compaction status XT_DL_STATUS_CO_SOURCE/XT_DL_STATUS_CO_TARGET */
+	XTDiskValue4			xh_log_id_4;		/* Last field of the original header, see XT_MIN_LOG_HEAD_SIZE. */
+	XTDiskValue2			xh_version_2;		/* XT_LOG_VERSION_NO; 2 bytes on disk, matching the _2 suffix. */
+	XTDiskValue4			xh_magic_4;			/* MUST always be at the end of the structure!! */
+} XTXactLogHeaderDRec, *XTXactLogHeaderDPtr;
+
+/* This is the original log head size (don't change): */
+#define XT_MIN_LOG_HEAD_SIZE		(offsetof(XTXactLogHeaderDRec, xh_log_id_4) + 4)
+#define XT_LOG_HEAD_MAGIC(b, l)		XT_GET_DISK_4(((xtWord1 *) (b)) + (l) - 4)
+
+typedef struct XTXactNewLogEntry {
+	xtWord1					xl_status_1;		/* XT_LOG_ENT_NEW_LOG, XT_LOG_ENT_DEL_LOG */
+	xtWord1					xl_checksum_1;		
+	XTDiskValue4			xl_log_id_4;		/* The ID of the log this entry refers to (a log ID, not a table ID). */
+} XTXactNewLogEntryDRec, *XTXactNewLogEntryDPtr;
+
+typedef struct XTXactNewTabEntry {
+	xtWord1					xt_status_1;		/* XT_LOG_ENT_NEW_TAB */
+	xtWord1					xt_checksum_1;		
+	XTDiskValue4			xt_tab_id_4;		/* Store the current table ID. */
+} XTXactNewTabEntryDRec, *XTXactNewTabEntryDPtr;
+
+/* This record must appear in a transaction group, and therefore has no transaction ID: */
+typedef struct XTXactEndEntry {
+	xtWord1					xe_status_1;		/* XT_LOG_ENT_COMMIT, XT_LOG_ENT_ABORT */
+	xtWord1					xe_checksum_1;		
+	XTDiskValue4			xe_xact_id_4;		/* The transaction. */
+	XTDiskValue4			xe_not_used_4;		/* Was the end sequence number (no longer used - v1.0.04+), set to zero). */
+} XTXactEndEntryDRec, *XTXactEndEntryDPtr;
+
+typedef struct XTXactPrepareEntry {
+	xtWord1					xp_status_1;		/* XT_LOG_ENT_PREPARE */
+	XTDiskValue2			xp_checksum_2;		
+	XTDiskValue4			xp_xact_id_4;		/* The transaction. */
+	xtWord1					xp_xa_len_1;		/* The length of the XA data. */
+	xtWord1					xp_xa_data[XT_MAX_XA_DATA_SIZE];
+} XTXactPrepareEntryDRec, *XTXactPrepareEntryDPtr;
+
+typedef struct XTXactCleanupEntry {
+	xtWord1					xc_status_1;		/* XT_LOG_ENT_CLEANUP */
+	xtWord1					xc_checksum_1;		
+	XTDiskValue4			xc_xact_id_4;		/* The transaction that was cleaned up. */
+} XTXactCleanupEntryDRec, *XTXactCleanupEntryDPtr;
+
+typedef struct XTactUpdateEntry {
+	xtWord1					xu_status_1;		/* XT_LOG_ENT_REC_MODIFIED, XT_LOG_ENT_UPDATE, XT_LOG_ENT_INSERT, XT_LOG_ENT_DELETE */
+												/* XT_LOG_ENT_UPDATE_BG, XT_LOG_ENT_INSERT_BG, XT_LOG_ENT_DELETE_BG */
+	XTDiskValue2			xu_checksum_2;		
+	XTDiskValue4			xu_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			xu_tab_id_4;		/* Table ID of the record. */
+	xtDiskRecordID4			xu_rec_id_4;		/* Offset of the new updated record. */
+	XTDiskValue2			xu_size_2;			/* Size of the record data. */
+	/* This is the start of the actual record data: */
+	xtWord1					xu_rec_type_1;		/* Type of the record. */
+	xtWord1					xu_stat_id_1;
+	xtDiskRecordID4			xu_prev_rec_id_4;		/* The previous variation of this record. */
+	XTDiskValue4			xu_xact_id_4;		/* The transaction ID. */
+	XTDiskValue4			xu_row_id_4;		/* The row ID of this record. */
+} XTactUpdateEntryDRec, *XTactUpdateEntryDPtr;
+
+typedef struct XTactUpdateFLEntry {
+	xtWord1					xf_status_1;		/* XT_LOG_ENT_UPDATE_FL, XT_LOG_ENT_INSERT_FL, XT_LOG_ENT_DELETE_FL */
+												/* XT_LOG_ENT_UPDATE_FL_BG, XT_LOG_ENT_INSERT_FL_BG, XT_LOG_ENT_DELETE_FL_BG */
+	XTDiskValue2			xf_checksum_2;		
+	XTDiskValue4			xf_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			xf_tab_id_4;		/* Table ID of the record. */
+	xtDiskRecordID4			xf_rec_id_4;		/* Offset of the new updated record. */
+	XTDiskValue2			xf_size_2;			/* Size of the record data. */
+	xtDiskRecordID4			xf_free_rec_id_4;	/* Update to the free list. */
+	/* This is the start of the actual record data: */
+	xtWord1					xf_rec_type_1;		/* Type of the record. */
+	xtWord1					xf_stat_id_1;
+	xtDiskRecordID4			xf_prev_rec_id_4;	/* The previous variation of this record. */
+	XTDiskValue4			xf_xact_id_4;		/* The transaction ID. */
+	XTDiskValue4			xf_row_id_4;		/* The row ID of this record. */
+} XTactUpdateFLEntryDRec, *XTactUpdateFLEntryDPtr;
+
+typedef struct XTactFreeRecEntry {
+	xtWord1					fr_status_1;		/* XT_LOG_ENT_REC_REMOVED, XT_LOG_ENT_REC_REMOVED_EXT, XT_LOG_ENT_REC_FREED */
+	xtWord1					fr_checksum_1;		
+	XTDiskValue4			fr_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			fr_tab_id_4;		/* Table ID of the record. */
+	xtDiskRecordID4			fr_rec_id_4;		/* Offset of the new written record. */
+	/* This data confirms the record state for release of
+	 * attached resources (extended records, indexes and blobs)
+	 */
+	xtWord1					fr_stat_id_1;		/* The statement ID of the record. */
+	XTDiskValue4			fr_xact_id_4;		/* The transaction ID of the record. */
+	/* This is the start of the actual record data: */
+	xtWord1					fr_rec_type_1;
+	xtWord1					fr_not_used_1;
+	xtDiskRecordID4			fr_next_rec_id_4;	/* The next block on the free list. */
+} XTactFreeRecEntryDRec, *XTactFreeRecEntryDPtr;
+
+typedef struct XTactRemoveBIEntry {
+	xtWord1					rb_status_1;		/* XT_LOG_ENT_REC_REMOVED_BI */
+	XTDiskValue2			rb_checksum_2;		
+	XTDiskValue4			rb_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			rb_tab_id_4;		/* Table ID of the record. */
+	xtDiskRecordID4			rb_rec_id_4;		/* Offset of the new written record. */
+	XTDiskValue2			rb_size_2;			/* Size of the record data. */
+
+	xtWord1					rb_new_rec_type_1;	/* New type of the record (needed for below). */
+
+	/* This is the start of the record data, with some fields overwritten for the free: */
+	xtWord1					rb_rec_type_1;		/* Type of the record. */
+	xtWord1					rb_stat_id_1;
+	xtDiskRecordID4			rb_next_rec_id_4;	/* The next block on the free list (overwritten). */
+	XTDiskValue4			rb_xact_id_4;		/* The transaction ID. */
+	XTDiskValue4			rb_row_id_4;		/* The row ID of this record. */
+} XTactRemoveBIEntryDRec, *XTactRemoveBIEntryDPtr;
+
+typedef struct XTactWriteRecEntry {
+	xtWord1					xw_status_1;		/* XT_LOG_ENT_REC_MOVED, XT_LOG_ENT_REC_CLEANED, XT_LOG_ENT_REC_CLEANED_1,
+												 * XT_LOG_ENT_REC_UNLINKED */
+	xtWord1					xw_checksum_1;		
+	XTDiskValue4			xw_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			xw_tab_id_4;		/* Table ID of the record. */
+	xtDiskRecordID4			xw_rec_id_4;		/* Offset of the new written record. */
+	/* This is the start of the actual record data: */
+	xtWord1					xw_rec_type_1;
+	xtWord1					xw_stat_id_1;
+	xtDiskRecordID4			xw_next_rec_id_4;	/* The next block on the free list. */
+} XTactWriteRecEntryDRec, *XTactWriteRecEntryDPtr;
+
+typedef struct XTactRowAddedEntry {
+	xtWord1					xa_status_1;		/* XT_LOG_ENT_ROW_NEW or XT_LOG_ENT_ROW_NEW_FL */
+	xtWord1					xa_checksum_1;		
+	XTDiskValue4			xa_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			xa_tab_id_4;		/* Table ID of the record. */
+	XTDiskValue4			xa_row_id_4;		/* The row ID of the row allocated. */
+	XTDiskValue4			xa_free_list_4;		/* Change to the free list (ONLY for XT_LOG_ENT_ROW_NEW_FL). */
+} XTactRowAddedEntryDRec, *XTactRowAddedEntryDPtr;
+
+typedef struct XTactWriteRowEntry {
+	xtWord1					wr_status_1;		/* XT_LOG_ENT_ROW_ADD_REC, XT_LOG_ENT_ROW_SET, XT_LOG_ENT_ROW_FREED */
+	xtWord1					wr_checksum_1;		
+	XTDiskValue4			wr_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			wr_tab_id_4;		/* Table ID of the record. */
+	XTDiskValue4			wr_row_id_4;		/* Row ID of the row that was modified. */
+	/* This is the start of the actual record data: */
+	XTDiskValue4			wr_ref_id_4;		/* The row reference data. */
+} XTactWriteRowEntryDRec, *XTactWriteRowEntryDPtr;
+
+typedef struct XTactOpSyncEntry {
+	xtWord1					os_status_1;		/* XT_LOG_ENT_OP_SYNC  */
+	xtWord1					os_checksum_1;		
+	XTDiskValue4			os_time_4;			/* Time of the restart. */
+} XTactOpSyncEntryDRec, *XTactOpSyncEntryDPtr;
+
+typedef struct XTactNoOpEntry {
+	xtWord1					no_status_1;		/* XT_LOG_ENT_NO_OP */
+	xtWord1					no_checksum_1;		
+	XTDiskValue4			no_op_seq_4;		/* Operation sequence number. */
+	XTDiskValue4			no_tab_id_4;		/* Table ID of the record. */
+} XTactNoOpEntryDRec, *XTactNoOpEntryDPtr;
+
+typedef struct XTactExtRecEntry {
+	xtWord1					er_status_1;		/* XT_LOG_ENT_EXT_REC_OK, XT_LOG_ENT_EXT_REC_DEL */
+	XTDiskValue4			er_data_size_4;		/* Size of this record data area only. */
+	XTDiskValue4			er_tab_id_4;		/* The table referencing this extended record. */
+	xtDiskRecordID4			er_rec_id_4;		/* The ID of the reference record. */
+	xtWord1					er_data[XT_VAR_LENGTH];
+} XTactExtRecEntryDRec, *XTactExtRecEntryDPtr;
+
+typedef union XTXactLogBuffer {
+	XTXactLogHeaderDRec		xh;
+	XTXactNewLogEntryDRec	xl;
+	XTXactNewTabEntryDRec	xt;
+	XTXactEndEntryDRec		xe;
+	XTXactCleanupEntryDRec	xc;
+	XTactUpdateEntryDRec	xu;
+	XTactUpdateFLEntryDRec	xf;
+	XTactFreeRecEntryDRec	fr;
+	XTactRemoveBIEntryDRec	rb;
+	XTactWriteRecEntryDRec	xw;
+	XTactRowAddedEntryDRec	xa;
+	XTactWriteRowEntryDRec	wr;
+	XTactOpSyncEntryDRec	os;
+	XTactExtRecEntryDRec	er;
+	XTactNoOpEntryDRec		no;
+	XTXactPrepareEntryDRec	xp;
+} XTXactLogBufferDRec, *XTXactLogBufferDPtr;
+
+/* ---------------------------------------- */
+
+typedef struct XTXactSeqRead {
+	size_t					xseq_buffer_size;		/* Size of the buffer. */
+	xtBool					xseq_load_cache;		/* TRUE if reads should load the cache! */
+
+	xtLogID					xseq_log_id;
+	XTOpenFilePtr			xseq_log_file;
+	off_t					xseq_log_eof;
+
+	xtLogOffset				xseq_buf_log_offset;	/* File offset of the buffer. */
+	size_t					xseq_buffer_len;		/* Amount of data in the buffer. */
+	xtWord1					*xseq_buffer;
+
+	xtLogID					xseq_rec_log_id;		/* The current record log ID. */
+	xtLogOffset				xseq_rec_log_offset;	/* The current log read position. */
+	size_t					xseq_record_len;		/* The length of the current record. */
+} XTXactSeqReadRec, *XTXactSeqReadPtr;
+
+typedef struct XTXactLogFile {
+	xtLogID					lf_log_id;
+	off_t					lr_file_len;					/* The log file size (0 means this is the last log) */
+} XTXactLogFileRec, *XTXactLogFilePtr;
+
+/*
+ * The transaction log. Each database has one.
+ */
+ 
+/* Does not seem to make much difference... */
+#ifndef XT_NO_ATOMICS
+/* This function uses atomic ops: */
+//#define XT_XLOG_WAIT_SPINS
+#endif
+
+typedef struct XTDatabaseLog {
+	struct XTDatabase		*xl_db;
+
+	off_t					xl_log_file_threshold;
+	u_int					xl_log_file_count;				/* Number of logs to use (>= 1). */
+	u_int					xt_log_file_dyn_count;			/* A dynamic value to add to log file count. */
+	u_int					xt_log_file_dyn_dec;			/* Used to decide when to decrement the dynamic count. */
+	size_t					xl_size_of_buffers;				/* The size of both log buffers. */
+	xtWord8					xl_log_bytes_written;			/* The total number of bytes written to the log, after recovery. */
+	xtWord8					xl_log_bytes_flushed;			/* The total number of bytes flushed to the log, after recovery. */
+	xtWord8					xl_log_bytes_read;				/* The total number of log bytes read, after recovery. */
+
+	u_int					xl_last_flush_time;				/* Last flush time in micro-seconds. */
+
+	/* The writer log buffer: */
+	xt_mutex_type			xl_write_lock;
+	xt_cond_type			xl_write_cond;
+#ifdef XT_XLOG_WAIT_SPINS
+	xtWord4					xt_writing;						/* 1 if a thread is writing. */
+	xtWord4					xt_waiting;						/* Count of the threads waiting on the xl_write_cond. */
+#else
+	xtBool					xt_writing;						/* TRUE if a thread is writing. */
+#endif
+	xtLogID					xl_log_id;						/* The number of the write log. */
+	XTOpenFilePtr			xl_log_file;					/* The open write log. */
+
+	XTSpinLockRec			xl_buffer_lock;					/* This locks both the write and the append log buffers. */
+
+	xtLogID					xl_max_log_id;					/* The ID of the highest log on disk. */
+
+	xtLogID					xl_write_log_id;				/* This is the log ID where the write data will go. */
+	xtLogOffset				xl_write_log_offset;			/* The file offset of the write log. */
+	size_t					xl_write_buf_pos;
+	size_t					xl_write_buf_pos_start;
+	xtWord1					*xl_write_buffer;
+	xtBool					xl_write_done;					/* TRUE if the write buffer has been written! */
+
+	xtLogID					xl_append_log_id;				/* This is the log ID where the append data will go. */
+	xtLogOffset				xl_append_log_offset;			/* The file offset in the log where the append data will go. */
+	size_t					xl_append_buf_pos;				/* The amount of data in the append buffer. */
+	size_t					xl_append_buf_pos_start;		/* The amount of data in the append buffer already written. */
+	xtWord1					*xl_append_buffer;
+
+	xtLogID					xl_flush_log_id;				/* The last log flushed. */
+	xtLogOffset				xl_flush_log_offset;			/* The position in the log flushed. */
+
+	void					xlog_setup(struct XTThread *self, struct XTDatabase *db, off_t log_file_size, size_t transaction_buffer_size, int log_count);
+	xtBool					xlog_set_write_offset(xtLogID log_id, xtLogOffset log_offset, xtLogID max_log_id, struct XTThread *thread);
+	void					xlog_close(struct XTThread *self);
+	void					xlog_exit(struct XTThread *self);
+	void					xlog_name(size_t size, char *path, xtLogID log_id);
+	int						xlog_delete_log(xtLogID del_log_id, struct XTThread *thread);
+
+	xtBool					xlog_append(struct XTThread *thread, size_t size1, xtWord1 *data1, size_t size2, xtWord1 *data2, int flush_log_at_trx_commit, xtLogID *log_id, xtLogOffset *log_offset);
+	xtBool					xlog_flush(struct XTThread *thread);
+	xtBool					xlog_flush_pending();
+
+	xtBool					xlog_seq_init(XTXactSeqReadPtr seq, size_t buffer_size, xtBool load_cache);
+	void					xlog_seq_exit(XTXactSeqReadPtr seq);
+	void					xlog_seq_close(XTXactSeqReadPtr seq);
+	xtBool					xlog_seq_start(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, xtBool missing_ok);
+	xtBool					xlog_rnd_read(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, size_t size, xtWord1 *data, size_t *read, struct XTThread *thread);
+	size_t					xlog_bytes_to_write();
+	xtBool					xlog_read_from_cache(XTXactSeqReadPtr seq, xtLogID log_id, xtLogOffset log_offset, size_t size, off_t eof, xtWord1 *buffer, size_t *data_read, struct XTThread *thread);
+	xtBool					xlog_write_thru(XTXactSeqReadPtr seq, size_t size, xtWord1 *data, struct XTThread *thread);
+	xtBool					xlog_verify(XTXactLogBufferDPtr record, size_t rec_size, xtLogID log_id);
+	xtBool					xlog_seq_next(XTXactSeqReadPtr seq, XTXactLogBufferDPtr *entry, xtBool verify, struct XTThread *thread);
+	void					xlog_seq_skip(XTXactSeqReadPtr seq, size_t size);
+
+private:
+	xtBool					xlog_open_log(xtLogID log_id, off_t curr_eof, struct XTThread *thread);
+} XTDatabaseLogRec, *XTDatabaseLogPtr;
+
+xtBool			xt_xlog_flush_log(struct XTDatabase *db, struct XTThread *thread);
+xtBool			xt_xlog_log_data(struct XTThread *thread, size_t len, XTXactLogBufferDPtr log_entry, int flush_log_at_trx_commit);
+xtBool			xt_xlog_modify_table(xtTableID tab_id, u_int status, xtOpSeqNo op_seq, xtRecordID free_list, xtRecordID address, size_t size, xtWord1 *data, struct XTThread *thread);
+
+void			xt_xlog_init(struct XTThread *self, size_t cache_size);
+void			xt_xlog_exit(struct XTThread *self);
+xtInt8			xt_xlog_get_usage();
+xtInt8			xt_xlog_get_size();
+xtLogID			xt_xlog_get_min_log(struct XTThread *self, struct XTDatabase *db);
+void			xt_xlog_delete_logs(struct XTThread *self, struct XTDatabase *db);
+
+void			xt_start_writer(struct XTThread *self, struct XTDatabase *db);
+void			xt_wait_for_writer(struct XTThread *self, struct XTDatabase *db);
+void			xt_stop_writer(struct XTThread *self, struct XTDatabase *db);
+
+#endif
+
diff --git a/storage/pbxt/src/xt_config.h b/storage/pbxt/src/xt_config.h
new file mode 100644
index 00000000000..cb3009ecb79
--- /dev/null
+++ b/storage/pbxt/src/xt_config.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * 2006-03-22	Paul McCullagh
+ *
+ * H&G2JCtL
+ *
+ * This header file should be included in every source, before all other
+ * headers.
+ *
+ * In particular: BEFORE THE SYSTEM HEADERS
+ */
+
+#ifndef __xt_config_h__
+#define __xt_config_h__
+
+#define MYSQL_SERVER		1
+
+#ifdef DRIZZLED
+#include "drizzled/global.h"
+const int max_connections = 500;
+#else
+#include <mysql_version.h>
+#include "my_global.h"
+#endif
+
+/*
+ * This enables everything that GNU can do. The macro is actually
+ * recommended for new programs.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+/*
+ * Make sure we use the thread safe version of the library.
+ */
+#ifndef _THREAD_SAFE // Seems to be defined by some Drizzle header
+#define _THREAD_SAFE
+#endif
+
+/*
+ * This causes things to be defined like stuff in inttypes.h
+ * which is used in printf()
+ */
+#ifndef __STDC_FORMAT_MACROS
+#define __STDC_FORMAT_MACROS
+#endif
+
+/*
+ * This define is not required by Linux because the _GNU_SOURCE
+ * definition includes POSIX compliance. But I need it for
+ * Mac OS X.
+ */
+//#define _POSIX_C_SOURCE	2
+//#define _ANSI_SOURCE
+
+#ifdef __APPLE__
+#define XT_MAC
+#endif
+
+#if defined(MSDOS) || defined(__WIN__) || defined(_WIN64)
+#define XT_WIN
+#endif
+
+#ifdef XT_WIN
+#if defined(_DEBUG) && !defined(DEBUG)
+#define DEBUG
+#endif // _DEBUG
+#else
+// Paul suggested to disable PBMS in MariaDB for now.
+// #define PBMS_ENABLED
+#endif
+
+#ifdef __FreeBSD__
+#define XT_FREEBSD
+#endif
+
+#ifdef __NetBSD__
+#define XT_NETBSD
+#endif
+
+#ifdef __sun
+#define XT_SOLARIS
+#endif
+
+/*
+ * Definition of which atomic operations to use:
+ */
+#ifdef XT_WIN
+#ifdef _WIN64
+/* 64-bit Windows atomic ops are not yet supported: */
+#define XT_NO_ATOMICS
+#else
+/* MS Studio style embedded assembler for x86 */
+#define XT_ATOMIC_WIN32_X86
+#endif
+#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+/* Use GNU style embedded assembler for x86 */
+#define XT_ATOMIC_GNUC_X86
+#elif defined(XT_SOLARIS)
+/* Use Sun atomic operations library
+ * http://docs.sun.com/app/docs/doc/816-5168/atomic-ops-3c?a=view
+ */
+#define XT_ATOMIC_SOLARIS_LIB
+#else
+#define XT_NO_ATOMICS
+#endif
+
+#ifndef DRIZZLED
+#if MYSQL_VERSION_ID >= 50404
+#define MYSQL_SUPPORTS_BACKUP
+#endif
+#endif
+
+#if defined(DBUG_ON) && !defined(DBUG_OFF) && !defined(DEBUG)
+#define DEBUG
+#endif // DBUG_ON
+
+#endif
diff --git a/storage/pbxt/src/xt_defs.h b/storage/pbxt/src/xt_defs.h
new file mode 100644
index 00000000000..3c77415265c
--- /dev/null
+++ b/storage/pbxt/src/xt_defs.h
@@ -0,0 +1,915 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+#ifndef __xt_defs_h__
+#define __xt_defs_h__
+
+#ifdef XT_WIN
+#include "win_inttypes.h"
+#else
+#include <inttypes.h>
+#endif
+#include <sys/types.h>
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+
+//#include "pthread_xt.h"
+
+#ifdef DEBUG
+//#define DEBUG_LOG_DELETE
+#endif
+
+/* the following macros are used to quote compile-time numeric 
+ * constants into strings, e.g. __LINE__ 
+ */
+#define _QUOTE(x) #x
+#define QUOTE(x) _QUOTE(x)
+
+/* ----------------------------------------------------------------------
+ * CRASH DEBUGGING
+ */
+
+/* Define this if crash debug should be on by default:
+ * pbxt_crash_debug set to TRUE by default.
+ * It can be turned off by creating a file called 'no-debug'
+ * in the pbxt database.
+ * It can be turned on by defining the file 'crash-debug'
+ * in the pbxt database.
+ */
+//#define XT_CRASH_DEBUG
+
+/* These are the things crash debug will do: */
+/* Create a core dump (windows only): */
+#define XT_COREDUMP
+
+/* Backup the datadir before recovery after a crash: */
+//#define XT_BACKUP_BEFORE_RECOVERY
+
+/* Keep this number of transaction logs around
+ * for analysis after a crash.
+ */
+#define XT_NUMBER_OF_LOGS_TO_SAVE		5
+
+/* ----------------------------------------------------------------------
+ * GENERIC GLOBAL TYPES
+ */
+
+#ifdef XT_WIN
+
+#define xtInt1			__int8
+#define xtInt2			__int16
+#define xtInt4			__int32
+#define xtInt8			__int64
+
+#define xtWord1			unsigned __int8
+#define xtWord2			unsigned __int16
+#define xtWord4			unsigned __int32
+#define xtWord8			unsigned __int64
+
+#ifndef PATH_MAX
+#define PATH_MAX		MAX_PATH
+#endif
+#ifndef NAME_MAX
+#define NAME_MAX		MAX_PATH
+#endif
+
+/* XT actually assumes that off_t is 8 bytes: */
+#define off_t			xtWord8
+
+#else // XT_WIN
+
+#define xtInt1			int8_t
+#define xtInt2			int16_t
+#define xtInt4			int32_t
+#define xtInt8			int64_t
+
+#ifdef XT_SOLARIS
+#define u_int8_t		uint8_t
+#define u_int16_t		uint16_t
+#define u_int32_t		uint32_t
+#define u_int64_t		uint64_t
+#endif
+
+#define xtWord1			u_int8_t
+#define xtWord2			u_int16_t
+#define xtWord4			u_int32_t
+#define xtWord8			u_int64_t
+
+#endif // XT_WIN
+
+/* A pointer sized word value: */
+#define xtWordPS		ptrdiff_t
+
+#define XT_MAX_INT_1	((xtInt1) 0x7F)
+#define XT_MIN_INT_1	((xtInt1) 0x80)
+#define XT_MAX_INT_2	((xtInt2) 0x7FFF)
+#define XT_MIN_INT_2	((xtInt2) 0x8000)
+#define XT_MAX_INT_4	((xtInt4) 0x7FFFFFFF)
+#define XT_MIN_INT_4	((xtInt4) 0x80000000)
+
+#define xtReal4			float
+#define xtReal8			double
+
+#ifndef u_int
+#define u_int			unsigned int				/* Assumed at least 4 bytes long! */
+#define u_long			unsigned long				/* Assumed at least 4 bytes long! */
+#endif
+#define llong			long long				/* Assumed at least 8 bytes long! */
+#define u_llong			unsigned long long			/* Assumed at least 8 bytes long! */
+
+#define c_char			const char
+
+#ifndef NULL
+#define NULL			0
+#endif
+
+#define xtPublic
+
+#define xtBool			int
+#ifndef TRUE
+#define TRUE			1
+#endif
+#ifndef FALSE
+#define FALSE			0
+#endif
+
+/* Additional return codes (negative values parenthesized so they expand safely in any expression): */
+#define XT_MAYBE		2
+#define XT_ERR			(-1)
+#define XT_NEW			(-2)
+#define XT_RETRY		(-3)
+#define XT_REREAD		(-4)
+
+#ifdef OK
+#undef OK
+#endif
+#define OK				TRUE
+
+#ifdef FAILED
+#undef FAILED
+#endif
+#define FAILED			FALSE
+
+typedef xtWord1			XTDiskValue1[1];	
+typedef xtWord1			XTDiskValue2[2];	
+typedef xtWord1			XTDiskValue3[3];	
+typedef xtWord1			XTDiskValue4[4];	
+typedef xtWord1			XTDiskValue6[6];	
+typedef xtWord1			XTDiskValue8[8];	
+
+#ifdef DEBUG
+#define XT_VAR_LENGTH	100
+#else
+#define XT_VAR_LENGTH	1
+#endif
+
+typedef struct XTPathStr {
+	char				ps_path[XT_VAR_LENGTH];
+} *XTPathStrPtr;
+
+//#define XT_UNUSED(x)		x __attribute__((__unused__))
+#define XT_UNUSED(x)
+
+/* Only used when DEBUG is on: */
+#ifdef DEBUG
+#define XT_NDEBUG_UNUSED(x)	x
+#else
+//#define XT_NDEBUG_UNUSED(x)	x __attribute__((__unused__))
+#define XT_NDEBUG_UNUSED(x)
+#endif
+
+/* ----------------------------------------------------------------------
+ * MAIN CONSTANTS
+ */
+
+/*
+ * Define if there should only be one database per server instance:
+ */
+#define XT_USE_GLOBAL_DB
+
+/*
+ * The rollover size is the write limit of a log file.
+ * After this size is reached, a thread will start a
+ * new log.
+ *
+ * However, logs can grow much larger than this size.
+ * The reason is, a single transaction
+ * may not span more than one data log file.
+ *
+ * This means the log rollover size is actually a
+ * minimum size.
+ */
+
+#ifdef DEBUG
+#define XT_USE_GLOBAL_DEBUG_SIZES
+#endif
+
+/*
+ * I believe the MySQL limit is 16. This limit is currently only used for
+ * BLOB streaming.
+ */
+#define XT_MAX_COLS_PER_INDEX			32
+
+/*
+ * The maximum number of tables that can be created in a PBXT
+ * database. The amount is based on the fact that XT creates
+ * about 5 files per table in the database, and also
+ * uses directory listing to find tables.
+ */
+#define XT_MAX_TABLES					10000
+
+/*
+ * When the amount of garbage in the file is greater than the
+ * garbage threshold, then compactor is activated.
+ */
+#define XT_GARBAGE_THRESHOLD			((double) 50.0)
+
+/* A record that does not contain blobs will be handled as a fixed
+ * length record if its maximum size is less than this amount,
+ * regardless of the size of the VARCHAR fields it contains.
+ */
+#define XT_TAB_MIN_VAR_REC_LENGTH		320
+
+/* No record in the data handle file may exceed this size: */
+#define XT_TAB_MAX_FIX_REC_LENGTH		(16 * 1024)
+
+/* No record in the data handle file may exceed this size, if
+ * AVG_ROW_LENGTH is set.
+ */
+#define XT_TAB_MAX_FIX_REC_LENGTH_SPEC	(64 * 1024)
+
+/*
+ * Determines the page size of the indexes. The value is given
+ * in shifts of 1 to the left (e.g. 1 << 11 == 2048,
+ * 1 << 12 == 4096).
+ *
+ * PMC: Note the performance of sysbench is better with 11
+ * than with 12.
+ *
+ * InnoDB uses 16K pages:
+ * 1 << 14 == 16384.
+ */
+#define XT_INDEX_PAGE_SHIFTS			14
+
+/* The number of RW locks used to scatter locks on the rows
+ * of a table. The locks are only held for a short time during which
+ * the row list is scanned.
+ *
+ * For more details see [(9)].
+ * 223, 1019, 3613
+ */
+#define XT_ROW_RWLOCKS					1019
+//#define XT_ROW_RWLOCKS					223
+
+/*
+ * These are the number of row lock "slots" per table.
+ * Row locks are taken on UPDATE/DELETE or SELECT FOR UPDATE.
+ */
+#define XT_ROW_LOCK_COUNT				(XT_ROW_RWLOCKS * 91)
+
+/*
+ * The size of index write buffer. Must be at least as large as the
+ * largest index page, plus overhead.
+ */
+#define XT_INDEX_WRITE_BUFFER_SIZE		(1024 * 1024)
+
+/* This is the time in seconds that an open table in the open
+ * table pool must be on the free list before it
+ * is actually freed from the pool.
+ *
+ * This is to reduce the effect of MySQL running with a very low
+ * table cache size, which causes tables to be opened and
+ * closed very rapidly.
+ */
+#define XT_OPEN_TABLE_FREE_TIME			30
+
+/* Define this in order to use memory mapped files
+ * (record and row pointer files only).
+ *
+ * This makes no difference in sysbench R/W performance
+ * test.
+ */
+//#define XT_USE_ROW_REC_MMAP_FILES
+
+/* Define this if sequential scan should load data into the 
+ * record cache.
+ *
+ * This is the way InnoDB behaves.
+ */
+#define XT_SEQ_SCAN_LOADS_CACHE
+
+/* Define this in order to use direct I/O on index files: */
+/* NOTE: DO NOT ENABLE!
+ * {DIRECT-IO}
+ * It currently does not work, because of changes to the index
+ * cache.
+ */
+//#define XT_USE_DIRECT_IO_ON_INDEX
+
+/*
+ * Define this variable if PBXT should do lazy deleting in indexes
+ * Note, even if the variable is not defined, PBXT will handle
+ * lazy deleted items in an index.
+ *
+ * NOTE: This can cause a significant degradation of index scan speed.
+ * 25% on sysbench readonly index scan tests.
+ */
+//#define XT_USE_LAZY_DELETE
+
+/*
+ * Define this variable if a connection should wait for the
+ * sweeper to clean up previous transactions executed by the
+ * connection, before continuing.
+ *
+ * The number of transactions that the sweeper is allowed to
+ * lag can be dynamic, but there is a limit (XT_MAX_XACT_BEHIND)
+ */
+#define XT_WAIT_FOR_CLEANUP
+
+/*
+ * This seems to be the optimal value, at least according to
+ * sysbench/sysbench run --test=oltp --num-threads=128 --max-requests=50000 --mysql-user=root 
+ * --oltp-table-size=100000 --oltp-table-name=sb_pbxt --mysql-engine-trx=yes
+ *
+ * Using 8, 16 and 128 threads.
+ */
+#define XT_MAX_XACT_BEHIND				2
+
+/* {NO-ACTION-BUG}
+ * Define this to implement NO ACTION correctly
+ * NOTE: this does not work currently because of a bug
+ * in MySQL
+ *
+ * The bug prevents returning an error in external_lock()
+ * on statement end. In this case an assertion fails.
+ *
+ * set storage_engine = pbxt;
+ * DROP TABLE IF EXISTS t4,t3,t2,t1;
+ * CREATE TABLE t1 (s1 INT PRIMARY KEY);
+ * CREATE TABLE t2 (s1 INT PRIMARY KEY, FOREIGN KEY (s1) REFERENCES t1 (s1) ON DELETE NO ACTION);
+ * 
+ * INSERT INTO t1 VALUES (1);
+ * INSERT INTO t2 VALUES (1);
+ * 
+ * begin;
+ * INSERT INTO t1 VALUES (2);
+ * DELETE FROM t1 where s1 = 1;
+ * <-- Assertion fails here because this DELETE returns
+ * an error from external_lock()
+ */
+//#define XT_IMPLEMENT_NO_ACTION
+
+/* Define this value if online-backup should be supported.
+ * Note that, online backup is currently only supported
+ * by MySQL 6.0.9 or later
+ */
+#define XT_ENABLE_ONLINE_BACKUP
+
+/* Define this switch if you don't want to use atomic
+ * synchronisation.
+ */
+#ifndef XT_NO_ATOMICS
+//#define XT_NO_ATOMICS
+#endif
+
+/* When pbxt_flush_log_at_trx_commit != 1, the transaction log is flushed
+ * at regular intervals. Set the interval here.
+ */
+#define XT_XLOG_FLUSH_FREQ				1000
+
+/*
+ * Define here if you want to check (and correct) the table free list
+ * counts. The free list counts are not durable, because they are not
+ * written to the log.
+ *
+ * The row free count is most critical because it can be used to
+ * estimate the number of rows in the table.
+ */
+#define XT_CHECK_ROW_FREE_COUNT
+#ifdef DEBUG
+#define XT_CHECK_RECORD_FREE_COUNT
+#endif
+#define XT_CORRECT_TABLE_FREE_COUNT 
+
+#if defined(XT_CHECK_ROW_FREE_COUNT) && defined(XT_CORRECT_TABLE_FREE_COUNT)
+#define XT_ROW_COUNT_CORRECTED
+#endif
+
+/* ----------------------------------------------------------------------
+ * GLOBAL CONSTANTS
+ */
+
+#define XT_INDEX_PAGE_SIZE				(1 << XT_INDEX_PAGE_SHIFTS)
+#define XT_INDEX_PAGE_MASK				(XT_INDEX_PAGE_SIZE - 1)
+
+/* The index file uses direct I/O. This is the minimum block
+ * size that can be used when doing direct I/O.
+ */
+#define XT_BLOCK_SIZE_FOR_DIRECT_IO		512
+
+/*
+ * The header is currently a fixed size, so the information must
+ * fit in this block!
+ *
+ * This must also be a multiple of XT_INDEX_MIN_BLOCK_SIZE
+ */
+#define XT_INDEX_HEAD_SIZE				(XT_BLOCK_SIZE_FOR_DIRECT_IO * 8)		// 4K
+
+#define XT_IDENTIFIER_CHAR_COUNT		64
+
+#define XT_IDENTIFIER_NAME_SIZE			((XT_IDENTIFIER_CHAR_COUNT * 3) + 1)	// The identifier length as UTF-8
+#define XT_TABLE_NAME_SIZE				((XT_IDENTIFIER_CHAR_COUNT * 5) + 1)	// The maximum length of a file name that has been normalized
+
+#define XT_ADD_PTR(p, l)				((void *) ((char *) (p) + (l)))
+
+#define XT_MAX_XA_DATA_SIZE				(3*4 + 128)			/* Corresponds to the maximum size of struct xid_t in handler.h. */
+
+/* ----------------------------------------------------------------------
+ * DEFINES DEPENDENT ON  CONSTANTS
+ */
+
+#ifdef XT_USE_ROW_REC_MMAP_FILES
+
+#define XT_ROW_REC_FILE_PTR						XTMapFilePtr
+#define XT_PWRITE_RR_FILE						xt_pwrite_fmap
+#define XT_PREAD_RR_FILE						xt_pread_fmap
+#define XT_FLUSH_RR_FILE						xt_flush_fmap
+#define XT_CLOSE_RR_FILE_NS						xt_close_fmap_ns
+
+#define XT_LOCK_MEMORY_PTR(x, f, a, s, v, c)	do { x = xt_lock_fmap_ptr(f, a, s, v, c); } while (0)
+#define XT_UNLOCK_MEMORY_PTR(f, d, e, v)		do { xt_unlock_fmap_ptr(f, v); d = NULL; } while (0)
+
+#else
+
+#define XT_ROW_REC_FILE_PTR						XTOpenFilePtr
+#define XT_PWRITE_RR_FILE						xt_pwrite_file
+#define XT_PREAD_RR_FILE						xt_pread_file
+#define XT_FLUSH_RR_FILE						xt_flush_file
+#define XT_CLOSE_RR_FILE_NS						xt_close_file_ns
+
+#define XT_LOCK_MEMORY_PTR(x, f, a, s, v, c)	do { if (!xt_lock_file_ptr(f, &x, a, s, v, c)) x = NULL; } while (0)
+#define XT_UNLOCK_MEMORY_PTR(f, d, e, v)		do { if (e) { xt_unlock_file_ptr(f, d, v); d = NULL; } } while (0)
+
+#endif
+
+/* ----------------------------------------------------------------------
+ * DEBUG SIZES!
+ * Reduce the thresholds to make things happen faster.
+ */
+
+#ifdef XT_USE_GLOBAL_DEBUG_SIZES
+
+//#undef XT_ROW_RWLOCKS
+//#define XT_ROW_RWLOCKS					2
+
+//#undef XT_TAB_MIN_VAR_REC_LENGTH
+//#define XT_TAB_MIN_VAR_REC_LENGTH			20
+
+//#undef XT_ROW_LOCK_COUNT
+//#define XT_ROW_LOCK_COUNT					(XT_ROW_RWLOCKS * 2)
+
+//#undef XT_INDEX_PAGE_SHIFTS
+//#define XT_INDEX_PAGE_SHIFTS				8	// 256
+//#undef XT_BLOCK_SIZE_FOR_DIRECT_IO
+//#define XT_BLOCK_SIZE_FOR_DIRECT_IO		256
+
+//#undef XT_INDEX_WRITE_BUFFER_SIZE
+//#define XT_INDEX_WRITE_BUFFER_SIZE		(40 * 1024)
+
+//#undef XT_XLOG_FLUSH_FREQ
+//#define XT_XLOG_FLUSH_FREQ				(30 * 1000)
+
+#endif
+
+/* ----------------------------------------------------------------------
+ * BYTE ORDER
+ */
+
+/*
+ * Byte order on the disk is little endian! This is the byte order of the i386.
+ * Little endian byte order starts with the least significant byte.
+ *
+ * The reason for choosing this byte order for the disk is 2-fold:
+ * Firstly the i386 is the cheapest and fastest platform today.
+ * Secondly the i386, unlike RISC chips (with big endian) can address
+ * memory that is not aligned!
+ *
+ * Since the disk image of PrimeBase XT is not aligned, the second point
+ * is significant. A RISC chip needs to access it byte-wise, so we might as
+ * well do the byte swapping at the same time.
+ *
+ * The macros below are of 4 general types:
+ *
+ * GET/SET - Get and set 1,2,4,8 byte values (short, int, long, etc).
+ * Values are swapped only on big endian platforms. This makes these
+ * functions very efficient on little-endian platforms.
+ *
+ * COPY - Transfer data without swapping regardless of platform. This
+ * function is a bit more efficient on little-endian platforms
+ * because alignment is not an issue.
+ *
+ * MOVE - Similar to get and set, but deals with memory instead
+ * of values. Since no swapping is done on little-endian platforms
+ * this function is identical to COPY on little-endian platforms.
+ *
+ * SWAP - Transfer and swap data regardless of the platform type.
+ * Alignment is not assumed.
+ *
+ * The DISK component of the macro names indicates that alignment of
+ * the value cannot be assumed.
+ *
+ */
+#if BYTE_ORDER == BIG_ENDIAN
+/* The native order of the machine is big endian. Since the native
+ * disk order of XT is little endian, all data to and from disk
+ * must be swapped.
+ */
+#define XT_SET_DISK_1(d, s)		((d)[0] = (xtWord1) (s))
+
+#define XT_SET_DISK_2(d, s)		do { (d)[0] = (xtWord1)  (((xtWord2) (s))        & 0xFF); (d)[1] = (xtWord1) ((((xtWord2) (s)) >> 8 ) & 0xFF); } while (0)
+
+#define XT_SET_DISK_3(d, s)		do { (d)[0] = (xtWord1)  (((xtWord4) (s))        & 0xFF); (d)[1] = (xtWord1) ((((xtWord4) (s)) >> 8 ) & 0xFF); \
+									 (d)[2] = (xtWord1) ((((xtWord4) (s)) >> 16) & 0xFF); } while (0)
+
+#define XT_SET_DISK_4(d, s)		do { (d)[0] = (xtWord1)  (((xtWord4) (s))        & 0xFF); (d)[1] = (xtWord1) ((((xtWord4) (s)) >> 8 ) & 0xFF); \
+									 (d)[2] = (xtWord1) ((((xtWord4) (s)) >> 16) & 0xFF); (d)[3] = (xtWord1) ((((xtWord4) (s)) >> 24) & 0xFF); } while (0)
+
+#define XT_SET_DISK_6(d, s)		do { (d)[0] = (xtWord1)  (((xtWord8) (s))        & 0xFF); (d)[1] = (xtWord1) ((((xtWord8) (s)) >> 8 ) & 0xFF); \
+									 (d)[2] = (xtWord1) ((((xtWord8) (s)) >> 16) & 0xFF); (d)[3] = (xtWord1) ((((xtWord8) (s)) >> 24) & 0xFF); \
+									 (d)[4] = (xtWord1) ((((xtWord8) (s)) >> 32) & 0xFF); (d)[5] = (xtWord1) ((((xtWord8) (s)) >> 40) & 0xFF); } while (0)
+
+#define XT_SET_DISK_8(d, s)		do { (d)[0] = (xtWord1)  (((xtWord8) (s))        & 0xFF); (d)[1] = (xtWord1) ((((xtWord8) (s)) >> 8 ) & 0xFF); \
+									 (d)[2] = (xtWord1) ((((xtWord8) (s)) >> 16) & 0xFF); (d)[3] = (xtWord1) ((((xtWord8) (s)) >> 24) & 0xFF); \
+									 (d)[4] = (xtWord1) ((((xtWord8) (s)) >> 32) & 0xFF); (d)[5] = (xtWord1) ((((xtWord8) (s)) >> 40) & 0xFF); \
+									 (d)[6] = (xtWord1) ((((xtWord8) (s)) >> 48) & 0xFF); (d)[7] = (xtWord1) ((((xtWord8) (s)) >> 56) & 0xFF); } while (0)
+
+#define XT_GET_DISK_1(s)		((s)[0])
+
+#define XT_GET_DISK_2(s)		((xtWord2) (((xtWord2) (s)[0]) | (((xtWord2) (s)[1]) << 8)))
+
+#define XT_GET_DISK_3(s)		((xtWord4) (((xtWord4) (s)[0]) | (((xtWord4) (s)[1]) << 8) | (((xtWord4) (s)[2]) << 16)))
+
+#define XT_GET_DISK_4(s)		(((xtWord4) (s)[0])        | (((xtWord4) (s)[1]) << 8 ) | \
+								(((xtWord4) (s)[2]) << 16) | (((xtWord4) (s)[3]) << 24))
+
+#define XT_GET_DISK_6(s)		(((xtWord8) (s)[0])        | (((xtWord8) (s)[1]) << 8 ) | \
+								(((xtWord8) (s)[2]) << 16) | (((xtWord8) (s)[3]) << 24) | \
+								(((xtWord8) (s)[4]) << 32) | (((xtWord8) (s)[5]) << 40))
+
+#define XT_GET_DISK_8(s)		(((xtWord8) (s)[0])        | (((xtWord8) (s)[1]) << 8 ) | \
+								(((xtWord8) (s)[2]) << 16) | (((xtWord8) (s)[3]) << 24) | \
+								(((xtWord8) (s)[4]) << 32) | (((xtWord8) (s)[5]) << 40) | \
+								(((xtWord8) (s)[6]) << 48) | (((xtWord8) (s)[7]) << 56))
+
+/* Move will copy memory, and swap the bytes on a big endian machine.
+ * On a little endian machine it is the same as COPY.
+ */
+#define XT_MOVE_DISK_1(d, s)	((d)[0] = (s)[0])
+#define XT_MOVE_DISK_2(d, s)	do { (d)[0] = (s)[1]; (d)[1] = (s)[0]; } while (0)
+#define XT_MOVE_DISK_3(d, s)	do { (d)[0] = (s)[2]; (d)[1] = (s)[1]; (d)[2] = (s)[0]; } while (0)
+#define XT_MOVE_DISK_4(d, s)	do { (d)[0] = (s)[3]; (d)[1] = (s)[2]; (d)[2] = (s)[1]; (d)[3] = (s)[0]; } while (0)
+#define XT_MOVE_DISK_8(d, s)	do { (d)[0] = (s)[7]; (d)[1] = (s)[6]; \
+									 (d)[2] = (s)[5]; (d)[3] = (s)[4]; \
+									 (d)[4] = (s)[3]; (d)[5] = (s)[2]; \
+									 (d)[6] = (s)[1]; (d)[7] = (s)[0]; } while (0)
+
+/*
+ * Copy just copies the number of bytes assuming the data is not aligned.
+ */
+#define XT_COPY_DISK_1(d, s)	(d)[0] = s
+#define XT_COPY_DISK_2(d, s)	do { (d)[0] = (s)[0]; (d)[1] = (s)[1]; } while (0)
+#define XT_COPY_DISK_3(d, s)	do { (d)[0] = (s)[0]; (d)[1] = (s)[1]; (d)[2] = (s)[2]; } while (0)
+#define XT_COPY_DISK_4(d, s)	do { (d)[0] = (s)[0]; (d)[1] = (s)[1]; (d)[2] = (s)[2]; (d)[3] = (s)[3]; } while (0)
+#define XT_COPY_DISK_6(d, s)	memcpy(&((d)[0]), &((s)[0]), 6)
+#define XT_COPY_DISK_8(d, s)	memcpy(&((d)[0]), &((s)[0]), 8)
+#define XT_COPY_DISK_10(d, s)	memcpy(&((d)[0]), &((s)[0]), 10)
+
+#define XT_SET_NULL_DISK_1(d)	XT_SET_DISK_1(d, 0)
+#define XT_SET_NULL_DISK_2(d)	do { (d)[0] = 0; (d)[1] = 0; } while (0)
+#define XT_SET_NULL_DISK_4(d)	do { (d)[0] = 0; (d)[1] = 0; (d)[2] = 0; (d)[3] = 0; } while (0)
+#define XT_SET_NULL_DISK_6(d)	do { (d)[0] = 0; (d)[1] = 0; (d)[2] = 0; (d)[3] = 0; (d)[4] = 0; (d)[5] = 0; } while (0)
+#define XT_SET_NULL_DISK_8(d)	do { (d)[0] = 0; (d)[1] = 0; (d)[2] = 0; (d)[3] = 0; (d)[4] = 0; (d)[5] = 0; (d)[6] = 0; (d)[7] = 0; } while (0)
+
+#define XT_IS_NULL_DISK_1(d)	(!(XT_GET_DISK_1(d)))	/* IS_NULL: true when every byte of the disk value is zero. */
+#define XT_IS_NULL_DISK_4(d)	(!(d)[0] && !(d)[1] && !(d)[2] && !(d)[3])
+#define XT_IS_NULL_DISK_8(d)	(!(d)[0] && !(d)[1] && !(d)[2] && !(d)[3] && !(d)[4] && !(d)[5] && !(d)[6] && !(d)[7])	/* last term was "!(7)[3]": byte 7 was never tested */
+
+#define XT_EQ_DISK_4(d, s)		((d)[0] == (s)[0] && (d)[1] == (s)[1] && (d)[2] == (s)[2] && (d)[3] == (s)[3])
+#define XT_EQ_DISK_8(d, s)		((d)[0] == (s)[0] && (d)[1] == (s)[1] && (d)[2] == (s)[2] && (d)[3] == (s)[3] && \
+								(d)[4] == (s)[4] && (d)[5] == (s)[5] && (d)[6] == (s)[6] && (d)[7] == (s)[7])
+
+#define XT_IS_FF_DISK_4(d)		((d)[0] == 0xFF && (d)[1] == 0xFF && (d)[2] == 0xFF && (d)[3] == 0xFF)
+#else
+/*
+ * The native order of the machine is little endian. This means the data to
+ * and from disk need not be swapped. In addition to this, since
+ * the i386 can access non-aligned memory we are not required to
+ * handle the data byte-for-byte.
+ */
+#define XT_SET_DISK_1(d, s)		((d)[0] = (xtWord1) (s))
+#define XT_SET_DISK_2(d, s)		(*((xtWord2 *) &((d)[0])) = (xtWord2) (s))
+#define XT_SET_DISK_3(d, s)		do { (*((xtWord2 *) &((d)[0])) = (xtWord2) (s));  *((xtWord1 *) &((d)[2])) = (xtWord1) (((xtWord4) (s)) >> 16); } while (0)
+#define XT_SET_DISK_4(d, s)		(*((xtWord4 *) &((d)[0])) = (xtWord4) (s))
+#define XT_SET_DISK_6(d, s)		do { *((xtWord4 *) &((d)[0])) = (xtWord4) (s); *((xtWord2 *) &((d)[4])) = (xtWord2) (((xtWord8) (s)) >> 32); } while (0)
+#define XT_SET_DISK_8(d, s)		(*((xtWord8 *) &((d)[0])) = (xtWord8) (s))
+
+#define XT_GET_DISK_1(s)		((s)[0])
+#define XT_GET_DISK_2(s)		*((xtWord2 *) &((s)[0]))
+#define XT_GET_DISK_3(s)		((xtWord4) *((xtWord2 *) &((s)[0])) | (((xtWord4) *((xtWord1 *) &((s)[2]))) << 16))
+#define XT_GET_DISK_4(s)		*((xtWord4 *) &((s)[0]))
+#define XT_GET_DISK_6(s)		((xtWord8) *((xtWord4 *) &((s)[0])) | (((xtWord8) *((xtWord2 *) &((s)[4]))) << 32))
+#define XT_GET_DISK_8(s)		*((xtWord8 *) &((s)[0]))
+
+#define XT_MOVE_DISK_1(d, s)	((d)[0] = (s)[0])
+#define XT_MOVE_DISK_2(d, s)	XT_COPY_DISK_2(d, s)
+#define XT_MOVE_DISK_3(d, s)	XT_COPY_DISK_3(d, s)
+#define XT_MOVE_DISK_4(d, s)	XT_COPY_DISK_4(d, s)
+#define XT_MOVE_DISK_8(d, s)	XT_COPY_DISK_8(d, s)
+
+#define XT_COPY_DISK_1(d, s)	(d)[0] = s
+#define XT_COPY_DISK_2(d, s)	(*((xtWord2 *) &((d)[0])) = (*((xtWord2 *) &((s)[0]))))
+#define XT_COPY_DISK_3(d, s)	do { *((xtWord2 *) &((d)[0])) = *((xtWord2 *) &((s)[0])); (d)[2] = (s)[2]; } while (0)
+#define XT_COPY_DISK_4(d, s)	(*((xtWord4 *) &((d)[0])) = (*((xtWord4 *) &((s)[0]))))
+#define XT_COPY_DISK_6(d, s)	do { *((xtWord4 *) &((d)[0])) = *((xtWord4 *) &((s)[0])); *((xtWord2 *) &((d)[4])) = *((xtWord2 *) &((s)[4])); } while (0)
+#define XT_COPY_DISK_8(d, s)	(*((xtWord8 *) &((d)[0])) = (*((xtWord8 *) &((s)[0]))))	/* 8-byte copy as one xtWord8 load/store; (d) parenthesized so expression arguments expand correctly */
+#define XT_COPY_DISK_10(d, s)	memcpy(&((d)[0]), &((s)[0]), 10)
+
+#define XT_SET_NULL_DISK_1(d)	XT_SET_DISK_1(d, 0)
+#define XT_SET_NULL_DISK_2(d)	XT_SET_DISK_2(d, 0)
+#define XT_SET_NULL_DISK_3(d)	XT_SET_DISK_3(d, 0)
+#define XT_SET_NULL_DISK_4(d)	XT_SET_DISK_4(d, 0L)
+#define XT_SET_NULL_DISK_6(d)	XT_SET_DISK_6(d, 0LL)
+#define XT_SET_NULL_DISK_8(d)	XT_SET_DISK_8(d, 0LL)
+
+#define XT_IS_NULL_DISK_1(d)	(!(XT_GET_DISK_1(d)))
+#define XT_IS_NULL_DISK_2(d)	(!(XT_GET_DISK_2(d)))
+#define XT_IS_NULL_DISK_3(d)	(!(XT_GET_DISK_3(d)))
+#define XT_IS_NULL_DISK_4(d)	(!(XT_GET_DISK_4(d)))
+#define XT_IS_NULL_DISK_8(d)	(!(XT_GET_DISK_8(d)))
+
+#define XT_EQ_DISK_4(d, s)		(XT_GET_DISK_4(d) == XT_GET_DISK_4(s))
+#define XT_EQ_DISK_8(d, s)		(XT_GET_DISK_8(d) == XT_GET_DISK_8(s))
+
+#define XT_IS_FF_DISK_4(d)		(XT_GET_DISK_4(d) == 0xFFFFFFFF)
+#endif
+
+#define XT_CMP_DISK_4(a, b)		((xtInt4) XT_GET_DISK_4(a) - (xtInt4) XT_GET_DISK_4(b))
+#define XT_CMP_DISK_8(d, s)		memcmp(&((d)[0]), &((s)[0]), 8)
+//#define XT_CMP_DISK_8(d, s)		(XT_CMP_DISK_4((d).h_number_4, (s).h_number_4) == 0 ? XT_CMP_DISK_4((d).h_file_4, (s).h_file_4) : XT_CMP_DISK_4((d).h_number_4, (s).h_number_4))
+
+#define XT_SWAP_DISK_2(d, s)	do { (d)[0] = (s)[1]; (d)[1] = (s)[0]; } while (0)
+#define XT_SWAP_DISK_3(d, s)	do { (d)[0] = (s)[2]; (d)[1] = (s)[1]; (d)[2] = (s)[0]; } while (0)
+#define XT_SWAP_DISK_4(d, s)	do { (d)[0] = (s)[3]; (d)[1] = (s)[2]; (d)[2] = (s)[1]; (d)[3] = (s)[0]; } while (0)
+#define XT_SWAP_DISK_8(d, s)	do { (d)[0] = (s)[7]; (d)[1] = (s)[6]; (d)[2] = (s)[5]; (d)[3] = (s)[4]; \
+									 (d)[4] = (s)[3]; (d)[5] = (s)[2]; (d)[6] = (s)[1]; (d)[7] = (s)[0]; } while (0)
+
+/* ----------------------------------------------------------------------
+ *  GLOBAL APPLICATION TYPES & MACROS
+ */
+
+struct XTThread;
+
+typedef void (*XTFreeFunc)(struct XTThread *self, void *thunk, void *item);
+typedef int (*XTCompareFunc)(struct XTThread *self, register const void *thunk, register const void *a, register const void *b);
+
+/* Log ID and offset: */
+#define xtLogID					xtWord4
+#define xtLogOffset				off_t
+
+#define xtDatabaseID			xtWord4
+#define xtTableID				xtWord4
+#define xtOpSeqNo				xtWord4
+#define xtXactID				xtWord4
+#define xtThreadID				xtWord4
+
+#ifdef DEBUG
+//#define XT_USE_NODE_ID_STRUCT
+#endif
+
+#ifdef XT_USE_NODE_ID_STRUCT
+typedef struct xtIndexNodeID {
+	xtWord4						x;
+} xtIndexNodeID;
+#define XT_NODE_TEMP			xtWord4 xt_node_temp
+#define	XT_NODE_ID(a)			(a).x
+#define	XT_RET_NODE_ID(a)		*((xtIndexNodeID *) &(xt_node_temp = (a)))
+#else
+#define XT_NODE_TEMP			
+#define xtIndexNodeID			xtWord4
+#define	XT_NODE_ID(a)			a
+#define	XT_RET_NODE_ID(a)		((xtIndexNodeID) (a))
+#endif
+
+/* Row, Record ID and Record offsets: */
+#define xtRowID					xtWord4
+#define xtRecordID				xtWord4				/* NOTE: Record offset == header-size + record-id * record-size! */
+#define xtRefID					xtWord4				/* Must be big enough to contain a xtRowID and a xtRecordID! */
+#define xtRecOffset				off_t
+#define	xtDiskRecordID4			XTDiskValue4
+#ifdef XT_WIN
+#define xtProcID				DWORD
+#else
+#define xtProcID				pid_t
+#endif
+
+#define XT_ROW_ID_SIZE			4
+#define XT_RECORD_ID_SIZE		4
+#define XT_REF_ID_SIZE			4					/* max(XT_ROW_ID_SIZE, XT_RECORD_ID_SIZE) */
+#define XT_RECORD_OFFS_SIZE		4
+#define XT_RECORD_REF_SIZE		(XT_RECORD_ID_SIZE + XT_ROW_ID_SIZE)
+#define XT_CHECKSUM4_REC(x)		(x)
+
+#define XT_XACT_ID_SIZE			4
+#define XT_CHECKSUM4_XACT(x)	(x)
+
+#ifdef XT_WIN
+#define __FUNC__				__FUNCTION__
+#elif defined(XT_SOLARIS)
+#define __FUNC__				"__func__"
+#else
+#define __FUNC__				__PRETTY_FUNCTION__
+#endif
+
+/* ----------------------------------------------------------------------
+ * GLOBAL VARIABLES
+ */
+
+extern bool					pbxt_inited;
+extern xtBool				pbxt_ignore_case;
+extern const char			*pbxt_extensions[];
+extern xtBool				pbxt_crash_debug;
+
+
+/* ----------------------------------------------------------------------
+ * DRIZZLE MAPPINGS VARIABLES
+ */
+
+#ifdef DRIZZLED
+/* Drizzle is stuck at this level: */
+#define MYSQL_VERSION_ID					60005
+
+#define TABLE_LIST							TableList
+#define TABLE								Table
+#define THD									Session
+#define MYSQL_THD							Session *
+#define THR_THD								THR_Session
+#define STRUCT_TABLE						class Table
+#define TABLE_SHARE							TableShare
+
+#define MYSQL_TYPE_STRING					DRIZZLE_TYPE_VARCHAR
+#define MYSQL_TYPE_VARCHAR					DRIZZLE_TYPE_VARCHAR
+#define MYSQL_TYPE_LONGLONG					DRIZZLE_TYPE_LONGLONG
+#define MYSQL_TYPE_BLOB						DRIZZLE_TYPE_BLOB
+#define MYSQL_TYPE_ENUM						DRIZZLE_TYPE_ENUM
+#define MYSQL_TYPE_LONG						DRIZZLE_TYPE_LONG
+#define MYSQL_PLUGIN_VAR_HEADER				DRIZZLE_PLUGIN_VAR_HEADER
+#define MYSQL_SYSVAR_STR					DRIZZLE_SYSVAR_STR
+#define MYSQL_SYSVAR_INT					DRIZZLE_SYSVAR_INT
+#define MYSQL_SYSVAR_BOOL					DRIZZLE_SYSVAR_BOOL
+#define MYSQL_SYSVAR						DRIZZLE_SYSVAR
+#define MYSQL_STORAGE_ENGINE_PLUGIN			DRIZZLE_STORAGE_ENGINE_PLUGIN
+#define MYSQL_INFORMATION_SCHEMA_PLUGIN		DRIZZLE_INFORMATION_SCHEMA_PLUGIN
+#define memcpy_fixed						memcpy
+#define bfill(m, len, ch)					memset(m, ch, len)
+
+#define mx_tmp_use_all_columns(x, y)		(x)->use_all_columns(y)
+#define mx_tmp_restore_column_map(x, y)		(x)->restore_column_map(y)
+
+#define MX_TABLE_TYPES_T					Cursor::Table_flags
+#define MX_UINT8_T							uint8_t
+#define MX_ULONG_T							uint32_t
+#define MX_ULONGLONG_T						uint64_t
+#define MX_LONGLONG_T						uint64_t
+#define MX_CHARSET_INFO						struct charset_info_st
+#define MX_CONST_CHARSET_INFO				const struct charset_info_st			
+#define MX_CONST							const
+#define MX_BITMAP							MyBitmap
+#define MX_BIT_SIZE()						numOfBitsInMap()
+#define MX_BIT_SET(x, y)					(x)->setBit(y)
+#define MX_BIT_FAST_TEST_AND_SET(x, y)				(x)->testAndSet(y)
+
+#define my_bool								bool
+#define int16								int16_t
+#define int32								int32_t
+#define uint16								uint16_t
+#define uint32								uint32_t
+#define uchar								unsigned char
+#define longlong							int64_t
+#define ulonglong							uint64_t
+#define handler								Cursor
+
+#define HAVE_LONG_LONG
+
+#define my_malloc(x, y)						malloc(x)
+#define my_free(x, y)						free(x)
+
+#define HA_CAN_SQL_HANDLER					0
+#define HA_CAN_INSERT_DELAYED				0
+#define HA_BINLOG_ROW_CAPABLE				0
+#define HA_BINLOG_STMT_CAPABLE				0
+#define HA_CACHE_TBL_TRANSACT				0
+
+#define max									cmax
+#define min									cmin
+
+#define NullS								NULL
+
+#define thd_charset							session_charset
+#define thd_query							session_query
+#define thd_slave_thread					session_slave_thread
+#define thd_non_transactional_update		session_non_transactional_update
+#define thd_binlog_format					session_binlog_format
+#define thd_mark_transaction_to_rollback	session_mark_transaction_to_rollback
+#define thd_ha_data							session_ha_data
+#define current_thd							current_session
+#define thd_sql_command						session_sql_command
+#define thd_test_options					session_test_options
+#define thd_killed							session_killed
+#define thd_tx_isolation					session_tx_isolation
+#define thd_in_lock_tables					session_in_lock_tables
+#define thd_tablespace_op					session_tablespace_op
+#define thd_alloc							session_alloc
+#define thd_make_lex_string					session_make_lex_string
+#define column_bitmaps_signal()
+
+#define my_pthread_setspecific_ptr(T, V)	pthread_setspecific(T, (void*) (V))
+
+#define mysql_real_data_home				drizzle_real_data_home
+
+#define mi_int4store(T,A)   { uint32_t def_temp= (uint32_t) (A);\
+                              ((unsigned char*) (T))[3]= (unsigned char) (def_temp);\
+                              ((unsigned char*) (T))[2]= (unsigned char) (def_temp >> 8);\
+                              ((unsigned char*) (T))[1]= (unsigned char) (def_temp >> 16);\
+                              ((unsigned char*) (T))[0]= (unsigned char) (def_temp >> 24); }
+
+#define mi_uint4korr(A) ((uint32_t) (((uint32_t) (((const unsigned char*) (A))[3])) +\
+                                   (((uint32_t) (((const unsigned char*) (A))[2])) << 8) +\
+                                   (((uint32_t) (((const unsigned char*) (A))[1])) << 16) +\
+                                   (((uint32_t) (((const unsigned char*) (A))[0])) << 24)))
+
+class PBXTStorageEngine;
+typedef PBXTStorageEngine handlerton;
+class Session;
+
+extern "C" void session_mark_transaction_to_rollback(Session *session, bool all);
+
+#else // DRIZZLED
+/* The MySQL case: */
+#if MYSQL_VERSION_ID >= 50404
+#define STRUCT_TABLE						struct TABLE
+#else
+#define STRUCT_TABLE						struct st_table
+#endif
+
+#define mx_tmp_use_all_columns				dbug_tmp_use_all_columns
+#define mx_tmp_restore_column_map(x, y)		dbug_tmp_restore_column_map((x)->read_set, y)
+#define MX_BIT_FAST_TEST_AND_SET(x, y)		bitmap_fast_test_and_set(x, y)
+
+#define MX_TABLE_TYPES_T					ulonglong
+#define MX_UINT8_T							uint8
+#define MX_ULONG_T							ulong
+#define MX_ULONGLONG_T						ulonglong
+#define MX_LONGLONG_T						longlong
+#define MX_CHARSET_INFO						CHARSET_INFO
+#if defined(MARIADB_BASE_VERSION) && MYSQL_VERSION_ID > 50200
+#define MX_CONST_CHARSET_INFO				const struct charset_info_st
+#else
+#define MX_CONST_CHARSET_INFO				struct charset_info_st
+#endif			
+#define MX_CONST							
+#define MX_BITMAP							MY_BITMAP
+#define MX_BIT_SIZE()						n_bits
+#define MX_BIT_SET(x, y)					bitmap_set_bit(x, y)
+
+#endif // DRIZZLED
+
+#define MX_BIT_IS_SUBSET(x, y)				bitmap_is_subset(x, y)
+
+#ifndef XT_SCAN_CORE_DEFINED
+#define XT_SCAN_CORE_DEFINED
+xtBool	xt_mm_scan_core(void);
+#endif
+
+//#define DEBUG_LOCK_QUEUE
+
+#endif
diff --git a/storage/pbxt/src/xt_errno.h b/storage/pbxt/src/xt_errno.h
new file mode 100644
index 00000000000..959c9422043
--- /dev/null
+++ b/storage/pbxt/src/xt_errno.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2005 PrimeBase Technologies GmbH
+ *
+ * PrimeBase XT
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Paul McCullagh
+ *
+ * H&G2JCtL
+ */
+
+#define XT_NO_ERR					0
+#define XT_SYSTEM_ERROR				-1
+#define XT_ERR_STACK_OVERFLOW		-2
+#define XT_ASSERTION_FAILURE		-3
+#define XT_SIGNAL_CAUGHT			-4
+#define XT_ERR_JUMP_OVERFLOW		-5
+#define XT_ERR_BAD_HANDLE			-6
+#define XT_ERR_TABLE_EXISTS			-7
+#define XT_ERR_NAME_TOO_LONG		-8
+#define XT_ERR_TABLE_NOT_FOUND		-9
+#define XT_ERR_SESSION_NOT_FOUND	-10
+#define XT_ERR_BAD_ADDRESS			-11
+#define XT_ERR_UNKNOWN_SERVICE		-12
+#define XT_ERR_UNKNOWN_HOST			-13
+#define XT_ERR_TOKEN_EXPECTED		-14
+#define XT_ERR_PROPERTY_REQUIRED	-15
+#define XT_ERR_BAD_XACTION			-16
+#define XT_ERR_INVALID_SLOT			-17
+#define XT_ERR_DEADLOCK				-18
+#define XT_ERR_CANNOT_CHANGE_DB		-19
+#define XT_ERR_ILLEGAL_CHAR			-20
+#define XT_ERR_UNTERMINATED_STRING	-21
+#define XT_ERR_SYNTAX				-22
+#define XT_ERR_ILLEGAL_INSTRUCTION	-23
+#define XT_ERR_OUT_OF_BOUNDS		-24
+#define XT_ERR_STACK_UNDERFLOW		-25
+#define XT_ERR_TYPE_MISMATCH		-26
+#define XT_ERR_ILLEGAL_TYPE			-27
+#define XT_ERR_ID_TOO_LONG			-28
+#define XT_ERR_TYPE_OVERFLOW		-29
+#define XT_ERR_TABLE_IN_USE			-30
+#define XT_ERR_NO_DATABASE_IN_USE	-31
+#define XT_ERR_CANNOT_RESOLVE_TYPE	-32
+#define XT_ERR_BAD_INDEX_DESC		-33
+#define XT_ERR_WRONG_NO_OF_VALUES	-34
+#define XT_ERR_CANNOT_OUTPUT_VALUE	-35
+#define XT_ERR_COLUMN_NOT_FOUND		-36
+#define XT_ERR_NOT_IMPLEMENTED		-37
+#define XT_ERR_UNEXPECTED_EOS		-38
+#define XT_ERR_BAD_TOKEN			-39
+#define XT_ERR_RES_STACK_OVERFLOW	-40
+#define XT_ERR_BAD_INDEX_TYPE		-41
+#define XT_ERR_INDEX_EXISTS			-42
+#define XT_ERR_INDEX_STRUC_EXISTS	-43
+#define XT_ERR_INDEX_NOT_FOUND		-44
+#define XT_ERR_INDEX_CORRUPT		-45
+#define XT_ERR_DUPLICATE_KEY		-46
+#define XT_ERR_TYPE_NOT_SUPPORTED	-47
+#define XT_ERR_BAD_TABLE_VERSION	-48
+#define XT_ERR_BAD_RECORD_FORMAT	-49
+#define XT_ERR_BAD_EXT_RECORD		-50
+#define XT_ERR_RECORD_CHANGED		-51			// Record has already been updated by some other transaction
+#define XT_ERR_XLOG_WAS_CORRUPTED	-52
+#define XT_ERR_NO_DICTIONARY		-53
+#define XT_ERR_TOO_MANY_TABLES		-54			// Maximum number of table exceeded.
+#define XT_ERR_KEY_TOO_LARGE		-55			// Maximum size of an index key exceeded
+#define XT_ERR_MULTIPLE_DATABASES	-56
+#define XT_ERR_NO_TRANSACTION		-57
+#define XT_ERR_A_EXPECTED_NOT_B		-58
+#define XT_ERR_NO_MATCHING_INDEX	-59
+#define XT_ERR_TABLE_LOCKED			-60
+#define XT_ERR_NO_REFERENCED_ROW	-61
+#define XT_ERR_BAD_DICTIONARY		-62
+#define XT_ERR_LOADING_MYSQL_DIC	-63
+#define XT_ERR_ROW_IS_REFERENCED	-64
+#define XT_ERR_COLUMN_IS_NOT_NULL	-65
+#define XT_ERR_INCORRECT_NO_OF_COLS	-66
+#define XT_ERR_FK_ON_TEMP_TABLE		-67
+#define XT_ERR_REF_TABLE_NOT_FOUND	-68
+#define XT_ERR_REF_TYPE_WRONG		-69
+#define XT_ERR_DUPLICATE_FKEY		-70
+#define XT_ERR_INDEX_FILE_TO_LARGE	-71
+#define XT_ERR_UPGRADE_TABLE		-72
+#define XT_ERR_INDEX_NEW_VERSION	-73
+#define XT_ERR_LOCK_TIMEOUT			-74
+#define XT_ERR_CONVERSION			-75
+#define XT_ERR_NO_ROWS				-76
+#define XT_ERR_MYSQL_ERROR			-77
+#define XT_ERR_DATA_LOG_NOT_FOUND	-78
+#define XT_ERR_LOG_MAX_EXCEEDED		-79
+#define XT_ERR_MAX_ROW_COUNT		-80
+#define XT_ERR_FILE_TOO_LONG		-81
+#define XT_ERR_BAD_IND_BLOCK_SIZE	-82
+#define XT_ERR_INDEX_CORRUPTED		-83
+#define XT_ERR_NO_INDEX_CACHE		-84
+#define XT_ERR_INDEX_LOG_CORRUPT	-85
+#define XT_ERR_TOO_MANY_THREADS		-86
+#define XT_ERR_TOO_MANY_WAITERS		-87
+#define XT_ERR_INDEX_OLD_VERSION	-88
+#define XT_ERR_PBXT_TABLE_EXISTS	-89
+#define XT_ERR_SERVER_RUNNING		-90
+#define XT_ERR_INDEX_MISSING		-91
+#define XT_ERR_RECORD_DELETED		-92
+#define XT_ERR_NEW_TYPE_OF_XLOG		-93
+#define XT_ERR_NO_BEFORE_IMAGE		-94
+#define XT_ERR_FK_REF_TEMP_TABLE	-95
+#define XT_ERR_MYSQL_SHUTDOWN		-98
+#define XT_ERR_MYSQL_NO_THREAD		-99
+#define XT_ERR_BUFFER_TOO_SMALL		-100
+#define XT_ERR_BAD_BACKUP_FORMAT	-101
+#define XT_ERR_PBXT_NOT_INSTALLED	-102
+
+#ifdef XT_WIN
+#define XT_ENOMEM					ERROR_NOT_ENOUGH_MEMORY
+#define XT_EAGAIN					ERROR_RETRY
+#define XT_EBUSY					ERROR_BUSY
+#else
+#define XT_ENOMEM					ENOMEM
+#define XT_EAGAIN					EAGAIN
+#define XT_EBUSY					EBUSY
+#endif
diff --git a/storage/sphinx/CMakeLists.txt b/storage/sphinx/CMakeLists.txt
new file mode 100644
index 00000000000..15034f615d3
--- /dev/null
+++ b/storage/sphinx/CMakeLists.txt
@@ -0,0 +1,8 @@
+SET(SPHINX_SOURCES ha_sphinx.cc)
+# ws2_32 (Winsock) only exists on Windows; requesting it elsewhere breaks the link step
+IF(WIN32)
+  LINK_LIBRARIES(ws2_32)
+ENDIF(WIN32)
+SET(SPHINX_PLUGIN_STATIC  "sphinx")
+SET(SPHINX_PLUGIN_DYNAMIC "ha_sphinx")
+MYSQL_ADD_PLUGIN(sphinx ${SPHINX_SOURCES} STORAGE_ENGINE)
diff --git a/storage/sphinx/Makefile.am b/storage/sphinx/Makefile.am
new file mode 100644
index 00000000000..5f58d673547
--- /dev/null
+++ b/storage/sphinx/Makefile.am
@@ -0,0 +1,55 @@
+# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+#called from the top level Makefile
+
+MYSQLDATAdir =          $(localstatedir)
+MYSQLSHAREdir =         $(pkgdatadir)
+MYSQLBASEdir=           $(prefix)
+MYSQLLIBdir=            $(pkglibdir)
+pkgplugindir =          $(pkglibdir)/plugin
+INCLUDES =              -I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(top_srcdir)/sql \
+                        -I$(srcdir)
+
+DEFS= @DEFS@ -D_REENTRANT -D_PTHREADS -DMYSQL_SERVER
+
+noinst_HEADERS =	ha_sphinx.h
+
+EXTRA_LTLIBRARIES =	ha_sphinx.la
+pkgplugin_LTLIBRARIES = @plugin_sphinx_shared_target@ sphinx.la
+
+ha_sphinx_la_LDFLAGS =	-module -rpath $(MYSQLLIBdir) \
+			-L$(top_builddir)/libservices -lmysqlservices
+ha_sphinx_la_CXXFLAGS=	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_sphinx_la_CFLAGS =	$(AM_CFLAGS) -DMYSQL_DYNAMIC_PLUGIN
+ha_sphinx_la_SOURCES =	ha_sphinx.cc
+
+sphinx_la_LDFLAGS = -module
+sphinx_la_CXXFLAGS = $(AM_CFLAGS)
+sphinx_la_CFLAGS = $(AM_CFLAGS)
+sphinx_la_SOURCES = snippets_udf.cc
+
+EXTRA_LIBRARIES =	libsphinx.a
+noinst_LIBRARIES =	@plugin_sphinx_static_target@
+libsphinx_a_CXXFLAGS =	$(AM_CFLAGS)
+libsphinx_a_CFLAGS =	$(AM_CFLAGS)
+libsphinx_a_SOURCES=	ha_sphinx.cc
+
+EXTRA_DIST =		CMakeLists.txt
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/sphinx/gen_data.php b/storage/sphinx/gen_data.php
new file mode 100644
index 00000000000..dac374f095d
--- /dev/null
+++ b/storage/sphinx/gen_data.php
@@ -0,0 +1,37 @@
+<?php
+
+// Emits one REPLACE statement with 100000 random documents for test.documents;
+// words come from the space-separated text file named by the first argument.
+if (!isset($argv[1]))
+{
+  fwrite(STDERR, "usage: php gen_data.php WORDS_FILE\n");
+  exit(1);
+}
+$cont= @file_get_contents($argv[1]);
+if ($cont === false)
+{
+  fwrite(STDERR, "gen_data.php: cannot read ".$argv[1]."\n");
+  exit(1);
+}
+
+$words= explode(" ", $cont);
+$cw = count($words);
+
+// Returns $count randomly chosen words (index 0 included), each followed by a space.
+function random_text($words, $cw, $count)
+{
+  $text = "";
+  for ($j=0; $j<$count; $j++)
+    $text .= chop($words[mt_rand(0, $cw-1)])." ";
+  return $text;
+}
+
+echo "REPLACE INTO test.documents ( id, group_id, date_added, title, content ) VALUES\n";
+for ($i=1; $i<=100000; $i++)
+{
+  $pred = random_text($words, $cw, mt_rand(10,30));
+  $tit = random_text($words, $cw, mt_rand(3,5));
+  echo "($i,".mt_rand(1,20).",NOW(),'".addslashes($tit)."','".addslashes($pred)."'),\n";
+}
+echo "(0,1,now(),'end','eND');\n";
+?>
diff --git a/storage/sphinx/ha_sphinx.cc b/storage/sphinx/ha_sphinx.cc
new file mode 100644
index 00000000000..e88464eb16c
--- /dev/null
+++ b/storage/sphinx/ha_sphinx.cc
@@ -0,0 +1,3115 @@
+//
+// $Id: ha_sphinx.cc 2058 2009-11-07 04:01:57Z shodan $
+//
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation // gcc: Class implementation
+#endif
+
+#if _MSC_VER>=1400
+#define _CRT_SECURE_NO_DEPRECATE 1
+#define _CRT_NONSTDC_NO_DEPRECATE 1
+#endif
+
+#include <mysql_version.h>
+
+#if MYSQL_VERSION_ID>50100
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#else
+#include "../mysql_priv.h"
+#endif
+
+#include <mysys_err.h>
+#include <my_sys.h>
+
+#ifndef __WIN__
+	// UNIX-specific
+	#include <my_net.h>
+	#include <netdb.h>
+	#include <sys/un.h>
+
+	#define	RECV_FLAGS	MSG_WAITALL
+
+	#define sphSockClose(_sock)	::close(_sock)
+#else
+	// Windows-specific
+	#include <io.h>
+	#define strcasecmp	stricmp
+	#define snprintf	_snprintf
+
+	#define	RECV_FLAGS	0
+
+	#define sphSockClose(_sock)	::closesocket(_sock)
+#endif
+
+#include <ctype.h>
+#include "ha_sphinx.h"
+
+#ifndef MSG_WAITALL
+#define MSG_WAITALL 0
+#endif
+
+#if _MSC_VER>=1400
+#pragma warning(push,4)
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+
+/// there might be issues with min() on different platforms (eg. Gentoo, they say)
+#define Min(a,b) ((a)<(b)?(a):(b))
+
+/// unaligned RAM accesses are forbidden on SPARC
+#if defined(sparc) || defined(__sparc__)
+#define UNALIGNED_RAM_ACCESS 0
+#else
+#define UNALIGNED_RAM_ACCESS 1
+#endif
+
+#if MYSQL_VERSION_ID<50100
+#define thd_ha_data(X,Y) (X)->ha_data[sphinx_hton.slot]
+#define ha_thd()         current_thd
+#endif // <50100
+
+#if UNALIGNED_RAM_ACCESS
+
+/// pass-through wrapper
+template < typename T > inline T sphUnalignedRead ( const T & tRef )
+{
+	return tRef;
+}
+
+/// pass-through wrapper
+template < typename T > void sphUnalignedWrite ( void * pPtr, const T & tVal )
+{
+	*(T*)pPtr = tVal;
+}
+
+#else
+
+/// unaligned read wrapper for some architectures (eg. SPARC)
+template < typename T >
+inline T sphUnalignedRead ( const T & tRef )
+{
+	T uTmp;
+	byte * pSrc = (byte *) &tRef;
+	byte * pDst = (byte *) &uTmp;
+	for ( int i=0; i<(int)sizeof(T); i++ )
+		*pDst++ = *pSrc++;
+	return uTmp;
+}
+
+/// unaligned write wrapper for some architectures (eg. SPARC)
+template < typename T >
+void sphUnalignedWrite ( void * pPtr, const T & tVal )
+{
+	byte * pDst = (byte *) pPtr;
+	byte * pSrc = (byte *) &tVal;
+	for ( int i=0; i<(int)sizeof(T); i++ )
+		*pDst++ = *pSrc++;
+}
+
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+
+// FIXME! make this all dynamic
+#define SPHINXSE_MAX_FILTERS		32
+
+#define SPHINXSE_DEFAULT_HOST		"127.0.0.1"
+#define SPHINXSE_DEFAULT_PORT		9312
+#define SPHINXSE_DEFAULT_INDEX		"*"
+
+#define SPHINXSE_SYSTEM_COLUMNS		3
+
+#define SPHINXSE_MAX_ALLOC			(16*1024*1024)
+#define SPHINXSE_MAX_KEYWORDSTATS	4096
+
+// FIXME! all the following is cut-n-paste from sphinx.h and searchd.cpp
+#define SPHINX_VERSION		"0.9.9"
+
+enum
+{
+	SPHINX_SEARCHD_PROTO	= 1,
+	SEARCHD_COMMAND_SEARCH	= 0,
+	VER_COMMAND_SEARCH		= 0x116,
+};
+
+/// search query sorting orders
+enum ESphSortOrder
+{
+	SPH_SORT_RELEVANCE		= 0,	///< sort by document relevance desc, then by date
+	SPH_SORT_ATTR_DESC		= 1,	///< sort by document date desc, then by relevance desc
+	SPH_SORT_ATTR_ASC		= 2,	///< sort by document date asc, then by relevance desc
+	SPH_SORT_TIME_SEGMENTS	= 3,	///< sort by time segments (hour/day/week/etc) desc, then by relevance desc
+	SPH_SORT_EXTENDED		= 4,	///< sort by SQL-like expression (eg. "@relevance DESC, price ASC, @id DESC")
+	SPH_SORT_EXPR			= 5,	///< sort by expression
+
+	SPH_SORT_TOTAL
+};
+
+/// search query matching mode
+enum ESphMatchMode
+{
+	SPH_MATCH_ALL = 0,			///< match all query words
+	SPH_MATCH_ANY,				///< match any query word
+	SPH_MATCH_PHRASE,			///< match this exact phrase
+	SPH_MATCH_BOOLEAN,			///< match this boolean query
+	SPH_MATCH_EXTENDED,			///< match this extended query
+	SPH_MATCH_FULLSCAN,			///< match all document IDs w/o fulltext query, apply filters
+	SPH_MATCH_EXTENDED2,		///< extended engine V2
+
+	SPH_MATCH_TOTAL
+};
+
+/// search query relevance ranking mode
+enum ESphRankMode
+{
+	SPH_RANK_PROXIMITY_BM25		= 0,	///< default mode, phrase proximity major factor and BM25 minor one
+	SPH_RANK_BM25				= 1,	///< statistical mode, BM25 ranking only (faster but worse quality)
+	SPH_RANK_NONE				= 2,	///< no ranking, all matches get a weight of 1
+	SPH_RANK_WORDCOUNT			= 3,	///< simple word-count weighting, rank is a weighted sum of per-field keyword occurrence counts
+	SPH_RANK_PROXIMITY			= 4,	///< phrase proximity
+	SPH_RANK_MATCHANY			= 5,	///< emulate old match-any weighting
+	SPH_RANK_FIELDMASK			= 6,	///< sets bits where there were matches
+
+	SPH_RANK_TOTAL,						///< one past the last ranker above, i.e. the number of rankers
+	SPH_RANK_DEFAULT			= SPH_RANK_PROXIMITY_BM25	///< alias for the default ranker
+};
+
+/// search query grouping mode
+enum ESphGroupBy
+{
+	SPH_GROUPBY_DAY		= 0,	///< group by day
+	SPH_GROUPBY_WEEK	= 1,	///< group by week
+	SPH_GROUPBY_MONTH	= 2,	///< group by month
+	SPH_GROUPBY_YEAR	= 3,	///< group by year
+	SPH_GROUPBY_ATTR	= 4		///< group by attribute value
+};
+
+/// known attribute types
+enum
+{
+	SPH_ATTR_NONE		= 0,			///< not an attribute at all
+	SPH_ATTR_INTEGER	= 1,			///< this attr is just an integer
+	SPH_ATTR_TIMESTAMP	= 2,			///< this attr is a timestamp
+	SPH_ATTR_ORDINAL	= 3,			///< this attr is an ordinal string number (integer at search time, specially handled at indexing time)
+	SPH_ATTR_BOOL		= 4,			///< this attr is a boolean bit field
+	SPH_ATTR_FLOAT		= 5,
+	SPH_ATTR_BIGINT		= 6,
+
+	SPH_ATTR_MULTI		= 0x40000000UL	///< this attr has multiple values (0 or more)
+};
+
+/// known answers
+enum
+{
+	SEARCHD_OK		= 0,	///< general success, command-specific reply follows
+	SEARCHD_ERROR	= 1,	///< general failure, error message follows
+	SEARCHD_RETRY	= 2,	///< temporary failure, error message follows, client should retry later
+	SEARCHD_WARNING	= 3		///< general success, warning message and command-specific reply follow
+};
+
+//////////////////////////////////////////////////////////////////////////////
+
+#define SPHINX_DEBUG_OUTPUT		0
+#define SPHINX_DEBUG_CALLS		0
+
+#include <stdarg.h>
+
+#if SPHINX_DEBUG_OUTPUT
+inline void SPH_DEBUG ( const char * format, ... )
+{
+	va_list ap;
+	va_start ( ap, format );
+	fprintf ( stderr, "SphinxSE: " );
+	vfprintf ( stderr, format, ap );
+	fprintf ( stderr, "\n" );
+	va_end ( ap );
+}
+#else
+inline void SPH_DEBUG ( const char *, ... ) {}
+#endif
+
+#if SPHINX_DEBUG_CALLS
+
+#define SPH_ENTER_FUNC() { SPH_DEBUG ( "enter %s", __FUNCTION__ ); }
+#define SPH_ENTER_METHOD() { SPH_DEBUG ( "enter %s(this=%08x)", __FUNCTION__, this ); }
+#define SPH_RET(_arg) { SPH_DEBUG ( "leave %s", __FUNCTION__ ); return _arg; }
+#define SPH_VOID_RET() { SPH_DEBUG ( "leave %s", __FUNCTION__ ); return; }
+
+#else
+
+#define SPH_ENTER_FUNC()
+#define SPH_ENTER_METHOD()
+#define SPH_RET(_arg) { return(_arg); }
+#define SPH_VOID_RET() { return; }
+
+#endif
+
+
+#define SafeDelete(_arg)		{ if ( _arg ) delete ( _arg );		(_arg) = NULL; }
+#define SafeDeleteArray(_arg)	{ if ( _arg ) delete [] ( _arg );	(_arg) = NULL; }
+
+//////////////////////////////////////////////////////////////////////////////
+
+/// a structure that will be shared among all open Sphinx SE handlers
+struct CSphSEShare
+{
+	pthread_mutex_t	m_tMutex;
+	THR_LOCK		m_tLock;
+
+	char *			m_sTable;
+	char *			m_sScheme;
+	char *			m_sHost;	///< points into m_sScheme buffer, DO NOT FREE EXPLICITLY
+	char *			m_sSocket;	///< points into m_sScheme buffer, DO NOT FREE EXPLICITLY
+	char *			m_sIndex;	///< points into m_sScheme buffer, DO NOT FREE EXPLICITLY
+	ushort			m_iPort;
+	uint			m_iTableNameLen;
+	uint			m_iUseCount;
+	CHARSET_INFO *	m_pTableQueryCharset;
+
+	int					m_iTableFields;
+	char **				m_sTableField;
+	enum_field_types *	m_eTableFieldType;
+
+	CSphSEShare ()
+		: m_sTable ( NULL )
+		, m_sScheme ( NULL )
+		, m_sHost ( NULL )
+		, m_sSocket ( NULL )
+		, m_sIndex ( NULL )
+		, m_iPort ( 0 )
+		, m_iTableNameLen ( 0 )
+		, m_iUseCount ( 1 )
+		, m_pTableQueryCharset ( NULL )
+
+		, m_iTableFields ( 0 )
+		, m_sTableField ( NULL )
+		, m_eTableFieldType ( NULL )
+	{
+		thr_lock_init ( &m_tLock );
+		pthread_mutex_init ( &m_tMutex, MY_MUTEX_INIT_FAST );
+	}
+
+	~CSphSEShare ()
+	{
+		pthread_mutex_destroy ( &m_tMutex );
+		thr_lock_delete ( &m_tLock );
+
+		SafeDeleteArray ( m_sTable );
+		SafeDeleteArray ( m_sScheme );
+		ResetTable ();
+	}
+
+	void ResetTable ()
+	{
+		for ( int i=0; i<m_iTableFields; i++ )
+			SafeDeleteArray ( m_sTableField[i] );
+		SafeDeleteArray ( m_sTableField );
+		SafeDeleteArray ( m_eTableFieldType );
+	}
+};
+
+/// schema attribute
+struct CSphSEAttr
+{
+	char *			m_sName;		///< attribute name (received from Sphinx)
+	uint32			m_uType;		///< attribute type (received from Sphinx)
+	int				m_iField;		///< field index in current table (-1 if none)
+
+	CSphSEAttr()
+		: m_sName ( NULL )
+		, m_uType ( SPH_ATTR_NONE )
+		, m_iField ( -1 )
+	{}
+
+	~CSphSEAttr ()
+	{
+		SafeDeleteArray ( m_sName );
+	}
+};
+
+/// word stats
+struct CSphSEWordStats
+{
+	char *			m_sWord;
+	int				m_iDocs;
+	int				m_iHits;
+
+	CSphSEWordStats ()
+		: m_sWord ( NULL )
+		, m_iDocs ( 0 )
+		, m_iHits ( 0 )
+	{}
+
+	~CSphSEWordStats ()
+	{
+		SafeDeleteArray ( m_sWord );
+	}
+};
+
+/// request stats
+struct CSphSEStats
+{
+public:
+	int					m_iMatchesTotal;
+	int					m_iMatchesFound;
+	int					m_iQueryMsec;
+	int					m_iWords;
+	CSphSEWordStats *	m_dWords;
+	bool				m_bLastError;
+	char				m_sLastMessage[1024];
+
+	CSphSEStats()
+		: m_dWords ( NULL )
+	{
+		Reset ();
+	}
+
+	void Reset ()
+	{
+		m_iMatchesTotal = 0;
+		m_iMatchesFound = 0;
+		m_iQueryMsec = 0;
+		m_iWords = 0;
+		SafeDeleteArray ( m_dWords );
+		m_bLastError = false;
+		m_sLastMessage[0] = '\0';
+	}
+
+	~CSphSEStats()
+	{
+		Reset ();
+	}
+};
+
+/// thread local storage
+struct CSphSEThreadData
+{
+	static const int	MAX_QUERY_LEN	= 262144; // 256k should be enough, right?
+
+	bool				m_bStats;
+	CSphSEStats			m_tStats;
+
+	bool				m_bQuery;
+	char				m_sQuery[MAX_QUERY_LEN];
+
+	CHARSET_INFO *		m_pQueryCharset;
+
+	CSphSEThreadData ()
+		: m_bStats ( false )
+		, m_bQuery ( false )
+		, m_pQueryCharset ( NULL )
+	{}
+};
+
+/// filter types
+enum ESphFilter
+{
+	SPH_FILTER_VALUES		= 0,	///< filter by integer values set
+	SPH_FILTER_RANGE		= 1,	///< filter by integer range
+	SPH_FILTER_FLOATRANGE	= 2		///< filter by float range
+};
+
+
+/// search query filter
+struct CSphSEFilter
+{
+public:
+	ESphFilter		m_eType;
+	char *			m_sAttrName;
+	longlong		m_uMinValue;
+	longlong		m_uMaxValue;
+	float			m_fMinValue;
+	float			m_fMaxValue;
+	int				m_iValues;
+	longlong *		m_pValues;
+	int				m_bExclude;
+
+public:
+	CSphSEFilter ()
+		: m_eType ( SPH_FILTER_VALUES )
+		, m_sAttrName ( NULL )
+		, m_uMinValue ( 0 )
+		, m_uMaxValue ( UINT_MAX )
+		, m_fMinValue ( 0.0f )
+		, m_fMaxValue ( 0.0f )
+		, m_iValues ( 0 )
+		, m_pValues ( NULL )
+		, m_bExclude ( 0 )
+	{
+	}
+
+	~CSphSEFilter ()
+	{
+		SafeDeleteArray ( m_pValues );
+	}
+};
+
+
+/// float vs dword conversion
+inline uint32 sphF2DW ( float f )	{ union { float f; uint32 d; } u; u.f = f; return u.d; }
+
+/// dword vs float conversion
+inline float sphDW2F ( uint32 d )	{ union { float f; uint32 d; } u; u.d = d; return u.f; }
+
+
+/// client-side search query
+struct CSphSEQuery
+{
+public:
+	const char *	m_sHost;
+	int				m_iPort;
+
+private:
+	char *			m_sQueryBuffer;
+
+	const char *	m_sIndex;
+	int				m_iOffset;
+	int				m_iLimit;
+
+	bool			m_bQuery;
+	char *			m_sQuery;
+	uint32 *		m_pWeights;
+	int				m_iWeights;
+	ESphMatchMode	m_eMode;
+	ESphRankMode	m_eRanker;
+	ESphSortOrder	m_eSort;
+	char *			m_sSortBy;
+	int				m_iMaxMatches;
+	int				m_iMaxQueryTime;
+	uint32			m_iMinID;
+	uint32			m_iMaxID;
+
+	int				m_iFilters;
+	CSphSEFilter	m_dFilters[SPHINXSE_MAX_FILTERS];
+
+	ESphGroupBy		m_eGroupFunc;
+	char *			m_sGroupBy;
+	char *			m_sGroupSortBy;
+	int				m_iCutoff;
+	int				m_iRetryCount;
+	int				m_iRetryDelay;
+	char *			m_sGroupDistinct;							///< points to query buffer; do NOT delete
+	int				m_iIndexWeights;
+	char *			m_sIndexWeight[SPHINXSE_MAX_FILTERS];		///< points to query buffer; do NOT delete
+	int				m_iIndexWeight[SPHINXSE_MAX_FILTERS];
+	int				m_iFieldWeights;
+	char *			m_sFieldWeight[SPHINXSE_MAX_FILTERS];		///< points to query buffer; do NOT delete
+	int				m_iFieldWeight[SPHINXSE_MAX_FILTERS];
+
+	bool			m_bGeoAnchor;
+	char *			m_sGeoLatAttr;
+	char *			m_sGeoLongAttr;
+	float			m_fGeoLatitude;
+	float			m_fGeoLongitude;
+
+	char *			m_sComment;
+
+	struct Override_t
+	{
+		union Value_t
+		{
+			uint32		m_uValue;
+			longlong	m_iValue64;
+			float		m_fValue;
+		};
+		char *						m_sName; ///< points to query buffer
+		int							m_iType;
+		Dynamic_array<ulonglong>	m_dIds;
+		Dynamic_array<Value_t>		m_dValues;
+	};
+	Dynamic_array<Override_t *> m_dOverrides;
+
+public:
+	char			m_sParseError[256];
+
+public:
+	CSphSEQuery ( const char * sQuery, int iLength, const char * sIndex );
+	~CSphSEQuery ();
+
+	bool			Parse ();
+	int				BuildRequest ( char ** ppBuffer );
+
+protected:
+	char *			m_pBuf;
+	char *			m_pCur;
+	int				m_iBufLeft;
+	bool			m_bBufOverrun;
+
+	template < typename T > int ParseArray ( T ** ppValues, const char * sValue );
+	bool			ParseField ( char * sField );
+
+	void			SendBytes ( const void * pBytes, int iBytes );
+	void			SendWord ( short int v )		{ v = ntohs(v); SendBytes ( &v, sizeof(short int) ); }
+	void			SendInt ( int v )				{ v = ntohl(v); SendBytes ( &v, sizeof(int) ); }
+	void			SendDword ( uint v )			{ v = ntohl(v) ;SendBytes ( &v, sizeof(uint) ); }
+	void			SendUint64 ( ulonglong v )		{ SendDword ( uint(v>>32) ); SendDword ( uint(v&0xFFFFFFFFUL) ); }
+	void			SendString ( const char * v )	{ int iLen = strlen(v); SendDword(iLen); SendBytes ( v, iLen ); }
+	void			SendFloat ( float v )			{ SendDword ( sphF2DW(v) ); }
+};
+
+template int CSphSEQuery::ParseArray<uint32> ( uint32 **, const char * );
+template int CSphSEQuery::ParseArray<longlong> ( longlong **, const char * );
+
+//////////////////////////////////////////////////////////////////////////////
+
+#if MYSQL_VERSION_ID>50100
+
+#if MYSQL_VERSION_ID<50114
+#error Sphinx SE requires MySQL 5.1.14 or higher if compiling for 5.1.x series!
+#endif
+
+static handler *	sphinx_create_handler ( handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root );
+static int			sphinx_init_func ( void * p );
+static int			sphinx_close_connection ( handlerton * hton, THD * thd );
+static int			sphinx_panic ( handlerton * hton, enum ha_panic_function flag );
+static bool			sphinx_show_status ( handlerton * hton, THD * thd, stat_print_fn * stat_print, enum ha_stat_type stat_type );
+
+#else
+
+static bool			sphinx_init_func_for_handlerton ();
+static int			sphinx_close_connection ( THD * thd );
+bool				sphinx_show_status ( THD * thd );
+
+#endif // >50100
+
+//////////////////////////////////////////////////////////////////////////////
+
+static const char	sphinx_hton_name[]		= "SPHINX";
+static const char	sphinx_hton_comment[]	= "Sphinx storage engine " SPHINX_VERSION;
+
+#if MYSQL_VERSION_ID<50100
+handlerton sphinx_hton =
+{
+	#ifdef MYSQL_HANDLERTON_INTERFACE_VERSION
+	MYSQL_HANDLERTON_INTERFACE_VERSION,
+	#endif
+	sphinx_hton_name,
+	SHOW_OPTION_YES,
+	sphinx_hton_comment,
+	DB_TYPE_SPHINX_DB,
+	sphinx_init_func_for_handlerton,
+	0,							// slot
+	0,							// savepoint size
+	sphinx_close_connection,	// close_connection
+	NULL,	// savepoint
+	NULL,	// rollback to savepoint
+	NULL,	// release savepoint
+	NULL,	// commit
+	NULL,	// rollback
+	NULL,	// prepare
+	NULL,	// recover
+	NULL,	// commit_by_xid
+	NULL,	// rollback_by_xid
+	NULL,	// create_cursor_read_view
+	NULL,	// set_cursor_read_view
+	NULL,	// close_cursor_read_view
+	HTON_CAN_RECREATE
+};
+#else
+static handlerton * sphinx_hton_ptr = NULL;
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+
+// variables for Sphinx shared methods
+pthread_mutex_t		sphinx_mutex;		// mutex to init the hash
+static int			sphinx_init = 0;	// flag whether the hash was initialized
+static HASH			sphinx_open_tables;	// hash used to track open tables
+
+//////////////////////////////////////////////////////////////////////////////
+// INITIALIZATION AND SHUTDOWN
+//////////////////////////////////////////////////////////////////////////////
+
+// hashing function
+#if MYSQL_VERSION_ID>=50120
+typedef size_t GetKeyLength_t;
+#else
+typedef uint GetKeyLength_t;
+#endif
+
+static byte * sphinx_get_key ( const byte * pSharePtr, GetKeyLength_t * pLength, my_bool )
+{
+	const CSphSEShare & tShare = *(const CSphSEShare *) pSharePtr; // the hashed record is a share
+	*pLength = (size_t) tShare.m_iTableNameLen; // key length: stored table name length
+	return (byte*) tShare.m_sTable; // key bytes: the table name itself
+}
+
+#if MYSQL_VERSION_ID<50100
+static int sphinx_init_func ( void * ) // to avoid unused arg warning
+#else
+static int sphinx_init_func ( void * p )
+#endif
+{
+	SPH_ENTER_FUNC();
+	if ( !sphinx_init )
+	{
+		sphinx_init = 1;
+		VOID ( pthread_mutex_init ( &sphinx_mutex, MY_MUTEX_INIT_FAST ) );
+		hash_init ( &sphinx_open_tables, system_charset_info, 32, 0, 0,
+			sphinx_get_key, 0, 0 );
+
+		#if MYSQL_VERSION_ID > 50100
+		handlerton * hton = (handlerton*) p;
+		hton->state				= SHOW_OPTION_YES;
+		hton->db_type			= DB_TYPE_AUTOASSIGN;
+		hton->create			= sphinx_create_handler;
+		hton->close_connection	= sphinx_close_connection;
+		hton->show_status		= sphinx_show_status;
+		hton->panic				= sphinx_panic;
+		hton->flags				= HTON_CAN_RECREATE;
+		sphinx_hton_ptr = hton;
+		#endif
+	}
+	SPH_RET(0);
+}
+
+
+#if MYSQL_VERSION_ID<50100
+static bool sphinx_init_func_for_handlerton ()
+{
+	return sphinx_init_func ( &sphinx_hton );
+}
+#endif
+
+
+#if MYSQL_VERSION_ID>50100
+
+static int sphinx_close_connection ( handlerton * hton, THD * thd )
+{
+	// deallocate common handler data
+	SPH_ENTER_FUNC();
+	void ** tmp = thd_ha_data ( thd, hton );
+	CSphSEThreadData * pTls = (CSphSEThreadData*) (*tmp);
+	SafeDelete ( pTls );
+	*tmp = NULL;
+	SPH_RET(0);
+}
+
+
+static int sphinx_done_func ( void * )
+{
+	SPH_ENTER_FUNC();
+
+	// tears down the shared hash and its mutex, at most once per successful init;
+	// always reports success: shares still left in the hash are freed together
+	// with it and are deliberately not turned into an error code
+	if ( sphinx_init )
+	{
+		sphinx_init = 0;
+		hash_free ( &sphinx_open_tables );
+		pthread_mutex_destroy ( &sphinx_mutex );
+	}
+
+	SPH_RET(0);
+}
+
+
+static int sphinx_panic ( handlerton * hton, enum ha_panic_function )
+{
+	return sphinx_done_func ( hton );
+}
+
+#else
+
+static int sphinx_close_connection ( THD * thd )
+{
+	// deallocate common handler data
+	SPH_ENTER_FUNC();
+	CSphSEThreadData * pTls = (CSphSEThreadData*) thd->ha_data[sphinx_hton.slot];
+	SafeDelete ( pTls );
+	thd->ha_data[sphinx_hton.slot] = NULL;
+	SPH_RET(0);
+}
+
+#endif // >50100
+
+//////////////////////////////////////////////////////////////////////////////
+// SHOW STATUS
+//////////////////////////////////////////////////////////////////////////////
+
+#if MYSQL_VERSION_ID>50100
+static bool sphinx_show_status ( handlerton * hton, THD * thd, stat_print_fn * stat_print,
+	enum ha_stat_type )
+#else
+bool sphinx_show_status ( THD * thd )
+#endif
+{
+	SPH_ENTER_FUNC();
+	// reports the calling thread's last-query stats; TRUE (failure) is only returned on the pre-5.1 path
+#if MYSQL_VERSION_ID<50100
+	Protocol * protocol = thd->protocol;
+	List<Item> field_list;
+#endif
+
+	// buf1 holds the totals row; buf2 accumulates one "word:docs:hits " entry per keyword
+	char buf1[IO_SIZE];
+	uint buf1len;
+	char buf2[IO_SIZE];
+	uint buf2len= 0;
+
+	buf1[0] = '\0';
+	buf2[0] = '\0';
+
+#if MYSQL_VERSION_ID>50100
+	CSphSEThreadData * pTls = (CSphSEThreadData*) ( *thd_ha_data ( thd, hton ) );
+#else
+	if ( have_sphinx_db!=SHOW_OPTION_YES )
+	{
+		my_message ( ER_NOT_SUPPORTED_YET,
+			"failed to call SHOW SPHINX STATUS: --skip-sphinx was specified",
+			MYF(0) );
+		SPH_RET(TRUE);
+	}
+	CSphSEThreadData * pTls = (CSphSEThreadData*) thd->ha_data[sphinx_hton.slot];
+#endif
+
+	if ( pTls && pTls->m_bStats )
+	{
+		const CSphSEStats * pStats = &pTls->m_tStats;
+		buf1len = my_snprintf ( buf1, sizeof(buf1),
+			"total: %d, total found: %d, time: %d, words: %d", 
+			pStats->m_iMatchesTotal, pStats->m_iMatchesFound, pStats->m_iQueryMsec, pStats->m_iWords );
+
+#if MYSQL_VERSION_ID>50100
+		stat_print ( thd, sphinx_hton_name, strlen(sphinx_hton_name),
+			STRING_WITH_LEN("stats"), buf1, buf1len );
+#else
+		field_list.push_back ( new Item_empty_string ( "Type",10 ) );
+		field_list.push_back ( new Item_empty_string ( "Name",FN_REFLEN ) );
+		field_list.push_back ( new Item_empty_string ( "Status",10 ) );
+		if ( protocol->send_fields ( &field_list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF ) )
+			SPH_RET(TRUE);
+
+		protocol->prepare_for_resend ();
+		protocol->store ( STRING_WITH_LEN("SPHINX"), system_charset_info );
+		protocol->store ( STRING_WITH_LEN("stats"), system_charset_info );
+		protocol->store ( buf1, buf1len, system_charset_info );
+		if ( protocol->write() )
+			SPH_RET(TRUE);
+#endif
+		// per-keyword stats: each entry is appended at buf2+buf2len, so buf2 is never its own format source
+		if ( pStats->m_iWords )
+		{
+			for ( int i=0; i<pStats->m_iWords; i++ )
+			{
+				const CSphSEWordStats & tWord = pStats->m_dWords[i];
+				buf2len += my_snprintf ( buf2+buf2len, sizeof(buf2)-buf2len, "%s:%d:%d ",
+					tWord.m_sWord, tWord.m_iDocs, tWord.m_iHits );
+			}
+
+			// convert from the query charset to the system charset when the former is known
+			const char * sWord = buf2;
+			int iWord = buf2len;
+
+			String sBuf3;
+			if ( pTls->m_pQueryCharset )
+			{
+				uint iErrors;
+				sBuf3.copy ( buf2, buf2len, pTls->m_pQueryCharset, system_charset_info, &iErrors );
+				sWord = sBuf3.c_ptr();
+				iWord = sBuf3.length();
+			}
+
+#if MYSQL_VERSION_ID>50100
+			stat_print ( thd, sphinx_hton_name, strlen(sphinx_hton_name),
+				STRING_WITH_LEN("words"), sWord, iWord );
+#else
+			protocol->prepare_for_resend ();
+			protocol->store ( STRING_WITH_LEN("SPHINX"), system_charset_info );
+			protocol->store ( STRING_WITH_LEN("words"), system_charset_info );
+			protocol->store ( sWord, iWord, system_charset_info );
+			if ( protocol->write() )
+				SPH_RET(TRUE);
+#endif
+		}
+
+		// send last error or warning (m_sLastMessage is an array, so only its first byte is tested)
+		if ( pStats->m_sLastMessage[0] )
+		{
+			const char * sMessageType = pStats->m_bLastError ? "error" : "warning";
+
+#if MYSQL_VERSION_ID>50100
+			stat_print ( thd, sphinx_hton_name, strlen(sphinx_hton_name),
+				sMessageType, strlen(sMessageType), pStats->m_sLastMessage, strlen(pStats->m_sLastMessage) );
+#else
+			protocol->prepare_for_resend ();
+			protocol->store ( STRING_WITH_LEN("SPHINX"), system_charset_info );
+			protocol->store ( sMessageType, strlen(sMessageType), system_charset_info );
+			protocol->store ( pStats->m_sLastMessage, strlen(pStats->m_sLastMessage), system_charset_info );
+			if ( protocol->write() )
+				SPH_RET(TRUE);
+#endif
+		}
+
+	} else
+	{
+		#if MYSQL_VERSION_ID < 50100
+		field_list.push_back ( new Item_empty_string ( "Type", 10 ) );
+		field_list.push_back ( new Item_empty_string ( "Name", FN_REFLEN ) );
+		field_list.push_back ( new Item_empty_string ( "Status", 10 ) );
+		if ( protocol->send_fields ( &field_list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF ) )
+			SPH_RET(TRUE);
+
+		protocol->prepare_for_resend ();
+		protocol->store ( STRING_WITH_LEN("SPHINX"), system_charset_info );
+		protocol->store ( STRING_WITH_LEN("stats"), system_charset_info );
+		protocol->store ( STRING_WITH_LEN("no query has been executed yet"), system_charset_info );
+		if ( protocol->write() )
+			SPH_RET(TRUE);
+		#endif
+	}
+
+	#if MYSQL_VERSION_ID < 50100
+	send_eof(thd);
+	#endif
+
+	SPH_RET(FALSE);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// HELPERS
+//////////////////////////////////////////////////////////////////////////////
+
+static char * sphDup ( const char * sSrc, int iLen=-1 )
+{
+	// returns a new[]-allocated, NUL-terminated copy of the first iLen bytes
+	// of sSrc (the whole string when iLen is negative); NULL input yields NULL
+	if ( sSrc==NULL )
+		return NULL;
+
+	const int iBytes = ( iLen>=0 ) ? iLen : (int) strlen ( sSrc );
+	char * sCopy = new char [ iBytes+1 ];
+	memcpy ( sCopy, sSrc, iBytes );
+	sCopy[iBytes] = '\0';
+	return sCopy;
+}
+
+
+static void sphLogError ( const char * sFmt, ... )
+{
+	// emit timestamp
+#ifdef __WIN__
+	SYSTEMTIME t;
+	GetLocalTime ( &t );
+
+	fprintf ( stderr, "%02d%02d%02d %2d:%02d:%02d SphinxSE: internal error: ",
+		(int)t.wYear % 100, (int)t.wMonth, (int)t.wDay,
+		(int)t.wHour, (int)t.wMinute, (int)t.wSecond );
+#else
+	// Unix version
+	time_t tStamp;
+	time ( &tStamp );
+
+	struct tm * pParsed;
+#ifdef HAVE_LOCALTIME_R
+	struct tm tParsed;
+	localtime_r ( &tStamp, &tParsed );
+	pParsed = &tParsed;
+#else
+	pParsed = localtime ( &tStamp );
+#endif // HAVE_LOCALTIME_R
+
+	fprintf ( stderr, "%02d%02d%02d %2d:%02d:%02d SphinxSE: internal error: ",
+		pParsed->tm_year % 100, pParsed->tm_mon + 1, pParsed->tm_mday,
+		pParsed->tm_hour, pParsed->tm_min, pParsed->tm_sec);
+#endif // __WIN__
+
+	// emit message
+	va_list ap;
+	va_start ( ap, sFmt );
+	vfprintf ( stderr, sFmt, ap );
+	va_end ( ap );
+
+	// emit newline
+	fprintf ( stderr, "\n" );
+}
+
+
+
+// Parses table->s->connect_string; with a non-NULL share it also refreshes the share's
+// field list and stores scheme buffer, host, index and port there (NULL share: parse only).
+// A malformed string is reported through my_error() and false is returned.
+// Recognized forms (inet:// is accepted as a synonym for sphinx://):
+//   sphinx://host/index, sphinx://host:port/index,
+//   unix://unix/domain/socket:index, unix://unix/domain/socket
+static bool ParseUrl ( CSphSEShare * share, TABLE * table, bool bCreate )
+{
+	SPH_ENTER_FUNC();
+
+	// check incoming stuff; table is dereferenced below even when share is NULL
+	if ( !table )
+	{
+		sphLogError ( "table==NULL in ParseUrl()" );
+		SPH_RET(false);
+	}
+	if ( !table->s )
+	{
+		sphLogError ( "(table->s)==NULL in ParseUrl()" );
+		SPH_RET(false);
+	}
+
+	if ( share )
+	{
+		// free old stuff
+		share->ResetTable ();
+
+		// fill new stuff
+		share->m_iTableFields = table->s->fields;
+		if ( share->m_iTableFields )
+		{
+			share->m_sTableField = new char * [ share->m_iTableFields ];
+			share->m_eTableFieldType = new enum_field_types [ share->m_iTableFields ];
+
+			for ( int i=0; i<share->m_iTableFields; i++ )
+			{
+				share->m_sTableField[i] = sphDup ( table->field[i]->field_name );
+				share->m_eTableFieldType[i] = table->field[i]->type();
+			}
+		}
+	}
+
+	char * sScheme = NULL;
+	char * sHost = (char*) SPHINXSE_DEFAULT_HOST;
+	char * sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+	int iPort = SPHINXSE_DEFAULT_PORT;
+	// an empty connection string keeps the defaults; the loop never repeats, it is a block to break out of
+	bool bOk = true;
+	while ( table->s->connect_string.length!=0 )
+	{
+		bOk = false;
+		sScheme = sphDup ( table->s->connect_string.str, table->s->connect_string.length );
+		// cut the scheme off at "://"; sHost is left pointing at the second slash
+		sHost = strstr ( sScheme, "://" );
+		if ( !sHost )
+			break;
+		sHost[0] = '\0';
+		sHost += 2;
+
+		if ( !strcmp ( sScheme, "unix" ) )
+		{
+			// unix-domain socket; that slash stays as the leading '/' of the path
+			iPort = 0;
+			if (!( sIndex = strrchr ( sHost, ':' ) ))
+				sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+			else
+			{
+				*sIndex++ = '\0';
+				if ( !*sIndex )
+					sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+			}
+			bOk = true;
+			break;
+		}
+		if( strcmp ( sScheme, "sphinx" )!=0 && strcmp ( sScheme, "inet" )!=0 )
+			break;
+
+		// tcp; skip the slash, then split host[:port][/index]
+		sHost++;
+		char * sPort = strchr ( sHost, ':' );
+		if ( sPort )
+		{
+			*sPort++ = '\0';
+			if ( *sPort )
+			{
+				sIndex = strchr ( sPort, '/' );
+				if ( sIndex )
+					*sIndex++ = '\0';
+				else
+					sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+
+				iPort = atoi(sPort);
+				if ( !iPort )
+					iPort = SPHINXSE_DEFAULT_PORT;
+			}
+		} else
+		{
+			sIndex = strchr ( sHost, '/' );
+			if ( sIndex )
+				*sIndex++ = '\0';
+			else
+				sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+		}
+
+		bOk = true;
+		break;
+	}
+
+	if ( !bOk )
+	{
+		my_error ( bCreate ? ER_FOREIGN_DATA_STRING_INVALID_CANT_CREATE : ER_FOREIGN_DATA_STRING_INVALID,
+			MYF(0), table->s->connect_string.str );
+	} else if ( share )
+	{
+		// the share takes over the buffer; host and index point into it or at the built-in defaults
+		SafeDeleteArray ( share->m_sScheme );
+		share->m_sScheme = sScheme;
+		share->m_sHost = sHost;
+		share->m_sIndex = sIndex;
+		share->m_iPort = (ushort)iPort;
+		sScheme = NULL;
+	}
+
+	// release the parse buffer unless the share took it over above
+	// (covers both the failed-parse and the parse-only cases)
+	SafeDeleteArray ( sScheme );
+	SPH_RET(bOk);
+}
+
+
+// Returns the share hashed under table_name in sphinx_open_tables, taking one more
+// reference on it, or creates, sets up and hashes a new one. Yields NULL when the
+// connection string cannot be parsed or the hash insert fails. Holds sphinx_mutex throughout.
+static CSphSEShare * get_share ( const char * table_name, TABLE * table )
+{
+	SPH_ENTER_FUNC();
+	pthread_mutex_lock ( &sphinx_mutex );
+
+	CSphSEShare * pShare = NULL;
+	for ( ;; ) // every path ends in break; the loop only serves as a block with early exits
+	{
+		// check if we already have this share (the key pointer type differs between server versions)
+#if MYSQL_VERSION_ID>=50120
+		pShare = (CSphSEShare*) hash_search ( &sphinx_open_tables, (const uchar *) table_name, strlen(table_name) );
+#else
+#ifdef __WIN__
+		pShare = (CSphSEShare*) hash_search ( &sphinx_open_tables, (const byte *) table_name, strlen(table_name) );
+#else
+		pShare = (CSphSEShare*) hash_search ( &sphinx_open_tables, table_name, strlen(table_name) );
+#endif // win
+#endif // pre-5.1.20
+
+		if ( pShare )
+		{
+			pShare->m_iUseCount++;
+			break;
+		}
+
+		// try to allocate new share (its constructor starts m_iUseCount at 1)
+		pShare = new CSphSEShare ();
+		if ( !pShare )
+			break;
+
+		// try to set it up; the query charset is taken from the table's third column (field[2])
+		pShare->m_pTableQueryCharset = table->field[2]->charset();
+		if ( !ParseUrl ( pShare, table, false ) )
+		{
+			SafeDelete ( pShare ); // also resets pShare to NULL, which is what gets returned
+			break;
+		}
+
+		// try to hash it under the table name
+		pShare->m_iTableNameLen = strlen(table_name);
+		pShare->m_sTable = sphDup ( table_name );
+		if ( my_hash_insert ( &sphinx_open_tables, (const byte *)pShare ) )
+		{
+			SafeDelete ( pShare );
+			break;
+		}
+
+		// all seems fine; pShare is the freshly hashed share
+		break;
+	}
+
+	pthread_mutex_unlock ( &sphinx_mutex );
+	SPH_RET(pShare);
+}
+
+
+// Release one reference to pShare (called when a table is closed). Once the
+// use count reaches zero the share is removed from sphinx_open_tables and
+// destroyed. Runs with sphinx_mutex held; always returns 0.
+static int free_share ( CSphSEShare * pShare )
+{
+	SPH_ENTER_FUNC();
+	pthread_mutex_lock ( &sphinx_mutex );
+
+	pShare->m_iUseCount--;
+	if ( pShare->m_iUseCount==0 )
+	{
+		// that was the last user
+		hash_delete ( &sphinx_open_tables, (byte *)pShare );
+		SafeDelete ( pShare );
+	}
+
+	pthread_mutex_unlock ( &sphinx_mutex );
+	SPH_RET(0);
+}
+
+
+#if MYSQL_VERSION_ID>50100
+// Handler factory: constructs a new ha_sphinx for the given handlerton and
+// table share, using the placement form of new with the supplied mem_root.
+static handler * sphinx_create_handler ( handlerton * hton, TABLE_SHARE * table, MEM_ROOT * mem_root )
+{
+	return new ( mem_root ) ha_sphinx ( hton, table );
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// CLIENT-SIDE REQUEST STUFF
+//////////////////////////////////////////////////////////////////////////////
+
+// Set every query option to its default and take a private, writable copy of
+// the query text.
+//   sQuery, iLength - query text and its length in bytes (no terminator required)
+//   sIndex          - index name to search; "*" is used when NULL
+// The copy gets a trailing ';' appended so that Parse(), which splits the
+// buffer on ';', also sees the last clause as terminated.
+CSphSEQuery::CSphSEQuery ( const char * sQuery, int iLength, const char * sIndex )
+	: m_sHost ( "" )
+	, m_iPort ( 0 )
+	, m_sIndex ( sIndex ? sIndex : (char*) "*" )
+	, m_iOffset ( 0 )
+	, m_iLimit ( 20 )
+	, m_bQuery ( false )
+	, m_sQuery ( (char*) "" )
+	, m_pWeights ( NULL )
+	, m_iWeights ( 0 )
+	, m_eMode ( SPH_MATCH_ALL )
+	, m_eRanker ( SPH_RANK_PROXIMITY_BM25 )
+	, m_eSort ( SPH_SORT_RELEVANCE )
+	, m_sSortBy ( (char*) "" )
+	, m_iMaxMatches ( 1000 )
+	, m_iMaxQueryTime ( 0 )
+	, m_iMinID ( 0 )
+	, m_iMaxID ( 0 )
+	, m_iFilters ( 0 )
+	, m_eGroupFunc ( SPH_GROUPBY_DAY )
+	, m_sGroupBy ( (char*) "" )
+	, m_sGroupSortBy ( (char*) "@group desc" )
+	, m_iCutoff ( 0 )
+	, m_iRetryCount ( 0 )
+	, m_iRetryDelay ( 0 )
+	, m_sGroupDistinct ( (char*) "" )
+	, m_iIndexWeights ( 0 )
+	, m_iFieldWeights ( 0 )
+	, m_bGeoAnchor ( false )
+	, m_sGeoLatAttr ( (char*) "" )
+	, m_sGeoLongAttr ( (char*) "" )
+	, m_fGeoLatitude ( 0.0f )
+	, m_fGeoLongitude ( 0.0f )
+	, m_sComment ( (char*) "" )
+
+	, m_pBuf ( NULL )
+	, m_pCur ( NULL )
+	, m_iBufLeft ( 0 )
+	, m_bBufOverrun ( false )
+{
+	// two extra bytes: one for the appended ';' and one for the terminator
+	m_sQueryBuffer = new char [ iLength+2 ];
+	memcpy ( m_sQueryBuffer, sQuery, iLength );
+	m_sQueryBuffer[iLength]= ';';
+	m_sQueryBuffer[iLength+1]= '\0';
+}
+
+
+// Release the query text copy, the weights array, the request buffer, and
+// every override entry that ParseField() allocated.
+CSphSEQuery::~CSphSEQuery ()
+{
+	SPH_ENTER_METHOD();
+
+	// plain arrays owned by the query
+	SafeDeleteArray ( m_sQueryBuffer );
+	SafeDeleteArray ( m_pWeights );
+	SafeDeleteArray ( m_pBuf );
+
+	// overrides are allocated one by one, so free them one by one
+	int iOverride = 0;
+	while ( iOverride<m_dOverrides.elements() )
+	{
+		SafeDelete ( m_dOverrides.at(iOverride) );
+		iOverride++;
+	}
+
+	SPH_VOID_RET();
+}
+
+
+// Extract every decimal number found in sValue into a newly allocated array.
+//   ppValues - receives the array; must be non-NULL and point to a NULL pointer
+//   sValue   - text to scan; any non-digit character acts as a separator
+// A '-' seen outside of a number makes the next stored value negative.
+// Returns the number of values stored; 0 (and no allocation) when there are none.
+template < typename T >
+int CSphSEQuery::ParseArray ( T ** ppValues, const char * sValue )
+{
+	SPH_ENTER_METHOD();
+
+	assert ( ppValues );
+	assert ( !(*ppValues) );
+
+	// first pass: each run of consecutive digits is one value
+	int iValues = 0;
+	bool bInNumber = false;
+	const char * pScan = sValue;
+	while ( *pScan )
+	{
+		const bool bDigit = ( *pScan>='0' && *pScan<='9' );
+		if ( bDigit && !bInNumber )
+			iValues++;
+		bInNumber = bDigit;
+		pScan++;
+	}
+	if ( !iValues )
+		SPH_RET(0);
+
+	// second pass: convert and store
+	T * pValues = new T [ iValues ];
+	*ppValues = pValues;
+
+	int iOut = 0;
+	int iSign = 1;
+	T uAccum = 0;
+	bInNumber = false;
+
+	// the terminating NUL is processed as well, so that a number
+	// sitting at the very end of the string still gets flushed
+	pScan = sValue;
+	do
+	{
+		const char c = *pScan;
+		const bool bDigit = ( c>='0' && c<='9' );
+
+		if ( bDigit )
+		{
+			if ( !bInNumber )
+				uAccum = 0;
+			uAccum = uAccum*10 + ( c-'0' );
+		} else if ( bInNumber )
+		{
+			assert ( iOut<iValues );
+			pValues [ iOut++ ] = uAccum * iSign;
+			iSign = 1;
+		} else if ( c=='-' )
+		{
+			iSign = -1;
+		}
+		bInNumber = bDigit;
+	} while ( *pScan++ );
+
+	SPH_RET(iValues);
+}
+
+
+// Trim whitespace from both ends of s, in place.
+// Leading whitespace is skipped by advancing the returned pointer; trailing
+// whitespace is cut by writing a terminator into the buffer.
+// Returns a pointer into s at the first non-whitespace character.
+static char * chop ( char * s )
+{
+	// isspace() is only defined for EOF and unsigned char values, so plain
+	// char bytes (which may be negative) go through an unsigned char cast
+	while ( *s && isspace ( (unsigned char)*s ) )
+		s++;
+
+	char * p = s + strlen(s);
+	while ( p>s && isspace ( (unsigned char)p[-1] ) )
+		p--;
+	*p = '\0';
+
+	return s;
+}
+
+
+// Tell whether c may appear in an attribute name: an ASCII letter,
+// an ASCII digit, or an underscore.
+static bool myisattr ( char c )
+{
+	if ( c=='_' )
+		return true;
+	if ( c>='0' && c<='9' )
+		return true;
+	if ( c>='a' && c<='z' )
+		return true;
+	return ( c>='A' && c<='Z' );
+}
+
+
+// Parse one clause of the query string; sField is modified in place.
+// A clause with no usable '=' (none at all, one at position 0, or one preceded
+// by a backslash) is taken as the search query text itself. Any other clause is
+// split into name=value and dispatched on the option name. String-valued
+// options store pointers into sField rather than copies.
+// Returns false, with a message in m_sParseError, for the errors the code
+// detects; malformed range/filter clauses are silently ignored and return true.
+bool CSphSEQuery::ParseField ( char * sField )
+{
+	SPH_ENTER_METHOD();
+
+	// look for option name/value separator
+	char * sValue = strchr ( sField, '=' );
+	if ( !sValue || sValue==sField || sValue[-1]=='\\' )
+	{
+		// by default let's assume it's just query
+		if ( sField[0] )
+		{
+			if ( m_bQuery )
+			{
+				snprintf ( m_sParseError, sizeof(m_sParseError), "search query already specified; '%s' is redundant", sField );
+				SPH_RET(false);
+			} else
+			{
+				m_sQuery = sField;
+				m_bQuery = true;
+
+				// unescape
+				// (drops every backslash, compacting the string in place)
+				char *s = sField, *d = sField;
+				while ( *s )
+				{
+					if ( *s!='\\' ) *d++ = *s;
+					s++;
+				}
+				*d = '\0';
+			}
+		}
+		SPH_RET(true);
+	}
+
+	// split
+	*sValue++ = '\0';
+	sValue = chop ( sValue );
+	// integer view of the value, used by the numeric options below
+	int iValue = atoi ( sValue );
+
+	// handle options
+	char * sName = chop ( sField );
+
+	if ( !strcmp ( sName, "query" ) )			m_sQuery = sValue;
+	else if ( !strcmp ( sName, "host" ) )		m_sHost = sValue;
+	else if ( !strcmp ( sName, "port" ) )		m_iPort = iValue;
+	else if ( !strcmp ( sName, "index" ) )		m_sIndex = sValue;
+	else if ( !strcmp ( sName, "offset" ) )		m_iOffset = iValue;
+	else if ( !strcmp ( sName, "limit" ) )		m_iLimit = iValue;
+	else if ( !strcmp ( sName, "weights" ) )	m_iWeights = ParseArray<uint32> ( &m_pWeights, sValue );
+	else if ( !strcmp ( sName, "minid" ) )		m_iMinID = iValue;
+	else if ( !strcmp ( sName, "maxid" ) )		m_iMaxID = iValue;
+	else if ( !strcmp ( sName, "maxmatches" ) )	m_iMaxMatches = iValue;
+	else if ( !strcmp ( sName, "maxquerytime" ) )	m_iMaxQueryTime = iValue;
+	else if ( !strcmp ( sName, "groupsort" ) )	m_sGroupSortBy = sValue;
+	else if ( !strcmp ( sName, "distinct" ) )	m_sGroupDistinct = sValue;
+	else if ( !strcmp ( sName, "cutoff" ) )		m_iCutoff = iValue;
+	else if ( !strcmp ( sName, "comment" ) )	m_sComment = sValue;
+
+	else if ( !strcmp ( sName, "mode" ) )
+	{
+
+		m_eMode = SPH_MATCH_ALL;
+		if ( !strcmp ( sValue, "any") )				m_eMode = SPH_MATCH_ANY;
+		else if ( !strcmp ( sValue, "phrase" ) )	m_eMode = SPH_MATCH_PHRASE;
+		else if ( !strcmp ( sValue, "boolean") )	m_eMode = SPH_MATCH_BOOLEAN;
+		else if ( !strcmp ( sValue, "ext") )		m_eMode = SPH_MATCH_EXTENDED;
+		else if ( !strcmp ( sValue, "extended") )	m_eMode = SPH_MATCH_EXTENDED;
+		else if ( !strcmp ( sValue, "ext2") )		m_eMode = SPH_MATCH_EXTENDED2;
+		else if ( !strcmp ( sValue, "extended2") )	m_eMode = SPH_MATCH_EXTENDED2;
+		else if ( !strcmp ( sValue, "all") )		m_eMode = SPH_MATCH_ALL;
+		else if ( !strcmp ( sValue, "fullscan") )	m_eMode = SPH_MATCH_FULLSCAN;
+		else
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "unknown matching mode '%s'", sValue );
+			SPH_RET(false);
+		}
+	} else if ( !strcmp ( sName, "ranker" ) )
+	{
+
+		m_eRanker = SPH_RANK_PROXIMITY_BM25;
+		if ( !strcmp ( sValue, "proximity_bm25") )	m_eRanker = SPH_RANK_PROXIMITY_BM25;
+		else if ( !strcmp ( sValue, "bm25" ) )		m_eRanker = SPH_RANK_BM25;
+		else if ( !strcmp ( sValue, "none" ) )		m_eRanker = SPH_RANK_NONE;
+		else if ( !strcmp ( sValue, "wordcount" ) )	m_eRanker = SPH_RANK_WORDCOUNT;
+		else if ( !strcmp ( sValue, "proximity" ) )	m_eRanker = SPH_RANK_PROXIMITY;
+		else if ( !strcmp ( sValue, "matchany" ) )	m_eRanker = SPH_RANK_MATCHANY;
+		else if ( !strcmp ( sValue, "fieldmask" ) )	m_eRanker = SPH_RANK_FIELDMASK;
+		else
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "unknown ranking mode '%s'", sValue );
+			SPH_RET(false);
+		}
+	} else if ( !strcmp ( sName, "sort" ) )
+	{
+		// the value is "<mode prefix><sort-by clause>"; whatever follows the
+		// matched prefix becomes m_sSortBy
+		static const struct 
+		{
+			const char *	m_sName;
+			ESphSortOrder	m_eSort;
+		} dSortModes[] = 
+		{
+			{ "relevance",		SPH_SORT_RELEVANCE },
+			{ "attr_desc:",		SPH_SORT_ATTR_DESC },
+			{ "attr_asc:",		SPH_SORT_ATTR_ASC },
+			{ "time_segments:",	SPH_SORT_TIME_SEGMENTS },
+			{ "extended:",		SPH_SORT_EXTENDED },
+			{ "expr:",			SPH_SORT_EXPR }
+		};
+
+		int i;
+		const int nModes = sizeof(dSortModes)/sizeof(dSortModes[0]);
+		for ( i=0; i<nModes; i++ )
+			if ( !strncmp ( sValue, dSortModes[i].m_sName, strlen(dSortModes[i].m_sName) ) )
+		{
+			m_eSort = dSortModes[i].m_eSort;
+			m_sSortBy = sValue + strlen(dSortModes[i].m_sName);
+			break;
+		}
+		if ( i==nModes )
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "unknown sorting mode '%s'", sValue );
+			SPH_RET(false);
+		}
+
+	} else if ( !strcmp ( sName, "groupby" ) )
+	{
+		// same scheme as "sort": "<function prefix><attribute>"
+		static const struct 
+		{
+			const char *	m_sName;
+			ESphGroupBy		m_eFunc;
+		} dGroupModes[] = 
+		{
+			{ "day:",	SPH_GROUPBY_DAY },
+			{ "week:",	SPH_GROUPBY_WEEK },
+			{ "month:",	SPH_GROUPBY_MONTH },
+			{ "year:",	SPH_GROUPBY_YEAR },
+			{ "attr:",	SPH_GROUPBY_ATTR },
+		};
+
+		int i;
+		const int nModes = sizeof(dGroupModes)/sizeof(dGroupModes[0]);
+		for ( i=0; i<nModes; i++ )
+			if ( !strncmp ( sValue, dGroupModes[i].m_sName, strlen(dGroupModes[i].m_sName) ) )
+		{
+			m_eGroupFunc = dGroupModes[i].m_eFunc;
+			m_sGroupBy = sValue + strlen(dGroupModes[i].m_sName);
+			break;
+		}
+		if ( i==nModes )
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "unknown groupby mode '%s'", sValue );
+			SPH_RET(false);
+		}
+
+	} else if ( m_iFilters<SPHINXSE_MAX_FILTERS &&
+		( !strcmp ( sName, "range" ) || !strcmp ( sName, "!range" ) || !strcmp ( sName, "floatrange" ) || !strcmp ( sName, "!floatrange" ) ) )
+	{
+		// value format: attr,min,max
+		// the loop runs at most once; "break" is used as a structured exit, and
+		// the filter slot only counts if m_iFilters++ is reached
+		for ( ;; )
+		{
+			char * p = sName;
+			CSphSEFilter & tFilter = m_dFilters [ m_iFilters ];
+			tFilter.m_bExclude = ( *p=='!' ); if ( tFilter.m_bExclude ) p++;
+			tFilter.m_eType = ( *p=='f' ) ? SPH_FILTER_FLOATRANGE : SPH_FILTER_RANGE;
+
+			if (!( p = strchr ( sValue, ',' ) ))
+				break;
+			*p++ = '\0';
+
+			tFilter.m_sAttrName = chop ( sValue );
+			sValue = p;
+
+			if (!( p = strchr ( sValue, ',' ) ))
+				break;
+			*p++ = '\0';
+
+			if ( tFilter.m_eType==SPH_FILTER_RANGE )
+			{
+				tFilter.m_uMinValue = strtoll ( sValue, NULL, 0 );
+				tFilter.m_uMaxValue = strtoll ( p, NULL, 0 );
+			} else
+			{
+				tFilter.m_fMinValue = (float)atof(sValue);
+				tFilter.m_fMaxValue = (float)atof(p);
+			}
+
+			// all ok
+			m_iFilters++;
+			break;
+		}
+
+	} else if ( m_iFilters<SPHINXSE_MAX_FILTERS &&
+		( !strcmp ( sName, "filter" ) || !strcmp ( sName, "!filter" ) ) )
+	{
+		// value format: attr followed by a list of integer values
+		// same run-once loop as above
+		for ( ;; )
+		{
+			CSphSEFilter & tFilter = m_dFilters [ m_iFilters ];
+			tFilter.m_eType = SPH_FILTER_VALUES;
+			tFilter.m_bExclude = ( strcmp ( sName, "!filter")==0 );
+
+			// get the attr name
+			while ( (*sValue) && !myisattr(*sValue) )
+				sValue++;
+			if ( !*sValue )
+				break;
+
+			tFilter.m_sAttrName = sValue;
+			while ( (*sValue) && myisattr(*sValue) )
+				sValue++;
+			if ( !*sValue )
+				break;
+			*sValue++ = '\0';
+
+			// get the values
+			tFilter.m_iValues = ParseArray<longlong> ( &tFilter.m_pValues, sValue );
+			if ( !tFilter.m_iValues )
+			{
+				assert ( !tFilter.m_pValues );
+				break;
+			}
+
+			// all ok
+			m_iFilters++;
+			break;
+		}
+
+	} else if ( !strcmp ( sName, "indexweights" ) || !strcmp ( sName, "fieldweights" ) )
+	{
+		// value format: name,weight,name,weight,...
+		// both options share this parser and differ only in the target arrays
+		bool bIndex = !strcmp ( sName, "indexweights" );
+		int * pCount = bIndex ? &m_iIndexWeights : &m_iFieldWeights;
+		char ** pNames = bIndex ? &m_sIndexWeight[0] : &m_sFieldWeight[0];
+		int * pWeights = bIndex ? &m_iIndexWeight[0] : &m_iFieldWeight[0];
+
+		*pCount = 0;
+
+		char * p = sValue;
+		while ( *p && *pCount<SPHINXSE_MAX_FILTERS )
+		{
+			// extract attr name
+			if ( !myisattr(*p) )
+			{
+				snprintf ( m_sParseError, sizeof(m_sParseError), "%s: index name expected near '%s'", sName, p );
+				SPH_RET(false);
+			}
+
+			pNames[*pCount] = p;
+			while ( myisattr(*p) ) p++;
+
+			if ( *p!=',' )
+			{
+				snprintf ( m_sParseError, sizeof(m_sParseError), "%s: comma expected near '%s'", sName, p );
+				SPH_RET(false);
+			}
+			*p++ = '\0';
+
+			// extract attr value
+			char * sVal = p;
+			while ( isdigit(*p) ) p++;
+			if ( p==sVal )
+			{
+				snprintf ( m_sParseError, sizeof(m_sParseError), "%s: integer weight expected near '%s'", sName, sVal );
+				SPH_RET(false);
+			}
+			pWeights[*pCount] = atoi(sVal);
+			(*pCount)++;
+
+			if ( !*p )  break;
+			if ( *p!=',' )
+			{
+				snprintf ( m_sParseError, sizeof(m_sParseError), "%s: comma expected near '%s'", sName, p );
+				SPH_RET(false);
+			}
+			p++;
+		}
+
+	} else if ( !strcmp ( sName, "geoanchor" ) )
+	{
+		// value format: lat-attr,long-attr,latitude,longitude
+		m_bGeoAnchor = false;
+		for ( ;; )
+		{
+			char * sLat = sValue;
+			char * p = sValue;
+
+			if (!( p = strchr ( p, ',' ) )) break; *p++ = '\0';
+			char * sLong = p;
+
+			if (!( p = strchr ( p, ',' ) )) break; *p++ = '\0';
+			char * sLatVal = p;
+
+			if (!( p = strchr ( p, ',' ) )) break; *p++ = '\0';
+			char * sLongVal = p;
+
+			m_sGeoLatAttr = chop(sLat);
+			m_sGeoLongAttr = chop(sLong);
+			m_fGeoLatitude = (float)atof(sLatVal);
+			m_fGeoLongitude = (float)atof(sLongVal);
+			m_bGeoAnchor = true;
+			break;
+		}
+		if ( !m_bGeoAnchor )
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "geoanchor: parse error, not enough comma-separated arguments" );
+			SPH_RET(false);
+		}
+	}
+	else if ( !strcmp ( sName, "override" ) ) // name,type,id:value,id:value,...
+	{
+		// NOTE: this sName shadows the option name declared above; from here on
+		// it refers to the overridden attribute's name
+		char * sName = NULL;
+		int iType = 0;
+		CSphSEQuery::Override_t * pOverride = NULL;
+
+		// get name and type
+		char * sRest = sValue;
+		for ( ;; )
+		{
+			sName = sRest;
+			if ( !*sName )
+				break;
+			
+			if (!( sRest = strchr ( sRest, ',' ) )) break; *sRest++ = '\0';
+			char * sType = sRest;
+			if (!( sRest = strchr ( sRest, ',' ) )) break;
+			
+			static const struct
+			{
+				const char *	m_sName;
+				int				m_iType;
+			}
+			dAttrTypes[] =
+			{
+				{ "int",		SPH_ATTR_INTEGER },
+				{ "timestamp",	SPH_ATTR_TIMESTAMP },
+				{ "bool",		SPH_ATTR_BOOL },
+				{ "float",		SPH_ATTR_FLOAT },
+				{ "bigint",		SPH_ATTR_BIGINT }
+			};
+			// NOTE(review): only the sRest-sType characters actually given are
+			// compared, so a prefix of a type name (or an empty type) also matches
+			for ( uint i=0; i<sizeof(dAttrTypes)/sizeof(*dAttrTypes); i++ )
+				if ( !strncmp( sType, dAttrTypes[i].m_sName, sRest - sType ) )
+			{
+				iType = dAttrTypes[i].m_iType;
+				break;
+			}
+			break;
+		}
+
+		// fail
+		if ( !sName || !*sName  || !iType )
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "override: malformed query" );
+			SPH_RET(false);
+		}
+
+		// grab id:value pairs
+		// (iType is only set after the second comma was found, so sRest points at it here)
+		sRest++;
+		while ( sRest )
+		{
+			char * sId = sRest;
+			if (!( sRest = strchr ( sRest, ':' ) )) break; *sRest++ = '\0';
+			if (!( sRest - sId )) break;
+
+			// NOTE: shadows the outer sValue
+			char * sValue = sRest;
+			if (( sRest = strchr ( sRest, ',' ) )) *sRest++ = '\0';
+			if ( !*sValue )
+				break;
+
+			// the override entry is created lazily, on the first complete pair
+			if ( !pOverride )
+			{
+				pOverride = new CSphSEQuery::Override_t;
+				pOverride->m_sName = chop(sName);
+				pOverride->m_iType = iType;
+				m_dOverrides.append(pOverride);
+			}
+
+			ulonglong uId = strtoull ( sId, NULL, 10 );
+			CSphSEQuery::Override_t::Value_t tValue;
+			if ( iType == SPH_ATTR_FLOAT )
+				tValue.m_fValue = (float)atof(sValue);
+			else if ( iType == SPH_ATTR_BIGINT )
+				tValue.m_iValue64 = strtoll ( sValue, NULL, 10 );
+			else
+				tValue.m_uValue = (uint32)strtoul ( sValue, NULL, 10 );
+			
+			pOverride->m_dIds.append ( uId );
+			pOverride->m_dValues.append ( tValue );
+		}
+
+		if ( !pOverride )
+		{
+			snprintf ( m_sParseError, sizeof(m_sParseError), "override: id:value mapping expected" );
+			SPH_RET(false);
+		}
+		SPH_RET(true);
+	}
+	else
+	{
+		// also reached by range/filter clauses once m_iFilters has hit SPHINXSE_MAX_FILTERS
+		snprintf ( m_sParseError, sizeof(m_sParseError), "unknown parameter '%s'", sName );
+		SPH_RET(false);
+	}
+
+	// !COMMIT handle syntax errors
+
+	SPH_RET(true);
+}
+
+
+// Split m_sQueryBuffer into ';'-terminated clauses and hand each one to
+// ParseField(). A semicolon preceded by a backslash does not end a clause,
+// unless it is the very last character of the buffer.
+// Returns false as soon as one clause fails to parse, true otherwise.
+bool CSphSEQuery::Parse ()
+{
+	SPH_ENTER_METHOD();
+	SPH_DEBUG ( "query [[ %s ]]", m_sQueryBuffer );
+
+	m_bQuery = false;
+
+	char * sClause = m_sQueryBuffer; // start of the clause being collected
+	char * pSemi = strchr ( m_sQueryBuffer, ';' );
+	while ( pSemi )
+	{
+		const bool bEscaped = ( pSemi>m_sQueryBuffer && pSemi[-1]=='\\' && pSemi[1]!='\0' );
+		if ( !bEscaped )
+		{
+			// terminate the clause at the separator and parse it
+			*pSemi = '\0';
+			if ( !ParseField ( sClause ) )
+				SPH_RET(false);
+			sClause = pSemi+1;
+		}
+
+		// keep scanning right after this semicolon
+		pSemi = strchr ( pSemi+1, ';' );
+	}
+
+	SPH_RET(true);
+}
+
+
+// Append iBytes raw bytes from pBytes to the request buffer.
+// If the remaining space is too small nothing is written and the
+// overrun flag is raised instead.
+void CSphSEQuery::SendBytes ( const void * pBytes, int iBytes )
+{
+	SPH_ENTER_METHOD();
+
+	if ( iBytes<=m_iBufLeft )
+	{
+		memcpy ( m_pCur, pBytes, iBytes );
+		m_pCur += iBytes;
+		m_iBufLeft -= iBytes;
+	} else
+	{
+		m_bBufOverrun = true;
+	}
+
+	SPH_VOID_RET();
+}
+
+
+// Serialize the parsed query into a search request packet.
+// The exact packet size is computed first, a buffer of that size is allocated
+// into m_pBuf (replacing any previous one), and the fields are then written
+// with the Send*() helpers. *ppBuffer receives m_pBuf, which stays owned by
+// this object.
+// Returns the packet length in bytes, or -1 if allocation fails or the number
+// of bytes written does not match the computed size.
+int CSphSEQuery::BuildRequest ( char ** ppBuffer )
+{
+	SPH_ENTER_METHOD();
+
+	// calc request length
+	// (124 is presumably the total of the fixed-size fields and string length
+	// prefixes sent below - TODO confirm against the Send*() calls)
+	int iReqSize = 124 + 4*m_iWeights
+		+ strlen ( m_sSortBy )
+		+ strlen ( m_sQuery )
+		+ strlen ( m_sIndex )
+		+ strlen ( m_sGroupBy )
+		+ strlen ( m_sGroupSortBy )
+		+ strlen ( m_sGroupDistinct )
+		+ strlen ( m_sComment );
+	for ( int i=0; i<m_iFilters; i++ )
+	{
+		const CSphSEFilter & tFilter = m_dFilters[i];
+		iReqSize += 12 + strlen ( tFilter.m_sAttrName ); // string attr-name; int type; int exclude-flag
+		switch ( tFilter.m_eType )
+		{
+			case SPH_FILTER_VALUES:		iReqSize += 4 + 8*tFilter.m_iValues; break;
+			case SPH_FILTER_RANGE:		iReqSize += 16; break;
+			case SPH_FILTER_FLOATRANGE:	iReqSize += 8; break;
+		}
+	}
+	if ( m_bGeoAnchor ) // 1.14+
+		iReqSize += 16 + strlen ( m_sGeoLatAttr ) + strlen  ( m_sGeoLongAttr );
+	for ( int i=0; i<m_iIndexWeights; i++ ) // 1.15+
+		iReqSize += 8 + strlen(m_sIndexWeight[i] );
+	for ( int i=0; i<m_iFieldWeights; i++ ) // 1.18+
+		iReqSize += 8 + strlen(m_sFieldWeight[i] );
+	// overrides
+	iReqSize += 4;
+	for ( int i=0; i<m_dOverrides.elements(); i++ )
+	{
+		CSphSEQuery::Override_t * pOverride = m_dOverrides.at(i);
+		const uint32 uSize = pOverride->m_iType == SPH_ATTR_BIGINT ? 16 : 12; // id64 + value
+		iReqSize += strlen ( pOverride->m_sName ) + 12 + uSize*pOverride->m_dIds.elements();
+	}
+	// select
+	iReqSize += 4;
+		
+	// drop the previous request buffer, if any, and allocate an exact-size one
+	m_iBufLeft = 0;
+	SafeDeleteArray ( m_pBuf );
+
+	m_pBuf = new char [ iReqSize ];
+	if ( !m_pBuf )
+		SPH_RET(-1);
+
+	m_pCur = m_pBuf;
+	m_iBufLeft = iReqSize;
+	m_bBufOverrun = false;
+	(*ppBuffer) = m_pBuf;
+
+	// build request
+	SendWord ( SEARCHD_COMMAND_SEARCH ); // command id
+	SendWord ( VER_COMMAND_SEARCH ); // command version
+	SendInt ( iReqSize-8 ); // packet body length
+
+	SendInt ( 1 ); // number of queries
+	SendInt ( m_iOffset );
+	SendInt ( m_iLimit );
+	SendInt ( m_eMode );
+	SendInt ( m_eRanker ); // 1.16+
+	SendInt ( m_eSort );
+	SendString ( m_sSortBy ); // sort attr
+	SendString ( m_sQuery ); // query
+	SendInt ( m_iWeights );
+	for ( int j=0; j<m_iWeights; j++ )
+		SendInt ( m_pWeights[j] ); // weights
+	SendString ( m_sIndex ); // indexes
+	SendInt ( 1 ); // id64 range follows
+	SendUint64 ( m_iMinID ); // id/ts ranges
+	SendUint64 ( m_iMaxID );
+
+	SendInt ( m_iFilters );
+	for ( int j=0; j<m_iFilters; j++ )
+	{
+		const CSphSEFilter & tFilter = m_dFilters[j];
+		SendString ( tFilter.m_sAttrName );
+		SendInt ( tFilter.m_eType );
+
+		// the payload layout depends on the filter type; the byte counts
+		// must agree with the size computation above
+		switch ( tFilter.m_eType )
+		{
+			case SPH_FILTER_VALUES:
+				SendInt ( tFilter.m_iValues );
+				for ( int k=0; k<tFilter.m_iValues; k++ )
+					SendUint64 ( tFilter.m_pValues[k] );
+				break;
+
+			case SPH_FILTER_RANGE:
+				SendUint64 ( tFilter.m_uMinValue );
+				SendUint64 ( tFilter.m_uMaxValue );
+				break;
+
+			case SPH_FILTER_FLOATRANGE:
+				SendFloat ( tFilter.m_fMinValue );
+				SendFloat ( tFilter.m_fMaxValue );
+				break;
+		}
+
+		SendInt ( tFilter.m_bExclude );
+	}
+
+	SendInt ( m_eGroupFunc );
+	SendString ( m_sGroupBy );
+	SendInt ( m_iMaxMatches );
+	SendString ( m_sGroupSortBy );
+	SendInt ( m_iCutoff ); // 1.9+
+	SendInt ( m_iRetryCount ); // 1.10+
+	SendInt ( m_iRetryDelay );
+	SendString ( m_sGroupDistinct ); // 1.11+
+	SendInt ( m_bGeoAnchor ); // 1.14+
+	if ( m_bGeoAnchor )
+	{
+		SendString ( m_sGeoLatAttr );
+		SendString ( m_sGeoLongAttr );
+		SendFloat ( m_fGeoLatitude );
+		SendFloat ( m_fGeoLongitude );
+	}
+	SendInt ( m_iIndexWeights ); // 1.15+
+	for ( int i=0; i<m_iIndexWeights; i++ )
+	{
+		SendString ( m_sIndexWeight[i] );
+		SendInt ( m_iIndexWeight[i] );
+	}
+	SendInt ( m_iMaxQueryTime ); // 1.17+
+	SendInt ( m_iFieldWeights ); // 1.18+
+	for ( int i=0; i<m_iFieldWeights; i++ )
+	{
+		SendString ( m_sFieldWeight[i] );
+		SendInt ( m_iFieldWeight[i] );
+	}
+	SendString ( m_sComment );
+
+	// overrides
+	SendInt ( m_dOverrides.elements() );
+	for ( int i=0; i<m_dOverrides.elements(); i++ )
+	{
+		CSphSEQuery::Override_t * pOverride = m_dOverrides.at(i);
+		SendString ( pOverride->m_sName );
+		SendDword ( pOverride->m_iType );
+		SendInt ( pOverride->m_dIds.elements() );
+		for ( int j=0; j<pOverride->m_dIds.elements(); j++ )
+		{
+			SendUint64 ( pOverride->m_dIds.at(j) );
+			if ( pOverride->m_iType == SPH_ATTR_FLOAT )
+				SendFloat ( pOverride->m_dValues.at(j).m_fValue );
+			else if ( pOverride->m_iType == SPH_ATTR_BIGINT )
+				SendUint64 ( pOverride->m_dValues.at(j).m_iValue64 );
+			else
+				SendDword ( pOverride->m_dValues.at(j).m_uValue );
+		}
+	}
+
+	// select
+	SendString ( "" );
+
+	// detect buffer overruns and underruns, and report internal error
+	if ( m_bBufOverrun || m_iBufLeft!=0 || m_pCur-m_pBuf!=iReqSize )
+		SPH_RET(-1);
+
+	// all fine
+	SPH_RET(iReqSize);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// SPHINX HANDLER
+//////////////////////////////////////////////////////////////////////////////
+
+// File extension list handed out by ha_sphinx::bas_ext(); it holds nothing
+// but the terminating NullS entry.
+static const char * ha_sphinx_exts[] = { NullS };
+
+
+// Construct a handler with no share attached and empty response/schema state.
+// The signature, and the handlerton passed to the base class, depend on the
+// server version: before 5.1 the file-level sphinx_hton is used, from 5.1 on
+// the handlerton supplied by the caller.
+#if MYSQL_VERSION_ID<50100
+ha_sphinx::ha_sphinx ( TABLE_ARG * table )
+	: handler ( &sphinx_hton, table )
+#else
+ha_sphinx::ha_sphinx ( handlerton * hton, TABLE_ARG * table )
+	: handler ( hton, table )
+#endif
+	, m_pShare ( NULL )
+	, m_iMatchesTotal ( 0 )
+	, m_iCurrentPos ( 0 )
+	, m_pCurrentKey ( NULL )
+	, m_iCurrentKeyLen ( 0 )
+	, m_pResponse ( NULL )
+	, m_pResponseEnd ( NULL )
+	, m_pCur ( NULL )
+	, m_bUnpackError ( false )
+	, m_iFields ( 0 )
+	, m_dFields ( NULL )
+	, m_iAttrs ( 0 )
+	, m_dAttrs ( NULL )
+	, m_bId64 ( 0 )
+	, m_dUnboundFields ( NULL )
+{
+	SPH_ENTER_METHOD();
+	SPH_VOID_RET();
+}
+
+
+// If frm_error() is called then we will use this to find out what file extensions
+// exist for the storage engine. This is also used by the default rename_table and
+// delete_table method in handler.cc.
+//
+// Returns the file-level ha_sphinx_exts array.
+const char ** ha_sphinx::bas_ext() const
+{
+	return ha_sphinx_exts;
+}
+
+
+// Open the table: attach the shared per-table descriptor for `name`,
+// initialize this handler's lock data against it, and clear this engine's
+// per-thread data slot for the table's current thread.
+// Tables are cached by the server, so this is not called for every request.
+//
+// Called from handler.cc by handler::ha_open().
+// Returns 0 on success, 1 when no share could be obtained.
+int ha_sphinx::open ( const char * name, int, uint )
+{
+	SPH_ENTER_METHOD();
+
+	m_pShare = get_share ( name, table );
+	if ( m_pShare )
+	{
+		thr_lock_data_init ( &m_pShare->m_tLock, &m_tLock, NULL );
+		*thd_ha_data ( table->in_use, ht ) = NULL;
+		SPH_RET(0);
+	}
+
+	SPH_RET(1);
+}
+
+
+// Open a connection to searchd and perform the initial version exchange.
+//   sQueryHost - host (or UNIX socket path) given in the query; when NULL or
+//                empty, the host stored in the table share is used
+//   iQueryPort - port given in the query; when 0, the port from the share is used
+// A non-zero effective port selects TCP; a zero port makes the host be treated
+// as a UNIX socket path (not available on Windows).
+// Returns the connected socket, or -1 after reporting the failure via my_error().
+int ha_sphinx::ConnectToSearchd ( const char * sQueryHost, int iQueryPort )
+{
+	SPH_ENTER_METHOD();
+
+	struct sockaddr_in sin;
+#ifndef __WIN__
+	struct sockaddr_un saun;
+#endif
+
+	int iDomain = 0;
+	int iSockaddrSize = 0;
+	struct sockaddr * pSockaddr = NULL;
+
+	in_addr_t ip_addr;
+	int version;
+	uint uClientVersion = htonl ( SPHINX_SEARCHD_PROTO );
+
+	const char * sHost = ( sQueryHost && *sQueryHost ) ? sQueryHost : m_pShare->m_sHost;
+	ushort iPort = iQueryPort ? (ushort)iQueryPort : m_pShare->m_iPort;
+
+	if ( iPort )
+	{
+		iDomain = AF_INET;
+		iSockaddrSize = sizeof(sin);
+		pSockaddr = (struct sockaddr *) &sin;
+
+		memset ( &sin, 0, sizeof(sin) );
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(iPort);
+
+		// prepare host address
+		if ( (int)( ip_addr=inet_addr(sHost) ) != (int)INADDR_NONE )
+		{
+			// dotted-quad address; use it as is
+			memcpy ( &sin.sin_addr, &ip_addr, sizeof(ip_addr) );
+		} else
+		{
+			// not a numeric address; resolve the name
+			int tmp_errno;
+			struct hostent tmp_hostent, *hp;
+			char buff2 [ GETHOSTBYNAME_BUFF_SIZE ];
+
+			hp = my_gethostbyname_r ( sHost, &tmp_hostent,
+				buff2, sizeof(buff2), &tmp_errno );
+			if ( !hp )
+			{
+				my_gethostbyname_r_free();
+
+				char sError[256];
+				my_snprintf ( sError, sizeof(sError), "failed to resolve searchd host (name=%s)", sHost );
+
+				my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+				SPH_RET(-1);
+			}
+
+			memcpy ( &sin.sin_addr, hp->h_addr,
+				Min ( sizeof(sin.sin_addr), (size_t)hp->h_length ) );
+			my_gethostbyname_r_free();
+		}
+	} else
+	{
+#ifndef __WIN__
+		iDomain = AF_UNIX;
+		iSockaddrSize = sizeof(saun);
+		pSockaddr = (struct sockaddr *) &saun;
+
+		// saun is zeroed first and at most sizeof-1 bytes are copied,
+		// so sun_path always stays terminated
+		memset ( &saun, 0, sizeof(saun) );
+		saun.sun_family = AF_UNIX;
+		strncpy ( saun.sun_path, sHost, sizeof(saun.sun_path)-1 );
+#else
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), "UNIX sockets are not supported on Windows" );
+		SPH_RET(-1);
+#endif
+	}
+
+	char sError[512];
+	int iSocket = socket ( iDomain, SOCK_STREAM, 0 );
+
+	if ( iSocket<0 )
+	{
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), "failed to create client socket" );
+		SPH_RET(-1);
+	}
+
+	if ( connect ( iSocket, pSockaddr, iSockaddrSize )<0 )
+	{
+		// save errno first; closing the socket may overwrite it, and the
+		// message must report why connect() failed
+		const int iConnectErrno = errno;
+		sphSockClose ( iSocket );
+		my_snprintf ( sError, sizeof(sError), "failed to connect to searchd (host=%s, errno=%d, port=%d)",
+			sHost, iConnectErrno, iPort );
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		SPH_RET(-1);
+	}
+
+	// handshake, step 1: the server sends its version first
+	if ( ::recv ( iSocket, (char *)&version, sizeof(version), 0 )!=sizeof(version) )
+	{
+		sphSockClose ( iSocket );
+		my_snprintf ( sError, sizeof(sError), "failed to receive searchd version (host=%s, port=%d)",
+			sHost, iPort );
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		SPH_RET(-1);
+	}
+
+	// handshake, step 2: reply with the client protocol version
+	if ( ::send ( iSocket, (char*)&uClientVersion, sizeof(uClientVersion), 0 )!=sizeof(uClientVersion) )
+	{
+		sphSockClose ( iSocket );
+		my_snprintf ( sError, sizeof(sError), "failed to send client version (host=%s, port=%d)",
+			sHost, iPort );
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		SPH_RET(-1);
+	}
+
+	SPH_RET(iSocket);
+}
+
+
+// Closes a table. We call the free_share() function to free any resources
+// that we have allocated in the "shared" structure.
+//
+// Called from sql_base.cc, sql_select.cc, and table.cc.
+// In sql_select.cc it is only used to close up temporary tables or during
+// the process where a temporary table is converted over to being a
+// myisam table.
+// For sql_base.cc look at close_data_tables().
+//
+// Returns whatever free_share() returns.
+int ha_sphinx::close()
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( free_share(m_pShare) );
+}
+
+
+// Row insertion is not implemented by this handler;
+// always returns HA_ERR_WRONG_COMMAND.
+int ha_sphinx::write_row ( uchar * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// Row update is not implemented by this handler;
+// always returns HA_ERR_WRONG_COMMAND.
+int ha_sphinx::update_row ( const uchar *, uchar * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// Row deletion is not implemented by this handler;
+// always returns HA_ERR_WRONG_COMMAND.
+int ha_sphinx::delete_row ( const uchar * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// keynr is key (index) number
+// sorted is 1 if result MUST be sorted according to index
+//
+// Only records keynr as the active index; the "sorted" argument is ignored.
+// Always returns 0.
+int ha_sphinx::index_init ( uint keynr, bool )
+{
+	SPH_ENTER_METHOD();
+	active_index = keynr;
+	SPH_RET(0);
+}
+
+
+// Counterpart of index_init(); there is nothing to release, always returns 0.
+int ha_sphinx::index_end()
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+
+// Read the next 32-bit value from the response, converting it from network
+// byte order, and advance the read cursor past it.
+// If fewer than four bytes remain, the cursor is moved to the end of the
+// response, m_bUnpackError is set, and 0 is returned.
+uint32 ha_sphinx::UnpackDword ()
+{
+	if ( m_pCur+sizeof(uint32)<=m_pResponseEnd )
+	{
+		const uint32 uValue = ntohl ( sphUnalignedRead ( *(uint32*)m_pCur ) );
+		m_pCur += sizeof(uint32);
+		return uValue;
+	}
+
+	// response is too short
+	m_pCur = m_pResponseEnd;
+	m_bUnpackError = true;
+	return 0;
+}
+
+
+// Read a length-prefixed string from the response.
+// Returns a newly allocated, NUL-terminated copy that the caller must release
+// with delete[], or NULL when the length is zero or the string cannot be read.
+// If the response is too short, or the copy cannot be allocated, the cursor is
+// moved to the end of the response and m_bUnpackError is set.
+char * ha_sphinx::UnpackString ()
+{
+	uint32 iLen = UnpackDword ();
+	if ( !iLen )
+		return NULL;
+
+	// iLen comes off the wire, so compare it with the number of bytes left
+	// instead of forming m_pCur+iLen, which could wrap around for huge lengths.
+	// A non-zero result from UnpackDword() means the cursor is still within
+	// the response, so the difference below cannot be negative.
+	if ( iLen > (size_t)( m_pResponseEnd-m_pCur ) )
+	{
+		m_pCur = m_pResponseEnd;
+		m_bUnpackError = true;
+		return NULL;
+	}
+
+	// size computed in size_t so that 1+iLen cannot wrap in 32 bits
+	char * sRes = new char [ (size_t)iLen+1 ];
+	if ( !sRes )
+	{
+		// treat an allocation failure like any other unpack failure
+		m_pCur = m_pResponseEnd;
+		m_bUnpackError = true;
+		return NULL;
+	}
+
+	memcpy ( sRes, m_pCur, iLen );
+	sRes[iLen] = '\0';
+	m_pCur += iLen;
+	return sRes;
+}
+
+
+// Substitute a printable placeholder for a NULL string pointer.
+static inline const char * FixNull ( const char * s )
+{
+	if ( !s )
+		return "(null)";
+	return s;
+}
+
+
+// Unpack the head of a searchd response: status (plus message, if any), the
+// field list, the attribute list, the match count, and the id64 flag. Each
+// attribute is then bound to a table column by name, and the map of table
+// columns that received no attribute (m_dUnboundFields) is rebuilt.
+// Returns false, after reporting via my_error(), on a searchd error status, an
+// allocation failure, a truncated packet, or when 64-bit document ids are
+// announced but the first table column is not MYSQL_TYPE_LONGLONG.
+bool ha_sphinx::UnpackSchema ()
+{
+	SPH_ENTER_METHOD();
+
+	// cleanup
+	if ( m_dFields )
+		for ( int i=0; i<(int)m_iFields; i++ )
+			SafeDeleteArray ( m_dFields[i] );
+	SafeDeleteArray ( m_dFields );
+
+	// unpack network packet
+	uint32 uStatus = UnpackDword ();
+	char * sMessage = NULL;
+
+	if ( uStatus!=SEARCHD_OK )
+	{
+		sMessage = UnpackString ();
+
+		// UnpackString() returns NULL for an empty or truncated message;
+		// never pass that on to strncpy() or a "%s" format
+		const char * sSafeMessage = FixNull ( sMessage );
+
+		CSphSEThreadData * pTls = GetTls ();
+		if ( pTls )
+		{
+			// strncpy() does not terminate on truncation, so do it explicitly
+			const size_t iMaxLen = sizeof(pTls->m_tStats.m_sLastMessage)-1;
+			strncpy ( pTls->m_tStats.m_sLastMessage, sSafeMessage, iMaxLen );
+			pTls->m_tStats.m_sLastMessage[iMaxLen] = '\0';
+			pTls->m_tStats.m_bLastError = ( uStatus==SEARCHD_ERROR );
+		}
+
+		if ( uStatus==SEARCHD_ERROR )
+		{
+			char sError[1024];
+			my_snprintf ( sError, sizeof(sError), "searchd error: %s", sSafeMessage );
+			my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+			SafeDeleteArray ( sMessage );
+			SPH_RET ( false );
+		}
+
+		// non-fatal status: the message has been copied above, release our copy
+		SafeDeleteArray ( sMessage );
+	}
+
+	m_iFields = UnpackDword ();
+	m_dFields = new char * [ m_iFields ];
+	if ( !m_dFields )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: UnpackSchema() failed (fields alloc error)" );
+		SPH_RET(false);
+	}
+
+	for ( uint32 i=0; i<m_iFields; i++ )
+		m_dFields[i] = UnpackString ();
+
+	SafeDeleteArray ( m_dAttrs );
+	m_iAttrs = UnpackDword ();
+	m_dAttrs = new CSphSEAttr [ m_iAttrs ];
+	if ( !m_dAttrs )
+	{
+		for ( int i=0; i<(int)m_iFields; i++ )
+			SafeDeleteArray ( m_dFields[i] );
+		SafeDeleteArray ( m_dFields );
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: UnpackSchema() failed (attrs alloc error)" );
+		SPH_RET(false);
+	}
+
+	// mark every attribute as unbound up front: the unpack loop below may stop
+	// early, and the unbound fields pass reads m_iField for all of them
+	for ( uint32 i=0; i<m_iAttrs; i++ )
+		m_dAttrs[i].m_iField = -1;
+
+	for ( uint32 i=0; i<m_iAttrs; i++ )
+	{
+		m_dAttrs[i].m_sName = UnpackString ();
+		m_dAttrs[i].m_uType = UnpackDword ();
+		if ( m_bUnpackError ) // m_sName may be null
+			break;
+
+		// an empty attribute name also comes back as NULL; it cannot match
+		// any column, so leave the attribute unbound
+		if ( !m_dAttrs[i].m_sName )
+			continue;
+
+		for ( int j=SPHINXSE_SYSTEM_COLUMNS; j<m_pShare->m_iTableFields; j++ )
+		{
+			const char * sTableField = m_pShare->m_sTableField[j];
+			const char * sAttrField = m_dAttrs[i].m_sName;
+			if ( m_dAttrs[i].m_sName[0]=='@' )
+			{
+				// "@name" attributes bind to columns called "_sph_name"
+				const char * sAtPrefix = "_sph_";
+				if ( strncmp ( sTableField, sAtPrefix, strlen(sAtPrefix) ) )
+					continue;
+				sTableField += strlen(sAtPrefix);
+				sAttrField++;
+			}
+
+			if ( !strcasecmp ( sAttrField, sTableField ) )
+			{
+				// we're almost good, but
+				// let's enforce that timestamp columns can only receive timestamp attributes
+				if ( m_pShare->m_eTableFieldType[j]!=MYSQL_TYPE_TIMESTAMP || m_dAttrs[i].m_uType==SPH_ATTR_TIMESTAMP )
+					m_dAttrs[i].m_iField = j;
+				break;
+			}
+		}
+	}
+
+	m_iMatchesTotal = UnpackDword ();
+
+	m_bId64 = UnpackDword ();
+	if ( m_bId64 && m_pShare->m_eTableFieldType[0] != MYSQL_TYPE_LONGLONG )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: 1st column must be bigint to accept 64-bit DOCIDs" );
+		SPH_RET(false);
+	}
+
+	// network packet unpacked; build unbound fields map
+	SafeDeleteArray ( m_dUnboundFields );
+	m_dUnboundFields = new int [ m_pShare->m_iTableFields ];
+
+	for ( int i=0; i<m_pShare->m_iTableFields; i++ )
+	{
+		if ( i<SPHINXSE_SYSTEM_COLUMNS )
+			m_dUnboundFields[i] = SPH_ATTR_NONE;
+
+		else if ( m_pShare->m_eTableFieldType[i]==MYSQL_TYPE_TIMESTAMP )
+			m_dUnboundFields[i] = SPH_ATTR_TIMESTAMP;
+
+		else
+			m_dUnboundFields[i] = SPH_ATTR_INTEGER;
+	}
+
+	// columns that did receive an attribute are not unbound
+	for ( uint32 i=0; i<m_iAttrs; i++ )
+		if ( m_dAttrs[i].m_iField>=0 )
+			m_dUnboundFields [ m_dAttrs[i].m_iField ] = SPH_ATTR_NONE;
+
+	if ( m_bUnpackError )
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: UnpackSchema() failed (unpack error)" );
+
+	SPH_RET(!m_bUnpackError);
+}
+
+
+// Read the statistics block that follows the match list, without consuming
+// the matches: the cursor is saved, moved past up to m_iMatchesTotal matches,
+// the totals and per-word statistics are unpacked into pStats, and the cursor
+// is put back on success.
+// Returns false on an unpack error, on a word count outside
+// [0, SPHINXSE_MAX_KEYWORDSTATS), or on allocation failure; the cursor is not
+// restored on those paths.
+bool ha_sphinx::UnpackStats ( CSphSEStats * pStats )
+{
+	assert ( pStats );
+
+	char * pCurSave = m_pCur;
+	for ( uint i=0; i<m_iMatchesTotal && m_pCur<m_pResponseEnd-sizeof(uint32); i++ )
+	{
+		m_pCur += m_bId64 ? 12 : 8; // skip id+weight
+		// NOTE: this inner "i" shadows the match counter of the outer loop
+		for ( uint32 i=0; i<m_iAttrs && m_pCur<m_pResponseEnd-sizeof(uint32); i++ )
+		{
+			if ( m_dAttrs[i].m_uType & SPH_ATTR_MULTI )
+			{
+				// skip MVA list
+				uint32 uCount = UnpackDword ();
+				m_pCur += uCount*4;
+			}
+			else // skip normal value 
+				m_pCur += m_dAttrs[i].m_uType == SPH_ATTR_BIGINT ? 8 : 4;
+		}
+	}
+	
+	pStats->m_iMatchesTotal = UnpackDword ();
+	pStats->m_iMatchesFound = UnpackDword ();
+	pStats->m_iQueryMsec = UnpackDword ();
+	pStats->m_iWords = UnpackDword ();
+
+	if ( m_bUnpackError )
+		return false;
+
+	// the word count is validated before it is used as an allocation size
+	SafeDeleteArray ( pStats->m_dWords );
+	if ( pStats->m_iWords<0 || pStats->m_iWords>=SPHINXSE_MAX_KEYWORDSTATS )
+		return false;
+	pStats->m_dWords = new CSphSEWordStats [ pStats->m_iWords ];
+	if ( !pStats->m_dWords )
+		return false;
+
+	for ( int i=0; i<pStats->m_iWords; i++ )
+	{
+		CSphSEWordStats & tWord = pStats->m_dWords[i];
+		tWord.m_sWord = UnpackString ();
+		tWord.m_iDocs = UnpackDword ();
+		tWord.m_iHits = UnpackDword ();
+	}
+
+	if ( m_bUnpackError )
+		return false;
+
+	// rewind, so the matches can still be read from where they start
+	m_pCur = pCurSave;
+	return true;
+}
+
+
+/// condition pushdown: intercept the simplest WHERE form, query_column="some text", and keep the text in TLS
+const COND * ha_sphinx::cond_push ( const COND * cond )
+{
+	// anything but a two-argument equality is handed back untouched
+	if ( cond->type()!=COND::FUNC_ITEM )
+		return cond;
+
+	Item_func * pFunc = (Item_func *)cond;
+	if ( pFunc->functype()!=Item_func::EQ_FUNC || pFunc->argument_count()!=2 )
+		return cond;
+
+	// left side must be a column, right side a string constant
+	Item ** ppArgs = pFunc->arguments();
+	if ( ppArgs[0]->type()!=COND::FIELD_ITEM || ppArgs[1]->type()!=COND::STRING_ITEM )
+		return cond;
+
+	// ...and that column must be the query column
+	Item_field * pColumn = (Item_field *) ppArgs[0];
+	if ( pColumn->field->field_index!=2 ) // FIXME! magic key index
+		return cond;
+
+	// get my tls
+	CSphSEThreadData * pTls = GetTls ();
+	if ( !pTls )
+		return cond;
+
+	// copy the query (truncated to the buffer, always terminated) along with its charset
+	Item_string * pText = (Item_string *) ppArgs[1];
+	const size_t iLast = sizeof(pTls->m_sQuery)-1;
+
+	pTls->m_bQuery = true;
+	strncpy ( pTls->m_sQuery, pText->str_value.c_ptr(), iLast+1 );
+	pTls->m_sQuery[iLast] = '\0';
+	pTls->m_pQueryCharset = pText->str_value.charset();
+
+	// returning NULL lets the caller know that we intercepted this condition
+	return NULL;
+}
+
+
+/// condition popup: forget any query previously captured by cond_push()
+void ha_sphinx::cond_pop ()
+{
+	CSphSEThreadData * pData = GetTls ();
+	if ( !pData )
+		return;
+	pData->m_bQuery = false;
+}
+
+
+/// get TLS (maybe allocate it, too)
+CSphSEThreadData * ha_sphinx::GetTls()
+{
+	// thd_ha_data() yields the address of the slot that holds our pointer for this thread
+	CSphSEThreadData *& pData = *(CSphSEThreadData**) thd_ha_data ( ha_thd(), ht );
+
+	// nothing stored yet: allocate
+	if ( pData==NULL )
+		pData = new CSphSEThreadData ();
+
+	// whatever the slot holds now is passed on as is;
+	// errors will be handled by caller
+	return pData;
+}
+
+
+// Runs the search: takes the query text from the pushed-down condition (or, failing that,
+// from the index key), sends it to searchd, stores the whole reply in m_pResponse and
+// returns the first match via get_rec(). Every failure is reported as HA_ERR_END_OF_FILE.
+int ha_sphinx::index_read ( byte * buf, const byte * key, uint key_len, enum ha_rkey_function )
+{
+	SPH_ENTER_METHOD();
+	char sError[256];
+
+	// set new data for thd->ha_data, it is used in show_status
+	CSphSEThreadData * pTls = GetTls();
+	if ( !pTls )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: TLS malloc() failed" );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+	pTls->m_tStats.Reset ();
+
+	// parse query
+	if ( pTls->m_bQuery )
+	{
+		// we have a query from condition pushdown
+		m_pCurrentKey = (const byte *) pTls->m_sQuery;
+		m_iCurrentKeyLen = strlen(pTls->m_sQuery);
+	} else
+	{
+		// just use the key (might be truncated)
+		m_pCurrentKey = key+HA_KEY_BLOB_LENGTH;
+		m_iCurrentKeyLen = uint2korr(key); // or maybe key_len?
+		pTls->m_pQueryCharset = m_pShare ? m_pShare->m_pTableQueryCharset : NULL;
+	}
+
+	CSphSEQuery q ( (const char*)m_pCurrentKey, m_iCurrentKeyLen, m_pShare->m_sIndex );
+	if ( !q.Parse () )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), q.m_sParseError );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	// build the request before connecting, so that a failure here cannot leak a socket
+	char * pBuffer; // will be free by CSphSEQuery dtor; do NOT free manually
+	int iReqLen = q.BuildRequest ( &pBuffer );
+	if ( iReqLen<=0 )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: q.BuildRequest() failed" );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	// do connect
+	int iSocket = ConnectToSearchd ( q.m_sHost, q.m_iPort );
+	if ( iSocket<0 )
+		SPH_RET ( HA_ERR_END_OF_FILE );
+
+	// send request
+	::send ( iSocket, pBuffer, iReqLen, 0 );
+
+	// receive reply
+	char sHeader[8];
+	int iGot = ::recv ( iSocket, sHeader, sizeof(sHeader), RECV_FLAGS );
+	if ( iGot!=sizeof(sHeader) )
+	{
+		::closesocket ( iSocket );
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "failed to receive response header (searchd went away?)" );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	short int uRespStatus = ntohs ( sphUnalignedRead ( *(short int*)( &sHeader[0] ) ) );
+	short int uRespVersion = ntohs ( sphUnalignedRead ( *(short int*)( &sHeader[2] ) ) );
+	uint uRespLength = ntohl ( sphUnalignedRead ( *(uint *)( &sHeader[4] ) ) );
+	SPH_DEBUG ( "got response header (status=%d version=%d length=%d)", uRespStatus, uRespVersion, uRespLength );
+
+	SafeDeleteArray ( m_pResponse );
+	if ( uRespLength<=SPHINXSE_MAX_ALLOC )
+		m_pResponse = new char [ uRespLength+1 ];
+
+	if ( !m_pResponse )
+	{
+		::closesocket ( iSocket );
+		my_snprintf ( sError, sizeof(sError), "bad searchd response length (length=%u)", uRespLength );
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	int iRecvLength = 0;
+	while ( iRecvLength<(int)uRespLength )
+	{
+		int iRecv = ::recv ( iSocket, m_pResponse+iRecvLength, uRespLength-iRecvLength, RECV_FLAGS );
+		if ( iRecv<=0 ) // error, or 0 = peer closed the connection; either way no more data will come
+			break;
+		iRecvLength += iRecv;
+	}
+
+	::closesocket ( iSocket );
+
+	if ( iRecvLength!=(int)uRespLength )
+	{
+		my_snprintf ( sError, sizeof(sError), "net read error (expected=%d, got=%d)", uRespLength, iRecvLength );
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	// we'll have a message, at least
+	pTls->m_bStats = true;
+
+	// parse reply
+	m_iCurrentPos = 0;
+	m_pCur = m_pResponse;
+	m_pResponseEnd = m_pResponse + uRespLength;
+	m_bUnpackError = false;
+
+	if ( uRespStatus!=SEARCHD_OK )
+	{
+		char * sMessage = UnpackString ();
+		if ( !sMessage )
+		{
+			my_snprintf ( sError, sizeof(sError), "no valid response from searchd (status=%d, resplen=%d)", uRespStatus, uRespLength );
+			my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+			SPH_RET ( HA_ERR_END_OF_FILE );
+		}
+
+		// strncpy() does not terminate the destination on truncation, so do it by hand
+		strncpy ( pTls->m_tStats.m_sLastMessage, sMessage, sizeof(pTls->m_tStats.m_sLastMessage) );
+		pTls->m_tStats.m_sLastMessage [ sizeof(pTls->m_tStats.m_sLastMessage)-1 ] = '\0';
+		SafeDeleteArray ( sMessage );
+
+		if ( uRespStatus!=SEARCHD_WARNING )
+		{
+			my_snprintf ( sError, sizeof(sError), "searchd error: %s", pTls->m_tStats.m_sLastMessage );
+			my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+			pTls->m_tStats.m_bLastError = true;
+			SPH_RET ( HA_ERR_END_OF_FILE );
+		}
+	}
+
+	if ( !UnpackSchema () )
+		SPH_RET ( HA_ERR_END_OF_FILE );
+
+	if ( !UnpackStats ( &pTls->m_tStats ) )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: UnpackStats() failed" );
+		SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	SPH_RET ( get_rec ( buf, key, key_len ) );
+}
+
+
+// Variant of index_read() that also takes an index number. Not supported by this engine:
+// all arguments are ignored and the result is always HA_ERR_WRONG_COMMAND.
+int ha_sphinx::index_read_idx ( byte *, uint, const byte *, uint, enum ha_rkey_function )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// Used to read forward through the index: fetches the next row via get_rec(), passing the key saved by index_read().
+int ha_sphinx::index_next ( byte * buf )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( get_rec ( buf, m_pCurrentKey, m_iCurrentKeyLen ) );
+}
+
+// Same as index_next(): get_rec() leaves its key arguments unnamed, so key and keylen do not affect the result.
+int ha_sphinx::index_next_same ( byte * buf, const byte * key, uint keylen )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( get_rec ( buf, key, keylen ) );
+}
+
+
+int ha_sphinx::get_rec ( byte * buf, const byte *, uint )
+{
+	SPH_ENTER_METHOD();
+
+	if ( m_iCurrentPos>=m_iMatchesTotal )
+	{
+		SafeDeleteArray ( m_pResponse );
+		SPH_RET ( HA_ERR_END_OF_FILE ); 
+	}
+
+	#if MYSQL_VERSION_ID>50100
+	my_bitmap_map * org_bitmap = dbug_tmp_use_all_columns ( table, table->write_set );
+	#endif
+	Field ** field = table->field;
+
+	// unpack and return the match
+	longlong uMatchID = UnpackDword ();
+	if ( m_bId64 )
+		uMatchID = ( uMatchID<<32 ) + UnpackDword();
+	uint32 uMatchWeight = UnpackDword ();
+
+	field[0]->store ( uMatchID, 1 );
+	field[1]->store ( uMatchWeight, 1 );
+	field[2]->store ( (const char*)m_pCurrentKey, m_iCurrentKeyLen, &my_charset_bin );
+
+	for ( uint32 i=0; i<m_iAttrs; i++ )
+	{
+		longlong iValue64= 0;
+		uint32 uValue = UnpackDword ();
+		if ( m_dAttrs[i].m_uType == SPH_ATTR_BIGINT )
+			iValue64 = ( (longlong)uValue<<32 ) | UnpackDword();
+		if ( m_dAttrs[i].m_iField<0 )
+		{
+			// skip MVA
+			if ( m_dAttrs[i].m_uType & SPH_ATTR_MULTI )
+				for ( ; uValue>0 && !m_bUnpackError; uValue-- )
+					UnpackDword();
+			continue;
+		}
+
+		Field * af = field [ m_dAttrs[i].m_iField ];
+		switch ( m_dAttrs[i].m_uType )
+		{
+			case SPH_ATTR_INTEGER:
+			case SPH_ATTR_ORDINAL:
+			case SPH_ATTR_BOOL:
+				af->store ( uValue, 1 );
+				break;
+
+			case SPH_ATTR_FLOAT:
+				af->store ( sphDW2F(uValue) );
+				break;
+
+			case SPH_ATTR_TIMESTAMP:
+				if ( af->type()==MYSQL_TYPE_TIMESTAMP )
+					longstore ( af->ptr, uValue ); // because store() does not accept timestamps
+				else
+					af->store ( uValue, 1 );
+				break;
+
+			case SPH_ATTR_BIGINT:
+				af->store ( iValue64, 0 );
+				break;
+
+			case ( SPH_ATTR_MULTI | SPH_ATTR_INTEGER ):
+				if ( uValue<=0 )
+				{
+					// shortcut, empty MVA set
+					af->store ( "", 0, &my_charset_bin );
+
+				} else
+				{
+					// convert MVA set to comma-separated string
+					char sBuf[1024]; // FIXME! magic size
+					char * pCur = sBuf;
+
+					for ( ; uValue>0 && !m_bUnpackError; uValue-- )
+					{
+						uint32 uEntry = UnpackDword ();
+						if ( pCur < sBuf+sizeof(sBuf)-16 ) // 10 chars per 32bit value plus some safety bytes
+						{
+							sprintf ( pCur, "%u", uEntry );
+							while ( *pCur ) *pCur++;
+							if ( uValue>1 )
+								*pCur++ = ','; // non-trailing commas
+						}
+					}
+
+					af->store ( sBuf, pCur-sBuf, &my_charset_bin );
+				}
+				break;
+
+			default:
+				my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: unhandled attr type" );
+				SafeDeleteArray ( m_pResponse );
+				SPH_RET ( HA_ERR_END_OF_FILE ); 
+		}
+	}
+
+	if ( m_bUnpackError )
+	{
+		my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: response unpacker failed" );
+		SafeDeleteArray ( m_pResponse );
+		SPH_RET ( HA_ERR_END_OF_FILE ); 
+	}
+
+	// zero out unmapped fields
+	for ( int i=SPHINXSE_SYSTEM_COLUMNS; i<(int)table->s->fields; i++ )
+		if ( m_dUnboundFields[i]!=SPH_ATTR_NONE )
+			switch ( m_dUnboundFields[i] )
+	{
+		case SPH_ATTR_INTEGER:		table->field[i]->store ( 0, 1 ); break;
+		case SPH_ATTR_TIMESTAMP:	longstore ( table->field[i]->ptr, 0 ); break;
+		default:
+			my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0),
+				"INTERNAL ERROR: unhandled unbound field type %d", m_dUnboundFields[i] );
+			SafeDeleteArray ( m_pResponse );
+			SPH_RET ( HA_ERR_END_OF_FILE );
+	}
+
+	memset ( buf, 0, table->s->null_bytes );
+	m_iCurrentPos++;
+
+	#if MYSQL_VERSION_ID > 50100
+	dbug_tmp_restore_column_map(table->write_set, org_bitmap);
+	#endif
+
+	SPH_RET(0);
+}
+
+
+// Used to read backwards through the index. Not supported: always returns HA_ERR_WRONG_COMMAND.
+int ha_sphinx::index_prev ( byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// index_first() asks for the first key in the index.
+// This implementation ignores the buffer and always returns HA_ERR_END_OF_FILE.
+// Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+// and sql_select.cc.
+int ha_sphinx::index_first ( byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_END_OF_FILE );
+}
+
+// index_last() asks for the last key in the index.
+// Not supported: this implementation always returns HA_ERR_WRONG_COMMAND.
+// Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+// and sql_select.cc.
+int ha_sphinx::index_last ( byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+// Start of a table scan: there is nothing to prepare, so this always returns 0.
+int ha_sphinx::rnd_init ( bool )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+// End of a table scan: there is nothing to release, so this always returns 0.
+int ha_sphinx::rnd_end()
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+// Next row of a table scan: always returns HA_ERR_END_OF_FILE, i.e. a scan yields no rows.
+int ha_sphinx::rnd_next ( byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_END_OF_FILE );
+}
+
+// Row position bookkeeping: deliberately a no-op, the record argument is ignored.
+void ha_sphinx::position ( const byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_VOID_RET();
+}
+
+
+// This is like rnd_next, but you are given a position to use to determine
+// the row. The position would be whatever was saved when position() was
+// called; since position() saves nothing in this engine, fetching by
+// position is not supported and this always returns HA_ERR_WRONG_COMMAND.
+// Called from filesort.cc records.cc sql_insert.cc sql_select.cc sql_update.cc.
+int ha_sphinx::rnd_pos ( byte *, byte * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+// Reports fixed statistics: one row per key value (when the table has a key) and 20 rows in the table.
+#if MYSQL_VERSION_ID>=50030
+int ha_sphinx::info ( uint )
+#else
+void ha_sphinx::info ( uint )
+#endif
+{
+	SPH_ENTER_METHOD();
+
+	if ( table->s->keys>0 )
+		table->key_info[0].rec_per_key[0] = 1;
+
+	#if MYSQL_VERSION_ID>50100
+	stats.records = 20;
+	#else
+	records = 20;
+	#endif
+
+#if MYSQL_VERSION_ID>=50030
+	SPH_RET(0);
+#else
+	SPH_VOID_RET();
+#endif
+}
+
+// Drops the query captured by cond_push() for this thread, if there is thread data; always returns 0.
+int ha_sphinx::reset ()
+{
+	SPH_ENTER_METHOD();
+	CSphSEThreadData * pTls = GetTls ();
+	if ( pTls )
+		pTls->m_bQuery = false;
+	SPH_RET(0);
+}
+
+// Deleting all rows is not supported: always returns HA_ERR_WRONG_COMMAND.
+int ha_sphinx::delete_all_rows()
+{
+	SPH_ENTER_METHOD();
+	SPH_RET ( HA_ERR_WRONG_COMMAND );
+}
+
+
+// First you should go read the section "locking functions for mysql" in
+// lock.cc to understand this.
+// This would create a lock on the table. A storage engine that handles
+// transactions does the real work here (see ha_berkeley.cc); this
+// implementation does nothing: both arguments are ignored and the result
+// is always 0.
+//
+// Called from lock.cc by lock_external() and unlock_external(). Also called
+// from sql_table.cc by copy_data_between_tables().
+int ha_sphinx::external_lock ( THD *, int )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+// Adopts the requested lock type if m_tLock is currently unlocked, appends m_tLock to the list and returns the new list end.
+THR_LOCK_DATA ** ha_sphinx::store_lock ( THD *, THR_LOCK_DATA ** to,
+	enum thr_lock_type lock_type )
+{
+	SPH_ENTER_METHOD();
+
+	if ( lock_type!=TL_IGNORE && m_tLock.type==TL_UNLOCK )
+		m_tLock.type=lock_type;
+
+	*to++ = &m_tLock;
+	SPH_RET(to);
+}
+
+// Dropping a table needs no work in this implementation: the name is ignored and the result is always 0.
+int ha_sphinx::delete_table ( const char * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+
+// Renames a table from one name to another from alter table call.
+//
+// If you do not implement this, the default rename_table() is called from
+// handler.cc and it will delete all files with the file extensions returned
+// by bas_ext(). This implementation ignores both names and always returns 0.
+//
+// Called from sql_table.cc by mysql_rename_table().
+int ha_sphinx::rename_table ( const char *, const char * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(0);
+}
+
+
+// Given a starting key and an ending key, this should estimate the number of
+// rows between the two. This implementation ignores both keys and always
+// answers 3.
+//
+// Called from opt_range.cc by check_quick_keys().
+ha_rows ha_sphinx::records_in_range ( uint, key_range *, key_range * )
+{
+	SPH_ENTER_METHOD();
+	SPH_RET(3); // low number to force index usage
+}
+
+/// true for the two column types accepted as "integer" by create(): MYSQL_TYPE_LONG and MYSQL_TYPE_LONGLONG
+static inline bool IsIntegerFieldType ( enum_field_types eType )
+{
+	return eType==MYSQL_TYPE_LONG || eType==MYSQL_TYPE_LONGLONG;
+}
+
+
+// create() is called when a table using this engine is created; here it only
+// validates the table definition. The table must start with the three system
+// columns (unsigned integer docid, integer weight, varchar/text query), may
+// add attribute columns (integer, bigint, timestamp, varchar or float), and
+// must have exactly one single-part index, on the query column. The first
+// violated rule is reported through ER_CANT_CREATE_TABLE.
+//
+// Called from handle.cc by ha_create_table().
+int ha_sphinx::create ( const char * name, TABLE * table, HA_CREATE_INFO * )
+{
+	SPH_ENTER_METHOD();
+	char sError[256];
+
+	if ( !ParseUrl ( NULL, table, true ) )
+		SPH_RET(-1);
+
+	// every check below runs only while no earlier check has left a message
+	sError[0] = '\0';
+
+	// check system fields (count and types)
+	if ( table->s->fields<SPHINXSE_SYSTEM_COLUMNS )
+		my_snprintf ( sError, sizeof(sError), "%s: there MUST be at least %d columns",
+			name, SPHINXSE_SYSTEM_COLUMNS );
+
+	if ( !sError[0] )
+	{
+		Field * pDocid = table->field[0];
+		if ( !IsIntegerFieldType ( pDocid->type() ) || !((Field_num *)pDocid)->unsigned_flag )
+			my_snprintf ( sError, sizeof(sError), "%s: 1st column (docid) MUST be unsigned integer or bigint", name );
+	}
+
+	if ( !sError[0] && !IsIntegerFieldType ( table->field[1]->type() ) )
+		my_snprintf ( sError, sizeof(sError), "%s: 2nd column (weight) MUST be integer or bigint", name );
+
+	if ( !sError[0] )
+	{
+		switch ( table->field[2]->type() )
+		{
+			case MYSQL_TYPE_VARCHAR:
+			case MYSQL_TYPE_BLOB:
+			case MYSQL_TYPE_MEDIUM_BLOB:
+			case MYSQL_TYPE_LONG_BLOB:
+			case MYSQL_TYPE_TINY_BLOB:
+				break;
+
+			default:
+				my_snprintf ( sError, sizeof(sError), "%s: 3rd column (search query) MUST be varchar or text", name );
+		}
+	}
+
+	// check attributes; the loop stops at the first unsupported column
+	for ( int iCol=3; !sError[0] && iCol<(int)table->s->fields; iCol++ )
+	{
+		const enum_field_types eType = table->field[iCol]->type();
+		const bool bSupported = ( eType==MYSQL_TYPE_TIMESTAMP || IsIntegerFieldType(eType)
+			|| eType==MYSQL_TYPE_VARCHAR || eType==MYSQL_TYPE_FLOAT );
+
+		if ( !bSupported )
+			my_snprintf ( sError, sizeof(sError), "%s: %dth column (attribute %s) MUST be integer, bigint, timestamp, varchar, or float",
+				name, iCol+1, table->field[iCol]->field_name );
+	}
+
+	// check index
+	if ( !sError[0] )
+	{
+		const bool bIndexed = ( table->s->keys==1
+			&& table->key_info[0].key_parts==1
+			&& !strcasecmp ( table->key_info[0].key_part[0].field->field_name, table->field[2]->field_name ) );
+
+		if ( !bIndexed )
+			my_snprintf ( sError, sizeof(sError), "%s: there must be an index on '%s' column",
+				name, table->field[2]->field_name );
+	}
+
+	// all good?
+	if ( !sError[0] )
+		SPH_RET(0);
+
+
+	// no: hand the first collected message over to the server
+
+	my_error ( ER_CANT_CREATE_TABLE, MYF(0), sError, -1 );
+
+	SPH_RET(-1);
+
+}
+
+//// show functions
+
+#if MYSQL_VERSION_ID<50100	
+#define SHOW_VAR_FUNC_BUFF_SIZE 1024
+#endif
+/// SHOW STATUS callback: publishes the calling thread's last-query stats as an array of values; sBuffer receives the keyword list
+static int sphinx_showfunc ( THD * thd, SHOW_VAR * out, char * sBuffer )
+{
+	CSphSEThreadData *pTls = (CSphSEThreadData *) *thd_ha_data ( thd, sphinx_hton_ptr );
+	CSphSEStats * pStats = ( pTls && pTls->m_bStats ) ? &pTls->m_tStats : 0;
+	SHOW_VAR *array = (SHOW_VAR*)thd_alloc(thd, sizeof(SHOW_VAR)*7);
+	out->type = SHOW_ARRAY;
+	out->value = (char*)array;
+	if ( !pStats )
+	{
+		array[0].name = 0; // no stats for this thread yet: publish an empty array
+		return 0;
+	}
+
+	array[0].name = "total";
+	array[0].type = SHOW_INT;
+	array[0].value = (char *) &pStats->m_iMatchesTotal;
+	array[1].name = "total_found";
+	array[1].type = SHOW_INT;
+	array[1].value = (char *) &pStats->m_iMatchesFound;
+	array[2].name = "time";
+	array[2].type = SHOW_INT;
+	array[2].value = (char *) &pStats->m_iQueryMsec;
+	array[3].name = "word_count";
+	array[3].type = SHOW_INT;
+	array[3].value = (char *) &pStats->m_iWords;
+	array[4].name = "error";
+	array[4].type = SHOW_CHAR;
+	array[4].value = (char *) &pStats->m_sLastMessage;
+	array[5].name = "words";
+	array[5].type = SHOW_CHAR;
+	array[5].value = sBuffer;
+	array[6].name = 0; // terminate the array
+	sBuffer[0] = 0;
+
+	// append "word:docs:hits " per keyword; writing at the running offset avoids passing
+	// sBuffer to my_snprintf() as both the destination and a source argument
+	uint uBuffLen = 0;
+	for ( int i=0; i<pStats->m_iWords; i++ )
+	{
+		CSphSEWordStats & tWord = pStats->m_dWords[i];
+		uBuffLen += my_snprintf ( sBuffer+uBuffLen, SHOW_VAR_FUNC_BUFF_SIZE-uBuffLen, "%s:%d:%d ",
+			tWord.m_sWord, tWord.m_iDocs, tWord.m_iHits );
+	}
+	if ( !uBuffLen )
+		return 0;
+
+	// trim last space
+	sBuffer [ --uBuffLen ] = 0;
+
+	if ( pTls->m_pQueryCharset )
+	{
+		// NOTE: It's not entirely clear whether this conversion is necessary at all.
+		String sConvert;
+		uint iErrors;
+		sConvert.copy ( sBuffer, uBuffLen, pTls->m_pQueryCharset, system_charset_info, &iErrors );
+
+		// the converted text may be longer than the source; clamp it to the buffer
+		uint uLen = sConvert.length();
+		if ( uLen>=SHOW_VAR_FUNC_BUFF_SIZE )
+			uLen = SHOW_VAR_FUNC_BUFF_SIZE-1;
+		memcpy ( sBuffer, sConvert.c_ptr(), uLen );
+		sBuffer[uLen] = 0;
+	}
+
+	return 0;
+}
+
+#if MYSQL_VERSION_ID>50100
+struct st_mysql_storage_engine sphinx_storage_engine =
+{
+	MYSQL_HANDLERTON_INTERFACE_VERSION
+};
+
+struct st_mysql_show_var sphinx_status_vars[] =
+{
+	{"sphinx",     (char *)sphinx_showfunc,      			SHOW_FUNC}, // values are produced on demand by sphinx_showfunc()
+	{0, 0, (enum_mysql_show_type)0} // terminator
+};
+
+
+mysql_declare_plugin(sphinx)
+{
+	MYSQL_STORAGE_ENGINE_PLUGIN,
+	&sphinx_storage_engine,
+	sphinx_hton_name,
+	"Sphinx developers",
+	sphinx_hton_comment,
+	PLUGIN_LICENSE_GPL,
+	sphinx_init_func, // Plugin Init
+	sphinx_done_func, // Plugin Deinit
+	0x0001, // 0.1
+	sphinx_status_vars, // status variables
+	NULL, // system variables: none (slot meaning per st_mysql_plugin, TODO confirm against plugin.h)
+	NULL // reserved slot (per st_mysql_plugin, TODO confirm against plugin.h)
+}
+mysql_declare_plugin_end;
+
+#ifdef maria_declare_plugin
+maria_declare_plugin(sphinx)
+{
+	MYSQL_STORAGE_ENGINE_PLUGIN,
+	&sphinx_storage_engine,
+	sphinx_hton_name,
+	"Sphinx developers",
+	sphinx_hton_comment,
+	PLUGIN_LICENSE_GPL,
+	sphinx_init_func, // Plugin Init
+	sphinx_done_func, // Plugin Deinit
+	0x0001, // 0.1
+	sphinx_status_vars, // status variables
+	NULL, // system variables: none (slot meaning per the MariaDB plugin descriptor, TODO confirm against plugin.h)
+	"0.1", // string version
+	MariaDB_PLUGIN_MATURITY_EXPERIMENTAL
+}
+maria_declare_plugin_end;
+#endif
+
+#endif // >50100
+
+//
+// $Id: ha_sphinx.cc 2058 2009-11-07 04:01:57Z shodan $
+//
diff --git a/storage/sphinx/ha_sphinx.h b/storage/sphinx/ha_sphinx.h
new file mode 100644
index 00000000000..3f517062cff
--- /dev/null
+++ b/storage/sphinx/ha_sphinx.h
@@ -0,0 +1,159 @@
+//
+// $Id: ha_sphinx.h 1428 2008-09-05 18:06:30Z xale $
+//
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface // gcc class implementation
+#endif
+
+
+#if MYSQL_VERSION_ID>50100
+#define TABLE_ARG	st_table_share
+#else
+#define TABLE_ARG	st_table
+#endif
+
+
+#if MYSQL_VERSION_ID>=50120
+typedef uchar byte;
+#endif
+
+
+/// forward decls
+class THD;
+struct CSphReqQuery;
+struct CSphSEShare;
+struct CSphSEAttr;
+struct CSphSEStats;
+struct CSphSEThreadData;
+
+/// Sphinx SE handler class
+class ha_sphinx : public handler
+{
+protected:
+	THR_LOCK_DATA	m_tLock;				///< MySQL lock
+
+	CSphSEShare *	m_pShare;				///< shared lock info
+
+	uint			m_iMatchesTotal;		///< number of matches in the current response
+	uint			m_iCurrentPos;			///< index of the next match get_rec() will return
+	const byte *	m_pCurrentKey;			///< query text of the current search
+	uint			m_iCurrentKeyLen;		///< length of m_pCurrentKey, in bytes
+
+	char *			m_pResponse;			///< searchd response storage
+	char *			m_pResponseEnd;			///< searchd response storage end (points to wilderness!)
+	char *			m_pCur;					///< current position into response
+	bool			m_bUnpackError;			///< any errors while unpacking response
+
+public:
+#if MYSQL_VERSION_ID<50100
+					ha_sphinx ( TABLE_ARG * table_arg );
+#else
+					ha_sphinx ( handlerton * hton, TABLE_ARG * table_arg );
+#endif
+					~ha_sphinx () {}
+
+	const char *	table_type () const		{ return "SPHINX"; }	///< SE name for display purposes
+	const char *	index_type ( uint )		{ return "HASH"; }		///< index type name for display purposes
+	const char **	bas_ext () const;								///< my file extensions
+
+	#if MYSQL_VERSION_ID>50100
+	ulonglong		table_flags () const	{ return HA_CAN_INDEX_BLOBS; }			///< bitmap of implemented flags (see handler.h for more info)
+	#else
+	ulong			table_flags () const	{ return HA_CAN_INDEX_BLOBS; }			///< bitmap of implemented flags (see handler.h for more info)
+	#endif
+
+	ulong			index_flags ( uint, uint, bool ) const	{ return 0; }	///< bitmap of flags that says how SE implements indexes
+	uint			max_supported_record_length () const	{ return HA_MAX_REC_LENGTH; }
+	uint			max_supported_keys () const				{ return 1; }
+	uint			max_supported_key_parts () const		{ return 1; }
+	uint			max_supported_key_length () const		{ return MAX_KEY_LENGTH; }
+	uint			max_supported_key_part_length () const	{ return MAX_KEY_LENGTH; }
+
+	#if MYSQL_VERSION_ID>50100
+	virtual double	scan_time ()	{ return (double)( stats.records+stats.deleted )/20.0 + 10; }	///< called in test_quick_select to determine if indexes should be used
+	#else
+	virtual double	scan_time ()	{ return (double)( records+deleted )/20.0 + 10; }				///< called in test_quick_select to determine if indexes should be used
+	#endif
+
+	virtual double	read_time(uint index, uint ranges, ha_rows rows)
+        { return (double)rows/20.0 + 1; }					///< index read time estimate
+
+public:
+	int				open ( const char * name, int mode, uint test_if_locked );
+	int				close ();
+
+	int				write_row ( uchar * buf );
+	int				update_row ( const uchar * old_data, uchar * new_data );
+	int				delete_row ( const uchar * buf );
+
+	int				index_init ( uint keynr, bool sorted ); // 5.1.x
+	int				index_init ( uint keynr ) { return index_init ( keynr, false ); } // 5.0.x
+
+	int				index_end (); 
+	int				index_read ( byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag );	///< runs the search and returns the first match
+	int				index_read_idx ( byte * buf, uint idx, const byte * key, uint key_len, enum ha_rkey_function find_flag );	///< not supported
+	int				index_next ( byte * buf );	///< returns the next match of the current search
+	int				index_next_same ( byte * buf, const byte * key, uint keylen );	///< same as index_next()
+	int				index_prev ( byte * buf );	///< not supported
+	int				index_first ( byte * buf );	///< always reports end of file
+	int				index_last ( byte * buf );	///< not supported
+
+	int				get_rec ( byte * buf, const byte * key, uint keylen );	///< decodes one match from the response into the table fields
+
+	int				rnd_init ( bool scan );
+	int				rnd_end ();
+	int				rnd_next ( byte * buf );	///< always reports end of file
+	int				rnd_pos ( byte * buf, byte * pos );	///< not supported
+	void			position ( const byte * record );	///< no-op
+
+#if MYSQL_VERSION_ID>=50030
+	int				info ( uint );
+#else
+	void			info ( uint );
+#endif
+
+	int				reset();
+	int				external_lock ( THD * thd, int lock_type );
+	int				delete_all_rows ();
+	ha_rows			records_in_range ( uint inx, key_range * min_key, key_range * max_key );
+
+	int				delete_table ( const char * from );
+	int				rename_table ( const char * from, const char * to );
+	int				create ( const char * name, TABLE * form, HA_CREATE_INFO * create_info );
+
+	THR_LOCK_DATA **store_lock ( THD * thd, THR_LOCK_DATA ** to, enum thr_lock_type lock_type );
+
+public:
+	virtual const COND *	cond_push ( const COND *cond );	///< captures query_column="text" conditions
+	virtual void			cond_pop ();					///< drops the captured query
+
+private:
+	uint32			m_iFields;			///< NOTE(review): presumably the number of entries in m_dFields; set outside the code reviewed here
+	char **			m_dFields;			///< NOTE(review): presumably field names from the response schema; confirm in UnpackSchema()
+
+	uint32			m_iAttrs;			///< number of entries in m_dAttrs
+	CSphSEAttr *	m_dAttrs;			///< attributes of the current response (name, type, mapped table column or -1)
+	int				m_bId64;			///< non-zero when the response carries 64-bit document ids
+
+	int *			m_dUnboundFields;	///< per table column: SPH_ATTR_* type to zero-fill it with, or SPH_ATTR_NONE
+
+private:
+	int				ConnectToSearchd ( const char * sQueryHost, int iQueryPort );	///< returns a socket, negative on failure
+
+	uint32			UnpackDword ();
+	char *			UnpackString ();
+	bool			UnpackSchema ();							///< reads the response schema and maps attributes to table columns
+	bool			UnpackStats ( CSphSEStats * pStats );	///< reads the stats block that follows the matches
+
+	CSphSEThreadData *	GetTls ();								///< per-thread data, allocated on first use
+};
+
+
+#if MYSQL_VERSION_ID < 50100
+bool sphinx_show_status ( THD * thd );
+#endif
+
+//
+// $Id: ha_sphinx.h 1428 2008-09-05 18:06:30Z xale $
+//
diff --git a/storage/sphinx/make-patch.sh b/storage/sphinx/make-patch.sh
new file mode 100644
index 00000000000..6fca5838ded
--- /dev/null
+++ b/storage/sphinx/make-patch.sh
@@ -0,0 +1,36 @@
+#!/bin/sh
+# Collects the differences between two source trees, for the files listed below, into one patch file.
+OUT=$1
+ORIG=$2
+NEW=$3
+
+if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
+	echo "$0 <patch> <original> <new>"
+	exit 1
+fi
+
+FILES='
+/config/ac-macros/ha_sphinx.m4
+/configure.in
+/libmysqld/Makefile.am
+/sql/handler.cc
+/sql/handler.h
+/sql/Makefile.am
+/sql/mysqld.cc
+/sql/mysql_priv.h
+/sql/set_var.cc
+/sql/sql_lex.h
+/sql/sql_parse.cc
+/sql/sql_yacc.yy
+/sql/structs.h
+/sql/sql_show.cc
+'
+# start from scratch; give up if an old patch file cannot be removed
+rm -f "$OUT"
+if [ -e "$OUT" ]; then
+	exit 1
+fi
+
+for name in $FILES; do
+	diff -BNru "$ORIG$name" "$NEW$name" >> "$OUT"
+done
diff --git a/storage/sphinx/plug.in b/storage/sphinx/plug.in
new file mode 100644
index 00000000000..6c96e41ae52
--- /dev/null
+++ b/storage/sphinx/plug.in
@@ -0,0 +1,6 @@
+MYSQL_STORAGE_ENGINE(sphinx,,[Sphinx Storage Engine],
+        [SE client for Sphinx search daemon], [])
+MYSQL_PLUGIN_DIRECTORY(sphinx, [storage/sphinx])
+MYSQL_PLUGIN_STATIC(sphinx,  [libsphinx.a])
+MYSQL_PLUGIN_DYNAMIC(sphinx, [ha_sphinx.la])
+
diff --git a/storage/sphinx/snippets_udf.cc b/storage/sphinx/snippets_udf.cc
new file mode 100644
index 00000000000..961d1a92ed1
--- /dev/null
+++ b/storage/sphinx/snippets_udf.cc
@@ -0,0 +1,766 @@
+//
+// $Id: snippets_udf.cc 2058 2009-11-07 04:01:57Z shodan $
+//
+
+//
+// Copyright (c) 2001-2008, Andrew Aksyonoff. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License. You should have
+// received a copy of the GPL license along with this program; if you
+// did not, you can find it at http://www.gnu.org/
+//
+
+#include <mysql_version.h>
+
+#if MYSQL_VERSION_ID>50100
+#include "mysql_priv.h"
+#include <mysql/plugin.h>
+#else
+#include "../mysql_priv.h"
+#endif
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include <sys/un.h>
+#include <netdb.h>
+
+#include <mysys_err.h>
+#include <my_sys.h>
+
+#if MYSQL_VERSION_ID>=50120
+typedef uchar byte;
+#endif
+
+/// partially copy-pasted stuff that should be moved elsewhere
+
+#if UNALIGNED_RAM_ACCESS
+
+/// pass-through variant, used when UNALIGNED_RAM_ACCESS is set: hands the value back as is
+template < typename T > inline T sphUnalignedRead ( const T & tRef )
+{
+	return tRef;
+}
+
+/// pass-through variant, used when UNALIGNED_RAM_ACCESS is set: stores through the pointer directly
+template < typename T > void sphUnalignedWrite ( void * pPtr, const T & tVal )
+{
+	*(T*)pPtr = tVal;
+}
+
+#else
+
+/// byte-by-byte read of a T, for architectures that cannot load unaligned values (eg. SPARC)
+template < typename T >
+inline T sphUnalignedRead ( const T & tRef )
+{
+	T tResult;
+	const byte * pFrom = (const byte *) &tRef;
+	byte * pTo = (byte *) &tResult;
+	for ( size_t iByte=0; iByte<sizeof(T); iByte++ )
+		pTo[iByte] = pFrom[iByte];
+	return tResult;
+}
+
+/// byte-by-byte store of a T, for architectures that cannot write unaligned values (eg. SPARC)
+template < typename T >
+void sphUnalignedWrite ( void * pPtr, const T & tVal )
+{
+	const byte * pFrom = (const byte *) &tVal;
+	byte * pTo = (byte *) pPtr;
+	for ( size_t iByte=0; iByte<sizeof(T); iByte++ )
+		pTo[iByte] = pFrom[iByte];
+}
+
+#endif
+
+#define SPHINXSE_MAX_ALLOC			(16*1024*1024)
+
+#define SafeDelete(_arg)		{ if ( _arg ) delete ( _arg );		(_arg) = NULL; }
+#define SafeDeleteArray(_arg)	{ if ( _arg ) delete [] ( _arg );	(_arg) = NULL; }
+
+#define Min(a,b) ((a)<(b)?(a):(b))
+
+typedef unsigned int DWORD;
+
+inline DWORD sphF2DW ( float f ) { union { float f; uint32 d; } u; u.f = f; return u.d; }
+
+static char * sphDup ( const char * sSrc, int iLen=-1 )
+{
+	// returns a new[]-allocated, NUL-terminated copy of the first iLen bytes of sSrc
+	// (the whole string when iLen is negative); a NULL source yields NULL
+	if ( !sSrc )
+		return NULL;
+
+	const int iCopy = ( iLen<0 ) ? (int)strlen(sSrc) : iLen;
+	char * sCopy = new char [ iCopy+1 ];
+	memcpy ( sCopy, sSrc, iCopy );
+	sCopy[iCopy] = '\0';
+	return sCopy;
+}
+
+static inline void sphShowErrno ( const char * sCall ) // reports "<sCall>() failed" with the current errno via my_error()
+{
+	char sError[256];
+	snprintf ( sError, sizeof(sError), "%s() failed: [%d] %s", sCall, errno, strerror(errno) ); // truncated to the buffer if too long
+	my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sError );
+}
+
+static const bool sphReportErrors = true;
+
+static bool sphSend ( int iFd, const char * pBuffer, int iSize, bool bReportErrors = false )
+{
+	// single send() attempt: a short write counts as a failure, same as a hard error
+	assert ( pBuffer );
+	assert ( iSize > 0 );
+
+	if ( send ( iFd, pBuffer, iSize, 0 ) == iSize )
+		return true;
+
+	if ( bReportErrors )
+		sphShowErrno("send");
+	return false;
+}
+
+static bool sphRecv ( int iFd, char * pBuffer, int iSize, bool bReportErrors = false )
+{
+	assert ( pBuffer );
+	assert ( iSize > 0 );
+	// recv() may deliver the data in pieces, so loop until exactly iSize bytes have arrived
+	while ( iSize )
+	{
+		const int iResult = recv ( iFd, pBuffer, iSize, 0 );
+		if ( iResult > 0 )
+		{
+			iSize -= iResult;
+			pBuffer += iResult; // advance by what was just received, not by what is still missing
+		}
+		else if ( iResult == 0 )
+		{
+			if ( bReportErrors ) // the peer closed the connection before everything arrived
+				my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), "recv() failed: disconnected" );
+			return false;
+		}
+		else
+		{
+			if ( bReportErrors ) sphShowErrno("recv");
+			return false;
+		}
+	}
+	return true;
+}
+
+enum
+{
+	SPHINX_SEARCHD_PROTO		= 1,
+
+	SEARCHD_COMMAND_SEARCH		= 0,
+	SEARCHD_COMMAND_EXCERPT		= 1,
+
+	VER_COMMAND_SEARCH		= 0x116,
+	VER_COMMAND_EXCERPT		= 0x100,
+};
+
+/// known answers
+enum
+{
+	SEARCHD_OK		= 0,	///< general success, command-specific reply follows
+	SEARCHD_ERROR	= 1,	///< general failure, error message follows
+	SEARCHD_RETRY	= 2,	///< temporary failure, error message follows, client should retry later
+	SEARCHD_WARNING	= 3		///< general success, warning message and command-specific reply follow
+};
+
+#define SPHINXSE_DEFAULT_SCHEME		"sphinx"
+#define SPHINXSE_DEFAULT_HOST		"127.0.0.1"
+#define SPHINXSE_DEFAULT_PORT		9312
+#define SPHINXSE_DEFAULT_INDEX		"*"
+
+class CSphBuffer
+{
+private:
+	bool m_bOverrun;		///< set once a Send*() call did not fit
+	int m_iSize;			///< total capacity, bytes
+	int m_iLeft;			///< bytes still free
+	char * m_pBuffer;		///< start of the new[]-allocated storage
+	char * m_pCurrent;		///< next write position
+
+public:
+	CSphBuffer ( const int iSize )
+		: m_bOverrun ( false )
+		, m_iSize ( iSize )
+		, m_iLeft ( iSize )
+	{
+		assert ( iSize > 0 );
+		m_pBuffer = new char[iSize];
+		m_pCurrent = m_pBuffer;
+	}
+
+	~CSphBuffer ()
+	{
+		SafeDeleteArray ( m_pBuffer ); // allocated with new[], so it must be released with delete[]
+	}
+
+	const char * Ptr() const { return m_pBuffer; }
+	/// true only when the buffer was filled exactly: no overrun and no unused tail
+	bool Finalize()
+	{
+		return !( m_bOverrun || m_iLeft != 0 || m_pCurrent - m_pBuffer != m_iSize );
+	}
+	
+	void SendBytes ( const void * pBytes, int iBytes );
+	
+	void SendWord ( short int v )					{ v = htons(v); SendBytes ( &v, sizeof(v) ); } // host to network order
+	void SendInt ( int v )							{ v = htonl(v); SendBytes ( &v, sizeof(v) ); }
+	void SendDword ( DWORD v )						{ v = htonl(v); SendBytes ( &v, sizeof(v) ); }
+	void SendUint64 ( ulonglong v )					{ SendDword ( uint(v>>32) ); SendDword ( uint(v&0xFFFFFFFFUL) ); } // high dword first
+	void SendString ( const char * v )				{ SendString ( v, strlen(v) ); }
+	void SendString ( const char * v, int iLen )	{ SendDword(iLen); SendBytes ( v, iLen ); } // length dword, then the bytes
+	void SendFloat ( float v )						{ SendDword ( sphF2DW(v) ); }
+};
+
+void CSphBuffer::SendBytes ( const void * pBytes, int iBytes )
+{
+	// appends iBytes raw bytes; a negative count or one that does not fit is
+	// recorded as an overrun (later reported by Finalize()) and nothing is written
+	if ( iBytes < 0 || m_iLeft < iBytes )
+	{
+		m_bOverrun = true;
+		return;
+	}
+	memcpy ( m_pCurrent, pBytes, iBytes );
+	m_pCurrent += iBytes;
+	m_iLeft -= iBytes;
+}
+
+struct CSphUrl
+{
+	char * m_sBuffer;		// private copy of the URL made by Parse(); freed in the destructor
+	char * m_sFormatted;	// printable form, built and cached by Format(); freed in the destructor
+	
+	char * m_sScheme;		// these three point either into m_sBuffer or at the default literals,
+	char * m_sHost;			// so they are never freed on their own
+	char * m_sIndex;
+	
+	int m_iPort;			// 0 selects a unix-domain socket in Connect() and Format()
+	
+	CSphUrl()
+		: m_sBuffer ( NULL )
+		, m_sFormatted ( NULL )
+		, m_sScheme ( (char*) SPHINXSE_DEFAULT_SCHEME )
+		, m_sHost ( (char*) SPHINXSE_DEFAULT_HOST )
+		, m_sIndex ( (char*) SPHINXSE_DEFAULT_INDEX )
+		, m_iPort ( SPHINXSE_DEFAULT_PORT )
+	{}
+	
+	~CSphUrl()
+	{
+		SafeDeleteArray ( m_sFormatted );
+		SafeDeleteArray ( m_sBuffer );
+	}
+	
+	bool Parse ( const char * sUrl, int iLen );	// fills the fields from a connection string
+	int Connect();								// returns a connected socket, or -1
+	const char * Format();						// returns the cached printable form
+};
+
+const char * CSphUrl::Format()
+{
+	// the printable form is built on first use and cached in m_sFormatted
+	if ( m_sFormatted )
+		return m_sFormatted;
+	const int iRoom = 15 + strlen(m_sHost) + strlen(m_sIndex);
+	m_sFormatted = new char [ iRoom ];
+	if ( !m_iPort )
+		snprintf ( m_sFormatted, iRoom, "unix://%s/%s", m_sHost, m_sIndex );
+	else
+		snprintf ( m_sFormatted, iRoom, "inet://%s:%d/%s", m_sHost, m_iPort, m_sIndex );
+	return m_sFormatted;
+}
+
+// the following scheme variants are recognized
+//
+// inet://host/index
+// inet://host:port/index
+// unix://unix/domain/socket:index
+// unix://unix/domain/socket
+bool CSphUrl::Parse ( const char * sUrl, int iLen )
+{
+	bool bOk = true; // an empty URL (iLen==0) keeps the constructor defaults and succeeds
+	while ( iLen ) // not a real loop: every path through the body ends in break
+	{
+		bOk = false;
+		// work on a private NUL-terminated copy; the parsed parts point into it
+		m_sBuffer = sphDup ( sUrl, iLen );
+		m_sScheme = m_sBuffer;
+		
+		m_sHost = strstr ( m_sBuffer, "://" );
+		if ( !m_sHost )
+			break;
+		m_sHost[0] = '\0'; // terminates the scheme
+		m_sHost += 2; // now at the second '/', which the unix branch keeps as the leading path slash
+		
+		if ( !strcmp ( m_sScheme, "unix" ) )
+		{
+			// unix-domain socket
+			m_iPort = 0;
+			if (!( m_sIndex = strrchr ( m_sHost, ':' ) )) // the last ':' separates the path from the index
+				m_sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+			else
+			{
+				*m_sIndex++ = '\0';
+				if ( !*m_sIndex ) // "path:" with nothing after the colon
+					m_sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+			}
+			bOk = true;
+			break;
+		}
+		if( strcmp ( m_sScheme, "sphinx" ) != 0 && strcmp ( m_sScheme, "inet" ) != 0 )
+			break; // unknown scheme, report failure
+
+		// inet
+		m_sHost++; // step over the second '/'
+		char * sPort = strchr ( m_sHost, ':' );
+		if ( sPort )
+		{
+			*sPort++ = '\0'; // terminates the host
+			if ( *sPort ) // when nothing follows the ':', port and index are left as they were
+			{
+				m_sIndex = strchr ( sPort, '/' );
+				if ( m_sIndex )
+					*m_sIndex++ = '\0'; // ends the port digits; the index name follows
+				else
+					m_sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+				
+				m_iPort = atoi(sPort);
+				if ( !m_iPort ) // zero or non-numeric port falls back to the default
+					m_iPort = SPHINXSE_DEFAULT_PORT;
+			}
+		} else
+		{
+			m_sIndex = strchr ( m_sHost, '/' );
+			if ( m_sIndex )
+				*m_sIndex++ = '\0'; // terminates the host
+			else
+				m_sIndex = (char*) SPHINXSE_DEFAULT_INDEX;
+		}
+
+		bOk = true;
+		break;
+	}
+	
+	return bOk;
+}
+
+int CSphUrl::Connect()
+{
+	// returns a connected socket with the version handshake done, or -1 after calling my_error()
+	struct sockaddr_in sin;
+#ifndef __WIN__
+	struct sockaddr_un saun;
+#endif
+	int iDomain = 0;
+	int iSockaddrSize = 0;
+	struct sockaddr * pSockaddr = NULL;
+
+	in_addr_t ip_addr;
+
+	if ( m_iPort )
+	{
+		// TCP: a non-zero port selects AF_INET
+		iDomain = AF_INET;
+		iSockaddrSize = sizeof(sin);
+		pSockaddr = (struct sockaddr *) &sin;
+
+		memset ( &sin, 0, sizeof(sin) );
+		sin.sin_family = AF_INET;
+		sin.sin_port = htons(m_iPort);
+
+		// resolve address: dotted-quad first, host name lookup otherwise
+		if ( (int)( ip_addr=inet_addr(m_sHost) ) != (int)INADDR_NONE )
+			memcpy ( &sin.sin_addr, &ip_addr, sizeof(ip_addr) );
+		else
+		{
+			int tmp_errno;
+			struct hostent tmp_hostent, *hp;
+			char buff2 [ GETHOSTBYNAME_BUFF_SIZE ];
+
+			hp = my_gethostbyname_r ( m_sHost, &tmp_hostent,
+									  buff2, sizeof(buff2), &tmp_errno );
+			if ( !hp )
+			{
+				my_gethostbyname_r_free();
+
+				char sError[256];
+				snprintf ( sError, sizeof(sError), "failed to resolve searchd host (name=%s)", m_sHost );
+
+				my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+				return -1;
+			}
+
+			memcpy ( &sin.sin_addr, hp->h_addr, Min ( sizeof(sin.sin_addr), (size_t)hp->h_length ) );
+			my_gethostbyname_r_free();
+		}
+	}
+	else
+	{
+#ifndef __WIN__
+		// port 0 means m_sHost holds a unix-domain socket path
+		iDomain = AF_UNIX;
+		iSockaddrSize = sizeof(saun);
+		pSockaddr = (struct sockaddr *) &saun;
+		memset ( &saun, 0, sizeof(saun) );
+		saun.sun_family = AF_UNIX;
+		strncpy ( saun.sun_path, m_sHost, sizeof(saun.sun_path)-1 ); // stays terminated thanks to the memset above
+#else
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), "Unix-domain sockets are not supported on Windows" );
+		return -1;
+#endif
+	}
+
+	// connect to searchd and exchange versions
+	uint uServerVersion;
+	uint uClientVersion = htonl ( SPHINX_SEARCHD_PROTO );
+	int iSocket = -1;
+	const char * pError = NULL; // names the step that failed, NULL on success
+	do
+	{
+		iSocket = socket ( iDomain, SOCK_STREAM, 0 );
+		if ( iSocket == -1 )
+		{
+			pError = "Failed to create client socket";
+			break;
+		}
+	
+		if ( connect ( iSocket, pSockaddr, iSockaddrSize ) == -1)
+		{
+			pError = "Failed to connect to searchd";
+			break;
+		}
+
+		if ( !sphRecv ( iSocket, (char *)&uServerVersion, sizeof(uServerVersion) ) )
+		{
+			pError = "Failed to receive searchd version";
+			break;
+		}
+		
+		if ( !sphSend ( iSocket, (char *)&uClientVersion, sizeof(uClientVersion) ) )
+		{
+			pError = "Failed to send client version";
+			break;
+		}
+	}
+	while(0);
+
+	// fixme: compare versions?
+
+	if ( pError )
+	{
+		const int iErrno = errno; // grab it before Format() or close() can overwrite it
+		char sError[1024];
+		snprintf ( sError, sizeof(sError), "%s: %s [%d] %s", pError, Format(), iErrno, strerror(iErrno) );
+		my_error ( ER_CONNECT_TO_FOREIGN_DATA_SOURCE, MYF(0), sError );
+		if ( iSocket != -1 )
+			close ( iSocket );
+		
+		return -1;
+	}
+
+	return iSocket;
+}
+
+struct CSphResponse
+{
+	char * m_pBuffer;	// new[]-allocated reply body, released in the destructor
+	char * m_pBody;		// set by Read() to a position inside m_pBuffer; not owned separately
+
+	CSphResponse ()
+		: m_pBuffer ( NULL )
+		, m_pBody ( NULL )
+	{}
+
+	CSphResponse ( DWORD uSize ) // allocates uSize bytes; the contents are left uninitialized
+		: m_pBody ( NULL )
+	{
+		m_pBuffer = new char[uSize];
+	}
+
+	~CSphResponse ()
+	{
+		SafeDeleteArray ( m_pBuffer );
+	}
+	
+	static CSphResponse * Read ( int iSocket, int iClientVersion ); // returns a new object, or NULL on failure
+};
+
+CSphResponse *
+CSphResponse::Read ( int iSocket, int iClientVersion )
+{
+	char sHeader[8]; // status (2 bytes), version (2 bytes), body length (4 bytes), network byte order
+	if ( !sphRecv ( iSocket, sHeader, sizeof(sHeader) ) )
+		return NULL;
+
+	int iStatus   = ntohs ( sphUnalignedRead ( *(short int *) &sHeader[0] ) );
+	int iVersion  = ntohs ( sphUnalignedRead ( *(short int *) &sHeader[2] ) );
+	DWORD uLength = ntohl ( sphUnalignedRead ( *(DWORD *)     &sHeader[4] ) );
+	if ( iVersion < iClientVersion ) // fixme: warn
+		{}
+
+	// the body is always parsed starting with a dword (below for non-OK replies, by the caller otherwise)
+	if ( uLength < sizeof(DWORD) || uLength > SPHINXSE_MAX_ALLOC )
+		return NULL;
+
+	CSphResponse * pResponse = new CSphResponse ( uLength );
+	if ( !sphRecv ( iSocket, pResponse->m_pBuffer, uLength ) )
+	{
+		SafeDelete ( pResponse );
+		return NULL;
+	}
+
+	pResponse->m_pBody = pResponse->m_pBuffer;
+	if ( iStatus == SEARCHD_OK )
+		return pResponse;
+
+	// non-OK replies lead with a length-prefixed message; the length comes off the wire, so bound it
+	const DWORD uAvail = uLength - sizeof(DWORD);
+	const DWORD uSize = ntohl ( *(DWORD *)pResponse->m_pBuffer );
+	if ( iStatus == SEARCHD_WARNING && uSize <= uAvail )
+	{
+		pResponse->m_pBody += sizeof(DWORD) + uSize; // skip length dword and text; fixme: report the warning somehow
+		return pResponse;
+	}
+	char * sMessage = sphDup ( pResponse->m_pBuffer + sizeof(DWORD), Min ( uSize, uAvail ) ); // errors, retries and malformed warnings
+	my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), sMessage );
+	SafeDeleteArray ( sMessage ); // sphDup() allocates with new[]
+	SafeDelete ( pResponse );
+	return NULL;
+}
+
+/// udf
+
+extern "C"
+{
+	my_bool sphinx_snippets_init ( UDF_INIT * pUDF, UDF_ARGS * pArgs, char * sMessage );
+	void sphinx_snippets_deinit ( UDF_INIT * pUDF );
+	char * sphinx_snippets ( UDF_INIT * pUDF, UDF_ARGS * pArgs, char * sResult, unsigned long * pLength, char * pIsNull, char * sError );
+};
+
+#define MAX_MESSAGE_LENGTH 255
+#define MAX_RESULT_LENGTH 255
+
+struct CSphSnippets
+{
+	CSphUrl m_tUrl;					// where to reach searchd
+	CSphResponse * m_pResponse;		// reply backing the last returned result; deleted in the destructor
+
+	int m_iBeforeMatch;				// UDF argument index of the option, 0 = use the built-in default
+	int m_iAfterMatch;				// same convention as m_iBeforeMatch
+	int m_iChunkSeparator;			// same convention as m_iBeforeMatch
+	int m_iLimit;
+	int m_iAround;
+	int m_iFlags;					// bitmask; sphinx_snippets_init() ORs in 2, 4, 8 and 16 for the boolean options
+
+	CSphSnippets()
+		: m_pResponse(NULL)
+		, m_iBeforeMatch(0)
+		, m_iAfterMatch(0)
+		, m_iChunkSeparator(0)
+		  // defaults
+		, m_iLimit(256)
+		, m_iAround(5)
+		, m_iFlags(1) // NOTE(review): the meaning of bit 1 is not visible in this file
+	{
+	}
+
+	~CSphSnippets()
+	{
+		SafeDelete ( m_pResponse );
+	}
+};
+
+#define KEYWORD(NAME) else if ( pArgs->attribute_lengths[i] == sizeof(NAME)-1 && memcmp ( NAME, pArgs->attributes[i], sizeof(NAME)-1 ) == 0 )
+// KEYWORD matches the whole option name only; CHECK_TYPE rejects a wrong type or a non-constant (NULL) value
+#define CHECK_TYPE(TYPE,TYPENAME)									\
+	if ( pArgs->arg_type[i] != TYPE )								\
+	{																\
+		snprintf ( sMessage, MAX_MESSAGE_LENGTH,					\
+				   "%.*s argument must be " TYPENAME,				\
+				   (int)pArgs->attribute_lengths[i],				\
+				   pArgs->attributes[i] );							\
+		bFail = true;												\
+		break;														\
+	}																\
+	if ( !pArgs->args[i] )											\
+	{																\
+		snprintf ( sMessage, MAX_MESSAGE_LENGTH,					\
+				   "%.*s argument must be constant (and not NULL)",	\
+				   (int)pArgs->attribute_lengths[i],				\
+				   pArgs->attributes[i] );							\
+		bFail = true;												\
+		break;														\
+	}
+
+#define STRING CHECK_TYPE(STRING_RESULT, "a string")
+#define INT CHECK_TYPE(INT_RESULT, "an integer"); int iValue = *(long long *)pArgs->args[i]
+/// UDF init: validates the arguments and stores the parsed options in pUDF->ptr; returns 1 with sMessage set on failure
+my_bool sphinx_snippets_init ( UDF_INIT * pUDF, UDF_ARGS * pArgs, char * sMessage )
+{
+	if ( pArgs->arg_count < 3 )
+	{
+		strncpy ( sMessage, "insufficient arguments", MAX_MESSAGE_LENGTH );
+		return 1;
+	}
+
+	bool bFail = false;
+	CSphSnippets * pOpts = new CSphSnippets;
+	for ( uint i = 0; i < pArgs->arg_count; i++ )
+	{
+		if ( i < 3 )
+		{
+			// the first three arguments are positional and only type-checked here
+			if ( pArgs->arg_type[i] != STRING_RESULT )
+			{
+				strncpy ( sMessage, "first three arguments must be of string type", MAX_MESSAGE_LENGTH );
+				bFail = true;
+				break;
+			}
+		}
+		KEYWORD("sphinx")
+		{
+			STRING;
+			if ( !pOpts->m_tUrl.Parse ( pArgs->args[i], pArgs->lengths[i] ) )
+			{
+				strncpy ( sMessage, "failed to parse connection string", MAX_MESSAGE_LENGTH );
+				bFail = true;
+				break;
+			}
+		}
+		KEYWORD("before_match")		{ STRING; pOpts->m_iBeforeMatch = i; }
+		KEYWORD("after_match")		{ STRING; pOpts->m_iAfterMatch = i; }
+		KEYWORD("chunk_separator")	{ STRING; pOpts->m_iChunkSeparator = i; }
+		KEYWORD("limit")			{ INT; pOpts->m_iLimit = iValue; }
+		KEYWORD("around")			{ INT; pOpts->m_iAround = iValue; }
+		KEYWORD("exact_phrase")		{ INT; if ( iValue ) pOpts->m_iFlags |= 2; }
+		KEYWORD("single_passage")	{ INT; if ( iValue ) pOpts->m_iFlags |= 4; }
+		KEYWORD("use_boundaries")	{ INT; if ( iValue ) pOpts->m_iFlags |= 8; }
+		KEYWORD("weight_order")		{ INT; if ( iValue ) pOpts->m_iFlags |= 16; }
+		else
+		{
+			snprintf ( sMessage, MAX_MESSAGE_LENGTH, "unrecognized argument: %.*s",
+					   (int)pArgs->attribute_lengths[i], pArgs->attributes[i] );
+			bFail = true;
+			break;
+		}
+	}
+	// on any failure sMessage is already filled in; drop the half-built options
+	if ( bFail )
+	{
+		SafeDelete ( pOpts );
+		return 1;
+	}
+	pUDF->ptr = (char *)pOpts;
+	return 0;
+}
+
+#undef STRING
+#undef INT
+#undef KEYWORD
+#undef CHECK_TYPE
+
+#define ARG(i) pArgs->args[i], pArgs->lengths[i]
+#define ARG_LEN(VAR, LEN) ( VAR ? pArgs->lengths[VAR] : LEN )
+
+#define SEND_STRING(INDEX, DEFAULT)							\
+	if ( INDEX )											\
+		tBuffer.SendString ( ARG(INDEX) );					\
+	else													\
+		tBuffer.SendString ( DEFAULT, sizeof(DEFAULT) - 1 );
+
+/// UDF body: sends one excerpt request to searchd and returns the reply text; sets *pError on any failure
+char * sphinx_snippets ( UDF_INIT * pUDF, UDF_ARGS * pArgs, char * sResult, unsigned long * pLength, char * pIsNull, char * pError )
+{
+	CSphSnippets * pOpts = (CSphSnippets *)pUDF->ptr;
+	assert ( pOpts );
+
+	if ( !pArgs->args[0] || !pArgs->args[1] || !pArgs->args[2] )
+	{
+		*pIsNull = 1;
+		return sResult;
+	}
+
+	// must equal the bytes sent below exactly, Finalize() checks for that
+	const int iSize =
+		8 + // header
+		8 + // two dwords: 0 and the flags
+		4 + pArgs->lengths[1] + // index
+		4 + pArgs->lengths[2] + // words
+		4 + ARG_LEN ( pOpts->m_iBeforeMatch, 3 ) +
+		4 + ARG_LEN ( pOpts->m_iAfterMatch, 4 ) +
+		4 + ARG_LEN ( pOpts->m_iChunkSeparator, 5 ) +
+		12 + // limit, around, document count
+		4 + pArgs->lengths[0]; // document
+	CSphBuffer tBuffer(iSize);
+
+	tBuffer.SendWord ( SEARCHD_COMMAND_EXCERPT );
+	tBuffer.SendWord ( VER_COMMAND_EXCERPT );
+	tBuffer.SendDword ( iSize - 8 );
+
+	tBuffer.SendDword ( 0 );
+	tBuffer.SendDword ( pOpts->m_iFlags );
+
+	tBuffer.SendString ( ARG(1) ); // index
+	tBuffer.SendString ( ARG(2) ); // words
+
+	SEND_STRING ( pOpts->m_iBeforeMatch, "<b>" );
+	SEND_STRING ( pOpts->m_iAfterMatch, "</b>" );
+	SEND_STRING ( pOpts->m_iChunkSeparator, " ... " );
+
+	tBuffer.SendInt ( pOpts->m_iLimit );
+	tBuffer.SendInt ( pOpts->m_iAround );
+
+	// single document
+	tBuffer.SendInt ( 1 );
+	tBuffer.SendString ( ARG(0) );
+
+	int iSocket = -1;
+	do
+	{
+		if ( !tBuffer.Finalize() )
+		{
+			my_error ( ER_QUERY_ON_FOREIGN_DATA_SOURCE, MYF(0), "INTERNAL ERROR: failed to build request" );
+			break;
+		}
+		
+		iSocket = pOpts->m_tUrl.Connect();
+		if ( iSocket == -1 ) break;
+		if ( !sphSend ( iSocket, tBuffer.Ptr(), iSize, sphReportErrors ) ) break;
+
+		CSphResponse * pResponse = CSphResponse::Read ( iSocket, VER_COMMAND_EXCERPT );
+		if ( !pResponse ) break;
+		close ( iSocket );
+		SafeDelete ( pOpts->m_pResponse ); // drop the previous row's reply, it used to leak on every call
+		pOpts->m_pResponse = pResponse; // the returned pointer lives inside this object
+		*pLength = ntohl( *(DWORD *)pResponse->m_pBody );
+		return pResponse->m_pBody + sizeof(DWORD);
+	}
+	while(0);
+
+	if ( iSocket != -1 )
+		close ( iSocket );
+
+	*pError = 1;
+	return sResult;
+}
+
+#undef SEND_STRING
+#undef ARG_LEN
+#undef ARG
+
+void sphinx_snippets_deinit ( UDF_INIT * pUDF ) // UDF cleanup: releases what sphinx_snippets_init() stored in pUDF->ptr
+{
+	CSphSnippets * pOpts = (CSphSnippets *)pUDF->ptr;
+	SafeDelete ( pOpts ); // the destructor also deletes the last response; pUDF->ptr itself is not reset
+}
+
+//
+// $Id: snippets_udf.cc 2058 2009-11-07 04:01:57Z shodan $
+//
diff --git a/storage/sphinx/sphinx.5.0.22.diff b/storage/sphinx/sphinx.5.0.22.diff
new file mode 100644
index 00000000000..7dd4ebf1410
--- /dev/null
+++ b/storage/sphinx/sphinx.5.0.22.diff
@@ -0,0 +1,284 @@
+diff -B -N -r -u mysql-5.0.22/config/ac-macros/ha_sphinx.m4 mysql-5.0.22.sx/config/ac-macros/ha_sphinx.m4
+--- mysql-5.0.22/config/ac-macros/ha_sphinx.m4	1970-01-01 01:00:00.000000000 +0100
++++ mysql-5.0.22.sx/config/ac-macros/ha_sphinx.m4	2006-06-06 19:49:38.000000000 +0200
+@@ -0,0 +1,30 @@
++dnl ---------------------------------------------------------------------------
++dnl Macro: MYSQL_CHECK_SPHINXDB
++dnl Sets HAVE_SPHINX_DB if --with-sphinx-storage-engine is used
++dnl ---------------------------------------------------------------------------
++AC_DEFUN([MYSQL_CHECK_SPHINXDB], [
++  AC_ARG_WITH([sphinx-storage-engine],
++              [
++  --with-sphinx-storage-engine
++                          Enable the Sphinx Storage Engine],
++              [sphinxdb="$withval"],
++              [sphinxdb=no])
++  AC_MSG_CHECKING([for example storage engine])
++
++  case "$sphinxdb" in
++    yes )
++      AC_DEFINE([HAVE_SPHINX_DB], [1], [Builds Sphinx Engine])
++      AC_MSG_RESULT([yes])
++      [sphinxdb=yes]
++      ;;
++    * )
++      AC_MSG_RESULT([no])
++      [sphinxdb=no]
++      ;;
++  esac
++
++])
++dnl ---------------------------------------------------------------------------
++dnl END OF MYSQL_CHECK_SPHINXDB SECTION
++dnl ---------------------------------------------------------------------------
++
+diff -B -N -r -u mysql-5.0.22/configure.in mysql-5.0.22.sx/configure.in
+--- mysql-5.0.22/configure.in	2006-05-25 10:56:45.000000000 +0200
++++ mysql-5.0.22.sx/configure.in	2006-06-06 19:49:38.000000000 +0200
+@@ -41,6 +41,7 @@
+ sinclude(config/ac-macros/ha_berkeley.m4)
+ sinclude(config/ac-macros/ha_blackhole.m4)
+ sinclude(config/ac-macros/ha_example.m4)
++sinclude(config/ac-macros/ha_sphinx.m4)
+ sinclude(config/ac-macros/ha_federated.m4)
+ sinclude(config/ac-macros/ha_innodb.m4)
+ sinclude(config/ac-macros/ha_ndbcluster.m4)
+@@ -2450,6 +2451,7 @@
+ MYSQL_CHECK_BDB
+ MYSQL_CHECK_INNODB
+ MYSQL_CHECK_EXAMPLEDB
++MYSQL_CHECK_SPHINXDB
+ MYSQL_CHECK_ARCHIVEDB
+ MYSQL_CHECK_CSVDB
+ MYSQL_CHECK_BLACKHOLEDB
+diff -B -N -r -u mysql-5.0.22/libmysqld/Makefile.am mysql-5.0.22.sx/libmysqld/Makefile.am
+--- mysql-5.0.22/libmysqld/Makefile.am	2006-05-25 10:56:55.000000000 +0200
++++ mysql-5.0.22.sx/libmysqld/Makefile.am	2006-06-06 19:49:38.000000000 +0200
+@@ -27,7 +27,7 @@
+ 			-DSHAREDIR="\"$(MYSQLSHAREdir)\""
+ INCLUDES=		@bdb_includes@ \
+ 			-I$(top_builddir)/include -I$(top_srcdir)/include \
+-			-I$(top_srcdir)/sql -I$(top_srcdir)/sql/examples \
++			-I$(top_srcdir)/sql -I$(top_srcdir)/sql/examples -I$(top_srcdir)/sql/sphinx \
+ 			-I$(top_srcdir)/regex \
+ 			$(openssl_includes) $(yassl_includes) @ZLIB_INCLUDES@
+ 
+@@ -38,6 +38,7 @@
+ libmysqlsources =	errmsg.c get_password.c libmysql.c client.c pack.c \
+                         my_time.c
+ sqlexamplessources =	ha_example.cc ha_tina.cc
++sqlsphinxsources =	ha_sphinx.cc
+ 
+ noinst_HEADERS =	embedded_priv.h emb_qcache.h
+ 
+@@ -65,7 +66,7 @@
+ 	parse_file.cc sql_view.cc sql_trigger.cc my_decimal.cc \
+ 	ha_blackhole.cc ha_archive.cc my_user.c
+ 
+-libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources)
++libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources) $(sqlsphinxsources)
+ libmysqld_a_SOURCES=
+ 
+ # automake misses these
+@@ -133,12 +134,16 @@
+ 	    rm -f $$f; \
+ 	    @LN_CP_F@ $(top_srcdir)/sql/examples/$$f $$f; \
+ 	  done; \
++	  for f in $(sqlsphinxsources); do \
++	    rm -f $$f; \
++	    @LN_CP_F@ $(top_srcdir)/sql/sphinx/$$f $$f; \
++	  done; \
+ 	  rm -f client_settings.h; \
+ 	  @LN_CP_F@ $(top_srcdir)/libmysql/client_settings.h client_settings.h
+ 
+ 
+ clean-local:
+-	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) | sed "s;\.lo;.c;g"` \
++	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) $(sqlsphinxsources) | sed "s;\.lo;.c;g"` \
+ 	       $(top_srcdir)/linked_libmysqld_sources; \
+ 	rm -f client_settings.h
+ 
+diff -B -N -r -u mysql-5.0.22/sql/handler.cc mysql-5.0.22.sx/sql/handler.cc
+--- mysql-5.0.22/sql/handler.cc	2006-05-25 10:56:42.000000000 +0200
++++ mysql-5.0.22.sx/sql/handler.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -78,6 +78,15 @@
+   NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+   HTON_NO_FLAGS };
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++extern handlerton sphinx_hton;
++#else
++handlerton sphinx_hton = { "SPHINX", SHOW_OPTION_NO, "SPHINX storage engine",
++  DB_TYPE_SPHINX_DB, NULL, 0, 0, NULL, NULL,
++  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
++  HTON_NO_FLAGS };
++#endif
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ extern handlerton innobase_hton;
+@@ -147,6 +156,7 @@
+   &example_hton,
+   &archive_hton,
+   &tina_hton,
++  &sphinx_hton,
+   &ndbcluster_hton,
+   &federated_hton,
+   &myisammrg_hton,
+@@ -345,6 +355,12 @@
+       return new (alloc) ha_tina(table);
+     return NULL;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case DB_TYPE_SPHINX_DB:
++    if (have_sphinx_db == SHOW_OPTION_YES)
++      return new (alloc) ha_sphinx(table);
++    return NULL;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   case DB_TYPE_NDBCLUSTER:
+     if (have_ndbcluster == SHOW_OPTION_YES)
+diff -B -N -r -u mysql-5.0.22/sql/handler.h mysql-5.0.22.sx/sql/handler.h
+--- mysql-5.0.22/sql/handler.h	2006-05-25 10:56:55.000000000 +0200
++++ mysql-5.0.22.sx/sql/handler.h	2006-06-06 19:49:38.000000000 +0200
+@@ -183,8 +183,9 @@
+   DB_TYPE_BERKELEY_DB, DB_TYPE_INNODB,
+   DB_TYPE_GEMINI, DB_TYPE_NDBCLUSTER,
+   DB_TYPE_EXAMPLE_DB, DB_TYPE_ARCHIVE_DB, DB_TYPE_CSV_DB,
+-  DB_TYPE_FEDERATED_DB,
++  DB_TYPE_FEDERATED_DB, 
+   DB_TYPE_BLACKHOLE_DB,
++  DB_TYPE_SPHINX_DB,
+   DB_TYPE_DEFAULT // Must be last
+ };
+ 
+diff -B -N -r -u mysql-5.0.22/sql/Makefile.am mysql-5.0.22.sx/sql/Makefile.am
+--- mysql-5.0.22/sql/Makefile.am	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/Makefile.am	2006-06-06 19:49:38.000000000 +0200
+@@ -66,6 +66,7 @@
+ 			sql_array.h sql_cursor.h \
+ 			examples/ha_example.h ha_archive.h \
+ 			examples/ha_tina.h ha_blackhole.h  \
++			sphinx/ha_sphinx.h \
+ 			ha_federated.h
+ mysqld_SOURCES =	sql_lex.cc sql_handler.cc \
+ 			item.cc item_sum.cc item_buff.cc item_func.cc \
+@@ -102,6 +103,7 @@
+ 			sp_cache.cc parse_file.cc sql_trigger.cc \
+ 			examples/ha_example.cc ha_archive.cc \
+ 			examples/ha_tina.cc ha_blackhole.cc \
++			sphinx/ha_sphinx.cc \
+ 			ha_federated.cc
+ 
+ gen_lex_hash_SOURCES =	gen_lex_hash.cc
+diff -B -N -r -u mysql-5.0.22/sql/mysqld.cc mysql-5.0.22.sx/sql/mysqld.cc
+--- mysql-5.0.22/sql/mysqld.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/mysqld.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -6420,6 +6420,11 @@
+ #else
+   have_csv_db= SHOW_OPTION_NO;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  have_sphinx_db= SHOW_OPTION_YES;
++#else
++  have_sphinx_db= SHOW_OPTION_NO;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   have_ndbcluster=SHOW_OPTION_DISABLED;
+ #else
+@@ -7457,6 +7462,7 @@
+ #undef have_example_db
+ #undef have_archive_db
+ #undef have_csv_db
++#undef have_sphinx_db
+ #undef have_federated_db
+ #undef have_partition_db
+ #undef have_blackhole_db
+@@ -7467,6 +7473,7 @@
+ SHOW_COMP_OPTION have_example_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_archive_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_csv_db= SHOW_OPTION_NO;
++SHOW_COMP_OPTION have_sphinx_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_federated_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_partition_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_blackhole_db= SHOW_OPTION_NO;
+diff -B -N -r -u mysql-5.0.22/sql/mysql_priv.h mysql-5.0.22.sx/sql/mysql_priv.h
+--- mysql-5.0.22/sql/mysql_priv.h	2006-05-25 10:56:43.000000000 +0200
++++ mysql-5.0.22.sx/sql/mysql_priv.h	2006-06-06 19:49:38.000000000 +0200
+@@ -1279,6 +1279,12 @@
+ #else
+ extern SHOW_COMP_OPTION have_csv_db;
+ #endif
++#ifdef HAVE_SPHINX_DB
++extern handlerton sphinx_hton;
++#define have_sphinx_db sphinx_hton.state
++#else
++extern SHOW_COMP_OPTION have_sphinx_db;
++#endif
+ #ifdef HAVE_FEDERATED_DB
+ extern handlerton federated_hton;
+ #define have_federated_db federated_hton.state
+diff -B -N -r -u mysql-5.0.22/sql/set_var.cc mysql-5.0.22.sx/sql/set_var.cc
+--- mysql-5.0.22/sql/set_var.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/set_var.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -809,6 +809,7 @@
+   {"have_compress",	      (char*) &have_compress,		    SHOW_HAVE},
+   {"have_crypt",	      (char*) &have_crypt,		    SHOW_HAVE},
+   {"have_csv",	              (char*) &have_csv_db,	            SHOW_HAVE},
++  {"have_sphinx",	      (char*) &have_sphinx_db,	            SHOW_HAVE},
+   {"have_example_engine",     (char*) &have_example_db,	            SHOW_HAVE},
+   {"have_federated_engine",   (char*) &have_federated_db,           SHOW_HAVE},
+   {"have_geometry",           (char*) &have_geometry,               SHOW_HAVE},
+diff -B -N -r -u mysql-5.0.22/sql/sql_lex.h mysql-5.0.22.sx/sql/sql_lex.h
+--- mysql-5.0.22/sql/sql_lex.h	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_lex.h	2006-06-06 19:49:38.000000000 +0200
+@@ -58,6 +58,7 @@
+   SQLCOM_SHOW_DATABASES, SQLCOM_SHOW_TABLES, SQLCOM_SHOW_FIELDS,
+   SQLCOM_SHOW_KEYS, SQLCOM_SHOW_VARIABLES, SQLCOM_SHOW_LOGS, SQLCOM_SHOW_STATUS,
+   SQLCOM_SHOW_INNODB_STATUS, SQLCOM_SHOW_NDBCLUSTER_STATUS, SQLCOM_SHOW_MUTEX_STATUS,
++  SQLCOM_SHOW_SPHINX_STATUS,
+   SQLCOM_SHOW_PROCESSLIST, SQLCOM_SHOW_MASTER_STAT, SQLCOM_SHOW_SLAVE_STAT,
+   SQLCOM_SHOW_GRANTS, SQLCOM_SHOW_CREATE, SQLCOM_SHOW_CHARSETS,
+   SQLCOM_SHOW_COLLATIONS, SQLCOM_SHOW_CREATE_DB, SQLCOM_SHOW_TABLE_STATUS,
+diff -B -N -r -u mysql-5.0.22/sql/sql_parse.cc mysql-5.0.22.sx/sql/sql_parse.cc
+--- mysql-5.0.22/sql/sql_parse.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_parse.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -25,6 +25,9 @@
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++#endif
+ 
+ #ifdef HAVE_NDBCLUSTER_DB
+ #include "ha_ndbcluster.h"
+@@ -2722,6 +2725,15 @@
+       break;
+     }
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case SQLCOM_SHOW_SPHINX_STATUS:
++    {
++      if (check_global_access(thd, SUPER_ACL))
++	goto error;
++      res = sphinx_show_status(thd);
++      break;
++    }
++#endif
+ #ifdef HAVE_REPLICATION
+   case SQLCOM_LOAD_MASTER_TABLE:
+   {
+diff -B -N -r -u mysql-5.0.22/sql/sql_yacc.yy mysql-5.0.22.sx/sql/sql_yacc.yy
+--- mysql-5.0.22/sql/sql_yacc.yy	2006-05-25 10:56:43.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_yacc.yy	2006-06-06 19:49:38.000000000 +0200
+@@ -6584,6 +6584,9 @@
+ 	    case DB_TYPE_INNODB:
+ 	      Lex->sql_command = SQLCOM_SHOW_INNODB_STATUS;
+ 	      break;
++	    case DB_TYPE_SPHINX_DB:
++	      Lex->sql_command = SQLCOM_SHOW_SPHINX_STATUS;
++	      break;
+ 	    default:
+ 	      my_error(ER_NOT_SUPPORTED_YET, MYF(0), "STATUS");
+ 	      YYABORT;
diff --git a/storage/sphinx/sphinx.5.0.27.diff b/storage/sphinx/sphinx.5.0.27.diff
new file mode 100644
index 00000000000..9ff6cf4fe48
--- /dev/null
+++ b/storage/sphinx/sphinx.5.0.27.diff
@@ -0,0 +1,284 @@
+diff -B -N -r -u mysql-5.0.22/config/ac-macros/ha_sphinx.m4 mysql-5.0.22.sx/config/ac-macros/ha_sphinx.m4
+--- mysql-5.0.22/config/ac-macros/ha_sphinx.m4	1970-01-01 01:00:00.000000000 +0100
++++ mysql-5.0.22.sx/config/ac-macros/ha_sphinx.m4	2006-06-06 19:49:38.000000000 +0200
+@@ -0,0 +1,30 @@
++dnl ---------------------------------------------------------------------------
++dnl Macro: MYSQL_CHECK_SPHINXDB
++dnl Sets HAVE_SPHINX_DB if --with-sphinx-storage-engine is used
++dnl ---------------------------------------------------------------------------
++AC_DEFUN([MYSQL_CHECK_SPHINXDB], [
++  AC_ARG_WITH([sphinx-storage-engine],
++              [
++  --with-sphinx-storage-engine
++                          Enable the Sphinx Storage Engine],
++              [sphinxdb="$withval"],
++              [sphinxdb=no])
++  AC_MSG_CHECKING([for sphinx storage engine])
++
++  case "$sphinxdb" in
++    yes )
++      AC_DEFINE([HAVE_SPHINX_DB], [1], [Builds Sphinx Engine])
++      AC_MSG_RESULT([yes])
++      [sphinxdb=yes]
++      ;;
++    * )
++      AC_MSG_RESULT([no])
++      [sphinxdb=no]
++      ;;
++  esac
++
++])
++dnl ---------------------------------------------------------------------------
++dnl END OF MYSQL_CHECK_SPHINXDB SECTION
++dnl ---------------------------------------------------------------------------
++
+diff -B -N -r -u mysql-5.0.22/configure.in mysql-5.0.22.sx/configure.in
+--- mysql-5.0.22/configure.in	2006-05-25 10:56:45.000000000 +0200
++++ mysql-5.0.22.sx/configure.in	2006-06-06 19:49:38.000000000 +0200
+@@ -41,6 +41,7 @@
+ sinclude(config/ac-macros/ha_berkeley.m4)
+ sinclude(config/ac-macros/ha_blackhole.m4)
+ sinclude(config/ac-macros/ha_example.m4)
++sinclude(config/ac-macros/ha_sphinx.m4)
+ sinclude(config/ac-macros/ha_federated.m4)
+ sinclude(config/ac-macros/ha_innodb.m4)
+ sinclude(config/ac-macros/ha_ndbcluster.m4)
+@@ -2450,6 +2451,7 @@
+ MYSQL_CHECK_BDB
+ MYSQL_CHECK_INNODB
+ MYSQL_CHECK_EXAMPLEDB
++MYSQL_CHECK_SPHINXDB
+ MYSQL_CHECK_ARCHIVEDB
+ MYSQL_CHECK_CSVDB
+ MYSQL_CHECK_BLACKHOLEDB
+diff -B -N -r -u mysql-5.0.22/libmysqld/Makefile.am mysql-5.0.22.sx/libmysqld/Makefile.am
+--- mysql-5.0.22/libmysqld/Makefile.am	2006-05-25 10:56:55.000000000 +0200
++++ mysql-5.0.22.sx/libmysqld/Makefile.am	2006-06-06 19:49:38.000000000 +0200
+@@ -27,7 +27,7 @@
+ 			-DSHAREDIR="\"$(MYSQLSHAREdir)\""
+ INCLUDES=		@bdb_includes@ \
+ 			-I$(top_builddir)/include -I$(top_srcdir)/include \
+-			-I$(top_srcdir)/sql -I$(top_srcdir)/sql/examples \
++			-I$(top_srcdir)/sql -I$(top_srcdir)/sql/examples -I$(top_srcdir)/sql/sphinx \
+ 			-I$(top_srcdir)/regex \
+ 			$(openssl_includes) $(yassl_includes) @ZLIB_INCLUDES@
+ 
+@@ -38,6 +38,7 @@
+ libmysqlsources =	errmsg.c get_password.c libmysql.c client.c pack.c \
+                         my_time.c
+ sqlexamplessources =	ha_example.cc ha_tina.cc
++sqlsphinxsources =	ha_sphinx.cc
+ 
+ noinst_HEADERS =	embedded_priv.h emb_qcache.h
+ 
+@@ -65,7 +66,7 @@
+ 	parse_file.cc sql_view.cc sql_trigger.cc my_decimal.cc \
+ 	ha_blackhole.cc ha_archive.cc my_user.c
+ 
+-libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources)
++libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources) $(sqlsphinxsources)
+ libmysqld_a_SOURCES=
+ 
+ # automake misses these
+@@ -133,12 +134,16 @@
+ 	    rm -f $$f; \
+ 	    @LN_CP_F@ $(top_srcdir)/sql/examples/$$f $$f; \
+ 	  done; \
++	  for f in $(sqlsphinxsources); do \
++	    rm -f $$f; \
++	    @LN_CP_F@ $(top_srcdir)/sql/sphinx/$$f $$f; \
++	  done; \
+ 	  rm -f client_settings.h; \
+ 	  @LN_CP_F@ $(top_srcdir)/libmysql/client_settings.h client_settings.h
+ 
+ 
+ clean-local:
+-	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) | sed "s;\.lo;.c;g"` \
++	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) $(sqlsphinxsources) | sed "s;\.lo;.c;g"` \
+ 	       $(top_srcdir)/linked_libmysqld_sources; \
+ 	rm -f client_settings.h
+ 
+diff -B -N -r -u mysql-5.0.22/sql/handler.cc mysql-5.0.22.sx/sql/handler.cc
+--- mysql-5.0.22/sql/handler.cc	2006-05-25 10:56:42.000000000 +0200
++++ mysql-5.0.22.sx/sql/handler.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -78,6 +78,15 @@
+   NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+   HTON_NO_FLAGS };
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++extern handlerton sphinx_hton;
++#else
++handlerton sphinx_hton = { "SPHINX", SHOW_OPTION_NO, "SPHINX storage engine",
++  DB_TYPE_SPHINX_DB, NULL, 0, 0, NULL, NULL,
++  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
++  HTON_NO_FLAGS };
++#endif
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ extern handlerton innobase_hton;
+@@ -147,6 +156,7 @@
+   &example_hton,
+   &archive_hton,
+   &tina_hton,
++  &sphinx_hton,
+   &ndbcluster_hton,
+   &federated_hton,
+   &myisammrg_hton,
+@@ -345,6 +355,12 @@
+       return new (alloc) ha_tina(table);
+     return NULL;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case DB_TYPE_SPHINX_DB:
++    if (have_sphinx_db == SHOW_OPTION_YES)
++      return new (alloc) ha_sphinx(table);
++    return NULL;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   case DB_TYPE_NDBCLUSTER:
+     if (have_ndbcluster == SHOW_OPTION_YES)
+diff -B -N -r -u mysql-5.0.22/sql/handler.h mysql-5.0.22.sx/sql/handler.h
+--- mysql-5.0.22/sql/handler.h	2006-05-25 10:56:55.000000000 +0200
++++ mysql-5.0.22.sx/sql/handler.h	2006-06-06 19:49:38.000000000 +0200
+@@ -183,8 +183,9 @@
+   DB_TYPE_BERKELEY_DB, DB_TYPE_INNODB,
+   DB_TYPE_GEMINI, DB_TYPE_NDBCLUSTER,
+   DB_TYPE_EXAMPLE_DB, DB_TYPE_ARCHIVE_DB, DB_TYPE_CSV_DB,
+-  DB_TYPE_FEDERATED_DB,
++  DB_TYPE_FEDERATED_DB, 
+   DB_TYPE_BLACKHOLE_DB,
++  DB_TYPE_SPHINX_DB,
+   DB_TYPE_DEFAULT // Must be last
+ };
+ 
+diff -B -N -r -u mysql-5.0.22/sql/Makefile.am mysql-5.0.22.sx/sql/Makefile.am
+--- mysql-5.0.22/sql/Makefile.am	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/Makefile.am	2006-06-06 19:49:38.000000000 +0200
+@@ -66,6 +66,7 @@
+ 			sql_array.h sql_cursor.h \
+ 			examples/ha_example.h ha_archive.h \
+ 			examples/ha_tina.h ha_blackhole.h  \
++			sphinx/ha_sphinx.h \
+ 			ha_federated.h
+ mysqld_SOURCES =	sql_lex.cc sql_handler.cc \
+ 			item.cc item_sum.cc item_buff.cc item_func.cc \
+@@ -102,6 +103,7 @@
+ 			sp_cache.cc parse_file.cc sql_trigger.cc \
+ 			examples/ha_example.cc ha_archive.cc \
+ 			examples/ha_tina.cc ha_blackhole.cc \
++			sphinx/ha_sphinx.cc \
+ 			ha_federated.cc
+ 
+ gen_lex_hash_SOURCES =	gen_lex_hash.cc
+diff -B -N -r -u mysql-5.0.22/sql/mysqld.cc mysql-5.0.22.sx/sql/mysqld.cc
+--- mysql-5.0.22/sql/mysqld.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/mysqld.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -6420,6 +6420,11 @@
+ #else
+   have_csv_db= SHOW_OPTION_NO;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  have_sphinx_db= SHOW_OPTION_YES;
++#else
++  have_sphinx_db= SHOW_OPTION_NO;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   have_ndbcluster=SHOW_OPTION_DISABLED;
+ #else
+@@ -7457,6 +7462,7 @@
+ #undef have_example_db
+ #undef have_archive_db
+ #undef have_csv_db
++#undef have_sphinx_db
+ #undef have_federated_db
+ #undef have_partition_db
+ #undef have_blackhole_db
+@@ -7467,6 +7473,7 @@
+ SHOW_COMP_OPTION have_example_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_archive_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_csv_db= SHOW_OPTION_NO;
++SHOW_COMP_OPTION have_sphinx_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_federated_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_partition_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_blackhole_db= SHOW_OPTION_NO;
+diff -B -N -r -u mysql-5.0.22/sql/mysql_priv.h mysql-5.0.22.sx/sql/mysql_priv.h
+--- mysql-5.0.22/sql/mysql_priv.h	2006-05-25 10:56:43.000000000 +0200
++++ mysql-5.0.22.sx/sql/mysql_priv.h	2006-06-06 19:49:38.000000000 +0200
+@@ -1279,6 +1279,12 @@
+ #else
+ extern SHOW_COMP_OPTION have_csv_db;
+ #endif
++#ifdef HAVE_SPHINX_DB
++extern handlerton sphinx_hton;
++#define have_sphinx_db sphinx_hton.state
++#else
++extern SHOW_COMP_OPTION have_sphinx_db;
++#endif
+ #ifdef HAVE_FEDERATED_DB
+ extern handlerton federated_hton;
+ #define have_federated_db federated_hton.state
+diff -B -N -r -u mysql-5.0.22/sql/set_var.cc mysql-5.0.22.sx/sql/set_var.cc
+--- mysql-5.0.22/sql/set_var.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/set_var.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -864,6 +864,7 @@
+   {"have_compress",	      (char*) &have_compress,		    SHOW_HAVE},
+   {"have_crypt",	      (char*) &have_crypt,		    SHOW_HAVE},
+   {"have_csv",	              (char*) &have_csv_db,	            SHOW_HAVE},
++  {"have_sphinx",             (char*) &have_sphinx_db,              SHOW_HAVE},
+   {"have_dynamic_loading",    (char*) &have_dlopen,	            SHOW_HAVE},
+   {"have_example_engine",     (char*) &have_example_db,	            SHOW_HAVE},
+   {"have_federated_engine",   (char*) &have_federated_db,           SHOW_HAVE},
+diff -B -N -r -u mysql-5.0.22/sql/sql_lex.h mysql-5.0.22.sx/sql/sql_lex.h
+--- mysql-5.0.22/sql/sql_lex.h	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_lex.h	2006-06-06 19:49:38.000000000 +0200
+@@ -58,6 +58,7 @@
+   SQLCOM_SHOW_DATABASES, SQLCOM_SHOW_TABLES, SQLCOM_SHOW_FIELDS,
+   SQLCOM_SHOW_KEYS, SQLCOM_SHOW_VARIABLES, SQLCOM_SHOW_LOGS, SQLCOM_SHOW_STATUS,
+   SQLCOM_SHOW_INNODB_STATUS, SQLCOM_SHOW_NDBCLUSTER_STATUS, SQLCOM_SHOW_MUTEX_STATUS,
++  SQLCOM_SHOW_SPHINX_STATUS,
+   SQLCOM_SHOW_PROCESSLIST, SQLCOM_SHOW_MASTER_STAT, SQLCOM_SHOW_SLAVE_STAT,
+   SQLCOM_SHOW_GRANTS, SQLCOM_SHOW_CREATE, SQLCOM_SHOW_CHARSETS,
+   SQLCOM_SHOW_COLLATIONS, SQLCOM_SHOW_CREATE_DB, SQLCOM_SHOW_TABLE_STATUS,
+diff -B -N -r -u mysql-5.0.22/sql/sql_parse.cc mysql-5.0.22.sx/sql/sql_parse.cc
+--- mysql-5.0.22/sql/sql_parse.cc	2006-05-25 10:56:41.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_parse.cc	2006-06-06 19:49:38.000000000 +0200
+@@ -25,6 +25,9 @@
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++#endif
+ 
+ #ifdef HAVE_NDBCLUSTER_DB
+ #include "ha_ndbcluster.h"
+@@ -2722,6 +2725,15 @@
+       break;
+     }
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case SQLCOM_SHOW_SPHINX_STATUS:
++    {
++      if (check_global_access(thd, SUPER_ACL))
++	goto error;
++      res = sphinx_show_status(thd);
++      break;
++    }
++#endif
+ #ifdef HAVE_REPLICATION
+   case SQLCOM_LOAD_MASTER_TABLE:
+   {
+diff -B -N -r -u mysql-5.0.22/sql/sql_yacc.yy mysql-5.0.22.sx/sql/sql_yacc.yy
+--- mysql-5.0.22/sql/sql_yacc.yy	2006-05-25 10:56:43.000000000 +0200
++++ mysql-5.0.22.sx/sql/sql_yacc.yy	2006-06-06 19:49:38.000000000 +0200
+@@ -6584,6 +6584,9 @@
+ 	    case DB_TYPE_INNODB:
+ 	      Lex->sql_command = SQLCOM_SHOW_INNODB_STATUS;
+ 	      break;
++	    case DB_TYPE_SPHINX_DB:
++	      Lex->sql_command = SQLCOM_SHOW_SPHINX_STATUS;
++	      break;
+ 	    default:
+ 	      my_error(ER_NOT_SUPPORTED_YET, MYF(0), "STATUS");
+ 	      YYABORT;
diff --git a/storage/sphinx/sphinx.5.0.37.diff b/storage/sphinx/sphinx.5.0.37.diff
new file mode 100644
index 00000000000..3f86e545b4d
--- /dev/null
+++ b/storage/sphinx/sphinx.5.0.37.diff
@@ -0,0 +1,338 @@
+--- mysql-5.0.67/config/ac-macros/ha_sphinx.m4	1970-01-01 10:00:00.000000000 +1000
++++ mysql-5.0.67-sphinx/config/ac-macros/ha_sphinx.m4	2009-02-14 09:15:48.000000000 +1000
+@@ -0,0 +1,30 @@
++dnl ---------------------------------------------------------------------------
++dnl Macro: MYSQL_CHECK_SPHINXDB
++dnl Sets HAVE_SPHINX_DB if --with-sphinx-storage-engine is used
++dnl ---------------------------------------------------------------------------
++AC_DEFUN([MYSQL_CHECK_SPHINXDB], [
++  AC_ARG_WITH([sphinx-storage-engine],
++              [
++  --with-sphinx-storage-engine
++                          Enable the Sphinx Storage Engine],
++              [sphinxdb="$withval"],
++              [sphinxdb=no])
++  AC_MSG_CHECKING([for sphinx storage engine])
++
++  case "$sphinxdb" in
++    yes )
++      AC_DEFINE([HAVE_SPHINX_DB], [1], [Builds Sphinx Engine])
++      AC_MSG_RESULT([yes])
++      [sphinxdb=yes]
++      ;;
++    * )
++      AC_MSG_RESULT([no])
++      [sphinxdb=no]
++      ;;
++  esac
++
++])
++dnl ---------------------------------------------------------------------------
++dnl END OF MYSQL_CHECK_SPHINXDB SECTION
++dnl ---------------------------------------------------------------------------
++
+--- mysql-5.0.67/configure.in	2008-08-04 23:19:07.000000000 +1100
++++ mysql-5.0.67-sphinx/configure.in	2009-02-14 09:15:48.000000000 +1000
+@@ -58,6 +58,7 @@
+ sinclude(config/ac-macros/ha_berkeley.m4)
+ sinclude(config/ac-macros/ha_blackhole.m4)
+ sinclude(config/ac-macros/ha_example.m4)
++sinclude(config/ac-macros/ha_sphinx.m4)
+ sinclude(config/ac-macros/ha_federated.m4)
+ sinclude(config/ac-macros/ha_innodb.m4)
+ sinclude(config/ac-macros/ha_ndbcluster.m4)
+@@ -2625,6 +2626,7 @@
+ MYSQL_CHECK_BDB
+ MYSQL_CHECK_INNODB
+ MYSQL_CHECK_EXAMPLEDB
++MYSQL_CHECK_SPHINXDB
+ MYSQL_CHECK_ARCHIVEDB
+ MYSQL_CHECK_CSVDB
+ MYSQL_CHECK_BLACKHOLEDB
+--- mysql-5.0.67/libmysqld/Makefile.am	2008-08-04 23:19:18.000000000 +1100
++++ mysql-5.0.67-sphinx/libmysqld/Makefile.am	2009-02-14 09:15:48.000000000 +1000
+@@ -29,6 +29,7 @@
+ 			-I$(top_builddir)/include -I$(top_srcdir)/include \
+ 			-I$(top_builddir)/sql -I$(top_srcdir)/sql \
+ 			-I$(top_srcdir)/sql/examples \
++			-I$(top_srcdir)/sql/sphinx \
+ 			-I$(top_srcdir)/regex \
+ 			$(openssl_includes) @ZLIB_INCLUDES@
+ 
+@@ -39,6 +40,7 @@
+ libmysqlsources =	errmsg.c get_password.c libmysql.c client.c pack.c \
+                         my_time.c
+ sqlexamplessources =	ha_example.cc ha_tina.cc
++sqlsphinxsources =	ha_sphinx.cc
+ 
+ noinst_HEADERS =	embedded_priv.h emb_qcache.h
+ 
+@@ -67,7 +69,7 @@
+ 	parse_file.cc sql_view.cc sql_trigger.cc my_decimal.cc \
+ 	ha_blackhole.cc ha_archive.cc my_user.c
+ 
+-libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources)
++libmysqld_int_a_SOURCES= $(libmysqld_sources) $(libmysqlsources) $(sqlsources) $(sqlexamplessources) $(sqlsphinxsources)
+ libmysqld_a_SOURCES=
+ 
+ # automake misses these
+@@ -147,12 +149,16 @@
+ 	    rm -f $$f; \
+ 	    @LN_CP_F@ $(top_srcdir)/sql/examples/$$f $$f; \
+ 	  done; \
++	  for f in $(sqlsphinxsources); do \
++	    rm -f $$f; \
++	    @LN_CP_F@ $(top_srcdir)/sql/sphinx/$$f $$f; \
++	  done; \
+ 	  rm -f client_settings.h; \
+ 	  @LN_CP_F@ $(top_srcdir)/libmysql/client_settings.h client_settings.h
+ 
+ 
+ clean-local:
+-	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) | sed "s;\.lo;.c;g"` \
++	rm -f `echo $(sqlsources) $(libmysqlsources) $(sqlexamplessources) $(sqlsphinxsources) | sed "s;\.lo;.c;g"` \
+ 	       $(top_srcdir)/linked_libmysqld_sources; \
+ 	rm -f client_settings.h
+ 
+--- mysql-5.0.67/sql/handler.cc	2008-08-04 23:20:04.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/handler.cc	2009-02-14 09:15:48.000000000 +1000
+@@ -77,6 +77,15 @@
+   NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+   HTON_NO_FLAGS };
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++extern handlerton sphinx_hton;
++#else
++handlerton sphinx_hton = { "SPHINX", SHOW_OPTION_NO, "SPHINX storage engine",
++  DB_TYPE_SPHINX_DB, NULL, 0, 0, NULL, NULL,
++  NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
++  HTON_NO_FLAGS };
++#endif
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ extern handlerton innobase_hton;
+@@ -141,6 +150,7 @@
+   &example_hton,
+   &archive_hton,
+   &tina_hton,
++  &sphinx_hton,
+   &ndbcluster_hton,
+   &federated_hton,
+   &myisammrg_hton,
+@@ -341,6 +351,12 @@
+       return new (alloc) ha_tina(table);
+     return NULL;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case DB_TYPE_SPHINX_DB:
++    if (have_sphinx_db == SHOW_OPTION_YES)
++      return new (alloc) ha_sphinx(table);
++    return NULL;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   case DB_TYPE_NDBCLUSTER:
+     if (have_ndbcluster == SHOW_OPTION_YES)
+--- mysql-5.0.67/sql/handler.h	2008-08-04 23:20:04.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/handler.h	2009-02-14 09:15:48.000000000 +1000
+@@ -186,8 +186,9 @@
+   DB_TYPE_BERKELEY_DB, DB_TYPE_INNODB,
+   DB_TYPE_GEMINI, DB_TYPE_NDBCLUSTER,
+   DB_TYPE_EXAMPLE_DB, DB_TYPE_ARCHIVE_DB, DB_TYPE_CSV_DB,
+-  DB_TYPE_FEDERATED_DB,
++  DB_TYPE_FEDERATED_DB, 
+   DB_TYPE_BLACKHOLE_DB,
++  DB_TYPE_SPHINX_DB,
+   DB_TYPE_DEFAULT // Must be last
+ };
+ 
+--- mysql-5.0.67/sql/Makefile.am	2008-08-04 23:20:02.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/Makefile.am	2009-02-14 09:23:28.000000000 +1000
+@@ -68,6 +68,7 @@
+ 			sql_array.h sql_cursor.h \
+ 			examples/ha_example.h ha_archive.h \
+ 			examples/ha_tina.h ha_blackhole.h  \
++			sphinx/ha_sphinx.h \
+ 			ha_federated.h
+ mysqld_SOURCES =	sql_lex.cc sql_handler.cc \
+ 			item.cc item_sum.cc item_buff.cc item_func.cc \
+@@ -105,6 +106,7 @@
+ 			sp_cache.cc parse_file.cc sql_trigger.cc \
+ 			examples/ha_example.cc ha_archive.cc \
+ 			examples/ha_tina.cc ha_blackhole.cc \
++			sphinx/ha_sphinx.cc \
+ 			ha_federated.cc
+ 
+ gen_lex_hash_SOURCES =	gen_lex_hash.cc
+@@ -174,6 +176,10 @@
+ udf_example_la_SOURCES= udf_example.c
+ udf_example_la_LDFLAGS= -module -rpath $(pkglibdir)
+ 
++pkglib_LTLIBRARIES = sphinx/sphinx.la
++sphinx_sphinx_la_SOURCES = sphinx/snippets_udf.cc
++sphinx_sphinx_la_LDFLAGS = -module
++
+ 
+ # Don't update the files from bitkeeper
+ %::SCCS/s.%
+--- mysql-5.0.67/sql/mysqld.cc	2008-08-04 23:20:07.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/mysqld.cc	2009-02-14 09:15:48.000000000 +1000
+@@ -36,6 +36,10 @@
+ #include <sys/prctl.h>
+ #endif
+ 
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++#endif
++
+ #ifdef HAVE_INNOBASE_DB
+ #define OPT_INNODB_DEFAULT 1
+ #else
+@@ -6633,6 +6637,13 @@
+   {"Threads_running",          (char*) &thread_running,         SHOW_INT_CONST},
+   {"Uptime",                   (char*) 0,                       SHOW_STARTTIME},
+   {"Uptime_since_flush_status",(char*) 0,                       SHOW_FLUSHTIME},
++#ifdef HAVE_SPHINX_DB
++  {"sphinx_total",			(char *)sphinx_showfunc_total,			SHOW_SPHINX_FUNC},
++  {"sphinx_total_found",	(char *)sphinx_showfunc_total_found,	SHOW_SPHINX_FUNC},
++  {"sphinx_time",			(char *)sphinx_showfunc_time,			SHOW_SPHINX_FUNC},
++  {"sphinx_word_count",		(char *)sphinx_showfunc_word_count,		SHOW_SPHINX_FUNC},
++  {"sphinx_words",			(char *)sphinx_showfunc_words,			SHOW_SPHINX_FUNC},
++#endif
+   {NullS, NullS, SHOW_LONG}
+ };
+ 
+@@ -6875,6 +6886,11 @@
+ #else
+   have_csv_db= SHOW_OPTION_NO;
+ #endif
++#ifdef HAVE_SPHINX_DB
++  have_sphinx_db= SHOW_OPTION_YES;
++#else
++  have_sphinx_db= SHOW_OPTION_NO;
++#endif
+ #ifdef HAVE_NDBCLUSTER_DB
+   have_ndbcluster=SHOW_OPTION_DISABLED;
+ #else
+@@ -7983,6 +7999,7 @@
+ #undef have_example_db
+ #undef have_archive_db
+ #undef have_csv_db
++#undef have_sphinx_db
+ #undef have_federated_db
+ #undef have_partition_db
+ #undef have_blackhole_db
+@@ -7993,6 +8010,7 @@
+ SHOW_COMP_OPTION have_example_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_archive_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_csv_db= SHOW_OPTION_NO;
++SHOW_COMP_OPTION have_sphinx_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_federated_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_partition_db= SHOW_OPTION_NO;
+ SHOW_COMP_OPTION have_blackhole_db= SHOW_OPTION_NO;
+--- mysql-5.0.67/sql/mysql_priv.h	2008-08-04 23:20:07.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/mysql_priv.h	2009-02-14 09:15:48.000000000 +1000
+@@ -1439,6 +1439,12 @@
+ #else
+ extern SHOW_COMP_OPTION have_csv_db;
+ #endif
++#ifdef HAVE_SPHINX_DB
++extern handlerton sphinx_hton;
++#define have_sphinx_db sphinx_hton.state
++#else
++extern SHOW_COMP_OPTION have_sphinx_db;
++#endif
+ #ifdef HAVE_FEDERATED_DB
+ extern handlerton federated_hton;
+ #define have_federated_db federated_hton.state
+--- mysql-5.0.67/sql/set_var.cc	2008-08-04 23:20:08.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/set_var.cc	2009-02-14 09:15:48.000000000 +1000
+@@ -888,6 +888,7 @@
+   {"have_compress",	      (char*) &have_compress,		    SHOW_HAVE},
+   {"have_crypt",	      (char*) &have_crypt,		    SHOW_HAVE},
+   {"have_csv",	              (char*) &have_csv_db,	            SHOW_HAVE},
++  {"have_sphinx",             (char*) &have_sphinx_db,              SHOW_HAVE},
+   {"have_dynamic_loading",    (char*) &have_dlopen,	            SHOW_HAVE},
+   {"have_example_engine",     (char*) &have_example_db,	            SHOW_HAVE},
+   {"have_federated_engine",   (char*) &have_federated_db,           SHOW_HAVE},
+--- mysql-5.0.67/sql/sql_lex.h	2008-08-04 23:20:10.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/sql_lex.h	2009-02-14 09:15:48.000000000 +1000
+@@ -57,6 +57,7 @@
+   SQLCOM_SHOW_DATABASES, SQLCOM_SHOW_TABLES, SQLCOM_SHOW_FIELDS,
+   SQLCOM_SHOW_KEYS, SQLCOM_SHOW_VARIABLES, SQLCOM_SHOW_LOGS, SQLCOM_SHOW_STATUS,
+   SQLCOM_SHOW_INNODB_STATUS, SQLCOM_SHOW_NDBCLUSTER_STATUS, SQLCOM_SHOW_MUTEX_STATUS,
++  SQLCOM_SHOW_SPHINX_STATUS,
+   SQLCOM_SHOW_PROCESSLIST, SQLCOM_SHOW_MASTER_STAT, SQLCOM_SHOW_SLAVE_STAT,
+   SQLCOM_SHOW_GRANTS, SQLCOM_SHOW_CREATE, SQLCOM_SHOW_CHARSETS,
+   SQLCOM_SHOW_COLLATIONS, SQLCOM_SHOW_CREATE_DB, SQLCOM_SHOW_TABLE_STATUS,
+--- mysql-5.0.67/sql/sql_parse.cc	2008-08-04 23:20:10.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/sql_parse.cc	2009-02-14 09:15:48.000000000 +1000
+@@ -24,6 +24,9 @@
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+ #endif
++#ifdef HAVE_SPHINX_DB
++#include "sphinx/ha_sphinx.h"
++#endif
+ 
+ #ifdef HAVE_NDBCLUSTER_DB
+ #include "ha_ndbcluster.h"
+@@ -3006,6 +3009,15 @@
+       break;
+     }
+ #endif
++#ifdef HAVE_SPHINX_DB
++  case SQLCOM_SHOW_SPHINX_STATUS:
++    {
++      if (check_global_access(thd, SUPER_ACL))
++	goto error;
++      res = sphinx_show_status(thd);
++      break;
++    }
++#endif
+ #ifdef HAVE_REPLICATION
+   case SQLCOM_LOAD_MASTER_TABLE:
+   {
+--- mysql-5.0.67/sql/sql_yacc.yy	2008-08-04 23:20:12.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/sql_yacc.yy	2009-02-14 09:15:48.000000000 +1000
+@@ -7393,6 +7393,9 @@
+ 	    case DB_TYPE_INNODB:
+ 	      Lex->sql_command = SQLCOM_SHOW_INNODB_STATUS;
+ 	      break;
++	    case DB_TYPE_SPHINX_DB:
++	      Lex->sql_command = SQLCOM_SHOW_SPHINX_STATUS;
++	      break;
+ 	    default:
+ 	      my_error(ER_NOT_SUPPORTED_YET, MYF(0), "STATUS");
+ 	      MYSQL_YYABORT;
+--- mysql-5.0.67/sql/structs.h	2008-08-04 23:20:12.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/structs.h	2009-02-14 09:15:48.000000000 +1000
+@@ -188,6 +188,9 @@
+   SHOW_SSL_CTX_SESS_TIMEOUTS, SHOW_SSL_CTX_SESS_CACHE_FULL,
+   SHOW_SSL_GET_CIPHER_LIST,
+ #endif /* HAVE_OPENSSL */
++#ifdef HAVE_SPHINX_DB
++  SHOW_SPHINX_FUNC,
++#endif
+   SHOW_NET_COMPRESSION,
+   SHOW_RPL_STATUS, SHOW_SLAVE_RUNNING, SHOW_SLAVE_RETRIED_TRANS,
+   SHOW_KEY_CACHE_LONG, SHOW_KEY_CACHE_CONST_LONG, SHOW_KEY_CACHE_LONGLONG,
+--- mysql-5.0.67/sql/sql_show.cc	2008-08-04 23:20:11.000000000 +1100
++++ mysql-5.0.67-sphinx/sql/sql_show.cc	2009-02-14 09:15:48.000000000 +1000
+@@ -1473,6 +1473,16 @@
+           value=     (char*) ((sys_var*) value)->value_ptr(thd, value_type,
+                                                            &null_lex_str);
+         }
++		#ifdef HAVE_SPHINX_DB
++		else if (show_type == SHOW_SPHINX_FUNC)
++		{
++			SHOW_VAR var;
++			((int (*)(THD *, SHOW_VAR *, char *))value)(thd, &var, buff);
++
++			value = var.value;
++			show_type = var.type;
++		}
++		#endif /* HAVE_SPHINX_DB */
+ 
+         pos= end= buff;
+         switch (show_type) {
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
new file mode 100644
index 00000000000..789dbd5327e
--- /dev/null
+++ b/storage/xtradb/CMakeLists.txt
@@ -0,0 +1,256 @@
+# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+# 
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+
+# This is the CMakeLists for XtraDB
+
+INCLUDE(CheckFunctionExists)
+INCLUDE(CheckCSourceCompiles)
+INCLUDE(CheckCSourceRuns)
+
+# OS tests: per-platform UNIV_* defines; on Linux also probe for native AIO (libaio)
+IF(UNIX)
+  IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    CHECK_INCLUDE_FILES (libaio.h HAVE_LIBAIO_H)
+    CHECK_LIBRARY_EXISTS(aio io_queue_init "" HAVE_LIBAIO)
+    ADD_DEFINITIONS("-DUNIV_LINUX -D_GNU_SOURCE=1")
+    IF(HAVE_LIBAIO_H AND HAVE_LIBAIO)
+      ADD_DEFINITIONS(-DLINUX_NATIVE_AIO=1)
+      LINK_LIBRARIES(aio)
+    ENDIF()
+  ELSEIF(CMAKE_SYSTEM_NAME MATCHES "HP*")
+    ADD_DEFINITIONS("-DUNIV_HPUX -DUNIV_MUST_NOT_INLINE")
+  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "AIX")
+    ADD_DEFINITIONS("-DUNIV_AIX -DUNIV_MUST_NOT_INLINE")
+  ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+    ADD_DEFINITIONS("-DUNIV_SOLARIS")
+  ELSE()
+   ADD_DEFINITIONS("-DUNIV_MUST_NOT_INLINE")
+  ENDIF()
+ENDIF()
+
+# Enable InnoDB's UNIV_DEBUG if MySQL's WITH_DEBUG[_FULL] is defined
+# enable when this bug is resolved:
+# Bug#54861 Additional connections not handled properly in mtr --embedded
+#IF(WITH_DEBUG)
+#  ADD_DEFINITIONS("-DUNIV_DEBUG")
+#ENDIF()
+
+IF(NOT MSVC)
+# either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+IF(NOT CMAKE_CROSSCOMPILING)
+  CHECK_C_SOURCE_RUNS(
+  "
+  int main()
+  {
+    long	x;
+    long	y;
+    long	res;
+    char	c;
+
+    x = 10;
+    y = 123;
+    res = __sync_bool_compare_and_swap(&x, x, y);
+    if (!res || x != y) {
+      return(1);
+    }
+
+    x = 10;
+    y = 123;
+    res = __sync_bool_compare_and_swap(&x, x + 1, y);
+    if (res || x != 10) {
+      return(1);
+    }
+    x = 10;
+    y = 123;
+    res = __sync_add_and_fetch(&x, y);
+    if (res != 123 + 10 || x != 123 + 10) {
+      return(1);
+    }
+
+    c = 10;
+    res = __sync_lock_test_and_set(&c, 123);
+    if (res != 10 || c != 123) {
+      return(1);
+    }
+    return(0);
+  }"
+  HAVE_IB_GCC_ATOMIC_BUILTINS
+  )
+ENDIF()
+
+IF(HAVE_IB_GCC_ATOMIC_BUILTINS)
+ ADD_DEFINITIONS(-DHAVE_IB_GCC_ATOMIC_BUILTINS=1)
+ENDIF()
+
+ # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+IF(NOT CMAKE_CROSSCOMPILING)
+  CHECK_C_SOURCE_RUNS(
+  "
+  #include <pthread.h>
+  #include <string.h>
+
+  int main() {
+    pthread_t       x1;
+    pthread_t       x2;
+    pthread_t       x3;
+
+    memset(&x1, 0x0, sizeof(x1));
+    memset(&x2, 0x0, sizeof(x2));
+    memset(&x3, 0x0, sizeof(x3));
+
+    __sync_bool_compare_and_swap(&x1, x2, x3);
+
+    return(0);
+  }"
+  HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+ENDIF()
+IF(HAVE_IB_ATOMIC_PTHREAD_T_GCC)
+  ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_GCC=1)
+ENDIF()
+
+ENDIF(NOT MSVC)
+
+# Solaris atomics
+IF(CMAKE_SYSTEM_NAME STREQUAL "SunOS")
+  CHECK_FUNCTION_EXISTS(atomic_cas_ulong  HAVE_ATOMIC_CAS_ULONG)
+  CHECK_FUNCTION_EXISTS(atomic_cas_32 HAVE_ATOMIC_CAS_32)
+  CHECK_FUNCTION_EXISTS(atomic_cas_64 HAVE_ATOMIC_CAS_64)
+  CHECK_FUNCTION_EXISTS(atomic_add_long HAVE_ATOMIC_ADD_LONG)
+  IF(HAVE_ATOMIC_CAS_ULONG AND HAVE_ATOMIC_CAS_32 AND
+    HAVE_ATOMIC_CAS_64 AND HAVE_ATOMIC_ADD_LONG)
+    SET(HAVE_IB_SOLARIS_ATOMICS 1)
+  ENDIF()
+  
+  IF(HAVE_IB_SOLARIS_ATOMICS)
+    ADD_DEFINITIONS(-DHAVE_IB_SOLARIS_ATOMICS=1)
+  ENDIF()
+
+  IF(NOT CMAKE_CROSSCOMPILING)
+  # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+  CHECK_C_SOURCE_COMPILES(
+  "   #include <pthread.h>
+      #include <string.h>
+
+      int main(int argc, char** argv) {
+        pthread_t       x1;
+        pthread_t       x2;
+        pthread_t       x3;
+
+        memset(&x1, 0x0, sizeof(x1));
+        memset(&x2, 0x0, sizeof(x2));
+        memset(&x3, 0x0, sizeof(x3));
+
+        if (sizeof(pthread_t) == 4) {
+        
+          atomic_cas_32(&x1, x2, x3);
+        
+        } else if (sizeof(pthread_t) == 8) {
+        
+          atomic_cas_64(&x1, x2, x3);
+        
+        } else {
+        
+          return(1);
+        }
+
+      return(0);
+    }
+  " HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+  ENDIF()
+  IF(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS)
+    ADD_DEFINITIONS(-DHAVE_IB_ATOMIC_PTHREAD_T_SOLARIS=1)
+  ENDIF()
+ENDIF()
+
+
+IF(UNIX)
+# this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+# to use in the source
+SET(CMAKE_EXTRA_INCLUDE_FILES pthread.h)
+CHECK_TYPE_SIZE(pthread_t SIZEOF_PTHREAD_T)
+SET(CMAKE_EXTRA_INCLUDE_FILES)
+ENDIF()
+
+IF(SIZEOF_PTHREAD_T)
+  ADD_DEFINITIONS(-DSIZEOF_PTHREAD_T=${SIZEOF_PTHREAD_T})
+ENDIF()
+
+IF(MSVC)
+  ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION)
+ENDIF()
+
+
+# Include directories under xtradb (this engine's own headers, not storage/innobase)
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/storage/xtradb/include
+		    ${CMAKE_SOURCE_DIR}/storage/xtradb/handler)
+
+# Sun Studio bug with -xO2
+IF(CMAKE_C_COMPILER_ID MATCHES "SunPro" 
+	AND CMAKE_C_FLAGS_RELEASE MATCHES "O2" 
+	AND NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
+	# Sun Studio 12 crashes with -xO2 flag, but not with higher optimization
+	# -xO3
+	SET_SOURCE_FILES_PROPERTIES(${CMAKE_CURRENT_SOURCE_DIR}/rem/rem0rec.c 
+    PROPERTIES COMPILE_FLAGS -xO3)
+ENDIF()
+
+# Removing compiler optimizations for innodb/mem/* files on 64-bit Windows
+# due to 64-bit compiler error, See MySQL Bug #19424, #36366, #34297
+IF (MSVC AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+	SET_SOURCE_FILES_PROPERTIES(mem/mem0mem.c mem/mem0pool.c
+				    PROPERTIES COMPILE_FLAGS -Od)
+ENDIF()
+# _WIN32 is predefined by Windows compilers; never force it on other platforms
+ADD_DEFINITIONS(-D_LIB -DMYSQL_SERVER)
+
+SET(XTRADB_SOURCES	btr/btr0btr.c btr/btr0cur.c btr/btr0pcur.c btr/btr0sea.c
+			buf/buf0buddy.c buf/buf0buf.c buf/buf0flu.c buf/buf0lru.c buf/buf0rea.c
+			data/data0data.c data/data0type.c
+			dict/dict0boot.c dict/dict0crea.c dict/dict0dict.c dict/dict0load.c dict/dict0mem.c
+			dyn/dyn0dyn.c
+			eval/eval0eval.c eval/eval0proc.c
+			fil/fil0fil.c
+			fsp/fsp0fsp.c
+			fut/fut0fut.c fut/fut0lst.c
+			ha/ha0ha.c ha/hash0hash.c ha/ha0storage.c
+			ibuf/ibuf0ibuf.c
+			pars/lexyy.c pars/pars0grm.c pars/pars0opt.c pars/pars0pars.c pars/pars0sym.c
+			lock/lock0lock.c lock/lock0iter.c
+			log/log0log.c log/log0recv.c
+			mach/mach0data.c
+			mem/mem0mem.c mem/mem0pool.c
+			mtr/mtr0log.c mtr/mtr0mtr.c
+			os/os0file.c os/os0proc.c os/os0sync.c os/os0thread.c
+			page/page0cur.c page/page0page.c page/page0zip.c
+			que/que0que.c
+			handler/ha_innodb.cc handler/handler0alter.cc handler/i_s.cc handler/mysql_addons.cc
+			read/read0read.c
+			rem/rem0cmp.c rem/rem0rec.c
+			row/row0ext.c row/row0ins.c row/row0merge.c row/row0mysql.c row/row0purge.c row/row0row.c
+			row/row0sel.c row/row0uins.c row/row0umod.c row/row0undo.c row/row0upd.c row/row0vers.c
+			srv/srv0que.c srv/srv0srv.c srv/srv0start.c
+			sync/sync0arr.c sync/sync0rw.c sync/sync0sync.c
+			thr/thr0loc.c
+			trx/trx0i_s.c trx/trx0purge.c trx/trx0rec.c trx/trx0roll.c trx/trx0rseg.c
+			trx/trx0sys.c trx/trx0trx.c trx/trx0undo.c
+			usr/usr0sess.c
+			ut/ut0byte.c ut/ut0dbg.c ut/ut0mem.c ut/ut0rbt.c ut/ut0rnd.c ut/ut0ut.c ut/ut0vec.c
+			ut/ut0list.c ut/ut0wqueue.c)
+
+# New plugin support, cross-platform; base name for the shared module is "ha_xtradb"
+MYSQL_ADD_PLUGIN(innobase ${XTRADB_SOURCES} STORAGE_ENGINE 
+	DEFAULT
+	MODULE_OUTPUT_NAME ha_xtradb
+	LINK_LIBRARIES ${ZLIB_LIBRARY})
diff --git a/storage/xtradb/COPYING.Google b/storage/xtradb/COPYING.Google
new file mode 100644
index 00000000000..5ade2b0e381
--- /dev/null
+++ b/storage/xtradb/COPYING.Google
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Google, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, Google Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+      * Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above
+        copyright notice, this list of conditions and the following
+        disclaimer in the documentation and/or other materials
+        provided with the distribution.
+      * Neither the name of the Google Inc. nor the names of its
+        contributors may be used to endorse or promote products
+        derived from this software without specific prior written
+        permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/COPYING.Percona b/storage/xtradb/COPYING.Percona
new file mode 100644
index 00000000000..8c786811719
--- /dev/null
+++ b/storage/xtradb/COPYING.Percona
@@ -0,0 +1,30 @@
+Portions of this software contain modifications contributed by Percona, Inc.
+These contributions are used with the following license:
+
+Copyright (c) 2008, 2009, Percona Inc. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+      * Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above
+        copyright notice, this list of conditions and the following
+        disclaimer in the documentation and/or other materials
+        provided with the distribution.
+      * Neither the name of the Percona Inc. nor the names of its
+        contributors may be used to endorse or promote products
+        derived from this software without specific prior written
+        permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/COPYING.Sun_Microsystems b/storage/xtradb/COPYING.Sun_Microsystems
new file mode 100644
index 00000000000..5a77ef3ab73
--- /dev/null
+++ b/storage/xtradb/COPYING.Sun_Microsystems
@@ -0,0 +1,31 @@
+Portions of this software contain modifications contributed by
+Sun Microsystems, Inc. These contributions are used with the following
+license:
+
+Copyright (c) 2009, Sun Microsystems, Inc.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+      * Redistributions of source code must retain the above copyright
+        notice, this list of conditions and the following disclaimer.
+      * Redistributions in binary form must reproduce the above
+        copyright notice, this list of conditions and the following
+        disclaimer in the documentation and/or other materials
+        provided with the distribution.
+      * Neither the name of Sun Microsystems, Inc. nor the names of its
+        contributors may be used to endorse or promote products derived
+        from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/storage/xtradb/ChangeLog b/storage/xtradb/ChangeLog
new file mode 100644
index 00000000000..43f87a1baf5
--- /dev/null
+++ b/storage/xtradb/ChangeLog
@@ -0,0 +1,1916 @@
+2010-08-24	The InnoDB Team
+
+	* handler/ha_innodb.cc, dict/dict0dict.c:
+	Fix Bug #55832 selects crash too easily when innodb_force_recovery>3
+
+2010-08-03	The InnoDB Team
+
+	* include/dict0dict.h, include/dict0dict.ic, row/row0mysql.c:
+	Fix bug #54678, InnoDB, TRUNCATE, ALTER, I_S SELECT, crash or deadlock
+
+2010-08-03	The InnoDB Team
+
+	* dict/dict0load.c, handler/ha_innodb.cc, include/db0err.h,
+	include/dict0load.h, include/dict0mem.h, include/que0que.h,
+	row/row0merge.c, row/row0mysql.c:
+	Fix Bug#54582 stack overflow when opening many tables linked
+	with foreign keys at once
+
+2010-08-03	The InnoDB Team
+
+	* include/ut0mem.h, ut/ut0mem.c:
+	Fix Bug #55627 segv in ut_free pars_lexer_close innobase_shutdown
+	innodb-use-sys-malloc=0
+
+2010-08-01	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug #55382 Assignment with SELECT expressions takes unexpected
+	S locks in READ COMMITTED
+
+
+2010-07-27	The InnoDB Team
+
+	* include/mem0pool.h, mem/mem0mem.c, mem/mem0pool.c, srv/srv0start.c:
+	Fix Bug#55581 shutdown with innodb-use-sys-malloc=0: assert
+	mutex->magic_n == MUTEX_MAGIC_N.
+
+2010-06-30	The InnoDB Team
+
+	* btr/btr0sea.c, ha/ha0ha.c, handler/ha_innodb.cc, include/btr0sea.h:
+	Fix Bug#54311 Crash on CHECK PARTITION after concurrent LOAD DATA
+	and adaptive_hash_index=OFF
+
+2010-06-29	The InnoDB Team
+	* row/row0row.c, row/row0undo.c, row/row0upd.c:
+	Fix Bug#54408 txn rollback after recovery: row0umod.c:673
+	dict_table_get_format(index->table)
+
+2010-06-29	The InnoDB Team
+
+	* btr/btr0cur.c, include/btr0cur.h,
+	include/row0mysql.h, row/row0merge.c, row/row0sel.c:
+	Fix Bug#54358 READ UNCOMMITTED access failure of off-page DYNAMIC
+	or COMPRESSED columns
+
+2010-06-24	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#54679 alter table causes compressed row_format to revert
+	to compact
+
+2010-06-22	The InnoDB Team
+
+	* dict/dict0dict.c, dict/dict0mem.c, include/dict0mem.h,
+	include/univ.i, page/page0zip.c, row/row0merge.c:
+	Fix Bug#47991 InnoDB Dictionary Cache memory usage increases
+	indefinitely when renaming tables
+
+2010-06-22	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#54686: "field->col->mtype == type" assertion error at
+	row/row0sel.c
+
+2010-06-22	The InnoDB Team
+
+	* handler/ha_innodb.cc, innodb_bug54044.result, innodb_bug54044.test:
+	Fix Bug#54044 Create temporary tables and using innodb crashes.
+
+2010-06-21	The InnoDB Team
+
+	* dict/dict0load.c, fil/fil0fil.c:
+	Fix Bug#54658: InnoDB: Warning: allocated tablespace %lu,
+	old maximum was 0 (introduced in Bug #53578 fix)
+
+2010-06-16	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix Bug#54330 Broken fast index creation
+
+2010-06-10	The InnoDB Team
+
+	* include/log0log.ic, row/row0ins.c, row/row0purge.c,
+	row/row0uins.c, row/row0umod.c, row/row0upd.c:
+	Fix Bug#39168 ERROR: the age of the last checkpoint ... exceeds
+	the log group capacity
+
+2010-06-08	The InnoDB Team
+
+	* dict/dict0load.c:
+	Fix Bug#54009 Server crashes when data is selected from non backed
+	up table for InnoDB plugin
+
+2010-06-02	The InnoDB Team
+
+	* include/db0err.h, include/lock0lock.h, include/row0mysql.h,
+	lock/lock0lock.c, row/row0ins.c, row/row0mysql.c, row/row0sel.c:
+	Fix Bug#53674 InnoDB: Error: unlock row could not find a
+	4 mode lock on the record
+
+2010-06-01	The InnoDB Team
+
+	* include/sync0rw.h, sync/sync0rw.c:
+	Fix Bug#48197 Concurrent rw_lock_free may cause assertion failure
+
+2010-06-01	The InnoDB Team
+
+	* row/row0umod.c:
+	Fix Bug#53812 assert row/row0umod.c line 660 in txn rollback
+	after crash recovery
+
+2010-05-25	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+	Fix Bug#53592: crash replacing duplicates into table after fast
+	alter table added unique key 
+
+2010-05-24	The InnoDB Team
+
+	* dict/dict0boot.c, dict/dict0crea.c, fil/fil0fil.c,
+	include/dict0boot.h, include/fil0fil.h, row/row0mysql.c:
+	Fix Bug#53578: assert on invalid page access, in fil_io()
+
+2010-05-14	The InnoDB Team
+	* mysql-test/innodb_bug48024.test, mysql-test/innodb_bug48024.result,
+	dict/dict0dict.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+	include/dict0dict.h, include/ha_prototypes.h, include/row0mysql.h,
+	include/trx0trx.h, row/row0mysql.c, trx/trx0i_s.c, trx/trx0trx.c:
+	Fix Bug#48024 Innodb doesn't work with multi-statements
+	Fix Bug#53644 InnoDB thinks that /*/ starts and ends a comment
+
+2010-05-12	The InnoDB Team
+
+	* handler/handler0alter.cc:
+	Fix Bug#53591 crash with fast alter table and text/blob prefix
+	primary key
+
+2010-05-12	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix Bug#53471 row_merge_drop_temp_indexes() refers freed memory, SEGVs
+
+2010-05-11	The InnoDB Team
+
+	* mysql-test/innodb_bug53290.test, mysql-test/innodb_bug53290.result,
+	include/rem0cmp.h, rem/rem0cmp.c, row/row0merge.c:
+	Fix Bug#53290 wrong duplicate key error when adding a unique index
+	via fast alter table
+
+2010-05-11	The InnoDB Team
+	* buf/buf0lru.c, include/buf0buf.ic:
+	Fix Bug#53307 valgrind: warnings in main.partition_innodb_plugin
+
+2010-05-05	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix Bug#53256 in a stress test, assert dict/dict0dict.c:815
+	table2 == NULL
+
+2010-05-05	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#53165 Setting innodb_change_buffering=DEFAULT produces
+	incorrect result
+
+2010-05-04	The InnoDB Team
+
+	* fsp/fsp0fsp.c:
+	Fix Bug#53306 valgrind: warnings in innodb.innodb
+
+2010-05-03	The InnoDB Team
+
+	* buf/buf0buf.c:
+	Fix Bug#53248 compressed tables page checksum mismatch after
+	re-enabling innodb_checksums
+
+2010-04-28	The InnoDB Team
+
+	* include/log0recv.h, log/log0recv.c:
+	Fix Bug#53122 InnoDB recovery uses too big a hash table for redo
+	log records
+
+2010-04-27	The InnoDB Team
+
+	* handler/ha_innodb.cc, lock/lock0lock.c, row/row0mysql.c,
+	row/row0sel.c:
+	Fix Bug#48607 READ UNCOMMITTED uses more locks than READ COMMITTED
+	in InnoDB 5.1+
+
+2010-04-26	The InnoDB Team
+
+	* row/row0sel.c:
+	Fix Bug#52663 Lost update incrementing column value under
+	READ COMMITTED isolation level
+
+2010-04-22	The InnoDB Team
+
+	* include/dict0boot.h, dict/dict0boot.c:
+	Fix a bug that prevented the crash recovery of fast CREATE INDEX
+	from dropping partially created indexes.
+
+2010-04-21	The InnoDB Team
+
+	* btr/btr0btr.c:
+	Fix Bug#52964 Infinite loop in btr_page_split_and_insert()
+	in ROW_FORMAT=COMPRESSED
+
+2010-04-21	The InnoDB Team
+
+	* data/data0data.c:
+	Fix Bug#52745 Failing assertion: blob_no < page_zip->n_blobs
+
+2010-04-20	The InnoDB Team
+
+	* dict/dict0crea.c, handler/ha_innodb.cc, include/trx0trx.h:
+	Fix Bug#50495 'Row size too large' for plugin, but works for
+	built-in InnoDB
+	Only check the record size at index creation time when
+	innodb_strict_mode is set or when ROW_FORMAT is DYNAMIC or COMPRESSED.
+
+2010-04-15	The InnoDB Team
+
+	* trx/trx0rec.c:
+	Fix Bug#52746 InnoDB purge thread crashed with table containing
+	prefix indexed blobs
+
+2010-03-31	The InnoDB Team
+
+	* mysql-test/innodb_bug51920.test, mysql-test/innodb_bug51920.result,
+	srv/srv0srv.c:
+	Fix Bug#51920 InnoDB connections in row lock wait ignore KILL
+	until lock wait timeout
+
+2010-03-31	The InnoDB Team
+
+	* mysql-test/innodb_bug38231.test:
+	Remove non-determinism in the test case.
+
+2010-03-29	The InnoDB Team
+
+	InnoDB Plugin 1.0.7 released
+
+2010-03-18	The InnoDB Team
+
+	* CMakeLists.txt:
+	Fix Bug#52102 InnoDB Plugin shows performance drop compared to
+	InnoDB (Windows)
+
+2010-03-18	The InnoDB Team
+
+	* include/buf0buf.ic:
+	When comparing the time of the first access to a block against
+	innodb_old_blocks_time, use 32-bit arithmetic. The comparison was
+	incorrect on 64-bit systems.
+
+2010-03-11	The InnoDB Team
+
+	* include/buf0buf.h, include/buf0buf.ic:
+	Fix and clarify the latching of some buf_block_t members.
+	Note that check_index_page_at_flush is not protected by any mutex.
+	Note and assert that lock_hash_val is protected by the rw-latch.
+
+2010-03-10	The InnoDB Team
+
+	* trx/trx0sys.c:
+	Fix Bug#51653 outdated reference to set-variable
+
+2010-03-10	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+	mysql-test/innodb_bug47621.result, mysql-test/innodb_bug47621.test:
+	Fix Bug#47621 MySQL and InnoDB data dictionaries will become out of
+	sync when renaming columns
+
+2010-03-10	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#51356 Many Valgrind errors in error messages
+	with concurrent DDL
+
+2010-03-10	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/handler0alter.cc,
+	mysql-test/innodb_bug51378.result, mysql-test/innodb_bug51378.test:
+	Fix Bug#51378 Init 'ref_length' to correct value, in case an out
+	of bound MySQL primary_key
+
+2010-03-10	The InnoDB Team
+
+	* log/log0recv.c:
+	Remove a bogus assertion about page numbers exceeding 0x90000000
+	in the redo log. Abort when encountering a corrupted redo log
+	record, unless innodb_force_recovery is set.
+
+2010-03-09	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Make SHOW ENGINE INNODB MUTEX STATUS display SUM(os_waits)
+	for the buffer pool block mutexes and locks.
+
+2010-03-08	The InnoDB Team
+
+	* fil/fil0fil.c:
+	Fix ALTER TABLE ... IMPORT TABLESPACE of compressed tables.
+
+2010-03-03	The InnoDB Team
+
+	* handler/handler0alter.cc, innodb-index.result, innodb-index.test,
+	innodb.result, innodb.test:
+	Disallow a duplicate index name when creating an index.
+
+2010-02-11	The InnoDB Team
+
+	* include/mem0mem.h, include/mem0mem.ic, mem/mem0mem.c:
+	Fix Bug#49535 Available memory check slows down crash
+	recovery tens of times
+
+2010-02-09	The InnoDB Team
+
+	* buf/buf0buf.c:
+	Fix Bug#38901 InnoDB logs error repeatedly when trying to load
+	page into buffer pool
+
+2010-02-09	The InnoDB Team
+
+	* srv/srv0srv.c:
+	Let the master thread sleep if the amount of work to be done is
+	calibrated as taking less than a second.
+
+2010-02-04	The InnoDB Team
+
+	* btr/btr0btr.c, btr/btr0cur.c, btr/btr0pcur.c, buf/buf0buf.c,
+	include/btr0btr.h, include/btr0cur.h, include/btr0pcur.h,
+	include/btr0pcur.ic, include/buf0buf.h, row/row0ins.c, row/row0sel.c:
+	Pass the file name and line number of the caller of the
+	b-tree cursor functions to the buffer pool requests, in order
+	to make the latch diagnostics more accurate.
+
+2010-02-03	The InnoDB Team
+
+	* lock/lock0lock.c:
+	Fix Bug#49001 SHOW INNODB STATUS deadlock info incorrect
+	when deadlock detection aborts
+
+2010-02-03	The InnoDB Team
+
+	* buf/buf0lru.c:
+	Fix Bug#35077 Very slow DROP TABLE (ALTER TABLE, OPTIMIZE TABLE)
+	on compressed tables
+
+2010-02-03	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+	Clean up CHECK TABLE error handling.
+
+2010-02-01	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.test,
+	mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc-44030.test,
+	mysql-test/innodb-autoinc-44030.result:
+	Fix Bug#49497 Error 1467 (ER_AUTOINC_READ_FAILED) on inserting
+	a negative value
+
+2010-01-28	The InnoDB Team
+	* handler/ha_innodb.h, handler/ha_innodb.cc,
+	handler/handler0alter.cc,
+	mysql-test/innodb_bug47622.test,
+	mysql-test/innodb_bug47622.result:
+	Fix Bug#47622 the new index is added before the existing ones
+	in MySQL, but after one in SE
+
+2010-01-27	The InnoDB Team
+
+	* include/row0mysql.h, log/log0recv.c, row/row0mysql.c:
+	Drop temporary tables at startup.
+	This addresses the third aspect of
+	Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+
+2010-01-21	The InnoDB Team
+
+	* buf/buf0buf.c:
+	Do not merge buffered inserts to compressed pages before
+	the redo log has been applied in crash recovery.
+
+2010-01-13	The InnoDB Team
+
+	* row/row0sel.c:
+	On the READ UNCOMMITTED isolation level, do not attempt to access
+	a clustered index record that has been marked for deletion. The
+	built-in InnoDB in MySQL 5.1 and earlier would attempt to retrieve
+	a previous version of the record in this case.
+
+2010-01-13	The InnoDB Team
+
+	* buf/buf0buf.c:
+	When disabling the adaptive hash index, check the block state
+	before checking block->is_hashed, because the latter may be
+	uninitialized right after server startup.
+
+2010-01-12	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/ha_innodb.h:
+	Fix Bug#46193 crash when accessing tables after enabling
+	innodb_force_recovery option
+
+2010-01-12	The InnoDB Team
+
+	* row/row0mysql.c:
+	Fix Bug#49238 Creating/Dropping a temporary table while at 1023
+	transactions will cause assert.
+
+2009-12-02	The InnoDB Team
+
+	* srv/srv0start.c:
+	Display the zlib version number at startup.
+	InnoDB compressed tables use zlib, and the implementation depends
+	on the zlib function compressBound(), whose definition was slightly
+	changed in zlib version 1.2.3.1 in 2006. MySQL bundles zlib 1.2.3
+	from 2005, but some installations use a more recent zlib.
+
+2009-11-30	The InnoDB Team
+
+	* dict/dict0crea.c, dict/dict0mem.c, dict/dict0load.c,
+	dict/dict0boot.c, fil/fil0fil.c, handler/ha_innodb.cc,
+	include/dict0mem.h, row/row0mysql.c:
+	Fix the bogus warning messages for non-existing temporary
+	tables that were reported in
+	Bug#41609 Crash recovery does not work for InnoDB temporary tables.
+	The actual crash recovery bug was corrected on 2009-04-29.
+
+2009-11-27	The InnoDB Team
+
+	InnoDB Plugin 1.0.6 released
+
+2009-11-20	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Add a workaround to prevent a crash due to Bug#45961 DDL on
+	partitioned innodb tables leaves data dictionary in an inconsistent
+	state
+
+2009-11-19	The InnoDB Team
+
+	* btr/btr0btr.c:
+	Fix Bug#48469 when innodb tablespace is configured too small, crash
+	and corruption!
+
+2009-11-19	The InnoDB Team
+
+	* data/data0type.c:
+	Fix Bug#48526 Data type for float and double is incorrectly reported
+	in InnoDB table monitor
+
+2009-11-19	The InnoDB Team
+
+	* CMakeLists.txt:
+	Fix Bug#48317 cannot build innodb as static library
+
+2009-11-18	The InnoDB Team
+
+	* handler/handler0alter.cc:
+	Fix Bug#48782 On lock wait timeout, CREATE INDEX (creating primary key)
+	attempts DROP TABLE
+
+2009-11-17	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb.result,
+	mysql-test/innodb.test, mysql-test/innodb_bug44369.result,
+	mysql-test/innodb_bug44369.test, mysql-test/patches/innodb-index.diff,
+	row/row0mysql.c:
+	Report duplicate table names to the client connection, not to the
+	error log.
+
+2009-11-12	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/db0err.h, row/row0merge.c,
+	row/row0mysql.c:
+	Allow CREATE INDEX to be interrupted.
+	Also, when CHECK TABLE is interrupted, report ER_QUERY_INTERRUPTED.
+
+2009-11-11	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug47167.result,
+	mysql-test/innodb_bug47167.test, mysql-test/innodb_file_format.result:
+	Fix Bug#47167 "set global innodb_file_format_check" cannot set value
+	by User-Defined Variable
+
+2009-11-11	The InnoDB Team
+
+	* include/os0file.h, os/os0file.c:
+	Fix Bug#3139 Mysql crashes: 'windows error 995' after several selects
+	on a large DB
+
+2009-11-04	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#32430 'show innodb status' causes errors
+	Invalid (old?) table or database name in logs
+
+2009-11-02	The InnoDB Team
+
+	* btr/btr0sea.c, buf/buf0buf.c, dict/dict0dict.c, fil/fil0fil.c,
+	ibuf/ibuf0ibuf.c, include/btr0sea.h, include/dict0dict.h,
+	include/fil0fil.h, include/ibuf0ibuf.h, include/lock0lock.h,
+	include/log0log.h, include/log0recv.h, include/mem0mem.h,
+	include/mem0pool.h, include/os0file.h, include/pars0pars.h,
+	include/srv0srv.h, include/thr0loc.h, include/trx0i_s.h,
+	include/trx0purge.h, include/trx0rseg.h, include/trx0sys.h,
+	include/trx0undo.h, include/usr0sess.h, lock/lock0lock.c,
+	log/log0log.c, log/log0recv.c, mem/mem0dbg.c, mem/mem0pool.c,
+	os/os0file.c, os/os0sync.c, os/os0thread.c, pars/lexyy.c,
+	pars/pars0lex.l, que/que0que.c, srv/srv0srv.c, srv/srv0start.c,
+	sync/sync0arr.c, sync/sync0sync.c, thr/thr0loc.c, trx/trx0i_s.c,
+	trx/trx0purge.c, trx/trx0rseg.c, trx/trx0sys.c, trx/trx0undo.c,
+	usr/usr0sess.c, ut/ut0mem.c:
+	Fix Bug#45992 innodb memory not freed after shutdown
+	Fix Bug#46656 InnoDB plugin: memory leaks (Valgrind)
+
+2009-10-29	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#47125 auto_increment start value is ignored if an index is
+	created and engine=innodb
+
+2009-10-29	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug47777.result,
+	mysql-test/innodb_bug47777.test:
+	Fix Bug#47777 innodb dies with spatial pk: Failing assertion: buf <=
+	original_buf + buf_len
+
+2009-10-29	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#38996 Race condition in ANALYZE TABLE
+
+2009-10-29	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix bug#42383: Can't create table 'test.bug39438'
+
+2009-10-29	The InnoDB Team
+
+	* os/os0proc.c:
+	Fix Bug#48237 Error handling in os_mem_alloc_large appears to
+	be incorrect
+
+2009-10-29	The InnoDB Team
+
+	* buf/buf0buf.c, buf/buf0lru.c, include/buf0buf.h, include/buf0buf.ic:
+	Fix corruption of the buf_pool->LRU_old list and improve debug
+	assertions.
+
+2009-10-28	The InnoDB Team
+
+	* srv/srv0start.c:
+	Fix Bug#41490 After enlargement of InnoDB page size, the error message
+	become inaccurate
+
+2009-10-26	The InnoDB Team
+
+	* row/row0ins.c:
+	When allocating a data tuple, zero out the system fields in order
+	to avoid Valgrind warnings about uninitialized fields in
+	dtuple_validate().
+
+2009-10-22	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-zip.result,
+	mysql-test/innodb-zip.test, mysql-test/innodb_bug44369.result,
+	mysql-test/innodb_bug44369.test:
+	Fix Bug#47233 Innodb calls push_warning(MYSQL_ERROR::WARN_LEVEL_ERROR)
+
+2009-10-19	The InnoDB Team
+
+	* mysql-test/innodb_information_schema.test:
+	Fix Bug#47808 innodb_information_schema.test fails when run under
+	valgrind
+
+2009-10-15	The InnoDB Team
+
+	* include/page0page.ic:
+	Fix Bug#47058 Failure to compile innodb_plugin on solaris 10u7 + spro
+	cc/CC 5.10
+
+2009-10-13	The InnoDB Team
+
+	* buf/buf0flu.c:
+	Call fsync() on datafiles after a batch of pages is written to disk
+	even when skip_innodb_doublewrite is set.
+
+2009-10-05	The InnoDB Team
+
+	* buf/buf0buf.c:
+	Do not invalidate buffer pool while an LRU batch is active. Added code
+	to buf_pool_invalidate() to wait for the running batches to finish.
+
+2009-10-01	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#47763 typo in error message: Failed to open table %s after %lu
+	attemtps.
+
+2009-10-01	The InnoDB Team
+
+	* fsp/fsp0fsp.c, row/row0merge.c:
+	Clean up after a crash during DROP INDEX. When InnoDB crashes
+	while dropping an index, ensure that the index will be completely
+	dropped during crash recovery. The MySQL .frm file may still
+	contain the dropped index, but there is little that we can do
+	about it.
+
+2009-09-28	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	When a secondary index exists in the MySQL .frm file but not in
+	the InnoDB data dictionary, return an error instead of letting an
+	assertion fail in index_read.
+
+2009-09-28	The InnoDB Team
+
+	* btr/btr0btr.c, buf/buf0buf.c, include/page0page.h,
+	include/page0zip.h, page/page0cur.c, page/page0page.c,
+	page/page0zip.c:
+	Do not write to PAGE_INDEX_ID when restoring an uncompressed page
+	after a compression failure. The field should only be written
+	when creating a B-tree page. This fix addresses a race condition
+	in a debug assertion.
+
+2009-09-28	The InnoDB Team
+
+	* fil/fil0fil.c:
+	Try to prevent the reuse of tablespace identifiers after InnoDB
+	has crashed during table creation. Also, refuse to start if files
+	with duplicate tablespace identifiers are encountered.
+
+2009-09-25	The InnoDB Team
+
+	* include/os0file.h, os/os0file.c:
+	Fix Bug#47055 unconditional exit(1) on ERROR_WORKING_SET_QUOTA
+	1453 (0x5AD) for InnoDB backend
+
+2009-09-19	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-consistent-master.opt,
+	mysql-test/innodb-consistent.result,
+	mysql-test/innodb-consistent.test:
+	Fix Bug#37232 Innodb might get too many read locks for DML with
+	repeatable-read
+
+2009-09-19	The InnoDB Team
+
+	* fsp/fsp0fsp.c:
+	Fix Bug#31183 Tablespace full problems not reported in error log,
+	error message unclear
+
+2009-09-17	The InnoDB Team
+
+	* mysql-test/innodb-zip.result, mysql-test/innodb-zip.test:
+	Make the test pass with zlib 1.2.3.3. Apparently, the definition
+	of compressBound() has changed between zlib versions, and the
+	maximum record size of a table with 1K compressed page size has
+	been reduced by one byte. This is an arbitrary test. In practical
+	applications, for good write performance, the compressed page size
+	should be chosen to be bigger than the absolute minimum.
+
+2009-09-16	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#46256 drop table with unknown collation crashes innodb
+
+2009-09-16	The InnoDB Team
+
+	* dict/dict0dict.c, handler/ha_innodb.cc,
+	mysql-test/innodb_bug44369.result, mysql-test/innodb_bug44369.test,
+	row/row0mysql.c:
+	Fix Bug#44369 InnoDB: Does not uniformly disallow disallowed column
+	names
+
+2009-09-16	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/db0err.h,
+	mysql-test/innodb_bug46000.result, mysql-test/innodb_bug46000.test:
+	Fix Bug#46000 using index called GEN_CLUST_INDEX crashes server
+
+2009-09-02	The InnoDB Team
+
+	* include/lock0lock.h, include/row0mysql.h, lock/lock0lock.c,
+	row/row0mysql.c:
+	Fix a regression introduced by the fix for MySQL bug#26316. We check
+	whether a transaction holds any AUTOINC locks before we acquire
+	the kernel mutex and release those locks.
+
+2009-08-27	The InnoDB Team
+
+	* dict/dict0dict.c, include/dict0dict.h,
+	mysql-test/innodb_bug44571.result, mysql-test/innodb_bug44571.test:
+	Fix Bug#44571 InnoDB Plugin crashes on ADD INDEX
+
+2009-08-27	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix a bug in the merge sort that can corrupt indexes in fast index
+	creation. Add some consistency checks. Check that the number of
+	records remains constant in every merge sort pass.
+
+2009-08-27	The InnoDB Team
+
+	* buf/buf0buf.c, buf/buf0lru.c, buf/buf0rea.c, handler/ha_innodb.cc,
+	include/buf0buf.h, include/buf0buf.ic, include/buf0lru.h,
+	include/ut0ut.h, ut/ut0ut.c:
+	Make it possible to tune the buffer pool LRU eviction policy to be
+	more resistant against index scans. Introduce the settable global
+	variables innodb_old_blocks_pct and innodb_old_blocks_time for
+	controlling the buffer pool eviction policy. The parameter
+	innodb_old_blocks_pct (5..95) controls the desired amount of "old"
+	blocks in the LRU list. The default is 37, corresponding to the
+	old fixed ratio of 3/8. Each time a block is accessed, it will be
+	moved to the "new" blocks if its first access was at least
+	innodb_old_blocks_time milliseconds ago (default 0, meaning every
+	block). The idea is that in index scans, blocks will be accessed
+	a few times within innodb_old_blocks_time, and they will remain in
+	the "old" section of the LRU list. Thus, when innodb_old_blocks_time
+	is nonzero, blocks retrieved for one-time index scans will be more
+	likely candidates for eviction than blocks that are accessed in
+	random patterns.
+
+2009-08-26	The InnoDB Team
+
+	* handler/ha_innodb.cc, os/os0file.c:
+	Fix Bug#42885 buf_read_ahead_random, buf_read_ahead_linear counters,
+	thread wakeups
+
+2009-08-20	The InnoDB Team
+
+	* lock/lock0lock.c:
+	Fix Bug#46650 Innodb assertion autoinc_lock == lock in
+	lock_table_remove_low on INSERT SELECT
+
+2009-08-13	The InnoDB Team
+
+	* handler/handler0alter.cc:
+	Fix Bug#46657 InnoDB plugin: invalid read in index_merge_innodb test
+	(Valgrind)
+
+2009-08-11	The InnoDB Team
+
+	InnoDB Plugin 1.0.4 released
+
+2009-07-20	The InnoDB Team
+
+	* buf/buf0rea.c, handler/ha_innodb.cc, include/srv0srv.h,
+	srv/srv0srv.c:
+	Change the read ahead parameter name to innodb_read_ahead_threshold.
+	Change the meaning of this parameter to signify the number of pages
+	that must be sequentially accessed for InnoDB to trigger a readahead
+	request.
+
+2009-07-20	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#39802 On Windows, 32-bit time_t should be enforced
+
+2009-07-16	The InnoDB Team
+
+	* include/univ.i:
+	Support inlining of functions and prefetch with Sun Studio.
+	These changes are based on contribution from Sun Microsystems Inc.
+	under a BSD license.
+
+2009-07-14	The InnoDB Team
+
+	* fil/fil0fil.c:
+	Fix Bug#45814 URL reference in InnoDB server errors needs adjusting to
+	match documentation
+
+2009-07-14	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug21704.result,
+	mysql-test/innodb_bug21704.test:
+	Fix Bug#21704 Renaming column does not update FK definition
+
+2009-07-10	The InnoDB Team
+
+	* handler/ha_innodb.cc, srv/srv0srv.c:
+	Change the defaults for
+	innodb_sync_spin_loops: 20 -> 30
+	innodb_spin_wait_delay: 5 -> 6
+
+2009-07-08	The InnoDB Team
+
+	* buf/buf0flu.c, handler/ha_innodb.cc, include/buf0flu.h,
+	include/log0log.h, include/log0log.ic, include/srv0srv.h,
+	srv/srv0srv.c:
+	Implement the adaptive flushing of dirty pages, which uses
+	a heuristics-based flushing rate of dirty pages to avoid IO
+	bursts at checkpoint. Expose a new configuration knob,
+	innodb_adaptive_flushing, to control whether the new flushing
+	algorithm should be used.
+
+2009-07-07	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h, log/log0log.c,
+	srv/srv0srv.c:
+	Implement IO capacity tuning. Expose a new configuration knob,
+	innodb_io_capacity, to control the master thread's IO rate. The
+	ibuf merge is also changed from synchronous to asynchronous.
+	These changes are based on contribution from Google Inc.
+	under a BSD license.
+
+2009-07-02	The InnoDB Team
+
+	* include/ut0ut.h, plug.in, ut/ut0ut.c:
+	Use the PAUSE instruction inside the spinloop if it is available.
+	Thanks to Mikael Ronstrom <mikael@mysql.com>.
+
+2009-06-29	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_file_format.test,
+	mysql-test/innodb_file_format.result:
+	Do not crash on SET GLOBAL innodb_file_format=DEFAULT
+	or SET GLOBAL innodb_file_format_check=DEFAULT.
+
+2009-06-29	The InnoDB Team
+
+	* buf/buf0buf.c, buf/buf0rea.c, lock/lock0lock.c:
+	Tolerate missing tablespaces during crash recovery and when
+	printing information on locks.
+
+2009-06-29	The InnoDB Team
+
+	* buf/buf0buf.c:
+	Fix a race condition when reading buf_fix_count.
+	Currently, it is not being protected by the buffer pool mutex,
+	but by the block mutex.
+
+2009-06-29	The InnoDB Team
+
+	* handler/handler0alter.cc:
+	Start the user transaction prebuilt->trx if it was not started
+	before adding or dropping an index. Without this fix, the
+	table could be locked outside an active transaction.
+
+2009-06-25	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug42101.test,
+	mysql-test/innodb_bug42101.result,
+	mysql-test/innodb_bug42101-nonzero.test,
+	mysql-test/innodb_bug42101-nonzero.result:
+	Fix Bug#45749 Race condition in SET GLOBAL
+	innodb_commit_concurrency=DEFAULT
+
+2009-06-25	The InnoDB Team
+
+	* dict/dict0dict.c:
+	When an index column cannot be found in the table during index
+	creation, display additional diagnostics before an assertion failure.
+	This does NOT fix Bug#44571 InnoDB Plugin crashes on ADD INDEX,
+	but it helps understand the reason for the crash.
+
+2009-06-17	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix Bug#45426 UNIV_DEBUG build cause assertion error at CREATE INDEX
+
+2009-06-17	The InnoDB Team
+
+	* mysql-test/innodb_bug45357.result, mysql-test/innodb_bug45357.test,
+	row/row0mysql.c:
+	Fix Bug#45357 5.1.35 crashes with Failing assertion: index->type &
+	DICT_CLUSTERED
+
+2009-06-17	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#44030 Error: (1500) Couldn't read the MAX(ID) autoinc value
+	from the index (PRIMARY)
+
+2009-06-11	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb.result, srv/srv0srv.c:
+	Change the following defaults:
+	max_dirty_pages_pct: from 90 to 75, max allowed from 100 to 99
+	additional_mem_pool_size: from 1 to 8 MB
+	buffer_pool_size: from 8 to 128 MB
+	log_buffer_size: from 1 to 8 MB
+	read_io_threads/write_io_threads: from 1 to 4
+
+2009-06-09	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/trx0trx.h, trx/trx0trx.c:
+	Enable Group Commit functionality that was broken in 5.0 when
+	distributed transactions were introduced.
+
+2009-06-05	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/os0file.h, include/srv0srv.h,
+	os/os0file.c, srv/srv0srv.c, srv/srv0start.c:
+	Enable functionality to have multiple background IO helper threads.
+	Expose new configure knobs innodb_read_io_threads and
+	innodb_write_io_threads and deprecate innodb_file_io_threads (this
+	parameter was relevant only on Windows). Internally this allows
+	multiple segments for read and write IO request arrays where one
+	thread works on one segment.
+
+2009-06-05	The InnoDB Team
+
+	* buf/buf0lru.c, buf/buf0rea.c, handler/ha_innodb.cc,
+	include/srv0srv.h, srv/srv0srv.c:
+	Fix a bug in linear read ahead:
+	  1) Take into account access pattern when deciding whether or not to
+	    do linear read ahead.
+	  2) Expose a knob innodb_read_ahead_factor = [0-64] default (8),
+	    dynamic, global to control linear read ahead behavior. This is the
+	    value of the number of pages that InnoDB will tolerate within a
+	    64 page extent even if they are accessed out of order or have
+	    not been accessed at all. This number (which varies from 0 to 64)
+	    is indicative of the slack that we have when deciding about linear
+	    readahead.
+	  3) Disable random read ahead. Keep the code for now.
+
+2009-06-03	The InnoDB Team
+
+	* dict/dict0dict.c, mysql-test/t/innodb_mysql.test,
+	mysql-test/r/innodb_mysql.result:
+	Fix Bug#39793 Foreign keys not constructed when column
+	has a '#' in a comment or default value
+
+2009-05-27	The InnoDB Team
+
+	* Doxyfile:
+	Allow the extraction of documentation from the code base with the
+	Doxygen tool. Convert and add many (but not yet all) comments to
+	Doxygen format.
+
+2009-05-19	The InnoDB Team
+
+	* btr/btr0btr.c, btr/btr0cur.c, lock/lock0lock.c,
+	include/page0page.ic, include/lock0lock.h, include/dict0dict.h,
+	include/page0page.h, include/dict0dict.ic, ibuf/ibuf0ibuf.c,
+	page/page0zip.c, page/page0page.c:
+	Write updates of PAGE_MAX_TRX_ID to the redo log and add debug
+	assertions for checking that PAGE_MAX_TRX_ID is valid on leaf
+	pages of secondary indexes and the insert buffer B-tree. This bug
+	could cause failures in secondary index lookups in consistent
+	reads right after crash recovery.
+
+2009-05-18	The InnoDB Team
+
+	* btr/btr0cur.c:
+	Correctly estimate the space needed on the compressed page when
+	performing an update by delete-and-insert.
+
+2009-05-14	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h,
+	mysql-test/innodb_bug42101-nonzero-master.opt,
+	mysql-test/innodb_bug42101-nonzero.result,
+	mysql-test/innodb_bug42101-nonzero.test,
+	mysql-test/innodb_bug42101.result, mysql-test/innodb_bug42101.test,
+	srv/srv0srv.c:
+	Fix Bug#42101 Race condition in innodb_commit_concurrency
+
+2009-05-13	The InnoDB Team
+
+	* dict/dict0dict.c:
+	Fix Bug#44320 InnoDB: missing DB_ROLL_PTR in Table Monitor COLUMNS
+	output
+
+2009-04-29	The InnoDB Team
+
+	* fil/fil0fil.c, include/fil0fil.h, include/mtr0mtr.h,
+	log/log0recv.c:
+	Fix Bug#41609 Crash recovery does not work for InnoDB temporary tables
+
+2009-04-23	The InnoDB Team
+
+	* row/row0mysql.c:
+	When scanning indexes, report in the error log any error codes
+	returned by the search function. These error codes will still be
+	ignored in CHECK TABLE.
+
+2009-04-23	The InnoDB Team
+
+	* include/trx0types.h:
+	Define the logical type names trx_id_t, roll_ptr_t, and undo_no_t
+	and use them in place of dulint everywhere.
+
+2009-04-18	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/pars0pars.h:
+	Fix Bug#29125 Windows Server X64: so many compiler warnings
+
+2009-04-16	The InnoDB Team
+
+	* include/univ.i:
+	Define REFMAN as the base URL of the MySQL Reference Manual and
+	use the macro in all diagnostic output.
+
+2009-04-16	The InnoDB Team
+
+	* CMakeLists.txt, include/os0sync.h, include/sync0sync.h,
+	include/sync0sync.ic, include/univ.i, srv/srv0start.c,
+	sync/sync0sync.c:
+	Use the Windows Interlocked functions for atomic memory
+	access.
+
+2009-04-15	The InnoDB Team
+
+	* mysql-test/innodb.result, mysql-test/innodb.test:
+	Fix Bug#43309 Test main.innodb can't be run twice
+
+2009-04-14	The InnoDB Team
+
+	* CMakeLists.txt, handler/win_delay_loader.cc,
+	win-plugin/win-plugin.diff:
+	Remove statically linked libraries from MySQL (zlib and strings).
+
+2009-04-11	The InnoDB Team
+
+	* CMakeLists.txt, win-plugin/README, win-plugin/win-plugin.diff:
+	Rewrite CMakeLists.txt.
+
+2009-04-07	The InnoDB Team
+
+	* include/os0sync.h, include/sync0rw.ic, include/sync0sync.h,
+	include/sync0sync.ic, include/univ.i, plug.in, srv/srv0srv.c,
+	srv/srv0start.c, sync/sync0arr.c, sync/sync0sync.c:
+	Enable atomics on Solaris (using the libc functions as defined in
+	atomic.h) if GCC atomic builtins are not present.
+
+2009-04-07	The InnoDB Team
+
+	* btr/btr0btr.c, dict/dict0dict.c, ibuf/ibuf0ibuf.c,
+	include/data0data.h, include/data0data.ic, include/data0type.h,
+	include/data0type.ic, include/dict0dict.h, include/dict0dict.ic,
+	include/rem0rec.ic, mysql-test/innodb.result, mysql-test/innodb.test,
+	pars/pars0pars.c, rem/rem0rec.c, row/row0upd.c:
+	Fix Bug#44032 In ROW_FORMAT=REDUNDANT, update UTF-8 CHAR
+	to/from NULL is not in-place
+
+2009-04-07	The InnoDB Team
+
+	* page/page0cur.c:
+	Fix Bug#43660 SHOW INDEXES/ANALYZE does NOT update cardinality for
+	indexes of InnoDB table
+
+2009-04-06	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Make the parameter innodb_change_buffering settable by the
+	configuration file or mysqld command line options. Before this
+	fix, the initial value specified for this parameter was ignored.
+
+2009-04-06	The InnoDB Team
+
+	* sync/sync0rw.c:
+	Avoid a bogus failure in UNIV_SYNC_DEBUG diagnostics.
+
+2009-04-02	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h, srv/srv0srv.c:
+	Add new parameter innodb_spin_wait_delay to set the maximum delay
+	between polling for a spin lock.
+
+2009-04-02	The InnoDB Team
+
+	* dict/dict0crea.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+	include/dict0mem.h, include/row0merge.h, include/row0mysql.h,
+	mysql-test/innodb-index.result, mysql-test/innodb-index.test,
+	row/row0merge.c, row/row0sel.c:
+	In consistent reads, refuse to use newly created indexes that may
+	lack history.
+
+2009-03-25	The InnoDB Team
+
+	* buf/buf0buf.c, handler/ha_innodb.cc, include/buf0buf.h:
+	In SHOW ENGINE INNODB MUTEX do not show the status of block->mutex,
+	block->lock, block->lock->mutex (if applicable) and all mutexes and
+	rw-locks for which the number of os-waits is zero, because this can
+	be overwhelming particularly when the buffer pool is very large.
+
+2009-03-20	The InnoDB Team
+
+	* buf/buf0buf.c, include/log0recv.h, log/log0recv.c:
+	Remove the compile-time constant parameters of
+	recv_recover_page(), recv_scan_log_recs(), and recv_sys_init().
+
+2009-03-20	The InnoDB Team
+
+	* data/data0type.c, handler/ha_innodb.cc, include/ha_prototypes.h:
+	Declare innobase_get_at_most_n_mbchars() in ha_prototypes.h.
+
+2009-03-20	The InnoDB Team
+
+	* fil/fil0fil.h, fil/fil0fil.c, srv/srv0start.c:
+	Add the parameter hash_size to fil_init().
+
+2009-03-20	The InnoDB Team
+
+	* fil/fil0fil.c:
+	Refer to fil_system directly, not via local variables.
+
+2009-03-20	The InnoDB Team
+
+	* page/page0page.c:
+	In page_validate(), always report the space id, page number and
+	the name of the index when corruption is noticed.
+
+2009-03-20	The InnoDB Team
+
+	* include/log0log.h, include/log0log.ic, log/log0log.c:
+	Add in/out comments or const qualifiers to some function
+	parameters as appropriate.
+
+2009-03-20	The InnoDB Team
+
+	* dict/dict0boot.c, dict/dict0dict.c, fsp/fsp0fsp.c,
+	include/dict0dict.h, include/srv0srv.h, srv/srv0srv.c,
+	page/page0page.c:
+	Replace srv_sys->dummy_ind1 and srv_sys->dummy_ind2 with
+	dict_ind_redundant and dict_ind_compact, which are
+	initialized by dict_init().
+
+2009-03-11	The InnoDB Team
+
+	InnoDB Plugin 1.0.3 released
+
+2009-03-05	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#43203 Overflow from auto incrementing causes server segv
+
+2009-02-25	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#42714 AUTO_INCREMENT errors in 5.1.31
+
+2009-02-23	The InnoDB Team
+
+	* btr/btr0cur.c:
+	Fix Bug#43043 Crash on BLOB delete operation
+
+2009-02-20	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Make innodb_use_sys_malloc=ON the default.
+
+2009-02-20	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#42400 InnoDB autoinc code can't handle floating-point columns
+
+2009-02-18	The InnoDB Team
+
+	* include/ut0mem.h, os/os0proc.c, ut/ut0mem.c:
+	Protect ut_total_allocated_memory with ut_list_mutex in
+	os_mem_alloc_large() and os_mem_free_large(). The lack of this mutex
+	protection could cause an assertion failure during fast index
+	creation. Also, add UNIV_MEM_ALLOC and UNIV_MEM_FREE instrumentation
+	to os_mem_alloc_large() and os_mem_free_large(), so that Valgrind can
+	detect more errors.
+
+2009-02-11	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Make innodb_thread_concurrency=0 the default. The old default value
+	was 8. A non-zero setting may be useful when InnoDB is showing severe
+	scalability problems under multiple concurrent connections.
+
+2009-02-10	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/ha_innodb.h:
+	Fix Bug#41676 Table names are case insensitive in locking
+
+2009-02-10	The InnoDB Team
+
+	* mem/mem0dbg.c, mem/mem0mem.c, mem/mem0pool.c:
+	When innodb_use_sys_malloc is set, ignore
+	innodb_additional_mem_pool_size, because nothing will be allocated
+	from mem_comm_pool.
+
+2009-02-10	The InnoDB Team
+
+	* ut/ut0mem.c:
+	Map ut_malloc_low(), ut_realloc(), and ut_free() directly to malloc(),
+	realloc(), and free() when innodb_use_sys_malloc is set. As a side
+	effect, ut_total_allocated_memory ("Total memory allocated" in the
+	"BUFFER POOL AND MEMORY" section of SHOW ENGINE INNODB STATUS) will
+	exclude any memory allocated by these functions when
+	innodb_use_sys_malloc is set.
+
+2009-02-10	The InnoDB Team
+
+	* btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc,
+	include/buf0buf.ic, include/os0sync.h, include/srv0srv.h,
+	include/sync0rw.h, include/sync0rw.ic, include/sync0sync.h,
+	include/sync0sync.ic, include/univ.i, row/row0sel.c, srv/srv0srv.c,
+	srv/srv0start.c, sync/sync0arr.c, sync/sync0rw.c, sync/sync0sync.c:
+	On those platforms that support it, implement the synchronization
+	primitives of InnoDB mutexes and read/write locks with GCC atomic
+	builtins instead of Pthreads mutexes and InnoDB mutexes. These changes
+	are based on a patch supplied by Mark Callaghan of Google under a BSD
+	license.
+
+2009-01-30	The InnoDB Team
+
+	* btr/btr0cur.c, btr/btr0sea.c, buf/buf0buf.c, handler/ha_innodb.cc,
+	include/btr0sea.h, include/buf0buf.h, include/sync0sync.h,
+	sync/sync0sync.c:
+	Make the configuration parameter innodb_adaptive_hash_index dynamic,
+	so that it can be changed at runtime.
+
+2009-01-29	The InnoDB Team
+
+	* handler/ha_innodb.cc, ibuf/ibuf0ibuf.c, include/ibuf0ibuf.h,
+	include/ibuf0ibuf.ic:
+	Implement the settable global variable innodb_change_buffering,
+	with the allowed values 'none' and 'inserts'. The default value
+	'inserts' enables the buffering of inserts to non-unique secondary
+	index trees when the B-tree leaf page is not in the buffer pool.
+
+2009-01-27	The InnoDB Team
+
+	* buf/buf0lru.c:
+	Fix a race condition in buf_LRU_invalidate_tablespace(): The
+	compressed page size (zip_size) was read while the block descriptor
+	was no longer protected by a mutex. This could lead to corruption
+	when a table is dropped on a busy system that contains compressed
+	tables.
+
+2009-01-26	The InnoDB Team
+
+	* btr/btr0sea.c, buf/buf0buf.c, include/buf0buf.h, include/buf0buf.ic,
+	include/mtr0log.ic, include/row0upd.ic, mtr/mtr0mtr.c:
+	Implement buf_block_align() with pointer arithmetic, as it is in the
+	built-in InnoDB distributed with MySQL. Do not acquire the buffer pool
+	mutex before buf_block_align(). This removes a scalability bottleneck
+	in the adaptive hash index lookup. In CHECK TABLE, check that
+	buf_pool->page_hash is consistent with buf_block_align().
+
+2009-01-23	The InnoDB Team
+
+	* btr/btr0sea.c:
+	Fix Bug#42279 Race condition in btr_search_drop_page_hash_when_freed()
+
+2009-01-23	The InnoDB Team
+
+	* buf/buf0buf.c, include/buf0buf.h:
+	Remove the unused mode BUF_GET_NOWAIT of buf_page_get_gen()
+
+2009-01-20	The InnoDB Team
+
+	* include/rem0rec.h, include/rem0rec.ic:
+	Fix Bug#41571 MySQL segfaults after innodb recovery
+
+2009-01-20	The InnoDB Team
+
+	* lock/lock0lock.c:
+	Fix Bug#42152 Race condition in lock_is_table_exclusive()
+
+2009-01-14	The InnoDB Team
+
+	* include/trx0roll.h, trx/trx0roll.c, trx/trx0trx.c:
+	Fix Bug#38187 Error 153 when creating savepoints
+
+2009-01-14	The InnoDB Team
+
+	* dict/dict0load.c:
+	Fix Bug#42075 dict_load_indexes failure in dict_load_table will
+	corrupt the dictionary cache
+
+2009-01-13	The InnoDB Team
+
+	* buf/buf0buddy.c, dict/dict0dict.c, dict/dict0mem.c, fil/fil0fil.c,
+	ha/ha0storage.c, handler/ha_innodb.cc, handler/win_delay_loader.cc,
+	include/buf0buf.ic, include/dict0dict.ic, include/hash0hash.h,
+	thr/thr0loc.c, trx/trx0i_s.c:
+	Add the parameter ASSERTION to HASH_SEARCH() macro, and use it for
+	light validation of the traversed items in hash table lookups when
+	UNIV_DEBUG is enabled.
+
+2009-01-09	The InnoDB Team
+
+	* buf/buf0flu.c, include/buf0flu.h, include/buf0flu.ic:
+	Remove unused code from the functions
+	buf_flush_insert_into_flush_list() and
+	buf_flush_insert_sorted_into_flush_list().
+
+2009-01-09	The InnoDB Team
+
+	* buf/buf0flu.c:
+	Simplify the functions buf_flush_try_page() and buf_flush_batch(). Add
+	debug assertions and an explanation to buf_flush_write_block_low().
+
+2009-01-07	The InnoDB Team
+
+	* row/row0merge.c:
+	Fix a bug in recovery when dropping temporary indexes.
+
+2009-01-07	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/ha_innodb.h, handler/handler0alter.cc:
+	Fix Bug#41680 calls to trx_allocate_for_mysql are not consistent
+
+2009-01-07	The InnoDB Team
+
+	* mysql-test/innodb_bug41904.result, mysql-test/innodb_bug41904.test,
+	row/row0merge.c:
+	Fix Bug#41904 create unique index problem
+
+2009-01-02	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h, mem/mem0pool.c,
+	mysql-test/innodb-use-sys-malloc-master.opt,
+	mysql-test/innodb-use-sys-malloc.result,
+	mysql-test/innodb-use-sys-malloc.test, srv/srv0srv.c, srv/srv0start.c:
+	Implement the configuration parameter innodb_use_sys_malloc (false by
+	default), for disabling InnoDB's internal memory allocator and using
+	system malloc/free instead. The "BUFFER POOL AND MEMORY" section of
+	SHOW ENGINE INNODB STATUS will report "in additional pool
+	allocated 0" when innodb_use_sys_malloc is set.
+
+2008-12-30	The InnoDB Team
+
+	* btr/btr0btr.c:
+	When setting the PAGE_LEVEL of a compressed B-tree page from or to 0,
+	compress the page at the same time. This is necessary, because the
+	column information stored on the compressed page will differ between
+	leaf and non-leaf pages. Leaf pages are identified by PAGE_LEVEL=0.
+	This bug can make InnoDB crash when all rows of a compressed table are
+	deleted.
+
+2008-12-17	The InnoDB Team
+
+	* include/row0sel.h, include/row0upd.h, pars/pars0pars.c,
+	row/row0mysql.c, row/row0sel.c, row/row0upd.c:
+	Remove update-in-place select from the internal SQL interpreter. It
+	was only used for updating the InnoDB internal data dictionary when
+	renaming or dropping tables. It could have caused deadlocks when
+	acquiring latches on insert buffer bitmap pages.
+
+2008-12-17	The InnoDB Team
+
+	* btr/btr0sea.c, buf/buf0buf.c, buf/buf0lru.c, ha/ha0ha.c,
+	ha/hash0hash.c, include/buf0buf.h, include/ha0ha.h, include/ha0ha.ic,
+	include/hash0hash.h, include/univ.i:
+	Introduce the preprocessor symbol UNIV_AHI_DEBUG for enabling adaptive
+	hash index debugging independently of UNIV_DEBUG.
+
+2008-12-16	The InnoDB Team
+
+	* btr/btr0cur.c:
+	Do not update the free bits in the insert buffer bitmap when inserting
+	or deleting from the insert buffer B-tree. Assert that records in the
+	insert buffer B-tree are never updated.
+
+2008-12-12	The InnoDB Team
+
+	* buf/buf0buf.c, fil/fil0fil.c, fsp/fsp0fsp.c, ibuf/ibuf0ibuf.c,
+	include/fil0fil.h, include/ibuf0ibuf.h, include/ibuf0ibuf.ic,
+	include/ibuf0types.h:
+	Clean up the insert buffer subsystem so that only one insert
+	buffer B-tree exists.
+	Originally, there were provisions in InnoDB for multiple insert
+	buffer B-trees, apparently one for each tablespace.
+	When Heikki Tuuri implemented multiple InnoDB tablespaces in
+	MySQL/InnoDB 4.1, he made the insert buffer live only in the
+	system tablespace (space 0) but left the provisions in the code.
+
+2008-12-11	The InnoDB Team
+
+	* include/srv0srv.h, os/os0proc.c, srv/srv0srv.c:
+	Fix the issue that the InnoDB plugin fails if innodb_buffer_pool_size
+	is defined bigger than 4096M on 64-bit Windows. This bug should not
+	have affected other 64-bit systems.
+
+2008-12-09	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#40386 Not flushing query cache after truncate.
+
+2008-12-09	The InnoDB Team
+
+	* handler/ha_innodb.cc, srv/srv0srv.c, trx/trx0trx.c:
+	Fix Bug#40760 "set global innodb_thread_concurrency = 0;" is not safe
+
+2008-12-04	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/mysql_addons.cc,
+	include/mysql_addons.h, trx/trx0i_s.c, win-plugin/win-plugin.diff:
+	Remove dependencies on MySQL internals (defining MYSQL_SERVER).
+
+2008-12-02	The InnoDB Team
+
+	* page/page0cur.c:
+	When allocating space for a record from the free list of previously
+	purged records, zero out the DB_TRX_ID and DB_ROLL_PTR of the purged
+	record if the new record would not overwrite these fields. This fixes
+	a harmless content mismatch reported by page_zip_validate().
+
+2008-12-02	The InnoDB Team
+
+	* row/row0merge.c:
+	Replace the WHILE 1 with WHILE 1=1 in the SQL procedure, so that the
+	loop will actually be entered and temporary indexes be dropped during
+	crash recovery.
+
+2008-12-01	The InnoDB Team
+
+	InnoDB Plugin 1.0.2 released
+
+2008-10-31	The InnoDB Team
+
+	* dict/dict0mem.c, include/dict0mem.h, include/lock0lock.h,
+	include/row0mysql.h, include/trx0trx.h, include/univ.i,
+	include/ut0vec.h, include/ut0vec.ic, lock/lock0lock.c,
+	row/row0mysql.c, trx/trx0trx.c:
+	Fix Bug#26316 Triggers create duplicate entries on auto-increment
+	columns
+
+2008-10-30	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/handler0vars.h,
+	handler/win_delay_loader.cc, mysql-test/innodb_bug40360.result,
+	mysql-test/innodb_bug40360.test:
+	Fix Bug#40360 Binlog related errors with binlog off
+
+2008-10-29	The InnoDB Team
+
+	* include/data0type.ic:
+	Fix Bug#40369 dtype_get_sql_null_size() returns 0 or 1, not the size
+
+2008-10-29	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h, srv/srv0srv.c:
+	Fix Bug#38189 innodb_stats_on_metadata missing
+
+2008-10-28	The InnoDB Team
+
+	* CMakeLists.txt, ha_innodb.def, handler/ha_innodb.cc,
+	handler/handler0alter.cc, handler/handler0vars.h, handler/i_s.cc,
+	handler/win_delay_loader.cc, win-plugin/*:
+	Implemented the delayloading of externals for the plugin on Windows.
+	This makes it possible to build a dynamic plugin (ha_innodb.dll) on
+	Windows.
+
+2008-10-27	The InnoDB Team
+
+	* CMakeLists.txt:
+	Fix Bug#19424 InnoDB: Possibly a memory overrun of the buffer being
+	freed (64-bit Visual C)
+
+2008-10-23	The InnoDB Team
+
+	* ibuf/ibuf0ibuf.c:
+	ibuf_delete_rec(): When the cursor to the insert buffer record
+	cannot be restored, do not complain if the tablespace does not
+	exist, because the insert buffer record may have been discarded by
+	some other thread. This bug has existed in MySQL/InnoDB since
+	version 4.1, when innodb_file_per_table was implemented.
+	This may fix Bug#27276 InnoDB Error: ibuf cursor restoration fails.
+
+2008-10-22	The InnoDB Team
+
+	* dict/dict0dict.c, dict/dict0mem.c, handler/ha_innodb.cc,
+	handler/ha_innodb.h, include/dict0dict.h, include/dict0mem.h,
+	row/row0mysql.c:
+	Fix Bug#39830 Table autoinc value not updated on first insert
+	Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in
+	::info
+	Fix Bug#36411 "Failed to read auto-increment value from storage
+	engine" in 5.1.24 auto-inc
+
+2008-10-22	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/row0mysql.h, row/row0mysql.c:
+	Fix Bug#40224 New AUTOINC changes mask reporting of deadlock/timeout
+	errors
+
+2008-10-16	The InnoDB Team
+
+	* dict/dict0dict.c, mysql-test/innodb-index.result,
+	mysql-test/innodb-index.test:
+	Skip the undo log size check when creating REDUNDANT and COMPACT
+	tables. In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED, column
+	prefix indexes require that prefixes of externally stored columns
+	be written to the undo log. This may make the undo log record
+	bigger than the record on the B-tree page. The maximum size of an
+	undo log record is the page size. That must be checked for, in
+	dict_index_add_to_cache(). However, this restriction must not
+	be enforced on REDUNDANT or COMPACT tables.
+
+2008-10-15	The InnoDB Team
+
+	* btr/btr0cur.c, include/btr0cur.h, row/row0ext.c, row/row0sel.c,
+	row/row0upd.c:
+	When the server crashes while freeing an externally stored column
+	of a compressed table, the BTR_EXTERN_LEN field in the BLOB
+	pointer will be written as 0. Tolerate this in the functions that
+	deal with externally stored columns. This fixes problems after
+	crash recovery, in the rollback of incomplete transactions, and in
+	the purge of delete-marked records.
+
+2008-10-15	The InnoDB Team
+
+	* btr/btr0btr.c, include/page0zip.h, page/page0zip.c, include/univ.i:
+	When a B-tree node of a compressed table is split or merged, the
+	compression may fail. In this case, the entire compressed page
+	will be copied and the excess records will be deleted. However,
+	page_zip_copy(), now renamed to page_zip_copy_recs(), copied too
+	many fields in the page header, overwriting PAGE_BTR_SEG_LEAF and
+	PAGE_BTR_SEG_TOP when splitting the B-tree root. This caused
+	corruption of compressed tables. Furthermore, the lock table and
+	the adaptive hash index would be corrupted, because we forgot to
+	update them when invoking page_zip_copy_recs().
+
+	Introduce the symbol UNIV_ZIP_DEBUG for triggering the copying of
+	compressed pages more often, for debugging purposes.
+
+2008-10-10	The InnoDB Team
+
+	* handler/handler0alter.cc, include/row0merge.h, row/row0merge.c,
+	row/row0mysql.c:
+	Fix some locking issues, mainly in fast index creation. The
+	InnoDB data dictionary cache should be latched whenever a
+	transaction is holding locks on any data dictionary tables.
+	Otherwise, lock waits or deadlocks could occur. Furthermore, the
+	data dictionary transaction must be committed (and the locks
+	released) before the data dictionary latch is released.
+
+	ha_innobase::add_index(): Lock the data dictionary before renaming
+	or dropping the created indexes, because neither operation will
+	commit the data dictionary transaction.
+
+	ha_innobase::final_drop_index(): Commit the transactions before
+	unlocking the data dictionary.
+
+2008-10-09	The InnoDB Team
+
+	* buf/buf0lru.c:
+	Fix Bug#39939 DROP TABLE/DISCARD TABLESPACE takes long time in
+	buf_LRU_invalidate_tablespace()
+
+2008-10-08	The InnoDB Team
+
+	* dict/dict0crea.c, trx/trx0roll.c, include/row0mysql.h,
+	row/row0merge.c, row/row0mysql.c:
+	When dropping a table, hold the data dictionary latch until the
+	transaction has been committed. The data dictionary latch is
+	supposed to prevent lock waits and deadlocks in the data
+	dictionary tables. Due to this bug, DROP TABLE could cause a
+	deadlock or hang. Note that because of Bug#33650 and Bug#39833,
+	MySQL may also drop a (temporary) table when executing CREATE INDEX
+	or ALTER TABLE ... ADD INDEX.
+
+2008-10-04	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb_bug39438-master.opt,
+	mysql-test/innodb_bug39438.result, mysql-test/innodb_bug39438.test:
+	Fix Bug#39438 Testcase for Bug#39436 crashes on 5.1 in
+	fil_space_get_latch
+
+2008-10-04	The InnoDB Team
+
+	* include/lock0lock.h, lock/lock0lock.c,
+	mysql-test/innodb_bug38231.result, mysql-test/innodb_bug38231.test,
+	row/row0mysql.c:
+	Fix Bug#38231 Innodb crash in lock_reset_all_on_table() on TRUNCATE +
+	LOCK / UNLOCK
+
+2008-10-04	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#35498 Cannot get table test/table1 auto-inccounter value in
+	::info
+
+2008-10-04	The InnoDB Team
+
+	* handler/ha_innodb.cc, handler/ha_innodb.h:
+	Fix Bug#37788 InnoDB Plugin: AUTO_INCREMENT wrong for compressed
+	tables
+
+2008-10-04	The InnoDB Team
+
+	* dict/dict0dict.c, handler/ha_innodb.cc, handler/ha_innodb.h,
+	include/dict0dict.h, include/dict0mem.h, row/row0mysql.c:
+	Fix Bug#39830 Table autoinc value not updated on first insert
+
+2008-10-03	The InnoDB Team
+
+	* mysql-test/innodb-index.test, mysql-test/innodb-index.result,
+	mysql-test/innodb-timeout.test, mysql-test/innodb-timeout.result,
+	srv/srv0srv.c, include/srv0srv.h, handler/ha_innodb.cc,
+	include/ha_prototypes.h:
+	Fix Bug#36285 innodb_lock_wait_timeout is not dynamic, not per session
+
+2008-09-19	The InnoDB Team
+
+	* os/os0proc.c:
+	Fix a memory leak on Windows. The memory leak was due to wrong
+	parameters passed into VirtualFree() call. As a result, the
+	call fails with Windows error 87.
+
+2008-09-17	The InnoDB Team
+
+	* mysql-test/innodb.result, mysql-test/innodb-zip.result,
+	mysql-test/innodb-zip.test, mysql-test/innodb.test, ibuf/ibuf0ibuf.c,
+	dict/dict0crea.c, dict/dict0load.c, dict/dict0boot.c,
+	include/dict0dict.h, include/trx0trx.h, dict/dict0dict.c,
+	trx/trx0trx.c, include/ha_prototypes.h, handler/ha_innodb.cc:
+	When creating an index in innodb_strict_mode, check that the
+	maximum record size will never exceed the B-tree page size limit.
+	For uncompressed tables, there should always be enough space for
+	two records in an empty B-tree page. For compressed tables, there
+	should be enough space for storing two node pointer records or one
+	data record in an empty page in uncompressed format.
+	The purpose of this check is to guarantee that INSERT or UPDATE
+	will never fail due to too big record size.
+
+2008-09-17	The InnoDB Team
+
+	* btr/btr0cur.c, data/data0data.c, include/page0zip.h,
+	include/page0zip.ic, page/page0zip.c, mysql-test/innodb_bug36172.test:
+	Prevent infinite B-tree page splits in compressed tables by
+	ensuring that there will always be enough space for two node
+	pointer records in an empty B-tree page. Also, require that at
+	least one data record will fit in an empty compressed page. This
+	will reduce the maximum size of records in compressed tables.
+
+2008-09-09	The InnoDB Team
+
+	* mysql-test/innodb.result:
+	Fix the failing innodb test by merging changes that MySQL made to
+	that file (r2646.12.1 in MySQL BZR repository)
+
+2008-09-09	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#38839 auto increment does not work properly with InnoDB after
+	update
+
+2008-09-09	The InnoDB Team
+
+	* dict/dict0dict.c, handler/handler0alter.cc, include/dict0dict.h,
+	mysql-test/innodb-index.result, mysql-test/innodb-index.test:
+	Fix Bug#38786 InnoDB plugin crashes on drop table/create table with FK
+
+2008-08-21	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/ha_prototypes.h, row/row0sel.c:
+	Fix Bug#37885 row_search_for_mysql may gap lock unnecessarily with SQL
+	comments in query
+
+2008-08-21	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Fix Bug#38185 ha_innobase::info can hold locks even when called with
+	HA_STATUS_NO_LOCK
+
+2008-08-18	The InnoDB Team
+
+	* buf/buf0buf.c, buf/buf0lru.c, include/buf0buf.ic, include/univ.i:
+	Introduce UNIV_LRU_DEBUG for debugging the LRU buffer pool cache
+
+2008-08-08	The InnoDB Team
+
+	* buf/buf0lru.c, include/buf0buf.h:
+	Fix two recovery bugs that could lead to a crash in debug builds with
+	small buffer size
+
+2008-08-07	The InnoDB Team
+
+	* btr/btr0cur.c, handler/ha_innodb.cc, include/srv0srv.h,
+	srv/srv0srv.c:
+	Add a parameter innodb_stats_sample_pages to allow users to control
+	the number of index dives when InnoDB estimates the cardinality of
+	an index (ANALYZE TABLE, SHOW TABLE STATUS etc)
+
+2008-08-07	The InnoDB Team
+
+	* trx/trx0i_s.c:
+	Fix a bug that would lead to a crash if a SELECT was issued from the
+	INFORMATION_SCHEMA tables and there are rolling back transactions at
+	the same time
+
+2008-08-06	The InnoDB Team
+
+	* btr/btr0btr.c, btr/btr0cur.c, ibuf/ibuf0ibuf.c, include/btr0cur.h,
+	include/trx0roll.h, include/trx0types.h, row/row0purge.c,
+	row/row0uins.c, row/row0umod.c, trx/trx0roll.c:
+	In the rollback of incomplete transactions after crash recovery,
+	tolerate clustered index records whose externally stored columns
+	have not been written.
+
+2008-07-30	The InnoDB Team
+
+	* trx/trx0trx.c:
+	Fixes a race in recovery where the recovery thread recovering a
+	PREPARED trx and the background rollback thread can both try
+	to free the trx after its status is set to COMMITTED_IN_MEMORY.
+
+2008-07-29	The InnoDB Team
+
+	* include/trx0rec.h, row/row0purge.c, row/row0vers.c, trx/trx0rec.c:
+	Fix a BLOB corruption bug
+
+2008-07-15	The InnoDB Team
+
+	* btr/btr0sea.c, dict/dict0dict.c, include/btr0sea.h:
+	Fixed a timing hole where a thread dropping an index can free the
+	in-memory index struct while another thread is still using that
+	structure to remove entries from adaptive hash index belonging
+	to one of the pages that belongs to the index being dropped.
+
+2008-07-04	The InnoDB Team
+
+	* mysql-test/innodb-index.result:
+	Fix the failing innodb-index test by adjusting the result to a new
+	MySQL behavior (the change occurred in BZR-r2667)
+
+2008-07-03	The InnoDB Team
+
+	* mysql-test/innodb-zip.result, mysql-test/innodb-zip.test:
+	Remove the negative test cases that produce warnings
+
+2008-07-02	The InnoDB Team
+
+	* mysql-test/innodb-replace.result, mysql-test/innodb-index.test:
+	Disable part of innodb-index test because MySQL changed its behavior
+	and is not calling ::add_index() anymore when adding primary index on
+	non-NULL column
+
+2008-07-01	The InnoDB Team
+
+	* mysql-test/innodb-replace.result, mysql-test/innodb-replace.test:
+	Fix the failing innodb-replace test by merging changes that MySQL
+	made to that file (r2659 in MySQL BZR repository)
+
+2008-07-01	The InnoDB Team
+
+	* lock/lock0lock.c:
+	Fix Bug#36942 Performance problem in lock_get_n_rec_locks (SHOW INNODB
+	STATUS)
+
+2008-07-01	The InnoDB Team
+
+	* ha/ha0ha.c:
+	Fix Bug#36941 Performance problem in ha_print_info (SHOW INNODB
+	STATUS)
+
+2008-07-01	The InnoDB Team
+
+	* handler/ha_innodb.cc, mysql-test/innodb-autoinc.result,
+	mysql-test/innodb-autoinc.test:
+	Fix Bug#37531 After truncate, auto_increment behaves incorrectly for
+	InnoDB
+
+2008-06-19	The InnoDB Team
+
+	* handler/ha_innodb.cc:
+	Rewrite the function innodb_plugin_init() to support parameters in
+	different order (in static and dynamic InnoDB) and to support more
+	parameters in the static InnoDB
+
+2008-06-19	The InnoDB Team
+
+	* handler/handler0alter.cc:
+	Fix a bug in ::add_index() which set the transaction state to "active"
+	but never restored it to the original value. This bug caused warnings
+	to be printed by the rpl.rpl_ddl mysql-test.
+
+2008-06-19	The InnoDB Team
+
+	* mysql-test/patches:
+	Add a directory which contains patches, which need to be applied to
+	MySQL source in order to get some mysql-tests to succeed. The patches
+	cannot be committed in MySQL repository because they are specific to
+	the InnoDB plugin.
+
+2008-06-19	The InnoDB Team
+
+	* mysql-test/innodb-zip.result, mysql-test/innodb-zip.test,
+	row/row0row.c:
+	Fix an anomaly when updating a record with BLOB prefix
+
+2008-06-18	The InnoDB Team
+
+	* include/trx0sys.h, srv/srv0start.c, trx/trx0sys.c:
+	Fix a bug in recovery which was a side effect of the file_format_check
+	changes
+
+2008-06-09	The InnoDB Team
+
+	* mysql-test/innodb.result:
+	Fix the failing innodb test by merging changes that MySQL made to that
+	file
+
+2008-06-06	The InnoDB Team
+
+	* buf/buf0buf.c, handler/ha_innodb.cc, include/buf0buf.h,
+	include/srv0srv.h, srv/srv0srv.c:
+	Fix Bug#36600 SHOW STATUS takes a lot of CPU in
+	buf_get_latched_pages_number
+
+	* handler/ha_innodb.cc, os/os0file.c:
+	Fix Bug#11894 innodb_file_per_table crashes w/ Windows .sym symbolic
+	link hack
+
+	* include/ut0ut.h, srv/srv0srv.c, ut/ut0ut.c:
+	Fix Bug#36819 ut_usectime does not handle errors from gettimeofday
+
+	* handler/ha_innodb.cc:
+	Fix Bug#35602 Failed to read auto-increment value from storage engine
+
+	* srv/srv0start.c:
+	Fix Bug#36149 Read buffer overflow in srv0start.c found during "make
+	test"
+
+2008-05-08	The InnoDB Team
+
+	* btr/btr0btr.c, mysql-test/innodb_bug36172.result,
+	mysql-test/innodb_bug36172.test:
+	Fix Bug#36172 insert into compressed innodb table crashes
+
+2008-05-08	The InnoDB Team
+
+	InnoDB Plugin 1.0.1 released
+
+2008-05-06	The InnoDB Team
+
+	* handler/ha_innodb.cc, include/srv0srv.h, include/sync0sync.h,
+	include/trx0sys.h, mysql-test/innodb-zip.result,
+	mysql-test/innodb-zip.test, srv/srv0srv.c, srv/srv0start.c,
+	sync/sync0sync.c, trx/trx0sys.c:
+	Implement the system tablespace tagging
+
+	* handler/ha_innodb.cc, handler/i_s.cc, include/univ.i,
+	srv/srv0start.c:
+	Add InnoDB version in INFORMATION_SCHEMA.PLUGINS.PLUGIN_VERSION,
+	in the startup message and in a server variable innodb_version.
+
+	* sync/sync0sync.c:
+	Fix a bug in the sync debug code where a lock with level
+	SYNC_LEVEL_VARYING would cause an assertion failure when a thread
+	tried to release it.
+
+2008-04-30	The InnoDB Team
+
+	* Makefile.am:
+	Fix Bug#36434 ha_innodb.so is installed in the wrong directory
+
+	* handler/ha_innodb.cc:
+	Merge change from MySQL (Fix Bug#35406 5.1-opt crashes on select from
+	I_S.REFERENTIAL_CONSTRAINTS):
+	ChangeSet@1.2563, 2008-03-18 19:42:04+04:00, gluh@mysql.com +1 -0
+
+	* scripts/install_innodb_plugins.sql:
+	Added
+
+	* mysql-test/innodb.result:
+	Merge change from MySQL (this fixes the failing innodb test):
+	ChangeSet@1.1810.3601.4, 2008-02-07 02:33:21+04:00
+
+	* row/row0sel.c:
+	Fix Bug#35226 RBR event crashes slave
+
+	* handler/ha_innodb.cc:
+	Change the fix for Bug#32440 to show bytes instead of kilobytes in
+	INFORMATION_SCHEMA.TABLES.DATA_FREE
+
+	* handler/ha_innodb.cc, mysql-test/innodb.result,
+	mysql-test/innodb.test:
+	Fix Bug#29507 TRUNCATE shows to many rows effected
+
+	* handler/ha_innodb.cc, mysql-test/innodb.result,
+	mysql-test/innodb.test:
+	Fix Bug#35537 Innodb doesn't increment handler_update and
+	handler_delete
+
+2008-04-29	The InnoDB Team
+
+	* handler/i_s.cc, include/srv0start.h, srv/srv0start.c:
+	Fix Bug#36310 InnoDB plugin crash
+
+2008-04-23	The InnoDB Team
+
+	* mysql-test/innodb_bug36169.result, mysql-test/innodb_bug36169.test,
+	row/row0mysql.c:
+	Fix Bug#36169 create innodb compressed table with too large row size
+	crashed
+
+	* (outside the source tree):
+	Fix Bug#36222 New InnoDB plugin 1.0 has wrong MKDIR_P defined in
+	Makefile.in
+
+2008-04-15	The InnoDB Team
+
+	InnoDB Plugin 1.0.0 released
diff --git a/storage/xtradb/Doxyfile b/storage/xtradb/Doxyfile
new file mode 100644
index 00000000000..62aa7dd8abc
--- /dev/null
+++ b/storage/xtradb/Doxyfile
@@ -0,0 +1,1419 @@
+# Doxyfile 1.5.6
+
+# Usage: SVNVERSION=-r$(svnversion) doxygen
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = "InnoDB Plugin"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         = 1.0$(SVNVERSION)
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = dox
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Farsi, Finnish, French, German, Greek,
+# Hungarian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, Polish,
+# Portuguese, Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish,
+# and Ukrainian.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP         = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespace are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.  This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = YES
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = . include/univ.i
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.c *.ic *.h
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                = ut0auxconf_*
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.  If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.  Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.  The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.  Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to FRAME, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+,
+# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are
+# probably better off using the HTML help feature. Other possible values
+# for this tag are: HIERARCHIES, which will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list;
+# ALL, which combines the behavior of FRAME and HIERARCHIES; and NONE, which
+# disables this behavior completely. For backwards compatibility with previous
+# releases of Doxygen, the values YES and NO are equivalent to FRAME and NONE
+# respectively.
+
+GENERATE_TREEVIEW      = NONE
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.  This is useful
+# if you want to understand what is going on.  On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = DOXYGEN UNIV_DEBUG UNIV_SYNC_DEBUG __attribute__()=
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      = UT_LIST_BASE_NODE_T UT_LIST_NODE_T
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#   TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#   TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = NO
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = NO
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 3
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is enabled by default, which results in a transparent
+# background. Warning: Depending on the platform used, enabling this option
+# may lead to badly anti-aliased labels on the edges of a graph (i.e. they
+# become hard to read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE           = NO
diff --git a/storage/xtradb/Makefile.am b/storage/xtradb/Makefile.am
new file mode 100644
index 00000000000..0a6d7ddefb6
--- /dev/null
+++ b/storage/xtradb/Makefile.am
@@ -0,0 +1,345 @@
+# Copyright (C) 2001, 2004, 2006 MySQL AB & Innobase Oy
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+# Process this file with automake to create Makefile.in
+
+MYSQLDATAdir=		$(localstatedir)
+MYSQLSHAREdir=		$(pkgdatadir)
+MYSQLBASEdir=		$(prefix)
+MYSQLLIBdir=		$(pkglibdir)
+pkgplugindir=		$(pkglibdir)/plugin
+INCLUDES=		-I$(top_srcdir)/include -I$(top_builddir)/include \
+			-I$(top_srcdir)/regex \
+			-I$(srcdir)/include \
+			-I$(top_srcdir)/sql \
+			-I$(srcdir) @ZLIB_INCLUDES@
+
+DEFS=			@DEFS@
+
+
+noinst_HEADERS=		\
+			handler/ha_innodb.h	\
+			handler/i_s.h		\
+			include/btr0btr.h	\
+			include/btr0btr.ic	\
+			include/btr0cur.h	\
+			include/btr0cur.ic	\
+			include/btr0pcur.h	\
+			include/btr0pcur.ic	\
+			include/btr0sea.h	\
+			include/btr0sea.ic	\
+			include/btr0types.h	\
+			include/buf0buddy.h	\
+			include/buf0buddy.ic	\
+			include/buf0buf.h	\
+			include/buf0buf.ic	\
+			include/buf0flu.h	\
+			include/buf0flu.ic	\
+			include/buf0lru.h	\
+			include/buf0lru.ic	\
+			include/buf0rea.h	\
+			include/buf0types.h	\
+			include/data0data.h	\
+			include/data0data.ic	\
+			include/data0type.h	\
+			include/data0type.ic	\
+			include/data0types.h	\
+			include/db0err.h	\
+			include/dict0boot.h	\
+			include/dict0boot.ic	\
+			include/dict0crea.h	\
+			include/dict0crea.ic	\
+			include/dict0dict.h	\
+			include/dict0dict.ic	\
+			include/dict0load.h	\
+			include/dict0load.ic	\
+			include/dict0mem.h	\
+			include/dict0mem.ic	\
+			include/dict0types.h	\
+			include/dyn0dyn.h	\
+			include/dyn0dyn.ic	\
+			include/eval0eval.h	\
+			include/eval0eval.ic	\
+			include/eval0proc.h	\
+			include/eval0proc.ic	\
+			include/fil0fil.h	\
+			include/fsp0fsp.h	\
+			include/fsp0fsp.ic	\
+			include/fsp0types.h	\
+			include/fut0fut.h	\
+			include/fut0fut.ic	\
+			include/fut0lst.h	\
+			include/fut0lst.ic	\
+			include/ha0ha.h		\
+			include/ha0ha.ic	\
+			include/ha0storage.h	\
+			include/ha0storage.ic	\
+			include/ha_prototypes.h	\
+			include/handler0alter.h	\
+			include/hash0hash.h	\
+			include/hash0hash.ic	\
+			include/ibuf0ibuf.h	\
+			include/ibuf0ibuf.ic	\
+			include/ibuf0types.h	\
+			include/lock0iter.h	\
+			include/lock0lock.h	\
+			include/lock0lock.ic	\
+			include/lock0priv.h	\
+			include/lock0priv.ic	\
+			include/lock0types.h	\
+			include/log0log.h	\
+			include/log0log.ic	\
+			include/log0recv.h	\
+			include/log0recv.ic	\
+			include/mach0data.h	\
+			include/mach0data.ic	\
+			include/mem0dbg.h	\
+			include/mem0dbg.ic	\
+			include/mem0mem.h	\
+			include/mem0mem.ic	\
+			include/mem0pool.h	\
+			include/mem0pool.ic	\
+			include/mtr0log.h	\
+			include/mtr0log.ic	\
+			include/mtr0mtr.h	\
+			include/mtr0mtr.ic	\
+			include/mtr0types.h	\
+			include/mysql_addons.h	\
+			include/os0file.h	\
+			include/os0proc.h	\
+			include/os0proc.ic	\
+			include/os0sync.h	\
+			include/os0sync.ic	\
+			include/os0thread.h	\
+			include/os0thread.ic	\
+			include/page0cur.h	\
+			include/page0cur.ic	\
+			include/page0page.h	\
+			include/page0page.ic	\
+			include/page0types.h	\
+			include/page0zip.h	\
+			include/page0zip.ic	\
+			include/pars0grm.h	\
+			include/pars0opt.h	\
+			include/pars0opt.ic	\
+			include/pars0pars.h	\
+			include/pars0pars.ic	\
+			include/pars0sym.h	\
+			include/pars0sym.ic	\
+			include/pars0types.h	\
+			include/que0que.h	\
+			include/que0que.ic	\
+			include/que0types.h	\
+			include/read0read.h	\
+			include/read0read.ic	\
+			include/read0types.h	\
+			include/rem0cmp.h	\
+			include/rem0cmp.ic	\
+			include/rem0rec.h	\
+			include/rem0rec.ic	\
+			include/rem0types.h	\
+			include/row0ext.h	\
+			include/row0ext.ic	\
+			include/row0ins.h	\
+			include/row0ins.ic	\
+			include/row0merge.h	\
+			include/row0mysql.h	\
+			include/row0mysql.ic	\
+			include/row0purge.h	\
+			include/row0purge.ic	\
+			include/row0row.h	\
+			include/row0row.ic	\
+			include/row0sel.h	\
+			include/row0sel.ic	\
+			include/row0types.h	\
+			include/row0uins.h	\
+			include/row0uins.ic	\
+			include/row0umod.h	\
+			include/row0umod.ic	\
+			include/row0undo.h	\
+			include/row0undo.ic	\
+			include/row0upd.h	\
+			include/row0upd.ic	\
+			include/row0vers.h	\
+			include/row0vers.ic	\
+			include/srv0que.h	\
+			include/srv0srv.h	\
+			include/srv0srv.ic	\
+			include/srv0start.h	\
+			include/sync0arr.h	\
+			include/sync0arr.ic	\
+			include/sync0rw.h	\
+			include/sync0rw.ic	\
+			include/sync0sync.h	\
+			include/sync0sync.ic	\
+			include/sync0types.h	\
+			include/thr0loc.h	\
+			include/thr0loc.ic	\
+			include/trx0i_s.h	\
+			include/trx0purge.h	\
+			include/trx0purge.ic	\
+			include/trx0rec.h	\
+			include/trx0rec.ic	\
+			include/trx0roll.h	\
+			include/trx0roll.ic	\
+			include/trx0rseg.h	\
+			include/trx0rseg.ic	\
+			include/trx0sys.h	\
+			include/trx0sys.ic	\
+			include/trx0trx.h	\
+			include/trx0trx.ic	\
+			include/trx0types.h	\
+			include/trx0undo.h	\
+			include/trx0undo.ic	\
+			include/trx0xa.h	\
+			include/univ.i		\
+			include/usr0sess.h	\
+			include/usr0sess.ic	\
+			include/usr0types.h	\
+			include/ut0auxconf.h	\
+			include/ut0byte.h	\
+			include/ut0byte.ic	\
+			include/ut0dbg.h	\
+			include/ut0list.h	\
+			include/ut0list.ic	\
+			include/ut0lst.h	\
+			include/ut0mem.h	\
+			include/ut0mem.ic	\
+			include/ut0rbt.h	\
+			include/ut0rnd.h	\
+			include/ut0rnd.ic	\
+			include/ut0sort.h	\
+			include/ut0ut.h		\
+			include/ut0ut.ic	\
+			include/ut0vec.h	\
+			include/ut0vec.ic	\
+			include/ut0wqueue.h	\
+			handler/innodb_patch_info.h	\
+			mem/mem0dbg.c
+
+EXTRA_LIBRARIES=	libxtradb.a
+noinst_LIBRARIES=	@plugin_xtradb_static_target@
+libxtradb_a_SOURCES=	\
+			btr/btr0btr.c			\
+			btr/btr0cur.c			\
+			btr/btr0pcur.c			\
+			btr/btr0sea.c			\
+			buf/buf0buddy.c			\
+			buf/buf0buf.c			\
+			buf/buf0flu.c			\
+			buf/buf0lru.c			\
+			buf/buf0rea.c			\
+			data/data0data.c		\
+			data/data0type.c		\
+			dict/dict0boot.c		\
+			dict/dict0crea.c		\
+			dict/dict0dict.c		\
+			dict/dict0load.c		\
+			dict/dict0mem.c			\
+			dyn/dyn0dyn.c			\
+			eval/eval0eval.c		\
+			eval/eval0proc.c		\
+			fil/fil0fil.c			\
+			fsp/fsp0fsp.c			\
+			fut/fut0fut.c			\
+			fut/fut0lst.c			\
+			ha/ha0ha.c			\
+			ha/ha0storage.c			\
+			ha/hash0hash.c			\
+			handler/ha_innodb.cc		\
+			handler/handler0alter.cc	\
+			handler/i_s.cc			\
+			handler/mysql_addons.cc		\
+			ibuf/ibuf0ibuf.c		\
+			lock/lock0iter.c		\
+			lock/lock0lock.c		\
+			log/log0log.c			\
+			log/log0recv.c			\
+			mach/mach0data.c		\
+			mem/mem0mem.c			\
+			mem/mem0pool.c			\
+			mtr/mtr0log.c			\
+			mtr/mtr0mtr.c			\
+			os/os0file.c			\
+			os/os0proc.c			\
+			os/os0sync.c			\
+			os/os0thread.c			\
+			page/page0cur.c			\
+			page/page0page.c		\
+			page/page0zip.c			\
+			pars/lexyy.c			\
+			pars/pars0grm.c			\
+			pars/pars0opt.c			\
+			pars/pars0pars.c		\
+			pars/pars0sym.c			\
+			que/que0que.c			\
+			read/read0read.c		\
+			rem/rem0cmp.c			\
+			rem/rem0rec.c			\
+			row/row0ext.c			\
+			row/row0ins.c			\
+			row/row0merge.c			\
+			row/row0mysql.c			\
+			row/row0purge.c			\
+			row/row0row.c			\
+			row/row0sel.c			\
+			row/row0uins.c			\
+			row/row0umod.c			\
+			row/row0undo.c			\
+			row/row0upd.c			\
+			row/row0vers.c			\
+			srv/srv0que.c			\
+			srv/srv0srv.c			\
+			srv/srv0start.c			\
+			sync/sync0arr.c			\
+			sync/sync0rw.c			\
+			sync/sync0sync.c		\
+			thr/thr0loc.c			\
+			trx/trx0i_s.c			\
+			trx/trx0purge.c			\
+			trx/trx0rec.c			\
+			trx/trx0roll.c			\
+			trx/trx0rseg.c			\
+			trx/trx0sys.c			\
+			trx/trx0trx.c			\
+			trx/trx0undo.c			\
+			usr/usr0sess.c			\
+			ut/ut0byte.c			\
+			ut/ut0dbg.c			\
+			ut/ut0list.c			\
+			ut/ut0mem.c			\
+			ut/ut0rbt.c			\
+			ut/ut0rnd.c			\
+			ut/ut0ut.c			\
+			ut/ut0vec.c			\
+			ut/ut0wqueue.c
+
+libxtradb_a_CXXFLAGS=	$(AM_CXXFLAGS)
+libxtradb_a_CFLAGS=	$(AM_CFLAGS)
+
+EXTRA_LTLIBRARIES=	ha_xtradb.la
+pkgplugin_LTLIBRARIES=	@plugin_xtradb_shared_target@
+# ha_xtradb.la is linked with -module from the same source list as libxtradb.a.
+ha_xtradb_la_LDFLAGS=	-module -rpath $(pkgplugindir) -L$(top_builddir)/libservices -lmysqlservices
+ha_xtradb_la_CXXFLAGS=	-shared $(AM_CXXFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_xtradb_la_CFLAGS=	-shared $(AM_CFLAGS) $(INNODB_DYNAMIC_CFLAGS)
+ha_xtradb_la_SOURCES=	$(libxtradb_a_SOURCES)
+
+EXTRA_DIST=		CMakeLists.txt plug.in \
+			pars/make_bison.sh pars/make_flex.sh \
+			pars/pars0grm.y pars/pars0lex.l
+
+# Don't update the files from bitkeeper
+%::SCCS/s.%
diff --git a/storage/xtradb/btr/btr0btr.c b/storage/xtradb/btr/btr0btr.c
new file mode 100644
index 00000000000..ff047095aa4
--- /dev/null
+++ b/storage/xtradb/btr/btr0btr.c
@@ -0,0 +1,3789 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0btr.c
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0btr.h"
+
+#ifdef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "page0page.h"
+#include "page0zip.h"
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "btr0pcur.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "ibuf0ibuf.h"
+#include "trx0trx.h"
+
+/*
+Latching strategy of the InnoDB B-tree
+--------------------------------------
+A tree latch protects all non-leaf nodes of the tree. Each node of a tree
+also has a latch of its own.
+
+A B-tree operation normally first acquires an S-latch on the tree. It
+searches down the tree and releases the tree latch when it has the
+leaf node latch. To save CPU time we do not acquire any latch on
+non-leaf nodes of the tree during a search; those pages are only buffer-fixed.
+
+If an operation needs to restructure the tree, it acquires an X-latch on
+the tree before searching to a leaf node. If it needs, for example, to
+split a leaf,
+(1) InnoDB decides the split point in the leaf,
+(2) allocates a new page,
+(3) inserts the appropriate node pointer to the first non-leaf level,
+(4) releases the tree X-latch,
+(5) and then moves records from the leaf to the new allocated page.
+
+Node pointers
+-------------
+Leaf pages of a B-tree contain the index records stored in the
+tree. On levels n > 0 we store 'node pointers' to pages on level
+n - 1. For each page there is exactly one node pointer stored:
+thus our tree is an ordinary B-tree, not a B-link tree.
+
+A node pointer contains a prefix P of an index record. The prefix
+is long enough so that it determines an index record uniquely.
+The file page number of the child page is added as the last
+field. To the child page we can store node pointers or index records
+which are >= P in the alphabetical order, but < P1 if there is
+a next node pointer on the level, and P1 is its prefix.
+
+If a node pointer with a prefix P points to a non-leaf child,
+then the leftmost record in the child must have the same
+prefix P. If it points to a leaf node, the child is not required
+to contain any record with a prefix equal to P. The leaf case
+is decided this way to allow arbitrary deletions in a leaf node
+without touching upper levels of the tree.
+
+We have predefined a special minimum record which we
+define as the smallest record in any alphabetical order.
+A minimum record is denoted by setting a bit in the record
+header. A minimum record acts as the prefix of a node pointer
+which points to a leftmost node on any level of the tree.
+
+File page allocation
+--------------------
+In the root node of a B-tree there are two file segment headers.
+The leaf pages of a tree are allocated from one file segment, to
+make them consecutive on disk if possible. From the other file segment
+we allocate pages for the non-leaf levels of the tree.
+*/
+
+#ifdef UNIV_BTR_DEBUG
+/**************************************************************//**
+Checks a file segment header within a B-tree root page with ut_a() assertions.
+@return	TRUE if valid (no other value is ever returned) */
+static
+ibool
+btr_root_fseg_validate(
+/*===================*/
+	const fseg_header_t*	seg_header,	/*!< in: segment header to check */
+	ulint			space)		/*!< in: tablespace identifier expected in the header */
+{
+	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);
+
+	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
+	ut_a(offset >= FIL_PAGE_DATA);
+	ut_a(offset <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+	return(TRUE);
+}
+#endif /* UNIV_BTR_DEBUG */
+
+/**************************************************************//**
+Fetches the block holding the root page of an index tree and x-latches it.
+@return	root page block, x-latched; 0 if the block could not be obtained
+and srv_pass_corrupt_table is set */
+static
+buf_block_t*
+btr_root_block_get(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		space		= dict_index_get_space(index);
+	ulint		zip_size	= dict_table_zip_size(index->table);
+	ulint		root_page_no	= dict_index_get_page(index);
+	buf_block_t*	block;
+
+	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+	/* With srv_pass_corrupt_table set, a missing block is handed back
+	to the caller as a null pointer instead of failing the assertion. */
+	if (!block && srv_pass_corrupt_table) {
+		return(0);
+	}
+	ut_a(block);
+
+	/* The root page and the table must agree on the compact flag. */
+	ut_a((ibool)!!page_is_comp(buf_block_get_frame(block))
+	     == dict_table_is_comp(index->table));
+#ifdef UNIV_BTR_DEBUG
+	if (!dict_index_is_ibuf(index)) {
+		const page_t*	root = buf_block_get_frame(block);
+
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+					    + root, space));
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+	}
+#endif /* UNIV_BTR_DEBUG */
+
+	return(block);
+}
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it (see btr_root_block_get()).
+@return	root page, x-latched; NOTE(review): btr_root_block_get() may return 0, and the call is nested inside buf_block_get_frame() - confirm that this is safe if the latter is a macro */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+	dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	return(buf_block_get_frame(btr_root_block_get(index, mtr)));
+}
+
+/*************************************************************//**
+Finds the user record that precedes the given record, continuing on the
+previous page when no user record precedes it on its own page. It is
+assumed that the caller has appropriate latches on the page and its neighbor.
+@return	previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+	rec_t*	rec,	/*!< in: record on leaf level */
+	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
+			needed, also to the previous page */
+{
+	page_t*		page;
+	page_t*		prev_page;
+	ulint		prev_page_no;
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	prev_block;
+
+	/* Fast path: the predecessor is a user record on the same page. */
+	if (!page_rec_is_infimum(rec)) {
+		rec_t*	prev_rec = page_rec_get_prev(rec);
+
+		if (!page_rec_is_infimum(prev_rec)) {
+			return(prev_rec);
+		}
+	}
+
+	page = page_align(rec);
+	prev_page_no = btr_page_get_prev(page, mtr);
+
+	if (prev_page_no == FIL_NULL) {
+		/* There is no previous page, hence no previous record. */
+		return(NULL);
+	}
+
+	space = page_get_space_id(page);
+	zip_size = fil_space_get_zip_size(space);
+
+	prev_block = buf_page_get_with_no_latch(space, zip_size,
+						prev_page_no, mtr);
+	prev_page = buf_block_get_frame(prev_block);
+	/* The caller must already have a latch to the brother */
+	ut_ad(mtr_memo_contains(mtr, prev_block,
+				MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains(mtr, prev_block,
+				   MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+	ut_a(page_is_comp(prev_page) == page_is_comp(page));
+	ut_a(btr_page_get_next(prev_page, mtr)
+	     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* Step back from the supremum of the previous page. */
+	return(page_rec_get_prev(page_get_supremum_rec(prev_page)));
+}
+
+/*************************************************************//**
+Finds the user record that follows the given record, continuing on the next
+page when needed. The caller is assumed to hold latches on the page and its neighbor.
+@return	next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+	rec_t*	rec,	/*!< in: record on leaf level */
+	mtr_t*	mtr)	/*!< in: mtr holding a latch on the page, and if
+			needed, also to the next page */
+{
+	page_t*		page;
+	page_t*		next_page;
+	ulint		next_page_no;
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	next_block;
+
+	/* Fast path: the successor is a user record on the same page. */
+	if (!page_rec_is_supremum(rec)) {
+		rec_t*	next_rec = page_rec_get_next(rec);
+
+		if (!page_rec_is_supremum(next_rec)) {
+			return(next_rec);
+		}
+	}
+
+	page = page_align(rec);
+	next_page_no = btr_page_get_next(page, mtr);
+
+	if (next_page_no == FIL_NULL) {
+		/* There is no next page, hence no next record. */
+		return(NULL);
+	}
+
+	space = page_get_space_id(page);
+	zip_size = fil_space_get_zip_size(space);
+
+	next_block = buf_page_get_with_no_latch(space, zip_size,
+						next_page_no, mtr);
+	next_page = buf_block_get_frame(next_block);
+	/* The caller must already have a latch to the brother */
+	ut_ad(mtr_memo_contains(mtr, next_block, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains(mtr, next_block,
+				   MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_BTR_DEBUG
+	ut_a(page_is_comp(next_page) == page_is_comp(page));
+	ut_a(btr_page_get_prev(next_page, mtr)
+	     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* Step forward from the infimum of the next page. */
+	return(page_rec_get_next(page_get_infimum_rec(next_page)));
+}
+
+/**************************************************************//**
+Creates a new index page in the given block (not the root, and also not
+used in page reorganization).  @see btr_page_empty(). */
+static
+void
+btr_page_create(
+/*============*/
+	buf_block_t*	block,	/*!< in/out: page to be created */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the B-tree level of the page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page;
+
+	page = buf_block_get_frame(block);
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	if (!UNIV_LIKELY_NULL(page_zip)) {
+		page_create(block, mtr, dict_table_is_comp(index->table));
+		/* Set the level of the new index page */
+		btr_page_set_level(page, NULL, level, mtr);
+	} else {
+		page_create_zip(block, index, level, mtr);
+	}
+
+	block->check_index_page_at_flush = TRUE;
+	btr_page_set_index_id(page, page_zip, index->id, mtr);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an ibuf tree. Takes the page from
+the free list of the tree, which must contain pages (asserted with ut_a())!
+@return	new allocated block, x-latched */
+static
+buf_block_t*
+btr_page_alloc_for_ibuf(
+/*====================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	fil_addr_t	node_addr;
+	page_t*		root;
+	page_t*		new_page;
+	buf_block_t*	new_block;
+
+	root = btr_root_get(index, mtr);
+	/* Read the first node of the free list kept in the root page header. */
+	node_addr = flst_get_first(root + PAGE_HEADER
+				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
+	ut_a(node_addr.page != FIL_NULL);
+
+	new_block = buf_page_get(dict_index_get_space(index),
+				 dict_table_zip_size(index->table),
+				 node_addr.page, RW_X_LATCH, mtr);
+	new_page = buf_block_get_frame(new_block);
+	buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
+	/* Unlink the page's own list node from the free list. */
+	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
+		    mtr);
+	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+			    mtr));
+
+	return(new_block);
+}
+
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. The page comes from
+the leaf or the non-leaf file segment of the tree depending on the level; an
+insert buffer tree takes it from its own free list instead. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return	new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*		root;
+	fseg_header_t*	seg_hdr;
+	ulint		page_no;
+	buf_block_t*	block;
+
+	/* An insert buffer tree has its own allocator. */
+	if (dict_index_is_ibuf(index)) {
+		return(btr_page_alloc_for_ibuf(index, mtr));
+	}
+
+	root = btr_root_get(index, mtr);
+
+	/* Pick the segment header in the root page that matches the level. */
+	seg_hdr = root + PAGE_HEADER
+		+ (level == 0 ? PAGE_BTR_SEG_LEAF : PAGE_BTR_SEG_TOP);
+
+	/* Parameter TRUE below states that the caller has made the
+	reservation for free extents, and thus we know that a page can
+	be allocated: */
+	page_no = fseg_alloc_free_page_general(seg_hdr, hint_page_no,
+					       file_direction, TRUE, mtr);
+
+	if (page_no == FIL_NULL) {
+		return(NULL);
+	}
+
+	/* Fetch the allocated page with an x-latch. */
+	block = buf_page_get(dict_index_get_space(index),
+			     dict_table_zip_size(index->table),
+			     page_no, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+	return(block);
+}
+
+/**************************************************************//**
+Gets the number of pages in a B-tree from the file segment headers in its root page, under an s-lock on the index.
+@return	number of pages; 0 if the root page is unavailable and srv_pass_corrupt_table is set */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		flag)	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+{
+	fseg_header_t*	seg_header;
+	page_t*		root;
+	ulint		n;
+	ulint		dummy;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	mtr_s_lock(dict_index_get_lock(index), &mtr);
+
+	root = btr_root_get(index, &mtr);
+
+	if (srv_pass_corrupt_table && !root) {
+		mtr_commit(&mtr);
+		return(0);
+	}
+	ut_a(root);
+
+	if (flag == BTR_N_LEAF_PAGES) {
+		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+		/* n comes from the output parameter here, not the return value */
+		fseg_n_reserved_pages(seg_header, &n, &mtr);
+
+	} else if (flag == BTR_TOTAL_SIZE) {
+		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+		/* n is the sum of the return values for both segments */
+		n = fseg_n_reserved_pages(seg_header, &dummy, &mtr);
+
+		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
+
+		n += fseg_n_reserved_pages(seg_header, &dummy, &mtr);
+	} else {
+		ut_error;
+	}
+
+	mtr_commit(&mtr);
+
+	return(n);
+}
+
+/**************************************************************//**
+Frees a page used in an ibuf tree. Puts the page on the free list of the
+ibuf tree, whose base is in the root page header. */
+static
+void
+btr_page_free_for_ibuf(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		root;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	root = btr_root_get(index, mtr);
+	/* Link the page's own list node at the head of the free list. */
+	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		       buf_block_get_frame(block)
+		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);
+
+	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+			    mtr));
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. Because the page level is passed
+in by the caller rather than read from the page, this can also be used for
+(BLOB) external storage pages by giving level 0. A page of an insert buffer
+tree is instead put on the free list of that tree. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	ulint		level,	/*!< in: page level */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		root;
+	fseg_header_t*	seg_hdr;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Increment the frame modify clock: the page gets invalid for
+	optimistic searches. */
+	buf_block_modify_clock_inc(block);
+
+	if (dict_index_is_ibuf(index)) {
+		/* Insert buffer pages go to the free list of the tree. */
+		btr_page_free_for_ibuf(index, block, mtr);
+
+		return;
+	}
+
+	root = btr_root_get(index, mtr);
+
+	/* Level 0 pages are freed to the leaf segment, all other levels
+	to the non-leaf segment. */
+	seg_hdr = root + PAGE_HEADER
+		+ (level == 0 ? PAGE_BTR_SEG_LEAF : PAGE_BTR_SEG_TOP);
+
+	fseg_free_page(seg_hdr,
+		       buf_block_get_space(block),
+		       buf_block_get_page_no(block), mtr);
+}
+
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the level is read from the page itself. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page = buf_block_get_frame(block);
+	ulint		level = btr_page_get_level(page, mtr);
+
+	/* Hand the level stored on the page to the low-level routine. */
+	btr_page_free_low(index, block, level, mtr);
+}
+
+/**************************************************************//**
+Sets the child node file address (page number) in the last field of a node pointer record. */
+UNIV_INLINE
+void
+btr_node_ptr_set_child_page_no(
+/*===========================*/
+	rec_t*		rec,	/*!< in: node pointer record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		page_no,/*!< in: child node address */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	field;
+	ulint	len;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(!page_is_leaf(page_align(rec)));
+	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+	/* The child address is in the last field */
+	field = rec_get_nth_field(rec, offsets,
+				  rec_offs_n_fields(offsets) - 1, &len);
+
+	ut_ad(len == REC_NODE_PTR_SIZE);
+	/* With page_zip the write goes through page_zip_write_node_ptr(); otherwise the field is written directly. */
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page_zip_write_node_ptr(page_zip, rec,
+					rec_offs_data_size(offsets),
+					page_no, mtr);
+	} else {
+		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
+	}
+}
+
+/************************************************************//**
+Fetches and x-latches the child page that a node pointer record refers to.
+@return	child page, x-latched */
+static
+buf_block_t*
+btr_node_ptr_get_child(
+/*===================*/
+	const rec_t*	node_ptr,/*!< in: node pointer */
+	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	space;
+	ulint	child_no;
+	ulint	zip_size;
+
+	ut_ad(rec_offs_validate(node_ptr, index, offsets));
+	space = page_get_space_id(page_align(node_ptr));
+	child_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+	zip_size = dict_table_zip_size(index->table);
+	return(btr_block_get(space, zip_size, child_no, RW_X_LATCH, mtr));
+}
+
+/************************************************************//**
+Returns the upper level node pointer to a page. It is assumed that mtr holds
+an x-latch on the tree. A node pointer that does not point back to the page is reported as corruption and ends in ut_error.
+@return	rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_node_ptr_func(
+/*==============================*/
+	ulint*		offsets,/*!< in: work area for the return value */
+	mem_heap_t*	heap,	/*!< in: memory heap to use */
+	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
+				out: cursor on node pointer record,
+				its page x-latched */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dtuple_t*	tuple;
+	rec_t*		user_rec;
+	rec_t*		node_ptr;
+	ulint		level;
+	ulint		page_no;
+	dict_index_t*	index;
+
+	page_no = buf_block_get_page_no(btr_cur_get_block(cursor));
+	index = btr_cur_get_index(cursor);
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	/* The page must not be the one returned by dict_index_get_page(). */
+	ut_ad(dict_index_get_page(index) != page_no);
+
+	level = btr_page_get_level(btr_cur_get_page(cursor), mtr);
+	user_rec = btr_cur_get_rec(cursor);
+	ut_a(page_rec_is_user_rec(user_rec));
+	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);
+	/* Search one level up with the tuple built from the user record. */
+	btr_cur_search_to_nth_level(index, level + 1, tuple, PAGE_CUR_LE,
+				    BTR_CONT_MODIFY_TREE, cursor, 0,
+				    file, line, mtr);
+
+	node_ptr = btr_cur_get_rec(cursor);
+	ut_ad(!page_rec_is_comp(node_ptr)
+	      || rec_get_status(node_ptr) == REC_STATUS_NODE_PTR);
+	offsets = rec_get_offsets(node_ptr, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	/* The node pointer found must carry this page's number as its child. */
+	if (UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr, offsets)
+			  != page_no)) {
+		rec_t*	print_rec;
+		fputs("InnoDB: Dump of the child page:\n", stderr);
+		buf_page_print(page_align(user_rec), 0);
+		fputs("InnoDB: Dump of the parent page:\n", stderr);
+		buf_page_print(page_align(node_ptr), 0);
+
+		fputs("InnoDB: Corruption of an index tree: table ", stderr);
+		ut_print_name(stderr, NULL, TRUE, index->table_name);
+		fputs(", index ", stderr);
+		ut_print_name(stderr, NULL, FALSE, index->name);
+		fprintf(stderr, ",\n"
+			"InnoDB: father ptr page no %lu, child page no %lu\n",
+			(ulong)
+			btr_node_ptr_get_child_page_no(node_ptr, offsets),
+			(ulong) page_no);
+		print_rec = page_rec_get_next(
+			page_get_infimum_rec(page_align(user_rec)));
+		offsets = rec_get_offsets(print_rec, index,
+					  offsets, ULINT_UNDEFINED, &heap);
+		page_rec_print(print_rec, offsets);
+		offsets = rec_get_offsets(node_ptr, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		page_rec_print(node_ptr, offsets);
+
+		fputs("InnoDB: You should dump + drop + reimport the table"
+		      " to fix the\n"
+		      "InnoDB: corruption. If the crash happens at "
+		      "the database startup, see\n"
+		      "InnoDB: " REFMAN "forcing-recovery.html about\n"
+		      "InnoDB: forcing recovery. "
+		      "Then dump + drop + reimport.\n", stderr);
+
+		ut_error;
+	}
+
+	return(offsets);
+}
+/** Calls btr_page_get_father_node_ptr_func() with the caller's __FILE__ and __LINE__. */
+#define btr_page_get_father_node_ptr(of,heap,cur,mtr)			\
+	btr_page_get_father_node_ptr_func(of,heap,cur,__FILE__,__LINE__,mtr)
+
+/************************************************************//**
+Looks up the upper level node pointer to a page, starting from the record
+after the page infimum. It is assumed that mtr holds an x-latch on the tree.
+@return	rec_get_offsets() of the node pointer record */
+static
+ulint*
+btr_page_get_father_block(
+/*======================*/
+	ulint*		offsets,/*!< in: work area for the return value */
+	mem_heap_t*	heap,	/*!< in: memory heap to use */
+	dict_index_t*	index,	/*!< in: b-tree index */
+	buf_block_t*	block,	/*!< in: child page in the index */
+	mtr_t*		mtr,	/*!< in: mtr */
+	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
+				its page x-latched */
+{
+	page_t*	page = buf_block_get_frame(block);
+	rec_t*	first_rec = page_rec_get_next(page_get_infimum_rec(page));
+
+	btr_cur_position(index, first_rec, block, cursor);
+	return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
+}
+
+/************************************************************//**
+Seeks the cursor to the upper level node pointer to a page; the offsets
+computed on the way are discarded. mtr must hold an x-latch on the tree. */
+static
+void
+btr_page_get_father(
+/*================*/
+	dict_index_t*	index,	/*!< in: b-tree index */
+	buf_block_t*	block,	/*!< in: child page in the index */
+	mtr_t*		mtr,	/*!< in: mtr */
+	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
+				its page x-latched */
+{
+	page_t*		page = buf_block_get_frame(block);
+	rec_t*		first_rec;
+	mem_heap_t*	tmp_heap;
+
+	first_rec = page_rec_get_next(page_get_infimum_rec(page));
+	btr_cur_position(index, first_rec, block, cursor);
+	tmp_heap = mem_heap_create(100);
+	btr_page_get_father_node_ptr(NULL, tmp_heap, cursor, mtr);
+	mem_heap_free(tmp_heap);
+}
+
+/************************************************************//**
+Creates the root node for a new index tree.
+For an insert buffer tree (type & DICT_IBUF) the tree segment header is
+placed on a separate ibuf header page and the root is the next page
+allocated from that segment; for any other tree the top and the leaf
+segment headers are both placed on the root page itself.
+@return	page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+	ulint		type,	/*!< in: type of the index */
+	ulint		space,	/*!< in: space where created */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	dulint		index_id,/*!< in: index id */
+	dict_index_t*	index,	/*!< in: index */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		page_no;
+	buf_block_t*	block;
+	buf_frame_t*	frame;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+
+	/* Create the two new segments (one, in the case of an ibuf tree) for
+	the index tree; the segment headers are put on the allocated root page
+	(for an ibuf tree, not in the root, but on a separate ibuf header
+	page) */
+
+	if (type & DICT_IBUF) {
+		/* Allocate first the ibuf header page */
+		buf_block_t*	ibuf_hdr_block = fseg_create(
+			space, 0,
+			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);
+
+		if (ibuf_hdr_block == NULL) {
+			/* The segment could not be created: report the
+			failure in the same way as the non-ibuf branch
+			does, instead of dereferencing a NULL block
+			below */
+
+			return(FIL_NULL);
+		}
+
+		buf_block_dbg_add_level(ibuf_hdr_block, SYNC_TREE_NODE_NEW);
+
+		ut_ad(buf_block_get_page_no(ibuf_hdr_block)
+		      == IBUF_HEADER_PAGE_NO);
+		/* Allocate then the next page to the segment: it will be the
+		tree root page */
+
+		page_no = fseg_alloc_free_page(buf_block_get_frame(
+						       ibuf_hdr_block)
+					       + IBUF_HEADER
+					       + IBUF_TREE_SEG_HEADER,
+					       IBUF_TREE_ROOT_PAGE_NO,
+					       FSP_UP, mtr);
+		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
+
+		block = buf_page_get(space, zip_size, page_no,
+				     RW_X_LATCH, mtr);
+	} else {
+		/* The top segment header is stored on the root page that
+		fseg_create() allocates here */
+		block = fseg_create(space, 0,
+				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);
+	}
+
+	if (block == NULL) {
+
+		return(FIL_NULL);
+	}
+
+	page_no = buf_block_get_page_no(block);
+	frame = buf_block_get_frame(block);
+
+	buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+	if (type & DICT_IBUF) {
+		/* It is an insert buffer tree: initialize the free list */
+
+		ut_ad(page_no == IBUF_TREE_ROOT_PAGE_NO);
+
+		flst_init(frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST, mtr);
+	} else {
+		/* It is a non-ibuf tree: create a file segment for leaf
+		pages */
+		if (!fseg_create(space, page_no,
+				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
+			/* Not enough space for new segment, free root
+			segment before return. */
+			btr_free_root(space, zip_size, page_no, mtr);
+
+			return(FIL_NULL);
+		}
+
+		/* The fseg create acquires a second latch on the page,
+		therefore we must declare it: */
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+	}
+
+	/* Create a new index page on the allocated segment page */
+	page_zip = buf_block_get_page_zip(block);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		/* The level (0) is passed to page_create_zip() directly */
+		page = page_create_zip(block, index, 0, mtr);
+	} else {
+		page = page_create(block, mtr,
+				   dict_table_is_comp(index->table));
+		/* Set the level of the new index page */
+		btr_page_set_level(page, NULL, 0, mtr);
+	}
+
+	block->check_index_page_at_flush = TRUE;
+
+	/* Set the index id of the page */
+	btr_page_set_index_id(page, page_zip, index_id, mtr);
+
+	/* Set the next node and previous node fields */
+	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
+	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);
+
+	/* We reset the free bits for the page to allow creation of several
+	trees in the same mtr, otherwise the latch on a bitmap page would
+	prevent it because of the latching order */
+
+	if (!(type & DICT_CLUSTERED)) {
+		ibuf_reset_free_bits(block);
+	}
+
+	/* In the following assertion we test that two records of maximum
+	allowed size fit on the root page: this fact is needed to ensure
+	correctness of split algorithms */
+
+	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);
+
+	return(page_no);
+}
+
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root.  The leaf segment is released first with
+fseg_free_step() and the top segment after it with
+fseg_free_step_not_header(); every step runs in a mini-transaction of
+its own.  If the root page cannot be fetched and srv_pass_corrupt_table
+is set, the function commits the open mini-transaction and returns. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no)	/*!< in: root page number */
+{
+	mtr_t	mtr;
+	page_t*	root;
+	ibool	done;
+
+	/* Phase 1: free the leaf page segment, one step per mtr */
+	do {
+		mtr_start(&mtr);
+
+		root = btr_page_get(space, zip_size, root_page_no,
+				    RW_X_LATCH, &mtr);
+
+		if (srv_pass_corrupt_table && !root) {
+			mtr_commit(&mtr);
+			return;
+		}
+		ut_a(root);
+
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+					    + root, space));
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+		/* NOTE: page hash indexes are dropped when a page is freed
+		inside fsp0fsp. */
+
+		done = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
+				      &mtr);
+		mtr_commit(&mtr);
+	} while (!done);
+
+	/* Phase 2: free the top segment, one step per mtr */
+	do {
+		mtr_start(&mtr);
+
+		root = btr_page_get(space, zip_size, root_page_no,
+				    RW_X_LATCH, &mtr);
+
+		if (srv_pass_corrupt_table && !root) {
+			mtr_commit(&mtr);
+			return;
+		}
+		ut_a(root);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+#endif /* UNIV_BTR_DEBUG */
+
+		done = fseg_free_step_not_header(
+			root + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+		mtr_commit(&mtr);
+	} while (!done);
+}
+
+/************************************************************//**
+Frees the B-tree root page.  The rest of the tree MUST already have been
+freed.  If the root block cannot be fetched and srv_pass_corrupt_table
+is set, the function returns without doing anything. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no,	/*!< in: root page number */
+	mtr_t*	mtr)		/*!< in: a mini-transaction which has already
+				been started */
+{
+	fseg_header_t*	header;
+	buf_block_t*	block;
+	ibool		done;
+
+	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+	if (srv_pass_corrupt_table && !block) {
+		return;
+	}
+	ut_a(block);
+
+	btr_search_drop_page_hash_index(block);
+
+	/* The header of the top segment is located on the root page */
+	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_root_fseg_validate(header, space));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* Keep stepping until the segment reports that it is fully freed */
+	do {
+		done = fseg_free_step(header, mtr);
+	} while (!done);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Reorganizes an index page.
+The page is copied to a temporary block, recreated with page_create(),
+and its records are copied back from the copy.  If the block has a
+compressed page and page_zip_compress() fails, the previous contents are
+restored from the copy.
+@return	TRUE only if the page was rebuilt and its data size and maximum
+insert size after the operation equal the values measured before it;
+FALSE otherwise (including the case where recompression failed) */
+static
+ibool
+btr_page_reorganize_low(
+/*====================*/
+	ibool		recovery,/*!< in: TRUE if called in recovery:
+				locks should not be updated, i.e.,
+				there cannot exist locks on the
+				page, and a hash index should not be
+				dropped: it cannot exist */
+	buf_block_t*	block,	/*!< in: page to be reorganized */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page		= buf_block_get_frame(block);
+	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
+	buf_block_t*	temp_block;
+	page_t*		temp_page;
+	ulint		log_mode;
+	ulint		data_size1;
+	ulint		data_size2;
+	ulint		max_ins_size1;
+	ulint		max_ins_size2;
+	ibool		success		= FALSE;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	data_size1 = page_get_data_size(page);
+	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);
+
+#ifndef UNIV_HOTBACKUP
+	/* Write the log record while logging is still enabled */
+	mlog_open_and_write_index(mtr, page, index, page_is_comp(page)
+				  ? MLOG_COMP_PAGE_REORGANIZE
+				  : MLOG_PAGE_REORGANIZE, 0);
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Turn logging off; the previous mode is remembered in log_mode
+	and restored before returning */
+	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+#ifndef UNIV_HOTBACKUP
+	temp_block = buf_block_alloc(0);
+#else /* !UNIV_HOTBACKUP */
+	ut_ad(block == back_block1);
+	temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+	temp_page = temp_block->frame;
+
+	/* Copy the old page to temporary space */
+	buf_frame_copy(temp_page, page);
+
+#ifndef UNIV_HOTBACKUP
+	if (UNIV_LIKELY(!recovery)) {
+		btr_search_drop_page_hash_index(block);
+	}
+
+	block->check_index_page_at_flush = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Recreate the page: note that global data on page (possible
+	segment headers, next page-field, etc.) is preserved intact */
+
+	page_create(block, mtr, dict_table_is_comp(index->table));
+
+	/* Copy the records from the temporary space to the recreated page;
+	do not copy the lock bits yet */
+
+	page_copy_rec_list_end_no_locks(block, temp_block,
+					page_get_infimum_rec(temp_page),
+					index, mtr);
+
+	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+		/* Copy max trx id to recreated page */
+		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
+		page_set_max_trx_id(block, NULL, max_trx_id, mtr);
+		/* In crash recovery, dict_index_is_sec_or_ibuf() always
+		returns TRUE, even for clustered indexes.  max_trx_id is
+		unused in clustered index pages. */
+		ut_ad(!ut_dulint_is_zero(max_trx_id) || recovery);
+	}
+
+	if (UNIV_LIKELY_NULL(page_zip)
+	    && UNIV_UNLIKELY
+	    (!page_zip_compress(page_zip, page, index, NULL))) {
+
+		/* Compression of the rebuilt page failed.  Restore the old
+		page from the temporary copy and exit; success stays
+		FALSE. */
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+		/* Check that the bytes that we skip are identical. */
+		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
+		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
+			     PAGE_HEADER + PAGE_N_RECS + temp_page,
+			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
+		ut_a(!memcmp(UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + page,
+			     UNIV_PAGE_SIZE - FIL_PAGE_DATA_END + temp_page,
+			     FIL_PAGE_DATA_END));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
+		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
+		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
+		       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+		ut_a(!memcmp(page, temp_page, UNIV_PAGE_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+		goto func_exit;
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (UNIV_LIKELY(!recovery)) {
+		/* Update the record lock bitmaps */
+		lock_move_reorganize_page(block, temp_block);
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	data_size2 = page_get_data_size(page);
+	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);
+
+	if (UNIV_UNLIKELY(data_size1 != data_size2)
+	    || UNIV_UNLIKELY(max_ins_size1 != max_ins_size2)) {
+		buf_page_print(page, 0);
+		buf_page_print(temp_page, 0);
+		fprintf(stderr,
+			"InnoDB: Error: page old data size %lu"
+			" new data size %lu\n"
+			"InnoDB: Error: page old max ins size %lu"
+			" new max ins size %lu\n"
+			"InnoDB: Submit a detailed bug report"
+			" to http://bugs.mysql.com\n",
+			(unsigned long) data_size1, (unsigned long) data_size2,
+			(unsigned long) max_ins_size1,
+			(unsigned long) max_ins_size2);
+	} else {
+		success = TRUE;
+	}
+
+func_exit:
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+#ifndef UNIV_HOTBACKUP
+	buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Restore logging mode */
+	mtr_set_log_mode(mtr, log_mode);
+
+	return(success);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Reorganizes an index page outside of recovery: this is
+btr_page_reorganize_low() invoked with recovery == FALSE.
+IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
+page of a non-clustered index, the caller must update the insert
+buffer free bits in the same mini-transaction in such a way that the
+modification will be redo-logged.
+@return	TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+btr_page_reorganize(
+/*================*/
+	buf_block_t*	block,	/*!< in: page to be reorganized */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ibool	success;
+
+	success = btr_page_reorganize_low(FALSE, block, index, mtr);
+
+	return(success);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of reorganizing a page and, when a block is
+given, applies it by reorganizing that page in recovery mode.  The result
+of the reorganization is not checked here.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr __attribute__((unused)),
+				/*!< in: buffer end */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	/* Apart from the record initial part there is nothing to read,
+	so ptr is returned unchanged in every case */
+
+	if (UNIV_UNLIKELY(block == NULL)) {
+
+		return(ptr);
+	}
+
+	btr_page_reorganize_low(TRUE, block, index, mtr);
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Empties an index page by recreating it at the given B-tree level, after
+dropping the adaptive hash index entries of the page.
+@see btr_page_create(). */
+static
+void
+btr_page_empty(
+/*===========*/
+	buf_block_t*	block,	/*!< in: page to be emptied */
+	page_zip_des_t*	page_zip,/*!< out: compressed page, or NULL */
+	dict_index_t*	index,	/*!< in: index of the page */
+	ulint		level,	/*!< in: the B-tree level of the page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*	page = buf_block_get_frame(block);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_zip == buf_block_get_page_zip(block));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	btr_search_drop_page_hash_index(block);
+
+	/* Recreate the page: note that global data on page (possible
+	segment headers, next page-field, etc.) is preserved intact */
+
+	if (page_zip == NULL) {
+		/* Uncompressed page: the level is written separately */
+		page_create(block, mtr, dict_table_is_comp(index->table));
+		btr_page_set_level(page, NULL, level, mtr);
+	} else {
+		/* Compressed page: the level is given to the create call */
+		page_create_zip(block, index, level, mtr);
+	}
+
+	block->check_index_page_at_flush = TRUE;
+}
+
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+The records of the root are moved to a newly allocated page, the root
+is emptied and given a single node pointer to the new page, and the
+insert is then completed by btr_page_split_and_insert() on the new page.
+@return	inserted record (the return value of
+btr_page_split_and_insert()) */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
+				on the root page; when the function returns,
+				the cursor is positioned on the predecessor
+				of the inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_index_t*	index;
+	page_t*		root;
+	page_t*		new_page;
+	ulint		new_page_no;
+	rec_t*		rec;
+	mem_heap_t*	heap;
+	dtuple_t*	node_ptr;
+	ulint		level;
+	rec_t*		node_ptr_rec;
+	page_cur_t*	page_cursor;
+	page_zip_des_t*	root_page_zip;
+	page_zip_des_t*	new_page_zip;
+	buf_block_t*	root_block;
+	buf_block_t*	new_block;
+
+	root = btr_cur_get_page(cursor);
+	root_block = btr_cur_get_block(cursor);
+	root_page_zip = buf_block_get_page_zip(root_block);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!root_page_zip || page_zip_validate(root_page_zip, root));
+#endif /* UNIV_ZIP_DEBUG */
+	index = btr_cur_get_index(cursor);
+#ifdef UNIV_BTR_DEBUG
+	if (!dict_index_is_ibuf(index)) {
+		ulint	space = dict_index_get_space(index);
+
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+					    + root, space));
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+	}
+
+	ut_a(dict_index_get_page(index) == page_get_page_no(root));
+#endif /* UNIV_BTR_DEBUG */
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Allocate a new page to the tree. Root splitting is done by first
+	moving the root records to the new page, emptying the root, putting
+	a node pointer to the new page, and then splitting the new page.
+	The new page is created on the current level of the root; the root
+	itself is recreated one level higher further below. */
+
+	level = btr_page_get_level(root, mtr);
+
+	new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr);
+	new_page = buf_block_get_frame(new_block);
+	new_page_zip = buf_block_get_page_zip(new_block);
+	ut_a(!new_page_zip == !root_page_zip);
+	ut_a(!new_page_zip
+	     || page_zip_get_size(new_page_zip)
+	     == page_zip_get_size(root_page_zip));
+
+	btr_page_create(new_block, new_page_zip, index, level, mtr);
+
+	/* Set the next node and previous node fields of new page */
+	btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
+	btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
+
+	/* Copy the records from root to the new page one by one.
+	If that copy fails (or, when UNIV_ZIP_COPY is defined, whenever the
+	new page is compressed), fall back to the byte for byte copy in
+	the branch below; that branch requires a compressed page. */
+
+	if (0
+#ifdef UNIV_ZIP_COPY
+	    || new_page_zip
+#endif /* UNIV_ZIP_COPY */
+	    || UNIV_UNLIKELY
+	    (!page_copy_rec_list_end(new_block, root_block,
+				     page_get_infimum_rec(root),
+				     index, mtr))) {
+		ut_a(new_page_zip);
+
+		/* Copy the page byte for byte. */
+		page_zip_copy_recs(new_page_zip, new_page,
+				   root_page_zip, root, index, mtr);
+
+		/* Update the lock table and possible hash index. */
+
+		lock_move_rec_list_end(new_block, root_block,
+				       page_get_infimum_rec(root));
+
+		btr_search_move_or_delete_hash_entries(new_block, root_block,
+						       index);
+	}
+
+	/* If this is a pessimistic insert which is actually done to
+	perform a pessimistic update then we have stored the lock
+	information of the record to be inserted on the infimum of the
+	root page: we cannot discard the lock structs on the root page */
+
+	lock_update_root_raise(new_block, root_block);
+
+	/* Create a memory heap where the node pointer is stored */
+	heap = mem_heap_create(100);
+
+	rec = page_rec_get_next(page_get_infimum_rec(new_page));
+	new_page_no = buf_block_get_page_no(new_block);
+
+	/* Build the node pointer (= node key and page address) for the
+	child */
+
+	node_ptr = dict_index_build_node_ptr(index, rec, new_page_no, heap,
+					     level);
+	/* The node pointer must be marked as the predefined minimum record,
+	as there is no lower alphabetical limit to records in the leftmost
+	node of a level: */
+	dtuple_set_info_bits(node_ptr,
+			     dtuple_get_info_bits(node_ptr)
+			     | REC_INFO_MIN_REC_FLAG);
+
+	/* Rebuild the root page to get free space; it becomes a page of
+	level + 1 */
+	btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
+
+	/* Set the next node and previous node fields, although
+	they should already have been set.  The previous node field
+	must be FIL_NULL if root_page_zip != NULL, because the
+	REC_INFO_MIN_REC_FLAG (of the first user record) will be
+	set if and only if btr_page_get_prev() == FIL_NULL. */
+	btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
+	btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	/* Insert node pointer to the root */
+
+	page_cur_set_before_first(root_block, page_cursor);
+
+	node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
+					     index, 0, mtr);
+
+	/* The root page should only contain the node pointer
+	to new_page at this point.  Thus, the data should fit. */
+	ut_a(node_ptr_rec);
+
+	/* Free the memory heap */
+	mem_heap_free(heap);
+
+	/* We play safe and reset the free bits for the new page
+	(done only for non-clustered indexes, see below) */
+
+#if 0
+	fprintf(stderr, "Root raise new page no %lu\n", new_page_no);
+#endif
+
+	if (!dict_index_is_clust(index)) {
+		ibuf_reset_free_bits(new_block);
+	}
+
+	/* Reposition the cursor to the child node */
+	page_cur_search(new_block, index, tuple,
+			PAGE_CUR_LE, page_cursor);
+
+	/* Split the child and insert tuple */
+	return(btr_page_split_and_insert(cursor, tuple, n_ext, mtr));
+}
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the left.  A split is recommended when the record following
+the insert point is the one recorded as PAGE_LAST_INSERT on the page.
+@return	TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
+	rec_t**		split_rec) /*!< out: if split recommended,
+				the first record on upper half page:
+				either the insert point itself or the
+				record following it; not written when
+				FALSE is returned */
+{
+	page_t*	page		= btr_cur_get_page(cursor);
+	rec_t*	insert_point	= btr_cur_get_rec(cursor);
+	rec_t*	infimum;
+
+	if (page_header_get_ptr(page, PAGE_LAST_INSERT)
+	    != page_rec_get_next(insert_point)) {
+
+		/* No left-converging insert pattern detected */
+		return(FALSE);
+	}
+
+	infimum = page_get_infimum_rec(page);
+
+	/* If the convergence is in the middle of a page, include also
+	the record immediately before the new insert to the upper
+	page. Otherwise, we could repeatedly move from page to page
+	lots of records smaller than the convergence point. */
+
+	if (infimum == insert_point
+	    || page_rec_get_next(infimum) == insert_point) {
+
+		/* The insert point is the infimum or the record right
+		after it: split after the insert point */
+		*split_rec = page_rec_get_next(insert_point);
+	} else {
+		*split_rec = insert_point;
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Decides if the page should be split at the convergence point of inserts
+converging to the right.  A split is recommended when the insert point is
+the record recorded as PAGE_LAST_INSERT on the page.
+@return	TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
+	rec_t**		split_rec) /*!< out: if split recommended,
+				the first record on upper half page,
+				or NULL if tuple to be inserted should
+				be first; not written when FALSE is
+				returned */
+{
+	page_t*	page		= btr_cur_get_page(cursor);
+	rec_t*	insert_point	= btr_cur_get_rec(cursor);
+	rec_t*	next_rec;
+	rec_t*	chosen;
+
+	/* We use eager heuristics: if the new insert would be right after
+	the previous insert on the same page, we assume that there is a
+	pattern of sequential inserts here. */
+
+	if (UNIV_UNLIKELY(page_header_get_ptr(page, PAGE_LAST_INSERT)
+			  != insert_point)) {
+
+		return(FALSE);
+	}
+
+	/* By default, split at the new record to insert */
+	chosen = NULL;
+
+	next_rec = page_rec_get_next(insert_point);
+
+	if (!page_rec_is_supremum(next_rec)) {
+		rec_t*	next_next_rec = page_rec_get_next(next_rec);
+
+		/* If there are >= 2 user records up from the insert
+		point, split all but 1 off. We want to keep one because
+		then sequential inserts can use the adaptive hash
+		index, as they can do the necessary checks of the right
+		search position just by looking at the records on this
+		page. */
+
+		if (!page_rec_is_supremum(next_next_rec)) {
+			chosen = next_next_rec;
+		}
+	}
+
+	*split_rec = chosen;
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Calculates a split record such that the tuple will certainly fit on
+its half-page when the split is performed. We assume in this function
+only that the cursor page has at least one user record.
+Records are counted in page order, with the tuple taken into account at
+the cursor position, until roughly half of the total space is covered.
+@return split record, or NULL if tuple will be the first record on
+the lower or upper half-page (determined by btr_page_tuple_smaller()) */
+static
+rec_t*
+btr_page_get_split_rec(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which insert should be made */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	page_t*		page		= btr_cur_get_page(cursor);
+	page_zip_des_t*	page_zip;
+	ulint		tuple_size;
+	ulint		empty_space;
+	ulint		total_data;
+	ulint		total_n_recs;
+	ulint		total_space;
+	ulint		left_data	= 0;
+	ulint		n_left		= 0;
+	rec_t*		ins_rec;
+	rec_t*		rec;
+	rec_t*		next_rec;
+	mem_heap_t*	heap		= NULL;
+	ulint*		offsets		= NULL;
+
+	tuple_size  = rec_get_converted_size(cursor->index, tuple, n_ext);
+	empty_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	page_zip = btr_cur_get_page_zip(cursor);
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		/* Estimate the free space of an empty compressed page. */
+		ulint	free_space_zip = page_zip_empty_size(
+			cursor->index->n_fields,
+			page_zip_get_size(page_zip));
+
+		/* Use the smaller of the two limits */
+		if (UNIV_LIKELY(empty_space > (ulint) free_space_zip)) {
+			empty_space = (ulint) free_space_zip;
+		}
+	}
+
+	/* empty_space is now the free space of a created new page */
+
+	total_data   = page_get_data_size(page) + tuple_size;
+	total_n_recs = page_get_n_recs(page) + 1;
+	ut_ad(total_n_recs >= 2);
+	total_space  = total_data + page_dir_calc_reserved_space(total_n_recs);
+
+	ins_rec = btr_cur_get_rec(cursor);
+	rec = page_get_infimum_rec(page);
+
+	/* We start to include records to the left half, and when the
+	space reserved by them exceeds half of total_space, then if
+	the included records fit on the left page, they will be put there
+	if something was left over also for the right page,
+	otherwise the last included record will be the first on the right
+	half page */
+
+	for (;;) {
+		/* Decide the next record to include; rec == NULL denotes
+		that the tuple itself is the one being included */
+		if (rec == ins_rec) {
+			rec = NULL;
+		} else {
+			rec = page_rec_get_next(rec != NULL ? rec : ins_rec);
+		}
+
+		if (rec != NULL) {
+			offsets = rec_get_offsets(rec, cursor->index,
+						  offsets, ULINT_UNDEFINED,
+						  &heap);
+			left_data += rec_offs_size(offsets);
+		} else {
+			/* Include tuple */
+			left_data += tuple_size;
+		}
+
+		n_left++;
+
+		if (left_data + page_dir_calc_reserved_space(n_left)
+		    >= total_space / 2) {
+
+			break;
+		}
+	}
+
+	if (left_data + page_dir_calc_reserved_space(n_left)
+	    <= empty_space) {
+		/* The next record will be the first on
+		the right half page if it is not the
+		supremum record of page */
+
+		if (rec == ins_rec) {
+			/* The tuple is next: it will be first */
+			rec = NULL;
+		} else {
+			next_rec = page_rec_get_next(
+				rec != NULL ? rec : ins_rec);
+			ut_ad(next_rec);
+
+			if (!page_rec_is_supremum(next_rec)) {
+				rec = next_rec;
+			}
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(rec);
+}
+
+/*************************************************************//**
+Returns TRUE if the insert fits on the appropriate half-page with the
+chosen split_rec.  The records that would move to the other half-page are
+subtracted one at a time from the page totals until the remainder,
+including the tuple, fits on an empty page or no such records are left.
+@return	TRUE if fits */
+static
+ibool
+btr_page_insert_fits(
+/*=================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which insert
+				should be made */
+	const rec_t*	split_rec,/*!< in: suggestion for first record
+				on upper half-page, or NULL if
+				tuple to be inserted should be first */
+	const ulint*	offsets,/*!< in: rec_get_offsets(
+				split_rec, cursor->index) */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mem_heap_t*	heap)	/*!< in: temporary memory heap */
+{
+	page_t*		page		= btr_cur_get_page(cursor);
+	ulint		tuple_size;
+	ulint		empty_space;
+	ulint		total_data;
+	ulint		total_n_recs;
+	const rec_t*	rec;
+	const rec_t*	end_rec;
+	ulint*		offs		= NULL;
+	ibool		fits;
+
+	ut_ad(!split_rec == !offsets);
+	ut_ad(!offsets
+	      || !page_is_comp(page) == !rec_offs_comp(offsets));
+	ut_ad(!offsets
+	      || rec_offs_validate(split_rec, cursor->index, offsets));
+
+	tuple_size  = rec_get_converted_size(cursor->index, tuple, n_ext);
+	empty_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	/* empty_space is now the free space of a created new page */
+
+	total_data   = page_get_data_size(page) + tuple_size;
+	total_n_recs = page_get_n_recs(page) + 1;
+
+	/* We determine which records (from rec to end_rec, not including
+	end_rec) will end up on the other half page from tuple when it is
+	inserted. */
+
+	if (split_rec == NULL) {
+		rec = page_rec_get_next(page_get_infimum_rec(page));
+		end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+	} else if (cmp_dtuple_rec(tuple, split_rec, offsets) >= 0) {
+
+		rec = page_rec_get_next(page_get_infimum_rec(page));
+		end_rec = split_rec;
+	} else {
+		rec = split_rec;
+		end_rec = page_get_supremum_rec(page);
+	}
+
+	/* First test the page as a whole, then keep removing records
+	until there is enough available space on the half page where
+	the tuple is inserted */
+
+	fits = (total_data + page_dir_calc_reserved_space(total_n_recs)
+		<= empty_space);
+
+	while (!fits && rec != end_rec) {
+		/* Calculate the amount of reserved space after rec is
+		removed from page. */
+
+		offs = rec_get_offsets(rec, cursor->index, offs,
+				       ULINT_UNDEFINED, &heap);
+
+		total_data -= rec_offs_size(offs);
+		total_n_recs--;
+
+		fits = (total_data
+			+ page_dir_calc_reserved_space(total_n_recs)
+			<= empty_space);
+
+		if (!fits) {
+			rec = page_rec_get_next_const(rec);
+		}
+	}
+
+	return(fits);
+}
+
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level: the cursor is
+positioned on the given level and the tuple is inserted there with a
+pessimistic insert, which is asserted to succeed.  It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level, must be > 0 */
+	dtuple_t*	tuple,	/*!< in: the record to be inserted */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const ulint	flags = BTR_NO_LOCKING_FLAG
+		| BTR_KEEP_SYS_FLAG
+		| BTR_NO_UNDO_LOG_FLAG;
+	btr_cur_t	cursor;
+	big_rec_t*	dummy_big_rec;
+	rec_t*		rec;
+	ulint		err;
+
+	ut_ad(level > 0);
+
+	/* Find the insert position on the requested level */
+	btr_cur_search_to_nth_level(index, level, tuple, PAGE_CUR_LE,
+				    BTR_CONT_MODIFY_TREE,
+				    &cursor, 0, file, line, mtr);
+
+	/* rec and dummy_big_rec only receive output that is not used */
+	err = btr_cur_pessimistic_insert(flags, &cursor, tuple, &rec,
+					 &dummy_big_rec, 0, NULL, mtr);
+	ut_a(err == DB_SUCCESS);
+}
+
+/**************************************************************//**
+Attaches the halves of an index page on the appropriate level in an
+index tree. */
+static
+void
+btr_attach_half_pages(
+/*==================*/
+	dict_index_t*	index,		/*!< in: the index tree */
+	buf_block_t*	block,		/*!< in/out: page to be split */
+	rec_t*		split_rec,	/*!< in: first record on upper
+					half page */
+	buf_block_t*	new_block,	/*!< in/out: the new half page */
+	ulint		direction,	/*!< in: FSP_UP or FSP_DOWN */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint		space;
+	ulint		zip_size;
+	ulint		prev_page_no;
+	ulint		next_page_no;
+	ulint		level;
+	page_t*		page		= buf_block_get_frame(block);
+	page_t*		lower_page;
+	page_t*		upper_page;
+	ulint		lower_page_no;
+	ulint		upper_page_no;
+	page_zip_des_t*	lower_page_zip;
+	page_zip_des_t*	upper_page_zip;
+	dtuple_t*	node_ptr_upper;
+	mem_heap_t*	heap;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Create a memory heap where the data tuple is stored */
+	heap = mem_heap_create(1024);
+
+	/* Based on split direction, decide upper and lower pages */
+	if (direction == FSP_DOWN) {
+
+		btr_cur_t	cursor;
+		ulint*		offsets;
+
+		lower_page = buf_block_get_frame(new_block);
+		lower_page_no = buf_block_get_page_no(new_block);
+		lower_page_zip = buf_block_get_page_zip(new_block);
+		upper_page = buf_block_get_frame(block);
+		upper_page_no = buf_block_get_page_no(block);
+		upper_page_zip = buf_block_get_page_zip(block);
+
+		/* Look up the index for the node pointer to page */
+		offsets = btr_page_get_father_block(NULL, heap, index,
+						    block, mtr, &cursor);
+
+		/* Replace the address of the old child node (= page) with the
+		address of the new lower half */
+
+		btr_node_ptr_set_child_page_no(
+			btr_cur_get_rec(&cursor),
+			btr_cur_get_page_zip(&cursor),
+			offsets, lower_page_no, mtr);
+		mem_heap_empty(heap);
+	} else {
+		lower_page = buf_block_get_frame(block);
+		lower_page_no = buf_block_get_page_no(block);
+		lower_page_zip = buf_block_get_page_zip(block);
+		upper_page = buf_block_get_frame(new_block);
+		upper_page_no = buf_block_get_page_no(new_block);
+		upper_page_zip = buf_block_get_page_zip(new_block);
+	}
+
+	/* Get the level of the split pages */
+	level = btr_page_get_level(buf_block_get_frame(block), mtr);
+	ut_ad(level
+	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));
+
+	/* Build the node pointer (= node key and page address) for the upper
+	half */
+
+	node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
+						   upper_page_no, heap, level);
+
+	/* Insert it next to the pointer to the lower half. Note that this
+	may generate recursion leading to a split on the higher level. */
+
+	btr_insert_on_non_leaf_level(index, level + 1, node_ptr_upper, mtr);
+
+	/* Free the memory heap */
+	mem_heap_free(heap);
+
+	/* Get the previous and next pages of page */
+
+	prev_page_no = btr_page_get_prev(page, mtr);
+	next_page_no = btr_page_get_next(page, mtr);
+	space = buf_block_get_space(block);
+	zip_size = buf_block_get_zip_size(block);
+
+	/* Update page links of the level */
+
+	if (prev_page_no != FIL_NULL) {
+		buf_block_t*	prev_block = btr_block_get(space, zip_size,
+							   prev_page_no,
+							   RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
+		ut_a(btr_page_get_next(prev_block->frame, mtr)
+		     == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+
+		btr_page_set_next(buf_block_get_frame(prev_block),
+				  buf_block_get_page_zip(prev_block),
+				  lower_page_no, mtr);
+	}
+
+	if (next_page_no != FIL_NULL) {
+		buf_block_t*	next_block = btr_block_get(space, zip_size,
+							   next_page_no,
+							   RW_X_LATCH, mtr);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
+		ut_a(btr_page_get_prev(next_block->frame, mtr)
+		     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+		btr_page_set_prev(buf_block_get_frame(next_block),
+				  buf_block_get_page_zip(next_block),
+				  upper_page_no, mtr);
+	}
+
+	btr_page_set_prev(lower_page, lower_page_zip, prev_page_no, mtr);
+	btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
+
+	btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
+	btr_page_set_next(upper_page, upper_page_zip, next_page_no, mtr);
+}
+
+/*************************************************************//**
+Checks whether a tuple compares less than the first user record of the
+page the cursor is on, i.e. whether it is smaller than any record there.
+@return TRUE if the tuple is smaller than the first user record */
+static
+ibool
+btr_page_tuple_smaller(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in: b-tree cursor */
+	const dtuple_t*	tuple,	/*!< in: tuple to consider */
+	ulint*		offsets,/*!< in/out: temporary storage */
+	ulint		n_uniq,	/*!< in: number of unique fields
+				in the index page records */
+	mem_heap_t**	heap)	/*!< in/out: heap for offsets */
+{
+	page_cur_t	page_cur;
+	const rec_t*	rec;
+	int		cmp;
+
+	/* Step a page cursor from before the first record onto the
+	first user record of the page. */
+	page_cur_set_before_first(btr_cur_get_block(cursor), &page_cur);
+	page_cur_move_to_next(&page_cur);
+	rec = page_cur_get_rec(&page_cur);
+
+	/* Compute the offsets of the first n_uniq fields of that record
+	and compare the tuple against it. */
+	offsets = rec_get_offsets(rec, cursor->index, offsets, n_uniq, heap);
+	cmp = cmp_dtuple_rec(tuple, rec, offsets);
+
+	return(cmp < 0);
+}
+
+/*************************************************************//**
+Splits an index page to halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function (only when the insert is known to fit,
+see insert_will_fit, and the page is a leaf)! NOTE that the operation of
+this function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+The body follows the numbered steps in its comments: (1) choose the
+split record and the split direction, (2) allocate and create the new
+page, (3) determine the first record of the upper half and the move
+limit, (4) attach both halves to the tree with btr_attach_half_pages(),
+(5) move the records, (6)-(7) pick the half to insert into and try the
+insert, (8) if that fails, reorganize the page and try once more.  If the
+tuple still does not fit, n_iterations is incremented and the function
+restarts from func_start to split again.  The local heap is emptied at
+each restart and freed before returning.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	ulint		page_no;
+	byte		direction;
+	ulint		hint_page_no;
+	buf_block_t*	new_block;
+	page_t*		new_page;
+	page_zip_des_t*	new_page_zip;
+	rec_t*		split_rec;
+	buf_block_t*	left_block;
+	buf_block_t*	right_block;
+	buf_block_t*	insert_block;
+	page_t*		insert_page;
+	page_cur_t*	page_cursor;
+	rec_t*		first_rec;
+	byte*		buf = 0; /* remove warning; buf is allocated only
+				 in the insert_empty branch of step 3 */
+	rec_t*		move_limit;
+	ibool		insert_will_fit;
+	ibool		insert_left;
+	ulint		n_iterations = 0; /* number of split rounds that
+					  ended in insert_failed so far */
+	rec_t*		rec;
+	mem_heap_t*	heap;
+	ulint		n_uniq;
+	ulint*		offsets;
+
+	heap = mem_heap_create(1024);
+	n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
+func_start:
+	mem_heap_empty(heap);
+	offsets = NULL;
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(cursor->index),
+				MTR_MEMO_X_LOCK));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(dict_index_get_lock(cursor->index), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_get_n_recs(page) >= 1);
+
+	page_no = buf_block_get_page_no(block);
+
+	/* 1. Decide the split record; split_rec == NULL means that the
+	tuple to be inserted should be the first record on the upper
+	half-page.  On a retry (n_iterations > 0) the split is always
+	upward and the record comes from btr_page_get_split_rec();
+	otherwise a split to the right is tried first, then a split to
+	the left, and finally a split at the middle record. */
+	insert_left = FALSE;
+
+	if (n_iterations > 0) {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+		split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
+
+		if (UNIV_UNLIKELY(split_rec == NULL)) {
+			insert_left = btr_page_tuple_smaller(
+				cursor, tuple, offsets, n_uniq, &heap);
+		}
+	} else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+
+	} else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
+		direction = FSP_DOWN;
+		hint_page_no = page_no - 1;
+		ut_ad(split_rec);
+	} else {
+		direction = FSP_UP;
+		hint_page_no = page_no + 1;
+
+		/* If there is only one record in the index page, we
+		can't split the node in the middle by default. We need
+		to determine whether the new record will be inserted
+		to the left or right. */
+
+		if (page_get_n_recs(page) > 1) {
+			split_rec = page_get_middle_rec(page);
+		} else if (btr_page_tuple_smaller(cursor, tuple,
+						  offsets, n_uniq, &heap)) {
+			split_rec = page_rec_get_next(
+				page_get_infimum_rec(page));
+		} else {
+			split_rec = NULL;
+		}
+	}
+
+	/* 2. Allocate a new page to the index; it is created on the
+	same level as the page being split */
+	new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
+				   btr_page_get_level(page, mtr), mtr);
+	new_page = buf_block_get_frame(new_block);
+	new_page_zip = buf_block_get_page_zip(new_block);
+	btr_page_create(new_block, new_page_zip, cursor->index,
+			btr_page_get_level(page, mtr), mtr);
+
+	/* 3. Calculate the first record on the upper half-page, and the
+	first record (move_limit) on original page which ends up on the
+	upper half */
+
+	if (split_rec) {
+		first_rec = move_limit = split_rec;
+
+		offsets = rec_get_offsets(split_rec, cursor->index, offsets,
+					  n_uniq, &heap);
+
+		insert_left = cmp_dtuple_rec(tuple, split_rec, offsets) < 0;
+
+		if (UNIV_UNLIKELY(!insert_left && new_page_zip
+				  && n_iterations > 0)) {
+			/* If a compressed page has already been split,
+			avoid further splits by inserting the record
+			to an empty page.  split_rec is cleared so that
+			the code below treats this like the
+			split_rec == NULL, !insert_left case. */
+			split_rec = NULL;
+			goto insert_empty;
+		}
+	} else if (UNIV_UNLIKELY(insert_left)) {
+		ut_a(n_iterations > 0);
+		first_rec = page_rec_get_next(page_get_infimum_rec(page));
+		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+	} else {
+insert_empty:
+		ut_ad(!split_rec);
+		ut_ad(!insert_left);
+		buf = mem_alloc(rec_get_converted_size(cursor->index,
+						       tuple, n_ext));
+
+		first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
+						      tuple, n_ext);
+		move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
+	}
+
+	/* 4. Do first the modifications in the tree structure */
+
+	btr_attach_half_pages(cursor->index, block,
+			      first_rec, new_block, direction, mtr);
+
+	/* If the split is made on the leaf level and the insert will fit
+	on the appropriate half-page, we may release the tree x-latch.
+	We can then move the records after releasing the tree latch,
+	thus reducing the tree latch contention.  insert_will_fit is
+	never set when the new page is compressed (new_page_zip). */
+
+	if (split_rec) {
+		insert_will_fit = !new_page_zip
+			&& btr_page_insert_fits(cursor, split_rec,
+						offsets, tuple, n_ext, heap);
+	} else {
+		if (!insert_left) { /* buf was allocated at insert_empty */
+			mem_free(buf);
+			buf = NULL;
+		}
+
+		insert_will_fit = !new_page_zip
+			&& btr_page_insert_fits(cursor, NULL,
+						NULL, tuple, n_ext, heap);
+	}
+
+	if (insert_will_fit && page_is_leaf(page)) {
+
+		mtr_memo_release(mtr, dict_index_get_lock(cursor->index),
+				 MTR_MEMO_X_LOCK);
+	}
+
+	/* 5. Move then the records to the new page */
+	if (direction == FSP_DOWN) {
+		/*		fputs("Split left\n", stderr); */
+
+		if (0
+#ifdef UNIV_ZIP_COPY
+		    || page_zip
+#endif /* UNIV_ZIP_COPY */
+		    || UNIV_UNLIKELY
+		    (!page_move_rec_list_start(new_block, block, move_limit,
+					       cursor->index, mtr))) {
+			/* For some reason, compressing new_page failed,
+			even though it should contain fewer records than
+			the original page.  Copy the page byte for byte
+			and then delete the records from both pages
+			as appropriate.  Deleting will always succeed. */
+			ut_a(new_page_zip);
+
+			page_zip_copy_recs(new_page_zip, new_page,
+					   page_zip, page, cursor->index, mtr);
+			page_delete_rec_list_end(move_limit - page + new_page,
+						 new_block, cursor->index,
+						 ULINT_UNDEFINED,
+						 ULINT_UNDEFINED, mtr);
+
+			/* Update the lock table and possible hash index. */
+
+			lock_move_rec_list_start(
+				new_block, block, move_limit,
+				new_page + PAGE_NEW_INFIMUM);
+
+			btr_search_move_or_delete_hash_entries(
+				new_block, block, cursor->index);
+
+			/* Delete the records from the source page. */
+
+			page_delete_rec_list_start(move_limit, block,
+						   cursor->index, mtr);
+		}
+
+		left_block = new_block;
+		right_block = block;
+
+		lock_update_split_left(right_block, left_block);
+	} else {
+		/*		fputs("Split right\n", stderr); */
+
+		if (0
+#ifdef UNIV_ZIP_COPY
+		    || page_zip
+#endif /* UNIV_ZIP_COPY */
+		    || UNIV_UNLIKELY
+		    (!page_move_rec_list_end(new_block, block, move_limit,
+					     cursor->index, mtr))) {
+			/* For some reason, compressing new_page failed,
+			even though it should contain fewer records than
+			the original page.  Copy the page byte for byte
+			and then delete the records from both pages
+			as appropriate.  Deleting will always succeed. */
+			ut_a(new_page_zip);
+
+			page_zip_copy_recs(new_page_zip, new_page,
+					   page_zip, page, cursor->index, mtr);
+			page_delete_rec_list_start(move_limit - page
+						   + new_page, new_block,
+						   cursor->index, mtr);
+
+			/* Update the lock table and possible hash index. */
+
+			lock_move_rec_list_end(new_block, block, move_limit);
+
+			btr_search_move_or_delete_hash_entries(
+				new_block, block, cursor->index);
+
+			/* Delete the records from the source page. */
+
+			page_delete_rec_list_end(move_limit, block,
+						 cursor->index,
+						 ULINT_UNDEFINED,
+						 ULINT_UNDEFINED, mtr);
+		}
+
+		left_block = block;
+		right_block = new_block;
+
+		lock_update_split_right(right_block, left_block);
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		ut_a(page_zip_validate(page_zip, page));
+		ut_a(page_zip_validate(new_page_zip, new_page));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* At this point, split_rec, move_limit and first_rec may point
+	to garbage on the old page. */
+
+	/* 6. The split and the tree modification is now completed. Decide the
+	page where the tuple should be inserted */
+
+	if (insert_left) {
+		insert_block = left_block;
+	} else {
+		insert_block = right_block;
+	}
+
+	insert_page = buf_block_get_frame(insert_block);
+
+	/* 7. Reposition the cursor for insert and try insertion */
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	page_cur_search(insert_block, cursor->index, tuple,
+			PAGE_CUR_LE, page_cursor);
+
+	rec = page_cur_tuple_insert(page_cursor, tuple,
+				    cursor->index, n_ext, mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+	{
+		page_zip_des_t*	insert_page_zip
+			= buf_block_get_page_zip(insert_block);
+		ut_a(!insert_page_zip
+		     || page_zip_validate(insert_page_zip, insert_page));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (UNIV_LIKELY(rec != NULL)) {
+
+		goto func_exit;
+	}
+
+	/* 8. If insert did not fit, try page reorganization */
+
+	if (UNIV_UNLIKELY
+	    (!btr_page_reorganize(insert_block, cursor->index, mtr))) {
+
+		goto insert_failed;
+	}
+
+	page_cur_search(insert_block, cursor->index, tuple,
+			PAGE_CUR_LE, page_cursor);
+	rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
+				    n_ext, mtr);
+
+	if (UNIV_UNLIKELY(rec == NULL)) {
+		/* The insert did not fit on the page: loop back to the
+		start of the function for a new split */
+insert_failed:
+		/* We play safe and reset the free bits for new_page
+		(done for non-clustered indexes only) */
+		if (!dict_index_is_clust(cursor->index)) {
+			ibuf_reset_free_bits(new_block);
+		}
+
+		/* fprintf(stderr, "Split second round %lu\n",
+		page_get_page_no(page)); */
+		n_iterations++;
+		ut_ad(n_iterations < 2
+		      || buf_block_get_page_zip(insert_block));
+		ut_ad(!insert_will_fit);
+
+		goto func_start;
+	}
+
+func_exit:
+	/* Insert fit on the page: update the free bits for the
+	left and right pages in the same mtr */
+
+	if (!dict_index_is_clust(cursor->index) && page_is_leaf(page)) {
+		ibuf_update_free_bits_for_two_pages_low(
+			buf_block_get_zip_size(left_block),
+			left_block, right_block, mtr);
+	}
+
+#if 0
+	fprintf(stderr, "Split and insert done %lu %lu\n",
+		buf_block_get_page_no(left_block),
+		buf_block_get_page_no(right_block));
+#endif
+
+	ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
+	ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
+
+	mem_heap_free(heap);
+	return(rec);
+}
+
+/*************************************************************//**
+Unlinks a page from the doubly linked list of pages on its level: the
+previous page (if any) is made to point forward to the next page, and
+the next page (if any) is made to point back to the previous page.
+The prev/next fields of the removed page itself are not modified. */
+static
+void
+btr_level_list_remove(
+/*==================*/
+	ulint		space,	/*!< in: space where removed */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	page_t*		page,	/*!< in: page to remove */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	left_no;
+	ulint	right_no;
+
+	ut_ad(page && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(space == page_get_space_id(page));
+
+	/* Read both neighbour page numbers before touching any links */
+	left_no = btr_page_get_prev(page, mtr);
+	right_no = btr_page_get_next(page, mtr);
+
+	/* Left neighbour: its next-link must skip over the page */
+	if (left_no != FIL_NULL) {
+		buf_block_t*	left_block;
+		page_t*		prev_page;
+
+		left_block = btr_block_get(space, zip_size, left_no,
+					   RW_X_LATCH, mtr);
+		prev_page = buf_block_get_frame(left_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(prev_page) == page_is_comp(page));
+		ut_a(btr_page_get_next(prev_page, mtr)
+		     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+		btr_page_set_next(prev_page,
+				  buf_block_get_page_zip(left_block),
+				  right_no, mtr);
+	}
+
+	/* Right neighbour: its prev-link must skip over the page */
+	if (right_no != FIL_NULL) {
+		buf_block_t*	right_block;
+		page_t*		next_page;
+
+		right_block = btr_block_get(space, zip_size, right_no,
+					    RW_X_LATCH, mtr);
+		next_page = buf_block_get_frame(right_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(next_page) == page_is_comp(page));
+		ut_a(btr_page_get_prev(next_page, mtr)
+		     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+		btr_page_set_prev(next_page,
+				  buf_block_get_page_zip(right_block),
+				  left_no, mtr);
+	}
+}
+
+/****************************************************************//**
+Writes the redo log record for marking an index record as the predefined
+minimum record: an initial log record of the given type followed by the
+offset of the record within its page. */
+UNIV_INLINE
+void
+btr_set_min_rec_mark_log(
+/*=====================*/
+	rec_t*	rec,	/*!< in: record */
+	byte	type,	/*!< in: MLOG_COMP_REC_MIN_MARK or MLOG_REC_MIN_MARK */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint	rec_offset;
+
+	mlog_write_initial_log_record(rec, type, mtr);
+
+	/* The record is identified by its offset within the page,
+	appended to the log as a 2-byte ulint */
+	rec_offset = page_offset(rec);
+
+	mlog_catenate_ulint(mtr, rec_offset, MLOG_2BYTES);
+}
+#else /* !UNIV_HOTBACKUP */
+# define btr_set_min_rec_mark_log(rec,comp,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.  The record body is a 2-byte offset of the record within
+the page; when a page is given, the mark is applied to that record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	ulint	comp,	/*!< in: nonzero=compact page format */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+{
+	/* The buffer must hold the whole 2-byte offset */
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	/* Without a page there is nothing to apply: only skip the body */
+	if (!page) {
+
+		return(ptr + 2);
+	}
+
+	ut_a(!page_is_comp(page) == !comp);
+
+	btr_set_min_rec_mark(page + mach_read_from_2(ptr), mtr);
+
+	return(ptr + 2);
+}
+
+/****************************************************************//**
+Sets a record as the predefined minimum record by adding
+REC_INFO_MIN_REC_FLAG to its info bits, and logs the change with the
+log record type that matches the record format. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+	rec_t*	rec,	/*!< in: record */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	if (UNIV_UNLIKELY(!page_rec_is_comp(rec))) {
+		/* Record is not in the compact format */
+		ulint	marked_bits;
+
+		marked_bits = rec_get_info_bits(rec, FALSE)
+			| REC_INFO_MIN_REC_FLAG;
+
+		rec_set_info_bits_old(rec, marked_bits);
+
+		btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
+	} else {
+		/* Record is in the compact format */
+		ulint	marked_bits;
+
+		marked_bits = rec_get_info_bits(rec, TRUE)
+			| REC_INFO_MIN_REC_FLAG;
+
+		rec_set_info_bits_new(rec, marked_bits);
+
+		btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page.  The deletion
+must succeed; if it did not already compress the father page, a
+compression of that page is attempted afterwards. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page whose node pointer is deleted */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	btr_cur_t	father_cursor;
+	ulint		err;
+	ibool		was_compressed;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Position a cursor on the node pointer in the father page and
+	delete it there */
+	btr_page_get_father(index, block, mtr, &father_cursor);
+
+	was_compressed = btr_cur_pessimistic_delete(&err, TRUE,
+						    &father_cursor,
+						    RB_NONE, mtr);
+	ut_a(err == DB_SUCCESS);
+
+	if (was_compressed) {
+
+		return;
+	}
+
+	btr_cur_compress_if_useful(&father_cursor, mtr);
+}
+
+/*************************************************************//**
+If page is the only on its level, this function moves its records to the
+father page, thus reducing the tree height.  The father page is emptied
+and takes over the level of the lifted page, every ancestor of the father
+up to the root has its level decremented by one, and the lifted page is
+freed.  All ancestors are looked up before the father is modified. */
+static
+void
+btr_lift_page_up(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only on its level;
+				must not be empty: use
+				btr_discard_only_page_on_level if the last
+				record from the page should be removed */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	father_block;
+	page_t*		father_page;
+	ulint		page_level;
+	page_zip_des_t*	father_page_zip;
+	page_t*		page		= buf_block_get_frame(block);
+	ulint		root_page_no;
+	buf_block_t*	blocks[BTR_MAX_LEVELS];
+	ulint		n_blocks;	/*!< last used index in blocks[] */
+	ulint		i;
+
+	ut_ad(btr_page_get_prev(page, mtr) == FIL_NULL);
+	ut_ad(btr_page_get_next(page, mtr) == FIL_NULL);
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	page_level = btr_page_get_level(page, mtr);
+	root_page_no = dict_index_get_page(index);
+
+	{
+		btr_cur_t	cursor;
+		mem_heap_t*	heap	= mem_heap_create(100);
+		ulint*		offsets;
+		buf_block_t*	b;
+
+		offsets = btr_page_get_father_block(NULL, heap, index,
+						    block, mtr, &cursor);
+		father_block = btr_cur_get_block(&cursor);
+		father_page_zip = buf_block_get_page_zip(father_block);
+		father_page = buf_block_get_frame(father_block);
+
+		n_blocks = 0;
+
+		/* Store all ancestor pages so we can reset their
+		levels later on.  We have to do all the searches on
+		the tree now because later on, after we've replaced
+		the first level, the tree is in an inconsistent state
+		and can not be searched.  blocks[] receives the
+		ancestors of father_block (not father_block itself),
+		nearest ancestor first. */
+		for (b = father_block;
+		     buf_block_get_page_no(b) != root_page_no; ) {
+			ut_a(n_blocks < BTR_MAX_LEVELS);
+
+			offsets = btr_page_get_father_block(offsets, heap,
+							    index, b,
+							    mtr, &cursor);
+
+			blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
+		}
+
+		mem_heap_free(heap);
+	}
+
+	btr_search_drop_page_hash_index(block);
+
+	/* Make the father empty; it takes over the level of the
+	lifted page */
+	btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
+
+	/* The father now sits on page_level, so its nearest ancestor
+	blocks[0] must end up one level above that.  Advance page_level
+	so that the loop below assigns blocks[i] its old level minus one
+	instead of giving blocks[0] the same level as the father. */
+	page_level++;
+
+	/* Copy the records to the father page one by one. */
+	if (0
+#ifdef UNIV_ZIP_COPY
+	    || father_page_zip
+#endif /* UNIV_ZIP_COPY */
+	    || UNIV_UNLIKELY
+	    (!page_copy_rec_list_end(father_block, block,
+				     page_get_infimum_rec(page),
+				     index, mtr))) {
+		const page_zip_des_t*	page_zip
+			= buf_block_get_page_zip(block);
+		ut_a(father_page_zip);
+		ut_a(page_zip);
+
+		/* Copy the page byte for byte. */
+		page_zip_copy_recs(father_page_zip, father_page,
+				   page_zip, page, index, mtr);
+
+		/* Update the lock table and possible hash index. */
+
+		lock_move_rec_list_end(father_block, block,
+				       page_get_infimum_rec(page));
+
+		btr_search_move_or_delete_hash_entries(father_block, block,
+						       index);
+	}
+
+	lock_update_copy_and_discard(father_block, block);
+
+	/* Go upward to root page, decrementing levels by one. */
+	for (i = 0; i < n_blocks; i++, page_level++) {
+		page_t*		page	= buf_block_get_frame(blocks[i]);
+		page_zip_des_t*	page_zip= buf_block_get_page_zip(blocks[i]);
+
+		ut_ad(btr_page_get_level(page, mtr) == page_level + 1);
+
+		btr_page_set_level(page, page_zip, page_level, mtr);
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	}
+
+	/* Free the file page */
+	btr_page_free(index, block, mtr);
+
+	/* We play it safe and reset the free bits for the father */
+	if (!dict_index_is_clust(index)) {
+		ibuf_reset_free_bits(father_block);
+	}
+	ut_ad(page_validate(father_page, index));
+	ut_ad(btr_check_node_ptr(index, father_block, mtr));
+}
+
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the brother
+reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to the
+brothers, if they exist.
+
+NOTE(review): as written, the body selects the left brother whenever
+left_page_no != FIL_NULL and the right brother only when there is no left
+brother; it does not examine where the node pointers reside and does not
+fall back to the right brother when merging to the left fails.
+
+FALSE is returned (after freeing the local heap) when the records do not
+fit in the merge page even after reorganization, when
+btr_page_reorganize() fails, or when copying the record list fails.  On
+success the merged-away page is freed with btr_page_free().
+@return	TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to merge or lift;
+				the page must not be empty: in record delete
+				use btr_discard_page if the page would become
+				empty */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_index_t*	index;
+	ulint		space;
+	ulint		zip_size;
+	ulint		left_page_no;
+	ulint		right_page_no;
+	buf_block_t*	merge_block;
+	page_t*		merge_page;
+	page_zip_des_t*	merge_page_zip;
+	ibool		is_left;
+	buf_block_t*	block;
+	page_t*		page;
+	btr_cur_t	father_cursor;
+	mem_heap_t*	heap;
+	ulint*		offsets;
+	ulint		data_size;
+	ulint		n_recs;
+	ulint		max_ins_size;
+	ulint		max_ins_size_reorg;
+	ulint		level;
+
+	block = btr_cur_get_block(cursor);
+	page = btr_cur_get_page(cursor);
+	index = btr_cur_get_index(cursor);
+	ut_a((ibool) !!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	level = btr_page_get_level(page, mtr);
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	left_page_no = btr_page_get_prev(page, mtr);
+	right_page_no = btr_page_get_next(page, mtr);
+
+#if 0
+	fprintf(stderr, "Merge left page %lu right %lu \n",
+		left_page_no, right_page_no);
+#endif
+
+	heap = mem_heap_create(100);
+	offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
+					    &father_cursor);
+
+	/* Decide the page to which we try to merge and which will inherit
+	the locks */
+
+	is_left = left_page_no != FIL_NULL; /* a left brother, if any, wins */
+
+	if (is_left) {
+
+		merge_block = btr_block_get(space, zip_size, left_page_no,
+					    RW_X_LATCH, mtr);
+		merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_page_get_next(merge_page, mtr)
+		     == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+	} else if (right_page_no != FIL_NULL) {
+
+		merge_block = btr_block_get(space, zip_size, right_page_no,
+					    RW_X_LATCH, mtr);
+		merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_page_get_prev(merge_page, mtr)
+		     == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+	} else {
+		/* The page is the only one on the level, lift the records
+		to the father */
+		btr_lift_page_up(index, block, mtr);
+		mem_heap_free(heap);
+		return(TRUE);
+	}
+
+	n_recs = page_get_n_recs(page);
+	data_size = page_get_data_size(page);
+#ifdef UNIV_BTR_DEBUG
+	ut_a(page_is_comp(merge_page) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+
+	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
+		merge_page, n_recs);
+	if (data_size > max_ins_size_reorg) {
+
+		/* No space for merge */
+err_exit:
+		/* We play it safe and reset the free bits.  This is
+		done only when zip_size is nonzero, merge_page is a leaf
+		and the index is not clustered.  Every failure path
+		below jumps here, so the heap is freed exactly once. */
+		if (zip_size
+		    && page_is_leaf(merge_page)
+		    && !dict_index_is_clust(index)) {
+			ibuf_reset_free_bits(merge_block);
+		}
+
+		mem_heap_free(heap);
+		return(FALSE);
+	}
+
+	ut_ad(page_validate(merge_page, index));
+
+	max_ins_size = page_get_max_insert_size(merge_page, n_recs);
+
+	if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+
+		/* We have to reorganize merge_page */
+
+		if (UNIV_UNLIKELY(!btr_page_reorganize(merge_block,
+						       index, mtr))) {
+
+			goto err_exit;
+		}
+
+		max_ins_size = page_get_max_insert_size(merge_page, n_recs);
+
+		ut_ad(page_validate(merge_page, index));
+		ut_ad(max_ins_size == max_ins_size_reorg);
+
+		if (UNIV_UNLIKELY(data_size > max_ins_size)) {
+
+			/* Add fault tolerance, though this should
+			never happen */
+
+			goto err_exit;
+		}
+	}
+
+	merge_page_zip = buf_block_get_page_zip(merge_block);
+#ifdef UNIV_ZIP_DEBUG
+	if (UNIV_LIKELY_NULL(merge_page_zip)) {
+		const page_zip_des_t*	page_zip
+			= buf_block_get_page_zip(block);
+		ut_a(page_zip);
+		ut_a(page_zip_validate(merge_page_zip, merge_page));
+		ut_a(page_zip_validate(page_zip, page));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* Move records to the merge page.  When merging to the left,
+	the node pointer of the page itself is deleted from the father;
+	when merging to the right, the node pointer of the page is
+	redirected to the merge page and the merge page's own node
+	pointer is deleted instead. */
+	if (is_left) {
+		rec_t*	orig_pred = page_copy_rec_list_start(
+			merge_block, block, page_get_supremum_rec(page),
+			index, mtr);
+
+		if (UNIV_UNLIKELY(!orig_pred)) {
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block);
+
+		/* Remove the page from the level list */
+		btr_level_list_remove(space, zip_size, page, mtr);
+
+		btr_node_ptr_delete(index, block, mtr);
+		lock_update_merge_left(merge_block, orig_pred, block);
+	} else {
+		rec_t*		orig_succ;
+#ifdef UNIV_BTR_DEBUG
+		byte		fil_page_prev[4];
+#endif /* UNIV_BTR_DEBUG */
+
+		if (UNIV_LIKELY_NULL(merge_page_zip)) {
+			/* The function page_zip_compress(), which will be
+			invoked by page_copy_rec_list_end() below,
+			requires that FIL_PAGE_PREV be FIL_NULL.
+			Clear the field, but prepare to restore it. */
+#ifdef UNIV_BTR_DEBUG
+			memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
+#endif /* UNIV_BTR_DEBUG */
+#if FIL_NULL != 0xffffffff
+# error "FIL_NULL != 0xffffffff"
+#endif
+			memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
+		}
+
+		orig_succ = page_copy_rec_list_end(merge_block, block,
+						   page_get_infimum_rec(page),
+						   cursor->index, mtr);
+
+		if (UNIV_UNLIKELY(!orig_succ)) {
+			ut_a(merge_page_zip);
+#ifdef UNIV_BTR_DEBUG
+			/* FIL_PAGE_PREV was restored from merge_page_zip. */
+			ut_a(!memcmp(fil_page_prev,
+				     merge_page + FIL_PAGE_PREV, 4));
+#endif /* UNIV_BTR_DEBUG */
+			goto err_exit;
+		}
+
+		btr_search_drop_page_hash_index(block);
+
+#ifdef UNIV_BTR_DEBUG
+		if (UNIV_LIKELY_NULL(merge_page_zip)) {
+			/* Restore FIL_PAGE_PREV in order to avoid an assertion
+			failure in btr_level_list_remove(), which will set
+			the field again to FIL_NULL.  Even though this makes
+			merge_page and merge_page_zip inconsistent for a
+			split second, it is harmless, because the pages
+			are X-latched. */
+			memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
+		}
+#endif /* UNIV_BTR_DEBUG */
+
+		/* Remove the page from the level list */
+		btr_level_list_remove(space, zip_size, page, mtr);
+
+		/* Replace the address of the old child node (= page) with the
+		address of the merge page to the right */
+
+		btr_node_ptr_set_child_page_no(
+			btr_cur_get_rec(&father_cursor),
+			btr_cur_get_page_zip(&father_cursor),
+			offsets, right_page_no, mtr);
+		btr_node_ptr_delete(index, merge_block, mtr);
+
+		lock_update_merge_right(merge_block, orig_succ, block);
+	}
+
+	mem_heap_free(heap);
+
+	if (!dict_index_is_clust(index) && page_is_leaf(merge_page)) {
+		/* Update the free bits of the B-tree page in the
+		insert buffer bitmap.  This has to be done in a
+		separate mini-transaction that is committed before the
+		main mini-transaction.  We cannot update the insert
+		buffer bitmap in this mini-transaction, because
+		btr_compress() can be invoked recursively without
+		committing the mini-transaction in between.  Since
+		insert buffer bitmap pages have a lower rank than
+		B-tree pages, we must not access other pages in the
+		same mini-transaction after accessing an insert buffer
+		bitmap page. */
+
+		/* The free bits in the insert buffer bitmap must
+		never exceed the free space on a page.  It is safe to
+		decrement or reset the bits in the bitmap in a
+		mini-transaction that is committed before the
+		mini-transaction that affects the free space. */
+
+		/* It is unsafe to increment the bits in a separately
+		committed mini-transaction, because in crash recovery,
+		the free bits could momentarily be set too high. */
+
+		if (zip_size) {
+			/* Because the free bits may be incremented
+			and we cannot update the insert buffer bitmap
+			in the same mini-transaction, the only safe
+			thing we can do here is the pessimistic
+			approach: reset the free bits. */
+			ibuf_reset_free_bits(merge_block);
+		} else {
+			/* On uncompressed pages, the free bits will
+			never increase here.  Thus, it is safe to
+			write the bits accurately in a separate
+			mini-transaction. */
+			ibuf_update_free_bits_if_full(merge_block,
+						      UNIV_PAGE_SIZE,
+						      ULINT_UNDEFINED);
+		}
+	}
+
+	ut_ad(page_validate(merge_page, index));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* Free the file page */
+	btr_page_free(index, block, mtr);
+
+	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+	return(TRUE);
+}
+
+/*************************************************************//**
+Discards a page that is the only page on its level, along with each of
+its ancestors below the root, and then empties the root page at level 0.
+This leaves the whole B-tree as just an empty root page.  This function
+should never be reached, because btr_compress(), which is invoked in
+delete operations, calls btr_lift_page_up() to flatten the B-tree. */
+static
+void
+btr_discard_only_page_on_level(
+/*===========================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page which is the only on its level */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		page_level;
+	trx_id_t	max_trx_id;
+
+	/* Remember the PAGE_MAX_TRX_ID of the starting (leaf) page; it
+	is written to the root at the end for non-clustered indexes. */
+	max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
+
+	/* Walk up from the starting page, freeing every page below the
+	root.  Each page must hold exactly one record and have no
+	siblings, and its level must match the number of steps taken. */
+	for (page_level = 0;
+	     buf_block_get_page_no(block) != dict_index_get_page(index);
+	     page_level++) {
+		btr_cur_t	father_cursor;
+		buf_block_t*	father_block;
+		const page_t*	page	= buf_block_get_frame(block);
+
+		ut_a(page_get_n_recs(page) == 1);
+		ut_a(page_level == btr_page_get_level(page, mtr));
+		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+
+		ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+		btr_search_drop_page_hash_index(block);
+
+		btr_page_get_father(index, block, mtr, &father_cursor);
+		father_block = btr_cur_get_block(&father_cursor);
+
+		lock_update_discard(father_block, PAGE_HEAP_NO_SUPREMUM,
+				    block);
+
+		/* Free the file page, then continue from its father */
+		btr_page_free(index, block, mtr);
+
+		block = father_block;
+	}
+
+	/* block is the root page, which must be empty, except
+	for the node pointer to the (now discarded) block(s). */
+
+#ifdef UNIV_BTR_DEBUG
+	if (!dict_index_is_ibuf(index)) {
+		const page_t*	root	= buf_block_get_frame(block);
+		const ulint	space	= dict_index_get_space(index);
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+					    + root, space));
+		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+					    + root, space));
+	}
+#endif /* UNIV_BTR_DEBUG */
+
+	btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
+
+	if (dict_index_is_clust(index)) {
+		/* Nothing more to do for a clustered index */
+		return;
+	}
+
+	/* We play it safe and reset the free bits for the root */
+	ibuf_reset_free_bits(block);
+
+	if (page_is_leaf(buf_block_get_frame(block))) {
+		ut_a(!ut_dulint_is_zero(max_trx_id));
+		page_set_max_trx_id(block,
+				    buf_block_get_page_zip(block),
+				    max_trx_id, mtr);
+	}
+}
+
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty.
+
+The left sibling, if there is one, otherwise the right sibling, is X-latched
+and passed to lock_update_discard() together with the discarded page.  If the
+page has no siblings at all, the work is delegated to
+btr_discard_only_page_on_level(). */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
+				the root page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_index_t*	index;
+	ulint		space;
+	ulint		zip_size;
+	ulint		left_page_no;
+	ulint		right_page_no;
+	buf_block_t*	merge_block;
+	page_t*		merge_page;
+	buf_block_t*	block;
+	page_t*		page;
+	rec_t*		node_ptr;
+
+	block = btr_cur_get_block(cursor);
+	index = btr_cur_get_index(cursor);
+
+	/* Debug checks: the page is not the root, and the caller holds
+	an X-lock on the index tree and an X-latch on the page. */
+	ut_ad(dict_index_get_page(index) != buf_block_get_page_no(block));
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	/* Decide the page which will inherit the locks */
+
+	left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
+	right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
+
+	if (left_page_no != FIL_NULL) {
+		/* Prefer the left sibling. */
+		merge_block = btr_block_get(space, zip_size, left_page_no,
+					    RW_X_LATCH, mtr);
+		merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_page_get_next(merge_page, mtr)
+		     == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+	} else if (right_page_no != FIL_NULL) {
+		/* No left sibling: use the right one. */
+		merge_block = btr_block_get(space, zip_size, right_page_no,
+					    RW_X_LATCH, mtr);
+		merge_page = buf_block_get_frame(merge_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(btr_page_get_prev(merge_page, mtr)
+		     == buf_block_get_page_no(block));
+#endif /* UNIV_BTR_DEBUG */
+	} else {
+		/* The page is alone on its level. */
+		btr_discard_only_page_on_level(index, block, mtr);
+
+		return;
+	}
+
+	page = buf_block_get_frame(block);
+	/* Both pages must use the same record format. */
+	ut_a(page_is_comp(merge_page) == page_is_comp(page));
+	btr_search_drop_page_hash_index(block);
+
+	if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
+
+		/* We have to mark the leftmost node pointer on the right
+		side page as the predefined minimum record */
+		node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
+
+		ut_ad(page_rec_is_user_rec(node_ptr));
+
+		/* This will make page_zip_validate() fail on merge_page
+		until btr_level_list_remove() completes.  This is harmless,
+		because everything will take place within a single
+		mini-transaction and because writing to the redo log
+		is an atomic operation (performed by mtr_commit()). */
+		btr_set_min_rec_mark(node_ptr, mtr);
+	}
+
+	/* Remove the node pointer to the page from its father. */
+	btr_node_ptr_delete(index, block, mtr);
+
+	/* Remove the page from the level list */
+	btr_level_list_remove(space, zip_size, page, mtr);
+#ifdef UNIV_ZIP_DEBUG
+	{
+		page_zip_des_t*	merge_page_zip
+			= buf_block_get_page_zip(merge_block);
+		ut_a(!merge_page_zip
+		     || page_zip_validate(merge_page_zip, merge_page));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* The heir record on merge_block is its supremum when merging
+	to the left sibling, and the record whose heap number is given
+	by lock_get_min_heap_no() when merging to the right sibling. */
+	if (left_page_no != FIL_NULL) {
+		lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
+				    block);
+	} else {
+		lock_update_discard(merge_block,
+				    lock_get_min_heap_no(merge_block),
+				    block);
+	}
+
+	/* Free the file page */
+	btr_page_free(index, block, mtr);
+
+	ut_ad(btr_check_node_ptr(index, merge_block, mtr));
+}
+
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints to stderr the size info of the file segments of a B-tree:
+always the non-leaf page segment, and the leaf page segment unless
+the index is of type DICT_UNIVERSAL.  Refuses to print anything for
+an insert buffer tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+	dict_index_t*	index)	/*!< in: index tree */
+{
+	mtr_t		mtr;
+	page_t*		root_page;
+
+	if (dict_index_is_ibuf(index)) {
+		fputs("Sorry, cannot print info of an ibuf tree:"
+		      " use ibuf functions\n", stderr);
+		return;
+	}
+
+	mtr_start(&mtr);
+	root_page = btr_root_get(index, &mtr);
+
+	/* Both segment headers are stored in the header of the root. */
+	fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
+	fseg_print(root_page + PAGE_HEADER + PAGE_BTR_SEG_TOP, &mtr);
+
+	if (!(index->type & DICT_UNIVERSAL)) {
+		fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
+		fseg_print(root_page + PAGE_HEADER + PAGE_BTR_SEG_LEAF, &mtr);
+	}
+
+	mtr_commit(&mtr);
+}
+
+/************************************************************//**
+Prints recursively index tree pages.  The given page is printed with
+page_print(); if it is not a leaf page, the function then descends into
+the child pages pointed to by the node pointers near the start and the
+end of the page, as selected by 'width'. */
+static
+void
+btr_print_recursive(
+/*================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: index page */
+	ulint		width,	/*!< in: print this many entries from start
+				and end */
+	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
+	ulint**		offsets,/*!< in/out: buffer for rec_get_offsets() */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const page_t*	page	= buf_block_get_frame(block);
+	page_cur_t	cursor;
+	ulint		n_recs;
+	ulint		i	= 0;	/* position of the current record */
+	mtr_t		mtr2;		/* mini-transaction used for
+					visiting one child page */
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	fprintf(stderr, "NODE ON LEVEL %lu page number %lu\n",
+		(ulong) btr_page_get_level(page, mtr),
+		(ulong) buf_block_get_page_no(block));
+
+	page_print(block, index, width, width);
+
+	n_recs = page_get_n_recs(page);
+
+	/* Position the cursor on the first record after the infimum. */
+	page_cur_set_before_first(block, &cursor);
+	page_cur_move_to_next(&cursor);
+
+	while (!page_cur_is_after_last(&cursor)) {
+
+		if (page_is_leaf(page)) {
+
+			/* If this is the leaf level, do nothing */
+
+		} else if ((i <= width) || (i >= n_recs - width)) {
+
+			/* Descend only into children whose node pointer
+			is among the first or the last records of the
+			page.  Each child is visited in its own
+			mini-transaction, which is committed right after
+			the recursive call returns. */
+
+			const rec_t*	node_ptr;
+
+			mtr_start(&mtr2);
+
+			node_ptr = page_cur_get_rec(&cursor);
+
+			*offsets = rec_get_offsets(node_ptr, index, *offsets,
+						   ULINT_UNDEFINED, heap);
+			btr_print_recursive(index,
+					    btr_node_ptr_get_child(node_ptr,
+								   index,
+								   *offsets,
+								   &mtr2),
+					    width, heap, offsets, &mtr2);
+			mtr_commit(&mtr2);
+		}
+
+		page_cur_move_to_next(&cursor);
+		i++;
+	}
+}
+
+/**************************************************************//**
+Prints directories and other info of all nodes in the tree, starting
+from the root page, and then runs btr_validate_index() on the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		width)	/*!< in: print this many entries from start
+				and end */
+{
+	mtr_t		mtr;
+	mem_heap_t*	offs_heap	= NULL;
+	ulint		offs_buf[REC_OFFS_NORMAL_SIZE];
+	ulint*		offs		= offs_buf;
+	rec_offs_init(offs_buf);
+
+	fputs("--------------------------\n"
+	      "INDEX TREE PRINT\n", stderr);
+
+	mtr_start(&mtr);
+
+	btr_print_recursive(index, btr_root_block_get(index, &mtr), width,
+			    &offs_heap, &offs, &mtr);
+
+	/* A heap exists only if rec_get_offsets() had to create one. */
+	if (UNIV_LIKELY_NULL(offs_heap)) {
+		mem_heap_free(offs_heap);
+	}
+
+	mtr_commit(&mtr);
+
+	btr_validate_index(index, NULL);
+}
+#endif /* UNIV_BTR_PRINT */
+
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.  The root page
+is accepted as is.  For any other page the father node pointer is
+looked up; for a non-leaf page it is additionally asserted that the
+node pointer built from the first user record of the page compares
+equal to the father's node pointer record.
+@return	TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: index page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page = buf_block_get_frame(block);
+	btr_cur_t	father_cur;
+	mem_heap_t*	heap;
+	ulint*		father_offsets;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	if (buf_block_get_page_no(block) == dict_index_get_page(index)) {
+		/* The root page has no father. */
+		return(TRUE);
+	}
+
+	heap = mem_heap_create(256);
+
+	/* The father lookup is performed for leaf pages as well. */
+	father_offsets = btr_page_get_father_block(NULL, heap, index, block,
+						   mtr, &father_cur);
+
+	if (!page_is_leaf(page)) {
+		const rec_t*	first_rec
+			= page_rec_get_next(page_get_infimum_rec(page));
+		dtuple_t*	node_ptr_tuple
+			= dict_index_build_node_ptr(
+				index, first_rec, 0, heap,
+				btr_page_get_level(page, mtr));
+
+		ut_a(!cmp_dtuple_rec(node_ptr_tuple,
+				     btr_cur_get_rec(&father_cur),
+				     father_offsets));
+	}
+
+	mem_heap_free(heap);
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Display identification information for a record: the name of the
+index, the number of the page and the offset of the record within the
+page are written to stderr. */
+static
+void
+btr_index_rec_validate_report(
+/*==========================*/
+	const page_t*		page,	/*!< in: index page */
+	const rec_t*		rec,	/*!< in: index record */
+	const dict_index_t*	index)	/*!< in: index */
+{
+	fputs("InnoDB: Record in ", stderr);
+	dict_index_name_print(stderr, NULL, index);
+	/* Cast to ulong so that the arguments match the %lu
+	conversions, as the other diagnostics in this file do. */
+	fprintf(stderr, ", page %lu, at offset %lu\n",
+		(ulong) page_get_page_no(page), (ulong) page_offset(rec));
+}
+
+/************************************************************//**
+Checks a record against the definition of its index: the record format
+flag of the page, the number of fields (old-style records only) and the
+length of every field.  Records of a DICT_UNIVERSAL index are accepted
+without any checking.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+	const rec_t*		rec,		/*!< in: index record */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool			dump_on_error)	/*!< in: TRUE if the function
+						should print hex dump of record
+						and page on error */
+{
+	const page_t*	page	= page_align(rec);
+	mem_heap_t*	heap	= NULL;
+	ibool		valid	= TRUE;
+	ulint		n_fields;
+	ulint		pos;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets	= offsets_;
+	rec_offs_init(offsets_);
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		/* The insert buffer index tree can contain records from any
+		other index: we cannot check the number of fields or
+		their length */
+
+		return(TRUE);
+	}
+
+	/* The page and the table must agree on the record format. */
+	if (UNIV_UNLIKELY((ibool)!!page_is_comp(page)
+			  != dict_table_is_comp(index->table))) {
+		btr_index_rec_validate_report(page, rec, index);
+		fprintf(stderr, "InnoDB: compact flag=%lu, should be %lu\n",
+			(ulong) !!page_is_comp(page),
+			(ulong) dict_table_is_comp(index->table));
+
+		return(FALSE);
+	}
+
+	n_fields = dict_index_get_n_fields(index);
+
+	/* The field count is checked for old-style records only. */
+	if (!page_is_comp(page)
+	    && UNIV_UNLIKELY(rec_get_n_fields_old(rec) != n_fields)) {
+		btr_index_rec_validate_report(page, rec, index);
+		fprintf(stderr, "InnoDB: has %lu fields, should have %lu\n",
+			(ulong) rec_get_n_fields_old(rec), (ulong) n_fields);
+
+		if (dump_on_error) {
+			buf_page_print(page, 0);
+
+			fputs("InnoDB: corrupt record ", stderr);
+			rec_print_old(stderr, rec);
+			putc('\n', stderr);
+		}
+		return(FALSE);
+	}
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	for (pos = 0; pos < n_fields; pos++) {
+		ulint	fixed_size = dict_col_get_fixed_size(
+			dict_index_get_nth_col(index, pos),
+			page_is_comp(page));
+		ulint	prefix_len;
+		ulint	len;
+
+		rec_get_nth_field_offs(offsets, pos, &len);
+		prefix_len = dict_index_get_nth_field(index, pos)->prefix_len;
+
+		/* Note that if fixed_size != 0, it equals the
+		length of a fixed-size column in the clustered index.
+		A prefix index of the column is of fixed, but different
+		length.  When fixed_size == 0, prefix_len is the maximum
+		length of the prefix index column. */
+
+		if (len == UNIV_SQL_NULL) {
+			/* SQL NULL fields are never length-checked. */
+			continue;
+		}
+
+		if (prefix_len == 0) {
+			if (!fixed_size || len == fixed_size) {
+				continue;
+			}
+		} else if (len <= prefix_len) {
+			continue;
+		}
+
+		/* The field length violates the index definition. */
+		btr_index_rec_validate_report(page, rec, index);
+		fprintf(stderr,
+			"InnoDB: field %lu len is %lu,"
+			" should be %lu\n",
+			(ulong) pos, (ulong) len, (ulong) fixed_size);
+
+		if (dump_on_error) {
+			buf_page_print(page, 0);
+
+			fputs("InnoDB: corrupt record ", stderr);
+			rec_print_new(stderr, rec, offsets);
+			putc('\n', stderr);
+		}
+
+		valid = FALSE;
+		break;
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(valid);
+}
+
+/************************************************************//**
+Runs btr_index_rec_validate() on the records of a page, from the first
+record after the infimum up to the end of the page, stopping at the
+first record that fails.
+@return	TRUE if ok */
+static
+ibool
+btr_index_page_validate(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index)	/*!< in: index */
+{
+	page_cur_t	cur;
+
+	page_cur_set_before_first(block, &cur);
+
+	for (page_cur_move_to_next(&cur);
+	     !page_cur_is_after_last(&cur);
+	     page_cur_move_to_next(&cur)) {
+
+		if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/************************************************************//**
+Report an error on one page of an index tree.  Writes the page number
+and the index name to stderr, followed by the tree level when the level
+is nonzero. */
+static
+void
+btr_validate_report1(
+/*=================*/
+	dict_index_t*		index,	/*!< in: index */
+	ulint			level,	/*!< in: B-tree level */
+	const buf_block_t*	block)	/*!< in: index page */
+{
+	/* Cast to ulong so that the arguments match the %lu
+	conversions, as the other diagnostics in this file do. */
+	fprintf(stderr, "InnoDB: Error in page %lu of ",
+		(ulong) buf_block_get_page_no(block));
+	dict_index_name_print(stderr, NULL, index);
+	if (level) {
+		fprintf(stderr, ", index tree level %lu", (ulong) level);
+	}
+	putc('\n', stderr);
+}
+
+/************************************************************//**
+Report an error on two pages of an index tree.  Writes both page
+numbers and the index name to stderr, followed by the tree level when
+the level is nonzero. */
+static
+void
+btr_validate_report2(
+/*=================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			level,	/*!< in: B-tree level */
+	const buf_block_t*	block1,	/*!< in: first index page */
+	const buf_block_t*	block2)	/*!< in: second index page */
+{
+	/* Cast to ulong so that the arguments match the %lu
+	conversions, as the other diagnostics in this file do. */
+	fprintf(stderr, "InnoDB: Error in pages %lu and %lu of ",
+		(ulong) buf_block_get_page_no(block1),
+		(ulong) buf_block_get_page_no(block2));
+	dict_index_name_print(stderr, NULL, index);
+	if (level) {
+		fprintf(stderr, ", index tree level %lu", (ulong) level);
+	}
+	putc('\n', stderr);
+}
+
+/************************************************************//**
+Validates index tree level.
+
+The function first descends from the root, always following the first
+node pointer of a page, until it reaches a page on the requested level.
+It then walks the level from left to right via the FIL_PAGE_NEXT links.
+For every page it checks the page itself, the links and record order
+with respect to the right sibling, and the node pointers in the father
+page(s).  Problems are reported to stderr and make the function return
+FALSE, but the walk continues with the next page.  If the transaction
+is found to be interrupted, the walk stops and the result collected so
+far is returned.
+@return	TRUE if ok */
+static
+ibool
+btr_validate_level(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	trx_t*		trx,	/*!< in: transaction or NULL */
+	ulint		level)	/*!< in: level number */
+{
+	ulint		space;
+	ulint		zip_size;
+	buf_block_t*	block;
+	page_t*		page;
+	buf_block_t*	right_block = 0; /* remove warning */
+	page_t*		right_page = 0; /* remove warning */
+	page_t*		father_page;
+	btr_cur_t	node_cur;
+	btr_cur_t	right_node_cur;
+	rec_t*		rec;
+	ulint		right_page_no;
+	ulint		left_page_no;
+	page_cur_t	cursor;
+	dtuple_t*	node_ptr_tuple;
+	ibool		ret	= TRUE;
+	mtr_t		mtr;
+	mem_heap_t*	heap	= mem_heap_create(256);
+	ulint*		offsets	= NULL;
+	ulint*		offsets2= NULL;
+#ifdef UNIV_ZIP_DEBUG
+	page_zip_des_t*	page_zip;
+#endif /* UNIV_ZIP_DEBUG */
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+	block = btr_root_block_get(index, &mtr);
+	page = buf_block_get_frame(block);
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+
+	/* Descend from the root to the requested level, always taking
+	the child of the first node pointer on the page. */
+	while (level != btr_page_get_level(page, &mtr)) {
+		const rec_t*	node_ptr;
+
+		ut_a(space == buf_block_get_space(block));
+		ut_a(space == page_get_space_id(page));
+#ifdef UNIV_ZIP_DEBUG
+		page_zip = buf_block_get_page_zip(block);
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+		ut_a(!page_is_leaf(page));
+
+		page_cur_set_before_first(block, &cursor);
+		page_cur_move_to_next(&cursor);
+
+		node_ptr = page_cur_get_rec(&cursor);
+		offsets = rec_get_offsets(node_ptr, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
+		page = buf_block_get_frame(block);
+	}
+
+	/* Now we are on the desired level. Loop through the pages on that
+	level. */
+loop:
+	/* Give up if the transaction was interrupted; 'ret' reflects
+	only the pages checked so far. */
+	if (trx_is_interrupted(trx)) {
+		mtr_commit(&mtr);
+		mem_heap_free(heap);
+		return(ret);
+	}
+	/* Start each page with an empty heap and no offsets arrays. */
+	mem_heap_empty(heap);
+	offsets = offsets2 = NULL;
+	mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+#ifdef UNIV_ZIP_DEBUG
+	page_zip = buf_block_get_page_zip(block);
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* Check ordering etc. of records */
+
+	if (!page_validate(page, index)) {
+		btr_validate_report1(index, level, block);
+
+		ret = FALSE;
+	} else if (level == 0) {
+		/* We are on level 0. Check that the records have the right
+		number of fields, and field lengths are right. */
+
+		if (!btr_index_page_validate(block, index)) {
+
+			ret = FALSE;
+		}
+	}
+
+	ut_a(btr_page_get_level(page, &mtr) == level);
+
+	right_page_no = btr_page_get_next(page, &mtr);
+	left_page_no = btr_page_get_prev(page, &mtr);
+
+	/* Only the root page on level 0 is allowed to be empty. */
+	ut_a(page_get_n_recs(page) > 0 || (level == 0
+					   && page_get_page_no(page)
+					   == dict_index_get_page(index)));
+
+	if (right_page_no != FIL_NULL) {
+		const rec_t*	right_rec;
+		right_block = btr_block_get(space, zip_size, right_page_no,
+					    RW_X_LATCH, &mtr);
+		right_page = buf_block_get_frame(right_block);
+		/* The right sibling must point back to this page. */
+		if (UNIV_UNLIKELY(btr_page_get_prev(right_page, &mtr)
+				  != page_get_page_no(page))) {
+			btr_validate_report2(index, level, block, right_block);
+			fputs("InnoDB: broken FIL_PAGE_NEXT"
+			      " or FIL_PAGE_PREV links\n", stderr);
+			buf_page_print(page, 0);
+			buf_page_print(right_page, 0);
+
+			ret = FALSE;
+		}
+
+		/* Both pages must use the same record format; if they
+		do not, the remaining checks for this page are skipped. */
+		if (UNIV_UNLIKELY(page_is_comp(right_page)
+				  != page_is_comp(page))) {
+			btr_validate_report2(index, level, block, right_block);
+			fputs("InnoDB: 'compact' flag mismatch\n", stderr);
+			buf_page_print(page, 0);
+			buf_page_print(right_page, 0);
+
+			ret = FALSE;
+
+			goto node_ptr_fails;
+		}
+
+		/* The last record of this page must sort before the
+		first record of the right sibling. */
+		rec = page_rec_get_prev(page_get_supremum_rec(page));
+		right_rec = page_rec_get_next(page_get_infimum_rec(
+						      right_page));
+		offsets = rec_get_offsets(rec, index,
+					  offsets, ULINT_UNDEFINED, &heap);
+		offsets2 = rec_get_offsets(right_rec, index,
+					   offsets2, ULINT_UNDEFINED, &heap);
+		if (UNIV_UNLIKELY(cmp_rec_rec(rec, right_rec,
+					      offsets, offsets2,
+					      index) >= 0)) {
+
+			btr_validate_report2(index, level, block, right_block);
+
+			fputs("InnoDB: records in wrong order"
+			      " on adjacent pages\n", stderr);
+
+			buf_page_print(page, 0);
+			buf_page_print(right_page, 0);
+
+			fputs("InnoDB: record ", stderr);
+			rec = page_rec_get_prev(page_get_supremum_rec(page));
+			rec_print(stderr, rec, index);
+			putc('\n', stderr);
+			fputs("InnoDB: record ", stderr);
+			rec = page_rec_get_next(
+				page_get_infimum_rec(right_page));
+			rec_print(stderr, rec, index);
+			putc('\n', stderr);
+
+			ret = FALSE;
+		}
+	}
+
+	/* On a non-leaf level, the first record of the leftmost page
+	must carry the minimum record flag. */
+	if (level > 0 && left_page_no == FIL_NULL) {
+		ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
+			     page_rec_get_next(page_get_infimum_rec(page)),
+			     page_is_comp(page)));
+	}
+
+	if (buf_block_get_page_no(block) != dict_index_get_page(index)) {
+
+		/* Check father node pointers */
+
+		rec_t*	node_ptr;
+
+		offsets = btr_page_get_father_block(offsets, heap, index,
+						    block, &mtr, &node_cur);
+		father_page = btr_cur_get_page(&node_cur);
+		node_ptr = btr_cur_get_rec(&node_cur);
+
+		/* Look up the father again, this time starting from the
+		last record of the page; both lookups must arrive at the
+		same node pointer, and it must point to this page. */
+		btr_cur_position(
+			index, page_rec_get_prev(page_get_supremum_rec(page)),
+			block, &node_cur);
+		offsets = btr_page_get_father_node_ptr(offsets, heap,
+						       &node_cur, &mtr);
+
+		if (UNIV_UNLIKELY(node_ptr != btr_cur_get_rec(&node_cur))
+		    || UNIV_UNLIKELY(btr_node_ptr_get_child_page_no(node_ptr,
+								    offsets)
+				     != buf_block_get_page_no(block))) {
+
+			btr_validate_report1(index, level, block);
+
+			fputs("InnoDB: node pointer to the page is wrong\n",
+			      stderr);
+
+			buf_page_print(father_page, 0);
+			buf_page_print(page, 0);
+
+			fputs("InnoDB: node ptr ", stderr);
+			rec_print(stderr, node_ptr, index);
+
+			rec = btr_cur_get_rec(&node_cur);
+			fprintf(stderr, "\n"
+				"InnoDB: node ptr child page n:o %lu\n",
+				(ulong) btr_node_ptr_get_child_page_no(
+					rec, offsets));
+
+			fputs("InnoDB: record on page ", stderr);
+			rec_print_new(stderr, rec, offsets);
+			putc('\n', stderr);
+			ret = FALSE;
+
+			goto node_ptr_fails;
+		}
+
+		/* On non-leaf pages, the node pointer built from the
+		first record of the page must compare equal to the node
+		pointer stored in the father. */
+		if (!page_is_leaf(page)) {
+			node_ptr_tuple = dict_index_build_node_ptr(
+				index,
+				page_rec_get_next(page_get_infimum_rec(page)),
+				0, heap, btr_page_get_level(page, &mtr));
+
+			if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
+					   offsets)) {
+				const rec_t* first_rec = page_rec_get_next(
+					page_get_infimum_rec(page));
+
+				btr_validate_report1(index, level, block);
+
+				buf_page_print(father_page, 0);
+				buf_page_print(page, 0);
+
+				fputs("InnoDB: Error: node ptrs differ"
+				      " on levels > 0\n"
+				      "InnoDB: node ptr ", stderr);
+				rec_print_new(stderr, node_ptr, offsets);
+				fputs("InnoDB: first rec ", stderr);
+				rec_print(stderr, first_rec, index);
+				putc('\n', stderr);
+				ret = FALSE;
+
+				goto node_ptr_fails;
+			}
+		}
+
+		/* The leftmost page of a level must be referenced by
+		the first record of a father that has no left sibling. */
+		if (left_page_no == FIL_NULL) {
+			ut_a(node_ptr == page_rec_get_next(
+				     page_get_infimum_rec(father_page)));
+			ut_a(btr_page_get_prev(father_page, &mtr) == FIL_NULL);
+		}
+
+		if (right_page_no == FIL_NULL) {
+			/* Likewise, the rightmost page must be referenced
+			by the last record of a father that has no right
+			sibling. */
+			ut_a(node_ptr == page_rec_get_prev(
+				     page_get_supremum_rec(father_page)));
+			ut_a(btr_page_get_next(father_page, &mtr) == FIL_NULL);
+		} else {
+			/* Check the node pointer of the right sibling:
+			it is either the next record in the same father
+			page, or the first record of the father page that
+			follows father_page. */
+			const rec_t*	right_node_ptr
+				= page_rec_get_next(node_ptr);
+
+			offsets = btr_page_get_father_block(
+				offsets, heap, index, right_block,
+				&mtr, &right_node_cur);
+			if (right_node_ptr
+			    != page_get_supremum_rec(father_page)) {
+
+				if (btr_cur_get_rec(&right_node_cur)
+				    != right_node_ptr) {
+					ret = FALSE;
+					fputs("InnoDB: node pointer to"
+					      " the right page is wrong\n",
+					      stderr);
+
+					btr_validate_report1(index, level,
+							     block);
+
+					buf_page_print(father_page, 0);
+					buf_page_print(page, 0);
+					buf_page_print(right_page, 0);
+				}
+			} else {
+				page_t*	right_father_page
+					= btr_cur_get_page(&right_node_cur);
+
+				if (btr_cur_get_rec(&right_node_cur)
+				    != page_rec_get_next(
+					    page_get_infimum_rec(
+						    right_father_page))) {
+					ret = FALSE;
+					fputs("InnoDB: node pointer 2 to"
+					      " the right page is wrong\n",
+					      stderr);
+
+					btr_validate_report1(index, level,
+							     block);
+
+					buf_page_print(father_page, 0);
+					buf_page_print(right_father_page, 0);
+					buf_page_print(page, 0);
+					buf_page_print(right_page, 0);
+				}
+
+				if (page_get_page_no(right_father_page)
+				    != btr_page_get_next(father_page, &mtr)) {
+
+					ret = FALSE;
+					fputs("InnoDB: node pointer 3 to"
+					      " the right page is wrong\n",
+					      stderr);
+
+					btr_validate_report1(index, level,
+							     block);
+
+					buf_page_print(father_page, 0);
+					buf_page_print(right_father_page, 0);
+					buf_page_print(page, 0);
+					buf_page_print(right_page, 0);
+				}
+			}
+		}
+	}
+
+node_ptr_fails:
+	/* Commit the mini-transaction to release the latch on 'page'.
+	Re-acquire the latch on right_page, which will become 'page'
+	on the next loop.  The page has already been checked. */
+	mtr_commit(&mtr);
+
+	if (right_page_no != FIL_NULL) {
+		mtr_start(&mtr);
+
+		block = btr_block_get(space, zip_size, right_page_no,
+				      RW_X_LATCH, &mtr);
+		page = buf_block_get_frame(block);
+
+		goto loop;
+	}
+
+	mem_heap_free(heap);
+	return(ret);
+}
+
+/**************************************************************//**
+Checks the consistency of an index tree, one level at a time, starting
+from the level of the root page and ending on level 0.  Checking stops
+at the first level that fails, or as soon as the transaction is found
+to be interrupted.
+@return	TRUE if ok; also TRUE if the check was interrupted before
+any level failed */
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx)	/*!< in: transaction or NULL */
+{
+	mtr_t	mtr;
+	page_t*	root;
+	ulint	height;
+	ulint	n_done;
+	ibool	ok	= TRUE;
+
+	mtr_start(&mtr);
+	mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+	root = btr_root_get(index, &mtr);
+	height = btr_page_get_level(root, &mtr);
+
+	for (n_done = 0; n_done <= height; n_done++) {
+
+		if (trx_is_interrupted(trx)) {
+
+			break;
+		}
+
+		if (!btr_validate_level(index, trx, height - n_done)) {
+
+			ok = FALSE;
+			break;
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	return(ok);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/btr/btr0cur.c b/storage/xtradb/btr/btr0cur.c
new file mode 100644
index 00000000000..3fc2b48162a
--- /dev/null
+++ b/storage/xtradb/btr/btr0cur.c
@@ -0,0 +1,5256 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0cur.c
+The index tree cursor
+
+All changes that row operations make to a B-tree or the records
+there must go through this module! Undo log records are written here
+of every modify or insert of a clustered index record.
+
+			NOTE!!!
+To make sure we do not run out of disk space during a pessimistic
+insert or update, we have to reserve 2 x (the height of the index tree)
+pages in the tablespace before we start the operation, because
+if leaf splitting has been started, it is difficult to undo, except
+by crashing the database and doing a roll-forward.
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#include "btr0cur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#include "row0upd.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0log.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "buf0lru.h"
+#include "btr0btr.h"
+#include "btr0sea.h"
+#include "trx0rec.h"
+#include "trx0roll.h" /* trx_is_recv() */
+#include "que0que.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "lock0lock.h"
+#include "zlib.h"
+
+#ifdef UNIV_DEBUG
+/** If the following is set to TRUE, this module prints a lot of
+trace information of individual record operations */
+UNIV_INTERN ibool	btr_cur_print_record_ops = FALSE;
+#endif /* UNIV_DEBUG */
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint	btr_cur_n_non_sea	= 0;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+UNIV_INTERN ulint	btr_cur_n_sea		= 0;
+/** Old value of btr_cur_n_non_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint	btr_cur_n_non_sea_old	= 0;
+/** Old value of btr_cur_n_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+UNIV_INTERN ulint	btr_cur_n_sea_old	= 0;
+
+/** In the optimistic insert, if the insert does not fit, but this much space
+can be released by page reorganize, then it is reorganized */
+#define BTR_CUR_PAGE_REORGANIZE_LIMIT	(UNIV_PAGE_SIZE / 32)
+
+/** The structure of a BLOB part header */
+/* @{ */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_PART_LEN		0	/*!< BLOB part len on this
+						page */
+#define BTR_BLOB_HDR_NEXT_PAGE_NO	4	/*!< next BLOB part page no,
+						FIL_NULL if none */
+/*--------------------------------------*/
+#define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
+						part header, in bytes */
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** A BLOB field reference full of zero, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+UNIV_INTERN const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+Adds path information to the cursor for the current page, for which
+the binary search has been performed. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
+	ulint		height,		/*!< in: height of the page in tree;
+					0 means leaf node */
+	ulint		root_height);	/*!< in: root node height in tree */
+/***********************************************************//**
+Frees the externally stored fields for a record, if the field is mentioned
+in the update vector. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
+				X-latched */
+	rec_t*		rec,	/*!< in: record */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update,	/*!< in: update vector */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the tree */
+/***********************************************************//**
+Frees the externally stored fields for a record. */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+	dict_index_t*	index,	/*!< in: index of the data, the index
+				tree MUST be X-latched */
+	rec_t*		rec,	/*!< in: record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the index
+				tree */
+/***********************************************************//**
+Gets the externally stored size of a record, in units of a database page.
+@return	externally stored part, in units of a database page */
+static
+ulint
+btr_rec_get_externally_stored_len(
+/*==============================*/
+	rec_t*		rec,	/*!< in: record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Sets the deleted bit of a record, dispatching on page_rec_is_comp(rec). */
+UNIV_INLINE
+void
+btr_rec_set_deleted_flag(
+/*=====================*/
+	rec_t*		rec,	/*!< in/out: physical record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page (or NULL) */
+	ulint		flag)	/*!< in: nonzero if delete marked */
+{
+	if (page_rec_is_comp(rec)) {
+		rec_set_deleted_flag_new(rec, page_zip, flag);
+	} else {
+		ut_ad(!page_zip);	/* page_zip is only used above */
+		rec_set_deleted_flag_old(rec, flag);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*==================== B-TREE SEARCH =========================*/
+
+/********************************************************************//**
+Latches the leaf page and, depending on latch_mode, its sibling page(s). */
+static
+void
+btr_cur_latch_leaves(
+/*=================*/
+	page_t*		page,		/*!< in: leaf page where the search
+					converged */
+	ulint		space,		/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size in bytes
+					or 0 for uncompressed pages */
+	ulint		page_no,	/*!< in: page number of the leaf */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_cur_t*	cursor,		/*!< in/out: cursor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint		mode;
+	ulint		left_page_no;
+	ulint		right_page_no;
+	buf_block_t*	get_block;
+
+	ut_ad(page && mtr);
+
+	switch (latch_mode) {
+	case BTR_SEARCH_LEAF:
+	case BTR_MODIFY_LEAF:
+		mode = latch_mode == BTR_SEARCH_LEAF ? RW_S_LATCH : RW_X_LATCH;
+		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+
+		if (srv_pass_corrupt_table && !get_block) {
+			return;
+		}
+		ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+		return;
+	case BTR_MODIFY_TREE:
+		/* x-latch also brothers from left to right */
+		left_page_no = btr_page_get_prev(page, mtr);
+
+		if (left_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  left_page_no,
+						  RW_X_LATCH, mtr);
+
+			if (srv_pass_corrupt_table && !get_block) {
+				return;
+			}
+			ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_next(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		get_block = btr_block_get(space, zip_size, page_no,
+					  RW_X_LATCH, mtr);
+
+		if (srv_pass_corrupt_table && !get_block) {
+			return;
+		}
+		ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+
+		right_page_no = btr_page_get_next(page, mtr);
+
+		if (right_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  right_page_no,
+						  RW_X_LATCH, mtr);
+
+			if (srv_pass_corrupt_table && !get_block) {
+				return;
+			}
+			ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_prev(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		return;
+
+	case BTR_SEARCH_PREV:
+	case BTR_MODIFY_PREV:
+		mode = latch_mode == BTR_SEARCH_PREV ? RW_S_LATCH : RW_X_LATCH;
+		/* latch also left brother; remember it in cursor->left_block */
+		left_page_no = btr_page_get_prev(page, mtr);
+
+		if (left_page_no != FIL_NULL) {
+			get_block = btr_block_get(space, zip_size,
+						  left_page_no, mode, mtr);
+			cursor->left_block = get_block;
+
+			if (srv_pass_corrupt_table && !get_block) {
+				return;
+			}
+			ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+			ut_a(page_is_comp(get_block->frame)
+			     == page_is_comp(page));
+			ut_a(btr_page_get_next(get_block->frame, mtr)
+			     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+			get_block->check_index_page_at_flush = TRUE;
+		}
+
+		get_block = btr_block_get(space, zip_size, page_no, mode, mtr);
+
+		if (srv_pass_corrupt_table && !get_block) {
+			return;
+		}
+		ut_a(get_block);
+#ifdef UNIV_BTR_DEBUG
+		ut_a(page_is_comp(get_block->frame) == page_is_comp(page));
+#endif /* UNIV_BTR_DEBUG */
+		get_block->check_index_page_at_flush = TRUE;
+		return;
+	}
+
+	ut_error;	/* latch_mode matched none of the cases above */
+}
+
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value.
+
+If mode is PAGE_CUR_LE , cursor is left at the place where an insert of the
+search tuple should be performed in the B-tree. InnoDB does an insert
+immediately after the cursor. Thus, the cursor may end up on a user record,
+or on a page infimum record. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the tree level of search */
+	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
+				tuple must be set so that it cannot get
+				compared to the node ptr page number field! */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				Inserts should always be made using
+				PAGE_CUR_LE to search the position! */
+	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+				BTR_INSERT and BTR_ESTIMATE;
+				cursor->left_block is used to store a pointer
+				to the left neighbor page, in the cases
+				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+				NOTE that if has_search_latch
+				is != 0, we maybe do not have a latch set
+				on the cursor page, we assume
+				the caller uses his search latch
+				to protect the record! */
+	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
+				s- or x-latched, but see also above! */
+	ulint		has_search_latch,/*!< in: info on the latch mode the
+				caller currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	page_t*		page;
+	buf_block_t*	guess;
+	rec_t*		node_ptr;
+	ulint		page_no;
+	ulint		space;
+	ulint		up_match;
+	ulint		up_bytes;
+	ulint		low_match;
+	ulint		low_bytes;
+	ulint		height;
+	ulint		savepoint;
+	ulint		page_mode;
+	ulint		insert_planned;
+	ulint		estimate;
+	ulint		ignore_sec_unique;
+	ulint		root_height = 0; /* remove warning */
+#ifdef BTR_CUR_ADAPT
+	btr_search_t*	info;
+#endif
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+	/* Currently, PAGE_CUR_LE is the only search mode used for searches
+	ending to upper levels */
+
+	ut_ad(level == 0 || mode == PAGE_CUR_LE);
+	ut_ad(dict_index_check_search_tuple(index, tuple));
+	ut_ad(!dict_index_is_ibuf(index) || ibuf_inside());
+	ut_ad(dtuple_check_typed(tuple));
+
+#ifdef UNIV_DEBUG
+	cursor->up_match = ULINT_UNDEFINED;
+	cursor->low_match = ULINT_UNDEFINED;
+#endif
+	insert_planned = latch_mode & BTR_INSERT;
+	estimate = latch_mode & BTR_ESTIMATE;
+	ignore_sec_unique = latch_mode & BTR_IGNORE_SEC_UNIQUE;
+	latch_mode = latch_mode & ~(BTR_INSERT | BTR_ESTIMATE
+				    | BTR_IGNORE_SEC_UNIQUE);
+
+	ut_ad(!insert_planned || (mode == PAGE_CUR_LE));
+
+	cursor->flag = BTR_CUR_BINARY;
+	cursor->index = index;
+
+#ifndef BTR_CUR_ADAPT
+	guess = NULL;
+#else
+	info = btr_search_get_info(index);
+
+	guess = info->root_guess;
+
+#ifdef BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	info->n_searches++;
+#endif
+	if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
+	    && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
+	    && !estimate
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+	    && mode != PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+	    /* If !has_search_latch, we do a dirty read of
+	    btr_search_enabled below, and btr_search_guess_on_hash()
+	    will have to check it again. */
+	    && UNIV_LIKELY(btr_search_enabled)
+	    && btr_search_guess_on_hash(index, info, tuple, mode,
+					latch_mode, cursor,
+					has_search_latch, mtr)) {
+
+		/* Search using the hash index succeeded */
+
+		ut_ad(cursor->up_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_GE);
+		ut_ad(cursor->up_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_LE);
+		ut_ad(cursor->low_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_LE);
+		btr_cur_n_sea++;
+
+		return;
+	}
+#endif /* BTR_CUR_HASH_ADAPT */
+#endif /* BTR_CUR_ADAPT */
+	btr_cur_n_non_sea++;
+
+	/* If the hash search did not succeed, do binary search down the
+	tree */
+
+	if (has_search_latch) {
+		/* Release possible search latch to obey latching order */
+		rw_lock_s_unlock(&btr_search_latch);
+	}
+
+	/* Store the position of the tree latch we push to mtr so that we
+	know how to release it when we have latched leaf node(s) */
+
+	savepoint = mtr_set_savepoint(mtr);
+
+	if (latch_mode == BTR_MODIFY_TREE) {
+		mtr_x_lock(dict_index_get_lock(index), mtr);
+
+	} else if (latch_mode == BTR_CONT_MODIFY_TREE) {
+		/* Do nothing */
+		ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+					MTR_MEMO_X_LOCK));
+	} else {
+		mtr_s_lock(dict_index_get_lock(index), mtr);
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	space = dict_index_get_space(index);
+	page_no = dict_index_get_page(index);
+
+	up_match = 0;
+	up_bytes = 0;
+	low_match = 0;
+	low_bytes = 0;
+
+	height = ULINT_UNDEFINED;
+
+	/* We use these modified search modes on non-leaf levels of the
+	B-tree. These let us end up in the right B-tree leaf. In that leaf
+	we use the original search mode. */
+
+	switch (mode) {
+	case PAGE_CUR_GE:
+		page_mode = PAGE_CUR_L;
+		break;
+	case PAGE_CUR_G:
+		page_mode = PAGE_CUR_LE;
+		break;
+	default:
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+		      || mode == PAGE_CUR_LE_OR_EXTENDS);
+#else /* PAGE_CUR_LE_OR_EXTENDS */
+		ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+		page_mode = mode;
+		break;
+	}
+
+	/* Loop and search until we arrive at the desired level */
+
+	for (;;) {
+		ulint		zip_size;
+		buf_block_t*	block;
+		ulint		rw_latch;
+		ulint		buf_mode;
+
+		zip_size = dict_table_zip_size(index->table);
+		rw_latch = RW_NO_LATCH;
+		buf_mode = BUF_GET;
+
+		if (height == 0 && latch_mode <= BTR_MODIFY_LEAF) {
+
+			rw_latch = latch_mode;
+
+			if (insert_planned
+			    && ibuf_should_try(index, ignore_sec_unique)) {
+
+				/* Try insert to the insert buffer if the
+				page is not in the buffer pool */
+
+				buf_mode = BUF_GET_IF_IN_POOL;
+			}
+		}
+
+retry_page_get:
+		block = buf_page_get_gen(space, zip_size, page_no,
+					 rw_latch, guess, buf_mode,
+					 file, line, mtr);
+		if (block == NULL) {
+			if (srv_pass_corrupt_table && buf_mode != BUF_GET_IF_IN_POOL) {
+				page_cursor->block = 0;
+				page_cursor->rec = 0;
+				if (estimate) {
+					cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+				}
+				break;
+			}
+			ut_a(buf_mode == BUF_GET_IF_IN_POOL);
+
+			/* This must be a search to perform an insert;
+			try insert to the insert buffer */
+
+			ut_ad(buf_mode == BUF_GET_IF_IN_POOL);
+			ut_ad(insert_planned);
+			ut_ad(cursor->thr);
+
+			if (ibuf_insert(tuple, index, space, zip_size,
+					page_no, cursor->thr)) {
+				/* Insertion to the insert buffer succeeded */
+				cursor->flag = BTR_CUR_INSERT_TO_IBUF;
+				if (UNIV_LIKELY_NULL(heap)) {
+					mem_heap_free(heap);
+				}
+				goto func_exit;
+			}
+
+			/* Insert to the insert buffer did not succeed:
+			retry page get */
+
+			buf_mode = BUF_GET;
+
+			goto retry_page_get;
+		}
+
+		page = buf_block_get_frame(block);
+
+		if (srv_pass_corrupt_table && !page) {
+			page_cursor->block = 0;
+			page_cursor->rec = 0;
+			if (estimate) {
+				cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+			}
+			break;
+		}
+		ut_a(page);
+
+		block->check_index_page_at_flush = TRUE;
+
+		if (rw_latch != RW_NO_LATCH) {
+#ifdef UNIV_ZIP_DEBUG
+			const page_zip_des_t*	page_zip
+				= buf_block_get_page_zip(block);
+			ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+			buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+		}
+
+		ut_ad(0 == ut_dulint_cmp(index->id,
+					 btr_page_get_index_id(page)));
+
+		if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
+			/* We are in the root node */
+
+			height = btr_page_get_level(page, mtr);
+			root_height = height;
+			cursor->tree_height = root_height + 1;
+#ifdef BTR_CUR_ADAPT
+			if (block != guess) {
+				info->root_guess = block;
+			}
+#endif
+		}
+
+		if (height == 0) {
+			if (rw_latch == RW_NO_LATCH) {
+
+				btr_cur_latch_leaves(page, space, zip_size,
+						     page_no, latch_mode,
+						     cursor, mtr);
+			}
+
+			if ((latch_mode != BTR_MODIFY_TREE)
+			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+
+				/* Release the tree s-latch */
+
+				mtr_release_s_latch_at_savepoint(
+					mtr, savepoint,
+					dict_index_get_lock(index));
+			}
+
+			page_mode = mode;
+		}
+
+		page_cur_search_with_match(block, index, tuple, page_mode,
+					   &up_match, &up_bytes,
+					   &low_match, &low_bytes,
+					   page_cursor);
+
+		if (estimate) {
+			btr_cur_add_path_info(cursor, height, root_height);
+		}
+
+		/* If this is the desired level, leave the loop */
+
+		ut_ad(height == btr_page_get_level(
+			      page_cur_get_page(page_cursor), mtr));
+
+		if (level == height) {
+
+			if (level > 0) {
+				/* x-latch the page */
+				page = btr_page_get(space, zip_size,
+						    page_no, RW_X_LATCH, mtr);
+				ut_a((ibool)!!page_is_comp(page)
+				     == dict_table_is_comp(index->table));
+			}
+
+			break;
+		}
+
+		ut_ad(height > 0);
+
+		height--;
+
+		guess = NULL;
+
+		node_ptr = page_cur_get_rec(page_cursor);
+		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Go to the child node */
+		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	if (level == 0) {
+		cursor->low_match = low_match;
+		cursor->low_bytes = low_bytes;
+		cursor->up_match = up_match;
+		cursor->up_bytes = up_bytes;
+
+#ifdef BTR_CUR_ADAPT
+		/* We do a dirty read of btr_search_enabled here.  We
+		will properly check btr_search_enabled again in
+		btr_search_build_page_hash_index() before building a
+		page hash index, while holding btr_search_latch. */
+		if (UNIV_LIKELY(btr_search_enabled)) {
+
+			btr_search_info_update(index, cursor);
+		}
+#endif
+		ut_ad(cursor->up_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_GE);
+		ut_ad(cursor->up_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_LE);
+		ut_ad(cursor->low_match != ULINT_UNDEFINED
+		      || mode != PAGE_CUR_LE);
+	}
+
+func_exit:
+	if (has_search_latch) {
+
+		rw_lock_s_lock(&btr_search_latch);
+	}
+}
+
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+	ibool		from_left,	/*!< in: TRUE if open to the low end,
+					FALSE if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode [| BTR_ESTIMATE] */
+	btr_cur_t*	cursor,		/*!< in/out: cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	ulint		page_no;
+	ulint		space;
+	ulint		zip_size;
+	ulint		height;
+	ulint		root_height = 0; /* remove warning */
+	rec_t*		node_ptr;
+	ulint		estimate;
+	ulint		savepoint;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	estimate = latch_mode & BTR_ESTIMATE;
+	latch_mode = latch_mode & ~BTR_ESTIMATE;
+
+	/* Store the position of the tree latch we push to mtr so that we
+	know how to release it when we have latched the leaf node */
+
+	savepoint = mtr_set_savepoint(mtr);
+
+	if (latch_mode == BTR_MODIFY_TREE) {
+		mtr_x_lock(dict_index_get_lock(index), mtr);
+	} else {
+		mtr_s_lock(dict_index_get_lock(index), mtr);
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+	cursor->index = index;
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+	page_no = dict_index_get_page(index);
+
+	height = ULINT_UNDEFINED;
+
+	for (;;) {
+		buf_block_t*	block;
+		page_t*		page;
+		block = buf_page_get_gen(space, zip_size, page_no,
+					 RW_NO_LATCH, NULL, BUF_GET,
+					 file, line, mtr);
+		page = buf_block_get_frame(block);
+
+		if (srv_pass_corrupt_table && !page) {
+			page_cursor->block = 0;
+			page_cursor->rec = 0;
+			if (estimate) {
+				cursor->path_arr->nth_rec = ULINT_UNDEFINED;
+			}
+			break;
+		}
+		ut_a(page);
+
+		ut_ad(0 == ut_dulint_cmp(index->id,
+					 btr_page_get_index_id(page)));
+
+		block->check_index_page_at_flush = TRUE;
+
+		if (height == ULINT_UNDEFINED) {
+			/* We are in the root node */
+
+			height = btr_page_get_level(page, mtr);
+			root_height = height;
+		}
+
+		if (height == 0) {
+			btr_cur_latch_leaves(page, space, zip_size, page_no,
+					     latch_mode, cursor, mtr);
+
+			/* In versions <= 3.23.52 we had forgotten to
+			release the tree latch here. If in an index scan
+			we had to scan far to find a record visible to the
+			current transaction, that could starve others
+			waiting for the tree latch. */
+
+			if ((latch_mode != BTR_MODIFY_TREE)
+			    && (latch_mode != BTR_CONT_MODIFY_TREE)) {
+
+				/* Release the tree s-latch */
+
+				mtr_release_s_latch_at_savepoint(
+					mtr, savepoint,
+					dict_index_get_lock(index));
+			}
+		}
+
+		if (from_left) {
+			page_cur_set_before_first(block, page_cursor);
+		} else {
+			page_cur_set_after_last(block, page_cursor);
+		}
+
+		if (height == 0) {
+			if (estimate) {
+				btr_cur_add_path_info(cursor, height,
+						      root_height);
+			}
+
+			break;
+		}
+
+		ut_ad(height > 0);
+
+		if (from_left) {
+			page_cur_move_to_next(page_cursor);
+		} else {
+			page_cur_move_to_prev(page_cursor);
+		}
+
+		if (estimate) {
+			btr_cur_add_path_info(cursor, height, root_height);
+		}
+
+		height--;
+
+		node_ptr = page_cur_get_rec(page_cursor);
+		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Go to the child node */
+		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	ulint		page_no;
+	ulint		space;
+	ulint		zip_size;
+	ulint		height;
+	rec_t*		node_ptr;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	if (latch_mode == BTR_MODIFY_TREE) {
+		mtr_x_lock(dict_index_get_lock(index), mtr);
+	} else {
+		mtr_s_lock(dict_index_get_lock(index), mtr);
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+	cursor->index = index;
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+	page_no = dict_index_get_page(index);
+
+	height = ULINT_UNDEFINED;
+
+	for (;;) {
+		buf_block_t*	block;
+		page_t*		page;
+
+		block = buf_page_get_gen(space, zip_size, page_no,
+					 RW_NO_LATCH, NULL, BUF_GET,
+					 file, line, mtr);
+		page = buf_block_get_frame(block);
+
+		if (srv_pass_corrupt_table && !page) {
+			page_cursor->block = 0;
+			page_cursor->rec = 0;
+			break;
+		}
+		ut_a(page);
+
+		ut_ad(0 == ut_dulint_cmp(index->id,
+					 btr_page_get_index_id(page)));
+
+		if (height == ULINT_UNDEFINED) {
+			/* We are in the root node */
+
+			height = btr_page_get_level(page, mtr);
+		}
+
+		if (height == 0) {
+			btr_cur_latch_leaves(page, space, zip_size, page_no,
+					     latch_mode, cursor, mtr);
+		}
+
+		page_cur_open_on_rnd_user_rec(block, page_cursor);
+
+		if (height == 0) {
+
+			break;
+		}
+
+		ut_ad(height > 0);
+
+		height--;
+
+		node_ptr = page_cur_get_rec(page_cursor);
+		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Go to the child node */
+		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree after the
+given path.  NOTE(review): unlike btr_cur_open_at_rnd_pos_func(), a NULL
+page (srv_pass_corrupt_table) is not handled here -- confirm intended.
+@return TRUE if positioned at the first page, with cursor on the first record */
+UNIV_INTERN
+ibool
+btr_cur_open_at_rnd_pos_after_path(
+/*====================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_path_t*	first_rec_path,/*!< in: path slots, root level first */
+	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	btr_path_t*	slot;
+	ibool		is_first_rec	= TRUE;
+	ulint		page_no;
+	ulint		space;
+	ulint		zip_size;
+	ulint		height;
+	rec_t*		node_ptr;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	if (latch_mode == BTR_MODIFY_TREE) {
+		mtr_x_lock(dict_index_get_lock(index), mtr);
+	} else {
+		mtr_s_lock(dict_index_get_lock(index), mtr);
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+	cursor->index = index;
+
+	space = dict_index_get_space(index);
+	zip_size = dict_table_zip_size(index->table);
+	page_no = dict_index_get_page(index);
+
+	height = ULINT_UNDEFINED;
+	slot = first_rec_path;
+
+	for (;;) {
+		buf_block_t*	block;
+		page_t*		page;
+
+		block = buf_page_get_gen(space, zip_size, page_no,
+					 RW_NO_LATCH, NULL, BUF_GET,
+					 __FILE__, __LINE__, mtr);
+		page = buf_block_get_frame(block);
+		ut_ad(0 == ut_dulint_cmp(index->id,
+					 btr_page_get_index_id(page)));
+
+		if (height == ULINT_UNDEFINED) {
+			/* We are in the root node */
+
+			height = btr_page_get_level(page, mtr);
+		}
+
+		if (height == 0) {
+			btr_cur_latch_leaves(page, space, zip_size, page_no,
+					     latch_mode, cursor, mtr);
+		}
+
+		if (is_first_rec && slot->nth_rec != ULINT_UNDEFINED) {
+			if (height == 0) {
+				/* must open the first rec */
+				page_cur_open_on_nth_user_rec(block, page_cursor, slot->nth_rec);
+			} else {
+				is_first_rec = page_cur_open_on_rnd_user_rec_after_nth(block,
+								page_cursor, slot->nth_rec);
+			}
+		} else {
+			is_first_rec = FALSE;
+			page_cur_open_on_rnd_user_rec(block, page_cursor);
+		}
+
+		if (height == 0) {
+			break;
+		}
+
+		ut_ad(height > 0);
+
+		height--;
+		slot++;
+
+		node_ptr = page_cur_get_rec(page_cursor);
+		offsets = rec_get_offsets(node_ptr, cursor->index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Go to the child node */
+		page_no = btr_node_ptr_get_child_page_no(node_ptr, offsets);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return (is_first_rec);
+}
+
+/*==================== B-TREE INSERT =========================*/
+
+/*************************************************************//**
+Inserts a record if there is enough space, or if enough space can
+be freed by reorganizing. Differs from btr_cur_optimistic_insert because
+no heuristic is applied to decide whether it pays to use CPU time for
+reorganizing the page or not.
+@return	pointer to inserted record if successful, else NULL */
+static
+rec_t*
+btr_cur_insert_if_possible(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert; the size info need not
+				have been stored to tuple */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t*	page_cursor;
+	buf_block_t*	block;
+	rec_t*		rec;
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	block = btr_cur_get_block(cursor);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	/* Now, try the insert */
+	rec = page_cur_tuple_insert(page_cursor, tuple,
+				    cursor->index, n_ext, mtr);
+
+	if (UNIV_UNLIKELY(!rec)) {
+		/* If record did not fit, reorganize and retry once */
+
+		if (btr_page_reorganize(block, cursor->index, mtr)) {
+
+			page_cur_search(block, cursor->index, tuple,
+					PAGE_CUR_LE, page_cursor);
+
+			rec = page_cur_tuple_insert(page_cursor, tuple,
+						    cursor->index, n_ext, mtr);
+		}
+	}
+
+	return(rec);
+}
+
+/*************************************************************//**
+Performs the lock check for an insert and, for a clustered index that is
+not the insert buffer index, reports the insert to the undo log and
+stores the resulting roll pointer in entry (unless BTR_KEEP_SYS_FLAG
+is set in flags).
+@return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, or error number */
+UNIV_INLINE
+ulint
+btr_cur_ins_lock_and_undo(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if
+				not zero, the parameters index and thr
+				should be specified */
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool*		inherit)/*!< out: TRUE if the inserted new record maybe
+				should inherit LOCK_GAP type locks from the
+				successor record */
+{
+	dict_index_t*	index	= cursor->index;
+	roll_ptr_t	roll_ptr;
+	ulint		err;
+
+	/* Lock check against the record the cursor is positioned on;
+	an explicit lock request is enqueued if we have to wait */
+
+	err = lock_rec_insert_check_and_lock(flags, btr_cur_get_rec(cursor),
+					     btr_cur_get_block(cursor),
+					     index, thr, mtr, inherit);
+
+	if (err != DB_SUCCESS
+	    || !dict_index_is_clust(index)
+	    || dict_index_is_ibuf(index)) {
+		/* Either the lock check failed, or no undo logging
+		is done for this index */
+
+		return(err);
+	}
+
+	err = trx_undo_report_row_operation(flags, TRX_UNDO_INSERT_OP,
+					    thr, index, entry,
+					    NULL, 0, NULL,
+					    &roll_ptr);
+
+	if (err == DB_SUCCESS && !(flags & BTR_KEEP_SYS_FLAG)) {
+		/* The roll ptr is known now: store it in entry */
+
+		row_upd_index_entry_sys_field(entry, index,
+					      DATA_ROLL_PTR, roll_ptr);
+	}
+
+	return(err);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Prints to stderr a one-line trace: the transaction id, the operation
+text and the output of dict_index_name_print() for the index. */
+static
+void
+btr_cur_trx_report(
+/*===============*/
+	trx_t*			trx,	/*!< in: transaction whose id is printed */
+	const dict_index_t*	index,	/*!< in: index being operated on */
+	const char*		op)	/*!< in: operation text, printed as is */
+{
+	FILE*	out	= stderr;
+
+	fprintf(out, "Trx with id " TRX_ID_FMT " going to ",
+		TRX_ID_PREP_PRINTF(trx->id));
+	fputs(op, out);
+	dict_index_name_print(out, trx, index);
+	fputc('\n', out);
+}
+#endif /* UNIV_DEBUG */
+
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+If entry was converted to a big record and the insert then fails, the
+conversion is undone with dtuple_convert_back_big_rec() before returning,
+and *big_rec stays NULL.
+@return	DB_SUCCESS, DB_WAIT_LOCK, DB_FAIL, DB_TOO_BIG_RECORD,
+DB_CORRUPTION (only if srv_pass_corrupt_table is set and the cursor has
+no block), or error number */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameters index and thr should be
+				specified */
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr)	/*!< in: mtr; if this function returns
+				DB_SUCCESS on a leaf page of a secondary
+				index in a compressed tablespace, the
+				mtr must be committed before latching
+				any further pages */
+{
+	big_rec_t*	big_rec_vec	= NULL;
+	dict_index_t*	index;
+	page_cur_t*	page_cursor;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		max_size;
+	rec_t*		dummy_rec;
+	ibool		leaf;
+	ibool		reorg;
+	ibool		inherit;
+	ulint		zip_size;
+	ulint		rec_size;
+	ulint		err;
+
+	*big_rec = NULL;
+
+	block = btr_cur_get_block(cursor);
+
+	if (srv_pass_corrupt_table && !block) {
+		return(DB_CORRUPTION);
+	}
+	ut_a(block);
+
+	page = buf_block_get_frame(block);
+	index = cursor->index;
+	zip_size = buf_block_get_zip_size(block);
+#ifdef UNIV_DEBUG_VALGRIND
+	if (zip_size) {
+		UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+		UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	if (!dtuple_check_typed_no_assert(entry)) {
+		/* Only a diagnostic: the insert is still attempted.
+		thr may be NULL, so do not dereference it blindly.
+		NOTE(review): this assumes dict_index_name_print()
+		tolerates a NULL trx -- confirm. */
+		fputs("InnoDB: Error in a tuple to insert into ", stderr);
+		dict_index_name_print(stderr,
+				      thr ? thr_get_trx(thr) : NULL, index);
+	}
+#ifdef UNIV_DEBUG
+	if (btr_cur_print_record_ops && thr) {
+		btr_cur_trx_report(thr_get_trx(thr), index, "insert into ");
+		dtuple_print(stderr, entry);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	max_size = page_get_max_insert_size_after_reorganize(page, 1);
+	leaf = page_is_leaf(page);
+
+	/* Calculate the record size when entry is converted to a record */
+	rec_size = rec_get_converted_size(index, entry, n_ext);
+
+	if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
+				   dtuple_get_n_fields(entry), zip_size)) {
+
+		/* The record is so big that we have to store some fields
+		externally on separate database pages */
+		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+			return(DB_TOO_BIG_RECORD);
+		}
+
+		/* The conversion updated n_ext: recompute the size */
+		rec_size = rec_get_converted_size(index, entry, n_ext);
+	}
+
+	if (UNIV_UNLIKELY(zip_size)) {
+		/* Estimate the free space of an empty compressed page.
+		Subtract one byte for the encoded heap_no in the
+		modification log. */
+		ulint	free_space_zip = page_zip_empty_size(
+			cursor->index->n_fields, zip_size) - 1;
+		ulint	n_uniq = dict_index_get_n_unique_in_tree(index);
+
+		ut_ad(dict_table_is_comp(index->table));
+
+		/* There should be enough room for two node pointer
+		records on an empty non-leaf page.  This prevents
+		infinite page splits. */
+
+		if (UNIV_LIKELY(entry->n_fields >= n_uniq)
+		    && UNIV_UNLIKELY(REC_NODE_PTR_SIZE
+				     + rec_get_converted_size_comp_prefix(
+					     index, entry->fields, n_uniq,
+					     NULL)
+				     /* On a compressed page, there is
+				     a two-byte entry in the dense
+				     page directory for every record.
+				     But there is no record header. */
+				     - (REC_N_NEW_EXTRA_BYTES - 2)
+				     > free_space_zip / 2)) {
+
+			if (big_rec_vec) {
+				dtuple_convert_back_big_rec(
+					index, entry, big_rec_vec);
+			}
+
+			return(DB_TOO_BIG_RECORD);
+		}
+	}
+
+	/* If there have been many consecutive inserts, and we are on the leaf
+	level, check if we have to split the page to reserve enough free space
+	for future updates of records. */
+
+	if (dict_index_is_clust(index)
+	    && (page_get_n_recs(page) >= 2)
+	    && UNIV_LIKELY(leaf)
+	    && (dict_index_get_space_reserve() + rec_size > max_size)
+	    && (btr_page_get_split_rec_to_right(cursor, &dummy_rec)
+		|| btr_page_get_split_rec_to_left(cursor, &dummy_rec))) {
+		/* The two labels below are the common failure exits of
+		this function; later code jumps into this block. */
+fail:
+		err = DB_FAIL;
+fail_err:
+
+		if (big_rec_vec) {
+			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+		}
+
+		return(err);
+	}
+
+	if (UNIV_UNLIKELY(max_size < BTR_CUR_PAGE_REORGANIZE_LIMIT
+			  || max_size < rec_size)
+	    && UNIV_LIKELY(page_get_n_recs(page) > 1)
+	    && page_get_max_insert_size(page, 1) < rec_size) {
+
+		goto fail;
+	}
+
+	/* Check locks and write to the undo log, if specified */
+	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+					thr, mtr, &inherit);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+		goto fail_err;
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	/* Now, try the insert */
+
+	{
+		const rec_t* page_cursor_rec = page_cur_get_rec(page_cursor);
+		*rec = page_cur_tuple_insert(page_cursor, entry, index,
+					     n_ext, mtr);
+		/* If the page cursor no longer points to the same record,
+		the insert itself reorganized the page; the assertions
+		below require that this only happens on a compressed page
+		and that the insert succeeded. */
+		reorg = page_cursor_rec != page_cur_get_rec(page_cursor);
+
+		if (UNIV_UNLIKELY(reorg)) {
+			ut_a(zip_size);
+			ut_a(*rec);
+		}
+	}
+
+	if (UNIV_UNLIKELY(!*rec) && UNIV_LIKELY(!reorg)) {
+		/* If the record did not fit, reorganize */
+		if (UNIV_UNLIKELY(!btr_page_reorganize(block, index, mtr))) {
+			ut_a(zip_size);
+
+			goto fail;
+		}
+
+		ut_ad(zip_size
+		      || page_get_max_insert_size(page, 1) == max_size);
+
+		reorg = TRUE;
+
+		page_cur_search(block, index, entry, PAGE_CUR_LE, page_cursor);
+
+		*rec = page_cur_tuple_insert(page_cursor, entry, index,
+					     n_ext, mtr);
+
+		if (UNIV_UNLIKELY(!*rec)) {
+			if (UNIV_LIKELY(zip_size != 0)) {
+
+				goto fail;
+			}
+
+			/* Fatal on an uncompressed page: report and abort.
+			thr may be NULL here as well; see the note on the
+			first dict_index_name_print() call above. */
+			fputs("InnoDB: Error: cannot insert tuple ", stderr);
+			dtuple_print(stderr, entry);
+			fputs(" into ", stderr);
+			dict_index_name_print(stderr,
+					      thr ? thr_get_trx(thr) : NULL,
+					      index);
+			fprintf(stderr, "\nInnoDB: max insert size %lu\n",
+				(ulong) max_size);
+			ut_error;
+		}
+	}
+
+#ifdef BTR_CUR_HASH_ADAPT
+	if (!reorg && leaf && (cursor->flag == BTR_CUR_HASH)) {
+		btr_search_update_hash_node_on_insert(cursor);
+	} else {
+		btr_search_update_hash_on_insert(cursor);
+	}
+#endif
+
+	if (!(flags & BTR_NO_LOCKING_FLAG) && inherit) {
+
+		lock_update_insert(block, *rec);
+	}
+
+#if 0
+	fprintf(stderr, "Insert into page %lu, max ins size %lu,"
+		" rec %lu ind type %lu\n",
+		buf_block_get_page_no(block), max_size,
+		rec_size + PAGE_DIR_SLOT_SIZE, index->type);
+#endif
+	if (leaf && !dict_index_is_clust(index)) {
+		/* Update the free bits of the B-tree page in the
+		insert buffer bitmap. */
+
+		/* The free bits in the insert buffer bitmap must
+		never exceed the free space on a page.  It is safe to
+		decrement or reset the bits in the bitmap in a
+		mini-transaction that is committed before the
+		mini-transaction that affects the free space. */
+
+		/* It is unsafe to increment the bits in a separately
+		committed mini-transaction, because in crash recovery,
+		the free bits could momentarily be set too high. */
+
+		if (zip_size) {
+			/* Update the bits in the same mini-transaction. */
+			ibuf_update_free_bits_zip(block, mtr);
+		} else {
+			/* Decrement the bits in a separate
+			mini-transaction. */
+			ibuf_update_free_bits_if_full(
+				block, max_size,
+				rec_size + PAGE_DIR_SLOT_SIZE);
+		}
+	}
+
+	/* Hand the big rec vector (possibly NULL) over to the caller */
+	*big_rec = big_rec_vec;
+
+	return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+An optimistic insert is tried first; only if it returns DB_FAIL is the
+root raised or the page split to make room.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameter thr should be
+				specified; if no undo logging is specified,
+				then the caller must have reserved enough
+				free extents in the file space so that the
+				insertion will certainly succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
+				cursor stays valid */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_index_t*	index		= cursor->index;
+	ulint		zip_size	= dict_table_zip_size(index->table);
+	big_rec_t*	big_rec_vec	= NULL;
+	ulint		err;
+	ibool		dummy_inh;
+	ibool		success;
+	ulint		n_extents	= 0;
+	ulint		n_reserved;	/* valid only if n_extents > 0 */
+
+	ut_ad(dtuple_check_typed(entry));
+
+	*big_rec = NULL;
+
+	ut_ad(mtr_memo_contains(mtr,
+				dict_index_get_lock(btr_cur_get_index(cursor)),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+
+	/* Try first an optimistic insert; reset the cursor flag: we do not
+	assume anything of how it was positioned */
+
+	cursor->flag = BTR_CUR_BINARY;
+
+	err = btr_cur_optimistic_insert(flags, cursor, entry, rec,
+					big_rec, n_ext, thr, mtr);
+	if (err != DB_FAIL) {
+		/* Either the insert succeeded or it failed for a reason
+		that a split cannot cure */
+
+		return(err);
+	}
+
+	/* Retry with a pessimistic insert. Check locks and write to undo log,
+	if specified */
+
+	err = btr_cur_ins_lock_and_undo(flags, cursor, entry,
+					thr, mtr, &dummy_inh);
+
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	if (!(flags & BTR_NO_UNDO_LOG_FLAG)) {
+		/* First reserve enough free space for the file segments
+		of the index tree, so that the insert will not fail because
+		of lack of space */
+
+		n_extents = cursor->tree_height / 16 + 3;
+
+		success = fsp_reserve_free_extents(&n_reserved, index->space,
+						   n_extents, FSP_NORMAL, mtr);
+		if (!success) {
+			return(DB_OUT_OF_FILE_SPACE);
+		}
+	}
+
+	if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
+				   dict_table_is_comp(index->table),
+				   dict_index_get_n_fields(index),
+				   zip_size)) {
+		/* The record is so big that we have to store some fields
+		externally on separate database pages */
+
+		if (UNIV_LIKELY_NULL(big_rec_vec)) {
+			/* This should never happen, but we handle
+			the situation in a robust manner. */
+			ut_ad(0);
+			dtuple_convert_back_big_rec(index, entry, big_rec_vec);
+		}
+
+		big_rec_vec = dtuple_convert_big_rec(index, entry, &n_ext);
+
+		if (big_rec_vec == NULL) {
+
+			/* Give back the extents reserved above */
+			if (n_extents > 0) {
+				fil_space_release_free_extents(index->space,
+							       n_reserved);
+			}
+			return(DB_TOO_BIG_RECORD);
+		}
+	}
+
+	if (dict_index_get_page(index)
+	    == buf_block_get_page_no(btr_cur_get_block(cursor))) {
+
+		/* The page is the root page */
+		*rec = btr_root_raise_and_insert(cursor, entry, n_ext, mtr);
+	} else {
+		*rec = btr_page_split_and_insert(cursor, entry, n_ext, mtr);
+	}
+
+	ut_ad(page_rec_get_next(btr_cur_get_rec(cursor)) == *rec);
+
+#ifdef BTR_CUR_ADAPT
+	btr_search_update_hash_on_insert(cursor);
+#endif
+	if (!(flags & BTR_NO_LOCKING_FLAG)) {
+
+		lock_update_insert(btr_cur_get_block(cursor), *rec);
+	}
+
+	if (n_extents > 0) {
+		fil_space_release_free_extents(index->space, n_reserved);
+	}
+
+	/* Hand the big rec vector (possibly NULL) over to the caller */
+	*big_rec = big_rec_vec;
+
+	return(DB_SUCCESS);
+}
+
+/*==================== B-TREE UPDATE =========================*/
+
+/*************************************************************//**
+Lock check and undo logging for an update. For a secondary index only
+the lock check is made; for a clustered index the record is lock-checked
+(unless BTR_NO_LOCKING_FLAG is set) and the update is then reported to
+the undo log.
+@return	DB_SUCCESS, DB_WAIT_LOCK, or error number */
+UNIV_INLINE
+ulint
+btr_cur_upd_lock_and_undo(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on record to update */
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	roll_ptr_t*	roll_ptr)/*!< out: roll pointer; written only on
+				the clustered index path */
+{
+	dict_index_t*	index;
+	rec_t*		rec;
+
+	ut_ad(cursor && update && thr && roll_ptr);
+
+	index = cursor->index;
+	rec = btr_cur_get_rec(cursor);
+
+	if (!dict_index_is_clust(index)) {
+		/* Undo logging is done only for clustered index records:
+		a secondary index record just needs the lock check */
+		return(lock_sec_rec_modify_check_and_lock(
+			       flags, btr_cur_get_block(cursor), rec,
+			       index, thr, mtr));
+	}
+
+	if (!(flags & BTR_NO_LOCKING_FLAG)) {
+		/* Check if we have to wait for a lock: an explicit lock
+		request is enqueued if yes */
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		mem_heap_t*	heap		= NULL;
+		ulint*		offsets;
+		ulint		lock_err;
+		rec_offs_init(offsets_);
+
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+
+		lock_err = lock_clust_rec_modify_check_and_lock(
+			flags, btr_cur_get_block(cursor), rec, index,
+			offsets, thr);
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+
+		if (lock_err != DB_SUCCESS) {
+
+			return(lock_err);
+		}
+	}
+
+	/* Append the info about the update in the undo log */
+
+	return(trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP, thr,
+					     index, NULL, update,
+					     cmpl_info, rec, roll_ptr));
+}
+
+/***********************************************************//**
+Writes the redo log record of an in-place record update: one flags byte,
+the values written by row_upd_write_sys_vals_to_log(), the 2-byte page
+offset of the record, and finally the update vector. */
+UNIV_INLINE
+void
+btr_cur_update_in_place_log(
+/*========================*/
+	ulint		flags,		/*!< in: flags */
+	rec_t*		rec,		/*!< in: record */
+	dict_index_t*	index,		/*!< in: index where cursor positioned */
+	const upd_t*	update,		/*!< in: update vector */
+	trx_t*		trx,		/*!< in: transaction */
+	roll_ptr_t	roll_ptr,	/*!< in: roll ptr */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*		page	= page_align(rec);
+	dict_index_t*	clust_index;
+	byte		log_type;
+	byte*		log_ptr;
+
+	/* The flags must fit in the single byte written below */
+	ut_ad(flags < 256);
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	if (page_is_comp(page)) {
+		log_type = MLOG_COMP_REC_UPDATE_IN_PLACE;
+	} else {
+		log_type = MLOG_REC_UPDATE_IN_PLACE;
+	}
+
+	log_ptr = mlog_open_and_write_index(mtr, rec, index, log_type,
+					    1 + DATA_ROLL_PTR_LEN + 14 + 2
+					    + MLOG_BUF_MARGIN);
+
+	if (log_ptr == NULL) {
+		/* Logging in mtr is switched off during crash recovery */
+		return;
+	}
+
+	/* The sys col values are written for the clustered index: switch
+	to the clustered index if a secondary index record is being
+	updated (the values could as well be skipped in that case,
+	because a secondary index record update does not need them) */
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	mach_write_to_1(log_ptr, flags);
+	log_ptr += 1;
+
+	log_ptr = row_upd_write_sys_vals_to_log(clust_index, trx, roll_ptr,
+						log_ptr, mtr);
+
+	mach_write_to_2(log_ptr, page_offset(rec));
+	log_ptr += 2;
+
+	row_upd_index_write_log(update, log_ptr, mtr);
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of updating a record in-place and, if a page
+is given, applies the update to the record at the logged page offset.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index)	/*!< in: index corresponding to page */
+{
+	ulint		flags;
+	ulint		rec_offset;
+	ulint		pos;
+	trx_id_t	trx_id;
+	roll_ptr_t	roll_ptr;
+	upd_t*		update;
+	mem_heap_t*	heap;
+
+	/* One byte of flags */
+	if (end_ptr < ptr + 1) {
+
+		return(NULL);
+	}
+
+	flags = mach_read_from_1(ptr);
+	ptr++;
+
+	/* System column values */
+	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	/* Two bytes of record offset within the page */
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	rec_offset = mach_read_from_2(ptr);
+	ptr += 2;
+
+	ut_a(rec_offset <= UNIV_PAGE_SIZE);
+
+	heap = mem_heap_create(256);
+
+	/* The update vector */
+	ptr = row_upd_index_parse(ptr, end_ptr, heap, &update);
+
+	if (ptr != NULL && page != NULL) {
+		rec_t*	rec;
+		ulint*	offsets;
+
+		ut_a((ibool)!!page_is_comp(page)
+		     == dict_table_is_comp(index->table));
+		rec = page + rec_offset;
+
+		/* We do not need to reserve btr_search_latch, as the page
+		is only being recovered, and there cannot be a hash index
+		to it. */
+
+		offsets = rec_get_offsets(rec, index, NULL,
+					  ULINT_UNDEFINED, &heap);
+
+		if (!(flags & BTR_KEEP_SYS_FLAG)) {
+			row_upd_rec_sys_fields_in_recovery(rec, page_zip,
+							   offsets, pos,
+							   trx_id, roll_ptr);
+		}
+
+		row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+	}
+
+	mem_heap_free(heap);
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Checks whether the page modification log of a compressed page has room
+for logging an update. If it has not and the modification log is
+non-empty, the page is recompressed and the check is repeated.
+@return	TRUE if enough place */
+static
+ibool
+btr_cur_update_alloc_zip(
+/*=====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	buf_block_t*	block,	/*!< in/out: buffer page */
+	dict_index_t*	index,	/*!< in: the index corresponding to the block */
+	ulint		length,	/*!< in: size needed */
+	ibool		create,	/*!< in: TRUE=delete-and-insert,
+				FALSE=update-in-place */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	ulint	clustered;
+	page_t*	frame;
+
+	ut_a(page_zip == buf_block_get_page_zip(block));
+	ut_ad(page_zip);
+	ut_ad(!dict_index_is_ibuf(index));
+
+	clustered = dict_index_is_clust(index);
+
+	if (page_zip_available(page_zip, clustered, length, create)) {
+		return(TRUE);
+	}
+
+	if (!page_zip->m_nonempty) {
+		/* The page has been freshly compressed, so
+		recompressing it will not help. */
+		return(FALSE);
+	}
+
+	frame = buf_block_get_frame(block);
+
+	if (!page_zip_compress(page_zip, frame, index, mtr)) {
+		/* Unable to compress the page */
+		return(FALSE);
+	}
+
+	/* After recompressing a page, we must make sure that the free
+	bits in the insert buffer bitmap will not exceed the free
+	space on the page.  Because recompression is attempted only
+	after the first page_zip_available() check failed, it is safe
+	to reset the free bits if the check fails again here.  The
+	free bits can safely be reset in a separate mini-transaction.
+	If the check succeeds, we can be sure that the
+	page_zip_compress() above did not reduce the free space
+	available on the page. */
+
+	if (page_zip_available(page_zip, clustered, length, create)) {
+		return(TRUE);
+	}
+
+	/* Out of space: reset the free bits. */
+	if (!clustered && page_is_leaf(frame)) {
+		ibuf_reset_free_bits(block);
+	}
+
+	return(FALSE);
+}
+
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+We assume here that the ordering fields of the record do not change.
+Any heap allocated for the record offsets is freed on every exit path.
+@return	DB_SUCCESS, DB_ZIP_OVERFLOW if the compressed page has no room
+to log the update, or error number */
+UNIV_INTERN
+ulint
+btr_cur_update_in_place(
+/*====================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; must be committed before
+				latching any further pages */
+{
+	dict_index_t*	index;
+	buf_block_t*	block;
+	page_zip_des_t*	page_zip;
+	ulint		err;
+	rec_t*		rec;
+	roll_ptr_t	roll_ptr	= ut_dulint_zero;
+	trx_t*		trx;
+	ulint		was_delete_marked;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	rec = btr_cur_get_rec(cursor);
+	index = cursor->index;
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	/* The insert buffer tree should never be updated in place. */
+	ut_ad(!dict_index_is_ibuf(index));
+
+	trx = thr_get_trx(thr);
+	/* This call may allocate heap; from here on every exit must go
+	through func_exit so that the heap is released. */
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+#ifdef UNIV_DEBUG
+	if (btr_cur_print_record_ops && thr) {
+		btr_cur_trx_report(trx, index, "update ");
+		rec_print_new(stderr, rec, offsets);
+	}
+#endif /* UNIV_DEBUG */
+
+	block = btr_cur_get_block(cursor);
+	page_zip = buf_block_get_page_zip(block);
+
+	/* Check that enough space is available on the compressed page. */
+	if (UNIV_LIKELY_NULL(page_zip)
+	    && !btr_cur_update_alloc_zip(page_zip, block, index,
+					 rec_offs_size(offsets), FALSE, mtr)) {
+		err = DB_ZIP_OVERFLOW;
+		goto func_exit;
+	}
+
+	/* Do lock checking and undo logging */
+	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+					thr, mtr, &roll_ptr);
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+
+		goto func_exit;
+	}
+
+	if (block->is_hashed) {
+		/* The function row_upd_changes_ord_field_binary works only
+		if the update vector was built for a clustered index, we must
+		NOT call it if index is secondary */
+
+		if (!dict_index_is_clust(index)
+		    || row_upd_changes_ord_field_binary(NULL, index, update)) {
+
+			/* Remove possible hash index pointer to this record */
+			btr_search_update_hash_on_delete(cursor);
+		}
+
+		/* Held while the record is modified below */
+		rw_lock_x_lock(&btr_search_latch);
+	}
+
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
+		row_upd_rec_sys_fields(rec, NULL,
+				       index, offsets, trx, roll_ptr);
+	}
+
+	/* Remember the delete mark before the update, to detect below
+	whether the update cleared it */
+	was_delete_marked = rec_get_deleted_flag(
+		rec, page_is_comp(buf_block_get_frame(block)));
+
+	row_upd_rec_in_place(rec, index, offsets, update, page_zip);
+
+	if (block->is_hashed) {
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+
+	if (page_zip && !dict_index_is_clust(index)
+	    && page_is_leaf(buf_block_get_frame(block))) {
+		/* Update the free bits in the insert buffer. */
+		ibuf_update_free_bits_zip(block, mtr);
+	}
+
+	btr_cur_update_in_place_log(flags, rec, index, update,
+				    trx, roll_ptr, mtr);
+
+	if (was_delete_marked
+	    && !rec_get_deleted_flag(rec, page_is_comp(
+					     buf_block_get_frame(block)))) {
+		/* The new updated record owns its possible externally
+		stored fields */
+
+		btr_cur_unmark_extern_fields(page_zip,
+					     rec, index, offsets, mtr);
+	}
+
+	/* err is DB_SUCCESS here: it passed the check after the
+	lock and undo call above */
+func_exit:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended. We assume here that the ordering
+fields of the record do not change.
+If the update changes no field size and involves no externally stored
+field, the work is delegated to btr_cur_update_in_place(); otherwise the
+old record is deleted and the updated entry is inserted in its place.
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
+DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
+there is not enough space left on the compressed page */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	const upd_t*	update,	/*!< in: update vector; this must also
+				contain trx id and roll ptr fields */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; must be committed before
+				latching any further pages */
+{
+	dict_index_t*	index;
+	page_cur_t*	page_cursor;
+	ulint		err;
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	rec_t*		rec;
+	rec_t*		orig_rec;
+	ulint		max_size;
+	ulint		new_rec_size;
+	ulint		old_rec_size;
+	dtuple_t*	new_entry;
+	/* btr_cur_upd_lock_and_undo() does not write the roll pointer
+	on its secondary index path, so start from a defined value, as
+	btr_cur_update_in_place() does. */
+	roll_ptr_t	roll_ptr	= ut_dulint_zero;
+	trx_t*		trx;
+	mem_heap_t*	heap;
+	ulint		i;
+	ulint		n_ext;
+	ulint*		offsets;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	orig_rec = rec = btr_cur_get_rec(cursor);
+	index = cursor->index;
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* The insert buffer tree should never be updated in place. */
+	ut_ad(!dict_index_is_ibuf(index));
+
+	/* The heap holds the offsets and, later, the copied entry; it is
+	freed on every exit path of this function. */
+	heap = mem_heap_create(1024);
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+#ifdef UNIV_DEBUG
+	if (btr_cur_print_record_ops && thr) {
+		btr_cur_trx_report(thr_get_trx(thr), index, "update ");
+		rec_print_new(stderr, rec, offsets);
+	}
+#endif /* UNIV_DEBUG */
+
+	if (!row_upd_changes_field_size_or_external(index, offsets, update)) {
+
+		/* The simplest and the most common case: the update does not
+		change the size of any field and none of the updated fields is
+		externally stored in rec or update, and there is enough space
+		on the compressed page to log the update. */
+
+		mem_heap_free(heap);
+		return(btr_cur_update_in_place(flags, cursor, update,
+					       cmpl_info, thr, mtr));
+	}
+
+	if (rec_offs_any_extern(offsets)) {
+		/* The loop below jumps to this label as well */
+any_extern:
+		/* Externally stored fields are treated in pessimistic
+		update */
+
+		mem_heap_free(heap);
+		return(DB_OVERFLOW);
+	}
+
+	for (i = 0; i < upd_get_n_fields(update); i++) {
+		if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
+
+			goto any_extern;
+		}
+	}
+
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+					   &n_ext, heap);
+	/* We checked above that there are no externally stored fields. */
+	ut_a(!n_ext);
+
+	/* The page containing the clustered index record
+	corresponding to new_entry is latched in mtr.
+	Thus the following call is safe. */
+	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+						     FALSE, heap);
+	old_rec_size = rec_offs_size(offsets);
+	new_rec_size = rec_get_converted_size(index, new_entry, 0);
+
+	page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (UNIV_LIKELY_NULL(page_zip)
+	    && !btr_cur_update_alloc_zip(page_zip, block, index,
+					 new_rec_size, TRUE, mtr)) {
+		err = DB_ZIP_OVERFLOW;
+		goto err_exit;
+	}
+
+	/* A record of at least half the free space of an empty page is
+	not handled here */
+	if (UNIV_UNLIKELY(new_rec_size
+			  >= (page_get_free_space_of_empty(page_is_comp(page))
+			      / 2))) {
+
+		err = DB_OVERFLOW;
+		goto err_exit;
+	}
+
+	if (UNIV_UNLIKELY(page_get_data_size(page)
+			  - old_rec_size + new_rec_size
+			  < BTR_CUR_PAGE_COMPRESS_LIMIT)) {
+
+		/* The page would become too empty */
+
+		err = DB_UNDERFLOW;
+		goto err_exit;
+	}
+
+	/* Space available to the new record once the old one is gone */
+	max_size = old_rec_size
+		+ page_get_max_insert_size_after_reorganize(page, 1);
+
+	if (!(((max_size >= BTR_CUR_PAGE_REORGANIZE_LIMIT)
+	       && (max_size >= new_rec_size))
+	      || (page_get_n_recs(page) <= 1))) {
+
+		/* There was not enough space, or it did not pay to
+		reorganize: for simplicity, we decide what to do assuming a
+		reorganization is needed, though it might not be necessary */
+
+		err = DB_OVERFLOW;
+		goto err_exit;
+	}
+
+	/* Do lock checking and undo logging */
+	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+					thr, mtr, &roll_ptr);
+	if (err != DB_SUCCESS) {
+
+		goto err_exit;
+	}
+
+	/* Ok, we may do the replacement. Store on the page infimum the
+	explicit locks on rec, before deleting rec (see the comment in
+	btr_cur_pessimistic_update). */
+
+	lock_rec_store_on_page_infimum(block, rec);
+
+	btr_search_update_hash_on_delete(cursor);
+
+	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
+	invokes rec_offs_make_valid() to point to the copied record that
+	the fields of new_entry point to.  We have to undo it here. */
+	ut_ad(rec_offs_validate(NULL, index, offsets));
+	rec_offs_make_valid(page_cur_get_rec(page_cursor), index, offsets);
+
+	page_cur_delete_rec(page_cursor, index, offsets, mtr);
+
+	/* Step back so that the insert below happens after the
+	predecessor of the deleted record */
+	page_cur_move_to_prev(page_cursor);
+
+	trx = thr_get_trx(thr);
+
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
+		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+					      roll_ptr);
+		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+					      trx->id);
+	}
+
+	/* There are no externally stored columns in new_entry */
+	rec = btr_cur_insert_if_possible(cursor, new_entry, 0/*n_ext*/, mtr);
+	ut_a(rec); /* <- We calculated above the insert would fit */
+
+	if (page_zip && !dict_index_is_clust(index)
+	    && page_is_leaf(page)) {
+		/* Update the free bits in the insert buffer. */
+		ibuf_update_free_bits_zip(block, mtr);
+	}
+
+	/* Restore the old explicit lock state on the record */
+
+	lock_rec_restore_from_page_infimum(block, rec, block);
+
+	/* Move the page cursor forward again, undoing the step back
+	made before the insert */
+	page_cur_move_to_next(page_cursor);
+
+	err = DB_SUCCESS;
+err_exit:
+	mem_heap_free(heap);
+	return(err);
+}
+
+/*************************************************************//**
+Repairs the lock state of the supremum record of the preceding page after
+a pessimistic update. If a split made the supremum of the previous page the
+predecessor of the updated record, that supremum must inherit exactly the
+locks on the updated record; during the split it may instead have inherited
+locks from the successor of the updated record. Does nothing when rec is
+not the first user record on its page. */
+static
+void
+btr_cur_pess_upd_restore_supremum(
+/*==============================*/
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*	rec,	/*!< in: updated record */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		rec_page	= buf_block_get_frame(block);
+	buf_block_t*	left_block;
+	ulint		left_page_no;
+
+	if (page_rec_get_next(page_get_infimum_rec(rec_page)) != rec) {
+		/* rec has a user record predecessor on its own page:
+		no supremum of another page is involved */
+
+		return;
+	}
+
+	left_page_no = btr_page_get_prev(rec_page, mtr);
+
+	ut_ad(left_page_no != FIL_NULL);
+	left_block = buf_page_get_with_no_latch(buf_block_get_space(block),
+						buf_block_get_zip_size(block),
+						left_page_no, mtr);
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_page_get_next(left_block->frame, mtr)
+	     == page_get_page_no(rec_page));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* The caller is expected to hold an x-latch on the left
+	brother already; no new latch is acquired here */
+	ut_ad(mtr_memo_contains(mtr, left_block, MTR_MEMO_PAGE_X_FIX));
+
+	/* Let the supremum of the left brother inherit from rec */
+	lock_rec_reset_and_inherit_gap_locks(left_block, block,
+					     PAGE_HEAP_NO_SUPREMUM,
+					     page_rec_get_heap_no(rec));
+}
+
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist. We assume
+here that the ordering fields of the record do not change.
+
+The function first tries btr_cur_optimistic_update(); only if that
+reports DB_UNDERFLOW, DB_OVERFLOW or DB_ZIP_OVERFLOW does it continue
+with the pessimistic path: the old record is deleted from the page and
+the updated entry is inserted again, first on the same page and, if it
+does not fit there, with btr_cur_pessimistic_insert().
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging, locking, and rollback
+				flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL;
+				if *heap is NULL a heap is created here and
+				is not freed by this function */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is allowed to
+				also contain trx id and roll ptr fields, but
+				the values in update vector have no effect */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; must be committed before
+				latching any further pages */
+{
+	big_rec_t*	big_rec_vec	= NULL;
+	big_rec_t*	dummy_big_rec;
+	dict_index_t*	index;
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	rec_t*		rec;
+	page_cur_t*	page_cursor;
+	dtuple_t*	new_entry;
+	ulint		err;
+	ulint		optim_err;
+	roll_ptr_t	roll_ptr;
+	trx_t*		trx;
+	ibool		was_first;
+	ulint		n_extents	= 0;
+	ulint		n_reserved;
+	ulint		n_ext;
+	ulint*		offsets		= NULL;
+
+	*big_rec = NULL;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+	rec = btr_cur_get_rec(cursor);
+	index = cursor->index;
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	/* The insert buffer tree should never be updated in place. */
+	ut_ad(!dict_index_is_ibuf(index));
+
+	optim_err = btr_cur_optimistic_update(flags, cursor, update,
+					      cmpl_info, thr, mtr);
+
+	/* Only the three results below lead to the pessimistic path; any
+	other result of the optimistic attempt is returned to the caller
+	unchanged. */
+	switch (optim_err) {
+	case DB_UNDERFLOW:
+	case DB_OVERFLOW:
+	case DB_ZIP_OVERFLOW:
+		break;
+	default:
+		return(optim_err);
+	}
+
+	/* Do lock checking and undo logging */
+	err = btr_cur_upd_lock_and_undo(flags, cursor, update, cmpl_info,
+					thr, mtr, &roll_ptr);
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	if (optim_err == DB_OVERFLOW) {
+		ulint	reserve_flag;
+
+		/* First reserve enough free space for the file segments
+		of the index tree, so that the update will not fail because
+		of lack of space */
+
+		n_extents = cursor->tree_height / 16 + 3;
+
+		if (flags & BTR_NO_UNDO_LOG_FLAG) {
+			reserve_flag = FSP_CLEANING;
+		} else {
+			reserve_flag = FSP_NORMAL;
+		}
+
+		/* n_reserved is passed only here; its use at
+		return_after_reservations is guarded by n_extents > 0,
+		which can hold only if this branch was taken. */
+		if (!fsp_reserve_free_extents(&n_reserved, index->space,
+					      n_extents, reserve_flag, mtr)) {
+			return(DB_OUT_OF_FILE_SPACE);
+		}
+	}
+
+	/* From here on every exit goes through return_after_reservations
+	so that a reservation made above is released. */
+
+	if (!*heap) {
+		*heap = mem_heap_create(1024);
+	}
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, heap);
+
+	trx = thr_get_trx(thr);
+
+	new_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+					   &n_ext, *heap);
+	/* The call to row_rec_to_index_entry(ROW_COPY_DATA, ...) above
+	invokes rec_offs_make_valid() to point to the copied record that
+	the fields of new_entry point to.  We have to undo it here. */
+	ut_ad(rec_offs_validate(NULL, index, offsets));
+	rec_offs_make_valid(rec, index, offsets);
+
+	/* The page containing the clustered index record
+	corresponding to new_entry is latched in mtr.  If the
+	clustered index record is delete-marked, then its externally
+	stored fields cannot have been purged yet, because then the
+	purge would also have removed the clustered index record
+	itself.  Thus the following call is safe. */
+	row_upd_index_replace_new_col_vals_index_pos(new_entry, index, update,
+						     FALSE, *heap);
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
+		row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
+					      roll_ptr);
+		row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
+					      trx->id);
+	}
+
+	if ((flags & BTR_NO_UNDO_LOG_FLAG) && rec_offs_any_extern(offsets)) {
+		/* We are in a transaction rollback undoing a row
+		update: we must free possible externally stored fields
+		which got new values in the update, if they are not
+		inherited values. They can be inherited if we have
+		updated the primary key to another value, and then
+		update it back again. */
+
+		ut_ad(big_rec_vec == NULL);
+
+		btr_rec_free_updated_extern_fields(
+			index, rec, page_zip, offsets, update,
+			trx_is_recv(trx) ? RB_RECOVERY : RB_NORMAL, mtr);
+	}
+
+	/* We have to set appropriate extern storage bits in the new
+	record to be inserted: we have to remember which fields were such */
+
+	ut_ad(!page_is_comp(page) || !rec_get_node_ptr_flag(rec));
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, heap);
+	n_ext += btr_push_update_extern_fields(new_entry, update, *heap);
+
+	/* Decide whether the new entry is too big to be stored locally.
+	The make_external label lies inside the body of the else-if so
+	that the compressed-page branch can jump to the same conversion
+	code. */
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		ut_ad(page_is_comp(page));
+		if (page_zip_rec_needs_ext(
+			    rec_get_converted_size(index, new_entry, n_ext),
+			    TRUE,
+			    dict_index_get_n_fields(index),
+			    page_zip_get_size(page_zip))) {
+
+			goto make_external;
+		}
+	} else if (page_zip_rec_needs_ext(
+			   rec_get_converted_size(index, new_entry, n_ext),
+			   page_is_comp(page), 0, 0)) {
+make_external:
+		big_rec_vec = dtuple_convert_big_rec(index, new_entry, &n_ext);
+		if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
+
+			err = DB_TOO_BIG_RECORD;
+			goto return_after_reservations;
+		}
+	}
+
+	/* Store state of explicit locks on rec on the page infimum record,
+	before deleting rec. The page infimum acts as a dummy carrier of the
+	locks, taking care also of lock releases, before we can move the locks
+	back on the actual record. There is a special case: if we are
+	inserting on the root page and the insert causes a call of
+	btr_root_raise_and_insert. Therefore we cannot in the lock system
+	delete the lock structs set on the root page even if the root
+	page carries just node pointers. */
+
+	lock_rec_store_on_page_infimum(block, rec);
+
+	btr_search_update_hash_on_delete(cursor);
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	page_cursor = btr_cur_get_page_cur(cursor);
+
+	/* Remove the old version of the record and step the page cursor
+	back, then try to put the new version on the same page. */
+	page_cur_delete_rec(page_cursor, index, offsets, mtr);
+
+	page_cur_move_to_prev(page_cursor);
+
+	rec = btr_cur_insert_if_possible(cursor, new_entry, n_ext, mtr);
+
+	if (rec) {
+		/* The new version fitted on the page: move the stashed
+		locks from the infimum of the original block to it. */
+		lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+						   rec, block);
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, heap);
+
+		if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+			/* The new inserted record owns its possible externally
+			stored fields */
+			btr_cur_unmark_extern_fields(page_zip,
+						     rec, index, offsets, mtr);
+		}
+
+		btr_cur_compress_if_useful(cursor, mtr);
+
+		if (page_zip && !dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			/* Update the free bits in the insert buffer. */
+			ibuf_update_free_bits_zip(block, mtr);
+		}
+
+		err = DB_SUCCESS;
+		goto return_after_reservations;
+	} else {
+		/* The insert did not fit on the page. This is asserted
+		never to happen when the optimistic attempt reported
+		DB_UNDERFLOW. */
+		ut_a(optim_err != DB_UNDERFLOW);
+
+		/* Out of space: reset the free bits. */
+		if (!dict_index_is_clust(index)
+		    && page_is_leaf(page)) {
+			ibuf_reset_free_bits(block);
+		}
+	}
+
+	/* Was the record to be updated positioned as the first user
+	record on its page? */
+	was_first = page_cur_is_before_first(page_cursor);
+
+	/* The first parameter means that no lock checking and undo logging
+	is made in the insert */
+
+	err = btr_cur_pessimistic_insert(BTR_NO_UNDO_LOG_FLAG
+					 | BTR_NO_LOCKING_FLAG
+					 | BTR_KEEP_SYS_FLAG,
+					 cursor, new_entry, &rec,
+					 &dummy_big_rec, n_ext, NULL, mtr);
+	/* The pessimistic insert is required to succeed: a failure here
+	aborts via ut_a() instead of being reported to the caller. */
+	ut_a(rec);
+	ut_a(err == DB_SUCCESS);
+	ut_a(dummy_big_rec == NULL);
+
+	if (dict_index_is_sec_or_ibuf(index)) {
+		/* Update PAGE_MAX_TRX_ID in the index page header.
+		It was not updated by btr_cur_pessimistic_insert()
+		because of BTR_NO_LOCKING_FLAG. */
+		buf_block_t*	rec_block;
+
+		rec_block = btr_cur_get_block(cursor);
+
+		page_update_max_trx_id(rec_block,
+				       buf_block_get_page_zip(rec_block),
+				       trx->id, mtr);
+	}
+
+	if (!rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+		/* The new inserted record owns its possible externally
+		stored fields */
+		buf_block_t*	rec_block = btr_cur_get_block(cursor);
+
+		/* page and page_zip are re-pointed to the block the cursor
+		is on now; note that page is reassigned only in
+		UNIV_ZIP_DEBUG builds, where it is needed for the
+		validation at return_after_reservations. */
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+		page = buf_block_get_frame(rec_block);
+#endif /* UNIV_ZIP_DEBUG */
+		page_zip = buf_block_get_page_zip(rec_block);
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, heap);
+		btr_cur_unmark_extern_fields(page_zip,
+					     rec, index, offsets, mtr);
+	}
+
+	lock_rec_restore_from_page_infimum(btr_cur_get_block(cursor),
+					   rec, block);
+
+	/* If necessary, restore also the correct lock state for a new,
+	preceding supremum record created in a page split. While the old
+	record was nonexistent, the supremum might have inherited its locks
+	from a wrong record. */
+
+	if (!was_first) {
+		btr_cur_pess_upd_restore_supremum(btr_cur_get_block(cursor),
+						  rec, mtr);
+	}
+
+return_after_reservations:
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* n_extents is nonzero only if extents were reserved above */
+	if (n_extents > 0) {
+		fil_space_release_free_extents(index->space, n_reserved);
+	}
+
+	/* Hand the vector of fields to be stored externally (possibly
+	NULL) over to the caller. */
+	*big_rec = big_rec_vec;
+
+	return(err);
+}
+
+/*==================== B-TREE DELETE MARK AND UNMARK ===============*/
+
+/****************************************************************//**
+Emits the redo log record describing the setting or clearing of the
+delete mark of a clustered index record. Nothing is written if the log
+buffer of the mini-transaction could not be opened. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_clust_rec_log(
+/*===============================*/
+	ulint		flags,	/*!< in: flags */
+	rec_t*		rec,	/*!< in: record */
+	dict_index_t*	index,	/*!< in: index of the record */
+	ibool		val,	/*!< in: value to set */
+	trx_t*		trx,	/*!< in: deleting transaction */
+	roll_ptr_t	roll_ptr,/*!< in: roll ptr to the undo log record */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	log_ptr;
+	byte	log_type;
+
+	/* Both flags and val are stored in one byte each below */
+	ut_ad(flags < 256);
+	ut_ad(val <= 1);
+
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+	if (page_rec_is_comp(rec)) {
+		log_type = MLOG_COMP_REC_CLUST_DELETE_MARK;
+	} else {
+		log_type = MLOG_REC_CLUST_DELETE_MARK;
+	}
+
+	/* Space requested: 1 byte for flags, 1 byte for val, 2 bytes for
+	the page offset of rec; the remaining DATA_ROLL_PTR_LEN + 14 bytes
+	are presumably for row_upd_write_sys_vals_to_log() (TODO confirm) */
+	log_ptr = mlog_open_and_write_index(mtr, rec, index, log_type,
+					    1 + 1 + DATA_ROLL_PTR_LEN
+					    + 14 + 2);
+
+	if (log_ptr == NULL) {
+		/* Logging in mtr is switched off during crash recovery */
+		return;
+	}
+
+	mach_write_to_1(log_ptr, flags);
+	mach_write_to_1(log_ptr + 1, val);
+	log_ptr += 2;
+
+	log_ptr = row_upd_write_sys_vals_to_log(index, trx, roll_ptr, log_ptr,
+						mtr);
+
+	mach_write_to_2(log_ptr, page_offset(rec));
+
+	mlog_close(mtr, log_ptr + 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses a redo log record that sets or clears the delete mark of a
+clustered index record and, if a page is given, applies it to that page.
+The record body consists of a flags byte, a value byte, the system field
+values read by row_upd_parse_sys_vals(), and a 2-byte page offset.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index)	/*!< in: index corresponding to page */
+{
+	ulint		flags;
+	ulint		val;
+	ulint		pos;
+	ulint		offset;
+	trx_id_t	trx_id;
+	roll_ptr_t	roll_ptr;
+	rec_t*		rec;
+
+	ut_ad(!page
+	      || !!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	/* flags and val: one byte each */
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	flags = mach_read_from_1(ptr);
+	val = mach_read_from_1(ptr + 1);
+	ptr += 2;
+
+	ptr = row_upd_parse_sys_vals(ptr, end_ptr, &pos, &trx_id, &roll_ptr);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	/* page offset of the record: two bytes */
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	ptr += 2;
+
+	ut_a(offset <= UNIV_PAGE_SIZE);
+
+	if (!page) {
+		/* Nothing to apply the record to: parse only */
+
+		return(ptr);
+	}
+
+	rec = page + offset;
+
+	/* We do not need to reserve btr_search_latch, as the page
+	is only being recovered, and there cannot be a hash index to
+	it. */
+
+	btr_rec_set_deleted_flag(rec, page_zip, val);
+
+	if (!(flags & BTR_KEEP_SYS_FLAG)) {
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		offsets;
+		rec_offs_init(offsets_);
+
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+
+		row_upd_rec_sys_fields_in_recovery(rec, page_zip, offsets,
+						   pos, trx_id, roll_ptr);
+
+		/* rec_get_offsets() may have created a heap */
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Sets the delete mark of the clustered index record under the cursor to
+val. The record is first lock-checked and an undo log record is reported
+for the change. Unless BTR_KEEP_SYS_FLAG is set in flags, the system
+fields of the record are then updated with the transaction of thr and the
+roll ptr obtained from the undo logging. Finally the change is written to
+the redo log.
+@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor */
+	ibool		val,	/*!< in: value to set */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	dict_index_t*	index;
+	buf_block_t*	block;
+	page_zip_des_t*	page_zip;
+	rec_t*		rec;
+	trx_t*		trx;
+	roll_ptr_t	roll_ptr;
+	ulint		err;
+	rec_offs_init(offsets_);
+
+	index = cursor->index;
+	rec = btr_cur_get_rec(cursor);
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+#ifdef UNIV_DEBUG
+	if (btr_cur_print_record_ops && thr) {
+		btr_cur_trx_report(thr_get_trx(thr), index, "del mark ");
+		rec_print_new(stderr, rec, offsets);
+	}
+#endif /* UNIV_DEBUG */
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(!rec_get_deleted_flag(rec, rec_offs_comp(offsets)));
+
+	block = btr_cur_get_block(cursor);
+
+	err = lock_clust_rec_modify_check_and_lock(flags, block,
+						   rec, index, offsets, thr);
+
+	if (err == DB_SUCCESS) {
+		err = trx_undo_report_row_operation(flags, TRX_UNDO_MODIFY_OP,
+						    thr, index, NULL, NULL, 0,
+						    rec, &roll_ptr);
+	}
+
+	if (err == DB_SUCCESS) {
+		page_zip = buf_block_get_page_zip(block);
+		trx = thr_get_trx(thr);
+
+		/* The record is modified under btr_search_latch when
+		the block is flagged as hashed */
+		if (block->is_hashed) {
+			rw_lock_x_lock(&btr_search_latch);
+		}
+
+		btr_rec_set_deleted_flag(rec, page_zip, val);
+
+		if (!(flags & BTR_KEEP_SYS_FLAG)) {
+			row_upd_rec_sys_fields(rec, page_zip,
+					       index, offsets, trx, roll_ptr);
+		}
+
+		if (block->is_hashed) {
+			rw_lock_x_unlock(&btr_search_latch);
+		}
+
+		btr_cur_del_mark_set_clust_rec_log(flags, rec, index, val,
+						   trx, roll_ptr, mtr);
+	}
+
+	/* rec_get_offsets() may have created a heap */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(err);
+}
+
+/****************************************************************//**
+Emits the redo log record describing the setting or clearing of the
+delete mark of a secondary index record. After the initial log record
+header, the body is one byte for the value followed by two bytes for the
+page offset of the record. */
+UNIV_INLINE
+void
+btr_cur_del_mark_set_sec_rec_log(
+/*=============================*/
+	rec_t*		rec,	/*!< in: record */
+	ibool		val,	/*!< in: value to set */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	body;
+	ut_ad(val <= 1);
+
+	body = mlog_open(mtr, 11 + 1 + 2);
+
+	if (body != NULL) {
+		body = mlog_write_initial_log_record_fast(
+			rec, MLOG_REC_SEC_DELETE_MARK, body, mtr);
+
+		mach_write_to_1(body, val);
+		mach_write_to_2(body + 1, page_offset(rec));
+
+		mlog_close(mtr, body + 1 + 2);
+	}
+
+	/* Otherwise logging in mtr is switched off during crash
+	recovery: in that case mlog_open returns NULL and nothing
+	is written */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/****************************************************************//**
+Parses a redo log record that sets or clears the delete mark of a
+secondary index record and, if a page is given, applies it to that page.
+The record body is three bytes: the value, then the 2-byte page offset.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip)/*!< in/out: compressed page, or NULL */
+{
+	ulint	mark;
+	ulint	rec_offset;
+
+	if (end_ptr < ptr + 3) {
+		/* The record is not completely in the buffer */
+
+		return(NULL);
+	}
+
+	mark = mach_read_from_1(ptr);
+	rec_offset = mach_read_from_2(ptr + 1);
+	ptr += 3;
+
+	ut_a(rec_offset <= UNIV_PAGE_SIZE);
+
+	if (page) {
+		/* We do not need to reserve btr_search_latch, as the page
+		is only being recovered, and there cannot be a hash index to
+		it. */
+
+		btr_rec_set_deleted_flag(page + rec_offset, page_zip, mark);
+	}
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Sets the delete mark of the secondary index record under the cursor to
+val. The record is lock-checked first; if that does not return
+DB_SUCCESS, nothing is modified and the result of the lock check is
+returned. Otherwise the flag is changed and the change is redo logged.
+@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+	ulint		flags,	/*!< in: locking flag */
+	btr_cur_t*	cursor,	/*!< in: cursor */
+	ibool		val,	/*!< in: value to set */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block	= btr_cur_get_block(cursor);
+	rec_t*		rec	= btr_cur_get_rec(cursor);
+	ulint		err;
+
+#ifdef UNIV_DEBUG
+	if (btr_cur_print_record_ops && thr) {
+		btr_cur_trx_report(thr_get_trx(thr), cursor->index,
+				   "del mark ");
+		rec_print(stderr, rec, cursor->index);
+	}
+#endif /* UNIV_DEBUG */
+
+	err = lock_sec_rec_modify_check_and_lock(flags, block,
+						 rec, cursor->index, thr, mtr);
+
+	if (err == DB_SUCCESS) {
+		ut_ad(!!page_rec_is_comp(rec)
+		      == dict_table_is_comp(cursor->index->table));
+
+		/* The record is modified under btr_search_latch when
+		the block is flagged as hashed */
+		if (block->is_hashed) {
+			rw_lock_x_lock(&btr_search_latch);
+		}
+
+		btr_rec_set_deleted_flag(rec, buf_block_get_page_zip(block),
+					 val);
+
+		if (block->is_hashed) {
+			rw_lock_x_unlock(&btr_search_latch);
+		}
+
+		btr_cur_del_mark_set_sec_rec_log(rec, val, mtr);
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Removes the delete mark from a secondary index record and writes the
+corresponding redo log record.  This function is only used by the insert
+buffer insert merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+	rec_t*		rec,		/*!< in/out: record to delete unmark */
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
+					corresponding to rec, or NULL
+					when the tablespace is
+					uncompressed */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	/* The same value is applied to the record and written to the log */
+	const ibool	mark	= FALSE;
+
+	/* We do not need to reserve btr_search_latch, as the page has just
+	been read to the buffer pool and there cannot be a hash index to it. */
+
+	btr_rec_set_deleted_flag(rec, page_zip, mark);
+
+	btr_cur_del_mark_set_sec_rec_log(rec, mark, mtr);
+}
+
+/*==================== B-TREE RECORD REMOVE =========================*/
+
+/*************************************************************//**
+Compresses the page of the cursor if btr_cur_compress_recommendation()
+advises so. It is assumed that mtr holds an x-latch on the tree and on
+the cursor page. To avoid deadlocks, mtr must also own x-latches to
+brothers of page, if those brothers exist. NOTE: it is assumed that the
+caller has reserved enough free extents so that the compression will
+always succeed if done!
+@return	TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to compress;
+				cursor does not stay valid if compression
+				occurs */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(mtr_memo_contains(mtr,
+				dict_index_get_lock(btr_cur_get_index(cursor)),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+
+	if (!btr_cur_compress_recommendation(cursor, mtr)) {
+		/* Not worth trying: btr_compress() is not called */
+
+		return(FALSE);
+	}
+
+	return(btr_compress(cursor, mtr) ? TRUE : FALSE);
+}
+
+/*******************************************************//**
+Deletes the record under the tree cursor from its leaf page, provided
+that the record has no externally stored fields and
+btr_cur_can_delete_without_compress() allows it; otherwise the page is
+left untouched. It is assumed that the mtr has an x-latch on the page
+where the cursor is positioned, but no latch on the whole tree.
+@return	TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on leaf page, on the record to
+				delete; cursor stays valid: if deletion
+				succeeds, on function exit it points to the
+				successor of the deleted record */
+	mtr_t*		mtr)	/*!< in: mtr; if this function returns
+				TRUE on a leaf page of a secondary
+				index, the mtr must be committed
+				before latching any further pages */
+{
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	buf_block_t*	block;
+	rec_t*		rec;
+	ibool		deleted;
+	rec_offs_init(offsets_);
+
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+	/* This is intended only for leaf page deletions */
+
+	block = btr_cur_get_block(cursor);
+
+	/* NOTE(review): this function is declared ibool, yet this path
+	returns the error code DB_CORRUPTION; callers that test the
+	result as a boolean will not see FALSE here if DB_CORRUPTION is
+	nonzero -- confirm that this is intended. */
+	if (srv_pass_corrupt_table && !block) {
+		return(DB_CORRUPTION);
+	}
+	ut_a(block);
+
+	ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+	rec = btr_cur_get_rec(cursor);
+	offsets = rec_get_offsets(rec, cursor->index, offsets,
+				  ULINT_UNDEFINED, &heap);
+
+	if (rec_offs_any_extern(offsets)) {
+		/* Records with externally stored fields are not
+		deleted here */
+		deleted = FALSE;
+	} else if (btr_cur_can_delete_without_compress(
+			   cursor, rec_offs_size(offsets), mtr)) {
+		deleted = TRUE;
+	} else {
+		deleted = FALSE;
+	}
+
+	if (deleted) {
+
+		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
+		page_t*		page	= buf_block_get_frame(block);
+		ulint		max_ins	= 0;
+
+		lock_update_delete(block, rec);
+
+		btr_search_update_hash_on_delete(cursor);
+
+		/* For an uncompressed page, sample the maximum insert
+		size before the record is removed; it is passed to the
+		free bits update below */
+		if (!page_zip) {
+			max_ins = page_get_max_insert_size_after_reorganize(
+				page, 1);
+		}
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+		page_cur_delete_rec(btr_cur_get_page_cur(cursor),
+				    cursor->index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+		/* The insert buffer does not handle inserts to
+		clustered indexes, to non-leaf pages of secondary index
+		B-trees, or to the insert buffer: the free bits are
+		updated only for the remaining case. */
+		if (!dict_index_is_clust(cursor->index)
+		    && !dict_index_is_ibuf(cursor->index)
+		    && page_is_leaf(page)) {
+
+			if (page_zip) {
+				ibuf_update_free_bits_zip(block, mtr);
+			} else {
+				ibuf_update_free_bits_low(block, max_ins,
+							  mtr);
+			}
+		}
+	}
+
+	/* rec_get_offsets() may have created a heap */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(deleted);
+}
+
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+
+If the record is the only one on a non-root page, the whole page is
+discarded instead and TRUE is returned. If the extent reservation made
+here fails, nothing is deleted, *err is set to DB_OUT_OF_FILE_SPACE and
+FALSE is returned.
+@return	TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	ulint*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that he knows that the operation
+				will succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	dict_index_t*	index;
+	rec_t*		rec;
+	dtuple_t*	node_ptr;
+	ulint		n_extents	= 0;
+	ulint		n_reserved;
+	ibool		success;
+	ibool		ret		= FALSE;
+	ulint		level;
+	mem_heap_t*	heap;
+	ulint*		offsets;
+
+	block = btr_cur_get_block(cursor);
+	page = buf_block_get_frame(block);
+	index = btr_cur_get_index(cursor);
+
+	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	if (!has_reserved_extents) {
+		/* First reserve enough free space for the file segments
+		of the index tree, so that the node pointer updates will
+		not fail because of lack of space */
+
+		n_extents = cursor->tree_height / 32 + 1;
+
+		/* n_reserved is passed only here; its use at
+		return_after_reservations is guarded by n_extents > 0,
+		which can hold only if this branch was taken. */
+		success = fsp_reserve_free_extents(&n_reserved,
+						   index->space,
+						   n_extents,
+						   FSP_CLEANING, mtr);
+		if (!success) {
+			*err = DB_OUT_OF_FILE_SPACE;
+
+			return(FALSE);
+		}
+	}
+
+	/* The heap is created only after the early return above, so that
+	path has nothing to free. It holds offsets and node_ptr. */
+	heap = mem_heap_create(1024);
+	rec = btr_cur_get_rec(cursor);
+	page_zip = buf_block_get_page_zip(block);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+	if (rec_offs_any_extern(offsets)) {
+		btr_rec_free_externally_stored_fields(index,
+						      rec, offsets, page_zip,
+						      rb_ctx, mtr);
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	}
+
+	if (UNIV_UNLIKELY(page_get_n_recs(page) < 2)
+	    && UNIV_UNLIKELY(dict_index_get_page(index)
+			     != buf_block_get_page_no(block))) {
+
+		/* If there is only one record, drop the whole page in
+		btr_discard_page, if this is not the root page */
+
+		btr_discard_page(cursor, mtr);
+
+		*err = DB_SUCCESS;
+		ret = TRUE;
+
+		goto return_after_reservations;
+	}
+
+	lock_update_delete(block, rec);
+	level = btr_page_get_level(page, mtr);
+
+	/* Deleting the first user record of a non-leaf page needs extra
+	work, done before the record itself is removed from the page. */
+	if (level > 0
+	    && UNIV_UNLIKELY(rec == page_rec_get_next(
+				     page_get_infimum_rec(page)))) {
+
+		rec_t*	next_rec = page_rec_get_next(rec);
+
+		if (btr_page_get_prev(page, mtr) == FIL_NULL) {
+
+			/* If we delete the leftmost node pointer on a
+			non-leaf level, we must mark the new leftmost node
+			pointer as the predefined minimum record */
+
+			/* This will make page_zip_validate() fail until
+			page_cur_delete_rec() completes.  This is harmless,
+			because everything will take place within a single
+			mini-transaction and because writing to the redo log
+			is an atomic operation (performed by mtr_commit()). */
+			btr_set_min_rec_mark(next_rec, mtr);
+		} else {
+			/* Otherwise, if we delete the leftmost node pointer
+			on a page, we have to change the father node pointer
+			so that it is equal to the new leftmost node pointer
+			on the page */
+
+			btr_node_ptr_delete(index, block, mtr);
+
+			node_ptr = dict_index_build_node_ptr(
+				index, next_rec, buf_block_get_page_no(block),
+				heap, level);
+
+			btr_insert_on_non_leaf_level(index,
+						     level + 1, node_ptr, mtr);
+		}
+	}
+
+	btr_search_update_hash_on_delete(cursor);
+
+	page_cur_delete_rec(btr_cur_get_page_cur(cursor), index, offsets, mtr);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	ut_ad(btr_check_node_ptr(index, block, mtr));
+
+	*err = DB_SUCCESS;
+
+return_after_reservations:
+	/* offsets and node_ptr, which live in heap, are not used below */
+	mem_heap_free(heap);
+
+	/* ret is TRUE here only if the page was discarded above; in every
+	other case a compression of the page is attempted. */
+	if (ret == FALSE) {
+		ret = btr_cur_compress_if_useful(cursor, mtr);
+	}
+
+	/* n_extents is nonzero only if extents were reserved above */
+	if (n_extents > 0) {
+		fil_space_release_free_extents(index->space, n_reserved);
+	}
+
+	return(ret);
+}
+
+/*******************************************************************//**
+Records in cursor->path_arr the position of the cursor on the current
+page, for which the binary search has been performed. The slot used is
+the one at index root_height - height; on the leaf level the slot after
+the last one is marked with ULINT_UNDEFINED to terminate the path. If
+root_height is too large for the array, only an empty path is stored. */
+static
+void
+btr_cur_add_path_info(
+/*==================*/
+	btr_cur_t*	cursor,		/*!< in: cursor positioned on a page */
+	ulint		height,		/*!< in: height of the page in tree;
+					0 means leaf node */
+	ulint		root_height)	/*!< in: root node height in tree */
+{
+	btr_path_t*	path;
+	btr_path_t*	cur_slot;
+	rec_t*		rec;
+
+	ut_a(cursor->path_arr);
+
+	path = cursor->path_arr;
+
+	if (root_height >= BTR_PATH_ARRAY_N_SLOTS - 1) {
+		/* Do nothing; return empty path */
+
+		path[0].nth_rec = ULINT_UNDEFINED;
+
+		return;
+	}
+
+	if (height == 0) {
+		/* Mark end of slots for path */
+		path[root_height + 1].nth_rec = ULINT_UNDEFINED;
+	}
+
+	rec = btr_cur_get_rec(cursor);
+
+	cur_slot = &path[root_height - height];
+
+	cur_slot->nth_rec = page_rec_get_n_recs_before(rec);
+	cur_slot->n_recs = page_get_n_recs(page_align(rec));
+}
+
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+A cursor is positioned at each end of the range, each in its own
+mini-transaction, with cursor.path_arr pointing at path1 (range start) and
+path2 (range end). The two path arrays are then compared slot by slot,
+starting from slot 0, and the estimate is derived from the record positions
+(nth_rec) and record counts (n_recs) stored in the slots.
+@return	estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
+	ulint		mode1,	/*!< in: search mode for range start */
+	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
+	ulint		mode2)	/*!< in: search mode for range end */
+{
+	btr_path_t	path1[BTR_PATH_ARRAY_N_SLOTS];
+	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
+	btr_cur_t	cursor;
+	btr_path_t*	slot1;
+	btr_path_t*	slot2;
+	ibool		diverged;
+	ibool		diverged_lot;
+	ulint		divergence_level;
+	ib_int64_t	n_rows;
+	ulint		i;
+	mtr_t		mtr;
+
+	/* First pass: position the cursor at the start of the range, with
+	path1 installed as the cursor's path array. */
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path1;
+
+	if (dtuple_get_n_fields(tuple1) > 0) {
+
+		btr_cur_search_to_nth_level(index, 0, tuple1, mode1,
+					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					    &cursor, 0,
+					    __FILE__, __LINE__, &mtr);
+	} else {
+		/* Empty tuple: open the cursor at an index side (first
+		argument TRUE) instead of searching for a tuple */
+		btr_cur_open_at_index_side(TRUE, index,
+					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					   &cursor, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	/* Second pass: the same for the end of the range, with path2
+	installed as the cursor's path array. This runs in a separate
+	mini-transaction from the first pass. */
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path2;
+
+	if (dtuple_get_n_fields(tuple2) > 0) {
+
+		btr_cur_search_to_nth_level(index, 0, tuple2, mode2,
+					    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					    &cursor, 0,
+					    __FILE__, __LINE__, &mtr);
+	} else {
+		/* Empty tuple: open the cursor at an index side (first
+		argument FALSE) instead of searching for a tuple */
+		btr_cur_open_at_index_side(FALSE, index,
+					   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+					   &cursor, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	/* We have the path information for the range in path1 and path2 */
+
+	n_rows = 1;
+	diverged = FALSE;	    /* This becomes true when the path is not
+				    the same any more */
+	diverged_lot = FALSE;	    /* This becomes true when the paths are
+				    not the same or adjacent any more */
+	divergence_level = 1000000; /* This is the level where paths diverged
+				    a lot */
+	/* The loop has no exit condition of its own: it ends through one of
+	the return statements below, normally when a slot whose nth_rec is
+	ULINT_UNDEFINED is reached in either path. */
+	for (i = 0; ; i++) {
+		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+		slot1 = path1 + i;
+		slot2 = path2 + i;
+
+		if (slot1->nth_rec == ULINT_UNDEFINED
+		    || slot2->nth_rec == ULINT_UNDEFINED) {
+
+			/* End of at least one path: finalize the estimate */
+
+			if (i > divergence_level + 1) {
+				/* In trees whose height is > 1 our algorithm
+				tends to underestimate: multiply the estimate
+				by 2: */
+
+				n_rows = n_rows * 2;
+			}
+
+			/* Do not estimate the number of rows in the range
+			to over 1 / 2 of the estimated rows in the whole
+			table */
+
+			if (n_rows > index->table->stat_n_rows / 2) {
+				n_rows = index->table->stat_n_rows / 2;
+
+				/* If there are just 0 or 1 rows in the table,
+				then we estimate all rows are in the range */
+
+				if (n_rows == 0) {
+					n_rows = index->table->stat_n_rows;
+				}
+			}
+
+			return(n_rows);
+		}
+
+		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+			/* First slot at which the two paths point to
+			different record positions */
+
+			diverged = TRUE;
+
+			if (slot1->nth_rec < slot2->nth_rec) {
+				n_rows = slot2->nth_rec - slot1->nth_rec;
+
+				/* A difference of more than one record means
+				the paths are not adjacent */
+				if (n_rows > 1) {
+					diverged_lot = TRUE;
+					divergence_level = i;
+				}
+			} else {
+				/* Maybe the tree has changed between
+				searches */
+
+				/* Give up and return a fixed guess of 10 */
+				return(10);
+			}
+
+		} else if (diverged && !diverged_lot) {
+
+			/* The paths differed by exactly one record on an
+			earlier slot. Check whether on this slot there are
+			records after the position in path1 or more than one
+			record up to the position in path2. */
+
+			if (slot1->nth_rec < slot1->n_recs
+			    || slot2->nth_rec > 1) {
+
+				diverged_lot = TRUE;
+				divergence_level = i;
+
+				n_rows = 0;
+
+				/* Records from the path1 position to the end
+				of its page */
+				if (slot1->nth_rec < slot1->n_recs) {
+					n_rows += slot1->n_recs
+						- slot1->nth_rec;
+				}
+
+				/* Records before the path2 position on its
+				page */
+				if (slot2->nth_rec > 1) {
+					n_rows += slot2->nth_rec - 1;
+				}
+			}
+		} else if (diverged_lot) {
+
+			/* Below the level of wide divergence: scale the
+			estimate by the average of the record counts of the
+			two pages on this slot */
+
+			n_rows = (n_rows * (slot1->n_recs + slot2->n_recs))
+				/ 2;
+		}
+	}
+}
+
+/*******************************************************************//**
+Estimates the number of pages which have not null value of the key of n_cols.
+A tuple of n_cols NULL fields is searched for with mode PAGE_CUR_G while
+path1 is installed as the cursor's path array; a second cursor is opened at
+an index side with the local path2. The two paths are then compared slot by
+slot in the same way as in btr_estimate_n_rows_in_range(), except that the
+loop stops one slot earlier (it tests the slot following the current one for
+ULINT_UNDEFINED) and the result is capped using index->stat_n_leaf_pages.
+@return	estimated number of pages */
+UNIV_INTERN
+ulint
+btr_estimate_n_pages_not_null(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		n_cols,	/*!< in: The cols should be not null */
+	btr_path_t*	path1)	/*!< in/out: path1[BTR_PATH_ARRAY_N_SLOTS];
+				installed as cursor.path_arr for the search
+				with the all-NULL tuple and read back below */
+{
+	dtuple_t*	tuple1;
+	btr_path_t	path2[BTR_PATH_ARRAY_N_SLOTS];
+	btr_cur_t	cursor;
+	btr_path_t*	slot1;
+	btr_path_t*	slot2;
+	ibool		diverged;
+	ibool		diverged_lot;
+	ulint		divergence_level;
+	ulint		n_pages;
+	ulint		i;
+	mtr_t		mtr;
+	mem_heap_t*	heap;
+
+	/* Temporary heap that holds tuple1; freed after both searches */
+	heap = mem_heap_create(n_cols * sizeof(dfield_t)
+				+ sizeof(dtuple_t));
+
+	/* Build tuple1 with n_cols fields, typed after the index columns
+	and all set to NULL: (NULL, NULL, ...) */
+	tuple1 = dtuple_create(heap, n_cols);
+	dict_index_copy_types(tuple1, index, n_cols);
+
+	for (i = 0; i < n_cols; i++) {
+		dfield_set_null(dtuple_get_nth_field(tuple1, i));
+	}
+
+	/* First pass: search for tuple1 with mode PAGE_CUR_G, with the
+	caller's path1 as the path array */
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path1;
+
+	btr_cur_search_to_nth_level(index, 0, tuple1, PAGE_CUR_G,
+				    BTR_SEARCH_LEAF | BTR_ESTIMATE,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	mtr_commit(&mtr);
+
+
+
+	/* Second pass: open the cursor at an index side (first argument
+	FALSE), with the local path2 as the path array */
+
+	mtr_start(&mtr);
+
+	cursor.path_arr = path2;
+
+	btr_cur_open_at_index_side(FALSE, index,
+				   BTR_SEARCH_LEAF | BTR_ESTIMATE,
+				   &cursor, &mtr);
+
+	mtr_commit(&mtr);
+
+	mem_heap_free(heap);
+
+	/* We have the path information for the range in path1 and path2 */
+
+	n_pages = 1;
+	diverged = FALSE;	    /* This becomes true when the path is not
+				    the same any more */
+	diverged_lot = FALSE;	    /* This becomes true when the paths are
+				    not the same or adjacent any more */
+	divergence_level = 1000000; /* This is the level where paths diverged
+				    a lot */
+	/* The loop ends only through one of the return statements below */
+	for (i = 0; ; i++) {
+		ut_ad(i < BTR_PATH_ARRAY_N_SLOTS);
+
+		slot1 = path1 + i;
+		slot2 = path2 + i;
+
+		/* Unlike btr_estimate_n_rows_in_range(), the end marker is
+		looked for in the NEXT slot, so the last filled slot of a
+		path is not processed by the code below.
+		NOTE(review): presumably this skips the leaf level so that
+		the result counts pages rather than records - confirm. */
+		if ((slot1 + 1)->nth_rec == ULINT_UNDEFINED
+		    || (slot2 + 1)->nth_rec == ULINT_UNDEFINED) {
+
+			if (i > divergence_level + 1) {
+				/* In trees whose height is > 1 our algorithm
+				tends to underestimate: multiply the estimate
+				by 2: */
+
+				n_pages = n_pages * 2;
+			}
+
+			/* Do not estimate the number of pages to over
+			1 / 2 of the number of leaf pages recorded in
+			index->stat_n_leaf_pages */
+
+			if (n_pages > index->stat_n_leaf_pages / 2) {
+				n_pages = index->stat_n_leaf_pages / 2;
+
+				/* If stat_n_leaf_pages is just 0 or 1,
+				then we return stat_n_leaf_pages itself */
+
+				if (n_pages == 0) {
+					n_pages = index->stat_n_leaf_pages;
+				}
+			}
+
+			return(n_pages);
+		}
+
+		if (!diverged && slot1->nth_rec != slot2->nth_rec) {
+
+			/* First slot at which the two paths point to
+			different record positions */
+
+			diverged = TRUE;
+
+			if (slot1->nth_rec < slot2->nth_rec) {
+				n_pages = slot2->nth_rec - slot1->nth_rec;
+
+				/* A difference of more than one record means
+				the paths are not adjacent */
+				if (n_pages > 1) {
+					diverged_lot = TRUE;
+					divergence_level = i;
+				}
+			} else {
+				/* Maybe the tree has changed between
+				searches */
+
+				/* Give up and return a fixed guess of 10 */
+				return(10);
+			}
+
+		} else if (diverged && !diverged_lot) {
+
+			/* The paths differed by exactly one record on an
+			earlier slot. Check whether on this slot there are
+			records after the position in path1 or more than one
+			record up to the position in path2. */
+
+			if (slot1->nth_rec < slot1->n_recs
+			    || slot2->nth_rec > 1) {
+
+				diverged_lot = TRUE;
+				divergence_level = i;
+
+				n_pages = 0;
+
+				/* Records from the path1 position to the end
+				of its page */
+				if (slot1->nth_rec < slot1->n_recs) {
+					n_pages += slot1->n_recs
+						- slot1->nth_rec;
+				}
+
+				/* Records before the path2 position on its
+				page */
+				if (slot2->nth_rec > 1) {
+					n_pages += slot2->nth_rec - 1;
+				}
+			}
+		} else if (diverged_lot) {
+
+			/* Below the level of wide divergence: scale the
+			estimate by the average of the record counts of the
+			two pages on this slot */
+
+			n_pages = (n_pages * (slot1->n_recs + slot2->n_recs))
+				/ 2;
+		}
+	}
+}
+
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals.
+
+The function samples up to srv_stats_sample_pages leaf pages at random
+positions, counts on each page the borders between records whose n-column
+prefixes differ, and scales the counts to the whole index.
+
+When srv_stats_method is SRV_STATS_METHOD_IGNORE_NULLS, the sampling is
+restricted to the part of the index estimated by
+btr_estimate_n_pages_not_null() ("effective pages"); if that estimate is 0,
+every stat_n_diff_key_vals[] entry is set to index->stat_n_leaf_pages and
+the function returns without sampling. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+	dict_index_t*	index)	/*!< in: index */
+{
+	btr_cur_t	cursor;
+	page_t*		page;
+	rec_t*		rec;
+	ulint		n_cols;
+	ulint		matched_fields;
+	ulint		matched_bytes;
+	ib_int64_t*	n_diff;
+	ullint		n_sample_pages; /* number of pages to sample */
+	ulint		not_empty_flag	= 0;
+	ulint		total_external_size = 0;
+	ulint		i;
+	ulint		j;
+	ullint		add_on;
+	mtr_t		mtr;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_rec_[REC_OFFS_NORMAL_SIZE];
+	ulint		offsets_next_rec_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets_rec	= offsets_rec_;
+	ulint*		offsets_next_rec= offsets_next_rec_;
+	ulint		stats_method	= srv_stats_method;
+	btr_path_t	first_rec_path[BTR_PATH_ARRAY_N_SLOTS];
+	ulint		effective_pages; /* effective leaf pages */
+	rec_offs_init(offsets_rec_);
+	rec_offs_init(offsets_next_rec_);
+
+	n_cols = dict_index_get_n_unique(index);
+
+	if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+		/* estimate effective pages and path for the first effective
+		record */
+		/* TODO: make it work also for n_cols > 1. */
+		effective_pages = btr_estimate_n_pages_not_null(
+			index, 1 /*k*/, first_rec_path);
+
+		if (!effective_pages) {
+			/* Nothing to sample: store stat_n_leaf_pages in
+			every entry. n_diff has not been allocated yet, so
+			there is nothing to free on this path. */
+			dict_index_stat_mutex_enter(index);
+			for (j = 0; j <= n_cols; j++) {
+				index->stat_n_diff_key_vals[j]
+					= (ib_int64_t)index->stat_n_leaf_pages;
+			}
+			dict_index_stat_mutex_exit(index);
+			return;
+		} else if (effective_pages > index->stat_n_leaf_pages) {
+			effective_pages = index->stat_n_leaf_pages;
+		}
+	} else {
+		effective_pages = index->stat_n_leaf_pages;
+	}
+
+	/* n_diff[j] counts the borders seen between different j-column
+	prefixes; the array starts zero-filled */
+	n_diff = mem_zalloc((n_cols + 1) * sizeof(ib_int64_t));
+
+	/* It makes no sense to test more pages than are contained
+	in the index, thus we lower the number if it is too high */
+	if (srv_stats_sample_pages > effective_pages) {
+		if (effective_pages > 0) {
+			n_sample_pages = effective_pages;
+		} else {
+			n_sample_pages = 1;
+		}
+	} else {
+		n_sample_pages = srv_stats_sample_pages;
+	}
+
+	/* We sample some pages in the index to get an estimate */
+
+	for (i = 0; i < n_sample_pages; i++) {
+		rec_t*	supremum;
+		ibool	is_first_page = TRUE;
+		mtr_start(&mtr);
+
+		if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+			is_first_page = btr_cur_open_at_rnd_pos_after_path(
+				index, BTR_SEARCH_LEAF,
+				first_rec_path, &cursor, &mtr);
+		} else {
+			btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
+						&cursor, &mtr);
+		}
+
+		/* Count the number of different key values for each prefix of
+		the key on this index page. If the prefix does not determine
+		the index record uniquely in the B-tree, then we subtract one
+		because otherwise our algorithm would give a wrong estimate
+		for an index where there is just one key value. */
+
+		page = btr_cur_get_page(&cursor);
+
+		if (srv_pass_corrupt_table && !page) {
+			/* No page was obtained. The mini-transaction was
+			started above, so it must be committed before the
+			loop is abandoned; otherwise whatever it has
+			acquired would never be released. */
+			mtr_commit(&mtr);
+			break;
+		}
+		ut_a(page);
+
+		supremum = page_get_supremum_rec(page);
+		if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS
+		    && is_first_page) {
+			/* the cursor should be the first record of the page. */
+			/* Counting should be started from here. */
+			rec = btr_cur_get_rec(&cursor);
+		} else {
+			rec = page_rec_get_next(page_get_infimum_rec(page));
+		}
+
+		if (rec != supremum) {
+			not_empty_flag = 1;
+			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+						      ULINT_UNDEFINED, &heap);
+		}
+
+		/* Compare each record with its successor until the
+		successor is the supremum record */
+		while (rec != supremum) {
+			rec_t*  next_rec;
+			next_rec = page_rec_get_next(rec);
+			if (next_rec == supremum) {
+				break;
+			}
+
+			matched_fields = 0;
+			matched_bytes = 0;
+			offsets_next_rec = rec_get_offsets(next_rec, index,
+							   offsets_next_rec,
+							   n_cols, &heap);
+
+			/* Only NULLS_NOT_EQUAL is passed through as such;
+			every other stats method, IGNORE_NULLS included, is
+			compared as NULLS_EQUAL */
+			cmp_rec_rec_with_match(rec, next_rec,
+					       offsets_rec, offsets_next_rec,
+					       index, &matched_fields,
+					       &matched_bytes,
+				(stats_method==SRV_STATS_METHOD_NULLS_NOT_EQUAL) ?
+				SRV_STATS_METHOD_NULLS_NOT_EQUAL :
+				SRV_STATS_METHOD_NULLS_EQUAL);
+
+			for (j = matched_fields + 1; j <= n_cols; j++) {
+				/* We add one if this index record has
+				a different prefix from the previous */
+
+				n_diff[j]++;
+			}
+
+			total_external_size
+				+= btr_rec_get_externally_stored_len(
+					rec, offsets_rec);
+
+			rec = next_rec;
+			/* Initialize offsets_rec for the next round
+			and assign the old offsets_rec buffer to
+			offsets_next_rec. */
+			{
+				ulint*	offsets_tmp = offsets_rec;
+				offsets_rec = offsets_next_rec;
+				offsets_next_rec = offsets_tmp;
+			}
+		}
+
+
+		if (n_cols == dict_index_get_n_unique_in_tree(index)) {
+
+			/* If there is more than one leaf page in the tree,
+			we add one because we know that the first record
+			on the page certainly had a different prefix than the
+			last record on the previous index page in the
+			alphabetical order. Before this fix, if there was
+			just one big record on each clustered index page, the
+			algorithm grossly underestimated the number of rows
+			in the table. */
+
+			if (btr_page_get_prev(page, &mtr) != FIL_NULL
+			    || btr_page_get_next(page, &mtr) != FIL_NULL) {
+
+				n_diff[n_cols]++;
+			}
+		}
+
+		/* Account for the external size of the record on which the
+		loop above stopped; the loop only added the sizes of the
+		records before it */
+		offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+					      ULINT_UNDEFINED, &heap);
+		total_external_size += btr_rec_get_externally_stored_len(
+			rec, offsets_rec);
+		mtr_commit(&mtr);
+	}
+
+	/* If we saw k borders between different key values on
+	n_sample_pages leaf pages, we can estimate how many
+	there will be in index->stat_n_leaf_pages */
+
+	/* We must take into account that our sample actually represents
+	also the pages used for external storage of fields (those pages are
+	included in index->stat_n_leaf_pages) */
+
+	dict_index_stat_mutex_enter(index);
+
+	for (j = 0; j <= n_cols; j++) {
+		index->stat_n_diff_key_vals[j]
+			= ((n_diff[j]
+			    * (ib_int64_t)effective_pages
+			    + n_sample_pages - 1
+			    + total_external_size
+			    + not_empty_flag)
+			   / (n_sample_pages
+			      + total_external_size));
+
+		/* If the tree is small, smaller than
+		10 * n_sample_pages + total_external_size, then
+		the above estimate is ok. For bigger trees it is common that we
+		do not see any borders between key values in the few pages
+		we pick. But still there may be n_sample_pages
+		different key values, or even more. Let us try to approximate
+		that: */
+
+		add_on = effective_pages
+			/ (10 * (n_sample_pages
+				 + total_external_size));
+
+		if (add_on > n_sample_pages) {
+			add_on = n_sample_pages;
+		}
+
+		index->stat_n_diff_key_vals[j] += add_on;
+
+		if (stats_method == SRV_STATS_METHOD_IGNORE_NULLS) {
+			/* index->stat_n_diff_key_vals[k] is used for calc
+			rec_per_key, as
+			"stats.records / index->stat_n_diff_key_vals[x]".
+			So it should be adjusted to the value which is based
+			on whole of the index. effective_pages is nonzero
+			here because the zero case returned early above. */
+			index->stat_n_diff_key_vals[j] =
+				index->stat_n_diff_key_vals[j]
+				* (ib_int64_t)index->stat_n_leaf_pages
+					/ (ib_int64_t)effective_pages;
+		}
+	}
+
+	dict_index_stat_mutex_exit(index);
+
+	mem_free(n_diff);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
+
+/***********************************************************//**
+Gets the externally stored size of a record, in units of a database page.
+The length of every externally stored field is read from its field
+reference, rounded up to a multiple of UNIV_PAGE_SIZE and summed; the sum
+is then divided by UNIV_PAGE_SIZE.
+@return	externally stored part, in units of a database page */
+static
+ulint
+btr_rec_get_externally_stored_len(
+/*==============================*/
+	rec_t*		rec,	/*!< in: record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	field_count;
+	ulint	field_no;
+	ulint	aligned_bytes = 0;
+
+	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+	field_count = rec_offs_n_fields(offsets);
+
+	for (field_no = 0; field_no < field_count; field_no++) {
+		byte*	field_ref;
+		ulint	field_len;
+		ulint	blob_len;
+
+		if (!rec_offs_nth_extern(offsets, field_no)) {
+			/* Stored entirely in the record */
+			continue;
+		}
+
+		field_ref = rec_get_nth_field(rec, offsets, field_no,
+					      &field_len);
+
+		/* The field reference is the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the local part */
+		field_ref += field_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+		/* Only the 4 bytes at offset BTR_EXTERN_LEN + 4 of the
+		reference are read as the length */
+		blob_len = mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4);
+
+		aligned_bytes += ut_calc_align(blob_len, UNIV_PAGE_SIZE);
+	}
+
+	return(aligned_bytes / UNIV_PAGE_SIZE);
+}
+
+/*******************************************************************//**
+Sets the ownership bit of an externally stored field in a record.
+The bit lives in the byte at offset BTR_EXTERN_LEN of the field reference:
+val == TRUE clears BTR_EXTERN_OWNER_FLAG, val == FALSE sets it. */
+static
+void
+btr_cur_set_ownership_of_extern_field(
+/*==================================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: clustered index record */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		i,	/*!< in: field number */
+	ibool		val,	/*!< in: value to set */
+	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
+{
+	byte*	flag_byte;
+	ulint	field_len;
+	ulint	flags;
+
+	flag_byte = rec_get_nth_field(rec, offsets, i, &field_len);
+
+	ut_a(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* Advance to the byte of the field reference that carries the
+	flag; the reference is the tail of the locally stored field */
+	flag_byte += field_len - BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN;
+
+	flags = mach_read_from_1(flag_byte);
+
+	flags = val
+		? (flags & (~BTR_EXTERN_OWNER_FLAG))
+		: (flags | BTR_EXTERN_OWNER_FLAG);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		/* Compressed page: write the byte directly and then pass
+		the BLOB pointer on to the compressed page */
+		mach_write_to_1(flag_byte, flags);
+		page_zip_write_blob_ptr(page_zip, rec, index, offsets, i, mtr);
+
+		return;
+	}
+
+	if (UNIV_LIKELY(mtr != NULL)) {
+		/* Write through the mini-transaction */
+		mlog_write_ulint(flag_byte, flags, MLOG_1BYTE, mtr);
+	} else {
+		/* No mtr given: plain write */
+		mach_write_to_1(flag_byte, flags);
+	}
+}
+
+/*******************************************************************//**
+Marks not updated extern fields as not-owned by this record. The ownership
+is transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field.
+An externally stored field counts as updated when its number appears as
+field_no in the update vector; with update == NULL no field counts as
+updated.
+@return TRUE if BLOB ownership was transferred */
+UNIV_INTERN
+ibool
+btr_cur_mark_extern_inherited_fields(
+/*=================================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
+{
+	ulint	n_fields;
+	ulint	col;
+	ibool	transferred = FALSE;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+
+	if (!rec_offs_any_extern(offsets)) {
+		/* No externally stored fields at all */
+
+		return(FALSE);
+	}
+
+	n_fields = rec_offs_n_fields(offsets);
+
+	for (col = 0; col < n_fields; col++) {
+		ibool	in_update = FALSE;
+		ulint	k;
+
+		if (!rec_offs_nth_extern(offsets, col)) {
+			continue;
+		}
+
+		/* Look the field up in the update vector, if there is one */
+
+		if (update) {
+			for (k = 0; k < upd_get_n_fields(update); k++) {
+				if (upd_get_nth_field(update, k)->field_no
+				    == col) {
+
+					in_update = TRUE;
+					break;
+				}
+			}
+		}
+
+		if (in_update) {
+			/* Updated fields keep their ownership */
+			continue;
+		}
+
+		btr_cur_set_ownership_of_extern_field(
+			page_zip, rec, index, offsets, col, FALSE, mtr);
+
+		transferred = TRUE;
+	}
+
+	return(transferred);
+}
+
+/*******************************************************************//**
+The complement of the previous function: in an update entry may inherit
+some externally stored fields from a record. We must mark them as inherited
+in entry, so that they are not freed in a rollback.
+For every externally stored field of entry whose number is not a field_no
+of the update vector, BTR_EXTERN_INHERITED_FLAG is set in the byte at
+offset BTR_EXTERN_LEN of the field reference. */
+UNIV_INTERN
+void
+btr_cur_mark_dtuple_inherited_extern(
+/*=================================*/
+	dtuple_t*	entry,		/*!< in/out: updated entry to be
+					inserted to clustered index */
+	const upd_t*	update)		/*!< in: update vector */
+{
+	ulint		col;
+
+	for (col = 0; col < dtuple_get_n_fields(entry); col++) {
+
+		dfield_t*	dfield = dtuple_get_nth_field(entry, col);
+		ibool		in_update = FALSE;
+		byte*		flag_byte;
+		ulint		k;
+
+		if (!dfield_is_ext(dfield)) {
+			continue;
+		}
+
+		/* Is the field one of the updated ones? */
+
+		for (k = 0; k < upd_get_n_fields(update); k++) {
+			if (upd_get_nth_field(update, k)->field_no == col) {
+
+				in_update = TRUE;
+				break;
+			}
+		}
+
+		if (in_update) {
+			continue;
+		}
+
+		/* The field reference is the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the field data */
+		flag_byte = (byte*) dfield_get_data(dfield)
+			+ dfield_get_len(dfield)
+			- BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN;
+
+		*flag_byte |= BTR_EXTERN_INHERITED_FLAG;
+	}
+}
+
+/*******************************************************************//**
+Marks all extern fields in a record as owned by the record. This function
+should be called if the delete mark of a record is removed: a not delete
+marked record always owns all its extern fields.
+Each externally stored field is passed to
+btr_cur_set_ownership_of_extern_field() with val == TRUE. */
+static
+void
+btr_cur_unmark_extern_fields(
+/*=========================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	mtr_t*		mtr)	/*!< in: mtr, or NULL if not logged */
+{
+	ulint	n_fields;
+	ulint	col;
+
+	ut_ad(!rec_offs_comp(offsets) || !rec_get_node_ptr_flag(rec));
+	n_fields = rec_offs_n_fields(offsets);
+
+	if (rec_offs_any_extern(offsets)) {
+
+		for (col = 0; col < n_fields; col++) {
+			if (!rec_offs_nth_extern(offsets, col)) {
+				continue;
+			}
+
+			btr_cur_set_ownership_of_extern_field(
+				page_zip, rec, index, offsets, col, TRUE, mtr);
+		}
+	}
+}
+
+/*******************************************************************//**
+Marks all extern fields in a dtuple as owned by the record.
+This clears BTR_EXTERN_OWNER_FLAG in the byte at offset BTR_EXTERN_LEN of
+the field reference of every externally stored field. */
+UNIV_INTERN
+void
+btr_cur_unmark_dtuple_extern_fields(
+/*================================*/
+	dtuple_t*	entry)		/*!< in/out: clustered index entry */
+{
+	ulint	col;
+
+	for (col = 0; col < dtuple_get_n_fields(entry); col++) {
+		dfield_t*	dfield = dtuple_get_nth_field(entry, col);
+		byte*		flag_byte;
+
+		if (!dfield_is_ext(dfield)) {
+			continue;
+		}
+
+		/* The field reference is the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the field data */
+		flag_byte = (byte*) dfield_get_data(dfield)
+			+ dfield_get_len(dfield)
+			- BTR_EXTERN_FIELD_REF_SIZE + BTR_EXTERN_LEN;
+
+		*flag_byte &= ~BTR_EXTERN_OWNER_FLAG;
+	}
+}
+
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector.  We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+For update fields with a nonzero orig_len the data of the tuple field is
+additionally replaced: by its trailing field reference when orig_len equals
+BTR_EXTERN_FIELD_REF_SIZE, or otherwise by a copy of orig_len bytes
+allocated from heap.
+@return	number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	const upd_t*	update,	/*!< in: update vector */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	const upd_field_t*	uf;
+	const upd_field_t*	uf_end;
+	ulint			n_flagged	= 0;
+
+	ut_ad(tuple);
+	ut_ad(update);
+
+	uf = update->fields;
+	uf_end = uf + upd_get_n_fields(update);
+
+	for (; uf != uf_end; uf++) {
+		dfield_t*	field;
+		byte*		old_data;
+		ulint		old_len;
+		byte*		copy;
+
+		if (!dfield_is_ext(&uf->new_val)) {
+			/* The new value is not externally stored */
+			continue;
+		}
+
+		field = dtuple_get_nth_field(tuple, uf->field_no);
+
+		if (!dfield_is_ext(field)) {
+			dfield_set_ext(field);
+			n_flagged++;
+		}
+
+		if (uf->orig_len == 0) {
+			/* Nothing to restore */
+			continue;
+		}
+
+		if (uf->orig_len == BTR_EXTERN_FIELD_REF_SIZE) {
+			/* Restore the original locally stored
+			part of the column.  In the undo log,
+			InnoDB writes a longer prefix of externally
+			stored columns, so that column prefixes
+			in secondary indexes can be reconstructed. */
+			old_data = (byte*) dfield_get_data(field);
+			old_len = dfield_get_len(field);
+
+			dfield_set_data(field,
+					old_data + old_len
+					- BTR_EXTERN_FIELD_REF_SIZE,
+					BTR_EXTERN_FIELD_REF_SIZE);
+			dfield_set_ext(field);
+			continue;
+		}
+
+		/* Reconstruct the original locally
+		stored part of the column.  The data
+		will have to be copied. */
+		ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+
+		old_data = dfield_get_data(field);
+		old_len = dfield_get_len(field);
+
+		copy = mem_heap_alloc(heap, uf->orig_len);
+
+		/* Copy the locally stored prefix. */
+		memcpy(copy, old_data,
+		       uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+
+		/* Copy the BLOB pointer. */
+		memcpy(copy + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+		       old_data + old_len - BTR_EXTERN_FIELD_REF_SIZE,
+		       BTR_EXTERN_FIELD_REF_SIZE);
+
+		dfield_set_data(field, copy, uf->orig_len);
+		dfield_set_ext(field);
+	}
+
+	return(n_flagged);
+}
+
+/*******************************************************************//**
+Returns the length of a BLOB part stored on the header page.
+The value is the 4-byte field at offset BTR_BLOB_HDR_PART_LEN of the
+BLOB header.
+@return	part length */
+static
+ulint
+btr_blob_get_part_len(
+/*==================*/
+	const byte*	blob_header)	/*!< in: blob header */
+{
+	const byte*	part_len_field = blob_header + BTR_BLOB_HDR_PART_LEN;
+
+	return(mach_read_from_4(part_len_field));
+}
+
+/*******************************************************************//**
+Returns the page number where the next BLOB part is stored.
+The value is the 4-byte field at offset BTR_BLOB_HDR_NEXT_PAGE_NO of the
+BLOB header.
+@return	page number or FIL_NULL if no more pages */
+static
+ulint
+btr_blob_get_next_page_no(
+/*======================*/
+	const byte*	blob_header)	/*!< in: blob header */
+{
+	const byte*	next_page_field
+		= blob_header + BTR_BLOB_HDR_NEXT_PAGE_NO;
+
+	return(mach_read_from_4(next_page_field));
+}
+
+/*******************************************************************//**
+Deallocate a buffer block that was reserved for a BLOB part.
+The mini-transaction is committed first; the block is then handed to
+buf_LRU_free_block() under LRU_list_mutex and the block mutex, but only if
+it still holds the same file page (space id and page number) as on entry. */
+static
+void
+btr_blob_free(
+/*==========*/
+	buf_block_t*	block,	/*!< in: buffer block */
+	ibool		all,	/*!< in: TRUE=remove also the compressed page
+				if there is one */
+	mtr_t*		mtr)	/*!< in: mini-transaction to commit */
+{
+	/* Remember the identity of the page before the mtr is committed,
+	so that it can be re-checked once the mutexes are held */
+	ulint	space	= buf_block_get_space(block);
+	ulint	page_no	= buf_block_get_page_no(block);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+
+	mtr_commit(mtr);
+
+	/* NOTE(review): the commented-out buf_pool_mutex_enter()/exit()
+	calls suggest that the two mutexes taken here stand in for a single
+	buffer pool mutex - confirm against the buffer pool code. */
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	mutex_enter(&block->mutex);
+
+	/* Only free the block if it is still allocated to
+	the same file page. */
+
+	if (buf_block_get_state(block)
+	    == BUF_BLOCK_FILE_PAGE
+	    && buf_block_get_space(block) == space
+	    && buf_block_get_page_no(block) == page_no) {
+
+		/* First try to free the block as requested by 'all'. If
+		that did not return BUF_LRU_FREED, 'all' was requested, a
+		compressed page exists and the block still holds the same
+		page, retry below with FALSE in place of 'all'. */
+		if (buf_LRU_free_block(&block->page, all, NULL, TRUE)
+		    != BUF_LRU_FREED
+		    && all && block->page.zip.data
+		    /* Now, buf_LRU_free_block() may release mutex temporarily */
+		    && buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
+		    && buf_block_get_space(block) == space
+		    && buf_block_get_page_no(block) == page_no) {
+			/* Attempt to deallocate the uncompressed page
+			if the whole block cannot be deallocated. */
+
+			buf_LRU_free_block(&block->page, FALSE, NULL, TRUE);
+		}
+	}
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+	mutex_exit(&block->mutex);
+}
+
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec.  The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+	dict_index_t*	index,		/*!< in: index of rec; the index tree
+					MUST be X-latched */
+	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
+					the "external storage" flags in offsets
+					will not correspond to rec when
+					this function returns */
+	big_rec_t*	big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		local_mtr __attribute__((unused))) /*!< in: mtr
+					containing the latch to rec and to the
+					tree */
+{
+	ulint	rec_page_no;
+	byte*	field_ref;
+	ulint	extern_len;
+	ulint	store_len;
+	ulint	page_no;
+	ulint	space_id;
+	ulint	zip_size;
+	ulint	prev_page_no;
+	ulint	hint_page_no;
+	ulint	i;
+	mtr_t	mtr;
+	mem_heap_t* heap = NULL;
+	page_zip_des_t*	page_zip;
+	z_stream c_stream;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains(local_mtr, rec_block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
+	ut_a(dict_index_is_clust(index));
+
+	page_zip = buf_block_get_page_zip(rec_block);
+	ut_a(dict_table_zip_size(index->table)
+	     == buf_block_get_zip_size(rec_block));
+
+	space_id = buf_block_get_space(rec_block);
+	zip_size = buf_block_get_zip_size(rec_block);
+	rec_page_no = buf_block_get_page_no(rec_block);
+	ut_a(fil_page_get_type(page_align(rec)) == FIL_PAGE_INDEX);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		int	err;
+
+		/* Zlib deflate needs 128 kilobytes for the default
+		window size, plus 512 << memLevel, plus a few
+		kilobytes for small objects.  We use reduced memLevel
+		to limit the memory consumption, and preallocate the
+		heap, hoping to avoid memory fragmentation. */
+		heap = mem_heap_create(250000);
+		page_zip_set_alloc(&c_stream, heap);
+
+		err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+				   Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
+		ut_a(err == Z_OK);
+	}
+
+	/* We have to create a file segment to the tablespace
+	for each field and put the pointer to the field in rec */
+
+	for (i = 0; i < big_rec_vec->n_fields; i++) {
+		ut_ad(rec_offs_nth_extern(offsets,
+					  big_rec_vec->fields[i].field_no));
+		{
+			ulint	local_len;
+			field_ref = rec_get_nth_field(
+				rec, offsets, big_rec_vec->fields[i].field_no,
+				&local_len);
+			ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+			local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+			field_ref += local_len;
+		}
+		extern_len = big_rec_vec->fields[i].len;
+		UNIV_MEM_ASSERT_RW(big_rec_vec->fields[i].data,
+				   extern_len);
+
+		ut_a(extern_len > 0);
+
+		prev_page_no = FIL_NULL;
+
+		if (UNIV_LIKELY_NULL(page_zip)) {
+			int	err = deflateReset(&c_stream);
+			ut_a(err == Z_OK);
+
+			c_stream.next_in = (void*) big_rec_vec->fields[i].data;
+			c_stream.avail_in = extern_len;
+		}
+
+		for (;;) {
+			buf_block_t*	block;
+			page_t*		page;
+
+			mtr_start(&mtr);
+
+			if (prev_page_no == FIL_NULL) {
+				hint_page_no = 1 + rec_page_no;
+			} else {
+				hint_page_no = prev_page_no + 1;
+			}
+
+			block = btr_page_alloc(index, hint_page_no,
+					       FSP_NO_DIR, 0, &mtr);
+			if (UNIV_UNLIKELY(block == NULL)) {
+
+				mtr_commit(&mtr);
+
+				if (UNIV_LIKELY_NULL(page_zip)) {
+					deflateEnd(&c_stream);
+					mem_heap_free(heap);
+				}
+
+				return(DB_OUT_OF_FILE_SPACE);
+			}
+
+			page_no = buf_block_get_page_no(block);
+			page = buf_block_get_frame(block);
+
+			if (prev_page_no != FIL_NULL) {
+				buf_block_t*	prev_block;
+				page_t*		prev_page;
+
+				prev_block = buf_page_get(space_id, zip_size,
+							  prev_page_no,
+							  RW_X_LATCH, &mtr);
+				buf_block_dbg_add_level(prev_block,
+							SYNC_EXTERN_STORAGE);
+				prev_page = buf_block_get_frame(prev_block);
+
+				if (UNIV_LIKELY_NULL(page_zip)) {
+					mlog_write_ulint(
+						prev_page + FIL_PAGE_NEXT,
+						page_no, MLOG_4BYTES, &mtr);
+					memcpy(buf_block_get_page_zip(
+						       prev_block)
+					       ->data + FIL_PAGE_NEXT,
+					       prev_page + FIL_PAGE_NEXT, 4);
+				} else {
+					mlog_write_ulint(
+						prev_page + FIL_PAGE_DATA
+						+ BTR_BLOB_HDR_NEXT_PAGE_NO,
+						page_no, MLOG_4BYTES, &mtr);
+				}
+
+			}
+
+			if (UNIV_LIKELY_NULL(page_zip)) {
+				int		err;
+				page_zip_des_t*	blob_page_zip;
+
+				/* Write FIL_PAGE_TYPE to the redo log
+				separately, before logging any other
+				changes to the page, so that the debug
+				assertions in
+				recv_parse_or_apply_log_rec_body() can
+				be made simpler.  Before InnoDB Plugin
+				1.0.4, the initialization of
+				FIL_PAGE_TYPE was logged as part of
+				the mlog_log_string() below. */
+
+				mlog_write_ulint(page + FIL_PAGE_TYPE,
+						 prev_page_no == FIL_NULL
+						 ? FIL_PAGE_TYPE_ZBLOB
+						 : FIL_PAGE_TYPE_ZBLOB2,
+						 MLOG_2BYTES, &mtr);
+
+				c_stream.next_out = page
+					+ FIL_PAGE_DATA;
+				c_stream.avail_out
+					= page_zip_get_size(page_zip)
+					- FIL_PAGE_DATA;
+
+				err = deflate(&c_stream, Z_FINISH);
+				ut_a(err == Z_OK || err == Z_STREAM_END);
+				ut_a(err == Z_STREAM_END
+				     || c_stream.avail_out == 0);
+
+				/* Write the "next BLOB page" pointer */
+				mlog_write_ulint(page + FIL_PAGE_NEXT,
+						 FIL_NULL, MLOG_4BYTES, &mtr);
+				/* Initialize the unused "prev page" pointer */
+				mlog_write_ulint(page + FIL_PAGE_PREV,
+						 FIL_NULL, MLOG_4BYTES, &mtr);
+				/* Write a back pointer to the record
+				into the otherwise unused area.  This
+				information could be useful in
+				debugging.  Later, we might want to
+				implement the possibility to relocate
+				BLOB pages.  Then, we would need to be
+				able to adjust the BLOB pointer in the
+				record.  We do not store the heap
+				number of the record, because it can
+				change in page_zip_reorganize() or
+				btr_page_reorganize().  However, also
+				the page number of the record may
+				change when B-tree nodes are split or
+				merged. */
+				mlog_write_ulint(page
+						 + FIL_PAGE_FILE_FLUSH_LSN,
+						 space_id,
+						 MLOG_4BYTES, &mtr);
+				mlog_write_ulint(page
+						 + FIL_PAGE_FILE_FLUSH_LSN + 4,
+						 rec_page_no,
+						 MLOG_4BYTES, &mtr);
+
+				/* Zero out the unused part of the page. */
+				memset(page + page_zip_get_size(page_zip)
+				       - c_stream.avail_out,
+				       0, c_stream.avail_out);
+				mlog_log_string(page + FIL_PAGE_FILE_FLUSH_LSN,
+						page_zip_get_size(page_zip)
+						- FIL_PAGE_FILE_FLUSH_LSN,
+						&mtr);
+				/* Copy the page to compressed storage,
+				because it will be flushed to disk
+				from there. */
+				blob_page_zip = buf_block_get_page_zip(block);
+				ut_ad(blob_page_zip);
+				ut_ad(page_zip_get_size(blob_page_zip)
+				      == page_zip_get_size(page_zip));
+				memcpy(blob_page_zip->data, page,
+				       page_zip_get_size(page_zip));
+
+				if (err == Z_OK && prev_page_no != FIL_NULL) {
+
+					goto next_zip_page;
+				}
+
+				rec_block = buf_page_get(space_id, zip_size,
+							 rec_page_no,
+							 RW_X_LATCH, &mtr);
+				buf_block_dbg_add_level(rec_block,
+							SYNC_NO_ORDER_CHECK);
+
+				if (err == Z_STREAM_END) {
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN, 0);
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_LEN + 4,
+							c_stream.total_in);
+				} else {
+					memset(field_ref + BTR_EXTERN_LEN,
+					       0, 8);
+				}
+
+				if (prev_page_no == FIL_NULL) {
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_SPACE_ID,
+							space_id);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_PAGE_NO,
+							page_no);
+
+					mach_write_to_4(field_ref
+							+ BTR_EXTERN_OFFSET,
+							FIL_PAGE_NEXT);
+				}
+
+				page_zip_write_blob_ptr(
+					page_zip, rec, index, offsets,
+					big_rec_vec->fields[i].field_no, &mtr);
+
+next_zip_page:
+				prev_page_no = page_no;
+
+				/* Commit mtr and release the
+				uncompressed page frame to save memory. */
+				btr_blob_free(block, FALSE, &mtr);
+
+				if (err == Z_STREAM_END) {
+					break;
+				}
+			} else {
+				mlog_write_ulint(page + FIL_PAGE_TYPE,
+						 FIL_PAGE_TYPE_BLOB,
+						 MLOG_2BYTES, &mtr);
+
+				if (extern_len > (UNIV_PAGE_SIZE
+						  - FIL_PAGE_DATA
+						  - BTR_BLOB_HDR_SIZE
+						  - FIL_PAGE_DATA_END)) {
+					store_len = UNIV_PAGE_SIZE
+						- FIL_PAGE_DATA
+						- BTR_BLOB_HDR_SIZE
+						- FIL_PAGE_DATA_END;
+				} else {
+					store_len = extern_len;
+				}
+
+				mlog_write_string(page + FIL_PAGE_DATA
+						  + BTR_BLOB_HDR_SIZE,
+						  (const byte*)
+						  big_rec_vec->fields[i].data
+						  + big_rec_vec->fields[i].len
+						  - extern_len,
+						  store_len, &mtr);
+				mlog_write_ulint(page + FIL_PAGE_DATA
+						 + BTR_BLOB_HDR_PART_LEN,
+						 store_len, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(page + FIL_PAGE_DATA
+						 + BTR_BLOB_HDR_NEXT_PAGE_NO,
+						 FIL_NULL, MLOG_4BYTES, &mtr);
+
+				extern_len -= store_len;
+
+				rec_block = buf_page_get(space_id, zip_size,
+							 rec_page_no,
+							 RW_X_LATCH, &mtr);
+				buf_block_dbg_add_level(rec_block,
+							SYNC_NO_ORDER_CHECK);
+
+				mlog_write_ulint(field_ref + BTR_EXTERN_LEN, 0,
+						 MLOG_4BYTES, &mtr);
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_LEN + 4,
+						 big_rec_vec->fields[i].len
+						 - extern_len,
+						 MLOG_4BYTES, &mtr);
+
+				if (prev_page_no == FIL_NULL) {
+					mlog_write_ulint(field_ref
+							 + BTR_EXTERN_SPACE_ID,
+							 space_id,
+							 MLOG_4BYTES, &mtr);
+
+					mlog_write_ulint(field_ref
+							 + BTR_EXTERN_PAGE_NO,
+							 page_no,
+							 MLOG_4BYTES, &mtr);
+
+					mlog_write_ulint(field_ref
+							 + BTR_EXTERN_OFFSET,
+							 FIL_PAGE_DATA,
+							 MLOG_4BYTES, &mtr);
+				}
+
+				prev_page_no = page_no;
+
+				mtr_commit(&mtr);
+
+				if (extern_len == 0) {
+					break;
+				}
+			}
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		deflateEnd(&c_stream);
+		mem_heap_free(heap);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Verifies that the FIL_PAGE_TYPE field of an uncompressed BLOB page is
+FIL_PAGE_TYPE_BLOB.  A mismatch is tolerated silently when the flags of
+the tablespace say DICT_TF_FORMAT_51; otherwise a diagnostic line is
+written to stderr and ut_error is raised. */
+static
+void
+btr_check_blob_fil_page_type(
+/*=========================*/
+	ulint		space_id,	/*!< in: space id */
+	ulint		page_no,	/*!< in: page number */
+	const page_t*	page,		/*!< in: page */
+	ibool		read)		/*!< in: TRUE=read, FALSE=purge */
+{
+	ulint	page_type = fil_page_get_type(page);
+	ulint	space_flags;
+
+	/* The frame must be the very page the caller asked for. */
+	ut_a(space_id == page_get_space_id(page));
+	ut_a(page_no == page_get_page_no(page));
+
+	if (UNIV_LIKELY(page_type == FIL_PAGE_TYPE_BLOB)) {
+		/* The expected case: nothing to report. */
+		return;
+	}
+
+	space_flags = fil_space_get_flags(space_id);
+
+	if (UNIV_LIKELY
+	    ((space_flags & DICT_TF_FORMAT_MASK) == DICT_TF_FORMAT_51)) {
+		/* Old versions of InnoDB did not initialize
+		FIL_PAGE_TYPE on BLOB pages, so a mismatch on a
+		BLOB page in Antelope format is not reported. */
+		return;
+	}
+
+	/* A newer file format with a wrong page type: report and stop. */
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: FIL_PAGE_TYPE=%lu"
+		" on BLOB %s space %lu page %lu flags %lx\n",
+		(ulong) page_type, read ? "read" : "purge",
+		(ulong) space_id, (ulong) page_no, (ulong) space_flags);
+	ut_error;
+}
+
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data is owned by the externally stored field,
+in a rollback we may have the additional condition that the field must
+not be inherited.
+
+The BLOB pages are freed one at a time, each in a mini-transaction of
+its own: once a page has been freed, the page number stored in field_ref
+is advanced to the next page of the chain and the low 4 bytes of the
+stored length are zeroed.  The loop ends when the page number in
+field_ref is FIL_NULL, or when the owner/inherited flags in field_ref
+say that the BLOB must not be freed. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
+					to rec, or NULL if rec == NULL */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	enum trx_rb_ctx	rb_ctx,		/*!< in: rollback context */
+	mtr_t*		local_mtr __attribute__((unused))) /*!< in: mtr
+					containing the latch to data and an
+					X-latch to the index tree; only
+					referenced by debug assertions */
+{
+	page_t*		page;
+	ulint		space_id;
+	ulint		rec_zip_size = dict_table_zip_size(index->table);
+	ulint		ext_zip_size;
+	ulint		page_no;
+	ulint		next_page_no;
+	mtr_t		mtr;
+#ifdef UNIV_DEBUG
+	ut_ad(mtr_memo_contains(local_mtr, dict_index_get_lock(index),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains_page(local_mtr, field_ref,
+				     MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!rec || rec_offs_validate(rec, index, offsets));
+
+	if (rec) {
+		/* field_ref must be the last BTR_EXTERN_FIELD_REF_SIZE
+		bytes of column i of rec. */
+		ulint	local_len;
+		const byte*	f = rec_get_nth_field(rec, offsets,
+						      i, &local_len);
+		ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+		local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+		f += local_len;
+		ut_ad(f == field_ref);
+	}
+#endif /* UNIV_DEBUG */
+
+	if (UNIV_UNLIKELY(!memcmp(field_ref, field_ref_zero,
+				  BTR_EXTERN_FIELD_REF_SIZE))) {
+		/* In the rollback of uncommitted transactions, we may
+		encounter a clustered index record whose BLOBs have
+		not been written.  There is nothing to free then. */
+		ut_a(rb_ctx == RB_RECOVERY || rb_ctx == RB_RECOVERY_PURGE_REC);
+		return;
+	}
+
+	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+
+	/* Determine the compressed page size of the BLOB pages, which
+	is looked up separately when they are in another tablespace
+	than the index. */
+	if (UNIV_UNLIKELY(space_id != dict_index_get_space(index))) {
+		ext_zip_size = fil_space_get_zip_size(space_id);
+		/* This must be an undo log record in the system tablespace,
+		that is, in row_purge_upd_exist_or_extern().
+		Currently, externally stored records are stored in the
+		same tablespace as the referring records. */
+		ut_ad(!page_get_space_id(page_align(field_ref)));
+		ut_ad(!rec);
+		ut_ad(!page_zip);
+	} else {
+		ext_zip_size = rec_zip_size;
+	}
+
+	if (!rec) {
+		/* This is a call from row_purge_upd_exist_or_extern(). */
+		ut_ad(!page_zip);
+		rec_zip_size = 0;
+	}
+
+	/* Each iteration frees one BLOB page in its own
+	mini-transaction. */
+	for (;;) {
+		buf_block_t*	rec_block;
+		buf_block_t*	ext_block;
+
+		mtr_start(&mtr);
+
+		/* X-latch the page that contains field_ref within the
+		local mini-transaction, because field_ref is modified
+		below. */
+		rec_block = buf_page_get(page_get_space_id(
+						 page_align(field_ref)),
+					 rec_zip_size,
+					 page_get_page_no(
+						 page_align(field_ref)),
+					 RW_X_LATCH, &mtr);
+		buf_block_dbg_add_level(rec_block, SYNC_NO_ORDER_CHECK);
+		page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+
+		if (/* There is no external storage data */
+		    page_no == FIL_NULL
+		    /* This field does not own the externally stored field */
+		    || (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+			& BTR_EXTERN_OWNER_FLAG)
+		    /* Rollback and inherited field */
+		    || ((rb_ctx == RB_NORMAL || rb_ctx == RB_RECOVERY)
+			&& (mach_read_from_1(field_ref + BTR_EXTERN_LEN)
+			    & BTR_EXTERN_INHERITED_FLAG))) {
+
+			/* Do not free */
+			mtr_commit(&mtr);
+
+			return;
+		}
+
+		ext_block = buf_page_get(space_id, ext_zip_size, page_no,
+					 RW_X_LATCH, &mtr);
+		buf_block_dbg_add_level(ext_block, SYNC_EXTERN_STORAGE);
+		page = buf_block_get_frame(ext_block);
+
+		if (ext_zip_size) {
+			/* Note that page_zip will be NULL
+			in row_purge_upd_exist_or_extern(). */
+			switch (fil_page_get_type(page)) {
+			case FIL_PAGE_TYPE_ZBLOB:
+			case FIL_PAGE_TYPE_ZBLOB2:
+				break;
+			default:
+				ut_error;
+			}
+			/* On compressed BLOB pages the chain pointer is
+			read from FIL_PAGE_NEXT. */
+			next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+
+			btr_page_free_low(index, ext_block, 0, &mtr);
+
+			if (UNIV_LIKELY(page_zip != NULL)) {
+				/* Update field_ref in place and let
+				page_zip_write_blob_ptr() take care of
+				the compressed page. */
+				mach_write_to_4(field_ref + BTR_EXTERN_PAGE_NO,
+						next_page_no);
+				mach_write_to_4(field_ref + BTR_EXTERN_LEN + 4,
+						0);
+				page_zip_write_blob_ptr(page_zip, rec, index,
+							offsets, i, &mtr);
+			} else {
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_PAGE_NO,
+						 next_page_no,
+						 MLOG_4BYTES, &mtr);
+				mlog_write_ulint(field_ref
+						 + BTR_EXTERN_LEN + 4, 0,
+						 MLOG_4BYTES, &mtr);
+			}
+		} else {
+			ut_a(!page_zip);
+			btr_check_blob_fil_page_type(space_id, page_no, page,
+						     FALSE);
+
+			/* On uncompressed BLOB pages the chain pointer
+			is read from the BLOB header at FIL_PAGE_DATA. */
+			next_page_no = mach_read_from_4(
+				page + FIL_PAGE_DATA
+				+ BTR_BLOB_HDR_NEXT_PAGE_NO);
+
+			/* We must supply the page level (= 0) as an argument
+			because we did not store it on the page (we save the
+			space overhead from an index page header). */
+
+			btr_page_free_low(index, ext_block, 0, &mtr);
+
+			mlog_write_ulint(field_ref + BTR_EXTERN_PAGE_NO,
+					 next_page_no,
+					 MLOG_4BYTES, &mtr);
+			/* Zero out the BLOB length.  If the server
+			crashes during the execution of this function,
+			trx_rollback_or_clean_all_recovered() could
+			dereference the half-deleted BLOB, fetching a
+			wrong prefix for the BLOB. */
+			mlog_write_ulint(field_ref + BTR_EXTERN_LEN + 4,
+					 0,
+					 MLOG_4BYTES, &mtr);
+		}
+
+		/* Commit mtr and release the BLOB block to save memory. */
+		btr_blob_free(ext_block, TRUE, &mtr);
+	}
+}
+
+/***********************************************************//**
+Releases the external storage of every externally stored column of a
+record, by passing the field reference of each such column to
+btr_free_externally_stored_field(). */
+static
+void
+btr_rec_free_externally_stored_fields(
+/*==================================*/
+	dict_index_t*	index,	/*!< in: index of the data, the index
+				tree MUST be X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the index
+				tree */
+{
+	ulint	field_no;
+	ulint	field_count;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(dict_table_is_comp(index->table) == !!rec_offs_comp(offsets));
+
+	field_count = rec_offs_n_fields(offsets);
+
+	for (field_no = 0; field_no < field_count; field_no++) {
+		ulint	len;
+		byte*	field;
+
+		if (!rec_offs_nth_extern(offsets, field_no)) {
+			/* Not an externally stored column */
+			continue;
+		}
+
+		field = rec_get_nth_field(rec, offsets, field_no, &len);
+		ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		/* The field reference is the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the column. */
+		btr_free_externally_stored_field(
+			index, field + len - BTR_EXTERN_FIELD_REF_SIZE,
+			rec, offsets, page_zip, field_no, rb_ctx, mtr);
+	}
+}
+
+/***********************************************************//**
+Releases the external storage of those externally stored columns of a
+record that are named in an update vector.  Columns of the update
+vector that are not stored externally in the record are skipped. */
+static
+void
+btr_rec_free_updated_extern_fields(
+/*===============================*/
+	dict_index_t*	index,	/*!< in: index of rec; the index tree MUST be
+				X-latched */
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update,	/*!< in: update vector */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle which contains
+				an X-latch to record page and to the tree */
+{
+	ulint	upd_no;
+	ulint	upd_count;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
+
+	upd_count = upd_get_n_fields(update);
+
+	for (upd_no = 0; upd_no < upd_count; upd_no++) {
+		const upd_field_t*	ufield
+			= upd_get_nth_field(update, upd_no);
+		ulint			len;
+		byte*			field;
+
+		if (!rec_offs_nth_extern(offsets, ufield->field_no)) {
+			/* The updated column is stored locally */
+			continue;
+		}
+
+		field = rec_get_nth_field(rec, offsets,
+					  ufield->field_no, &len);
+		ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		/* The field reference is the last
+		BTR_EXTERN_FIELD_REF_SIZE bytes of the column. */
+		btr_free_externally_stored_field(
+			index, field + len - BTR_EXTERN_FIELD_REF_SIZE,
+			rec, offsets, page_zip,
+			ufield->field_no, rb_ctx, mtr);
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of an uncompressed BLOB.  The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+The chain of BLOB pages is followed, each page being S-latched in a
+mini-transaction of its own, until the chain ends or a page could only
+be copied in part because buf is full.
+@return	number of bytes written to buf */
+static
+ulint
+btr_copy_blob_prefix(
+/*=================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		space_id,/*!< in: space id of the BLOB pages */
+	ulint		page_no,/*!< in: page number of the first BLOB page */
+	ulint		offset)	/*!< in: offset on the first BLOB page */
+{
+	ulint	copied_len	= 0;
+	ibool	more;
+
+	do {
+		mtr_t		mtr;
+		buf_block_t*	block;
+		const page_t*	page;
+		const byte*	blob_header;
+		ulint		part_len;
+		ulint		copy_len;
+
+		mtr_start(&mtr);
+
+		block = buf_page_get(space_id, 0, page_no, RW_S_LATCH, &mtr);
+		buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+		page = buf_block_get_frame(block);
+
+		btr_check_blob_fil_page_type(space_id, page_no, page, TRUE);
+
+		blob_header = page + offset;
+		part_len = btr_blob_get_part_len(blob_header);
+
+		/* Never copy more than what still fits in buf. */
+		copy_len = ut_min(part_len, len - copied_len);
+
+		memcpy(buf + copied_len,
+		       blob_header + BTR_BLOB_HDR_SIZE, copy_len);
+		copied_len += copy_len;
+
+		/* Read the chain pointer while the page is latched. */
+		page_no = btr_blob_get_next_page_no(blob_header);
+
+		mtr_commit(&mtr);
+
+		/* Go on only if there is a next page and this page
+		was copied in full. */
+		more = (page_no != FIL_NULL && copy_len == part_len);
+
+		if (more) {
+			/* On other BLOB pages except the first the
+			BLOB header always is at the page data start: */
+
+			offset = FIL_PAGE_DATA;
+
+			ut_ad(copied_len <= len);
+		}
+	} while (more);
+
+	UNIV_MEM_ASSERT_RW(buf, copied_len);
+	return(copied_len);
+}
+
+/*******************************************************************//**
+Copies the prefix of a compressed BLOB.  The clustered index record
+that points to this BLOB must be protected by a lock or a page latch.
+
+The compressed pages of the BLOB are fed one by one to inflate() until
+the output buffer of d_stream is full, the end of the stream or of the
+page chain is reached, or an error occurs.  Nothing is returned; the
+amount of data produced is left in d_stream->total_out.  Problems are
+reported on stderr and end the copy early. */
+static
+void
+btr_copy_zblob_prefix(
+/*==================*/
+	z_stream*	d_stream,/*!< in/out: the decompressing stream */
+	ulint		zip_size,/*!< in: compressed BLOB page size */
+	ulint		space_id,/*!< in: space id of the BLOB pages */
+	ulint		page_no,/*!< in: page number of the first BLOB page */
+	ulint		offset)	/*!< in: offset on the first BLOB page */
+{
+	/* The first page of the chain is expected to have type
+	FIL_PAGE_TYPE_ZBLOB, all later ones FIL_PAGE_TYPE_ZBLOB2. */
+	ulint	page_type = FIL_PAGE_TYPE_ZBLOB;
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(space_id);
+
+	for (;;) {
+		buf_page_t*	bpage;
+		int		err;
+		ulint		next_page_no;
+
+		/* There is no latch on bpage directly.  Instead,
+		bpage is protected by the B-tree page latch that
+		is being held on the clustered index record, or,
+		in row_merge_copy_blobs(), by an exclusive table lock. */
+		bpage = buf_page_get_zip(space_id, zip_size, page_no);
+
+		if (UNIV_UNLIKELY(!bpage)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Cannot load"
+				" compressed BLOB"
+				" page %lu space %lu\n",
+				(ulong) page_no, (ulong) space_id);
+			return;
+		}
+
+		if (UNIV_UNLIKELY
+		    (fil_page_get_type(bpage->zip.data) != page_type)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Unexpected type %lu of"
+				" compressed BLOB"
+				" page %lu space %lu\n",
+				(ulong) fil_page_get_type(bpage->zip.data),
+				(ulong) page_no, (ulong) space_id);
+			goto end_of_blob;
+		}
+
+		next_page_no = mach_read_from_4(bpage->zip.data + offset);
+
+		if (UNIV_LIKELY(offset == FIL_PAGE_NEXT)) {
+			/* When the BLOB begins at page header,
+			the compressed data payload does not
+			immediately follow the next page pointer. */
+			offset = FIL_PAGE_DATA;
+		} else {
+			/* Skip the 4-byte next page pointer. */
+			offset += 4;
+		}
+
+		d_stream->next_in = bpage->zip.data + offset;
+		d_stream->avail_in = zip_size - offset;
+
+		err = inflate(d_stream, Z_NO_FLUSH);
+		switch (err) {
+		case Z_OK:
+			if (!d_stream->avail_out) {
+				/* The output buffer is full. */
+				goto end_of_blob;
+			}
+			break;
+		case Z_STREAM_END:
+			if (next_page_no == FIL_NULL) {
+				goto end_of_blob;
+			}
+			/* The stream ended although the page chain
+			continues: report it as an error. */
+			/* fall through */
+		default:
+inflate_error:
+			/* zlib leaves msg NULL when it has no text for
+			the condition (for example on Z_STREAM_END).
+			Passing NULL for %s is undefined behaviour, so
+			substitute a literal. */
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: inflate() of"
+				" compressed BLOB"
+				" page %lu space %lu returned %d (%s)\n",
+				(ulong) page_no, (ulong) space_id,
+				err,
+				d_stream->msg ? d_stream->msg : "(null)");
+			/* fall through */
+		case Z_BUF_ERROR:
+			goto end_of_blob;
+		}
+
+		if (next_page_no == FIL_NULL) {
+			if (!d_stream->avail_in) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: unexpected end of"
+					" compressed BLOB"
+					" page %lu space %lu\n",
+					(ulong) page_no,
+					(ulong) space_id);
+			} else {
+				/* Last page of the chain: let zlib
+				finish with the remaining input. */
+				err = inflate(d_stream, Z_FINISH);
+				switch (err) {
+				case Z_STREAM_END:
+				case Z_BUF_ERROR:
+					break;
+				default:
+					goto inflate_error;
+				}
+			}
+
+end_of_blob:
+			buf_page_release_zip(bpage);
+			return;
+		}
+
+		buf_page_release_zip(bpage);
+
+		/* On other BLOB pages except the first
+		the BLOB header always is at the page header: */
+
+		page_no = next_page_no;
+		offset = FIL_PAGE_NEXT;
+		page_type = FIL_PAGE_TYPE_ZBLOB2;
+	}
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record.  The
+clustered index record that points to this BLOB must be protected by a
+lock or a page latch.  Uncompressed BLOBs are handed to
+btr_copy_blob_prefix(); compressed ones are inflated page by page by
+btr_copy_zblob_prefix() using a temporary memory heap.
+@return	number of bytes written to buf */
+static
+ulint
+btr_copy_externally_stored_field_prefix_low(
+/*========================================*/
+	byte*		buf,	/*!< out: the externally stored part of
+				the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		space_id,/*!< in: space id of the first BLOB page */
+	ulint		page_no,/*!< in: page number of the first BLOB page */
+	ulint		offset)	/*!< in: offset on the first BLOB page */
+{
+	z_stream	d_stream;
+	mem_heap_t*	heap;
+	int		err;
+
+	if (UNIV_UNLIKELY(len == 0)) {
+		/* Nothing was asked for */
+		return(0);
+	}
+
+	if (UNIV_LIKELY(!zip_size)) {
+		/* Uncompressed BLOB: a plain copy is enough */
+		return(btr_copy_blob_prefix(buf, len, space_id,
+					    page_no, offset));
+	}
+
+	/* Zlib inflate needs 32 kilobytes for the default
+	window size, plus a few kilobytes for small objects. */
+	heap = mem_heap_create(40000);
+	page_zip_set_alloc(&d_stream, heap);
+
+	err = inflateInit(&d_stream);
+	ut_a(err == Z_OK);
+
+	/* Decompress straight into the buffer of the caller. */
+	d_stream.avail_in = 0;
+	d_stream.avail_out = len;
+	d_stream.next_out = buf;
+
+	btr_copy_zblob_prefix(&d_stream, zip_size,
+			      space_id, page_no, offset);
+
+	inflateEnd(&d_stream);
+	mem_heap_free(heap);
+
+	UNIV_MEM_ASSERT_RW(buf, d_stream.total_out);
+	return(d_stream.total_out);
+}
+
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record.  The
+clustered index record must be protected by a lock or a page latch.
+The locally stored part of the column is copied first; if buf is not
+yet full, the rest is fetched from the BLOB pages named by the field
+reference that ends the local part.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+	byte*		buf,	/*!< out: the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		local_len)/*!< in: length of data, in bytes */
+{
+	ulint	extern_copied;
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* From here on local_len excludes the field reference. */
+	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+	if (UNIV_UNLIKELY(local_len >= len)) {
+		/* The local part alone fills the buffer */
+		memcpy(buf, data, len);
+		return(len);
+	}
+
+	memcpy(buf, data, local_len);
+
+	/* Let data point to the field reference. */
+	data += local_len;
+
+	ut_a(memcmp(data, field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE));
+
+	if (mach_read_from_4(data + BTR_EXTERN_LEN + 4) == 0) {
+		/* The externally stored part of the column has been
+		(partially) deleted.  Signal the half-deleted BLOB
+		to the caller. */
+
+		return(0);
+	}
+
+	extern_copied = btr_copy_externally_stored_field_prefix_low(
+		buf + local_len, len - local_len, zip_size,
+		mach_read_from_4(data + BTR_EXTERN_SPACE_ID),
+		mach_read_from_4(data + BTR_EXTERN_PAGE_NO),
+		mach_read_from_4(data + BTR_EXTERN_OFFSET));
+
+	return(local_len + extern_copied);
+}
+
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.  The
+clustered index record must be protected by a lock or a page latch.
+The buffer is sized from the local length plus the external length
+recorded in the field reference.
+@return	the whole field copied to heap */
+static
+byte*
+btr_copy_externally_stored_field(
+/*=============================*/
+	ulint*		len,	/*!< out: length of the whole field */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		local_len,/*!< in: length of data */
+	mem_heap_t*	heap)	/*!< in: mem heap */
+{
+	const byte*	field_ref;
+	ulint		space_id;
+	ulint		page_no;
+	ulint		offset;
+	ulint		extern_len;
+	ulint		extern_copied;
+	byte*		buf;
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* The field reference ends the locally stored data. */
+	local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+	field_ref = data + local_len;
+
+	space_id = mach_read_from_4(field_ref + BTR_EXTERN_SPACE_ID);
+	page_no = mach_read_from_4(field_ref + BTR_EXTERN_PAGE_NO);
+	offset = mach_read_from_4(field_ref + BTR_EXTERN_OFFSET);
+
+	/* Currently a BLOB cannot be bigger than 4 GB; we
+	leave the 4 upper bytes in the length field unused */
+
+	extern_len = mach_read_from_4(field_ref + BTR_EXTERN_LEN + 4);
+
+	buf = mem_heap_alloc(heap, local_len + extern_len);
+
+	/* Local part first, then the externally stored part after it. */
+	memcpy(buf, data, local_len);
+
+	extern_copied = btr_copy_externally_stored_field_prefix_low(
+		buf + local_len, extern_len, zip_size,
+		space_id, page_no, offset);
+
+	*len = local_len + extern_copied;
+
+	return(buf);
+}
+
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.  A column
+whose field reference is still all zero has not been written yet and
+is reported as incomplete.
+@return	the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index;
+				must be protected by a lock or a page latch */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		no,	/*!< in: field number */
+	ulint*		len,	/*!< out: length of the field */
+	mem_heap_t*	heap)	/*!< in: mem heap */
+{
+	const byte*	data;
+	ulint		local_len;
+
+	ut_a(rec_offs_nth_extern(offsets, no));
+
+	/* An externally stored field can contain some initial
+	data from the field, and in the last 20 bytes it has the
+	space id, page number, and offset where the rest of the
+	field data is stored, and the data length in addition to
+	the data stored locally. We may need to store some data
+	locally to get the local record length above the 128 byte
+	limit so that field offsets are stored in two bytes, and
+	the extern bit is available in those two bytes. */
+
+	data = rec_get_nth_field(rec, offsets, no, &local_len);
+
+	ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	if (UNIV_LIKELY
+	    (memcmp(data + local_len - BTR_EXTERN_FIELD_REF_SIZE,
+		    field_ref_zero, BTR_EXTERN_FIELD_REF_SIZE) != 0)) {
+		/* The field reference has been written: fetch
+		the whole column. */
+		return(btr_copy_externally_stored_field(len, data,
+							zip_size, local_len,
+							heap));
+	}
+
+	/* The externally stored field was not written yet.
+	This record should only be seen by
+	recv_recovery_rollback_active() or any
+	TRX_ISO_READ_UNCOMMITTED transactions. */
+	return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/btr/btr0pcur.c b/storage/xtradb/btr/btr0pcur.c
new file mode 100644
index 00000000000..537c26f6bf2
--- /dev/null
+++ b/storage/xtradb/btr/btr0pcur.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file btr/btr0pcur.c
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#include "btr0pcur.h"
+
+#ifdef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#include "ut0byte.h"
+#include "rem0cmp.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+/**************************************************************//**
+Allocates a persistent cursor object with mem_alloc(), clears its index
+pointer and initializes it with btr_pcur_init().
+@return	own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void)
+/*============================*/
+{
+	btr_pcur_t*	cursor = mem_alloc(sizeof(btr_pcur_t));
+
+	/* The cursor is not attached to any index yet. */
+	cursor->btr_cur.index = NULL;
+
+	btr_pcur_init(cursor);
+
+	return(cursor);
+}
+
+/**************************************************************//**
+Frees the memory for a persistent cursor object, including the buffer
+that holds the stored record prefix, if one was allocated. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+	btr_pcur_t*	cursor)	/*!< in, own: persistent cursor */
+{
+	/* Release the copy of the old record, if there is one. */
+	if (cursor->old_rec_buf) {
+
+		mem_free(cursor->old_rec_buf);
+
+		cursor->old_rec_buf = NULL;
+	}
+
+	/* Reset the cursor to an unpositioned, unlatched state
+	before handing the memory back. */
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+	cursor->latch_mode = BTR_NO_LATCHES;
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+	cursor->old_n_fields = 0;
+	cursor->old_rec = NULL;
+	cursor->btr_cur.page_cur.rec = NULL;
+
+	mem_free(cursor);
+}
+
+/**************************************************************//**
+Stores the position of the cursor.  This is done by taking an initial
+segment of the record the cursor is positioned on, before, or after, and
+copying it to the cursor data structure, or just by setting a flag if
+the cursor is before the first in an EMPTY tree, or after the last in an
+EMPTY tree. NOTE that the page where the cursor is positioned must not
+be empty if the index tree is not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	dict_index_t*	index;
+	rec_t*		rec;
+	page_t*		page;
+	ulint		offs;
+
+	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	block = btr_pcur_get_block(cursor);
+
+	if (srv_pass_corrupt_table && !block) {
+		/* No block and srv_pass_corrupt_table is set:
+		store nothing. */
+		return;
+	}
+	ut_a(block);
+
+	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+
+	rec = page_cur_get_rec(btr_pcur_get_page_cur(cursor));
+	page = page_align(rec);
+	offs = page_offset(rec);
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_a(cursor->latch_mode != BTR_NO_LATCHES);
+
+	if (UNIV_UNLIKELY(page_get_n_recs(page) == 0)) {
+		/* It must be an empty index tree; NOTE that in this case
+		we do not store the modify_clock, but always do a search
+		if we restore the cursor position */
+
+		ut_a(btr_page_get_next(page, mtr) == FIL_NULL);
+		ut_a(btr_page_get_prev(page, mtr) == FIL_NULL);
+
+		cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+		cursor->rel_pos = page_rec_is_supremum_low(offs)
+			? BTR_PCUR_AFTER_LAST_IN_TREE
+			: BTR_PCUR_BEFORE_FIRST_IN_TREE;
+
+		return;
+	}
+
+	/* Always store a user record: on the supremum step back, on
+	the infimum step forward, and remember the relative position. */
+	if (page_rec_is_supremum_low(offs)) {
+
+		rec = page_rec_get_prev(rec);
+		cursor->rel_pos = BTR_PCUR_AFTER;
+
+	} else if (page_rec_is_infimum_low(offs)) {
+
+		rec = page_rec_get_next(rec);
+		cursor->rel_pos = BTR_PCUR_BEFORE;
+
+	} else {
+		cursor->rel_pos = BTR_PCUR_ON;
+	}
+
+	cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+	/* Copy the ordering prefix of the record into the cursor. */
+	cursor->old_rec = dict_index_copy_rec_order_prefix(
+		index, rec, &cursor->old_n_fields,
+		&cursor->old_rec_buf, &cursor->buf_size);
+
+	/* Remember the block and its modify clock at the time of
+	storing. */
+	cursor->block_when_stored = block;
+	cursor->modify_clock = buf_block_get_modify_clock(block);
+}
+
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur.  The receiving
+cursor becomes a bytewise copy of the donor, except that it gets a
+private copy of the stored record buffer when the donor has one. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate)	/*!< in: pcur from which the info is
+					copied */
+{
+	/* Drop whatever record copy the receiver still holds. */
+	if (pcur_receive->old_rec_buf != NULL) {
+		mem_free(pcur_receive->old_rec_buf);
+	}
+
+	ut_memcpy(pcur_receive, pcur_donate, sizeof(btr_pcur_t));
+
+	if (pcur_donate->old_rec_buf != NULL) {
+		/* The structure copy made both cursors share one
+		buffer; give the receiver a buffer of its own and
+		let old_rec point to the same offset within it. */
+
+		pcur_receive->old_rec_buf = mem_alloc(pcur_donate->buf_size);
+
+		ut_memcpy(pcur_receive->old_rec_buf,
+			  pcur_donate->old_rec_buf,
+			  pcur_donate->buf_size);
+
+		pcur_receive->old_rec = pcur_receive->old_rec_buf
+			+ (pcur_donate->old_rec - pcur_donate->old_rec_buf);
+	}
+
+	pcur_receive->old_n_fields = pcur_donate->old_n_fields;
+}
+
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,		/*!< in: detached persistent cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dict_index_t*	index;
+	dtuple_t*	tuple;
+	ulint		mode;
+	ulint		old_mode;
+	mem_heap_t*	heap;
+
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	index = btr_cur_get_index(btr_pcur_get_btr_cur(cursor));
+	/* A cursor without a valid stored position cannot be restored */
+	if (UNIV_UNLIKELY(cursor->old_stored != BTR_PCUR_OLD_STORED)
+	    || UNIV_UNLIKELY(cursor->pos_state != BTR_PCUR_WAS_POSITIONED
+			     && cursor->pos_state != BTR_PCUR_IS_POSITIONED)) {
+		ut_print_buf(stderr, cursor, sizeof(btr_pcur_t));
+		putc('\n', stderr);
+		if (cursor->trx_if_known) {
+			trx_print(stderr, cursor->trx_if_known, 0);
+		}
+
+		ut_error;
+	}
+
+	if (UNIV_UNLIKELY
+	    (cursor->rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE
+	     || cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE)) {
+
+		/* In these cases we do not try an optimistic restoration,
+		but always do a search */
+
+		btr_cur_open_at_index_side(
+			cursor->rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE,
+			index, latch_mode, btr_pcur_get_btr_cur(cursor), mtr);
+
+		cursor->block_when_stored = btr_pcur_get_block(cursor);
+
+		return(FALSE);
+	}
+	/* Every remaining rel_pos value relies on the stored record copy */
+	ut_a(cursor->old_rec);
+	ut_a(cursor->old_n_fields);
+
+	if (UNIV_LIKELY(latch_mode == BTR_SEARCH_LEAF)
+	    || UNIV_LIKELY(latch_mode == BTR_MODIFY_LEAF)) {
+		/* Try optimistic restoration */
+
+		if (UNIV_LIKELY(buf_page_optimistic_get(
+					latch_mode,
+					cursor->block_when_stored,
+					cursor->modify_clock,
+					file, line, mtr))) {
+			cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+			buf_block_dbg_add_level(btr_pcur_get_block(cursor),
+						SYNC_TREE_NODE);
+
+			if (cursor->rel_pos == BTR_PCUR_ON) {
+#ifdef UNIV_DEBUG
+				const rec_t*	rec;
+				const ulint*	offsets1;
+				const ulint*	offsets2;
+#endif /* UNIV_DEBUG */
+				cursor->latch_mode = latch_mode;
+#ifdef UNIV_DEBUG
+				rec = btr_pcur_get_rec(cursor);
+
+				heap = mem_heap_create(256);
+				offsets1 = rec_get_offsets(
+					cursor->old_rec, index, NULL,
+					cursor->old_n_fields, &heap);
+				offsets2 = rec_get_offsets(
+					rec, index, NULL,
+					cursor->old_n_fields, &heap);
+
+				ut_ad(!cmp_rec_rec(cursor->old_rec,
+						   rec, offsets1, offsets2,
+						   index));
+				mem_heap_free(heap);
+#endif /* UNIV_DEBUG */
+				return(TRUE);
+			}
+			/* Not BTR_PCUR_ON: there is no user record to match */
+			return(FALSE);
+		}
+	}
+
+	/* If optimistic restoration did not succeed, open the cursor anew */
+
+	heap = mem_heap_create(256);
+
+	tuple = dict_index_build_data_tuple(index, cursor->old_rec,
+					    cursor->old_n_fields, heap);
+
+	/* Save the old search mode of the cursor */
+	old_mode = cursor->search_mode;
+
+	if (UNIV_LIKELY(cursor->rel_pos == BTR_PCUR_ON)) {
+		mode = PAGE_CUR_LE;
+	} else if (cursor->rel_pos == BTR_PCUR_AFTER) {
+		mode = PAGE_CUR_G;
+	} else {
+		ut_ad(cursor->rel_pos == BTR_PCUR_BEFORE);
+		mode = PAGE_CUR_L;
+	}
+	/* Search anew, using the mode chosen from the stored rel_pos */
+	btr_pcur_open_with_no_init_func(index, tuple, mode, latch_mode,
+					cursor, 0, file, line, mtr);
+
+	/* Restore the old search mode */
+	cursor->search_mode = old_mode;
+
+	if (cursor->rel_pos == BTR_PCUR_ON
+	    && btr_pcur_is_on_user_rec(cursor)
+	    && 0 == cmp_dtuple_rec(tuple, btr_pcur_get_rec(cursor),
+				   rec_get_offsets(
+					   btr_pcur_get_rec(cursor), index,
+					   NULL, ULINT_UNDEFINED, &heap))) {
+
+		/* We have to store the NEW value for the modify clock, since
+		the cursor can now be on a different page! But we can retain
+		the value of old_rec */
+
+		cursor->block_when_stored = btr_pcur_get_block(cursor);
+		cursor->modify_clock = buf_block_get_modify_clock(
+			cursor->block_when_stored);
+		cursor->old_stored = BTR_PCUR_OLD_STORED;
+
+		mem_heap_free(heap);
+
+		return(TRUE);
+	}
+
+	mem_heap_free(heap);
+
+	/* We have to store new position information, modify_clock etc.,
+	to the cursor because it can now be on a different page, the record
+	under it may have been removed, etc. */
+
+	btr_pcur_store_position(cursor, mtr);
+
+	return(FALSE);
+}
+
+/**************************************************************//**
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+gives up the page latch and bufferfix that the cursor holds.
+NOTE! With BTR_MODIFY_LEAF the current mini-transaction must not have
+changed any data protected by the cursor latch, because in that case
+the latch may be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	leaf;
+
+	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	/* Release the leaf page in the mode it was latched in */
+	leaf = btr_pcur_get_block(cursor);
+	btr_leaf_page_release(leaf, cursor->latch_mode, mtr);
+
+	/* The cursor was positioned, but it holds no latch any more */
+	cursor->pos_state = BTR_PCUR_WAS_POSITIONED;
+	cursor->latch_mode = BTR_NO_LATCHES;
+}
+
+/*********************************************************//**
+Moves the persistent cursor to the next page, positioning it before the first
+record of that page. Releases the latch on the current page, and bufferunfixes
+it. Note that there must not be modifications on the current page, as then the
+x-latch can be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; must be on the
+				last record of the current page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		next_page_no;
+	ulint		space;
+	ulint		zip_size;
+	page_t*		page;
+	buf_block_t*	next_block;
+	page_t*		next_page;
+
+	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	ut_ad(btr_pcur_is_after_last_on_page(cursor));
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	page = btr_pcur_get_page(cursor);
+	next_page_no = btr_page_get_next(page, mtr);
+	space = buf_block_get_space(btr_pcur_get_block(cursor));
+	zip_size = buf_block_get_zip_size(btr_pcur_get_block(cursor));
+
+	ut_ad(next_page_no != FIL_NULL);
+
+	next_block = btr_block_get(space, zip_size, next_page_no,
+				   cursor->latch_mode, mtr);
+	next_page = buf_block_get_frame(next_block);
+	/* No frame for the next page: leave the page cursor unpositioned */
+	if (srv_pass_corrupt_table && !next_page) {
+		btr_leaf_page_release(btr_pcur_get_block(cursor),
+				      cursor->latch_mode, mtr);
+		btr_pcur_get_page_cur(cursor)->block = 0;
+		btr_pcur_get_page_cur(cursor)->rec = 0;
+		return;
+	}
+	ut_a(next_page);
+#ifdef UNIV_BTR_DEBUG
+	ut_a(page_is_comp(next_page) == page_is_comp(page));
+	ut_a(btr_page_get_prev(next_page, mtr)
+	     == buf_block_get_page_no(btr_pcur_get_block(cursor)));
+#endif /* UNIV_BTR_DEBUG */
+	next_block->check_index_page_at_flush = TRUE;
+	/* The next page is latched: the current one can be let go now */
+	btr_leaf_page_release(btr_pcur_get_block(cursor),
+			      cursor->latch_mode, mtr);
+
+	page_cur_set_before_first(next_block, btr_pcur_get_page_cur(cursor));
+
+	page_check_dir(next_page);
+}
+
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record of the page.
+Commits mtr. Note that to prevent a possible deadlock, the operation
+first stores the position of the cursor, commits mtr, acquires the necessary
+latches and restores the cursor position again before returning. The
+alphabetical position of the cursor is guaranteed to be sensible on
+return, but it may happen that the cursor is not positioned on the last
+record of any page, because the structure of the tree may have changed
+during the time when the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor, must be on the first
+				record of the current page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		prev_page_no;
+	ulint		space;
+	page_t*		page;
+	buf_block_t*	prev_block;
+	ulint		latch_mode;
+	ulint		latch_mode2;
+
+	ut_a(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	ut_ad(btr_pcur_is_before_first_on_page(cursor));
+	ut_ad(!btr_pcur_is_before_first_in_tree(cursor, mtr));
+
+	latch_mode = cursor->latch_mode;
+	/* Pick the mode that also latches the previous page (see below) */
+	if (latch_mode == BTR_SEARCH_LEAF) {
+
+		latch_mode2 = BTR_SEARCH_PREV;
+
+	} else if (latch_mode == BTR_MODIFY_LEAF) {
+
+		latch_mode2 = BTR_MODIFY_PREV;
+	} else {
+		latch_mode2 = 0; /* To eliminate compiler warning */
+		ut_error;
+	}
+	/* Save the position, then commit mtr before latching anew */
+	btr_pcur_store_position(cursor, mtr);
+
+	mtr_commit(mtr);
+
+	mtr_start(mtr);
+
+	btr_pcur_restore_position(latch_mode2, cursor, mtr);
+
+	page = btr_pcur_get_page(cursor);
+
+	prev_page_no = btr_page_get_prev(page, mtr);
+	space = buf_block_get_space(btr_pcur_get_block(cursor));
+	/* NOTE(review): 'space' is assigned above but never read below */
+	if (prev_page_no == FIL_NULL) {	/* no previous page: nothing to do */
+	} else if (btr_pcur_is_before_first_on_page(cursor)) {
+
+		prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+		btr_leaf_page_release(btr_pcur_get_block(cursor),
+				      latch_mode, mtr);
+
+		page_cur_set_after_last(prev_block,
+					btr_pcur_get_page_cur(cursor));
+	} else {
+
+		/* The repositioned cursor did not end on an infimum record on
+		a page. Cursor repositioning acquired a latch also on the
+		previous page, but we do not need the latch: release it. */
+
+		prev_block = btr_pcur_get_btr_cur(cursor)->left_block;
+
+		btr_leaf_page_release(prev_block, latch_mode, mtr);
+	}
+
+	cursor->latch_mode = latch_mode;
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Steps the persistent cursor back to the previous record in the tree. When
+there is no previous record, the cursor is left 'before first in tree'.
+@return	TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	if (!btr_pcur_is_before_first_on_page(cursor)) {
+		/* The previous record is on the current page */
+		btr_pcur_move_to_prev_on_page(cursor);
+		return(TRUE);
+	}
+
+	if (btr_pcur_is_before_first_in_tree(cursor, mtr)) {
+		/* Nothing precedes the cursor: it stays where it is */
+		return(FALSE);
+	}
+
+	/* The previous record, if any, is on an earlier page */
+	btr_pcur_move_backward_from_page(cursor, mtr);
+
+	return(TRUE);
+}
+
+/**************************************************************//**
+Opens a persistent cursor on a user record. With mode PAGE_CUR_G or
+PAGE_CUR_GE the cursor is opened on the first user record satisfying the
+search condition, with PAGE_CUR_L or PAGE_CUR_LE on the last one. If no such
+user record exists, the cursor is set after last in tree in the first case
+and before first in tree in the latter case. The latching mode must be
+BTR_SEARCH_LEAF or BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	const dtuple_t*	tuple,		/*!< in: tuple on which search done */
+	ulint		mode,		/*!< in: PAGE_CUR_L, ... */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	btr_pcur_t*	cursor,		/*!< in: memory buffer for persistent
+					cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	btr_pcur_open_func(index, tuple, mode, latch_mode, cursor,
+			   file, line, mtr);
+
+	if (mode != PAGE_CUR_GE && mode != PAGE_CUR_G) {
+		ut_ad(mode == PAGE_CUR_LE || mode == PAGE_CUR_L);
+
+		/* Positioning on the last matching user record is not
+		implemented yet */
+
+		ut_error;
+	} else if (btr_pcur_is_after_last_on_page(cursor)) {
+		/* The search ended after the last record of a page: move
+		on to the next user record, or to after last in tree when
+		there is none */
+		btr_pcur_move_to_next_user_rec(cursor, mtr);
+	}
+}
diff --git a/storage/xtradb/btr/btr0sea.c b/storage/xtradb/btr/btr0sea.c
new file mode 100644
index 00000000000..6628333d32a
--- /dev/null
+++ b/storage/xtradb/btr/btr0sea.c
@@ -0,0 +1,2032 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file btr/btr0sea.c
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "btr0sea.h"
+#ifdef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "ha0ha.h"
+#include "srv0srv.h"
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+UNIV_INTERN char		btr_search_enabled	= TRUE;
+UNIV_INTERN ibool		btr_search_fully_disabled = FALSE;
+/* (btr_search_fully_disabled: set once the hash index has been emptied) */
+/** Mutex protecting btr_search_enabled */
+static mutex_t			btr_search_enabled_mutex;
+
+/** A dummy variable, meant to stay zero, used to fool the compiler */
+UNIV_INTERN ulint		btr_search_this_is_zero = 0;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+UNIV_INTERN ulint		btr_search_n_succ	= 0;
+/** Number of failed adaptive hash index lookups */
+UNIV_INTERN ulint		btr_search_n_hash_fail	= 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** padding to prevent other memory update
+hotspots from residing on the same memory
+cache line as btr_search_latch */
+UNIV_INTERN byte		btr_sea_pad1[64];
+
+/** The latch protecting the adaptive search system: this latch protects the
+(1) positions of records on those pages where a hash index has been built.
+NOTE: It does not protect values of non-ordering fields within a record from
+being updated in-place! We can use fact (1) to perform unique searches on
+indexes. */
+
+/* We will allocate the latch from dynamic memory to get it to the
+same DRAM page as other hotspot semaphores */
+UNIV_INTERN rw_lock_t*		btr_search_latch_temp;
+
+/** padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte		btr_sea_pad2[64];
+
+/** The adaptive hash index */
+UNIV_INTERN btr_search_sys_t*	btr_search_sys;
+
+/** If the number of records on the page divided by this parameter
+would have been successfully accessed using a hash index, the index
+is then built on the page, assuming the global limit has been reached */
+#define BTR_SEARCH_PAGE_BUILD_LIMIT	16
+
+/** The global limit for consecutive potentially successful hash searches,
+before hash index building is started */
+#define BTR_SEARCH_BUILD_LIMIT		100
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+If index is non-NULL, this function checks if n_fields and n_bytes are
+sensible values, and does not build a hash index if not. (Declaration only.) */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+	dict_index_t*	index,	/*!< in: index for which to build, or NULL if
+				not known */
+	buf_block_t*	block,	/*!< in: index page, s- or x-latched */
+	ulint		n_fields,/*!< in: hash this many full fields */
+	ulint		n_bytes,/*!< in: hash this many bytes from the next
+				field */
+	ibool		left_side);/*!< in: hash for searches from left side? */
+
+/*****************************************************************//**
+Makes sure, on a best-effort basis, that the heap of the adaptive hash
+index has a spare buffer frame. Call this before reserving any btr search
+mutex whenever the intended operation might add nodes to the search
+system hash table: because of the latching order, a free frame cannot be
+allocated from the buffer pool once the btr search system latch has been
+reserved. If the heap has no spare frame, one is allocated here. This
+makes it probable, but does not guarantee, that a node allocation made
+later, while the btr search system latch is reserved, will find the
+space it needs. */
+static
+void
+btr_search_check_free_space_in_heap(void)
+/*=====================================*/
+{
+	mem_heap_t*	heap;
+	buf_block_t*	spare;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	heap = btr_search_sys->hash_index->heap;
+
+	/* heap->free_block is peeked without reserving the latch. That
+	is acceptable, because this function does not promise that the
+	hash table will have enough free space. */
+	if (heap->free_block != NULL) {
+		return;
+	}
+
+	spare = buf_block_alloc(0);
+
+	rw_lock_x_lock(&btr_search_latch);
+
+	if (heap->free_block != NULL) {
+		/* A block got installed before we had the latch */
+		buf_block_free(spare);
+	} else {
+		heap->free_block = spare;
+	}
+
+	rw_lock_x_unlock(&btr_search_latch);
+}
+
+/*****************************************************************//**
+Sets up the adaptive search system when the database is started. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+	ulint	hash_size)	/*!< in: hash index hash table size */
+{
+	/* The search latch lives in dynamically allocated memory; the
+	reason is given at the definition of btr_search_latch_temp */
+
+	btr_search_latch_temp = mem_alloc(sizeof(*btr_search_latch_temp));
+
+	rw_lock_create(&btr_search_latch, SYNC_SEARCH_SYS);
+	mutex_create(&btr_search_enabled_mutex, SYNC_SEARCH_SYS_CONF);
+
+	btr_search_sys = mem_alloc(sizeof(*btr_search_sys));
+	/* The hash table of the adaptive hash index, hash_size cells */
+	btr_search_sys->hash_index = ha_create(hash_size, 0, 0);
+}
+
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void)
+/*=====================*/
+{
+	rw_lock_free(&btr_search_latch);
+	mem_free(btr_search_latch_temp);	/* from btr_search_sys_create() */
+	btr_search_latch_temp = NULL;
+	mem_heap_free(btr_search_sys->hash_index->heap); /* heap, then table */
+	hash_table_free(btr_search_sys->hash_index);
+	mem_free(btr_search_sys);
+	btr_search_sys = NULL;		/* the search system is gone now */
+}
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void)
+/*====================*/
+{
+	mutex_enter(&btr_search_enabled_mutex);
+	rw_lock_x_lock(&btr_search_latch);
+
+	/* Disable access to the hash index; this also tells
+	ha_insert_for_fold() to stop adding new nodes to the hash index,
+	while still allowing existing nodes to be updated */
+	btr_search_enabled = FALSE;
+
+	/* Clear all block->is_hashed flags and remove all entries
+	from btr_search_sys->hash_index. */
+	buf_pool_drop_hash_index();
+
+	/* The hash index has been cleaned up: disallow any operation
+	on the hash index */
+	btr_search_fully_disabled = TRUE;
+
+	/* btr_search_enabled_mutex should guarantee this. */
+	ut_ad(!btr_search_enabled);
+
+	rw_lock_x_unlock(&btr_search_latch);
+	mutex_exit(&btr_search_enabled_mutex);
+}
+
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void)
+/*====================*/
+{
+	mutex_enter(&btr_search_enabled_mutex);
+	rw_lock_x_lock(&btr_search_latch);
+	/* Both flags are flipped while holding the mutex and the x-latch */
+	btr_search_enabled = TRUE;
+	btr_search_fully_disabled = FALSE;
+
+	rw_lock_x_unlock(&btr_search_latch);
+	mutex_exit(&btr_search_enabled_mutex);
+}
+
+/*****************************************************************//**
+Allocates a search info struct from a memory heap and initializes it.
+@return	own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	btr_search_t*	new_info;
+
+	new_info = mem_heap_alloc(heap, sizeof(*new_info));
+
+	new_info->ref_count = 0;
+	new_info->root_guess = NULL;
+
+	/* Hash statistics start from scratch: nothing analysed, no
+	potential hash successes counted, last hash search not a success */
+	new_info->hash_analysis = 0;
+	new_info->n_hash_potential = 0;
+	new_info->last_hash_succ = FALSE;
+
+	/* Initial recommendation: one full field, searched from the left */
+	new_info->n_fields = 1;
+	new_info->n_bytes = 0;
+	new_info->left_side = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	new_info->n_hash_succ = 0;
+	new_info->n_hash_fail = 0;
+	new_info->n_patt_succ = 0;
+	new_info->n_searches = 0;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+#ifdef UNIV_DEBUG
+	new_info->magic_n = BTR_SEARCH_MAGIC_N;
+#endif /* UNIV_DEBUG */
+
+	return(new_info);
+}
+
+/*****************************************************************//**
+Reads info->ref_count while holding btr_search_latch in shared mode; that
+latch protects the value. The caller must not own the latch already.
+@return	ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+	btr_search_t*	info)	/*!< in: search info. */
+{
+	ulint	n_refs;
+
+	ut_ad(info);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	rw_lock_s_lock(&btr_search_latch);
+	n_refs = info->ref_count;
+	rw_lock_s_unlock(&btr_search_latch);
+
+	return(n_refs);
+}
+
+/*********************************************************************//**
+Updates the search info of an index about hash successes. NOTE that info
+is NOT protected by any semaphore, to save CPU time! Do not assume its fields
+are consistent. */
+static
+void
+btr_search_info_update_hash(
+/*========================*/
+	btr_search_t*	info,	/*!< in/out: search info */
+	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
+{
+	dict_index_t*	index;
+	ulint		n_unique;
+	int		cmp;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	index = cursor->index;
+
+	if (dict_index_is_ibuf(index)) {
+		/* So many deletes are performed on an insert buffer tree
+		that we do not consider a hash index useful on it: */
+
+		return;
+	}
+
+	n_unique = dict_index_get_n_unique_in_tree(index);
+
+	if (info->n_hash_potential == 0) {
+		/* No potentially successful searches counted so far */
+		goto set_new_recomm;
+	}
+
+	/* Test if the search would have succeeded using the recommended
+	hash prefix */
+
+	if (info->n_fields >= n_unique && cursor->up_match >= n_unique) {
+increment_potential:
+		info->n_hash_potential++;
+
+		return;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->low_match, cursor->low_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto set_new_recomm;
+	}
+
+	cmp = ut_pair_cmp(info->n_fields, info->n_bytes,
+			  cursor->up_match, cursor->up_bytes);
+
+	if (info->left_side ? cmp <= 0 : cmp > 0) {
+
+		goto increment_potential;
+	}
+
+set_new_recomm:
+	/* We have to set a new recommendation; skip the hash analysis
+	for a while to avoid unnecessary CPU time usage when there is no
+	chance for success */
+
+	info->hash_analysis = 0;
+
+	cmp = ut_pair_cmp(cursor->up_match, cursor->up_bytes,
+			  cursor->low_match, cursor->low_bytes);
+	if (cmp == 0) {
+		info->n_hash_potential = 0;
+
+		/* For extra safety, we set some sensible values here */
+
+		info->n_fields = 1;
+		info->n_bytes = 0;
+
+		info->left_side = TRUE;
+
+	} else if (cmp > 0) {	/* presumably: up pair > low pair */
+		info->n_hash_potential = 1;
+
+		if (cursor->up_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+
+		} else if (cursor->low_match < cursor->up_match) {
+
+			info->n_fields = cursor->low_match + 1;
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = cursor->low_match;
+			info->n_bytes = cursor->low_bytes + 1;
+		}
+
+		info->left_side = TRUE;
+	} else {		/* presumably: low pair > up pair */
+		info->n_hash_potential = 1;
+
+		if (cursor->low_match >= n_unique) {
+
+			info->n_fields = n_unique;
+			info->n_bytes = 0;
+
+		} else if (cursor->low_match > cursor->up_match) {
+
+			info->n_fields = cursor->up_match + 1;
+			info->n_bytes = 0;
+		} else {
+			info->n_fields = cursor->up_match;
+			info->n_bytes = cursor->up_bytes + 1;
+		}
+
+		info->left_side = FALSE;
+	}
+}
+
+/*********************************************************************//**
+Updates the block search info on hash successes. NOTE that info and
+block->n_hash_helps, n_fields, n_bytes, side are NOT protected by any
+semaphore, to save CPU time! Do not assume the fields are consistent.
+@return	TRUE if building a (new) hash index on the block is recommended */
+static
+ibool
+btr_search_update_block_hash_info(
+/*==============================*/
+	btr_search_t*	info,	/*!< in: search info */
+	buf_block_t*	block,	/*!< in: buffer block */
+	btr_cur_t*	cursor __attribute__((unused)))
+				/*!< in: cursor */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+	ut_ad(rw_lock_own(&block->lock, RW_LOCK_SHARED)
+	      || rw_lock_own(&block->lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(cursor);
+
+	info->last_hash_succ = FALSE;
+
+	ut_a(buf_block_state_valid(block));
+	ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N);
+	/* Does the block already count helps for this recommendation? */
+	if ((block->n_hash_helps > 0)
+	    && (info->n_hash_potential > 0)
+	    && (block->n_fields == info->n_fields)
+	    && (block->n_bytes == info->n_bytes)
+	    && (block->left_side == info->left_side)) {
+
+		if ((block->is_hashed)
+		    && (block->curr_n_fields == info->n_fields)
+		    && (block->curr_n_bytes == info->n_bytes)
+		    && (block->curr_left_side == info->left_side)) {
+
+			/* The search would presumably have succeeded using
+			the hash index */
+
+			info->last_hash_succ = TRUE;
+		}
+
+		block->n_hash_helps++;
+	} else {
+		block->n_hash_helps = 1;
+		block->n_fields = info->n_fields;
+		block->n_bytes = info->n_bytes;
+		block->left_side = info->left_side;
+	}
+
+#ifdef UNIV_DEBUG
+	if (cursor->index->table->does_not_fit_in_memory) {
+		block->n_hash_helps = 0;
+	}
+#endif /* UNIV_DEBUG */
+	/* Both the per-page and the global limit must have been reached */
+	if ((block->n_hash_helps > page_get_n_recs(block->frame)
+	     / BTR_SEARCH_PAGE_BUILD_LIMIT)
+	    && (info->n_hash_potential >= BTR_SEARCH_BUILD_LIMIT)) {
+
+		if ((!block->is_hashed)
+		    || (block->n_hash_helps
+			> 2 * page_get_n_recs(block->frame))
+		    || (block->n_fields != block->curr_n_fields)
+		    || (block->n_bytes != block->curr_n_bytes)
+		    || (block->left_side != block->curr_left_side)) {
+
+			/* Build a new hash index on the page */
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Updates a hash node reference when it has been unsuccessfully used in a
+search which could have succeeded with the used hash parameters. This can
+happen because when building a hash index for a page, we do not check
+what happens at page boundaries, and therefore there can be misleading
+hash nodes. Also, collisions in the fold value can lead to misleading
+references. This function lazily fixes these imperfections in the hash
+index. */
+static
+void
+btr_search_update_hash_ref(
+/*=======================*/
+	btr_search_t*	info,	/*!< in: search info */
+	buf_block_t*	block,	/*!< in: buffer block where cursor positioned */
+	btr_cur_t*	cursor)	/*!< in: cursor */
+{
+	ulint	fold;
+	rec_t*	rec;
+	dulint	index_id;
+
+	ut_ad(cursor->flag == BTR_CUR_HASH_FAIL);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+	      || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(page_align(btr_cur_get_rec(cursor))
+	      == buf_block_get_frame(block));
+
+	if (!block->is_hashed) {
+
+		return;
+	}
+
+	ut_a(block->index == cursor->index);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	if ((info->n_hash_potential > 0)
+	    && (block->curr_n_fields == info->n_fields)
+	    && (block->curr_n_bytes == info->n_bytes)
+	    && (block->curr_left_side == info->left_side)) {
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		rec_offs_init(offsets_);
+
+		rec = btr_cur_get_rec(cursor);
+
+		if (!page_rec_is_user_rec(rec)) {
+
+			return;
+		}
+		/* Fold rec using the block's curr_n_fields, curr_n_bytes */
+		index_id = cursor->index->id;
+		fold = rec_fold(rec,
+				rec_get_offsets(rec, cursor->index, offsets_,
+						ULINT_UNDEFINED, &heap),
+				block->curr_n_fields,
+				block->curr_n_bytes, index_id);
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+#ifdef UNIV_SYNC_DEBUG
+		ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+		/* presumably makes the hash node for fold refer to rec */
+		ha_insert_for_fold(btr_search_sys->hash_index, fold,
+				   block, rec);
+	}
+}
+
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+	btr_search_t*	info,	/*!< in/out: search info */
+	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
+{
+	buf_block_t*	block;
+	ibool		build_index;
+	ulint*		params;
+	ulint*		params2;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	block = btr_cur_get_block(cursor);
+
+	if (srv_pass_corrupt_table && !block) {
+		return;
+	}
+	ut_a(block);
+
+	/* NOTE that the following two function calls do NOT protect
+	info or block->n_fields etc. with any semaphore, to save CPU time!
+	We cannot assume the fields are consistent when we return from
+	those functions! */
+
+	btr_search_info_update_hash(info, cursor);
+
+	build_index = btr_search_update_block_hash_info(info, block, cursor);
+	/* Both cases below may add hash nodes: check heap space first */
+	if (build_index || (cursor->flag == BTR_CUR_HASH_FAIL)) {
+
+		btr_search_check_free_space_in_heap();
+	}
+
+	if (cursor->flag == BTR_CUR_HASH_FAIL) {
+		/* Update the hash node reference, if appropriate */
+
+#ifdef UNIV_SEARCH_PERF_STAT
+		btr_search_n_hash_fail++;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+		rw_lock_x_lock(&btr_search_latch);
+
+		btr_search_update_hash_ref(info, block, cursor);
+
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+
+	if (build_index) {
+		/* Note that since we did not protect block->n_fields etc.
+		with any semaphore, the values can be inconsistent. We have
+		to check inside the function call that they make sense. We
+		also malloc an array and store the values there to make sure
+		the compiler does not let the function call parameters change
+		inside the called function. It might be that the compiler
+		would optimize the call just to pass pointers to block. */
+
+		params = mem_alloc(3 * sizeof(ulint));
+		params[0] = block->n_fields;
+		params[1] = block->n_bytes;
+		params[2] = block->left_side;
+
+		/* Make sure the compiler cannot deduce the values and do
+		optimizations */
+
+		params2 = params + btr_search_this_is_zero;
+
+		btr_search_build_page_hash_index(cursor->index,
+						 block,
+						 params2[0],
+						 params2[1],
+						 params2[2]);
+		mem_free(params);
+	}
+}
+
+/******************************************************************//**
+Checks if a guessed position for a tree cursor is right. Note that if
+mode is PAGE_CUR_LE, which is used in inserts, and the function returns
+TRUE, then cursor->up_match and cursor->low_match both have sensible values.
+@return	TRUE if the guess was verified; FALSE if wrong or not verifiable */
+static
+ibool
+btr_search_check_guess(
+/*===================*/
+	btr_cur_t*	cursor,	/*!< in/out: guessed cursor position */
+	ibool		can_only_compare_to_cursor_rec,
+				/*!< in: if we do not have a latch on the page
+				of cursor, but only a latch on
+				btr_search_latch, then ONLY the columns
+				of the record UNDER the cursor are
+				protected, not the next or previous record
+				in the chain: we cannot look at the next or
+				previous record to check our guess! */
+	const dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		mode,	/*!< in: PAGE_CUR_L, PAGE_CUR_LE, PAGE_CUR_G,
+				or PAGE_CUR_GE */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	rec_t*		rec;
+	ulint		n_unique;
+	ulint		match;
+	ulint		bytes;
+	int		cmp;	/* result of the latest comparison */
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	ibool		success		= FALSE;
+	rec_offs_init(offsets_);
+
+	n_unique = dict_index_get_n_unique_in_tree(cursor->index);
+
+	rec = btr_cur_get_rec(cursor);
+
+	ut_ad(page_rec_is_user_rec(rec));
+	/* Step 1: compare the tuple to the record under the cursor */
+	match = 0;
+	bytes = 0;
+
+	offsets = rec_get_offsets(rec, cursor->index, offsets,
+				  n_unique, &heap);
+	cmp = page_cmp_dtuple_rec_with_match(tuple, rec,
+					     offsets, &match, &bytes);
+	/* NOTE(review): cmp presumably is 1 if tuple > rec, -1 if tuple < rec */
+	if (mode == PAGE_CUR_GE) {
+		if (cmp == 1) {
+			goto exit_func;
+		}
+
+		cursor->up_match = match;
+
+		if (match >= n_unique) {
+			success = TRUE;
+			goto exit_func;
+		}
+	} else if (mode == PAGE_CUR_LE) {
+		if (cmp == -1) {
+			goto exit_func;
+		}
+
+		cursor->low_match = match;
+
+	} else if (mode == PAGE_CUR_G) {
+		if (cmp != -1) {
+			goto exit_func;
+		}
+	} else if (mode == PAGE_CUR_L) {
+		if (cmp != 1) {
+			goto exit_func;
+		}
+	}
+
+	if (can_only_compare_to_cursor_rec) {
+		/* Since we could not determine if our guess is right just by
+		looking at the record under the cursor, return FALSE */
+		goto exit_func;
+	}
+	/* Step 2: compare the tuple to the neighbouring record */
+	match = 0;
+	bytes = 0;
+
+	if ((mode == PAGE_CUR_G) || (mode == PAGE_CUR_GE)) {
+		rec_t*	prev_rec;
+
+		ut_ad(!page_rec_is_infimum(rec));
+
+		prev_rec = page_rec_get_prev(rec);
+
+		if (page_rec_is_infimum(prev_rec)) {
+			success = btr_page_get_prev(page_align(prev_rec), mtr)
+				== FIL_NULL;
+			/* i.e. success only if the page has no previous page */
+			goto exit_func;
+		}
+
+		offsets = rec_get_offsets(prev_rec, cursor->index, offsets,
+					  n_unique, &heap);
+		cmp = page_cmp_dtuple_rec_with_match(tuple, prev_rec,
+						     offsets, &match, &bytes);
+		if (mode == PAGE_CUR_GE) {
+			success = cmp == 1;
+		} else {
+			success = cmp != -1;
+		}
+
+		goto exit_func;
+	} else {
+		rec_t*	next_rec;
+
+		ut_ad(!page_rec_is_supremum(rec));
+
+		next_rec = page_rec_get_next(rec);
+
+		if (page_rec_is_supremum(next_rec)) {
+			if (btr_page_get_next(page_align(next_rec), mtr)
+			    == FIL_NULL) {
+				/* The page has no next page */
+				cursor->up_match = 0;
+				success = TRUE;
+			}
+
+			goto exit_func;
+		}
+
+		offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+					  n_unique, &heap);
+		cmp = page_cmp_dtuple_rec_with_match(tuple, next_rec,
+						     offsets, &match, &bytes);
+		if (mode == PAGE_CUR_LE) {
+			success = cmp == -1;
+			cursor->up_match = match;
+		} else {
+			success = cmp != 1;
+		}
+	}
+exit_func:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(success);
+}
+
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. If mode is PAGE_CUR_LE (used in inserts) and TRUE is returned,
+cursor->up_match and cursor->low_match both have sensible values. If the
+hash lookup or the guess fails, cursor->flag is set to BTR_CUR_HASH_FAIL.
+@return	TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+	dict_index_t*	index,		/*!< in: index */
+	btr_search_t*	info,		/*!< in: index search info */
+	const dtuple_t*	tuple,		/*!< in: logical record */
+	ulint		mode,		/*!< in: PAGE_CUR_L, ... */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ...;
+					NOTE that only if has_search_latch
+					is 0, we will have a latch set on
+					the cursor page, otherwise we assume
+					the caller uses his search latch
+					to protect the record! */
+	btr_cur_t*	cursor,		/*!< out: tree cursor */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+					currently has on btr_search_latch:
+					RW_S_LATCH, RW_X_LATCH, or 0 */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	buf_block_t*	block;
+	rec_t*		rec;
+	ulint		fold;
+	dulint		index_id;
+#ifdef notdefined
+	btr_cur_t	cursor2;
+	btr_pcur_t	pcur;
+#endif
+	ut_ad(index && info && tuple && cursor && mtr);
+	ut_ad((latch_mode == BTR_SEARCH_LEAF)
+	      || (latch_mode == BTR_MODIFY_LEAF));
+
+	/* Note that, for efficiency, the struct info may not be protected by
+	any latch here! */
+
+	if (UNIV_UNLIKELY(info->n_hash_potential == 0)) {
+		/* Return without touching cursor->flag */
+		return(FALSE);
+	}
+	/* Take the hash prefix parameters from the search info */
+	cursor->n_fields = info->n_fields;
+	cursor->n_bytes = info->n_bytes;
+
+	if (UNIV_UNLIKELY(dtuple_get_n_fields(tuple)
+			  < cursor->n_fields + (cursor->n_bytes > 0))) {
+		/* The tuple has too few fields for this hash prefix */
+		return(FALSE);
+	}
+
+	index_id = index->id;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	info->n_hash_succ++;
+#endif
+	fold = dtuple_fold(tuple, cursor->n_fields, cursor->n_bytes, index_id);
+
+	cursor->fold = fold;
+	cursor->flag = BTR_CUR_HASH;
+	/* Acquire the s-latch here unless the caller already holds a latch */
+	if (UNIV_LIKELY(!has_search_latch)) {
+		rw_lock_s_lock(&btr_search_latch);
+
+		if (UNIV_UNLIKELY(!btr_search_enabled)) {
+			goto failure_unlock;
+		}
+	}
+
+	ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX);
+	ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0);
+	/* Look up the record pointer stored for this fold value */
+	rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+
+	if (UNIV_UNLIKELY(!rec)) {
+		goto failure_unlock;
+	}
+
+	block = buf_block_align(rec);
+
+	if (UNIV_LIKELY(!has_search_latch)) {
+		/* Latch the page via the nowait call; on failure give up */
+		if (UNIV_UNLIKELY(
+			    !buf_page_get_known_nowait(latch_mode, block,
+						       BUF_MAKE_YOUNG,
+						       __FILE__, __LINE__,
+						       mtr))) {
+			goto failure_unlock;
+		}
+		/* The page latch now protects the record */
+		rw_lock_s_unlock(&btr_search_latch);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE_FROM_HASH);
+	}
+
+	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+		ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH);
+
+		if (UNIV_LIKELY(!has_search_latch)) {
+
+			btr_leaf_page_release(block, latch_mode, mtr);
+		}
+
+		goto failure;
+	}
+
+	ut_ad(page_rec_is_user_rec(rec));
+
+	btr_cur_position(index, rec, block, cursor);
+
+	/* Check the validity of the guess within the page */
+
+	/* If we only have the latch on btr_search_latch, not on the
+	page, it only protects the columns of the record the cursor
+	is positioned on. We cannot look at the next or the previous
+	record to determine if our guess for the cursor position is
+	right. */
+	if (UNIV_EXPECT
+	    (ut_dulint_cmp(index_id, btr_page_get_index_id(block->frame)), 0)
+	    || !btr_search_check_guess(cursor,
+				       has_search_latch,
+				       tuple, mode, mtr)) {
+		if (UNIV_LIKELY(!has_search_latch)) {
+			btr_leaf_page_release(block, latch_mode, mtr);
+		}
+
+		goto failure;
+	}
+	/* The guess was right: raise n_hash_potential up to its cap */
+	if (UNIV_LIKELY(info->n_hash_potential < BTR_SEARCH_BUILD_LIMIT + 5)) {
+
+		info->n_hash_potential++;
+	}
+
+#ifdef notdefined
+	/* These lines of code can be used in a debug version to check
+	the correctness of the searched cursor position: */
+
+	info->last_hash_succ = FALSE;
+
+	/* Currently, does not work if the following fails: */
+	ut_ad(!has_search_latch);
+
+	btr_leaf_page_release(block, latch_mode, mtr);
+
+	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+				    &cursor2, 0, mtr);
+	if (mode == PAGE_CUR_GE
+	    && page_rec_is_supremum(btr_cur_get_rec(&cursor2))) {
+
+		/* If mode is PAGE_CUR_GE, then the binary search
+		in the index tree may actually take us to the supremum
+		of the previous page */
+
+		info->last_hash_succ = FALSE;
+
+		btr_pcur_open_on_user_rec(index, tuple, mode, latch_mode,
+					  &pcur, mtr);
+		ut_ad(btr_pcur_get_rec(&pcur) == btr_cur_get_rec(cursor));
+	} else {
+		ut_ad(btr_cur_get_rec(&cursor2) == btr_cur_get_rec(cursor));
+	}
+
+	/* NOTE that it is theoretically possible that the above assertions
+	fail if the page of the cursor gets removed from the buffer pool
+	meanwhile! Thus it might not be a bug. */
+#endif
+	info->last_hash_succ = TRUE;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	btr_search_n_succ++;
+#endif
+	if (UNIV_LIKELY(!has_search_latch)
+	    && buf_page_peek_if_too_old(&block->page)) {
+
+		buf_page_make_young(&block->page);
+	}
+
+	/* Increment the page get statistics though we did not really
+	fix the page: for user info only */
+
+	buf_pool->stat.n_page_gets++;
+
+	return(TRUE);
+
+	/*-------------------------------------------*/
+failure_unlock:
+	if (UNIV_LIKELY(!has_search_latch)) {
+		rw_lock_s_unlock(&btr_search_latch);
+	}
+failure:
+	cursor->flag = BTR_CUR_HASH_FAIL;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	info->n_hash_fail++;
+
+	if (info->n_hash_succ > 0) {
+		info->n_hash_succ--;
+	}
+#endif
+	info->last_hash_succ = FALSE;
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+	buf_block_t*	block)	/*!< in: block containing index page,
+				s- or x-latched, or an index page
+				for which we know that
+				block->buf_fix_count == 0 */
+{
+	hash_table_t*		table;
+	ulint			n_fields;
+	ulint			n_bytes;
+	const page_t*		page;
+	const rec_t*		rec;
+	ulint			fold;
+	ulint			prev_fold;
+	dulint			index_id;
+	ulint			n_cached;
+	ulint			n_recs;
+	ulint*			folds;
+	ulint			i;
+	mem_heap_t*		heap;
+	const dict_index_t*	index;
+	ulint*			offsets;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	/* Phase 1: read the hash parameters under the s-latch */
+retry:
+	rw_lock_s_lock(&btr_search_latch);
+	page = block->frame;
+
+	if (UNIV_LIKELY(!block->is_hashed)) {
+		/* Nothing to drop */
+		rw_lock_s_unlock(&btr_search_latch);
+
+		return;
+	}
+
+	table = btr_search_sys->hash_index;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+	      || rw_lock_own(&(block->lock), RW_LOCK_EX)
+	      || (block->page.buf_fix_count == 0));
+#endif /* UNIV_SYNC_DEBUG */
+
+	n_fields = block->curr_n_fields;
+	n_bytes = block->curr_n_bytes;
+	index = block->index;
+	ut_a(!dict_index_is_ibuf(index));
+
+	/* NOTE: The fields of block must not be accessed after
+	releasing btr_search_latch, as the index page might only
+	be s-latched! */
+
+	rw_lock_s_unlock(&btr_search_latch);
+
+	ut_a(n_fields + n_bytes > 0);
+	/* Phase 2: compute the fold values without holding the latch */
+	n_recs = page_get_n_recs(page);
+
+	/* Calculate and cache fold values into an array for fast deletion
+	from the hash index */
+
+	folds = mem_alloc(n_recs * sizeof(ulint));
+
+	n_cached = 0;
+	/* Start from the record that follows the infimum */
+	rec = page_get_infimum_rec(page);
+	rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+	index_id = btr_page_get_index_id(page);
+
+	ut_a(0 == ut_dulint_cmp(index_id, index->id));
+
+	prev_fold = 0;
+
+	heap = NULL;
+	offsets = NULL;
+
+	while (!page_rec_is_supremum(rec)) {
+		offsets = rec_get_offsets(rec, index, offsets,
+					  n_fields + (n_bytes > 0), &heap);
+		ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
+		fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+		if (fold == prev_fold && prev_fold != 0) {
+			/* Same fold as the previous record: cache it once */
+			goto next_rec;
+		}
+
+		/* Cache the fold value; the hash nodes pointing to this
+		page are removed later, while holding the x-latch */
+
+		folds[n_cached] = fold;
+		n_cached++;
+next_rec:
+		rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+		prev_fold = fold;
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	/* Phase 3: re-check the state under the x-latch, then remove */
+	rw_lock_x_lock(&btr_search_latch);
+
+	if (UNIV_UNLIKELY(!block->is_hashed)) {
+		/* Someone else has meanwhile dropped the hash index */
+
+		goto cleanup;
+	}
+
+	ut_a(block->index == index);
+
+	if (UNIV_UNLIKELY(block->curr_n_fields != n_fields)
+	    || UNIV_UNLIKELY(block->curr_n_bytes != n_bytes)) {
+
+		/* Someone else has meanwhile built a new hash index on the
+		page, with different parameters */
+
+		rw_lock_x_unlock(&btr_search_latch);
+
+		mem_free(folds);
+		goto retry;
+	}
+
+	for (i = 0; i < n_cached; i++) {
+
+		ha_remove_all_nodes_to_page(table, folds[i], page);
+	}
+
+	ut_a(index->search_info->ref_count > 0);
+	index->search_info->ref_count--;
+
+	block->is_hashed = FALSE;
+	block->index = NULL;
+
+cleanup:
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	if (UNIV_UNLIKELY(block->n_pointers)) {
+		/* Corruption */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Corruption of adaptive hash index."
+			" After dropping\n"
+			"InnoDB: the hash index to a page of %s,"
+			" still %lu hash nodes remain.\n",
+			index->name, (ulong) block->n_pointers);
+		rw_lock_x_unlock(&btr_search_latch);
+
+		btr_search_validate();
+	} else {
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	rw_lock_x_unlock(&btr_search_latch);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	mem_free(folds);
+}
+
+/********************************************************************//**
+Drops the adaptive hash index entries of every buffer pool page that is
+hashed for the given index. Scans buf_pool->LRU from its last element while
+holding btr_search_latch in x-mode and LRU_list_mutex. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index_on_index(
+/*=====================================*/
+	dict_index_t*	index)		/*!< in: index whose hashed pages to drop */
+{
+	buf_page_t*	bpage;
+	hash_table_t*	table;
+	buf_block_t*	block;
+	ulint		n_fields;
+	ulint		n_bytes;
+	const page_t*	page;
+	const rec_t*	rec;
+	ulint		fold;
+	ulint		prev_fold;
+	dulint		index_id;
+	ulint		n_cached;
+	ulint		n_recs;
+	ulint*		folds;
+	ulint		i;
+	mem_heap_t*	heap	= NULL;
+	ulint*		offsets;
+
+	rw_lock_x_lock(&btr_search_latch);
+	mutex_enter(&LRU_list_mutex);
+
+	table = btr_search_sys->hash_index;
+
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (bpage != NULL) {
+		block = (buf_block_t*) bpage;
+		/* The LRU list links buf_page_t descriptors; only those in
+		state BUF_BLOCK_FILE_PAGE may be treated as buf_block_t. */
+		if (buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE
+		    && block->index == index && block->is_hashed) {
+			page = block->frame;
+
+			/* from btr_search_drop_page_hash_index() */
+			n_fields = block->curr_n_fields;
+			n_bytes = block->curr_n_bytes;
+
+			ut_a(n_fields + n_bytes > 0);
+
+			n_recs = page_get_n_recs(page);
+
+			/* Calculate and cache fold values into an array for
+			fast deletion from the hash index */
+			folds = mem_alloc(n_recs * sizeof(ulint));
+			n_cached = 0;
+
+			rec = page_get_infimum_rec(page);
+			rec = page_rec_get_next_low(rec, page_is_comp(page));
+
+			index_id = btr_page_get_index_id(page);
+			ut_a(0 == ut_dulint_cmp(index_id, index->id));
+
+			prev_fold = 0;
+			offsets = NULL;
+
+			while (!page_rec_is_supremum(rec)) {
+				offsets = rec_get_offsets(rec, index, offsets,
+							n_fields + (n_bytes > 0), &heap);
+				ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
+				fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+				if (fold == prev_fold && prev_fold != 0) {
+					goto next_rec;
+				}
+
+				/* Cache the fold value; the hash nodes are
+				removed from the hash chain after the scan */
+
+				folds[n_cached] = fold;
+				n_cached++;
+next_rec:
+				rec = page_rec_get_next_low(rec, page_rec_is_comp(rec));
+				prev_fold = fold;
+			}
+
+			for (i = 0; i < n_cached; i++) {
+
+				ha_remove_all_nodes_to_page(table, folds[i], page);
+			}
+
+			ut_a(index->search_info->ref_count > 0);
+			index->search_info->ref_count--;
+
+			block->is_hashed = FALSE;
+			block->index = NULL;
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+			if (UNIV_UNLIKELY(block->n_pointers)) {
+				/* Corruption */
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+"  InnoDB: Corruption of adaptive hash index. After dropping\n"
+"InnoDB: the hash index to a page of %s, still %lu hash nodes remain.\n",
+					index->name, (ulong) block->n_pointers);
+			}
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+			mem_free(folds);
+		}
+
+		bpage = UT_LIST_GET_PREV(LRU, bpage);
+	}
+
+	mutex_exit(&LRU_list_mutex);
+	rw_lock_x_unlock(&btr_search_latch);
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/********************************************************************//**
+Removes the adaptive hash index entries of a page that is being freed from
+a fseg to the file system. Nothing is done unless the page happens to be
+in the buffer pool and is reported as hashed. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no)	/*!< in: page number */
+{
+	mtr_t		local_mtr;
+	buf_block_t*	hashed_block;
+
+	if (buf_page_peek_if_search_hashed(space, page_no)) {
+		mtr_start(&local_mtr);
+
+		/* A caller that holds a latch on the page is assumed to
+		have dropped the hash index of the page already, so that
+		we never get here in that case. Hence the s-latch can be
+		acquired below without having to fear a deadlock. */
+
+		hashed_block = buf_page_get_gen(space, zip_size, page_no,
+						RW_S_LATCH, NULL,
+						BUF_GET_IF_IN_POOL,
+						__FILE__, __LINE__,
+						&local_mtr);
+
+		/* buf_page_peek_if_search_hashed() released the buffer
+		pool mutex, so another thread may have removed the block
+		from the buffer pool meanwhile: NULL must be tolerated. */
+
+		if (UNIV_LIKELY(hashed_block != NULL)) {
+
+			buf_block_dbg_add_level(hashed_block,
+						SYNC_TREE_NODE_FROM_HASH);
+
+			btr_search_drop_page_hash_index(hashed_block);
+		}
+
+		mtr_commit(&local_mtr);
+	}
+}
+
+/********************************************************************//**
+Builds a hash index on a page with the given parameters. If the page already
+has a hash index with different parameters, the old hash index is removed.
+The values of n_fields and n_bytes are checked against the index, and no
+hash index is built if they are not sensible. */
+static
+void
+btr_search_build_page_hash_index(
+/*=============================*/
+	dict_index_t*	index,	/*!< in: index for which to build */
+	buf_block_t*	block,	/*!< in: index page, s- or x-latched */
+	ulint		n_fields,/*!< in: hash this many full fields */
+	ulint		n_bytes,/*!< in: hash this many bytes from the next
+				field */
+	ibool		left_side)/*!< in: hash for searches from left side? */
+{
+	hash_table_t*	table;
+	page_t*		page;
+	rec_t*		rec;
+	rec_t*		next_rec;
+	ulint		fold;
+	ulint		next_fold;
+	dulint		index_id;
+	ulint		n_cached;
+	ulint		n_recs;
+	ulint*		folds;
+	rec_t**		recs;
+	ulint		i;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(index);
+	ut_a(!dict_index_is_ibuf(index));
+
+	table = btr_search_sys->hash_index;
+	page = buf_block_get_frame(block);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+	      || rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	/* Drop an existing hash index that has different parameters */
+	rw_lock_s_lock(&btr_search_latch);
+
+	if (block->is_hashed && ((block->curr_n_fields != n_fields)
+				 || (block->curr_n_bytes != n_bytes)
+				 || (block->curr_left_side != left_side))) {
+
+		rw_lock_s_unlock(&btr_search_latch);
+
+		btr_search_drop_page_hash_index(block);
+	} else {
+		rw_lock_s_unlock(&btr_search_latch);
+	}
+
+	n_recs = page_get_n_recs(page);
+
+	if (n_recs == 0) {
+
+		return;
+	}
+
+	/* Check that the values for hash index build are sensible */
+
+	if (n_fields + n_bytes == 0) {
+
+		return;
+	}
+
+	if (dict_index_get_n_unique_in_tree(index) < n_fields
+	    || (dict_index_get_n_unique_in_tree(index) == n_fields
+		&& n_bytes > 0)) {
+		return;
+	}
+
+	/* Calculate and cache fold values and corresponding records into
+	an array for fast insertion to the hash index */
+
+	folds = mem_alloc(n_recs * sizeof(ulint));
+	recs = mem_alloc(n_recs * sizeof(rec_t*));
+
+	n_cached = 0;
+
+	index_id = btr_page_get_index_id(page);
+	/* Begin with the record that follows the infimum */
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  n_fields + (n_bytes > 0), &heap);
+
+	if (!page_rec_is_supremum(rec)) {
+		ut_a(n_fields <= rec_offs_n_fields(offsets));
+
+		if (n_bytes > 0) {
+			ut_a(n_fields < rec_offs_n_fields(offsets));
+		}
+	}
+
+	fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+
+	if (left_side) {
+		/* Left side: cache the first record of each fold run */
+		folds[n_cached] = fold;
+		recs[n_cached] = rec;
+		n_cached++;
+	}
+
+	for (;;) {
+		next_rec = page_rec_get_next(rec);
+
+		if (page_rec_is_supremum(next_rec)) {
+			/* Right side: cache the last record of the page */
+			if (!left_side) {
+
+				folds[n_cached] = fold;
+				recs[n_cached] = rec;
+				n_cached++;
+			}
+
+			break;
+		}
+
+		offsets = rec_get_offsets(next_rec, index, offsets,
+					  n_fields + (n_bytes > 0), &heap);
+		next_fold = rec_fold(next_rec, offsets, n_fields,
+				     n_bytes, index_id);
+
+		if (fold != next_fold) {
+			/* Cache an entry to be inserted into the hash index */
+
+			if (left_side) {
+
+				folds[n_cached] = next_fold;
+				recs[n_cached] = next_rec;
+				n_cached++;
+			} else {
+				folds[n_cached] = fold;
+				recs[n_cached] = rec;
+				n_cached++;
+			}
+		}
+
+		rec = next_rec;
+		fold = next_fold;
+	}
+
+	btr_search_check_free_space_in_heap();
+	/* Insert the cached entries while holding the x-latch */
+	rw_lock_x_lock(&btr_search_latch);
+
+	if (UNIV_UNLIKELY(btr_search_fully_disabled)) {
+		goto exit_func;
+	}
+	/* Re-check: the page may have been hashed differently meanwhile */
+	if (block->is_hashed && ((block->curr_n_fields != n_fields)
+				 || (block->curr_n_bytes != n_bytes)
+				 || (block->curr_left_side != left_side))) {
+		goto exit_func;
+	}
+
+	/* This counter is decremented every time we drop page
+	hash index entries and is incremented here. Since we can
+	rebuild hash index for a page that is already hashed, we
+	have to take care not to increment the counter in that
+	case. */
+	if (!block->is_hashed) {
+		index->search_info->ref_count++;
+	}
+
+	block->is_hashed = TRUE;
+	block->n_hash_helps = 0;
+
+	block->curr_n_fields = n_fields;
+	block->curr_n_bytes = n_bytes;
+	block->curr_left_side = left_side;
+	block->index = index;
+
+	for (i = 0; i < n_cached; i++) {
+
+		ha_insert_for_fold(table, folds[i], block, recs[i]);
+	}
+
+exit_func:
+	rw_lock_x_unlock(&btr_search_latch);
+
+	mem_free(folds);
+	mem_free(recs);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/********************************************************************//**
+Moves or deletes hash entries after records have been copied from one page
+to another. If new_block is already hashed, the hash index of block, if
+any, is dropped. Otherwise, if block is hashed, a hash index is built on
+new_block with the same parameters as block (this often happens when a
+page is split). If neither page is hashed, nothing is changed. */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+	buf_block_t*	new_block,	/*!< in: records are copied
+					to this page */
+	buf_block_t*	block,		/*!< in: index page from which
+					records were copied, and the
+					copied records will be deleted
+					from this page */
+	dict_index_t*	index)		/*!< in: record descriptor */
+{
+	ulint	src_n_fields;
+	ulint	src_n_bytes;
+	ibool	src_left_side;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+	ut_ad(rw_lock_own(&(new_block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(!new_block->is_hashed || new_block->index == index);
+	ut_a(!block->is_hashed || block->index == index);
+	ut_a(!(new_block->is_hashed || block->is_hashed)
+	     || !dict_index_is_ibuf(index));
+
+	rw_lock_s_lock(&btr_search_latch);
+
+	if (new_block->is_hashed) {
+		/* The destination is hashed already: drop the source */
+		rw_lock_s_unlock(&btr_search_latch);
+		btr_search_drop_page_hash_index(block);
+		return;
+	}
+
+	if (!block->is_hashed) {
+		/* Neither page is hashed */
+		rw_lock_s_unlock(&btr_search_latch);
+		return;
+	}
+
+	/* Read the hash parameters of the source page under the s-latch
+	and store them in the corresponding fields of the destination */
+	src_n_fields = block->curr_n_fields;
+	src_n_bytes = block->curr_n_bytes;
+	src_left_side = block->curr_left_side;
+
+	new_block->n_fields = src_n_fields;
+	new_block->n_bytes = src_n_bytes;
+	new_block->left_side = src_left_side;
+
+	rw_lock_s_unlock(&btr_search_latch);
+
+	ut_a(src_n_fields + src_n_bytes > 0);
+	btr_search_build_page_hash_index(index, new_block, src_n_fields,
+					 src_n_bytes, src_left_side);
+	ut_ad(src_n_fields == block->curr_n_fields);
+	ut_ad(src_n_bytes == block->curr_n_bytes);
+	ut_ad(src_left_side == block->curr_left_side);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned on the
+				record to delete using btr_cur_search_...,
+				the record is not yet deleted */
+{
+	hash_table_t*	table;
+	buf_block_t*	block;
+	rec_t*		rec;
+	ulint		fold;
+	dulint		index_id;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	mem_heap_t*	heap		= NULL;
+	rec_offs_init(offsets_);
+
+	rec = btr_cur_get_rec(cursor);
+
+	block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!block->is_hashed) {
+		/* The page has no hash index: nothing to update */
+		return;
+	}
+
+	ut_a(block->index == cursor->index);
+	ut_a(block->curr_n_fields + block->curr_n_bytes > 0);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	table = btr_search_sys->hash_index;
+	/* Fold the record with the parameters the page is hashed with */
+	index_id = cursor->index->id;
+	fold = rec_fold(rec, rec_get_offsets(rec, cursor->index, offsets_,
+					     ULINT_UNDEFINED, &heap),
+			block->curr_n_fields, block->curr_n_bytes, index_id);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	rw_lock_x_lock(&btr_search_latch);
+
+	/* The result (whether a matching node existed) is not needed. */
+	ha_search_and_delete_if_found(table, fold, rec);
+
+	rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+{
+	rec_t*		cur_rec		= btr_cur_get_rec(cursor);
+	buf_block_t*	cur_block	= btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(cur_block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!cur_block->is_hashed) {
+		/* The page has no hash index: nothing to update */
+		return;
+	}
+
+	ut_a(cur_block->index == cursor->index);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	rw_lock_x_lock(&btr_search_latch);
+
+	/* The existing node can be updated in place only when the cursor
+	flag is BTR_CUR_HASH, the cursor and the page agree on n_fields
+	and n_bytes, and the page is not hashed from the left side. */
+	if (cursor->flag != BTR_CUR_HASH
+	    || cursor->n_fields != cur_block->curr_n_fields
+	    || cursor->n_bytes != cur_block->curr_n_bytes
+	    || cur_block->curr_left_side) {
+
+		rw_lock_x_unlock(&btr_search_latch);
+
+		/* Otherwise fall back to the general update routine */
+		btr_search_update_hash_on_insert(cursor);
+
+		return;
+	}
+
+	ha_search_and_update_if_found(btr_search_sys->hash_index,
+				      cursor->fold, cur_rec, cur_block,
+				      page_rec_get_next(cur_rec));
+
+	rw_lock_x_unlock(&btr_search_latch);
+}
+
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+	btr_cur_t*	cursor)	/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+{
+	hash_table_t*	table;
+	buf_block_t*	block;
+	rec_t*		rec;
+	rec_t*		ins_rec;
+	rec_t*		next_rec;
+	dulint		index_id;
+	ulint		fold;
+	ulint		ins_fold;
+	ulint		next_fold = 0; /* only read if next_rec is a user rec */
+	ulint		n_fields;
+	ulint		n_bytes;
+	ibool		left_side;
+	ibool		locked		= FALSE; /* TRUE once x-latch is held */
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	table = btr_search_sys->hash_index;
+
+	btr_search_check_free_space_in_heap();
+
+	rec = btr_cur_get_rec(cursor);
+
+	block = btr_cur_get_block(cursor);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!block->is_hashed) {
+		/* The page has no hash index: nothing to update */
+		return;
+	}
+
+	ut_a(block->index == cursor->index);
+	ut_a(!dict_index_is_ibuf(cursor->index));
+
+	index_id = cursor->index->id;
+
+	n_fields = block->curr_n_fields;
+	n_bytes = block->curr_n_bytes;
+	left_side = block->curr_left_side;
+	/* rec is followed by the inserted record and then by next_rec */
+	ins_rec = page_rec_get_next(rec);
+	next_rec = page_rec_get_next(ins_rec);
+
+	offsets = rec_get_offsets(ins_rec, cursor->index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	ins_fold = rec_fold(ins_rec, offsets, n_fields, n_bytes, index_id);
+
+	if (!page_rec_is_supremum(next_rec)) {
+		offsets = rec_get_offsets(next_rec, cursor->index, offsets,
+					  n_fields + (n_bytes > 0), &heap);
+		next_fold = rec_fold(next_rec, offsets, n_fields,
+				     n_bytes, index_id);
+	}
+
+	if (!page_rec_is_infimum(rec)) {
+		offsets = rec_get_offsets(rec, cursor->index, offsets,
+					  n_fields + (n_bytes > 0), &heap);
+		fold = rec_fold(rec, offsets, n_fields, n_bytes, index_id);
+	} else {
+		if (left_side) {
+			/* The inserted record follows the infimum */
+			rw_lock_x_lock(&btr_search_latch);
+
+			locked = TRUE;
+
+			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+		}
+
+		goto check_next_rec;
+	}
+	/* The fold value changes between rec and the inserted record */
+	if (fold != ins_fold) {
+
+		if (!locked) {
+
+			rw_lock_x_lock(&btr_search_latch);
+
+			locked = TRUE;
+		}
+
+		if (!left_side) {
+			ha_insert_for_fold(table, fold, block, rec);
+		} else {
+			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+		}
+	}
+
+check_next_rec:
+	if (page_rec_is_supremum(next_rec)) {
+		/* The inserted record is followed by the supremum */
+		if (!left_side) {
+
+			if (!locked) {
+				rw_lock_x_lock(&btr_search_latch);
+
+				locked = TRUE;
+			}
+
+			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+		}
+
+		goto function_exit;
+	}
+	/* The fold value changes between the inserted record and next_rec */
+	if (ins_fold != next_fold) {
+
+		if (!locked) {
+
+			rw_lock_x_lock(&btr_search_latch);
+
+			locked = TRUE;
+		}
+
+		if (!left_side) {
+
+			ha_insert_for_fold(table, ins_fold, block, ins_rec);
+			/*
+			fputs("Hash insert for ", stderr);
+			dict_index_name_print(stderr, cursor->index);
+			fprintf(stderr, " fold %lu\n", ins_fold);
+			*/
+		} else {
+			ha_insert_for_fold(table, next_fold, block, next_rec);
+		}
+	}
+
+function_exit:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	if (locked) {
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void)
+/*=====================*/
+{
+	ha_node_t*	node;
+	ulint		n_page_dumps	= 0;	/* pages printed so far */
+	ibool		ok		= TRUE;
+	ulint		i;
+	ulint		cell_count;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+
+	/* How many cells to check before temporarily releasing
+	btr_search_latch. */
+	ulint		chunk_size = 10000;
+
+	rec_offs_init(offsets_);
+
+	rw_lock_x_lock(&btr_search_latch);
+	/* buf_pool_mutex_enter(); */
+	rw_lock_x_lock(&page_hash_latch);
+
+	cell_count = hash_get_n_cells(btr_search_sys->hash_index);
+	/* Pass 1: check every node in every cell of the hash table */
+	for (i = 0; i < cell_count; i++) {
+		/* We release btr_search_latch every once in a while to
+		give other queries a chance to run. */
+		if ((i != 0) && ((i % chunk_size) == 0)) {
+			/* buf_pool_mutex_exit(); */
+			rw_lock_x_unlock(&page_hash_latch);
+			rw_lock_x_unlock(&btr_search_latch);
+			os_thread_yield();
+			rw_lock_x_lock(&btr_search_latch);
+			/* buf_pool_mutex_enter(); */
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		node = hash_get_nth_cell(btr_search_sys->hash_index, i)->node;
+
+		for (; node != NULL; node = node->next) {
+			const buf_block_t*	block
+				= buf_block_align(node->data);
+			const buf_block_t*	hash_block;
+
+			if (UNIV_LIKELY(buf_block_get_state(block)
+					== BUF_BLOCK_FILE_PAGE)) {
+
+				/* The space and offset are only valid
+				for file blocks.  It is possible that
+				the block is being freed
+				(BUF_BLOCK_REMOVE_HASH, see the
+				assertion and the comment below) */
+				hash_block = buf_block_hash_get(
+					buf_block_get_space(block),
+					buf_block_get_page_no(block));
+			} else {
+				hash_block = NULL;
+			}
+
+			if (hash_block) {
+				ut_a(hash_block == block);
+			} else {
+				/* When a block is being freed,
+				buf_LRU_search_and_free_block() first
+				removes the block from
+				buf_pool->page_hash by calling
+				buf_LRU_block_remove_hashed_page().
+				After that, it invokes
+				btr_search_drop_page_hash_index() to
+				remove the block from
+				btr_search_sys->hash_index. */
+
+				ut_a(buf_block_get_state(block)
+				     == BUF_BLOCK_REMOVE_HASH);
+			}
+
+			ut_a(!dict_index_is_ibuf(block->index));
+
+			offsets = rec_get_offsets((const rec_t*) node->data,
+						  block->index, offsets,
+						  block->curr_n_fields
+						  + (block->curr_n_bytes > 0),
+						  &heap);
+			/* The fold in the node must equal the record's fold */
+			if (!block->is_hashed || node->fold
+			    != rec_fold((rec_t*)(node->data),
+					offsets,
+					block->curr_n_fields,
+					block->curr_n_bytes,
+					btr_page_get_index_id(block->frame))) {
+				const page_t*	page = block->frame;
+
+				ok = FALSE;
+				ut_print_timestamp(stderr);
+
+				fprintf(stderr,
+					"  InnoDB: Error in an adaptive hash"
+					" index pointer to page %lu\n"
+					"InnoDB: ptr mem address %p"
+					" index id %lu %lu,"
+					" node fold %lu, rec fold %lu\n",
+					(ulong) page_get_page_no(page),
+					node->data,
+					(ulong) ut_dulint_get_high(
+						btr_page_get_index_id(page)),
+					(ulong) ut_dulint_get_low(
+						btr_page_get_index_id(page)),
+					(ulong) node->fold,
+					(ulong) rec_fold((rec_t*)(node->data),
+							 offsets,
+							 block->curr_n_fields,
+							 block->curr_n_bytes,
+							 btr_page_get_index_id(
+								 page)));
+
+				fputs("InnoDB: Record ", stderr);
+				rec_print_new(stderr, (rec_t*)node->data,
+					      offsets);
+				fprintf(stderr, "\nInnoDB: on that page."
+					" Page mem address %p, is hashed %lu,"
+					" n fields %lu, n bytes %lu\n"
+					"InnoDB: side %lu\n",
+					(void*) page, (ulong) block->is_hashed,
+					(ulong) block->curr_n_fields,
+					(ulong) block->curr_n_bytes,
+					(ulong) block->curr_left_side);
+				/* Print at most 20 pages in full */
+				if (n_page_dumps < 20) {
+					buf_page_print(page, 0);
+					n_page_dumps++;
+				}
+			}
+		}
+	}
+	/* Pass 2: run ha_validate() on the table, one chunk at a time */
+	for (i = 0; i < cell_count; i += chunk_size) {
+		ulint end_index = ut_min(i + chunk_size - 1, cell_count - 1);
+
+		/* We release btr_search_latch every once in a while to
+		give other queries a chance to run. */
+		if (i != 0) {
+			/* buf_pool_mutex_exit(); */
+			rw_lock_x_unlock(&page_hash_latch);
+			rw_lock_x_unlock(&btr_search_latch);
+			os_thread_yield();
+			rw_lock_x_lock(&btr_search_latch);
+			/* buf_pool_mutex_enter(); */
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		if (!ha_validate(btr_search_sys->hash_index, i, end_index)) {
+			ok = FALSE;
+		}
+	}
+
+	/* buf_pool_mutex_exit(); */
+	rw_lock_x_unlock(&page_hash_latch);
+	rw_lock_x_unlock(&btr_search_latch);
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
diff --git a/storage/xtradb/buf/buf0buddy.c b/storage/xtradb/buf/buf0buddy.c
new file mode 100644
index 00000000000..e6b80bcda55
--- /dev/null
+++ b/storage/xtradb/buf/buf0buddy.c
@@ -0,0 +1,804 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buddy.c
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "buf0buddy.h"
+#ifdef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+#undef THIS_MODULE
+#include "buf0buf.h"
+#include "buf0lru.h"
+#include "buf0flu.h"
+#include "page0zip.h"
+
+/* Statistic counters */
+
+#ifdef UNIV_DEBUG
+/** Number of frames allocated from the buffer pool to the buddy system.
+Protected by buf_pool_mutex. */
+static ulint buf_buddy_n_frames;
+#endif /* UNIV_DEBUG */
+/** Statistics of the buddy system, indexed by block size.
+Protected by zip_free_mutex. */
+UNIV_INTERN buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+
+/**********************************************************************//**
+Locate the buddy of a compressed page frame, that is, the adjacent block
+of the same size whose address differs from page only in the "size" bit.
+@return	address of the buddy block of page */
+UNIV_INLINE
+byte*
+buf_buddy_get(
+/*==========*/
+	byte*	page,	/*!< in: compressed page */
+	ulint	size)	/*!< in: page size in bytes */
+{
+	ut_ad(ut_is_2pow(size));
+	ut_ad(size >= BUF_BUDDY_LOW);
+	ut_ad(size < BUF_BUDDY_HIGH);
+	ut_ad(!ut_align_offset(page, size));
+
+	/* If the "size" bit of the address is set, the buddy lies
+	directly below page; otherwise it lies directly above. */
+	return((((ulint) page) & size) ? page - size : page + size);
+}
+
+/**********************************************************************//**
+Put a block at the head of buf_pool->zip_free[i].  The caller must hold
+zip_free_mutex (asserted), and the block must already be in state
+BUF_BLOCK_ZIP_FREE (asserted). */
+UNIV_INLINE
+void
+buf_buddy_add_to_free(
+/*==================*/
+	buf_page_t*	bpage,	/*!< in,own: block to be freed */
+	ulint		i)	/*!< in: index of buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG_VALGRIND
+	const ulint	size	 = BUF_BUDDY_LOW << i;
+	buf_page_t*	old_head = UT_LIST_GET_FIRST(buf_pool->zip_free[i]);
+
+	/* The list insertion below writes to the current head node. */
+	if (old_head) {
+		UNIV_MEM_VALID(old_head, size);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	ut_ad(mutex_own(&zip_free_mutex));
+	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+	ut_ad(buf_pool->zip_free[i].start != bpage);
+	UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_free[i], bpage);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	if (old_head) {
+		UNIV_MEM_FREE(old_head, size);
+	}
+	UNIV_MEM_ASSERT_AND_FREE(bpage, size);
+#endif /* UNIV_DEBUG_VALGRIND */
+}
+
+/**********************************************************************//**
+Unlink a block from buf_pool->zip_free[i].  The caller must hold
+zip_free_mutex (asserted), and the block must be in state
+BUF_BLOCK_ZIP_FREE (asserted). */
+UNIV_INLINE
+void
+buf_buddy_remove_from_free(
+/*=======================*/
+	buf_page_t*	bpage,	/*!< in: block to be removed */
+	ulint		i)	/*!< in: index of buf_pool->zip_free[] */
+{
+#ifdef UNIV_DEBUG_VALGRIND
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	buf_page_t*	before	= UT_LIST_GET_PREV(zip_list, bpage);
+	buf_page_t*	after	= UT_LIST_GET_NEXT(zip_list, bpage);
+
+	/* The list removal below writes to both neighbours. */
+	if (before) {
+		UNIV_MEM_VALID(before, size);
+	}
+	if (after) {
+		UNIV_MEM_VALID(after, size);
+	}
+
+	ut_ad(!before || buf_page_get_state(before) == BUF_BLOCK_ZIP_FREE);
+	ut_ad(!after || buf_page_get_state(after) == BUF_BLOCK_ZIP_FREE);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	ut_ad(mutex_own(&zip_free_mutex));
+	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+	UT_LIST_REMOVE(zip_list, buf_pool->zip_free[i], bpage);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	if (before) {
+		UNIV_MEM_FREE(before, size);
+	}
+	if (after) {
+		UNIV_MEM_FREE(after, size);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+}
+
+/**********************************************************************//**
+Try to allocate a block from buf_pool->zip_free[].  If zip_free[i] is
+empty, a block is taken recursively from zip_free[i + 1] and split: its
+lower half is returned and its upper half is put on zip_free[i].
+The caller must hold zip_free_mutex (asserted).
+@return	allocated block, or NULL if buf_pool->zip_free[] was empty */
+static
+void*
+buf_buddy_alloc_zip(
+/*================*/
+	ulint	i)	/*!< in: index of buf_pool->zip_free[] */
+{
+	buf_page_t*	bpage;
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&zip_free_mutex));
+	ut_a(i < BUF_BUDDY_SIZES);
+
+#ifndef UNIV_DEBUG_VALGRIND
+	/* Valgrind would complain about accessing free memory. */
+	ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+			      ut_ad(buf_page_get_state(ut_list_node_313)
+				    == BUF_BLOCK_ZIP_FREE)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+	bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
+
+	if (bpage) {
+		UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+
+		buf_buddy_remove_from_free(bpage, i);
+	} else if (i + 1 < BUF_BUDDY_SIZES) {
+		/* Attempt to split. */
+		bpage = buf_buddy_alloc_zip(i + 1);
+
+		if (bpage) {
+			/* The upper half of the larger block becomes a
+			free block of size class i. */
+			buf_page_t*	buddy = (buf_page_t*)
+				(((char*) bpage) + (BUF_BUDDY_LOW << i));
+
+			ut_ad(!buf_pool_contains_zip(buddy));
+			ut_d(memset(buddy, i, BUF_BUDDY_LOW << i));
+			buddy->state = BUF_BLOCK_ZIP_FREE;
+			buf_buddy_add_to_free(buddy, i);
+		}
+	}
+
+	if (bpage) {
+		/* In debug builds, overwrite the block with a pattern
+		derived from its size class. */
+		ut_d(memset(bpage, ~i, BUF_BUDDY_LOW << i));
+
+		/* Annotate the returned block for memory checking.  The
+		block is BUF_BUDDY_LOW << i bytes long (the same length
+		that is used everywhere else in this function), not
+		BUF_BUDDY_SIZES << i, and there is nothing to annotate
+		when no block was found. */
+		UNIV_MEM_ALLOC(bpage, BUF_BUDDY_LOW << i);
+	}
+
+	return(bpage);
+}
+
+/**********************************************************************//**
+Deallocate a buffer frame of UNIV_PAGE_SIZE.  The control block owning
+the frame is looked up in buf_pool->zip_hash and removed from it (under
+zip_hash_mutex), and is then handed to buf_LRU_block_free_non_file_page()
+while holding its block mutex.  The caller must not hold
+buf_pool_zip_mutex (asserted). */
+static
+void
+buf_buddy_block_free(
+/*=================*/
+	void*	buf,	/*!< in: buffer frame to deallocate */
+	ibool	have_page_hash_mutex)	/*!< in: passed through to
+					buf_LRU_block_free_non_file_page() */
+{
+	const ulint	fold	= BUF_POOL_ZIP_FOLD_PTR(buf);
+	buf_page_t*	bpage;
+	buf_block_t*	block;
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_a(!ut_align_offset(buf, UNIV_PAGE_SIZE));
+
+	mutex_enter(&zip_hash_mutex);
+
+	/* Find the control block whose frame is buf. */
+	HASH_SEARCH(hash, buf_pool->zip_hash, fold, buf_page_t*, bpage,
+		    ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY
+			  && bpage->in_zip_hash && !bpage->in_page_hash),
+		    ((buf_block_t*) bpage)->frame == buf);
+	ut_a(bpage);
+	ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY);
+	ut_ad(!bpage->in_page_hash);
+	ut_ad(bpage->in_zip_hash);
+	ut_d(bpage->in_zip_hash = FALSE);
+	HASH_DELETE(buf_page_t, hash, buf_pool->zip_hash, fold, bpage);
+
+	mutex_exit(&zip_hash_mutex);
+
+	ut_d(memset(buf, 0, UNIV_PAGE_SIZE));
+	UNIV_MEM_INVALID(buf, UNIV_PAGE_SIZE);
+
+	block = (buf_block_t*) bpage;
+	mutex_enter(&block->mutex);
+	buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+	mutex_exit(&block->mutex);
+
+	/* Debug-only count of frames owned by the buddy system. */
+	ut_ad(buf_buddy_n_frames > 0);
+	ut_d(buf_buddy_n_frames--);
+}
+
+/**********************************************************************//**
+Allocate a buffer block to the buddy allocator.  The block is switched
+from BUF_BLOCK_READY_FOR_USE to BUF_BLOCK_MEMORY and inserted into
+buf_pool->zip_hash (under zip_hash_mutex).  The caller must not hold
+buf_pool_zip_mutex (asserted). */
+static
+void
+buf_buddy_block_register(
+/*=====================*/
+	buf_block_t*	block)	/*!< in: buffer frame to allocate */
+{
+	const ulint	fold = BUF_POOL_ZIP_FOLD(block);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE);
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	ut_a(block->frame);
+	ut_a(!ut_align_offset(block->frame, UNIV_PAGE_SIZE));
+
+	ut_ad(!block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_d(block->page.in_zip_hash = TRUE);
+
+	mutex_enter(&zip_hash_mutex);
+	HASH_INSERT(buf_page_t, hash, buf_pool->zip_hash, fold, &block->page);
+	mutex_exit(&zip_hash_mutex);
+
+	/* Debug-only count of frames owned by the buddy system. */
+	ut_d(buf_buddy_n_frames++);
+}
+
+/**********************************************************************//**
+Allocate a block from a bigger object.  The object of size class j is
+halved repeatedly; each upper half is put on the free list of its size
+class, and the remaining lowest part of size class i is returned.
+buf_buddy_add_to_free() asserts that zip_free_mutex is held, so the
+caller must hold it whenever j > i.
+@return	allocated block */
+static
+void*
+buf_buddy_alloc_from(
+/*=================*/
+	void*		buf,	/*!< in: a block that is free to use */
+	ulint		i,	/*!< in: index of buf_pool->zip_free[] */
+	ulint		j)	/*!< in: size of buf as an index
+				of buf_pool->zip_free[] */
+{
+	ulint	offs	= BUF_BUDDY_LOW << j;
+	ut_ad(j <= BUF_BUDDY_SIZES);
+	ut_ad(j >= i);
+	ut_ad(!ut_align_offset(buf, offs));
+
+	/* Add the unused parts of the block to the free lists. */
+	while (j > i) {
+		buf_page_t*	bpage;
+
+		/* After halving, offs is both the size of a block of
+		size class j and the offset of the upper half. */
+		offs >>= 1;
+		j--;
+
+		bpage = (buf_page_t*) ((byte*) buf + offs);
+		ut_d(memset(bpage, j, BUF_BUDDY_LOW << j));
+		bpage->state = BUF_BLOCK_ZIP_FREE;
+#ifndef UNIV_DEBUG_VALGRIND
+		/* Valgrind would complain about accessing free memory. */
+		/* NOTE(review): this validates zip_free[i] although the
+		block is added to zip_free[j] below -- confirm whether
+		zip_free[j] was intended. */
+		ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+				      ut_ad(buf_page_get_state(
+						    ut_list_node_313)
+					    == BUF_BLOCK_ZIP_FREE)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+		buf_buddy_add_to_free(bpage, j);
+	}
+
+	return(buf);
+}
+
+/**********************************************************************//**
+Allocate a block.  The calling thread must not hold buf_pool_zip_mutex
+(asserted).  zip_free_mutex is acquired and released inside this function.
+If nothing can be allocated from the buddy free lists or from
+buf_LRU_get_free_only() and lru != NULL, LRU_list_mutex (and
+page_hash_latch, if have_page_hash_mutex) are released around the call to
+buf_LRU_get_free_block() and reacquired afterwards.
+NOTE(review): presumably the caller holds LRU_list_mutex, and holds
+page_hash_latch in X mode when have_page_hash_mutex is TRUE -- confirm
+against the callers.
+@return	allocated block, possibly NULL if lru==NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+	ulint	i,	/*!< in: index of buf_pool->zip_free[],
+			or BUF_BUDDY_SIZES */
+	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
+			TRUE if storage was allocated from the LRU list
+			and LRU_list_mutex was temporarily released,
+			or NULL if the LRU list should not be used */
+	ibool	have_page_hash_mutex)	/*!< in: if TRUE, page_hash_latch
+			is X-unlocked and X-locked again together
+			with LRU_list_mutex on the LRU path */
+{
+	buf_block_t*	block;
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+
+	if (i < BUF_BUDDY_SIZES) {
+		/* Try to allocate from the buddy system. */
+		mutex_enter(&zip_free_mutex);
+		block = buf_buddy_alloc_zip(i);
+
+		if (block) {
+
+			/* zip_free_mutex is still held here; it is
+			released at func_exit. */
+			goto func_exit;
+		}
+
+		mutex_exit(&zip_free_mutex);
+	}
+
+	/* Try allocating from the buf_pool->free list. */
+	block = buf_LRU_get_free_only();
+
+	if (block) {
+
+		goto alloc_big;
+	}
+
+	if (!lru) {
+
+		return(NULL);
+	}
+
+	/* Try replacing an uncompressed page in the buffer pool. */
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+	if (have_page_hash_mutex) {
+		rw_lock_x_unlock(&page_hash_latch);
+	}
+	block = buf_LRU_get_free_block(0);
+	*lru = TRUE;
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	if (have_page_hash_mutex) {
+		rw_lock_x_lock(&page_hash_latch);
+	}
+
+alloc_big:
+	buf_buddy_block_register(block);
+
+	/* Carve the requested size class out of the whole frame.  From
+	here on, block holds the address of the allocated buddy block
+	rather than a buf_block_t. */
+	mutex_enter(&zip_free_mutex);
+	block = buf_buddy_alloc_from(block->frame, i, BUF_BUDDY_SIZES);
+
+func_exit:
+	buf_buddy_stat[i].used++;
+	mutex_exit(&zip_free_mutex);
+
+	return(block);
+}
+
+/**********************************************************************//**
+Try to relocate the control block of a compressed page.  Only blocks in
+state BUF_BLOCK_ZIP_PAGE are moved; every other state is refused.
+buf_pool_zip_mutex and zip_free_mutex are acquired here and released
+again before returning.
+@return	TRUE if relocated */
+static
+ibool
+buf_buddy_relocate_block(
+/*=====================*/
+	buf_page_t*	bpage,	/*!< in: block to relocate */
+	buf_page_t*	dpage)	/*!< in: free block to relocate to */
+{
+	buf_page_t*	prev;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_FILE_PAGE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		/* ut_error; */ /* optimistic */
+	case BUF_BLOCK_ZIP_DIRTY:
+		/* Cannot relocate dirty pages. */
+		return(FALSE);
+
+	case BUF_BLOCK_ZIP_PAGE:
+		break;
+	}
+
+	mutex_enter(&buf_pool_zip_mutex);
+	mutex_enter(&zip_free_mutex);
+
+	/* Give up if the block cannot be relocated, or if it is not
+	the block that the page hash holds for its (space, offset). */
+	if (!buf_page_can_relocate(bpage)
+	    || bpage != buf_page_hash_get(bpage->space, bpage->offset)) {
+		mutex_exit(&buf_pool_zip_mutex);
+		mutex_exit(&zip_free_mutex);
+		return(FALSE);
+	}
+
+	buf_relocate(bpage, dpage);
+	ut_d(bpage->state = BUF_BLOCK_ZIP_FREE);
+
+	/* relocate buf_pool->zip_clean: take dpage out of the list and
+	put it back at the same position, after its predecessor. */
+	mutex_enter(&flush_list_mutex);
+	prev = UT_LIST_GET_PREV(zip_list, dpage);
+	UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, dpage);
+
+	if (prev) {
+		UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean,
+				     prev, dpage);
+	} else {
+		UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, dpage);
+	}
+	mutex_exit(&flush_list_mutex);
+
+	UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+	mutex_exit(&buf_pool_zip_mutex);
+	mutex_exit(&zip_free_mutex);
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Try to relocate a block.  The caller must hold zip_free_mutex and must not
+hold buf_pool_zip_mutex (both asserted).  zip_free_mutex may be released
+and reacquired inside this function; it is held again on every return
+path.  If have_page_hash_mutex is FALSE, LRU_list_mutex and
+page_hash_latch (X mode) are acquired here and released exactly once
+before returning.
+@return	TRUE if relocated */
+static
+ibool
+buf_buddy_relocate(
+/*===============*/
+	void*	src,	/*!< in: block to relocate */
+	void*	dst,	/*!< in: free block to relocate to */
+	ulint	i,	/*!< in: index of buf_pool->zip_free[] */
+	ibool	have_page_hash_mutex)	/*!< in: if FALSE, this function
+			acquires and releases LRU_list_mutex and
+			page_hash_latch itself */
+{
+	buf_page_t*	bpage;
+	const ulint	size	= BUF_BUDDY_LOW << i;
+	ullint		usec	= ut_time_us(NULL);
+	ulint		space;
+	ulint		page_no;
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&zip_free_mutex));
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(!ut_align_offset(src, size));
+	ut_ad(!ut_align_offset(dst, size));
+	UNIV_MEM_ASSERT_W(dst, size);
+
+	/* We assume that all memory from buf_buddy_alloc()
+	is used for either compressed pages or buf_page_t
+	objects covering compressed pages. */
+
+	/* We look inside the allocated objects returned by
+	buf_buddy_alloc() and assume that anything of
+	PAGE_ZIP_MIN_SIZE or larger is a compressed page that contains
+	a valid space_id and page_no in the page header.  Should the
+	fields be invalid, we will be unable to relocate the block.
+	We also assume that anything that fits sizeof(buf_page_t)
+	actually is a properly initialized buf_page_t object. */
+
+	if (size >= PAGE_ZIP_MIN_SIZE) {
+		/* This is a compressed page. */
+		mutex_t*	mutex;
+
+		if (!have_page_hash_mutex) {
+			mutex_exit(&zip_free_mutex);
+			mutex_enter(&LRU_list_mutex);
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		/* The src block may be split into smaller blocks,
+		some of which may be free.  Thus, the
+		mach_read_from_4() calls below may attempt to read
+		from free memory.  The memory is "owned" by the buddy
+		allocator (and it has been allocated from the buffer
+		pool), so there is nothing wrong about this.  The
+		mach_read_from_4() calls here will only trigger bogus
+		Valgrind memcheck warnings in UNIV_DEBUG_VALGRIND builds. */
+		space	= mach_read_from_4(
+			(const byte*) src + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+		page_no	= mach_read_from_4(
+			(const byte*) src + FIL_PAGE_OFFSET);
+		/* Suppress Valgrind warnings about conditional jump
+		on uninitialized value. */
+		UNIV_MEM_VALID(&space, sizeof space);
+		UNIV_MEM_VALID(&page_no, sizeof page_no);
+		bpage = buf_page_hash_get(space, page_no);
+
+		if (!bpage || bpage->zip.data != src) {
+			/* The block has probably been freshly
+			allocated by buf_LRU_get_free_block() but not
+			added to buf_pool->page_hash yet.  Obviously,
+			it cannot be relocated. */
+
+			if (!have_page_hash_mutex) {
+				mutex_enter(&zip_free_mutex);
+				mutex_exit(&LRU_list_mutex);
+				rw_lock_x_unlock(&page_hash_latch);
+			}
+			return(FALSE);
+		}
+
+		if (page_zip_get_size(&bpage->zip) != size) {
+			/* The block is of different size.  We would
+			have to relocate all blocks covered by src.
+			For the sake of simplicity, give up. */
+			ut_ad(page_zip_get_size(&bpage->zip) < size);
+
+			if (!have_page_hash_mutex) {
+				mutex_enter(&zip_free_mutex);
+				mutex_exit(&LRU_list_mutex);
+				rw_lock_x_unlock(&page_hash_latch);
+			}
+			return(FALSE);
+		}
+
+		/* To keep latch order: zip_free_mutex must not be held
+		while the block mutex is acquired.  In the
+		!have_page_hash_mutex case it was already released
+		above. */
+		if (have_page_hash_mutex)
+			mutex_exit(&zip_free_mutex);
+
+		/* The block must have been allocated, but it may
+		contain uninitialized data. */
+		UNIV_MEM_ASSERT_W(src, size);
+
+		mutex = buf_page_get_mutex_enter(bpage);
+
+		mutex_enter(&zip_free_mutex);
+
+		if (mutex && buf_page_can_relocate(bpage)) {
+			/* Relocate the compressed page. */
+			ut_a(bpage->zip.data == src);
+			memcpy(dst, src, size);
+			bpage->zip.data = dst;
+			mutex_exit(mutex);
+success:
+			/* Common exit for both relocation paths.  On
+			entry to this label zip_free_mutex is held and,
+			if !have_page_hash_mutex, LRU_list_mutex and
+			page_hash_latch are still held as well. */
+			UNIV_MEM_INVALID(src, size);
+			{
+				buf_buddy_stat_t*	buddy_stat
+					= &buf_buddy_stat[i];
+				buddy_stat->relocated++;
+				buddy_stat->relocated_usec
+					+= ut_time_us(NULL) - usec;
+			}
+
+			if (!have_page_hash_mutex) {
+				mutex_exit(&LRU_list_mutex);
+				rw_lock_x_unlock(&page_hash_latch);
+			}
+			return(TRUE);
+		}
+
+		if (!have_page_hash_mutex) {
+			mutex_exit(&LRU_list_mutex);
+			rw_lock_x_unlock(&page_hash_latch);
+		}
+
+		if (mutex) {
+			mutex_exit(mutex);
+		}
+	} else if (i == buf_buddy_get_slot(sizeof(buf_page_t))) {
+		/* This must be a buf_page_t object. */
+#if UNIV_WORD_SIZE == 4
+		/* On 32-bit systems, there is no padding in
+		buf_page_t.  On other systems, Valgrind could complain
+		about uninitialized pad bytes. */
+		UNIV_MEM_ASSERT_RW(src, size);
+#endif
+
+		/* buf_buddy_relocate_block() acquires zip_free_mutex
+		itself, so it must not be held across the call. */
+		mutex_exit(&zip_free_mutex);
+
+		if (!have_page_hash_mutex) {
+			mutex_enter(&LRU_list_mutex);
+			rw_lock_x_lock(&page_hash_latch);
+		}
+
+		if (buf_buddy_relocate_block(src, dst)) {
+			mutex_enter(&zip_free_mutex);
+
+			/* Do not release LRU_list_mutex and
+			page_hash_latch here: the code at the success
+			label releases them when !have_page_hash_mutex,
+			and releasing them here as well would unlock
+			them twice. */
+			goto success;
+		}
+
+		mutex_enter(&zip_free_mutex);
+
+		if (!have_page_hash_mutex) {
+			mutex_exit(&LRU_list_mutex);
+			rw_lock_x_unlock(&page_hash_latch);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Deallocate a block.  The block is recombined with its buddy for as long as
+the buddy is free or can be made free by relocating it; the resulting
+block is either put on a buddy free list or, once it has grown to a whole
+frame (i == BUF_BUDDY_SIZES), handed to buf_buddy_block_free().
+The caller must hold zip_free_mutex and must not hold buf_pool_zip_mutex
+(both asserted).  zip_free_mutex is released and reacquired around
+buf_buddy_block_free(), and buf_buddy_relocate() may also release and
+reacquire it. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+	void*	buf,	/*!< in: block to be freed, must not be
+			pointed to by the buffer pool */
+	ulint	i,	/*!< in: index of buf_pool->zip_free[],
+			or BUF_BUDDY_SIZES */
+	ibool	have_page_hash_mutex)	/*!< in: passed through to
+			buf_buddy_block_free() and buf_buddy_relocate() */
+{
+	buf_page_t*	bpage;
+	buf_page_t*	buddy;
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&zip_free_mutex));
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	ut_ad(buf_buddy_stat[i].used > 0);
+
+	buf_buddy_stat[i].used--;
+recombine:
+	/* Each pass through this label handles buf as a block of size
+	class i; i grows by one every time two buddies are merged. */
+	UNIV_MEM_ASSERT_AND_ALLOC(buf, BUF_BUDDY_LOW << i);
+	ut_d(((buf_page_t*) buf)->state = BUF_BLOCK_ZIP_FREE);
+
+	if (i == BUF_BUDDY_SIZES) {
+		/* The block is a whole frame: return it to the buffer
+		pool, without holding zip_free_mutex during the call. */
+		mutex_exit(&zip_free_mutex);
+		buf_buddy_block_free(buf, have_page_hash_mutex);
+		mutex_enter(&zip_free_mutex);
+		return;
+	}
+
+	ut_ad(i < BUF_BUDDY_SIZES);
+	ut_ad(buf == ut_align_down(buf, BUF_BUDDY_LOW << i));
+	ut_ad(!buf_pool_contains_zip(buf));
+
+	/* Try to combine adjacent blocks. */
+
+	buddy = (buf_page_t*) buf_buddy_get(((byte*) buf), BUF_BUDDY_LOW << i);
+
+#ifndef UNIV_DEBUG_VALGRIND
+	/* Valgrind would complain about accessing free memory. */
+
+	if (buddy->state != BUF_BLOCK_ZIP_FREE) {
+
+		goto buddy_nonfree;
+	}
+
+	/* The field buddy->state can only be trusted for free blocks.
+	If buddy->state == BUF_BLOCK_ZIP_FREE, the block is free if
+	it is in the free list. */
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+	/* Search the free list of this size class for the buddy. */
+	for (bpage = UT_LIST_GET_FIRST(buf_pool->zip_free[i]); bpage; ) {
+		UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+		ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_FREE);
+
+		if (bpage == buddy) {
+buddy_free:
+			/* The buddy is free: recombine */
+			buf_buddy_remove_from_free(bpage, i);
+buddy_free2:
+			/* Entered directly when the buddy has already
+			been taken off (or never was on) the free list. */
+			ut_ad(buf_page_get_state(buddy) == BUF_BLOCK_ZIP_FREE);
+			ut_ad(!buf_pool_contains_zip(buddy));
+			i++;
+			buf = ut_align_down(buf, BUF_BUDDY_LOW << i);
+
+			goto recombine;
+		}
+
+		ut_a(bpage != buf);
+
+		{
+			buf_page_t*	next = UT_LIST_GET_NEXT(zip_list, bpage);
+			UNIV_MEM_ASSERT_AND_FREE(bpage, BUF_BUDDY_LOW << i);
+			bpage = next;
+		}
+	}
+
+#ifndef UNIV_DEBUG_VALGRIND
+buddy_nonfree:
+	/* Valgrind would complain about accessing free memory. */
+	ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+			      ut_ad(buf_page_get_state(ut_list_node_313)
+				    == BUF_BLOCK_ZIP_FREE)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+	/* The buddy is not free. Is there a free block of this size? */
+	bpage = UT_LIST_GET_LAST(buf_pool->zip_free[i]);
+
+	if (bpage) {
+		/* Remove the block from the free list, because a successful
+		buf_buddy_relocate() will overwrite bpage->list. */
+
+		UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+		buf_buddy_remove_from_free(bpage, i);
+
+		/* Try to relocate the buddy of buf to the free block. */
+		if (buf_buddy_relocate(buddy, bpage, i, have_page_hash_mutex)) {
+
+			ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
+			goto buddy_free2;
+		}
+
+		/* The relocation failed: put the free block back. */
+		buf_buddy_add_to_free(bpage, i);
+
+		/* Try to relocate the buddy of the free block to buf. */
+		buddy = (buf_page_t*) buf_buddy_get(((byte*) bpage),
+						    BUF_BUDDY_LOW << i);
+
+#ifndef UNIV_DEBUG_VALGRIND
+		/* Valgrind would complain about accessing free memory. */
+
+		/* The buddy must not be (completely) free, because we
+		always recombine adjacent free blocks.
+
+		(Parts of the buddy can be free in
+		buf_pool->zip_free[j] with j < i.) */
+		ut_d(UT_LIST_VALIDATE(zip_list, buf_page_t, buf_pool->zip_free[i],
+				      ut_ad(buf_page_get_state(
+						    ut_list_node_313)
+					    == BUF_BLOCK_ZIP_FREE
+					    && ut_list_node_313 != buddy)));
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+		if (buf_buddy_relocate(buddy, buf, i, have_page_hash_mutex)) {
+
+			/* The old buf is now in use; continue with the
+			free block, whose buddy has just been vacated. */
+			buf = bpage;
+			UNIV_MEM_VALID(bpage, BUF_BUDDY_LOW << i);
+			ut_d(buddy->state = BUF_BLOCK_ZIP_FREE);
+			goto buddy_free;
+		}
+	}
+
+	/* Free the block to the buddy list. */
+	bpage = buf;
+#ifdef UNIV_DEBUG
+	if (i < buf_buddy_get_slot(PAGE_ZIP_MIN_SIZE)) {
+		/* This area has most likely been allocated for at
+		least one compressed-only block descriptor.  Check
+		that there are no live objects in the area.  This is
+		not a complete check: it may yield false positives as
+		well as false negatives.  Also, due to buddy blocks
+		being recombined, it is possible (although unlikely)
+		that this branch is never reached. */
+
+		char* c;
+
+# ifndef UNIV_DEBUG_VALGRIND
+		/* Valgrind would complain about accessing
+		uninitialized memory.  Besides, Valgrind performs a
+		more exhaustive check, at every memory access. */
+		const buf_page_t* b = buf;
+		const buf_page_t* const b_end = (buf_page_t*)
+			((char*) b + (BUF_BUDDY_LOW << i));
+
+		for (; b < b_end; b++) {
+			/* Avoid false positives (and cause false
+			negatives) by checking for b->space < 1000. */
+
+			if ((b->state == BUF_BLOCK_ZIP_PAGE
+			     || b->state == BUF_BLOCK_ZIP_DIRTY)
+			    && b->space > 0 && b->space < 1000) {
+				fprintf(stderr,
+					"buddy dirty %p %u (%u,%u) %p,%lu\n",
+					(void*) b,
+					b->state, b->space, b->offset,
+					buf, i);
+			}
+		}
+# endif /* !UNIV_DEBUG_VALGRIND */
+
+		/* Scramble the block.  This should make any pointers
+		invalid and trigger a segmentation violation.  Because
+		the scrambling can be reversed, it may be possible to
+		track down the object pointing to the freed data by
+		dereferencing the unscrambled bpage->LRU or
+		bpage->list pointers. */
+		for (c = (char*) buf + (BUF_BUDDY_LOW << i);
+		     c-- > (char*) buf; ) {
+			*c = ~*c ^ i;
+		}
+	} else {
+		/* Fill large blocks with a constant pattern. */
+		memset(bpage, i, BUF_BUDDY_LOW << i);
+	}
+#endif /* UNIV_DEBUG */
+	bpage->state = BUF_BLOCK_ZIP_FREE;
+	buf_buddy_add_to_free(bpage, i);
+}
diff --git a/storage/xtradb/buf/buf0buf.c b/storage/xtradb/buf/buf0buf.c
new file mode 100644
index 00000000000..1c08bd6d0bf
--- /dev/null
+++ b/storage/xtradb/buf/buf0buf.c
@@ -0,0 +1,4901 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0buf.c
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0buf.h"
+
+#ifdef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#include "mem0mem.h"
+#include "btr0btr.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0buddy.h"
+#include "lock0lock.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "log0log.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0zip.h"
+#include "trx0trx.h"
+#include "srv0start.h"
+#include "que0que.h"
+#include "read0read.h"
+#include "row0row.h"
+#include "ha_prototypes.h"
+
+/* prototypes for new functions added to ha_innodb.cc */
+trx_t* innobase_get_trx();
+
+/* Slow-log statistics: marks the page of block in trx's DPAH_SIZE-byte bitmap
+and bumps trx->distinct_page_access the first time the page's bit gets set. */
+inline void _increment_page_get_statistics(buf_block_t* block, trx_t* trx)
+{
+	ulint           block_hash;
+	ulint           block_hash_byte;
+	byte            block_hash_offset;
+
+	ut_ad(block);
+	if (!innobase_get_slow_log() || !trx || !trx->take_stats)
+		return;
+
+	if (!trx->distinct_page_access_hash) {
+		trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+		memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+	}
+
+	block_hash = ut_hash_ulint((block->page.space << 20) + block->page.space +
+					block->page.offset, DPAH_SIZE << 3);
+	block_hash_byte = block_hash >> 3;
+	block_hash_offset = (byte) block_hash & 0x07;	/* bit index, always 0..7 */
+	if (block_hash_byte >= DPAH_SIZE) {
+		fprintf(stderr, "!!! block_hash_byte = %lu  block_hash_offset = %d !!!\n", (ulong) block_hash_byte, block_hash_offset);
+		return;	/* report only: never read or write past the bitmap */
+	}
+	if ((trx->distinct_page_access_hash[block_hash_byte] & ((byte) 0x01 << block_hash_offset)) == 0)
+		trx->distinct_page_access++;
+	trx->distinct_page_access_hash[block_hash_byte] |= (byte) 0x01 << block_hash_offset;
+}
+
+/*
+		IMPLEMENTATION OF THE BUFFER POOL
+		=================================
+
+Performance improvement:
+------------------------
+Thread scheduling in NT may be so slow that the OS wait mechanism should
+not be used even in waiting for disk reads to complete.
+Rather, we should put waiting query threads to the queue of
+waiting jobs, and let the OS thread do something useful while the i/o
+is processed. In this way we could remove most OS thread switches in
+an i/o-intensive benchmark like TPC-C.
+
+A possibility is to put a user space thread library between the database
+and NT. User space thread libraries might be very fast.
+
+SQL Server 7.0 can be configured to use 'fibers' which are lightweight
+threads in NT. These should be studied.
+
+		Buffer frames and blocks
+		------------------------
+Following the terminology of Gray and Reuter, we call the memory
+blocks where file pages are loaded buffer frames. For each buffer
+frame there is a control block, or shortly, a block, in the buffer
+control array. The control info which does not need to be stored
+in the file along with the file page, resides in the control block.
+
+		Buffer pool struct
+		------------------
+The buffer buf_pool contains a single mutex which protects all the
+control data structures of the buf_pool. The content of a buffer frame is
+protected by a separate read-write lock in its control block, though.
+These locks can be locked and unlocked without owning the buf_pool mutex.
+The OS events in the buf_pool struct can be waited for without owning the
+buf_pool mutex.
+
+The buf_pool mutex is a hot-spot in main memory, causing a lot of
+memory bus traffic on multiprocessor systems when processors
+alternately access the mutex. On our Pentium, the mutex is accessed
+maybe every 10 microseconds. We gave up the solution to have mutexes
+for each control block, for instance, because it seemed to be
+complicated.
+
+A solution to reduce mutex contention of the buf_pool mutex is to
+create a separate mutex for the page hash table. On Pentium,
+accessing the hash table takes 2 microseconds, about half
+of the total buf_pool mutex hold time.
+
+		Control blocks
+		--------------
+
+The control block contains, for instance, the bufferfix count
+which is incremented when a thread wants a file page to be fixed
+in a buffer frame. The bufferfix operation does not lock the
+contents of the frame, however. For this purpose, the control
+block contains a read-write lock.
+
+The buffer frames have to be aligned so that the start memory
+address of a frame is divisible by the universal page size, which
+is a power of two.
+
+We intend to make the buffer buf_pool size on-line reconfigurable,
+that is, the buf_pool size can be changed without closing the database.
+Then the database administrator may adjust it to be bigger
+at night, for example. The control block array must
+contain enough control blocks for the maximum buffer buf_pool size
+which is used in the particular database.
+If the buf_pool size is cut, we exploit the virtual memory mechanism of
+the OS, and just refrain from using frames at high addresses. Then the OS
+can swap them to disk.
+
+The control blocks containing file pages are put to a hash table
+according to the file address of the page.
+We could speed up the access to an individual page by using
+"pointer swizzling": we could replace the page references on
+non-leaf index pages by direct pointers to the page, if it exists
+in the buf_pool. We could make a separate hash table where we could
+chain all the page references in non-leaf pages residing in the buf_pool,
+using the page reference as the hash key,
+and at the time of reading of a page update the pointers accordingly.
+Drawbacks of this solution are added complexity and,
+possibly, extra space required on non-leaf pages for memory pointers.
+A simpler solution is just to speed up the hash table mechanism
+in the database, using tables whose size is a power of 2.
+
+		Lists of blocks
+		---------------
+
+There are several lists of control blocks.
+
+The free list (buf_pool->free) contains blocks which are currently not
+used.
+
+The common LRU list contains all the blocks holding a file page
+except those for which the bufferfix count is non-zero.
+The pages are in the LRU list roughly in the order of the last
+access to the page, so that the oldest pages are at the end of the
+list. We also keep a pointer to near the end of the LRU list,
+which we can use when we want to artificially age a page in the
+buf_pool. This is used if we know that some page is not needed
+again for some time: we insert the block right after the pointer,
+causing it to be replaced sooner than would normally be the case.
+Currently this aging mechanism is used for read-ahead mechanism
+of pages, and it can also be used when there is a scan of a full
+table which cannot fit in the memory. Putting the pages near the end
+of the LRU list, we make sure that most of the buf_pool stays in the
+main memory, undisturbed.
+
+The unzip_LRU list contains a subset of the common LRU list.  The
+blocks on the unzip_LRU list hold a compressed file page and the
+corresponding uncompressed page frame.  A block is in unzip_LRU if and
+only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
+holds.  The blocks in unzip_LRU will be in same order as they are in
+the common LRU list.  That is, each manipulation of the common LRU
+list will result in the same manipulation of the unzip_LRU list.
+
+The chain of modified blocks (buf_pool->flush_list) contains the blocks
+holding file pages that have been modified in the memory
+but not written to disk yet. The block with the oldest modification
+which has not yet been written to disk is at the end of the chain.
+
+The chain of unmodified compressed blocks (buf_pool->zip_clean)
+contains the control blocks (buf_page_t) of those compressed pages
+that are not in buf_pool->flush_list and for which no uncompressed
+page has been allocated in the buffer pool.  The control blocks for
+uncompressed pages are accessible via buf_block_t objects that are
+reachable via buf_pool->chunks[].
+
+The chains of free memory blocks (buf_pool->zip_free[]) are used by
+the buddy allocator (buf0buddy.c) to keep track of currently unused
+memory blocks of size sizeof(buf_page_t)..UNIV_PAGE_SIZE / 2.  These
+blocks are inside the UNIV_PAGE_SIZE-sized memory blocks of type
+BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
+pool.  The buddy allocator is solely used for allocating control
+blocks for compressed pages (buf_page_t) and compressed page frames.
+
+		Loading a file page
+		-------------------
+
+First, a victim block for replacement has to be found in the
+buf_pool. It is taken from the free list or searched for from the
+end of the LRU-list. An exclusive lock is reserved for the frame,
+the io_fix field is set in the block fixing the block in buf_pool,
+and the io-operation for loading the page is queued. The io-handler thread
+releases the X-lock on the frame and resets the io_fix field
+when the io operation completes.
+
+A thread may request the above operation using the function
+buf_page_get(). It may then continue to request a lock on the frame.
+The lock is granted when the io-handler releases the x-lock.
+
+		Read-ahead
+		----------
+
+The read-ahead mechanism is intended to be intelligent and
+isolated from the semantically higher levels of the database
+index management. From the higher level we only need the
+information if a file page has a natural successor or
+predecessor page. On the leaf level of a B-tree index,
+these are the next and previous pages in the natural
+order of the pages.
+
+Let us first explain the read-ahead mechanism when the leaves
+of a B-tree are scanned in an ascending or descending order.
+When a read page is the first time referenced in the buf_pool,
+the buffer manager checks if it is at the border of a so-called
+linear read-ahead area. The tablespace is divided into these
+areas of size 64 blocks, for example. So if the page is at the
+border of such an area, the read-ahead mechanism checks if
+all the other blocks in the area have been accessed in an
+ascending or descending order. If this is the case, the system
+looks at the natural successor or predecessor of the page,
+checks if that is at the border of another area, and in this case
+issues read-requests for all the pages in that area. Maybe
+we could relax the condition that all the pages in the area
+have to be accessed: if data is deleted from a table, there may
+appear holes of unused pages in the area.
+
+A different read-ahead mechanism is used when there appears
+to be a random access pattern to a file.
+If a new page is referenced in the buf_pool, and several pages
+of its random access area (for instance, 32 consecutive pages
+in a tablespace) have recently been referenced, we may predict
+that the whole area may be needed in the near future, and issue
+the read requests for the whole area.
+*/
+
+#ifndef UNIV_HOTBACKUP
+/** Value in microseconds */
+static const int WAIT_FOR_READ	= 5000;
+/** Number of attempts made to read in a page in the buffer pool */
+static const ulint BUF_PAGE_READ_MAX_RETRIES = 100;
+
+/** The buffer buf_pool of the database */
+UNIV_INTERN buf_pool_t*	buf_pool = NULL;
+
+/** mutex protecting the buffer pool struct and control blocks, except the
+read-write lock in them */
+UNIV_INTERN mutex_t		buf_pool_mutex;
+UNIV_INTERN mutex_t		LRU_list_mutex;
+UNIV_INTERN mutex_t		flush_list_mutex;
+UNIV_INTERN rw_lock_t		page_hash_latch;
+UNIV_INTERN mutex_t		free_list_mutex;
+UNIV_INTERN mutex_t		zip_free_mutex;
+UNIV_INTERN mutex_t		zip_hash_mutex;
+/** mutex protecting the control blocks of compressed-only pages
+(of type buf_page_t, not buf_block_t) */
+UNIV_INTERN mutex_t		buf_pool_zip_mutex;
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+static ulint	buf_dbg_counter	= 0; /*!< This is used to insert validation
+					operations in execution in the
+					debug version */
+/** Flag to forbid the release of the buffer pool mutex.
+Protected by buf_pool_mutex. */
+UNIV_INTERN ulint		buf_pool_mutex_exit_forbidden = 0;
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/** If this is set TRUE, the program prints info whenever
+read-ahead or flush occurs */
+UNIV_INTERN ibool		buf_debug_prints = FALSE;
+#endif /* UNIV_DEBUG */
+
+/* Header stored at the start of the buffer pool shared memory segment */
+typedef	struct buf_shm_info_struct	buf_shm_info_t;
+
+struct buf_shm_info_struct {
+	char	head_str[8];	/* BUF_SHM_INFO_HEAD; written and compared over 8 bytes */
+	ulint	binary_id;	/* id derived from function addresses of this binary */
+	ibool	is_new;		/* TRUE until the initialization is finished */
+	ibool	clean;		/* set to TRUE when the segment is freed (clean shutdown) */
+	ibool	reusable;	/* set to TRUE when validation is finished */
+	ulint	buf_pool_size;	/* backup value of srv_buf_pool_size */
+	ulint	page_size;	/* backup value of srv_page_size */
+	ulint	frame_offset;	/* offset of the first frame based on chunk->mem */
+	ulint	zip_hash_offset; /* chunk->mem_size - zip_hash_mem_size at creation */
+	ulint	zip_hash_n;	/* zip_hash_n computed in buf_chunk_init() at creation */
+
+	ulint	checksum;
+
+	buf_pool_t	buf_pool_backup;
+	buf_chunk_t	chunk_backup;
+
+	ib_uint64_t	dummy;
+};
+
+#define BUF_SHM_INFO_HEAD "XTRA_SHM"
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Computes the new-formula checksum which is stored to the page when it is
+written to a file.  The result is masked to 32 bits; note that the same
+value must be calculated on 32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint	fold;
+
+	/* Two ranges of the page are folded.  Skipped are:
+	- FIL_PAGE_SPACE_OR_CHKSUM, where the checksum itself is stored;
+	- FIL_PAGE_FILE_FLUSH_LSN and, in versions <= 4.1.x,
+	..._ARCH_LOG_NO, which are written outside the buffer pool to
+	the first pages of data files;
+	- the last 8 bytes of the page, because the old formula checksum
+	is stored there. */
+
+	fold = ut_fold_binary(page + FIL_PAGE_OFFSET,
+			      FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET);
+
+	fold += ut_fold_binary(page + FIL_PAGE_DATA,
+			       UNIV_PAGE_SIZE - FIL_PAGE_DATA
+			       - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	return(fold & 0xFFFFFFFFUL);
+}
+
+/** Variant of buf_calc_page_new_checksum() that folds the page from
+FIL_PAGE_DATA_ALIGN_32 onwards with ut_fold_binary_32(). @return checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum_32(
+/*==========================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	ulint	fold;
+
+	fold = ut_fold_binary(page + FIL_PAGE_OFFSET,
+			      FIL_PAGE_FILE_FLUSH_LSN - FIL_PAGE_OFFSET);
+	fold += ut_fold_binary(page + FIL_PAGE_DATA,
+			       FIL_PAGE_DATA_ALIGN_32 - FIL_PAGE_DATA);
+	fold += ut_fold_binary_32(page + FIL_PAGE_DATA_ALIGN_32,
+				  UNIV_PAGE_SIZE - FIL_PAGE_DATA_ALIGN_32
+				  - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	return(fold & 0xFFFFFFFFUL);
+}
+
+/********************************************************************//**
+Computes the old-formula page checksum.  In versions < 4.0.14 and < 4.1.1
+there was a bug that the checksum only looked at the first few bytes of
+the page; this function reproduces that calculation by folding the first
+FIL_PAGE_FILE_FLUSH_LSN bytes.
+NOTE: the new formula checksum must already be stored in
+FIL_PAGE_SPACE_OR_CHKSUM before this old checksum is calculated and
+stored, because that field is part of the input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	page)	/*!< in: buffer page */
+{
+	const ulint	mask = 0xFFFFFFFFUL;
+
+	/* The folded range starts at offset 0 of the page and ends
+	right before FIL_PAGE_FILE_FLUSH_LSN. */
+
+	return(ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN) & mask);
+}
+
+/********************************************************************//**
+Decides whether a page read from disk looks corrupted.  An uncompressed
+page is checked for matching log sequence number fields at its start and
+end; if srv_use_checksums is set, the stored checksums are checked too.
+@return	TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+	const byte*	read_buf,	/*!< in: a database page */
+	ulint		zip_size)	/*!< in: size of compressed page;
+					0 for uncompressed pages */
+{
+	ulint		stored_new;
+	ulint		stored_old;
+
+	if (UNIV_LIKELY(!zip_size)) {
+		const byte*	lsn_head = read_buf + FIL_PAGE_LSN + 4;
+		const byte*	lsn_tail = read_buf + UNIV_PAGE_SIZE
+			- FIL_PAGE_END_LSN_OLD_CHKSUM + 4;
+
+		if (memcmp(lsn_head, lsn_tail, 4)) {
+			/* Stored log sequence numbers at the start
+			and the end of page do not match */
+
+			return(TRUE);
+		}
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (recv_lsn_checks_on) {
+		ib_uint64_t	current_lsn;
+
+		/* A page lsn ahead of the system lsn is only reported
+		here; it does not make this function return TRUE. */
+
+		if (log_peek_lsn(&current_lsn)
+		    && mach_read_ull(read_buf + FIL_PAGE_LSN) > current_lsn) {
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: Error: page %lu log sequence number"
+				" %llu\n"
+				"InnoDB: is in the future! Current system "
+				"log sequence number %llu.\n"
+				"InnoDB: Your database may be corrupt or "
+				"you may have copied the InnoDB\n"
+				"InnoDB: tablespace but not the InnoDB "
+				"log files. See\n"
+				"InnoDB: " REFMAN "forcing-recovery.html\n"
+				"InnoDB: for more information.\n",
+				(ulong) mach_read_from_4(read_buf
+							 + FIL_PAGE_OFFSET),
+				mach_read_ull(read_buf + FIL_PAGE_LSN),
+				current_lsn);
+		}
+	}
+#endif
+
+	/* Without checksum validation there is nothing more to check.
+	Otherwise a stored value of BUF_NO_CHECKSUM_MAGIC, which might
+	have been written by InnoDB with checksums disabled, is accepted
+	in place of a matching checksum. */
+
+	if (UNIV_UNLIKELY(!srv_use_checksums)) {
+		return(FALSE);
+	}
+
+	stored_new = mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	if (UNIV_UNLIKELY(zip_size)) {
+		if (stored_new == BUF_NO_CHECKSUM_MAGIC) {
+			return(FALSE);
+		}
+
+		return(stored_new
+		       != page_zip_calc_checksum(read_buf, zip_size));
+	}
+
+	stored_old = mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+				      - FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+	/* The old-formula field is valid if it holds either of:
+
+	1. the 4 bytes at FIL_PAGE_LSN: very old versions of InnoDB only
+	stored 8 byte lsn to the start and the end of the page;
+
+	2. the old formula checksum, as stored by newer InnoDB
+	versions. */
+
+	if (stored_old != mach_read_from_4(read_buf + FIL_PAGE_LSN)
+	    && stored_old != BUF_NO_CHECKSUM_MAGIC
+	    && stored_old != buf_calc_page_old_checksum(read_buf)) {
+
+		return(TRUE);
+	}
+
+	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
+	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
+
+	if (stored_new == 0 || stored_new == BUF_NO_CHECKSUM_MAGIC) {
+		return(FALSE);
+	}
+
+	/* With srv_fast_checksum set, a match against either the 32-bit
+	variant or the regular new formula checksum is accepted. */
+
+	if (srv_fast_checksum
+	    && stored_new == buf_calc_page_new_checksum_32(read_buf)) {
+		return(FALSE);
+	}
+
+	return(stored_new != buf_calc_page_new_checksum(read_buf));
+}
+
+/********************************************************************//**
+Prints a page to stderr: a dump of the page in ascii and hex, followed by
+the calculated and stored checksums and the lsn, page number and space id
+fields.  When zip_size is nonzero, only a compressed-page summary follows
+the dump, unless the page type is FIL_PAGE_TYPE_XDES, in which case the
+page is treated as uncompressed. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+	const byte*	read_buf,	/*!< in: a database page */
+	ulint		zip_size)	/*!< in: compressed page size, or
+				0 for uncompressed pages */
+{
+#ifndef UNIV_HOTBACKUP
+	dict_index_t*	index;
+#endif /* !UNIV_HOTBACKUP */
+	ulint		checksum;
+	ulint		checksum_32;
+	ulint		old_checksum;
+	ulint		size	= zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr, "  InnoDB: Page dump in ascii and hex (%lu bytes):\n",
+		(ulong) size);
+	ut_print_buf(stderr, read_buf, size);
+	fputs("\nInnoDB: End of page dump\n", stderr);
+
+	if (zip_size) {
+		/* Print compressed page. */
+
+		switch (fil_page_get_type(read_buf)) {
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+			checksum = srv_use_checksums
+				? page_zip_calc_checksum(read_buf, zip_size)
+				: BUF_NO_CHECKSUM_MAGIC;
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Compressed BLOB page"
+				" checksum %lu, stored %lu\n"
+				"InnoDB: Page lsn %lu %lu\n"
+				"InnoDB: Page number (if stored"
+				" to page already) %lu,\n"
+				"InnoDB: space id (if stored"
+				" to page already) %lu\n",
+				(ulong) checksum,
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_LSN),
+				(ulong) mach_read_from_4(
+					read_buf + (FIL_PAGE_LSN + 4)),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_OFFSET),
+				(ulong) mach_read_from_4(
+					read_buf
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+			return;
+		default:
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: unknown page type %lu,"
+				" assuming FIL_PAGE_INDEX\n",
+				(ulong) fil_page_get_type(read_buf));
+			/* fall through */
+		case FIL_PAGE_INDEX:
+			checksum = srv_use_checksums
+				? page_zip_calc_checksum(read_buf, zip_size)
+				: BUF_NO_CHECKSUM_MAGIC;
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Compressed page checksum %lu,"
+				" stored %lu\n"
+				"InnoDB: Page lsn %lu %lu\n"
+				"InnoDB: Page number (if stored"
+				" to page already) %lu,\n"
+				"InnoDB: space id (if stored"
+				" to page already) %lu\n",
+				(ulong) checksum,
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_LSN),
+				(ulong) mach_read_from_4(
+					read_buf + (FIL_PAGE_LSN + 4)),
+				(ulong) mach_read_from_4(
+					read_buf + FIL_PAGE_OFFSET),
+				(ulong) mach_read_from_4(
+					read_buf
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+			return;
+		case FIL_PAGE_TYPE_XDES:
+			/* This is an uncompressed page. */
+			break;
+		}
+	}
+
+	checksum = srv_use_checksums
+		? buf_calc_page_new_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+	checksum_32 = srv_use_checksums
+		? buf_calc_page_new_checksum_32(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+	old_checksum = srv_use_checksums
+		? buf_calc_page_old_checksum(read_buf) : BUF_NO_CHECKSUM_MAGIC;
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Page checksum %lu (32bit_calc: %lu), prior-to-4.0.14-form"
+		" checksum %lu\n"
+		"InnoDB: stored checksum %lu, prior-to-4.0.14-form"
+		" stored checksum %lu\n"
+		"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn"
+		" at page end %lu\n"
+		"InnoDB: Page number (if stored to page already) %lu,\n"
+		"InnoDB: space id (if created with >= MySQL-4.1.1"
+		" and stored already) %lu\n",
+		(ulong) checksum, (ulong) checksum_32, (ulong) old_checksum,
+		(ulong) mach_read_from_4(read_buf + FIL_PAGE_SPACE_OR_CHKSUM),
+		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+					 - FIL_PAGE_END_LSN_OLD_CHKSUM),
+		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN),
+		(ulong) mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+		(ulong) mach_read_from_4(read_buf + UNIV_PAGE_SIZE
+					 - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+		(ulong) mach_read_from_4(read_buf + FIL_PAGE_OFFSET),
+		(ulong) mach_read_from_4(read_buf
+					 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+
+#ifndef UNIV_HOTBACKUP
+	if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE)
+	    == TRX_UNDO_INSERT) {
+		fprintf(stderr,
+			"InnoDB: Page may be an insert undo log page\n");
+	} else if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR
+				    + TRX_UNDO_PAGE_TYPE)
+		   == TRX_UNDO_UPDATE) {
+		fprintf(stderr,
+			"InnoDB: Page may be an update undo log page\n");
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	switch (fil_page_get_type(read_buf)) {
+	case FIL_PAGE_INDEX:
+		fprintf(stderr,
+			"InnoDB: Page may be an index page where"
+			" index id is %lu %lu\n",
+			(ulong) ut_dulint_get_high(
+				btr_page_get_index_id(read_buf)),
+			(ulong) ut_dulint_get_low(
+				btr_page_get_index_id(read_buf)));
+#ifndef UNIV_HOTBACKUP
+		index = dict_index_find_on_id_low(
+			btr_page_get_index_id(read_buf));
+		if (index) {
+			fputs("InnoDB: (", stderr);
+			dict_index_name_print(stderr, NULL, index);
+			fputs(")\n", stderr);
+		}
+#endif /* !UNIV_HOTBACKUP */
+		break;
+	case FIL_PAGE_INODE:
+		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
+		break;
+	case FIL_PAGE_IBUF_FREE_LIST:
+		fputs("InnoDB: Page may be an insert buffer free list page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_ALLOCATED:
+		fputs("InnoDB: Page may be a freshly allocated page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_IBUF_BITMAP:
+		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_SYS:
+		fputs("InnoDB: Page may be a system page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_TRX_SYS:
+		fputs("InnoDB: Page may be a transaction system page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_FSP_HDR:
+		fputs("InnoDB: Page may be a file space header page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_XDES:
+		fputs("InnoDB: Page may be an extent descriptor page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_BLOB:
+		fputs("InnoDB: Page may be a BLOB page\n",
+		      stderr);
+		break;
+	case FIL_PAGE_TYPE_ZBLOB:
+	case FIL_PAGE_TYPE_ZBLOB2:
+		fputs("InnoDB: Page may be a compressed BLOB page\n",
+		      stderr);
+		break;
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Sets up a buffer control block for the given frame and creates its
+mutex and read-write lock(s).  Used when the buf_pool is created. */
+static
+void
+buf_block_init(
+/*===========*/
+	buf_block_t*	block,	/*!< in: pointer to control block */
+	byte*		frame)	/*!< in: pointer to buffer frame */
+{
+	buf_page_t*	bpage = &block->page;
+
+	UNIV_MEM_DESC(frame, UNIV_PAGE_SIZE, block);
+
+	/* Fields of the embedded page descriptor */
+	bpage->state = BUF_BLOCK_NOT_USED;
+	bpage->buf_fix_count = 0;
+	bpage->io_fix = BUF_IO_NONE;
+	bpage->in_LRU_list = FALSE;
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	bpage->file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+#ifdef UNIV_DEBUG
+	bpage->in_page_hash = FALSE;
+	bpage->in_zip_hash = FALSE;
+	bpage->in_flush_list = FALSE;
+	bpage->in_free_list = FALSE;
+#endif /* UNIV_DEBUG */
+	page_zip_des_init(&bpage->zip);
+
+	/* Fields of the block itself */
+	block->frame = frame;
+	block->modify_clock = 0;
+	block->check_index_page_at_flush = FALSE;
+	block->index = NULL;
+	block->in_unzip_LRU_list = FALSE;
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+
+	rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
+	ut_ad(rw_lock_validate(&(block->lock)));
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/********************************************************************//**
+Adjusts an already populated control block: shifts its frame and
+compressed-page pointers by frame_offset, resets the index and hash
+bookkeeping, and creates the block mutex and read-write lock(s). */
+static
+void
+buf_block_reuse(
+/*============*/
+	buf_block_t*	block,		/*!< in/out: control block */
+	ptrdiff_t	frame_offset)	/*!< in: offset added to pointers */
+{
+	/* Relocate the stored pointers first */
+	block->frame += frame_offset;
+	if (block->page.zip.data != NULL) {
+		block->page.zip.data += frame_offset;
+	}
+
+	UNIV_MEM_DESC(block->frame, UNIV_PAGE_SIZE, block);
+
+	/* Reset the index and hash bookkeeping */
+	block->index = NULL;
+	block->is_hashed = FALSE;
+#ifdef UNIV_DEBUG
+	/* recreate later */
+	block->page.in_page_hash = FALSE;
+	block->page.in_zip_hash = FALSE;
+#endif /* UNIV_DEBUG */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	block->n_pointers = 0;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	mutex_create(&block->mutex, SYNC_BUF_BLOCK);
+	rw_lock_create(&block->lock, SYNC_LEVEL_VARYING);
+	ut_ad(rw_lock_validate(&(block->lock)));
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_create(&block->debug_latch, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/********************************************************************//**
+Allocates a chunk of buffer frames.
+@return	chunk, or NULL on failure */
+static
+buf_chunk_t*
+buf_chunk_init(
+/*===========*/
+	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
+	ulint		mem_size)	/*!< in: requested size in bytes */
+{
+	buf_block_t*	block;
+	byte*		frame;
+	ulint		zip_hash_n = 0;
+	ulint		zip_hash_mem_size = 0;
+	hash_table_t*	zip_hash_tmp = NULL;
+	ulint		i;
+	buf_shm_info_t*	shm_info = NULL;
+
+	/* Round down to a multiple of page size,
+	although it already should be. */
+	mem_size = ut_2pow_round(mem_size, UNIV_PAGE_SIZE);
+
+	srv_buffer_pool_shm_is_reused = FALSE;
+
+	if (srv_buffer_pool_shm_key) {
+		/* zip_hash size */
+		zip_hash_n = (mem_size / UNIV_PAGE_SIZE) * 2;
+		zip_hash_mem_size = ut_2pow_round(hash_create_needed(zip_hash_n)
+						  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+	}
+
+	/* Reserve space for the block descriptors. */
+	mem_size += ut_2pow_round((mem_size / UNIV_PAGE_SIZE) * (sizeof *block)
+				  + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+	if (srv_buffer_pool_shm_key) {
+		 mem_size += ut_2pow_round(sizeof(buf_shm_info_t)
+					   + (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
+		 mem_size += zip_hash_mem_size;
+	}
+
+	chunk->mem_size = mem_size;
+
+	if (srv_buffer_pool_shm_key) {
+		ulint	binary_id;
+		ibool	is_new;
+
+		ut_a(buf_pool->n_chunks == 1);
+
+		fprintf(stderr,
+		"InnoDB: Warning: The innodb_buffer_pool_shm_key option has been specified.\n"
+		"InnoDB: Do not change the following between restarts of the server while this option is being used:\n"
+		"InnoDB:   * the mysqld executable between restarts of the server.\n"
+		"InnoDB:   * the value of innodb_buffer_pool_size.\n"
+		"InnoDB:   * the value of innodb_page_size.\n"
+		"InnoDB:   * datafiles created by InnoDB during this session.\n"
+		"InnoDB: Otherwise, data corruption in datafiles may result.\n");
+
+		/* FIXME: This is vague id still */
+		binary_id = (ulint) ((byte*)mtr_commit - (byte*)btr_root_get)
+			  + (ulint) ((byte*)os_get_os_version - (byte*)buf_calc_page_new_checksum)
+			  + (ulint) ((byte*)page_dir_find_owner_slot - (byte*)dfield_data_is_binary_equal)
+			  + (ulint) ((byte*)que_graph_publish - (byte*)dict_casedn_str)
+			  + (ulint) ((byte*)read_view_oldest_copy_or_open_new - (byte*)fil_space_get_version)
+			  + (ulint) ((byte*)rec_get_n_extern_new - (byte*)fsp_get_size_low)
+			  + (ulint) ((byte*)row_get_trx_id_offset - (byte*)ha_create_func)
+			  + (ulint) ((byte*)srv_set_io_thread_op_info - (byte*)thd_is_replication_slave_thread)
+			  + (ulint) ((byte*)mutex_create_func - (byte*)ibuf_inside)
+			  + (ulint) ((byte*)trx_set_detailed_error - (byte*)lock_check_trx_id_sanity)
+			  + (ulint) ((byte*)ut_time - (byte*)mem_heap_strdup);
+
+		chunk->mem = os_shm_alloc(&chunk->mem_size, srv_buffer_pool_shm_key, &is_new);
+
+		if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+			return(NULL);
+		}
+init_again:
+#ifdef UNIV_SET_MEM_TO_ZERO
+		if (is_new) {
+			memset(chunk->mem, '\0', chunk->mem_size);
+		}
+#endif
+		/* for ut_fold_binary_32(), these values should be 32-bit aligned */
+		ut_a(sizeof(buf_shm_info_t) % 4 == 0);
+		ut_a((ulint)chunk->mem % 4 == 0);
+		ut_a(chunk->mem_size % 4 == 0);
+
+		shm_info = chunk->mem;
+
+		zip_hash_tmp = (hash_table_t*)((byte*)chunk->mem + chunk->mem_size - zip_hash_mem_size);
+
+		if (is_new) {
+			strncpy(shm_info->head_str, BUF_SHM_INFO_HEAD, 8);
+			shm_info->binary_id = binary_id;
+			shm_info->is_new = TRUE;	/* changed to FALSE when the initialization is finished */
+			shm_info->clean = FALSE;	/* changed to TRUE when free the segment. */
+			shm_info->reusable = FALSE;	/* changed to TRUE when validation is finished. */
+			shm_info->buf_pool_size = srv_buf_pool_size;
+			shm_info->page_size = srv_page_size;
+			shm_info->zip_hash_offset = chunk->mem_size - zip_hash_mem_size;
+			shm_info->zip_hash_n = zip_hash_n;
+		} else {
+			ulint	checksum;
+
+			if (strncmp(shm_info->head_str, BUF_SHM_INFO_HEAD, 8)) {
+				fprintf(stderr,
+				"InnoDB: Error: The shared memory segment seems not to be for buffer pool.\n");
+				return(NULL);
+			}
+			if (shm_info->binary_id != binary_id) {
+				fprintf(stderr,
+				"InnoDB: Error: The shared memory segment seems not to be for this binary.\n");
+				return(NULL);
+			}
+			if (shm_info->is_new) {
+				fprintf(stderr,
+				"InnoDB: Error: The shared memory was not initialized yet.\n");
+				return(NULL);
+			}
+			if (shm_info->buf_pool_size != srv_buf_pool_size) {
+				fprintf(stderr,
+				"InnoDB: Error: srv_buf_pool_size is different (shm=%lu current=%lu).\n",
+				shm_info->buf_pool_size, srv_buf_pool_size);
+				return(NULL);
+			}
+			if (shm_info->page_size != srv_page_size) {
+				fprintf(stderr,
+				"InnoDB: Error: srv_page_size is different (shm=%lu current=%lu).\n",
+				shm_info->page_size, srv_page_size);
+				return(NULL);
+			}
+			if (!shm_info->reusable) {
+				fprintf(stderr,
+				"InnoDB: Warning: The shared memory has unrecoverable contents.\n"
+				"InnoDB: The shared memory segment is initialized.\n");
+				is_new = TRUE;
+				goto init_again;
+			}
+			if (!shm_info->clean) {
+				fprintf(stderr,
+				"InnoDB: Warning: The shared memory was not shut down cleanly.\n"
+				"InnoDB: The shared memory segment is initialized.\n");
+				is_new = TRUE;
+				goto init_again;
+			}
+
+			ut_a(shm_info->zip_hash_offset == chunk->mem_size - zip_hash_mem_size);
+			ut_a(shm_info->zip_hash_n == zip_hash_n);
+
+			/* check checksum */
+			if (srv_buffer_pool_shm_checksum) {
+				checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
+							     chunk->mem_size - sizeof(buf_shm_info_t));
+			} else {
+				checksum = BUF_NO_CHECKSUM_MAGIC;
+			}
+
+			if (shm_info->checksum != BUF_NO_CHECKSUM_MAGIC
+			    && shm_info->checksum != checksum) {
+				fprintf(stderr,
+				"InnoDB: Error: checksum of the shared memory is not match. "
+				"(stored=%lu calculated=%lu)\n",
+				shm_info->checksum, checksum);
+				return(NULL);
+			}
+
+			/* flag to use the segment. */
+			shm_info->clean = FALSE;	/* changed to TRUE when free the segment. */
+		}
+
+		/* init zip_hash contents */
+		if (is_new) {
+			hash_create_init(zip_hash_tmp, zip_hash_n);
+		} else {
+			/* adjust offset is done later */
+			hash_create_reuse(zip_hash_tmp);
+
+			srv_buffer_pool_shm_is_reused = TRUE;
+		}
+	} else {
+	chunk->mem = os_mem_alloc_large(&chunk->mem_size);
+
+	if (UNIV_UNLIKELY(chunk->mem == NULL)) {
+
+		return(NULL);
+	}
+	}
+
+	/* Allocate the block descriptors from
+	the start of the memory block. */
+	if (srv_buffer_pool_shm_key) {
+		chunk->blocks = (buf_block_t*)((byte*)chunk->mem + sizeof(buf_shm_info_t));
+	} else {
+	chunk->blocks = chunk->mem;
+	}
+
+	/* Align a pointer to the first frame.  Note that when
+	os_large_page_size is smaller than UNIV_PAGE_SIZE,
+	we may allocate one fewer block than requested.  When
+	it is bigger, we may allocate more blocks than requested. */
+
+	frame = ut_align(chunk->mem, UNIV_PAGE_SIZE);
+	if (srv_buffer_pool_shm_key) {
+		/* reserve zip_hash space and always -1 for reproductibity */
+		chunk->size = (chunk->mem_size - zip_hash_mem_size) / UNIV_PAGE_SIZE - 1;
+	} else {
+	chunk->size = chunk->mem_size / UNIV_PAGE_SIZE
+		- (frame != chunk->mem);
+	}
+
+	/* Subtract the space needed for block descriptors. */
+	{
+		ulint	size = chunk->size;
+
+		while (frame < (byte*) (chunk->blocks + size)) {
+			frame += UNIV_PAGE_SIZE;
+			size--;
+		}
+
+		chunk->size = size;
+	}
+
+	if (shm_info && !(shm_info->is_new)) {
+		/* convert the shared memory segment for reuse */
+		ptrdiff_t	phys_offset;
+		ptrdiff_t	logi_offset;
+		ptrdiff_t	blocks_offset;
+		byte*		previous_frame_address;
+
+		if (chunk->size < shm_info->chunk_backup.size) {
+			fprintf(stderr,
+			"InnoDB: Error: The buffer pool became smaller because of allocated address.\n"
+			"InnoDB: Retrying may avoid this situation.\n");
+			shm_info->clean = TRUE; /* release the flag for retrying */
+			return(NULL);
+		}
+
+		chunk->size = shm_info->chunk_backup.size;
+		phys_offset = frame - ((byte*)chunk->mem + shm_info->frame_offset);
+		logi_offset = frame - chunk->blocks[0].frame;
+		previous_frame_address = chunk->blocks[0].frame;
+		blocks_offset = (byte*)chunk->blocks - (byte*)shm_info->chunk_backup.blocks;
+
+		if (phys_offset || logi_offset || blocks_offset) {
+			fprintf(stderr,
+			"InnoDB: Buffer pool in the shared memory segment should be converted.\n"
+			"InnoDB: Previous frames in address      : %p\n"
+			"InnoDB: Previous frames were located    : %p\n"
+			"InnoDB: Current frames should be located: %p\n"
+			"InnoDB: Pysical offset                  : %ld (%#lx)\n"
+			"InnoDB: Logical offset (frames)         : %ld (%#lx)\n"
+			"InnoDB: Logical offset (blocks)         : %ld (%#lx)\n",
+				(byte*)chunk->mem + shm_info->frame_offset,
+				chunk->blocks[0].frame, frame,
+				(long) phys_offset, (ulong) phys_offset, (long) logi_offset, (ulong) logi_offset,
+				(long) blocks_offset, (ulong) blocks_offset);
+		} else {
+			fprintf(stderr,
+			"InnoDB: Buffer pool in the shared memory segment can be used as it is.\n");
+		}
+
+		if (phys_offset) {
+			fprintf(stderr,
+			"InnoDB: Aligning physical offset...");
+
+			memmove(frame, (byte*)chunk->mem + shm_info->frame_offset,
+				chunk->size * UNIV_PAGE_SIZE);
+
+			fprintf(stderr,
+			" Done.\n");
+		}
+
+		/* buf_block_t */
+		block = chunk->blocks;
+		for (i = chunk->size; i--; ) {
+			buf_block_reuse(block, logi_offset);
+			block++;
+		}
+
+		if (logi_offset || blocks_offset) {
+			fprintf(stderr,
+			"InnoDB: Aligning logical offset...");
+
+
+			/* buf_pool_t buf_pool_backup */
+			UT_LIST_OFFSET(flush_list, buf_page_t, shm_info->buf_pool_backup.flush_list,
+					previous_frame_address, logi_offset, blocks_offset);
+			UT_LIST_OFFSET(free, buf_page_t, shm_info->buf_pool_backup.free,
+					previous_frame_address, logi_offset, blocks_offset);
+			UT_LIST_OFFSET(LRU, buf_page_t, shm_info->buf_pool_backup.LRU,
+					previous_frame_address, logi_offset, blocks_offset);
+			if (shm_info->buf_pool_backup.LRU_old)
+				shm_info->buf_pool_backup.LRU_old =
+					(buf_page_t*)((byte*)(shm_info->buf_pool_backup.LRU_old)
+						+ (((byte*)shm_info->buf_pool_backup.LRU_old > previous_frame_address)
+						  ? logi_offset : blocks_offset));
+
+			UT_LIST_OFFSET(unzip_LRU, buf_block_t, shm_info->buf_pool_backup.unzip_LRU,
+					previous_frame_address, logi_offset, blocks_offset);
+
+			UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_clean,
+					previous_frame_address, logi_offset, blocks_offset);
+			for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
+				UT_LIST_OFFSET(zip_list, buf_page_t, shm_info->buf_pool_backup.zip_free[i],
+					previous_frame_address, logi_offset, blocks_offset);
+			}
+
+			HASH_OFFSET(zip_hash_tmp, buf_page_t, hash,
+					previous_frame_address, logi_offset, blocks_offset);
+
+			fprintf(stderr,
+			" Done.\n");
+		}
+	} else {
+	/* Init block structs and assign frames for them. Then we
+	assign the frames to the first blocks (we already mapped the
+	memory above). */
+
+	block = chunk->blocks;
+
+	for (i = chunk->size; i--; ) {
+
+		buf_block_init(block, frame);
+
+#ifdef HAVE_valgrind
+		/* Wipe contents of frame to eliminate a Purify warning */
+		memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#endif
+		/* Add the block to the free list */
+		mutex_enter(&free_list_mutex);
+		UT_LIST_ADD_LAST(free, buf_pool->free, (&block->page));
+		ut_d(block->page.in_free_list = TRUE);
+		mutex_exit(&free_list_mutex);
+
+		block++;
+		frame += UNIV_PAGE_SIZE;
+	}
+	}
+
+	if (shm_info) {
+		shm_info->frame_offset = chunk->blocks[0].frame - (byte*)chunk->mem;
+	}
+
+	return(chunk);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Scans the block descriptors of one buffer chunk for the descriptor
+whose page.zip.data member equals the given pointer.
+@return	the first matching block descriptor, or NULL if none matches */
+static
+buf_block_t*
+buf_chunk_contains_zip(
+/*===================*/
+	buf_chunk_t*	chunk,	/*!< in: chunk whose blocks are scanned */
+	const void*	data)	/*!< in: compressed page pointer to look for */
+{
+	buf_block_t*	cur;
+	buf_block_t*	end;
+
+	ut_ad(buf_pool);
+
+	/* Walk the descriptor array from its first to its last element. */
+	end = chunk->blocks + chunk->size;
+
+	for (cur = chunk->blocks; cur != end; cur++) {
+		if (cur->page.zip.data != data) {
+			continue;
+		}
+		return(cur);
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Searches every chunk of the buffer pool, in array order, for a block
+descriptor that refers to the given compressed page.
+@return	the block found by buf_chunk_contains_zip(), or NULL if none */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+	const void*	data)	/*!< in: compressed page pointer to look for */
+{
+	ulint		idx;
+	buf_block_t*	found = NULL;
+
+	/* Stop at the first chunk that reports a match. */
+	for (idx = 0; !found && idx < buf_pool->n_chunks; idx++) {
+		found = buf_chunk_contains_zip(&buf_pool->chunks[idx],
+					       data);
+	}
+
+	/* found is still NULL when no chunk contained the page. */
+
+	return(found);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Checks that all file pages in the buffer chunk are in a replaceable state.
+@return	address of the first block that is not replaceable, or NULL */
+static
+const buf_block_t*
+buf_chunk_not_freed(
+/*================*/
+	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
+{
+	buf_block_t*	block;
+	ulint		i;
+
+	ut_ad(buf_pool);
+	/* ut_ad(buf_pool_mutex_own()) is disabled here (optimistic scan) */
+
+	block = chunk->blocks;
+
+	for (i = chunk->size; i--; block++) {
+		ibool	ready;	/* result of buf_flush_ready_for_replace() */
+
+		switch (buf_block_get_state(block)) {
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_ZIP_DIRTY:
+			/* The uncompressed buffer pool should never
+			contain compressed block descriptors. */
+			ut_error;
+			break;
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			/* Skip blocks that are not being used for
+			file pages. */
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			mutex_enter(&block->mutex);	/* the check runs under the block mutex */
+			ready = buf_flush_ready_for_replace(&block->page);
+			mutex_exit(&block->mutex);
+
+			if (block->page.is_corrupt) {	/* read after the mutex was released */
+				/* A corrupt page may remain in the pool; it is
+				skipped even when it is not replaceable. */
+				break;
+			}
+
+			if (!ready) {
+
+				return(block);	/* first non-replaceable file page */
+			}
+
+			break;
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Tests whether every block descriptor of a chunk is BUF_BLOCK_NOT_USED.
+@return	TRUE if no block has another state, FALSE at the first that does */
+static
+ibool
+buf_chunk_all_free(
+/*===============*/
+	const buf_chunk_t*	chunk)	/*!< in: chunk to inspect */
+{
+	const buf_block_t*	cur;
+	const buf_block_t*	end;
+
+	ut_ad(buf_pool);
+	ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
+
+	cur = chunk->blocks;
+	end = cur + chunk->size;
+
+	while (cur != end) {
+		if (buf_block_get_state(cur) != BUF_BLOCK_NOT_USED) {
+			return(FALSE);
+		}
+		cur++;
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Frees a chunk of buffer frames; every block must be BUF_BLOCK_NOT_USED. */
+static
+void
+buf_chunk_free(
+/*===========*/
+	buf_chunk_t*	chunk)		/*!< in: chunk whose blocks and memory are freed */
+{
+	buf_block_t*		block;
+	const buf_block_t*	block_end;	/* one past the last descriptor */
+
+	ut_ad(buf_pool_mutex_own()); /* but we need all mutex here */
+
+	block_end = chunk->blocks + chunk->size;
+
+	for (block = chunk->blocks; block < block_end; block++) {
+		ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED);
+		ut_a(!block->page.zip.data);	/* no compressed page attached */
+
+		ut_ad(!block->page.in_LRU_list);
+		ut_ad(!block->in_unzip_LRU_list);
+		ut_ad(!block->page.in_flush_list);
+		/* Remove the block from the free list (under free_list_mutex). */
+		mutex_enter(&free_list_mutex);
+		ut_ad(block->page.in_free_list);
+		UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+		mutex_exit(&free_list_mutex);
+
+		/* Free the latches embedded in the block descriptor. */
+		mutex_free(&block->mutex);
+		rw_lock_free(&block->lock);
+#ifdef UNIV_SYNC_DEBUG
+		rw_lock_free(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+		UNIV_MEM_UNDESC(block);
+	}
+
+	ut_a(!srv_buffer_pool_shm_key);	/* must not run when a shared memory key is configured */
+
+	os_mem_free_large(chunk->mem, chunk->mem_size);	/* descriptors and frames share this allocation */
+}
+
+/********************************************************************//**
+Creates the buffer pool with a single chunk of srv_buf_pool_size bytes.
+@return	own: buf_pool object, NULL if not enough memory or error */
+UNIV_INTERN
+buf_pool_t*
+buf_pool_init(void)
+/*===============*/
+{
+	buf_chunk_t*	chunk;
+	ulint		i;
+
+	buf_pool = mem_zalloc(sizeof(buf_pool_t));
+
+	/* 1. Initialize general fields
+	------------------------------- */
+	mutex_create(&buf_pool_mutex, SYNC_BUF_POOL);
+	mutex_create(&LRU_list_mutex, SYNC_BUF_LRU_LIST);
+	mutex_create(&flush_list_mutex, SYNC_BUF_FLUSH_LIST);
+	rw_lock_create(&page_hash_latch, SYNC_BUF_PAGE_HASH);
+	mutex_create(&free_list_mutex, SYNC_BUF_FREE_LIST);
+	mutex_create(&zip_free_mutex, SYNC_BUF_ZIP_FREE);
+	mutex_create(&zip_hash_mutex, SYNC_BUF_ZIP_HASH);
+
+	mutex_create(&buf_pool_zip_mutex, SYNC_BUF_BLOCK);
+
+	mutex_enter(&LRU_list_mutex);	/* held until the end of step 3 */
+	rw_lock_x_lock(&page_hash_latch);
+	buf_pool_mutex_enter();
+
+	buf_pool->n_chunks = 1;
+	buf_pool->chunks = chunk = mem_alloc(sizeof *chunk);
+
+	UT_LIST_INIT(buf_pool->free);
+
+	if (!buf_chunk_init(chunk, srv_buf_pool_size)) {
+		mem_free(chunk);
+		mem_free(buf_pool);
+		buf_pool = NULL;
+		return(NULL);	/* NOTE(review): returns with the three latches acquired above still held */
+	}
+
+	srv_buf_pool_old_size = srv_buf_pool_size;
+	buf_pool->curr_size = chunk->size;	/* size in pages */
+	srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+
+	buf_pool->page_hash = hash_create(2 * buf_pool->curr_size);
+	/* With srv_buffer_pool_shm_key set, zip_hash lives in the shared segment (assigned below) */
+	if (!srv_buffer_pool_shm_key) {
+	buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);
+	}
+
+	buf_pool->last_printout_time = time(NULL);
+
+	/* 2. Initialize flushing fields
+	-------------------------------- */
+
+	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+		buf_pool->no_flush[i] = os_event_create(NULL);
+	}
+
+	/* 3. Initialize LRU fields
+	--------------------------- */
+	/* All fields are initialized by mem_zalloc(). */
+
+	if (srv_buffer_pool_shm_key) {
+		buf_shm_info_t*	shm_info;	/* header at the start of chunk->mem */
+
+		ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
+		shm_info = chunk->mem;
+
+		buf_pool->zip_hash = (hash_table_t*)((byte*)chunk->mem + shm_info->zip_hash_offset);
+
+		if(shm_info->is_new) {
+			shm_info->is_new = FALSE; /* initialization was finished */
+		} else {
+			buf_block_t*	block = chunk->blocks;
+			buf_page_t*	b;
+
+			/* shm_info->buf_pool_backup was already converted */
+			/* in buf_chunk_init(), so it is simply copied here. */
+			buf_pool->flush_list 		= shm_info->buf_pool_backup.flush_list;
+			buf_pool->freed_page_clock 	= shm_info->buf_pool_backup.freed_page_clock;
+			buf_pool->free			= shm_info->buf_pool_backup.free;
+			buf_pool->LRU			= shm_info->buf_pool_backup.LRU;
+			buf_pool->LRU_old		= shm_info->buf_pool_backup.LRU_old;
+			buf_pool->LRU_old_len		= shm_info->buf_pool_backup.LRU_old_len;
+			buf_pool->unzip_LRU		= shm_info->buf_pool_backup.unzip_LRU;
+			buf_pool->zip_clean		= shm_info->buf_pool_backup.zip_clean;
+			for (i = 0; i < BUF_BUDDY_SIZES_MAX; i++) {
+				buf_pool->zip_free[i]	= shm_info->buf_pool_backup.zip_free[i];
+			}
+
+			for (i = 0; i < chunk->size; i++, block++) {	/* re-insert uncompressed file pages into the new page_hash */
+				if (buf_block_get_state(block)
+				    == BUF_BLOCK_FILE_PAGE) {
+					ut_d(block->page.in_page_hash = TRUE);
+					HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+						    buf_page_address_fold(
+							    block->page.space,
+							    block->page.offset),
+						    &block->page);
+				}
+			}
+
+			for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;	/* then the pages on zip_clean */
+			     b = UT_LIST_GET_NEXT(zip_list, b)) {
+				ut_ad(!b->in_flush_list);
+				ut_ad(b->in_LRU_list);
+
+				ut_d(b->in_page_hash = TRUE);
+				HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+					    buf_page_address_fold(b->space, b->offset), b);
+			}
+
+			for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;	/* then BUF_BLOCK_ZIP_DIRTY pages on flush_list */
+			     b = UT_LIST_GET_NEXT(flush_list, b)) {
+				ut_ad(b->in_flush_list);
+				ut_ad(b->in_LRU_list);
+
+				switch (buf_page_get_state(b)) {
+				case BUF_BLOCK_ZIP_DIRTY:
+					ut_d(b->in_page_hash = TRUE);
+					HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+						    buf_page_address_fold(b->space,
+							    		  b->offset), b);
+					break;
+				case BUF_BLOCK_FILE_PAGE:
+					/* uncompressed page: already inserted by the chunk scan above */
+					break;
+				case BUF_BLOCK_ZIP_FREE:
+				case BUF_BLOCK_ZIP_PAGE:
+				case BUF_BLOCK_NOT_USED:
+				case BUF_BLOCK_READY_FOR_USE:
+				case BUF_BLOCK_MEMORY:
+				case BUF_BLOCK_REMOVE_HASH:
+					ut_error;
+					break;
+				}
+			}
+
+
+		}
+	}
+
+	mutex_exit(&LRU_list_mutex);
+	rw_lock_x_unlock(&page_hash_latch);
+	buf_pool_mutex_exit();
+
+	btr_search_sys_create(buf_pool->curr_size
+			      * UNIV_PAGE_SIZE / sizeof(void*) / 64);
+
+	/* 4. Initialize the buddy allocator fields */
+	/* All fields are initialized by mem_zalloc(). */
+
+	return(buf_pool);
+}
+
+/********************************************************************//**
+Frees the buffer pool at shutdown.  This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(void)
+/*===============*/
+{
+	buf_chunk_t*	chunk;
+	buf_chunk_t*	chunks;
+
+	if (srv_buffer_pool_shm_key) {
+		buf_shm_info_t*	shm_info;	/* header at the start of the segment */
+
+		ut_a(buf_pool->n_chunks == 1);
+
+		chunk = buf_pool->chunks;
+		shm_info = chunk->mem;
+		ut_a((byte*)chunk->blocks == (byte*)chunk->mem + sizeof(buf_shm_info_t));
+
+		/* Validate that the shared memory segment has no unrecoverable contents. */
+		/* Currently no validation is needed, so it is always marked reusable. */
+		shm_info->reusable = TRUE;
+
+		memcpy(&(shm_info->buf_pool_backup), buf_pool, sizeof(buf_pool_t));	/* read back by buf_pool_init() on reuse */
+		memcpy(&(shm_info->chunk_backup), chunk, sizeof(buf_chunk_t));
+
+		if (srv_fast_shutdown < 2) {	/* otherwise neither checksum nor clean flag is stored */
+			if (srv_buffer_pool_shm_checksum) {
+				shm_info->checksum = ut_fold_binary_32((byte*)chunk->mem + sizeof(buf_shm_info_t),
+								       chunk->mem_size - sizeof(buf_shm_info_t));
+			} else {
+				shm_info->checksum = BUF_NO_CHECKSUM_MAGIC;
+			}
+			shm_info->clean = TRUE;
+		}
+
+		os_shm_free(chunk->mem, chunk->mem_size);
+	} else {
+	chunks = buf_pool->chunks;
+	chunk = chunks + buf_pool->n_chunks;
+
+	while (--chunk >= chunks) {	/* last chunk first */
+		/* Bypass the checks of buf_chunk_free(), since they
+		would fail at shutdown. */
+		os_mem_free_large(chunk->mem, chunk->mem_size);
+	}
+	}
+
+	mem_free(buf_pool->chunks);
+	hash_table_free(buf_pool->page_hash);
+	if (!srv_buffer_pool_shm_key) {	/* with a shm key, buf_pool_init() placed zip_hash inside the segment */
+	hash_table_free(buf_pool->zip_hash);
+	}
+	mem_free(buf_pool);
+	buf_pool = NULL;
+}
+
+/********************************************************************//**
+Drops the adaptive hash index.  To prevent a livelock, this function
+is only to be called while holding btr_search_latch and while
+btr_search_enabled == FALSE. */
+UNIV_INTERN
+void
+buf_pool_drop_hash_index(void)
+/*==========================*/
+{
+	ibool		released_search_latch;	/* TRUE if the current pass released btr_search_latch */
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(!btr_search_enabled);
+
+	do {	/* repeat full scans until one completes without releasing the latch */
+		buf_chunk_t*	chunks	= buf_pool->chunks;
+		buf_chunk_t*	chunk	= chunks + buf_pool->n_chunks;
+
+		released_search_latch = FALSE;
+
+		while (--chunk >= chunks) {	/* last chunk first */
+			buf_block_t*	block	= chunk->blocks;
+			ulint		i	= chunk->size;
+
+			for (; i--; block++) {
+				/* block->is_hashed cannot be modified
+				when we have an x-latch on btr_search_latch;
+				see the comment in buf0buf.h */
+
+				if (buf_block_get_state(block)
+				    != BUF_BLOCK_FILE_PAGE
+				    || !block->is_hashed) {
+					continue;
+				}
+
+				/* To follow the latching order, we
+				have to release btr_search_latch
+				before acquiring block->lock. */
+				rw_lock_x_unlock(&btr_search_latch);
+				/* When we release the search latch,
+				we must rescan all blocks, because
+				some may become hashed again. */
+				released_search_latch = TRUE;
+
+				rw_lock_x_lock(&block->lock);
+
+				/* This should be guaranteed by the
+				callers, which will be holding
+				btr_search_enabled_mutex. */
+				ut_ad(!btr_search_enabled);
+
+				/* Because we did not buffer-fix the
+				block by calling buf_block_get_gen(),
+				it is possible that the block has been
+				allocated for some other use after
+				btr_search_latch was released above.
+				We do not care which file page the
+				block is mapped to.  All we want to do
+				is to drop any hash entries referring
+				to the page. */
+
+				/* It is possible that
+				block->page.state != BUF_BLOCK_FILE_PAGE.
+				Even that does not matter, because
+				btr_search_drop_page_hash_index() will
+				check block->is_hashed before doing
+				anything.  block->is_hashed can only
+				be set on uncompressed file pages. */
+
+				btr_search_drop_page_hash_index(block);
+
+				rw_lock_x_unlock(&block->lock);
+
+				rw_lock_x_lock(&btr_search_latch);	/* re-acquired before the scan continues */
+
+				ut_ad(!btr_search_enabled);
+			}
+		}
+	} while (released_search_latch);
+}
+
+/********************************************************************//**
+Relocate a buffer control block.  Relocates the block on the LRU list
+and in buf_pool->page_hash.  Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
+				buf_page_get_state(bpage) must be
+				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+	buf_page_t*	dpage)	/*!< in/out: destination control block */
+{
+	buf_page_t*	b;	/* LRU predecessor of bpage, or NULL */
+	ulint		fold;	/* page_hash fold of (space, offset) */
+
+	/* ut_ad(buf_pool_mutex_own()) is disabled; the latches below are asserted instead */
+	ut_ad(mutex_own(&LRU_list_mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+	ut_a(bpage->buf_fix_count == 0);
+	ut_ad(bpage->in_LRU_list);
+	ut_ad(!bpage->in_zip_hash);
+	ut_ad(bpage->in_page_hash);
+	ut_ad(bpage == buf_page_hash_get(bpage->space, bpage->offset));
+#ifdef UNIV_DEBUG
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_FILE_PAGE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;	/* only the two compressed-page states are allowed */
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_ZIP_PAGE:
+		break;
+	}
+#endif /* UNIV_DEBUG */
+
+	memcpy(dpage, bpage, sizeof *dpage);	/* dpage starts as a byte copy of bpage */
+
+	bpage->in_LRU_list = FALSE;
+	ut_d(bpage->in_page_hash = FALSE);
+
+	/* relocate buf_pool->LRU: dpage takes the list position of bpage */
+	b = UT_LIST_GET_PREV(LRU, bpage);
+	UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+
+	if (b) {
+		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, b, dpage);
+	} else {
+		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, dpage);	/* bpage was the first node */
+	}
+
+	if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
+		buf_pool->LRU_old = dpage;
+#ifdef UNIV_LRU_DEBUG
+		/* buf_pool->LRU_old must be the first item in the LRU list
+		whose "old" flag is set. */
+		ut_a(buf_pool->LRU_old->old);
+		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+	} else {
+		/* Check that the "old" flag is consistent in
+		the block and its neighbours. */
+		buf_page_set_old(dpage, buf_page_is_old(dpage));
+#endif /* UNIV_LRU_DEBUG */
+	}
+
+	ut_d(UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
+			      ut_ad(ut_list_node_313->in_LRU_list)));
+
+	/* relocate buf_pool->page_hash: same fold, bpage out, dpage in */
+	fold = buf_page_address_fold(bpage->space, bpage->offset);
+
+	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
+	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
+}
+
+/********************************************************************//**
+Shrinks the buffer pool by releasing whole chunks in which every block is
+BUF_BLOCK_NOT_USED.  While no such chunk exists, the blocks of the largest
+candidate chunk (if any) are evicted or flushed and the search is retried.
+Nothing is released if srv_buffer_pool_shm_key is set or one chunk is left. */
+static
+void
+buf_pool_shrink(
+/*============*/
+	ulint	chunk_size)	/*!< in: number of pages to remove */
+{
+	buf_chunk_t*	chunks;
+	buf_chunk_t*	chunk;
+	ulint		max_size;
+	ulint		max_free_size;
+	buf_chunk_t*	max_chunk;
+	buf_chunk_t*	max_free_chunk;
+
+	ut_ad(!buf_pool_mutex_own());
+
+try_again:
+	btr_search_disable(); /* Empty the adaptive hash index again */
+	mutex_enter(&LRU_list_mutex);
+
+	if (srv_buffer_pool_shm_key) {
+		/* Cannot support shrink */
+		goto func_done;
+	}
+
+shrink_again:
+	if (buf_pool->n_chunks <= 1) {
+		/* Cannot shrink if there is only one chunk */
+		goto func_done;
+	}
+
+	/* Search for the largest free chunk
+	not larger than the size difference */
+	chunks = buf_pool->chunks;
+	chunk = chunks + buf_pool->n_chunks;
+	max_size = max_free_size = 0;
+	max_chunk = max_free_chunk = NULL;
+
+	while (--chunk >= chunks) {
+		if (chunk->size <= chunk_size
+		    && chunk->size > max_free_size) {
+			if (chunk->size > max_size) {
+				max_size = chunk->size;
+				max_chunk = chunk;
+			}
+
+			if (buf_chunk_all_free(chunk)) {
+				max_free_size = chunk->size;
+				max_free_chunk = chunk;
+			}
+		}
+	}
+
+	if (!max_free_size) {
+
+		ulint		dirty	= 0;
+		ulint		nonfree	= 0;
+		buf_block_t*	block;
+		buf_block_t*	bend;
+
+		/* Cannot shrink: try again later
+		(do not assign srv_buf_pool_old_size) */
+		if (!max_chunk) {
+
+			goto func_exit;
+		}
+
+		block = max_chunk->blocks;
+		bend = block + max_chunk->size;
+
+		/* Move the blocks of chunk to the end of the
+		LRU list and try to flush them. */
+		for (; block < bend; block++) {
+			switch (buf_block_get_state(block)) {
+			case BUF_BLOCK_NOT_USED:
+				continue;
+			case BUF_BLOCK_FILE_PAGE:
+				break;
+			default:
+				nonfree++;
+				continue;
+			}
+
+			mutex_enter(&block->mutex);
+			/* The following calls will temporarily
+			release block->mutex and buf_pool_mutex.
+			Therefore, we have to always retry,
+			even if !dirty && !nonfree. */
+
+			if (!buf_flush_ready_for_replace(&block->page)) {
+
+				buf_LRU_make_block_old(&block->page);
+				dirty++;
+			} else if (buf_LRU_free_block(&block->page, TRUE, NULL, FALSE)
+				   != BUF_LRU_FREED) {
+				nonfree++;
+			}
+
+			mutex_exit(&block->mutex);
+		}
+
+		mutex_exit(&LRU_list_mutex);
+
+		/* Request for a flush of the chunk if it helps.
+		Do not flush if there are non-free blocks, since
+		flushing will not make the chunk freeable. */
+		if (nonfree) {
+			/* Avoid busy-waiting. */
+			os_thread_sleep(100000);
+		} else if (dirty
+			   && buf_flush_batch(BUF_FLUSH_LRU, dirty, 0)
+			   == ULINT_UNDEFINED) {
+
+			buf_flush_wait_batch_end(BUF_FLUSH_LRU);
+		}
+
+		goto try_again;
+	}
+
+	max_size = max_free_size;
+	max_chunk = max_free_chunk;
+
+	srv_buf_pool_old_size = srv_buf_pool_size;
+
+	/* Rewrite buf_pool->chunks.  Copy everything but max_chunk.
+	memcpy() counts bytes, so both element counts are scaled. */
+	chunks = mem_alloc((buf_pool->n_chunks - 1) * sizeof *chunks);
+	memcpy(chunks, buf_pool->chunks,
+	       (max_chunk - buf_pool->chunks) * sizeof *chunks);
+	memcpy(chunks + (max_chunk - buf_pool->chunks),
+	       max_chunk + 1,
+	       (buf_pool->chunks + buf_pool->n_chunks
+		- (max_chunk + 1)) * sizeof *chunks);
+	ut_a(buf_pool->curr_size > max_chunk->size);
+	buf_pool->curr_size -= max_chunk->size;
+	srv_buf_pool_curr_size = buf_pool->curr_size * UNIV_PAGE_SIZE;
+	chunk_size -= max_chunk->size;
+	buf_chunk_free(max_chunk);
+	mem_free(buf_pool->chunks);
+	buf_pool->chunks = chunks;
+	buf_pool->n_chunks--;
+
+	/* Allow a slack of one megabyte. */
+	if (chunk_size > 1048576 / UNIV_PAGE_SIZE) {
+
+		goto shrink_again;
+	}
+
+func_done:
+	srv_buf_pool_old_size = srv_buf_pool_size;
+func_exit:
+	mutex_exit(&LRU_list_mutex);
+	btr_search_enable();
+}
+
+/********************************************************************//**
+Rebuild buf_pool->page_hash and migrate buf_pool->zip_hash to a new table. */
+static
+void
+buf_pool_page_hash_rebuild(void)
+/*============================*/
+{
+	ulint		i;
+	ulint		n_chunks;
+	buf_chunk_t*	chunk;
+	hash_table_t*	page_hash;	/* the newly created page hash */
+	hash_table_t*	zip_hash;	/* the newly created zip hash */
+	buf_page_t*	b;
+
+	/* These three latches are taken here in place of buf_pool_mutex. */
+	mutex_enter(&LRU_list_mutex);
+	rw_lock_x_lock(&page_hash_latch);
+	mutex_enter(&flush_list_mutex);
+	/* all three latches stay held until the end of the function */
+
+	/* Free, create, and populate the hash table. */
+	hash_table_free(buf_pool->page_hash);
+	buf_pool->page_hash = page_hash = hash_create(2 * buf_pool->curr_size);
+	zip_hash = hash_create(2 * buf_pool->curr_size);
+
+	HASH_MIGRATE(buf_pool->zip_hash, zip_hash, buf_page_t, hash,
+		     BUF_POOL_ZIP_FOLD_BPAGE);
+
+	hash_table_free(buf_pool->zip_hash);	/* the old table is freed after the migration */
+	buf_pool->zip_hash = zip_hash;
+
+	/* Insert the uncompressed file pages to buf_pool->page_hash. */
+
+	chunk = buf_pool->chunks;
+	n_chunks = buf_pool->n_chunks;
+
+	for (i = 0; i < n_chunks; i++, chunk++) {
+		ulint		j;
+		buf_block_t*	block = chunk->blocks;
+
+		for (j = 0; j < chunk->size; j++, block++) {
+			if (buf_block_get_state(block)
+			    == BUF_BLOCK_FILE_PAGE) {
+				ut_ad(!block->page.in_zip_hash);
+				ut_ad(block->page.in_page_hash);
+
+				HASH_INSERT(buf_page_t, hash, page_hash,
+					    buf_page_address_fold(
+						    block->page.space,
+						    block->page.offset),
+					    &block->page);
+			}
+		}
+	}
+
+	/* Insert the compressed-only pages to buf_pool->page_hash.
+	All such blocks are either in buf_pool->zip_clean or
+	in buf_pool->flush_list. */
+
+	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+	     b = UT_LIST_GET_NEXT(zip_list, b)) {
+		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+		ut_ad(!b->in_flush_list);
+		ut_ad(b->in_LRU_list);
+		ut_ad(b->in_page_hash);
+		ut_ad(!b->in_zip_hash);
+
+		HASH_INSERT(buf_page_t, hash, page_hash,
+			    buf_page_address_fold(b->space, b->offset), b);
+	}
+
+	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+	     b = UT_LIST_GET_NEXT(flush_list, b)) {
+		ut_ad(b->in_flush_list);
+		ut_ad(b->in_LRU_list);
+		ut_ad(b->in_page_hash);
+		ut_ad(!b->in_zip_hash);
+
+		switch (buf_page_get_state(b)) {
+		case BUF_BLOCK_ZIP_DIRTY:
+			HASH_INSERT(buf_page_t, hash, page_hash,
+				    buf_page_address_fold(b->space,
+							  b->offset), b);
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			/* uncompressed page: already inserted by the chunk scan above */
+			break;
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error;
+			break;
+		}
+	}
+
+	/* Release the latches taken at the top of the function. */
+	mutex_exit(&LRU_list_mutex);
+	rw_lock_x_unlock(&page_hash_latch);
+	mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Resizes the buffer pool towards srv_buf_pool_size: shrinks it when the
+current size is larger, adds one chunk when it is more than one megabyte
+smaller, and otherwise accepts the difference as slack.  Does nothing
+when srv_buffer_pool_shm_key is set. */
+UNIV_INTERN
+void
+buf_pool_resize(void)
+/*=================*/
+{
+	if (srv_buffer_pool_shm_key) {
+		/* Cannot support resize */
+		return;
+	}
+
+	mutex_enter(&LRU_list_mutex);
+
+	if (srv_buf_pool_old_size == srv_buf_pool_size) {
+		mutex_exit(&LRU_list_mutex);
+		return;
+	}
+
+	if (srv_buf_pool_curr_size > srv_buf_pool_size) {
+		mutex_exit(&LRU_list_mutex);
+
+		/* Disable adaptive hash indexes and empty the index
+		in order to free up memory in the buffer pool chunks. */
+		buf_pool_shrink((srv_buf_pool_curr_size - srv_buf_pool_size)
+				/ UNIV_PAGE_SIZE);
+	} else if (srv_buf_pool_curr_size + 1048576 < srv_buf_pool_size) {
+		/* Enlarge the buffer pool by at least one megabyte */
+		ulint		mem_size
+			= srv_buf_pool_size - srv_buf_pool_curr_size;
+		buf_chunk_t*	chunks;
+		buf_chunk_t*	chunk;
+
+		chunks = mem_alloc((buf_pool->n_chunks + 1) * sizeof *chunks);
+		memcpy(chunks, buf_pool->chunks, buf_pool->n_chunks
+		       * sizeof *chunks);
+
+		chunk = &chunks[buf_pool->n_chunks];
+
+		if (!buf_chunk_init(chunk, mem_size)) {
+			mem_free(chunks);
+		} else {
+			buf_pool->curr_size += chunk->size;
+			srv_buf_pool_curr_size = buf_pool->curr_size
+				* UNIV_PAGE_SIZE;
+			mem_free(buf_pool->chunks);
+			buf_pool->chunks = chunks;
+			buf_pool->n_chunks++;
+		}
+
+		srv_buf_pool_old_size = srv_buf_pool_size;
+		mutex_exit(&LRU_list_mutex);
+	} else {
+		/* At most one megabyte below the target: accept the slack.
+		Nothing changed, so no hash rebuild; release the mutex. */
+		srv_buf_pool_old_size = srv_buf_pool_size;
+		mutex_exit(&LRU_list_mutex);
+		return;
+	}
+
+	buf_pool_page_hash_rebuild();
+}
+
+/********************************************************************//**
+Promotes a page to the start of the buffer pool LRU list.  Callers can
+use this high-level entry point to keep an important page from slipping
+out of the buffer pool.  The move is done while holding LRU_list_mutex,
+which stands in for the buf_pool_mutex used previously. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
+{
+	mutex_enter(&LRU_list_mutex);
+
+	/* Only a page that belongs to a file may be made young. */
+	ut_a(buf_page_in_file(bpage));
+	buf_LRU_make_block_young(bpage);
+
+	mutex_exit(&LRU_list_mutex);
+}
+
+/********************************************************************//**
+Records the first access of a page, or moves the page to the start of
+the buffer pool LRU list when buf_page_peek_if_too_old() reports it as
+too old.  This high-level function can be used to prevent an important
+page from slipping out of the buffer pool.  When the page is made young
+the access time is left untouched. */
+static
+void
+buf_page_set_accessed_make_young(
+/*=============================*/
+	buf_page_t*	bpage,		/*!< in/out: buffer block of a
+					file page */
+	unsigned	access_time)	/*!< in: bpage->access_time
+					read under mutex protection,
+					or 0 if unknown */
+{
+	mutex_t*	page_mutex;
+	ulint		now_ms;
+
+	ut_ad(!buf_pool_mutex_own());
+	ut_a(buf_page_in_file(bpage));
+
+	if (buf_page_peek_if_too_old(bpage)) {
+		/* Too old: move it to the start of the LRU list. */
+		mutex_enter(&LRU_list_mutex);
+		buf_LRU_make_block_young(bpage);
+		mutex_exit(&LRU_list_mutex);
+
+		return;
+	}
+
+	if (access_time) {
+		/* An access time is already known: nothing to record. */
+		return;
+	}
+
+	/* Take the timestamp before the page mutex is requested. */
+	now_ms = ut_time_ms();
+	page_mutex = buf_page_get_mutex_enter(bpage);
+
+	if (page_mutex != NULL) {
+		buf_page_set_accessed(bpage, now_ms);
+		mutex_exit(page_mutex);
+	}
+}
+
+/********************************************************************//**
+Clears the check_index_page_at_flush field of a page, provided that the
+page is present in the buffer pool page hash and is in state
+BUF_BLOCK_FILE_PAGE.  The lookup and the update are done while holding
+page_hash_latch in shared mode. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	buf_block_t*	found;
+
+	rw_lock_s_lock(&page_hash_latch);
+
+	found = (buf_block_t*) buf_page_hash_get(space, offset);
+
+	if (found != NULL
+	    && buf_block_get_state(found) == BUF_BLOCK_FILE_PAGE) {
+
+		found->check_index_page_at_flush = FALSE;
+	}
+
+	rw_lock_s_unlock(&page_hash_latch);
+}
+
+/********************************************************************//**
+Reports the current value of is_hashed for a page.  The answer is FALSE
+when the page is not in the page hash or is not in state
+BUF_BLOCK_FILE_PAGE.  NOTE that this operation does not fix the page in
+the pool if it is found there.
+@return	TRUE if page hash index is built in search system */
+UNIV_INTERN
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	buf_block_t*	found;
+	ibool		hashed = FALSE;
+
+	rw_lock_s_lock(&page_hash_latch);
+
+	found = (buf_block_t*) buf_page_hash_get(space, offset);
+
+	if (found != NULL
+	    && buf_block_get_state(found) == BUF_BLOCK_FILE_PAGE) {
+
+		hashed = found->is_hashed;
+	}
+
+	rw_lock_s_unlock(&page_hash_latch);
+
+	return(hashed);
+}
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+/********************************************************************//**
+Marks a page as freed for the debug version: file_page_was_freed is set
+to TRUE if the page is found in the buffer pool page hash.  Intended to
+be called when a file page is freed, so that the debug version can
+check that the page is not accessed any more unless reallocated.
+@return	control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	buf_page_t*	found;
+
+	rw_lock_s_lock(&page_hash_latch);
+
+	found = buf_page_hash_get(space, offset);
+
+	if (found != NULL) {
+		found->file_page_was_freed = TRUE;
+	}
+
+	rw_lock_s_unlock(&page_hash_latch);
+
+	return(found);
+}
+
+/********************************************************************//**
+Clears the debug mark of a page: file_page_was_freed is set to FALSE if
+the page is found in the buffer pool page hash.  This is the
+counterpart of buf_page_set_file_page_was_freed().
+@return	control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	buf_page_t*	found;
+
+	rw_lock_s_lock(&page_hash_latch);
+
+	found = buf_page_hash_get(space, offset);
+
+	if (found != NULL) {
+		found->file_page_was_freed = FALSE;
+	}
+
+	rw_lock_s_unlock(&page_hash_latch);
+
+	return(found);
+}
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch.  Mutual exclusion has to
+be implemented at a higher level.  In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+
+The page is looked up in the page hash under a shared page_hash_latch;
+while it is absent it is requested with buf_read_page() and the lookup
+is repeated.  A page found in state BUF_BLOCK_FILE_PAGE first has its
+uncompressed frame discarded if buf_LRU_free_block() allows it, after
+which the lookup starts over.  On success the page is buffer-fixed.
+If a read of the page is still pending, the function sleeps in
+WAIT_FOR_READ steps until the I/O fix is no longer BUF_IO_READ; when the
+slow log is enabled and trx->take_stats is set, the waiting time is
+added to trx->io_reads_wait_timer.
+@return	pointer to the block, or NULL if the page has no compressed
+data or if srv_pass_corrupt_table is set and the page is marked corrupt */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size */
+	ulint		offset)	/*!< in: page number */
+{
+	buf_page_t*	bpage;
+	mutex_t*	block_mutex;
+	ibool		must_read;
+	unsigned	access_time;
+	trx_t*		trx = NULL;
+	ulint		sec;
+	ulint		ms;
+	ib_uint64_t	start_time;
+	ib_uint64_t	finish_time;
+
+#ifndef UNIV_LOG_DEBUG
+	ut_ad(!ibuf_inside());
+#endif
+	if (innobase_get_slow_log()) {
+		trx = innobase_get_trx();
+	}
+	buf_pool->stat.n_page_gets++;
+
+	for (;;) {
+		//buf_pool_mutex_enter();
+lookup: /* also entered by "goto lookup" from the switch further down */
+		rw_lock_s_lock(&page_hash_latch);
+		bpage = buf_page_hash_get(space, offset);
+		if (bpage) {
+			break; /* found: page_hash_latch is still S-locked */
+		}
+
+		/* Page not in buf_pool: needs to be read from file.
+		The shared latch is released before the read is
+		requested, and the lookup is then repeated. */
+
+		//buf_pool_mutex_exit();
+		rw_lock_s_unlock(&page_hash_latch);
+
+		buf_read_page(space, zip_size, offset, trx);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+	}
+
+	if (UNIV_UNLIKELY(!bpage->zip.data)) {
+		/* There is no compressed page. */
+err_exit:
+		//buf_pool_mutex_exit();
+		rw_lock_s_unlock(&page_hash_latch);
+		return(NULL);
+	}
+
+	if (srv_pass_corrupt_table) {
+		if (bpage->is_corrupt) {
+			rw_lock_s_unlock(&page_hash_latch);
+			return(NULL);
+		}
+	}
+	ut_a(!(bpage->is_corrupt));
+
+	block_mutex = buf_page_get_mutex_enter(bpage); /* NOTE(review): other callers in this file test this result for NULL -- confirm what NULL means here */
+
+	rw_lock_s_unlock(&page_hash_latch);
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+	case BUF_BLOCK_ZIP_FREE:
+		if (block_mutex)
+			mutex_exit(block_mutex);
+		break; /* falls out of the switch to the ut_error below */
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+		ut_a(block_mutex == &buf_pool_zip_mutex);
+		bpage->buf_fix_count++;
+		goto got_block;
+	case BUF_BLOCK_FILE_PAGE:
+		ut_a(block_mutex == &((buf_block_t*) bpage)->mutex);
+
+		/* Discard the uncompressed page frame if possible.
+		When that succeeds, the block mutex is released and
+		the page is looked up again from the start. */
+		if (buf_LRU_free_block(bpage, FALSE, NULL, FALSE)
+		    == BUF_LRU_FREED) {
+
+			mutex_exit(block_mutex);
+			goto lookup;
+		}
+
+		buf_block_buf_fix_inc((buf_block_t*) bpage,
+				      __FILE__, __LINE__);
+		goto got_block;
+	}
+
+	ut_error;
+	goto err_exit;
+
+got_block:
+	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; /* read while block_mutex is still held */
+	access_time = buf_page_is_accessed(bpage);
+
+	//buf_pool_mutex_exit();
+
+	mutex_exit(block_mutex);
+
+	buf_page_set_accessed_make_young(bpage, access_time);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	ut_a(!bpage->file_page_was_freed);
+#endif
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 5771 || buf_validate());
+	ut_a(bpage->buf_fix_count > 0);
+	ut_a(buf_page_in_file(bpage));
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	if (must_read) {
+		/* Let us wait until the read operation
+		completes.  The wait is timed only when the slow log
+		is enabled and trx->take_stats is set; start_time == 0
+		means "not timed". */
+
+		if (innobase_get_slow_log() && trx && trx->take_stats)
+		{
+			ut_usectime(&sec, &ms);
+			start_time = (ib_uint64_t)sec * 1000000 + ms;
+		} else {
+			start_time = 0;
+		}
+		for (;;) {
+			enum buf_io_fix	io_fix;
+
+			mutex_enter(block_mutex);
+			io_fix = buf_page_get_io_fix(bpage);
+			mutex_exit(block_mutex);
+
+			if (io_fix == BUF_IO_READ) {
+
+				os_thread_sleep(WAIT_FOR_READ);
+			} else {
+				break;
+			}
+		}
+		if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+		{
+			ut_usectime(&sec, &ms);
+			finish_time = (ib_uint64_t)sec * 1000000 + ms;
+			trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+		}
+	}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(buf_page_get_space(bpage),
+			    buf_page_get_page_no(bpage)) == 0);
+#endif
+	return(bpage);
+}
+
+/********************************************************************//**
+Resets the flush-check flag, the index pointer and the hash-related
+fields (n_hash_helps, is_hashed, n_fields, n_bytes, left_side) of a
+control block to their initial values. */
+UNIV_INLINE
+void
+buf_block_init_low(
+/*===============*/
+	buf_block_t*	block)	/*!< in: block to init */
+{
+	/* Flush-time checking and the associated index */
+	block->check_index_page_at_flush	= FALSE;
+	block->index				= NULL;
+
+	/* Hash-related fields */
+	block->n_hash_helps			= 0;
+	block->is_hashed			= FALSE;
+	block->n_fields				= 1;
+	block->n_bytes				= 0;
+	block->left_side			= TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Decompresses the compressed page of a block into block->frame.
+When requested, the checksum stored in the compressed page is compared
+with a freshly calculated one first (unless the stored value is
+BUF_NO_CHECKSUM_MAGIC).  Index pages are decompressed with
+page_zip_decompress(); the other known page types are copied to the
+uncompressed frame as they are.  Failures are reported on stderr.
+@return	TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+	buf_block_t*	block,	/*!< in/out: block */
+	ibool		check)	/*!< in: TRUE=verify the page checksum */
+{
+	const byte*	zip_frame	= block->page.zip.data;
+	ulint		stored_checksum	= mach_read_from_4(
+		zip_frame + FIL_PAGE_SPACE_OR_CHKSUM);
+
+	ut_ad(buf_block_get_zip_size(block));
+	ut_a(buf_block_get_space(block) != 0);
+
+	if (UNIV_LIKELY(check && stored_checksum != BUF_NO_CHECKSUM_MAGIC)) {
+		ulint	computed_checksum	= page_zip_calc_checksum(
+			zip_frame, page_zip_get_size(&block->page.zip));
+
+		if (UNIV_UNLIKELY(stored_checksum != computed_checksum)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: compressed page checksum mismatch"
+				" (space %u page %u): %lu != %lu\n",
+				block->page.space, block->page.offset,
+				stored_checksum, computed_checksum);
+			return(FALSE);
+		}
+	}
+
+	switch (fil_page_get_type(zip_frame)) {
+	case FIL_PAGE_INDEX:
+		/* Index pages need real decompression. */
+		if (!page_zip_decompress(&block->page.zip,
+					 block->frame, TRUE)) {
+
+			fprintf(stderr,
+				"InnoDB: unable to decompress space %lu page %lu\n",
+				(ulong) block->page.space,
+				(ulong) block->page.offset);
+			return(FALSE);
+		}
+
+		return(TRUE);
+
+	case FIL_PAGE_TYPE_ALLOCATED:
+	case FIL_PAGE_INODE:
+	case FIL_PAGE_IBUF_BITMAP:
+	case FIL_PAGE_TYPE_FSP_HDR:
+	case FIL_PAGE_TYPE_XDES:
+	case FIL_PAGE_TYPE_ZBLOB:
+	case FIL_PAGE_TYPE_ZBLOB2:
+		/* These types are copied to uncompressed storage
+		without any transformation. */
+		memcpy(block->frame, zip_frame,
+		       buf_block_get_zip_size(block));
+		return(TRUE);
+
+	default:
+		/* Any other type is reported below. */
+		break;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: unknown compressed page"
+		" type %lu\n",
+		fil_page_get_type(zip_frame));
+	return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Maps a pointer into a page frame to the control block owning that
+frame.  The buffer pool chunks are scanned in order; if no chunk
+contains the pointer, ut_error is raised.
+@return	pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+	const byte*	ptr)	/*!< in: pointer to a frame */
+{
+	buf_chunk_t*	chunk	= buf_pool->chunks;
+	ulint		n_left	= buf_pool->n_chunks;
+
+	/* TODO: protect buf_pool->chunks with a mutex (it will
+	currently remain constant after buf_pool_init()) */
+	for (; n_left > 0; n_left--, chunk++) {
+		buf_block_t*	block;
+		lint		frame_no = ptr - chunk->blocks->frame;
+
+		if (UNIV_UNLIKELY(frame_no < 0)) {
+			/* ptr lies below the first frame of this chunk */
+			continue;
+		}
+
+		/* Convert the byte offset to a frame number. */
+		frame_no >>= UNIV_PAGE_SIZE_SHIFT;
+
+		if (!UNIV_LIKELY((ulint) frame_no < chunk->size)) {
+			/* ptr lies beyond the last frame of this chunk */
+			continue;
+		}
+
+		block = &chunk->blocks[frame_no];
+
+		/* buf_chunk_init() calls buf_block_init() in such a
+		way that block[n].frame == block->frame
+		+ n * UNIV_PAGE_SIZE.  Verify that. */
+		ut_ad(block->frame == page_align(ptr));
+#ifdef UNIV_DEBUG
+		/* Updating the fields checked below requires both
+		buf_pool_mutex and block->mutex.  Only the latter
+		is acquired here. */
+		mutex_enter(&block->mutex);
+
+		switch (buf_block_get_state(block)) {
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_ZIP_DIRTY:
+			/* These states belong to the compressed
+			buffer pool only; its memory comes from
+			buf_pool->chunks in UNIV_PAGE_SIZE blocks
+			that are flagged as BUF_BLOCK_MEMORY. */
+			ut_error;
+			break;
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+			/* "Guess" pointers to file pages are kept in
+			some data structures, and the pages may have
+			been freed and reused meanwhile.  That is
+			not an error. */
+			break;
+		case BUF_BLOCK_REMOVE_HASH:
+			/* buf_LRU_block_remove_hashed_page()
+			overwrites FIL_PAGE_OFFSET and
+			FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID with 0xff
+			and sets the state to
+			BUF_BLOCK_REMOVE_HASH. */
+			ut_ad(page_get_space_id(page_align(ptr))
+			      == 0xffffffff);
+			ut_ad(page_get_page_no(page_align(ptr))
+			      == 0xffffffff);
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			ut_ad(block->page.space
+			      == page_get_space_id(page_align(ptr)));
+			ut_ad(block->page.offset
+			      == page_get_page_no(page_align(ptr)));
+			break;
+		}
+
+		mutex_exit(&block->mutex);
+#endif /* UNIV_DEBUG */
+
+		return(block);
+	}
+
+	/* The block should always be found. */
+	ut_error;
+	return(NULL);
+}
+
+/********************************************************************//**
+Tells whether a pointer lies inside the array of buf_block_t structs of
+some buffer pool chunk.  The pointer may address a buf_block_t itself
+or one of its members.
+@return	TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+	const void*		ptr)	/*!< in: pointer not
+					dereferenced */
+{
+	const buf_chunk_t*		chunk;
+	const buf_chunk_t* const	chunks_end
+		= buf_pool->chunks + buf_pool->n_chunks;
+
+	/* TODO: protect buf_pool->chunks with a mutex (it will
+	currently remain constant after buf_pool_init()) */
+	for (chunk = buf_pool->chunks; chunk < chunks_end; chunk++) {
+
+		if (ptr < (void *)chunk->blocks) {
+			/* Below the block array of this chunk */
+			continue;
+		}
+
+		if (ptr < (void *)(chunk->blocks + chunk->size)) {
+			/* Inside the block array of this chunk */
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Checks whether a pointer can be a buffer block created by
+buf_chunk_init(): its address must be a multiple of sizeof(buf_block_t)
+and it must lie inside the block array of some buffer pool chunk.
+@return	TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
+static
+ibool
+buf_block_is_uncompressed(
+/*======================*/
+	const buf_block_t*	block)	/*!< in: pointer to block,
+					not dereferenced */
+{
+	const ulint	misalignment = ((ulint) block) % sizeof *block;
+
+	if (UNIV_UNLIKELY(misalignment != 0)) {
+		/* A genuine block pointer is always aligned. */
+		return(FALSE);
+	}
+
+	return(buf_pointer_is_block_field((void *)block));
+}
+
+/********************************************************************//**
+This is the general function used to get access to a database page.
+
+The guessed block, if given, is validated first; otherwise the page is
+searched in the page hash under a shared page_hash_latch.  A page that
+is not in the pool is requested with buf_read_page() and the lookup is
+restarted, unless mode is BUF_GET_IF_IN_POOL.  A page that exists only
+in compressed form (BUF_BLOCK_ZIP_PAGE or BUF_BLOCK_ZIP_DIRTY) is
+relocated to a newly allocated uncompressed block and decompressed.
+The block returned is buffer-fixed, latched as requested by rw_latch,
+and pushed to the memo of mtr.
+@return	pointer to the block or NULL; NULL is returned if mode is
+BUF_GET_IF_IN_POOL and the page is not in the pool or is still being
+read in, or if srv_pass_corrupt_table is set and the page is marked
+corrupt */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		offset,	/*!< in: page number */
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+	buf_block_t*	guess,	/*!< in: guessed block or NULL */
+	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+				BUF_GET_NO_LATCH */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	buf_block_t*	block;
+	unsigned	access_time;
+	ulint		fix_type;
+	ibool		must_read;
+	ulint		retries = 0;
+	mutex_t*	block_mutex;
+	trx_t*          trx = NULL;
+	ulint           sec;
+	ulint           ms;
+	ib_uint64_t     start_time;
+	ib_uint64_t     finish_time;
+
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad((rw_latch == RW_S_LATCH)
+	      || (rw_latch == RW_X_LATCH)
+	      || (rw_latch == RW_NO_LATCH));
+	ut_ad((mode != BUF_GET_NO_LATCH) || (rw_latch == RW_NO_LATCH));
+	ut_ad((mode == BUF_GET) || (mode == BUF_GET_IF_IN_POOL)
+	      || (mode == BUF_GET_NO_LATCH));
+	ut_ad(zip_size == fil_space_get_zip_size(space));
+	ut_ad(ut_is_2pow(zip_size));
+#ifndef UNIV_LOG_DEBUG
+	ut_ad(!ibuf_inside() || ibuf_page(space, zip_size, offset, NULL));
+#endif
+	if (innobase_get_slow_log()) {
+		trx = innobase_get_trx();
+	}
+	buf_pool->stat.n_page_gets++;
+loop:
+	block = guess;
+	//buf_pool_mutex_enter();
+
+	if (block) {
+		block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); /* NOTE(review): presumably reads *block before the guess is validated below -- confirm this is safe */
+
+		/* If the guess is a compressed page descriptor that
+		has been allocated by buf_buddy_alloc(), it may have
+		been invalidated by buf_buddy_relocate().  In that
+		case, block could point to something that happens to
+		contain the expected bits in block->page.  Similarly,
+		the guess may be pointing to a buffer pool chunk that
+		has been released when resizing the buffer pool.
+		A rejected guess is forgotten (guess = NULL), so it is
+		not tried again after "goto loop". */
+
+		if (!block_mutex) {
+			block = guess = NULL;
+		} else if (!buf_block_is_uncompressed(block)
+		    || offset != block->page.offset
+		    || space != block->page.space
+		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+
+			mutex_exit(block_mutex);
+
+			block = guess = NULL;
+		} else {
+			ut_ad(!block->page.in_zip_hash);
+			ut_ad(block->page.in_page_hash);
+		}
+	}
+
+	if (block == NULL) {
+		rw_lock_s_lock(&page_hash_latch);
+		block = (buf_block_t*) buf_page_hash_get(space, offset);
+		if (block) {
+			block_mutex = buf_page_get_mutex_enter((buf_page_t*)block); /* taken before page_hash_latch is released */
+			ut_a(block_mutex);
+		}
+		rw_lock_s_unlock(&page_hash_latch);
+	}
+
+loop2:
+	if (block == NULL) {
+		/* Page not in buf_pool: needs to be read from file.
+		No block mutex is held on this path. */
+
+		//buf_pool_mutex_exit();
+
+		if (mode == BUF_GET_IF_IN_POOL) {
+
+			return(NULL);
+		}
+
+		if (buf_read_page(space, zip_size, offset, trx)) {
+			retries = 0;
+		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
+			++retries;
+		} else {
+			fprintf(stderr, "InnoDB: Error: Unable"
+				" to read tablespace %lu page no"
+				" %lu into the buffer pool after"
+				" %lu attempts\n"
+				"InnoDB: The most probable cause"
+				" of this error may be that the"
+				" table has been corrupted.\n"
+				"InnoDB: You can try to fix this"
+				" problem by using"
+				" innodb_force_recovery.\n"
+				"InnoDB: Please see reference manual"
+				" for more details.\n"
+				"InnoDB: Aborting...\n",
+				space, offset,
+				BUF_PAGE_READ_MAX_RETRIES);
+
+			ut_error;
+		}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		ut_a(++buf_dbg_counter % 37 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+		goto loop;
+	}
+
+	ut_ad(page_zip_get_size(&block->page.zip) == zip_size);
+
+	must_read = buf_block_get_io_fix(block) == BUF_IO_READ; /* block_mutex is held from here on */
+
+	if (must_read && mode == BUF_GET_IF_IN_POOL) {
+		/* The page is only being read to buffer */
+		//buf_pool_mutex_exit();
+		mutex_exit(block_mutex);
+
+		return(NULL);
+	}
+
+	if (srv_pass_corrupt_table) {
+		if (block->page.is_corrupt) {
+			mutex_exit(block_mutex);
+			return(NULL);
+		}
+	}
+	ut_a(!(block->page.is_corrupt));
+
+	switch (buf_block_get_state(block)) {
+		buf_page_t*	bpage;
+		ibool		success;
+
+	case BUF_BLOCK_FILE_PAGE:
+		if (block_mutex == &buf_pool_zip_mutex) {
+			/* it is wrong mutex...
+			Release it and restart the lookup. */
+			mutex_exit(block_mutex);
+			goto loop;
+		}
+		break;
+
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+		ut_ad(block_mutex == &buf_pool_zip_mutex);
+		bpage = &block->page;
+		/* Protect bpage->buf_fix_count. */
+		/* Already protected here. */
+		//mutex_enter(&buf_pool_zip_mutex);
+
+		if (bpage->buf_fix_count
+		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+			/* This condition often occurs when the buffer
+			is not buffer-fixed, but I/O-fixed by
+			buf_page_init_for_read(). */
+			//mutex_exit(&buf_pool_zip_mutex);
+wait_until_unfixed:
+			/* The block is buffer-fixed or I/O-fixed.
+			Try again later.  This label is also reached
+			by a goto further down, with block_mutex then
+			pointing to the mutex of the newly allocated
+			block. */
+			//buf_pool_mutex_exit();
+			mutex_exit(block_mutex);
+			os_thread_sleep(WAIT_FOR_READ);
+
+			goto loop;
+		}
+
+		/* Allocate an uncompressed page.  The mutex of the
+		compressed page is released first, so the page can
+		change before the mutexes are taken again below. */
+		//buf_pool_mutex_exit();
+		//mutex_exit(&buf_pool_zip_mutex);
+		mutex_exit(block_mutex);
+
+		block = buf_LRU_get_free_block(0);
+		ut_a(block);
+		block_mutex = &block->mutex;
+
+		//buf_pool_mutex_enter();
+		mutex_enter(&LRU_list_mutex);
+		rw_lock_x_lock(&page_hash_latch);
+		mutex_enter(block_mutex);
+
+		{
+			buf_page_t*	hash_bpage
+				= buf_page_hash_get(space, offset);
+
+			if (UNIV_UNLIKELY(bpage != hash_bpage)) {
+				/* The buf_pool->page_hash was modified
+				while buf_pool_mutex was released.
+				Free the block that was allocated.
+				Processing then continues at loop2 with
+				whatever the page hash holds now
+				(possibly nothing). */
+
+				buf_LRU_block_free_non_file_page(block, TRUE);
+				mutex_exit(block_mutex);
+
+				block = (buf_block_t*) hash_bpage;
+				if (block) {
+					block_mutex = buf_page_get_mutex_enter((buf_page_t*)block);
+					ut_a(block_mutex);
+				}
+				rw_lock_x_unlock(&page_hash_latch);
+				mutex_exit(&LRU_list_mutex);
+				goto loop2;
+			}
+		}
+
+		mutex_enter(&buf_pool_zip_mutex);
+
+		if (UNIV_UNLIKELY
+		    (bpage->buf_fix_count
+		     || buf_page_get_io_fix(bpage) != BUF_IO_NONE)) {
+
+			mutex_exit(&buf_pool_zip_mutex);
+			/* The block was buffer-fixed or I/O-fixed
+			while buf_pool_mutex was not held by this thread.
+			Free the block that was allocated and try again.
+			This should be extremely unlikely.
+			block->mutex is not released here: the code at
+			wait_until_unfixed releases it via block_mutex. */
+
+			buf_LRU_block_free_non_file_page(block, TRUE);
+			//mutex_exit(&block->mutex);
+
+			rw_lock_x_unlock(&page_hash_latch);
+			mutex_exit(&LRU_list_mutex);
+			goto wait_until_unfixed;
+		}
+
+		/* Move the compressed page from bpage to block,
+		and uncompress it. */
+
+		mutex_enter(&flush_list_mutex);
+
+		buf_relocate(bpage, &block->page);
+
+		rw_lock_x_unlock(&page_hash_latch);
+
+		buf_block_init_low(block);
+		block->lock_hash_val = lock_rec_hash(space, offset);
+
+		UNIV_MEM_DESC(&block->page.zip.data,
+			      page_zip_get_size(&block->page.zip), block);
+
+		if (buf_page_get_state(&block->page)
+		    == BUF_BLOCK_ZIP_PAGE) {
+			UT_LIST_REMOVE(zip_list, buf_pool->zip_clean,
+				       &block->page);
+			ut_ad(!block->page.in_flush_list);
+		} else {
+			/* Relocate buf_pool->flush_list. */
+			buf_flush_relocate_on_flush_list(bpage,
+							 &block->page);
+		}
+
+		mutex_exit(&flush_list_mutex);
+
+		/* Buffer-fix, I/O-fix, and X-latch the block
+		for the duration of the decompression.
+		Also add the block to the unzip_LRU list. */
+		block->page.state = BUF_BLOCK_FILE_PAGE;
+
+		/* Insert at the front of unzip_LRU list */
+		buf_unzip_LRU_add_block(block, FALSE);
+
+		mutex_exit(&LRU_list_mutex);
+
+		block->page.buf_fix_count = 1;
+		buf_block_set_io_fix(block, BUF_IO_READ);
+		rw_lock_x_lock_func(&block->lock, 0, file, line);
+
+		UNIV_MEM_INVALID(bpage, sizeof *bpage);
+
+		mutex_exit(block_mutex);
+		mutex_exit(&buf_pool_zip_mutex);
+
+		mutex_enter(&buf_pool_mutex); /* held only around the n_pend_unzip update */
+		buf_pool->n_pend_unzip++;
+		mutex_exit(&buf_pool_mutex);
+
+		buf_buddy_free(bpage, sizeof *bpage, FALSE);
+
+		//buf_pool_mutex_exit();
+
+		/* Decompress the page and apply buffered operations
+		while not holding buf_pool_mutex or block->mutex. */
+		success = buf_zip_decompress(block, srv_use_checksums);
+		ut_a(success);
+
+		if (UNIV_LIKELY(!recv_no_ibuf_operations)) {
+			ibuf_merge_or_delete_for_page(block, space, offset,
+						      zip_size, TRUE);
+		}
+
+		/* Unfix and unlatch the block.  block_mutex stays
+		held after the switch; it is released further down,
+		after the block has been buffer-fixed again. */
+		//buf_pool_mutex_enter();
+		block_mutex = &block->mutex;
+		mutex_enter(block_mutex);
+		block->page.buf_fix_count--;
+		buf_block_set_io_fix(block, BUF_IO_NONE);
+
+		mutex_enter(&buf_pool_mutex);
+		buf_pool->n_pend_unzip--;
+		mutex_exit(&buf_pool_mutex);
+		rw_lock_x_unlock(&block->lock);
+		break;
+
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		break;
+	}
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	//mutex_enter(&block->mutex);
+#if UNIV_WORD_SIZE == 4
+	/* On 32-bit systems, there is no padding in buf_page_t.  On
+	other systems, Valgrind could complain about uninitialized pad
+	bytes. */
+	UNIV_MEM_ASSERT_RW(&block->page, sizeof block->page);
+#endif
+
+	buf_block_buf_fix_inc(block, file, line);
+
+	//mutex_exit(&block->mutex);
+
+	/* Check if this is the first access to the page */
+
+	access_time = buf_page_is_accessed(&block->page);
+
+	//buf_pool_mutex_exit();
+	mutex_exit(block_mutex);
+
+	buf_page_set_accessed_make_young(&block->page, access_time);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	ut_a(!block->page.file_page_was_freed);
+#endif
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 5771 || buf_validate());
+	ut_a(block->page.buf_fix_count > 0);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	switch (rw_latch) {
+	case RW_NO_LATCH:
+		if (must_read) {
+			/* Let us wait until the read operation
+			completes.  The wait is timed only when the
+			slow log is enabled and trx->take_stats is set;
+			start_time == 0 means "not timed". */
+
+			if (innobase_get_slow_log() && trx && trx->take_stats)
+			{
+				ut_usectime(&sec, &ms);
+				start_time = (ib_uint64_t)sec * 1000000 + ms;
+			} else {
+				start_time = 0;
+			}
+			for (;;) {
+				enum buf_io_fix	io_fix;
+
+				mutex_enter(&block->mutex);
+				io_fix = buf_block_get_io_fix(block);
+				mutex_exit(&block->mutex);
+
+				if (io_fix == BUF_IO_READ) {
+
+					os_thread_sleep(WAIT_FOR_READ);
+				} else {
+					break;
+				}
+			}
+			if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+			{
+				ut_usectime(&sec, &ms);
+				finish_time = (ib_uint64_t)sec * 1000000 + ms;
+				trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+			}
+		}
+
+		fix_type = MTR_MEMO_BUF_FIX;
+		break;
+
+	case RW_S_LATCH:
+		rw_lock_s_lock_func(&(block->lock), 0, file, line);
+
+		fix_type = MTR_MEMO_PAGE_S_FIX;
+		break;
+
+	default:
+		ut_ad(rw_latch == RW_X_LATCH);
+		rw_lock_x_lock_func(&(block->lock), 0, file, line);
+
+		fix_type = MTR_MEMO_PAGE_X_FIX;
+		break;
+	}
+
+	mtr_memo_push(mtr, block, fix_type);
+
+	if (!access_time) {
+		/* In the case of a first access, try to apply linear
+		read-ahead */
+
+		buf_read_ahead_linear(space, zip_size, offset, trx);
+	}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(buf_block_get_space(block),
+			    buf_block_get_page_no(block)) == 0);
+#endif
+	if (innobase_get_slow_log()) {
+		_increment_page_get_statistics(block, trx);
+	}
+
+	return(block);
+}
+
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+
+The guessed block is buffer-fixed under block->mutex, then latched
+without waiting, and finally its modify_clock is compared with the
+value supplied by the caller.  Whenever one of these steps fails, the
+latch (if taken) and the buffer fix are released again.  On success the
+block is pushed to the memo of mtr.
+@return	TRUE if success; FALSE if the block is not in state
+BUF_BLOCK_FILE_PAGE, if the latch could not be acquired without
+waiting, or if block->modify_clock differs from modify_clock */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+	buf_block_t*	block,	/*!< in: guessed buffer block */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value if mode is
+				..._GUESS_ON_CLOCK */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	unsigned	access_time;
+	ibool		success;
+	ulint		fix_type;
+	trx_t*		trx = NULL;
+
+	ut_ad(block);
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+	mutex_enter(&block->mutex);
+
+	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {
+
+		mutex_exit(&block->mutex);
+
+		return(FALSE);
+	}
+
+	buf_block_buf_fix_inc(block, file, line);
+
+	mutex_exit(&block->mutex);
+
+	/* Check if this is the first access to the page.
+	We do a dirty read on purpose, to avoid mutex contention.
+	This field is only used for heuristic purposes; it does not
+	affect correctness. */
+
+	access_time = buf_page_is_accessed(&block->page);
+	buf_page_set_accessed_make_young(&block->page, access_time);
+
+	ut_ad(!ibuf_inside()
+	      || ibuf_page(buf_block_get_space(block),
+			   buf_block_get_zip_size(block),
+			   buf_block_get_page_no(block), NULL));
+
+	if (rw_latch == RW_S_LATCH) {
+		success = rw_lock_s_lock_nowait(&(block->lock),
+						file, line);
+		fix_type = MTR_MEMO_PAGE_S_FIX;
+	} else {
+		success = rw_lock_x_lock_func_nowait(&(block->lock),
+						     file, line);
+		fix_type = MTR_MEMO_PAGE_X_FIX;
+	}
+
+	if (UNIV_UNLIKELY(!success)) {
+		mutex_enter(&block->mutex); /* the latch was not granted: undo the buffer fix */
+		buf_block_buf_fix_dec(block);
+		mutex_exit(&block->mutex);
+
+		return(FALSE);
+	}
+
+	if (UNIV_UNLIKELY(modify_clock != block->modify_clock)) { /* clock mismatch: release the latch and the buffer fix */
+		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+		if (rw_latch == RW_S_LATCH) {
+			rw_lock_s_unlock(&(block->lock));
+		} else {
+			rw_lock_x_unlock(&(block->lock));
+		}
+
+		mutex_enter(&block->mutex);
+		buf_block_buf_fix_dec(block);
+		mutex_exit(&block->mutex);
+
+		return(FALSE);
+	}
+
+	mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 5771 || buf_validate());
+	ut_a(block->page.buf_fix_count > 0);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	ut_a(block->page.file_page_was_freed == FALSE);
+#endif
+	if (innobase_get_slow_log()) {
+		trx = innobase_get_trx();
+	}
+
+	if (UNIV_UNLIKELY(!access_time)) {
+		/* In the case of a first access, try to apply linear
+		read-ahead */
+
+		buf_read_ahead_linear(buf_block_get_space(block),
+				      buf_block_get_zip_size(block),
+				      buf_block_get_page_no(block), trx);
+	}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(buf_block_get_space(block),
+			    buf_block_get_page_no(block)) == 0);
+#endif
+	buf_pool->stat.n_page_gets++;
+
+	if (innobase_get_slow_log()) {
+		_increment_page_get_statistics(block, trx);
+	}
+	return(TRUE);
+}
+
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done, for example when a search in an adaptive hash index leads us to this
+frame.
+@return	TRUE if success; FALSE if the block is in state BUF_BLOCK_REMOVE_HASH or the page latch cannot be taken without waiting */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+	buf_block_t*	block,	/*!< in: the known page */
+	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	ibool		success;
+	ulint		fix_type;
+	trx_t*		trx = NULL;
+
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+	mutex_enter(&block->mutex);
+
+	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
+		/* Another thread is just freeing the block from the LRU list
+		of the buffer pool: do not try to access this page; this
+		attempt to access the page can only come through the hash
+		index because when the buffer block state is ..._REMOVE_HASH,
+		we have already removed it from the page address hash table
+		of the buffer pool. */
+
+		mutex_exit(&block->mutex);
+
+		return(FALSE);
+	}
+
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	buf_block_buf_fix_inc(block, file, line);
+
+	mutex_exit(&block->mutex);
+
+	if (mode == BUF_MAKE_YOUNG && buf_page_peek_if_too_old(&block->page)) {
+		//buf_pool_mutex_enter();
+		mutex_enter(&LRU_list_mutex);
+		buf_LRU_make_block_young(&block->page);
+		//buf_pool_mutex_exit();
+		mutex_exit(&LRU_list_mutex);
+	} else if (!buf_page_is_accessed(&block->page)) {
+		/* Above, we do a dirty read on purpose, to avoid
+		mutex contention.  The field buf_page_t::access_time
+		is only used for heuristic purposes.  Writes to the
+		field must be protected by mutex, however. */
+		ulint	time_ms = ut_time_ms();
+
+		//buf_pool_mutex_enter();
+		mutex_enter(&block->mutex);
+		buf_page_set_accessed(&block->page, time_ms);
+		//buf_pool_mutex_exit();
+		mutex_exit(&block->mutex);
+	}
+
+	ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
+
+	if (rw_latch == RW_S_LATCH) {
+		success = rw_lock_s_lock_nowait(&(block->lock),
+						file, line);
+		fix_type = MTR_MEMO_PAGE_S_FIX;
+	} else {
+		success = rw_lock_x_lock_func_nowait(&(block->lock),
+						     file, line);
+		fix_type = MTR_MEMO_PAGE_X_FIX;
+	}
+
+	if (!success) {
+		mutex_enter(&block->mutex);
+		buf_block_buf_fix_dec(block); /* undo the buffer-fix taken above */
+		mutex_exit(&block->mutex);
+
+		return(FALSE);
+	}
+
+	mtr_memo_push(mtr, block, fix_type);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 5771 || buf_validate());
+	ut_a(block->page.buf_fix_count > 0);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	ut_a(block->page.file_page_was_freed == FALSE);
+#endif
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a((mode == BUF_KEEP_OLD)
+	     || (ibuf_count_get(buf_block_get_space(block),
+				buf_block_get_page_no(block)) == 0));
+#endif
+	buf_pool->stat.n_page_gets++;
+
+	if (innobase_get_slow_log()) {
+		trx = innobase_get_trx(); /* fetched only when the slow log is enabled */
+		_increment_page_get_statistics(block, trx);
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Given a tablespace id and page number tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for using when holding the kernel mutex.
+@return	pointer to the block, buffer-fixed and S- or X-latched in mtr, or NULL if the page is not in the buffer pool or no latch can be taken without waiting */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+	ulint		space_id,/*!< in: tablespace id */
+	ulint		page_no,/*!< in: page number */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	buf_block_t*	block;
+	ibool		success;
+	ulint		fix_type;
+
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	//buf_pool_mutex_enter();
+	rw_lock_s_lock(&page_hash_latch);
+	block = buf_block_hash_get(space_id, page_no);
+
+	if (!block) {
+		//buf_pool_mutex_exit();
+		rw_lock_s_unlock(&page_hash_latch);
+		return(NULL);
+	}
+
+	mutex_enter(&block->mutex); /* taken before page_hash_latch is released */
+	//buf_pool_mutex_exit();
+	rw_lock_s_unlock(&page_hash_latch);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_a(buf_block_get_space(block) == space_id);
+	ut_a(buf_block_get_page_no(block) == page_no);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	buf_block_buf_fix_inc(block, file, line);
+	mutex_exit(&block->mutex);
+
+	fix_type = MTR_MEMO_PAGE_S_FIX;
+	success = rw_lock_s_lock_nowait(&block->lock, file, line);
+
+	if (!success) {
+		/* Let us try to get an X-latch. If the current thread
+		is holding an X-latch on the page, we cannot get an
+		S-latch. */
+
+		fix_type = MTR_MEMO_PAGE_X_FIX;
+		success = rw_lock_x_lock_func_nowait(&block->lock,
+						     file, line);
+	}
+
+	if (!success) {
+		mutex_enter(&block->mutex);
+		buf_block_buf_fix_dec(block); /* undo the buffer-fix taken above */
+		mutex_exit(&block->mutex);
+
+		return(NULL);
+	}
+
+	mtr_memo_push(mtr, block, fix_type);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 5771 || buf_validate());
+	ut_a(block->page.buf_fix_count > 0);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	ut_a(block->page.file_page_was_freed == FALSE);
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	buf_pool->stat.n_page_gets++;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(buf_block_get_space(block),
+			    buf_block_get_page_no(block)) == 0);
+#endif
+
+	return(block);
+}
+
+/********************************************************************//**
+Initialize some fields of a control block: flush type, io_fix, buffer-fix count, page clock, access time, modification fields, hash link and is_corrupt. */
+UNIV_INLINE
+void
+buf_page_init_low(
+/*==============*/
+	buf_page_t*	bpage)	/*!< in: block to init */
+{
+	bpage->flush_type = BUF_FLUSH_LRU;
+	bpage->io_fix = BUF_IO_NONE;
+	bpage->buf_fix_count = 0;
+	bpage->freed_page_clock = 0;
+	bpage->access_time = 0;
+	bpage->newest_modification = 0;
+	bpage->oldest_modification = 0;
+	HASH_INVALIDATE(bpage, hash);
+	bpage->is_corrupt = FALSE;
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	bpage->file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+}
+
+/********************************************************************//**
+Sets a block to the file-page state for (space, offset) and inserts it into buf_pool->page_hash. The caller must hold block->mutex; page_hash_latch in X mode is asserted under UNIV_SYNC_DEBUG. */
+static
+void
+buf_page_init(
+/*==========*/
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space
+				in units of a page */
+	buf_block_t*	block)	/*!< in: block to init */
+{
+	buf_page_t*	hash_page;
+
+	//ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+	ut_ad(mutex_own(&(block->mutex)));
+	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+	/* Set the state of the block */
+	buf_block_set_file_page(block, space, offset);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	if (!space) {
+		/* Silence valid Valgrind warnings about uninitialized
+		data being written to data files.  There are some unused
+		bytes on some pages that InnoDB does not initialize. */
+		UNIV_MEM_VALID(block->frame, UNIV_PAGE_SIZE);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	buf_block_init_low(block);
+
+	block->lock_hash_val	= lock_rec_hash(space, offset);
+
+	/* Insert into the hash table of file pages */
+
+	hash_page = buf_page_hash_get(space, offset);
+
+	if (UNIV_LIKELY_NULL(hash_page)) {
+		/* The page must not be in the hash table yet: report and abort */
+		fprintf(stderr,
+			"InnoDB: Error: page %lu %lu already found"
+			" in the hash table: %p, %p\n",
+			(ulong) space,
+			(ulong) offset,
+			(const void*) hash_page, (const void*) block);
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		mutex_exit(&block->mutex);
+		//buf_pool_mutex_exit();
+		rw_lock_x_unlock(&page_hash_latch);
+		buf_print();
+		buf_LRU_print();
+		buf_validate();
+		buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+		ut_error;
+	}
+
+	buf_page_init_low(&block->page);
+
+	ut_ad(!block->page.in_zip_hash);
+	ut_ad(!block->page.in_page_hash);
+	ut_d(block->page.in_page_hash = TRUE);
+	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+		    buf_page_address_fold(space, offset), &block->page);
+}
+
+/********************************************************************//**
+Function which inits a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return	pointer to the control block, or NULL if nothing was initialized (cases 1-3 above) */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+	ulint*		err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size, or 0 */
+	ibool		unzip,	/*!< in: TRUE=request uncompressed page */
+	ib_int64_t	tablespace_version,/*!< in: prevents reading from a wrong
+				version of the tablespace in case we have done
+				DISCARD + IMPORT */
+	ulint		offset)	/*!< in: page number */
+{
+	buf_block_t*	block;
+	buf_page_t*	bpage;
+	mtr_t		mtr;
+	ibool		lru	= FALSE;
+	void*		data;
+
+	ut_ad(buf_pool);
+
+	*err = DB_SUCCESS;
+
+	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+		/* It is a read-ahead within an ibuf routine */
+
+		ut_ad(!ibuf_bitmap_page(zip_size, offset));
+		ut_ad(ibuf_inside());
+
+		mtr_start(&mtr);
+
+		if (!recv_no_ibuf_operations
+		    && !ibuf_page(space, zip_size, offset, &mtr)) {
+
+			mtr_commit(&mtr);
+
+			return(NULL);
+		}
+	} else {
+		ut_ad(mode == BUF_READ_ANY_PAGE);
+	}
+
+	if (zip_size && UNIV_LIKELY(!unzip)
+	    && UNIV_LIKELY(!recv_recovery_is_on())) {
+		block = NULL; /* compressed-only read: no uncompressed block is needed */
+	} else {
+		block = buf_LRU_get_free_block(0);
+		ut_ad(block);
+	}
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	rw_lock_x_lock(&page_hash_latch);
+
+	if (buf_page_hash_get(space, offset)) {
+		/* The page is already in the buffer pool. */
+err_exit:
+		if (block) {
+			mutex_enter(&block->mutex);
+			mutex_exit(&LRU_list_mutex);
+			rw_lock_x_unlock(&page_hash_latch);
+			buf_LRU_block_free_non_file_page(block, FALSE);
+			mutex_exit(&block->mutex);
+		}
+		else {
+			mutex_exit(&LRU_list_mutex);
+			rw_lock_x_unlock(&page_hash_latch);
+		}
+
+		bpage = NULL;
+		goto func_exit;
+	}
+
+	if (fil_tablespace_deleted_or_being_deleted_in_mem(
+		    space, tablespace_version)) {
+		/* The page belongs to a space which has been
+		deleted or is being deleted. */
+		*err = DB_TABLESPACE_DELETED;
+
+		goto err_exit;
+	}
+
+	if (block) {
+		bpage = &block->page;
+		mutex_enter(&block->mutex);
+		buf_page_init(space, offset, block);
+
+		rw_lock_x_unlock(&page_hash_latch);
+
+		/* The block must be put to the LRU list, to the old blocks */
+		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+
+		/* We set a pass-type x-lock on the frame because then
+		the same thread which called for the read operation
+		(and is running now at this point of code) can wait
+		for the read to complete by waiting for the x-lock on
+		the frame; if the x-lock were recursive, the same
+		thread would illegally get the x-lock before the page
+		read is completed.  The x-lock is cleared by the
+		io-handler thread. */
+
+		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);
+		buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+		if (UNIV_UNLIKELY(zip_size)) {
+			page_zip_set_size(&block->page.zip, zip_size);
+
+			/* buf_pool_mutex may be released and
+			reacquired by buf_buddy_alloc().  Thus, we
+			must release block->mutex in order not to
+			break the latching order in the reacquisition
+			of buf_pool_mutex.  We also must defer this
+			operation until after the block descriptor has
+			been added to buf_pool->LRU and
+			buf_pool->page_hash.  NOTE(review): the list latch held at this point is LRU_list_mutex; confirm which mutex buf_buddy_alloc() releases. */
+			mutex_exit(&block->mutex);
+			data = buf_buddy_alloc(zip_size, &lru, FALSE);
+			mutex_enter(&block->mutex);
+			block->page.zip.data = data;
+
+			/* To maintain the invariant
+			block->in_unzip_LRU_list
+			== buf_page_belongs_to_unzip_LRU(&block->page)
+			we have to add this block to unzip_LRU
+			after block->page.zip.data is set. */
+			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+			buf_unzip_LRU_add_block(block, TRUE);
+		}
+
+		mutex_exit(&LRU_list_mutex);
+		mutex_exit(&block->mutex);
+	} else {
+		/* Defer buf_buddy_alloc() until after the block has
+		been found not to exist.  The buf_buddy_alloc() and
+		buf_buddy_free() calls may be expensive because of
+		buf_buddy_relocate(). */
+
+		/* The compressed page must be allocated before the
+		control block (bpage), in order to avoid the
+		invocation of buf_buddy_relocate_block() on
+		uninitialized data. */
+		data = buf_buddy_alloc(zip_size, &lru, TRUE);
+		bpage = buf_buddy_alloc(sizeof *bpage, &lru, TRUE);
+
+		/* If buf_buddy_alloc() allocated storage from the LRU list,
+		it released and reacquired buf_pool_mutex.  Thus, we must
+		check the page_hash again, as it may have been modified.  NOTE(review): LRU_list_mutex and page_hash_latch (X) are held here; confirm what buf_buddy_alloc() releases when lru is set. */
+		if (UNIV_UNLIKELY(lru)
+		    && UNIV_LIKELY_NULL(buf_page_hash_get(space, offset))) {
+
+			/* The block was added by some other thread. */
+			buf_buddy_free(bpage, sizeof *bpage, TRUE);
+			buf_buddy_free(data, zip_size, TRUE);
+
+			mutex_exit(&LRU_list_mutex);
+			rw_lock_x_unlock(&page_hash_latch);
+
+			bpage = NULL;
+			goto func_exit;
+		}
+
+		page_zip_des_init(&bpage->zip);
+		page_zip_set_size(&bpage->zip, zip_size);
+		bpage->zip.data = data;
+
+		mutex_enter(&buf_pool_zip_mutex);
+		UNIV_MEM_DESC(bpage->zip.data,
+			      page_zip_get_size(&bpage->zip), bpage);
+		buf_page_init_low(bpage);
+		bpage->state	= BUF_BLOCK_ZIP_PAGE;
+		bpage->space	= space;
+		bpage->offset	= offset;
+
+#ifdef UNIV_DEBUG
+		bpage->in_page_hash = FALSE;
+		bpage->in_zip_hash = FALSE;
+		bpage->in_flush_list = FALSE;
+		bpage->in_free_list = FALSE;
+#endif /* UNIV_DEBUG */
+		bpage->in_LRU_list = FALSE; /* set outside the UNIV_DEBUG guard, unlike the flags above */
+
+		ut_d(bpage->in_page_hash = TRUE);
+		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
+			    buf_page_address_fold(space, offset), bpage);
+
+		rw_lock_x_unlock(&page_hash_latch);
+
+		/* The block must be put to the LRU list, to the old blocks */
+		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
+		mutex_enter(&flush_list_mutex);
+		buf_LRU_insert_zip_clean(bpage);
+		mutex_exit(&flush_list_mutex);
+
+		mutex_exit(&LRU_list_mutex);
+
+		buf_page_set_io_fix(bpage, BUF_IO_READ);
+
+		mutex_exit(&buf_pool_zip_mutex);
+	}
+
+	mutex_enter(&buf_pool_mutex); /* taken only around the n_pend_reads update */
+	buf_pool->n_pend_reads++;
+	mutex_exit(&buf_pool_mutex);
+func_exit:
+	//buf_pool_mutex_exit();
+
+	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
+
+		mtr_commit(&mtr); /* pairs with the mtr_start() at the top of the function */
+	}
+
+	ut_ad(!bpage || buf_page_in_file(bpage));
+	return(bpage);
+}
+
+/********************************************************************//**
+Initializes a page to the buffer buf_pool. The page is usually not read
+from a file even if it cannot be found in the buffer buf_pool. This is one
+of the functions which perform to a block a state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return	pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset,	/*!< in: offset of the page within space in units of
+			a page */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	buf_frame_t*	frame;
+	buf_block_t*	block;
+	buf_block_t*	free_block	= NULL;
+	ulint		time_ms		= ut_time_ms();
+
+	ut_ad(mtr);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(space || !zip_size);
+
+	free_block = buf_LRU_get_free_block(0);
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	rw_lock_x_lock(&page_hash_latch);
+
+	block = (buf_block_t*) buf_page_hash_get(space, offset);
+
+	if (block && buf_page_in_file(&block->page)) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+		ut_a(ibuf_count_get(space, offset) == 0);
+#endif
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+		block->page.file_page_was_freed = FALSE;
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+
+		/* Page can be found in buf_pool */
+		//buf_pool_mutex_exit();
+		mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+
+		buf_block_free(free_block); /* the pre-allocated block is not needed */
+
+		return(buf_page_get_with_no_latch(space, zip_size,
+						  offset, mtr));
+	}
+
+	/* If we get here, the page was not in buf_pool: init it there */
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr, "Creating space %lu page %lu to buffer\n",
+			(ulong) space, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	block = free_block;
+
+	mutex_enter(&block->mutex);
+
+	buf_page_init(space, offset, block);
+	rw_lock_x_unlock(&page_hash_latch);
+
+	/* The block must be put to the LRU list */
+	buf_LRU_add_block(&block->page, FALSE);
+
+	buf_block_buf_fix_inc(block, __FILE__, __LINE__);
+	buf_pool->stat.n_pages_created++;
+
+	if (zip_size) {
+		void*	data;
+		ibool	lru;
+
+		/* Prevent race conditions during buf_buddy_alloc(),
+		which may release and reacquire buf_pool_mutex,
+		by IO-fixing and X-latching the block. */
+
+		buf_page_set_io_fix(&block->page, BUF_IO_READ);
+		rw_lock_x_lock(&block->lock);
+
+		page_zip_set_size(&block->page.zip, zip_size);
+		mutex_exit(&block->mutex);
+		/* buf_pool_mutex may be released and reacquired by
+		buf_buddy_alloc().  Thus, we must release block->mutex
+		in order not to break the latching order in
+		the reacquisition of buf_pool_mutex.  We also must
+		defer this operation until after the block descriptor
+		has been added to buf_pool->LRU and buf_pool->page_hash.  NOTE(review): the list latch held at this point is LRU_list_mutex; confirm which mutex buf_buddy_alloc() releases. */
+		data = buf_buddy_alloc(zip_size, &lru, FALSE);
+		mutex_enter(&block->mutex);
+		block->page.zip.data = data;
+
+		/* To maintain the invariant
+		block->in_unzip_LRU_list
+		== buf_page_belongs_to_unzip_LRU(&block->page)
+		we have to add this block to unzip_LRU after
+		block->page.zip.data is set. */
+		ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
+		buf_unzip_LRU_add_block(block, FALSE);
+
+		buf_page_set_io_fix(&block->page, BUF_IO_NONE);
+		rw_lock_x_unlock(&block->lock);
+	}
+
+	buf_page_set_accessed(&block->page, time_ms);
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+
+	mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
+
+	mutex_exit(&block->mutex);
+
+	/* Delete possible entries for the page from the insert buffer:
+	such can exist if the page belonged to an index which was dropped */
+
+	ibuf_merge_or_delete_for_page(NULL, space, offset, zip_size, TRUE);
+
+	/* Flush pages from the end of the LRU list if necessary */
+	buf_flush_free_margin(FALSE);
+
+	frame = block->frame;
+
+	memset(frame + FIL_PAGE_PREV, 0xff, 4);
+	memset(frame + FIL_PAGE_NEXT, 0xff, 4);
+	mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
+
+	/* Reset to zero the file flush lsn field in the page; if the first
+	page of an ibdata file is 'created' in this function into the buffer
+	pool then we lose the original contents of the file flush lsn stamp.
+	Then InnoDB could in a crash recovery print a big, false, corruption
+	warning if the stamp contains an lsn bigger than the ib_logfile lsn. */
+
+	memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(++buf_dbg_counter % 357 || buf_validate());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(buf_block_get_space(block),
+			    buf_block_get_page_no(block)) == 0);
+#endif
+	return(block);
+}
+
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+	buf_page_t*	bpage,	/*!< in: pointer to the block in question */
+	trx_t*		trx)	/*!< in: transaction, may be NULL; consulted only when a table is marked corrupt */
+{
+	enum buf_io_fix	io_type;
+	const ibool	uncompressed = (buf_page_get_state(bpage)
+					== BUF_BLOCK_FILE_PAGE);
+	enum buf_flush	flush_type; /* assigned for writes; its only use below is commented out */
+	mutex_t*	block_mutex;
+
+	ut_a(buf_page_in_file(bpage));
+
+	/* We do not need to protect io_fix by a mutex in order to read
+	it here, because this is the only function where we can change the value
+	from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
+	ensures that this is the only thread that handles the i/o for this
+	block. */
+
+	io_type = buf_page_get_io_fix(bpage);
+	ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
+
+	if (io_type == BUF_IO_READ) {
+		ulint	read_page_no;
+		ulint	read_space_id;
+		byte*	frame;
+
+		if (buf_page_get_zip_size(bpage)) {
+			frame = bpage->zip.data;
+			buf_pool->n_pend_unzip++;
+			if (uncompressed
+			    && !buf_zip_decompress((buf_block_t*) bpage,
+						   FALSE)) {
+
+				buf_pool->n_pend_unzip--;
+				goto corrupt; /* jumps into the corruption report below */
+			}
+			buf_pool->n_pend_unzip--;
+		} else {
+			ut_a(uncompressed);
+			frame = ((buf_block_t*) bpage)->frame;
+		}
+
+		/* If this page is not uninitialized and not in the
+		doublewrite buffer, then the page number and space id
+		should be the same as in block. */
+		read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
+		read_space_id = mach_read_from_4(
+			frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+		if ((bpage->space == TRX_SYS_SPACE
+		     || (srv_doublewrite_file && bpage->space == TRX_DOUBLEWRITE_SPACE))
+		    && trx_doublewrite_page_inside(bpage->offset)) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Error: reading page %lu\n"
+				"InnoDB: which is in the"
+				" doublewrite buffer!\n",
+				(ulong) bpage->offset);
+		} else if (!read_space_id && !read_page_no) {
+			/* This is likely an uninitialized page. */
+		} else if ((bpage->space
+			    && bpage->space != read_space_id)
+			   || bpage->offset != read_page_no) {
+			/* We did not compare space_id to read_space_id
+			if bpage->space == 0, because the field on the
+			page may contain garbage in MySQL < 4.1.1,
+			which only supported bpage->space == 0. */
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Error: space id and page n:o"
+				" stored in the page\n"
+				"InnoDB: read in are %lu:%lu,"
+				" should be %lu:%lu!\n",
+				(ulong) read_space_id, (ulong) read_page_no,
+				(ulong) bpage->space,
+				(ulong) bpage->offset);
+		}
+
+		if (!srv_pass_corrupt_table || !bpage->is_corrupt) {
+		/* From version 3.23.38 up we store the page checksum
+		to the 4 first bytes of the page end lsn field */
+
+		if (buf_page_is_corrupted(frame,
+					  buf_page_get_zip_size(bpage))) {
+corrupt:
+			fprintf(stderr,
+				"InnoDB: Database page corruption on disk"
+				" or a failed\n"
+				"InnoDB: file read of page %lu.\n"
+				"InnoDB: You may have to recover"
+				" from a backup.\n",
+				(ulong) bpage->offset);
+			buf_page_print(frame, buf_page_get_zip_size(bpage));
+			fprintf(stderr,
+				"InnoDB: Database page corruption on disk"
+				" or a failed\n"
+				"InnoDB: file read of page %lu.\n"
+				"InnoDB: You may have to recover"
+				" from a backup.\n",
+				(ulong) bpage->offset);
+			fputs("InnoDB: It is also possible that"
+			      " your operating\n"
+			      "InnoDB: system has corrupted its"
+			      " own file cache\n"
+			      "InnoDB: and rebooting your computer"
+			      " removes the\n"
+			      "InnoDB: error.\n"
+			      "InnoDB: If the corrupt page is an index page\n"
+			      "InnoDB: you can also try to"
+			      " fix the corruption\n"
+			      "InnoDB: by dumping, dropping,"
+			      " and reimporting\n"
+			      "InnoDB: the corrupt table."
+			      " You can use CHECK\n"
+			      "InnoDB: TABLE to scan your"
+			      " table for corruption.\n"
+			      "InnoDB: See also "
+			      REFMAN "forcing-recovery.html\n"
+			      "InnoDB: about forcing recovery.\n", stderr);
+
+			if (srv_pass_corrupt_table && !trx_sys_sys_space(bpage->space)
+			    && bpage->space < SRV_LOG_SPACE_FIRST_ID) {
+				fprintf(stderr,
+					"InnoDB: space %u will be treated as corrupt.\n",
+					bpage->space);
+				fil_space_set_corrupt(bpage->space);
+				if (trx && trx->dict_operation_lock_mode == 0) {
+					dict_table_set_corrupt_by_space(bpage->space, TRUE);
+				} else {
+					dict_table_set_corrupt_by_space(bpage->space, FALSE);
+				}
+				bpage->is_corrupt = TRUE;
+			} else
+			if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
+				fputs("InnoDB: Ending processing because of"
+				      " a corrupt database page.\n",
+				      stderr);
+				exit(1);
+			}
+		}
+		} /* end if (!srv_pass_corrupt_table || !bpage->is_corrupt) */
+
+		if (recv_recovery_is_on()) {
+			/* Pages must be uncompressed for crash recovery. */
+			ut_a(uncompressed);
+			recv_recover_page(TRUE, (buf_block_t*) bpage);
+		}
+
+		if (uncompressed && !recv_no_ibuf_operations) {
+			ibuf_merge_or_delete_for_page(
+				/* Delete possible entries, if bpage is_corrupt */
+				(srv_pass_corrupt_table && bpage->is_corrupt) ? NULL :
+				(buf_block_t*) bpage, bpage->space,
+				bpage->offset, buf_page_get_zip_size(bpage),
+				(srv_pass_corrupt_table && bpage->is_corrupt) ? FALSE :
+				TRUE);
+		}
+	}
+
+	//buf_pool_mutex_enter();
+	if (io_type == BUF_IO_WRITE) {
+		flush_type = buf_page_get_flush_type(bpage);
+		/* to keep consistency at buf_LRU_insert_zip_clean() */
+		//if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+			mutex_enter(&LRU_list_mutex);
+		//}
+	}
+	block_mutex = buf_page_get_mutex_enter(bpage);
+	ut_a(block_mutex);
+	mutex_enter(&buf_pool_mutex);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	if (io_type == BUF_IO_WRITE || uncompressed) {
+		/* For BUF_IO_READ of compressed-only blocks, the
+		buffered operations will be merged by buf_page_get_gen()
+		after the block has been uncompressed. */
+		ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+	}
+#endif
+	/* Because this thread which does the unlocking is not the same that
+	did the locking, we use a pass value != 0 in unlock, which simply
+	removes the newest lock debug record, without checking the thread
+	id. */
+
+	buf_page_set_io_fix(bpage, BUF_IO_NONE);
+
+	switch (io_type) {
+	case BUF_IO_READ:
+		/* NOTE that the call to ibuf may have moved the ownership of
+		the x-latch to this OS thread: do not let this confuse you in
+		debugging! */
+
+		ut_ad(buf_pool->n_pend_reads > 0);
+		buf_pool->n_pend_reads--;
+		buf_pool->stat.n_pages_read++;
+
+		if (uncompressed) {
+			rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
+					     BUF_IO_READ);
+		}
+
+		break;
+
+	case BUF_IO_WRITE:
+		/* Write means a flush operation: call the completion
+		routine in the flush system */
+
+		buf_flush_write_complete(bpage);
+
+		/* to keep consistency at buf_LRU_insert_zip_clean() */
+		//if (flush_type == BUF_FLUSH_LRU) { /* optimistic! */
+			mutex_exit(&LRU_list_mutex);
+		//}
+
+		if (uncompressed) {
+			rw_lock_s_unlock_gen(&((buf_block_t*) bpage)->lock,
+					     BUF_IO_WRITE);
+		}
+
+		buf_pool->stat.n_pages_written++;
+
+		break;
+
+	default:
+		ut_error;
+	}
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr, "Has %s page space %lu page no %lu\n",
+			io_type == BUF_IO_READ ? "read" : "written",
+			(ulong) buf_page_get_space(bpage),
+			(ulong) buf_page_get_page_no(bpage));
+	}
+#endif /* UNIV_DEBUG */
+
+	mutex_exit(&buf_pool_mutex);
+	mutex_exit(block_mutex);
+	//buf_pool_mutex_exit();
+}
+
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void)
+/*=====================*/
+{
+	ibool		freed;
+	enum buf_flush	i;
+
+	buf_pool_mutex_enter();
+
+	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
+
+		/* As this function is called during startup and
+		during redo application phase during recovery, InnoDB
+		is single threaded (apart from IO helper threads) at
+		this stage. No new write batch can be in initialization
+		stage at this point. */
+		ut_ad(buf_pool->init_flush[i] == FALSE);
+
+		/* However, it is possible that a write batch that has
+		been posted earlier is still not complete. For buffer
+		pool invalidation to proceed we must ensure there is NO
+		write activity happening. */
+		if (buf_pool->n_flush[i] > 0) {
+			buf_pool_mutex_exit();
+			buf_flush_wait_batch_end(i);
+			buf_pool_mutex_enter();
+		}
+	}
+
+	buf_pool_mutex_exit();
+
+	ut_ad(buf_all_freed());
+
+	freed = TRUE;
+
+	/* Keep freeing blocks until a pass frees nothing */
+	while (freed) {
+		freed = buf_LRU_search_and_free_block(100);
+	}
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+
+	ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+	ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
+
+	buf_pool->freed_page_clock = 0;
+	buf_pool->LRU_old = NULL;
+	buf_pool->LRU_old_len = 0;
+	buf_pool->LRU_flush_ended = 0;
+
+	memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
+	buf_refresh_io_stats();
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer buf_pool data structure.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void)
+/*==============*/
+{
+	buf_page_t*	b;
+	buf_chunk_t*	chunk;
+	ulint		i;
+	ulint		n_single_flush	= 0;
+	ulint		n_lru_flush	= 0;
+	ulint		n_list_flush	= 0;
+	ulint		n_lru		= 0;
+	ulint		n_flush		= 0;
+	ulint		n_free		= 0;
+	ulint		n_zip		= 0;
+
+	ut_ad(buf_pool);
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	rw_lock_x_lock(&page_hash_latch);
+	/* NOTE: to keep the new latching order, some of the checks below are commented out, so the validation is incomplete. */
+
+	chunk = buf_pool->chunks;
+
+	/* Check the uncompressed blocks. */
+
+	for (i = buf_pool->n_chunks; i--; chunk++) {
+
+		ulint		j;
+		buf_block_t*	block = chunk->blocks;
+
+		for (j = chunk->size; j--; block++) {
+
+			mutex_enter(&block->mutex);
+
+			switch (buf_block_get_state(block)) {
+			case BUF_BLOCK_ZIP_FREE:
+			case BUF_BLOCK_ZIP_PAGE:
+			case BUF_BLOCK_ZIP_DIRTY:
+				/* These should only occur on
+				zip_clean, zip_free[], or flush_list. */
+				ut_error;
+				break;
+
+			case BUF_BLOCK_FILE_PAGE:
+				ut_a(buf_page_hash_get(buf_block_get_space(
+							       block),
+						       buf_block_get_page_no(
+							       block))
+				     == &block->page);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+				ut_a(buf_page_get_io_fix(&block->page)
+				     == BUF_IO_READ
+				     || !ibuf_count_get(buf_block_get_space(
+								block),
+							buf_block_get_page_no(
+								block)));
+#endif
+				switch (buf_page_get_io_fix(&block->page)) {
+				case BUF_IO_NONE:
+					break;
+
+				case BUF_IO_WRITE:
+					switch (buf_page_get_flush_type(
+							&block->page)) {
+					case BUF_FLUSH_LRU:
+						n_lru_flush++;
+						ut_a(rw_lock_is_locked(
+							     &block->lock,
+							     RW_LOCK_SHARED));
+						break;
+					case BUF_FLUSH_LIST:
+						n_list_flush++;
+						break;
+					case BUF_FLUSH_SINGLE_PAGE:
+						n_single_flush++;
+						break;
+					default:
+						ut_error;
+					}
+
+					break;
+
+				case BUF_IO_READ:
+
+					ut_a(rw_lock_is_locked(&block->lock,
+							       RW_LOCK_EX));
+					break;
+				}
+
+				n_lru++;
+
+				if (block->page.oldest_modification > 0) {
+					n_flush++;
+				}
+
+				break;
+
+			case BUF_BLOCK_NOT_USED:
+				n_free++;
+				break;
+
+			case BUF_BLOCK_READY_FOR_USE:
+			case BUF_BLOCK_MEMORY:
+			case BUF_BLOCK_REMOVE_HASH:
+				/* do nothing */
+				break;
+			}
+
+			mutex_exit(&block->mutex);
+		}
+	}
+
+	mutex_enter(&buf_pool_zip_mutex);
+
+	/* Check clean compressed-only blocks. */
+
+	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+	     b = UT_LIST_GET_NEXT(zip_list, b)) {
+		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+		switch (buf_page_get_io_fix(b)) {
+		case BUF_IO_NONE:
+			/* All clean blocks should be I/O-unfixed. */
+			break;
+		case BUF_IO_READ:
+			/* In buf_LRU_free_block(), we temporarily set
+			b->io_fix = BUF_IO_READ for a newly allocated
+			control block in order to prevent
+			buf_page_get_gen() from decompressing the block. */
+			break;
+		default:
+			ut_error;
+			break;
+		}
+		ut_a(!b->oldest_modification);
+		ut_a(buf_page_hash_get(b->space, b->offset) == b);
+
+		n_lru++;
+		n_zip++;
+	}
+
+	/* Check dirty compressed-only blocks. */
+
+	mutex_enter(&flush_list_mutex);
+	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+	     b = UT_LIST_GET_NEXT(flush_list, b)) {
+		ut_ad(b->in_flush_list);
+
+		switch (buf_page_get_state(b)) {
+		case BUF_BLOCK_ZIP_DIRTY:
+			ut_a(b->oldest_modification);
+			n_lru++;
+			n_flush++;
+			n_zip++;
+			switch (buf_page_get_io_fix(b)) {
+			case BUF_IO_NONE:
+			case BUF_IO_READ:
+				break;
+
+			case BUF_IO_WRITE:
+				switch (buf_page_get_flush_type(b)) {
+				case BUF_FLUSH_LRU:
+					n_lru_flush++;
+					break;
+				case BUF_FLUSH_LIST:
+					n_list_flush++;
+					break;
+				case BUF_FLUSH_SINGLE_PAGE:
+					n_single_flush++;
+					break;
+				default:
+					ut_error;
+				}
+				break;
+			}
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			/* uncompressed page */
+			break;
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error;
+			break;
+		}
+		ut_a(buf_page_hash_get(b->space, b->offset) == b);
+	}
+	mutex_exit(&flush_list_mutex);
+
+	mutex_exit(&buf_pool_zip_mutex);
+
+	if (n_lru + n_free > buf_pool->curr_size + n_zip) {
+		fprintf(stderr, "n LRU %lu, n free %lu, pool %lu zip %lu\n",
+			(ulong) n_lru, (ulong) n_free,
+			(ulong) buf_pool->curr_size, (ulong) n_zip);
+		ut_error;
+	}
+
+	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
+	/* Disabled: because of the latching order with block->mutex, we cannot acquire free_list_mutex before it, so the free list length is not checked. */
+/*
+	if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+		fprintf(stderr, "Free list len %lu, free blocks %lu\n",
+			(ulong) UT_LIST_GET_LEN(buf_pool->free),
+			(ulong) n_free);
+		ut_error;
+	}
+*/
+	/* Disabled: because of the latching order with block->mutex, we cannot acquire flush_list_mutex before it, so the flush counters are not checked. */
+/*
+	ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+	ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
+	ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
+	ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
+*/
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+	rw_lock_x_unlock(&page_hash_latch);
+
+	ut_a(buf_LRU_validate());
+	ut_a(buf_flush_validate());
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints buf_pool counters and per-index page counts to stderr, then asserts buf_validate(). */
+UNIV_INTERN
+void
+buf_print(void)
+/*===========*/
+{
+	dulint*		index_ids;	/*!< distinct index ids found so far */
+	ulint*		counts;		/*!< counts[k]: pages found for index_ids[k] */
+	ulint		size;		/*!< buf_pool->curr_size; element count of both arrays */
+	ulint		i;
+	ulint		j;
+	dulint		id;
+	ulint		n_found;	/*!< used slots in index_ids[] and counts[] */
+	buf_chunk_t*	chunk;
+	dict_index_t*	index;
+
+	ut_ad(buf_pool);
+
+	size = buf_pool->curr_size;
+
+	index_ids = mem_alloc(sizeof(dulint) * size);
+	counts = mem_alloc(sizeof(ulint) * size);
+
+	/* buf_pool_mutex_enter() is disabled; the three list mutexes are taken instead */
+	mutex_enter(&LRU_list_mutex);
+	mutex_enter(&free_list_mutex);
+	mutex_enter(&flush_list_mutex);
+
+	fprintf(stderr,
+		"buf_pool size %lu\n"
+		"database pages %lu\n"
+		"free pages %lu\n"
+		"modified database pages %lu\n"
+		"n pending decompressions %lu\n"
+		"n pending reads %lu\n"
+		"n pending flush LRU %lu list %lu single page %lu\n"
+		"pages made young %lu, not young %lu\n"
+		"pages read %lu, created %lu, written %lu\n",
+		(ulong) size,
+		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+		(ulong) UT_LIST_GET_LEN(buf_pool->free),
+		(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+		(ulong) buf_pool->n_pend_unzip,
+		(ulong) buf_pool->n_pend_reads,
+		(ulong) buf_pool->n_flush[BUF_FLUSH_LRU],
+		(ulong) buf_pool->n_flush[BUF_FLUSH_LIST],
+		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE],
+		(ulong) buf_pool->stat.n_pages_made_young,
+		(ulong) buf_pool->stat.n_pages_not_made_young,
+		(ulong) buf_pool->stat.n_pages_read,
+		(ulong) buf_pool->stat.n_pages_created,
+		(ulong) buf_pool->stat.n_pages_written);
+
+	/* Count the number of blocks belonging to each index in the buffer */
+
+	n_found = 0;
+
+	chunk = buf_pool->chunks;
+
+	for (i = buf_pool->n_chunks; i--; chunk++) {
+		buf_block_t*	block		= chunk->blocks;
+		ulint		n_blocks	= chunk->size;
+
+		for (; n_blocks--; block++) {
+			const buf_frame_t* frame = block->frame;	/* only the frame's page type is tested below */
+
+			if (fil_page_get_type(frame) == FIL_PAGE_INDEX) {
+
+				id = btr_page_get_index_id(frame);
+
+				/* Look for the id in the index_ids array (linear search) */
+				j = 0;
+
+				while (j < n_found) {
+
+					if (ut_dulint_cmp(index_ids[j],
+							  id) == 0) {
+						counts[j]++;
+
+						break;
+					}
+					j++;
+				}
+
+				if (j == n_found) {	/* id not seen before: append a new slot */
+					n_found++;
+					index_ids[j] = id;
+					counts[j] = 1;
+				}
+			}
+		}
+	}
+
+	/* buf_pool_mutex_exit() is disabled; the three list mutexes are released instead */
+	mutex_exit(&LRU_list_mutex);
+	mutex_exit(&free_list_mutex);
+	mutex_exit(&flush_list_mutex);
+
+	for (i = 0; i < n_found; i++) {
+		index = dict_index_get_if_in_cache(index_ids[i]);	/* may be NULL, see the check below */
+
+		fprintf(stderr,
+			"Block count for index %lu in buffer is about %lu",
+			(ulong) ut_dulint_get_low(index_ids[i]),
+			(ulong) counts[i]);
+
+		if (index) {
+			putc(' ', stderr);
+			dict_index_name_print(stderr, NULL, index);
+		}
+
+		putc('\n', stderr);
+	}
+
+	mem_free(index_ids);
+	mem_free(counts);
+
+	ut_a(buf_validate());
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of pages in the buffer pool that are buffer-fixed or I/O-fixed.
+@return	number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void)
+/*==============================*/
+{
+	buf_chunk_t*	chunk;
+	buf_page_t*	b;
+	ulint		i;
+	ulint		fixed_pages_number = 0;
+
+	/* buf_pool_mutex_enter() is disabled: only block->mutex is taken during the chunk scan */
+
+	chunk = buf_pool->chunks;
+
+	for (i = buf_pool->n_chunks; i--; chunk++) {
+		buf_block_t*	block;
+		ulint		j;
+
+		block = chunk->blocks;
+
+		for (j = chunk->size; j--; block++) {
+			if (buf_block_get_state(block)
+			    != BUF_BLOCK_FILE_PAGE) {
+
+				continue;	/* note: state is tested before block->mutex is taken */
+			}
+
+			mutex_enter(&block->mutex);
+
+			if (block->page.buf_fix_count != 0
+			    || buf_page_get_io_fix(&block->page)
+			    != BUF_IO_NONE) {
+				fixed_pages_number++;
+			}
+
+			mutex_exit(&block->mutex);
+		}
+	}
+
+	mutex_enter(&buf_pool_zip_mutex);
+
+	/* Traverse the lists of clean and dirty compressed-only blocks. */
+
+	for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
+	     b = UT_LIST_GET_NEXT(zip_list, b)) {
+		ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
+		ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
+
+		if (b->buf_fix_count != 0
+		    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+			fixed_pages_number++;
+		}
+	}
+
+	mutex_enter(&flush_list_mutex);	/* taken while buf_pool_zip_mutex is still held */
+	for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
+	     b = UT_LIST_GET_NEXT(flush_list, b)) {
+		ut_ad(b->in_flush_list);
+
+		switch (buf_page_get_state(b)) {
+		case BUF_BLOCK_ZIP_DIRTY:
+			if (b->buf_fix_count != 0
+			    || buf_page_get_io_fix(b) != BUF_IO_NONE) {
+				fixed_pages_number++;
+			}
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			/* uncompressed page: not counted here (presumably covered by the chunk scan above) */
+			break;
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_ZIP_PAGE:
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error;	/* these states are treated as fatal on the flush list */
+			break;
+		}
+	}
+	mutex_exit(&flush_list_mutex);
+
+	mutex_exit(&buf_pool_zip_mutex);
+	/* buf_pool_mutex_exit() is disabled, matching the disabled enter above */
+
+	return(fixed_pages_number);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sums the pending reads and the pending LRU, list and single-page flushes.
+@return	number of pending I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_ios(void)
+/*=======================*/
+{
+	ulint	n_pending = buf_pool->n_pend_reads;
+	n_pending += buf_pool->n_flush[BUF_FLUSH_LRU];
+	n_pending += buf_pool->n_flush[BUF_FLUSH_LIST];
+	return(n_pending + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+}
+
+/*********************************************************************//**
+Computes the percentage of modified pages: the flush_list length relative
+to the LRU list length plus the free list length (plus one).
+@return	modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void)
+/*============================*/
+{
+	ulint	n_dirty;
+	ulint	n_pages;
+
+	/* Optimistic read: the buf_pool_mutex_enter()/exit() pair that
+	bracketed this computation is disabled, so no mutex is held. */
+
+	n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+	/* The leading 1 is there to avoid division by zero. */
+	n_pages = 1 + UT_LIST_GET_LEN(buf_pool->LRU)
+		+ UT_LIST_GET_LEN(buf_pool->free);
+
+	return((100 * n_dirty) / n_pages);
+}
+
+/*********************************************************************//**
+Prints buffer pool sizes, list lengths, pending I/O and per-second rates to file, then refreshes the rate baseline. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+	FILE*	file)	/*!< in/out: stream where to print */
+{
+	time_t	current_time;
+	double	time_elapsed;
+	ulint	n_gets_diff;
+
+	ut_ad(buf_pool);
+
+	/* buf_pool_mutex_enter() is disabled; the four mutexes below are taken instead */
+	mutex_enter(&LRU_list_mutex);
+	mutex_enter(&free_list_mutex);
+	mutex_enter(&buf_pool_mutex);
+	mutex_enter(&flush_list_mutex);
+
+	fprintf(file,
+		"Buffer pool size        %lu\n"
+		"Buffer pool size, bytes %lu\n"
+		"Free buffers            %lu\n"
+		"Database pages          %lu\n"
+		"Old database pages      %lu\n"
+		"Modified db pages       %lu\n"
+		"Pending reads %lu\n"
+		"Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+		(ulong) buf_pool->curr_size,
+		(ulong) (buf_pool->curr_size * UNIV_PAGE_SIZE),
+		(ulong) UT_LIST_GET_LEN(buf_pool->free),
+		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+		(ulong) buf_pool->LRU_old_len,
+		(ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+		(ulong) buf_pool->n_pend_reads,
+		(ulong) (buf_pool->n_flush[BUF_FLUSH_LRU]
+			 + buf_pool->init_flush[BUF_FLUSH_LRU]),
+		(ulong) (buf_pool->n_flush[BUF_FLUSH_LIST]
+			 + buf_pool->init_flush[BUF_FLUSH_LIST]),
+		(ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
+	current_time = time(NULL);
+	time_elapsed = 0.001 + difftime(current_time,	/* the 0.001 presumably keeps the divisor non-zero */
+					buf_pool->last_printout_time);
+
+	fprintf(file,
+		"Pages made young %lu, not young %lu\n"
+		"%.2f youngs/s, %.2f non-youngs/s\n"
+		"Pages read %lu, created %lu, written %lu\n"
+		"%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+		(ulong) buf_pool->stat.n_pages_made_young,
+		(ulong) buf_pool->stat.n_pages_not_made_young,
+		(buf_pool->stat.n_pages_made_young
+		 - buf_pool->old_stat.n_pages_made_young)
+		/ time_elapsed,
+		(buf_pool->stat.n_pages_not_made_young
+		 - buf_pool->old_stat.n_pages_not_made_young)
+		/ time_elapsed,
+		(ulong) buf_pool->stat.n_pages_read,
+		(ulong) buf_pool->stat.n_pages_created,
+		(ulong) buf_pool->stat.n_pages_written,
+		(buf_pool->stat.n_pages_read
+		 - buf_pool->old_stat.n_pages_read)
+		/ time_elapsed,
+		(buf_pool->stat.n_pages_created
+		 - buf_pool->old_stat.n_pages_created)
+		/ time_elapsed,
+		(buf_pool->stat.n_pages_written
+		 - buf_pool->old_stat.n_pages_written)
+		/ time_elapsed);
+
+	n_gets_diff = buf_pool->stat.n_page_gets - buf_pool->old_stat.n_page_gets;
+
+	if (n_gets_diff) {	/* the per-mille rates below divide by the number of page gets */
+		fprintf(file,
+			"Buffer pool hit rate %lu / 1000,"
+			" young-making rate %lu / 1000 not %lu / 1000\n",
+			(ulong)
+			(1000 - ((1000 * (buf_pool->stat.n_pages_read
+					  - buf_pool->old_stat.n_pages_read))
+				 / (buf_pool->stat.n_page_gets
+				    - buf_pool->old_stat.n_page_gets))),
+			(ulong)
+			(1000 * (buf_pool->stat.n_pages_made_young
+				 - buf_pool->old_stat.n_pages_made_young)
+			 / n_gets_diff),
+			(ulong)
+			(1000 * (buf_pool->stat.n_pages_not_made_young
+				 - buf_pool->old_stat.n_pages_not_made_young)
+			 / n_gets_diff));
+	} else {
+		fputs("No buffer pool page gets since the last printout\n",
+		      file);
+	}
+
+	/* Statistics about read ahead algorithm */
+	fprintf(file, "Pages read ahead %.2f/s,"
+		" evicted without access %.2f/s\n",
+		(buf_pool->stat.n_ra_pages_read
+		- buf_pool->old_stat.n_ra_pages_read)
+		/ time_elapsed,
+		(buf_pool->stat.n_ra_pages_evicted
+		- buf_pool->old_stat.n_ra_pages_evicted)
+		/ time_elapsed);
+
+	/* Print some values to help us with visualizing what is
+	happening with LRU eviction. Every %lu argument is cast to ulong. */
+	fprintf(file,
+		"LRU len: %lu, unzip_LRU len: %lu\n"
+		"I/O sum[%lu]:cur[%lu], unzip sum[%lu]:cur[%lu]\n",
+		(ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+		(ulong) UT_LIST_GET_LEN(buf_pool->unzip_LRU),
+		(ulong) buf_LRU_stat_sum.io, (ulong) buf_LRU_stat_cur.io,
+		(ulong) buf_LRU_stat_sum.unzip, (ulong) buf_LRU_stat_cur.unzip);
+
+	buf_refresh_io_stats();
+	/* buf_pool_mutex_exit() is disabled; the four mutexes are released instead */
+	mutex_exit(&LRU_list_mutex);
+	mutex_exit(&free_list_mutex);
+	mutex_exit(&buf_pool_mutex);
+	mutex_exit(&flush_list_mutex);
+}
+
+/**********************************************************************//**
+Snapshots buf_pool->stat into old_stat and stamps last_printout_time with the current time. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(void)
+/*======================*/
+{
+	buf_pool->old_stat = buf_pool->stat;
+	buf_pool->last_printout_time = time(NULL);
+}
+
+/*********************************************************************//**
+Asserts that no chunk reports a block that is still fixed or dirty.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void)
+/*===============*/
+{
+	buf_chunk_t*	chunk;
+	buf_chunk_t*	end;
+
+	ut_ad(buf_pool);
+
+	/* Optimistic scan: the buf_pool_mutex_enter()/exit() pair around
+	this loop is disabled, so no mutex is held here. */
+
+	chunk = buf_pool->chunks;
+	end = chunk + buf_pool->n_chunks;
+
+	for (; chunk != end; chunk++) {
+		const buf_block_t* bad = buf_chunk_not_freed(chunk);
+
+		if (!UNIV_LIKELY_NULL(bad)) {
+
+			continue;
+		}
+
+		fprintf(stderr, "Page %lu %lu still fixed or dirty\n",
+			(ulong) bad->page.space, (ulong) bad->page.offset);
+		ut_error;
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Samples the pending read and flush counters under buf_pool_mutex and
+reports whether all of them are zero.
+@return	TRUE if there is no pending i/o */
+UNIV_INTERN
+ibool
+buf_pool_check_no_pending_io(void)
+/*==============================*/
+{
+	ulint	n_pending;
+
+	/* buf_pool_mutex is taken in place of the disabled buf_pool_mutex_enter(). */
+	mutex_enter(&buf_pool_mutex);
+
+	n_pending = buf_pool->n_pend_reads
+		+ buf_pool->n_flush[BUF_FLUSH_LRU]
+		+ buf_pool->n_flush[BUF_FLUSH_LIST]
+		+ buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE];
+
+	mutex_exit(&buf_pool_mutex);
+
+	/* Any outstanding read or flush write counts as pending i/o. */
+	if (n_pending != 0) {
+		return(FALSE);
+	}
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Samples the length of buf_pool->free while holding free_list_mutex.
+@return	length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void)
+/*=======================*/
+{
+	ulint	n_free;
+
+	/* free_list_mutex is held for the read; the former
+	buf_pool_mutex_enter()/exit() calls are disabled. */
+	mutex_enter(&free_list_mutex);
+
+	n_free = UT_LIST_GET_LEN(buf_pool->free);
+
+	mutex_exit(&free_list_mutex);
+
+	return(n_free);
+}
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a block as a BUF_BLOCK_FILE_PAGE for (space, offset), for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space
+				in units of a page */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	buf_block_t*	block)	/*!< in/out: block to init */
+{
+	block->page.state	= BUF_BLOCK_FILE_PAGE;
+	block->page.space	= space;
+	block->page.offset	= offset;
+
+	page_zip_des_init(&block->page.zip);	/* initialized first; size and data are set below */
+
+	/* We assume that block->page.data has been allocated
+	with zip_size == UNIV_PAGE_SIZE. */
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(ut_is_2pow(zip_size));
+	page_zip_set_size(&block->page.zip, zip_size);
+	if (zip_size) {
+		block->page.zip.data = block->frame + UNIV_PAGE_SIZE;	/* points just past the uncompressed frame */
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0flu.c b/storage/xtradb/buf/buf0flu.c
new file mode 100644
index 00000000000..0a03d583549
--- /dev/null
+++ b/storage/xtradb/buf/buf0flu.c
@@ -0,0 +1,1781 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0flu.c
+The database buffer buf_pool flush algorithm
+
+Created 11/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0flu.h"
+
+#ifdef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "page0page.h"
+#include "fil0fil.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+#include "os0file.h"
+#include "trx0sys.h"
+
+/**********************************************************************
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing not only depends on how many
+dirty pages we have in the buffer pool but it is also a function of
+how much redo the workload is generating and at what rate. */
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_flush_stat_update(). */
+#define BUF_FLUSH_STAT_N_INTERVAL 20
+
+/** Sampled values of buf_flush_stat_cur, one slot per interval.
+Not protected by any mutex.  Updated by buf_flush_stat_update(). */
+static buf_flush_stat_t	buf_flush_stat_arr[BUF_FLUSH_STAT_N_INTERVAL];
+
+/** Cursor to buf_flush_stat_arr[]. Updated in a round-robin fashion. */
+static ulint		buf_flush_stat_arr_ind;
+
+/** Values at start of the current interval. Reset by
+buf_flush_stat_update(). */
+static buf_flush_stat_t	buf_flush_stat_cur;
+
+/** Running sum of past values of buf_flush_stat_cur.
+Updated by buf_flush_stat_update(). Not protected by any mutex. */
+static buf_flush_stat_t	buf_flush_stat_sum;
+
+/** Number of pages flushed through flushes other than flush_list flushes. */
+static ulint buf_lru_flush_page_count = 0;
+
+/* @} */
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return	TRUE if ok */
+static
+ibool
+buf_flush_validate_low(void);
+/*========================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/********************************************************************//**
+Inserts a block into the flush_rbt and returns a pointer to its
+predecessor or NULL if no predecessor. The ordering is maintained
+on the basis of the <oldest_modification, space, offset> key.
+@return pointer to the predecessor or NULL if no predecessor. */
+static
+buf_page_t*
+buf_flush_insert_in_flush_rbt(
+/*==========================*/
+	buf_page_t*	bpage)		/*!< in: bpage to be inserted. */
+{
+	buf_page_t*		prev = NULL;
+	const ib_rbt_node_t*	c_node;	/*!< node created for bpage */
+	const ib_rbt_node_t*	p_node;	/*!< node preceding c_node, or NULL */
+
+	/* buf_pool_mutex_own() is no longer asserted; flush_list_mutex must be held */
+	ut_ad(mutex_own(&flush_list_mutex));
+
+	/* Insert this buffer into the rbt; the pointer itself is what gets stored. */
+	c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
+	ut_a(c_node != NULL);
+
+	/* Get the predecessor. */
+	p_node = rbt_prev(buf_pool->flush_rbt, c_node);
+
+	if (p_node != NULL) {
+		prev = *rbt_value(buf_page_t*, p_node);	/* node value is a buf_page_t* */
+		ut_a(prev != NULL);
+	}
+
+	return(prev);
+}
+
+/********************************************************************//**
+Deletes a bpage from the flush_rbt; the caller must hold flush_list_mutex. */
+static
+void
+buf_flush_delete_from_flush_rbt(
+/*============================*/
+	buf_page_t*	bpage)		/*!< in: bpage to be removed. */
+{
+
+	ibool	ret = FALSE;	/*!< result of rbt_delete(); only checked by ut_ad() */
+
+	/* buf_pool_mutex_own() is no longer asserted; flush_list_mutex is required instead */
+	ut_ad(mutex_own(&flush_list_mutex));
+	ret = rbt_delete(buf_pool->flush_rbt, &bpage);
+	ut_ad(ret);
+}
+
+/********************************************************************//**
+Compare two modified blocks in the buffer pool. The key for comparison
+is:
+key = <oldest_modification, space, offset>
+This comparison is used to maintain ordering of blocks in the
+buf_pool->flush_rbt.
+Note that for the purpose of flush_rbt, we only need to order blocks
+on the oldest_modification. The other two fields are used to uniquely
+identify the blocks.
+@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
+static
+int
+buf_flush_block_cmp(
+/*================*/
+	const void*	p1,		/*!< in: pointer to the buf_page_t* of block1 */
+	const void*	p2)		/*!< in: pointer to the buf_page_t* of block2 */
+{
+	int		ret;
+	const buf_page_t* b1;
+	const buf_page_t* b2;
+
+	ut_ad(p1 != NULL);
+	ut_ad(p2 != NULL);
+
+	b1 = *(const buf_page_t**) p1;
+	b2 = *(const buf_page_t**) p2;
+
+	ut_ad(b1 != NULL);
+	ut_ad(b2 != NULL);
+
+	ut_ad(b1->in_flush_list);
+	ut_ad(b2->in_flush_list);
+
+	if (b2->oldest_modification
+	    > b1->oldest_modification) {
+		return(1);
+	}
+
+	if (b2->oldest_modification
+	    < b1->oldest_modification) {
+		return(-1);
+	}
+
+	/* Same oldest_modification: decide on the space. Compare explicitly;
+	casting a difference to int can flip the sign or truncate to 0. */
+	ret = (b2->space > b1->space) - (b2->space < b1->space);
+	/* Or else decide ordering on the offset field, the same way. */
+	return(ret ? ret : (b2->offset > b1->offset) - (b2->offset < b1->offset));
+}
+
+/********************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void)
+/*==========================*/
+{
+	/* buf_pool_mutex_enter() is disabled; flush_list_mutex is held while flush_rbt is set */
+	mutex_enter(&flush_list_mutex);
+
+	/* Create red black tree for speedy insertions in flush list.
+	Its elements are buf_page_t pointers ordered by buf_flush_block_cmp(). */
+	buf_pool->flush_rbt = rbt_create(sizeof(buf_page_t*),
+					 buf_flush_block_cmp);
+	/* buf_pool_mutex_exit() is disabled, matching the disabled enter above */
+	mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Frees up the red-black tree and resets buf_pool->flush_rbt to NULL. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void)
+/*==========================*/
+{
+	/* buf_pool_mutex_enter() is disabled; flush_list_mutex is held for the whole teardown */
+	mutex_enter(&flush_list_mutex);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+	rbt_free(buf_pool->flush_rbt);
+	buf_pool->flush_rbt = NULL;	/* a NULL flush_rbt makes later insertions skip the tree */
+
+	/* buf_pool_mutex_exit() is disabled, matching the disabled enter above */
+	mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Inserts a modified block at the head of the flush list, or in sorted position while flush_rbt exists. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+	buf_block_t*	block)	/*!< in/out: block which is modified */
+{
+	/* buf_pool_mutex_own() is no longer asserted; block->mutex and flush_list_mutex are */
+	ut_ad(mutex_own(&block->mutex));
+	ut_ad(mutex_own(&flush_list_mutex));
+	ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
+	      || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
+		  <= block->page.oldest_modification));
+
+	/* If we are in the recovery then we need to update the flush
+	red-black tree as well; the sorted insert does both. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_insert_sorted_into_flush_list(block);
+		return;
+	}
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.in_LRU_list);
+	ut_ad(block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_ad(!block->page.in_flush_list);
+	ut_d(block->page.in_flush_list = TRUE);
+	UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		ulint	zip_size = buf_block_get_zip_size(block);
+
+		if (UNIV_UNLIKELY(zip_size)) {
+			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+		} else {
+			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+		}
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+	buf_block_t*	block)	/*!< in/out: block which is modified */
+{
+	buf_page_t*	prev_b;	/*!< page after which block is inserted, or NULL for the head */
+	buf_page_t*	b;
+
+	/* buf_pool_mutex_own() is no longer asserted; block->mutex and flush_list_mutex are */
+	ut_ad(mutex_own(&block->mutex));
+	ut_ad(mutex_own(&flush_list_mutex));
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	ut_ad(block->page.in_LRU_list);
+	ut_ad(block->page.in_page_hash);
+	ut_ad(!block->page.in_zip_hash);
+	ut_ad(!block->page.in_flush_list);
+	ut_d(block->page.in_flush_list = TRUE);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		ulint	zip_size = buf_block_get_zip_size(block);
+
+		if (UNIV_UNLIKELY(zip_size)) {
+			UNIV_MEM_ASSERT_RW(block->page.zip.data, zip_size);
+		} else {
+			UNIV_MEM_ASSERT_RW(block->frame, UNIV_PAGE_SIZE);
+		}
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	prev_b = NULL;
+
+	/* For the most part when this function is called the flush_rbt
+	should not be NULL. In a very rare boundary case it is possible
+	that the flush_rbt has already been freed by the recovery thread
+	before the last page was hooked up in the flush_list by the
+	io-handler thread. In that case we'll just do a simple
+	linear search in the else block. */
+	if (buf_pool->flush_rbt) {
+
+		prev_b = buf_flush_insert_in_flush_rbt(&block->page);
+
+	} else {
+
+		b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+		while (b && b->oldest_modification	/* skip pages with a larger oldest_modification */
+		       > block->page.oldest_modification) {
+			ut_ad(b->in_flush_list);
+			prev_b = b;
+			b = UT_LIST_GET_NEXT(flush_list, b);
+		}
+	}
+
+	if (prev_b == NULL) {
+		UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, &block->page);
+	} else {
+		UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list,
+				     prev_b, &block->page);
+	}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., the transition FILE_PAGE => NOT_USED allowed.
+@return	TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+	buf_page_t*	bpage)	/*!< in: buffer control block; in_LRU_list and
+				buf_page_in_file(bpage) are checked at run time */
+{
+	/* buf_pool_mutex_own() is no longer asserted; only the page mutex is required */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	//ut_ad(bpage->in_LRU_list); /* optimistic use */
+
+	if (UNIV_LIKELY(bpage->in_LRU_list && buf_page_in_file(bpage))) {
+
+		return(bpage->oldest_modification == 0	/* clean, not I/O-fixed, not buffer-fixed */
+		       && buf_page_get_io_fix(bpage) == BUF_IO_NONE
+		       && bpage->buf_fix_count == 0);
+	}
+
+	/* permitted not to own LRU_mutex, so the error report below is disabled */
+/*
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Error: buffer block state %lu"
+		" in the LRU list!\n",
+		(ulong) buf_page_get_state(bpage));
+	ut_print_buf(stderr, bpage, sizeof(buf_page_t));
+	putc('\n', stderr);
+*/
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Returns TRUE if the block is modified and ready for flushing.
+@return	TRUE if can flush immediately */
+UNIV_INLINE
+ibool
+buf_flush_ready_for_flush(
+/*======================*/
+	buf_page_t*	bpage,	/*!< in: buffer control block;
+				buf_page_in_file(bpage) is checked at run time */
+	enum buf_flush	flush_type)/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+	//ut_a(buf_page_in_file(bpage));
+	//ut_ad(buf_pool_mutex_own()); /*optimistic...*/
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+	if (buf_page_in_file(bpage) && bpage->oldest_modification != 0
+	    && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
+		ut_ad(bpage->in_flush_list);
+
+		if (flush_type != BUF_FLUSH_LRU) {
+
+			return(TRUE);
+
+		} else if (bpage->buf_fix_count == 0) {
+
+			/* If we are flushing the LRU list, to avoid deadlocks
+			we require the block not to be bufferfixed, and hence
+			not latched. */
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks and reset its oldest_modification to 0. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
+{
+	/* buf_pool_mutex_own() is no longer asserted; the page mutex must be held */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	mutex_enter(&flush_list_mutex);	/* taken here and released on every path below */
+
+	ut_ad(bpage->in_flush_list);
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+		/* clean compressed pages should not be on the flush list */
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		mutex_exit(&flush_list_mutex);
+		ut_error;
+		return;
+	case BUF_BLOCK_ZIP_DIRTY:
+		buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);	/* dirty compressed page becomes a clean one */
+		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+		buf_LRU_insert_zip_clean(bpage);
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+		break;
+	}
+
+	/* If the flush_rbt is active then delete from it as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_delete_from_flush_rbt(bpage);
+	}
+
+	/* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in comparison function. */
+	ut_d(bpage->in_flush_list = FALSE);
+
+	bpage->oldest_modification = 0;
+
+	ut_d(UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
+			      ut_ad(ut_list_node_313->in_flush_list)));
+	mutex_exit(&flush_list_mutex);
+}
+
+/********************************************************************//**
+Relocates a buffer control block on the flush_list: dpage takes the list
+position of bpage. Note that it is assumed that the contents of bpage has
+already been copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage)	/*!< in/out: destination block */
+{
+	buf_page_t* prev;		/* predecessor of bpage on the flush_list */
+	buf_page_t* prev_b = NULL;	/* predecessor of dpage in the flush_rbt, if active */
+
+	/* buf_pool_mutex_own() is no longer asserted; flush_list_mutex and the page mutex are */
+	ut_ad(mutex_own(&flush_list_mutex));
+
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	ut_ad(bpage->in_flush_list);
+	ut_ad(dpage->in_flush_list);
+
+	/* If recovery is active we must swap the control blocks in
+	the flush_rbt as well. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		buf_flush_delete_from_flush_rbt(bpage);
+		prev_b = buf_flush_insert_in_flush_rbt(dpage);
+	}
+
+	/* Must be done after we have removed it from the flush_rbt
+	because we assert on in_flush_list in comparison function. */
+	ut_d(bpage->in_flush_list = FALSE);
+
+	prev = UT_LIST_GET_PREV(flush_list, bpage);	/* remembered before bpage is unlinked */
+	UT_LIST_REMOVE(flush_list, buf_pool->flush_list, bpage);
+
+	if (prev) {
+		ut_ad(prev->in_flush_list);
+		UT_LIST_INSERT_AFTER(
+			flush_list,
+			buf_pool->flush_list,
+			prev, dpage);
+	} else {
+		UT_LIST_ADD_FIRST(
+			flush_list,
+			buf_pool->flush_list,
+			dpage);
+	}
+
+	/* Just an extra check. Previous in flush_list
+	should be the same control block as in flush_rbt. */
+	ut_a(!buf_pool->flush_rbt || prev_b == prev);
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+	ut_a(buf_flush_validate_low());
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+}
+
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
+{
+	enum buf_flush	flush_type;
+
+	ut_ad(bpage);
+
+	buf_flush_remove(bpage);	/* takes and releases flush_list_mutex itself */
+
+	flush_type = buf_page_get_flush_type(bpage);
+	buf_pool->n_flush[flush_type]--;	/* NOTE(review): no mutex is taken in this function for this update; presumably the caller holds one - confirm */
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		/* Put the block to the end of the LRU list to wait to be
+		moved to the free list */
+
+		buf_LRU_make_block_old(bpage);
+
+		buf_pool->LRU_flush_ended++;
+	}
+
+	/* fprintf(stderr, "n pending flush %lu\n",
+	buf_pool->n_flush[flush_type]); */
+
+	if ((buf_pool->n_flush[flush_type] == 0)
+	    && (buf_pool->init_flush[flush_type] == FALSE)) {
+
+		/* The running flush batch has ended: signal no_flush[flush_type] */
+
+		os_event_set(buf_pool->no_flush[flush_type]);
+	}
+}
+
+/********************************************************************//**
+Waits until the asynchronous tablespace writes posted so far have reached
+the OS and then flushes the tablespace files to disk. */
+static
+void
+buf_flush_sync_datafiles(void)
+/*==========================*/
+{
+	/* Wake possible simulated aio thread to actually post the
+	writes to the operating system */
+	os_aio_simulated_wake_handler_threads();
+
+	/* Wait until all async writes to tablespaces have been posted
+	to the OS */
+	os_aio_wait_until_no_pending_writes();
+
+	/* Now we flush the data to disk (for example, with fsync) */
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	return;
+}
+
+/********************************************************************//**
+Flushes possible buffered writes from the doublewrite memory buffer to disk,
+posts them to their final positions, and also wakes up the aio thread if
+simulated aio is used. It is very important to call this function after a
+batch of writes has been posted, and also when we may have to wait for a
+page latch! Otherwise a deadlock of threads can occur. */
+static
+void
+buf_flush_buffered_writes(void)
+/*===========================*/
+{
+	byte*		write_buf;
+	ulint		len;
+	ulint		len2;
+	ulint		i;
+
+	if (!srv_use_doublewrite_buf || trx_doublewrite == NULL) {
+		/* No doublewrite buffer: only sync the writes to the disk. */
+		buf_flush_sync_datafiles();
+		return;
+	}
+
+	mutex_enter(&(trx_doublewrite->mutex));
+
+	/* Write first to doublewrite buffer blocks. We use synchronous
+	aio and thus know that file write has been completed when the
+	control returns. */
+
+	if (trx_doublewrite->first_free == 0) {
+
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		return;
+	}
+
+	for (i = 0; i < trx_doublewrite->first_free; i++) {
+
+		const buf_block_t*	block;
+
+		block = (buf_block_t*) trx_doublewrite->buf_block_arr[i];
+
+		if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE
+		    || block->page.zip.data) {
+			/* No simple validate for compressed pages exists. */
+			continue;
+		}
+
+		if (UNIV_UNLIKELY
+		    (memcmp(block->frame + (FIL_PAGE_LSN + 4),
+			    block->frame + (UNIV_PAGE_SIZE
+					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+			    4))) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ERROR: The page to be written"
+				" seems corrupt!\n"
+				"InnoDB: The lsn fields do not match!"
+				" Noticed in the buffer pool\n"
+				"InnoDB: before posting to the"
+				" doublewrite buffer.\n");
+		}
+
+		if (!block->check_index_page_at_flush) { /* validation not requested */
+		} else if (page_is_comp(block->frame)) {
+			if (UNIV_UNLIKELY
+			    (!page_simple_validate_new(block->frame))) {
+corrupted_page:
+				buf_page_print(block->frame, 0);
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Apparent corruption of an"
+					" index page n:o %lu in space %lu\n"
+					"InnoDB: to be written to data file."
+					" We intentionally crash server\n"
+					"InnoDB: to prevent corrupt data"
+					" from ending up in data\n"
+					"InnoDB: files.\n",
+					(ulong) buf_block_get_page_no(block),
+					(ulong) buf_block_get_space(block));
+
+				ut_error;
+			}
+		} else if (UNIV_UNLIKELY
+			   (!page_simple_validate_old(block->frame))) {
+
+			goto corrupted_page;
+		}
+	}
+
+	/* increment the doublewrite flushed pages counter */
+	srv_dblwr_pages_written+= trx_doublewrite->first_free;
+	srv_dblwr_writes++;
+
+	len = ut_min(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE,
+		     trx_doublewrite->first_free) * UNIV_PAGE_SIZE;
+
+	write_buf = trx_doublewrite->write_buf;
+	i = 0; /* index into buf_block_arr, carried across both block loops */
+
+	fil_io(OS_FILE_WRITE, TRUE,
+	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
+	       trx_doublewrite->block1, 0, len,
+	       (void*) write_buf, NULL);
+
+	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
+	     len2 += UNIV_PAGE_SIZE, i++) {
+		const buf_block_t* block = (buf_block_t*)
+			trx_doublewrite->buf_block_arr[i];
+
+		if (UNIV_LIKELY(!block->page.zip.data)
+		    && UNIV_LIKELY(buf_block_get_state(block)
+				   == BUF_BLOCK_FILE_PAGE)
+		    && UNIV_UNLIKELY
+		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
+			    write_buf + len2
+			    + (UNIV_PAGE_SIZE
+			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ERROR: The page to be written"
+				" seems corrupt!\n"
+				"InnoDB: The lsn fields do not match!"
+				" Noticed in the doublewrite block1.\n");
+		}
+	}
+
+	if (trx_doublewrite->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		goto flush; /* everything fitted in block1 */
+	}
+
+	len = (trx_doublewrite->first_free - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+		* UNIV_PAGE_SIZE;
+
+	write_buf = trx_doublewrite->write_buf
+		+ TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE;
+	ut_ad(i == TRX_SYS_DOUBLEWRITE_BLOCK_SIZE);
+
+	fil_io(OS_FILE_WRITE, TRUE,
+	       (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE), 0,
+	       trx_doublewrite->block2, 0, len,
+	       (void*) write_buf, NULL);
+
+	for (len2 = 0; len2 + UNIV_PAGE_SIZE <= len;
+	     len2 += UNIV_PAGE_SIZE, i++) {
+		const buf_block_t* block = (buf_block_t*)
+			trx_doublewrite->buf_block_arr[i];
+
+		if (UNIV_LIKELY(!block->page.zip.data)
+		    && UNIV_LIKELY(buf_block_get_state(block)
+				   == BUF_BLOCK_FILE_PAGE)
+		    && UNIV_UNLIKELY
+		    (memcmp(write_buf + len2 + (FIL_PAGE_LSN + 4),
+			    write_buf + len2
+			    + (UNIV_PAGE_SIZE
+			       - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), 4))) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ERROR: The page to be"
+				" written seems corrupt!\n"
+				"InnoDB: The lsn fields do not match!"
+				" Noticed in"
+				" the doublewrite block2.\n");
+		}
+	}
+
+flush:
+	/* Now flush the doublewrite buffer data to disk */
+
+	fil_flush(srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+
+	/* We know that the writes have been flushed to disk now
+	and in recovery we will find them in the doublewrite buffer
+	blocks. Next do the writes to the intended positions. */
+
+	for (i = 0; i < trx_doublewrite->first_free; i++) {
+		const buf_block_t* block = (buf_block_t*)
+			trx_doublewrite->buf_block_arr[i];
+
+		ut_a(buf_page_in_file(&block->page));
+		if (UNIV_LIKELY_NULL(block->page.zip.data)) {
+			fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+			       FALSE, buf_page_get_space(&block->page),
+			       buf_page_get_zip_size(&block->page),
+			       buf_page_get_page_no(&block->page), 0,
+			       buf_page_get_zip_size(&block->page),
+			       (void*)block->page.zip.data,
+			       (void*)block);
+
+			/* Increment the counter of I/O operations used
+			for selecting LRU policy. */
+			buf_LRU_stat_inc_io();
+
+			continue;
+		}
+
+		ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+		if (UNIV_UNLIKELY(memcmp(block->frame + (FIL_PAGE_LSN + 4),
+					 block->frame
+					 + (UNIV_PAGE_SIZE
+					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4),
+					 4))) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ERROR: The page to be written"
+				" seems corrupt!\n"
+				"InnoDB: The lsn fields do not match!"
+				" Noticed in the buffer pool\n"
+				"InnoDB: after posting and flushing"
+				" the doublewrite buffer.\n"
+				"InnoDB: Page buf fix count %lu,"
+				" io fix %lu, state %lu\n",
+				(ulong)block->page.buf_fix_count,
+				(ulong)buf_block_get_io_fix(block),
+				(ulong)buf_block_get_state(block));
+		}
+
+		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+		       FALSE, buf_block_get_space(block), 0,
+		       buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
+		       (void*)block->frame, (void*)block);
+
+		/* Increment the counter of I/O operations used
+		for selecting LRU policy. */
+		buf_LRU_stat_inc_io();
+	}
+
+	/* Sync the writes to the disk. */
+	buf_flush_sync_datafiles();
+
+	/* We can now reuse the doublewrite memory buffer: */
+	trx_doublewrite->first_free = 0;
+
+	mutex_exit(&(trx_doublewrite->mutex));
+}
+
+/********************************************************************//**
+Copies a buffer page into the doublewrite memory buffer. If the buffer is
+full, calls buf_flush_buffered_writes and retries until there is free space;
+if this page fills the buffer, the buffer is flushed right away. */
+static
+void
+buf_flush_post_to_doublewrite_buf(
+/*==============================*/
+	buf_page_t*	bpage)	/*!< in: buffer block to write */
+{
+	ulint	zip_size;
+try_again:
+	mutex_enter(&(trx_doublewrite->mutex));
+
+	ut_a(buf_page_in_file(bpage));
+
+	if (trx_doublewrite->first_free
+	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		buf_flush_buffered_writes();
+
+		goto try_again;
+	}
+
+	zip_size = buf_page_get_zip_size(bpage);
+
+	if (UNIV_UNLIKELY(zip_size)) {
+		UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size);
+		/* Copy the compressed page and clear the rest. */
+		memcpy(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+		       bpage->zip.data, zip_size);
+		memset(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free
+		       + zip_size, 0, UNIV_PAGE_SIZE - zip_size);
+	} else {
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+		UNIV_MEM_ASSERT_RW(((buf_block_t*) bpage)->frame,
+				   UNIV_PAGE_SIZE);
+
+		memcpy(trx_doublewrite->write_buf
+		       + UNIV_PAGE_SIZE * trx_doublewrite->first_free,
+		       ((buf_block_t*) bpage)->frame, UNIV_PAGE_SIZE);
+	}
+
+	trx_doublewrite->buf_block_arr[trx_doublewrite->first_free] = bpage;
+
+	trx_doublewrite->first_free++;
+
+	if (trx_doublewrite->first_free
+	    >= 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+		mutex_exit(&(trx_doublewrite->mutex));
+
+		buf_flush_buffered_writes();
+
+		return;
+	}
+
+	mutex_exit(&(trx_doublewrite->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes a page for writing to the tablespace: stamps LSN and checksums. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*		page,		/*!< in/out: page */
+	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
+	ib_uint64_t	newest_lsn)	/*!< in: newest modification lsn
+					to the page */
+{
+	ut_ad(page);
+
+	if (page_zip_) {
+		page_zip_des_t*	page_zip = page_zip_;
+		ulint		zip_size = page_zip_get_size(page_zip);
+		ut_ad(zip_size);
+		ut_ad(ut_is_2pow(zip_size));
+		ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+		switch (UNIV_EXPECT(fil_page_get_type(page), FIL_PAGE_INDEX)) {
+		case FIL_PAGE_TYPE_ALLOCATED:
+		case FIL_PAGE_INODE:
+		case FIL_PAGE_IBUF_BITMAP:
+		case FIL_PAGE_TYPE_FSP_HDR:
+		case FIL_PAGE_TYPE_XDES:
+			/* These are essentially uncompressed pages. */
+			memcpy(page_zip->data, page, zip_size);
+			/* fall through */
+		case FIL_PAGE_TYPE_ZBLOB:
+		case FIL_PAGE_TYPE_ZBLOB2:
+		case FIL_PAGE_INDEX:
+			mach_write_ull(page_zip->data
+				       + FIL_PAGE_LSN, newest_lsn);
+			memset(page_zip->data + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+			mach_write_to_4(page_zip->data
+					+ FIL_PAGE_SPACE_OR_CHKSUM,
+					srv_use_checksums
+					? page_zip_calc_checksum(
+						page_zip->data, zip_size)
+					: BUF_NO_CHECKSUM_MAGIC);
+			return;
+		}
+
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: ERROR: The compressed page to be written"
+		      " seems corrupt:", stderr);
+		ut_print_buf(stderr, page, zip_size);
+		fputs("\nInnoDB: Possibly older version of the page:", stderr);
+		ut_print_buf(stderr, page_zip->data, zip_size);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	/* Write the newest modification lsn to the page header and trailer */
+	mach_write_ull(page + FIL_PAGE_LSN, newest_lsn);
+
+	mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+		       newest_lsn);
+
+	/* Store the new formula checksum */
+
+	mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+			srv_use_checksums
+			? (!srv_fast_checksum
+			   ? buf_calc_page_new_checksum(page)
+			   : buf_calc_page_new_checksum_32(page))
+			: BUF_NO_CHECKSUM_MAGIC);
+
+	/* We overwrite the first 4 bytes of the end lsn field to store
+	the old formula checksum. Since it depends also on the field
+	FIL_PAGE_SPACE_OR_CHKSUM, it has to be calculated after storing the
+	new formula checksum. */
+
+	mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+			srv_use_checksums
+			? buf_calc_page_old_checksum(page)
+			: BUF_NO_CHECKSUM_MAGIC);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Does an asynchronous write of a buffer page, after forcing the log to disk
+(unless UNIV_LOG_DEBUG). NOTE: in simulated aio and also when the doublewrite
+buffer is used, we must call buf_flush_buffered_writes after posting a batch! */
+static
+void
+buf_flush_write_block_low(
+/*======================*/
+	buf_page_t*	bpage)	/*!< in: buffer block to write */
+{
+	ulint	zip_size	= buf_page_get_zip_size(bpage);
+	page_t*	frame		= NULL;
+#ifdef UNIV_LOG_DEBUG
+	static ibool univ_log_debug_warned;
+#endif /* UNIV_LOG_DEBUG */
+
+	ut_ad(buf_page_in_file(bpage));
+
+	/* We are not holding buf_pool_mutex or block_mutex here.
+	Nevertheless, it is safe to access bpage, because it is
+	io_fixed and oldest_modification != 0.  Thus, it cannot be
+	relocated in the buffer pool or removed from flush_list or
+	LRU_list. */
+	//ut_ad(!buf_pool_mutex_own());
+	ut_ad(!mutex_own(&LRU_list_mutex));
+	ut_ad(!mutex_own(&flush_list_mutex));
+	ut_ad(!mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
+	ut_ad(bpage->oldest_modification != 0);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif
+	ut_ad(bpage->newest_modification != 0);
+
+#ifdef UNIV_LOG_DEBUG
+	if (!univ_log_debug_warned) {
+		univ_log_debug_warned = TRUE;
+		fputs("Warning: cannot force log to disk if"
+		      " UNIV_LOG_DEBUG is defined!\n"
+		      "Crash recovery will not work!\n",
+		      stderr);
+	}
+#else
+	/* Force the log to the disk before writing the modified block */
+	log_write_up_to(bpage->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
+#endif
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		break;
+	case BUF_BLOCK_ZIP_DIRTY:
+		frame = bpage->zip.data;
+		if (UNIV_LIKELY(srv_use_checksums)) {
+			ut_a(mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
+			     == page_zip_calc_checksum(frame, zip_size));
+		}
+		mach_write_ull(frame + FIL_PAGE_LSN,
+			       bpage->newest_modification);
+		memset(frame + FIL_PAGE_FILE_FLUSH_LSN, 0, 8);
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		frame = bpage->zip.data; /* write the compressed copy if any */
+		if (!frame) {
+			frame = ((buf_block_t*) bpage)->frame;
+		}
+
+		buf_flush_init_for_writing(((buf_block_t*) bpage)->frame,
+					   bpage->zip.data
+					   ? &bpage->zip : NULL,
+					   bpage->newest_modification);
+		break;
+	}
+
+	if (!srv_use_doublewrite_buf || !trx_doublewrite) {
+		fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+		       FALSE, buf_page_get_space(bpage), zip_size,
+		       buf_page_get_page_no(bpage), 0,
+		       zip_size ? zip_size : UNIV_PAGE_SIZE,
+		       frame, bpage);
+	} else {
+		buf_flush_post_to_doublewrite_buf(bpage);
+	}
+}
+
+/********************************************************************//**
+Writes a flushable page asynchronously from the buffer pool to a file.
+NOTE: in simulated aio we must call
+os_aio_simulated_wake_handler_threads after we have posted a batch of
+writes! NOTE: page_hash_latch and buf_page_get_mutex(bpage) must be
+held upon entering this function, and they will be released by this
+function; buf_pool_mutex is acquired and released internally. */
+static
+void
+buf_flush_page(
+/*===========*/
+	buf_page_t*	bpage,		/*!< in: buffer control block */
+	enum buf_flush	flush_type)	/*!< in: BUF_FLUSH_LRU
+					or BUF_FLUSH_LIST */
+{
+	mutex_t*	block_mutex;
+	ibool		is_uncompressed;
+
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+	//ut_ad(buf_pool_mutex_own());
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)
+	      || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED));
+#endif
+	ut_ad(buf_page_in_file(bpage));
+
+	block_mutex = buf_page_get_mutex(bpage);
+	ut_ad(mutex_own(block_mutex));
+
+	mutex_enter(&buf_pool_mutex);
+	rw_lock_s_unlock(&page_hash_latch);
+
+	ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
+
+	buf_page_set_io_fix(bpage, BUF_IO_WRITE);
+
+	buf_page_set_flush_type(bpage, flush_type);
+
+	if (buf_pool->n_flush[flush_type] == 0) {
+
+		os_event_reset(buf_pool->no_flush[flush_type]);
+	}
+
+	buf_pool->n_flush[flush_type]++;
+
+	is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(is_uncompressed == (block_mutex != &buf_pool_zip_mutex));
+
+	switch (flush_type) {
+		ibool	is_s_latched;
+	case BUF_FLUSH_LIST:
+		/* If the simulated aio thread is not running, we must
+		not wait for any latch, as we may end up in a deadlock:
+		if buf_fix_count == 0, then we know we need not wait */
+
+		is_s_latched = (bpage->buf_fix_count == 0); /* latch now, no wait */
+		if (is_s_latched && is_uncompressed) {
+			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+					   BUF_IO_WRITE);
+		}
+
+		mutex_exit(block_mutex);
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+
+		/* Even though bpage is not protected by any mutex at
+		this point, it is safe to access bpage, because it is
+		io_fixed and oldest_modification != 0.  Thus, it
+		cannot be relocated in the buffer pool or removed from
+		flush_list or LRU_list. */
+
+		if (!is_s_latched) {
+			buf_flush_buffered_writes();
+
+			if (is_uncompressed) {
+				rw_lock_s_lock_gen(&((buf_block_t*) bpage)
+						   ->lock, BUF_IO_WRITE);
+			}
+		}
+
+		break;
+
+	case BUF_FLUSH_LRU:
+		/* VERY IMPORTANT:
+		Because any thread may call the LRU flush, even when owning
+		locks on pages, to avoid deadlocks, we must make sure that the
+		s-lock is acquired on the page without waiting: this is
+		accomplished because buf_flush_ready_for_flush() must hold,
+		and that requires the page not to be bufferfixed. */
+
+		if (is_uncompressed) {
+			rw_lock_s_lock_gen(&((buf_block_t*) bpage)->lock,
+					   BUF_IO_WRITE);
+		}
+
+		/* Note that the s-latch is acquired before releasing the
+		buf_pool mutex: this ensures that the latch is acquired
+		immediately. */
+
+		mutex_exit(block_mutex);
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	/* Even though bpage is not protected by any mutex at this
+	point, it is safe to access bpage, because it is io_fixed and
+	oldest_modification != 0.  Thus, it cannot be relocated in the
+	buffer pool or removed from flush_list or LRU_list. */
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Flushing %u space %u page %u\n",
+			flush_type, bpage->space, bpage->offset);
+	}
+#endif /* UNIV_DEBUG */
+	buf_flush_write_block_low(bpage);
+}
+
+/***********************************************************//**
+Flushes to disk all flushable pages within the flush area.
+@return	number of pages flushed */
+static
+ulint
+buf_flush_try_neighbors(
+/*====================*/
+	ulint		space,		/*!< in: space id */
+	ulint		offset,		/*!< in: page offset */
+	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST */
+	ulint		flush_neighbors)/*!< in: if 0, only the page at offset is tried */
+{
+	buf_page_t*	bpage;
+	ulint		low, high;
+	ulint		count		= 0;
+	ulint		i;
+
+	ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
+
+	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN || !flush_neighbors) {
+		/* If there is little space, it is better not to flush any
+		block except from the end of the LRU list */
+
+		low = offset;
+		high = offset + 1;
+	} else {
+		/* When flushed, dirty blocks are searched in neighborhoods of
+		this size, and flushed along with the original page. */
+
+		ulint	buf_flush_area	= ut_min(BUF_READ_AHEAD_AREA,
+						 buf_pool->curr_size / 16);
+
+		low = (offset / buf_flush_area) * buf_flush_area;
+		high = (offset / buf_flush_area + 1) * buf_flush_area;
+	}
+
+	/* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */
+
+	if (high > fil_space_get_size(space)) {
+		high = fil_space_get_size(space);
+	}
+
+	//buf_pool_mutex_enter();
+	rw_lock_s_lock(&page_hash_latch);
+
+	for (i = low; i < high; i++) {
+
+		bpage = buf_page_hash_get(space, i);
+
+		if (!bpage) {
+
+			continue;
+		}
+
+		ut_a(buf_page_in_file(bpage));
+
+		/* We avoid flushing 'non-old' blocks in an LRU flush,
+		because the flushed blocks are soon freed */
+
+		if (flush_type != BUF_FLUSH_LRU
+		    || i == offset
+		    || buf_page_is_old(bpage)) {
+			mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+
+			if (block_mutex && buf_flush_ready_for_flush(bpage, flush_type)
+			    && (i == offset || !bpage->buf_fix_count)) {
+				/* We only try to flush those
+				neighbors != offset where the buf fix count is
+				zero, as we then know that we probably can
+				latch the page without a semaphore wait.
+				Semaphore waits are expensive because we must
+				flush the doublewrite buffer before we start
+				waiting. */
+
+				buf_flush_page(bpage, flush_type);
+				ut_ad(!mutex_own(block_mutex));
+				count++;
+
+				//buf_pool_mutex_enter();
+				rw_lock_s_lock(&page_hash_latch); /* released by buf_flush_page() */
+			} else if (block_mutex) {
+				mutex_exit(block_mutex);
+			}
+		}
+	}
+
+	//buf_pool_mutex_exit();
+	rw_lock_s_unlock(&page_hash_latch);
+
+	return(count);
+}
+
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+					then the caller must not own any
+					latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	ib_uint64_t	lsn_limit)	/*!< in the case BUF_FLUSH_LIST all
+					blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+{
+	buf_page_t*	bpage;
+	buf_page_t*	prev_bpage	= NULL;
+	ulint		page_count	= 0;
+	ulint		old_page_count;
+	ulint		space;
+	ulint		offset;
+	ulint		remaining	= 0; /* flush_list length; unused for LRU */
+
+	ut_ad((flush_type == BUF_FLUSH_LRU)
+	      || (flush_type == BUF_FLUSH_LIST));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad((flush_type != BUF_FLUSH_LIST)
+	      || sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+	//buf_pool_mutex_enter();
+	mutex_enter(&buf_pool_mutex);
+
+	if ((buf_pool->n_flush[flush_type] > 0)
+	    || (buf_pool->init_flush[flush_type] == TRUE)) {
+
+		/* There is already a flush batch of the same type running */
+
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+
+		return(ULINT_UNDEFINED);
+	}
+
+	buf_pool->init_flush[flush_type] = TRUE;
+
+	mutex_exit(&buf_pool_mutex);
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		mutex_enter(&LRU_list_mutex);
+	}
+
+	for (;;) {
+flush_next:
+		/* If we have flushed enough, leave the loop */
+		if (page_count >= min_n) {
+
+			break;
+		}
+
+		/* Start from the end of the list looking for a suitable
+		block to be flushed. */
+
+		if (flush_type == BUF_FLUSH_LRU) {
+			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+		} else {
+			ut_ad(flush_type == BUF_FLUSH_LIST);
+
+			mutex_enter(&flush_list_mutex);
+			remaining = UT_LIST_GET_LEN(buf_pool->flush_list);
+			bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+			if (bpage) {
+				prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
+			}
+			mutex_exit(&flush_list_mutex);
+			if (!bpage
+			    || bpage->oldest_modification >= lsn_limit) {
+				/* We have flushed enough */
+
+				break;
+			}
+			ut_ad(bpage->in_flush_list);
+		}
+
+		/* Note that after finding a single flushable page, we try to
+		flush also all its neighbors, and after that start from the
+		END of the LRU list or flush list again: the list may change
+		during the flushing and we cannot safely preserve within this
+		function a pointer to a block in the list! */
+
+		do {
+			mutex_t*block_mutex = buf_page_get_mutex_enter(bpage);
+			ibool	ready;
+
+			//ut_a(buf_page_in_file(bpage));
+
+			if (block_mutex) {
+				ready = buf_flush_ready_for_flush(bpage, flush_type);
+				mutex_exit(block_mutex);
+			} else {
+				ready = FALSE;
+			}
+
+			if (ready) {
+				space = buf_page_get_space(bpage);
+				offset = buf_page_get_page_no(bpage);
+
+				//buf_pool_mutex_exit();
+				if (flush_type == BUF_FLUSH_LRU) {
+					mutex_exit(&LRU_list_mutex);
+				}
+
+				old_page_count = page_count; /* only for the disabled trace below */
+
+				/* Try to flush also all the neighbors */
+				page_count += buf_flush_try_neighbors(
+					space, offset, flush_type, srv_flush_neighbor_pages);
+				/* fprintf(stderr,
+				"Flush type %lu, page no %lu, neighb %lu\n",
+				flush_type, offset,
+				page_count - old_page_count); */
+
+				//buf_pool_mutex_enter();
+				if (flush_type == BUF_FLUSH_LRU) {
+					mutex_enter(&LRU_list_mutex);
+				}
+				goto flush_next;
+
+			} else if (flush_type == BUF_FLUSH_LRU) {
+				bpage = UT_LIST_GET_PREV(LRU, bpage);
+			} else {
+				ut_ad(flush_type == BUF_FLUSH_LIST);
+
+				mutex_enter(&flush_list_mutex);
+				bpage = UT_LIST_GET_PREV(flush_list, bpage);
+				//ut_ad(!bpage || bpage->in_flush_list); /* optimistic */
+				if (bpage != prev_bpage) {
+					/* the list has changed: the search may warp.. retrying */
+					bpage = NULL;
+				}
+				if (bpage) {
+					prev_bpage = UT_LIST_GET_PREV(flush_list, bpage);
+				}
+				mutex_exit(&flush_list_mutex);
+				remaining--;
+			}
+		} while (bpage != NULL);
+
+		if (remaining)
+			goto flush_next;
+
+		/* If we could not find anything to flush, leave the loop */
+
+		break;
+	}
+
+	if (flush_type == BUF_FLUSH_LRU) {
+		mutex_exit(&LRU_list_mutex);
+	}
+
+	mutex_enter(&buf_pool_mutex);
+
+	buf_pool->init_flush[flush_type] = FALSE;
+
+	if (buf_pool->n_flush[flush_type] == 0) {
+
+		/* The running flush batch has ended */
+
+		os_event_set(buf_pool->no_flush[flush_type]);
+	}
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&buf_pool_mutex);
+
+	buf_flush_buffered_writes();
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && page_count > 0) {
+		ut_a(flush_type == BUF_FLUSH_LRU
+		     || flush_type == BUF_FLUSH_LIST);
+		fprintf(stderr, flush_type == BUF_FLUSH_LRU
+			? "Flushed %lu pages in LRU flush\n"
+			: "Flushed %lu pages in flush list flush\n",
+			(ulong) page_count);
+	}
+#endif /* UNIV_DEBUG */
+
+	srv_buf_pool_flushed += page_count;
+
+	/* We keep track of all flushes happening as part of LRU
+	flush. When estimating the desired rate at which flush_list
+	should be flushed we factor in this value. */
+	if (flush_type == BUF_FLUSH_LRU) {
+		buf_lru_flush_page_count += page_count;
+	}
+
+	return(page_count);
+}
+
+/******************************************************************//**
+Waits on buf_pool->no_flush[type] until a flush batch of that type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	enum buf_flush	type)	/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+{
+	ut_ad((type == BUF_FLUSH_LRU) || (type == BUF_FLUSH_LIST));
+
+	os_event_wait(buf_pool->no_flush[type]);
+}
+
+/******************************************************************//**
+Gives a recommendation of how many blocks should be flushed to establish
+a big enough margin of replaceable blocks near the end of the LRU list
+and in the free list.
+@return number of blocks which should be flushed from the end of the
+LRU list */
+static
+ulint
+buf_flush_LRU_recommendation(void)
+/*==============================*/
+{
+	buf_page_t*	bpage;
+	ulint		n_replaceable;
+	ulint		distance	= 0;
+	ibool		have_LRU_mutex = FALSE; /* TRUE: scan under LRU_list_mutex */
+
+	if(UT_LIST_GET_LEN(buf_pool->unzip_LRU))
+		have_LRU_mutex = TRUE;
+retry:
+	//buf_pool_mutex_enter();
+	if (have_LRU_mutex)
+		mutex_enter(&LRU_list_mutex);
+
+	n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
+
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while ((bpage != NULL)
+	       && (n_replaceable < BUF_FLUSH_FREE_BLOCK_MARGIN
+		   + BUF_FLUSH_EXTRA_MARGIN)
+	       && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+
+		mutex_t* block_mutex;
+		if (!bpage->in_LRU_list) {
+			/* restart. but it is very optimistic */
+			bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+			continue;
+		}
+		block_mutex = buf_page_get_mutex_enter(bpage);
+
+		if (block_mutex && buf_flush_ready_for_replace(bpage)) {
+			n_replaceable++;
+		}
+
+		if (block_mutex) {
+			mutex_exit(block_mutex);
+		}
+
+		distance++;
+
+		bpage = UT_LIST_GET_PREV(LRU, bpage);
+	}
+
+	//buf_pool_mutex_exit();
+	if (have_LRU_mutex)
+		mutex_exit(&LRU_list_mutex);
+
+	if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
+
+		return(0);
+	} else if (!have_LRU_mutex) {
+		/* confirm it again with LRU_mutex for exactness */
+		have_LRU_mutex = TRUE;
+		distance = 0;
+		goto retry;
+	}
+
+	return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
+	       - n_replaceable);
+}
+
+/*********************************************************************//**
+Starts an LRU flush batch when buf_flush_LRU_recommendation() reports too
+small a margin of replaceable pages at the end of the LRU list or in the
+free list. VERY IMPORTANT: this function is called also by threads which
+have locks on pages. To avoid deadlocks, we flush only pages such that the
+s-lock required for flushing can be acquired immediately, without waiting. */
+UNIV_INTERN
+void
+buf_flush_free_margin(
+/*=======================*/
+	ibool	wait)	/*!< in: if TRUE and an LRU batch is already
+			running, wait for that batch to end */
+{
+	const ulint	wanted = buf_flush_LRU_recommendation();
+
+	if (wanted == 0) {
+		/* The margin is already big enough. */
+		return;
+	}
+
+	if (buf_flush_batch(BUF_FLUSH_LRU, wanted, 0) == ULINT_UNDEFINED
+	    && wait) {
+		/* There was an LRU type flush batch already running;
+		let us wait for it to end */
+		buf_flush_wait_batch_end(BUF_FLUSH_LRU);
+	}
+}
+
+/*********************************************************************
+Closes the current statistics interval used by the flush rate heuristics:
+stores the redo generated and the pages flushed by LRU flushes during the
+interval in the history array, keeps the running sums in step, and starts
+a new interval. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void)
+/*=======================*/
+{
+	const ib_uint64_t	cur_lsn = log_get_lsn();
+	buf_flush_stat_t*	slot;
+	ib_uint64_t		redo_delta;
+	ulint			flushed_delta;
+
+	if (buf_flush_stat_cur.redo == 0) {
+		/* First time around: there is no previous interval to
+		close, so only remember where this one starts. */
+		buf_flush_stat_cur.redo = cur_lsn;
+
+		return;
+	}
+
+	/* The history slot that is about to be overwritten. */
+	slot = &buf_flush_stat_arr[buf_flush_stat_arr_ind];
+
+	/* What happened since the current interval was opened. */
+	redo_delta = cur_lsn - buf_flush_stat_cur.redo;
+	flushed_delta = buf_lru_flush_page_count - buf_flush_stat_cur.n_flushed;
+
+	/* Replace the slot's old contribution to the sums by the new
+	one; this has to happen before the slot itself is overwritten. */
+	buf_flush_stat_sum.redo += redo_delta - slot->redo;
+	buf_flush_stat_sum.n_flushed += flushed_delta - slot->n_flushed;
+
+	slot->redo = redo_delta;
+	slot->n_flushed = flushed_delta;
+
+	/* Advance the index, wrapping at BUF_FLUSH_STAT_N_INTERVAL. */
+	buf_flush_stat_arr_ind++;
+	buf_flush_stat_arr_ind %= BUF_FLUSH_STAT_N_INTERVAL;
+
+	/* Open the next interval. */
+	buf_flush_stat_cur.redo = cur_lsn;
+	buf_flush_stat_cur.n_flushed = buf_lru_flush_page_count;
+}
+
+/*********************************************************************
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at a significant rate without corresponding increase
+in the number of dirty pages (for example, an in-memory workload)
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return	number of dirty pages to be flushed / second */
+UNIV_INTERN
+ulint
+buf_flush_get_desired_flush_rate(void)
+/*==================================*/
+{
+	ulint			redo_avg;
+	ulint			lru_flush_avg;
+	ulint			n_dirty;
+	ulint			n_flush_req;
+	ib_uint64_t		lsn = log_get_lsn();
+	ulint			log_capacity = log_get_capacity();
+
+	/* log_capacity should never be zero after the initialization
+	of log subsystem. */
+	ut_ad(log_capacity != 0);
+
+	/* Get total number of dirty pages. It is OK to access
+	flush_list without holding any mutex as we are using this
+	only for heuristics. */
+	n_dirty = UT_LIST_GET_LEN(buf_pool->flush_list);
+
+	/* An overflow can happen if we generate more than 2^32 bytes
+	of redo in this interval i.e.: 4G of redo in 1 second. We can
+	safely consider this as infinity because if we ever come close
+	to 4G we'll start a synchronous flush of dirty pages. */
+	/* redo_avg below is average at which redo is generated in
+	past BUF_FLUSH_STAT_N_INTERVAL + redo generated in the current
+	interval. */
+	redo_avg = (ulint) (buf_flush_stat_sum.redo
+			    / BUF_FLUSH_STAT_N_INTERVAL
+			    + (lsn - buf_flush_stat_cur.redo));
+
+	/* An overflow can happen possibly if we flush more than 2^32
+	pages in BUF_FLUSH_STAT_N_INTERVAL. This is a very very
+	unlikely scenario. Even when this happens it means that our
+	flush rate will be off the mark. It won't affect correctness
+	of any subsystem. */
+	/* lru_flush_avg below is rate at which pages are flushed as
+	part of LRU flush in past BUF_FLUSH_STAT_N_INTERVAL + the
+	number of pages flushed in the current interval. */
+	lru_flush_avg = buf_flush_stat_sum.n_flushed
+			/ BUF_FLUSH_STAT_N_INTERVAL
+			+ (buf_lru_flush_page_count
+			   - buf_flush_stat_cur.n_flushed);
+
+	/* Multiply in 64 bits so that n_dirty * redo_avg cannot wrap
+	around in case ulint is only 32 bits wide. */
+	n_flush_req = (ulint) (((ib_uint64_t) n_dirty * redo_avg)
+			       / log_capacity);
+
+	/* From the flush list we want to flush the required rate minus
+	the number of pages that we are historically flushing from the
+	LRU list; the unsigned compare clamps a deficit to zero. */
+	return(n_flush_req > lru_flush_avg ? n_flush_req - lru_flush_avg : 0);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return	TRUE if ok */
+static
+ibool
+buf_flush_validate_low(void)
+/*========================*/
+{
+	buf_page_t*		bpage;
+	const ib_rbt_node_t*	rnode = NULL;
+
+	UT_LIST_VALIDATE(flush_list, buf_page_t, buf_pool->flush_list,
+			 ut_ad(ut_list_node_313->in_flush_list));
+
+	bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+	/* If we are in recovery mode i.e.: flush_rbt != NULL
+	then each block in the flush_list must also be present
+	in the flush_rbt. */
+	if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+		rnode = rbt_first(buf_pool->flush_rbt);
+	}
+
+	while (bpage != NULL) {
+		const ib_uint64_t om = bpage->oldest_modification;
+		ut_ad(bpage->in_flush_list);
+		//ut_a(buf_page_in_file(bpage)); /* optimistic */
+		ut_a(om > 0);
+
+		if (UNIV_LIKELY_NULL(buf_pool->flush_rbt)) {
+			buf_page_t*	rpage;
+			ut_a(rnode);
+			rpage = *rbt_value(buf_page_t*, rnode);
+			ut_a(rpage);
+			ut_a(rpage == bpage);
+			rnode = rbt_next(buf_pool->flush_rbt, rnode);
+		}
+
+		bpage = UT_LIST_GET_NEXT(flush_list, bpage);
+		/* oldest_modification must not increase along the list */
+		ut_a(!bpage || om >= bpage->oldest_modification);
+	}
+
+	/* By this time we must have exhausted the traversal of
+	flush_rbt (if active) as well. */
+	ut_a(rnode == NULL);
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Validates the flush list while holding flush_list_mutex.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(void)
+/*====================*/
+{
+	ibool	is_valid;
+
+	/* buf_flush_validate_low() walks buf_pool->flush_list */
+	mutex_enter(&flush_list_mutex);
+
+	is_valid = buf_flush_validate_low();
+
+	/* the walk is over; the verdict is returned without the mutex */
+	mutex_exit(&flush_list_mutex);
+
+	return(is_valid);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/buf/buf0lru.c b/storage/xtradb/buf/buf0lru.c
new file mode 100644
index 00000000000..94828940fd4
--- /dev/null
+++ b/storage/xtradb/buf/buf0lru.c
@@ -0,0 +1,2580 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0lru.c
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0lru.h"
+
+#ifdef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "hash0hash.h"
+#include "os0sync.h"
+#include "fil0fil.h"
+#include "btr0btr.h"
+#include "buf0buddy.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0rea.h"
+#include "btr0sea.h"
+#include "ibuf0ibuf.h"
+#include "os0file.h"
+#include "page0zip.h"
+#include "log0recv.h"
+#include "srv0srv.h"
+
+/** The number of blocks from the LRU_old pointer onward, including
+the block pointed to, must be buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+of the whole LRU list length, except that the tolerance defined below
+is allowed. Note that the tolerance must be small enough such that for
+even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
+allowed to point to either end of the LRU list. */
+
+#define BUF_LRU_OLD_TOLERANCE	20
+
+/** The minimum amount of non-old blocks when the LRU_old list exists
+(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
+@see buf_LRU_old_adjust_len */
+#define BUF_LRU_NON_OLD_MIN_LEN	5
+#if BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN
+# error "BUF_LRU_NON_OLD_MIN_LEN >= BUF_LRU_OLD_MIN_LEN"
+#endif
+
+/** When dropping the search hash index entries before deleting an ibd
+file, we build a local array of pages belonging to that tablespace
+in the buffer pool. Following is the size of that array. */
+#define BUF_LRU_DROP_SEARCH_HASH_SIZE	1024
+
+/** If we switch on the InnoDB monitor because there are too few available
+frames in the buffer pool, we set this to TRUE */
+static ibool	buf_lru_switched_on_innodb_mon	= FALSE;
+
+/******************************************************************//**
+These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
+and page_zip_decompress() operations.  Based on the statistics,
+buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
+unzip_LRU or the regular LRU.  From unzip_LRU, we will only evict the
+uncompressed frame (meaning we can evict dirty blocks as well).  From
+the regular LRU, we will evict the entire block (i.e.: both the
+uncompressed and compressed data), which must be clean. */
+
+/* @{ */
+
+/** Number of intervals for which we keep the history of these stats.
+Each interval is 1 second, defined by the rate at which
+srv_error_monitor_thread() calls buf_LRU_stat_update(). */
+#define BUF_LRU_STAT_N_INTERVAL 50
+
+/** Co-efficient with which we multiply I/O operations to equate them
+with page_zip_decompress() operations. */
+#define BUF_LRU_IO_TO_UNZIP_FACTOR 50
+
+/** Sampled values of buf_LRU_stat_cur, one entry per past interval.
+Protected by buf_pool_mutex.  Updated by buf_LRU_stat_update(). */
+static buf_LRU_stat_t		buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
+/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
+static ulint			buf_LRU_stat_arr_ind;
+
+/** Current operation counters.  Not protected by any mutex.  Cleared
+by buf_LRU_stat_update(). */
+UNIV_INTERN buf_LRU_stat_t	buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update().  Protected by buf_pool_mutex. */
+UNIV_INTERN buf_LRU_stat_t	buf_LRU_stat_sum;
+
+/* @} */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for
+"old" blocks.  Protected by buf_pool_mutex. */
+UNIV_INTERN uint	buf_LRU_old_ratio;
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago.  Not protected by any mutex or latch. */
+UNIV_INTERN uint	buf_LRU_old_threshold_ms;
+/* @} */
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed and buf_pool_zip_mutex will be released.
+
+If a compressed page or compressed-only block descriptor is freed, other
+compressed pages or compressed-only block descriptors may be relocated.
+@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
+was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+static
+enum buf_page_state
+buf_LRU_block_remove_hashed_page(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
+				be in a state where it can be freed; there
+				may or may not be a hash index to the page */
+	ibool		zip);	/*!< in: TRUE if should remove also the
+				compressed page of an uncompressed page */
+/******************************************************************//**
+Puts a file page that has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+	buf_block_t*	block,	/*!< in: block, must contain a file page and
+				be in a state where it can be freed */
+	ibool		have_page_hash_mutex);	/*!< in: presumably TRUE if the
+				caller holds page_hash_latch -- TODO confirm */
+
+/******************************************************************//**
+Decides whether the next eviction victim should be looked for in the
+unzip_LRU list rather than in the general LRU list.
+@return	TRUE if should use unzip_LRU */
+UNIV_INLINE
+ibool
+buf_LRU_evict_from_unzip_LRU(
+	ibool		have_LRU_mutex)
+/*==============================*/
+{
+	ulint	io_avg;
+	ulint	unzip_avg;
+	ibool	use_stats	= FALSE;
+	ibool	verdict		= FALSE;
+
+	//ut_ad(buf_pool_mutex_own());
+
+	/* The list lengths are read under LRU_list_mutex; take it here
+	unless have_LRU_mutex says that the caller already holds it. */
+	if (!have_LRU_mutex) {
+		mutex_enter(&LRU_list_mutex);
+	}
+
+	if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
+		/* Empty unzip_LRU: only the LRU can be used. */
+	} else if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
+		   <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
+		/* unzip_LRU is at most 10% of the LRU list: use the LRU,
+		so that hot decompressed pages can stay in the buffer pool. */
+	} else if (buf_pool->freed_page_clock == 0) {
+		/* Eviction has not started yet; by default the
+		workload is assumed to be disk bound. */
+		verdict = TRUE;
+	} else {
+		/* No shortcut applies: let the statistics decide. */
+		use_stats = TRUE;
+	}
+
+	if (!have_LRU_mutex) {
+		mutex_exit(&LRU_list_mutex);
+	}
+
+	if (!use_stats) {
+		return(verdict);
+	}
+
+	/* Average over the past intervals plus the current interval. */
+	io_avg = buf_LRU_stat_cur.io
+		+ buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL;
+	unzip_avg = buf_LRU_stat_cur.unzip
+		+ buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL;
+
+	/* I/O bound load (unzip_avg no larger than the weighted io_avg):
+	evict an uncompressed frame from unzip_LRU.  Otherwise the load
+	is assumed to be CPU bound and the regular LRU is used. */
+	return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
+}
+
+/******************************************************************//**
+Tries to drop the page hash index of each page of one tablespace
+whose page number is listed in arr[], in array order. */
+static
+void
+buf_LRU_drop_page_hash_batch(
+/*=========================*/
+	ulint		space_id,	/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size in bytes
+					or 0 for uncompressed pages */
+	const ulint*	arr,		/*!< in: array of page_no */
+	ulint		count)		/*!< in: number of entries in array */
+{
+	ulint	n_done	= 0;
+
+	ut_ad(arr != NULL);
+	ut_ad(count <= BUF_LRU_DROP_SEARCH_HASH_SIZE);
+	while (n_done < count) {
+		btr_search_drop_page_hash_when_freed(space_id, zip_size,
+						     arr[n_done]);
+		n_done++;
+	}
+}
+
+/******************************************************************//**
+When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
+hash index entries belonging to that table. This function tries to
+do that in batch. Note that this is a 'best effort' attempt and does
+not guarantee that ALL hash entries will be removed. */
+static
+void
+buf_LRU_drop_page_hash_for_tablespace(
+/*==================================*/
+	ulint	id)	/*!< in: space id */
+{
+	buf_page_t*	bpage;
+	ulint*		page_arr;
+	ulint		num_entries;
+	ulint		zip_size;
+
+	zip_size = fil_space_get_zip_size(id);
+
+	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+		/* Somehow, the tablespace does not exist.  Nothing to drop. */
+		ut_ad(0);
+		return;
+	}
+
+	page_arr = ut_malloc(sizeof(ulint)
+			     * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+	/* The scan below walks buf_pool->LRU under LRU_list_mutex. */
+	mutex_enter(&LRU_list_mutex);
+
+scan_again:
+	num_entries = 0;
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (bpage != NULL) {
+		mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
+		buf_page_t*	prev_bpage;
+
+		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+		if (!block_mutex) {
+			goto next_page;
+		}
+
+		ut_a(buf_page_in_file(bpage));
+
+		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
+		    || bpage->space != id
+		    || bpage->buf_fix_count > 0
+		    || bpage->io_fix != BUF_IO_NONE) {
+			/* We leave the fixed pages as is in this scan.
+			To be dealt with later in the final scan. */
+			mutex_exit(block_mutex);
+			goto next_page;
+		}
+
+		if (((buf_block_t*) bpage)->is_hashed) {
+
+			/* Store the offset(i.e.: page_no) in the array
+			so that we can drop hash index in a batch
+			later.  Check the bound before writing. */
+			ut_a(num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE);
+			page_arr[num_entries] = bpage->offset;
+			mutex_exit(block_mutex);
+			++num_entries;
+
+			if (num_entries < BUF_LRU_DROP_SEARCH_HASH_SIZE) {
+				goto next_page;
+			}
+			/* Array full. We release the LRU_list_mutex
+			before dropping the batch, as the original code
+			did with buf_pool_mutex to obey the latching order. */
+			mutex_exit(&LRU_list_mutex);
+
+			buf_LRU_drop_page_hash_batch(id, zip_size, page_arr,
+						     num_entries);
+			num_entries = 0;
+			/* Resume the scan under LRU_list_mutex. */
+			mutex_enter(&LRU_list_mutex);
+		} else {
+			mutex_exit(block_mutex);
+		}
+
+next_page:
+		/* Note that we may have released the LRU_list_mutex
+		above after reading the prev_bpage during processing
+		of a page_hash_batch (i.e.: when the array was full).
+		This means that prev_bpage can change in LRU list.
+		This is OK because this function is a 'best effort'
+		to drop as many search hash entries as possible and
+		it does not guarantee that ALL such entries will be
+		dropped. */
+		bpage = prev_bpage;
+
+		/* If bpage has meanwhile been moved from the LRU list to the
+		free list, restart the scan.  NOTE(review): upstream protects
+		bpage->state with buf_pool mutex; confirm for LRU_list_mutex. */
+		if (bpage && !buf_page_in_file(bpage)) {
+			ut_a(num_entries == 0);
+			goto scan_again;
+		}
+	}
+
+	/* The scan is complete. */
+	mutex_exit(&LRU_list_mutex);
+
+	/* Drop any remaining batch of search hashed pages. */
+	buf_LRU_drop_page_hash_batch(id, zip_size, page_arr, num_entries);
+	ut_free(page_arr);
+}
+
+/******************************************************************//**
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace.  Rescans the LRU list (sleeping between
+passes) until no page of the space had to be left behind. */
+UNIV_INTERN
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+	ulint	id)	/*!< in: space id */
+{
+	buf_page_t*	bpage;
+	ibool		all_freed;
+
+	/* Before we attempt to drop pages one by one we first
+	attempt to drop page hash index entries in batches to make
+	it more efficient. The batching attempt is a best effort
+	attempt and does not guarantee that all pages hash entries
+	will be dropped. We get rid of remaining page hash entries
+	one by one below. */
+	buf_LRU_drop_page_hash_for_tablespace(id);
+
+scan_again:
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+	rw_lock_x_lock(&page_hash_latch);
+
+	all_freed = TRUE;
+
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (bpage != NULL) {
+		buf_page_t*	prev_bpage;
+		ibool		prev_bpage_buf_fix = FALSE;
+
+		ut_a(buf_page_in_file(bpage));
+
+		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+		/* NOTE(review): upstream checks bpage->space and
+		bpage->io_fix under buf_pool_mutex alone; here LRU_list_mutex
+		and page_hash_latch are held instead -- confirm this suffices. */
+
+		if (buf_page_get_space(bpage) != id) {
+			/* Skip this block, as it does not belong to
+			the space that is being invalidated. */
+		} else if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
+			/* We cannot remove this page during this scan
+			yet; maybe the system is currently reading it
+			in, or flushing the modifications to the file */
+
+			all_freed = FALSE;
+		} else {
+			mutex_t* block_mutex = buf_page_get_mutex_enter(bpage);
+
+			if (!block_mutex) {
+				/* It may be impossible case...
+				Something wrong, so will be scan_again */
+
+				all_freed = FALSE;
+
+				goto next_page_no_mutex;
+			}
+
+			if (bpage->buf_fix_count > 0) {
+
+				/* We cannot remove this page during
+				this scan yet; maybe the system is
+				currently reading it in, or flushing
+				the modifications to the file */
+
+				all_freed = FALSE;
+
+				goto next_page;
+			}
+
+#ifdef UNIV_DEBUG
+			if (buf_debug_prints) {
+				fprintf(stderr,
+					"Dropping space %lu page %lu\n",
+					(ulong) buf_page_get_space(bpage),
+					(ulong) buf_page_get_page_no(bpage));
+			}
+#endif
+			if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+				/* This is a compressed-only block
+				descriptor.  Ensure that prev_bpage
+				cannot be relocated when bpage is freed. */
+				if (UNIV_LIKELY(prev_bpage != NULL)) {
+					switch (buf_page_get_state(
+							prev_bpage)) {
+					case BUF_BLOCK_FILE_PAGE:
+						/* Descriptors of uncompressed
+						blocks will not be relocated,
+						because we are holding the
+						buf_pool_mutex. */
+						break;
+					case BUF_BLOCK_ZIP_PAGE:
+					case BUF_BLOCK_ZIP_DIRTY:
+						/* Descriptors of compressed-
+						only blocks can be relocated,
+						unless they are buffer-fixed.
+						Because both bpage and
+						prev_bpage are protected by
+						buf_pool_zip_mutex, it is
+						not necessary to acquire
+						further mutexes. */
+						ut_ad(&buf_pool_zip_mutex
+						      == block_mutex);
+						ut_ad(mutex_own(block_mutex));
+						prev_bpage_buf_fix = TRUE;
+						prev_bpage->buf_fix_count++;
+						break;
+					default:
+						ut_error;
+					}
+				}
+			} else if (((buf_block_t*) bpage)->is_hashed) {
+				ulint	page_no;
+				ulint	zip_size;
+
+				//buf_pool_mutex_exit();
+				mutex_exit(&LRU_list_mutex);
+				rw_lock_x_unlock(&page_hash_latch);
+
+				zip_size = buf_page_get_zip_size(bpage);
+				page_no = buf_page_get_page_no(bpage);
+
+				mutex_exit(block_mutex);
+
+				/* Note that the following call will acquire
+				an S-latch on the page */
+
+				btr_search_drop_page_hash_when_freed(
+					id, zip_size, page_no);
+				goto scan_again;
+			}
+
+			if (bpage->oldest_modification != 0) {
+
+				buf_flush_remove(bpage);
+			}
+
+			/* Remove from the LRU list. */
+
+			if (buf_LRU_block_remove_hashed_page(bpage, TRUE)
+			    != BUF_BLOCK_ZIP_FREE) {
+				buf_LRU_block_free_hashed_page((buf_block_t*)
+							       bpage, TRUE);
+			} else {
+				/* The block_mutex should have been
+				released by buf_LRU_block_remove_hashed_page()
+				when it returns BUF_BLOCK_ZIP_FREE. */
+				ut_ad(block_mutex == &buf_pool_zip_mutex);
+				ut_ad(!mutex_own(block_mutex));
+
+				if (prev_bpage_buf_fix) {
+					/* We temporarily buffer-fixed
+					prev_bpage, so that
+					buf_buddy_free() could not
+					relocate it, in case it was a
+					compressed-only block
+					descriptor. */
+
+					mutex_enter(block_mutex);
+					ut_ad(prev_bpage->buf_fix_count > 0);
+					prev_bpage->buf_fix_count--;
+					mutex_exit(block_mutex);
+				}
+
+				goto next_page_no_mutex;
+			}
+next_page:
+			mutex_exit(block_mutex);
+		}
+
+next_page_no_mutex:
+		bpage = prev_bpage;
+	}
+
+	mutex_exit(&LRU_list_mutex);
+	rw_lock_x_unlock(&page_hash_latch);
+
+	if (!all_freed) {
+		/* Something could not be freed yet: pause, then rescan. */
+		os_thread_sleep(20000);
+		goto scan_again;
+	}
+}
+
+/********************************************************************//**
+Links a compressed block into buf_pool->zip_clean, keeping the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: pointer to the block in question */
+{
+	buf_page_t*	succ;
+	buf_page_t*	pred;
+
+	ut_ad(mutex_own(&LRU_list_mutex));
+	ut_ad(mutex_own(&flush_list_mutex));
+	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
+
+	/* Walk forward in the LRU list from bpage to its first
+	successor that is in the zip_clean list. */
+	succ = UT_LIST_GET_NEXT(LRU, bpage);
+	while (succ != NULL
+	       && !(buf_page_get_state(succ) == BUF_BLOCK_ZIP_PAGE
+		    && succ->in_LRU_list)) {
+		succ = UT_LIST_GET_NEXT(LRU, succ);
+	}
+
+	/* Insert bpage after the zip_clean predecessor of succ, if any. */
+	pred = succ ? UT_LIST_GET_PREV(zip_list, succ) : NULL;
+	if (pred == NULL) {
+		UT_LIST_ADD_FIRST(zip_list, buf_pool->zip_clean, bpage);
+	} else {
+		UT_LIST_INSERT_AFTER(zip_list, buf_pool->zip_clean,
+				     pred, bpage);
+	}
+}
+
+/******************************************************************//**
+Try to free an uncompressed page of a compressed block from the unzip
+LRU list.  The compressed page is preserved, and it need not be clean.
+@return	TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_unzip_LRU_list(
+/*=============================*/
+	ulint	n_iterations,	/*!< in: how many times this has been called
+				repeatedly without result: a high value means
+				that we should search farther; we will search
+				n_iterations / 5 of the unzip_LRU list,
+				or nothing if n_iterations >= 5 */
+	ibool	have_LRU_mutex)	/*!< in: TRUE if the caller holds
+				LRU_list_mutex */
+{
+	buf_block_t*	block;
+	ulint		distance;
+
+	//ut_ad(buf_pool_mutex_own()); /* optimistic */
+
+	/* Theoretically it should be much easier to find a victim
+	from unzip_LRU as we can choose even a dirty block (as we'll
+	be evicting only the uncompressed frame).  In a very unlikely
+	eventuality that we are unable to find a victim from
+	unzip_LRU, we fall back to the regular LRU list.  We do this
+	if we have done five iterations so far. */
+
+	if (UNIV_UNLIKELY(n_iterations >= 5)
+	    || !buf_LRU_evict_from_unzip_LRU(have_LRU_mutex)) {
+
+		return(FALSE);
+	}
+
+	distance = 100 + (n_iterations
+			  * UT_LIST_GET_LEN(buf_pool->unzip_LRU)) / 5;
+
+restart:
+	for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
+	     UNIV_LIKELY(block != NULL) && UNIV_LIKELY(distance > 0);
+	     block = UT_LIST_GET_PREV(unzip_LRU, block), distance--) {
+
+		enum buf_lru_free_block_status	freed;
+
+		mutex_enter(&block->mutex);
+		if (!block->in_unzip_LRU_list || !block->page.in_LRU_list
+		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
+			mutex_exit(&block->mutex);
+			goto restart;
+		}
+
+		ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+		ut_ad(block->in_unzip_LRU_list);
+		ut_ad(block->page.in_LRU_list);
+
+		freed = buf_LRU_free_block(&block->page, FALSE, NULL, have_LRU_mutex);
+		mutex_exit(&block->mutex);
+
+		switch (freed) {
+		case BUF_LRU_FREED:
+			return(TRUE);
+
+		case BUF_LRU_CANNOT_RELOCATE:
+			/* If we failed to relocate, try
+			regular LRU eviction. */
+			return(FALSE);
+
+		case BUF_LRU_NOT_FREED:
+			/* The block was buffer-fixed or I/O-fixed.
+			Keep looking. */
+			continue;
+		}
+
+		/* inappropriate return value from buf_LRU_free_block() */
+		ut_error;
+	}
+
+	return(FALSE);
+}
+
+/******************************************************************//**
+Try to free a clean page from the common LRU list.
+@return	TRUE if freed */
+UNIV_INLINE
+ibool
+buf_LRU_free_from_common_LRU_list(
+/*==============================*/
+	ulint	n_iterations,	/*!< in: how many times this has been called
+				repeatedly without result: a high value means
+				that we should search farther; if
+				n_iterations < 10, then we search
+				n_iterations / 10 * buf_pool->curr_size
+				pages from the end of the LRU list */
+	ibool	have_LRU_mutex)	/*!< in: TRUE if the caller holds
+				LRU_list_mutex */
+{
+	buf_page_t*	bpage;
+	ulint		distance;
+
+	//ut_ad(buf_pool_mutex_own()); /* optimistic */
+
+	distance = 100 + (n_iterations * buf_pool->curr_size) / 10;
+
+restart:
+	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
+	     UNIV_LIKELY(bpage != NULL) && UNIV_LIKELY(distance > 0);
+	     bpage = UT_LIST_GET_PREV(LRU, bpage), distance--) {
+
+		enum buf_lru_free_block_status	freed;
+		unsigned			accessed;
+		mutex_t*			block_mutex
+			= buf_page_get_mutex_enter(bpage);
+
+		if (!block_mutex) {
+			goto restart;
+		}
+
+		if (!bpage->in_LRU_list
+		    || !buf_page_in_file(bpage)) {
+			mutex_exit(block_mutex);
+			goto restart;
+		}
+
+		ut_ad(buf_page_in_file(bpage));
+		ut_ad(bpage->in_LRU_list);
+
+		accessed = buf_page_is_accessed(bpage);
+		freed = buf_LRU_free_block(bpage, TRUE, NULL, have_LRU_mutex);
+		mutex_exit(block_mutex);
+
+		switch (freed) {
+		case BUF_LRU_FREED:
+			/* Keep track of pages that are evicted without
+			ever being accessed. This gives us a measure of
+			the effectiveness of readahead */
+			if (!accessed) {
+				++buf_pool->stat.n_ra_pages_evicted;
+			}
+			return(TRUE);
+
+		case BUF_LRU_NOT_FREED:
+			/* The block was dirty, buffer-fixed, or I/O-fixed.
+			Keep looking. */
+			continue;
+
+		case BUF_LRU_CANNOT_RELOCATE:
+			/* This should never occur, because we
+			want to discard the compressed page too. */
+			break;
+		}
+
+		/* inappropriate return value from buf_LRU_free_block() */
+		ut_error;
+	}
+
+	return(FALSE);
+}
+
+/******************************************************************//**
+Searches the unzip_LRU list and then the common LRU list for a block to free.
+@return	TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+	ulint	n_iterations)	/*!< in: how many times this has been called
+				repeatedly without result: a high value means
+				that we should search farther; if
+				n_iterations < 10, then we search
+				n_iterations / 10 * buf_pool->curr_size
+				pages from the end of the LRU list; if
+				n_iterations < 5, then we will also search
+				n_iterations / 5 of the unzip_LRU list. */
+{
+	ibool	found;
+	ibool	have_LRU_mutex;
+
+	/* optimistic: take LRU_list_mutex only if unzip_LRU looks non-empty */
+	have_LRU_mutex = UT_LIST_GET_LEN(buf_pool->unzip_LRU) ? TRUE : FALSE;
+
+	if (have_LRU_mutex) {
+		mutex_enter(&LRU_list_mutex);
+	}
+
+	found = buf_LRU_free_from_unzip_LRU_list(n_iterations, have_LRU_mutex);
+	if (!found) {
+		found = buf_LRU_free_from_common_LRU_list(n_iterations, have_LRU_mutex);
+	}
+
+	mutex_enter(&buf_pool_mutex);
+	if (found) {
+		if (buf_pool->LRU_flush_ended > 0) {
+			buf_pool->LRU_flush_ended--;
+		}
+	} else {
+		buf_pool->LRU_flush_ended = 0;
+	}
+	mutex_exit(&buf_pool_mutex);
+
+	if (have_LRU_mutex) {
+		mutex_exit(&LRU_list_mutex);
+	}
+
+	return(found);
+}
+
+/******************************************************************//**
+Calls buf_LRU_search_and_free_block() for as long as
+buf_pool->LRU_flush_ended is positive, in order to move LRU flushed blocks
+from the end of the LRU list to the free list.  This helps the insert
+buffer: flushed pages of non-unique non-clustered indexes leave the buffer
+pool and inserts to them are redirected to the insert buffer.  Otherwise
+the flushed blocks could get modified again before read operations need
+new buffer blocks, and the i/o work done in flushing would be wasted. */
+UNIV_INTERN
+void
+buf_LRU_try_free_flushed_blocks(void)
+/*=================================*/
+{
+	mutex_enter(&buf_pool_mutex);
+
+	for (;;) {
+		/* LRU_flush_ended is read under buf_pool_mutex. */
+		if (!(buf_pool->LRU_flush_ended > 0)) {
+			break;
+		}
+
+		/* buf_LRU_search_and_free_block() acquires
+		buf_pool_mutex itself, so release it around the call. */
+		mutex_exit(&buf_pool_mutex);
+		buf_LRU_search_and_free_block(1);
+		mutex_enter(&buf_pool_mutex);
+	}
+
+	mutex_exit(&buf_pool_mutex);
+}
+
+/******************************************************************//**
+Tells whether the free and LRU lists together hold less than 25 % of the
+buffer pool (never TRUE during recovery).  Heuristics can use this to keep
+huge transactions from eating up the whole buffer pool for their locks.
+@return	TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void)
+/*==============================*/
+{
+	ibool	low	= FALSE;
+
+	mutex_enter(&LRU_list_mutex);
+	mutex_enter(&free_list_mutex);
+
+	if (!recv_recovery_on) {
+		if (UT_LIST_GET_LEN(buf_pool->free)
+		    + UT_LIST_GET_LEN(buf_pool->LRU)
+		    < buf_pool->curr_size / 4) {
+			low = TRUE;
+		}
+	}
+
+	mutex_exit(&LRU_list_mutex);
+	mutex_exit(&free_list_mutex);
+
+	return(low);
+}
+
+/******************************************************************//**
+Takes a block off the free list of the buf_pool, if there is one, and
+puts it in state BUF_BLOCK_READY_FOR_USE.
+@return	a free control block, or NULL if the buf_pool->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(void)
+/*=======================*/
+{
+	buf_block_t*	block;
+
+	mutex_enter(&free_list_mutex);
+	block = (buf_block_t*) UT_LIST_GET_LAST(buf_pool->free);
+
+	if (block == NULL) {
+		/* The free list is empty. */
+		mutex_exit(&free_list_mutex);
+		return(NULL);
+	}
+
+	ut_ad(block->page.in_free_list);
+	ut_d(block->page.in_free_list = FALSE);
+	ut_ad(!block->page.in_flush_list);
+	ut_ad(!block->page.in_LRU_list);
+	ut_a(!buf_page_in_file(&block->page));
+	UT_LIST_REMOVE(free, buf_pool->free, (&block->page));
+
+	mutex_exit(&free_list_mutex);
+
+	/* The block is off the free list; its state is changed under
+	the block's own mutex. */
+	mutex_enter(&block->mutex);
+	buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
+	UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+	mutex_exit(&block->mutex);
+
+	return(block);
+}
+
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+
+The function loops (label "loop") until buf_LRU_get_free_only() hands
+out a block, so it never returns NULL. Each pass that finds the free
+list empty calls buf_LRU_search_and_free_block(); if that frees
+nothing, it calls buf_flush_free_margin(TRUE), increments
+srv_buf_pool_wait_free, wakes the simulated aio handler threads and,
+when buf_pool->LRU_flush_ended > 0, buf_LRU_try_free_flushed_blocks().
+After more than 10 passes it calls os_thread_sleep(500000) before
+retrying; after more than 30 it prints a warning and forces
+srv_print_innodb_monitor on, restoring the value seen before the first
+forcing once a block has been obtained.
+
+Outside recovery, a pass aborts with ut_error when the free and LRU
+lists together hold fewer than curr_size / 20 blocks, and switches the
+InnoDB Monitor on when they hold fewer than curr_size / 3.
+@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+	ulint	zip_size)	/*!< in: compressed page size in bytes,
+				or 0 if uncompressed tablespace */
+{
+	buf_block_t*	block		= NULL;
+	ibool		freed;
+	ulint		n_iterations	= 1;
+	/* Value of srv_print_innodb_monitor before this call forced it on;
+	only meaningful when started_monitor is TRUE. */
+	ibool		mon_value_was	= FALSE;
+	ibool		started_monitor	= FALSE;
+loop:
+	//buf_pool_mutex_enter();
+
+	if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+	    + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: ERROR: over 95 percent of the buffer pool"
+			" is occupied by\n"
+			"InnoDB: lock heaps or the adaptive hash index!"
+			" Check that your\n"
+			"InnoDB: transactions do not set too many row locks.\n"
+			"InnoDB: Your buffer pool size is %lu MB."
+			" Maybe you should make\n"
+			"InnoDB: the buffer pool bigger?\n"
+			"InnoDB: We intentionally generate a seg fault"
+			" to print a stack trace\n"
+			"InnoDB: on Linux!\n",
+			(ulong) (buf_pool->curr_size
+				 / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+		ut_error;
+
+	} else if (!recv_recovery_on
+		   && (UT_LIST_GET_LEN(buf_pool->free)
+		       + UT_LIST_GET_LEN(buf_pool->LRU))
+		   < buf_pool->curr_size / 3) {
+
+		if (!buf_lru_switched_on_innodb_mon) {
+
+			/* Over 67 % of the buffer pool is occupied by lock
+			heaps or the adaptive hash index. This may be a memory
+			leak! */
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: WARNING: over 67 percent of"
+				" the buffer pool is occupied by\n"
+				"InnoDB: lock heaps or the adaptive"
+				" hash index! Check that your\n"
+				"InnoDB: transactions do not set too many"
+				" row locks.\n"
+				"InnoDB: Your buffer pool size is %lu MB."
+				" Maybe you should make\n"
+				"InnoDB: the buffer pool bigger?\n"
+				"InnoDB: Starting the InnoDB Monitor to print"
+				" diagnostics, including\n"
+				"InnoDB: lock heap and hash index sizes.\n",
+				(ulong) (buf_pool->curr_size
+					 / (1024 * 1024 / UNIV_PAGE_SIZE)));
+
+			buf_lru_switched_on_innodb_mon = TRUE;
+			srv_print_innodb_monitor = TRUE;
+			os_event_set(srv_lock_timeout_thread_event);
+		}
+	} else if (buf_lru_switched_on_innodb_mon) {
+
+		/* Switch off the InnoDB Monitor; this is a simple way
+		to stop the monitor if the situation becomes less urgent,
+		but may also surprise users if the user also switched on the
+		monitor! */
+
+		buf_lru_switched_on_innodb_mon = FALSE;
+		srv_print_innodb_monitor = FALSE;
+	}
+
+	/* If there is a block in the free list, take it */
+	block = buf_LRU_get_free_only();
+	if (block) {
+
+		/* Reset the compressed-page descriptor fields; m_start
+		exists only in UNIV_DEBUG builds. */
+#ifdef UNIV_DEBUG
+		block->page.zip.m_start =
+#endif /* UNIV_DEBUG */
+			block->page.zip.m_end =
+			block->page.zip.m_nonempty =
+			block->page.zip.n_blobs = 0;
+
+		if (UNIV_UNLIKELY(zip_size)) {
+			ibool	lru;
+			page_zip_set_size(&block->page.zip, zip_size);
+			/* The compressed frame is allocated while holding
+			LRU_list_mutex. */
+			mutex_enter(&LRU_list_mutex);
+			block->page.zip.data = buf_buddy_alloc(zip_size, &lru, FALSE);
+			mutex_exit(&LRU_list_mutex);
+			UNIV_MEM_DESC(block->page.zip.data, zip_size, block);
+		} else {
+			page_zip_set_size(&block->page.zip, 0);
+			block->page.zip.data = NULL;
+		}
+
+		//buf_pool_mutex_exit();
+
+		if (started_monitor) {
+			srv_print_innodb_monitor = mon_value_was;
+		}
+
+		return(block);
+	}
+
+	/* If no block was in the free list, search from the end of the LRU
+	list and try to free a block there */
+
+	//buf_pool_mutex_exit();
+
+	freed = buf_LRU_search_and_free_block(n_iterations);
+
+	if (freed > 0) {
+		goto loop;
+	}
+
+	if (n_iterations > 30) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: difficult to find free blocks in\n"
+			"InnoDB: the buffer pool (%lu search iterations)!"
+			" Consider\n"
+			"InnoDB: increasing the buffer pool size.\n"
+			"InnoDB: It is also possible that"
+			" in your Unix version\n"
+			"InnoDB: fsync is very slow, or"
+			" completely frozen inside\n"
+			"InnoDB: the OS kernel. Then upgrading to"
+			" a newer version\n"
+			"InnoDB: of your operating system may help."
+			" Look at the\n"
+			"InnoDB: number of fsyncs in diagnostic info below.\n"
+			"InnoDB: Pending flushes (fsync) log: %lu;"
+			" buffer pool: %lu\n"
+			"InnoDB: %lu OS file reads, %lu OS file writes,"
+			" %lu OS fsyncs\n"
+			"InnoDB: Starting InnoDB Monitor to print further\n"
+			"InnoDB: diagnostics to the standard output.\n",
+			(ulong) n_iterations,
+			(ulong) fil_n_pending_log_flushes,
+			(ulong) fil_n_pending_tablespace_flushes,
+			(ulong) os_n_file_reads, (ulong) os_n_file_writes,
+			(ulong) os_n_fsyncs);
+
+		/* Remember the monitor setting only the first time we get
+		here. This branch is taken on every pass beyond the 30th;
+		saving again would store the TRUE that we set ourselves and
+		the monitor would then be left on when a block is finally
+		found. */
+		if (!started_monitor) {
+			mon_value_was = srv_print_innodb_monitor;
+			started_monitor = TRUE;
+		}
+
+		srv_print_innodb_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	}
+
+	/* No free block was found: try to flush the LRU list */
+
+	buf_flush_free_margin(TRUE);
+	++srv_buf_pool_wait_free;
+
+	os_aio_simulated_wake_handler_threads();
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&buf_pool_mutex);
+
+	if (buf_pool->LRU_flush_ended > 0) {
+		/* We have written pages in an LRU flush. To make the insert
+		buffer more efficient, we try to move these pages to the free
+		list. */
+
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+
+		buf_LRU_try_free_flushed_blocks();
+	} else {
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+	}
+
+	if (n_iterations > 10) {
+
+		/* Back off before the next attempt. */
+		os_thread_sleep(500000);
+	}
+
+	n_iterations++;
+
+	goto loop;
+}
+
+/*******************************************************************//**
+Moves the LRU_old pointer so that the length of the old blocks list
+is inside the allowed limits.
+
+The target length is the smaller of
+  LRU length * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV
+and
+  LRU length - (BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN).
+LRU_old is then stepped one block at a time until LRU_old_len is
+within BUF_LRU_OLD_TOLERANCE of that target. buf_pool->LRU_old must
+be non-NULL (ut_a) and LRU_list_mutex must be owned (ut_ad). */
+UNIV_INLINE
+void
+buf_LRU_old_adjust_len(void)
+/*========================*/
+{
+	ulint	old_len;
+	ulint	new_len;
+
+	ut_a(buf_pool->LRU_old);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&LRU_list_mutex));
+	ut_ad(buf_LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
+	ut_ad(buf_LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
+#if BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)
+# error "BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN <= BUF_LRU_OLD_RATIO_DIV * (BUF_LRU_OLD_TOLERANCE + 5)"
+#endif
+#ifdef UNIV_LRU_DEBUG
+	/* buf_pool->LRU_old must be the first item in the LRU list
+	whose "old" flag is set. */
+	ut_a(buf_pool->LRU_old->old);
+	ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+	     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+	ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+	     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+
+	old_len = buf_pool->LRU_old_len;
+	new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+			 * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+			 UT_LIST_GET_LEN(buf_pool->LRU)
+			 - (BUF_LRU_OLD_TOLERANCE
+			    + BUF_LRU_NON_OLD_MIN_LEN));
+
+	for (;;) {
+		buf_page_t*	LRU_old = buf_pool->LRU_old;
+
+		ut_a(LRU_old);
+		ut_ad(LRU_old->in_LRU_list);
+#ifdef UNIV_LRU_DEBUG
+		ut_a(LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+
+		/* Update the LRU_old pointer if necessary */
+
+		if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { /* old list
+			too short: take in the previous block and flag it old */
+
+			buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV(
+				LRU, LRU_old);
+#ifdef UNIV_LRU_DEBUG
+			ut_a(!LRU_old->old);
+#endif /* UNIV_LRU_DEBUG */
+			old_len = ++buf_pool->LRU_old_len;
+			buf_page_set_old(LRU_old, TRUE);
+
+		} else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { /* old
+			list too long: advance LRU_old to the next block and
+			clear the flag of the block left behind */
+
+			buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
+			old_len = --buf_pool->LRU_old_len;
+			buf_page_set_old(LRU_old, FALSE);
+		} else {
+			return; /* within tolerance: done */
+		}
+	}
+}
+
+/*******************************************************************//**
+Sets up the "old" sublist of the LRU list for the first time. Must be
+invoked at the moment the LRU list reaches exactly BUF_LRU_OLD_MIN_LEN
+blocks, with LRU_list_mutex owned. */
+static
+void
+buf_LRU_old_init(void)
+/*==================*/
+{
+	buf_page_t*	page;
+
+	ut_ad(mutex_own(&LRU_list_mutex));
+	ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
+
+	/* Flag every block as old, walking from the last block towards
+	the first; buf_LRU_old_adjust_len() then moves LRU_old to its
+	proper place. The flag is written directly because the
+	assertions of buf_page_set_old() do not hold while the walk is
+	in progress. */
+
+	page = UT_LIST_GET_LAST(buf_pool->LRU);
+
+	while (page != NULL) {
+		ut_ad(page->in_LRU_list);
+		ut_ad(buf_page_in_file(page));
+		page->old = TRUE;
+
+		page = UT_LIST_GET_PREV(LRU, page);
+	}
+
+	buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU);
+	buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+	buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Takes a block off the unzip_LRU list, provided it is a member of that
+list; does nothing otherwise. LRU_list_mutex must be owned. */
+static
+void
+buf_unzip_LRU_remove_block_if_needed(
+/*=================================*/
+	buf_page_t*	bpage)	/*!< in/out: control block */
+{
+	buf_block_t*	zblock;
+
+	ut_ad(buf_pool);
+	ut_ad(bpage);
+	ut_ad(buf_page_in_file(bpage));
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	if (!buf_page_belongs_to_unzip_LRU(bpage)) {
+		/* Not on the unzip_LRU list: nothing to do. */
+		return;
+	}
+
+	zblock = (buf_block_t*) bpage;
+
+	ut_ad(zblock->in_unzip_LRU_list);
+	zblock->in_unzip_LRU_list = FALSE;
+
+	UT_LIST_REMOVE(unzip_LRU, buf_pool->unzip_LRU, zblock);
+}
+
+/******************************************************************//**
+Removes a block from the LRU list.
+
+If bpage is the block LRU_old points to, LRU_old is first moved to the
+previous block, which is flagged old. After the removal the block is
+also taken off the unzip_LRU list if it was on it. When the LRU list
+has become shorter than BUF_LRU_OLD_MIN_LEN, all "old" flags are
+cleared and LRU_old is reset to NULL; otherwise LRU_old_len is
+corrected and buf_LRU_old_adjust_len() is called.
+The caller must own LRU_list_mutex (ut_ad below). */
+UNIV_INLINE
+void
+buf_LRU_remove_block(
+/*=================*/
+	buf_page_t*	bpage)	/*!< in: control block */
+{
+	ut_ad(buf_pool);
+	ut_ad(bpage);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	ut_a(buf_page_in_file(bpage));
+
+	ut_ad(bpage->in_LRU_list);
+
+	/* If the LRU_old pointer is defined and points to just this block,
+	move it backward one step */
+
+	if (UNIV_UNLIKELY(bpage == buf_pool->LRU_old)) {
+
+		/* Below: the previous block is guaranteed to exist,
+		because the LRU_old pointer is only allowed to differ
+		by BUF_LRU_OLD_TOLERANCE from strict
+		buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
+		list length. */
+		buf_page_t*	prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
+
+		ut_a(prev_bpage);
+#ifdef UNIV_LRU_DEBUG
+		ut_a(!prev_bpage->old);
+#endif /* UNIV_LRU_DEBUG */
+		buf_pool->LRU_old = prev_bpage;
+		buf_page_set_old(prev_bpage, TRUE);
+
+		buf_pool->LRU_old_len++;
+	}
+
+	/* Remove the block from the LRU list */
+	UT_LIST_REMOVE(LRU, buf_pool->LRU, bpage);
+	bpage->in_LRU_list = FALSE;
+
+	buf_unzip_LRU_remove_block_if_needed(bpage);
+
+	/* If the LRU list is so short that LRU_old is not defined,
+	clear the "old" flags and return */
+	if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
+
+		for (bpage = UT_LIST_GET_FIRST(buf_pool->LRU); bpage != NULL;
+		     bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
+			/* This loop temporarily violates the
+			assertions of buf_page_set_old().
+			The parameter bpage is reused as the loop
+			cursor; the function returns right after
+			the loop, so the removed block is not
+			needed any more. */
+			bpage->old = FALSE;
+		}
+
+		buf_pool->LRU_old = NULL;
+		buf_pool->LRU_old_len = 0;
+
+		return;
+	}
+
+	ut_ad(buf_pool->LRU_old);
+
+	/* Update the LRU_old_len field if necessary: the removed block
+	counted towards the old sublist if its "old" flag is set */
+	if (buf_page_is_old(bpage)) {
+
+		buf_pool->LRU_old_len--;
+	}
+
+	/* Adjust the length of the old block list if necessary */
+	buf_LRU_old_adjust_len();
+}
+
+/******************************************************************//**
+Links a block into the LRU list of decompressed zip pages. The block
+must belong to the unzip_LRU (ut_a) and must not be on the list yet;
+LRU_list_mutex must be owned. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+	buf_block_t*	block,	/*!< in: control block */
+	ibool		old)	/*!< in: TRUE if should be put to the end
+				of the list, else put to the start */
+{
+	ut_ad(buf_pool);
+	ut_ad(block);
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+
+	ut_ad(!block->in_unzip_LRU_list);
+	block->in_unzip_LRU_list = TRUE;
+
+	if (!old) {
+		/* Young block: goes to the start of the list. */
+		UT_LIST_ADD_FIRST(unzip_LRU, buf_pool->unzip_LRU, block);
+	} else {
+		/* Old block: goes to the end of the list. */
+		UT_LIST_ADD_LAST(unzip_LRU, buf_pool->unzip_LRU, block);
+	}
+}
+
+/******************************************************************//**
+Appends a block to the end of the LRU list and keeps the bookkeeping
+of the "old" sublist consistent. LRU_list_mutex must be owned. */
+UNIV_INLINE
+void
+buf_LRU_add_block_to_end_low(
+/*=========================*/
+	buf_page_t*	bpage)	/*!< in: control block */
+{
+	ulint	lru_len;
+
+	ut_ad(buf_pool);
+	ut_ad(bpage);
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	ut_a(buf_page_in_file(bpage));
+
+	ut_ad(!bpage->in_LRU_list);
+	UT_LIST_ADD_LAST(LRU, buf_pool->LRU, bpage);
+	bpage->in_LRU_list = TRUE;
+
+	/* List length including the block just appended. */
+	lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+	if (lru_len < BUF_LRU_OLD_MIN_LEN) {
+
+		/* Too short for an old sublist to be maintained. */
+		buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+
+	} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+
+		/* The list has just become long enough for LRU_old
+		to be defined: initialize it. */
+		buf_LRU_old_init();
+
+	} else {
+		ut_ad(buf_pool->LRU_old);
+
+		/* The new last block joins the old sublist; rebalance. */
+		buf_page_set_old(bpage, TRUE);
+		buf_pool->LRU_old_len++;
+		buf_LRU_old_adjust_len();
+	}
+
+	/* A compressed block that also has a decompressed frame is
+	tracked on the unzip_LRU list as well. */
+	if (buf_page_belongs_to_unzip_LRU(bpage)) {
+		buf_unzip_LRU_add_block((buf_block_t*) bpage, TRUE);
+	}
+}
+
+/******************************************************************//**
+Adds a block to the LRU list.
+
+The block is put to the start of the list when old is FALSE or when
+the list is shorter than BUF_LRU_OLD_MIN_LEN; only in that case is
+bpage->freed_page_clock stamped from buf_pool->freed_page_clock.
+Otherwise the block is inserted right after buf_pool->LRU_old and
+LRU_old_len is incremented. Afterwards the "old" bookkeeping is
+updated depending on the new list length, and the block is added to
+the unzip_LRU list if it belongs there.
+The caller must own LRU_list_mutex (ut_ad below). */
+UNIV_INLINE
+void
+buf_LRU_add_block_low(
+/*==================*/
+	buf_page_t*	bpage,	/*!< in: control block */
+	ibool		old)	/*!< in: TRUE if should be put to the old blocks
+				in the LRU list, else put to the start; if the
+				LRU list is very short, the block is added to
+				the start, regardless of this parameter */
+{
+	ut_ad(buf_pool);
+	ut_ad(bpage);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	ut_a(buf_page_in_file(bpage));
+	ut_ad(!bpage->in_LRU_list);
+
+	if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
+
+		UT_LIST_ADD_FIRST(LRU, buf_pool->LRU, bpage);
+
+		bpage->freed_page_clock = buf_pool->freed_page_clock;
+	} else {
+#ifdef UNIV_LRU_DEBUG
+		/* buf_pool->LRU_old must be the first item in the LRU list
+		whose "old" flag is set. */
+		ut_a(buf_pool->LRU_old->old);
+		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
+		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
+		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
+		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
+#endif /* UNIV_LRU_DEBUG */
+		UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU, buf_pool->LRU_old,
+				     bpage);
+		buf_pool->LRU_old_len++;
+	}
+
+	bpage->in_LRU_list = TRUE;
+
+	if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
+
+		ut_ad(buf_pool->LRU_old);
+
+		/* Adjust the length of the old block list if necessary */
+
+		buf_page_set_old(bpage, old);
+		buf_LRU_old_adjust_len();
+
+	} else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
+
+		/* The LRU list is now long enough for LRU_old to become
+		defined: init it */
+
+		buf_LRU_old_init();
+	} else {
+		buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
+	}
+
+	/* If this is a zipped block with decompressed frame as well
+	then put it on the unzip_LRU list */
+	if (buf_page_belongs_to_unzip_LRU(bpage)) {
+		buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
+	}
+}
+
+/******************************************************************//**
+Adds a block to the LRU list.
+Forwards both arguments unchanged to buf_LRU_add_block_low(), which
+asserts (ut_ad) that LRU_list_mutex is owned. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+	buf_page_t*	bpage,	/*!< in: control block */
+	ibool		old)	/*!< in: TRUE if should be put to the old
+				blocks in the LRU list, else put to the start;
+				if the LRU list is very short, the block is
+				added to the start, regardless of this
+				parameter */
+{
+	buf_LRU_add_block_low(bpage, old);
+}
+
+/******************************************************************//**
+Moves a block to the start of the LRU list.
+If the block currently has its "old" flag set,
+buf_pool->stat.n_pages_made_young is incremented first. The move
+itself is buf_LRU_remove_block() followed by
+buf_LRU_add_block_low(bpage, FALSE).
+The caller must own LRU_list_mutex (ut_ad below). */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+	buf_page_t*	bpage)	/*!< in: control block */
+{
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&LRU_list_mutex));
+
+	if (bpage->old) {
+		buf_pool->stat.n_pages_made_young++;
+	}
+
+	buf_LRU_remove_block(bpage);
+	buf_LRU_add_block_low(bpage, FALSE);
+}
+
+/******************************************************************//**
+Moves a block to the end of the LRU list.
+This is buf_LRU_remove_block() followed by
+buf_LRU_add_block_to_end_low(); both callees assert (ut_ad) that
+LRU_list_mutex is owned. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+	buf_page_t*	bpage)	/*!< in: control block */
+{
+	buf_LRU_remove_block(bpage);
+	buf_LRU_add_block_to_end_low(bpage);
+}
+
+/******************************************************************//**
+Try to free a block.  If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, the page frame will no
+longer be accessible via bpage.
+
+Locking, as implemented below (the buf_pool_mutex calls are commented
+out in this version, so earlier wording that referred to
+buf_pool_mutex no longer matches the code):
+- The caller must hold buf_page_get_mutex(bpage). The mutex is held
+  again when the function returns, but it is released and re-acquired
+  inside, which is why the state of bpage is checked a second time.
+- LRU_list_mutex is acquired here when have_LRU_mutex is FALSE and is
+  released again before returning. When have_LRU_mutex is TRUE the
+  mutex is not acquired here; when a block other than a
+  compressed-only descriptor is freed, it is released temporarily and
+  re-acquired.
+- page_hash_latch is x-locked and unlocked inside this function.
+@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
+BUF_LRU_NOT_FREED otherwise. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+	buf_page_t*	bpage,	/*!< in: block to be freed */
+	ibool		zip,	/*!< in: TRUE if should remove also the
+				compressed page of an uncompressed page */
+	ibool*		buf_pool_mutex_released,
+				/*!< out: pointer to a variable that will
+				be assigned TRUE if the mutexes were
+				temporarily released while freeing a
+				block (it is not assigned when the result
+				of buf_LRU_block_remove_hashed_page() is
+				BUF_BLOCK_ZIP_FREE), or NULL */
+	ibool		have_LRU_mutex)	/*!< in: TRUE if LRU_list_mutex
+				is already held on entry; FALSE makes this
+				function acquire and release it itself */
+{
+	buf_page_t*	b = NULL;
+	mutex_t*	block_mutex = buf_page_get_mutex(bpage);
+
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(block_mutex)); /* NOTE(review): block_mutex is
+				tested for NULL a few lines below; confirm
+				that mutex_own() tolerates NULL in debug
+				builds */
+	ut_ad(buf_page_in_file(bpage));
+	//ut_ad(bpage->in_LRU_list);
+	ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
+#if UNIV_WORD_SIZE == 4
+	/* On 32-bit systems, there is no padding in buf_page_t.  On
+	other systems, Valgrind could complain about uninitialized pad
+	bytes. */
+	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
+	if (!bpage->in_LRU_list || !block_mutex || !buf_page_can_relocate(bpage)) {
+
+		/* Do not free buffer-fixed or I/O-fixed blocks. */
+		return(BUF_LRU_NOT_FREED);
+	}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(bpage->space, bpage->offset) == 0);
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+	if (zip || !bpage->zip.data) {
+		/* This would completely free the block. */
+		/* Do not completely free dirty blocks. */
+
+		if (bpage->oldest_modification) {
+			return(BUF_LRU_NOT_FREED);
+		}
+	} else if (bpage->oldest_modification) {
+		/* Do not completely free dirty blocks. */
+
+		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+			ut_ad(buf_page_get_state(bpage)
+			      == BUF_BLOCK_ZIP_DIRTY);
+			return(BUF_LRU_NOT_FREED);
+		}
+
+		goto alloc;
+	} else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+		/* Allocate the control block for the compressed page.
+		If it cannot be allocated (without freeing a block
+		from the LRU list), refuse to free bpage. */
+alloc:
+		//buf_pool_mutex_exit_forbid();
+		b = buf_buddy_alloc(sizeof *b, NULL, FALSE);
+		//buf_pool_mutex_exit_allow();
+
+		if (UNIV_UNLIKELY(!b)) {
+			return(BUF_LRU_CANNOT_RELOCATE);
+		}
+
+		//memcpy(b, bpage, sizeof *b);
+	}
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr, "Putting space %lu page %lu to free list\n",
+			(ulong) buf_page_get_space(bpage),
+			(ulong) buf_page_get_page_no(bpage));
+	}
+#endif /* UNIV_DEBUG */
+
+	/* not to break latch order, must re-enter block_mutex */
+	mutex_exit(block_mutex);
+
+	if (!have_LRU_mutex)
+		mutex_enter(&LRU_list_mutex); /* optimistic */
+	rw_lock_x_lock(&page_hash_latch);
+	mutex_enter(block_mutex);
+
+	/* recheck states of block: block_mutex was released above, so
+	the checks made before the release are repeated here */
+	if (!bpage->in_LRU_list || block_mutex != buf_page_get_mutex(bpage)
+	    || !buf_page_can_relocate(bpage)) {
+not_freed:
+		if (b) {
+			buf_buddy_free(b, sizeof *b, TRUE);
+		}
+		if (!have_LRU_mutex)
+			mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+		return(BUF_LRU_NOT_FREED);
+	} else if (zip || !bpage->zip.data) {
+		if (bpage->oldest_modification)
+			goto not_freed;
+	} else if (bpage->oldest_modification) {
+		if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
+			ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
+			goto not_freed;
+		}
+	}
+
+	if (b) {
+		memcpy(b, bpage, sizeof *b);
+	}
+
+	if (buf_LRU_block_remove_hashed_page(bpage, zip)
+	    != BUF_BLOCK_ZIP_FREE) {
+		ut_a(bpage->buf_fix_count == 0);
+
+		if (b) {
+			buf_page_t*	prev_b	= UT_LIST_GET_PREV(LRU, b);
+			const ulint	fold	= buf_page_address_fold(
+				bpage->space, bpage->offset);
+
+			ut_a(!buf_page_hash_get(bpage->space, bpage->offset));
+
+			while (prev_b && !prev_b->in_LRU_list) {
+				prev_b = UT_LIST_GET_PREV(LRU, prev_b);
+			}
+
+			b->state = b->oldest_modification
+				? BUF_BLOCK_ZIP_DIRTY
+				: BUF_BLOCK_ZIP_PAGE;
+			UNIV_MEM_DESC(b->zip.data,
+				      page_zip_get_size(&b->zip), b);
+
+			/* The fields in_page_hash and in_LRU_list of
+			the to-be-freed block descriptor should have
+			been cleared in
+			buf_LRU_block_remove_hashed_page(), which
+			invokes buf_LRU_remove_block(). */
+			ut_ad(!bpage->in_page_hash);
+			ut_ad(!bpage->in_LRU_list);
+			/* bpage->state was BUF_BLOCK_FILE_PAGE because
+			b != NULL. The type cast below is thus valid. */
+			ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
+
+			/* The fields of bpage were copied to b before
+			buf_LRU_block_remove_hashed_page() was invoked. */
+			ut_ad(!b->in_zip_hash);
+			ut_ad(b->in_page_hash);
+			ut_ad(b->in_LRU_list);
+
+			HASH_INSERT(buf_page_t, hash,
+				    buf_pool->page_hash, fold, b);
+
+			/* Insert b where bpage was in the LRU list. */
+			if (UNIV_LIKELY(prev_b != NULL)) {
+				ulint	lru_len;
+
+				ut_ad(prev_b->in_LRU_list);
+				ut_ad(buf_page_in_file(prev_b));
+#if UNIV_WORD_SIZE == 4
+				/* On 32-bit systems, there is no
+				padding in buf_page_t.  On other
+				systems, Valgrind could complain about
+				uninitialized pad bytes. */
+				UNIV_MEM_ASSERT_RW(prev_b, sizeof *prev_b);
+#endif
+				UT_LIST_INSERT_AFTER(LRU, buf_pool->LRU,
+						     prev_b, b);
+
+				if (buf_page_is_old(b)) {
+					buf_pool->LRU_old_len++;
+					if (UNIV_UNLIKELY
+					    (buf_pool->LRU_old
+					     == UT_LIST_GET_NEXT(LRU, b))) {
+
+						buf_pool->LRU_old = b;
+					}
+				}
+
+				lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+				if (lru_len > BUF_LRU_OLD_MIN_LEN) {
+					ut_ad(buf_pool->LRU_old);
+					/* Adjust the length of the
+					old block list if necessary */
+					buf_LRU_old_adjust_len();
+				} else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
+					/* The LRU list is now long
+					enough for LRU_old to become
+					defined: init it */
+					buf_LRU_old_init();
+				}
+#ifdef UNIV_LRU_DEBUG
+				/* Check that the "old" flag is consistent
+				in the block and its neighbours. */
+				buf_page_set_old(b, buf_page_is_old(b));
+#endif /* UNIV_LRU_DEBUG */
+			} else {
+				b->in_LRU_list = FALSE;
+				buf_LRU_add_block_low(b, buf_page_is_old(b));
+			}
+
+			mutex_enter(&flush_list_mutex);
+			if (b->state == BUF_BLOCK_ZIP_PAGE) {
+				buf_LRU_insert_zip_clean(b);
+			} else {
+				/* Relocate on buf_pool->flush_list. */
+				buf_flush_relocate_on_flush_list(bpage, b);
+			}
+			mutex_exit(&flush_list_mutex);
+
+			bpage->zip.data = NULL;
+			page_zip_set_size(&bpage->zip, 0);
+
+			/* Prevent buf_page_get_gen() from
+			decompressing the block while we release
+			LRU_list_mutex, page_hash_latch and
+			block_mutex below. */
+			b->buf_fix_count++;
+			b->io_fix = BUF_IO_READ;
+		}
+
+		if (buf_pool_mutex_released) {
+			*buf_pool_mutex_released = TRUE;
+		}
+
+		//buf_pool_mutex_exit();
+		mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+		mutex_exit(block_mutex);
+
+		/* Remove possible adaptive hash index on the page.
+		The page was declared uninitialized by
+		buf_LRU_block_remove_hashed_page().  We need to flag
+		the contents of the page valid (which it still is) in
+		order to avoid bogus Valgrind warnings.*/
+
+		UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
+			       UNIV_PAGE_SIZE);
+		btr_search_drop_page_hash_index((buf_block_t*) bpage);
+		UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+				 UNIV_PAGE_SIZE);
+
+		if (b) {
+			/* Compute and stamp the compressed page
+			checksum while not holding any mutex.  The
+			block is already half-freed
+			(BUF_BLOCK_REMOVE_HASH) and removed from
+			buf_pool->page_hash, thus inaccessible by any
+			other thread. */
+
+			mach_write_to_4(
+				b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
+				UNIV_LIKELY(srv_use_checksums)
+				? page_zip_calc_checksum(
+					b->zip.data,
+					page_zip_get_size(&b->zip))
+				: BUF_NO_CHECKSUM_MAGIC);
+		}
+
+		//buf_pool_mutex_enter();
+		if (have_LRU_mutex)
+			mutex_enter(&LRU_list_mutex); /* hand it back
+						to the caller as on entry */
+		mutex_enter(block_mutex);
+
+		if (b) {
+			/* Undo the temporary fix set on b above. */
+			mutex_enter(&buf_pool_zip_mutex);
+			b->buf_fix_count--;
+			buf_page_set_io_fix(b, BUF_IO_NONE);
+			mutex_exit(&buf_pool_zip_mutex);
+		}
+
+		buf_LRU_block_free_hashed_page((buf_block_t*) bpage, FALSE);
+	} else {
+		/* The block_mutex should have been released by
+		buf_LRU_block_remove_hashed_page() when it returns
+		BUF_BLOCK_ZIP_FREE. */
+		ut_ad(block_mutex == &buf_pool_zip_mutex);
+		mutex_enter(block_mutex);
+
+		if (!have_LRU_mutex)
+			mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+	}
+
+	return(BUF_LRU_FREED);
+}
+
+/******************************************************************//**
+Puts a block back to the free list.
+
+The block must be in state BUF_BLOCK_MEMORY or BUF_BLOCK_READY_FOR_USE
+(anything else hits ut_error) and the caller must own block->mutex.
+The state is changed to BUF_BLOCK_NOT_USED. If the block still has a
+compressed frame, block->mutex is released around the buf_buddy_free()
+call that returns the frame, and re-acquired afterwards. The insert
+into buf_pool->free is done under free_list_mutex. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+	buf_block_t*	block,	/*!< in: block, must not contain a file page */
+	ibool		have_page_hash_mutex) /*!< in: forwarded as the
+				last argument of buf_buddy_free() */
+{
+	void*	data;
+
+	ut_ad(block);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&block->mutex));
+
+	switch (buf_block_get_state(block)) {
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_READY_FOR_USE:
+		break;
+	default:
+		ut_error;
+	}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	ut_a(block->n_pointers == 0);
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	ut_ad(!block->page.in_free_list);
+	ut_ad(!block->page.in_flush_list);
+	ut_ad(!block->page.in_LRU_list);
+
+	buf_block_set_state(block, BUF_BLOCK_NOT_USED);
+
+	UNIV_MEM_ALLOC(block->frame, UNIV_PAGE_SIZE);
+#ifdef UNIV_DEBUG
+	/* Wipe contents of page to reveal possible stale pointers to it */
+	memset(block->frame, '\0', UNIV_PAGE_SIZE);
+#else
+	/* Wipe page_no and space_id */
+	memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
+	memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4);
+#endif
+	data = block->page.zip.data;
+
+	if (data) {
+		block->page.zip.data = NULL; /* detach the compressed frame
+					before block->mutex is dropped */
+		mutex_exit(&block->mutex);
+		//buf_pool_mutex_exit_forbid();
+		buf_buddy_free(data, page_zip_get_size(&block->page.zip), have_page_hash_mutex);
+		//buf_pool_mutex_exit_allow();
+		mutex_enter(&block->mutex);
+		page_zip_set_size(&block->page.zip, 0);
+	}
+
+	mutex_enter(&free_list_mutex);
+	UT_LIST_ADD_FIRST(free, buf_pool->free, (&block->page));
+	ut_d(block->page.in_free_list = TRUE);
+	mutex_exit(&free_list_mutex);
+
+	UNIV_MEM_ASSERT_AND_FREE(block->frame, UNIV_PAGE_SIZE);
+}
+
+/******************************************************************//**
+Takes a block out of the LRU list and page hash table.
+If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
+the object will be freed and buf_pool_zip_mutex will be released.
+
+If a compressed page or a compressed-only block descriptor is freed,
+other compressed pages or compressed-only block descriptors may be
+relocated.
+
+The caller must own LRU_list_mutex and the mutex of the page, and
+(checked under UNIV_SYNC_DEBUG) hold page_hash_latch in exclusive
+mode. The page must have io_fix BUF_IO_NONE and buf_fix_count 0
+(ut_a below). buf_pool->freed_page_clock is incremented. If the page
+hash does not return bpage for its (space, offset), an error is
+printed and ut_error is raised.
+@return the new state of the block (BUF_BLOCK_ZIP_FREE if the state
+was BUF_BLOCK_ZIP_PAGE, or BUF_BLOCK_REMOVE_HASH otherwise) */
+static
+enum buf_page_state
+buf_LRU_block_remove_hashed_page(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in: block, must contain a file page and
+				be in a state where it can be freed; there
+				may or may not be a hash index to the page */
+	ibool		zip)	/*!< in: TRUE if should remove also the
+				compressed page of an uncompressed page */
+{
+	const buf_page_t*	hashed_bpage;
+	ut_ad(bpage);
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&LRU_list_mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX));
+#endif
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
+	ut_a(bpage->buf_fix_count == 0);
+
+#if UNIV_WORD_SIZE == 4
+	/* On 32-bit systems, there is no padding in
+	buf_page_t.  On other systems, Valgrind could complain
+	about uninitialized pad bytes. */
+	UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif
+
+	buf_LRU_remove_block(bpage);
+
+	buf_pool->freed_page_clock += 1;
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_FILE_PAGE:
+		UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t));
+		UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame,
+				  UNIV_PAGE_SIZE);
+		buf_block_modify_clock_inc((buf_block_t*) bpage);
+		if (bpage->zip.data) {
+			const page_t*	page = ((buf_block_t*) bpage)->frame;
+			const ulint	zip_size
+				= page_zip_get_size(&bpage->zip);
+
+			ut_a(!zip || bpage->oldest_modification == 0);
+
+			switch (UNIV_EXPECT(fil_page_get_type(page),
+					    FIL_PAGE_INDEX)) {
+			case FIL_PAGE_TYPE_ALLOCATED:
+			case FIL_PAGE_INODE:
+			case FIL_PAGE_IBUF_BITMAP:
+			case FIL_PAGE_TYPE_FSP_HDR:
+			case FIL_PAGE_TYPE_XDES:
+				/* These are essentially uncompressed pages. */
+				if (!zip) {
+					/* InnoDB writes the data to the
+					uncompressed page frame.  Copy it
+					to the compressed page, which will
+					be preserved. */
+					memcpy(bpage->zip.data, page,
+					       zip_size);
+				}
+				break;
+			case FIL_PAGE_TYPE_ZBLOB:
+			case FIL_PAGE_TYPE_ZBLOB2:
+				break;
+			case FIL_PAGE_INDEX:
+#ifdef UNIV_ZIP_DEBUG
+				ut_a(page_zip_validate(&bpage->zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+				break;
+			default:
+				ut_print_timestamp(stderr);
+				fputs("  InnoDB: ERROR: The compressed page"
+				      " to be evicted seems corrupt:", stderr);
+				ut_print_buf(stderr, page, zip_size);
+				fputs("\nInnoDB: Possibly older version"
+				      " of the page:", stderr);
+				ut_print_buf(stderr, bpage->zip.data,
+					     zip_size);
+				putc('\n', stderr);
+				ut_error;
+			}
+
+			break;
+		}
+		/* fall through: a file page without a compressed frame
+		is checked like a compressed-only page below */
+	case BUF_BLOCK_ZIP_PAGE:
+		ut_a(bpage->oldest_modification == 0);
+		UNIV_MEM_ASSERT_W(bpage->zip.data,
+				  page_zip_get_size(&bpage->zip));
+		break;
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_error;
+		break;
+	}
+
+	hashed_bpage = buf_page_hash_get(bpage->space, bpage->offset);
+
+	if (UNIV_UNLIKELY(bpage != hashed_bpage)) {
+		fprintf(stderr,
+			"InnoDB: Error: page %lu %lu not found"
+			" in the hash table\n",
+			(ulong) bpage->space,
+			(ulong) bpage->offset);
+		if (hashed_bpage) {
+			fprintf(stderr,
+				"InnoDB: In hash table we find block"
+				" %p of %lu %lu which is not %p\n",
+				(const void*) hashed_bpage,
+				(ulong) hashed_bpage->space,
+				(ulong) hashed_bpage->offset,
+				(const void*) bpage);
+		}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+		mutex_exit(buf_page_get_mutex(bpage));
+		//buf_pool_mutex_exit();
+		mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+		buf_print();
+		buf_LRU_print();
+		buf_validate();
+		buf_LRU_validate();
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+		ut_error;
+	}
+
+	ut_ad(!bpage->in_zip_hash);
+	ut_ad(bpage->in_page_hash);
+	ut_d(bpage->in_page_hash = FALSE);
+	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash,
+		    buf_page_address_fold(bpage->space, bpage->offset),
+		    bpage);
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+		ut_ad(!bpage->in_free_list);
+		ut_ad(!bpage->in_flush_list);
+		ut_ad(!bpage->in_LRU_list);
+		ut_a(bpage->zip.data);
+		ut_a(buf_page_get_zip_size(bpage));
+
+		UT_LIST_REMOVE(zip_list, buf_pool->zip_clean, bpage);
+
+		mutex_exit(&buf_pool_zip_mutex); /* released here and not
+					re-acquired by this function */
+		//buf_pool_mutex_exit_forbid();
+		buf_buddy_free(bpage->zip.data,
+			       page_zip_get_size(&bpage->zip), TRUE);
+		buf_buddy_free(bpage, sizeof(*bpage), TRUE);
+		//buf_pool_mutex_exit_allow();
+		UNIV_MEM_UNDESC(bpage);
+		return(BUF_BLOCK_ZIP_FREE);
+
+	case BUF_BLOCK_FILE_PAGE:
+		memset(((buf_block_t*) bpage)->frame
+		       + FIL_PAGE_OFFSET, 0xff, 4);
+		memset(((buf_block_t*) bpage)->frame
+		       + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
+		UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
+				 UNIV_PAGE_SIZE);
+		buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
+
+		if (zip && bpage->zip.data) {
+			/* Free the compressed page. The block mutex is
+			released around the buf_buddy_free() call and
+			re-acquired afterwards. */
+			void*	data = bpage->zip.data;
+			bpage->zip.data = NULL;
+
+			ut_ad(!bpage->in_free_list);
+			ut_ad(!bpage->in_flush_list);
+			ut_ad(!bpage->in_LRU_list);
+			mutex_exit(&((buf_block_t*) bpage)->mutex);
+			//buf_pool_mutex_exit_forbid();
+			buf_buddy_free(data, page_zip_get_size(&bpage->zip), TRUE);
+			//buf_pool_mutex_exit_allow();
+			mutex_enter(&((buf_block_t*) bpage)->mutex);
+			page_zip_set_size(&bpage->zip, 0);
+		}
+
+		return(BUF_BLOCK_REMOVE_HASH);
+
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break;
+	}
+
+	ut_error;
+	return(BUF_BLOCK_ZIP_FREE);
+}
+
+/******************************************************************//**
+Puts a file page which has no hash index to the free list. */
+static
+void
+buf_LRU_block_free_hashed_page(
+/*===========================*/
+	buf_block_t*	block,	/*!< in: block, must contain a file page and
+				be in a state where it can be freed */
+	ibool		have_page_hash_mutex) /*!< in: handed to the callee */
+{
+	//ut_ad(buf_pool_mutex_own());
+	ut_ad(mutex_own(&block->mutex));	/* caller must own block->mutex */
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	buf_LRU_block_free_non_file_page(block, have_page_hash_mutex);
+}
+
+/**********************************************************************//**
+Updates buf_LRU_old_ratio, clamped to BUF_LRU_OLD_RATIO_MIN..MAX.
+@return	updated old_pct, recomputed from the clamped ratio */
+UNIV_INTERN
+uint
+buf_LRU_old_ratio_update(
+/*=====================*/
+	uint	old_pct,/*!< in: Reserve this percentage of
+			the buffer pool for "old" blocks. */
+	ibool	adjust)	/*!< in: TRUE=adjust the LRU list;
+			FALSE=just assign buf_LRU_old_ratio
+			during the initialization of InnoDB */
+{
+	uint	ratio;
+
+	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
+	if (ratio < BUF_LRU_OLD_RATIO_MIN) {
+		ratio = BUF_LRU_OLD_RATIO_MIN;
+	} else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
+		ratio = BUF_LRU_OLD_RATIO_MAX;
+	}
+
+	if (adjust) {
+		//buf_pool_mutex_enter();
+		mutex_enter(&LRU_list_mutex);
+
+		if (ratio != buf_LRU_old_ratio) {
+			buf_LRU_old_ratio = ratio;
+
+			if (UT_LIST_GET_LEN(buf_pool->LRU)
+			    >= BUF_LRU_OLD_MIN_LEN) {
+				buf_LRU_old_adjust_len();
+			}
+		}
+
+		//buf_pool_mutex_exit();
+		mutex_exit(&LRU_list_mutex);
+	} else {
+		buf_LRU_old_ratio = ratio;	/* assigned without LRU_list_mutex */
+	}
+
+	/* the reverse of
+	ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100, rounded to nearest */
+	return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
+}
+
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void)
+/*=====================*/
+{
+	buf_LRU_stat_t*	item;
+
+	/* If we haven't started eviction yet then don't update stats. */
+	if (buf_pool->freed_page_clock == 0) {
+		goto func_exit;
+	}
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&buf_pool_mutex);
+
+	/* Update the index; the array is used as a ring of
+	BUF_LRU_STAT_N_INTERVAL entries. */
+	item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
+	buf_LRU_stat_arr_ind++;
+	buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
+	/* Add the current value and subtract the obsolete entry. */
+	buf_LRU_stat_sum.io += buf_LRU_stat_cur.io - item->io;
+	buf_LRU_stat_sum.unzip += buf_LRU_stat_cur.unzip - item->unzip;
+
+	/* Put current entry in the array. */
+	memcpy(item, &buf_LRU_stat_cur, sizeof *item);
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&buf_pool_mutex);
+
+func_exit:
+	/* Clear the current entry (buf_pool_mutex is not held here). */
+	memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
+}
+
+/********************************************************************//**
+Dumps the LRU page list to LRU_DUMP_FILE; returns TRUE on success. */
+#define LRU_DUMP_FILE "ib_lru_dump"
+
+UNIV_INTERN
+ibool
+buf_LRU_file_dump(void)
+/*===================*/
+{
+	os_file_t	dump_file = (os_file_t) -1;
+	ibool		success;
+	byte*		buffer_base = NULL;	/* raw allocation, freed at end */
+	byte*		buffer = NULL;		/* buffer_base after ut_align() */
+	buf_page_t*	bpage;
+	ulint		buffers;		/* pages written so far */
+	ulint		offset;			/* next 4-byte slot in buffer */
+	ibool		ret = FALSE;
+	ulint		i;
+
+	for (i = 0; i < srv_n_data_files; i++) { /* refuse on a name clash */
+		if (strstr(srv_data_file_names[i], LRU_DUMP_FILE) != NULL) {
+			fprintf(stderr,
+				" InnoDB: The name '%s' seems to be used for"
+				" innodb_data_file_path. Dumping LRU list is not"
+				" done for safeness.\n", LRU_DUMP_FILE);
+			goto end;
+		}
+	}
+
+	buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
+	buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
+	if (!buffer) {
+		fprintf(stderr,
+			" InnoDB: cannot allocate buffer.\n");
+		goto end;
+	}
+
+	dump_file = os_file_create(LRU_DUMP_FILE, OS_FILE_OVERWRITE,
+				OS_FILE_NORMAL, OS_DATA_FILE, &success);
+	if (!success) {
+		os_file_get_last_error(TRUE);
+		fprintf(stderr,
+			" InnoDB: cannot open %s\n", LRU_DUMP_FILE);
+		goto end;
+	}
+
+	mutex_enter(&LRU_list_mutex);	/* held for the whole walk, writes too */
+	bpage = UT_LIST_GET_LAST(buf_pool->LRU);	/* walk last to first */
+
+	buffers = offset = 0;
+	while (bpage != NULL) {
+		if (offset == 0) {
+			memset(buffer, 0, UNIV_PAGE_SIZE);
+		}
+
+		mach_write_to_4(buffer + offset * 4, bpage->space);
+		offset++;
+		mach_write_to_4(buffer + offset * 4, bpage->offset);
+		offset++;
+
+		if (offset == UNIV_PAGE_SIZE/4) {	/* buffer page is full */
+			success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
+					(buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+					(buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+					UNIV_PAGE_SIZE);
+			if (!success) {
+				mutex_exit(&LRU_list_mutex);
+				fprintf(stderr,
+					" InnoDB: cannot write page %lu of %s\n",
+					(ulong) buffers, LRU_DUMP_FILE);
+				goto end;
+			}
+			buffers++;
+			offset = 0;
+		}
+
+		bpage = UT_LIST_GET_PREV(LRU, bpage);
+	}
+	mutex_exit(&LRU_list_mutex);
+
+	if (offset == 0) {
+		memset(buffer, 0, UNIV_PAGE_SIZE);
+	}
+
+	mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);	/* end marker: */
+	offset++;
+	mach_write_to_4(buffer + offset * 4, 0xFFFFFFFFUL);	/* a pair of ~0 */
+	offset++;
+
+	success = os_file_write(LRU_DUMP_FILE, dump_file, buffer,
+			(buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+			(buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+			UNIV_PAGE_SIZE);
+	if (!success) {
+		goto end;
+	}
+
+	ret = TRUE;
+end:
+	if (dump_file != (os_file_t) -1)
+		os_file_close(dump_file);
+	if (buffer_base)
+		ut_free(buffer_base);
+
+	return(ret);
+}
+
+typedef struct {		/* one entry read from the LRU dump file */
+	ib_uint32_t space_id;	/* first 4-byte value of the pair */
+	ib_uint32_t page_no;	/* second 4-byte value of the pair */
+} dump_record_t;
+
+static int dump_record_cmp(const void *a, const void *b)
+{
+	const dump_record_t *rec1 = (const dump_record_t *) a;
+	const dump_record_t *rec2 = (const dump_record_t *) b;
+	/* qsort() order: ascending space_id, ties by ascending page_no */
+	if (rec1->space_id < rec2->space_id)
+		return -1;
+	if (rec1->space_id > rec2->space_id)
+		return 1;
+	if (rec1->page_no < rec2->page_no)
+		return -1;
+	return rec1->page_no > rec2->page_no;
+}
+
+/********************************************************************//**
+Requests reads of the pages listed in LRU_DUMP_FILE. @return FALSE on error */
+UNIV_INTERN
+ibool
+buf_LRU_file_restore(void)
+/*======================*/
+{
+	os_file_t	dump_file = (os_file_t) -1;
+	ibool		success;
+	byte*		buffer_base = NULL;	/* raw allocation, freed at end */
+	byte*		buffer = NULL;		/* buffer_base after ut_align() */
+	ulint		buffers;		/* file pages read so far */
+	ulint		offset;
+	ulint		reads = 0;		/* sum of buf_read_page_low() */
+	ulint		req = 0;		/* calls to buf_read_page_low() */
+	ibool		terminated = FALSE;
+	ibool		ret = FALSE;
+	dump_record_t*	records= 0;
+	ulint		size;
+	ulint		size_high;
+	ulint		length;			/* records collected so far */
+
+	dump_file = os_file_create_simple_no_error_handling(
+		LRU_DUMP_FILE, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+	if (!success || !os_file_get_size(dump_file, &size, &size_high)) {
+		os_file_get_last_error(TRUE);
+		fprintf(stderr,
+			" InnoDB: cannot open %s\n", LRU_DUMP_FILE);
+		goto end;
+	}
+	if (size == 0 || size_high > 0 || size % 8) { /* 8 bytes per record */
+		fprintf(stderr, " InnoDB: broken LRU dump file\n");
+		goto end;
+	}
+	buffer_base = ut_malloc(2 * UNIV_PAGE_SIZE);
+	buffer = ut_align(buffer_base, UNIV_PAGE_SIZE);
+	records = ut_malloc(size);	/* file size bounds the record array */
+	if (!buffer || !records) {
+		fprintf(stderr,
+			" InnoDB: cannot allocate buffer.\n");
+		goto end;
+	}
+
+	buffers = 0;
+	length = 0;
+	while (!terminated) {
+		success = os_file_read(dump_file, buffer,
+				(buffers << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL,
+				(buffers >> (32 - UNIV_PAGE_SIZE_SHIFT)),
+				UNIV_PAGE_SIZE);
+		if (!success) {
+			fprintf(stderr,
+				" InnoDB: cannot read page %lu of %s,"
+				" or meet unexpected terminal.\n",
+				(ulong) buffers, LRU_DUMP_FILE);
+			goto end;
+		}
+
+		for (offset = 0; offset < UNIV_PAGE_SIZE/4; offset += 2) {
+			ulint	space_id;
+			ulint	page_no;
+
+			space_id = mach_read_from_4(buffer + offset * 4);
+			page_no = mach_read_from_4(buffer + (offset + 1) * 4);
+			if (space_id == 0xFFFFFFFFUL
+			    || page_no == 0xFFFFFFFFUL) {	/* end marker */
+				terminated = TRUE;
+				break;
+			}
+
+			records[length].space_id = space_id;
+			records[length].page_no = page_no;
+			length++;
+			if (length * 8 >= size) {	/* records[] is full */
+				fprintf(stderr,
+					" InnoDB: could not find the "
+					"end-of-file marker after reading "
+					"the expected %lu bytes from the "
+					"LRU dump file.\n"
+					" InnoDB: this could be caused by a "
+					"broken or incomplete file.\n"
+					" InnoDB: trying to process what has "
+					"been read so far.\n",
+					(ulong) size);
+				terminated= TRUE;
+				break;
+			}
+		}
+		buffers++;
+	}
+
+	qsort(records, length, sizeof(dump_record_t), dump_record_cmp);
+
+	for (offset = 0; offset < length; offset++) {
+		ulint		space_id;
+		ulint		page_no;
+		ulint		zip_size;
+		ulint		err;
+		ib_int64_t	tablespace_version;
+
+		space_id = records[offset].space_id;
+		page_no = records[offset].page_no;
+
+		if (offset % 16 == 15) {	/* once per 16 records */
+			os_aio_simulated_wake_handler_threads();
+			buf_flush_free_margin(FALSE);
+		}
+
+		zip_size = fil_space_get_zip_size(space_id);
+		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+			continue;
+		}
+
+		if (fil_area_is_exist(space_id, zip_size, page_no, 0,
+				      zip_size ? zip_size : UNIV_PAGE_SIZE)) {
+
+			tablespace_version = fil_space_get_version(space_id);
+
+			req++;
+			reads += buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+						   | OS_AIO_SIMULATED_WAKE_LATER,
+						   space_id, zip_size, TRUE,
+						   tablespace_version, page_no, NULL);
+			buf_LRU_stat_inc_io();
+		}
+	}
+
+	os_aio_simulated_wake_handler_threads();
+	buf_flush_free_margin(FALSE);
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: reading pages based on the dumped LRU list was done."
+		" (requested: %lu, read: %lu)\n", (ulong) req, (ulong) reads);
+	ret = TRUE;
+end:
+	if (dump_file != (os_file_t) -1)
+		os_file_close(dump_file);
+	if (buffer_base)
+		ut_free(buffer_base);
+	if (records)
+		ut_free(records);
+
+	return(ret);
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU, free and unzip_LRU lists of the buffer pool.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void)
+/*==================*/
+{
+	buf_page_t*	bpage;
+	buf_block_t*	block;
+	ulint		old_len;
+	ulint		new_len;
+
+	ut_ad(buf_pool);
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+
+	if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+		ut_a(buf_pool->LRU_old);
+		old_len = buf_pool->LRU_old_len;
+		new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
+				 * buf_LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
+				 UT_LIST_GET_LEN(buf_pool->LRU)
+				 - (BUF_LRU_OLD_TOLERANCE
+				    + BUF_LRU_NON_OLD_MIN_LEN));
+		ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
+		ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
+	}
+
+	UT_LIST_VALIDATE(LRU, buf_page_t, buf_pool->LRU,
+			 ut_ad(ut_list_node_313->in_LRU_list));
+
+	bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+	old_len = 0;	/* from here on: number of old pages seen in the walk */
+
+	while (bpage != NULL) {
+
+		switch (buf_page_get_state(bpage)) {
+		case BUF_BLOCK_ZIP_FREE:
+		case BUF_BLOCK_NOT_USED:
+		case BUF_BLOCK_READY_FOR_USE:
+		case BUF_BLOCK_MEMORY:
+		case BUF_BLOCK_REMOVE_HASH:
+			ut_error;
+			break;
+		case BUF_BLOCK_FILE_PAGE:
+			ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
+			      == buf_page_belongs_to_unzip_LRU(bpage));
+		case BUF_BLOCK_ZIP_PAGE:	/* reached by fall-through, too */
+		case BUF_BLOCK_ZIP_DIRTY:
+			break;
+		}
+
+		if (buf_page_is_old(bpage)) {
+			const buf_page_t*	prev
+				= UT_LIST_GET_PREV(LRU, bpage);
+			const buf_page_t*	next
+				= UT_LIST_GET_NEXT(LRU, bpage);
+
+			if (!old_len++) {
+				/* the first old page must be LRU_old */
+				ut_a(buf_pool->LRU_old == bpage);
+			} else {
+				ut_a(!prev || buf_page_is_old(prev));
+			}
+
+			ut_a(!next || buf_page_is_old(next));
+		}
+
+		bpage = UT_LIST_GET_NEXT(LRU, bpage);
+	}
+
+	ut_a(buf_pool->LRU_old_len == old_len);
+
+	mutex_exit(&LRU_list_mutex);	/* released before free_list_mutex */
+	mutex_enter(&free_list_mutex);
+
+	UT_LIST_VALIDATE(free, buf_page_t, buf_pool->free,
+			 ut_ad(ut_list_node_313->in_free_list));
+
+	for (bpage = UT_LIST_GET_FIRST(buf_pool->free);
+	     bpage != NULL;
+	     bpage = UT_LIST_GET_NEXT(free, bpage)) {
+
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
+	}
+
+	mutex_exit(&free_list_mutex);
+	mutex_enter(&LRU_list_mutex);	/* re-taken for the unzip_LRU checks */
+
+	UT_LIST_VALIDATE(unzip_LRU, buf_block_t, buf_pool->unzip_LRU,
+			 ut_ad(ut_list_node_313->in_unzip_LRU_list
+			       && ut_list_node_313->page.in_LRU_list));
+
+	for (block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
+	     block;
+	     block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
+
+		ut_ad(block->in_unzip_LRU_list);
+		ut_ad(block->page.in_LRU_list);
+		ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
+	}
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list to stderr, one entry per page, under LRU_list_mutex. */
+UNIV_INTERN
+void
+buf_LRU_print(void)
+/*===============*/
+{
+	const buf_page_t*	bpage;
+
+	ut_ad(buf_pool);
+	//buf_pool_mutex_enter();
+	mutex_enter(&LRU_list_mutex);
+
+	bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
+
+	while (bpage != NULL) {
+
+		fprintf(stderr, "BLOCK space %lu page %lu ",
+			(ulong) buf_page_get_space(bpage),
+			(ulong) buf_page_get_page_no(bpage));
+
+		if (buf_page_is_old(bpage)) {
+			fputs("old ", stderr);
+		}
+
+		if (bpage->buf_fix_count) {
+			fprintf(stderr, "buffix count %lu ",
+				(ulong) bpage->buf_fix_count);
+		}
+
+		if (buf_page_get_io_fix(bpage)) {
+			fprintf(stderr, "io_fix %lu ",
+				(ulong) buf_page_get_io_fix(bpage));
+		}
+
+		if (bpage->oldest_modification) {
+			fputs("modif. ", stderr);
+		}
+
+		switch (buf_page_get_state(bpage)) {
+			const byte*	frame;	/* page image to read fields from */
+		case BUF_BLOCK_FILE_PAGE:
+			frame = buf_block_get_frame((buf_block_t*) bpage);
+			fprintf(stderr, "\ntype %lu"
+				" index id %lu\n",
+				(ulong) fil_page_get_type(frame),
+				(ulong) ut_dulint_get_low(
+					btr_page_get_index_id(frame)));
+			break;
+		case BUF_BLOCK_ZIP_PAGE:
+			frame = bpage->zip.data;	/* compressed page data */
+			fprintf(stderr, "\ntype %lu size %lu"
+				" index id %lu\n",
+				(ulong) fil_page_get_type(frame),
+				(ulong) buf_page_get_zip_size(bpage),
+				(ulong) ut_dulint_get_low(
+					btr_page_get_index_id(frame)));
+			break;
+
+		default:	/* any other state: print its numeric value */
+			fprintf(stderr, "\n!state %lu!\n",
+				(ulong) buf_page_get_state(bpage));
+			break;
+		}
+
+		bpage = UT_LIST_GET_NEXT(LRU, bpage);
+	}
+
+	//buf_pool_mutex_exit();
+	mutex_exit(&LRU_list_mutex);
+}
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
diff --git a/storage/xtradb/buf/buf0rea.c b/storage/xtradb/buf/buf0rea.c
new file mode 100644
index 00000000000..59de70d9a8a
--- /dev/null
+++ b/storage/xtradb/buf/buf0rea.c
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file buf/buf0rea.c
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "buf0rea.h"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "ibuf0ibuf.h"
+#include "log0recv.h"
+#include "trx0sys.h"
+#include "os0file.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+
+/** The linear read-ahead area size */
+#define	BUF_READ_AHEAD_LINEAR_AREA	BUF_READ_AHEAD_AREA
+
+/** If there are more pending reads than buf_pool->curr_size divided by the
+number below, then read-ahead is not done: this is to prevent flooding the
+buffer pool with i/o-fixed buffer blocks */
+#define BUF_READ_AHEAD_PEND_LIMIT	2
+
+/********************************************************************//**
+Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+@return 1 if a read request was queued, 0 if the page already resided
+in buf_pool, or if the page is in the doublewrite buffer blocks in
+which case it is never read into the pool, or if the tablespace does
+not exist or is being dropped (see err); in other words, 1 if a read
+request is issued and 0 if it is not */
+UNIV_INTERN
+ulint
+buf_read_page_low(
+/*==============*/
+	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+			trying to read from a non-existent tablespace, or a
+			tablespace which is just now being dropped */
+	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
+	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
+			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+			at read-ahead functions) */
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
+	ib_int64_t tablespace_version, /*!< in: if the space memory object has
+			this timestamp different from what we are giving here,
+			treat the tablespace as dropped; this is a timestamp we
+			use to stop dangling page reads from a tablespace
+			which we have DISCARDed + IMPORTed back */
+	ulint	offset,	/*!< in: page number */
+	trx_t*	trx)	/*!< in: passed on to _fil_io(); callers may give NULL */
+{
+	buf_page_t*	bpage;
+	ulint		wake_later;
+
+	*err = DB_SUCCESS;
+
+	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+	mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+	if (trx_doublewrite
+	    && (space == TRX_SYS_SPACE
+		|| (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
+	    && (   (offset >= trx_doublewrite->block1
+		    && offset < trx_doublewrite->block1
+		    + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+		   || (offset >= trx_doublewrite->block2
+		       && offset < trx_doublewrite->block2
+		       + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: trying to read"
+			" doublewrite buffer page %lu\n",
+			(ulong) offset);
+
+		return(0);
+	}
+
+	if (ibuf_bitmap_page(zip_size, offset)
+	    || trx_sys_hdr_page(space, offset)) {
+
+		/* Trx sys header is so low in the latching order that we play
+		safe and do not leave the i/o-completion to an asynchronous
+		i/o-thread. Ibuf bitmap pages must always be read with
+		synchronous i/o, to make sure they do not get involved in
+		thread deadlocks. */
+
+		sync = TRUE;
+	}
+
+	/* The following call will also check if the tablespace does not exist
+	or is being dropped; if we succeed in initing the page in the buffer
+	pool for read, then DISCARD cannot proceed until the read has
+	completed */
+	bpage = buf_page_init_for_read(err, mode, space, zip_size, unzip,
+				       tablespace_version, offset);
+	if (bpage == NULL) {
+		/* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
+		if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
+			/* hashed log recs must be treated here */
+			recv_addr_t*    recv_addr;
+
+			mutex_enter(&(recv_sys->mutex));
+
+			if (recv_sys->apply_log_recs == FALSE) {
+				mutex_exit(&(recv_sys->mutex));
+				goto not_to_recover;
+			}
+
+			/* recv_get_fil_addr_struct() */
+			recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+					hash_calc_hash(ut_fold_ulint_pair(space, offset),
+						recv_sys->addr_hash));
+			while (recv_addr) {
+				if ((recv_addr->space == space)
+					&& (recv_addr->page_no == offset)) {
+					break;
+				}
+				recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+			}
+
+			if ((recv_addr == NULL)
+			    || (recv_addr->state == RECV_BEING_PROCESSED)
+			    || (recv_addr->state == RECV_PROCESSED)) {
+				mutex_exit(&(recv_sys->mutex));
+				goto not_to_recover;
+			}
+
+			fprintf(stderr, " (cannot find space: %lu)", (ulong) space);
+			recv_addr->state = RECV_PROCESSED;
+
+			ut_a(recv_sys->n_addrs);
+			recv_sys->n_addrs--;
+
+			mutex_exit(&(recv_sys->mutex));
+		}
+not_to_recover:
+
+		return(0);
+	}
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Posting read request for page %lu, sync %lu\n",
+			(ulong) offset,
+			(ulong) sync);
+	}
+#endif
+
+	ut_ad(buf_page_in_file(bpage));
+
+	if (zip_size) {
+		*err = _fil_io(OS_FILE_READ | wake_later,
+			      sync, space, zip_size, offset, 0, zip_size,
+			      bpage->zip.data, bpage, trx);
+	} else {
+		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+
+		*err = _fil_io(OS_FILE_READ | wake_later,
+			      sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
+			      ((buf_block_t*) bpage)->frame, bpage, trx);
+	}
+
+	if (srv_pass_corrupt_table) {
+		if (*err != DB_SUCCESS) {
+			bpage->is_corrupt = TRUE;
+		}
+	} else {
+		ut_a(*err == DB_SUCCESS);
+	}
+
+	if (sync) {
+		/* The i/o is already completed when we arrive from
+		fil_read */
+		buf_page_io_complete(bpage, trx);
+	}
+
+	return(1);
+}
+
+/********************************************************************//**
+High-level function which reads a page from a file to the buffer buf_pool
+if it is not already there, requesting synchronous i/o. Sets the io_fix
+flag and sets an exclusive lock on the buffer frame. The flag is cleared
+and the x-lock released when the i/o is completed.
+@return TRUE if a read request was issued, FALSE otherwise */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset,	/*!< in: page number */
+	trx_t*	trx)	/*!< in: passed on to buf_read_page_low() */
+{
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	ulint		err;
+
+	tablespace_version = fil_space_get_version(space);
+
+	/* We do the i/o in the synchronous aio mode to save thread
+	switches: hence TRUE */
+
+	count = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+				  zip_size, FALSE,
+				  tablespace_version, offset, trx);
+	srv_buf_pool_reads += count;
+	if (err == DB_TABLESPACE_DELETED) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: trying to access"
+			" tablespace %lu page no. %lu,\n"
+			"InnoDB: but the tablespace does not exist"
+			" or is just being dropped.\n",
+			(ulong) space, (ulong) offset);
+	}
+
+	/* Flush pages from the end of the LRU list if necessary */
+	buf_flush_free_margin(FALSE);
+
+	/* Increment number of I/O operations used for LRU policy. */
+	buf_LRU_stat_inc_io();
+
+	return(count > 0);
+}
+
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return	number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset,	/*!< in: page number of a page; NOTE: the current thread
+			must want access to this page (see NOTE 3 above) */
+	trx_t*	trx)	/*!< in: passed on to buf_read_page_low() */
+{
+	ib_int64_t	tablespace_version;
+	buf_page_t*	bpage;
+	buf_frame_t*	frame;
+	buf_page_t*	pred_bpage	= NULL;
+	ulint		pred_offset;
+	ulint		succ_offset;
+	ulint		count;
+	int		asc_or_desc;
+	ulint		new_offset;
+	ulint		fail_count;
+	ulint		ibuf_mode;
+	ulint		low, high;
+	ulint		err;
+	ulint		i;
+	const ulint	buf_read_ahead_linear_area
+		= BUF_READ_AHEAD_LINEAR_AREA;
+	ulint		threshold;
+
+ 	if (!(srv_read_ahead & 2)) {	/* bit value 2 not set: do nothing */
+ 		return(0);
+ 	}
+
+	if (UNIV_UNLIKELY(srv_startup_is_before_trx_rollback_phase)) {
+		/* No read-ahead to avoid thread deadlocks */
+		return(0);
+	}
+
+	low  = (offset / buf_read_ahead_linear_area)
+		* buf_read_ahead_linear_area;
+	high = (offset / buf_read_ahead_linear_area + 1)
+		* buf_read_ahead_linear_area;
+
+	if ((offset != low) && (offset != high - 1)) {
+		/* This is not a border page of the area: return */
+
+		return(0);
+	}
+
+	if (ibuf_bitmap_page(zip_size, offset)
+	    || trx_sys_hdr_page(space, offset)) {
+
+		/* If it is an ibuf bitmap page or trx sys hdr, we do
+		no read-ahead, as that could break the ibuf page access
+		order */
+
+		return(0);
+	}
+
+	/* Remember the tablespace version before we ask the tablespace size
+	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
+	do not try to read outside the bounds of the tablespace! */
+
+	tablespace_version = fil_space_get_version(space);
+
+	//buf_pool_mutex_enter();
+	mutex_enter(&buf_pool_mutex);
+
+	if (high > fil_space_get_size(space)) {
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+		/* The area is not whole, return */
+
+		return(0);
+	}
+
+	if (buf_pool->n_pend_reads
+	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+		//buf_pool_mutex_exit();
+		mutex_exit(&buf_pool_mutex);
+
+		return(0);
+	}
+	mutex_exit(&buf_pool_mutex);
+
+	/* Check that almost all pages in the area have been accessed; if
+	offset == low, the accesses must be in a descending order, otherwise,
+	in an ascending order. */
+
+	asc_or_desc = 1;
+
+	if (offset == low) {
+		asc_or_desc = -1;
+	}
+
+	/* How many out of order accessed pages can we ignore
+	when working out the access pattern for linear readahead */
+	threshold = ut_min((64 - srv_read_ahead_threshold),
+			   BUF_READ_AHEAD_AREA);
+
+	fail_count = 0;
+
+	rw_lock_s_lock(&page_hash_latch);	/* held while pages are inspected */
+	for (i = low; i < high; i++) {
+		bpage = buf_page_hash_get(space, i);
+
+		if ((bpage == NULL) || !buf_page_is_accessed(bpage)) {
+			/* Not accessed */
+			fail_count++;
+
+		} else if (pred_bpage) {
+			/* Note that buf_page_is_accessed() returns
+			the time of the first access.  If some blocks
+			of the extent existed in the buffer pool at
+			the time of a linear access pattern, the first
+			access times may be nonmonotonic, even though
+			the latest access times were linear.  The
+			threshold (srv_read_ahead_threshold) should help
+			a little against this. */
+			int res = ut_ulint_cmp(
+				buf_page_is_accessed(bpage),
+				buf_page_is_accessed(pred_bpage));
+			/* Accesses not in the right order */
+			if (res != 0 && res != asc_or_desc) {
+				fail_count++;
+			}
+		}
+
+		if (fail_count > threshold) {
+			/* Too many failures: return */
+			//buf_pool_mutex_exit();
+			rw_lock_s_unlock(&page_hash_latch);
+			return(0);
+		}
+
+		if (bpage && buf_page_is_accessed(bpage)) {
+			pred_bpage = bpage;
+		}
+	}
+
+	/* If we got this far, we know that enough pages in the area have
+	been accessed in the right order: linear read-ahead can be sensible */
+
+	bpage = buf_page_hash_get(space, offset);
+
+	if (bpage == NULL) {
+		//buf_pool_mutex_exit();
+		rw_lock_s_unlock(&page_hash_latch);
+
+		return(0);
+	}
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+		frame = bpage->zip.data;
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		frame = ((buf_block_t*) bpage)->frame;
+		break;
+	default:
+		ut_error;
+		break;
+	}
+
+	/* Read the natural predecessor and successor page addresses from
+	the page; NOTE that because the calling thread may have an x-latch
+	on the page, we do not acquire an s-latch on the page, this is to
+	prevent deadlocks. Even if we read values which are nonsense, the
+	algorithm will work. */
+
+	pred_offset = fil_page_get_prev(frame);
+	succ_offset = fil_page_get_next(frame);
+
+	//buf_pool_mutex_exit();
+	rw_lock_s_unlock(&page_hash_latch);
+
+	if ((offset == low) && (succ_offset == offset + 1)) {
+
+		/* This is ok, we can continue */
+		new_offset = pred_offset;
+
+	} else if ((offset == high - 1) && (pred_offset == offset - 1)) {
+
+		/* This is ok, we can continue */
+		new_offset = succ_offset;
+	} else {
+		/* Successor or predecessor not in the right order */
+
+		return(0);
+	}
+
+	low  = (new_offset / buf_read_ahead_linear_area)
+		* buf_read_ahead_linear_area;
+	high = (new_offset / buf_read_ahead_linear_area + 1)
+		* buf_read_ahead_linear_area;
+
+	if ((new_offset != low) && (new_offset != high - 1)) {
+		/* This is not a border page of the area: return */
+
+		return(0);
+	}
+
+	if (high > fil_space_get_size(space)) {
+		/* The area is not whole, return */
+
+		return(0);
+	}
+
+	/* If we got this far, read-ahead can be sensible: do it */
+
+	if (ibuf_inside()) {
+		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
+	} else {
+		ibuf_mode = BUF_READ_ANY_PAGE;
+	}
+
+	count = 0;
+
+	/* Since Windows XP seems to schedule the i/o handler thread
+	very eagerly, and consequently it does not wait for the
+	full read batch to be posted, we use special heuristics here */
+
+	os_aio_simulated_put_read_threads_to_sleep();
+
+	for (i = low; i < high; i++) {
+		/* It is only sensible to do read-ahead in the non-sync
+		aio mode: hence FALSE as the second argument */
+
+		if (!ibuf_bitmap_page(zip_size, i)) {
+			count += buf_read_page_low(
+				&err, FALSE,
+				ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER,
+				space, zip_size, FALSE, tablespace_version, i, trx);
+			if (err == DB_TABLESPACE_DELETED) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Warning: in"
+					" linear readahead trying to access\n"
+					"InnoDB: tablespace %lu page %lu,\n"
+					"InnoDB: but the tablespace does not"
+					" exist or is just being dropped.\n",
+					(ulong) space, (ulong) i);
+			}
+		}
+	}
+
+	/* In simulated aio we wake the aio handler threads only after
+	queuing all aio requests, in native aio the following call does
+	nothing: */
+
+	os_aio_simulated_wake_handler_threads();
+
+	/* Flush pages from the end of the LRU list if necessary */
+	buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints && (count > 0)) {
+		fprintf(stderr,
+			"LINEAR read-ahead space %lu offset %lu pages %lu\n",
+			(ulong) space, (ulong) offset, (ulong) count);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Read ahead is considered one I/O operation for the purpose of
+	LRU policy decision. */
+	buf_LRU_stat_inc_io();
+
+	buf_pool->stat.n_ra_pages_read += count;
+	return(count);
+}
+
+/********************************************************************//**
+Posts read requests for the pages that the ibuf module has picked in order
+to contract the insert buffer tree. In effect this behaves like a
+read-ahead routine. Entries whose tablespace turns out to be missing or
+deleted are handed to ibuf_merge_or_delete_for_page() instead of being
+read. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+	ibool		sync,		/*!< in: TRUE if the caller
+					wants this function to wait
+					for the highest address page
+					to get read in, before this
+					function returns */
+	const ulint*	space_ids,	/*!< in: array of space ids */
+	const ib_int64_t* space_versions,/*!< in: the spaces must have
+					this version number
+					(timestamp), otherwise we
+					discard the read; we use this
+					to cancel reads if DISCARD +
+					IMPORT may have changed the
+					tablespace size */
+	const ulint*	page_nos,	/*!< in: array of page numbers
+					to read, with the highest page
+					number the last in the
+					array */
+	ulint		n_stored)	/*!< in: number of elements
+					in the arrays */
+{
+	ulint	n;
+
+	ut_ad(!ibuf_inside());
+#ifdef UNIV_IBUF_DEBUG
+	ut_a(n_stored < UNIV_PAGE_SIZE);
+#endif
+	/* Throttle: sleep in 500000 microsecond steps for as long as the
+	number of pending reads exceeds the read-ahead limit. */
+	while (buf_pool->n_pend_reads
+	       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
+		os_thread_sleep(500000);
+	}
+
+	for (n = 0; n < n_stored; n++) {
+		const ulint	space_id = space_ids[n];
+		const ulint	page_no = page_nos[n];
+		const ulint	zip_size = fil_space_get_zip_size(space_id);
+		ibool		space_is_gone;
+
+		if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+			/* The size lookup failed: skip the read and
+			go straight to the clean-up below. */
+			space_is_gone = TRUE;
+		} else {
+			ulint	err;
+
+			/* Only the last request may be synchronous,
+			and only when the caller asked for it. */
+			buf_read_page_low(&err,
+					  sync && (n + 1 == n_stored),
+					  BUF_READ_ANY_PAGE, space_id,
+					  zip_size, TRUE, space_versions[n],
+					  page_no, NULL);
+
+			space_is_gone = (err == DB_TABLESPACE_DELETED);
+		}
+
+		if (UNIV_UNLIKELY(space_is_gone)) {
+			/* We have deleted or are deleting the single-table
+			tablespace: remove the entries for that page */
+
+			ibuf_merge_or_delete_for_page(NULL, space_id,
+						      page_no,
+						      zip_size, FALSE);
+		}
+	}
+
+	os_aio_simulated_wake_handler_threads();
+
+	/* Flush pages from the end of the LRU list if necessary */
+	buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Ibuf merge read-ahead space %lu pages %lu\n",
+			(ulong) space_ids[0], (ulong) n_stored);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in.
+
+If the tablespace size lookup yields ULINT_UNDEFINED, no reads are issued.
+Instead, while recovery is on and recv_sys->apply_log_recs is set, the
+hashed log record entries of the listed pages are marked RECV_PROCESSED
+and recv_sys->n_addrs is decremented for each of them. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+	ibool		sync,		/*!< in: TRUE if the caller
+					wants this function to wait
+					for the highest address page
+					to get read in, before this
+					function returns */
+	ulint		space,		/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size in
+					bytes, or 0; NOTE: the value
+					passed in is not used, it is
+					overwritten below with the
+					result of
+					fil_space_get_zip_size() */
+	const ulint*	page_nos,	/*!< in: array of page numbers
+					to read, with the highest page
+					number the last in the
+					array */
+	ulint		n_stored)	/*!< in: number of page numbers
+					in the array */
+{
+	ib_int64_t	tablespace_version;
+	ulint		count;
+	ulint		err;
+	ulint		i;
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+		/* It is a single table tablespace and the .ibd file is
+		missing: do nothing */
+
+		/* The log records of these pages have to be dealt with
+		here, for the same reason as in
+		http://bugs.mysql.com/bug.php?id=43948 */
+
+		if (recv_recovery_is_on()) {
+			recv_addr_t*    recv_addr;
+
+			mutex_enter(&(recv_sys->mutex));
+
+			if (recv_sys->apply_log_recs == FALSE) {
+				mutex_exit(&(recv_sys->mutex));
+				goto not_to_recover;
+			}
+
+			for (i = 0; i < n_stored; i++) {
+				/* Open-coded hash lookup; the original
+				comment names recv_get_fil_addr_struct()
+				as the equivalent. */
+				recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
+						hash_calc_hash(ut_fold_ulint_pair(space, page_nos[i]),
+							recv_sys->addr_hash));
+				while (recv_addr) {
+					if ((recv_addr->space == space)
+						&& (recv_addr->page_no == page_nos[i])) {
+						break;
+					}
+					recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+				}
+
+				/* Nothing to do when there is no entry for
+				the page, or when it is already being (or
+				has been) processed. */
+				if ((recv_addr == NULL)
+				    || (recv_addr->state == RECV_BEING_PROCESSED)
+				    || (recv_addr->state == RECV_PROCESSED)) {
+					continue;
+				}
+
+				recv_addr->state = RECV_PROCESSED;
+
+				ut_a(recv_sys->n_addrs);
+				recv_sys->n_addrs--;
+			}
+
+			mutex_exit(&(recv_sys->mutex));
+
+			/* Cast to match the %lu conversion, as every
+			other diagnostic in this file does. */
+			fprintf(stderr, " (cannot find space: %lu)",
+				(ulong) space);
+		}
+not_to_recover:
+
+		return;
+	}
+
+	tablespace_version = fil_space_get_version(space);
+
+	for (i = 0; i < n_stored; i++) {
+
+		count = 0;
+
+		os_aio_print_debug = FALSE;
+
+		/* Wait, in 10000 microsecond steps, until fewer than
+		half of recv_n_pool_free_frames reads are pending. Once
+		more than 1000 steps (10 seconds) have passed, the error
+		below is printed on every further step and aio debug
+		printing is switched on. */
+		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+
+			os_aio_simulated_wake_handler_threads();
+			os_thread_sleep(10000);
+
+			count++;
+
+			if (count > 1000) {
+				fprintf(stderr,
+					"InnoDB: Error: InnoDB has waited for"
+					" 10 seconds for pending\n"
+					"InnoDB: reads to the buffer pool to"
+					" be finished.\n"
+					"InnoDB: Number of pending reads %lu,"
+					" pending pread calls %lu\n",
+					(ulong) buf_pool->n_pend_reads,
+					(ulong)os_file_n_pending_preads);
+
+				os_aio_print_debug = TRUE;
+			}
+		}
+
+		os_aio_print_debug = FALSE;
+
+		/* Only the last page is read synchronously, and only if
+		the caller asked for it; all other requests are queued
+		with the wake-later flag. The error code written to err
+		is not examined. */
+		if ((i + 1 == n_stored) && sync) {
+			buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+					  zip_size, TRUE, tablespace_version,
+					  page_nos[i], NULL);
+		} else {
+			buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+					  | OS_AIO_SIMULATED_WAKE_LATER,
+					  space, zip_size, TRUE,
+					  tablespace_version, page_nos[i], NULL);
+		}
+	}
+
+	os_aio_simulated_wake_handler_threads();
+
+	/* Flush pages from the end of the LRU list if necessary */
+	buf_flush_free_margin(FALSE);
+
+#ifdef UNIV_DEBUG
+	if (buf_debug_prints) {
+		fprintf(stderr,
+			"Recovery applies read-ahead pages %lu\n",
+			(ulong) n_stored);
+	}
+#endif /* UNIV_DEBUG */
+}
diff --git a/storage/xtradb/build/debian/README.Maintainer b/storage/xtradb/build/debian/README.Maintainer
new file mode 100644
index 00000000000..9554bdd7c62
--- /dev/null
+++ b/storage/xtradb/build/debian/README.Maintainer
@@ -0,0 +1,116 @@
+
+###########################
+##	FIXME for 5.1    ##
+###########################
+
+* put this trigger-recreation thing into the init scripts -- what?!
+* Let debian-l10n-english review all template changes before the translators start.
+* Mark debconf translations as obsolete with debconf-updatepo.
+
+###########################################################################
+# Here is some information that is only of interest for the current and   #
+# following Debian maintainers of MySQL.                                  #
+###########################################################################
+
+The debian/ directory is under SVN control, see debian/control for URL.
+
+#
+# Preparing a new version
+#
+The new orig.tar.gz (without non-free documentation) is created in /tmp/ when
+running this command:
+ 
+debian/rules get-orig-source
+
+#
+# mysqlreport
+#
+The author's e-mail address is <public@codenode.com>.
+
+#
+# Remarks to dependencies
+#
+libwrap0-dev (>= 7.6-8.3)
+	According to bug report 114582, there were build problems on
+	IA-64/sid with at least two prior versions.
+psmisc
+	/usr/bin/killall in the initscript
+
+zlib1g in libmysqlclient-dev:	
+	"mysql_config --libs" adds "-lz"
+
+Build-Dep:
+
+debhelper (>=4.1.16):
+	See po-debconf(7).
+
+autoconf (>= 2.13-20), automake1.7
+	Try to get rid of them.
+
+doxygen, tetex-bin, tetex-extra, gs
+	for ndb/docs/*tex
+
+#
+# Remarks to the start scripts
+#
+
+## initscripts rely on mysqladmin from a different package
+We have the problem that "/etc/init.d/mysql stop" relies on mysqladmin which
+is in another package (mysql-client) and a passwordless access that's maybe
+only available if the user configured his /root/.my.cnf. Can this be a problem?
+* normal mode: not because the user is required to have it. Else:
+* purge/remove: not, same as normal mode
+* upgrade: not, same as normal mode
+* first install: not, it depends on mysql-client which at least is unpacked
+                 so mysqladmin is there (to ping). It is not yet configured
+		passwordless but if there's a server running then there's a
+                /root/.my.cnf. Anyways, we simply kill anything that's mysqld.
+
+## Passwordless access for the maintainer scripts
+Another issue is that the scripts need passwordless access. To ensure this
+a debian-sys-maint user is configured which has process and shutdown privs.
+The file with the randomly (that's important!) generated password must be
+present as long as the databases remain installed because else a new install
+would have no access. This file should be used like:
+	mysqladmin --defaults-file=/etc/mysql/debian.cnf restart
+to avoid providing the password in plaintext on a commandline where it would 
+be visible to any user via the "ps" command.
+
+## When to start the daemon?
+We aim to give the admin full control on when MySQL is running.
+Issues to be faced here:
+OLD:
+        1. Debconf asks whether MySQL should be started on boot so update-rc.d is
+           only run if the answer has been yes. The admin is likely to forget
+           this decision but update-rc.d checks for an existing line in
+           /etc/runlevel.conf and leaves it intact.
+        2. On initial install, if the answer is yes, the daemon has to be started.
+        3. On upgrades it should only be started if it was already running, everything
+           else is confusing. Especially relying on a debconf decision made months ago
+           is considered suboptimal. See bug #274264
+        Implementation so far:
+        prerm (called on upgrade before stopping the server): 
+          check for a running server and set flag if necessary
+        preinst (called on initial install and before unpacking when upgrading):
+          check for the debconf variable and set flag if necessary
+        postinst (called on initial install and after each upgrade after unpacking):
+          call update-rc.d if debconf says yes
+          call invoke-rc.d if the flag has been set
+        Problems remaining:
+          dpkg-reconfigure and setting mysql start on boot to yes did not start mysql
+          (ok "start on boot" literally does not mean "start now" so that might have been ok)
+NEW:
+        1. --- no debconf anymore for the sake of simplicity. We have runlevel.conf,
+           the admin should use it
+        2. On initial install the server is started.
+        3. On upgrades the server is started exactly if it was running before so the
+           runlevel configuration is irrelevant. It will be preserved by means of
+           update-rc.d's builtin check.
+        Implementation:
+        prerm (called on upgrade before stopping the server):
+          check for a running server and set flag if necessary
+        preinst (called on initial install and before unpacking when upgrading):
+          check for $1 being (initial) "install" and set flag
+        postinst (called on initial install and after each upgrade after unpacking):
+          call update-rc.d
+          call invoke-rc.d if the flag has been set
diff --git a/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in b/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in
new file mode 100644
index 00000000000..f7316d4e345
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/Docs__Images__Makefile.in
@@ -0,0 +1,6 @@
+# Stub makefile: "all" has no recipe and "distclean" only removes the
+# generated Makefile (the leading "-" tells make to ignore a failing rm).
+all:
+
+distclean:
+	-rm -f Makefile
+
+# clean, install and check have no rules in this file; they are only
+# declared phony.
+.PHONY: all distclean clean install check
diff --git a/storage/xtradb/build/debian/additions/Docs__Makefile.in b/storage/xtradb/build/debian/additions/Docs__Makefile.in
new file mode 100644
index 00000000000..f7316d4e345
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/Docs__Makefile.in
@@ -0,0 +1,6 @@
+# Stub makefile: "all" has no recipe and "distclean" only removes the
+# generated Makefile (the leading "-" tells make to ignore a failing rm).
+all:
+
+distclean:
+	-rm -f Makefile
+
+# clean, install and check have no rules in this file; they are only
+# declared phony.
+.PHONY: all distclean clean install check
diff --git a/storage/xtradb/build/debian/additions/debian-start b/storage/xtradb/build/debian/additions/debian-start
new file mode 100644
index 00000000000..10628019e40
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/debian-start
@@ -0,0 +1,31 @@
+#!/bin/bash
+#
+# This script is executed by "/etc/init.d/mysql" on every (re)start.
+# 
+# Changes to this file will be preserved when updating the Debian package.
+#
+
+# Load the helper functions that are called at the bottom of this script.
+source /usr/share/mysql/debian-start.inc.sh
+
+# Command lines and settings read by the sourced helper functions. The
+# command variables are expanded unquoted there, so that the options split
+# into separate words.
+# NOTE(review): MYUPGRADE uses --defaults-extra-file while the other
+# commands use --defaults-file -- confirm that this is intentional.
+MYSQL="/usr/bin/mysql --defaults-file=/etc/mysql/debian.cnf"
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+MYUPGRADE="/usr/bin/mysql_upgrade --defaults-extra-file=/etc/mysql/debian.cnf"
+MYCHECK="/usr/bin/mysqlcheck --defaults-file=/etc/mysql/debian.cnf"
+MYCHECK_SUBJECT="WARNING: mysqlcheck has found corrupt tables"
+MYCHECK_PARAMS="--all-databases --fast --silent"
+MYCHECK_RCPT="root"
+
+# The following commands should be run when the server is up but in background
+# where they do not block the server start and in one shell instance so that
+# they run sequentially. They are supposed not to echo anything to stdout.
+# If you want to disable the check for crashed tables comment
+# "check_for_crashed_tables" out.  
+# (There may be no output to stdout inside the background process!)
+echo "Checking for corrupt, not cleanly closed and upgrade needing tables."
+# One background subshell; its stdout is redirected to stderr.
+(
+  upgrade_system_tables_if_necessary;
+  check_root_accounts;
+  check_for_crashed_tables;
+) >&2 &
+
+exit 0
diff --git a/storage/xtradb/build/debian/additions/debian-start.inc.sh b/storage/xtradb/build/debian/additions/debian-start.inc.sh
new file mode 100644
index 00000000000..736cb3448eb
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/debian-start.inc.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#
+# This file is included by /etc/mysql/debian-start
+#
+
+## Check all unclosed tables.
+# - Requires the server to be up.
+# - Is supposed to run silently in background.
+# - Reads $MYSQL, $MYADMIN, $MYCHECK_SUBJECT and $MYCHECK_RCPT, which the
+#   including script has to set.
+# - Turns on "set -e" and "set -u" for the rest of the calling shell.
+function check_for_crashed_tables() {
+  set -e
+  set -u
+
+  # But do it in the background to not stall the boot process.
+  logger -p daemon.info -i -t"$0" "Triggering myisam-recover for all MyISAM tables"
+
+  # Checking for $? is unreliable so the size of the output is checked.
+  # Some table handlers like HEAP do not support CHECK TABLE.
+  # mktemp replaces the Debian-only, deprecated "tempfile" utility.
+  tempfile=$(mktemp)
+  # We have to use xargs in this case, because a for loop barfs on the 
+  # spaces in the thing to be looped over. 
+  # "-I {}" is the supported spelling of the deprecated "-i" option.
+  # $MYSQL is deliberately unquoted: it holds a command plus its options.
+  LC_ALL=C $MYSQL --skip-column-names --batch -e  '
+      select concat("select count(*) into @discard from `",
+                    TABLE_SCHEMA, "`.`", TABLE_NAME, "`") 
+      from information_schema.TABLES where ENGINE="MyISAM"' | \
+    xargs -I '{}' $MYSQL --skip-column-names --silent --batch \
+                    --force -e "{}" >"$tempfile"
+  # Any output at all means that at least one table produced a message.
+  if [ -s "$tempfile" ]; then
+    (
+      /bin/echo -e "\n" \
+        "Improperly closed tables are also reported if clients are accessing\n" \
+ 	"the tables *now*. A list of current connections is below.\n";
+       $MYADMIN processlist status
+    ) >> "$tempfile"
+    # Check for presence as a dependency on mailx would require an MTA.
+    if [ -x /usr/bin/mailx ]; then 
+      mailx -e -s"$MYCHECK_SUBJECT" $MYCHECK_RCPT < "$tempfile"
+    fi
+    (echo "$MYCHECK_SUBJECT"; cat "$tempfile") | logger -p daemon.warn -i -t"$0"
+  fi
+  rm -f "$tempfile"
+}
+
+## Check for tables needing an upgrade.
+# - Requires the server to be up.
+# - Is supposed to run silently in background.
+# - Reads $MYUPGRADE, which the including script has to set.
+# - Turns on "set -e" and "set -u" for the rest of the calling shell.
+function upgrade_system_tables_if_necessary() {
+  set -e
+  set -u
+
+  logger -p daemon.info -i -t"$0" "Upgrading MySQL tables if necessary."
+
+  # Filter all "duplicate column", "duplicate key" and "unknown column"
+  # errors as the script is designed to be idempotent.
+  # "grep -E" replaces the obsolescent "egrep" spelling; the pattern is
+  # unchanged. Whatever survives the filter is logged as a warning.
+  LC_ALL=C $MYUPGRADE \
+    2>&1 \
+    | grep -E -v '^(1|@had|ERROR (1054|1060|1061))' \
+    | logger -p daemon.warn -i -t"$0"
+}
+
+## Check for the presence of both, root accounts with and without password.
+# This might have been caused by a bug related to mysql_install_db (#418672).
+# - Reads $MYSQL, which the including script has to set.
+# - Turns on "set -e" and "set -u" for the rest of the calling shell.
+# - Leaves the query result in the non-local variable "ret".
+function check_root_accounts() {
+  set -e
+  set -u
+  
+  logger -p daemon.info -i -t$0 "Checking for insecure root accounts."
+
+  # Count the rows in mysql.user for user 'root' with an empty password.
+  ret=$( echo "SELECT count(*) FROM mysql.user WHERE user='root' and password='';" | $MYSQL --skip-column-names )
+  # Only a non-zero count is reported.
+  if [ "$ret" -ne "0" ]; then
+    logger -p daemon.warn -i -t$0 "WARNING: mysql.user contains $ret root accounts without password!"
+  fi
+}
diff --git a/storage/xtradb/build/debian/additions/echo_stderr b/storage/xtradb/build/debian/additions/echo_stderr
new file mode 100644
index 00000000000..67b3ed7cfb3
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/echo_stderr
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "$*" 1>&2
diff --git a/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm b/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm
new file mode 100644
index 00000000000..3aaa7acd5b8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/InnoDBParser.pm
@@ -0,0 +1,1089 @@
+use strict;
+use warnings FATAL => 'all';
+
+package InnoDBParser;
+
+# This program is copyright (c) 2006 Baron Schwartz, baron at xaprb dot com.
+# Feedback and improvements are gratefully received.
+#
+# THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+# WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+# MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
+# systems, you can issue `man perlgpl' or `man perlartistic' to read these
+# licenses.
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA  02111-1307  USA
+
+our $VERSION = '1.6.0';
+
+use Data::Dumper;
+$Data::Dumper::Sortkeys = 1;
+use English qw(-no_match_vars);
+use List::Util qw(max);
+
+# Some common patterns.  Each one is a precompiled regex fragment with its
+# own capture group(s), meant to be interpolated into larger patterns.
+my $d  = qr/(\d+)/;                    # One or more digits
+my $f  = qr/(\d+\.\d+)/;               # Float: digits, a dot, digits
+my $t  = qr/(\d+ \d+)/;                # Transaction ID: two numbers separated by a space
+my $i  = qr/((?:\d{1,3}\.){3}\d+)/;    # IP address (dotted quad)
+my $n  = qr/([^`\s]+)/;                # MySQL object name: no backticks or whitespace
+my $w  = qr/(\w+)/;                    # One word
+my $fl = qr/([\w\.\/]+) line $d/;      # Filename and line number (two captures)
+my $h  = qr/((?:0x)?[0-9a-f]*)/;       # Hex: optional 0x, lowercase digits, may be empty
+my $s  = qr/(\d{6} .\d:\d\d:\d\d)/;    # InnoDB timestamp, e.g. "101125 18:17:28"
+
+# Maps each section title, as it appears in the monitor output, to the
+# two-letter abbreviation used as the key for that section's parsed data.
+# If you update this variable, also update the SYNOPSIS in the pod.
+my %innodb_section_headers = (
+   "TRANSACTIONS"                          => "tx",
+   "BUFFER POOL AND MEMORY"                => "bp",
+   "SEMAPHORES"                            => "sm",
+   "LOG"                                   => "lg",
+   "ROW OPERATIONS"                        => "ro",
+   "INSERT BUFFER AND ADAPTIVE HASH INDEX" => "ib",
+   "FILE I/O"                              => "io",
+   "LATEST DETECTED DEADLOCK"              => "dl",
+   "LATEST FOREIGN KEY ERROR"              => "fk",
+);
+
+# Dispatch table: section abbreviation => the sub that parses that section.
+my %parser_for = (
+   tx => \&parse_tx_section,
+   bp => \&parse_bp_section,
+   sm => \&parse_sm_section,
+   lg => \&parse_lg_section,
+   ro => \&parse_ro_section,
+   ib => \&parse_ib_section,
+   io => \&parse_io_section,
+   dl => \&parse_dl_section,
+   fk => \&parse_fk_section,
+);
+
+# Dispatch table for the foreign key section, keyed by the first word that
+# follows the timestamp in that section (see parse_fk_section).
+my %fk_parser_for = (
+   Transaction => \&parse_fk_transaction_error,
+   Error       => \&parse_fk_bad_constraint_error,
+   Cannot      => \&parse_fk_cant_drop_parent_error,
+);
+
+# A thread's proc_info can be at least 98 different things I've found in the
+# source.  Fortunately, most of them begin with a gerund (a verb ending in
+# "-ing").  These are the ones that don't; the hash is used as a lookup set,
+# so only the keys matter.
+my %is_proc_info = (
+   'After create'                 => 1,
+   'Execution of init_command'    => 1,
+   'FULLTEXT initialization'      => 1,
+   'Reopen tables'                => 1,
+   'Repair done'                  => 1,
+   'Repair with keycache'         => 1,
+   'System lock'                  => 1,
+   'Table lock'                   => 1,
+   'Thread initialized'           => 1,
+   'User lock'                    => 1,
+   'copy to tmp table'            => 1,
+   'discard_or_import_tablespace' => 1,
+   'end'                          => 1,
+   'got handler lock'             => 1,
+   'got old table'                => 1,
+   'init'                         => 1,
+   'key cache'                    => 1,
+   'locks'                        => 1,
+   'malloc'                       => 1,
+   'query end'                    => 1,
+   'rename result table'          => 1,
+   'rename'                       => 1,
+   'setup'                        => 1,
+   'statistics'                   => 1,
+   'status'                       => 1,
+   'table cache'                  => 1,
+   'update'                       => 1,
+);
+
+# Constructor: returns a new, empty object blessed into the given class.
+sub new {
+   my ( $class ) = @_;
+   my $self = {};
+   return bless $self, $class;
+}
+
+# Parse the status and return it.
+# See srv_printf_innodb_monitor in innobase/srv/srv0srv.c
+# Pass in the text to parse, whether to be in debugging mode, which sections
+# to parse (hashref; if empty, parse all), and whether to parse full info from
+# locks and such (probably shouldn't unless you need to).
+#
+# Returns a hashref with the keys got_all, ts, timestring, last_secs and
+# sections (plus fulltext in debugging mode).  'sections' is keyed by the
+# two-letter section abbreviation; sections that were not requested, or whose
+# parser returned false, are removed from it.
+#
+# Dies if $fulltext is undef, if $sections is not a hashref, or if the
+# "INNODB MONITOR OUTPUT" header timestamp cannot be parsed.  Errors raised
+# while splitting or parsing the individual sections are caught and passed
+# to _debug().
+sub parse_status_text {
+   my ( $self, $fulltext, $debug, $sections, $full ) = @_;
+
+   die "I can't parse undef" unless defined $fulltext;
+   $fulltext =~ s/[\r\n]+/\n/g;
+
+   $sections ||= {};
+   die '$sections must be a hashref' unless ref($sections) eq 'HASH';
+
+   my %innodb_data = (
+      got_all   => 0,         # Whether I was able to get the whole thing
+      ts        => '',        # Timestamp the server put on it
+      last_secs => 0,         # Num seconds the averages are over
+      sections  => {},        # Parsed values from each section
+   );
+
+   if ( $debug ) {
+      $innodb_data{'fulltext'} = $fulltext;
+   }
+
+   # Get the most basic info about the status: beginning and end, and whether
+   # I got the whole thing (if there has been a big deadlock and there are
+   # too many locks to print, the output might be truncated)
+   my ( $time_text ) = $fulltext =~ m/^$s INNODB MONITOR OUTPUT$/m;
+   $innodb_data{'ts'} = [ parse_innodb_timestamp( $time_text ) ];
+   $innodb_data{'timestring'} = ts_to_string($innodb_data{'ts'});
+   ( $innodb_data{'last_secs'} ) = $fulltext
+      =~ m/Per second averages calculated from the last $d seconds/;
+
+   ( my $got_all ) = $fulltext =~ m/END OF INNODB MONITOR OUTPUT/;
+   $innodb_data{'got_all'} = $got_all || 0;
+
+   # Split it into sections.  Each section begins with
+   # -----
+   # LABEL
+   # -----
+   # Every match contributes four captures: the opening rule, the label, the
+   # section text, and the rule of the following section (undef at the end of
+   # the input), which doubles as the "section is complete" marker.
+   my %innodb_sections;
+   my @matches = $fulltext
+      =~ m#\n(---+)\n([A-Z /]+)\n\1\n(.*?)(?=\n(---+)\n[A-Z /]+\n\4\n|$)#gs;
+   while ( my ( $start, $name, $text, $end ) = splice(@matches, 0, 4) ) {
+      $innodb_sections{$name} = [ $text, $end ? 1 : 0 ];
+   }
+   # The Row Operations section is a special case, because instead of ending
+   # with the beginning of another section, it ends with the end of the file.
+   # So this section is complete if the entire file is complete.
+   $innodb_sections{'ROW OPERATIONS'}->[1] ||= $innodb_data{'got_all'};
+
+   # Just for sanity's sake, make sure I understand what to do with each
+   # section
+   eval {
+      foreach my $section ( keys %innodb_sections ) {
+         my $header = $innodb_section_headers{$section};
+         die "Unknown section $section in $fulltext\n"
+            unless $header;
+         $innodb_data{'sections'}->{ $header }
+            ->{'fulltext'} = $innodb_sections{$section}->[0];
+         $innodb_data{'sections'}->{ $header }
+            ->{'complete'} = $innodb_sections{$section}->[1];
+      }
+   };
+   if ( $EVAL_ERROR ) {
+      _debug( $debug, $EVAL_ERROR);
+   }
+
+   # ################################################################
+   # Parse the detailed data out of the sections.
+   # ################################################################
+   eval {
+      foreach my $section ( keys %parser_for ) {
+         # Parse a section only if it is present and either no selection was
+         # given at all or this section was selected with a true value.  (The
+         # test used to read defined( A && A ), which is also true for a
+         # section explicitly set to 0.)
+         if ( defined $innodb_data{'sections'}->{$section}
+               && ( !%$sections || $sections->{$section} ) ) {
+            $parser_for{$section}->(
+                  $innodb_data{'sections'}->{$section},
+                  $innodb_data{'sections'}->{$section}->{'complete'},
+                  $debug,
+                  $full )
+               or delete $innodb_data{'sections'}->{$section};
+         }
+         else {
+            delete $innodb_data{'sections'}->{$section};
+         }
+      }
+   };
+   if ( $EVAL_ERROR ) {
+      _debug( $debug, $EVAL_ERROR);
+   }
+
+   return \%innodb_data;
+}
+
+# Parses the status text and returns it flattened out as one list of
+# key => value pairs, ready to be assigned to a hash.  Top-level values get
+# the prefix IB_, values from a section get IB_<section abbreviation>_.
+sub get_status_hash {
+   my ( $self, $fulltext, $debug, $sections, $full ) = @_;
+
+   # Parse the status text...
+   my $parsed
+      = $self->parse_status_text($fulltext, $debug, $sections, $full );
+
+   # ...then pick the desired values out of the hierarchical structure.
+   # Each entry is a section abbreviation followed by the fields exported
+   # from that section, in output order.
+   my @exports = (
+      [ bp => [ qw( writes_pending buf_pool_hit_rate total_mem_alloc buf_pool_reads
+            awe_mem_alloc pages_modified writes_pending_lru page_creates_sec
+            reads_pending pages_total buf_pool_hits writes_pending_single_page
+            page_writes_sec pages_read pages_written page_reads_sec
+            writes_pending_flush_list buf_pool_size add_pool_alloc
+            dict_mem_alloc pages_created buf_free complete ) ] ],
+      [ tx => [ qw( num_lock_structs history_list_len purge_done_for transactions
+            purge_undo_for is_truncated trx_id_counter complete ) ] ],
+      [ ib => [ qw( hash_table_size hash_searches_s non_hash_searches_s
+            bufs_in_node_heap used_cells size free_list_len seg_size inserts
+            merged_recs merges complete ) ] ],
+      [ lg => [ qw( log_ios_done pending_chkp_writes last_chkp log_ios_s
+            log_flushed_to log_seq_no pending_log_writes complete ) ] ],
+      [ sm => [ qw( wait_array_size rw_shared_spins rw_excl_os_waits mutex_os_waits
+            mutex_spin_rounds mutex_spin_waits rw_excl_spins rw_shared_os_waits
+            waits signal_count reservation_count complete ) ] ],
+      [ ro => [ qw( queries_in_queue n_reserved_extents main_thread_state
+            main_thread_proc_no main_thread_id read_sec del_sec upd_sec ins_sec
+            read_views_open num_rows_upd num_rows_ins num_rows_read
+            queries_inside num_rows_del complete ) ] ],
+      [ fk => [ qw( trigger parent_table child_index parent_index attempted_op
+            child_db timestring fk_name records col_name reason txn parent_db
+            type child_table parent_col complete ) ] ],
+      [ io => [ qw( pending_buffer_pool_flushes pending_pwrites pending_preads
+            pending_normal_aio_reads fsyncs_s os_file_writes pending_sync_ios
+            reads_s flush_type avg_bytes_s pending_ibuf_aio_reads writes_s
+            threads os_file_reads pending_aio_writes pending_log_ios os_fsyncs
+            pending_log_flushes complete ) ] ],
+      [ dl => [ qw( timestring rolled_back txns complete ) ] ],
+   );
+
+   my @flat = map { ( 'IB_' . $_ => $parsed->{$_} ) }
+      qw(timestring last_secs got_all);
+
+   foreach my $export ( @exports ) {
+      my ( $abbr, $fields ) = @{$export};
+      push @flat,
+         map { ( 'IB_' . $abbr . '_' . $_
+                  => $parsed->{'sections'}->{$abbr}->{$_} ) } @{$fields};
+   }
+
+   return @flat;
+
+}
+
+# Formats a timestamp, given as an arrayref of six numbers (year, month,
+# day, hour, minute, second), as "YYYY-MM-DD HH:MM:SS".
+sub ts_to_string {
+   my ( $parts ) = @_;
+   my $layout = '%02d-%02d-%02d %02d:%02d:%02d';
+   return sprintf( $layout, @{$parts} );
+}
+
+# Splits an InnoDB timestamp such as "101125 18:17:28" into the list
+# ( year, month, day, hour, minute, second ), with 2000 added to the
+# two-digit year.  Dies if the text does not look like a timestamp.
+sub parse_innodb_timestamp {
+   my ( $text ) = @_;
+   if ( $text =~ m/^(\d\d)(\d\d)(\d\d) +(\d+):(\d+):(\d+)$/ ) {
+      return ( $1 + 2000, $2, $3, $4, $5, $6 );
+   }
+   die("Can't get timestamp from $text\n");
+}
+
+# Parses the LATEST FOREIGN KEY ERROR section in place.  Stores the
+# timestamp (ts, timestring) and the error type in $section, then hands the
+# text to the type-specific parser from %fk_parser_for, if there is one.
+# Returns 0 if the section has no text and 1 otherwise.
+sub parse_fk_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+
+   my $fulltext = $section->{'fulltext'};
+   if ( !$fulltext ) {
+      return 0;
+   }
+
+   # The section starts with a timestamp followed by the word that names
+   # the kind of error.
+   my ( $stamp, $kind ) = $fulltext =~ m/^$s\s+(\w+)/m;
+   my @when = parse_innodb_timestamp( $stamp );
+   $section->{'ts'}         = \@when;
+   $section->{'timestring'} = ts_to_string( \@when );
+   $section->{'type'}       = $kind;
+
+   # Decide which type of FK error happened, and dispatch to the right parser.
+   my $handler = $kind ? $fk_parser_for{$kind} : undef;
+   if ( $handler ) {
+      $handler->( $section, $complete, $debug, $fulltext, $full );
+   }
+
+   if ( !$debug ) {
+      delete $section->{'fulltext'};
+   }
+
+   return 1;
+}
+
+# Handles the "Cannot <op> table ..." kind of foreign key error: fills in
+# the attempted operation, the parent and child tables and the reason.
+sub parse_fk_cant_drop_parent_error {
+   my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+   # Parent table and the operation that was attempted on it
+   my @parent = $fulltext =~ m{Cannot $w table `(.*)/(.*)`}m;
+   @{$section}{ qw(attempted_op parent_db parent_table) } = @parent;
+
+   # Child table that references the parent
+   my @child = $fulltext =~ m{because it is referenced by `(.*)/(.*)`}m;
+   @{$section}{ qw(child_db child_table) } = @child;
+
+   # The reason is everything from "Cannot" onwards, folded onto one line.
+   my ( $why ) = $fulltext =~ m/(Cannot .*)/s;
+   if ( $why ) {
+      $why =~ s/\n(?:InnoDB: )?/ /gm;
+   }
+   $section->{'reason'} = $why;
+
+   # Certain data may not be present.  Make them '' if not present.
+   my @optional = qw(child_index fk_name col_name parent_col);
+   return map { $section->{$_} ||= "" } @optional;
+}
+
+# Handles the "Error in foreign key constraint of table ..." kind of error.
+# See dict/dict0dict.c, function dict_foreign_error_report
+# These messages come from someone trying to create a foreign key, or from
+# similar statements, not from a transaction trying to insert, delete or
+# update data, so I don't care much about them.  There are lots of different
+# messages.  Sometimes a lot can be parsed out about the tables and indexes
+# involved, but often the message contains the DDL string the user entered,
+# which is way too much for this module to try to handle.
+sub parse_fk_bad_constraint_error {
+   my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+   # Child table; the attempted operation is always reported as DDL.
+   my @child = $fulltext
+      =~ m{Error in foreign key constraint of table (.*)/(.*):$}m;
+   @{$section}{ qw(child_db child_table) } = @child;
+   $section->{'attempted_op'} = 'DDL';
+
+   # FK name, parent info... if possible.
+   my @named = $fulltext
+      =~ m/CONSTRAINT `?$n`? FOREIGN KEY \(`?$n`?\) REFERENCES (?:`?$n`?\.)?`?$n`? \(`?$n`?\)/;
+   @{$section}{ qw(fk_name col_name parent_db parent_table parent_col) }
+      = @named;
+
+   unless ( defined $section->{'fk_name'} ) {
+      # Try to parse SQL a user might have typed in a CREATE statement or such
+      my @unnamed = $fulltext
+         =~ m/FOREIGN\s+KEY\s*\(`?$n`?\)\s+REFERENCES\s+(?:`?$n`?\.)?`?$n`?\s*\(`?$n`?\)/i;
+      @{$section}{ qw(col_name parent_db parent_table parent_col) }
+         = @unnamed;
+   }
+
+   # Without an explicit parent database, assume the child's database.
+   $section->{'parent_db'} ||= $section->{'child_db'};
+
+   # Name of the child index (index in the same table where the FK is, see
+   # definition of dict_foreign_struct in include/dict0mem.h, where it is
+   # called foreign_index, as opposed to referenced_index which is in the
+   # parent table).  This may not be possible to find.
+   ( $section->{'child_index'} ) = $fulltext
+      =~ m/^The index in the foreign key in table is $n$/m;
+
+   # The reason, with all runs of whitespace collapsed to single spaces.
+   my ( $why ) = $fulltext =~ m/:\s*([^:]+)(?= Constraint:|$)/ms;
+   if ( $why ) {
+      $why =~ s/\s+/ /g;
+   }
+   $section->{'reason'} = $why;
+
+   # Certain data may not be present.  Make them '' if not present.
+   my @optional = qw(child_index fk_name col_name parent_table parent_col);
+   return map { $section->{$_} ||= "" } @optional;
+}
+
+# see source file row/row0ins.c
+# Parses the text of a foreign key error that was raised inside a running
+# transaction and stores what it finds in the $section hashref.
+#
+# Arguments:
+#   $section   hashref that receives the parsed values
+#   $complete  passed through unchanged to the helper parsers
+#   $debug     passed through unchanged to the helper parsers
+#   $fulltext  text of the foreign key error section
+#   $full      when true, related record dumps are parsed into 'records'
+#
+# Keys written (as far as the text allows): txn, reason, child_db,
+# child_table, fk_name, col_name, parent_db, parent_table, parent_col,
+# child_index and/or parent_index, attempted_op, records and trigger.
+# NOTE(review): $n is defined outside this sub; it is assumed to be a regex
+# fragment with exactly one capture group -- confirm at its definition.
+sub parse_fk_transaction_error {
+   my ( $section, $complete, $debug, $fulltext, $full ) = @_;
+
+   # Parse the txn info out
+   my ( $txn ) = $fulltext
+      =~ m/Transaction:\n(TRANSACTION.*)\nForeign key constraint fails/s;
+   if ( $txn ) {
+      $section->{'txn'} = parse_tx_text( $txn, $complete, $debug, $full );
+   }
+
+   # Parse the parent/child table and index info out.  There are two types: an
+   # update or a delete of a parent record leaves a child orphaned
+   # (row_ins_foreign_report_err), and an insert or update of a child record has
+   # no matching parent record (row_ins_foreign_report_add_err).
+
+   @{$section}{ qw(reason child_db child_table) }
+      = $fulltext =~ m{^(Foreign key constraint fails for table `(.*)/(.*)`:)$}m;
+
+   @{$section}{ qw(fk_name col_name parent_db parent_table parent_col) }
+      = $fulltext
+      =~ m/CONSTRAINT `$n` FOREIGN KEY \(`$n`\) REFERENCES (?:`$n`\.)?`$n` \(`$n`\)/;
+   # When the REFERENCES clause names no database, fall back to the child's.
+   $section->{'parent_db'} ||= $section->{'child_db'};
+
+   # Special case, which I don't know how to trigger, but see
+   # innobase/row/row0ins.c row_ins_check_foreign_constraint
+   if ( $fulltext =~ m/ibd file does not currently exist!/ ) {
+      # The tuple dump after the index name is optional.  (The previous
+      # pattern had an unbalanced ')' and died as soon as this branch ran.)
+      my ( $attempted_op, $index, $records )
+         = $fulltext =~ m/^Trying to (add to index) `$n`(?: tuple:\n(.*))?/sm;
+      $section->{'child_index'} = $index;
+      $section->{'attempted_op'} = $attempted_op || '';
+      if ( $records && $full ) {
+         ( $section->{'records'} )
+            = parse_innodb_record_dump( $records, $complete, $debug );
+      }
+      # Match against $fulltext (the previous code bound the regex to the
+      # hash slice itself, which captured nothing).  Only overwrite the
+      # parent info taken from the CONSTRAINT line when this message really
+      # names the parent table.
+      my ( $parent_db, $parent_table )
+         = $fulltext =~ m/^But the parent table `$n`\.`$n`$/m;
+      if ( defined $parent_table ) {
+         @{$section}{qw(parent_db parent_table)} = ( $parent_db, $parent_table );
+      }
+   }
+   else {
+      my ( $attempted_op, $which, $index )
+         = $fulltext =~ m/^Trying to ([\w ]*) in (child|parent) table, in index `$n` tuple:$/m;
+      if ( $which ) {
+         # $which is 'child' or 'parent', so this sets child_index or
+         # parent_index.
+         $section->{$which . '_index'} = $index;
+         $section->{'attempted_op'} = $attempted_op || '';
+
+         # Parse out the related records in the other table.
+         my ( $search_index, $records );
+         if ( $which eq 'child' ) {
+            ( $search_index, $records ) = $fulltext
+               =~ m/^But in parent table [^,]*, in index `$n`,\nthe closest match we can find is record:\n(.*)/ms;
+            $section->{'parent_index'} = $search_index;
+         }
+         else {
+            ( $search_index, $records ) = $fulltext
+               =~ m/^But in child table [^,]*, in index `$n`, (?:the record is not available|there is a record:\n(.*))?/ms;
+            $section->{'child_index'} = $search_index;
+         }
+         if ( $records && $full ) {
+            $section->{'records'}
+               = parse_innodb_record_dump( $records, $complete, $debug );
+         }
+         else {
+            $section->{'records'} = '';
+         }
+      }
+   }
+
+   # Parse out the tuple trying to be updated, deleted or inserted.
+   my ( $trigger ) = $fulltext =~ m/^(DATA TUPLE: \d+ fields;\n.*)$/m;
+   if ( $trigger ) {
+      $section->{'trigger'} = parse_innodb_record_dump( $trigger, $complete, $debug );
+   }
+
+   # Certain data may not be present.  Make them '' if not present.
+   $section->{$_} ||= ""
+      for qw(child_index fk_name col_name parent_table parent_col);
+}
+
+# There are new-style and old-style record formats.  See rem/rem0rec.c
+# TODO: write some tests for this
+# Parses the text of one InnoDB record dump into a hashref.  Returns undef
+# when $dump is empty.  The result carries the keys style ('new', 'old' or
+# 'tuple'), type, heap_no, num_fields, byte_offset, info_bits and fields (an
+# arrayref holding one parse_field() result per field found in the dump).
+# $complete and $debug are only handed on to parse_field().
+sub parse_innodb_record_dump {
+   my ( $dump, $complete, $debug ) = @_;
+   return undef if !$dump;
+
+   my %record;
+
+   if ( $dump !~ m/PHYSICAL RECORD/ ) {
+      # Anything that is not a physical record is treated as a data tuple.
+      $record{'style'} = 'tuple';
+      @record{qw( type num_fields )}
+         = $dump =~ m/^(DATA TUPLE): $d fields;$/m;
+   }
+   elsif ( $dump =~ m/compact format/ ) {
+      # This is a new-style record.
+      $record{'style'} = 'new';
+      @record{qw( heap_no type num_fields info_bits )}
+         = $dump
+         =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; compact format; info bits $d$/m;
+   }
+   elsif ( $dump =~ m/-byte offs / ) {
+      # Old style, older variation ("N-byte offs TRUE/FALSE").
+      $record{'style'} = 'old';
+      @record{qw( heap_no type num_fields byte_offset info_bits )}
+         = $dump
+         =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; $d-byte offs [A-Z]+; info bits $d$/m;
+      # The offset only counts when the flag after "offs" is TRUE.
+      $record{'byte_offset'} = 0
+         unless $dump =~ m/-byte offs TRUE/;
+   }
+   else {
+      # Old style, newer variation ("N-byte offsets").
+      $record{'style'} = 'old';
+      @record{qw( heap_no type num_fields byte_offset info_bits )}
+         = $dump
+         =~ m/^(?:Record lock, heap no $d )?([A-Z ]+): n_fields $d; $d-byte offsets; info bits $d$/m;
+   }
+
+   # Fill in default values for things that couldn't be parsed.
+   foreach my $numeric ( qw(heap_no num_fields byte_offset info_bits) ) {
+      $record{$numeric} ||= 0;
+   }
+   foreach my $textual ( qw(style type ) ) {
+      $record{$textual} ||= '';
+   }
+
+   # Pull out the individual "N: ..." fields and parse each of them.
+   my @raw_fields = $dump =~ m/ (\d+:.*?;?);(?=$| \d+:)/gm;
+   my @parsed_fields;
+   foreach my $raw ( @raw_fields ) {
+      push @parsed_fields, parse_field( $raw, $complete, $debug );
+   }
+   $record{'fields'} = \@parsed_fields;
+
+   return \%record;
+}
+
+# New/old-style applies here.  See rem/rem0rec.c
+# $text should not include the leading space or the second trailing semicolon.
+# Parses one field of a record dump and returns a hashref with the keys
+# id, len, hex, asc and trunc.  Dies when the text matches none of the three
+# known layouts.  $complete and $debug are accepted but not used here.
+sub parse_field {
+   my ( $text, $complete, $debug ) = @_;
+
+   # Sample fields:
+   # '4: SQL NULL, size 4 '
+   # '1: len 6; hex 000000005601; asc     V ;'
+   # '6: SQL NULL'
+   # '5: len 30; hex 687474703a2f2f7777772e737765657477617465722e636f6d2f73746f72; asc http://www.sweetwater.com/stor;...(truncated)'
+   my ( $id, $nullsize, $len, $hex, $asc, $truncated );
+
+   # Try the layouts in turn: NULL with a size, bare NULL, then real data.
+   ( $id, $nullsize ) = $text =~ m/^$d: SQL NULL, size $d $/;
+   unless ( defined $id ) {
+      ( $id ) = $text =~ m/^$d: SQL NULL$/;
+   }
+   unless ( defined $id ) {
+      ( $id, $len, $hex, $asc, $truncated )
+         = $text =~ m/^$d: len $d; hex $h; asc (.*);(\.\.\.\(truncated\))?$/;
+   }
+
+   defined $id
+      or die "Could not parse this field: '$text'";
+
+   # Prefer the real length, then the size reported for a NULL, then 0.
+   my $length = 0;
+   if ( defined $len ) {
+      $length = $len;
+   }
+   elsif ( defined $nullsize ) {
+      $length = $nullsize;
+   }
+
+   my %field = (
+      id    => $id,
+      len   => $length,
+      'hex' => ( defined $hex ? $hex : '' ),
+      asc   => ( defined $asc ? $asc : '' ),
+      trunc => ( $truncated ? 1 : 0 ),
+   );
+   return \%field;
+
+}
+
+# Parses the deadlock section held in $dl->{'fulltext'} and fills $dl in
+# place with ts, timestring, txns (keyed by the transaction number from the
+# "*** (N) ..." headers) and rolled_back.  Returns nothing when $dl is false,
+# 0 when there is no text or no timestamp, and 1 on success.  The fulltext
+# key is removed unless $debug is set.
+sub parse_dl_section {
+   my ( $dl, $complete, $debug, $full ) = @_;
+   return if !$dl;
+   my $text = $dl->{'fulltext'};
+   return 0 if !$text;
+
+   my ( $timestamp ) = $text =~ m/^$s$/m;
+   return 0 if !$timestamp;
+
+   $dl->{'ts'} = [ parse_innodb_timestamp( $timestamp ) ];
+   $dl->{'timestring'} = ts_to_string($dl->{'ts'});
+   $dl->{'txns'} = {};
+
+   # The match yields a flat list of (header, body) pairs.
+   my @pairs
+      = $text
+      =~ m{
+         ^\*{3}\s([^\n]*)  # *** (1) WAITING FOR THIS...
+         (.*?)             # Followed by anything, non-greedy
+         (?=(?:^\*{3})|\z) # Followed by another three stars or EOF
+      }gmsx;
+
+   # Loop through each section.  There are no assumptions about how many
+   # there are, who holds and wants what locks, and who gets rolled back.
+   while ( @pairs ) {
+      my ( $header, $body ) = splice(@pairs, 0, 2);
+      my ( $txn_id, $what ) = $header =~ m/^\($d\) (.*):$/;
+      next if !$txn_id;
+
+      my $entry = $dl->{'txns'}->{$txn_id} ||= {};
+
+      if ( $what ne 'TRANSACTION' ) {
+         # Everything that is not the transaction itself describes locks.
+         push @{$entry->{'locks'}}, parse_innodb_record_locks( $body, $complete, $debug, $full );
+      }
+      else {
+         $entry->{'tx'} = parse_tx_text( $body, $complete, $debug, $full );
+      }
+   }
+
+   ( $dl->{'rolled_back'} )
+      = $text =~ m/^\*\*\* WE ROLL BACK TRANSACTION \($d\)$/m;
+
+   # Make sure certain values aren't undef
+   $dl->{'rolled_back'} ||= '';
+
+   delete $dl->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# Finds every "RECORD LOCK(S)" / "TABLE LOCK(S)" line in $text and returns a
+# list with one hashref per line.  Each hashref has the keys lock_type,
+# space_id, page_no, n_bits, index, db, table, txn_id, lock_mode, special,
+# insert_intention and waiting.  $complete, $debug and $full are accepted but
+# not used here.
+sub parse_innodb_record_locks {
+   my ( $text, $complete, $debug, $full ) = @_;
+   my @locks;
+
+   my @lock_lines = $text =~ m/(^(?:RECORD|TABLE) LOCKS?.*$)/gm;
+   for my $line ( @lock_lines ) {
+      my %info;
+      @info{ qw(lock_type space_id page_no n_bits index db table txn_id lock_mode) }
+         = $line
+         =~ m{^(RECORD|TABLE) LOCKS? (?:space id $d page no $d n bits $d index `?$n`? of )?table `$n(?:/|`\.`)$n` trx id $t lock.mode (\S+)}m;
+      ( $info{'special'} )
+         = $line =~ m/^(?:RECORD|TABLE) .*? locks (rec but not gap|gap before rec)/m;
+
+      # Plain flags: 1 when the phrase appears on the line, 0 otherwise.
+      $info{'insert_intention'}
+         = ( $line =~ m/^(?:RECORD|TABLE) .*? insert intention/m ) ? 1 : 0;
+      $info{'waiting'}
+         = ( $line =~ m/^(?:RECORD|TABLE) .*? waiting/m ) ? 1 : 0;
+
+      # Some things may not be in the text, so make sure they are not
+      # undef.
+      $info{$_} ||= 0  for qw(n_bits page_no space_id);
+      $info{$_} ||= "" for qw(index special);
+      push @locks, \%info;
+   }
+
+   return @locks;
+}
+
+# Parses the text describing a single transaction and returns a hashref.
+#
+# Arguments:
+#   $txn       text of one transaction
+#   $complete  passed through to parse_innodb_record_locks()
+#   $debug     when true, the raw text is kept under the 'fulltext' key
+#   $full      passed through to parse_innodb_record_locks()
+#
+# The returned hash always contains the keys listed in %stuff below; 'locks'
+# is added only when the text contains TABLE LOCK or RECORD LOCKS lines.
+# Values that could not be parsed default to 0 or '' (see the end of the
+# sub), and hostname falls back to ip.
+sub parse_tx_text {
+   my ( $txn, $complete, $debug, $full ) = @_;
+
+   my ( $txn_id, $txn_status, $active_secs, $proc_no, $os_thread_id )
+      = $txn
+      =~ m/^(?:---)?TRANSACTION $t, (\D*?)(?: $d sec)?, (?:process no $d, )?OS thread id $d/m;
+   my ( $thread_status, $thread_decl_inside )
+      = $txn
+      =~ m/OS thread id \d+(?: ([^,]+?))?(?:, thread declared inside InnoDB $d)?$/m;
+
+   # Parsing the line that begins 'MySQL thread id' is complicated.  The only
+   # thing always in the line is the thread and query id.  See function
+   # innobase_mysql_print_thd in InnoDB source file sql/ha_innodb.cc.
+   my ( $thread_line ) = $txn =~ m/^(MySQL thread id .*)$/m;
+   my ( $mysql_thread_id, $query_id, $hostname, $ip, $user, $query_status );
+
+   if ( $thread_line ) {
+      # These parts can always be gotten.
+      ( $mysql_thread_id, $query_id ) = $thread_line =~ m/^MySQL thread id $d, query id $d/m;
+
+      # If it's a master/slave thread, "Has (read|sent) all" may be the thread's
+      # proc_info.  In these cases, there won't be any host/ip/user info
+      ( $query_status ) = $thread_line =~ m/(Has (?:read|sent) all .*$)/m;
+      if ( defined($query_status) ) {
+         $user = 'system user';
+      }
+
+      # It may be the case that the query id is the last thing in the line.
+      elsif ( $thread_line =~ m/query id \d+ / ) {
+         # The IP address is the only non-word thing left, so it's the most
+         # useful marker for where I have to start guessing.
+         ( $hostname, $ip ) = $thread_line =~ m/query id \d+(?: ([A-Za-z]\S+))? $i/m;
+         if ( defined $ip ) {
+            # Quote the address so its dots match literally instead of
+            # matching any character.
+            ( $user, $query_status ) = $thread_line =~ m/\Q$ip\E $w(?: (.*))?$/;
+         }
+         else { # OK, there wasn't an IP address.
+            # There might not be ANYTHING except the query status.
+            ( $query_status ) = $thread_line =~ m/query id \d+ (.*)$/;
+            if ( $query_status !~ m/^\w+ing/ && !exists($is_proc_info{$query_status}) ) {
+               # The remaining tokens are, in order: hostname, user, query_status.
+               # It's basically impossible to know which is which.
+               ( $hostname, $user, $query_status ) = $thread_line
+                  =~ m/query id \d+(?: ([A-Za-z]\S+))?(?: $w(?: (.*))?)?$/m;
+            }
+            else {
+               $user = 'system user';
+            }
+         }
+      }
+   }
+
+   my ( $lock_wait_status, $lock_structs, $heap_size, $row_locks, $undo_log_entries )
+      = $txn
+      =~ m/^(?:(\D*) )?$d lock struct\(s\), heap size $d(?:, $d row lock\(s\))?(?:, undo log entries $d)?$/m;
+   my ( $lock_wait_time )
+      = $txn
+      =~ m/^------- TRX HAS BEEN WAITING $d SEC/m;
+
+   my $locks;
+   # If the transaction has locks, grab the locks.  The lock lines sit in the
+   # middle of the text, so '^' must match at every line start (/m) and must
+   # apply to both alternatives; without that, a transaction holding only
+   # table locks was never detected.
+   if ( $txn =~ m/^(?:TABLE LOCK|RECORD LOCKS)/m ) {
+      $locks = [parse_innodb_record_locks($txn, $complete, $debug, $full)];
+   }
+
+   my ( $tables_in_use, $tables_locked )
+      = $txn
+      =~ m/^mysql tables in use $d, locked $d$/m;
+   my ( $txn_doesnt_see_ge, $txn_sees_lt )
+      = $txn
+      =~ m/^Trx read view will not see trx with id >= $t, sees < $t$/m;
+   my $has_read_view = defined($txn_doesnt_see_ge);
+   # Only a certain number of bytes of the query text are included here, at least
+   # under some circumstances.  Some versions include 300, some 600.
+   my ( $query_text )
+      = $txn
+      =~ m{
+         ^MySQL\sthread\sid\s[^\n]+\n           # This comes before the query text
+         (.*?)                                  # The query text
+         (?=                                    # Followed by any of...
+            ^Trx\sread\sview
+            |^-------\sTRX\sHAS\sBEEN\sWAITING
+            |^TABLE\sLOCK
+            |^RECORD\sLOCKS\sspace\sid
+            |^(?:---)?TRANSACTION
+            |^\*\*\*\s\(\d\)
+            |\Z
+         )
+      }xms;
+   if ( $query_text ) {
+      $query_text =~ s/\s+$//;
+   }
+   else {
+      $query_text = '';
+   }
+
+   my %stuff = (
+      active_secs        => $active_secs,
+      has_read_view      => $has_read_view,
+      heap_size          => $heap_size,
+      hostname           => $hostname,
+      ip                 => $ip,
+      lock_structs       => $lock_structs,
+      lock_wait_status   => $lock_wait_status,
+      lock_wait_time     => $lock_wait_time,
+      mysql_thread_id    => $mysql_thread_id,
+      os_thread_id       => $os_thread_id,
+      proc_no            => $proc_no,
+      query_id           => $query_id,
+      query_status       => $query_status,
+      query_text         => $query_text,
+      row_locks          => $row_locks,
+      tables_in_use      => $tables_in_use,
+      tables_locked      => $tables_locked,
+      thread_decl_inside => $thread_decl_inside,
+      thread_status      => $thread_status,
+      txn_doesnt_see_ge  => $txn_doesnt_see_ge,
+      txn_id             => $txn_id,
+      txn_sees_lt        => $txn_sees_lt,
+      txn_status         => $txn_status,
+      undo_log_entries   => $undo_log_entries,
+      user               => $user,
+   );
+   $stuff{'fulltext'} = $txn if $debug;
+   $stuff{'locks'} = $locks if $locks;
+
+   # Some things may not be in the txn text, so make sure they are not
+   # undef.
+   $stuff{$_} ||= 0 for qw(active_secs heap_size lock_structs
+         tables_in_use undo_log_entries tables_locked has_read_view
+         thread_decl_inside lock_wait_time proc_no row_locks);
+   $stuff{$_} ||= "" for qw(thread_status txn_doesnt_see_ge
+         txn_sees_lt query_status ip query_text lock_wait_status user);
+   $stuff{'hostname'} ||= $stuff{'ip'};
+
+   return \%stuff;
+}
+
+# Parses the transactions section held in $section->{'fulltext'}.  Stores one
+# parse_tx_text() result per "---TRANSACTION" block under 'transactions' and
+# the general counters under trx_id_counter, purge_done_for, purge_undo_for,
+# history_list_len, num_lock_structs and is_truncated.  Returns nothing when
+# there is no section or no text, 1 otherwise.  The fulltext key is removed
+# unless $debug is set.
+sub parse_tx_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return if !( $section && $section->{'fulltext'} );
+   my $text = $section->{'fulltext'};
+   my $parsed = $section->{'transactions'} = [];
+
+   # Handle the individual transactions
+   for my $chunk ( $text =~ m/(---TRANSACTION \d.*?)(?=\n---TRANSACTION|$)/gs ) {
+      my $tx = parse_tx_text( $chunk, $complete, $debug, $full );
+      delete $tx->{'fulltext'} if !$debug;
+      push @{$parsed}, $tx;
+   }
+
+   # Handle the general info
+   ( $section->{'trx_id_counter'} )
+      = $text =~ m/^Trx id counter $t$/m;
+   ( $section->{'purge_done_for'}, $section->{'purge_undo_for'} )
+      = $text =~ m/^Purge done for trx's n:o < $t undo n:o < $t$/m;
+   ( $section->{'history_list_len'} ) # This isn't present in some 4.x versions
+      = $text =~ m/^History list length $d$/m;
+   ( $section->{'num_lock_structs'} )
+      = $text =~ m/^Total number of lock structs in row lock hash table $d$/m;
+   $section->{'is_truncated'}
+      = ( $text =~ m/^\.\.\. truncated\.\.\.$/m ) ? 1 : 0;
+
+   # Fill in things that might not be present
+   $section->{'history_list_len'} ||= 0;
+
+   delete $section->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# I've read the source for this section.
+# Parses the row operations section held in $section->{'fulltext'} and
+# stores the counters in $section: queries_inside, queries_in_queue,
+# read_views_open, n_reserved_extents, the main_thread_* values, num_rows_*
+# and the per-second rates.  Returns nothing when there is no section or no
+# text, 1 otherwise.  The fulltext key is removed unless $debug is set.
+sub parse_ro_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return if !( $section && $section->{'fulltext'} );
+   my $text = $section->{'fulltext'};
+
+   # Grab the info
+   @$section{ qw(queries_inside queries_in_queue) }
+      = $text =~ m/^$d queries inside InnoDB, $d queries in queue$/m;
+   @$section{ qw(read_views_open) }
+      = $text =~ m/^$d read views open inside InnoDB$/m;
+   @$section{ qw(n_reserved_extents) }
+      = $text =~ m/^$d tablespace extents now reserved for B-tree/m;
+   @$section{ qw(main_thread_proc_no main_thread_id main_thread_state) }
+      = $text =~ m/^Main thread (?:process no. $d, )?id $d, state: (.*)$/m;
+   @$section{ qw(num_rows_ins num_rows_upd num_rows_del num_rows_read) }
+      = $text =~ m/^Number of rows inserted $d, updated $d, deleted $d, read $d$/m;
+   @$section{ qw(ins_sec upd_sec del_sec read_sec) }
+      = $text =~ m#^$f inserts/s, $f updates/s, $f deletes/s, $f reads/s$#m;
+
+   # These may be missing from the text; default them to 0.
+   foreach my $key ( qw(main_thread_proc_no read_views_open n_reserved_extents) ) {
+      $section->{$key} ||= 0;
+   }
+
+   delete $section->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# Parses the log section held in $section->{'fulltext'} and stores
+# log_seq_no, log_flushed_to, last_chkp, pending_log_writes,
+# pending_chkp_writes, log_ios_done and log_ios_s in $section.  Returns
+# nothing when there is no section or no text, 1 otherwise.  The fulltext key
+# is removed unless $debug is set.
+sub parse_lg_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   # Guard on the text as well, like the other section parsers do; matching
+   # against an undefined value would only produce warnings and undef fields.
+   return unless $section && $section->{'fulltext'};
+   my $fulltext = $section->{'fulltext'};
+
+   # Grab the info
+   ( $section->{ 'log_seq_no' } )
+      = $fulltext =~ m/Log sequence number \s*(\d.*)$/m;
+   ( $section->{ 'log_flushed_to' } )
+      = $fulltext =~ m/Log flushed up to \s*(\d.*)$/m;
+   ( $section->{ 'last_chkp' } )
+      = $fulltext =~ m/Last checkpoint at \s*(\d.*)$/m;
+   @{$section}{ 'pending_log_writes', 'pending_chkp_writes' }
+      = $fulltext =~ m/$d pending log writes, $d pending chkp writes/;
+   @{$section}{ 'log_ios_done', 'log_ios_s' }
+      = $fulltext =~ m#$d log i/o's done, $f log i/o's/second#;
+
+   delete $section->{'fulltext'} unless $debug;
+   return 1;
+}
+
+# Parses the insert buffer / adaptive hash index section held in
+# $section->{'fulltext'} and stores size, free_list_len, seg_size, inserts,
+# merged_recs, merges, hash_table_size, used_cells, bufs_in_node_heap,
+# hash_searches_s and non_hash_searches_s in $section.  Returns nothing when
+# there is no section or no text, 1 otherwise.  The fulltext key is removed
+# unless $debug is set.
+sub parse_ib_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return if !( $section && $section->{'fulltext'} );
+   my $text = $section->{'fulltext'};
+
+   # Some servers will output ibuf information for tablespace 0, as though there
+   # might be many tablespaces with insert buffers.  (In practice I believe
+   # the source code shows there will only ever be one).  I have to parse both
+   # cases here, but I assume there will only be one.
+   @$section{ qw(size free_list_len seg_size) }
+      = $text =~ m/^Ibuf(?: for space 0)?: size $d, free list len $d, seg size $d,$/m;
+   @$section{ qw(inserts merged_recs merges) }
+      = $text =~ m/^$d inserts, $d merged recs, $d merges$/m;
+
+   # Adaptive hash index figures.
+   @$section{ qw(hash_table_size used_cells bufs_in_node_heap) }
+      = $text =~ m/^Hash table size $d, used cells $d, node heap has $d buffer\(s\)$/m;
+   @$section{ qw(hash_searches_s non_hash_searches_s) }
+      = $text =~ m{^$f hash searches/s, $f non-hash searches/s$}m;
+
+   delete $section->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# Parses the text of one "--Thread ... has waited at ..." entry and returns a
+# hashref describing the wait.  A mutex wait gets request_type 'M'; for an
+# RW-latch wait the request type is the single character before "-lock" in
+# the text.  $complete, $debug and $full are accepted but not used here.
+sub parse_wait_array {
+   my ( $text, $complete, $debug, $full ) = @_;
+   my %wait;
+
+   @wait{ qw(thread waited_at_filename waited_at_line waited_secs) }
+      = $text =~ m/^--Thread $d has waited at $fl for $f seconds/m;
+
+   # Depending on whether it's a SYNC_MUTEX,RW_LOCK_EX,RW_LOCK_SHARED,
+   # there will be different text output
+   if ( $text !~ m/^Mutex at/m ) {
+      # RW-latch wait.
+      @wait{ qw( request_type lock_mem_addr lock_cfile_name lock_cline) }
+         = $text =~ m/^(.)-lock on RW-latch at $h created in file $fl$/m;
+      @wait{ qw( writer_thread writer_lock_mode ) }
+         = $text =~ m/^a writer \(thread id $d\) has reserved it in mode  (.*)$/m;
+      @wait{ qw( num_readers waiters_flag )}
+         = $text =~ m/^number of readers $d, waiters flag $d$/m;
+      @wait{ qw(last_s_file_name last_s_line ) }
+         = $text =~ m/Last time read locked in file $fl$/m;
+      @wait{ qw(last_x_file_name last_x_line ) }
+         = $text =~ m/Last time write locked in file $fl$/m;
+   }
+   else {
+      # Mutex wait.
+      $wait{'request_type'} = 'M';
+      @wait{ qw( lock_mem_addr lock_cfile_name lock_cline lock_var) }
+         = $text =~ m/^Mutex at $h created file $fl, lock var $d$/m;
+      @wait{ qw( waiters_flag )}
+         = $text =~ m/^waiters flag $d$/m;
+   }
+
+   $wait{'cell_waiting'}   = ( $text =~ m/^wait has ended$/m ) ? 0 : 1;
+   $wait{'cell_event_set'} = ( $text =~ m/^wait is ending$/m ) ? 1 : 0;
+
+   # Because there are two code paths, some things won't get set.
+   foreach my $key ( qw(last_s_file_name last_x_file_name writer_lock_mode) ) {
+      $wait{$key} ||= '';
+   }
+   foreach my $key ( qw(num_readers lock_var last_s_line last_x_line writer_thread) ) {
+      $wait{$key} ||= 0;
+   }
+
+   return \%wait;
+}
+
+# Parses the semaphores section held in $section->{'fulltext'}.  Stores the
+# wait array counters, the mutex and RW-lock spin/OS wait counters, a 'waits'
+# arrayref with one parse_wait_array() result per waiting thread, and
+# 'wait_array_size' (the number of such waits).  Returns 0 when there is no
+# section or no text, 1 otherwise.  The fulltext key is removed unless $debug
+# is set.
+sub parse_sm_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return 0 if !( $section && $section->{'fulltext'} );
+   my $text = $section->{'fulltext'};
+
+   # Grab the info
+   @$section{ qw(reservation_count signal_count) }
+      = $text =~ m/^OS WAIT ARRAY INFO: reservation count $d, signal count $d$/m;
+   @$section{ qw(mutex_spin_waits mutex_spin_rounds mutex_os_waits) }
+      = $text =~ m/^Mutex spin waits $d, rounds $d, OS waits $d$/m;
+   @$section{ qw(rw_shared_spins rw_shared_os_waits rw_excl_spins rw_excl_os_waits) }
+      = $text =~ m/^RW-shared spins $d, OS waits $d; RW-excl spins $d, OS waits $d$/m;
+
+   # Look for info on waits.
+   my @wait_texts = $text =~ m/^(--Thread.*?)^(?=Mutex spin|--Thread)/gms;
+   my @parsed_waits;
+   foreach my $wait_text ( @wait_texts ) {
+      push @parsed_waits, parse_wait_array( $wait_text, $complete, $debug );
+   }
+   $section->{'waits'} = \@parsed_waits;
+   $section->{'wait_array_size'} = scalar(@wait_texts);
+
+   delete $section->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# I've read the source for this section.
+# Parses the buffer pool and memory section held in $section->{'fulltext'}
+# and stores the memory, page, hit-rate and pending read/write figures in
+# $section.  'writes_pending' is the sum of the three writes_pending_*
+# values.  Returns nothing when there is no section or no text, 1 otherwise.
+# The fulltext key is removed unless $debug is set.
+sub parse_bp_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return if !( $section && $section->{'fulltext'} );
+   my $text = $section->{'fulltext'};
+
+   # Grab the info
+   @$section{ qw(total_mem_alloc add_pool_alloc) }
+      = $text =~ m/^Total memory allocated $d; in additional pool allocated $d$/m;
+   ( $section->{'dict_mem_alloc'} ) = $text =~ m/Dictionary memory allocated $d/;
+   ( $section->{'awe_mem_alloc'} )  = $text =~ m/$d MB of AWE memory/;
+   ( $section->{'buf_pool_size'} )  = $text =~ m/^Buffer pool size\s*$d$/m;
+   ( $section->{'buf_free'} )       = $text =~ m/^Free buffers\s*$d$/m;
+   ( $section->{'pages_total'} )    = $text =~ m/^Database pages\s*$d$/m;
+   ( $section->{'pages_modified'} ) = $text =~ m/^Modified db pages\s*$d$/m;
+   @$section{ qw(pages_read pages_created pages_written) }
+      = $text =~ m/^Pages read $d, created $d, written $d$/m;
+   @$section{ qw(page_reads_sec page_creates_sec page_writes_sec) }
+      = $text =~ m{^$f reads/s, $f creates/s, $f writes/s$}m;
+   @$section{ qw(buf_pool_hits buf_pool_reads) }
+      = $text =~ m{Buffer pool hit rate $d / $d$}m;
+
+   # Without any page gets there is no hit rate to report.
+   if ( $text !~ m/^No buffer pool page gets since the last printout$/m ) {
+      ( $section->{'buf_pool_hit_rate'} )
+         = $text =~ m{Buffer pool hit rate (\d+ / \d+)$}m;
+   }
+   else {
+      @$section{ qw(buf_pool_hits buf_pool_reads) } = (0, 0);
+      $section->{'buf_pool_hit_rate'} = '--';
+   }
+
+   ( $section->{'reads_pending'} ) = $text =~ m/^Pending reads $d/m;
+   my @pending_write_keys
+      = qw(writes_pending_lru writes_pending_flush_list writes_pending_single_page);
+   @$section{ @pending_write_keys }
+      = $text =~ m/^Pending writes: LRU $d, flush list $d, single page $d$/m;
+
+   # Default the values that may be absent so the sum below is well defined.
+   foreach my $key ( @pending_write_keys, qw(awe_mem_alloc dict_mem_alloc) ) {
+      $section->{$key} ||= 0;
+   }
+   $section->{'writes_pending'}
+      = List::Util::sum( map { $section->{$_} } @pending_write_keys );
+
+   delete $section->{'fulltext'} if !$debug;
+   return 1;
+}
+
+# I've read the source for this.
+# Parses the file I/O section held in $section->{'fulltext'}.
+#
+# Stores in $section:
+#   threads   hashref keyed by I/O thread number; each value has the keys
+#             thread, state, purpose and event_set (1 when the line ends in
+#             " ev set", else 0)
+#   the pending aio/ibuf/log/sync counters, the flush type and pending
+#   flushes, the OS file read/write/fsync totals and their per-second rates,
+#   and pending_preads / pending_pwrites (0 when the line is absent).
+#
+# Returns nothing when there is no section or no text, 1 otherwise.  The
+# fulltext key is removed unless $debug is set.
+sub parse_io_section {
+   my ( $section, $complete, $debug, $full ) = @_;
+   return unless $section && $section->{'fulltext'};
+   my $fulltext = $section->{'fulltext'};
+   $section->{'threads'} = {};
+
+   # Grab the I/O thread info
+   my @threads = $fulltext =~ m<^(I/O thread \d+ .*)$>gm;
+   foreach my $thread (@threads) {
+      # The trailing " ev set" must be a capturing group; as a non-capturing
+      # group it left $event_set undefined, so event_set was always 0.
+      my ( $tid, $state, $purpose, $event_set )
+         = $thread =~ m{I/O thread $d state: (.+?) \((.*)\)( ev set)?$}m;
+      if ( defined $tid ) {
+         $section->{'threads'}->{$tid} = {
+            thread    => $tid,
+            state     => $state,
+            purpose   => $purpose,
+            event_set => $event_set ? 1 : 0,
+         };
+      }
+   }
+
+   # Grab the reads/writes/flushes info
+   @{$section}{ 'pending_normal_aio_reads', 'pending_aio_writes' }
+      = $fulltext =~ m/^Pending normal aio reads: $d, aio writes: $d,$/m;
+   @{$section}{ 'pending_ibuf_aio_reads', 'pending_log_ios', 'pending_sync_ios' }
+      = $fulltext =~ m{^ ibuf aio reads: $d, log i/o's: $d, sync i/o's: $d$}m;
+   @{$section}{ 'flush_type', 'pending_log_flushes', 'pending_buffer_pool_flushes' }
+      = $fulltext =~ m/^Pending flushes \($w\) log: $d; buffer pool: $d$/m;
+   @{$section}{ 'os_file_reads', 'os_file_writes', 'os_fsyncs' }
+      = $fulltext =~ m/^$d OS file reads, $d OS file writes, $d OS fsyncs$/m;
+   @{$section}{ 'reads_s', 'avg_bytes_s', 'writes_s', 'fsyncs_s' }
+      = $fulltext =~ m{^$f reads/s, $d avg bytes/read, $f writes/s, $f fsyncs/s$}m;
+   @{$section}{ 'pending_preads', 'pending_pwrites' }
+      = $fulltext =~ m/$d pending preads, $d pending pwrites$/m;
+   # The pread/pwrite line is not always present; report zeros in that case.
+   @{$section}{ 'pending_preads', 'pending_pwrites' } = (0, 0)
+      unless defined($section->{'pending_preads'});
+
+   delete $section->{'fulltext'} unless $debug;
+   return 1;
+}
+
+# Reports $msg: dies with it when $debug is true, otherwise warns and
+# returns 1.
+sub _debug {
+   my ( $debug, $msg ) = @_;
+   die $msg if $debug;
+   warn $msg;
+   return 1;
+}
+
+1;
+
+# end_of_package
+# ############################################################################
+# Perldoc section.  I put this last as per the Dog book.
+# ############################################################################
+=pod
+
+=head1 NAME
+
+InnoDBParser - Parse InnoDB monitor text.
+
+=head1 DESCRIPTION
+
+InnoDBParser tries to parse the output of the InnoDB monitor.  One way to get
+this output is to connect to a MySQL server and issue the command SHOW ENGINE
+INNODB STATUS (omit 'ENGINE' on earlier versions of MySQL).  The goal is to
+turn text into data that something else (e.g. innotop) can use.
+
+The output comes from all over, but the place to start in the source is
+innobase/srv/srv0srv.c.
+
+=head1 SYNOPSIS
+
+   use InnoDBParser;
+   use DBI;
+
+   # Get the status text.
+   my $dbh = DBI->connect(
+      "DBI:mysql:test;host=localhost",
+      'user',
+      'password'
+   );
+   my $query = 'SHOW /*!5 ENGINE */ INNODB STATUS';
+   my $text  = $dbh->selectcol_arrayref($query)->[0];
+
+   # 1 or 0
+   my $debug = 1;
+
+   # Choose sections of the monitor text you want.  Possible values:
+   # TRANSACTIONS                          => tx
+   # BUFFER POOL AND MEMORY                => bp
+   # SEMAPHORES                            => sm
+   # LOG                                   => lg
+   # ROW OPERATIONS                        => ro
+   # INSERT BUFFER AND ADAPTIVE HASH INDEX => ib
+   # FILE I/O                              => io
+   # LATEST DETECTED DEADLOCK              => dl
+   # LATEST FOREIGN KEY ERROR              => fk
+
+   my $required_sections = {
+      tx => 1,
+   };
+
+   # Parse the status text.
+   my $parser = InnoDBParser->new;
+   $innodb_status = $parser->parse_status_text(
+      $text,
+      $debug,
+      # Omit the following parameter to get all sections.
+      $required_sections,
+   );
+
+=head1 COPYRIGHT, LICENSE AND WARRANTY
+
+This package is copyright (c) 2006 Baron Schwartz, baron at xaprb dot com.
+Feedback and improvements are gratefully received.
+
+THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA  02111-1307  USA
+
+=head1 AUTHOR
+
+Baron Schwartz, baron at xaprb dot com.
+
+=head1 BUGS
+
+None known, but I bet there are some.  The InnoDB monitor text wasn't really
+designed to be parsable.
+
+=head1 SEE ALSO
+
+innotop - a program that can format the parsed status information for humans
+to read and enjoy.
+
+=cut
diff --git a/storage/xtradb/build/debian/additions/innotop/changelog.innotop b/storage/xtradb/build/debian/additions/innotop/changelog.innotop
new file mode 100644
index 00000000000..baff706e235
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/changelog.innotop
@@ -0,0 +1,318 @@
+Changelog for innotop and InnoDBParser:
+
+2007-11-09: version 1.6.0
+
+   * S mode crashed on non-numeric values.
+   * New user-defined columns crashed upon restart.
+   * Added --color option to control terminal coloring.
+
+2007-09-18: version 1.5.2
+
+   * Added the ability to monitor InnoDB status from a file.
+   * Changed W mode to L mode; it monitors all locks, not just lock waits.
+
+2007-09-16: version 1.5.1
+
+   * Added C (Command Summary) mode.
+   * Fixed a bug in the 'avg' aggregate function.
+
+2007-09-10: version 1.5.0
+
+   Changes:
+   * Added plugin functionality.
+   * Added group-by functionality.
+   * Moved the configuration file to a directory.
+   * Enhanced filtering and sorting on pivoted tables.
+   * Many small bug fixes.
+
+2007-07-16: version 1.4.3
+
+   Changes:
+   * Added standard --version command-line option
+   * Changed colors to cyan instead of blue; more visible on dark terminals.
+   * Added information to the filter-choosing dialog.
+   * Added column auto-completion when entering a filter expression.
+   * Changed Term::ReadKey from optional to mandatory.
+   * Clarified username in password prompting.
+   * Ten thousand words of documentation!
+
+   Bugs fixed:
+   * innotop crashed in W mode when InnoDB status data was truncated.
+   * innotop didn't display errors in tables if debug was enabled.
+   * The colored() subroutine wasn't being created in non-interactive mode.
+   * Don't prompt to save password except the first time.
+
+2007-05-03: version 1.4.2
+
+   This version contains all changes to the trunk until revision 239; some
+   changes in revisions 240:250 are included.
+
+   MAJOR CHANGES:
+
+   * Quick-filters to easily filter any column in any display
+   * Compatibility with MySQL 3.23 through 6.0
+   * Improved error handling when a server is down, permissions denied, etc
+   * Use additional SHOW INNODB STATUS information in 5.1.x
+   * Make all modes use tables consistently, so they can all be edited,
+     filtered, colored and sorted consistently
+   * Combine V, G and S modes into S mode, with v, g, and s hot-keys
+   * Let DBD driver read MySQL option files; permit connections without
+     user/pass/etc
+   * Compile SQL-like expressions into Perl subroutines; eliminate need to
+     know Perl
+   * Do not save all config data to config file, only save user's customizations
+   * Rewritten and improved command-line option handling
+   * Added --count, --delay, and other command-line options to support
+     run-and-exit operation
+   * Improve built-in variable sets
+   * Improve help screen with three-part balanced-column layout
+   * Simplify table-editor and improve hotkey support
+   * Require Perl to have high-resolution time support (Time::HiRes)
+   * Help the user choose a query to analyze or kill
+   * Enable EXPLAIN, show-full-query in T mode just like Q mode
+   * Let data-extraction access current, previous and incremental data sets
+     all at once
+
+   MINOR CHANGES:
+
+   * Column stabilizing for Q mode
+   * New color rules for T, Q, W modes
+   * Apply slave I/O filter to Q mode
+   * Improve detection of server version and other meta-data
+   * Make connection timeout a config variable
+   * Improve cross-version-compatible SQL syntax
+   * Get some information from the DBD driver instead of asking MySQL for it
+   * Improved error messages
+   * Improve server group creation/editing
+   * Improve connection/thread killing
+   * Fix broken key bindings and restore previously mapped hot-keys for
+     choosing columns
+   * Some documentation updates (but not nearly enough)
+   * Allow the user to specify graphing char in S mode (formerly G mode)
+   * Allow easy switching between variable sets in S mode
+   * Bind 'n' key globally to choose the 'next' server connection
+   * Bind '%' key globally to filter displayed tables
+   * Allow aligning columns on the decimal place for easy readability
+   * Add hide_hdr config variable to hide column headers in tables
+   * Add a feature to smartly run PURGE MASTER LOGS in Replication mode
+   * Enable debug mode as a globally configurable variable
+   * Improve error messages when an expression or filter doesn't compile or has
+     a run-time error; die on error when debug is enabled
+   * Allow user-configurable delays after executing SQL (to let the server
+     settle down before taking another measurement)
+   * Add an expression to show how long until a transaction is finished
+   * Add skip_innodb as a global config variable
+   * Add '%' after percentages to help disambiguate (user-configurable)
+   * Add column to M mode to help see how fast slave is catching up to master
+
+   BUG FIXES:
+
+   * T and W modes had wrong value for wait_status column
+   * Error tracking on connections didn't reset when the connection recovered
+   * wait_timeout on connections couldn't be set before MySQL 4.0.3
+   * There was a crash on 3.23 when wiping deadlocks
+   * Lettercase changes in some result sets (SHOW MASTER/SLAVE STATUS) between
+     MySQL versions crashed innotop
+   * Inactive connections crashed innotop upon access to DBD driver
+   * set_precision did not respect user defaults for number of digits
+   * --inc command-line option could not be negated
+   * InnoDB status parsing was not always parsing all needed information
+   * S mode (formerly G mode) could crash trying to divide non-numeric data
+   * M table didn't show Slave_open_temp_tables variable; incorrect lettercase
+   * DBD drivers with broken AutoCommit would crash innotop
+   * Some key bindings had incorrect labels
+   * Some config-file loading routines could load data for things that didn't
+     exist
+   * Headers printed too often in S mode
+   * High-resolution time was not used even when the user had it
+   * Non-interactive mode printed blank lines sometimes
+   * Q-mode header and statusbar showed different QPS numbers
+   * Formulas for key-cache and query-cache hit ratios were wrong
+   * Mac OS "Darwin" machines were mis-identified as Microsoft Windows
+   * Some multiplications crashed when given undefined input
+   * The commify transformation did not check its input and could crash
+   * Specifying an invalid mode on the command line or config file could crash
+     innotop
+
+2007-03-29: version 1.4.1
+
+   * More tweaks to display of connection errors.
+   * Fixed a problem with skip-innodb in MySQL 5.1.
+   * Fix a bug with dead connections in single-connection mode.
+   * Fix a regex to allow parsing more data from truncated deadlocks.
+   * Don't load active cxns from the config file if the cxn isn't defined.
+
+2007-03-03: version 1.4.0
+
+   * Further tweak error handling and display of connection errors
+   * More centralization of querying
+   * Fix forking so it doesn't kill all database connections
+   * Allow user to run innotop without permissions for GLOBAL variables and status
+
+2007-02-11: version 1.3.6
+
+   * Handle some connection failures so innotop doesn't crash because of one server.
+   * Enable incremental display in more modes.
+   * Tweaks to colorizing, color editor, and default color rules.
+   * Tweaks to default sorting rules.
+   * Use prepared statements for efficiency.
+   * Bug fixes and code cleanups.
+   * Data storage is keyed on clock ticks now.
+
+2007-02-03: version 1.3.5
+
+   * Bug fixes.
+   * More tools for editing configuration from within innotop.
+   * Filters and transformations are constrained to valid values.
+   * Support for colorizing rows.
+   * Sorting by multiple columns.
+   * Compress headers when display is very wide.
+   * Stabilize and limit column widths.
+   * Check config file formats when upgrading so upgrades go smoothly.
+   * Make D mode handle many connections at once.
+   * Extract simple expressions from data sets in column src property.
+     This makes innotop more awk-ish.
+
+2007-01-16: version 1.3
+
+   * Readline support.
+   * Can be used unattended, or in a pipe-and-filter mode
+     where it outputs tab-separated data to standard output.
+   * You can specify a config file on the command line.
+     Config files can be marked read-only.
+   * Monitor multiple servers simultaneously.
+   * Server groups to help manage many servers conveniently.
+   * Monitor master/slave status, and control slaves.
+   * Columns can have user-defined expressions as their data sources.
+   * Better configuration tools.
+   * InnoDB status information is merged into SHOW VARIABLES and
+     SHOW STATUS information, so you can access it all together.
+   * High-precision time support in more places.
+   * Lots of tweaks to make things display more readably and compactly.
+   * Column transformations and filters.
+
+2007-01-16: version 1.0.1
+   * NOTE: innotop is now hosted at Sourceforge, in Subversion not CVS.
+     The new project homepage is http://sourceforge.net/projects/innotop/
+   * Tweak default T/Q mode sort columns to match what people expect.
+   * Fix broken InnoDBParser.pm documentation (and hence man page).
+
+2007-01-06: version 1.0
+   * NOTE: innotop is now hosted at Sourceforge, in Subversion not CVS.
+     The new project homepage is http://sourceforge.net/projects/innotop/
+   * Prevent control characters from freaking terminal out.
+   * Set timeout to keep busy servers from closing connection.
+   * There is only one InnoDB insert buffer.
+   * Make licenses clear and consistent.
+
+2006-11-14: innotop 0.1.160, InnoDBParser version 1.69
+   * Support for ANSI color on Microsoft Windows (more readable, compact
+     display; thanks Gisbert W. Selke).
+   * Better handling of $ENV{HOME} on Windows.
+   * Added a LICENSE file to the package as per Gentoo bug:
+     http://bugs.gentoo.org/show_bug.cgi?id=147600
+
+2006-11-11: innotop 0.1.157, InnoDBParser version 1.69
+   * Add Microsoft Windows support.
+
+2006-10-19: innotop 0.1.154, InnoDBParser version 1.69
+   * Add O (Open Tables) mode
+   * Add some more checks to handle incomplete InnoDB status information
+
+2006-09-30: innotop 0.1.152, InnoDBParser version 1.69
+   * Figured out what was wrong with package $VERSION variable: it wasn't
+     after the package declaration!
+
+2006-09-28: innotop 0.1.152, InnoDBParser version 1.67
+   * Make more efforts towards crash-resistance and tolerance of completely
+     messed-up inputs.  If innotop itself is broken, it is now much harder to
+     tell, because it just keeps on running without complaining.
+   * Fix a small bug parsing out some information and displaying it.
+
+2006-09-05: innotop 0.1.149, InnoDBParser version 1.64
+   * Try to find and eliminate any parsing code that assumes pattern matches
+     will succeed.
+
+2006-09-05: innotop 0.1.149, InnoDBParser version 1.62
+   * Make innotop crash-resistant, so I can declare it STABLE finally.
+   * Instead of using SQL conditional comments, detect MySQL version.
+
+2006-08-22: innotop 0.1.147, InnoDBParser version 1.60
+   * Fix some innotop bugs with undefined values, bad formatting etc.
+
+2006-08-19: innotop 0.1.146, InnoDBParser version 1.60
+   * Make innotop handle some unexpected NULL values in Q mode.
+   * Add OS wait information to W mode, so it is now "everything that waits."
+   * Center section captions better.
+   * Make R mode more readable and compact.
+   * Make InnoDBParser parse lock waits even when they've been waiting 0 secs.
+
+2006-08-12: innotop 0.1.139, InnoDBParser version 1.59
+   * Add more documentation
+   * Tweak V mode to show more info in less space.
+   * Fix a bug in G mode.
+
+2006-08-10: innotop 0.1.132, InnoDBParser version 1.58
+   * Handle yet more types of FK error... it will never end!
+   * Handle some special cases when DEADLOCK info truncated
+   * Add a bit more FK info to F mode in innotop
+   * More tests added to the test suite
+
+2006-08-07: innotop 0.1.131, InnoDBParser version 1.55
+   * Fix another issue with configuration
+   * Handle another type of FK error
+
+2006-08-03: innotop 0.1.130, InnoDBParser version 1.54
+   * Fix an issue loading config file
+   * Add heap_no to 'D' (InnoDB Deadlock) mode to ease deadlock debugging.
+
+2006-08-02: innotop 0.1.128, InnoDBParser version 1.54
+   * Parse lock wait information from the TRANSACTION section.
+   * Even more OS-specific parsing... pain in the butt...
+   * Add 'W' (InnoDB Lock Wait) mode.
+   * Fix some minor display issues with statusbar.
+
+2006-08-02: innotop 0.1.125, InnoDBParser version 1.50
+   * Don't try to get references to Perl built-in functions like time()
+   * Handle more OS-specific variations of InnoDB status text
+   * Add some more information to various places in innotop
+
+2006-08-01: innotop 0.1.123, InnoDBParser version 1.47
+
+   * Enhance S and G modes: clear screen and re-print headers
+   * Don't crash when deadlock data is truncated
+   * Make Analyze mode say how to get back to whatever you came from
+   * Display 'nothing to display' when there is nothing
+   * Add ability to read InnoDB status text from a file (mostly helps test)
+   * Add table of Wait Array Information in Row Op/Semaphore mode
+   * Add table of lock information in InnoDB deadlock mode
+   * Ensure new features in upgrades don't get masked by existing config files
+   * Tweak default column choices for T mode
+   * Enhance foreign key parsing
+   * Enhance physical record and data tuple parsing
+   * Enhance lock parsing (handle old-style and new-style formats)
+
+2006-07-24: innotop 0.1.112, InnoDBParser version 1.36
+
+   * InnoDBParser enhancements for FK error messages.
+   * A fix to innotop to prevent it from crashing while trying to display a FK
+     error message.
+   * Some minor cosmetic changes to number formatting in innotop.
+
+2006-07-22: innotop 0.1.106, InnoDBParser version 1.35
+
+   * InnoDBParser is much more complete and accurate.
+   * Tons of bug fixes.
+   * Add partitions to EXPLAIN mode.
+   * Enhance Q mode header, add T mode header.
+   * Share some configuration variables across modes.
+   * Add formatted time columns to Q, T modes.
+   * Add command-line argument parsing.
+   * Turn off echo when asking for password.
+   * Add option to specify port when connecting.
+   * Let display-optimized-query display multiple notes.
+   * Lots of small improvements, such as showing more info in statusbar.
+
+2006-07-02: innotop 0.1.74, InnoDBParser version 1.24
+
+   * Initial release for public consumption.
diff --git a/storage/xtradb/build/debian/additions/innotop/innotop b/storage/xtradb/build/debian/additions/innotop/innotop
new file mode 100644
index 00000000000..e2bfc1bd965
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/innotop
@@ -0,0 +1,9485 @@
+#!/usr/bin/perl
+
+# vim: tw=160:nowrap:expandtab:tabstop=3:shiftwidth=3:softtabstop=3
+
+use strict;
+use warnings FATAL => 'all';
+use sigtrap qw(handler finish untrapped normal-signals);
+
+use Data::Dumper;
+use DBI;
+use English qw(-no_match_vars);
+use File::Basename qw(dirname);
+use Getopt::Long;
+use List::Util qw(max min maxstr sum);
+use InnoDBParser;
+use POSIX qw(ceil);
+use Time::HiRes qw(time sleep);
+use Term::ReadKey qw(ReadMode ReadKey);
+
+# Version, license and warranty information. {{{1
+# ###########################################################################
+our $VERSION = '1.6.0';
+our $SVN_REV = sprintf("%d", q$Revision: 383 $ =~ m/(\d+)/g);
+our $SVN_URL = sprintf("%s", q$URL: https://innotop.svn.sourceforge.net/svnroot/innotop/trunk/innotop $ =~ m$svnroot/innotop/(\S+)$g);
+
+my $innotop_license = <<"LICENSE";
+
+This is innotop version $VERSION, a MySQL and InnoDB monitor.
+
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+
+THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA  02111-1307  USA.
+LICENSE
+
+# Configuration information and global setup {{{1
+# ###########################################################################
+
+# Really, really, super-global variables.
+my @config_versions = (
+   "000-000-000", "001-003-000", # config file was one big name-value hash.
+   "001-003-000", "001-004-002", # config file contained non-user-defined stuff.
+);
+
+my $clear_screen_sub;
+
+# This defines expected properties and defaults for the column definitions that
+# eventually end up in tbl_meta.
+my %col_props = (
+   hdr     => '',
+   just    => '-',
+   dec     => 0,     # Whether to align the column on the decimal point
+   num     => 0,
+   label   => '',
+   user    => 0,
+   src     => '',
+   tbl     => '',    # Helps when writing/reading custom columns in config files
+   minw    => 0,
+   maxw    => 0,
+   trans   => [],
+   agg     => 'first',  # Aggregate function
+   aggonly => 0,        # Whether to show only when tbl_meta->{aggregate} is true
+);
+
+# Actual DBI connections to MySQL servers.
+my %dbhs;
+
+# Command-line parameters {{{2
+# ###########################################################################
+
+my @opt_spec = (
+   { s => 'help',       d => 'Show this help message' },
+   { s => 'color|C!',   d => 'Use terminal coloring (default)',   c => 'color' },
+   { s => 'config|c=s', d => 'Config file to read' },
+   { s => 'nonint|n',   d => 'Non-interactive, output tab-separated fields' },
+   { s => 'count=i',    d => 'Number of updates before exiting' },
+   { s => 'delay|d=f',  d => 'Delay between updates in seconds',  c => 'interval' },
+   { s => 'mode|m=s',   d => 'Operating mode to start in',        c => 'mode' },
+   { s => 'inc|i!',     d => 'Measure incremental differences',   c => 'status_inc' },
+   { s => 'version',    d => 'Output version information and exit' },
+);
+
+# This is the container for the command-line options' values to be stored in
+# after processing.  Initial values are defaults.
+my %opts = (
+   n => !( -t STDIN && -t STDOUT ), # If in/out aren't both terminals, we're non-interactive
+);
+# Post-process each spec: derive its %opts key, long name, short name and negatable flag.
+my %opt_seen;
+foreach my $spec ( @opt_spec ) {
+   my ( $long, $short ) = $spec->{s} =~ m/^(\w+)(?:\|([^!+=]*))?/;
+   $spec->{k} = $short || $long; # Key into %opts: the short name if given, else the long name
+   $spec->{l} = $long;
+   $spec->{t} = $short;
+   $spec->{n} = $spec->{s} =~ m/!/; # True when the spec contains '!'; help then shows a [no] prefix
+   $opts{$spec->{k}} = undef unless defined $opts{$spec->{k}};
+   die "Duplicate option $spec->{k}" if $opt_seen{$spec->{k}}++;
+}
+
+Getopt::Long::Configure('no_ignore_case', 'bundling');
+GetOptions( map { $_->{s} => \$opts{$_->{k}} } @opt_spec) or $opts{help} = 1;
+
+if ( $opts{version} ) {
+   print "innotop  Ver $VERSION Changeset $SVN_REV from $SVN_URL\n";
+   exit(0);
+}
+
+if ( $opts{'help'} ) {
+   print "Usage: innotop <options> <innodb-status-file>\n\n";
+   my $maxw = max(map { length($_->{l}) + ($_->{n} ? 4 : 0)} @opt_spec);
+   foreach my $spec ( sort { $a->{l} cmp $b->{l} } @opt_spec ) {
+      my $long  = $spec->{n} ? "[no]$spec->{l}" : $spec->{l};
+      my $short = $spec->{t} ? "-$spec->{t}" : '';
+      printf("  --%-${maxw}s %-4s %s\n", $long, $short, $spec->{d});
+   }
+   print <<USAGE;
+
+innotop is a MySQL and InnoDB transaction/status monitor, like 'top' for
+MySQL.  It displays queries, InnoDB transactions, lock waits, deadlocks,
+foreign key errors, open tables, replication status, buffer information,
+row operations, logs, I/O operations, load graph, and more.  You can
+monitor many servers at once with innotop. 
+
+USAGE
+   exit(1);
+}
+
+# Meta-data (table definitions etc) {{{2
+# ###########################################################################
+
+# Expressions {{{3
+# Convenience so I can copy/paste these in several places...
+# ###########################################################################
+my %exprs = (
+   Host              => q{my $host = host || hostname || ''; ($host) = $host =~ m/^((?:[\d.]+(?=:))|(?:[a-zA-Z]\w+))/; return $host || ''},
+   Port              => q{my ($p) = host =~ m/:(.*)$/; return $p || 0},
+   OldVersions       => q{dulint_to_int(IB_tx_trx_id_counter) - dulint_to_int(IB_tx_purge_done_for)},
+   MaxTxnTime        => q/max(map{ $_->{active_secs} } @{ IB_tx_transactions }) || 0/,
+   NumTxns           => q{scalar @{ IB_tx_transactions } },
+   DirtyBufs         => q{ $cur->{IB_bp_pages_modified} / ($cur->{IB_bp_buf_pool_size} || 1) },
+   BufPoolFill       => q{ $cur->{IB_bp_pages_total} / ($cur->{IB_bp_buf_pool_size} || 1) },
+   ServerLoad        => q{ $cur->{Threads_connected}/(Questions||1)/Uptime_hires },
+   TxnTimeRemain     => q{ defined undo_log_entries && defined $pre->{undo_log_entries} && undo_log_entries < $pre->{undo_log_entries} ? undo_log_entries / (($pre->{undo_log_entries} - undo_log_entries)/((active_secs-$pre->{active_secs})||1))||1 : 0},
+   SlaveCatchupRate  => ' defined $cur->{seconds_behind_master} && defined $pre->{seconds_behind_master} && $cur->{seconds_behind_master} < $pre->{seconds_behind_master} ? ($pre->{seconds_behind_master}-$cur->{seconds_behind_master})/($cur->{Uptime_hires}-$pre->{Uptime_hires}) : 0',
+   QcacheHitRatio    => q{(Qcache_hits||0)/(((Com_select||0)+(Qcache_hits||0))||1)},
+);
+
+# ###########################################################################
+# Column definitions {{{3
+# Defines every column in every table. A named column has the following
+# properties:
+#    * hdr    Column header/title
+#    * label  Documentation for humans.
+#    * num    Whether it's numeric (for sorting).
+#    * just   Alignment; generated from num, user-overridable in tbl_meta
+#    * minw, maxw Auto-generated, user-overridable.
+# Values from this hash are just copied to tbl_meta, which is where everything
+# else in the program should read from.
+# ###########################################################################
+
+my %columns = (
+   active_secs                 => { hdr => 'SecsActive',          num => 1, label => 'Seconds transaction has been active', },
+   add_pool_alloc              => { hdr => 'Add\'l Pool',         num => 1, label => 'Additonal pool allocated' },
+   attempted_op                => { hdr => 'Action',              num => 0, label => 'The action that caused the error' },
+   awe_mem_alloc               => { hdr => 'AWE Memory',          num => 1, label => '[Windows] AWE memory allocated' },
+   binlog_cache_overflow       => { hdr => 'Binlog Cache',        num => 1, label => 'Transactions too big for binlog cache that went to disk' },
+   binlog_do_db                => { hdr => 'Binlog Do DB',        num => 0, label => 'binlog-do-db setting' },
+   binlog_ignore_db            => { hdr => 'Binlog Ignore DB',    num => 0, label => 'binlog-ignore-db setting' },
+   bps_in                      => { hdr => 'BpsIn',               num => 1, label => 'Bytes per second received by the server', },
+   bps_out                     => { hdr => 'BpsOut',              num => 1, label => 'Bytes per second sent by the server', },
+   buf_free                    => { hdr => 'Free Bufs',           num => 1, label => 'Buffers free in the buffer pool' },
+   buf_pool_hit_rate           => { hdr => 'Hit Rate',            num => 0, label => 'Buffer pool hit rate' },
+   buf_pool_hits               => { hdr => 'Hits',                num => 1, label => 'Buffer pool hits' },
+   buf_pool_reads              => { hdr => 'Reads',               num => 1, label => 'Buffer pool reads' },
+   buf_pool_size               => { hdr => 'Size',                num => 1, label => 'Buffer pool size' },
+   bufs_in_node_heap           => { hdr => 'Node Heap Bufs',      num => 1, label => 'Buffers in buffer pool node heap' },
+   bytes_behind_master         => { hdr => 'ByteLag',             num => 1, label => 'Bytes the slave lags the master in binlog' },
+   cell_event_set              => { hdr => 'Ending?',             num => 1, label => 'Whether the cell event is set' },
+   cell_waiting                => { hdr => 'Waiting?',            num => 1, label => 'Whether the cell is waiting' },
+   child_db                    => { hdr => 'Child DB',            num => 0, label => 'The database of the child table' },
+   child_index                 => { hdr => 'Child Index',         num => 0, label => 'The index in the child table' },
+   child_table                 => { hdr => 'Child Table',         num => 0, label => 'The child table' },
+   cmd                         => { hdr => 'Cmd',                 num => 0, label => 'Type of command being executed', },
+   cnt                         => { hdr => 'Cnt',                 num => 0, label => 'Count', agg => 'count', aggonly => 1 },
+   connect_retry               => { hdr => 'Connect Retry',       num => 1, label => 'Slave connect-retry timeout' },
+   cxn                         => { hdr => 'CXN',                 num => 0, label => 'Connection from which the data came', },
+   db                          => { hdr => 'DB',                  num => 0, label => 'Current database', },
+   dict_mem_alloc              => { hdr => 'Dict Mem',            num => 1, label => 'Dictionary memory allocated' },
+   dirty_bufs                  => { hdr => 'Dirty Buf',           num => 1, label => 'Dirty buffer pool pages' },
+   dl_txn_num                  => { hdr => 'Num',                 num => 0, label => 'Deadlocked transaction number', },
+   event_set                   => { hdr => 'Evt Set?',            num => 1, label => '[Win32] if a wait event is set', },
+   exec_master_log_pos         => { hdr => 'Exec Master Log Pos', num => 1, label => 'Exec Master Log Position' },
+   fk_name                     => { hdr => 'Constraint',          num => 0, label => 'The name of the FK constraint' },
+   free_list_len               => { hdr => 'Free List Len',       num => 1, label => 'Length of the free list' },
+   has_read_view               => { hdr => 'Rd View',             num => 1, label => 'Whether the transaction has a read view' },
+   hash_searches_s             => { hdr => 'Hash/Sec',            num => 1, label => 'Number of hash searches/sec' },
+   hash_table_size             => { hdr => 'Size',                num => 1, label => 'Number of non-hash searches/sec' },
+   heap_no                     => { hdr => 'Heap',                num => 1, label => 'Heap number' },
+   heap_size                   => { hdr => 'Heap',                num => 1, label => 'Heap size' },
+   history_list_len            => { hdr => 'History',             num => 1, label => 'History list length' },
+   host_and_domain             => { hdr => 'Host',                num => 0, label => 'Hostname/IP and domain' },
+   host_and_port               => { hdr => 'Host/IP',             num => 0, label => 'Hostname or IP address, and port number', },
+   hostname                    => { hdr => 'Host',                num => 0, label => 'Hostname' },
+   index                       => { hdr => 'Index',               num => 0, label => 'The index involved' },
+   index_ref                   => { hdr => 'Index Ref',           num => 0, label => 'Index referenced' },
+   info                        => { hdr => 'Query',               num => 0, label => 'Info or the current query', },
+   insert_intention            => { hdr => 'Ins Intent',          num => 1, label => 'Whether the thread was trying to insert' },
+   inserts                     => { hdr => 'Inserts',             num => 1, label => 'Inserts' },
+   io_bytes_s                  => { hdr => 'Bytes/Sec',           num => 1, label => 'Average I/O bytes/sec' },
+   io_flush_type               => { hdr => 'Flush Type',          num => 0, label => 'I/O Flush Type' },
+   io_fsyncs_s                 => { hdr => 'fsyncs/sec',          num => 1, label => 'I/O fsyncs/sec' },
+   io_reads_s                  => { hdr => 'Reads/Sec',           num => 1, label => 'Average I/O reads/sec' },
+   io_writes_s                 => { hdr => 'Writes/Sec',          num => 1, label => 'Average I/O writes/sec' },
+   ip                          => { hdr => 'IP',                  num => 0, label => 'IP address' },
+   is_name_locked              => { hdr => 'Locked',              num => 1, label => 'Whether table is name locked', },
+   key_buffer_hit              => { hdr => 'KCacheHit',           num => 1, label => 'Key cache hit ratio', },
+   key_len                     => { hdr => 'Key Length',          num => 1, label => 'Number of bytes used in the key' },
+   last_chkp                   => { hdr => 'Last Checkpoint',     num => 0, label => 'Last log checkpoint' },
+   last_errno                  => { hdr => 'Last Errno',          num => 1, label => 'Last error number' },
+   last_error                  => { hdr => 'Last Error',          num => 0, label => 'Last error' },
+   last_s_file_name            => { hdr => 'S-File',              num => 0, label => 'Filename where last read locked' },
+   last_s_line                 => { hdr => 'S-Line',              num => 1, label => 'Line where last read locked' },
+   last_x_file_name            => { hdr => 'X-File',              num => 0, label => 'Filename where last write locked' },
+   last_x_line                 => { hdr => 'X-Line',              num => 1, label => 'Line where last write locked' },
+   last_pct                    => { hdr => 'Pct',                 num => 1, label => 'Last Percentage' },
+   last_total                  => { hdr => 'Last Total',          num => 1, label => 'Last Total' },
+   last_value                  => { hdr => 'Last Incr',           num => 1, label => 'Last Value' },
+   load                        => { hdr => 'Load',                num => 1, label => 'Server load' },
+   lock_cfile_name             => { hdr => 'Crtd File',           num => 0, label => 'Filename where lock created' },
+   lock_cline                  => { hdr => 'Crtd Line',           num => 1, label => 'Line where lock created' },
+   lock_mem_addr               => { hdr => 'Addr',                num => 0, label => 'The lock memory address' },
+   lock_mode                   => { hdr => 'Mode',                num => 0, label => 'The lock mode' },
+   lock_structs                => { hdr => 'LStrcts',             num => 1, label => 'Number of lock structs' },
+   lock_type                   => { hdr => 'Type',                num => 0, label => 'The lock type' },
+   lock_var                    => { hdr => 'Lck Var',             num => 1, label => 'The lock variable' },
+   lock_wait_time              => { hdr => 'Wait',                num => 1, label => 'How long txn has waited for a lock' },
+   log_flushed_to              => { hdr => 'Flushed To',          num => 0, label => 'Log position flushed to' },
+   log_ios_done                => { hdr => 'IO Done',             num => 1, label => 'Log I/Os done' },
+   log_ios_s                   => { hdr => 'IO/Sec',              num => 1, label => 'Average log I/Os per sec' },
+   log_seq_no                  => { hdr => 'Sequence No.',        num => 0, label => 'Log sequence number' },
+   main_thread_id              => { hdr => 'Main Thread ID',      num => 1, label => 'Main thread ID' },
+   main_thread_proc_no         => { hdr => 'Main Thread Proc',    num => 1, label => 'Main thread process number' },
+   main_thread_state           => { hdr => 'Main Thread State',   num => 0, label => 'Main thread state' },
+   master_file                 => { hdr => 'File',                num => 0, label => 'Master file' },
+   master_host                 => { hdr => 'Master',              num => 0, label => 'Master server hostname' },
+   master_log_file             => { hdr => 'Master Log File',     num => 0, label => 'Master log file' },
+   master_port                 => { hdr => 'Master Port',         num => 1, label => 'Master port' },
+   master_pos                  => { hdr => 'Position',            num => 1, label => 'Master position' },
+   master_ssl_allowed          => { hdr => 'Master SSL Allowed',  num => 0, label => 'Master SSL Allowed' },
+   master_ssl_ca_file          => { hdr => 'Master SSL CA File',  num => 0, label => 'Master SSL Cert Auth File' },
+   master_ssl_ca_path          => { hdr => 'Master SSL CA Path',  num => 0, label => 'Master SSL Cert Auth Path' },
+   master_ssl_cert             => { hdr => 'Master SSL Cert',     num => 0, label => 'Master SSL Cert' },
+   master_ssl_cipher           => { hdr => 'Master SSL Cipher',   num => 0, label => 'Master SSL Cipher' },
+   master_ssl_key              => { hdr => 'Master SSL Key',      num => 0, label => 'Master SSL Key' },
+   master_user                 => { hdr => 'Master User',         num => 0, label => 'Master username' },
+   max_txn                     => { hdr => 'MaxTxnTime',          num => 1, label => 'MaxTxn' },
+   merged_recs                 => { hdr => 'Merged Recs',         num => 1, label => 'Merged records' },
+   merges                      => { hdr => 'Merges',              num => 1, label => 'Merges' },
+   mutex_os_waits              => { hdr => 'Waits',               num => 1, label => 'Mutex OS Waits' },
+   mutex_spin_rounds           => { hdr => 'Rounds',              num => 1, label => 'Mutex Spin Rounds' },
+   mutex_spin_waits            => { hdr => 'Spins',               num => 1, label => 'Mutex Spin Waits' },
+   mysql_thread_id             => { hdr => 'ID',                  num => 1, label => 'MySQL connection (thread) ID', },
+   name                        => { hdr => 'Name',                num => 0, label => 'Variable Name' },
+   n_bits                      => { hdr => '# Bits',              num => 1, label => 'Number of bits' },
+   non_hash_searches_s         => { hdr => 'Non-Hash/Sec',        num => 1, label => 'Non-hash searches/sec' },
+   num_deletes                 => { hdr => 'Del',                 num => 1, label => 'Number of deletes' },
+   num_deletes_sec             => { hdr => 'Del/Sec',             num => 1, label => 'Number of deletes' },
+   num_inserts                 => { hdr => 'Ins',                 num => 1, label => 'Number of inserts' },
+   num_inserts_sec             => { hdr => 'Ins/Sec',             num => 1, label => 'Number of inserts' },
+   num_readers                 => { hdr => 'Readers',             num => 1, label => 'Number of readers' },
+   num_reads                   => { hdr => 'Read',                num => 1, label => 'Number of reads' },
+   num_reads_sec               => { hdr => 'Read/Sec',            num => 1, label => 'Number of reads' },
+   num_res_ext                 => { hdr => 'BTree Extents',       num => 1, label => 'Number of extents reserved for B-Tree' },
+   num_rows                    => { hdr => 'Row Count',           num => 1, label => 'Number of rows estimated to examine' },
+   num_times_open              => { hdr => 'In Use',              num => 1, label => '# times table is opened', },
+   num_txns                    => { hdr => 'Txns',                num => 1, label => 'Number of transactions' },
+   num_updates                 => { hdr => 'Upd',                 num => 1, label => 'Number of updates' },
+   num_updates_sec             => { hdr => 'Upd/Sec',             num => 1, label => 'Number of updates' },
+   os_file_reads               => { hdr => 'OS Reads',            num => 1, label => 'OS file reads' },
+   os_file_writes              => { hdr => 'OS Writes',           num => 1, label => 'OS file writes' },
+   os_fsyncs                   => { hdr => 'OS fsyncs',           num => 1, label => 'OS fsyncs' },
+   os_thread_id                => { hdr => 'OS Thread',           num => 1, label => 'The operating system thread ID' },
+   p_aio_writes                => { hdr => 'Async Wrt',           num => 1, label => 'Pending asynchronous I/O writes' },
+   p_buf_pool_flushes          => { hdr => 'Buffer Pool Flushes', num => 1, label => 'Pending buffer pool flushes' },
+   p_ibuf_aio_reads            => { hdr => 'IBuf Async Rds',      num => 1, label => 'Pending insert buffer asynch I/O reads' },
+   p_log_flushes               => { hdr => 'Log Flushes',         num => 1, label => 'Pending log flushes' },
+   p_log_ios                   => { hdr => 'Log I/Os',            num => 1, label => 'Pending log I/O operations' },
+   p_normal_aio_reads          => { hdr => 'Async Rds',           num => 1, label => 'Pending asynchronous I/O reads' },
+   p_preads                    => { hdr => 'preads',              num => 1, label => 'Pending p-reads' },
+   p_pwrites                   => { hdr => 'pwrites',             num => 1, label => 'Pending p-writes' },
+   p_sync_ios                  => { hdr => 'Sync I/Os',           num => 1, label => 'Pending synchronous I/O operations' },
+   page_creates_sec            => { hdr => 'Creates/Sec',         num => 1, label => 'Page creates/sec' },
+   page_no                     => { hdr => 'Page',                num => 1, label => 'Page number' },
+   page_reads_sec              => { hdr => 'Reads/Sec',           num => 1, label => 'Page reads per second' },
+   page_writes_sec             => { hdr => 'Writes/Sec',          num => 1, label => 'Page writes per second' },
+   pages_created               => { hdr => 'Created',             num => 1, label => 'Pages created' },
+   pages_modified              => { hdr => 'Dirty Pages',         num => 1, label => 'Pages modified (dirty)' },
+   pages_read                  => { hdr => 'Reads',               num => 1, label => 'Pages read' },
+   pages_total                 => { hdr => 'Pages',               num => 1, label => 'Pages total' },
+   pages_written               => { hdr => 'Writes',              num => 1, label => 'Pages written' },
+   parent_col                  => { hdr => 'Parent Column',       num => 0, label => 'The referred column in the parent table', },
+   parent_db                   => { hdr => 'Parent DB',           num => 0, label => 'The database of the parent table' },
+   parent_index                => { hdr => 'Parent Index',        num => 0, label => 'The referred index in the parent table' },
+   parent_table                => { hdr => 'Parent Table',        num => 0, label => 'The parent table' },
+   part_id                     => { hdr => 'Part ID',             num => 1, label => 'Sub-part ID of the query' },
+   partitions                  => { hdr => 'Partitions',          num => 0, label => 'Query partitions used' },
+   pct                         => { hdr => 'Pct',                 num => 1, label => 'Percentage' },
+   pending_chkp_writes         => { hdr => 'Chkpt Writes',        num => 1, label => 'Pending log checkpoint writes' },
+   pending_log_writes          => { hdr => 'Log Writes',          num => 1, label => 'Pending log writes' },
+   port                        => { hdr => 'Port',                num => 1, label => 'Client port number', },
+   possible_keys               => { hdr => 'Poss. Keys',          num => 0, label => 'Possible keys' },
+   proc_no                     => { hdr => 'Proc',                num => 1, label => 'Process number' },
+   q_cache_hit                 => { hdr => 'QCacheHit',           num => 1, label => 'Query cache hit ratio', },
+   qps                         => { hdr => 'QPS',                 num => 1, label => 'How many queries/sec', },
+   queries_in_queue            => { hdr => 'Queries Queued',      num => 1, label => 'Queries in queue' },
+   queries_inside              => { hdr => 'Queries Inside',      num => 1, label => 'Queries inside InnoDB' },
+   query_id                    => { hdr => 'Query ID',            num => 1, label => 'Query ID' },
+   query_status                => { hdr => 'Query Status',        num => 0, label => 'The query status' },
+   query_text                  => { hdr => 'Query Text',          num => 0, label => 'The query text' },
+   questions                   => { hdr => 'Questions',           num => 1, label => 'How many queries the server has gotten', },
+   read_master_log_pos         => { hdr => 'Read Master Pos',     num => 1, label => 'Read master log position' },
+   read_views_open             => { hdr => 'Rd Views',            num => 1, label => 'Number of read views open' },
+   reads_pending               => { hdr => 'Pending Reads',       num => 1, label => 'Reads pending' },
+   relay_log_file              => { hdr => 'Relay File',          num => 0, label => 'Relay log file' },
+   relay_log_pos               => { hdr => 'Relay Pos',           num => 1, label => 'Relay log position' },
+   relay_log_size              => { hdr => 'Relay Size',          num => 1, label => 'Relay log size' },
+   relay_master_log_file       => { hdr => 'Relay Master File',   num => 0, label => 'Relay master log file' },
+   replicate_do_db             => { hdr => 'Do DB',               num => 0, label => 'Replicate-do-db setting' },
+   replicate_do_table          => { hdr => 'Do Table',            num => 0, label => 'Replicate-do-table setting' },
+   replicate_ignore_db         => { hdr => 'Ignore DB',           num => 0, label => 'Replicate-ignore-db setting' },
+   replicate_ignore_table      => { hdr => 'Ignore Table',        num => 0, label => 'Replicate-ignore-table setting' },
+   replicate_wild_do_table     => { hdr => 'Wild Do Table',       num => 0, label => 'Replicate-wild-do-table setting' },
+   replicate_wild_ignore_table => { hdr => 'Wild Ignore Table',   num => 0, label => 'Replicate-wild-ignore-table setting' },
+   request_type                => { hdr => 'Type',                num => 0, label => 'Type of lock the thread waits for' },
+   reservation_count           => { hdr => 'ResCnt',              num => 1, label => 'Reservation Count' },
+   row_locks                   => { hdr => 'RLocks',              num => 1, label => 'Number of row locks' },
+   rw_excl_os_waits            => { hdr => 'RW Waits',            num => 1, label => 'R/W Excl. OS Waits' },
+   rw_excl_spins               => { hdr => 'RW Spins',            num => 1, label => 'R/W Excl. Spins' },
+   rw_shared_os_waits          => { hdr => 'Sh Waits',            num => 1, label => 'R/W Shared OS Waits' },
+   rw_shared_spins             => { hdr => 'Sh Spins',            num => 1, label => 'R/W Shared Spins' },
+   scan_type                   => { hdr => 'Type',                num => 0, label => 'Scan type in chosen' },
+   seg_size                    => { hdr => 'Seg. Size',           num => 1, label => 'Segment size' },
+   select_type                 => { hdr => 'Select Type',         num => 0, label => 'Type of select used' },
+   signal_count                => { hdr => 'Signals',             num => 1, label => 'Signal Count' },
+   size                        => { hdr => 'Size',                num => 1, label => 'Size of the tablespace' },
+   skip_counter                => { hdr => 'Skip Counter',        num => 1, label => 'Skip counter' },
+   slave_catchup_rate          => { hdr => 'Catchup',             num => 1, label => 'How fast the slave is catching up in the binlog' },
+   slave_io_running            => { hdr => 'Slave-IO',            num => 0, label => 'Whether the slave I/O thread is running' },
+   slave_io_state              => { hdr => 'Slave IO State',      num => 0, label => 'Slave I/O thread state' },
+   slave_open_temp_tables      => { hdr => 'Temp',                num => 1, label => 'Slave open temp tables' },
+   slave_sql_running           => { hdr => 'Slave-SQL',           num => 0, label => 'Whether the slave SQL thread is running' },
+   slow                        => { hdr => 'Slow',                num => 1, label => 'How many slow queries', },
+   space_id                    => { hdr => 'Space',               num => 1, label => 'Tablespace ID' },
+   special                     => { hdr => 'Special',             num => 0, label => 'Special/Other info' },
+   state                       => { hdr => 'State',               num => 0, label => 'Connection state', maxw => 18, },
+   tables_in_use               => { hdr => 'Tbl Used',            num => 1, label => 'Number of tables in use' },
+   tables_locked               => { hdr => 'Tbl Lck',             num => 1, label => 'Number of tables locked' },
+   tbl                         => { hdr => 'Table',               num => 0, label => 'Table', },
+   thread                      => { hdr => 'Thread',              num => 1, label => 'Thread number' },
+   thread_decl_inside          => { hdr => 'Thread Inside',       num => 0, label => 'What the thread is declared inside' },
+   thread_purpose              => { hdr => 'Purpose',             num => 0, label => "The thread's purpose" },
+   thread_status               => { hdr => 'Thread Status',       num => 0, label => 'The thread status' },
+   time                        => { hdr => 'Time',                num => 1, label => 'Time since the last event', },
+   time_behind_master          => { hdr => 'TimeLag',             num => 1, label => 'Time slave lags master' },
+   timestring                  => { hdr => 'Timestring',          num => 0, label => 'Time the event occurred' },
+   total                       => { hdr => 'Total',               num => 1, label => 'Total' },
+   total_mem_alloc             => { hdr => 'Memory',              num => 1, label => 'Total memory allocated' },
+   truncates                   => { hdr => 'Trunc',               num => 0, label => 'Whether the deadlock is truncating InnoDB status' },
+   txn_doesnt_see_ge           => { hdr => "Txn Won't See",       num => 0, label => 'Where txn read view is limited' },
+   txn_id                      => { hdr => 'ID',                  num => 0, label => 'Transaction ID' },
+   txn_sees_lt                 => { hdr => 'Txn Sees',            num => 1, label => 'Where txn read view is limited' },
+   txn_status                  => { hdr => 'Txn Status',          num => 0, label => 'Transaction status' },
+   txn_time_remain             => { hdr => 'Remaining',           num => 1, label => 'Time until txn rollback/commit completes' },
+   undo_log_entries            => { hdr => 'Undo',                num => 1, label => 'Number of undo log entries' },
+   undo_for                    => { hdr => 'Undo',                num => 0, label => 'Undo for' },
+   until_condition             => { hdr => 'Until Condition',     num => 0, label => 'Slave until condition' },
+   until_log_file              => { hdr => 'Until Log File',      num => 0, label => 'Slave until log file' },
+   until_log_pos               => { hdr => 'Until Log Pos',       num => 1, label => 'Slave until log position' },
+   used_cells                  => { hdr => 'Cells Used',          num => 1, label => 'Number of cells used' },
+   used_bufs                   => { hdr => 'Used Bufs',           num => 1, label => 'Number of buffer pool pages used' },
+   user                        => { hdr => 'User',                num => 0, label => 'Database username', },
+   value                       => { hdr => 'Value',               num => 1, label => 'Value' },
+   versions                    => { hdr => 'Versions',            num => 1, label => 'Number of InnoDB MVCC versions unpurged' },
+   victim                      => { hdr => 'Victim',              num => 0, label => 'Whether this txn was the deadlock victim' },
+   wait_array_size             => { hdr => 'Wait Array Size',     num => 1, label => 'Wait Array Size' },
+   wait_status                 => { hdr => 'Lock Status',         num => 0, label => 'Status of txn locks' },
+   waited_at_filename          => { hdr => 'File',                num => 0, label => 'Filename at which thread waits' },
+   waited_at_line              => { hdr => 'Line',                num => 1, label => 'Line at which thread waits' },
+   waiters_flag                => { hdr => 'Waiters',             num => 1, label => 'Waiters Flag' },
+   waiting                     => { hdr => 'Waiting',             num => 1, label => 'Whether lock is being waited for' },
+   when                        => { hdr => 'When',                num => 0, label => 'Time scale' },
+   writer_lock_mode            => { hdr => 'Wrtr Lck Mode',       num => 0, label => 'Writer lock mode' },
+   writer_thread               => { hdr => 'Wrtr Thread',         num => 1, label => 'Writer thread ID' },
+   writes_pending              => { hdr => 'Writes',              num => 1, label => 'Number of writes pending' },
+   writes_pending_flush_list   => { hdr => 'Flush List Writes',   num => 1, label => 'Number of flush list writes pending' },
+   writes_pending_lru          => { hdr => 'LRU Writes',          num => 1, label => 'Number of LRU writes pending' },
+   writes_pending_single_page  => { hdr => '1-Page Writes',       num => 1, label => 'Number of 1-page writes pending' },
+);
+
+# Fill in default column properties.  Unless a column says otherwise it has no
+# width constraints, is aligned left, and sorts alphabetically, not numerically.
+foreach my $spec ( values %columns ) {
+   $spec->{$_} ||= 0 for qw(num minw maxw);
+   $spec->{just} = $spec->{num} ? '' : '-';
+}
+
+# Filters {{{3
+# This hash defines every filter that can be applied to a table; these become
+# part of tbl_meta as well.  A filter is an expression returning true or false.
+# Properties of each entry:
+#  * func:   the subroutine
+#  * name:   the name, repeated
+#  * user:   whether it's a user-defined filter (saved in config)
+#  * text:   text of the subroutine
+#  * note:   explanation
+#  * tbls:   arrayref of table names associated with the filter
+my %filters = ();
+
+# Built-in filters: 'text' is Perl source that tests $set; it is compiled and stored in %filters with its note and tbls.
+my %builtin_filters = (
+   hide_self => {
+      text => <<'      END',
+         return ( !$set->{info} || $set->{info} ne 'SHOW FULL PROCESSLIST' )
+             && ( !$set->{query_text}    || $set->{query_text} !~ m/INNODB STATUS$/ );
+      END
+      note => 'Removes the innotop processes from the list',
+      tbls => [qw(innodb_transactions processlist)],
+   },
+   hide_inactive => {
+      text => <<'      END',
+         return ( !defined($set->{txn_status}) || $set->{txn_status} ne 'not started' )
+             && ( !defined($set->{cmd})        || $set->{cmd} !~ m/Sleep|Binlog Dump/ )
+             && ( !defined($set->{info})       || $set->{info} =~ m/\S/               );
+      END
+      note => 'Removes processes which are not doing anything',
+      tbls => [qw(innodb_transactions processlist)],
+   },
+   hide_slave_io => {
+      text => <<'      END',
+         return !$set->{state} || $set->{state} !~ m/^(?:Waiting for master|Has read all relay)/;
+      END
+      note => 'Removes slave I/O threads from the list',
+      tbls => [qw(processlist slave_io_status)],
+   },
+   table_is_open => {
+      text => <<'      END',
+         return $set->{num_times_open} + $set->{is_name_locked};
+      END
+      note => 'Removes tables that are not in use or locked',
+      tbls => [qw(open_tables)],
+   },
+   cxn_is_master => {
+      text => <<'      END',
+         return $set->{master_file} ? 1 : 0;
+      END
+      note => 'Removes servers that are not masters',
+      tbls => [qw(master_status)],
+   },
+   cxn_is_slave => {
+      text => <<'      END',
+         return $set->{master_host} ? 1 : 0;
+      END
+      note => 'Removes servers that are not slaves',
+      tbls => [qw(slave_io_status slave_sql_status)],
+   },
+   thd_is_not_waiting => {
+      text => <<'      END',
+         return $set->{thread_status} !~ m#waiting for i/o request#;
+      END
+      note => 'Removes idle I/O threads',
+      tbls => [qw(io_threads)],
+   },
+);
+# Compile the text of every built-in filter and register the result in %filters.
+foreach my $name ( keys %builtin_filters ) {
+   my $builtin = $builtin_filters{$name};
+   my ( $code, $err ) = compile_filter($builtin->{text}); # $err is not checked here
+   $filters{$name} = {
+      name => $name, # the key, repeated; useful for later
+      user => 0,     # built-in, i.e. not a user-defined filter
+      func => $code,
+      ( map { $_ => $builtin->{$_} } qw(text note tbls) ),
+   };
+}
+
+# Variable sets {{{3
+# Named sets of variables/expressions used in S mode.  Each set is a hashref whose
+# 'text' is a comma-separated list.  They are read/written to the config file.
+my %var_sets = (
+   general => {
+      text => join(
+         ', ',
+         'set_precision(Questions/Uptime_hires) as QPS',
+         'set_precision(Com_commit/Uptime_hires) as Commit_PS',
+         'set_precision((Com_rollback||0)/(Com_commit||1)) as Rollback_Commit',
+         'set_precision(('
+            . join('+', map { "($_||0)" }
+               qw(Com_delete Com_delete_multi Com_insert Com_insert_select Com_replace
+                  Com_replace_select Com_select Com_update Com_update_multi))
+            . ')/(Com_commit||1)) as Write_Commit',
+         'set_precision((Com_select+(Qcache_hits||0))/(('
+            . join('+', map { "($_||0)" }
+               qw(Com_delete Com_delete_multi Com_insert Com_insert_select Com_replace
+                  Com_replace_select Com_select Com_update Com_update_multi))
+            . ')||1)) as R_W_Ratio',
+         'set_precision(Opened_tables/Uptime_hires) as Opens_PS',
+         'percent($cur->{Open_tables}/($cur->{table_cache}||1)) as Table_Cache_Used',
+         'set_precision(Threads_created/Uptime_hires) as Threads_PS',
+         'percent($cur->{Threads_cached}/($cur->{thread_cache_size}||1)) as Thread_Cache_Used',
+         'percent($cur->{Max_used_connections}/($cur->{max_connections}||1)) as CXN_Used_Ever',
+         'percent($cur->{Threads_connected}/($cur->{max_connections}||1)) as CXN_Used_Now',
+      ),
+   },
+   commands => {
+      text => join(
+         ', ',
+         qw(Uptime Questions Com_delete Com_delete_multi Com_insert
+         Com_insert_select Com_replace Com_replace_select Com_select Com_update
+         Com_update_multi)
+      ),
+   },
+   query_status => {
+      text => join(
+         ',',
+         qw( Uptime Select_full_join Select_full_range_join Select_range
+         Select_range_check Select_scan Slow_queries Sort_merge_passes
+         Sort_range Sort_rows Sort_scan)
+      ),
+   },
+   innodb => {
+      text => join(
+         ',',
+         qw( Uptime Innodb_row_lock_current_waits Innodb_row_lock_time
+         Innodb_row_lock_time_avg Innodb_row_lock_time_max Innodb_row_lock_waits
+         Innodb_rows_deleted Innodb_rows_inserted Innodb_rows_read
+         Innodb_rows_updated)
+      ),
+   },
+   txn => {
+      text => join(
+         ',',
+         qw( Uptime Com_begin Com_commit Com_rollback Com_savepoint
+         Com_xa_commit Com_xa_end Com_xa_prepare Com_xa_recover Com_xa_rollback
+         Com_xa_start)
+      ),
+   },
+   key_cache => {
+      text => join(
+         ',',
+         qw( Uptime Key_blocks_not_flushed Key_blocks_unused Key_blocks_used
+         Key_read_requests Key_reads Key_write_requests Key_writes )
+      ),
+   },
+   query_cache => {
+      text => join(
+         ',',
+         "percent($exprs{QcacheHitRatio}) as Hit_Pct",
+         'set_precision((Qcache_hits||0)/(Qcache_inserts||1)) as Hit_Ins',
+         'set_precision((Qcache_lowmem_prunes||0)/Uptime_hires) as Lowmem_Prunes_sec',
+         'percent(1-((Qcache_free_blocks||0)/(Qcache_total_blocks||1))) as Blocks_used',
+         qw( Qcache_free_blocks Qcache_free_memory Qcache_not_cached Qcache_queries_in_cache)
+      ),
+   },
+   handler => {
+      text => join(
+         ',',
+         qw( Uptime Handler_read_key Handler_read_first Handler_read_next
+         Handler_read_prev Handler_read_rnd Handler_read_rnd_next Handler_delete
+         Handler_update Handler_write)
+      ),
+   },
+   cxns_files_threads => {
+      text => join(
+         ',',
+         qw( Uptime Aborted_clients Aborted_connects Bytes_received Bytes_sent
+         Compression Connections Created_tmp_disk_tables Created_tmp_files
+         Created_tmp_tables Max_used_connections Open_files Open_streams
+         Open_tables Opened_tables Table_locks_immediate Table_locks_waited
+         Threads_cached Threads_connected Threads_created Threads_running)
+      ),
+   },
+   prep_stmt => {
+      text => join(
+         ',',
+         qw( Uptime Com_dealloc_sql Com_execute_sql Com_prepare_sql Com_reset
+         Com_stmt_close Com_stmt_execute Com_stmt_fetch Com_stmt_prepare
+         Com_stmt_reset Com_stmt_send_long_data )
+      ),
+   },
+   innodb_health => {
+      text => join(
+         ',',
+         "$exprs{OldVersions} as OldVersions",
+         qw(IB_sm_mutex_spin_waits IB_sm_mutex_spin_rounds IB_sm_mutex_os_waits),
+         "$exprs{NumTxns} as NumTxns",
+         "$exprs{MaxTxnTime} as MaxTxnTime",
+         qw(IB_ro_queries_inside IB_ro_queries_in_queue),
+         "set_precision($exprs{DirtyBufs} * 100) as dirty_bufs",
+         "set_precision($exprs{BufPoolFill} * 100) as buf_fill",
+         qw(IB_bp_pages_total IB_bp_pages_read IB_bp_pages_written IB_bp_pages_created)
+      ),
+   },
+   innodb_health2 => {
+      text => join(
+         ', ',
+         'percent(1-((Innodb_buffer_pool_pages_free||0)/($cur->{Innodb_buffer_pool_pages_total}||1))) as BP_page_cache_usage',
+         'percent(1-((Innodb_buffer_pool_reads||0)/(Innodb_buffer_pool_read_requests||1))) as BP_cache_hit_ratio',
+         'Innodb_buffer_pool_wait_free',
+         'Innodb_log_waits',
+      ),
+   },
+   slow_queries => {
+      text => join(
+         ', ',
+         'set_precision(Slow_queries/Uptime_hires) as Slow_PS',
+         'set_precision(Select_full_join/Uptime_hires) as Full_Join_PS',
+         'percent(Select_full_join/(Com_select||1)) as Full_Join_Ratio',
+      ),
+   },
+);
+
+# Server sets {{{3
+# Named sets of servers between which the user can quickly switch (declared empty here).
+my %server_groups;
+
+# Connections {{{3
+# This hash defines server connections.  Each connection is a string that can be passed to
+# DBI when connecting.  These are saved in the connections section of the config file.
+my %connections;
+# The names of the parts that make up a connection.
+my @conn_parts = qw(user have_user pass have_pass dsn savepass dl_table);
+
+# Graph widths {{{3
+# Maximum values seen for various status/variable values, used for graphing.  They are
+# stored in their own section in the config file; the ones below are just initial values:
+my %mvs = (
+   Questions => 100,
+   (
+      map { $_ => 50 }
+         qw(Com_select Com_insert Com_update Com_delete)
+   ),
+);
+
+# ###########################################################################
+# Lookup set (name => 1) of the valid Term::ANSIColor color strings.
+# ###########################################################################
+my %ansicolors;
+$ansicolors{$_} = 1 for qw(
+   black blink blue bold clear concealed cyan dark green magenta on_black on_blue on_cyan
+   on_green on_magenta on_red on_white on_yellow red reset reverse underline underscore white yellow );
+
+# ###########################################################################
+# Comparison operators accepted in color rules, each mapped to its description.
+# ###########################################################################
+my %comp_ops = (
+   '==' => 'Numeric equality',
+   '!=' => 'Numeric not-equal',
+   '<'  => 'Numeric less-than',
+   '<=' => 'Numeric less-than/equal',
+   '>'  => 'Numeric greater-than',
+   '>=' => 'Numeric greater-than/equal',
+   'eq' => 'String equality',
+   'ne' => 'String not-equal',
+   'lt' => 'String less-than',
+   'le' => 'String less-than/equal',
+   'gt' => 'String greater-than',
+   'ge' => 'String greater-than/equal',
+   '=~' => 'Pattern match',
+   '!~' => 'Negated pattern match',
+);
+
+# ###########################################################################
+# Aggregate functions, by name; each takes the list of values to aggregate.
+# ###########################################################################
+my %agg_funcs = (
+   sum   => \&sum,
+   first => sub { return $_[0] },
+   count => sub { return 0 + @_ },
+   avg   => sub {
+      # Only defined arguments count towards the divisor.  Every run of digits,
+      # dots and minus signs in an argument is summed, so one may contribute several.
+      my @defined = grep { defined } @_;
+      my $total   = sum( map { m/([\d\.-]+)/g } @defined ) || 0;
+      my $divisor = @defined || 1;
+      return $total / $divisor;
+   },
+);
+
+# ###########################################################################
+# Functions usable as column transformations: name => reference to the sub of that name.
+# ###########################################################################
+my %trans_funcs = (
+   shorten      => \&shorten,
+   secs_to_time => \&secs_to_time,
+   no_ctrl_char => \&no_ctrl_char,
+   percent      => \&percent,
+   commify      => \&commify,
+   dulint_to_int => \&dulint_to_int,
+   set_precision => \&set_precision,
+);
+
+# Table definitions {{{3
+# This hash defines every table that can get displayed in every mode.  Each
+# table specifies columns and column data sources.  The column is
+# defined by the %columns hash.
+#
+# Example: foo => { src => 'bar' } means the foo column (look at
+# $columns{foo} for its definition) gets its data from the 'bar' element of
+# the current data set, whatever that is.
+#
+# These columns are post-processed after being defined, because they get stuff
+# from %columns.  After all the config is loaded for columns, there's more
+# post-processing too; the subroutines compiled from src get added to
+# the hash elements for extract_values to use.
+# ###########################################################################
+
+my %tbl_meta = (
+   adaptive_hash_index => {
+      capt => 'Adaptive Hash Index',
+      cust => {},
+      cols => {
+         cxn                 => { src => 'cxn' },
+         hash_table_size     => { src => 'IB_ib_hash_table_size', trans => [qw(shorten)], },
+         used_cells          => { src => 'IB_ib_used_cells' },
+         bufs_in_node_heap   => { src => 'IB_ib_bufs_in_node_heap' },
+         hash_searches_s     => { src => 'IB_ib_hash_searches_s' },
+         non_hash_searches_s => { src => 'IB_ib_non_hash_searches_s' },
+      },
+      visible => [ qw(cxn hash_table_size used_cells bufs_in_node_heap hash_searches_s non_hash_searches_s) ],
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'ib',
+      group_by => [],
+      aggregate => 0,
+   },
+   buffer_pool => { # Table definition captioned 'Buffer Pool'; apart from cxn, every src is an IB_bp_* field.
+      capt => 'Buffer Pool',
+      cust => {},
+      cols => { # column name => definition; each name must also exist in %columns (the init loop after this hash dies otherwise)
+         cxn                        => { src => 'cxn' },
+         total_mem_alloc            => { src => 'IB_bp_total_mem_alloc', trans => [qw(shorten)], }, # trans names must be keys of %trans_funcs
+         awe_mem_alloc              => { src => 'IB_bp_awe_mem_alloc', trans => [qw(shorten)], },
+         add_pool_alloc             => { src => 'IB_bp_add_pool_alloc', trans => [qw(shorten)], },
+         buf_pool_size              => { src => 'IB_bp_buf_pool_size', trans => [qw(shorten)], },
+         buf_free                   => { src => 'IB_bp_buf_free' },
+         buf_pool_hit_rate          => { src => 'IB_bp_buf_pool_hit_rate' },
+         buf_pool_reads             => { src => 'IB_bp_buf_pool_reads' },
+         buf_pool_hits              => { src => 'IB_bp_buf_pool_hits' },
+         dict_mem_alloc             => { src => 'IB_bp_dict_mem_alloc' },
+         pages_total                => { src => 'IB_bp_pages_total' },
+         pages_modified             => { src => 'IB_bp_pages_modified' },
+         reads_pending              => { src => 'IB_bp_reads_pending' },
+         writes_pending             => { src => 'IB_bp_writes_pending' },
+         writes_pending_lru         => { src => 'IB_bp_writes_pending_lru' },
+         writes_pending_flush_list  => { src => 'IB_bp_writes_pending_flush_list' },
+         writes_pending_single_page => { src => 'IB_bp_writes_pending_single_page' },
+         page_creates_sec           => { src => 'IB_bp_page_creates_sec' },
+         page_reads_sec             => { src => 'IB_bp_page_reads_sec' },
+         page_writes_sec            => { src => 'IB_bp_page_writes_sec' },
+         pages_created              => { src => 'IB_bp_pages_created' },
+         pages_read                 => { src => 'IB_bp_pages_read' },
+         pages_written              => { src => 'IB_bp_pages_written' },
+      },
+      visible => [ qw(cxn buf_pool_size buf_free pages_total pages_modified buf_pool_hit_rate total_mem_alloc add_pool_alloc)], # subset of cols; presumably the columns shown by default -- confirm
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'bp', # NOTE(review): looks like a code for the InnoDB status section this table reads -- confirm
+      group_by => [],
+      aggregate => 0,
+   },
+   # TODO: a new step in set_to_tbl: join result to itself, grouped?
+   # TODO: this would also enable pulling Q and T data together.
+   # TODO: using a SQL-ish language would also allow pivots to be easier -- treat the pivoted data as a view and SELECT from it.
+   cmd_summary => { # Table definition captioned 'Command Summary'; grouped by name with aggregate => 1.
+      capt => 'Command Summary',
+      cust => {},
+      cols => {
+         name       => { src => 'name' },
+         total      => { src => 'total' },
+         value      => { src => 'value',                     agg   => 'sum'}, # agg names must be keys of %agg_funcs
+         pct        => { src => 'value/total',               trans => [qw(percent)] }, # src is an expression over two fields, not a single field name
+         last_total => { src => 'last_total' },
+         last_value => { src => 'last_value',                agg   => 'sum'},
+         last_pct   => { src => 'last_value/last_total',     trans => [qw(percent)] },
+      },
+      visible   => [qw(name value pct last_value last_pct)], # total and last_total are defined but not listed here
+      filters   => [qw()],
+      sort_cols => '-value', # NOTE(review): leading '-' presumably means descending order -- confirm
+      sort_dir  => '1',
+      innodb    => '',
+      group_by  => [qw(name)],
+      aggregate => 1,
+   },
+   deadlock_locks => { # Table definition captioned 'Deadlock Locks'.
+      capt => 'Deadlock Locks',
+      cust => {},
+      cols => { # column name => definition; here every src is a plain field name
+         cxn              => { src => 'cxn' },
+         mysql_thread_id  => { src => 'mysql_thread_id' },
+         dl_txn_num       => { src => 'dl_txn_num' },
+         lock_type        => { src => 'lock_type' },
+         space_id         => { src => 'space_id' },
+         page_no          => { src => 'page_no' },
+         heap_no          => { src => 'heap_no' },
+         n_bits           => { src => 'n_bits' },
+         index            => { src => 'index' },
+         db               => { src => 'db' },
+         tbl              => { src => 'table' }, # the only column here whose name differs from its source field
+         lock_mode        => { src => 'lock_mode' },
+         special          => { src => 'special' },
+         insert_intention => { src => 'insert_intention' },
+         waiting          => { src => 'waiting' },
+      },
+      visible => [ qw(cxn mysql_thread_id waiting lock_mode db tbl index special insert_intention)], # dl_txn_num, lock_type, space_id, page_no, heap_no and n_bits are defined but not listed
+      filters => [],
+      sort_cols => 'cxn mysql_thread_id', # space-separated list of column names
+      sort_dir => '1',
+      innodb   => 'dl', # same code as deadlock_transactions
+      group_by => [],
+      aggregate => 0,
+   },
+   deadlock_transactions => { # Table definition captioned 'Deadlock Transactions'.
+      capt => 'Deadlock Transactions',
+      cust => {},
+      cols => {
+         cxn                => { src => 'cxn' },
+         active_secs        => { src => 'active_secs' }, # same source as 'time' below, without the secs_to_time transformation
+         dl_txn_num         => { src => 'dl_txn_num' },
+         has_read_view      => { src => 'has_read_view' },
+         heap_size          => { src => 'heap_size' },
+         host_and_domain    => { src => 'hostname' }, # the raw 'hostname' field
+         hostname           => { src => $exprs{Host} }, # src comes from the shared %exprs table (defined outside this hash)
+         ip                 => { src => 'ip' },
+         lock_structs       => { src => 'lock_structs' },
+         lock_wait_time     => { src => 'lock_wait_time', trans => [ qw(secs_to_time) ] },
+         mysql_thread_id    => { src => 'mysql_thread_id' },
+         os_thread_id       => { src => 'os_thread_id' },
+         proc_no            => { src => 'proc_no' },
+         query_id           => { src => 'query_id' },
+         query_status       => { src => 'query_status' },
+         query_text         => { src => 'query_text', trans => [ qw(no_ctrl_char) ] },
+         row_locks          => { src => 'row_locks' },
+         tables_in_use      => { src => 'tables_in_use' },
+         tables_locked      => { src => 'tables_locked' },
+         thread_decl_inside => { src => 'thread_decl_inside' },
+         thread_status      => { src => 'thread_status' },
+         'time'             => { src => 'active_secs', trans => [ qw(secs_to_time) ] },
+         timestring         => { src => 'timestring' },
+         txn_doesnt_see_ge  => { src => 'txn_doesnt_see_ge' },
+         txn_id             => { src => 'txn_id' },
+         txn_sees_lt        => { src => 'txn_sees_lt' },
+         txn_status         => { src => 'txn_status' },
+         truncates          => { src => 'truncates' },
+         undo_log_entries   => { src => 'undo_log_entries' },
+         user               => { src => 'user' },
+         victim             => { src => 'victim' },
+         wait_status        => { src => 'lock_wait_status' }, # column name differs from its source field
+      },
+      visible => [ qw(cxn mysql_thread_id timestring user hostname victim time undo_log_entries lock_structs query_text)], # subset of cols; presumably the default display -- confirm
+      filters => [],
+      sort_cols => 'cxn mysql_thread_id',
+      sort_dir => '1',
+      innodb   => 'dl', # same code as deadlock_locks
+      group_by => [],
+      aggregate => 0,
+   },
+   explain => { # Table definition captioned 'EXPLAIN Results'; most columns rename their source field.
+      capt => 'EXPLAIN Results',
+      cust => {},
+      cols => { # column name => definition; src is the field the value is read from
+         part_id       => { src => 'id' },
+         select_type   => { src => 'select_type' },
+         tbl           => { src => 'table' },
+         partitions    => { src => 'partitions' },
+         scan_type     => { src => 'type' },
+         possible_keys => { src => 'possible_keys' },
+         index         => { src => 'key' },
+         key_len       => { src => 'key_len' },
+         index_ref     => { src => 'ref' },
+         num_rows      => { src => 'rows' },
+         special       => { src => 'extra' },
+      },
+      visible => [ qw(select_type tbl partitions scan_type possible_keys index key_len index_ref num_rows special)], # every column except part_id
+      filters => [],
+      sort_cols => '', # no sort columns configured
+      sort_dir => '1',
+      innodb   => '',
+      group_by => [],
+      aggregate => 0,
+   },
+   file_io_misc => { # Table definition captioned 'File I/O Misc'; apart from cxn, every src is an IB_io_* field.
+      capt => 'File I/O Misc',
+      cust => {},
+      cols => {
+         cxn            => { src => 'cxn' },
+         io_bytes_s     => { src => 'IB_io_avg_bytes_s' }, # note: source name has an extra 'avg_' compared to the column name
+         io_flush_type  => { src => 'IB_io_flush_type' },
+         io_fsyncs_s    => { src => 'IB_io_fsyncs_s' },
+         io_reads_s     => { src => 'IB_io_reads_s' },
+         io_writes_s    => { src => 'IB_io_writes_s' },
+         os_file_reads  => { src => 'IB_io_os_file_reads' },
+         os_file_writes => { src => 'IB_io_os_file_writes' },
+         os_fsyncs      => { src => 'IB_io_os_fsyncs' },
+      },
+      visible => [ qw(cxn os_file_reads os_file_writes os_fsyncs io_reads_s io_writes_s io_bytes_s)], # io_flush_type and io_fsyncs_s are defined but not listed
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'io', # same code as io_threads and pending_io
+      group_by => [],
+      aggregate => 0,
+   },
+   fk_error => { # Table definition captioned 'Foreign Key Error Info'; every src is an IB_fk_* field.
+      capt => 'Foreign Key Error Info',
+      cust => {},
+      cols => { # unlike most neighbouring tables there is no cxn column
+         timestring   => { src => 'IB_fk_timestring' },
+         child_db     => { src => 'IB_fk_child_db' },
+         child_table  => { src => 'IB_fk_child_table' },
+         child_index  => { src => 'IB_fk_child_index' },
+         fk_name      => { src => 'IB_fk_fk_name' },
+         parent_db    => { src => 'IB_fk_parent_db' },
+         parent_table => { src => 'IB_fk_parent_table' },
+         parent_col   => { src => 'IB_fk_parent_col' },
+         parent_index => { src => 'IB_fk_parent_index' },
+         attempted_op => { src => 'IB_fk_attempted_op' },
+      },
+      visible => [ qw(timestring child_db child_table child_index parent_db parent_table parent_col parent_index fk_name attempted_op)], # all ten columns
+      filters => [],
+      sort_cols => '', # no sort columns configured
+      sort_dir => '1',
+      innodb   => 'fk',
+      group_by => [],
+      aggregate => 0,
+   },
+   insert_buffers => { # Table definition captioned 'Insert Buffers'; apart from cxn, every src is an IB_ib_* field.
+      capt => 'Insert Buffers',
+      cust => {},
+      cols => {
+         cxn           => { src => 'cxn' },
+         inserts       => { src => 'IB_ib_inserts' },
+         merged_recs   => { src => 'IB_ib_merged_recs' },
+         merges        => { src => 'IB_ib_merges' },
+         size          => { src => 'IB_ib_size' },
+         free_list_len => { src => 'IB_ib_free_list_len' },
+         seg_size      => { src => 'IB_ib_seg_size' },
+      },
+      visible => [ qw(cxn inserts merged_recs merges size free_list_len seg_size)], # all seven columns
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'ib', # NOTE(review): looks like a code for the InnoDB status section this table reads -- confirm
+      group_by => [],
+      aggregate => 0,
+   },
+   innodb_locks  => { # Table definition captioned 'InnoDB Locks', with colour rules keyed on lock_wait_time.
+      capt => 'InnoDB Locks',
+      cust => {},
+      cols => {
+         cxn              => { src => 'cxn' },
+         db               => { src => 'db' },
+         index            => { src => 'index' },
+         insert_intention => { src => 'insert_intention' },
+         lock_mode        => { src => 'lock_mode' },
+         lock_type        => { src => 'lock_type' },
+         lock_wait_time   => { src => 'lock_wait_time', trans => [ qw(secs_to_time) ] },
+         mysql_thread_id  => { src => 'mysql_thread_id' },
+         n_bits           => { src => 'n_bits' },
+         page_no          => { src => 'page_no' },
+         space_id         => { src => 'space_id' },
+         special          => { src => 'special' },
+         tbl              => { src => 'table' },
+         'time'           => { src => 'active_secs', hdr => 'Active', trans => [ qw(secs_to_time) ] }, # hdr is set explicitly here rather than inherited from %columns
+         txn_id           => { src => 'txn_id' },
+         waiting          => { src => 'waiting' },
+      },
+      visible => [ qw(cxn mysql_thread_id lock_type waiting lock_wait_time time lock_mode db tbl index insert_intention special)], # n_bits, page_no, space_id and txn_id are defined but not listed
+      filters => [],
+      sort_cols => 'cxn -lock_wait_time', # NOTE(review): leading '-' presumably means descending order -- confirm
+      sort_dir => '1',
+      innodb   => 'tx', # same code as innodb_transactions
+      colors   => [ # thresholds are listed from largest to smallest; presumably the first matching rule wins -- confirm
+         { col => 'lock_wait_time', op => '>',  arg => 60, color => 'red' },
+         { col => 'lock_wait_time', op => '>',  arg => 30, color => 'yellow' },
+         { col => 'lock_wait_time', op => '>',  arg => 10, color => 'green' },
+      ],
+      group_by => [],
+      aggregate => 0,
+   },
+   innodb_transactions => { # Table definition captioned 'InnoDB Transactions' (caption hidden via hide_caption).
+      capt => 'InnoDB Transactions',
+      cust => {},
+      cols => {
+         cxn                => { src => 'cxn' },
+         active_secs        => { src => 'active_secs' },
+         has_read_view      => { src => 'has_read_view' },
+         heap_size          => { src => 'heap_size' },
+         hostname           => { src => $exprs{Host} }, # src comes from the shared %exprs table (defined outside this hash)
+         ip                 => { src => 'ip' },
+         wait_status        => { src => 'lock_wait_status' },
+         lock_wait_time     => { src => 'lock_wait_time',      trans => [ qw(secs_to_time) ] },
+         lock_structs       => { src => 'lock_structs' },
+         mysql_thread_id    => { src => 'mysql_thread_id' },
+         os_thread_id       => { src => 'os_thread_id' },
+         proc_no            => { src => 'proc_no' },
+         query_id           => { src => 'query_id' },
+         query_status       => { src => 'query_status' },
+         query_text         => { src => 'query_text',          trans => [ qw(no_ctrl_char) ] },
+         txn_time_remain    => { src => $exprs{TxnTimeRemain}, trans => [ qw(secs_to_time) ] },
+         row_locks          => { src => 'row_locks' },
+         tables_in_use      => { src => 'tables_in_use' },
+         tables_locked      => { src => 'tables_locked' },
+         thread_decl_inside => { src => 'thread_decl_inside' },
+         thread_status      => { src => 'thread_status' },
+         'time'             => { src => 'active_secs',         trans => [ qw(secs_to_time) ], agg => 'sum' }, # agg names must be keys of %agg_funcs
+         txn_doesnt_see_ge  => { src => 'txn_doesnt_see_ge' },
+         txn_id             => { src => 'txn_id' },
+         txn_sees_lt        => { src => 'txn_sees_lt' },
+         txn_status         => { src => 'txn_status',          minw => 10, maxw => 10 },
+         undo_log_entries   => { src => 'undo_log_entries' },
+         user               => { src => 'user',                maxw => 10 },
+         cnt                => { src => 'mysql_thread_id',     minw => 0 }, # the init loop after this hash treats a false value as unset, so this 0 is replaced from %columns/%col_props if minw is a %col_props key
+      },
+      visible => [ qw(cxn cnt mysql_thread_id user hostname txn_status time undo_log_entries query_text)], # subset of cols; presumably the default display -- confirm
+      filters => [ qw( hide_self hide_inactive ) ], # filter names; their definitions are outside this hash
+      sort_cols => '-active_secs txn_status cxn mysql_thread_id',
+      sort_dir => '1',
+      innodb   => 'tx', # same code as innodb_locks
+      hide_caption => 1,
+      colors   => [ # 'time' thresholds are listed from largest to smallest; presumably the first matching rule wins -- confirm
+         { col => 'wait_status', op => 'eq', arg => 'LOCK WAIT',   color => 'black on_red' },
+         { col => 'time',        op => '>',  arg => 600,           color => 'red' },
+         { col => 'time',        op => '>',  arg => 300,           color => 'yellow' },
+         { col => 'time',        op => '>',  arg => 60,            color => 'green' },
+         { col => 'time',        op => '>',  arg => 30,            color => 'cyan' },
+         { col => 'txn_status',  op => 'eq', arg => 'not started', color => 'white' },
+      ],
+      group_by => [ qw(cxn txn_status) ], # grouping columns are configured while aggregate is 0; presumably aggregation is switched on elsewhere -- confirm
+      aggregate => 0,
+   },
+   io_threads => { # Table definition captioned 'I/O Threads'.
+      capt => 'I/O Threads',
+      cust => {},
+      cols => {
+         cxn            => { src => 'cxn' },
+         thread         => { src => 'thread' },
+         thread_purpose => { src => 'purpose' }, # column name differs from its source field
+         event_set      => { src => 'event_set' },
+         thread_status  => { src => 'state' }, # column name differs from its source field
+      },
+      visible => [ qw(cxn thread thread_purpose thread_status)], # event_set is defined but not listed
+      filters => [ qw() ],
+      sort_cols => 'cxn thread',
+      sort_dir => '1',
+      innodb   => 'io', # same code as file_io_misc and pending_io
+      group_by => [],
+      aggregate => 0,
+   },
+   log_statistics => { # Table definition captioned 'Log Statistics'; apart from cxn, every src is an IB_lg_* field.
+      capt => 'Log Statistics',
+      cust => {},
+      cols => {
+         cxn                 => { src => 'cxn' },
+         last_chkp           => { src => 'IB_lg_last_chkp' },
+         log_flushed_to      => { src => 'IB_lg_log_flushed_to' },
+         log_ios_done        => { src => 'IB_lg_log_ios_done' },
+         log_ios_s           => { src => 'IB_lg_log_ios_s' },
+         log_seq_no          => { src => 'IB_lg_log_seq_no' },
+         pending_chkp_writes => { src => 'IB_lg_pending_chkp_writes' },
+         pending_log_writes  => { src => 'IB_lg_pending_log_writes' },
+      },
+      visible => [ qw(cxn log_seq_no log_flushed_to last_chkp log_ios_done log_ios_s)], # the two pending_* columns are defined but not listed
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'lg', # NOTE(review): looks like a code for the InnoDB status section this table reads -- confirm
+      group_by => [],
+      aggregate => 0,
+   },
+   master_status => { # Table definition captioned 'Master Status'; rows pass through the cxn_is_master filter.
+      capt => 'Master Status',
+      cust => {},
+      cols => {
+         cxn                         => { src => 'cxn' },
+         binlog_do_db                => { src => 'binlog_do_db' },
+         binlog_ignore_db            => { src => 'binlog_ignore_db' },
+         master_file                 => { src => 'file' },
+         master_pos                  => { src => 'position' },
+         binlog_cache_overflow       => { src => '(Binlog_cache_disk_use||0)/(Binlog_cache_use||1)', trans => [ qw(percent) ] }, # the ||1 keeps the divisor non-zero when Binlog_cache_use is 0 or unset
+      },
+      visible => [ qw(cxn master_file master_pos binlog_cache_overflow)], # binlog_do_db and binlog_ignore_db are defined but not listed
+      filters => [ qw(cxn_is_master) ], # filter name; its definition is outside this hash
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => '',
+      group_by => [],
+      aggregate => 0,
+   },
+   pending_io => { # Table definition captioned 'Pending I/O'; apart from cxn, every src is an IB_io_pending_* field.
+      capt => 'Pending I/O',
+      cust => {},
+      cols => {
+         cxn                => { src => 'cxn' },
+         p_normal_aio_reads => { src => 'IB_io_pending_normal_aio_reads' },
+         p_aio_writes       => { src => 'IB_io_pending_aio_writes' },
+         p_ibuf_aio_reads   => { src => 'IB_io_pending_ibuf_aio_reads' },
+         p_sync_ios         => { src => 'IB_io_pending_sync_ios' },
+         p_buf_pool_flushes => { src => 'IB_io_pending_buffer_pool_flushes' }, # column name abbreviates 'buffer' to 'buf'
+         p_log_flushes      => { src => 'IB_io_pending_log_flushes' },
+         p_log_ios          => { src => 'IB_io_pending_log_ios' },
+         p_preads           => { src => 'IB_io_pending_preads' },
+         p_pwrites          => { src => 'IB_io_pending_pwrites' },
+      },
+      visible => [ qw(cxn p_normal_aio_reads p_aio_writes p_ibuf_aio_reads p_sync_ios p_log_flushes p_log_ios)], # p_buf_pool_flushes, p_preads and p_pwrites are defined but not listed
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'io', # same code as file_io_misc and io_threads
+      group_by => [],
+      aggregate => 0,
+   },
+   open_tables => { # Table definition captioned 'Open Tables'; rows pass through the table_is_open filter.
+      capt => 'Open Tables',
+      cust => {},
+      cols => { # apart from cxn, every column renames its source field
+         cxn            => { src => 'cxn' },
+         db             => { src => 'database' },
+         tbl            => { src => 'table' },
+         num_times_open => { src => 'in_use' },
+         is_name_locked => { src => 'name_locked' },
+      },
+      visible => [ qw(cxn db tbl num_times_open is_name_locked)], # all five columns
+      filters => [ qw(table_is_open) ], # filter name; its definition is outside this hash
+      sort_cols => '-num_times_open cxn db tbl', # NOTE(review): leading '-' presumably means descending order -- confirm
+      sort_dir => '1',
+      innodb   => '',
+      group_by => [],
+      aggregate => 0,
+   },
+   page_statistics => { # Table definition captioned 'Page Statistics'; reads the same IB_bp_pages_* / IB_bp_page_*_sec fields that buffer_pool also maps.
+      capt => 'Page Statistics',
+      cust => {},
+      cols => {
+         cxn              => { src => 'cxn' },
+         pages_read       => { src => 'IB_bp_pages_read' },
+         pages_written    => { src => 'IB_bp_pages_written' },
+         pages_created    => { src => 'IB_bp_pages_created' },
+         page_reads_sec   => { src => 'IB_bp_page_reads_sec' },
+         page_writes_sec  => { src => 'IB_bp_page_writes_sec' },
+         page_creates_sec => { src => 'IB_bp_page_creates_sec' },
+      },
+      visible => [ qw(cxn pages_read pages_written pages_created page_reads_sec page_writes_sec page_creates_sec)], # all seven columns
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'bp', # same code as buffer_pool
+      group_by => [],
+      aggregate => 0,
+   },
+   processlist => { # Table definition captioned 'MySQL Process List' (caption hidden via hide_caption).
+      capt => 'MySQL Process List',
+      cust => {},
+      cols => { # every column sets minw/maxw explicitly; the init loop after this hash treats false values (such as 0) as unset and refills them from %columns/%col_props
+         cxn             => { src => 'cxn',        minw => 6,  maxw => 10 },
+         mysql_thread_id => { src => 'id',         minw => 6,  maxw => 0 },
+         user            => { src => 'user',       minw => 5,  maxw => 8 },
+         hostname        => { src => $exprs{Host}, minw => 13, maxw => 8, }, # NOTE(review): minw (13) exceeds maxw (8) -- confirm this is intended
+         port            => { src => $exprs{Port}, minw => 0,  maxw => 0, },
+         host_and_port   => { src => 'host',       minw => 0,  maxw => 0 }, # the raw 'host' field; hostname and port above use %exprs entries instead
+         db              => { src => 'db',         minw => 6,  maxw => 12 },
+         cmd             => { src => 'command',    minw => 5,  maxw => 0 },
+         time            => { src => 'time',       minw => 5,  maxw => 0, trans => [ qw(secs_to_time) ], agg => 'sum' }, # bare word 'time' is auto-quoted by => here
+         state           => { src => 'state',      minw => 0,  maxw => 0 },
+         info            => { src => 'info',       minw => 0,  maxw => 0, trans => [ qw(no_ctrl_char) ] },
+         cnt             => { src => 'id',         minw => 0,  maxw => 0 }, # reads the same 'id' field as mysql_thread_id
+      },
+      visible => [ qw(cxn cmd cnt mysql_thread_id user hostname db time info)], # port, host_and_port and state are defined but not listed
+      filters => [ qw(hide_self hide_inactive hide_slave_io) ], # filter names; their definitions are outside this hash
+      sort_cols => '-time cxn hostname mysql_thread_id',
+      sort_dir => '1',
+      innodb   => '',
+      hide_caption => 1,
+      colors   => [ # 'time' thresholds are listed from largest to smallest; presumably the first matching rule wins -- confirm
+         { col => 'state',       op => 'eq', arg => 'Locked',      color => 'black on_red' },
+         { col => 'cmd',         op => 'eq', arg => 'Sleep',       color => 'white' },
+         { col => 'user',        op => 'eq', arg => 'system user', color => 'white' },
+         { col => 'cmd',         op => 'eq', arg => 'Connect',     color => 'white' },
+         { col => 'cmd',         op => 'eq', arg => 'Binlog Dump', color => 'white' },
+         { col => 'time',        op => '>',  arg => 600,           color => 'red' },
+         { col => 'time',        op => '>',  arg => 120,           color => 'yellow' },
+         { col => 'time',        op => '>',  arg => 60,            color => 'green' },
+         { col => 'time',        op => '>',  arg => 30,            color => 'cyan' },
+      ],
+      group_by => [qw(cxn cmd)], # grouping columns are configured while aggregate is 0; presumably aggregation is switched on elsewhere -- confirm
+      aggregate => 0,
+   },
+
+   # TODO: some more columns:
+   # kb_used=hdr='BufUsed' minw='0' num='0' src='percent(1 - ((Key_blocks_unused * key_cache_block_size) / (key_buffer_size||1)))' dec='0' trans='' tbl='q_header' just='-' user='1' maxw='0' label='User-defined'
+   # retries=hdr='Retries' minw='0' num='0' src='Slave_retried_transactions' dec='0' trans='' tbl='slave_sql_status' just='-' user='1' maxw='0' label='User-defined'
+   # thd=hdr='Thd' minw='0' num='0' src='Threads_connected' dec='0' trans='' tbl='slave_sql_status' just='-' user='1' maxw='0' label='User-defined'
+
+   q_header => { # Table definition captioned 'Q-mode Header' (caption hidden via hide_caption).
+      capt => 'Q-mode Header',
+      cust => {},
+      cols => { # most src values are expressions or %exprs entries rather than single field names
+         cxn            => { src => 'cxn' },
+         questions      => { src => 'Questions' },
+         qps            => { src => 'Questions/Uptime_hires',               dec => 1, trans => [qw(shorten)] },
+         load           => { src => $exprs{ServerLoad},                     dec => 1, trans => [qw(shorten)] },
+         slow           => { src => 'Slow_queries',                         dec => 1, trans => [qw(shorten)] },
+         q_cache_hit    => { src => $exprs{QcacheHitRatio},                 dec => 1, trans => [qw(percent)] },
+         key_buffer_hit => { src => '1-(Key_reads/(Key_read_requests||1))', dec => 1, trans => [qw(percent)] }, # the ||1 keeps the divisor non-zero when Key_read_requests is 0 or unset
+         bps_in         => { src => 'Bytes_received/Uptime_hires',          dec => 1, trans => [qw(shorten)] },
+         bps_out        => { src => 'Bytes_sent/Uptime_hires',              dec => 1, trans => [qw(shorten)] },
+         when           => { src => 'when' },
+      },
+      visible => [ qw(cxn when load qps slow q_cache_hit key_buffer_hit bps_in bps_out)], # questions is defined but not listed
+      filters => [],
+      sort_cols => 'when cxn',
+      sort_dir => '1',
+      innodb   => '',
+      hide_caption => 1,
+      group_by => [],
+      aggregate => 0,
+   },
+   row_operations => { # Table definition captioned 'InnoDB Row Operations'; apart from cxn, every src is an IB_ro_* field.
+      capt => 'InnoDB Row Operations',
+      cust => {},
+      cols => { # four row counters followed by their four *_sec counterparts
+         cxn         => { src => 'cxn' },
+         num_inserts => { src => 'IB_ro_num_rows_ins' },
+         num_updates => { src => 'IB_ro_num_rows_upd' },
+         num_reads   => { src => 'IB_ro_num_rows_read' },
+         num_deletes => { src => 'IB_ro_num_rows_del' },
+         num_inserts_sec => { src => 'IB_ro_ins_sec' },
+         num_updates_sec => { src => 'IB_ro_upd_sec' },
+         num_reads_sec   => { src => 'IB_ro_read_sec' },
+         num_deletes_sec => { src => 'IB_ro_del_sec' },
+      },
+      visible => [ qw(cxn num_inserts num_updates num_reads num_deletes num_inserts_sec
+                       num_updates_sec num_reads_sec num_deletes_sec)],
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'ro', # same code as row_operation_misc
+      group_by => [],
+      aggregate => 0,
+   },
+   row_operation_misc => { # Table definition captioned 'Row Operation Misc'; apart from cxn, every src is an IB_ro_* field.
+      capt => 'Row Operation Misc',
+      cust => {},
+      cols => {
+         cxn                 => { src => 'cxn' },
+         queries_in_queue    => { src => 'IB_ro_queries_in_queue' },
+         queries_inside      => { src => 'IB_ro_queries_inside' },
+         read_views_open     => { src => 'IB_ro_read_views_open' },
+         main_thread_id      => { src => 'IB_ro_main_thread_id' },
+         main_thread_proc_no => { src => 'IB_ro_main_thread_proc_no' },
+         main_thread_state   => { src => 'IB_ro_main_thread_state' },
+         num_res_ext         => { src => 'IB_ro_n_reserved_extents' }, # column name abbreviates the source field name
+      },
+      visible => [ qw(cxn queries_in_queue queries_inside read_views_open main_thread_state)], # main_thread_id, main_thread_proc_no and num_res_ext are defined but not listed
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'ro', # same code as row_operations
+      group_by => [],
+      aggregate => 0,
+   },
+   semaphores => { # Table definition captioned 'InnoDB Semaphores'; apart from cxn, every src is an IB_sm_* field.
+      capt => 'InnoDB Semaphores',
+      cust => {},
+      cols => { # wait_array_size is defined here but is absent from the visible list below
+         cxn                => { src => 'cxn' },
+         mutex_os_waits     => { src => 'IB_sm_mutex_os_waits' },
+         mutex_spin_rounds  => { src => 'IB_sm_mutex_spin_rounds' },
+         mutex_spin_waits   => { src => 'IB_sm_mutex_spin_waits' },
+         reservation_count  => { src => 'IB_sm_reservation_count' },
+         rw_excl_os_waits   => { src => 'IB_sm_rw_excl_os_waits' },
+         rw_excl_spins      => { src => 'IB_sm_rw_excl_spins' },
+         rw_shared_os_waits => { src => 'IB_sm_rw_shared_os_waits' },
+         rw_shared_spins    => { src => 'IB_sm_rw_shared_spins' },
+         signal_count       => { src => 'IB_sm_signal_count' },
+         wait_array_size    => { src => 'IB_sm_wait_array_size' },
+      },
+      visible => [ qw(cxn mutex_os_waits mutex_spin_waits mutex_spin_rounds
+         rw_excl_os_waits rw_excl_spins rw_shared_os_waits rw_shared_spins
+         signal_count reservation_count )],
+      filters => [],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => 'sm', # same code as wait_array
+      group_by => [],
+      aggregate => 0,
+   },
+   slave_io_status => { # Table definition captioned 'Slave I/O Status'; rows pass through the cxn_is_slave filter.
+      capt => 'Slave I/O Status',
+      cust => {},
+      cols => { # several columns set hdr explicitly rather than inheriting it from %columns
+         cxn                         => { src => 'cxn' },
+         connect_retry               => { src => 'connect_retry' },
+         master_host                 => { src => 'master_host', hdr => 'Master'},
+         master_log_file             => { src => 'master_log_file', hdr => 'File' },
+         master_port                 => { src => 'master_port' },
+         master_ssl_allowed          => { src => 'master_ssl_allowed' },
+         master_ssl_ca_file          => { src => 'master_ssl_ca_file' },
+         master_ssl_ca_path          => { src => 'master_ssl_ca_path' },
+         master_ssl_cert             => { src => 'master_ssl_cert' },
+         master_ssl_cipher           => { src => 'master_ssl_cipher' },
+         master_ssl_key              => { src => 'master_ssl_key' },
+         master_user                 => { src => 'master_user' },
+         read_master_log_pos         => { src => 'read_master_log_pos', hdr => 'Pos' },
+         relay_log_size              => { src => 'relay_log_space', trans => [qw(shorten)] }, # column name differs from its source field
+         slave_io_running            => { src => 'slave_io_running', hdr => 'On?' },
+         slave_io_state              => { src => 'slave_io_state', hdr => 'State' },
+      },
+      visible => [ qw(cxn master_host slave_io_running master_log_file relay_log_size read_master_log_pos slave_io_state)], # subset of cols; presumably the default display -- confirm
+      filters => [ qw( cxn_is_slave ) ], # filter name; its definition is outside this hash
+      sort_cols => 'slave_io_running cxn',
+      colors   => [ # single rule: any slave_io_running value other than 'Yes' is coloured black on red
+         { col => 'slave_io_running',  op => 'ne', arg => 'Yes', color => 'black on_red' },
+      ],
+      sort_dir => '1',
+      innodb   => '',
+      group_by => [],
+      aggregate => 0,
+   },
+   slave_sql_status => { # Table definition captioned 'Slave SQL Status'; rows pass through the cxn_is_slave filter.
+      capt => 'Slave SQL Status',
+      cust => {},
+      cols => {
+         cxn                         => { src => 'cxn' },
+         exec_master_log_pos         => { src => 'exec_master_log_pos', hdr => 'Master Pos' },
+         last_errno                  => { src => 'last_errno' },
+         last_error                  => { src => 'last_error' },
+         master_host                 => { src => 'master_host', hdr => 'Master' },
+         relay_log_file              => { src => 'relay_log_file' },
+         relay_log_pos               => { src => 'relay_log_pos' },
+         relay_log_size              => { src => 'relay_log_space', trans => [qw(shorten)] },
+         relay_master_log_file       => { src => 'relay_master_log_file', hdr => 'Master File' },
+         replicate_do_db             => { src => 'replicate_do_db' },
+         replicate_do_table          => { src => 'replicate_do_table' },
+         replicate_ignore_db         => { src => 'replicate_ignore_db' },
+         replicate_ignore_table      => { src => 'replicate_ignore_table' },
+         replicate_wild_do_table     => { src => 'replicate_wild_do_table' },
+         replicate_wild_ignore_table => { src => 'replicate_wild_ignore_table' },
+         skip_counter                => { src => 'skip_counter' },
+         slave_sql_running           => { src => 'slave_sql_running', hdr => 'On?' },
+         until_condition             => { src => 'until_condition' },
+         until_log_file              => { src => 'until_log_file' },
+         until_log_pos               => { src => 'until_log_pos' },
+         time_behind_master          => { src => 'seconds_behind_master', trans => [ qw(secs_to_time) ] },
+         bytes_behind_master         => { src => 'master_log_file && master_log_file eq relay_master_log_file ? read_master_log_pos - exec_master_log_pos : 0', trans => [qw(shorten)] }, # position difference only when both file names match; otherwise 0
+         slave_catchup_rate          => { src => $exprs{SlaveCatchupRate}, trans => [ qw(set_precision) ] },
+         slave_open_temp_tables      => { src => 'Slave_open_temp_tables' }, # note the capitalised source name, unlike the lower-case fields above
+      },
+      visible => [ qw(cxn master_host slave_sql_running time_behind_master slave_catchup_rate slave_open_temp_tables relay_log_pos last_error)], # subset of cols; presumably the default display -- confirm
+      filters => [ qw( cxn_is_slave ) ], # filter name; its definition is outside this hash
+      sort_cols => 'slave_sql_running cxn',
+      sort_dir => '1',
+      innodb   => '',
+      colors   => [ # mixes string ('ne') and numeric ('>', '==') operators; presumably the first matching rule wins -- confirm
+         { col => 'slave_sql_running',  op => 'ne', arg => 'Yes', color => 'black on_red' },
+         { col => 'time_behind_master', op => '>',  arg => 600,   color => 'red' },
+         { col => 'time_behind_master', op => '>',  arg => 60,    color => 'yellow' },
+         { col => 'time_behind_master', op => '==', arg => 0,     color => 'white' },
+      ],
+      group_by => [],
+      aggregate => 0,
+   },
+   t_header => { # Table definition captioned 'T-Mode Header' (caption hidden via hide_caption).
+      capt => 'T-Mode Header',
+      cust => {},
+      cols => { # most src values come from the shared %exprs table (defined outside this hash)
+         cxn                         => { src => 'cxn' },
+         dirty_bufs                  => { src => $exprs{DirtyBufs},           trans => [qw(percent)] },
+         history_list_len            => { src => 'IB_tx_history_list_len' },
+         lock_structs                => { src => 'IB_tx_num_lock_structs' },
+         num_txns                    => { src => $exprs{NumTxns} },
+         max_txn                     => { src => $exprs{MaxTxnTime},          trans => [qw(secs_to_time)] },
+         undo_for                    => { src => 'IB_tx_purge_undo_for' },
+         used_bufs                   => { src => $exprs{BufPoolFill},         trans => [qw(percent)]},
+         versions                    => { src => $exprs{OldVersions} },
+      },
+      visible => [ qw(cxn history_list_len versions undo_for dirty_bufs used_bufs num_txns max_txn lock_structs)], # all nine columns
+      filters => [ ],
+      sort_cols => 'cxn',
+      sort_dir => '1',
+      innodb   => '', # empty even though three columns read IB_tx_* fields
+      colors   => [], # no colour rules
+      hide_caption => 1,
+      group_by => [],
+      aggregate => 0,
+   },
+   var_status => { # Table definition captioned 'Variables & Status'; cols and visible start empty and are filled in elsewhere.
+      capt      => 'Variables & Status',
+      cust      => {},
+      cols      => {}, # Generated from current varset
+      visible   => [], # Generated from current varset
+      filters   => [],
+      sort_cols => '',
+      sort_dir  => 1, # numeric 1 here; the other tables use the string '1'
+      innodb    => '',
+      temp      => 1, # Do not persist to config file.
+      hide_caption  => 1,
+      pivot     => 0, # the only table in this hash section that sets a pivot key
+      group_by => [],
+      aggregate => 0,
+   },
+   wait_array => { # Table definition captioned 'InnoDB Wait Array'.
+      capt => 'InnoDB Wait Array',
+      cust => {},
+      cols => { # apart from 'time', every column name equals its source field name
+         cxn                => { src => 'cxn' },
+         thread             => { src => 'thread' },
+         waited_at_filename => { src => 'waited_at_filename' },
+         waited_at_line     => { src => 'waited_at_line' },
+         'time'             => { src => 'waited_secs', trans => [ qw(secs_to_time) ] }, # reads waited_secs and passes it through secs_to_time
+         request_type       => { src => 'request_type' },
+         lock_mem_addr      => { src => 'lock_mem_addr' },
+         lock_cfile_name    => { src => 'lock_cfile_name' },
+         lock_cline         => { src => 'lock_cline' },
+         writer_thread      => { src => 'writer_thread' },
+         writer_lock_mode   => { src => 'writer_lock_mode' },
+         num_readers        => { src => 'num_readers' },
+         lock_var           => { src => 'lock_var' },
+         waiters_flag       => { src => 'waiters_flag' },
+         last_s_file_name   => { src => 'last_s_file_name' },
+         last_s_line        => { src => 'last_s_line' },
+         last_x_file_name   => { src => 'last_x_file_name' },
+         last_x_line        => { src => 'last_x_line' },
+         cell_waiting       => { src => 'cell_waiting' },
+         cell_event_set     => { src => 'cell_event_set' },
+      },
+      visible => [ qw(cxn thread time waited_at_filename waited_at_line request_type num_readers lock_var waiters_flag cell_waiting cell_event_set)], # subset of cols; presumably the default display -- confirm
+      filters => [],
+      sort_cols => 'cxn -time', # NOTE(review): leading '-' presumably means descending order -- confirm
+      sort_dir => '1',
+      innodb   => 'sm', # same code as semaphores
+      group_by => [],
+      aggregate => 0,
+   },
+);
+
+# Complete every table definition in %tbl_meta: fill in column properties from
+# %columns (falling back to %col_props), validate what was declared, and
+# compile the per-table sort and color subroutines.
+for my $tbl_name ( keys %tbl_meta ) {
+   my $tbl  = $tbl_meta{$tbl_name};
+   my $cols = $tbl->{cols};
+
+   for my $col_name ( keys %$cols ) {
+      my $def = $cols->{$col_name};
+      $columns{$col_name}
+         or die "I can't find a column named '$col_name' for '$tbl_name'";
+
+      # Mark the shared definition as used; the cleanup check after this loop
+      # reads the flag.
+      my $shared = $columns{$col_name};
+      $shared->{referenced} = 1;
+
+      # Any property that is missing (or false) on the table's column is taken
+      # from %columns if defined there, otherwise from the %col_props default.
+      for my $prop ( keys %col_props ) {
+         next if $def->{$prop};
+         $def->{$prop} = defined $shared->{$prop}
+                       ? $shared->{$prop}
+                       : $col_props{$prop};
+      }
+
+      # The aggregate function and every transformation must be known names.
+      exists $agg_funcs{$def->{agg}}
+         or die "Unknown aggregate function '$def->{agg}' "
+              . "for column '$col_name' in table '$tbl_name'";
+      foreach my $trans_name ( @{$def->{trans}} ) {
+         exists $trans_funcs{$trans_name}
+            or die "Unknown transformation '$trans_name' "
+                 . "for column '$col_name' in table '$tbl_name'";
+      }
+   }
+
+   # Every column named in visible and group_by has to be defined in cols.
+   foreach my $list_name ( qw(visible group_by) ) {
+      foreach my $col_name ( @{$tbl->{$list_name}} ) {
+         next if exists $cols->{$col_name};
+         die "Column '$col_name' is listed in '$list_name' for '$tbl_name', but doesn't exist";
+      }
+   }
+
+   # Compile sort and color subroutines
+   @{$tbl}{qw(sort_func color_func)} = ( scalar make_sort_func($tbl), scalar make_color_func($tbl) );
+}
+
+# Code-cleanup aid: abort if %columns defines a column that was never flagged
+# as referenced by the table-initialization loop.
+{
+   my @orphans;
+   foreach my $name ( sort keys %columns ) {
+      push @orphans, $name unless $columns{$name}->{referenced};
+   }
+   die "The following columns are not used: " . join(' ', @orphans)
+      if @orphans;
+}
+
+# ###########################################################################
+# Operating modes {{{3
+# ###########################################################################
+# One entry per operating mode, keyed on the single uppercase letter that
+# names the mode.  Every entry carries the same fields:
+#   hdr            - short title for the mode
+#   cust           - starts as an empty hash (presumably tracks user
+#                    customizations, as the 'cust' hashes in %tbl_meta do --
+#                    TODO confirm against the config code)
+#   note           - one-line description of the mode
+#   action_for     - mode-specific key bindings, each { action, label }; per
+#                    the comment on the global %action_for these can override
+#                    the global bindings
+#   display_sub    - reference to the display_X subroutine for the mode
+#   connections    - starts as an empty list
+#   server_group   - starts as an empty string
+#   one_connection - 0 or 1; only mode F sets it to 1
+#   tables         - names of the tables (keys of %tbl_meta) the mode uses
+#   visible_tables - initially the same list as 'tables'
+# Mode S additionally sets no_clear_screen.
+my %modes = (
+   B => {
+      hdr               => 'InnoDB Buffers',
+      cust              => {},
+      note              => 'Shows buffer info from InnoDB',
+      action_for        => {
+         i => {
+            action => sub { toggle_config('status_inc') },
+            label  => 'Toggle incremental status display',
+         },
+      },
+      display_sub       => \&display_B,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(buffer_pool page_statistics insert_buffers adaptive_hash_index)],
+      visible_tables    => [qw(buffer_pool page_statistics insert_buffers adaptive_hash_index)],
+   },
+   C => {
+      hdr               => 'Command Summary',
+      cust              => {},
+      note              => 'Shows relative magnitude of variables',
+      action_for        => {
+         s => {
+            action => sub { get_config_interactive('cmd_filter') },
+            label  => 'Choose variable prefix',
+         },
+      },
+      display_sub       => \&display_C,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(cmd_summary)],
+      visible_tables    => [qw(cmd_summary)],
+   },
+   D => {
+      hdr               => 'InnoDB Deadlocks',
+      cust              => {},
+      note              => 'View InnoDB deadlock information',
+      action_for        => {
+         c => {
+            action => sub { edit_table('deadlock_transactions') },
+            label  => 'Choose visible columns',
+         },
+         w => {
+            action => \&create_deadlock,
+            label  => 'Wipe deadlock status info by creating a deadlock',
+         },
+      },
+      display_sub       => \&display_D,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(deadlock_transactions deadlock_locks)],
+      visible_tables    => [qw(deadlock_transactions deadlock_locks)],
+   },
+   # F is the only mode with no key bindings of its own and with
+   # one_connection set to 1.
+   F => {
+      hdr               => 'InnoDB FK Err',
+      cust              => {},
+      note              => 'View the latest InnoDB foreign key error',
+      action_for        => {},
+      display_sub       => \&display_F,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 1,
+      tables            => [qw(fk_error)],
+      visible_tables    => [qw(fk_error)],
+   },
+   I => {
+      hdr               => 'InnoDB I/O Info',
+      cust              => {},
+      note              => 'Shows I/O info (i/o, log...) from InnoDB',
+      action_for        => {
+         i => {
+            action => sub { toggle_config('status_inc') },
+            label  => 'Toggle incremental status display',
+         },
+      },
+      display_sub       => \&display_I,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(io_threads pending_io file_io_misc log_statistics)],
+      visible_tables    => [qw(io_threads pending_io file_io_misc log_statistics)],
+   },
+   L => {
+      hdr             => 'Locks',
+      cust            => {},
+      note            => 'Shows transaction locks',
+      action_for      => {
+         # The lock monitor is switched on and off by creating and dropping
+         # the table test.innodb_lock_monitor on the servers.
+         a => {
+            action => sub { send_cmd_to_servers('CREATE TABLE IF NOT EXISTS test.innodb_lock_monitor(a int) ENGINE=InnoDB', 0, '', []); },
+            label  => 'Start the InnoDB Lock Monitor',
+         },
+         o => {
+            action => sub { send_cmd_to_servers('DROP TABLE IF EXISTS test.innodb_lock_monitor', 0, '', []); },
+            label  => 'Stop the InnoDB Lock Monitor',
+         },
+      },
+      display_sub     => \&display_L,
+      connections     => [],
+      server_group    => '',
+      one_connection  => 0,
+      tables            => [qw(innodb_locks)],
+      visible_tables    => [qw(innodb_locks)],
+   },
+   M => {
+      hdr               => 'Replication Status',
+      cust              => {},
+      note              => 'Shows replication (master and slave) status',
+      action_for        => {
+         a => {
+            action => sub { send_cmd_to_servers('START SLAVE', 0, 'START SLAVE SQL_THREAD UNTIL MASTER_LOG_FILE = ?, MASTER_LOG_POS = ?', []); },
+            label  => 'Start slave(s)',
+         },
+         i => {
+            action => sub { toggle_config('status_inc') },
+            label  => 'Toggle incremental status display',
+         },
+         o => {
+            action => sub { send_cmd_to_servers('STOP SLAVE', 0, '', []); },
+            label  => 'Stop slave(s)',
+         },
+         b => {
+            action => sub { purge_master_logs() },
+            label  => 'Purge unused master logs',
+         },
+      },
+      display_sub       => \&display_M,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(slave_sql_status slave_io_status master_status)],
+      visible_tables    => [qw(slave_sql_status slave_io_status master_status)],
+   },
+   O => {
+      hdr               => 'Open Tables',
+      cust              => {},
+      note              => 'Shows open tables in MySQL',
+      action_for        => {
+         r => {
+            action => sub { reverse_sort('open_tables'); },
+            label  => 'Reverse sort order',
+         },
+         s => {
+            action => sub { choose_sort_cols('open_tables'); },
+            label => "Choose sort column",
+         },
+      },
+      display_sub       => \&display_O,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(open_tables)],
+      visible_tables    => [qw(open_tables)],
+   },
+   # Q and T bind the same keys (a c e f h i k r s x) to the same kinds of
+   # action, against the processlist and innodb_transactions tables
+   # respectively.
+   Q => {
+      hdr        => 'Query List',
+      cust       => {},
+      note       => 'Shows queries from SHOW FULL PROCESSLIST',
+      action_for => {
+         a => {
+            action => sub { toggle_filter('processlist', 'hide_self') },
+            label  => 'Toggle the innotop process',
+         },
+         c => {
+            action => sub { edit_table('processlist') },
+            label  => 'Choose visible columns',
+         },
+         e => {
+            action => sub { analyze_query('e'); },
+            label  => "Explain a thread's query",
+         },
+         f => {
+            action => sub { analyze_query('f'); },
+            label  => "Show a thread's full query",
+         },
+         h => {
+            action => sub { toggle_visible_table('Q', 'q_header') },
+            label  => 'Toggle the header on and off',
+         },
+         i => {
+            action => sub { toggle_filter('processlist', 'hide_inactive') },
+            label  => 'Toggle idle processes',
+         },
+         k => {
+            action => sub { kill_query('CONNECTION') },
+            label => "Kill a query's connection",
+         },
+         r => {
+            action => sub { reverse_sort('processlist'); },
+            label  => 'Reverse sort order',
+         },
+         s => {
+            action => sub { choose_sort_cols('processlist'); },
+            label => "Change the display's sort column",
+         },
+         x => {
+            action => sub { kill_query('QUERY') },
+            label => "Kill a query",
+         },
+      },
+      display_sub       => \&display_Q,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(q_header processlist)],
+      visible_tables    => [qw(q_header processlist)],
+   },
+   R => {
+      hdr               => 'InnoDB Row Ops',
+      cust              => {},
+      note              => 'Shows InnoDB row operation and semaphore info',
+      action_for        => {
+         i => {
+            action => sub { toggle_config('status_inc') },
+            label  => 'Toggle incremental status display',
+         },
+      },
+      display_sub       => \&display_R,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(row_operations row_operation_misc semaphores wait_array)],
+      visible_tables    => [qw(row_operations row_operation_misc semaphores wait_array)],
+   },
+   S => {
+      hdr               => 'Variables & Status',
+      cust              => {},
+      note              => 'Shows query load statistics a la vmstat',
+      action_for        => {
+         '>' => {
+            action => sub { switch_var_set('S_set', 1) },
+            label  => 'Switch to next variable set',
+         },
+         '<' => {
+            action => sub { switch_var_set('S_set', -1) },
+            label  => 'Switch to prev variable set',
+         },
+         c => {
+            action => sub {
+               choose_var_set('S_set');
+               start_S_mode();
+            },
+            label => "Choose which set to display",
+         },
+         e => {
+            action => \&edit_current_var_set,
+            label  => 'Edit the current set of variables',
+         },
+         # Unlike the 'i' binding in the other modes, this one clears the
+         # screen before toggling status_inc.
+         i => {
+            action => sub { $clear_screen_sub->(); toggle_config('status_inc') },
+            label  => 'Toggle incremental status display',
+         },
+         '-' => {
+            action => sub { set_display_precision(-1) },
+            label  => 'Decrease fractional display precision',
+         },
+         '+' => {
+            action => sub { set_display_precision(1) },
+            label  => 'Increase fractional display precision',
+         },
+         g => {
+            action => sub { set_s_mode('g') },
+            label  => 'Switch to graph (tload) view',
+         },
+         s => {
+            action => sub { set_s_mode('s') },
+            label  => 'Switch to standard (vmstat) view',
+         },
+         v => {
+            action => sub { set_s_mode('v') },
+            label  => 'Switch to pivoted view',
+         },
+      },
+      display_sub       => \&display_S,
+      no_clear_screen   => 1,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(var_status)],
+      visible_tables    => [qw(var_status)],
+   },
+   T => {
+      hdr        => 'InnoDB Txns',
+      cust       => {},
+      note       => 'Shows InnoDB transactions in top-like format',
+      action_for => {
+         a => {
+            action => sub { toggle_filter('innodb_transactions', 'hide_self') },
+            label  => 'Toggle the innotop process',
+         },
+         c => {
+            action => sub { edit_table('innodb_transactions') },
+            label  => 'Choose visible columns',
+         },
+         e => {
+            action => sub { analyze_query('e'); },
+            label  => "Explain a thread's query",
+         },
+         f => {
+            action => sub { analyze_query('f'); },
+            label  => "Show a thread's full query",
+         },
+         h => {
+            action => sub { toggle_visible_table('T', 't_header') },
+            label  => 'Toggle the header on and off',
+         },
+         i => {
+            action => sub { toggle_filter('innodb_transactions', 'hide_inactive') },
+            label  => 'Toggle inactive transactions',
+         },
+         k => {
+            action => sub { kill_query('CONNECTION') },
+            label  => "Kill a transaction's connection",
+         },
+         r => {
+            action => sub { reverse_sort('innodb_transactions'); },
+            label  => 'Reverse sort order',
+         },
+         s => {
+            action => sub { choose_sort_cols('innodb_transactions'); },
+            label  => "Change the display's sort column",
+         },
+         x => {
+            action => sub { kill_query('QUERY') },
+            label  => "Kill a query",
+         },
+      },
+      display_sub       => \&display_T,
+      connections       => [],
+      server_group      => '',
+      one_connection    => 0,
+      tables            => [qw(t_header innodb_transactions)],
+      visible_tables    => [qw(t_header innodb_transactions)],
+   },
+);
+
+# ###########################################################################
+# Global key mappings {{{3
+# Keyed on a single character, which is read from the keyboard.  Uppercase
+# letters switch modes.  Lowercase letters access commands when in a mode.
+# These can be overridden by action_for in %modes.
+# ###########################################################################
+# Each binding is { action => code reference, label => help text }.  The Tab
+# binding also carries a printable name for the key in 'key'.
+my %action_for = (
+   '$'  => { action => \&edit_configuration,   label => 'Edit configuration settings' },
+   '?'  => { action => \&display_help,         label => 'Show help' },
+   '!'  => { action => \&display_license,      label => 'Show license and warranty' },
+   '^'  => { action => \&edit_table,           label => "Edit the displayed table(s)" },
+   '#'  => { action => \&choose_server_groups, label => 'Select/create server groups' },
+   '@'  => { action => \&choose_servers,       label => 'Select/create server connections' },
+   '/'  => { action => \&add_quick_filter,     label => 'Quickly filter what you see' },
+   '\\' => { action => \&clear_quick_filters,  label => 'Clear quick-filters' },
+   '%'  => { action => \&choose_filters,       label => 'Choose and edit table filters' },
+   "\t" => { action => \&next_server_group,    label => 'Switch to the next server group', key => 'TAB' },
+   '='  => { action => \&toggle_aggregate,     label => 'Toggle aggregation' },
+
+   # Mode letters.  All of them except S just call switch_mode() with their
+   # own letter, so those entries are generated; S has its own entry point.
+   # TODO: can these be auto-generated from %modes?
+   ( map {
+        my $mode = $_;
+        ( $mode => { action => sub { switch_mode($mode) }, label => '' } );
+     } qw(B C D F I L M O Q R T) ),
+   S => { action => \&start_S_mode, label => '' },
+
+   d => { action => sub { get_config_interactive('interval') }, label => 'Change refresh interval' },
+   n => { action => \&next_server, label => 'Switch to the next connection' },
+   p => { action => \&pause,       label => 'Pause innotop' },
+   q => { action => \&finish,      label => 'Quit innotop' },
+);
+
+# ###########################################################################
+# Sleep times after certain statements {{{3
+# ###########################################################################
+# Starts out empty; nothing is stored here at declaration time.
+my %stmt_sleep_time_for;
+
+# ###########################################################################
+# Config editor key mappings {{{3
+# ###########################################################################
+# Menu of the configuration editor: key => { func => handler, note => menu text }.
+my %cfg_editor_action = (
+   c => { func => \&edit_table,                   note => 'Edit columns, etc in the displayed table(s)' },
+   g => { func => \&edit_configuration_variables, note => 'Edit general configuration' },
+   k => { func => \&edit_color_rules,             note => 'Edit row-coloring rules' },
+   p => { func => \&edit_plugins,                 note => 'Manage plugins' },
+   s => { func => \&edit_server_groups,           note => 'Edit server groups' },
+   S => { func => \&edit_stmt_sleep_times,        note => 'Edit SQL statement sleep delays' },
+   t => { func => \&choose_mode_tables,           note => 'Choose which table(s) to display in this mode' },
+);
+
+# ###########################################################################
+# Color editor key mappings {{{3
+# ###########################################################################
+# Key bindings for the color-rule editor.  Every func takes ( $tbl, $idx ) --
+# the table name (a key of %tbl_meta) and the index of the highlighted rule in
+# that table's 'colors' list -- and returns the index to highlight afterwards.
+# All of them tolerate a table with no rules: they return 0 instead of dying
+# on a zero modulus or writing undef entries into the rule list.
+my %color_editor_action = (
+   n => {
+      note => 'Create a new color rule',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $meta = $tbl_meta{$tbl};
+
+         # Target column: keep asking until something is chosen, then take the
+         # first word of the answer and make sure it names a real column.
+         $clear_screen_sub->();
+         my $col;
+         do {
+            $col = prompt_list(
+               'Choose the target column for the rule',
+               '',
+               sub { return keys %{$meta->{cols}} },
+               { map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}} });
+         } while ( !$col );
+         ( $col ) = grep { $_ } split(/\W+/, $col);
+         return $idx unless $col && exists $meta->{cols}->{$col};
+
+         # Comparison operator, stripped of whitespace and checked against
+         # %comp_ops.
+         $clear_screen_sub->();
+         my $op;
+         do {
+            $op = prompt_list(
+               'Choose the comparison operator for the rule',
+               '',
+               sub { return keys %comp_ops },
+               { map { $_ => $comp_ops{$_} } keys %comp_ops } );
+         } until ( $op );
+         $op =~ s/\s+//g;
+         return $idx unless $op && exists $comp_ops{$op};
+
+         # The argument may be empty or zero, so only undef is rejected.
+         my $arg;
+         do {
+            $arg = prompt('Specify an argument for the comparison');
+         } until defined $arg;
+
+         # Colors: keep only the words that are keys of %ansicolors.
+         my $color;
+         do {
+            $color = prompt_list(
+               'Choose the color(s) the row should be when the rule matches',
+               '',
+               sub { return keys %ansicolors },
+               { map { $_ => $_ } keys %ansicolors } );
+         } until defined $color;
+         $color = join(' ', unique(grep { exists $ansicolors{$_} } split(/\W+/, $color)));
+         return $idx unless $color;
+
+         push @{$tbl_meta{$tbl}->{colors}}, {
+            col   => $col,
+            op    => $op,
+            arg   => $arg,
+            color => $color
+         };
+         $tbl_meta{$tbl}->{cust}->{colors} = 1;
+
+         return $idx;
+      },
+   },
+   d => {
+      note => 'Remove the selected rule',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $rules = $tbl_meta{$tbl}->{colors} || [];
+         return 0 unless @$rules > 0 && $idx < @$rules && $idx >= 0;
+         splice(@$rules, $idx, 1);
+         $tbl_meta{$tbl}->{cust}->{colors} = 1;
+         # The list is now one shorter.  If the last rule was removed, move
+         # the highlight back onto the new last rule (or 0 if none is left).
+         return ( $idx > $#$rules && $idx > 0 ) ? $idx - 1 : $idx;
+      },
+   },
+   j => {
+      note => 'Move highlight down one',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $num_rules = scalar @{ $tbl_meta{$tbl}->{colors} || [] };
+         return 0 unless $num_rules;   # no rules: a zero modulus would die
+         return ($idx + 1) % $num_rules;
+      },
+   },
+   k => {
+      note => 'Move highlight up one',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $num_rules = scalar @{ $tbl_meta{$tbl}->{colors} || [] };
+         return 0 unless $num_rules;   # no rules: a zero modulus would die
+         # With a positive right operand Perl's % is never negative, so
+         # index 0 wraps around to the last rule.
+         return ($idx - 1) % $num_rules;
+      },
+   },
+   '+' => {
+      note => 'Move selected rule up one',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $meta  = $tbl_meta{$tbl};
+         my $rules = $meta->{colors} || [];
+         # Swapping with an empty list or a stale index would create undef
+         # rules, so refuse.
+         return 0 unless @$rules && $idx >= 0 && $idx <= $#$rules;
+         my $dest = $idx == 0 ? $#$rules : $idx - 1;   # top wraps to bottom
+         @{$rules}[ $idx, $dest ] = @{$rules}[ $dest, $idx ];
+         $meta->{cust}->{colors} = 1;
+         return $dest;
+      },
+   },
+   '-' => {
+      note => 'Move selected rule down one',
+      func => sub {
+         my ( $tbl, $idx ) = @_;
+         my $meta  = $tbl_meta{$tbl};
+         my $rules = $meta->{colors} || [];
+         # Same guard as '+': never swap against a missing element.
+         return 0 unless @$rules && $idx >= 0 && $idx <= $#$rules;
+         my $dest = $idx == $#$rules ? 0 : $idx + 1;   # bottom wraps to top
+         @{$rules}[ $idx, $dest ] = @{$rules}[ $dest, $idx ];
+         $meta->{cust}->{colors} = 1;
+         return $dest;
+      },
+   },
+);
+
+# ###########################################################################
+# Plugin editor key mappings {{{3
+# ###########################################################################
+# Key bindings for the plugin editor.  Every func takes ( $plugins, $idx ) --
+# an array reference of plugin hashes and the index of the highlighted one --
+# and returns the index to highlight afterwards.  With an empty plugin list
+# the movement keys return 0 rather than dying on a zero modulus or returning
+# a negative index.
+my %plugin_editor_action = (
+   '*' => {
+      note => 'Toggle selected plugin active/inactive',
+      func => sub {
+         my ( $plugins, $idx ) = @_;
+         my $plugin = $plugins->[$idx];
+         return $idx unless $plugin;   # nothing at this index to toggle
+         $plugin->{active} = $plugin->{active} ? 0 : 1;
+         return $idx;
+      },
+   },
+   j => {
+      note => 'Move highlight down one',
+      func => sub {
+         my ( $plugins, $idx ) = @_;
+         my $count = scalar @{ $plugins || [] };
+         return 0 unless $count;
+         return ($idx + 1) % $count;
+      },
+   },
+   k => {
+      note => 'Move highlight up one',
+      func => sub {
+         my ( $plugins, $idx ) = @_;
+         my $count = scalar @{ $plugins || [] };
+         return 0 unless $count;
+         return $idx == 0 ? $count - 1 : $idx - 1;   # top wraps to bottom
+      },
+   },
+);
+
+# ###########################################################################
+# Table editor key mappings {{{3
+# ###########################################################################
+# Private helper for %tbl_editor_action: returns the position of $col in the
+# list @$cols, or -1 when it is not there (or $col is undefined).  The search
+# is bounded by the list, so a column name that is not visible can never make
+# an editor action loop forever.
+my $visible_col_index = sub {
+   my ( $cols, $col ) = @_;
+   return -1 unless defined $col;
+   foreach my $i ( 0 .. $#$cols ) {
+      return $i if $cols->[$i] eq $col;
+   }
+   return -1;
+};
+
+# Key bindings for the table editor.  Every func takes ( $tbl, $col ) -- the
+# table name (a key of %tbl_meta) and the name of the highlighted column --
+# and returns the name of the column to highlight afterwards.
+my %tbl_editor_action = (
+   a => {
+      note => 'Add a column to the table',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+         my %all_cols     = %{ $tbl_meta{$tbl}->{cols} };
+         # Offer only the columns that are not visible already.
+         delete @all_cols{@visible_cols};
+         my $choice = prompt_list(
+            'Choose a column',
+            '',
+            sub { return keys %all_cols; },
+            { map { $_ => $all_cols{$_}->{label} || $all_cols{$_}->{hdr} } keys %all_cols });
+         if ( defined $choice && $all_cols{$choice} ) {
+            push @{$tbl_meta{$tbl}->{visible}}, $choice;
+            $tbl_meta{$tbl}->{cust}->{visible} = 1;
+            return $choice;
+         }
+         return $col;
+      },
+   },
+   n => {
+      note => 'Create a new column and add it to the table',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+
+         $clear_screen_sub->();
+         print word_wrap("Choose a name for the column.  This name is not displayed, and is used only "
+               . "for internal reference.  It can contain only lowercase letters, numbers, "
+               . "and underscores.");
+         print "\n\n";
+         do {
+            $col = prompt("Enter column name");
+            $col = '' if !defined $col || $col =~ m/[^a-z0-9_]/;
+         } while ( !$col );
+
+         $clear_screen_sub->();
+         my $hdr;
+         do {
+            $hdr = prompt("Enter column header");
+         } while ( !$hdr );
+
+         $clear_screen_sub->();
+         print "Choose a source for the column's data\n\n";
+         my ( $src, $sub, $err );
+         # Like the name and header prompts, keep asking until a non-empty
+         # source has been entered and compiled without error; an empty answer
+         # would otherwise leave the column without a compiled func.
+         do {
+            if ( $err ) {
+               print "Error: $err\n\n";
+            }
+            $src = prompt("Enter column source");
+            if ( defined $src && length $src ) {
+               ( $sub, $err ) = compile_expr($src);
+            }
+         } until ( defined $src && length $src && !$err );
+
+         # TODO: this duplicates %col_props.
+         $tbl_meta{$tbl}->{cols}->{$col} = {
+            hdr   => $hdr,
+            src   => $src,
+            just  => '-',
+            num   => 0,
+            label => 'User-defined',
+            user  => 1,
+            tbl   => $tbl,
+            minw  => 0,
+            maxw  => 0,
+            trans => [],
+            func  => $sub,
+            dec   => 0,
+            agg   => 0,
+            aggonly => 0,
+         };
+
+         $tbl_meta{$tbl}->{visible} = [ unique(@{$tbl_meta{$tbl}->{visible}}, $col) ];
+         $tbl_meta{$tbl}->{cust}->{visible} = 1;
+         return $col;
+      },
+   },
+   d => {
+      note => 'Remove selected column',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+         # Never remove the last remaining visible column.
+         return $col unless @visible_cols > 1;
+         my $idx = $visible_col_index->(\@visible_cols, $col);
+         return $col if $idx < 0;   # not a visible column: nothing to remove
+         $tbl_meta{$tbl}->{visible} = [ grep { $_ ne $col } @visible_cols ];
+         $tbl_meta{$tbl}->{cust}->{visible} = 1;
+         # Highlight the following column, or the preceding one when the last
+         # column was removed.
+         return $idx == $#visible_cols ? $visible_cols[$idx - 1] : $visible_cols[$idx + 1];
+      },
+   },
+   e => {
+      note => 'Edit selected column',
+      func => sub {
+         # TODO: make this editor hotkey-driven and give readline support.
+         my ( $tbl, $col ) = @_;
+         $clear_screen_sub->();
+         my $meta = $tbl_meta{$tbl}->{cols}->{$col};
+         my @prop = qw(hdr label src just num minw maxw trans agg); # TODO redundant
+
+         my $answer;
+         do {
+            # Do what the user asked...
+            if ( $answer && grep { $_ eq $answer } @prop ) {
+               # Some properties are arrays, others scalars.
+               my $ini = ref $col_props{$answer} ? join(' ', @{$meta->{$answer}}) : $meta->{$answer};
+               my $val = prompt("New value for $answer", undef, $ini);
+               $val = [ split(' ', $val) ] if ref($col_props{$answer});
+               if ( $answer eq 'trans' ) {
+                  # Silently drop transformation names that are not in %trans_funcs.
+                  $val = [ unique(grep{ exists $trans_funcs{$_} } @$val) ];
+               }
+               # Store the new value and mark the column as user-modified.
+               @{$meta}{$answer, 'user', 'tbl' } = ( $val, 1, $tbl );
+            }
+
+            my @display_lines = (
+               '',
+               "You are editing column $tbl.$col.\n",
+            );
+
+            # Show every editable property: lists joined with spaces, other
+            # references (compiled code) as a placeholder, scalars as-is.
+            push @display_lines, create_table2(
+               \@prop,
+               { map { $_ => $_ } @prop },
+               { map { $_ => ref $meta->{$_} eq 'ARRAY' ? join(' ', @{$meta->{$_}})
+                           : ref $meta->{$_}            ? '[expression code]'
+                           :                              $meta->{$_}
+                     } @prop
+               },
+               { sep => '  ' });
+            draw_screen(\@display_lines, { raw => 1 });
+            print "\n\n"; # One to add space, one to clear readline artifacts
+            $answer = prompt('Edit what? (q to quit)');
+         } while ( $answer ne 'q' );
+
+         return $col;
+      },
+   },
+   j => {
+      note => 'Move highlight down one',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+         return $col unless @visible_cols;
+         my $idx = $visible_col_index->(\@visible_cols, $col);
+         # An unknown column puts the highlight on the first visible column.
+         return $visible_cols[0] if $idx < 0;
+         return $visible_cols[ ($idx + 1) % @visible_cols ];
+      },
+   },
+   k => {
+      note => 'Move highlight up one',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my @visible_cols = @{ $tbl_meta{$tbl}->{visible} };
+         return $col unless @visible_cols;
+         my $idx = $visible_col_index->(\@visible_cols, $col);
+         # An unknown column puts the highlight on the first visible column.
+         return $visible_cols[0] if $idx < 0;
+         # Index -1 is the last element, so the first column wraps to the last.
+         return $visible_cols[ $idx - 1 ];
+      },
+   },
+   '+' => {
+      note => 'Move selected column up one',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my $meta         = $tbl_meta{$tbl};
+         my @visible_cols = @{$meta->{visible}};
+         my $idx          = $visible_col_index->(\@visible_cols, $col);
+         return $col if $idx < 0;   # not a visible column: nothing to move
+         if ( $idx ) {
+            $visible_cols[$idx]     = $visible_cols[$idx - 1];
+            $visible_cols[$idx - 1] = $col;
+            $meta->{visible}        = \@visible_cols;
+         }
+         else {
+            # Already first: rotate it round to the end of the list.
+            shift @{$meta->{visible}};
+            push @{$meta->{visible}}, $col;
+         }
+         $meta->{cust}->{visible} = 1;
+         return $col;
+      },
+   },
+   '-' => {
+      note => 'Move selected column down one',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         my $meta         = $tbl_meta{$tbl};
+         my @visible_cols = @{$meta->{visible}};
+         my $idx          = $visible_col_index->(\@visible_cols, $col);
+         return $col if $idx < 0;   # not a visible column: nothing to move
+         if ( $idx == $#visible_cols ) {
+            # Already last: rotate it round to the front of the list.
+            unshift @{$meta->{visible}}, $col;
+            pop @{$meta->{visible}};
+         }
+         else {
+            $visible_cols[$idx]     = $visible_cols[$idx + 1];
+            $visible_cols[$idx + 1] = $col;
+            $meta->{visible}        = \@visible_cols;
+         }
+         $meta->{cust}->{visible} = 1;
+         return $col;
+      },
+   },
+   f => {
+      note => 'Choose filters',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         choose_filters($tbl);
+         return $col;
+      },
+   },
+   o => {
+      note => 'Edit color rules',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         edit_color_rules($tbl);
+         return $col;
+      },
+   },
+   s => {
+      note => 'Choose sort columns',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         choose_sort_cols($tbl);
+         return $col;
+      },
+   },
+   g => {
+      note => 'Choose group-by (aggregate) columns',
+      func => sub {
+         my ( $tbl, $col ) = @_;
+         choose_group_cols($tbl);
+         return $col;
+      },
+   },
+);
+
+# ###########################################################################
+# Global variables and environment {{{2
+# ###########################################################################
+
+# Terminal geometry for the current and the previous pass of the main loop.
+my @this_term_size; # w_chars, h_chars, w_pix, h_pix
+my @last_term_size; # w_chars, h_chars, w_pix, h_pix
+# Last key read from the keyboard.  It is a file-level variable because it may
+# be read and written from subroutines.
+my $char;
+# True when the OS name contains "MSWin".
+my $windows       = $OSNAME =~ m/MSWin/;
+# Set to 1 further down once Term::ANSIColor has been loaded successfully.
+my $have_color    = 0;
+my $MAX_ULONG     = 4294967295; # 2^32-1
+# Optionally signed decimal number with optional fraction and exponent.
+my $num_regex     = qr/^[+-]?(?=\d|\.)\d*(?:\.\d+)?(?:E[+-]?\d+|)$/i;
+# One or more digits, no sign.
+my $int_regex     = qr/^\d+$/;
+# Exactly one character: 0 or 1.
+my $bool_regex    = qr/^[01]$/;
+# Term::ReadLine object; only created in interactive mode.
+my $term          = undef;
+my $file          = undef; # File to watch for InnoDB monitor output
+my $file_mtime    = undef; # Status of watched file
+my $file_data     = undef; # Last chunk of text read from file
+# NOTE(review): presumably parses the text of the InnoDB monitor output --
+# confirm against the InnoDBParser package.
+my $innodb_parser = InnoDBParser->new;
+
+# Error-message fragments joined with '|' into one alternation string.
+# NOTE(review): presumably matched against errors that should not abort the
+# program -- confirm at the point of use.
+my $nonfatal_errs = do {
+   my @fragments = (
+      "Access denied for user",
+      "Unknown MySQL server host",
+      "Unknown database",
+      "Can't connect to local MySQL server through socket",
+      "Can't connect to MySQL server on",
+      "MySQL server has gone away",
+      "Cannot call SHOW INNODB STATUS",
+      "Access denied",
+      "AutoCommit",
+   );
+   join '|', @fragments;
+};
+
+# Interactive mode only: load Term::ReadLine on demand and create the
+# line-editing object used by the prompting functions.
+unless ( $opts{n} ) {
+   require Term::ReadLine;
+   $term = Term::ReadLine->new('innotop');
+}
+
+# Stores status, variables, innodb status, master/slave status etc.
+# Keyed on connection name.  Each entry is a hashref of current and past data sets,
+# keyed on clock tick.
+my %vars;
+my %info_gotten = (); # Which things have been retrieved for the current clock tick.
+
+# Stores info on currently displayed queries: cxn, connection ID, query text.
+my @current_queries;
+
+# Reset to 0 by the screen-clearing subroutines.
+my $lines_printed       = 0;
+my $clock               = 0;   # Incremented with every wake-sleep cycle
+# NOTE(review): looks like a flag raised while deadlock information is being
+# cleared -- confirm where it is set.
+my $clearing_deadlocks  = 0;
+
+# Find the home directory; it's different on different OSes.
+# The first non-empty environment variable wins; the fallback is the current
+# directory.
+my $homepath = $ENV{HOME} || $ENV{HOMEPATH} || $ENV{USERPROFILE} || '.';
+
+# Terminal colouring is optional.  In interactive mode try to load
+# Term::ANSIColor (preceded by Win32::Console::ANSI on Windows) and import
+# its colored() function, the only one this program needs.
+eval {
+   unless ( $opts{n} ) {
+      require Win32::Console::ANSI if $windows;
+      require Term::ANSIColor;
+      Term::ANSIColor->import(qw(colored));
+      $have_color = 1;
+   }
+};
+if ( $opts{n} || $EVAL_ERROR ) {
+   # No colour support (or non-interactive mode): install a stand-in colored()
+   # that drops its last argument and hands back the rest unchanged.
+   *colored = sub { pop @_; return @_; };
+}
+
+# Choose the screen-clearing routine once, up front:
+#   * non-interactive mode: do nothing;
+#   * Windows: run "cls";
+#   * otherwise: capture the output of `clear` once and re-print it each time.
+# The two real implementations also reset the printed-line counter.
+$clear_screen_sub
+   = $opts{n} ? sub {}
+   : $windows ? sub { $lines_printed = 0; system("cls") }
+   :            do {
+                   my $clear = `clear`;
+                   sub { $lines_printed = 0; print $clear };
+                };
+
+# ###########################################################################
+# Config storage. {{{2
+# ###########################################################################
+# One entry per configuration variable.  Fields used in this table:
+#   val     - the default value
+#   note    - human-readable description
+#   pat     - regex describing acceptable values (presumably used to validate
+#             user input -- confirm at the point of use)
+#   conf    - 'ALL' or a list of mode letters (presumably the modes in which
+#             the variable can be configured -- confirm)
+#   cmdline - NOTE(review): only set on 'mode'; meaning not visible here
+my %config = (
+   color => {
+      val  => $have_color,
+      note => 'Whether to use terminal coloring',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   cmd_filter => {
+      val  => 'Com_',
+      note => 'Prefix for values in C mode',
+      conf => [qw(C)],
+   },
+   plugin_dir => {
+      val  => "$homepath/.innotop/plugins",
+      note => 'Directory where plugins can be found',
+      conf => 'ALL',
+   },
+   show_percent => {
+      val  => 1,
+      note => 'Show the % symbol after percentages',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   skip_innodb => {
+      val  => 0,
+      note => 'Disable SHOW INNODB STATUS',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   S_func => {
+      val  => 's',
+      note => 'What to display in S mode: graph, status, pivoted status',
+      conf => [qw(S)],
+      pat  => qr/^[gsv]$/,
+   },
+   cxn_timeout => {
+      val  => 28800,
+      note => 'Connection timeout for keeping unused connections alive',
+      conf => 'ALL',
+      pat  => $int_regex,
+   },
+   graph_char => {
+      val  => '*',
+      note => 'Character for drawing graphs',
+      conf => [ qw(S) ],
+      pat  => qr/^.$/,
+   },
+   show_cxn_errors_in_tbl => {
+      val  => 1,
+      note => 'Whether to display connection errors as rows in the table',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   hide_hdr => {
+      val  => 0,
+      # The variable is named hide_hdr, so a true value hides the headers; the
+      # description has to say "hide", not "show".
+      note => 'Whether to hide column headers',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   show_cxn_errors => {
+      val  => 1,
+      note => 'Whether to print connection errors to STDOUT',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   readonly => {
+      val  => 0,
+      note => 'Whether the config file is read-only',
+      conf => [ qw() ],
+      pat  => $bool_regex,
+   },
+   global => {
+      val  => 1,
+      note => 'Whether to show GLOBAL variables and status',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   header_highlight => {
+      val  => 'bold',
+      note => 'How to highlight table column headers',
+      conf => 'ALL',
+      pat  => qr/^(?:bold|underline)$/,
+   },
+   display_table_captions => {
+      val  => 1,
+      note => 'Whether to put captions on tables',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   charset => {
+      val  => 'ascii',
+      note => 'What type of characters should be displayed in queries (ascii, unicode, none)',
+      conf => 'ALL',
+      pat  => qr/^(?:ascii|unicode|none)$/,
+   },
+   auto_wipe_dl => {
+      val  => 0,
+      note => 'Whether to auto-wipe InnoDB deadlocks',
+      conf => 'ALL',
+      pat  => $bool_regex,
+   },
+   max_height => {
+      val  => 30,
+      note => '[Win32] Max window height',
+      conf => 'ALL',
+   },
+   debug => {
+      val  => 0,
+      pat  => $bool_regex,
+      note => 'Debug mode (more verbose errors, uses more memory)',
+      conf => 'ALL',
+   },
+   num_digits => {
+      val  => 2,
+      pat  => $int_regex,
+      note => 'How many digits to show in fractional numbers and percents',
+      conf => 'ALL',
+   },
+   debugfile => {
+      val  => "$homepath/.innotop/core_dump",
+      note => 'A debug file in case you are interested in error output',
+   },
+   show_statusbar => {
+      val  => 1,
+      pat  => $bool_regex,
+      note => 'Whether to show the status bar in the display',
+      conf => 'ALL',
+   },
+   mode => {
+      val  => "T",
+      note => "Which mode to start in",
+      cmdline => 1,
+   },
+   status_inc => {
+      val  => 0,
+      note => 'Whether to show raw or incremental values for status variables',
+      pat  => $bool_regex,
+   },
+   interval => {
+      val  => 10,
+      # Accepts any positive number, integer or fractional, that is not zero.
+      pat  => qr/^(?:(?:\d*?[1-9]\d*(?:\.\d*)?)|(?:\d*\.\d*?[1-9]\d*))$/,
+      note => "The interval at which the display will be refreshed.  Fractional values allowed.",
+   },
+   num_status_sets => {
+      val  => 9,
+      pat  => $int_regex,
+      note => 'How many sets of STATUS and VARIABLES values to show',
+      conf => [ qw(S) ],
+   },
+   S_set => {
+      val  => 'general',
+      pat  => qr/^\w+$/,
+      note => 'Which set of variables to display in S (Variables & Status) mode',
+      conf => [ qw(S) ],
+   },
+);
+
+# ###########################################################################
+# Config file sections {{{2
+# The configuration file is broken up into sections like a .ini file.  This
+# variable defines those sections and the subroutines responsible for reading
+# and writing them.
+# ###########################################################################
+# Maps each config-file section name to the pair of subroutines that load it
+# (reader) and save it (writer).  Most handlers are named
+# load_config_<section> / save_config_<section>; the exceptions are noted.
+my %config_file_sections = (
+   plugins => {
+      reader => \&load_config_plugins,
+      writer => \&save_config_plugins,
+   },
+   group_by => {
+      reader => \&load_config_group_by,
+      writer => \&save_config_group_by,
+   },
+   filters => {
+      reader => \&load_config_filters,
+      writer => \&save_config_filters,
+   },
+   active_filters => {
+      reader => \&load_config_active_filters,
+      writer => \&save_config_active_filters,
+   },
+   visible_tables => {
+      reader => \&load_config_visible_tables,
+      writer => \&save_config_visible_tables,
+   },
+   sort_cols => {
+      reader => \&load_config_sort_cols,
+      writer => \&save_config_sort_cols,
+   },
+   active_columns => {
+      reader => \&load_config_active_columns,
+      writer => \&save_config_active_columns,
+   },
+   tbl_meta => {
+      reader => \&load_config_tbl_meta,
+      writer => \&save_config_tbl_meta,
+   },
+   # Exception: the 'general' section is handled by the *_config_config subs.
+   general => {
+      reader => \&load_config_config,
+      writer => \&save_config_config,
+   },
+   connections => {
+      reader => \&load_config_connections,
+      writer => \&save_config_connections,
+   },
+   active_connections => {
+      reader => \&load_config_active_connections,
+      writer => \&save_config_active_connections,
+   },
+   server_groups => {
+      reader => \&load_config_server_groups,
+      writer => \&save_config_server_groups,
+   },
+   active_server_groups => {
+      reader => \&load_config_active_server_groups,
+      writer => \&save_config_active_server_groups,
+   },
+   # Exception: handler names abbreviate max_values_seen to "mvs".
+   max_values_seen => {
+      reader => \&load_config_mvs,
+      writer => \&save_config_mvs,
+   },
+   varsets => {
+      reader => \&load_config_varsets,
+      writer => \&save_config_varsets,
+   },
+   colors => {
+      reader => \&load_config_colors,
+      writer => \&save_config_colors,
+   },
+   stmt_sleep_times => {
+      reader => \&load_config_stmt_sleep_times,
+      writer => \&save_config_stmt_sleep_times,
+   },
+);
+
+# Sections depend on one another, so reading and writing the config file has
+# to follow this fixed order.
+my @ordered_config_file_sections = (
+   'general',
+   'plugins',
+   'filters',
+   'active_filters',
+   'tbl_meta',
+   'connections',
+   'active_connections',
+   'server_groups',
+   'active_server_groups',
+   'max_values_seen',
+   'active_columns',
+   'sort_cols',
+   'visible_tables',
+   'varsets',
+   'colors',
+   'stmt_sleep_times',
+   'group_by',
+);
+
+# Every event a plugin may register for, each starting out with its own empty
+# arrayref of listeners.
+my %event_listener_for;
+foreach my $event (
+   'extract_values',
+   'set_to_tbl_pre_filter',
+   'set_to_tbl_pre_sort',
+   'set_to_tbl_pre_group',
+   'set_to_tbl_pre_colorize',
+   'set_to_tbl_pre_transform',
+   'set_to_tbl_pre_pivot',
+   'set_to_tbl_pre_create',
+   'set_to_tbl_post_create',
+   'draw_screen',
+) {
+   $event_listener_for{$event} = [];
+}
+
+# All variables to which plugins have access.
+# Each value is a reference to the file-level hash of (almost) the same name,
+# so anything holding this table works on the live data, not on copies.
+my %pluggable_vars = (
+   action_for    => \%action_for,
+   agg_funcs     => \%agg_funcs,
+   config        => \%config,
+   connections   => \%connections,
+   dbhs          => \%dbhs,
+   filters       => \%filters,
+   modes         => \%modes,
+   server_groups => \%server_groups,
+   tbl_meta      => \%tbl_meta,
+   trans_funcs   => \%trans_funcs,
+   var_sets      => \%var_sets,
+);
+
+# ###########################################################################
+# Contains logic to generate prepared statements for a given function for a
+# given DB connection.  Returns a $sth.
+# ###########################################################################
+# Statement factories, keyed by logical statement name.  Each one receives a
+# database handle, picks the SQL text appropriate for that server's version
+# (and, for variables/status, the 'global' config setting) and returns the
+# prepared statement handle.
+my %stmt_maker_for = (
+   INNODB_STATUS => sub {
+      my $dbh = shift;
+      my $sql = version_ge( $dbh, '5.0.0' )
+              ? 'SHOW ENGINE INNODB STATUS'
+              : 'SHOW INNODB STATUS';
+      return $dbh->prepare($sql);
+   },
+   SHOW_VARIABLES => sub {
+      my $dbh = shift;
+      my $sql = $config{global}->{val} && version_ge( $dbh, '4.0.3' )
+              ? 'SHOW GLOBAL VARIABLES'
+              : 'SHOW VARIABLES';
+      return $dbh->prepare($sql);
+   },
+   SHOW_STATUS => sub {
+      my $dbh = shift;
+      my $sql = $config{global}->{val} && version_ge( $dbh, '5.0.2' )
+              ? 'SHOW GLOBAL STATUS'
+              : 'SHOW STATUS';
+      return $dbh->prepare($sql);
+   },
+   # Before 5.0.0 there is only plain KILL, which is used for both kill
+   # variants.
+   KILL_QUERY => sub {
+      my $dbh = shift;
+      my $sql = version_ge( $dbh, '5.0.0' )
+              ? 'KILL QUERY ?'
+              : 'KILL ?';
+      return $dbh->prepare($sql);
+   },
+   SHOW_MASTER_LOGS => sub {
+      my $dbh = shift;
+      return $dbh->prepare('SHOW MASTER LOGS');
+   },
+   SHOW_MASTER_STATUS => sub {
+      my $dbh = shift;
+      return $dbh->prepare('SHOW MASTER STATUS');
+   },
+   SHOW_SLAVE_STATUS => sub {
+      my $dbh = shift;
+      return $dbh->prepare('SHOW SLAVE STATUS');
+   },
+   KILL_CONNECTION => sub {
+      my $dbh = shift;
+      my $sql = version_ge( $dbh, '5.0.0' )
+              ? 'KILL CONNECTION ?'
+              : 'KILL ?';
+      return $dbh->prepare($sql);
+   },
+   # Yields undef instead of a statement handle on servers older than 4.0.0.
+   OPEN_TABLES => sub {
+      my $dbh = shift;
+      if ( version_ge($dbh, '4.0.0') ) {
+         return $dbh->prepare('SHOW OPEN TABLES');
+      }
+      return undef;
+   },
+   PROCESSLIST => sub {
+      my $dbh = shift;
+      return $dbh->prepare('SHOW FULL PROCESSLIST');
+   },
+);
+
+# Registry of plugins; starts out empty.
+my %plugins = ();
+
+# ###########################################################################
+# Run the program {{{1
+# ###########################################################################
+
+# max_height exists only for MS Windows, whose terminal can't tell how tall it
+# is; drop the variable everywhere else.
+delete $config{max_height} unless $windows;
+
+# Best effort: be nice to the rest of the system.  Failures are ignored.
+eval { setpriority(0, 0, getpriority(0, 0) + 10); };
+
+# Unbuffered output: show text immediately rather than at the next newline.
+$OUTPUT_AUTOFLUSH = 1;
+
+# Start from a clean screen, then read the configuration.
+$clear_screen_sub->();
+load_config();
+post_process_tbl_meta();
+
+# Non-interactive runs must never write changes back to the config file.
+$config{readonly}->{val} = 1 if $opts{n};
+
+# Main program loop.  Each pass: validate the mode, measure the terminal,
+# make sure connections exist, prune old data, draw the current mode, then
+# wait for the refresh interval or a key press and act on the key.  Any
+# exception ends the loop and is handed to core_dump() before finish().
+eval {
+
+   # Open the file for InnoDB status
+   if ( @ARGV ) {
+      my $filename = shift @ARGV;
+      open $file, "<", $filename
+         or die "Cannot open '$filename': $OS_ERROR";
+   }
+
+   # In certain modes we might have to collect data for two cycles
+   # before printing anything out, so we need to bump up the count one.
+   if ( $opts{n} && $opts{count} && $config{status_inc}->{val}
+      && $config{mode}->{val} =~ m/[S]/ )
+   {
+      $opts{count}++;
+   }
+
+   while (++$clock) {
+
+      my $mode = $config{mode}->{val} || 'T';
+      if ( !$modes{$mode} ) {
+         die "Mode '$mode' doesn't exist; try one of these:\n"
+            . join("\n", map { "  $_ $modes{$_}->{hdr}" }  sort keys %modes)
+            . "\n";
+      }
+
+      if ( !$opts{n} ) {
+         @last_term_size = @this_term_size;
+         @this_term_size = Term::ReadKey::GetTerminalSize(\*STDOUT);
+         # Check for failure before touching the array: the Windows
+         # adjustments below assign to its elements, which would make an
+         # empty result look non-empty and hide the failure.
+         die("Can't read terminal size") unless @this_term_size;
+         if ( $windows ) {
+            $this_term_size[0]--;
+            $this_term_size[1]
+               = min($this_term_size[1], $config{max_height}->{val});
+         }
+      }
+
+      # If there's no connection to a database server, we need to fix that...
+      if ( !%connections ) {
+         print "You have not defined any database connections.\n\n";
+         add_new_dsn();
+      }
+
+      # See whether there are any connections defined for this mode.  If there's only one
+      # connection total, assume the user wants to just use innotop for a single server
+      # and don't ask which server to connect to.  Also, if we're monitoring from a file,
+      # we just use the first connection.
+      if ( !get_connections() ) {
+         if ( $file || 1 == scalar keys %connections ) {
+            $modes{$config{mode}->{val}}->{connections} = [ keys %connections ];
+         }
+         else {
+            choose_connections();
+         }
+      }
+
+      # Term::ReadLine might have re-set $OUTPUT_AUTOFLUSH.
+      $OUTPUT_AUTOFLUSH = 1;
+
+      # Prune old data: drop every data set older than num_status_sets ticks.
+      my $sets = $config{num_status_sets}->{val};
+      foreach my $store ( values %vars ) {
+         delete @{$store}{ grep { $_ < $clock - $sets } keys %$store };
+      }
+      %info_gotten = ();
+
+      # Call the subroutine to display this mode.
+      $modes{$mode}->{display_sub}->();
+
+      # It may be time to quit now.
+      if ( $opts{count} && $clock >= $opts{count} ) {
+         finish();
+      }
+
+      # Wait for a bit.  Interactive mode waits for a key press instead, with
+      # the refresh interval as the timeout.
+      if ( $opts{n} ) {
+         sleep($config{interval}->{val});
+      }
+      else {
+         ReadMode('cbreak');
+         $char = ReadKey($config{interval}->{val});
+         ReadMode('normal');
+      }
+
+      # Handle whatever action the key indicates.
+      do_key_action();
+
+   }
+};
+if ( $EVAL_ERROR ) {
+   core_dump( $EVAL_ERROR );
+}
+finish();
+
+# Subroutines {{{1
+# Mode functions{{{2
+# switch_mode {{{3
+# Makes the given mode the current one by storing it in the 'mode' config
+# variable.
+sub switch_mode {
+   my ( $mode ) = @_;
+   $config{mode}->{val} = $mode;
+}
+
+# Prompting functions {{{2
+# prompt_list {{{3
+# Prompts the user for a value, given a question, initial value,
+# a completion function and a hashref of hints.
+# Arguments: question text, initial value, completion function, and an
+# optional hashref of hints (name => description).  When hints are given
+# they are printed as a captioned two-column table first; more than ten rows
+# are split in half and shown side by side.  Returns the answer with trailing
+# whitespace removed, or '' when nothing was read.  Dies in non-interactive
+# mode.
+sub prompt_list {
+   die "Can't call in non-interactive mode" if $opts{n};
+   my ( $question, $init, $completion, $hints ) = @_;
+   if ( $hints ) {
+      my @names = keys %$hints;
+
+      # Figure out how wide the name column will be, plus room for padding.
+      my $max_name = ( max(map { length($_) } @names) || 0 ) + 3;
+      my $width    = $this_term_size[0] - $max_name;
+
+      my %label_for = map { $_ => $_ } @names;
+      my %text_for  = map { $_ => trunc($hints->{$_}, $width) } @names;
+      my @meta_rows = create_table2(
+               [ sort @names ],
+               \%label_for,
+               \%text_for,
+               { sep => '  ' });
+
+      if ( @meta_rows > 10 ) {
+         # Try to split and stack the meta rows next to each other
+         my $split = int(@meta_rows / 2);
+         my @left  = @meta_rows[0..$split - 1];
+         my @right = @meta_rows[$split..$#meta_rows];
+         @meta_rows = stack_next( \@left, \@right, { pad => ' | '}, );
+      }
+
+      # Caption lines may be arrayrefs of colored() arguments or plain strings.
+      my @lines = map { ref $_ ? colored(@$_) : $_ }
+         ( create_caption('Choose from', @meta_rows), '' );
+      print join( "\n", '', @lines ), "\n";
+   }
+   $term->Attribs->{completion_function} = $completion;
+   my $answer = $term->readline("$question: ", $init);
+   $OUTPUT_AUTOFLUSH = 1;
+   if ( !defined($answer) ) {
+      $answer = '';
+   }
+   $answer =~ s/\s+$//;
+   return $answer;
+}
+
+# prompt {{{3
+# Prints out a prompt and reads from the keyboard, then validates with the
+# validation regex until the input is correct.
+# Arguments: prompt text, optional validation regex, optional initial value,
+# optional completion function.  Re-prompts until the input matches the regex
+# (any input is accepted when no regex is given).  Returns the response with
+# trailing whitespace removed.  If input ends (readline returns undef, e.g. on
+# Ctrl-D) prompting stops and '' is returned, as prompt_list does.  Dies in
+# non-interactive mode.
+sub prompt {
+   die "Can't call in non-interactive mode" if $opts{n};
+   my ( $prompt, $regex, $init, $completion ) = @_;
+   my $response;
+   my $success = 0;
+   do {
+      if ( $completion ) {
+         $term->Attribs->{completion_function} = $completion;
+      }
+      $response = $term->readline("$prompt: ", $init);
+      if ( !defined($response) ) {
+         # End of input: asking again could never succeed and would loop
+         # forever, so give up with an empty answer.
+         $response = '';
+         $success  = 1;
+      }
+      elsif ( $regex && $response !~ m/$regex/ ) {
+         print "Invalid response.\n\n";
+      }
+      else {
+         $success = 1;
+      }
+   } while ( !$success );
+   $OUTPUT_AUTOFLUSH = 1;
+   $response =~ s/\s+$//;
+   return $response;
+}
+
+# prompt_noecho {{{3
+# Unfortunately, suppressing echo with Term::ReadLine isn't reliable; the user might not
+# have that library, or it might not support that feature.
+# Prints the underlined prompt, reads one line from STDIN with terminal echo
+# switched off, restores normal mode and returns the line without its
+# trailing newline.
+sub prompt_noecho {
+   my $prompt = shift;
+   print colored("$prompt: ", 'underline');
+   ReadMode('noecho');
+   my $response = <STDIN>;
+   chomp($response);
+   ReadMode('normal');
+   return $response;
+}
+
+# do_key_action {{{3
+# Depending on whether a key was read, do something.  Keys have certain
+# actions defined in lookup tables.  Each mode may have its own lookup table,
+# which trumps the global table -- so keys can be context-sensitive.  The key
+# may be read and written in a subroutine, so it's a global.
+# Runs the action bound to the last key read, if there was one.  The current
+# mode's own key table is consulted first, then the global table; an unbound
+# key runs an empty action.
+sub do_key_action {
+   if ( defined $char ) {
+      my $mode  = $config{mode}->{val};
+      my $entry = $modes{$mode}->{action_for}->{$char};
+      if ( !defined($entry) ) {
+         $entry = $action_for{$char};
+      }
+      my $action = defined($entry) ? $entry->{action} : sub{};
+      $action->();
+   }
+}
+
+# pause {{{3
+# Prints the given message (or a default one) on a new line, waits for a
+# single key press and returns that key.  Dies in non-interactive mode.
+sub pause {
+   die "Can't call in non-interactive mode" if $opts{n};
+   my ( $msg ) = @_;
+   my $text = defined($msg) ? "\n$msg" : "\nPress any key to continue";
+   print $text;
+   ReadMode('cbreak');
+   my $key = ReadKey(0);
+   ReadMode('normal');
+   return $key;
+}
+
+# reverse_sort {{{3
+# Flips the sign of the named table's sort direction.
+sub reverse_sort {
+   my ( $tbl ) = @_;
+   $tbl_meta{$tbl}->{sort_dir} *= -1;
+}
+
+# select_cxn {{{3
+# Selects connection(s).  If the mode (or argument list) has only one, returns
+# it without prompt.
+# Arguments: prompt text and an optional list of candidate connections
+# (defaults to get_connections()).  A single candidate is returned straight
+# away; otherwise the user is asked, with each connection's DSN shown as a
+# hint, and the whitespace-separated answer is reduced to the unique names
+# that are actual candidates.
+sub select_cxn {
+   my ( $prompt, @cxns ) = @_;
+   @cxns = get_connections() unless @cxns;
+   return $cxns[0] if @cxns == 1;
+
+   my %dsn_for = map { $_ => $connections{$_}->{dsn} } @cxns;
+   my $choices = prompt_list(
+         $prompt,
+         $cxns[0],
+         sub{ return @cxns },
+         \%dsn_for);
+
+   my %is_candidate = map { $_ => 1 } @cxns;
+   my @result = unique(grep { $is_candidate{$_} } split(/\s+/, $choices));
+   return @result;
+}
+
+# kill_query {{{3
+# Kills a connection, or on new versions, optionally a query but not connection.
+# Argument: 'QUERY' to kill only the running query, anything else to kill the
+# whole connection.  Lets the user pick a thread, asks for confirmation
+# (any key containing "y" confirms) and runs the matching KILL statement.
+# Errors are printed, followed by a pause.
+sub kill_query {
+   my $q_or_c = shift;
+
+   my $info = choose_thread(
+      sub { 1 },
+      'Select a thread to kill the ' . $q_or_c,
+   );
+   return unless $info;
+   return unless pause("Kill $info->{id}?") =~ m/y/i;
+
+   my $stmt_name = $q_or_c eq 'QUERY' ? 'KILL_QUERY' : 'KILL_CONNECTION';
+   eval {
+      do_stmt($info->{cxn}, $stmt_name, $info->{id} );
+   };
+
+   if ( $EVAL_ERROR ) {
+      print "\nError: $EVAL_ERROR";
+      pause();
+   }
+}
+
+# set_display_precision {{{3
+# Adds the given delta to the num_digits config value, keeping the result
+# within 0..9.
+sub set_display_precision {
+   my ( $dir ) = @_;
+   my $digits = max(0, $config{num_digits}->{val} + $dir);
+   $config{num_digits}->{val} = min(9, $digits);
+}
+
+# Shows or hides a table in the given mode: a table that is currently visible
+# is removed from the mode's list, a hidden one is put at the front.  Marks
+# the mode's visible-table list as customized.
+sub toggle_visible_table {
+   my ( $mode, $table ) = @_;
+   my $visible = $modes{$mode}->{visible_tables};
+   my @others  = grep { $_ ne $table } @$visible;
+   if ( @others == @$visible ) {
+      # Nothing was filtered out, so the table was not visible yet.
+      unshift @$visible, $table;
+   }
+   else {
+      $modes{$mode}->{visible_tables} = \@others;
+   }
+   $modes{$mode}->{cust}->{visible_tables} = 1;
+}
+
+# toggle_filter{{{3
+# Switches a filter on or off for a table: an active filter is removed from
+# the table's list, an inactive one is appended.  Marks the table's filter
+# list as customized.
+sub toggle_filter {
+   my ( $tbl, $filter ) = @_;
+   my $filters = $tbl_meta{$tbl}->{filters};
+   my @others  = grep { $_ ne $filter } @$filters;
+   if ( @others == @$filters ) {
+      # Nothing was filtered out, so the filter was not active yet.
+      push @$filters, $filter;
+   }
+   else {
+      $tbl_meta{$tbl}->{filters} = \@others;
+   }
+   $tbl_meta{$tbl}->{cust}->{filters} = 1;
+}
+
+# toggle_config {{{3
+# Flips the named config variable by XOR-ing its value with 1.
+sub toggle_config {
+   my $key = shift;
+   $config{$key}->{val} ^= 1;
+}
+
+# create_deadlock {{{3
+# Explains what is about to happen, asks for confirmation (only the exact
+# answer 'y' proceeds), lets the user pick a server and then calls
+# clear_deadlock() for it.
+sub create_deadlock {
+   $clear_screen_sub->();
+
+   print "This function will deliberately cause a small deadlock, clearing deadlock information from the InnoDB monitor.\n\n";
+
+   my $answer = prompt("Are you sure you want to proceed?  Say 'y' if you do");
+   if ( $answer ne 'y' ) {
+      return 0;
+   }
+
+   my ( $cxn ) = select_cxn('Clear on which server? ');
+   if ( !$cxn || !exists($connections{$cxn}) ) {
+      return;
+   }
+
+   clear_deadlock($cxn);
+}
+
+# deadlock_thread {{{3
+# One participant in a deliberately provoked deadlock.
+# Arguments: a numeric id, the table to operate on, and the connection name.
+# Opens a fresh connection, starts a serializable transaction, reads the rows
+# where a = id, waits, then tries to update all the other rows.  A "Deadlock
+# found" error is the expected outcome and is ignored; any other error is
+# re-thrown.  The sub always ends the current process with exit(0).
+# NOTE(review): presumably run in a child process, once per id -- confirm
+# against the caller.
+sub deadlock_thread {
+   my ( $id, $tbl, $cxn ) = @_;
+
+   eval {
+      my $dbh = get_new_db_connection($cxn, 1);
+      my @stmts = (
+         "set transaction isolation level serializable",
+         (version_ge($dbh, '4.0.11') ? "start transaction" : 'begin'),
+         "select * from $tbl where a = $id",
+         "update $tbl set a = $id where a <> $id",
+      );
+
+      # Run everything except the final update straight away.
+      foreach my $stmt (@stmts[0..2]) {
+         $dbh->do($stmt);
+      }
+      # The wait depends on the id, so different ids issue their update at
+      # different times.
+      sleep(1 + $id);
+      $dbh->do($stmts[-1]);
+   };
+   if ( $EVAL_ERROR ) {
+      if ( $EVAL_ERROR !~ m/Deadlock found/ ) {
+         die $EVAL_ERROR;
+      }
+   }
+   exit(0);
+}
+
+# Purges unused binlogs on the master, up to but not including the latest log.
+# TODO: guess which connections are slaves of a given master.
+# Asks which master to purge and which slaves read from it, finds the oldest
+# binary log any of those slaves is still reading, and offers a
+# PURGE MASTER LOGS TO statement for that log to be sent to the master.
+# Returns early if no master or no slave is chosen.
+sub purge_master_logs {
+   my @cxns = get_connections();
+
+   get_master_slave_status(@cxns);
+
+   # Toss out the rows that don't have master/slave status...
+   my @vars =
+      grep { $_ && ($_->{file} || $_->{master_host}) }
+      map  { $vars{$_}->{$clock} } @cxns;
+
+   # Figure out which master to purge on.
+   my @masters = map { $_->{cxn} } grep { $_->{file} } @vars;
+   my ( $master ) = select_cxn('Which master?', @masters );
+   return unless $master;
+
+   # Figure out the result order (not lexical order) of master logs.
+   my @master_logs = get_master_logs($master);
+   my $i = 0;
+   my %master_logs = map { $_->{log_name} => $i++ } @master_logs;
+
+   # Ask which slave(s) are reading from this master.
+   my @slave_status = grep { $_->{master_host} } @vars;
+   my @slaves = map { $_->{cxn} } @slave_status;
+   @slaves = select_cxn("Which slaves are reading from $master?", @slaves);
+   @slave_status = grep { my $item = $_; grep { $item->{cxn} eq $_ } @slaves } @slave_status;
+   return unless @slave_status;
+
+   # Find the minimum binary log in use.  A slave whose log file is missing
+   # or unknown to this master counts as reading the very first log, which
+   # is the conservative choice: purging "to" the first log removes nothing.
+   my $min_log = min(
+      map {
+         my $log = $_->{master_log_file};
+         defined($log) && defined($master_logs{$log}) ? $master_logs{$log} : 0;
+      } @slave_status
+   );
+   my $log_name = $master_logs[$min_log]->{log_name};
+
+   my $stmt = "PURGE MASTER LOGS TO '$log_name'";
+   send_cmd_to_servers($stmt, 0, 'PURGE {MASTER | BINARY} LOGS {TO "log_name" | BEFORE "date"}', [$master]);
+}
+
+# Arguments:
+#   $cmd  - proposed command text; the user can edit it before it is sent
+#   $all  - true to send to every connection from get_connections()
+#   $hint - optional syntax hint printed before the prompt
+#   $cxns - arrayref of connection names; when empty (or not given) and $all
+#           is false, the user is asked which servers to use.  The chosen
+#           names are stored back into this array.
+# Runs the command on each connection, reports success or the error per
+# connection, then pauses.
+sub send_cmd_to_servers {
+   my ( $cmd, $all, $hint, $cxns ) = @_;
+   # Tolerate a missing list instead of dereferencing undef below.
+   $cxns ||= [];
+   if ( $all ) {
+      @$cxns = get_connections();
+   }
+   elsif ( !@$cxns ) {
+      @$cxns = select_cxn('Which servers?');
+   }
+   if ( $hint ) {
+      print "\nHint: $hint\n";
+   }
+   $cmd = prompt('Command to send', undef, $cmd);
+   foreach my $cxn ( @$cxns ) {
+      eval {
+         do_query($cxn, $cmd);
+      };
+      if ( $EVAL_ERROR ) {
+         print "Error from $cxn: $EVAL_ERROR\n";
+      }
+      else {
+         print "Success on $cxn\n";
+      }
+   }
+   pause();
+}
+
+# Display functions {{{2
+
+# Clears the screen and stores the given display function letter in the
+# S_func config variable.
+sub set_s_mode {
+   my $func = shift;
+   $clear_screen_sub->();
+   $config{S_func}->{val} = $func;
+}
+
+# start_S_mode {{{3
+# Clears the screen, then switches to mode 'S'.
+sub start_S_mode {
+   $clear_screen_sub->();
+   return switch_mode('S');
+}
+
+# display_B {{{3
+# Draws mode B.  Fetches InnoDB status for the mode's connections, collects
+# rows for whichever of the four tables (buffer_pool, page_statistics,
+# insert_buffers, adaptive_hash_index) are visible, and draws them.
+# Connection errors follow the first table only, or every table in debug mode.
+sub display_B {
+   my @cxns = get_connections();
+   get_innodb_status(\@cxns);
+
+   my %rows_for = (
+      buffer_pool         => [],
+      page_statistics     => [],
+      insert_buffers      => [],
+      adaptive_hash_index => [],
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+
+   foreach my $cxn ( @cxns ) {
+      my $set = $vars{$cxn}->{$clock};
+      my $pre = $vars{$cxn}->{$clock-1} || $set;
+
+      # Buffer-pool section of the status output was parsed completely.
+      if ( $set->{IB_bp_complete} ) {
+         push @{$rows_for{buffer_pool}},
+            extract_values($set, $set, $pre, 'buffer_pool')
+            if $wanted{buffer_pool};
+         push @{$rows_for{page_statistics}},
+            extract_values($set, $set, $pre, 'page_statistics')
+            if $wanted{page_statistics};
+      }
+
+      # Insert-buffer section of the status output was parsed completely.
+      if ( $set->{IB_ib_complete} ) {
+         if ( $wanted{insert_buffers} ) {
+            # Incremental values replace the raw set when status_inc is on.
+            push @{$rows_for{insert_buffers}}, extract_values(
+               $config{status_inc}->{val} ? inc(0, $cxn) : $set, $set, $pre,
+               'insert_buffers');
+         }
+         push @{$rows_for{adaptive_hash_index}},
+            extract_values($set, $set, $pre, 'adaptive_hash_index')
+            if $wanted{adaptive_hash_index};
+      }
+   }
+
+   my @display_lines;
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      if ( $config{debug}->{val} || !$first_table++ ) {
+         push @display_lines, get_cxn_errors(@cxns);
+      }
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_C {{{3
+# Draws mode C.  Gathers status counters whose names start with the
+# cmd_filter prefix from every connection, computes each counter's change
+# since the previous clock tick plus grand totals, and shows the result in
+# the cmd_summary table.
+sub display_C {
+   my @display_lines;
+   my @cxns = get_connections();
+   get_status_info(@cxns);
+
+   my @cmd_summary;
+   my %rows_for = (
+      cmd_summary => \@cmd_summary,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+
+   # For now, I'm manually pulling these variables out and pivoting.  Eventually a SQL-ish
+   # dialect should let me join a table to a grouped and pivoted table and do this more easily.
+   # TODO: make it so.
+   my $prefix = qr/^$config{cmd_filter}->{val}/; # TODO: this is a total hack
+   my @values;
+   my ($total, $last_total) = (0, 0);
+   foreach my $cxn ( @cxns ) {
+      my $set = $vars{$cxn}->{$clock};
+      # Fall back to the current set when there is no previous tick.
+      my $pre = $vars{$cxn}->{$clock-1} || $set;
+      foreach my $key ( keys %$set ) {
+         # The prefix match is case-insensitive.
+         next unless $key =~ m/$prefix/i;
+         my $val = $set->{$key};
+         # Only plain non-negative integers are counted.
+         next unless defined $val && $val =~ m/^\d+$/;
+         my $last_val = $val - ($pre->{$key} || 0);
+         $total      += $val;
+         $last_total += $last_val;
+         push @values, {
+            name       => $key,
+            value      => $val,
+            last_value => $last_val,
+         };
+      }
+   }
+
+   # Add aggregation and turn into a real set TODO: total hack
+   if ( $wanted{cmd_summary} ) {
+      foreach my $value ( @values ) {
+         # Every row carries the grand totals across all connections.
+         @{$value}{qw(total last_total)} = ($total, $last_total);
+         push @cmd_summary, extract_values($value, $value, $value, 'cmd_summary');
+      }
+   }
+
+   # Connection errors follow the first table only, or every table in debug
+   # mode.
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns)
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_D {{{3
+# Draws mode D.  For every connection whose InnoDB status contains deadlock
+# information, builds one row per deadlocked transaction and one row per lock
+# held or awaited by those transactions, then draws the visible tables.
+sub display_D {
+   my @display_lines;
+   my @cxns = get_connections();
+   get_innodb_status(\@cxns);
+
+   my @deadlock_transactions;
+   my @deadlock_locks;
+   my %rows_for = (
+      deadlock_transactions => \@deadlock_transactions,
+      deadlock_locks        => \@deadlock_locks,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+
+   foreach my $cxn ( @cxns ) {
+      my $innodb_status = $vars{$cxn}->{$clock};
+      # Fall back to the current status when there is no previous tick.
+      my $prev_status   = $vars{$cxn}->{$clock-1} || $innodb_status;
+
+      # A deadlock timestamp means the status output has a deadlock section.
+      if ( $innodb_status->{IB_dl_timestring} ) {
+
+         # Number of the transaction that was rolled back, or 0 if none.
+         my $victim = $innodb_status->{IB_dl_rolled_back} || 0;
+
+         # Skip the per-transaction work when no table is visible at all.
+         if ( %wanted ) {
+            foreach my $txn_id ( keys %{$innodb_status->{IB_dl_txns}} ) {
+               my $txn = $innodb_status->{IB_dl_txns}->{$txn_id};
+               my $pre = $prev_status->{IB_dl_txns}->{$txn_id} || $txn;
+
+               if ( $wanted{deadlock_transactions} ) {
+                  my $hash = extract_values($txn->{tx}, $txn->{tx}, $pre->{tx}, 'deadlock_transactions');
+                  $hash->{cxn}        = $cxn;
+                  $hash->{dl_txn_num} = $txn_id;
+                  $hash->{victim}     = $txn_id == $victim ? 'Yes' : 'No';
+                  $hash->{timestring} = $innodb_status->{IB_dl_timestring};
+                  # 'Yes' when the deadlock section was not parsed completely.
+                  $hash->{truncates}  = $innodb_status->{IB_dl_complete} ? 'No' : 'Yes';
+                  push @deadlock_transactions, $hash;
+               }
+
+               if ( $wanted{deadlock_locks} ) {
+                  foreach my $lock ( @{$txn->{locks}} ) {
+                     my $hash = extract_values($lock, $lock, $lock, 'deadlock_locks');
+                     $hash->{dl_txn_num}      = $txn_id;
+                     $hash->{cxn}             = $cxn;
+                     $hash->{mysql_thread_id} = $txn->{tx}->{mysql_thread_id};
+                     push @deadlock_locks, $hash;
+                  }
+               }
+
+            }
+         }
+      }
+   }
+
+   # Connection errors follow the first table only, or every table in debug
+   # mode.
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns)
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_F {{{3
+# Draws mode F for the first connection only: the reason for the latest
+# foreign key error, the offending user/thread and query when known, and the
+# fk_error table.  Shows a placeholder line when the status holds no foreign
+# key error.
+sub display_F {
+   my ( $cxn ) = get_connections();
+   get_innodb_status([$cxn]);
+   my $status = $vars{$cxn}->{$clock};
+   my @display_lines;
+
+   if ( !$status->{IB_fk_timestring} ) {
+      push @display_lines, '', 'No foreign key error data.';
+   }
+   else {
+      push @display_lines, 'Reason: ' . $status->{IB_fk_reason};
+
+      # Display FK errors caused by invalid DML.
+      if ( my $txn = $status->{IB_fk_txn} ) {
+         push @display_lines, '';
+         push @display_lines,
+            "User $txn->{user} from $txn->{hostname}, thread $txn->{mysql_thread_id} was executing:";
+         push @display_lines, '', no_ctrl_char($txn->{query_text});
+      }
+
+      my @fk_table = create_table2(
+         $tbl_meta{fk_error}->{visible},
+         meta_to_hdr('fk_error'),
+         extract_values($status, $status, $status, 'fk_error'),
+         { just => '-', sep => '  '});
+      push @display_lines, '', @fk_table;
+   }
+
+   draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# display_I {{{3
+sub display_I { # Draws the io_threads, pending_io, file_io_misc and log_statistics tables.
+   my @display_lines;
+   my @cxns = get_connections();
+   get_innodb_status(\@cxns);
+
+   my @io_threads;
+   my @pending_io;
+   my @file_io_misc;
+   my @log_statistics;
+   my %rows_for = ( # Table name => the array its rows are collected in.
+      io_threads     => \@io_threads,
+      pending_io     => \@pending_io,
+      file_io_misc   => \@file_io_misc,
+      log_statistics => \@log_statistics,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible; # Rows are only built for visible tables.
+
+   foreach my $cxn ( @cxns ) {
+      my $set = $vars{$cxn}->{$clock};
+      my $pre = $vars{$cxn}->{$clock-1} || $set; # No previous set: the current one stands in for it.
+
+      if ( $set->{IB_io_complete} ) {
+         if ( $wanted{io_threads} ) {
+            my $cur_threads = $set->{IB_io_threads};
+            my $pre_threads = $pre->{IB_io_threads} || $cur_threads;
+            foreach my $key ( sort keys %$cur_threads ) { # One row per IB_io_threads entry, in sorted key order.
+               my $cur_thd = $cur_threads->{$key};
+               my $pre_thd = $pre_threads->{$key} || $cur_thd;
+               my $hash = extract_values($cur_thd, $cur_thd, $pre_thd, 'io_threads');
+               $hash->{cxn} = $cxn;
+               push @io_threads, $hash;
+            }
+         }
+         if ( $wanted{pending_io} ) {
+            push @pending_io, extract_values($set, $set, $pre, 'pending_io');
+         }
+         if ( $wanted{file_io_misc} ) {
+            push @file_io_misc, extract_values(
+               $config{status_inc}->{val} ? inc(0, $cxn) : $set, # inc()'s result replaces the set when status_inc is on.
+               $set, $pre, 'file_io_misc');
+         }
+      }
+      if ( $set->{IB_lg_complete} && $wanted{log_statistics} ) {
+         push @log_statistics, extract_values($set, $set, $pre, 'log_statistics');
+      }
+   }
+
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns) # After the first table only, or after every table when debug is set.
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_L {{{3
+sub display_L {
+   # Draws the innodb_locks table: one row per lock listed under each
+   # transaction in every connection's current InnoDB status set.
+   my @display_lines;
+   my @cxns = get_connections();
+   get_innodb_status(\@cxns);
+
+   my @innodb_locks;
+   my %rows_for = (
+      innodb_locks => \@innodb_locks,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+
+   # Get info on locks
+   foreach my $cxn ( @cxns ) {
+      my $set = $vars{$cxn}->{$clock} or next;
+
+      if ( $wanted{innodb_locks} && defined $set->{IB_tx_transactions} && @{$set->{IB_tx_transactions}} ) {
+
+         # Each row combines transaction-level columns with the lock's own
+         # columns; txn_id is in both lists, so the lock's value wins.
+         my $cur_txns = $set->{IB_tx_transactions};
+         foreach my $txn ( @$cur_txns ) {
+            foreach my $lock ( @{$txn->{locks}} ) {
+               my %hash = map { $_ => $txn->{$_} } qw(txn_id mysql_thread_id lock_wait_time active_secs);
+               map { $hash{$_} = $lock->{$_} } qw(lock_type space_id page_no n_bits index db table txn_id lock_mode special insert_intention waiting);
+               $hash{cxn} = $cxn;
+               push @innodb_locks, extract_values(\%hash, \%hash, \%hash, 'innodb_locks');
+            }
+         }
+      }
+   }
+
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns)
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_M {{{3
+sub display_M {
+   # Replication screen.  Builds one row per connection for each of the
+   # slave_sql_status, slave_io_status and master_status tables that is
+   # visible, then draws the visible tables.
+
+   my @cxns = get_connections();
+   get_master_slave_status(@cxns);
+   get_status_info(@cxns);
+
+   # Row buckets, keyed by table name, in the order the tables are filled.
+   my @tables   = qw(slave_sql_status slave_io_status master_status);
+   my %rows_for = map { $_ => [] } @tables;
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+
+   foreach my $cxn ( @cxns ) {
+      # With status_inc set the current values come from inc(); otherwise
+      # they are the stored set.  With no previous set, the current one
+      # stands in for it.
+      my $cur  = $config{status_inc}->{val} ? inc(0, $cxn) : $vars{$cxn}->{$clock};
+      my $prev = $vars{$cxn}->{$clock - 1} || $cur;
+
+      foreach my $tbl ( @tables ) {
+         next unless $wanted{$tbl};
+         push @{$rows_for{$tbl}}, extract_values($cur, $cur, $prev, $tbl);
+      }
+   }
+
+   # Connection errors follow the first table only, unless debug is on, in
+   # which case they follow every table.
+   my @display_lines;
+   my $tables_shown = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      next unless $config{debug}->{val} || !$tables_shown++;
+      push @display_lines, get_cxn_errors(@cxns);
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_O {{{3
+sub display_O { # Draws a blank line, the open_tables table, then any connection errors.
+   my @cxns = get_connections();
+   my @rows;
+   foreach my $open_table ( get_open_tables(@cxns) ) {
+      push @rows, extract_values($open_table, $open_table, $open_table, 'open_tables');
+   }
+   draw_screen([ '', set_to_tbl(\@rows, 'open_tables'), get_cxn_errors(@cxns) ]);
+}
+
+# display_Q {{{3
+sub display_Q { # Draws the q_header and processlist tables and records the shown queries in @current_queries.
+   my @display_lines;
+
+   my @q_header;
+   my @processlist;
+   my %rows_for = ( # Table name => the array its rows are collected in.
+      q_header    => \@q_header,
+      processlist => \@processlist,
+   );
+
+   my @visible = $opts{n} ? 'processlist' : get_visible_tables(); # With $opts{n} set, only processlist is shown.
+   my %wanted  = map { $_ => 1 } @visible;
+
+   # Get the data
+   my @cxns             = get_connections();
+   my @full_processlist = get_full_processlist(@cxns);
+
+   # Create header
+   if ( $wanted{q_header} ) {
+      get_status_info(@cxns);
+      foreach my $cxn ( @cxns ) {
+         my $set = $vars{$cxn}->{$clock};
+         my $pre = $vars{$cxn}->{$clock-1} || $set; # No previous set: the current one stands in for it.
+         my $hash = extract_values($set, $set, $pre, 'q_header');
+         $hash->{cxn} = $cxn;
+         $hash->{when} = 'Total';
+         push @q_header, $hash;
+         # A second row, labelled 'Now' and built from inc(), is added once a previous set exists.
+         if ( exists $vars{$cxn}->{$clock - 1} ) {
+            my $inc = inc(0, $cxn);
+            my $hash = extract_values($inc, $set, $pre, 'q_header');
+            $hash->{cxn} = $cxn;
+            $hash->{when} = 'Now';
+            push @q_header, $hash;
+         }
+      }
+   }
+
+   if ( $wanted{processlist} ) {
+      # TODO: save prev values
+      push @processlist, map { extract_values($_, $_, $_, 'processlist') } @full_processlist;
+   }
+
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      next unless $wanted{$tbl}; # Always true here, since %wanted is built from @visible.
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns) # After the first table only, or after every table when debug is set.
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   # Save queries in global variable for analysis.  The rows in %rows_for have been
+   # filtered, etc as a side effect of set_to_tbl(), so they are the same as the rows
+   # that get pushed to the screen.
+   @current_queries = map {
+      my %hash;
+      @hash{ qw(cxn id db query secs) } = @{$_}{ qw(cxn mysql_thread_id db info secs) }; # Renames mysql_thread_id => id, info => query.
+      \%hash;
+   } @{$rows_for{processlist}};
+
+   draw_screen(\@display_lines);
+}
+
+# display_R {{{3
+sub display_R { # Draws the row_operations, row_operation_misc, semaphores and wait_array tables.
+   my @display_lines;
+   my @cxns = get_connections();
+   get_innodb_status(\@cxns);
+
+   my @row_operations;
+   my @row_operation_misc;
+   my @semaphores;
+   my @wait_array;
+   my %rows_for = ( # Table name => the array its rows are collected in.
+      row_operations     => \@row_operations,
+      row_operation_misc => \@row_operation_misc,
+      semaphores         => \@semaphores,
+      wait_array         => \@wait_array,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible;
+   my $incvar  = $config{status_inc}->{val};
+
+   foreach my $cxn ( @cxns ) {
+      my $set = $vars{$cxn}->{$clock};
+      my $pre = $vars{$cxn}->{$clock-1} || $set; # No previous set: the current one stands in for it.
+      my $inc; # Only assigned to if wanted
+
+      if ( $set->{IB_ro_complete} ) {
+         if ( $wanted{row_operations} ) {
+            $inc ||= $incvar ? inc(0, $cxn) : $set; # Computed at most once per connection, shared with semaphores.
+            push @row_operations, extract_values($inc, $set, $pre, 'row_operations');
+         }
+         if ( $wanted{row_operation_misc} ) {
+            push @row_operation_misc, extract_values($set, $set, $pre, 'row_operation_misc');
+         }
+      }
+
+      if ( $set->{IB_sm_complete} && $wanted{semaphores} ) {
+         $inc ||= $incvar ? inc(0, $cxn) : $set;
+         push @semaphores, extract_values($inc, $set, $pre, 'semaphores');
+      }
+
+      if ( $set->{IB_sm_wait_array_size} && $wanted{wait_array} ) {
+         foreach my $wait ( @{$set->{IB_sm_waits}} ) { # One row per entry in IB_sm_waits.
+            my $hash = extract_values($wait, $wait, $wait, 'wait_array');
+            $hash->{cxn} = $cxn;
+            push @wait_array, $hash;
+         }
+      }
+   }
+
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns) # After the first table only, or after every table when debug is set.
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   draw_screen(\@display_lines);
+}
+
+# display_T {{{3
+sub display_T { # Draws the t_header and innodb_transactions tables and records the shown queries in @current_queries.
+   my @display_lines;
+
+   my @t_header;
+   my @innodb_transactions;
+   my %rows_for = ( # Table name => the array its rows are collected in.
+      t_header            => \@t_header,
+      innodb_transactions => \@innodb_transactions,
+   );
+
+   my @visible = $opts{n} ? 'innodb_transactions' : get_visible_tables(); # With $opts{n} set, only the transactions are shown.
+   my %wanted  = map { $_ => 1 } @visible;
+
+   my @cxns = get_connections();
+
+   # If the header is to be shown, buffer pool data is required.
+   get_innodb_status( \@cxns, [ $wanted{t_header} ? qw(bp) : () ] );
+
+   foreach my $cxn ( @cxns ) { # Same connection list the status was just fetched for.
+      my $set = $vars{$cxn}->{$clock};
+      my $pre = $vars{$cxn}->{$clock-1} || $set; # No previous set: the current one stands in for it.
+
+      next unless $set->{IB_tx_transactions};
+
+      if ( $wanted{t_header} ) {
+         my $hash = extract_values($set, $set, $pre, 't_header');
+         push @t_header, $hash;
+      }
+
+      if ( $wanted{innodb_transactions} ) {
+         my $cur_txns = $set->{IB_tx_transactions};
+         my $pre_txns = $pre->{IB_tx_transactions} || $cur_txns;
+         my %cur_txns = map { $_->{mysql_thread_id} => $_ } @$cur_txns; # Indexed by thread id so that each
+         my %pre_txns = map { $_->{mysql_thread_id} => $_ } @$pre_txns; # transaction can be paired with its previous state.
+         foreach my $thd_id ( sort keys %cur_txns ) {
+            my $cur_txn = $cur_txns{$thd_id};
+            my $pre_txn = $pre_txns{$thd_id} || $cur_txn; # Not seen before: compare the transaction to itself.
+            my $hash    = extract_values($cur_txn, $cur_txn, $pre_txn, 'innodb_transactions');
+            $hash->{cxn} = $cxn;
+            push @innodb_transactions, $hash;
+         }
+      }
+
+   }
+
+   my $first_table = 0;
+   foreach my $tbl ( @visible ) {
+      push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+      push @display_lines, get_cxn_errors(@cxns) # After the first table only, or after every table when debug is set.
+         if ( $config{debug}->{val} || !$first_table++ );
+   }
+
+   # Save queries in global variable for analysis.  The rows in %rows_for have been
+   # filtered, etc as a side effect of set_to_tbl(), so they are the same as the rows
+   # that get pushed to the screen.
+   @current_queries = map {
+      my %hash;
+      @hash{ qw(cxn id db query secs) } = @{$_}{ qw(cxn mysql_thread_id db query_text active_secs) };
+      \%hash;
+   } @{$rows_for{innodb_transactions}};
+
+   draw_screen(\@display_lines);
+}
+
+# display_S {{{3
+sub display_S { # Shows the S_set variables in the style chosen by $config{S_func}: 's', 'g' or 'v'.
+   my $fmt  = get_var_set('S_set');
+   my $func = $config{S_func}->{val};
+   my $inc  = $func eq 'g' || $config{status_inc}->{val}; # 'g' mode always uses inc() values.
+
+   # The table's meta-data is generated from the compiled var_set.
+   my ( $cols, $visible );
+   if ( $tbl_meta{var_status}->{fmt} && $fmt eq $tbl_meta{var_status}->{fmt} ) { # Unchanged var_set: reuse the cached meta-data.
+      ( $cols, $visible ) = @{$tbl_meta{var_status}}{qw(cols visible)};
+   }
+   else {
+      ( $cols, $visible ) = compile_select_stmt($fmt);
+
+      # Apply missing values to columns.  Always apply averages across all connections.
+      map {
+         $_->{agg}   = 'avg';
+         $_->{label} = $_->{hdr};
+      } values %$cols;
+
+      $tbl_meta{var_status}->{cols}    = $cols;
+      $tbl_meta{var_status}->{visible} = $visible;
+      $tbl_meta{var_status}->{fmt}     = $fmt;
+      map { $tbl_meta{var_status}->{cols}->{$_}->{just} = ''} @$visible;
+   }
+
+   my @var_status;
+   my %rows_for = (
+      var_status => \@var_status,
+   );
+
+   my @visible = get_visible_tables();
+   my %wanted  = map { $_ => 1 } @visible; # NOTE(review): %wanted is never read in this sub.
+   my @cxns    = get_connections();
+
+   get_status_info(@cxns);
+   get_innodb_status(\@cxns);
+
+   # Set up whether to pivot and how many sets to extract.
+   $tbl_meta{var_status}->{pivot} = $func eq 'v';
+
+   my $num_sets
+      = $func eq 'v'
+      ? $config{num_status_sets}->{val}
+      : 0;
+   foreach my $set ( 0 .. $num_sets ) { # $set counts back from the current $clock; the range is inclusive.
+      my @rows;
+      foreach my $cxn ( @cxns ) {
+         my $vars = $inc ? inc($set, $cxn) : $vars{$cxn}->{$clock - $set};
+         my $cur  = $vars{$cxn}->{$clock-$set};
+         my $pre  = $vars{$cxn}->{$clock-$set-1} || $cur;
+         next unless $vars && %$vars; # Skip connections with no data for this set.
+         my $hash = extract_values($vars, $cur, $pre, 'var_status');
+         push @rows, $hash;
+      }
+      @rows = apply_group_by('var_status', [], @rows); # Empty group-by list: all rows form one group.
+      push @var_status, @rows;
+   }
+
+   # Recompile the sort func. TODO: avoid recompiling at every refresh.
+   # Figure out whether the data is all numeric and decide on a sort type.
+   # my $cmp
+   #   = scalar(
+   #      grep { !defined $_ || $_ !~ m/^\d+$/ }
+   #      map  { my $col = $_; map { $_->{$col} } @var_status }
+   #           $tbl_meta{var_status}->{sort_cols} =~ m/(\w+)/g)
+   #   ? 'cmp'
+   #   : '<=>';
+   $tbl_meta{var_status}->{sort_func} = make_sort_func($tbl_meta{var_status});
+
+   # ################################################################
+   # Now there is specific display code based on $config{S_func}
+   # ################################################################
+   if ( $func =~ m/s|g/ ) { # These two modes print directly instead of going through draw_screen().
+      my $min_width = 4;
+
+      # Clear the screen if the display width changed.
+      if ( @last_term_size && $this_term_size[0] != $last_term_size[0] ) {
+         $lines_printed = 0;
+         $clear_screen_sub->();
+      }
+
+      if ( $func eq 's' ) {
+         # Decide how wide columns should be.
+         my $num_cols = scalar(@$visible);
+         my $width    = $opts{n} ? 0 : max($min_width, int(($this_term_size[0] - $num_cols + 1) / $num_cols));
+         my $g_format = $opts{n} ? ( "%s\t" x $num_cols ) : ( "%-${width}s " x $num_cols ); # NOTE(review): $g_format is never used below.
+
+         # Print headers every now and then.  Headers can get really long, so compact them.
+         my @hdr = @$visible;
+         if ( $opts{n} ) {
+            if ( $lines_printed == 0 ) {
+               print join("\t", @hdr), "\n";
+               $lines_printed++;
+            }
+         }
+         elsif ( $lines_printed == 0 || $lines_printed > $this_term_size[1] - 2 ) { # First output, or about a screenful since the last header.
+            @hdr = map { donut(crunch($_, $width), $width) } @hdr;
+            print join(' ', map { sprintf( "%${width}s", donut($_, $width)) } @hdr) . "\n";
+            $lines_printed = 1;
+         }
+
+         # Design a column format for the values.
+         my $format
+            = $opts{n}
+            ? join("\t", map { '%s' } @$visible) . "\n"
+            : join(' ',  map { "%${width}s" } @hdr) . "\n";
+
+         foreach my $row ( @var_status ) {
+            printf($format, map { defined $_ ? $_ : '' } @{$row}{ @$visible }); # Undefined values print as empty strings.
+            $lines_printed++;
+         }
+      }
+      else { # 'g' mode
+         # Design a column format for the values.
+         my $num_cols = scalar(@$visible);
+         my $width    = $opts{n} ? 0 : int(($this_term_size[0] - $num_cols + 1) / $num_cols);
+         my $format   = $opts{n} ? ( "%s\t" x $num_cols ) : ( "%-${width}s " x $num_cols );
+         $format      =~ s/\s$/\n/; # Turn the trailing separator into a newline.
+
+         # Print headers every now and then.
+         if ( $opts{n} ) {
+            if ( $lines_printed == 0 ) {
+               print join("\t", @$visible), "\n";
+               print join("\t", map { shorten($mvs{$_}) } @$visible), "\n";
+            }
+         }
+         elsif ( $lines_printed == 0 || $lines_printed > $this_term_size[1] - 2 ) {
+            printf($format, map { donut(crunch($_, $width), $width) } @$visible);
+            printf($format, map { shorten($mvs{$_} || 0) } @$visible);
+            $lines_printed = 2;
+         }
+
+         # Update the max ever seen, and scale by the max ever seen.
+         my $set = $var_status[0];
+         foreach my $col ( @$visible ) {
+            $set->{$col}  = 1 unless defined $set->{$col} && $set->{$col} =~ m/$num_regex/; # Values not matching $num_regex become 1.
+            $set->{$col}  = ($set->{$col} || 1) / ($set->{Uptime_hires} || 1);
+            $mvs{$col}    = max($mvs{$col} || 1, $set->{$col});
+            $set->{$col} /= $mvs{$col}; # Now a fraction of the largest value seen so far.
+         }
+         printf($format, map { ( $config{graph_char}->{val} x int( $width * $set->{$_} )) || '.' } @$visible ); # An empty bar prints as '.'.
+         $lines_printed++;
+
+      }
+   }
+   else { # 'v'
+      my $first_table = 0;
+      my @display_lines;
+      foreach my $tbl ( @visible ) {
+         push @display_lines, '', set_to_tbl($rows_for{$tbl}, $tbl);
+         push @display_lines, get_cxn_errors(@cxns)
+            if ( $config{debug}->{val} || !$first_table++ );
+      }
+      $clear_screen_sub->();
+      draw_screen( \@display_lines );
+   }
+}
+
+# display_explain {{{3
+sub display_explain { # Runs EXPLAIN on $info->{query} (rewritten if needed) and draws one captioned table per result row.
+   my $info = shift;
+   my $cxn   = $info->{cxn};
+   my $db    = $info->{db};
+
+   my ( $mods, $query ) = rewrite_for_explain($info->{query});
+
+   my @display_lines;
+
+   if ( $query ) {
+
+      my $part = version_ge($dbhs{$cxn}->{dbh}, '5.1.5') ? 'PARTITIONS' : ''; # PARTITIONS is only added from version 5.1.5 on.
+      $query = "EXPLAIN $part\n" . $query;
+
+      eval {
+         if ( $db ) {
+            do_query($cxn, "use $db");
+         }
+         my $sth = do_query($cxn, $query);
+
+         my $res;
+         while ( $res = $sth->fetchrow_hashref() ) {
+            map { $res->{$_} = '' unless defined $res->{$_} } ( 'partitions', keys %$res); # Blank only undef (NULL) values; keep 0.
+            my @this_table = create_caption("Sub-Part $res->{id}",
+               create_table2(
+                  $tbl_meta{explain}->{visible},
+                  meta_to_hdr('explain'),
+                  extract_values($res, $res, $res, 'explain')));
+            @display_lines = stack_next(\@display_lines, \@this_table, { pad => '  ', vsep => 2 });
+         }
+      };
+
+      if ( $EVAL_ERROR ) { # Any failure inside the eval is reported with this generic message.
+         push @display_lines,
+            '',
+            "The query could not be explained.  Only SELECT queries can be "
+            . "explained; innotop tries to rewrite certain REPLACE and INSERT queries "
+            . "into SELECT, but this doesn't always succeed.";
+      }
+
+   }
+   else {
+      push @display_lines, '', 'The query could not be explained.';
+   }
+
+   if ( $mods ) {
+      push @display_lines, '', '[This query has been re-written to be explainable]';
+   }
+
+   unshift @display_lines, no_ctrl_char($query); # The statement that was sent goes at the top of the screen.
+   draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# rewrite_for_explain {{{3
+sub rewrite_for_explain {
+   # Rewrites INSERT/REPLACE ... SELECT and CREATE TABLE ... [AS] SELECT into a
+   # bare SELECT, and strips ON DUPLICATE KEY UPDATE.  Returns ( $mods, $query ).
+   my $query = shift;
+   my $mods  = 0;   # Number of rewrite rules that matched.
+   $mods += $query =~ s/^\s*(?:replace|insert).*?select/select/is;
+   $mods += $query =~ s/^
+      \s*create\s+(?:temporary\s+)?table
+      \s+\S+\s+(?:as\s+)?select/select/xis;   # AS is optional in CREATE TABLE ... SELECT
+   $mods += $query =~ s/\s+on\s+duplicate\s+key\s+update.*$//is;
+   return ( $mods, $query );
+}
+
+# show_optimized_query {{{3
+sub show_optimized_query { # Runs EXPLAIN EXTENDED on the (rewritten) query and draws each SHOW WARNINGS message.
+   my $info = shift;
+   my $cxn   = $info->{cxn};
+   my $db    = $info->{db};
+   my $meta  = $dbhs{$cxn};
+
+   my @display_lines;
+
+   my ( $mods, $query ) = rewrite_for_explain($info->{query});
+
+   if ( $mods ) {
+      push @display_lines, '[This query has been re-written to be explainable]';
+   }
+
+   if ( $query ) {
+      push @display_lines, no_ctrl_char($info->{query}); # The original text is shown, not the rewritten one.
+
+      eval {
+         if ( $db ) {
+            do_query($cxn, "use $db");
+         }
+         do_query( $cxn, 'EXPLAIN EXTENDED ' . $query ) or die "Can't explain query";
+         my $sth = do_query($cxn, 'SHOW WARNINGS');
+         my $res = $sth->fetchall_arrayref({});
+
+         if ( $res && @$res ) { # An empty result set also counts as "nothing generated".
+            foreach my $result ( @$res ) {
+               push @display_lines, 'Note:', no_ctrl_char($result->{message});
+            }
+         }
+         else {
+            push @display_lines, '', 'The query optimization could not be generated.';
+         }
+      };
+
+      if ( $EVAL_ERROR ) {
+         push @display_lines, '', "The optimization could not be generated: $EVAL_ERROR";
+      }
+
+   }
+   else {
+      push @display_lines, '', 'The query optimization could not be generated.';
+   }
+
+   draw_screen(\@display_lines, { raw => 1 } );
+}
+
+# display_help {{{3
+sub display_help { # Draws the key map for the current mode, waits via pause(), then clears the screen.
+   my $mode = $config{mode}->{val};
+
+   # Get globally mapped keys, then overwrite them with mode-specific ones.
+   my %keys = map {
+         $_ => $action_for{$_}->{label}
+      } keys %action_for;
+   foreach my $key ( keys %{$modes{$mode}->{action_for}} ) {
+      $keys{$key} = $modes{$mode}->{action_for}->{$key}->{label};
+   }
+   delete $keys{'?'}; # The '?' key is left out of the listing.
+
+   # Split them into three kinds of keys: MODE keys, action keys, and
+   # magic (special character) keys.
+   my @modes   = sort grep { m/[A-Z]/   } keys %keys;
+   my @actions = sort grep { m/[a-z]/   } keys %keys;
+   my @magic   = sort grep { m/[^A-Z]/i } keys %keys; # Keys containing a character that is not a letter.
+
+   my @display_lines = ( '', 'Switch to a different mode:' );
+
+   # Mode keys
+   my @all_modes = map { "$_  $modes{$_}->{hdr}" } @modes;
+   my @col1 = splice(@all_modes, 0, ceil(@all_modes/3)); # First third of the list,
+   my @col2 = splice(@all_modes, 0, ceil(@all_modes/2)); # then half of what is left; the rest stays as column three.
+   my $max1 = max(map {length($_)} @col1);
+   my $max2 = max(map {length($_)} @col2);
+   while ( @col1 ) {
+      push @display_lines, sprintf("   %-${max1}s  %-${max2}s  %s",
+         (shift @col1      || ''),
+         (shift @col2      || ''),
+         (shift @all_modes || ''));
+   }
+
+   # Action keys
+   my @all_actions = map { "$_  $keys{$_}" } @actions;
+   @col1 = splice(@all_actions, 0, ceil(@all_actions/2)); # Two columns: first half, then the remainder.
+   $max1 = max(map {length($_)} @col1);
+   push @display_lines, '', 'Actions:';
+   while ( @col1 ) {
+      push @display_lines, sprintf("   %-${max1}s  %s",
+         (shift @col1        || ''),
+         (shift @all_actions || ''));
+   }
+
+   # Magic keys
+   my @all_magic = map { sprintf('%4s', $action_for{$_}->{key} || $_) . "  $keys{$_}" } @magic; # Uses the action's {key} text when it has one.
+   @col1 = splice(@all_magic, 0, ceil(@all_magic/2));
+   $max1 = max(map {length($_)} @col1);
+   push @display_lines, '', 'Other:';
+   while ( @col1 ) {
+      push @display_lines, sprintf("%-${max1}s%s",
+         (shift @col1      || ''),
+         (shift @all_magic || ''));
+   }
+
+   $clear_screen_sub->();
+   draw_screen(\@display_lines, { show_all => 1 } );
+   pause();
+   $clear_screen_sub->();
+}
+
+# show_full_query {{{3
+sub show_full_query {
+   # Draws the query text from $info, passed through no_ctrl_char(), in raw mode.
+   my ( $info ) = @_;
+   draw_screen([ no_ctrl_char($info->{query}) ], { raw => 1 });
+}
+
+# Formatting functions {{{2
+
+# create_table2 {{{3
+# Makes a two-column table, labels on left, data on right.
+# Takes refs of @cols, %labels and %data, %user_prefs
+sub create_table2 { # Returns one formatted "label<sep> value" string per column, or an empty list.
+   my ( $cols, $labels, $data, $user_prefs ) = @_;
+   my @rows;
+
+   if ( @$cols && %$data ) { # Nothing is produced without both columns and data.
+
+      # Override defaults
+      my $p = {
+         just  => '',
+         sep   => ':',
+         just1 => '-',
+      };
+      if ( $user_prefs ) {
+         map { $p->{$_} = $user_prefs->{$_} } keys %$user_prefs;
+      }
+
+      # Fix undef values
+      map { $data->{$_} = '' unless defined $data->{$_} } @$cols; # This writes into the caller's %data.
+
+      # Format the table
+      my $max_l = max(map{ length($labels->{$_}) } @$cols);
+      my $max_v = max(map{ length($data->{$_}) } @$cols);
+      my $format    = "%$p->{just}${max_l}s$p->{sep} %$p->{just1}${max_v}s"; # just applies to the label, just1 to the value.
+      foreach my $col ( @$cols ) {
+         push @rows, sprintf($format, $labels->{$col}, $data->{$col});
+      }
+   }
+   return @rows;
+}
+
+# stack_next {{{3
+# Stacks one display section next to the other.  Accepts left-hand arrayref,
+# right-hand arrayref, and options hashref.  Tries to stack as high as
+# possible, so
+# aaaaaa
+# bbb
+# can stack ccc next to the bbb.
+# NOTE: this DOES modify its arguments, even though it returns a new array.
+sub stack_next { # Returns the combined lines; @$left is modified when the two sections are joined side by side.
+   my ( $left, $right, $user_prefs ) = @_;
+   my @result;
+
+   my $p = { # Defaults; any key in $user_prefs overrides them.
+      pad   => ' ',
+      vsep  => 0,
+   };
+   if ( $user_prefs ) {
+      map { $p->{$_} = $user_prefs->{$_} } keys %$user_prefs;
+   }
+
+   # Find out how wide the LHS can be and still let the RHS fit next to it.
+   my $pad   = $p->{pad};
+   my $max_r = max( map { length($_) } @$right) || 0;
+   my $max_l = $this_term_size[0] - $max_r - length($pad);
+
+   # Find the minimum row on the LHS that the RHS will fit next to.
+   my $i = scalar(@$left) - 1;
+   while ( $i >= 0 && length($left->[$i]) <= $max_l ) { # Walk up from the bottom while the rows are narrow enough.
+      $i--;
+   }
+   $i++;
+   my $offset = $i; # Index of the LHS row that lines up with the first RHS row.
+
+   if ( $i < scalar(@$left) ) {
+      # Find the max width of the section of the LHS against which the RHS
+      # will sit.
+      my $max_i_in_common = min($i + scalar(@$right) - 1, scalar(@$left) - 1);
+      my $max_width = max( map { length($_) } @{$left}[$i..$max_i_in_common]);
+
+      # Append the RHS onto the LHS until one runs out.
+      while ( $i < @$left && $i - $offset < @$right ) {
+         my $format = "%-${max_width}s$pad%${max_r}s";
+         $left->[$i] = sprintf($format, $left->[$i], $right->[$i - $offset]);
+         $i++;
+      }
+      while ( $i - $offset < @$right ) {
+         # There is more RHS to push on the end of the array
+         push @$left,
+            sprintf("%${max_width}s$pad%${max_r}s", ' ', $right->[$i - $offset]);
+         $i++;
+      }
+      push @result, @$left;
+   }
+   else {
+      # There is no room to put them side by side.  Add them below, with
+      # a blank line above them if specified.
+      push @result, @$left;
+      push @result, (' ' x $this_term_size[0]) if $p->{vsep} && @$left; # The separator is a full-width row of spaces.
+      push @result, @$right;
+   }
+   return @result;
+}
+
+# create_caption {{{3
+sub create_caption { # Returns @rows with a caption row (or two) put in front; an empty @rows is returned unchanged.
+   my ( $caption, @rows ) = @_;
+   if ( @rows ) {
+
+      # Calculate the width of what will be displayed, so it can be centered
+      # in that space.  When the thing is wider than the display, center the
+      # caption in the display.
+      my $width = min($this_term_size[0], max(map { length(ref($_) ? $_->[0] : $_) } @rows)); # A row may be a string or a [ text, color ] pair.
+
+      my $cap_len = length($caption);
+
+      # It may be narrow enough to pad the sides with underscores and save a
+      # line on the screen.
+      if ( $cap_len <= $width - 6 ) {
+         my $left = int(($width - 2 - $cap_len) / 2); # The 2 accounts for the spaces around the caption.
+         unshift @rows,
+            ("_" x $left) . " $caption " . ("_" x ($width - $left - $cap_len - 2));
+      }
+
+      # The caption is too wide to add underscores on each side.
+      else {
+
+         # Color is supported, so we can use terminal underlining.
+         if ( $config{color}->{val} ) {
+            my $left = int(($width - $cap_len) / 2);
+            unshift @rows, [
+               (" " x $left) . $caption . (" " x ($width - $left - $cap_len)),
+               'underline',
+            ];
+         }
+
+         # Color is not supported, so we have to add a line underneath to separate the
+         # caption from whatever it's captioning.
+         else {
+            my $left = int(($width - $cap_len) / 2);
+            unshift @rows, ('-' x $width); # Unshifted first so that it ends up below the caption.
+            unshift @rows, (" " x $left) . $caption . (" " x ($width - $left - $cap_len));
+         }
+
+         # The caption is wider than the thing it labels, so we have to pad the
+         # thing it labels to a consistent width.
+         if ( $cap_len > $width ) {
+            @rows = map {
+               ref($_)
+                  ? [ sprintf('%-' . $cap_len . 's', $_->[0]), $_->[1] ]
+                  : sprintf('%-' . $cap_len . 's', $_);
+            } @rows;
+         }
+
+      }
+   }
+   return @rows;
+}
+
+# create_table {{{3
+# Input: an arrayref of columns, hashref of col info, and an arrayref of hashes
+# Example: [ 'a', 'b' ]
+#          { a => spec, b => spec }
+#          [ { a => 1, b => 2}, { a => 3, b => 4 } ]
+# The 'spec' is a hashref of hdr => label, just => ('-' or '').  It also supports min and max-widths
+# via the minw and maxw params.
+# Output: an array of strings, one per row.
+# Example:
+# Column One Column Two
+# ---------- ----------
+# 1          2
+# 3          4
+sub create_table { # Returns the table's lines; a line is a string, or a [ string, color ] pair when colored.
+   my ( $cols, $info, $data, $prefs ) = @_;
+   $prefs ||= {};
+   $prefs->{no_hdr} ||= ($opts{n} && $clock != 1); # Written into the caller's $prefs hash when one was passed.
+
+   # Truncate rows that will surely be off screen even if this is the only table.
+   if ( !$opts{n} && !$prefs->{raw} && !$prefs->{show_all} && $this_term_size[1] < @$data-1 ) {
+      $data = [ @$data[0..$this_term_size[1] - 1] ];
+   }
+
+   my @rows = ();
+
+   if ( @$cols && %$info ) {
+
+      # Fix undef values, collapse whitespace.
+      foreach my $row ( @$data ) {
+         map { $row->{$_} = collapse_ws($row->{$_}) } @$cols; # The row hashes themselves are modified.
+      }
+
+      my $col_sep = $opts{n} ? "\t" : '  ';
+
+      # Find each column's max width.
+      my %width_for;
+      if ( !$opts{n} ) { # Widths are not computed when $opts{n} is set.
+         %width_for = map {
+            my $col_name  = $_;
+            if ( $info->{$_}->{dec} ) {
+               # Align along the decimal point
+               my $max_rodp = max(0, map { $_->{$col_name} =~ m/([^\s\d-].*)$/ ? length($1) : 0 } @$data);
+               foreach my $row ( @$data ) {
+                  my $col = $row->{$col_name};
+                  my ( $l, $r ) = $col =~ m/^([\s\d]*)(.*)$/; # $l: leading digits and spaces; $r: everything after.
+                  $row->{$col_name} = sprintf("%s%-${max_rodp}s", $l, $r);
+               }
+            }
+            my $max_width = max( length($info->{$_}->{hdr}), map { length($_->{$col_name}) } @$data);
+            if ( $info->{$col_name}->{maxw} ) {
+               $max_width = min( $max_width, $info->{$col_name}->{maxw} );
+            }
+            if ( $info->{$col_name}->{minw} ) { # Applied after maxw, so minw wins if the two conflict.
+               $max_width = max( $max_width, $info->{$col_name}->{minw} );
+            }
+            $col_name => $max_width;
+         } @$cols;
+      }
+
+      # The table header.
+      if ( !$config{hide_hdr}->{val} && !$prefs->{no_hdr} ) {
+         push @rows, $opts{n}
+            ? join( $col_sep, @$cols )
+            : join( $col_sep, map { sprintf( "%-$width_for{$_}s", trunc($info->{$_}->{hdr}, $width_for{$_}) ) } @$cols );
+         if ( $config{color}->{val} && $config{header_highlight}->{val} ) {
+            push @rows, [ pop @rows, $config{header_highlight}->{val} ]; # Re-push the header paired with its highlight.
+         }
+         elsif ( !$opts{n} ) {
+            push @rows, join( $col_sep, map { "-" x $width_for{$_} } @$cols );
+         }
+      }
+
+      # The table data.
+      if ( $opts{n} ) {
+         foreach my $item ( @$data ) {
+            push @rows, join($col_sep, map { $item->{$_} } @$cols );
+         }
+      }
+      else {
+         my $format = join( $col_sep,
+            map { "%$info->{$_}->{just}$width_for{$_}s" } @$cols );
+         foreach my $item ( @$data ) {
+            my $row = sprintf($format, map { trunc($item->{$_}, $width_for{$_}) } @$cols );
+            if ( $config{color}->{val} && $item->{_color} ) {
+               push @rows, [ $row, $item->{_color} ];
+            }
+            else {
+               push @rows, $row;
+            }
+         }
+      }
+   }
+
+   return @rows;
+}
+
+# Collapses rows into one row per group.  $tbl names the table in %tbl_meta,
+# $group_by is an arrayref of grouping columns (an empty arrayref puts every
+# row into a single group), and the remaining arguments are the row hashrefs.
+# Returns the aggregated rows, ordered by their grouping key.
+sub apply_group_by {
+   my ( $tbl, $group_by, @rows ) = @_;
+   my $meta = $tbl_meta{$tbl};
+
+   # Every column that is not part of the grouping key gets aggregated.
+   my %grouped;
+   $grouped{$_} = 1 for @$group_by;
+   my @agg_cols = grep { !$grouped{$_} } keys %{$meta->{cols}};
+
+   # Bucket the rows by grouping key.
+   my %bucket_for;
+   for my $row ( @rows ) {
+      my $key = '';
+      if ( @$group_by ) {
+         $key = '{' . join('}{', map { defined $_ ? $_ : '' } @{$row}{@$group_by}) . '}';
+      }
+      push @{ $bucket_for{$key} ||= [] }, $row;
+   }
+
+   # Build one output row per bucket: grouping columns come from the bucket's
+   # first row, everything else from the column's aggregate function
+   # (defaulting to 'first').
+   my @merged_rows;
+   for my $key ( sort keys %bucket_for ) {
+      my $members = $bucket_for{$key};
+      my %merged;
+      @merged{@$group_by} = @{$members->[0]}{@$group_by};
+      for my $col ( @agg_cols ) {
+         my $how = $meta->{cols}->{$col}->{agg} || 'first';
+         $merged{$col} = $agg_funcs{$how}->( map { $_->{$col} } @$members );
+      }
+      push @merged_rows, \%merged;
+   }
+   return @merged_rows;
+}
+
+# set_to_tbl {{{3
+# Turns a set of rows into the formatted lines for one table: filters, sorts,
+# groups, colorizes, transforms and optionally pivots the rows, then hands them
+# to create_table().  Event listeners are notified before each stage.
+#
+# Arguments:
+#   $rows  arrayref of row hashrefs.  ALTERED IN PLACE (filtered and sorted)
+#          when the table is not pivoted.
+#   $tbl   name of the table; must be a key of %tbl_meta, otherwise this dies.
+# Returns the list of display lines built by create_table(), with a caption
+# added by create_caption() when captions are enabled.
+# TODO: pull all the little pieces out into subroutines and stick events in each of them.
+sub set_to_tbl {
+   my ( $rows, $tbl ) = @_;
+   my $meta = $tbl_meta{$tbl} or die "No such table $tbl in tbl_meta";
+
+   if ( !$meta->{pivot} ) {
+
+      # Hook in event listeners
+      foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_filter}} ) {
+         $listener->set_to_tbl_pre_filter($rows, $tbl);
+      }
+
+      # Apply filters.  Note that if the table is pivoted, filtering and sorting
+      # are applied later.  Filter errors are only fatal in debug mode.
+      foreach my $filter ( @{$meta->{filters}} ) {
+         eval {
+            @$rows = grep { $filters{$filter}->{func}->($_) } @$rows;
+         };
+         if ( $EVAL_ERROR && $config{debug}->{val} ) {
+            die $EVAL_ERROR;
+         }
+      }
+
+      foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_sort}} ) {
+         $listener->set_to_tbl_pre_sort($rows, $tbl);
+      }
+
+      # Sort.  Note that if the table is pivoted, sorting might have the wrong
+      # columns and it could crash.  This will only be an issue if it's possible
+      # to toggle pivoting on and off, which it's not at the moment.
+      # Aggregated tables are sorted after grouping instead.
+      if ( @$rows && $meta->{sort_func} && !$meta->{aggregate} ) {
+         if ( $meta->{sort_dir} > 0 ) {
+            @$rows = $meta->{sort_func}->( @$rows );
+         }
+         else {
+            @$rows = reverse $meta->{sort_func}->( @$rows );
+         }
+      }
+
+   }
+
+   # Stop altering arguments now.  From here on, work on the private copy
+   # @rows, never on the caller's $rows.
+   my @rows = @$rows;
+
+   foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_group}} ) {
+      $listener->set_to_tbl_pre_group(\@rows, $tbl);
+   }
+
+   # Apply group-by.
+   if ( $meta->{aggregate} ) {
+      @rows = apply_group_by($tbl, $meta->{group_by}, @rows);
+
+      # Sort.  Note that if the table is pivoted, sorting might have the wrong
+      # columns and it could crash.  This will only be an issue if it's possible
+      # to toggle pivoting on and off, which it's not at the moment.
+      if ( @rows && $meta->{sort_func} ) {
+         if ( $meta->{sort_dir} > 0 ) {
+            @rows = $meta->{sort_func}->( @rows );
+         }
+         else {
+            @rows = reverse $meta->{sort_func}->( @rows );
+         }
+      }
+
+   }
+
+   foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_colorize}} ) {
+      $listener->set_to_tbl_pre_colorize(\@rows, $tbl);
+   }
+
+   if ( !$meta->{pivot} ) {
+      # Colorize.  Adds a _color column to rows.  A failing color function is
+      # reported via pause() rather than being fatal.
+      if ( @rows && $meta->{color_func} ) {
+         eval {
+            foreach my $row ( @rows ) {
+               $row->{_color} = $meta->{color_func}->($row);
+            }
+         };
+         if ( $EVAL_ERROR ) {
+            pause($EVAL_ERROR);
+         }
+      }
+   }
+
+   foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_transform}} ) {
+      $listener->set_to_tbl_pre_transform(\@rows, $tbl);
+   }
+
+   # Apply transformations.  The column list is taken from the rows that are
+   # actually being transformed (@rows); after aggregation these are no longer
+   # the rows the caller passed in.
+   if ( @rows ) {
+      my $cols = $meta->{cols};
+      foreach my $col ( keys %{$rows[0]} ) {
+         # Don't auto-vivify $tbl_meta{tbl}-{cols}->{_color}->{trans}
+         next if $col eq '_color';
+         foreach my $trans ( @{$cols->{$col}->{trans}} ) {
+            map { $_->{$col} = $trans_funcs{$trans}->($_->{$col}) } @rows;
+         }
+      }
+   }
+
+   my ($fmt_cols, $fmt_meta);
+
+   # Pivot.
+   if ( $meta->{pivot} ) {
+
+      foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_pivot}} ) {
+         $listener->set_to_tbl_pre_pivot(\@rows, $tbl);
+      }
+
+      # Each visible column becomes an output row named after the column, and
+      # each input row becomes an output column called set_N.  Iterate over
+      # @rows, the same array that is indexed below, so the number of set_N
+      # columns always matches the rows that exist at this point.
+      my @vars = @{$meta->{visible}};
+      my @tmp  = map { { name => $_ } } @vars;
+      my @cols = 'name';
+      foreach my $i ( 0..$#rows ) {
+         my $col = "set_$i";
+         push @cols, $col;
+         foreach my $j ( 0..@vars-1 ) {
+            $tmp[$j]->{$col} = $rows[$i]->{$vars[$j]};
+         }
+      }
+      $fmt_meta = { map { $_ => { hdr => $_, just => '-' } } @cols };
+      $fmt_cols = \@cols;
+      @rows = @tmp;
+
+      # Hook in event listeners
+      # NOTE(review): these two listener calls receive the caller's original
+      # $rows, not the pivoted @rows that is filtered and sorted below.  Left
+      # unchanged because listeners may rely on it -- confirm the intent.
+      foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_filter}} ) {
+         $listener->set_to_tbl_pre_filter($rows, $tbl);
+      }
+
+      # Apply filters.
+      foreach my $filter ( @{$meta->{filters}} ) {
+         eval {
+            @rows = grep { $filters{$filter}->{func}->($_) } @rows;
+         };
+         if ( $EVAL_ERROR && $config{debug}->{val} ) {
+            die $EVAL_ERROR;
+         }
+      }
+
+      foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_sort}} ) {
+         $listener->set_to_tbl_pre_sort($rows, $tbl);
+      }
+
+      # Sort.
+      if ( @rows && $meta->{sort_func} ) {
+         if ( $meta->{sort_dir} > 0 ) {
+            @rows = $meta->{sort_func}->( @rows );
+         }
+         else {
+            @rows = reverse $meta->{sort_func}->( @rows );
+         }
+      }
+
+   }
+   else {
+      # If the table isn't pivoted, just show all columns that are supposed to
+      # be shown; but eliminate aggonly columns if the table isn't aggregated.
+      my $aggregated = $meta->{aggregate};
+      $fmt_cols = [ grep { $aggregated || !$meta->{cols}->{$_}->{aggonly} } @{$meta->{visible}} ];
+      $fmt_meta = { map  { $_ => $meta->{cols}->{$_}                      } @$fmt_cols };
+
+      # If the table is aggregated, re-order the group_by columns to the left of
+      # the display.
+      if ( $aggregated ) {
+         my %is_group = map { $_ => 1 } @{$meta->{group_by}};
+         $fmt_cols = [ @{$meta->{group_by}}, grep { !$is_group{$_} } @$fmt_cols ];
+      }
+   }
+
+   foreach my $listener ( @{$event_listener_for{set_to_tbl_pre_create}} ) {
+      $listener->set_to_tbl_pre_create(\@rows, $tbl);
+   }
+
+   # From here on @rows holds display lines, not row hashrefs.
+   @rows = create_table( $fmt_cols, $fmt_meta, \@rows);
+   if ( !$meta->{hide_caption} && !$opts{n} && $config{display_table_captions}->{val} ) {
+      @rows = create_caption($meta->{capt}, @rows)
+   }
+
+   foreach my $listener ( @{$event_listener_for{set_to_tbl_post_create}} ) {
+      $listener->set_to_tbl_post_create(\@rows, $tbl);
+   }
+
+   return @rows;
+}
+
+# meta_to_hdr {{{3
+# Returns a hashref mapping each visible column of table $tbl to the header
+# text stored for it in %tbl_meta.
+sub meta_to_hdr {
+   my ( $tbl ) = @_;
+   my $meta = $tbl_meta{$tbl};
+   my %hdr_for;
+   foreach my $col ( @{$meta->{visible}} ) {
+      $hdr_for{$col} = $meta->{cols}->{$col}->{hdr};
+   }
+   return \%hdr_for;
+}
+
+# commify {{{3
+# Inserts thousands separators into a number (regex from perlfaq5).  An
+# undefined argument is treated as 0.
+sub commify {
+   my $text = shift;
+   $text = 0 if !defined $text;
+   $text =~ s/(^[-+]?\d+?(?=(?>(?:\d{3})+)(?!\d))|\G\d{3}(?=\d))/$1,/g;
+   return $text;
+}
+
+# set_precision {{{3
+# Formats $num with $precision digits after the decimal point.  When
+# $precision is not given, the num_digits configuration value is used.
+sub set_precision {
+   my ( $num, $precision ) = @_;
+   if ( !defined $precision ) {
+      $precision = $config{num_digits}->{val};
+   }
+   return sprintf("%.${precision}f", $num);
+}
+
+# percent {{{3
+# Formats a fraction as a percentage with num_digits decimals, appending a
+# '%' sign when the show_percent option is on.  Undefined input counts as 0.
+sub percent {
+   my ( $num ) = @_;
+   my $digits = $config{num_digits}->{val};
+   my $text   = sprintf("%.${digits}f", ( defined $num ? $num : 0 ) * 100);
+   $text .= '%' if $config{show_percent}->{val};
+   return $text;
+}
+
+# shorten {{{3
+# Scales a number down by powers of 1024 and appends a unit suffix
+# (k, M, G, T).
+#
+# Arguments:
+#   $num   the value to shorten.
+#   $opts  optional hashref:
+#            pad        => suffix used when no scaling was needed (default '')
+#            num_digits => decimals to print (default: num_digits config value)
+#            force      => if defined, always print decimals and suffix
+# Returns $num unchanged when it is undefined, when the global -n option
+# ($opts{n}) is set, or when it does not match $num_regex.
+sub shorten {
+   my ( $num, $opts ) = @_;
+
+   # Note: $opts{n} is the global command-line option hash, not the $opts
+   # hashref argument.
+   return $num if !defined($num) || $opts{n} || $num !~ m/$num_regex/;
+
+   $opts ||= {};
+   my $pad = defined $opts->{pad} ? $opts->{pad} : '';
+   my $num_digits = defined $opts->{num_digits}
+      ? $opts->{num_digits}
+      : $config{num_digits}->{val};
+   my $force = defined $opts->{force};
+
+   # Stop scaling at the largest known unit so that huge values keep a valid
+   # suffix instead of indexing past the end of the list.
+   my @units = ( $pad, 'k', 'M', 'G', 'T' );
+   my $n = 0;
+   while ( $num >= 1_024 && $n < $#units ) {
+      $num /= 1_024;
+      ++$n;
+   }
+
+   if ( $num =~ m/\./ || $n || $force ) {
+      return sprintf("%.${num_digits}f%s", $num, $units[$n]);
+   }
+   return sprintf('%d', $num);
+
+}
+
+# Utility functions {{{2
+# unique {{{3
+# Returns its arguments with duplicates removed, keeping the first occurrence
+# of each value in its original position.
+sub unique {
+   my ( %seen, @firsts );
+   foreach my $item ( @_ ) {
+      next if $seen{$item}++;
+      push @firsts, $item;
+   }
+   return @firsts;
+}
+
+# make_color_func {{{3
+# Compiles the color rules of a table ($tbl->{colors}) into one subroutine.
+# Each rule is a hashref with col, op, arg and color; rules whose op is not a
+# key of %comp_ops are ignored.  The returned sub takes a row hashref and
+# returns the color of the first rule that matches.  Returns undef when the
+# table has no usable rules; dies with the compiler's message when the
+# generated code does not compile.
+sub make_color_func {
+   my ( $tbl ) = @_;
+   my @criteria;
+   foreach my $spec ( @{$tbl->{colors}} ) {
+      next unless exists $comp_ops{$spec->{op}};
+      my $val;
+      if ( $spec->{op} =~ m/^(?:eq|ne|le|ge|lt|gt)$/ ) {
+         # String comparison: the argument becomes a single-quoted literal in
+         # the generated code, so quotes and backslashes must be escaped or an
+         # argument such as o'brien would break the eval below.
+         my $arg = defined $spec->{arg} ? $spec->{arg} : '';
+         $arg =~ s/([\\'])/\\$1/g;
+         $val = "'$arg'";
+      }
+      elsif ( $spec->{op} =~ m/^(?:=~|!~)$/ ) {
+         # Pattern match: the argument is matched literally.
+         $val = "m/" . quotemeta($spec->{arg}) . "/";
+      }
+      else {
+         # Numeric comparison: the argument is used as-is.
+         $val = $spec->{arg};
+      }
+      push @criteria,
+         "( defined \$set->{$spec->{col}} && \$set->{$spec->{col}} $spec->{op} $val ) { return '$spec->{color}'; }";
+   }
+   return undef unless @criteria;
+   my $sub = eval 'sub { my ( $set ) = @_; if ' . join(" elsif ", @criteria) . '}';
+   die $EVAL_ERROR if $EVAL_ERROR;
+   return $sub;
+}
+
+# make_sort_func {{{3
+# Builds a sorting subroutine from the table's sort_cols string, e.g.
+# "+cxn -time": a leading '-' sorts that column descending, '+' or nothing
+# sorts it ascending.  The returned sub takes a list of row hashrefs and
+# returns them sorted; if no usable sort column is found it returns its
+# arguments unchanged.
+sub make_sort_func {
+   my ( $tbl ) = @_;
+   my @terms;
+
+   # Pivoted tables can be sorted by 'name' and set_x columns; others must be
+   # sorted by existing columns.  TODO: this will crash if you toggle between
+   # pivoted and nonpivoted.  I have several other 'crash' notes about this if
+   # this ever becomes possible.
+   my $pivoted = $tbl->{pivot};
+
+   foreach my $spec ( split(/\s+/, $tbl->{sort_cols} ) ) {
+      next unless $spec;
+      my ( $dir, $name ) = $spec =~ m/([+-])?(\w+)$/;
+
+      my ( $op, $df );
+      if ( $pivoted ) {
+         # Sort type is not really possible on pivoted columns, because a
+         # 'column' contains data from an entire non-pivoted row, so there
+         # could be a mix of numeric and non-numeric data.  Thus everything
+         # has to be 'cmp' type.
+         next unless $name && $name =~ m/^(?:name|set_\d+)$/;
+         ( $op, $df ) = ( 'cmp', "''" );
+      }
+      else {
+         next unless $name && $tbl->{cols}->{$name};
+         ( $op, $df ) = $tbl->{cols}->{$name}->{num}
+            ? ( '<=>', '0'  )
+            : ( 'cmp', "''" );
+      }
+
+      # Descending order is expressed by swapping $a and $b.
+      my ( $lhs, $rhs ) = ( !$dir || $dir eq '+' ) ? ( 'a', 'b' ) : ( 'b', 'a' );
+      push @terms, sprintf('($%s->{%s} || %s) %s ($%s->{%s} || %s)',
+         $lhs, $name, $df, $op, $rhs, $name, $df);
+   }
+
+   return sub { return @_ } unless @terms;
+   my $sub = eval 'sub { sort {' . join("||", @terms) . '} @_; }';
+   die if $EVAL_ERROR;
+   return $sub;
+}
+
+# trunc {{{3
+# Cuts $text down to at most $len characters; shorter text is returned as-is.
+sub trunc {
+   my ( $text, $len ) = @_;
+   return length($text) <= $len ? $text : substr($text, 0, $len);
+}
+
+# donut {{{3
+# Shortens text towards $len characters by cutting a piece out of the middle
+# instead of the end.  Text that already fits is returned unchanged.
+sub donut {
+   my ( $text, $len ) = @_;
+   my $excess = length($text) - $len;   # how many characters are too many
+   return $text if $excess <= 0;
+   my $almost = $excess - 1;
+
+   # Try to remove a single "word" from somewhere in the center
+   return $text if $text =~ s/_[^_]{$almost,$excess}_/_/;
+
+   # Prefer removing the end of a "word"
+   return $text if $text =~ s/([^_]+)[^_]{$excess}_/$1_/;
+
+   # Otherwise replace the middle of the text with a single underscore.
+   my $half = int($len/2);
+   return substr($text, 0, $half)
+        . "_"
+        . substr($text, $half + $excess + 1);
+}
+
+# crunch {{{3
+# Abbreviates text: strips a leading IB_xx_ prefix, drops vowels that do not
+# follow an underscore or space, and squeezes runs of a repeated character.
+# When $len is given and the text already fits, it is returned untouched.
+sub crunch {
+   my ( $text, $len ) = @_;
+   if ( !$len || length($text) > $len ) {
+      for ( $text ) {
+         s/^IB_\w\w_//;
+         s/(?<![_ ])[aeiou]//g;
+         s/(.)\1+/$1/g;
+      }
+   }
+   return $text;
+}
+
+# collapse_ws {{{3
+# Replaces every run of whitespace with one space.  Undefined input yields
+# the empty string.
+sub collapse_ws {
+   my $text = shift;
+   if ( defined $text ) {
+      $text =~ s/\s+/ /g;
+      return $text;
+   }
+   return '';
+}
+
+# Masks quoted strings that would upset the terminal.  Depending on the
+# charset configuration value, each single- or double-quoted string in $text
+# is handled as follows:
+#   'unicode'  replaced by [BINARY] if it contains a character matching \p{IsC}
+#   'none'     always replaced by [TEXT]
+#   otherwise  (ascii) replaced by [BINARY] if it contains a character outside
+#              the range \040-\176
+# Undefined input yields the empty string.
+sub no_ctrl_char {
+   my ( $text ) = @_;
+   return '' unless defined $text;
+   my $charset = $config{charset}->{val} || '';
+   if ( $charset eq 'unicode' ) {
+      $text =~ s/
+         ("(?:(?!(?<!\\)").)*"  # Double-quoted string
+         |'(?:(?!(?<!\\)').)*') # Or single-quoted string
+         /$1 =~ m#\p{IsC}# ? "[BINARY]" : $1/egx;
+   }
+   elsif ( $charset eq 'none' ) {
+      $text =~ s/
+         ("(?:(?!(?<!\\)").)*"
+         |'(?:(?!(?<!\\)').)*')
+         /[TEXT]/gx;
+   }
+   else { # The default is 'ascii'
+      $text =~ s/
+         ("(?:(?!(?<!\\)").)*"
+         |'(?:(?!(?<!\\)').)*')
+         /$1 =~ m#[^\040-\176]# ? "[BINARY]" : $1/egx;
+   }
+   return $text;
+}
+
+# word_wrap {{{3
+# Breaks text into lines of at most $width characters, splitting at
+# whitespace.  $width defaults to the first element of @this_term_size.
+sub word_wrap {
+   my ( $text, $width ) = @_;
+   $width = $this_term_size[0] unless $width;
+   $text =~ s/(.{0,$width})(?:\s+|$)/$1\n/g;
+   $text =~ s/ +$//mg;
+   return $text;
+}
+
+# draw_screen {{{3
+# Prints lines to the screen.  The first argument is an arrayref.  Each
+# element of the array is either a string or an arrayref.  If it's a string it
+# just gets printed.  If it's an arrayref, the first element is the string to
+# print, and the second is args to colored().
+#
+# The second argument is an optional hashref of preferences:
+#   clear     clear the screen even if the mode asks not to
+#   raw       print every line in full, without cutting to the terminal size
+#   show_all  print every line, but still cut each one to the terminal width
+# A status bar is prepended to the arrayref (altering it) unless running with
+# -n or with the status bar switched off.
+sub draw_screen {
+   my ( $display_lines, $prefs ) = @_;
+   if ( !$opts{n} && $config{show_statusbar}->{val} ) {
+      unshift @$display_lines, create_statusbar();
+   }
+
+   foreach my $listener ( @{$event_listener_for{draw_screen}} ) {
+      $listener->draw_screen($display_lines);
+   }
+
+   $clear_screen_sub->()
+      if $prefs->{clear} || !$modes{$config{mode}->{val}}->{no_clear_screen};
+   if ( $opts{n} || $prefs->{raw} ) {
+      my $num_lines = 0;
+      print join("\n",
+         map {
+            $num_lines++;
+            ref $_
+               ? colored($_->[0], $_->[1])
+               : $_;
+         }
+         grep { !$opts{n} || $_ } # Suppress empty lines
+         @$display_lines);
+      if ( $opts{n} && $num_lines ) {
+         print "\n";
+      }
+   }
+   else {
+      # $max_lines is a COUNT of lines (the slice below subtracts one to get
+      # the last index), so show_all must use the full line count; otherwise
+      # the final line would never be printed.
+      my $max_lines = $prefs->{show_all}
+         ? scalar(@$display_lines)
+         : min(scalar(@$display_lines), $this_term_size[1]);
+      print join("\n",
+         map {
+            ref $_
+               ? colored(substr($_->[0], 0, $this_term_size[0]), $_->[1])
+               : substr($_, 0, $this_term_size[0]);
+         } @$display_lines[0..$max_lines - 1]);
+   }
+}
+
+# secs_to_time {{{3
+# Formats a number of seconds as a clock string.  $fmt selects the layout:
+# 'd' gives D+HH:MM:SS, 'h' gives HH:MM:SS, anything else gives MM:SS.  When
+# $fmt is omitted the shortest layout that fits the value is chosen.  A false
+# or missing $secs yields '00:00'.
+sub secs_to_time {
+   my ( $secs, $fmt ) = @_;
+   return '00:00' unless $secs;
+
+   # Decide what format to use, if not given
+   if ( !$fmt ) {
+      $fmt = $secs >= 86_400 ? 'd'
+           : $secs >= 3_600  ? 'h'
+           :                   'm';
+   }
+
+   my $ss = $secs % 60;
+   my $mm = int(($secs % 3_600) / 60);
+   my $hh = int(($secs % 86_400) / 3_600);
+
+   if ( $fmt eq 'd' ) {
+      return sprintf("%d+%02d:%02d:%02d", int($secs / 86_400), $hh, $mm, $ss);
+   }
+   if ( $fmt eq 'h' ) {
+      return sprintf("%02d:%02d:%02d", $hh, $mm, $ss);
+   }
+   return sprintf("%02d:%02d", $mm, $ss);
+}
+
+# dulint_to_int {{{3
+# Converts a number that InnoDB prints as two space-separated integers
+# ("high low", e.g. transaction IDs) into one integer: low + high * $MAX_ULONG.
+# A false argument yields 0.
+sub dulint_to_int {
+   my ( $num ) = @_;
+   return 0 if !$num;
+   my ( $high, $low ) = $num =~ m/^(\d+) (\d+)$/;
+   return $high ? $low + ( $high * $MAX_ULONG ) : $low;
+}
+
+# create_statusbar {{{3
+# Builds the status bar line: the current mode's header (prefixed with [RO]
+# in read-only mode) on the left, and information about the monitored
+# connection(s) right-aligned in the remaining terminal width.
+# Returns the line as a plain string, or as [ $line, 'bold reverse' ] when
+# color is enabled.
+sub create_statusbar {
+   my $mode = $config{mode}->{val};
+   my @cxns = sort { $a cmp $b } get_connections();
+
+   my $modeline        = ( $config{readonly}->{val} ? '[RO] ' : '' )
+                         . $modes{$mode}->{hdr} . " (? for help)";
+   my $mode_width      = length($modeline);
+   # One column is reserved for the space between the two halves.
+   my $remaining_width = $this_term_size[0] - $mode_width - 1;
+   my $result;
+
+   # The thingie in top-right that says what we're monitoring.
+   my $cxn = '';
+
+   if ( 1 == @cxns && $dbhs{$cxns[0]} && $dbhs{$cxns[0]}->{dbh} ) {
+      # A single live connection: show the server version string.
+      $cxn = $dbhs{$cxns[0]}->{dbh}->{mysql_serverinfo} || '';
+   }
+   else {
+      if ( $modes{$mode}->{server_group} ) {
+         # A server group: show its name, plus (ok/total) when some of its
+         # connections have errors.
+         $cxn = "Servers: " . $modes{$mode}->{server_group};
+         my $err_count = grep { $dbhs{$_} && $dbhs{$_}->{err_count} } @cxns;
+         if ( $err_count ) {
+            $cxn .= "(" . ( scalar(@cxns) - $err_count ) . "/" . scalar(@cxns) . ")";
+         }
+      }
+      else {
+         # A plain list of connections: names of those already seen in %dbhs,
+         # each marked with '!' if it has errors.
+         $cxn = join(' ', map { ($dbhs{$_}->{err_count} ? '!' : '') . $_ }
+            grep { $dbhs{$_} } @cxns);
+      }
+   }
+
+   if ( 1 == @cxns ) {
+      # With exactly one connection there is room for uptime, QPS and
+      # thread count as well.
+      get_driver_status(@cxns);
+      my $vars = $vars{$cxns[0]}->{$clock};
+      my $inc  = inc(0, $cxns[0]);
+
+      # Format server uptime human-readably, calculate QPS...
+      my $uptime = secs_to_time( $vars->{Uptime_hires} );
+      my $qps    = ($inc->{Questions}||0) / ($inc->{Uptime_hires}||1);
+      my $ibinfo = '';
+
+      # InnoDB indicator: seconds value followed by a smiley that depends on
+      # the IB_got_all flag and, in T and W modes, on IB_tx_is_truncated.
+      if ( exists $vars->{IB_last_secs} ) {
+         $ibinfo .= "InnoDB $vars->{IB_last_secs}s ";
+         if ( $vars->{IB_got_all} ) {
+            if ( ($mode eq 'T' || $mode eq 'W')
+                  && $vars->{IB_tx_is_truncated} ) {
+               $ibinfo .= ':^|';
+            }
+            else {
+               $ibinfo .= ':-)';
+            }
+         }
+         else {
+            $ibinfo .= ':-(';
+         }
+      }
+      # Empty pieces are dropped by the grep before joining.
+      $result = sprintf(
+         "%-${mode_width}s %${remaining_width}s",
+         $modeline,
+         join(', ', grep { $_ } (
+            $cxns[0],
+            $uptime,
+            $ibinfo,
+            shorten($qps) . " QPS",
+            ($vars->{Threads} || 0) . " thd",
+            $cxn)));
+   }
+   else {
+      $result = sprintf(
+         "%-${mode_width}s %${remaining_width}s",
+         $modeline,
+         $cxn);
+   }
+
+   return $config{color}->{val} ? [ $result, 'bold reverse' ] : $result;
+}
+
+# Database connections {{{3
+# Interactively defines a new connection and stores it in %connections.
+# $name is the proposed connection name; whitespace, colons and semicolons
+# are removed from it, and the user is prompted until a non-empty name is
+# available.  The user is then asked for a DSN string (required) and a
+# deadlock table name.
+sub add_new_dsn {
+   my ( $name ) = @_;
+
+   $name =~ s/[\s:;]//g if defined $name;
+
+   unless ( $name ) {
+      print word_wrap("Choose a name for the connection.  It cannot contain "
+         . "whitespace, colons or semicolons."), "\n\n";
+      until ( $name ) {
+         $name = prompt("Enter a name");
+         $name =~ s/[\s:;]//g;
+      }
+   }
+
+   # Keep asking until a DSN has been entered.
+   my $dsn;
+   until ( $dsn ) {
+      $clear_screen_sub->();
+      print "Typical DSN strings look like\n   DBI:mysql:;host=hostname;port=port\n"
+         . "The db and port are optional and can usually be omitted.\n"
+         . "If you specify 'mysql_read_default_group=mysql' many options can be read\n"
+         . "from your mysql options files (~/.my.cnf, /etc/my.cnf).\n\n";
+      $dsn = prompt("Enter a DSN string", undef, "DBI:mysql:;mysql_read_default_group=mysql;host=$name");
+   }
+
+   $clear_screen_sub->();
+   my $dl_table = prompt("Optional: enter a table (must not exist) to use when resetting InnoDB deadlock information",
+      undef, 'test.innotop_dl');
+
+   $connections{$name} = {
+      dsn      => $dsn,
+      dl_table => $dl_table,
+   };
+}
+
+# Interactively defines a new server group and stores it in %server_groups.
+# $name is the proposed group name; whitespace, colons and semicolons are
+# removed from it, and the user is prompted until a non-empty name is
+# available.  The user must then pick at least one connection.  Returns the
+# final group name.
+sub add_new_server_group {
+   my ( $name ) = @_;
+
+   $name =~ s/[\s:;]//g if defined $name;
+
+   unless ( $name ) {
+      print word_wrap("Choose a name for the group.  It cannot contain "
+         . "whitespace, colons or semicolons."), "\n\n";
+      until ( $name ) {
+         $name = prompt("Enter a name");
+         $name =~ s/[\s:;]//g;
+      }
+   }
+
+   # Keep asking until at least one connection has been chosen.
+   my @cxns;
+   until ( @cxns ) {
+      $clear_screen_sub->();
+      @cxns = select_cxn("Choose servers for $name", keys %connections);
+   }
+
+   $server_groups{$name} = \@cxns;
+   return $name;
+}
+
+# Returns the text of the variable set selected by configuration variable
+# $name.  As long as $name is empty or its configured value is not a key of
+# %var_sets, choose_var_set() is called to obtain a new $name.
+sub get_var_set {
+   my ( $name ) = @_;
+   until ( $name && exists($var_sets{$config{$name}->{val}}) ) {
+      $name = choose_var_set($name);
+   }
+   return $var_sets{$config{$name}->{val}}->{text};
+}
+
+# Interactively defines a new variable set and stores it in %var_sets, marked
+# as user-defined.  Non-word characters are removed from $name, and the user
+# is prompted until a non-empty name and a non-empty variable list have been
+# entered.
+sub add_new_var_set {
+   my ( $name ) = @_;
+
+   $name =~ s/\W//g if defined $name;
+
+   until ( $name ) {
+      $name = prompt("Enter a name");
+      $name =~ s/\W//g;
+   }
+
+   my $variables;
+   until ( $variables ) {
+      $clear_screen_sub->();
+      $variables = prompt("Enter variables for $name", undef );
+   }
+
+   $var_sets{$name} = { text => $variables, user => 1 };
+}
+
+# Switches the current mode to the next defined connection, in name order,
+# wrapping around after the last one.  Any server group selected for the mode
+# is cleared, and the screen is cleared afterwards.  Does nothing when no
+# connections are defined.
+sub next_server {
+   my $mode     = $config{mode}->{val};
+   my @cxns     = sort keys %connections;
+
+   # Without this guard the modulus below would die with
+   # "Illegal modulus zero".
+   return unless @cxns;
+
+   my ($cur)    = get_connections($mode);
+   $cur         ||= $cxns[0];
+   # The number of names sorting before the current one is its position.
+   my $pos      = grep { $_ lt $cur } @cxns;
+   my $newpos   = ($pos + 1) % @cxns;
+   $modes{$mode}->{server_group} = '';
+   $modes{$mode}->{connections} = [ $cxns[$newpos] ];
+   $clear_screen_sub->();
+}
+
+# Switches a mode to the next server group, in name order, wrapping around
+# after the last one.  $mode defaults to the current mode.  If the mode has
+# no group yet, or its group is no longer defined in %server_groups, the
+# first group is selected.  Does nothing when no groups are defined.
+sub next_server_group {
+   my $mode = shift || $config{mode}->{val};
+   my @grps = sort keys %server_groups;
+   my $curr = $modes{$mode}->{server_group};
+
+   return unless @grps;
+
+   my $next = $grps[0];
+   if ( $curr ) {
+      # Find the current group's position.  The search is bounded by the
+      # list so that a stale group name cannot loop forever.
+      foreach my $pos ( 0 .. $#grps ) {
+         if ( $grps[$pos] eq $curr ) {
+            $next = $grps[ ($pos + 1) % @grps ];
+            last;
+         }
+      }
+   }
+   $modes{$mode}->{server_group} = $next;
+}
+
+# Returns the unique connection names used by a mode (default: the current
+# mode).  A selected server group takes precedence over the mode's own
+# connection list, and modes flagged one_connection only yield the first
+# name.  When monitoring a file, the single name 'file' is returned.
+sub get_connections {
+   return qw(file) if $file;
+
+   my $mode  = shift || $config{mode}->{val};
+   my $group = $modes{$mode}->{server_group};
+   my @connections = $group
+      ? @{$server_groups{$group}}
+      : @{$modes{$mode}->{connections}};
+
+   # Keep only the first connection for single-connection modes.
+   if ( $modes{$mode}->{one_connection} && @connections > 1 ) {
+      $#connections = 0;
+   }
+   return unique(@connections);
+}
+
+# Returns the tables shown in a mode (default: the current mode).  When
+# innotop runs non-interactively (-n) only the first table is returned.
+sub get_visible_tables {
+   my $mode = shift || $config{mode}->{val};
+   my @tbls = @{$modes{$mode}->{visible_tables}};
+   return $opts{n} ? $tbls[0] : @tbls;
+}
+
+# Lets the user choose connections or a server group for the current mode.
+# The choice list offers every connection (described by its DSN) and every
+# server group, the latter prefixed with '#'.  If the first choice names an
+# existing group, that group is selected; otherwise the mode's connections
+# are set to those choices that are known connection names.
+sub choose_connections {
+   $clear_screen_sub->();
+   my $mode = $config{mode}->{val};
+
+   my %desc_for;
+   foreach my $cxn ( keys %connections ) {
+      $desc_for{$cxn} = $connections{$cxn}->{dsn};
+   }
+   foreach my $group ( keys %server_groups ) {
+      $desc_for{"#$group"} = join(' ', @{$server_groups{$group}});
+   }
+
+   my $choices = prompt_list("Choose connections or a group for $mode mode",
+      undef, sub { return keys %desc_for }, \%desc_for);
+
+   my @choices = unique(grep { $_ } split(/\s+/, $choices));
+   if ( @choices ) {
+      # Note that the substitution strips the leading '#' from the first
+      # choice even when no such group exists.
+      if ( $choices[0] =~ s/^#// && exists $server_groups{$choices[0]} ) {
+         $modes{$mode}->{server_group} = $choices[0];
+      }
+      else {
+         $modes{$mode}->{connections} = [ grep { exists $connections{$_} } @choices ];
+      }
+   }
+}
+
+# Accepts a DB connection name and the name of a prepared query (e.g. status, kill).
+# Also a list of params for the prepared query.  This allows not storing prepared
+# statements globally.  Returns a $sth that's been executed.
+# ERROR-HANDLING SEMANTICS: if the statement throws an error, propagate, but if the
+# connection has gone away or can't connect, DO NOT.  Just return undef.
+#
+# The statement handle is only returned when it reports result columns
+# (NUM_OF_FIELDS); otherwise the caller gets a false value.  Nothing is
+# executed while monitoring a file.
+sub do_stmt {
+   my ( $cxn, $stmt_name, @args ) = @_;
+
+   return undef if $file;
+
+   # Test if the cxn should not even be tried
+   # i.e. it has recorded errors, it is either not usable or the error
+   # happened in the current mode, and its wake-up time has not come yet.
+   return undef if $dbhs{$cxn}
+      && $dbhs{$cxn}->{err_count} 
+      && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} )
+      && $dbhs{$cxn}->{wake_up} > $clock;
+
+   my $sth;
+   my $retries = 1;
+   my $success = 0;
+   # The post-decrement makes this loop run at most twice: the first attempt
+   # plus one retry.
+   TRY:
+   while ( $retries-- >= 0 && !$success ) {
+
+      eval {
+         my $dbh = connect_to_db($cxn);
+
+         # If the prepared query doesn't exist, make it.
+         if ( !exists $dbhs{$cxn}->{stmts}->{$stmt_name} ) {
+            $dbhs{$cxn}->{stmts}->{$stmt_name} = $stmt_maker_for{$stmt_name}->($dbh);
+         }
+
+         $sth = $dbhs{$cxn}->{stmts}->{$stmt_name};
+         if ( $sth ) {
+            $sth->execute(@args);
+         }
+         $success = 1;
+      };
+      if ( $EVAL_ERROR ) {
+         # Errors matching $nonfatal_errs are recorded against the connection;
+         # anything else is re-thrown with the connection and statement name.
+         if ( $EVAL_ERROR =~ m/$nonfatal_errs/ ) {
+            handle_cxn_error($cxn, $EVAL_ERROR);
+         }
+         else {
+            die "$cxn $stmt_name: $EVAL_ERROR";
+         }
+         # Out of retries: make sure no half-used handle is returned.
+         if ( $retries < 0 ) {
+            $sth = undef;
+         }
+      }
+   }
+
+   if ( $sth && $sth->{NUM_OF_FIELDS} ) {
+      # Some statements are configured with a pause after execution.
+      sleep($stmt_sleep_time_for{$stmt_name}) if $stmt_sleep_time_for{$stmt_name};
+      return $sth;
+   }
+}
+
+# Records a connection error for $cxn in %dbhs: bumps the error count,
+# remembers the cleaned-up error text and the mode it happened in, and
+# schedules the next retry.  The delay between retries grows like the
+# Fibonacci series so a broken server isn't hammered.
+sub handle_cxn_error {
+   my ( $cxn, $err ) = @_;
+   my $meta = $dbhs{$cxn};
+   $meta->{err_count}++;
+
+   # This is used so errors that have to do with permissions needed by the current
+   # mode will get displayed as long as we're in this mode, but get ignored if the
+   # mode changes.
+   $meta->{mode} = $config{mode}->{val};
+
+   # Strip garbage from the error text if possible.
+   $err =~ s/\s+/ /g;
+   $err = $1 if $err =~ m/failed: (.*?) at \S*innotop line/;
+   $meta->{last_err} = $err;
+
+   # Next delay = sum of the two previous delays.
+   my $delay = $meta->{this_sleep} + $meta->{prev_sleep};
+   @{$meta}{qw(prev_sleep this_sleep)} = ( $meta->{this_sleep}, $delay );
+   $meta->{wake_up} = $clock + $delay;
+
+   if ( $config{show_cxn_errors}->{val} && $config{debug}->{val} ) {
+      print STDERR "Error at tick $clock $cxn $err";
+   }
+}
+
+# Accepts a DB connection name and a (string) query.  Returns a $sth that's been
+# executed.
+# Returns undef while monitoring a file, while the connection is waiting
+# out an error, or when the retry also failed with a non-fatal error.
+# Errors that do not match $nonfatal_errs are re-thrown.
+sub do_query {
+   my ( $cxn, $query ) = @_;
+
+   return undef if $file;
+
+   # Test if the cxn should not even be tried
+   # i.e. it has recorded errors, it is either not usable or the error
+   # happened in the current mode, and its wake-up time has not come yet.
+   return undef if $dbhs{$cxn}
+      && $dbhs{$cxn}->{err_count} 
+      && ( !$dbhs{$cxn}->{dbh} || !$dbhs{$cxn}->{dbh}->{Active} || $dbhs{$cxn}->{mode} eq $config{mode}->{val} )
+      && $dbhs{$cxn}->{wake_up} > $clock;
+
+   my $sth;
+   my $retries = 1;
+   my $success = 0;
+   # The post-decrement makes this loop run at most twice: the first attempt
+   # plus one retry.
+   TRY:
+   while ( $retries-- >= 0 && !$success ) {
+
+      eval {
+         my $dbh = connect_to_db($cxn);
+
+         $sth = $dbh->prepare($query);
+         $sth->execute();
+         $success = 1;
+      };
+      if ( $EVAL_ERROR ) {
+         # Errors matching $nonfatal_errs are recorded against the connection;
+         # anything else is re-thrown.
+         if ( $EVAL_ERROR =~ m/$nonfatal_errs/ ) {
+            handle_cxn_error($cxn, $EVAL_ERROR);
+         }
+         else {
+            die $EVAL_ERROR;
+         }
+         # Out of retries: make sure no half-used handle is returned.
+         if ( $retries < 0 ) {
+            $sth = undef;
+         }
+      }
+   }
+ 
+   return $sth;
+}
+
+# Returns the time elapsed since the start time recorded for connection $cxn
+# in %dbhs (as measured by time()), setting the start time to now if none is
+# recorded.  Never returns zero, so the result is safe to divide by.
+sub get_uptime {
+   my ( $cxn ) = @_;
+   $dbhs{$cxn}->{start_time} = time() unless $dbhs{$cxn}->{start_time};
+   my $elapsed = time() - $dbhs{$cxn}->{start_time};
+   # Avoid dividing by zero
+   return $elapsed ? $elapsed : .001;
+}
+
+# Returns a live database handle for connection $cxn, creating the
+# bookkeeping entry in %dbhs on first use and (re)connecting when there is no
+# handle yet or the existing one no longer answers a ping.
+#
+# On every (re)connect the error and retry counters are reset, the cache of
+# prepared statements is emptied, the server's start time is derived from its
+# Uptime status value, and the session timeouts are raised to the cxn_timeout
+# configuration value.  Errors from connecting propagate to the caller.
+sub connect_to_db {
+   my ( $cxn ) = @_;
+
+   $dbhs{$cxn} ||= {
+      stmts      => {},  # bucket for prepared statements.
+      prev_sleep => 0,
+      this_sleep => 1,
+      wake_up    => 0,
+      start_time => 0,
+      dbh        => undef,
+   };
+   my $href = $dbhs{$cxn};
+
+   if ( !$href->{dbh} || ref($href->{dbh}) !~ m/DBI/ || !$href->{dbh}->ping ) {
+      my $dbh = get_new_db_connection($cxn);
+      @{$href}{qw(dbh err_count wake_up this_sleep start_time prev_sleep)}
+               = ($dbh, 0, 0, 1, 0, 0);
+
+      # Statement handles belong to the handle they were prepared on, so
+      # anything cached for the previous handle is useless now.  Drop it; the
+      # statements are re-prepared on demand.
+      $href->{stmts} = {};
+
+      # Derive and store the server's start time in hi-res
+      my $uptime = $dbh->selectrow_hashref("show status like 'Uptime'")->{value};
+      $href->{start_time} = time() - $uptime;
+
+      # Set timeouts so an unused connection stays alive.
+      # For example, a connection might be used in Q mode but idle in T mode.
+      if ( version_ge($dbh, '4.0.3')) {
+         my $timeout = $config{cxn_timeout}->{val};
+         $dbh->do("set session wait_timeout=$timeout, interactive_timeout=$timeout");
+      }
+   }
+   return $href->{dbh};
+}
+
+# Tells whether the server behind $dbh is at least version $target.  Both
+# versions may look like 5.0.27 or 4.1.15-standard-log; their numeric parts
+# are zero-padded to three digits each and compared as strings.
+sub version_ge {
+   my ( $dbh, $target ) = @_;
+   my $fmt  = '%03d%03d%03d';
+   my @have = $dbh->{mysql_serverinfo} =~ m/(\d+)/g;
+   my @want = $target =~ m/(\d+)/g;
+   return sprintf($fmt, @have) ge sprintf($fmt, @want);
+}
+
+# Copies the status values that the DBD driver exposes through its
+# mysql_stat attribute into %vars for the current clock tick, for every given
+# connection that has an active handle.  Values already present are kept.
+# Also fills in Uptime_hires and the connection name.  Does nothing if it has
+# already run (tracked in %info_gotten).
+sub get_driver_status {
+   my @cxns = @_;
+   return if $info_gotten{driver_status}++;
+
+   foreach my $cxn ( @cxns ) {
+      next unless $dbhs{$cxn} && $dbhs{$cxn}->{dbh} && $dbhs{$cxn}->{dbh}->{Active};
+      $vars{$cxn}->{$clock} ||= {};
+      my $vars = $vars{$cxn}->{$clock};
+
+      # mysql_stat is a string of "Name: value" pairs; spaces in the names
+      # are turned into underscores.
+      my %res = map {  $_ =~ s/ +/_/g; $_ } $dbhs{$cxn}->{dbh}->{mysql_stat} =~ m/(\w[^:]+): ([\d\.]+)/g;
+      foreach my $key ( keys %res ) {
+         $vars->{$key} ||= $res{$key};
+      }
+      $vars->{Uptime_hires} ||= get_uptime($cxn);
+      $vars->{cxn} = $cxn;
+   }
+}
+
+# Opens and returns a new DBI handle for the named connection.
+#
+# Arguments:
+#   $connection  name of an entry in %connections; dies if there is none.
+#   $destroy     when false, InactiveDestroy is set on the new handle.
+# Asks the user whether a username and password are needed the first time
+# (the answers are remembered in the connection's entry), and prompts for the
+# values that are still missing.  Dies when called while monitoring a file;
+# connection failures raise an exception (RaiseError).
+sub get_new_db_connection {
+   my ( $connection, $destroy ) = @_;
+   die "You can't connect to a MySQL server while monitoring a file.  This is probably a bug."
+      if $file;
+
+   my $dsn = $connections{$connection}
+      or die "No connection named '$connection' is defined in your configuration";
+
+   # First use of this connection: find out whether credentials are needed.
+   unless ( defined $dsn->{have_user} ) {
+      my $answer = prompt("Do you want to specify a username for $connection?", undef, 'n');
+      $dsn->{have_user} = $answer && $answer =~ m/1|y/i;
+   }
+   unless ( defined $dsn->{have_pass} ) {
+      my $answer = prompt("Do you want to specify a password for $connection?", undef, 'n');
+      $dsn->{have_pass} = $answer && $answer =~ m/1|y/i;
+   }
+
+   # Username: suggest the current login name as the default.
+   if ( $dsn->{have_user} && !$dsn->{user} ) {
+      my $user = $ENV{USERNAME} || $ENV{USER} || getlogin() || getpwuid($REAL_USER_ID) || undef;
+      $dsn->{user} = prompt("Enter username for $connection", undef, $user);
+   }
+   $dsn->{user} = '' unless defined $dsn->{user};
+
+   # Password: only asked for when none is stored.
+   if ( $dsn->{have_pass} && !$dsn->{pass} && !$dsn->{savepass} ) {
+      $dsn->{pass} = prompt_noecho("Enter password for '$dsn->{user}' on $connection");
+      print "\n";
+      unless ( defined($dsn->{savepass}) ) {
+         my $answer = prompt("Save password in plain text in the config file?", undef, 'y');
+         $dsn->{savepass} = $answer && $answer =~ m/1|y/i;
+      }
+   }
+
+   my $dbh = DBI->connect(
+      $dsn->{dsn}, $dsn->{user}, $dsn->{pass},
+      { RaiseError => 1, PrintError => 0, AutoCommit => 1 });
+   $dbh->{InactiveDestroy} = 1 unless $destroy; # Can't be set in $db_options
+   $dbh->{FetchHashKeyName} = 'NAME_lc'; # Lowercases all column names for fetchrow_hashref
+   return $dbh;
+}
+
+# Returns one [ "<connection>: <last error>", 'red' ] pair for every connection
+# in @_ that has a non-zero error count recorded for the current mode.  Returns
+# an empty list when the show_cxn_errors_in_tbl setting is off.
+sub get_cxn_errors {
+   my @cxns = @_;
+   return () unless $config{show_cxn_errors_in_tbl}->{val};
+
+   my @errors;
+   foreach my $cxn ( @cxns ) {
+      next unless $dbhs{$cxn}
+         && $dbhs{$cxn}->{err_count}
+         && $dbhs{$cxn}->{mode} eq $config{mode}->{val};
+      push @errors, [ $cxn . ': ' . $dbhs{$cxn}->{last_err}, 'red' ];
+   }
+   return @errors;
+}
+
+# Setup and tear-down functions {{{2
+
+# Parses a SELECT-like column list such as 'foo, bar, foo/bar, foo as bar' and
+# returns two references: a hash of column definitions (src, hdr, num, func)
+# keyed by column name, suitable for %tbl_meta tables, and an array of the
+# column names in the order they were written.
+sub compile_select_stmt {
+   my ($str) = @_;
+   my @exps = $str =~ m/\s*([^,]+(?i:\s+as\s+[^,\s]+)?)\s*(?=,|$)/g;
+   my ( %cols, @visible );
+   foreach my $exp ( @exps ) {
+      # One substitution both detects and strips the "as alias" part; without
+      # an alias the expression text doubles as the column name.
+      my $colname = $exp =~ s/as\s+(\w+)\s*// ? $1 : $exp;
+      my $text    = $exp;
+
+      my ( $func ) = compile_expr($text);
+      $cols{$colname} = {
+         src  => $text,
+         hdr  => $colname,
+         num  => 0,
+         func => $func,
+      };
+      push @visible, $colname;
+   }
+   return (\%cols, \@visible);
+}
+
+# compile_filter {{{3
+# Compiles $text into an anonymous sub whose first argument is available to
+# the filter text as $set.  Returns ( $sub, $err ).  On success $err is undef.
+# When $text does not compile, $err holds the error message (with the
+# "at (eval ..." suffix removed) and $sub is a stand-in that returns that same
+# message.
+sub compile_filter {
+   my ( $text ) = @_;
+   my ( $sub, $err );
+   eval "\$sub = sub { my \$set = shift; $text }";
+   if ( $EVAL_ERROR ) {
+      $EVAL_ERROR =~ s/at \(eval.*$//;
+      $err = $EVAL_ERROR;
+      # Keep a private copy for the stand-in sub.  $EVAL_ERROR is a global
+      # that every later eval overwrites, so closing over it directly would
+      # return whatever it holds at call time, not this compile error.
+      my $msg = $err;
+      $sub = sub { return $msg };
+   }
+   return ( $sub, $err );
+}
+
+# compile_expr {{{3
+# Compiles a column expression into an anonymous sub taking ($set, $cur, $pre)
+# and returning the expression's value.  Returns ( $sub, $err ).
+#
+# Bare words of three or more characters are rewritten to $set->{word} unless
+# is_func() says the word is a function.  An expression with no non-word
+# characters is always treated as a single hash key.
+#
+# If the generated code does not compile: in debug mode this dies; otherwise
+# $err holds the message and $sub is a stand-in that returns that message.
+sub compile_expr {
+   my ( $expr ) = @_;
+   # Leave built-in functions alone so they get called as Perl functions, unless
+   # they are the only word in $expr, in which case treat them as hash keys.
+   if ( $expr =~ m/\W/ ) {
+      $expr =~ s/(?<!\{|\$)\b([A-Za-z]\w{2,})\b/is_func($1) ? $1 : "\$set->{$1}"/eg;
+   }
+   else {
+      $expr = "\$set->{$expr}";
+   }
+   my ( $sub, $err );
+   my $quoted = quotemeta($expr);
+   # The inner eval traps run-time errors of the expression; they are only
+   # re-thrown (with the expression text appended) when debug is on.
+   eval qq{
+      \$sub = sub {
+         my (\$set, \$cur, \$pre) = \@_;
+         my \$val = eval { $expr };
+         if ( \$EVAL_ERROR && \$config{debug}->{val} ) {
+            \$EVAL_ERROR =~ s/ at \\(eval.*//s;
+            die "\$EVAL_ERROR in expression $quoted";
+         }
+         return \$val;
+      }
+   };
+   if ( $EVAL_ERROR ) {
+      if ( $config{debug}->{val} ) {
+         die $EVAL_ERROR;
+      }
+      $EVAL_ERROR =~ s/ at \(eval.*$//;
+      $err = $EVAL_ERROR;
+      # Keep a private copy for the stand-in sub.  $EVAL_ERROR is a global
+      # that every later eval overwrites, so closing over it directly would
+      # return whatever it holds at call time, not this compile error.
+      my $msg = $err;
+      $sub = sub { return $msg };
+   }
+   return ( $sub, $err );
+}
+
+# finish {{{3
+# Saves the configuration, calls ReadMode('normal') unless the n option is
+# set, prints a newline and exits with status 0.  It is a named sub so it can
+# be bound to the key that quits the program.
+sub finish {
+   save_config();
+   if ( !$opts{n} ) {
+      ReadMode('normal');
+   }
+   print "\n";
+   exit(0);
+}
+
+# core_dump {{{3
+# Prints $msg.  Before that, when both the debugfile and debug settings are
+# set, appends a Data::Dumper dump of %vars to the debug file.  Writing the
+# dump is best-effort: any failure inside the eval is swallowed so that $msg
+# is still printed.
+sub core_dump {
+   my $msg = shift;
+   if ($config{debugfile}->{val} && $config{debug}->{val}) {
+      eval {
+         # Bail out of the eval if the file cannot be opened, rather than
+         # printing to and closing a handle that was never opened.
+         open my $dump_fh, '>>', $config{debugfile}->{val}
+            or die "Can't open '$config{debugfile}->{val}': $OS_ERROR";
+         if ( %vars ) {
+            print $dump_fh "Current variables:\n" . Dumper(\%vars);
+         }
+         close $dump_fh;
+      };
+   }
+   print $msg;
+}
+
+# load_config {{{3
+# Reads the configuration file named by the c option, or
+# "$homepath/.innotop/innotop.ini" by default.  Creates the config directory
+# and its plugins subdirectory when they are missing, checks the file's
+# version line, then hands each [section] to the reader registered for it in
+# %config_file_sections.
+sub load_config {
+
+   my $filename = $opts{c} || "$homepath/.innotop/innotop.ini";
+   my $dirname  = dirname($filename);
+   # With the default location, a plain FILE where the config directory should
+   # be is taken to be a config file from an older innotop.
+   if ( -f $dirname && !$opts{c} ) {
+      # innotop got upgraded and this is the old config file.
+      my $answer = pause("Innotop's default config location has moved to $filename.  Move old config file $dirname there now? y/n");
+      if ( lc $answer eq 'y' ) {
+         # Move the old file out of the way, create the directories, then
+         # move the file into its new place.
+         rename($dirname, "$homepath/innotop.ini")
+            or die "Can't rename '$dirname': $OS_ERROR";
+         mkdir($dirname) or die "Can't create directory '$dirname': $OS_ERROR";
+         mkdir("$dirname/plugins") or die "Can't create directory '$dirname/plugins': $OS_ERROR";
+         rename("$homepath/innotop.ini", $filename)
+            or die "Can't rename '$homepath/innotop.ini' to '$filename': $OS_ERROR";
+      }
+      else {
+         print "\nInnotop will now exit so you can fix the config file.\n";
+         exit(0);
+      }
+   }
+
+   if ( ! -d $dirname ) {
+      mkdir $dirname
+         or die "Can't create directory '$dirname': $OS_ERROR";
+   }
+   if ( ! -d "$dirname/plugins" ) {
+      mkdir "$dirname/plugins"
+         or die "Can't create directory '$dirname/plugins': $OS_ERROR";
+   }
+
+   if ( -f $filename ) {
+      open my $file, "<", $filename or die("Can't open '$filename': $OS_ERROR");
+
+      # Check config file version.  Just ignore if either innotop or the file has
+      # garbage in the version number.
+      if ( defined(my $line = <$file>) && $VERSION =~ m/\d/ ) {
+         chomp $line;
+         if ( my ($maj, $min, $rev) = $line =~ m/^version=(\d+)\.(\d+)(?:\.(\d+))?$/ ) {
+            # Both versions are zero-padded to fixed width so they can be
+            # compared with the string operators gt/ge/lt below.
+            $rev ||= 0;
+            my $cfg_ver          = sprintf('%03d-%03d-%03d', $maj, $min, $rev);
+            ( $maj, $min, $rev ) = $VERSION =~ m/^(\d+)\.(\d+)(?:\.(\d+))?$/;
+            $rev ||= 0;
+            my $innotop_ver      = sprintf('%03d-%03d-%03d', $maj, $min, $rev);
+
+            if ( $cfg_ver gt $innotop_ver ) {
+               pause("The config file is for a newer version of innotop and may not be read correctly.");
+            }
+            else {
+               # @config_versions is consumed two entries at a time as
+               # (start, end) pairs; work on a copy because splice empties it.
+               my @ver_history = @config_versions;
+               while ( my ($start, $end) = splice(@ver_history, 0, 2) ) {
+                  # If the config file is between the endpoints and innotop is greater than
+                  # the endpoint, innotop has a newer config file format than the file.
+                  if ( $cfg_ver ge $start && $cfg_ver lt $end && $innotop_ver ge $end ) {
+                     my $msg = "innotop's config file format has changed.  Overwrite $filename?  y or n";
+                     if ( pause($msg) eq 'n' ) {
+                        $config{readonly}->{val} = 1;
+                        print "\ninnotop will not save any configuration changes you make.";
+                        pause();
+                        print "\n";
+                     }
+                     # Whatever the answer, the old-format file is not read.
+                     close $file;
+                     return;
+                  }
+               }
+            }
+         }
+      }
+
+      # Hand each "[section]" header to its reader, which is passed the same
+      # file handle.  Lines that are not section headers are skipped here.
+      while ( my $line = <$file> ) {
+         chomp $line;
+         next unless $line =~ m/^\[([a-z_]+)\]$/;
+         if ( exists $config_file_sections{$1} ) {
+            $config_file_sections{$1}->{reader}->($file);
+         }
+         else {
+            warn "Unknown config file section '$1'";
+         }
+      }
+      close $file or die("Can't close $filename: $OS_ERROR");
+   }
+
+}
+
+# Post-processes %tbl_meta: for every column of every table, compiles the
+# column's src property with compile_expr() and stores the resulting sub as
+# the column's func property.
+sub post_process_tbl_meta {
+   foreach my $table ( values %tbl_meta ) {
+      my @col_names = keys %{$table->{cols}};
+      foreach my $col_name ( @col_names ) {
+         my $col_def = $table->{cols}->{$col_name};
+         # compile_expr returns ( $sub, $err ); only the sub is kept.
+         ( $col_def->{func} ) = compile_expr($col_def->{src});
+      }
+   }
+}
+
+# load_config_plugins {{{3
+# Reads the plugins section from the open config file handle $file.  First
+# registers every *.pm file found in the plugin_dir setting in %plugins (as
+# inactive), then activates each plugin whose package name appears on a line
+# of the section.  Reading stops at the next line that starts with "[".
+sub load_config_plugins {
+   my ( $file ) = @_;
+
+   # First, find a list of all plugins that exist on disk, and get information about them.
+   my $dir = $config{plugin_dir}->{val};
+   foreach my $p_file ( <$dir/*.pm> ) {
+      my ($package, $desc);
+      # Scan the file for its "package NAME;" line and its
+      # "# description: ..." line, stopping once both are found.
+      eval {
+         open my $p_in, "<", $p_file or die $OS_ERROR;
+         while ( my $line = <$p_in> ) {
+            chomp $line;
+            if ( $line =~ m/^package\s+(.*?);/ ) {
+               $package = $1;
+            }
+            elsif ( $line =~ m/^# description: (.*)/ ) {
+               $desc = $1;
+            }
+            last if $package && $desc;
+         }
+         close $p_in;
+      };
+      if ( $package ) {
+         $plugins{$package} = {
+            file   => $p_file,
+            desc   => $desc,
+            class  => $package,
+            active => 0,
+         };
+         # NOTE(review): the eval error is only examined when a package name
+         # was found, so a file that could not be opened is skipped silently
+         # even in debug mode.
+         if ( $config{debug}->{val} && $EVAL_ERROR ) {
+            die $EVAL_ERROR;
+         }
+      }
+   }
+
+   # Now read which ones the user has activated.  Each line simply represents an active plugin.
+   while ( my $line = <$file> ) {
+      chomp $line;
+      next if $line =~ m/^#/;
+      last if $line =~ m/^\[/;
+      next unless $line && $plugins{$line};
+
+      my $obj;
+      # Load the plugin file, instantiate the class and add the object to the
+      # listener queue of every event it registers for (events without a
+      # queue in %event_listener_for are ignored).
+      eval {
+         require $plugins{$line}->{file};
+         $obj = $line->new(%pluggable_vars);
+         foreach my $event ( $obj->register_for_events() ) {
+            my $queue = $event_listener_for{$event};
+            if ( $queue ) {
+               push @$queue, $obj;
+            }
+         }
+      };
+      if ( $config{debug}->{val} && $EVAL_ERROR ) {
+         die $EVAL_ERROR;
+      }
+      # Outside debug mode a plugin that failed to load simply stays inactive.
+      if ( $obj ) {
+         $plugins{$line}->{active} = 1;
+         $plugins{$line}->{object} = $obj;
+      }
+   }
+}
+
+# save_config_plugins {{{3
+# Writes the class name of every active plugin to $file, one per line, in
+# sorted order.
+sub save_config_plugins {
+   my $file = shift;
+   my @active = grep { $plugins{$_}->{active} } sort keys %plugins;
+   foreach my $class ( @active ) {
+      print $file "$class\n";
+   }
+}
+
+# load_config_active_server_groups {{{3
+# Reads "mode=group" lines from $file until the next line starting with "[",
+# skipping "#" comment lines.  For every known mode and known server group,
+# records the group as that mode's server_group.
+sub load_config_active_server_groups {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $mode, $group ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $mode && $group && exists $modes{$mode} && exists $server_groups{$group} ) {
+         $modes{$mode}->{server_group} = $group;
+      }
+   }
+}
+
+# save_config_active_server_groups {{{3
+# Writes one "mode=server_group" line to $file for every mode, in sorted
+# order of mode name.
+sub save_config_active_server_groups {
+   my ( $file ) = @_;
+   for my $mode ( sort keys %modes ) {
+      my $group = $modes{$mode}->{server_group};
+      print $file "$mode=$group\n";
+   }
+}
+
+# load_config_server_groups {{{3
+# Reads "name=conn1 conn2 ..." lines from $file until the next line starting
+# with "[", skipping "#" comment lines.  Each group keeps only the connection
+# names that exist in %connections, de-duplicated; groups left with no members
+# are not stored.
+sub load_config_server_groups {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $name, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      next if !$name || !$rest;
+
+      my @known   = grep { $_ && exists $connections{$_} } split(/\s+/, $rest);
+      my @members = unique(@known);
+      $server_groups{$name} = \@members if @members;
+   }
+}
+
+# save_config_server_groups {{{3
+# Writes one "name=member member ..." line to $file for every server group,
+# in sorted order of group name.
+sub save_config_server_groups {
+   my ( $file ) = @_;
+   for my $name ( sort keys %server_groups ) {
+      my @members = @{$server_groups{$name}};
+      print $file "$name=", join(' ', @members), "\n";
+   }
+}
+
+# load_config_varsets {{{3
+# Reads "name=text" lines from $file until the next line starting with "[",
+# skipping "#" comment lines, and stores each as a user-defined entry in
+# %var_sets.
+sub load_config_varsets {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $name, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $name && $rest ) {
+         $var_sets{$name} = { text => $rest, user => 1 };
+      }
+   }
+}
+
+# save_config_varsets {{{3
+# Writes one "name=text" line to $file for every user-defined variable set,
+# in sorted order of name.
+sub save_config_varsets {
+   my ( $file ) = @_;
+   my @user_sets = grep { $var_sets{$_}->{user} } sort keys %var_sets;
+   for my $varset ( @user_sets ) {
+      print $file "$varset=$var_sets{$varset}->{text}\n";
+   }
+}
+
+# load_config_group_by {{{3
+# Reads "table=col col ..." lines from $file until the next line starting with
+# "[", skipping "#" comment lines.  For each known table, stores the listed
+# columns that exist in that table (de-duplicated) as its group_by list and
+# marks the list as customized.
+sub load_config_group_by {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $tbl, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $tbl && exists $tbl_meta{$tbl} ) {
+         my @parts = unique(grep { exists($tbl_meta{$tbl}->{cols}->{$_}) } split(/\s+/, $rest));
+         $tbl_meta{$tbl}->{group_by}         = \@parts;
+         $tbl_meta{$tbl}->{cust}->{group_by} = 1;
+      }
+   }
+}
+
+# save_config_group_by {{{3
+# Writes one "table=col col ..." line to $file for every non-temporary table
+# whose group_by list has been customized, in sorted order of table name.
+sub save_config_group_by {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      if ( !$tbl_meta{$tbl}->{temp} && $tbl_meta{$tbl}->{cust}->{group_by} ) {
+         my $columns = $tbl_meta{$tbl}->{group_by};
+         print $file "$tbl=", join(' ', @$columns), "\n";
+      }
+   }
+}
+
+# load_config_filters {{{3
+# Reads "name=text='...' tbls='...'" lines from $file until the next line
+# starting with "[", skipping "#" comment lines.  Each line with both a text
+# and a tbls property is compiled with compile_filter() and stored in %filters
+# as a user-defined filter.  Only tables that exist in %tbl_meta are kept.
+sub load_config_filters {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $rest ) = $line =~ m/^(.+?)=(.*)$/;
+      next if !$key || !$rest;
+
+      # Properties are single-quoted
+      my %parts = $rest =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g;
+      next if !$parts{text} || !$parts{tbls};
+
+      # Un-escape escaping (values() aliases the hash values, so this edits
+      # %parts in place)
+      for my $value ( values %parts ) {
+         $value =~ s/\\\\/\\/g;
+         $value =~ s/\\'/'/g;
+      }
+
+      my ( $sub ) = compile_filter($parts{text});
+      my @tbls    = grep { exists $tbl_meta{$_} } unique(split(/\s+/, $parts{tbls}));
+      $filters{$key} = {
+         func => $sub,
+         text => $parts{text},
+         user => 1,
+         name => $key,
+         note => 'User-defined filter',
+         tbls => \@tbls,
+      };
+   }
+}
+
+# save_config_filters {{{3
+# Writes one "name=text='...' tbls='...'" line to $file for every filter that
+# is user-defined and not a quick-filter, in sorted order of name.
+# Backslashes and single quotes in the filter text are backslash-escaped.
+sub save_config_filters {
+   my ( $file ) = @_;
+   for my $key ( sort keys %filters ) {
+      next unless $filters{$key}->{user} && !$filters{$key}->{quick};
+      ( my $text = $filters{$key}->{text} ) =~ s/([\\'])/\\$1/g;
+      my $tbls = join(" ", @{$filters{$key}->{tbls}});
+      print $file "$key=text='$text' tbls='$tbls'\n";
+   }
+}
+
+# load_config_visible_tables {{{3
+# Reads "mode=table table ..." lines from $file until the next line starting
+# with "[", skipping "#" comment lines.  For each known mode, stores the
+# listed tables that exist in %tbl_meta (de-duplicated) as its visible_tables
+# and marks the list as customized.
+sub load_config_visible_tables {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $mode, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $mode && exists $modes{$mode} ) {
+         my @tables = unique(grep { $_ && exists $tbl_meta{$_} } split(/\s+/, $rest));
+         $modes{$mode}->{visible_tables}         = \@tables;
+         $modes{$mode}->{cust}->{visible_tables} = 1;
+      }
+   }
+}
+
+# save_config_visible_tables {{{3
+# Writes one "mode=table table ..." line to $file for every mode whose
+# visible_tables list has been customized, in sorted order of mode name.
+sub save_config_visible_tables {
+   my ( $file ) = @_;
+   for my $mode ( sort keys %modes ) {
+      if ( $modes{$mode}->{cust}->{visible_tables} ) {
+         my $tables = $modes{$mode}->{visible_tables};
+         print $file "$mode=", join(' ', @$tables), "\n";
+      }
+   }
+}
+
+# load_config_sort_cols {{{3
+# Reads "table=sort spec" lines from $file until the next line starting with
+# "[", skipping "#" comment lines.  For each known table, stores the spec as
+# sort_cols, marks it as customized and rebuilds the table's sort_func with
+# make_sort_func().
+sub load_config_sort_cols {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $key && exists $tbl_meta{$key} ) {
+         $tbl_meta{$key}->{sort_cols}         = $rest;
+         $tbl_meta{$key}->{cust}->{sort_cols} = 1;
+         # Must come last: the sort function is built from the updated table.
+         $tbl_meta{$key}->{sort_func} = make_sort_func($tbl_meta{$key});
+      }
+   }
+}
+
+# save_config_sort_cols {{{3
+# Writes one "table=sort spec" line to $file for every table whose sort_cols
+# has been customized, in sorted order of table name.
+sub save_config_sort_cols {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      if ( $tbl_meta{$tbl}->{cust}->{sort_cols} ) {
+         my $col = $tbl_meta{$tbl}->{sort_cols};
+         print $file "$tbl=$col\n";
+      }
+   }
+}
+
+# load_config_active_filters {{{3
+# Reads "table=filter filter ..." lines from $file until the next line
+# starting with "[", skipping "#" comment lines.  For each known table, keeps
+# the listed filters that exist in %filters and list this table among their
+# tbls, stores them as the table's filters and marks the list as customized.
+sub load_config_active_filters {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $tbl, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      next unless $tbl && exists $tbl_meta{$tbl};
+
+      my @parts;
+      foreach my $name ( unique(grep { exists($filters{$_}) } split(/\s+/, $rest)) ) {
+         # Keep the filter only if it applies to this table.
+         push @parts, $name if grep { $tbl eq $_ } @{$filters{$name}->{tbls}};
+      }
+      $tbl_meta{$tbl}->{filters}         = \@parts;
+      $tbl_meta{$tbl}->{cust}->{filters} = 1;
+   }
+}
+
+# save_config_active_filters {{{3
+# Writes one "table=filter filter ..." line to $file for every non-temporary
+# table whose filter list has been customized, in sorted order of table name.
+sub save_config_active_filters {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      if ( !$tbl_meta{$tbl}->{temp} && $tbl_meta{$tbl}->{cust}->{filters} ) {
+         my $names = $tbl_meta{$tbl}->{filters};
+         print $file "$tbl=", join(' ', @$names), "\n";
+      }
+   }
+}
+
+# load_config_active_columns {{{3
+# Reads "table=col col ..." lines from $file until the next line starting with
+# "[", skipping "#" comment lines.  For each known table, stores the listed
+# columns (split on single spaces, de-duplicated) that exist in that table as
+# its visible list and marks the list as customized.
+sub load_config_active_columns {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $key && exists $tbl_meta{$key} ) {
+         my @names = unique(split(/ /, $rest));
+         my @parts = grep { exists($tbl_meta{$key}->{cols}->{$_}) } @names;
+         $tbl_meta{$key}->{visible}         = \@parts;
+         $tbl_meta{$key}->{cust}->{visible} = 1;
+      }
+   }
+}
+
+# save_config_active_columns {{{3
+# Writes one "table=col col ..." line to $file for every table whose visible
+# column list has been customized, in sorted order of table name.
+sub save_config_active_columns {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      if ( $tbl_meta{$tbl}->{cust}->{visible} ) {
+         my $columns = $tbl_meta{$tbl}->{visible};
+         print $file "$tbl=", join(' ', @$columns), "\n";
+      }
+   }
+}
+
+# save_config_tbl_meta {{{3
+# Writes one line to $file for every user-defined column in %tbl_meta, in the
+# form "col=prop='value' prop='value' ...".  The func property is left out.
+# Backslashes and single quotes in values are backslash-escaped.
+sub save_config_tbl_meta {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      for my $col ( keys %{$tbl_meta{$tbl}->{cols}} ) {
+         my $meta = $tbl_meta{$tbl}->{cols}->{$col};
+         next unless $meta->{user};
+
+         my @props;
+         for my $prop ( grep { $_ ne 'func' } keys %$meta ) {
+            # Some properties (trans) are arrays, others scalars
+            my $val = ref($meta->{$prop}) ? join(',', @{$meta->{$prop}}) : $meta->{$prop};
+            $val =~ s/([\\'])/\\$1/g;  # Escape backslashes and single quotes
+            push @props, "$prop='$val'"; # Enclose in single quotes
+         }
+         print $file "$col=", join(" ", @props), "\n";
+      }
+   }
+}
+
+# save_config_config {{{3
+# Writes every %config entry to $file as a "# note" comment line followed by
+# a "key=value" line, in sorted order of key.  Array values are written
+# space-separated; hash values are written as space-separated "k:v" pairs.
+# The 'password' key is only written when the savepass setting is true.
+# A failed write is reported on standard output as "<error> in <key>" and the
+# remaining keys are still attempted.
+sub save_config_config {
+   my $file = shift;
+   foreach my $key ( sort keys %config ) {
+      eval {
+         if ( $key ne 'password' || $config{savepass}->{val} ) {
+            print $file "# $config{$key}->{note}\n"
+               or die "Cannot print to file: $OS_ERROR";
+            my $val = $config{$key}->{val};
+            $val = '' unless defined($val);
+
+            # Flatten the value first so that one checked print covers every
+            # value shape, instead of only the array case being checked.
+            my $text = ref( $val ) eq 'ARRAY' ? join( " ", @$val )
+                     : ref( $val ) eq 'HASH'  ? join( " ", map { "$_:$val->{$_}" } keys %$val )
+                     :                          $val;
+            print $file "$key=$text\n"
+               or die "Cannot print to file: $OS_ERROR";
+         }
+      };
+      if ( $EVAL_ERROR ) { print "$EVAL_ERROR in $key"; };
+   }
+
+}
+
+# load_config_config {{{3
+# Reads "name=value" lines from $file until the next line starting with "[",
+# skipping "#" comment lines, and stores each value in %config.  A value given
+# on the command line for the same slot wins over the file's value.  Values
+# that fail the setting's pat pattern, and unknown names, are ignored.
+sub load_config_config {
+   my ( $file ) = @_;
+
+   # Look in the command-line parameters for things stored in the same slot.
+   my %cmdline;
+   foreach my $spec ( @opt_spec ) {
+      next unless exists $spec->{c} && exists $opts{$spec->{k}};
+      $cmdline{$spec->{c}} = $opts{$spec->{k}};
+   }
+
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      next unless $line =~ m/^(.+?)=(.*)$/;
+      my ( $name, $val ) = ( $1, $2 );
+
+      # Values might already have been set at the command line.
+      $val = $cmdline{$name} if defined($cmdline{$name});
+
+      # Validate the incoming values...
+      next unless $name && exists( $config{$name} );
+      my $pat = $config{$name}->{pat};
+      if ( !$pat || $val =~ m/$pat/ ) {
+         $config{$name}->{val}  = $val;
+         $config{$name}->{read} = 1;
+      }
+   }
+}
+
+# load_config_tbl_meta {{{3
+# Reads "col=prop='value' ..." lines from $file until the next line starting
+# with "[", skipping "#" comment lines, and stores every property named in
+# %col_props on the column definition of the table named by the line's tbl
+# property.  Dies when the tbl property is missing, the table is unknown, or
+# a property from %col_props is not defined on the line.
+sub load_config_tbl_meta {
+   my ( $file ) = @_;
+
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      # Each tbl_meta section has all the properties defined in %col_props.
+      my ( $col, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      next unless $col;
+      my %parts = $rest =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g; # Properties are single-quoted
+
+      # Each section read from the config file has one extra property: which table it
+      # goes in.
+      my $tbl  = $parts{tbl}     or die "There's no table for tbl_meta $col";
+      my $meta = $tbl_meta{$tbl} or die "There's no table in tbl_meta named $tbl";
+
+      # Anything read from the config file counts as user-defined.
+      $parts{user} = 1;
+
+      # The column may already exist in the table, in which case this is just a
+      # customization.
+      my $col_def = ( $meta->{cols}->{$col} ||= {} );
+
+      foreach my $prop ( keys %col_props ) {
+         my $raw = $parts{$prop};
+         die "Undefined property $prop for column $col in table $tbl"
+            unless defined($raw);
+
+         # Un-escape escaping
+         $raw =~ s/\\\\/\\/g;
+         $raw =~ s/\\'/'/g;
+
+         if ( !ref $col_props{$prop} ) {
+            $col_def->{$prop} = $raw;
+         }
+         elsif ( $prop eq 'trans' ) {
+            # Only transformations that exist in %trans_funcs are kept.
+            $col_def->{trans}
+               = [ unique(grep { exists $trans_funcs{$_} } split(',', $raw)) ];
+         }
+         else {
+            $col_def->{$prop} = [ split(',', $raw) ];
+         }
+      }
+
+   }
+}
+
+# save_config {{{3
+# Writes the whole configuration, unless the readonly setting is true.  The
+# data goes to "<config file>_tmp" first: a version line, then each section in
+# @ordered_config_file_sections wrapped in [name] ... [/name] and filled in by
+# that section's writer.  The temp file is then renamed over the real one.
+sub save_config {
+   return if $config{readonly}->{val};
+   # Save to a temp file first, so a crash doesn't destroy the main config file
+   my $newname  = $opts{c} || "$homepath/.innotop/innotop.ini";
+   my $filename = "${newname}_tmp";
+   open my $file, "+>", $filename
+      or die("Can't write to $filename: $OS_ERROR");
+   print $file "version=$VERSION\n";
+
+   for my $section ( @ordered_config_file_sections ) {
+      my $handlers = $config_file_sections{$section}
+         or die "No such config file section $section";
+      print $file "\n[$section]\n\n";
+      $handlers->{writer}->($file);
+      print $file "\n[/$section]\n";
+   }
+
+   # Now clobber the main config file with the temp.
+   close $file or die("Can't close $filename: $OS_ERROR");
+   rename($filename, $newname) or die("Can't rename $filename to $newname: $OS_ERROR");
+}
+
+# load_config_connections {{{3
+# Reads "name=key=value key=value ..." lines from $file until the next line
+# starting with "[", skipping "#" comment lines.  Each connection is stored in
+# %connections with one entry per name in @conn_parts; parts that are missing
+# or false become the empty string.
+sub load_config_connections {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      next unless $key;
+
+      my %parts = $rest =~ m/(\S+?)=(\S*)/g;
+      my %conn;
+      $conn{$_} = $parts{$_} || '' for @conn_parts;
+      $connections{$key} = \%conn;
+   }
+}
+
+# save_config_connections {{{3
+# Writes one "name=key=value key=value ..." line to $file per connection, in
+# sorted order of name.  Only defined parts are written, and the pass part is
+# left out unless the connection's savepass flag is true.
+sub save_config_connections {
+   my ( $file ) = @_;
+   for my $conn ( sort keys %connections ) {
+      my $href = $connections{$conn};
+      my @keys = $href->{savepass} ? @conn_parts : grep { $_ ne 'pass' } @conn_parts;
+
+      my @pairs;
+      for my $part ( @keys ) {
+         push @pairs, "$part=$href->{$part}" if defined $href->{$part};
+      }
+      print $file "$conn=", join(' ', @pairs), "\n";
+   }
+}
+
+# Reads "table=col='..' op='..' arg='..' color='..'" lines from $file until
+# the next line starting with "[", skipping "#" comment lines.  Rules that
+# pass validation are collected per table; afterwards each such table gets its
+# colors list replaced, its color_func rebuilt with make_color_func() and its
+# colors marked as customized.
+sub load_config_colors {
+   my ( $file ) = @_;
+   my %rule_set_for;
+
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $tbl, $rule ) = $line =~ m/^(.*?)=(.*)$/;
+      next unless $tbl && $rule && exists $tbl_meta{$tbl};
+
+      my %parts = $rule =~ m/(\w+)='((?:(?!(?<!\\)').)*)'/g; # Properties are single-quoted
+
+      # A rule needs a known column, a known comparison operator, an argument
+      # and at least one color name found in %ansicolors.
+      next unless $parts{col} && exists $tbl_meta{$tbl}->{cols}->{$parts{col}};
+      next unless $parts{op}  && exists $comp_ops{$parts{op}};
+      next unless defined $parts{arg} && defined $parts{color};
+      my @colors = unique(grep { exists $ansicolors{$_} } split(/\W+/, $parts{color}));
+      next unless @colors;
+
+      push @{$rule_set_for{$tbl}}, \%parts;
+   }
+
+   for my $tbl ( keys %rule_set_for ) {
+      my $meta = $tbl_meta{$tbl};
+      $meta->{colors}         = $rule_set_for{$tbl};
+      $meta->{color_func}     = make_color_func($tbl_meta{$tbl});
+      $meta->{cust}->{colors} = 1;
+   }
+}
+
+# save_config_colors {{{3
+# Writes one "table=col='..' op='..' arg='..' color='..'" line to $file for
+# every color rule of every table whose colors have been customized, in sorted
+# order of table name.  Backslashes and single quotes in values are
+# backslash-escaped.
+sub save_config_colors {
+   my ( $file ) = @_;
+   for my $tbl ( sort keys %tbl_meta ) {
+      my $meta = $tbl_meta{$tbl};
+      next unless $meta->{cust}->{colors};
+      for my $rule ( @{$meta->{colors}} ) {
+         my @fields;
+         for my $prop ( qw(col op arg color) ) {
+            my $val = $rule->{$prop};
+            $val =~ s/([\\'])/\\$1/g;      # Escape backslashes and single quotes
+            push @fields, "$prop='$val'";  # Enclose in single quotes
+         }
+         print $file "$tbl=", join(' ', @fields), "\n";
+      }
+   }
+}
+
+# load_config_active_connections {{{3
+# Reads "mode=conn conn ..." lines from $file until the next line starting
+# with "[", skipping "#" comment lines.  For each known mode, stores the
+# listed connections (split on single spaces) that exist in %connections.
+sub load_config_active_connections {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $rest ) = $line =~ m/^(.*?)=(.*)$/;
+      if ( $key && exists $modes{$key} ) {
+         $modes{$key}->{connections} = [ grep { exists $connections{$_} } split(/ /, $rest) ];
+      }
+   }
+}
+
+# save_config_active_connections {{{3
+# Writes one "mode=conn conn ..." line to $file for every mode, in sorted
+# order of mode name, using the list returned by get_connections($mode).
+sub save_config_active_connections {
+   my ( $file ) = @_;
+   for my $mode ( sort keys %modes ) {
+      my @active = get_connections($mode);
+      print $file "$mode=", join(' ', @active), "\n";
+   }
+}
+
+# load_config_stmt_sleep_times {{{3
+# Reads "key=value" lines from $file until the next line starting with "[",
+# skipping "#" comment lines, and stores every value that matches $num_regex
+# in %stmt_sleep_time_for.
+sub load_config_stmt_sleep_times {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $val ) = split('=', $line);
+      if ( $key && defined $val && $val =~ m/$num_regex/ ) {
+         $stmt_sleep_time_for{$key} = $val;
+      }
+   }
+}
+
+# save_config_stmt_sleep_times {{{3
+# Writes one "key=value" line to $file for every entry of
+# %stmt_sleep_time_for, in sorted order of key.
+sub save_config_stmt_sleep_times {
+   my ( $file ) = @_;
+   for my $key ( sort keys %stmt_sleep_time_for ) {
+      my $time = $stmt_sleep_time_for{$key};
+      print $file "$key=$time\n";
+   }
+}
+
+# load_config_mvs {{{3
+# Reads "key=value" lines from $file until the next line starting with "[",
+# skipping "#" comment lines, and stores every value that matches $num_regex
+# in %mvs.
+sub load_config_mvs {
+   my ( $file ) = @_;
+   while ( defined( my $line = <$file> ) ) {
+      chomp $line;
+      last if $line =~ m/^\[/;
+      next if $line =~ m/^#/;
+
+      my ( $key, $val ) = split('=', $line);
+      if ( $key && defined $val && $val =~ m/$num_regex/ ) {
+         $mvs{$key} = $val;
+      }
+   }
+}
+
+# save_config_mvs {{{3
+# Writes one "key=value" line to $file for every entry of %mvs, in sorted
+# order of key.
+sub save_config_mvs {
+   my ( $file ) = @_;
+   for my $key ( sort keys %mvs ) {
+      my $val = $mvs{$key};
+      print $file "$key=$val\n";
+   }
+}
+
+# edit_configuration {{{3
+# Top-level configuration menu.  Repeatedly clears the screen, runs the
+# %cfg_editor_action entry bound to the last key pressed (if any), shows the
+# list of available actions and waits for the next key, until 'q' is pressed.
+sub edit_configuration {
+   my $key = '';
+   until ( $key eq 'q' ) {
+      $clear_screen_sub->();
+
+      # Run the action chosen on the previous pass before redrawing the menu.
+      my $action = $key && $cfg_editor_action{$key};
+      $action->{func}->() if $action;
+
+      # Show help
+      my @choices = sort keys %cfg_editor_action;
+      my %labels  = map { $_ => $_ } keys %cfg_editor_action;
+      my %notes   = map { $_ => $cfg_editor_action{$_}->{note} } keys %cfg_editor_action;
+      my @display_lines = (
+         '',
+         create_caption('What configuration do you want to edit?',
+            create_table2(\@choices, \%labels, \%notes, { sep => '  ' })),
+      );
+
+      draw_screen(\@display_lines);
+      $key = pause('');
+   }
+}
+
+# edit_configuration_variables {{{3
+# Prompts for the name of a configuration variable and, if the name is one of
+# the offered choices, hands it to get_config_interactive().  The choices are
+# the %config entries whose conf property is 'ALL' or lists the current mode.
+sub edit_configuration_variables {
+   $clear_screen_sub->();
+   my $mode = $config{mode}->{val};
+
+   # Only config values that are marked as applying to this mode.
+   my %config_choices;
+   foreach my $name ( keys %config ) {
+      my $conf = $config{$name}->{conf};
+      next unless $conf;
+      next unless $conf eq 'ALL' || grep { $mode eq $_ } @{$conf};
+      $config_choices{$name} = $config{$name}->{note} || '';
+   }
+
+   my $key = prompt_list(
+      "Enter the name of the variable you wish to configure",
+      '',
+      sub { return keys %config_choices },
+      \%config_choices);
+
+   get_config_interactive($key) if exists($config_choices{$key});
+}
+
+# edit_color_rules {{{3
+# Interactive editor for the color rules of one table.  $tbl is optional; when
+# it is not given the user is asked to choose a visible table.  Loops until
+# 'q' is pressed: '?' shows the key mappings, any key found in
+# %color_editor_action runs that action, and the rule list is redrawn after
+# each key.  On exit the table's color_func is rebuilt with make_color_func().
+sub edit_color_rules {
+   my ( $tbl ) = @_;
+   $clear_screen_sub->();
+   $tbl ||= choose_visible_table();
+   if ( $tbl && exists($tbl_meta{$tbl}) ) {
+      my $meta = $tbl_meta{$tbl};
+      # The first, unnamed column holds the '>' marker for the selected rule.
+      my @cols = ('', qw(col op arg color));
+      my $info = { map { $_ => { hdr => $_, just => '-', } }  @cols };
+      $info->{label}->{maxw} = 30;
+      my $key;
+      my $selected_rule;
+
+      # This loop builds a tabular view of the rules.
+      do {
+
+         # Show help
+         if ( $key && $key eq '?' ) {
+            my @display_lines = '';
+            push @display_lines, create_caption('Editor key mappings',
+            create_table2(
+               [ sort keys %color_editor_action ],
+               { map { $_ => $_ } keys %color_editor_action },
+               { map { $_ => $color_editor_action{$_}->{note} } keys %color_editor_action },
+               { sep => '  ' }));
+            draw_screen(\@display_lines);
+            pause();
+            # Clear the key so the next pass redraws the rule list.
+            $key = '';
+         }
+         else {
+
+            # Do the action specified
+            $selected_rule ||= 0;
+            if ( $key && $color_editor_action{$key} ) {
+               # The action is given the table and the selected index, and its
+               # return value becomes the new selected index.
+               $selected_rule = $color_editor_action{$key}->{func}->($tbl, $selected_rule);
+               $selected_rule ||= 0;
+            }
+
+            # Build the table of rules.  If the terminal has color, the selected rule
+            # will be highlighted; otherwise a > at the left will indicate.
+            my $data = $meta->{colors} || [];
+            foreach my $i ( 0..@$data - 1  ) {
+               $data->[$i]->{''} = $i == $selected_rule ? '>' : '';
+            }
+            my @display_lines = create_table(\@cols, $info, $data);
+
+            # Highlight selected entry
+            for my $i ( 0 .. $#display_lines ) {
+               if ( $display_lines[$i] =~ m/^>/ ) {
+                  $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+               }
+            }
+
+            # Draw the screen and wait for a command.
+            unshift @display_lines, '',
+               "Editing color rules for $meta->{capt}.  Press ? for help, q to "
+               . "quit.", '';
+            draw_screen(\@display_lines);
+            print "\n\n", word_wrap('Rules are applied in order from top to '
+               . 'bottom.  The first matching rule wins and prevents the '
+               . 'rest of the rules from being applied.');
+            $key = pause('');
+         }
+      } while ( $key ne 'q' );
+      $meta->{color_func} = make_color_func($meta);
+   }
+}
+
+# add_quick_filter {{{3
+# Asks the user to pick a visible table and to enter "<column> <pattern>",
+# then compiles a filter that keeps rows whose column is defined and matches
+# the pattern.  The filter is stored in %filters as "quick_<table>.<column>",
+# flagged as a quick-filter, and appended to the table's active filters.
+# Nothing is added if the input is incomplete, the column is unknown, or the
+# filter does not compile.
+sub add_quick_filter {
+   my $tbl = choose_visible_table();
+   if ( $tbl && exists($tbl_meta{$tbl}) ) {
+      print "\n";
+      my $list_columns = sub { return keys %{$tbl_meta{$tbl}->{cols}} };
+      my $response     = prompt_list("Enter column name and filter text", '', $list_columns);
+      my ( $col, $text ) = split(/\s+/, $response, 2);
+
+      # You can't filter on a nonexistent column.  But if you filter on a pivoted
+      # table, the columns are different, so on a pivoted table, allow filtering
+      # on the 'name' column.
+      # NOTE: if a table is pivoted and un-pivoted, this will likely cause crashes.
+      # Currently not an issue since there's no way to toggle pivot/nopivot.
+      return unless $col && $text;
+      return unless exists($tbl_meta{$tbl}->{cols}->{$col})
+         || ($tbl_meta{$tbl}->{pivot} && $col eq 'name');
+
+      my $code = "defined \$set->{$col} && \$set->{$col} =~ m/$text/";
+      my ( $sub, $err ) = compile_filter( $code );
+      return if !$sub || $err;
+
+      my $name = "quick_$tbl.$col";
+      $filters{$name} = {
+         func  => $sub,
+         text  => $text,
+         user  => 1,
+         quick => 1,
+         name  => $name,
+         note  => 'Quick-filter',
+         tbls  => [$tbl],
+      };
+      push @{$tbl_meta{$tbl}->{filters}}, $name;
+   }
+}
+
+# clear_quick_filters {{{3
+# Asks the user to pick one of the visible tables that currently has at least
+# one quick-filter, then removes every quick-filter from that table's list of
+# active filters.
+sub clear_quick_filters {
+   # Only tables that have quick-filters
+   my $has_quick_filters = sub {
+      my ( $candidate ) = @_;
+      return scalar grep { $filters{$_}->{quick} } @{ $tbl_meta{$candidate}->{filters} };
+   };
+   my $tbl = choose_visible_table($has_quick_filters);
+   if ( $tbl && exists($tbl_meta{$tbl}) ) {
+      my @current = @{$tbl_meta{$tbl}->{filters}};
+      my @kept    = grep { !$filters{$_}->{quick} } @current;
+      $tbl_meta{$tbl}->{filters} = \@kept;
+   }
+}
+
+# Interactive plugin manager.  Displays the values of %plugins as a table (one
+# row per plugin, ordered by hash key) with a '>' marker on the selected row,
+# and dispatches each keypress through %plugin_editor_action until 'q' is
+# pressed.  '?' shows the key-mapping help instead.
+sub edit_plugins {
+   $clear_screen_sub->();
+
+   my @cols = ('', qw(class desc active));
+   my $info = { map { $_ => { hdr => $_, just => '-', } }  @cols };
+   # The rows are the %plugins values themselves, so the marker column written
+   # below (key '') is stored in those same hashes.
+   my @rows = map { $plugins{$_} } sort keys %plugins;
+   my $key;
+   my $selected;
+
+   # This loop builds a tabular view of the plugins.
+   do {
+
+      # Show help
+      if ( $key && $key eq '?' ) {
+         my @display_lines = '';
+         push @display_lines, create_caption('Editor key mappings',
+         create_table2(
+            [ sort keys %plugin_editor_action ],
+            { map { $_ => $_ } keys %plugin_editor_action },
+            { map { $_ => $plugin_editor_action{$_}->{note} } keys %plugin_editor_action },
+            { sep => '  ' }));
+         draw_screen(\@display_lines);
+         pause();
+         # Reset the key so the next pass redraws the table rather than the help.
+         $key = '';
+      }
+
+      # Do the action specified
+      else {
+         $selected ||= 0;
+         if ( $key && $plugin_editor_action{$key} ) {
+            # The action returns the new selected row index; a false value
+            # falls back to row 0.
+            $selected = $plugin_editor_action{$key}->{func}->(\@rows, $selected);
+            $selected ||= 0;
+         }
+
+         # Build the table of plugins.
+         foreach my $row ( 0.. $#rows ) {
+            $rows[$row]->{''} = $row eq $selected ? '>' : ' ';
+         }
+         my @display_lines = create_table(\@cols, $info, \@rows);
+
+         # Highlight selected entry
+         for my $i ( 0 .. $#display_lines ) {
+            if ( $display_lines[$i] =~ m/^>/ ) {
+               $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+            }
+         }
+
+         # Draw the screen and wait for a command.
+         unshift @display_lines, '',
+            "Plugin Management.  Press ? for help, q to quit.", '';
+         draw_screen(\@display_lines);
+         $key = pause('');
+      }
+   } while ( $key ne 'q' );
+}
+
+# edit_table {{{3
+# Interactive editor for one table's definition.  Takes an optional table name;
+# without one, the user picks a visible table.  Shows each visible column of the
+# table as a row (name, hdr, label, src) with a '>' marker on the selected
+# column, and dispatches keypresses through %tbl_editor_action until 'q' is
+# pressed.  '?' shows the key-mapping help instead.
+sub edit_table {
+   $clear_screen_sub->();
+   my ( $tbl ) = @_;
+   $tbl ||= choose_visible_table();
+   if ( $tbl && exists($tbl_meta{$tbl}) ) {
+      my $meta = $tbl_meta{$tbl};
+      my @cols = ('', qw(name hdr label src));
+      my $info = { map { $_ => { hdr => $_, just => '-', } }  @cols };
+      $info->{label}->{maxw} = 30;
+      my $key;
+      my $selected_column;
+
+      # This loop builds a tabular view of the tbl_meta's structure, showing each column
+      # in the entry as a row.
+      do {
+
+         # Show help
+         if ( $key && $key eq '?' ) {
+            my @display_lines = '';
+            push @display_lines, create_caption('Editor key mappings',
+            create_table2(
+               [ sort keys %tbl_editor_action ],
+               { map { $_ => $_ } keys %tbl_editor_action },
+               { map { $_ => $tbl_editor_action{$_}->{note} } keys %tbl_editor_action },
+               { sep => '  ' }));
+            draw_screen(\@display_lines);
+            pause();
+            # Reset the key so the next pass redraws the table rather than the help.
+            $key = '';
+         }
+         else {
+
+            # Do the action specified
+            # The selection defaults to the first visible column, both initially
+            # and whenever an action returns a false value.
+            $selected_column ||= $meta->{visible}->[0];
+            if ( $key && $tbl_editor_action{$key} ) {
+               $selected_column = $tbl_editor_action{$key}->{func}->($tbl, $selected_column);
+               $selected_column ||= $meta->{visible}->[0];
+            }
+
+            # Build the pivoted view of the table's meta-data.  If the terminal has color,
+            # The selected row will be highlighted; otherwise a > at the left will indicate.
+            my $data = [];
+            foreach my $row ( @{$meta->{visible}} ) {
+               my %hash;
+               # Copy the displayed attributes from the column definition.
+               @hash{ @cols } = @{$meta->{cols}->{$row}}{@cols};
+               # A reference in {src} is shown as an empty string.
+               $hash{src}  = '' if ref $hash{src};
+               $hash{name} = $row;
+               $hash{''}   = $row eq $selected_column ? '>' : ' ';
+               push @$data, \%hash;
+            }
+            my @display_lines = create_table(\@cols, $info, $data);
+
+            # Highlight selected entry
+            for my $i ( 0 .. $#display_lines ) {
+               if ( $display_lines[$i] =~ m/^>/ ) {
+                  $display_lines[$i] = [ $display_lines[$i], 'reverse' ];
+               }
+            }
+
+            # Draw the screen and wait for a command.
+            unshift @display_lines, '',
+               "Editing table definition for $meta->{capt}.  Press ? for help, q to quit.", '';
+            draw_screen(\@display_lines, { clear => 1 });
+            $key = pause('');
+         }
+      } while ( $key ne 'q' );
+   }
+}
+
+# choose_mode_tables {{{3
+# Choose which table(s), and in what order, to display in a given mode.
+# Prompts for the list of tables to show in the current mode and stores the
+# answer (unknown names dropped, duplicates removed) as the mode's
+# visible_tables, marking that setting as customized.
+sub choose_mode_tables {
+   my $mode     = $config{mode}->{val};
+   my @current  = @{$modes{$mode}->{visible_tables}};
+   my %captions = map { $_ => $tbl_meta{$_}->{capt} } @{$modes{$mode}->{tables}};
+
+   my $answer = prompt_list(
+      "Choose tables to display",
+      join(' ', @current),
+      sub { return @{$modes{$mode}->{tables}} },
+      \%captions,
+   );
+
+   # Keep only names that refer to a known table, preserving the entered order.
+   my @known = grep { $_ && exists $tbl_meta{$_} } split(/\s+/, $answer);
+   $modes{$mode}->{visible_tables} = [ unique(@known) ];
+   $modes{$mode}->{cust}->{visible_tables} = 1;
+}
+
+# choose_visible_table {{{3
+# Returns the name of one of the current mode's visible tables.  An optional
+# code ref narrows the candidates (it is called with each table name and the
+# table is kept when it returns true).  The user is only prompted when more
+# than one candidate remains; with none, the result is undef.
+sub choose_visible_table {
+   my ( $grep_cond ) = @_;
+   my $mode = $config{mode}->{val};
+
+   my @tbls = grep { $grep_cond ? $grep_cond->($_) : 1 }
+              @{$modes{$mode}->{visible_tables}};
+
+   # Zero or one candidate: there is nothing to ask.
+   return $tbls[0] unless @tbls > 1;
+
+   my %captions = map { $_ => $tbl_meta{$_}->{capt} } @tbls;
+   my $tbl = prompt_list(
+      "Choose a table",
+      '',
+      sub { return @tbls },
+      \%captions,
+   );
+   return $tbl;
+}
+
+# Flips the {aggregate} flag of a table.  Takes an optional table name; without
+# one, the user picks a visible table.
+sub toggle_aggregate {
+   my ( $tbl ) = @_;
+   $tbl = choose_visible_table() unless $tbl;
+   return if !$tbl || !exists $tbl_meta{$tbl};
+   $tbl_meta{$tbl}->{aggregate} ^= 1;
+}
+
+# Lets the user choose which filters are active on a table.  Takes an optional
+# table name; without one, the user picks a visible table.  Names that do not
+# exist yet can be created on the spot.  Only filters that exist and list this
+# table in their {tbls} end up in the table's filter list.
+sub choose_filters {
+   my ( $tbl ) = @_;
+   $tbl = choose_visible_table() unless $tbl;
+   return if !$tbl || !exists $tbl_meta{$tbl};
+   my $meta = $tbl_meta{$tbl};
+   $clear_screen_sub->();
+
+   print "Choose filters for $meta->{capt}:\n";
+
+   # Counts how often this table appears in the named filter's {tbls}; used as
+   # a boolean "this filter applies here".
+   my $applies_here = sub {
+      my ( $name ) = @_;
+      return scalar grep { $tbl eq $_ } @{$filters{$name}->{tbls}};
+   };
+
+   my $ini   = join(' ', @{$meta->{filters}});
+   my %notes = map  { $_ => $filters{$_}->{note} }
+               grep { $applies_here->($_) }
+               keys %filters;
+   my $val = prompt_list(
+      'Choose filters',
+      $ini,
+      sub { return keys %filters },
+      \%notes,
+   );
+
+   # Offer to create every entered name that is not a known filter.  The list
+   # of missing names is computed once, before any filter is created.
+   my @choices = unique(split(/\s+/, $val));
+   my @missing = grep { !exists($filters{$_}) } @choices;
+   foreach my $new ( @missing ) {
+      my $answer = prompt("There is no filter called '$new'.  Create it?", undef, 'y');
+      create_new_filter($new, $tbl) if $answer eq 'y';
+   }
+
+   $meta->{filters} = [ grep { exists $filters{$_} && $applies_here->($_) } @choices ];
+   $meta->{cust}->{filters} = 1;
+}
+
+# Prompts for a table's group-by columns.  Takes an optional table name;
+# without one, the user picks a visible table.  When the answer differs from
+# the current setting, the known column names found in it become the new
+# group_by list and the setting is marked as customized.
+sub choose_group_cols {
+   my ( $tbl ) = @_;
+   $tbl = choose_visible_table() unless $tbl;
+   return if !$tbl || !exists $tbl_meta{$tbl};
+   $clear_screen_sub->();
+   my $meta = $tbl_meta{$tbl};
+
+   my $curr   = join(', ', @{$meta->{group_by}});
+   my %labels = map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}};
+   my $val    = prompt_list(
+      'Group-by columns',
+      $curr,
+      sub { return keys %{$meta->{cols}} },
+      \%labels,
+   );
+
+   unless ( $curr eq $val ) {
+      # Pull out each word, then keep only those naming an existing column.
+      my @words = $val =~ m/(\w+)/g;
+      $meta->{group_by} = [ grep { exists $meta->{cols}->{$_} } @words ];
+      $meta->{cust}->{group_by} = 1;
+   }
+}
+
+# Prompts for a table's sort columns.  Takes an optional table name; without
+# one, the user picks a visible table.  When the answer differs from the
+# current setting it is stored, marked as customized, and the table's sort
+# function is rebuilt.
+sub choose_sort_cols {
+   my ( $tbl ) = @_;
+   $tbl = choose_visible_table() unless $tbl;
+   return if !$tbl || !exists $tbl_meta{$tbl};
+   $clear_screen_sub->();
+   my $meta = $tbl_meta{$tbl};
+
+   # Completion list and hints.  A pivoted table only offers 'name' and 'set_0';
+   # otherwise the table's own columns and their labels are offered.
+   my $cols = sub { return keys %{$meta->{cols}} };
+   my $hints;
+   if ( $meta->{pivot} ) {
+      $cols  = sub { qw(name set_0) };
+      $hints = { name => 'name', set_0 => 'set_0' };
+   }
+   else {
+      my %labels = map { $_ => $meta->{cols}->{$_}->{label} } keys %{$meta->{cols}};
+      $hints = \%labels;
+   }
+
+   my $val = prompt_list(
+      'Sort columns (reverse sort with -col)',
+      $meta->{sort_cols},
+      $cols,
+      $hints,
+   );
+
+   unless ( $meta->{sort_cols} eq $val ) {
+      $meta->{sort_cols}         = $val;
+      $meta->{cust}->{sort_cols} = 1;
+      $tbl_meta{$tbl}->{sort_func} = make_sort_func($tbl_meta{$tbl});
+   }
+}
+
+# create_new_filter {{{3
+# Interactively creates a user-defined filter and stores it in %filters.
+# Arguments: the filter name (re-prompted if empty or containing a non-word
+# character) and the table the filter applies to.  The user is asked for the
+# filter body repeatedly until compile_filter reports no error.
+sub create_new_filter {
+   my ( $filter, $tbl ) = @_;
+   $clear_screen_sub->();
+
+   # NOTE(review): the help text says "lowercase letters", but the \W check
+   # below also accepts uppercase letters -- confirm which one is intended.
+   if ( !$filter || $filter =~ m/\W/ ) {
+      print word_wrap("Choose a name for the filter.  This name is not displayed, and is only used "
+            . "for internal reference.  It can only contain lowercase letters, numbers, and underscores.");
+      print "\n\n";
+      do {
+         $filter = prompt("Enter filter name");
+      } while ( !$filter || $filter =~ m/\W/ );
+   }
+
+   # Tab-completion offers the column names of the target table.
+   my $completion = sub { keys %{$tbl_meta{$tbl}->{cols}} };
+   my ( $err, $sub, $body );
+   do {
+      $clear_screen_sub->();
+      print word_wrap("A filter is a Perl subroutine that accepts a hashref of columns "
+         . "called \$set, and returns a true value if the filter accepts the row.  Example:\n"
+         . "   \$set->{active_secs} > 5\n"
+         . "will only allow rows if their active_secs column is greater than 5.");
+      print "\n\n";
+      # On a retry, show why the previous attempt was rejected.
+      if ( $err ) {
+         print "There's an error in your filter expression: $err\n\n";
+      }
+      $body = prompt("Enter subroutine body", undef, undef, $completion);
+      ( $sub, $err ) = compile_filter($body);
+   } while ( $err );
+
+   # Register the filter; it is associated with this one table only.
+   $filters{$filter} = {
+      func => $sub,
+      text => $body,
+      user => 1,
+      name => $filter,
+      note => 'User-defined filter',
+      tbls => [$tbl],
+   };
+}
+
+# get_config_interactive {{{3
+# Prompts for a new value of the configuration variable named by the first
+# argument and stores the answer in $config{$key}->{val}.  An array-ref value
+# is presented to the user as a space-joined string.
+sub get_config_interactive {
+   my ( $key ) = @_;
+   $clear_screen_sub->();
+
+   # Print help first.
+   print "Enter a new value for '$key' ($config{$key}->{note}).\n";
+
+   my $current = $config{$key}->{val};
+   if ( ref $current ) {
+      $current = join(" ", @$current);
+   }
+
+   # The variable's {pat} is handed to prompt() along with the current value.
+   $config{$key}->{val} = prompt('Enter a value', $config{$key}->{pat}, $current);
+}
+
+# Lets the user edit the text of the variable set selected for the current
+# mode (config variable "<mode>_set").  Re-prompts until a true value is
+# entered; a changed text is stored and the set is flagged as user-defined.
+sub edit_current_var_set {
+   my $mode      = $config{mode}->{val};
+   my $name      = $config{"${mode}_set"}->{val};
+   my $variables = $var_sets{$name}->{text};
+
+   my $new;
+   until ( $new ) {
+      $clear_screen_sub->();
+      $new = prompt("Enter variables for $name", undef, $variables);
+   }
+
+   @{$var_sets{$name}}{qw(text user)} = ( $new, 1) if $new ne $variables;
+}
+
+
+# Prompts for the variable set to use for the config variable named by the
+# first argument.  An unknown name is passed to add_new_var_set; the config
+# value is only updated if the chosen set exists afterwards.
+sub choose_var_set {
+   my ( $key ) = @_;
+   $clear_screen_sub->();
+
+   my %text_of   = map { $_ => $var_sets{$_}->{text} } keys %var_sets;
+   my $new_value = prompt_list(
+      'Choose a set of values to display, or enter the name of a new one',
+      $config{$key}->{val},
+      sub { return keys %var_sets },
+      \%text_of,
+   );
+
+   add_new_var_set($new_value) unless exists $var_sets{$new_value};
+
+   if ( exists $var_sets{$new_value} ) {
+      $config{$key}->{val} = $new_value;
+   }
+}
+
+# Moves the config variable $cfg_var to another variable set, $dir positions
+# away from the current one in the name-sorted list, wrapping around at the
+# ends.  Clears the screen afterwards.
+sub switch_var_set {
+   my ( $cfg_var, $dir ) = @_;
+   my @names   = sort keys %var_sets;
+   my $current = $config{$cfg_var}->{val};
+
+   # The number of names sorting before the current one is its position.
+   my $index = 0;
+   foreach my $name ( @names ) {
+      $index++ if $name lt $current;
+   }
+
+   $config{$cfg_var}->{val} = $names[ ($index + $dir) % @names ];
+   $clear_screen_sub->();
+}
+
+# Online configuration and prompting functions {{{2
+
+# edit_stmt_sleep_times {{{3
+# Prompts for a statement name (a key of %stmt_maker_for) and a sleep delay for
+# it.  A true delay is stored in %stmt_sleep_time_for; a false one (empty or 0)
+# removes the entry.
+sub edit_stmt_sleep_times {
+   $clear_screen_sub->();
+   my $stmt = prompt_list('Specify a statement', '', sub { return sort keys %stmt_maker_for });
+   return if !$stmt || !exists $stmt_maker_for{$stmt};
+
+   $clear_screen_sub->();
+   my $current = $stmt_sleep_time_for{$stmt} || 0;
+   my $delay   = prompt('Specify a sleep delay after calling this SQL', $num_regex, $current);
+   if ( !$delay ) {
+      delete $stmt_sleep_time_for{$stmt};
+   }
+   else {
+      $stmt_sleep_time_for{$stmt} = $delay;
+   }
+}
+
+# edit_server_groups {{{3
+# Edits the membership of a server group: the user first picks (or creates) a
+# group, then picks the connections that belong to it.
+sub edit_server_groups {
+   $clear_screen_sub->();
+   my $mode   = $config{mode}->{val};
+   my $group  = $modes{$mode}->{server_group};
+
+   # Snapshot the groups that existed before prompting, to tell an existing
+   # group apart from one created during the prompt.
+   my %before = %server_groups;
+   my $chosen = choose_or_create_server_group($group, 'to edit');
+   $clear_screen_sub->();
+
+   if ( exists $before{$chosen} ) {
+      # Don't do this step if the user just created a new server group,
+      # because part of that process was to choose connections.
+      my $current = join(' ', @{$server_groups{$chosen}});
+      my @members = choose_or_create_connection($current, 'for this group');
+      $server_groups{$chosen} = \@members;
+   }
+}
+
+# choose_server_groups {{{3
+# Prompts for the server group to use in the current mode and stores it, but
+# only if the chosen group actually exists.
+sub choose_server_groups {
+   $clear_screen_sub->();
+   my $mode   = $config{mode}->{val};
+   my $group  = $modes{$mode}->{server_group};
+   my $chosen = choose_or_create_server_group($group, 'for this mode');
+   if ( exists $server_groups{$chosen} ) {
+      $modes{$mode}->{server_group} = $chosen;
+   }
+}
+
+# Asks the user for a server group and returns its name.  $group is the
+# pre-filled default and $prompt is appended to the question.  If no groups
+# exist, creation starts immediately; if the entered name is unknown, the user
+# is offered to create it.  The returned name may still not exist when the
+# user declines, so callers check %server_groups themselves.
+sub choose_or_create_server_group {
+   my ( $group, $prompt ) = @_;
+   my $new = '';
+
+   my @available = sort keys %server_groups;
+
+   if ( !@available ) {
+      $new = add_new_server_group();
+   }
+   else {
+      print "You can enter the name of a new group to create it.\n";
+
+      my %members_of = map { $_ => join(' ', @{$server_groups{$_}}) } @available;
+      $new = prompt_list(
+         "Choose a server group $prompt",
+         $group,
+         sub { return @available },
+         \%members_of,
+      );
+
+      # Discard everything from the first whitespace character onward.
+      $new =~ s/\s.*//;
+
+      unless ( exists $server_groups{$new} ) {
+         my $answer = prompt("There is no server group called '$new'.  Create it?", undef, "y");
+         add_new_server_group($new) if $answer eq 'y';
+      }
+   }
+   return $new;
+}
+
+# Asks the user for a whitespace-separated list of connection names and returns
+# the unique names that exist in %connections.  $cxns is the pre-filled default
+# and $prompt is appended to the question.  For each unknown name the user is
+# offered to create it before the final list is built.
+sub choose_or_create_connection {
+   my ( $cxns, $prompt ) = @_;
+   print "You can enter the name of a new connection to create it.\n";
+
+   my @available = sort keys %connections;
+   my %dsn_for   = map { $_ => $connections{$_}->{dsn} } @available;
+   my $new_cxns  = prompt_list(
+      "Choose connections $prompt",
+      $cxns,
+      sub { return @available },
+      \%dsn_for,
+   );
+
+   my @unknown = unique(grep { !exists $connections{$_} } split(/\s+/, $new_cxns));
+   foreach my $name ( @unknown ) {
+      my $answer = prompt("There is no connection called '$name'.  Create it?", undef, "y");
+      add_new_dsn($name) if $answer eq 'y';
+   }
+
+   # Re-evaluate the answer now that some connections may have been created.
+   return unique(grep { exists $connections{$_} } split(/\s+/, $new_cxns));
+}
+
+# choose_servers {{{3
+# Prompts for the connections to use in the current mode, pre-filled with the
+# result of get_connections(), and stores the chosen list on the mode.
+sub choose_servers {
+   $clear_screen_sub->();
+   my $mode   = $config{mode}->{val};
+   my @chosen = choose_or_create_connection(join(' ', get_connections()), 'for this mode');
+   $modes{$mode}->{connections}  = \@chosen;
+   $modes{$mode}->{server_group} = ''; # Clear this because it overrides {connections}
+}
+
+# display_license {{{3
+# Clears the screen, prints the text held in $innotop_license, and waits for
+# the user via pause().
+sub display_license {
+   $clear_screen_sub->();
+
+   print $innotop_license;
+
+   pause();
+}
+
+# Data-retrieval functions {{{2
+# get_status_info {{{3
+# Fetches SHOW STATUS and SHOW VARIABLES for each given connection and merges
+# both into that connection's snapshot for the current clock tick
+# ($vars{$cxn}->{$clock}).  Does nothing when $info_gotten{status} is already
+# set; each name maps to its value, with false values stored as 0.
+sub get_status_info {
+   my @cxns = @_;
+   unless ( $info_gotten{status}++ ) {
+      foreach my $cxn ( @cxns ) {
+         $vars{$cxn}->{$clock} ||= {};
+         my $snapshot = $vars{$cxn}->{$clock};
+
+         # Status counters first; skip the connection if the statement fails.
+         my $sth  = do_stmt($cxn, 'SHOW_STATUS') or next;
+         my $rows = $sth->fetchall_arrayref();
+         map { $snapshot->{$_->[0]} = $_->[1] || 0 } @$rows;
+
+         # Calculate hi-res uptime and add cxn to the hash.  This duplicates get_driver_status,
+         # but it's most important to have consistency.
+         $snapshot->{Uptime_hires} ||= get_uptime($cxn);
+         $snapshot->{cxn} = $cxn;
+
+         # Add SHOW VARIABLES to the hash
+         $sth  = do_stmt($cxn, 'SHOW_VARIABLES') or next;
+         $rows = $sth->fetchall_arrayref();
+         map { $snapshot->{$_->[0]} = $_->[1] || 0 } @$rows;
+      }
+   }
+}
+
+# Chooses a thread for explaining, killing, etc...
+# Arguments: a code ref that is called with each entry of @current_queries and
+# returns true for acceptable entries, and the prompt text used when asking for
+# a thread.  Returns the info hash of the chosen entry, or nothing when there
+# are no candidates or the user's answers are not usable.
+sub choose_thread {
+   my ( $grep_cond, $prompt ) = @_;
+
+   # Narrow the list to queries that can be explained.
+   # Map each connection to the thread id of its own database handle (0 when
+   # there is no handle), so those threads can be excluded below.
+   my %thread_for = map {
+      # Eliminate innotop's own threads.
+      $_ => $dbhs{$_}->{dbh} ? $dbhs{$_}->{dbh}->{mysql_thread_id} : 0
+   } keys %connections;
+
+   my @candidates = grep {
+      $_->{id} != $thread_for{$_->{cxn}} && $grep_cond->($_)
+   } @current_queries;
+   return unless @candidates;
+
+   # Find out which server.
+   my @cxns = unique map { $_->{cxn} } @candidates;
+   my ( $cxn ) = select_cxn('On which server', @cxns);
+   return unless $cxn && exists($connections{$cxn});
+
+   # Re-filter the list of candidates to only those on this server
+   @candidates = grep { $_->{cxn} eq $cxn } @candidates;
+
+   # Find out which thread to do.
+   my $info;
+   if ( @candidates > 1 ) {
+
+      # Sort longest-active first, then longest-idle.
+      # The comparator ranks entries with a query above those without and
+      # otherwise compares {time} ascending; the list is reversed afterwards.
+      my $sort_func = sub {
+         my ( $a, $b ) = @_;
+         return  $a->{query} && !$b->{query} ? 1
+               : $b->{query} && !$a->{query} ? -1
+               : ($a->{time} || 0) <=> ($b->{time} || 0);
+      };
+      my @threads = map { $_->{id} } reverse sort { $sort_func->($a, $b) } @candidates;
+
+      print "\n";
+      # The first thread in sorted order is offered as the default answer.
+      my $thread = prompt_list($prompt,
+         $threads[0],
+         sub { return @threads });
+      return unless $thread && $thread =~ m/$int_regex/;
+
+      # Find the info hash of that query on that server.
+      ( $info ) = grep { $thread == $_->{id} } @candidates;
+   }
+   else {
+      $info = $candidates[0];
+   }
+   return $info;
+}
+
+# analyze_query {{{3
+# Lets the user pick a thread that has a query and then run analysis actions on
+# it: 'e' explains it, 'f' shows the full query, 'o' shows the optimized query.
+# The argument is the first action to run; afterwards the user is asked for
+# further actions until a key that is not one of the three is pressed.
+sub analyze_query {
+   my ( $action ) = @_;
+
+   my $has_query = sub { $_[0]->{query} };
+   my $info      = choose_thread($has_query, 'Select a thread to analyze');
+   return unless $info;
+
+   my %handler_for = (
+      e => \&display_explain,
+      f => \&show_full_query,
+      o => \&show_optimized_query,
+   );
+   while ( 1 ) {
+      $handler_for{$action}->($info);
+      print "\n";
+      $action = pause('Press e to explain, f for full query, o for optimized query');
+      last unless exists($handler_for{$action});
+   }
+}
+
+# inc {{{3
+# Returns the difference between two sets of variables/status/innodb stuff.
+# Arguments: $offset, the number of clock ticks to go back from the current
+# $clock, and $cxn, the connection whose snapshots (%vars{$cxn}, keyed by clock
+# tick) are used.
+#   * A negative $offset returns the current snapshot unchanged.
+#   * If the requested snapshot exists but the one before it does not, there is
+#     nothing to subtract, so the requested snapshot itself is returned.
+#   * Otherwise a new hashref is returned in which numeric values are the
+#     difference from the previous snapshot and other values are copied.
+sub inc {
+   my ( $offset, $cxn ) = @_;
+   my $vars = $vars{$cxn};
+   if ( $offset < 0 ) {
+      return $vars->{$clock};
+   }
+   elsif ( exists $vars->{$clock - $offset} && !exists $vars->{$clock - $offset - 1} ) {
+      # Both tests must look at this connection's snapshots ($vars->), not at
+      # the global %vars, which is keyed by connection name.
+      return $vars->{$clock - $offset};
+   }
+   my $cur = $vars->{$clock - $offset};
+   my $pre = $vars->{$clock - $offset - 1};
+   return {
+      # Numeric variables get subtracted, non-numeric get passed straight through.
+      map  {
+         $_ =>
+            ( (defined $cur->{$_} && $cur->{$_} =~ m/$num_regex/)
+            ?  $cur->{$_} - ($pre->{$_} || 0)
+            :  $cur->{$_} )
+      } keys %{$cur}
+   };
+}
+
+# extract_values {{{3
+# Computes the value of every column of table $tbl and returns them as a
+# hashref keyed by column name.
+# Arguments are a set of values (which may be incremental, derived from
+# current and previous), current, and previous values, plus the table name.
+# A column whose func dies yields 0 (numeric column) or '' instead, unless the
+# debug config variable is on, in which case the error is re-thrown.
+# TODO: there are a few places that don't remember prev set so can't pass it.
+sub extract_values {
+   my ( $set, $cur, $pre, $tbl ) = @_;
+
+   # Hook in event listeners
+   foreach my $listener ( @{$event_listener_for{extract_values}} ) {
+      $listener->extract_values($set, $cur, $pre, $tbl);
+   }
+
+   my $meta = $tbl_meta{$tbl};
+   my $cols = $meta->{cols};
+   my %value_for;
+   foreach my $key ( keys %$cols ) {
+      my $col = $cols->{$key}
+         or die "Column '$key' doesn't exist in $tbl";
+      $col->{func}
+         or die "No func defined for '$key' in $tbl";
+
+      eval {
+         $value_for{$key} = $col->{func}->($set, $cur, $pre)
+      };
+      next unless $EVAL_ERROR;
+
+      # The column function failed: re-throw in debug mode, else use a default.
+      die $EVAL_ERROR if $config{debug}->{val};
+      $value_for{$key} = $col->{num} ? 0 : '';
+   }
+   return \%value_for;
+}
+
+# get_full_processlist {{{3
+# Runs the PROCESSLIST statement on each given connection and returns all rows
+# as a flat list of hashrefs, each tagged with its connection name in {cxn}.
+# Connections whose statement fails are skipped.
+sub get_full_processlist {
+   my @cxns = @_;
+   my @result;
+   for my $cxn ( @cxns ) {
+      my $stmt   = do_stmt($cxn, 'PROCESSLIST') or next;
+      my $rows   = $stmt->fetchall_arrayref({});
+      my @tagged = map { $_->{cxn} = $cxn; $_ } @$rows;
+      push @result, @tagged;
+   }
+   return @result;
+}
+
+# get_open_tables {{{3
+# Runs the OPEN_TABLES statement on each given connection and returns all rows
+# as a flat list of hashrefs, each tagged with its connection name in {cxn}.
+# Connections whose statement fails are skipped.
+sub get_open_tables {
+   my @cxns = @_;
+   my @result;
+   for my $cxn ( @cxns ) {
+      my $stmt   = do_stmt($cxn, 'OPEN_TABLES') or next;
+      my $rows   = $stmt->fetchall_arrayref({});
+      my @tagged = map { $_->{cxn} = $cxn; $_ } @$rows;
+      push @result, @tagged;
+   }
+   return @result;
+}
+
+# get_innodb_status {{{3
+# Fetches and parses the InnoDB status text for each connection and merges the
+# parsed values into that connection's snapshot for the current clock tick.
+# Arguments: an arrayref of connection names and an arrayref of additional
+# status sections to parse.  Does nothing when the skip_innodb config variable
+# is set or when $info_gotten{innodb_status} is already set.
+sub get_innodb_status {
+   my ( $cxns, $addl_sections ) = @_;
+   if ( !$config{skip_innodb}->{val} && !$info_gotten{innodb_status}++ ) {
+
+      # Determine which sections need to be parsed
+      my %sections_required =
+         map  { $tbl_meta{$_}->{innodb} => 1 }
+         grep { $_ && $tbl_meta{$_}->{innodb} }
+         get_visible_tables();
+
+      # Add in any other sections the caller requested.
+      foreach my $sec ( @$addl_sections ) {
+         $sections_required{$sec} = 1;
+      }
+
+      foreach my $cxn ( @$cxns ) {
+         my $innodb_status_text;
+
+         if ( $file ) { # Try to fetch status text from the file.
+            my @stat = stat($file);
+
+            # Initialize the file.
+            if ( !$file_mtime ) {
+               # Seek to 128 KiB before the end of the file (the limit on the
+               # size of innodb status is 128k even with Google's patches)
+               # and try to grab the last status from the file.
+               sysseek($file, (-128 * 1_024), 2);
+            }
+
+            # Read from the file.
+            # Only re-read when the modification time ($stat[9]) has changed
+            # since the last read; otherwise the cached $file_data is reused.
+            my $buffer;
+            if ( !$file_mtime || $file_mtime != $stat[9] ) {
+               $file_data = '';
+               while ( sysread($file, $buffer, 4096) ) {
+                  $file_data .= $buffer;
+               }
+               $file_mtime = $stat[9];
+            }
+
+            # Delete everything but the last InnoDB status text from the file.
+            $file_data =~ s/\A.*(?=^=====================================\n...... ........ INNODB MONITOR OUTPUT)//ms;
+            $innodb_status_text = $file_data;
+         }
+
+         else {
+            my $stmt = do_stmt($cxn, 'INNODB_STATUS') or next;
+            $innodb_status_text = $stmt->fetchrow_hashref()->{status};
+         }
+
+         # Skip this connection unless the text starts (within its first 100
+         # characters) with the InnoDB monitor header.
+         next unless $innodb_status_text
+            && substr($innodb_status_text, 0, 100) =~ m/INNODB MONITOR OUTPUT/;
+
+         # Parse and merge into %vars storage
+         my %innodb_status = (
+            $innodb_parser->get_status_hash(
+               $innodb_status_text,
+               $config{debug}->{val},
+               \%sections_required,
+               0, # don't parse full lock information
+            )
+         );
+         # When the parser reports it did not get everything and auto_wipe_dl
+         # is enabled, call clear_deadlock for this connection.
+         if ( !$innodb_status{IB_got_all} && $config{auto_wipe_dl}->{val} ) {
+            clear_deadlock($cxn);
+         }
+
+         # Merge using a hash slice, which is the fastest way
+         $vars{$cxn}->{$clock} ||= {};
+         my $hash = $vars{$cxn}->{$clock};
+         @{$hash}{ keys %innodb_status } = values %innodb_status;
+         $hash->{cxn} = $cxn;
+         $hash->{Uptime_hires} ||= get_uptime($cxn);
+      }
+   }
+}
+
+# clear_deadlock {{{3
+# Deliberately provokes a deadlock on the given connection, using the table
+# configured as that connection's dl_table: the table is (re)created with two
+# rows, two child processes are forked to deadlock against each other, and the
+# table is dropped once both have exited.  Errors raised along the way are
+# printed and the user is asked to acknowledge them.
+# Does nothing if the connection has no dl_table or if a call is already in
+# progress ($clearing_deadlocks acts as a re-entrancy guard).
+sub clear_deadlock {
+   my ( $cxn ) = @_;
+
+   # Check for a configured table BEFORE taking the guard.  Taking it first and
+   # then returning here would leave the guard set, and every later call would
+   # return immediately.
+   my $tbl = $connections{$cxn}->{dl_table};
+   return unless $tbl;
+   return if $clearing_deadlocks++;
+
+   eval {
+      # Set up the table for creating a deadlock.
+      # A bare "return" here only leaves the eval block, so the guard below is
+      # still reset when a query fails.
+      my $engine = version_ge($dbhs{$cxn}->{dbh}, '4.1.2') ? 'engine' : 'type';
+      return unless do_query($cxn, "drop table if exists $tbl");
+      return unless do_query($cxn, "create table $tbl(a int) $engine=innodb");
+      return unless do_query($cxn, "delete from $tbl");
+      return unless do_query($cxn, "insert into $tbl(a) values(0), (1)");
+      return unless do_query($cxn, "commit"); # Or the children will block against the parent
+
+      # Fork off two children to deadlock against each other.
+      my %children;
+      foreach my $child ( 0..1 ) {
+         my $pid = fork();
+         if ( defined($pid) && $pid == 0 ) { # I am a child
+            deadlock_thread( $child, $tbl, $cxn );
+         }
+         elsif ( !defined($pid) ) {
+            die("Unable to fork for clearing deadlocks!\n");
+         }
+         # I already exited if I'm a child, so I'm the parent.
+         $children{$child} = $pid;
+      }
+
+      # Wait for the children to exit.
+      foreach my $child ( keys %children ) {
+         waitpid($children{$child}, 0);
+      }
+
+      # Clean up.
+      do_query($cxn, "drop table $tbl");
+   };
+   if ( $EVAL_ERROR ) {
+      print $EVAL_ERROR;
+      pause();
+   }
+
+   $clearing_deadlocks = 0;
+}
+
+# Runs the SHOW_MASTER_LOGS statement on each given connection and returns all
+# rows as a flat list of hashrefs.  Returns an empty list when
+# $info_gotten{master_logs} is already set; connections whose statement fails
+# are skipped.
+sub get_master_logs {
+   my @cxns = @_;
+   my @result;
+   return @result if $info_gotten{master_logs}++;
+
+   for my $cxn ( @cxns ) {
+      my $stmt = do_stmt($cxn, 'SHOW_MASTER_LOGS') or next;
+      push @result, @{$stmt->fetchall_arrayref({})};
+   }
+   return @result;
+}
+
+# get_master_slave_status {{{3
+# Fetches the first row of SHOW_MASTER_STATUS and of SHOW_SLAVE_STATUS for each
+# given connection and merges their columns into that connection's snapshot
+# for the current clock tick.  Does nothing when
+# $info_gotten{replication_status} is already set.
+sub get_master_slave_status {
+   my @cxns = @_;
+   if ( !$info_gotten{replication_status}++ ) {
+      foreach my $cxn ( @cxns ) {
+         $vars{$cxn}->{$clock} ||= {};
+         my $vars = $vars{$cxn}->{$clock};
+         $vars->{cxn} = $cxn;
+
+         # If the master-status statement fails, the rest of this connection
+         # (including slave status) is skipped.
+         my $stmt = do_stmt($cxn, 'SHOW_MASTER_STATUS') or next;
+         my $res = $stmt->fetchall_arrayref({})->[0];
+         # Merge the row's columns into the snapshot with a hash slice.
+         @{$vars}{ keys %$res } = values %$res;
+         $stmt = do_stmt($cxn, 'SHOW_SLAVE_STATUS') or next;
+         $res = $stmt->fetchall_arrayref({})->[0];
+         @{$vars}{ keys %$res } = values %$res;
+         $vars->{Uptime_hires} ||= get_uptime($cxn);
+      }
+   }
+}
+
+# Returns true if $word looks like something that can be used as a function.
+# Three checks are tried in order, and the first true one wins:
+#   1. a subroutine named $word is defined;
+#   2. a sub whose body is just $word compiles successfully;
+#   3. that compilation failed, but not with an error starting with "Bareword".
+# NOTE(review): $word is interpolated into a string eval, so this presumably
+# expects trusted input -- confirm against the callers.
+sub is_func {
+   my ( $word ) = @_;
+   return defined(&$word)
+      || eval "my \$x= sub { $word  }; 1"
+      || $EVAL_ERROR !~ m/^Bareword/;
+}
+
+# Documentation {{{1
+# ############################################################################
+# I put this last as per the Dog book.
+# ############################################################################
+=pod
+
+=head1 NAME
+
+innotop - MySQL and InnoDB transaction/status monitor.
+
+=head1 SYNOPSIS
+
+To monitor servers normally:
+
+ innotop
+
+To monitor InnoDB status information from a file:
+
+ innotop /var/log/mysql/mysqld.err
+
+To run innotop non-interactively in a pipe-and-filter configuration:
+
+ innotop --count 5 -d 1 -n
+
+=head1 DESCRIPTION
+
+innotop monitors MySQL servers.  Each of its modes shows you a different aspect
+of what's happening in the server.  For example, there's a mode for monitoring
+replication, one for queries, and one for transactions.  innotop refreshes its
+data periodically, so you see an updating view.
+
+innotop has lots of features for power users, but you can start and run it with
+virtually no configuration.  If you're just getting started, see
+L<"QUICK-START">.  Press '?' at any time while running innotop for
+context-sensitive help.
+
+=head1 QUICK-START
+
+To start innotop, open a terminal or command prompt.  If you have installed
+innotop on your system, you should be able to just type "innotop" and press
+Enter; otherwise, you will need to change to innotop's directory and type "perl
+innotop".
+
+The first thing innotop needs to know is how to connect to a MySQL server.  You
+can just enter the hostname of the server, for example "localhost" or
+"127.0.0.1" if the server is on the same machine as innotop.  After this innotop
+will prompt you for a DSN (data source name).  You should be able to just accept
+the defaults by pressing Enter.
+
+When innotop asks you about a table to use when resetting InnoDB deadlock
+information, just accept the default for now.  This is an advanced feature you
+can configure later (see L<"D: InnoDB Deadlocks"> for more).
+
+If you have a .my.cnf file with your MySQL connection defaults, innotop can read
+it, and you won't need to specify a username and password if it's in that file.
+Otherwise, you should answer 'y' to the next couple of prompts.
+
+After this, you should be connected, and innotop should show you something like
+the following:
+
+ InnoDB Txns (? for help) localhost, 01:11:19, InnoDB 10s :-), 50 QPS,
+ 
+ CXN        History  Versions  Undo  Dirty Buf  Used Bufs  Txns  MaxTxn
+ localhost        7      2035  0 0       0.00%     92.19%     1   07:34
+ 
+ CXN        ID     User   Host       Txn Status  Time   Undo  Query Tex
+ localhost  98379  user1  webserver  ACTIVE      07:34     0  SELECT `c
+ localhost  98450  user1  webserver  ACTIVE      01:06     0  INSERT IN
+ localhost  97750  user1  webserver  not starte  00:00     0      
+ localhost  98375  user1  appserver  not starte  00:00     0      
+
+(This sample is truncated at the right so it will fit on a terminal when running
+'man innotop'.)
+
+This sample comes from a quiet server with few transactions active.  If your
+server is busy, you'll see more output.  Notice the first line on the screen,
+which tells you what mode you're in and what server you're connected to.  You
+can change to other modes with keystrokes; press 'Q' to switch to a list of
+currently running queries.
+
+Press the '?' key to see what keys are active in the current mode.  You can
+press any of these keys and innotop will either take the requested action or
+prompt you for more input.  If your system has Term::ReadLine support, you can
+use TAB and other keys to auto-complete and edit input.
+
+To quit innotop, press the 'q' key.
+
+=head1 OPTIONS
+
+innotop is mostly configured via its configuration file, but some of the
+configuration options can come from the command line.  You can also specify a
+file to monitor for InnoDB status output; see L<"MONITORING A FILE"> for more
+details.
+
+You can negate some options by prefixing the option name with --no.  For
+example, --noinc (or --no-inc) negates L<"--inc">.
+
+=over
+
+=item --help
+
+Print a summary of command-line usage and exit.
+
+=item --color
+
+Enable or disable terminal coloring.  Corresponds to the L<"color"> config file
+setting.
+
+=item --config
+
+Specifies a configuration file to read.  This option is non-sticky, that is to
+say it does not persist to the configuration file itself.
+
+=item --nonint
+
+Enable non-interactive operation.  See L<"NON-INTERACTIVE OPERATION"> for more.
+
+=item --count
+
+Refresh only the specified number of times (ticks) before exiting.  Each refresh
+is a pause for L<"interval"> seconds, followed by requesting data from MySQL
+connections and printing it to the terminal.
+
+=item --delay
+
+Specifies the amount of time to pause between ticks (refreshes).  Corresponds to
+the configuration option L<"interval">.
+
+=item --mode
+
+Specifies the mode in which innotop should start.  Corresponds to the
+configuration option L<"mode">.
+
+=item --inc
+
+Specifies whether innotop should display absolute numbers or relative numbers
+(offsets from their previous values).  Corresponds to the configuration option
+L<"status_inc">.
+
+=item --version
+
+Output version information and exit.
+
+=back
+
+=head1 HOTKEYS
+
+innotop is interactive, and you control it with key-presses.
+
+=over
+
+=item *
+
+Uppercase keys switch between modes.
+
+=item *
+
+Lowercase keys initiate some action within the current mode.
+
+=item *
+
+Other keys do something special like change configuration or show the
+innotop license.
+
+=back
+
+Press '?' at any time to see the currently active keys and what they do.
+
+=head1 MODES
+
+Each of innotop's modes retrieves and displays a particular type of data from
+the servers you're monitoring.  You switch between modes with uppercase keys.
+The following is a brief description of each mode, in alphabetical order.  To
+switch to the mode, press the key listed in front of its heading in the
+following list:
+
+=over
+
+=item B: InnoDB Buffers
+
+This mode displays information about the InnoDB buffer pool, page statistics,
+insert buffer, and adaptive hash index.  The data comes from SHOW INNODB STATUS.
+
+This mode contains the L<"buffer_pool">, L<"page_statistics">,
+L<"insert_buffers">, and L<"adaptive_hash_index"> tables by default.
+
+=item C: Command Summary
+
+This mode is similar to mytop's Command Summary mode.  It shows the
+L<"cmd_summary"> table, which looks something like the following:
+
+ Command Summary (? for help) localhost, 25+07:16:43, 2.45 QPS, 3 thd, 5.0.40
+ _____________________ Command Summary _____________________
+ Name                    Value    Pct     Last Incr  Pct    
+ Select_scan             3244858  69.89%          2  100.00%
+ Select_range            1354177  29.17%          0    0.00%
+ Select_full_join          39479   0.85%          0    0.00%
+ Select_full_range_join     4097   0.09%          0    0.00%
+ Select_range_check            0   0.00%          0    0.00%
+
+The command summary table is built by extracting variables from
+L<"STATUS_VARIABLES">.  The variables must be numeric and must match the prefix
+given by the L<"cmd_filter"> configuration variable.  The variables are then
+sorted by value descending and compared to the last variable, as shown above.
+The percentage columns are percentage of the total of all variables in the
+table, so you can see the relative weight of the variables.
+
+The example shows what you see if the prefix is "Select_".  The default
+prefix is "Com_".  You can choose a prefix with the 's' key.
+
+It's rather like running SHOW STATUS LIKE "prefix%" with memory and
+nice formatting.
+
+Values are aggregated across all servers.  The Pct columns are not correctly
+aggregated across multiple servers.  This is a known limitation of the grouping
+algorithm that may be fixed in the future.
+
+=item D: InnoDB Deadlocks
+
+This mode shows the transactions involved in the last InnoDB deadlock.  A second
+table shows the locks each transaction held and waited for.  A deadlock is
+caused by a cycle in the waits-for graph, so there should be two locks held and
+one waited for unless the deadlock information is truncated.
+
+InnoDB puts deadlock information before some other information in the SHOW
+INNODB STATUS output.  If there are a lot of locks, the deadlock information can
+grow very large, and there is a limit on the size of the SHOW INNODB
+STATUS output.  A large deadlock can fill the entire output, or even be
+truncated, and prevent you from seeing other information at all.  If you are
+running innotop in another mode, for example T mode, and suddenly you don't see
+anything, you might want to check and see if a deadlock has wiped out the data
+you need.
+
+If it has, you can create a small deadlock to replace the large one.  Use the
+'w' key to 'wipe' the large deadlock with a small one.  This will not work
+unless you have defined a deadlock table for the connection (see L<"SERVER
+CONNECTIONS">).
+
+You can also configure innotop to automatically detect when a large deadlock
+needs to be replaced with a small one (see L<"auto_wipe_dl">).
+
+This mode displays the L<"deadlock_transactions"> and L<"deadlock_locks"> tables
+by default.
+
+=item F: InnoDB Foreign Key Errors
+
+This mode shows the last InnoDB foreign key error information, such as the
+table where it happened, when and who and what query caused it, and so on.
+
+InnoDB has a huge variety of foreign key error messages, and many of them are
+just hard to parse.  innotop doesn't always do the best job here, but there's
+so much code devoted to parsing this messy, unparseable output that innotop is
+likely never to be perfect in this regard.  If innotop doesn't show you what
+you need to see, just look at the status text directly.
+
+This mode displays the L<"fk_error"> table by default.
+
+=item I: InnoDB I/O Info
+
+This mode shows InnoDB's I/O statistics, including the I/O threads, pending I/O,
+file I/O miscellaneous, and log statistics.  It displays the L<"io_threads">,
+L<"pending_io">, L<"file_io_misc">, and L<"log_statistics"> tables by default.
+
+=item L: Locks
+
+This mode shows information about current locks.  At the moment only InnoDB
+locks are supported, and by default you'll only see locks for which transactions
+are waiting.  This information comes from the TRANSACTIONS section of the InnoDB
+status text.  If you have a very busy server, you may have frequent lock waits;
+it helps to be able to see which tables and indexes are the "hot spot" for
+locks.  If your server is running pretty well, this mode should show nothing.
+
+You can configure MySQL and innotop to monitor not only locks for which a
+transaction is waiting, but those currently held, too.  You can do this with the
+InnoDB Lock Monitor (L<http://dev.mysql.com/doc/en/innodb-monitor.html>).  It's
+not documented in the MySQL manual, but creating the lock monitor with the
+following statement also affects the output of SHOW INNODB STATUS, which innotop
+uses:
+
+  CREATE TABLE innodb_lock_monitor(a int) ENGINE=INNODB;
+
+This causes InnoDB to print its output to the MySQL error log every 16 seconds or so,
+as stated in the manual, but it also makes the normal SHOW INNODB STATUS output
+include lock information, which innotop can parse and display (that's the
+undocumented feature).
+
+This means you can do what may have seemed impossible: to a limited extent
+(InnoDB truncates some information in the output), you can see which transaction
+holds the locks something else is waiting for.  You can also enable and disable
+the InnoDB Lock Monitor with the key mappings in this mode.
+
+This mode displays the L<"innodb_locks"> table by default.  Here's a sample of
+the screen when one connection is waiting for locks another connection holds:
+
+ _________________________________ InnoDB Locks __________________________
+ CXN        ID  Type    Waiting  Wait   Active  Mode  DB    Table  Index
+ localhost  12  RECORD        1  00:10   00:10  X     test  t1     PRIMARY
+ localhost  12  TABLE         0  00:10   00:10  IX    test  t1
+ localhost  12  RECORD        1  00:10   00:10  X     test  t1     PRIMARY
+ localhost  11  TABLE         0  00:00   00:25  IX    test  t1
+ localhost  11  RECORD        0  00:00   00:25  X     test  t1     PRIMARY
+
+You can see the first connection, ID 12, is waiting for a lock on the PRIMARY
+key on test.t1, and has been waiting for 10 seconds.  The second connection
+isn't waiting, because the Waiting column is 0, but it holds locks on the same
+index.  That tells you connection 11 is blocking connection 12.
+
+=item M: Master/Slave Replication Status
+
+This mode shows the output of SHOW SLAVE STATUS and SHOW MASTER STATUS in three
+tables.  The first two divide the slave's status into SQL and I/O thread status,
+and the last shows master status.  Filters are applied to eliminate non-slave
+servers from the slave tables, and non-master servers from the master table.
+
+This mode displays the L<"slave_sql_status">, L<"slave_io_status">, and
+L<"master_status"> tables by default.
+
+=item O: Open Tables
+
+This section comes from MySQL's SHOW OPEN TABLES command.  By default it is
+filtered to show tables which are in use by one or more queries, so you can
+get a quick look at which tables are 'hot'.  You can use this to guess which
+tables might be locked implicitly.
+
+This mode displays the L<"open_tables"> table by default.
+
+=item Q: Query List
+
+This mode displays the output from SHOW FULL PROCESSLIST, much like B<mytop>'s
+query list mode.  This mode does B<not> show InnoDB-related information.  This
+is probably one of the most useful modes for general usage.
+
+There is an informative header that shows general status information about
+your server.  You can toggle it on and off with the 'h' key.  By default,
+innotop hides inactive processes and its own process.  You can toggle these on
+and off with the 'i' and 'a' keys.
+
+You can EXPLAIN a query from this mode with the 'e' key.  This displays the
+query's full text, the results of EXPLAIN, and in newer MySQL versions, even
+the optimized query resulting from EXPLAIN EXTENDED.  innotop also tries to
+rewrite certain queries to make them EXPLAIN-able.  For example, INSERT/SELECT
+statements are rewritable.
+
+This mode displays the L<"q_header"> and L<"processlist"> tables by default.
+
+=item R: InnoDB Row Operations and Semaphores
+
+This mode shows InnoDB row operations, row operation miscellaneous, semaphores,
+and information from the wait array.  It displays the L<"row_operations">,
+L<"row_operation_misc">, L<"semaphores">, and L<"wait_array"> tables by default.
+
+=item S: Variables & Status
+
+This mode calculates statistics, such as queries per second, and prints them out
+in several different styles.  You can show absolute values, or incremental values
+between ticks.
+
+You can switch between the views by pressing a key.  The 's' key prints a
+single line each time the screen updates, in the style of B<vmstat>.  The 'g'
+key changes the view to a graph of the same numbers, sort of like B<tload>.
+The 'v' key changes the view to a pivoted table of variable names on the left,
+with successive updates scrolling across the screen from left to right.  You can
+choose how many updates to put on the screen with the L<"num_status_sets">
+configuration variable.
+
+Headers may be abbreviated to fit on the screen in interactive operation.  You
+choose which variables to display with the 'c' key, which selects from
+predefined sets, or lets you create your own sets.  You can edit the current set
+with the 'e' key.
+
+This mode doesn't really display any tables like other modes.  Instead, it uses
+a table definition to extract and format the data, but it then transforms the
+result in special ways before outputting it.  It uses the L<"var_status"> table
+definition for this.
+
+=item T: InnoDB Transactions
+
+This mode shows transactions from the InnoDB monitor's output, in B<top>-like
+format.  This mode is the reason I wrote innotop.
+
+You can kill queries or processes with the 'k' and 'x' keys, and EXPLAIN a query
+with the 'e' or 'f' keys.  InnoDB doesn't print the full query in transactions,
+so explaining may not work right if the query is truncated.
+
+The informational header can be toggled on and off with the 'h' key.  By
+default, innotop hides inactive transactions and its own transaction.  You can
+toggle this on and off with the 'i' and 'a' keys.
+
+This mode displays the L<"t_header"> and L<"innodb_transactions"> tables by
+default.
+
+=back
+
+=head1 INNOTOP STATUS
+
+The first line innotop displays is a "status bar" of sorts.  What it contains
+depends on the mode you're in, and what servers you're monitoring.  The first
+few words are always the innotop mode, such as "InnoDB Txns" for T mode,
+followed by a reminder to press '?' for help at any time.
+
+=head2 ONE SERVER
+
+The simplest case is when you're monitoring a single server.  In this case, the
+name of the connection is next on the status line.  This is the name you gave
+when you created the connection -- most likely the MySQL server's hostname.
+This is followed by the server's uptime.
+
+If you're in an InnoDB mode, such as T or B, the next word is "InnoDB" followed
+by some information about the SHOW INNODB STATUS output used to render the
+screen.  The first word is the number of seconds since the last SHOW INNODB
+STATUS, which InnoDB uses to calculate some per-second statistics.  The next is
+a smiley face indicating whether the InnoDB output is truncated.  If the smiley
+face is a :-), all is well; there is no truncation.  A :^| means the transaction
+list is so long, InnoDB has only printed out some of the transactions.  Finally,
+a frown :-( means the output is incomplete, which is probably due to a deadlock
+printing too much lock information (see L<"D: InnoDB Deadlocks">).
+
+The next two words indicate the server's queries per second (QPS) and how many
+threads (connections) exist.  Finally, the server's version number is the last
+thing on the line.
+
+=head2 MULTIPLE SERVERS
+
+If you are monitoring multiple servers (see L<"SERVER CONNECTIONS">), the status
+line does not show any details about individual servers.  Instead, it shows the
+names of the connections that are active.  Again, these are connection names you
+specified, which are likely to be the server's hostname.  A connection that has
+an error is prefixed with an exclamation point.
+
+If you are monitoring a group of servers (see L<"SERVER GROUPS">), the status
+line shows the name of the group.  If any connection in the group has an
+error, the group's name is followed by the fraction of the connections that
+don't have errors.
+
+See L<"ERROR HANDLING"> for more details about innotop's error handling.
+
+=head2 MONITORING A FILE
+
+If you give a filename on the command line, innotop will not connect to ANY
+servers at all.  It will watch the specified file for InnoDB status output and
+use that as its data source.  It will always show a single connection called
+'file'.  And since it can't connect to a server, it can't determine how long the
+server it's monitoring has been up; so it calculates the server's uptime as time
+since innotop started running.
+
+=head1 SERVER ADMINISTRATION
+
+While innotop is primarily a monitor that lets you watch and analyze your
+servers, it can also send commands to servers.  The most frequently useful
+commands are killing queries and stopping or starting slaves.
+
+You can kill a connection, or in newer versions of MySQL kill a query but not a
+connection, from L<"Q: Query List"> and L<"T: InnoDB Transactions"> modes.
+Press 'k' to issue a KILL command, or 'x' to issue a KILL QUERY command.
+innotop will prompt you for the server and/or connection ID to kill (innotop
+does not prompt you if there is only one possible choice for any input).
+innotop pre-selects the longest-running query, or the oldest connection.
+Confirm the command with 'y'.
+
+In L<"M: Master/Slave Replication Status"> mode, you can start and stop slaves
+with the 'a' and 'o' keys, respectively.  You can send these commands to many
+slaves at once.  innotop fills in a default command of START SLAVE or STOP SLAVE
+for you, but you can actually edit the command and send anything you wish, such
+as SET GLOBAL SQL_SLAVE_SKIP_COUNTER=1 to make the slave skip one binlog event
+when it starts.
+
+You can also ask innotop to calculate the earliest binlog in use by any slave
+and issue a PURGE MASTER LOGS on the master.  Use the 'b' key for this.  innotop
+will prompt you for a master to run the command on, then prompt you for the
+connection names of that master's slaves (there is no way for innotop to
+determine this reliably itself).  innotop will find the minimum binlog in use by
+these slave connections and suggest it as the argument to PURGE MASTER LOGS.
+
+=head1 SERVER CONNECTIONS
+
+When you create a server connection, innotop asks you for a series of inputs, as
+follows:
+
+=over
+
+=item DSN
+
+A DSN is a Data Source Name, which is the initial argument passed to the DBI
+module for connecting to a server.  It is usually of the form
+
+ DBI:mysql:;mysql_read_default_group=mysql;host=HOSTNAME
+
+Since this DSN is passed to the DBD::mysql driver, you should read the driver's
+documentation at L<http://search.cpan.org/dist/DBD-mysql/lib/DBD/mysql.pm> for
+the exact details on all the options you can pass the driver in the DSN.  You
+can read more about DBI at L<http://dbi.perl.org/docs/>, and especially at
+L<http://search.cpan.org/~timb/DBI/DBI.pm>.
+
+The mysql_read_default_group=mysql option lets the DBD driver read your MySQL
+options files, such as ~/.my.cnf on UNIX-ish systems.  You can use this to avoid
+specifying a username or password for the connection.
+
+=item InnoDB Deadlock Table
+
+This optional item tells innotop a table name it can use to deliberately create
+a small deadlock (see L<"D: InnoDB Deadlocks">).  If you specify this option,
+you just need to be sure the table doesn't exist, and that innotop can create
+and drop the table with the InnoDB storage engine.  You can safely omit or just
+accept the default if you don't intend to use this.
+
+=item Username
+
+innotop will ask you if you want to specify a username.  If you say 'y', it will
+then prompt you for a user name.  If you have a MySQL option file that specifies
+your username, you don't have to specify a username.
+
+The username defaults to your login name on the system you're running innotop on.
+
+=item Password
+
+innotop will ask you if you want to specify a password.  Like the username, the
+password is optional, but there's an additional prompt that asks if you want to
+save the password in the innotop configuration file.  If you don't save it in
+the configuration file, innotop will prompt you for a password each time it
+starts.  Passwords in the innotop configuration file are saved in plain text,
+not encrypted in any way.
+
+=back
+
+Once you finish answering these questions, you should be connected to a server.
+But innotop isn't limited to monitoring a single server; you can define many
+server connections and switch between them by pressing the '@' key.  See
+L<"SWITCHING BETWEEN CONNECTIONS">.
+
+To create a new connection, press the '@' key and type the name of the new
+connection, then follow the steps given above.
+
+=head1 SERVER GROUPS
+
+If you have multiple MySQL instances, you can put them into named groups, such
+as 'all', 'masters', and 'slaves', which innotop can monitor all together.
+
+You can choose which group to monitor with the '#' key, and you can press the
+TAB key to switch to the next group.  If you're not currently monitoring a
+group, pressing TAB selects the first group.
+
+To create a group, press the '#' key and type the name of your new group, then
+type the names of the connections you want the group to contain.
+
+=head1 SWITCHING BETWEEN CONNECTIONS
+
+innotop lets you quickly switch which servers you're monitoring.  The most basic
+way is by pressing the '@' key and typing the name(s) of the connection(s) you
+want to use.  This setting is per-mode, so you can monitor different connections
+in each mode, and innotop remembers which connections you choose.
+
+You can quickly switch to the 'next' connection in alphabetical order with the
+'n' key.  If you're monitoring a server group (see L<"SERVER GROUPS">) this will
+switch to the first connection.
+
+You can also type many connection names, and innotop will fetch and display data
+from them all.  Just separate the connection names with spaces, for example
+"server1 server2".  Again, if you type the name of a connection that doesn't
+exist, innotop will prompt you for connection information and create the
+connection.
+
+Another way to monitor multiple connections at once is with server groups.  You
+can use the TAB key to switch to the 'next' group in alphabetical order, or if
+you're not monitoring any groups, TAB will switch to the first group.
+
+innotop does not fetch data in parallel from connections, so if you are
+monitoring a large group or many connections, you may notice increased delay
+between ticks.
+
+When you monitor more than one connection, innotop's status bar changes.  See
+L<"INNOTOP STATUS">.
+
+=head1 ERROR HANDLING
+
+Error handling is not that important when monitoring a single connection, but is
+crucial when you have many active connections.  A crashed server or lost
+connection should not crash innotop.  As a result, innotop will continue to run
+even when there is an error; it just won't display any information from the
+connection that had an error.  Because of this, innotop's behavior might confuse
+you.  It's a feature, not a bug!
+
+innotop does not continue to query connections that have errors, because they
+may slow innotop and make it hard to use, especially if the error is a problem
+connecting and causes a long time-out.  Instead, innotop retries the connection
+occasionally to see if the error still exists.  If so, it will wait until some
+point in the future.  The wait time increases in ticks as the Fibonacci series,
+so it tries less frequently as time passes.
+
+Since errors might only happen in certain modes because of the SQL commands
+issued in those modes, innotop keeps track of which mode caused the error.  If
+you switch to a different mode, innotop will retry the connection instead of
+waiting.
+
+By default innotop will display the problem in red text at the bottom of the
+first table on the screen.  You can disable this behavior with the
+L<"show_cxn_errors_in_tbl"> configuration option, which is enabled by default.
+If the L<"debug"> option is enabled, innotop will display the error at the
+bottom of every table, not just the first.  And if L<"show_cxn_errors"> is
+enabled, innotop will print the error text to STDOUT as well.  Error messages
+might only display in the mode that caused the error, depending on the mode and
+whether innotop is avoiding querying that connection.
+
+=head1 NON-INTERACTIVE OPERATION
+
+You can run innotop in non-interactive mode, in which case it is entirely
+controlled from the configuration file and command-line options.  To start
+innotop in non-interactive mode, give the L<"--nonint"> command-line option.
+This changes innotop's behavior in the following ways:
+
+=over
+
+=item *
+
+Certain Perl modules are not loaded.  Term::ReadLine is not loaded, since
+innotop doesn't prompt interactively.  Term::ANSIColor and Win32::Console::ANSI
+modules are not loaded.  Term::ReadKey is still used, since innotop may have to
+prompt for connection passwords when starting up.
+
+=item *
+
+innotop does not clear the screen after each tick.
+
+=item *
+
+innotop does not persist any changes to the configuration file.
+
+=item *
+
+If L<"--count"> is given and innotop is in incremental mode (see L<"status_inc">
+and L<"--inc">), innotop actually refreshes one more time than specified so it
+can print incremental statistics.  This suppresses output during the first
+tick, so innotop may appear to hang.
+
+=item *
+
+innotop only displays the first table in each mode.  This is so the output can
+be easily processed with other command-line utilities such as awk and sed.  To
+change which tables display in each mode, see L<"TABLES">.  Since L<"Q: Query
+List"> mode is so important, innotop automatically disables the L<"q_header">
+table.  This ensures you'll see the L<"processlist"> table, even if you have
+innotop configured to show the q_header table during interactive operation.
+Similarly, in L<"T: InnoDB Transactions"> mode, the L<"t_header"> table is
+suppressed so you see only the L<"innodb_transactions"> table.
+
+=item *
+
+All output is tab-separated instead of being column-aligned with whitespace, and
+innotop prints the full contents of each table instead of only printing one
+screenful at a time.
+
+=item *
+
+innotop only prints column headers once instead of every tick (see
+L<"hide_hdr">).  innotop does not print table captions (see
+L<"display_table_captions">).  innotop ensures there are no empty lines in the
+output.
+
+=item *
+
+innotop does not honor the L<"shorten"> transformation, which normally shortens
+some numbers to human-readable formats.
+
+=item *
+
+innotop does not print a status line (see L<"INNOTOP STATUS">).
+
+=back
+
+=head1 CONFIGURING
+
+Nearly everything about innotop is configurable.  Most things are possible to
+change with built-in commands, but you can also edit the configuration file.
+
+While running innotop, press the '$' key to bring up the configuration editing
+dialog.  Press another key to select the type of data you want to edit:
+
+=over
+
+=item S: Statement Sleep Times
+
+Edits SQL statement sleep delays, which make innotop pause for the specified
+amount of time after executing a statement.  See L<"SQL STATEMENTS"> for a
+definition of each statement and what it does.  By default innotop does not
+delay after any statements.
+
+This feature is included so you can customize the side-effects caused by
+monitoring your server.  You may not see any effects, but some innotop users
+have noticed that certain MySQL versions under very high load with InnoDB
+enabled take longer than usual to execute SHOW GLOBAL STATUS.  If innotop calls
+SHOW FULL PROCESSLIST immediately afterward, the processlist contains more
+queries than the machine actually averages at any given moment.  Configuring
+innotop to pause briefly after calling SHOW GLOBAL STATUS alleviates this
+effect.
+
+Sleep times are stored in the L<"stmt_sleep_times"> section of the configuration
+file.  Fractional-second sleeps are supported, subject to your hardware's
+limitations.
+
+=item c: Edit Columns
+
+Starts the table editor on one of the displayed tables.  See L<"TABLE EDITOR">.
+An alternative way to start the table editor without entering the configuration
+dialog is with the '^' key.
+
+=item g: General Configuration
+
+Starts the configuration editor to edit global and mode-specific configuration
+variables (see L<"MODES">).  innotop prompts you to choose a variable from among
+the global and mode-specific ones depending on the current mode.
+
+=item k: Row-Coloring Rules
+
+Starts the row-coloring rules editor on one of the displayed table(s).  See
+L<"COLORS"> for details.
+
+=item p: Manage Plugins
+
+Starts the plugin configuration editor.  See L<"PLUGINS"> for details.
+
+=item s: Server Groups
+
+Lets you create and edit server groups.  See L<"SERVER GROUPS">.
+
+=item t: Choose Displayed Tables
+
+Lets you choose which tables to display in this mode.  See L<"MODES"> and
+L<"TABLES">.
+
+=back
+
+=head1 CONFIGURATION FILE
+
+innotop's default configuration file location is in $HOME/.innotop, but can be
+overridden with the L<"--config"> command-line option.  You can edit it by hand
+safely.  innotop reads the configuration file when it starts, and writes it out
+again when it exits, so any changes you make while innotop is running will be
+lost.
+
+innotop doesn't store its entire configuration in the configuration file.  It
+has a huge set of default configuration that it holds only in memory, and the
+configuration file only overrides these defaults.  When you customize a default
+setting, innotop notices, and then stores the customizations into the file.
+This keeps the file size down, makes it easier to edit, and makes upgrades
+easier.
+
+A configuration file can be made read-only.  See L<"readonly">.
+
+The configuration file is arranged into sections like an INI file.  Each
+section begins with [section-name] and ends with [/section-name].  Each
+section's entries have a different syntax depending on the data they need to
+store.  You can put comments in the file; any line that begins with a #
+character is a comment.  innotop will not read the comments, so it won't write
+them back out to the file when it exits.  Comments in read-only configuration
+files are still useful, though.
+
+The first line in the file is innotop's version number.  This lets innotop
+notice when the file format is not backwards-compatible, and upgrade smoothly
+without destroying your customized configuration.
+
+The following list describes each section of the configuration file and the data
+it contains:
+
+=over
+
+=item general
+
+The 'general' section contains global configuration variables and variables that
+may be mode-specific, but don't belong in any other section.  The syntax is a
+simple key=value list.  innotop writes a comment above each value to help you
+edit the file by hand.
+
+=over
+
+=item S_func
+
+Controls S mode presentation (see L<"S: Variables & Status">).  If g, values are
+graphed; if s, values are like vmstat; if v, values are in a pivoted table.
+
+=item S_set
+
+Specifies which set of variables to display in L<"S: Variables & Status"> mode.
+See L<"VARIABLE SETS">.
+
+=item auto_wipe_dl
+
+Instructs innotop to automatically wipe large deadlocks when it notices them.
+When this happens you may notice a slight delay.  At the next tick, you will
+usually see the information that was being truncated by the large deadlock.
+
+=item charset
+
+Specifies what kind of characters to allow through the L<"no_ctrl_char">
+transformation.  This keeps non-printable characters from confusing a
+terminal when you monitor queries that contain binary data, such as images.
+
+The default is 'ascii', which considers anything outside normal ASCII to be a
+control character.  The other allowable values are 'unicode' and 'none'.  'none'
+considers every character a control character, which can be useful for
+collapsing ALL text fields in queries.
+
+=item cmd_filter
+
+This is the prefix that filters variables in L<"C: Command Summary"> mode.
+
+=item color
+
+Whether terminal coloring is permitted.
+
+=item cxn_timeout
+
+On MySQL versions 4.0.3 and newer, this variable is used to set the connection's
+timeout, so MySQL doesn't close the connection if it is not used for a while.
+This might happen because a connection isn't monitored in a particular mode, for
+example.
+
+=item debug
+
+This option enables more verbose errors and makes innotop more strict in some
+places.  It can help in debugging filters and other user-defined code.  It also
+makes innotop write a lot of information to L<"debugfile"> when there is a
+crash.
+
+=item debugfile
+
+A file to which innotop will write information when there is a crash.  See
+L<"FILES">.
+
+=item display_table_captions
+
+innotop displays a table caption above most tables.  This variable suppresses or
+shows captions on all tables globally.  Some tables are configured with the
+hide_caption property, which overrides this.
+
+=item global
+
+Whether to show GLOBAL variables and status.  innotop only tries to do this on
+servers which support the GLOBAL option to SHOW VARIABLES and SHOW STATUS.  In
+some MySQL versions, you need certain privileges to do this; if you don't have
+them, innotop will not be able to fetch any variable and status data.  This
+configuration variable lets you run innotop and fetch what data you can even
+without the elevated privileges.
+
+I can no longer find or reproduce the situation where GLOBAL wasn't allowed, but
+I know there was one.
+
+=item graph_char
+
+Defines the character to use when drawing graphs in L<"S: Variables & Status">
+mode.
+
+=item header_highlight
+
+Defines how to highlight column headers.  This only works if Term::ANSIColor is
+available.  Valid values are 'bold' and 'underline'.
+
+=item hide_hdr
+
+Hides column headers globally.
+
+=item interval
+
+The interval at which innotop will refresh its data (ticks).  The interval is
+implemented as a sleep time between ticks, so the true interval will vary
+depending on how long it takes innotop to fetch and render data.
+
+This variable accepts fractions of a second.
+
+=item mode
+
+The mode in which innotop should start.  Allowable arguments are the same as the
+key presses that select a mode interactively.  See L<"MODES">.
+
+=item num_digits
+
+How many digits to show in fractional numbers and percents.  This variable's
+range is between 0 and 9 and can be set directly from L<"S: Variables & Status">
+mode with the '+' and '-' keys.  It is used in the L<"set_precision">,
+L<"shorten">, and L<"percent"> transformations.
+
+=item num_status_sets
+
+Controls how many sets of status variables to display in pivoted L<"S: Variables
+& Status"> mode.  It also controls the number of old sets of variables innotop
+keeps in its memory, so the larger this variable is, the more memory innotop
+uses.
+
+=item plugin_dir
+
+Specifies where plugins can be found.  By default, innotop stores plugins in the
+'plugins' subdirectory of your innotop configuration directory.
+
+=item readonly
+
+Whether the configuration file is readonly.  This cannot be set interactively,
+because it would prevent itself from being written to the configuration file.
+
+=item show_cxn_errors
+
+Makes innotop print connection errors to STDOUT.  See L<"ERROR HANDLING">.
+
+=item show_cxn_errors_in_tbl
+
+Makes innotop display connection errors as rows in the first table on screen.
+See L<"ERROR HANDLING">.
+
+=item show_percent
+
+Adds a '%' character after the value returned by the L<"percent">
+transformation.
+
+=item show_statusbar
+
+Controls whether to show the status bar in the display.  See L<"INNOTOP
+STATUS">.
+
+=item skip_innodb
+
+Disables fetching SHOW INNODB STATUS, in case your server(s) do not have InnoDB
+enabled and you don't want innotop to try to fetch it.  This can also be useful
+when you don't have the SUPER privilege, required to run SHOW INNODB STATUS.
+
+=item status_inc
+
+Whether to show absolute or incremental values for status variables.
+Incremental values are calculated as an offset from the last value innotop saw
+for that variable.  This is a global setting, but will probably become
+mode-specific at some point.  Right now it is honored a bit inconsistently; some
+modes don't pay attention to it.
+
+=back
+
+=item plugins
+
+This section holds a list of package names of active plugins.  If the plugin
+exists, innotop will activate it.  See L<"PLUGINS"> for more information.
+
+=item filters
+
+This section holds user-defined filters (see L<"FILTERS">).  Each line is in the
+format filter_name=text='filter text' tbls='table list'.
+
+The filter text is the text of the subroutine's code.  The table list is a list
+of tables to which the filter can apply.  By default, user-defined filters apply
+to the table for which they were created, but you can manually override that by
+editing the definition in the configuration file.
+
+=item active_filters
+
+This section stores which filters are active on each table.  Each line is in the
+format table_name=filter_list.
+
+=item tbl_meta
+
+This section stores user-defined or user-customized columns (see L<"COLUMNS">).
+Each line is in the format col_name=properties, where the properties are a
+name=quoted-value list.
+
+=item connections
+
+This section holds the server connections you have defined.  Each line is in the
+format name=properties, where the properties are a name=value list.  The
+properties are self-explanatory, and the only one that is treated specially is
+'pass' which is only present if 'savepass' is set.  See L<"SERVER CONNECTIONS">.
+
+=item active_connections
+
+This section holds a list of which connections are active in each mode.  Each
+line is in the format mode_name=connection_list.
+
+=item server_groups
+
+This section holds server groups.  Each line is in the format
+name=connection_list.  See L<"SERVER GROUPS">.
+
+=item active_server_groups
+
+This section holds a list of which server group is active in each mode.  Each
+line is in the format mode_name=server_group.
+
+=item max_values_seen
+
+This section holds the maximum values seen for variables.  This is used to scale
+the graphs in L<"S: Variables & Status"> mode.  Each line is in the format
+name=value.
+
+=item active_columns
+
+This section holds table column lists.  Each line is in the format
+tbl_name=column_list.  See L<"COLUMNS">.
+
+=item sort_cols
+
+This section holds the sort definition.  Each line is in the format
+tbl_name=column_list.  If a column is prefixed with '-', that column sorts
+descending.  See L<"SORTING">.
+
+=item visible_tables
+
+This section defines which tables are visible in each mode.  Each line is in the
+format mode_name=table_list.  See L<"TABLES">.
+
+=item varsets
+
+This section defines variable sets for use in L<"S: Variables & Status"> mode.
+Each line is in the format name=variable_list.  See L<"VARIABLE SETS">.
+
+=item colors
+
+This section defines colorization rules.  Each line is in the format
+tbl_name=property_list.  See L<"COLORS">.
+
+=item stmt_sleep_times
+
+This section contains statement sleep times.  Each line is in the format
+statement_name=sleep_time.  See L<"S: Statement Sleep Times">.
+
+=item group_by
+
+This section contains column lists for table group_by expressions.  Each line is
+in the format tbl_name=column_list.  See L<"GROUPING">.
+
+=back
+
+=head1 CUSTOMIZING
+
+You can customize innotop a great deal.  For example, you can:
+
+=over
+
+=item *
+
+Choose which tables to display, and in what order.
+
+=item *
+
+Choose which columns are in those tables, and create new columns.
+
+=item *
+
+Filter which rows display with built-in filters, user-defined filters, and
+quick-filters.
+
+=item *
+
+Sort the rows to put important data first or group together related rows.
+
+=item *
+
+Highlight rows with color.
+
+=item *
+
+Customize the alignment, width, and formatting of columns, and apply
+transformations to columns to extract parts of their values or format the values
+as you wish (for example, shortening large numbers to familiar units).
+
+=item *
+
+Design your own expressions to extract and combine data as you need.  This gives
+you unlimited flexibility.
+
+=back
+
+All these and more are explained in the following sections.
+
+=head2 TABLES
+
+A table is what you'd expect: a collection of columns.  It also has some other
+properties, such as a caption.  Filters, sorting rules, and colorization rules
+belong to tables and are covered in later sections.
+
+Internally, table meta-data is defined in a data structure called %tbl_meta.
+This hash holds all built-in table definitions, which contain a lot of default
+instructions to innotop.  The meta-data includes the caption, a list of columns
+the user has customized, a list of columns, a list of visible columns, a list of
+filters, color rules, a sort-column list, sort direction, and some information
+about the table's data sources.  Most of this is customizable via the table
+editor (see L<"TABLE EDITOR">).
+
+You can choose which tables to show by pressing the '$' key.  See L<"MODES"> and
+L<"TABLES">.
+
+The table life-cycle is as follows:
+
+=over
+
+=item *
+
+Each table begins with a data source, which is an array of hashes.  See below
+for details on data sources.
+
+=item *
+
+Each element of the data source becomes a row in the final table.
+
+=item *
+
+For each element in the data source, innotop extracts values from the source and
+creates a row.  This row is another hash, which later steps will refer to as
+$set.  The values innotop extracts are determined by the table's columns.  Each
+column has an extraction subroutine, compiled from an expression (see
+L<"EXPRESSIONS">).  The resulting row is a hash whose keys are named the same as
+the column name.
+
+=item *
+
+innotop filters the rows, removing those that don't need to be displayed.  See
+L<"FILTERS">.
+
+=item *
+
+innotop sorts the rows.  See L<"SORTING">.
+
+=item *
+
+innotop groups the rows together, if specified.  See L<"GROUPING">.
+
+=item *
+
+innotop colorizes the rows.  See L<"COLORS">.
+
+=item *
+
+innotop transforms the column values in each row.  See L<"TRANSFORMATIONS">.
+
+=item *
+
+innotop optionally pivots the rows (see L<"PIVOTING">), then filters and sorts
+them.
+
+=item *
+
+innotop formats and justifies the rows as a table.  During this step, innotop
+applies further formatting to the column values, including alignment, maximum
+and minimum widths.  innotop also does final error checking to ensure there are
+no crashes due to undefined values.  innotop then adds a caption if specified,
+and the table is ready to print.
+
+=back
+
+The lifecycle is slightly different if the table is pivoted, as noted above.  To
+clarify, if the table is pivoted, the process is extract, group, transform,
+pivot, filter, sort, create.  If it's not pivoted, the process is extract,
+filter, sort, group, color, transform, create.  This slightly convoluted process
+doesn't map all that well to SQL, but pivoting complicates things pretty
+thoroughly.  Roughly speaking, filtering and sorting happen as late as needed to
+effect the final result as you might expect, but as early as possible for
+efficiency.
+
+Each built-in table is described below:
+
+=over
+
+=item adaptive_hash_index
+
+Displays data about InnoDB's adaptive hash index.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item buffer_pool
+
+Displays data about InnoDB's buffer pool.  Data source: L<"STATUS_VARIABLES">.
+
+=item cmd_summary
+
+Displays weighted status variables.  Data source: L<"STATUS_VARIABLES">.
+
+=item deadlock_locks
+
+Shows which locks were held and waited for by the last detected deadlock.  Data
+source: L<"DEADLOCK_LOCKS">.
+
+=item deadlock_transactions
+
+Shows transactions involved in the last detected deadlock.  Data source:
+L<"DEADLOCK_TRANSACTIONS">.
+
+=item explain
+
+Shows the output of EXPLAIN.  Data source: L<"EXPLAIN">.
+
+=item file_io_misc
+
+Displays data about InnoDB's file and I/O operations.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item fk_error
+
+Displays various data about InnoDB's last foreign key error.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item innodb_locks
+
+Displays InnoDB locks.  Data source: L<"INNODB_LOCKS">.
+
+=item innodb_transactions
+
+Displays data about InnoDB's current transactions.  Data source:
+L<"INNODB_TRANSACTIONS">.
+
+=item insert_buffers
+
+Displays data about InnoDB's insert buffer.  Data source: L<"STATUS_VARIABLES">.
+
+=item io_threads
+
+Displays data about InnoDB's I/O threads.  Data source: L<"IO_THREADS">.
+
+=item log_statistics
+
+Displays data about InnoDB's logging system.  Data source: L<"STATUS_VARIABLES">.
+
+=item master_status
+
+Displays replication master status.  Data source: L<"STATUS_VARIABLES">.
+
+=item open_tables
+
+Displays open tables.  Data source: L<"OPEN_TABLES">.
+
+=item page_statistics
+
+Displays InnoDB page statistics.  Data source: L<"STATUS_VARIABLES">.
+
+=item pending_io
+
+Displays InnoDB pending I/O operations.  Data source: L<"STATUS_VARIABLES">.
+
+=item processlist
+
+Displays current MySQL processes (threads/connections).  Data source:
+L<"PROCESSLIST">.
+
+=item q_header
+
+Displays various status values.  Data source: L<"STATUS_VARIABLES">.
+
+=item row_operation_misc
+
+Displays data about InnoDB's row operations.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item row_operations
+
+Displays data about InnoDB's row operations.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item semaphores
+
+Displays data about InnoDB's semaphores and mutexes.  Data source:
+L<"STATUS_VARIABLES">.
+
+=item slave_io_status
+
+Displays data about the slave I/O thread.  Data source: 
+L<"STATUS_VARIABLES">.
+
+=item slave_sql_status
+
+Displays data about the slave SQL thread.  Data source: L<"STATUS_VARIABLES">.
+
+=item t_header
+
+Displays various InnoDB status values.  Data source: L<"STATUS_VARIABLES">.
+
+=item var_status
+
+Displays user-configurable data.  Data source: L<"STATUS_VARIABLES">.
+
+=item wait_array
+
+Displays data about InnoDB's OS wait array.  Data source: L<"OS_WAIT_ARRAY">.
+
+=back
+
+=head2 COLUMNS
+
+Columns belong to tables.  You can choose a table's columns by pressing the '^'
+key, which starts the L<"TABLE EDITOR"> and lets you choose and edit columns.
+Pressing 'e' from within the table editor lets you edit the column's properties:
+
+=over
+
+=item *
+
+hdr: a column header.  This appears in the first row of the table.
+
+=item *
+
+just: justification.  '-' means left-justified and '' means right-justified,
+just as with printf formatting codes (not a coincidence).
+
+=item *
+
+dec: whether to further align the column on the decimal point.
+
+=item *
+
+num: whether the column is numeric.  This affects how values are sorted
+(lexically or numerically).
+
+=item *
+
+label: a small note about the column, which appears in dialogs that help the
+user choose columns.
+
+=item *
+
+src: an expression that innotop uses to extract the column's data from its
+source (see L<"DATA SOURCES">).  See L<"EXPRESSIONS"> for more on expressions.
+
+=item *
+
+minw: specifies a minimum display width.  This helps stabilize the display,
+which makes it easier to read if the data is changing frequently.
+
+=item *
+
+maxw: specifies a maximum display width.  Similar to minw.
+
+=item *
+
+trans: a list of column transformations.  See L<"TRANSFORMATIONS">.
+
+=item *
+
+agg: an aggregate function.  See L<"GROUPING">.  The default is L<"first">.
+
+=item *
+
+aggonly: controls whether the column only shows when grouping is enabled on the
+table (see L<"GROUPING">).  By default, this is disabled.  This means columns
+will always be shown by default, whether grouping is enabled or not.  If a
+column's aggonly is set true, the column will appear when you toggle grouping on
+the table.  Several columns are set this way, such as the count column on
+L<"processlist"> and L<"innodb_transactions">, so you don't see a count when the
+grouping isn't enabled, but you do when it is.
+
+=back
+
+=head2 FILTERS
+
+Filters remove rows from the display.  They behave much like a WHERE clause in
+SQL.  innotop has several built-in filters, which remove irrelevant information
+like inactive queries, but you can define your own as well.  innotop also lets
+you create quick-filters, which do not get saved to the configuration file, and
+are just an easy way to quickly view only some rows.
+
+You can enable or disable a filter on any table.  Press the '%' key (mnemonic: %
+looks kind of like a line being filtered between two circles) and choose which
+table you want to filter, if asked.  You'll then see a list of possible filters
+and a list of filters currently enabled for that table.  Type the names of
+filters you want to apply and press Enter.
+
+=head3 USER-DEFINED FILTERS
+
+If you type a name that doesn't exist, innotop will prompt you to create the
+filter.  Filters are easy to create if you know Perl, and not hard if you don't.
+What you're doing is creating a subroutine that returns true if the row should
+be displayed.  The row is a hash reference passed to your subroutine as $set.
+
+For example, imagine you want to filter the processlist table so you only see
+queries that have been running more than five minutes.  Type a new name for your
+filter, and when prompted for the subroutine body, press TAB to initiate your
+terminal's auto-completion.  You'll see the names of the columns in the
+L<"processlist"> table (innotop generally tries to help you with auto-completion
+lists).  You want to filter on the 'time' column.  Type the text "$set->{time} >
+300" to return true when the query is more than five minutes old.  That's all
+you need to do.
+
+In other words, the code you're typing is surrounded by an implicit context,
+which looks like this:
+
+ sub filter {
+    my ( $set ) = @_;
+    # YOUR CODE HERE
+ }
+
+If your filter doesn't work, or if something else suddenly behaves differently,
+you might have made an error in your filter, and innotop is silently catching
+the error.  Try enabling L<"debug"> to make innotop throw an error instead.
+
+=head3 QUICK-FILTERS
+
+innotop's quick-filters are a shortcut to create a temporary filter that doesn't
+persist when you restart innotop.  To create a quick-filter, press the '/' key.
+innotop will prompt you for the column name and filter text.  Again, you can use
+auto-completion on column names.  The filter text can be just the text you want
+to "search for."  For example, to filter the L<"processlist"> table on queries
+that refer to the products table, type '/' and then 'info product'.
+
+The filter text can actually be any Perl regular expression, but of course a
+literal string like 'product' works fine as a regular expression.
+
+Behind the scenes innotop compiles the quick-filter into a specially tagged
+filter that is otherwise like any other filter.  It just isn't saved to the
+configuration file.
+
+To clear quick-filters, press the '\' key and innotop will clear them all at
+once.
+
+=head2 SORTING
+
+innotop has sensible built-in defaults to sort the most important rows to the
+top of the table.  Like anything else in innotop, you can customize how any
+table is sorted.
+
+To start the sort dialog, start the L<"TABLE EDITOR"> with the '^' key, choose a
+table if necessary, and press the 's' key.  You'll see a list of columns you can
+use in the sort expression and the current sort expression, if any.  Enter a
+list of columns by which you want to sort and press Enter.  If you want to
+reverse sort, prefix the column name with a minus sign.  For example, if you
+want to sort by column a ascending, then column b descending, type 'a -b'.  You
+can also explicitly add a + in front of columns you want to sort ascending, but
+it's not required.
+
+Some modes have keys mapped to open this dialog directly, and to quickly reverse
+sort direction.  Press '?' as usual to see which keys are mapped in any mode.
+
+=head2 GROUPING
+
+innotop can group, or aggregate, rows together (I use the terms
+interchangeably).  This is quite similar to an SQL GROUP BY clause.  You can
+specify to group on certain columns, or if you don't specify any, the entire set
+of rows is treated as one group.  This is quite like SQL so far, but unlike SQL,
+you can also select un-grouped columns.  innotop actually aggregates every
+column.  If you don't explicitly specify a grouping function, the default is
+'first'.  This is basically a convenience so you don't have to specify an
+aggregate function for every column you want in the result.
+
+You can quickly toggle grouping on a table with the '=' key, which toggles its
+aggregate property.  This property doesn't persist to the config file.
+
+The columns by which the table is grouped are specified in its group_by
+property.  When you turn grouping on, innotop places the group_by columns at the
+far left of the table, even if they're not supposed to be visible.  The rest of
+the visible columns appear in order after them.
+
+Two tables have default group_by lists and a count column built in:
+L<"processlist"> and L<"innodb_transactions">.  The grouping is by connection
+and status, so you can quickly see how many queries or transactions are in a
+given status on each server you're monitoring.  The time columns are aggregated
+as a sum; other columns are left at the default 'first' aggregation.
+
+By default, the table shown in L<"S: Variables & Status"> mode also uses
+grouping so you can monitor variables and status across many servers.  The
+default aggregation function in this mode is 'avg'.
+
+Valid grouping functions are defined in the %agg_funcs hash.  They include
+
+=over
+
+=item first
+
+Returns the first element in the group.
+
+=item count
+
+Returns the number of elements in the group, including undefined elements, much
+like SQL's COUNT(*).
+
+=item avg
+
+Returns the average of defined elements in the group.
+
+=item sum
+
+Returns the sum of elements in the group.
+
+=back
+
+Here's an example of grouping at work.  Suppose you have a very busy server with
+hundreds of open connections, and you want to see how many connections are in
+what status.  Using the built-in grouping rules, you can press 'Q' to enter
+L<"Q: Query List"> mode.  Press '=' to toggle grouping (if necessary, select the
+L<"processlist"> table when prompted).
+
+Your display might now look like the following:
+
+ Query List (? for help) localhost, 32:33, 0.11 QPS, 1 thd, 5.0.38-log
+ 
+ CXN        Cmd        Cnt  ID      User   Host           Time   Query       
+ localhost  Query      49    12933  webusr localhost      19:38  SELECT * FROM
+ localhost  Sending Da 23     2383  webusr localhost      12:43  SELECT col1,
+ localhost  Sleep      120     140  webusr localhost    5:18:12
+ localhost  Statistics 12    19213  webusr localhost      01:19  SELECT * FROM
+
+That's actually quite a worrisome picture.  You've got a lot of idle connections
+(Sleep), and some connections executing queries (Query and Sending Data).
+That's okay, but you also have a lot in Statistics status, collectively spending
+over a minute.  That means the query optimizer is having a really hard time
+optimizing your statements.  Something is wrong; it should normally take
+milliseconds to optimize queries.  You might not have seen this pattern if you
+didn't look at your connections in aggregate.  (This is a made-up example, but
+it can happen in real life).
+
+=head2 PIVOTING
+
+innotop can pivot a table for more compact display, similar to a Pivot Table in
+a spreadsheet (also known as a crosstab).  Pivoting a table makes columns into
+rows.  Assume you start with this table:
+
+ foo bar
+ === ===
+ 1   3
+ 2   4
+
+After pivoting, the table will look like this:
+
+ name set0 set1
+ ==== ==== ====
+ foo  1    2
+ bar  3    4
+
+To get reasonable results, you might need to group as well as pivot.
+innotop currently does this for L<"S: Variables & Status"> mode.
+
+=head2 COLORS
+
+By default, innotop highlights rows with color so you can see at a glance which
+rows are more important.  You can customize the colorization rules and add your
+own to any table.  Open the table editor with the '^' key, choose a table if
+needed, and press 'o' to open the color editor dialog.
+
+The color editor dialog displays the rules applied to the table, in the order
+they are evaluated.  Each row is evaluated against each rule to see if the rule
+matches the row; if it does, the row gets the specified color, and no further
+rules are evaluated.  The rules look like the following:
+
+ state  eq  Locked       black on_red
+ cmd    eq  Sleep        white       
+ user   eq  system user  white       
+ cmd    eq  Connect      white       
+ cmd    eq  Binlog Dump  white       
+ time   >   600          red         
+ time   >   120          yellow      
+ time   >   60           green       
+ time   >   30           cyan        
+
+This is the default rule set for the L<"processlist"> table.  In order of
+priority, these rules make locked queries black on a red background, "gray out"
+connections from replication and sleeping queries, and make queries turn from
+cyan to red as they run longer.
+
+(For some reason, the ANSI color code "white" is actually a light gray.  Your
+terminal's display may vary; experiment to find colors you like).
+
+You can use keystrokes to move the rules up and down, which re-orders their
+priority.  You can also delete rules and add new ones.  If you add a new rule,
+innotop prompts you for the column, an operator for the comparison, a value
+against which to compare the column, and a color to assign if the rule matches.
+There is auto-completion and prompting at each step.
+
+The value in the third step needs to be correctly quoted.  innotop does not try
+to quote the value because it doesn't know whether it should treat the value as
+a string or a number.  If you want to compare the column against a string, as
+for example in the first rule above, you should enter 'Locked' surrounded by
+quotes.  If you get an error message about a bareword, you probably should have
+quoted something.
+
+=head2 EXPRESSIONS
+
+Expressions are at the core of how innotop works, and are what enables you to
+extend innotop as you wish.  Recall the table lifecycle explained in
+L<"TABLES">.  Expressions are used in the earliest step, where innotop
+extracts values from a data source to form rows.
+
+innotop does this by calling a subroutine for each column, passing it the
+source data set, a set of current values, and a set of previous values.  These
+are all needed so the subroutine can calculate things like the difference
+between this tick and the previous tick.
+
+The subroutines that extract the data from the set are compiled from
+expressions.  This gives significantly more power than just naming the values to
+fill the columns, because it allows the column's value to be calculated from
+whatever data is necessary, but avoids the need to write complicated and lengthy
+Perl code.
+
+innotop begins with a string of text that can look as simple as a value's name
+or as complicated as a full-fledged Perl expression.  It looks at each
+'bareword' token in the string and decides whether it's supposed to be a key
+into the $set hash.  A bareword is an unquoted value that isn't already
+surrounded by code-ish things like dollar signs or curly brackets.  If innotop
+decides that the bareword isn't a function or other valid Perl code, it converts
+it into a hash access.  After the whole string is processed, innotop compiles a
+subroutine, like this:
+
+ sub compute_column_value {
+    my ( $set, $cur, $pre ) = @_;
+    my $val = # EXPANDED STRING GOES HERE
+    return $val;
+ }
+
+Here's a concrete example, taken from the header table L<"q_header"> in L<"Q:
+Query List"> mode.  This expression calculates the qps, or Queries Per Second,
+column's values, from the values returned by SHOW STATUS:
+
+ Questions/Uptime_hires
+
+innotop decides both words are barewords, and transforms this expression into
+the following Perl code:
+
+ $set->{Questions}/$set->{Uptime_hires}
+
+When surrounded by the rest of the subroutine's code, this is executable Perl
+that calculates a high-resolution queries-per-second value.
+
+The arguments to the subroutine are named $set, $cur, and $pre.  In most cases,
+$set and $cur will be the same values.  However, if L<"status_inc"> is set, $cur
+will not be the same as $set, because $set will already contain values that are
+the incremental difference between $cur and $pre.
+
+Every column in innotop is computed by subroutines compiled in the same fashion.
+There is no difference between innotop's built-in columns and user-defined
+columns.  This keeps things consistent and predictable.
+
+=head2 TRANSFORMATIONS
+
+Transformations change how a value is rendered.  For example, they can take a
+number of seconds and display it in H:M:S format.  The following transformations
+are defined:
+
+=over
+
+=item commify
+
+Adds commas to large numbers every three digits.
+
+=item dulint_to_int
+
+Accepts two unsigned integers and converts them into a single longlong.  This is
+useful for certain operations with InnoDB, which uses two integers as
+transaction identifiers, for example.
+
+=item no_ctrl_char
+
+Removes quoted control characters from the value.  This is affected by the
+L<"charset"> configuration variable.
+
+This transformation only operates within quoted strings, for example, values to
+a SET clause in an UPDATE statement.  It will not alter the UPDATE statement,
+but will collapse the quoted string to [BINARY] or [TEXT], depending on the
+charset.
+
+=item percent
+
+Converts a number to a percentage by multiplying it by 100, formatting it with
+L<"num_digits"> digits after the decimal point, and optionally adding a percent
+sign (see L<"show_percent">).
+
+=item secs_to_time
+
+Formats a number of seconds as time in days+hours:minutes:seconds format.
+
+=item set_precision
+
+Formats numbers with L<"num_digits"> number of digits after the decimal point.
+
+=item shorten
+
+Formats a number as a unit of 1024 (k/M/G/T) and with L<"num_digits"> number of
+digits after the decimal point.
+
+=back
+
+=head2 TABLE EDITOR
+
+The innotop table editor lets you customize tables with keystrokes.  You start
+the table editor with the '^' key.  If there's more than one table on the
+screen, it will prompt you to choose one of them.  Once you do, innotop will
+show you something like this:
+
+ Editing table definition for Buffer Pool.  Press ? for help, q to quit.
+ 
+ name               hdr          label                  src          
+ cxn                CXN          Connection from which  cxn          
+ buf_pool_size      Size         Buffer pool size       IB_bp_buf_poo
+ buf_free           Free Bufs    Buffers free in the b  IB_bp_buf_fre
+ pages_total        Pages        Pages total            IB_bp_pages_t
+ pages_modified     Dirty Pages  Pages modified (dirty  IB_bp_pages_m
+ buf_pool_hit_rate  Hit Rate     Buffer pool hit rate   IB_bp_buf_poo
+ total_mem_alloc    Memory       Total memory allocate  IB_bp_total_m
+ add_pool_alloc     Add'l Pool   Additonal pool alloca  IB_bp_add_poo
+
+The first line shows which table you're editing, and reminds you again to press
+'?' for a list of key mappings.  The rest is a tabular representation of the
+table's columns, because that's likely what you're trying to edit.  However, you
+can edit more than just the table's columns; this screen can start the filter
+editor, color rule editor, and more.
+
+Each row in the display shows a single column in the table you're editing, along
+with a couple of its properties such as its header and source expression (see
+L<"EXPRESSIONS">).
+
+The key mappings are Vim-style, as in many other places.  Pressing 'j' and 'k'
+moves the highlight down or up.  You can then (d)elete or (e)dit the highlighted
+column.  You can also (a)dd a column to the table.  This actually just activates
+one of the columns already defined for the table; it prompts you to choose from
+among the columns available but not currently displayed.  Finally, you can
+re-order the columns with the '+' and '-' keys.
+
+You can do more than just edit the columns with the table editor; you can also
+edit other properties, such as the table's sort expression and group-by
+expression.  Press '?' to see the full list, of course.
+
+If you want to really customize and create your own column, as opposed to just
+activating a built-in one that's not currently displayed, press the (n)ew key,
+and innotop will prompt you for the information it needs:
+
+=over
+
+=item *
+
+The column name: this needs to be a word without any funny characters, i.e. just
+letters, numbers and underscores.
+
+=item *
+
+The column header: this is the label that appears at the top of the column, in
+the table header.  This can have spaces and funny characters, but be careful not
+to make it too wide and waste space on-screen.
+
+=item *
+
+The column's data source: this is an expression that determines what data from
+the source (see L<"TABLES">) innotop will put into the column.  This can just be
+the name of an item in the source, or it can be a more complex expression, as
+described in L<"EXPRESSIONS">.
+
+=back
+
+Once you've entered the required data, your table has a new column.  There is no
+difference between this column and the built-in ones; it can have all the same
+properties and behaviors.  innotop will write the column's definition to the
+configuration file, so it will persist across sessions.
+
+Here's an example: suppose you want to track how many times your slaves have
+retried transactions.  According to the MySQL manual, the
+Slave_retried_transactions status variable gives you that data: "The total
+number of times since startup that the replication slave SQL thread has retried
+transactions. This variable was added in version 5.0.4."  This is appropriate to
+add to the L<"slave_sql_status"> table.
+
+To add the column, switch to the replication-monitoring mode with the 'M' key,
+and press the '^' key to start the table editor.  When prompted, choose
+slave_sql_status as the table, then press 'n' to create the column.  Type
+'retries' as the column name, 'Retries' as the column header, and
+'Slave_retried_transactions' as the source.  Now the column is created, and you
+see the table editor screen again.  Press 'q' to exit the table editor, and
+you'll see your column at the end of the table.
+
+=head1 VARIABLE SETS
+
+Variable sets are used in L<"S: Variables & Status"> mode to define more easily
+what variables you want to monitor.  Behind the scenes they are compiled to a
+list of expressions, and then into a column list so they can be treated just
+like columns in any other table, in terms of data extraction and
+transformations.  However, you're protected from the tedious details by a syntax
+that ought to feel very natural to you: a SQL SELECT list.
+
+The data source for variable sets, and indeed the entire S mode, is the
+combination of SHOW STATUS, SHOW VARIABLES, and SHOW INNODB STATUS.  Imagine
+that you had a huge table with one column per variable returned from those
+statements.  That's the data source for variable sets.  You can now query this
+data source just like you'd expect.  For example:
+
+ Questions, Uptime, Questions/Uptime as QPS
+
+Behind the scenes innotop will split that variable set into three expressions,
+compile them and turn them into a table definition, then extract as usual.  This
+becomes a "variable set," or a "list of variables you want to monitor."
+
+innotop lets you name and save your variable sets, and writes them to the
+configuration file.  You can choose which variable set you want to see with the
+'c' key, or activate the next and previous sets with the '>' and '<' keys.
+There are many built-in variable sets as well, which should give you a good
+start for creating your own.  Press 'e' to edit the current variable set, or
+just to see how it's defined.  To create a new one, just press 'c' and type its
+name.
+
+You may want to use some of the functions listed in L<"TRANSFORMATIONS"> to help
+format the results.  In particular, L<"set_precision"> is often useful to limit
+the number of digits you see.  Extending the above example, here's how:
+
+ Questions, Uptime, set_precision(Questions/Uptime) as QPS
+
+Actually, this still needs a little more work.  If your L<"interval"> is less
+than one second, you might be dividing by zero because Uptime is incremental in
+this mode by default.  Instead, use Uptime_hires:
+
+ Questions, Uptime, set_precision(Questions/Uptime_hires) as QPS
+
+This example is simple, but it shows how easy it is to choose which variables
+you want to monitor.
+
+=head1 PLUGINS
+
+innotop has a simple but powerful plugin mechanism by which you can extend
+or modify its existing functionality, and add new functionality.  innotop's
+plugin functionality is event-based: plugins register themselves to be called
+when events happen.  They then have a chance to influence the event.
+
+An innotop plugin is a Perl module placed in innotop's L<"plugin_dir">
+directory.  On UNIX systems, you can place a symbolic link to the module instead
+of putting the actual file there.  innotop automatically discovers the file.  If
+there is a corresponding entry in the L<"plugins"> configuration file section,
+innotop loads and activates the plugin.
+
+The module must conform to innotop's plugin interface.  Additionally, the source
+code of the module must be written in such a way that innotop can inspect the
+file and determine the package name and description.
+
+=head2 Package Source Convention
+
+innotop inspects the plugin module's source to determine the Perl package name.
+It looks for a line of the form "package Foo;" and if found, considers the
+plugin's package name to be Foo.  Of course the package name can be any valid
+Perl package name, with double colons and so on.
+
+It also looks for a description in the source code, to make the plugin editor
+more human-friendly.  The description is a comment line of the form "#
+description: Foo", where "Foo" is the text innotop will consider to be the
+plugin's description.
+
+=head2 Plugin Interface
+
+The innotop plugin interface is quite simple: innotop expects the plugin to be
+an object-oriented module it can call certain methods on.  The methods are
+
+=over
+
+=item new(%variables)
+
+This is the plugin's constructor.  It is passed a hash of innotop's variables,
+which it can manipulate (see L<"Plugin Variables">).  It must return a reference
+to the newly created plugin object.
+
+At construction time, innotop has only loaded the general configuration and
+created the default built-in variables with their default contents (which is
+quite a lot).  Therefore, the state of the program is exactly as in the innotop
+source code, plus the configuration variables from the L<"general"> section in
+the config file.
+
+If your plugin manipulates the variables, it is changing global data, which is
+shared by innotop and all plugins.  Plugins are loaded in the order they're
+listed in the config file.  Your plugin may load before or after another plugin,
+so there is a potential for conflict or interaction between plugins if they
+modify data other plugins use or modify.
+
+=item register_for_events()
+
+This method must return a list of events in which the plugin is interested, if
+any.  See L<"Plugin Events"> for the defined events.  If the plugin returns an
+event that's not defined, the event is ignored.
+
+=item event handlers
+
+The plugin must implement a method named the same as each event for which it has
+registered.  In other words, if the plugin returns qw(foo bar) from
+register_for_events(), it must have foo() and bar() methods.  These methods are
+callbacks for the events.  See L<"Plugin Events"> for more details about each
+event.
+
+=back
+
+=head2 Plugin Variables
+
+The plugin's constructor is passed a hash of innotop's variables, which it can
+manipulate.  It is probably a good idea if the plugin object saves a copy of it
+for later use.  The variables are defined in the innotop variable
+%pluggable_vars, and are as follows:
+
+=over
+
+=item action_for
+
+A hashref of key mappings.  These are innotop's global hot-keys.
+
+=item agg_funcs
+
+A hashref of functions that can be used for grouping.  See L<"GROUPING">.
+
+=item config
+
+The global configuration hash.
+
+=item connections
+
+A hashref of connection specifications.  These are just specifications of how to
+connect to a server.
+
+=item dbhs
+
+A hashref of innotop's database connections.  These are actual DBI connection
+objects.
+
+=item filters
+
+A hashref of filters applied to table rows.  See L<"FILTERS"> for more.
+
+=item modes
+
+A hashref of modes.  See L<"MODES"> for more.
+
+=item server_groups
+
+A hashref of server groups.  See L<"SERVER GROUPS">.
+
+=item tbl_meta
+
+A hashref of innotop's table meta-data, with one entry per table (see
+L<"TABLES"> for more information).
+
+=item trans_funcs
+
+A hashref of transformation functions.  See L<"TRANSFORMATIONS">.
+
+=item var_sets
+
+A hashref of variable sets.  See L<"VARIABLE SETS">.
+
+=back
+
+=head2 Plugin Events
+
+Each event is defined somewhere in the innotop source code.  When innotop runs
+that code, it executes the callback function for each plugin that expressed its
+interest in the event.  innotop passes some data for each event.  The events are
+defined in the %event_listener_for variable, and are as follows:
+
+=over
+
+=item extract_values($set, $cur, $pre, $tbl)
+
+This event occurs inside the function that extracts values from a data source.
+The arguments are the set of values, the current values, the previous values,
+and the table name.
+
+=item set_to_tbl
+
+Events are defined at many places in this subroutine, which is responsible for
+turning an arrayref of hashrefs into an arrayref of lines that can be printed to
+the screen.  The events all pass the same data: an arrayref of rows and the name
+of the table being created.  The events are set_to_tbl_pre_filter,
+set_to_tbl_pre_sort, set_to_tbl_pre_group, set_to_tbl_pre_colorize,
+set_to_tbl_pre_transform, set_to_tbl_pre_pivot, set_to_tbl_pre_create, and
+set_to_tbl_post_create.
+
+=item draw_screen($lines)
+
+This event occurs inside the subroutine that prints the lines to the screen.
+$lines is an arrayref of strings.
+
+=back
+
+=head2 Simple Plugin Example
+
+The easiest way to explain the plugin functionality is probably with a simple
+example.  The following module adds a column to the beginning of every table and
+sets its value to 1.
+
+ use strict;
+ use warnings FATAL => 'all';
+ 
+ package Innotop::Plugin::Example;
+ # description: Adds an 'example' column to every table
+ 
+ sub new {
+    my ( $class, %vars ) = @_;
+    # Store reference to innotop's variables in $self
+    my $self = bless { %vars }, $class;
+ 
+    # Design the example column
+    my $col = {
+       hdr   => 'Example',
+       just  => '',
+       dec   => 0,
+       num   => 1,
+       label => 'Example',
+       src   => 'example', # Get data from this column in the data source
+       tbl   => '',
+       trans => [],
+    };
+ 
+    # Add the column to every table.
+    my $tbl_meta = $vars{tbl_meta};
+    foreach my $tbl ( values %$tbl_meta ) {
+       # Add the column to the list of defined columns
+       $tbl->{cols}->{example} = $col;
+       # Add the column to the list of visible columns
+       unshift @{$tbl->{visible}}, 'example';
+    }
+ 
+    # Be sure to return a reference to the object.
+    return $self;
+ }
+ 
+ # I'd like to be called when a data set is being rendered into a table, please.
+ sub register_for_events {
+    my ( $self ) = @_;
+    return qw(set_to_tbl_pre_filter);
+ }
+ 
+ # This method will be called when the event fires.
+ sub set_to_tbl_pre_filter {
+    my ( $self, $rows, $tbl ) = @_;
+    # Set the example column's data source to the value 1.
+    foreach my $row ( @$rows ) {
+       $row->{example} = 1;
+    }
+ }
+ 
+ 1;
+
+=head2 Plugin Editor
+
+The plugin editor lets you view the plugins innotop discovered and activate or
+deactivate them.  To reach it, press $ from any mode to start the configuration
+editor, then press the 'p' key to start the plugin editor.  You'll see
+a list of plugins innotop discovered.  You can use the 'j' and 'k' keys to move
+the highlight to the desired one, then press the * key to toggle it active or
+inactive.  Exit the editor and restart innotop for the changes to take effect.
+
+=head1 SQL STATEMENTS
+
+innotop uses a limited set of SQL statements to retrieve data from MySQL for
+display.  The statements are customized depending on the server version against
+which they are executed; for example, on MySQL 5 and newer, INNODB_STATUS
+executes "SHOW ENGINE INNODB STATUS", while on earlier versions it executes
+"SHOW INNODB STATUS".  The statements are as follows:
+
+ Statement           SQL executed
+ =================== ===============================
+ INNODB_STATUS       SHOW [ENGINE] INNODB STATUS
+ KILL_CONNECTION     KILL
+ KILL_QUERY          KILL QUERY
+ OPEN_TABLES         SHOW OPEN TABLES
+ PROCESSLIST         SHOW FULL PROCESSLIST
+ SHOW_MASTER_LOGS    SHOW MASTER LOGS
+ SHOW_MASTER_STATUS  SHOW MASTER STATUS
+ SHOW_SLAVE_STATUS   SHOW SLAVE STATUS
+ SHOW_STATUS         SHOW [GLOBAL] STATUS
+ SHOW_VARIABLES      SHOW [GLOBAL] VARIABLES
+
+=head1 DATA SOURCES
+
+Each time innotop extracts values to create a table (see L<"EXPRESSIONS"> and
+L<"TABLES">), it does so from a particular data source.  Largely because of the
+complex data extracted from SHOW INNODB STATUS, this is slightly messy.  SHOW
+INNODB STATUS contains a mixture of single values and repeated values that form
+nested data sets.
+
+Whenever innotop fetches data from MySQL, it adds two extra bits to each set:
+cxn and Uptime_hires.  cxn is the name of the connection from which the data
+came.  Uptime_hires is a high-resolution version of the server's Uptime status
+variable, which is important if your L<"interval"> setting is sub-second.
+
+Here are the kinds of data sources from which data is extracted:
+
+=over
+
+=item STATUS_VARIABLES
+
+This is the broadest category, into which the most kinds of data fall.  It
+begins with the combination of SHOW STATUS and SHOW VARIABLES, but other sources
+may be included as needed, for example, SHOW MASTER STATUS and SHOW SLAVE
+STATUS, as well as many of the non-repeated values from SHOW INNODB STATUS.
+
+=item DEADLOCK_LOCKS
+
+This data is extracted from the transaction list in the LATEST DETECTED DEADLOCK
+section of SHOW INNODB STATUS.  It is nested two levels deep: transactions, then
+locks.
+
+=item DEADLOCK_TRANSACTIONS
+
+This data is from the transaction list in the LATEST DETECTED DEADLOCK
+section of SHOW INNODB STATUS.  It is nested one level deep.
+
+=item EXPLAIN
+
+This data is from the result set returned by EXPLAIN.
+
+=item INNODB_TRANSACTIONS
+
+This data is from the TRANSACTIONS section of SHOW INNODB STATUS.
+
+=item IO_THREADS
+
+This data is from the list of threads in the FILE I/O section of SHOW INNODB
+STATUS.
+
+=item INNODB_LOCKS
+
+This data is from the TRANSACTIONS section of SHOW INNODB STATUS and is nested
+two levels deep.
+
+=item OPEN_TABLES
+
+This data is from SHOW OPEN TABLES.
+
+=item PROCESSLIST
+
+This data is from SHOW FULL PROCESSLIST.
+
+=item OS_WAIT_ARRAY
+
+This data is from the SEMAPHORES section of SHOW INNODB STATUS and is nested one
+level deep.  It comes from the lines that look like this:
+
+ --Thread 1568861104 has waited at btr0cur.c line 424 ....
+
+=back
+
+=head1 MYSQL PRIVILEGES
+
+=over
+
+=item *
+
+You must connect to MySQL as a user who has the SUPER privilege for many of the
+functions.
+
+=item *
+
+If you don't have the SUPER privilege, you can still run some functions, but you
+won't necessarily see all the same data.
+
+=item *
+
+You need the PROCESS privilege to see the list of currently running queries in Q
+mode.
+
+=item *
+
+You need special privileges to start and stop slave servers.
+
+=item *
+
+You need appropriate privileges to create and drop the deadlock tables if needed
+(see L<"SERVER CONNECTIONS">).
+
+=back
+
+=head1 SYSTEM REQUIREMENTS
+
+You need Perl to run innotop, of course.  You also need a few Perl modules: DBI,
+DBD::mysql, Term::ReadKey, and Time::HiRes.  These should be included with most
+Perl distributions, but in case they are not, I recommend using versions
+distributed with your operating system or Perl distribution, not from CPAN.
+Term::ReadKey in particular has been known to cause problems if installed from
+CPAN.
+
+If you have Term::ANSIColor, innotop will use it to format headers more readably
+and compactly.  (Under Microsoft Windows, you also need Win32::Console::ANSI for
+terminal formatting codes to be honored).  If you install Term::ReadLine,
+preferably Term::ReadLine::Gnu, you'll get nice auto-completion support.
+
+I run innotop on Gentoo GNU/Linux, Debian and Ubuntu, and I've had feedback from
+people successfully running it on Red Hat, CentOS, Solaris, and Mac OS X.  I
+don't see any reason why it won't work on other UNIX-ish operating systems, but
+I don't know for sure.  It also runs on Windows under ActivePerl without
+problem.
+
+I use innotop on MySQL versions 3.23.58, 4.0.27, 4.1.0, 4.1.22, 5.0.26, 5.1.15,
+and 5.2.3.  If it doesn't run correctly for you, that is a bug and I hope you
+report it.
+
+=head1 FILES
+
+$HOMEDIR/.innotop is used to store configuration information.  Files include the
+configuration file innotop.ini, the core_dump file which contains verbose error
+messages if L<"debug"> is enabled, and the plugins/ subdirectory.
+
+=head1 GLOSSARY OF TERMS
+
+=over
+
+=item tick
+
+A tick is a refresh event, when innotop re-fetches data from connections and
+displays it.
+
+=back
+
+=head1 ACKNOWLEDGEMENTS
+
+I'm grateful to the following people for various reasons, and hope I haven't
+forgotten to include anyone:
+
+Allen K. Smith,
+Aurimas Mikalauskas,
+Bartosz Fenski,
+Brian Miezejewski,
+Christian Hammers, 
+Cyril Scetbon,
+Dane Miller,
+David Multer,
+Dr. Frank Ullrich,
+Giuseppe Maxia,
+Google.com Site Reliability Engineers,
+Jan Pieter Kunst,
+Jari Aalto,
+Jay Pipes,
+Jeremy Zawodny,
+Johan Idren,
+Kristian Kohntopp,
+Lenz Grimmer,
+Maciej Dobrzanski,
+Michiel Betel,
+MySQL AB,
+Paul McCullagh,
+Sebastien Estienne,
+Sourceforge.net,
+Steven Kreuzer,
+The Gentoo MySQL Team,
+Trevor Price,
+Yaar Schnitman,
+and probably more people I've neglected to include.
+
+(If I misspelled your name, it's probably because I'm afraid of putting
+international characters into this documentation; earlier versions of Perl might
+not be able to compile it then).
+
+=head1 COPYRIGHT, LICENSE AND WARRANTY
+
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+
+THIS PROGRAM IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation, version 2; OR the Perl Artistic License.  On UNIX and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA  02111-1307  USA.
+
+Execute innotop and press '!' to see this information at any time.
+
+=head1 AUTHOR
+
+Baron Schwartz.
+
+=head1 BUGS
+
+You can report bugs, ask for improvements, and get other help and support at
+L<http://sourceforge.net/projects/innotop>.  There are mailing lists, forums,
+a bug tracker, etc.  Please use these instead of contacting me directly, as it
+makes my job easier and benefits others if the discussions are permanent and
+public.  Of course, if you need to contact me in private, please do.
+
+=cut
diff --git a/storage/xtradb/build/debian/additions/innotop/innotop.1 b/storage/xtradb/build/debian/additions/innotop/innotop.1
new file mode 100644
index 00000000000..ef708c3974c
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/innotop/innotop.1
@@ -0,0 +1,2086 @@
+.\" Automatically generated by Pod::Man v1.37, Pod::Parser v1.32
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sh \" Subsection heading
+.br
+.if t .Sp
+.ne 5
+.PP
+\fB\\$1\fR
+.PP
+..
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
+.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
+.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
+.\" nothing in troff, for use with C<>.
+.tr \(*W-
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+'br\}
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.if \nF \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    nr % 0
+.    rr F
+.\}
+.\"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.hy 0
+.if n .na
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "INNOTOP 1p"
+.TH INNOTOP 1p "2007-11-09" "perl v5.8.8" "User Contributed Perl Documentation"
+.SH "NAME"
+innotop \- MySQL and InnoDB transaction/status monitor.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+To monitor servers normally:
+.PP
+.Vb 1
+\& innotop
+.Ve
+.PP
+To monitor InnoDB status information from a file:
+.PP
+.Vb 1
+\& innotop /var/log/mysql/mysqld.err
+.Ve
+.PP
+To run innotop non-interactively in a pipe-and-filter configuration:
+.PP
+.Vb 1
+\& innotop \-\-count 5 \-d 1 \-n
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+innotop monitors MySQL servers.  Each of its modes shows you a different aspect
+of what's happening in the server.  For example, there's a mode for monitoring
+replication, one for queries, and one for transactions.  innotop refreshes its
+data periodically, so you see an updating view.
+.PP
+innotop has lots of features for power users, but you can start and run it with
+virtually no configuration.  If you're just getting started, see
+\&\*(L"\s-1QUICK\-START\s0\*(R".  Press '?' at any time while running innotop for
+context-sensitive help.
+.SH "QUICK-START"
+.IX Header "QUICK-START"
+To start innotop, open a terminal or command prompt.  If you have installed
+innotop on your system, you should be able to just type \*(L"innotop\*(R" and press
+Enter; otherwise, you will need to change to innotop's directory and type \*(L"perl
+innotop\*(R".
+.PP
+The first thing innotop needs to know is how to connect to a MySQL server.  You
+can just enter the hostname of the server, for example \*(L"localhost\*(R" or
+\&\*(L"127.0.0.1\*(R" if the server is on the same machine as innotop.  After this innotop
+will prompt you for a \s-1DSN\s0 (data source name).  You should be able to just accept
+the defaults by pressing Enter.
+.PP
+When innotop asks you about a table to use when resetting InnoDB deadlock
+information, just accept the default for now.  This is an advanced feature you
+can configure later (see \*(L"D: InnoDB Deadlocks\*(R" for more).
+.PP
+If you have a .my.cnf file with your MySQL connection defaults, innotop can read
+it, and you won't need to specify a username and password if it's in that file.
+Otherwise, you should answer 'y' to the next couple of prompts.
+.PP
+After this, you should be connected, and innotop should show you something like
+the following:
+.PP
+.Vb 1
+\& InnoDB Txns (? for help) localhost, 01:11:19, InnoDB 10s :\-), 50 QPS,
+.Ve
+.PP
+.Vb 2
+\& CXN        History  Versions  Undo  Dirty Buf  Used Bufs  Txns  MaxTxn
+\& localhost        7      2035  0 0       0.00%     92.19%     1   07:34
+.Ve
+.PP
+.Vb 5
+\& CXN        ID     User   Host       Txn Status  Time   Undo  Query Tex
+\& localhost  98379  user1  webserver  ACTIVE      07:34     0  SELECT `c
+\& localhost  98450  user1  webserver  ACTIVE      01:06     0  INSERT IN
+\& localhost  97750  user1  webserver  not starte  00:00     0      
+\& localhost  98375  user1  appserver  not starte  00:00     0
+.Ve
+.PP
+(This sample is truncated at the right so it will fit on a terminal when running
+\&'man innotop')
+.PP
+This sample comes from a quiet server with few transactions active.  If your
+server is busy, you'll see more output.  Notice the first line on the screen,
+which tells you what mode you're in and what server you're connected to.  You
+can change to other modes with keystrokes; press 'Q' to switch to a list of
+currently running queries.
+.PP
+Press the '?' key to see what keys are active in the current mode.  You can
+press any of these keys and innotop will either take the requested action or
+prompt you for more input.  If your system has Term::ReadLine support, you can
+use \s-1TAB\s0 and other keys to auto-complete and edit input.
+.PP
+To quit innotop, press the 'q' key.
+.SH "OPTIONS"
+.IX Header "OPTIONS"
+innotop is mostly configured via its configuration file, but some of the
+configuration options can come from the command line.  You can also specify a
+file to monitor for InnoDB status output; see \*(L"\s-1MONITORING\s0 A \s-1FILE\s0\*(R" for more
+details.
+.PP
+You can negate some options by prefixing the option name with \-\-no.  For
+example, \-\-noinc (or \-\-no\-inc) negates \*(L"\-\-inc\*(R".
+.IP "\-\-help" 4
+.IX Item "--help"
+Print a summary of command-line usage and exit.
+.IP "\-\-color" 4
+.IX Item "--color"
+Enable or disable terminal coloring.  Corresponds to the \*(L"color\*(R" config file
+setting.
+.IP "\-\-config" 4
+.IX Item "--config"
+Specifies a configuration file to read.  This option is non\-sticky, that is to
+say it does not persist to the configuration file itself.
+.IP "\-\-nonint" 4
+.IX Item "--nonint"
+Enable non-interactive operation.  See \*(L"\s-1NON\-INTERACTIVE\s0 \s-1OPERATION\s0\*(R" for more.
+.IP "\-\-count" 4
+.IX Item "--count"
+Refresh only the specified number of times (ticks) before exiting.  Each refresh
+is a pause for \*(L"interval\*(R" seconds, followed by requesting data from MySQL
+connections and printing it to the terminal.
+.IP "\-\-delay" 4
+.IX Item "--delay"
+Specifies the amount of time to pause between ticks (refreshes).  Corresponds to
+the configuration option \*(L"interval\*(R".
+.IP "\-\-mode" 4
+.IX Item "--mode"
+Specifies the mode in which innotop should start.  Corresponds to the
+configuration option \*(L"mode\*(R".
+.IP "\-\-inc" 4
+.IX Item "--inc"
+Specifies whether innotop should display absolute numbers or relative numbers
+(offsets from their previous values).  Corresponds to the configuration option
+\&\*(L"status_inc\*(R".
+.IP "\-\-version" 4
+.IX Item "--version"
+Output version information and exit.
+.SH "HOTKEYS"
+.IX Header "HOTKEYS"
+innotop is interactive, and you control it with key\-presses.
+.IP "\(bu" 4
+Uppercase keys switch between modes.
+.IP "\(bu" 4
+Lowercase keys initiate some action within the current mode.
+.IP "\(bu" 4
+Other keys do something special like change configuration or show the
+innotop license.
+.PP
+Press '?' at any time to see the currently active keys and what they do.
+.SH "MODES"
+.IX Header "MODES"
+Each of innotop's modes retrieves and displays a particular type of data from
+the servers you're monitoring.  You switch between modes with uppercase keys.
+The following is a brief description of each mode, in alphabetical order.  To
+switch to the mode, press the key listed in front of its heading in the
+following list:
+.IP "B: InnoDB Buffers" 4
+.IX Item "B: InnoDB Buffers"
+This mode displays information about the InnoDB buffer pool, page statistics,
+insert buffer, and adaptive hash index.  The data comes from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.Sp
+This mode contains the \*(L"buffer_pool\*(R", \*(L"page_statistics\*(R",
+\&\*(L"insert_buffers\*(R", and \*(L"adaptive_hash_index\*(R" tables by default.
+.IP "C: Command Summary" 4
+.IX Item "C: Command Summary"
+This mode is similar to mytop's Command Summary mode.  It shows the
+\&\*(L"cmd_summary\*(R" table, which looks something like the following:
+.Sp
+.Vb 8
+\& Command Summary (? for help) localhost, 25+07:16:43, 2.45 QPS, 3 thd, 5.0.40
+\& _____________________ Command Summary _____________________
+\& Name                    Value    Pct     Last Incr  Pct    
+\& Select_scan             3244858  69.89%          2  100.00%
+\& Select_range            1354177  29.17%          0    0.00%
+\& Select_full_join          39479   0.85%          0    0.00%
+\& Select_full_range_join     4097   0.09%          0    0.00%
+\& Select_range_check            0   0.00%          0    0.00%
+.Ve
+.Sp
+The command summary table is built by extracting variables from
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".  The variables must be numeric and must match the prefix
+given by the \*(L"cmd_filter\*(R" configuration variable.  The variables are then
+sorted by value descending and compared to the last variable, as shown above.
+The percentage columns are percentage of the total of all variables in the
+table, so you can see the relative weight of the variables.
+.Sp
+The example shows what you see if the prefix is \*(L"Select_\*(R".  The default
+prefix is \*(L"Com_\*(R".  You can choose a prefix with the 's' key.
+.Sp
+It's rather like running \s-1SHOW\s0 \s-1VARIABLES\s0 \s-1LIKE\s0 \*(L"prefix%\*(R" with memory and
+nice formatting.
+.Sp
+Values are aggregated across all servers.  The Pct columns are not correctly
+aggregated across multiple servers.  This is a known limitation of the grouping
+algorithm that may be fixed in the future.
+.IP "D: InnoDB Deadlocks" 4
+.IX Item "D: InnoDB Deadlocks"
+This mode shows the transactions involved in the last InnoDB deadlock.  A second
+table shows the locks each transaction held and waited for.  A deadlock is
+caused by a cycle in the waits-for graph, so there should be two locks held and
+one waited for unless the deadlock information is truncated.
+.Sp
+InnoDB puts deadlock information before some other information in the \s-1SHOW\s0
+\&\s-1INNODB\s0 \s-1STATUS\s0 output.  If there are a lot of locks, the deadlock information can
+grow very large, and there is a limit on the size of the \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0 output.  A large deadlock can fill the entire output, or even be
+truncated, and prevent you from seeing other information at all.  If you are
+running innotop in another mode, for example T mode, and suddenly you don't see
+anything, you might want to check and see if a deadlock has wiped out the data
+you need.
+.Sp
+If it has, you can create a small deadlock to replace the large one.  Use the
+\&'w' key to 'wipe' the large deadlock with a small one.  This will not work
+unless you have defined a deadlock table for the connection (see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R").
+.Sp
+You can also configure innotop to automatically detect when a large deadlock
+needs to be replaced with a small one (see \*(L"auto_wipe_dl\*(R").
+.Sp
+This mode displays the \*(L"deadlock_transactions\*(R" and \*(L"deadlock_locks\*(R" tables
+by default.
+.IP "F: InnoDB Foreign Key Errors" 4
+.IX Item "F: InnoDB Foreign Key Errors"
+This mode shows the last InnoDB foreign key error information, such as the
+table where it happened, when and who and what query caused it, and so on.
+.Sp
+InnoDB has a huge variety of foreign key error messages, and many of them are
+just hard to parse.  innotop doesn't always do the best job here, but there's
+so much code devoted to parsing this messy, unparseable output that innotop is
+likely never to be perfect in this regard.  If innotop doesn't show you what
+you need to see, just look at the status text directly.
+.Sp
+This mode displays the \*(L"fk_error\*(R" table by default.
+.IP "I: InnoDB I/O Info" 4
+.IX Item "I: InnoDB I/O Info"
+This mode shows InnoDB's I/O statistics, including the I/O threads, pending I/O,
+file I/O miscellaneous, and log statistics.  It displays the \*(L"io_threads\*(R",
+\&\*(L"pending_io\*(R", \*(L"file_io_misc\*(R", and \*(L"log_statistics\*(R" tables by default.
+.IP "L: Locks" 4
+.IX Item "L: Locks"
+This mode shows information about current locks.  At the moment only InnoDB
+locks are supported, and by default you'll only see locks for which transactions
+are waiting.  This information comes from the \s-1TRANSACTIONS\s0 section of the InnoDB
+status text.  If you have a very busy server, you may have frequent lock waits;
+it helps to be able to see which tables and indexes are the \*(L"hot spot\*(R" for
+locks.  If your server is running pretty well, this mode should show nothing.
+.Sp
+You can configure MySQL and innotop to monitor not only locks for which a
+transaction is waiting, but those currently held, too.  You can do this with the
+InnoDB Lock Monitor (<http://dev.mysql.com/doc/en/innodb\-monitor.html>).  It's
+not documented in the MySQL manual, but creating the lock monitor with the
+following statement also affects the output of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, which innotop
+uses:
+.Sp
+.Vb 1
+\&  CREATE TABLE innodb_lock_monitor(a int) ENGINE=INNODB;
+.Ve
+.Sp
+This causes InnoDB to print its output to the MySQL error log every 16 seconds or so,
+as stated in the manual, but it also makes the normal \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 output
+include lock information, which innotop can parse and display (that's the
+undocumented feature).
+.Sp
+This means you can do what may have seemed impossible: to a limited extent
+(InnoDB truncates some information in the output), you can see which transaction
+holds the locks something else is waiting for.  You can also enable and disable
+the InnoDB Lock Monitor with the key mappings in this mode.
+.Sp
+This mode displays the \*(L"innodb_locks\*(R" table by default.  Here's a sample of
+the screen when one connection is waiting for locks another connection holds:
+.Sp
+.Vb 7
+\& _________________________________ InnoDB Locks __________________________
+\& CXN        ID  Type    Waiting  Wait   Active  Mode  DB    Table  Index
+\& localhost  12  RECORD        1  00:10   00:10  X     test  t1     PRIMARY
+\& localhost  12  TABLE         0  00:10   00:10  IX    test  t1
+\& localhost  12  RECORD        1  00:10   00:10  X     test  t1     PRIMARY
+\& localhost  11  TABLE         0  00:00   00:25  IX    test  t1
+\& localhost  11  RECORD        0  00:00   00:25  X     test  t1     PRIMARY
+.Ve
+.Sp
+You can see the first connection, \s-1ID\s0 12, is waiting for a lock on the \s-1PRIMARY\s0
+key on test.t1, and has been waiting for 10 seconds.  The second connection
+isn't waiting, because the Waiting column is 0, but it holds locks on the same
+index.  That tells you connection 11 is blocking connection 12.
+.IP "M: Master/Slave Replication Status" 4
+.IX Item "M: Master/Slave Replication Status"
+This mode shows the output of \s-1SHOW\s0 \s-1SLAVE\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1MASTER\s0 \s-1STATUS\s0 in three
+tables.  The first two divide the slave's status into \s-1SQL\s0 and I/O thread status,
+and the last shows master status.  Filters are applied to eliminate non-slave
+servers from the slave tables, and non-master servers from the master table.
+.Sp
+This mode displays the \*(L"slave_sql_status\*(R", \*(L"slave_io_status\*(R", and
+\&\*(L"master_status\*(R" tables by default.
+.IP "O: Open Tables" 4
+.IX Item "O: Open Tables"
+This section comes from MySQL's \s-1SHOW\s0 \s-1OPEN\s0 \s-1TABLES\s0 command.  By default it is
+filtered to show tables which are in use by one or more queries, so you can
+get a quick look at which tables are 'hot'.  You can use this to guess which
+tables might be locked implicitly.
+.Sp
+This mode displays the \*(L"open_tables\*(R" table by default.
+.IP "Q: Query List" 4
+.IX Item "Q: Query List"
+This mode displays the output from \s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0, much like \fBmytop\fR's
+query list mode.  This mode does \fBnot\fR show InnoDB-related information.  This
+is probably one of the most useful modes for general usage.
+.Sp
+There is an informative header that shows general status information about
+your server.  You can toggle it on and off with the 'h' key.  By default,
+innotop hides inactive processes and its own process.  You can toggle these on
+and off with the 'i' and 'a' keys.
+.Sp
+You can \s-1EXPLAIN\s0 a query from this mode with the 'e' key.  This displays the
+query's full text, the results of \s-1EXPLAIN\s0, and in newer MySQL versions, even
+the optimized query resulting from \s-1EXPLAIN\s0 \s-1EXTENDED\s0.  innotop also tries to
+rewrite certain queries to make them EXPLAIN\-able.  For example, \s-1INSERT/SELECT\s0
+statements are rewritable.
+.Sp
+This mode displays the \*(L"q_header\*(R" and \*(L"processlist\*(R" tables by default.
+.IP "R: InnoDB Row Operations and Semaphores" 4
+.IX Item "R: InnoDB Row Operations and Semaphores"
+This mode shows InnoDB row operations, row operation miscellaneous, semaphores,
+and information from the wait array.  It displays the \*(L"row_operations\*(R",
+\&\*(L"row_operation_misc\*(R", \*(L"semaphores\*(R", and \*(L"wait_array\*(R" tables by default.
+.IP "S: Variables & Status" 4
+.IX Item "S: Variables & Status"
+This mode calculates statistics, such as queries per second, and prints them out
+in several different styles.  You can show absolute values, or incremental values
+between ticks.
+.Sp
+You can switch between the views by pressing a key.  The 's' key prints a
+single line each time the screen updates, in the style of \fBvmstat\fR.  The 'g'
+key changes the view to a graph of the same numbers, sort of like \fBtload\fR.
+The 'v' key changes the view to a pivoted table of variable names on the left,
+with successive updates scrolling across the screen from left to right.  You can
+choose how many updates to put on the screen with the \*(L"num_status_sets\*(R"
+configuration variable.
+.Sp
+Headers may be abbreviated to fit on the screen in interactive operation.  You
+choose which variables to display with the 'c' key, which selects from
+predefined sets, or lets you create your own sets.  You can edit the current set
+with the 'e' key.
+.Sp
+This mode doesn't really display any tables like other modes.  Instead, it uses
+a table definition to extract and format the data, but it then transforms the
+result in special ways before outputting it.  It uses the \*(L"var_status\*(R" table
+definition for this.
+.IP "T: InnoDB Transactions" 4
+.IX Item "T: InnoDB Transactions"
+This mode shows transactions from the InnoDB monitor's output, in \fBtop\fR\-like
+format.  This mode is the reason I wrote innotop.
+.Sp
+You can kill queries or processes with the 'k' and 'x' keys, and \s-1EXPLAIN\s0 a query
+with the 'e' or 'f' keys.  InnoDB doesn't print the full query in transactions,
+so explaining may not work right if the query is truncated.
+.Sp
+The informational header can be toggled on and off with the 'h' key.  By
+default, innotop hides inactive transactions and its own transaction.  You can
+toggle this on and off with the 'i' and 'a' keys.
+.Sp
+This mode displays the \*(L"t_header\*(R" and \*(L"innodb_transactions\*(R" tables by
+default.
+.SH "INNOTOP STATUS"
+.IX Header "INNOTOP STATUS"
+The first line innotop displays is a \*(L"status bar\*(R" of sorts.  What it contains
+depends on the mode you're in, and what servers you're monitoring.  The first
+few words are always the innotop mode, such as \*(L"InnoDB Txns\*(R" for T mode,
+followed by a reminder to press '?' for help at any time.
+.Sh "\s-1ONE\s0 \s-1SERVER\s0"
+.IX Subsection "ONE SERVER"
+The simplest case is when you're monitoring a single server.  In this case, the
+name of the connection is next on the status line.  This is the name you gave
+when you created the connection \*(-- most likely the MySQL server's hostname.
+This is followed by the server's uptime.
+.PP
+If you're in an InnoDB mode, such as T or B, the next word is \*(L"InnoDB\*(R" followed
+by some information about the \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 output used to render the
+screen.  The first word is the number of seconds since the last \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0, which InnoDB uses to calculate some per-second statistics.  The next is
+a smiley face indicating whether the InnoDB output is truncated.  If the smiley
+face is a :\-), all is well; there is no truncation.  A :^| means the transaction
+list is so long, InnoDB has only printed out some of the transactions.  Finally,
+a frown :\-( means the output is incomplete, which is probably due to a deadlock
+printing too much lock information (see \*(L"D: InnoDB Deadlocks\*(R").
+.PP
+The next two words indicate the server's queries per second (\s-1QPS\s0) and how many
+threads (connections) exist.  Finally, the server's version number is the last
+thing on the line.
+.Sh "\s-1MULTIPLE\s0 \s-1SERVERS\s0"
+.IX Subsection "MULTIPLE SERVERS"
+If you are monitoring multiple servers (see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R"), the status
+line does not show any details about individual servers.  Instead, it shows the
+names of the connections that are active.  Again, these are connection names you
+specified, which are likely to be the server's hostname.  A connection that has
+an error is prefixed with an exclamation point.
+.PP
+If you are monitoring a group of servers (see \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R"), the status
+line shows the name of the group.  If any connection in the group has an
+error, the group's name is followed by the fraction of the connections that
+don't have errors.
+.PP
+See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R" for more details about innotop's error handling.
+.Sh "\s-1MONITORING\s0 A \s-1FILE\s0"
+.IX Subsection "MONITORING A FILE"
+If you give a filename on the command line, innotop will not connect to \s-1ANY\s0
+servers at all.  It will watch the specified file for InnoDB status output and
+use that as its data source.  It will always show a single connection called
+\&'file'.  And since it can't connect to a server, it can't determine how long the
+server it's monitoring has been up; so it calculates the server's uptime as time
+since innotop started running.
+.SH "SERVER ADMINISTRATION"
+.IX Header "SERVER ADMINISTRATION"
+While innotop is primarily a monitor that lets you watch and analyze your
+servers, it can also send commands to servers.  The most frequently useful
+commands are killing queries and stopping or starting slaves.
+.PP
+You can kill a connection, or in newer versions of MySQL kill a query but not a
+connection, from \*(L"Q: Query List\*(R" and \*(L"T: InnoDB Transactions\*(R" modes.
+Press 'k' to issue a \s-1KILL\s0 command, or 'x' to issue a \s-1KILL\s0 \s-1QUERY\s0 command.
+innotop will prompt you for the server and/or connection \s-1ID\s0 to kill (innotop
+does not prompt you if there is only one possible choice for any input).
+innotop pre-selects the longest-running query, or the oldest connection.
+Confirm the command with 'y'.
+.PP
+In \*(L"M: Master/Slave Replication Status\*(R" mode, you can start and stop slaves
+with the 'a' and 'o' keys, respectively.  You can send these commands to many
+slaves at once.  innotop fills in a default command of \s-1START\s0 \s-1SLAVE\s0 or \s-1STOP\s0 \s-1SLAVE\s0
+for you, but you can actually edit the command and send anything you wish, such
+as \s-1SET\s0 \s-1GLOBAL\s0 SQL_SLAVE_SKIP_COUNTER=1 to make the slave skip one binlog event
+when it starts.
+.PP
+You can also ask innotop to calculate the earliest binlog in use by any slave
+and issue a \s-1PURGE\s0 \s-1MASTER\s0 \s-1LOGS\s0 on the master.  Use the 'b' key for this.  innotop
+will prompt you for a master to run the command on, then prompt you for the
+connection names of that master's slaves (there is no way for innotop to
+determine this reliably itself).  innotop will find the minimum binlog in use by
+these slave connections and suggest it as the argument to \s-1PURGE\s0 \s-1MASTER\s0 \s-1LOGS\s0.
+.SH "SERVER CONNECTIONS"
+.IX Header "SERVER CONNECTIONS"
+When you create a server connection, innotop asks you for a series of inputs, as
+follows:
+.IP "\s-1DSN\s0" 4
+.IX Item "DSN"
+A \s-1DSN\s0 is a Data Source Name, which is the initial argument passed to the \s-1DBI\s0
+module for connecting to a server.  It is usually of the form
+.Sp
+.Vb 1
+\& DBI:mysql:;mysql_read_default_group=mysql;host=HOSTNAME
+.Ve
+.Sp
+Since this \s-1DSN\s0 is passed to the DBD::mysql driver, you should read the driver's
+documentation at \*(L"http://search.cpan.org/dist/DBD\-mysql/lib/DBD/mysql.pm\*(R" for
+the exact details on all the options you can pass the driver in the \s-1DSN\s0.  You
+can read more about \s-1DBI\s0 at <http://dbi.perl.org/docs/>, and especially at
+<http://search.cpan.org/~timb/DBI/DBI.pm>.
+.Sp
+The mysql_read_default_group=mysql option lets the \s-1DBD\s0 driver read your MySQL
+options files, such as ~/.my.cnf on UNIX-ish systems.  You can use this to avoid
+specifying a username or password for the connection.
+.IP "InnoDB Deadlock Table" 4
+.IX Item "InnoDB Deadlock Table"
+This optional item tells innotop a table name it can use to deliberately create
+a small deadlock (see \*(L"D: InnoDB Deadlocks\*(R").  If you specify this option,
+you just need to be sure the table doesn't exist, and that innotop can create
+and drop the table with the InnoDB storage engine.  You can safely omit or just
+accept the default if you don't intend to use this.
+.IP "Username" 4
+.IX Item "Username"
+innotop will ask you if you want to specify a username.  If you say 'y', it will
+then prompt you for a user name.  If you have a MySQL option file that specifies
+your username, you don't have to specify a username.
+.Sp
+The username defaults to your login name on the system you're running innotop on.
+.IP "Password" 4
+.IX Item "Password"
+innotop will ask you if you want to specify a password.  Like the username, the
+password is optional, but there's an additional prompt that asks if you want to
+save the password in the innotop configuration file.  If you don't save it in
+the configuration file, innotop will prompt you for a password each time it
+starts.  Passwords in the innotop configuration file are saved in plain text,
+not encrypted in any way.
+.PP
+Once you finish answering these questions, you should be connected to a server.
+But innotop isn't limited to monitoring a single server; you can define many
+server connections and switch between them by pressing the '@' key.  See
+\&\*(L"\s-1SWITCHING\s0 \s-1BETWEEN\s0 \s-1CONNECTIONS\s0\*(R".
+.PP
+To create a new connection, press the '@' key and type the name of the new
+connection, then follow the steps given above.
+.SH "SERVER GROUPS"
+.IX Header "SERVER GROUPS"
+If you have multiple MySQL instances, you can put them into named groups, such
+as 'all', 'masters', and 'slaves', which innotop can monitor all together.
+.PP
+You can choose which group to monitor with the '#' key, and you can press the
+\&\s-1TAB\s0 key to switch to the next group.  If you're not currently monitoring a
+group, pressing \s-1TAB\s0 selects the first group.
+.PP
+To create a group, press the '#' key and type the name of your new group, then
+type the names of the connections you want the group to contain.
+.SH "SWITCHING BETWEEN CONNECTIONS"
+.IX Header "SWITCHING BETWEEN CONNECTIONS"
+innotop lets you quickly switch which servers you're monitoring.  The most basic
+way is by pressing the '@' key and typing the name(s) of the connection(s) you
+want to use.  This setting is per\-mode, so you can monitor different connections
+in each mode, and innotop remembers which connections you choose.
+.PP
+You can quickly switch to the 'next' connection in alphabetical order with the
+\&'n' key.  If you're monitoring a server group (see \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R") this will
+switch to the first connection.
+.PP
+You can also type many connection names, and innotop will fetch and display data
+from them all.  Just separate the connection names with spaces, for example
+\&\*(L"server1 server2.\*(R"  Again, if you type the name of a connection that doesn't
+exist, innotop will prompt you for connection information and create the
+connection.
+.PP
+Another way to monitor multiple connections at once is with server groups.  You
+can use the \s-1TAB\s0 key to switch to the 'next' group in alphabetical order, or if
+you're not monitoring any groups, \s-1TAB\s0 will switch to the first group.
+.PP
+innotop does not fetch data in parallel from connections, so if you are
+monitoring a large group or many connections, you may notice increased delay
+between ticks.
+.PP
+When you monitor more than one connection, innotop's status bar changes.  See
+\&\*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R".
+.SH "ERROR HANDLING"
+.IX Header "ERROR HANDLING"
+Error handling is not that important when monitoring a single connection, but is
+crucial when you have many active connections.  A crashed server or lost
+connection should not crash innotop.  As a result, innotop will continue to run
+even when there is an error; it just won't display any information from the
+connection that had an error.  Because of this, innotop's behavior might confuse
+you.  It's a feature, not a bug!
+.PP
+innotop does not continue to query connections that have errors, because they
+may slow innotop and make it hard to use, especially if the error is a problem
+connecting and causes a long time\-out.  Instead, innotop retries the connection
+occasionally to see if the error still exists.  If so, it will wait until some
+point in the future before trying again.  The wait time increases in ticks as the
+Fibonacci series, so it tries less frequently as time passes.
+.PP
+Since errors might only happen in certain modes because of the \s-1SQL\s0 commands
+issued in those modes, innotop keeps track of which mode caused the error.  If
+you switch to a different mode, innotop will retry the connection instead of
+waiting.
+.PP
+By default innotop will display the problem in red text at the bottom of the
+first table on the screen.  You can disable this behavior with the
+\&\*(L"show_cxn_errors_in_tbl\*(R" configuration option, which is enabled by default.
+If the \*(L"debug\*(R" option is enabled, innotop will display the error at the
+bottom of every table, not just the first.  And if \*(L"show_cxn_errors\*(R" is
+enabled, innotop will print the error text to \s-1STDOUT\s0 as well.  Error messages
+might only display in the mode that caused the error, depending on the mode and
+whether innotop is avoiding querying that connection.
+.SH "NON-INTERACTIVE OPERATION"
+.IX Header "NON-INTERACTIVE OPERATION"
+You can run innotop in non-interactive mode, in which case it is entirely
+controlled from the configuration file and command-line options.  To start
+innotop in non-interactive mode, give the \*(L"\-\-nonint\*(R" command-line option.
+This changes innotop's behavior in the following ways:
+.IP "\(bu" 4
+Certain Perl modules are not loaded.  Term::Readline is not loaded, since
+innotop doesn't prompt interactively.  Term::ANSIColor and Win32::Console::ANSI
+modules are not loaded.  Term::ReadKey is still used, since innotop may have to
+prompt for connection passwords when starting up.
+.IP "\(bu" 4
+innotop does not clear the screen after each tick.
+.IP "\(bu" 4
+innotop does not persist any changes to the configuration file.
+.IP "\(bu" 4
+If \*(L"\-\-count\*(R" is given and innotop is in incremental mode (see \*(L"status_inc\*(R"
+and \*(L"\-\-inc\*(R"), innotop actually refreshes one more time than specified so it
+can print incremental statistics.  This suppresses output during the first
+tick, so innotop may appear to hang.
+.IP "\(bu" 4
+innotop only displays the first table in each mode.  This is so the output can
+be easily processed with other command-line utilities such as awk and sed.  To
+change which tables display in each mode, see \*(L"\s-1TABLES\s0\*(R".  Since \*(L"Q: Query List\*(R" mode is so important, innotop automatically disables the \*(L"q_header\*(R"
+table.  This ensures you'll see the \*(L"processlist\*(R" table, even if you have
+innotop configured to show the q_header table during interactive operation.
+Similarly, in \*(L"T: InnoDB Transactions\*(R" mode, the \*(L"t_header\*(R" table is
+suppressed so you see only the \*(L"innodb_transactions\*(R" table.
+.IP "\(bu" 4
+All output is tab-separated instead of being column-aligned with whitespace, and
+innotop prints the full contents of each table instead of only printing one
+screenful at a time.
+.IP "\(bu" 4
+innotop only prints column headers once instead of every tick (see
+\&\*(L"hide_hdr\*(R").  innotop does not print table captions (see
+\&\*(L"display_table_captions\*(R").  innotop ensures there are no empty lines in the
+output.
+.IP "\(bu" 4
+innotop does not honor the \*(L"shorten\*(R" transformation, which normally shortens
+some numbers to human-readable formats.
+.IP "\(bu" 4
+innotop does not print a status line (see \*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R").
+.SH "CONFIGURING"
+.IX Header "CONFIGURING"
+Nearly everything about innotop is configurable.  Most things are possible to
+change with built-in commands, but you can also edit the configuration file.
+.PP
+While running innotop, press the '$' key to bring up the configuration editing
+dialog.  Press another key to select the type of data you want to edit:
+.IP "S: Statement Sleep Times" 4
+.IX Item "S: Statement Sleep Times"
+Edits \s-1SQL\s0 statement sleep delays, which make innotop pause for the specified
+amount of time after executing a statement.  See \*(L"\s-1SQL\s0 \s-1STATEMENTS\s0\*(R" for a
+definition of each statement and what it does.  By default innotop does not
+delay after any statements.
+.Sp
+This feature is included so you can customize the side-effects caused by
+monitoring your server.  You may not see any effects, but some innotop users
+have noticed that certain MySQL versions under very high load with InnoDB
+enabled take longer than usual to execute \s-1SHOW\s0 \s-1GLOBAL\s0 \s-1STATUS\s0.  If innotop calls
+\&\s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0 immediately afterward, the processlist contains more
+queries than the machine actually averages at any given moment.  Configuring
+innotop to pause briefly after calling \s-1SHOW\s0 \s-1GLOBAL\s0 \s-1STATUS\s0 alleviates this
+effect.
+.Sp
+Sleep times are stored in the \*(L"stmt_sleep_times\*(R" section of the configuration
+file.  Fractional-second sleeps are supported, subject to your hardware's
+limitations.
+.IP "c: Edit Columns" 4
+.IX Item "c: Edit Columns"
+Starts the table editor on one of the displayed tables.  See \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R".
+An alternative way to start the table editor without entering the configuration
+dialog is with the '^' key.
+.IP "g: General Configuration" 4
+.IX Item "g: General Configuration"
+Starts the configuration editor to edit global and mode-specific configuration
+variables (see \*(L"\s-1MODES\s0\*(R").  innotop prompts you to choose a variable from among
+the global and mode-specific ones depending on the current mode.
+.IP "k: Row-Coloring Rules" 4
+.IX Item "k: Row-Coloring Rules"
+Starts the row-coloring rules editor on one of the displayed table(s).  See
+\&\*(L"\s-1COLORS\s0\*(R" for details.
+.IP "p: Manage Plugins" 4
+.IX Item "p: Manage Plugins"
+Starts the plugin configuration editor.  See \*(L"\s-1PLUGINS\s0\*(R" for details.
+.IP "s: Server Groups" 4
+.IX Item "s: Server Groups"
+Lets you create and edit server groups.  See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "t: Choose Displayed Tables" 4
+.IX Item "t: Choose Displayed Tables"
+Lets you choose which tables to display in this mode.  See \*(L"\s-1MODES\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R".
+.SH "CONFIGURATION FILE"
+.IX Header "CONFIGURATION FILE"
+innotop's default configuration file location is in \f(CW$HOME\fR/.innotop, but can be
+overridden with the \*(L"\-\-config\*(R" command-line option.  You can edit it by hand
+safely.  innotop reads the configuration file when it starts, and writes it out
+again when it exits, so any changes you make to the file by hand while innotop
+is running will be lost.
+.PP
+innotop doesn't store its entire configuration in the configuration file.  It
+has a huge set of default configuration that it holds only in memory, and the
+configuration file only overrides these defaults.  When you customize a default
+setting, innotop notices, and then stores the customizations into the file.
+This keeps the file size down, makes it easier to edit, and makes upgrades
+easier.
+.PP
+A configuration file can be made read\-only.  See \*(L"readonly\*(R".
+.PP
+The configuration file is arranged into sections like an \s-1INI\s0 file.  Each
+section begins with [section\-name] and ends with [/section\-name].  Each
+section's entries have a different syntax depending on the data they need to
+store.  You can put comments in the file; any line that begins with a #
+character is a comment.  innotop will not read the comments, so it won't write
+them back out to the file when it exits.  Comments in read-only configuration
+files are still useful, though.
+.PP
+The first line in the file is innotop's version number.  This lets innotop
+notice when the file format is not backwards\-compatible, and upgrade smoothly
+without destroying your customized configuration.
+.PP
+The following list describes each section of the configuration file and the data
+it contains:
+.IP "general" 4
+.IX Item "general"
+The 'general' section contains global configuration variables and variables that
+may be mode\-specific, but don't belong in any other section.  The syntax is a
+simple key=value list.  innotop writes a comment above each value to help you
+edit the file by hand.
+.RS 4
+.IP "S_func" 4
+.IX Item "S_func"
+Controls S mode presentation (see \*(L"S: Variables & Status\*(R").  If g, values are
+graphed; if s, values are like vmstat; if v, values are in a pivoted table.
+.IP "S_set" 4
+.IX Item "S_set"
+Specifies which set of variables to display in \*(L"S: Variables & Status\*(R" mode.
+See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.IP "auto_wipe_dl" 4
+.IX Item "auto_wipe_dl"
+Instructs innotop to automatically wipe large deadlocks when it notices them.
+When this happens you may notice a slight delay.  At the next tick, you will
+usually see the information that was being truncated by the large deadlock.
+.IP "charset" 4
+.IX Item "charset"
+Specifies what kind of characters to allow through the \*(L"no_ctrl_char\*(R"
+transformation.  This keeps non-printable characters from confusing a
+terminal when you monitor queries that contain binary data, such as images.
+.Sp
+The default is 'ascii', which considers anything outside normal \s-1ASCII\s0 to be a
+control character.  The other allowable values are 'unicode' and 'none'.  'none'
+considers every character a control character, which can be useful for
+collapsing \s-1ALL\s0 text fields in queries.
+.IP "cmd_filter" 4
+.IX Item "cmd_filter"
+This is the prefix that filters variables in \*(L"C: Command Summary\*(R" mode.
+.IP "color" 4
+.IX Item "color"
+Whether terminal coloring is permitted.
+.IP "cxn_timeout" 4
+.IX Item "cxn_timeout"
+On MySQL versions 4.0.3 and newer, this variable is used to set the connection's
+timeout, so MySQL doesn't close the connection if it is not used for a while.
+This might happen because a connection isn't monitored in a particular mode, for
+example.
+.IP "debug" 4
+.IX Item "debug"
+This option enables more verbose errors and makes innotop more strict in some
+places.  It can help in debugging filters and other user-defined code.  It also
+makes innotop write a lot of information to \*(L"debugfile\*(R" when there is a
+crash.
+.IP "debugfile" 4
+.IX Item "debugfile"
+A file to which innotop will write information when there is a crash.  See
+\&\*(L"\s-1FILES\s0\*(R".
+.IP "display_table_captions" 4
+.IX Item "display_table_captions"
+innotop displays a table caption above most tables.  This variable suppresses or
+shows captions on all tables globally.  Some tables are configured with the
+hide_caption property, which overrides this.
+.IP "global" 4
+.IX Item "global"
+Whether to show \s-1GLOBAL\s0 variables and status.  innotop only tries to do this on
+servers which support the \s-1GLOBAL\s0 option to \s-1SHOW\s0 \s-1VARIABLES\s0 and \s-1SHOW\s0 \s-1STATUS\s0.  In
+some MySQL versions, you need certain privileges to do this; if you don't have
+them, innotop will not be able to fetch any variable and status data.  This
+configuration variable lets you run innotop and fetch what data you can even
+without the elevated privileges.
+.Sp
+I can no longer find or reproduce the situation where \s-1GLOBAL\s0 wasn't allowed, but
+I know there was one.
+.IP "graph_char" 4
+.IX Item "graph_char"
+Defines the character to use when drawing graphs in \*(L"S: Variables & Status\*(R"
+mode.
+.IP "header_highlight" 4
+.IX Item "header_highlight"
+Defines how to highlight column headers.  This only works if Term::ANSIColor is
+available.  Valid values are 'bold' and 'underline'.
+.IP "hide_hdr" 4
+.IX Item "hide_hdr"
+Hides column headers globally.
+.IP "interval" 4
+.IX Item "interval"
+The interval at which innotop will refresh its data (ticks).  The interval is
+implemented as a sleep time between ticks, so the true interval will vary
+depending on how long it takes innotop to fetch and render data.
+.Sp
+This variable accepts fractions of a second.
+.IP "mode" 4
+.IX Item "mode"
+The mode in which innotop should start.  Allowable arguments are the same as the
+key presses that select a mode interactively.  See \*(L"\s-1MODES\s0\*(R".
+.IP "num_digits" 4
+.IX Item "num_digits"
+How many digits to show in fractional numbers and percents.  This variable's
+range is between 0 and 9 and can be set directly from \*(L"S: Variables & Status\*(R"
+mode with the '+' and '\-' keys.  It is used in the \*(L"set_precision\*(R",
+\&\*(L"shorten\*(R", and \*(L"percent\*(R" transformations.
+.IP "num_status_sets" 4
+.IX Item "num_status_sets"
+Controls how many sets of status variables to display in pivoted \*(L"S: Variables & Status\*(R" mode.  It also controls the number of old sets of variables innotop
+keeps in its memory, so the larger this variable is, the more memory innotop
+uses.
+.IP "plugin_dir" 4
+.IX Item "plugin_dir"
+Specifies where plugins can be found.  By default, innotop stores plugins in the
+\&'plugins' subdirectory of your innotop configuration directory.
+.IP "readonly" 4
+.IX Item "readonly"
+Whether the configuration file is readonly.  This cannot be set interactively,
+because it would prevent itself from being written to the configuration file.
+.IP "show_cxn_errors" 4
+.IX Item "show_cxn_errors"
+Makes innotop print connection errors to \s-1STDOUT\s0.  See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R".
+.IP "show_cxn_errors_in_tbl" 4
+.IX Item "show_cxn_errors_in_tbl"
+Makes innotop display connection errors as rows in the first table on screen.
+See \*(L"\s-1ERROR\s0 \s-1HANDLING\s0\*(R".
+.IP "show_percent" 4
+.IX Item "show_percent"
+Adds a '%' character after the value returned by the \*(L"percent\*(R"
+transformation.
+.IP "show_statusbar" 4
+.IX Item "show_statusbar"
+Controls whether to show the status bar in the display.  See \*(L"\s-1INNOTOP\s0 \s-1STATUS\s0\*(R".
+.IP "skip_innodb" 4
+.IX Item "skip_innodb"
+Disables fetching \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, in case your server(s) do not have InnoDB
+enabled and you don't want innotop to try to fetch it.  This can also be useful
+when you don't have the \s-1SUPER\s0 privilege, required to run \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "status_inc" 4
+.IX Item "status_inc"
+Whether to show absolute or incremental values for status variables.
+Incremental values are calculated as an offset from the last value innotop saw
+for that variable.  This is a global setting, but will probably become
+mode-specific at some point.  Right now it is honored a bit inconsistently; some
+modes don't pay attention to it.
+.RE
+.RS 4
+.RE
+.IP "plugins" 4
+.IX Item "plugins"
+This section holds a list of package names of active plugins.  If the plugin
+exists, innotop will activate it.  See \*(L"\s-1PLUGINS\s0\*(R" for more information.
+.IP "filters" 4
+.IX Item "filters"
+This section holds user-defined filters (see \*(L"\s-1FILTERS\s0\*(R").  Each line is in the
+format filter_name=text='filter text' tbls='table list'.
+.Sp
+The filter text is the text of the subroutine's code.  The table list is a list
+of tables to which the filter can apply.  By default, user-defined filters apply
+to the table for which they were created, but you can manually override that by
+editing the definition in the configuration file.
+.IP "active_filters" 4
+.IX Item "active_filters"
+This section stores which filters are active on each table.  Each line is in the
+format table_name=filter_list.
+.IP "tbl_meta" 4
+.IX Item "tbl_meta"
+This section stores user-defined or user-customized columns (see \*(L"\s-1COLUMNS\s0\*(R").
+Each line is in the format col_name=properties, where the properties are a
+name=quoted\-value list.
+.IP "connections" 4
+.IX Item "connections"
+This section holds the server connections you have defined.  Each line is in the
+format name=properties, where the properties are a name=value list.  The
+properties are self\-explanatory, and the only one that is treated specially is
+\&'pass' which is only present if 'savepass' is set.  See \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R".
+.IP "active_connections" 4
+.IX Item "active_connections"
+This section holds a list of which connections are active in each mode.  Each
+line is in the format mode_name=connection_list.
+.IP "server_groups" 4
+.IX Item "server_groups"
+This section holds server groups.  Each line is in the format
+name=connection_list.  See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "active_server_groups" 4
+.IX Item "active_server_groups"
+This section holds a list of which server group is active in each mode.  Each
+line is in the format mode_name=server_group.
+.IP "max_values_seen" 4
+.IX Item "max_values_seen"
+This section holds the maximum values seen for variables.  This is used to scale
+the graphs in \*(L"S: Variables & Status\*(R" mode.  Each line is in the format
+name=value.
+.IP "active_columns" 4
+.IX Item "active_columns"
+This section holds table column lists.  Each line is in the format
+tbl_name=column_list.  See \*(L"\s-1COLUMNS\s0\*(R".
+.IP "sort_cols" 4
+.IX Item "sort_cols"
+This section holds the sort definition.  Each line is in the format
+tbl_name=column_list.  If a column is prefixed with '\-', that column sorts
+descending.  See \*(L"\s-1SORTING\s0\*(R".
+.IP "visible_tables" 4
+.IX Item "visible_tables"
+This section defines which tables are visible in each mode.  Each line is in the
+format mode_name=table_list.  See \*(L"\s-1TABLES\s0\*(R".
+.IP "varsets" 4
+.IX Item "varsets"
+This section defines variable sets for use in \*(L"S: Variables & Status\*(R" mode.
+Each line is in the format name=variable_list.  See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.IP "colors" 4
+.IX Item "colors"
+This section defines colorization rules.  Each line is in the format
+tbl_name=property_list.  See \*(L"\s-1COLORS\s0\*(R".
+.IP "stmt_sleep_times" 4
+.IX Item "stmt_sleep_times"
+This section contains statement sleep times.  Each line is in the format
+statement_name=sleep_time.  See \*(L"S: Statement Sleep Times\*(R".
+.IP "group_by" 4
+.IX Item "group_by"
+This section contains column lists for table group_by expressions.  Each line is
+in the format tbl_name=column_list.  See \*(L"\s-1GROUPING\s0\*(R".
+.SH "CUSTOMIZING"
+.IX Header "CUSTOMIZING"
+You can customize innotop a great deal.  For example, you can:
+.IP "\(bu" 4
+Choose which tables to display, and in what order.
+.IP "\(bu" 4
+Choose which columns are in those tables, and create new columns.
+.IP "\(bu" 4
+Filter which rows display with built-in filters, user-defined filters, and
+quick\-filters.
+.IP "\(bu" 4
+Sort the rows to put important data first or group together related rows.
+.IP "\(bu" 4
+Highlight rows with color.
+.IP "\(bu" 4
+Customize the alignment, width, and formatting of columns, and apply
+transformations to columns to extract parts of their values or format the values
+as you wish (for example, shortening large numbers to familiar units).
+.IP "\(bu" 4
+Design your own expressions to extract and combine data as you need.  This gives
+you unlimited flexibility.
+.PP
+All these and more are explained in the following sections.
+.Sh "\s-1TABLES\s0"
+.IX Subsection "TABLES"
+A table is what you'd expect: a collection of columns.  It also has some other
+properties, such as a caption.  Filters, sorting rules, and colorization rules
+belong to tables and are covered in later sections.
+.PP
+Internally, table meta-data is defined in a data structure called \f(CW%tbl_meta\fR.
+This hash holds all built-in table definitions, which contain a lot of default
+instructions to innotop.  The meta-data includes the caption, a list of columns
+the user has customized, a list of columns, a list of visible columns, a list of
+filters, color rules, a sort-column list, sort direction, and some information
+about the table's data sources.  Most of this is customizable via the table
+editor (see \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R").
+.PP
+You can choose which tables to show by pressing the '$' key.  See \*(L"\s-1MODES\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R".
+.PP
+The table life-cycle is as follows:
+.IP "\(bu" 4
+Each table begins with a data source, which is an array of hashes.  See below
+for details on data sources.
+.IP "\(bu" 4
+Each element of the data source becomes a row in the final table.
+.IP "\(bu" 4
+For each element in the data source, innotop extracts values from the source and
+creates a row.  This row is another hash, which later steps will refer to as
+\&\f(CW$set\fR.  The values innotop extracts are determined by the table's columns.  Each
+column has an extraction subroutine, compiled from an expression (see
+\&\*(L"\s-1EXPRESSIONS\s0\*(R").  The resulting row is a hash whose keys are named the same as
+the column name.
+.IP "\(bu" 4
+innotop filters the rows, removing those that don't need to be displayed.  See
+\&\*(L"\s-1FILTERS\s0\*(R".
+.IP "\(bu" 4
+innotop sorts the rows.  See \*(L"\s-1SORTING\s0\*(R".
+.IP "\(bu" 4
+innotop groups the rows together, if specified.  See \*(L"\s-1GROUPING\s0\*(R".
+.IP "\(bu" 4
+innotop colorizes the rows.  See \*(L"\s-1COLORS\s0\*(R".
+.IP "\(bu" 4
+innotop transforms the column values in each row.  See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "\(bu" 4
+innotop optionally pivots the rows (see \*(L"\s-1PIVOTING\s0\*(R"), then filters and sorts
+them.
+.IP "\(bu" 4
+innotop formats and justifies the rows as a table.  During this step, innotop
+applies further formatting to the column values, including alignment, maximum
+and minimum widths.  innotop also does final error checking to ensure there are
+no crashes due to undefined values.  innotop then adds a caption if specified,
+and the table is ready to print.
+.PP
+The lifecycle is slightly different if the table is pivoted, as noted above.  To
+clarify, if the table is pivoted, the process is extract, group, transform,
+pivot, filter, sort, create.  If it's not pivoted, the process is extract,
+filter, sort, group, color, transform, create.  This slightly convoluted process
+doesn't map all that well to \s-1SQL\s0, but pivoting complicates things pretty
+thoroughly.  Roughly speaking, filtering and sorting happen as late as needed to
+effect the final result as you might expect, but as early as possible for
+efficiency.
+.PP
+Each built-in table is described below:
+.IP "adaptive_hash_index" 4
+.IX Item "adaptive_hash_index"
+Displays data about InnoDB's adaptive hash index.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "buffer_pool" 4
+.IX Item "buffer_pool"
+Displays data about InnoDB's buffer pool.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "cmd_summary" 4
+.IX Item "cmd_summary"
+Displays weighted status variables.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "deadlock_locks" 4
+.IX Item "deadlock_locks"
+Shows which locks were held and waited for by the last detected deadlock.  Data
+source: \*(L"\s-1DEADLOCK_LOCKS\s0\*(R".
+.IP "deadlock_transactions" 4
+.IX Item "deadlock_transactions"
+Shows transactions involved in the last detected deadlock.  Data source:
+\&\*(L"\s-1DEADLOCK_TRANSACTIONS\s0\*(R".
+.IP "explain" 4
+.IX Item "explain"
+Shows the output of \s-1EXPLAIN\s0.  Data source: \*(L"\s-1EXPLAIN\s0\*(R".
+.IP "file_io_misc" 4
+.IX Item "file_io_misc"
+Displays data about InnoDB's file and I/O operations.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "fk_error" 4
+.IX Item "fk_error"
+Displays various data about InnoDB's last foreign key error.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "innodb_locks" 4
+.IX Item "innodb_locks"
+Displays InnoDB locks.  Data source: \*(L"\s-1INNODB_LOCKS\s0\*(R".
+.IP "innodb_transactions" 4
+.IX Item "innodb_transactions"
+Displays data about InnoDB's current transactions.  Data source:
+\&\*(L"\s-1INNODB_TRANSACTIONS\s0\*(R".
+.IP "insert_buffers" 4
+.IX Item "insert_buffers"
+Displays data about InnoDB's insert buffer.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "io_threads" 4
+.IX Item "io_threads"
+Displays data about InnoDB's I/O threads.  Data source: \*(L"\s-1IO_THREADS\s0\*(R".
+.IP "log_statistics" 4
+.IX Item "log_statistics"
+Displays data about InnoDB's logging system.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "master_status" 4
+.IX Item "master_status"
+Displays replication master status.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "open_tables" 4
+.IX Item "open_tables"
+Displays open tables.  Data source: \*(L"\s-1OPEN_TABLES\s0\*(R".
+.IP "page_statistics" 4
+.IX Item "page_statistics"
+Displays InnoDB page statistics.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "pending_io" 4
+.IX Item "pending_io"
+Displays InnoDB pending I/O operations.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "processlist" 4
+.IX Item "processlist"
+Displays current MySQL processes (threads/connections).  Data source:
+\&\*(L"\s-1PROCESSLIST\s0\*(R".
+.IP "q_header" 4
+.IX Item "q_header"
+Displays various status values.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "row_operation_misc" 4
+.IX Item "row_operation_misc"
+Displays data about InnoDB's row operations.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "row_operations" 4
+.IX Item "row_operations"
+Displays data about InnoDB's row operations.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "semaphores" 4
+.IX Item "semaphores"
+Displays data about InnoDB's semaphores and mutexes.  Data source:
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "slave_io_status" 4
+.IX Item "slave_io_status"
+Displays data about the slave I/O thread.  Data source: 
+\&\*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "slave_sql_status" 4
+.IX Item "slave_sql_status"
+Displays data about the slave \s-1SQL\s0 thread.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "t_header" 4
+.IX Item "t_header"
+Displays various InnoDB status values.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "var_status" 4
+.IX Item "var_status"
+Displays user-configurable data.  Data source: \*(L"\s-1STATUS_VARIABLES\s0\*(R".
+.IP "wait_array" 4
+.IX Item "wait_array"
+Displays data about InnoDB's \s-1OS\s0 wait array.  Data source: \*(L"\s-1OS_WAIT_ARRAY\s0\*(R".
+.Sh "\s-1COLUMNS\s0"
+.IX Subsection "COLUMNS"
+Columns belong to tables.  You can choose a table's columns by pressing the '^'
+key, which starts the \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R" and lets you choose and edit columns.
+Pressing 'e' from within the table editor lets you edit the column's properties:
+.IP "\(bu" 4
+hdr: a column header.  This appears in the first row of the table.
+.IP "\(bu" 4
+just: justification.  '\-' means left-justified and '' means right\-justified,
+just as with printf formatting codes (not a coincidence).
+.IP "\(bu" 4
+dec: whether to further align the column on the decimal point.
+.IP "\(bu" 4
+num: whether the column is numeric.  This affects how values are sorted
+(lexically or numerically).
+.IP "\(bu" 4
+label: a small note about the column, which appears in dialogs that help the
+user choose columns.
+.IP "\(bu" 4
+src: an expression that innotop uses to extract the column's data from its
+source (see \*(L"\s-1DATA\s0 \s-1SOURCES\s0\*(R").  See \*(L"\s-1EXPRESSIONS\s0\*(R" for more on expressions.
+.IP "\(bu" 4
+minw: specifies a minimum display width.  This helps stabilize the display,
+which makes it easier to read if the data is changing frequently.
+.IP "\(bu" 4
+maxw: similar to minw.
+.IP "\(bu" 4
+trans: a list of column transformations.  See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "\(bu" 4
+agg: an aggregate function.  See \*(L"\s-1GROUPING\s0\*(R".  The default is \*(L"first\*(R".
+.IP "\(bu" 4
+aggonly: controls whether the column only shows when grouping is enabled on the
+table (see \*(L"\s-1GROUPING\s0\*(R").  By default, this is disabled.  This means columns
+will always be shown by default, whether grouping is enabled or not.  If a
+column's aggonly is set true, the column will appear when you toggle grouping on
+the table.  Several columns are set this way, such as the count column on
+\&\*(L"processlist\*(R" and \*(L"innodb_transactions\*(R", so you don't see a count when the
+grouping isn't enabled, but you do when it is.
+.Sh "\s-1FILTERS\s0"
+.IX Subsection "FILTERS"
+Filters remove rows from the display.  They behave much like a \s-1WHERE\s0 clause in
+\&\s-1SQL\s0.  innotop has several built-in filters, which remove irrelevant information
+like inactive queries, but you can define your own as well.  innotop also lets
+you create quick\-filters, which do not get saved to the configuration file, and
+are just an easy way to quickly view only some rows.
+.PP
+You can enable or disable a filter on any table.  Press the '%' key (mnemonic: %
+looks kind of like a line being filtered between two circles) and choose which
+table you want to filter, if asked.  You'll then see a list of possible filters
+and a list of filters currently enabled for that table.  Type the names of
+filters you want to apply and press Enter.
+.PP
+\fIUSER-DEFINED \s-1FILTERS\s0\fR
+.IX Subsection "USER-DEFINED FILTERS"
+.PP
+If you type a name that doesn't exist, innotop will prompt you to create the
+filter.  Filters are easy to create if you know Perl, and not hard if you don't.
+What you're doing is creating a subroutine that returns true if the row should
+be displayed.  The row is a hash reference passed to your subroutine as \f(CW$set\fR.
+.PP
+For example, imagine you want to filter the processlist table so you only see
+queries that have been running more than five minutes.  Type a new name for your
+filter, and when prompted for the subroutine body, press \s-1TAB\s0 to initiate your
+terminal's auto\-completion.  You'll see the names of the columns in the
+\&\*(L"processlist\*(R" table (innotop generally tries to help you with auto-completion
+lists).  You want to filter on the 'time' column.  Type the text \*(L"$set\->{time} >
+300\*(R" to return true when the query is more than five minutes old.  That's all
+you need to do.
+.PP
+In other words, the code you're typing is surrounded by an implicit context,
+which looks like this:
+.PP
+.Vb 4
+\& sub filter {
+\&    my ( $set ) = @_;
+\&    # YOUR CODE HERE
+\& }
+.Ve
+.PP
+If your filter doesn't work, or if something else suddenly behaves differently,
+you might have made an error in your filter, and innotop is silently catching
+the error.  Try enabling \*(L"debug\*(R" to make innotop throw an error instead.
+.PP
+\fIQUICK-FILTERS\fR
+.IX Subsection "QUICK-FILTERS"
+.PP
+innotop's quick-filters are a shortcut to create a temporary filter that doesn't
+persist when you restart innotop.  To create a quick\-filter, press the '/' key.
+innotop will prompt you for the column name and filter text.  Again, you can use
+auto-completion on column names.  The filter text can be just the text you want
+to \*(L"search for.\*(R"  For example, to filter the \*(L"processlist\*(R" table on queries
+that refer to the products table, type '/' and then 'info product'.
+.PP
+The filter text can actually be any Perl regular expression, but of course a
+literal string like 'product' works fine as a regular expression.
+.PP
+Behind the scenes innotop compiles the quick-filter into a specially tagged
+filter that is otherwise like any other filter.  It just isn't saved to the
+configuration file.
+.PP
+To clear quick\-filters, press the '\e' key and innotop will clear them all at
+once.
+.Sh "\s-1SORTING\s0"
+.IX Subsection "SORTING"
+innotop has sensible built-in defaults to sort the most important rows to the
+top of the table.  Like anything else in innotop, you can customize how any
+table is sorted.
+.PP
+To start the sort dialog, start the \*(L"\s-1TABLE\s0 \s-1EDITOR\s0\*(R" with the '^' key, choose a
+table if necessary, and press the 's' key.  You'll see a list of columns you can
+use in the sort expression and the current sort expression, if any.  Enter a
+list of columns by which you want to sort and press Enter.  If you want to
+reverse sort, prefix the column name with a minus sign.  For example, if you
+want to sort by column a ascending, then column b descending, type 'a \-b'.  You
+can also explicitly add a + in front of columns you want to sort ascending, but
+it's not required.
+.PP
+Some modes have keys mapped to open this dialog directly, and to quickly reverse
+sort direction.  Press '?' as usual to see which keys are mapped in any mode.
+.Sh "\s-1GROUPING\s0"
+.IX Subsection "GROUPING"
+innotop can group, or aggregate, rows together (I use the terms
+interchangeably).  This is quite similar to an \s-1SQL\s0 \s-1GROUP\s0 \s-1BY\s0 clause.  You can
+specify to group on certain columns, or if you don't specify any, the entire set
+of rows is treated as one group.  This is quite like \s-1SQL\s0 so far, but unlike \s-1SQL\s0,
+you can also select un-grouped columns.  innotop actually aggregates every
+column.  If you don't explicitly specify a grouping function, the default is
+\&'first'.  This is basically a convenience so you don't have to specify an
+aggregate function for every column you want in the result.
+.PP
+You can quickly toggle grouping on a table with the '=' key, which toggles its
+aggregate property.  This property doesn't persist to the config file.
+.PP
+The columns by which the table is grouped are specified in its group_by
+property.  When you turn grouping on, innotop places the group_by columns at the
+far left of the table, even if they're not supposed to be visible.  The rest of
+the visible columns appear in order after them.
+.PP
+Two tables have default group_by lists and a count column built in:
+\&\*(L"processlist\*(R" and \*(L"innodb_transactions\*(R".  The grouping is by connection
+and status, so you can quickly see how many queries or transactions are in a
+given status on each server you're monitoring.  The time columns are aggregated
+as a sum; other columns are left at the default 'first' aggregation.
+.PP
+By default, the table shown in \*(L"S: Variables & Status\*(R" mode also uses
+grouping so you can monitor variables and status across many servers.  The
+default aggregation function in this mode is 'avg'.
+.PP
+Valid grouping functions are defined in the \f(CW%agg_funcs\fR hash.  They include
+.IP "first" 4
+.IX Item "first"
+Returns the first element in the group.
+.IP "count" 4
+.IX Item "count"
+Returns the number of elements in the group, including undefined elements, much
+like \s-1SQL\s0's \s-1COUNT\s0(*).
+.IP "avg" 4
+.IX Item "avg"
+Returns the average of defined elements in the group.
+.IP "sum" 4
+.IX Item "sum"
+Returns the sum of elements in the group.
+.PP
+Here's an example of grouping at work.  Suppose you have a very busy server with
+hundreds of open connections, and you want to see how many connections are in
+what status.  Using the built-in grouping rules, you can press 'Q' to enter
+\&\*(L"Q: Query List\*(R" mode.  Press '=' to toggle grouping (if necessary, select the
+\&\*(L"processlist\*(R" table when prompted).
+.PP
+Your display might now look like the following:
+.PP
+.Vb 1
+\& Query List (? for help) localhost, 32:33, 0.11 QPS, 1 thd, 5.0.38\-log
+.Ve
+.PP
+.Vb 5
+\& CXN        Cmd        Cnt  ID      User   Host           Time   Query       
+\& localhost  Query      49    12933  webusr localhost      19:38  SELECT * FROM
+\& localhost  Sending Da 23     2383  webusr localhost      12:43  SELECT col1,
+\& localhost  Sleep      120     140  webusr localhost    5:18:12
+\& localhost  Statistics 12    19213  webusr localhost      01:19  SELECT * FROM
+.Ve
+.PP
+That's actually quite a worrisome picture.  You've got a lot of idle connections
+(Sleep), and some connections executing queries (Query and Sending Data).
+That's okay, but you also have a lot in Statistics status, collectively spending
+over a minute.  That means the query optimizer is having a really hard time
+optimizing your statements.  Something is wrong; it should normally take
+milliseconds to optimize queries.  You might not have seen this pattern if you
+didn't look at your connections in aggregate.  (This is a made-up example, but
+it can happen in real life).
+.Sh "\s-1PIVOTING\s0"
+.IX Subsection "PIVOTING"
+innotop can pivot a table for more compact display, similar to a Pivot Table in
+a spreadsheet (also known as a crosstab).  Pivoting a table makes columns into
+rows.  Assume you start with this table:
+.PP
+.Vb 4
+\& foo bar
+\& === ===
+\& 1   3
+\& 2   4
+.Ve
+.PP
+After pivoting, the table will look like this:
+.PP
+.Vb 4
+\& name set0 set1
+\& ==== ==== ====
+\& foo  1    2
+\& bar  3    4
+.Ve
+.PP
+To get reasonable results, you might need to group as well as pivoting.
+innotop currently does this for \*(L"S: Variables & Status\*(R" mode.
+.Sh "\s-1COLORS\s0"
+.IX Subsection "COLORS"
+By default, innotop highlights rows with color so you can see at a glance which
+rows are more important.  You can customize the colorization rules and add your
+own to any table.  Open the table editor with the '^' key, choose a table if
+needed, and press 'o' to open the color editor dialog.
+.PP
+The color editor dialog displays the rules applied to the table, in the order
+they are evaluated.  Each row is evaluated against each rule to see if the rule
+matches the row; if it does, the row gets the specified color, and no further
+rules are evaluated.  The rules look like the following:
+.PP
+.Vb 9
+\& state  eq  Locked       black on_red
+\& cmd    eq  Sleep        white       
+\& user   eq  system user  white       
+\& cmd    eq  Connect      white       
+\& cmd    eq  Binlog Dump  white       
+\& time   >   600          red         
+\& time   >   120          yellow      
+\& time   >   60           green       
+\& time   >   30           cyan
+.Ve
+.PP
+This is the default rule set for the \*(L"processlist\*(R" table.  In order of
+priority, these rules make locked queries black on a red background, \*(L"gray out\*(R"
+connections from replication and sleeping queries, and make queries turn from
+cyan to red as they run longer.
+.PP
+(For some reason, the \s-1ANSI\s0 color code \*(L"white\*(R" is actually a light gray.  Your
+terminal's display may vary; experiment to find colors you like).
+.PP
+You can use keystrokes to move the rules up and down, which re-orders their
+priority.  You can also delete rules and add new ones.  If you add a new rule,
+innotop prompts you for the column, an operator for the comparison, a value
+against which to compare the column, and a color to assign if the rule matches.
+There is auto-completion and prompting at each step.
+.PP
+The value in the third step needs to be correctly quoted.  innotop does not try
+to quote the value because it doesn't know whether it should treat the value as
+a string or a number.  If you want to compare the column against a string, as
+for example in the first rule above, you should enter 'Locked' surrounded by
+quotes.  If you get an error message about a bareword, you probably should have
+quoted something.
+.Sh "\s-1EXPRESSIONS\s0"
+.IX Subsection "EXPRESSIONS"
+Expressions are at the core of how innotop works, and are what enables you to
+extend innotop as you wish.  Recall the table lifecycle explained in
+\&\*(L"\s-1TABLES\s0\*(R".  Expressions are used in the earliest step, where innotop
+extracts values from a data source to form rows.
+.PP
+It does this by calling a subroutine for each column, passing it the source data
+set, a set of current values, and a set of previous values.  These are all
+needed so the subroutine can calculate things like the difference between this
+tick and the previous tick.
+.PP
+The subroutines that extract the data from the set are compiled from
+expressions.  This gives significantly more power than just naming the values to
+fill the columns, because it allows the column's value to be calculated from
+whatever data is necessary, but avoids the need to write complicated and lengthy
+Perl code.
+.PP
+innotop begins with a string of text that can look as simple as a value's name
+or as complicated as a full-fledged Perl expression.  It looks at each
+\&'bareword' token in the string and decides whether it's supposed to be a key
+into the \f(CW$set\fR hash.  A bareword is an unquoted value that isn't already
+surrounded by code-ish things like dollar signs or curly brackets.  If innotop
+decides that the bareword isn't a function or other valid Perl code, it converts
+it into a hash access.  After the whole string is processed, innotop compiles a
+subroutine, like this:
+.PP
+.Vb 5
+\& sub compute_column_value {
+\&    my ( $set, $cur, $pre ) = @_;
+\&    my $val = # EXPANDED STRING GOES HERE
+\&    return $val;
+\& }
+.Ve
+.PP
+Here's a concrete example, taken from the header table \*(L"q_header\*(R" in \*(L"Q: Query List\*(R" mode.  This expression calculates the qps, or Queries Per Second,
+column's values, from the values returned by \s-1SHOW\s0 \s-1STATUS:\s0
+.PP
+.Vb 1
+\& Questions/Uptime_hires
+.Ve
+.PP
+innotop decides both words are barewords, and transforms this expression into
+the following Perl code:
+.PP
+.Vb 1
+\& $set\->{Questions}/$set\->{Uptime_hires}
+.Ve
+.PP
+When surrounded by the rest of the subroutine's code, this is executable Perl
+that calculates a high-resolution queries-per-second value.
+.PP
+The arguments to the subroutine are named \f(CW$set\fR, \f(CW$cur\fR, and \f(CW$pre\fR.  In most cases,
+\&\f(CW$set\fR and \f(CW$cur\fR will be the same values.  However, if \*(L"status_inc\*(R" is set, \f(CW$cur\fR
+will not be the same as \f(CW$set\fR, because \f(CW$set\fR will already contain values that are
+the incremental difference between \f(CW$cur\fR and \f(CW$pre\fR.
+.PP
+Every column in innotop is computed by subroutines compiled in the same fashion.
+There is no difference between innotop's built-in columns and user-defined
+columns.  This keeps things consistent and predictable.
+.Sh "\s-1TRANSFORMATIONS\s0"
+.IX Subsection "TRANSFORMATIONS"
+Transformations change how a value is rendered.  For example, they can take a
+number of seconds and display it in H:M:S format.  The following transformations
+are defined:
+.IP "commify" 4
+.IX Item "commify"
+Adds commas to large numbers every three decimal places.
+.IP "dulint_to_int" 4
+.IX Item "dulint_to_int"
+Accepts two unsigned integers and converts them into a single longlong.  This is
+useful for certain operations with InnoDB, which uses two integers as
+transaction identifiers, for example.
+.IP "no_ctrl_char" 4
+.IX Item "no_ctrl_char"
+Removes quoted control characters from the value.  This is affected by the
+\&\*(L"charset\*(R" configuration variable.
+.Sp
+This transformation only operates within quoted strings, for example, values to
+a \s-1SET\s0 clause in an \s-1UPDATE\s0 statement.  It will not alter the \s-1UPDATE\s0 statement,
+but will collapse the quoted string to [\s-1BINARY\s0] or [\s-1TEXT\s0], depending on the
+charset.
+.IP "percent" 4
+.IX Item "percent"
+Converts a number to a percentage by multiplying it by 100, formatting it with
+\&\*(L"num_digits\*(R" digits after the decimal point, and optionally adding a percent
+sign (see \*(L"show_percent\*(R").
+.IP "secs_to_time" 4
+.IX Item "secs_to_time"
+Formats a number of seconds as time in days+hours:minutes:seconds format.
+.IP "set_precision" 4
+.IX Item "set_precision"
+Formats numbers with \*(L"num_digits\*(R" number of digits after the decimal point.
+.IP "shorten" 4
+.IX Item "shorten"
+Formats a number as a unit of 1024 (k/M/G/T) and with \*(L"num_digits\*(R" number of
+digits after the decimal point.
+.Sh "\s-1TABLE\s0 \s-1EDITOR\s0"
+.IX Subsection "TABLE EDITOR"
+The innotop table editor lets you customize tables with keystrokes.  You start
+the table editor with the '^' key.  If there's more than one table on the
+screen, it will prompt you to choose one of them.  Once you do, innotop will
+show you something like this:
+.PP
+.Vb 1
+\& Editing table definition for Buffer Pool.  Press ? for help, q to quit.
+.Ve
+.PP
+.Vb 9
+\& name               hdr          label                  src          
+\& cxn                CXN          Connection from which  cxn          
+\& buf_pool_size      Size         Buffer pool size       IB_bp_buf_poo
+\& buf_free           Free Bufs    Buffers free in the b  IB_bp_buf_fre
+\& pages_total        Pages        Pages total            IB_bp_pages_t
+\& pages_modified     Dirty Pages  Pages modified (dirty  IB_bp_pages_m
+\& buf_pool_hit_rate  Hit Rate     Buffer pool hit rate   IB_bp_buf_poo
+\& total_mem_alloc    Memory       Total memory allocate  IB_bp_total_m
+\& add_pool_alloc     Add\(aql Pool   Additonal pool alloca  IB_bp_add_poo
+.Ve
+.PP
+The first line shows which table you're editing, and reminds you again to press
+\&'?' for a list of key mappings.  The rest is a tabular representation of the
+table's columns, because that's likely what you're trying to edit.  However, you
+can edit more than just the table's columns; this screen can start the filter
+editor, color rule editor, and more.
+.PP
+Each row in the display shows a single column in the table you're editing, along
+with a couple of its properties such as its header and source expression (see
+\&\*(L"\s-1EXPRESSIONS\s0\*(R").
+.PP
+The key mappings are Vim\-style, as in many other places.  Pressing 'j' and 'k'
+moves the highlight up or down.  You can then (d)elete or (e)dit the highlighted
+column.  You can also (a)dd a column to the table.  This actually just activates
+one of the columns already defined for the table; it prompts you to choose from
+among the columns available but not currently displayed.  Finally, you can
+re-order the columns with the '+' and '\-' keys.
+.PP
+You can do more than just edit the columns with the table editor; you can also
+edit other properties, such as the table's sort expression and group-by
+expression.  Press '?' to see the full list, of course.
+.PP
+If you want to really customize and create your own column, as opposed to just
+activating a built-in one that's not currently displayed, press the (n)ew key,
+and innotop will prompt you for the information it needs:
+.IP "\(bu" 4
+The column name: this needs to be a word without any funny characters, e.g. just
+letters, numbers and underscores.
+.IP "\(bu" 4
+The column header: this is the label that appears at the top of the column, in
+the table header.  This can have spaces and funny characters, but be careful not
+to make it too wide and waste space on\-screen.
+.IP "\(bu" 4
+The column's data source: this is an expression that determines what data from
+the source (see \*(L"\s-1TABLES\s0\*(R") innotop will put into the column.  This can just be
+the name of an item in the source, or it can be a more complex expression, as
+described in \*(L"\s-1EXPRESSIONS\s0\*(R".
+.PP
+Once you've entered the required data, your table has a new column.  There is no
+difference between this column and the built-in ones; it can have all the same
+properties and behaviors.  innotop will write the column's definition to the
+configuration file, so it will persist across sessions.
+.PP
+Here's an example: suppose you want to track how many times your slaves have
+retried transactions.  According to the MySQL manual, the
+Slave_retried_transactions status variable gives you that data: \*(L"The total
+number of times since startup that the replication slave \s-1SQL\s0 thread has retried
+transactions. This variable was added in version 5.0.4.\*(R"  This is appropriate to
+add to the \*(L"slave_sql_status\*(R" table.
+.PP
+To add the column, switch to the replication-monitoring mode with the 'M' key,
+and press the '^' key to start the table editor.  When prompted, choose
+slave_sql_status as the table, then press 'n' to create the column.  Type
+\&'retries' as the column name, 'Retries' as the column header, and
+\&'Slave_retried_transactions' as the source.  Now the column is created, and you
+see the table editor screen again.  Press 'q' to exit the table editor, and
+you'll see your column at the end of the table.
+.SH "VARIABLE SETS"
+.IX Header "VARIABLE SETS"
+Variable sets are used in \*(L"S: Variables & Status\*(R" mode to define more easily
+what variables you want to monitor.  Behind the scenes they are compiled to a
+list of expressions, and then into a column list so they can be treated just
+like columns in any other table, in terms of data extraction and
+transformations.  However, you're protected from the tedious details by a syntax
+that ought to feel very natural to you: a \s-1SQL\s0 \s-1SELECT\s0 list.
+.PP
+The data source for variable sets, and indeed the entire S mode, is the
+combination of \s-1SHOW\s0 \s-1STATUS\s0, \s-1SHOW\s0 \s-1VARIABLES\s0, and \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.  Imagine
+that you had a huge table with one column per variable returned from those
+statements.  That's the data source for variable sets.  You can now query this
+data source just like you'd expect.  For example:
+.PP
+.Vb 1
+\& Questions, Uptime, Questions/Uptime as QPS
+.Ve
+.PP
+Behind the scenes innotop will split that variable set into three expressions,
+compile them and turn them into a table definition, then extract as usual.  This
+becomes a \*(L"variable set,\*(R" or a \*(L"list of variables you want to monitor.\*(R"
+.PP
+innotop lets you name and save your variable sets, and writes them to the
+configuration file.  You can choose which variable set you want to see with the
+\&'c' key, or activate the next and previous sets with the '>' and '<' keys.
+There are many built-in variable sets as well, which should give you a good
+start for creating your own.  Press 'e' to edit the current variable set, or
+just to see how it's defined.  To create a new one, just press 'c' and type its
+name.
+.PP
+You may want to use some of the functions listed in \*(L"\s-1TRANSFORMATIONS\s0\*(R" to help
+format the results.  In particular, \*(L"set_precision\*(R" is often useful to limit
+the number of digits you see.  Extending the above example, here's how:
+.PP
+.Vb 1
+\& Questions, Uptime, set_precision(Questions/Uptime) as QPS
+.Ve
+.PP
+Actually, this still needs a little more work.  If your \*(L"interval\*(R" is less
+than one second, you might be dividing by zero because Uptime is incremental in
+this mode by default.  Instead, use Uptime_hires:
+.PP
+.Vb 1
+\& Questions, Uptime, set_precision(Questions/Uptime_hires) as QPS
+.Ve
+.PP
+This example is simple, but it shows how easy it is to choose which variables
+you want to monitor.
+.SH "PLUGINS"
+.IX Header "PLUGINS"
+innotop has a simple but powerful plugin mechanism by which you can extend
+or modify its existing functionality, and add new functionality.  innotop's
+plugin functionality is event\-based: plugins register themselves to be called
+when events happen.  They then have a chance to influence the event.
+.PP
+An innotop plugin is a Perl module placed in innotop's \*(L"plugin_dir\*(R"
+directory.  On \s-1UNIX\s0 systems, you can place a symbolic link to the module instead
+of putting the actual file there.  innotop automatically discovers the file.  If
+there is a corresponding entry in the \*(L"plugins\*(R" configuration file section,
+innotop loads and activates the plugin.
+.PP
+The module must conform to innotop's plugin interface.  Additionally, the source
+code of the module must be written in such a way that innotop can inspect the
+file and determine the package name and description.
+.Sh "Package Source Convention"
+.IX Subsection "Package Source Convention"
+innotop inspects the plugin module's source to determine the Perl package name.
+It looks for a line of the form \*(L"package Foo;\*(R" and if found, considers the
+plugin's package name to be Foo.  Of course the package name can be a valid Perl
+package name, with double colons and so on.
+.PP
+It also looks for a description in the source code, to make the plugin editor
+more human\-friendly.  The description is a comment line of the form \*(L"#
+description: Foo\*(R", where \*(L"Foo\*(R" is the text innotop will consider to be the
+plugin's description.
+.Sh "Plugin Interface"
+.IX Subsection "Plugin Interface"
+The innotop plugin interface is quite simple: innotop expects the plugin to be
+an object-oriented module it can call certain methods on.  The methods are
+.IP "new(%variables)" 4
+.IX Item "new(%variables)"
+This is the plugin's constructor.  It is passed a hash of innotop's variables,
+which it can manipulate (see \*(L"Plugin Variables\*(R").  It must return a reference
+to the newly created plugin object.
+.Sp
+At construction time, innotop has only loaded the general configuration and
+created the default built-in variables with their default contents (which is
+quite a lot).  Therefore, the state of the program is exactly as in the innotop
+source code, plus the configuration variables from the \*(L"general\*(R" section in
+the config file.
+.Sp
+If your plugin manipulates the variables, it is changing global data, which is
+shared by innotop and all plugins.  Plugins are loaded in the order they're
+listed in the config file.  Your plugin may load before or after another plugin,
+so there is a potential for conflict or interaction between plugins if they
+modify data other plugins use or modify.
+.IP "\fIregister_for_events()\fR" 4
+.IX Item "register_for_events()"
+This method must return a list of events in which the plugin is interested, if
+any.  See \*(L"Plugin Events\*(R" for the defined events.  If the plugin returns an
+event that's not defined, the event is ignored.
+.IP "event handlers" 4
+.IX Item "event handlers"
+The plugin must implement a method named the same as each event for which it has
+registered.  In other words, if the plugin returns qw(foo bar) from
+\&\fIregister_for_events()\fR, it must have \fIfoo()\fR and \fIbar()\fR methods.  These methods are
+callbacks for the events.  See \*(L"Plugin Events\*(R" for more details about each
+event.
+.Sh "Plugin Variables"
+.IX Subsection "Plugin Variables"
+The plugin's constructor is passed a hash of innotop's variables, which it can
+manipulate.  It is probably a good idea if the plugin object saves a copy of it
+for later use.  The variables are defined in the innotop variable
+\&\f(CW%pluggable_vars\fR, and are as follows:
+.IP "action_for" 4
+.IX Item "action_for"
+A hashref of key mappings.  These are innotop's global hot\-keys.
+.IP "agg_funcs" 4
+.IX Item "agg_funcs"
+A hashref of functions that can be used for grouping.  See \*(L"\s-1GROUPING\s0\*(R".
+.IP "config" 4
+.IX Item "config"
+The global configuration hash.
+.IP "connections" 4
+.IX Item "connections"
+A hashref of connection specifications.  These are just specifications of how to
+connect to a server.
+.IP "dbhs" 4
+.IX Item "dbhs"
+A hashref of innotop's database connections.  These are actual \s-1DBI\s0 connection
+objects.
+.IP "filters" 4
+.IX Item "filters"
+A hashref of filters applied to table rows.  See \*(L"\s-1FILTERS\s0\*(R" for more.
+.IP "modes" 4
+.IX Item "modes"
+A hashref of modes.  See \*(L"\s-1MODES\s0\*(R" for more.
+.IP "server_groups" 4
+.IX Item "server_groups"
+A hashref of server groups.  See \*(L"\s-1SERVER\s0 \s-1GROUPS\s0\*(R".
+.IP "tbl_meta" 4
+.IX Item "tbl_meta"
+A hashref of innotop's table meta\-data, with one entry per table (see
+\&\*(L"\s-1TABLES\s0\*(R" for more information).
+.IP "trans_funcs" 4
+.IX Item "trans_funcs"
+A hashref of transformation functions.  See \*(L"\s-1TRANSFORMATIONS\s0\*(R".
+.IP "var_sets" 4
+.IX Item "var_sets"
+A hashref of variable sets.  See \*(L"\s-1VARIABLE\s0 \s-1SETS\s0\*(R".
+.Sh "Plugin Events"
+.IX Subsection "Plugin Events"
+Each event is defined somewhere in the innotop source code.  When innotop runs
+that code, it executes the callback function for each plugin that expressed its
+interest in the event.  innotop passes some data for each event.  The events are
+defined in the \f(CW%event_listener_for\fR variable, and are as follows:
+.ie n .IP "extract_values($set, $cur\fR, \f(CW$pre\fR, \f(CW$tbl)" 4
+.el .IP "extract_values($set, \f(CW$cur\fR, \f(CW$pre\fR, \f(CW$tbl\fR)" 4
+.IX Item "extract_values($set, $cur, $pre, $tbl)"
+This event occurs inside the function that extracts values from a data source.
+The arguments are the set of values, the current values, the previous values,
+and the table name.
+.IP "set_to_tbl" 4
+.IX Item "set_to_tbl"
+Events are defined at many places in this subroutine, which is responsible for
+turning an arrayref of hashrefs into an arrayref of lines that can be printed to
+the screen.  The events all pass the same data: an arrayref of rows and the name
+of the table being created.  The events are set_to_tbl_pre_filter,
+set_to_tbl_pre_sort, set_to_tbl_pre_group, set_to_tbl_pre_colorize,
+set_to_tbl_pre_transform, set_to_tbl_pre_pivot, set_to_tbl_pre_create,
+set_to_tbl_post_create.
+.IP "draw_screen($lines)" 4
+.IX Item "draw_screen($lines)"
+This event occurs inside the subroutine that prints the lines to the screen.
+\&\f(CW$lines\fR is an arrayref of strings.
+.Sh "Simple Plugin Example"
+.IX Subsection "Simple Plugin Example"
+The easiest way to explain the plugin functionality is probably with a simple
+example.  The following module adds a column to the beginning of every table and
+sets its value to 1.
+.PP
+.Vb 2
+\& use strict;
+\& use warnings FATAL => \(aqall\(aq;
+.Ve
+.PP
+.Vb 2
+\& package Innotop::Plugin::Example;
+\& # description: Adds an \(aqexample\(aq column to every table
+.Ve
+.PP
+.Vb 4
+\& sub new {
+\&    my ( $class, %vars ) = @_;
+\&    # Store reference to innotop\(aqs variables in $self
+\&    my $self = bless { %vars }, $class;
+.Ve
+.PP
+.Vb 11
+\&    # Design the example column
+\&    my $col = {
+\&       hdr   => \(aqExample\(aq,
+\&       just  => \(aq\(aq,
+\&       dec   => 0,
+\&       num   => 1,
+\&       label => \(aqExample\(aq,
+\&       src   => \(aqexample\(aq, # Get data from this column in the data source
+\&       tbl   => \(aq\(aq,
+\&       trans => [],
+\&    };
+.Ve
+.PP
+.Vb 8
+\&    # Add the column to every table.
+\&    my $tbl_meta = $vars{tbl_meta};
+\&    foreach my $tbl ( values %$tbl_meta ) {
+\&       # Add the column to the list of defined columns
+\&       $tbl\->{cols}\->{example} = $col;
+\&       # Add the column to the list of visible columns
+\&       unshift @{$tbl\->{visible}}, \(aqexample\(aq;
+\&    }
+.Ve
+.PP
+.Vb 3
+\&    # Be sure to return a reference to the object.
+\&    return $self;
+\& }
+.Ve
+.PP
+.Vb 5
+\& # I\(aqd like to be called when a data set is being rendered into a table, please.
+\& sub register_for_events {
+\&    my ( $self ) = @_;
+\&    return qw(set_to_tbl_pre_filter);
+\& }
+.Ve
+.PP
+.Vb 8
+\& # This method will be called when the event fires.
+\& sub set_to_tbl_pre_filter {
+\&    my ( $self, $rows, $tbl ) = @_;
+\&    # Set the example column\(aqs data source to the value 1.
+\&    foreach my $row ( @$rows ) {
+\&       $row\->{example} = 1;
+\&    }
+\& }
+.Ve
+.PP
+.Vb 1
+\& 1;
+.Ve
+.Sh "Plugin Editor"
+.IX Subsection "Plugin Editor"
+The plugin editor lets you view the plugins innotop discovered and activate or
+deactivate them.  Start the editor by pressing $ to start the configuration
+editor from any mode.  Press the 'p' key to start the plugin editor.  You'll see
+a list of plugins innotop discovered.  You can use the 'j' and 'k' keys to move
+the highlight to the desired one, then press the * key to toggle it active or
+inactive.  Exit the editor and restart innotop for the changes to take effect.
+.SH "SQL STATEMENTS"
+.IX Header "SQL STATEMENTS"
+innotop uses a limited set of \s-1SQL\s0 statements to retrieve data from MySQL for
+display.  The statements are customized depending on the server version against
+which they are executed; for example, on MySQL 5 and newer, \s-1INNODB_STATUS\s0
+executes \*(L"\s-1SHOW\s0 \s-1ENGINE\s0 \s-1INNODB\s0 \s-1STATUS\s0\*(R", while on earlier versions it executes
+\&\*(L"\s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0\*(R".  The statements are as follows:
+.PP
+.Vb 12
+\& Statement           SQL executed
+\& =================== ===============================
+\& INNODB_STATUS       SHOW [ENGINE] INNODB STATUS
+\& KILL_CONNECTION     KILL
+\& KILL_QUERY          KILL QUERY
+\& OPEN_TABLES         SHOW OPEN TABLES
+\& PROCESSLIST         SHOW FULL PROCESSLIST
+\& SHOW_MASTER_LOGS    SHOW MASTER LOGS
+\& SHOW_MASTER_STATUS  SHOW MASTER STATUS
+\& SHOW_SLAVE_STATUS   SHOW SLAVE STATUS
+\& SHOW_STATUS         SHOW [GLOBAL] STATUS
+\& SHOW_VARIABLES      SHOW [GLOBAL] VARIABLES
+.Ve
+.SH "DATA SOURCES"
+.IX Header "DATA SOURCES"
+Each time innotop extracts values to create a table (see \*(L"\s-1EXPRESSIONS\s0\*(R" and
+\&\*(L"\s-1TABLES\s0\*(R"), it does so from a particular data source.  Largely because of the
+complex data extracted from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0, this is slightly messy.  \s-1SHOW\s0
+\&\s-1INNODB\s0 \s-1STATUS\s0 contains a mixture of single values and repeated values that form
+nested data sets.
+.PP
+Whenever innotop fetches data from MySQL, it adds two extra bits to each set:
+cxn and Uptime_hires.  cxn is the name of the connection from which the data
+came.  Uptime_hires is a high-resolution version of the server's Uptime status
+variable, which is important if your \*(L"interval\*(R" setting is sub\-second.
+.PP
+Here are the kinds of data sources from which data is extracted:
+.IP "\s-1STATUS_VARIABLES\s0" 4
+.IX Item "STATUS_VARIABLES"
+This is the broadest category, into which the most kinds of data fall.  It
+begins with the combination of \s-1SHOW\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1VARIABLES\s0, but other sources
+may be included as needed, for example, \s-1SHOW\s0 \s-1MASTER\s0 \s-1STATUS\s0 and \s-1SHOW\s0 \s-1SLAVE\s0
+\&\s-1STATUS\s0, as well as many of the non-repeated values from \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "\s-1DEADLOCK_LOCKS\s0" 4
+.IX Item "DEADLOCK_LOCKS"
+This data is extracted from the transaction list in the \s-1LATEST\s0 \s-1DETECTED\s0 \s-1DEADLOCK\s0
+section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.  It is nested two levels deep: transactions, then
+locks.
+.IP "\s-1DEADLOCK_TRANSACTIONS\s0" 4
+.IX Item "DEADLOCK_TRANSACTIONS"
+This data is from the transaction list in the \s-1LATEST\s0 \s-1DETECTED\s0 \s-1DEADLOCK\s0
+section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.  It is nested one level deep.
+.IP "\s-1EXPLAIN\s0" 4
+.IX Item "EXPLAIN"
+This data is from the result set returned by \s-1EXPLAIN\s0.
+.IP "\s-1INNODB_TRANSACTIONS\s0" 4
+.IX Item "INNODB_TRANSACTIONS"
+This data is from the \s-1TRANSACTIONS\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0.
+.IP "\s-1IO_THREADS\s0" 4
+.IX Item "IO_THREADS"
+This data is from the list of threads in the \s-1FILE\s0 I/O section of \s-1SHOW\s0 \s-1INNODB\s0
+\&\s-1STATUS\s0.
+.IP "\s-1INNODB_LOCKS\s0" 4
+.IX Item "INNODB_LOCKS"
+This data is from the \s-1TRANSACTIONS\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 and is nested
+two levels deep.
+.IP "\s-1OPEN_TABLES\s0" 4
+.IX Item "OPEN_TABLES"
+This data is from \s-1SHOW\s0 \s-1OPEN\s0 \s-1TABLES\s0.
+.IP "\s-1PROCESSLIST\s0" 4
+.IX Item "PROCESSLIST"
+This data is from \s-1SHOW\s0 \s-1FULL\s0 \s-1PROCESSLIST\s0.
+.IP "\s-1OS_WAIT_ARRAY\s0" 4
+.IX Item "OS_WAIT_ARRAY"
+This data is from the \s-1SEMAPHORES\s0 section of \s-1SHOW\s0 \s-1INNODB\s0 \s-1STATUS\s0 and is nested one
+level deep.  It comes from the lines that look like this:
+.Sp
+.Vb 1
+\& \-\-Thread 1568861104 has waited at btr0cur.c line 424 ....
+.Ve
+.SH "MYSQL PRIVILEGES"
+.IX Header "MYSQL PRIVILEGES"
+.IP "\(bu" 4
+You must connect to MySQL as a user who has the \s-1SUPER\s0 privilege for many of the
+functions.
+.IP "\(bu" 4
+If you don't have the \s-1SUPER\s0 privilege, you can still run some functions, but you
+won't necessarily see all the same data.
+.IP "\(bu" 4
+You need the \s-1PROCESS\s0 privilege to see the list of currently running queries in Q
+mode.
+.IP "\(bu" 4
+You need special privileges to start and stop slave servers.
+.IP "\(bu" 4
+You need appropriate privileges to create and drop the deadlock tables if needed
+(see \*(L"\s-1SERVER\s0 \s-1CONNECTIONS\s0\*(R").
+.SH "SYSTEM REQUIREMENTS"
+.IX Header "SYSTEM REQUIREMENTS"
+You need Perl to run innotop, of course.  You also need a few Perl modules: \s-1DBI\s0,
+DBD::mysql,  Term::ReadKey, and Time::HiRes.  These should be included with most
+Perl distributions, but in case they are not, I recommend using versions
+distributed with your operating system or Perl distribution, not from \s-1CPAN\s0.
+Term::ReadKey in particular has been known to cause problems if installed from
+\&\s-1CPAN\s0.
+.PP
+If you have Term::ANSIColor, innotop will use it to format headers more readably
+and compactly.  (Under Microsoft Windows, you also need Win32::Console::ANSI for
+terminal formatting codes to be honored).  If you install Term::ReadLine,
+preferably Term::ReadLine::Gnu, you'll get nice auto-completion support.
+.PP
+I run innotop on Gentoo GNU/Linux, Debian and Ubuntu, and I've had feedback from
+people successfully running it on Red Hat, CentOS, Solaris, and Mac \s-1OSX\s0.  I
+don't see any reason why it won't work on other UNIX-ish operating systems, but
+I don't know for sure.  It also runs on Windows under ActivePerl without
+problem.
+.PP
+I use innotop on MySQL versions 3.23.58, 4.0.27, 4.1.0, 4.1.22, 5.0.26, 5.1.15,
+and 5.2.3.  If it doesn't run correctly for you, that is a bug and I hope you
+report it.
+.SH "FILES"
+.IX Header "FILES"
+$HOMEDIR/.innotop is used to store configuration information.  Files include the
+configuration file innotop.ini, the core_dump file which contains verbose error
+messages if \*(L"debug\*(R" is enabled, and the plugins/ subdirectory.
+.SH "GLOSSARY OF TERMS"
+.IX Header "GLOSSARY OF TERMS"
+.IP "tick" 4
+.IX Item "tick"
+A tick is a refresh event, when innotop re-fetches data from connections and
+displays it.
+.SH "ACKNOWLEDGEMENTS"
+.IX Header "ACKNOWLEDGEMENTS"
+I'm grateful to the following people for various reasons, and hope I haven't
+forgotten to include anyone:
+.PP
+Allen K. Smith,
+Aurimas Mikalauskas,
+Bartosz Fenski,
+Brian Miezejewski,
+Christian Hammers, 
+Cyril Scetbon,
+Dane Miller,
+David Multer,
+Dr. Frank Ullrich,
+Giuseppe Maxia,
+Google.com Site Reliability Engineers,
+Jan Pieter Kunst,
+Jari Aalto,
+Jay Pipes,
+Jeremy Zawodny,
+Johan Idren,
+Kristian Kohntopp,
+Lenz Grimmer,
+Maciej Dobrzanski,
+Michiel Betel,
+MySQL \s-1AB\s0,
+Paul McCullagh,
+Sebastien Estienne,
+Sourceforge.net,
+Steven Kreuzer,
+The Gentoo MySQL Team,
+Trevor Price,
+Yaar Schnitman,
+and probably more people I've neglected to include.
+.PP
+(If I misspelled your name, it's probably because I'm afraid of putting
+international characters into this documentation; earlier versions of Perl might
+not be able to compile it then).
+.SH "COPYRIGHT, LICENSE AND WARRANTY"
+.IX Header "COPYRIGHT, LICENSE AND WARRANTY"
+This program is copyright (c) 2006 Baron Schwartz.
+Feedback and improvements are welcome.
+.PP
+\&\s-1THIS\s0 \s-1PROGRAM\s0 \s-1IS\s0 \s-1PROVIDED\s0 \*(L"\s-1AS\s0 \s-1IS\s0\*(R" \s-1AND\s0 \s-1WITHOUT\s0 \s-1ANY\s0 \s-1EXPRESS\s0 \s-1OR\s0 \s-1IMPLIED\s0
+\&\s-1WARRANTIES\s0, \s-1INCLUDING\s0, \s-1WITHOUT\s0 \s-1LIMITATION\s0, \s-1THE\s0 \s-1IMPLIED\s0 \s-1WARRANTIES\s0 \s-1OF\s0
+\&\s-1MERCHANTABILITY\s0 \s-1AND\s0 \s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0.
+.PP
+This program is free software; you can redistribute it and/or modify it under
+the terms of the \s-1GNU\s0 General Public License as published by the Free Software
+Foundation, version 2; \s-1OR\s0 the Perl Artistic License.  On \s-1UNIX\s0 and similar
+systems, you can issue `man perlgpl' or `man perlartistic' to read these
+licenses.
+.PP
+You should have received a copy of the \s-1GNU\s0 General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, \s-1MA\s0  02111\-1307  \s-1USA\s0.
+.PP
+Execute innotop and press '!' to see this information at any time.
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+Baron Schwartz.
+.SH "BUGS"
+.IX Header "BUGS"
+You can report bugs, ask for improvements, and get other help and support at
+<http://sourceforge.net/projects/innotop>.  There are mailing lists, forums,
+a bug tracker, etc.  Please use these instead of contacting me directly, as it
+makes my job easier and benefits others if the discussions are permanent and
+public.  Of course, if you need to contact me in private, please do.
diff --git a/storage/xtradb/build/debian/additions/msql2mysql.1 b/storage/xtradb/build/debian/additions/msql2mysql.1
new file mode 100644
index 00000000000..8fe05e7415d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/msql2mysql.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+msql2mysql \- MySQL importer for msql style data.
+.SH SYNOPSIS
+msql2mysql [options]
+.SH DESCRIPTION
+This program imports old msql database files.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/my.cnf b/storage/xtradb/build/debian/additions/my.cnf
new file mode 100644
index 00000000000..997523b9c2f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/my.cnf
@@ -0,0 +1,129 @@
+#
+# The MySQL database server configuration file.
+#
+# You can copy this to one of:
+# - "/etc/mysql/my.cnf" to set global options,
+# - "~/.my.cnf" to set user-specific options.
+# 
+# One can use all long options that the program supports.
+# Run program with --help to get a list of available options and with
+# --print-defaults to see which it would actually understand and use.
+#
+# For explanations see
+# http://dev.mysql.com/doc/mysql/en/server-system-variables.html
+
+# This will be passed to all mysql clients
+# It has been reported that passwords should be enclosed with ticks/quotes
+# especially if they contain "#" chars...
+# Remember to edit /etc/mysql/debian.cnf when changing the socket location.
+[client]
+port		= 3306
+socket		= /var/run/mysqld/mysqld.sock
+
+# Here are entries for some specific programs
+# The following values assume you have at least 32M ram
+
+# This was formerly known as [safe_mysqld]. Both versions are currently parsed.
+[mysqld_safe]
+socket		= /var/run/mysqld/mysqld.sock
+nice		= 0
+
+[mysqld]
+#
+# * Basic Settings
+#
+user		= mysql
+pid-file	= /var/run/mysqld/mysqld.pid
+socket		= /var/run/mysqld/mysqld.sock
+port		= 3306
+basedir		= /usr
+datadir		= /var/lib/mysql
+tmpdir		= /tmp
+language	= /usr/share/mysql/english
+skip-external-locking
+#
+# For compatibility to other Debian packages that still use
+# libmysqlclient10 and libmysqlclient12.
+old_passwords	= 1
+#
+# Instead of skip-networking the default is now to listen only on
+# localhost which is more compatible and is not less secure.
+bind-address		= 127.0.0.1
+#
+# * Fine Tuning
+#
+key_buffer		= 16M
+max_allowed_packet	= 16M
+thread_stack		= 128K
+thread_cache_size       = 8
+# This replaces the startup script and checks MyISAM tables if needed
+# the first time they are touched
+myisam-recover         = BACKUP
+#max_connections        = 100
+#table_cache            = 64
+#thread_concurrency     = 10
+#
+# * Query Cache Configuration
+#
+query_cache_limit	= 1M
+query_cache_size        = 16M
+#
+# * Logging and Replication
+#
+# Both locations get rotated by the cronjob.
+# Be aware that this log type is a performance killer.
+# As of 5.1 you can enable the log at runtime!
+#log_type           = FILE
+#general_log		= /var/log/mysql/mysql.log
+#
+# Error logging goes to syslog due to /etc/mysql/conf.d/mysqld_safe_syslog.cnf.
+#
+# Here you can see queries with especially long duration
+#log_slow_queries	= /var/log/mysql/mysql-slow.log
+#long_query_time = 2
+#log-queries-not-using-indexes
+#
+# The following can be used as easy-to-replay backup logs or for replication.
+# note: if you are setting up a replication slave, see README.Debian about
+#       other settings you may need to change.
+#server-id		= 1
+#log_bin			= /var/log/mysql/mysql-bin.log
+expire_logs_days	= 10
+max_binlog_size         = 100M
+#binlog_do_db		= include_database_name
+#binlog_ignore_db	= include_database_name
+#
+# * InnoDB
+#
+# InnoDB is enabled by default with a 10MB datafile in /var/lib/mysql/.
+# Read the manual for more InnoDB related options. There are many!
+#
+# * Security Features
+#
+# Read the manual, too, if you want chroot!
+# chroot = /var/lib/mysql/
+#
+# For generating SSL certificates I recommend the OpenSSL GUI "tinyca".
+#
+# ssl-ca=/etc/mysql/cacert.pem
+# ssl-cert=/etc/mysql/server-cert.pem
+# ssl-key=/etc/mysql/server-key.pem
+
+
+
+[mysqldump]
+quick
+quote-names
+max_allowed_packet	= 16M
+
+[mysql]
+#no-auto-rehash	# faster start of mysql but no tab completion
+
+[isamchk]
+key_buffer		= 16M
+
+#
+# * IMPORTANT: Additional settings that can override those from this file!
+#   The files must end with '.cnf', otherwise they'll be ignored.
+#
+!includedir /etc/mysql/conf.d/
diff --git a/storage/xtradb/build/debian/additions/my_print_defaults.1 b/storage/xtradb/build/debian/additions/my_print_defaults.1
new file mode 100644
index 00000000000..ebef4157016
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/my_print_defaults.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+my_print_defaults \- MySQL helper script that prints defaults.
+.SH SYNOPSIS
+my_print_defaults [options]
+.SH DESCRIPTION
+Prints all arguments that are given to some program using the default files.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisam_ftdump.1 b/storage/xtradb/build/debian/additions/myisam_ftdump.1
new file mode 100644
index 00000000000..e2de358efcc
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisam_ftdump.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisam_ftdump \- Dumps full text tables.
+.SH SYNOPSIS
+myisam_ftdump [options]
+.SH DESCRIPTION
+Dumps information and contents of full text tables.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisamchk.1 b/storage/xtradb/build/debian/additions/myisamchk.1
new file mode 100644
index 00000000000..fe7f34961e0
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisamchk.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisamchk \- Checks MySQL myisam type databases. 
+.SH SYNOPSIS
+myisamchk [options]
+.SH DESCRIPTION
+Description, check and repair of ISAM tables.
+Used without options, all tables on the command line will be checked for errors.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisamlog.1 b/storage/xtradb/build/debian/additions/myisamlog.1
new file mode 100644
index 00000000000..959d547df94
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisamlog.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisamlog \- MySQL helper script.
+.SH SYNOPSIS
+myisamlog [options]
+.SH DESCRIPTION
+Function unknown. Mail to ch@debian.org.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/myisampack.1 b/storage/xtradb/build/debian/additions/myisampack.1
new file mode 100644
index 00000000000..93168304a17
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/myisampack.1
@@ -0,0 +1,19 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisampack \- Compresses MySQL database files.
+.SH SYNOPSIS
+myisampack [options]
+.SH DESCRIPTION
+Pack a MyISAM-table to take much less space.
+Keys are not updated; you must run myisamchk -rq on the datafile
+afterwards to update the keys.
+You should give the .MYI file as the filename argument.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides b/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides
new file mode 100644
index 00000000000..ae589c2472e
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql-server.lintian-overrides
@@ -0,0 +1,2 @@
+W: mysql-dfsg source: maintainer-script-lacks-debhelper-token debian/percona-xtradb-server.postinst
+W: percona-xtradb-server: possible-bashism-in-maintainer-script postinst:68 'p{("a".."z","A".."Z",0..9)[int(rand(62))]}'
diff --git a/storage/xtradb/build/debian/additions/mysql_config.1 b/storage/xtradb/build/debian/additions/mysql_config.1
new file mode 100644
index 00000000000..88095e22b9e
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_config.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_config \- MySQL compile settings.
+.SH SYNOPSIS
+mysql_config [options]
+.SH DESCRIPTION
+This program is only useful for people who want to compile against
+libmysqlclient.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_convert_table_format.1 b/storage/xtradb/build/debian/additions/mysql_convert_table_format.1
new file mode 100644
index 00000000000..3c23581df43
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_convert_table_format.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_convert_table_format \- MySQL table converter.
+.SH SYNOPSIS
+mysql_convert_table_format [options]
+.SH DESCRIPTION
+Converts MySQL tables to other table types.
+If no tables have been specified, all tables in the database will be converted.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_find_rows.1 b/storage/xtradb/build/debian/additions/mysql_find_rows.1
new file mode 100644
index 00000000000..35a70b1f960
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_find_rows.1
@@ -0,0 +1,18 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_find_rows \- MySQL shell script for searching in update logs.
+.SH SYNOPSIS
+mysql_find_rows [options]
+.SH DESCRIPTION
+Prints all SQL queries that match a regexp or contain a 'use
+database' or 'set ..' command to stdout.  A SQL query may contain
+newlines.  This is useful to find things in a MySQL update log.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_fix_extensions.1 b/storage/xtradb/build/debian/additions/mysql_fix_extensions.1
new file mode 100644
index 00000000000..3f0a028ca3f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_fix_extensions.1
@@ -0,0 +1,18 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_fix_extensions \- Corrects MySQL database file names.
+.SH SYNOPSIS
+mysql_fix_extensions <datadir>
+.SH DESCRIPTION
+Makes .frm extensions lowercase and .MYI/MYD/ISM/ISD extensions uppercase.
+Useful when data files are copied from Windows.
+Does not work with RAID, with InnoDB or BDB tables.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_install_db.1 b/storage/xtradb/build/debian/additions/mysql_install_db.1
new file mode 100644
index 00000000000..11f1f2967a2
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_install_db.1
@@ -0,0 +1,16 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_install_db \- MySQL helper program.
+.SH SYNOPSIS
+mysql_install_db [options]
+.SH DESCRIPTION
+This program is normally not needed by any user.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_secure_installation.1 b/storage/xtradb/build/debian/additions/mysql_secure_installation.1
new file mode 100644
index 00000000000..d65b7f5d09d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_secure_installation.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_secure_installation \- Secures the MySQL access control lists.
+.SH SYNOPSIS
+mysql_secure_installation [options]
+.SH DESCRIPTION
+This interactive program suggests changes like removing anonymous users that
+are supposed to make your installation more secure.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_setpermission.1 b/storage/xtradb/build/debian/additions/mysql_setpermission.1
new file mode 100644
index 00000000000..77167e0d58f
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_setpermission.1
@@ -0,0 +1,23 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_setpermission \- Adds MySQL users or changes passwords.
+.SH SYNOPSIS
+mysql_setpermission [options]
+.SH DESCRIPTION
+The permission setter is a little program which can help you add users
+or databases or change passwords in MySQL. Keep in mind that we don't
+check permissions which have already been set in MySQL. So if you can't
+connect to MySQL using the permission you just added, take a look at
+the permissions which have already been set in MySQL.
+
+The permission setter first reads your .my.cnf file in your Home
+directory if it exists.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysql_tableinfo.1 b/storage/xtradb/build/debian/additions/mysql_tableinfo.1
new file mode 100644
index 00000000000..1de4f5d5943
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_tableinfo.1
@@ -0,0 +1,322 @@
+.\" Automatically generated by Pod::Man v1.34, Pod::Parser v1.13
+.\"
+.\" Standard preamble:
+.\" ========================================================================
+.de Sh \" Subsection heading
+.br
+.if t .Sp
+.ne 5
+.PP
+\fB\\$1\fR
+.PP
+..
+.de Sp \" Vertical space (when we can't use .PP)
+.if t .sp .5v
+.if n .sp
+..
+.de Vb \" Begin verbatim text
+.ft CW
+.nf
+.ne \\$1
+..
+.de Ve \" End verbatim text
+.ft R
+.fi
+..
+.\" Set up some character translations and predefined strings.  \*(-- will
+.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
+.\" double quote, and \*(R" will give a right double quote.  | will give a
+.\" real vertical bar.  \*(C+ will give a nicer C++.  Capital omega is used to
+.\" do unbreakable dashes and therefore won't be available.  \*(C` and \*(C'
+.\" expand to `' in nroff, nothing in troff, for use with C<>.
+.tr \(*W-|\(bv\*(Tr
+.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
+.ie n \{\
+.    ds -- \(*W-
+.    ds PI pi
+.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
+.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
+.    ds L" ""
+.    ds R" ""
+.    ds C` ""
+.    ds C' ""
+'br\}
+.el\{\
+.    ds -- \|\(em\|
+.    ds PI \(*p
+.    ds L" ``
+.    ds R" ''
+'br\}
+.\"
+.\" If the F register is turned on, we'll generate index entries on stderr for
+.\" titles (.TH), headers (.SH), subsections (.Sh), items (.Ip), and index
+.\" entries marked with X<> in POD.  Of course, you'll have to process the
+.\" output yourself in some meaningful fashion.
+.if \nF \{\
+.    de IX
+.    tm Index:\\$1\t\\n%\t"\\$2"
+..
+.    nr % 0
+.    rr F
+.\}
+.\"
+.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
+.\" way too many mistakes in technical documents.
+.hy 0
+.if n .na
+.\"
+.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
+.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
+.    \" fudge factors for nroff and troff
+.if n \{\
+.    ds #H 0
+.    ds #V .8m
+.    ds #F .3m
+.    ds #[ \f1
+.    ds #] \fP
+.\}
+.if t \{\
+.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
+.    ds #V .6m
+.    ds #F 0
+.    ds #[ \&
+.    ds #] \&
+.\}
+.    \" simple accents for nroff and troff
+.if n \{\
+.    ds ' \&
+.    ds ` \&
+.    ds ^ \&
+.    ds , \&
+.    ds ~ ~
+.    ds /
+.\}
+.if t \{\
+.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
+.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
+.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
+.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
+.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
+.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
+.\}
+.    \" troff and (daisy-wheel) nroff accents
+.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
+.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
+.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
+.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
+.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
+.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
+.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
+.ds ae a\h'-(\w'a'u*4/10)'e
+.ds Ae A\h'-(\w'A'u*4/10)'E
+.    \" corrections for vroff
+.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
+.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
+.    \" for low resolution devices (crt and lpr)
+.if \n(.H>23 .if \n(.V>19 \
+\{\
+.    ds : e
+.    ds 8 ss
+.    ds o a
+.    ds d- d\h'-1'\(ga
+.    ds D- D\h'-1'\(hy
+.    ds th \o'bp'
+.    ds Th \o'LP'
+.    ds ae ae
+.    ds Ae AE
+.\}
+.rm #[ #] #H #V #F C
+.\" ========================================================================
+.\"
+.IX Title "MYSQL_TABLEINFO 1"
+.TH MYSQL_TABLEINFO 1 "2003-04-05" "perl v5.8.0" "User Contributed Perl Documentation"
+.SH "NAME"
+mysql_tableinfo \- creates and populates information tables with 
+the output of SHOW DATABASES, SHOW TABLES (or SHOW TABLE STATUS), 
+SHOW COLUMNS and SHOW INDEX.
+.PP
+This is version 1.1.
+.SH "SYNOPSIS"
+.IX Header "SYNOPSIS"
+.Vb 1
+\&  mysql_tableinfo [OPTIONS] database_to_write [database_like_wild] [table_like_wild]
+.Ve
+.PP
+.Vb 2
+\&  Do not backquote (``) database_to_write, 
+\&  and do not quote ('') database_like_wild or table_like_wild
+.Ve
+.PP
+.Vb 1
+\&  Examples:
+.Ve
+.PP
+.Vb 1
+\&  mysql_tableinfo info
+.Ve
+.PP
+.Vb 1
+\&  mysql_tableinfo info this_db
+.Ve
+.PP
+.Vb 1
+\&  mysql_tableinfo info %a% b%
+.Ve
+.PP
+.Vb 1
+\&  mysql_tableinfo info --clear-only
+.Ve
+.PP
+.Vb 1
+\&  mysql_tableinfo info --col --idx --tbl-status
+.Ve
+.SH "DESCRIPTION"
+.IX Header "DESCRIPTION"
+mysql_tableinfo asks a MySQL server for information about its
+databases, tables, table columns and index, and stores this
+in tables called `db`, `tbl` (or `tbl_status`), `col`, `idx` 
+(with an optional prefix specified with \-\-prefix).
+After that, you can query these information tables, for example
+to build your admin scripts with \s-1SQL\s0 queries, like
+.PP
+\&\s-1SELECT\s0 \s-1CONCAT\s0(\*(L"\s-1CHECK\s0 \s-1TABLE\s0 \*(R",`database`,\*(L".\*(R",`table`,\*(L" \s-1EXTENDED\s0;\*(R") 
+\&\s-1FROM\s0 info.tbl \s-1WHERE\s0 ... ;
+.PP
+as people usually do with some other \s-1RDBMS\s0
+(note: to increase the speed of your queries on the info tables,
+you may add some index on them).
+.PP
+The database_like_wild and table_like_wild arguments instruct the program
+to gather information only about databases and tables
+whose names match these patterns. If the info
+tables already exist, their rows matching the patterns are simply
+deleted and replaced by the new ones. That is,
+old rows not matching the patterns are not touched.
+If the database_like_wild and table_like_wild arguments
+are not specified on the command-line they default to \*(L"%\*(R".
+.PP
+The program :
+.PP
+\&\- does \s-1CREATE\s0 \s-1DATABASE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write
+where database_to_write is the database name specified on the command\-line.
+.PP
+\&\- does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`db`
+.PP
+\&\- fills database_to_write.`db` with the output of
+\&\s-1SHOW\s0 \s-1DATABASES\s0 \s-1LIKE\s0 database_like_wild
+.PP
+\&\- does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`tbl`
+(respectively database_to_write.`tbl_status`
+if the \-\-tbl\-status option is on)
+.PP
+\&\- for every found database,
+fills database_to_write.`tbl` (respectively database_to_write.`tbl_status`)
+with the output of 
+\&\s-1SHOW\s0 \s-1TABLES\s0 \s-1FROM\s0 found_db \s-1LIKE\s0 table_like_wild
+(respectively \s-1SHOW\s0 \s-1TABLE\s0 \s-1STATUS\s0 \s-1FROM\s0 found_db \s-1LIKE\s0 table_like_wild)
+.PP
+\&\- if the \-\-col option is on,
+    * does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`col`
+    * for every found table,
+      fills database_to_write.`col` with the output of 
+      \s-1SHOW\s0 \s-1COLUMNS\s0 \s-1FROM\s0 found_tbl \s-1FROM\s0 found_db
+.PP
+\&\- if the \-\-idx option is on,
+    * does \s-1CREATE\s0 \s-1TABLE\s0 \s-1IF\s0 \s-1NOT\s0 \s-1EXISTS\s0 database_to_write.`idx`
+    * for every found table,
+      fills database_to_write.`idx` with the output of 
+      \s-1SHOW\s0 \s-1INDEX\s0 \s-1FROM\s0 found_tbl \s-1FROM\s0 found_db
+.PP
+Some options may modify this general scheme (see below).
+.PP
+As mentioned, the contents of the info tables are the output of
+\&\s-1SHOW\s0 commands. In fact the contents are slightly more complete :
+.PP
+\&\- the `tbl` (or `tbl_status`) info table 
+  has an extra column which contains the database name,
+.PP
+\&\- the `col` info table
+  has an extra column which contains the table name,
+  and an extra column which contains, for each described column,
+  the number of this column in the table owning it (this extra column
+  is called `Seq_in_table`). `Seq_in_table` makes it possible for you
+  to retrieve your columns in sorted order, when you are querying
+  the `col` table. 
+.PP
+\&\- the `idx` info table
+  has an extra column which contains the database name.
+.PP
+Caution: info tables contain certain columns (e.g.
+Database, Table, Null...) whose names, as they are MySQL reserved words,
+need to be backquoted (`...`) when used in \s-1SQL\s0 statements.
+.PP
+Caution: as information fetching and info tables filling happen at the
+same time, info tables may contain inaccurate information about
+themselves.
+.SH "OPTIONS"
+.IX Header "OPTIONS"
+.IP "\-\-clear" 4
+.IX Item "--clear"
+Does \s-1DROP\s0 \s-1TABLE\s0 on the info tables (only those that the program is
+going to fill, for example if you do not use \-\-col it won't drop
+the `col` table) and processes normally. Does not drop database_to_write.
+.IP "\-\-clear\-only" 4
+.IX Item "--clear-only"
+Same as \-\-clear but exits after the DROPs.
+.IP "\-\-col" 4
+.IX Item "--col"
+Adds columns information (into table `col`).
+.IP "\-\-idx" 4
+.IX Item "--idx"
+Adds index information (into table `idx`).
+.IP "\-\-prefix prefix" 4
+.IX Item "--prefix prefix"
+The info tables are named from the concatenation of prefix and,
+respectively, db, tbl (or tbl_status), col, idx. Do not quote ('')
+or backquote (``) prefix.
+.IP "\-q, \-\-quiet" 4
+.IX Item "-q, --quiet"
+Does not warn you about what the script is going to do (\s-1DROP\s0 \s-1TABLE\s0 etc)
+and does not ask for a confirmation before starting.
+.IP "\-\-tbl\-status" 4
+.IX Item "--tbl-status"
+Instead of using \s-1SHOW\s0 \s-1TABLES\s0, uses \s-1SHOW\s0 \s-1TABLE\s0 \s-1STATUS\s0
+(much more complete information, but slower). 
+.IP "\-\-help" 4
+.IX Item "--help"
+Display helpscreen and exit
+.IP "\-u, \-\-user=#" 4
+.IX Item "-u, --user=#"
+user for database login if not current user. Give a user
+who has sufficient privileges (\s-1CREATE\s0, ...).
+.IP "\-p, \-\-password=# (INSECURE)" 4
+.IX Item "-p, --password=# (INSECURE)"
+password to use when connecting to server.
+WARNING: Providing a password on command line is insecure as it is visible through /proc to anyone for a short time.
+.IP "\-h, \-\-host=#" 4
+.IX Item "-h, --host=#"
+host to connect to
+.IP "\-P, \-\-port=#" 4
+.IX Item "-P, --port=#"
+port to use when connecting to server
+.IP "\-S, \-\-socket=#" 4
+.IX Item "-S, --socket=#"
+\&\s-1UNIX\s0 domain socket to use when connecting to server
+.SH "WARRANTY"
+.IX Header "WARRANTY"
+This software is free and comes without warranty of any kind. You
+should never trust backup software without studying the code yourself.
+Study the code inside this script and only rely on it if \fIyou\fR believe
+that it does the right thing for you.
+.Sp
+Patches adding bug fixes, documentation and new features are welcome.
+.SH "TO DO"
+.IX Header "TO DO"
+Use extended inserts to be faster (for servers with many databases
+or tables). But to do that, must care about net\-buffer\-length.
+.SH "AUTHOR"
+.IX Header "AUTHOR"
+2002\-06\-18 Guilhem Bichot (guilhem.bichot@mines\-paris.org)
+.Sp
+And all the authors of mysqlhotcopy, which served as a model for 
+the structure of the program.
diff --git a/storage/xtradb/build/debian/additions/mysql_waitpid.1 b/storage/xtradb/build/debian/additions/mysql_waitpid.1
new file mode 100644
index 00000000000..f6877865ba8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysql_waitpid.1
@@ -0,0 +1,20 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysql_waitpid \- Waits a specified amount of seconds for a PID to terminate.
+.SH SYNOPSIS
+mysql_waitpid [options] <pid> <seconds>
+.SH DESCRIPTION
+Waits for a program, whose process ID is #pid, to
+terminate within #time seconds. If the program terminates within
+this time, or if the #pid no longer exists, value 0 is returned.
+Otherwise 1 is returned. Both #pid and #time must be positive
+integer arguments.
+
+See mysql_waitpid for options.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlbinlog.1 b/storage/xtradb/build/debian/additions/mysqlbinlog.1
new file mode 100644
index 00000000000..fcdf2a083f4
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlbinlog.1
@@ -0,0 +1,17 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlbinlog \- Dumps MySQL binary logs.
+.SH SYNOPSIS
+mysqlbinlog [options]
+.SH DESCRIPTION
+Dumps a MySQL binary log in a format usable for viewing or for piping to
+the mysql command line client.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlbug.1 b/storage/xtradb/build/debian/additions/mysqlbug.1
new file mode 100644
index 00000000000..133330dd897
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlbug.1
@@ -0,0 +1,14 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlbug \- MySQL bug reporting tool.
+.SH SYNOPSIS
+mysqlbug [options]
+.SH DESCRIPTION
+Interactive bug reporting tool. Use reportbug on Debian systems.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlcheck.1 b/storage/xtradb/build/debian/additions/mysqlcheck.1
new file mode 100644
index 00000000000..b36ba2d1eb1
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlcheck.1
@@ -0,0 +1,28 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlcheck \- MySQL program for repairing, checking and optimizing tables.
+.SH SYNOPSIS
+mysqlcheck | mysqlanalyze | mysqloptimize [options]
+.SH DESCRIPTION
+This program can be used to CHECK (-c,-m,-C), REPAIR (-r), ANALYZE (-a)
+or OPTIMIZE (-o) tables. Some of the options (like -e or -q) can be
+used at the same time. It works on MyISAM and in some cases on BDB tables.
+Please consult the MySQL manual for latest information about the
+above. The options -c,-r,-a and -o are exclusive to each other, which
+means that the last option will be used, if several were specified.
+
+The option -c will be used by default, if none was specified. You
+can change the default behavior by making a symbolic link, or
+copying this file somewhere with another name, the alternatives are:
+mysqlrepair:   The default option will be -r
+mysqlanalyze:  The default option will be -a
+mysqloptimize: The default option will be -o
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (8)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf b/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf
new file mode 100644
index 00000000000..3b0445d6bd8
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqld_safe_syslog.cnf
@@ -0,0 +1,2 @@
+[mysqld_safe]
+syslog
diff --git a/storage/xtradb/build/debian/additions/mysqldumpslow.1 b/storage/xtradb/build/debian/additions/mysqldumpslow.1
new file mode 100644
index 00000000000..0431ef04cbb
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqldumpslow.1
@@ -0,0 +1,50 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqldumpslow \- Parse and summarize the MySQL slow query log.
+.SH SYNOPSIS
+mysqldumpslow [options]
+.SH DESCRIPTION
+This program parses and summarizes a 'slow query log'.
+
+.TP
+\fB\-v\fR 
+verbose
+.TP
+\fB\-d\fR 
+debug
+.TP
+\fB\-s=WORD\fR 
+what to sort by (t, at, l, al, r, ar etc)
+.TP
+\fB\-r\fR  
+reverse the sort order (largest last instead of first)
+.TP
+\fB\-t=NUMBER\fR 
+just show the top n queries
+.TP
+\fB\-a\fR  
+don't abstract all numbers to N and strings to 'S'
+.TP
+\fB\-n=NUMBER\fR
+abstract numbers with at least n digits within names
+.TP
+\fB\-g=WORD\fR 
+grep: only consider stmts that include this string
+.TP
+\fB\-h=WORD\fR
+hostname of db server for *-slow.log filename (can be wildcard)
+.TP
+\fB\-i=WORD\fR 
+name of server instance (if using mysql.server startup script)
+.TP
+\fB\-l\fR  
+don't subtract lock time from total time
+
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org> based on
+the comments in the code.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlimport.1 b/storage/xtradb/build/debian/additions/mysqlimport.1
new file mode 100644
index 00000000000..9007307a328
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlimport.1
@@ -0,0 +1,20 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqlimport \- Imports text files with MySQL database queries.
+.SH SYNOPSIS
+mysqlimport [options]
+.SH DESCRIPTION
+Loads tables from text files in various formats.  The base name of the
+text file must be the name of the table that should be used.
+If one uses sockets to connect to the MySQL server, the server will open and
+read the text file directly. In other cases the client will open the text
+file. The SQL command 'LOAD DATA INFILE' is used to import the rows.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/mysqlmanager.1 b/storage/xtradb/build/debian/additions/mysqlmanager.1
new file mode 100644
index 00000000000..ebb69adbd09
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlmanager.1
@@ -0,0 +1,49 @@
+.TH mysql 1 "March 2005" "MySQL 4.1" "MySQL database"
+.SH NAME
+mysqlmanager \- Manages instances of MySQL server.
+.SH SYNOPSIS
+.B mysqlmanager
+[\fIOPTIONS\fR]
+.SH DESCRIPTION
+Manages instances of MySQL server.
+.TP
+\-?, \fB\-\-help\fR
+Display this help and exit.
+.TP
+\fB\-P\fR, \fB\-\-port=\fR#
+Port number to listen on.
+.TP
+\fB\-l\fR, \fB\-\-log\fR=\fIname\fR
+Path to log file.
+.TP
+\fB\-b\fR, \fB\-\-bind\-address=\fR#
+Address to listen on.
+.HP
+\fB\-B\fR, \fB\-\-tcp\-backlog=\fR# Size of TCP/IP listen queue.
+.HP
+\fB\-g\fR, \fB\-\-greeting\fR=\fIname\fR Set greeting on connect.
+.TP
+\fB\-m\fR, \fB\-\-max\-command\-len=\fR#
+Maximum command length.
+.TP
+\fB\-d\fR, \fB\-\-one\-thread\fR
+Use one thread (for debugging).
+.TP
+\fB\-C\fR, \fB\-\-connect\-retries=\fR#
+Number of attempts to establish MySQL connection.
+.TP
+\fB\-p\fR, \fB\-\-password\-file\fR=\fIname\fR
+Password file for manager.
+.HP
+\fB\-f\fR, \fB\-\-pid\-file\fR=\fIname\fR Pid file to use.
+.TP
+\fB\-V\fR, \fB\-\-version\fR
+Output version information and exit.
+.SH "SEE ALSO"
+The full documentation for
+.B mysqlmanager
+is available in the package mysql-doc-4.1 or on the MySQL
+homepage www.mysql.com.
+.SH AUTHOR
+This manpage was created by Christian Hammers <ch@debian.org>
+using help2man.
diff --git a/storage/xtradb/build/debian/additions/mysqlreport b/storage/xtradb/build/debian/additions/mysqlreport
new file mode 100644
index 00000000000..402a5be835d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlreport
@@ -0,0 +1,1298 @@
+#!/usr/bin/perl -w
+
+# mysqlreport v3.5 Apr 16 2008
+# http://hackmysql.com/mysqlreport
+
+# mysqlreport makes an easy-to-read report of important MySQL status values.
+# Copyright 2006-2008 Daniel Nichter
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# The GNU General Public License is available at:
+# http://www.gnu.org/copyleft/gpl.html
+
+use strict;
+use File::Temp qw(tempfile);
+use DBI;
+use Getopt::Long;
+eval { require Term::ReadKey; }; # optional module: a failed load is trapped by eval, not fatal
+my $RK = ($@ ? 0 : 1);           # 1 when Term::ReadKey loaded, 0 when the require failed
+# Forward declaration so have_op can be called without parentheses below.
+sub have_op;
+# File-level state shared by the main program and the subroutines.
+my $WIN = ($^O eq 'MSWin32' ? 1 : 0); # 1 when running on Windows
+my %op;                               # command-line options, filled by GetOptions below
+my %mycnf; # ~/.my.cnf
+my ($tmpfile_fh, $tmpfile);           # handle and name of the temporary report file
+my ($stat_name, $stat_val, $stat_label);
+my $MySQL_version;
+my (%stats, %vars); # SHOW STATUS, SHOW VARIABLES
+my (%DMS_vals, %Com_vals, %ib_vals);
+my ($dbh, $query);                    # $dbh is the DBI handle set by connect_to_MySQL()
+my ($questions, $key_read_ratio, $key_write_ratio, $dms, $slow_query_t);
+my ($key_cache_block_size, $key_buffer_used, $key_buffer_usage);
+my ($qc_mem_used, $qc_hi_r, $qc_ip_r); # Query Cache
+my $have_innodb_vals;
+my ($ib_bp_used, $ib_bp_total, $ib_bp_read_ratio);
+my ($relative_live, $relative_infiles); # which relative-report mode -r selected
+my $real_uptime;
+my (%stats_present, %stats_past); # For relative reports
+# Parse the command line into %op; a value is stored under the first name of its spec (e.g. $op{'r'}, $op{'c'}).
+GetOptions (
+   \%op,
+   "user=s",
+   "password:s",       # value is optional; an empty value makes the script prompt for it
+   "host=s",
+   "port=s",
+   "socket=s",
+   "no-mycnf",
+   "infile|in=s",
+   "outfile=s",
+   "flush-status",
+   "email=s",
+   "r|relative:i",     # integer is optional; read back as $op{'r'}
+   "c|report-count=i", # read back as $op{'c'}
+   "detach",
+   "help|?",
+   "debug"
+);
+
+show_help_and_exit() if $op{'help'};
+
+get_user_mycnf() unless $op{'no-mycnf'};
+
+# Connection settings given on the command line take precedence over ~/.my.cnf.
+foreach my $opt_name ('host', 'port', 'socket', 'user')
+{
+   $mycnf{$opt_name} = $op{$opt_name} if have_op $opt_name;
+}
+
+# With no user from either source, fall back to the USER environment variable.
+$mycnf{'user'} ||= $ENV{'USER'};
+
+# --password with a value uses that value; --password alone prompts on STDIN.
+if(exists $op{'password'})
+{
+   if($op{'password'} ne '')
+   {
+      $mycnf{'pass'} = $op{'password'};
+   }
+   else
+   {
+      Term::ReadKey::ReadMode(2) if $RK;
+      print "Password for database user $mycnf{'user'}: ";
+      chomp($mycnf{'pass'} = <STDIN>);
+      if($RK) { Term::ReadKey::ReadMode(0); print "\n"; }
+   }
+}
+
+$op{'com'} ||= 3;
+$op{'c'}   ||= 1; # Used in collect_reports() if --r given integer value
+
+# -r with a true integer selects live reports; -r with no (or zero) value selects infiles.
+$relative_live    = (defined $op{'r'} &&  $op{'r'}) ? 1 : 0;
+$relative_infiles = (defined $op{'r'} && !$op{'r'}) ? 1 : 0;
+
+# The report is written to a tmp file first.
+# Later it will be moved to $op{'outfile'} or emailed to $op{'email'} if needed.
+($tmpfile_fh, $tmpfile) = tempfile() or die "Cannot open temporary file for writing: $!\n";
+
+if($op{'detach'})
+{
+   $SIG{'TERM'} = 'sig_handler';
+   my $pid = fork(); # undef means no child was created at all
+   die "Cannot fork: $!\n" unless defined $pid;
+   if($pid) # parent: tell the user where the child writes, then leave
+   {
+      print "mysqlreport has forked and detached.\n";
+      print "While running detached, mysqlreport writes reports to '$tmpfile'.\n";
+      exit;
+   }
+   # Child: read from /dev/null and send STDOUT and STDERR to the tmp file.
+   open(STDIN, "</dev/null");
+   open(STDOUT, "> $tmpfile") or die "Cannot dup STDOUT: $!\n";
+   open(STDERR, "> $tmpfile") or die "Cannot dup STDERR: $!\n";
+}
+
+# From here on, print statements without a handle go to the temporary report file.
+select $tmpfile_fh;
+$| = 1 if ($op{'detach'} || $relative_live); # autoflush the selected handle
+
+print "tmp file: $tmpfile\n" if $op{debug};
+
+# Connect to MySQL, unless the values come from input files
+connect_to_MySQL() unless ($op{'infile'} || $relative_infiles);
+
+$have_innodb_vals = 1; # This might be set to 0 later in get_MySQL_version()
+
+if(defined $op{'r'})
+{
+   # Relative reports, either live from the server or from infiles
+   if($relative_live)
+   {
+      unless($op{'detach'})
+      {
+         print STDERR "mysqlreport is writing relative reports to '$tmpfile'.\n";
+      }
+      get_MySQL_version();
+      collect_reports();
+   }
+
+   read_relative_infiles() if $relative_infiles;
+}
+else
+{
+   # Single report: values come from --infile when given, else from the server
+   if($op{'infile'})
+   {
+      read_infile($op{'infile'});
+   }
+   else
+   {
+      get_MySQL_version();
+      get_vals();
+      get_vars();
+   }
+
+   get_Com_values();
+   set_myisam_vals();
+   set_ib_vals() if $have_innodb_vals;
+   write_report();
+}
+
+exit_tasks_and_cleanup();
+exit;
+
+#
+# Subroutines
+#
+# Print the usage summary to the currently selected output handle, then exit.
+sub show_help_and_exit
+{
+   print <<"USAGE";
+mysqlreport v3.5 Apr 16 2008
+mysqlreport makes an easy-to-read report of important MySQL status values.
+
+Command line options (abbreviations work):
+   --user USER       Connect to MySQL as USER
+   --password PASS   Use PASS or prompt for MySQL user's password
+   --host ADDRESS    Connect to MySQL at ADDRESS
+   --port PORT       Connect to MySQL at PORT
+   --socket SOCKET   Connect to MySQL at SOCKET
+   --no-mycnf        Don't read ~/.my.cnf
+   --infile FILE     Read status values from FILE instead of MySQL
+   --outfile FILE    Write report to FILE
+   --email ADDRESS   Email report to ADDRESS (doesn't work on Windows)
+   --flush-status    Issue FLUSH STATUS; after getting current values
+   --relative X      Generate relative reports. If X is an integer,
+                     reports are live from the MySQL server X seconds apart.
+                     If X is a list of infiles (file1 file2 etc.),
+                     reports are generated from the infiles in the order
+                     that they are given.
+   --report-count N  Collect N number of live relative reports (default 1)
+   --detach          Fork and detach from terminal (run in background)
+   --help            Prints this
+   --debug           Print debugging information
+
+Visit http://hackmysql.com/mysqlreport for more information.
+USAGE
+   exit;
+}
+
+# Read "name = value" lines from ~/.my.cnf into %mycnf.  Does nothing on Windows
+# or when the file cannot be opened.  A 'password' entry is also used for 'pass'.
+sub get_user_mycnf
+{
+   print "get_user_mycnf\n" if $op{debug};
+   return if $WIN;
+   open MYCNF, "$ENV{HOME}/.my.cnf" or return;
+   while(<MYCNF>)
+   {
+      next if /^\s*[#;]/; # comment line, not a setting
+      next unless /^\s*(.+?)\s*=\s*"?(.+?)"?\s*$/; # leading blanks are not part of the name
+      $mycnf{$1} = $2;
+      print "get_user_mycnf: read '$1 = $2'\n" if $op{debug};
+   }
+   $mycnf{'pass'} ||= $mycnf{'password'} if exists $mycnf{'password'};
+   close MYCNF;
+}
+
+# Open the global DBI handle $dbh using the connection settings in %mycnf.
+# DSN preference: an existing socket file, then host (with optional port),
+# then localhost. Dies with the DBI error text if the connection fails.
+sub connect_to_MySQL
+{
+   print "connect_to_MySQL\n" if $op{debug};
+
+   my $dsn;
+   if($mycnf{'socket'} && -S $mycnf{'socket'})
+   {
+      $dsn = "DBI:mysql:mysql_socket=$mycnf{socket}";
+   }
+   elsif($mycnf{'host'})
+   {
+      $dsn = "DBI:mysql:host=$mycnf{host}" . ($mycnf{port} ? ";port=$mycnf{port}" : "");
+   }
+   else { $dsn = "DBI:mysql:host=localhost"; }
+
+   print "connect_to_MySQL: DBI DSN: $dsn\n" if $op{debug};
+
+   $dbh = DBI->connect($dsn, $mycnf{'user'}, $mycnf{'pass'})
+      or die "connect_to_MySQL: cannot connect to MySQL: $DBI::errstr\n";
+}
+
+# Live relative mode: write a baseline report, then $op{'c'} interval reports,
+# sleeping $op{'r'} seconds and reconnecting to the server before each one.
+sub collect_reports
+{
+   print "collect_reports\n" if $op{debug};
+
+   my $done;  # interval reports written so far
+
+   # Baseline snapshot; %stats_past keeps it for the first interval report
+   get_vals();
+   get_vars();
+   get_Com_values();
+   %stats_past = %stats;
+   set_myisam_vals();
+   set_ib_vals() if $have_innodb_vals;
+
+   print "#\n# Beginning report, 0 0:0:0\n#\n";
+   write_report();
+
+   for($done = 0; $done < $op{'c'}; $done++)
+   {
+      my $nth = $done + 1;
+
+      # No connection is held open during the sleep
+      $dbh->disconnect();
+      sleep($op{'r'});
+      connect_to_MySQL();
+
+      print "\n#\n# Interval report " , $nth , ", +", sec_to_dhms($nth * $op{'r'}), "\n#\n";
+
+      get_vals();
+
+      write_relative_report();
+   }
+}
+
+sub read_relative_infiles  # Write one relative report per set of status values found in the @ARGV files
+{
+   print "read_relative_infiles\n" if $op{debug};
+
+   my $slurp;    # Used to check infiles for multiple sets of status values
+   my $n_stats;  # Number of multiple sets of status values in an infile
+   my $infile;
+   my $report_n; # Report number
+
+   $report_n = 1;
+
+   foreach $infile (@ARGV)
+   {
+      # Read all of infile into $slurp
+      open INFILE, "< $infile" or warn and next;
+      $slurp = do { local $/;  <INFILE> };
+      close INFILE;
+
+      $n_stats = 0;
+
+      # Count number of status value sets
+      $n_stats++ while $slurp =~ /Aborted_clients/g;
+
+      print "read_relative_infiles: found $n_stats sets of status values in file '$infile'\n"
+         if $op{debug};
+
+      if($n_stats == 1)
+      {
+         read_infile($infile);
+         relative_infile_report($report_n++);
+      }
+
+      if($n_stats > 1)
+      {
+         my @tmpfile_fh;
+         my @tmpfile_name;
+         my $i;
+         my $stat_n;  # Status value set number
+
+         # Create a tmp file for each set of status values
+         for($i = 0; $i < $n_stats; $i++)
+         {
+            my ($fh, $name) = tempfile()
+               or die "read_relative_infiles: cannot open temporary file for writing: $!\n";
+
+            push(@tmpfile_fh, $fh);
+            push(@tmpfile_name, $name);
+
+            print "read_relative_infiles: created tmp file '$name' for set $i\n" if $op{debug};
+         }
+
+         $i = 0;
+         $stat_n = 0;
+
+         select $tmpfile_fh[$i];  # bare print below now goes to the first tmp file
+
+         # Read infile again and copy each set of status values to separate tmp files
+         open INFILE, "< $infile" or warn and next;
+         while(<INFILE>)
+         {
+            next if /^\+/;
+            next if /^$/;
+
+            # The infile must begin with the system variable values.
+            # Therefore, the first occurrence of Aborted_clients indicates the beginning
+            # of the first set of status values if no sets have occurred yet ($stat_n == 0).
+            # In this case, the following status values are printed to the current fh,
+            # along with the system variable values read thus far, until Aborted_clients
+            # occurs again. Then begins the second and subsequent sets of status values.
+
+            if(/Aborted_clients/)
+            {
+               print and next if $stat_n++ == 0;  # first set: stay on the current tmp file
+               select $tmpfile_fh[++$i];          # later sets: switch to the next tmp file
+            }
+
+            print;
+         }
+         close INFILE;
+
+         # Re-select the main tmp file into which the reports are being written.
+         select $tmpfile_fh;
+
+         for($i = 0; $i < $n_stats; $i++)
+         {
+            close $tmpfile_fh[$i];
+
+            print "read_relative_infiles: reading set $i tmp file '$tmpfile_name[$i]'\n"
+               if $op{debug};
+
+            read_infile($tmpfile_name[$i]);
+            relative_infile_report($report_n++);
+
+            if($WIN) { `del $tmpfile_name[$i]`;   }
+            else     { `rm -f $tmpfile_name[$i]`; }
+
+            print "read_relative_infiles: deleted set $i tmp file '$tmpfile_name[$i]'\n"
+               if $op{debug};
+         }
+
+      } # if($n_stats > 1)
+   } # foreach $infile (@ARGV)
+}
+
+# Emit the report for status set number $report_n: set 1 produces the
+# baseline report, later sets produce interval reports relative to the last.
+sub relative_infile_report
+{
+   print "relative_infile_report\n" if $op{debug};
+
+   my $report_n = shift;
+
+   if($report_n != 1)
+   {
+      my $elapsed = sec_to_dhms($stats{Uptime} - $stats_past{Uptime});
+      print "\n#\n# Interval report ", $report_n - 1, ", +", $elapsed, "\n#\n";
+
+      return write_relative_report();
+   }
+
+   # First set: it becomes the baseline for the interval reports that follow
+   get_Com_values();
+
+   %stats_past = %stats;
+
+   set_myisam_vals();
+   set_ib_vals() if $have_innodb_vals;
+
+   print "#\n# Beginning report, 0 0:0:0\n#\n";
+
+   write_report();
+}
+
+# Refresh %stats from the server (SHOW GLOBAL STATUS from MySQL 5.0.2 on,
+# plain SHOW STATUS before that) and remember the server's Uptime.
+sub get_vals
+{
+   print "get_vals\n" if $op{debug};
+
+   # Get status values
+   my $sql = ($MySQL_version >= 50002 ? "SHOW GLOBAL STATUS;" : "SHOW STATUS;");
+
+   $query = $dbh->prepare($sql);
+   $query->execute();
+
+   # Column 0 of each row is the status name, column 1 its value
+   while(my ($name, $value) = $query->fetchrow_array())
+   {
+      $stats{$name} = $value;
+   }
+
+   $real_uptime = $stats{'Uptime'};
+}
+
+# Load SHOW VARIABLES into %vars; on MySQL 5.1.3 and later the value of
+# table_open_cache is also stored under the older table_cache name.
+sub get_vars
+{
+   print "get_vars\n" if $op{debug};
+
+   # Get server system variables
+   $query = $dbh->prepare("SHOW VARIABLES;");
+   $query->execute();
+   while(my ($name, $value) = $query->fetchrow_array())
+   {
+      $vars{$name} = $value;
+   }
+
+   # table_cache was renamed to table_open_cache in MySQL 5.1.3
+   $vars{'table_cache'} = $vars{'table_open_cache'} if $MySQL_version >= 50103;
+}
+
+sub read_infile  # Parse status values and system variables from the given file into %stats and %vars
+{
+   print "read_infile\n" if $op{debug};
+
+   my $infile = shift;
+
+   # Default required system variable values if not set in INFILE.
+   # As of mysqlreport v3.5 the direct output from SHOW VARIABLES;
+   # can be put into INFILE instead. See http://hackmysql.com/mysqlreportdoc
+   # for details.
+   $vars{'version'} = "0.0.0"         if !exists $vars{'version'};
+   $vars{'table_cache'} = 64          if !exists $vars{'table_cache'};
+   $vars{'max_connections'} = 100     if !exists $vars{'max_connections'};
+   $vars{'key_buffer_size'} = 8388600 if !exists $vars{'key_buffer_size'}; # 8M
+   $vars{'thread_cache_size'} = 0     if !exists $vars{'thread_cache_size'}; 
+   $vars{'tmp_table_size'} = 0        if !exists $vars{'tmp_table_size'};
+   $vars{'long_query_time'} = '?'     if !exists $vars{'long_query_time'};
+   $vars{'log_slow_queries'} = '?'    if !exists $vars{'log_slow_queries'};
+
+   # One should also add:
+   #    key_cache_block_size
+   #    query_cache_size
+   # to INFILE if needed.
+
+   open INFILE, "< $infile" or die "Cannot open INFILE '$infile': $!\n";
+
+   while(<INFILE>)
+   {
+      last if !defined $_;  # $_ can be undef here when redo re-enters after the old-vars loop reached EOF
+
+      next if /^\+/;  # skip divider lines 
+      next if /^$/;   # skip blank lines
+
+      next until /(Aborted_clients|back_log|=)/;  # skip this line unless it has a section marker (captured in $1)
+
+      if($1 eq 'Aborted_clients')  # status values
+      {
+         print "read_infile: start stats\n" if $op{debug};
+
+         while($_)
+         {
+            chomp;
+            if(/([A-Za-z_]+)[\s\t|]+(\d+)/)
+            {
+               $stats{$1} = $2;
+               print "read_infile: save $1 = $2\n" if $op{debug};
+            }
+            else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+            last if $1 eq 'Uptime';  # exit while() if end of status values
+            $_ = <INFILE>; # otherwise, read next line of status values
+         }
+      }
+      elsif($1 eq  'back_log')  # system variable values
+      {
+         print "read_infile: start vars\n" if $op{debug};
+
+         while($_)
+         {
+            chomp;
+            if(/([A-Za-z_]+)[\s\t|]+([\w\.\-]+)/)  # This will exclude some vars
+            {                                      # like pid_file which we don't need
+               $vars{$1} = $2;
+               print "read_infile: save $1 = $2\n" if $op{debug};
+            }
+            else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+            last if $1 eq 'wait_timeout';  # exit while() if end of vars
+            $_ = <INFILE>; # otherwise, read next line of vars
+         }
+      }
+      elsif($1 eq '=')  # old style, manually added system variable values
+      {
+         print "read_infile: start old vars\n" if $op{debug};
+
+         while($_ && $_ =~ /=/)
+         {
+            chomp;
+            if(/^\s*(\w+)\s*=\s*([0-9.]+)(M*)\s*$/)  # e.g.: key_buffer_size = 128M
+            {
+               $vars{$1} = ($3 ? $2 * 1024 * 1024 : $2);
+               print "read_infile: read '$_' as $1 = $vars{$1}\n" if $op{debug};
+            }
+            else { print "read_infile: ignore '$_'\n" if $op{debug}; }
+
+            $_ = <INFILE>; # otherwise, read next line of old vars
+         }
+
+         redo;  # re-run the loop body on the line already in $_ (the first one without '=')
+      }
+      else
+      {
+         print "read_infile: unrecognized line: '$_'\n" if $op{debug};
+      }
+   }
+
+   close INFILE;
+
+   $real_uptime = $stats{'Uptime'};
+
+   $vars{'table_cache'} = $vars{'table_open_cache'} if exists $vars{'table_open_cache'};
+
+   get_MySQL_version();
+}
+
+# Compute $MySQL_version once, as major/minor/patch packed into one number
+# (5.1.3 => 50103), from $vars{version} in infile mode or from the server
+# otherwise, and turn InnoDB reports off for versions older than 5.0.2.
+sub get_MySQL_version
+{
+   print "get_MySQL_version\n" if $op{debug};
+
+   return if $MySQL_version;
+
+   my $version_str;
+
+   if($op{'infile'} || $relative_infiles)
+   {
+      $version_str = $vars{'version'};
+   }
+   else
+   {
+      $query = $dbh->prepare("SHOW VARIABLES LIKE 'version';");
+      $query->execute();
+      $version_str = ($query->fetchrow_array())[1];
+   }
+
+   my @parts = ($version_str =~ /(\d{1,2})\.(\d{1,2})\.(\d{1,2})/);
+   $MySQL_version = sprintf("%d%02d%02d", @parts[0 .. 2]);
+
+   # Innodb_ status values were added in 5.0.2
+   return if $MySQL_version >= 50002;
+
+   $have_innodb_vals = 0;
+   print "get_MySQL_version: no InnoDB reports because MySQL version is older than 5.0.2\n" if $op{debug};
+}
+
+# Derive the MyISAM key buffer figures and the DMS (data manipulation
+# statement) totals that the report formats read, from %stats and %vars.
+sub set_myisam_vals
+{
+   print "set_myisam_vals\n" if $op{debug};
+
+   $questions = $stats{'Questions'};
+
+   # Hit ratios stay at 0 when there have been no requests
+   $key_read_ratio = 0;
+   $key_read_ratio = 100 - ($stats{'Key_reads'} / $stats{'Key_read_requests'}) * 100
+      if $stats{'Key_read_requests'};
+   $key_read_ratio = sprintf "%.2f", $key_read_ratio;
+
+   $key_write_ratio = 0;
+   $key_write_ratio = 100 - ($stats{'Key_writes'} / $stats{'Key_write_requests'}) * 100
+      if $stats{'Key_write_requests'};
+   $key_write_ratio = sprintf "%.2f", $key_write_ratio;
+
+   $key_cache_block_size = 1024;
+   $key_cache_block_size = $vars{'key_cache_block_size'} if defined $vars{'key_cache_block_size'};
+
+   $key_buffer_used = $stats{'Key_blocks_used'} * $key_cache_block_size;
+
+   # Key_blocks_unused exists in MySQL 4.1.2+; -1 marks "not available"
+   $key_buffer_usage = -1;
+   $key_buffer_usage = $vars{'key_buffer_size'} - ($stats{'Key_blocks_unused'} * $key_cache_block_size)
+      if defined $stats{'Key_blocks_unused'};
+
+   # Data Manipulation Statements: http://dev.mysql.com/doc/refman/5.0/en/data-manipulation.html
+   my $multi_update = (exists $stats{'Com_update_multi'} ? $stats{'Com_update_multi'} : 0);
+   my $multi_delete = (exists $stats{'Com_delete_multi'} ? $stats{'Com_delete_multi'} : 0);
+   %DMS_vals =
+   (
+      SELECT  => $stats{'Com_select'},
+      INSERT  => $stats{'Com_insert'}  + $stats{'Com_insert_select'},
+      REPLACE => $stats{'Com_replace'} + $stats{'Com_replace_select'},
+      UPDATE  => $stats{'Com_update'}  + $multi_update,
+      DELETE  => $stats{'Com_delete'}  + $multi_delete
+   );
+
+   $dms = 0;
+   $dms += $DMS_vals{$_} for qw(SELECT INSERT REPLACE UPDATE DELETE);
+
+   $slow_query_t = format_u_time($vars{long_query_time});
+}
+
+# Compute the InnoDB buffer pool size, bytes in use and read hit ratio from %stats.
+sub set_ib_vals
+{
+   print "set_ib_vals\n" if $op{debug};
+
+   my $page_size   = $stats{'Innodb_page_size'};
+   my $pages_total = $stats{'Innodb_buffer_pool_pages_total'};
+   my $requests    = $stats{'Innodb_buffer_pool_read_requests'};
+
+   $ib_bp_used  = ($pages_total - $stats{'Innodb_buffer_pool_pages_free'}) * $page_size;
+   $ib_bp_total = $pages_total * $page_size;
+
+   # The hit ratio is reported as 0 when there have been no read requests
+   my $hit = ($requests ? 100 - ($stats{'Innodb_buffer_pool_reads'} / $requests) * 100 : 0);
+   $ib_bp_read_ratio = sprintf "%.2f", $hit;
+}
+
+# Turn %stats into the change since the previous report (%stats_past) and
+# write a report from it; the current raw values become the new %stats_past.
+sub write_relative_report
+{
+   print "write_relative_report\n" if $op{debug};
+
+   %stats_present = %stats;
+
+   for my $name (keys %stats)
+   {
+      next unless $stats_past{$name} =~ /\d+/;
+      next unless $stats_present{$name} >= $stats_past{$name}; # Avoid negative values
+
+      $stats{$name} = $stats_present{$name} - $stats_past{$name};
+   }
+
+   # These values are either "at present" or "high water marks".
+   # Therefore, it is more logical to not relativize these values.
+   # Doing otherwise causes strange and misleading values.
+   my @absolute = qw(
+      Key_blocks_used
+      Open_tables
+      Max_used_connections
+      Threads_running
+      Threads_connected
+      Threads_cached
+      Qcache_free_blocks
+      Qcache_total_blocks
+      Qcache_free_memory
+   );
+
+   if($have_innodb_vals)
+   {
+      push @absolute, qw(
+         Innodb_page_size
+         Innodb_buffer_pool_pages_data
+         Innodb_buffer_pool_pages_dirty
+         Innodb_buffer_pool_pages_free
+         Innodb_buffer_pool_pages_latched
+         Innodb_buffer_pool_pages_misc
+         Innodb_buffer_pool_pages_total
+         Innodb_data_pending_fsyncs
+         Innodb_data_pending_reads
+         Innodb_data_pending_writes
+      );
+
+      # Innodb_row_lock_ values were added in MySQL 5.0.3
+      push @absolute, qw(Innodb_row_lock_current_waits Innodb_row_lock_time_avg
+                         Innodb_row_lock_time_max) if $MySQL_version >= 50003;
+   }
+
+   $stats{$_} = $stats_present{$_} for @absolute;
+
+   get_Com_values();
+   %stats_past = %stats_present;
+
+   set_myisam_vals();
+   set_ib_vals() if $have_innodb_vals;
+   write_report();
+}
+
+# Write every report section, in order, to the currently selected handle.
+sub write_report
+{
+   print "write_report\n" if $op{debug};
+   $~ = 'MYSQL_TIME';     write;
+   $~ = 'KEY_BUFF_MAX';   write;
+   if($key_buffer_usage != -1) { $~ = 'KEY_BUFF_USAGE'; write; }
+   $~ = 'KEY_RATIOS';     write;
+   write_DTQ();
+   $~ = 'SLOW_DMS';       write;
+   write_DMS();
+   write_Com();
+   $~ = 'SAS';            write;
+   write_qcache();
+   $~ = 'REPORT_END';     write;
+   $~ = 'TAB';            write;
+
+   write_InnoDB() if $have_innodb_vals;
+}
+
+# Format a number of seconds as "D H:M:S" (fields are not zero-padded).
+# Zero or negative input gives '0 0:0:0'.
+sub sec_to_dhms # Seconds to days hours:minutes:seconds
+{
+   my $left = shift;   # seconds not yet assigned to a larger unit
+   my @parts;          # whole days, hours, minutes - in that order
+
+   return '0 0:0:0' if $left <= 0;
+
+   # Peel off the whole days, then hours, then minutes; what remains
+   # in $left afterwards is the seconds field.
+   for my $unit (86400, 3600, 60)
+   {
+      my $whole = int $left / $unit;
+
+      $left -= $whole * $unit;
+
+      push @parts, $whole;
+   }
+
+   my ($days, $hours, $minutes) = @parts;
+
+   return "$days $hours:$minutes:$left";
+}
+
+# Abbreviate $number with a k/M/G/T suffix: $kb true scales by 1024, else by
+# 1000; $d is the decimal count (default 2). Scaling stops at T so the suffix
+# is always defined. A result ending in plain ".00" loses the decimals.
+sub make_short
+{
+   my ($number, $kb, $d) = @_;
+   my @suffix = ('','k','M','G','T');
+   my $base   = ($kb ? 1024 : 1000);
+   my $n      = 0;
+
+   $d ||= 2;
+   while($number > $base - 1 && $n < $#suffix) { $number /= $base; $n++; }
+
+   my $short = sprintf "%.${d}f%s", $number, $suffix[$n];
+   return ($short =~ /^(.+)\.(00)$/ ? $1 : $short); # 12.00 -> 12 but not 12.00k -> 12k
+}
+
+# What began as a simple but great idea has become the new standard:
+# long_query_time in microseconds. For MySQL 5.1.21+ and 6.0.4+ this
+# is now standard. For 4.1 and 5.0 patches, the architects of this
+# idea provide: http://www.mysqlperformanceblog.com/mysql-patches/
+# Relevant notes in MySQL manual:
+# http://dev.mysql.com/doc/refman/5.1/en/slow-query-log.html
+# http://dev.mysql.com/doc/refman/6.0/en/slow-query-log.html
+#
+# The format_u_time sub simply beautifies long_query_time.
+
+sub format_u_time  # format microsecond (us) time value
+{
+   #       0 <  t < 0.001 = value in microseconds
+   #   0.001 <= t < 1     = 1 ms - 999.999 ms
+   #            t >= 1    = 1 s - n.nnnnn s
+   # The ranges are contiguous so that in-between values such as 0.0009995
+   # are formatted as well instead of collapsing to 0. Negative input and
+   # input that is not a number are treated as 0.
+
+   my $t = shift;
+   my $f;  # formatted time
+   my $u = chr(($WIN ? 230 : 181));  # presumably the micro sign in each platform's character set
+
+   $t = 0 if $t < 0;
+
+   if($t > 0 && $t < 0.001)
+   {
+      $f = ($t * 1000000) . " $u";
+   }
+   elsif($t >= 0.001 && $t < 1)
+   {
+      $f = ($t * 1000) . ' ms';
+   }
+   elsif($t >= 1)
+   {
+      $f = ($t * 1) . ' s';  # * 1 to remove insignificant zeros
+   }
+   else { $f = 0; }  # $t is 0 at this point
+
+   return $f;
+}
+
+sub perc # Percentage of $whole that $part is, with two decimals
+{
+   my ($part, $whole) = @_;
+   $part = 0 unless defined $part;
+   return sprintf "%.2f", ($part * 100) / ($whole ? $whole : 1); # a false total counts as 1
+}
+
+sub t # Time average per second; 0 when the value or the Uptime is zero or unset
+{
+   my $val = shift;
+   return 0 if !$val || !$stats{'Uptime'};  # also avoids "Illegal division by zero"
+   return(make_short($val / $stats{'Uptime'}, 0, 1));
+}
+
+sub email_report # Email given report file to $op{'email'} through sendmail (never on Windows)
+{
+   print "email_report\n" if $op{debug};
+   return if $WIN;
+
+   my $report = shift;
+   open my $body, '<', $report or (warn "email_report: cannot read '$report': $!\n" and return);
+   open SENDMAIL, "|/usr/sbin/sendmail -t"
+      or (warn "email_report: cannot run sendmail: $!\n" and return);
+   print SENDMAIL "From: mysqlreport\n";
+   print SENDMAIL "To: $op{email}\n";
+   print SENDMAIL "Subject: MySQL status report on " . ($mycnf{'host'} || 'localhost') . "\n\n";
+   print SENDMAIL readline($body);  # report is copied in-process instead of via `cat`
+   close SENDMAIL;
+}
+
+sub cat_report # Print given report file to the currently selected handle
+{
+   print "cat_report\n" if $op{debug};
+
+   my $report = shift;
+
+   # 3-arg open takes the name literally; an unreadable report is warned about, not ignored
+   open my $fh, '<', $report or (warn "cat_report: cannot read '$report': $!\n" and return);
+   my @lines = <$fh>;
+   close $fh;
+   print @lines;
+}
+
+# Rebuild %Com_vals from the Com_* counters in %stats that are above zero,
+# keyed without the Com_ prefix and without the statements counted as DMS.
+sub get_Com_values
+{
+   print "get_Com_values\n" if $op{debug};
+
+   %Com_vals = ();
+
+   # Make copy of just the Com_ values
+   for my $name (keys %stats)
+   {
+      next unless $name =~ /^Com_(.*)/;
+      next unless $stats{$name} > 0;
+
+      $Com_vals{$1} = $stats{$name};
+   }
+
+   # Remove DMS values
+   my @dms_names = qw(
+      select
+      insert  insert_select
+      replace replace_select
+      update  update_multi
+      delete  delete_multi
+   );
+   delete @Com_vals{@dms_names};
+}
+
+sub write_DTQ # Write DTQ report in descending order by values
+{
+   print "write_DTQ\n" if $op{debug};
+
+   $~ = 'DTQ';
+
+   my %DTQ;
+   my ($com_total, $accounted) = (0, 0);
+
+   # Total Com values
+   $com_total += $_ for values %Com_vals;
+
+   $DTQ{'Com_'}     = $com_total;
+   $DTQ{'DMS'}      = $dms;
+   $DTQ{'QC Hits'}  = $stats{'Qcache_hits'} if $stats{'Qcache_hits'} != 0;
+   $DTQ{'COM_QUIT'} = int (($stats{'Connections'} - 2) - ($stats{'Aborted_clients'} / 2));
+
+   # Whatever part of Questions is not accounted for above is shown as +/-Unknown
+   $accounted += $_ for values %DTQ;
+   if($questions != $accounted)
+   {
+      my $sign = ($questions > $accounted ? '+' : '-');
+      $DTQ{$sign . 'Unknown'} = abs $questions - $accounted;
+   }
+
+   # Only the first (largest) line carries the '%Total:' label
+   $stat_label = '%Total:';
+   for my $name (sort { $DTQ{$b} <=> $DTQ{$a} } keys(%DTQ))
+   {
+      $stat_name = $name;
+      $stat_val  = $DTQ{$name};
+      write;
+      $stat_label = '';
+   }
+}
+
+sub write_DMS # Write DMS report in descending order by values
+{
+   print "write_DMS\n" if $op{debug};
+
+   $~ = 'DMS';
+
+   my @by_count = sort { $DMS_vals{$b} <=> $DMS_vals{$a} } keys(%DMS_vals);
+   foreach my $statement (@by_count)
+   {
+      ($stat_name, $stat_val) = ($statement, $DMS_vals{$statement});
+      write;
+   }
+}
+
+sub write_Com # Write COM report in descending order by values
+{
+   print "write_Com\n" if $op{debug};
+
+   my $remaining = $op{'com'};  # countdown of detail lines
+
+   # First line: the total of all Com values
+   $~ = 'COM_1';
+   $stat_label = '%Total:' unless $op{'dtq'};
+   $stat_val   = 0;
+   $stat_val  += $_ for values %Com_vals;
+   write;
+
+   # Then the individual values, highest first, until the countdown hits zero
+   $~ = 'COM_2';
+   my @by_count = sort { $Com_vals{$b} <=> $Com_vals{$a} } keys(%Com_vals);
+   foreach my $name (@by_count)
+   {
+      $stat_name = $name;
+      $stat_val  = $Com_vals{$name};
+      write;
+
+      $remaining--;
+      last if !$remaining;
+   }
+}
+
+sub write_qcache # Write the query cache section; nothing if the cache is absent or sized 0
+{
+   print "write_qcache\n" if $op{debug};
+
+   # Query cache was added in 4.0.1, but have_query_cache was added in 4.0.2,
+   # ergo this method is slightly more reliable
+   return if not exists $vars{'query_cache_size'};
+   return if $vars{'query_cache_size'} == 0;
+
+   $qc_mem_used = $vars{'query_cache_size'} - $stats{'Qcache_free_memory'};
+   # "|| 1" rather than "||= 1": guard the divisions without overwriting %stats
+   $qc_hi_r = sprintf "%.2f", $stats{'Qcache_hits'} / ($stats{'Qcache_inserts'} || 1);
+   $qc_ip_r = sprintf "%.2f", $stats{'Qcache_inserts'} / ($stats{'Qcache_lowmem_prunes'} || 1);
+   $~ = 'QCACHE';
+   write;
+}
+
+# Write the InnoDB report sections; a no-op when Innodb_page_size is undefined.
+sub write_InnoDB
+{
+   print "write_InnoDB\n" if $op{debug};
+
+   return unless defined $stats{'Innodb_page_size'};
+
+   $stats{'Innodb_buffer_pool_pages_latched'} = 0
+      unless defined $stats{'Innodb_buffer_pool_pages_latched'};
+
+   # Buffer pool first, row locks next (Innodb_row_lock_ values were added
+   # in MySQL 5.0.3), and finally Data, Pages, Rows
+   my @sections = ('IB');
+   push @sections, 'IB_LOCK' if $MySQL_version >= 50003;
+   push @sections, 'IB_DPR';
+
+   foreach my $section (@sections)
+   {
+      $~ = $section;
+      write;
+   }
+}
+
+sub have_op # 1 when option $key exists in %op and is not '', otherwise 0
+{
+   my $key = shift;
+   my $set = (exists $op{$key} && $op{$key} ne '');
+   return ($set ? 1 : 0);
+}
+
+sub sig_handler  # Print when the signal was received, run the exit tasks, then exit
+{
+   print "\nReceived signal at " , scalar localtime , "\n";
+   exit_tasks_and_cleanup();
+   exit;
+}
+
+# Final housekeeping: close the report tmp file, mail and/or print it, then
+# move it to --outfile or delete it, and finish with the server (optionally
+# issuing FLUSH STATUS) when the values came from a live connection.
+sub exit_tasks_and_cleanup
+{
+   print "exit_tasks_and_cleanup\n" if $op{debug};
+
+   close $tmpfile_fh;
+   select STDOUT unless $op{'detach'};
+
+   email_report($tmpfile) if $op{'email'};
+
+   cat_report($tmpfile) unless $op{'detach'};
+
+   if($op{'outfile'})
+   {
+      # mv is run without a shell so the file names are passed through as-is
+      if($WIN) { `move $tmpfile $op{outfile}`; }
+      else     { system('mv', $tmpfile, $op{'outfile'}); }
+   }
+   else { unlink $tmpfile; }
+
+   if(!$op{'infile'} && !$relative_infiles)
+   {
+      if($op{'flush-status'})
+      {
+         $query = $dbh->prepare("FLUSH STATUS;");
+         $query->execute();
+      }
+
+      $query->finish();
+      $dbh->disconnect();
+   }
+}
+
+#
+# Formats
+#
+
+format MYSQL_TIME =
+MySQL @<<<<<<<<<<<<<<<<  uptime @<<<<<<<<<<<   @>>>>>>>>>>>>>>>>>>>>>>>>
+$vars{'version'}, sec_to_dhms($real_uptime), (($op{infile} || $relative_infiles) ? '' : scalar localtime)
+.
+
+format KEY_BUFF_MAX =
+
+__ Key _________________________________________________________________
+Buffer used   @>>>>>> of @>>>>>>  %Used: @>>>>>
+make_short($key_buffer_used, 1), make_short($vars{'key_buffer_size'}, 1), perc($key_buffer_used, $vars{'key_buffer_size'})
+.
+
+format KEY_BUFF_USAGE =
+  Current     @>>>>>>            %Usage: @>>>>>
+make_short($key_buffer_usage, 1), perc($key_buffer_usage, $vars{'key_buffer_size'})
+.
+
+format KEY_RATIOS =
+Write hit     @>>>>>%
+$key_write_ratio
+Read hit      @>>>>>%
+$key_read_ratio
+
+__ Questions ___________________________________________________________
+Total       @>>>>>>>>  @>>>>>/s
+make_short($questions), t($questions)
+.
+
+format DTQ =
+  @<<<<<<<  @>>>>>>>>  @>>>>>/s  @>>>>>> @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), $stat_label, perc($stat_val, $questions)
+.
+
+format SLOW_DMS =
+Slow @<<<<<<< @>>>>>>  @>>>>>/s          @>>>>>  %DMS: @>>>>>  Log: @>> 
+$slow_query_t, make_short($stats{'Slow_queries'}), t($stats{'Slow_queries'}), perc($stats{'Slow_queries'}, $questions), perc($stats{'Slow_queries'}, $dms), $vars{'log_slow_queries'}
+DMS         @>>>>>>>>  @>>>>>/s          @>>>>>
+make_short($dms), t($dms), perc($dms, $questions)
+.
+
+format DMS =
+  @<<<<<<<  @>>>>>>>>  @>>>>>/s          @>>>>>        @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), perc($stat_val, $questions), perc($stat_val, $dms)
+.
+
+format COM_1 =
+Com_        @>>>>>>>>  @>>>>>/s          @>>>>>
+make_short($stat_val), t($stat_val), perc($stat_val, $questions)
+.
+
+format COM_2 =
+  @<<<<<<<<<< @>>>>>>  @>>>>>/s          @>>>>>
+$stat_name, make_short($stat_val), t($stat_val), perc($stat_val, $questions)
+.
+
+format SAS =
+
+__ SELECT and Sort _____________________________________________________
+Scan          @>>>>>>   @>>>>/s %SELECT: @>>>>>
+make_short($stats{'Select_scan'}), t($stats{'Select_scan'}), perc($stats{'Select_scan'}, $stats{'Com_select'})
+Range         @>>>>>>   @>>>>/s          @>>>>>
+make_short($stats{'Select_range'}), t($stats{'Select_range'}), perc($stats{'Select_range'}, $stats{'Com_select'})
+Full join     @>>>>>>   @>>>>/s          @>>>>>
+make_short($stats{'Select_full_join'}), t($stats{'Select_full_join'}), perc($stats{'Select_full_join'}, $stats{'Com_select'})
+Range check   @>>>>>>   @>>>>/s          @>>>>>
+make_short($stats{'Select_range_check'}), t($stats{'Select_range_check'}), perc($stats{'Select_range_check'}, $stats{'Com_select'})
+Full rng join @>>>>>>   @>>>>/s          @>>>>>
+make_short($stats{'Select_full_range_join'}), t($stats{'Select_full_range_join'}), perc($stats{'Select_full_range_join'}, $stats{'Com_select'})
+Sort scan     @>>>>>>   @>>>>/s
+make_short($stats{'Sort_scan'}), t($stats{'Sort_scan'})
+Sort range    @>>>>>>   @>>>>/s
+make_short($stats{'Sort_range'}), t($stats{'Sort_range'})
+Sort mrg pass @>>>>>>   @>>>>/s
+make_short($stats{'Sort_merge_passes'}), t($stats{'Sort_merge_passes'})
+.
+
+format QCACHE =
+
+__ Query Cache _________________________________________________________
+Memory usage  @>>>>>> of @>>>>>>  %Used: @>>>>>
+make_short($qc_mem_used, 1), make_short($vars{'query_cache_size'}, 1), perc($qc_mem_used, $vars{'query_cache_size'})
+Block Fragmnt @>>>>>%
+perc($stats{'Qcache_free_blocks'}, $stats{'Qcache_total_blocks'})
+Hits          @>>>>>>   @>>>>/s
+make_short($stats{'Qcache_hits'}), t($stats{'Qcache_hits'})
+Inserts       @>>>>>>   @>>>>/s
+make_short($stats{'Qcache_inserts'}), t($stats{'Qcache_inserts'})
+Insrt:Prune @>>>>>>:1   @>>>>/s
+make_short($qc_ip_r), t($stats{'Qcache_inserts'} - $stats{'Qcache_lowmem_prunes'})
+Hit:Insert  @>>>>>>:1
+$qc_hi_r, t($qc_hi_r)
+.
+
+# Not really the end...
+format REPORT_END =
+
+__ Table Locks _________________________________________________________
+Waited      @>>>>>>>>  @>>>>>/s  %Total: @>>>>>
+make_short($stats{'Table_locks_waited'}), t($stats{'Table_locks_waited'}), perc($stats{'Table_locks_waited'}, $stats{'Table_locks_waited'} + $stats{'Table_locks_immediate'});
+Immediate   @>>>>>>>>  @>>>>>/s
+make_short($stats{'Table_locks_immediate'}), t($stats{'Table_locks_immediate'})
+
+__ Tables ______________________________________________________________
+Open        @>>>>>>>> of @>>>    %Cache: @>>>>>
+$stats{'Open_tables'}, $vars{'table_cache'}, perc($stats{'Open_tables'}, $vars{'table_cache'})
+Opened      @>>>>>>>>  @>>>>>/s
+make_short($stats{'Opened_tables'}), t($stats{'Opened_tables'})
+
+__ Connections _________________________________________________________
+Max used    @>>>>>>>> of @>>>      %Max: @>>>>>
+$stats{'Max_used_connections'}, $vars{'max_connections'}, perc($stats{'Max_used_connections'}, $vars{'max_connections'})
+Total       @>>>>>>>>  @>>>>>/s
+make_short($stats{'Connections'}), t($stats{'Connections'})
+
+__ Created Temp ________________________________________________________
+Disk table  @>>>>>>>>  @>>>>>/s
+make_short($stats{'Created_tmp_disk_tables'}), t($stats{'Created_tmp_disk_tables'})
+Table       @>>>>>>>>  @>>>>>/s    Size: @>>>>>
+make_short($stats{'Created_tmp_tables'}), t($stats{'Created_tmp_tables'}), make_short($vars{'tmp_table_size'}, 1, 1)
+File        @>>>>>>>>  @>>>>>/s
+make_short($stats{'Created_tmp_files'}), t($stats{'Created_tmp_files'})
+.
+
+format TAB =
+
+__ Threads _____________________________________________________________
+Running     @>>>>>>>> of @>>>
+$stats{'Threads_running'}, $stats{'Threads_connected'}
+Cached      @>>>>>>>> of @>>>      %Hit: @>>>>>
+$stats{'Threads_cached'}, $vars{'thread_cache_size'}, make_short(100 - perc($stats{'Threads_created'}, $stats{'Connections'}))
+Created     @>>>>>>>>  @>>>>>/s
+make_short($stats{'Threads_created'}), t($stats{'Threads_created'})
+Slow        @>>>>>>>>  @>>>>>/s
+$stats{'Slow_launch_threads'}, t($stats{'Slow_launch_threads'})
+
+__ Aborted _____________________________________________________________
+Clients     @>>>>>>>>  @>>>>>/s
+make_short($stats{'Aborted_clients'}), t($stats{'Aborted_clients'})
+Connects    @>>>>>>>>  @>>>>>/s
+make_short($stats{'Aborted_connects'}), t($stats{'Aborted_connects'})
+
+__ Bytes _______________________________________________________________
+Sent        @>>>>>>>>  @>>>>>/s
+make_short($stats{'Bytes_sent'}), t($stats{'Bytes_sent'})
+Received    @>>>>>>>>  @>>>>>/s
+make_short($stats{'Bytes_received'}), t($stats{'Bytes_received'})
+.
+
+format IB =
+
+__ InnoDB Buffer Pool __________________________________________________
+Usage         @>>>>>> of @>>>>>>  %Used: @>>>>>
+make_short($ib_bp_used, 1), make_short($ib_bp_total, 1), perc($ib_bp_used, $ib_bp_total)
+Read hit      @>>>>>%
+$ib_bp_read_ratio;
+Pages
+  Free      @>>>>>>>>            %Total: @>>>>>
+make_short($stats{'Innodb_buffer_pool_pages_free'}), perc($stats{'Innodb_buffer_pool_pages_free'}, $stats{'Innodb_buffer_pool_pages_total'})
+  Data      @>>>>>>>>                    @>>>>> %Drty: @>>>>>
+make_short($stats{'Innodb_buffer_pool_pages_data'}), perc($stats{'Innodb_buffer_pool_pages_data'}, $stats{'Innodb_buffer_pool_pages_total'}), perc($stats{'Innodb_buffer_pool_pages_dirty'}, $stats{'Innodb_buffer_pool_pages_data'})
+  Misc      @>>>>>>>>                    @>>>>>
+  $stats{'Innodb_buffer_pool_pages_misc'}, perc($stats{'Innodb_buffer_pool_pages_misc'}, $stats{'Innodb_buffer_pool_pages_total'})
+  Latched   @>>>>>>>>                    @>>>>>
+$stats{'Innodb_buffer_pool_pages_latched'}, perc($stats{'Innodb_buffer_pool_pages_latched'}, $stats{'Innodb_buffer_pool_pages_total'})
+Reads       @>>>>>>>>  @>>>>>/s  
+make_short($stats{'Innodb_buffer_pool_read_requests'}), t($stats{'Innodb_buffer_pool_read_requests'})
+  From file @>>>>>>>>  @>>>>>/s          @>>>>>
+make_short($stats{'Innodb_buffer_pool_reads'}), t($stats{'Innodb_buffer_pool_reads'}), perc($stats{'Innodb_buffer_pool_reads'}, $stats{'Innodb_buffer_pool_read_requests'})
+  Ahead Rnd @>>>>>>>>  @>>>>>/s
+$stats{'Innodb_buffer_pool_read_ahead_rnd'}, t($stats{'Innodb_buffer_pool_read_ahead_rnd'})
+  Ahead Sql @>>>>>>>>  @>>>>>/s
+$stats{'Innodb_buffer_pool_read_ahead_seq'}, t($stats{'Innodb_buffer_pool_read_ahead_seq'})
+Writes      @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_buffer_pool_write_requests'}), t($stats{'Innodb_buffer_pool_write_requests'})
+Flushes     @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_buffer_pool_pages_flushed'}), t($stats{'Innodb_buffer_pool_pages_flushed'})
+Wait Free   @>>>>>>>>  @>>>>>/s
+$stats{'Innodb_buffer_pool_wait_free'}, t($stats{'Innodb_buffer_pool_wait_free'})
+.
+
+format IB_LOCK =
+
+__ InnoDB Lock _________________________________________________________
+Waits       @>>>>>>>>  @>>>>>/s
+$stats{'Innodb_row_lock_waits'}, t($stats{'Innodb_row_lock_waits'})
+Current     @>>>>>>>>
+$stats{'Innodb_row_lock_current_waits'}
+Time acquiring
+  Total     @>>>>>>>> ms
+$stats{'Innodb_row_lock_time'}
+  Average   @>>>>>>>> ms
+$stats{'Innodb_row_lock_time_avg'}
+  Max       @>>>>>>>> ms
+$stats{'Innodb_row_lock_time_max'}
+.
+
+format IB_DPR =
+
+__ InnoDB Data, Pages, Rows ____________________________________________
+Data
+  Reads     @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_data_reads'}), t($stats{'Innodb_data_reads'})
+  Writes    @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_data_writes'}), t($stats{'Innodb_data_writes'})
+  fsync     @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_data_fsyncs'}), t($stats{'Innodb_data_fsyncs'})
+  Pending
+    Reads   @>>>>>>>>
+$stats{'Innodb_data_pending_reads'}, t($stats{'Innodb_data_pending_reads'})
+    Writes  @>>>>>>>>
+$stats{'Innodb_data_pending_writes'}, t($stats{'Innodb_data_pending_writes'})
+    fsync   @>>>>>>>>
+$stats{'Innodb_data_pending_fsyncs'}, t($stats{'Innodb_data_pending_fsyncs'})
+
+Pages
+  Created   @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_pages_created'}), t($stats{'Innodb_pages_created'})
+  Read      @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_pages_read'}), t($stats{'Innodb_pages_read'})
+  Written   @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_pages_written'}), t($stats{'Innodb_pages_written'})
+
+Rows
+  Deleted   @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_rows_deleted'}), t($stats{'Innodb_rows_deleted'})
+  Inserted  @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_rows_inserted'}), t($stats{'Innodb_rows_inserted'})
+  Read      @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_rows_read'}), t($stats{'Innodb_rows_read'})
+  Updated   @>>>>>>>>  @>>>>>/s
+make_short($stats{'Innodb_rows_updated'}), t($stats{'Innodb_rows_updated'})
+.
diff --git a/storage/xtradb/build/debian/additions/mysqlreport.1 b/storage/xtradb/build/debian/additions/mysqlreport.1
new file mode 100644
index 00000000000..5ae6b9e3b92
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqlreport.1
@@ -0,0 +1,180 @@
+.TH "mysqlreport" "1" "2.5 2006-09-01 (docrev 2006-05-19)" "Daniel Nichter" "MYSQL"
+.SH "NAME"
+.LP 
+mysqlreport \- Makes a friendly report of important MySQL status values
+.SH "SYNTAX"
+.LP 
+mysqlreport [\fIoptions\fP]
+.SH "DESCRIPTION"
+.LP 
+mysqlreport makes a friendly report of important MySQL status values. Actually,
+it makes a friendly report of nearly every status value from SHOW STATUS.
+Unlike SHOW STATUS which simply dumps over 100 values to screen in one long
+list, mysqlreport interprets and formats the values and presents the basic
+values and many more inferred values in a human\-readable format. Numerous
+example reports are available at the mysqlreport web page at
+http://hackmysql.com/mysqlreport.
+
+The benefit of mysqlreport is that it allows you to very quickly see a wide
+array of performance indicators for your MySQL server which would otherwise
+need to be calculated by hand from all the various SHOW STATUS values. For
+example, the Index Read Ratio is an important value but it's not present in
+SHOW STATUS; it's an inferred value (the ratio of Key_reads to
+Key_read_requests).
+
+This documentation outlines all the command line options in mysqlreport, most
+of which control which reports are printed. This document does not address
+how to interpret these reports; that topic is covered in the document Guide
+To Understanding mysqlreport at http://hackmysql.com/mysqlreportguide.
+
+.SH "OPTIONS"
+Technically, command line options are in the form \-\-option, but \-option works
+too. All options can be abbreviated if the abbreviation is unique. For example,
+option \-\-host can be abbreviated \-\-ho but not \-\-h because \-\-h is ambiguous: it
+could mean \-\-host or \-\-help.
+
+.LP 
+
+.TP 
+\fB\-\-help\fR
+Output help information and exit.
+
+.TP 
+\fB\-\-user USER\fR
+
+.TP 
+\fB\-\-password\fR
+As of version 2.3 \-\-password can take the password on the
+command line like "\-\-password FOO". Using \-\-password
+alone without giving a password on the command line
+causes mysqlreport to prompt for a password.
+
+.TP 
+\fB\-\-host ADDRESS\fR
+
+.TP 
+\fB\-\-port PORT\fR
+
+.TP
+\fB\-\-socket SOCKET\fR
+
+.TP 
+\fB\-\-no\-mycnf\fR
+\-\-no\-mycnf makes mysqlreport not read ~/.my.cnf which it does by default
+otherwise. \-\-user and \-\-password always override values from ~/.my.cnf.
+
+.TP 
+\fB\-\-dtq\fR
+Print Distribution of Total Queries (DTQ) report (under
+Total in Questions report). Queries (or Questions) can
+be divided into four main areas: DMS (see \-\-dms below),
+Com_ (see \-\-com below), COM_QUIT (see COM_QUIT and
+Questions at http://hackmysql.com/com_quit), and
+Unknown. \-\-dtq lists the number of queries in each of
+these areas in descending order.
+
+.TP 
+\fB\-\-dms\fR
+Print Data Manipulation Statements (DMS) report (under
+DMS in Questions report). DMS are those from the MySQL
+manual section 13.2. Data Manipulation Statements.
+(Currently, mysqlreport considers only SELECT, INSERT,
+REPLACE, UPDATE, and DELETE.) Each DMS is listed in
+descending order by count.
+
+.TP 
+\fB\-\-com N\fR
+Print top N number of non\-DMS Com_ status values in
+descending order (after DMS in Questions report). If N
+is not given, default is 3. Such non\-DMS Com_ values
+include Com_change_db, Com_show_tables, Com_rollback,
+etc.
+
+.TP 
+\fB\-\-sas\fR
+Print report for Select_ and Sort_ status values (after
+Questions report). See MySQL Select and Sort Status
+Variables at http://hackmysql.com/selectandsort.
+
+.TP
+\fB\-\-tab\fR
+Print Threads, Aborted, and Bytes status reports (after
+Created temp report). As of mysqlreport v2.3 the
+Threads report reports on all Threads_ status values.
+
+.TP
+\fB\-\-qcache\fR
+Print Query Cache report.
+.TP
+\fB\-\-all\fR
+Equivalent to "\-\-dtq \-\-dms \-\-com 3 \-\-sas \-\-qcache".
+(Notice \-\-tab is not invoked by \-\-all.)
+
+.TP
+\fB\-\-infile FILE\fR
+Instead of getting SHOW STATUS values from MySQL, read
+values from FILE. FILE is often a copy of the output of
+SHOW STATUS including formatting characters (|, +, \-).
+mysqlreport expects FILE to have the format
+" value number " where value is only alpha and
+underscore characters (A\-Z and _) and number is a
+positive integer. Anything before, between, or after
+value and number is ignored. mysqlreport also needs
+the following MySQL server variables: version,
+table_cache, max_connections, key_buffer_size,
+query_cache_size. These values can be specified in
+INFILE in the format "name = value" where name is one
+of the aforementioned server variables and value is a
+positive integer with or without a trailing M and
+possible periods (for version). For example, to specify
+an 18M key_buffer_size: key_buffer_size = 18M. Or, a
+256 table_cache: table_cache = 256. The M implies
+Megabytes not million, so 18M means 18,874,368 not
+18,000,000. If these server variables are not specified
+the following defaults are used (respectively) which
+may cause strange values to be reported: 0.0.0, 64,
+100, 8M, 0.
+
+.TP
+\fB\-\-outfile FILE\fR  
+After printing the report to screen, print the report
+to FILE too. Internally, mysqlreport always writes the
+report to a temp file first: /tmp/mysqlreport.PID on
+*nix, c:\emysqlreport.PID on Windows (PID is the
+script's process ID). Then it prints the temp file to
+screen. Then if \-\-outfile is specified, the temp file
+is copied to OUTFILE. After \-\-email (below), the temp
+file is deleted.
+
+.TP
+\fB\-\-email ADDRESS\fR
+After printing the report to screen, email the report
+to ADDRESS. This option requires sendmail in
+/usr/sbin/, therefore it does not work on Windows.
+/usr/sbin/sendmail can be a sym link to qmail, for
+example, or any MTA that emulates sendmail's \-t
+command line option and operation. The FROM: field is
+"mysqlreport", SUBJECT: is "MySQL status report".
+
+.TP
+\fB\-\-flush\-status\fR
+Execute a "FLUSH STATUS;" after generating the reports.
+If you do not have permissions in MySQL to do this an
+error from DBD::mysql::st will be printed after the
+reports.
+
+.SH "AUTHORS"
+.LP 
+Daniel Nichter
+
+If mysqlreport breaks, send me a message from 
+http://hackmysql.com/feedback 
+with the error.
+
+.SH "SEE ALSO"
+.LP 
+mytop(1)
+.LP
+The comprehensive Guide To Understanding mysqlreport at 
+http://hackmysql.com/mysqlreportguide.
+
diff --git a/storage/xtradb/build/debian/additions/mysqltest.1 b/storage/xtradb/build/debian/additions/mysqltest.1
new file mode 100644
index 00000000000..3469765fe3b
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/mysqltest.1
@@ -0,0 +1,16 @@
+.TH mysqltest 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+mysqltest \- Regression test program for MySQL.
+.SH SYNOPSIS
+mysqltest [options]
+.SH DESCRIPTION
+Runs a test against the mysql server and compares output with a results file.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/pack_isam.1 b/storage/xtradb/build/debian/additions/pack_isam.1
new file mode 100644
index 00000000000..cad153eedee
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/pack_isam.1
@@ -0,0 +1,19 @@
+.TH mysql 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+myisampack \- Compresses MySQL database files.
+.SH SYNOPSIS
+myisampack [options]
+.SH DESCRIPTION
+Pack an ISAM table so that it takes much less space.
+Keys are not updated, so you must run isamchk -rq on any table
+that has keys after you have compressed it.
+You should give the .ISM file as the filename argument.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/resolve_stack_dump.1 b/storage/xtradb/build/debian/additions/resolve_stack_dump.1
new file mode 100644
index 00000000000..2a1e2770275
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/resolve_stack_dump.1
@@ -0,0 +1,16 @@
+.TH resolve_stack_dump 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+resolve_stack_dump \- MySQL helper program for reporting bugs.
+.SH SYNOPSIS
+resolve_stack_dump [options]
+.SH DESCRIPTION
+Resolve numeric stack trace dump into symbols.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/additions/resolveip.1 b/storage/xtradb/build/debian/additions/resolveip.1
new file mode 100644
index 00000000000..7aa9439394d
--- /dev/null
+++ b/storage/xtradb/build/debian/additions/resolveip.1
@@ -0,0 +1,16 @@
+.TH resolveip 1 "17 March 2003" "MySQL 3.23" "MySQL database"
+.SH NAME
+resolveip \- MySQL helper program to retrieve IP addresses.
+.SH SYNOPSIS
+resolveip [options]
+.SH DESCRIPTION
+Get hostname based on IP-address or IP-address based on hostname.
+
+For more information start the program with '--help'.
+.SH "SEE ALSO"
+mysql (1), mysqld (1)
+.SH AUTHOR
+This manpage was written by Christian Hammers <ch@debian.org>.
+
+MySQL is available at http://www.mysql.com/.
+.\" end of man page
diff --git a/storage/xtradb/build/debian/changelog b/storage/xtradb/build/debian/changelog
new file mode 100644
index 00000000000..6dc9b561634
--- /dev/null
+++ b/storage/xtradb/build/debian/changelog
@@ -0,0 +1,4186 @@
+percona-xtradb-dfsg-5.1 (5.1.36-1) experimental; urgency=low
+
+  [TODO]
+  * Link libmysqlclient.so to libmysqlclient_r.so to help applications
+    like Apache where some modules, like libaprutil, want to use the thread
+    safe library and some, like PHP, do not. As the client library just copies
+    data between client and server, we do not expect significant performance
+    losses. (thanks to Stefan Fritsch). Closes: #450535
+    
+    Add the following to libmysqlclient16.links: 
+    usr/lib/libmysqlclient_r.so.16.0.0 usr/lib/libmysqlclient.so.16.0.0
+
+  * Ex-maintainer upload :)
+  * New upstream release.
+  * SECURITY: Upstream fix for "mysql client does not escape strings in 
+    --html mode." (CVE-2008-4456) Closes: #526254
+  * Upstream fixes REPEAT() function. Closes: #447028
+  * Upstream fixes problems when mixing ORDER and GROUP BY. Closes: #470854
+  * There were many innodb fixes in the last two years, probably
+    also for this unreproducible crash. Closes: #447713
+  * Removed amd64 specific -fPIC compiler option that was introduced
+    especially for building the NDB cluster module which is no longer
+    part of this package (thanks to Modestas Vainius). Closes: #508406
+  * Put /etc/mysql/conf.d to mysql-server-5.1.dirs (thanks to Alexander 
+    Gerasiov). Closes: #515145
+  * Fixed mysql-test suite by adding 50_mysql-test__db_test.dpatch.
+    It now passes 100% of the tests again. Also Closes: #533999
+  * Preinst now prevents Installation if NDB configuration is detected.
+  * Applied Ubuntu patch that fixes privilege bootstrapping in postinst
+    (thanks to Mathias Gug). Closes: #535492
+  * Applied Ubuntu patch that sets the debconf prio for the root password
+    question to high and prevents it from being asked on 5.0 -> 5.1 upgrades
+    (thanks to Mathias Gug). Closes: #535500
+  * Removed the check for ISAM tables as the only supported upgrade path is
+    from lenny's MySQL-5.0.
+  * Added /etc/mysql/conf.d/mysqld_safe_syslog.cnf which enables mysqld_safe
+    to pipe all mysqld output into the syslog. The reason for not letting dpkg
+    handle it via a normal config file change was that my.cnf is usually
+    heavily tuned by the admin so the setting would go lost too easily.
+  * Updated mysqlreport to version 3.5 (including two minor patches by me).
+
+ -- Christian Hammers <ch@debian.org>  Wed, 01 Jul 2009 20:54:58 +0200
+
+mysql-dfsg-5.1 (5.1.34-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Mon, 20 Apr 2009 20:23:10 +0200
+
+mysql-dfsg-5.1 (5.1.33-2) experimental; urgency=low
+
+  * Remove no longer active developers from uploaders field.
+  * Drop workaround for upgrades from MySQL 3.23, not necessary any more.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Tue, 07 Apr 2009 11:23:25 +0200
+
+mysql-dfsg-5.1 (5.1.33-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 02 Apr 2009 21:12:23 +0200
+
+mysql-dfsg-5.1 (5.1.32-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Fri, 06 Mar 2009 18:48:23 +0100
+
+mysql-dfsg-5.1 (5.1.31-2) experimental; urgency=low
+
+  * Update SSL certificates, and re-enable SSL related tests when running
+    the testsuite.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Tue, 10 Feb 2009 16:08:42 +0100
+
+mysql-dfsg-5.1 (5.1.31-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 08 Feb 2009 17:07:11 +0100
+
+mysql-dfsg-5.1 (5.1.30-2) experimental; urgency=low
+
+  * Drop MySQL Cluster support, it's deprecated since 5.1.24-RC.
+  * Fix FTBFS if built twice in a row. (closes: #487091)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Fri, 05 Dec 2008 21:04:55 +0100
+
+mysql-dfsg-5.1 (5.1.30-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 27 Nov 2008 09:09:55 +0100
+
+mysql-dfsg-5.1 (5.1.29rc-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Mon, 27 Oct 2008 20:00:43 +0100
+
+mysql-dfsg-5.1 (5.1.26rc-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Mon, 14 Jul 2008 21:46:59 +0200
+
+mysql-dfsg-5.1 (5.1.25rc-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sat, 21 Jun 2008 13:55:02 +0200
+
+mysql-dfsg-5.1 (5.1.24rc-1) experimental; urgency=low
+
+  * New upstream release.
+  * Ignore errors in testsuite on ia64 and s390.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 16 Apr 2008 22:03:44 +0200
+
+mysql-dfsg-5.1 (5.1.23rc-1) experimental; urgency=low
+
+  * New upstream release.
+
+  [ Christian Hammers ]
+  * Add PIC support for NDB libraries on amd64 (thanks to Monty Taylor).
+  * Add extra information when aborting due to a detected downgrade (thanks to
+    Raphael Pinson).
+  * Move libndbclient.so.3 to its own package as it now has a version != 0
+    (thanks to Raphael Pinson for reminding me).
+
+  [ Monty Taylor ]
+  * Remove 85_ndb__staticlib.dpatch since we have a libndbclient package now.
+  * Add myself to the uploaders so that I don't get complaints about package
+    signing.
+  * Add libndbclient-dev package to go with libndbclient3.
+
+  [ Norbert Tretkowski ]
+  * Update patches:
+    + 41_scripts__mysql_install_db.sh__no_test.dpatch
+  * Drop patches:
+    + 70_upstream_debian__configure.dpatch
+    + 71_upstream_debian__Makefile.in.dpatch
+    + 99_TEMP_minmax.dpatch
+  * Remove Adam Conrad from uploaders on his request. Thanks for your work in
+    the past!
+  * Ignore errors in testsuite on amd64 and i386.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Fri, 29 Feb 2008 10:38:27 +0100
+
+mysql-dfsg-5.1 (5.1.22rc-1) experimental; urgency=low
+
+  * New upstream version.
+  * Let mysql-server-5.1 pre-depend on debconf as it uses it in the preinst.
+  * Fixed mysql-client-5.1 menu entry for upcoming menu policy 1.4.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 02 Oct 2007 22:45:37 +0200
+
+mysql-dfsg-5.1 (5.1.21beta-1) experimental; urgency=low
+
+  * My "Greetings from FrOSCon!" release.
+  * New upstream version.
+  * libmysqlclient.so.15 has been superseded by libmysqlclient.so.16.
+  * Renamed libmysqlclient15-dev to libmysqlclient-dev but added an empty
+    package libmysqlclient15-dev to ease the transition for packages with
+    a versioned build-dep to libmysqlclient15-dev which is something that
+    currently does not work with "Provides:".
+  * Synced with 5.0 branch up to subversion release r909.
+  * Commented out most of the compile conditionals in the hope that
+    all architectures can be built the same way.
+  * Added a lot of new binaries and manpages.
+  * Switched to plugin based engines.
+
+ -- Christian Hammers <ch@debian.org>  Sat, 25 Aug 2007 14:24:40 +0200
+
+mysql-dfsg-5.1 (5.1.19beta-1) experimental; urgency=low
+
+  * New upstream release.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 11 Jun 2007 23:18:35 +0200
+
+mysql-dfsg-5.1 (5.1.16beta-4) experimental; urgency=high
+
+  * Merged with 5.0 r850:
+    * SECURITY:
+      In some previous versions mysql_install_db was not idempotent and did
+      always create passwordless root accounts although it should only on
+      initial installs (thanks to Olaf van der Spek). Closes: #418672
+    * Added check for passwordless root accounts to debian-start.
+    * As MySQL-5.0 is, at least currently, incompatible with Kernel 2.4 the
+      installation is aborted for such old kernels. Debian Etch does not
+      support them anyway according to the release notes but this might be 
+      unexpected and many production servers still have self build ones 
+      installed (thanks to Marc-Christian Petersen). See: #416841
+    * Adjusted TeX build-deps to texlive.
+    * Added innotop. 
+    * Changed maintainer email address to
+      pkg-mysql-commits@lists.alioth.debian.org 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 19 Apr 2007 19:29:29 +0200
+
+mysql-dfsg-5.1 (5.1.16beta-3) experimental; urgency=low
+
+  * Merged with 5.0 r837:
+    * Activated the blackhole engine as it's needed for replicating partition
+      designs (thanks to Cyril SCETBON). 
+    * Fixed segfault on i486 systems without cpuid instruction (thanks to
+      Lennart Sorensen). Closes: #410474
+    * Only use the non-essential debconf package in postrm if it is
+      still installed (thanks to Michael Ablassmeier). Closes: #416838
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Mar 2007 21:48:11 +0100
+
+mysql-dfsg-5.1 (5.1.16beta-2) experimental; urgency=low
+
+  * Merged with 5.0 r818:
+    * Fixed FTBFS on Sparc introduced with the "make -j" trick in
+      5.0.32-8 (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Mar 2007 21:20:11 +0100
+
+mysql-dfsg-5.1 (5.1.16beta-1) experimental; urgency=low
+
+  * New upstream release. 
+    * SECURITY: Using an INFORMATION_SCHEMA table with ORDER BY in a subquery
+      could cause a server crash (CVE-2007-1420).
+    * Added temporary patch 90_TEMP_sqlparse-ifdef to avoid build problems.
+  * Merged with 5.0 r809:
+    * Updated mysqlreport to latest upstream (and patched --help usage
+      message and "return if qcache_size==0").
+  * Merged with 5.0 r798:
+    * Adapt MAKE_J to use the -j option with the number of available
+      processors. (thanks to Raphael Pinson).
+  * Merged with 5.0 r758:
+    * Changed minimum required version in dh_makeshlibs to 5.0.27-1 as
+      5.0.26 had an ABI breakage in it!
+      This is the cause for Perl programs crashing with the following error: 
+      Transactions not supported by database at /usr/lib/perl5/DBI.pm line 672
+    * Added some more comments to the default my.cnf.
+    * Added support for /etc/mysql/conf.d/.
+    * The debian-start script that runs on every server start now first upgrades
+      the system tables (if necessary) and then checks them as it sometimes did
+      not work the other way around (e.g. for MediaWiki). The script now uses 
+      mysql_update instead of mysql_update_script as recommended. See: 409780
+
+ -- Christian Hammers <ch@debian.org>  Fri,  2 Mar 2007 01:00:55 +0100
+
+mysql-dfsg-5.1 (5.1.15beta-1) experimental; urgency=low
+
+  * New upstream release.
+  [Monty Taylor]
+  * Removed patches/25_mysys__default.c - fixed upstream.
+  * Removed patches/26_client__mysql_upgrade.c - fixed upstream.
+  * Removed patches/29_scripts__mysqlbug.sh - fixed upstream.
+  * Removed patches/39_scripts__mysqld_safe.sh__port_dir - fixed upstream.
+  * Removed patches/42_scripts__mysqldumpslow__slowdir - fixed upstream.
+  * Removed patches/45_warn-CLI-passwords - fixed upstream.
+  * Removed patches/89_ndb__records.dpatch - fixed upstream.
+  * Removed patches/86_ndbapi_tc_selection.dpatch - fixed upstream.
+  [Christian Hammers]
+  * Synced with 5.0.32-4.
+    * mysql-server-5.0 pre-depends on adduser now and has --disabled-login
+      explicitly added to be on the safe side (thanks to the piuparts team).
+      Closes: #408362
+    * Corrected the terminology regarding NDB in the comments of all config
+      files and init scripts (thanks to Geert Vanderkelen of MySQL).
+
+ -- Christian Hammers <ch@debian.org>  Wed,  7 Feb 2007 11:34:52 -0200
+
+mysql-dfsg-5.1 (5.1.14beta-2) experimental; urgency=low
+
+  [Christian Hammers]
+  * Readded 85_ndb__staticlib.dpatch with slight modifications. 
+  * Backported debian-start scripts from 5.0.
+  [Monty Taylor]
+  * Now build-depends on bison.
+  * Updated to standards 3.7.2.
+  * Removed references to comp_err.
+  * build-depend on automake1.9 to match upstream 
+  * Merged runlevel changes from 5.0.
+  * Added 26_client__mysql_upgrade.c.dpatch to fix a segfault in mysql_upgrade
+    when using a password. It's been fixed upstream in 5.1.15. 
+  * Moved BDB check to sanity_checks() and added a note about deprecation.
+  * Use my_print_defaults instead of mysqld --print-defaults
+  * Changed NDB Data and Management node startup sequence. Prevented both
+    from restarting on upgrade to address rolling upgrade issues.
+  * Added a "start-initial" option to the Data Node init script to support
+    initial node starts.
+  * Added 86_ndbapi_tc_selection.dpatch to fix a bug that causes a segfault
+    when using the NdbApi. http://bugs.mysql.com/bug.php?id=24914
+    Fixed in 5.1.15
+  * Added 89_ndb__records.dpatch to fix
+    http://bugs.mysql.com/bug.php?id=25567, which causes a table scan per
+    table per query.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 31 Jan 2007 01:17:35 +0100
+
+mysql-dfsg-5.1 (5.1.14beta-1) experimental; urgency=low
+
+  * New upstream.
+  * Removed references to mysql_explain_log
+  * Changed context for patch to mysqld_multi.1
+  * Removed 70_kfreebsd.dpatch - applied to upstream
+  * Removed 87_ps_Hurd - applied to upstream
+  * Replaced --without-readline to --with-libedit to configure options, as
+    --without-readline doesn't seem to do the right thing anymore.
+
+ -- Monty Taylor <mordred@inaugust.com>  Wed, 10 Jan 2007 12:59:55 -0800
+
+mysql-dfsg-5.1 (5.1.11beta-1) experimental; urgency=low
+
+  * Starting new 5.1 branch!
+  * FIXME: Following patch couldn't be applied:
+      ## 85_ndb__staticlib.dpatch by  <ch@debian.org>
+  * FIXME: Following patch couldn't be applied:
+      ## 86_PATH_MAX.dpatch
+
+ -- Christian Hammers <ch@debian.org>  Sat, 29 Jul 2006 11:35:42 +0200
+
+mysql-dfsg-5.0 (5.0.51a-19) UNRELEASED; urgency=low
+
+  * New patch 50_fix_mysqldump2.dpatch from 5.0.60 to fix dumping databases
+    from mysql 4.0 server. (closes: #507789)
+  * Don't create a guest account during bootstrap. (closes: #463704)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 04 Dec 2008 23:07:19 +0100
+
+mysql-dfsg-5.0 (5.0.51a-18) testing-proposed-updates; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2008-4098: Inadequate validation of paths used in DATA
+    DIRECTORY and INDEX DIRECTORY clauses of CREATE TABLE statements enabled
+    attackers to write to tables in other databases to which they could not
+    ordinarily have access.
+
+ -- Devin Carraway <devin@debian.org>  Tue, 25 Nov 2008 05:38:45 +0000
+
+mysql-dfsg-5.0 (5.0.51a-17) testing-proposed-updates; urgency=low
+
+  * Don't use commented out passwords from debian.cnf. (closes: #453820)
+  * Update watch file to recognize releases > 5.0.45.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 02 Nov 2008 13:31:32 +0100
+
+mysql-dfsg-5.0 (5.0.51a-16) unstable; urgency=low
+
+  * New patch 60_rpl_test_failure.dpatch from 5.0.54 to fix a race condition
+    with the rpl_packet test in some cases. (closes: #501413)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 09 Oct 2008 08:50:43 +0200
+
+mysql-dfsg-5.0 (5.0.51a-15) unstable; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2008-3963: An empty bit-string literal (b'') caused a server
+    crash. Now the value is parsed as an empty bit value (which is treated as
+    an empty string in string context or 0 in numeric context).
+    (closes: #498362)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 14 Sep 2008 18:27:46 +0200
+
+mysql-dfsg-5.0 (5.0.51a-14) unstable; urgency=low
+
+  * Update debconf translations:
+    - Swedish, from Martin Bagge. (closes: #491688)
+    - Netherlands, from Thijs Kinkhorst. (closes: #492723)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 07 Sep 2008 20:18:31 +0200
+
+mysql-dfsg-5.0 (5.0.51a-13) unstable; urgency=medium
+
+  * New patch 59_fix_relay_logs_corruption.dpatch from 5.0.56 to fix
+    corruption in relay logs. (closes: #463515)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 03 Sep 2008 09:13:46 +0200
+
+mysql-dfsg-5.0 (5.0.51a-12) unstable; urgency=low
+
+  * Disable rpl_ndb_innodb_trans test when running the testsuite, fails
+    randomly on i386. (closes: #494238)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sat, 09 Aug 2008 15:56:45 +0200
+
+mysql-dfsg-5.0 (5.0.51a-11) unstable; urgency=low
+
+  * Disable innodb_handler test when running the testsuite, fails randomly
+    on s390. (closes: #491363)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 23 Jul 2008 08:34:51 +0200
+
+mysql-dfsg-5.0 (5.0.51a-10) unstable; urgency=high
+
+  * Merge testing-security upload to finally fix CVE-2008-2079, thanks to
+    Devin Carraway and Steffen Joeris. (closes: #480292)
+  * New patch 58_disable-ndb-backup-print.dpatch from 5.0.54 to disable
+    ndb_backup_print, ndb_alter_table and ndb_replace tests when running the
+    testsuite. (closes: #474893)
+  * Reenable error handling in testsuite on i386, disabling it was just a
+    workaround for the problem which is now fixed with the above patch.
+  * Update debconf translations:
+    - Vietnamese, from Clytie Siddall. (closes: #486443)
+    - Spanish, from Javier Fernández-Sanguino Peña. (closes: #488740)
+    - Slovak, from helix84. (closes: #489266)
+  * Make lintian happy:
+    - Fix build-dependency on -1 revision.
+    - Fix deprecated chown usage.
+    - Fix spelling error in description.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Tue, 15 Jul 2008 19:37:35 +0200
+
+mysql-dfsg-5.0 (5.0.51a-9+lenny2) testing-security; urgency=high
+
+  * Non-maintainer upload by the security team.
+  * Correct error number in symlink.test to avoid FTBFS on some archs.
+
+ -- Steffen Joeris <white@debian.org>  Sun, 13 Jul 2008 11:44:57 +0000
+
+mysql-dfsg-5.0 (5.0.51a-9+lenny1) testing-security; urgency=high
+
+  * Non-maintainer upload by the security team.
+  * Correct and expand 92_SECURITY_CVE-2008-2079.dpatch to cover all symlinks
+    and check the output of fn_format(). (closes: #480292)
+    Fixes: CVE-2008-2079
+
+ -- Steffen Joeris <white@debian.org>  Sat, 12 Jul 2008 05:30:39 +0000
+
+mysql-dfsg-5.0 (5.0.51a-9) unstable; urgency=low
+
+  * Ignore errors in testsuite on i386. (workaround for #474893)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 25 Jun 2008 15:07:03 +0200
+
+mysql-dfsg-5.0 (5.0.51a-8) unstable; urgency=low
+
+  * New patch 80_fix_user_setup_on_localhost.dpatch from Daniel Hahler to fix
+    a duplicate key error when install MySQL server on a host with hostname
+    localhost. (closes: #478319)
+  * Really fix build on non-linux systems, this time without producing a build
+    error on some architectures. (closes: #485971)
+  * Update debconf translations:
+    - French, from Christian Perrier. (closes: #478553)
+    - German, from Alwin Meschede. (closes: #478672)
+    - Italian, from Luca Monducci. (closes: #479363)
+    - Czech, from Miroslav Kure. (closes: #480924)
+    - Galician, from Jacobo Tarrio. (closes: #480965)
+    - Basque, from Piarres Beobide. (closes: #481840)
+    - Swedish, from Martin Bagge. (closes: #482466, #486307)
+    - Turkish, from Mert Dirik. (closes: #484704)
+    - Russian, from Yuri Kozlov. (closes: #486149)
+    - Finnish, from Esko Arajärvi. (closes: #486554)
+    - Portuguese, from Miguel Figueiredo. (closes: #486709)
+    - Romanian, from Eddy Petrișor. (closes: #486944)
+    - Japanese, from Hideki Yamane. (closes: #487270)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sat, 21 Jun 2008 19:20:48 +0200
+
+mysql-dfsg-5.0 (5.0.51a-7) unstable; urgency=high
+
+  [ Norbert Tretkowski ]
+  * SECURITY:
+    Fix for CVE-2008-2079: It was possible to circumvent privileges through
+    the creation of MyISAM tables employing the DATA DIRECTORY and INDEX
+    DIRECTORY options to overwrite existing table files in the MySQL data
+    directory. Use of the MySQL data directory in DATA DIRECTORY and INDEX
+    DIRECTORY is now disallowed. Patch from openSUSE 11.0, thanks to Michal
+    Marek. (closes: #480292)
+  * Fix build on non-linux systems, like hurd-i386. (closes: #480362)
+  * Include symlinks for mysqlcheck. (closes: #480647)
+
+  [ Monty Taylor ]
+  * Remove ndb_cpcd, as it is only for the NDB test suite and not useful as a
+    public program.
+  * Fix debian-start.inc.sh for table names with characters needing quotes.
+    Thanks Felix Rublack! (closes: #480525, #481154, #481303, #484012) 
+  * Delete mysql-common.README.Debian. Nothing in it was relevant, and the
+    useful information is in mysql-server anyway. (closes: #480940)
+  * Remove a spurious HOME= in logrotate script.  
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 05 Jun 2008 11:49:45 +0200
+
+mysql-dfsg-5.0 (5.0.51a-6) unstable; urgency=low
+
+  * Fix debian-start.inc.sh to not print the row counts of the tables
+    queried. (closes: #478256, #479697)
+
+ -- Monty Taylor <mordred@inaugust.com>  Wed, 14 May 2008 00:47:46 -0700
+
+mysql-dfsg-5.0 (5.0.51a-5) unstable; urgency=medium
+
+  * New patch 57_fix_mysql_replication.dpatch from 5.0.54 to fix directory for
+    relay logs when using replication.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 27 Apr 2008 13:55:04 +0200
+
+mysql-dfsg-5.0 (5.0.51a-4) unstable; urgency=low
+
+  [ Monty Taylor ]
+  * Remove build of ndb docs, since they are not installed. Removed build deps
+    on TeX and doxygen since that's all they were there for.
+  * Replace script in check_for_crashed_tables with a myisam-recover option
+    and a script to trigger a check of those tables. (thanks HarrisonF and
+    kolbe)
+  * Replace direct calls to test suite with calls to the make targets used by
+    the MySQL build and qa teams for releases.
+  * Add --skip-ndbcluster to the postinst bootstrap command. It's really a
+    workaround for a bug in 5.1, but it's probably a good idea anyway since we
+    certainly don't need cluster to spin up, and if people have enabled
+    cluster in their my.cnf file, there could be postinst issues if cluster
+    isn't running.
+  * Remove reference to configure options that no longer exist.
+  * Add myself to uploaders.
+
+  [ Norbert Tretkowski ]
+  * New patch 56_fix_order_by.dpatch from Ubuntu to fix ORDER BY not working
+    with GROUP BY. (closes: #471737)
+  * Add note about filename extensions in the /etc/mysql/conf.d/ directory in
+    my.cnf. (closes: #461759)
+  * Confirm password on install, patch from Nicolas Valcárcel.
+    (closes: #471887)
+  * Remove Adam Conrad from uploaders on his request. Thanks for your work in
+    the past!
+  * Use lsb_release to detect distribution.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sat, 05 Apr 2008 21:51:43 +0200
+
+mysql-dfsg-5.0 (5.0.51a-3) unstable; urgency=low
+
+  * Disable patch 60_raise-max-keylength.dpatch in default build, but still
+    ship it in the source package.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 17 Feb 2008 18:54:42 +0100
+
+mysql-dfsg-5.0 (5.0.51a-2) unstable; urgency=low
+
+  * Replace 54_ssl-client-support.dpatch added in 5.0.51-2 with patch from
+    upstream.
+  * Ignore errors in testsuite on powerpc.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 17 Feb 2008 12:42:58 +0100
+
+mysql-dfsg-5.0 (5.0.51a-1) unstable; urgency=low
+
+  [ Norbert Tretkowski ]
+  * New upstream security hotfix release. Low priority upload anyway because
+    5.0.51-3 already contained all security fixes.
+  * Remove patches:
+    + debian/patches/51_mysqlcheck-result.dpatch
+    + debian/patches/92_SECURITY_CVE-2007-6303.dpatch
+    + debian/patches/93_SECURITY_CVE-2007-6304.dpatch
+    + debian/patches/94_SECURITY_CVE-2008-0226+0227.dpatch
+  * Add recommendation on libhtml-template-perl to -server package, used by
+    ndb_size. (closes: #462265)
+  * New patch 60_raise-max-keylength.dpatch to raise the maximum key length to
+    4005 bytes or 1335 UTF-8 characters. (closes: #463137)
+  * New patch 51_sort-order.dpatch from 5.0.52 to fix incorrect order when
+    using range conditions on 2 tables or more.
+  * Support DEB_BUILD_OPTIONS option 'nocheck' to skip tests.
+  * Update mysqlreport to 3.4a release.
+
+  [ Luk Claes ]
+  * Updated Japanese debconf translation. (closes: #462158)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 06 Feb 2008 11:57:45 +0100
+
+mysql-dfsg-5.0 (5.0.51-3) unstable; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2008-0226 and CVE-2008-0227: Three vulnerabilities in yaSSL
+    versions 1.7.5 and earlier were discovered that could lead to a server
+    crash or execution of unauthorized code. The exploit requires a server
+    with yaSSL enabled and TCP/IP connections enabled, but does not require
+    valid MySQL account credentials. The exploit does not apply to OpenSSL.
+    (closes: #460873)
+  * Fix LSB header in init scripts (patch from Petter Reinholdtsen).
+    (closes: #458798)
+  * Run testsuite on all archs, but ignore errors on alpha, arm, armel, hppa,
+    mipsel and sparc. (closes: #460402)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 23 Jan 2008 11:37:11 +0100
+
+mysql-dfsg-5.0 (5.0.51-2) unstable; urgency=low
+
+  [ Monty Taylor ]
+  * Added --with-system-type to set the version_compile_os field.
+  * Cleaned up some lintian warnings.
+  * Removed 43_scripts__mysql_update__password.dpatch since we don't use
+    mysql_upgrade_shell anymore and use mysql_upgrade instead.
+  * Removed 88_mctype_attrib.dpatch, http://bugs.mysql.com/bug.php?id=25118 is
+    closed with http://lists.mysql.com/commits/24337
+  * Added mysql-community/mysql-enterprise virtual packages in provides and
+    conflicts to ease transitions between versions.
+
+  [ Norbert Tretkowski ]
+  * Add -fPIC to CFLAGS to allow other packages to be built against
+    libmysqld.a on amd64. (closes: #457915)
+  * New patch 55_testsuite-2008.dpatch to fix FTBFS in testsuite.
+    (closes: #458695)
+  * New patch 54_ssl-client-support.dpatch to fix SSL client support.
+  * Don't run testsuite on alpha, arm, hppa, mipsel and sparc.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 02 Jan 2008 18:40:04 +0100
+
+mysql-dfsg-5.0 (5.0.51-1) unstable; urgency=low
+
+  * New upstream release.
+    + Fix a crash in mysql_client_test due to gcc 4.x optimizations.
+      (closes: #452558)
+  * Update patches:
+    + debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
+    + debian/patches/89_ndb__staticlib.dpatch
+  * Run testsuite after build.
+  * Re-add manpages, they are licensed under GPL now and redistribution is
+    permitted.
+  * Drop linux-libc-dev build-dependency, it's now being pulled by libc-dev
+    which is build-essential. (closes: #431018)
+  * Remove old optimizations for MySQL 3.23.x, they are no longer required.
+    (closes: #436552)
+  * Don't fail when upgrading mysql-common if $datadir is empty or not defined
+    (patch from Edward Allcutt). (closes: #453127)
+  * New patch from 5.0.52 to fix mysqldump because 'null' is shown as type of
+    fields for view with bad definer. (closes: #454227)
+  * New patch from 5.0.52 to fix mysqlcheck test result.
+  * New patch from 5.0.52 to fix wrong optimization in ndb code when building
+    with gcc 4.2.x.
+  * New patch from 5.0.54 to fix wrong number output due to integer overflow
+    when building with gcc 4.2.x.
+  * New Finnish debconf translation from Esko Arajärvi. (closes: #448776)
+  * Update Basque debconf translation from Aitor Ibañez. (closes: #456193)
+  * Add Vcs-* and Homepage fields to source stanza in control file.
+  * Update mysqlreport to 3.2 release.
+  * Let mysql-server-5.0 pre-depend on debconf, because its preinst is using
+    it.
+  * Drop menu item for innotop.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Fri, 14 Dec 2007 09:59:36 +0100
+
+mysql-dfsg-5.0 (5.0.45-5) unstable; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2007-6303: ALTER VIEW retained the original DEFINER value,
+    even when altered by another user, which could allow that user to gain the
+    access rights of the view. Now ALTER VIEW is allowed only to the original
+    definer or users with the SUPER privilege. (closes: #455737)
+  * SECURITY:
+    Fix for CVE-2007-6304: When using a FEDERATED table, the local server can
+    be forced to crash if the remote server returns a result with fewer columns
+    than expected.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 12 Dec 2007 20:23:43 +0100
+
+mysql-dfsg-5.0 (5.0.45-4) unstable; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2007-5969: Using RENAME TABLE against a table with explicit
+    DATA DIRECTORY and INDEX DIRECTORY options can be used to overwrite system
+    table information by replacing the file to which the symlink points.
+    (closes: #455010)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sun, 09 Dec 2007 12:29:54 +0100
+
+mysql-dfsg-5.0 (5.0.45-3) unstable; urgency=high
+
+  * SECURITY:
+    Fix for CVE-2007-5925: The convert_search_mode_to_innobase function in
+    ha_innodb.cc in the InnoDB engine in MySQL 5.1.23-BK and earlier allows
+    remote authenticated users to cause a denial of service (database crash)
+    via a certain CONTAINS operation on an indexed column, which triggers an
+    assertion error. (closes: #451235)
+
+ -- Norbert Tretkowski <nobse@debian.org>  Thu, 15 Nov 2007 18:40:11 +0100
+
+mysql-dfsg-5.0 (5.0.45-2) unstable; urgency=low
+
+  * Package is now team-maintained. (closes: #421026)
+
+  [ Sean Finney ]
+  * New/updated debconf translations:
+    - Spanish, from Javier Fernández-Sanguino Peña (closes: #426442).
+    - German, from Alwin Meschede (closes: #426545).
+    - Danish, from Claus Hindsgaul (closes: #426783).
+    - French, from Christian Perrier (closes: #430944).
+  * Add Recommends on libterm-readkey-perl for mysql-client-5.0 package, used
+    by mysqlreport add-on to mask password entry (closes: #438375).
+
+  [ Norbert Tretkowski ]
+  * Add myself to uploaders.
+  * Suggest usage of an update statement on the user table to change the mysql
+    root user password instead of using mysqladmin, to catch all root users from
+    all hosts. (closes: #435744)
+  * Remove information about a crash in the server during flush-logs when
+    having expire_logs_days enabled but log-bin not, this bug was fixed in
+    5.0.32 already. (closes: #368547)
+  * Disable log_bin option in default config file and add a note to the NEWS
+    file. (closes: #349661)
+  * Fix FTBFS if built twice in a row. (closes: #442684)
+  * Remove check for buggy options from init script.
+  * Update innotop to 1.6.0 release.
+  * Add mysqlreport and innotop to mysql-client description.
+  * Use shorter server version string.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Wed, 14 Nov 2007 20:00:06 +0100
+
+mysql-dfsg-5.0 (5.0.45-1) unstable; urgency=low
+
+  * New upstream release.
+
+  [sean finney]
+  * removed patches that are incorporated into the latest release:
+    - 70_cpuid_on_i486.dpatch
+    - 91_SECURITY_CVE-2007-2691_alter-drop
+  * new patch 90_upstreamdebiandir.dpatch to keep a few lingering references
+    to the upstream ./debian dir out of the build, at least until we find
+    a nice way to collaborate on sharing the directory.
+  * updated CRUFT list to fix double-build breakage (closes: #424590).
+  * add conditional build-deps for linux-libc-dev to fix FTBFS for
+    non-linux arch's (closes: #431018).
+  * added notes to my.cnf and README.Debian about setting tmpdir when
+    configuring a replication slave.  thanks to Rudy Gevaert for pointing
+    this out (closes: #431825).
+
+ -- sean finney <seanius@debian.org>  Tue, 17 Jul 2007 23:50:33 +0200
+
+mysql-dfsg-5.0 (5.0.41a-1) unstable; urgency=high
+
+  [sean finney]
+  * SECURITY:
+    Fix for CVE-2007-2691: DROP/RENAME TABLE statements (closes: #424778).
+  [Christian Hammers]
+  * Removed all manpages from the source (therefore the "41a") as they 
+    are not licensed under the GPL and redistribution is not permitted
+    (thanks to Mathias Gug). Closes: #430018
+  * Added linux-libc-dev to the build-depends as else an illegal dependency to
+    asm/atomic.h is generated in /usr/include/mysql/my_global.h. Closes: 424276
+  [Christian Perrier]
+  * Debconf templates and debian/control reviewed by the debian-l10n-
+    english team as part of the Smith review project. Closes: #419974
+  * Debconf translation updates:
+    - French. Closes: #422187
+    - Galician. Closes: #420118
+    - Italian. Closes: #421349
+    - Brazilian Portuguese. Closes: #421516
+    - Arabic. Closes: #421751
+    - Czech. Closes: #421766
+    - Portuguese. Closes: #422428
+
+ -- Christian Hammers <ch@debian.org>  Sun, 24 Jun 2007 21:12:42 +0200
+
+mysql-dfsg-5.0 (5.0.41-2) unstable; urgency=low
+
+  * the previous "translation changes" inadvertently introduced unrelated
+    changes in the package control file.
+
+ -- sean finney <seanius@debian.org>  Sun, 13 May 2007 12:32:45 +0200
+
+mysql-dfsg-5.0 (5.0.41-1) unstable; urgency=low
+
+  * New upstream release
+  [sean finney]
+  * Bump the priority of the debconf prompt for the root password to high, to 
+    ensure the question shows up in a default installation (closes: #418672).
+  * Debconf templates and debian/control reviewed by the debian-l10n-
+    english team as part of the Smith review project. Closes: #419974
+  * Debconf translation updates:
+    - French. Closes: #422187
+    - Galician. Closes: #420118
+    - Italian. Closes: #421349
+    - Brazilian Portuguese. Closes: #421516
+    - Arabic. Closes: #421751
+    - Czech. Closes: #421766
+    - Portuguese. Closes: #422428
+  * massaged the local PATH_MAX patch.
+  * removed temp sql parsing patch which has been incorporated upstream
+  * upstream no longer includes the mysql_create_system_tables command,
+    so removed our local patches for it.
+  * the following issues may have been fixed in a previous version of
+    mysql-server-5.0, but the exact version is not clear so they will be
+    marked as fixed in this version. 
+  * lots of NDB-related fixes, including those related to problems with
+    AUTO_INCREMENT (closes: #310878).
+  * fix for "connections remaining in sleep state" (closes: #318011).
+  * fix for "denies queries randomly" (closes: #399602).
+  * problems indexing on char() binary fields were ISAM specific, which is
+    no longer supported (closes: #326698).
+  * fix for problems with "complicated joins" (closes: 348682).
+  * fix for problems with "flushing logs, server crash" (closes: #348682).
+  * fix for AUTO_INCREMENT and duplicate keys (closes: #416145).
+  * fix for "DROP FUNCTIONS doesn't work" (closes: #290670).
+
+ -- sean finney <seanius@debian.org>  Sat, 12 May 2007 12:10:20 +0200
+
+mysql-dfsg-5.0 (5.0.38-3) unstable; urgency=low
+
+  * Added innotop. 
+  * Changed maintainer email address to
+    pkg-mysql-commits@lists.alioth.debian.org 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 19 Apr 2007 19:21:15 +0200
+
+mysql-dfsg-5.0 (5.0.38-2) unstable; urgency=high
+
+  * SECURITY:
+    In some previous versions mysql_install_db was not idempotent and did
+    always create passwordless root accounts although it should only on
+    initial installs (thanks to Olaf van der Spek). Closes: #418672
+  * Added check for passwordless root accounts to debian-start.
+  * As MySQL-5.0 is, at least currently, incompatible with Kernel 2.4 the
+    installation is aborted for such old kernels. Debian Etch does not support
+    them anyway according to the release notes but this might be unexpected
+    and many production servers still have self build ones installed (thanks
+    to Marc-Christian Petersen). See: #416841
+  * Adjusted TeX build-deps to texlive.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 17 Apr 2007 01:00:41 +0200
+
+mysql-dfsg-5.0 (5.0.38-1) unstable; urgency=low
+
+  * New upstream release.
+  * Activated the blackhole engine as it's needed for replicating partition
+    designs (thanks to Cyril SCETBON). 
+  * Fixed segfault on i486 systems without cpuid instruction (thanks to
+    Lennart Sorensen). Closes: #410474
+  * Only make use of the non-essential debconf package in postrm if it is still
+    installed (thanks to Michael Ablassmeier). Closes: #416838
+
+ -- Christian Hammers <ch@debian.org>  Thu,  5 Apr 2007 22:43:41 +0200
+
+mysql-dfsg-5.0 (5.0.36-1) unstable; urgency=low
+
+  * New upstream release.
+    Closes: #400460, #408159, #408533
+
+ -- Christian Hammers <ch@debian.org>  Thu, 22 Mar 2007 22:16:31 +0100
+
+mysql-dfsg-5.0 (5.0.32-10) unstable; urgency=high
+
+  * Really fixed FTBFS on Sparc introduced with the "make -j" trick in 
+    5.0.32-8 (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Mar 2007 20:52:33 +0100
+
+mysql-dfsg-5.0 (5.0.32-9) unstable; urgency=high
+
+  * Fixed FTBFS on Sparc introduced with the "make -j" trick in 5.0.32-8
+    (thanks to Frank Lichtenheld). Closes: #415026
+
+ -- Christian Hammers <ch@debian.org>  Tue, 15 Mar 2007 18:55:42 +0100
+
+mysql-dfsg-5.0 (5.0.32-8) unstable; urgency=high
+
+  [Sean Finney]
+  * SECURITY:
+    - CVE-2007-1420: Single Row Subselect DoS.  Specially crafted subselect
+      queries could crash the mysql server.  Patch backported from upstream
+      changeset 19685 (46_CVE-2007-1420_subselect_dos.dpatch) 
+      closes: #414790.
+  [Christian Hammers]
+  * Adapt MAKE_J to use the -j option with the number of available processors.
+    (thanks to Raphael Pinson).
+  * Updated mysqlreport to latest upstream (and patched --help usage message
+    and "return if qcache_size==0").
+
+ -- sean finney <seanius@debian.org>  Wed, 14 Mar 2007 20:19:08 +0100
+
+mysql-dfsg-5.0 (5.0.32-7) unstable; urgency=low
+
+  * Updated French Debconf translation (thanks to Christian Perrier).
+    Closes: #411330
+  * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+    Closes: #411328
+  * Updated Portuguese Debconf translation (thanks to "Traduz").
+    Closes: #411339
+  * Updated Czech Debconf translation (thanks to Miroslav Kure).
+    Closes: #411341
+  * Added Norwegian Debconf translation (thanks to Bjorn Steensrud).
+    Closes: #411345
+  * Updated Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+    Pena). Closes: #411347
+  * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+    Closes: #411368
+  * Updated Swedish Debconf translation (thanks to Andreas Henriksson).
+    Closes: #411370
+  * Updated Italian Debconf translation (thanks to Luca Monducci).
+    Closes: #411377
+  * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+    Closes: #411379
+  * Updated Russian Debconf translation (thanks to Yuriy Talakan).
+    Closes: #411442
+  * Updated Basque Debconf translation (thanks to Piarres Beobide).
+    Closes: #411457
+  * Updated German Debconf translation (thanks to Alwin Meschede).
+    Closes: #411480
+  * Updated Dutch Debconf translation (thanks to Thijs Kinkhorst).
+  * Updated Brazilian Portuguese translation (thanks to Andre Luis Lopes).
+    Closes: #411536
+  * Updated Romanian Debconf translation (thanks to Stan Ioan-Eugen). 
+    Closes: #411764
+
+ -- Christian Hammers <ch@debian.org>  Fri, 16 Feb 2007 23:20:42 +0100
+
+mysql-dfsg-5.0 (5.0.32-6) unstable; urgency=low
+
+  * Changed wording in Debconf templates to better fit to the graphical
+    interface (thanks to Frank Kuester). Closes: #411165
+  * Lintian suggested style changes to some other Debconf questions.
+  * Removed accidental stdout output from init script.
+
+ -- Christian Hammers <ch@debian.org>  Fri, 16 Feb 2007 20:29:18 +0100
+
+mysql-dfsg-5.0 (5.0.32-5) unstable; urgency=medium
+
+  * Backported upstream patch for a bug that crashed the server when using
+    certain join/group/limit combinations. 
+    Users of the Joomla CMS seemed to be affected by this. Closes: #403721
+  * The debian-start script that runs on every server start now first upgrades
+    the system tables (if necessary) and then checks them as it sometimes did
+    not work the other way around (e.g. for MediaWiki). The script now uses 
+    mysql_update instead of mysql_update_script as recommended. Closes: 409780
+  * Remove the Debconf generated config file in postrm.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 15 Feb 2007 04:47:04 +0100
+
+mysql-dfsg-5.0 (5.0.32-4) unstable; urgency=high
+
+  [Christian Hammers]
+  * Changed minimum required version in dh_makeshlibs to 5.0.27-1 as
+    5.0.26 had an ABI breakage in it!
+    This is the cause for Perl programs crashing with the following error: 
+    "Transactions not supported by database at /usr/lib/perl5/DBI.pm line 672"
+  * The old_passwords setting that is set according to a Debconf question is
+    now written to /etc/mysql/conf.d/old_passwords.cnf instead of directly to the
+    conffile /etc/mysql/my.cnf which would be forbidden by policy (thanks to
+    Robert Bihlmeyer). Closes: #409750
+  * Added some more comments to the default my.cnf.
+  [Monty Taylor]
+  * Added bison to build dependencies.
+  * Added a "start-initial" option to the Data Node init script to support
+    initial node starts.
+  * Changed NDB Data and Management node startup sequence. Prevented both from
+    restarting on upgrade to address rolling upgrade issues.
+  * Updated build-depends to depend on automake1.9 instead of automake1.8
+    to match what upstream uses. 
+
+ -- Christian Hammers <ch@debian.org>  Wed, 31 Jan 2007 01:14:09 +0100
+
+mysql-dfsg-5.0 (5.0.32-3) unstable; urgency=high
+
+  * mysql-server-5.0 pre-depends on adduser now and has --disabled-login
+    explicitly added to be on the safe side (thanks to the piuparts team).
+    Closes: #408362
+  * Corrected the terminology regarding NDB in the comments of all config
+    files and init scripts (thanks to Geert Vanderkelen of MySQL).
+  * Updated Swedish Debconf translation (thanks to Andreas Henriksson).
+    Closes: #407859
+  * Updated Czech Debconf translation (thanks to Miroslav Kure).
+    Closes: #407809
+
+ -- Christian Hammers <ch@debian.org>  Thu, 11 Jan 2007 11:18:47 +0100
+
+mysql-dfsg-5.0 (5.0.32-2) unstable; urgency=high
+
+  * The last upload suffered from a regression that made NDB totally
+    unusable and caused a dependency to libmysqlclient15-dev in the
+    mysql-server-5.0 package. The relevant 85_* patch was re-added again.
+    Closes: #406435
+  * Added lintian-overrides for an error that does not affect our packages.
+    There are now only warnings and not errors left.
+
+ -- Christian Hammers <ch@debian.org>  Tue,  9 Jan 2007 23:55:10 +0100
+
+mysql-dfsg-5.0 (5.0.32-1) unstable; urgency=high
+
+  * New upstream version.
+    * SECURITY: mysql_fix_privilege_tables.sql altered the 
+      table_privs.table_priv column to contain too few privileges, causing
+      loss of the CREATE VIEW and SHOW VIEW privileges. (MySQL Bug#20589)
+    * SECURITY (DoS): ALTER TABLE statements that performed both RENAME TO
+      and {ENABLE|DISABLE} KEYS operations caused a server crash. (MySQL
+      Bug#24089)
+    * SECURITY (DoS): LAST_DAY('0000-00-00') could cause a server crash.
+      (MySQL Bug#23653)
+    * SECURITY (DoS): Using EXPLAIN caused a server crash for queries that 
+      selected from INFORMATION_SCHEMA in a subquery in the FROM clause.
+      (MySQL Bug#22413)
+    * SECURITY (DoS): Invalidating the query cache (e.g. when using stored procedures) 
+      caused a server crash for INSERT INTO ... SELECT statements that 
+      selected from a view. (MySQL Bug#20045)
+    * Using mysql_upgrade with a password crashed the server. Closes: #406229
+    * yaSSL crashed on pre-Pentium Intel and Cyrix CPUs. (MySQL Bug#21765)
+      Closes: #383759
+    * Lots of small fixes to the NDB cluster storage engine.
+  * Updated Japanese Debconf template (thanks to Hideki Yamane).
+    Closes: #405793
+  * Fixed comment regarding "mycheck" in debian-start (thanks to
+    Enrico Zini). Closes: #405787
+
+ -- Christian Hammers <ch@debian.org>  Sat,  6 Jan 2007 14:26:20 +0100
+
+mysql-dfsg-5.0 (5.0.30-3) unstable; urgency=low
+
+  * Updated Brazilian Debconf translation (thanks to Andre Luis Lopes).
+    Closes: #403821
+  * Added Romanian Debconf translation (thanks to Stan Ioan-Eugen).
+    Closes: #403943
+  * Updated Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+    Pena). Closes: #404084
+  * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+    Closes: #404318
+  * Updated Dutch Debconf translation (thanks to Vincent Zweije).
+    Closes: #404566
+  * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+    Closes: #405018
+
+ -- Christian Hammers <ch@debian.org>  Thu, 21 Dec 2006 21:35:09 +0100
+
+mysql-dfsg-5.0 (5.0.30-2) unstable; urgency=high
+
+  * Fixed upstream regression in header files that lead to FTBFS for
+    mysql-admin, mysql-query-browser and probably other packages.
+    (thanks to Andreas Henriksson). Closes: #403081, #403082
+  * Fixed some upstream scripts by replacing /etc by /etc/mysql (thanks to
+    Julien Antony). Closes: #401083
+  * Updated French Debconf translation (thanks to Christian Perrier).
+    Closes: #401434
+  * Added Spanish Debconf translation (thanks to Javier Fernandez-Sanguino
+    Pena). Closes: #401953
+  * Marked a Debconf question that is just a dummy and only internally
+    used as not-needing-translation. Closes: #403163
+  * Fixed mysqlslowdump patch to not remove the usage() function (thanks 
+    to Monty Taylor).
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Dec 2006 19:20:10 +0100
+
+mysql-dfsg-5.0 (5.0.30-1) unstable; urgency=low
+
+  * New upstream version (switch to the MySQL Enterprise branch).
+  * Upstream bugfix for the Innodb performance bug:
+    "Very poor performance with multiple queries running
+     concurrently (Bug#15815)".
+  * Upstream bugfix for a possible server crash:
+    "Selecting from a MERGE table could result in a server crash if the
+     underlying tables had fewer indexes than the MERGE table itself
+     (Bug#22937)"
+  * Upstream bugfixes for a *lot* of NDB problems.
+  * Upstream bugfix for Innodb optimizer bug. Closes: #397597
+  * Updated Italian Debconf translation (thanks to Luca Monducci).
+    Closes: #401305 
+  * Updated debian/watch file to MySQL Enterprise branch.
+
+ -- Christian Hammers <ch@debian.org>  Sat,  2 Dec 2006 16:36:38 +0100
+
+mysql-dfsg-5.0 (5.0.27-2) unstable; urgency=medium
+
+  * Disabled YaSSL x86 assembler as it was reported to crash applications
+    like pam-mysql or proftpd-mysql which are linked against libmysqlclient
+    on i486 and Cyrix (i586) CPUs. Closes: #385147
+  * Adjusted mysql-server-4.1 priority to extra and section to oldlibs
+    according to the ftp masters overrides.
+  * Updated German Debconf translation (thanks to Alwin Meschede).
+    Closes: #400809
+
+ -- Christian Hammers <ch@debian.org>  Wed, 22 Nov 2006 13:36:31 +0100
+
+mysql-dfsg-5.0 (5.0.27-1) unstable; urgency=medium
+
+  * New upstream version (but no code change, the only difference to 5.0.26
+    was a patch to the ABI change which Debian already included).
+  * When dist-upgrading from mysql-server-4.1/sarge dpkg no longer
+    asks unnecessary "config file has changed" questions regarding
+    /etc/init.d/mysql, /etc/logrotate.d/mysql-server and
+    /etc/mysql/debian-start just because these files previously belonged
+    to mysql-server-4.1 and not to mysql-server-5.0.
+    To achieve this mysql-server-5.0 now pre-depends on mysql-common which
+    provides current versions of those files.
+  * The automatically run mysql_upgrade now works with non-standard datadir
+    settings, too (thanks to Benjami Villoslada). Closes: #394607
+  * Debconf now asks if the old_passwords option is really needed.
+  * Improved explanations of the old_passwords variable in my.cnf.
+  * Removed possibly leftover cron script from MySQL-4.1 (thanks to
+    Mario Oyorzabal Salgado). Closes: #390889
+  * Postrm ignores failed "userdel mysql".
+  * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+    Closes: #398784
+  * Added Euskarian Debconf translation (thanks to Piarres Beobide).
+    Closes: #399045
+  * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+    Closes: #399074
+  * Updated German Debconf translation (thanks to Alwin Meschede).
+    Closes: #399087
+  * New Portuguese debconf translations from Miguel Figueiredo. 
+    Closes: #398186
+
+ -- Christian Hammers <ch@debian.org>  Tue,  7 Nov 2006 21:26:25 +0100
+
+mysql-dfsg-5.0 (5.0.26-3) unstable; urgency=high
+
+  [sean finney]
+  * Fix for the deadly ISAM trap.  Now during upgrades we will do our
+    very best to convert pre-existing ISAM format tables using the
+    binaries from the previous package.  Success is not guaranteed, but
+    this is probably as good as it gets.  Note that this also necessitates
+    re-introducing an (empty transitional) mysql-server-4.1 package.
+    Closes: #354544, #354850
+  * Remove a couple spurious and wrongly placed WARNING statements from
+    45_warn-CLI-passwords.dpatch.  thanks to Dan Jacobsen for pointing these
+    out.  Closes: #394262
+
+ -- sean finney <seanius@debian.org>  Fri, 03 Nov 2006 18:34:46 +0100
+
+mysql-dfsg-5.0 (5.0.26-2) unstable; urgency=high
+
+  * Fixed FTBFS for Alpha by applying an upstream patch (thanks to Falk
+    Hueffner). Closes: #395921
+
+ -- Christian Hammers <ch@debian.org>  Sat, 28 Oct 2006 20:13:46 +0200
+
+mysql-dfsg-5.0 (5.0.26-1) unstable; urgency=high
+
+  * SECURITY: 
+    This combined release of 5.0.25 and 5.0.26 fixes a lot of possible server
+    crashes so it should get into Etch. Quoting the changelog (bug numbers are
+    bugs.mysql.com ones):
+    - character_set_results can be NULL to signify no conversion, but some
+      code did not check for NULL, resulting in a server crash. (Bug#21913)
+    - Using cursors with READ COMMITTED isolation level could cause InnoDB to
+      crash. (Bug#19834)
+    - Some prepared statements caused a server crash when executed a second
+      time. (Bug#21166)
+    - When DROP DATABASE or SHOW OPEN TABLES was issued while concurrently 
+      issuing DROP TABLE (or RENAME TABLE, CREATE TABLE LIKE or any other 
+      statement that required a name lock) in another connection, the server 
+      crashed. (Bug#21216)
+    - Use of zero-length variable names caused a server crash. (Bug#20908)
+    - For InnoDB tables, the server could crash when executing NOT IN () 
+      subqueries. (Bug#21077) 
+    - Repeated DROP TABLE statements in a stored procedure could sometimes
+      cause the server to crash. (Bug#19399)
+    - Performing an INSERT on a view that was defined using a SELECT that 
+      specified a collation and a column alias caused the server to crash 
+      (Bug#21086).
+    - A query of the form shown here caused the server to crash. (Bug#21007)
+    - NDB Cluster: Some queries involving joins on very large NDB tables could
+      crash the MySQL server. (Bug#21059)
+    - The character set was not being properly initialized for CAST() with a
+      type like CHAR(2) BINARY, which resulted in incorrect results or even a 
+      server crash. (Bug#17903)
+    - For certain queries, the server incorrectly resolved a reference to an
+      aggregate function and crashed. (Bug#20868)
+    - The server crashed when using the range access method to execute a
+      subquery with an ORDER BY DESC clause. (Bug#20869)
+    - Triggers on tables in the mysql database caused a server crash. Triggers
+      for tables in this database now are disallowed. (Bug#18361)
+    - Using SELECT on a corrupt MyISAM table using the dynamic record format 
+      could cause a server crash. (Bug#19835) 
+    - Use of MIN() or MAX()  with GROUP BY on a ucs2  column could cause a
+      server crash. (Bug#20076)
+    - Selecting from a MERGE table could result in a server crash if the
+      underlying tables had fewer indexes than the MERGE table itself. 
+      (Bug#21617, Bug#22937)
+
+  * New upstream release.
+    - This bug would cause trouble for Sarge->Etch upgrades, it was supposed to
+      have been fixed in 5.0.16 but that apparently did not fix the whole
+      problem:
+      Using tables from MySQL 4.x in MySQL 5.x, in particular those with VARCHAR
+      fields and using INSERT DELAYED to update data in the table would result in
+      either data corruption or a server crash. (Bug#16611, Bug#16218, Bug#17294) 
+      Closes: #386337
+    - Fixes data corruption as an automatic client reconnect used to set
+      the wrong character set. Closes: #365050
+    - Fixes an undefined ulong type in an include file. Closes: #389102
+    - Fixes wrong output format when using Unicode characters. Closes: #355302
+    - Fixes mysql_upgrade when using a password. Closes: #371841
+ 
+  [Christian Hammers]
+  * Removed --sysconfdir from debian/rules as it puts /etc/mysql/ at the
+    end of the my.cnf search path thus overriding $HOME/my.cnf
+    (thanks to Christoph Biedl). Closes: #394992
+  * The provided patch from bug #385947 was wrong, the variable is called
+    BLOCKSIZE not BLOCK_SIZE according to "strings `which df`" (thanks to
+    Bruno Muller). Closes: #385947
+
+  [sean finney]
+  * new dutch debconf translations from Vincent Zweije (closes: #392809).
+  * new japanese debconf translations from Hideki Yamane (closes: #391625).
+  * new italian debconf translations from Luca Monducci (closes: #391741).
+  * new french debconf translations from Christian Perrier (closes: #393334).
+  * ran debconf-updatepo to merge the fuzzies into svn.
+  * massage the following patches so they continue to apply cleanly:
+    - 44_scripts__mysql_config__libs.dpatch to cleanly apply.
+    - 45_warn-CLI-passwords.dpatch
+    - 96_TEMP__libmysqlclient_ssl_symbols.dpatch (note, this patch might
+      no longer be needed, but is retained "just in case" after massaging it)
+  * the following patches have been incorporated upstream:
+    - 70_kfreebsd.dpatch
+    - 80_hurd_mach.dpatch
+    - 87_ps_Hurd.dpatch
+    - 90_TEMP__client__mysql_upgrade__O_EXEC.dpatch
+    - 91_TEMP__client__mysql_upgrade__password.dpatch
+    - 92_TEMP__client__mysql_upgrade__defaultgroups.dpatch
+    - 94_TEMP__CVE-2006-4227.dpatch
+    - 95_TEMP__CVE-2006-4226.dpatch
+  * the udf_example.cc has disappeared from the source code, but there's
+    a udf_example.c which seems to be a good example to use instead :)
+  * update documentation in the configuration to no longer reference
+    using my.cnf in the DATADIR, as it's never been the recommended
+    method for debian systems and hasn't worked since 5.0 was released
+    anyway (closes: #393868).
+
+ -- Christian Hammers <ch@debian.org>  Wed, 25 Oct 2006 19:54:04 +0200
+
+mysql-dfsg-5.0 (5.0.24a-9) unstable; urgency=medium
+
+  * Having expire_logs_days enabled but log-bin not crashes the server. Using
+    both or none of those options is safe. To prevent this happening during the 
+    nightly log rotation via /etc/logrotate.d/mysql the initscript checks for 
+    malicious combination of options. See: #368547
+  * The Sarge package "mysql-server" which used to include the mysqld daemon
+    may still be in unselected-configured state (i.e. after a remove but not
+    purge) in which case its now obsolete cronscript has to be moved away
+    (thanks to Charles Lepple). Closes: #385669
+  * Updated Danish Debconf translation (thanks to Claus Hindsgaul).
+    Closes: #390315
+  * Updated French Debconf translation (thanks to Christian Perrier).
+    Closes: #390980
+
+ -- Christian Hammers <ch@debian.org>  Tue,  3 Oct 2006 14:55:31 +0200
+
+mysql-dfsg-5.0 (5.0.24a-8) unstable; urgency=low
+
+  * (broken upload)
+
+ -- Christian Hammers <ch@debian.org>  Tue,  3 Oct 2006 14:55:31 +0200
+
+mysql-dfsg-5.0 (5.0.24a-7) unstable; urgency=low
+
+  * Stopped mysql_config from announcing unnecessary library dependencies
+    which until now cause "NEEDED" dependencies in the "readelf -d" output
+    of libraries who only depend on libmysqlclient.so (thanks to Michal
+    Cihar). Closes: #390692
+
+ -- Christian Hammers <ch@debian.org>  Sun,  1 Oct 2006 23:59:43 +0200
+
+mysql-dfsg-5.0 (5.0.24a-6) unstable; urgency=low
+
+  [sean finney]
+  * finally add support for setting a root password at install.
+    while this is not a random password as requested in one bug
+    report, we believe it is the best solution and provides a
+    means to set a random password via preseeding if it's really
+    desired (Closes: #316127, #298295).
+
+ -- sean finney <seanius@debian.org>  Sun, 01 Oct 2006 23:34:30 +0200
+
+mysql-dfsg-5.0 (5.0.24a-5) unstable; urgency=low
+
+  * Added ${shlibs:Depends} to debian/control section libmysqlclient-dev as it
+    contains the experimental /usr/lib/mysql/libndbclient.so.0.0.0.
+  * Bumped standards version to 3.7.2.
+  * Added LSB info section to init scripts.
+  * Rephrased Debconf templates as suggested by lintian.
+  * Added benchmark suite in /usr/share/mysql/sql-bench/.
+  * The mysql.timezone* tables are now filled by the postinst script (thanks
+    to Mark Sheppard). Closes: #388491
+  * Moved Debconf install notes to README.Debian. Displaying them with
+    medium priority was a bug anyway. Closes: #388941
+  * Replaced /usr/bin/mysql_upgrade by /usr/bin/mysql_upgrade_shell in
+    /etc/mysql/debian-start.sh as it works without errors (thanks to Javier
+    Kohen). Closes: #389443
+
+ -- Christian Hammers <ch@debian.org>  Wed, 20 Sep 2006 15:01:42 +0200
+
+mysql-dfsg-5.0 (5.0.24a-4) unstable; urgency=high
+
+  * libmysqlclient.so.15 from 5.0.24 accidentally exports some symbols that are
+    historically exported by OpenSSL's libcrypto.so. This bug was supposed to
+    be fixed in 5.0.24a but according to the mysql bug tracking system will
+    only be fixed in 5.0.25 so I backported the patch. People already reported
+    crashing apps due to this (thanks to Duncan Simpson). See also: #385348
+    Closes: #388262
+  * Fixed BLOCKSIZE to BLOCK_SIZE in initscript (thanks to Bruno Muller).
+    Closes: #385947
+  * Added hint to "--extended-insert=0" to mysqldump manpage (thanks to Martin
+    Schulze).
+  * Documented the meaning of "NDB" in README.Debian (thanks to Dan Jacobson).
+    Closes: #386274
+  * Added patch to build on hurd-i386 (thanks to Cyril Brulebois). Closes: #387369
+  * Fixed debian-start script to work together with the recent LSB modifications in
+    the initscript (thanks to wens). Closes: #387481
+  * Reverted tmpdir change in my.cnf back to /tmp to comply with FHS (thanks
+    to Alessandro Valente). Closes: #382778
+  * Added logcheck filter rule (thanks to Paul Wise). Closes: #381043
+  * I will definitely not disable InnoDB but added a note to the default my.cnf
+    that disabling it saves about 100MB virtual memory (thanks to Olivier
+    Berger). Closes: #384399
+  * Added thread_cache_size=8 to default my.cnf as this variable seems to have
+    a negligible memory footprint but can improve performance when lots of
+    threads connect simultaneously as often seen on web servers.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  4 Sep 2006 00:21:50 +0200
+
+mysql-dfsg-5.0 (5.0.24a-3) unstable; urgency=low
+
+  * Fixed potential tempfile problem in the newly added mysqlreport script.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Sep 2006 23:17:24 +0200
+
+mysql-dfsg-5.0 (5.0.24a-2) unstable; urgency=low
+
+  * Added "mysqlreport" (GPL'ed) from hackmysql.com.
+  * Temporarily disabled expire_days option as it causes the server
+    to crash. See #368547
+  * Made output of init scripts LSB compliant (thanks to David Haerdeman).
+    Closes: #385874
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Sep 2006 19:06:53 +0200
+
+mysql-dfsg-5.0 (5.0.24a-1) unstable; urgency=high
+
+  * New upstream version.
+  * The shared library in the 5.0.24 upstream release accidentally exported
+    some symbols that are also exported by the OpenSSL libraries (notably
+    BN_bin2bn) causing unexpected behaviour in applications using these 
+    functions (thanks to Peter Cernak). Closes: #385348
+  * Added note about possible crash on certain i486 clone CPUs.
+  * Made recipient address of startup mysqlcheck output configurable
+    (thanks to Mattias Guns). Closes: #385119
+
+ -- Christian Hammers <ch@debian.org>  Mon, 28 Aug 2006 01:22:12 +0200
+
+mysql-dfsg-5.0 (5.0.24-3) unstable; urgency=high
+
+  * SECURITY:
+    CVE-2006-4226:
+    When run on case-sensitive filesystems, MySQL allows remote
+    authenticated users to create or access a database when the database 
+    name differs only in case from a database for which they have
+    permissions.
+    CVE-2006-4227:
+    MySQL evaluates arguments of suid routines in the security context of
+    the routine's definer instead of the routine's caller, which allows
+    remote authenticated users to gain privileges through a routine that 
+    has been made available using GRANT EXECUTE.
+    Thanks to Stefan Fritsch for reporting. Closes: #384798
+
+ -- Christian Hammers <ch@debian.org>  Sat, 26 Aug 2006 04:55:17 +0200
+
+mysql-dfsg-5.0 (5.0.24-2) unstable; urgency=high
+
+  * 5.0.24-1 introduced an ABI incompatibility, which this patch reverts.
+    Programs compiled against 5.0.24-1 are not compatible with any other
+    version and needs a rebuild.
+    This bug already caused a lot of segfaults and crashes in various 
+    programs. Thanks to Chad MILLER from MySQL for quickly providing a patch.
+    The shlibdeps version has been increased to 5.0.24-2.
+    Closes: #384047, #384221, #383700
+
+ -- Christian Hammers <ch@debian.org>  Fri, 25 Aug 2006 21:47:35 +0200
+
+mysql-dfsg-5.0 (5.0.24-1) unstable; urgency=high
+ 
+  * SECURITY: Upstream fixes a security bug which allows a user to continue
+    accessing a table using a MERGE TABLE after the right to direct access to
+    the database has been revoked (CVE-2006-4031, MySQL bug #15195).
+    (Well they did not exactly fix it, they documented the behaviour and
+    allow the admin to disable merge table altogether...). Closes: #380271
+  * SECURITY: Applied patch that fixes a possibly insecure filehandling
+    in the recently added mysql_upgrade binary file (MySQL bug #10320).
+  * New upstream version.
+    - Fixes nasty MySQL bug #19618 that leads to crashes when using
+      "SELECT ... WHERE ... not in (1, -1)" (e.g. vbulletin was affected).
+    - Fixes upstream bug #16803 so that linking ~/.mysql_history to /dev/null
+      now has the desired effect of having no history.
+  * Really fixed the runlevels. Closes: #377651
+  * Added patch for broken upstream handling of "host=" to mysql_upgrade.c.
+  * Adjusted /etc/mysql/debian-start to new mysql_upgrade.c
+
+ -- Christian Hammers <ch@debian.org>  Tue,  8 Aug 2006 00:44:13 +0200
+
+mysql-dfsg-5.0 (5.0.22-5) unstable; urgency=low
+
+  * Added further line to the logcheck ignore files (thanks to Paul Wise).
+    Closes: #381038
+
+ -- Christian Hammers <ch@debian.org>  Wed,  2 Aug 2006 00:28:50 +0200
+
+mysql-dfsg-5.0 (5.0.22-4) unstable; urgency=low
+
+  * Upstream fixes a bug in the (never released) version 5.0.23 which could
+    maybe be used to crash the server if the mysqlmanager daemon is in use
+    which is not yet the default in Debian. (CVE-2006-3486 *DISPUTED*)
+  * Changed runlevel priority of mysqld from 20 to 19 so that it gets started
+    before apache and proftpd etc. which might depend on an already running
+    database server (thanks to Martin Gruner). Closes: #377651
+  * Added patch which sets PATH_MAX in ndb (thanks to Cyril Brulebois).
+    Closes: #378949
+  * Activated YaSSL as licence issues are settled according to:
+    http://bugs.mysql.com/?id=16755. This also closes the FTBFS bug
+    regarding OpenSSL as it is discouraged to use now. Closes: #368639
+  * Removed SSL-MINI-HOWTO as the official documentation is good enough now.
+  * mysql_upgrade no longer gives --password on the commandline which would
+    be insecure (thanks to Dean Gaudet). Closes: #379199
+  * Adjusted debian/patches/45* to make consecutive builds in the same source
+    tree possible (thanks to Bob Tanner). Closes: #368661
+  * mysql-server-5.0 is now suggesting tinyca as yaSSL is enabled and tinyca
+    was found to be really cool :)
+  * Moved tempdir from /tmp to /var/tmp as it will more likely have enough
+    free space as /tmp is often on the root partition and /var or at least
+    /var/tmp is on a bigger one.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 10 Jul 2006 23:30:26 +0200
+
+mysql-dfsg-5.0 (5.0.22-3) unstable; urgency=low
+
+  * Added patch for MySQL bug #19618: "select x from x
+    where x not in(1,-1)" may crash the server" (thanks to
+    Ruben Puettmann).
+
+ -- Christian Hammers <ch@debian.org>  Fri,  9 Jun 2006 01:41:44 +0200
+
+mysql-dfsg-5.0 (5.0.22-2) unstable; urgency=high
+
+  * Fixed debian-sys-maint related bug in postinst (thanks to
+    Jean-Christophe Dubacq). Closes: #369970
+  * The last upload was a security patch (which I did not know as I
+    uploaded before the announcement came). I now added the CVE id for
+    reference and set urgency to high as the last entry did not.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 31 May 2006 01:04:11 +0200
+
+mysql-dfsg-5.0 (5.0.22-1) unstable; urgency=low
+
+  * SECURITY: This upstream release fixes an SQL-injection with multibyte 
+    encoding problem. (CVE-2006-2753)
+  * New upstream release.
+  * Upstream fixes REPAIR TABLE problem. Closes: #354300
+  * Upstream fixes problem that empty strings in varchar and text columns
+    are displayed as NULL. Closes: #368663
+
+ -- Christian Hammers <ch@debian.org>  Tue, 30 May 2006 23:43:24 +0200
+
+mysql-dfsg-5.0 (5.0.21-4) unstable; urgency=low
+
+  * Added "BLOCKSIZE=" to the diskfree check (thanks to Farzad FARID).
+    Closes: #367027, #367083
+  * Further fixed mysql_upgrade upstream script (thanks to Andreas Pakulat)
+    Closes: #366155
+  * Adjusted the /proc test in debian/rules from /proc/1 to /proc/self
+    to make building on grsec systems possible (thanks to K. Rosenegger).
+    Closes: #366824
+  * Updated Russian Debconf translation (thanks to Yuriy Talakan).
+    Closes: #367141
+  * Updated Czech Debconf translation (thanks to Miroslav Kure).
+    Closes: #367160
+  * Updated Galician Debconf translation (thanks to Jacobo Tarrio).
+    Closes: #367384
+  * Updated Swedish Debconf translation (thanks to Daniel Nylander).
+    Closes: #368186
+
+ -- Christian Hammers <ch@debian.org>  Wed, 10 May 2006 08:45:42 +0200
+
+mysql-dfsg-5.0 (5.0.21-3) unstable; urgency=low
+
+  * Fixed FTBFS problem which was caused by a patch that modifies Makefile.am
+    as well as Makefile.in and was not detected because my desktop was fast
+    enough to patch both files within the same second and so fooled automake.
+    (thanks to Blars Blarson for notifying me). Closes: #366534
+
+ -- Christian Hammers <ch@debian.org>  Sat,  6 May 2006 19:03:58 +0200
+
+mysql-dfsg-5.0 (5.0.21-2) unstable; urgency=low
+
+  * Fixed bug in postinst that did not correctly rewrite 
+    /etc/mysql/debian.cnf (thanks to Daniel Leidert). 
+    Closes: #365433, #366155
+
+ -- Christian Hammers <ch@debian.org>  Thu,  4 May 2006 02:37:03 +0200
+
+mysql-dfsg-5.0 (5.0.21-1) unstable; urgency=high
+
+  * SECURITY: New upstream release with some security relevant bugfixes:
+    * "Buffer over-read in check_connection with usernames lacking a
+      trailing null byte" (CVE-2006-1516)
+    * "Anonymous Login Handshake - Information Leakage" (CVE-2006-1517)
+    * "COM_TABLE_DUMP Information Leakage and Arbitrary command execution"
+       (CVE-2006-1518)
+    Closes: #365938, #365939
+  * Added diskfree check to the init script (thanks to Tim Baverstock).
+    Closes: #365460
+  * First amd64 upload!
+
+ -- Christian Hammers <ch@debian.org>  Sat, 29 Apr 2006 04:31:27 +0200
+  
+mysql-dfsg-5.0 (5.0.20a-2) unstable; urgency=low
+
+  * The new mysql-upgrade which is started from /etc/mysql/debian-start
+    does now use the debian-sys-maint user for authentication (thanks to
+    Philipp). Closes: #364991
+  * Wrote patch debian/patches/43* which adds a password option to
+    mysql_update. See MySQL bug #19400.
+  * Added "Provides: libmysqlclient-dev" to libmysqlclient15-dev as I saw no
+    obvious reasons against it (problems should be documented in
+    debian/README.Maintainer!) (thanks to Olaf van der Spek). Closes: #364899
+  * Updated Netherlands debconf translation (thanks to Vincent Zweije)
+    Closes: #364464
+  * Updated French debconf translation (thanks to Christian Perrier)
+    Closes: #364401
+  * Updated Danish debconf translation (thanks to Claus Hindsgaul)
+    Closes: #365135
+
+ -- Christian Hammers <ch@debian.org>  Wed, 26 Apr 2006 01:14:53 +0200
+
+mysql-dfsg-5.0 (5.0.20a-1) unstable; urgency=low
+
+  * New upstream release.
+  * Added the new mysql_upgrade script and added it to
+    /etc/mysql/debian-start (thanks to Alessandro Polverini). 
+    The script is currently very noisy; that is a known bug and will be
+    fixed in the next release!
+    Closes: #363458
+  * No longer creates the "test" database. This actually had been tried
+    to achieve before (at least patches exist) but apparently was not the
+    case in the last versions (thanks to Olaf van der Spek). Closes: #362126
+  * Reformatted libmysqlclient15off.NEWS.Debian to changelog format
+    (thanks to Peter Palfrader). Closes: #363062
+
+ -- Christian Hammers <ch@debian.org>  Sat, 15 Apr 2006 13:05:22 +0200
+
+mysql-dfsg-5.0 (5.0.20-1) unstable; urgency=high
+
+  * Upstream contains a fix for a nasty bug (MySQL#18153) that users 
+    already experienced and that caused corrupted triggers after
+    REPAIR/OPTIMIZE/ALTER TABLE statements.
+    (thanks to Jerome Despatis for pointing out)
+  * Added patch for the "updates on multiple tables is buggy after 
+    upgrading from 4.1 to 5.0" problem which MySQL has committed
+    for the upcoming 5.0.21 release. Closes #352704
+  * Added Netherlands debconf translation (thanks to Vincent Zweije).
+    Closes: #360443
+  * Added Galician debconf translation (thanks to Jacobo Tarrio).
+    Closes: #361257
+
+ -- Christian Hammers <ch@debian.org>  Fri,  7 Apr 2006 00:00:43 +0200
+
+mysql-dfsg-5.0 (5.0.19-3) unstable; urgency=high
+
+  [ Christian Hammers ]
+  * Fixed libmysqlclient15.README.Debian regarding package name changes
+    (thanks to Leppo).
+  * Moved libheap.a etc. back to /usr/lib/mysql/ as their names are just
+    too generic. Closes: #353924
+  [ Sean Finney ]
+  * updated danish debconf translation, thanks to Claus Hindsgaul
+    (closes: #357424).
+  [ Adam Conrad ]
+  * Send stderr from 'find' in preinst to /dev/null to tidy up chatter.
+  * Backport patch for CVE-2006-0903 from the upcoming release to resolve
+    a log bypass vulnerability when using non-binary logs (closes: #359701)
+
+ -- Adam Conrad <adconrad@0c3.net>  Tue,  4 Apr 2006 15:23:18 +1000
+
+mysql-dfsg-5.0 (5.0.19-2) unstable; urgency=medium
+
+  * New upstream release.
+  * Renamed package libmysqlclient15 to libmysqlclient15off due to
+    binary incompatible changes.
+    See /usr/share/doc/libmysqlclient15off/README.Debian
+  * Updated Czech debconf translation (thanks to Miroslav Kure).
+    Closes: #356503
+  * Updated French debconf translation (thanks to Christian Perrier).
+    Closes: #356332
+  * Improved README.Debian (thanks to Olaf van der Spek). Closes: #355702
+  * Fixed 5.0.18-8 changelog by saying in which package the NEWS.Debian
+    file is (thanks to Ross Boylan). Closes: #355978
+
+ -- Christian Hammers <ch@debian.org>  Fri, 17 Mar 2006 02:32:19 +0100
+
+mysql-dfsg-5.0 (5.0.19-1) experimental; urgency=medium
+
+  * New upstream release.
+  * SECURITY: CVE-2006-3081: A bug where str_to_date(1,NULL) lead to a 
+    server crash has been fixed. 
+    (this note has been added subsequently for reference)
+  * Renamed package libmysqlclient15 to libmysqlclient15off.
+    See /usr/share/doc/libmysqlclient15off/NEWS.Debian
+  * Updated Czech debconf translation (thanks to Miroslav Kure).
+    Closes: #356503
+  * Updated French debconf translation (thanks to Christian Perrier).
+    Closes: #356332
+  * Improved README.Debian (thanks to Olaf van der Spek). Closes: #355702
+  * Fixed 5.0.18-8 changelog by saying in which package the NEWS.Debian
+    file is (thanks to Ross Boylan). Closes: #355978
+
+ -- Christian Hammers <ch@debian.org>  Tue, 14 Mar 2006 22:56:13 +0100
+
+mysql-dfsg-5.0 (5.0.18-9) unstable; urgency=medium
+
+  [ Christian Hammers ]
+  * When using apt-get the check for left-over ISAM tables can abort the
+    installation of mysql-server-5.0 but not prevent the mysql-server-4.1
+    package from getting removed. The only thing I can do is reflect this
+    in the Debconf notice that is shown and suggest to reinstall
+    mysql-server-4.1 for converting. See: #354850
+  * Suggests removing of /etc/cron.daily/mysql-server in last NEWS message
+    (thanks to Mourad De Clerck). Closes: #354111
+  * Added versioned symbols for kfreebsd and Hurd, too (thanks to Aurelien
+    Jarno and Michael Bank). Closes: #353971 
+  * Added versioned symbols for kfreebsd, too (thanks to Aurelien Jarno).
+    Closes: #353971
+  [ Adam Conrad ]
+  * Add 39_scripts__mysqld_safe.sh__port_dir.dpatch to ensure that the
+    permissions on /var/run/mysqld are always correct, even on a tmpfs.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  6 Mar 2006 21:42:13 +0100
+
+mysql-dfsg-5.0 (5.0.18-8) unstable; urgency=low
+
+  * The rotation of the binary logs is now configured via
+    expire-logs-days in /etc/mysql/my.cnf and handled completely
+    by the server and no longer configured in debian-log-rotate.conf
+    and handled by a cron job. Thanks to David Johnson.
+    See /usr/share/doc/mysql-server-5.0/NEWS.Debian
+  * Ran aspell over some files in debian/ and learned a lot :)
+  * debian/rules: Added check if versioned symbols are really there.
+  * Updated SSL-MINI-HOWTO.
+  * Updated copyright (removed the parts regarding the now removed
+    BerkeleyDB table handler and mysql-doc package).
+  * Relocated a variable in preinst (thanks to Michael Heldebrant).
+    Closes: #349258, #352587, #351216
+  * Updated Danish debconf translation (thanks to Claus Hindsgaul).
+    Closes: #349013  
+  * Updated Swedish debconf translation (thanks to Daniel Nylander).
+    Closes: #349522
+  * Updated French debconf translation (thanks to Christian Perrier).
+    Closes: #349592
+  * Fixed typo in README.Debian (thanks to Vincent Ricard).
+  * Prolonged waiting time for mysqld in the init script. Closes: #352070
+
+ -- Christian Hammers <ch@debian.org>  Mon, 23 Jan 2006 23:13:46 +0100
+
+mysql-dfsg-5.0 (5.0.18-7) unstable; urgency=low
+
+  * Made mailx in debian-start.inc.sh optional and changed the dependency on it
+    to a mere recommendation. Closes: #316297
+  * the previous FTBFS patches for GNU/Hurd inadvertently led to configure
+    being regenerated, losing a couple trivial things like our versioned
+    symbols patch, causing many nasty problems (closes: #348854).
+
+ -- sean finney <seanius@debian.org>  Fri, 20 Jan 2006 20:59:27 +0100
+
+mysql-dfsg-5.0 (5.0.18-6) unstable; urgency=low
+
+  * Added version comment (thanks to Daniel van Eeden). 
+  * Added two patches to build on GNU/Hurd (thanks to Michael Bank).
+    Closes: #348182
+  * Abort upgrade if old and now unsupported ISAM tables are present
+    (thanks to David Coe). Closes: #345895
+
+ -- Christian Hammers <ch@debian.org>  Tue, 17 Jan 2006 19:25:59 +0100
+
+mysql-dfsg-5.0 (5.0.18-5) unstable; urgency=low
+
+  * Bump shlibdeps for libmysqlclient15 to (>= 5.0.15-1), which was
+    the first non-beta release from upstream, as well as being shortly
+    after we broke the ABI in Debian by introducing versioned symbols.
+
+ -- Adam Conrad <adconrad@0c3.net>  Fri, 13 Jan 2006 13:18:03 +1100
+
+mysql-dfsg-5.0 (5.0.18-4) unstable; urgency=low
+
+  * Munge our dependencies further to smooth upgrades even more, noting
+    that we really need 5.0 to conflict with 4.1, and stealing a page from
+    the book of mysql-common, it doesn't hurt to hint package managers in
+    the direction of "hey, this stuff is a complete replacement for 4.1"
+  * Change the description of mysql-server and mysql-client to remove the
+    references to it being "transition", and instead point out that it's
+    the way to get the "current best version" of each package installed.
+
+ -- Adam Conrad <adconrad@0c3.net>  Wed, 11 Jan 2006 11:39:45 +1100
+
+mysql-dfsg-5.0 (5.0.18-3) unstable; urgency=low
+
+  * Make the mysql-{client,server}-5.0 conflict against mysql-{client,server}
+    versioned, so they can be installed side-by-side and upgrade properly.
+  * Add myself to Uploaders; since I have access to the alioth repository.
+
+ -- Adam Conrad <adconrad@0c3.net>  Tue, 10 Jan 2006 19:15:48 +1100
+
+mysql-dfsg-5.0 (5.0.18-2) unstable; urgency=low
+
+  * Removed the transitional package that forced an upgrade from
+    mysql-server-4.1 to mysql-server-5.0 as I was convinced that
+    having a general "mysql-server" package with adjusted dependencies
+    is enough (thanks to Adam Conrad).
+  * Updated logcheck.ignore files (thanks to Jamie McCarthy). Closes: #340193
+
+ -- Christian Hammers <ch@debian.org>  Mon,  9 Jan 2006 21:54:53 +0100
+
+mysql-dfsg-5.0 (5.0.18-1) unstable; urgency=low
+
+  * New upstream version. 
+  * Added empty transitional packages that force an upgrade from the
+    server and client packages that have been present in Sarge.
+  * Fixed SSL-MINI-HOWTO (thanks to Jonas Smedegaard). Closes: #340589 
+
+ -- Christian Hammers <ch@debian.org>  Mon,  2 Jan 2006 21:17:51 +0100
+
+mysql-dfsg-5.0 (5.0.17-1) unstable; urgency=low
+
+  * Never released as Debian package.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 22 Dec 2005 07:49:52 +0100
+
+mysql-dfsg-5.0 (5.0.16-1) unstable; urgency=low
+
+  * New upstream version.
+  * Removed the error logs from the logrotate script as Debian does
+    not use them anymore. Closes: #339628
+
+ -- Christian Hammers <ch@debian.org>  Tue, 22 Nov 2005 01:19:11 +0100
+
+mysql-dfsg-5.0 (5.0.15-2) unstable; urgency=medium
+
+  * Added 14_configure__gcc-atomic.h.diff to fix FTBFS on m68k
+    (thanks to Stephen R Marenka). Closes: #337082
+  * Removed dynamic linking against libstdc++ as it was not really
+    needed (thanks to Adam Conrad). Closes: #328613
+  * Fixed the "/var/lib/mysql is a symlink" workaround that accidentally
+    left a stalled symlink (thanks to Thomas Lamy). Closes: #336759
+  * As the init script cannot distinguish between a broken startup and
+    one that just takes very long the "failed" message now says
+    "or took more than 6s" (thanks to Olaf van der Spek). Closes: #335547
+
+ -- Christian Hammers <ch@debian.org>  Thu,  3 Nov 2005 22:00:15 +0100
+
+mysql-dfsg-5.0 (5.0.15-1) unstable; urgency=low
+
+  * New upstream version. 5.0 has finally been declared STABLE!
+  * Added small patch to debian/rules that fixed sporadic build errors
+    where stdout and stderr were piped together, got mixed up and broke
+  * Added --with-big-tables to ./configure (thanks to tj.trevelyan).
+    Closes: #333090
+  * Added capability to parse "-rc" to debian/watch.
+  * Fixed cronscript (thanks to Andrew Deason). Closes: #335244
+  * Added Swedish debconf translation (thanks to Daniel Nylander).
+    Closes: #333670
+  * Added comment to README.Debian regarding applications that manually
+    set new-style passwords... Closes: #334444
+  * Sean Finney:
+    - Fix duplicate reference to [-e|--extended-insert]. Closes: #334957
+    - Fix default behavior for mysqldumpslow. Closes: #334517
+    - Reference documentation issue in mysql manpage. Closes: #335219
+
+ -- Christian Hammers <ch@debian.org>  Fri, 30 Sep 2005 00:10:39 +0200
+
+mysql-dfsg-5.0 (5.0.13rc-1) unstable; urgency=low
+
+  * New upstream release. Now "release-candidate"! 
+  * Removed any dynamic link dependencies to libndbclient.so.0 which
+    is due to its version only distributed as a static library.
+  * Sean Finney:
+    - FTBFS fix related to stripping rpath in debian/rules
+
+ -- Christian Hammers <ch@debian.org>  Mon, 26 Sep 2005 22:09:26 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-5) unstable; urgency=low
+
+  * The recent FTBFS were probably result of a timing bug in the
+    debian/patches/75_*.dpatch file where Makefile.in got patched just
+    before the Makefile.shared which it depended on. For that reason
+    only some of the autobuilders failed. Closes: #330149
+  * Fixed chrpath removal (option -k had to be added).
+  * Corrected debconf dependency as requested by Joey Hess.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 26 Sep 2005 18:37:07 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-4) unstable; urgency=low
+
+  * Removed experimental shared library libndbclient.so.0.0.0 as it
+    is doomed to cause trouble as long as it is present in both MySQL 4.1
+    and 5.0 without real soname and its own package. We still have
+    libndbclient.a for developers. (thanks to Adam Conrad and 
+    mediaforest.net). Closes: #329772
+
+ -- Christian Hammers <ch@debian.org>  Fri, 23 Sep 2005 12:36:48 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-3) unstable; urgency=medium
+
+  * Symbol versioning support!  wooooohoooooo!
+    (thanks to Steve Langasek) Closes: #236288
+  * Moved libndbclient.so.0 to the -dev package as it is provided by
+    libmysqlclient14 and -15 which must be installable simultaneously.
+  * Removed mysql-*-doc suggestions.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 20 Sep 2005 00:07:03 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-2) unstable; urgency=low
+
+  * Added patch to build on GNU/kFreeBSD (thanks to Aurelien Jarno).
+    Closes: #327702
+  * Added patch that had already been present on the 4.1 branch which
+    makes the "status" command of the init script more sensible
+    (thanks to Stephen Gildea). Closes: #311836
+  * Added Vietnamese Debconf translation (thanks to Clytie Siddal).
+    Closes: #313006
+  * Updated German Debconf translation (thanks to Jens Seidel).
+    Closes: #313957
+  * Corrected comments in example debian-log-rotate.conf. The default is
+    unlike the mysql-server-4.1 package which needed to stay backwards
+    compatible now 2 to avoid filling up the disk endlessly.
+  * Fixed watch file to be "-beta" aware.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 15 Sep 2005 20:50:19 +0200
+
+mysql-dfsg-5.0 (5.0.12beta-1) unstable; urgency=medium
+
+  * Christian Hammers:
+    - New upstream release.
+    - Changed build-dep to libreadline5-dev as requested by Matthias Klose.
+      Closes: #326316
+    - Applied fix for changed output format of SHOW MASTER LOGS for
+      binary log rotation (thanks to Martin Krueger). Closes: #326427, #326427
+    - Removed explicit setting of $PATH as I saw no sense in it and
+      it introduced a bug (thanks to Quim Calpe). Closes: #326769
+    - Removed PID file creation from /etc/init.d/mysql-ndb as it does
+      not work with this daemon (thanks to Quim Calpe).
+    - Updated French Debconf translation (thanks to Christian Perrier).
+      Closes: #324805
+    - Moved conflicts line in debian/control from libmysqlclient15 to
+      libmysqlclient15-dev and removed some pre-sarge conflicts as
+      suggested by Adam Majer. Closes: #324623
+  * Sean Finney:
+    - For posterity, CAN-2005-2558 has been fixed since 5.0.7beta.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 15 Sep 2005 19:58:22 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-3) unstable; urgency=low
+
+  * Temporarily build only with -O2 to circumvent gcc internal errors
+    (thanks to Matthias Klose). Related to: #321165 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 18 Aug 2005 15:44:04 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-2) unstable; urgency=low
+
+  * Fixed README.Debian regarding the status of mysql-doc.
+  * Added "set +e" around chgrp in mysql-server-5.0.preinst to
+    not fail on .journal files (thanks to Christophe Nowicki).
+    Closes: #318435
+
+ -- Christian Hammers <ch@debian.org>  Sun, 14 Aug 2005 18:02:08 +0200
+
+mysql-dfsg-5.0 (5.0.11beta-1) unstable; urgency=low
+
+  * New upstream version. 
+  * Added Danish Debconf translations (thanks to Claus Hindsgaul).
+    Closes: #322384
+  * Updated Czech Debconf translations (thanks to Miroslav Kure).
+    Closes: #321765
+
+ -- Christian Hammers <ch@debian.org>  Sat, 13 Aug 2005 11:56:15 +0000
+
+mysql-dfsg-5.0 (5.0.10beta-1) unstable; urgency=low
+
+  * New upstream release.
+  * Christian Hammers:
+    - Added check for mounted /proc to debian/rules.
+  * Sean Finney:
+    - fix for fix_mysql_privilege_tables/mysql_fix_privilege_tables typo
+      in mysql-server-5.0's README.Debian (see #319838).
+
+ -- Christian Hammers <ch@debian.org>  Sun, 31 Jul 2005 00:30:45 +0200
+
+mysql-dfsg-5.0 (5.0.7beta-1) unstable; urgency=low
+
+  * Second try for new upstream release. 
+  * Renamed mysql-common-5.0 to mysql-common as future libmysqlclient16
+    from e.g. MySQL-5.1 would else introduce mysql-common-5.1 which makes
+    a simultaneous installation of libmysqlclient14 impossible as that
+    depends on either mysql-common or mysql-common-5.0 but not on future
+    versions. Thus we decided to always let the newest MySQL version
+    provide mysql-common.
+  * Added ${misc:Depends} as suggested by debhelper manpage. 
+  * Raised standard in control file to 3.6.2.
+  * Removed DH_COMPAT from rules in favour of debian/compat.
+  * Checks for presence of init script before executing it in preinst.
+    Refers to: #315959
+  * Added 60_includes_mysys.h__gcc40.dpatch for GCC-4.0 compatibility.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 29 Jun 2005 00:39:05 +0200
+
+mysql-dfsg-5.0 (5.0.5beta-1) unstable; urgency=low
+
+  * New major release! Still beta so be careful...
+  * Added federated storage engine.
+
+ -- Christian Hammers <ch@debian.org>  Wed,  8 Jun 2005 19:29:45 +0200
+
+mysql-dfsg-4.1 (4.1.12-1) unstable; urgency=low
+
+  * Christian Hammers:
+    - New upstream release.
+    - Disabled BerkeleyDB finally. It has been obsoleted by InnoDB.
+  * Sean Finney:
+    - Updated French translation from Christian Perrier (Closes: #310526).
+    - Updated Japanese translation from Hideki Yamane (Closes: #310263).
+    - Updated Russian translation from Yuriy Talakan (Closes: #310197).
+
+ -- Christian Hammers <ch@debian.org>  Sat,  4 Jun 2005 05:49:11 +0200
+
+mysql-dfsg-4.1 (4.1.11a-4) unstable; urgency=high
+
+  * Fixed FTBFS problem which was caused due to the fact that last upload's
+    BerkeleyDB patch was tried to be applied on all architectures and not only
+    on those where BerkeleyDB is actually being built. Closes: #310296
+
+ -- Christian Hammers <ch@debian.org>  Mon, 23 May 2005 00:54:51 +0200
+
+mysql-dfsg-4.1 (4.1.11a-3) unstable; urgency=high
+
+  * Added patch from Piotr Roszatycki to compile the bundled db3 library
+    that is needed for the BerkeleyDB support with versioned symbols so
+    that mysqld no longer crashes when it gets linked together with the
+    Debian db3 version which happens when e.g. using libnss-db.
+    Closes: #308966
+
+ -- Christian Hammers <ch@debian.org>  Thu, 19 May 2005 01:41:14 +0200
+
+mysql-dfsg-4.1 (4.1.11a-2) unstable; urgency=high
+
+  * Okay, the hackery with /var/lib/dpkg/info/mysql-server.list will not
+    stand and is removed from the preinst of mysql-server.
+  * New workaround for the symlink problem that does not involve mucking
+    with dpkg's file lists is storing the symlinks in a temporary location
+    across upgrades.
+    As this sometimes fails since apt-get does not always call new.preinst
+    before old.postrm, some remarks were added to README.Debian and the
+    Debconf installation notes to minimize the inconvenience this causes.
+
+ -- sean finney <seanius@debian.org>  Sun, 15 May 2005 10:25:31 -0400
+
+mysql-dfsg-4.1 (4.1.11a-1) unstable; urgency=high
+
+  * Added the "a" to the version number to be able to upload a new
+    .orig.tar.gz file which now has the non-free Docs/ directory removed
+    as this has been forgotten in the 4.1.11 release (thanks to Goeran
+    Weinholt). Closes: #308691
+  * The Woody package listed /var/lib/mysql and /var/log/mysql in its
+    /var/lib/dpkg/info/mysql-server.list. These directories are often
+    replaced by symlinks to data partitions which triggers a dpkg bug
+    that causes these symlinks to be removed on upgrades. The new preinst
+    prevents this by removing the two lines from the .list file
+    (thanks to Andreas Barth and Jamin W. Collins). See dpkg bug #287978.
+  * Updated French Debconf translation (thanks to Christian Perrier).
+    Closes: #308353
+
+ -- Christian Hammers <ch@debian.org>  Thu, 12 May 2005 21:52:46 +0200
+
+mysql-dfsg-4.1 (4.1.11-3) unstable; urgency=high
+
+  * The "do you want to remove /var/lib/mysql when purging the package" flag
+    from old versions is removed once this package is being installed so
+    that purging an old Woody mysql-server package while having a
+    mysql-server-4.1 package installed can no longer lead to the removal of
+    all databases. Additionally clarified the wording of this version's Debconf
+    template and added a check that skips this purge in the postrm script
+    if another mysql-server* package has /usr/sbin/mysqld installed.
+    (thanks to Adrian Bunk for spotting that problem) Closes: #307473
+  * Cronfile was not being installed as the filename was not in the
+    correct format for "dh_installcron --name" (thanks to Tomislav
+    Gountchev). Closes: #302712
+
+ -- Christian Hammers <ch@debian.org>  Sat, 23 Apr 2005 22:55:15 +0200
+
+mysql-dfsg-4.1 (4.1.11-2) unstable; urgency=low
+
+  * Sean Finney:
+    - don't freak out if we can't remove /etc/mysql during purge.
+    - debian/rules clean works again.
+  * Christian Hammers:
+    - Fixed typo in README.Debian (thanks to Joerg Rieger). Closes: #304897
+    - Completely removed the passwordless test user as it was not only
+      insecure but also lead to irritations as MySQL checks first the
+      permissions of this user and then those of a password having one.
+      See bug report from Hilko Bengen for details. Closes: #301741
+
+ -- Christian Hammers <ch@debian.org>  Sat, 16 Apr 2005 15:55:00 +0200
+
+mysql-dfsg-4.1 (4.1.11-1) unstable; urgency=low
+
+  * New upstream version. 
+  * Upstream fix for charset/collation problem. Closes: #282256
+  * Upstream fix for subselect crash. Closes: #297687
+  * Corrected minor issue in Debconf template regarding skip-networking
+    (thanks to Isaac Clerencia). Closes: #303417
+  * Made dependency to gawk unnecessary (thanks to Zoran Dzelajlija).
+    Closes: #302284
+  * Removed obsolete 50_innodb_mixlen.dpatch.
+  * Removed obsolete 51_CAN-2004-0957_db_grant_underscore.dpatch.
+
+ -- Christian Hammers <ch@debian.org>  Fri,  8 Apr 2005 00:23:53 +0200
+
+mysql-dfsg-4.1 (4.1.10a-7) unstable; urgency=low
+
+  * Sean Finney:
+    - fix for the mysteriously disappeared cronjob.  thanks to
+      Peter Palfrader <weasel@debian.org> for pointing out this omission.
+      (closes: #302712).
+
+ -- sean finney <seanius@debian.org>  Sat, 02 Apr 2005 16:54:13 -0500
+
+mysql-dfsg-4.1 (4.1.10a-6) unstable; urgency=high
+
+  * Sean Finney:
+    - the previous upload did not completely address the issue.  this one
+      should do so.  d'oh.
+
+ -- sean finney <seanius@debian.org>  Thu, 31 Mar 2005 03:35:50 +0000
+
+mysql-dfsg-4.1 (4.1.10a-5) unstable; urgency=high
+
+  * Sean Finney:
+    - the following security issue is addressed in this upload:
+      CAN-2004-0957 (grant privilege escalation on tables with underscores)
+      thanks to sergei at mysql for all his help with this.
+
+ -- sean finney <seanius@debian.org>  Wed, 30 Mar 2005 21:19:26 -0500
+
+mysql-dfsg-4.1 (4.1.10a-4) unstable; urgency=low
+
+  * Sean Finney:
+    - FTBFS fix for amd64/gcc-4.0.  Thanks to Andreas Jochens <aj@andaco.de>
+      for reporting this (closes: #301807).
+    - ANSI-compatible quoting fix in daily cron job.  thanks to 
+      Karl Hammar <karl@aspodata.se> for pointing out the problem in
+      the 4.0 branch.
+    - Added myself as a co-maintainer in the control file (closes: #295312).
+
+ -- sean finney <seanius@debian.org>  Tue, 29 Mar 2005 18:54:42 -0500
+
+mysql-dfsg-4.1 (4.1.10a-3) unstable; urgency=low
+
+  * BerkeleyDB is now disabled by default as its use is discouraged by MySQL.
+  * Added embedded server libraries as they finally do compile.
+    They are currently in libmysqlclient-dev as they are still 
+    experimental and only available as .a library (thanks to Keith Packard).
+    Closes: #297062
+  * Fixed obsolete "tail" syntax (thanks to Sven Mueller). Closes: #301413
+  * Added CAN numbers for the latest security bugfix upload.
+  * Updated manpage of mysqlmanager (thanks to Justin Pryzby). Closes: #299844
+  * Added comments to default configuration.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 20 Mar 2005 17:40:18 +0100
+
+mysql-dfsg-4.1 (4.1.10a-2) unstable; urgency=low
+
+  * Disabled "--with-mysqld-ldflags=-all-static" as it causes sig11 crashes
+    if LDAP is used for groups in /etc/nsswitch.conf. Confirmed by Sean Finney
+    and Daniel Dehennin. Closes: #299382
+
+ -- Christian Hammers <ch@debian.org>  Mon, 14 Mar 2005 03:01:03 +0100
+
+mysql-dfsg-4.1 (4.1.10a-1) unstable; urgency=high
+
+  * SECURITY:
+    - The following security related updates are addressed: 
+      CAN-2005-0711 (temporary file creation with "CREATE TEMPORARY TABLE")
+      CAN-2005-0709 (arbitrary library injection in udf_init())
+      CAN-2005-0710 (arbitrary code execution via "CREATE FUNCTION")
+      Closes: #299029, #299031, #299065
+  * New Upstream Release.
+    - Fixes some server crash conditions.
+    - Upstream includes fix for TMPDIR overriding my.cnf tmpdir setting
+      Closes: #294347
+    - Fixes InnoDB error message. Closes: #298875
+    - Fixes resource limiting. Closes: #285044
+  * Improved checking whether or not the server is alive in the init script
+    which should make it possible to run several mysqld instances in
+    different chroot environments. Closes: #297772
+  * Fixed cron script name as dots are not allowed (thanks to Michel
+    v/d Ven). Closes: #298447
+  * Added -O3 and --with-mysqld-ldflags=-all-static as MySQL recommends to
+    build the server binary statically in order to gain about 13% more
+    performance (thanks to Marcin Kowalski).
+  * Added patch to let mysqld_safe react to signals (thanks to Erich 
+    Schubert). Closes: #208364
+  * (Thanks to Sean Finney for doing a great share of work for this release!)
+
+ -- Christian Hammers <ch@debian.org>  Thu,  3 Mar 2005 02:36:39 +0100
+
+mysql-dfsg-4.1 (4.1.10-4) unstable; urgency=medium
+
+  * Fixed bug that prevented MySQL from starting after upgrades.
+    Closes: #297198, #296403
+  * Added comment about logging to syslog to the default my.cnf
+    and the logrotate script (thanks to Ryszard Lach). Closes: #295507
+
+ -- Christian Hammers <ch@debian.org>  Thu,  3 Mar 2005 00:28:02 +0100
+
+mysql-dfsg-4.1 (4.1.10-3) unstable; urgency=low
+
+  * Sean Finney: Cronjobs now exit silently when the server package
+    has been removed but not purged (thanks to Vineet Kumar).
+    Closes: #297404
+  * Fixed comments of /etc/mysql/debian-log-rotate.conf (thanks to
+    Philip Ross). Closes: #297467
+  * Made mysqld_safe reacting sane on signals (thanks to Erich Schubert).
+    Closes: #208364
+ 
+ -- Christian Hammers <ch@debian.org>  Tue,  1 Mar 2005 19:44:34 +0100
+
+mysql-dfsg-4.1 (4.1.10-2) unstable; urgency=low
+
+  * Converted to dpatch.
+  * debian/ is now maintained via Subversion on svn.debian.org. 
+
+ -- Christian Hammers <ch@debian.org>  Tue,  1 Mar 2005 02:16:36 +0100
+
+mysql-dfsg-4.1 (4.1.10-1) unstable; urgency=low
+
+  * New upstream version.
+  * Upstream fixed memleak bug. Closes: #205587
+  * Added debian/copyright.more for personal reference.
+  * Lowered default query cache size as suggested by Arjen from MySQL.
+  * Switched from log to log-bin as suggested by Arjen from MySQL.
+  * Fixed typo in my.cnf (thanks to Sebastian Feltel). Closes: #295247
+  * Replaced --defaults-extra-file by --defaults-file in Debian scripts
+    as former lets password/host etc be overwriteable by /root/.my.cnf.
+    Added socket to /etc/mysql/debian.cnf to let it work. (thanks to
+    SATOH Fumiyasu). Closes: #295170
+
+ -- Christian Hammers <ch@debian.org>  Tue, 15 Feb 2005 23:47:02 +0100
+
+mysql-dfsg-4.1 (4.1.9-4) unstable; urgency=low
+
+  * Improved the way mysqld is started and registered with update-rc.d
+    in cases where the admin modifies the runlevel configuration.
+    Most notably removed the debconf question whether or not mysql should
+    start on when booting. Closes: #274264
+  * Renamed configuration option old-passwords to the more preferred
+    naming convention old_passwords. Same for some others (thanks to
+    Patrice Pawlak). Closes: #293983
+
+ -- Christian Hammers <ch@debian.org>  Tue,  8 Feb 2005 02:21:18 +0100
+
+mysql-dfsg-4.1 (4.1.9-3) unstable; urgency=low
+
+  * Renamed ca_ES.po to ca.po to reach a broader audience (thanks to 
+    Christian Perrier). Closes: #293786 
+  * Explicitly disabled mysqlfs support as it has never been enabled by
+    configure during the autodetection but fails due to broken upstream
+    code when users try to build the package themselves while having
+    liborbit-dev installed which triggers the mysqlfs autodetection
+    (thanks to Max Kellermann). Closes: #293431
+  * Added dependencies to gawk as one script does not work with original-awk
+    (thanks to Petr Ferschmann). Closes: #291634
+
+ -- Christian Hammers <ch@debian.org>  Sun,  6 Feb 2005 23:33:11 +0100
+
+mysql-dfsg-4.1 (4.1.9-2) unstable; urgency=high
+
+  * SECURITY:
+    For historical reasons /usr/share/mysql/ was owned and writable by
+    the user "mysql". This is a security problem as some scripts that
+    are run by root are in this directory and could be modified and used
+    by a malicious user who already has mysql privileges to gain full root
+    rights (thanks to Matt Brubeck). Closes: #293345
+  * Changed "skip-networking" to "bind-address 127.0.0.1" which is more
+    compatible and not less secure but maybe even more, as less people enable
+    networking for all interfaces (thanks to Arjen Lentz).
+  * Enabled InnoDB by default as recommended by Arjen Lentz from MySQL.
+  * Added remarks about hosts.allow to README.Debian (thanks to David
+    Chappell). Closes: #291300
+  * mysql-server-4.1 now provides mysql-server (thanks to Paul van den Berg).
+    Closes: #287735
+
+ -- Christian Hammers <ch@debian.org>  Wed,  2 Feb 2005 23:31:55 +0100
+
+mysql-dfsg-4.1 (4.1.9-1) unstable; urgency=low
+
+  * New upstream version.
+  * mysql-client-4.1 now provides "mysql-client" so that packages depending
+    on mysql-client (ca. 40) can now be used with MySQL-4.1, too.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 23 Jan 2005 22:52:48 +0100
+
+mysql-dfsg-4.1 (4.1.8a-6) unstable; urgency=high
+
+  * SECURITY:
+    Javier Fernandez-Sanguino Pena from the Debian Security Audit Project
+    discovered a temporary file vulnerability in the mysqlaccess script of
+    MySQL that could allow an unprivileged user to let root overwrite
+    arbitrary files via a symlink attack and could also unveil the
+    contents of a temporary file which might contain sensitive information.
+    (CAN-2005-0004, http://lists.mysql.com/internals/20600) Closes: #291122
+
+ -- Christian Hammers <ch@debian.org>  Tue, 18 Jan 2005 23:11:48 +0100
+
+mysql-dfsg-4.1 (4.1.8a-5) unstable; urgency=medium
+
+  * Fixed important upstream bug that causes from_unixtime(0) to return
+    NULL instead of "1970-01-01 00:00:00" which fails on NOT NULL columns.
+    Closes: #287792
+  * Fixes upstream bug in mysql_list_fields() . Closes: #282486
+  * Fixes bug that lead to double rotated logfiles when mysql-server 4.0
+    was previously installed (thanks to Olaf van der Spek). Closes: #289851
+  * Fixed typo in README.Debian (thanks to Mark Nipper). Closes: #289131
+  * Changed max_allowed_packet in my.cnf to 16M as in 4.0.x (thanks to
+    Olaf van der Spek). Closes: #289840
+  * Updated French debconf translation (thanks to Christian Perrier).
+    Closes: #287955
+
+ -- Christian Hammers <ch@debian.org>  Thu, 13 Jan 2005 01:29:05 +0100
+
+mysql-dfsg-4.1 (4.1.8a-4) unstable; urgency=low
+
+  * Broken patch again :-(
+
+ -- Christian Hammers <ch@debian.org>  Sun,  9 Jan 2005 23:47:55 +0100
+
+mysql-dfsg-4.1 (4.1.8a-3) unstable; urgency=low
+
+  * The mutex patch was a bit too x86 centric. This broke the alpha build.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  9 Jan 2005 14:18:49 +0100
+
+mysql-dfsg-4.1 (4.1.8a-2) unstable; urgency=medium
+ 
+  * Some Makefiles that were patched by me got overwritten by the GNU
+    autotools, probably because I also patched ./configure. Fixed now,
+    the critical mutex patch is now back in again. Closes: #286961
+  * Added patch to make MySQL compile on ARM (thanks to Adam Majer).
+    Closes: #285071
+
+ -- Christian Hammers <ch@debian.org>  Thu,  6 Jan 2005 09:30:13 +0100
+
+mysql-dfsg-4.1 (4.1.8a-1) unstable; urgency=medium
+
+  * Upstream 4.1.8 had some problems in their GNU Autotools files so they
+    released 4.1.8a. Debian's 4.1.8 was fixed by running autoreconf but this
+    again overwrote MySQL changes to ltmain.sh which are supposed to fix some
+    problems on uncommon architectures (maybe the FTBFS on alpha, arm, m68k
+    and sparc?).
+  * libmysqlclient_r.so.14 from 4.1.8-3 also missed a link dependency to
+    libz which lead to unresolved symbols visible with "ldd -r" (thanks
+    to Laurent Bonnaud). Closes: #287573
+
+ -- Christian Hammers <ch@debian.org>  Wed, 29 Dec 2004 14:26:33 +0100
+
+mysql-dfsg-4.1 (4.1.8-3) unstable; urgency=low
+
+  * Fixed checking for error messages by forcing english language
+    output by adding LC_ALL=C to debian-start (thanks to Rene
+    Konasz) Closes: #285709
+  * Fixed bashisms in Debian scripts. Closes: #286863
+  * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+    Closes: #287003
+  * Improved 4.0 to 4.1 upgrade if /var/lib/mysql is a symlink
+    (thanks to Thomas Lamy). Closes: #286560
+  * Added patch for FTBFS problem where no LinuxThreads can be found.
+    I don't know if this still applies but it should not hurt.
+    The patch is debian/patches/configure__AMD64-LinuxThreads-vs-NPTL.diff
+
+ -- Christian Hammers <ch@debian.org>  Sun, 26 Dec 2004 14:04:20 +0100
+
+mysql-dfsg-4.1 (4.1.8-2) unstable; urgency=low
+
+  * If /var/lib/mysql is a symlink then it is kept as such.
+  * Added the old-passwords option to the default my.cnf to stay
+    compatible to clients that are still compiled to libmysqlclient10
+    and libmysqlclient12 for licence reasons. 
+  * Adjusted tetex build-deps to ease backporting (thanks to Norbert
+    Tretkowski from backports.org).
+
+ -- Christian Hammers <ch@debian.org>  Tue, 21 Dec 2004 01:00:27 +0100
+
+mysql-dfsg-4.1 (4.1.8-1) unstable; urgency=medium
+
+  * New upstream version. Closes: #286175
+  * Added conflict to libmysqlclient-dev (thanks to Adam Majer).
+    Closes: #286538
+  * Added debconf-updatepo to debian/rules:clean.
+  * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+    Closes: #285107
+  * Updated French Debconf translation (thanks to Christian Perrier).
+    Closes: #285977
+  * Renamed cz.po to cs.po (thanks to Miroslav Kure). Closes: #285438
+  * Applied patch for changed server notice to debian-start (thanks to
+    Adam Majer). Closes: #286035
+  * Changed nice value in default my.cnf as nohup changed its behaviour
+    (thanks to Dariush Pietrzak). Closes: #285446
+  * Increased verbosity of preinst script in cases where it cannot stop
+    a running server (thanks to Jan Minar). Closes: #285982
+  * Split the code parts of /etc/mysql/debian-start to
+    /usr/share/mysql/debian-start.inc.sh (thanks to Jan Minar).
+    Closes: #285988
+
+ -- Christian Hammers <ch@debian.org>  Mon, 20 Dec 2004 00:33:21 +0100
+
+mysql-dfsg-4.1 (4.1.7-4) unstable; urgency=medium
+
+  * Removed OpenSSL support.
+    After a short discussion with MySQL, I decided to drop OpenSSL support as
+    1. MySQL started shipping their binaries without it, too and do not
+       seem to support it in favour of using a different library somewhen.
+    2. MySQL did not adjust their licence to grant permission to link
+       against OpenSSL.
+    3. Even if they did, third parties who use libmysqlclient.so often
+       do not realise licencing problems or even do not want OpenSSL.
+    (thanks to Jordi Mallach and the responders to MySQL bug #6924)
+    Closes: #283786
+  * debian/control: Improved depends and conflicts to mysql-4.0.
+
+ -- Christian Hammers <ch@debian.org>  Thu,  2 Dec 2004 22:02:28 +0100
+
+mysql-dfsg-4.1 (4.1.7-3) unstable; urgency=low
+
+  * Raised version to make it higher as the one in experimental. 
+
+ -- Christian Hammers <ch@debian.org>  Wed,  1 Dec 2004 21:09:20 +0100
+
+mysql-dfsg-4.1 (4.1.7-2) unstable; urgency=low
+
+  * Patched scripts/mysql_install_db so that it no longer creates a
+    passwordless test database during installation (thanks to Patrick
+    Schnorbus). Closes: #281158
+  * Added Czech debconf translation (thanks to Miroslav Kure).
+    Closes: #283222
+
+ -- Christian Hammers <ch@debian.org>  Wed,  1 Dec 2004 01:29:31 +0100
+
+mysql-dfsg-4.1 (4.1.7-1) unstable; urgency=low
+
+  * New upstream branch! 
+  * Adjusted debian/control to make this package suitable to get parallel
+    to version 4.0.x into unstable and sarge. The package names are
+    different so that "mysql-server" still defaults to the rock-stable
+    4.0 instead to this announced-to-be-stable 4.1.
+  * Added --with-mutex=i86/gcc-assemler to the Berkeley-DB configure
+    to prevent the use of NPTL threads when compiling under kernel 2.6
+    because the binaries are otherwise not runnable on kernel 2.4 hosts.
+    Closes: #278638, #274598 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 31 Oct 2004 20:15:03 +0100
+
+mysql-dfsg (4.1.6-1) experimental; urgency=low
+
+  * New upstream version.
+  * Fixed symlinks in libmysqlclient-dev package. Closes: #277028
+  * This time I did not update the libtool files as they were pretty
+    up to date and I want to have a shorter diff file.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 20 Oct 2004 00:07:58 +0200
+
+mysql-dfsg (4.1.5-3) experimental; urgency=low
+
+  * debian/postinst: mysql_install_db changed parameter from --IN-RPM
+    to --rpm which caused problems during installs. Closes: #276320
+
+ -- Christian Hammers <ch@debian.org>  Sat, 16 Oct 2004 20:36:46 +0200
+
+mysql-dfsg (4.1.5-2) experimental; urgency=low
+
+  * Activated support for ndb clustering (thanks to Kevin M. Rosenberg).
+    Closes: #275109
+
+ -- Christian Hammers <ch@debian.org>  Wed,  6 Oct 2004 01:58:00 +0200
+
+mysql-dfsg (4.1.5-1) experimental; urgency=low
+
+  * WARNING:
+    The upstream branch 4.1 is still considered BETA.
+    The Debian packages for 4.1 were done without big testing. If you miss
+    a new functionality or binary, contact me and I will add the relevant
+    configure option or include the program.
+  * New MAJOR upstream version.
+    Thanks to the great demand here's now the first MySQL 4.1 experimental
+    release. FEEDBACK IS WELCOME.
+  * 4.0->4.1 notes:
+    - debian/patches/alpha.diff could not be applied, I fix that later
+    - debian/patches/scripts__mysql_install_db.sh.diff was obsolete
+    - debian/patches/scripts__Makefile.in was necessary due to a dependency
+      to the removed non-free Docs/ directory. Upstream has been contacted.
+    - Build-Deps: += automake1.7
+    - debian/rules: embedded servers examples did not compile, removed
+
+ -- Christian Hammers <ch@debian.org>  Sun, 26 Sep 2004 19:46:47 +0200
+
+mysql-dfsg (4.0.21-3) unstable; urgency=low
+
+  * Upstream tried to fix a security bug in mysqlhotcopy and broke it :-)
+    Applied a patch (see debian/patches) from Martin Pitt. Closes: #271632
+  * Between 4.0.20 and 4.0.21 the Debian specific changes in
+    /usr/bin/mysqld_safe that piped the error log to syslog got lost
+    and are now back again. 
+  * Fixed capitalization in debconf headings.
+  * Changed wording of the initscript status message to make heartbeat
+    happier. Closes: #271591
+
+ -- Christian Hammers <ch@debian.org>  Fri, 17 Sep 2004 18:42:25 +0200
+
+mysql-dfsg (4.0.21-2) unstable; urgency=medium
+
+  * The dependencies between mysql-client and libmysqlclient12 were
+    too loose, when upgrading only the client this can lead to non working
+    binaries due to relocation errors (thanks to Dominic Cleal).
+    Closes: #271803
+  * Fixed typo in mysqldump.1 manpage (thanks to Nicolas Francois).
+    Closes: #271334
+
+ -- Christian Hammers <ch@debian.org>  Wed, 15 Sep 2004 15:38:11 +0200
+
+mysql-dfsg (4.0.21-1) unstable; urgency=high
+
+  * SECURITY:
+    This upstream version fixes some security problems that might at least
+    allow a DoS attack on the server.
+    * Fixed an old bug in concurrent accesses to `MERGE' tables (even
+      one `MERGE' table and `MyISAM' tables), that could've resulted in
+      a crash or hang of the server. (Bug #2408)
+    * Fixed bug in privilege checking where, under some conditions, one
+      was able to grant privileges on the database, he has no privileges
+      on. (Bug #3933)
+    * Fixed crash in `MATCH ... AGAINST()' on a phrase search operator
+      with a missing closing double quote. (Bug #3870)
+    * Fixed potential memory overrun in `mysql_real_connect()' (which
+      required a compromised DNS server and certain operating systems).
+      (Bug #4017)
+  * New upstream version.
+    * Fixes bug that made x="foo" in WHERE sometimes the same as x="foo ".
+      Closes: #211618
+  * Updated Japanese Debconf translation (thanks to Hideki Yamane).
+    Closes: #271097
+
+ -- Christian Hammers <ch@debian.org>  Sat, 11 Sep 2004 23:15:44 +0200
+
+mysql-dfsg (4.0.20-14) unstable; urgency=low
+
+  * Dave Rolsky spotted that -DBIG_JOINS was not properly enabled.
+    It allows joining 64 instead of 32 tables.
+
+ -- Christian Hammers <ch@debian.org>  Thu,  9 Sep 2004 20:24:02 +0200
+
+mysql-dfsg (4.0.20-13) unstable; urgency=medium
+
+  * Fixed a bug in the initscript which caused the check for not properly
+    closed i.e. corrupt tables that is executed when the server starts
+    not to run in background as supposed.
+    Although the check does not repair anything on servers with several
+    thousand tables the script was reported to take some minutes which
+    is quite annoying. (Thanks to Jakob Goldbach). Closes: #270800
+
+ -- Christian Hammers <ch@debian.org>  Thu,  9 Sep 2004 17:11:05 +0200
+
+mysql-dfsg (4.0.20-12) unstable; urgency=medium
+
+  * Filter messages regarding table handles that do not support CHECK TABLE
+    in the script that checks for corrupted tables on every start which lead
+    to unnecessary mails (thanks to David Everly). Closes: #269811 
+  * Added a note to the corrupt-table-check mail which notes that a
+    false-positive is reported in the case that immediately after starting
+    the server a client starts using a table (thanks to Uwe Kappe).
+    Closes: #269985
+  * Added "quote-names" as default to the [mysqldump] section in
+    /etc/mysql/my.cnf as too many users stumble over dump files that
+    could not be read in again due to the valid use of reserved words
+    as table names. This has also be done by upstream in 4.1.1 and has
+    no known drawbacks. Closes: #269865
+  * Binary logs can now be rotated as well. Defaults to off, though, for
+    compatibility reasons (thanks to Mark Ferlatte). Closes: #94230, #269110
+  * The mysql user "debian-sys-maint" now gets all possible rights which
+    makes binary logging possible and helps other package maintainer who
+    wants to use it to create package specific databases and users.
+  * Added example how to change daemon nice level via /etc/mysql/my.cnf
+  * Updated French debconf translations (thanks to Christian Perrier).
+    Closes: #265811
+  * Renamed options in the default config file that still had old names
+    (thanks to Yves Kreis). Closes: #266445
+  * Fixed spelling in debconf note.
+  * Added -l and -L to dh_shlibdeps.
+
+ -- Christian Hammers <ch@debian.org>  Fri,  3 Sep 2004 20:10:46 +0200
+
+mysql-dfsg (4.0.20-11) unstable; urgency=high
+
+  * SECURITY
+    This version fixes a security flaw in mysqlhotcopy which created
+    temporary files in /tmp which had predictable filenames and such
+    could be used for a tempfile run attack.
+    The issue has been recorded as CAN-2004-0457.
+
+ -- Christian Hammers <ch@debian.org>  Sat, 14 Aug 2004 18:27:19 +0200
+
+mysql-dfsg (4.0.20-10) unstable; urgency=low
+
+  * MySQL finally updated their copyright page and installed v1.5 of
+    the "Free/Libre and Open Source Software License (FLOSS) - Exception"
+    which will hopefully end the license hell they created by putting the
+    client libraries under GPL instead of LGPL which conflicts with PHP and
+    other software that used to link against MySQL.
+    The license text is not yet in any release MySQL version but visible
+    on their web site and copied into the debian/copyright file.
+    Special thanks to Zak Greant <zak@mysql.com> and the debian-legal list
+    for helping to solve this release critical problem.
+    Closes: #242449
+  * Updated Brazil debconf translation (thanks to Andre Luis Lopes).
+    Closes: #264233
+  * Updated Japanese debconf translation (thanks to Hideki Yamane).
+    Closes: #264620
+  * Fixed minor typo in debconf description (thanks to TROJETTE Mohammed
+    Adnene). Closes: #264840
+  * Improved init and preinst script which now detects stalled servers which
+    do no longer communicate but are present in the process list (thanks to
+    Henrik Johansson). Closes: #263215
+
+ -- Christian Hammers <ch@debian.org>  Mon,  9 Aug 2004 19:44:28 +0200
+
+mysql-dfsg (4.0.20-9) unstable; urgency=medium
+
+  * Partly reverted the last patch which gave the mysql-user
+    "debian-sys-maint" more rights as there are old versions of MySQL which
+    have fewer privilege columns. Now only those are set (thanks to Alan Tam).
+    Closes: #263111
+
+ -- Christian Hammers <ch@debian.org>  Tue,  3 Aug 2004 13:03:02 +0200
+
+mysql-dfsg (4.0.20-8) unstable; urgency=low
+
+  * The mysqlcheck that is started from the initscript will now be
+    backgrounded because it might else prevent the boot process to continue.
+    It also now notifies root by mail and syslog if a table is corrupt.
+  * The "debian-sys-maint" MySQL user now has almost full rights so that other
+    packages might use this account to create databases and user (thanks to
+    Andreas Barth). Closes: #262541
+  * Added paranoid rules for logcheck.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  1 Aug 2004 21:00:55 +0200
+
+mysql-dfsg (4.0.20-8) unstable; urgency=low
+
+  * Upload stalled. Not released.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  1 Aug 2004 20:27:55 +0200
+
+mysql-dfsg (4.0.20-7) unstable; urgency=medium
+
+  * Solved the upstream bug that error messages of the server are written
+    in a file that is then rotated away leaving mysqld logging effectively
+    to /dev/null. It now logs to a /usr/bin/logger process which puts the
+    messages into the syslog.
+    Modified files: /etc/init.d/mysql, /usr/bin/mysqld_safe and the 
+    logchecker files. Closes: #254070
+  * The initscript does no longer call mysqlcheck directly but via
+    /etc/mysql/debian-start which is a user customizable config script.
+  * Split the debconf "install and update notes" and only show them
+    when it is appropriate (thanks to Steve Langasek). Closes: #240515
+  * Added NEWS.Debian.
+  * Added hint to -DBIG_ROWS, which is currently not used, to README.Debian.
+  * Corrected typo in myisampack manpage (thanks to Marc Lehmann). 
+    Closes: #207090
+  * Added Catalan debconf translation (thanks to Aleix Badia i Bosch).
+    Closes: #236651
+
+ -- Christian Hammers <ch@debian.org>  Wed, 28 Jul 2004 01:41:51 +0200
+
+mysql-dfsg (4.0.20-6) unstable; urgency=low
+
+  * The build arch detected by configure was "pc-linux-gnu (i686)"
+    instead of "pc-linux-gnu (i386)". Was no problem AFAIK but
+    Adam Majer asked me to explicitly change it to i386. Closes: #261382
+  * Removed some unused shell scripts from /usr/share/mysql.
+  * Added lintian overrides.
+  * Removed rpath by using chrpath.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 26 Jul 2004 00:17:12 +0200
+
+mysql-dfsg (4.0.20-5) unstable; urgency=medium
+
+  * The mysqlcheck in the init script is only called when the server
+    is really alive. Also, the mysql-user 'debian-sys-maint' now has
+    global select rights (thanks to Nathan Poznick). Closes: #261130 
+  * Moved the debconf question whether to remove the databases or not
+    from mysql-server.config to mysql-server.postrm so that it shows
+    up on purge time and not months earlier (thanks to Wouter Verhelst).
+    Closes: #251838
+
+ -- Christian Hammers <ch@debian.org>  Fri, 23 Jul 2004 22:41:13 +0200
+
+mysql-dfsg (4.0.20-4) unstable; urgency=low
+
+  * Added a "mysqlcheck -A --fast" to the 'start' section of the
+    init script to help admins detect corrupt tables after a server crash.
+    Currently it exits with an error message but leaves the server
+    running. Feedback appreciated!
+  * Made postinst script more robust by calling db_stop earlier and
+    so prevent pipe-deadlocks.
+  * Fixed minor typos in initscript (thanks to "C.Y.M."). Closes: 259518
+  * Added the undocumented "-DBIG_JOINS" that MySQL apparently uses in
+    their MAX binaries. It enables 62 instead of 30 tables in a "join".
+    (thanks to Dave Rolsky). Closes: #260843
+  * Added a "df --portability /var/lib/mysql/." check to the preinst
+    script as users experienced hard to kill hanging mysqlds in such
+    a situation (thanks to Vaidas Pilkauskas). Closes: #260306
+
+ -- Christian Hammers <ch@debian.org>  Fri, 23 Jul 2004 00:51:32 +0200
+
+mysql-dfsg (4.0.20-3) unstable; urgency=low
+
+  * Improved tolerance if the init script has been deleted (thanks to
+    Leonid Shulov for spotting the problem).
+  * Minor wording changes to README.Debian generalizing /root/ by $HOME
+    (thanks to Santiago Vila). Closes: #257725
+  * Added Japanese debconf translation (thanks to Hideki Yamane).
+    Closes: #256485
+  * Fixed comment in my.cnf regarding logfile directory (thanks to Jayen
+    Ashar). Closes: #253434
+  * Corrected "ease to" by "ease of" in package description (thanks to
+    Johannes Berg). Closes: #253510 
+
+ -- Christian Hammers <ch@debian.org>  Fri,  9 Jul 2004 00:57:42 +0200
+
+mysql-dfsg (4.0.20-2) unstable; urgency=low
+
+  * Removed RPM .spec file from the included documentation as it is pretty
+    useless (thanks to Loic Minier).
+  * Added turkish debconf translation (thanks to Recai Oktas). Closes: #252802
+
+ -- Christian Hammers <ch@debian.org>  Sun,  6 Jun 2004 14:48:26 +0200
+
+mysql-dfsg (4.0.20-1) unstable; urgency=low
+
+  * New upstream version. 
+
+ -- Christian Hammers <ch@debian.org>  Mon, 31 May 2004 23:36:39 +0200
+
+mysql-dfsg (4.0.18-8) unstable; urgency=low
+
+  * Updated french translation (thanks to Christian Perrier). Closes: #246789
+
+ -- Christian Hammers <ch@debian.org>  Tue,  4 May 2004 23:26:54 +0200
+
+mysql-dfsg (4.0.18-7) unstable; urgency=low
+
+  * Added CVE ids for the recent security fixes.
+    4.0.18-4 is CAN-2004-0381 (mysqlbug) and
+    4.0.18-6 is CAN-2004-0388 (mysql_multi)
+
+ -- Christian Hammers <ch@debian.org>  Mon, 19 Apr 2004 18:32:03 +0200
+
+mysql-dfsg (4.0.18-6) unstable; urgency=medium
+
+  * SECURITY:
+    Fixed minor tempfile-run security problem in mysqld_multi.
+    Unprivileged users could create symlinks to files which were then
+    unknowingly overwritten by root when this script gets executed.
+    Upstream informed. Thanks to Martin Schulze for finding this.
+
+ -- Christian Hammers <ch@debian.org>  Wed,  7 Apr 2004 01:28:22 +0200
+
+mysql-dfsg (4.0.18-5) unstable; urgency=low
+
+  * Little improvements in debian scripts for last upload. 
+  * Added check to logrotate script for the case that a mysql
+    server is running but not be accessible with the username and
+    password from /etc/mysql/debian.conf (thanks to Jeffrey W. Baker).
+    Closes: 239421
+
+ -- Christian Hammers <ch@debian.org>  Sun,  4 Apr 2004 15:27:40 +0200
+
+mysql-dfsg (4.0.18-4) unstable; urgency=medium
+
+  * SECURITY: 
+    Applied fix for improbable tempfile-symlink security problem in 
+    mysqlbug reported by Shaun Colley on bugtraq on 2004-03-24.
+  * Updated french debconf translation (thanks to Christian Perrier).
+    Closes: #236878 
+  * Updated Portuguese debconf translation (thanks to Nuno Senica).
+    Closes: #239168
+  * Updated german debconf translation (thanks to Alwin Meschede).
+    Closes: #241749
+  * Improved debconf template regarding fix_privileges_tables (thanks 
+    to Matt Zimmermann for suggestions). Closes: #219400
+  * Improved README.Debian regarding to password settings (thanks to
+    Yann Dirson). Closes: #241328
+
+ -- Christian Hammers <ch@debian.org>  Sat,  3 Apr 2004 19:52:15 +0200
+
+mysql-dfsg (4.0.18-3) unstable; urgency=medium
+
+  * Added Build-Depend to po-debconf to let it build everywhere.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 31 Mar 2004 23:43:33 +0200
+
+mysql-dfsg (4.0.18-2) unstable; urgency=low
+
+  * Added a "2>/dev/null" to a "which" command as there are two
+    "which" versions in Debian of which one needs it. Closes: #235363
+
+ -- Christian Hammers <ch@debian.org>  Tue,  2 Mar 2004 23:31:28 +0100
+
+mysql-dfsg (4.0.18-1) unstable; urgency=low
+
+  * New upstream version.
+  * Should now compile and run on ia64 (thanks to Thorsten Werner and
+    David Mosberger-Tang). Closes: #226863 #228834 
+  * Converted init scripts to invoke-rc.d (thanks to Erich Schubert).
+    Closes: 232118 
+  * Secondlast upload changed logfile location. Closes: #182655
+  * Updated Brasilian translation (thanks to Andre Luis Lopes). Closes:
+    #219847
+
+ -- Christian Hammers <ch@debian.org>  Tue, 17 Feb 2004 23:44:58 +0100
+
+mysql-dfsg (4.0.17-2) unstable; urgency=low
+
+  * Improved manpage for mysqldumpslow.1 (thanks to Anthony DeRobertis).
+    Closes: #231039
+  * Improved stopping of crashed daemons in init script (thanks to
+    Matthias Urlichs). Closes: #230327
+
+ -- Christian Hammers <ch@debian.org>  Mon,  9 Feb 2004 21:54:29 +0100
+
+mysql-dfsg (4.0.17-1) unstable; urgency=low
+
+  * Made logging into /var/log/mysql/ the default. Closes: #225206
+ 
+  * New upstream version. Closes: #225028
+  * Turned on a 25MB query cache by default (thanks to Cyril Bouthors).
+    Closes: #226789
+  * Updated russian translation (thanks to Ilgiz Kalmetev). Closes: #219263
+  * Upstream fixes the problem that AND was not commutative (thanks for
+    Iain D Broadfoot for mentioning). Closes: #227927
+  * Fixed minor typo in my.cnf comments (thanks to James Renken). 
+    Closes: #221496
+  * Better documents regex. Closes: #214952
+  * Fixed minor germanism in debconf template (thanks to Marc Haber).
+    Closes: #224148
+  * Added explaining comment to my.cnf regarding quoted passwords 
+    (Thanks to Patrick von der Hagen). Closes: #224906
+  * Changed "find -exec" to "find -print0 | xargs -0" in preinst to
+    speed it up. Thanks to Cyril Bouthors. Closes: #220229
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Jan 2004 16:16:25 +0100
+
+mysql-dfsg (4.0.16-2) unstable; urgency=low
+
+  * Tried to repair undefined weak symbols by adding a little Makefile
+    patch. Closes: #215973
+
+ -- Christian Hammers <ch@debian.org>  Mon, 27 Oct 2003 22:52:10 +0100
+
+mysql-dfsg (4.0.16-1) unstable; urgency=low
+
+  * New upstream release.
+    (Mostly little memory problems and other bugfixes it seems)
+  * Replaced "." by ":" in chown calls to comply with the env setting
+  	"_POSIX2_VERSION=2000112" (thanks to Robert Luberda). Closes: #217399
+  * Adjusted syntax in my.cnf to 4.x standard (thanks to Guillaume Plessis).
+    Closes: #217273
+  * Improved README.Debian password instructions (thanks to Levi Waldron).
+    Closes: #215046
+  * Improved NIS warning debconf-template (thanks to Jeff Breidenbach).
+    Closes: #215791
+  * Explicitly added libssl-dev to the libmysqlclient-dev package as it
+    is needed for mysql_config and the libmysqlclient package only depends
+    on libssl which has no unnumbered .so version (thanks to Simon Peter
+    and Davor Ocelic). Closes: #214436, #216162
+  * Added "-lwrap" to "mysql_config --libmysqld-libs" and filed it as
+    upstream bug #1650 (thanks to Noah Levitt). Closes: #214636
+
+ -- Christian Hammers <ch@debian.org>  Sat, 25 Oct 2003 01:09:27 +0200
+
+mysql-dfsg (4.0.15a-1) unstable; urgency=low
+
+  * Same package as 4.0.15-2 but I could not convince the Debian
+    installer to move the packages out of incoming.
+
+ -- Christian Hammers <ch@debian.org>  Tue,  7 Oct 2003 15:10:26 +0200
+
+mysql-dfsg (4.0.15-2) unstable; urgency=low
+
+  * Updated package description (thanks to Adrian Bunk). Closes: #210988 
+  * Fixed small typos in manpages (thanks to Nicolas Francois).
+    Closes: #211983
+  * More updates to package description (thanks to Matthias Lutz/ddtp).
+    Closes: #213456
+  * Updated standards to 3.6.1.
+  * Closes "new 4.0.15 available" bug. Closes: #213349
+  * Updated README.Debian with notes regarding the MySQL manual section
+    "2.4 Post-installation Setup and Testing" (thanks to Daniel B.).
+    Closes: #210841
+
+ -- Christian Hammers <ch@debian.org>  Fri,  3 Oct 2003 15:59:39 +0200
+
+mysql-dfsg (4.0.15-1) unstable; urgency=high
+
+  * SECURITY:
+    Users who are able to use the "ALTER TABLE" command on the "mysql"
+    database may be able to exploit this vulnerability to gain a shell with
+    the privileges of the mysql server (usually running as the 'mysql' user).
+    Closes: #210403
+  * Fixes small description typos (thanks to Oscar Jarkvik).
+  * Updated Brazilian Portuguese debconf translation. (thanks to Andre Luis
+    Lopes). Closes: 208030
+  * Replaced deprecated '.' by ':' in chown (thanks to Matt Zimmerman).
+  * Fixed manpage typo (thanks to Marc Lehmann). Closes: #207090
+
+ -- Christian Hammers <ch@debian.org>  Fri,  3 Oct 2003 15:59:35 +0200
+
+mysql-dfsg (4.0.14-1) unstable; urgency=low
+
+  * New upstream version. 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 24 Aug 2003 16:40:36 +0200
+
+mysql-dfsg (4.0.13-3) unstable; urgency=low
+
+  * Now start mysqld as default unless you choose not to when configuring
+    with debconf priority low. So packages depending on the server when
+    installing can access it. Thanks Matt Zimmermann (Closes: #200277)
+  * Made mysql-server de-installable if the config and database files were
+    removed by hand before. Thanks to Ard van Breemen (Closes: #200304)
+
+ -- Christian Hammers <ch@debian.org>  Tue,  8 Jul 2003 22:30:40 +0200
+
+mysql-dfsg (4.0.13-2) unstable; urgency=low
+
+  * Added "nice" option for mysqld_safe to give mysqld a different priority.
+    Submitted to upstream as MySQL Bug #627. Closes: #192087
+  * Fixed possible unbound variable in init script. Closes: #194621
+  * Fixed french debconf translation (thx Christian Perrier) Closes: #194739
+  * Get rid of automake1.5 (for Eric Dorland). 
+
+ -- Christian Hammers <ch@debian.org>  Wed, 11 Jun 2003 18:58:32 +0200
+
+mysql-dfsg (4.0.13-1) unstable; urgency=medium
+
+  * New upstream version.
+    !!! Fixes a very bad natural join bug which justifies the urgency=medium.
+    !!! http://bugs.mysql.com/bug.php?id=291
+  * Fixed mysql_fix_privileges manpage (Frederic Briere) Closes: #191776
+  * preinst: "which" is more chatty as a normal executable than as a builtin.
+    (Thanks to David B Harris). Closes: #188659
+
+ -- Christian Hammers <ch@debian.org>  Tue,  6 May 2003 22:03:45 +0200
+
+mysql-dfsg (4.0.12-3) unstable; urgency=medium
+
+  * Reincluded new way of creating my debian-sys-maint user from
+    an old release from experimental. Now works again with old
+    and new privilege table format. (Thanks to Vincent Danjean
+    for spotting the problem) Closes: #188201
+  * Reincluded hurd build dependency fix from 3.23 branch.
+    (Thanks to Robert Millan). Closes: #185929
+  * Fixed soname in libmysqlclient-dev.  Closes: #188160
+  * Remove /var/log/mysql/ when purging the package. Closes: #188064
+  * Removed /usr/share/doc/mysql/ from mysql-server. Closes: #188066
+  * Let group "adm" be able to read logfiles. Closes: #188067
+  * Do not call usermod on every upgrade. Closes: #188248
+    (Thanks to Philippe Troin for the last three)
+  * Fixed mysql-server.preinst so that it works on shells where 
+    which is a builtin, too. (Thanks to Erich Schubert) Closes: #181525
+
+ -- Christian Hammers <ch@debian.org>  Fri, 11 Apr 2003 11:32:45 +0200
+
+mysql-dfsg (4.0.12-2) unstable; urgency=low
+
+  *
+  * NEW MAJOR UPSTREAM RELEASE:
+  *
+    MySQL 4 has finally been declared as 'stable'. Hurray! Read changelogs.
+    Thanks to all testers, esp. Jose Luis Tallon, of the versions
+    that were in the "experimental" section before.
+  * Modified postinst script to run mysql_fix_privileges on every update.
+    IMPORTANT: Please report if this breaks anything, it is not supposed to.
+  * Wrote a SSL-MINI-HOWTO.txt!
+  * Added zlib1g-dev to libmysqlclient12-dev. Closes: 186656
+  * Changed section of libmysqlclient12-dev to libdevel.
+  * Added even more selfwritten manpages.
+  * Fixed typos.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  6 Apr 2003 13:47:32 +0200
+
+mysql-dfsg (4.0.10.gamma-1) experimental; urgency=low
+
+  * New upstream version.
+  * They merged some of my patches from debian/patches. Whoa!
+  * This release should fix the error-logfile problem where mysqld
+    keeps the error.log open while logrotate removes it.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 12 Feb 2003 22:39:48 +0100
+
+mysql-dfsg (4.0.9.gamma-1) experimental; urgency=low
+
+  * New upstream version. 
+  * Updated the GNU autoconf files to make building on MIPS work.
+    See bug #176829.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 29 Jan 2003 22:07:44 +0100
+
+mysql-dfsg (4.0.8.gamma-1) experimental; urgency=low
+
+  * New upstream release. 
+  * Improved logging of init script. Closes: #174790
+  * We have now libmysqlclient.so.12 instead of .11.
+
+ -- Christian Hammers <ch@debian.org>  Thu,  9 Jan 2003 20:14:11 +0100
+
+mysql-dfsg (4.0.7.gamma-1) experimental; urgency=high
+
+  * SECURITY: This version fixes an upstream security release that is only
+              present in the 4.x branch which is currently only in the
+              experimental distribution and therefore will not get a DSA. 
+  * New upstream release.
+
+ -- Christian Hammers <ch@debian.org>  Sat, 28 Dec 2002 15:51:39 +0100
+
+mysql-dfsg (4.0.6.gamma-2) experimental; urgency=low
+
+  * Added --system to addgroup. Closes: #173866
+
+ -- Christian Hammers <ch@debian.org>  Sat, 21 Dec 2002 15:28:26 +0100
+
+mysql-dfsg (4.0.6.gamma-1) experimental; urgency=low
+
+  * New upstream version. Now Gamma!
+  * There are no longer changes to the .orig.tar.gz necessary to make diff
+    happy. docs/ has still to be deleted, although, as it is non-free.
+  * Incorporated patches from unstable.
+  * Added mysqlmanager and a couple of other new scripts.
+  * Enabled libmysqld embedded server library.
+  * Enabled SSL and Virtual-IO support.
+    (CORBA based MySQL-FS seems to be not existing..)
+
+ -- Christian Hammers <ch@debian.org>  Fri, 20 Dec 2002 22:30:51 +0100
+
+mysql-dfsg (4.0.5a.beta-3) experimental; urgency=low
+
+  * Modified postinst to work with old and new mysql.user table format
+    and fixed spelling typo in postinst. Thanks to Roger Aich.
+  * Updated config.{guess,sub} to make the mipsel porters happy.
+    Thanks to Ryan Murray. Closes: #173553
+
+ -- Christian Hammers <ch@debian.org>  Wed, 18 Dec 2002 15:56:34 +0100
+
+mysql-dfsg (4.0.5a.beta-2) experimental; urgency=low
+
+  * Upstream removed option "--skip-gemini". So did I. Closes: 173142
+
+ -- Christian Hammers <ch@debian.org>  Tue, 17 Dec 2002 10:35:49 +0100
+
+mysql-dfsg (4.0.5a.beta-1) experimental; urgency=low
+
+  * First 4.x experimental package due to continuous user requests :-)
+    Please test and report!
+  * upstream: safe_mysqld has been renamed to mysqld_safe
+  * upstream: new library soname version libmysqlclient.so.11
+  * Renamed libmysqlclientXX-dev to libmysqlclient-dev as I don't plan to
+    support more than one development environment and this makes the 
+    dependencies easier.
+  * FIXME: Skipped parts of the debian/patches/alpha patch as the global.h 
+    is not existing.
+  * FIXME: How to get rid of this? Old ltconfig patch already applied.
+    "lintian: binary-or-shlib-defines-rpath ./usr/bin/mysql /usr/lib/mysql"
+
+ -- Christian Hammers <ch@debian.org>  Sun,  1 Dec 2002 18:32:32 +0100
+
+mysql-dfsg (3.23.53-4) unstable; urgency=medium
+
+  * Fixed errno.h problem. Closes: #168533, #168535 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 10 Nov 2002 18:32:08 +0100
+
+mysql-dfsg (3.23.53-3) unstable; urgency=medium
+
+  * Changed automake build-dep to unversioned automake1.4. Closes: #166391
+  * Fixed description. Closes: #167270
+    (Thanks to Soren Boll Overgaard)
+
+ -- Christian Hammers <ch@debian.org>  Tue,  5 Nov 2002 01:25:01 +0100
+ 
+mysql-dfsg (3.23.53-2) unstable; urgency=low
+
+  * Reverted user creation in init scripts. Closes: #166432
+    (Thanks to Birzan George Cristian) 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 31 Oct 2002 15:36:25 +0100
+
+mysql-dfsg (3.23.53-1) unstable; urgency=low
+
+  * New upstream release. 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 24 Oct 2002 23:04:16 +0200
+
+mysql-dfsg (3.23.52-3) unstable; urgency=low
+
+  * Substituted the first-install 'debian-sys-maint' user creation by
+    something ANSI SQL compliant. Closes: #163497
+    (Thanks to Karl Hammar)
+  * Tightened dependency to debhelper (>= 4.0.12) to be sure that
+    debconf-utils gets installed, too, as I use dh_installdebconf.
+  * Fixed upstream manpage bug in mysqldump.1. Closes: #159779
+    (Thanks to Colin Watson) 
+  * Added comment about MIN_WORD_LEN to mysql-server.README.Debian
+    (Thanks to Philipp Dreimann)
+  * Added a dependency for zlib1g-dev to libmysqlclient10-dev.
+    (Thanks to Jordi Mallach)
+
+ -- Christian Hammers <ch@debian.org>  Sun, 15 Sep 2002 17:14:44 +0200
+
+mysql-dfsg (3.23.52-2) unstable; urgency=low
+
+  * Fixed typo in preinst scripts.
+  * Removed bashism in init script.
+  * Fixed ambiguous debconf example. Closes: #158884
+
+ -- Christian Hammers <ch@debian.org>  Fri, 30 Aug 2002 00:51:29 +0200
+
+mysql-dfsg (3.23.52-1) unstable; urgency=low
+
+  * New upstream version. Closes: #157731
+  * Clarified the meaning of the debian-sys-maint special user in the
+    README.Debian file. Closes: #153702
+  * Wrote some words regarding the skip-networking in README.Debian.
+    Closes: #157038
+  * Added dependency to passwd. 
+  * Fixes typo and unnecessary complication in is_mysql_alive().
+  * Added check for /etc/mysql/my.cnf in init script.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 27 Aug 2002 01:53:32 +0200
+
+mysql-dfsg (3.23.51-4) unstable; urgency=low
+
+  * Added a compressed "nm mysqld" output to allow people to trace
+    core dumps with /usr/bin/resolve_stack_dump as suggested in the
+    INSTALL-SOURCE file. Thanks to atudor@labs.agilent.com for the hint.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 24 Jul 2002 20:44:55 +0200
+
+mysql-dfsg (3.23.51-3) unstable; urgency=low
+
+  * Corrected copyright file: the MySQL client library is licenced under
+    the LGPL-2 not the GPL. From version 4.x it actually will be GPL this
+    is why parts of http://www.mysql.com/ already say so. Closes: #153591
+  * Corrected german translation.
+    Thanks to Roland Rosenfeld <roland@spinnaker.de>. Closes: #151903 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 11 Jul 2002 20:32:28 +0200
+
+mysql-dfsg (3.23.51-2) unstable; urgency=low
+
+  * Improved NIS tolerance in preinst script. 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  7 Jul 2002 04:43:28 +0200
+
+mysql-dfsg (3.23.51-1) unstable; urgency=medium
+
+  * New upstream version.
+  * I applied a patch that fixes a binary incompatibility in
+    the shared library libmysqlclient.so.10 between 3.23.50 and
+    some versions earlier. Upstream has been contacted and asked
+    for clarification. Closes: #149952
+  * Added support for NIS i.e. it shows a warning and fails if the
+    needed 'mysql' user does not exists but works if it does.
+    Closes: #143282, #147869
+  * Substituted $0 in init scripts by something really weird so that
+    "./S20mysql restart" works now, too. (BTW: S20? install file-rc!!!)
+    Closes: #148658
+  * Now postinst works even if /etc/init.d/mysql is removed. Closes: #151021
+  * Decided to leave "set +x" in postinst but wrote comment. Closes: #151022
+
+ -- Christian Hammers <ch@debian.org>  Sun,  7 Jul 2002 04:43:25 +0200
+
+mysql-dfsg (3.23.50-1) unstable; urgency=medium
+
+  * New upstream version.
+    Fixes a very annoying and important bug that lets all mysql programs
+    including perl scripts etc. segfault when using the read_default_group()
+    function. 3.23.50 is currently a pre-release and expected to be released
+    next week. I plan to propose it for woody as soon as its stability has
+    been proven. The following bug reports are all regarding this issue.
+    Closes: #144960, #145322, #136798, #138143,
+
+ -- Christian Hammers <ch@debian.org>  Sat, 18 May 2002 21:14:01 +0200
+
+mysql-dfsg (3.23.49x-1) unstable; urgency=low
+
+  * I had to split the package to separate the manual as it is not GPL
+    like the rest of the software and docs but under a license that
+    e.g. forbids selling printed versions. 
+    .
+    The upstream authors were contacted a while ago but did not like to
+    change the situation.
+    .
+    The names of the resulting packages have not changed as the manual
+    already was in a separate mysql-doc package due to its size. 
+    The source packages are now split from one "mysql" to 
+    "mysql-dfsg" in main and "mysql-nonfree" in non-free.
+  * No code change! 
+    The "x" at the end of the version number is just to be able to 
+    upload a new source package. ("a" was already taken by upstream 
+    for their binary upload correction)  
+
+ -- Christian Hammers <ch@debian.org>  Wed,  8 May 2002 02:01:41 +0200
+
+mysql (3.23.49-8) unstable; urgency=low
+
+  * Substituted $0 in init script to let e.g. "/etc# ./init.d/mysql restart"
+    works, too. Closes: #141555
+
+ -- Christian Hammers <ch@debian.org>  Sun,  7 Apr 2002 15:00:44 +0200
+
+mysql (3.23.49-7) unstable; urgency=low
+
+  * The Makefiles are totally broken for the --enable-local-infile
+    option. I now patched libmysql/libmysql.c#mysql_init() manually.
+    Closes: #138347 
+
+ -- Christian Hammers <ch@debian.org>  Fri, 29 Mar 2002 23:55:15 +0100
+
+mysql (3.23.49-6) unstable; urgency=low
+
+  * Moved mysqlcheck from server to client package. Closes: #139799
+  * Added manpage for mysqlhotcopy. Regarding: #87097 
+  * Added 'sharedscripts' directive to the logrotate script.
+  * Replaced grep by /usr/bin/getent to let the group/user checking work
+    on NIS/LDAP systems, too. Closes: #115677, #101529
+
+ -- Christian Hammers <ch@debian.org>  Fri, 22 Mar 2002 22:40:51 +0100
+
+mysql (3.23.49-5) unstable; urgency=low
+
+  * Added skip-innodb to default my.cnf.
+  * Enabled --enable-local-infile, it seems to be a new option that
+    defaults to disable a formerly enabled feature. Closes: #137115
+
+ -- Christian Hammers <ch@debian.org>  Sat, 16 Mar 2002 00:29:10 +0100
+
+mysql (3.23.49-4) unstable; urgency=medium
+
+  * Recompiled against fixed libz.
+
+  * Enabled --enable-local-infile, it seems to be a new option that
+    defaults to disable a formerly enabled feature. Closes: #137115
+  * Fixed README.compile_on_potato. Closes: #136529 
+  * Now an ext3 .journal file in /var/lib/mysql does not prevent the
+    installation (happens when creating a journal on an already mounted
+    partition). Closes: #137146
+
+ -- Christian Hammers <ch@debian.org>  Wed, 13 Mar 2002 13:34:24 +0100
+
+mysql (3.23.49-3) unstable; urgency=low
+
+  * Added Russian translation. Closes: #135846
+  * Fixed installation of .info documents. Closes: #135030
+
+ -- Christian Hammers <ch@debian.org>  Wed, 27 Feb 2002 23:36:35 +0100
+
+mysql (3.23.49-2) unstable; urgency=low
+
+  * Updated french translation and split template files. Closes: #134754 
+  * Fixed a small debian.cnf related bug in mysql-server.postinst.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 19 Feb 2002 23:13:58 +0100
+
+mysql (3.23.49-1) unstable; urgency=low
+
+  * New upstream release.
+    (Mainly InnoDB related fixes)
+  * Exported a $HOME variable in the scripts so that /root/.my.cnf
+    is not read anymore. This will avoid problems when admins put 
+    only passwords but no usernames in this file. Closes: #132048
+  * New debian-sys-maint password algorithm (now ~96bit :-)) Closes: #133863
+  * Recreating debian-sys-main pwd on every install to help people who
+    accidentally delete user or password files...
+  * Added /var/log/mysql so that user can put the binary logs in there as
+    mysql cannot write the .001 etc files itself in /var/log which is 
+    owned by root.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 14 Feb 2002 22:17:45 +0100
+
+mysql (3.23.47-6) unstable; urgency=low
+
+  * Dropped a sentence about the new debian-sys-maint user in the
+    debconf note and updated the README.Debian.  Related: #132048
+  * Added more french translation. Closes: #132390 
+
+ -- Christian Hammers <ch@debian.org>  Wed,  6 Feb 2002 09:41:29 +0100
+
+mysql (3.23.47-5) unstable; urgency=low
+
+  * Fixed grammar error in template. Closes: #132238 
+  * Really fixed typo in logrotate script. Closes: #131711
+
+ -- Christian Hammers <ch@debian.org>  Tue,  5 Feb 2002 14:20:08 +0100
+
+mysql (3.23.47-4) unstable; urgency=medium
+
+  * Fixes typo in postinst that let init script fail. Closes: #131743
+  * Fixed bashism bug that failed on ash. Closes: #131697
+  * Fixed typo in logrotate script. Closes: #131711
+
+ -- Christian Hammers <ch@debian.org>  Thu, 31 Jan 2002 23:58:46 +0100
+
+mysql (3.23.47-3) unstable; urgency=low
+
+  * Added new Debian specific mysql user called 'debian-sys-maint' which
+    is used for pinging the server status, flushing the logs or shutting
+    down the server in maintenance scripts. The credentials of this user
+    are stored in the UID0-only readable file /etc/mysql/debian.cnf.
+    Closes: #129887, #130326, #99274
+  * Fixed unintended server startup at boottime. Closes: #122676, #130105
+  * New upstream fixes command line parsing bug: Closes: #128473
+  * Fixed manpage headers to let apropos work: Closes: #119122
+  * Added "status" options for /etc/init.d/mysql. Closes: #129020
+
+ -- Christian Hammers <ch@debian.org>  Sun, 27 Jan 2002 19:46:11 +0100
+
+mysql (3.23.47-2) unstable; urgency=low
+
+  * Enhanced init scripts by using mysqladmin instead of kill $pid.
+    Thanks to Aaron Brick. 
+
+ -- Christian Hammers <ch@debian.org>  Fri, 18 Jan 2002 01:42:23 +0100
+
+mysql (3.23.47-1) unstable; urgency=low
+
+  * New upstream release.
+  * Updated brazilian translation of debconf descriptions. Closes: #123332
+
+ -- Christian Hammers <ch@debian.org>  Sun,  6 Jan 2002 21:11:17 +0100
+
+mysql (3.23.46-3) unstable; urgency=low
+
+  * Fixed bug in postinst where a script was accidentally called with
+    "bash -c <script> -IN_RPM" preventing the first argument to take effect
+    and then leading to failures on hosts with unresolvable hostnames.
+    Closes: #126147
+  * Small changes and comments in postinst. 
+
+ -- Christian Hammers <ch@debian.org>  Sat, 22 Dec 2001 14:03:02 +0100
+
+mysql (3.23.46-2) unstable; urgency=low
+
+  * Start/stop behaviour now configurable via debconf. Closes: #112174 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  9 Dec 2001 21:38:54 +0100
+
+mysql (3.23.46-1) unstable; urgency=low
+
+  * New upstream release. 
+    Only few fixes, mainly innodb related. 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  2 Dec 2001 03:08:48 +0100
+
+mysql (3.23.45-1) unstable; urgency=low
+
+  * New upstream version. 
+    Only few fixes, mainly innodb related. 
+  * Added debconf note regarding the skip-networking option.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 25 Nov 2001 16:50:37 +0100
+
+mysql (3.23.44-2) unstable; urgency=low
+
+  * Finally removed debconf toggled "skip-networking" line add/remove 
+    code for /etc/mysql/my.cnf. I don't like editing a file that's tagged
+    as configuration file.
+    I disabled networking by default for security reasons. Better ideas?
+
+ -- Christian Hammers <ch@debian.org>  Fri, 16 Nov 2001 02:11:02 +0100
+
+mysql (3.23.44-1) unstable; urgency=low
+
+  * New upstream release.
+    - fixes replication bug (core dump)
+  * Made description better english :) Thanks to D. Welton. 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 11 Nov 2001 15:44:07 +0100
+
+mysql (3.23.43-4) unstable; urgency=low
+
+  * Disabled statically linking. 
+
+ -- Christian Hammers <ch@debian.org>  Sat, 10 Nov 2001 03:15:56 +0100
+
+mysql (3.23.43-3) unstable; urgency=low
+
+  * Changed compiler settings after one user reported instabilities.
+    See #116631 for more information. 
+
+ -- Christian Hammers <ch@debian.org>  Tue, 30 Oct 2001 21:39:17 +0100
+
+mysql (3.23.43-2) unstable; urgency=low
+
+  * Patched sparc mutexes again. Closes: #113430 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  7 Oct 2001 15:09:00 +0200
+
+mysql (3.23.43-1) unstable; urgency=low
+
+  * New upstream version.
+    - Fixed some unlikely(sic!) bugs and core dumps.
+    - Fixed a bug with BDB tables and UNIQUE columns that are NULL.
+    - [more minor bugs were fixed; see changelog]
+  * Adjusted build depends on libwrap0 for IA-64. Closes: #114582
+  * Added the mysqlcheck binary. Closes: #114490
+  * Fixed rules for arm architecture. Closes: #88186
+  * Renamed mysql_print_defaults to the original name my_print_defaults.
+    Isn't as descriptive but else I'd have to patch too much. Closes: #114492
+
+ -- Christian Hammers <ch@debian.org>  Fri,  5 Oct 2001 22:24:40 +0200
+
+mysql (3.23.42-2) unstable; urgency=low
+
+  * Applied patch for m68k compile. Closes: #112904 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 23 Sep 2001 21:32:57 +0200
+
+mysql (3.23.42-1) unstable; urgency=low
+
+  * New upstream release.
+    Fixes critical bug with InnoDB and large BLOBs. 
+
+ -- Christian Hammers <ch@debian.org>  Tue, 18 Sep 2001 22:25:47 +0200
+
+mysql (3.23.41-2) unstable; urgency=low
+
+  * Fixed shlibs.local problem. Closes: #111573 
+  * Replaced emacs by sensible-editor in mysqlbug.sh. Thanks Hans Ginzel.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  9 Sep 2001 17:16:42 +0200
+
+mysql (3.23.41-1) unstable; urgency=low
+
+  * New upstream release
+  * Fixed build problem on ia64. Closes: #110624
+
+ -- Christian Hammers <ch@debian.org>  Tue, 14 Aug 2001 23:20:35 +0200
+
+mysql (3.23.40-1) unstable; urgency=low
+
+  * New upstream release
+
+ -- Christian Hammers <ch@debian.org>  Sun,  5 Aug 2001 19:46:18 +0200
+
+mysql (3.23.39-5) unstable; urgency=low
+
+  * Added debconf template for brazil. Closes: #106934, #106752 
+  * Tightened dependencies on debconf.
+  * Adjusted mysql.err permissions in logrotate script to 0600. Closes: #105672
+
+ -- Christian Hammers <ch@debian.org>  Mon, 30 Jul 2001 00:10:12 +0200
+
+mysql (3.23.39-4.1) unstable; urgency=low
+
+  * Maintainer-requested NMU.
+  * Fixing thread mutexes on Sparc and Alpha
+    (closes: Bug#101783)
+  * Added --enable-assembler for sparc.  This should
+    allow mysql on sparc to use assembler versions of
+    some string functions (read: should speed up a bit).
+
+ -- Christopher C. Chimelis <chris@debian.org>  Fri, 13 Jul 2001 15:09:30 -0400
+  
+mysql (3.23.39-4) unstable; urgency=low
+
+  * Porting fixes.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  9 Jul 2001 17:56:54 +0200
+
+mysql (3.23.39-3.1) unstable; urgency=low
+
+  * NMU (for porting)
+  * Update config.sub and config.guess for hppa, sh & s390.
+  * Add --with-client-ldflags=-lstdc++ to configure line.  Closes: #100884
+
+ -- Matthew Wilcox <willy@debian.org>  Sun,  8 Jul 2001 19:26:59 -0600
+
+mysql (3.23.39-3) unstable; urgency=low
+
+  * Disabled berkeley-db on sparc again. Mutexes aren't working again :-( 
+
+ -- Christian Hammers <ch@debian.org>  Sat,  7 Jul 2001 18:30:08 +0200
+
+mysql (3.23.39-2) unstable; urgency=low
+
+  * Bugfixed the m68k mutex patch. Thanks to Michael Fedrowitz. Closes: #103145 
+  * Removed config.cache files in bdb/ and innobase/. Closes: #103143
+
+ -- Christian Hammers <ch@debian.org>  Wed,  4 Jul 2001 22:06:58 +0200
+
+mysql (3.23.39-1) unstable; urgency=low
+
+  * New upstream release. Minor bugfixes only. 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 14 Jun 2001 13:53:03 +0200
+
+mysql (3.23.38-4) unstable; urgency=low
+
+  * Added logcheck files. Closes: #99131
+    (I can't let the usermod away since I don't know of an easy way to
+    retrieve "passwd" information in a shell script considering that 
+    people use different storage methods like LDAP/NIS instead of passwd.)
+
+ -- Christian Hammers <ch@debian.org>  Fri,  8 Jun 2001 21:04:25 +0200
+
+mysql (3.23.38-3) unstable; urgency=low
+
+  * Explicitly pointed to /root/.my.cnf to let /etc/init.d/mysql stop
+    work in sudo environments with $HOME!=/root work, too. Closes: #98324
+  * Removes empty /etc/mysql on purge. Closes: #98164
+
+ -- Christian Hammers <ch@debian.org>  Tue, 22 May 2001 10:13:06 +0200
+
+mysql (3.23.38-2) unstable; urgency=low
+
+  * Added depends to libdbd-mysql-perl for mysql-server. Closes: #94306
+
+ -- Christian Hammers <ch@debian.org>  Sat, 19 May 2001 19:43:26 +0200
+
+mysql (3.23.38-1) unstable; urgency=low
+
+  * New upstream release. 
+  * Added Build-Depends to procps. Closes: #96768
+
+ -- Christian Hammers <ch@debian.org>  Sun, 13 May 2001 17:30:15 +0200
+
+mysql (3.23.37-5) unstable; urgency=low
+
+  * Applied mutex patch for bdb support on m68k. 
+    Thanks to Michael Fedrowitz for the patch.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  7 May 2001 12:30:40 +0200
+
+mysql (3.23.37-4) unstable; urgency=low
+
+  * Enable bdb support for m68k architecture.
+
+ -- Christian Hammers <ch@debian.org>  Sat,  5 May 2001 16:47:36 +0200
+
+mysql (3.23.37-3) unstable; urgency=low
+
+  * Added thread-safe client library. Thanks to Shane Wegner. Closes: #95441 
+
+ -- Christian Hammers <ch@debian.org>  Sat, 28 Apr 2001 09:45:00 -0400
+
+mysql (3.23.37-2) unstable; urgency=low
+
+  * Added sparc to the list of BDB supporting architectures after some
+    tests on vore.debian.org and mails with Ben Collons.
+
+ -- Christian Hammers <ch@debian.org>  Fri, 27 Apr 2001 09:30:09 -0400
+
+mysql (3.23.37-1) unstable; urgency=low
+
+  * New upstream version. 
+  * Added gemini table support.
+  * Does anybody know how to enable SSL?
+  * Fixed ARM compilation problem. Closes: #88186
+
+ -- Christian Hammers <ch@debian.org>  Sat, 21 Apr 2001 11:48:46 -0400
+
+mysql (3.23.36-2) unstable; urgency=low
+
+  * Added patch by Christopher C. Chimelis <chris@debian.org> to make
+    Berkeley db3 work again on Alpha architecture. Closes: #92787
+
+ -- Christian Hammers <ch@debian.org>  Tue,  3 Apr 2001 23:41:46 +0200
+
+mysql (3.23.36-1) unstable; urgency=high
+
+  * New upstream version.
+  * SECURITY FIX: One could place database tables outside the database
+    directory by using '..' in one of the mysql helper programs where the
+    table name was not checked correctly. This could lead to root compromise
+    if the server would be running as root else you could at least do bad
+    things as user mysql.
+  * upstream: Fixed bug when thread creation failed.
+  * upstream: Fixed problem in Innobase with non-latin1 charsets
+  * upstream: Fixed a core-dump bug when using very complex query with DISTINCT
+  * upstream: many others so called minor bugs...
+  * fixes bug in init script. Closes: #90257 
+    (this report was against some older problem that has been fixed too in .33)
+
+ -- Christian Hammers <ch@debian.org>  Fri, 30 Mar 2001 02:55:12 +0200
+
+mysql (3.23.35-1) unstable; urgency=medium
+
+  * New upstream release.
+  * Fixes problem in ORDER BY clause. People using 3.23.34 should upgrade!
+  * Includes innobase support.
+    (Hope this is not such a catastrophe like berkeley db...)
+
+ -- Christian Hammers <ch@debian.org>  Fri, 16 Mar 2001 23:30:30 +0100
+
+mysql (3.23.33-3) unstable; urgency=low
+
+  * Forgot #!/bin/sh at top of mysql-doc.postinst. Closes: #89801 
+
+ -- Christian Hammers <ch@vore.debian.org>  Thu, 15 Mar 2001 20:38:35 -0500
+
+mysql (3.23.33-2) unstable; urgency=low
+
+  * Added some missing scripts and manpages. Closes: #84068
+  * Added dependency to perl-5.6. Closes: #81942 
+  * Added french templates some time ago. Closes: #83790
+  * Added patch to get db3 working on Alpha. Closes: #86033
+    Thanks to Christopher C. Chimelis <chris@debian.org>. The patch
+    itself is included as debian/patch.alpha, too.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Feb 2001 06:40:40 +0100
+
+mysql (3.23.33-1) unstable; urgency=high
+
+  * Fixes two security bugs that allow crashing the server and maybe
+    gaining the UID of the process that is linked against libmysqlclient!
+
+ -- Christian Hammers <ch@debian.org>  Tue, 13 Feb 2001 23:01:18 +0100
+
+mysql (3.23.32-1) unstable; urgency=low
+
+  * New upstream release.
+    (just minor fixes)
+  * Added french and german debconf templates. 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  4 Feb 2001 17:27:07 +0100
+
+mysql (3.23.31-1) unstable; urgency=high
+
+  * New upstream release.
+  * Fixes security bug that was announced at BUGTRAQ mailing list.
+    (Disappointingly not by mysql.com!). And allows a buffer overflow
+    and therefore access to the mysql UID and all databases when already
+    having a valid account. Closes: #82881
+
+ -- Christian Hammers <ch@debian.org>  Sat, 20 Jan 2001 11:14:36 +0100
+
+mysql (3.23.30-2) unstable; urgency=low
+
+  * Recompiled with new dpkg-dev. 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 14 Jan 2001 22:20:55 +0100
+
+mysql (3.23.30-1) unstable; urgency=low
+
+  * New upstream release. 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  7 Jan 2001 22:10:18 +0100
+
+mysql (3.23.28-10) testing unstable; urgency=low
+
+  * I must upload to "testing" to get it into woody, right?! 
+
+ -- Christian Hammers <ch@debian.org>  Fri, 29 Dec 2000 14:43:57 +0100
+
+mysql (3.23.28-9) unstable; urgency=low
+
+  * Made it a replacement for libmysqlclient9. 
+
+ -- Christian Hammers <ch@westend.com>  Mon, 25 Dec 2000 19:15:04 +0100
+
+mysql (3.23.28-8) unstable; urgency=low
+
+  * Applied patch from a user to get the skip-networking option working!
+    Approved from a mysql employee but please test anyways.
+    This finally: Closes: #79672, #78634, #79660, #79658
+
+ -- Christian Hammers <ch@debian.org>  Sat, 16 Dec 2000 14:01:36 +0100
+
+mysql (3.23.28-6) unstable; urgency=medium
+
+  * Fixed error in postinst. Closes: #79392, #79400, #79451, #79550
+  * Added .info files again on user request. Closes: #78988, #75737 
+
+ -- Christian Hammers <ch@debian.org>  Wed, 13 Dec 2000 21:18:24 +0100
+
+mysql (3.23.28-5) unstable; urgency=low
+
+  * Fixed a stupid bug in mysql-server.postinst regarding the 
+    configuration of skip-networking. Closes: #78639, 78634
+  * Used patched bdb which hopefully enables mutexes on Alpha. Closes: #78197
+  * Added dependency to adduser. Closes: #76798
+
+ -- Christian Hammers <ch@debian.org>  Sun, 10 Dec 2000 16:55:48 +0100
+
+mysql (3.23.28-4) unstable; urgency=low
+
+  [never uploaded]
+  * Fixed a stupid bug in mysql-server.postinst regarding the 
+    configuration of skip-networking. Closes: #78639, 78634
+  * Used patched bdb which hopefully enables mutexes on Alpha. Closes: #78197
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Dec 2000 17:49:44 +0100
+
+mysql (3.23.28-3) unstable; urgency=low
+
+  * This time really fixed m68k build error. Closes: #78235 
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Dec 2000 15:02:55 +0100
+
+mysql (3.23.28-2) unstable; urgency=low
+
+  * Adjusted rules file to make it buildable on m68k. Closes: #78235 
+
+ -- Christian Hammers <ch@debian.org>  Fri,  1 Dec 2000 20:07:26 +0100
+
+mysql (3.23.28-1) unstable; urgency=low
+
+  * New upstream version. Now gamma! 
+  * Changed umask of mysql.log making it o-rw
+  * Disabled listening on network reachable TCP ports by default due to
+    security considerations.
+
+ -- Christian Hammers <ch@debian.org>  Thu, 23 Nov 2000 20:12:50 +0100
+
+mysql (3.23.27-1) unstable; urgency=low
+
+  * New upstream version.
+  * Closes: #75711
+
+ -- Christian Hammers <ch@debian.org>  Sun, 29 Oct 2000 14:29:51 +0100
+
+mysql (3.23.25-4) unstable; urgency=low
+
+  * Recompiled to get rid of the dependency for zlib1 (libc5).
+    Closes: #74952, #74939
+
+ -- Christian Hammers <ch@debian.org>  Tue, 17 Oct 2000 14:34:52 +0200
+
+mysql (3.23.25-3.1) unstable; urgency=low
+
+  * Maintainer-approved NMU.
+  * Includes patch to fix and enable db3 support on Alpha.
+  * Enable support for thread mutexes in db3 on sparc
+    (it works after all, according to Ben Collins)
+  * Removed atomic_ functions for Alpha since they are no
+    longer supported in the current glibc in woody.
+  * Cleaned up rules file a bit.
+
+ -- Christopher C. Chimelis <chris@debian.org>  Sat, 14 Oct 2000 04:22:02 -0400
+
+mysql (3.23.25-3) unstable; urgency=low
+
+  * Upstream decided not to include my_config.h,my_dir.h into the installed
+    header files. As this file contains at least informative material
+    and more important is checked by several autoconf scripts I
+    included it by hand again.
+  * Made building of berkeley db conditional to architecture until
+    I get response whether it works on sparc/alpha now.
+
+ -- Christian Hammers <ch@debian.org>  Wed, 11 Oct 2000 23:58:38 +0200
+
+mysql (3.23.25-2) unstable; urgency=medium
+
+  * Last build went terribly wrong.. Here's the changelog again:
+  * New upstream release.
+  * Shared library version was raised from 9 to 10.
+    Maintainers of packets using libmysqlclient9 must recompile!
+
+ -- Christian Hammers <ch@debian.org>  Wed, 11 Oct 2000 01:16:34 +0200
+
+mysql (3.23.25-1) unstable; urgency=low
+
+  * New upstream release.
+  * Shared library version was raised from 9 to 10.
+    Maintainers of packets using libmysqlclient9 must recompile!
+
+ -- Christian Hammers <ch@debian.org>  Sat,  7 Oct 2000 18:21:51 +0200
+
+mysql (3.23.24-2) unstable; urgency=low
+
+  * Applied upstream patch regarding quoting of mysqldump.
+  * Updated to db-3.1.17-patched (from www.mysql.com)
+
+ -- Christian Hammers <ch@debian.org>  Fri, 15 Sep 2000 18:58:14 +0200
+
+mysql (3.23.24-1) unstable; urgency=medium
+
+  * New upstream version with some important fixes.
+  * upstream: Last version corrupted CHAR/VARCHAR/BLOB columns with 
+    characters above ASCII 128! Check and repair all these tables. 
+  * upstream: fixed small memory leak
+  * upstream: fixed problem with BDB tables and reading on unique
+              (not primary) key.
+  * Disabled BDB tables on all architectures except i386 due to many
+    bug reports (see #71206).  -> HELP APPRECIATED <-
+
+ -- Christian Hammers <ch@debian.org>  Tue, 12 Sep 2000 06:18:54 +0200
+
+mysql (3.23.23-2) unstable; urgency=low
+
+  * Strange... "nohup nice" gives different results and therefore lets
+    safe_mysqld crash when starting up. Apparently it seems to be
+    kernel dependent. Now fixed by another conditional. This
+    more or less Closes: #71057
+  * This bug was reported (accidentally) in the following identical reports:
+    Closes: #71253, #71254, #71257, #71258, #71259, #71262, #71266, #71267
+    Closes: #71268, #71271, #71275, #71277, #71278, #71283, #71291
+
+ -- Christian Hammers <ch@debian.org>  Sat,  9 Sep 2000 20:13:50 +0200
+
+mysql (3.23.23-1) unstable; urgency=low
+
+  * New upstream version. Feature freeze!
+  * Fixed source build problem. Closes: #70707 
+
+ -- Christian Hammers <ch@debian.org>  Thu, 31 Aug 2000 10:03:35 +0200
+
+mysql (3.23.22b-1) unstable; urgency=low
+
+  * Reorganised docs. Now we have several small html files instead of
+    one with almost 2M. Closes: 70431
+  * Removed pdf, ps and html from source package which shrank it by about 3M
+    (therefore the .orig.tar.gz is called 3.23.22b!)
+  * -> Last upload failed due to problems at the FTP site so here the 
+    -> changelog again:
+  * Fixes memory leak, commit/rollback, reserved word "MASTER" ...
+  * Added Berkeley DB3 source code to the Debian diff to be able to
+    compile with bdb transaction support! (Great feature!!!)
+  * Upstream correction of error message. Closes: #68939
+  * Upstream correction of reserved word "source".
+
+ -- Christian Hammers <ch@debian.org>  Fri, 25 Aug 2000 19:21:24 +0200
+
+mysql (3.23.22-1) unstable; urgency=low
+
+  * New upstream version.
+  * Fixes memory leak, commit/rollback, reserved word "MASTER" ...
+  * Added Berkeley DB3 source code to the Debian diff to be able to
+    compile with bdb transaction support! (Great feature!!!)
+  * Upstream correction of error message. Closes: #68939
+  * Upstream correction of reserved word "source".
+
+ -- Christian Hammers <ch@debian.org>  Sun, 20 Aug 2000 09:05:48 +0200
+
+mysql (3.23.21-4) unstable; urgency=low
+
+  * Added libmysqlclient9.shlibs and shlibs.local file. Closes: #68669
+
+ -- Christian Hammers <ch@debian.org>  Wed,  9 Aug 2000 14:22:49 +0200
+
+mysql (3.23.21-3) unstable; urgency=low
+
+  * Let "/etc/init.d/mysql restart" wait until the pid has been
+    removed before (but max 6 seconds) before restarting. Closes: 65070
+  * Added build dependencies. 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 30 Jul 2000 16:16:48 +0200
+
+mysql (3.23.21-2) unstable; urgency=low
+
+  * Typo in safe_mysqld prevents start.
+
+ -- Christian Hammers <ch@debian.org>  Sat, 29 Jul 2000 13:40:50 +0200
+
+mysql (3.23.21-1) unstable; urgency=low
+
+  * New upstream version.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 10 Jul 2000 22:54:17 +0200
+
+mysql (3.23.20-1) unstable; urgency=low
+
+  * MySQL finally got fully GPL'ed! This means that there is only one
+    source package and only main/* binary packages from now on.
+  * Fixed symlink in libmysqlclient9-dev. Closes: 66452
+  * Apart from that the usual bug fixes for BETA software.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  3 Jul 2000 20:05:38 +0200
+
+mysql-pd (3.23.16-1) unstable; urgency=low
+
+  * New upstream release. (Actually a brand new upstream branch!)
+  * Added mysql-common package as the configuration file can be used
+    by all versions of the mysql client library.
+    Did some more package reorganisations, too. See README.Debian file!
+  * libmysqlclient.so raised major version from 6 to 9.
+  * Minor beautifications in the debian/ directory.
+
+ -- Christian Hammers <ch@debian.org>  Sat, 27 May 2000 20:30:01 +0200
+
+mysql-gpl (3.22.30-2) frozen unstable; urgency=low
+
+  * Fixed path in libmysqlclient.la. Closes: #58875 
+
+ -- Christian Hammers <ch@debian.org>  Sat, 25 Jan 2000 20:27:29 -0700
+
+mysql-gpl (3.22.30-1) frozen unstable; urgency=low
+
+  * A small change in the libmysqlclient6 causes mysqladmin to print an
+    shared library error when displaying the defaults. Everything else
+    works fine so this error wasn't detected until now. Closes: #58033
+  * TcX released a new MySQL version that includes another security patch,
+    this time against mysqlaccess. The author told me that it would be 
+    fine if I just included the new .c in this source since I don't want
+    go to 3.22.32 in frozen.
+  * ->Release Manager: Although the version number increased there is
+    no new code except for the shared library. The rest is the same
+    as in mysql-server and mysql-client.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 15 Feb 2000 23:26:54 +0100
+
+mysql-gpl (3.22.29-1) unstable; urgency=low
+
+  * New upstream version.
+
+ -- Christian Hammers <ch@debian.org>  Thu,  6 Jan 2000 20:37:23 +0100
+
+mysql-gpl (3.22.27a-3) unstable; urgency=low
+
+  * Use system readline instead of bundled version. Closes: #50069 
+    Any objections ?
+
+ -- Christian Hammers <ch@debian.org>  Sun, 14 Nov 1999 18:09:48 +0100
+
+mysql-gpl (3.22.27a-2) unstable; urgency=low
+
+  * Now building mysql-gpl-doc in binary-indep. 
+
+ -- Christian Hammers <ch@debian.org>  Sat, 23 Oct 1999 04:22:36 +0200
+
+mysql-gpl (3.22.27a-1) unstable; urgency=low
+
+  * Adjusted version number to allow new orig.tar.gz.
+    The old seems broken :-( People reported compilation problems.
+  * Changed mysql-gpl-doc to "Architecture: all". 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 17 Oct 1999 13:01:35 +0200
+
+mysql-gpl (3.22.27-1) unstable; urgency=low
+
+  * New upstream release. Fixes charset problem.
+
+ -- Christian Hammers <ch@debian.org>  Mon, 11 Oct 1999 18:01:40 +0200
+
+mysql-gpl (3.22.26a-1) unstable; urgency=low
+
+  * New upstream version. Just some small bug fixes.
+  * FHS compliance.
+
+ -- Christian Hammers <ch@debian.org>  Sun,  3 Oct 1999 10:16:14 +0200
+
+mysql-gpl (3.22.25-2) unstable; urgency=low
+
+  * Added conflict to all old mysql-dev packages. (fixes: #42966) 
+
+ -- Christian Hammers <ch@debian.org>  Sun, 15 Aug 1999 11:35:46 +0200
+
+mysql-gpl (3.22.25-1) unstable; urgency=low
+
+  * New upstream version. (We are waiting for 3.23.x !)
+  * Fixes some upstream small bugs.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 18 Jul 1999 22:02:06 +0200
+
+mysql-gpl (3.22.23b-4) unstable; urgency=low
+
+  * Rebuild for new perl. 
+
+ -- Christian Hammers <ch@debian.org>  Thu,  8 Jul 1999 01:09:57 +0200
+
+mysql-gpl (3.22.23b-3) unstable; urgency=low
+
+  * libmysqlclient had the wrong socket path.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 03 Jul 1999 23:13:30 +0200
+
+mysql-gpl (3.22.23b-2) unstable; urgency=low
+
+  * Missed one replace tag to a very old version of mysql-devel.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 27 Jun 1999 19:13:30 +0200
+
+mysql-gpl (3.22.23b-1) unstable; urgency=low
+
+  * New upstream minor version.
+  * Cleaned up the dependencies a bit.
+
+ -- Christian Hammers <ch@debian.org>  Sun, 27 Jun 1999 19:13:30 +0200
+
+mysql-gpl (3.22.22-1) unstable; urgency=low
+
+  * New upstream version. (closes Bug#36493,37340)
+  * New maintainer upload.
+  * Package reorganisation: We prepare for the GPL'ed server which will
+  *  be released soon and make the structure more clear to the user.
+
+ -- Christian Hammers <ch@debian.org>  Mon,  3 May 1999 20:43:41 +0200
+
+mysql (3.22.21-1) unstable; urgency=low
+
+  * Never released. TcX was too fast :-)
+
+ -- Christian Hammers <ch@debian.org>  Tue, 20 Apr 1999 17:22:04 +0200
+
+mysql-freebits (3.21.33b-3) unstable; urgency=low
+
+  * Recompile with libncurses
+
+ -- Scott Hanson <shanson@debian.org>  Sat, 31 Oct 1998 15:04:39 +0100
+
+mysql-freebits (3.21.33b-2) unstable; urgency=low
+
+  * Recompile with libstdc++2.9 (fixes #27792)
+
+ -- Scott Hanson <shanson@debian.org>  Mon, 12 Oct 1998 18:47:25 +0200
+
+mysql-freebits (3.21.33b-1) unstable; urgency=low
+
+  * New upstream version (probably the last for 3.21)
+
+ -- Scott Hanson <shanson@debian.org>  Tue,  8 Sep 1998 18:59:37 +0200
+
+mysql-freebits (3.21.33-4) unstable; urgency=low
+
+  * Separate out non-free source files, move mysql-base, mysql-dev, and
+  *  mysql-doc to main distribution
+  * Locale files /usr/share/mysql/ now in server, not base; therefore...
+  * Add conflict to mysql-server <=3.21.33-3
+
+ -- Scott Hanson <shanson@debian.org>  Fri, 31 Jul 1998 19:16:08 +0200
+
+mysql (3.21.33-3) unstable; urgency=low
+
+  * Release to unstable with moved socket (fixes #24574)
+  * Add conflict to old libdbd-mysql-perl package
+
+ -- Scott Hanson <shanson@debian.org>  Wed, 22 Jul 1998 22:17:43 +0200
+
+mysql (3.21.33-2) experimental; urgency=low
+
+  * Move socket from /tmp to /var/run (see #24574) 
+  * Release to experimental, since this breaks everything statically
+  *  linked to libmysqlclient!
+
+ -- Scott Hanson <shanson@debian.org>  Wed, 15 Jul 1998 19:37:01 +0200
+
+mysql (3.21.33-1) unstable; urgency=low
+
+  * New upstream release
+
+ -- Scott Hanson <shanson@debian.org>  Sun, 12 Jul 1998 08:18:18 +0200
+
+mysql (3.21.32a-1) unstable; urgency=low
+
+  * New upstream release 
+  * Lintian bugs: ldconfig, missing manpage, call to perl5
+  * Lintian bug shlib-with-non-pic-code _not_ yet fixed 
+
+ -- Scott Hanson <shanson@debian.org>  Sat,  4 Jul 1998 07:57:13 +0200
+
+mysql (3.21.31-1) unstable frozen; urgency=low
+
+  * New upstream release for hamm and slink (bug fixes only)
+  * Fix unsecure use of temp file in mysqlbug (fixes #23606) 
+  * Added brief licensing information to control file
+
+ -- Scott Hanson <shanson@debian.org>  Tue, 16 Jun 1998 10:52:44 +0200
+
+mysql (3.21.30-3) unstable; urgency=low
+
+  * Restore missing shared library dependencies for mysql-server
+
+ -- Scott Hanson <shanson@debian.org>  Mon, 15 Jun 1998 07:51:58 +0200
+
+mysql (3.21.30-2) unstable; urgency=low
+
+  * Simplify debian/rules (fixes #17662)
+  * Edit manual.texi to add "Debian notes" to documentation
+  * Add note about passwords on command line (fixes #16471)
+  * Add note about getting privileges for users (fixes #22891)
+  * Correct "Possible license changes" heading (fixes #22711)
+  * Add uninstalled header files to /usr/doc/mysql-dev/examples (fixes #22627)
+  * Add udf_example.cc to /usr/doc/mysql-dev/examples (fixes #22710)
+
+ -- Scott Hanson <shanson@debian.org>  Sun,  7 Jun 1998 13:05:37 +0200
+
+mysql (3.21.30-1) unstable; urgency=low
+
+  * Stable upstream release
+
+ -- Scott Hanson <shanson@debian.org>  Tue, 12 May 1998 22:13:25 +0200
+
+mysql (3.21.29gamma-1) unstable; urgency=low
+
+  * New upstream release
+  * Do not create 'mysql' subdirectory for libs and headers (fixes #19020)
+  * Remove 'CXX=gcc' flag from configure (g++ now standard)
+      
+ -- Scott Hanson <shanson@debian.org>  Sun, 12 Apr 1998 18:38:03 +0200
+
+mysql (3.21.28gamma-1) unstable; urgency=low
+
+  * New upstream release
+  * Unstable-only release; hamm stays at 3.21.25 for now
+
+ -- Scott Hanson <shanson@debian.org>  Thu,  2 Apr 1998 21:33:51 +0200
+
+mysql (3.21.25gamma-3) unstable frozen; urgency=low
+
+  * Have mysql-base suggest perl >= 5.004 for mysqlaccess (fixes #19593)
+  * Fix shlibs to refer to mysql-base rather than the no-longer-existent mysql
+
+ -- Scott Hanson <shanson@debian.org>  Thu, 26 Mar 1998 18:22:59 +0100
+
+mysql (3.21.25gamma-2) unstable; urgency=low
+
+  * Restore libmysqlclient.so symlink to mysql-dev (fixes #19036)
+
+ -- Scott Hanson <shanson@debian.org>  Sun,  8 Mar 1998 10:46:43 +0100
+
+mysql (3.21.25gamma-1) unstable; urgency=low
+
+  * Check if running as root in init.d script (fixes #18577)
+  * New upstream release
+
+ -- Scott Hanson <shanson@debian.org>  Fri, 27 Feb 1998 20:01:30 +0100
+
+mysql (3.21.24gamma-1) unstable; urgency=low
+
+  * New upstream release
+
+ -- Scott Hanson <shanson@debian.org>  Mon, 23 Feb 1998 08:14:17 +0100
+
+mysql (3.21.23beta-3) unstable; urgency=low
+
+  * Squashed errors found by lintian
+
+ -- Scott Hanson <shanson@debian.org>  Tue, 17 Feb 1998 20:19:01 +0100
+
+mysql (3.21.23beta-2) unstable; urgency=low
+
+  * Fixed overlaps with old mysql package (fixes #17843)
+
+ -- Scott Hanson <shanson@debian.org>  Thu,  5 Feb 1998 22:55:00 +0100
+
+mysql (3.21.23beta-1) unstable; urgency=low
+
+  * New upstream release
+  * Fix include lines in mysql.h (fixes #17827)
+  * Move /usr/include/mysql to mysql-dev
+
+ -- Scott Hanson <shanson@debian.org>  Wed,  4 Feb 1998 19:59:14 +0100
+
+mysql (3.21.22beta-3) unstable; urgency=low
+
+  * Correct descriptions in control file (fixes #17698)
+  * Clean up output of shutdown script
+
+ -- Scott Hanson <shanson@debian.org>  Sat, 31 Jan 1998 19:04:29 +0100
+
+mysql (3.21.22beta-2) unstable; urgency=low
+
+  * Split out mysql-dev and mysql-bench subpackages
+
+ -- Scott Hanson <shanson@debian.org>  Wed, 28 Jan 1998 19:52:27 +0100
+
+mysql (3.21.22beta-1) unstable; urgency=low
+
+  * New upstream release 
+  
+ -- Scott Hanson <shanson@debian.org>  Wed, 28 Jan 1998 18:59:09 +0100
+
+mysql (3.21.21a.beta-2) unstable; urgency=low
+
+  * Compile with libpthreads from libc6-dev_2.0.6-3 rather than statically 
+    linking to patched libpthreads (see changes to 3.20.29-2)
+  
+ -- Scott Hanson <shanson@debian.org>  Sun, 25 Jan 1998 13:17:15 +0100
+
+mysql (3.21.21a.beta-1) unstable; urgency=low
+
+  * Put initial database, mysql_install_db, safe_mysqld, isamlog and 
+    isamchk in mysql-server 	
+  * Correct upstream release number so source packages are correctly built
+
+ -- Scott Hanson <shanson@debian.org>  Mon, 19 Jan 1998 07:52:48 +0100
+
+mysql (3.21.21.beta-1) unstable; urgency=low
+
+  * Use debhelper where possible in rules
+  * Split binary packages into mysql-base, mysql-client, mysql-doc	
+  * New upstream release
+
+ -- Scott Hanson <shanson@debian.org>  Thu, 15 Jan 1998 08:12:17 +0100
+
+mysql (3.21.19.beta-1) unstable; urgency=low
+
+  * Offer to set root password in mysql_install_db
+  * Kill `pidof mysqld` on shutdown rather than use mysqladmin
+  * New upstream version
+
+ -- Scott Hanson <shanson@debian.org>  Fri,  9 Jan 1998 20:06:35 +0100
+
+mysql (3.21.17a.beta-2) unstable; urgency=low
+
+  * Remove perl stuff (it's going back into libdbd-mysql-perl)
+  * Remove conflict with libdbd-mysql-perl
+  * Do not compress *html files (fixes #16314)
+
+ -- Scott Hanson <shanson@debian.org>  Tue, 30 Dec 1997 07:34:20 +0100
+
+mysql (3.21.17a.beta-1) unstable; urgency=low
+
+  * Add conflict to libdbd-mysql-perl
+  * Use --pid-file option to place pid file in /var/run rather than patching 
+  * Add install-info to postinst and postrm
+  * Add filename to message shown by mysql_install_db (fixes #16621)
+  * New upstream version
+
+ -- Scott Hanson <shanson@debian.org>  Sun, 21 Dec 1997 19:41:45 +0100
+
+mysql (3.20.32a-5) unstable; urgency=low
+
+  * Move mysqld to /usr/lib/mysql, per policy discussion
+  * Adjust makefiles so perl libs get installed 
+
+ -- Scott Hanson <shanson@debian.org>  Wed,  3 Dec 1997 22:37:45 +0100
+
+mysql (3.20.32a-4) unstable; urgency=low
+
+  * Move mysqld to /usr/sbin to comply with FSSTND
+
+ -- Scott Hanson <shanson@debian.org>  Mon,  3 Nov 1997 20:12:29 +0100
+
+mysql (3.20.32a-3) unstable; urgency=low
+
+  * Comment out tests in mysql_install_db... for real this time!
+
+ -- Scott Hanson <shanson@debian.org>  Mon,  3 Nov 1997 07:32:53 +0100
+
+mysql (3.20.32a-2) unstable; urgency=low
+
+  * Comment out tests in mysql_install_db (fixes #14304)
+
+ -- Scott Hanson <shanson@debian.org>  Sat,  1 Nov 1997 18:45:25 +0100
+
+mysql (3.20.32a-1) unstable; urgency=low
+
+  * New upstream version
+
+ -- Scott Hanson <shanson@debian.org>  Wed, 29 Oct 1997 07:11:42 +0100
+
+mysql (3.20.29-2) unstable; urgency=low
+ 
+  * New maintainer
+  * Statically link mysqld to patched glibc-2.0.5 libpthread 
+    (works around #13586; see README.debian.glibc-2.0.5)
+  * Conflict with libpthread0 (fixes #13448)
+  * Don't link libg++, avoiding problems with glibc libpthread 
+
+ -- Scott Hanson <shanson@debian.org>  Thu, 16 Oct 1997 19:25:23 +0200
+
+mysql (3.20.29-1) unstable; urgency=low
+
+  * New upstream version
+  * Recompiled with libc6
+  * Include mysql-faq_toc.html (fixes #10885)
+  * Reworked /etc/init.d/mysql script (thanks to Heiko)
+  * Remove file /usr/lib/libmysqlclient.so.4 when package is removed.
+  * Use absolute path specification for conffile
+  * Use /usr/bin/perl instead of /bin/perl (fixes #10654)
+  * Do not depend on mysql (fixes #12427)
+  * Installed missing manpage for Mysql perl module
+  * Don't use debstd anymore
+  * Pristine source
+  * Set section to `non-free/devel'
+  * Upgraded to standards version 2.3.0.0
+
+ -- Christian Schwarz <schwarz@debian.org>  Fri, 12 Sep 1997 02:12:58 +0200
+
+mysql (3.20.16beta-2) unstable; urgency=low
+
+  * Uses /usr/bin/perl instead of /bin/perl (fixes bug #9731)
+  * Don't run mysqld with --log option
+  * Don't install regex manual pages
+  * Suggest package mysql-manual
+  * Fixed typo in changelog
+  * Upgrade to policy 2.1.3.2
+
+ -- Christian Schwarz <schwarz@debian.org>  Sun, 11 May 1997 14:19:26 +0200
+
+mysql (3.20.16beta-1) unstable; urgency=low
+
+  * Initial Release.
+
+ -- Christian Schwarz <schwarz@debian.org>  Sat, 12 Apr 1997 13:51:28 +0200
diff --git a/storage/xtradb/build/debian/compat b/storage/xtradb/build/debian/compat
new file mode 100644
index 00000000000..b8626c4cff2
--- /dev/null
+++ b/storage/xtradb/build/debian/compat
@@ -0,0 +1 @@
+4
diff --git a/storage/xtradb/build/debian/control b/storage/xtradb/build/debian/control
new file mode 100644
index 00000000000..4d229f20a2d
--- /dev/null
+++ b/storage/xtradb/build/debian/control
@@ -0,0 +1,118 @@
+Source: percona-xtradb-dfsg-5.1
+Section: misc
+Priority: optional
+Maintainer: Percona SQL Development Team <mysql-dev@percona.com>
+Uploaders: Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+Build-Depends: libtool (>= 1.4.2-7), procps | hurd,  debhelper (>= 4.1.16), file (>= 3.28-1), libncurses5-dev (>= 5.0-6), perl (>= 5.6.0), libwrap0-dev (>= 7.6-8.3), zlib1g-dev (>= 1:1.1.3-5), libreadline5-dev | libreadline-dev, psmisc, po-debconf, chrpath, automake1.9, doxygen, gs, dpatch, gawk, bison, lsb-release, fakeroot
+Standards-Version: 3.8.0
+Homepage: http://www.percona.com/
+Vcs-Browser: http://bazaar.launchpad.net/~percona-dev/percona-xtradb/release-1.0/files
+Vcs-Bzr: bzr+ssh://bazaar.launchpad.net/~percona-dev/percona-xtradb/release-1.0/
+
+Package: libpercona-xtradb-client16
+Section: libs
+Architecture: any
+Depends: percona-xtradb-common (>= ${source:Version}), ${shlibs:Depends}
+Description: Percona SQL database client library
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the client library.
+
+Package: libpercona-xtradb-client15-dev
+Architecture: all
+Section: libdevel
+Depends: libpercona-xtradb-client-dev (>= ${source:Version})
+Description: Percona SQL database development files - empty transitional package
+ This is an empty package that depends on libpercona-xtradb-client-dev to ease the
+ transition for packages with versioned build-deps on libpercona-xtradb-client15-dev.
+
+Package: libpercona-xtradb-client-dev
+Architecture: any
+Section: libdevel
+Depends: libpercona-xtradb-client16 (>= ${source:Version}), zlib1g-dev, ${shlibs:Depends}
+Conflicts: libmysqlclient14-dev, libmysqlclient12-dev, libmysqlclient10-dev, libmysqlclient15-dev, libmysqlclient16-dev
+Replaces: libmysqlclient14-dev, libmysqlclient12-dev, libmysqlclient10-dev, libmysqlclient15-dev, libmysqlclient16-dev
+Description: Percona SQL database development files
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes development libraries and header files.
+
+Package: percona-xtradb-common
+Section: database
+Architecture: all
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Conflicts: mysql-common-4.1, mysql-common-5.0, mysql-common-5.1, mysql-common
+Provides: mysql-common
+Replaces: mysql-common-4.1, mysql-common-5.0, mysql-common-5.1, mysql-common
+Description: Percona SQL database common files (e.g. /etc/mysql/my.cnf)
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes files needed by all versions of the client library
+ (e.g. /etc/mysql/my.cnf).
+
+Package: percona-xtradb-client-5.1
+Architecture: any
+Depends: debianutils (>=1.6), libdbi-perl, percona-xtradb-common (>= ${source:Version}), libpercona-xtradb-client16 (>= ${source:Version}), ${perl:Depends}, ${shlibs:Depends}, ${misc:Depends}
+Provides: virtual-mysql-client, mysql-client, mysql-client-4.1, percona-xtradb-client, percona-xtradb-client-5.1
+Conflicts: mysql-client (<< ${source:Version}), mysql-client-5.0, mysql-client-5.1, percona-xtradb-client-5.0
+Replaces: mysql-client (<< ${source:Version}), mysql-client-5.0, mysql-client-5.1, percona-xtradb-client-5.0
+Description: Percona SQL database client binaries
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the client binaries and the additional tools
+ innotop and mysqlreport.
+
+Package: percona-xtradb-server-5.1
+Architecture: any                                                               
+Suggests: tinyca
+Recommends: mailx, libhtml-template-perl
+Pre-Depends: percona-xtradb-common (>= ${source:Version}), adduser (>= 3.40), debconf
+Depends: percona-xtradb-client-5.1 (>= ${source:Version}), libdbi-perl, perl (>= 5.6), ${shlibs:Depends}, ${misc:Depends}, psmisc, passwd, lsb-base (>= 3.0-10)
+Conflicts: mysql-server (<< ${source:Version}), mysql-server-4.1, percona-xtradb-server-5.0
+Provides: mysql-server, virtual-mysql-server, mysql-server-5.0, percona-xtradb-server-5.1
+Replaces: mysql-server (<< ${source:Version}), mysql-server-5.0, percona-xtradb-server-5.0
+Description: Percona SQL database server binaries                                     
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+ .
+ This package includes the server binaries.
+
+Package: percona-xtradb-server
+Section: database
+Architecture: all
+Depends: percona-xtradb-server-5.1
+Description: Percona SQL database server (metapackage depending on the latest version)
+ This is an empty package that depends on the current "best" version of
+ percona-xtradb-server (currently percona-xtradb-server-5.1), as determined by the Percona SQL
+ maintainers. Install this package if in doubt about which Percona SQL
+ version you need. That will install the version recommended by the
+ package maintainers.
+ .
+ Percona SQL is a fast, stable and true multi-user, multi-threaded SQL database
+ server. SQL (Structured Query Language) is the most popular database query
+ language in the world. The main goals of Percona SQL are speed, robustness and
+ ease of use.
+
+Package: percona-xtradb-client
+Section: database
+Architecture: all
+Depends: percona-xtradb-client-5.1
+Description: Percona SQL database client (metapackage depending on the latest version)
+ This is an empty package that depends on the current "best" version of
+ percona-xtradb-client (currently percona-xtradb-client-5.1), as determined by the Percona SQL
+ maintainers.  Install this package if in doubt about which Percona SQL version
+ you want, as this is the one we consider to be in the best shape.
diff --git a/storage/xtradb/build/debian/copyright b/storage/xtradb/build/debian/copyright
new file mode 100644
index 00000000000..47fab54fc59
--- /dev/null
+++ b/storage/xtradb/build/debian/copyright
@@ -0,0 +1,169 @@
+
+== MySQL ==
+
+The Debian package of MySQL was first debianized on 1997-04-12 by Christian
+Schwarz <schwarz@debian.org> and has been maintained since 1999-04-20 by
+Christian Hammers <ch@debian.org>.
+
+It can be downloaded from http://www.mysql.com/
+
+Copyright:
+
+According to the file "COPYING" all parts of this package are licenced
+under the terms of the GNU GPL Version 2 of which a copy is available
+in /usr/share/common-licenses.
+
+To allow free software with other licences than the GPL to link against the 
+shared library, special terms for "derived works" are defined in the file
+"EXCEPTIONS-CLIENT" which is quoted below.
+
+More information can be found on http://www.mysql.com/company/legal/licensing/
+
+The manual had to be removed as it is not free in the sense of the
+Debian Free Software Guidelines (DFSG). 
+
+> Appendix I MySQL FLOSS License Exception
+> ****************************************
+> 
+>                      Version 0.3, 10 February 2005
+> 
+> The MySQL AB Exception for Free/Libre and Open Source Software-only
+> Applications Using MySQL Client Libraries (the "FLOSS Exception").
+> 
+> Exception Intent
+> ================
+> 
+> We want specified Free/Libre and Open Source Software ("FLOSS")
+> applications to be able to use specified GPL-licensed MySQL client
+> libraries (the "Program") despite the fact that not all FLOSS licenses
+> are compatible with version 2 of the GNU General Public License (the
+> "GPL").
+> 
+> Legal Terms and Conditions
+> ==========================
+> 
+> As a special exception to the terms and conditions of version 2.0 of the
+> GPL:
+> 
+>   1. You are free to distribute a Derivative Work that is formed
+>      entirely from the Program and one or more works (each, a "FLOSS
+>      Work") licensed under one or more of the licenses listed below in
+>      section 1, as long as:
+> 
+>        a. You obey the GPL in all respects for the Program and the
+>           Derivative Work, except for identifiable sections of the
+>           Derivative Work which are not derived from the Program, and
+>           which can reasonably be considered independent and separate
+>           works in themselves,
+> 
+>        b. all identifiable sections of the Derivative Work which are not
+>           derived from the Program, and which can reasonably be
+>           considered independent and separate works in themselves,
+> 
+>          i
+>                are distributed subject to one of the FLOSS licenses
+>                listed below, and
+> 
+>          ii
+>                the object code or executable form of those sections are
+>                accompanied by the complete corresponding
+>                machine-readable source code for those sections on the
+>                same medium and under the same FLOSS license as the
+>                corresponding object code or executable forms of those
+>                sections, and
+> 
+>        c. any works which are aggregated with the Program or with a
+>           Derivative Work on a volume of a storage or distribution
+>           medium in accordance with the GPL, can reasonably be
+>           considered independent and separate works in themselves which
+>           are not derivatives of either the Program, a Derivative Work
+>           or a FLOSS Work.
+> 
+>      If the above conditions are not met, then the Program may only be
+>      copied, modified, distributed or used under the terms and
+>      conditions of the GPL or another valid licensing option from MySQL
+>      AB.
+> 
+>   2. FLOSS License List
+> 
+>      *License name*                           *Version(s)/Copyright Date*
+>      Academic Free License                    2.0
+>      Apache Software License                  1.0/1.1/2.0
+>      Apple Public Source License              2.0
+>      Artistic license                         From Perl 5.8.0
+>      BSD license                              "July 22 1999"
+>      Common Public License                    1.0
+>      GNU Library or "Lesser" General Public   2.0/2.1
+>      License (LGPL)                           
+>      Jabber Open Source License               1.0
+>      MIT license                              -
+>      Mozilla Public License (MPL)             1.0/1.1
+>      Open Software License                    2.0
+>      OpenSSL license (with original SSLeay    "2003" ("1998")
+>      license)                                 
+>      PHP License                              3.0
+>      Python license (CNRI Python License)     -
+>      Python Software Foundation License       2.1.1
+>      Sleepycat License                        "1999"
+>      W3C License                              "2001"
+>      X11 License                              "2001"
+>      Zlib/libpng License                      -
+>      Zope Public License                      2.0
+> 
+>      Due to the many variants of some of the above licenses, we require
+>      that any version follow the 2003 version of the Free Software
+>      Foundation's Free Software Definition
+>      (`http://www.gnu.org/philosophy/free-sw.html') or version 1.9 of
+>      the Open Source Definition by the Open Source Initiative
+>      (`http://www.opensource.org/docs/definition.php').
+> 
+>   3. Definitions
+> 
+>        a. Terms used, but not defined, herein shall have the meaning
+>           provided in the GPL.
+> 
+>        b. Derivative Work means a derivative work under copyright law.
+> 
+>   4. Applicability This FLOSS Exception applies to all Programs that
+>      contain a notice placed by MySQL AB saying that the Program may be
+>      distributed under the terms of this FLOSS Exception.  If you
+>      create or distribute a work which is a Derivative Work of both the
+>      Program and any other work licensed under the GPL, then this FLOSS
+>      Exception is not available for that work; thus, you must remove
+>      the FLOSS Exception notice from that work and comply with the GPL
+>      in all respects, including by retaining all GPL notices.  You may
+>      choose to redistribute a copy of the Program exclusively under the
+>      terms of the GPL by removing the FLOSS Exception notice from that
+>      copy of the Program, provided that the copy has never been
+>      modified by you or any third party.
+ 
+
+== innotop ==
+
+Author: Baron Schwartz <baron@xaprb.com>
+URL:    http://innotop.sourceforge.net
+
+License:
+> This software is dual licensed, either GPL version 2 or Artistic License.
+>
+> This package is free software; you can redistribute it and/or modify
+> it under the terms of the GNU General Public License as published by
+> the Free Software Foundation; either version 2 of the License, or
+> (at your option) any later version.
+>
+> This package is distributed in the hope that it will be useful,
+> but WITHOUT ANY WARRANTY; without even the implied warranty of
+> MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+> GNU General Public License for more details.
+>
+> You should have received a copy of the GNU General Public License
+> along with this package; if not, write to the Free Software
+> Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+
+On Debian systems, the complete text of the GNU General Public License and the
+Artistic License can be found in `/usr/share/common-licenses/'.
+
+The upstream author explained here: http://bugs.gentoo.org/show_bug.cgi?id=14760
+that these licenses also apply to the following files:
+- innotop.html
+- InnoDBParser.pm
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer
new file mode 100644
index 00000000000..f24cdcd519d
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.README.Maintainer
@@ -0,0 +1,4 @@
+The examples directory includes files that might be needed by some
+developers:
+- header files not installed by default
+- the example file udf_example.c
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs
new file mode 100644
index 00000000000..f6ad2870431
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.dirs
@@ -0,0 +1,2 @@
+usr/include/
+usr/lib/
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples
new file mode 100644
index 00000000000..f1649c311c4
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.examples
@@ -0,0 +1 @@
+sql/udf_example.c
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files
new file mode 100644
index 00000000000..6803365b5ea
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.files
@@ -0,0 +1,7 @@
+usr/bin/mysql_config
+usr/include/mysql/*.h
+usr/lib/libmysqlclient.a
+usr/lib/libmysqlclient.la
+usr/lib/mysql/*.a
+usr/lib/mysql/*.la
+usr/share/man/man1/mysql_config.1
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links
new file mode 100644
index 00000000000..0481d1a0020
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client-dev.links
@@ -0,0 +1,2 @@
+usr/lib/libmysqlclient.so.16	usr/lib/libmysqlclient.so
+usr/lib/libmysqlclient_r.so.16	usr/lib/libmysqlclient_r.so
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs b/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs
new file mode 100644
index 00000000000..2964de6141b
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.dirs
@@ -0,0 +1 @@
+usr/lib/
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.docs b/storage/xtradb/build/debian/libpercona-xtradb-client16.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.files b/storage/xtradb/build/debian/libpercona-xtradb-client16.files
new file mode 100644
index 00000000000..5162b7b9639
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.files
@@ -0,0 +1 @@
+usr/lib/libmysqlclient*.so.*
diff --git a/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst b/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst
new file mode 100644
index 00000000000..29d3b86f978
--- /dev/null
+++ b/storage/xtradb/build/debian/libpercona-xtradb-client16.postinst
@@ -0,0 +1,12 @@
+#!/bin/bash -e
+
+# dh_installdeb will replace this with shell code automatically
+# generated by other debhelper scripts.
+
+#DEBHELPER#
+
+exit 0
+
+# vim: ts=4
+
+
diff --git a/storage/xtradb/build/debian/patches/00list b/storage/xtradb/build/debian/patches/00list
new file mode 100644
index 00000000000..5c50ab1ba53
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/00list
@@ -0,0 +1,6 @@
+33_scripts__mysql_create_system_tables__no_test.dpatch
+38_scripts__mysqld_safe.sh__signals.dpatch
+41_scripts__mysql_install_db.sh__no_test.dpatch
+44_scripts__mysql_config__libs.dpatch
+50_mysql-test__db_test.dpatch
+60_percona_support.dpatch
diff --git a/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch
new file mode 100644
index 00000000000..ca138afa746
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Images_Makefile.in.dpatch
@@ -0,0 +1,776 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 01_MAKEFILES__Docs_Images_Makefile.in.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Creates Docs/Images/Makefile.in
+
+@DPATCH@
+
+--- old/Docs/Images/Makefile.in	2005-03-01 02:08:01.877429040 +0100
++++ new/Docs/Images/Makefile.in	2005-02-28 21:21:24.000000000 +0100
+@@ -0,0 +1,765 @@
++# Makefile.in generated by automake 1.7.9 from Makefile.am.
++# @configure_input@
++
++# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
++# Free Software Foundation, Inc.
++# This Makefile.in is free software; the Free Software Foundation
++# gives unlimited permission to copy and/or distribute it,
++# with or without modifications, as long as this notice is preserved.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
++# PARTICULAR PURPOSE.
++
++@SET_MAKE@
++
++# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
++# 
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 2 of the License, or
++# (at your option) any later version.
++# 
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++# 
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++
++# Process this file with automake to create Makefile.in
++
++srcdir = @srcdir@
++top_srcdir = @top_srcdir@
++VPATH = @srcdir@
++pkgdatadir = $(datadir)/@PACKAGE@
++pkglibdir = $(libdir)/@PACKAGE@
++pkgincludedir = $(includedir)/@PACKAGE@
++top_builddir = .
++
++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
++INSTALL = @INSTALL@
++install_sh_DATA = $(install_sh) -c -m 644
++install_sh_PROGRAM = $(install_sh) -c
++install_sh_SCRIPT = $(install_sh) -c
++INSTALL_HEADER = $(INSTALL_DATA)
++transform = $(program_transform_name)
++NORMAL_INSTALL = :
++PRE_INSTALL = :
++POST_INSTALL = :
++NORMAL_UNINSTALL = :
++PRE_UNINSTALL = :
++POST_UNINSTALL = :
++build_triplet = @build@
++host_triplet = @host@
++target_triplet = @target@
++ACLOCAL = @ACLOCAL@
++ALLOCA = @ALLOCA@
++AMDEP_FALSE = @AMDEP_FALSE@
++AMDEP_TRUE = @AMDEP_TRUE@
++AMTAR = @AMTAR@
++AR = @AR@
++AS = @AS@
++ASSEMBLER_FALSE = @ASSEMBLER_FALSE@
++ASSEMBLER_TRUE = @ASSEMBLER_TRUE@
++ASSEMBLER_sparc32_FALSE = @ASSEMBLER_sparc32_FALSE@
++ASSEMBLER_sparc32_TRUE = @ASSEMBLER_sparc32_TRUE@
++ASSEMBLER_sparc64_FALSE = @ASSEMBLER_sparc64_FALSE@
++ASSEMBLER_sparc64_TRUE = @ASSEMBLER_sparc64_TRUE@
++ASSEMBLER_x86_FALSE = @ASSEMBLER_x86_FALSE@
++ASSEMBLER_x86_TRUE = @ASSEMBLER_x86_TRUE@
++AUTOCONF = @AUTOCONF@
++AUTOHEADER = @AUTOHEADER@
++AUTOMAKE = @AUTOMAKE@
++AVAILABLE_LANGUAGES = @AVAILABLE_LANGUAGES@
++AVAILABLE_LANGUAGES_ERRORS = @AVAILABLE_LANGUAGES_ERRORS@
++AWK = @AWK@
++CC = @CC@
++CCAS = @CCAS@
++CCASFLAGS = @CCASFLAGS@
++CCDEPMODE = @CCDEPMODE@
++CC_VERSION = @CC_VERSION@
++CFLAGS = @CFLAGS@
++CHARSETS_NEED_SOURCE = @CHARSETS_NEED_SOURCE@
++CHARSET_OBJS = @CHARSET_OBJS@
++CHARSET_SRCS = @CHARSET_SRCS@
++CHECK_PID = @CHECK_PID@
++CHMOD = @CHMOD@
++CLIENT_EXTRA_LDFLAGS = @CLIENT_EXTRA_LDFLAGS@
++CLIENT_LIBS = @CLIENT_LIBS@
++CMP = @CMP@
++COMPILATION_COMMENT = @COMPILATION_COMMENT@
++COMPILE_PSTACK_FALSE = @COMPILE_PSTACK_FALSE@
++COMPILE_PSTACK_TRUE = @COMPILE_PSTACK_TRUE@
++CONF_COMMAND = @CONF_COMMAND@
++CP = @CP@
++CPP = @CPP@
++CPPFLAGS = @CPPFLAGS@
++CXX = @CXX@
++CXXCPP = @CXXCPP@
++CXXDEPMODE = @CXXDEPMODE@
++CXXFLAGS = @CXXFLAGS@
++CXXLDFLAGS = @CXXLDFLAGS@
++CXX_VERSION = @CXX_VERSION@
++CYGPATH_W = @CYGPATH_W@
++DEFS = @DEFS@
++DEPDIR = @DEPDIR@
++DOT_FRM_VERSION = @DOT_FRM_VERSION@
++DVIS = @DVIS@
++ECHO = @ECHO@
++ECHO_C = @ECHO_C@
++ECHO_N = @ECHO_N@
++ECHO_T = @ECHO_T@
++EGREP = @EGREP@
++EXEEXT = @EXEEXT@
++F77 = @F77@
++FFLAGS = @FFLAGS@
++FIND_PROC = @FIND_PROC@
++GETCONF = @GETCONF@
++GXX = @GXX@
++HAVE_NETWARE_FALSE = @HAVE_NETWARE_FALSE@
++HAVE_NETWARE_TRUE = @HAVE_NETWARE_TRUE@
++HOSTNAME = @HOSTNAME@
++INSTALL_DATA = @INSTALL_DATA@
++INSTALL_PROGRAM = @INSTALL_PROGRAM@
++INSTALL_SCRIPT = @INSTALL_SCRIPT@
++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
++IS_LINUX = @IS_LINUX@
++KILL = @KILL@
++LD = @LD@
++LDFLAGS = @LDFLAGS@
++LIBDL = @LIBDL@
++LIBOBJS = @LIBOBJS@
++LIBS = @LIBS@
++LIBTOOL = @LIBTOOL@
++LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
++LM_CFLAGS = @LM_CFLAGS@
++LN = @LN@
++LN_CP_F = @LN_CP_F@
++LN_S = @LN_S@
++LOCAL_FALSE = @LOCAL_FALSE@
++LOCAL_TRUE = @LOCAL_TRUE@
++LTLIBOBJS = @LTLIBOBJS@
++MACHINE_TYPE = @MACHINE_TYPE@
++MAINT = @MAINT@
++MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
++MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
++MAKEINFO = @MAKEINFO@
++MAKE_BINARY_DISTRIBUTION_OPTIONS = @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++MAKE_SHELL = @MAKE_SHELL@
++MT_INCLUDES = @MT_INCLUDES@
++MT_LD_ADD = @MT_LD_ADD@
++MV = @MV@
++MYSQLD_DEFAULT_SWITCHES = @MYSQLD_DEFAULT_SWITCHES@
++MYSQLD_EXTRA_LDFLAGS = @MYSQLD_EXTRA_LDFLAGS@
++MYSQLD_USER = @MYSQLD_USER@
++MYSQL_BASE_VERSION = @MYSQL_BASE_VERSION@
++MYSQL_NO_DASH_VERSION = @MYSQL_NO_DASH_VERSION@
++MYSQL_SERVER_SUFFIX = @MYSQL_SERVER_SUFFIX@
++MYSQL_TCP_PORT = @MYSQL_TCP_PORT@
++MYSQL_TCP_PORT_DEFAULT = @MYSQL_TCP_PORT_DEFAULT@
++MYSQL_UNIX_ADDR = @MYSQL_UNIX_ADDR@
++MYSQL_VERSION_ID = @MYSQL_VERSION_ID@
++NOINST_LDFLAGS = @NOINST_LDFLAGS@
++OBJEXT = @OBJEXT@
++PACKAGE = @PACKAGE@
++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
++PACKAGE_NAME = @PACKAGE_NAME@
++PACKAGE_STRING = @PACKAGE_STRING@
++PACKAGE_TARNAME = @PACKAGE_TARNAME@
++PACKAGE_VERSION = @PACKAGE_VERSION@
++PATH_SEPARATOR = @PATH_SEPARATOR@
++PDFMANUAL = @PDFMANUAL@
++PERL = @PERL@
++PERL5 = @PERL5@
++PROTOCOL_VERSION = @PROTOCOL_VERSION@
++PS = @PS@
++RANLIB = @RANLIB@
++RM = @RM@
++SAVE_ASFLAGS = @SAVE_ASFLAGS@
++SAVE_CFLAGS = @SAVE_CFLAGS@
++SAVE_CXXFLAGS = @SAVE_CXXFLAGS@
++SAVE_CXXLDFLAGS = @SAVE_CXXLDFLAGS@
++SAVE_LDFLAGS = @SAVE_LDFLAGS@
++SED = @SED@
++SET_MAKE = @SET_MAKE@
++SHARED_LIB_VERSION = @SHARED_LIB_VERSION@
++SHELL = @SHELL@
++STRIP = @STRIP@
++SYSTEM_TYPE = @SYSTEM_TYPE@
++TAR = @TAR@
++TERMCAP_LIB = @TERMCAP_LIB@
++THREAD_LOBJECTS = @THREAD_LOBJECTS@
++THREAD_LPROGRAMS = @THREAD_LPROGRAMS@
++VERSION = @VERSION@
++WRAPLIBS = @WRAPLIBS@
++YACC = @YACC@
++ac_ct_AR = @ac_ct_AR@
++ac_ct_CC = @ac_ct_CC@
++ac_ct_CXX = @ac_ct_CXX@
++ac_ct_F77 = @ac_ct_F77@
++ac_ct_GETCONF = @ac_ct_GETCONF@
++ac_ct_RANLIB = @ac_ct_RANLIB@
++ac_ct_STRIP = @ac_ct_STRIP@
++am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
++am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
++am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
++am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
++am__include = @am__include@
++am__leading_dot = @am__leading_dot@
++am__quote = @am__quote@
++bdb_includes = @bdb_includes@
++bdb_libs = @bdb_libs@
++bdb_libs_with_path = @bdb_libs_with_path@
++bench_dirs = @bench_dirs@
++bindir = @bindir@
++build = @build@
++build_alias = @build_alias@
++build_cpu = @build_cpu@
++build_os = @build_os@
++build_vendor = @build_vendor@
++datadir = @datadir@
++default_charset = @default_charset@
++docs_dirs = @docs_dirs@
++exec_prefix = @exec_prefix@
++host = @host@
++host_alias = @host_alias@
++host_cpu = @host_cpu@
++host_os = @host_os@
++host_vendor = @host_vendor@
++includedir = @includedir@
++infodir = @infodir@
++innodb_includes = @innodb_includes@
++innodb_libs = @innodb_libs@
++innodb_system_libs = @innodb_system_libs@
++install_sh = @install_sh@
++isam_libs = @isam_libs@
++libdir = @libdir@
++libexecdir = @libexecdir@
++libmysqld_dirs = @libmysqld_dirs@
++linked_client_targets = @linked_client_targets@
++linked_netware_sources = @linked_netware_sources@
++localstatedir = @localstatedir@
++man_dirs = @man_dirs@
++mandir = @mandir@
++netware_dir = @netware_dir@
++oldincludedir = @oldincludedir@
++openssl_includes = @openssl_includes@
++openssl_libs = @openssl_libs@
++orbit_idl = @orbit_idl@
++orbit_includes = @orbit_includes@
++orbit_libs = @orbit_libs@
++prefix = @prefix@
++program_transform_name = @program_transform_name@
++pstack_dirs = @pstack_dirs@
++pstack_libs = @pstack_libs@
++readline_dir = @readline_dir@
++readline_link = @readline_link@
++sbindir = @sbindir@
++server_scripts = @server_scripts@
++sharedstatedir = @sharedstatedir@
++sql_client_dirs = @sql_client_dirs@
++sql_server_dirs = @sql_server_dirs@
++sysconfdir = @sysconfdir@
++target = @target@
++target_alias = @target_alias@
++target_cpu = @target_cpu@
++target_os = @target_os@
++target_vendor = @target_vendor@
++thread_dirs = @thread_dirs@
++tools_dirs = @tools_dirs@
++uname_prog = @uname_prog@
++vio_dir = @vio_dir@
++vio_libs = @vio_libs@
++
++AUTOMAKE_OPTIONS = foreign
++
++# These are built from source in the Docs directory
++EXTRA_DIST = INSTALL-SOURCE README COPYING EXCEPTIONS-CLIENT
++SUBDIRS = . include @docs_dirs@ @readline_dir@ \
++			@thread_dirs@ pstack @sql_client_dirs@ \
++			@sql_server_dirs@ scripts @man_dirs@ tests \
++			BUILD netware os2 @libmysqld_dirs@ \
++			@bench_dirs@ support-files @tools_dirs@
++
++
++# Relink after clean
++linked_sources = linked_client_sources linked_server_sources \
++		 linked_libmysql_sources linked_libmysql_r_sources \
++		 linked_libmysqld_sources  linked_libmysqldex_sources \
++		 linked_include_sources @linked_netware_sources@
++
++
++CLEANFILES = $(linked_sources)
++subdir = .
++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
++mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
++CONFIG_HEADER = config.h
++CONFIG_CLEAN_FILES = bdb/Makefile
++DIST_SOURCES =
++
++RECURSIVE_TARGETS = info-recursive dvi-recursive pdf-recursive \
++	ps-recursive install-info-recursive uninstall-info-recursive \
++	all-recursive install-data-recursive install-exec-recursive \
++	installdirs-recursive install-recursive uninstall-recursive \
++	check-recursive installcheck-recursive
++DIST_COMMON = README $(srcdir)/Makefile.in $(srcdir)/configure COPYING \
++	ChangeLog Makefile.am acconfig.h acinclude.m4 aclocal.m4 \
++	config.guess config.h.in config.sub configure configure.in \
++	depcomp install-sh ltconfig ltmain.sh missing mkinstalldirs
++DIST_SUBDIRS = $(SUBDIRS)
++all: config.h
++	$(MAKE) $(AM_MAKEFLAGS) all-recursive
++
++.SUFFIXES:
++
++am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
++ configure.lineno
++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am  $(top_srcdir)/configure.in $(ACLOCAL_M4)
++	cd $(top_srcdir) && \
++	  $(AUTOMAKE) --foreign  Makefile
++Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in  $(top_builddir)/config.status
++	cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)
++
++$(top_builddir)/config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
++	$(SHELL) ./config.status --recheck
++$(srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
++	cd $(srcdir) && $(AUTOCONF)
++
++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
++	cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
++
++stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
++	@rm -f stamp-h1
++	cd $(top_builddir) && $(SHELL) ./config.status config.h
++
++$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(top_srcdir)/configure.in $(ACLOCAL_M4) $(top_srcdir)/acconfig.h
++	cd $(top_srcdir) && $(AUTOHEADER)
++	touch $(srcdir)/config.h.in
++
++distclean-hdr:
++	-rm -f config.h stamp-h1
++bdb/Makefile: $(top_builddir)/config.status $(top_srcdir)/bdb/Makefile.in
++	cd $(top_builddir) && $(SHELL) ./config.status $@
++
++mostlyclean-libtool:
++	-rm -f *.lo
++
++clean-libtool:
++	-rm -rf .libs _libs
++
++distclean-libtool:
++	-rm -f libtool
++uninstall-info-am:
++
++# This directory's subdirectories are mostly independent; you can cd
++# into them and run `make' without going through this Makefile.
++# To change the values of `make' variables: instead of editing Makefiles,
++# (1) if the variable is set in `config.status', edit `config.status'
++#     (which will cause the Makefiles to be regenerated when you run `make');
++# (2) otherwise, pass the desired values on the `make' command line.
++$(RECURSIVE_TARGETS):
++	@set fnord $$MAKEFLAGS; amf=$$2; \
++	dot_seen=no; \
++	target=`echo $@ | sed s/-recursive//`; \
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  echo "Making $$target in $$subdir"; \
++	  if test "$$subdir" = "."; then \
++	    dot_seen=yes; \
++	    local_target="$$target-am"; \
++	  else \
++	    local_target="$$target"; \
++	  fi; \
++	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++	   || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++	done; \
++	if test "$$dot_seen" = "no"; then \
++	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
++	fi; test -z "$$fail"
++
++mostlyclean-recursive clean-recursive distclean-recursive \
++maintainer-clean-recursive:
++	@set fnord $$MAKEFLAGS; amf=$$2; \
++	dot_seen=no; \
++	case "$@" in \
++	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
++	  *) list='$(SUBDIRS)' ;; \
++	esac; \
++	rev=''; for subdir in $$list; do \
++	  if test "$$subdir" = "."; then :; else \
++	    rev="$$subdir $$rev"; \
++	  fi; \
++	done; \
++	rev="$$rev ."; \
++	target=`echo $@ | sed s/-recursive//`; \
++	for subdir in $$rev; do \
++	  echo "Making $$target in $$subdir"; \
++	  if test "$$subdir" = "."; then \
++	    local_target="$$target-am"; \
++	  else \
++	    local_target="$$target"; \
++	  fi; \
++	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++	   || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++	done && test -z "$$fail"
++tags-recursive:
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
++	done
++ctags-recursive:
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
++	done
++
++ETAGS = etags
++ETAGSFLAGS =
++
++CTAGS = ctags
++CTAGSFLAGS =
++
++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
++	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	mkid -fID $$unique
++
++TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++		$(TAGS_FILES) $(LISP)
++	tags=; \
++	here=`pwd`; \
++	if (etags --etags-include --version) >/dev/null 2>&1; then \
++	  include_option=--etags-include; \
++	else \
++	  include_option=--include; \
++	fi; \
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  if test "$$subdir" = .; then :; else \
++	    test -f $$subdir/TAGS && \
++	      tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
++	  fi; \
++	done; \
++	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	test -z "$(ETAGS_ARGS)$$tags$$unique" \
++	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
++	     $$tags $$unique
++
++ctags: CTAGS
++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++		$(TAGS_FILES) $(LISP)
++	tags=; \
++	here=`pwd`; \
++	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	test -z "$(CTAGS_ARGS)$$tags$$unique" \
++	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
++	     $$tags $$unique
++
++GTAGS:
++	here=`$(am__cd) $(top_builddir) && pwd` \
++	  && cd $(top_srcdir) \
++	  && gtags -i $(GTAGS_ARGS) $$here
++
++distclean-tags:
++	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
++
++top_distdir = .
++distdir = $(PACKAGE)-$(VERSION)
++
++am__remove_distdir = \
++  { test ! -d $(distdir) \
++    || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
++         && rm -fr $(distdir); }; }
++
++GZIP_ENV = --best
++distuninstallcheck_listfiles = find . -type f -print
++distcleancheck_listfiles = find . -type f -print
++
++distdir: $(DISTFILES)
++	$(am__remove_distdir)
++	mkdir $(distdir)
++	$(mkinstalldirs) $(distdir)/bdb $(distdir)/include
++	@srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
++	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
++	list='$(DISTFILES)'; for file in $$list; do \
++	  case $$file in \
++	    $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
++	    $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
++	  esac; \
++	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
++	  dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
++	  if test "$$dir" != "$$file" && test "$$dir" != "."; then \
++	    dir="/$$dir"; \
++	    $(mkinstalldirs) "$(distdir)$$dir"; \
++	  else \
++	    dir=''; \
++	  fi; \
++	  if test -d $$d/$$file; then \
++	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
++	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
++	    fi; \
++	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
++	  else \
++	    test -f $(distdir)/$$file \
++	    || cp -p $$d/$$file $(distdir)/$$file \
++	    || exit 1; \
++	  fi; \
++	done
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  if test "$$subdir" = .; then :; else \
++	    test -d $(distdir)/$$subdir \
++	    || mkdir $(distdir)/$$subdir \
++	    || exit 1; \
++	    (cd $$subdir && \
++	      $(MAKE) $(AM_MAKEFLAGS) \
++	        top_distdir="$(top_distdir)" \
++	        distdir=../$(distdir)/$$subdir \
++	        distdir) \
++	      || exit 1; \
++	  fi; \
++	done
++	$(MAKE) $(AM_MAKEFLAGS) \
++	  top_distdir="$(top_distdir)" distdir="$(distdir)" \
++	  dist-hook
++	-find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
++	  ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
++	  ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
++	  ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
++	|| chmod -R a+r $(distdir)
++dist-gzip: distdir
++	$(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++	$(am__remove_distdir)
++
++dist dist-all: distdir
++	$(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++	$(am__remove_distdir)
++
++# This target untars the dist file and tries a VPATH configuration.  Then
++# it guarantees that the distribution is self-contained by making another
++# tarfile.
++distcheck: dist
++	$(am__remove_distdir)
++	GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
++	chmod -R a-w $(distdir); chmod a+w $(distdir)
++	mkdir $(distdir)/_build
++	mkdir $(distdir)/_inst
++	chmod a-w $(distdir)
++	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
++	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
++	  && cd $(distdir)/_build \
++	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
++	    $(DISTCHECK_CONFIGURE_FLAGS) \
++	  && $(MAKE) $(AM_MAKEFLAGS) \
++	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
++	  && $(MAKE) $(AM_MAKEFLAGS) check \
++	  && $(MAKE) $(AM_MAKEFLAGS) install \
++	  && $(MAKE) $(AM_MAKEFLAGS) installcheck \
++	  && $(MAKE) $(AM_MAKEFLAGS) uninstall \
++	  && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
++	        distuninstallcheck \
++	  && chmod -R a-w "$$dc_install_base" \
++	  && ({ \
++	       (cd ../.. && $(mkinstalldirs) "$$dc_destdir") \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
++	            distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
++	      } || { rm -rf "$$dc_destdir"; exit 1; }) \
++	  && rm -rf "$$dc_destdir" \
++	  && $(MAKE) $(AM_MAKEFLAGS) dist-gzip \
++	  && rm -f $(distdir).tar.gz \
++	  && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
++	$(am__remove_distdir)
++	@echo "$(distdir).tar.gz is ready for distribution" | \
++	  sed 'h;s/./=/g;p;x;p;x'
++distuninstallcheck:
++	@cd $(distuninstallcheck_dir) \
++	&& test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
++	   || { echo "ERROR: files left after uninstall:" ; \
++	        if test -n "$(DESTDIR)"; then \
++	          echo "  (check DESTDIR support)"; \
++	        fi ; \
++	        $(distuninstallcheck_listfiles) ; \
++	        exit 1; } >&2
++distcleancheck: distclean
++	@if test '$(srcdir)' = . ; then \
++	  echo "ERROR: distcleancheck can only run from a VPATH build" ; \
++	  exit 1 ; \
++	fi
++	@test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
++	  || { echo "ERROR: files left in build directory after distclean:" ; \
++	       $(distcleancheck_listfiles) ; \
++	       exit 1; } >&2
++check-am: all-am
++check: check-recursive
++all-am: Makefile config.h
++installdirs: installdirs-recursive
++installdirs-am:
++
++install: install-recursive
++install-exec: install-exec-recursive
++install-data: install-data-recursive
++uninstall: uninstall-recursive
++
++install-am: all-am
++	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
++
++installcheck: installcheck-recursive
++install-strip:
++	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
++	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
++	  `test -z '$(STRIP)' || \
++	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
++mostlyclean-generic:
++
++clean-generic:
++	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
++
++distclean-generic:
++	-rm -f $(CONFIG_CLEAN_FILES)
++
++maintainer-clean-generic:
++	@echo "This command is intended for maintainers to use"
++	@echo "it deletes files that may require special tools to rebuild."
++clean: clean-recursive
++
++clean-am: clean-generic clean-libtool mostlyclean-am
++
++distclean: distclean-recursive
++	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
++	-rm -f Makefile
++distclean-am: clean-am distclean-generic distclean-hdr distclean-libtool \
++	distclean-tags
++
++dvi: dvi-recursive
++
++dvi-am:
++
++info: info-recursive
++
++info-am:
++
++install-data-am:
++
++install-exec-am:
++
++install-info: install-info-recursive
++
++install-man:
++
++installcheck-am:
++
++maintainer-clean: maintainer-clean-recursive
++	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
++	-rm -rf $(top_srcdir)/autom4te.cache
++	-rm -f Makefile
++maintainer-clean-am: distclean-am maintainer-clean-generic
++
++mostlyclean: mostlyclean-recursive
++
++mostlyclean-am: mostlyclean-generic mostlyclean-libtool
++
++pdf: pdf-recursive
++
++pdf-am:
++
++ps: ps-recursive
++
++ps-am:
++
++uninstall-am: uninstall-info-am
++
++uninstall-info: uninstall-info-recursive
++
++.PHONY: $(RECURSIVE_TARGETS) CTAGS GTAGS all all-am check check-am clean \
++	clean-generic clean-libtool clean-recursive ctags \
++	ctags-recursive dist dist-all dist-gzip distcheck distclean \
++	distclean-generic distclean-hdr distclean-libtool \
++	distclean-recursive distclean-tags distcleancheck distdir \
++	distuninstallcheck dvi dvi-am dvi-recursive info info-am \
++	info-recursive install install-am install-data install-data-am \
++	install-data-recursive install-exec install-exec-am \
++	install-exec-recursive install-info install-info-am \
++	install-info-recursive install-man install-recursive \
++	install-strip installcheck installcheck-am installdirs \
++	installdirs-am installdirs-recursive maintainer-clean \
++	maintainer-clean-generic maintainer-clean-recursive mostlyclean \
++	mostlyclean-generic mostlyclean-libtool mostlyclean-recursive \
++	pdf pdf-am pdf-recursive ps ps-am ps-recursive tags \
++	tags-recursive uninstall uninstall-am uninstall-info-am \
++	uninstall-info-recursive uninstall-recursive
++
++
++# This is just so that the linking is done early.
++config.h: $(linked_sources)
++
++linked_include_sources:
++	cd include; $(MAKE) link_sources
++	echo timestamp > linked_include_sources
++
++linked_client_sources:  @linked_client_targets@
++	cd client; $(MAKE) link_sources
++	echo timestamp > linked_client_sources
++
++linked_libmysql_sources:
++	cd libmysql; $(MAKE) link_sources
++	echo timestamp > linked_libmysql_sources
++
++linked_libmysql_r_sources: linked_libmysql_sources
++	cd libmysql_r; $(MAKE) link_sources
++	echo timestamp > linked_libmysql_r_sources
++
++linked_libmysqld_sources:
++	cd libmysqld; $(MAKE) link_sources
++	echo timestamp > linked_libmysqld_sources
++
++linked_libmysqldex_sources:
++	cd libmysqld/examples; $(MAKE) link_sources
++	echo timestamp > linked_libmysqldex_sources
++
++linked_netware_sources:
++	cd @netware_dir@; $(MAKE) link_sources
++	echo timestamp > linked_netware_sources
++
++#avoid recursive make calls in sql directory
++linked_server_sources:
++	cd sql; rm -f mini_client_errors.c;@LN_CP_F@ ../libmysql/errmsg.c mini_client_errors.c
++	echo timestamp > linked_server_sources
++
++# Create permission databases
++init-db:		all
++	$(top_builddir)/scripts/mysql_install_db
++
++bin-dist:		all
++	$(top_builddir)/scripts/make_binary_distribution @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++
++# Remove BK's "SCCS" subdirectories from source distribution
++dist-hook:
++	rm -rf `find $(distdir) -type d -name SCCS`
++
++tags:
++	support-files/build-tags
++.PHONY:		init-db bin-dist
++
++# Test installation
++
++test:
++	cd mysql-test ; ./mysql-test-run
++# Tell versions [3.59,3.63) of GNU make to not export all variables.
++# Otherwise a system limit (for SysV at least) may be exceeded.
++.NOEXPORT:
diff --git a/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch
new file mode 100644
index 00000000000..6440697bc02
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/01_MAKEFILES__Docs_Makefile.in.dpatch
@@ -0,0 +1,776 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 01_MAKEFILES__Docs_Makefile.in.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Creates Docs/Makefile.in
+
+@DPATCH@
+
+--- old/Docs/Makefile.in	2005-03-01 02:08:01.877429040 +0100
++++ new/Docs/Makefile.in	2005-02-28 21:21:24.000000000 +0100
+@@ -0,0 +1,765 @@
++# Makefile.in generated by automake 1.7.9 from Makefile.am.
++# @configure_input@
++
++# Copyright 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003
++# Free Software Foundation, Inc.
++# This Makefile.in is free software; the Free Software Foundation
++# gives unlimited permission to copy and/or distribute it,
++# with or without modifications, as long as this notice is preserved.
++
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
++# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
++# PARTICULAR PURPOSE.
++
++@SET_MAKE@
++
++# Copyright (C) 2000 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
++# 
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; either version 2 of the License, or
++# (at your option) any later version.
++# 
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++# 
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++
++# Process this file with automake to create Makefile.in
++
++srcdir = @srcdir@
++top_srcdir = @top_srcdir@
++VPATH = @srcdir@
++pkgdatadir = $(datadir)/@PACKAGE@
++pkglibdir = $(libdir)/@PACKAGE@
++pkgincludedir = $(includedir)/@PACKAGE@
++top_builddir = .
++
++am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
++INSTALL = @INSTALL@
++install_sh_DATA = $(install_sh) -c -m 644
++install_sh_PROGRAM = $(install_sh) -c
++install_sh_SCRIPT = $(install_sh) -c
++INSTALL_HEADER = $(INSTALL_DATA)
++transform = $(program_transform_name)
++NORMAL_INSTALL = :
++PRE_INSTALL = :
++POST_INSTALL = :
++NORMAL_UNINSTALL = :
++PRE_UNINSTALL = :
++POST_UNINSTALL = :
++build_triplet = @build@
++host_triplet = @host@
++target_triplet = @target@
++ACLOCAL = @ACLOCAL@
++ALLOCA = @ALLOCA@
++AMDEP_FALSE = @AMDEP_FALSE@
++AMDEP_TRUE = @AMDEP_TRUE@
++AMTAR = @AMTAR@
++AR = @AR@
++AS = @AS@
++ASSEMBLER_FALSE = @ASSEMBLER_FALSE@
++ASSEMBLER_TRUE = @ASSEMBLER_TRUE@
++ASSEMBLER_sparc32_FALSE = @ASSEMBLER_sparc32_FALSE@
++ASSEMBLER_sparc32_TRUE = @ASSEMBLER_sparc32_TRUE@
++ASSEMBLER_sparc64_FALSE = @ASSEMBLER_sparc64_FALSE@
++ASSEMBLER_sparc64_TRUE = @ASSEMBLER_sparc64_TRUE@
++ASSEMBLER_x86_FALSE = @ASSEMBLER_x86_FALSE@
++ASSEMBLER_x86_TRUE = @ASSEMBLER_x86_TRUE@
++AUTOCONF = @AUTOCONF@
++AUTOHEADER = @AUTOHEADER@
++AUTOMAKE = @AUTOMAKE@
++AVAILABLE_LANGUAGES = @AVAILABLE_LANGUAGES@
++AVAILABLE_LANGUAGES_ERRORS = @AVAILABLE_LANGUAGES_ERRORS@
++AWK = @AWK@
++CC = @CC@
++CCAS = @CCAS@
++CCASFLAGS = @CCASFLAGS@
++CCDEPMODE = @CCDEPMODE@
++CC_VERSION = @CC_VERSION@
++CFLAGS = @CFLAGS@
++CHARSETS_NEED_SOURCE = @CHARSETS_NEED_SOURCE@
++CHARSET_OBJS = @CHARSET_OBJS@
++CHARSET_SRCS = @CHARSET_SRCS@
++CHECK_PID = @CHECK_PID@
++CHMOD = @CHMOD@
++CLIENT_EXTRA_LDFLAGS = @CLIENT_EXTRA_LDFLAGS@
++CLIENT_LIBS = @CLIENT_LIBS@
++CMP = @CMP@
++COMPILATION_COMMENT = @COMPILATION_COMMENT@
++COMPILE_PSTACK_FALSE = @COMPILE_PSTACK_FALSE@
++COMPILE_PSTACK_TRUE = @COMPILE_PSTACK_TRUE@
++CONF_COMMAND = @CONF_COMMAND@
++CP = @CP@
++CPP = @CPP@
++CPPFLAGS = @CPPFLAGS@
++CXX = @CXX@
++CXXCPP = @CXXCPP@
++CXXDEPMODE = @CXXDEPMODE@
++CXXFLAGS = @CXXFLAGS@
++CXXLDFLAGS = @CXXLDFLAGS@
++CXX_VERSION = @CXX_VERSION@
++CYGPATH_W = @CYGPATH_W@
++DEFS = @DEFS@
++DEPDIR = @DEPDIR@
++DOT_FRM_VERSION = @DOT_FRM_VERSION@
++DVIS = @DVIS@
++ECHO = @ECHO@
++ECHO_C = @ECHO_C@
++ECHO_N = @ECHO_N@
++ECHO_T = @ECHO_T@
++EGREP = @EGREP@
++EXEEXT = @EXEEXT@
++F77 = @F77@
++FFLAGS = @FFLAGS@
++FIND_PROC = @FIND_PROC@
++GETCONF = @GETCONF@
++GXX = @GXX@
++HAVE_NETWARE_FALSE = @HAVE_NETWARE_FALSE@
++HAVE_NETWARE_TRUE = @HAVE_NETWARE_TRUE@
++HOSTNAME = @HOSTNAME@
++INSTALL_DATA = @INSTALL_DATA@
++INSTALL_PROGRAM = @INSTALL_PROGRAM@
++INSTALL_SCRIPT = @INSTALL_SCRIPT@
++INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
++IS_LINUX = @IS_LINUX@
++KILL = @KILL@
++LD = @LD@
++LDFLAGS = @LDFLAGS@
++LIBDL = @LIBDL@
++LIBOBJS = @LIBOBJS@
++LIBS = @LIBS@
++LIBTOOL = @LIBTOOL@
++LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
++LM_CFLAGS = @LM_CFLAGS@
++LN = @LN@
++LN_CP_F = @LN_CP_F@
++LN_S = @LN_S@
++LOCAL_FALSE = @LOCAL_FALSE@
++LOCAL_TRUE = @LOCAL_TRUE@
++LTLIBOBJS = @LTLIBOBJS@
++MACHINE_TYPE = @MACHINE_TYPE@
++MAINT = @MAINT@
++MAINTAINER_MODE_FALSE = @MAINTAINER_MODE_FALSE@
++MAINTAINER_MODE_TRUE = @MAINTAINER_MODE_TRUE@
++MAKEINFO = @MAKEINFO@
++MAKE_BINARY_DISTRIBUTION_OPTIONS = @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++MAKE_SHELL = @MAKE_SHELL@
++MT_INCLUDES = @MT_INCLUDES@
++MT_LD_ADD = @MT_LD_ADD@
++MV = @MV@
++MYSQLD_DEFAULT_SWITCHES = @MYSQLD_DEFAULT_SWITCHES@
++MYSQLD_EXTRA_LDFLAGS = @MYSQLD_EXTRA_LDFLAGS@
++MYSQLD_USER = @MYSQLD_USER@
++MYSQL_BASE_VERSION = @MYSQL_BASE_VERSION@
++MYSQL_NO_DASH_VERSION = @MYSQL_NO_DASH_VERSION@
++MYSQL_SERVER_SUFFIX = @MYSQL_SERVER_SUFFIX@
++MYSQL_TCP_PORT = @MYSQL_TCP_PORT@
++MYSQL_TCP_PORT_DEFAULT = @MYSQL_TCP_PORT_DEFAULT@
++MYSQL_UNIX_ADDR = @MYSQL_UNIX_ADDR@
++MYSQL_VERSION_ID = @MYSQL_VERSION_ID@
++NOINST_LDFLAGS = @NOINST_LDFLAGS@
++OBJEXT = @OBJEXT@
++PACKAGE = @PACKAGE@
++PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
++PACKAGE_NAME = @PACKAGE_NAME@
++PACKAGE_STRING = @PACKAGE_STRING@
++PACKAGE_TARNAME = @PACKAGE_TARNAME@
++PACKAGE_VERSION = @PACKAGE_VERSION@
++PATH_SEPARATOR = @PATH_SEPARATOR@
++PDFMANUAL = @PDFMANUAL@
++PERL = @PERL@
++PERL5 = @PERL5@
++PROTOCOL_VERSION = @PROTOCOL_VERSION@
++PS = @PS@
++RANLIB = @RANLIB@
++RM = @RM@
++SAVE_ASFLAGS = @SAVE_ASFLAGS@
++SAVE_CFLAGS = @SAVE_CFLAGS@
++SAVE_CXXFLAGS = @SAVE_CXXFLAGS@
++SAVE_CXXLDFLAGS = @SAVE_CXXLDFLAGS@
++SAVE_LDFLAGS = @SAVE_LDFLAGS@
++SED = @SED@
++SET_MAKE = @SET_MAKE@
++SHARED_LIB_VERSION = @SHARED_LIB_VERSION@
++SHELL = @SHELL@
++STRIP = @STRIP@
++SYSTEM_TYPE = @SYSTEM_TYPE@
++TAR = @TAR@
++TERMCAP_LIB = @TERMCAP_LIB@
++THREAD_LOBJECTS = @THREAD_LOBJECTS@
++THREAD_LPROGRAMS = @THREAD_LPROGRAMS@
++VERSION = @VERSION@
++WRAPLIBS = @WRAPLIBS@
++YACC = @YACC@
++ac_ct_AR = @ac_ct_AR@
++ac_ct_CC = @ac_ct_CC@
++ac_ct_CXX = @ac_ct_CXX@
++ac_ct_F77 = @ac_ct_F77@
++ac_ct_GETCONF = @ac_ct_GETCONF@
++ac_ct_RANLIB = @ac_ct_RANLIB@
++ac_ct_STRIP = @ac_ct_STRIP@
++am__fastdepCC_FALSE = @am__fastdepCC_FALSE@
++am__fastdepCC_TRUE = @am__fastdepCC_TRUE@
++am__fastdepCXX_FALSE = @am__fastdepCXX_FALSE@
++am__fastdepCXX_TRUE = @am__fastdepCXX_TRUE@
++am__include = @am__include@
++am__leading_dot = @am__leading_dot@
++am__quote = @am__quote@
++bdb_includes = @bdb_includes@
++bdb_libs = @bdb_libs@
++bdb_libs_with_path = @bdb_libs_with_path@
++bench_dirs = @bench_dirs@
++bindir = @bindir@
++build = @build@
++build_alias = @build_alias@
++build_cpu = @build_cpu@
++build_os = @build_os@
++build_vendor = @build_vendor@
++datadir = @datadir@
++default_charset = @default_charset@
++docs_dirs = @docs_dirs@
++exec_prefix = @exec_prefix@
++host = @host@
++host_alias = @host_alias@
++host_cpu = @host_cpu@
++host_os = @host_os@
++host_vendor = @host_vendor@
++includedir = @includedir@
++infodir = @infodir@
++innodb_includes = @innodb_includes@
++innodb_libs = @innodb_libs@
++innodb_system_libs = @innodb_system_libs@
++install_sh = @install_sh@
++isam_libs = @isam_libs@
++libdir = @libdir@
++libexecdir = @libexecdir@
++libmysqld_dirs = @libmysqld_dirs@
++linked_client_targets = @linked_client_targets@
++linked_netware_sources = @linked_netware_sources@
++localstatedir = @localstatedir@
++man_dirs = @man_dirs@
++mandir = @mandir@
++netware_dir = @netware_dir@
++oldincludedir = @oldincludedir@
++openssl_includes = @openssl_includes@
++openssl_libs = @openssl_libs@
++orbit_idl = @orbit_idl@
++orbit_includes = @orbit_includes@
++orbit_libs = @orbit_libs@
++prefix = @prefix@
++program_transform_name = @program_transform_name@
++pstack_dirs = @pstack_dirs@
++pstack_libs = @pstack_libs@
++readline_dir = @readline_dir@
++readline_link = @readline_link@
++sbindir = @sbindir@
++server_scripts = @server_scripts@
++sharedstatedir = @sharedstatedir@
++sql_client_dirs = @sql_client_dirs@
++sql_server_dirs = @sql_server_dirs@
++sysconfdir = @sysconfdir@
++target = @target@
++target_alias = @target_alias@
++target_cpu = @target_cpu@
++target_os = @target_os@
++target_vendor = @target_vendor@
++thread_dirs = @thread_dirs@
++tools_dirs = @tools_dirs@
++uname_prog = @uname_prog@
++vio_dir = @vio_dir@
++vio_libs = @vio_libs@
++
++AUTOMAKE_OPTIONS = foreign
++
++# These are built from source in the Docs directory
++EXTRA_DIST = INSTALL-SOURCE README COPYING EXCEPTIONS-CLIENT
++SUBDIRS = . include @docs_dirs@ @readline_dir@ \
++			@thread_dirs@ pstack @sql_client_dirs@ \
++			@sql_server_dirs@ scripts @man_dirs@ tests \
++			BUILD netware os2 @libmysqld_dirs@ \
++			@bench_dirs@ support-files @tools_dirs@
++
++
++# Relink after clean
++linked_sources = linked_client_sources linked_server_sources \
++		 linked_libmysql_sources linked_libmysql_r_sources \
++		 linked_libmysqld_sources  linked_libmysqldex_sources \
++		 linked_include_sources @linked_netware_sources@
++
++
++CLEANFILES = $(linked_sources)
++subdir = .
++ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
++mkinstalldirs = $(SHELL) $(top_srcdir)/mkinstalldirs
++CONFIG_HEADER = config.h
++CONFIG_CLEAN_FILES = bdb/Makefile
++DIST_SOURCES =
++
++RECURSIVE_TARGETS = info-recursive dvi-recursive pdf-recursive \
++	ps-recursive install-info-recursive uninstall-info-recursive \
++	all-recursive install-data-recursive install-exec-recursive \
++	installdirs-recursive install-recursive uninstall-recursive \
++	check-recursive installcheck-recursive
++DIST_COMMON = README $(srcdir)/Makefile.in $(srcdir)/configure COPYING \
++	ChangeLog Makefile.am acconfig.h acinclude.m4 aclocal.m4 \
++	config.guess config.h.in config.sub configure configure.in \
++	depcomp install-sh ltconfig ltmain.sh missing mkinstalldirs
++DIST_SUBDIRS = $(SUBDIRS)
++all: config.h
++	$(MAKE) $(AM_MAKEFLAGS) all-recursive
++
++.SUFFIXES:
++
++am__CONFIG_DISTCLEAN_FILES = config.status config.cache config.log \
++ configure.lineno
++$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ Makefile.am  $(top_srcdir)/configure.in $(ACLOCAL_M4)
++	cd $(top_srcdir) && \
++	  $(AUTOMAKE) --foreign  Makefile
++Makefile: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.in  $(top_builddir)/config.status
++	cd $(top_builddir) && $(SHELL) ./config.status $@ $(am__depfiles_maybe)
++
++$(top_builddir)/config.status: $(srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
++	$(SHELL) ./config.status --recheck
++$(srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(srcdir)/configure.in $(ACLOCAL_M4) $(CONFIGURE_DEPENDENCIES)
++	cd $(srcdir) && $(AUTOCONF)
++
++$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ configure.in acinclude.m4
++	cd $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
++
++stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
++	@rm -f stamp-h1
++	cd $(top_builddir) && $(SHELL) ./config.status config.h
++
++$(srcdir)/config.h.in: @MAINTAINER_MODE_TRUE@ $(top_srcdir)/configure.in $(ACLOCAL_M4) $(top_srcdir)/acconfig.h
++	cd $(top_srcdir) && $(AUTOHEADER)
++	touch $(srcdir)/config.h.in
++
++distclean-hdr:
++	-rm -f config.h stamp-h1
++bdb/Makefile: $(top_builddir)/config.status $(top_srcdir)/bdb/Makefile.in
++	cd $(top_builddir) && $(SHELL) ./config.status $@
++
++mostlyclean-libtool:
++	-rm -f *.lo
++
++clean-libtool:
++	-rm -rf .libs _libs
++
++distclean-libtool:
++	-rm -f libtool
++uninstall-info-am:
++
++# This directory's subdirectories are mostly independent; you can cd
++# into them and run `make' without going through this Makefile.
++# To change the values of `make' variables: instead of editing Makefiles,
++# (1) if the variable is set in `config.status', edit `config.status'
++#     (which will cause the Makefiles to be regenerated when you run `make');
++# (2) otherwise, pass the desired values on the `make' command line.
++$(RECURSIVE_TARGETS):
++	@set fnord $$MAKEFLAGS; amf=$$2; \
++	dot_seen=no; \
++	target=`echo $@ | sed s/-recursive//`; \
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  echo "Making $$target in $$subdir"; \
++	  if test "$$subdir" = "."; then \
++	    dot_seen=yes; \
++	    local_target="$$target-am"; \
++	  else \
++	    local_target="$$target"; \
++	  fi; \
++	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++	   || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++	done; \
++	if test "$$dot_seen" = "no"; then \
++	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
++	fi; test -z "$$fail"
++
++mostlyclean-recursive clean-recursive distclean-recursive \
++maintainer-clean-recursive:
++	@set fnord $$MAKEFLAGS; amf=$$2; \
++	dot_seen=no; \
++	case "$@" in \
++	  distclean-* | maintainer-clean-*) list='$(DIST_SUBDIRS)' ;; \
++	  *) list='$(SUBDIRS)' ;; \
++	esac; \
++	rev=''; for subdir in $$list; do \
++	  if test "$$subdir" = "."; then :; else \
++	    rev="$$subdir $$rev"; \
++	  fi; \
++	done; \
++	rev="$$rev ."; \
++	target=`echo $@ | sed s/-recursive//`; \
++	for subdir in $$rev; do \
++	  echo "Making $$target in $$subdir"; \
++	  if test "$$subdir" = "."; then \
++	    local_target="$$target-am"; \
++	  else \
++	    local_target="$$target"; \
++	  fi; \
++	  (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) $$local_target) \
++	   || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \
++	done && test -z "$$fail"
++tags-recursive:
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
++	done
++ctags-recursive:
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  test "$$subdir" = . || (cd $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
++	done
++
++ETAGS = etags
++ETAGSFLAGS =
++
++CTAGS = ctags
++CTAGSFLAGS =
++
++ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
++	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	mkid -fID $$unique
++
++TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++		$(TAGS_FILES) $(LISP)
++	tags=; \
++	here=`pwd`; \
++	if (etags --etags-include --version) >/dev/null 2>&1; then \
++	  include_option=--etags-include; \
++	else \
++	  include_option=--include; \
++	fi; \
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  if test "$$subdir" = .; then :; else \
++	    test -f $$subdir/TAGS && \
++	      tags="$$tags $$include_option=$$here/$$subdir/TAGS"; \
++	  fi; \
++	done; \
++	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	test -z "$(ETAGS_ARGS)$$tags$$unique" \
++	  || $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
++	     $$tags $$unique
++
++ctags: CTAGS
++CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
++		$(TAGS_FILES) $(LISP)
++	tags=; \
++	here=`pwd`; \
++	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
++	unique=`for i in $$list; do \
++	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
++	  done | \
++	  $(AWK) '    { files[$$0] = 1; } \
++	       END { for (i in files) print i; }'`; \
++	test -z "$(CTAGS_ARGS)$$tags$$unique" \
++	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
++	     $$tags $$unique
++
++GTAGS:
++	here=`$(am__cd) $(top_builddir) && pwd` \
++	  && cd $(top_srcdir) \
++	  && gtags -i $(GTAGS_ARGS) $$here
++
++distclean-tags:
++	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
++DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
++
++top_distdir = .
++distdir = $(PACKAGE)-$(VERSION)
++
++am__remove_distdir = \
++  { test ! -d $(distdir) \
++    || { find $(distdir) -type d ! -perm -200 -exec chmod u+w {} ';' \
++         && rm -fr $(distdir); }; }
++
++GZIP_ENV = --best
++distuninstallcheck_listfiles = find . -type f -print
++distcleancheck_listfiles = find . -type f -print
++
++distdir: $(DISTFILES)
++	$(am__remove_distdir)
++	mkdir $(distdir)
++	$(mkinstalldirs) $(distdir)/bdb $(distdir)/include
++	@srcdirstrip=`echo "$(srcdir)" | sed 's|.|.|g'`; \
++	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's|.|.|g'`; \
++	list='$(DISTFILES)'; for file in $$list; do \
++	  case $$file in \
++	    $(srcdir)/*) file=`echo "$$file" | sed "s|^$$srcdirstrip/||"`;; \
++	    $(top_srcdir)/*) file=`echo "$$file" | sed "s|^$$topsrcdirstrip/|$(top_builddir)/|"`;; \
++	  esac; \
++	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
++	  dir=`echo "$$file" | sed -e 's,/[^/]*$$,,'`; \
++	  if test "$$dir" != "$$file" && test "$$dir" != "."; then \
++	    dir="/$$dir"; \
++	    $(mkinstalldirs) "$(distdir)$$dir"; \
++	  else \
++	    dir=''; \
++	  fi; \
++	  if test -d $$d/$$file; then \
++	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
++	      cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \
++	    fi; \
++	    cp -pR $$d/$$file $(distdir)$$dir || exit 1; \
++	  else \
++	    test -f $(distdir)/$$file \
++	    || cp -p $$d/$$file $(distdir)/$$file \
++	    || exit 1; \
++	  fi; \
++	done
++	list='$(SUBDIRS)'; for subdir in $$list; do \
++	  if test "$$subdir" = .; then :; else \
++	    test -d $(distdir)/$$subdir \
++	    || mkdir $(distdir)/$$subdir \
++	    || exit 1; \
++	    (cd $$subdir && \
++	      $(MAKE) $(AM_MAKEFLAGS) \
++	        top_distdir="$(top_distdir)" \
++	        distdir=../$(distdir)/$$subdir \
++	        distdir) \
++	      || exit 1; \
++	  fi; \
++	done
++	$(MAKE) $(AM_MAKEFLAGS) \
++	  top_distdir="$(top_distdir)" distdir="$(distdir)" \
++	  dist-hook
++	-find $(distdir) -type d ! -perm -777 -exec chmod a+rwx {} \; -o \
++	  ! -type d ! -perm -444 -links 1 -exec chmod a+r {} \; -o \
++	  ! -type d ! -perm -400 -exec chmod a+r {} \; -o \
++	  ! -type d ! -perm -444 -exec $(SHELL) $(install_sh) -c -m a+r {} {} \; \
++	|| chmod -R a+r $(distdir)
++dist-gzip: distdir
++	$(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++	$(am__remove_distdir)
++
++dist dist-all: distdir
++	$(AMTAR) chof - $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).tar.gz
++	$(am__remove_distdir)
++
++# This target untars the dist file and tries a VPATH configuration.  Then
++# it guarantees that the distribution is self-contained by making another
++# tarfile.
++distcheck: dist
++	$(am__remove_distdir)
++	GZIP=$(GZIP_ENV) gunzip -c $(distdir).tar.gz | $(AMTAR) xf -
++	chmod -R a-w $(distdir); chmod a+w $(distdir)
++	mkdir $(distdir)/_build
++	mkdir $(distdir)/_inst
++	chmod a-w $(distdir)
++	dc_install_base=`$(am__cd) $(distdir)/_inst && pwd | sed -e 's,^[^:\\/]:[\\/],/,'` \
++	  && dc_destdir="$${TMPDIR-/tmp}/am-dc-$$$$/" \
++	  && cd $(distdir)/_build \
++	  && ../configure --srcdir=.. --prefix="$$dc_install_base" \
++	    $(DISTCHECK_CONFIGURE_FLAGS) \
++	  && $(MAKE) $(AM_MAKEFLAGS) \
++	  && $(MAKE) $(AM_MAKEFLAGS) dvi \
++	  && $(MAKE) $(AM_MAKEFLAGS) check \
++	  && $(MAKE) $(AM_MAKEFLAGS) install \
++	  && $(MAKE) $(AM_MAKEFLAGS) installcheck \
++	  && $(MAKE) $(AM_MAKEFLAGS) uninstall \
++	  && $(MAKE) $(AM_MAKEFLAGS) distuninstallcheck_dir="$$dc_install_base" \
++	        distuninstallcheck \
++	  && chmod -R a-w "$$dc_install_base" \
++	  && ({ \
++	       (cd ../.. && $(mkinstalldirs) "$$dc_destdir") \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" install \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" uninstall \
++	       && $(MAKE) $(AM_MAKEFLAGS) DESTDIR="$$dc_destdir" \
++	            distuninstallcheck_dir="$$dc_destdir" distuninstallcheck; \
++	      } || { rm -rf "$$dc_destdir"; exit 1; }) \
++	  && rm -rf "$$dc_destdir" \
++	  && $(MAKE) $(AM_MAKEFLAGS) dist-gzip \
++	  && rm -f $(distdir).tar.gz \
++	  && $(MAKE) $(AM_MAKEFLAGS) distcleancheck
++	$(am__remove_distdir)
++	@echo "$(distdir).tar.gz is ready for distribution" | \
++	  sed 'h;s/./=/g;p;x;p;x'
++distuninstallcheck:
++	@cd $(distuninstallcheck_dir) \
++	&& test `$(distuninstallcheck_listfiles) | wc -l` -le 1 \
++	   || { echo "ERROR: files left after uninstall:" ; \
++	        if test -n "$(DESTDIR)"; then \
++	          echo "  (check DESTDIR support)"; \
++	        fi ; \
++	        $(distuninstallcheck_listfiles) ; \
++	        exit 1; } >&2
++distcleancheck: distclean
++	@if test '$(srcdir)' = . ; then \
++	  echo "ERROR: distcleancheck can only run from a VPATH build" ; \
++	  exit 1 ; \
++	fi
++	@test `$(distcleancheck_listfiles) | wc -l` -eq 0 \
++	  || { echo "ERROR: files left in build directory after distclean:" ; \
++	       $(distcleancheck_listfiles) ; \
++	       exit 1; } >&2
++check-am: all-am
++check: check-recursive
++all-am: Makefile config.h
++installdirs: installdirs-recursive
++installdirs-am:
++
++install: install-recursive
++install-exec: install-exec-recursive
++install-data: install-data-recursive
++uninstall: uninstall-recursive
++
++install-am: all-am
++	@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
++
++installcheck: installcheck-recursive
++install-strip:
++	$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
++	  install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
++	  `test -z '$(STRIP)' || \
++	    echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install
++mostlyclean-generic:
++
++clean-generic:
++	-test -z "$(CLEANFILES)" || rm -f $(CLEANFILES)
++
++distclean-generic:
++	-rm -f $(CONFIG_CLEAN_FILES)
++
++maintainer-clean-generic:
++	@echo "This command is intended for maintainers to use"
++	@echo "it deletes files that may require special tools to rebuild."
++clean: clean-recursive
++
++clean-am: clean-generic clean-libtool mostlyclean-am
++
++distclean: distclean-recursive
++	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
++	-rm -f Makefile
++distclean-am: clean-am distclean-generic distclean-hdr distclean-libtool \
++	distclean-tags
++
++dvi: dvi-recursive
++
++dvi-am:
++
++info: info-recursive
++
++info-am:
++
++install-data-am:
++
++install-exec-am:
++
++install-info: install-info-recursive
++
++install-man:
++
++installcheck-am:
++
++maintainer-clean: maintainer-clean-recursive
++	-rm -f $(am__CONFIG_DISTCLEAN_FILES)
++	-rm -rf $(top_srcdir)/autom4te.cache
++	-rm -f Makefile
++maintainer-clean-am: distclean-am maintainer-clean-generic
++
++mostlyclean: mostlyclean-recursive
++
++mostlyclean-am: mostlyclean-generic mostlyclean-libtool
++
++pdf: pdf-recursive
++
++pdf-am:
++
++ps: ps-recursive
++
++ps-am:
++
++uninstall-am: uninstall-info-am
++
++uninstall-info: uninstall-info-recursive
++
++.PHONY: $(RECURSIVE_TARGETS) CTAGS GTAGS all all-am check check-am clean \
++	clean-generic clean-libtool clean-recursive ctags \
++	ctags-recursive dist dist-all dist-gzip distcheck distclean \
++	distclean-generic distclean-hdr distclean-libtool \
++	distclean-recursive distclean-tags distcleancheck distdir \
++	distuninstallcheck dvi dvi-am dvi-recursive info info-am \
++	info-recursive install install-am install-data install-data-am \
++	install-data-recursive install-exec install-exec-am \
++	install-exec-recursive install-info install-info-am \
++	install-info-recursive install-man install-recursive \
++	install-strip installcheck installcheck-am installdirs \
++	installdirs-am installdirs-recursive maintainer-clean \
++	maintainer-clean-generic maintainer-clean-recursive mostlyclean \
++	mostlyclean-generic mostlyclean-libtool mostlyclean-recursive \
++	pdf pdf-am pdf-recursive ps ps-am ps-recursive tags \
++	tags-recursive uninstall uninstall-am uninstall-info-am \
++	uninstall-info-recursive uninstall-recursive
++
++
++# This is just so that the linking is done early.
++config.h: $(linked_sources)
++
++linked_include_sources:
++	cd include; $(MAKE) link_sources
++	echo timestamp > linked_include_sources
++
++linked_client_sources:  @linked_client_targets@
++	cd client; $(MAKE) link_sources
++	echo timestamp > linked_client_sources
++
++linked_libmysql_sources:
++	cd libmysql; $(MAKE) link_sources
++	echo timestamp > linked_libmysql_sources
++
++linked_libmysql_r_sources: linked_libmysql_sources
++	cd libmysql_r; $(MAKE) link_sources
++	echo timestamp > linked_libmysql_r_sources
++
++linked_libmysqld_sources:
++	cd libmysqld; $(MAKE) link_sources
++	echo timestamp > linked_libmysqld_sources
++
++linked_libmysqldex_sources:
++	cd libmysqld/examples; $(MAKE) link_sources
++	echo timestamp > linked_libmysqldex_sources
++
++linked_netware_sources:
++	cd @netware_dir@; $(MAKE) link_sources
++	echo timestamp > linked_netware_sources
++
++#avoid recursive make calls in sql directory
++linked_server_sources:
++	cd sql; rm -f mini_client_errors.c;@LN_CP_F@ ../libmysql/errmsg.c mini_client_errors.c
++	echo timestamp > linked_server_sources
++
++# Create permission databases
++init-db:		all
++	$(top_builddir)/scripts/mysql_install_db
++
++bin-dist:		all
++	$(top_builddir)/scripts/make_binary_distribution @MAKE_BINARY_DISTRIBUTION_OPTIONS@
++
++# Remove BK's "SCCS" subdirectories from source distribution
++dist-hook:
++	rm -rf `find $(distdir) -type d -name SCCS`
++
++tags:
++	support-files/build-tags
++.PHONY:		init-db bin-dist
++
++# Test installation
++
++test:
++	cd mysql-test ; ./mysql-test-run
++# Tell versions [3.59,3.63) of GNU make to not export all variables.
++# Otherwise a system limit (for SysV at least) may be exceeded.
++.NOEXPORT:
diff --git a/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch b/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch
new file mode 100644
index 00000000000..0fd166d1fc7
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/33_scripts__mysql_create_system_tables__no_test.dpatch
@@ -0,0 +1,29 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 33_scripts__mysql_create_system_tables__no_test.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: scripts__mysql_create_system_tables__no_test
+## DP: A user with no password prevents a normal user from logging in under certain
+## DP: circumstances as it is checked first. See #301741.
+## DP: http://bugs.mysql.com/bug.php?id=6901
+
+@DPATCH@
+--- old/scripts/mysql_system_tables_data.sql	2008-12-04 22:59:44.000000000 +0100
++++ new/scripts/mysql_system_tables_data.sql	2008-12-04 23:00:07.000000000 +0100
+@@ -11,8 +11,6 @@
+ -- Fill "db" table with default grants for anyone to
+ -- access database 'test' and 'test_%' if "db" table didn't exist
+ CREATE TEMPORARY TABLE tmp_db LIKE db;
+-INSERT INTO tmp_db VALUES ('%','test','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');
+-INSERT INTO tmp_db VALUES ('%','test\_%','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');
+ INSERT INTO db SELECT * FROM tmp_db WHERE @had_db_table=0;
+ DROP TABLE tmp_db;
+ 
+@@ -24,7 +22,5 @@
+ INSERT INTO tmp_user VALUES ('localhost','root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0);
+ REPLACE INTO tmp_user SELECT @current_hostname,'root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0 FROM dual WHERE LOWER( @current_hostname) != 'localhost';
+ REPLACE INTO tmp_user VALUES ('127.0.0.1','root','','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','Y','','','','',0,0,0,0);
+-INSERT INTO tmp_user (host,user) VALUES ('localhost','');
+-INSERT INTO tmp_user (host,user) SELECT @current_hostname,'' FROM dual WHERE LOWER(@current_hostname ) != 'localhost';
+ INSERT INTO user SELECT * FROM tmp_user WHERE @had_user_table=0;
+ DROP TABLE tmp_user;
diff --git a/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch b/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch
new file mode 100644
index 00000000000..154bc0ad1c4
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/38_scripts__mysqld_safe.sh__signals.dpatch
@@ -0,0 +1,43 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 38_scripts__mysqld_safe.sh__signals.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Executes /etc/init.d/mysql on signals
+## DP: Reported as http://bugs.mysql.com/bug.php?id=31361
+
+@DPATCH@
+
+--- old/scripts/mysqld_safe.sh	2006-07-29 13:12:34.000000000 +0200
++++ old/scripts/mysqld_safe.sh	2006-07-29 13:14:08.000000000 +0200
+@@ -16,8 +16,6 @@
+ # This command can be used as pipe to syslog. With "-s" it also logs to stderr.
+ ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i"
+ 
+-trap '' 1 2 3 15			# we shouldn't let anyone kill us
+-
+ umask 007
+ 
+ defaults=
+@@ -122,7 +122,7 @@
+       # sed buffers output (only GNU sed supports a -u (unbuffered) option)
+       # which means that messages may not get sent to syslog until the
+       # mysqld process quits.
+-      cmd="$cmd 2>&1 | logger -t '$syslog_tag_mysqld' -p daemon.error"
++      cmd="$cmd 2>&1 | logger -t '$syslog_tag_mysqld' -p daemon.error & wait"
+       ;;
+     *)
+       echo "Internal program error (non-fatal):" \
+@@ -352,6 +350,13 @@
+ fi
+ 
+ #
++# From now on, we catch signals to do a proper shutdown of mysqld
++# when signalled to do so.
++#
++trap '/usr/bin/mysqladmin --defaults-extra-file=/etc/mysql/debian.cnf refresh' 1 # HUP
++trap '/usr/bin/mysqladmin --defaults-extra-file=/etc/mysql/debian.cnf shutdown' 2 3 15 # INT QUIT and TERM
++
++#
+ # Uncomment the following lines if you want all tables to be automatically
+ # checked and repaired during startup. You should add sensible key_buffer
+ # and sort_buffer values to my.cnf to improve check performance or require
diff --git a/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch b/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
new file mode 100644
index 00000000000..e79ac71cc7b
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/41_scripts__mysql_install_db.sh__no_test.dpatch
@@ -0,0 +1,20 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 41_scripts__mysql_install_db.sh__no_test.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: scripts__mysql_install_db.sh__no_test
+## DP: http://bugs.mysql.com/bug.php?id=6901
+
+@DPATCH@
+
+--- mysql-dfsg-5.1-5.1.23rc.orig/scripts/mysql_install_db.sh	2008-01-29 22:41:20.000000000 +0100
++++ mysql-dfsg-5.1-5.1.23rc/scripts/mysql_install_db.sh	2008-02-28 10:08:11.000000000 +0100
+@@ -306,7 +306,7 @@
+ fi
+ 
+ # Create database directories
+-for dir in $ldata $ldata/mysql $ldata/test
++for dir in $ldata $ldata/mysql
+ do
+   if test ! -d $dir
+   then
diff --git a/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch b/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch
new file mode 100644
index 00000000000..b35ba4912f3
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/44_scripts__mysql_config__libs.dpatch
@@ -0,0 +1,24 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 44_scripts__mysql_config__libs.dpatch by  <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Removes unnecessary library dependencies. See #390692
+
+@DPATCH@
+diff -Nur mysql-dfsg-5.1-5.1.31.orig/scripts/mysql_config.sh mysql-dfsg-5.1-5.1.31/scripts/mysql_config.sh
+--- mysql-dfsg-5.1-5.1.31.orig/scripts/mysql_config.sh  2009-01-19 17:30:55.000000000 +0100
++++ mysql-dfsg-5.1-5.1.31/scripts/mysql_config.sh       2009-02-08 17:17:48.000000000 +0100
+@@ -104,10 +104,10 @@
+ 
+ # Create options 
+ # We intentionally add a space to the beginning and end of lib strings, simplifies replace later
+-libs=" $ldflags -L$pkglibdir -lmysqlclient @ZLIB_DEPS@ @NON_THREADED_LIBS@"
++libs=" $ldflags -L$pkglibdir -lmysqlclient"
+ libs="$libs @openssl_libs@ @STATIC_NSS_FLAGS@ "
+-libs_r=" $ldflags -L$pkglibdir -lmysqlclient_r @ZLIB_DEPS@ @LIBS@ @openssl_libs@ "
+-embedded_libs=" $ldflags -L$pkglibdir -lmysqld @LIBDL@ @ZLIB_DEPS@ @LIBS@ @WRAPLIBS@ @innodb_system_libs@ @openssl_libs@ "
++libs_r=" $ldflags -L$pkglibdir -lmysqlclient_r @openssl_libs@ "
++embedded_libs=" $ldflags -L$pkglibdir -lmysqld @LIBDL@ @WRAPLIBS@ @innodb_system_libs@ @openssl_libs@ "
+ 
+ if [ -r "$pkglibdir/libmygcc.a" ]; then
+   # When linking against the static library with a different version of GCC
diff --git a/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch b/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch
new file mode 100644
index 00000000000..6a5cab91c39
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/50_mysql-test__db_test.dpatch
@@ -0,0 +1,23 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+## 50_mysql-test__db_test.dpatch by Christian Hammers <ch@debian.org>
+##
+## All lines beginning with `## DP:' are a description of the patch.
+## DP: Patch 33_scripts__mysql_create_system_tables__no_test removes the
+## DP: rights for anybody to connect to the test database but the test
+## DP: suite depends on them.
+
+@DPATCH@
+
+--- old/mysql-test/mysql-test-run.pl	2009-06-16 14:24:09.000000000 +0200
++++ new/mysql-test/mysql-test-run.pl	2009-07-04 00:03:34.000000000 +0200
+@@ -2717,6 +2717,10 @@
+     mtr_appendfile_to_file("$sql_dir/mysql_system_tables_data.sql",
+                           $bootstrap_sql_file);
+ 
++    mtr_tofile($bootstrap_sql_file, "-- Debian removed the default privileges on the 'test' database\n");
++    mtr_tofile($bootstrap_sql_file, "INSERT INTO mysql.db VALUES ('%','test','','Y','Y','Y','Y','Y','Y','N','Y','Y','Y','Y','Y','Y','Y','Y','N','N','Y','Y');\n");
++    
++
+     # Add test data for timezone - this is just a subset, on a real
+     # system these tables will be populated either by mysql_tzinfo_to_sql
+     # or by downloading the timezone table package from our website
diff --git a/storage/xtradb/build/debian/patches/60_percona_support.dpatch b/storage/xtradb/build/debian/patches/60_percona_support.dpatch
new file mode 100644
index 00000000000..e69d0dd5f76
--- /dev/null
+++ b/storage/xtradb/build/debian/patches/60_percona_support.dpatch
@@ -0,0 +1,16 @@
+#! /bin/sh /usr/share/dpatch/dpatch-run
+
+@DPATCH@
+
+--- a/scripts/mysql_install_db.sh	2009-08-08 09:20:07.000000000 +0000
++++ b/scripts/mysql_install_db.sh	2009-08-08 09:29:23.000000000 +0000
+@@ -469,6 +469,9 @@
+   echo
+   echo "Please report any problems with the $scriptdir/mysqlbug script!"
+   echo
++  echo "For commercial support please contact Percona at http://www.percona.com/contacts.html"
++  echo
++
+ fi
+
+ exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian b/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian
new file mode 100644
index 00000000000..b245638f9c9
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.README.Debian
@@ -0,0 +1,4 @@
+FAQ:
+
+Q: My <tab> completion is gone, why?
+A: You have "no-auto-rehash" in the "[mysql]" section of /etc/mysql/my.cnf!
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs b/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs
new file mode 100644
index 00000000000..ceda5922c5d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.dirs
@@ -0,0 +1,3 @@
+usr/bin/
+usr/share/man/man1/
+usr/share/perl5/
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs b/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs
new file mode 100644
index 00000000000..8b8692f0d90
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.docs
@@ -0,0 +1,3 @@
+debian/additions/innotop/changelog.innotop
+EXCEPTIONS-CLIENT
+README
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.files b/storage/xtradb/build/debian/percona-xtradb-client-5.1.files
new file mode 100644
index 00000000000..9ba5fe35054
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.files
@@ -0,0 +1,39 @@
+usr/bin/innotop
+usr/bin/myisam_ftdump
+usr/bin/mysql
+usr/bin/mysqlaccess
+usr/bin/mysqladmin
+usr/bin/mysqlbug
+usr/bin/mysqlcheck
+usr/bin/mysql_client_test
+usr/bin/mysqldump
+usr/bin/mysqldumpslow
+usr/bin/mysql_find_rows
+usr/bin/mysql_fix_extensions
+usr/bin/mysqlimport
+usr/bin/mysqlreport
+usr/bin/mysqlshow
+usr/bin/mysql_waitpid
+usr/sbin/mysqlmanager
+usr/share/lintian/overrides/percona-xtradb-client-5.1
+usr/share/man/man1/innotop.1
+usr/share/man/man1/myisam_ftdump.1
+usr/share/man/man1/mysql.1
+usr/share/man/man1/mysqlaccess.1
+usr/share/man/man1/mysqladmin.1
+usr/share/man/man1/mysqlbug.1
+usr/share/man/man1/mysqlcheck.1
+usr/share/man/man1/mysqldump.1
+usr/share/man/man1/mysqldumpslow.1
+usr/share/man/man1/mysql_find_rows.1
+usr/share/man/man1/mysql_fix_extensions.1
+usr/share/man/man1/mysqlimport.1
+usr/share/man/man1/mysqlmanager.1
+usr/share/man/man1/mysqlmanagerc.1
+usr/share/man/man1/mysqlmanager-pwgen.1
+usr/share/man/man1/mysqlreport.1
+usr/share/man/man1/mysqlshow.1
+usr/share/man/man1/mysql_tableinfo.1
+usr/share/man/man1/mysql_waitpid.1
+usr/share/man/man1/mysql_client_test.1
+usr/share/perl5/InnoDBParser.pm
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.links b/storage/xtradb/build/debian/percona-xtradb-client-5.1.links
new file mode 100644
index 00000000000..247369fa218
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.links
@@ -0,0 +1,3 @@
+usr/bin/mysqlcheck usr/bin/mysqlrepair
+usr/bin/mysqlcheck usr/bin/mysqlanalyze
+usr/bin/mysqlcheck usr/bin/mysqloptimize
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides
new file mode 100644
index 00000000000..d36909f47f2
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.lintian-overrides
@@ -0,0 +1,3 @@
+percona-xtradb-client-5.1: package-has-a-duplicate-relation
+percona-xtradb-client-5.1: wrong-name-for-upstream-changelog usr/share/doc/percona-xtradb-client-5.1/changelog.innotop.gz
+percona-xtradb-client-5.1: pkg-not-in-package-test innotop
diff --git a/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu b/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu
new file mode 100644
index 00000000000..1378555c423
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-client-5.1.menu
@@ -0,0 +1,3 @@
+# According to /usr/share/menu/ policy 1.4, not /usr/share/doc/debian-policy/
+?package(innotop):needs="text" section="Applications/Data Management"\
+  title="innotop" command="/usr/bin/innotop"
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.dirs b/storage/xtradb/build/debian/percona-xtradb-common.dirs
new file mode 100644
index 00000000000..a5a88ede9c1
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.dirs
@@ -0,0 +1 @@
+etc/mysql/conf.d/
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.files b/storage/xtradb/build/debian/percona-xtradb-common.files
new file mode 100644
index 00000000000..931f37a0237
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.files
@@ -0,0 +1,2 @@
+etc/mysql/my.cnf
+usr/share/percona-xtradb-common/internal-use-only
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides
new file mode 100644
index 00000000000..7f58feb498d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.lintian-overrides
@@ -0,0 +1,2 @@
+script-not-executable ./usr/share/percona-xtradb-common/internal-use-only/_etc_init.d_mysql
+script-not-executable ./usr/share/percona-xtradb-common/internal-use-only/_etc_mysql_debian-start
diff --git a/storage/xtradb/build/debian/percona-xtradb-common.postrm b/storage/xtradb/build/debian/percona-xtradb-common.postrm
new file mode 100644
index 00000000000..3ca45870165
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-common.postrm
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+if [ "$1" = "purge" ]; then
+  rmdir /etc/mysql 2>/dev/null || true
+fi
+
+#DEBHELPER#
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS b/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS
new file mode 100644
index 00000000000..a3042dc2918
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.NEWS
@@ -0,0 +1,34 @@
+mysql-dfsg-5.1 (5.1.38-1) unstable; urgency=low
+
+  * Please read http://dev.mysql.com/doc/refman/5.1/en/upgrading-from-5-0.html
+  * Make sure to do a REPAIR TABLE on all tables that use UTF-8 and have a
+    FULLTEXT index.
+
+ -- Christian Hammers <ch@debian.org>  Sat,  4 Jul 2009 02:31:21 +0200
+
+mysql-dfsg-5.0 (5.1.14beta-2) unstable; urgency=low
+
+  * The BerkeleyDB Storage Engine is no longer supported. If the options
+    have-bdb or skip-bdb are found, MySQL will not start. If you have BDB
+    tables, you should change them to use another storage engine before 
+    upgrading to 5.1.   
+
+ -- Monty Taylor <mordred@inaugust.com>  Thu, 18 Jan 2007 12:28:21 -0800
+
+mysql-dfsg-5.0 (5.0.45-2) unstable; urgency=low
+
+  * Binary logging is now disabled by default. If you really need it (e.g. on
+    a replication master), remove the comment from the log_bin line in my.cnf.
+
+ -- Norbert Tretkowski <nobse@debian.org>  Sat, 10 Nov 2007 16:26:35 +0100
+
+mysql-dfsg-5.0 (5.0.18-9) unstable; urgency=low
+
+  * Rotation of the binary logs is now configured in /etc/mysql/my.cnf with
+    "expire-logs-days" which defaults to 20 days. The old file
+    /etc/mysql/debian-log-rotate.conf should be removed together with
+    /etc/cron.daily/mysql-server after this value has been adjusted. Note that
+    the old variable defined the number of files whereas the new one defines 
+    a time span in days.
+
+ -- Christian Hammers <ch@debian.org>  Tue, 24 Jan 2006 22:18:21 +0100
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian b/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian
new file mode 100644
index 00000000000..741243f1ec3
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.README.Debian
@@ -0,0 +1,109 @@
+* MYSQL WON'T START OR STOP?:
+=============================
+You must never delete the special mysql user "debian-sys-maint". This
+user, together with the credentials in /etc/mysql/debian.cnf, is used by the
+init scripts to stop the server, as they would otherwise need to know the
+mysql root user's password.
+So most of the time you can fix the situation by making sure that the
+debian.cnf file contains the right password, e.g. by setting a new one
+(remember to do a "flush privileges" afterwards).
+
+* WHAT TO DO AFTER UPGRADES:
+============================
+The privilege tables are automatically updated, so all that is left is to read
+the changelogs on dev.mysql.com to see if any changes affect custom apps.
+
+* WHAT TO DO AFTER INSTALLATION:
+================================
+The MySQL manual describes certain steps to do at this stage in a separate
+chapter.  They are not necessary as the Debian package does them
+automatically.
+
+The only thing that is left over for the admin is 
+ - setting the passwords
+ - creating new users and databases
+ - reading the rest of this text
+
+* DOWNGRADING TO 4.0 or 4.1:
+============================
+Unsupported. Period.
+But if you do and get problems or make interesting experiences, mail me, it
+might help others.
+Ok, if you really want, I would recommend to "mysqldump --opt" all tables,
+then purge 4.1, delete /var/lib/mysql, install 4.0 and insert the dumps.  Be
+careful, though, with the "mysql" table: you must not simply overwrite that
+one, as the password for the mysql "debian-sys-maint" user is stored in
+/etc/mysql/debian.cnf and needed by /etc/init.d/ to start mysql and check if
+it's alive. 
+
+* SOME APPLICATION CAN NO LONGER CONNECT:
+=========================================
+This application is probably linked against libmysqlclient12 or below and
+somebody has created a mysql user with new-style passwords.
+The old_passwords=1 option in /etc/mysql/my.cnf might help. If not the
+application that inserted the user has to be changed or the application that
+tries to connect updated to libmysqlclient14 or -15.
+
+* NETWORKING:
+=============
+For security reasons, the Debian package has enabled networking only on the
+loop-back device using "bind-address" in /etc/mysql/my.cnf.  Check with
+"netstat -tlnp" where it is listening. If your connection is aborted
+immediately see if "mysqld: all" or similar is in /etc/hosts.allow and read
+hosts_access(5).
+
+* WHERE IS THE DOCUMENTATION?:
+==============================
+Unfortunately, due to licensing restrictions, Debian is currently not able
+to provide the mysql-doc package in any format.  For the most up to date
+documentation, please go to http://dev.mysql.com/doc.
+
+* PASSWORDS:
+============
+It is strongly recommended to set a password for the mysql root user (which
+is NOT the same as the system root user) with the commands:
+  /usr/bin/mysql -u root -D mysql -e "update user set password=password('new-password') where user='root'"
+  /usr/bin/mysql -u root -e "flush privileges"
+If you already had a password set, add "-p" before "-u" to the lines above.
+
+If you are tired of typing the password every time or want to automate your
+scripts you can store it in the file $HOME/.my.cnf. It should be chmod 0600
+(-rw------- username username .my.cnf) to ensure that nobody else can read
+it.  Every other configuration parameter can be stored there, too. You will
+find an example below and more information in the MySQL manual in
+/usr/share/doc/mysql-doc or www.mysql.com.
+
+ATTENTION: It is necessary that root's .my.cnf always contains a "user"
+line wherever there is a "password" line; otherwise the Debian maintenance
+scripts, which use /etc/mysql/debian.cnf, will use the username
+"debian-sys-maint" but the password that is in root's .my.cnf. Also note
+that every change you make in /root/.my.cnf will affect the mysql cron
+script, too.
+
+        # an example of $HOME/.my.cnf
+	[client]
+	user		= your-mysql-username
+	password	= enter-your-good-new-password-here
+
+* BIG_ROWS FOR EVEN MORE ROWS IN A TABLE:
+=========================================
+If you ever run out of rows in a table there is the possibility of building
+the package with "-DBIG_ROWS" which, according to a MySQL employee on
+packagers@lists.mysql.com should lead to a 64bit row index (I guess > 2^32
+rows) but also to an approx. 5% performance loss.
+
+* BerkeleyDB Storage Engine
+===========================
+Support for BerkeleyDB has been removed in 5.1, and consequently both the
+have-bdb and skip-bdb configuration options will cause the server to fail. 
+Removing the options from /etc/mysql/my.cnf will fix this problem.
+
+* FURTHER NOTES ON REPLICATION
+===============================
+If the MySQL server is acting as a replication slave, you should not
+set --tmpdir to point to a directory on a memory-based filesystem or to
+a directory that is cleared when the server host restarts. A replication
+slave needs some of its temporary files to survive a machine restart so
+that it can replicate temporary tables or LOAD DATA INFILE operations. If
+files in the temporary file directory are lost when the server restarts,
+replication fails.
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.config b/storage/xtradb/build/debian/percona-xtradb-server-5.1.config
new file mode 100644
index 00000000000..75f81c4e4d1
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.config
@@ -0,0 +1,46 @@
+#!/bin/bash -e
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+CNF=/etc/mysql/my.cnf
+
+# Beware that ypwhich appears twice below and one of them needs the 2>/dev/null!
+if test -n "`which ypwhich 2>/dev/null`"  &&  ypwhich >/dev/null 2>&1; then
+  db_input high percona-xtradb-server-5.1/nis_warning || true
+  db_go
+fi
+
+# only ask this question on fresh installs, during "reconfiguration" and when 
+# not upgrading from an existing 5.0 installation.
+# there is also an additional check for empty root passwords in the
+# postinst script when the tools are available for us to use.
+if [ "$1" = "configure" ] && ([ -z "$2" ] && [ ! -e "/var/lib/mysql/debian-5.0.flag" ] ) || [ "$1" = "reconfigure" ]; then
+  while :; do
+    RET=""
+    db_input high percona-xtradb-server/root_password || true
+    db_go
+    db_get percona-xtradb-server/root_password
+    # if password isn't empty we ask for password verification
+    if [ -z "$RET" ]; then
+      db_fset percona-xtradb-server/root_password seen false
+      db_fset percona-xtradb-server/root_password_again seen false
+      break
+    fi
+    ROOT_PW="$RET"
+    db_input high percona-xtradb-server/root_password_again || true
+    db_go
+    db_get percona-xtradb-server/root_password_again
+    if [ "$RET" == "$ROOT_PW" ]; then
+      ROOT_PW=''
+      break
+    fi
+    db_fset percona-xtradb-server/password_mismatch seen false
+    db_input critical percona-xtradb-server/password_mismatch
+    db_set percona-xtradb-server/root_password "" 
+    db_set percona-xtradb-server/root_password_again ""
+    db_go
+  done
+fi
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs b/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs
new file mode 100644
index 00000000000..29c2e756a00
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.dirs
@@ -0,0 +1,9 @@
+etc/init.d
+etc/logrotate.d
+etc/mysql/conf.d
+usr/bin
+usr/sbin
+usr/share/man/man8
+usr/share/mysql
+var/run/mysqld
+var/lib/mysql-upgrade
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs b/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs
new file mode 100644
index 00000000000..eccf2c9c565
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.docs
@@ -0,0 +1 @@
+EXCEPTIONS-CLIENT
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.files b/storage/xtradb/build/debian/percona-xtradb-server-5.1.files
new file mode 100644
index 00000000000..4741b588136
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.files
@@ -0,0 +1,53 @@
+usr/lib/mysql/*so*
+etc/mysql/debian-start
+etc/mysql/conf.d/mysqld_safe_syslog.cnf
+usr/bin/msql2mysql
+usr/bin/my_print_defaults
+usr/bin/myisamchk
+usr/bin/myisamlog
+usr/bin/myisampack
+usr/bin/mysql_convert_table_format
+usr/bin/mysql_fix_privilege_tables
+usr/bin/mysql_install_db
+usr/bin/mysql_secure_installation
+usr/bin/mysql_setpermission
+usr/bin/mysql_tzinfo_to_sql
+usr/bin/mysql_upgrade
+usr/bin/mysql_zap
+usr/bin/mysqlbinlog
+usr/bin/mysqld_multi
+usr/bin/mysqld_safe
+usr/bin/mysqlhotcopy
+usr/bin/mysqltest
+usr/bin/perror
+usr/bin/replace
+usr/bin/resolve_stack_dump
+usr/bin/resolveip
+usr/sbin/mysqld
+usr/share/doc/percona-xtradb-server-5.1/
+usr/share/lintian/overrides/percona-xtradb-server-5.1
+usr/share/man/man1/msql2mysql.1
+usr/share/man/man1/myisamchk.1
+usr/share/man/man1/myisamlog.1
+usr/share/man/man1/myisampack.1
+usr/share/man/man1/my_print_defaults.1
+usr/share/man/man1/mysqlbinlog.1
+usr/share/man/man1/mysql_convert_table_format.1
+usr/share/man/man1/mysqld_multi.1
+usr/share/man/man1/mysqld_safe.1
+usr/share/man/man1/mysql_fix_privilege_tables.1
+usr/share/man/man1/mysqlhotcopy.1
+usr/share/man/man1/mysql_install_db.1
+usr/share/man/man1/mysql_secure_installation.1
+usr/share/man/man1/mysql_setpermission.1
+usr/share/man/man1/mysql_upgrade.1
+usr/share/man/man1/mysqltest.1
+usr/share/man/man1/mysql_zap.1
+usr/share/man/man1/perror.1
+usr/share/man/man1/replace.1
+usr/share/man/man1/resolveip.1
+usr/share/man/man1/resolve_stack_dump.1
+usr/share/man/man1/innochecksum.1
+usr/share/man/man1/mysql_tzinfo_to_sql.1
+usr/share/man/man8/mysqld.8
+usr/share/mysql/
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.links b/storage/xtradb/build/debian/percona-xtradb-server-5.1.links
new file mode 100644
index 00000000000..082680fe5ed
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.links
@@ -0,0 +1,2 @@
+usr/share/mysql/mysql-test/mysql-test-run.pl usr/share/mysql/mysql-test/mysql-test-run
+usr/share/mysql/mysql-test/mysql-test-run.pl usr/share/mysql/mysql-test/mtr
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides b/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides
new file mode 100644
index 00000000000..a3ffb465fd5
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.lintian-overrides
@@ -0,0 +1,4 @@
+percona-xtradb-server-5.1: possible-bashism-in-maintainer-script postinst:81 'p{("a".."z","A".."Z",0..9)[int(rand(62))]}'
+percona-xtradb-server-5.1: possible-bashism-in-maintainer-script preinst:33 '${cmd/ */}'
+percona-xtradb-server-5.1: statically-linked-binary ./usr/bin/mysql_tzinfo_to_sql
+percona-xtradb-server-5.1: statically-linked-binary ./usr/sbin/mysqld
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid
new file mode 100644
index 00000000000..00cc5c3e29d
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.paranoid
@@ -0,0 +1,9 @@
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: $
+mysqld\[[0-9]+\]: Version: .* socket: '/var/run/mysqld/mysqld.sock'  port: 3306$
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: started$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server
new file mode 100644
index 00000000000..37f25cb01ea
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.server
@@ -0,0 +1,32 @@
+/etc/init.d/mysql\[[0-9]+\]: [0-9]+ processes alive and '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: ?$
+mysqld\[[0-9]+\]: .*InnoDB: Shutdown completed
+mysqld\[[0-9]+\]: .*InnoDB: Started;
+mysqld\[[0-9]+\]: .*InnoDB: Starting shutdown\.\.\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Normal shutdown$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Shutdown complete$
+mysqld\[[0-9]+\]: /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*/usr/sbin/mysqld: Shutdown Complete$
+mysqld\[[0-9]+\]: Version: .* socket
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: ?$
+mysqld_safe\[[0-9]+\]: able to use the new GRANT command!$
+mysqld_safe\[[0-9]+\]: ended$
+mysqld_safe\[[0-9]+\]: http://www.mysql.com$
+mysqld_safe\[[0-9]+\]: NOTE:  If you are upgrading from a MySQL <= 3.22.10 you should run$
+mysqld_safe\[[0-9]+\]: PLEASE REMEMBER TO SET A PASSWORD FOR THE MySQL root USER !$
+mysqld_safe\[[0-9]+\]: Please report any problems with the /usr/bin/mysqlbug script!$
+mysqld_safe\[[0-9]+\]: See the manual for more instructions.$
+mysqld_safe\[[0-9]+\]: started$
+mysqld_safe\[[0-9]+\]: Support MySQL by buying support/licenses at https://order.mysql.com$
+mysqld_safe\[[0-9]+\]: The latest information about MySQL is available on the web at$
+mysqld_safe\[[0-9]+\]: the /usr/bin/mysql_fix_privilege_tables. Otherwise you will not be$
+mysqld_safe\[[0-9]+\]: To do so, start the server, then issue the following commands:$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root -h app109 password 'new-password'$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root password 'new-password'$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation
new file mode 100644
index 00000000000..37f25cb01ea
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.logcheck.ignore.workstation
@@ -0,0 +1,32 @@
+/etc/init.d/mysql\[[0-9]+\]: [0-9]+ processes alive and '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/init.d/mysql\[[0-9]+\]: Check that mysqld is running and that the socket: '/var/run/mysqld/mysqld.sock' exists\!$
+/etc/init.d/mysql\[[0-9]+\]: '/usr/bin/mysqladmin --defaults-(extra-)?file=/etc/mysql/debian.cnf ping' resulted in$
+/etc/mysql/debian-start\[[0-9]+\]: Checking for crashed MySQL tables\.$
+mysqld\[[0-9]+\]: ?$
+mysqld\[[0-9]+\]: .*InnoDB: Shutdown completed
+mysqld\[[0-9]+\]: .*InnoDB: Started;
+mysqld\[[0-9]+\]: .*InnoDB: Starting shutdown\.\.\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Normal shutdown$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*\[Note\] /usr/sbin/mysqld: Shutdown complete$
+mysqld\[[0-9]+\]: /usr/sbin/mysqld: ready for connections\.$
+mysqld\[[0-9]+\]: .*/usr/sbin/mysqld: Shutdown Complete$
+mysqld\[[0-9]+\]: Version: .* socket
+mysqld\[[0-9]+\]: Warning: Ignoring user change to 'mysql' because the user was set to 'mysql' earlier on the command line$
+mysqld_safe\[[0-9]+\]: ?$
+mysqld_safe\[[0-9]+\]: able to use the new GRANT command!$
+mysqld_safe\[[0-9]+\]: ended$
+mysqld_safe\[[0-9]+\]: http://www.mysql.com$
+mysqld_safe\[[0-9]+\]: NOTE:  If you are upgrading from a MySQL <= 3.22.10 you should run$
+mysqld_safe\[[0-9]+\]: PLEASE REMEMBER TO SET A PASSWORD FOR THE MySQL root USER !$
+mysqld_safe\[[0-9]+\]: Please report any problems with the /usr/bin/mysqlbug script!$
+mysqld_safe\[[0-9]+\]: See the manual for more instructions.$
+mysqld_safe\[[0-9]+\]: started$
+mysqld_safe\[[0-9]+\]: Support MySQL by buying support/licenses at https://order.mysql.com$
+mysqld_safe\[[0-9]+\]: The latest information about MySQL is available on the web at$
+mysqld_safe\[[0-9]+\]: the /usr/bin/mysql_fix_privilege_tables. Otherwise you will not be$
+mysqld_safe\[[0-9]+\]: To do so, start the server, then issue the following commands:$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root -h app109 password 'new-password'$
+mysqld_safe\[[0-9]+\]: /usr/bin/mysqladmin -u root password 'new-password'$
+usermod\[[0-9]+\]: change user `mysql' GID from `([0-9]+)' to `\1'$
+usermod\[[0-9]+\]: change user `mysql' shell from `/bin/false' to `/bin/false'$
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init b/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init
new file mode 100644
index 00000000000..0e0d4f9987f
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.mysql.init
@@ -0,0 +1,182 @@
+#!/bin/bash
+#
+### BEGIN INIT INFO
+# Provides:          mysql
+# Required-Start:    $remote_fs $syslog
+# Required-Stop:     $remote_fs $syslog
+# Should-Start:      $network $named $time
+# Should-Stop:       $network $named $time
+# Default-Start:     2 3 4 5
+# Default-Stop:      0 1 6
+# Short-Description: Start and stop the mysql database server daemon
+# Description:       Controls the main MySQL database server daemon "mysqld"
+#                    and its wrapper script "mysqld_safe".
+### END INIT INFO
+#
+set -e
+set -u
+${DEBIAN_SCRIPT_DEBUG:+ set -v -x}
+
+test -x /usr/sbin/mysqld || exit 0
+
+. /lib/lsb/init-functions
+
+SELF=$(cd $(dirname $0); pwd -P)/$(basename $0)
+CONF=/etc/mysql/my.cnf
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+
+# priority can be overridden and "-s" adds output to stderr
+ERR_LOGGER="logger -p daemon.err -t /etc/init.d/mysql -i"
+
+# Safeguard (relative paths, core dumps..)
+cd /
+umask 077
+
+# mysqladmin likes to read /root/.my.cnf. This is usually not what I want
+# as many admins e.g. only store a password without a username there and
+# so break my scripts.
+export HOME=/etc/mysql/
+
+## Fetch a particular option from mysql's invocation.
+#
+# Usage: void mysqld_get_param option
+mysqld_get_param() {
+	# One option per line, keep the last line matching "--<option>", then
+	# print the second "="-separated field of that line.
+	/usr/sbin/mysqld --print-defaults | tr " " "\n" \
+		| grep -- "--$1" | tail -n 1 \
+		| cut -d= -f2
+}
+
+## Do some sanity checks before even trying to start mysqld.
+sanity_checks() {
+  # An unreadable config file only earns a warning (console and syslog).
+  if ! test -r /etc/mysql/my.cnf; then
+    log_warning_msg "$0: WARNING: /etc/mysql/my.cnf cannot be read. See README.Debian.gz"
+    echo                "WARNING: /etc/mysql/my.cnf cannot be read. See README.Debian.gz" | $ERR_LOGGER
+  fi
+
+  # Give up when the 4th df column for the datadir is 4096 or less.
+  datadir=$(mysqld_get_param datadir)
+  if LC_ALL=C BLOCKSIZE= df --portability $datadir/. | tail -n 1 | awk '{ exit ($4>4096) }'; then
+    log_failure_msg "$0: ERROR: The partition with $datadir is too full!"
+    echo                "ERROR: The partition with $datadir is too full!" | $ERR_LOGGER
+    exit 1
+  fi
+}
+
+## Checks if there is a server running and if so if it is accessible.
+#
+# check_alive insists on a pingable server
+# check_dead also fails if there is a lost mysqld in the process list
+#
+# Usage: boolean mysqld_status [check_alive|check_dead] [warn|nowarn]
+mysqld_status () {
+    # ping_alive becomes 1 when "mysqladmin ping" exits 0, otherwise 0.
+    ping_output=$($MYADMIN ping 2>&1); ping_alive=$(( ! $? ))
+    ps_alive=0
+    pidfile=$(mysqld_get_param pid-file)
+    if [ -f "$pidfile" ] && ps $(cat $pidfile) >/dev/null 2>&1; then ps_alive=1; fi
+
+    if [ "$1" = "check_alive"  -a  $ping_alive = 1 ] ||
+       [ "$1" = "check_dead"   -a  $ping_alive = 0  -a  $ps_alive = 0 ]; then
+        return 0 # EXIT_SUCCESS
+    fi
+    # The requested state does not hold: optionally log the details, then fail.
+    if [ "$2" = "warn" ]; then
+        echo -e "$ps_alive processes alive and '$MYADMIN ping' resulted in\n$ping_output\n" | $ERR_LOGGER -p daemon.debug
+    fi
+    return 1 # EXIT_FAILURE
+}
+
+#
+# main(): dispatch on the action given in $1; anything else prints the usage and exits 1.
+#
+
+case "${1:-''}" in
+  'start')
+	sanity_checks;
+	# Start daemon
+	log_daemon_msg "Starting MySQL database server" "mysqld"
+	if mysqld_status check_alive nowarn; then
+	    log_progress_msg "already running"
+	    log_end_msg 0
+	else
+	    /usr/bin/mysqld_safe > /dev/null 2>&1 &
+	    # 6s was reported in #352070 to be too few when using ndbcluster
+	    for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14; do
+	        sleep 1
+	        if mysqld_status check_alive nowarn ; then break; fi
+	        log_progress_msg "."
+	    done
+	    if mysqld_status check_alive warn; then
+	        log_end_msg 0
+	        # Now start mysqlcheck or whatever the admin wants.
+	        output=$(/etc/mysql/debian-start)
+	        [ -z "$output" ] || log_action_msg "$output"  # "||" form: empty output must not end the script with status 1
+	    else
+	        log_end_msg 1
+	        log_failure_msg "Please take a look at the syslog"
+	    fi
+	fi
+	;;
+
+  'stop')
+	# * As a passwordless mysqladmin (e.g. via ~/.my.cnf) must be possible
+	# at least for cron, we can rely on it here, too. (although we have
+	# to specify it explicitly as e.g. sudo environments point to the normal
+	# user's home and not /root)
+	log_daemon_msg "Stopping MySQL database server" "mysqld"
+	if ! mysqld_status check_dead nowarn; then
+	  set +e
+	  shutdown_out=`$MYADMIN shutdown 2>&1`; r=$?
+	  set -e
+	  if [ "$r" -ne 0 ]; then
+	    log_end_msg 1
+	    [ "${VERBOSE:-}" != "no" ] && log_failure_msg "Error: $shutdown_out"  # VERBOSE may be unset; a bare $VERBOSE aborts under "set -u"
+	    log_daemon_msg "Killing MySQL database server by signal" "mysqld"
+	    killall -15 mysqld
+	    server_down=
+	    for i in 1 2 3 4 5 6 7 8 9 10; do
+	      sleep 1
+	      if mysqld_status check_dead nowarn; then server_down=1; break; fi
+	    done
+	    if test -z "$server_down"; then killall -9 mysqld; fi
+	  fi
+	fi
+
+	if ! mysqld_status check_dead warn; then
+	  log_end_msg 1
+	  log_failure_msg "Please stop MySQL manually and read /usr/share/doc/percona-xtradb-server-5.1/README.Debian.gz!"
+	  exit -1
+	else
+	  log_end_msg 0
+	fi
+	;;
+
+  'restart')
+	set +e; $SELF stop; set -e
+	$SELF start
+	;;
+
+  'reload'|'force-reload')
+	log_daemon_msg "Reloading MySQL database server" "mysqld"
+	$MYADMIN reload
+	log_end_msg 0
+	;;
+
+  'status')
+	if mysqld_status check_alive nowarn; then
+	  log_action_msg "$($MYADMIN version)"
+	else
+	  log_action_msg "MySQL is stopped."
+	  exit 3
+	fi
+	;;
+
+  *)
+	echo "Usage: $SELF start|stop|restart|reload|force-reload|status"
+	exit 1
+	;;
+esac
+
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate b/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate
new file mode 100644
index 00000000000..0f0de516b13
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.percona-xtradb-server.logrotate
@@ -0,0 +1,27 @@
+# - I put everything in one block and added sharedscripts, so that mysql gets 
+#   flush-logs'd only once.
+#   Else the binary logs would automatically increase by n times every day.
+# - The error log is obsolete, messages go to syslog now.
+/var/log/mysql.log /var/log/mysql/mysql.log /var/log/mysql/mysql-slow.log {
+	daily
+	rotate 7
+	missingok
+	create 640 mysql adm
+	compress
+	sharedscripts
+	postrotate
+		test -x /usr/bin/mysqladmin || exit 0
+
+		# If this fails, check debian.cnf!
+		MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+		if [ -z "`$MYADMIN ping 2>/dev/null`" ]; then
+		  # Really no mysqld or rather a missing debian-sys-maint user?
+		  # If this occurs and is not an error please report a bug.
+		  if ps cax | grep -q mysqld; then
+ 		    exit 1
+		  fi 
+		else
+		  $MYADMIN flush-logs
+		fi
+	endscript
+}
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst
new file mode 100644
index 00000000000..24a3f37c646
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postinst
@@ -0,0 +1,277 @@
+#!/bin/bash -e
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+ 
+export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin
+
+# This command can be used as pipe to syslog. With "-s" it also logs to stderr.
+ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i"
+
+invoke() {  # run the mysql init script action $1, through invoke-rc.d when it is installed
+  if ! [ -x /usr/sbin/invoke-rc.d ]; then
+    /etc/init.d/mysql $1
+  else
+    invoke-rc.d mysql $1
+  fi
+}
+
+MYSQL_BOOTSTRAP="/usr/sbin/mysqld --bootstrap --user=mysql --skip-grant-tables"
+
+test_mysql_access() {  # succeeds only if root can connect to localhost without giving a password
+       mysql --no-defaults -u root -h localhost </dev/null >/dev/null 2>&1
+}
+
+# Set the MySQL root password from $rootpw; call with $1 = "online" to connect to the server, otherwise it bootstraps
+set_mysql_rootpw() {
+       # forget we ever saw the password.  don't use reset to keep the seen status
+       db_set percona-xtradb-server/root_password ""
+
+       tfile=`mktemp`  # holds the SQL statements; removed again before returning
+       if [ ! -f "$tfile" ]; then
+               return 1
+       fi
+
+       # this avoids us having to call "test" or "[" on $rootpw (NOTE(review): it is expanded into the SQL unescaped)
+       cat << EOF > $tfile
+USE mysql;
+UPDATE user SET password=PASSWORD("$rootpw") WHERE user='root';
+FLUSH PRIVILEGES;
+EOF
+       if grep -q 'PASSWORD("")' $tfile; then  # empty $rootpw: nothing is executed
+               retval=0
+       elif [ "$1" = "online" ]; then
+               mysql --no-defaults -u root -h localhost <$tfile >/dev/null
+               retval=$?
+       else
+               $MYSQL_BOOTSTRAP <$tfile
+               retval=$?
+       fi
+       rm -f $tfile
+       return $retval  # 0 for an empty password, else the status of the mysql/bootstrap run
+}
+
+# This is necessary because mysql_install_db removes the pid file in /var/run
+# and because changed configuration options should take effect immediately.
+# In case the server wasn't running at all it should be ok if the stop
+# script fails. I can't tell at this point because of the cleaned /var/run.
+set +e; invoke stop; set -e
+    
+case "$1" in
+  configure)
+    mysql_datadir=/usr/share/mysql
+    mysql_statedir=/var/lib/mysql
+    mysql_rundir=/var/run/mysqld
+    mysql_logdir=/var/log
+    mysql_cfgdir=/etc/mysql
+    mysql_newlogdir=/var/log/mysql
+    mysql_upgradedir=/var/lib/mysql-upgrade
+
+    # first things first, if the following symlink exists, it is a preserved
+    # copy of the old data dir from a mysql upgrade that would have otherwise
+    # been replaced by an empty mysql dir.  this should restore it.
+    for dir in DATADIR LOGDIR; do
+        if [ "$dir" = "DATADIR" ]; then targetdir=$mysql_statedir; else targetdir=$mysql_newlogdir; fi
+        savelink="$mysql_upgradedir/$dir.link"
+        if [ -L "$savelink" ]; then
+            # If the targetdir was a symlink before we upgraded it is supposed
+            # to be either still be present or not existing anymore now.
+            if [ -L "$targetdir" ]; then
+                rm "$savelink"
+            elif [ ! -d "$targetdir" ]; then
+                mv "$savelink" "$targetdir"
+            else
+                # this should never even happen, but just in case...
+                mysql_tmp=`mktemp -d -t mysql-symlink-restore-XXXXXX`
+                echo "this is very strange!  see $mysql_tmp/README..." >&2
+                mv "$targetdir" "$mysql_tmp"
+                cat << EOF > "$mysql_tmp/README"
+
+if you're reading this, it's most likely because you had replaced /var/lib/mysql
+with a symlink, then upgraded to a new version of mysql, and then dpkg
+removed your symlink (see #182747 and others).  the mysql packages noticed
+that this happened, and as a workaround have restored it.  however, because
+/var/lib/mysql seems to have been re-created in the meantime, and because
+we don't want to rm -rf something we don't know as much about, we're going
+to leave this unexpected directory here.  if your database looks normal,
+and this is not a symlink to your database, you should be able to blow
+this all away.
+
+EOF
+            fi
+        fi
+	rmdir $mysql_upgradedir 2>/dev/null || true
+    done
+    
+    # Ensure the existence and right permissions for the database and
+    # log files.
+    if [ ! -d "$mysql_statedir"       -a ! -L "$mysql_statedir"       ]; then mkdir "$mysql_statedir"; fi
+    if [ ! -d "$mysql_statedir/mysql" -a ! -L "$mysql_statedir/mysql" ]; then mkdir "$mysql_statedir/mysql"; fi
+    if [ ! -d "$mysql_newlogdir"      -a ! -L "$mysql_newlogdir"      ]; then mkdir "$mysql_newlogdir"; fi
+    # When creating an ext3 journal on an already mounted filesystem like e.g.
+    # /var/lib/mysql, you get a .journal file that is not modifiable by chown.
+    # The mysql_datadir must not be writable by the mysql user under any
+    # circumstances as it contains scripts that are executed by root.
+    set +e
+    chown -R 0:0 $mysql_datadir
+    chown -R mysql $mysql_statedir
+    chown -R mysql $mysql_rundir
+    chown -R mysql:adm $mysql_newlogdir;	chmod 2750 $mysql_newlogdir;
+    for i in log err; do
+      touch             $mysql_logdir/mysql.$i
+      chown mysql:adm   $mysql_logdir/mysql.$i
+      chmod 0640        $mysql_logdir/mysql.$i
+    done
+    set -e
+
+    # This is important to avoid dataloss when there is a removed
+    # percona-xtradb-server version from Woody lying around which used the same
+    # data directory and then somewhen gets purged by the admin.
+    db_set percona-xtradb-server/postrm_remove_database false || true
+
+    # To avoid downgrades.
+    touch $mysql_statedir/debian-5.1.flag
+
+    # Initiate databases. Output is not allowed by debconf, so it is piped to
+    # the logger. Can safely run on upgrades with existing databases. Bashism:
+    # PIPESTATUS[0] is mysql_install_db's status; "$?" would be the logger's.
+    set +e
+    /bin/bash /usr/bin/mysql_install_db --rpm 2>&1 | $ERR_LOGGER
+    if [ "${PIPESTATUS[0]}" != "0" ]; then
+      echo "ATTENTION: An error has occured. More info is in the syslog!"
+    fi
+    set -e
+    
+    ## On every reconfiguration the maintenance user is recreated.
+    #
+    # - It is easier to regenerate the password every time but as people
+    #   use fancy rsync scripts and file alteration monitors, the existing
+    #   password is used and existing files not touched.
+    # - The mysqld statement is like that in mysql_install_db because the
+    #   server is not already running. This has some implications:
+    # 	- The amount of newlines and semicolons in the query is important!
+    #   - GRANT is not possible with --skip-grant-tables and "INSERT
+    #     (user,host..) VALUES" is not --ansi compliant
+    # - The echo is just for readability. ash's builtin has no "-e" so use /bin/echo.
+    # - The Super_priv, Show_db_priv, Create_tmp_table_priv and Lock_tables_priv
+    #   may not be present as old Woody 3.23 databases did not have it and the
+    #   admin might not already have run mysql_upgrade which adds them.
+    #   As the binlog cron scripts need at least the Super_priv, I do first
+    #   the old query which always succeeds and then the new which may or may not.
+
+    # recreate the credentials file if not present or without mysql_upgrade stanza
+    dc=$mysql_cfgdir/debian.cnf; 
+    if [ -e "$dc" -a -n "`fgrep mysql_upgrade $dc 2>/dev/null`" ]; then
+        pass="`sed -n 's/^[     ]*password *= *// p' $dc | head -n 1`"
+    else
+	pass=`perl -e 'print map{("a".."z","A".."Z",0..9)[int(rand(62))]}(1..16)'`;
+        if [ ! -d "$mysql_cfgdir" ]; then install -o 0 -g 0 -m 0755 -d $mysql_cfgdir; fi
+        ( umask 077; cat /dev/null > $dc ); chmod 0600 $dc  # restrict the file before the password is written into it
+        echo "# Automatically generated for Debian scripts. DO NOT TOUCH!" >>$dc
+        echo "[client]"                                                    >>$dc
+        echo "host     = localhost"                                        >>$dc
+        echo "user     = debian-sys-maint"                                 >>$dc
+        echo "password = $pass"                                            >>$dc
+        echo "socket   = $mysql_rundir/mysqld.sock"                        >>$dc
+        echo "[mysql_upgrade]"                                             >>$dc
+        echo "host     = localhost"                                        >>$dc
+        echo "user     = debian-sys-maint"                                 >>$dc
+        echo "password = $pass"                                            >>$dc
+        echo "socket   = $mysql_rundir/mysqld.sock"                        >>$dc
+        echo "basedir  = /usr"                                             >>$dc
+    fi
+    # If this dir chmod go+w then the admin did it. But this file should not.
+    chown 0:0 $dc
+    chmod 0600 $dc
+
+    # update privilege tables
+    password_column_fix_query=`/bin/echo -e \
+        "USE mysql\n" \
+        "ALTER TABLE user CHANGE Password Password char(41) character set latin1 collate latin1_bin DEFAULT '' NOT NULL"`;
+    replace_query=`/bin/echo -e \
+        "USE mysql\n" \
+        "REPLACE INTO user SET " \
+        "  host='localhost', user='debian-sys-maint', password=password('$pass'), " \
+        "  Select_priv='Y', Insert_priv='Y', Update_priv='Y', Delete_priv='Y', " \
+        "  Create_priv='Y', Drop_priv='Y', Reload_priv='Y', Shutdown_priv='Y', " \
+        "  Process_priv='Y',  File_priv='Y', Grant_priv='Y', References_priv='Y', " \
+        "  Index_priv='Y', Alter_priv='Y', Super_priv='Y', Show_db_priv='Y', "\
+        "  Create_tmp_table_priv='Y', Lock_tables_priv='Y', Execute_priv='Y', "\
+        "  Repl_slave_priv='Y', Repl_client_priv='Y', Create_view_priv='Y', "\
+        "  Show_view_priv='Y', Create_routine_priv='Y', Alter_routine_priv='Y', "\
+        "  Create_user_priv='Y', Event_priv='Y', Trigger_priv='Y' "`;
+    fix_privs=`/bin/echo -e \
+        "USE mysql;\n" \
+        "ALTER TABLE user ADD column Create_view_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Show_view_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Create_routine_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Alter_routine_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Create_user_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Event_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " \
+        "ALTER TABLE user ADD column Trigger_priv enum('N','Y') CHARACTER SET utf8 NOT NULL DEFAULT 'N'; " `
+    # Engines supported by etch should be installed per default. The query sequence is supposed
+    # to be aborted if the CREATE TABLE fails due to an already existent table in which case the
+    # admin might already have chosen to remove one or more plugins. Newlines are necessary.
+    install_plugins=`/bin/echo -e \
+        "USE mysql;\n" \
+        "CREATE TABLE plugin (name char(64) COLLATE utf8_bin NOT NULL DEFAULT '', " \
+        "  dl char(128) COLLATE utf8_bin NOT NULL DEFAULT '', " \
+        "  PRIMARY KEY (name)) ENGINE=MyISAM DEFAULT CHARSET=utf8 COLLATE=utf8_bin COMMENT='MySQL plugins';\n" \
+        "INSERT INTO plugin VALUES ('innodb',    'ha_innodb.so');\n" \
+        "INSERT INTO plugin VALUES ('federated', 'ha_federated.so');\n" \
+        "INSERT INTO plugin VALUES ('blackhole', 'ha_blackhole.so');\n" \
+        "INSERT INTO plugin VALUES ('archive',   'ha_archive.so');" `
+
+    # Upgrade password column format before the root password gets set.
+    echo "$password_column_fix_query"                        | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+
+    db_get percona-xtradb-server/root_password && rootpw="$RET"
+    if ! set_mysql_rootpw; then
+        password_error="yes"
+    fi
+
+    echo "$fix_privs"                                        | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+    echo "$replace_query"                                    | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+    set +e
+    echo "$install_plugins"                                  | $MYSQL_BOOTSTRAP 2>&1 | $ERR_LOGGER
+    set -e
+  ;;
+
+  abort-upgrade|abort-remove|abort-configure)
+  ;;
+
+  *)
+    echo "postinst called with unknown argument '$1'" 1>&2
+    exit 1
+  ;;
+esac
+
+# here we check to see if we can connect as root without a password
+# this should catch upgrades from previous versions where the root
+# password wasn't set.  if there is a password, or if the connection
+# fails for any other reason, nothing happens.
+if [ "$1" = "configure" ]; then
+       if test_mysql_access; then
+               db_input medium percona-xtradb-server/root_password || true
+               db_go
+               db_get percona-xtradb-server/root_password && rootpw="$RET"
+
+               if ! set_mysql_rootpw "online"; then
+                       password_error="yes"
+               fi
+       fi
+
+       if [ "$password_error" = "yes" ]; then
+               db_input high percona-xtradb-server/error_setting_password || true
+               db_go
+       fi
+
+fi
+
+db_stop # in case invoke fails
+
+#DEBHELPER#
+
+exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm
new file mode 100644
index 00000000000..083a42bd861
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.postrm
@@ -0,0 +1,83 @@
+#!/bin/bash -e
+
+# It is possible that Debconf has already been removed, too.
+if [ -f /usr/share/debconf/confmodule ]; then
+  . /usr/share/debconf/confmodule
+fi
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+
+# Try to stop the server in a sane way. If it does not succeed let the admin
+# do it himself. No database directories should be removed while the server
+# is running!
+stop_server() {
+  # Run the stop action with errexit off so that a failure can be reported below.
+  set +e
+  if [ -x /usr/sbin/invoke-rc.d ]; then
+    invoke-rc.d mysql stop
+  else
+    /etc/init.d/mysql stop
+  fi
+  errno=$?; set -e
+  # Test the saved status: after "set -e", "$?" is always 0.
+  if [ "$errno" != 0 ]; then
+    echo "Trying to stop the MySQL server resulted in exitcode $errno." 1>&2
+    echo "Stop it yourself and try again!" 1>&2
+    exit 1
+  fi
+}
+
+case "$1" in
+  purge|remove|upgrade|failed-upgrade|abort-install|abort-upgrade|disappear)
+    if [ -n "`$MYADMIN ping 2>/dev/null`" ]; then
+      stop_server
+      sleep 2
+    fi
+  ;;
+  *)
+    echo "postrm called with unknown argument '$1'" 1>&2
+    exit 1
+  ;;
+esac
+
+#
+# - Do NOT purge logs or data if another percona-xtradb-server* package is installed (#307473)
+# - Remove the mysql user only after all his owned files are purged.
+#   
+if [ "$1" = "purge" -a ! \( -x /usr/sbin/mysqld -o -L /usr/sbin/mysqld \) ]; then
+  # we remove the mysql user only after all his owned files are purged
+  rm -f /var/log/mysql.{log,err}{,.0,.[1234567].gz}
+  rm -rf /var/log/mysql
+
+  db_input high percona-xtradb-server-5.1/postrm_remove_databases || true
+  db_go || true
+  db_get percona-xtradb-server-5.1/postrm_remove_databases || true
+  if [ "$RET" = "true" ]; then
+    # never remove the debian.cnf when the databases are still existing
+    # else we ran into big trouble on the next install!
+    rm -f /etc/mysql/debian.cnf
+    rm -rf /var/lib/mysql
+    rm -rf /var/run/mysqld
+    userdel mysql || true
+  fi
+
+  # (normally) Automatically added by dh_installinit
+  if [ "$1" = "purge" ] ; then
+        update-rc.d mysql remove >/dev/null || exit 0
+  fi
+  # (normally) End automatically added section
+fi
+
+# (normally) Automatically added by dh_installdebconf
+if [ "$1" = purge ] && [ -e /usr/share/debconf/confmodule ]; then
+        . /usr/share/debconf/confmodule
+        db_purge
+fi
+# (normally) End automatically added section
+
+# no DEBHELPER here, "update-rc.d remove" fails if percona-xtradb-server-5.1 is installed
+
+exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst b/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst
new file mode 100644
index 00000000000..a338e4edd8f
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.preinst
@@ -0,0 +1,186 @@
+#!/bin/bash -e
+#
+# summary of how this script can be called:
+#        * <new-preinst> install
+#        * <new-preinst> install <old-version>
+#        * <new-preinst> upgrade <old-version>
+#        * <old-preinst> abort-upgrade <new-version>
+#
+
+. /usr/share/debconf/confmodule
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }
+
+export PATH=$PATH:/sbin:/usr/sbin:/bin:/usr/bin
+MYADMIN="/usr/bin/mysqladmin --defaults-file=/etc/mysql/debian.cnf"
+DATADIR=/var/lib/mysql
+LOGDIR=/var/log/mysql
+UPGRADEDIR=/var/lib/mysql-upgrade
+
+# Try to stop the server in a sane way. If it does not succeed let the admin
+# do it himself. No database directories should be removed while the server
+# is running! Another mysqld in e.g. a different chroot is fine for us.
+stop_server() {
+    if [ ! -x /etc/init.d/mysql ]; then return; fi  # no executable init script: nothing to stop
+
+    set +e
+    if [ -x /usr/sbin/invoke-rc.d ]; then
+      cmd="invoke-rc.d mysql stop"
+    else
+      cmd="/etc/init.d/mysql stop"
+    fi
+    $cmd
+    errno=$?  # keep the stop command's exit status for the check below
+    set -e
+   
+    # 0=ok, 100=no init script (fresh install)
+    if [ "$errno" != 0 -a "$errno" != 100 ]; then
+      echo "${cmd/ */} returned $errno" 1>&2  # ${cmd/ */} is $cmd cut at its first space
+      echo "There is a MySQL server running, but we failed in our attempts to stop it." 1>&2
+      echo "Stop it yourself and try again!" 1>&2
+      db_stop  	
+      exit 1
+    fi
+}
+
+################################ main() ##########################
+
+this_version=5.1
+
+# Check kernel version
+if dpkg --compare-versions `uname -r` lt 2.6; then
+  /bin/echo -e "\nPROBLEM: MySQL-5.x is currently incompatible with kernel 2.4. Aborting.";
+  /bin/echo -e "See http://bugs.debian.org/416841 for more information.\n"
+  exit 1
+fi
+
+# Abort if an NDB cluster is in use.
+if egrep -q -r '^[^#]*ndb.connectstring' /etc/mysql/; then
+  db_fset percona-xtradb-server/no_upgrade_when_using_ndb seen false || true
+  db_input high percona-xtradb-server/no_upgrade_when_using_ndb || true
+  db_go
+  db_stop
+  exit 1
+fi
+
+# Save the user from stupidities.
+show_downgrade_warning=0
+for i in `ls $DATADIR/debian-*.flag 2>/dev/null`; do
+  found_version=`echo $i | sed 's/.*debian-\([0-9\.]\+\).flag/\1/'`
+  if dpkg --compare-versions "$this_version" '<<' "$found_version"; then
+    show_downgrade_warning=1
+    break;
+  fi
+done
+if [ "$show_downgrade_warning" = 1 ]; then
+  db_fset percona-xtradb-server-$this_version/really_downgrade seen false || true
+  db_input medium percona-xtradb-server-$this_version/really_downgrade || true
+  db_go
+  db_get percona-xtradb-server-$this_version/really_downgrade || true
+  if [ "$RET" = "true" ]; then
+    rm -f $DATADIR/debian-*.flag
+    touch $DATADIR/debian-$this_version.flag
+  else
+    echo "Aborting downgrade from (at least) $found_version to $this_version." 1>&2
+    echo "If are sure you want to downgrade to $this_version, remove the file" 1>&2
+    echo "$DATADIR/debian-*.flag and try installing again." 1>&2
+    db_stop
+    exit 1
+  fi
+fi
+
+# to be sure
+stop_server
+
+# If we use NIS then errors should be tolerated. It's up to the
+# user to ensure that the mysql user is correctly setup.
+# Beware that there are two ypwhich one of them needs the 2>/dev/null!
+if test -n "`which ypwhich 2>/dev/null`"  &&  ypwhich >/dev/null 2>&1; then
+  set +e
+fi
+
+#
+# Now we have to ensure the following state:
+# /etc/passwd: mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false
+# /etc/group:  mysql:x:101:
+# 
+# Sadly any state could be present on the system so we have to
+# modify everything carefully i.e. not doing a chown before creating
+# the user etc...
+#
+
+# creating mysql group if he isn't already there
+if ! getent group mysql >/dev/null; then
+ 	# Adding system group: mysql.
+	addgroup --system mysql >/dev/null
+fi
+
+# creating mysql user if he isn't already there
+if ! getent passwd mysql >/dev/null; then
+	# Adding system user: mysql.
+	adduser \
+	  --system \
+          --disabled-login \
+	  --ingroup mysql \
+	  --home $DATADIR \
+	  --gecos "MySQL Server" \
+	  --shell /bin/false \
+	  mysql  >/dev/null
+fi
+
+# end of NIS tolerance zone
+set -e
+
+# if there's a symlink, let's store where it's pointing, because otherwise
+# it's going to be lost in some situations
+for dir in DATADIR LOGDIR; do
+    checkdir=`eval echo "$"$dir`
+    if [ -L "$checkdir" ]; then
+	mkdir -p "$UPGRADEDIR"
+	cp -d "$checkdir" "$UPGRADEDIR/$dir.link"
+    fi
+done
+
+# creating mysql home directory
+if [ ! -d $DATADIR -a ! -L $DATADIR ]; then
+ 	mkdir $DATADIR
+fi
+
+# checking disc space
+if LC_ALL=C BLOCKSIZE= df --portability $DATADIR/. | tail -n 1 | awk '{ exit ($4>1000) }'; then
+  echo "ERROR: There's not enough space in $DATADIR/" 1>&2
+  db_stop
+  exit 1
+fi
+
+# Since the home directory was created before putting the user into
+# the mysql group and moreover we cannot guarantee that the 
+# permissions were correctly *before* calling this script, we fix them now.
+# In case we use NIS and no mysql user is present then this script should
+# better fail now than later..
+# The "set +e" is necessary as e.g. a ".journal" of a ext3 partition is
+# not chgrp'able (#318435).
+set +e
+chown mysql:mysql $DATADIR
+find $DATADIR -follow -not -group mysql -print0 2>/dev/null \
+  | xargs -0 --no-run-if-empty chgrp mysql
+set -e
+
+# Files below /etc/ still identical (by md5sum) to the percona-xtradb-server-5.0/etch
+# ones get overwritten by current ones (stored with "/" as "_") to avoid dpkg questions.
+while read md5 file; do
+  if [ "`md5sum $file 2>/dev/null`" = "$md5  $file" ]; then
+    cp /usr/share/percona-xtradb-common/internal-use-only/`echo $file | sed 's,/,_,g'` $file
+  fi
+done <<EOT
+6691f2fdc5c6d27ff0260eb79813e1bc  /etc/init.d/mysql
+b53b9552d44661361d39157c3c7c51d3  /etc/logrotate.d/percona-xtradb-server
+57f3e58f72582ca55100dc1ba0f1a8ae  /etc/mysql/debian-start
+EOT
+
+db_stop
+
+#DEBHELPER#
+
+exit 0
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm b/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm
new file mode 100644
index 00000000000..03e9ea37420
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.prerm
@@ -0,0 +1,8 @@
+#!/bin/bash -e
+# prerm for percona-xtradb-server-5.1: loads debconf and sets up optional debug tracing; no other actions are visible here.
+. /usr/share/debconf/confmodule  # source the debconf shell module
+
+if [ -n "$DEBIAN_SCRIPT_DEBUG" ]; then set -v -x; DEBIAN_SCRIPT_TRACE=1; fi  # debug mode: verbose + xtrace, and turn on the trace banner
+${DEBIAN_SCRIPT_TRACE:+ echo "#42#DEBUG# RUNNING $0 $*" 1>&2 }  # expands to the banner command only when DEBIAN_SCRIPT_TRACE is set and non-empty
+
+#DEBHELPER#
diff --git a/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates b/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates
new file mode 100644
index 00000000000..efa318640db
--- /dev/null
+++ b/storage/xtradb/build/debian/percona-xtradb-server-5.1.templates
@@ -0,0 +1,90 @@
+# These templates have been reviewed by the debian-l10n-english
+# team
+#
+# If modifications/additions/rewording are needed, please ask
+# for an advice to debian-l10n-english@lists.debian.org
+#
+# Even minor modifications require translation updates and such
+# changes should be coordinated with translators and reviewers.
+
+Template: percona-xtradb-server-5.1/really_downgrade
+Type: boolean
+Default: false
+_Description: Really proceed with downgrade?
+ A file named /var/lib/mysql/debian-*.flag exists on this system.
+ .
+ Such file is an indication that a percona-xtradb-server package with a higher
+ version has been installed earlier.
+ .
+ There is no guarantee that the version you're currently installing
+ will be able to use the current databases.
+
+Template: percona-xtradb-server-5.1/nis_warning
+Type: note
+#flag:translate!:3,5
+_Description: Important note for NIS/YP users
+ To use MySQL, the following entries for users and groups should be added
+ to the system:
+ .
+  /etc/passwd   : mysql:x:100:101:Percona SQL Server:/var/lib/mysql:/bin/false
+  /etc/group    : mysql:x:101:
+ .
+ You should also check the permissions and the owner of the
+ /var/lib/mysql directory:
+ .
+  /var/lib/mysql: drwxr-xr-x   mysql    mysql
+
+Template: percona-xtradb-server-5.1/postrm_remove_databases
+Type: boolean
+Default: false
+_Description: Remove all Percona SQL databases?
+ The /var/lib/mysql directory which contains the Percona SQL databases is about
+ to be removed.
+ .
+ If you're removing the Percona SQL package in order to later install a more
+ recent version or if a different percona-xtradb-server package is already
+ using it, the data should be kept.
+
+Template: percona-xtradb-server-5.1/start_on_boot
+Type: boolean
+Default: true
+_Description: Start the Percona SQL server on boot?
+ The Percona SQL server can be launched automatically at boot time or manually
+ with the '/etc/init.d/mysql start' command.
+
+Template: percona-xtradb-server/root_password
+Type: password
+_Description: New password for the Percona SQL "root" user:
+ While not mandatory, it is highly recommended that you set a password
+ for the Percona SQL administrative "root" user.
+ .
+ If that field is left blank, the password will not be changed.
+
+Template: percona-xtradb-server/root_password_again
+Type: password
+_Description: Repeat password for the Percona SQL "root" user:
+
+Template: percona-xtradb-server/error_setting_password
+Type: error
+_Description: Unable to set password for the Percona SQL "root" user
+ An error occurred while setting the password for the Percona SQL
+ administrative user. This may have happened because the account
+ already has a password, or because of a communication problem with
+ the Percona SQL server.
+ .
+ You should check the account's password after the package installation.
+ .
+ Please read the /usr/share/doc/percona-xtradb-server-5.1/README.Debian file
+ for more information.
+
+Template: percona-xtradb-server/password_mismatch
+Type: error
+_Description: Password input error
+ The two passwords you entered were not the same. Please try again.
+
+Template: percona-xtradb-server/no_upgrade_when_using_ndb
+Type: error
+_Description: NDB Cluster seems to be in use
+ Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new
+ mysql-cluster package and remove all lines starting with "ndb" from
+ all config files below /etc/mysql/.
diff --git a/storage/xtradb/build/debian/po/POTFILES.in b/storage/xtradb/build/debian/po/POTFILES.in
new file mode 100644
index 00000000000..b3a73d17bd1
--- /dev/null
+++ b/storage/xtradb/build/debian/po/POTFILES.in
@@ -0,0 +1 @@
+[type: gettext/rfc822deb] percona-xtradb-server-5.1.templates
diff --git a/storage/xtradb/build/debian/po/ar.po b/storage/xtradb/build/debian/po/ar.po
new file mode 100644
index 00000000000..6a51c1f8919
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ar.po
@@ -0,0 +1,267 @@
+# translation of templates.po to Arabic
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+#
+# Ossama M. Khayat <okhayat@yahoo.com>, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: templates\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-01 13:04+0300\n"
+"Last-Translator: Ossama M. Khayat <okhayat@yahoo.com>\n"
+"Language-Team: Arabic <support@arabeyes.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms: nplurals=6; plural=n==1 ? 0 : n==0 ? 1 : n==2 ? 2: n%100>=3 && "
+"n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+": n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "هل فعلاً تريد التثبيط؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "هناك ملف مسمى /var/lib/mysql/debian-*.flag موجود على هذا النظام."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"هذا الملف دلالة على أن نسخة أحدث من حزمة mysql-server تم تثبيتها مسبقاً."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"ليست هناك أية ضمانة أن النسخة التي تقوم بتثبيتها ستكون قادرة على استخدام "
+"قواعد البيانات الحالية."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "ملاحظة هامة لمستخدمي NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"كي تستخدم MySQL، يجب إضافة المُدخلات التالية الخاصة بالمستخدمين والمجموعات "
+"إلى النظام:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr "عليك أيضاً أن تقوم بالتأكد من صلاحيات مالك الملف /var/lib/mysql: "
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "إزالة جميع قواعد بيانات MySQL؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr "الدليل /var/lib/mysql الذي يحتوي قواعد بيانات MySQL ستتم إزالته."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"إن كنت تقوم بإزالة حزمة MySQL كي تقوم لاحقاً بتثبيت نسخة أحدث أو إن كانت حزمة "
+"mysql-server مختلفة تستخدمها، فيجب إبقاء البيانات."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "تشغيل خادم MySQL عند الإقلاع؟"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"يمكن تشغيل خادم MySQL آلياً وقت الإقلاع أو يدوياً باستخدام الأمر '/etc/init.d/"
+"mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "كلمة المرور الجديدة لمستخد \"root\" الخاص بـMySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"مع أنه ليس إجبارياً، ولكن من المستحسن أن تقوم بتعيين كلمة مرور خاصة بمستخدم "
+"MySQL الإداري \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "إن ترك الحقل فارغاً، فلن يتم تغيير كلمة المرور."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "كلمة المرور الجديدة لمستخد \"root\" الخاص بـMySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "تعذر تعيين كلمة مرور للمستخدم \"root\" الخاص بـMySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"حدث خطأ أثناء تعيين كلمة المرور لمستخدم MySQL الإداري. قد يكون هذا حدث بسبب "
+"أن حساب المستخدم له كلمة مرور معيّنة مسبقاً، أو بسبب مشكلة في الاتصال مع خادم "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "يجب عليك التحقق من كلمة مرور الحساب عقب تثبيت الحزمة."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"الرجاء قراءة الملف /usr/share/doc/mysql-server-5.1/README.Debian للمزيد من "
+"المعلومات."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "هل تريد دعم اتصالات MySQL من الأجهزة التي تعمل على ديبيان \"sarge\" أو "
+#~ "أقدم؟"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "في إصدارات عملاء MySQL القديمة من ديبيان، لم تكن كلمات المرور تحفظ بشكل "
+#~ "آمن. ولقد حل هذه المشكلة بعدها، غير أن العملاء (مثل PHP) المتصلين من "
+#~ "أجهزة تعمل على ديبيان Sarge 3.1 لن يكونوا قادرين على الاتصال باستخدام "
+#~ "الحسابات الحديثة أو الحسابات التي تم تغيير كلمة مرورها."
diff --git a/storage/xtradb/build/debian/po/ca.po b/storage/xtradb/build/debian/po/ca.po
new file mode 100644
index 00000000000..94fadb6d5df
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ca.po
@@ -0,0 +1,342 @@
+# mysql-dfsg (debconf) translation to Catalan.
+# Copyright (C) 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+# Aleix Badia i Bosch <abadia@ica.es> 2004
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2004-01-31 19:20GMT\n"
+"Last-Translator: Aleix Badia i Bosch <abadia@ica.es>\n"
+"Language-Team: Debian L10n Catalan <debian-l10n-catalan@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=ISO-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Nota important pels usuaris de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Voleu que el MySQL s'iniciï a l'arrencada ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"El MySQL es pot executar a l'arrencada o només si executeu manualment '/etc/"
+"init.d/mysql start'. Seleccioneu 'sí' si voleu que s'inicialitzi "
+"automàticament."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#, fuzzy
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Per utilitzar la base de dades de MySQL heu d'afegir un usuari i grup "
+#~ "equivalent al següent i assegurar-vos que el directori /var/lib/mysql "
+#~ "tingui els permisos correctes."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#, fuzzy
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr ""
+#~ "Feu una ullada al document: http://www.mysql.com/doc/en/Upgrade.html"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "El MySQL només s'instal·la en cas de tenir un nom d'ordinador central que "
+#~ "no sigui numèric  i que es pugui resoldre a través del fitxer /etc/hosts. "
+#~ "Ex. si l'ordre \"hostname\" retorna \"myhostname\", llavors hi ha d'haver "
+#~ "una línia com la següent \"10.0.0.1 myhostname\"."
+
+#, fuzzy
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Es crea un nou usuari de mysql \"debian-sys-maint\". S'utilitza per les "
+#~ "seqüències d'inicialització i aturada del cron, no el suprimiu."
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Recordeu posar una contrasenya al superusuari del MySQL. Si utilitzeu un "
+#~ "fitxer /root/.my.cnf, escriviu sempre allà les línies \"user\" i "
+#~ "\"password\".; mai només la contrasenya. Per a més informació feu una "
+#~ "ullada a /usr/share/doc/mysql-server/README.Debian."
+
+#, fuzzy
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "Voleu suprimir totes les bases de dades en purgar el paquet mysql-server ?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "La xarxa està inhabilitada per defecte per a raons de seguretat. La podeu "
+#~ "habilitar descomentant l'opció de skip-networking del fitxer /etc/mysql/"
+#~ "my.cnf."
+
+#~ msgid "security and update notice"
+#~ msgstr "Avís de seguretat i actualització"
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Executeu mysql_fix_privilege_tables"
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "S'asseguren els permisos de seguretat de /var/lib/mysql canviant a mysql "
+#~ "tots els GIDs diferents a root i mysql."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "Per habilitar el suport de SSL podeu seguir les instruccions de /usr/"
+#~ "share/doc/mysql-server/"
+
+#~ msgid "mysql_fix_privileges_tables will be executed"
+#~ msgstr "s'executa mysql_fix_privileges_tables"
+
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in  the \"mysql\" database. This is done by the "
+#~ "mysql_fix_privilege_tables script during this upgrade regardless of if "
+#~ "the server is currently running or not!"
+#~ msgstr ""
+#~ "Les últimes versions de MySQL tenen un sistema de privilegis més "
+#~ "elaborat. Per utilitzar-lo cal afegir nous camps a les taules de la base "
+#~ "de dades \"mysql\". Aquesta tasca la realitza la seqüència "
+#~ "mysql_fix_privilege_tables durant l'actualització independentment de si "
+#~ "el servidor s'està executant o no!"
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Aquesta seqüència no assigna privilegis d'usuari diferents als que ja "
+#~ "tenia, en cas que us trobéssiu en aquesta situació, poseu-vos en contacte "
+#~ "amb mi."
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "Voleu suprimir tots els continguts de /var/lib/mysql quan es purgui el "
+#~ "paquet mysql-server amb l'ordre \"dpkg --purge mysql-server\". (ex. "
+#~ "suprimir-ho tot inclòs la configuració) ? (per defecte no)"
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "Voleu fer accessible el MySQL via xarxa ?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "Voleu que el MySQL escolti a un port TCP accessible des de la xarxa ? "
+#~ "Aquesta opció no és imprescindible en ordinadors aïllats i podria "
+#~ "provocar un problema de seguretat."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "Permetre el mode chroot ?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "El MySQL es pot executar en una entorn tancat al directori /var/lib/"
+#~ "mysql_jail perquè els usuaris no puguin modificar cap fitxer fora del "
+#~ "directori.Aquesta opció també augmenta la seguretat envers els crackers, "
+#~ "jaque no poden modificar els fitxers del sistema."
diff --git a/storage/xtradb/build/debian/po/cs.po b/storage/xtradb/build/debian/po/cs.po
new file mode 100644
index 00000000000..01d95723aa9
--- /dev/null
+++ b/storage/xtradb/build/debian/po/cs.po
@@ -0,0 +1,361 @@
+#
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+#    Developers do not need to manually edit POT or PO files.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-01 13:01+0200\n"
+"Last-Translator: Miroslav Kure <kurem@debian.cz>\n"
+"Language-Team: Czech <debian-l10n-czech@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Opravdu pokračovat v degradaci?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "V systému existuje soubor /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr "To znamená, že již byl nainstalován balík mysql-server s vyšší verzí."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Neexistuje žádná záruka, že momentálně instalovaná verze bude umět pracovat "
+"se stávajícími databázemi."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Důležitá poznámka pro uživatele NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Abyste mohli MySQL používat, musíte v systému založit následující uživatele "
+"a skupiny:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Také byste měli zkontrolovat vlastníka a oprávnění adresáře /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Odstranit všechny MySQL databáze?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Adresář /var/lib/mysql, ve kterém se nachází MySQL databáze, bude odstraněn."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Jestliže odstraňujete balík MySQL za účelem instalace novější verze MySQL, "
+"nebo pokud tato data souběžně využívá jiný balík mysql-server, měli byste "
+"data ponechat."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Spustit MySQL server při startu systému?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL se může spouštět automaticky při startu systému, nebo ručně příkazem '/"
+"etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nové heslo MySQL uživatele \"root\":"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Přestože to není nezbytné, je silně doporučeno nastavit heslo u "
+"správcovského MySQL účtu \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Ponecháte-li pole prázdné, heslo se nezmění."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nové heslo MySQL uživatele \"root\":"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Nelze nastavit heslo MySQL uživatele \"root\""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Během nastavování hesla pro správcovského uživatele MySQL se vyskytla chyba. "
+"To se mohlo stát třeba proto, protože uživatel již měl heslo nastaveno, nebo "
+"protože nastal problém v komunikaci s MySQL serverem."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Po instalaci balíku byste měli heslo ověřit."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Více informací naleznete v /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Aktualizace nelze provést pokud jsou přítomny tabulky ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Poslední verze MySQL již nemohou používat starý formát tabulek ISAM a "
+#~ "před aktualizací je nutné převést tyto tabulky např. do formátu MyISAM "
+#~ "pomocí \"mysql_convert_table_format\" nebo \"ALTER TABLE x ENGINE=MyISAM"
+#~ "\". Instalace mysql-server-5.1 se nyní přeruší. V případě, že se mezitím "
+#~ "odinstaloval původní mysql-server-4.1, jednoduše jej znovu nainstalujte a "
+#~ "tabulky převeďte."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Podporovat MySQL připojení z počítačů používajících Debian Sarge nebo "
+#~ "starší?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Způsob, jakým se dříve ukládala hesla, nebyl příliš bezpečný. To se nyní "
+#~ "zlepšilo, ale nevýhodou je, že se klienti z Debianu 3.1 Sarge (např. PHP) "
+#~ "nebudou moci připojit na nové účty, nebo na účty, u nichž se heslo "
+#~ "změnilo."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Abyste mohli mysql používat, musíte do následujících souborů přidat "
+#~ "ekvivalentního uživatele a skupinu a zajistit, že /var/lib/mysql má "
+#~ "správná práva (uid/gid se mohou lišit)."
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Odstranit databáze používané všemi verzemi MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Nezadáte-li heslo, žádné změny se s účtem neprovedou."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Po skončení instalace byste měli ověřit, že je účet chráněn heslem (více "
+#~ "informací naleznete v souboru README.Debian)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Poznámky k aktualizaci"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be  "
+#~ "corrupted! This script also enhances the privilege tables but is not  "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Po aktualizaci ještě musíte spustit \"mysql_upgrade\", protože jinak by "
+#~ "se tabulky mohly narušit! Tento skript také rozšiřuje tabulky privilegií, "
+#~ "ovšem neměl by uživatelům přidat více práv, než měli dosud."
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Také si přečtěte http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL se nainstaluje pouze v případě, že používáte nenumerické jméno "
+#~ "počítače, které se dá přeložit přes soubor /etc/hosts. Např. když příkaz "
+#~ "\"hostname\" vrátí \"diamond\", tak v /etc/hosts musí existovat obdobný "
+#~ "řádek jako \"10.0.0.1 diamond\"."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Bude vytvořen nový mysql uživatel \"debian-sys-maint\". Tento mysql účet "
+#~ "se používá ve startovacích, ukončovacích a cronových skriptech. Nemažte "
+#~ "jej."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Nezapomeňte nastavit heslo pro účet administrátora MySQL! Používáte-li /"
+#~ "root/.my.cnf, vždy zde zadejte jak řádek \"user\", tak řádek \"password"
+#~ "\". Nikdy zde nezadávejte jenom heslo!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Mám odstranit kompletní adresářový strom /var/lib/mysql, který se používá "
+#~ "pro všechny verze MySQL, tedy ne nutně pouze pro verzi, kterou se "
+#~ "chystáte vyčistit?"
diff --git a/storage/xtradb/build/debian/po/da.po b/storage/xtradb/build/debian/po/da.po
new file mode 100644
index 00000000000..a44088472a6
--- /dev/null
+++ b/storage/xtradb/build/debian/po/da.po
@@ -0,0 +1,397 @@
+#
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#    Developers do not need to manually edit POT or PO files.
+#
+# Claus Hindsgaul <claus_h@image.dk>, 2005, 2006.
+# Claus Hindsgaul <claus.hindsgaul@gmail.com>, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-30 22:41+0200\n"
+"Last-Translator: Claus Hindsgaul <claus.hindsgaul@gmail.com>\n"
+"Language-Team: Danish\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=ISO-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "�nsker du virkelig at forts�tte nedgraderingen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Der er en fil med navnet  /var/lib/mysql/debian-*.flag p� dette system."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"S�dan en fil tyder p� at der tidligere har v�ret installeret en h�jere "
+"version af mysql-server-pakken."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Det kan ikke garanteres at den version, du er ved at installere, kan benytte "
+"data fra de eksisterende databaser."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Vigtig oplysning til NIS/YP-brugere"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Nedenst�ende linjer for brugere og grupper skal tilf�jes dette system for at "
+"benytte MySQL:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Du b�r ogs� tjekke filrettighederne og ejerskabet af mappen /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Fjern alle MySQL-databaser?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Mappen /var/lib/mysql, der indeholder MySQL-databaserne, er ved at blive "
+"fjernet."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Hvis du fjerner MySQL-pakken for senere at installere en nyere version, "
+"eller hvis en anden mysql-server-pakke allerede benytter den, b�r dataene "
+"bevares."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Start MySQL-serveren under systemopstart?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL-serveren kan enten startes op automatisk under systemopstarten, eller "
+"manuelt med kommandoen '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Ny adgangskode for MySQL's \"root\"-bruger:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Selvom det ikke kr�ves, anbefales det kraftigt, at du s�tter en adgangskode "
+"for MySQL's administrationsbruger \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Hvis du lader dette felt st� tomt, vil adgangskoden ikke blive �ndret."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Ny adgangskode for MySQL's \"root\"-bruger:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Kunne ikke s�tte adgangskoden for MySQL's \"root\"-bruger"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Der opstod en fejl, da adgangskoden for MySQL's administrationsbruger blev "
+"fors�gt �ndret. Dette kan v�re sket, fordi brugeren allerede har en "
+"adgangskode, eller fordi der var problemer med at kommunikere med MySQL-"
+"serveren."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr "Du b�r tjekke kontoens adgangskode efter pakkeinstallationen."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Se filen /usr/share/doc/mysql-server-5.1/README.Debian for yderligere "
+"oplysninger."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Kan ikke opgradere hvis der er ISAM-tabeller!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Nyere versioner af MySQL kan ikke l�ngere benytte det gamle ISAM-"
+#~ "tabelformat, og det er derfor n�dvendigt at konvertere dine tabeller til "
+#~ "f.eks. MyISAM forud for opgraderingen med \"mysql_convert_table_format\" "
+#~ "eller \"ALTER TABLE x ENGINE=MyISAM\". Installationen af mysql-server-5.1 "
+#~ "afbrydes nu. Skulle din gamle mysql-server-4.1 alligevel blive "
+#~ "afinstalleret, s� geninstall�r den blot og konverter tabellerne."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Underst�t MySQL-forbindelser fra maskiner, der k�rer Debian \"Sarge\" "
+#~ "eller �ldre?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Gamle udgaver af MySQL-klienter p� Debian gemte ikke adgangskoderne "
+#~ "sikkert. Dette er blevet forbedret siden da, men klienter (f.eks. PHP) "
+#~ "fra maskiner, der k�rer Debian 3.1 Sarge vil ikke kunne forbinde til "
+#~ "nyere konti eller konti, hvis adgangskode er blevet �ndret."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "For at kunne bruge mysql skal du installere en bruger og en gruppe, der "
+#~ "svarer til nedenst�ende, og sikre dig at /var/lib/mysql har de rigtige "
+#~ "adgangsrettigheder (uid/gid kan afvige)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Fjern de databaser, der benyttes af samtlige MySQL-versioner?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Hvis du ikke angiver en adgangskode, vil kontoen ikke blive �ndret."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "N�r installationen afsluttes, b�r du tjekke at kontoen er ordentligt "
+#~ "beskyttet med en adgangskode (se README.Debian for yderligere "
+#~ "oplysninger)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Opdateringstips"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be  "
+#~ "corrupted! This script also enhances the privilege tables but is not  "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Du skal k�re \"mysql_upgrade\" efter opgraderingen, da tabellerne eller "
+#~ "kan blive �delagt! Dette script forbedrer ogs� rettighedstabellerne, men "
+#~ "burde ikke give nogen bruger flere rettigheder, end han havde tidligere,"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "L�s ogs� http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid "Install Hints"
+#~ msgstr "Installationstips"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Ved opgraderinger fra MySQL 3.23, der fulgte med Debian Woody, kan de "
+#~ "symbolske /var/lib/mysql eller /var/log/mysql blive fjernet ved et uheld, og "
+#~ "m� genskabes manuelt."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL vil kun blive installeret, hvis du har et ikke-numerisk v�rtsnavn, "
+#~ "som kan sl�s op i filen /ets/hosts. Hvis f.eks. kommandoen \"hostname\" "
+#~ "svarer med \"mitvaertsnavn\", skal du have en linje a'la \"10.0.0.1 "
+#~ "mitvaertsnavn\" i /etc/hosts."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Der vil blive oprettet en ny mysql-bruger, \"debian-sys-maint\". Denne "
+#~ "mysql-konto bruges i start/stop-cron-scripterne. Slet den ikke."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Husk at s�tte en ADGANGSKODE for MySQLs root-bruger! Hvis du bruger en /"
+#~ "etc/.my.cnf, s� skriv altid \"user\"- og \"password\"-linjer ind her, "
+#~ "ikke kun adgangskoden!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Skal jeg fjerne hele mappetr�et /var/lib/mysql, som benyttes af alle "
+#~ "MySQL-versioner, ikke kun den version, du er ved at slette?"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "En sj�lden gang imellem, f.eks. ved nye hovedversioner, sker det at "
+#~ "rettighedssystemet forbedres. For at g�re brug af dette, skal "
+#~ "mysql_fix_privilege_tables k�res manuelt. Scriptet vil ikke give nogen "
+#~ "bruger flere rettigheder, end vedkommende havde tidligere,"
diff --git a/storage/xtradb/build/debian/po/de.po b/storage/xtradb/build/debian/po/de.po
new file mode 100644
index 00000000000..c5dcdaff234
--- /dev/null
+++ b/storage/xtradb/build/debian/po/de.po
@@ -0,0 +1,277 @@
+# translation of mysql-dfsg-5.1_5.0.41-2_de.po to german
+#
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans#
+#    Developers do not need to manually edit POT or PO files.
+#
+# Alwin Meschede <ameschede@gmx.de>, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.41-2_de\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-29 16:05+0200\n"
+"Last-Translator: Alwin Meschede <ameschede@gmx.de>\n"
+"Language-Team: german <debian-l10n-german@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms:  nplurals=2; plural=(n != 1);\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Möchten Sie wirklich eine ältere Version einspielen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Auf diesem System existiert eine Datei mit dem Namen /var/lib/mysql/debian-*."
+"flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Diese Datei ist ein Hinweis darauf, dass früher ein MySQL-Server-Paket mit "
+"einer höheren Version installiert war."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Es kann nicht garantiert werden, dass die gegenwärtig zu installierende "
+"Version dessen Daten benutzen kann."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Wichtige Anmerkung für NIS/YP-Benutzer!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Um MySQL benutzen zu können, sollten die folgenden Benutzer und Gruppen dem "
+"System hinzugefügt werden:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Sie sollten außerdem Besitzer und Zugriffsrechte des Verzeichnisses /var/lib/"
+"mysql überprüfen:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Alle MySQL-Datenbanken entfernen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Das Verzeichnis /var/lib/mysql mit den MySQL-Datenbanken soll entfernt "
+"werden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Falls geplant ist, nur eine höhere Version von MySQL zu installieren oder "
+"ein anderes mysql-server-Paket dieses bereits benutzt, sollten die Daten "
+"behalten werden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Soll MySQL automatisch beim Booten starten?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Der MySQL-Dienst kann entweder automatisch beim Systemstart oder manuell "
+"durch Eingabe des Befehls »/etc/init.d/mysql start« gestartet werden."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Neues Passwort für den MySQL »root«-Benutzer:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Obwohl es nicht zwingend erforderlich ist, wird nachdrücklich empfohlen für "
+"den administrativen MySQL »root«-Benutzer ein Passwort zu setzen."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Wenn dieses Feld freigelassen wird, wird das Passwort nicht geändert."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Passwort für den MySQL-»root«-Benutzer wiederholen:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Konnte für den MySQL-»root«-Benutzer kein Passwort setzen"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Beim Setzen des Passworts für den administrativen MySQL-Benutzer ist ein "
+"Fehler aufgetreten. Dies könnte daran liegen, dass der Benutzer bereits ein "
+"Passwort hat oder dass es ein Problem mit der Kommunikation mit dem MySQL-"
+"Server gibt."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Sie sollten das Passwort des administrativen Benutzers nach der "
+"Paketinstallation prüfen."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Für weitere Informationen lesen Sie /usr/share/doc/mysql-server-5.1/README."
+"Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Sollen MySQL-Verbindungen von Rechnern mit Debian »Sarge« oder älter "
+#~ "unterstützt werden?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Alte Versionen der MySQL-Clients für Debian speicherten Passwörter nicht "
+#~ "sehr sicher. Dies wurde verbessert, allerdings werden Clients (z. B. PHP) "
+#~ "von Hosts mit Debian 3.1 Sarge sich nicht mehr mit MySQL-Konten verbinden "
+#~ "können, die neu angelegt werden oder deren Passwort geändert wird. Siehe "
+#~ "auch /usr/share/doc/mysql-server-5.1/README.Debian."
diff --git a/storage/xtradb/build/debian/po/es.po b/storage/xtradb/build/debian/po/es.po
new file mode 100644
index 00000000000..a37942fd59d
--- /dev/null
+++ b/storage/xtradb/build/debian/po/es.po
@@ -0,0 +1,405 @@
+# mysql-dfsg-5 translation to spanish
+# Copyright (C) 2005-2007 Software in the Public Interest, SPI Inc.
+# This file is distributed under the same license as the XXXX package.
+#
+# Changes:
+# - Initial translation
+#       Jesus Aneiros, 2006
+# - Updated
+#       Javier Fernandez-Sanguino, 2006-2007
+# - Revision
+#       Nacho Barrientos Arias
+#       Fernando Cerezal
+#       David Martínez Moreno
+#       Ricardo Mones
+#       Carlos Galisteo
+#       Javier Fernandez-Sanguino
+#
+#
+#  Traductores, si no conoce el formato PO, merece la pena leer la 
+#  documentación de gettext, especialmente las secciones dedicadas a este
+#  formato, por ejemplo ejecutando:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#
+# Equipo de traducción al español, por favor lean antes de traducir
+# los siguientes documentos:
+#
+# - El proyecto de traducción de Debian al español
+#   http://www.debian.org/intl/spanish/
+#   especialmente las notas y normas de traducción en
+#   http://www.debian.org/intl/spanish/notas
+#
+# - La guía de traducción de po's de debconf:
+#   /usr/share/doc/po-debconf/README-trans
+#   o http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+# Si tiene dudas o consultas sobre esta traducción consulte con el último
+# traductor (campo Last-Translator) y ponga en copia a la lista de
+# traducción de Debian al español (<debian-l10n-spanish@lists.debian.org>)
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.24-3\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-28 22:21+0200\n"
+"Last-Translator: Javier Fernández-Sanguino <jfs@debian.org>\n"
+"Language-Team: Debian l10 Spanish <debian-l10n-spanish@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "¿Desea realmente continuar con la desactualización?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Existe un archivo con el nombre /var/lib/mysql/debian-*.flag en este sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Este fichero indica que se instaló previamente una versión superior del "
+"paquete mysql-server."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"No se puede garantizar que la versión que está instalando pueda usar la base "
+"de datos actual."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para los usuarios de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Deben añadirse las siguientes entradas para usuarios y grupos en el sistema "
+"para poder utilizar MySQL:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"También debería comprobar los permisos y el propietario del directorio /var/"
+"lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "¿Desea eliminar todas las bases de datos MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"El directorio /var/lib/mysql contiene bases de datos MySQL que van a "
+"eliminarse."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Debería mantener los datos si tiene planificado instalar una versión de "
+"MySQL más reciente o si hay un paquete «mysql-server» distinto que los está "
+"utilizando."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "¿Debería ejecutarse el servidor MySQL al iniciarse el sistema?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"El servidor MySQL puede iniciarse en el momento de arranque del sistema o "
+"manualmente si escribe la orden «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nueva contraseña para el usuario «root» de MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Se recomienda que configure una contraseña para el usuario "
+"«root» (administrador) de MySQL, aunque no es obligatorio."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "No se modificará la contraseña si deja el espacio en blanco."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Repita la contraseña para el usuario «root» de MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "No se pudo fijar la contraseña para el usuario «root» de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Se produjo un error mientras intentaba fijar la contraseña para el usuario "
+"administrador de MySQL. Esto puede haber sucedido porque la cuenta ya tenía "
+"una contraseña o porque se produjo un error de comunicación con el servidor "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Debería comprobar la contraseña de la cuenta después de la instalación del "
+"paquete."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Consulte /usr/share/doc/mysql-server-5.1/README.Debian para más información."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "¡No se puede actualizar si ya hay tablas ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Las versiones recientes de MySQL ya no soportan el antiguo formato de "
+#~ "tabla ISAM. Antes de realizar la actualización es necesario convertir sus "
+#~ "tablas a por ejemplo, MyISAM, usando «mysql_convert_table_format» o «ALTER "
+#~ "TABLE x ENGINE=MyISAM». Se va a interrumpir ahora la instalación de mysql-"
+#~ "server-5.1. Si aún así su mysql-server-4.1 se elimina, puede "
+#~ "reinstalarlo para convertir ese tipo de tablas."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "¿Soportar las conexiones MySQL establecidas desde sistemas que ejecutan "
+#~ "Debian Sarge o versiones anteriores?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "No era muy segura la forma en la que se almacenaban las contraseñas en "
+#~ "versiones anteriores del cliente de MySQL en Debian. Este problema se ha "
+#~ "mejorado posteriormente con el inconveniente, sin embargo, de que "
+#~ "clientes (por ejemplo, PHP) en sistemas que ejecutan Debian 3.1 «Sarge» no "
+#~ "podrán conectarse a cuentas que son nuevas o a las que se le haya "
+#~ "cambiado la contraseña."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar mysql debe instalar un usuario y grupo equivalente al "
+#~ "siguiente y asegurarse de que /var/lib/mysql tiene los permisos correctos "
+#~ "(los valores del «uid» y del «gid» pueden ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr ""
+#~ "¿Eliminar las bases de datos utilizadas por todas las versiones de MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "No se hará ningún cambio en la cuenta si no introduce una contraseña."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Debería confirmar que la cuenta está correctamente protegida con una "
+#~ "contraseña cuando termine la instalación (consulte el fichero README."
+#~ "Debian si desea más información)."
+
+#~ msgid "Install Hints"
+#~ msgstr "Sugerencias para la instalación"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Al actualizar desde la versión de MySQL 3.23, la versión proporcionada en "
+#~ "Debian Woody, se eliminan de manera accidental, los enlaces simbólicos a «/"
+#~ "var/lib/mysql» o «/var/log/mysql» y tienen que restaurarse manualmente."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "Sólo se instalará MySQL si tiene un nombre de equipo que no sea una "
+#~ "dirección IP y pueda resolverse a través del archivo /etc/hosts. Por "
+#~ "ejemplo, si la orden «hostname» devuelve «MiNombreEquipo» entonces deberá "
+#~ "existir una línea «10.0.0.1 MiNombreEquipo» en dicho archivo."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Se creará un nuevo usuario «debian-sys-maint». Esta cuenta de mysql se "
+#~ "utilizará en los scripts de inicio y parada y en los scripts «cron». No la "
+#~ "elimine."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "¡Por favor, recuerde crear una CONTRASEÑA para el usuario «root» de MySQL! "
+#~ "¡Si utiliza /root/.my.cnf debe escribir las líneas «user» y «password» en "
+#~ "dicho archivo, no incluya sólo la contraseña!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "¿Debería eliminar el árbol de directorio /var/lib/mysql completo? Tenga "
+#~ "en cuenta que lo utilizan todas las versiones de MySQL y no sólo la que "
+#~ "está a punto de purgar."
diff --git a/storage/xtradb/build/debian/po/eu.po b/storage/xtradb/build/debian/po/eu.po
new file mode 100644
index 00000000000..b091e6c01cf
--- /dev/null
+++ b/storage/xtradb/build/debian/po/eu.po
@@ -0,0 +1,295 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# Piarres Beobide <pi@beobide.net>, 2006.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.26-3-debconf_eu\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-19 09:33+0100\n"
+"Last-Translator: Piarres Beobide <pi@beobide.net>\n"
+"Language-Team: Euskara <Librezale@librezale.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms: nplurals=2; plural=(n != 1);\n"
+"X-Generator: Pootle 0.10.1\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Benetan bertsio zaharragora itzuli nahi duzu?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Oharra: /var/lib/mysql/debian-*.flag dago. Honek aurretik bertsio "
+"berriagoko mysql-zerbitzari bat instalatu dela adierazten du. Ezin da "
+"ziurtatu bertsio honek datu horiek erabili ahal izango dituenik."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP erabiltzaileentzat ohar garrantzitsua!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Script-a /var/lib/mysql data direktorioa ezabatzera doa. MySQL bertsio "
+"berriago bat instalatu behar bada edo beste mysql-server pakete bat berau "
+"erabiltzen ari bada, datuak mantendu egingo dira."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Sistema abiaraztean MySQL abiarazi behar al da?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL abiaraztean automatikoki abiarazi daiteke edo eskuz '/etc/init.d/mysql "
+"start' eginaz."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "MySQL \"root\" erabiltzailearen pasahitz berria:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Oso gomendagarria da MySQL administratzaile \"root\" erabiltzaileari "
+"pasahitz bat ezartzea."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "MySQL \"root\" erabiltzailearen pasahitz berria:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Ezin da MySQL \"root\" erabiltzailearen pasahitza ezarri"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Dirudienez errore bat gertatu da MySQL administratzaile kontuaren pasahitza "
+"ezartzean.  Hau erabiltzaileak dagoeneko pasahitz bat duelako edo MySQL "
+"zerbitzariarekiko konexioan erroreak daudelako gertatu daiteke."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Debian \"sarge\" edo zaharragoak erabiltzen duten ostalarietatik MySQL "
+#~ "konexioak onartu?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Pasahitzak biltegiratzeko modua ez da oso ziurra. Hau hobetua izan da "
+#~ "baina Debian 3.1 Sarge erabiltzaileak ezingo dira kontu berri edo "
+#~ "pasahitza aldatu duten kontuetara konektatu. Begiratu /usr/share/doc/"
+#~ "mysql-server-5.1/README.Debian argibide gehiagorako."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Mysql erabili ahal izateko beharrezko erabiltzaile eta taldea sortu eta /"
+#~ "var/lib/mysql-ek beharrezko baimenak dituela ziurtatu behar duzu (uid/gid-"
+#~ "a ezberdina izan daiteke)"
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "MySQL bertsio guztiek erabilitako databaseak ezabatu?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Ez baduzu pasahitzik ezartzen ez da aldaketarik egingo kontuan."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Instalazio amaitzean, kontua pasahitzez babesturik dagoela ziurtatu "
+#~ "beharko zenuke (README.Debian irakurri xehetasun gehiagotarako)"
diff --git a/storage/xtradb/build/debian/po/fr.po b/storage/xtradb/build/debian/po/fr.po
new file mode 100644
index 00000000000..b4dcce8658b
--- /dev/null
+++ b/storage/xtradb/build/debian/po/fr.po
@@ -0,0 +1,274 @@
+# translation of fr.po to French
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+#    Developers do not need to manually edit POT or PO files.
+#
+# Christian Perrier <bubulle@debian.org>, 2004, 2006, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: fr\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-19 22:43+0200\n"
+"Last-Translator: Christian Perrier <bubulle@debian.org>\n"
+"Language-Team: French <debian-l10n-french@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"debian.org>\n"
+"X-Generator: KBabel 1.11.4\n"
+"Plural-Forms: nplurals=2; plural=n>1;\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Faut-il vraiment revenir à la version précédente ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Un fichier /var/lib/mysql/debian-*.flag est présent sur ce système."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Cela indique qu'une version plus récente du paquet mysql-server a été "
+"précédemment installée."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr "Il n'est pas garanti que cette version puisse en utiliser les données."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Note importante pour les utilisateurs NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Pour pouvoir utiliser MySQL, les utilisateurs et les groupes suivants "
+"doivent être ajoutés au système :"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Vous devez également vérifier le propriétaire et les permissions du "
+"répertoire /var/lib/mysql :"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Faut-il supprimer toutes les bases de données MySQL ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Le répertoire /var/lib/mysql qui contient les bases de données de MySQL va "
+"être supprimé."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Si vous prévoyez d'installer une version plus récente de MySQL ou si un "
+"autre paquet mysql-server les utilise déjà, vous devriez les conserver."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Faut-il lancer MySQL au démarrage ?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL peut être lancé soit au démarrage, soit en entrant la commande « /etc/"
+"init.d/mysql start »."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nouveau mot de passe du superutilisateur de MySQL :"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Il est très fortement recommandé d'établir un mot de passe pour le compte "
+"d'administration de MySQL (« root »)."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Si ce champ est laissé vide, le mot de passe ne sera pas changé."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nouveau mot de passe du superutilisateur de MySQL :"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+"Impossible de changer le mot de passe de l'utilisateur « root » de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Une erreur s'est produite lors du changement de mot de passe du compte "
+"d'administration. Un mot de passe existait peut-être déjà ou il n'a pas été "
+"possible de communiquer avec le serveur MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Vous devriez vérifier le mot de passe de ce compte après l'installation du "
+"paquet."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Veuillez consulter le fichier /usr/share/doc/mysql-server-5.1/README.Debian "
+"pour plus d'informations."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Gérer les connexions d'hôtes qui utilisent les versions Debian « sarge » "
+#~ "ou antérieures  ?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "La méthode de stockage des mots de passe n'était pas très sûre dans les "
+#~ "versions précédentes de ce paquet. Cette méthode a été améliorée mais les "
+#~ "modifications empêchent la connexion avec de nouveaux comptes ou des "
+#~ "comptes dont le mot de passe a été modifié, pour les clients (p. ex. PHP) "
+#~ "depuis des hôtes qui utilisent Debian 3.1 « sarge »."
diff --git a/storage/xtradb/build/debian/po/gl.po b/storage/xtradb/build/debian/po/gl.po
new file mode 100644
index 00000000000..ef845649f18
--- /dev/null
+++ b/storage/xtradb/build/debian/po/gl.po
@@ -0,0 +1,264 @@
+# Galician translation of mysql-dfsg-5.1's debconf templates
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Jacobo Tarrio <jtarrio@debian.org>, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-20 09:44+0200\n"
+"Last-Translator: Jacobo Tarrio <jtarrio@debian.org>\n"
+"Language-Team: Galician <proxecto@trasno.net>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "¿Quere pasar a unha versión anterior?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Neste sistema hai un ficheiro chamado /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Este ficheiro indica que antes se instalou un paquete mysql-server cunha "
+"versión superior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Non se pode garantir que a versión que está a instalar poida empregar as "
+"bases de datos actuais."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para os usuarios de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para empregar MySQL deberían engadirse ao sistema as seguintes entradas de "
+"usuarios e grupos:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Tamén debería comprobar os permisos e o propietario do directorio /var/lib/"
+"mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "¿Eliminar tódalas bases de datos de MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"Hase eliminar o directorio /var/lib/mysql, que contén as bases de datos de "
+"MySQL."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se está a eliminar o paquete MySQL para instalar despois unha versión máis "
+"recente ou se xa hai un paquete mysql-server diferente a empregalo, debería "
+"conservar os datos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "¿Iniciar o servidor MySQL co ordenador?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Pódese iniciar automaticamente o servidor MySQL ao iniciar o ordenador, ou "
+"manualmente coa orde \"/etc/init.d/mysql start\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Novo contrasinal para o usuario \"root\" de MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Aínda que non é obrigatorio, recoméndase encarecidamente que estableza un "
+"contrasinal para o usuario administrativo \"root\" de MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Se deixa o campo en branco, non se ha cambiar o contrasinal."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Novo contrasinal para o usuario \"root\" de MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Non se puido establecer o contrasinal do usuario \"root\" de MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Houbo un erro ao establecer o contrasinal do usuario administrativo de "
+"MySQL. Puido ocorrer porque o usuario xa teña un contrasinal ou debido a un "
+"problema de comunicacións co servidor MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Debería comprobar o contrasinal da conta trala instalación do paquete."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Consulte o ficheiro /usr/share/doc/mysql-server-5.1/README.Debian para máis "
+"información."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "¿Soportar as conexións a MySQL de máquinas que empreguen Debian \"sarge\" "
+#~ "ou anterior?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nas versións antigas dos clientes MySQL de Debian, os contrasinais non se "
+#~ "armacenaban de xeito seguro. Isto mellorouse desde aquela; nembargantes, "
+#~ "os clientes (tales coma PHP) das máquinas que executen Debian 3.1 Sarge "
+#~ "non se han poder conectar a contas recentes ou a contas nas que se "
+#~ "cambiara o contrasinal."
diff --git a/storage/xtradb/build/debian/po/it.po b/storage/xtradb/build/debian/po/it.po
new file mode 100644
index 00000000000..405709d60c3
--- /dev/null
+++ b/storage/xtradb/build/debian/po/it.po
@@ -0,0 +1,266 @@
+# Italian (it) translation of debconf templates for mysql-dfsg-5.1
+# Copyright (C) 2006 Software in the Public Interest
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Luca Monducci <luca.mo@tiscali.it>, 2006, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.38 italian debconf templates\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-22 15:43+0200\n"
+"Last-Translator: Luca Monducci <luca.mo@tiscali.it>\n"
+"Language-Team: Italian <debian-l10n-italian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Procedere realmente con l'abbassamento di versione?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+"Su questo sistema esiste un file con nome /var/lib/mysql/debian-*.flag."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Quel file indica che in precedenza è stata installata una versione superiore "
+"del pacchetto mysql-server."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Non è garantito che la versione che si sta installando sia in grado di usare "
+"i database presenti."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante per gli utenti NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Per usare MySQL i seguenti utenti e gruppi devono essere aggiunti al sistema:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Inoltre si devono verificare i permessi e il proprietario della directory /"
+"var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Eliminare tutti i database MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"La directory /var/lib/mysql contenente i database di MySQL sta per essere "
+"eliminata."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se si rimuove il pacchetto MySQL per poi installare una versione più recente "
+"oppure se sono già in uso da un altro pacchetto mysql-server, i dati non "
+"devono essere eliminati."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Lanciare il server MySQL all'avvio?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"Il server MySQL può essere lanciato automaticamente all'avvio del sistema "
+"oppure manualmente con il comando «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nuova password per l'utente «root» di MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Sebbene non sia obbligatoria, si raccomanda d'impostare una password per "
+"l'utente d'amministrazione «root» di MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Se questo campo è lasciato vuoto, la password non viene cambiata."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nuova password per l'utente «root» di MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Impossibile impostare la password per l'utente «root» di MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Si è verificato un errore durante l'impostazione della password per l'utente "
+"d'amministrazione di MySQL. Questo può essere accaduto perché l'utente ha "
+"già una password oppure a causa di un problema di connessione con il server "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Al termine dell'installazione si deve verificare la password dell'account."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Per maggiori informazioni si consulti il file /usr/share/doc/mysql-server-"
+"5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Supporto a connessioni MySQL da macchine con Debian «sarge» o antecedente"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nelle precedenti versioni dei client MySQL su Debian le password non "
+#~ "erano memorizzate in modo sicuro. Questo è stato migliorato ma i client "
+#~ "(per esempio PHP) presenti su una macchina con Debian 3.1 Sarge non sono "
+#~ "più in grado di connettersi a un nuovo account né ad account le cui "
+#~ "password siano state cambiate."
diff --git a/storage/xtradb/build/debian/po/ja.po b/storage/xtradb/build/debian/po/ja.po
new file mode 100644
index 00000000000..16af16b4d9f
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ja.po
@@ -0,0 +1,394 @@
+#
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#
+#    Developers do not need to manually edit POT or PO files.
+#
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.32-6\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 22:25+0900\n"
+"Last-Translator: Hideki Yamane (Debian-JP) <henrich@debian.or.jp>\n"
+"Language-Team: Japanese <debian-japanese@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "本当にダウングレードしますか?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"警告: /var/lib/mysql/debian-*.flag ファイルが存在しています。これは、以前によ"
+"り新しいバージョンの mysql-server パッケージがインストールされていたことを示"
+"します。データをこのバージョンで使えるかどうか、保証できません。"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP ユーザへ重要な注意!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"このスクリプトはデータのディレクトリ /var/lib/mysql を削除するためのもので"
+"す。単に新しいバージョンの MySQL をインストールしようとしている、あるいは別"
+"の mysql-server パッケージを既に使っている場合、データは保持する必要がありま"
+"す。"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "MySQL をシステム起動時に開始しますか?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL の起動方法について、システム起動時に自動的に開始するか、あるいは '/etc/"
+"init.d/mysql start' と手で入力した時のみ起動するかを選べます。"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "MySQL の \"root\" ユーザに対する新しいパスワード:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"MySQL を管理する \"root\" ユーザのパスワードを設定することを強くお勧めしま"
+"す。"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "MySQL の \"root\" ユーザに対する新しいパスワード:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "MySQL の \"root\" ユーザのパスワードを設定できません"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"MySQL の管理者ユーザに対してパスワードを設定しようとした際、エラーが発生した"
+"ようです。これは既に管理者ユーザにパスワードが設定されていたか、MySQL サーバ"
+"との接続に問題があったためだと思われます。"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "See /usr/share/doc/mysql-server-5.1/README.Debian for more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"詳細は /usr/share/doc/mysql-server-5.1/README.Debian を参照してください。"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "ISAM テーブルがある場合はアップグレードできません!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "MySQL の最近のバージョンでは以前の ISAM テーブル形式は利用できなくなってい"
+#~ "ます。そのため、例えば \"mysql_convert_table_format\" あるいは \"ALTER "
+#~ "TABLE x ENGINE=MyISAM\" としてアップグレード前に MyISAM にコンバートするこ"
+#~ "となどが必要です。mysql-server-5.1 のインストールを中断します。以前の "
+#~ "mysql-server-4.1 が削除されてしまった場合であっても、テーブルをコンバート"
+#~ "するために再インストールをしてください。"
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Debian \"Sarge\" あるいはそれよりも古いバージョンが稼働しているホストから"
+#~ "の MySQL 接続をサポートしますか?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "パスワードの保存方法は、あまり安全な方法で行われていませんでした。これは改"
+#~ "善されましたが、Debian 3.1 Sarge が稼働しているホストからクライアント "
+#~ "(PHP など) が新しいアカウントやパスワードが変更されたアカウントには接続で"
+#~ "きなくなるという欠点もでています。詳細については /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian を参照してください。"
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "mysql を利用するには 以下のユーザとグループを作成し、/var/lib/mysql が正し"
+#~ "い権限になっているかどうかを確認する必要があります (おそらく uid/gid が違"
+#~ "います)。"
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "全バージョンの MySQL で利用されているデータベースを削除しますか?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "パスワードを入力しない場合、アカウントに対して変更は加えられません。"
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "インストールが終了した際、アカウントがパスワードできちんと保護されているか"
+#~ "どうかを確認してください (詳細については README.Debian を参照してくださ"
+#~ "い)。"
+
+#~ msgid "Install Hints"
+#~ msgstr "インストールのヒント"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Debian Woody でリリースされた MySQL 3.23 からのアップグレードでは、/var/"
+#~ "lib/mysql あるいは /var/log/mysql の代わりにシンボリックリンクは偶然にも削"
+#~ "除されてしまっているので、手動でのリストアが必要になります。"
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL は /etc/hosts ファイル経由で解決できる「数字のみで構成されてない」ホ"
+#~ "スト名の場合のみ、インストールされます。つまり、\"hostname\" コマンドが "
+#~ "\"myhostname\" を返すなら、\"10.0.0.1 myhostname\" という行が /etc/hosts "
+#~ "ファイルにあるはずです。"
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "新規に mysql ユーザとして \"debian-sys-maint\" が作成されます。この mysql "
+#~ "アカウントは start/stop 時と cron スクリプトで利用されます。消さないでくだ"
+#~ "さい。"
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "MySQL の root ユーザに対して「パスワードの設定」を忘れないでください! /"
+#~ "root/.my.cnf を使っている場合、このファイル中の \"user\" 行と \"password"
+#~ "\" 行を記述してください。決してパスワードだけではいけません!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "これから purge しようとしているバージョンだけでは無く、全てのバージョンの "
+#~ "MySQL が /var/lib/mysql ディレクトリを使用しています。このディレクトリを完"
+#~ "全に削除しますか?"
+
+#~ msgid "Update Hints"
+#~ msgstr "更新のヒント"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "まれに、つまりは新しいメジャーバージョンにおいて、特権システムが改善されま"
+#~ "す。これを行うには、mysql_fix_privilege_tables を手動で実行する必要がありま"
+#~ "す。スクリプトは、どのようなユーザに対しても、以前に保持していた以上の権限"
+#~ "を与えるようにはなっていません。"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "http://www.mysql.com/doc/ja/Upgrade.html も参照してください"
diff --git a/storage/xtradb/build/debian/po/nb.po b/storage/xtradb/build/debian/po/nb.po
new file mode 100644
index 00000000000..1235cdc1b2c
--- /dev/null
+++ b/storage/xtradb/build/debian/po/nb.po
@@ -0,0 +1,297 @@
+# translation of mysql_nb.po to Norwegian Bokmål
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+#
+# Bjørn Steensrud <bjornst@powertech.no>, 2007.
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql_nb\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 12:13+0100\n"
+"Last-Translator: Bjørn Steensrud <bjornst@powertech.no>\n"
+"Language-Team: Norwegian Bokmål <i18n-nb@lister.ping.uio.no>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.2\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Er du sikker på at du vil nedgradere?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"ADVARSEL: Fila /var/lib/mysql/debian-*.flag finnes. Dette viser at en mysql-"
+"server-pakke med et høyere versjonsnummer har vært installert før. Det kan "
+"ikke garanteres at denne versjonen kan bruke data fra den høyere versjonen."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Viktig merknad for NIS/YP-brukere!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Dette skriptet skal til å fjerne data-mappa /var/lib/mysql. Denne mappa bør "
+"beholdes hvis det bare skal installeres en høyere MySQL-versjon, eller hvis "
+"en annen mysql-server-pakke allerede bruker den."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Skal MySQL startes ved maskinoppstart?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan startes automatisk når maskinen starter, eller bare hvis du "
+"skriver «/etc/init.d/mysql start»."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nytt passord for MySQLs «root»-bruker:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Det anbefales sterkt at du oppgir et passord for den administrative «root»-"
+"brukeren i MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nytt passord for MySQLs «root»-bruker:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Klarer ikke angi passord for MySQLs «root»-bruker"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Det ser ut til at det oppsto en feil mens det ble satt et passord for MySQLs "
+"administrative bruker. Dette kan være fordi brukeren allerede har et "
+"passord, eller fordi det var et kommunikasjonsproblem med MySQL-tjeneren."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Skal MySQL-tilkoblinger støttes fra vertsmaskiner som kjører Debian "
+#~ "«sarge» eller eldre?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Passord ble tidligere lagret på en lite sikker måte. Dette er nå "
+#~ "forbedret, med den ulempen at klienter (f.eks. PHP) fra verter som kjører "
+#~ "Debian 3.1 Sarge ikke vil kunne koble til en konto som er ny eller har "
+#~ "fått endret passordet. Se /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "For å bruke MySQL må du installere en bruker og gruppe tilsvarende den "
+#~ "nedenfor og se til at /var/lib/mysql har riktige rettigheter (uid/gid kan "
+#~ "være forskjellig)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Skal databasene brukt av alle MySQL-versjoner fjernes?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Hvis du ikke oppgir et passord blir det ikke gjort noen endringer med "
+#~ "kontoen."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Når installasjonen er ferdig bør det sjekkes at kontoen er ordentlig "
+#~ "beskyttet med et passord (mer informasjon finnes i README.Debian)."
diff --git a/storage/xtradb/build/debian/po/nl.po b/storage/xtradb/build/debian/po/nl.po
new file mode 100644
index 00000000000..bfe418117e6
--- /dev/null
+++ b/storage/xtradb/build/debian/po/nl.po
@@ -0,0 +1,302 @@
+# Dutch mysql-dfsg-5.1 po-debconf translation,
+# Copyright (C) 2006 THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Vincent Zweije <zweije@xs4all.nl>, 2006.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.30-1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2006-02-19 10:20+0100\n"
+"Last-Translator: Thijs Kinkhorst <thijs@debian.org>\n"
+"Language-Team: Debian-Dutch <debian-l10n-dutch@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=utf-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Wilt u echt een oude versie herstellen?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Waarschuwing: waarschijnlijk is een hogere versie van het mysql-server "
+"pakket geïnstalleerd geweest (het bestand /var/lib/mysql/debian-*.flag "
+"bestaat). Er is geen garantie dat de gegevensbestanden, bewerkt met die "
+"hogere versie, kunnen worden gebruikt met de versie van mysql die u nu "
+"installeert."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Belangrijke opmerking voor gebruikers van NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Het script staat op het punt de datamap /var/lib/mysql te verwijderen. Als "
+"het plan alleen maar is om een hogere MySQL versie te installeren, of als "
+"een ander mysql-serverpakket de datamap al gebruikt, dan zou de data moeten "
+"worden behouden."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Moet MySQL starten als de computer start?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan automatisch starten bij het starten van de computer, of slechts "
+"wanneer u '/etc/init.d/mysql start' handmatig uitvoert."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nieuw wachtwoord voor de MySQL \"root\"-gebruiker:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Het wordt sterk aangeraden een wachtwoord in te stellen voor de "
+"administratieve MySQL \"root\"-gebruiker."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nieuw wachtwoord voor de MySQL \"root\"-gebruiker:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Kan het wachtwoord voor de MySQL \"root\"-gebruiker niet instellen"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Er lijkt een fout te zijn opgetreden bij het instellen van het wachtwoord "
+"van de MySQL administratieve gebruiker. Dat kan komen doordat de gebruiker "
+"al een wachtwoord heeft, of omdat er een probleem was bij het communiceren "
+"met de MySQL server."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Moet u MySQL-verbindingen accepteren van computers die Debian \"sarge\" "
+#~ "of ouder draaien?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "De wijze waarop wachtwoorden werden bewaard was niet erg veilig. Dit is "
+#~ "verbeterd, maar helaas zullen programma's van computers die Debian 3.1 "
+#~ "Sarge draaien, geen verbinding meer kunnen maken met accounts die nieuw "
+#~ "zijn, of waarvan het wachtwoord is gewijzigd. Zie /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Om mysql te gebruiken dient u een gebruiker en groep aan te maken, "
+#~ "gelijkwaardig aan onderstaand voorbeeld, en u dient ervoor te zorgen dat /"
+#~ "var/lib/mysql de bijbehorende toegangsrechten heeft (uid en gid mogen "
+#~ "anders zijn)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Databases verwijderen die door alle MySQL versies worden gebruikt?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Indien u geen wachtwoord opgeeft zal het account niet worden gewijzigd."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Wanneer de installatie klaar is, dient u te verifiëren dat het account "
+#~ "netjes beschermd is met een wachtwoord (zie README.Debian voor meer "
+#~ "informatie)."
diff --git a/storage/xtradb/build/debian/po/pt.po b/storage/xtradb/build/debian/po/pt.po
new file mode 100644
index 00000000000..3372b11b06b
--- /dev/null
+++ b/storage/xtradb/build/debian/po/pt.po
@@ -0,0 +1,322 @@
+# Portuguese translation for mysql-dfsg-5.1's debconf messages
+# Copyright (C) 2006 Miguel Figueiredo <elmig@debianpt.org>
+# This file is distributed under the same license as the mysql-dfsg-5.1 package.
+# Miguel Figueiredo <elmig@debianpt.org>
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-05-05 21:01+0100\n"
+"Last-Translator: Miguel Figueiredo <elmig@debianpt.org>\n"
+"Language-Team: Portuguese <traduz@debianpt.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Deseja mesmo fazer downgrade?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Existe um ficheiro chamado /var/lib/mysql/debian-*.flag neste sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"Tal ficheiro significa que anteriormente foi instalado um pacote mysql-"
+"server com um número de versão superior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Não existe nenhuma garantia que a versão que está actualmente a instalar "
+"seja capaz de utilizar as bases de dados actuais."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Nota importante para utilizadores de NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para utilizar o MySQL, têm de ser acrescentadas as seguintes entradas para "
+"os utilizadores e grupos:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Deve também verificar as permissões e o dono do directório /var/lib/mysql :"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Remover todas as bases de dados MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"O directório /var/lib/mysql que contém as bases de dados MySQL está prestes "
+"a ser removido."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Se está a remover o pacote MySQL de modo a posteriormente instalar uma "
+"versão mais recente ou se um pacote mysql-server já os está a utilizar, "
+"os dados devem ser mantidos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Iniciar o servidor MySQL no arranque?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"O MySQL pode ser automaticamente lançado no arranque ou manualmente através "
+"do comando '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nova palavra-passe para o utilizador \"root\" do MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Embora não seja mandatório, é fortemente recomendado que defina uma palavra-"
+"passe para o utilizador administrativo \"root\" do MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+"Se esse campo for deixado em branco, a palavra-passe não irá ser alterada."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nova palavra-passe para o utilizador \"root\" do MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+"Não foi possível definir a palavra-passe para o utilizador \"root\" do MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Ocorreu um erro enquanto era definida a palavra-passe para o utilizador "
+"administrativo do MySQL. Isto pode ter acontecido porque a conta já tem uma "
+"palavra-passe, ou porque ocorreu um problema de comunicação com o servidor "
+"MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr ""
+"Você deve verificar a palavra-passe da conta após a instalação do pacote."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Para mais informação por favor leia o ficheiro /usr/share/doc/mysql-server-"
+"5.1/README.Debian."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Não é possível actualizar se estiverem presentes tabelas ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "As versões recentes de MySQL já não podem utilizar o antigo formato de "
+#~ "tabelas ISAM e é por isso necessário converter as suas tabelas para e.g. "
+#~ "MyISAM antes da actualização, utilizando \"mysql_convert_table_format\" "
+#~ "ou \"ALTER TABLE x ENGINE=MyISAM\". A instalação de mysql-server-5.1 irá "
+#~ "agora ser cancelada. Se o seu antigo mysql-server-4.1 for removido apenas "
+#~ "reinstale para converter essas tabelas."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportar ligações MySQL de máquinas que corram Debian \"sarge\" ou mais "
+#~ "antigos?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Nas versões antigas de clientes de MySQL em Debian, as palavras-passe não "
+#~ "eram guardadas de forma segura. Isto foi melhorado desde aí, no entanto "
+#~ "os clientes (como o PHP) de máquinas que corram Debian 3.1 Sarge não irão "
+#~ "conseguir ligar-se a contas novas ou cuja palavra-passe foi alterada."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar mysql tem de instalar um utilizador e grupo equivalentes ao "
+#~ "seguinte e assegurar-se de que /var/lib/mysql tem as permissões correctas "
+#~ "(o uid/gid podem ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Remover as bases de dados utilizadas por todas as versões de MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Se não disponibilizar uma password não serão feitas alterações nesta "
+#~ "conta."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Quando terminar a instalação, deve verificar se a conta está devidamente "
+#~ "protegida com uma password (para mais informações veja README.Debian)."
diff --git a/storage/xtradb/build/debian/po/pt_BR.po b/storage/xtradb/build/debian/po/pt_BR.po
new file mode 100644
index 00000000000..e04a5a2393d
--- /dev/null
+++ b/storage/xtradb/build/debian/po/pt_BR.po
@@ -0,0 +1,458 @@
+# Brazilian Portuguese (pt_BR) debconf template translation for
+# Debian's mysql-dfsg source package.
+# Debian-BR Project <debian-l10n-portuguese@lists.debian.org>
+# André Luís Lopes, <andrelop@debian.org> , 2004
+# André Luís Lopes, <andrelop@debian.org> , 2006
+# André Luís Lopes, <andrelop@debian.org> , 2007
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-04-21 15:59-0300\n"
+"Last-Translator: André Luís Lopes <andrelop@debian.org>\n"
+"Language-Team: Debian-BR Project <debian-l10n-portuguese@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"pt_BR utf-8\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr "Realmente proceder com o rebaixamento de versão?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr "Um arquivo de nome /var/lib/mysql/debian-*.flag existe no sistema."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"A presença de um arquivo como este é uma indicação de que um pacote mysql-"
+"server com um número de versão mais alto já foi instalado anteriormente."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+"Não há garantias de que a versão que você está instalando no momento "
+"conseguirá utilizar as bases de dados existentes."
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr "Aviso importante para usuários NIS/YP"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+"Para utilizar o MySQL, as seguintes entradas para usuários e grupos devem "
+"ser adicionadas ao sistema:"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+"Você deverá também checar as permissões e o dono do diretório /var/lib/mysql:"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid "Remove all MySQL databases?"
+msgid "Remove all Percona SQL databases?"
+msgstr "Remover todas as bases de dados do MySQL?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The /var/lib/mysql directory which contains the MySQL databases is about "
+#| "to be removed."
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+"O diretório /var/lib/mysql, o qual contém as bases de dados do MySQL, está "
+"prestes a ser removido."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "If you're removing the MySQL package in order to later install a more "
+#| "recent version or if a different mysql-server package is already using "
+#| "it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Caso você esteja removendo o pacote MySQL para posteriormente instalar uma "
+"versão mais recente ou, caso uma versão diferente do pacote mysql-server "
+"esteja sendo utilizada, os dados deverão ser mantidos."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Start the MySQL server on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Iniciar o servidor MySQL junto à inicialização da máquina?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL server can be launched automatically at boot time or manually "
+#| "with the '/etc/init.d/mysql start' command."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"O servidor MySQL pode ser iniciado automaticamente junto à inicialização da "
+"máquina ou manualmente com o comando '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nova senha para o usuário \"root\" do MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "While not mandatory, it is highly recommended that you set a password for "
+#| "the MySQL administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Apesar de não ser mandatório, é altamente recomendado que você defina uma "
+"senha para o usuário administrativo \"root\" do MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr "Caso este campo seja deixado em branco, a senha não será mudada."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for the MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nova senha para o usuário \"root\" do MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for the MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Impossível definir senha para o usuário \"root\" do MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "An error occurred while setting the password for the MySQL administrative "
+#| "user. This may have happened because the account already has a password, "
+#| "or because of a communication problem with the MySQL server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Um erro ocorreu durante a definição da senha para o usuário administrativo "
+"do MySQL. Isso pode ter acontecido devido a esse usuário já possuir uma "
+"senha definida ou devido à ocorrência de um problema de comunicação com o "
+"servidor MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "You should check the account's password after tha package installation."
+msgid "You should check the account's password after the package installation."
+msgstr "Você deverá checar a senha dessa conta após a instalação deste pacote."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for "
+#| "more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+"Por favor, leia o arquivo /usr/share/doc/mysql-server-5.1/README.Debian para "
+"maiores informações."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportar conexões MySQL originadas de hosts executando o Debian \"sarge\" "
+#~ "ou mais antigos ?"
+
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Em versões antigas dos clientes MySQL no Debian, as senhas não eram "
+#~ "armazenadas de forma segura. Isto foi corrigido desde então, porém, "
+#~ "clientes (como o PHP) em hosts executando o Debian 3.1 Sarge não serão "
+#~ "capazes de conectar em contas recentes ou contas as quais as senhas "
+#~ "tenham sido modificadas."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Para utilizar o MySQL, você deve instalar um usuário e um grupo "
+#~ "equivalentes ao usuário e grupo a seguir para se certificar de que o "
+#~ "diretório /var/lib/mysql possua as permissões correctas (o uid/gid podem "
+#~ "ser diferentes)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Remover as bases de dados utilizadas por todas as versões do MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Caso você não forneça uma senha, nenhuma mudança será feita na conta."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Quando a instalação finalizar, você deverá verificar se a conta está "
+#~ "apropriadamente protegida com uma senha (consulte o arquivo README.Debian "
+#~ "para maiores informações)."
+
+#~ msgid "internal"
+#~ msgstr "interno"
+
+#~ msgid "Only internally used."
+#~ msgstr "Somente utilizado internamente."
+
+#, fuzzy
+#~ msgid "Update Hints"
+#~ msgstr "Dicas de atualização"
+
+#, fuzzy
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Raramente, por exemplo, em novas versões maiores, o sistema de "
+#~ "privilégios é melhorado. Para fazer uso disso, o script "
+#~ "mysql_fix_privilege_tables deve ser executado manualmente. O script não "
+#~ "atribuirá a nenhum usuário mais direitos do que os mesmos já possuíam "
+#~ "anteriormente."
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Por favor, leia http://www.mysql.com/doc/en/Upgrade.html"
+
+#, fuzzy
+#~ msgid "Install Hints"
+#~ msgstr "Dicas de instalação"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "O MySQL será instalado somente caso você possua um nome de host NÃO "
+#~ "NUMÉRICO que possa ser resolvido através do arquivo /etc/hosts, ou seja, "
+#~ "caso o comando \"hostname\" retorne \"myhostname\", uma linha como "
+#~ "\"10.0.0.1 myhostname\" deverá existir no arquivo /etc/hosts."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Um novo usuário MySQL de nome \"debian-sys-maint\" será criado. Essa "
+#~ "conta MySQL é utilizada pelos scripts de inicialização/parada e pelos "
+#~ "scripts cron. Não remova esse usuário."
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Por favor, lembre-se de definir uma SENHA para o usuário root do MySQL ! "
+#~ "Caso você utilize um arquivo /root/.my.cnf, sempre inclua as linhas \"user"
+#~ "\" e \"password\" nesse arquivo, nunca somente a senha ! Consulte o "
+#~ "arquivo /usr/share/doc/mysql-server/README.Debian para mais informações."
+
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "Todas as bases de dados sob o diretório /var/lib/mysql devem ser removidas "
+#~ "quando você remover o pacote mysql-server ?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "O suporte ao funcionamento em rede está desativado por padrão por "
+#~ "questões de segurança. Você poderá ativá-lo comentando a opção 'skip-"
+#~ "networking' no arquivo /etc/mysql/my.cnf."
+
+#~ msgid "security and update notice"
+#~ msgstr "aviso de segurança e actualização"
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "Devo remover tudo abaixo de /var/lib/mysql quando fizer o purge do pacote "
+#~ "mysql-server com o comando \"dpkg --purge mysql-server\" (ou seja, "
+#~ "remover tudo incluindo a configuração)? (o padrão é não remover)"
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "Fazer com que o MySQL seja acessível via rede?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "O MySQL deve aguardar ligações numa porta TCP acessível via rede? Isto "
+#~ "não é necessário para uso num único computador e pode ser um problema de "
+#~ "segurança."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "Activar o modo chroot?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "O MySQL é capaz de se prender no diretório /var/lib/mysql_jail, assim os "
+#~ "utilizadores não poderão modificar ficheiros fora deste directório. Isto "
+#~ "aumenta também a resistência contra crackers, pois eles não poderão "
+#~ "modificar arquivos de sistema."
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Por favor execute mysql_fix_privilege_tables !"
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "Permissões seguras para o diretório /var/lib/mysql serão asseguradas "
+#~ "substituindo GIDs diferentes de root e mysql por mysql."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "Instruções sobre como activar o suporte de SSL estão disponíveis no "
+#~ "directório /usr/share/doc/mysql-server/."
+
+#, fuzzy
+#~ msgid "mysql_fix_privileges_tables should be executed"
+#~ msgstr "mysql_fix_privileges_tables será executado"
+
+#, fuzzy
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in  the \"mysql\" database. This will not happen automatically."
+#~ msgstr ""
+#~ "As últimas versões do MySQL possuem um sistema de privilégios melhorado e "
+#~ "mais refinado. Para utilizá-lo, alguns novos campos devem ser adicionados "
+#~ "às tabelas na base de dados \"mysql\". Isto é feito pelo script "
+#~ "mysql_fix_privileges_tables durante esta actualização independente do "
+#~ "servidor estar a correr ou não !"
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Este script não deverá fornecer mais direitos a um utilizador além dos "
+#~ "quais ele já possua anteriormente. Se encontrar um caso desses, por favor "
+#~ "entre em contacto com o mantenedor deste pacote Debian."
diff --git a/storage/xtradb/build/debian/po/ro.po b/storage/xtradb/build/debian/po/ro.po
new file mode 100644
index 00000000000..37902bfd913
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ro.po
@@ -0,0 +1,319 @@
+# Romanian translation of mysql-dfsg.
+# Copyright (C) 2006 THE mysql-dfsg'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the mysql-dfsg package.
+#
+# Stan Ioan-Eugen <stan.ieugen@gmail.com>, 2006.
+msgid ""
+msgstr ""
+"Project-Id-Version: po-debconf://mysql-dfsg\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2006-12-20 21:27+0200\n"
+"Last-Translator: stan ioan-eugen <stan.ieugen@gmail.com>\n"
+"Language-Team: romanian <debian-l10n-romanian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.11.4\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Sunteţi sigur că doriţi să instalaţi o versiune mai veche?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"AVERTISMENT: Fişierul /var/lib/mysql/debian-*.flag există. Acest lucru "
+"indică faptul că anterior a fost instalată o versiune nouă a pachetului "
+"mysql-server. Nu se poate garanta că versiunea instalată acum poate folosi "
+"datele versiunii instalate anterior."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Notă importantă pentru utilizatorii NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Scriptul urmează să şteargă directorul de date /var/lib/mysql. Dacă plănuiţi "
+"doar să instalaţi o versiune nouă MySQL sau datele sunt folosite de către un "
+"alt pachet mysql-server, atunci ar trebui păstraţi datele."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Doriţi ca MySQL să pornească la iniţializarea sistemului?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL poate porni automat la iniţializarea sistemului sau doar dacă rulaţi "
+"comanda „/etc/init.d/mysql start”."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Noua parolă pentru utilizatorul „root” al MySQL:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Este recomandat să stabiliţi o parolă pentru utilizatorul administrativ "
+"„root” al MySQL."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Noua parolă pentru utilizatorul „root” al MySQL:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Nu s-a putut stabili parola pentru utilizatorul „root” al MySQL"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Se pare că a intervenit o eroare în stabilirea parolei pentru utilizatorul "
+"administrativ al MySQL. Acest lucru se poate întâmpla dacă utilizatorul are "
+"deja o parolă, sau a existat o problemă în comunicarea cu serverul MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Nu se poate face actualizarea dacă sunt prezente tabele ISAM!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Versiunile recente MySQL nu mai pot folosi vechiul format de tabele ISAM "
+#~ "şi este necesar să convertiţi tabelele dumneavoastră de ex. la formatul "
+#~ "MyISAM înainte de a face actualizarea folosind comanda "
+#~ "„mysql_convert_table_format” sau „ALTER TABLE x ENGINE=MyISAM”. "
+#~ "Instalarea mysql-server-5.1 va eşua. În caz că ştergeţi versiunea "
+#~ "anterioară mysql-server-4.1 va trebui reinstalată pentru a converti "
+#~ "tabelele."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Suportaţi conexiuni MySQL de la staţii ce rulează sistemul Debian „sarge” "
+#~ "sau mai vechi?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Modul în care erau păstrate parolele nu era foarte sigur. Acest lucru a "
+#~ "fost îmbunătăţit cu dezavantajul că clienţii (de ex. PHP) de pe staţii ce "
+#~ "rulează sistemul Debian 3.1 Sarge nu se vor putea conecta la conturi noi "
+#~ "sau ale căror parole au fost schimbate. Citiţi /usr/share/doc/mysql-"
+#~ "server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Pentru a folosi mysql trebuie să adăugaţi un utilizator şi grup "
+#~ "echivalent şi să vă asiguraţi că /var/lib/mysql are permisiunile "
+#~ "stabilite corect (uid/gid pot avea valori diferite)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:\tmysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:\tmysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:\tdrwxr-xr-x\tmysql\tmysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Doriţi să ştergeţi bazele de date folosite de toate versiunile MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Dacă nu introduceţi nici o parolă, nici o schimbare nu va fi luată în "
+#~ "considerare."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "După finalizarea instalării, ar trebui să verificaţi dacă contul este "
+#~ "protejat cu o parolă (citiţi fişierul README.Debian pentru informaţii "
+#~ "suplimentare)."
diff --git a/storage/xtradb/build/debian/po/ru.po b/storage/xtradb/build/debian/po/ru.po
new file mode 100644
index 00000000000..14867533070
--- /dev/null
+++ b/storage/xtradb/build/debian/po/ru.po
@@ -0,0 +1,305 @@
+# translation of mysql-dfsg-5.1_5.0.32-6_ru.po to Russian
+# Russian messages:
+#    Translators, if you are not familiar with the PO format, gettext
+#    documentation is worth reading, especially sections dedicated to
+#    this format, e.g. by running:
+#         info -n '(gettext)PO Files'
+#         info -n '(gettext)Header Entry'
+#    Some information specific to po-debconf are available at
+#            /usr/share/doc/po-debconf/README-trans
+#         or http://www.debian.org/intl/l10n/po-debconf/README-trans
+#    Developers do not need to manually edit POT or PO files.
+# Ilgiz Kalmetev <translator@ilgiz.pp.ru>, 2003.
+# Yuriy Talakan' <yt@amur.elektra.ru>, 2005, 2006.
+# Yuriy Talakan' <yt@drsk.ru>, 2007.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1_5.0.32-6_ru\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-19 11:28+0900\n"
+"Last-Translator: Yuriy Talakan' <yt@drsk.ru>\n"
+"Language-Team: Russian <debian-l10n-russian@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Generator: KBabel 1.9.1\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Вы действительно желаете понизить версию?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"ВНИМАНИЕ: Найден файл /var/lib/mysql/debian-*.flag. Это означает, что ранее "
+"был установлен пакет mysql-server более высокой версии. Невозможно "
+"гарантировать, что текущая версия сможет использовать его данные."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Важное замечание для пользователей NIS/YP!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Сценарий собирается удалить директорию данных /var/lib/mysql. Если "
+"планируется установить новую версию MySQL или есть другие пакеты mysql-"
+"server, использующие эту директорию, то данные надо сохранить."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Запускать MySQL при загрузке системы?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL может запускаться при загрузке системы, либо только если вы вручную "
+"введете команду '/etc/init.d/mysql start'. "
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Новый пароль для MySQL пользователя \"root\":"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Крайне рекомендуется установить пароль для административного MySQL "
+"пользователя \"root\"."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Новый пароль для MySQL пользователя \"root\":"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Невозможно задать пароль MySQL пользователю \"root\""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"В процессе задания пароля административного MySQL пользователя произошла "
+"ошибка.  Это могло произойти если у пользователя уже был задан пароль, или "
+"из-за проблем соединения с сервером MySQL."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Нужна поддержка подключений к MySQL с машин, работающих под Debian \"sarge"
+#~ "\" или старше?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Метод хранения пароля был не очень безопасен. Это было сделано из-за "
+#~ "того, клиенты (например, PHP) с машин, работающих под Debian 3.1 Sarge не "
+#~ "смогут подключиться к учетной записи если она новая, или пароль был "
+#~ "изменен. См. /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Чтобы использовать mysql, Вы должны установить эквивалентные пользователя "
+#~ "и группу, как указано ниже и убедиться, что /var/lib/mysql имеет "
+#~ "правильные права (uid/gid могут отличаться)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Удалить базы данных, используемые всеми версиями MySQL?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr "Если вы не зададите пароль, то учетная запись не будет изменена."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "Когда установка завершится, вы должны убедиться, что учетная запись "
+#~ "защищена паролем (подробную информацию см. в README.Debian)."
diff --git a/storage/xtradb/build/debian/po/sv.po b/storage/xtradb/build/debian/po/sv.po
new file mode 100644
index 00000000000..ea9da131e58
--- /dev/null
+++ b/storage/xtradb/build/debian/po/sv.po
@@ -0,0 +1,400 @@
+# Translators, if you are not familiar with the PO format, gettext
+# documentation is worth reading, especially sections dedicated to
+# this format, e.g. by running:
+# info -n '(gettext)PO Files'
+# info -n '(gettext)Header Entry'
+# Some information specific to po-debconf are available at
+# /usr/share/doc/po-debconf/README-trans
+# or http://www.debian.org/intl/l10n/po-debconf/README-trans
+# Developers do not need to manually edit POT or PO files.
+# , fuzzy
+#
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-5.1 5.0.21-3\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2007-02-18 14:48+0100\n"
+"Last-Translator: Andreas Henriksson <andreas@fatal.se>\n"
+"Language-Team: Swedish <tp-sv@listor.tp-sv.se>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=iso-8859-1\n"
+"Content-Transfer-Encoding: 8bit\n"
+"X-Poedit-Language: Swedish\n"
+"X-Poedit-Country: SWEDEN\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid "Do you really want to downgrade?"
+msgid "Really proceed with downgrade?"
+msgstr "Vill du verkligen nedgradera?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+#, fuzzy
+#| msgid ""
+#| "WARNING: The file /var/lib/mysql/debian-*.flag exists. This indicates "
+#| "that a mysql-server package with a higher version has been installed "
+#| "before. It can not be guaranteed that this version can use its data."
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+"VARNING: Filen /var/lib/mysql/debian-*.flag existerar. Detta betyder att "
+"paketet mysql-server med h�gre versionsnummer har installerats tidigare. Det "
+"kan inte garanteras att denna version kan anv�nda dess data."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "Viktig notering f�r NIS/YP-anv�ndare!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+#, fuzzy
+#| msgid ""
+#| "The script is about to remove the data directory /var/lib/mysql. If it is "
+#| "planned to just install a higher MySQL version or if a different mysql-"
+#| "server package is already using it, the data should be kept."
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+"Scriptet kommer strax ta bort data-katalogen /var/lib/mysql. Om det "
+"planerade var att bara installera en h�gre MySQL-version eller om ett annan "
+"mysql-server paket redan anv�nde det, skall datan sparas."
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "Ska MySQL startas n�r systemet startar upp?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid ""
+#| "The MySQL can start automatically on boot time or only if you manually "
+#| "type '/etc/init.d/mysql start'."
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL kan startas n�r systemet startas upp eller endast om du manuellt "
+"skriver '/etc/init.d/mysql start'."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr "Nytt l�senord f�r MySQLs \"root\"-anv�ndare:"
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+#, fuzzy
+#| msgid ""
+#| "It is highly recommended that you set a password for the MySQL "
+#| "administrative \"root\" user."
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+"Det �r starkt rekommenderat att du s�tter ett l�senord f�r MySQLs "
+"administrativa \"root\"-anv�ndare."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+#, fuzzy
+#| msgid "New password for MySQL \"root\" user:"
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr "Nytt l�senord f�r MySQLs \"root\"-anv�ndare:"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid "Unable to set password for MySQL \"root\" user"
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr "Lyckades inte s�tta l�senord f�r MySQLs \"root\"-anv�ndare"
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "It seems an error occurred while setting the password for the MySQL "
+#| "administrative user.  This may have happened because the user already has "
+#| "a password, or because there was a problem communicating with the MySQL "
+#| "server."
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+"Det verkar som ett fel uppstod n�r det skulle s�ttas ett l�senord f�r MySQLs "
+"administrativa anv�ndare. Detta kan ha skett f�r att anv�ndaren redan har "
+"ett l�senord satt, eller p� grund av problem med att kommunicera med MySQL-"
+"servern."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+#, fuzzy
+#| msgid ""
+#| "See /usr/share/doc/mysql-server-5.1/README.Debian for more information."
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr "Se /usr/share/doc/mysql-server-5.1/README.Debian f�r mer information."
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid "Cannot upgrade if ISAM tables are present!"
+#~ msgstr "Kan inte uppgradera om ISAM-tabeller finns!"
+
+#~ msgid ""
+#~ "Recent versions of MySQL can no longer use the old ISAM table format and "
+#~ "it is necessary to convert your tables to e.g. MyISAM before upgrading by "
+#~ "using \"mysql_convert_table_format\" or \"ALTER TABLE x ENGINE=MyISAM\". "
+#~ "The installation of mysql-server-5.1 will now abort. In case your old "
+#~ "mysql-server-4.1 gets removed nevertheless just reinstall it to convert "
+#~ "those tables."
+#~ msgstr ""
+#~ "Senaste versionerna av MySQL kan inte längre använda gamla ISAM-"
+#~ "tabellformatet och det är nödvändigt att konvertera dina tabeller till "
+#~ "exempelvis MyISAM före uppgradering med \"mysql_convert_table_format\" "
+#~ "eller \"ALTER TABLE x ENGINE=MyISAM\". Installationen av mysql-server-5.1 "
+#~ "kommer nu att avbrytas. Om ditt gamla mysql-server-4.1-paket tas bort är "
+#~ "det bara att installera om det för att konvertera de tabellerna."
+
+#~ msgid ""
+#~ "Support MySQL connections from hosts running Debian \"sarge\" or older?"
+#~ msgstr ""
+#~ "Behöver du MySQL-anslutningar från system som kör Debian \"Sarge\" eller "
+#~ "äldre?"
+
+#, fuzzy
+#~| msgid ""
+#~| "The way passwords were stored was not very secure. This has been "
+#~| "improved with the drawback that clients (e.g. PHP) from hosts running "
+#~| "Debian 3.1 Sarge will not be able to connect to account which are new or "
+#~| "whose password have been changed. See /usr/share/doc/mysql-server-5.1/"
+#~| "README.Debian."
+#~ msgid ""
+#~ "In old versions of MySQL clients on Debian, passwords were not stored "
+#~ "securely. This has been improved since then, however clients (such as "
+#~ "PHP) from hosts running Debian 3.1 Sarge will not be able to connect to "
+#~ "recent accounts or accounts whose password have been changed."
+#~ msgstr ""
+#~ "Sättet som lösenorden lagrades på var inte särskilt säkert. Detta har "
+#~ "förbättrats på bekostnad av att klienter (t.ex. PHP) från system som kör "
+#~ "Debian 3.1 Sarge inte kan ansluta till konton som är nya eller vars "
+#~ "lösenord har ändrats. Se /usr/share/doc/mysql-server-5.1/README.Debian."
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "För att använda MySQL måste du installera en motsvarande användare och "
+#~ "grupp till följande och se till att /var/lib/mysql har korrekta "
+#~ "rättigheter satta (uid/gid kan vara olika)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#~ msgid "Remove the databases used by all MySQL versions?"
+#~ msgstr "Ta bort databaserna som används av alla MySQL-versioner?"
+
+#~ msgid ""
+#~ "If you do not provide a password no changes will be made to the account."
+#~ msgstr ""
+#~ "Om du inte anger ett lösenord kommer inga ändringar att göras för kontot."
+
+#~ msgid ""
+#~ "When installation finishes, you should verify that the account is "
+#~ "properly protected with a password (see README.Debian for more "
+#~ "information)."
+#~ msgstr ""
+#~ "När installationen är klar, bör du kontrollera så att kontot är riktigt "
+#~ "skyddat av ett lösenord (läs README.Debian för mer information)."
+
+#~ msgid "Update Hints"
+#~ msgstr "Uppdateringstips"
+
+#~ msgid ""
+#~ "You have to run \"mysql_upgrade\" after the upgrade, else tables can be  "
+#~ "corrupted! This script also enhances the privilege tables but is not  "
+#~ "supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Du måste köra \"mysql_upgrade\" efter uppgraderingen, annars kan "
+#~ "tabellerna vara skadade! Detta skript utökar även privilegietabellerna "
+#~ "men är inte tänkte att ge någon användare mer befogenhet än vad han hade "
+#~ "tidigare,"
+
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Läs även http://www.mysql.com/doc/en/Upgrade.html"
+
+#~ msgid "Install Hints"
+#~ msgstr "Installationstips"
+
+#~ msgid ""
+#~ "On upgrades from MySQL 3.23, as shipped with Debian Woody, symlinks in "
+#~ "place of /var/lib/mysql or /var/log/mysql gets accidently removed and "
+#~ "have manually be restored."
+#~ msgstr ""
+#~ "Vid uppgraderingar från MySQL 3.23 som skickades med Debian Woody har "
+#~ "symboliska länkar i /var/lib/mysql eller /var/log/mysql av misstag tagits "
+#~ "bort och måste manuellt återskapas."
+
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL kan endast installeras om du har ett icke-numeriskt värdnamn som "
+#~ "kan slås upp via filen /etc/hosts. Exempelvis om kommandot \"hostname\" "
+#~ "returnerar \"mittnamn\" så bör det finnas en rad som liknar \"10.0.0.1 "
+#~ "mittnamn\"."
+
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "En ny MySQL-användare kallad \"debian-sys-maint\" kommer att skapas. "
+#~ "Detta MySQL-konto används för start/stopp och cron-skript. Ta inte bort "
+#~ "det."
+
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Tänk på att sätta ett LÖSENORD för MySQL:s root-användare! Om du "
+#~ "använder /root/.my.cnf, skriv då alltid en \"user\"-rad och en \"password"
+#~ "\"-rad i den, aldrig med endast lösenordet!"
+
+#~ msgid ""
+#~ "Should I remove the complete /var/lib/mysql directory tree which is used "
+#~ "by all MySQL versions, not necessarily only the one you are about to "
+#~ "purge?"
+#~ msgstr ""
+#~ "Ska jag ta bort hela katalogträdet i /var/lib/mysql som används av alla "
+#~ "MySQL-versioner och inte bara för den som du nu kommer att rensa ut?"
+
+#~ msgid ""
+#~ "Rarely, e.g. on new major versions, the privilege system is improved. To "
+#~ "make use of it mysql_fix_privilege_tables must be executed manually. The "
+#~ "script is not supposed to give any user more rights that he had before,"
+#~ msgstr ""
+#~ "Sällan, exempelvis i nya större versioner, har behörighetssystemet "
+#~ "förbättrats. För att använda det måste skriptet "
+#~ "mysql_fix_privilege_tables köras manuellt. Skriptet är inte tänkt att ge "
+#~ "någon användare högre behörighet än han hade tidigare."
diff --git a/storage/xtradb/build/debian/po/templates.pot b/storage/xtradb/build/debian/po/templates.pot
new file mode 100644
index 00000000000..bbddfe37f83
--- /dev/null
+++ b/storage/xtradb/build/debian/po/templates.pot
@@ -0,0 +1,187 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid "Important note for NIS/YP users"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+msgid "Start the Percona SQL server on boot?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
diff --git a/storage/xtradb/build/debian/po/tr.po b/storage/xtradb/build/debian/po/tr.po
new file mode 100644
index 00000000000..e19ddeedcfa
--- /dev/null
+++ b/storage/xtradb/build/debian/po/tr.po
@@ -0,0 +1,342 @@
+# Turkish translation of mysql-server.
+# This file is distributed under the same license as the mysql-server package.
+# Gürkan Aslan <gurkan@iaslan.com>, 2004
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: mysql-dfsg-4.1\n"
+"Report-Msgid-Bugs-To: percona-xtradb-dfsg-5.1@packages.debian.org\n"
+"POT-Creation-Date: 2010-02-15 17:10-0500\n"
+"PO-Revision-Date: 2004-06-05 08:53+0300\n"
+"Last-Translator: Gürkan Aslan <gurkan@iaslan.com>\n"
+"Language-Team: Turkish <debian-l10n-turkish@lists.debian.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+"Plural-Forms:  nplurals=1; plural=0;\n"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "Really proceed with downgrade?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid "A file named /var/lib/mysql/debian-*.flag exists on this system."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"Such file is an indication that a mysql-server package with a higher version "
+"has been installed earlier."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:2001
+msgid ""
+"There is no guarantee that the version you're currently installing will be "
+"able to use the current databases."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+#, fuzzy
+#| msgid "Important note for NIS/YP users!"
+msgid "Important note for NIS/YP users"
+msgstr "NIS/YP kullanıcıları için önemli not!"
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"To use MySQL, the following entries for users and groups should be added to "
+"the system:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../percona-xtradb-server-5.1.templates:3001
+msgid ""
+"You should also check the permissions and the owner of the /var/lib/mysql "
+"directory:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid "Remove all Percona SQL databases?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"The /var/lib/mysql directory which contains the Percona SQL databases is "
+"about to be removed."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:4001
+msgid ""
+"If you're removing the Percona SQL package in order to later install a more "
+"recent version or if a different mysql-server package is already using it, "
+"the data should be kept."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+#| msgid "Should MySQL start on boot?"
+msgid "Start the Percona SQL server on boot?"
+msgstr "MySQL açılış sırasında başlatılsın mı?"
+
+#. Type: boolean
+#. Description
+#: ../percona-xtradb-server-5.1.templates:5001
+#, fuzzy
+msgid ""
+"The Percona SQL server can be launched automatically at boot time or "
+"manually with the '/etc/init.d/mysql start' command."
+msgstr ""
+"MySQL açılış sırasında veya '/etc/init.d/mysql start' komutunu vermeniz "
+"halinde elle başlatılabilir. Eğer açılışta otomatik olarak başlatılmasını "
+"istiyorsanız burada 'evet'i seçin."
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "New password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid ""
+"While not mandatory, it is highly recommended that you set a password for "
+"the Percona SQL administrative \"root\" user."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:6001
+msgid "If that field is left blank, the password will not be changed."
+msgstr ""
+
+#. Type: password
+#. Description
+#: ../percona-xtradb-server-5.1.templates:7001
+msgid "Repeat password for the Percona SQL \"root\" user:"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "Unable to set password for the Percona SQL \"root\" user"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"An error occurred while setting the password for the Percona SQL "
+"administrative user. This may have happened because the account already has "
+"a password, or because of a communication problem with the Percona SQL "
+"server."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid "You should check the account's password after the package installation."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:8001
+msgid ""
+"Please read the /usr/share/doc/mysql-server-5.1/README.Debian file for more "
+"information."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "Password input error"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:9001
+msgid "The two passwords you entered were not the same. Please try again."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid "NDB Cluster seems to be in use"
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../percona-xtradb-server-5.1.templates:10001
+msgid ""
+"Percona-SQL-5.1 has orphaned NDB Cluster support. Please migrate to the new "
+"mysql-cluster package and remove all lines starting with \"ndb\" from all "
+"config files below /etc/mysql/."
+msgstr ""
+
+#~ msgid ""
+#~ "To use mysql you must install an equivalent user and group to the "
+#~ "following and ensure yourself that /var/lib/mysql has the right "
+#~ "permissions (the uid/gid may be different)."
+#~ msgstr ""
+#~ "Mysql'i kullanmak için aşağıdakiyle eşdeğer bir kullanıcı ve grup "
+#~ "tanımlamalı, ve /var/lib/mysql izinlerinin uygun şekilde ayarlandığından "
+#~ "emin olmalısınız (uid/gid farklı olabilir)."
+
+#~ msgid ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+#~ msgstr ""
+#~ "/etc/passwd:      mysql:x:100:101:MySQL Server:/var/lib/mysql:/bin/false"
+
+#~ msgid "/etc/group:       mysql:x:101:"
+#~ msgstr "/etc/group:       mysql:x:101:"
+
+#~ msgid "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+#~ msgstr "/var/lib/mysql:   drwxr-xr-x   mysql    mysql"
+
+#, fuzzy
+#~ msgid "Please also read http://www.mysql.com/doc/en/Upgrade.html"
+#~ msgstr "Lütfen http://www.mysql.com/doc/en/Upgrade.html belgesini okuyun"
+
+#, fuzzy
+#~ msgid ""
+#~ "MySQL will only install if you have a non-numeric hostname that is "
+#~ "resolvable via the /etc/hosts file. E.g. if the \"hostname\" command "
+#~ "returns \"myhostname\" then there must be a line like \"10.0.0.1 "
+#~ "myhostname\"."
+#~ msgstr ""
+#~ "MySQL sadece /etc/hosts dosyası yoluyla çözülebilir NUMERİK OLMAYAN bir "
+#~ "makine adına sahipseniz kurulacaktır. Örneğin, eğer \"hostname\" komutu "
+#~ "\"makinem\" ismini döndürüyorsa, bu dosya içinde \"10.0.0.1 makinem\" "
+#~ "gibi bir satır olmalıdır."
+
+#, fuzzy
+#~ msgid ""
+#~ "A new mysql user \"debian-sys-maint\" will be created. This mysql account "
+#~ "is used in the start/stop and cron scripts. Don't delete."
+#~ msgstr ""
+#~ "Yeni mysql kullanıcısı \"debian-sys-maint\" yaratılacak. Bu hesap, "
+#~ "başlangıç betiklerinde ve cron içinde kullanılıyor. Bu hesabı silmeyin."
+
+#, fuzzy
+#~ msgid ""
+#~ "Please remember to set a PASSWORD for the MySQL root user! If you use a /"
+#~ "root/.my.cnf, always write the \"user\" and the \"password\" lines in "
+#~ "there, never only the password!"
+#~ msgstr ""
+#~ "Lütfen MySQL root kullanıcısı için bir PAROLA girmeyi unutmayın! Eğer /"
+#~ "root/.my.cnf kullanıyorsanız, \"user\" ve \"password\" satırlarını her "
+#~ "zaman buraya ekleyin, sadece parolayı değil! Daha fazla bilgi için /usr/"
+#~ "share/doc/mysql-server/README.Debian dosyasını okuyun."
+
+#, fuzzy
+#~ msgid ""
+#~ "Should I remove all databases below /var/lib/mysql as you are purging the "
+#~ "mysql-server package?"
+#~ msgstr ""
+#~ "mysql-server paketi kaldırıldıktan sonra bütün veritabanları silinsin mi?"
+
+#~ msgid ""
+#~ "Networking is disabled by default for security reasons. You can enable it "
+#~ "by commenting out the skip-networking option in /etc/mysql/my.cnf."
+#~ msgstr ""
+#~ "Ağ, öntanımlı olarak güvenlik gerekçeleriyle devre dışı bırakıldı. Bu "
+#~ "özelliği /etc/mysql/my.cnf dosyası içindeki \"skip-networking\" "
+#~ "seçeneğini kaldırarak etkinleştirebilirsiniz."
+
+#~ msgid "security and update notice"
+#~ msgstr "güvenlik ve güncelleme duyurusu"
+
+#~ msgid ""
+#~ "Should I remove everything below /var/lib/mysql when you purge the mysql-"
+#~ "server package with the \"dpkg --purge mysql-server\" command (i.e. "
+#~ "remove everything including the configuration) somewhen? (default is not)"
+#~ msgstr ""
+#~ "mysql-server paketini temizlemek için \"dpkg --purge mysql-server\" "
+#~ "komutunu kullandığınızda (yani yapılandırma dahil herşeyi silmek) /var/"
+#~ "lib/mysql altındaki herşeyi sileyim mi? (öntanımlı cevap hayır'dır)."
+
+#~ msgid "Please run mysql_fix_privilege_tables !"
+#~ msgstr "Lütfen mysql_fix_privilege_tables komutunu çalıştırın!"
+
+#~ msgid ""
+#~ "I will ensure secure permissions of /var/lib/mysql by replacing GIDs "
+#~ "other than root and mysql with mysql."
+#~ msgstr ""
+#~ "/var/lib/mysql'in izinlerinin güvenli olmasını sağlamak amacıyla, buna "
+#~ "ait GID'leri root ve mysql'den farklı olacak şekilde değiştireceğim."
+
+#~ msgid ""
+#~ "Instructions how to enable SSL support are in /usr/share/doc/mysql-server/"
+#~ msgstr ""
+#~ "SSL desteğini nasıl etkinleştirebileceğinize ilişkin talimatlar /usr/"
+#~ "share/doc/mysql-server/ içinde."
+
+#~ msgid "mysql_fix_privileges_tables will be executed"
+#~ msgstr "mysql_fix_privileges_tables çalıştırılacak"
+
+#~ msgid ""
+#~ "The latest MySQL versions have an enhanced, more fine grained, privilege "
+#~ "system. To make use of it, some new fields must be added to the tables "
+#~ "in  the \"mysql\" database. This is done by the "
+#~ "mysql_fix_privilege_tables script during this upgrade regardless of if "
+#~ "the server is currently running or not!"
+#~ msgstr ""
+#~ "En son MySQL sürümleri zenginleştirilmiş, daha ayrıntılandırılmış bir "
+#~ "ayrıcalık (privilege) sistemine sahiptir. Yeni sistemi kullanmak için, "
+#~ "\"mysql\" veritabanındaki tablolara bazı yeni alanlar eklenmelidir. Bu "
+#~ "işlem, sunucunun çalışıp çalışmamasına bağlı olmaksızın "
+#~ "mysql_fix_privilege_tables betiği tarafından bu yükseltme sırasında "
+#~ "yapılır."
+
+#~ msgid ""
+#~ "This script is not supposed to give any user more rights that he had "
+#~ "before, if you encounter such a case, please contact me."
+#~ msgstr ""
+#~ "Bu betiğin hiç bir kullanıcıya öncekinden daha fazla hak kazandırmadığı "
+#~ "varsayılıyor. Eğer bunun aksinde bir durumla karşılaşırsanız, lütfen "
+#~ "benimle bağlantıya geçin."
+
+#~ msgid "Make MySQL reachable via network?"
+#~ msgstr "MySQL network üzerinden ulaşılabilir olsun mu?"
+
+#~ msgid ""
+#~ "Should MySQL listen on a network reachable TCP port? This is not "
+#~ "necessary for use on a single computer and could be a security problem."
+#~ msgstr ""
+#~ "MySQL ağ üzerinde ulaşılabilen bir TCP portunu dinlesin mi? Tek olan bir "
+#~ "bilgisayar için bu ayar gerekli değildir ve bir güvenlik sorunu "
+#~ "oluşturabilir."
+
+#~ msgid "Enable chroot mode?"
+#~ msgstr "chroot kipi etkinleştirilsin mi?"
+
+#~ msgid ""
+#~ "MySQL is able to jail itself into the /var/lib/mysql_jail directory so "
+#~ "that users cannot modify any files outside this directory. This improves "
+#~ "resistence against crackers, too, as they are not able to modify system "
+#~ "files."
+#~ msgstr ""
+#~ "MySQL kendini /var/lib/mysql_jail dizinine hapsederek kullanıcıların bu "
+#~ "dizin dışındaki hiç bir dosyayı değiştirmemesini sağlayabilir. Bu "
+#~ "düzenleme, sistem dosyalarını değiştirmelerini engelleyeceğinden, "
+#~ "cracker'lara karşı dayanıklılığı arttırır."
diff --git a/storage/xtradb/build/debian/rules b/storage/xtradb/build/debian/rules
new file mode 100755
index 00000000000..f166009da2c
--- /dev/null
+++ b/storage/xtradb/build/debian/rules
@@ -0,0 +1,322 @@
+#!/usr/bin/make -f
+
+export DH_VERBOSE=1
+
+PACKAGE=percona-xtradb-dfsg-5.1
+
+include /usr/share/dpatch/dpatch.make
+
+TMP=$(CURDIR)/debian/tmp/
+
+ARCH = $(shell dpkg-architecture -qDEB_BUILD_ARCH)
+ARCH_OS = $(shell dpkg-architecture -qDEB_BUILD_ARCH_OS)
+DEB_BUILD_GNU_TYPE ?= $(shell dpkg-architecture -qDEB_BUILD_GNU_TYPE)
+DEB_HOST_GNU_TYPE  ?= $(shell dpkg-architecture -qDEB_HOST_GNU_TYPE)
+DEBVERSION = $(shell dpkg-parsechangelog | awk '/^Version: / { print $$2 }' | sed 's/^.*-//' )
+
+DEB_SOURCE_PACKAGE ?= $(strip $(shell egrep '^Source: ' debian/control | cut -f 2 -d ':'))
+DEB_VERSION ?= $(shell dpkg-parsechangelog | egrep '^Version:' | cut -f 2 -d ' ')
+DEB_NOEPOCH_VERSION ?= $(shell echo $(DEB_VERSION) | cut -d: -f2-)
+DEB_UPSTREAM_VERSION ?= $(shell echo $(DEB_NOEPOCH_VERSION) | sed 's/-[^-]*$$//')
+DEB_UPSTREAM_VERSION_MAJOR_MINOR := $(shell echo $(DEB_UPSTREAM_VERSION) | sed -r -n 's/^([0-9]+\.[0-9]+).*/\1/p')
+
+DISTRIBUTION = $(shell echo "Percona SQL Server (GPL), XtraDB 10")
+
+MAKE_J = -j$(shell if [ -f /proc/cpuinfo ] ; then grep -c processor.* /proc/cpuinfo ; else echo 1 ; fi)
+ifeq (${MAKE_J}, -j0)
+  MAKE_J = -j1
+endif
+
+MAKE_TEST_TARGET=test-force
+ifneq ($(findstring fulltest,$(DEB_BUILD_OPTIONS)),)
+# "make test-bt" is the test suite run by the MySQL build team
+# before a release; it takes a long time, so it is opt-in via "fulltest".
+    MAKE_TEST_TARGET=test-bt
+endif
+
+USE_ASSEMBLER=--enable-assembler 
+
+#ifneq ($(findstring $(ARCH), alpha amd64 arm armel ia64 i386 hppa mipsel powerpc s390 sparc),)
+#   TESTSUITE_FAIL_CMD=true
+#else
+   TESTSUITE_FAIL_CMD=exit 1
+#endif
+
+# This causes seg11 crashes if LDAP is used for groups in /etc/nsswitch.conf
+# so it is disabled by default although, according to MySQL, it brings >10%
+# performance gain if enabled. See #299382.
+ifeq ($(STATIC_MYSQLD), 1)
+    USE_STATIC_MYSQLD=--with-mysqld-ldflags=-all-static
+endif
+		
+configure: patch configure-stamp
+configure-stamp:
+	@echo "RULES.configure-stamp"
+	dh_testdir
+
+ifneq ($(ARCH_OS),hurd)
+	if [ ! -d /proc/self ]; then echo "/proc IS NEEDED" 1>&2; exit 1; fi 
+endif
+
+	sh -c  'PATH=$${MYSQL_BUILD_PATH:-"/bin:/usr/bin"} \
+	    	CC=$${MYSQL_BUILD_CC:-gcc} \
+	    	CFLAGS=$${MYSQL_BUILD_CFLAGS:-"-O3 -DBIG_JOINS=1 ${FORCE_FPIC_CFLAGS}"} \
+	    	CXX=$${MYSQL_BUILD_CXX:-g++} \
+	    	CXXFLAGS=$${MYSQL_BUILD_CXXFLAGS:-"-O3 -DBIG_JOINS=1 -felide-constructors -fno-exceptions -fno-rtti ${FORCE_FPIC_CFLAGS}"} \
+	    ./configure \
+		--build=${DEB_BUILD_GNU_TYPE} \
+		--host=${DEB_HOST_GNU_TYPE} \
+		\
+		--prefix=/usr \
+	        --exec-prefix=/usr \
+	        --libexecdir=/usr/sbin \
+	        --datadir=/usr/share \
+	        --localstatedir=/var/lib/mysql \
+	        --includedir=/usr/include \
+	        --infodir=/usr/share/info \
+	        --mandir=/usr/share/man \
+		\
+		--with-server-suffix="-$(DEBVERSION)" \
+		--with-comment="($(DISTRIBUTION))" \
+		--with-system-type="debian-linux-gnu" \
+		\
+		--enable-shared \
+		--enable-static \
+		--enable-thread-safe-client \
+	        $(USE_ASSEMBLER) \
+		--enable-local-infile \
+		$(FORCE_FPIC) \
+		--with-fast-mutexes \
+                --with-big-tables \
+		--with-unix-socket-path=/var/run/mysqld/mysqld.sock \
+	       	--with-mysqld-user=mysql \
+		--with-libwrap \
+		$(USE_STATIC_MYSQLD) \
+		--with-ssl \
+	    	--without-docs \
+		--with-extra-charsets=all \
+		--with-plugins=max-no-ndb \
+		\
+		--without-embedded-server \
+		--with-embedded-privilege-control'
+		
+	#       --sysconfdir=/etc/mysql  -- Appends /etc/mysql after ~/ in the my.cnf search path!
+	#
+	#	--with-debug
+	
+	touch configure-stamp
+
+
+build: build-stamp
+build-stamp: configure
+	dh_testdir
+
+	$(MAKE) $(MAKE_J)
+# Run the test suite unless DEB_BUILD_OPTIONS contains "nocheck".
+ifeq ($(findstring nocheck,$(DEB_BUILD_OPTIONS)),)
+	if [ ! -f testsuite-stamp ] ; then \
+	  $(MAKE) $(MAKE_TEST_TARGET) || $(TESTSUITE_FAIL_CMD) ; \
+	fi
+endif
+
+	touch testsuite-stamp
+
+	touch build-stamp
+	
+
+clean: clean-patched unpatch
+	rm -rf debian/patched
+clean-patched:
+	@echo "RULES.clean-patched"
+	dh_testdir 
+	dh_testroot
+	rm -f configure-stamp
+	rm -f build-stamp
+	rm -f testsuite-stamp
+	
+	[ ! -f Makefile ] || $(MAKE) clean
+	[ ! -d mysql-test/var ] || rm -rf mysql-test/var
+
+	# We like to see how long this is necessary
+	@echo "CRUFT BEGIN" 
+	@find -type l -print0 | xargs --no-run-if-empty -0 rm -v
+	@find -name .deps -type d -print0 | xargs --no-run-if-empty -0 rm -rfv
+	@rm -vrf ndb/docs/.doxy* ndb/docs/*html ndb/docs/*pdf innobase/autom4te.cache
+	@for i in \
+	  readline/Makefile \
+	  sql-bench/Makefile \
+	  scripts/make_win_binary_distribution \
+	  scripts/mysqlbug \
+	  sql/gen_lex_hash \
+	  sql/lex_hash.h \
+	  strings/ctype_autoconf.c \
+	  config.log \
+	  config.cache \
+	  ; \
+	do \
+	  rm -vf $$i; \
+	done
+	@echo "CRUFT END"
+
+	debconf-updatepo
+	dh_clean -v
+
+	
+install:
+install: build
+	@echo "RULES.install"
+	dh_testdir
+	dh_testroot
+	dh_clean -k
+	dh_installdirs
+
+	# some self-written manpages which will hopefully
+	# be superseded sooner or later by upstream's
+	mkdir -p $(TMP)/usr/share/man/man1/
+	mkdir -p $(TMP)/usr/share/man/man8/
+	cp debian/additions/*.1 $(TMP)/usr/share/man/man1/
+	mkdir -p $(TMP)/etc/mysql/conf.d/
+	cp debian/additions/mysqld_safe_syslog.cnf $(TMP)/etc/mysql/conf.d/
+	ln -s mysqlmanager.1 $(TMP)/usr/share/man/man1/mysqlmanager-pwgen.1
+	ln -s mysqlmanager.1 $(TMP)/usr/share/man/man1/mysqlmanagerc.1
+
+	# make install (trailing slash needed for innobase)
+	$(MAKE) install DESTDIR=$(TMP)/
+		
+	# After installing, remove rpath to make lintian happy.
+	set +e; \
+	find ./debian/tmp/ -type f -print0 \
+		| xargs -0 --no-run-if-empty chrpath -k 2>/dev/null \
+		| fgrep RPATH= \
+		| cut -d: -f 1 \
+		| xargs --no-run-if-empty chrpath -d; \
+	set -e
+
+	# libmysqlclient: move shared libraries (but not the rest like libheap.a & co)
+	mv $(TMP)/usr/lib/mysql/libmysqlclient* $(TMP)/usr/lib
+	perl -pi -e 's#/usr/lib/mysql#/usr/lib#' $(TMP)/usr/lib/libmysqlclient.la
+	perl -pi -e 's#/usr/lib/mysql#/usr/lib#' $(TMP)/usr/lib/libmysqlclient_r.la
+	# Check if our beloved versioned symbols are really there
+	if [ "`objdump -T $(TMP)/usr/lib/libmysqlclient.so.16.0.0 | grep -c libmysqlclient_16`" -lt 500 ]; then \
+	  echo "ERROR: versioned symbols are absent"; \
+	  exit 1; \
+	fi     
+
+	# libmysqlclient-dev: forgotten header file since 3.23.25?
+	cp include/my_config.h $(TMP)/usr/include/mysql/
+	cp include/my_dir.h $(TMP)/usr/include/mysql/
+
+	# percona-xtradb-common: We now provide our own config file.
+	install -d $(TMP)/etc/mysql
+	install -m 0644 debian/additions/my.cnf $(TMP)/etc/mysql/my.cnf
+
+	# percona-xtradb-client
+	install -m 0755 debian/additions/mysqlreport $(TMP)/usr/bin/
+	install -m 0755 debian/additions/innotop/innotop $(TMP)/usr/bin/
+	install -m 0644 debian/additions/innotop/innotop.1 $(TMP)/usr/share/man/man1/
+	install -m 0644 -D debian/additions/innotop/InnoDBParser.pm $(TMP)/usr/share/perl5/InnoDBParser.pm
+
+	# percona-xtradb-server
+	install -m 0755 scripts/mysqld_safe $(TMP)/usr/bin/mysqld_safe
+	mkdir -p $(TMP)/usr/share/doc/percona-xtradb-server-5.1/examples
+	mv $(TMP)/usr/share/mysql/*cnf 	    $(TMP)/usr/share/doc/percona-xtradb-server-5.1/examples/
+	rm -vf $(TMP)/usr/share/mysql/mi_test_all* \
+	       $(TMP)/usr/share/mysql/mysql-log-rotate \
+	       $(TMP)/usr/share/mysql/mysql.server \
+	       $(TMP)/usr/share/mysql/binary-configure
+	nm -n sql/mysqld |gzip -9 > $(TMP)/usr/share/doc/percona-xtradb-server-5.1/mysqld.sym.gz
+	install -m 0755 debian/additions/echo_stderr $(TMP)/usr/share/mysql/
+	install -m 0755 debian/additions/debian-start $(TMP)/etc/mysql/
+	install -m 0755 debian/additions/debian-start.inc.sh $(TMP)/usr/share/mysql/
+	# lintian overrides
+	mkdir -p $(TMP)/usr/share/lintian/overrides/
+	cp debian/percona-xtradb-common.lintian-overrides     $(TMP)/usr/share/lintian/overrides/percona-xtradb-common
+	cp debian/percona-xtradb-server-5.1.lintian-overrides $(TMP)/usr/share/lintian/overrides/percona-xtradb-server-5.1
+	cp debian/percona-xtradb-client-5.1.lintian-overrides $(TMP)/usr/share/lintian/overrides/percona-xtradb-client-5.1
+
+	# For 5.0 -> 5.1 transition
+	d=$(TMP)/usr/share/percona-xtradb-common/internal-use-only/; \
+	mkdir -p $$d; \
+	cp debian/percona-xtradb-server-5.1.mysql.init $$d/_etc_init.d_mysql; \
+	cp debian/percona-xtradb-server-5.1.logrotate $$d/_etc_logrotate.d_percona-xtradb-server; \
+	cp debian/additions/debian-start $$d/_etc_mysql_debian-start;
+
+	dh_movefiles
+	
+# Build architecture-independent files here.
+binary-indep: build install
+	@echo "RULES.binary-indep"
+	dh_testdir -i
+	dh_testroot -i
+	dh_installdebconf -i
+	dh_installdocs -i
+	dh_installexamples -i
+	dh_installmenu -i
+	dh_installlogrotate -i
+	dh_installinit -i
+	dh_installcron -i 
+	dh_installman -i
+	dh_installinfo -i
+	dh_installlogcheck -i
+	dh_installchangelogs -i
+	dh_link -i
+	dh_compress -i
+	dh_fixperms -i
+	dh_installdeb -i
+	dh_perl -i
+	dh_gencontrol -i
+	dh_md5sums -i
+	dh_builddeb -i
+	
+# Build architecture-dependent files here.
+binary-arch: build install
+	@echo "RULES.binary-arch"
+	dh_testdir 
+	dh_testroot
+
+	dh_installdebconf -a
+	dh_installdocs -a
+	dh_installexamples -a
+	dh_installmenu -a
+	dh_installlogrotate -a --name percona-xtradb-server
+	# Start mysql with sequence number 19, before 20 where apache, proftpd etc.
+	# get started, which might depend on a running database server.
+	dh_installinit -a --name=mysql -- defaults 19 21
+	dh_installcron -a --name percona-xtradb-server
+	dh_installman -a
+	dh_installinfo -a
+	dh_installlogcheck -a
+	dh_installchangelogs -a
+	dh_strip -a
+	dh_link -a	# the .so symlink must be installed after .so.1.2.3!
+	dh_compress -a
+	dh_fixperms -a
+	dh_makeshlibs -a
+	dh_makeshlibs -plibmysqlclient16 -V'libmysqlclient16 (>= 5.1.21-1)'
+	dh_installdeb -a
+	dh_perl -a
+	dh_shlibdeps -a -l debian/libmysqlclient16/usr/lib -L libmysqlclient16
+	dh_gencontrol -a
+	dh_md5sums -a
+	dh_builddeb -a
+
+source diff:
+	@echo >&2 'source and diff are obsolete - use dpkg-source -b'; false
+
+binary:	binary-indep binary-arch
+
+get-orig-source:
+	@wget -nv -T10 -t3 \
+	  -O /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz \
+	  http://ftp.gwdg.de/pub/misc/mysql/Downloads/MySQL-$(DEB_UPSTREAM_VERSION_MAJOR_MINOR)/mysql-$(DEB_UPSTREAM_VERSION).tar.gz
+	@tar xfz /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz -C /tmp
+	@rm -rf /tmp/mysql-$(DEB_UPSTREAM_VERSION)/Docs
+	@rm -rf /tmp/mysql-$(DEB_UPSTREAM_VERSION)/debian
+	@mv /tmp/mysql-$(DEB_UPSTREAM_VERSION) /tmp/$(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+	@cd /tmp ; tar czf $(DEB_SOURCE_PACKAGE)_$(DEB_UPSTREAM_VERSION).orig.tar.gz $(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+	@rm -f /tmp/mysql-$(DEB_UPSTREAM_VERSION).tar.gz
+	@rm -rf /tmp/$(DEB_SOURCE_PACKAGE)-$(DEB_UPSTREAM_VERSION).orig
+
+.PHONY: clean clean-patched configure build binary binary-indep binary-arch install patch unpatch
+
+# vim: ts=8
diff --git a/storage/xtradb/build/debian/source.lintian-overrides b/storage/xtradb/build/debian/source.lintian-overrides
new file mode 100644
index 00000000000..7a93dd28f84
--- /dev/null
+++ b/storage/xtradb/build/debian/source.lintian-overrides
@@ -0,0 +1,2 @@
+maintainer-script-lacks-debhelper-token debian/mysql-server-5.1.postinst
+maintainer-script-lacks-debhelper-token debian/mysql-server-5.1.postrm
diff --git a/storage/xtradb/build/debian/watch b/storage/xtradb/build/debian/watch
new file mode 100644
index 00000000000..f6fdd67bd8d
--- /dev/null
+++ b/storage/xtradb/build/debian/watch
@@ -0,0 +1,3 @@
+version=3
+opts="uversionmangle=s/-(rc|beta)/$1/" \
+  ftp://sunsite.informatik.rwth-aachen.de/pub/mirror/www.mysql.com/Downloads/MySQL-5.1/mysql-([\d\.]*(?:-beta|-rc)?).tar.gz debian
diff --git a/storage/xtradb/build/percona-sql.spec b/storage/xtradb/build/percona-sql.spec
new file mode 100644
index 00000000000..d0fa6f2a41d
--- /dev/null
+++ b/storage/xtradb/build/percona-sql.spec
@@ -0,0 +1,1644 @@
+#############################################################################
+#
+# This is the spec file for the distribution specific RPM files
+#
+##############################################################################
+
+##############################################################################
+# Some common macro definitions
+##############################################################################
+
+# Required arguments
+# mysqlversion		- e.g. 5.1.37
+# pluginversion	- Version of InnoDB plugin taken as the basis, e.g. 1.0.3
+# redhatversion	- 5 or 4; major number following "release" in /etc/redhat-release
+# xtradbversion	- The XtraDB release, e.g. 6
+
+%define mysql_vendor  Percona, Inc
+%define redhatversion %(sed 's/.*release //; s/[^0-9].*//' /etc/redhat-release)
+%define community 1
+%define mysqlversion 5.1.45
+%define pluginversion 1.0.6
+%define xtradbversion 10
+%define distribution  rhel%{redhatversion}
+%define release       %{xtradbversion}.%{distribution}
+
+%define mysqld_user	mysql
+%define mysqld_group	mysql
+%define mysqldatadir	/var/lib/mysql
+%define see_base For a description of MySQL see the base MySQL RPM or http://www.mysql.com
+
+# ------------------------------------------------------------------------------
+# Meta information, don't remove!
+# ------------------------------------------------------------------------------
+# norootforbuild
+
+# ------------------------------------------------------------------------------
+# On SuSE 9 no separate "debuginfo" package is built. To enable basic
+# debugging on that platform, we don't strip binaries on SuSE 9. We
+# disable the strip of binaries by redefining the RPM macro
+# "__os_install_post" leaving out the script calls that normally does
+# this. We do this in all cases, as on platforms where "debuginfo" is
+# created, a script "find-debuginfo.sh" will be called that will do
+# the strip anyway, part of separating the executable and debug
+# information into separate files put into separate packages.
+#
+# Some references (shows more advanced conditional usage):
+# http://www.redhat.com/archives/rpm-list/2001-November/msg00257.html
+# http://www.redhat.com/archives/rpm-list/2003-February/msg00275.html
+# http://www.redhat.com/archives/rhl-devel-list/2004-January/msg01546.html
+# http://lists.opensuse.org/archive/opensuse-commit/2006-May/1171.html
+# ------------------------------------------------------------------------------
+%define __os_install_post /usr/lib/rpm/brp-compress
+
+# ------------------------------------------------------------------------------
+# We don't package all files installed into the build root by intention -
+# See BUG#998 for details.
+# ------------------------------------------------------------------------------
+%define _unpackaged_files_terminate_build 0
+
+# ------------------------------------------------------------------------------
+# RPM build tools now automatically detect Perl module dependencies. This
+# detection causes problems as it is broken in some versions, and it also
+# gives unwanted dependencies from mandatory scripts in our package.
+# Might not be possible to disable in all RPM tool versions, but here we
+# try. We keep the "AutoReqProv: no" for the "test" sub package, as disabling
+# here might fail, and that package has the most problems.
+# See http://fedoraproject.org/wiki/Packaging/Perl#Filtering_Requires:_and_Provides
+#     http://www.wideopen.com/archives/rpm-list/2002-October/msg00343.html
+# ------------------------------------------------------------------------------
+%undefine __perl_provides
+%undefine __perl_requires
+
+##############################################################################
+# Command line handling
+##############################################################################
+
+# ----------------------------------------------------------------------
+# use "rpmbuild --with yassl" or "rpm --define '_with_yassl 1'" (for RPM 3.x)
+# to build with yaSSL support (off by default)
+# ----------------------------------------------------------------------
+%{?_with_yassl:%define YASSL_BUILD 1}
+%{!?_with_yassl:%define YASSL_BUILD 0}
+
+# ----------------------------------------------------------------------
+# use "rpmbuild --without libgcc" or "rpm --define '_without_libgcc 1'" (for RPM 3.x)
+# to include libgcc (as libmygcc) (on by default)
+# ----------------------------------------------------------------------
+%{!?_with_libgcc: %{!?_without_libgcc: %define WITH_LIBGCC 1}}
+%{?_with_libgcc:%define WITH_LIBGCC 1}
+%{?_without_libgcc:%define WITH_LIBGCC 0}
+
+
+# On SuSE 9 no separate "debuginfo" package is built. To enable basic
+# debugging on that platform, we don't strip binaries on SuSE 9. We
+# disable the strip of binaries by redefining the RPM macro
+# "__os_install_post" leaving out the script calls that normally does
+# this. We do this in all cases, as on platforms where "debuginfo" is
+# created, a script "find-debuginfo.sh" will be called that will do
+# the strip anyway, part of separating the executable and debug
+# information into separate files put into separate packages.
+#
+# Some references (shows more advanced conditional usage):
+# http://www.redhat.com/archives/rpm-list/2001-November/msg00257.html
+# http://www.redhat.com/archives/rpm-list/2003-February/msg00275.html
+# http://www.redhat.com/archives/rhl-devel-list/2004-January/msg01546.html
+# http://lists.opensuse.org/archive/opensuse-commit/2006-May/1171.html
+
+%define __os_install_post /usr/lib/rpm/brp-compress
+
+%define server_suffix  -51
+%define package_suffix -51
+%define ndbug_comment Percona SQL Server (GPL), XtraDB %{xtradbversion}
+%define debug_comment Percona SQL Server - Debug (GPL), XtraDB %{xtradbversion}
+%define commercial 0
+%define YASSL_BUILD 1
+%define EMBEDDED_BUILD 0
+%define PARTITION_BUILD 1
+%define CLUSTER_BUILD 0
+%define COMMUNITY_BUILD 1
+%define INNODB_BUILD 1
+%define PERCONA_PLUGIN_BUILD 1
+%define MARIA_BUILD 0
+%define NORMAL_TEST_MODE test-bt
+%define DEBUG_TEST_MODE test-bt-debug
+
+%define BUILD_DEBUG 0
+
+
+%if %{COMMUNITY_BUILD}
+%define cluster_package_prefix -cluster
+%else
+%define cluster_package_prefix -
+%endif
+
+%define lic_type GNU GPL v2
+%define lic_files COPYING README
+%define src_dir mysql-%{mysqlversion}
+
+Source1: percona-xtradb-%{pluginversion}-%{xtradbversion}.tar.gz
+Patch0: percona-support.patch
+
+Patch01: show_patches.patch
+Patch02: slow_extended.patch
+Patch03: profiling_slow.patch
+Patch04: microsec_process.patch
+Patch05: userstat.patch
+Patch06: optimizer_fix.patch
+Patch07: mysql-test_for_xtradb.diff
+Patch08: show_temp_51.patch
+
+
+%define perconaxtradbplugin percona-xtradb-%{pluginversion}-%{xtradbversion}.tar.gz
+
+##############################################################################
+# Main spec file section
+##############################################################################
+
+Name:		Percona-XtraDB%{package_suffix}
+Summary:	Percona-XtraDB: a very fast and reliable SQL database server
+Group:		Applications/Databases
+Version:	%{mysqlversion}
+Release:	%{release}
+Distribution:	Red Hat Enterprise Linux %{redhatversion}
+License:    GPL	version 2 http://www.gnu.org/licenses/gpl-2.0.html
+Source:		%{src_dir}.tar.gz
+URL:		http://www.percona.com/
+Packager:	%{mysql_vendor} MySQL Development Team <mysql-dev@percona.com>
+Vendor:		%{mysql_vendor}
+Provides:	msqlormysql MySQL-server Percona-XtraDB-server
+BuildRequires:  gperf perl readline-devel gcc-c++ ncurses-devel zlib-devel libtool automake autoconf time ccache bison
+
+# Think about what you use here since the first step is to
+# run a rm -rf
+BuildRoot:    %{_tmppath}/%{name}-%{version}-build
+
+# From the manual
+%description
+The Percona-XtraDB software delivers a very fast, multi-threaded, multi-user,
+and robust SQL (Structured Query Language) database server. Percona-XtraDB Server
+is intended for mission-critical, heavy-load production systems as well
+as for embedding into mass-deployed software. 
+
+Percona Inc. provides commercial support of Percona-XtraDB Server.
+For more information visit our web site http://www.percona.com/
+
+##############################################################################
+# Sub package definition
+##############################################################################
+
+%package -n Percona-XtraDB-server%{package_suffix}
+Summary:	%{ndbug_comment} for Red Hat Enterprise Linux %{redhatversion}
+Group:		Applications/Databases
+Requires:	 chkconfig coreutils shadow-utils grep procps
+Provides:	msqlormysql mysql-server MySQL-server Percona-XtraDB-server
+Obsoletes:	MySQL mysql mysql-server MySQL-server MySQL-server-community MySQL-server-percona
+
+%description -n Percona-XtraDB-server%{package_suffix}
+The Percona-XtraDB software delivers a very fast, multi-threaded, multi-user,
+and robust SQL (Structured Query Language) database server. Percona-XtraDB Server
+is intended for mission-critical, heavy-load production systems as well
+as for embedding into mass-deployed software. 
+
+Percona Inc. provides commercial support of Percona-XtraDB Server.
+For more information visit our web site http://www.percona.com/
+
+This package includes the Percona-XtraDB server binary 
+%if %{INNODB_BUILD}
+(configured including XtraDB)
+%endif
+as well as related utilities to run and administer a Percona-XtraDB server.
+
+If you want to access and work with the database, you have to install
+package "Percona-XtraDB-client%{package_suffix}" as well!
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-client%{package_suffix}
+Summary: Percona-XtraDB - Client
+Group: Applications/Databases
+Obsoletes: mysql-client MySQL-client MySQL-client-community MySQL-client-percona
+Provides: mysql-client MySQL-client Percona-XtraDB-client mysql MySQL
+
+%description -n Percona-XtraDB-client%{package_suffix}
+This package contains the standard Percona-XtraDB clients and administration tools. 
+
+%{see_base}
+
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-test%{package_suffix}
+Requires: mysql-client perl
+Summary: Percona-XtraDB - Test suite
+Group: Applications/Databases
+Provides: mysql-test MySQL-test Percona-XtraDB-test
+Obsoletes: mysql-test MySQL-test MySQL-test-community MySQL-test-percona
+AutoReqProv: no
+
+%description -n Percona-XtraDB-test%{package_suffix}
+This package contains the Percona-XtraDB regression test suite.
+
+%{see_base}
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-devel%{package_suffix}
+Summary: Percona-XtraDB - Development header files and libraries
+Group: Applications/Databases
+Provides: mysql-devel MySQL-devel Percona-XtraDB-devel
+Obsoletes: mysql-devel MySQL-devel MySQL-devel-community MySQL-devel-percona
+
+%description -n Percona-XtraDB-devel%{package_suffix}
+This package contains the development header files and libraries
+necessary to develop Percona-XtraDB client applications.
+
+%{see_base}
+
+# ------------------------------------------------------------------------------
+
+%package -n Percona-XtraDB-shared%{package_suffix}
+Summary: Percona-XtraDB - Shared libraries
+Group: Applications/Databases
+Provides: mysql-shared MySQL-shared Percona-XtraDB-shared
+# Obsoletes below to correct old missing Provides:/Obsoletes
+Obsoletes: mysql-shared MySQL-shared-standard MySQL-shared-pro
+Obsoletes: MySQL-shared-pro-cert MySQL-shared-pro-gpl
+Obsoletes: MySQL-shared-pro-gpl-cert MySQL-shared MySQL-shared-community MySQL-shared-percona
+
+%description -n Percona-XtraDB-shared%{package_suffix}
+This package contains the shared libraries (*.so*) which certain
+languages and applications need to dynamically load and use MySQL.
+
+# ------------------------------------------------------------------------------
+
+%if %{PERCONA_PLUGIN_BUILD}
+
+%package -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+Requires: Percona-XtraDB-devel
+Summary: Percona XtraDB Storage engine for MySQL
+Group: Applications/Databases
+Provides: percona-xtradb-plugin Percona-XtraDB-plugin
+Obsoletes: percona-xtradb-plugin Percona-XtraDB-plugin
+
+%description -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+This package contains the Percona-XtraDB storage engine for MySQL server.
+
+An enhanced version of the InnoDB storage engine, including all 
+of InnoDB's robust, reliable ACID-compliant design and advanced 
+MVCC architecture, and builds on that solid foundation with more 
+features, more tunability, more metrics, and more scalability. 
+In particular, it is designed to scale better on many cores, 
+to use memory more efficiently, and to be more convenient and useful.
+The new features are especially designed to reduce the need for 
+awkward workarounds to many of InnoDB's limitations. We choose 
+features and fixes based on customer requests and on our best 
+judgment as a high-performance consulting company.
+
+%endif
+
+##############################################################################
+# 
+##############################################################################
+
+%prep
+
+%setup -n %{src_dir}
+
+%patch01 -p1
+%patch02 -p1
+%patch03 -p1
+%patch04 -p1
+%patch05 -p1
+%patch06 -p1
+%patch07 -p1
+%patch08 -p1
+
+if [ "%{redhatversion}" = "5" ] ; then 
+tar xfz $RPM_SOURCE_DIR/%{perconaxtradbplugin} -C storage/innobase --strip-components=1
+else
+tar xfz $RPM_SOURCE_DIR/%{perconaxtradbplugin} -C storage/innobase --strip-path=1
+fi
+%patch0 -p1
+
+cd storage/innobase && bash -x ./setup.sh
+
+##############################################################################
+# The actual build
+##############################################################################
+
+%build
+
+BuildMySQL() {
+# Get flags from the environment, falling back to RPM_OPT_FLAGS (which seems not to be set anywhere).
+CFLAGS=${CFLAGS:-$RPM_OPT_FLAGS}
+CXXFLAGS=${CXXFLAGS:-$RPM_OPT_FLAGS}
+# Debug builds ($DEBUG > 0) get --with-debug and drop the first -O<n>, -unroll2 and -ip from the flags.
+if [ $DEBUG -gt 0 ] ; then
+	OPT_COMMENT='--with-comment="%{debug_comment}"'
+	OPT_DEBUG='--with-debug'
+	CFLAGS=`echo   " $CFLAGS "   | \
+	    sed -e 's/ -O[0-9]* / /' -e 's/ -unroll2 / /' -e 's/ -ip / /' \
+	        -e 's/^ //' -e 's/ $//'`
+	CXXFLAGS=`echo " $CXXFLAGS " | \
+	    sed -e 's/ -O[0-9]* / /' -e 's/ -unroll2 / /' -e 's/ -ip / /' \
+	        -e 's/^ //' -e 's/ $//'`
+else
+	OPT_COMMENT='--with-comment="%{ndbug_comment}"'
+	OPT_DEBUG=''
+fi
+
+echo "BUILD =================="
+echo $*
+
+# The --enable-assembler option simply does nothing on systems that do not
+# support assembler speedups.
+sh -c  "CFLAGS=\"$CFLAGS\" \
+	CXXFLAGS=\"$CXXFLAGS\" \
+	AM_CPPFLAGS=\"$AM_CPPFLAGS\" \
+	LDFLAGS=\"$LDFLAGS\" \
+	./configure \
+ 	    $* \
+	    --enable-assembler \
+	    --enable-local-infile \
+            --with-mysqld-user=%{mysqld_user} \
+            --with-unix-socket-path=/var/lib/mysql/mysql.sock \
+	    --with-pic \
+            --prefix=/ \
+%if %{CLUSTER_BUILD}
+	    --with-extra-charsets=all \
+%else
+	    --with-extra-charsets=complex \
+%endif
+%if %{YASSL_BUILD}
+	    --with-ssl \
+%else
+	    --without-ssl \
+%endif
+            --exec-prefix=%{_exec_prefix} \
+            --libexecdir=%{_sbindir} \
+            --libdir=%{_libdir} \
+            --sysconfdir=%{_sysconfdir} \
+            --datadir=%{_datadir} \
+            --localstatedir=%{mysqldatadir} \
+            --infodir=%{_infodir} \
+            --includedir=%{_includedir} \
+            --mandir=%{_mandir} \
+	    --enable-thread-safe-client \
+        --enable-profiling \
+%if %{?ndbug_comment:1}0
+	    $OPT_COMMENT \
+%endif
+	    $OPT_DEBUG \
+%if %{commercial}
+            --with-libedit \
+%else
+	    --with-readline \
+%endif
+	    ; make "
+}
+# end of function definition "BuildMySQL"
+
+
+BuildServer() {
+BuildMySQL "--enable-shared \
+%if %{?server_suffix:1}0
+		--with-server-suffix='%{server_suffix}' \
+%endif
+%if %{CLUSTER_BUILD}
+		--with-plugin-ndbcluster \
+%else
+		--without-plugin-ndbcluster \
+%endif
+%if %{MARIA_BUILD}
+		--with-plugin-maria \
+		--with-maria-tmp-tables \
+%else
+		--without-plugin-maria \
+%endif
+%if %{INNODB_BUILD}
+		--with-plugin-innobase \
+		--without-plugin-innodb_plugin \
+%else
+		--without-plugin-innobase \
+		--without-plugin-innodb_plugin \
+%endif
+%if %{PARTITION_BUILD}
+		--with-plugin-partition \
+%else
+		--without-plugin-partition \
+%endif
+		--with-plugin-csv \
+		--with-plugin-archive \
+		--with-plugin-blackhole \
+		--with-plugin-federated \
+%if %{EMBEDDED_BUILD}
+		--with-embedded-server \
+%else
+		--without-embedded-server \
+%endif
+		--without-bench \
+		--with-zlib-dir=bundled \
+		--with-big-tables"
+
+if [ -n "$MYSQL_CONFLOG_DEST" ] ; then
+	cp -fp config.log "$MYSQL_CONFLOG_DEST"
+fi
+
+#if [ -f sql/.libs/mysqld ] ; then
+#	nm --numeric-sort sql/.libs/mysqld > sql/mysqld.sym
+#else
+#	nm --numeric-sort sql/mysqld > sql/mysqld.sym
+#fi
+}
+# end of function definition "BuildServer"
+
+
+RBR=$RPM_BUILD_ROOT
+MBD=$RPM_BUILD_DIR/%{src_dir}
+
+# Clean up the BuildRoot first
+[ "$RBR" != "/" ] && [ -d $RBR ] && rm -rf $RBR;
+mkdir -p $RBR%{_libdir}/mysql $RBR%{_sbindir}
+
+# Use gcc for C and C++ code (to avoid a dependency on libstdc++ and
+# including exceptions into the code)
+if [ -z "$CXX" -a -z "$CC" ] ; then
+	export CC="gcc" CXX="gcc"
+fi
+
+if [ "%{redhatversion}" = "5" ] ; then 
+export CFLAGS="-static-libgcc -O2 -fno-omit-frame-pointer  -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -mtune=generic"
+export CXXFLAGS="-static-libgcc -O2 -fno-omit-frame-pointer -g -pipe -Wall -Wp,-D_FORTIFY_SOURCE=2 -fexceptions -fstack-protector --param=ssp-buffer-size=4 -mtune=generic"
+fi
+
+if [ "%{redhatversion}" != "5" ] ; then
+export CFLAGS="-static-libgcc -O2 -g -fno-omit-frame-pointer -pipe "
+export CXXFLAGS="-static-libgcc -O2 -g -fno-omit-frame-pointer -pipe "
+fi 
+
+
+# Create the shared libs separately to avoid a dependency for the client utilities
+DEBUG=0
+BuildMySQL "--enable-shared"
+
+# Install shared libraries
+cp -av libmysql/.libs/*.so*   $RBR/%{_libdir}
+cp -av libmysql_r/.libs/*.so* $RBR/%{_libdir}
+mkdir -p $RBR%{_libdir}/mysql/plugin
+cp -av storage/innobase/.libs/*.so* $RBR%{_libdir}/mysql/plugin
+cp -av storage/innobase/scripts/install_innodb_plugins.sql $RBR%{_libdir}/mysql/plugin
+
+pushd $RBR%{_libdir}/mysql
+tar cfz percona-xtradb-%{pluginversion}-%{xtradbversion}-%{mysqlversion}.$RPM_ARCH.tar.gz plugin
+mv percona-xtradb-%{pluginversion}-%{xtradbversion}-%{mysqlversion}.$RPM_ARCH.tar.gz %{_topdir}
+popd
+
+##############################################################################
+
+# Include libgcc.a in the devel subpackage (BUG 4921)
+%if %{WITH_LIBGCC}
+libgcc=`$CC $CFLAGS --print-libgcc-file`
+install -m 644 "$libgcc" $RBR%{_libdir}/mysql/libmygcc.a
+%endif
+
+##############################################################################
+
+# Now create a debug server
+%if %{BUILD_DEBUG}
+DEBUG=1
+make clean
+
+( BuildServer )   # subshell, so that CFLAGS + CXXFLAGS are modified only locally
+
+if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+	MTR_BUILD_THREAD=auto make %{DEBUG_TEST_MODE}
+fi
+
+# Get the debug server and its .sym file from the build tree
+#if [ -f sql/.libs/mysqld ] ; then
+#	cp sql/.libs/mysqld $RBR%{_sbindir}/mysqld-debug
+#else
+#	cp sql/mysqld       $RBR%{_sbindir}/mysqld-debug
+#fi
+#cp libmysqld/libmysqld.a    $RBR%{_libdir}/mysql/libmysqld-debug.a
+#cp sql/mysqld.sym           $RBR%{_libdir}/mysql/mysqld-debug.sym
+
+%endif
+
+# Now, the default server
+DEBUG=0
+make clean
+
+BuildServer
+
+if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+	MTR_BUILD_THREAD=auto make %{NORMAL_TEST_MODE}
+fi
+
+# Now, build plugin 
+#BUILDSO=0
+#make clean
+
+#BuildServer
+
+#if [ "$MYSQL_RPMBUILD_TEST" != "no" ] ; then
+#	MTR_BUILD_THREAD=auto make %{NORMAL_TEST_MODE}
+#fi
+
+%install
+RBR=$RPM_BUILD_ROOT
+MBD=$RPM_BUILD_DIR/%{src_dir}
+
+# Ensure that needed directories exist
+install -d $RBR%{_sysconfdir}/{logrotate.d,init.d}
+install -d $RBR%{mysqldatadir}/mysql
+install -d $RBR%{_datadir}/mysql-test
+install -d $RBR%{_datadir}/mysql/SELinux/RHEL4
+install -d $RBR%{_includedir}
+install -d $RBR%{_libdir}
+install -d $RBR%{_mandir}
+install -d $RBR%{_sbindir}
+install -d $RBR%{_libdir}/mysql/plugin
+
+make DESTDIR=$RBR benchdir_root=%{_datadir} install
+
+# install symbol files ( for stack trace resolution)
+#install -m644 $MBD/sql/mysqld.sym $RBR%{_libdir}/mysql/mysqld.sym
+
+# Install logrotate and autostart
+install -m644 $MBD/support-files/mysql-log-rotate \
+        $RBR%{_sysconfdir}/logrotate.d/mysql
+install -m755 $MBD/support-files/mysql.server \
+        $RBR%{_sysconfdir}/init.d/mysql
+
+# in RPMs, it is unlikely that anybody should use "sql-bench"
+rm -fr $RBR%{_datadir}/sql-bench
+
+# Create a symlink "rcmysql", pointing to the init.script. SuSE users
+# will appreciate that, as all services usually offer this.
+ln -s %{_sysconfdir}/init.d/mysql $RBR%{_sbindir}/rcmysql
+
+# Touch the place where the my.cnf config file and mysqlmanager.passwd
+# (MySQL Instance Manager password file) might be located
+# Just to make sure it's in the file list and marked as a config file
+touch $RBR%{_sysconfdir}/my.cnf
+touch $RBR%{_sysconfdir}/mysqlmanager.passwd
+
+# Install SELinux files in datadir
+install -m600 $MBD/support-files/RHEL4-SElinux/mysql.{fc,te} \
+	$RBR%{_datadir}/mysql/SELinux/RHEL4
+
+
+##############################################################################
+#  Post processing actions, i.e. when installed
+##############################################################################
+
+%pre -n Percona-XtraDB-server%{package_suffix}
+# Check if we can safely upgrade.  An upgrade is only safe if it's from one
+# of our RPMs in the same version family.
+
+installed=`rpm -q --whatprovides mysql-server 2> /dev/null`
+if [ $? -eq 0 -a -n "$installed" ]; then
+  vendor=`rpm -q --queryformat='%{VENDOR}' "$installed" 2>&1`
+  version=`rpm -q --queryformat='%{VERSION}' "$installed" 2>&1`
+  myvendor='%{mysql_vendor}'
+  myversion='%{mysqlversion}'
+
+  old_family=`echo $version   | sed -n -e 's,^\([1-9][0-9]*\.[0-9][0-9]*\)\..*$,\1,p'`
+  new_family=`echo $myversion | sed -n -e 's,^\([1-9][0-9]*\.[0-9][0-9]*\)\..*$,\1,p'`
+
+  [ -z "$vendor" ] && vendor='<unknown>'
+  [ -z "$old_family" ] && old_family="<unrecognized version $version>"
+  [ -z "$new_family" ] && new_family="<bad package specification: version $myversion>"
+
+  error_text=
+#  if [ "$vendor" != "$myvendor" ]; then
+#    error_text="$error_text
+#The current MySQL server package is provided by a different
+#vendor ($vendor) than $myvendor.  Some files may be installed
+#to different locations, including log files and the service
+#startup script in %{_sysconfdir}/init.d/.
+#"
+#  fi
+
+  if [ "$old_family" != "$new_family" ]; then
+    error_text="$error_text
+Upgrading directly from MySQL $old_family to MySQL $new_family may not
+be safe in all cases.  A manual dump and restore using mysqldump is
+recommended.  It is important to review the MySQL manual's Upgrading
+section for version-specific incompatibilities.
+"
+  fi
+
+  if [ -n "$error_text" ]; then
+    cat <<HERE >&2
+
+******************************************************************
+A MySQL server package ($installed) is installed.
+$error_text
+A manual upgrade is required.
+
+- Ensure that you have a complete, working backup of your data and my.cnf
+  files
+- Shut down the MySQL server cleanly
+- Remove the existing MySQL packages.  Usually this command will
+  list the packages you should remove:
+  rpm -qa | grep -i '^mysql-'
+
+  You may choose to use 'rpm --nodeps -ev <package-name>' to remove
+  the package which contains the mysqlclient shared library.  The
+  library will be reinstalled by the MySQL-shared-compat package.
+- Install the new MySQL packages supplied by $myvendor
+- Ensure that the MySQL server is started
+- Run the 'mysql_upgrade' program
+
+This is a brief description of the upgrade process.  Important details
+can be found in the MySQL manual, in the Upgrading section.
+******************************************************************
+HERE
+    exit 1
+  fi
+fi
+
+# Shut down a previously installed server first
+if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+	%{_sysconfdir}/init.d/mysql stop > /dev/null 2>&1
+	echo "Giving mysqld 5 seconds to exit nicely"
+	sleep 5
+fi
+
+%post -n Percona-XtraDB-server%{package_suffix}
+mysql_datadir=%{mysqldatadir}
+
+# ----------------------------------------------------------------------
+# Create data directory
+# ----------------------------------------------------------------------
+mkdir -p $mysql_datadir/{mysql,test}
+
+# ----------------------------------------------------------------------
+# Make MySQL start/shutdown automatically when the machine does it.
+# ----------------------------------------------------------------------
+if [ -x /sbin/chkconfig ] ; then
+	/sbin/chkconfig --add mysql
+fi
+
+# ----------------------------------------------------------------------
+# Create a MySQL user and group. Do not report any problems if it already
+# exists.
+# ----------------------------------------------------------------------
+groupadd -r %{mysqld_group} 2> /dev/null || true
+useradd -M -r -d $mysql_datadir -s /bin/bash -c "MySQL server" -g %{mysqld_group} %{mysqld_user} 2> /dev/null || true 
+# The user may already exist, make sure it has the proper group nevertheless (BUG#12823)
+usermod -g %{mysqld_group} %{mysqld_user} 2> /dev/null || true
+
+# ----------------------------------------------------------------------
+# Change permissions so that the user that will run the MySQL daemon
+# owns all database files.
+# ----------------------------------------------------------------------
+chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir
+
+# ----------------------------------------------------------------------
+# Initiate databases
+# ----------------------------------------------------------------------
+%{_bindir}/mysql_install_db --rpm --user=%{mysqld_user}
+
+# ----------------------------------------------------------------------
+# FIXME upgrade databases if needed would go here - but it cannot be
+# automated yet
+# ----------------------------------------------------------------------
+
+# ----------------------------------------------------------------------
+# Change permissions again to fix any new files.
+# ----------------------------------------------------------------------
+chown -R %{mysqld_user}:%{mysqld_group} $mysql_datadir
+
+# ----------------------------------------------------------------------
+# Fix permissions for the permission database so that only the user
+# can read them.
+# ----------------------------------------------------------------------
+chmod -R og-rw $mysql_datadir/mysql
+
+# ----------------------------------------------------------------------
+# install SELinux files - but don't override existing ones
+# ----------------------------------------------------------------------
+SETARGETDIR=/etc/selinux/targeted/src/policy
+SEDOMPROG=$SETARGETDIR/domains/program
+SECONPROG=$SETARGETDIR/file_contexts/program
+if [ -f /etc/redhat-release ] && \
+   (grep -q "Red Hat Enterprise Linux .. release 4" /etc/redhat-release \
+    || grep -q "CentOS release 4" /etc/redhat-release) ; then
+   echo
+   echo
+   echo 'Notes regarding SELinux on this platform:'
+   echo '========================================='
+   echo
+   echo 'The default policy might cause server startup to fail because it is '
+   echo 'not allowed to access critical files. In this case, please update '
+   echo 'your installation. '
+   echo
+   echo 'The default policy might also cause inavailability of SSL related '
+   echo 'features because the server is not allowed to access /dev/random '
+   echo 'and /dev/urandom. If this is a problem, please do the following: '
+   echo 
+   echo '  1) install selinux-policy-targeted-sources from your OS vendor'
+   echo '  2) add the following two lines to '$SEDOMPROG/mysqld.te':'
+   echo '       allow mysqld_t random_device_t:chr_file read;'
+   echo '       allow mysqld_t urandom_device_t:chr_file read;'
+   echo '  3) cd to '$SETARGETDIR' and issue the following command:'
+   echo '       make load'
+   echo
+   echo
+fi
+
+if [ -x /sbin/restorecon ] ; then	# absolute paths: do not depend on the scriptlet working directory
+	/sbin/restorecon -R %{mysqldatadir}
+fi
+
+# Restart in the same way that mysqld will be started normally.
+if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+	%{_sysconfdir}/init.d/mysql start
+	echo "Giving mysqld 2 seconds to start"
+	sleep 2
+fi
+
+# Allow mysqld_safe to start mysqld and print a message before we exit
+sleep 2
+
+%if %{CLUSTER_BUILD}
+%post -n MySQL%{cluster_package_prefix}storage%{package_suffix}
+# Create cluster directory if needed
+mkdir -p /var/lib/mysql-cluster
+%endif
+
+%preun -n Percona-XtraDB-server%{package_suffix}
+if [ $1 = 0 ] ; then
+	# Stop MySQL before uninstalling it
+	if [ -x %{_sysconfdir}/init.d/mysql ] ; then
+		%{_sysconfdir}/init.d/mysql stop > /dev/null
+		# Don't start it automatically anymore
+		if [ -x /sbin/chkconfig ] ; then
+			/sbin/chkconfig --del mysql
+		fi
+	fi
+fi
+
+# We do not remove the mysql user since it may still own a lot of
+# database files.
+
+# ----------------------------------------------------------------------
+# Clean up the BuildRoot after build is done
+# ----------------------------------------------------------------------
+%clean
+[ "$RPM_BUILD_ROOT" != "/" ] && [ -d $RPM_BUILD_ROOT ] && rm -rf $RPM_BUILD_ROOT;
+
+##############################################################################
+#  Files section
+##############################################################################
+
+%files -n Percona-XtraDB-server%{package_suffix}
+%defattr(-,root,root,0755)
+
+%doc %{lic_files}
+%doc support-files/my-*.cnf
+%if %{CLUSTER_BUILD}
+%doc support-files/ndb-*.ini
+%endif
+
+%doc %attr(644, root, root) %{_infodir}/mysql.info*
+
+%if %{INNODB_BUILD}
+%doc %attr(644, root, man) %{_mandir}/man1/innochecksum.1*
+%endif
+%doc %attr(644, root, man) %{_mandir}/man1/my_print_defaults.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisam_ftdump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisamchk.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisamlog.1*
+%doc %attr(644, root, man) %{_mandir}/man1/myisampack.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_convert_table_format.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_fix_extensions.1*
+%doc %attr(644, root, man) %{_mandir}/man8/mysqld.8*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqld_multi.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqld_safe.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_fix_privilege_tables.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_install_db.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_secure_installation.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_setpermission.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_upgrade.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlhotcopy.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlman.1*
+%doc %attr(644, root, man) %{_mandir}/man8/mysqlmanager.8*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql.server.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqltest.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_tzinfo_to_sql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_zap.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlbug.1*
+%doc %attr(644, root, man) %{_mandir}/man1/perror.1*
+%doc %attr(644, root, man) %{_mandir}/man1/replace.1*
+%doc %attr(644, root, man) %{_mandir}/man1/resolve_stack_dump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/resolveip.1*
+
+%ghost %config(noreplace,missingok) %{_sysconfdir}/my.cnf
+%ghost %config(noreplace,missingok) %{_sysconfdir}/mysqlmanager.passwd
+
+%if %{INNODB_BUILD}
+%attr(755, root, root) %{_bindir}/innochecksum
+%endif
+%attr(755, root, root) %{_bindir}/my_print_defaults
+%attr(755, root, root) %{_bindir}/myisam_ftdump
+%attr(755, root, root) %{_bindir}/myisamchk
+%attr(755, root, root) %{_bindir}/myisamlog
+%attr(755, root, root) %{_bindir}/myisampack
+%attr(755, root, root) %{_bindir}/mysql_convert_table_format
+%attr(755, root, root) %{_bindir}/mysql_fix_extensions
+%attr(755, root, root) %{_bindir}/mysql_fix_privilege_tables
+%attr(755, root, root) %{_bindir}/mysql_install_db
+%attr(755, root, root) %{_bindir}/mysql_secure_installation
+%attr(755, root, root) %{_bindir}/mysql_setpermission
+%attr(755, root, root) %{_bindir}/mysql_tzinfo_to_sql
+%attr(755, root, root) %{_bindir}/mysql_upgrade
+%attr(755, root, root) %{_bindir}/mysql_zap
+%attr(755, root, root) %{_bindir}/mysqlbug
+%attr(755, root, root) %{_bindir}/mysqld_multi
+%attr(755, root, root) %{_bindir}/mysqld_safe
+%attr(755, root, root) %{_bindir}/mysqldumpslow
+%attr(755, root, root) %{_bindir}/mysqlhotcopy
+%attr(755, root, root) %{_bindir}/mysqltest
+%attr(755, root, root) %{_bindir}/perror
+%attr(755, root, root) %{_bindir}/replace
+%attr(755, root, root) %{_bindir}/resolve_stack_dump
+%attr(755, root, root) %{_bindir}/resolveip
+
+%attr(755, root, root) %{_sbindir}/mysqld
+%if %{BUILD_DEBUG}
+%attr(755, root, root) %{_sbindir}/mysqld-debug
+%endif
+%attr(755, root, root) %{_sbindir}/mysqlmanager
+%attr(755, root, root) %{_sbindir}/rcmysql
+#%attr(644, root, root) %{_libdir}/mysql/mysqld.sym
+%if %{BUILD_DEBUG}
+#%attr(644, root, root) %{_libdir}/mysql/mysqld-debug.sym
+%endif
+
+%attr(644, root, root) %config(noreplace,missingok) %{_sysconfdir}/logrotate.d/mysql
+%attr(755, root, root) %{_sysconfdir}/init.d/mysql
+
+%attr(755, root, root) %{_datadir}/mysql/
+
+%files -n Percona-XtraDB-client%{package_suffix}
+%defattr(-, root, root, 0755)
+%attr(755, root, root) %{_bindir}/msql2mysql
+%attr(755, root, root) %{_bindir}/mysql
+%attr(755, root, root) %{_bindir}/mysql_find_rows
+%attr(755, root, root) %{_bindir}/mysql_waitpid
+%attr(755, root, root) %{_bindir}/mysqlaccess
+%attr(755, root, root) %{_bindir}/mysqladmin
+%attr(755, root, root) %{_bindir}/mysqlbinlog
+%attr(755, root, root) %{_bindir}/mysqlcheck
+%attr(755, root, root) %{_bindir}/mysqldump
+%attr(755, root, root) %{_bindir}/mysqlimport
+%attr(755, root, root) %{_bindir}/mysqlshow
+%attr(755, root, root) %{_bindir}/mysqlslap
+
+%doc %attr(644, root, man) %{_mandir}/man1/msql2mysql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_find_rows.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_waitpid.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlaccess.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqladmin.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlbinlog.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlcheck.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqldump.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlimport.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlshow.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysqlslap.1*
+
+%post -n Percona-XtraDB-shared%{package_suffix}
+/sbin/ldconfig
+
+%postun -n Percona-XtraDB-shared%{package_suffix}
+/sbin/ldconfig
+
+%if %{CLUSTER_BUILD}
+%files -n MySQL%{cluster_package_prefix}storage%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_sbindir}/ndbd
+%doc %attr(644, root, man) %{_mandir}/man8/ndbd.8*
+
+%files -n MySQL%{cluster_package_prefix}management%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_sbindir}/ndb_mgmd
+%doc %attr(644, root, man) %{_mandir}/man8/ndb_mgmd.8*
+
+%files -n MySQL%{cluster_package_prefix}tools%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_bindir}/ndb_config
+%attr(755, root, root) %{_bindir}/ndb_desc
+%attr(755, root, root) %{_bindir}/ndb_error_reporter
+%attr(755, root, root) %{_bindir}/ndb_mgm
+%attr(755, root, root) %{_bindir}/ndb_restore
+%attr(755, root, root) %{_bindir}/ndb_select_all
+%attr(755, root, root) %{_bindir}/ndb_select_count
+%attr(755, root, root) %{_bindir}/ndb_show_tables
+%attr(755, root, root) %{_bindir}/ndb_size.pl
+%attr(755, root, root) %{_bindir}/ndb_test_platform
+%attr(755, root, root) %{_bindir}/ndb_waiter
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_config.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_desc.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_error_reporter.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_mgm.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_restore.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_select_all.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_select_count.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_show_tables.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_size.pl.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_waiter.1*
+
+%files -n MySQL%{cluster_package_prefix}extra%{package_suffix}
+%defattr(-,root,root,0755)
+%attr(755, root, root) %{_bindir}/ndb_delete_all
+%attr(755, root, root) %{_bindir}/ndb_drop_index
+%attr(755, root, root) %{_bindir}/ndb_drop_table
+%attr(755, root, root) %{_sbindir}/ndb_cpcd
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_delete_all.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_drop_index.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_drop_table.1*
+%doc %attr(644, root, man) %{_mandir}/man1/ndb_cpcd.1*
+%endif
+
+%files -n Percona-XtraDB-devel%{package_suffix}
+%defattr(-, root, root, 0755)
+%if %{commercial}
+%else
+%doc EXCEPTIONS-CLIENT
+%endif
+%doc %attr(644, root, man) %{_mandir}/man1/comp_err.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_config.1*
+%attr(755, root, root) %{_bindir}/mysql_config
+%dir %attr(755, root, root) %{_libdir}/mysql
+%{_includedir}/mysql
+%{_datadir}/aclocal/mysql.m4
+%{_libdir}/mysql/libdbug.a
+%{_libdir}/mysql/libheap.a
+%if %{WITH_LIBGCC}
+%{_libdir}/mysql/libmygcc.a
+%endif
+%{_libdir}/mysql/libmyisam.a
+%{_libdir}/mysql/libmyisammrg.a
+%{_libdir}/mysql/libmysqlclient.a
+%{_libdir}/mysql/libmysqlclient.la
+%{_libdir}/mysql/libmysqlclient_r.a
+%{_libdir}/mysql/libmysqlclient_r.la
+%{_libdir}/mysql/libmystrings.a
+%{_libdir}/mysql/libmysys.a
+%{_libdir}/mysql/libvio.a
+%{_libdir}/mysql/libz.a
+%{_libdir}/mysql/libz.la
+%if %{CLUSTER_BUILD}
+%{_libdir}/mysql/libndbclient.a
+%{_libdir}/mysql/libndbclient.la
+%endif
+
+%files -n Percona-XtraDB-shared%{package_suffix}
+%defattr(-, root, root, 0755)
+# Shared libraries (omit for architectures that don't support them)
+%{_libdir}/*.so*
+
+%files -n Percona-XtraDB-test%{package_suffix}
+%defattr(-, root, root, 0755)
+%{_datadir}/mysql-test
+%attr(755, root, root) %{_bindir}/mysql_client_test
+%doc %attr(644, root, man) %{_mandir}/man1/mysql_client_test.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql-stress-test.pl.1*
+%doc %attr(644, root, man) %{_mandir}/man1/mysql-test-run.pl.1*
+
+%files -n Percona-XtraDB-%{pluginversion}-%{xtradbversion}
+%defattr(-, root, root, 0755) 
+%attr(644, root, root) %{_libdir}/mysql/plugin/ha_innodb.so*
+%attr(644, root, root) %{_libdir}/mysql/plugin/install_innodb_plugins.sql
+
+##############################################################################
+# The spec file changelog only includes changes made to the spec file
+# itself - note that they must be ordered by date (important when
+# merging BK trees)
+##############################################################################
+%changelog
+* Mon Mar 22 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+XtraDB Release 10
+
+* Thu Feb 11 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+Package name changed to Percona-XtraDB
+
+* Tue Jan 05 2010 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+- Corrected emails
+- -m64 is removed from CFLAGS
+
+* Tue Apr 21 2009 Aleksandr Kuzminsky <aleksandr.kuzminsky@percona.com>
+
+- Adaptation for the XtraDB Storage Engine
+
+* Fri Nov 07 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Modify CFLAGS and CXXFLAGS such that a debug build is not optimized.
+  This should cover both gcc and icc flags.  Fixes bug#40546.
+
+* Mon Aug 18 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Get rid of the "warning: Installed (but unpackaged) file(s) found:"
+  Some generated files aren't needed in RPMs:
+  - the "sql-bench/" subdirectory
+  Some files were missing:
+  - /usr/share/aclocal/mysql.m4  ("devel" subpackage)
+  - Manuals for embedded tests   ("test" subpackage)
+  - Manual "mysqlbug" ("server" subpackage)
+  - Manual "mysql_find_rows" ("client" subpackage)
+
+* Wed Jun 11 2008 Kent Boortz <kent@mysql.com>
+
+- Removed the Example storage engine, it is not to be in products
+ 
+* Fri Apr 04 2008 Daniel Fischer <df@mysql.com>
+
+- Added Cluster+InnoDB product
+
+* Mon Mar 31 2008 Kent Boortz <kent@mysql.com>
+
+- Made the "Federated" storage engine an option
+
+* Tue Mar 11 2008 Joerg Bruehe <joerg@mysql.com>
+
+- Cleanup: Remove manual file "mysql_tableinfo.1".
+
+* Mon Feb 18 2008 Timothy Smith <tim@mysql.com>
+
+- Require a manual upgrade if the already-installed mysql-server is
+  from another vendor, or is of a different major version.
+
+* Fri Dec 14 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add the "%doc" directive for all man pages and other documentation;
+  also, some re-ordering to reduce differences between spec files.
+
+* Fri Dec 14 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Added "client/mysqlslap" (bug#32077)
+ 
+* Wed Oct 31 2007 Joerg Bruehe <joerg@mysql.com>
+ 
+- Explicitly handle InnoDB using its own variable and "--with"/"--without"
+  options, because the "configure" default is "yes".
+  Also, fix the specification of "community" to include "partitioning".
+ 
+* Mon Sep 03 2007 Kent Boortz <kent@mysql.com>
+
+- Let libmygcc be included unless "--without libgcc" is given.
+
+* Sun Sep 02 2007 Kent Boortz <kent@mysql.com>
+
+- Changed SSL flag given to configure to "--with-ssl"
+- Removed symbolic link "safe_mysqld"
+- Removed script and man page for "mysql_explain_log"
+- Removed scripts "mysql_tableinfo" and "mysql_upgrade_shell"
+- Removed "comp_err" from list to install
+- Removed duplicates of "libndbclient.a" and "libndbclient.la"
+
+* Tue Jul 17 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add the man page for "mysql-stress-test.pl" to the "test" RPM
+  (consistency in fixing bug#21023, the script is handled by "Makefile.am")
+
+* Wed Jul 11 2007 Daniel Fischer <df@mysql.com>
+
+- Change the way broken SELinux policies on RHEL4 and CentOS 4
+  are handled to be more likely to actually work
+
+* Thu Jun 05 2007 kent Boortz <kent@mysql.com>
+
+- Enabled the CSV engine in all builds
+
+* Thu May  3 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- Spring cleanup
+
+* Thu Apr 19 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- If sbin/restorecon exists then run it
+
+* Wed Apr 18 2007 Kent Boortz <kent@mysql.com>
+
+- Packed unpacked files
+
+   /usr/sbin/ndb_cpcd
+   /usr/bin/mysql_upgrade_shell
+   /usr/bin/innochecksum
+   /usr/share/man/man1/ndb_cpcd.1.gz
+   /usr/share/man/man1/innochecksum.1.gz
+   /usr/share/man/man1/mysql_fix_extensions.1.gz
+   /usr/share/man/man1/mysql_secure_installation.1.gz
+   /usr/share/man/man1/mysql_tableinfo.1.gz
+   /usr/share/man/man1/mysql_waitpid.1.gz
+
+- Commands that are currently not installed but have man pages
+
+   /usr/share/man/man1/make_win_bin_dist.1.gz
+   /usr/share/man/man1/make_win_src_distribution.1.gz
+   /usr/share/man/man1/mysql-stress-test.pl.1.gz
+   /usr/share/man/man1/ndb_print_backup_file.1.gz
+   /usr/share/man/man1/ndb_print_schema_file.1.gz
+   /usr/share/man/man1/ndb_print_sys_file.1.gz
+
+* Thu Mar 22 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add "comment" options to the test runs, for better log analysis.
+
+* Wed Mar 21 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add even more man pages.
+
+* Fri Mar 16 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Build the server twice, once as "mysqld-debug" and once as "mysqld";
+  test them both, and include them in the resulting file.
+- Consequences of the fix for bug#20166:
+  Remove "mysql_create_system_tables",
+  new "mysql_fix_privilege_tables.sql" is included implicitly.
+
+* Wed Mar 14 2007 Daniel Fischer <df@mysql.com>
+
+- Adjust compile options some more and change naming of community
+  cluster RPMs to explicitly say 'cluster'.
+
+* Mon Mar 12 2007 Daniel Fischer <df@mysql.com>
+
+- Adjust compile options and other settings for 5.0 community builds.
+
+* Fri Mar 02 2007 Joerg Bruehe <joerg@mysql.com>
+
+- Add several man pages which are now created.
+
+* Mon Jan 29 2007 Mads Martin Joergensen <mmj@mysql.com>
+
+- Make sure SELinux works correctly. Files from Colin Charles.
+
+* Fri Jan 05 2007 Kent Boortz <kent@mysql.com>
+
+- Add CFLAGS to gcc call with --print-libgcc-file, to make sure the
+  correct "libgcc.a" path is returned for the 32/64 bit architecture.
+
+* Tue Dec 19 2006 Joerg Bruehe <joerg@mysql.com>
+
+- The man page for "mysqld" is now in section 8.
+
+* Thu Dec 14 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Include the new man pages for "my_print_defaults" and "mysql_tzinfo_to_sql"
+  in the server RPM.
+- The "mysqlmanager" man page was relocated to section 8, reflect that.
+
+* Fri Nov 17 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- Really fix obsoletes/provides for community -> this
+- Make it possible to not run test by setting
+  MYSQL_RPMBUILD_TEST to "no"
+
+* Wed Nov 15 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Switch from "make test*" to explicit calls of the test suite,
+  so that "report features" can be used.
+
+* Wed Nov 15 2006 Kent Boortz <kent@mysql.com>
+
+- Added "--with cluster" and "--define cluster{_gpl}"
+
+* Tue Oct 24 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- Shared need to Provide/Obsolete mysql-shared
+
+* Mon Oct 23 2006 Mads Martin Joergensen <mmj@mysql.com>
+
+- Run sbin/restorecon after db init (Bug#12676)
+
+* Thu Jul 06 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Correct a typing error in my previous change.
+
+* Tue Jul 04 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Use the Perl script to run the tests, because it will automatically check
+  whether the server is configured with SSL.
+
+* Wed Jun 28 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Revert all previous attempts to call "mysql_upgrade" during RPM upgrade,
+  there are some more aspects which need to be solved before this is possible.
+  For now, just ensure the binary "mysql_upgrade" is delivered and installed.
+
+* Wed Jun 28 2006 Joerg Bruehe <joerg@mysql.com>
+
+- Move "mysqldumpslow" from the client RPM to the server RPM (bug#20216).
+
+* Wed Jun 21 2006 Joerg Bruehe <joerg@mysql.com>
+
+- To run "mysql_upgrade", we need a running server;
+  start it in isolation and skip password checks.
+
+* Sat May 23 2006 Kent Boortz <kent@mysql.com>
+
+- Always compile for PIC, position independent code.
+
+* Fri Apr 28 2006 Kent Boortz <kent@mysql.com>
+
+- Install and run "mysql_upgrade"
+
+* Sat Apr 01 2006 Kent Boortz <kent@mysql.com>
+
+- Allow to override $LDFLAGS
+
+* Fri Jan 06 2006 Lenz Grimmer <lenz@mysql.com>
+
+- added a MySQL-test subpackage (BUG#16070)
+
+* Tue Dec 27 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Some minor alignment with the 4.1 version
+
+* Wed Dec 14 2005 Rodrigo Novo <rodrigo@mysql.com>
+
+- Cosmetic changes: source code location & rpm packager
+- Protect "nm -D" against libtool weirdness
+- Add libz.a & libz.la to the list of files for subpackage -devel
+- moved --with-zlib-dir=bundled out of BuildMySQL, as it doesn't make
+  sense for the shared package
+
+* Tue Nov 22 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Extend the file existence check for "init.d/mysql" on un-install
+  to also guard the call to "insserv"/"chkconfig".
+
+* Wed Nov 16 2005 Lenz Grimmer <lenz@mysql.com>
+
+- added mysql_client_test to the "client" subpackage (BUG#14546)
+
+* Tue Nov 15 2005 Lenz Grimmer <lenz@mysql.com>
+
+- changed default definitions to build a standard GPL release when not
+  defining anything else
+- install the shared libs more elegantly by using "make install"
+
+* Wed Oct 19 2005 Kent Boortz <kent@mysql.com>
+
+- Made yaSSL support an option (off by default)
+
+* Wed Oct 19 2005 Kent Boortz <kent@mysql.com>
+
+- Enabled yaSSL support
+
+* Thu Oct 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- added a usermod call to assign a potential existing mysql user to the
+  correct user group (BUG#12823)
+- added a separate macro "mysqld_group" to be able to define the
+  user group of the mysql user separately, if desired.
+
+* Fri Oct 1 2005 Kent Boortz <kent@mysql.com>
+
+- Copy the config.log file to location outside
+  the build tree
+
+* Fri Sep 30 2005 Lenz Grimmer <lenz@mysql.com>
+
+- don't use install-strip to install the binaries (strip segfaults on
+  icc-compiled binaries on IA64)
+
+* Thu Sep 22 2005 Lenz Grimmer <lenz@mysql.com>
+
+- allow overriding the CFLAGS (needed for Intel icc compiles)
+- replace the CPPFLAGS=-DBIG_TABLES with "--with-big-tables" configure option
+
+* Fri Aug 19 2005 Joerg Bruehe <joerg@mysql.com>
+
+- Protect against failing tests.
+
+* Thu Aug 04 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed the creation of the mysql user group account in the postinstall
+  section (BUG 12348)
+
+* Fri Jul 29 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed external RPM Requirements to better suit the target distribution
+  (BUG 12233)
+
+* Fri Jul 15 2005 Lenz Grimmer <lenz@mysql.com>
+
+- create a "mysql" user group and assign the mysql user account to that group
+  in the server postinstall section. (BUG 10984)
+
+* Wed Jun 01 2005 Lenz Grimmer <lenz@mysql.com>
+
+- use "mysqldatadir" variable instead of hard-coding the path multiple times
+- use the "mysqld_user" variable on all occasions a user name is referenced
+- removed (incomplete) Brazilian translations
+- removed redundant release tags from the subpackage descriptions
+
+* Fri May 27 2005 Lenz Grimmer <lenz@mysql.com>
+
+- fixed file list (removed libnisam.a and libmerge.a from the devel subpackage)
+- force running the test suite
+
+* Wed Apr 20 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Enabled the "blackhole" storage engine for the Max RPM
+
+* Wed Apr 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- removed the MySQL manual files (html/ps/texi) - they have been removed
+  from the MySQL sources and are now available separately.
+
+* Mon Apr 4 2005 Petr Chardin <petr@mysql.com>
+
+- old mysqlmanager, mysqlmanagerc and mysqlmanager-pwgen renamed into
+  mysqltestmanager, mysqltestmanagerc and mysqltestmanager-pwgen respectively
+
+* Fri Mar 18 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Disabled RAID in the Max binaries once and for all (it has finally been
+  removed from the source tree)
+
+* Sun Feb 20 2005 Petr Chardin <petr@mysql.com>
+
+- Install MySQL Instance Manager together with mysqld, touch mysqlmanager
+  password file
+
+* Mon Feb 14 2005 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed the compilation comments and moved them into the separate build sections
+  for Max and Standard
+
+* Mon Feb 7 2005 Tomas Ulin <tomas@mysql.com>
+
+- enabled the "Ndbcluster" storage engine for the max binary
+- added extra make install in ndb subdir after Max build to get ndb binaries
+- added packages for ndbcluster storage engine
+
+* Fri Jan 14 2005 Lenz Grimmer <lenz@mysql.com>
+
+- replaced obsoleted "BuildPrereq" with "BuildRequires" instead
+
+* Thu Jan 13 2005 Lenz Grimmer <lenz@mysql.com>
+
+- enabled the "Federated" storage engine for the max binary
+
+* Tue Jan 04 2005 Petr Chardin <petr@mysql.com>
+
+- ISAM and merge storage engines were purged. As well as appropriate
+  tools and manpages (isamchk and isamlog)
+
+* Thu Dec 31 2004 Lenz Grimmer <lenz@mysql.com>
+
+- enabled the "Archive" storage engine for the max binary
+- enabled the "CSV" storage engine for the max binary
+- enabled the "Example" storage engine for the max binary
+
+* Thu Aug 26 2004 Lenz Grimmer <lenz@mysql.com>
+
+- MySQL-Max now requires MySQL-server instead of MySQL (BUG 3860)
+
+* Fri Aug 20 2004 Lenz Grimmer <lenz@mysql.com>
+
+- do not link statically on IA64/AMD64 as these systems do not have
+  a patched glibc installed
+
+* Tue Aug 10 2004 Lenz Grimmer <lenz@mysql.com>
+
+- Added libmygcc.a to the devel subpackage (required to link applications
+  against the embedded server libmysqld.a) (BUG 4921)
+
+* Mon Aug 09 2004 Lenz Grimmer <lenz@mysql.com>
+
+- Added EXCEPTIONS-CLIENT to the "devel" package
+
+* Thu Jul 29 2004 Lenz Grimmer <lenz@mysql.com>
+
+- disabled OpenSSL in the Max binaries again (the RPM packages were the
+  only exception to this anyway) (BUG 1043)
+
+* Wed Jun 30 2004 Lenz Grimmer <lenz@mysql.com>
+
+- fixed server postinstall (mysql_install_db was called with the wrong
+  parameter)
+
+* Thu Jun 24 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added mysql_tzinfo_to_sql to the server subpackage
+- run "make clean" instead of "make distclean"
+
+* Mon Apr 05 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added ncurses-devel to the build prerequisites (BUG 3377)
+
+* Thu Feb 12 2004 Lenz Grimmer <lenz@mysql.com>
+
+- when using gcc, _always_ use CXX=gcc 
+- replaced Copyright with License field (Copyright is obsolete)
+
+* Tue Feb 03 2004 Lenz Grimmer <lenz@mysql.com>
+
+- added myisam_ftdump to the Server package
+
+* Tue Jan 13 2004 Lenz Grimmer <lenz@mysql.com>
+
+- link the mysql client against libreadline instead of libedit (BUG 2289)
+
+* Mon Dec 22 2003 Lenz Grimmer <lenz@mysql.com>
+
+- marked /etc/logrotate.d/mysql as a config file (BUG 2156)
+
+* Fri Dec 13 2003 Lenz Grimmer <lenz@mysql.com>
+
+- fixed file permissions (BUG 1672)
+
+* Thu Dec 11 2003 Lenz Grimmer <lenz@mysql.com>
+
+- made testing for gcc3 a bit more robust
+
+* Fri Dec 05 2003 Lenz Grimmer <lenz@mysql.com>
+
+- added missing file mysql_create_system_tables to the server subpackage
+
+* Fri Nov 21 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed dependency on MySQL-client from the MySQL-devel subpackage
+  as it is not really required. (BUG 1610)
+
+* Fri Aug 29 2003 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed BUG 1162 (removed macro names from the changelog)
+- Really fixed BUG 998 (disable the checking for installed but
+  unpackaged files)
+
+* Tue Aug 05 2003 Lenz Grimmer <lenz@mysql.com>
+
+- Fixed BUG 959 (libmysqld not being compiled properly)
+- Fixed BUG 998 (RPM build errors): added missing files to the
+  distribution (mysql_fix_extensions, mysql_tableinfo, mysqldumpslow,
+  mysql_fix_privilege_tables.1), removed "-n" from install section.
+
+* Wed Jul 09 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed the GIF Icon (file was not included in the sources anyway)
+- removed unused variable shared_lib_version
+- do not run automake before building the standard binary
+  (should not be necessary)
+- add server suffix '-standard' to standard binary (to be in line
+  with the binary tarball distributions)
+- Use more RPM macros (_exec_prefix, _sbindir, _libdir, _sysconfdir,
+  _datadir, _includedir) throughout the spec file.
+- allow overriding CC and CXX (required when building with other compilers)
+
+* Fri May 16 2003 Lenz Grimmer <lenz@mysql.com>
+
+- re-enabled RAID again
+
+* Wed Apr 30 2003 Lenz Grimmer <lenz@mysql.com>
+
+- disabled MyISAM RAID (--with-raid) - it throws an assertion which
+  needs to be investigated first.
+
+* Mon Mar 10 2003 Lenz Grimmer <lenz@mysql.com>
+
+- added missing file mysql_secure_installation to server subpackage
+  (BUG 141)
+
+* Tue Feb 11 2003 Lenz Grimmer <lenz@mysql.com>
+
+- re-added missing pre- and post(un)install scripts to server subpackage
+- added config file /etc/my.cnf to the file list (just for completeness)
+- make sure to create the datadir with 755 permissions
+
+* Mon Jan 27 2003 Lenz Grimmer <lenz@mysql.com>
+
+- removed unused CC and CXX variables
+- CFLAGS and CXXFLAGS should honor RPM_OPT_FLAGS
+
+* Fri Jan 24 2003 Lenz Grimmer <lenz@mysql.com>
+
+- renamed package "MySQL" to "MySQL-server"
+- fixed Copyright tag
+- added mysql_waitpid to client subpackage (required for mysql-test-run)
+
+* Wed Nov 27 2002 Lenz Grimmer <lenz@mysql.com>
+
+- moved init script from /etc/rc.d/init.d to /etc/init.d (the majority of 
+  Linux distributions now support this scheme as proposed by the LSB either
+  directly or via a compatibility symlink)
+- Use new "restart" init script action instead of starting and stopping
+  separately
+- Be more flexible in activating the automatic bootup - use insserv (on
+  older SuSE versions) or chkconfig (Red Hat, newer SuSE versions and
+  others) to create the respective symlinks
+
+* Wed Sep 25 2002 Lenz Grimmer <lenz@mysql.com>
+
+- MySQL-Max now requires MySQL >= 4.0 to avoid version mismatches
+  (mixing 3.23 and 4.0 packages)
+
+* Fri Aug 09 2002 Lenz Grimmer <lenz@mysql.com>
+ 
+- Turn off OpenSSL in MySQL-Max for now until it works properly again
+- enable RAID for the Max binary instead
+- added compatibility link: safe_mysqld -> mysqld_safe to ease the
+  transition from 3.23
+
+* Thu Jul 18 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Reworked the build steps a little bit: the Max binary is supposed
+  to include OpenSSL, which cannot be linked statically, thus trying
+  to statically link against a special glibc is futile anyway
+- because of this, it is not required to make yet another build run
+  just to compile the shared libs (saves a lot of time)
+- updated package description of the Max subpackage
+- clean up the BuildRoot directory afterwards
+
+* Mon Jul 15 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Updated Packager information
+- Fixed the build options: the regular package is supposed to
+  include InnoDB and linked statically, while the Max package
+  should include BDB and SSL support
+
+* Fri May 03 2002 Lenz Grimmer <lenz@mysql.com>
+
+- Use more RPM macros (e.g. infodir, mandir) to make the spec
+  file more portable
+- reorganized the installation of documentation files: let RPM
+  take care of this
+- reorganized the file list: actually install man pages along
+  with the binaries of the respective subpackage
+- do not include libmysqld.a in the devel subpackage as well, if we
+  have a special "embedded" subpackage
+- reworked the package descriptions
+
+* Mon Oct  8 2001 Monty
+
+- Added embedded server as a separate RPM
+
+* Fri Apr 13 2001 Monty
+
+- Added mysqld-max to the distribution
+
+* Tue Jan 2  2001  Monty
+
+- Added mysql-test to the bench package
+
+* Fri Aug 18 2000 Tim Smith <tim@mysql.com>
+
+- Added separate libmysql_r directory; now both a threaded
+  and non-threaded library is shipped.
+
+* Wed Sep 28 1999 David Axmark <davida@mysql.com>
+
+- Added the support-files/my-example.cnf to the docs directory.
+
+- Removed devel dependency on base since it is about client
+  development.
+
+* Wed Sep 8 1999 David Axmark <davida@mysql.com>
+
+- Cleaned up some for 3.23.
+
+* Thu Jul 1 1999 David Axmark <davida@mysql.com>
+
+- Added support for shared libraries in a separate sub
+  package. Original fix by David Fox (dsfox@cogsci.ucsd.edu)
+
+- The --enable-assembler switch is now automatically disabled on
+  platforms where assembler code is unavailable. This should allow
+  building this RPM on non i386 systems.
+
+* Mon Feb 22 1999 David Axmark <david@detron.se>
+
+- Removed unportable cc switches from the spec file. The defaults can
+  now be overridden with environment variables. This feature is used
+  to compile the official RPM with optimal (but compiler version
+  specific) switches.
+
+- Removed the repetitive description parts for the sub rpms. Maybe add
+  again if RPM gets a multiline macro capability.
+
+- Added support for a pt_BR translation. Translation contributed by
+  Jorge Godoy <jorge@bestway.com.br>.
+
+* Wed Nov 4 1998 David Axmark <david@detron.se>
+
+- A lot of changes in all the rpm and install scripts. This may even
+  be a working RPM :-)
+
+* Sun Aug 16 1998 David Axmark <david@detron.se>
+
+- A developers changelog for MySQL is available in the source RPM. And
+  there is a history of major user-visible changes in the Reference
+  Manual.  Only RPM specific changes will be documented here.
diff --git a/storage/xtradb/data/data0data.c b/storage/xtradb/data/data0data.c
new file mode 100644
index 00000000000..0715b49bf9c
--- /dev/null
+++ b/storage/xtradb/data/data0data.c
@@ -0,0 +1,779 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file data/data0data.c
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "data0data.h"
+
+#ifdef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "rem0rec.h"
+#include "rem0cmp.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+
+#include <ctype.h>
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields.  In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+UNIV_INTERN byte	data_error;
+
+# ifndef UNIV_DEBUG_VALGRIND
+/** Sum of bytes read in dtuple_validate(); keeps the reads from being elided */
+UNIV_INTERN ulint	data_dummy;
+# endif /* !UNIV_DEBUG_VALGRIND */
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Compares the length and the raw bytes of a data field to a given buffer.
+@return	TRUE if both the length and the content match */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data)	/*!< in: data */
+{
+	/* Differing lengths (this includes NULL versus non-NULL)
+	can never be equal. */
+	if (dfield_get_len(field) != len) {
+
+		return(FALSE);
+	}
+
+	/* Two SQL NULLs are equal; there are no bytes to compare. */
+	if (len == UNIV_SQL_NULL) {
+
+		return(TRUE);
+	}
+
+	/* Same non-NULL length: the bytes decide. */
+	return(memcmp(dfield_get_data(field), data, len) == 0
+	       ? TRUE : FALSE);
+}
+
+/************************************************************//**
+Compares two data tuples field by field, respecting character collations.
+@return 1, 0 , -1 if tuple1 is greater, equal, less, respectively, than
+tuple2; a tuple with fewer fields is always less */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+	const dtuple_t*	tuple1,	/*!< in: tuple 1 */
+	const dtuple_t*	tuple2)	/*!< in: tuple 2 */
+{
+	ulint	n1;
+	ulint	n2;
+	ulint	pos;
+
+	ut_ad(tuple1 && tuple2);
+	ut_ad(tuple1->magic_n == DATA_TUPLE_MAGIC_N);
+	ut_ad(tuple2->magic_n == DATA_TUPLE_MAGIC_N);
+	ut_ad(dtuple_check_typed(tuple1));
+	ut_ad(dtuple_check_typed(tuple2));
+
+	n1 = dtuple_get_n_fields(tuple1);
+	n2 = dtuple_get_n_fields(tuple2);
+
+	if (n1 != n2) {
+		return(n1 < n2 ? -1 : 1);
+	}
+
+	/* Equal field counts: the first differing field decides. */
+	for (pos = 0; pos < n1; pos++) {
+		int	diff = cmp_dfield_dfield(
+			dtuple_get_nth_field(tuple1, pos),
+			dtuple_get_nth_field(tuple2, pos));
+
+		if (diff != 0) {
+			return(diff);
+		}
+	}
+
+	return(0);
+}
+
+/*********************************************************************//**
+Sets the number of fields used in a tuple (and n_fields_cmp along with it).
+Normally set in dtuple_create; use this to make the count smaller later. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	ut_ad(tuple);
+
+	tuple->n_fields = n_fields;
+	tuple->n_fields_cmp = n_fields;
+}
+
+/**********************************************************//**
+Checks that the main type of a data field is in the valid range.
+@return	TRUE if ok; FALSE after reporting the field to stderr */
+static
+ibool
+dfield_check_typed_no_assert(
+/*=========================*/
+	const dfield_t*	field)	/*!< in: data field */
+{
+	const ulint	mtype = dfield_get_type(field)->mtype;
+
+	if (mtype >= DATA_VARCHAR && mtype <= DATA_MYSQL) {
+		return(TRUE);
+	}
+
+	fprintf(stderr,
+		"InnoDB: Error: data field type %lu, len %lu\n",
+		(ulong) mtype, (ulong) dfield_get_len(field));
+
+	return(FALSE);
+}
+
+/**********************************************************//**
+Checks that a data tuple has a valid field count and only typed fields.
+@return	TRUE if ok; FALSE after printing the tuple to stderr */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint	i;
+	ibool	ok = TRUE;
+
+	if (n_fields > REC_MAX_N_FIELDS) {
+		fprintf(stderr,
+			"InnoDB: Error: index entry has %lu fields\n",
+			(ulong) n_fields);
+		ok = FALSE;
+	}
+
+	/* Stop at the first field that fails the type check. */
+	for (i = 0; ok && i < n_fields; i++) {
+		ok = dfield_check_typed_no_assert(
+			dtuple_get_nth_field(tuple, i));
+	}
+
+	/* Dump the whole tuple to help diagnose the failure. */
+	if (!ok) {
+		fputs("InnoDB: Tuple contents: ", stderr);
+		dtuple_print(stderr, tuple);
+		putc('\n', stderr);
+	}
+
+	return(ok);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks that the main type of a data field is valid. Asserts if it is not.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field)	/*!< in: data field */
+{
+	const ulint	mtype = dfield_get_type(field)->mtype;
+
+	if (mtype < DATA_VARCHAR || mtype > DATA_MYSQL) {
+		fprintf(stderr,
+			"InnoDB: Error: data field type %lu, len %lu\n",
+			(ulong) mtype, (ulong) dfield_get_len(field));
+
+		/* An untyped field is a fatal error in debug builds. */
+		ut_error;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************//**
+Checks that every field of a data tuple is typed. Asserts if one is not.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint		pos;
+
+	/* dfield_check_typed() itself asserts on an untyped field;
+	ut_a() additionally guards its return value. */
+	for (pos = 0; pos < n_fields; pos++) {
+
+		ut_a(dfield_check_typed(dtuple_get_nth_field(tuple, pos)));
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************//**
+Validates a complete tuple, i.e., one whose fields have all been set:
+reads the data of every non-NULL field and checks that all fields are typed.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ulint	n_fields;
+	ulint	pos;
+
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+	n_fields = dtuple_get_n_fields(tuple);
+
+	/* Touch every byte of every non-NULL field, so that a field
+	pointing to inaccessible memory traps here. */
+
+	for (pos = 0; pos < n_fields; pos++) {
+		const dfield_t*	field	= dtuple_get_nth_field(tuple, pos);
+		ulint		len	= dfield_get_len(field);
+		const byte*	data;
+
+		if (dfield_is_null(field)) {
+
+			continue;
+		}
+
+		data = dfield_get_data(field);
+#ifndef UNIV_DEBUG_VALGRIND
+		{
+			ulint	left;
+
+			/* Summing into a global keeps the compiler
+			from optimizing the reads away. */
+			for (left = len; left > 0; left--) {
+				data_dummy += *data++;
+			}
+		}
+#endif /* !UNIV_DEBUG_VALGRIND */
+
+		UNIV_MEM_ASSERT_RW(data, len);
+	}
+
+	ut_a(dtuple_check_typed(tuple));
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Pretty prints a dfield value to stderr according to its data type.
+Only DATA_CHAR, DATA_VARCHAR and 4-byte DATA_INT fields are supported;
+any other main type raises ut_error. A NULL field prints as "NULL". */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+{
+	ulint		len	= dfield_get_len(dfield);
+	const byte*	ptr	= dfield_get_data(dfield);
+	ulint		mtype;
+
+	if (dfield_is_null(dfield)) {
+		fputs("NULL", stderr);
+
+		return;
+	}
+
+	mtype = dtype_get_mtype(dfield_get_type(dfield));
+
+	if (mtype == DATA_CHAR || mtype == DATA_VARCHAR) {
+		/* Unprintable characters are shown as spaces. */
+		for (; len > 0; len--, ptr++) {
+			putc(isprint(*ptr) ? *ptr : ' ', stderr);
+		}
+
+		if (dfield_is_ext(dfield)) {
+			fputs("(external)", stderr);
+		}
+	} else if (mtype == DATA_INT) {
+		/* Only 32-bit integers can be printed here. */
+		ut_a(len == 4);
+
+		fprintf(stderr, "%d", (int) mach_read_from_4(ptr));
+	} else {
+		/* No other main type is supported. */
+		ut_error;
+	}
+}
+
+/*************************************************************//**
+Pretty prints a dfield value to stderr according to its data type. If a
+string has non-printable characters, its hex dump is printed as well. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+	const dfield_t*	dfield)	/*!< in: dfield */
+{
+	const byte*	data;
+	ulint		len;
+	ulint		prtype;
+	ulint		i;
+	ibool		print_also_hex;
+
+	len = dfield_get_len(dfield);
+	data = dfield_get_data(dfield);
+
+	if (dfield_is_null(dfield)) {
+		fputs("NULL", stderr);
+
+		return;
+	}
+
+	prtype = dtype_get_prtype(dfield_get_type(dfield));
+	/* All %lu and %lx arguments below are cast to ulong to match. */
+	switch (dtype_get_mtype(dfield_get_type(dfield))) {
+		dulint	id;
+	case DATA_INT:
+		switch (len) {
+			ulint	val;
+		case 1:
+			val = mach_read_from_1(data);
+
+			if (!(prtype & DATA_UNSIGNED)) {
+				val &= ~0x80;
+				fprintf(stderr, "%ld", (long) val);
+			} else {
+				fprintf(stderr, "%lu", (ulong) val);
+			}
+			break;
+
+		case 2:
+			val = mach_read_from_2(data);
+
+			if (!(prtype & DATA_UNSIGNED)) {
+				val &= ~0x8000;
+				fprintf(stderr, "%ld", (long) val);
+			} else {
+				fprintf(stderr, "%lu", (ulong) val);
+			}
+			break;
+
+		case 3:
+			val = mach_read_from_3(data);
+
+			if (!(prtype & DATA_UNSIGNED)) {
+				val &= ~0x800000;
+				fprintf(stderr, "%ld", (long) val);
+			} else {
+				fprintf(stderr, "%lu", (ulong) val);
+			}
+			break;
+
+		case 4:
+			val = mach_read_from_4(data);
+
+			if (!(prtype & DATA_UNSIGNED)) {
+				val &= ~0x80000000;
+				fprintf(stderr, "%ld", (long) val);
+			} else {
+				fprintf(stderr, "%lu", (ulong) val);
+			}
+			break;
+
+		case 6:
+			id = mach_read_from_6(data);
+			fprintf(stderr, "{%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+			break;
+
+		case 7:
+			id = mach_read_from_7(data);
+			fprintf(stderr, "{%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+			break;
+		case 8:
+			id = mach_read_from_8(data);
+			fprintf(stderr, "{%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+			break;
+		default:
+			goto print_hex;
+		}
+		break;
+
+	case DATA_SYS:
+		switch (prtype & DATA_SYS_PRTYPE_MASK) {
+		case DATA_TRX_ID:
+			id = mach_read_from_6(data);
+
+			fprintf(stderr, "trx_id " TRX_ID_FMT,
+				TRX_ID_PREP_PRINTF(id));
+			break;
+
+		case DATA_ROLL_PTR:
+			id = mach_read_from_7(data);
+			fprintf(stderr, "roll_ptr {%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+			break;
+
+		case DATA_ROW_ID:
+			id = mach_read_from_6(data);
+			fprintf(stderr, "row_id {%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+			break;
+
+		default:
+			id = mach_dulint_read_compressed(data);
+			fprintf(stderr, "mix_id {%lu %lu}",
+				(ulong) ut_dulint_get_high(id),
+				(ulong) ut_dulint_get_low(id));
+		}
+		break;
+
+	case DATA_CHAR:
+	case DATA_VARCHAR:
+		print_also_hex = FALSE;
+
+		for (i = 0; i < len; i++) {
+			int c = *data++;
+
+			if (!isprint(c)) {
+				print_also_hex = TRUE;
+
+				fprintf(stderr, "\\x%02x", (unsigned char) c);
+			} else {
+				putc(c, stderr);
+			}
+		}
+
+		if (dfield_is_ext(dfield)) {
+			fputs("(external)", stderr);
+		}
+
+		if (!print_also_hex) {
+			break;
+		}
+
+		data = dfield_get_data(dfield);
+		/* fall through */
+
+	case DATA_BINARY:
+	default:
+print_hex:
+		fputs(" Hex: ",stderr);
+
+		for (i = 0; i < len; i++) {
+			fprintf(stderr, "%02lx", (ulong) *data++);
+		}
+
+		if (dfield_is_ext(dfield)) {
+			fputs("(external)", stderr);
+		}
+	}
+}
+
+/*************************************************************//**
+Prints at most the first 1000 bytes of a dfield with ut_print_buf. */
+static
+void
+dfield_print_raw(
+/*=============*/
+	FILE*		f,		/*!< in: output stream */
+	const dfield_t*	dfield)		/*!< in: dfield */
+{
+	const ulint	len	= dfield_get_len(dfield);
+	const ulint	shown	= ut_min(len, 1000);
+
+	if (dfield_is_null(dfield)) {
+		fputs(" SQL NULL", f);
+		return;
+	}
+	ut_print_buf(f, dfield_get_data(dfield), shown);
+	if (shown != len) {
+		fprintf(f, "(total %lu bytes%s)", (ulong) len,
+			dfield_is_ext(dfield) ? ", external" : "");
+	}
+}
+
+/**********************************************************//**
+Prints the field count and the raw contents of each field of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+	FILE*		f,	/*!< in: output stream */
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint		pos;
+
+	fprintf(f, "DATA TUPLE: %lu fields;\n", (ulong) n_fields);
+
+	/* One line per field: " <index>:<raw value>;" */
+	for (pos = 0; pos < n_fields; pos++) {
+		const dfield_t*	field = dtuple_get_nth_field(tuple, pos);
+
+		fprintf(f, " %lu:", (ulong) pos);
+		dfield_print_raw(f, field);
+		putc(';', f);
+		putc('\n', f);
+	}
+
+	/* In debug builds, also check the tuple that was printed. */
+	ut_ad(dtuple_validate(tuple));
+}
+
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is not clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in/out: index entry */
+	ulint*		n_ext)	/*!< in/out: number of
+				externally stored columns */
+{
+	mem_heap_t*	heap;
+	big_rec_t*	vector;
+	dfield_t*	dfield;
+	dict_field_t*	ifield;
+	ulint		size;
+	ulint		n_fields;
+	ulint		local_len;
+	ulint		local_prefix_len;
+
+	if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+		return(NULL);
+	}
+
+	if (dict_table_get_format(index->table) < DICT_TF_FORMAT_ZIP) {
+		/* up to MySQL 5.1: store a 768-byte prefix locally */
+		local_len = BTR_EXTERN_FIELD_REF_SIZE + DICT_MAX_INDEX_COL_LEN;
+	} else {
+		/* new-format table: do not store any BLOB prefix locally */
+		local_len = BTR_EXTERN_FIELD_REF_SIZE;
+	}
+
+	ut_a(dtuple_check_typed_no_assert(entry));
+
+	size = rec_get_converted_size(index, entry, *n_ext);
+
+	if (UNIV_UNLIKELY(size > 1000000000)) {
+		fprintf(stderr,
+			"InnoDB: Warning: tuple size very big: %lu\n",
+			(ulong) size);
+		fputs("InnoDB: Tuple contents: ", stderr);
+		dtuple_print(stderr, entry);
+		putc('\n', stderr);
+	}
+
+	heap = mem_heap_create(size + dtuple_get_n_fields(entry)
+			       * sizeof(big_rec_field_t) + 1000);
+
+	vector = mem_heap_alloc(heap, sizeof(big_rec_t));
+
+	vector->heap = heap;
+	vector->fields = mem_heap_alloc(heap, dtuple_get_n_fields(entry)
+					* sizeof(big_rec_field_t));
+
+	/* Decide which fields to shorten: the algorithm is to look for
+	a variable-length field that yields the biggest savings when
+	stored externally */
+
+	n_fields = 0;
+
+	while (page_zip_rec_needs_ext(rec_get_converted_size(index, entry,
+							     *n_ext),
+				      dict_table_is_comp(index->table),
+				      dict_index_get_n_fields(index),
+				      dict_table_zip_size(index->table))) {
+		ulint			i;
+		ulint			longest		= 0;
+		ulint			longest_i	= ULINT_MAX;
+		byte*			data;
+		big_rec_field_t*	b;
+
+		for (i = dict_index_get_n_unique_in_tree(index);
+		     i < dtuple_get_n_fields(entry); i++) {
+			ulint	savings;
+
+			dfield = dtuple_get_nth_field(entry, i);
+			ifield = dict_index_get_nth_field(index, i);
+
+			/* Skip fixed-length, NULL, externally stored,
+			or short columns */
+
+			if (ifield->fixed_len
+			    || dfield_is_null(dfield)
+			    || dfield_is_ext(dfield)
+			    || dfield_get_len(dfield) <= local_len
+			    || dfield_get_len(dfield)
+			    <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+				goto skip_field;
+			}
+
+			savings = dfield_get_len(dfield) - local_len;
+
+			/* Skip unless the savings beat the best so far */
+			if (longest >= savings) {
+				goto skip_field;
+			}
+
+			/* In DYNAMIC and COMPRESSED format, store
+			locally any non-BLOB columns whose maximum
+			length is less than 256 bytes.  This is
+			because there is no room for the "external
+			storage" flag when the maximum length is 255
+			bytes or less. This restriction trivially
+			holds in REDUNDANT and COMPACT format, because
+			there we always store locally columns whose
+			length is up to local_len == 788 bytes.
+			@see rec_init_offsets_comp_ordinary */
+			if (ifield->col->mtype != DATA_BLOB
+			    && ifield->col->len < 256) {
+				goto skip_field;
+			}
+
+			longest_i = i;
+			longest = savings;
+
+skip_field:
+			continue;
+		}
+
+		if (!longest) {
+			/* Cannot shorten more */
+
+			mem_heap_free(heap);
+
+			return(NULL);
+		}
+
+		/* Move data from field longest_i to big rec vector.
+
+		We store the first bytes locally to the record. Then
+		we can calculate all ordering fields in all indexes
+		from locally stored data. */
+
+		dfield = dtuple_get_nth_field(entry, longest_i);
+		ifield = dict_index_get_nth_field(index, longest_i);
+		local_prefix_len = local_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+		b = &vector->fields[n_fields];
+		b->field_no = longest_i;
+		b->len = dfield_get_len(dfield) - local_prefix_len;
+		b->data = (char*) dfield_get_data(dfield) + local_prefix_len;
+
+		/* Allocate the locally stored part of the column. */
+		data = mem_heap_alloc(heap, local_len);
+
+		/* Copy the local prefix. */
+		memcpy(data, dfield_get_data(dfield), local_prefix_len);
+		/* Clear the extern field reference (BLOB pointer). */
+		memset(data + local_prefix_len, 0, BTR_EXTERN_FIELD_REF_SIZE);
+#if 0
+		/* The following would fail the Valgrind checks in
+		page_cur_insert_rec_low() and page_cur_insert_rec_zip().
+		The BLOB pointers in the record will be initialized after
+		the record and the BLOBs have been written. */
+		UNIV_MEM_ALLOC(data + local_prefix_len,
+			       BTR_EXTERN_FIELD_REF_SIZE);
+#endif
+
+		dfield_set_data(dfield, data, local_len);
+		dfield_set_ext(dfield);
+
+		n_fields++;
+		(*n_ext)++;
+		ut_ad(n_fields < dtuple_get_n_fields(entry));
+	}
+
+	vector->n_fields = n_fields;
+	return(vector);
+}
+
+/**************************************************************//**
+Restores into entry the column data that was moved to vector, and frees
+the vector. The fields of entry can hold the data only if vector was
+created from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+	dict_index_t*	index __attribute__((unused)),	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: entry whose data was put to vector */
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
+				freed in this function */
+{
+	ulint	i;
+
+	for (i = 0; i < vector->n_fields; i++) {
+		const big_rec_field_t*	big	= &vector->fields[i];
+		dfield_t*		dfield;
+		ulint			prefix_len;
+
+		dfield = dtuple_get_nth_field(entry, big->field_no);
+		ut_ad(dfield_is_ext(dfield));
+		ut_ad(dfield_get_len(dfield) >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		/* The local part is the prefix followed by the BLOB
+		pointer; the moved data directly follows the prefix. */
+		prefix_len = dfield_get_len(dfield)
+			- BTR_EXTERN_FIELD_REF_SIZE;
+
+		ut_ad(prefix_len <= DICT_MAX_INDEX_COL_LEN);
+
+		dfield_set_data(dfield, (char*) big->data - prefix_len,
+				big->len + prefix_len);
+	}
+
+	mem_heap_free(vector->heap);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/data/data0type.c b/storage/xtradb/data/data0type.c
new file mode 100644
index 00000000000..e834fd2ec55
--- /dev/null
+++ b/storage/xtradb/data/data0type.c
@@ -0,0 +1,297 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file data/data0type.c
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+
+#ifdef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/* At the database startup we store the default-charset collation number of
+this MySQL installation to this global variable. If we have < 4.1.2 format
+column definitions, or records in the insert buffer, we use this
+charset-collation code for them. */
+
+UNIV_INTERN ulint	data_mysql_default_charset_coll;
+
+/*********************************************************************//**
+Computes the number of bytes taken by the first n characters of a string.
+For a string of fewer than n characters, this is the number of bytes that
+all of its characters take.
+@return	length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+	ulint		prtype,		/*!< in: precise type */
+	ulint		mbminlen,	/*!< in: minimum length of a
+					multi-byte character */
+	ulint		mbmaxlen,	/*!< in: maximum length of a
+					multi-byte character */
+	ulint		prefix_len,	/*!< in: length of the requested
+					prefix, in characters, multiplied by
+					dtype_get_mbmaxlen(dtype) */
+	ulint		data_len,	/*!< in: length of str (in bytes) */
+	const char*	str)		/*!< in: the string whose prefix
+					length is being determined */
+{
+	ut_a(data_len != UNIV_SQL_NULL);
+	ut_ad(!mbmaxlen || !(prefix_len % mbmaxlen));
+
+	if (mbminlen == mbmaxlen) {
+		/* Every character has the same width, so the prefix
+		length in bytes needs only clamping to data_len. */
+		return(prefix_len < data_len ? prefix_len : data_len);
+	}
+
+	/* Variable-width charset: character boundaries depend on
+	the data, so the charset-aware helper computes the length
+	of the prefix in bytes. */
+	ut_a(!(prefix_len % mbmaxlen));
+
+	return(innobase_get_at_most_n_mbchars(
+		       dtype_get_charset_coll(prtype),
+		       prefix_len, data_len, str));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Tells whether a data main type is a string type. A BLOB counts as a
+string type as well.
+@return	TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+	ulint	mtype)	/*!< in: InnoDB main data type code: DATA_CHAR, ... */
+{
+	switch (mtype) {
+	case DATA_MYSQL:
+	case DATA_VARMYSQL:
+		return(TRUE);
+	default:
+		/* All main type codes up to DATA_BLOB count as strings. */
+		return(mtype <= DATA_BLOB ? TRUE : FALSE);
+	}
+}
+
+/*********************************************************************//**
+Tells whether a type is a binary string type. For tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column;
+for those DATA_BLOB columns this function currently returns FALSE.
+@return	TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+	ulint	mtype,	/*!< in: main data type */
+	ulint	prtype)	/*!< in: precise type */
+{
+	if (mtype == DATA_BLOB) {
+		/* A BLOB is binary only if the precise type carries
+		the DATA_BINARY_TYPE flag. */
+		return((prtype & DATA_BINARY_TYPE) ? TRUE : FALSE);
+	}
+
+	return(mtype == DATA_FIXBINARY || mtype == DATA_BINARY
+	       ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Tells whether a type is a non-binary string type, i.e., a type for which
+dtype_is_string_type is TRUE and dtype_is_binary_string_type is FALSE. For
+tables created with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB
+or a TEXT column; for those columns this function currently returns TRUE.
+@return	TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+	ulint	mtype,	/*!< in: main data type */
+	ulint	prtype)	/*!< in: precise type */
+{
+	if (dtype_is_string_type(mtype) != TRUE) {
+		return(FALSE);
+	}
+
+	/* Of the string types, exactly the non-binary ones qualify. */
+	return(dtype_is_binary_string_type(mtype, prtype) == FALSE
+	       ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Combines a < 4.1.2 format precise type with a charset-collation code
+into a new-format precise type.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+	ulint	old_prtype,	/*!< in: the MySQL type code and the flags
+				DATA_BINARY_TYPE etc. */
+	ulint	charset_coll)	/*!< in: MySQL charset-collation code */
+{
+	ut_a(old_prtype < 256 * 256);
+	ut_a(charset_coll < 256);
+	/* The collation code occupies the bits above the low 16. */
+	return((charset_coll << 16) + old_prtype);
+}
+
+/*********************************************************************//**
+Validates a data type structure; each check is a ut_a() assertion.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+	const dtype_t*	type)	/*!< in: type struct to validate */
+{
+	ut_a(type);
+	ut_a(type->mtype >= DATA_VARCHAR);
+	ut_a(type->mtype <= DATA_MYSQL);
+
+	if (type->mtype == DATA_SYS) {
+		ut_a((type->prtype & DATA_MYSQL_TYPE_MASK) < DATA_N_SYS_COLS);
+	}
+
+#ifndef UNIV_HOTBACKUP
+	ut_a(type->mbminlen <= type->mbmaxlen);
+#endif /* !UNIV_HOTBACKUP */
+
+	return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Prints the main type, precise type and length of a data type to stderr. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+	const dtype_t*	type)	/*!< in: type */
+{
+	ulint	mtype;
+	ulint	prtype;
+	ulint	len;
+
+	ut_a(type);
+
+	mtype = type->mtype;
+	prtype = type->prtype;
+
+	switch (mtype) {
+	case DATA_VARCHAR:
+		fputs("DATA_VARCHAR", stderr);
+		break;
+
+	case DATA_CHAR:
+		fputs("DATA_CHAR", stderr);
+		break;
+
+	case DATA_BINARY:
+		fputs("DATA_BINARY", stderr);
+		break;
+
+	case DATA_FIXBINARY:
+		fputs("DATA_FIXBINARY", stderr);
+		break;
+
+	case DATA_BLOB:
+		fputs("DATA_BLOB", stderr);
+		break;
+
+	case DATA_INT:
+		fputs("DATA_INT", stderr);
+		break;
+
+	case DATA_MYSQL:
+		fputs("DATA_MYSQL", stderr);
+		break;
+
+	case DATA_SYS:
+		fputs("DATA_SYS", stderr);
+		break;
+
+	case DATA_FLOAT:
+		fputs("DATA_FLOAT", stderr);
+		break;
+
+	case DATA_DOUBLE:
+		fputs("DATA_DOUBLE", stderr);
+		break;
+
+	case DATA_DECIMAL:
+		fputs("DATA_DECIMAL", stderr);
+		break;
+
+	case DATA_VARMYSQL:
+		fputs("DATA_VARMYSQL", stderr);
+		break;
+
+	default:
+		fprintf(stderr, "type %lu", (ulong) mtype);
+		break;
+	}
+
+	len = type->len;	/* replaced below for system column prtypes */
+
+	if ((type->mtype == DATA_SYS)
+	    || (type->mtype == DATA_VARCHAR)
+	    || (type->mtype == DATA_CHAR)) {
+		putc(' ', stderr);
+		if (prtype == DATA_ROW_ID) {
+			fputs("DATA_ROW_ID", stderr);
+			len = DATA_ROW_ID_LEN;
+		} else if (prtype == DATA_ROLL_PTR) {
+			fputs("DATA_ROLL_PTR", stderr);
+			len = DATA_ROLL_PTR_LEN;
+		} else if (prtype == DATA_TRX_ID) {
+			fputs("DATA_TRX_ID", stderr);
+			len = DATA_TRX_ID_LEN;
+		} else if (prtype == DATA_ENGLISH) {
+			fputs("DATA_ENGLISH", stderr);
+		} else {
+			fprintf(stderr, "prtype %lu", (ulong) prtype);
+		}
+	} else {
+		if (prtype & DATA_UNSIGNED) {
+			fputs(" DATA_UNSIGNED", stderr);
+		}
+
+		if (prtype & DATA_BINARY_TYPE) {
+			fputs(" DATA_BINARY_TYPE", stderr);
+		}
+
+		if (prtype & DATA_NOT_NULL) {
+			fputs(" DATA_NOT_NULL", stderr);
+		}
+	}
+
+	fprintf(stderr, " len %lu", (ulong) len);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0boot.c b/storage/xtradb/dict/dict0boot.c
new file mode 100644
index 00000000000..43cfced65a0
--- /dev/null
+++ b/storage/xtradb/dict/dict0boot.c
@@ -0,0 +1,549 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0boot.c
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0boot.h"
+
+#ifdef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#include "dict0crea.h"
+#include "btr0btr.h"
+#include "dict0load.h"
+#include "dict0load.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "os0file.h"
+
+/**********************************************************************//**
+Fetches the dictionary header page with an x-latch and returns the address
+of the header inside that page.
+@return	pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	hdr_block;
+
+	hdr_block = buf_page_get(DICT_HDR_SPACE, 0, DICT_HDR_PAGE_NO,
+				 RW_X_LATCH, mtr);
+
+	/* Debug bookkeeping: tag the block with the SYNC_DICT_HEADER level */
+	buf_block_dbg_add_level(hdr_block, SYNC_DICT_HEADER);
+
+	return(DICT_HDR + buf_block_get_frame(hdr_block));
+}
+
+/**********************************************************************//**
+Hands out new ids based on the counters stored in the dictionary header.
+Only the ids whose out-pointer is not NULL are generated. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+	dulint*	table_id,	/*!< out: table id (not assigned if NULL) */
+	dulint*	index_id,	/*!< out: index id (not assigned if NULL) */
+	ulint*	space_id)	/*!< out: space id (not assigned if NULL) */
+{
+	dict_hdr_t*	hdr;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+	hdr = dict_hdr_get(&mtr);
+
+	if (table_id != NULL) {
+		/* Advance the stored table id counter by one. */
+		*table_id = ut_dulint_add(
+			mtr_read_dulint(hdr + DICT_HDR_TABLE_ID, &mtr), 1);
+		mlog_write_dulint(hdr + DICT_HDR_TABLE_ID, *table_id, &mtr);
+	}
+
+	if (index_id != NULL) {
+		/* Advance the stored index id counter by one. */
+		*index_id = ut_dulint_add(
+			mtr_read_dulint(hdr + DICT_HDR_INDEX_ID, &mtr), 1);
+		mlog_write_dulint(hdr + DICT_HDR_INDEX_ID, *index_id, &mtr);
+	}
+
+	if (space_id != NULL) {
+		/* Seed with the stored maximum; the header is rewritten only
+		when fil_assign_new_space_id() returns nonzero. */
+		*space_id = mtr_read_ulint(hdr + DICT_HDR_MAX_SPACE_ID,
+					   MLOG_4BYTES, &mtr);
+		if (fil_assign_new_space_id(space_id)) {
+			mlog_write_ulint(hdr + DICT_HDR_MAX_SPACE_ID,
+					 *space_id, MLOG_4BYTES, &mtr);
+		}
+	}
+	mtr_commit(&mtr);
+}
+
+/**********************************************************************//**
+Stores the in-memory row id counter (dict_sys->row_id) in the
+DICT_HDR_ROW_ID field of the dictionary header page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void)
+/*=======================*/
+{
+	mtr_t		mtr;
+	dulint		row_id;
+
+	/* The caller is expected to own the dictionary mutex. */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Read the counter before the mini-transaction is started. */
+	row_id = dict_sys->row_id;
+
+	mtr_start(&mtr);
+
+	mlog_write_dulint(dict_hdr_get(&mtr) + DICT_HDR_ROW_ID, row_id,
+			  &mtr);
+
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Creates the file page for the dictionary header and the B-tree roots of the
+basic system tables. This function is called only at the database creation.
+@return	TRUE on success, FALSE if a btr_create() call returned FIL_NULL */
+static
+ibool
+dict_hdr_create(
+/*============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	dict_hdr_t*	dict_header;
+	ulint		root_page_no;
+
+	ut_ad(mtr);
+
+	/* Create the dictionary header file block in a new, allocated file
+	segment in the system tablespace */
+	block = fseg_create(DICT_HDR_SPACE, 0,
+			    DICT_HDR + DICT_HDR_FSEG_HEADER, mtr);
+
+	ut_a(DICT_HDR_PAGE_NO == buf_block_get_page_no(block));
+
+	dict_header = dict_hdr_get(mtr);
+
+	/* Start counting row, table, index, and tree ids from
+	DICT_HDR_FIRST_ID */
+	mlog_write_dulint(dict_header + DICT_HDR_ROW_ID,
+			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+	mlog_write_dulint(dict_header + DICT_HDR_TABLE_ID,
+			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+	mlog_write_dulint(dict_header + DICT_HDR_INDEX_ID,
+			  ut_dulint_create(0, DICT_HDR_FIRST_ID), mtr);
+
+	mlog_write_ulint(dict_header + DICT_HDR_MAX_SPACE_ID,
+			 0, MLOG_4BYTES, mtr);
+
+	/* Obsolete, but we must initialize it anyway. */
+	mlog_write_ulint(dict_header + DICT_HDR_MIX_ID_LOW,
+			 DICT_HDR_FIRST_ID, MLOG_4BYTES, mtr);
+
+	/* Create the B-tree roots for the clustered indexes of the basic
+	system tables */
+
+	/*---------- SYS_TABLES: clustered index ----------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_TABLES_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_TABLES, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*---------- SYS_TABLES: unique secondary index on ID ----------*/
+	root_page_no = btr_create(DICT_UNIQUE, DICT_HDR_SPACE, 0,
+				  DICT_TABLE_IDS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_TABLE_IDS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*---------- SYS_COLUMNS: clustered index ----------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_COLUMNS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_COLUMNS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*---------- SYS_INDEXES: clustered index ----------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_INDEXES_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_INDEXES, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*---------- SYS_FIELDS: clustered index ----------*/
+	root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+				  DICT_HDR_SPACE, 0, DICT_FIELDS_ID,
+				  dict_ind_redundant, mtr);
+	if (root_page_no == FIL_NULL) {
+
+		return(FALSE);
+	}
+
+	mlog_write_ulint(dict_header + DICT_HDR_FIELDS, root_page_no,
+			 MLOG_4BYTES, mtr);
+	/*--------------------------*/
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+UNIV_INTERN
+void
+dict_boot(void)
+/*===========*/
+{
+	dict_table_t*	table;
+	dict_index_t*	index;
+	dict_hdr_t*	dict_hdr;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+	ulint		error;
+
+	mtr_start(&mtr);
+
+	/* Create the hash tables etc. */
+	dict_init();
+
+	heap = mem_heap_create(450);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	/* Get the dictionary header */
+	dict_hdr = dict_hdr_get(&mtr);
+
+	if (ut_dulint_cmp(mtr_read_dulint(dict_hdr + DICT_HDR_XTRADB_MARK, &mtr),
+			  DICT_HDR_XTRADB_FLAG) != 0) {
+		/* header lacks the XtraDB mark: create the SYS_STATS root now */
+		ulint	root_page_no;
+
+		root_page_no = btr_create(DICT_CLUSTERED | DICT_UNIQUE,
+					  DICT_HDR_SPACE, 0, DICT_STATS_ID,
+					  dict_ind_redundant, &mtr);
+		if (root_page_no == FIL_NULL) {
+			fprintf(stderr, "InnoDB: Warning: failed to create SYS_STATS btr.\n");
+			srv_use_sys_stats_table = FALSE;
+		} else {
+			mlog_write_ulint(dict_hdr + DICT_HDR_STATS, root_page_no,
+					 MLOG_4BYTES, &mtr);
+			mlog_write_dulint(dict_hdr + DICT_HDR_XTRADB_MARK,
+					  DICT_HDR_XTRADB_FLAG, &mtr);
+		}
+		mtr_commit(&mtr);
+		/* restart mtr and re-latch the header page under it */
+		mtr_start(&mtr);
+		dict_hdr = dict_hdr_get(&mtr);
+	}
+
+	/* Because we only write new row ids to disk-based data structure
+	(dictionary header) when it is divisible by
+	DICT_HDR_ROW_ID_WRITE_MARGIN, in recovery we will not recover
+	the latest value of the row id counter. Therefore we advance
+	the counter at the database startup to avoid overlapping values.
+	Note that when a user after database startup first time asks for
+	a new row id, then because the counter is now divisible by
+	..._MARGIN, it will immediately be updated to the disk-based
+	header. */
+
+	dict_sys->row_id = ut_dulint_add(
+		ut_dulint_align_up(mtr_read_dulint(dict_hdr + DICT_HDR_ROW_ID,
+						   &mtr),
+				   DICT_HDR_ROW_ID_WRITE_MARGIN),
+		DICT_HDR_ROW_ID_WRITE_MARGIN);
+
+	/* Insert into the dictionary cache the descriptions of the basic
+	system tables */
+	/*---------- SYS_TABLES and its clustered index ----------*/
+	table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+	/* ROW_FORMAT = (N_COLS >> 31) ? COMPACT : REDUNDANT */
+	dict_mem_table_add_col(table, heap, "N_COLS", DATA_INT, 0, 4);
+	/* TYPE is either DICT_TABLE_ORDINARY, or (TYPE & DICT_TF_COMPACT)
+	and (TYPE & DICT_TF_FORMAT_MASK) are nonzero and TYPE = table->flags */
+	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "MIX_ID", DATA_BINARY, 0, 0);
+	/* MIX_LEN may contain additional table flags when
+	ROW_FORMAT!=REDUNDANT.  Currently, these flags include
+	DICT_TF2_TEMPORARY. */
+	dict_mem_table_add_col(table, heap, "MIX_LEN", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "CLUSTER_NAME", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+
+	table->id = DICT_TABLES_ID;
+
+	dict_table_add_to_cache(table, heap);
+	dict_sys->sys_tables = table;
+	mem_heap_empty(heap);
+
+	index = dict_mem_index_create("SYS_TABLES", "CLUST_IND",
+				      DICT_HDR_SPACE,
+				      DICT_UNIQUE | DICT_CLUSTERED, 1);
+
+	dict_mem_index_add_field(index, "NAME", 0);
+
+	index->id = DICT_TABLES_ID;
+
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_TABLES,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	/*---------- SYS_TABLES: unique secondary index on ID ----------*/
+	index = dict_mem_index_create("SYS_TABLES", "ID_IND",
+				      DICT_HDR_SPACE, DICT_UNIQUE, 1);
+	dict_mem_index_add_field(index, "ID", 0);
+
+	index->id = DICT_TABLE_IDS_ID;
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_TABLE_IDS,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	/*---------- SYS_COLUMNS and its clustered index ----------*/
+	table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "MTYPE", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "PRTYPE", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "LEN", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "PREC", DATA_INT, 0, 4);
+
+	table->id = DICT_COLUMNS_ID;
+
+	dict_table_add_to_cache(table, heap);
+	dict_sys->sys_columns = table;
+	mem_heap_empty(heap);
+
+	index = dict_mem_index_create("SYS_COLUMNS", "CLUST_IND",
+				      DICT_HDR_SPACE,
+				      DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+	dict_mem_index_add_field(index, "TABLE_ID", 0);
+	dict_mem_index_add_field(index, "POS", 0);
+
+	index->id = DICT_COLUMNS_ID;
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_COLUMNS,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	/*---------- SYS_INDEXES and its clustered index ----------*/
+	table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "TABLE_ID", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "ID", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "NAME", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "N_FIELDS", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "TYPE", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "SPACE", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "PAGE_NO", DATA_INT, 0, 4);
+
+	/* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
+#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2
+#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 6 + 2"
+#endif
+#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2
+#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 5 + 2"
+#endif
+#if DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2
+#error "DICT_SYS_INDEXES_TYPE_FIELD != 4 + 2"
+#endif
+#if DICT_SYS_INDEXES_NAME_FIELD != 2 + 2
+#error "DICT_SYS_INDEXES_NAME_FIELD != 2 + 2"
+#endif
+
+	table->id = DICT_INDEXES_ID;
+	dict_table_add_to_cache(table, heap);
+	dict_sys->sys_indexes = table;
+	mem_heap_empty(heap);
+
+	index = dict_mem_index_create("SYS_INDEXES", "CLUST_IND",
+				      DICT_HDR_SPACE,
+				      DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+	dict_mem_index_add_field(index, "TABLE_ID", 0);
+	dict_mem_index_add_field(index, "ID", 0);
+
+	index->id = DICT_INDEXES_ID;
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_INDEXES,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	/*---------- SYS_FIELDS and its clustered index ----------*/
+	table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "POS", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "COL_NAME", DATA_BINARY, 0, 0);
+
+	table->id = DICT_FIELDS_ID;
+	dict_table_add_to_cache(table, heap);
+	dict_sys->sys_fields = table;
+	mem_heap_empty(heap);
+
+	index = dict_mem_index_create("SYS_FIELDS", "CLUST_IND",
+				      DICT_HDR_SPACE,
+				      DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+	dict_mem_index_add_field(index, "INDEX_ID", 0);
+	dict_mem_index_add_field(index, "POS", 0);
+
+	index->id = DICT_FIELDS_ID;
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_FIELDS,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	/*---------- SYS_STATS and its clustered index ----------*/
+	table = dict_mem_table_create("SYS_STATS", DICT_HDR_SPACE, 3, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "INDEX_ID", DATA_BINARY, 0, 0);
+	dict_mem_table_add_col(table, heap, "KEY_COLS", DATA_INT, 0, 4);
+	dict_mem_table_add_col(table, heap, "DIFF_VALS", DATA_BINARY, 0, 0);
+
+	/* The '+ 2' below comes from the fields DB_TRX_ID, DB_ROLL_PTR */
+#if DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2
+#error "DICT_SYS_STATS_DIFF_VALS_FIELD != 2 + 2"
+#endif
+
+	table->id = DICT_STATS_ID;
+	dict_table_add_to_cache(table, heap);
+	dict_sys->sys_stats = table;
+	mem_heap_empty(heap);
+
+	index = dict_mem_index_create("SYS_STATS", "CLUST_IND",
+				      DICT_HDR_SPACE,
+				      DICT_UNIQUE | DICT_CLUSTERED, 2);
+
+	dict_mem_index_add_field(index, "INDEX_ID", 0);
+	dict_mem_index_add_field(index, "KEY_COLS", 0);
+
+	index->id = DICT_STATS_ID;
+	error = dict_index_add_to_cache(table, index,
+					mtr_read_ulint(dict_hdr
+						       + DICT_HDR_STATS,
+						       MLOG_4BYTES, &mtr),
+					FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	mem_heap_free(heap);
+
+	mtr_commit(&mtr);
+	/*-------------------------*/
+
+	/* Initialize the insert buffer table and index for each tablespace */
+
+	ibuf_init_at_db_start();
+
+	/* Load definitions of other indexes on system tables */
+
+	dict_load_sys_table(dict_sys->sys_tables);
+	dict_load_sys_table(dict_sys->sys_columns);
+	dict_load_sys_table(dict_sys->sys_indexes);
+	dict_load_sys_table(dict_sys->sys_fields);
+	dict_load_sys_table(dict_sys->sys_stats);
+
+	mutex_exit(&(dict_sys->mutex));
+}
+
+/*****************************************************************//**
+Placeholder for inserting the basic system table data into the system
+tables themselves at database creation. Currently a no-op. */
+static
+void
+dict_insert_initial_data(void)
+/*==========================*/
+{
+	/* Intentionally empty: nothing is inserted yet */
+}
+
+/*****************************************************************//**
+Creates the data dictionary at the database creation: writes the dictionary
+header, boots the dictionary and inserts the initial data. */
+UNIV_INTERN
+void
+dict_create(void)
+/*=============*/
+{
+	mtr_t	hdr_mtr;
+
+	/* The header page is created in a mini-transaction of its own.
+	NOTE(review): the ibool result of dict_hdr_create() is discarded. */
+	mtr_start(&hdr_mtr);
+	(void) dict_hdr_create(&hdr_mtr);
+	mtr_commit(&hdr_mtr);
+
+	dict_boot();
+	dict_insert_initial_data();
+}
diff --git a/storage/xtradb/dict/dict0crea.c b/storage/xtradb/dict/dict0crea.c
new file mode 100644
index 00000000000..a6d0e11740a
--- /dev/null
+++ b/storage/xtradb/dict/dict0crea.c
@@ -0,0 +1,1733 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0crea.c
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0crea.h"
+
+#ifdef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0dict.h"
+#include "que0que.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "pars0pars.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "ut0vec.h"
+
+/*****************************************************************//**
+Based on a table object, this function builds the entry to be inserted
+in the SYS_TABLES system table. All buffers are allocated from heap.
+@return	the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_tables_tuple(
+/*=========================*/
+	const dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*		heap)	/*!< in: memory heap from
+					which the memory for the built
+					tuple is allocated */
+{
+	dict_table_t*	sys_tables;
+	dtuple_t*	entry;
+	dfield_t*	dfield;
+	byte*		ptr;
+
+	ut_ad(table);
+	ut_ad(heap);
+
+	sys_tables = dict_sys->sys_tables;
+
+	entry = dtuple_create(heap, 8 + DATA_N_SYS_COLS);
+
+	dict_table_copy_types(entry, sys_tables);
+
+	/* 0: NAME -----------------------------*/
+	dfield = dtuple_get_nth_field(entry, 0/*NAME*/);
+
+	dfield_set_data(dfield, table->name, ut_strlen(table->name));
+	/* 3: ID -------------------------------*/
+	dfield = dtuple_get_nth_field(entry, 1/*ID*/);
+
+	ptr = mem_heap_alloc(heap, 8);
+	mach_write_to_8(ptr, table->id);
+
+	dfield_set_data(dfield, ptr, 8);
+	/* 4: N_COLS ---------------------------*/
+	dfield = dtuple_get_nth_field(entry, 2/*N_COLS*/);
+
+#if DICT_TF_COMPACT != 1
+#error
+#endif
+	/* Shift as ulint: (int) 1 << 31 does not fit in a signed int. */
+	ptr = mem_heap_alloc(heap, 4);
+	mach_write_to_4(ptr, table->n_def
+			| ((ulint) (table->flags & DICT_TF_COMPACT) << 31));
+	dfield_set_data(dfield, ptr, 4);
+	/* 5: TYPE -----------------------------*/
+	dfield = dtuple_get_nth_field(entry, 3/*TYPE*/);
+	/* Masks are built from ~0U: left-shifting the int ~0 is undefined. */
+	ptr = mem_heap_alloc(heap, 4);
+	if (table->flags & (~DICT_TF_COMPACT & ~(~0U << DICT_TF_BITS))) {
+		ut_a(table->flags & DICT_TF_COMPACT);
+		ut_a(dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+		ut_a(((ulonglong) table->flags & DICT_TF_ZSSIZE_MASK)
+		     <= (ulonglong) (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT));
+		ut_a(!(table->flags & (~0U << DICT_TF2_BITS)));
+		mach_write_to_4(ptr, table->flags & ~(~0U << DICT_TF_BITS));
+	} else {
+		mach_write_to_4(ptr, DICT_TABLE_ORDINARY);
+	}
+
+	dfield_set_data(dfield, ptr, 4);
+	/* 6: MIX_ID (obsolete) ---------------------------*/
+	dfield = dtuple_get_nth_field(entry, 4/*MIX_ID*/);
+
+	ptr = mem_heap_zalloc(heap, 8);
+
+	dfield_set_data(dfield, ptr, 8);
+	/* 7: MIX_LEN (additional flags) --------------------------*/
+
+	dfield = dtuple_get_nth_field(entry, 5/*MIX_LEN*/);
+	/* The flag bits from DICT_TF2_SHIFT upwards are stored here. */
+	ptr = mem_heap_alloc(heap, 4);
+	mach_write_to_4(ptr, table->flags >> DICT_TF2_SHIFT);
+
+	dfield_set_data(dfield, ptr, 4);
+	/* 8: CLUSTER_NAME ---------------------*/
+	dfield = dtuple_get_nth_field(entry, 6/*CLUSTER_NAME*/);
+	dfield_set_null(dfield); /* not supported */
+
+	/* 9: SPACE ----------------------------*/
+	dfield = dtuple_get_nth_field(entry, 7/*SPACE*/);
+
+	ptr = mem_heap_alloc(heap, 4);
+	mach_write_to_4(ptr, table->space);
+
+	dfield_set_data(dfield, ptr, 4);
+	/*----------------------------------*/
+
+	return(entry);
+}
+
+/*****************************************************************//**
+Assembles, for column number i of a table object, the row that describes
+the column in the SYS_COLUMNS system table. The number that leads each field
+comment below is not always the tuple position: the position is the second
+argument of dtuple_get_nth_field().
+@return	the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_columns_tuple(
+/*==========================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			i,	/*!< in: column number */
+	mem_heap_t*		heap)	/*!< in: memory heap from
+					which the memory for the built
+					tuple is allocated */
+{
+	const dict_col_t*	col;
+	const char*		name;
+	dtuple_t*		tuple;
+	byte*			buf;
+
+	ut_ad(table);
+	ut_ad(heap);
+
+	/* The column object supplies MTYPE, PRTYPE and LEN below. */
+	col = dict_table_get_nth_col(table, i);
+
+	/* 7 columns of SYS_COLUMNS plus DATA_N_SYS_COLS further fields;
+	dict_table_copy_types() is given the SYS_COLUMNS table object. */
+	tuple = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+	dict_table_copy_types(tuple, dict_sys->sys_columns);
+
+	/* Every numeric value is written with mach_write_to_4/8() into
+	a buffer taken from heap, and the field is then set to that buffer. */
+
+	/* 0: TABLE_ID -----------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0/*TABLE_ID*/), buf, 8);
+
+	/* 1: POS ----------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, i);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 1/*POS*/), buf, 4);
+
+	/* 4: NAME ---------------------------*/
+	/* The column name is looked up by position in the table object. */
+	name = dict_table_get_col_name(table, i);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 2/*NAME*/), name,
+			ut_strlen(name));
+
+	/* 5: MTYPE --------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, col->mtype);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 3/*MTYPE*/), buf, 4);
+
+	/* 6: PRTYPE -------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, col->prtype);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 4/*PRTYPE*/), buf, 4);
+
+	/* 7: LEN ----------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, col->len);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 5/*LEN*/), buf, 4);
+
+	/* 8: PREC ---------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, 0/* unused */);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 6/*PREC*/), buf, 4);
+
+	/*---------------------------------*/
+
+	return(tuple);
+}
+
+/***************************************************************//**
+Assigns a new table id and, when file-per-table is in effect, creates a
+single-table tablespace; then builds the SYS_TABLES row to insert.
+@return	DB_SUCCESS or error code */
+static
+ulint
+dict_build_table_def_step(
+/*======================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	tab_node_t*	node)	/*!< in: table create node */
+{
+	dict_table_t*	table;
+	dtuple_t*	row;
+	ulint		error;
+	ulint		flags;
+	const char*	path_or_name;
+	ibool		is_path;
+	mtr_t		mtr;
+	ulint		space = 0;
+	ibool		file_per_table;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	table = node->table;
+
+	/* Cache the global variable "srv_file_per_table" to
+	a local variable before using it. Please note
+	"srv_file_per_table" is not under dict_sys mutex
+	protection, and could be changed while executing
+	this function. So better to cache the current value
+	to a local variable, and all future reference to
+	"srv_file_per_table" should use this local variable. */
+	file_per_table = srv_file_per_table;
+
+	dict_hdr_get_new_id(&table->id, NULL, NULL);
+
+	thr_get_trx(thr)->table_id = table->id;
+
+	if (file_per_table) {
+		/* Get a new space id if srv_file_per_table is set */
+		dict_hdr_get_new_id(NULL, NULL, &space);
+
+		if (UNIV_UNLIKELY(space == ULINT_UNDEFINED)) {
+			return(DB_ERROR);
+		}
+
+		/* We create a new single-table tablespace for the table.
+		We initially let it be 4 pages:
+		- page 0 is the fsp header and an extent descriptor page,
+		- page 1 is an ibuf bitmap page,
+		- page 2 is the first inode page,
+		- page 3 will contain the root of the clustered index of the
+		table we create here. */
+
+		if (table->dir_path_of_temp_table) {
+			/* We place tables created with CREATE TEMPORARY
+			TABLE in the tmp dir of mysqld server */
+
+			path_or_name = table->dir_path_of_temp_table;
+			is_path = TRUE;
+		} else {
+			path_or_name = table->name;
+			is_path = FALSE;
+		}
+
+		ut_ad(dict_table_get_format(table) <= DICT_TF_FORMAT_MAX);
+		ut_ad(!dict_table_zip_size(table)
+		      || dict_table_get_format(table) >= DICT_TF_FORMAT_ZIP);
+		/* Mask built from ~0U: left-shifting the int ~0 is undefined. */
+		flags = table->flags & ~(~0U << DICT_TF_BITS);
+		error = fil_create_new_single_table_tablespace(
+			space, path_or_name, is_path,
+			flags == DICT_TF_COMPACT ? 0 : flags,
+			FIL_IBD_FILE_INITIAL_SIZE);
+		table->space = (unsigned int) space;
+		/* table->space is assigned even if the creation failed. */
+		if (error != DB_SUCCESS) {
+			return(error);
+		}
+
+		mtr_start(&mtr);
+
+		fsp_header_init(table->space, FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+
+		mtr_commit(&mtr);
+	} else {
+		/* Create in the system tablespace: disallow new features */
+		table->flags &= (~0U << DICT_TF_BITS) | DICT_TF_COMPACT;
+	}
+
+	row = dict_create_sys_tables_tuple(table, node->heap);
+
+	ins_node_set_new_row(node->tab_def, row);
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds the SYS_COLUMNS row for column node->col_no of node->table and
+sets it as the new row of the node->col_def insert node.
+@return	DB_SUCCESS */
+static
+ulint
+dict_build_col_def_step(
+/*====================*/
+	tab_node_t*	node)	/*!< in: table create node */
+{
+	ins_node_set_new_row(
+		node->col_def,
+		dict_create_sys_columns_tuple(node->table, node->col_no,
+					      node->heap));
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Assembles, for an index object, the row that describes the index in the
+SYS_INDEXES system table. PAGE_NO is stored as FIL_NULL here. The number
+that leads each field comment below is not always the tuple position: the
+position is the second argument of dtuple_get_nth_field().
+@return	the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_indexes_tuple(
+/*==========================*/
+	const dict_index_t*	index,	/*!< in: index */
+	mem_heap_t*		heap)	/*!< in: memory heap from
+					which the memory for the built
+					tuple is allocated */
+{
+	dict_table_t*	owner;
+	dtuple_t*	tuple;
+	byte*		buf;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(index);
+	ut_ad(heap);
+
+	/* TABLE_ID comes from the table object that dict_table_get_low()
+	returns for index->table_name.
+	NOTE(review): the result is dereferenced without a NULL check. */
+	owner = dict_table_get_low(index->table_name);
+
+	/* 7 columns of SYS_INDEXES plus DATA_N_SYS_COLS further fields;
+	dict_table_copy_types() is given the SYS_INDEXES table object. */
+	tuple = dtuple_create(heap, 7 + DATA_N_SYS_COLS);
+
+	dict_table_copy_types(tuple, dict_sys->sys_indexes);
+
+	/* 0: TABLE_ID -----------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, owner->id);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0/*TABLE_ID*/), buf, 8);
+
+	/* 1: ID ----------------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 1/*ID*/), buf, 8);
+
+	/* 4: NAME --------------------------*/
+	dfield_set_data(dtuple_get_nth_field(tuple, 2/*NAME*/), index->name,
+			ut_strlen(index->name));
+
+	/* 5: N_FIELDS ----------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, index->n_fields);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 3/*N_FIELDS*/), buf, 4);
+
+	/* 6: TYPE --------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, index->type);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 4/*TYPE*/), buf, 4);
+
+	/* 7: SPACE --------------------------*/
+	/* Compile-time check that the field number constant equals 7. */
+
+#if DICT_SYS_INDEXES_SPACE_NO_FIELD != 7
+#error "DICT_SYS_INDEXES_SPACE_NO_FIELD != 7"
+#endif
+
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, index->space);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 5/*SPACE*/), buf, 4);
+
+	/* 8: PAGE_NO --------------------------*/
+
+#if DICT_SYS_INDEXES_PAGE_NO_FIELD != 8
+#error "DICT_SYS_INDEXES_PAGE_NO_FIELD != 8"
+#endif
+
+	/* FIL_NULL is stored; dict_create_search_tuple() builds the key
+	used to find the entry when a root page number is written. */
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, FIL_NULL);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 6/*PAGE_NO*/), buf, 4);
+
+	/*--------------------------------*/
+
+	return(tuple);
+}
+
+/*****************************************************************//**
+Assembles, for field number i of an index object, the row that describes
+the field in the SYS_FIELDS system table. The number that leads each field
+comment below is not always the tuple position: the position is the second
+argument of dtuple_get_nth_field().
+@return	the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_fields_tuple(
+/*=========================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			i,	/*!< in: field number */
+	mem_heap_t*		heap)	/*!< in: memory heap from
+					which the memory for the built
+					tuple is allocated */
+{
+	dict_field_t*	ifield;
+	dtuple_t*	tuple;
+	byte*		buf;
+	ulint		pos;
+	ulint		n;
+
+	ut_ad(index);
+	ut_ad(heap);
+
+	/* Scan for the first field with a column prefix; if the scan
+	stops before index->n_fields, the index has a prefix field. */
+	n = 0;
+
+	while (n < index->n_fields
+	       && !(dict_index_get_nth_field(index, n)->prefix_len > 0)) {
+		n++;
+	}
+
+	ifield = dict_index_get_nth_field(index, i);
+
+	if (n < index->n_fields) {
+		/* If there are column prefix fields in the index, then
+		we store the number of the field to the 2 HIGH bytes
+		and the prefix length to the 2 low bytes, */
+
+		pos = (i << 16) + ifield->prefix_len;
+	} else {
+		/* Else we store the number of the field to the 2 LOW bytes.
+		This is to keep the storage format compatible with
+		InnoDB versions < 4.0.14. */
+
+		pos = i;
+	}
+
+	/* 3 columns of SYS_FIELDS plus DATA_N_SYS_COLS further fields;
+	dict_table_copy_types() is given the SYS_FIELDS table object. */
+	tuple = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+	dict_table_copy_types(tuple, dict_sys->sys_fields);
+
+	/* 0: INDEX_ID -----------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0/*INDEX_ID*/), buf, 8);
+
+	/* 1: POS + PREFIX LENGTH ----------------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, pos);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 1/*POS*/), buf, 4);
+
+	/* 4: COL_NAME -------------------------*/
+	dfield_set_data(dtuple_get_nth_field(tuple, 2/*COL_NAME*/),
+			ifield->name, ut_strlen(ifield->name));
+
+	/*---------------------------------*/
+
+	return(tuple);
+}
+
+/*****************************************************************//**
+Assembles the SYS_STATS row for one (index, i) pair. The DIFF_VALS column
+is written as zero.
+@return	the tuple which should be inserted */
+static
+dtuple_t*
+dict_create_sys_stats_tuple(
+/*========================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			i,	/*!< in: value for KEY_COLS */
+	mem_heap_t*		heap)	/*!< in: heap for the tuple */
+{
+	dtuple_t*	tuple;
+	byte*		buf;
+
+	ut_ad(index);
+	ut_ad(heap);
+
+	tuple = dtuple_create(heap, 3 + DATA_N_SYS_COLS);
+
+	dict_table_copy_types(tuple, dict_sys->sys_stats);
+
+	/* 0: INDEX_ID -----------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0/*INDEX_ID*/), buf, 8);
+
+	/* 1: KEY_COLS -----------------------*/
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, i);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 1/*KEY_COLS*/), buf, 4);
+
+	/* 4: DIFF_VALS ----------------------*/
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, ut_dulint_zero); /* initial value is 0 */
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 2/*DIFF_VALS*/), buf, 8);
+
+	/*---------------------------------*/
+
+	return(tuple);
+}
+
+/*****************************************************************//**
+Builds the two-field search key used to locate an index entry when the
+index tree root page number is written, if such a tree is created. The
+key is a copy of the first two fields of the given tuple.
+@return	the tuple for search */
+static
+dtuple_t*
+dict_create_search_tuple(
+/*=====================*/
+	const dtuple_t*	tuple,	/*!< in: the tuple inserted in the SYS_INDEXES
+				table */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory for
+				the built tuple is allocated */
+{
+	dtuple_t*	key;
+	ulint		pos;
+
+	ut_ad(tuple && heap);
+
+	key = dtuple_create(heap, 2);
+
+	/* Copy fields 0 and 1 of the source tuple, in that order */
+	for (pos = 0; pos < 2; pos++) {
+		dfield_copy(dtuple_get_nth_field(key, pos),
+			    dtuple_get_nth_field(tuple, pos));
+	}
+
+	ut_ad(dtuple_validate(key));
+
+	return(key);
+}
+
+/***************************************************************//**
+Prepares the SYS_INDEXES row for the index of an index create node:
+looks up the owning table, assigns a new index id, and hands the built
+row to the node's SYS_INDEXES insert node.
+@return	DB_SUCCESS or error code */
+static
+ulint
+dict_build_index_def_step(
+/*======================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	ind_node_t*	node)	/*!< in: index create node */
+{
+	dict_index_t*	idx;
+	dict_table_t*	tab;
+	trx_t*		trx;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	trx = thr_get_trx(thr);
+	idx = node->index;
+
+	tab = dict_table_get_low(idx->table_name);
+
+	if (!tab) {
+		return(DB_TABLE_NOT_FOUND);
+	}
+
+	trx->table_id = tab->id;
+	node->table = tab;
+
+	/* An index added to a table without indexes must be clustered */
+	ut_ad(dict_index_is_clust(idx)
+	      || (UT_LIST_GET_LEN(tab->indexes) > 0));
+
+	dict_hdr_get_new_id(NULL, &idx->id, NULL);
+
+	/* Inherit the space id from the table; we store all indexes of a
+	table in the same tablespace */
+	idx->space = tab->space;
+	node->page_no = FIL_NULL;
+
+	node->ind_row = dict_create_sys_indexes_tuple(idx, node->heap);
+
+	ins_node_set_new_row(node->ind_def, node->ind_row);
+
+	/* Note that the index was created by this transaction. */
+	idx->trx_id = (ib_uint64_t) ut_conv_dulint_to_longlong(trx->id);
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds the SYS_FIELDS row for field number node->field_no of the index
+and hands it to the node's SYS_FIELDS insert node.
+@return	DB_SUCCESS */
+static
+ulint
+dict_build_field_def_step(
+/*======================*/
+	ind_node_t*	node)	/*!< in: index create node */
+{
+	dtuple_t*	field_row;
+
+	field_row = dict_create_sys_fields_tuple(node->index, node->field_no,
+						 node->heap);
+
+	ins_node_set_new_row(node->field_def, field_row);
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Builds the SYS_STATS row for node->stats_no of the index and hands it
+to the node's SYS_STATS insert node.
+@return DB_SUCCESS */
+static
+ulint
+dict_build_stats_def_step(
+/*======================*/
+	ind_node_t*	node)	/*!< in: index create or insert stats node */
+{
+	dtuple_t*	stats_row;
+
+	stats_row = dict_create_sys_stats_tuple(node->index, node->stats_no,
+						node->heap);
+
+	ins_node_set_new_row(node->stats_def, stats_row);
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Creates an index tree for the index if it is not a member of a cluster.
+The tree is allocated and its root page number is written to the index
+entry in SYS_INDEXES within a single mini-transaction. The root page
+number is also stored in node->page_no.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+dict_create_index_tree_step(
+/*========================*/
+	ind_node_t*	node)	/*!< in: index create node */
+{
+	dict_index_t*	index;
+	dict_table_t*	sys_indexes;
+	dtuple_t*	search_tuple;
+	ulint		zip_size;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	index = node->index;
+
+	sys_indexes = dict_sys->sys_indexes;
+
+	/* Run a mini-transaction in which the index tree is allocated for
+	the index and its root address is written to the index entry in
+	sys_indexes */
+
+	mtr_start(&mtr);
+
+	search_tuple = dict_create_search_tuple(node->ind_row, node->heap);
+
+	/* Position the cursor before the entry, then step onto it */
+	btr_pcur_open(UT_LIST_GET_FIRST(sys_indexes->indexes),
+		      search_tuple, PAGE_CUR_L, BTR_MODIFY_LEAF,
+		      &pcur, &mtr);
+
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+	zip_size = dict_table_zip_size(index->table);
+
+	node->page_no = btr_create(index->type, index->space, zip_size,
+				   index->id, index, &mtr);
+	/* printf("Created a new index tree in space %lu root page %lu\n",
+	index->space, index->page_no); */
+
+	/* The page number is written even when it is FIL_NULL, that is,
+	when the tree could not be created */
+	page_rec_write_index_page_no(btr_pcur_get_rec(&pcur),
+				     DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				     node->page_no, &mtr);
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	if (node->page_no == FIL_NULL) {
+
+		return(DB_OUT_OF_FILE_SPACE);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. Does
+nothing if the row already holds FIL_NULL as the root page number or if
+the zip size of the tablespace cannot be determined. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+	rec_t*	rec,	/*!< in/out: record in the clustered index
+			of SYS_INDEXES table */
+	mtr_t*	mtr)	/*!< in: mtr having the latch on the record page */
+{
+	const byte*	field;
+	ulint		field_len;
+	ulint		root;
+	ulint		space_id;
+	ulint		zip;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+
+	/* Root page number of the tree */
+	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				      &field_len);
+	ut_ad(field_len == 4);
+
+	root = mtr_read_ulint(field, MLOG_4BYTES, mtr);
+
+	if (root == FIL_NULL) {
+		/* The tree has already been freed */
+		return;
+	}
+
+	/* Tablespace of the tree */
+	field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_SPACE_NO_FIELD,
+				      &field_len);
+	ut_ad(field_len == 4);
+
+	space_id = mtr_read_ulint(field, MLOG_4BYTES, mtr);
+	zip = fil_space_get_zip_size(space_id);
+
+	if (UNIV_UNLIKELY(zip == ULINT_UNDEFINED)) {
+		/* It is a single table tablespace and the .ibd file is
+		missing: do nothing */
+		return;
+	}
+
+	/* We free all the pages but the root page first; this operation
+	may span several mini-transactions */
+	btr_free_but_not_root(space_id, zip, root);
+
+	/* Then we free the root page in the same mini-transaction where
+	we write FIL_NULL to the appropriate field in the SYS_INDEXES
+	record: this mini-transaction marks the B-tree totally freed */
+	btr_free_root(space_id, zip, root, mtr);
+
+	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				     FIL_NULL, mtr);
+}
+
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+When space is 0 the existing tree is freed and a new one is created in
+the tablespace recorded in the row; otherwise only a new tree is created
+in the given tablespace.
+@return	new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+	dict_table_t*	table,	/*!< in: the table the index belongs to */
+	ulint		space,	/*!< in: 0=truncate,
+				nonzero=create the index tree in the
+				given tablespace */
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
+	mtr_t*		mtr)	/*!< in: mtr having the latch
+				on the record page. The mtr may be
+				committed and restarted in this call. */
+{
+	ulint		root_page_no;
+	ibool		drop = !space;
+	ulint		zip_size;
+	ulint		type;
+	dulint		index_id;
+	rec_t*		rec;
+	const byte*	ptr;
+	ulint		len;
+	dict_index_t*	index;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_a(!dict_table_is_comp(dict_sys->sys_indexes));
+	rec = btr_pcur_get_rec(pcur);
+	ptr = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, &len);
+
+	ut_ad(len == 4);
+
+	root_page_no = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+
+	if (drop && root_page_no == FIL_NULL) {
+		/* The tree has been freed. */
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
+			" a missing index of table %s!\n", table->name);
+		drop = FALSE;
+	}
+
+	ptr = rec_get_nth_field_old(rec,
+				    DICT_SYS_INDEXES_SPACE_NO_FIELD, &len);
+
+	ut_ad(len == 4);
+
+	if (drop) {
+		/* Truncating in place: use the tablespace recorded in
+		the SYS_INDEXES record */
+		space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr);
+	}
+
+	zip_size = fil_space_get_zip_size(space);
+
+	if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+		/* It is a single table tablespace and the .ibd file is
+		missing: do nothing */
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Trying to TRUNCATE"
+			" a missing .ibd file of table %s!\n", table->name);
+		return(FIL_NULL);
+	}
+
+	ptr = rec_get_nth_field_old(rec,
+				    DICT_SYS_INDEXES_TYPE_FIELD, &len);
+	ut_ad(len == 4);
+	type = mach_read_from_4(ptr);
+
+	/* Field 1 of the record holds the 8-byte index id */
+	ptr = rec_get_nth_field_old(rec, 1, &len);
+	ut_ad(len == 8);
+	index_id = mach_read_from_8(ptr);
+
+	if (!drop) {
+
+		goto create;
+	}
+
+	/* We free all the pages but the root page first; this operation
+	may span several mini-transactions */
+
+	btr_free_but_not_root(space, zip_size, root_page_no);
+
+	/* Then we free the root page in the same mini-transaction where
+	we create the b-tree and write its new root page number to the
+	appropriate field in the SYS_INDEXES record: this mini-transaction
+	marks the B-tree totally truncated */
+
+	btr_page_get(space, zip_size, root_page_no, RW_X_LATCH, mtr);
+
+	btr_free_root(space, zip_size, root_page_no, mtr);
+create:
+	/* We will temporarily write FIL_NULL to the PAGE_NO field
+	in SYS_INDEXES, so that the database will not get into an
+	inconsistent state in case it crashes between the mtr_commit()
+	below and the following mtr_commit() call. */
+	page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				     FIL_NULL, mtr);
+
+	/* We will need to commit the mini-transaction in order to avoid
+	deadlocks in the btr_create() call, because otherwise we would
+	be freeing and allocating pages in the same mini-transaction. */
+	btr_pcur_store_position(pcur, mtr);
+	mtr_commit(mtr);
+
+	mtr_start(mtr);
+	btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+	/* Find the index corresponding to this SYS_INDEXES record. */
+	for (index = UT_LIST_GET_FIRST(table->indexes);
+	     index;
+	     index = UT_LIST_GET_NEXT(indexes, index)) {
+		if (!ut_dulint_cmp(index->id, index_id)) {
+			root_page_no = btr_create(type, space, zip_size,
+						  index_id, index, mtr);
+			index->page = (unsigned int) root_page_no;
+			return(root_page_no);
+		}
+	}
+
+	/* The halves of the id are cast to ulong so that the arguments
+	match the %lu conversions regardless of how ulint is defined */
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Index %lu %lu of table %s is missing\n"
+		"InnoDB: from the data dictionary during TRUNCATE!\n",
+		(ulong) ut_dulint_get_high(index_id),
+		(ulong) ut_dulint_get_low(index_id),
+		table->name);
+
+	return(FIL_NULL);
+}
+
+/*********************************************************************//**
+Creates a table create graph: a table create node with insert nodes for
+SYS_TABLES and SYS_COLUMNS and a commit node as its children.
+@return	own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table to create, built as a memory data
+				structure */
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	tab_node_t*	graph;
+
+	graph = mem_heap_alloc(heap, sizeof(*graph));
+
+	graph->common.type = QUE_NODE_CREATE_TABLE;
+	graph->table = table;
+	graph->state = TABLE_BUILD_TABLE_DEF;
+
+	/* The node owns a private heap, separate from the graph heap */
+	graph->heap = mem_heap_create(256);
+
+	/* Child node inserting into SYS_TABLES */
+	graph->tab_def = ins_node_create(INS_DIRECT, dict_sys->sys_tables,
+					 heap);
+	graph->tab_def->common.parent = graph;
+
+	/* Child node inserting into SYS_COLUMNS */
+	graph->col_def = ins_node_create(INS_DIRECT, dict_sys->sys_columns,
+					 heap);
+	graph->col_def->common.parent = graph;
+
+	graph->commit_node = commit_node_create(heap);
+	graph->commit_node->common.parent = graph;
+
+	return(graph);
+}
+
+/*********************************************************************//**
+Creates an index create graph: an index create node with insert nodes
+for SYS_INDEXES and SYS_FIELDS, an insert node for SYS_STATS when
+srv_use_sys_stats_table is set, and a commit node.
+@return	own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+	dict_index_t*	index,	/*!< in: index to create, built as a memory data
+				structure */
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	ind_node_t*	graph;
+
+	graph = mem_heap_alloc(heap, sizeof(*graph));
+
+	graph->common.type = QUE_NODE_CREATE_INDEX;
+	graph->index = index;
+	graph->state = INDEX_BUILD_INDEX_DEF;
+	graph->page_no = FIL_NULL;
+
+	/* The node owns a private heap, separate from the graph heap */
+	graph->heap = mem_heap_create(256);
+
+	/* Child node inserting into SYS_INDEXES */
+	graph->ind_def = ins_node_create(INS_DIRECT,
+					 dict_sys->sys_indexes, heap);
+	graph->ind_def->common.parent = graph;
+
+	/* Child node inserting into SYS_FIELDS */
+	graph->field_def = ins_node_create(INS_DIRECT,
+					   dict_sys->sys_fields, heap);
+	graph->field_def->common.parent = graph;
+
+	/* Child node inserting into SYS_STATS, only when that table
+	is in use */
+	graph->stats_def = NULL;
+
+	if (srv_use_sys_stats_table) {
+		graph->stats_def = ins_node_create(INS_DIRECT,
+						   dict_sys->sys_stats, heap);
+		graph->stats_def->common.parent = graph;
+	}
+
+	graph->commit_node = commit_node_create(heap);
+	graph->commit_node->common.parent = graph;
+
+	return(graph);
+}
+
+/*********************************************************************//**
+Creates a graph for inserting the SYS_STATS rows of an index: an index
+node of type QUE_NODE_INSERT_STATS with a SYS_STATS insert node and a
+commit node. The SYS_INDEXES and SYS_FIELDS insert nodes are left NULL.
+@return	own: insert stats node */
+UNIV_INTERN
+ind_node_t*
+ind_insert_stats_graph_create(
+/*==========================*/
+	dict_index_t*	index,	/*!< in: index whose stats rows are
+				to be inserted */
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	ind_node_t*	graph;
+
+	graph = mem_heap_alloc(heap, sizeof(*graph));
+
+	graph->common.type = QUE_NODE_INSERT_STATS;
+	graph->index = index;
+	graph->state = INDEX_BUILD_STATS_COLS;
+	graph->page_no = FIL_NULL;
+
+	/* The node owns a private heap, separate from the graph heap */
+	graph->heap = mem_heap_create(256);
+
+	/* Nothing is inserted into SYS_INDEXES or SYS_FIELDS */
+	graph->ind_def = NULL;
+	graph->field_def = NULL;
+
+	graph->stats_def = ins_node_create(INS_DIRECT,
+					   dict_sys->sys_stats, heap);
+	graph->stats_def->common.parent = graph;
+	graph->stats_no = 0;
+
+	graph->commit_node = commit_node_create(heap);
+	graph->commit_node->common.parent = graph;
+
+	return(graph);
+}
+
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+Each call advances the node's state machine by one step: the table
+definition row, then one row per column, and finally the addition of the
+table to the dictionary cache.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	tab_node_t*	node;
+	trx_t*		trx;
+	ulint		err	= DB_ERROR;
+
+	ut_ad(thr);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	trx = thr_get_trx(thr);
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_TABLE);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		/* Control arrived from the parent: start from scratch */
+		node->state = TABLE_BUILD_TABLE_DEF;
+	}
+
+	if (node->state == TABLE_BUILD_TABLE_DEF) {
+
+		/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+		err = dict_build_table_def_step(thr, node);
+
+		if (err == DB_SUCCESS) {
+			node->state = TABLE_BUILD_COL_DEF;
+			node->col_no = 0;
+
+			/* Run the SYS_TABLES insert next */
+			thr->run_node = node->tab_def;
+
+			return(thr);
+		}
+
+		goto function_exit;
+	}
+
+	if (node->state == TABLE_BUILD_COL_DEF) {
+
+		if (node->col_no >= (node->table)->n_def) {
+			/* All column rows have been built */
+			node->state = TABLE_COMMIT_WORK;
+		} else {
+			err = dict_build_col_def_step(node);
+
+			if (err == DB_SUCCESS) {
+				node->col_no++;
+
+				/* Run the SYS_COLUMNS insert next */
+				thr->run_node = node->col_def;
+
+				return(thr);
+			}
+
+			goto function_exit;
+		}
+	}
+
+	if (node->state == TABLE_COMMIT_WORK) {
+
+		/* Table was correctly defined: do NOT commit the transaction
+		(CREATE TABLE does NOT do an implicit commit of the current
+		transaction) */
+
+		node->state = TABLE_ADD_TO_CACHE;
+	}
+
+	if (node->state == TABLE_ADD_TO_CACHE) {
+
+		dict_table_add_to_cache(node->table, node->heap);
+
+		err = DB_SUCCESS;
+	}
+
+function_exit:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* A lock wait and an SQL error both stop the thread here */
+
+		return(NULL);
+	}
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs. Each call advances the node's state machine: the SYS_INDEXES row,
+one SYS_FIELDS row per index field, the addition of the index to the
+dictionary cache, optionally one SYS_STATS row per stats_no value, and
+finally the creation of the index tree. Whenever a row has been built the
+function returns with thr->run_node set to the matching insert node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ind_node_t*	node;
+	ulint		err	= DB_ERROR;
+	trx_t*		trx;
+
+	ut_ad(thr);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	trx = thr_get_trx(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_CREATE_INDEX);
+
+	/* Control arrived from the parent node: start from the beginning */
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = INDEX_BUILD_INDEX_DEF;
+	}
+
+	if (node->state == INDEX_BUILD_INDEX_DEF) {
+		/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+		err = dict_build_index_def_step(thr, node);
+
+		if (err != DB_SUCCESS) {
+
+			goto function_exit;
+		}
+
+		node->state = INDEX_BUILD_FIELD_DEF;
+		node->field_no = 0;
+		node->stats_no = 0;
+
+		/* Run the SYS_INDEXES insert next */
+		thr->run_node = node->ind_def;
+
+		return(thr);
+	}
+
+	if (node->state == INDEX_BUILD_FIELD_DEF) {
+
+		if (node->field_no < (node->index)->n_fields) {
+
+			err = dict_build_field_def_step(node);
+
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+
+			node->field_no++;
+
+			/* Run the SYS_FIELDS insert next */
+			thr->run_node = node->field_def;
+
+			return(thr);
+		} else {
+			node->state = INDEX_ADD_TO_CACHE;
+		}
+	}
+
+	if (node->state == INDEX_ADD_TO_CACHE) {
+
+		/* Save the id: node->index is looked up again by id
+		after the call below */
+		dulint	index_id = node->index->id;
+
+		err = dict_index_add_to_cache(
+			node->table, node->index, FIL_NULL,
+			trx_is_strict(trx)
+			|| dict_table_get_format(node->table)
+			>= DICT_TF_FORMAT_ZIP);
+
+		node->index = dict_index_get_if_in_cache_low(index_id);
+		/* The index must be in the cache exactly when the
+		addition succeeded */
+		ut_a(!node->index == (err != DB_SUCCESS));
+
+		if (err != DB_SUCCESS) {
+
+			goto function_exit;
+		}
+
+		/* SYS_STATS rows are built only when the stats table is in
+		use and the DICT_TF2_TEMPORARY flag is not set on the table */
+		if (srv_use_sys_stats_table
+		    && !((node->table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
+			node->state = INDEX_BUILD_STATS_COLS;
+		} else {
+			node->state = INDEX_CREATE_INDEX_TREE;
+		}
+	}
+	if (node->state == INDEX_BUILD_STATS_COLS) {
+		/* The bound is inclusive: rows are built for stats_no
+		values 0 .. dict_index_get_n_unique() */
+		if (node->stats_no <= dict_index_get_n_unique(node->index)) {
+
+			err = dict_build_stats_def_step(node);
+
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+
+			node->stats_no++;
+
+			/* Run the SYS_STATS insert next */
+			thr->run_node = node->stats_def;
+
+			return(thr);
+		} else {
+			node->state = INDEX_CREATE_INDEX_TREE;
+		}
+	}
+
+	if (node->state == INDEX_CREATE_INDEX_TREE) {
+
+		err = dict_create_index_tree_step(node);
+
+		if (err != DB_SUCCESS) {
+			/* Undo the cache addition made above */
+			dict_index_remove_from_cache(node->table, node->index);
+			node->index = NULL;
+
+			goto function_exit;
+		}
+
+		node->index->page = node->page_no;
+		node->state = INDEX_COMMIT_WORK;
+	}
+
+	if (node->state == INDEX_COMMIT_WORK) {
+
+		/* Index was correctly defined: do NOT commit the transaction
+		(CREATE INDEX does NOT currently do an implicit commit of
+		the current transaction) */
+
+		/* NOTE(review): the state is set back to
+		INDEX_CREATE_INDEX_TREE rather than to a terminal state;
+		confirm this is intended. An entry from the parent node
+		resets the state to INDEX_BUILD_INDEX_DEF in any case. */
+		node->state = INDEX_CREATE_INDEX_TREE;
+
+		/* thr->run_node = node->commit_node;
+
+		return(thr); */
+	}
+
+function_exit:
+	trx->error_state = err;
+
+	if (err == DB_SUCCESS) {
+		/* Ok: do nothing */
+
+	} else if (err == DB_LOCK_WAIT) {
+
+		return(NULL);
+	} else {
+		/* SQL error detected */
+
+		return(NULL);
+	}
+
+	/* Success: hand control back to the parent node */
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/****************************************************************//**
+Inserts the SYS_STATS rows of an index. This is a high-level function
+used in SQL execution graphs. Each call builds the row for the current
+node->stats_no and returns with thr->run_node set to the SYS_STATS insert
+node; once stats_no exceeds dict_index_get_n_unique() the step completes
+and control returns to the parent node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_insert_stats_step(
+/*===================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ind_node_t*	node;
+	ulint		err	= DB_ERROR;
+	trx_t*		trx;
+
+	ut_ad(thr);
+
+	trx = thr_get_trx(thr);
+
+	node = thr->run_node;
+
+	/* Control arrived from the parent node: start from the beginning */
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = INDEX_BUILD_STATS_COLS;
+	}
+
+	if (node->state == INDEX_BUILD_STATS_COLS) {
+		if (node->stats_no <= dict_index_get_n_unique(node->index)) {
+
+			err = dict_build_stats_def_step(node);
+
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+
+			node->stats_no++;
+
+			/* Run the SYS_STATS insert next */
+			thr->run_node = node->stats_def;
+
+			return(thr);
+		} else {
+			node->state = INDEX_COMMIT_WORK;
+		}
+	}
+
+	if (node->state == INDEX_COMMIT_WORK) {
+
+		/* do not commit transaction here for now */
+
+		/* All rows were built without error. Without this
+		assignment err would keep its initial DB_ERROR value and
+		a fully successful run would be reported as a failure. */
+		err = DB_SUCCESS;
+	}
+
+function_exit:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+
+		return(NULL);
+	}
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form. Tables that exist when the function returns have
+n_mysql_handles_opened set to 1 to pin them.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void)
+/*================================================*/
+{
+	dict_table_t*	table1;
+	dict_table_t*	table2;
+	ulint		error;
+	trx_t*		trx;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	table1 = dict_table_get_low("SYS_FOREIGN");
+	table2 = dict_table_get_low("SYS_FOREIGN_COLS");
+
+	if (table1 && table2
+	    && UT_LIST_GET_LEN(table1->indexes) == 3
+	    && UT_LIST_GET_LEN(table2->indexes) == 1) {
+
+		/* Foreign constraint system tables have already been
+		created, and they are ok */
+
+		table1->n_mysql_handles_opened = 1; /* for pin */
+		table2->n_mysql_handles_opened = 1; /* for pin */
+
+		mutex_exit(&(dict_sys->mutex));
+
+		return(DB_SUCCESS);
+	}
+
+	mutex_exit(&(dict_sys->mutex));
+
+	trx = trx_allocate_for_mysql();
+
+	trx->op_info = "creating foreign key sys tables";
+
+	row_mysql_lock_data_dictionary(trx);
+
+	if (table1) {
+		fprintf(stderr,
+			"InnoDB: dropping incompletely created"
+			" SYS_FOREIGN table\n");
+		row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+	}
+
+	if (table2) {
+		fprintf(stderr,
+			"InnoDB: dropping incompletely created"
+			" SYS_FOREIGN_COLS table\n");
+		row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+	}
+
+	fprintf(stderr,
+		"InnoDB: Creating foreign key constraint system tables\n");
+
+	/* NOTE: in dict_load_foreigns we use the fact that
+	there are 2 secondary indexes on SYS_FOREIGN, and they
+	are defined just like below */
+
+	/* NOTE: when designing InnoDB's foreign key support in 2001, we made
+	an error and made the table names and the foreign key id of type
+	'CHAR' (internally, really a VARCHAR). We should have made the type
+	VARBINARY, like in other InnoDB system tables, to get a clean
+	design. */
+
+	error = que_eval_sql(NULL,
+			     "PROCEDURE CREATE_FOREIGN_SYS_TABLES_PROC () IS\n"
+			     "BEGIN\n"
+			     "CREATE TABLE\n"
+			     "SYS_FOREIGN(ID CHAR, FOR_NAME CHAR,"
+			     " REF_NAME CHAR, N_COLS INT);\n"
+			     "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+			     " ON SYS_FOREIGN (ID);\n"
+			     "CREATE INDEX FOR_IND"
+			     " ON SYS_FOREIGN (FOR_NAME);\n"
+			     "CREATE INDEX REF_IND"
+			     " ON SYS_FOREIGN (REF_NAME);\n"
+			     "CREATE TABLE\n"
+			     "SYS_FOREIGN_COLS(ID CHAR, POS INT,"
+			     " FOR_COL_NAME CHAR, REF_COL_NAME CHAR);\n"
+			     "CREATE UNIQUE CLUSTERED INDEX ID_IND"
+			     " ON SYS_FOREIGN_COLS (ID, POS);\n"
+			     "END;\n"
+			     , FALSE, trx);
+
+	if (error != DB_SUCCESS) {
+		fprintf(stderr, "InnoDB: error %lu in creation\n",
+			(ulong) error);
+
+		ut_a(error == DB_OUT_OF_FILE_SPACE
+		     || error == DB_TOO_MANY_CONCURRENT_TRXS);
+
+		fprintf(stderr,
+			"InnoDB: creation failed\n"
+			"InnoDB: tablespace is full\n"
+			"InnoDB: dropping incompletely created"
+			" SYS_FOREIGN tables\n");
+
+		row_drop_table_for_mysql("SYS_FOREIGN", trx, TRUE);
+		row_drop_table_for_mysql("SYS_FOREIGN_COLS", trx, TRUE);
+
+		error = DB_MUST_GET_MORE_FILE_SPACE;
+	}
+
+	trx_commit_for_mysql(trx);
+
+	table1 = dict_table_get_low("SYS_FOREIGN");
+	table2 = dict_table_get_low("SYS_FOREIGN_COLS");
+
+	/* If the creation failed, the tables were dropped above and the
+	lookups may return NULL: pin only the tables that exist */
+	if (table1) {
+		table1->n_mysql_handles_opened = 1; /* for pin */
+	}
+
+	if (table2) {
+		table2->n_mysql_handles_opened = 1; /* for pin */
+	}
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_mysql(trx);
+
+	if (error == DB_SUCCESS) {
+		fprintf(stderr,
+			"InnoDB: Foreign key constraint system tables"
+			" created\n");
+	}
+
+	return(error);
+}
+
+/****************************************************************//**
+Evaluate the given foreign key SQL statement. On failure a diagnostic is
+written to dict_foreign_err_file under dict_foreign_err_mutex; for errors
+other than DB_DUPLICATE_KEY a message is also printed to stderr.
+@return	error code or DB_SUCCESS */
+static
+ulint
+dict_foreign_eval_sql(
+/*==================*/
+	pars_info_t*	info,	/*!< in: info struct, or NULL */
+	const char*	sql,	/*!< in: SQL string to evaluate */
+	dict_table_t*	table,	/*!< in: table */
+	dict_foreign_t*	foreign,/*!< in: foreign */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	FILE*		err_file	= dict_foreign_err_file;
+	ulint		error;
+
+	error = que_eval_sql(info, sql, FALSE, trx);
+
+	if (error == DB_SUCCESS) {
+
+		return(DB_SUCCESS);
+	}
+
+	if (error == DB_DUPLICATE_KEY) {
+		/* A constraint with the same id already exists */
+		mutex_enter(&dict_foreign_err_mutex);
+
+		rewind(err_file);
+		ut_print_timestamp(err_file);
+		fputs(" Error in foreign key constraint creation for table ",
+		      err_file);
+		ut_print_name(err_file, trx, TRUE, table->name);
+		fputs(".\nA foreign key constraint of name ", err_file);
+		ut_print_name(err_file, trx, TRUE, foreign->id);
+		fputs("\nalready exists."
+		      " (Note that internally InnoDB adds 'databasename'\n"
+		      "in front of the user-defined constraint name.)\n"
+		      "Note that InnoDB's FOREIGN KEY system tables store\n"
+		      "constraint names as case-insensitive, with the\n"
+		      "MySQL standard latin1_swedish_ci collation. If you\n"
+		      "create tables or databases whose names differ only in\n"
+		      "the character case, then collisions in constraint\n"
+		      "names can occur. Workaround: name your constraints\n"
+		      "explicitly with unique names.\n",
+		      err_file);
+
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(error);
+	}
+
+	/* Any other failure: report to stderr and to the error file */
+	fprintf(stderr,
+		"InnoDB: Foreign key constraint creation failed:\n"
+		"InnoDB: internal error number %lu\n", (ulong) error);
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	ut_print_timestamp(err_file);
+	fputs(" Internal error in foreign key constraint creation"
+	      " for table ", err_file);
+	ut_print_name(err_file, trx, TRUE, table->name);
+	fputs(".\n"
+	      "See the MySQL .err log in the datadir"
+	      " for more information.\n", err_file);
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	return(error);
+}
+
+/********************************************************************//**
+Add a single foreign key field definition to the data dictionary tables in
+the database, by inserting one row into SYS_FOREIGN_COLS.
+@return	error code or DB_SUCCESS */
+static
+ulint
+dict_create_add_foreign_field_to_dictionary(
+/*========================================*/
+	ulint		field_nr,	/*!< in: foreign field number */
+	dict_table_t*	table,		/*!< in: table */
+	dict_foreign_t*	foreign,	/*!< in: foreign */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	static const char	insert_sql[] =
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"INSERT INTO SYS_FOREIGN_COLS VALUES"
+		"(:id, :pos, :for_col_name, :ref_col_name);\n"
+		"END;\n";
+	pars_info_t*		binds;
+
+	binds = pars_info_create();
+
+	/* Bind the four column values of the new row */
+	pars_info_add_str_literal(binds, "id", foreign->id);
+	pars_info_add_int4_literal(binds, "pos", field_nr);
+	pars_info_add_str_literal(binds, "for_col_name",
+				  foreign->foreign_col_names[field_nr]);
+	pars_info_add_str_literal(binds, "ref_col_name",
+				  foreign->referenced_col_names[field_nr]);
+
+	return(dict_foreign_eval_sql(binds, insert_sql, table, foreign, trx));
+}
+
+/********************************************************************//**
+Add a single foreign key definition to the data dictionary tables in the
+database. We also generate names to constraints that were not named by the
+user. A generated constraint has a name of the format
+databasename/tablename_ibfk_NUMBER, where the numbers start from 1, and
+are given locally for this table, that is, the number is not global, as in
+the old format constraints < 4.0.18 it used to be.
+@return	error code or DB_SUCCESS */
+static
+ulint
+dict_create_add_foreign_to_dictionary(
+/*==================================*/
+	ulint*		id_nr,	/*!< in/out: number to use in id generation;
+				incremented if used */
+	dict_table_t*	table,	/*!< in: table */
+	dict_foreign_t*	foreign,/*!< in: foreign */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	pars_info_t*	binds;
+	ulint		field_nr;
+	ulint		error;
+
+	binds = pars_info_create();
+
+	if (!foreign->id) {
+		/* Generate a new constraint id */
+		char*	new_id;
+
+		new_id = mem_heap_alloc(foreign->heap,
+					strlen(table->name) + 20);
+		/* no overflow if number < 1e13 */
+		sprintf(new_id, "%s_ibfk_%lu", table->name, (ulong) *id_nr);
+		(*id_nr)++;
+
+		foreign->id = new_id;
+	}
+
+	/* Row for SYS_FOREIGN: the constraint type is packed above
+	bit 24 of N_COLS */
+	pars_info_add_str_literal(binds, "id", foreign->id);
+	pars_info_add_str_literal(binds, "for_name", table->name);
+	pars_info_add_str_literal(binds, "ref_name",
+				  foreign->referenced_table_name);
+	pars_info_add_int4_literal(binds, "n_cols",
+				   foreign->n_fields + (foreign->type << 24));
+
+	error = dict_foreign_eval_sql(binds,
+				      "PROCEDURE P () IS\n"
+				      "BEGIN\n"
+				      "INSERT INTO SYS_FOREIGN VALUES"
+				      "(:id, :for_name, :ref_name, :n_cols);\n"
+				      "END;\n"
+				      , table, foreign, trx);
+
+	/* One SYS_FOREIGN_COLS row per field; stop at the first error */
+	for (field_nr = 0;
+	     error == DB_SUCCESS && field_nr < foreign->n_fields;
+	     field_nr++) {
+
+		error = dict_create_add_foreign_field_to_dictionary(
+			field_nr, table, foreign, trx);
+	}
+
+	if (error != DB_SUCCESS) {
+
+		return(error);
+	}
+
+	return(dict_foreign_eval_sql(NULL,
+				     "PROCEDURE P () IS\n"
+				     "BEGIN\n"
+				     "COMMIT WORK;\n"
+				     "END;\n"
+				     , table, foreign, trx));
+}
+
+/********************************************************************//**
+Adds foreign key definitions to data dictionary tables in the database.
+Every constraint in table->foreign_list is added in list order; the
+first failure stops the processing.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+	ulint		start_id,/*!< in: if we are actually doing ALTER TABLE
+				ADD CONSTRAINT, we want to generate constraint
+				numbers which are bigger than in the table so
+				far; we number the constraints from
+				start_id + 1 up; start_id should be set to 0 if
+				we are creating a new table, or if the table
+				so far has no constraints for which the name
+				was generated here */
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	dict_foreign_t*	cur;
+	ulint		next_nr	= start_id + 1;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	if (!dict_table_get_low("SYS_FOREIGN")) {
+		fprintf(stderr,
+			"InnoDB: table SYS_FOREIGN not found"
+			" in internal data dictionary\n");
+
+		return(DB_ERROR);
+	}
+
+	cur = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (cur) {
+		ulint	error;
+
+		error = dict_create_add_foreign_to_dictionary(&next_nr, table,
+							      cur, trx);
+
+		if (error != DB_SUCCESS) {
+
+			return(error);
+		}
+
+		cur = UT_LIST_GET_NEXT(foreign_list, cur);
+	}
+
+	return(DB_SUCCESS);
+}
diff --git a/storage/xtradb/dict/dict0dict.c b/storage/xtradb/dict/dict0dict.c
new file mode 100644
index 00000000000..1d0517f5cc7
--- /dev/null
+++ b/storage/xtradb/dict/dict0dict.c
@@ -0,0 +1,5347 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0dict.c
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0dict.h"
+
+#ifdef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+/** Dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records.
+Defined outside the UNIV_HOTBACKUP guard, i.e. in every build flavour. */
+UNIV_INTERN dict_index_t*	dict_ind_redundant;
+/** Dummy index for ROW_FORMAT=COMPACT supremum and infimum records.
+Defined outside the UNIV_HOTBACKUP guard, i.e. in every build flavour. */
+UNIV_INTERN dict_index_t*	dict_ind_compact;
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0boot.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "page0zip.h"
+#include "page0page.h"
+#include "pars0pars.h"
+#include "pars0sym.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "row0merge.h"
+#include "m_ctype.h" /* my_isspace() */
+#include "ha_prototypes.h" /* innobase_strcasecmp() */
+#include "srv0start.h" /* SRV_LOG_SPACE_FIRST_ID */
+
+#include <ctype.h>
+
+/** The dictionary system; allocated and initialized in dict_init() */
+UNIV_INTERN dict_sys_t*	dict_sys	= NULL;
+
+/** @brief the data dictionary rw-latch protecting dict_sys
+
+Table create, drop, etc. reserve this in X-mode; implicit or
+background operations (purge, rollback, foreign key checks) reserve this
+in S-mode. We cannot trust that MySQL protects implicit or background
+operations from a table drop, since MySQL does not know of them; therefore
+we need this latch. NOTE: a transaction which reserves this must keep book
+on the mode in trx_struct::dict_operation_lock_mode */
+UNIV_INTERN rw_lock_t	dict_operation_lock;
+
+#define	DICT_HEAP_SIZE		100	/*!< initial memory heap size when
+					creating a table or index object */
+#define DICT_POOL_PER_TABLE_HASH 512	/*!< buffer pool max size per table
+					hash table fixed size in bytes;
+					dict_init() divides the buffer pool
+					size by this times UNIV_WORD_SIZE to
+					size both table hash tables */
+#define DICT_POOL_PER_VARYING	4	/*!< buffer pool max size per data
+					dictionary varying size in bytes */
+
+/** Identifies generated InnoDB foreign key names; compared with a length
+of (sizeof dict_ibfk) - 1, i.e. without the terminating NUL */
+static char	dict_ibfk[] = "_ibfk_";
+
+/** Array of mutexes protecting dict_index_t::stat_n_diff_key_vals[];
+an index is mapped to one of them by GET_INDEX_STAT_MUTEX() */
+#define DICT_INDEX_STAT_MUTEX_SIZE	32
+static mutex_t	dict_index_stat_mutex[DICT_INDEX_STAT_MUTEX_SIZE];
+
+/*******************************************************************//**
+Tries to find column names for the index and sets the col field of the
+index.
+@return TRUE if the column names were found */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index);	/*!< in: index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return	own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index);	/*!< in: user representation of
+					a clustered index */
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return	own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index);	/*!< in: user representation of
+					a non-clustered index */
+/**********************************************************************//**
+Removes a foreign constraint struct from the dictionary cache. */
+static
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+	dict_foreign_t*	foreign);	/*!< in, own: foreign constraint */
+/**********************************************************************//**
+Prints the data of a column. */
+static
+void
+dict_col_print_low(
+/*===============*/
+	const dict_table_t*	table,	/*!< in: table */
+	const dict_col_t*	col);	/*!< in: column */
+/**********************************************************************//**
+Prints the data of an index. */
+static
+void
+dict_index_print_low(
+/*=================*/
+	dict_index_t*	index);	/*!< in: index */
+/**********************************************************************//**
+Prints the data of a field. */
+static
+void
+dict_field_print_low(
+/*=================*/
+	const dict_field_t*	field);	/*!< in: field */
+/*********************************************************************//**
+Frees a foreign key struct. */
+static
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign);	/*!< in, own: foreign key struct */
+
+/** Stream for storing detailed information about the latest foreign key
+and unique key errors; dict_init() opens it with os_file_create_tmpfile() */
+UNIV_INTERN FILE*	dict_foreign_err_file		= NULL;
+/** Mutex protecting the foreign and unique error buffers; created in
+dict_init() */
+UNIV_INTERN mutex_t	dict_foreign_err_mutex;
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case.
+This is a thin wrapper: the string is handed unchanged to
+innobase_casedn_str(), which performs the actual conversion. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+	char*	a)	/*!< in/out: string to put in lower case */
+{
+	innobase_casedn_str(a);
+}
+
+/********************************************************************//**
+Checks if the database name in two table names is the same. The names
+are compared character by character up to and including the first '/'.
+ut_a() asserts that the common prefix does not reach the end of name1
+before a '/' has been seen.
+@return	TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+	const char*	name1,	/*!< in: table name in the form
+				dbname '/' tablename */
+	const char*	name2)	/*!< in: table name in the form
+				dbname '/' tablename */
+{
+	while (*name1 == *name2) {
+		if (*name1 == '/') {
+			/* Everything up to the separator matched */
+
+			return(TRUE);
+		}
+
+		ut_a(*name1); /* the names must contain '/' */
+
+		name1++;
+		name2++;
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+The returned pointer points into the string passed in; ut_a() asserts
+that the name contains a '/'.
+@return	table name */
+UNIV_INTERN
+const char*
+dict_remove_db_name(
+/*================*/
+	const char*	name)	/*!< in: table name in the form
+				dbname '/' tablename */
+{
+	const char*	slash;
+
+	slash = strchr(name, '/');
+
+	ut_a(slash);
+
+	/* The table name starts right after the first separator */
+	return(slash + 1);
+}
+
+/********************************************************************//**
+Get the database name length in a table name, i.e. the number of
+characters in front of the first '/'. ut_a() asserts that the name
+contains a '/'.
+@return	database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+	const char*	name)	/*!< in: table name in the form
+				dbname '/' tablename */
+{
+	const char*	slash = strchr(name, '/');
+
+	ut_a(slash);
+
+	return((ulint) (slash - name));
+}
+
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL.
+Enters dict_sys->mutex; the counterpart that releases it is
+dict_mutex_exit_for_mysql(). */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void)
+/*============================*/
+{
+	mutex_enter(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL.
+Exits dict_sys->mutex; the counterpart that reserves it is
+dict_mutex_enter_for_mysql(). */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void)
+/*===========================*/
+{
+	mutex_exit(&(dict_sys->mutex));
+}
+
+/** Get the mutex that protects index->stat_n_diff_key_vals[].
+The mutex is picked by folding index->id and reducing the result modulo
+DICT_INDEX_STAT_MUTEX_SIZE, so several indexes can map to the same mutex.
+The argument is parenthesized so that any pointer expression may be passed.
+@param index	in: pointer to the index; evaluated exactly once
+@return pointer to an element of dict_index_stat_mutex[] */
+#define GET_INDEX_STAT_MUTEX(index) \
+	(&dict_index_stat_mutex[ut_fold_dulint((index)->id) \
+				% DICT_INDEX_STAT_MUTEX_SIZE])
+
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index.
+The mutex is selected with GET_INDEX_STAT_MUTEX(). */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	/* Sanity assertions: a valid index object that is in the cache
+	and is not marked to be dropped */
+	ut_ad(index != NULL);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+	ut_ad(!index->to_be_dropped);
+
+	mutex_enter(GET_INDEX_STAT_MUTEX(index));
+}
+
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[].
+The mutex is selected with GET_INDEX_STAT_MUTEX(), exactly as in
+dict_index_stat_mutex_enter(). */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	/* Same sanity assertions as when the mutex was entered */
+	ut_ad(index != NULL);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+	ut_ad(!index->to_be_dropped);
+
+	mutex_exit(GET_INDEX_STAT_MUTEX(index));
+}
+
+/********************************************************************//**
+Decrements the count of open MySQL handles to a table. If the caller
+does not already hold the data dictionary mutex, it is acquired for the
+duration of the update. ut_a() asserts that the count is positive
+before it is decremented. */
+UNIV_INTERN
+void
+dict_table_decrement_handle_count(
+/*==============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
+{
+	/* TRUE when this function must take and release the mutex itself */
+	const ibool	own_latching = !dict_locked;
+
+	if (own_latching) {
+		mutex_enter(&dict_sys->mutex);
+	}
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+	ut_a(table->n_mysql_handles_opened > 0);
+
+	table->n_mysql_handles_opened--;
+
+	if (own_latching) {
+		mutex_exit(&dict_sys->mutex);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Returns a column's name. The names are read from table->col_names, which
+is walked as a sequence of NUL-terminated strings stored back to back.
+@return column name, or NULL if table->col_names is NULL. NOTE: not
+guaranteed to stay valid if table is modified in any way (columns added,
+etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			col_nr)	/*!< in: column number */
+{
+	const char*	name;
+	ulint		remaining;
+
+	ut_ad(table);
+	ut_ad(col_nr < table->n_def);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	name = table->col_names;
+
+	if (name == NULL) {
+
+		return(NULL);
+	}
+
+	/* Skip the names of the col_nr columns that come first */
+	for (remaining = col_nr; remaining > 0; remaining--) {
+		name += strlen(name) + 1;
+	}
+
+	return(name);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Acquire the autoinc lock.
+Enters table->autoinc_mutex; it is released with
+dict_table_autoinc_unlock(). */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	mutex_enter(&table->autoinc_mutex);
+}
+
+/********************************************************************//**
+Unconditionally set the autoinc counter.
+The caller is expected to hold table->autoinc_mutex (asserted with
+ut_ad). */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ib_uint64_t	value)	/*!< in: next value to assign to a row */
+{
+	ut_ad(mutex_own(&table->autoinc_mutex));
+
+	table->autoinc = value;
+}
+
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+The caller is expected to hold table->autoinc_mutex (asserted with
+ut_ad).
+@return	value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(mutex_own(&table->autoinc_mutex));
+
+	return(table->autoinc);
+}
+
+/********************************************************************//**
+Raises the autoinc counter to the supplied value when that value is
+greater than the current counter; otherwise the counter is left as it is.
+The caller is expected to hold table->autoinc_mutex (asserted with
+ut_ad). */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ib_uint64_t	value)	/*!< in: value which was assigned to a row */
+{
+	ut_ad(mutex_own(&table->autoinc_mutex));
+
+	if (table->autoinc < value) {
+		table->autoinc = value;
+	}
+}
+
+/********************************************************************//**
+Release the autoinc lock.
+Exits table->autoinc_mutex, which dict_table_autoinc_lock() enters. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	mutex_exit(&table->autoinc_mutex);
+}
+
+/**********************************************************************//**
+Searches the indexes of the given table, in list order, for the one
+whose id equals the given index id.
+NOTE that we do not reserve the dictionary mutex.
+@return	index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+	dict_table_t*	table,	/*!< in: table */
+	dulint		id)	/*!< in: index id */
+{
+	dict_index_t*	candidate;
+
+	for (candidate = dict_table_get_first_index(table);
+	     candidate != NULL;
+	     candidate = dict_table_get_next_index(candidate)) {
+
+		if (ut_dulint_cmp(id, candidate->id) == 0) {
+			/* Found */
+
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Looks for column n in an index. For a clustered index the position is
+obtained with dict_col_get_clust_pos(); for any other index the fields
+are scanned and only a field that covers the whole column
+(prefix_len == 0) counts as a match.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n)	/*!< in: column number */
+{
+	const dict_col_t*	wanted;
+	ulint			n_fields;
+	ulint			i;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	wanted = dict_table_get_nth_col(index->table, n);
+
+	if (dict_index_is_clust(index)) {
+
+		return(dict_col_get_clust_pos(wanted, index));
+	}
+
+	n_fields = dict_index_get_n_fields(index);
+
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(index, i);
+
+		if (field->prefix_len == 0 && field->col == wanted) {
+
+			return(i);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+A clustered index is reported to contain every column without looking
+at its fields; for other indexes the fields are scanned for the column,
+whatever their prefix length.
+@return	TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n)	/*!< in: column number */
+{
+	const dict_col_t*	wanted;
+	ulint			n_fields;
+	ulint			i;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	if (dict_index_is_clust(index)) {
+
+		return(TRUE);
+	}
+
+	wanted = dict_table_get_nth_col(index->table, n);
+
+	n_fields = dict_index_get_n_fields(index);
+
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(index, i);
+
+		if (field->col == wanted) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix at least as long
+as the prefix of the column in index2. That is, we must be able to construct
+the prefix in index2 from the prefix in index. A complete column in index2
+(prefix_len == 0) is matched only by a complete column in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+	const dict_index_t*	index,	/*!< in: index from which to search */
+	const dict_index_t*	index2,	/*!< in: index */
+	ulint			n)	/*!< in: field number in index2 */
+{
+	const dict_field_t*	field;
+	const dict_field_t*	field2;
+	ulint			n_fields;
+	ulint			pos;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	/* index2 is dereferenced below as well, so it gets the same
+	sanity assertions as index */
+	ut_ad(index2);
+	ut_ad(index2->magic_n == DICT_INDEX_MAGIC_N);
+
+	field2 = dict_index_get_nth_field(index2, n);
+
+	n_fields = dict_index_get_n_fields(index);
+
+	for (pos = 0; pos < n_fields; pos++) {
+		field = dict_index_get_nth_field(index, pos);
+
+		/* Same column, and either the whole column is indexed or
+		the indexed prefix covers the non-empty prefix of field2 */
+		if (field->col == field2->col
+		    && (field->prefix_len == 0
+			|| (field->prefix_len >= field2->prefix_len
+			    && field2->prefix_len != 0))) {
+
+			return(pos);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id. When the transaction does not
+hold the dictionary operation latch in X mode, the dictionary mutex is
+taken around the lookup and dict_table_LRU_trim() is called on the
+result before the mutex is released.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+	dulint	table_id,	/*!< in: table id */
+	trx_t*	trx)		/*!< in: transaction handle */
+{
+	dict_table_t*	found;
+
+	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+
+		mutex_enter(&(dict_sys->mutex));
+
+		found = dict_table_get_on_id_low(table_id);
+
+		dict_table_LRU_trim(found);
+
+		mutex_exit(&(dict_sys->mutex));
+
+		return(found);
+	}
+
+	/* Note: An X latch implies that the transaction
+	already owns the dictionary mutex. */
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	return(dict_table_get_on_id_low(table_id));
+}
+
+/********************************************************************//**
+Looks for column n position in the clustered index. The lookup is
+delegated to dict_index_get_nth_col_pos() on the first index of the
+table.
+@return	position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n)	/*!< in: column number */
+{
+	const dict_index_t*	first_index;
+
+	first_index = dict_table_get_first_index(table);
+
+	return(dict_index_get_nth_col_pos(first_index, n));
+}
+
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Only the first dict_index_get_n_unique() fields of the first index
+are examined, and the prefix length of a field is ignored, so column
+prefixes are treated like whole columns.
+@return	TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n)	/*!< in: column number */
+{
+	const dict_index_t*	first_index;
+	const dict_col_t*	wanted;
+	ulint			n_unique;
+	ulint			i;
+
+	ut_ad(table);
+
+	wanted = dict_table_get_nth_col(table, n);
+
+	first_index = dict_table_get_first_index(table);
+
+	n_unique = dict_index_get_n_unique(first_index);
+
+	for (i = 0; i < n_unique; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(first_index, i);
+
+		if (field->col == wanted) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Inits the data dictionary module: allocates dict_sys and creates its mutex,
+the two table hash tables and the table LRU list, then creates
+dict_operation_lock, the foreign key error stream with its mutex, and
+the index statistics mutexes. */
+UNIV_INTERN
+void
+dict_init(void)
+/*===========*/
+{
+	int	i;
+
+	dict_sys = mem_alloc(sizeof(dict_sys_t));
+
+	mutex_create(&dict_sys->mutex, SYNC_DICT);
+
+	/* Both hash tables (by name and by id) get the same number of
+	cells, derived from the current buffer pool size */
+	dict_sys->table_hash = hash_create(buf_pool_get_curr_size()
+					   / (DICT_POOL_PER_TABLE_HASH
+					      * UNIV_WORD_SIZE));
+	dict_sys->table_id_hash = hash_create(buf_pool_get_curr_size()
+					      / (DICT_POOL_PER_TABLE_HASH
+						 * UNIV_WORD_SIZE));
+	dict_sys->size = 0;
+
+	UT_LIST_INIT(dict_sys->table_LRU);
+
+	rw_lock_create(&dict_operation_lock, SYNC_DICT_OPERATION);
+
+	/* Failing to create the temporary file is treated as fatal */
+	dict_foreign_err_file = os_file_create_tmpfile();
+	ut_a(dict_foreign_err_file);
+
+	mutex_create(&dict_foreign_err_mutex, SYNC_ANY_LATCH);
+
+	for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+		mutex_create(&dict_index_stat_mutex[i], SYNC_INDEX_TREE);
+	}
+}
+
+/**********************************************************************//**
+Returns a table object and optionally increment its MySQL open handle count.
+The lookup, the handle count update and the call to dict_table_LRU_trim()
+happen under the dictionary mutex; statistics are updated after the mutex
+has been released, and only for a table whose statistics are not yet
+initialized and which is not marked corrupt.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low is usually the
+appropriate function.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		inc_mysql_count)/*!< in: whether to increment the open
+					handle count on the table */
+{
+	dict_table_t*	found;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	found = dict_table_get_low(table_name);
+
+	if (found != NULL && inc_mysql_count) {
+		found->n_mysql_handles_opened++;
+	}
+
+	dict_table_LRU_trim(found);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	if (found == NULL) {
+
+		return(NULL);
+	}
+
+	if (!found->stat_initialized && !found->is_corrupt) {
+		/* If table->ibd_file_missing == TRUE, this will
+		print an error message and return without doing
+		anything. */
+		dict_update_statistics(found, FALSE);
+	}
+
+	return(found);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Adds system columns to a table object: DB_ROW_ID, DB_TRX_ID and
+DB_ROLL_PTR, in that order, each of type DATA_SYS and flagged
+DATA_NOT_NULL. The table must not yet be in the dictionary cache
+(asserted with ut_ad). */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	mem_heap_t*	heap)	/*!< in: temporary heap */
+{
+	ut_ad(table);
+	/* All user columns are defined; only the system columns are
+	still missing */
+	ut_ad(table->n_def == table->n_cols - DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(!table->cached);
+
+	/* NOTE: the system columns MUST be added in the following order
+	(so that they can be indexed by the numerical value of DATA_ROW_ID,
+	etc.) and as the last columns of the table memory object.
+	The clustered index will not always physically contain all
+	system columns. */
+
+	/* The compile-time checks below pin the numerical values that the
+	ordering above relies on */
+
+	dict_mem_table_add_col(table, heap, "DB_ROW_ID", DATA_SYS,
+			       DATA_ROW_ID | DATA_NOT_NULL,
+			       DATA_ROW_ID_LEN);
+#if DATA_ROW_ID != 0
+#error "DATA_ROW_ID != 0"
+#endif
+	dict_mem_table_add_col(table, heap, "DB_TRX_ID", DATA_SYS,
+			       DATA_TRX_ID | DATA_NOT_NULL,
+			       DATA_TRX_ID_LEN);
+#if DATA_TRX_ID != 1
+#error "DATA_TRX_ID != 1"
+#endif
+	dict_mem_table_add_col(table, heap, "DB_ROLL_PTR", DATA_SYS,
+			       DATA_ROLL_PTR | DATA_NOT_NULL,
+			       DATA_ROLL_PTR_LEN);
+#if DATA_ROLL_PTR != 2
+#error "DATA_ROLL_PTR != 2"
+#endif
+
+	/* This check reminds that if a new system column is added to
+	the program, it should be dealt with here */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. The system columns are added
+to the table first; then table->big_rows is computed, the table is
+inserted into the name hash and the id hash and put first in the LRU
+list, and dict_sys->size is increased. It is asserted (ut_a) that the
+cache does not already contain a table with the same name or the same id. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap)	/*!< in: temporary heap */
+{
+	ulint	fold;
+	ulint	id_fold;
+	ulint	i;
+	ulint	row_len;
+
+	/* The lower limit for what we consider a "big" row */
+#define BIG_ROW_SIZE 1024
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	dict_table_add_system_columns(table, heap);
+
+	table->cached = TRUE;
+
+	/* Hash values for the lookup by name and the lookup by id */
+	fold = ut_fold_string(table->name);
+	id_fold = ut_fold_dulint(table->id);
+
+	/* Sum up the maximum sizes of the columns, stopping as soon as
+	the BIG_ROW_SIZE threshold is known to be reached */
+	row_len = 0;
+	for (i = 0; i < table->n_def; i++) {
+		ulint	col_len = dict_col_get_max_size(
+			dict_table_get_nth_col(table, i));
+
+		row_len += col_len;
+
+		/* If we have a single unbounded field, or several gigantic
+		fields, mark the maximum row size as BIG_ROW_SIZE. */
+		if (row_len >= BIG_ROW_SIZE || col_len >= BIG_ROW_SIZE) {
+			row_len = BIG_ROW_SIZE;
+
+			break;
+		}
+	}
+
+	table->big_rows = row_len >= BIG_ROW_SIZE;
+
+	/* Look for a table with the same name: error if such exists */
+	{
+		dict_table_t*	table2;
+		HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+			    dict_table_t*, table2, ut_ad(table2->cached),
+			    ut_strcmp(table2->name, table->name) == 0);
+		ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+		/* Look for the same table pointer with a different name */
+		HASH_SEARCH_ALL(name_hash, dict_sys->table_hash,
+				dict_table_t*, table2, ut_ad(table2->cached),
+				table2 == table);
+		ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+	}
+
+	/* Look for a table with the same id: error if such exists */
+	{
+		dict_table_t*	table2;
+		HASH_SEARCH(id_hash, dict_sys->table_id_hash, id_fold,
+			    dict_table_t*, table2, ut_ad(table2->cached),
+			    ut_dulint_cmp(table2->id, table->id) == 0);
+		ut_a(table2 == NULL);
+
+#ifdef UNIV_DEBUG
+		/* Look for the same table pointer with a different id */
+		HASH_SEARCH_ALL(id_hash, dict_sys->table_id_hash,
+				dict_table_t*, table2, ut_ad(table2->cached),
+				table2 == table);
+		ut_ad(table2 == NULL);
+#endif /* UNIV_DEBUG */
+	}
+
+	/* Add table to hash table of tables */
+	HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+		    table);
+
+	/* Add table to hash table of tables based on table id */
+	HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, id_fold,
+		    table);
+	/* Add table to LRU list of tables */
+	UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+
+	/* Account for the table heap and for the name string including
+	its terminating NUL */
+	dict_sys->size += mem_heap_get_size(table->heap)
+		+ strlen(table->name) + 1;
+}
+
+/**********************************************************************//**
+Looks for an index with the given id by walking every table in the
+dictionary LRU list and every index of each table. NOTE that we do not
+reserve the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return	index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+	dulint	id)	/*!< in: index id */
+{
+	dict_table_t*	cur_table;
+	dict_index_t*	cur_index;
+
+	for (cur_table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     cur_table != NULL;
+	     cur_table = UT_LIST_GET_NEXT(table_LRU, cur_table)) {
+
+		for (cur_index = dict_table_get_first_index(cur_table);
+		     cur_index != NULL;
+		     cur_index = dict_table_get_next_index(cur_index)) {
+
+			if (ut_dulint_cmp(id, cur_index->id) == 0) {
+				/* Found */
+
+				return(cur_index);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Renames a table object in the dictionary cache. For a table with
+table->space != 0 the tablespace is renamed with fil_rename_tablespace()
+as well. Afterwards the name hash, dict_sys->size, the table_name pointer
+of every index and the foreign key constraints are brought in line with
+the new name.
+@return	TRUE if success; FALSE if the cache already contains a table
+called new_name, if the table has dir_path_of_temp_table set while
+table->space != 0, or if fil_rename_tablespace() fails */
+UNIV_INTERN
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	const char*	new_name,	/*!< in: new name */
+	ibool		rename_also_foreigns)/*!< in: in ALTER TABLE we want
+					to preserve the original table name
+					in constraints which reference it */
+{
+	dict_foreign_t*	foreign;
+	dict_index_t*	index;
+	ulint		fold;
+	char		old_name[MAX_TABLE_NAME_LEN + 1];
+
+	ut_ad(table);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* store the old/current name to an automatic variable */
+	if (strlen(table->name) + 1 <= sizeof(old_name)) {
+		memcpy(old_name, table->name, strlen(table->name) + 1);
+	} else {
+		/* The current name does not fit into old_name: report it
+		and abort with ut_error */
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: too long table name: '%s', "
+			"max length is %d\n", table->name,
+			MAX_TABLE_NAME_LEN);
+		ut_error;
+	}
+
+	fold = ut_fold_string(new_name);
+
+	/* Look for a table with the same name: error if such exists */
+	{
+		dict_table_t*	table2;
+		HASH_SEARCH(name_hash, dict_sys->table_hash, fold,
+			    dict_table_t*, table2, ut_ad(table2->cached),
+			    (ut_strcmp(table2->name, new_name) == 0));
+		if (UNIV_LIKELY_NULL(table2)) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: dictionary cache"
+			      " already contains a table ", stderr);
+			ut_print_name(stderr, NULL, TRUE, new_name);
+			fputs("\n"
+			      "InnoDB: cannot rename table ", stderr);
+			ut_print_name(stderr, NULL, TRUE, old_name);
+			putc('\n', stderr);
+			return(FALSE);
+		}
+	}
+
+	/* If the table is stored in a single-table tablespace, rename the
+	.ibd file */
+
+	if (table->space != 0) {
+		if (table->dir_path_of_temp_table != NULL) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: trying to rename a"
+			      " TEMPORARY TABLE ", stderr);
+			ut_print_name(stderr, NULL, TRUE, old_name);
+			fputs(" (", stderr);
+			ut_print_filename(stderr,
+					  table->dir_path_of_temp_table);
+			fputs(" )\n", stderr);
+			return(FALSE);
+		} else if (!fil_rename_tablespace(old_name, table->space,
+						  new_name)) {
+			return(FALSE);
+		}
+	}
+
+	/* Remove table from the hash tables of tables */
+	HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+		    ut_fold_string(old_name), table);
+
+	if (strlen(new_name) > strlen(table->name)) {
+		/* We allocate MAX_TABLE_NAME_LEN+1 bytes here to avoid
+		memory fragmentation; we assume that repeated calls of
+		ut_realloc() with the same size do not cause fragmentation */
+		ut_a(strlen(new_name) <= MAX_TABLE_NAME_LEN);
+		table->name = ut_realloc(table->name, MAX_TABLE_NAME_LEN + 1);
+	}
+	memcpy(table->name, new_name, strlen(new_name) + 1);
+
+	/* Add table to hash table of tables */
+	HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold,
+		    table);
+
+	/* Adjust the cache size by the difference of the name lengths */
+	dict_sys->size += strlen(new_name) - strlen(old_name);
+	ut_a(dict_sys->size > 0);
+
+	/* Update the table_name field in indexes */
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		index->table_name = table->name;
+
+		index = dict_table_get_next_index(index);
+	}
+
+	if (!rename_also_foreigns) {
+		/* In ALTER TABLE we think of the rename table operation
+		in the direction table -> temporary table (#sql...)
+		as dropping the table with the old name and creating
+		a new with the new name. Thus we kind of drop the
+		constraints from the dictionary cache here. The foreign key
+		constraints will be inherited to the new table from the
+		system tables through a call of dict_load_foreigns. */
+
+		/* Remove the foreign constraints from the cache */
+		foreign = UT_LIST_GET_LAST(table->foreign_list);
+
+		while (foreign != NULL) {
+			dict_foreign_remove_from_cache(foreign);
+			foreign = UT_LIST_GET_LAST(table->foreign_list);
+		}
+
+		/* Reset table field in referencing constraints */
+
+		foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+		while (foreign != NULL) {
+			foreign->referenced_table = NULL;
+			foreign->referenced_index = NULL;
+
+			foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+		}
+
+		/* Make the list of referencing constraints empty */
+
+		UT_LIST_INIT(table->referenced_list);
+
+		return(TRUE);
+	}
+
+	/* Update the table name fields in foreign constraints, and update also
+	the constraint id of new format >= 4.0.18 constraints. Note that at
+	this point we have already changed table->name to the new name. */
+
+	foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (foreign != NULL) {
+		if (ut_strlen(foreign->foreign_table_name)
+		    < ut_strlen(table->name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->foreign_table_name
+				= mem_heap_alloc(foreign->heap,
+						 ut_strlen(table->name) + 1);
+		}
+
+		strcpy(foreign->foreign_table_name, table->name);
+
+		if (strchr(foreign->id, '/')) {
+			ulint	db_len;
+			char*	old_id;
+
+			/* This is a >= 4.0.18 format id */
+
+			/* Keep a private copy of the old id: foreign->id
+			is overwritten below */
+			old_id = mem_strdup(foreign->id);
+
+			/* A generated id is the old table name followed
+			by dict_ibfk and at least one more character */
+			if (ut_strlen(foreign->id) > ut_strlen(old_name)
+			    + ((sizeof dict_ibfk) - 1)
+			    && !memcmp(foreign->id, old_name,
+				       ut_strlen(old_name))
+			    && !memcmp(foreign->id + ut_strlen(old_name),
+				       dict_ibfk, (sizeof dict_ibfk) - 1)) {
+
+				/* This is a generated >= 4.0.18 format id */
+
+				if (strlen(table->name) > strlen(old_name)) {
+					foreign->id = mem_heap_alloc(
+						foreign->heap,
+						strlen(table->name)
+						+ strlen(old_id) + 1);
+				}
+
+				/* Replace the prefix 'databasename/tablename'
+				with the new names */
+				strcpy(foreign->id, table->name);
+				strcat(foreign->id,
+				       old_id + ut_strlen(old_name));
+			} else {
+				/* This is a >= 4.0.18 format id where the user
+				gave the id name */
+
+				/* db_len counts the database name and the
+				'/' that follows it */
+				db_len = dict_get_db_name_len(table->name) + 1;
+
+				if (dict_get_db_name_len(table->name)
+				    > dict_get_db_name_len(foreign->id)) {
+
+					foreign->id = mem_heap_alloc(
+						foreign->heap,
+						db_len + strlen(old_id) + 1);
+				}
+
+				/* Replace the database prefix in id with the
+				one from table->name */
+
+				ut_memcpy(foreign->id, table->name, db_len);
+
+				strcpy(foreign->id + db_len,
+				       dict_remove_db_name(old_id));
+			}
+
+			mem_free(old_id);
+		}
+
+		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+	}
+
+	/* Update the referenced table name in the constraints that
+	reference this table */
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign != NULL) {
+		if (ut_strlen(foreign->referenced_table_name)
+		    < ut_strlen(table->name)) {
+			/* Allocate a longer name buffer;
+			TODO: store buf len to save memory */
+
+			foreign->referenced_table_name = mem_heap_alloc(
+				foreign->heap, strlen(table->name) + 1);
+		}
+
+		strcpy(foreign->referenced_table_name, table->name);
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Changes the id of a table object in the dictionary cache, re-hashing the table
+in dict_sys->table_id_hash under the new id. This is used in DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	dulint		new_id)	/*!< in: new id to set */
+{
+	ut_ad(table);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* Remove the table from the hash table of ids, folding the old id */
+
+	HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+		    ut_fold_dulint(table->id), table);
+	table->id = new_id;
+
+	/* Add the table back to the hash table, now folding the new id */
+	HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash,
+		    ut_fold_dulint(table->id), table);
+}
+
+/**********************************************************************//**
+Removes a table object from the dictionary cache and calls dict_mem_table_free(). */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table)	/*!< in, own: table */
+{
+	dict_foreign_t*	foreign;
+	dict_index_t*	index;
+	ulint		size;
+
+	ut_ad(table);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+#if 0
+	fputs("Removing table ", stderr);
+	ut_print_name(stderr, table->name, ULINT_UNDEFINED);
+	fputs(" from dictionary cache\n", stderr);
+#endif
+
+	/* Remove the foreign constraints from the cache, last one first */
+	foreign = UT_LIST_GET_LAST(table->foreign_list);
+
+	while (foreign != NULL) {
+		dict_foreign_remove_from_cache(foreign);
+		foreign = UT_LIST_GET_LAST(table->foreign_list);
+	}
+
+	/* Reset the table and index fields in referencing constraints */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign != NULL) {
+		foreign->referenced_table = NULL;
+		foreign->referenced_index = NULL;
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	/* Remove the indexes from the cache, last one first */
+	index = UT_LIST_GET_LAST(table->indexes);
+
+	while (index != NULL) {
+		dict_index_remove_from_cache(table, index);
+		index = UT_LIST_GET_LAST(table->indexes);
+	}
+
+	/* Remove table from the hash tables of tables */
+	HASH_DELETE(dict_table_t, name_hash, dict_sys->table_hash,
+		    ut_fold_string(table->name), table);
+	HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash,
+		    ut_fold_dulint(table->id), table);
+
+	/* Remove table from LRU list of tables */
+	UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+
+	size = mem_heap_get_size(table->heap) + strlen(table->name) + 1;
+
+	ut_ad(dict_sys->size >= size);
+
+	dict_sys->size -= size;
+
+	dict_mem_table_free(table);
+}
+
+/**************************************************************************
+Removes tables from the cache, scanning table_LRU from its end, while the cache
+(both hash cell arrays plus dict_sys->size) exceeds a nonzero srv_dict_size_limit. */
+UNIV_INTERN
+void
+dict_table_LRU_trim(
+/*================*/
+	dict_table_t*	self)	/*!< in: table that is never removed here */
+{
+	dict_table_t*	table;
+	dict_table_t*	prev_table;
+	dict_foreign_t*	foreign;
+	ulint		n_removed;
+	ulint		n_have_parent;
+	ulint		cached_foreign_tables;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#endif /* UNIV_SYNC_DEBUG */
+
+retry:
+	n_removed = n_have_parent = 0;
+	table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+
+	while ( srv_dict_size_limit && table
+		&& ((dict_sys->table_hash->n_cells
+		     + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
+		    + dict_sys->size) > srv_dict_size_limit ) {
+		prev_table = UT_LIST_GET_PREV(table_LRU, table);
+
+		if (table == self || table->n_mysql_handles_opened || table->is_corrupt)
+			goto next_loop;
+
+		cached_foreign_tables = 0;
+		foreign = UT_LIST_GET_FIRST(table->foreign_list);
+		while (foreign != NULL) {
+			if (foreign->referenced_table)
+				cached_foreign_tables++;
+			foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+		}
+
+		if (cached_foreign_tables == 0) {
+			dict_table_remove_from_cache(table);
+			n_removed++;
+		} else {
+			n_have_parent++;
+		}
+next_loop:
+		table = prev_table;
+	}
+
+	if ( srv_dict_size_limit && n_have_parent && n_removed
+		&& ((dict_sys->table_hash->n_cells
+		     + dict_sys->table_id_hash->n_cells) * sizeof(hash_cell_t)
+		    + dict_sys->size) > srv_dict_size_limit )
+		goto retry;	/* a removal may have reset referenced_table in a skipped table */
+}
+
+/****************************************************************//**
+Checks whether the given column name equals, as compared by
+innobase_strcasecmp(), a name reserved for InnoDB system columns.
+@return	TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+	const char*	name)	/*!< in: column name */
+{
+	/* This check reminds that if a new system column is added to
+	the program, it should be dealt with here. */
+#if DATA_N_SYS_COLS != 3
+#error "DATA_N_SYS_COLS != 3"
+#endif
+
+	static const char*	reserved_names[] = {
+		"DB_ROW_ID", "DB_TRX_ID", "DB_ROLL_PTR"
+	};
+
+	ulint			i;
+
+	for (i = 0; i < UT_ARR_SIZE(reserved_names); i++) {
+		if (innobase_strcasecmp(name, reserved_names[i]) == 0) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Sums up an upper bound for the size of an undo log record of the table, taking
+new_index into account, and returns TRUE if it might not fit on a single page.
+@return	TRUE if the undo log record could become too big */
+static
+ibool
+dict_index_too_big_for_undo(
+/*========================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const dict_index_t*	new_index)	/*!< in: index */
+{
+	/* Make sure that all column prefixes will fit in the undo log record
+	in trx_undo_page_report_modify() right after trx_undo_page_init(). */
+
+	ulint			i;
+	const dict_index_t*	clust_index
+		= dict_table_get_first_index(table);
+	ulint			undo_page_len
+		= TRX_UNDO_PAGE_HDR - TRX_UNDO_PAGE_HDR_SIZE
+		+ 2 /* next record pointer */
+		+ 1 /* type_cmpl */
+		+ 11 /* trx->undo_no */ + 11 /* table->id */
+		+ 1 /* rec_get_info_bits() */
+		+ 11 /* DB_TRX_ID */
+		+ 11 /* DB_ROLL_PTR */
+		+ 10 + FIL_PAGE_DATA_END /* trx_undo_left() */
+		+ 2/* pointer to previous undo log record */;
+
+	if (UNIV_UNLIKELY(!clust_index)) {
+		ut_a(dict_index_is_clust(new_index));
+		clust_index = new_index;
+	}
+
+	/* Add the size of the ordering columns in the
+	clustered index. */
+	for (i = 0; i < clust_index->n_uniq; i++) {
+		const dict_col_t*	col
+			= dict_index_get_nth_col(clust_index, i);
+
+		/* Use the maximum output size of
+		mach_write_compressed(), although the encoded
+		length should always fit in 2 bytes. */
+		undo_page_len += 5 + dict_col_get_max_size(col);
+	}
+
+	/* Add the old values of the columns to be updated.
+	First, the amount and the numbers of the columns.
+	These are written by mach_write_compressed() whose
+	maximum output length is 5 bytes.  However, given that
+	the quantities are below REC_MAX_N_FIELDS (10 bits),
+	the maximum length is 2 bytes per item. */
+	undo_page_len += 2 * (dict_table_get_n_cols(table) + 1);
+
+	for (i = 0; i < clust_index->n_def; i++) {
+		const dict_col_t*	col
+			= dict_index_get_nth_col(clust_index, i);
+		ulint			max_size
+			= dict_col_get_max_size(col);
+		ulint			fixed_size
+			= dict_col_get_fixed_size(col,
+						  dict_table_is_comp(table));
+
+		if (fixed_size) {
+			/* Fixed-size columns are stored locally. */
+			max_size = fixed_size;
+		} else if (max_size <= BTR_EXTERN_FIELD_REF_SIZE * 2) {
+			/* Short columns are stored locally. */
+		} else if (!col->ord_part) {
+			/* See if col->ord_part would be set
+			because of new_index. */
+			ulint	j;
+
+			for (j = 0; j < new_index->n_uniq; j++) {
+				if (dict_index_get_nth_col(
+					    new_index, j) == col) {
+
+					goto is_ord_part;
+				}
+			}
+
+			/* This is not an ordering column in any index.
+			Thus, it can be stored completely externally. */
+			max_size = BTR_EXTERN_FIELD_REF_SIZE;
+		} else {
+is_ord_part:
+			/* This is an ordering column in some index.
+			A long enough prefix must be written to the
+			undo log.  See trx_undo_page_fetch_ext(). */
+
+			if (max_size > REC_MAX_INDEX_COL_LEN) {
+				max_size = REC_MAX_INDEX_COL_LEN;
+			}
+
+			max_size += BTR_EXTERN_FIELD_REF_SIZE;
+		}
+
+		undo_page_len += 5 + max_size;
+	}
+
+	return(undo_page_len >= UNIV_PAGE_SIZE);
+}
+
+/****************************************************************//**
+If a record of this index might not fit on a single B-tree page,
+return TRUE.
+@return	TRUE if the index record could become too big */
+static
+ibool
+dict_index_too_big_for_tree(
+/*========================*/
+	const dict_table_t*	table,		/*!< in: table */
+	const dict_index_t*	new_index)	/*!< in: index */
+{
+	ulint	zip_size;
+	ulint	comp;
+	ulint	i;
+	/* maximum possible storage size of a record */
+	ulint	rec_max_size;
+	/* maximum allowed size of a record on a leaf page */
+	ulint	page_rec_max;
+	/* maximum allowed size of a node pointer record */
+	ulint	page_ptr_max;
+
+	comp = dict_table_is_comp(table);
+	zip_size = dict_table_zip_size(table);
+
+	if (zip_size && zip_size < UNIV_PAGE_SIZE) {
+		/* On a compressed page, two records must fit in the
+		uncompressed page modification log.  On compressed
+		pages with zip_size == UNIV_PAGE_SIZE, this limit will
+		never be reached. */
+		ut_ad(comp);
+		/* The maximum allowed record size is the size of
+		an empty page, minus a byte for recording the heap
+		number in the page modification log.  The maximum
+		allowed node pointer size is half that. */
+		page_rec_max = page_zip_empty_size(new_index->n_fields,
+						   zip_size) - 1;
+		page_ptr_max = page_rec_max / 2;
+		/* On a compressed page, there is a two-byte entry in
+		the dense page directory for every record.  But there
+		is no record header. */
+		rec_max_size = 2;
+	} else {
+		/* The maximum allowed record size is half a B-tree
+		page.  No additional sparse page directory entry will
+		be generated for the first few user records. */
+		page_rec_max = page_get_free_space_of_empty(comp) / 2;
+		page_ptr_max = page_rec_max;
+		/* Each record has a header. */
+		rec_max_size = comp
+			? REC_N_NEW_EXTRA_BYTES
+			: REC_N_OLD_EXTRA_BYTES;
+	}
+
+	if (comp) {
+		/* Include the "null" flags in the
+		maximum possible record size. */
+		rec_max_size += UT_BITS_IN_BYTES(new_index->n_nullable);
+	} else {
+		/* For each column, include a 2-byte offset and a
+		"null" flag.  The 1-byte format is only used in short
+		records that do not contain externally stored columns.
+		Such records could never exceed the page limit, even
+		when using the 2-byte format. */
+		rec_max_size += 2 * new_index->n_fields;
+	}
+
+	/* Compute the maximum possible record size. */
+	for (i = 0; i < new_index->n_fields; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(new_index, i);
+		const dict_col_t*	col
+			= dict_field_get_col(field);
+		ulint			field_max_size;
+		ulint			field_ext_max_size;
+
+		/* In dtuple_convert_big_rec(), variable-length columns
+		that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+		may be chosen for external storage.
+
+		Fixed-length columns, and all columns of secondary
+		index records are always stored inline. */
+
+		/* Determine the maximum length of the index field.
+		The field_ext_max_size should be computed as the worst
+		case in rec_get_converted_size_comp() for
+		REC_STATUS_ORDINARY records. */
+
+		field_max_size = dict_col_get_fixed_size(col, comp);
+		if (field_max_size) {
+			/* dict_index_add_col() should guarantee this */
+			ut_ad(!field->prefix_len
+			      || field->fixed_len == field->prefix_len);
+			/* Fixed lengths are not encoded
+			in ROW_FORMAT=COMPACT. */
+			field_ext_max_size = 0;
+			goto add_field_size;
+		}
+
+		field_max_size = dict_col_get_max_size(col);
+		field_ext_max_size = field_max_size < 256 ? 1 : 2;
+
+		if (field->prefix_len) {
+			if (field->prefix_len < field_max_size) {
+				field_max_size = field->prefix_len;
+			}
+		} else if (field_max_size > BTR_EXTERN_FIELD_REF_SIZE * 2
+			   && dict_index_is_clust(new_index)) {
+
+			/* In the worst case, we have a locally stored
+			column of BTR_EXTERN_FIELD_REF_SIZE * 2 bytes.
+			The length can be stored in one byte.  If the
+			column were stored externally, the lengths in
+			the clustered index page would be
+			BTR_EXTERN_FIELD_REF_SIZE and 2. */
+			field_max_size = BTR_EXTERN_FIELD_REF_SIZE * 2;
+			field_ext_max_size = 1;
+		}
+
+		if (comp) {
+			/* Add the extra size for ROW_FORMAT=COMPACT.
+			For ROW_FORMAT=REDUNDANT, these bytes were
+			added to rec_max_size before this loop. */
+			rec_max_size += field_ext_max_size;
+		}
+add_field_size:
+		rec_max_size += field_max_size;
+
+		/* Check the size limit on leaf pages. */
+		if (UNIV_UNLIKELY(rec_max_size >= page_rec_max)) {
+
+			return(TRUE);
+		}
+
+		/* Check the size limit on non-leaf pages.  Records
+		stored in non-leaf B-tree pages consist of the unique
+		columns of the record (the key columns of the B-tree)
+		and a node pointer field.  When we have processed the
+		unique columns, rec_max_size equals the size of the
+		node pointer record minus the node pointer column. */
+		if (i + 1 == dict_index_get_n_unique_in_tree(new_index)
+		    && rec_max_size + REC_NODE_PTR_SIZE >= page_ptr_max) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return	DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table on which the index is */
+	dict_index_t*	index,	/*!< in, own: index; NOTE! The index memory
+				object is freed in this function! */
+	ulint		page_no,/*!< in: root page number of the index */
+	ibool		strict)	/*!< in: TRUE=refuse to create the index
+				if records could be too big to fit in
+				a B-tree page */
+{
+	dict_index_t*	new_index;
+	ulint		n_ord;
+	ulint		i;
+
+	ut_ad(index);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(index->n_def == index->n_fields);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	ut_ad(mem_heap_validate(index->heap));
+	ut_a(!dict_index_is_clust(index)
+	     || UT_LIST_GET_LEN(table->indexes) == 0);
+
+	if (!dict_index_find_cols(table, index)) {
+
+		dict_mem_index_free(index);
+		return(DB_CORRUPTION);
+	}
+
+	/* Build the cache internal representation of the index,
+	containing also the added system fields */
+
+	if (dict_index_is_clust(index)) {
+		new_index = dict_index_build_internal_clust(table, index);
+	} else {
+		new_index = dict_index_build_internal_non_clust(table, index);
+	}
+
+	/* Set the n_fields value in new_index to the actual defined
+	number of fields in the cache internal representation */
+
+	new_index->n_fields = new_index->n_def;
+
+	if (strict && dict_index_too_big_for_tree(table, new_index)) {
+too_big:
+		dict_mem_index_free(new_index);
+		dict_mem_index_free(index);
+		return(DB_TOO_BIG_RECORD);
+	}
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		n_ord = new_index->n_fields;
+	} else {
+		n_ord = new_index->n_uniq;
+	}
+
+	switch (dict_table_get_format(table)) {
+	case DICT_TF_FORMAT_51:
+		/* ROW_FORMAT=REDUNDANT and ROW_FORMAT=COMPACT store
+		prefixes of externally stored columns locally within
+		the record.  There are no special considerations for
+		the undo log record size. */
+		goto undo_size_ok;
+
+	case DICT_TF_FORMAT_ZIP:
+		/* In ROW_FORMAT=DYNAMIC and ROW_FORMAT=COMPRESSED,
+		column prefix indexes require that prefixes of
+		externally stored columns are written to the undo log.
+		This may make the undo log record bigger than the
+		record on the B-tree page.  The maximum size of an
+		undo log record is the page size.  That must be
+		checked for below. */
+		break;
+
+#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
+# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#endif
+	}
+
+	for (i = 0; i < n_ord; i++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(new_index, i);
+		const dict_col_t*	col
+			= dict_field_get_col(field);
+
+		/* In dtuple_convert_big_rec(), variable-length columns
+		that are longer than BTR_EXTERN_FIELD_REF_SIZE * 2
+		may be chosen for external storage.  If the column appears
+		in an ordering column of an index, a longer prefix of
+		REC_MAX_INDEX_COL_LEN will be copied to the undo log
+		by trx_undo_page_report_modify() and
+		trx_undo_page_fetch_ext().  It suffices to check the
+		capacity of the undo log whenever new_index includes
+		a column prefix on a column that may be stored externally. */
+
+		if (field->prefix_len /* prefix index */
+		    && !col->ord_part /* not yet ordering column */
+		    && !dict_col_get_fixed_size(col, TRUE) /* variable-length */
+		    && dict_col_get_max_size(col)
+		    > BTR_EXTERN_FIELD_REF_SIZE * 2 /* long enough */) {
+
+			if (dict_index_too_big_for_undo(table, new_index)) {
+				/* An undo log record might not fit in
+				a single page.  Refuse to create this index. */
+
+				goto too_big;
+			}
+
+			break;
+		}
+	}
+
+undo_size_ok:
+	/* Flag the ordering columns */
+
+	for (i = 0; i < n_ord; i++) {
+
+		dict_index_get_nth_field(new_index, i)->col->ord_part = 1;
+	}
+
+	/* Add the new index as the last index for the table */
+
+	UT_LIST_ADD_LAST(indexes, table->indexes, new_index);
+	new_index->table = table;
+	new_index->table_name = table->name;
+
+	new_index->search_info = btr_search_info_create(new_index->heap);
+
+	new_index->stat_index_size = 1;
+	new_index->stat_n_leaf_pages = 1;
+
+	new_index->page = page_no;
+	rw_lock_create(&new_index->lock, SYNC_INDEX_TREE);
+
+	if (!UNIV_UNLIKELY(new_index->type & DICT_UNIVERSAL)) {
+
+		new_index->stat_n_diff_key_vals = mem_heap_alloc(
+			new_index->heap,
+			(1 + dict_index_get_n_unique(new_index))
+			* sizeof(ib_int64_t));
+		/* Give some sensible values to stat_n_... in case we do
+		not calculate statistics quickly enough */
+
+		for (i = 0; i <= dict_index_get_n_unique(new_index); i++) {
+
+			new_index->stat_n_diff_key_vals[i] = 100;
+		}
+	}
+
+	dict_sys->size += mem_heap_get_size(new_index->heap);
+
+	dict_mem_index_free(index);
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Removes an index from the dictionary cache once its search info ref count is 0. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index)	/*!< in, own: index */
+{
+	ulint		size;
+	ulint		retries = 0;
+	btr_search_t*	info;
+
+	ut_ad(table && index);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Remove all entries of the index from the adaptive hash index
+	now, because removing them needs the dict_index_t object. */
+	if (btr_search_enabled && srv_dict_size_limit)
+		btr_search_drop_page_hash_index_on_index(index);
+
+	/* We always create search info, whether or not the adaptive
+	hash index is enabled. */
+	info = index->search_info;
+	ut_ad(info);
+
+	/* We are not allowed to free the in-memory index struct
+	dict_index_t until all entries in the adaptive hash index
+	that point to any of the pages belonging to this b-tree index
+	are dropped. This is so because dropping of these entries
+	requires access to the dict_index_t struct. To avoid such a
+	scenario we keep a count of such pages in the search_info and
+	only free the dict_index_t struct when this count drops to
+	zero. */
+
+	for (;;) {
+		ulint ref_count = btr_search_info_get_ref_count(info);
+		if (ref_count == 0) {
+			break;
+		}
+
+		/* Sleep for 10ms before trying again. */
+		os_thread_sleep(10000);
+		++retries;
+
+		if (retries % 500 == 0) {
+			/* No luck after another 5 seconds of waiting. */
+			fprintf(stderr, "InnoDB: Error: Waited for"
+					" %lu secs for hash index"
+					" ref_count (%lu) to drop"
+					" to 0.\n"
+					"index: \"%s\""
+					" table: \"%s\"\n",
+					retries/100,
+					ref_count,
+					index->name,
+					table->name);
+		}
+
+		/* To avoid a hang here we commit suicide if the
+		ref_count doesn't drop to zero in 600 seconds. */
+		if (retries >= 60000) {
+			ut_error;
+		}
+	}
+
+	rw_lock_free(&index->lock);
+
+	/* Remove the index from the list of indexes of the table */
+	UT_LIST_REMOVE(indexes, table->indexes, index);
+
+	size = mem_heap_get_size(index->heap);
+
+	ut_ad(dict_sys->size >= size);
+
+	dict_sys->size -= size;
+
+	dict_mem_index_free(index);
+}
+
+/*******************************************************************//**
+Looks up, by name, the table column of each index field and sets the col
+field of the index fields accordingly.
+@return TRUE if a table column with a matching name was found for every field */
+static
+ibool
+dict_index_find_cols(
+/*=================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index)	/*!< in: index */
+{
+	ulint		i;
+
+	ut_ad(table && index);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	for (i = 0; i < index->n_fields; i++) {
+		ulint		j;
+		dict_field_t*	field = dict_index_get_nth_field(index, i);
+
+		for (j = 0; j < table->n_cols; j++) {
+			if (!strcmp(dict_table_get_col_name(table, j),
+				    field->name)) {
+				field->col = dict_table_get_nth_col(table, j);
+
+				goto found;
+			}
+		}
+
+#ifdef UNIV_DEBUG
+		/* It is an error not to find a matching column; report it in debug builds. */
+		fputs("InnoDB: Error: no matching column for ", stderr);
+		ut_print_name(stderr, NULL, FALSE, field->name);
+		fputs(" in ", stderr);
+		dict_index_name_print(stderr, NULL, index);
+		fputs("!\n", stderr);
+#endif /* UNIV_DEBUG */
+		return(FALSE);
+
+found:
+		;
+	}
+
+	return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Adds a column to index as a new field, counting it in n_nullable if it may be NULL. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+	dict_index_t*		index,		/*!< in/out: index */
+	const dict_table_t*	table,		/*!< in: table */
+	dict_col_t*		col,		/*!< in: column */
+	ulint			prefix_len)	/*!< in: column prefix length */
+{
+	dict_field_t*	field;
+	const char*	col_name;
+
+	col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+
+	dict_mem_index_add_field(index, col_name, prefix_len);
+
+	field = dict_index_get_nth_field(index, index->n_def - 1);
+
+	field->col = col;
+	field->fixed_len = (unsigned int) dict_col_get_fixed_size(
+		col, dict_table_is_comp(table));
+
+	if (prefix_len && field->fixed_len > prefix_len) {
+		field->fixed_len = (unsigned int) prefix_len;
+	}
+
+	/* Long fixed-length fields that need external storage are treated as
+	variable-length fields, so that the extern flag can be embedded in
+	the length word. */
+
+	if (field->fixed_len > DICT_MAX_INDEX_COL_LEN) {
+		field->fixed_len = 0;
+	}
+#if DICT_MAX_INDEX_COL_LEN != 768
+	/* The comparison limit above must be constant.  If it were
+	changed, the disk format of some fixed-length columns would
+	change, which would be a disaster. */
+# error "DICT_MAX_INDEX_COL_LEN != 768"
+#endif
+
+	if (!(col->prtype & DATA_NOT_NULL)) {
+		index->n_nullable++;
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies the fields at positions start <= i < end of index2 to index1. */
+static
+void
+dict_index_copy(
+/*============*/
+	dict_index_t*		index1,	/*!< in/out: index to copy to */
+	dict_index_t*		index2,	/*!< in: index to copy from */
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			start,	/*!< in: first position to copy */
+	ulint			end)	/*!< in: one past the last position to copy */
+{
+	dict_field_t*	field;
+	ulint		i;
+
+	/* Copy fields contained in index2, appending each to index1 */
+
+	for (i = start; i < end; i++) {
+
+		field = dict_index_get_nth_field(index2, i);
+		dict_index_add_col(index1, table, field->col,
+				   field->prefix_len);
+	}
+}
+
+/*******************************************************************//**
+Copies types of fields contained in index to tuple (binary types if DICT_UNIVERSAL). */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+	dtuple_t*		tuple,		/*!< in/out: data tuple */
+	const dict_index_t*	index,		/*!< in: index */
+	ulint			n_fields)	/*!< in: number of
+						field types to copy */
+{
+	ulint		i;
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		dtuple_set_types_binary(tuple, n_fields);
+
+		return;
+	}
+
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	ifield;
+		dtype_t*		dfield_type;
+
+		ifield = dict_index_get_nth_field(index, i);
+		dfield_type = dfield_get_type(dtuple_get_nth_field(tuple, i));
+		dict_col_copy_type(dict_field_get_col(ifield), dfield_type);
+	}
+}
+
+/*******************************************************************//**
+Sets every field of the tuple to the SQL NULL value and copies to
+field i the type of column i of the table.  This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+	dtuple_t*		tuple,	/*!< in/out: data tuple */
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ulint		i;
+
+	for (i = 0; i < dtuple_get_n_fields(tuple); i++) {
+
+		dfield_t*	dfield	= dtuple_get_nth_field(tuple, i);
+		dtype_t*	dtype	= dfield_get_type(dfield);
+
+		dfield_set_null(dfield);
+		dict_col_copy_type(dict_table_get_nth_col(table, i), dtype);
+	}
+}
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a clustered
+index, containing also system fields not defined by the user.
+@return	own: the internal representation of the clustered index */
+static
+dict_index_t*
+dict_index_build_internal_clust(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index)	/*!< in: user representation of
+					a clustered index */
+{
+	dict_index_t*	new_index;
+	dict_field_t*	field;
+	ulint		fixed_size;
+	ulint		trx_id_pos;
+	ulint		i;
+	ibool*		indexed;
+
+	ut_ad(table && index);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* Create a new index object with certainly enough fields */
+	new_index = dict_mem_index_create(table->name,
+					  index->name, table->space,
+					  index->type,
+					  index->n_fields + table->n_cols);
+
+	/* Copy other relevant data from the old index struct to the new
+	struct: it inherits the values */
+
+	new_index->n_user_defined_cols = index->n_fields;
+
+	new_index->id = index->id;
+
+	/* Copy the fields of index */
+	dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		/* No fixed number of fields determines an entry uniquely */
+
+		new_index->n_uniq = REC_MAX_N_FIELDS;
+
+	} else if (dict_index_is_unique(index)) {
+		/* Only the fields defined so far are needed to identify
+		the index entry uniquely */
+
+		new_index->n_uniq = new_index->n_def;
+	} else {
+		/* Also the row id is needed to identify the entry */
+		new_index->n_uniq = 1 + new_index->n_def;
+	}
+
+	new_index->trx_id_offset = 0;
+
+	if (!dict_index_is_ibuf(index)) {
+		/* Add system columns: row id (if not unique), trx id, roll ptr */
+
+		trx_id_pos = new_index->n_def;
+
+#if DATA_ROW_ID != 0
+# error "DATA_ROW_ID != 0"
+#endif
+#if DATA_TRX_ID != 1
+# error "DATA_TRX_ID != 1"
+#endif
+#if DATA_ROLL_PTR != 2
+# error "DATA_ROLL_PTR != 2"
+#endif
+
+		if (!dict_index_is_unique(index)) {
+			dict_index_add_col(new_index, table,
+					   dict_table_get_sys_col(
+						   table, DATA_ROW_ID),
+					   0);
+			trx_id_pos++;
+		}
+
+		dict_index_add_col(new_index, table,
+				   dict_table_get_sys_col(table, DATA_TRX_ID),
+				   0);
+
+		dict_index_add_col(new_index, table,
+				   dict_table_get_sys_col(table,
+							  DATA_ROLL_PTR),
+				   0);
+
+		for (i = 0; i < trx_id_pos; i++) {
+
+			fixed_size = dict_col_get_fixed_size(
+				dict_index_get_nth_col(new_index, i),
+				dict_table_is_comp(table));
+
+			if (fixed_size == 0) {
+				new_index->trx_id_offset = 0;
+
+				break;
+			}
+
+			if (dict_index_get_nth_field(new_index, i)->prefix_len
+			    > 0) {
+				new_index->trx_id_offset = 0;
+
+				break;
+			}
+
+			new_index->trx_id_offset += (unsigned int) fixed_size;
+		}
+
+	}
+
+	/* Remember the table columns already contained in new_index */
+	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+
+	/* Mark the table columns already contained in new_index */
+	for (i = 0; i < new_index->n_def; i++) {
+
+		field = dict_index_get_nth_field(new_index, i);
+
+		/* If there is only a prefix of the column in the index
+		field, do not mark the column as contained in the index */
+
+		if (field->prefix_len == 0) {
+
+			indexed[field->col->ind] = TRUE;
+		}
+	}
+
+	/* Add to new_index non-system columns of table not yet included
+	there */
+	for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+
+		dict_col_t*	col = dict_table_get_nth_col(table, i);
+		ut_ad(col->mtype != DATA_SYS);
+
+		if (!indexed[col->ind]) {
+			dict_index_add_col(new_index, table, col, 0);
+		}
+	}
+
+	mem_free(indexed);
+
+	ut_ad(dict_index_is_ibuf(index)
+	      || (UT_LIST_GET_LEN(table->indexes) == 0));
+
+	new_index->cached = TRUE;
+
+	return(new_index);
+}
+
+/*******************************************************************//**
+Builds the internal dictionary cache representation for a non-clustered
+index, containing also system fields not defined by the user.
+@return	own: the internal representation of the non-clustered index */
+static
+dict_index_t*
+dict_index_build_internal_non_clust(
+/*================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	dict_index_t*		index)	/*!< in: user representation of
+					a non-clustered index */
+{
+	dict_field_t*	field;
+	dict_index_t*	new_index;
+	dict_index_t*	clust_index;
+	ulint		i;
+	ibool*		indexed;
+
+	ut_ad(table && index);
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* The clustered index should be the first in the list of indexes */
+	clust_index = UT_LIST_GET_FIRST(table->indexes);
+
+	ut_ad(clust_index);
+	ut_ad(dict_index_is_clust(clust_index));
+	ut_ad(!(clust_index->type & DICT_UNIVERSAL));
+
+	/* Create a new index */
+	new_index = dict_mem_index_create(
+		table->name, index->name, index->space, index->type,
+		index->n_fields + 1 + clust_index->n_uniq);
+
+	/* Copy other relevant data from the old index
+	struct to the new struct: it inherits the values */
+
+	new_index->n_user_defined_cols = index->n_fields;
+
+	new_index->id = index->id;
+
+	/* Copy fields from index to new_index */
+	dict_index_copy(new_index, index, table, 0, index->n_fields);
+
+	/* Remember the table columns already contained in new_index */
+	indexed = mem_zalloc(table->n_cols * sizeof *indexed);
+
+	/* Mark the table columns already contained in new_index */
+	for (i = 0; i < new_index->n_def; i++) {
+
+		field = dict_index_get_nth_field(new_index, i);
+
+		/* If there is only a prefix of the column in the index
+		field, do not mark the column as contained in the index */
+
+		if (field->prefix_len == 0) {
+
+			indexed[field->col->ind] = TRUE;
+		}
+	}
+
+	/* Add to new_index the columns necessary to determine the clustered
+	index entry uniquely, unless already fully contained in new_index */
+
+	for (i = 0; i < clust_index->n_uniq; i++) {
+
+		field = dict_index_get_nth_field(clust_index, i);
+
+		if (!indexed[field->col->ind]) {
+			dict_index_add_col(new_index, table, field->col,
+					   field->prefix_len);
+		}
+	}
+
+	mem_free(indexed);
+
+	if (dict_index_is_unique(index)) {
+		new_index->n_uniq = index->n_fields;
+	} else {
+		new_index->n_uniq = new_index->n_def;
+	}
+
+	/* Set the n_fields value in new_index to the actual defined
+	number of fields */
+
+	new_index->n_fields = new_index->n_def;
+
+	new_index->cached = TRUE;
+
+	return(new_index);
+}
+
+/*====================== FOREIGN KEY PROCESSING ========================*/
+
+/*********************************************************************//**
+Tells whether table->referenced_list holds at least one constraint.
+@return	TRUE if the referenced_list of the table is not empty */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+	const dict_table_t*	table)	/*!< in: InnoDB table */
+{
+	return(UT_LIST_GET_LEN(table->referenced_list) > 0);
+}
+
+/*********************************************************************//**
+Searches the referenced_list of a table for a foreign key constraint
+whose referenced_index is the given index.
+@return the first constraint that references the index, or NULL when
+no constraint in the list references it */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+{
+	dict_foreign_t*	fk;
+
+	ut_ad(index != NULL);
+	ut_ad(table != NULL);
+
+	fk = UT_LIST_GET_FIRST(table->referenced_list);
+
+	/* Walk the list until a constraint names the index */
+	while (fk != NULL && fk->referenced_index != index) {
+
+		fk = UT_LIST_GET_NEXT(referenced_list, fk);
+	}
+
+	/* fk is NULL here if the whole list was walked without a hit */
+
+	return(fk);
+}
+
+/*********************************************************************//**
+Searches the foreign_list of a table for a constraint that uses the given
+index, either as its foreign_index or as its referenced_index.
+@return the first constraint in the list that uses the index in either
+role, or NULL when there is none */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index)	/*!< in: InnoDB index */
+{
+	dict_foreign_t*	fk;
+
+	ut_ad(index != NULL);
+	ut_ad(table != NULL);
+
+	fk = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (fk != NULL) {
+		/* Either role of the index makes the constraint a match */
+		if (fk->foreign_index == index
+		    || fk->referenced_index == index) {
+			break;
+		}
+
+		fk = UT_LIST_GET_NEXT(foreign_list, fk);
+	}
+
+	return(fk);
+}
+
+/*********************************************************************//**
+Frees a foreign key struct by calling mem_heap_free() on foreign->heap. */
+static
+void
+dict_foreign_free(
+/*==============*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign key struct */
+{
+	mem_heap_free(foreign->heap);
+}
+
+/**********************************************************************//**
+Unlinks a foreign constraint from the table lists it is on and frees it. */
+static
+void
+dict_foreign_remove_from_cache(
+/*===========================*/
+	dict_foreign_t*	foreign)	/*!< in, own: foreign constraint */
+{
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_a(foreign);
+
+	if (foreign->referenced_table) {
+		UT_LIST_REMOVE(referenced_list,
+			       foreign->referenced_table->referenced_list,
+			       foreign);
+	}
+
+	if (foreign->foreign_table) {
+		UT_LIST_REMOVE(foreign_list,
+			       foreign->foreign_table->foreign_list,
+			       foreign);
+	}
+	/* Free only after the constraint is off both lists */
+	dict_foreign_free(foreign);
+}
+
+/**********************************************************************//**
+Searches first the foreign_list and then the referenced_list of a table
+for a constraint whose id string equals the given id.
+@return	the matching foreign constraint, or NULL if neither list has one */
+static
+dict_foreign_t*
+dict_foreign_find(
+/*==============*/
+	dict_table_t*	table,	/*!< in: table object */
+	const char*	id)	/*!< in: foreign constraint id */
+{
+	dict_foreign_t*	fk;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Pass 1: the entries of table->foreign_list */
+	for (fk = UT_LIST_GET_FIRST(table->foreign_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+
+		if (0 == ut_strcmp(id, fk->id)) {
+
+			return(fk);
+		}
+	}
+
+	/* Pass 2: the entries of table->referenced_list */
+	for (fk = UT_LIST_GET_FIRST(table->referenced_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(referenced_list, fk)) {
+
+		if (0 == ut_strcmp(id, fk->id)) {
+
+			return(fk);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Tries to find an index whose first n_cols fields are the given columns, in
+the same order and without column prefixes. Indexes marked to_be_dropped
+and the index types_idx itself are never returned.
+@return	matching index, NULL if not found or if a NOT NULL check failed */
+static
+dict_index_t*
+dict_foreign_find_index(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char**	columns,/*!< in: array of column names */
+	ulint		n_cols,	/*!< in: number of columns */
+	dict_index_t*	types_idx, /*!< in: NULL or an index to whose types the
+				   column types must match */
+	ibool		check_charsets,
+				/*!< in: whether to check charsets.
+				only has an effect if types_idx != NULL */
+	ulint		check_null)
+				/*!< in: nonzero if none of the columns must
+				be declared NOT NULL */
+{
+	dict_index_t*	index;
+
+	index = dict_table_get_first_index(table);
+
+	while (index != NULL) {
+		/* Ignore matches that refer to the same instance
+		or the index is to be dropped */
+		if (index->to_be_dropped || types_idx == index) {
+
+			goto next_rec;
+
+		} else if (dict_index_get_n_fields(index) >= n_cols) {
+			ulint		i;
+
+			for (i = 0; i < n_cols; i++) {
+				dict_field_t*	field;
+				const char*	col_name;
+
+				field = dict_index_get_nth_field(index, i);
+
+				col_name = dict_table_get_col_name(
+					table, dict_col_get_no(field->col));
+
+				if (field->prefix_len != 0) {
+					/* We do not accept column prefix
+					indexes here */
+
+					break;
+				}
+
+				if (0 != innobase_strcasecmp(columns[i],
+							     col_name)) {
+					break;
+				}
+
+				if (check_null
+				    && (field->col->prtype & DATA_NOT_NULL)) {
+					/* Ends the whole search at once */
+					return(NULL);
+				}
+
+				if (types_idx && !cmp_cols_are_equal(
+					    dict_index_get_nth_col(index, i),
+					    dict_index_get_nth_col(types_idx,
+								   i),
+					    check_charsets)) {
+
+					break;
+				}
+			}
+
+			if (i == n_cols) {
+				/* We found a matching index */
+
+				return(index);
+			}
+		}
+
+next_rec:
+		index = dict_table_get_next_index(index);
+	}
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Looks in foreign->foreign_table for an index, other than
+foreign->foreign_index, that could stand in for it as the foreign index.
+@return	index equivalent to foreign->foreign_index, or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+	dict_foreign_t*	foreign)/*!< in: foreign key */
+{
+	ut_a(foreign != NULL);
+
+	/* Try to find an index which contains the columns as the
+	first fields and in the right order, and the types are the
+	same as in foreign->foreign_index */
+
+	return(dict_foreign_find_index(
+		       foreign->foreign_table,
+		       foreign->foreign_col_names, foreign->n_fields,
+		       foreign->foreign_index, TRUE, /* check charsets */
+		       FALSE/* no NOT NULL check on the columns */));
+}
+
+/**********************************************************************//**
+Returns the index that has the given name and exactly the given user-defined
+ordering columns; among several matches the one with the largest id wins.
+@return	matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name,	/*!< in: the index name to find */
+	const char**	columns,/*!< in: array of column names */
+	ulint		n_cols)	/*!< in: number of columns */
+{
+	dict_index_t*	best	= NULL;
+	dict_index_t*	cand;
+
+	for (cand = dict_table_get_first_index(table);
+	     cand != NULL;
+	     cand = dict_table_get_next_index(cand)) {
+
+		ulint	n_matched;
+
+		if (ut_strcmp(cand->name, name) != 0
+		    || dict_index_get_n_ordering_defined_by_user(cand)
+		    != n_cols) {
+			/* Wrong name or wrong number of columns */
+			continue;
+		}
+
+		/* Count how many leading fields carry the wanted names */
+		for (n_matched = 0; n_matched < n_cols; n_matched++) {
+			dict_field_t*	field;
+			const char*	col_name;
+
+			field = dict_index_get_nth_field(cand, n_matched);
+
+			col_name = dict_table_get_col_name(
+				table, dict_col_get_no(field->col));
+
+			if (0 != innobase_strcasecmp(
+				    columns[n_matched], col_name)) {
+				break;
+			}
+		}
+
+		if (n_matched < n_cols) {
+			/* Some column name differed */
+			continue;
+		}
+
+		/* A full match: keep it if it has the highest id so far */
+		if (best == NULL
+		    || ut_dulint_cmp(cand->id, best->id) > 0) {
+			best = cand;
+		}
+	}
+
+	return(best);
+}
+
+/**********************************************************************//**
+Rewinds file and prints a timestamp plus the foreign key error heading. */
+static
+void
+dict_foreign_error_report_low(
+/*==========================*/
+	FILE*		file,	/*!< in: output stream */
+	const char*	name)	/*!< in: table name */
+{
+	rewind(file);
+	ut_print_timestamp(file);
+	fprintf(file, " Error in foreign key constraint of table %s:\n",
+		name);
+}
+
+/**********************************************************************//**
+Prints msg and the constraint to file while holding dict_foreign_err_mutex. */
+static
+void
+dict_foreign_error_report(
+/*======================*/
+	FILE*		file,	/*!< in: output stream */
+	dict_foreign_t*	fk,	/*!< in: foreign key constraint */
+	const char*	msg)	/*!< in: the error message */
+{
+	mutex_enter(&dict_foreign_err_mutex);
+	dict_foreign_error_report_low(file, fk->foreign_table_name);
+	fputs(msg, file);
+	fputs(" Constraint:\n", file);
+	dict_print_info_on_foreign_key_in_create_format(file, NULL, fk, TRUE);
+	putc('\n', file);
+	if (fk->foreign_index) {
+		fputs("The index in the foreign key in table is ", file);
+		ut_print_name(file, NULL, FALSE, fk->foreign_index->name);
+		fputs("\n"
+		      "See " REFMAN "innodb-foreign-key-constraints.html\n"
+		      "for correct foreign key definition.\n",
+		      file);
+	}
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. If an object
+with the same id is already cached, the passed object is freed and the
+cached one is completed instead. At least one of the foreign table and the
+referenced table must already be in the dictionary cache!
+@return	DB_SUCCESS or DB_CANNOT_ADD_CONSTRAINT */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+	dict_foreign_t*	foreign,	/*!< in, own: foreign key constraint */
+	ibool		check_charsets)	/*!< in: TRUE=check charset
+					compatibility */
+{
+	dict_table_t*	for_table;
+	dict_table_t*	ref_table;
+	dict_foreign_t*	for_in_cache		= NULL;
+	dict_index_t*	index;
+	ibool		added_to_referenced_list= FALSE;
+	FILE*		ef			= dict_foreign_err_file;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	for_table = dict_table_check_if_in_cache_low(
+		foreign->foreign_table_name);
+
+	ref_table = dict_table_check_if_in_cache_low(
+		foreign->referenced_table_name);
+	ut_a(for_table || ref_table);
+
+	if (for_table) {
+		for_in_cache = dict_foreign_find(for_table, foreign->id);
+	}
+
+	if (!for_in_cache && ref_table) {
+		for_in_cache = dict_foreign_find(ref_table, foreign->id);
+	}
+
+	if (for_in_cache) {
+		/* Free the foreign object */
+		mem_heap_free(foreign->heap);
+	} else {
+		for_in_cache = foreign;
+	}
+
+	if (for_in_cache->referenced_table == NULL && ref_table) {
+		index = dict_foreign_find_index(
+			ref_table,
+			for_in_cache->referenced_col_names,
+			for_in_cache->n_fields, for_in_cache->foreign_index,
+			check_charsets, FALSE);
+
+		if (index == NULL) {
+			dict_foreign_error_report(
+				ef, for_in_cache,
+				"there is no index in referenced table"
+				" which would contain\n"
+				"the columns as the first columns,"
+				" or the data types in the\n"
+				"referenced table do not match"
+				" the ones in table.");
+			/* An object found in the cache is left alone */
+			if (for_in_cache == foreign) {
+				mem_heap_free(foreign->heap);
+			}
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		for_in_cache->referenced_table = ref_table;
+		for_in_cache->referenced_index = index;
+		UT_LIST_ADD_LAST(referenced_list,
+				 ref_table->referenced_list,
+				 for_in_cache);
+		added_to_referenced_list = TRUE;
+	}
+
+	if (for_in_cache->foreign_table == NULL && for_table) {
+		index = dict_foreign_find_index(
+			for_table,
+			for_in_cache->foreign_col_names,
+			for_in_cache->n_fields,
+			for_in_cache->referenced_index, check_charsets,
+			for_in_cache->type
+			& (DICT_FOREIGN_ON_DELETE_SET_NULL
+			   | DICT_FOREIGN_ON_UPDATE_SET_NULL));
+
+		if (index == NULL) {
+			dict_foreign_error_report(
+				ef, for_in_cache,
+				"there is no index in the table"
+				" which would contain\n"
+				"the columns as the first columns,"
+				" or the data types in the\n"
+				"table do not match"
+				" the ones in the referenced table\n"
+				"or one of the ON ... SET NULL columns"
+				" is declared NOT NULL.");
+			/* Undo the list insertion above before freeing */
+			if (for_in_cache == foreign) {
+				if (added_to_referenced_list) {
+					UT_LIST_REMOVE(
+						referenced_list,
+						ref_table->referenced_list,
+						for_in_cache);
+				}
+
+				mem_heap_free(foreign->heap);
+			}
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		for_in_cache->foreign_table = for_table;
+		for_in_cache->foreign_index = index;
+		UT_LIST_ADD_LAST(foreign_list,
+				 for_table->foreign_list,
+				 for_in_cache);
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Scans forward to the first case-insensitive occurrence of 'string' that
+starts outside `` and "" quotes, or to the terminating NUL.
+@return	pointer to the start of the occurrence, or to the NUL */
+static
+const char*
+dict_scan_to(
+/*=========*/
+	const char*	ptr,	/*!< in: scan from */
+	const char*	string)	/*!< in: look for this */
+{
+	char	open_quote	= '\0';
+
+	while (*ptr != '\0') {
+		const char	c = *ptr;
+		if (open_quote != '\0') {
+			/* Inside quotes only the closing quote matters */
+			if (c == open_quote) {
+				open_quote = '\0';
+			}
+		} else if (c == '`' || c == '"') {
+			/* An opening quote: remember which one it was */
+			open_quote = c;
+		} else {
+			/* Outside quotes: try to match the keyword here */
+			ulint	n = 0;
+			while (string[n] != '\0'
+			       && toupper((int)(unsigned char)(ptr[n]))
+			       == toupper((int)(unsigned char)(string[n]))) {
+				n++;
+			}
+			if (string[n] == '\0') {
+				/* Every keyword character matched */
+				break;
+			}
+		}
+
+		ptr++;
+	}
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Accepts 'string' if it is the next thing after optional whitespace at ptr;
+the comparison ignores case.
+@return	pointer just past the accepted string, or ptr itself if rejected */
+static
+const char*
+dict_accept(
+/*========*/
+	struct charset_info_st*	cs,/*!< in: the character set of ptr */
+	const char*	ptr,	/*!< in: scan from this */
+	const char*	string,	/*!< in: accept only this string as the next
+				non-whitespace string */
+	ibool*		success)/*!< out: TRUE if accepted */
+{
+	const char*	token;
+	const char*	hit;
+
+	/* Skip the leading whitespace */
+	for (token = ptr; my_isspace(cs, *token); token++) {
+	}
+
+	hit = dict_scan_to(token, string);
+
+	/* The keyword is accepted only if it starts right at the token */
+	if (hit == token && *hit != '\0') {
+		*success = TRUE;
+
+		return(hit + ut_strlen(string));
+	}
+
+	/* No match: leave the scan position where it was */
+	*success = FALSE;
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Scans one identifier, quoted with `` or "" or unquoted, after skipping
+whitespace. With a heap, the id is copied, unquoted and converted.
+@return	scanned to */
+static
+const char*
+dict_scan_id(
+/*=========*/
+	struct charset_info_st*	cs,/*!< in: the character set of ptr */
+	const char*	ptr,	/*!< in: scanned to */
+	mem_heap_t*	heap,	/*!< in: heap where to allocate the id
+				(NULL=id will not be allocated, but it
+				will point to string near ptr) */
+	const char**	id,	/*!< out,own: the id; NULL if no id was
+				scannable */
+	ibool		table_id,/*!< in: TRUE=convert the allocated id
+				as a table name; FALSE=convert to UTF-8 */
+	ibool		accept_also_dot)
+				/*!< in: TRUE if also a dot can appear in a
+				non-quoted id; in a quoted id it can appear
+				always */
+{
+	char		quote	= '\0';
+	ulint		len	= 0;
+	const char*	s;
+	char*		str;
+	char*		dst;
+
+	*id = NULL;
+
+	while (my_isspace(cs, *ptr)) {
+		ptr++;
+	}
+
+	if (*ptr == '\0') {
+
+		return(ptr);
+	}
+
+	if (*ptr == '`' || *ptr == '"') {
+		quote = *ptr++;
+	}
+
+	s = ptr;
+	/* In a quoted id, a doubled quote counts as one literal quote */
+	if (quote) {
+		for (;;) {
+			if (!*ptr) {
+				/* Syntax error: no closing quote */
+				return(ptr);
+			}
+			if (*ptr == quote) {
+				ptr++;
+				if (*ptr != quote) {
+					break;
+				}
+			}
+			ptr++;
+			len++;
+		}
+	} else {
+		while (!my_isspace(cs, *ptr) && *ptr != '(' && *ptr != ')'
+		       && (accept_also_dot || *ptr != '.')
+		       && *ptr != ',' && *ptr != '\0') {
+
+			ptr++;
+		}
+
+		len = ptr - s;
+	}
+
+	if (UNIV_UNLIKELY(!heap)) {
+		/* no heap given: id will point to source string */
+		*id = s;
+		return(ptr);
+	}
+
+	if (quote) {
+		char*	d;
+		str = d = mem_heap_alloc(heap, len + 1);
+		while (len--) {
+			if ((*d++ = *s++) == quote) {
+				s++;
+			}
+		}
+		*d++ = 0;
+		len = d - str;	/* this count includes the NUL just written */
+		ut_ad(*s == quote);
+		ut_ad(s + 1 == ptr);
+	} else {
+		str = mem_heap_strdupl(heap, s, len);
+	}
+
+	if (!table_id) {
+convert_id:
+		/* Convert the identifier from connection character set
+		to UTF-8. */
+		len = 3 * len + 1;
+		*id = dst = mem_heap_alloc(heap, len);
+
+		innobase_convert_from_id(cs, dst, str, len);
+	} else if (!strncmp(str, srv_mysql50_table_name_prefix,
+			    sizeof srv_mysql50_table_name_prefix)) {
+		/* This is a pre-5.1 table name
+		containing chars other than [A-Za-z0-9].
+		Discard the prefix and use raw UTF-8 encoding. */
+		str += sizeof srv_mysql50_table_name_prefix;
+		len -= sizeof srv_mysql50_table_name_prefix;
+		goto convert_id;
+	} else {
+		/* Encode using filename-safe characters. */
+		len = 5 * len + 1;
+		*id = dst = mem_heap_alloc(heap, len);
+
+		innobase_convert_from_table_id(cs, dst, str, len);
+	}
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Scans a column name and, if a table is given, resolves it in that table.
+@return	scanned to */
+static
+const char*
+dict_scan_col(
+/*==========*/
+	struct charset_info_st*	cs,	/*!< in: the character set of ptr */
+	const char*		ptr,	/*!< in: scanned to */
+	ibool*			success,/*!< out: TRUE if success */
+	dict_table_t*		table,	/*!< in: table in which the column is */
+	const dict_col_t**	column,	/*!< out: pointer to column if success */
+	mem_heap_t*		heap,	/*!< in: heap where to allocate */
+	const char**		name)	/*!< out,own: the column name;
+					NULL if no name was scannable */
+{
+	ulint		col_no;
+
+	*success = FALSE;
+
+	ptr = dict_scan_id(cs, ptr, heap, name, FALSE, TRUE);
+
+	if (*name == NULL) {
+		/* Syntax error: nothing scannable */
+		return(ptr);
+	}
+
+	if (table == NULL) {
+		/* No table to resolve against: accept any name */
+		*column = NULL;
+		*success = TRUE;
+		return(ptr);
+	}
+
+	for (col_no = 0; col_no < dict_table_get_n_cols(table); col_no++) {
+		const char*	col_name;
+
+		col_name = dict_table_get_col_name(table, col_no);
+		if (innobase_strcasecmp(col_name, *name) == 0) {
+			/* Found: report the column and overwrite the
+			scanned name with the name stored in the table */
+			*success = TRUE;
+			*column = dict_table_get_nth_col(table, col_no);
+			strcpy((char*) *name, col_name);
+			break;
+		}
+	}
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Scans a [database.]table name from an SQL string and looks the table up.
+@return	scanned to */
+static
+const char*
+dict_scan_table_name(
+/*=================*/
+	struct charset_info_st*	cs,/*!< in: the character set of ptr */
+	const char*	ptr,	/*!< in: scanned to */
+	dict_table_t**	table,	/*!< out: table object or NULL */
+	const char*	name,	/*!< in: foreign key table name */
+	ibool*		success,/*!< out: TRUE if ok name found */
+	mem_heap_t*	heap,	/*!< in: heap where to allocate the id */
+	const char**	ref_name)/*!< out,own: the table name;
+				NULL if no name was scannable */
+{
+	const char*	database_name	= NULL;
+	ulint		database_name_len = 0;
+	const char*	table_name	= NULL;
+	ulint		table_name_len;
+	const char*	scan_name;
+	char*		ref;
+
+	*success = FALSE;
+	*table = NULL;
+
+	ptr = dict_scan_id(cs, ptr, heap, &scan_name, TRUE, FALSE);
+
+	if (scan_name == NULL) {
+
+		return(ptr);	/* Syntax error */
+	}
+
+	if (*ptr == '.') {
+		/* We scanned the database name; scan also the table name */
+
+		ptr++;
+
+		database_name = scan_name;
+		database_name_len = strlen(database_name);
+
+		ptr = dict_scan_id(cs, ptr, heap, &table_name, TRUE, FALSE);
+
+		if (table_name == NULL) {
+
+			return(ptr);	/* Syntax error */
+		}
+	} else {
+		/* To be able to read table dumps made with InnoDB-4.0.17 or
+		earlier, we must allow the dot separator between the database
+		name and the table name also to appear within a quoted
+		identifier! InnoDB used to print a constraint as:
+		... REFERENCES `databasename.tablename` ...
+		starting from 4.0.18 it is
+		... REFERENCES `databasename`.`tablename` ... */
+		const char* s;
+
+		for (s = scan_name; *s; s++) {
+			if (*s == '.') {
+				database_name = scan_name;
+				database_name_len = s - scan_name;
+				scan_name = ++s;
+				break;/* to do: multiple dots? */
+			}
+		}
+
+		table_name = scan_name;
+	}
+
+	if (database_name == NULL) {
+		/* Use the database name of the foreign key table */
+
+		database_name = name;
+		database_name_len = dict_get_db_name_len(name);
+	}
+
+	table_name_len = strlen(table_name);
+
+	/* Copy database_name, '/', table_name, '\0' */
+	ref = mem_heap_alloc(heap, database_name_len + table_name_len + 2);
+	memcpy(ref, database_name, database_name_len);
+	ref[database_name_len] = '/';
+	memcpy(ref + database_name_len + 1, table_name, table_name_len + 1);
+#ifndef __WIN__
+	if (srv_lower_case_table_names) {
+#endif /* !__WIN__ */
+		/* The table name is always put to lower case on Windows. */
+		innobase_casedn_str(ref);
+#ifndef __WIN__
+	}
+#endif /* !__WIN__ */
+	/* Success means a name was scanned; *table may still be NULL */
+	*success = TRUE;
+	*ref_name = ref;
+	*table = dict_table_get_low(ref);
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Skips one id, which may be quoted and may contain '.', after optional
+whitespace.
+@return	scanned to */
+static
+const char*
+dict_skip_word(
+/*===========*/
+	struct charset_info_st*	cs,/*!< in: the character set of ptr */
+	const char*	ptr,	/*!< in: scanned to */
+	ibool*		success)/*!< out: TRUE if success, FALSE if just spaces
+				left in string or a syntax error */
+{
+	const char*	word;
+	const char*	end;
+
+	/* No heap is passed, so nothing is allocated: word merely
+	points into the scanned string, or is NULL if no id was found */
+	end = dict_scan_id(cs, ptr, NULL, &word, FALSE, TRUE);
+
+	*success = (word != NULL) ? TRUE : FALSE;
+
+	return(end);
+}
+
+/*********************************************************************//**
+Removes MySQL comments from an SQL string. A comment is either
+(a) '#' to the end of the line,
+(b) '--[space]' to the end of the line, or
+(c) '[slash][asterisk]' till the next '[asterisk][slash]' (like C comments).
+Comment markers inside quotes are kept. No byte at or after
+sql_string + sql_length is read, so the input need not be NUL-terminated.
+@return own: NUL-terminated copy without comments; free with mem_free()! */
+static
+char*
+dict_strip_comments(
+/*================*/
+	const char*	sql_string,	/*!< in: SQL string */
+	size_t		sql_length)	/*!< in: length of sql_string */
+{
+	char*		str;
+	const char*	sptr;
+	const char*	eptr	= sql_string + sql_length;
+	char*		ptr;
+	/* unclosed quote character (0 if none) */
+	char		quote	= 0;
+
+	str = mem_alloc(sql_length + 1);
+
+	sptr = sql_string;
+	ptr = str;
+
+	for (;;) {
+scan_more:
+		if (sptr >= eptr || *sptr == '\0') {
+end_of_string:
+			*ptr = '\0';
+
+			ut_a(ptr <= str + sql_length);
+
+			return(str);
+		}
+
+		if (*sptr == quote) {
+			/* Closing quote character: do not look for
+			starting quote or comments. */
+			quote = 0;
+		} else if (quote) {
+			/* Within quotes: do not look for
+			starting quotes or comments. */
+		} else if (*sptr == '"' || *sptr == '`' || *sptr == '\'') {
+			/* Starting quote: remember the quote character. */
+			quote = *sptr;
+		} else if (*sptr == '#'
+			   || (eptr - sptr >= 3 && sptr[0] == '-'
+			       && sptr[1] == '-' && sptr[2] == ' ')) {
+			for (;;) {
+				if (++sptr >= eptr) {
+					goto end_of_string;
+				}
+
+				/* In Unix a newline is 0x0A while in Windows
+				it is 0x0D followed by 0x0A */
+
+				switch (*sptr) {
+				case (char) 0X0A:
+				case (char) 0x0D:
+				case '\0':
+					goto scan_more;
+				}
+			}
+		} else if (sptr + 1 < eptr && *sptr == '/' && sptr[1] == '*') {
+			sptr += 2;
+			for (;;) {
+				if (sptr >= eptr) {
+					goto end_of_string;
+				}
+
+				switch (*sptr) {
+				case '\0':
+					goto scan_more;
+				case '*':
+					if (sptr + 1 < eptr && sptr[1] == '/') {
+						sptr += 2;
+						goto scan_more;
+					}
+				}
+
+				sptr++;
+			}
+		}
+
+		*ptr = *sptr;
+
+		ptr++;
+		sptr++;
+	}
+}
+
+/*********************************************************************//**
+Finds the highest [number] among the foreign key constraint ids of the
+table that have the >= 4.0.18 form databasename/tablename_ibfk_[number].
+@return	highest number, 0 if table has no new format foreign key constraints */
+static
+ulint
+dict_table_get_highest_foreign_id(
+/*==============================*/
+	dict_table_t*	table)	/*!< in: table in the dictionary memory cache */
+{
+	const ulint	sfx_len		= (sizeof dict_ibfk) - 1;
+	dict_foreign_t*	fk;
+	ulint		name_len;
+	ulint		highest		= 0;
+
+	ut_a(table);
+
+	name_len = ut_strlen(table->name);
+
+	for (fk = UT_LIST_GET_FIRST(table->foreign_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+		char*	end;
+		ulint	num;
+
+		/* Only ids of the form <table name><dict_ibfk><number>,
+		where the number has no leading zero, are considered */
+		if (ut_strlen(fk->id) <= sfx_len + name_len
+		    || 0 != ut_memcmp(fk->id, table->name, name_len)
+		    || 0 != ut_memcmp(fk->id + name_len, dict_ibfk, sfx_len)
+		    || fk->id[name_len + sfx_len] == '0') {
+			continue;
+		}
+
+		num = strtoul(fk->id + name_len + sfx_len, &end, 10);
+		if (*end != '\0') {
+			/* Trailing garbage after the number */
+			continue;
+		}
+		ut_a(num != highest);
+		if (num > highest) {
+			highest = num;
+		}
+	}
+
+	return(highest);
+}
+
+/*********************************************************************//**
+Reports a foreign key clause syntax error to dict_foreign_err_file. */
+static
+void
+dict_foreign_report_syntax_err(
+/*===========================*/
+	const char*	name,		/*!< in: table name */
+	const char*	start_of_latest_foreign,
+					/*!< in: start of the foreign key clause
+					in the SQL string */
+	const char*	ptr)		/*!< in: place of the syntax error */
+{
+	FILE*	ef = dict_foreign_err_file;
+
+	mutex_enter(&dict_foreign_err_mutex);
+	dict_foreign_error_report_low(ef, name);
+	fprintf(ef, "%s:\nSyntax error close to:\n%s\n",
+		start_of_latest_foreign, ptr);
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied with indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return	error code or DB_SUCCESS */
+static
+ulint
+dict_create_foreign_constraints_low(
+/*================================*/
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	struct charset_info_st*	cs,/*!< in: the character set of sql_string */
+	const char*	sql_string,
+				/*!< in: CREATE TABLE or ALTER TABLE statement
+				where foreign keys are declared like:
+				FOREIGN KEY (a, b) REFERENCES table2(c, d),
+				table2 can be written also with the database
+				name before it: test.table2; the default
+				database is the database of parameter name */
+	const char*	name,	/*!< in: table full name in the normalized form
+				database_name/table_name */
+	ibool		reject_fks)
+				/*!< in: if TRUE, fail with error code
+				DB_CANNOT_ADD_CONSTRAINT if any foreign
+				keys are found. */
+{
+	dict_table_t*	table;
+	dict_table_t*	referenced_table;
+	dict_table_t*	table_to_alter;
+	ulint		highest_id_so_far	= 0;
+	dict_index_t*	index;
+	dict_foreign_t*	foreign;
+	const char*	ptr			= sql_string;
+	const char*	start_of_latest_foreign	= sql_string;
+	FILE*		ef			= dict_foreign_err_file;
+	const char*	constraint_name;
+	ibool		success;
+	ulint		error;
+	const char*	ptr1;
+	const char*	ptr2;
+	ulint		i;
+	ulint		j;
+	ibool		is_on_delete;
+	ulint		n_on_deletes;
+	ulint		n_on_updates;
+	const dict_col_t*columns[500];
+	const char*	column_names[500];
+	const char*	referenced_table_name;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	table = dict_table_get_low(name);
+
+	if (table == NULL) {
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fprintf(ef,
+			"Cannot find the table in the internal"
+			" data dictionary of InnoDB.\n"
+			"Create table statement:\n%s\n", sql_string);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_ERROR);
+	}
+
+	/* First check if we are actually doing an ALTER TABLE, and in that
+	case look for the table being altered */
+
+	ptr = dict_accept(cs, ptr, "ALTER", &success);
+
+	if (!success) {
+
+		goto loop;
+	}
+
+	ptr = dict_accept(cs, ptr, "TABLE", &success);
+
+	if (!success) {
+
+		goto loop;
+	}
+
+	/* We are doing an ALTER TABLE: scan the table name we are altering */
+
+	ptr = dict_scan_table_name(cs, ptr, &table_to_alter, name,
+				   &success, heap, &referenced_table_name);
+	if (!success) {
+		fprintf(stderr,
+			"InnoDB: Error: could not find"
+			" the table being ALTERED in:\n%s\n",
+			sql_string);
+
+		return(DB_ERROR);
+	}
+
+	/* Starting from 4.0.18 and 4.1.2, we generate foreign key id's in the
+	format databasename/tablename_ibfk_[number], where [number] is local
+	to the table; look for the highest [number] for table_to_alter, so
+	that we can assign to new constraints higher numbers. */
+
+	/* If we are altering a temporary table, the table name after ALTER
+	TABLE does not correspond to the internal table name, and
+	table_to_alter is NULL. TODO: should we fix this somehow? */
+
+	if (table_to_alter == NULL) {
+		highest_id_so_far = 0;
+	} else {
+		highest_id_so_far = dict_table_get_highest_foreign_id(
+			table_to_alter);
+	}
+
+	/* Scan for foreign key declarations in a loop */
+loop:
+	/* Scan either to "CONSTRAINT" or "FOREIGN", whichever is closer */
+
+	ptr1 = dict_scan_to(ptr, "CONSTRAINT");
+	ptr2 = dict_scan_to(ptr, "FOREIGN");
+
+	constraint_name = NULL;
+
+	if (ptr1 < ptr2) {
+		/* The user may have specified a constraint name. Pick it so
+		that we can store 'databasename/constraintname' as the id of
+		of the constraint to system tables. */
+		ptr = ptr1;
+
+		ptr = dict_accept(cs, ptr, "CONSTRAINT", &success);
+
+		ut_a(success);
+
+		if (!my_isspace(cs, *ptr) && *ptr != '"' && *ptr != '`') {
+			goto loop;
+		}
+
+		while (my_isspace(cs, *ptr)) {
+			ptr++;
+		}
+
+		/* read constraint name unless got "CONSTRAINT FOREIGN" */
+		if (ptr != ptr2) {
+			ptr = dict_scan_id(cs, ptr, heap,
+					   &constraint_name, FALSE, FALSE);
+		}
+	} else {
+		ptr = ptr2;
+	}
+
+	if (*ptr == '\0') {
+		/* The proper way to reject foreign keys for temporary
+		tables would be to split the lexing and syntactical
+		analysis of foreign key clauses from the actual adding
+		of them, so that ha_innodb.cc could first parse the SQL
+		command, determine if there are any foreign keys, and
+		if so, immediately reject the command if the table is a
+		temporary one. For now, this kludge will work. */
+		if (reject_fks && (UT_LIST_GET_LEN(table->foreign_list) > 0)) {
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		/**********************************************************/
+		/* The following call adds the foreign key constraints
+		to the data dictionary system tables on disk */
+
+		error = dict_create_add_foreigns_to_dictionary(
+			highest_id_so_far, table, trx);
+		return(error);
+	}
+
+	start_of_latest_foreign = ptr;
+
+	ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+	if (!success) {
+		goto loop;
+	}
+
+	if (!my_isspace(cs, *ptr)) {
+		goto loop;
+	}
+
+	ptr = dict_accept(cs, ptr, "KEY", &success);
+
+	if (!success) {
+		goto loop;
+	}
+
+	ptr = dict_accept(cs, ptr, "(", &success);
+
+	if (!success) {
+		/* MySQL allows also an index id before the '('; we
+		skip it */
+		ptr = dict_skip_word(cs, ptr, &success);
+
+		if (!success) {
+			dict_foreign_report_syntax_err(
+				name, start_of_latest_foreign, ptr);
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		ptr = dict_accept(cs, ptr, "(", &success);
+
+		if (!success) {
+			/* We do not flag a syntax error here because in an
+			ALTER TABLE we may also have DROP FOREIGN KEY abc */
+
+			goto loop;
+		}
+	}
+
+	i = 0;
+
+	/* Scan the columns in the first list */
+col_loop1:
+	ut_a(i < (sizeof column_names) / sizeof *column_names);
+	ptr = dict_scan_col(cs, ptr, &success, table, columns + i,
+			    heap, column_names + i);
+	if (!success) {
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fprintf(ef, "%s:\nCannot resolve column name close to:\n%s\n",
+			start_of_latest_foreign, ptr);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	i++;
+
+	ptr = dict_accept(cs, ptr, ",", &success);
+
+	if (success) {
+		goto col_loop1;
+	}
+
+	ptr = dict_accept(cs, ptr, ")", &success);
+
+	if (!success) {
+		dict_foreign_report_syntax_err(
+			name, start_of_latest_foreign, ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	/* Try to find an index which contains the columns
+	as the first fields and in the right order */
+
+	index = dict_foreign_find_index(table, column_names, i,
+					NULL, TRUE, FALSE);
+
+	if (!index) {
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fputs("There is no index in table ", ef);
+		ut_print_name(ef, NULL, TRUE, name);
+		fprintf(ef, " where the columns appear\n"
+			"as the first columns. Constraint:\n%s\n"
+			"See " REFMAN "innodb-foreign-key-constraints.html\n"
+			"for correct foreign key definition.\n",
+			start_of_latest_foreign);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+	ptr = dict_accept(cs, ptr, "REFERENCES", &success);
+
+	if (!success || !my_isspace(cs, *ptr)) {
+		dict_foreign_report_syntax_err(
+			name, start_of_latest_foreign, ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	/* Let us create a constraint struct */
+
+	foreign = dict_mem_foreign_create();
+
+	if (constraint_name) {
+		ulint	db_len;
+
+		/* Catenate 'databasename/' to the constraint name specified
+		by the user: we conceive the constraint as belonging to the
+		same MySQL 'database' as the table itself. We store the name
+		to foreign->id. */
+
+		db_len = dict_get_db_name_len(table->name);
+
+		foreign->id = mem_heap_alloc(
+			foreign->heap, db_len + strlen(constraint_name) + 2);
+
+		ut_memcpy(foreign->id, table->name, db_len);
+		foreign->id[db_len] = '/';
+		strcpy(foreign->id + db_len + 1, constraint_name);
+	}
+
+	foreign->foreign_table = table;
+	foreign->foreign_table_name = mem_heap_strdup(foreign->heap,
+						      table->name);
+	foreign->foreign_index = index;
+	foreign->n_fields = (unsigned int) i;
+	foreign->foreign_col_names = mem_heap_alloc(foreign->heap,
+						    i * sizeof(void*));
+	for (i = 0; i < foreign->n_fields; i++) {
+		foreign->foreign_col_names[i] = mem_heap_strdup(
+			foreign->heap,
+			dict_table_get_col_name(table,
+						dict_col_get_no(columns[i])));
+	}
+
+	ptr = dict_scan_table_name(cs, ptr, &referenced_table, name,
+				   &success, heap, &referenced_table_name);
+
+	/* Note that referenced_table can be NULL if the user has suppressed
+	checking of foreign key constraints! */
+
+	if (!success || (!referenced_table && trx->check_foreigns)) {
+		dict_foreign_free(foreign);
+
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fprintf(ef, "%s:\nCannot resolve table name close to:\n"
+			"%s\n",
+			start_of_latest_foreign, ptr);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	ptr = dict_accept(cs, ptr, "(", &success);
+
+	if (!success) {
+		dict_foreign_free(foreign);
+		dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+					       ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	/* Scan the columns in the second list */
+	i = 0;
+
+col_loop2:
+	ptr = dict_scan_col(cs, ptr, &success, referenced_table, columns + i,
+			    heap, column_names + i);
+	i++;
+
+	if (!success) {
+		dict_foreign_free(foreign);
+
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fprintf(ef, "%s:\nCannot resolve column name close to:\n"
+			"%s\n",
+			start_of_latest_foreign, ptr);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	ptr = dict_accept(cs, ptr, ",", &success);
+
+	if (success) {
+		goto col_loop2;
+	}
+
+	ptr = dict_accept(cs, ptr, ")", &success);
+
+	if (!success || foreign->n_fields != i) {
+		dict_foreign_free(foreign);
+
+		dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+					       ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	n_on_deletes = 0;
+	n_on_updates = 0;
+
+scan_on_conditions:
+	/* Loop here as long as we can find ON ... conditions */
+
+	ptr = dict_accept(cs, ptr, "ON", &success);
+
+	if (!success) {
+
+		goto try_find_index;
+	}
+
+	ptr = dict_accept(cs, ptr, "DELETE", &success);
+
+	if (!success) {
+		ptr = dict_accept(cs, ptr, "UPDATE", &success);
+
+		if (!success) {
+			dict_foreign_free(foreign);
+
+			dict_foreign_report_syntax_err(
+				name, start_of_latest_foreign, ptr);
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		is_on_delete = FALSE;
+		n_on_updates++;
+	} else {
+		is_on_delete = TRUE;
+		n_on_deletes++;
+	}
+
+	ptr = dict_accept(cs, ptr, "RESTRICT", &success);
+
+	if (success) {
+		goto scan_on_conditions;
+	}
+
+	ptr = dict_accept(cs, ptr, "CASCADE", &success);
+
+	if (success) {
+		if (is_on_delete) {
+			foreign->type |= DICT_FOREIGN_ON_DELETE_CASCADE;
+		} else {
+			foreign->type |= DICT_FOREIGN_ON_UPDATE_CASCADE;
+		}
+
+		goto scan_on_conditions;
+	}
+
+	ptr = dict_accept(cs, ptr, "NO", &success);
+
+	if (success) {
+		ptr = dict_accept(cs, ptr, "ACTION", &success);
+
+		if (!success) {
+			dict_foreign_free(foreign);
+			dict_foreign_report_syntax_err(
+				name, start_of_latest_foreign, ptr);
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+
+		if (is_on_delete) {
+			foreign->type |= DICT_FOREIGN_ON_DELETE_NO_ACTION;
+		} else {
+			foreign->type |= DICT_FOREIGN_ON_UPDATE_NO_ACTION;
+		}
+
+		goto scan_on_conditions;
+	}
+
+	ptr = dict_accept(cs, ptr, "SET", &success);
+
+	if (!success) {
+		dict_foreign_free(foreign);
+		dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+					       ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	ptr = dict_accept(cs, ptr, "NULL", &success);
+
+	if (!success) {
+		dict_foreign_free(foreign);
+		dict_foreign_report_syntax_err(name, start_of_latest_foreign,
+					       ptr);
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	for (j = 0; j < foreign->n_fields; j++) {
+		if ((dict_index_get_nth_col(foreign->foreign_index, j)->prtype)
+		    & DATA_NOT_NULL) {
+
+			/* It is not sensible to define SET NULL
+			if the column is not allowed to be NULL! */
+
+			dict_foreign_free(foreign);
+
+			mutex_enter(&dict_foreign_err_mutex);
+			dict_foreign_error_report_low(ef, name);
+			fprintf(ef, "%s:\n"
+				"You have defined a SET NULL condition"
+				" though some of the\n"
+				"columns are defined as NOT NULL.\n",
+				start_of_latest_foreign);
+			mutex_exit(&dict_foreign_err_mutex);
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+	}
+
+	if (is_on_delete) {
+		foreign->type |= DICT_FOREIGN_ON_DELETE_SET_NULL;
+	} else {
+		foreign->type |= DICT_FOREIGN_ON_UPDATE_SET_NULL;
+	}
+
+	goto scan_on_conditions;
+
+try_find_index:
+	if (n_on_deletes > 1 || n_on_updates > 1) {
+		/* It is an error to define more than 1 action */
+
+		dict_foreign_free(foreign);
+
+		mutex_enter(&dict_foreign_err_mutex);
+		dict_foreign_error_report_low(ef, name);
+		fprintf(ef, "%s:\n"
+			"You have twice an ON DELETE clause"
+			" or twice an ON UPDATE clause.\n",
+			start_of_latest_foreign);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		return(DB_CANNOT_ADD_CONSTRAINT);
+	}
+
+	/* Try to find an index which contains the columns as the first fields
+	and in the right order, and the types are the same as in
+	foreign->foreign_index */
+
+	if (referenced_table) {
+		index = dict_foreign_find_index(referenced_table,
+						column_names, i,
+						foreign->foreign_index,
+						TRUE, FALSE);
+		if (!index) {
+			dict_foreign_free(foreign);
+			mutex_enter(&dict_foreign_err_mutex);
+			dict_foreign_error_report_low(ef, name);
+			fprintf(ef, "%s:\n"
+				"Cannot find an index in the"
+				" referenced table where the\n"
+				"referenced columns appear as the"
+				" first columns, or column types\n"
+				"in the table and the referenced table"
+				" do not match for constraint.\n"
+				"Note that the internal storage type of"
+				" ENUM and SET changed in\n"
+				"tables created with >= InnoDB-4.1.12,"
+				" and such columns in old tables\n"
+				"cannot be referenced by such columns"
+				" in new tables.\n"
+				"See " REFMAN
+				"innodb-foreign-key-constraints.html\n"
+				"for correct foreign key definition.\n",
+				start_of_latest_foreign);
+			mutex_exit(&dict_foreign_err_mutex);
+
+			return(DB_CANNOT_ADD_CONSTRAINT);
+		}
+	} else {
+		ut_a(trx->check_foreigns == FALSE);
+		index = NULL;
+	}
+
+	foreign->referenced_index = index;
+	foreign->referenced_table = referenced_table;
+
+	foreign->referenced_table_name
+		= mem_heap_strdup(foreign->heap, referenced_table_name);
+
+	foreign->referenced_col_names = mem_heap_alloc(foreign->heap,
+						       i * sizeof(void*));
+	for (i = 0; i < foreign->n_fields; i++) {
+		foreign->referenced_col_names[i]
+			= mem_heap_strdup(foreign->heap, column_names[i]);
+	}
+
+	/* We found an ok constraint definition: add to the lists */
+
+	UT_LIST_ADD_LAST(foreign_list, table->foreign_list, foreign);
+
+	if (referenced_table) {
+		UT_LIST_ADD_LAST(referenced_list,
+				 referenced_table->referenced_list,
+				 foreign);
+	}
+
+	goto loop;
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary the foreign
+key constraints declared in the string. This function should be called after
+the indexes for a table have been created. Each foreign key constraint must
+be accompanied by indexes in both participating tables. The indexes are
+allowed to contain more fields than mentioned in the constraint.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES
+					table2(c, d), table2 can be written
+					also with the database
+					name before it: test.table2; the
+					default database is the database of
+					parameter name */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+{
+	char*			str;	/* result of dict_strip_comments() */
+	ulint			err;
+	mem_heap_t*		heap;	/* heap handed to the _low() parser */
+
+	ut_a(trx);
+	ut_a(trx->mysql_thd);
+
+	str = dict_strip_comments(sql_string, sql_length);
+	heap = mem_heap_create(10000);
+
+	err = dict_create_foreign_constraints_low(
+		trx, heap, innobase_get_charset(trx->mysql_thd), str, name,
+		reject_fks);
+
+	mem_heap_free(heap);	/* both temporaries are released here, */
+	mem_free(str);		/* whatever the value of err is */
+
+	return(err);
+}
+
+/**********************************************************************//**
+Parses the ids given in the DROP FOREIGN KEY clauses of the current statement.
+@return DB_SUCCESS, or DB_CANNOT_DROP_CONSTRAINT on a syntax error or when
+an id does not match any constraint in table->foreign_list */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+	mem_heap_t*	heap,			/*!< in: heap from which we can
+						allocate memory */
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table,			/*!< in: table */
+	ulint*		n,			/*!< out: number of constraints
+						to drop */
+	const char***	constraints_to_drop)	/*!< out: id's of the
+						constraints to drop */
+{
+	dict_foreign_t*		foreign;
+	ibool			success;
+	char*			str;
+	size_t			len;
+	const char*		ptr;
+	const char*		id;
+	FILE*			ef	= dict_foreign_err_file;
+	struct charset_info_st*	cs;
+
+	ut_a(trx);
+	ut_a(trx->mysql_thd);
+
+	cs = innobase_get_charset(trx->mysql_thd);
+
+	*n = 0;
+
+	*constraints_to_drop = mem_heap_alloc(heap, 1000 * sizeof(char*));
+
+	ptr = innobase_get_stmt(trx->mysql_thd, &len);
+
+	str = dict_strip_comments(ptr, len);
+
+	ptr = str;	/* from here on we scan the result of the stripping */
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+loop:
+	ptr = dict_scan_to(ptr, "DROP");
+
+	if (*ptr == '\0') {	/* end of the string reached: all parsed */
+		mem_free(str);
+
+		return(DB_SUCCESS);
+	}
+
+	ptr = dict_accept(cs, ptr, "DROP", &success);
+
+	if (!my_isspace(cs, *ptr)) {
+
+		goto loop;
+	}
+
+	ptr = dict_accept(cs, ptr, "FOREIGN", &success);
+
+	if (!success || !my_isspace(cs, *ptr)) {
+
+		goto loop;	/* not a DROP FOREIGN clause: keep scanning */
+	}
+
+	ptr = dict_accept(cs, ptr, "KEY", &success);
+
+	if (!success) {
+
+		goto syntax_error;
+	}
+
+	ptr = dict_scan_id(cs, ptr, heap, &id, FALSE, TRUE);
+
+	if (id == NULL) {
+
+		goto syntax_error;
+	}
+
+	ut_a(*n < 1000);	/* the array above has room for 1000 ids */
+	(*constraints_to_drop)[*n] = id;
+	(*n)++;
+
+	/* Look for the given constraint id: it may match foreign->id as a
+	whole, or the part of foreign->id that follows the database name */
+
+	foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (foreign != NULL) {
+		if (0 == strcmp(foreign->id, id)
+		    || (strchr(foreign->id, '/')
+			&& 0 == strcmp(id,
+				       dict_remove_db_name(foreign->id)))) {
+			/* Found */
+			break;
+		}
+
+		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+	}
+
+	if (foreign == NULL) {
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+		fputs(" Error in dropping of a foreign key constraint"
+		      " of table ", ef);
+		ut_print_name(ef, NULL, TRUE, table->name);
+		fputs(",\n"
+		      "in SQL command\n", ef);
+		fputs(str, ef);
+		fputs("\nCannot find a constraint with the given id ", ef);
+		ut_print_name(ef, NULL, FALSE, id);
+		fputs(".\n", ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		mem_free(str);
+
+		return(DB_CANNOT_DROP_CONSTRAINT);
+	}
+
+	goto loop;
+
+syntax_error:
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(ef);
+	ut_print_timestamp(ef);
+	fputs(" Syntax error in dropping of a"
+	      " foreign key constraint of table ", ef);
+	ut_print_name(ef, NULL, TRUE, table->name);
+	fprintf(ef, ",\n"
+		"close to:\n%s\n in SQL command\n%s\n", ptr, str);
+	mutex_exit(&dict_foreign_err_mutex);
+
+	mem_free(str);
+
+	return(DB_CANNOT_DROP_CONSTRAINT);
+}
+
+/*==================== END OF FOREIGN KEY PROCESSING ====================*/
+
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache; a thin
+wrapper of dict_index_find_on_id_low(). Caller must hold dict_sys->mutex.
+@return	index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+	dulint	index_id)	/*!< in: index id */
+{
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	return(dict_index_find_on_id_low(index_id));
+}
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Looks up an index in the dictionary cache, taking dict_sys->mutex itself.
+@return	index, NULL if not found or if dict_sys is NULL */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+	dulint	index_id)	/*!< in: index id */
+{
+	dict_index_t*	index;
+
+	if (dict_sys == NULL) {	/* there is no cache to search */
+		return(NULL);
+	}
+
+	mutex_enter(&(dict_sys->mutex));
+
+	index = dict_index_get_if_in_cache_low(index_id);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	return(index);
+}
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return	TRUE if ok; a violated check fails a ut_a() assertion instead */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+	const dict_index_t*	index,	/*!< in: index tree */
+	const dtuple_t*		tuple)	/*!< in: tuple used in a search */
+{
+	ut_a(index);
+	ut_a(dtuple_get_n_fields_cmp(tuple)
+	     <= dict_index_get_n_unique_in_tree(index));
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number. The
+tuple has n_unique + 1 fields, the last one holding the page number.
+@return	own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap,	/*!< in: memory heap where the node
+					pointer is created */
+	ulint			level)	/*!< in: level of rec in tree:
+					0 means leaf level */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		buf;
+	ulint		n_unique;
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		/* In a universal index tree, we take the whole record as
+		the node pointer if the record is on the leaf level,
+		on non-leaf levels we remove the last field, which
+		contains the page number of the child page */
+
+		ut_a(!dict_table_is_comp(index->table));
+		n_unique = rec_get_n_fields_old(rec);
+
+		if (level > 0) {
+			ut_a(n_unique > 1);
+			n_unique--;
+		}
+	} else {
+		n_unique = dict_index_get_n_unique_in_tree(index);
+	}
+
+	tuple = dtuple_create(heap, n_unique + 1);
+
+	/* When searching in the tree for the node pointer, we must not do
+	comparison on the last field, the page number field, as on upper
+	levels in the tree there may be identical node pointers with a
+	different page number; therefore, we set the n_fields_cmp to one
+	less: */
+
+	dtuple_set_n_fields_cmp(tuple, n_unique);
+
+	dict_index_copy_types(tuple, index, n_unique);
+
+	buf = mem_heap_alloc(heap, 4);	/* room for the page number */
+
+	mach_write_to_4(buf, page_no);
+
+	field = dtuple_get_nth_field(tuple, n_unique);	/* the last field */
+	dfield_set_data(field, buf, 4);
+
+	dtype_set(dfield_get_type(field), DATA_SYS_CHILD, DATA_NOT_NULL, 4);
+
+	rec_copy_prefix_to_dtuple(tuple, rec, index, n_unique, heap);
+	dtuple_set_info_bits(tuple, dtuple_get_info_bits(tuple)
+			     | REC_STATUS_NODE_PTR);
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	return(tuple);
+}
+
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely. The copying is done by rec_copy_prefix_to_buf().
+@return	pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record for which to
+					copy prefix */
+	ulint*			n_fields,/*!< out: number of fields copied */
+	byte**			buf,	/*!< in/out: memory buffer for the
+					copied prefix, or NULL */
+	ulint*			buf_size)/*!< in/out: buffer size */
+{
+	ulint		n;	/* number of fields in the prefix */
+
+	UNIV_PREFETCH_R(rec);
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		ut_a(!dict_table_is_comp(index->table));
+		n = rec_get_n_fields_old(rec);	/* every field of rec */
+	} else {
+		n = dict_index_get_n_unique_in_tree(index);
+	}
+
+	*n_fields = n;
+	return(rec_copy_prefix_to_buf(rec, index, n, buf, buf_size));
+}
+
+/**********************************************************************//**
+Builds a typed data tuple of n_fields fields out of a physical record.
+@return	own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	rec_t*		rec,	/*!< in: record for which to build data tuple */
+	ulint		n_fields,/*!< in: number of data fields */
+	mem_heap_t*	heap)	/*!< in: memory heap where tuple created */
+{
+	dtuple_t*	tuple;
+
+	/* The field count of rec is only checked here for a table that is
+	not in the compact format */
+	ut_ad(dict_table_is_comp(index->table)
+	      || n_fields <= rec_get_n_fields_old(rec));
+
+	tuple = dtuple_create(heap, n_fields);
+
+	dict_index_copy_types(tuple, index, n_fields);
+
+	rec_copy_prefix_to_dtuple(tuple, rec, index, n_fields, heap);
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Calculates the minimum record length in an index.
+@return	the calculated minimum length */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	sum	= 0;
+	ulint	i;
+	ulint	comp	= dict_table_is_comp(index->table);
+
+	if (comp) {
+		ulint nullable = 0;	/* number of columns without NOT NULL */
+		sum = REC_N_NEW_EXTRA_BYTES;
+		for (i = 0; i < dict_index_get_n_fields(index); i++) {
+			const dict_col_t*	col
+				= dict_index_get_nth_col(index, i);
+			ulint	size = dict_col_get_fixed_size(col, comp);
+			sum += size;
+			if (!size) {
+				/* no fixed size: count 1 or 2 bytes,
+				depending on col->len */
+				size = col->len;
+				sum += size < 128 ? 1 : 2;
+			}
+			if (!(col->prtype & DATA_NOT_NULL)) {
+				nullable++;
+			}
+		}
+
+		/* round the NULL flags up to full bytes */
+		sum += UT_BITS_IN_BYTES(nullable);
+
+		return(sum);
+	}
+
+	for (i = 0; i < dict_index_get_n_fields(index); i++) {
+		sum += dict_col_get_fixed_size(
+			dict_index_get_nth_col(index, i), comp);
+	}
+
+	/* add 1 or 2 bytes per field, depending on the sum so far */
+	if (sum > 127) {
+		sum += 2 * dict_index_get_n_fields(index);
+	} else {
+		sum += dict_index_get_n_fields(index);
+	}
+
+	sum += REC_N_OLD_EXTRA_BYTES;
+
+	return(sum);
+}
+
+/*********************************************************************//**
+Reloads the statistics of every index of a table: stat_index_size and
+stat_n_leaf_pages are set from btr_get_size(), and stat_n_diff_key_vals[]
+is read from the rows that the SYS_STATS system table holds for the index.
+@return	TRUE if ok; FALSE if the table has no index, if the table is
+marked corrupt, or if a SYS_STATS row of some index is missing */
+static
+ibool
+dict_reload_statistics(
+/*===================*/
+	dict_table_t*	table,			/*!< in/out: table */
+	ulint*		sum_of_index_sizes)	/*!< in/out: the total size of
+						each index is added to this;
+						also when FALSE is returned */
+{
+	dict_index_t*	index;
+	ulint		size;
+	mem_heap_t*	heap;
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+
+		return(FALSE);
+	}
+
+	heap = mem_heap_create(1000);
+
+	while (index) {
+		if (table->is_corrupt) {
+			ut_a(srv_pass_corrupt_table);
+			mem_heap_free(heap);
+			return(FALSE);
+		}
+
+		size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+		index->stat_index_size = size;
+
+		*sum_of_index_sizes += size;
+
+		size = btr_get_size(index, BTR_N_LEAF_PAGES);
+
+		if (size == 0) {
+			/* The root node of the tree is a leaf */
+			size = 1;
+		}
+
+		index->stat_n_leaf_pages = size;
+
+/*===========================================*/
+{
+	dict_table_t*	sys_stats;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	ulint		key_cols;
+	ulint		n_cols;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	ib_int64_t*	stat_n_diff_key_vals_tmp;
+	byte*		buf;
+	ulint		i;
+	mtr_t		mtr;
+
+	/* The values are collected to a temporary array first, and are
+	copied to the index under its stat mutex only when all the
+	n_cols + 1 rows have been read. */
+	n_cols = dict_index_get_n_unique(index);
+	stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+
+	sys_stats = dict_sys->sys_stats;
+	sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
+	ut_a(!dict_table_is_comp(sys_stats));
+
+	/* Build a search tuple of the 8-byte index id alone */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+
+	dfield_set_data(dfield, buf, 8);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+	for (i = 0; i <= n_cols; i++) {
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)
+		    || ut_dulint_cmp(mach_read_from_8(rec_get_nth_field_old(rec, 0, &len)),
+				     index->id)) {
+			/* not found: even 1 missing row is not allowed */
+			fprintf(stderr, "InnoDB: Warning: stats for %s/%s (%lu/%lu)"
+				" not found in SYS_STATS\n",
+				index->table_name, index->name,
+				(ulong) i, (ulong) n_cols);
+			btr_pcur_close(&pcur);
+			mtr_commit(&mtr);
+			mem_heap_free(heap);
+			return(FALSE);
+		}
+
+		if (rec_get_deleted_flag(rec, 0)) {
+			/* A delete-marked row carries no statistics and
+			must not use up slot i: undo the i++ of the loop
+			header (the unsigned i wraps around harmlessly
+			at 0), so that the next live row is still matched
+			against i below. Otherwise an index whose rows are
+			all delete-marked would be reported as reloaded
+			with all-zero statistics. */
+			i--;
+			goto next_rec;
+		}
+
+		field = rec_get_nth_field_old(rec, 1, &len);
+		ut_a(len == 4);
+
+		key_cols = mach_read_from_4(field);
+
+		/* the live rows are expected in the order 0, 1, ..., n_cols */
+		ut_a(i == key_cols);
+
+		field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
+		ut_a(len == 8);
+
+		stat_n_diff_key_vals_tmp[i] = ut_conv_dulint_to_longlong(mach_read_from_8(field));
+next_rec:
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	dict_index_stat_mutex_enter(index);
+	for (i = 0; i <= n_cols; i++) {
+		index->stat_n_diff_key_vals[i] = stat_n_diff_key_vals_tmp[i];
+	}
+	dict_index_stat_mutex_exit(index);
+}
+/*===========================================*/
+
+		index = dict_table_get_next_index(index);
+	}
+
+	mem_heap_free(heap);
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Stores the stat_n_diff_key_vals[] of every index of a table to the rows
+that already exist for the index in the SYS_STATS system table. Missing
+rows are not inserted: a warning is printed to stderr for them instead. */
+static
+void
+dict_store_statistics(
+/*==================*/
+	dict_table_t*	table)	/*!< in: table */
+{
+	dict_index_t*	index;
+	mem_heap_t*	heap;
+
+	index = dict_table_get_first_index(table);
+
+	ut_a(index);
+
+	heap = mem_heap_create(1000);
+
+	while (index) {
+		if (table->is_corrupt) {
+			ut_a(srv_pass_corrupt_table);
+			mem_heap_free(heap);
+			return;
+		}
+
+/*===========================================*/
+{
+	dict_table_t*	sys_stats;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	ulint		key_cols;
+	ulint		n_cols;
+	ulint		rests;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	ib_int64_t*	stat_n_diff_key_vals_tmp;
+	byte*		buf;
+	ulint		i;
+	mtr_t		mtr;
+
+	n_cols = dict_index_get_n_unique(index);
+	stat_n_diff_key_vals_tmp = mem_heap_zalloc(heap, (n_cols + 1) * sizeof(ib_int64_t));
+
+	/* Take a snapshot of the values under the stat mutex, so that the
+	mutex is not held while SYS_STATS is being modified */
+	dict_index_stat_mutex_enter(index);
+	for (i = 0; i <= n_cols; i++) {
+		stat_n_diff_key_vals_tmp[i] = index->stat_n_diff_key_vals[i];
+	}
+	dict_index_stat_mutex_exit(index);
+
+	sys_stats = dict_sys->sys_stats;
+	sys_index = UT_LIST_GET_FIRST(sys_stats->indexes);
+	ut_a(!dict_table_is_comp(sys_stats));
+
+	/* Build a search tuple of the 8-byte index id alone */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, index->id);
+
+	dfield_set_data(dfield, buf, 8);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_MODIFY_LEAF, &pcur, &mtr);
+	rests = n_cols + 1;	/* number of entries still to be stored */
+	for (i = 0; i <= n_cols; i++) {
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)
+		    || ut_dulint_cmp(mach_read_from_8(rec_get_nth_field_old(rec, 0, &len)),
+				     index->id)) {
+			/* not found: just leave the loop. The cursor is
+			closed and the mini-transaction is committed
+			exactly once, after the loop; doing it here as
+			well would close and commit them twice. */
+			break;
+		}
+
+		if (rec_get_deleted_flag(rec, 0)) {
+			/* A delete-marked row is not written and must not
+			use up one of the n_cols + 1 iterations: undo the
+			i++ of the loop header (the unsigned i wraps
+			around harmlessly at 0). */
+			i--;
+			goto next_rec;
+		}
+
+		field = rec_get_nth_field_old(rec, 1, &len);
+		ut_a(len == 4);
+
+		key_cols = mach_read_from_4(field);
+
+		field = rec_get_nth_field_old(rec, DICT_SYS_STATS_DIFF_VALS_FIELD, &len);
+		ut_a(len == 8);
+
+		/* The row tells in key_cols which entry it holds; write the
+		64-bit value as a dulint of its high and low 32 bits */
+		mlog_write_dulint((byte*)field,
+				ut_dulint_create((ulint) (stat_n_diff_key_vals_tmp[key_cols] >> 32),
+						(ulint) stat_n_diff_key_vals_tmp[key_cols] & 0xFFFFFFFF),
+				&mtr);
+
+		rests--;
+
+next_rec:
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	if (rests) {
+		fprintf(stderr, "InnoDB: Warning: failed to store %lu stats entries"
+				" of %s/%s to SYS_STATS system table.\n",
+				(ulong) rests, index->table_name, index->name);
+	}
+}
+/*===========================================*/
+
+		index = dict_table_get_next_index(index);
+	}
+
+	mem_heap_free(heap);
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. If srv_use_sys_stats_table is set and the
+DICT_TF2_TEMPORARY flag of the table is not, the statistics are reloaded
+from SYS_STATS when sync is FALSE (they are recalculated if the reload
+fails), and recalculated statistics are stored to SYS_STATS. */
+UNIV_INTERN
+void
+dict_update_statistics_low(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		has_dict_mutex __attribute__((unused)),
+					/*!< in: TRUE if the caller has the
+					dictionary mutex */
+	ibool		sync)		/*!< in: TRUE if must update SYS_STATS */
+{
+	dict_index_t*	index;
+	ulint		sum_of_index_sizes	= 0;
+
+	if (table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: cannot calculate statistics for table %s\n"
+			"InnoDB: because the .ibd file is missing.  For help,"
+			" please refer to\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+			table->name);
+
+		return;
+	}
+
+	if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) && !sync) {
+		/* reload statistics from SYS_STATS table */
+		if (dict_reload_statistics(table, &sum_of_index_sizes)) {
+			/* success */
+#ifdef UNIV_DEBUG
+			fprintf(stderr, "InnoDB: DEBUG: reload_statistics is scceeded for %s.\n",
+					table->name);
+#endif
+			goto end;
+		}
+	}
+#ifdef UNIV_DEBUG
+	fprintf(stderr, "InnoDB: DEBUG: update_statistics for %s.\n",
+			table->name);
+#endif
+	sum_of_index_sizes = 0;	/* discard what a failed reload has added */
+
+	/* Find out the sizes of the indexes and how many different values
+	for the key they approximately have */
+
+	index = dict_table_get_first_index(table);
+
+	if (index == NULL) {
+		/* Table definition is corrupt */
+
+		return;
+	}
+
+	do {
+		if (table->is_corrupt) {
+			ut_a(srv_pass_corrupt_table);
+			return;
+		}
+
+		if (UNIV_LIKELY
+		    (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE
+		     || (srv_force_recovery < SRV_FORCE_NO_LOG_REDO
+			 && dict_index_is_clust(index)))) {
+			ulint	size;
+			size = btr_get_size(index, BTR_TOTAL_SIZE);
+
+			index->stat_index_size = size;
+
+			sum_of_index_sizes += size;
+
+			size = btr_get_size(index, BTR_N_LEAF_PAGES);
+
+			if (size == 0) {
+				/* The root node of the tree is a leaf */
+				size = 1;
+			}
+
+			index->stat_n_leaf_pages = size;
+
+			btr_estimate_number_of_different_key_vals(index);
+		} else {
+			/* If we have set a high innodb_force_recovery
+			level, do not calculate statistics, as a badly
+			corrupted index can cause a crash in it.
+			Initialize some bogus index cardinality
+			statistics, so that the data can be queried in
+			various means, also via secondary indexes. */
+			ulint	i;
+
+			sum_of_index_sizes++;
+			index->stat_index_size = index->stat_n_leaf_pages = 1;
+
+			/* sets the elements n_unique, ..., 1; element 0
+			is not written */
+			for (i = dict_index_get_n_unique(index); i; ) {
+				index->stat_n_diff_key_vals[i--] = 1;
+			}
+		}
+
+		index = dict_table_get_next_index(index);
+	} while (index);
+
+	if (srv_use_sys_stats_table && !((table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)) {
+		/* store statistics to SYS_STATS table */
+		dict_store_statistics(table);
+	}
+end:
+	/* The table level figures are taken from the first index */
+	index = dict_table_get_first_index(table);
+
+	dict_index_stat_mutex_enter(index);
+
+	table->stat_n_rows = index->stat_n_diff_key_vals[
+		dict_index_get_n_unique(index)];
+
+	dict_index_stat_mutex_exit(index);
+
+	table->stat_clustered_index_size = index->stat_index_size;
+
+	table->stat_sum_of_other_index_sizes = sum_of_index_sizes
+		- index->stat_index_size;
+
+	table->stat_initialized = TRUE;
+
+	table->stat_modified_counter = 0;
+}
+
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. Calls dict_update_statistics_low() with
+has_dict_mutex = FALSE. */
+UNIV_INTERN
+void
+dict_update_statistics(
+/*===================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ibool		sync)	/*!< in: TRUE if must update SYS_STATS */
+{
+	dict_update_statistics_low(table, FALSE, sync);
+}
+
+/**********************************************************************//**
+Prints info of a foreign key constraint to stderr: its id, and the names
+of the tables and columns on both sides. Caller must hold dict_sys->mutex. */
+static
+void
+dict_foreign_print_low(
+/*===================*/
+	dict_foreign_t*	foreign)	/*!< in: foreign key constraint */
+{
+	ulint	i;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	fprintf(stderr, "  FOREIGN KEY CONSTRAINT %s: %s (",
+		foreign->id, foreign->foreign_table_name);
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		fprintf(stderr, " %s", foreign->foreign_col_names[i]);
+	}
+
+	fprintf(stderr, " )\n"
+		"             REFERENCES %s (",
+		foreign->referenced_table_name);
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		fprintf(stderr, " %s", foreign->referenced_col_names[i]);
+	}
+
+	fputs(" )\n", stderr);
+}
+
+/**********************************************************************//**
+Prints a table data: calls dict_table_print_low() with dict_sys->mutex held. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+	dict_table_t*	table)	/*!< in: table */
+{
+	mutex_enter(&(dict_sys->mutex));
+	dict_table_print_low(table);
+	mutex_exit(&(dict_sys->mutex));
+}
+
+/**********************************************************************//**
+Prints a table's data by name; asserts dict_table_get_low() returns non-NULL. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+	const char*	name)	/*!< in: table name */
+{
+	dict_table_t*	table;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	table = dict_table_get_low(name);
+
+	ut_a(table);
+
+	dict_table_print_low(table);
+	mutex_exit(&(dict_sys->mutex));
+}
+
+/**********************************************************************//**
+Prints a table to stderr: summary line, columns, indexes and both FK lists. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+	dict_table_t*	table)	/*!< in: table */
+{
+	dict_foreign_t*	fk;
+	dict_index_t*	idx;
+	ulint		col_no;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	if (srv_stats_auto_update) {
+		dict_update_statistics_low(table, TRUE, FALSE);
+	}
+	fprintf(stderr,
+		"--------------------------------------\n"
+		"TABLE: name %s, id %lu %lu, flags %lx, columns %lu,"
+		" indexes %lu, appr.rows %lu\n"
+		"  COLUMNS: ",
+		table->name,
+		(ulong) ut_dulint_get_high(table->id),
+		(ulong) ut_dulint_get_low(table->id),
+		(ulong) table->flags,
+		(ulong) table->n_cols,
+		(ulong) UT_LIST_GET_LEN(table->indexes),
+		(ulong) table->stat_n_rows);
+
+	for (col_no = 0; col_no < (ulint) table->n_cols; col_no++) {
+		dict_col_print_low(table, dict_table_get_nth_col(table, col_no));
+		fputs("; ", stderr);
+	}
+
+	putc('\n', stderr);
+
+	/* Every index on table->indexes. */
+	for (idx = UT_LIST_GET_FIRST(table->indexes);
+	     idx != NULL;
+	     idx = UT_LIST_GET_NEXT(indexes, idx)) {
+		dict_index_print_low(idx);
+	}
+
+	/* Every constraint on table->foreign_list. */
+	for (fk = UT_LIST_GET_FIRST(table->foreign_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+		dict_foreign_print_low(fk);
+	}
+
+	/* Every constraint on table->referenced_list. */
+	for (fk = UT_LIST_GET_FIRST(table->referenced_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(referenced_list, fk)) {
+		dict_foreign_print_low(fk);
+	}
+}
+
+/**********************************************************************//**
+Prints a column: its name to stderr, then its type via dtype_print(). */
+static
+void
+dict_col_print_low(
+/*===============*/
+	const dict_table_t*	table,	/*!< in: table */
+	const dict_col_t*	col)	/*!< in: column */
+{
+	dtype_t	type;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	dict_col_copy_type(col, &type);
+	fprintf(stderr, "%s: ", dict_table_get_col_name(table,
+							dict_col_get_no(col)));
+
+	dtype_print(&type);
+}
+
+/**********************************************************************//**
+Prints an index to stderr: a summary with statistics, then its field list. */
+static
+void
+dict_index_print_low(
+/*=================*/
+	dict_index_t*	index)	/*!< in: index */
+{
+	ib_int64_t	n_vals;
+	ulint		i;
+	const char*	type_string;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	dict_index_stat_mutex_enter(index);
+
+	if (index->n_user_defined_cols > 0) {
+		n_vals = index->stat_n_diff_key_vals[
+			index->n_user_defined_cols];
+	} else {
+		n_vals = index->stat_n_diff_key_vals[1];
+	}
+
+	dict_index_stat_mutex_exit(index);
+	/* NOTE(review): type_string is assigned below but never read in this function. */
+	if (dict_index_is_clust(index)) {
+		type_string = "clustered index";
+	} else if (dict_index_is_unique(index)) {
+		type_string = "unique index";
+	} else {
+		type_string = "secondary index";
+	}
+
+	fprintf(stderr,
+		"  INDEX: name %s, id %lu %lu, fields %lu/%lu,"
+		" uniq %lu, type %lu\n"
+		"   root page %lu, appr.key vals %lu,"
+		" leaf pages %lu, size pages %lu\n"
+		"   FIELDS: ",
+		index->name,
+		(ulong) ut_dulint_get_high(index->id),
+		(ulong) ut_dulint_get_low(index->id),
+		(ulong) index->n_user_defined_cols,
+		(ulong) index->n_fields,
+		(ulong) index->n_uniq,
+		(ulong) index->type,
+		(ulong) index->page,
+		(ulong) n_vals,
+		(ulong) index->stat_n_leaf_pages,
+		(ulong) index->stat_index_size);
+
+	for (i = 0; i < index->n_fields; i++) {
+		dict_field_print_low(dict_index_get_nth_field(index, i));
+	}
+
+	putc('\n', stderr);
+
+#ifdef UNIV_BTR_PRINT
+	btr_print_size(index);
+
+	btr_print_index(index, 7);
+#endif /* UNIV_BTR_PRINT */
+}
+
+/**********************************************************************//**
+Prints an index field name to stderr, plus "(prefix_len)" when it is nonzero. */
+static
+void
+dict_field_print_low(
+/*=================*/
+	const dict_field_t*	field)	/*!< in: field */
+{
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	fprintf(stderr, " %s", field->name);
+	if (field->prefix_len == 0) {
+		return;
+	}
+	fprintf(stderr, "(%lu)", (ulong) field->prefix_len);
+}
+
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE: a leading comma, the constraint name, both column lists and actions. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+	FILE*		file,		/*!< in: file where to print */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	ibool		add_newline)	/*!< in: whether to add a newline */
+{
+	const char*	stripped_id;
+	ulint	i;
+
+	if (strchr(foreign->id, '/')) {
+		/* Strip the preceding database name from the constraint id */
+		stripped_id = foreign->id + 1
+			+ dict_get_db_name_len(foreign->id);
+	} else {
+		stripped_id = foreign->id;
+	}
+
+	putc(',', file);
+
+	if (add_newline) {
+		/* SHOW CREATE TABLE wants constraints each printed nicely
+		on its own line, while error messages want no newlines
+		inserted. */
+		fputs("\n ", file);
+	}
+
+	fputs(" CONSTRAINT ", file);
+	ut_print_name(file, trx, FALSE, stripped_id);
+	fputs(" FOREIGN KEY (", file);
+	/* The first column name is printed before n_fields is tested. */
+	for (i = 0;;) {
+		ut_print_name(file, trx, FALSE, foreign->foreign_col_names[i]);
+		if (++i < foreign->n_fields) {
+			fputs(", ", file);
+		} else {
+			break;
+		}
+	}
+
+	fputs(") REFERENCES ", file);
+
+	if (dict_tables_have_same_db(foreign->foreign_table_name,
+				     foreign->referenced_table_name)) {
+		/* Do not print the database name of the referenced table */
+		ut_print_name(file, trx, TRUE,
+			      dict_remove_db_name(
+				      foreign->referenced_table_name));
+	} else {
+		ut_print_name(file, trx, TRUE,
+			      foreign->referenced_table_name);
+	}
+
+	putc(' ', file);
+	putc('(', file);
+
+	for (i = 0;;) {
+		ut_print_name(file, trx, FALSE,
+			      foreign->referenced_col_names[i]);
+		if (++i < foreign->n_fields) {
+			fputs(", ", file);
+		} else {
+			break;
+		}
+	}
+
+	putc(')', file);
+	/* foreign->type is tested bit by bit; each set bit adds its clause. */
+	if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+		fputs(" ON DELETE CASCADE", file);
+	}
+
+	if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+		fputs(" ON DELETE SET NULL", file);
+	}
+
+	if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+		fputs(" ON DELETE NO ACTION", file);
+	}
+
+	if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+		fputs(" ON UPDATE CASCADE", file);
+	}
+
+	if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+		fputs(" ON UPDATE SET NULL", file);
+	}
+
+	if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+		fputs(" ON UPDATE NO ACTION", file);
+	}
+}
+
+/**********************************************************************//**
+Outputs info on the foreign keys on table->foreign_list, under dict_sys->mutex. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+	ibool		create_table_format, /*!< in: if TRUE then print in
+				a format suitable to be inserted into
+				a CREATE TABLE, otherwise in the format
+				of SHOW TABLE STATUS */
+	FILE*		file,	/*!< in: file where to print */
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	table)	/*!< in: table */
+{
+	dict_foreign_t*	foreign;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	foreign = UT_LIST_GET_FIRST(table->foreign_list);
+
+	if (foreign == NULL) {
+		mutex_exit(&(dict_sys->mutex));
+
+		return;
+	}
+
+	while (foreign != NULL) {
+		if (create_table_format) {
+			dict_print_info_on_foreign_key_in_create_format(
+				file, trx, foreign, TRUE);
+		} else {
+			ulint	i;
+			fputs("; (", file);
+
+			for (i = 0; i < foreign->n_fields; i++) {
+				if (i) {
+					putc(' ', file);
+				}
+
+				ut_print_name(file, trx, FALSE,
+					      foreign->foreign_col_names[i]);
+			}
+
+			fputs(") REFER ", file);
+			ut_print_name(file, trx, TRUE,
+				      foreign->referenced_table_name);
+			putc('(', file);
+
+			for (i = 0; i < foreign->n_fields; i++) {
+				if (i) {
+					putc(' ', file);
+				}
+				ut_print_name(
+					file, trx, FALSE,
+					foreign->referenced_col_names[i]);
+			}
+
+			putc(')', file);
+			/* type is a bit mask: test every action bit with &. */
+			if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE) {
+				fputs(" ON DELETE CASCADE", file);
+			}
+
+			if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL) {
+				fputs(" ON DELETE SET NULL", file);
+			}
+
+			if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION) {
+				fputs(" ON DELETE NO ACTION", file);
+			}
+
+			if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE) {
+				fputs(" ON UPDATE CASCADE", file);
+			}
+
+			if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL) {
+				fputs(" ON UPDATE SET NULL", file);
+			}
+
+			if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION) {
+				fputs(" ON UPDATE NO ACTION", file);
+			}
+		}
+
+		foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+	}
+
+	mutex_exit(&(dict_sys->mutex));
+}
+
+/********************************************************************//**
+Prints "index <index name> of table <table name>" to the given stream. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+	FILE*			file,	/*!< in: output stream */
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to print */
+{
+	fputs("index ", file);
+	ut_print_name(file, trx, FALSE, index->name);
+	fputs(" of table ", file);
+	ut_print_name(file, trx, TRUE, index->table_name);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact as one-column dummy indexes. */
+UNIV_INTERN
+void
+dict_ind_init(void)
+/*===============*/
+{
+	dict_table_t*		table;
+
+	/* create dummy table and index for REDUNDANT infimum and supremum */
+	table = dict_mem_table_create("SYS_DUMMY1", DICT_HDR_SPACE, 1, 0);
+	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+			       DATA_ENGLISH | DATA_NOT_NULL, 8);
+
+	dict_ind_redundant = dict_mem_index_create("SYS_DUMMY1", "SYS_DUMMY1",
+						   DICT_HDR_SPACE, 0, 1);
+	dict_index_add_col(dict_ind_redundant, table,
+			   dict_table_get_nth_col(table, 0), 0);
+	dict_ind_redundant->table = table;
+	/* create dummy table and index for COMPACT infimum and supremum */
+	table = dict_mem_table_create("SYS_DUMMY2",
+				      DICT_HDR_SPACE, 1, DICT_TF_COMPACT);
+	dict_mem_table_add_col(table, NULL, NULL, DATA_CHAR,
+			       DATA_ENGLISH | DATA_NOT_NULL, 8);
+	dict_ind_compact = dict_mem_index_create("SYS_DUMMY2", "SYS_DUMMY2",
+						 DICT_HDR_SPACE, 0, 1);
+	dict_index_add_col(dict_ind_compact, table,
+			   dict_table_get_nth_col(table, 0), 0);
+	dict_ind_compact->table = table;
+
+	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+	dict_ind_redundant->cached = dict_ind_compact->cached = TRUE;
+}
+
+/**********************************************************************//**
+Frees dict_ind_redundant, dict_ind_compact and their tables; NULLs both pointers. */
+static
+void
+dict_ind_free(void)
+/*===============*/
+{
+	dict_table_t*	table;
+
+	table = dict_ind_compact->table;
+	dict_mem_index_free(dict_ind_compact);
+	dict_ind_compact = NULL;
+	dict_mem_table_free(table);
+
+	table = dict_ind_redundant->table;
+	dict_mem_index_free(dict_ind_redundant);
+	dict_ind_redundant = NULL;
+	dict_mem_table_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Looks up an index of a table by exact name (ut_strcmp() equality).
+@return	first index with that name, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name)	/*!< in: name of the index to find */
+{
+	dict_index_t*	candidate;
+
+	/* Linear scan of the table's indexes; the first match wins. */
+	for (candidate = dict_table_get_first_index(table);
+	     candidate != NULL;
+	     candidate = dict_table_get_next_index(candidate)) {
+		if (ut_strcmp(candidate->name, name) != 0) {
+			continue;
+		}
+
+		return(candidate);
+	}
+
+	/* No index carries the requested name. */
+	return(NULL);
+}
+
+/**********************************************************************//**
+For every constraint on table->foreign_list that uses the given index,
+substitutes the index returned by dict_foreign_find_equiv_index(). */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+	dict_table_t*	table,  /*!< in/out: table */
+	dict_index_t*	index)	/*!< in: index to be replaced */
+{
+	dict_foreign_t*	fk;
+
+	fk = UT_LIST_GET_FIRST(table->foreign_list);
+
+	while (fk != NULL) {
+		/* Constraints using some other index are left untouched. */
+		if (fk->foreign_index == index) {
+			dict_index_t*	new_index
+				= dict_foreign_find_equiv_index(fk);
+			ut_a(new_index);
+			fk->foreign_index = new_index;
+		}
+		fk = UT_LIST_GET_NEXT(foreign_list, fk);
+	}
+}
+
+/**********************************************************************//**
+Among the indexes of a table that carry the given name, picks the one
+whose id compares smallest under ut_dulint_cmp().
+@return	index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*=====================================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name)	/*!< in: name of the index to find */
+{
+	dict_index_t*	best = NULL; /* match with the smallest id so far */
+	dict_index_t*	cur;
+
+	for (cur = dict_table_get_first_index(table);
+	     cur != NULL;
+	     cur = dict_table_get_next_index(cur)) {
+
+		if (ut_strcmp(cur->name, name) != 0) {
+			/* Different name: not a candidate. */
+			continue;
+		}
+
+		if (best == NULL
+		    || ut_dulint_cmp(cur->id, best->id) < 0) {
+
+			best = cur;
+		}
+	}
+
+	return(best);
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Debug check (ut_ad) that no two indexes of a table share an index name. */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+	const dict_table_t*	table,	/*!< in: Check for dup indexes
+					in this table */
+	ibool			tmp_ok)	/*!< in: TRUE=allow temporary
+					index names */
+{
+	/* Check for duplicates, ignoring indexes that are marked
+	as to be dropped */
+
+	const dict_index_t*	index1;
+	const dict_index_t*	index2;
+
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	/* The primary index _must_ exist */
+	ut_a(UT_LIST_GET_LEN(table->indexes) > 0);
+
+	index1 = UT_LIST_GET_FIRST(table->indexes);
+
+	do {
+		ut_ad(tmp_ok || *index1->name != TEMP_INDEX_PREFIX);
+
+		index2 = UT_LIST_GET_NEXT(indexes, index1);
+
+		while (index2) {
+			/* Only index2's to_be_dropped flag is consulted. */
+			if (!index2->to_be_dropped) {
+				ut_ad(ut_strcmp(index1->name, index2->name));
+			}
+
+			index2 = UT_LIST_GET_NEXT(indexes, index2);
+		}
+
+		index1 = UT_LIST_GET_NEXT(indexes, index1);
+	} while (index1);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************************
+Closes the data dictionary module: frees cached tables, hashes, locks, dict_sys. */
+UNIV_INTERN
+void
+dict_close(void)
+/*============*/
+{
+	ulint	i;
+
+	/* Free the hash elements. We don't remove them from the table
+	because we are going to destroy the table anyway. */
+	for (i = 0; i < hash_get_n_cells(dict_sys->table_hash); i++) {
+		dict_table_t*	table;
+
+		table = HASH_GET_FIRST(dict_sys->table_hash, i);
+
+		while (table) {
+			dict_table_t*	prev_table = table;
+			/* Fetch the successor before prev_table is removed. */
+			table = HASH_GET_NEXT(name_hash, prev_table);
+#ifdef UNIV_DEBUG
+			ut_a(prev_table->magic_n == DICT_TABLE_MAGIC_N);
+#endif
+			/* Acquire only because it's a pre-condition. */
+			mutex_enter(&dict_sys->mutex);
+
+			dict_table_remove_from_cache(prev_table);
+
+			mutex_exit(&dict_sys->mutex);
+		}
+	}
+
+	hash_table_free(dict_sys->table_hash);
+
+	/* The elements are the same instance as in dict_sys->table_hash,
+	therefore we don't delete the individual elements. */
+	hash_table_free(dict_sys->table_id_hash);
+
+	dict_ind_free();
+
+	mutex_free(&dict_sys->mutex);
+
+	rw_lock_free(&dict_operation_lock);
+	memset(&dict_operation_lock, 0x0, sizeof(dict_operation_lock));
+
+	mutex_free(&dict_foreign_err_mutex);
+
+	mem_free(dict_sys);
+	dict_sys = NULL;
+
+	for (i = 0; i < DICT_INDEX_STAT_MUTEX_SIZE; i++) {
+		mutex_free(&dict_index_stat_mutex[i]);
+	}
+}
+
+/*************************************************************************
+Sets is_corrupt on every table in dict_sys->table_LRU that uses space_id. */
+
+void
+dict_table_set_corrupt_by_space(
+/*============================*/
+	ulint	space_id,	/*!< in: tablespace id to look for */
+	ibool	need_mutex)	/*!< in: TRUE=take dict_sys->mutex here */
+{
+	dict_table_t*	table;
+	ibool		found = FALSE;
+
+	ut_a(!trx_sys_sys_space(space_id) && space_id < SRV_LOG_SPACE_FIRST_ID);
+
+	if (need_mutex)
+		mutex_enter(&(dict_sys->mutex));
+
+	table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+
+	while (table) {
+		if (table->space == space_id) {
+			table->is_corrupt = TRUE;
+			found = TRUE;
+		}
+
+		table = UT_LIST_GET_NEXT(table_LRU, table);
+	}
+
+	if (need_mutex)
+		mutex_exit(&(dict_sys->mutex));
+	/* Report to stderr when no table in the list used this space. */
+	if (!found) {
+		fprintf(stderr, "InnoDB: space to be marked as "
+			"crashed was not found for id %lu.\n",
+			(ulong) space_id);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/dict/dict0load.c b/storage/xtradb/dict/dict0load.c
new file mode 100644
index 00000000000..43c0810fe67
--- /dev/null
+++ b/storage/xtradb/dict/dict0load.c
@@ -0,0 +1,1572 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dict/dict0load.c
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0load.h"
+#include "mysql_version.h"
+
+#ifdef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "page0page.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "rem0cmp.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+
+/****************************************************************//**
+Tests whether an index field refers to the table column with a given name.
+@return	TRUE if the i'th column of index is 'name'. */
+static
+ibool
+name_of_col_is(
+/*===========*/
+	const dict_table_t*	table,	/*!< in: table */
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			i,	/*!< in: index field offset */
+	const char*		name)	/*!< in: name to compare to */
+{
+	const dict_field_t*	field = dict_index_get_nth_field(index, i);
+	const dict_col_t*	col = dict_field_get_col(field);
+	const char*		col_name;
+	col_name = dict_table_get_col_name(table, dict_col_get_no(col));
+	return(strcmp(name, col_name) == 0);
+}
+
+/********************************************************************//**
+Finds the first table name in the given database, skipping delete-marked records.
+@return own: table name, NULL if does not exist; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+	const char*	name)	/*!< in: database name which ends in '/' */
+{
+	dict_table_t*	sys_tables;
+	btr_pcur_t	pcur;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	mem_heap_t*	heap;
+	dfield_t*	dfield;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	heap = mem_heap_create(1000);
+
+	mtr_start(&mtr);
+
+	sys_tables = dict_table_get_low("SYS_TABLES");
+	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+	ut_a(!dict_table_is_comp(sys_tables));
+
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	dfield_set_data(dfield, name, ut_strlen(name));
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		/* Not found */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap);
+
+		return(NULL);
+	}
+
+	field = rec_get_nth_field_old(rec, 0, &len);
+	/* Stop as soon as field 0 no longer starts with the database name. */
+	if (len < strlen(name)
+	    || ut_memcmp(name, field, strlen(name)) != 0) {
+		/* Not found */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap);
+
+		return(NULL);
+	}
+
+	if (!rec_get_deleted_flag(rec, 0)) {
+
+		/* We found one */
+
+		char*	table_name = mem_strdupl((char*) field, len);
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap);
+
+		return(table_name);
+	}
+
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+	goto loop;
+}
+
+/********************************************************************//**
+Prints to stderr information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void)
+/*============*/
+{
+	dict_table_t*	sys_tables;
+	dict_index_t*	sys_index;
+	dict_table_t*	table;
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	mtr_t		mtr;
+
+	/* Enlarge the fatal semaphore wait timeout during the InnoDB table
+	monitor printout */
+
+	mutex_enter(&kernel_mutex);
+	srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+	mutex_exit(&kernel_mutex);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	mtr_start(&mtr);
+
+	sys_tables = dict_table_get_low("SYS_TABLES");
+	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+
+	btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
+				    TRUE, &mtr);
+loop:
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		/* end of index */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		mutex_exit(&(dict_sys->mutex));
+
+		/* Restore the fatal semaphore wait timeout */
+
+		mutex_enter(&kernel_mutex);
+		srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+		mutex_exit(&kernel_mutex);
+
+		return;
+	}
+
+	field = rec_get_nth_field_old(rec, 0, &len);
+
+	if (!rec_get_deleted_flag(rec, 0)) {
+
+		/* We found one */
+
+		char*	table_name = mem_strdupl((char*) field, len);
+
+		btr_pcur_store_position(&pcur, &mtr);
+
+		mtr_commit(&mtr);
+
+		table = dict_table_get_low(table_name);
+		/* field points into the index record and must not be read
+		after mtr_commit(); use the private copy table_name. */
+		if (table == NULL) {
+			fputs("InnoDB: Failed to load table ", stderr);
+			ut_print_namel(stderr, NULL, TRUE, table_name, len);
+			putc('\n', stderr);
+		} else {
+			/* The table definition was corrupt if there
+			is no index */
+
+			if (srv_stats_auto_update && dict_table_get_first_index(table)) {
+				dict_update_statistics_low(table, TRUE, FALSE);
+			}
+
+			dict_table_print_low(table);
+		}
+		mem_free(table_name);
+		mtr_start(&mtr);
+
+		btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+	}
+
+	goto loop;
+}
+
+/********************************************************************//**
+Determine the flags of a table described in SYS_TABLES.
+@return the flags read from field 5 of the record; 0 if they equal
+DICT_TABLE_ORDINARY; ULINT_UNDEFINED if they fail validation */
+static
+ulint
+dict_sys_tables_get_flags(
+/*======================*/
+	const rec_t*	rec)	/*!< in: a record of SYS_TABLES */
+{
+	const byte*	field;
+	ulint		len;
+	ulint		n_cols;
+	ulint		flags;
+
+	field = rec_get_nth_field_old(rec, 5, &len);
+	ut_a(len == 4);
+
+	flags = mach_read_from_4(field);
+
+	if (UNIV_LIKELY(flags == DICT_TABLE_ORDINARY)) {
+		return(0);
+	}
+
+	field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+	n_cols = mach_read_from_4(field);
+
+	if (UNIV_UNLIKELY(!(n_cols & 0x80000000UL))) {
+		/* New file formats require ROW_FORMAT=COMPACT. */
+		return(ULINT_UNDEFINED);
+	}
+
+	switch (flags & (DICT_TF_FORMAT_MASK | DICT_TF_COMPACT)) {
+	default:
+	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
+		/* flags should be DICT_TABLE_ORDINARY,
+		or DICT_TF_FORMAT_MASK should be nonzero. */
+		return(ULINT_UNDEFINED);
+
+	case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT | DICT_TF_COMPACT:
+#if DICT_TF_FORMAT_MAX > DICT_TF_FORMAT_ZIP
+# error "missing case labels for DICT_TF_FORMAT_ZIP .. DICT_TF_FORMAT_MAX"
+#endif
+		/* We support this format. */
+		break;
+	}
+
+	if (UNIV_UNLIKELY((flags & DICT_TF_ZSSIZE_MASK)
+			  > (DICT_TF_ZSSIZE_MAX << DICT_TF_ZSSIZE_SHIFT))) {
+		/* Unsupported compressed page size. */
+		return(ULINT_UNDEFINED);
+	}
+	/* Build the mask in ulint: left-shifting the negative int ~0 is undefined. */
+	if (UNIV_UNLIKELY(flags & (~(ulint) 0 << DICT_TF_BITS))) {
+		/* Some unused bits are set. */
+		return(ULINT_UNDEFINED);
+	}
+
+	return(flags);
+}
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan the biggest space id, and store it to fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+	ibool	in_crash_recovery)	/*!< in: are we doing a crash recovery */
+{
+	dict_table_t*	sys_tables;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	ulint		max_space_id;
+	mtr_t		mtr;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	mtr_start(&mtr);
+
+	sys_tables = dict_table_get_low("SYS_TABLES");
+	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+	ut_a(!dict_table_is_comp(sys_tables));
+
+	max_space_id = mtr_read_ulint(dict_hdr_get(&mtr)
+				      + DICT_HDR_MAX_SPACE_ID,
+				      MLOG_4BYTES, &mtr);
+	fil_set_max_space_id_if_bigger(max_space_id);
+
+	btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur,
+				    TRUE, &mtr);
+loop:
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		/* end of index */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		/* We must make the tablespace cache aware of the biggest
+		known space id */
+
+		/* printf("Biggest space id in data dictionary %lu\n",
+		max_space_id); */
+		fil_set_max_space_id_if_bigger(max_space_id);
+
+		mutex_exit(&(dict_sys->mutex));
+
+		return;
+	}
+
+	if (!rec_get_deleted_flag(rec, 0)) {
+
+		/* We found one */
+		const byte*	field;
+		ulint		len;
+		ulint		space_id;
+		ulint		flags;
+		char*		name;
+
+		field = rec_get_nth_field_old(rec, 0, &len);
+		name = mem_strdupl((char*) field, len);
+
+		flags = dict_sys_tables_get_flags(rec);
+		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+
+			field = rec_get_nth_field_old(rec, 5, &len);
+			flags = mach_read_from_4(field);
+
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: table ", stderr);
+			ut_print_filename(stderr, name);
+			fprintf(stderr, "\n"
+				"InnoDB: in InnoDB data dictionary"
+				" has unknown type %lx.\n",
+				(ulong) flags);
+			mem_free(name); /* this path skips the mem_free() below */
+			goto loop;
+		}
+
+		field = rec_get_nth_field_old(rec, 9, &len);
+		ut_a(len == 4);
+
+		space_id = mach_read_from_4(field);
+
+		btr_pcur_store_position(&pcur, &mtr);
+
+		mtr_commit(&mtr);
+
+		if (trx_sys_sys_space(space_id)) {
+			/* The system tablespace always exists. */
+		} else if (in_crash_recovery) {
+			/* Check that the tablespace (the .ibd file) really
+			exists; print a warning to the .err log if not.
+			Do not print warnings for temporary tables. */
+			ibool	is_temp;
+
+			field = rec_get_nth_field_old(rec, 4, &len);
+			if (0x80000000UL &  mach_read_from_4(field)) {
+				/* ROW_FORMAT=COMPACT: read the is_temp
+				flag from SYS_TABLES.MIX_LEN. */
+				field = rec_get_nth_field_old(rec, 7, &len);
+				is_temp = mach_read_from_4(field)
+					& DICT_TF2_TEMPORARY;
+			} else {
+				/* For tables created with old versions
+				of InnoDB, SYS_TABLES.MIX_LEN may contain
+				garbage.  Such tables would always be
+				in ROW_FORMAT=REDUNDANT.  Pretend that
+				all such tables are non-temporary.  That is,
+				do not suppress error printouts about
+				temporary tables not being found. */
+				is_temp = FALSE;
+			}
+
+			fil_space_for_table_exists_in_mem(
+				space_id, name, is_temp, TRUE, !is_temp);
+		} else {
+			/* It is a normal database startup: create the space
+			object and check that the .ibd file exists. */
+
+			fil_open_single_table_tablespace(FALSE, space_id,
+							 flags, name);
+		}
+
+		mem_free(name);
+
+		if (space_id > max_space_id) {
+			max_space_id = space_id;
+		}
+
+		mtr_start(&mtr);
+
+		btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+	}
+
+	goto loop;
+}
+
+/********************************************************************//**
+Loads table->n_cols - DATA_N_SYS_COLS column definitions from SYS_COLUMNS. */
+static
+void
+dict_load_columns(
+/*==============*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap)	/*!< in: memory heap for temporary storage */
+{
+	dict_table_t*	sys_columns;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	byte*		buf;
+	char*		name;
+	ulint		mtype;
+	ulint		prtype;
+	ulint		col_len;
+	ulint		i;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	mtr_start(&mtr);
+
+	sys_columns = dict_table_get_low("SYS_COLUMNS");
+	sys_index = UT_LIST_GET_FIRST(sys_columns->indexes);
+	ut_a(!dict_table_is_comp(sys_columns));
+
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dfield, buf, 8);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+	for (i = 0; i + DATA_N_SYS_COLS < (ulint) table->n_cols; i++) {
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		ut_a(btr_pcur_is_on_user_rec(&pcur));
+
+		ut_a(!rec_get_deleted_flag(rec, 0));
+		/* Field 0 must hold this table's id and field 1 must equal i. */
+		field = rec_get_nth_field_old(rec, 0, &len);
+		ut_ad(len == 8);
+		ut_a(ut_dulint_cmp(table->id, mach_read_from_8(field)) == 0);
+
+		field = rec_get_nth_field_old(rec, 1, &len);
+		ut_ad(len == 4);
+		ut_a(i == mach_read_from_4(field));
+
+		ut_a(name_of_col_is(sys_columns, sys_index, 4, "NAME"));
+
+		field = rec_get_nth_field_old(rec, 4, &len);
+		name = mem_heap_strdupl(heap, (char*) field, len);
+
+		field = rec_get_nth_field_old(rec, 5, &len);
+		mtype = mach_read_from_4(field);
+
+		field = rec_get_nth_field_old(rec, 6, &len);
+		prtype = mach_read_from_4(field);
+
+		if (dtype_get_charset_coll(prtype) == 0
+		    && dtype_is_string_type(mtype)) {
+			/* The table was created with < 4.1.2. */
+
+			if (dtype_is_binary_string_type(mtype, prtype)) {
+				/* Use the binary collation for
+				string columns of binary type. */
+
+				prtype = dtype_form_prtype(
+					prtype,
+					DATA_MYSQL_BINARY_CHARSET_COLL);
+			} else {
+				/* Use the default charset for
+				other than binary columns. */
+
+				prtype = dtype_form_prtype(
+					prtype,
+					data_mysql_default_charset_coll);
+			}
+		}
+
+		field = rec_get_nth_field_old(rec, 7, &len);
+		col_len = mach_read_from_4(field);
+
+		ut_a(name_of_col_is(sys_columns, sys_index, 8, "PREC"));
+
+		dict_mem_table_add_col(table, heap, name,
+				       mtype, prtype, col_len);
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Reads the SYS_FIELDS records of an index and adds a field definition to
+the index object for each record that is not delete marked. */
+static
+void
+dict_load_fields(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index whose fields to load */
+	mem_heap_t*	heap)	/*!< in: memory heap for temporary storage;
+				the search tuple and the column name
+				copies are allocated from it */
+{
+	dict_table_t*	sys_fields;
+	dict_index_t*	first_index;
+	btr_pcur_t	cursor;
+	dtuple_t*	search_tuple;
+	dfield_t*	key_field;
+	byte*		id_buf;
+	ulint		field_no;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	mtr_start(&mtr);
+
+	sys_fields = dict_table_get_low("SYS_FIELDS");
+	first_index = UT_LIST_GET_FIRST(sys_fields->indexes);
+	ut_a(!dict_table_is_comp(sys_fields));
+
+	/* The search key is a single column holding the 8-byte index id. */
+	search_tuple = dtuple_create(heap, 1);
+	key_field = dtuple_get_nth_field(search_tuple, 0);
+
+	id_buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(id_buf, index->id);
+
+	dfield_set_data(key_field, id_buf, 8);
+	dict_index_copy_types(search_tuple, first_index, 1);
+
+	btr_pcur_open_on_user_rec(first_index, search_tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &cursor, &mtr);
+
+	for (field_no = 0; field_no < index->n_fields; field_no++) {
+		const rec_t*	rec;
+
+		rec = btr_pcur_get_rec(&cursor);
+
+		ut_a(btr_pcur_is_on_user_rec(&cursor));
+
+		/* SYS_FIELDS may contain delete marked records, because
+		ALTER TABLE ADD INDEX can update SYS_FIELDS.INDEX_ID.
+		Such records are stepped over without adding a field. */
+
+		if (!rec_get_deleted_flag(rec, 0)) {
+			const byte*	data;
+			ulint		data_len;
+			ulint		pos_and_prefix_len;
+			ulint		prefix_len;
+			char*		col_name;
+
+			data = rec_get_nth_field_old(rec, 0, &data_len);
+			ut_ad(data_len == 8);
+
+			data = rec_get_nth_field_old(rec, 1, &data_len);
+			ut_a(data_len == 4);
+
+			/* This 4-byte value encodes the position of the
+			field in the index and an optional column prefix
+			length. If the index has at least one prefix field,
+			the HIGH 2 bytes hold the field number
+			(== field_no) and the low 2 bytes the prefix
+			length. Otherwise the field number is in the
+			2 LOW bytes. */
+
+			pos_and_prefix_len = mach_read_from_4(data);
+
+			ut_a((pos_and_prefix_len & 0xFFFFUL) == field_no
+			     || (pos_and_prefix_len & 0xFFFF0000UL)
+			     == (field_no << 16));
+
+			prefix_len = ((field_no == 0
+				       && pos_and_prefix_len > 0)
+				      || (pos_and_prefix_len
+					  & 0xFFFF0000UL) > 0)
+				? (pos_and_prefix_len & 0xFFFFUL)
+				: 0;
+
+			ut_a(name_of_col_is(sys_fields, first_index,
+					    4, "COL_NAME"));
+
+			data = rec_get_nth_field_old(rec, 4, &data_len);
+			col_name = mem_heap_strdupl(heap, (char*) data,
+						    data_len);
+
+			dict_mem_index_add_field(index, col_name,
+						 prefix_len);
+		}
+
+		btr_pcur_move_to_next_user_rec(&cursor, &mtr);
+	}
+
+	btr_pcur_close(&cursor);
+	mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Loads definitions for table indexes. Adds them to the data dictionary
+cache. Scans the SYS_INDEXES records whose first field equals the id of
+the table, skipping delete marked records, and stops at the first record
+that fails validation.
+@return DB_SUCCESS if ok, DB_CORRUPTION if corruption of dictionary
+table or DB_UNSUPPORTED if table has unknown index type; an error code
+returned by dict_index_add_to_cache() is passed through as well */
+static
+ulint
+dict_load_indexes(
+/*==============*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap)	/*!< in: memory heap for temporary storage */
+{
+	dict_table_t*	sys_indexes;
+	dict_index_t*	sys_index;
+	dict_index_t*	index;
+	btr_pcur_t	pcur;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	ulint		name_len;
+	char*		name_buf;
+	ulint		type;
+	ulint		space;
+	ulint		page_no;
+	ulint		n_fields;
+	byte*		buf;
+	ibool		is_sys_table;
+	dulint		id;
+	mtr_t		mtr;
+	ulint		error = DB_SUCCESS;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* A table whose id is below DICT_HDR_FIRST_ID is treated as a
+	system table; see the is_sys_table branch in the loop below. */
+	if ((ut_dulint_get_high(table->id) == 0)
+	    && (ut_dulint_get_low(table->id) < DICT_HDR_FIRST_ID)) {
+		is_sys_table = TRUE;
+	} else {
+		is_sys_table = FALSE;
+	}
+
+	mtr_start(&mtr);
+
+	sys_indexes = dict_table_get_low("SYS_INDEXES");
+	sys_index = UT_LIST_GET_FIRST(sys_indexes->indexes);
+	ut_a(!dict_table_is_comp(sys_indexes));
+
+	/* Build a one-field search tuple containing the 8-byte table id;
+	buf is also used below to detect the end of this table's records. */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dfield, buf, 8);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+	for (;;) {
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			/* No more user records in SYS_INDEXES */
+
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		field = rec_get_nth_field_old(rec, 0, &len);
+		ut_ad(len == 8);
+
+		if (ut_memcmp(buf, field, len) != 0) {
+			/* The record belongs to a different table id:
+			all records of this table have been processed */
+			break;
+		} else if (rec_get_deleted_flag(rec, 0)) {
+			/* Skip delete marked records */
+			goto next_rec;
+		}
+
+		/* Field 1 holds the index id; it is assigned to index->id
+		if an index object is created below. */
+		field = rec_get_nth_field_old(rec, 1, &len);
+		ut_ad(len == 8);
+		id = mach_read_from_8(field);
+
+		ut_a(name_of_col_is(sys_indexes, sys_index, 4, "NAME"));
+
+		field = rec_get_nth_field_old(rec, 4, &name_len);
+		name_buf = mem_heap_strdupl(heap, (char*) field, name_len);
+
+		/* Fields 5, 6 and 7 are read as the number of fields, the
+		index type and the space, in that order. */
+		field = rec_get_nth_field_old(rec, 5, &len);
+		n_fields = mach_read_from_4(field);
+
+		field = rec_get_nth_field_old(rec, 6, &len);
+		type = mach_read_from_4(field);
+
+		field = rec_get_nth_field_old(rec, 7, &len);
+		space = mach_read_from_4(field);
+
+		ut_a(name_of_col_is(sys_indexes, sys_index, 8, "PAGE_NO"));
+
+		field = rec_get_nth_field_old(rec, 8, &len);
+		page_no = mach_read_from_4(field);
+
+		/* We check for unsupported types first, so that the
+		subsequent checks are relevant for the supported types. */
+		if (type & ~(DICT_CLUSTERED | DICT_UNIQUE)) {
+
+			fprintf(stderr,
+				"InnoDB: Error: unknown type %lu"
+				" of index %s of table %s\n",
+				(ulong) type, name_buf, table->name);
+
+			error = DB_UNSUPPORTED;
+			goto func_exit;
+		} else if (page_no == FIL_NULL) {
+
+			fprintf(stderr,
+				"InnoDB: Error: trying to load index %s"
+				" for table %s\n"
+				"InnoDB: but the index tree has been freed!\n",
+				name_buf, table->name);
+
+			error = DB_CORRUPTION;
+			goto func_exit;
+		} else if ((type & DICT_CLUSTERED) == 0
+			    && NULL == dict_table_get_first_index(table)) {
+
+			/* A non-clustered index is being loaded while the
+			table object has no index yet */
+
+			fputs("InnoDB: Error: trying to load index ",
+			      stderr);
+			ut_print_name(stderr, NULL, FALSE, name_buf);
+			fputs(" for table ", stderr);
+			ut_print_name(stderr, NULL, TRUE, table->name);
+			fputs("\nInnoDB: but the first index"
+			      " is not clustered!\n", stderr);
+
+			error = DB_CORRUPTION;
+			goto func_exit;
+		} else if (is_sys_table
+			   && ((type & DICT_CLUSTERED)
+			       || ((table == dict_sys->sys_tables)
+				   && (name_len == (sizeof "ID_IND") - 1)
+				   && (0 == ut_memcmp(name_buf,
+						      "ID_IND", name_len))))) {
+
+			/* The index was created in memory already at booting
+			of the database server */
+		} else {
+			index = dict_mem_index_create(table->name, name_buf,
+						      space, type, n_fields);
+			index->id = id;
+
+			dict_load_fields(index, heap);
+			error = dict_index_add_to_cache(table, index, page_no,
+							FALSE);
+			/* The data dictionary tables should never contain
+			invalid index definitions.  If we ignored this error
+			and simply did not load this index definition, the
+			.frm file would disagree with the index definitions
+			inside InnoDB. */
+			if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+				goto func_exit;
+			}
+		}
+
+next_rec:
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+func_exit:
+	/* Common exit for both the normal end of the scan and all error
+	paths: close the cursor and commit the mini-transaction. */
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(error);
+}
+
+/********************************************************************//**
+Loads a table definition and also all its index definitions, and also
+the cluster definition if the table is a member in a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table. Adds all these to the data
+dictionary cache.
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return. NULL is also
+returned if the table flags in SYS_TABLES are unknown, if loading the
+indexes fails while srv_force_recovery is not set, or if loading the
+foreign key constraints fails */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+	const char*	name)	/*!< in: table name in the
+				databasename/tablename format */
+{
+	ibool		ibd_file_missing	= FALSE;
+	dict_table_t*	table;
+	dict_table_t*	sys_tables;
+	btr_pcur_t	pcur;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	mem_heap_t*	heap;
+	dfield_t*	dfield;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	ulint		space;
+	ulint		n_cols;
+	ulint		flags;
+	ulint		err;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	heap = mem_heap_create(32000);
+
+	mtr_start(&mtr);
+
+	sys_tables = dict_table_get_low("SYS_TABLES");
+	sys_index = UT_LIST_GET_FIRST(sys_tables->indexes);
+	ut_a(!dict_table_is_comp(sys_tables));
+
+	/* Search SYS_TABLES with a one-field tuple holding the table name */
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	dfield_set_data(dfield, name, ut_strlen(name));
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)
+	    || rec_get_deleted_flag(rec, 0)) {
+		/* Not found */
+err_exit:
+		/* Shared failure exit for every check made before the
+		table object is created */
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap);
+
+		return(NULL);
+	}
+
+	field = rec_get_nth_field_old(rec, 0, &len);
+
+	/* Check if the table name in record is the searched one */
+	if (len != ut_strlen(name) || ut_memcmp(name, field, len) != 0) {
+
+		goto err_exit;
+	}
+
+	ut_a(name_of_col_is(sys_tables, sys_index, 9, "SPACE"));
+
+	field = rec_get_nth_field_old(rec, 9, &len);
+	space = mach_read_from_4(field);
+
+	/* Check if the tablespace exists and has the right name */
+	if (!trx_sys_sys_space(space)) {
+		flags = dict_sys_tables_get_flags(rec);
+
+		if (UNIV_UNLIKELY(flags == ULINT_UNDEFINED)) {
+			/* Re-read the raw value of field 5 only to
+			report it in the error message */
+			field = rec_get_nth_field_old(rec, 5, &len);
+			flags = mach_read_from_4(field);
+
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: table ", stderr);
+			ut_print_filename(stderr, name);
+			fprintf(stderr, "\n"
+				"InnoDB: in InnoDB data dictionary"
+				" has unknown type %lx.\n",
+				(ulong) flags);
+			goto err_exit;
+		}
+	} else {
+		flags = 0;
+	}
+
+	ut_a(name_of_col_is(sys_tables, sys_index, 4, "N_COLS"));
+
+	field = rec_get_nth_field_old(rec, 4, &len);
+	n_cols = mach_read_from_4(field);
+
+	/* The high-order bit of N_COLS is the "compact format" flag.
+	For tables in that format, MIX_LEN may hold additional flags. */
+	if (n_cols & 0x80000000UL) {
+		ulint	flags2;
+
+		flags |= DICT_TF_COMPACT;
+
+		ut_a(name_of_col_is(sys_tables, sys_index, 7, "MIX_LEN"));
+		field = rec_get_nth_field_old(rec, 7, &len);
+
+		flags2 = mach_read_from_4(field);
+
+		if (flags2 & (~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT))) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Warning: table ", stderr);
+			ut_print_filename(stderr, name);
+			fprintf(stderr, "\n"
+				"InnoDB: in InnoDB data dictionary"
+				" has unknown flags %lx.\n",
+				(ulong) flags2);
+
+			/* Only warn: mask off the unknown bits and
+			continue loading the table */
+			flags2 &= ~(~0 << (DICT_TF2_BITS - DICT_TF2_SHIFT));
+		}
+
+		flags |= flags2 << DICT_TF2_SHIFT;
+	}
+
+	/* See if the tablespace is available. */
+	if (trx_sys_sys_space(space)) {
+		/* The system tablespace is always available. */
+	} else if (!fil_space_for_table_exists_in_mem(
+			   space, name,
+			   (flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY,
+			   FALSE, FALSE)) {
+
+		if ((flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY) {
+			/* Do not bother to retry opening temporary tables. */
+			ibd_file_missing = TRUE;
+		} else {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: error: space object of table");
+			ut_print_filename(stderr, name);
+			fprintf(stderr, ",\n"
+				"InnoDB: space id %lu did not exist in memory."
+				" Retrying an open.\n",
+				(ulong) space);
+			/* Try to open the tablespace */
+			if (!fil_open_single_table_tablespace(
+				    TRUE, space,
+				    flags == DICT_TF_COMPACT ? 0 :
+				    flags & ~(~0 << DICT_TF_BITS), name)) {
+				/* We failed to find a sensible
+				tablespace file */
+
+				ibd_file_missing = TRUE;
+			}
+		}
+	}
+
+	/* The compact format bit is stripped from the column count */
+	table = dict_mem_table_create(name, space, n_cols & ~0x80000000UL,
+				      flags);
+
+	table->ibd_file_missing = (unsigned int) ibd_file_missing;
+
+	ut_a(name_of_col_is(sys_tables, sys_index, 3, "ID"));
+
+	field = rec_get_nth_field_old(rec, 3, &len);
+	table->id = mach_read_from_8(field);
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	dict_load_columns(table, heap);
+
+	dict_table_add_to_cache(table, heap);
+
+	mem_heap_empty(heap);
+
+	err = dict_load_indexes(table, heap);
+
+	/* Initialize table foreign_child value. Its value could be
+	changed when dict_load_foreigns() is called below */
+	table->fk_max_recusive_level = 0;
+
+	/* If the force recovery flag is set, we open the table irrespective
+	of the error condition, since the user may want to dump data from the
+	clustered index. However we load the foreign key information only if
+	all indexes were loaded. */
+	if (err == DB_SUCCESS) {
+		err = dict_load_foreigns(table->name, TRUE, TRUE);
+
+		if (err != DB_SUCCESS) {
+			dict_table_remove_from_cache(table);
+			table = NULL;
+		}
+	} else if (!srv_force_recovery) {
+		dict_table_remove_from_cache(table);
+		table = NULL;
+	}
+
+	/* Both failure branches above set table to NULL after removing it
+	from the cache, so the reset of fk_max_recusive_level must be
+	skipped in that case to avoid dereferencing a NULL pointer. */
+	if (table != NULL) {
+		table->fk_max_recusive_level = 0;
+	}
+#if 0
+	if (err != DB_SUCCESS && table != NULL) {
+
+		mutex_enter(&dict_foreign_err_mutex);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: Error: could not make a foreign key"
+			" definition to match\n"
+			"InnoDB: the foreign key table"
+			" or the referenced table!\n"
+			"InnoDB: The data dictionary of InnoDB is corrupt."
+			" You may need to drop\n"
+			"InnoDB: and recreate the foreign key table"
+			" or the referenced table.\n"
+			"InnoDB: Submit a detailed bug report"
+			" to http://bugs.mysql.com\n"
+			"InnoDB: Latest foreign key error printout:\n%s\n",
+			dict_foreign_err_buf);
+
+		mutex_exit(&dict_foreign_err_mutex);
+	}
+#endif /* 0 */
+	mem_heap_free(heap);
+
+	return(table);
+}
+
+/***********************************************************************//**
+Looks up a table id in the secondary index of SYS_TABLES and, when the
+first record that is not delete marked carries that id, loads the table
+named in the record with dict_load_table().
+@return	table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+	dulint	table_id)	/*!< in: table id */
+{
+	dict_table_t*	table		= NULL;
+	dict_table_t*	sys_tables;
+	dict_index_t*	id_index;
+	mem_heap_t*	heap;
+	dtuple_t*	search_tuple;
+	dfield_t*	key_field;
+	btr_pcur_t	cursor;
+	const rec_t*	rec;
+	const byte*	data;
+	ulint		data_len;
+	byte		key_bytes[8];
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* NOTE: this function runs under the dictionary mutex, so it
+	cannot deadlock with other dictionary operations. */
+
+	mtr_start(&mtr);
+
+	/* The index following the first index of SYS_TABLES is the
+	secondary index on the table ID. */
+	sys_tables = dict_sys->sys_tables;
+	id_index = dict_table_get_next_index(
+		dict_table_get_first_index(sys_tables));
+	ut_a(!dict_table_is_comp(sys_tables));
+	heap = mem_heap_create(256);
+
+	search_tuple = dtuple_create(heap, 1);
+	key_field = dtuple_get_nth_field(search_tuple, 0);
+
+	/* The search key is the table id in its 8-byte stored format. */
+	mach_write_to_8(key_bytes, table_id);
+
+	dfield_set_data(key_field, key_bytes, 8);
+	dict_index_copy_types(search_tuple, id_index, 1);
+
+	btr_pcur_open_on_user_rec(id_index, search_tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &cursor, &mtr);
+	rec = btr_pcur_get_rec(&cursor);
+
+	if (btr_pcur_is_on_user_rec(&cursor)) {
+		ibool	on_live_rec = TRUE;
+
+		/* Advance past delete marked records; give up if the
+		cursor runs out of user records. */
+		while (rec_get_deleted_flag(rec, 0)) {
+			if (!btr_pcur_move_to_next_user_rec(&cursor, &mtr)) {
+				on_live_rec = FALSE;
+				break;
+			}
+			rec = btr_pcur_get_rec(&cursor);
+		}
+
+		if (on_live_rec) {
+			/* The cursor is on a secondary index record
+			containing the table ID and NAME. */
+			rec = btr_pcur_get_rec(&cursor);
+			data = rec_get_nth_field_old(rec, 0, &data_len);
+			ut_ad(data_len == 8);
+
+			/* Load the table only if the id stored in the
+			record is the one searched for. */
+			if (ut_dulint_cmp(table_id,
+					  mach_read_from_8(data)) == 0) {
+				/* Field 1 is the table name. */
+				data = rec_get_nth_field_old(rec, 1,
+							     &data_len);
+				table = dict_load_table(
+					mem_heap_strdupl(heap, (char*) data,
+							 data_len));
+			}
+		}
+	}
+
+	btr_pcur_close(&cursor);
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	return(table);
+}
+
+/********************************************************************//**
+Called when the database is booted. Loads the index definitions of a
+system table by running dict_load_indexes() with a memory heap that is
+created and freed inside this function. The clustered index is not
+loaded here: it is added to the dictionary cache at booting, before this
+function is called. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+	dict_table_t*	table)	/*!< in: system table */
+{
+	mem_heap_t*	tmp_heap;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	tmp_heap = mem_heap_create(1000);
+
+	/* The error code of dict_load_indexes() is not inspected. */
+	(void) dict_load_indexes(table, tmp_heap);
+
+	mem_heap_free(tmp_heap);
+}
+
+/********************************************************************//**
+Fills in the foreign_col_names and referenced_col_names arrays of a
+foreign constraint object from the records of SYS_FOREIGN_COLS. */
+static
+void
+dict_load_foreign_cols(
+/*===================*/
+	const char*	id,	/*!< in: foreign constraint id as a
+				null-terminated string */
+	dict_foreign_t*	foreign)/*!< in: foreign constraint object */
+{
+	dict_table_t*	sys_foreign_cols;
+	dict_index_t*	first_index;
+	btr_pcur_t	cursor;
+	dtuple_t*	search_tuple;
+	dfield_t*	key_field;
+	ulint		id_len;
+	ulint		col_no;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Each name array gets one pointer-sized slot per constraint
+	column, allocated from the heap of the constraint object. */
+	foreign->foreign_col_names = mem_heap_alloc(
+		foreign->heap, foreign->n_fields * sizeof(void*));
+
+	foreign->referenced_col_names = mem_heap_alloc(
+		foreign->heap, foreign->n_fields * sizeof(void*));
+
+	mtr_start(&mtr);
+
+	sys_foreign_cols = dict_table_get_low("SYS_FOREIGN_COLS");
+	first_index = UT_LIST_GET_FIRST(sys_foreign_cols->indexes);
+	ut_a(!dict_table_is_comp(sys_foreign_cols));
+
+	/* The search key is the constraint id without its terminating
+	null character. */
+	search_tuple = dtuple_create(foreign->heap, 1);
+	key_field = dtuple_get_nth_field(search_tuple, 0);
+
+	id_len = ut_strlen(id);
+
+	dfield_set_data(key_field, id, id_len);
+	dict_index_copy_types(search_tuple, first_index, 1);
+
+	btr_pcur_open_on_user_rec(first_index, search_tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &cursor, &mtr);
+
+	for (col_no = 0; col_no < foreign->n_fields; col_no++) {
+		const rec_t*	rec;
+		const byte*	data;
+		ulint		data_len;
+
+		rec = btr_pcur_get_rec(&cursor);
+
+		ut_a(btr_pcur_is_on_user_rec(&cursor));
+		ut_a(!rec_get_deleted_flag(rec, 0));
+
+		/* Field 0 must be exactly the constraint id. */
+		data = rec_get_nth_field_old(rec, 0, &data_len);
+		ut_a(data_len == id_len);
+		ut_a(ut_memcmp(id, data, data_len) == 0);
+
+		/* Field 1 must be the 4-byte position of this column. */
+		data = rec_get_nth_field_old(rec, 1, &data_len);
+		ut_a(data_len == 4);
+		ut_a(col_no == mach_read_from_4(data));
+
+		/* Field 4 is copied as the foreign column name and
+		field 5 as the referenced column name. */
+		data = rec_get_nth_field_old(rec, 4, &data_len);
+		foreign->foreign_col_names[col_no] = mem_heap_strdupl(
+			foreign->heap, (char*) data, data_len);
+
+		data = rec_get_nth_field_old(rec, 5, &data_len);
+		foreign->referenced_col_names[col_no] = mem_heap_strdupl(
+			foreign->heap, (char*) data, data_len);
+
+		btr_pcur_move_to_next_user_rec(&cursor, &mtr);
+	}
+
+	btr_pcur_close(&cursor);
+	mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Loads a foreign key constraint to the dictionary cache. Looks up the
+constraint id in SYS_FOREIGN, builds a dict_foreign_t object from the
+record, loads its column names with dict_load_foreign_cols() and hands
+the object to dict_foreign_add_to_cache().
+@return	DB_SUCCESS or error code; DB_ERROR if no matching record that
+is not delete marked is found in SYS_FOREIGN */
+static
+ulint
+dict_load_foreign(
+/*==============*/
+	const char*	id,	/*!< in: foreign constraint id as a
+				null-terminated string */
+	ibool		check_charsets,
+				/*!< in: TRUE=check charset compatibility */
+	ibool		check_recursive)
+				/*!< in: Whether to record the foreign table
+				parent count to avoid unlimited recursive
+				load of chained foreign tables */
+{
+	dict_foreign_t*	foreign;
+	dict_table_t*	sys_foreign;
+	btr_pcur_t	pcur;
+	dict_index_t*	sys_index;
+	dtuple_t*	tuple;
+	mem_heap_t*	heap2;
+	dfield_t*	dfield;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	ulint		n_fields_and_type;
+	mtr_t		mtr;
+	dict_table_t*	for_table;
+	dict_table_t*	ref_table;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* heap2 only holds the search tuple; it is freed on every path
+	before the constraint object is created */
+	heap2 = mem_heap_create(1000);
+
+	mtr_start(&mtr);
+
+	sys_foreign = dict_table_get_low("SYS_FOREIGN");
+	sys_index = UT_LIST_GET_FIRST(sys_foreign->indexes);
+	ut_a(!dict_table_is_comp(sys_foreign));
+
+	tuple = dtuple_create(heap2, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	dfield_set_data(dfield, id, ut_strlen(id));
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)
+	    || rec_get_deleted_flag(rec, 0)) {
+		/* Not found */
+
+		fprintf(stderr,
+			"InnoDB: Error A: cannot load foreign constraint %s\n",
+			id);
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap2);
+
+		return(DB_ERROR);
+	}
+
+	field = rec_get_nth_field_old(rec, 0, &len);
+
+	/* Check if the id in record is the searched one */
+	if (len != ut_strlen(id) || ut_memcmp(id, field, len) != 0) {
+
+		fprintf(stderr,
+			"InnoDB: Error B: cannot load foreign constraint %s\n",
+			id);
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+		mem_heap_free(heap2);
+
+		return(DB_ERROR);
+	}
+
+	/* Read the table names and the number of columns associated
+	with the constraint */
+
+	mem_heap_free(heap2);
+
+	foreign = dict_mem_foreign_create();
+
+	n_fields_and_type = mach_read_from_4(
+		rec_get_nth_field_old(rec, 5, &len));
+
+	ut_a(len == 4);
+
+	/* We store the type in the bits 24..29 of n_fields_and_type.
+	The number of fields is taken from the low 10 bits (mask 0x3FF). */
+
+	foreign->type = (unsigned int) (n_fields_and_type >> 24);
+	foreign->n_fields = (unsigned int) (n_fields_and_type & 0x3FFUL);
+
+	foreign->id = mem_heap_strdup(foreign->heap, id);
+
+	/* Field 3 is copied as the foreign table name and field 4 as
+	the referenced table name */
+	field = rec_get_nth_field_old(rec, 3, &len);
+	foreign->foreign_table_name = mem_heap_strdupl(
+		foreign->heap, (char*) field, len);
+
+	field = rec_get_nth_field_old(rec, 4, &len);
+	foreign->referenced_table_name = mem_heap_strdupl(
+		foreign->heap, (char*) field, len);
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	dict_load_foreign_cols(id, foreign);
+
+	ref_table = dict_table_check_if_in_cache_low(
+			foreign->referenced_table_name);
+
+	/* We could possibly wind up in deep recursive calls if
+	we call dict_table_get_low() again here if there
+	is a chain of tables concatenated together with
+	foreign constraints. In such a case, each table is
+	both a parent and child of the other tables, and
+	acts as a "link" in such table chains.
+	To avoid such a scenario, we need to check the
+	number of ancestors the current table has. If that
+	exceeds DICT_FK_MAX_RECURSIVE_LOAD, we will stop loading
+	the child table.
+	Foreign constraints are loaded in a breadth-first fashion,
+	that is, the index on FOR_NAME is scanned first, and then
+	the index on REF_NAME. So foreign constraints in which the
+	current table is a child (foreign table) are loaded first,
+	and then those constraints where the current table is a
+	parent (referenced) table.
+	Thus we can check the parent (ref_table) table's
+	reference count (fk_max_recusive_level) to know how deep the
+	recursive call is. If the parent table (ref_table) is already
+	loaded, and its fk_max_recusive_level is not smaller than
+	DICT_FK_MAX_RECURSIVE_LOAD, we will stop the recursive loading
+	by skipping loading the child table. It will not affect the foreign
+	constraint check for DMLs since the child table will be loaded
+	at that time for the constraint check. */
+	if (!ref_table
+	    || ref_table->fk_max_recusive_level < DICT_FK_MAX_RECURSIVE_LOAD) {
+
+		/* If the foreign table is not yet in the dictionary cache, we
+		have to load it so that we are able to make type comparisons
+		in the next function call. */
+
+		for_table = dict_table_get_low(foreign->foreign_table_name);
+
+		if (for_table && ref_table && check_recursive) {
+			/* This is to record the longest chain of ancestors
+			this table has: if the parent has at least as many
+			ancestors as this table, record the parent's count
+			plus 1 (for this parent) */
+			if (ref_table->fk_max_recusive_level
+			    >= for_table->fk_max_recusive_level) {
+				for_table->fk_max_recusive_level =
+					 ref_table->fk_max_recusive_level + 1;
+			}
+		}
+	}
+
+	/* Note that there may already be a foreign constraint object in
+	the dictionary cache for this constraint: then the following
+	call only sets the pointers in it to point to the appropriate table
+	and index objects and frees the newly created object foreign.
+	Adding to the cache should always succeed since we are not creating
+	a new foreign key constraint but loading one from the data
+	dictionary. */
+
+	return(dict_foreign_add_to_cache(foreign, check_charsets));
+}
+
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+The secondary indexes of SYS_FOREIGN are scanned one after another,
+starting with the index that follows the first index of the table.
+@return	DB_SUCCESS or error code; DB_ERROR if the SYS_FOREIGN table
+is not found, otherwise the first error returned by dict_load_foreign() */
+UNIV_INTERN
+ulint
+dict_load_foreigns(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		check_recursive,/*!< in: Whether to check recursive
+					load of tables chained by FK */
+	ibool		check_charsets)	/*!< in: TRUE=check charset
+					compatibility */
+{
+	btr_pcur_t	pcur;
+	mem_heap_t*	heap;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	dict_index_t*	sec_index;
+	dict_table_t*	sys_foreign;
+	const rec_t*	rec;
+	const byte*	field;
+	ulint		len;
+	char*		id ;
+	ulint		err;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	sys_foreign = dict_table_get_low("SYS_FOREIGN");
+
+	if (sys_foreign == NULL) {
+		/* No foreign keys defined yet in this database */
+
+		fprintf(stderr,
+			"InnoDB: Error: no foreign key system tables"
+			" in the database\n");
+
+		return(DB_ERROR);
+	}
+
+	ut_a(!dict_table_is_comp(sys_foreign));
+	mtr_start(&mtr);
+
+	/* Get the secondary index based on FOR_NAME from table
+	SYS_FOREIGN */
+
+	sec_index = dict_table_get_next_index(
+		dict_table_get_first_index(sys_foreign));
+start_load:
+	/* Entered once per scanned secondary index: a fresh heap and
+	search tuple are created for each scan, and the mini-transaction
+	has already been started at this point */
+	heap = mem_heap_create(256);
+
+	tuple  = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	dfield_set_data(dfield, table_name, ut_strlen(table_name));
+	dict_index_copy_types(tuple, sec_index, 1);
+
+	btr_pcur_open_on_user_rec(sec_index, tuple, PAGE_CUR_GE,
+				  BTR_SEARCH_LEAF, &pcur, &mtr);
+loop:
+	/* Entered once per record visited in the current index */
+	rec = btr_pcur_get_rec(&pcur);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		/* End of index */
+
+		goto load_next_index;
+	}
+
+	/* Now we have the record in the secondary index containing a table
+	name and a foreign constraint ID */
+
+	rec = btr_pcur_get_rec(&pcur);
+	field = rec_get_nth_field_old(rec, 0, &len);
+
+	/* Check if the table name in the record is the one searched for; the
+	following call does the comparison in the latin1_swedish_ci
+	charset-collation, in a case-insensitive way. */
+
+	if (0 != cmp_data_data(dfield_get_type(dfield)->mtype,
+			       dfield_get_type(dfield)->prtype,
+			       dfield_get_data(dfield), dfield_get_len(dfield),
+			       field, len)) {
+
+		goto load_next_index;
+	}
+
+	/* Since table names in SYS_FOREIGN are stored in a case-insensitive
+	order, we have to check that the table name matches also in a binary
+	string comparison. On Unix, MySQL allows table names that only differ
+	in character case. */
+
+	if (0 != ut_memcmp(field, table_name, len)) {
+
+		goto next_rec;
+	}
+
+	if (rec_get_deleted_flag(rec, 0)) {
+
+		goto next_rec;
+	}
+
+	/* Now we get a foreign key constraint id */
+	field = rec_get_nth_field_old(rec, 1, &len);
+	id = mem_heap_strdupl(heap, (char*) field, len);
+
+	/* Save the cursor position and commit the mini-transaction
+	before loading the constraint; the position is restored in a new
+	mini-transaction afterwards */
+	btr_pcur_store_position(&pcur, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Load the foreign constraint definition to the dictionary cache */
+
+	err = dict_load_foreign(id, check_charsets, check_recursive);
+
+	if (err != DB_SUCCESS) {
+		/* The mini-transaction was committed above, so only the
+		cursor and the heap remain to be released */
+		btr_pcur_close(&pcur);
+		mem_heap_free(heap);
+
+		return(err);
+	}
+
+	mtr_start(&mtr);
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr);
+next_rec:
+	btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+	goto loop;
+
+load_next_index:
+	/* The scan of the current index is finished: release its
+	resources and continue with the next index, if there is one */
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	sec_index = dict_table_get_next_index(sec_index);
+
+	if (sec_index != NULL) {
+
+		mtr_start(&mtr);
+
+		/* Switch to scanning the index on REF_NAME;
+		fk_max_recusive_level has already been updated when scanning
+		the FOR_NAME index, so there is no need to update it again */
+		check_recursive = FALSE;
+
+		goto start_load;
+	}
+
+	return(DB_SUCCESS);
+}
diff --git a/storage/xtradb/dict/dict0mem.c b/storage/xtradb/dict/dict0mem.c
new file mode 100644
index 00000000000..f2d219bfd4f
--- /dev/null
+++ b/storage/xtradb/dict/dict0mem.c
@@ -0,0 +1,323 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file dict/dict0mem.c
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "dict0mem.h"
+
+#ifdef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#include "rem0rec.h"
+#include "data0type.h"
+#include "mach0data.h"
+#include "dict0dict.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0lock.h"
+#endif /* !UNIV_HOTBACKUP */
+
+#define	DICT_HEAP_SIZE		100	/*!< initial memory heap size when
+					creating a table or index object */
+
+/**********************************************************************//**
+Creates a table memory object on a heap of its own; the name is copied.
+@return	own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+	const char*	name,	/*!< in: table name; copied with ut_malloc */
+	ulint		space,	/*!< in: space where the clustered index of
+				the table is placed; this parameter is
+				ignored if the table is made a member of
+				a cluster */
+	ulint		n_cols,	/*!< in: number of columns, not counting
+				the DATA_N_SYS_COLS added below */
+	ulint		flags)	/*!< in: table flags */
+{
+	dict_table_t*	table;
+	mem_heap_t*	heap;
+	ut_ad(name);
+	/* Unsigned mask: left-shifting the signed ~0 (-1) is undefined in C */
+	ut_a(!(flags & (~(ulint) 0 << DICT_TF2_BITS)));
+
+	heap = mem_heap_create(DICT_HEAP_SIZE);
+
+	table = mem_heap_zalloc(heap, sizeof(dict_table_t));
+
+	table->heap = heap;
+
+	table->flags = (unsigned int) flags;
+	table->name = ut_malloc(strlen(name) + 1);
+	memcpy(table->name, name, strlen(name) + 1);
+	table->space = (unsigned int) space;
+	table->n_cols = (unsigned int) (n_cols + DATA_N_SYS_COLS);
+
+	table->cols = mem_heap_alloc(heap, (n_cols + DATA_N_SYS_COLS)
+				     * sizeof(dict_col_t));
+
+#ifndef UNIV_HOTBACKUP
+	table->autoinc_lock = mem_heap_alloc(heap, lock_get_size());
+
+	mutex_create(&table->autoinc_mutex, SYNC_DICT_AUTOINC_MUTEX);
+
+	table->autoinc = 0;
+
+	/* The number of transactions that are either waiting on the
+	AUTOINC lock or have been granted the lock. */
+	table->n_waiting_or_granted_auto_inc_locks = 0;
+
+	table->is_corrupt = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_d(table->magic_n = DICT_TABLE_MAGIC_N);
+	return(table);
+}
+
+/****************************************************************//**
+Free a table memory object: the autoinc mutex, the name and the heap. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+	dict_table_t*	table)		/*!< in: table to free */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_d(table->cached = FALSE);
+
+#ifndef UNIV_HOTBACKUP
+	mutex_free(&(table->autoinc_mutex));
+#endif /* !UNIV_HOTBACKUP */
+	ut_free(table->name);	/* the name was allocated with ut_malloc */
+	mem_heap_free(table->heap); /* *table itself lives in this heap */
+}
+
+/****************************************************************//**
+Append 'name' to 'col_names'.  @see dict_table_t::col_names
+@return	new column names array, allocated from 'heap' */
+static
+const char*
+dict_add_col_name(
+/*==============*/
+	const char*	col_names,	/*!< in: existing column names, or
+					NULL */
+	ulint		cols,		/*!< in: number of existing columns */
+	const char*	name,		/*!< in: new column name */
+	mem_heap_t*	heap)		/*!< in: heap for the new array */
+{
+	ulint	old_len;
+	ulint	new_len;
+	ulint	total_len;
+	char*	res;
+
+	ut_ad(!cols == !col_names);
+
+	/* Find the byte length of the old array: skip 'cols' strings. */
+	if (col_names) {
+		const char*	s = col_names;
+		ulint		i;
+
+		for (i = 0; i < cols; i++) {
+			s += strlen(s) + 1;	/* name plus its NUL */
+		}
+
+		old_len = s - col_names;
+	} else {
+		old_len = 0;
+	}
+
+	new_len = strlen(name) + 1;	/* include the terminating NUL */
+	total_len = old_len + new_len;
+
+	res = mem_heap_alloc(heap, total_len);
+
+	if (old_len > 0) {
+		memcpy(res, col_names, old_len);
+	}
+
+	memcpy(res + old_len, name, new_len);	/* new name goes last */
+
+	return(res);
+}
+
+/**********************************************************************//**
+Adds a column definition to a table at position table->n_def. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
+	const char*	name,	/*!< in: column name, or NULL */
+	ulint		mtype,	/*!< in: main datatype */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: precision */
+{
+	dict_col_t*	col;
+#ifndef UNIV_HOTBACKUP
+	ulint		mbminlen;
+	ulint		mbmaxlen;
+#endif /* !UNIV_HOTBACKUP */
+	ulint		i;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(!heap == !name);	/* heap and name: both given or both NULL */
+
+	i = table->n_def++;	/* i is the index of the new column */
+
+	if (name) {
+		if (UNIV_UNLIKELY(table->n_def == table->n_cols)) {
+			heap = table->heap; /* last column: use table heap */
+		}
+		if (UNIV_LIKELY(i) && UNIV_UNLIKELY(!table->col_names)) {
+			/* All preceding column names are empty. */
+			char* s = mem_heap_zalloc(heap, table->n_def);
+			table->col_names = s;
+		}
+
+		table->col_names = dict_add_col_name(table->col_names,
+						     i, name, heap);
+	}
+
+	col = dict_table_get_nth_col(table, i);
+
+	col->ind = (unsigned int) i;
+	col->ord_part = 0;
+
+	col->mtype = (unsigned int) mtype;
+	col->prtype = (unsigned int) prtype;
+	col->len = (unsigned int) len;
+
+#ifndef UNIV_HOTBACKUP
+	dtype_get_mblen(mtype, prtype, &mbminlen, &mbmaxlen);
+
+	col->mbminlen = (unsigned int) mbminlen;
+	col->mbmaxlen = (unsigned int) mbmaxlen;
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Creates an index memory object on a heap of its own.
+@return	own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+	const char*	table_name,	/*!< in: table name; not copied */
+	const char*	index_name,	/*!< in: index name; copied to heap */
+	ulint		space,		/*!< in: space where the index tree is
+					placed, ignored if the index is of
+					the clustered type */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	dict_index_t*	index;
+	mem_heap_t*	heap;
+
+	ut_ad(table_name && index_name);
+
+	heap = mem_heap_create(DICT_HEAP_SIZE);
+	index = mem_heap_zalloc(heap, sizeof(dict_index_t));
+
+	index->heap = heap;
+
+	index->type = type;
+#ifndef UNIV_HOTBACKUP
+	index->space = (unsigned int) space;
+#endif /* !UNIV_HOTBACKUP */
+	index->name = mem_heap_strdup(heap, index_name);
+	index->table_name = table_name;	/* the caller's pointer is kept */
+	index->n_fields = (unsigned int) n_fields;
+	index->fields = mem_heap_alloc(heap, 1 + n_fields
+				       * sizeof(dict_field_t));
+	/* The '1 +' above adds one byte, so that the request is never
+	an empty mem block, even when n_fields is 0 */
+#ifdef UNIV_DEBUG
+	index->magic_n = DICT_INDEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+	return(index);
+}
+
+/**********************************************************************//**
+Creates a foreign constraint memory object with mem_heap_zalloc on a new heap.
+@return	own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void)
+/*=========================*/
+{
+	dict_foreign_t*	foreign;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(100);	/* same value as DICT_HEAP_SIZE */
+
+	foreign = mem_heap_zalloc(heap, sizeof(dict_foreign_t));
+
+	foreign->heap = heap;	/* the struct lives in this heap */
+
+	return(foreign);
+}
+
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+	dict_index_t*	index,		/*!< in: index */
+	const char*	name,		/*!< in: column name; the pointer
+					itself is stored in the field, see
+					the NOTE above */
+{
+	dict_field_t*	field;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	index->n_def++;
+
+	field = dict_index_get_nth_field(index, index->n_def - 1);
+
+	field->name = name;
+	field->prefix_len = (unsigned int) prefix_len;
+}
+
+/**********************************************************************//**
+Frees an index memory object by freeing the heap it was created on. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+	dict_index_t*	index)	/*!< in: index to free */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	mem_heap_free(index->heap);	/* *index itself lives in this heap */
+}
diff --git a/storage/xtradb/dyn/dyn0dyn.c b/storage/xtradb/dyn/dyn0dyn.c
new file mode 100644
index 00000000000..e1275f040f3
--- /dev/null
+++ b/storage/xtradb/dyn/dyn0dyn.c
@@ -0,0 +1,65 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file dyn/dyn0dyn.c
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dyn0dyn.h"
+#ifdef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+/************************************************************//**
+Adds a new block to a dyn array; the previous last block is flagged full.
+@return	created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+	dyn_array_t*	arr)	/*!< in: dyn array */
+{
+	mem_heap_t*	heap;
+	dyn_block_t*	block;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+	if (arr->heap == NULL) {	/* no heap yet: set up list and heap */
+		UT_LIST_INIT(arr->base);
+		UT_LIST_ADD_FIRST(list, arr->base, arr); /* arr goes first */
+
+		arr->heap = mem_heap_create(sizeof(dyn_block_t));
+	}
+
+	block = dyn_array_get_last_block(arr);
+	block->used = block->used | DYN_BLOCK_FULL_FLAG;
+
+	heap = arr->heap;
+
+	block = mem_heap_alloc(heap, sizeof(dyn_block_t));
+
+	block->used = 0;	/* the new block starts out empty */
+
+	UT_LIST_ADD_LAST(list, arr->base, block);
+
+	return(block);
+}
diff --git a/storage/xtradb/eval/eval0eval.c b/storage/xtradb/eval/eval0eval.c
new file mode 100644
index 00000000000..589b0fa1576
--- /dev/null
+++ b/storage/xtradb/eval/eval0eval.c
@@ -0,0 +1,852 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0eval.c
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "eval0eval.h"
+
+#ifdef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#include "data0data.h"
+#include "row0sel.h"
+
+/** The RND function seed */
+static ulint	eval_rnd	= 128367121;
+
+/** Dummy address used when we should allocate a buffer of size 0 in
+eval_node_alloc_val_buf */
+
+static byte	eval_dummy;
+
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has an allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return	pointer to allocated buffer, or to eval_dummy if size is 0 */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+
+	if (data && data != &eval_dummy) {	/* eval_dummy is static */
+		mem_free(data);
+	}
+
+	if (size == 0) {
+		data = &eval_dummy;	/* nothing is allocated for size 0 */
+	} else {
+		data = mem_alloc(size);
+	}
+
+	que_node_set_val_buf_size(node, size);
+
+	dfield_set_data(dfield, data, size);
+
+	return(data);
+}
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated in the above function. The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SYMBOL
+	      || que_node_get_type(node) == QUE_NODE_FUNC);
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+
+	if (que_node_get_val_buf_size(node) > 0) { /* 0: nothing to free */
+		ut_a(data);
+
+		mem_free(data);
+	}
+}
+
+/*****************************************************************//**
+Evaluates a comparison node and stores the result with eval_node_set_ibool_val.
+@return	the result of the comparison: TRUE or FALSE */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+	func_node_t*	cmp_node)	/*!< in: comparison node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	int		res;
+	ibool		val;
+	int		func;
+
+	ut_ad(que_node_get_type(cmp_node) == QUE_NODE_FUNC);
+
+	arg1 = cmp_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	res = cmp_dfield_dfield(que_node_get_val(arg1),
+				que_node_get_val(arg2));
+	val = TRUE;	/* cleared below when the operator does not hold */
+
+	func = cmp_node->func;
+
+	if (func == '=') {
+		if (res != 0) {
+			val = FALSE;
+		}
+	} else if (func == '<') {
+		if (res != -1) { /* assumes res is -1, 0 or 1: TODO confirm */
+			val = FALSE;
+		}
+	} else if (func == PARS_LE_TOKEN) {
+		if (res == 1) {
+			val = FALSE;
+		}
+	} else if (func == PARS_NE_TOKEN) {
+		if (res == 0) {
+			val = FALSE;
+		}
+	} else if (func == PARS_GE_TOKEN) {
+		if (res == -1) {
+			val = FALSE;
+		}
+	} else {
+		ut_ad(func == '>');
+
+		if (res != 1) {
+			val = FALSE;
+		}
+	}
+
+	eval_node_set_ibool_val(cmp_node, val);
+
+	return(val);
+}
+
+/*****************************************************************//**
+Evaluates a logical operation node: AND, OR or NOT; anything else is ut_error. */
+UNIV_INLINE
+void
+eval_logical(
+/*=========*/
+	func_node_t*	logical_node)	/*!< in: logical operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	ibool		val1;
+	ibool		val2 = 0; /* remove warning */
+	ibool		val = 0;  /* remove warning */
+	int		func;
+
+	ut_ad(que_node_get_type(logical_node) == QUE_NODE_FUNC);
+
+	arg1 = logical_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is 'NOT' */
+
+	val1 = eval_node_get_ibool_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_ibool_val(arg2);
+	}
+
+	func = logical_node->func;
+
+	if (func == PARS_AND_TOKEN) {
+		val = val1 & val2;	/* bitwise, on ibool operands */
+	} else if (func == PARS_OR_TOKEN) {
+		val = val1 | val2;
+	} else if (func == PARS_NOT_TOKEN) {
+		val = TRUE - val1;	/* negation if val1 is TRUE or FALSE */
+	} else {
+		ut_error;
+	}
+
+	eval_node_set_ibool_val(logical_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an arithmetic operation node: +, binary or unary -, * or /. */
+UNIV_INLINE
+void
+eval_arith(
+/*=======*/
+	func_node_t*	arith_node)	/*!< in: arithmetic operation node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	lint		val1;
+	lint		val2 = 0; /* remove warning */
+	lint		val;
+	int		func;
+
+	ut_ad(que_node_get_type(arith_node) == QUE_NODE_FUNC);
+
+	arg1 = arith_node->args;
+	arg2 = que_node_get_next(arg1); /* arg2 is NULL if func is unary '-' */
+
+	val1 = eval_node_get_int_val(arg1);
+
+	if (arg2) {
+		val2 = eval_node_get_int_val(arg2);
+	}
+
+	func = arith_node->func;
+
+	if (func == '+') {
+		val = val1 + val2;
+	} else if ((func == '-') && arg2) {
+		val = val1 - val2;
+	} else if (func == '-') {
+		val = -val1;
+	} else if (func == '*') {
+		val = val1 * val2;
+	} else {
+		ut_ad(func == '/');
+		val = val1 / val2; /* NOTE(review): val2 == 0 is not checked */
+	}
+
+	eval_node_set_int_val(arith_node, val);
+}
+
+/*****************************************************************//**
+Evaluates an aggregate operation node: COUNT adds 1, SUM adds the arg value. */
+UNIV_INLINE
+void
+eval_aggregate(
+/*===========*/
+	func_node_t*	node)	/*!< in: aggregate operation node */
+{
+	que_node_t*	arg;
+	lint		val;
+	lint		arg_val;
+	int		func;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	val = eval_node_get_int_val(node);	/* the running total so far */
+
+	func = node->func;
+
+	if (func == PARS_COUNT_TOKEN) {
+
+		val = val + 1;
+	} else {
+		ut_ad(func == PARS_SUM_TOKEN);
+
+		arg = node->args;
+		arg_val = eval_node_get_int_val(arg);
+
+		val = val + arg_val;
+	}
+
+	eval_node_set_int_val(node, val);	/* store the new total */
+}
+
+/*****************************************************************//**
+Evaluates the predefined functions that are not relevant in benchmarks:
+PRINTF, ASSERT, RND and RND_STR; any other function hits ut_error. */
+static
+void
+eval_predefined_2(
+/*==============*/
+	func_node_t*	func_node)	/*!< in: predefined function node */
+{
+	que_node_t*	arg;
+	que_node_t*	arg1;
+	que_node_t*	arg2 = 0; /* stays NULL if there is no first arg */
+	lint		int_val;
+	byte*		data;
+	ulint		len1;
+	ulint		len2;
+	int		func;
+	ulint		i;
+
+	ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+	arg1 = func_node->args;
+
+	if (arg1) {
+		arg2 = que_node_get_next(arg1);
+	}
+
+	func = func_node->func;
+
+	if (func == PARS_PRINTF_TOKEN) {
+
+		arg = arg1;
+
+		while (arg) {	/* print every argument in list order */
+			dfield_print(que_node_get_val(arg));
+
+			arg = que_node_get_next(arg);
+		}
+
+		putc('\n', stderr);
+
+	} else if (func == PARS_ASSERT_TOKEN) {
+
+		if (!eval_node_get_ibool_val(arg1)) {
+			fputs("SQL assertion fails in a stored procedure!\n",
+			      stderr);
+		}
+
+		ut_a(eval_node_get_ibool_val(arg1));
+
+		/* This function, or more precisely, a debug procedure,
+		returns no value */
+
+	} else if (func == PARS_RND_TOKEN) {
+
+		len1 = (ulint)eval_node_get_int_val(arg1);
+		len2 = (ulint)eval_node_get_int_val(arg2);
+
+		ut_ad(len2 >= len1);
+
+		if (len2 > len1) {
+			int_val = (lint) (len1	/* result in [len1, len2] */
+					  + (eval_rnd % (len2 - len1 + 1)));
+		} else {
+			int_val = (lint) len1;
+		}
+
+		eval_rnd = ut_rnd_gen_next_ulint(eval_rnd); /* advance seed */
+
+		eval_node_set_int_val(func_node, int_val);
+
+	} else if (func == PARS_RND_STR_TOKEN) {
+
+		len1 = (ulint)eval_node_get_int_val(arg1);
+
+		data = eval_node_ensure_val_buf(func_node, len1);
+
+		for (i = 0; i < len1; i++) {
+			data[i] = (byte)(97 + (eval_rnd % 3)); /* 97..99 */
+
+			eval_rnd = ut_rnd_gen_next_ulint(eval_rnd);
+		}
+	} else {
+		ut_error;
+	}
+}
+
+/*****************************************************************//**
+Evaluates a notfound-function node: TRUE iff the select has no more rows. */
+UNIV_INLINE
+void
+eval_notfound(
+/*==========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	sym_node_t*	cursor;
+	sel_node_t*	sel_node;
+	ibool		ibool_val;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);	/* NOTE(review): not used below */
+
+	ut_ad(func_node->func == PARS_NOTFOUND_TOKEN);
+
+	cursor = arg1;
+
+	ut_ad(que_node_get_type(cursor) == QUE_NODE_SYMBOL);
+
+	if (cursor->token_type == SYM_LIT) { /* literal: last select node */
+
+		ut_ad(ut_memcmp(dfield_get_data(que_node_get_val(cursor)),
+				"SQL", 3) == 0);
+
+		sel_node = cursor->sym_table->query_graph->last_sel_node;
+	} else {
+		sel_node = cursor->alias->cursor_def; /* a declared cursor */
+	}
+
+	if (sel_node->state == SEL_NODE_NO_MORE_ROWS) {
+		ibool_val = TRUE;
+	} else {
+		ibool_val = FALSE;
+	}
+
+	eval_node_set_ibool_val(func_node, ibool_val);
+}
+
+/*****************************************************************//**
+Evaluates a substr-function node: args are string, start offset and length. */
+UNIV_INLINE
+void
+eval_substr(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	que_node_t*	arg3;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len1;
+	ulint		len2;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	ut_ad(func_node->func == PARS_SUBSTR_TOKEN);
+
+	arg3 = que_node_get_next(arg2);
+
+	str1 = dfield_get_data(que_node_get_val(arg1));
+
+	len1 = (ulint)eval_node_get_int_val(arg2);	/* start offset */
+	len2 = (ulint)eval_node_get_int_val(arg3);	/* result length */
+
+	dfield = que_node_get_val(func_node);
+
+	/* No copy and no bounds check here: the result aliases arg1's data */
+	dfield_set_data(dfield, str1 + len1, len2);
+}
+
+/*****************************************************************//**
+Evaluates a replstr-procedure node: overwrites part of arg1 with arg2 data. */
+static
+void
+eval_replstr(
+/*=========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	que_node_t*	arg3;
+	que_node_t*	arg4;
+	byte*		str1;
+	byte*		str2;
+	ulint		len1;
+	ulint		len2;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	ut_ad(que_node_get_type(arg1) == QUE_NODE_SYMBOL);
+
+	arg3 = que_node_get_next(arg2);
+	arg4 = que_node_get_next(arg3);
+
+	str1 = dfield_get_data(que_node_get_val(arg1));	/* destination */
+	str2 = dfield_get_data(que_node_get_val(arg2));	/* source */
+
+	len1 = (ulint)eval_node_get_int_val(arg3); /* offset in destination */
+	len2 = (ulint)eval_node_get_int_val(arg4); /* bytes to copy */
+
+	if ((dfield_get_len(que_node_get_val(arg1)) < len1 + len2)
+	    || (dfield_get_len(que_node_get_val(arg2)) < len2)) {
+
+		ut_error;	/* the copy would overrun one of the values */
+	}
+
+	ut_memcpy(str1 + len1, str2, len2);
+}
+
+/*****************************************************************//**
+Evaluates an instr-function node: 1-based position of arg2 in arg1, or 0. */
+static
+void
+eval_instr(
+/*=======*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield1;
+	dfield_t*	dfield2;
+	lint		int_val;
+	byte*		str1;
+	byte*		str2;
+	byte		match_char;
+	ulint		len1;
+	ulint		len2;
+	ulint		i;
+	ulint		j;
+
+	arg1 = func_node->args;
+	arg2 = que_node_get_next(arg1);
+
+	dfield1 = que_node_get_val(arg1);
+	dfield2 = que_node_get_val(arg2);
+
+	str1 = dfield_get_data(dfield1);	/* the string searched in */
+	str2 = dfield_get_data(dfield2);	/* the string searched for */
+
+	len1 = dfield_get_len(dfield1);
+	len2 = dfield_get_len(dfield2);
+
+	if (len2 == 0) {	/* an empty search string is an error */
+		ut_error;
+	}
+
+	match_char = str2[0];
+
+	for (i = 0; i < len1; i++) {
+		/* In this outer loop, the number of matched characters is 0 */
+
+		if (str1[i] == match_char) {
+
+			if (i + len2 > len1) {	/* str2 cannot fit any more */
+
+				break;
+			}
+
+			for (j = 1;; j++) {
+				/* We have already matched j characters */
+
+				if (j == len2) {
+					int_val = i + 1; /* 1-based position */
+
+					goto match_found;
+				}
+
+				if (str1[i + j] != str2[j]) {
+
+					break;
+				}
+			}
+		}
+	}
+
+	int_val = 0;	/* no match */
+
+match_found:
+	eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a binary_to_number node: pads arg1 (max 4 bytes) to 4 bytes. */
+UNIV_INLINE
+void
+eval_binary_to_number(
+/*==================*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	dfield_t*	dfield;
+	byte*		str1;
+	byte*		str2;
+	ulint		len1;
+	ulint		int_val;	/* used only as a zeroed scratch buffer */
+
+	arg1 = func_node->args;
+
+	dfield = que_node_get_val(arg1);
+
+	str1 = dfield_get_data(dfield);
+	len1 = dfield_get_len(dfield);
+
+	if (len1 > 4) {
+		ut_error;
+	}
+
+	if (len1 == 4) {
+		str2 = str1;	/* already 4 bytes: use the data as is */
+	} else {
+		int_val = 0;
+		str2 = (byte*)&int_val;
+
+		ut_memcpy(str2 + (4 - len1), str1, len1); /* zeros in front */
+	}
+
+	eval_node_copy_and_alloc_val(func_node, str2, 4);
+}
+
+/*****************************************************************//**
+Evaluates a concat function node: joins the values of all the arguments. */
+static
+void
+eval_concat(
+/*========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg;
+	dfield_t*	dfield;
+	byte*		data;
+	ulint		len;
+	ulint		len1;
+
+	arg = func_node->args;
+	len = 0;
+
+	while (arg) {	/* first pass: sum the argument lengths */
+		len1 = dfield_get_len(que_node_get_val(arg));
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+
+	data = eval_node_ensure_val_buf(func_node, len);
+
+	arg = func_node->args;
+	len = 0;	/* reused as the write offset into data */
+
+	while (arg) {	/* second pass: copy each argument in order */
+		dfield = que_node_get_val(arg);
+		len1 = dfield_get_len(dfield);
+
+		ut_memcpy(data + len, dfield_get_data(dfield), len1);
+
+		len += len1;
+
+		arg = que_node_get_next(arg);
+	}
+}
+
+/*****************************************************************//**
+Evaluates a to_binary function node. If the first argument is an integer,
+this function looks at the second argument which is the integer length in
+bytes (at most 4), and converts the integer to a VARCHAR.
+If the first argument is of some other type, this function converts it to
+BINARY. In both cases the result points into the first argument's data. */
+UNIV_INLINE
+void
+eval_to_binary(
+/*===========*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	que_node_t*	arg2;
+	dfield_t*	dfield;
+	byte*		str1;
+	ulint		len;
+	ulint		len1;
+
+	arg1 = func_node->args;
+
+	str1 = dfield_get_data(que_node_get_val(arg1));
+
+	if (dtype_get_mtype(que_node_get_data_type(arg1)) != DATA_INT) {
+
+		len = dfield_get_len(que_node_get_val(arg1));
+
+		dfield = que_node_get_val(func_node);
+
+		dfield_set_data(dfield, str1, len);	/* whole value */
+
+		return;
+	}
+
+	arg2 = que_node_get_next(arg1);
+
+	len1 = (ulint)eval_node_get_int_val(arg2);
+
+	if (len1 > 4) {
+
+		ut_error;
+	}
+
+	dfield = que_node_get_val(func_node);
+
+	dfield_set_data(dfield, str1 + (4 - len1), len1); /* last len1 of 4 */
+}
+
+/*****************************************************************//**
+Evaluates LENGTH, TO_CHAR, TO_NUMBER, SYSDATE; else eval_predefined_2(). */
+UNIV_INLINE
+void
+eval_predefined(
+/*============*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg1;
+	lint		int_val;
+	byte*		data;
+	int		func;
+
+	func = func_node->func;
+
+	arg1 = func_node->args;
+
+	if (func == PARS_LENGTH_TOKEN) {
+
+		int_val = (lint)dfield_get_len(que_node_get_val(arg1));
+
+	} else if (func == PARS_TO_CHAR_TOKEN) {
+
+		/* Convert number to character string as a
+		signed decimal integer. */
+
+		ulint	uint_val;
+		int	int_len;
+
+		int_val = eval_node_get_int_val(arg1);
+
+		/* Determine the length of the string. */
+
+		if (int_val == 0) {
+			int_len = 1; /* the number 0 occupies 1 byte */
+		} else {
+			int_len = 0;
+			if (int_val < 0) {
+				uint_val = ((ulint) -int_val - 1) + 1;
+				int_len++; /* reserve space for minus sign */
+			} else {
+				uint_val = (ulint) int_val;
+			}
+			for (; uint_val > 0; int_len++) { /* count digits */
+				uint_val /= 10;
+			}
+		}
+
+		/* allocate the string */
+		data = eval_node_ensure_val_buf(func_node, int_len + 1);
+
+		/* add terminating NUL character */
+		data[int_len] = 0;
+
+		/* convert the number */
+
+		if (int_val == 0) {
+			data[0] = '0';
+		} else {
+			int tmp;
+			if (int_val < 0) {
+				data[0] = '-'; /* preceding minus sign */
+				uint_val = ((ulint) -int_val - 1) + 1;
+			} else {
+				uint_val = (ulint) int_val;
+			}
+			for (tmp = int_len; uint_val > 0; uint_val /= 10) {
+				data[--tmp] = (byte)	/* digits, right to left */
+					('0' + (byte)(uint_val % 10));
+			}
+		}
+
+		/* the length set here does not count the terminating NUL */
+		dfield_set_len(que_node_get_val(func_node), int_len);
+
+		return;
+
+	} else if (func == PARS_TO_NUMBER_TOKEN) {
+		/* NOTE(review): atoi() expects NUL-terminated data: confirm */
+		int_val = atoi((char*)
+			       dfield_get_data(que_node_get_val(arg1)));
+
+	} else if (func == PARS_SYSDATE_TOKEN) {
+		int_val = (lint)ut_time();
+	} else {
+		eval_predefined_2(func_node);
+
+		return;
+	}
+
+	eval_node_set_int_val(func_node, int_val);
+}
+
+/*****************************************************************//**
+Evaluates a function node: first its arguments, then the function itself. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	arg;
+	ulint		class;
+	ulint		func;
+
+	ut_ad(que_node_get_type(func_node) == QUE_NODE_FUNC);
+
+	class = func_node->class;
+	func = func_node->func;
+
+	arg = func_node->args;
+
+	/* Evaluate first the argument list */
+	while (arg) {
+		eval_exp(arg);
+
+		/* The functions are not defined for SQL null argument
+		values, except for comparisons, notfound and printf */
+
+		if (dfield_is_null(que_node_get_val(arg))
+		    && (class != PARS_FUNC_CMP)
+		    && (func != PARS_NOTFOUND_TOKEN)
+		    && (func != PARS_PRINTF_TOKEN)) {
+			ut_error;
+		}
+
+		arg = que_node_get_next(arg);
+	}
+
+	if (class == PARS_FUNC_CMP) {	/* dispatch on the function class */
+		eval_cmp(func_node);
+	} else if (class == PARS_FUNC_ARITH) {
+		eval_arith(func_node);
+	} else if (class == PARS_FUNC_AGGREGATE) {
+		eval_aggregate(func_node);
+	} else if (class == PARS_FUNC_PREDEFINED) {
+
+		if (func == PARS_NOTFOUND_TOKEN) {
+			eval_notfound(func_node);
+		} else if (func == PARS_SUBSTR_TOKEN) {
+			eval_substr(func_node);
+		} else if (func == PARS_REPLSTR_TOKEN) {
+			eval_replstr(func_node);
+		} else if (func == PARS_INSTR_TOKEN) {
+			eval_instr(func_node);
+		} else if (func == PARS_BINARY_TO_NUMBER_TOKEN) {
+			eval_binary_to_number(func_node);
+		} else if (func == PARS_CONCAT_TOKEN) {
+			eval_concat(func_node);
+		} else if (func == PARS_TO_BINARY_TOKEN) {
+			eval_to_binary(func_node);
+		} else {
+			eval_predefined(func_node); /* all other functions */
+		}
+	} else {
+		ut_ad(class == PARS_FUNC_LOGICAL);
+
+		eval_logical(func_node);
+	}
+}
diff --git a/storage/xtradb/eval/eval0proc.c b/storage/xtradb/eval/eval0proc.c
new file mode 100644
index 00000000000..3a4218d92bf
--- /dev/null
+++ b/storage/xtradb/eval/eval0proc.c
@@ -0,0 +1,295 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file eval/eval0proc.c
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "eval0proc.h"
+
+#ifdef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	if_node_t*	node;
+	elsif_node_t*	elsif_node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_IF);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+
+		/* Evaluate the condition */
+
+		eval_exp(node->cond);
+
+		if (eval_node_get_ibool_val(node->cond)) {
+
+			/* The condition evaluated to TRUE: start execution
+			from the first statement in the statement list */
+
+			thr->run_node = node->stat_list;
+
+		} else if (node->else_part) {
+			thr->run_node = node->else_part;
+
+		} else if (node->elsif_list) {
+			elsif_node = node->elsif_list;
+
+			for (;;) {
+				eval_exp(elsif_node->cond);
+
+				if (eval_node_get_ibool_val(
+					    elsif_node->cond)) {
+
+					/* The condition evaluated to TRUE:
+					start execution from the first
+					statement in the statement list */
+
+					thr->run_node = elsif_node->stat_list;
+
+					break;
+				}
+
+				elsif_node = que_node_get_next(elsif_node);
+
+				if (elsif_node == NULL) {
+					thr->run_node = NULL;
+
+					break;
+				}
+			}
+		} else {
+			thr->run_node = NULL;
+		}
+	} else {
+		/* Move to the next statement */
+		ut_ad(que_node_get_next(thr->prev_node) == NULL);
+
+		thr->run_node = NULL;
+	}
+
+	if (thr->run_node == NULL) {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	while_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_WHILE);
+
+	ut_ad((thr->prev_node == que_node_get_parent(node))
+	      || (que_node_get_next(thr->prev_node) == NULL));
+
+	/* Evaluate the condition */
+
+	eval_exp(node->cond);
+
+	if (eval_node_get_ibool_val(node->cond)) {
+
+		/* The condition evaluated to TRUE: start execution
+		from the first statement in the statement list */
+
+		thr->run_node = node->stat_list;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	assign_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_ASSIGNMENT);
+
+	/* Evaluate the value to assign */
+
+	eval_exp(node->val);
+
+	eval_node_copy_val(node->var->alias, node->val);
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return	the query thread passed in, with run_node updated */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	for_node_t*	node;
+	que_node_t*	parent;
+	lint		loop_var_value;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FOR);
+
+	parent = que_node_get_parent(node);
+
+	if (thr->prev_node != parent) {
+		/* Control did not arrive from the parent node */
+		/* Move to the next statement */
+		thr->run_node = que_node_get_next(thr->prev_node);
+
+		if (thr->run_node != NULL) {
+
+			return(thr);
+		}
+
+		/* Increment the value of loop_var */
+
+		loop_var_value = 1 + eval_node_get_int_val(node->loop_var);
+	} else {
+		/* Initialize the loop */
+
+		eval_exp(node->loop_start_limit);
+		eval_exp(node->loop_end_limit);
+
+		loop_var_value = eval_node_get_int_val(node->loop_start_limit);
+		/* No (int) cast: do not narrow the lint limit before storing */
+		node->loop_end_value
+			= eval_node_get_int_val(node->loop_end_limit);
+	}
+
+	/* Check if we should do another loop */
+
+	if (loop_var_value > node->loop_end_value) {
+
+		/* Enough loops done */
+
+		thr->run_node = parent;
+	} else {
+		eval_node_set_int_val(node->loop_var, loop_var_value);
+
+		thr->run_node = node->stat_list;
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	exit_node_t*	node;
+	que_node_t*	loop_node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_EXIT);
+
+	/* Loops exit by setting thr->run_node as the loop node's parent, so
+	find our containing loop node and get its parent. */
+
+	loop_node = que_node_get_containing_loop_node(node);
+
+	/* If someone uses an EXIT statement outside of a loop, this will
+	trigger. */
+	ut_a(loop_node);
+
+	thr->run_node = que_node_get_parent(loop_node);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a return-statement node: control moves to the
+parent of the enclosing QUE_NODE_PROC node.
+@return	the query thread passed in, with run_node updated */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	return_node_t*	node;
+	que_node_t*	parent;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_RETURN);
+
+	parent = node;
+
+	/* Walk up to the enclosing procedure node. Each new parent is checked
+	before the loop condition hands it to que_node_get_type(). */
+	while (que_node_get_type(parent) != QUE_NODE_PROC) {
+		parent = que_node_get_parent(parent);
+		ut_a(parent);
+	}
+
+	thr->run_node = que_node_get_parent(parent);
+
+	return(thr);
+}
diff --git a/storage/xtradb/fil/fil0fil.c b/storage/xtradb/fil/fil0fil.c
new file mode 100644
index 00000000000..a8520187013
--- /dev/null
+++ b/storage/xtradb/fil/fil0fil.c
@@ -0,0 +1,5438 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file fil/fil0fil.c
+The tablespace memory cache
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#include "fil0fil.h"
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "row0row.h"
+#include "que0que.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "ibuf0ibuf.h"
+# include "sync0sync.h"
+# include "os0sync.h"
+#else /* !UNIV_HOTBACKUP */
+static ulint srv_data_read, srv_data_written;
+#endif /* !UNIV_HOTBACKUP */
+
+/*
+		IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
+		=============================================
+
+The tablespace cache is responsible for providing fast read/write access to
+tablespaces and logs of the database. File creation and deletion is done
+in other modules which know more of the logic of the operation, however.
+
+A tablespace consists of a chain of files. The size of the files does not
+have to be divisible by the database block size, because we may just leave
+the last incomplete block unused. When a new file is appended to the
+tablespace, the maximum size of the file is also specified. At the moment,
+we think that it is best to extend the file to its maximum size already at
+the creation of the file, because then we can avoid dynamically extending
+the file when more space is needed for the tablespace.
+
+A block's position in the tablespace is specified with a 32-bit unsigned
+integer. The files in the chain are thought to be catenated, and the block
+corresponding to an address n is the nth block in the catenated file (where
+the first block is named the 0th block, and the incomplete block fragments
+at the end of files are not taken into account). A tablespace can be extended
+by appending a new file at the end of the chain.
+
+Our tablespace concept is similar to the one of Oracle.
+
+To acquire more speed in disk transfers, a technique called disk striping is
+sometimes used. This means that logical block addresses are divided in a
+round-robin fashion across several disks. Windows NT supports disk striping,
+so there we do not need to support it in the database. Disk striping is
+implemented in hardware in RAID disks. We conclude that it is not necessary
+to implement it in the database. Oracle 7 does not support disk striping,
+either.
+
+Another trick used at some database sites is replacing tablespace files by
+raw disks, that is, the whole physical disk drive, or a partition of it, is
+opened as a single file, and it is accessed through byte offsets calculated
+from the start of the disk or the partition. This is recommended in some
+books on database tuning to achieve more speed in i/o. Using raw disk
+certainly prevents the OS from fragmenting disk space, but it is not clear
+if it really adds speed. We measured on the Pentium 100 MHz + NT + NTFS file
+system + EIDE Conner disk only a negligible difference in speed when reading
+from a file, versus reading from a raw disk.
+
+To have fast access to a tablespace or a log file, we put the data structures
+to a hash table. Each tablespace and log file is given a unique 32-bit
+identifier.
+
+Some operating systems do not support many open files at the same time,
+though NT seems to tolerate at least 900 open files. Therefore, we put the
+open files in an LRU-list. If we need to open another file, we may close the
+file at the end of the LRU-list. When an i/o-operation is pending on a file,
+the file cannot be closed. We take the file nodes with pending i/o-operations
+out of the LRU-list and keep a count of pending operations. When an operation
+completes, we decrement the count and return the file node to the LRU-list if
+the count drops to zero. */
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+UNIV_INTERN const char*	fil_path_to_mysql_datadir	= ".";
+
+/** The number of fsyncs done to the log */
+UNIV_INTERN ulint	fil_n_log_flushes			= 0;
+
+/** Number of pending redo log flushes */
+UNIV_INTERN ulint	fil_n_pending_log_flushes		= 0;
+/** Number of pending tablespace flushes */
+UNIV_INTERN ulint	fil_n_pending_tablespace_flushes	= 0;
+
+/** The null file address */
+UNIV_INTERN fil_addr_t	fil_addr_null = {FIL_NULL, 0};
+
+/** File node of a tablespace or the log data space */
+struct fil_node_struct {
+	fil_space_t*	space;	/*!< backpointer to the space where this node
+				belongs */
+	char*		name;	/*!< path to the file */
+	ibool		open;	/*!< TRUE if file open */
+	os_file_t	handle;	/*!< OS handle to the file, if file open */
+	ibool		is_raw_disk;/*!< TRUE if the 'file' is actually a raw
+				device or a raw disk partition */
+	ulint		size;	/*!< size of the file in database pages, 0 if
+				not known yet; the possible last incomplete
+				megabyte may be ignored if space == 0 */
+	ulint		n_pending;
+				/*!< count of pending i/o's on this file;
+				closing of the file is not allowed if
+				this is > 0 */
+	ulint		n_pending_flushes;
+				/*!< count of pending flushes on this file;
+				closing of the file is not allowed if
+				this is > 0 */
+	ib_int64_t	modification_counter;/*!< when we write to the file we
+				increment this by one */
+	ib_int64_t	flush_counter;/*!< up to what
+				modification_counter value we have
+				flushed the modifications to disk */
+	UT_LIST_NODE_T(fil_node_t) chain;
+				/*!< link field for the file chain */
+	UT_LIST_NODE_T(fil_node_t) LRU;
+				/*!< link field for the LRU list */
+	ulint		magic_n;/*!< FIL_NODE_MAGIC_N */
+};
+
+/** Value of fil_node_struct::magic_n */
+#define	FIL_NODE_MAGIC_N	89389
+
+/** Tablespace or log data space: let us call them by a common name space */
+struct fil_space_struct {
+	char*		name;	/*!< space name = the path to the first file in
+				it */
+	ulint		id;	/*!< space id */
+	ib_int64_t	tablespace_version;
+				/*!< in DISCARD/IMPORT this timestamp
+				is used to check if we should ignore
+				an insert buffer merge request for a
+				page because it actually was for the
+				previous incarnation of the space */
+	ibool		mark;	/*!< this is set to TRUE at database startup if
+				the space corresponds to a table in the InnoDB
+				data dictionary; so we can print a warning of
+				orphaned tablespaces */
+	ibool		stop_ios;/*!< TRUE if we want to rename the
+				.ibd file of tablespace and want to
+				stop temporarily posting of new i/o
+				requests on the file */
+	ibool		stop_ibuf_merges;
+				/*!< we set this TRUE when we start
+				deleting a single-table tablespace */
+	ibool		is_being_deleted;
+				/*!< this is set to TRUE when we start
+				deleting a single-table tablespace and its
+				file; when this flag is set no further i/o
+				or flush requests can be placed on this space,
+				though there may be such requests still being
+				processed on this space */
+	ulint		purpose;/*!< FIL_TABLESPACE, FIL_LOG, or
+				FIL_ARCH_LOG */
+	UT_LIST_BASE_NODE_T(fil_node_t) chain;
+				/*!< base node for the file chain */
+	ulint		size;	/*!< space size in pages; 0 if a single-table
+				tablespace whose size we do not know yet;
+				last incomplete megabytes in data files may be
+				ignored if space == 0 */
+	ulint		flags;	/*!< compressed page size and file format, or 0 */
+	ulint		n_reserved_extents;
+				/*!< number of reserved free extents for
+				ongoing operations like B-tree page split */
+	ulint		n_pending_flushes; /*!< this is positive when flushing
+				the tablespace to disk; dropping of the
+				tablespace is forbidden if this is positive */
+	ulint		n_pending_ibuf_merges;/*!< this is positive
+				when merging insert buffer entries to
+				a page so that we may need to access
+				the ibuf bitmap page in the
+				tablespace: dropping of the tablespace
+				is forbidden if this is positive */
+	hash_node_t	hash;	/*!< hash chain node */
+	hash_node_t	name_hash;/*!< hash chain node of the name_hash table */
+#ifndef UNIV_HOTBACKUP
+	rw_lock_t	latch;	/*!< latch protecting the file space storage
+				allocation */
+#endif /* !UNIV_HOTBACKUP */
+	UT_LIST_NODE_T(fil_space_t) unflushed_spaces;
+				/*!< list of spaces with at least one unflushed
+				file we have written to */
+	ibool		is_in_unflushed_spaces; /*!< TRUE if this space is
+				currently in unflushed_spaces */
+	ibool		is_corrupt; /*!< NOTE(review): undocumented; confirm use */
+	UT_LIST_NODE_T(fil_space_t) space_list;
+				/*!< list of all spaces */
+	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */
+};
+
+/** Value of fil_space_struct::magic_n */
+#define	FIL_SPACE_MAGIC_N	89472
+
+/** The tablespace memory cache */
+typedef	struct fil_system_struct	fil_system_t;
+
+/** The tablespace memory cache; also the totality of logs (the log
+data space) is stored here; below we talk about tablespaces, but also
+the ib_logfiles form a 'space' and it is handled here */
+
+struct fil_system_struct {
+#ifndef UNIV_HOTBACKUP
+	mutex_t		mutex;		/*!< The mutex protecting the cache */
+#endif /* !UNIV_HOTBACKUP */
+	hash_table_t*	spaces;		/*!< The hash table of spaces in the
+					system; they are hashed on the space
+					id */
+	hash_table_t*	name_hash;	/*!< hash table based on the space
+					name */
+	UT_LIST_BASE_NODE_T(fil_node_t) LRU;
+					/*!< base node for the LRU list of the
+					most recently used open files with no
+					pending i/o's; if we start an i/o on
+					the file, we first remove it from this
+					list, and return it to the start of
+					the list when the i/o ends;
+					log files and the system tablespace are
+					not put to this list: they are opened
+					after the startup, and kept open until
+					shutdown */
+	UT_LIST_BASE_NODE_T(fil_space_t) unflushed_spaces;
+					/*!< base node for the list of those
+					tablespaces whose files contain
+					unflushed writes; those spaces have
+					at least one file node where
+					modification_counter > flush_counter */
+	ulint		n_open;		/*!< number of files currently open */
+	ulint		max_n_open;	/*!< n_open is not allowed to exceed
+					this */
+	ib_int64_t	modification_counter;/*!< when we write to a file we
+					increment this by one */
+	ulint		max_assigned_id;/*!< maximum space id in the existing
+					tables, or assigned during the time
+					mysqld has been up; at an InnoDB
+					startup we scan the data dictionary
+					and set here the maximum of the
+					space id's of the tables there */
+	ib_int64_t	tablespace_version;
+					/*!< a counter which is incremented for
+					every space object memory creation;
+					every space mem object gets a
+					'timestamp' from this; in DISCARD/
+					IMPORT this is used to check if we
+					should ignore an insert buffer merge
+					request */
+	UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+					/*!< list of all file spaces */
+	ibool		space_id_reuse_warned;
+					/*!< TRUE if fil_space_create()
+					has issued a warning about
+					potential space_id reuse */
+};
+
+/** The tablespace memory cache. This variable is NULL before the module is
+initialized. */
+static fil_system_t*	fil_system	= NULL;
+
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o. Opens the file if it is closed. Updates the
+pending i/o's field in the node and the system appropriately. Takes the node
+off the LRU list if it is in the LRU list. The caller must hold the fil_sys
+mutex. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	fil_space_t*	space);	/*!< in: space */
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes. Updates the
+pending i/o's field in the node appropriately. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	ulint		type);	/*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+				the node as modified if
+				type == OS_FILE_WRITE */
+/*******************************************************************//**
+Checks if a single-table tablespace for a given table name exists in the
+tablespace memory cache.
+@return	space id, ULINT_UNDEFINED if not found */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+	const char*	name);	/*!< in: table name in the standard
+				'databasename/tablename' format */
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files. */
+static
+ibool
+fil_space_free(
+/*===========*/
+				/* out: TRUE if success */
+	ulint		id,	/* in: space id */
+	ibool		own_mutex);/* in: TRUE if own system->mutex */
+/********************************************************************//**
+Reads data from a space to a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+ulint
+fil_read(
+/*=====*/
+	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/*!< in: how many bytes to read; this must not
+				cross a file boundary; in aio this must be a
+				block size multiple */
+	void*	buf,		/*!< in/out: buffer where to store data read;
+				in aio this must be appropriately aligned */
+	void*	message)	/*!< in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
+					  byte_offset, len, buf, message));
+}
+
+/********************************************************************//**
+Writes data to a space from a buffer. Remember that the possible incomplete
+blocks at the end of file are ignored: they are not taken into account when
+calculating the byte offset within a space.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+UNIV_INLINE
+ulint
+fil_write(
+/*======*/
+	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in aio
+				this must be divisible by the OS block size */
+	ulint	len,		/*!< in: how many bytes to write; this must
+				not cross a file boundary; in aio this must
+				be a block size multiple */
+	void*	buf,		/*!< in: buffer from which to write; in aio
+				this must be appropriately aligned */
+	void*	message)	/*!< in: message for aio handler if non-sync
+				aio used, else ignored */
+{
+	return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
+					   byte_offset, len, buf, message));
+}
+
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	HASH_SEARCH(hash, fil_system->spaces, id,
+		    fil_space_t*, space,
+		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+		    space->id == id);
+
+	return(space);
+}
+
+/*******************************************************************//**
+Returns the table space by a given name, NULL if not found. */
+UNIV_INLINE
+fil_space_t*
+fil_space_get_by_name(
+/*==================*/
+	const char*	name)	/*!< in: space name */
+{
+	fil_space_t*	space;
+	ulint		fold;
+
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	fold = ut_fold_string(name);
+
+	HASH_SEARCH(name_hash, fil_system->name_hash, fold,
+		    fil_space_t*, space,
+		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
+		    !strcmp(name, space->name));
+
+	return(space);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ib_int64_t	version		= -1;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space) {
+		version = space->tablespace_version;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(version);
+}
+
+/*******************************************************************//**
+Returns the latch of a file space.
+@return	latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+	ulint	id,	/*!< in: space id */
+	ulint*	flags)	/*!< out: tablespace flags */
+{
+	fil_space_t*	space;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	if (flags) {
+		*flags = space->flags;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(&(space->latch));
+}
+
+/*******************************************************************//**
+Returns the type of a file space.
+@return	FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	mutex_exit(&fil_system->mutex);
+
+	return(space->purpose);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Checks if all the file nodes in a space are flushed. The caller must hold
+the fil_system mutex.
+@return	TRUE if all are flushed */
+static
+ibool
+fil_space_is_flushed(
+/*=================*/
+	fil_space_t*	space)	/*!< in: space */
+{
+	fil_node_t*	node;
+
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	while (node) {
+		if (node->modification_counter > node->flush_counter) {
+
+			return(FALSE);
+		}
+
+		node = UT_LIST_GET_NEXT(chain, node);
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+	const char*	name,	/*!< in: file name (file must be closed) */
+	ulint		size,	/*!< in: file size in database blocks, rounded
+				downwards to an integer */
+	ulint		id,	/*!< in: space id where to append */
+	ibool		is_raw)	/*!< in: TRUE if a raw device or
+				a raw disk partition */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+
+	ut_a(fil_system);
+	ut_a(name);
+
+	mutex_enter(&fil_system->mutex);
+
+	node = mem_alloc(sizeof(fil_node_t));
+
+	node->name = mem_strdup(name);
+	node->open = FALSE;
+
+	ut_a(!is_raw || srv_start_raw_disk_in_use);
+
+	node->is_raw_disk = is_raw;
+	node->size = size;
+	node->magic_n = FIL_NODE_MAGIC_N;
+	node->n_pending = 0;
+	node->n_pending_flushes = 0;
+
+	node->modification_counter = 0;
+	node->flush_counter = 0;
+
+	space = fil_space_get_by_id(id);
+
+	if (!space) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: Could not find tablespace %lu for\n"
+			"InnoDB: file ", (ulong) id);
+		ut_print_filename(stderr, name);
+		fputs(" in the tablespace memory cache.\n", stderr);
+		mem_free(node->name);
+
+		mem_free(node);
+
+		mutex_exit(&fil_system->mutex);
+
+		return;
+	}
+
+	space->size += size;
+
+	node->space = space;
+
+	UT_LIST_ADD_LAST(chain, space->chain, node);
+
+	if (id < SRV_LOG_SPACE_FIRST_ID && fil_system->max_assigned_id < id) {
+
+		fil_system->max_assigned_id = id;
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/********************************************************************//**
+Opens the file of a node of a tablespace. The caller must own the fil_system
+mutex. */
+static
+void
+fil_node_open_file(
+/*===============*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	fil_space_t*	space)	/*!< in: space */
+{
+	ib_uint64_t	size_bytes;
+	ulint		size_low;
+	ulint		size_high;
+	ibool		ret;
+	ibool		success;
+	byte*		buf2;
+	byte*		page;
+	ulint		space_id;
+	ulint		flags;
+
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->n_pending == 0);
+	ut_a(node->open == FALSE);
+
+	if (node->size == 0) {
+		/* It must be a single-table tablespace and we do not know the
+		size of the file yet. First we open the file in the normal
+		mode, no async I/O here, for simplicity. Then do some checks,
+		and close the file again.
+		NOTE that we could not use the simple file read function
+		os_file_read() in Windows to read from a file opened for
+		async I/O! */
+
+		node->handle = os_file_create_simple_no_error_handling(
+			node->name, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+		if (!success) {
+			/* The following call prints an error message */
+			os_file_get_last_error(TRUE);
+
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: Fatal error: cannot open %s.\n"
+				"InnoDB: Have you deleted .ibd files"
+				" under a running mysqld server?\n",
+				node->name);
+			ut_a(0);
+		}
+
+		os_file_get_size(node->handle, &size_low, &size_high);
+
+		size_bytes = (((ib_uint64_t)size_high) << 32)
+			+ (ib_uint64_t)size_low;
+#ifdef UNIV_HOTBACKUP
+		if (trx_sys_sys_space(space->id)) {
+			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+			os_file_close(node->handle);
+			goto add_size;
+		}
+#endif /* UNIV_HOTBACKUP */
+		ut_a(space->purpose != FIL_LOG);
+		ut_a(!trx_sys_sys_space(space->id));
+
+		if (size_bytes < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Error: the size of single-table"
+				" tablespace file %s\n"
+				"InnoDB: is only %lu %lu,"
+				" should be at least %lu!\n",
+				node->name,
+				(ulong) size_high,
+				(ulong) size_low,
+				(ulong) (FIL_IBD_FILE_INITIAL_SIZE
+					 * UNIV_PAGE_SIZE));
+
+			ut_a(0);
+		}
+
+		/* Read the first page. NOTE(review): result is not checked */
+
+		buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+		/* Align the memory for file i/o if we might have O_DIRECT
+		set */
+		page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+		success = os_file_read(node->handle, page, 0, 0,
+				       UNIV_PAGE_SIZE);
+		space_id = fsp_header_get_space_id(page);
+		flags = fsp_header_get_flags(page);
+
+		ut_free(buf2);
+
+		/* Close the file now that we have read the space id from it */
+
+		os_file_close(node->handle);
+
+		if (UNIV_UNLIKELY(space_id != space->id)) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace id is %lu"
+				" in the data dictionary\n"
+				"InnoDB: but in file %s it is %lu!\n",
+				(ulong) space->id, node->name,
+				(ulong) space_id);
+
+			ut_error;
+		}
+
+		if (UNIV_UNLIKELY(space_id == ULINT_UNDEFINED
+				  || trx_sys_sys_space(space_id))) {
+			fprintf(stderr,
+				"InnoDB: Error: tablespace id %lu"
+				" in file %s is not sensible\n",
+				(ulong) space_id, node->name);
+
+			ut_error;
+		}
+
+		if (UNIV_UNLIKELY(space->flags != flags)) {
+			fprintf(stderr,
+				"InnoDB: Error: table flags are %lx"
+				" in the data dictionary\n"
+				"InnoDB: but the flags in file %s are %lx!\n",
+				(ulong) space->flags, node->name,
+				(ulong) flags);
+
+			ut_error;
+		}
+
+		if (size_bytes >= 1024 * 1024) {
+			/* Truncate the size to whole megabytes. */
+			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+		}
+
+		if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+			node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+		} else {
+			node->size = (ulint)
+				(size_bytes
+				 / dict_table_flags_to_zip_size(flags));
+		}
+
+#ifdef UNIV_HOTBACKUP
+add_size:
+#endif /* UNIV_HOTBACKUP */
+		space->size += node->size;
+	}
+
+	/* Open the file for reading and writing, in Windows normally in the
+	unbuffered async I/O mode, though global variables may make
+	os_file_create() to fall back to the normal file I/O mode. */
+
+	if (space->purpose == FIL_LOG) {
+		node->handle = os_file_create(node->name, OS_FILE_OPEN,
+					      OS_FILE_AIO, OS_LOG_FILE, &ret);
+	} else if (node->is_raw_disk) {
+		node->handle = os_file_create(node->name,
+					      OS_FILE_OPEN_RAW,
+					      OS_FILE_AIO, OS_DATA_FILE, &ret);
+	} else {
+		node->handle = os_file_create(node->name, OS_FILE_OPEN,
+					      OS_FILE_AIO, OS_DATA_FILE, &ret);
+	}
+
+	ut_a(ret);
+
+	node->open = TRUE;
+
+	system->n_open++;
+
+	if (space->purpose == FIL_TABLESPACE && !trx_sys_sys_space(space->id)) {
+		/* Put the node to the LRU list */
+		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+	}
+}
+
+/**********************************************************************//**
+Closes the operating system file of a file node and updates the open-file
+bookkeeping of the tablespace memory cache. The caller must hold the
+system mutex. The node must be open and must have no pending i/o's, no
+pending flushes and no unflushed modifications; this is asserted. */
+static
+void
+fil_node_close_file(
+/*================*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system)	/*!< in: tablespace memory cache */
+{
+	ibool	ret;
+
+	ut_ad(node && system);
+	ut_ad(mutex_own(&(system->mutex)));
+
+	/* Preconditions for a safe close */
+	ut_a(node->open);
+	ut_a(node->n_pending == 0);
+	ut_a(node->n_pending_flushes == 0);
+	ut_a(node->modification_counter == node->flush_counter);
+
+	ret = os_file_close(node->handle);
+	ut_a(ret);
+
+	/* printf("Closing file %s\n", node->name); */
+
+	/* One open file less in the cache */
+	ut_a(system->n_open > 0);
+	system->n_open--;
+	node->open = FALSE;
+
+	if (node->space->purpose != FIL_TABLESPACE
+	    || trx_sys_sys_space(node->space->id)) {
+		/* Log files and system tablespace files are not kept in
+		the LRU list, nothing more to do */
+
+		return;
+	}
+
+	/* The node is in the LRU list, remove it */
+	ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+	UT_LIST_REMOVE(LRU, system->LRU, node);
+}
+
+/********************************************************************//**
+Walks the LRU list of open files from its last node towards its first node
+and closes the first file that has no unflushed modifications and no
+pending flushes. The caller must hold the fil_sys mutex.
+@return TRUE if a file was closed, FALSE if should retry later; since i/o's
+generally complete in < 100 ms, and as InnoDB writes at most 128 pages
+from the buffer pool in a batch, and then immediately flushes the
+files, there is a good chance that a suitable node is found in the LRU
+list on the next attempt */
+static
+ibool
+fil_try_to_close_file_in_LRU(
+/*=========================*/
+	ibool	print_info)	/*!< in: if TRUE, prints information why it
+				cannot close a file */
+{
+	fil_node_t*	candidate;
+
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	if (print_info) {
+		fprintf(stderr,
+			"InnoDB: fil_sys open file LRU len %lu\n",
+			(ulong) UT_LIST_GET_LEN(fil_system->LRU));
+	}
+
+	for (candidate = UT_LIST_GET_LAST(fil_system->LRU);
+	     candidate != NULL;
+	     candidate = UT_LIST_GET_PREV(LRU, candidate)) {
+
+		ibool	has_unflushed = (candidate->modification_counter
+					 != candidate->flush_counter);
+
+		if (!has_unflushed && candidate->n_pending_flushes == 0) {
+			/* Nothing prevents closing this file */
+			fil_node_close_file(candidate, fil_system);
+
+			return(TRUE);
+		}
+
+		if (!print_info) {
+
+			continue;
+		}
+
+		/* Report every reason why this file had to be skipped */
+
+		if (candidate->n_pending_flushes > 0) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, candidate->name);
+			fprintf(stderr, ", because n_pending_flushes %lu\n",
+				(ulong) candidate->n_pending_flushes);
+		}
+
+		if (has_unflushed) {
+			fputs("InnoDB: cannot close file ", stderr);
+			ut_print_filename(stderr, candidate->name);
+			fprintf(stderr,
+				", because mod_count %ld != fl_count %ld\n",
+				(long) candidate->modification_counter,
+				(long) candidate->flush_counter);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Reserves the fil_system mutex and tries to make sure we can open at least one
+file while holding it. This should be called before calling
+fil_node_prepare_for_io(), because that function may need to open a file.
+
+Every return path of this function leaves fil_system->mutex held by the
+caller; the mutex is only released internally before sleeping and retrying.
+If the open-file limit cannot be satisfied after a few attempts, a warning
+is printed and the function returns anyway. */
+static
+void
+fil_mutex_enter_and_prepare_for_io(
+/*===============================*/
+	ulint	space_id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ibool		success;
+	ibool		print_info	= FALSE;
+	ulint		count		= 0;	/* number of flush-and-retry
+						rounds done so far */
+	ulint		count2		= 0;	/* number of sleeps done while
+						waiting for stop_ios */
+
+retry:
+	mutex_enter(&fil_system->mutex);
+
+	if (trx_sys_sys_space(space_id) || space_id >= SRV_LOG_SPACE_FIRST_ID) {
+		/* We keep log files and system tablespace files always open;
+		this is important in preventing deadlocks in this module, as
+		a page read completion often performs another read from the
+		insert buffer. The insert buffer is in tablespace 0, and we
+		cannot end up waiting in this function. */
+
+		return;
+	}
+
+	if (fil_system->n_open < fil_system->max_n_open) {
+		/* There is still room for one more open file */
+
+		return;
+	}
+
+	space = fil_space_get_by_id(space_id);
+
+	if (space != NULL && space->stop_ios) {
+		/* We are going to do a rename file and want to stop new i/o's
+		for a while */
+
+		if (count2 > 20000) {
+			fputs("InnoDB: Warning: tablespace ", stderr);
+			ut_print_filename(stderr, space->name);
+			fprintf(stderr,
+				" has i/o ops stopped for a long time %lu\n",
+				(ulong) count2);
+		}
+
+		/* Release the mutex while sleeping, then start over */
+		mutex_exit(&fil_system->mutex);
+
+		os_thread_sleep(20000);
+
+		count2++;
+
+		goto retry;
+	}
+
+	/* If the file is already open, no need to do anything; if the space
+	does not exist, we handle the situation in the function which called
+	this function */
+
+	if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+
+		return;
+	}
+
+	/* From the third attempt on, let the LRU scan explain why it cannot
+	close files */
+	if (count > 1) {
+		print_info = TRUE;
+	}
+
+	/* Too many files are open, try to close some */
+close_more:
+	success = fil_try_to_close_file_in_LRU(print_info);
+
+	/* Keep closing as long as a file was closed and we are still at or
+	above the limit */
+	if (success && fil_system->n_open >= fil_system->max_n_open) {
+
+		goto close_more;
+	}
+
+	if (fil_system->n_open < fil_system->max_n_open) {
+		/* Ok */
+
+		return;
+	}
+
+	if (count >= 2) {
+		/* Give up after repeated attempts: warn and return with the
+		mutex held even though the limit is exceeded */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: too many (%lu) files stay open"
+			" while the maximum\n"
+			"InnoDB: allowed value would be %lu.\n"
+			"InnoDB: You may need to raise the value of"
+			" innodb_open_files in\n"
+			"InnoDB: my.cnf.\n",
+			(ulong) fil_system->n_open,
+			(ulong) fil_system->max_n_open);
+
+		return;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+	/* Wake the i/o-handler threads to make sure pending i/o's are
+	performed */
+	os_aio_simulated_wake_handler_threads();
+
+	os_thread_sleep(20000);
+#endif
+	/* Flush tablespaces so that we can close modified files in the LRU
+	list */
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	count++;
+
+	goto retry;
+}
+
+/*******************************************************************//**
+Removes a file node from the chain of its tablespace and frees the node
+object. If the file is open it is closed first; any unflushed modifications
+recorded for the node are disregarded. The caller must hold the system
+mutex and the node must have no pending i/o's. */
+static
+void
+fil_node_free(
+/*==========*/
+	fil_node_t*	node,	/*!< in, own: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	fil_space_t*	space)	/*!< in: space where the file node is chained */
+{
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+	ut_a(node->magic_n == FIL_NODE_MAGIC_N);
+	ut_a(node->n_pending == 0);
+
+	if (node->open) {
+		/* Pretend that everything has been flushed, so that the
+		assertion in fil_node_close_file() about unflushed
+		modifications holds */
+
+		node->modification_counter = node->flush_counter;
+
+		if (space->is_in_unflushed_spaces) {
+			if (fil_space_is_flushed(space)) {
+				/* The space no longer belongs to the list of
+				unflushed spaces */
+
+				space->is_in_unflushed_spaces = FALSE;
+
+				UT_LIST_REMOVE(unflushed_spaces,
+					       system->unflushed_spaces,
+					       space);
+			}
+		}
+
+		fil_node_close_file(node, system);
+	}
+
+	/* Detach the node from its space */
+	space->size -= node->size;
+
+	UT_LIST_REMOVE(chain, space->chain, node);
+
+	/* Release the memory owned by the node */
+	mem_free(node->name);
+	mem_free(node);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Frees file nodes from the beginning of the chain of a file space until the
+requested number of bytes has been dropped. The space must exist in the
+tablespace memory cache. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/*!< in: space id */
+	ulint	trunc_len)	/*!< in: truncate by this much; it is an error
+				if this does not equal to the combined size of
+				some initial files in the space */
+{
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	for (; trunc_len > 0; ) {
+		fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+		/* Only whole files can be dropped */
+		ut_a(node->size * UNIV_PAGE_SIZE <= trunc_len);
+
+		trunc_len -= node->size * UNIV_PAGE_SIZE;
+
+		fil_node_free(node, fil_system, space);
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/*******************************************************************//**
+Creates a space memory object and puts it to the tablespace memory cache. If
+there is an error, prints an error message to the .err log.
+
+If a space with the same name is already cached, then for a system
+tablespace id or a non-FIL_TABLESPACE purpose the call fails; otherwise the
+cached namesake is freed and the creation is retried. If a space with the
+same id is already cached, the call fails.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+	const char*	name,	/*!< in: space name */
+	ulint		id,	/*!< in: space id */
+	ulint		flags,	/*!< in: compressed page size
+				and file format, or 0 */
+	ulint		purpose)/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+{
+	fil_space_t*	space;
+
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT
+	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
+	format, the tablespace flags should equal
+	(table->flags & ~(~0 << DICT_TF_BITS)). */
+	ut_a(flags != DICT_TF_COMPACT);
+	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+try_again:
+	/*printf(
+	"InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name,
+	purpose);*/
+
+	ut_a(fil_system);
+	ut_a(name);
+
+	mutex_enter(&fil_system->mutex);
+
+	/* First check for a name collision in the cache */
+	space = fil_space_get_by_name(name);
+
+	if (UNIV_LIKELY_NULL(space)) {
+		ulint	namesake_id;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: trying to init to the"
+			" tablespace memory cache\n"
+			"InnoDB: a tablespace %lu of name ", (ulong) id);
+		ut_print_filename(stderr, name);
+		fprintf(stderr, ",\n"
+			"InnoDB: but a tablespace %lu of the same name\n"
+			"InnoDB: already exists in the"
+			" tablespace memory cache!\n",
+			(ulong) space->id);
+
+		/* For a system tablespace id or a non-tablespace purpose
+		the collision is not resolved: fail */
+		if (trx_sys_sys_space(id) || purpose != FIL_TABLESPACE) {
+
+			mutex_exit(&fil_system->mutex);
+
+			return(FALSE);
+		}
+
+		fprintf(stderr,
+			"InnoDB: We assume that InnoDB did a crash recovery,"
+			" and you had\n"
+			"InnoDB: an .ibd file for which the table"
+			" did not exist in the\n"
+			"InnoDB: InnoDB internal data dictionary in the"
+			" ibdata files.\n"
+			"InnoDB: We assume that you later removed the"
+			" .ibd and .frm files,\n"
+			"InnoDB: and are now trying to recreate the table."
+			" We now remove the\n"
+			"InnoDB: conflicting tablespace object"
+			" from the memory cache and try\n"
+			"InnoDB: the init again.\n");
+
+		/* Remember the id before releasing the mutex; the namesake
+		is freed without the mutex held (own_mutex == FALSE) */
+		namesake_id = space->id;
+
+		mutex_exit(&fil_system->mutex);
+
+		fil_space_free(namesake_id, FALSE);
+
+		goto try_again;
+	}
+
+	/* Then check for an id collision in the cache */
+	space = fil_space_get_by_id(id);
+
+	if (UNIV_LIKELY_NULL(space)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to add tablespace %lu"
+			" of name ", (ulong) id);
+		ut_print_filename(stderr, name);
+		fprintf(stderr, "\n"
+			"InnoDB: to the tablespace memory cache,"
+			" but tablespace\n"
+			"InnoDB: %lu of name ", (ulong) space->id);
+		ut_print_filename(stderr, space->name);
+		fputs(" already exists in the tablespace\n"
+		      "InnoDB: memory cache!\n", stderr);
+
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	/* No collision: allocate and initialize the new space object */
+	space = mem_alloc(sizeof(fil_space_t));
+
+	space->name = mem_strdup(name);
+	space->id = id;
+
+	/* Each created space gets the next value of the cache-wide version
+	counter */
+	fil_system->tablespace_version++;
+	space->tablespace_version = fil_system->tablespace_version;
+	space->mark = FALSE;
+
+	/* Outside recovery, an id above the current maximum raises the
+	maximum; a warning is printed only the first time this happens */
+	if (UNIV_LIKELY(purpose == FIL_TABLESPACE && !recv_recovery_on)
+	    && UNIV_UNLIKELY(id > fil_system->max_assigned_id)) {
+		if (!fil_system->space_id_reuse_warned) {
+			fil_system->space_id_reuse_warned = TRUE;
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Warning: allocated tablespace %lu,"
+				" old maximum was %lu\n",
+				(ulong) id,
+				(ulong) fil_system->max_assigned_id);
+		}
+
+		fil_system->max_assigned_id = id;
+	}
+
+	space->stop_ios = FALSE;
+	space->stop_ibuf_merges = FALSE;
+	space->is_being_deleted = FALSE;
+	space->purpose = purpose;
+	space->size = 0;
+	space->flags = flags;
+
+	space->n_reserved_extents = 0;
+
+	space->n_pending_flushes = 0;
+	space->n_pending_ibuf_merges = 0;
+
+	UT_LIST_INIT(space->chain);
+	space->magic_n = FIL_SPACE_MAGIC_N;
+
+	rw_lock_create(&space->latch, SYNC_FSP);
+
+	/* Register the space in both lookup hashes (by id and by name) */
+	HASH_INSERT(fil_space_t, hash, fil_system->spaces, id, space);
+
+	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(name), space);
+	space->is_in_unflushed_spaces = FALSE;
+
+	space->is_corrupt = FALSE;
+
+	UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
+
+	mutex_exit(&fil_system->mutex);
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. The new id is one
+greater than the larger of the id passed in and the greatest id assigned so
+far. If 4 billion id's is not enough, we may need to recycle id's.
+On failure *space_id is set to ULINT_UNDEFINED.
+@return	TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+	ulint*	space_id)	/*!< in/out: space id */
+{
+	ulint	new_id;
+	ibool	assigned;
+
+	mutex_enter(&fil_system->mutex);
+
+	/* Start from whichever is larger: the suggestion of the caller or
+	the global maximum */
+	new_id = (*space_id < fil_system->max_assigned_id)
+		? fil_system->max_assigned_id
+		: *space_id;
+
+	new_id++;
+
+	if (new_id > (SRV_LOG_SPACE_FIRST_ID / 2)
+	    && (new_id % 1000000UL == 0)) {
+		/* More than half of the id range is used: remind the user
+		once per million id's */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"InnoDB: Warning: you are running out of new"
+			" single-table tablespace id's.\n"
+			"InnoDB: Current counter is %lu and it"
+			" must not exceed %lu!\n"
+			"InnoDB: To reset the counter to zero"
+			" you have to dump all your tables and\n"
+			"InnoDB: recreate the whole InnoDB installation.\n",
+			(ulong) new_id,
+			(ulong) SRV_LOG_SPACE_FIRST_ID);
+	}
+
+	if (new_id < SRV_LOG_SPACE_FIRST_ID) {
+		assigned = TRUE;
+
+		fil_system->max_assigned_id = new_id;
+		*space_id = new_id;
+	} else {
+		assigned = FALSE;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"InnoDB: You have run out of single-table"
+			" tablespace id's!\n"
+			"InnoDB: Current counter is %lu.\n"
+			"InnoDB: To reset the counter to zero you"
+			" have to dump all your tables and\n"
+			"InnoDB: recreate the whole InnoDB installation.\n",
+			(ulong) new_id);
+		*space_id = ULINT_UNDEFINED;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(assigned);
+}
+
+/*******************************************************************//**
+Frees a space object from the tablespace memory cache. Closes the files in
+the chain but does not delete them. There must not be any pending i/o's or
+flushes on the files.
+
+The mutex state of the caller is preserved on every path: when own_mutex
+is TRUE the caller still holds fil_system->mutex on return (also when the
+space is not found); when own_mutex is FALSE the mutex is acquired and
+released by this function.
+@return	TRUE if success, FALSE if the space is not in the cache */
+static
+ibool
+fil_space_free(
+/*===========*/
+	ulint		id,		/*!< in: space id */
+	ibool		own_mutex)	/*!< in: TRUE if the caller already
+					owns fil_system->mutex */
+{
+	fil_space_t*	space;
+	fil_space_t*	namespace;
+	fil_node_t*	fil_node;
+
+	if (!own_mutex) {
+		mutex_enter(&fil_system->mutex);
+	}
+
+	space = fil_space_get_by_id(id);
+
+	if (!space) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: trying to remove tablespace %lu"
+			" from the cache but\n"
+			"InnoDB: it is not there.\n", (ulong) id);
+
+		/* Release the mutex only if it was acquired here; a caller
+		that passed own_mutex == TRUE keeps owning it and will
+		release it itself */
+		if (!own_mutex) {
+			mutex_exit(&fil_system->mutex);
+		}
+
+		return(FALSE);
+	}
+
+	/* Unregister the space from the id hash */
+	HASH_DELETE(fil_space_t, hash, fil_system->spaces, id, space);
+
+	/* The name hash must resolve to the very same object */
+	namespace = fil_space_get_by_name(space->name);
+	ut_a(namespace);
+	ut_a(space == namespace);
+
+	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(space->name), space);
+
+	if (space->is_in_unflushed_spaces) {
+		space->is_in_unflushed_spaces = FALSE;
+
+		UT_LIST_REMOVE(unflushed_spaces, fil_system->unflushed_spaces,
+			       space);
+	}
+
+	UT_LIST_REMOVE(space_list, fil_system->space_list, space);
+
+	ut_a(space->magic_n == FIL_SPACE_MAGIC_N);
+	ut_a(0 == space->n_pending_flushes);
+
+	/* Free the file nodes one by one; fil_node_free() unlinks the node
+	from the chain, so always take the current first node */
+	fil_node = UT_LIST_GET_FIRST(space->chain);
+
+	while (fil_node != NULL) {
+		fil_node_free(fil_node, fil_system, space);
+
+		fil_node = UT_LIST_GET_FIRST(space->chain);
+	}
+
+	ut_a(0 == UT_LIST_GET_LEN(space->chain));
+
+	if (!own_mutex) {
+		mutex_exit(&fil_system->mutex);
+	}
+
+	/* The space is no longer reachable through the cache: release its
+	latch and memory */
+	rw_lock_free(&(space->latch));
+
+	mem_free(space->name);
+	mem_free(space);
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache. If the size of a FIL_TABLESPACE space is still 0, its single
+file is opened first so that the size fields get updated.
+@return	space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ulint		n_pages	= 0;
+
+	ut_ad(fil_system);
+
+	/* Acquires fil_system->mutex */
+	fil_mutex_enter_and_prepare_for_io(id);
+
+	space = fil_space_get_by_id(id);
+
+	if (space != NULL) {
+		if (space->purpose == FIL_TABLESPACE && space->size == 0) {
+			fil_node_t*	first_node;
+
+			ut_a(id != 0);
+
+			ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+			first_node = UT_LIST_GET_FIRST(space->chain);
+
+			/* It must be a single-table tablespace and we have
+			not opened the file yet; the following calls will
+			open it and update the size fields */
+
+			fil_node_prepare_for_io(first_node, fil_system,
+						space);
+			fil_node_complete_io(first_node, fil_system,
+					     OS_FILE_READ);
+		}
+
+		n_pages = space->size;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(n_pages);
+}
+
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache. For space id 0 the result is always 0 and the cache
+is not consulted.
+@return	flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ulint		result;
+
+	ut_ad(fil_system);
+
+	if (UNIV_UNLIKELY(!id)) {
+		return(0);
+	}
+
+	/* Acquires fil_system->mutex */
+	fil_mutex_enter_and_prepare_for_io(id);
+
+	space = fil_space_get_by_id(id);
+
+	if (space == NULL) {
+		result = ULINT_UNDEFINED;
+	} else {
+		if (space->purpose == FIL_TABLESPACE && space->size == 0) {
+			fil_node_t*	first_node;
+
+			ut_a(id != 0);
+
+			ut_a(1 == UT_LIST_GET_LEN(space->chain));
+
+			first_node = UT_LIST_GET_FIRST(space->chain);
+
+			/* It must be a single-table tablespace and we have
+			not opened the file yet; the following calls will
+			open it and update the size fields */
+
+			fil_node_prepare_for_io(first_node, fil_system,
+						space);
+			fil_node_complete_io(first_node, fil_system,
+					     OS_FILE_READ);
+		}
+
+		result = space->flags;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(result);
+}
+
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+The value is derived from the flags returned by fil_space_get_flags().
+@return	compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+	ulint	id)	/*!< in: space id */
+{
+	ulint	flags	= fil_space_get_flags(id);
+
+	if (flags == 0 || flags == ULINT_UNDEFINED) {
+		/* Both special values are passed through unchanged */
+
+		return(flags);
+	}
+
+	return(dict_table_flags_to_zip_size(flags));
+}
+
+/*******************************************************************//**
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space, that is, whether page_no is smaller than the size of the space
+in pages. The tablespace must be cached in the memory cache.
+@return	TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+	ulint	id,	/*!< in: space id */
+	ulint	page_no)/*!< in: page number */
+{
+	ulint	n_pages	= fil_space_get_size(id);
+
+	return(page_no < n_pages ? TRUE : FALSE);
+}
+
+/****************************************************************//**
+Allocates and initializes the tablespace memory cache (fil_system). It is
+an error to call this when the cache already exists. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+	ulint	hash_size,	/*!< in: hash table size */
+	ulint	max_n_open)	/*!< in: max number of open files */
+{
+	ut_a(fil_system == NULL);
+
+	ut_a(hash_size > 0);
+	ut_a(max_n_open > 0);
+
+	/* Zero-filled allocation */
+	fil_system = mem_zalloc(sizeof(*fil_system));
+
+	fil_system->max_n_open = max_n_open;
+	fil_system->max_assigned_id = TRX_SYS_SPACE_MAX;
+
+	mutex_create(&fil_system->mutex, SYNC_ANY_LATCH);
+
+	/* Lookup tables by space id and by space name */
+	fil_system->spaces = hash_create(hash_size);
+	fil_system->name_hash = hash_create(hash_size);
+
+	UT_LIST_INIT(fil_system->LRU);
+}
+
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log.
+A warning is printed whenever fewer than 10 open-file slots remain. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void)
+/*==========================================*/
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+
+	mutex_enter(&fil_system->mutex);
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		if (space->purpose == FIL_TABLESPACE
+		    && !trx_sys_sys_space(space->id)) {
+			/* Neither a log nor a system tablespace: skip */
+
+			continue;
+		}
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (!node->open) {
+				fil_node_open_file(node, fil_system, space);
+			}
+
+			if (fil_system->max_n_open
+			    < 10 + fil_system->n_open) {
+				fprintf(stderr,
+					"InnoDB: Warning: you must"
+					" raise the value of"
+					" innodb_open_files in\n"
+					"InnoDB: my.cnf! Remember that"
+					" InnoDB keeps all log files"
+					" and all system\n"
+					"InnoDB: tablespace files open"
+					" for the whole time mysqld is"
+					" running, and\n"
+					"InnoDB: needs to open also"
+					" some .ibd files if the"
+					" file-per-table storage\n"
+					"InnoDB: model is used."
+					" Current open files %lu,"
+					" max allowed"
+					" open files %lu.\n",
+					(ulong) fil_system->n_open,
+					(ulong) fil_system->max_n_open);
+			}
+		}
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Closes all open files and frees every space object of the tablespace memory
+cache. There must not be any pending i/o's or not flushed modifications in
+the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void)
+/*=====================*/
+{
+	fil_space_t*	space;
+	fil_space_t*	next_space;
+	fil_node_t*	node;
+
+	mutex_enter(&fil_system->mutex);
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = next_space) {
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			if (node->open) {
+				fil_node_close_file(node, fil_system);
+			}
+		}
+
+		/* Fetch the successor before the current space is freed */
+		next_space = UT_LIST_GET_NEXT(space_list, space);
+
+		/* We already own the mutex, hence TRUE */
+		fil_space_free(space->id, TRUE);
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Raises the max tablespace id counter to the given number if that number is
+bigger than the current value. A number at or above SRV_LOG_SPACE_FIRST_ID
+is reported as a fatal error. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+	ulint	max_id)	/*!< in: maximum known id */
+{
+	if (max_id >= SRV_LOG_SPACE_FIRST_ID) {
+		fprintf(stderr,
+			"InnoDB: Fatal error: max tablespace id"
+			" is too high, %lu\n", (ulong) max_id);
+		ut_error;
+	}
+
+	mutex_enter(&fil_system->mutex);
+
+	if (max_id > fil_system->max_assigned_id) {
+		/* The counter only ever grows here */
+
+		fil_system->max_assigned_id = max_id;
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Writes the flushed lsn to the page header of the first page of a data file
+of the system tablespace (space 0), which is uncompressed. The page is read,
+the FIL_PAGE_FILE_FLUSH_LSN field is overwritten, and the page is written
+back. The archived log number parameter is not used. The results of the
+read and the write are not checked.
+@return	always DB_SUCCESS */
+static
+ulint
+fil_write_lsn_and_arch_no_to_file(
+/*==============================*/
+	ulint		sum_of_sizes,	/*!< in: combined size of previous files
+					in space, in database pages */
+	ib_uint64_t	lsn,		/*!< in: lsn to write */
+	ulint		arch_log_no __attribute__((unused)))
+					/*!< in: archived log number to write */
+{
+	byte*	unaligned_buf;
+	byte*	page;
+
+	/* Allocate two pages so that one page-aligned page fits inside */
+	unaligned_buf = mem_alloc(2 * UNIV_PAGE_SIZE);
+	page = ut_align(unaligned_buf, UNIV_PAGE_SIZE);
+
+	/* Read-modify-write of the page at offset sum_of_sizes */
+	fil_read(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, page, NULL);
+
+	mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, lsn);
+
+	fil_write(TRUE, 0, 0, sum_of_sizes, 0, UNIV_PAGE_SIZE, page, NULL);
+
+	mem_free(unaligned_buf);
+
+	return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+The fil_system mutex is released around each individual file write.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+	ib_uint64_t	lsn,		/*!< in: lsn to write */
+	ulint		arch_log_no)	/*!< in: latest archived log
+					file number */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		sum_of_sizes;
+	ulint		err;
+
+	mutex_enter(&fil_system->mutex);
+
+	for (space = UT_LIST_GET_FIRST(fil_system->space_list);
+	     space != NULL;
+	     space = UT_LIST_GET_NEXT(space_list, space)) {
+
+		/* We only write the lsn to all existing data files which have
+		been open during the lifetime of the mysqld process; they are
+		represented by the space objects in the tablespace memory
+		cache. Note that all data files in the system tablespace 0 are
+		always open. */
+
+		if (space->purpose != FIL_TABLESPACE || space->id != 0) {
+
+			continue;
+		}
+
+		/* Offset, in pages, of the current file within the space */
+		sum_of_sizes = 0;
+
+		for (node = UT_LIST_GET_FIRST(space->chain);
+		     node != NULL;
+		     node = UT_LIST_GET_NEXT(chain, node)) {
+
+			/* The write is done without holding the mutex */
+			mutex_exit(&fil_system->mutex);
+
+			err = fil_write_lsn_and_arch_no_to_file(
+				sum_of_sizes, lsn, arch_log_no);
+
+			if (err != DB_SUCCESS) {
+				/* The mutex is not held at this point */
+
+				return(err);
+			}
+
+			mutex_enter(&fil_system->mutex);
+
+			sum_of_sizes += node->size;
+		}
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Reads the flushed lsn field from the first page of a data file at database
+startup and folds it into the running minimum and maximum. When
+one_read_already is FALSE, the minimum and maximum are simply set to the
+value read. The result of the file read is not checked.
+NOTE(review): the UNIV_LOG_ARCHIVE branches refer to arch_log_no, which is
+neither a parameter nor a local variable of this function as visible here;
+confirm whether those branches can compile. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+	os_file_t	data_file,		/*!< in: open data file */
+	ibool		one_read_already,	/*!< in: TRUE if min and max
+						parameters below already
+						contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint*		min_arch_log_no,	/*!< in/out: */
+	ulint*		max_arch_log_no,	/*!< in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t*	min_flushed_lsn,	/*!< in/out: */
+	ib_uint64_t*	max_flushed_lsn)	/*!< in/out: */
+{
+	byte*		page;
+	byte*		unaligned_buf;
+	ib_uint64_t	flushed_lsn;
+
+	unaligned_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+	/* Align the memory for a possible read from a raw device */
+	page = ut_align(unaligned_buf, UNIV_PAGE_SIZE);
+
+	os_file_read(data_file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	flushed_lsn = mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN);
+
+	ut_free(unaligned_buf);
+
+	/* On the first read the old contents of the in/out parameters are
+	not looked at; the short-circuit evaluation guarantees that */
+
+	if (!one_read_already || *min_flushed_lsn > flushed_lsn) {
+		*min_flushed_lsn = flushed_lsn;
+	}
+	if (!one_read_already || *max_flushed_lsn < flushed_lsn) {
+		*max_flushed_lsn = flushed_lsn;
+	}
+#ifdef UNIV_LOG_ARCHIVE
+	if (!one_read_already || *min_arch_log_no > arch_log_no) {
+		*min_arch_log_no = arch_log_no;
+	}
+	if (!one_read_already || *max_arch_log_no < arch_log_no) {
+		*max_arch_log_no = arch_log_no;
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+}
+
+/*================ SINGLE-TABLE TABLESPACES ==========================*/
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Increments the count of pending insert buffer page merges, if space is not
+being deleted. An error is printed if the space is not found in the
+tablespace memory cache.
+@return	TRUE if being deleted, and ibuf merges should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+	ibool		skip_merge;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space == NULL) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to do ibuf merge to a"
+			" dropped tablespace %lu\n",
+			(ulong) id);
+
+		skip_merge = TRUE;
+	} else if (space->stop_ibuf_merges) {
+		skip_merge = TRUE;
+	} else {
+		space->n_pending_ibuf_merges++;
+
+		skip_merge = FALSE;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(skip_merge);
+}
+
+/*******************************************************************//**
+Decrements the count of pending insert buffer page merges. If the space is
+not found in the tablespace memory cache, only an error is printed. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+	ulint	id)	/*!< in: space id */
+{
+	fil_space_t*	space;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space != NULL) {
+		space->n_pending_ibuf_merges--;
+	} else {
+		fprintf(stderr,
+			"InnoDB: Error: decrementing ibuf merge of a"
+			" dropped tablespace %lu\n",
+			(ulong) id);
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Creates the database directory for a table if it does not exist yet. The
+directory path is built as fil_path_to_mysql_datadir, a '/', and the part
+of the name that precedes its first '/'. */
+static
+void
+fil_create_directory_for_tablename(
+/*===============================*/
+	const char*	name)	/*!< in: name in the standard
+				'databasename/tablename' format */
+{
+	const char*	namend;
+	char*		path;
+	ulint		datadir_len;
+	ulint		dbname_len;
+
+	/* The database name ends at the first slash */
+	namend = strchr(name, '/');
+	ut_a(namend);
+
+	datadir_len = strlen(fil_path_to_mysql_datadir);
+	dbname_len = (ulint) (namend - name);
+
+	/* Room for datadir, '/', database name and the terminating NUL */
+	path = mem_alloc(datadir_len + dbname_len + 2);
+
+	memcpy(path, fil_path_to_mysql_datadir, datadir_len);
+	path[datadir_len] = '/';
+	memcpy(path + datadir_len + 1, name, dbname_len);
+	path[datadir_len + dbname_len + 1] = 0;
+
+	srv_normalize_path_for_win(path);
+
+	ut_a(os_file_create_directory(path, FALSE));
+	mem_free(path);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes a log record about an .ibd file create/rename/delete. The record
+consists of the initial file-op record, the 4-byte flags (only for
+MLOG_FILE_CREATE2), the 2-byte length of the name including its
+terminating NUL, the name itself and, for MLOG_FILE_RENAME, the 2-byte
+length and the contents of the new name in the same format. Nothing is
+written if mlog_open() returns NULL. */
+static
+void
+fil_op_write_log(
+/*=============*/
+	ulint		type,		/*!< in: MLOG_FILE_CREATE,
+					MLOG_FILE_CREATE2,
+					MLOG_FILE_DELETE, or
+					MLOG_FILE_RENAME */
+	ulint		space_id,	/*!< in: space id */
+	ulint		log_flags,	/*!< in: redo log flags (stored
+					in the page number field) */
+	ulint		flags,		/*!< in: compressed page size
+					and file format
+					if type==MLOG_FILE_CREATE2, or 0 */
+	const char*	name,		/*!< in: table name in the familiar
+					'databasename/tablename' format, or
+					the file path in the case of
+					MLOG_FILE_DELETE */
+	const char*	new_name,	/*!< in: if type is MLOG_FILE_RENAME,
+					the new table name in the
+					'databasename/tablename' format */
+	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+{
+	byte*	log_ptr;
+	ulint	len;
+
+	/* Reserve room for everything written before mlog_close() below:
+	up to 11 bytes for the initial record, 4 bytes for the flags of
+	MLOG_FILE_CREATE2, and 2 bytes for the name length. The previous
+	reservation of 11 + 2 + 1 bytes did not account for the flags. */
+	log_ptr = mlog_open(mtr, 11 + 4 + 2 + 1);
+
+	if (!log_ptr) {
+		/* Logging in mtr is switched off during crash recovery:
+		in that case mlog_open returns NULL */
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_for_file_op(
+		type, space_id, log_flags, log_ptr, mtr);
+	if (type == MLOG_FILE_CREATE2) {
+		mach_write_to_4(log_ptr, flags);
+		log_ptr += 4;
+	}
+	/* Let us store the strings as null-terminated for easier readability
+	and handling */
+
+	len = strlen(name) + 1;
+
+	mach_write_to_2(log_ptr, len);
+	log_ptr += 2;
+	mlog_close(mtr, log_ptr);
+
+	mlog_catenate_string(mtr, (byte*) name, len);
+
+	if (type == MLOG_FILE_RENAME) {
+		/* Append the new name, again prefixed by its length */
+		len = strlen(new_name) + 1;
+		log_ptr = mlog_open(mtr, 2 + len);
+		ut_a(log_ptr);
+		mach_write_to_2(log_ptr, len);
+		log_ptr += 2;
+		mlog_close(mtr, log_ptr);
+
+		mlog_catenate_string(mtr, (byte*) new_name, len);
+	}
+}
+#endif
+
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If space_id is nonzero, also replays the operation. A delete or rename is
+replayed if the space id is in the memory cache (rename: only if the new name
+has no space id yet); a create unless the id or name is known or the record is
+flagged MLOG_FILE_FLAG_TEMP. A missing database directory is created, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+	byte*	ptr,		/*!< in: buffer containing the log record body,
+				or an initial segment of it, if the record does
+				not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/*!< in: buffer end */
+	ulint	type,		/*!< in: the type of this log record */
+	ulint	space_id,	/*!< in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+	ulint	log_flags)	/*!< in: redo log flags
+				(stored in the page number parameter) */
+{
+	ulint		name_len;
+	ulint		new_name_len;
+	const char*	name;
+	const char*	new_name	= NULL;
+	ulint		flags		= 0;
+
+	if (type == MLOG_FILE_CREATE2) {
+		if (end_ptr < ptr + 4) {
+
+			return(NULL);
+		}
+
+		flags = mach_read_from_4(ptr);
+		ptr += 4;
+	}
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	name_len = mach_read_from_2(ptr);
+
+	ptr += 2;
+
+	if (end_ptr < ptr + name_len) {
+
+		return(NULL);
+	}
+
+	name = (const char*) ptr;	/* points into the buffer, no copy */
+
+	ptr += name_len;
+
+	if (type == MLOG_FILE_RENAME) {
+		if (end_ptr < ptr + 2) {
+
+			return(NULL);
+		}
+
+		new_name_len = mach_read_from_2(ptr);
+
+		ptr += 2;
+
+		if (end_ptr < ptr + new_name_len) {
+
+			return(NULL);
+		}
+
+		new_name = (const char*) ptr;
+
+		ptr += new_name_len;
+	}
+
+	/* We managed to parse a full log record body */
+	/*
+	printf("Parsed log rec of type %lu space %lu\n"
+	"name %s\n", type, space_id, name);
+
+	if (type == MLOG_FILE_RENAME) {
+	printf("new name %s\n", new_name);
+	}
+	*/
+	if (!space_id) {	/* the caller asked for parsing only */
+
+		return(ptr);
+	}
+
+	/* Let us try to perform the file operation, if sensible. Note that
+	ibbackup has at this stage already read in all space id info to the
+	fil0fil.c data structures.
+
+	NOTE that our algorithm is not guaranteed to work correctly if there
+	were renames of tables during the backup. See ibbackup code for more
+	on the problem. */
+
+	switch (type) {
+	case MLOG_FILE_DELETE:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			ut_a(fil_delete_tablespace(space_id));
+		}
+
+		break;
+
+	case MLOG_FILE_RENAME:
+		/* We do the rename based on space id, not old file name;
+		this should guarantee that after the log replay each .ibd file
+		has the correct name for the latest log sequence number; the
+		proof is left as an exercise :) */
+
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			/* Create the database directory for the new name, if
+			it does not exist yet */
+			fil_create_directory_for_tablename(new_name);
+
+			/* Rename the table if there is not yet a tablespace
+			with the same name */
+
+			if (fil_get_space_id_for_table(new_name)
+			    == ULINT_UNDEFINED) {
+				/* We do not care of the old name, that is
+				why we pass NULL as the first argument */
+				if (!fil_rename_tablespace(NULL, space_id,
+							   new_name)) {
+					ut_error;
+				}
+			}
+		}
+
+		break;
+
+	case MLOG_FILE_CREATE:
+	case MLOG_FILE_CREATE2:
+		if (fil_tablespace_exists_in_mem(space_id)) {
+			/* Do nothing */
+		} else if (fil_get_space_id_for_table(name)
+			   != ULINT_UNDEFINED) {
+			/* Do nothing */
+		} else if (log_flags & MLOG_FILE_FLAG_TEMP) {
+			/* Temporary table, do nothing */
+		} else {
+			/* Create the database directory for name, if it does
+			not exist yet */
+			fil_create_directory_for_tablename(name);
+
+			if (fil_create_new_single_table_tablespace(
+				    space_id, name, FALSE, flags,
+				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				ut_error;
+			}
+		}
+
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(ptr);
+}
+
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache. Waits until it has no pending ibuf merges, flushes or i/o's.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+	ulint	id)	/*!< in: space id */
+{
+	ibool		success;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		count		= 0;
+	char*		path;
+
+	ut_a(id != 0);
+stop_ibuf_merges:
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space != NULL) {
+		space->stop_ibuf_merges = TRUE;
+
+		if (space->n_pending_ibuf_merges == 0) {
+			mutex_exit(&fil_system->mutex);
+
+			count = 0;	/* the second wait loop counts anew */
+
+			goto try_again;
+		} else {
+			if (count > 5000) {
+				ut_print_timestamp(stderr);
+				fputs("  InnoDB: Warning: trying to"
+				      " delete tablespace ", stderr);
+				ut_print_filename(stderr, space->name);
+				fprintf(stderr, ",\n"
+					"InnoDB: but there are %lu pending"
+					" ibuf merges on it.\n"
+					"InnoDB: Loop %lu.\n",
+					(ulong) space->n_pending_ibuf_merges,
+					(ulong) count);
+			}
+
+			mutex_exit(&fil_system->mutex);
+
+			os_thread_sleep(20000);
+			count++;
+
+			goto stop_ibuf_merges;
+		}
+	}
+
+	mutex_exit(&fil_system->mutex);
+	count = 0;
+
+try_again:
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space == NULL) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: cannot delete tablespace %lu\n"
+			"InnoDB: because it is not found in the"
+			" tablespace memory cache.\n",
+			(ulong) id);
+
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	ut_a(space);
+	ut_a(space->n_pending_ibuf_merges == 0);
+
+	space->is_being_deleted = TRUE;
+
+	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	if (space->n_pending_flushes > 0 || node->n_pending > 0) {
+		if (count > 1000) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Warning: trying to"
+			      " delete tablespace ", stderr);
+			ut_print_filename(stderr, space->name);
+			fprintf(stderr, ",\n"
+				"InnoDB: but there are %lu flushes"
+				" and %lu pending i/o's on it\n"
+				"InnoDB: Loop %lu.\n",
+				(ulong) space->n_pending_flushes,
+				(ulong) node->n_pending,
+				(ulong) count);
+		}
+		mutex_exit(&fil_system->mutex);
+		os_thread_sleep(20000);
+
+		count++;
+
+		goto try_again;
+	}
+
+	path = mem_strdup(space->name);	/* still needed after the space is freed */
+
+	mutex_exit(&fil_system->mutex);
+#ifndef UNIV_HOTBACKUP
+	/* Invalidate in the buffer pool all pages belonging to the
+	tablespace. Since we have set space->is_being_deleted = TRUE, readahead
+	or ibuf merge can no longer read more pages of this tablespace to the
+	buffer pool. Thus we can clean the tablespace out of the buffer pool
+	completely and permanently. The flag is_being_deleted also prevents
+	fil_flush() from being applied to this tablespace. */
+
+	buf_LRU_invalidate_tablespace(id);
+#endif
+	/* printf("Deleting tablespace %s id %lu\n", space->name, id); */
+
+	success = fil_space_free(id, FALSE);
+
+	if (success) {
+		success = os_file_delete(path);
+
+		if (!success) {
+			success = os_file_delete_if_exists(path);
+		}
+	}
+
+	if (success) {
+#ifndef UNIV_HOTBACKUP
+		/* Write a log record about the deletion of the .ibd
+		file, so that ibbackup can replay it in the
+		--apply-log phase. We use a dummy mtr and the familiar
+		log write mechanism. */
+		mtr_t		mtr;
+
+		/* When replaying the operation in ibbackup, do not try
+		to write any log record */
+		mtr_start(&mtr);
+
+		fil_op_write_log(MLOG_FILE_DELETE, id, 0, 0, path, NULL, &mtr);
+		mtr_commit(&mtr);
+#endif
+		mem_free(path);
+
+		return(TRUE);
+	}
+
+	mem_free(path);
+
+	return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace, which must be present in the tablespace
+memory cache. A discard works like a delete, with these differences:
+1) the table is not dropped from the data dictionary;
+2) every insert buffer entry for the tablespace is removed right away, while
+DROP TABLE leaves them to be removed gradually in the background;
+3) when the user later does IMPORT TABLESPACE, the tablespace gets the same id
+as it originally had.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+	ulint	id)	/*!< in: space id */
+{
+	/* A discard begins as an ordinary delete of the tablespace; the
+	outcome of that delete is what the caller gets back */
+	const ibool	deleted = fil_delete_tablespace(id);
+
+	if (!deleted) {
+		/* Warn only; the ibuf entries are removed below regardless */
+		fprintf(stderr,
+			"InnoDB: Warning: cannot delete tablespace %lu"
+			" in DISCARD TABLESPACE.\n"
+			"InnoDB: But let us remove the"
+			" insert buffer entries for this tablespace.\n",
+			(ulong) id);
+	}
+
+	/* Purge the insert buffer entries whether or not the delete worked */
+	ibuf_delete_for_discarded_space(id);
+
+	return(deleted);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Renames the memory cache structures of a single-table tablespace.
+@return	TRUE if success */
+static
+ibool
+fil_rename_tablespace_in_mem(
+/*=========================*/
+	fil_space_t*	space,	/*!< in: tablespace memory object */
+	fil_node_t*	node,	/*!< in: file node of that tablespace */
+	const char*	path)	/*!< in: new name */
+{
+	ut_ad(mutex_own(&fil_system->mutex));
+
+	/* The current name must resolve to this very tablespace object */
+	if (fil_space_get_by_name(space->name) != space) {
+		fputs("InnoDB: Error: cannot find ", stderr);
+		ut_print_filename(stderr, space->name);
+		fputs(" in tablespace memory cache\n", stderr);
+
+		return(FALSE);
+	}
+
+	/* The new name must not be taken by any cached tablespace */
+	if (fil_space_get_by_name(path) != NULL) {
+		fputs("InnoDB: Error: ", stderr);
+		ut_print_filename(stderr, path);
+		fputs(" is already in tablespace memory cache\n", stderr);
+
+		return(FALSE);
+	}
+
+	/* Unhash under the old name while that string is still valid */
+	HASH_DELETE(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(space->name), space);
+
+	/* Replace the name copies held by the space and by its node */
+	mem_free(space->name);
+	mem_free(node->name);
+	space->name = mem_strdup(path);
+	node->name = mem_strdup(path);
+
+	/* Hash the space again, now under the new name */
+	HASH_INSERT(fil_space_t, name_hash, fil_system->name_hash,
+		    ut_fold_string(path), space);
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Builds the .ibd file name of a single-table tablespace in a freshly allocated
+string, which the caller must release with mem_free().
+@return	own: file name */
+static
+char*
+fil_make_ibd_name(
+/*==============*/
+	const char*	name,		/*!< in: table name or a dir path of a
+					TEMPORARY table */
+	ibool		is_temp)	/*!< in: TRUE if it is a dir path */
+{
+	const ulint	name_len = strlen(name);
+	const ulint	dir_len = strlen(fil_path_to_mysql_datadir);
+	char*		ibd_name = mem_alloc(name_len + dir_len + sizeof "/.ibd");
+	char*		dst = ibd_name;
+
+	if (!is_temp) {
+		/* Ordinary table: the name is prefixed with "<datadir>/" */
+		memcpy(dst, fil_path_to_mysql_datadir, dir_len);
+		dst += dir_len;
+		*dst++ = '/';
+	}
+	/* The name itself, then ".ibd" together with its terminating NUL */
+	memcpy(dst, name, name_len);
+	memcpy(dst + name_len, ".ibd", sizeof ".ibd");
+
+	srv_normalize_path_for_win(ibd_name);
+
+	return(ibd_name);
+}
+
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache. Waits for pending i/o and closes the file first.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+	const char*	old_name,	/*!< in: old table name in the standard
+					databasename/tablename format of
+					InnoDB, or NULL if we do the rename
+					based on the space id only */
+	ulint		id,		/*!< in: space id */
+	const char*	new_name)	/*!< in: new table name in the standard
+					databasename/tablename format
+					of InnoDB */
+{
+	ibool		success;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		count		= 0;
+	char*		path;
+	ibool		old_name_was_specified		= TRUE;
+	char*		old_path;
+
+	ut_a(id != 0);
+
+	if (old_name == NULL) {
+		old_name = "(name not specified)";	/* used in messages and the log */
+		old_name_was_specified = FALSE;
+	}
+retry:
+	count++;
+
+	if (count > 1000) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Warning: problems renaming ", stderr);
+		ut_print_filename(stderr, old_name);
+		fputs(" to ", stderr);
+		ut_print_filename(stderr, new_name);
+		fprintf(stderr, ", %lu iterations\n", (ulong) count);
+	}
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	if (space == NULL) {
+		fprintf(stderr,
+			"InnoDB: Error: cannot find space id %lu"
+			" in the tablespace memory cache\n"
+			"InnoDB: though the table ", (ulong) id);
+		ut_print_filename(stderr, old_name);
+		fputs(" in a rename operation should have that id\n", stderr);
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	if (count > 25000) {	/* give up waiting and let i/o resume */
+		space->stop_ios = FALSE;
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	/* We temporarily close the .ibd file because we do not trust that
+	operating systems can rename an open file. For the closing we have to
+	wait until there are no pending i/o's or flushes on the file. */
+
+	space->stop_ios = TRUE;
+
+	ut_a(UT_LIST_GET_LEN(space->chain) == 1);
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	if (node->n_pending > 0 || node->n_pending_flushes > 0) {
+		/* There are pending i/o's or flushes, sleep for a while and
+		retry */
+
+		mutex_exit(&fil_system->mutex);
+
+		os_thread_sleep(20000);
+
+		goto retry;
+
+	} else if (node->modification_counter > node->flush_counter) {
+		/* Flush the space */
+
+		mutex_exit(&fil_system->mutex);
+
+		os_thread_sleep(20000);
+
+		fil_flush(id);
+
+		goto retry;
+
+	} else if (node->open) {
+		/* Close the file */
+
+		fil_node_close_file(node, fil_system);
+	}
+
+	/* Check that the old name in the space is right */
+
+	if (old_name_was_specified) {
+		old_path = fil_make_ibd_name(old_name, FALSE);
+
+		ut_a(strcmp(space->name, old_path) == 0);
+		ut_a(strcmp(node->name, old_path) == 0);
+	} else {
+		old_path = mem_strdup(space->name);	/* rename by id only */
+	}
+
+	/* Rename the tablespace and the node in the memory cache */
+	path = fil_make_ibd_name(new_name, FALSE);
+	success = fil_rename_tablespace_in_mem(space, node, path);
+
+	if (success) {
+		success = os_file_rename(old_path, path);
+
+		if (!success) {
+			/* We have to revert the changes we made
+			to the tablespace memory cache */
+
+			ut_a(fil_rename_tablespace_in_mem(space, node,
+							  old_path));
+		}
+	}
+
+	mem_free(path);
+	mem_free(old_path);
+
+	space->stop_ios = FALSE;
+
+	mutex_exit(&fil_system->mutex);
+
+#ifndef UNIV_HOTBACKUP
+	if (success) {
+		mtr_t		mtr;
+
+		mtr_start(&mtr);
+
+		fil_op_write_log(MLOG_FILE_RENAME, id, 0, 0, old_name, new_name,
+				 &mtr);
+		mtr_commit(&mtr);
+	}
+#endif
+	return(success);
+}
+
+/*******************************************************************//**
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
+dir of the mysqld server.
+@return	DB_SUCCESS, or DB_TABLESPACE_ALREADY_EXISTS, DB_OUT_OF_FILE_SPACE, DB_ERROR */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+	ulint		space_id,	/*!< in: space id */
+	const char*	tablename,	/*!< in: the table name in the usual
+					databasename/tablename format
+					of InnoDB, or a dir path to a temp
+					table */
+	ibool		is_temp,	/*!< in: TRUE if a table created with
+					CREATE TEMPORARY TABLE */
+	ulint		flags,		/*!< in: tablespace flags */
+	ulint		size)		/*!< in: the initial size of the
+					tablespace file in pages,
+					must be >= FIL_IBD_FILE_INITIAL_SIZE */
+{
+	os_file_t	file;
+	ibool		ret;
+	ulint		err;
+	byte*		buf2;
+	byte*		page;
+	ibool		success;
+	char*		path;
+
+	ut_a(space_id > 0);
+	ut_a(space_id < SRV_LOG_SPACE_FIRST_ID);
+	ut_a(size >= FIL_IBD_FILE_INITIAL_SIZE);
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT
+	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
+	format, the tablespace flags should equal
+	(table->flags & ~(~0 << DICT_TF_BITS)). */
+	ut_a(flags != DICT_TF_COMPACT);
+	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+	path = fil_make_ibd_name(tablename, is_temp);
+
+	file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL,
+			      OS_DATA_FILE, &ret);
+	if (ret == FALSE) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error creating file ", stderr);
+		ut_print_filename(stderr, path);
+		fputs(".\n", stderr);
+
+		/* The following call will print an error message */
+
+		err = os_file_get_last_error(TRUE);
+
+		if (err == OS_FILE_ALREADY_EXISTS) {
+			fputs("InnoDB: The file already exists though"
+			      " the corresponding table did not\n"
+			      "InnoDB: exist in the InnoDB data dictionary."
+			      " Have you moved InnoDB\n"
+			      "InnoDB: .ibd files around without using the"
+			      " SQL commands\n"
+			      "InnoDB: DISCARD TABLESPACE and"
+			      " IMPORT TABLESPACE, or did\n"
+			      "InnoDB: mysqld crash in the middle of"
+			      " CREATE TABLE? You can\n"
+			      "InnoDB: resolve the problem by"
+			      " removing the file ", stderr);
+			ut_print_filename(stderr, path);
+			fputs("\n"
+			      "InnoDB: under the 'datadir' of MySQL.\n",
+			      stderr);
+
+			mem_free(path);
+			return(DB_TABLESPACE_ALREADY_EXISTS);
+		}
+
+		if (err == OS_FILE_DISK_FULL) {
+
+			mem_free(path);
+			return(DB_OUT_OF_FILE_SPACE);
+		}
+
+		mem_free(path);
+		return(DB_ERROR);
+	}
+
+	ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0);
+
+	if (!ret) {
+		err = DB_OUT_OF_FILE_SPACE;
+error_exit:
+		os_file_close(file);
+error_exit2:
+		os_file_delete(path);	/* remove the partially created file */
+
+		mem_free(path);
+		return(err);
+	}
+
+	/* printf("Creating tablespace %s id %lu\n", path, space_id); */
+
+	/* We have to write the space id to the file immediately and flush the
+	file to disk. This is because in crash recovery we must be aware what
+	tablespaces exist and what are their space id's, so that we can apply
+	the log records to the right file. It may take quite a while until
+	buffer pool flush algorithms write anything to the file and flush it to
+	disk. If we would not write here anything, the file would be filled
+	with zeros from the call of os_file_set_size(), until a buffer pool
+	flush would write to it. */
+
+	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+	/* Align the memory for file i/o if we might have O_DIRECT set */
+	page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+	memset(page, '\0', UNIV_PAGE_SIZE);
+
+	fsp_header_init_fields(page, space_id, flags);
+	mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+
+	if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+		buf_flush_init_for_writing(page, NULL, 0);
+		ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE);
+	} else {
+		page_zip_des_t	page_zip;
+		ulint		zip_size;
+
+		zip_size = ((PAGE_ZIP_MIN_SIZE >> 1)
+			    << ((flags & DICT_TF_ZSSIZE_MASK)
+				>> DICT_TF_ZSSIZE_SHIFT));
+
+		page_zip_set_size(&page_zip, zip_size);
+		page_zip.data = page + UNIV_PAGE_SIZE;	/* next page in buf2 */
+#ifdef UNIV_DEBUG
+		page_zip.m_start =
+#endif /* UNIV_DEBUG */
+			page_zip.m_end = page_zip.m_nonempty =
+			page_zip.n_blobs = 0;
+		buf_flush_init_for_writing(page, &page_zip, 0);
+		ret = os_file_write(path, file, page_zip.data, 0, 0, zip_size);
+	}
+
+	ut_free(buf2);
+
+	if (!ret) {
+		fputs("InnoDB: Error: could not write the first page"
+		      " to tablespace ", stderr);
+		ut_print_filename(stderr, path);
+		putc('\n', stderr);
+		err = DB_ERROR;
+		goto error_exit;
+	}
+
+	ret = os_file_flush(file);
+
+	if (!ret) {
+		fputs("InnoDB: Error: file flush of tablespace ", stderr);
+		ut_print_filename(stderr, path);
+		fputs(" failed\n", stderr);
+		err = DB_ERROR;
+		goto error_exit;
+	}
+
+	os_file_close(file);
+
+	success = fil_space_create(path, space_id, flags, FIL_TABLESPACE);
+
+	if (!success) {
+		err = DB_ERROR;
+		goto error_exit2;	/* the file is already closed here */
+	}
+
+	fil_node_create(path, size, space_id, FALSE);
+
+#ifndef UNIV_HOTBACKUP
+	{
+		mtr_t		mtr;
+
+		mtr_start(&mtr);
+
+		fil_op_write_log(flags
+				 ? MLOG_FILE_CREATE2
+				 : MLOG_FILE_CREATE,
+				 space_id,
+				 is_temp ? MLOG_FILE_FLAG_TEMP : 0,
+				 flags,
+				 tablename, NULL, &mtr);
+
+		mtr_commit(&mtr);
+	}
+#endif
+	mem_free(path);
+	return(DB_SUCCESS);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn.
+@return	TRUE if success (also when no reset was needed) */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+	const char*	name,		/*!< in: table name in the
+					databasename/tablename format */
+	ib_uint64_t	current_lsn)	/*!< in: reset lsn's if the lsn stamped
+					to FIL_PAGE_FILE_FLUSH_LSN in the
+					first page is too high */
+{
+	os_file_t	file;
+	char*		filepath;
+	byte*		page;
+	byte*		buf2;
+	ib_uint64_t	flush_lsn;
+	ulint		space_id;
+	ib_int64_t	file_size;
+	ib_int64_t	offset;
+	ulint		zip_size;
+	ibool		success;
+	page_zip_des_t	page_zip;
+
+	filepath = fil_make_ibd_name(name, FALSE);
+
+	file = os_file_create_simple_no_error_handling(
+		filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: trying to open a table,"
+		      " but could not\n"
+		      "InnoDB: open the tablespace file ", stderr);
+		ut_print_filename(stderr, filepath);
+		fputs("!\n", stderr);
+		mem_free(filepath);
+
+		return(FALSE);
+	}
+
+	/* Read the first page of the tablespace */
+
+	buf2 = ut_malloc(3 * UNIV_PAGE_SIZE);
+	/* Align the memory for file i/o if we might have O_DIRECT set */
+	page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	/* We have to read the file flush lsn from the header of the file */
+
+	flush_lsn = mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN);
+
+	if (current_lsn >= flush_lsn) {
+		/* Ok */
+		success = TRUE;
+
+		goto func_exit;
+	}
+
+	space_id = fsp_header_get_space_id(page);
+	zip_size = fsp_header_get_zip_size(page);
+
+	page_zip_des_init(&page_zip);
+	page_zip_set_size(&page_zip, zip_size);
+	if (zip_size) {
+		page_zip.data = page + UNIV_PAGE_SIZE;	/* next page in buf2 */
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Flush lsn in the tablespace file %lu"
+		" to be imported\n"
+		"InnoDB: is %llu, which exceeds current"
+		" system lsn %llu.\n"
+		"InnoDB: We reset the lsn's in the file ",
+		(ulong) space_id,
+		flush_lsn, current_lsn);
+	ut_print_filename(stderr, filepath);
+	fputs(".\n", stderr);
+
+	ut_a(ut_is_2pow(zip_size));
+	ut_a(zip_size <= UNIV_PAGE_SIZE);
+
+	/* Loop through all the pages in the tablespace and reset the lsn and
+	the page checksum if necessary */
+
+	file_size = os_file_get_size_as_iblonglong(file);
+
+	for (offset = 0; offset < file_size;
+	     offset += zip_size ? zip_size : UNIV_PAGE_SIZE) {
+		success = os_file_read(file, page,
+				       (ulint)(offset & 0xFFFFFFFFUL),
+				       (ulint)(offset >> 32),
+				       zip_size ? zip_size : UNIV_PAGE_SIZE);
+		if (!success) {
+
+			goto func_exit;
+		}
+		if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) {
+			/* We have to reset the lsn */
+
+			if (zip_size) {
+				memcpy(page_zip.data, page, zip_size);
+				buf_flush_init_for_writing(
+					page, &page_zip, current_lsn);
+				success = os_file_write(
+					filepath, file, page_zip.data,
+					(ulint) offset & 0xFFFFFFFFUL,
+					(ulint) (offset >> 32), zip_size);
+			} else {
+				buf_flush_init_for_writing(
+					page, NULL, current_lsn);
+				success = os_file_write(
+					filepath, file, page,
+					(ulint)(offset & 0xFFFFFFFFUL),
+					(ulint)(offset >> 32),
+					UNIV_PAGE_SIZE);
+			}
+
+			if (!success) {
+
+				goto func_exit;
+			}
+		}
+	}
+
+	success = os_file_flush(file);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	/* We now update the flush_lsn stamp at the start of the file */
+	success = os_file_read(file, page, 0, 0,
+			       zip_size ? zip_size : UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+
+	mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
+
+	success = os_file_write(filepath, file, page, 0, 0,
+				zip_size ? zip_size : UNIV_PAGE_SIZE);
+	if (!success) {
+
+		goto func_exit;
+	}
+	success = os_file_flush(file);
+func_exit:
+	os_file_close(file);	/* common cleanup for every path after the open */
+	ut_free(buf2);
+	mem_free(filepath);
+
+	return(success);
+}
+
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+	ibool		check_space_id,	/*!< in: should we check that the space
+					id in the file is right; we assume
+					that this function runs much faster
+					if no check is made, since accessing
+					the file inode probably is much
+					faster (the OS caches them) than
+					accessing the first page of the file */
+	ulint		id,		/*!< in: space id */
+	ulint		flags,		/*!< in: tablespace flags */
+	const char*	name)		/*!< in: table name in the
+					databasename/tablename format */
+{
+	os_file_t	file;
+	char*		filepath;
+	ibool		success;
+	byte*		buf2;
+	byte*		page;
+	ulint		space_id;
+	ulint		space_flags;
+
+	filepath = fil_make_ibd_name(name, FALSE);
+
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT
+	((table->flags & ~(~0 << DICT_TF_BITS)) == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
+	format, the tablespace flags should equal
+	(table->flags & ~(~0 << DICT_TF_BITS)). */
+	ut_a(flags != DICT_TF_COMPACT);
+	ut_a(!(flags & (~0UL << DICT_TF_BITS)));
+
+	file = os_file_create_simple_no_error_handling(
+		filepath, OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: trying to open a table,"
+		      " but could not\n"
+		      "InnoDB: open the tablespace file ", stderr);
+		ut_print_filename(stderr, filepath);
+		fputs("!\n"
+		      "InnoDB: Have you moved InnoDB .ibd files around"
+		      " without using the\n"
+		      "InnoDB: commands DISCARD TABLESPACE and"
+		      " IMPORT TABLESPACE?\n"
+		      "InnoDB: It is also possible that this is"
+		      " a temporary table #sql...,\n"
+		      "InnoDB: and MySQL removed the .ibd file for this.\n"
+		      "InnoDB: Please refer to\n"
+		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+		      "InnoDB: for how to resolve the issue.\n", stderr);
+
+		mem_free(filepath);
+
+		return(FALSE);
+	}
+
+	if (!check_space_id) {
+		space_id = id;
+
+		goto skip_check;
+	}
+
+	/* Read the first page of the tablespace */
+
+	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+	/* Align the memory for file i/o if we might have O_DIRECT set */
+	page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+	success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+	/* We have to read the tablespace id and flags from the file. */
+
+	space_id = fsp_header_get_space_id(page);
+	space_flags = fsp_header_get_flags(page);
+
+	if (srv_expand_import
+	    && (space_id != id || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
+		ibool		file_is_corrupt = FALSE;
+		byte*		buf3;
+		byte*		descr_page;
+		ibool		descr_is_corrupt = FALSE;
+		dulint		old_id[31];
+		dulint		new_id[31];
+		ulint		root_page[31];
+		ulint		n_index;
+		os_file_t	info_file = (os_file_t) -1;
+		char*		info_file_path;
+		ulint	i;
+		int		len;
+		ib_uint64_t	current_lsn;
+		ulint		size_low, size_high, size, free_limit;
+		ib_int64_t	size_bytes, free_limit_bytes;
+		dict_table_t*	table;
+		dict_index_t*	index;
+		fil_system_t*	system;
+		fil_node_t*	node = NULL;
+		fil_space_t*	space;
+
+		buf3 = ut_malloc(2 * UNIV_PAGE_SIZE);
+		descr_page = ut_align(buf3, UNIV_PAGE_SIZE);
+
+		current_lsn = log_get_lsn();
+
+		/* check the header page's consistency */
+		if (buf_page_is_corrupted(page,
+					  dict_table_flags_to_zip_size(space_flags))) {
+			fprintf(stderr, "InnoDB: page 0 of %s seems corrupt.\n", filepath);
+			file_is_corrupt = TRUE;
+			descr_is_corrupt = TRUE;
+		}
+
+		/* store as first descr page */
+		memcpy(descr_page, page, UNIV_PAGE_SIZE);
+
+		/* get free limit (page number) of the table space */
+/* these should be same to the definition in fsp0fsp.c */
+#define FSP_HEADER_OFFSET	FIL_PAGE_DATA
+#define	FSP_FREE_LIMIT		12
+		free_limit = mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT + page);
+		free_limit_bytes = (ib_int64_t)free_limit * (ib_int64_t)UNIV_PAGE_SIZE;
+
+		/* overwrite fsp header */
+		fsp_header_init_fields(page, id, flags);
+		mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
+		space_id = id;
+		space_flags = flags;
+		if (mach_read_ull(page + FIL_PAGE_FILE_FLUSH_LSN) > current_lsn)
+			mach_write_ull(page + FIL_PAGE_FILE_FLUSH_LSN, current_lsn);
+		mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+				srv_use_checksums
+				? (!srv_fast_checksum
+				   ? buf_calc_page_new_checksum(page)
+				   : buf_calc_page_new_checksum_32(page))
+						: BUF_NO_CHECKSUM_MAGIC);
+		mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+				srv_use_checksums
+				? buf_calc_page_old_checksum(page)
+						: BUF_NO_CHECKSUM_MAGIC);
+		success = os_file_write(filepath, file, page, 0, 0, UNIV_PAGE_SIZE);
+
+		/* get file size */
+		os_file_get_size(file, &size_low, &size_high);
+		size_bytes = (((ib_int64_t)size_high) << 32)
+				+ (ib_int64_t)size_low;
+
+		if (size_bytes < free_limit_bytes) {
+			free_limit_bytes = size_bytes;
+			fprintf(stderr, "InnoDB: free limit of %s is larger than its real size.\n", filepath);
+			file_is_corrupt = TRUE;
+		}
+
+		/* get cruster index information */
+		table = dict_table_get_low(name);
+		index = dict_table_get_first_index(table);
+		ut_a(index->page==3);
+
+		/* read metadata from .exp file */
+		n_index = 0;
+		memset(old_id, 0, sizeof(old_id));
+		memset(new_id, 0, sizeof(new_id));
+		memset(root_page, 0, sizeof(root_page));
+
+		info_file_path = fil_make_ibd_name(name, FALSE);
+		len = strlen(info_file_path);
+		info_file_path[len - 3] = 'e';
+		info_file_path[len - 2] = 'x';
+		info_file_path[len - 1] = 'p';
+
+		info_file = os_file_create_simple_no_error_handling(
+				info_file_path, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+		if (!success) {
+			fprintf(stderr, "InnoDB: cannot open %s\n", info_file_path);
+			file_is_corrupt = TRUE;
+			goto skip_info;
+		}
+		success = os_file_read(info_file, page, 0, 0, UNIV_PAGE_SIZE);
+		if (!success) {
+			fprintf(stderr, "InnoDB: cannot read %s\n", info_file_path);
+			file_is_corrupt = TRUE;
+			goto skip_info;
+		}
+		if (mach_read_from_4(page) != 0x78706f72UL
+		    || mach_read_from_4(page + 4) != 0x74696e66UL) {
+			fprintf(stderr, "InnoDB: %s seems not to be a correct .exp file\n", info_file_path);
+			file_is_corrupt = TRUE;
+			goto skip_info;
+		}
+
+		fprintf(stderr, "InnoDB: import: extended import of %s is started.\n", name);
+
+		n_index = mach_read_from_4(page + 8);
+		fprintf(stderr, "InnoDB: import: %lu indexes are detected.\n", (ulong)n_index);
+		for (i = 0; i < n_index; i++) {
+			new_id[i] =
+				dict_table_get_index_on_name(table,
+						(char*)(page + (i + 1) * 512 + 12))->id;
+			old_id[i] = mach_read_from_8(page + (i + 1) * 512);
+			root_page[i] = mach_read_from_4(page + (i + 1) * 512 + 8);
+		}
+
+skip_info:
+		if (info_file != (os_file_t) -1)
+			os_file_close(info_file);
+
+		/*
+		if (size_bytes >= 1024 * 1024) {
+			size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
+		}
+		*/
+		if (!(flags & DICT_TF_ZSSIZE_MASK)) {
+			mem_heap_t*	heap = NULL;
+			ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+			ulint*		offsets = offsets_;
+			ib_int64_t	offset;
+
+			size = (ulint) (size_bytes / UNIV_PAGE_SIZE);
+			/* over write space id of all pages */
+			rec_offs_init(offsets_);
+
+			fprintf(stderr, "InnoDB: Progress in %%:");
+
+			for (offset = 0; offset < free_limit_bytes; offset += UNIV_PAGE_SIZE) {
+				ulint		checksum_field;
+				ulint		old_checksum_field;
+				ibool		page_is_corrupt;
+
+				success = os_file_read(file, page,
+							(ulint)(offset & 0xFFFFFFFFUL),
+							(ulint)(offset >> 32), UNIV_PAGE_SIZE);
+
+				page_is_corrupt = FALSE;
+
+				/* check consistency */
+				if (memcmp(page + FIL_PAGE_LSN + 4,
+					   page + UNIV_PAGE_SIZE
+					   - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {
+
+					page_is_corrupt = TRUE;
+				}
+
+				if (mach_read_from_4(page + FIL_PAGE_OFFSET)
+				    != offset / UNIV_PAGE_SIZE) {
+
+					page_is_corrupt = TRUE;
+				}
+
+				checksum_field = mach_read_from_4(page
+								  + FIL_PAGE_SPACE_OR_CHKSUM);
+
+				old_checksum_field = mach_read_from_4(
+					page + UNIV_PAGE_SIZE
+					- FIL_PAGE_END_LSN_OLD_CHKSUM);
+
+				if (old_checksum_field != mach_read_from_4(page
+									   + FIL_PAGE_LSN)
+				    && old_checksum_field != BUF_NO_CHECKSUM_MAGIC
+				    && old_checksum_field
+				    != buf_calc_page_old_checksum(page)) {
+
+					page_is_corrupt = TRUE;
+				}
+
+				if (!srv_fast_checksum
+				    && checksum_field != 0
+				    && checksum_field != BUF_NO_CHECKSUM_MAGIC
+				    && checksum_field
+				    != buf_calc_page_new_checksum(page)) {
+
+					page_is_corrupt = TRUE;
+				}
+
+				if (srv_fast_checksum
+				    && checksum_field != 0
+				    && checksum_field != BUF_NO_CHECKSUM_MAGIC
+				    && checksum_field
+				    != buf_calc_page_new_checksum_32(page)
+				    && checksum_field
+				    != buf_calc_page_new_checksum(page)) {
+
+					page_is_corrupt = TRUE;
+				}
+
+				/* if it is free page, inconsistency is acceptable */
+				if (!offset) {
+					/* header page*/
+					/* it should be overwritten already */
+					ut_a(!page_is_corrupt);
+
+				} else if (!((offset / UNIV_PAGE_SIZE) % UNIV_PAGE_SIZE)) {
+					/* descr page (not header) */
+					if (page_is_corrupt) {
+						file_is_corrupt = TRUE;
+						descr_is_corrupt = TRUE;
+					} else {
+						ut_a(fil_page_get_type(page) == FIL_PAGE_TYPE_XDES);
+						descr_is_corrupt = FALSE;
+					}
+
+					/* store as descr page */
+					memcpy(descr_page, page, UNIV_PAGE_SIZE);
+
+				} else if (descr_is_corrupt) {
+					/* unknown state of the page */
+					if (page_is_corrupt) {
+						file_is_corrupt = TRUE;
+					}
+
+				} else {
+					/* check free page or not */
+					/* These definitions should be same to fsp0fsp.c */
+#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
+#define	XDES_BITS_PER_PAGE	2
+#define	XDES_FREE_BIT		0
+#define	XDES_SIZE							\
+	(XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+					/*descr = descr_page + XDES_ARR_OFFSET + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset)*/
+					/*xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)*/
+					byte*	descr;
+					ulint	index;
+					ulint	byte_index;
+					ulint	bit_index;
+
+					descr = descr_page + XDES_ARR_OFFSET
+						+ XDES_SIZE * (ut_2pow_remainder((offset / UNIV_PAGE_SIZE), UNIV_PAGE_SIZE) / FSP_EXTENT_SIZE);
+
+					index = XDES_FREE_BIT + XDES_BITS_PER_PAGE * ((offset / UNIV_PAGE_SIZE) % FSP_EXTENT_SIZE);
+					byte_index = index / 8;
+					bit_index = index % 8;
+
+					if (ut_bit_get_nth(mach_read_from_1(descr + XDES_BITMAP + byte_index), bit_index)) {
+						/* free page */
+						if (page_is_corrupt) {
+							goto skip_write;
+						}
+					} else {
+						/* not free */
+						if (page_is_corrupt) {
+							file_is_corrupt = TRUE;
+						}
+					}
+				}
+
+				if (page_is_corrupt) {
+					fprintf(stderr, " [errp:%lld]", offset / UNIV_PAGE_SIZE);
+
+					/* cannot treat corrupt page */
+					goto skip_write;
+				}
+
+				if (mach_read_from_4(page + FIL_PAGE_OFFSET) || !offset) {
+					mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id);
+
+					for (i = 0; (ulint) i < n_index; i++) {
+                                                if ((ulint) (offset / UNIV_PAGE_SIZE) == root_page[i]) {
+							/* this is index root page */
+							mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
+											+ FSEG_HDR_SPACE, id);
+							mach_write_to_4(page + FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
+											+ FSEG_HDR_SPACE, id);
+							break;
+						}
+					}
+
+					if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
+						dulint tmp = mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID));
+
+						if (mach_read_from_2(page + PAGE_HEADER + PAGE_LEVEL) == 0
+						    && ut_dulint_cmp(old_id[0], tmp) == 0) {
+							/* leaf page of cluster index, reset trx_id of records */
+							rec_t*	rec;
+							rec_t*	supremum;
+							ulint	n_recs;
+
+							supremum = page_get_supremum_rec(page);
+							rec = page_rec_get_next(page_get_infimum_rec(page));
+							n_recs = page_get_n_recs(page);
+
+							while (rec && rec != supremum && n_recs > 0) {
+								ulint	n_fields;
+								ulint	i;
+								ulint	offset = index->trx_id_offset;
+								offsets = rec_get_offsets(rec, index, offsets,
+										ULINT_UNDEFINED, &heap);
+								n_fields = rec_offs_n_fields(offsets);
+								if (!offset) {
+									offset = row_get_trx_id_offset(rec, index, offsets);
+								}
+								trx_write_trx_id(rec + offset, ut_dulint_create(0, 1));
+
+								for (i = 0; i < n_fields; i++) {
+									if (rec_offs_nth_extern(offsets, i)) {
+										ulint	local_len;
+										byte*	data;
+
+										data = rec_get_nth_field(rec, offsets, i, &local_len);
+
+										local_len -= BTR_EXTERN_FIELD_REF_SIZE;
+
+										mach_write_to_4(data + local_len + BTR_EXTERN_SPACE_ID, id);
+									}
+								}
+
+								rec = page_rec_get_next(rec);
+								n_recs--;
+							}
+						}
+
+						for (i = 0; i < n_index; i++) {
+							if (ut_dulint_cmp(old_id[i], tmp) == 0) {
+								mach_write_to_8(page + (PAGE_HEADER + PAGE_INDEX_ID), new_id[i]);
+								break;
+							}
+						}
+					}
+
+					if (mach_read_ull(page + FIL_PAGE_LSN) > current_lsn) {
+						mach_write_ull(page + FIL_PAGE_LSN, current_lsn);
+						mach_write_ull(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+										current_lsn);
+					}
+
+					mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
+							srv_use_checksums
+							? (!srv_fast_checksum
+							   ? buf_calc_page_new_checksum(page)
+							   : buf_calc_page_new_checksum_32(page))
+									: BUF_NO_CHECKSUM_MAGIC);
+					mach_write_to_4(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM,
+							srv_use_checksums
+							? buf_calc_page_old_checksum(page)
+									: BUF_NO_CHECKSUM_MAGIC);
+
+					success = os_file_write(filepath, file, page,
+								(ulint)(offset & 0xFFFFFFFFUL),
+								(ulint)(offset >> 32), UNIV_PAGE_SIZE);
+				}
+
+skip_write:
+				if (free_limit_bytes
+				    && ((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes)
+					!= ((offset * 100) / free_limit_bytes)) {
+					fprintf(stderr, " %lu",
+						(ulong)((ib_int64_t)((offset + UNIV_PAGE_SIZE) * 100) / free_limit_bytes));
+				}
+			}
+
+			fprintf(stderr, " done.\n");
+
+			/* update SYS_INDEXES set root page */
+			index = dict_table_get_first_index(table);
+			while (index) {
+				for (i = 0; i < n_index; i++) {
+					if (ut_dulint_cmp(new_id[i], index->id) == 0) {
+						break;
+					}
+				}
+
+				if (i != n_index
+				    && root_page[i] != index->page) {
+					/* must update */
+					ulint	error;
+					trx_t*	trx;
+					pars_info_t*	info = NULL;
+
+					trx = trx_allocate_for_mysql();
+					trx->op_info = "extended import";
+
+					info = pars_info_create();
+
+					pars_info_add_dulint_literal(info, "indexid", new_id[i]);
+					pars_info_add_int4_literal(info, "new_page", (lint) root_page[i]);
+
+					error = que_eval_sql(info,
+						"PROCEDURE UPDATE_INDEX_PAGE () IS\n"
+						"BEGIN\n"
+						"UPDATE SYS_INDEXES"
+						" SET PAGE_NO = :new_page"
+						" WHERE ID = :indexid;\n"
+						"COMMIT WORK;\n"
+						"END;\n",
+						FALSE, trx);
+
+					if (error != DB_SUCCESS) {
+						fprintf(stderr, "InnoDB: failed to update SYS_INDEXES\n");
+					}
+
+					trx_commit_for_mysql(trx);
+
+					trx_free_for_mysql(trx);
+
+					index->page = root_page[i];
+				}
+
+				index = dict_table_get_next_index(index);
+			}
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+		} else {
+			/* zip page? */
+			size = (ulint)
+			(size_bytes
+					/ dict_table_flags_to_zip_size(flags));
+			fprintf(stderr, "InnoDB: import: table %s seems to be in newer format."
+					" It may not be able to treated for now.\n", name);
+		}
+		/* .exp file should be removed */
+		success = os_file_delete(info_file_path);
+		if (!success) {
+			success = os_file_delete_if_exists(info_file_path);
+		}
+		mem_free(info_file_path);
+
+		system	= fil_system;
+		mutex_enter(&(system->mutex));
+		space = fil_space_get_by_id(id);
+		if (space)
+			node = UT_LIST_GET_FIRST(space->chain);
+		if (node && node->size < size) {
+			space->size += (size - node->size);
+			node->size = size;
+		}
+		mutex_exit(&(system->mutex));
+
+		ut_free(buf3);
+
+		if (file_is_corrupt) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: file ",
+			      stderr);
+			ut_print_filename(stderr, filepath);
+			fprintf(stderr, " seems to be corrupt.\n"
+				"InnoDB: anyway, all not corrupt pages were tried to be converted to salvage.\n"
+				"InnoDB: ##### CAUTION #####\n"
+				"InnoDB: ## The .ibd must cause to crash InnoDB, though re-import would seem to be succeeded.\n"
+				"InnoDB: ## If you don't have knowledge about salvaging data from .ibd, you should not use the file.\n"
+				"InnoDB: ###################\n");
+			success = FALSE;
+
+			ut_free(buf2);
+
+			goto func_exit;
+		}
+	}
+
+	ut_free(buf2);
+
+	if (UNIV_UNLIKELY(space_id != id
+			  || space_flags != (flags & ~(~0 << DICT_TF_BITS)))) {
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: tablespace id and flags in file ",
+		      stderr);
+		ut_print_filename(stderr, filepath);
+		fprintf(stderr, " are %lu and %lu, but in the InnoDB\n"
+			"InnoDB: data dictionary they are %lu and %lu.\n"
+			"InnoDB: Have you moved InnoDB .ibd files"
+			" around without using the\n"
+			"InnoDB: commands DISCARD TABLESPACE and"
+			" IMPORT TABLESPACE?\n"
+			"InnoDB: Please refer to\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+			"InnoDB: for how to resolve the issue.\n",
+			(ulong) space_id, (ulong) space_flags,
+			(ulong) id, (ulong) flags);
+
+		success = FALSE;
+
+		goto func_exit;
+	}
+
+skip_check:
+	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+
+	if (!success) {
+		goto func_exit;
+	}
+
+	/* We do not measure the size of the file, that is why we pass the 0
+	below */
+
+	fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+	os_file_close(file);
+	mem_free(filepath);
+
+	return(success);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Allocates a file name for an old version of a single-table tablespace.
+The result is the original name followed by the text "_ibbackup_old_vers_"
+and a timestamp written by ut_sprintf_timestamp_without_extra_chars().
+The string must be freed by caller with mem_free()!
+@return	own: file name */
+static
+char*
+fil_make_ibbackup_old_name(
+/*=======================*/
+	const char*	name)		/*!< in: original file name */
+{
+	static const char suffix[] = "_ibbackup_old_vers_";
+	/* Length of the suffix text without its terminating NUL */
+	const ulint	suffix_len	= (sizeof suffix) - 1;
+	ulint		len		= strlen(name);
+	/* sizeof suffix already counts one byte for a terminating NUL;
+	the extra 15 bytes are presumably the length of the timestamp
+	text -- TODO confirm against
+	ut_sprintf_timestamp_without_extra_chars() */
+	char*		path		= mem_alloc(len + (15 + sizeof suffix));
+
+	memcpy(path, name, len);
+	memcpy(path + len, suffix, suffix_len);
+
+	/* The timestamp must start directly after the suffix text.
+	Using sizeof suffix here (which includes the NUL) would skip one
+	byte, leaving it uninitialized inside the name and pushing the
+	timestamp one byte further into the buffer. */
+	ut_sprintf_timestamp_without_extra_chars(path + len + suffix_len);
+	return(path);
+}
+#endif /* UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Opens an .ibd file and adds the associated single-table tablespace to the
+InnoDB fil0fil.c data structures.
+
+The space id and the flags are read from the first page of the file. If the
+file cannot be opened, its size cannot be measured, or fil_space_create()
+fails, the process exits with status 1 unless srv_force_recovery > 0, in
+which case a message is printed and the file is skipped. */
+static
+void
+fil_load_single_table_tablespace(
+/*=============================*/
+	const char*	dbname,		/*!< in: database name */
+	const char*	filename)	/*!< in: file name (not a path),
+					including the .ibd extension */
+{
+	os_file_t	file;
+	char*		filepath;
+	ibool		success;
+	byte*		buf2;
+	byte*		page;
+	ulint		space_id;
+	ulint		flags;
+	ulint		size_low;
+	ulint		size_high;
+	ib_uint64_t	size;
+#ifdef UNIV_HOTBACKUP
+	fil_space_t*	space;
+#endif
+	/* Room for the three components, two '/' separators and the NUL */
+	filepath = mem_alloc(strlen(dbname) + strlen(filename)
+			     + strlen(fil_path_to_mysql_datadir) + 3);
+
+	sprintf(filepath, "%s/%s/%s", fil_path_to_mysql_datadir, dbname,
+		filename);
+	srv_normalize_path_for_win(filepath);
+#ifdef __WIN__
+# ifndef UNIV_HOTBACKUP
+	/* If lower_case_table_names is 0 or 2, then MySQL allows database
+	directory names with upper case letters. On Windows, all table and
+	database names in InnoDB are internally always in lower case. Put the
+	file path to lower case, so that we are consistent with InnoDB's
+	internal data dictionary. */
+
+	dict_casedn_str(filepath);
+# endif /* !UNIV_HOTBACKUP */
+#endif
+	file = os_file_create_simple_no_error_handling(
+		filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, &success);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		fprintf(stderr,
+			"InnoDB: Error: could not open single-table tablespace"
+			" file\n"
+			"InnoDB: %s!\n"
+			"InnoDB: We do not continue the crash recovery,"
+			" because the table may become\n"
+			"InnoDB: corrupt if we cannot apply the log records"
+			" in the InnoDB log to it.\n"
+			"InnoDB: To fix the problem and start mysqld:\n"
+			"InnoDB: 1) If there is a permission problem"
+			" in the file and mysqld cannot\n"
+			"InnoDB: open the file, you should"
+			" modify the permissions.\n"
+			"InnoDB: 2) If the table is not needed, or you can"
+			" restore it from a backup,\n"
+			"InnoDB: then you can remove the .ibd file,"
+			" and InnoDB will do a normal\n"
+			"InnoDB: crash recovery and ignore that table.\n"
+			"InnoDB: 3) If the file system or the"
+			" disk is broken, and you cannot remove\n"
+			"InnoDB: the .ibd file, you can set"
+			" innodb_force_recovery > 0 in my.cnf\n"
+			"InnoDB: and force InnoDB to continue crash"
+			" recovery here.\n", filepath);
+
+		mem_free(filepath);
+
+		if (srv_force_recovery > 0) {
+			fprintf(stderr,
+				"InnoDB: innodb_force_recovery"
+				" was set to %lu. Continuing crash recovery\n"
+				"InnoDB: even though we cannot access"
+				" the .ibd file of this table.\n",
+				srv_force_recovery);
+			return;
+		}
+
+		exit(1);
+	}
+
+	success = os_file_get_size(file, &size_low, &size_high);
+
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		fprintf(stderr,
+			"InnoDB: Error: could not measure the size"
+			" of single-table tablespace file\n"
+			"InnoDB: %s!\n"
+			"InnoDB: We do not continue crash recovery,"
+			" because the table will become\n"
+			"InnoDB: corrupt if we cannot apply the log records"
+			" in the InnoDB log to it.\n"
+			"InnoDB: To fix the problem and start mysqld:\n"
+			"InnoDB: 1) If there is a permission problem"
+			" in the file and mysqld cannot\n"
+			"InnoDB: access the file, you should"
+			" modify the permissions.\n"
+			"InnoDB: 2) If the table is not needed,"
+			" or you can restore it from a backup,\n"
+			"InnoDB: then you can remove the .ibd file,"
+			" and InnoDB will do a normal\n"
+			"InnoDB: crash recovery and ignore that table.\n"
+			"InnoDB: 3) If the file system or the disk is broken,"
+			" and you cannot remove\n"
+			"InnoDB: the .ibd file, you can set"
+			" innodb_force_recovery > 0 in my.cnf\n"
+			"InnoDB: and force InnoDB to continue"
+			" crash recovery here.\n", filepath);
+
+		os_file_close(file);
+		mem_free(filepath);
+
+		if (srv_force_recovery > 0) {
+			fprintf(stderr,
+				"InnoDB: innodb_force_recovery"
+				" was set to %lu. Continuing crash recovery\n"
+				"InnoDB: even though we cannot access"
+				" the .ibd file of this table.\n",
+				srv_force_recovery);
+			return;
+		}
+
+		exit(1);
+	}
+
+	/* TODO: What to do in other cases where we cannot access an .ibd
+	file during a crash recovery? */
+
+	/* Every .ibd file is created >= 4 pages in size. Smaller files
+	cannot be ok. */
+
+	size = (((ib_uint64_t)size_high) << 32) + (ib_uint64_t)size_low;
+#ifndef UNIV_HOTBACKUP
+	if (size < FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+		/* The message ends with a newline so that it does not run
+		into the next line of the error log, and it reports the same
+		minimum size that the condition above tests. */
+		fprintf(stderr,
+			"InnoDB: Error: the size of single-table tablespace"
+			" file %s\n"
+			"InnoDB: is only %lu %lu, should be at least %lu!\n",
+			filepath,
+			(ulong) size_high,
+			(ulong) size_low,
+			(ulong) (FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE));
+		os_file_close(file);
+		mem_free(filepath);
+
+		return;
+	}
+#endif
+	/* Read the first page of the tablespace if the size big enough */
+
+	buf2 = ut_malloc(2 * UNIV_PAGE_SIZE);
+	/* Align the memory for file i/o if we might have O_DIRECT set */
+	page = ut_align(buf2, UNIV_PAGE_SIZE);
+
+	if (size >= FIL_IBD_FILE_INITIAL_SIZE * UNIV_PAGE_SIZE) {
+		/* NOTE(review): the result of this read is stored in
+		'success' but not inspected before the header is parsed */
+		success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE);
+
+		/* We have to read the tablespace id from the file */
+
+		space_id = fsp_header_get_space_id(page);
+		flags = fsp_header_get_flags(page);
+	} else {
+		/* Too small to contain a header page: mark the id as
+		unknown so that the checks below reject the file */
+		space_id = ULINT_UNDEFINED;
+		flags = 0;
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
+		fprintf(stderr,
+			"InnoDB: Error: tablespace id %lu in file %s"
+			" is not sensible\n",
+			(ulong) space_id,
+			filepath);
+		goto func_exit;
+	}
+#else
+	if (space_id == ULINT_UNDEFINED || trx_sys_sys_space(space_id)) {
+		char*	new_path;
+
+		fprintf(stderr,
+			"InnoDB: Renaming tablespace %s of id %lu,\n"
+			"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+			"InnoDB: because its size %" PRId64 " is too small"
+			" (< 4 pages 16 kB each),\n"
+			"InnoDB: or the space id in the file header"
+			" is not sensible.\n"
+			"InnoDB: This can happen in an ibbackup run,"
+			" and is not dangerous.\n",
+			filepath, space_id, filepath, size);
+		os_file_close(file);
+
+		new_path = fil_make_ibbackup_old_name(filepath);
+		ut_a(os_file_rename(filepath, new_path));
+
+		ut_free(buf2);
+		mem_free(filepath);
+		mem_free(new_path);
+
+		return;
+	}
+
+	/* A backup may contain the same space several times, if the space got
+	renamed at a sensitive time. Since it is enough to have one version of
+	the space, we rename the file if a space with the same space id
+	already exists in the tablespace memory cache. We rather rename the
+	file than delete it, because if there is a bug, we do not want to
+	destroy valuable data. */
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(space_id);
+
+	if (space) {
+		char*	new_path;
+
+		fprintf(stderr,
+			"InnoDB: Renaming tablespace %s of id %lu,\n"
+			"InnoDB: to %s_ibbackup_old_vers_<timestamp>\n"
+			"InnoDB: because space %s with the same id\n"
+			"InnoDB: was scanned earlier. This can happen"
+			" if you have renamed tables\n"
+			"InnoDB: during an ibbackup run.\n",
+			filepath, space_id, filepath,
+			space->name);
+		os_file_close(file);
+
+		new_path = fil_make_ibbackup_old_name(filepath);
+
+		mutex_exit(&fil_system->mutex);
+
+		ut_a(os_file_rename(filepath, new_path));
+
+		ut_free(buf2);
+		mem_free(filepath);
+		mem_free(new_path);
+
+		return;
+	}
+	mutex_exit(&fil_system->mutex);
+#endif
+	success = fil_space_create(filepath, space_id, flags, FIL_TABLESPACE);
+
+	if (!success) {
+
+		if (srv_force_recovery > 0) {
+			fprintf(stderr,
+				"InnoDB: innodb_force_recovery"
+				" was set to %lu. Continuing crash recovery\n"
+				"InnoDB: even though the tablespace creation"
+				" of this table failed.\n",
+				srv_force_recovery);
+			goto func_exit;
+		}
+
+		exit(1);
+	}
+
+	/* We do not use the size information we have about the file, because
+	the rounding formula for extents and pages is somewhat complex; we
+	let fil_node_open() do that task. */
+
+	fil_node_create(filepath, 0, space_id, FALSE);
+func_exit:
+	/* Common cleanup: the file handle, the page buffer and the path */
+	os_file_close(file);
+	ut_free(buf2);
+	mem_free(filepath);
+}
+
+/***********************************************************************//**
+Fault-tolerant wrapper around os_file_readdir_next_file(). Whenever that
+function returns -1 the call is repeated, up to 100 attempts in total, so
+that as many good directory entries as possible are read and bad ones are
+jumped over.
+@return 0 if ok, -1 if error even after the retries, 1 if at the end
+of the directory */
+static
+int
+fil_file_readdir_next_file(
+/*=======================*/
+	ulint*		err,	/*!< out: this is set to DB_ERROR if an error
+				was encountered, otherwise not changed */
+	const char*	dirname,/*!< in: directory name or path */
+	os_file_dir_t	dir,	/*!< in: directory stream */
+	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
+{
+	ulint	tries_left	= 100;
+
+	do {
+		int	status;
+
+		status = os_file_readdir_next_file(dirname, dir, info);
+
+		if (status != -1) {
+			/* Either an entry was read or the directory
+			ended: hand the result straight to the caller */
+
+			return(status);
+		}
+
+		/* A failed attempt: report it, remember the error and
+		try again while attempts remain */
+
+		fprintf(stderr,
+			"InnoDB: Error: os_file_readdir_next_file()"
+			" returned -1 in\n"
+			"InnoDB: directory %s\n"
+			"InnoDB: Crash recovery may have failed"
+			" for some .ibd files!\n", dirname);
+
+		*err = DB_ERROR;
+	} while (--tries_left > 0);
+
+	return(-1);
+}
+
+/********************************************************************//**
+Called at server startup when crash recovery is needed. Walks every database
+directory under the MySQL datadir and passes each file whose name ends in
+.ibd to fil_load_single_table_tablespace(). Those files are single-table
+tablespaces; their space ids must be known so that pages stored in the
+doublewrite buffer can be checked against the right file and log records
+with a space id != 0 can be applied.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void)
+/*===================================*/
+{
+	int		rc;
+	char*		dbpath		= NULL;
+	ulint		dbpath_len	= 100;
+	os_file_dir_t	dir;
+	os_file_dir_t	dbdir;
+	os_file_stat_t	dbinfo;
+	os_file_stat_t	fileinfo;
+	ulint		err		= DB_SUCCESS;
+
+	/* The datadir of MySQL is always the default directory of mysqld */
+
+	dir = os_file_opendir(fil_path_to_mysql_datadir, TRUE);
+
+	if (dir == NULL) {
+
+		return(DB_ERROR);
+	}
+
+	dbpath = mem_alloc(dbpath_len);
+
+	/* Outer loop: one iteration per entry of the datadir. The
+	subdirectories are the database directories of MySQL. */
+
+	for (rc = fil_file_readdir_next_file(&err,
+					     fil_path_to_mysql_datadir,
+					     dir, &dbinfo);
+	     rc == 0;
+	     rc = fil_file_readdir_next_file(&err,
+					     fil_path_to_mysql_datadir,
+					     dir, &dbinfo)) {
+		ulint	needed;
+
+		if (dbinfo.type == OS_FILE_TYPE_FILE
+		    || dbinfo.type == OS_FILE_TYPE_UNKNOWN) {
+			/* Neither a directory nor a symlink */
+
+			continue;
+		}
+
+		/* A symlink or a directory; try opening it to see if a
+		symlink is a directory. Grow the path buffer first if the
+		name does not fit. */
+
+		needed = strlen(fil_path_to_mysql_datadir)
+			+ strlen (dbinfo.name) + 2;
+
+		if (needed > dbpath_len) {
+			dbpath_len = needed;
+
+			if (dbpath) {
+				mem_free(dbpath);
+			}
+
+			dbpath = mem_alloc(dbpath_len);
+		}
+
+		sprintf(dbpath, "%s/%s", fil_path_to_mysql_datadir,
+			dbinfo.name);
+		srv_normalize_path_for_win(dbpath);
+
+		dbdir = os_file_opendir(dbpath, FALSE);
+
+		if (dbdir == NULL) {
+			/* Could not be opened as a directory */
+
+			continue;
+		}
+
+		/* Inner loop: a database directory was opened; look for
+		possible .ibd files in it */
+
+		for (rc = fil_file_readdir_next_file(&err, dbpath, dbdir,
+						     &fileinfo);
+		     rc == 0;
+		     rc = fil_file_readdir_next_file(&err, dbpath, dbdir,
+						     &fileinfo)) {
+			ulint	name_len;
+
+			if (fileinfo.type == OS_FILE_TYPE_DIR) {
+
+				continue;
+			}
+
+			/* A symlink or a file: load it if the name is
+			longer than the suffix and ends in .ibd */
+
+			name_len = strlen(fileinfo.name);
+
+			if (name_len > 4
+			    && strcmp(fileinfo.name + name_len - 4,
+				      ".ibd") == 0) {
+				fil_load_single_table_tablespace(
+					dbinfo.name, fileinfo.name);
+			}
+		}
+
+		if (os_file_closedir(dbdir) != 0) {
+			fputs("InnoDB: Warning: could not"
+			      " close database directory ", stderr);
+			ut_print_filename(stderr, dbpath);
+			putc('\n', stderr);
+
+			err = DB_ERROR;
+		}
+	}
+
+	mem_free(dbpath);
+
+	if (os_file_closedir(dir) != 0) {
+		fprintf(stderr,
+			"InnoDB: Error: could not close MySQL datadir\n");
+
+		return(DB_ERROR);
+	}
+
+	return(err);
+}
+
+/*******************************************************************//**
+Tells whether a single-table tablespace is missing from the memory cache,
+is being deleted there, or (when a version is given) has a different
+tablespace version than expected. The lookup is done while holding
+fil_system->mutex.
+@return	TRUE if does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+	ulint		id,	/*!< in: space id */
+	ib_int64_t	version)/*!< in: tablespace_version should be this; if
+				you pass -1 as the value of this, then this
+				parameter is ignored */
+{
+	fil_space_t*	cached;
+	ibool		gone;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	cached = fil_space_get_by_id(id);
+
+	if (cached == NULL || cached->is_being_deleted) {
+		/* Not cached at all, or on its way out */
+		gone = TRUE;
+	} else if (version != ((ib_int64_t)-1)
+		   && cached->tablespace_version != version) {
+		/* The caller asked for a specific version and the cached
+		space has another one */
+		gone = TRUE;
+	} else {
+		gone = FALSE;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(gone);
+}
+
+/*******************************************************************//**
+Looks up a space id in the tablespace memory cache while holding
+fil_system->mutex.
+@return	TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+	ulint	id)	/*!< in: space id */
+{
+	ibool	found;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	/* Only the presence of the space matters, not its contents */
+	found = (fil_space_get_by_id(id) != NULL);
+
+	mutex_exit(&fil_system->mutex);
+
+	return(found);
+}
+
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+A tablespace matches when the lookup by id and the lookup by the .ibd path
+built from the table name return the same space object. The lookups and any
+diagnostics are performed while holding fil_system->mutex.
+@return	TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+	ulint		id,		/*!< in: space id */
+	const char*	name,		/*!< in: table name in the standard
+					'databasename/tablename' format or
+					the dir path to a temp table */
+	ibool		is_temp,	/*!< in: TRUE if created with CREATE
+					TEMPORARY TABLE */
+	ibool		mark_space,	/*!< in: in crash recovery, at database
+					startup we mark all spaces which have
+					an associated table in the InnoDB
+					data dictionary, so that
+					we can print a warning about orphaned
+					tablespaces */
+	ibool		print_error_if_does_not_exist)
+					/*!< in: print detailed error
+					information to the .err log if a
+					matching tablespace is not found from
+					memory */
+{
+	fil_space_t*	namespace;	/* space found by path, or NULL */
+	fil_space_t*	space;		/* space found by id, or NULL */
+	char*		path;		/* .ibd path built from name; freed
+					on every return path below */
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	path = fil_make_ibd_name(name, is_temp);
+
+	/* Look if there is a space with the same id */
+
+	space = fil_space_get_by_id(id);
+
+	/* Look if there is a space with the same name; the name is the
+	directory path from the datadir to the file */
+
+	namespace = fil_space_get_by_name(path);
+	if (space && space == namespace) {
+		/* Found */
+
+		if (mark_space) {
+			space->mark = TRUE;
+		}
+
+		mem_free(path);
+		mutex_exit(&fil_system->mutex);
+
+		return(TRUE);
+	}
+
+	/* No match. Unless the caller asked for diagnostics, report the
+	failure silently. */
+	if (!print_error_if_does_not_exist) {
+
+		mem_free(path);
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	if (space == NULL) {
+		/* Nothing is cached under this id; the message depends on
+		whether something is cached under the name */
+		if (namespace == NULL) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: table ", stderr);
+			ut_print_filename(stderr, name);
+			fprintf(stderr, "\n"
+				"InnoDB: in InnoDB data dictionary"
+				" has tablespace id %lu,\n"
+				"InnoDB: but tablespace with that id"
+				" or name does not exist. Have\n"
+				"InnoDB: you deleted or moved .ibd files?\n"
+				"InnoDB: This may also be a table created with"
+				" CREATE TEMPORARY TABLE\n"
+				"InnoDB: whose .ibd and .frm files"
+				" MySQL automatically removed, but the\n"
+				"InnoDB: table still exists in the"
+				" InnoDB internal data dictionary.\n",
+				(ulong) id);
+		} else {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: table ", stderr);
+			ut_print_filename(stderr, name);
+			fprintf(stderr, "\n"
+				"InnoDB: in InnoDB data dictionary has"
+				" tablespace id %lu,\n"
+				"InnoDB: but a tablespace with that id"
+				" does not exist. There is\n"
+				"InnoDB: a tablespace of name %s and id %lu,"
+				" though. Have\n"
+				"InnoDB: you deleted or moved .ibd files?\n",
+				(ulong) id, namespace->name,
+				(ulong) namespace->id);
+		}
+		/* Shared tail of all diagnostic paths. The name-mismatch
+		branch further below also jumps here, i.e. into this block. */
+error_exit:
+		fputs("InnoDB: Please refer to\n"
+		      "InnoDB: " REFMAN "innodb-troubleshooting-datadict.html\n"
+		      "InnoDB: for how to resolve the issue.\n", stderr);
+
+		mem_free(path);
+		mutex_exit(&fil_system->mutex);
+
+		return(FALSE);
+	}
+
+	/* A space is cached under this id; check whether it carries a
+	different name than the one derived from the table name */
+	if (0 != strcmp(space->name, path)) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_filename(stderr, name);
+		fprintf(stderr, "\n"
+			"InnoDB: in InnoDB data dictionary has"
+			" tablespace id %lu,\n"
+			"InnoDB: but the tablespace with that id"
+			" has name %s.\n"
+			"InnoDB: Have you deleted or moved .ibd files?\n",
+			(ulong) id, space->name);
+
+		if (namespace != NULL) {
+			fputs("InnoDB: There is a tablespace"
+			      " with the right name\n"
+			      "InnoDB: ", stderr);
+			ut_print_filename(stderr, namespace->name);
+			fprintf(stderr, ", but its id is %lu.\n",
+				(ulong) namespace->id);
+		}
+
+		goto error_exit;
+	}
+
+	/* Reached when the space cached under this id has the expected
+	name, yet the lookup by name did not return that same object.
+	FALSE is returned without printing anything. */
+	mem_free(path);
+	mutex_exit(&fil_system->mutex);
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Looks up, in the tablespace memory cache, the single-table tablespace that
+belongs to the given table name and reports its space id.
+@return	space id, ULINT_UNDEFINED if not found */
+static
+ulint
+fil_get_space_id_for_table(
+/*=======================*/
+	const char*	name)	/*!< in: table name in the standard
+				'databasename/tablename' format */
+{
+	char*		ibd_path;
+	fil_space_t*	found;
+	ulint		space_id;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	/* A space is named after the directory path to its file, so build
+	that path from the table name and use it as the lookup key */
+
+	ibd_path = fil_make_ibd_name(name, FALSE);
+	found = fil_space_get_by_name(ibd_path);
+
+	space_id = (found != NULL) ? found->id : ULINT_UNDEFINED;
+
+	mem_free(ibd_path);
+
+	mutex_exit(&fil_system->mutex);
+
+	return(space_id);
+}
+
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing. Otherwise zero-filled pages are written to the
+end of the last file of the space, at most 64 pages per write, and the space
+is flushed with fil_flush() before returning.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+	ulint*	actual_size,	/*!< out: size of the space after extension;
+				if we ran out of disk space this may be lower
+				than the desired size */
+	ulint	space_id,	/*!< in: space id */
+	ulint	size_after_extend)/*!< in: desired size in pages after the
+				extension; if the current space size is bigger
+				than this already, the function does nothing */
+{
+	fil_node_t*	node;
+	fil_space_t*	space;
+	byte*		buf2;
+	byte*		buf;
+	ulint		buf_size;
+	ulint		start_page_no;
+	ulint		file_start_page_no;
+	ulint		offset_high;
+	ulint		offset_low;
+	ulint		page_size;
+	ibool		success		= TRUE;
+
+	/* This acquires fil_system->mutex; every return path below
+	releases it */
+	fil_mutex_enter_and_prepare_for_io(space_id);
+
+	space = fil_space_get_by_id(space_id);
+	ut_a(space);
+
+	if (space->size >= size_after_extend) {
+		/* Space already big enough */
+
+		*actual_size = space->size;
+
+		mutex_exit(&fil_system->mutex);
+
+		return(TRUE);
+	}
+
+	/* Use the compressed page size if the space flags give one,
+	otherwise the default page size */
+	page_size = dict_table_flags_to_zip_size(space->flags);
+	if (!page_size) {
+		page_size = UNIV_PAGE_SIZE;
+	}
+
+	/* Only the last file in the chain of the space is extended */
+	node = UT_LIST_GET_LAST(space->chain);
+
+	fil_node_prepare_for_io(node, fil_system, space);
+
+	start_page_no = space->size;
+	/* Page number, within the space, at which the last file begins */
+	file_start_page_no = space->size - node->size;
+
+	/* Extend at most 64 pages at a time */
+	buf_size = ut_min(64, size_after_extend - start_page_no) * page_size;
+	/* One extra page is allocated so that buf can be aligned to
+	page_size inside buf2 */
+	buf2 = mem_alloc(buf_size + page_size);
+	buf = ut_align(buf2, page_size);
+
+	memset(buf, 0, buf_size);
+
+	while (start_page_no < size_after_extend) {
+		ulint	n_pages = ut_min(buf_size / page_size,
+					 size_after_extend - start_page_no);
+
+		/* 4096 * (1 MiB / page_size) pages make up 4 GiB, so
+		offset_high counts the whole 4 GiB units of the byte offset
+		within the file and offset_low is the remainder in bytes */
+		offset_high = (start_page_no - file_start_page_no)
+			/ (4096 * ((1024 * 1024) / page_size));
+		offset_low  = ((start_page_no - file_start_page_no)
+			       % (4096 * ((1024 * 1024) / page_size)))
+			* page_size;
+#ifdef UNIV_HOTBACKUP
+		success = os_file_write(node->name, node->handle, buf,
+					offset_low, offset_high,
+					page_size * n_pages);
+#else
+		success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
+				 node->name, node->handle, buf,
+				 offset_low, offset_high,
+				 page_size * n_pages,
+				 NULL, NULL, NULL);
+#endif
+		if (success) {
+			node->size += n_pages;
+			space->size += n_pages;
+
+			os_has_said_disk_full = FALSE;
+		} else {
+			/* Let us measure the size of the file to determine
+			how much we were able to extend it */
+
+			n_pages = ((ulint)
+				   (os_file_get_size_as_iblonglong(
+					   node->handle)
+				    / page_size)) - node->size;
+
+			node->size += n_pages;
+			space->size += n_pages;
+
+			break;
+		}
+
+		start_page_no += n_pages;
+	}
+
+	mem_free(buf2);
+
+	/* Balances the fil_node_prepare_for_io() call above; the write type
+	marks the node as modified so that fil_flush() below will flush it */
+	fil_node_complete_io(node, fil_system, OS_FILE_WRITE);
+
+	*actual_size = space->size;
+
+#ifndef UNIV_HOTBACKUP
+	if (space_id == 0) {
+		ulint pages_per_mb = (1024 * 1024) / page_size;
+
+		/* Keep the last data file size info up to date, rounded to
+		full megabytes */
+
+		srv_data_file_sizes[srv_n_data_files - 1]
+			= (node->size / pages_per_mb) * pages_per_mb;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	/*
+	printf("Extended %s to %lu, actual size %lu pages\n", space->name,
+	size_after_extend, *actual_size); */
+	mutex_exit(&fil_system->mutex);
+
+	/* fil_flush() takes fil_system->mutex itself, so it is called only
+	after the mutex has been released */
+	fil_flush(space_id);
+
+	return(success);
+}
+
+#ifdef UNIV_HOTBACKUP
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. If a space cannot be extended, an error
+message is printed to stderr and the process exits with status 1. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void)
+/*======================================*/
+{
+	fil_space_t*	space;
+	byte*		buf;
+	ulint		actual_size;
+	ulint		size_in_header;
+	ulint		error;
+	ibool		success;
+
+	/* Scratch buffer for the first page of each space, which holds
+	the space header */
+	buf = mem_alloc(UNIV_PAGE_SIZE);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = UT_LIST_GET_FIRST(fil_system->space_list);
+
+	while (space) {
+		ut_a(space->purpose == FIL_TABLESPACE);
+
+		mutex_exit(&fil_system->mutex); /* no need to protect with a
+					      mutex, because this is a
+					      single-threaded operation */
+		error = fil_read(TRUE, space->id,
+				 dict_table_flags_to_zip_size(space->flags),
+				 0, 0, UNIV_PAGE_SIZE, buf, NULL);
+		ut_a(error == DB_SUCCESS);
+
+		size_in_header = fsp_get_size_low(buf);
+
+		success = fil_extend_space_to_desired_size(
+			&actual_size, space->id, size_in_header);
+		if (!success) {
+			/* The sizes are of type ulint; cast them to ulong
+			so that the arguments match the %lu conversions, as
+			the other diagnostics in this file do */
+			fprintf(stderr,
+				"InnoDB: Error: could not extend the"
+				" tablespace of %s\n"
+				"InnoDB: to the size stored in header,"
+				" %lu pages;\n"
+				"InnoDB: size after extension %lu pages\n"
+				"InnoDB: Check that you have free disk space"
+				" and retry!\n",
+				space->name, (ulong) size_in_header,
+				(ulong) actual_size);
+			exit(1);
+		}
+
+		mutex_enter(&fil_system->mutex);
+
+		space = UT_LIST_GET_NEXT(space_list, space);
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	mem_free(buf);
+}
+#endif
+
+/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/
+
+/*******************************************************************//**
+Tries to reserve free extents in a file space. The reservation is granted
+only if the extents already reserved plus the requested ones do not exceed
+the number of extents that are free now.
+@return	TRUE if succeed */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+	ulint	id,		/*!< in: space id */
+	ulint	n_free_now,	/*!< in: number of free extents now */
+	ulint	n_to_reserve)	/*!< in: how many one wants to reserve */
+{
+	fil_space_t*	space;
+	ibool		reserved	= FALSE;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	if (space->n_reserved_extents + n_to_reserve <= n_free_now) {
+		/* Enough free extents remain: book the reservation */
+		space->n_reserved_extents += n_to_reserve;
+		reserved = TRUE;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(reserved);
+}
+
+/*******************************************************************//**
+Releases free extents in a file space, i.e. gives back a reservation made
+earlier. The space must exist and must have at least n_reserved extents
+reserved; both conditions are asserted. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+	ulint	id,		/*!< in: space id */
+	ulint	n_reserved)	/*!< in: how many one reserved */
+{
+	fil_space_t*	space;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+	ut_a(space->n_reserved_extents >= n_reserved);
+
+	space->n_reserved_extents = space->n_reserved_extents - n_reserved;
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. The space must exist; this is asserted.
+@return	the current value of n_reserved_extents of the space */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+	ulint	id)		/*!< in: space id */
+{
+	fil_space_t*	space;
+	ulint		n_reserved;
+
+	ut_ad(fil_system);
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(id);
+
+	ut_a(space);
+
+	/* Copy the counter while the mutex is held */
+	n_reserved = space->n_reserved_extents;
+
+	mutex_exit(&fil_system->mutex);
+
+	return(n_reserved);
+}
+
+/*============================ FILE I/O ================================*/
+
+/********************************************************************//**
+NOTE: you must call fil_mutex_enter_and_prepare_for_io() first!
+
+Prepares a file node for i/o: opens the file if it is closed, takes the node
+off the LRU list if it is there, and increments the pending i/o count of the
+node. A warning is printed if the number of open files is more than 5 above
+the limit. The caller must hold the fil_sys mutex. */
+static
+void
+fil_node_prepare_for_io(
+/*====================*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	fil_space_t*	space)	/*!< in: space */
+{
+	ut_ad(node && system && space);
+	ut_ad(mutex_own(&(system->mutex)));
+
+	if (system->n_open > system->max_n_open + 5) {
+		/* Only a warning: the i/o is prepared regardless */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: open files %lu"
+			" exceeds the limit %lu\n",
+			(ulong) system->n_open,
+			(ulong) system->max_n_open);
+	}
+
+	if (!node->open) {
+		/* A closed file must not have i/o pending; open it now */
+		ut_a(node->n_pending == 0);
+
+		fil_node_open_file(node, system, space);
+	}
+
+	if (node->n_pending == 0) {
+		if (space->purpose == FIL_TABLESPACE
+		    && !trx_sys_sys_space(space->id)) {
+			/* An idle node of such a space sits in the LRU
+			list; it becomes busy now, so unlink it */
+
+			ut_a(UT_LIST_GET_LEN(system->LRU) > 0);
+
+			UT_LIST_REMOVE(LRU, system->LRU, node);
+		}
+	}
+
+	node->n_pending++;
+}
+
+/********************************************************************//**
+Updates the data structures when an i/o operation finishes: decrements the
+pending i/o count of the node, records a write in the modification counters
+and in the list of unflushed spaces, and puts an idle node back to the LRU
+list where applicable. The caller must hold the mutex of the system. */
+static
+void
+fil_node_complete_io(
+/*=================*/
+	fil_node_t*	node,	/*!< in: file node */
+	fil_system_t*	system,	/*!< in: tablespace memory cache */
+	ulint		type)	/*!< in: OS_FILE_WRITE or OS_FILE_READ; marks
+				the node as modified if
+				type == OS_FILE_WRITE */
+{
+	fil_space_t*	owner;
+
+	ut_ad(node);
+	ut_ad(system);
+	ut_ad(mutex_own(&(system->mutex)));
+
+	ut_a(node->n_pending > 0);
+
+	owner = node->space;
+
+	node->n_pending--;
+
+	if (type == OS_FILE_WRITE) {
+		/* Stamp the node with a fresh value of the system-wide
+		modification counter */
+		node->modification_counter = ++system->modification_counter;
+
+		if (!owner->is_in_unflushed_spaces) {
+
+			owner->is_in_unflushed_spaces = TRUE;
+			UT_LIST_ADD_FIRST(unflushed_spaces,
+					  system->unflushed_spaces,
+					  owner);
+		}
+	}
+
+	if (node->n_pending == 0
+	    && owner->purpose == FIL_TABLESPACE
+	    && !trx_sys_sys_space(owner->id)) {
+		/* No i/o is pending any more: the node goes back to the
+		LRU list */
+		UT_LIST_ADD_FIRST(LRU, system->LRU, node);
+	}
+}
+
+/********************************************************************//**
+Report information about an invalid page access. The report is written to
+stderr with a single fprintf() call; this function only prints and returns,
+any further reaction is left to the caller. */
+static
+void
+fil_report_invalid_page_access(
+/*===========================*/
+	ulint		block_offset,	/*!< in: block offset */
+	ulint		space_id,	/*!< in: space id */
+	const char*	space_name,	/*!< in: space name */
+	ulint		byte_offset,	/*!< in: byte offset */
+	ulint		len,		/*!< in: I/O length */
+	ulint		type)		/*!< in: I/O type */
+{
+	/* The ulint arguments are cast to ulong to match the %lu
+	conversions of the format string */
+	fprintf(stderr,
+		"InnoDB: Error: trying to access page number %lu"
+		" in space %lu,\n"
+		"InnoDB: space name %s,\n"
+		"InnoDB: which is outside the tablespace bounds.\n"
+		"InnoDB: Byte offset %lu, len %lu, i/o type %lu.\n"
+		"InnoDB: If you get this error at mysqld startup,"
+		" please check that\n"
+		"InnoDB: your my.cnf matches the ibdata files"
+		" that you have in the\n"
+		"InnoDB: MySQL server.\n",
+		(ulong) block_offset, (ulong) space_id, space_name,
+		(ulong) byte_offset, (ulong) len, (ulong) type);
+}
+
+/********************************************************************//**
+Reads or writes data. The request is handed to os_aio(), unless
+UNIV_HOTBACKUP is defined, in which case plain synchronous file i/o is used.
+When sync is TRUE, the i/o has completed when this function returns.
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist; DB_TABLESPACE_DELETED is also
+returned for a read from a space that has is_corrupt set while
+srv_pass_corrupt_table is set */
+UNIV_INTERN
+ulint
+_fil_io(
+/*===*/
+	ulint	type,		/*!< in: OS_FILE_READ or OS_FILE_WRITE,
+				ORed to OS_FILE_LOG, if a log i/o
+				and ORed to OS_AIO_SIMULATED_WAKE_LATER
+				if simulated aio and we want to post a
+				batch of i/os; NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in
+				aio this must be divisible by the OS block
+				size */
+	ulint	len,		/*!< in: how many bytes to read or write; this
+				must not cross a file boundary; in aio this
+				must be a block size multiple */
+	void*	buf,		/*!< in/out: buffer where to store read data
+				or from where to write; in aio this must be
+				appropriately aligned */
+	void*	message,	/*!< in: message for aio handler if non-sync
+				aio used, else ignored */
+	trx_t*	trx)		/*!< in: handed on to os_aio() and
+				buf_page_io_complete(); NOTE(review):
+				presumably the transaction on whose behalf
+				the i/o is done -- confirm with the callers */
+{
+	ulint		mode;
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ulint		offset_high;
+	ulint		offset_low;
+	ibool		ret;
+	ulint		is_log;
+	ulint		wake_later;
+
+	/* Split the modifier flags off from type, so that type holds only
+	the part that is compared to OS_FILE_READ / OS_FILE_WRITE below */
+	is_log = type & OS_FILE_LOG;
+	type = type & ~OS_FILE_LOG;
+
+	wake_later = type & OS_AIO_SIMULATED_WAKE_LATER;
+	type = type & ~OS_AIO_SIMULATED_WAKE_LATER;
+
+	ut_ad(byte_offset < UNIV_PAGE_SIZE);
+	ut_ad(!zip_size || !byte_offset);
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(buf);
+	ut_ad(len > 0);
+//#if (1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE
+//# error "(1 << UNIV_PAGE_SIZE_SHIFT) != UNIV_PAGE_SIZE"
+//#endif
+	ut_ad(fil_validate());
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+	/* ibuf bitmap pages must be read in the sync aio mode: */
+	ut_ad(recv_no_ibuf_operations || (type == OS_FILE_WRITE)
+	      || !ibuf_bitmap_page(zip_size, block_offset)
+	      || sync || is_log);
+	ut_ad(!ibuf_inside() || is_log || (type == OS_FILE_WRITE)
+	      || ibuf_page(space_id, zip_size, block_offset, NULL));
+# endif /* UNIV_LOG_DEBUG */
+	/* Choose the aio mode: sync wins over log, log wins over ibuf */
+	if (sync) {
+		mode = OS_AIO_SYNC;
+	} else if (is_log) {
+		mode = OS_AIO_LOG;
+	} else if (type == OS_FILE_READ
+		   && !recv_no_ibuf_operations
+		   && ibuf_page(space_id, zip_size, block_offset, NULL)) {
+		mode = OS_AIO_IBUF;
+	} else {
+		mode = OS_AIO_NORMAL;
+	}
+#else /* !UNIV_HOTBACKUP */
+	ut_a(sync);
+	mode = OS_AIO_SYNC;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Account the requested bytes in the global i/o counters; this is
+	done before it is known whether the i/o can be carried out */
+	if (type == OS_FILE_READ) {
+		srv_data_read+= len;
+	} else if (type == OS_FILE_WRITE) {
+		srv_data_written+= len;
+	}
+
+	/* Reserve the fil_system mutex and make sure that we can open at
+	least one file while holding it, if the file is not already open */
+
+	fil_mutex_enter_and_prepare_for_io(space_id);
+
+	space = fil_space_get_by_id(space_id);
+
+	if (!space) {
+		mutex_exit(&fil_system->mutex);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: trying to do i/o"
+			" to a tablespace which does not exist.\n"
+			"InnoDB: i/o type %lu, space id %lu,"
+			" page no. %lu, i/o length %lu bytes\n",
+			(ulong) type, (ulong) space_id, (ulong) block_offset,
+			(ulong) len);
+
+		return(DB_TABLESPACE_DELETED);
+	}
+
+	ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE));
+
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	/* Walk the file chain of the space to find the file that contains
+	block_offset; on the way block_offset is turned into an offset
+	relative to the start of that file */
+	for (;;) {
+		if (UNIV_UNLIKELY(node == NULL)) {
+			fil_report_invalid_page_access(
+				block_offset, space_id, space->name,
+				byte_offset, len, type);
+
+			ut_error;
+		}
+
+		if (space->id != 0 && node->size == 0) {
+			/* We do not know the size of a single-table tablespace
+			before we open the file */
+
+			break;
+		}
+
+		if (node->size > block_offset) {
+			/* Found! */
+			break;
+		} else {
+			block_offset -= node->size;
+			node = UT_LIST_GET_NEXT(chain, node);
+		}
+	}
+
+	/* Open file if closed */
+	fil_node_prepare_for_io(node, fil_system, space);
+
+	/* Check that at least the start offset is within the bounds of a
+	single-table tablespace */
+	if (UNIV_UNLIKELY(node->size <= block_offset)
+	    && space->id != 0 && space->purpose == FIL_TABLESPACE) {
+
+		fil_report_invalid_page_access(
+			block_offset, space_id, space->name, byte_offset,
+			len, type);
+
+		ut_error;
+	}
+
+	/* Now we have made the changes in the data structures of fil_system */
+	mutex_exit(&fil_system->mutex);
+
+	/* Calculate the low 32 bits and the high 32 bits of the file offset */
+
+	if (!zip_size) {
+		offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT));
+		offset_low  = ((block_offset << UNIV_PAGE_SIZE_SHIFT)
+			       & 0xFFFFFFFFUL) + byte_offset;
+
+		/* The request must fit into the pages that remain in the
+		file after block_offset */
+		ut_a(node->size - block_offset
+		     >= ((byte_offset + len + (UNIV_PAGE_SIZE - 1))
+			 / UNIV_PAGE_SIZE));
+	} else {
+		ulint	zip_size_shift;
+		/* zip_size_shift is the base-2 logarithm of zip_size; any
+		other zip_size is a fatal error */
+		switch (zip_size) {
+		case 1024: zip_size_shift = 10; break;
+		case 2048: zip_size_shift = 11; break;
+		case 4096: zip_size_shift = 12; break;
+		case 8192: zip_size_shift = 13; break;
+		case 16384: zip_size_shift = 14; break;
+		default: ut_error;
+		}
+		offset_high = block_offset >> (32 - zip_size_shift);
+		offset_low = (block_offset << zip_size_shift & 0xFFFFFFFFUL)
+			+ byte_offset;
+		ut_a(node->size - block_offset
+		     >= (len + (zip_size - 1)) / zip_size);
+	}
+
+	/* Do aio */
+
+	ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0);
+
+	if (srv_pass_corrupt_table && space->is_corrupt) {
+		/* should ignore i/o for the crashed space */
+		/* No i/o is issued: the pending count raised by
+		fil_node_prepare_for_io() above is dropped again at once */
+		mutex_enter(&fil_system->mutex);
+		fil_node_complete_io(node, fil_system, type);
+		mutex_exit(&fil_system->mutex);
+		if (mode == OS_AIO_NORMAL) {
+			ut_a(space->purpose == FIL_TABLESPACE);
+			buf_page_io_complete(message, trx);
+		}
+		/* A skipped read is reported as a deleted tablespace, a
+		skipped write as a success */
+		if (type == OS_FILE_READ) {
+			return(DB_TABLESPACE_DELETED);
+		} else {
+			return(DB_SUCCESS);
+		}
+	} else {
+		ut_a(!space->is_corrupt);
+#ifdef UNIV_HOTBACKUP
+	/* In ibbackup do normal i/o, not aio */
+	if (type == OS_FILE_READ) {
+		ret = os_file_read(node->handle, buf, offset_low, offset_high,
+				   len);
+	} else {
+		ret = os_file_write(node->name, node->handle, buf,
+				    offset_low, offset_high, len);
+	}
+#else
+	/* Queue the aio request */
+	ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+		     offset_low, offset_high, len, node, message, trx);
+#endif
+	} /**/
+
+	ut_a(ret);
+
+	if (mode == OS_AIO_SYNC) {
+		/* The i/o operation is already completed when we return from
+		os_aio: */
+
+		mutex_enter(&fil_system->mutex);
+
+		fil_node_complete_io(node, fil_system, type);
+
+		mutex_exit(&fil_system->mutex);
+
+		ut_ad(fil_validate());
+	}
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************//**
+Confirm whether the parameters are valid or not: checks that the space
+exists and that block_offset falls into one of its files. The file that
+contains the offset is opened as a side effect if it was closed.
+@return	TRUE if the space and the block offset are valid */
+UNIV_INTERN
+ibool
+fil_area_is_exist(
+/*==============*/
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size __attribute__((unused)),
+				/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset __attribute__((unused)),
+				/*!< in: remainder of offset in bytes; in
+				aio this must be divisible by the OS block
+				size */
+	ulint	len __attribute__((unused)))
+				/*!< in: how many bytes to read or write; this
+				must not cross a file boundary; in aio this
+				must be a block size multiple */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	ibool		in_bounds	= TRUE;
+
+	/* Reserve the fil_system mutex and make sure that we can open at
+	least one file while holding it, if the file is not already open */
+
+	fil_mutex_enter_and_prepare_for_io(space_id);
+
+	space = fil_space_get_by_id(space_id);
+
+	if (space == NULL) {
+		mutex_exit(&fil_system->mutex);
+		return(FALSE);
+	}
+
+	/* Skip the files that lie wholly before block_offset. The walk
+	stops early at a file of size 0 in a space other than 0, because
+	the size of a single-table tablespace is not known before the file
+	has been opened */
+
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	while (node != NULL
+	       && (space->id == 0 || node->size != 0)
+	       && node->size <= block_offset) {
+
+		block_offset -= node->size;
+		node = UT_LIST_GET_NEXT(chain, node);
+	}
+
+	if (UNIV_UNLIKELY(node == NULL)) {
+		/* The offset is beyond the last file of the space */
+		mutex_exit(&fil_system->mutex);
+		return(FALSE);
+	}
+
+	/* Open file if closed; no i/o follows, so the pending count is
+	dropped again at once */
+	fil_node_prepare_for_io(node, fil_system, space);
+	fil_node_complete_io(node, fil_system, OS_FILE_READ);
+
+	/* Check that at least the start offset is within the bounds of a
+	single-table tablespace */
+	if (space->id != 0 && space->purpose == FIL_TABLESPACE
+	    && UNIV_UNLIKELY(node->size <= block_offset)) {
+		in_bounds = FALSE;
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(in_bounds);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. Once a request has completed, the i/o is
+completed for its file node and the completion routine of the buffer pool or
+of the log is called, depending on the purpose of the space. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+	ulint	segment)	/*!< in: the number of the segment in the aio
+				array to wait for */
+{
+	ibool		ret;
+	fil_node_t*	fil_node;
+	void*		message;
+	ulint		type;
+
+	ut_ad(fil_validate());
+
+	if (os_aio_use_native_aio) {
+		srv_set_io_thread_op_info(segment, "native aio handle");
+#ifdef WIN_ASYNC_IO
+		ret = os_aio_windows_handle(segment, 0, &fil_node,
+					    &message, &type);
+#else
+		/* Native aio is handled only in the WIN_ASYNC_IO build;
+		reaching this branch in any other build is a fatal error */
+		ret = 0; /* Eliminate compiler warning */
+		ut_error;
+#endif
+	} else {
+		srv_set_io_thread_op_info(segment, "simulated aio handle");
+
+		ret = os_aio_simulated_handle(segment, &fil_node,
+					      &message, &type);
+	}
+
+	ut_a(ret);
+
+	srv_set_io_thread_op_info(segment, "complete io for fil node");
+
+	mutex_enter(&fil_system->mutex);
+
+	fil_node_complete_io(fil_node, fil_system, type);
+
+	mutex_exit(&fil_system->mutex);
+
+	ut_ad(fil_validate());
+
+	/* Do the i/o handling */
+	/* IMPORTANT: since i/o handling for reads will read also the insert
+	buffer in tablespace 0, you have to be very careful not to introduce
+	deadlocks in the i/o system. We keep tablespace 0 data files always
+	open, and use a special i/o thread to serve insert buffer requests. */
+
+	if (fil_node->space->purpose == FIL_TABLESPACE) {
+		srv_set_io_thread_op_info(segment, "complete io for buf page");
+		buf_page_io_complete(message, NULL);
+	} else {
+		srv_set_io_thread_op_info(segment, "complete io for log");
+		log_io_complete(message);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. Every file of the space whose
+modification counter is ahead of its flush counter is flushed with
+os_file_flush(); the fil_system mutex is released for the duration of each
+flush and of each wait for a concurrent flush of the same file. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+	ulint	space_id)	/*!< in: file space id (this can be a group of
+				log files or a tablespace of the database) */
+{
+	fil_space_t*	space;
+	fil_node_t*	node;
+	os_file_t	file;
+	ib_int64_t	old_mod_counter;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(space_id);
+
+	if (!space || space->is_being_deleted) {
+		mutex_exit(&fil_system->mutex);
+
+		return;
+	}
+
+	space->n_pending_flushes++;	/*!< prevent dropping of the space while
+					we are flushing */
+	node = UT_LIST_GET_FIRST(space->chain);
+
+	while (node) {
+		if (node->modification_counter > node->flush_counter) {
+			ut_a(node->open);
+
+			/* We want to flush the changes at least up to
+			old_mod_counter */
+			old_mod_counter = node->modification_counter;
+
+			if (space->purpose == FIL_TABLESPACE) {
+				fil_n_pending_tablespace_flushes++;
+			} else {
+				fil_n_pending_log_flushes++;
+				fil_n_log_flushes++;
+			}
+#ifdef __WIN__
+			/* On Windows a raw disk is not flushed; only the
+			bookkeeping at skip_flush is done for it */
+			if (node->is_raw_disk) {
+
+				goto skip_flush;
+			}
+#endif
+retry:
+			if (node->n_pending_flushes > 0) {
+				/* We want to avoid calling os_file_flush() on
+				the file twice at the same time, because we do
+				not know what bugs OS's may contain in file
+				i/o; sleep for a while */
+
+				mutex_exit(&fil_system->mutex);
+
+				os_thread_sleep(20000);
+
+				mutex_enter(&fil_system->mutex);
+
+				/* The flush counter may have been advanced
+				far enough while we slept; in that case
+				there is nothing left to flush here */
+				if (node->flush_counter >= old_mod_counter) {
+
+					goto skip_flush;
+				}
+
+				goto retry;
+			}
+
+			ut_a(node->open);
+			/* Take a copy of the handle: it is used below
+			after the mutex has been released */
+			file = node->handle;
+			node->n_pending_flushes++;
+
+			mutex_exit(&fil_system->mutex);
+
+			/* fprintf(stderr, "Flushing to file %s\n",
+			node->name); */
+
+			os_file_flush(file);
+
+			mutex_enter(&fil_system->mutex);
+
+			node->n_pending_flushes--;
+skip_flush:
+			/* Advance the flush counter, and take the space off
+			the list of unflushed spaces once
+			fil_space_is_flushed() reports it as flushed */
+			if (node->flush_counter < old_mod_counter) {
+				node->flush_counter = old_mod_counter;
+
+				if (space->is_in_unflushed_spaces
+				    && fil_space_is_flushed(space)) {
+
+					space->is_in_unflushed_spaces = FALSE;
+
+					UT_LIST_REMOVE(
+						unflushed_spaces,
+						fil_system->unflushed_spaces,
+						space);
+				}
+			}
+
+			/* Undo the pending flush accounting done above;
+			fil_n_log_flushes is a running total and is not
+			decremented */
+			if (space->purpose == FIL_TABLESPACE) {
+				fil_n_pending_tablespace_flushes--;
+			} else {
+				fil_n_pending_log_flushes--;
+			}
+		}
+
+		node = UT_LIST_GET_NEXT(chain, node);
+	}
+
+	space->n_pending_flushes--;
+
+	mutex_exit(&fil_system->mutex);
+}
+
+/**********************************************************************//**
+Flushes to disk the writes in file spaces of the given type possibly cached by
+the OS. The ids of the matching unflushed spaces are first collected while
+the fil_system mutex is held, and the spaces are then flushed one by one
+with fil_flush() after the mutex has been released. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+	ulint	purpose)	/*!< in: FIL_TABLESPACE, FIL_LOG */
+{
+	ulint*		ids;
+	ulint		n_ids		= 0;
+	ulint		n_unflushed;
+	ulint		pos;
+	fil_space_t*	cur;
+
+	mutex_enter(&fil_system->mutex);
+
+	n_unflushed = UT_LIST_GET_LEN(fil_system->unflushed_spaces);
+
+	if (n_unflushed == 0) {
+		/* Nothing is waiting for a flush */
+		mutex_exit(&fil_system->mutex);
+		return;
+	}
+
+	/* Assemble a list of space ids to flush.  Previously, we
+	traversed fil_system->unflushed_spaces and called UT_LIST_GET_NEXT()
+	on a space that was just removed from the list by fil_flush().
+	Thus, the space could be dropped and the memory overwritten. */
+	ids = mem_alloc(n_unflushed * sizeof *ids);
+
+	cur = UT_LIST_GET_FIRST(fil_system->unflushed_spaces);
+
+	while (cur != NULL) {
+
+		if (cur->purpose == purpose && !cur->is_being_deleted) {
+
+			ids[n_ids] = cur->id;
+			n_ids++;
+		}
+
+		cur = UT_LIST_GET_NEXT(unflushed_spaces, cur);
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	/* Flush the spaces.  It will not hurt to call fil_flush() on
+	a non-existing space id. */
+	for (pos = 0; pos < n_ids; pos++) {
+
+		fil_flush(ids[pos]);
+	}
+
+	mem_free(ids);
+}
+
+/******************************************************************//**
+Checks the consistency of the tablespace cache: a file with pending i/o must
+be open, the number of open files must match fil_system->n_open, and every
+node in the LRU list must be open, idle and belong to a tablespace for which
+trx_sys_sys_space() is false. An inconsistency trips an assertion, so the
+function returns only if everything is in order.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void)
+/*==============*/
+{
+	fil_space_t*	space;
+	fil_node_t*	fil_node;
+	ulint		n_open		= 0;
+	ulint		i;
+
+	mutex_enter(&fil_system->mutex);
+
+	/* Look for spaces in the hash table */
+
+	for (i = 0; i < hash_get_n_cells(fil_system->spaces); i++) {
+
+		space = HASH_GET_FIRST(fil_system->spaces, i);
+
+		while (space != NULL) {
+			/* NOTE(review): ut_list_node_313 is presumably the
+			iteration variable that UT_LIST_VALIDATE declares
+			for use in the assertion argument -- confirm
+			against the macro definition */
+			UT_LIST_VALIDATE(chain, fil_node_t, space->chain,
+					 ut_a(ut_list_node_313->open
+					      || !ut_list_node_313->n_pending));
+
+			fil_node = UT_LIST_GET_FIRST(space->chain);
+
+			while (fil_node != NULL) {
+				if (fil_node->n_pending > 0) {
+					ut_a(fil_node->open);
+				}
+
+				/* Count the open files for the comparison
+				with fil_system->n_open below */
+				if (fil_node->open) {
+					n_open++;
+				}
+				fil_node = UT_LIST_GET_NEXT(chain, fil_node);
+			}
+			space = HASH_GET_NEXT(hash, space);
+		}
+	}
+
+	ut_a(fil_system->n_open == n_open);
+
+	UT_LIST_VALIDATE(LRU, fil_node_t, fil_system->LRU, (void) 0);
+
+	fil_node = UT_LIST_GET_FIRST(fil_system->LRU);
+
+	while (fil_node != NULL) {
+		ut_a(fil_node->n_pending == 0);
+		ut_a(fil_node->open);
+		ut_a(fil_node->space->purpose == FIL_TABLESPACE);
+		ut_a(!trx_sys_sys_space(fil_node->space->id));
+
+		fil_node = UT_LIST_GET_NEXT(LRU, fil_node);
+	}
+
+	mutex_exit(&fil_system->mutex);
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Tells whether a file address is undefined, that is, whether its page number
+is FIL_NULL.
+@return	TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+	fil_addr_t	addr)	/*!< in: address */
+{
+	const ibool	undefined = (addr.page == FIL_NULL);
+
+	return(undefined);
+}
+
+/********************************************************************//**
+Reads the 4-byte predecessor field of a file page.
+@return	FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+	const byte*	page)	/*!< in: file page */
+{
+	const byte*	prev_field = &page[FIL_PAGE_PREV];
+
+	return(mach_read_from_4(prev_field));
+}
+
+/********************************************************************//**
+Reads the 4-byte successor field of a file page.
+@return	FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+	const byte*	page)	/*!< in: file page */
+{
+	const byte*	next_field = &page[FIL_PAGE_NEXT];
+
+	return(mach_read_from_4(next_field));
+}
+
+/*********************************************************************//**
+Sets the file page type by writing it to the 2-byte FIL_PAGE_TYPE field of
+the page. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/*!< in/out: file page */
+	ulint	type)	/*!< in: type */
+{
+	byte*	type_field;
+
+	ut_ad(page);
+
+	type_field = &page[FIL_PAGE_TYPE];
+
+	mach_write_to_2(type_field, type);
+}
+
+/*********************************************************************//**
+Gets the file page type from the 2-byte FIL_PAGE_TYPE field of the page.
+@return type; NOTE that if the type has not been written to page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+	const byte*	page)	/*!< in: file page */
+{
+	const byte*	type_field;
+
+	ut_ad(page);
+
+	type_field = &page[FIL_PAGE_TYPE];
+
+	return(mach_read_from_2(type_field));
+}
+
+/********************************************************************
+Frees the tablespace memory cache: the two hash tables of fil_system and
+the fil_system object itself. The LRU list, the list of unflushed spaces and
+the space list must be empty when this is called; this is asserted. */
+UNIV_INTERN
+void
+fil_close(void)
+/*===========*/
+{
+#ifndef UNIV_HOTBACKUP
+	/* The mutex should already have been freed. */
+	ut_ad(fil_system->mutex.magic_n == 0);
+#endif /* !UNIV_HOTBACKUP */
+
+	hash_table_free(fil_system->spaces);
+
+	hash_table_free(fil_system->name_hash);
+
+	ut_a(UT_LIST_GET_LEN(fil_system->LRU) == 0);
+	ut_a(UT_LIST_GET_LEN(fil_system->unflushed_spaces) == 0);
+	ut_a(UT_LIST_GET_LEN(fil_system->space_list) == 0);
+
+	mem_free(fil_system);
+
+	/* Reset the global pointer so that code testing fil_system for
+	NULL sees that the cache is gone */
+	fil_system = NULL;
+}
+
+/*************************************************************************
+Returns the total number of cells in the two hash tables of fil_system
+(spaces and name_hash), or 0 if fil_system does not exist. */
+
+ulint
+fil_system_hash_cells(void)
+/*=======================*/
+{
+	if (!fil_system) {
+		return 0;
+	}
+
+	return (fil_system->spaces->n_cells
+		+ fil_system->name_hash->n_cells);
+}
+
+/* Returns the length of the space list of fil_system multiplied by
+(sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE), or 0 if fil_system does not
+exist. */
+ulint
+fil_system_hash_nodes(void)
+/*=======================*/
+{
+	if (!fil_system) {
+		return 0;
+	}
+
+	return (UT_LIST_GET_LEN(fil_system->space_list)
+		* (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
+}
+
+/*************************************************************************
+Functions to access the is_corrupt flag of fil_space_t. */
+
+/* Reads the flag under the fil_system mutex.
+Returns TRUE if the space exists and its is_corrupt flag is set, FALSE
+otherwise. */
+ibool
+fil_space_is_corrupt(
+/*=================*/
+	ulint	space_id)
+{
+	fil_space_t*	space;
+	ibool		corrupt;
+
+	mutex_enter(&fil_system->mutex);
+
+	space = fil_space_get_by_id(space_id);
+
+	corrupt = (space != NULL && space->is_corrupt) ? TRUE : FALSE;
+
+	mutex_exit(&fil_system->mutex);
+
+	return(corrupt);
+}
+
+/* Sets the is_corrupt flag of the space with the given id, under the
+fil_system mutex. Does nothing if no such space exists. */
+void
+fil_space_set_corrupt(
+/*==================*/
+	ulint	space_id)
+{
+	fil_space_t*	target;
+
+	mutex_enter(&fil_system->mutex);
+
+	target = fil_space_get_by_id(space_id);
+
+	if (target != NULL) {
+		target->is_corrupt = TRUE;
+	}
+
+	mutex_exit(&fil_system->mutex);
+}
+
diff --git a/storage/xtradb/fsp/fsp0fsp.c b/storage/xtradb/fsp/fsp0fsp.c
new file mode 100644
index 00000000000..cd28186109f
--- /dev/null
+++ b/storage/xtradb/fsp/fsp0fsp.c
@@ -0,0 +1,4346 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fsp/fsp0fsp.c
+File space management
+
+Created 11/29/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fsp0fsp.h"
+
+#ifdef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#include "buf0buf.h"
+#include "fil0fil.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "page0page.h"
+#include "page0zip.h"
+#ifdef UNIV_HOTBACKUP
+# include "fut0lst.h"
+#else /* UNIV_HOTBACKUP */
+# include "sync0sync.h"
+# include "fut0fut.h"
+# include "srv0srv.h"
+# include "ibuf0ibuf.h"
+# include "btr0btr.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "log0log.h"
+#endif /* UNIV_HOTBACKUP */
+#include "dict0mem.h"
+#include "trx0sys.h"
+
+#define FSP_HEADER_OFFSET	FIL_PAGE_DATA	/* Offset of the space header
+						within a file page */
+
+/* The data structures in files are defined just as byte strings in C */
+typedef	byte	fsp_header_t;
+typedef	byte	xdes_t;
+
+/*			SPACE HEADER
+			============
+
+File space header data structure: this data structure is contained in the
+first page of a space. The space for this header is reserved in every extent
+descriptor page, but used only in the first. */
+
+/*-------------------------------------*/
+#define FSP_SPACE_ID		0	/* space id */
+#define FSP_NOT_USED		4	/* this field contained a value up to
+					which we know that the modifications
+					in the database have been flushed to
+					the file space; not used now */
+#define	FSP_SIZE		8	/* Current size of the space in
+					pages */
+#define	FSP_FREE_LIMIT		12	/* Minimum page number for which the
+					free list has not been initialized:
+					the pages >= this limit are, by
+					definition, free; note that in a
+					single-table tablespace where size
+					< 64 pages, this number is 64, i.e.,
+					we have initialized the space
+					about the first extent, but have not
+					physically allocated those pages to the
+					file */
+#define	FSP_SPACE_FLAGS		16	/* table->flags & ~DICT_TF_COMPACT */
+#define	FSP_FRAG_N_USED		20	/* number of used pages in the
+					FSP_FREE_FRAG list */
+#define	FSP_FREE		24	/* list of free extents */
+#define	FSP_FREE_FRAG		(24 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents not
+					belonging to any segment */
+#define	FSP_FULL_FRAG		(24 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents not belonging
+					to any segment */
+#define FSP_SEG_ID		(24 + 3 * FLST_BASE_NODE_SIZE)
+					/* 8 bytes which give the first unused
+					segment id */
+#define FSP_SEG_INODES_FULL	(32 + 3 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where all the segment inode
+					slots are reserved */
+#define FSP_SEG_INODES_FREE	(32 + 4 * FLST_BASE_NODE_SIZE)
+					/* list of pages containing segment
+					headers, where not all the segment
+					header slots are reserved */
+/*-------------------------------------*/
+/* File space header size */
+#define	FSP_HEADER_SIZE		(32 + 5 * FLST_BASE_NODE_SIZE)
+
+#define	FSP_FREE_ADD		4	/* this many free extents are added
+					to the free list from above
+					FSP_FREE_LIMIT at a time */
+
+/*			FILE SEGMENT INODE
+			==================
+
+Segment inode which is created for each segment in a tablespace. NOTE: in
+purge we assume that a segment having only one currently used page can be
+freed in a few steps, so that the freeing cannot fill the file buffer with
+bufferfixed file pages. */
+
+typedef	byte	fseg_inode_t;
+
+#define FSEG_INODE_PAGE_NODE	FSEG_PAGE_DATA
+					/* the list node for linking
+					segment inode pages */
+
+#define FSEG_ARR_OFFSET		(FSEG_PAGE_DATA + FLST_NODE_SIZE)
+/*-------------------------------------*/
+#define	FSEG_ID			0	/* 8 bytes of segment id: if this is
+					ut_dulint_zero, it means that the
+					header is unused */
+#define FSEG_NOT_FULL_N_USED	8
+					/* number of used segment pages in
+					the FSEG_NOT_FULL list */
+#define	FSEG_FREE		12
+					/* list of free extents of this
+					segment */
+#define	FSEG_NOT_FULL		(12 + FLST_BASE_NODE_SIZE)
+					/* list of partially free extents */
+#define	FSEG_FULL		(12 + 2 * FLST_BASE_NODE_SIZE)
+					/* list of full extents */
+#define	FSEG_MAGIC_N		(12 + 3 * FLST_BASE_NODE_SIZE)
+					/* magic number used in debugging */
+#define	FSEG_FRAG_ARR		(16 + 3 * FLST_BASE_NODE_SIZE)
+					/* array of individual pages
+					belonging to this segment in fsp
+					fragment extent lists */
+#define FSEG_FRAG_ARR_N_SLOTS	(FSP_EXTENT_SIZE / 2)
+					/* number of slots in the array for
+					the fragment pages */
+#define	FSEG_FRAG_SLOT_SIZE	4	/* a fragment page slot contains its
+					page number within space, FIL_NULL
+					means that the slot is not in use */
+/*-------------------------------------*/
+#define FSEG_INODE_SIZE					\
+	(16 + 3 * FLST_BASE_NODE_SIZE			\
+	 + FSEG_FRAG_ARR_N_SLOTS * FSEG_FRAG_SLOT_SIZE)
+
+/* Number of segment inodes which fit on a single page. zip_size is the
+compressed page size in bytes; when it is 0, UNIV_PAGE_SIZE is used
+instead. The argument is parenthesized so that any expression can be
+passed safely. NOTE: the argument is evaluated twice, so it must not have
+side effects. */
+#define FSP_SEG_INODES_PER_PAGE(zip_size)		\
+	((((zip_size) ? (zip_size) : UNIV_PAGE_SIZE)	\
+	  - FSEG_ARR_OFFSET - 10) / FSEG_INODE_SIZE)
+
+#define FSEG_MAGIC_N_VALUE	97937874
+
+#define	FSEG_FILLFACTOR		8	/* If this value is x, then if
+					the number of unused but reserved
+					pages in a segment is less than
+					reserved pages * 1/x, and there are
+					at least FSEG_FRAG_LIMIT used pages,
+					then we allow a new empty extent to
+					be added to the segment in
+					fseg_alloc_free_page. Otherwise, we
+					use unused pages of the segment. */
+
+#define FSEG_FRAG_LIMIT		FSEG_FRAG_ARR_N_SLOTS
+					/* If the segment has >= this many
+					used pages, it may be expanded by
+					allocating extents to the segment;
+					until that only individual fragment
+					pages are allocated from the space */
+
+#define	FSEG_FREE_LIST_LIMIT	40	/* If the reserved size of a segment
+					is at least this many extents, we
+					allow extents to be put to the free
+					list of the extent: at most
+					FSEG_FREE_LIST_MAX_LEN many */
+#define	FSEG_FREE_LIST_MAX_LEN	4
+
+
+/*			EXTENT DESCRIPTOR
+			=================
+
+File extent descriptor data structure: contains bits to tell which pages in
+the extent are free and which contain old tuple version to clean. */
+
+/*-------------------------------------*/
+#define	XDES_ID			0	/* The identifier of the segment
+					to which this extent belongs */
+#define XDES_FLST_NODE		8	/* The list node data structure
+					for the descriptors */
+#define	XDES_STATE		(FLST_NODE_SIZE + 8)
+					/* contains state information
+					of the extent */
+#define	XDES_BITMAP		(FLST_NODE_SIZE + 12)
+					/* Descriptor bitmap of the pages
+					in the extent */
+/*-------------------------------------*/
+
+#define	XDES_BITS_PER_PAGE	2	/* How many bits are there per page */
+#define	XDES_FREE_BIT		0	/* Index of the bit which tells if
+					the page is free */
+#define	XDES_CLEAN_BIT		1	/* NOTE: currently not used!
+					Index of the bit which tells if
+					there are old versions of tuples
+					on the page */
+/* States of a descriptor */
+#define	XDES_FREE		1	/* extent is in free list of space */
+#define	XDES_FREE_FRAG		2	/* extent is in free fragment list of
+					space */
+#define	XDES_FULL_FRAG		3	/* extent is in full fragment list of
+					space */
+#define	XDES_FSEG		4	/* extent belongs to a segment */
+
+/* File extent data structure size in bytes. */
+#define	XDES_SIZE							\
+	(XDES_BITMAP + UT_BITS_IN_BYTES(FSP_EXTENT_SIZE * XDES_BITS_PER_PAGE))
+
+/* Offset of the descriptor array on a descriptor page */
+#define	XDES_ARR_OFFSET		(FSP_HEADER_OFFSET + FSP_HEADER_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/* Flag to indicate if we have printed the tablespace full error. */
+static ibool fsp_tbs_full_error_printed = FALSE;
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. */
+static
+void
+fsp_free_extent(
+/*============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: page offset in the extent */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Frees an extent of a segment to the space free list. */
+static
+void
+fseg_free_extent(
+/*=============*/
+	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: page offset in the extent */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how
+many pages are currently used.
+@return	number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+	fseg_inode_t*	header,	/*!< in: segment inode */
+	ulint*		used,	/*!< out: number of pages used (not
+				more than reserved) */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/********************************************************************//**
+Marks a page used. The page must reside within the extents of the given
+segment. */
+static
+void
+fseg_mark_page_used(
+/*================*/
+	fseg_inode_t*	seg_inode,/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Returns the first extent descriptor for a segment. We think of the extent
+lists of the segment catenated in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE.
+@return	the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Puts new extents to the free list if
+there are free extents above the free limit. If an extent happens
+to contain an extent descriptor page, the extent is put to
+the FSP_FREE_FRAG list with the page marked as used. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+	ibool		init_space,	/*!< in: TRUE if this is a single-table
+					tablespace and we are only initing
+					the tablespace's first extent
+					descriptor page and ibuf bitmap page;
+					then we do not allocate more extents */
+	ulint		space,		/*!< in: space */
+	fsp_header_t*	header,		/*!< in: space header */
+	mtr_t*		mtr);		/*!< in: mtr */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return	the allocated page number, FIL_NULL if no page could be allocated */
+static
+ulint
+fseg_alloc_free_page_low(
+/*=====================*/
+	ulint		space,	/*!< in: space */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction, /*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the 4-byte FSP_SIZE field of the space header located at
+FSP_HEADER_OFFSET on the given page.
+@return	tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+	page_t*	page)	/*!< in: header page (page 0 in the tablespace) */
+{
+	const byte*	size_field = page + FSP_HEADER_OFFSET + FSP_SIZE;
+
+	return(mach_read_from_4(size_field));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Fetches page 0 of a space with an x-latch and returns a pointer to the
+space header stored on it.
+@return	pointer to the space header, page x-locked; NULL if
+srv_pass_corrupt_table is set and the page could not be fetched */
+UNIV_INLINE
+fsp_header_t*
+fsp_get_space_header(
+/*=================*/
+	ulint	id,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	hdr_block;
+	fsp_header_t*	sp_header;
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	ut_ad(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(id || !zip_size);
+
+	hdr_block = buf_page_get(id, zip_size, 0, RW_X_LATCH, mtr);
+
+	if (!hdr_block && srv_pass_corrupt_table) {
+		/* A missing block is tolerated only in this mode */
+		return(NULL);
+	}
+	ut_a(hdr_block);
+
+	sp_header = FSP_HEADER_OFFSET + buf_block_get_frame(hdr_block);
+	buf_block_dbg_add_level(hdr_block, SYNC_FSP_PAGE);
+
+	/* Debug checks: the header must agree with the caller's arguments */
+	ut_ad(id == mach_read_from_4(FSP_SPACE_ID + sp_header));
+	ut_ad(zip_size == dict_table_flags_to_zip_size(
+		      mach_read_from_4(FSP_SPACE_FLAGS + sp_header)));
+	return(sp_header);
+}
+
+/**********************************************************************//**
+Gets a descriptor bit of a page. Each page of the extent owns
+XDES_BITS_PER_PAGE consecutive bits in the bitmap at XDES_BITMAP.
+@return	value of the requested bit; for XDES_FREE_BIT, TRUE if free */
+UNIV_INLINE
+ibool
+xdes_get_bit(
+/*=========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	ulint		bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ulint		offset,	/*!< in: page offset within extent:
+				0 ... FSP_EXTENT_SIZE - 1 */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	bit_no;
+	ulint	bitmap_byte;
+
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+	ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
+	ut_ad(offset < FSP_EXTENT_SIZE);
+
+	/* Position of the wanted bit counted from the bitmap start */
+	bit_no = bit + XDES_BITS_PER_PAGE * offset;
+
+	bitmap_byte = mtr_read_ulint(descr + XDES_BITMAP + bit_no / 8,
+				     MLOG_1BYTE, mtr);
+
+	return(ut_bit_get_nth(bitmap_byte, bit_no % 8));
+}
+
+/**********************************************************************//**
+Sets a descriptor bit of a page. The bitmap byte holding the bit is read,
+modified and written back through the mini-transaction log. */
+UNIV_INLINE
+void
+xdes_set_bit(
+/*=========*/
+	xdes_t*	descr,	/*!< in: descriptor */
+	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ulint	offset,	/*!< in: page offset within extent:
+			0 ... FSP_EXTENT_SIZE - 1 */
+	ibool	val,	/*!< in: bit value */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint	bit_no;
+	byte*	bitmap_ptr;
+	ulint	new_byte;
+
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+	ut_ad((bit == XDES_FREE_BIT) || (bit == XDES_CLEAN_BIT));
+	ut_ad(offset < FSP_EXTENT_SIZE);
+
+	/* Position of the wanted bit counted from the bitmap start */
+	bit_no = bit + XDES_BITS_PER_PAGE * offset;
+
+	bitmap_ptr = descr + XDES_BITMAP + bit_no / 8;
+
+	new_byte = ut_bit_set_nth(mtr_read_ulint(bitmap_ptr, MLOG_1BYTE,
+						 mtr),
+				  bit_no % 8, val);
+
+	mlog_write_ulint(bitmap_ptr, new_byte, MLOG_1BYTE, mtr);
+}
+
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. The scan starts at
+hint and moves upward to the end of the extent; if nothing is found it
+continues from the start of the extent up to (but excluding) hint.
+@return	bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit(
+/*==========*/
+	xdes_t*	descr,	/*!< in: descriptor */
+	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ibool	val,	/*!< in: desired bit value */
+	ulint	hint,	/*!< in: hint of which bit position would be desirable */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint	pos;
+
+	ut_ad(descr && mtr);
+	ut_ad(val <= TRUE);
+	ut_ad(hint < FSP_EXTENT_SIZE);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	/* First pass: hint ... FSP_EXTENT_SIZE - 1 */
+	pos = hint;
+
+	while (pos < FSP_EXTENT_SIZE) {
+		if (xdes_get_bit(descr, bit, pos, mtr) == val) {
+
+			return(pos);
+		}
+
+		pos++;
+	}
+
+	/* Second pass (wrap-around): 0 ... hint - 1 */
+	pos = 0;
+
+	while (pos < hint) {
+		if (xdes_get_bit(descr, bit, pos, mtr) == val) {
+
+			return(pos);
+		}
+
+		pos++;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Looks for a descriptor bit having the desired value. Scans the extent in
+a direction opposite to xdes_find_bit: from hint down to 0, and then from
+the last position of the extent down to hint + 1.
+@return	bit index of the bit, ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+xdes_find_bit_downward(
+/*===================*/
+	xdes_t*	descr,	/*!< in: descriptor */
+	ulint	bit,	/*!< in: XDES_FREE_BIT or XDES_CLEAN_BIT */
+	ibool	val,	/*!< in: desired bit value */
+	ulint	hint,	/*!< in: hint of which bit position would be desirable */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint	pos;
+
+	ut_ad(descr && mtr);
+	ut_ad(val <= TRUE);
+	ut_ad(hint < FSP_EXTENT_SIZE);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	/* First pass: hint ... 0. pos is unsigned, so it is tested
+	before being decremented to avoid wrapping below zero. */
+	pos = hint + 1;
+
+	while (pos-- > 0) {
+		if (xdes_get_bit(descr, bit, pos, mtr) == val) {
+
+			return(pos);
+		}
+	}
+
+	/* Second pass (wrap-around): FSP_EXTENT_SIZE - 1 ... hint + 1 */
+	pos = FSP_EXTENT_SIZE - 1;
+
+	while (pos > hint) {
+		if (xdes_get_bit(descr, bit, pos, mtr) == val) {
+
+			return(pos);
+		}
+
+		pos--;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Returns the number of used pages in a descriptor, i.e. the number of
+pages in the extent whose XDES_FREE_BIT is not set.
+@return	number of pages used */
+UNIV_INLINE
+ulint
+xdes_get_n_used(
+/*============*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	page_i;
+	ulint	n_used	= 0;
+
+	ut_ad(descr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	for (page_i = 0; page_i < FSP_EXTENT_SIZE; page_i++) {
+		if (!xdes_get_bit(descr, XDES_FREE_BIT, page_i, mtr)) {
+			n_used++;
+		}
+	}
+
+	return(n_used);
+}
+
+/**********************************************************************//**
+Tells whether an extent contains no used pages, as counted by
+xdes_get_n_used().
+@return	TRUE if totally free */
+UNIV_INLINE
+ibool
+xdes_is_free(
+/*=========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	return(xdes_get_n_used(descr, mtr) == 0 ? TRUE : FALSE);
+}
+
+/**********************************************************************//**
+Tells whether an extent contains no free pages, i.e. whether
+xdes_get_n_used() equals FSP_EXTENT_SIZE.
+@return	TRUE if full */
+UNIV_INLINE
+ibool
+xdes_is_full(
+/*=========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	return(xdes_get_n_used(descr, mtr) == FSP_EXTENT_SIZE
+	       ? TRUE : FALSE);
+}
+
+/**********************************************************************//**
+Sets the state of an xdes by writing the 4-byte XDES_STATE field through
+the mini-transaction log. */
+UNIV_INLINE
+void
+xdes_set_state(
+/*===========*/
+	xdes_t*	descr,	/*!< in/out: descriptor */
+	ulint	state,	/*!< in: state to set */
+	mtr_t*	mtr)	/*!< in: mtr handle */
+{
+	xdes_t*	state_field;
+
+	ut_ad(descr && mtr);
+	ut_ad(state >= XDES_FREE);
+	ut_ad(state <= XDES_FSEG);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	state_field = descr + XDES_STATE;
+
+	mlog_write_ulint(state_field, state, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Gets the state of an xdes by reading the 4-byte XDES_STATE field.
+@return	state */
+UNIV_INLINE
+ulint
+xdes_get_state(
+/*===========*/
+	const xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	const xdes_t*	state_field;
+	ulint		cur_state;
+
+	ut_ad(descr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+
+	state_field = descr + XDES_STATE;
+	cur_state = mtr_read_ulint(state_field, MLOG_4BYTES, mtr);
+
+	/* Debug check: the value must lie in XDES_FREE ... XDES_FSEG.
+	The unsigned subtraction makes 0 wrap around and fail as well. */
+	ut_ad(cur_state - 1 < XDES_FSEG);
+
+	return(cur_state);
+}
+
+/**********************************************************************//**
+Inits an extent descriptor to the free and clean state: every bit of the
+bitmap is set to 1 and the state is set to XDES_FREE. */
+UNIV_INLINE
+void
+xdes_init(
+/*======*/
+	xdes_t*	descr,	/*!< in: descriptor */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	xdes_t*	field;
+	xdes_t*	bitmap_end;
+
+	ut_ad(descr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, descr, MTR_MEMO_PAGE_X_FIX));
+	ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0);
+
+	bitmap_end = descr + XDES_SIZE;
+
+	/* Fill the bitmap four bytes at a time */
+	for (field = descr + XDES_BITMAP; field < bitmap_end; field += 4) {
+		mlog_write_ulint(field, 0xFFFFFFFFUL, MLOG_4BYTES, mtr);
+	}
+
+	xdes_set_state(descr, XDES_FREE, mtr);
+}
+
+/********************************************************************//**
+Calculates the page where the descriptor of a page resides: the page
+offset rounded down to a multiple of the page size (zip_size for
+compressed pages, UNIV_PAGE_SIZE otherwise).
+@return	descriptor page offset */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_page(
+/*======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset)		/*!< in: page offset */
+{
+#ifndef DOXYGEN /* Doxygen gets confused of these */
+# if PAGE_ZIP_MIN_SIZE <= XDES_ARR_OFFSET \
+		+ (PAGE_ZIP_MIN_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE
+#  error
+# endif
+#endif /* !DOXYGEN */
+	ut_a(UNIV_PAGE_SIZE > XDES_ARR_OFFSET + (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE) * XDES_SIZE);
+	ut_ad(ut_is_2pow(zip_size));
+
+	if (zip_size) {
+		/* Compressed page: the descriptor array must fit on it */
+		ut_ad(zip_size > XDES_ARR_OFFSET
+		      + (zip_size / FSP_EXTENT_SIZE) * XDES_SIZE);
+
+		return(ut_2pow_round(offset, zip_size));
+	}
+
+	return(ut_2pow_round(offset, UNIV_PAGE_SIZE));
+}
+
+/********************************************************************//**
+Calculates the descriptor index within a descriptor page: the remainder
+of the page offset modulo the page size (zip_size for compressed pages,
+UNIV_PAGE_SIZE otherwise), divided by FSP_EXTENT_SIZE.
+@return	descriptor index */
+UNIV_INLINE
+ulint
+xdes_calc_descriptor_index(
+/*=======================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	offset)		/*!< in: page offset */
+{
+	ut_ad(ut_is_2pow(zip_size));
+
+	if (zip_size) {
+		return(ut_2pow_remainder(offset, zip_size) / FSP_EXTENT_SIZE);
+	}
+
+	return(ut_2pow_remainder(offset, UNIV_PAGE_SIZE)
+	       / FSP_EXTENT_SIZE);
+}
+
+/********************************************************************//**
+Gets pointer to the extent descriptor of a page. The page where the extent
+descriptor resides is x-locked. If the page offset is equal to the free limit
+of the space, adds new extents from above the free limit to the space free
+list, if not free limit == space size. This adding is necessary to make the
+descriptor defined, as they are uninitialized above the free limit.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space, if the offset exceeds the free limit, or if
+srv_pass_corrupt_table is set and the descriptor page could not be
+fetched */
+UNIV_INLINE
+xdes_t*
+xdes_get_descriptor_with_space_hdr(
+/*===============================*/
+	fsp_header_t*	sp_header,/*!< in/out: space header, x-latched */
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: page offset;
+				if equal to the free limit,
+				we try to add new extents to
+				the space free list */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	limit;
+	ulint	size;
+	ulint	zip_size;
+	ulint	descr_page_no;
+	page_t*	descr_page;
+
+	ut_ad(mtr);
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	ut_ad(mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(mtr, sp_header, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_offset(sp_header) == FSP_HEADER_OFFSET);
+	/* Read free limit and space size */
+	limit = mach_read_from_4(sp_header + FSP_FREE_LIMIT);
+	size  = mach_read_from_4(sp_header + FSP_SIZE);
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(sp_header + FSP_SPACE_FLAGS));
+
+	/* If offset is >= size or > limit, return NULL */
+
+	if ((offset >= size) || (offset > limit)) {
+
+		return(NULL);
+	}
+
+	/* If offset is == limit, fill free list of the space. */
+
+	if (offset == limit) {
+		fsp_fill_free_list(FALSE, space, sp_header, mtr);
+	}
+
+	descr_page_no = xdes_calc_descriptor_page(zip_size, offset);
+
+	if (descr_page_no == 0) {
+		/* It is on the space header page */
+
+		descr_page = page_align(sp_header);
+	} else {
+		buf_block_t*	block;
+
+		block = buf_page_get(space, zip_size, descr_page_no,
+				     RW_X_LATCH, mtr);
+
+		/* Same handling as in fsp_get_space_header() and
+		xdes_get_descriptor(): a missing block is tolerated only
+		when srv_pass_corrupt_table is set. Without this check a
+		NULL block would be used to compute the descriptor
+		address below. */
+		if (srv_pass_corrupt_table && !block) {
+			return(NULL);
+		}
+		ut_a(block);
+
+		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+		descr_page = buf_block_get_frame(block);
+	}
+
+	return(descr_page + XDES_ARR_OFFSET
+	       + XDES_SIZE * xdes_calc_descriptor_index(zip_size, offset));
+}
+
+/********************************************************************//**
+Gets pointer to the extent descriptor of a page. Page 0 of the space is
+fetched with an x-latch to obtain the space header, and the lookup is then
+delegated to xdes_get_descriptor_with_space_hdr(). The page where the
+extent descriptor resides is x-locked. If the page offset is equal to
+the free limit of the space, adds new extents from above the free limit
+to the space free list, if not free limit == space size. This adding
+is necessary to make the descriptor defined, as they are uninitialized
+above the free limit.
+@return pointer to the extent descriptor, NULL if the page does not
+exist in the space or if the offset exceeds the free limit; also NULL
+if srv_pass_corrupt_table is set and page 0 could not be fetched */
+static
+xdes_t*
+xdes_get_descriptor(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	offset,	/*!< in: page offset; if equal to the free limit,
+			we try to add new extents to the space free list */
+	mtr_t*	mtr)	/*!< in: mtr handle */
+{
+	buf_block_t*	hdr_block;
+
+	hdr_block = buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+
+	if (!hdr_block && srv_pass_corrupt_table) {
+		/* A missing block is tolerated only in this mode */
+		return(NULL);
+	}
+	ut_a(hdr_block);
+
+	buf_block_dbg_add_level(hdr_block, SYNC_FSP_PAGE);
+
+	return(xdes_get_descriptor_with_space_hdr(
+		       FSP_HEADER_OFFSET + buf_block_get_frame(hdr_block),
+		       space, offset, mtr));
+}
+
+/********************************************************************//**
+Gets pointer to the extent descriptor if the file address of the
+descriptor list node is known. The list node lies XDES_FLST_NODE bytes
+after the start of its descriptor, so that amount is subtracted from the
+node pointer. The page where the extent descriptor resides is x-locked.
+@return	pointer to the extent descriptor */
+UNIV_INLINE
+xdes_t*
+xdes_lst_get_descriptor(
+/*====================*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fil_addr_t	lst_node,/*!< in: file address of the list node
+				contained in the descriptor */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	byte*	node_ptr;
+
+	ut_ad(mtr);
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+
+	node_ptr = fut_get_ptr(space, zip_size, lst_node, RW_X_LATCH, mtr);
+
+	return(node_ptr - XDES_FLST_NODE);
+}
+
+/********************************************************************//**
+Returns page offset of the first page in extent described by a descriptor.
+It is computed from the page number of the descriptor page and the index
+of the descriptor within the array starting at XDES_ARR_OFFSET.
+@return	offset of the first page in extent */
+UNIV_INLINE
+ulint
+xdes_get_offset(
+/*============*/
+	xdes_t*	descr)	/*!< in: extent descriptor */
+{
+	ulint	descr_page_no;
+	ulint	descr_index;
+
+	ut_ad(descr);
+
+	descr_page_no = page_get_page_no(page_align(descr));
+	descr_index = (page_offset(descr) - XDES_ARR_OFFSET) / XDES_SIZE;
+
+	return(descr_page_no + descr_index * FSP_EXTENT_SIZE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored. The
+uncompressed frame is zero-filled and then stamped with the page number
+and the space id of the block. If the block has a compressed page, that
+is zero-filled too and the same two fields are copied into it. */
+static
+void
+fsp_init_file_page_low(
+/*===================*/
+	buf_block_t*	block)	/*!< in: pointer to a page */
+{
+	page_t*		frame	= buf_block_get_frame(block);
+	page_zip_des_t*	zip	= buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+	block->check_index_page_at_flush = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+
+	memset(frame, 0, UNIV_PAGE_SIZE);
+
+	if (UNIV_LIKELY_NULL(zip)) {
+		memset(zip->data, 0, page_zip_get_size(zip));
+	}
+
+	mach_write_to_4(frame + FIL_PAGE_OFFSET,
+			buf_block_get_page_no(block));
+	mach_write_to_4(frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			buf_block_get_space(block));
+
+	if (UNIV_LIKELY_NULL(zip)) {
+		/* Mirror the two header fields to the compressed page */
+		memcpy(zip->data + FIL_PAGE_OFFSET,
+		       frame + FIL_PAGE_OFFSET, 4);
+		memcpy(zip->data + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+		       frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 4);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inits a file page whose prior contents should be ignored, and writes an
+MLOG_INIT_FILE_PAGE initial log record for it to the mini-transaction. */
+static
+void
+fsp_init_file_page(
+/*===============*/
+	buf_block_t*	block,	/*!< in: pointer to a page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*	frame;
+
+	fsp_init_file_page_low(block);
+
+	frame = buf_block_get_frame(block);
+
+	mlog_write_initial_log_record(frame, MLOG_INIT_FILE_PAGE, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of a file page init. No bytes are consumed from
+the buffer; if a block is given, the page is initialized with
+fsp_init_file_page_low().
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr __attribute__((unused)), /*!< in: buffer end */
+	buf_block_t*	block)	/*!< in: block or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (block != NULL) {
+		fsp_init_file_page_low(block);
+	}
+
+	/* Returned unchanged: nothing was read from the buffer */
+	return(ptr);
+}
+
+/**********************************************************************//**
+Initializes the fsp system. The body is currently empty, so calling this
+function has no effect. */
+UNIV_INTERN
+void
+fsp_init(void)
+/*==========*/
+{
+	/* Does nothing at the moment */
+}
+
+/**********************************************************************//**
+Writes the space id and the tablespace flags to a tablespace header.
+The fields are written directly with mach_write_to_4(), i.e. without
+a mini-transaction. This function is used past the buffer pool when we
+in fil0fil.c create a new single-table tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/*!< in/out: first page in the space */
+	ulint	space_id,	/*!< in: space id */
+	ulint	flags)		/*!< in: tablespace flags (FSP_SPACE_FLAGS):
+				0, or table->flags if newer than COMPACT */
+{
+	byte*	sp_header = page + FSP_HEADER_OFFSET;
+
+	/* The tablespace flags (FSP_SPACE_FLAGS) should be 0 for
+	ROW_FORMAT=COMPACT (table->flags == DICT_TF_COMPACT) and
+	ROW_FORMAT=REDUNDANT (table->flags == 0).  For any other
+	format, the tablespace flags should equal table->flags. */
+	ut_a(flags != DICT_TF_COMPACT);
+
+	mach_write_to_4(sp_header + FSP_SPACE_ID, space_id);
+	mach_write_to_4(sp_header + FSP_SPACE_FLAGS, flags);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Initializes the space header of a new created space and creates also the
+insert buffer tree root if space is TRX_SYS_SPACE or
+TRX_DOUBLEWRITE_SPACE. All header fields are written through the
+mini-transaction log. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,		/*!< in: space id */
+	ulint	size,		/*!< in: current size in blocks */
+	mtr_t*	mtr)		/*!< in: mini-transaction handle */
+{
+	fsp_header_t*	header;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		flags;
+	ulint		zip_size;
+
+	ut_ad(mtr);
+
+	/* X-lock the tablespace latch; flags is passed by address and is
+	used below as the tablespace flags (presumably filled in by
+	fil_space_get_latch() -- confirm in fil0fil.c) */
+	mtr_x_lock(fil_space_get_latch(space, &flags), mtr);
+
+	zip_size = dict_table_flags_to_zip_size(flags);
+	block = buf_page_create(space, 0, zip_size, mtr);
+	/* NOTE(review): the return value is not used; `block' from
+	buf_page_create() is what the rest of the function works on.
+	Presumably this call is made only to x-latch page 0. */
+	buf_page_get(space, zip_size, 0, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	/* The prior contents of the file page should be ignored */
+
+	fsp_init_file_page(block, mtr);
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_FSP_HDR,
+			 MLOG_2BYTES, mtr);
+
+	header = FSP_HEADER_OFFSET + page;
+
+	/* Scalar header fields */
+	mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_NOT_USED, 0, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_SPACE_FLAGS, flags,
+			 MLOG_4BYTES, mtr);
+	mlog_write_ulint(header + FSP_FRAG_N_USED, 0, MLOG_4BYTES, mtr);
+
+	/* The five list base nodes stored in the header */
+	flst_init(header + FSP_FREE, mtr);
+	flst_init(header + FSP_FREE_FRAG, mtr);
+	flst_init(header + FSP_FULL_FRAG, mtr);
+	flst_init(header + FSP_SEG_INODES_FULL, mtr);
+	flst_init(header + FSP_SEG_INODES_FREE, mtr);
+
+	/* First unused segment id: high word 0, low word 1 */
+	mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), mtr);
+	if (space == TRX_SYS_SPACE || space == TRX_DOUBLEWRITE_SPACE) {
+		/* init_space == FALSE: free extents may be added up to
+		the usual limit before the ibuf tree is created */
+		fsp_fill_free_list(FALSE, space, header, mtr);
+		btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF,
+			   space, 0, ut_dulint_add(DICT_IBUF_ID_MIN, space),
+			   dict_ind_redundant, mtr);
+	} else {
+		/* init_space == TRUE: only the first extent descriptor
+		page and ibuf bitmap page are initialized */
+		fsp_fill_free_list(TRUE, space, header, mtr);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace. The id stored in
+the space header is cross-checked against the id stored in the page
+header; a mismatch is reported on stderr.
+@return	space id, ULINT_UNDEFINED if the two stored ids differ */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	const ulint	hdr_space_id
+		= mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID);
+	const ulint	page_space_id
+		= mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	if (page_space_id == hdr_space_id) {
+		/* Both copies agree */
+		return(page_space_id);
+	}
+
+	fprintf(stderr,
+		"InnoDB: Error: space id in fsp header %lu,"
+		" but in the page header %lu\n",
+		(ulong) hdr_space_id, (ulong) page_space_id);
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Reads the space flags (FSP_SPACE_FLAGS) from the first page of a
+tablespace. In debug builds it is asserted that page points to the
+start of a page.
+@return	flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	const byte*	flags_field;
+
+	ut_ad(!page_offset(page));
+
+	flags_field = FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page;
+
+	return(mach_read_from_4(flags_field));
+}
+
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace by
+converting the stored tablespace flags.
+@return	compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+	const page_t*	page)	/*!< in: first page of a tablespace */
+{
+	return(dict_table_flags_to_zip_size(fsp_header_get_flags(page)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Increases the space size field (FSP_SIZE) of a space. The tablespace
+latch is x-locked in mtr before the header is read and rewritten. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	size_inc,/*!< in: size increment in pages */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space_flags;
+	ulint		zip_size;
+	ulint		old_size;
+	fsp_header_t*	space_hdr;
+
+	ut_ad(mtr);
+
+	mtr_x_lock(fil_space_get_latch(space, &space_flags), mtr);
+
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+	space_hdr = fsp_get_space_header(space, zip_size, mtr);
+
+	old_size = mtr_read_ulint(space_hdr + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(space_hdr + FSP_SIZE, old_size + size_inc,
+			 MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Gets the current free limit of the system tablespace.  The free limit
+means the place of the first page which has never been put to the
+free list for allocation.  The space above that address is initialized
+to zero.  The value is also passed on to
+log_fsp_current_free_limit_set_and_checkpoint() before the local
+mini-transaction is committed.
+@return	free limit in megabytes */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void)
+/*===========================*/
+{
+	mtr_t		mtr;
+	fsp_header_t*	sys_hdr;
+	ulint		limit_mb;
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+	sys_hdr = fsp_get_space_header(0, 0, &mtr);
+
+	/* The header stores the limit in pages: convert to megabytes */
+	limit_mb = mtr_read_ulint(sys_hdr + FSP_FREE_LIMIT, MLOG_4BYTES, &mtr)
+		/ ((1024 * 1024) / UNIV_PAGE_SIZE);
+
+	log_fsp_current_free_limit_set_and_checkpoint(limit_mb);
+
+	mtr_commit(&mtr);
+
+	return(limit_mb);
+}
+
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header.  If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files.  If there is an auto-extending data file,
+this can be smaller.  The header is read inside a local mini-transaction
+that is committed before returning.
+@return	size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void)
+/*================================*/
+{
+	mtr_t		mtr;
+	fsp_header_t*	sys_hdr;
+	ulint		n_pages;
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(fil_space_get_latch(0, NULL), &mtr);
+
+	sys_hdr = fsp_get_space_header(0, 0, &mtr);
+	n_pages = mtr_read_ulint(sys_hdr + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+	mtr_commit(&mtr);
+
+	return(n_pages);
+}
+
+/***********************************************************************//**
+Tries to extend a single-table tablespace so that a page would fit in the
+data file. The size reported back by fil_extend_space_to_desired_size()
+is written to FSP_SIZE whether or not the extension succeeded.
+@return	TRUE if success */
+static
+ibool
+fsp_try_extend_data_file_with_pages(
+/*================================*/
+	ulint		space,		/*!< in: space */
+	ulint		page_no,	/*!< in: page number */
+	fsp_header_t*	header,		/*!< in: space header */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	size;
+	ulint	size_after_extend;
+	ibool	extended;
+
+	ut_a(space != 0);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	ut_a(page_no >= size);
+
+	/* Ask for a file big enough to hold page_no; size_after_extend
+	receives the resulting space size in pages, which may be less
+	than requested if we ran out of disk space */
+	extended = fil_extend_space_to_desired_size(&size_after_extend,
+						    space, page_no + 1);
+
+	mlog_write_ulint(header + FSP_SIZE, size_after_extend,
+			 MLOG_4BYTES, mtr);
+
+	return(extended);
+}
+
+/***********************************************************************//**
+Tries to extend the last data file of a tablespace if it is auto-extending.
+For space 0 the increment is SRV_AUTO_EXTEND_INCREMENT, limited by
+srv_last_file_size_max when that is nonzero. For other spaces the file is
+first extended to one extent (scaled by UNIV_PAGE_SIZE / zip_size for
+compressed tablespaces); after that the increment is one extent while the
+size is below 32 extents and FSP_FREE_ADD extents otherwise. The size
+written to FSP_SIZE is the size reported by
+fil_extend_space_to_desired_size() aligned down to a full megabyte.
+NOTE(review): the result of the final fil_extend_space_to_desired_size()
+call is stored in 'success' but not examined; TRUE is returned and the
+caller has to look at *actual_increase.
+NOTE(review): when srv_last_file_size_max is smaller than the size of the
+last data file, only an error is printed; the subtraction that follows is
+still carried out and its result is capped at SRV_AUTO_EXTEND_INCREMENT.
+Confirm that this is intended.
+@return	FALSE if space 0 is not auto-extending or if a small single-table
+tablespace could not be extended to one extent; TRUE otherwise */
+static
+ibool
+fsp_try_extend_data_file(
+/*=====================*/
+	ulint*		actual_increase,/*!< out: actual increase in pages, where
+					we measure the tablespace size from
+					what the header field says; it may be
+					the actual file size rounded down to
+					megabyte */
+	ulint		space,		/*!< in: space */
+	fsp_header_t*	header,		/*!< in: space header */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	size;
+	ulint	zip_size;
+	ulint	new_size;
+	ulint	old_size;
+	ulint	size_increase;
+	ulint	actual_size;
+	ibool	success;
+
+	*actual_increase = 0;
+
+	if (space == 0 && !srv_auto_extend_last_data_file) {
+
+		/* We print the error message only once to avoid
+		spamming the error log. Note that we don't need
+		to reset the flag to FALSE as dealing with this
+		error requires server restart. */
+		if (fsp_tbs_full_error_printed == FALSE) {
+			fprintf(stderr,
+				"InnoDB: Error: Data file(s) ran"
+				" out of space.\n"
+				"Please add another data file or"
+				" use \'autoextend\' for the last"
+				" data file.\n");
+			fsp_tbs_full_error_printed = TRUE;
+		}
+		return(FALSE);
+	}
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(header + FSP_SPACE_FLAGS));
+
+	old_size = size;
+
+	if (space == 0) {
+		if (!srv_last_file_size_max) {
+			size_increase = SRV_AUTO_EXTEND_INCREMENT;
+		} else {
+			if (srv_last_file_size_max
+			    < srv_data_file_sizes[srv_n_data_files - 1]) {
+
+				fprintf(stderr,
+					"InnoDB: Error: Last data file size"
+					" is %lu, max size allowed %lu\n",
+					(ulong) srv_data_file_sizes[
+						srv_n_data_files - 1],
+					(ulong) srv_last_file_size_max);
+			}
+
+			size_increase = srv_last_file_size_max
+				- srv_data_file_sizes[srv_n_data_files - 1];
+			if (size_increase > SRV_AUTO_EXTEND_INCREMENT) {
+				size_increase = SRV_AUTO_EXTEND_INCREMENT;
+			}
+		}
+	} else {
+		/* We extend single-table tablespaces first one extent
+		at a time, but for bigger tablespaces more. It is not
+		enough to extend always by one extent, because some
+		extents are frag page extents. */
+		ulint	extent_size;	/*!< one megabyte, in pages */
+
+		if (!zip_size) {
+			extent_size = FSP_EXTENT_SIZE;
+		} else {
+			extent_size = FSP_EXTENT_SIZE
+				* UNIV_PAGE_SIZE / zip_size;
+		}
+
+		if (size < extent_size) {
+			/* Let us first extend the file to extent_size.
+			If that fails, report how far FSP_SIZE got and
+			give up. */
+			success = fsp_try_extend_data_file_with_pages(
+				space, extent_size - 1, header, mtr);
+			if (!success) {
+				new_size = mtr_read_ulint(header + FSP_SIZE,
+							  MLOG_4BYTES, mtr);
+
+				*actual_increase = new_size - old_size;
+
+				return(FALSE);
+			}
+
+			size = extent_size;
+		}
+
+		if (size < 32 * extent_size) {
+			size_increase = extent_size;
+		} else {
+			/* Below in fsp_fill_free_list() we assume
+			that we add at most FSP_FREE_ADD extents at
+			a time */
+			size_increase = FSP_FREE_ADD * extent_size;
+		}
+	}
+
+	if (size_increase == 0) {
+
+		return(TRUE);
+	}
+
+	success = fil_extend_space_to_desired_size(&actual_size, space,
+						   size + size_increase);
+	/* We ignore any fragments of a full megabyte when storing the size
+	to the space header: the alignment unit is the number of pages in
+	one megabyte for the page size in use */
+
+	if (!zip_size) {
+		new_size = ut_calc_align_down(actual_size,
+					      (1024 * 1024) / UNIV_PAGE_SIZE);
+	} else {
+		new_size = ut_calc_align_down(actual_size,
+					      (1024 * 1024) / zip_size);
+	}
+	mlog_write_ulint(header + FSP_SIZE, new_size, MLOG_4BYTES, mtr);
+
+	*actual_increase = new_size - old_size;
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Puts new extents to the free list if there are free extents above the free
+limit. If an extent happens to contain an extent descriptor page, the extent
+is put to the FSP_FREE_FRAG list with the page marked as used.
+Before the loop the data file may be extended with
+fsp_try_extend_data_file(): for space 0 when
+srv_auto_extend_last_data_file is set, and for other spaces when init_space
+is FALSE, in both cases only if FSP_SIZE is less than FSP_FREE_LIMIT plus
+FSP_FREE_ADD extents. The loop advances FSP_FREE_LIMIT one extent at a time
+and stops after FSP_FREE_ADD extents have been added to FSP_FREE or when
+the next extent would not fit below FSP_SIZE; extents that go to
+FSP_FREE_FRAG are not counted. With init_space == TRUE the extent starting
+at page 0 is processed even if it does not fit below FSP_SIZE. */
+static
+void
+fsp_fill_free_list(
+/*===============*/
+	ibool		init_space,	/*!< in: TRUE if this is a single-table
+					tablespace and we are only initing
+					the tablespace's first extent
+					descriptor page and ibuf bitmap page;
+					then we do not allocate more extents */
+	ulint		space,		/*!< in: space */
+	fsp_header_t*	header,		/*!< in/out: space header */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	limit;
+	ulint	size;
+	ulint	zip_size;
+	xdes_t*	descr;
+	ulint	count		= 0;
+	ulint	frag_n_used;
+	ulint	actual_increase;
+	ulint	i;
+	mtr_t	ibuf_mtr;
+
+	ut_ad(header && mtr);
+	ut_ad(page_offset(header) == FSP_HEADER_OFFSET);
+
+	/* Check if we can fill free list from above the free list limit */
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+	limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr);
+
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(FSP_SPACE_FLAGS + header));
+	ut_a(ut_is_2pow(zip_size));
+	ut_a(zip_size <= UNIV_PAGE_SIZE);
+	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+
+	if (space == 0 && srv_auto_extend_last_data_file
+	    && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+		/* Try to increase the last data file size; the result is
+		picked up by re-reading FSP_SIZE */
+		fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+		size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+	}
+
+	if (space != 0 && !init_space
+	    && size < limit + FSP_EXTENT_SIZE * FSP_FREE_ADD) {
+
+		/* Try to increase the .ibd file size; the result is
+		picked up by re-reading FSP_SIZE */
+		fsp_try_extend_data_file(&actual_increase, space, header, mtr);
+		size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+	}
+
+	i = limit;
+
+	while ((init_space && i < 1)
+	       || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) {
+
+		ibool	init_xdes;
+		if (zip_size) {
+			init_xdes = ut_2pow_remainder(i, zip_size) == 0;
+		} else {
+			init_xdes = ut_2pow_remainder(i, UNIV_PAGE_SIZE) == 0;
+		}
+
+		mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE,
+				 MLOG_4BYTES, mtr);
+
+		/* Update the free limit info in the log system and make
+		a checkpoint */
+		if (space == 0) {
+			ut_a(!zip_size);
+			log_fsp_current_free_limit_set_and_checkpoint(
+				(i + FSP_EXTENT_SIZE)
+				/ ((1024 * 1024) / UNIV_PAGE_SIZE));
+		}
+
+		if (UNIV_UNLIKELY(init_xdes)) {
+
+			buf_block_t*	block;
+
+			/* We are going to initialize a new descriptor page
+			and a new ibuf bitmap page: the prior contents of the
+			pages should be ignored. The descriptor page is only
+			created here when i > 0; page 0 is not created in
+			this function. */
+
+			if (i > 0) {
+				block = buf_page_create(
+					space, i, zip_size, mtr);
+				buf_page_get(space, zip_size, i,
+					     RW_X_LATCH, mtr);
+				buf_block_dbg_add_level(block,
+							SYNC_FSP_PAGE);
+
+				fsp_init_file_page(block, mtr);
+				mlog_write_ulint(buf_block_get_frame(block)
+						 + FIL_PAGE_TYPE,
+						 FIL_PAGE_TYPE_XDES,
+						 MLOG_2BYTES, mtr);
+			}
+
+			/* Initialize the ibuf bitmap page in a separate
+			mini-transaction because it is low in the latching
+			order, and we must be able to release its latch
+			before returning from the fsp routine */
+
+			mtr_start(&ibuf_mtr);
+
+			block = buf_page_create(space,
+						    i + FSP_IBUF_BITMAP_OFFSET,
+						    zip_size, &ibuf_mtr);
+			buf_page_get(space, zip_size,
+				     i + FSP_IBUF_BITMAP_OFFSET,
+				     RW_X_LATCH, &ibuf_mtr);
+			buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+			fsp_init_file_page(block, &ibuf_mtr);
+
+			ibuf_bitmap_page_init(block, &ibuf_mtr);
+
+			mtr_commit(&ibuf_mtr);
+		}
+
+		descr = xdes_get_descriptor_with_space_hdr(header, space, i,
+							   mtr);
+		xdes_init(descr, mtr);
+
+//#if UNIV_PAGE_SIZE % FSP_EXTENT_SIZE
+//# error "UNIV_PAGE_SIZE % FSP_EXTENT_SIZE != 0"
+//#endif
+//#if PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE
+//# error "PAGE_ZIP_MIN_SIZE % FSP_EXTENT_SIZE != 0"
+//#endif
+
+		if (UNIV_UNLIKELY(init_xdes)) {
+
+			/* The first page in the extent is a descriptor page
+			and the second is an ibuf bitmap page: mark them
+			used. These two pages are also the reason why
+			FSP_FRAG_N_USED grows by 2 below. */
+
+			xdes_set_bit(descr, XDES_FREE_BIT, 0, FALSE, mtr);
+			xdes_set_bit(descr, XDES_FREE_BIT,
+				     FSP_IBUF_BITMAP_OFFSET, FALSE, mtr);
+			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+
+			flst_add_last(header + FSP_FREE_FRAG,
+				      descr + XDES_FLST_NODE, mtr);
+			frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+						     MLOG_4BYTES, mtr);
+			mlog_write_ulint(header + FSP_FRAG_N_USED,
+					 frag_n_used + 2, MLOG_4BYTES, mtr);
+		} else {
+			flst_add_last(header + FSP_FREE,
+				      descr + XDES_FLST_NODE, mtr);
+			count++;
+		}
+
+		i += FSP_EXTENT_SIZE;
+	}
+}
+
+/**********************************************************************//**
+Allocates a new free extent. The extent containing hint is used if it is
+in state XDES_FREE; otherwise the first extent of the FSP_FREE list is
+taken, after one attempt to refill that list if it was empty. The chosen
+extent is removed from the FSP_FREE list.
+@return	extent descriptor, NULL if cannot be allocated */
+static
+xdes_t*
+fsp_alloc_free_extent(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	hint,	/*!< in: hint of which extent would be desirable: any
+			page offset in the extent goes; the hint must not
+			be > FSP_FREE_LIMIT */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	fsp_header_t*	space_hdr;
+	xdes_t*		extent;
+	fil_addr_t	free_addr;
+
+	ut_ad(mtr);
+
+	space_hdr = fsp_get_space_header(space, zip_size, mtr);
+
+	extent = xdes_get_descriptor_with_space_hdr(space_hdr, space, hint,
+						    mtr);
+
+	if (!extent || xdes_get_state(extent, mtr) != XDES_FREE) {
+		/* The hinted extent cannot be used: fall back to the
+		head of the free list */
+		free_addr = flst_get_first(space_hdr + FSP_FREE, mtr);
+
+		if (fil_addr_is_null(free_addr)) {
+			/* The list is empty: try to refill it once */
+			fsp_fill_free_list(FALSE, space, space_hdr, mtr);
+
+			free_addr = flst_get_first(space_hdr + FSP_FREE, mtr);
+
+			if (fil_addr_is_null(free_addr)) {
+
+				return(NULL);	/* No free extents left */
+			}
+		}
+
+		extent = xdes_lst_get_descriptor(space, zip_size, free_addr,
+						 mtr);
+	}
+
+	flst_remove(space_hdr + FSP_FREE, extent + XDES_FLST_NODE, mtr);
+
+	return(extent);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a space. The page is marked as used.
+The page is taken from the extent that contains hint if that extent is in
+state XDES_FREE_FRAG, otherwise from the first extent of the FSP_FREE_FRAG
+list, and if that list is empty from a newly allocated free extent which is
+moved to FSP_FREE_FRAG. If the chosen page lies at or beyond FSP_SIZE, the
+single-table tablespace is extended to hold it. FSP_FRAG_N_USED is
+incremented, and when the extent becomes full it is moved to FSP_FULL_FRAG
+and FSP_EXTENT_SIZE is subtracted from FSP_FRAG_N_USED. Finally the page is
+created in the buffer pool, x-latched and initialized with
+fsp_init_file_page().
+@return	the page offset, FIL_NULL if no page could be allocated */
+static
+ulint
+fsp_alloc_free_page(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	hint,	/*!< in: hint of which page would be desirable */
+	mtr_t*	mtr)	/*!< in: mtr handle */
+{
+	fsp_header_t*	header;
+	fil_addr_t	first;
+	xdes_t*		descr;
+	buf_block_t*	block;
+	ulint		free;
+	ulint		frag_n_used;
+	ulint		page_no;
+	ulint		space_size;
+	ibool		success;
+
+	ut_ad(mtr);
+
+	header = fsp_get_space_header(space, zip_size, mtr);
+
+	/* Get the hinted descriptor */
+	descr = xdes_get_descriptor_with_space_hdr(header, space, hint, mtr);
+
+	if (descr && (xdes_get_state(descr, mtr) == XDES_FREE_FRAG)) {
+		/* Ok, we can take this extent */
+	} else {
+		/* Else take the first extent in free_frag list */
+		first = flst_get_first(header + FSP_FREE_FRAG, mtr);
+
+		if (fil_addr_is_null(first)) {
+			/* There are no partially full fragments: allocate
+			a free extent and add it to the FREE_FRAG list. NOTE
+			that the allocation may have as a side-effect that an
+			extent containing a descriptor page is added to the
+			FREE_FRAG list. But we will allocate our page from
+			the free extent anyway. */
+
+			descr = fsp_alloc_free_extent(space, zip_size,
+						      hint, mtr);
+
+			if (descr == NULL) {
+				/* No free space left */
+
+				return(FIL_NULL);
+			}
+
+			xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+			flst_add_last(header + FSP_FREE_FRAG,
+				      descr + XDES_FLST_NODE, mtr);
+		} else {
+			descr = xdes_lst_get_descriptor(space, zip_size,
+							first, mtr);
+		}
+
+		/* Reset the hint: the search for a free page in the
+		extent then starts from its first page */
+		hint = 0;
+	}
+
+	/* Now we have in descr an extent with at least one free page. Look
+	for a free page in the extent. If none is found, the descriptor
+	contents are inconsistent: dump the surrounding bytes and abort. */
+
+	free = xdes_find_bit(descr, XDES_FREE_BIT, TRUE,
+			     hint % FSP_EXTENT_SIZE, mtr);
+	if (free == ULINT_UNDEFINED) {
+
+		ut_print_buf(stderr, ((byte*)descr) - 500, 1000);
+		putc('\n', stderr);
+
+		ut_error;
+	}
+
+	page_no = xdes_get_offset(descr) + free;
+
+	space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (space_size <= page_no) {
+		/* It must be that we are extending a single-table tablespace
+		whose size is still < 64 pages */
+
+		ut_a(space != 0);
+		if (page_no >= FSP_EXTENT_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Error: trying to extend a"
+				" single-table tablespace %lu\n"
+				"InnoDB: by single page(s) though the"
+				" space size %lu. Page no %lu.\n",
+				(ulong) space, (ulong) space_size,
+				(ulong) page_no);
+			return(FIL_NULL);
+		}
+		success = fsp_try_extend_data_file_with_pages(space, page_no,
+							      header, mtr);
+		if (!success) {
+			/* No disk space left */
+			return(FIL_NULL);
+		}
+	}
+
+	xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr);
+
+	/* Update the FRAG_N_USED field */
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     mtr);
+	frag_n_used++;
+	mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used, MLOG_4BYTES,
+			 mtr);
+	if (xdes_is_full(descr, mtr)) {
+		/* The fragment is full: move it to another list. The
+		field written just above is rewritten here with the
+		pages of this extent subtracted. */
+		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		xdes_set_state(descr, XDES_FULL_FRAG, mtr);
+
+		flst_add_last(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+			      mtr);
+		mlog_write_ulint(header + FSP_FRAG_N_USED,
+				 frag_n_used - FSP_EXTENT_SIZE, MLOG_4BYTES,
+				 mtr);
+	}
+
+	/* Initialize the allocated page to the buffer pool, so that it can
+	be obtained immediately with buf_page_get without need for a disk
+	read. */
+
+	buf_page_create(space, page_no, zip_size, mtr);
+
+	block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+	/* Prior contents of the page should be ignored */
+	fsp_init_file_page(block, mtr);
+
+	return(page_no);
+}
+
+/**********************************************************************//**
+Frees a single page of a space. The page is marked as free and clean.
+The extent of the page is expected to be in state XDES_FREE_FRAG or
+XDES_FULL_FRAG. For state XDES_FREE, and also when the XDES_FREE_BIT of
+the page is already set, a diagnostic is printed and the function returns
+without changing anything; for any other unexpected state ut_error is
+invoked. Otherwise FSP_FRAG_N_USED is adjusted, an extent that was full is
+moved from FSP_FULL_FRAG to FSP_FREE_FRAG, and an extent that has become
+completely free is removed from FSP_FREE_FRAG and handed to
+fsp_free_extent(). */
+static
+void
+fsp_free_page(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page,	/*!< in: page offset */
+	mtr_t*	mtr)	/*!< in: mtr handle */
+{
+	fsp_header_t*	header;
+	xdes_t*		descr;
+	ulint		state;
+	ulint		frag_n_used;
+
+	ut_ad(mtr);
+
+	/* fprintf(stderr, "Freeing page %lu in space %lu\n", page, space); */
+
+	header = fsp_get_space_header(space, zip_size, mtr);
+
+	descr = xdes_get_descriptor_with_space_hdr(header, space, page, mtr);
+
+	state = xdes_get_state(descr, mtr);
+
+	if (state != XDES_FREE_FRAG && state != XDES_FULL_FRAG) {
+		fprintf(stderr,
+			"InnoDB: Error: File space extent descriptor"
+			" of page %lu has state %lu\n",
+			(ulong) page,
+			(ulong) state);
+		fputs("InnoDB: Dump of descriptor: ", stderr);
+		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		putc('\n', stderr);
+
+		if (state == XDES_FREE) {
+			/* We put here some fault tolerance: if the page
+			is already free, return without doing anything! */
+
+			return;
+		}
+
+		ut_error;
+	}
+
+	if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+		fprintf(stderr,
+			"InnoDB: Error: File space extent descriptor"
+			" of page %lu says it is free\n"
+			"InnoDB: Dump of descriptor: ", (ulong) page);
+		ut_print_buf(stderr, ((byte*)descr) - 50, 200);
+		putc('\n', stderr);
+
+		/* We put here some fault tolerance: if the page
+		is already free, return without doing anything! */
+
+		return;
+	}
+
+	xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+	xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     mtr);
+	if (state == XDES_FULL_FRAG) {
+		/* The fragment was full: move it to another list. The
+		remaining FSP_EXTENT_SIZE - 1 used pages of this extent
+		are added to FSP_FRAG_N_USED. */
+		flst_remove(header + FSP_FULL_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		xdes_set_state(descr, XDES_FREE_FRAG, mtr);
+		flst_add_last(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			      mtr);
+		mlog_write_ulint(header + FSP_FRAG_N_USED,
+				 frag_n_used + FSP_EXTENT_SIZE - 1,
+				 MLOG_4BYTES, mtr);
+	} else {
+		ut_a(frag_n_used > 0);
+		mlog_write_ulint(header + FSP_FRAG_N_USED, frag_n_used - 1,
+				 MLOG_4BYTES, mtr);
+	}
+
+	if (xdes_is_free(descr, mtr)) {
+		/* The extent has become free: move it to another list */
+		flst_remove(header + FSP_FREE_FRAG, descr + XDES_FLST_NODE,
+			    mtr);
+		fsp_free_extent(space, zip_size, page, mtr);
+	}
+}
+
+/**********************************************************************//**
+Returns an extent to the free list of a space. The extent descriptor is
+re-initialized and appended to the FSP_FREE list. An extent that is
+already in state XDES_FREE is treated as a fatal inconsistency. */
+static
+void
+fsp_free_extent(
+/*============*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page,	/*!< in: page offset in the extent */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	fsp_header_t*	space_hdr;
+	xdes_t*		extent;
+
+	ut_ad(mtr);
+
+	space_hdr = fsp_get_space_header(space, zip_size, mtr);
+	extent = xdes_get_descriptor_with_space_hdr(space_hdr, space, page,
+						    mtr);
+
+	if (xdes_get_state(extent, mtr) == XDES_FREE) {
+		/* Dump the bytes around the descriptor before aborting */
+		ut_print_buf(stderr, (byte*) extent - 500, 1000);
+		putc('\n', stderr);
+
+		ut_error;
+	}
+
+	xdes_init(extent, mtr);
+	flst_add_last(space_hdr + FSP_FREE, extent + XDES_FLST_NODE, mtr);
+}
+
+/**********************************************************************//**
+Returns the nth inode slot on an inode page. The slots form an array of
+FSEG_INODE_SIZE byte entries starting at FSEG_ARR_OFFSET on the page.
+@return	segment inode */
+UNIV_INLINE
+fseg_inode_t*
+fsp_seg_inode_page_get_nth_inode(
+/*=============================*/
+	page_t*	page,	/*!< in: segment inode page */
+	ulint	i,	/*!< in: inode index on page */
+	ulint	zip_size __attribute__((unused)),
+			/*!< in: compressed page size, or 0 */
+	mtr_t*	mtr __attribute__((unused)))
+			/*!< in: mini-transaction handle */
+{
+	fseg_inode_t*	first_slot;
+
+	ut_ad(i < FSP_SEG_INODES_PER_PAGE(zip_size));
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+
+	first_slot = page + FSEG_ARR_OFFSET;
+
+	return(first_slot + FSEG_INODE_SIZE * i);
+}
+
+/**********************************************************************//**
+Looks for a used segment inode on a segment inode page. A slot counts as
+used when its FSEG_ID field is nonzero; the lowest such index is returned.
+@return	segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_used(
+/*=========================*/
+	page_t*	page,	/*!< in: segment inode page */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	fseg_inode_t*	inode;
+	ulint		slot	= 0;
+
+	while (slot < FSP_SEG_INODES_PER_PAGE(zip_size)) {
+
+		inode = fsp_seg_inode_page_get_nth_inode(
+			page, slot, zip_size, mtr);
+
+		if (!ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) {
+			/* A nonzero segment id marks the slot as used */
+
+			ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+			      == FSEG_MAGIC_N_VALUE);
+			return(slot);
+		}
+
+		slot++;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Looks for an unused segment inode on a segment inode page. A slot counts
+as unused when its FSEG_ID field is zero. If page is NULL the function
+returns ULINT_UNDEFINED when srv_pass_corrupt_table is set and asserts
+otherwise.
+@return	segment inode index, or ULINT_UNDEFINED if not found */
+static
+ulint
+fsp_seg_inode_page_find_free(
+/*=========================*/
+	page_t*	page,	/*!< in: segment inode page */
+	ulint	i,	/*!< in: search forward starting from this index */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	fseg_inode_t*	inode;
+
+	if (!page && srv_pass_corrupt_table) {
+		return(ULINT_UNDEFINED);
+	}
+	ut_a(page);
+
+	while (i < FSP_SEG_INODES_PER_PAGE(zip_size)) {
+
+		inode = fsp_seg_inode_page_get_nth_inode(
+			page, i, zip_size, mtr);
+
+		if (ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))) {
+			/* A zero segment id marks the slot as unused */
+
+			return(i);
+		}
+
+		ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+		      == FSEG_MAGIC_N_VALUE);
+
+		i++;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode page. The page is typed as
+FIL_PAGE_INODE, the FSEG_ID of every inode slot on it is set to zero, and
+the page is appended to the FSP_SEG_INODES_FREE list of the space.
+@return	TRUE if could be allocated */
+static
+ibool
+fsp_alloc_seg_inode_page(
+/*=====================*/
+	fsp_header_t*	space_header,	/*!< in: space header */
+	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+{
+	buf_block_t*	inode_block;
+	page_t*		inode_page;
+	ulint		inode_page_no;
+	ulint		space_id;
+	ulint		zip_size;
+	ulint		slot;
+
+	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+	space_id = page_get_space_id(page_align(space_header));
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+
+	inode_page_no = fsp_alloc_free_page(space_id, zip_size, 0, mtr);
+
+	if (inode_page_no == FIL_NULL) {
+		/* No page could be allocated */
+
+		return(FALSE);
+	}
+
+	inode_block = buf_page_get(space_id, zip_size, inode_page_no,
+				   RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(inode_block, SYNC_FSP_PAGE);
+
+	inode_block->check_index_page_at_flush = FALSE;
+
+	inode_page = buf_block_get_frame(inode_block);
+
+	mlog_write_ulint(inode_page + FIL_PAGE_TYPE, FIL_PAGE_INODE,
+			 MLOG_2BYTES, mtr);
+
+	/* Mark every slot unused, in ascending slot order */
+	slot = 0;
+
+	while (slot < FSP_SEG_INODES_PER_PAGE(zip_size)) {
+
+		mlog_write_dulint(
+			fsp_seg_inode_page_get_nth_inode(
+				inode_page, slot, zip_size, mtr)
+			+ FSEG_ID,
+			ut_dulint_zero, mtr);
+
+		slot++;
+	}
+
+	flst_add_last(space_header + FSP_SEG_INODES_FREE,
+		      inode_page + FSEG_INODE_PAGE_NODE, mtr);
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Allocates a new file segment inode. The inode is taken from the first page
+of the FSP_SEG_INODES_FREE list; a new inode page is allocated first if
+that list is empty. If the returned slot was the last unused one on its
+page, the page is moved to the FSP_SEG_INODES_FULL list.
+@return	segment inode, or NULL if not enough space (NULL is also
+returned if the inode page frame is NULL and srv_pass_corrupt_table is
+set) */
+static
+fseg_inode_t*
+fsp_alloc_seg_inode(
+/*================*/
+	fsp_header_t*	space_header,	/*!< in: space header */
+	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+{
+	fseg_inode_t*	inode;
+	page_t*		page;
+	buf_block_t*	inode_block;
+	ulint		inode_page_no;
+	ulint		space_id;
+	ulint		zip_size;
+	ulint		n;
+
+	ut_ad(page_offset(space_header) == FSP_HEADER_OFFSET);
+
+	if (flst_get_len(space_header + FSP_SEG_INODES_FREE, mtr) == 0
+	    && !fsp_alloc_seg_inode_page(space_header, mtr)) {
+		/* No inode page with unused slots exists and a new one
+		could not be allocated */
+
+		return(NULL);
+	}
+
+	inode_page_no = flst_get_first(space_header + FSP_SEG_INODES_FREE,
+				       mtr).page;
+
+	zip_size = dict_table_flags_to_zip_size(
+		mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+	space_id = page_get_space_id(page_align(space_header));
+
+	inode_block = buf_page_get(space_id, zip_size, inode_page_no,
+				   RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(inode_block, SYNC_FSP_PAGE);
+
+	page = buf_block_get_frame(inode_block);
+
+	if (!page && srv_pass_corrupt_table) {
+		return(NULL);
+	}
+	ut_a(page);
+
+	n = fsp_seg_inode_page_find_free(page, 0, zip_size, mtr);
+
+	ut_a(n != ULINT_UNDEFINED);
+
+	inode = fsp_seg_inode_page_get_nth_inode(page, n, zip_size, mtr);
+
+	if (fsp_seg_inode_page_find_free(page, n + 1, zip_size, mtr)
+	    == ULINT_UNDEFINED) {
+		/* There are no other unused headers left on the page: move it
+		to another list */
+
+		flst_remove(space_header + FSP_SEG_INODES_FREE,
+			    page + FSEG_INODE_PAGE_NODE, mtr);
+
+		flst_add_last(space_header + FSP_SEG_INODES_FULL,
+			      page + FSEG_INODE_PAGE_NODE, mtr);
+	}
+
+	ut_ad(ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID))
+	      || mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+	return(inode);
+}
+
+/**********************************************************************//**
+Frees a file segment inode. The FSEG_ID of the inode is set to zero and
+its FSEG_MAGIC_N field is overwritten. If the inode page had no unused
+slot before, it is first moved from the FSP_SEG_INODES_FULL list to the
+FSP_SEG_INODES_FREE list; if no used slot remains afterwards, the page is
+removed from that list and freed. */
+static
+void
+fsp_free_seg_inode(
+/*===============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	page_t*		inode_page;
+	fsp_header_t*	space_hdr;
+	ibool		page_was_full;
+
+	inode_page = page_align(inode);
+
+	space_hdr = fsp_get_space_header(space, zip_size, mtr);
+
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	/* This must be evaluated before the slot is cleared below */
+	page_was_full = (fsp_seg_inode_page_find_free(inode_page, 0,
+						      zip_size, mtr)
+			 == ULINT_UNDEFINED);
+
+	if (page_was_full) {
+		/* Move the page to another list */
+
+		flst_remove(space_hdr + FSP_SEG_INODES_FULL,
+			    inode_page + FSEG_INODE_PAGE_NODE, mtr);
+
+		flst_add_last(space_hdr + FSP_SEG_INODES_FREE,
+			      inode_page + FSEG_INODE_PAGE_NODE, mtr);
+	}
+
+	mlog_write_dulint(inode + FSEG_ID, ut_dulint_zero, mtr);
+	mlog_write_ulint(inode + FSEG_MAGIC_N, 0xfa051ce3, MLOG_4BYTES, mtr);
+
+	if (fsp_seg_inode_page_find_used(inode_page, zip_size, mtr)
+	    != ULINT_UNDEFINED) {
+		/* Other used headers remain on the page: keep it */
+
+		return;
+	}
+
+	/* There are no other used headers left on the page: free it */
+
+	flst_remove(space_hdr + FSP_SEG_INODES_FREE,
+		    inode_page + FSEG_INODE_PAGE_NODE, mtr);
+
+	fsp_free_page(space, zip_size, page_get_page_no(inode_page), mtr);
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched. The inode address (page
+number and byte offset) is read from the segment header.
+@return	segment inode, page x-latched; NULL if the inode is free (its
+FSEG_ID is zero), or if the inode pointer could not be obtained and
+srv_pass_corrupt_table is set */
+static
+fseg_inode_t*
+fseg_inode_try_get(
+/*===============*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	fseg_inode_t*	inode;
+	fil_addr_t	inode_addr;
+
+	inode_addr.page = mach_read_from_4(header + FSEG_HDR_PAGE_NO);
+	inode_addr.boffset = mach_read_from_2(header + FSEG_HDR_OFFSET);
+	ut_ad(space == mach_read_from_4(header + FSEG_HDR_SPACE));
+
+	inode = fut_get_ptr(space, zip_size, inode_addr, RW_X_LATCH, mtr);
+
+	if (!inode && srv_pass_corrupt_table) {
+		return(NULL);
+	}
+	ut_a(inode);
+
+	if (UNIV_UNLIKELY
+	    (ut_dulint_is_zero(mach_read_from_8(inode + FSEG_ID)))) {
+		/* A zero segment id means the inode slot is free */
+
+		return(NULL);
+	}
+
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+
+	return(inode);
+}
+
+/**********************************************************************//**
+Returns the file segment inode, page x-latched. Unlike
+fseg_inode_try_get(), a NULL result is accepted only when
+srv_pass_corrupt_table is set; otherwise it fails an assertion.
+@return	segment inode, page x-latched */
+static
+fseg_inode_t*
+fseg_inode_get(
+/*===========*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	fseg_inode_t*	inode;
+
+	inode = fseg_inode_try_get(header, space, zip_size, mtr);
+
+	ut_a(srv_pass_corrupt_table || inode);
+
+	return(inode);
+}
+
+/**********************************************************************//**
+Reads the page number stored in the nth slot of the fragment page array
+of a segment inode.
+@return	page number, FIL_NULL if not in use */
+UNIV_INLINE
+ulint
+fseg_get_nth_frag_page_no(
+/*======================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		n,	/*!< in: slot index */
+	mtr_t*		mtr __attribute__((unused))) /*!< in: mtr handle */
+{
+	fseg_inode_t*	slot;
+
+	ut_ad(inode && mtr);
+	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	/* Slots are laid out back to back, FSEG_FRAG_SLOT_SIZE bytes each,
+	starting at offset FSEG_FRAG_ARR within the inode */
+	slot = inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE;
+
+	return(mach_read_from_4(slot));
+}
+
+/**********************************************************************//**
+Writes a page number to the nth slot of the fragment page array of a
+segment inode. The write goes through mlog_write_ulint() with the given
+mtr. */
+UNIV_INLINE
+void
+fseg_set_nth_frag_page_no(
+/*======================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		n,	/*!< in: slot index */
+	ulint		page_no,/*!< in: page number to set */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	fseg_inode_t*	slot;
+
+	ut_ad(inode && mtr);
+	ut_ad(n < FSEG_FRAG_ARR_N_SLOTS);
+	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	/* Locate the slot: FSEG_FRAG_SLOT_SIZE bytes per entry */
+	slot = inode + FSEG_FRAG_ARR + n * FSEG_FRAG_SLOT_SIZE;
+
+	mlog_write_ulint(slot, page_no, MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Scans the fragment page array from the start and returns the first slot
+whose page number is FIL_NULL, i.e. which is free.
+@return	slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_free_frag_page_slot(
+/*==========================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	slot;
+
+	ut_ad(inode && mtr);
+
+	for (slot = 0; slot < FSEG_FRAG_ARR_N_SLOTS; slot++) {
+		if (fseg_get_nth_frag_page_no(inode, slot, mtr)
+		    == FIL_NULL) {
+
+			return(slot);
+		}
+	}
+
+	/* Every slot holds a page number */
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Scans the fragment page array from the end towards the start and returns
+the first slot encountered whose page number is not FIL_NULL, i.e. the
+highest-indexed used slot.
+@return	slot index; ULINT_UNDEFINED if none found */
+static
+ulint
+fseg_find_last_used_frag_page_slot(
+/*===============================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	slot;
+
+	ut_ad(inode && mtr);
+
+	/* Count down with an unsigned index: the test runs before the
+	decrement, so the body sees N_SLOTS - 1, ..., 0 */
+	for (slot = FSEG_FRAG_ARR_N_SLOTS; slot-- > 0; ) {
+		if (fseg_get_nth_frag_page_no(inode, slot, mtr)
+		    != FIL_NULL) {
+
+			return(slot);
+		}
+	}
+
+	/* No slot is in use */
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Counts the slots of the fragment page array that hold a page number other
+than FIL_NULL.
+@return	number of fragment pages */
+static
+ulint
+fseg_get_n_frag_pages(
+/*==================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	n_used	= 0;
+	ulint	slot;
+
+	ut_ad(inode && mtr);
+
+	for (slot = 0; slot < FSEG_FRAG_ARR_N_SLOTS; slot++) {
+		n_used += (fseg_get_nth_frag_page_no(inode, slot, mtr)
+			   != FIL_NULL) ? 1 : 0;
+	}
+
+	return(n_used);
+}
+
+/**********************************************************************//**
+Creates a new segment: allocates a segment inode, assigns it the next
+segment id from the space header, initializes its lists and fragment page
+array, and writes the inode location into the segment header at
+byte_offset on the header page.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	ibool	has_done_reservation, /*!< in: TRUE if the caller has already
+			done the reservation for the pages with
+			fsp_reserve_free_extents (at least 2 extents: one for
+			the inode and the other for the segment) then there is
+			no need to do the check for this individual
+			operation */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint		flags;
+	ulint		zip_size;
+	fsp_header_t*	space_header;
+	fseg_inode_t*	inode;
+	dulint		seg_id;
+	buf_block_t*	block	= NULL;
+	fseg_header_t*	header	= NULL;
+	rw_lock_t*	latch;
+	ibool		success;
+	ulint		n_reserved;	/* set only when this function does
+					the reservation itself */
+	ulint		i;
+
+	ut_ad(mtr);
+	ut_ad(byte_offset + FSEG_HEADER_SIZE
+	      <= UNIV_PAGE_SIZE - FIL_PAGE_DATA_END);
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	if (page != 0) {
+		/* The caller supplied the header page: latch it now */
+		block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
+		header = byte_offset + buf_block_get_frame(block);
+	}
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	if (rw_lock_get_x_lock_count(latch) == 1) {
+		/* This thread did not own the latch before this call: free
+		excess pages from the insert buffer free list */
+
+		if (space == IBUF_SPACE_ID) {
+			ibuf_free_excess_pages();
+		}
+	}
+
+	if (!has_done_reservation) {
+		success = fsp_reserve_free_extents(&n_reserved, space, 2,
+						   FSP_NORMAL, mtr);
+		if (!success) {
+			return(NULL);
+		}
+	}
+
+	space_header = fsp_get_space_header(space, zip_size, mtr);
+
+	inode = fsp_alloc_seg_inode(space_header, mtr);
+
+	if (inode == NULL) {
+		/* The segment was not created. When page != 0, block
+		already points to the caller's page; reset it so that the
+		failure is reported as NULL, as documented, instead of
+		looking like a success. */
+		block = NULL;
+
+		goto funct_exit;
+	}
+
+	/* Read the next segment id from space header and increment the
+	value in space header */
+
+	seg_id = mtr_read_dulint(space_header + FSP_SEG_ID, mtr);
+
+	mlog_write_dulint(space_header + FSP_SEG_ID, ut_dulint_add(seg_id, 1),
+			  mtr);
+
+	mlog_write_dulint(inode + FSEG_ID, seg_id, mtr);
+	mlog_write_ulint(inode + FSEG_NOT_FULL_N_USED, 0, MLOG_4BYTES, mtr);
+
+	flst_init(inode + FSEG_FREE, mtr);
+	flst_init(inode + FSEG_NOT_FULL, mtr);
+	flst_init(inode + FSEG_FULL, mtr);
+
+	mlog_write_ulint(inode + FSEG_MAGIC_N, FSEG_MAGIC_N_VALUE,
+			 MLOG_4BYTES, mtr);
+
+	/* Mark every fragment page slot as unused */
+	for (i = 0; i < FSEG_FRAG_ARR_N_SLOTS; i++) {
+		fseg_set_nth_frag_page_no(inode, i, FIL_NULL, mtr);
+	}
+
+	if (page == 0) {
+		/* No header page was given: allocate one from the new
+		segment itself */
+		page = fseg_alloc_free_page_low(space, zip_size,
+						inode, 0, FSP_UP, mtr);
+
+		if (page == FIL_NULL) {
+			/* block is still NULL here, because it is only
+			assigned above when page != 0 */
+
+			fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+			goto funct_exit;
+		}
+
+		block = buf_page_get(space, zip_size, page, RW_X_LATCH, mtr);
+		header = byte_offset + buf_block_get_frame(block);
+		mlog_write_ulint(header - byte_offset + FIL_PAGE_TYPE,
+				 FIL_PAGE_TYPE_SYS, MLOG_2BYTES, mtr);
+	}
+
+	/* Point the segment header at the inode: byte offset within the
+	inode page, inode page number, and space id */
+
+	mlog_write_ulint(header + FSEG_HDR_OFFSET,
+			 page_offset(inode), MLOG_2BYTES, mtr);
+
+	mlog_write_ulint(header + FSEG_HDR_PAGE_NO,
+			 page_get_page_no(page_align(inode)),
+			 MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(header + FSEG_HDR_SPACE, space, MLOG_4BYTES, mtr);
+
+funct_exit:
+	if (!has_done_reservation) {
+		/* Release the reservation made at the top of this function */
+
+		fil_space_release_free_extents(space, n_reserved);
+	}
+
+	return(block);
+}
+
+/**********************************************************************//**
+Creates a new segment. This is fseg_create_general() with
+has_done_reservation = FALSE, so the extent reservation is made by the
+callee.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+
+	block = fseg_create_general(space, page, byte_offset, FALSE, mtr);
+
+	return(block);
+}
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used. Reserved pages are the fragment pages plus FSP_EXTENT_SIZE
+pages for every extent on the FSEG_FREE, FSEG_NOT_FULL and FSEG_FULL lists.
+Used pages are the fragment pages, all pages of the FSEG_FULL extents, and
+the FSEG_NOT_FULL_N_USED counter.
+@return	number of reserved pages */
+static
+ulint
+fseg_n_reserved_pages_low(
+/*======================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint*		used,	/*!< out: number of pages used (not
+				more than reserved) */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	n_frag;
+	ulint	n_free_ext;
+	ulint	n_not_full_ext;
+	ulint	n_full_ext;
+	ulint	n_not_full_used;
+
+	ut_ad(inode && used && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+
+	n_not_full_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+					 MLOG_4BYTES, mtr);
+	n_frag = fseg_get_n_frag_pages(inode, mtr);
+	n_free_ext = flst_get_len(inode + FSEG_FREE, mtr);
+	n_not_full_ext = flst_get_len(inode + FSEG_NOT_FULL, mtr);
+	n_full_ext = flst_get_len(inode + FSEG_FULL, mtr);
+
+	*used = n_not_full_used
+		+ FSP_EXTENT_SIZE * n_full_ext
+		+ n_frag;
+
+	return(n_frag
+	       + FSP_EXTENT_SIZE * n_free_ext
+	       + FSP_EXTENT_SIZE * n_not_full_ext
+	       + FSP_EXTENT_SIZE * n_full_ext);
+}
+
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used. X-locks the tablespace latch in the given mtr, resolves the
+segment inode from the header, and delegates the counting to
+fseg_n_reserved_pages_low().
+@return	number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	ulint*		used,	/*!< out: number of pages used (<= reserved) */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint		space;
+	ulint		flags;
+	ulint		zip_size;
+	rw_lock_t*	latch;
+	fseg_inode_t*	inode;
+
+	/* The space id is read from the page that contains the header */
+	space = page_get_space_id(page_align(header));
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	/* NOTE(review): fseg_inode_get() may return NULL when
+	srv_pass_corrupt_table is set; that case is not handled here --
+	confirm whether callers can reach it */
+	inode = fseg_inode_get(header, space, zip_size, mtr);
+
+	return(fseg_n_reserved_pages_low(inode, used, mtr));
+}
+
+/*********************************************************************//**
+Tries to fill the free list of a segment with consecutive free extents.
+Nothing is done unless the segment has reserved at least
+FSEG_FREE_LIST_LIMIT extents' worth of pages and its free list is empty.
+At most FSEG_FREE_LIST_MAX_LEN extents are added, starting from the extent
+of the hint page and stopping at the first extent that is not free. */
+static
+void
+fseg_fill_free_list(
+/*================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		hint,	/*!< in: hint which extent would be good as
+				the first extent */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	n_added;
+	ulint	reserved;
+	ulint	used;	/* filled in by the call below, not used here */
+
+	ut_ad(inode && mtr);
+	ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+	reserved = fseg_n_reserved_pages_low(inode, &used, mtr);
+
+	if (reserved < FSEG_FREE_LIST_LIMIT * FSP_EXTENT_SIZE
+	    || flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+		/* Either the segment is too small to allow extents in
+		the free list, or the free list is not empty */
+
+		return;
+	}
+
+	for (n_added = 0;
+	     n_added < FSEG_FREE_LIST_MAX_LEN;
+	     n_added++, hint += FSP_EXTENT_SIZE) {
+
+		xdes_t*	descr;
+		dulint	seg_id;
+
+		descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+
+		if (descr == NULL
+		    || xdes_get_state(descr, mtr) != XDES_FREE) {
+
+			/* We cannot allocate the desired extent: stop */
+
+			return;
+		}
+
+		descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+		/* Hand the extent over to this segment: set its state,
+		stamp it with the segment id, and append it to the
+		segment free list */
+		xdes_set_state(descr, XDES_FSEG, mtr);
+
+		seg_id = mtr_read_dulint(inode + FSEG_ID, mtr);
+		ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N)
+		      == FSEG_MAGIC_N_VALUE);
+		mlog_write_dulint(descr + XDES_ID, seg_id, mtr);
+
+		flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+	}
+}
+
+/*********************************************************************//**
+Allocates a free extent for the segment: looks first in the free list of the
+segment, then tries to allocate from the space free list. NOTE that the extent
+returned still resides in the segment free list, it is not yet taken off it!
+@return allocated extent, still placed in the segment free list, NULL
+if could not be allocated */
+static
+xdes_t*
+fseg_alloc_free_extent(
+/*===================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	xdes_t*	descr;
+	dulint	seg_id;
+
+	ut_ad(!((page_offset(inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+		/* Segment free list is not empty: return its first extent */
+
+		return(xdes_lst_get_descriptor(
+			       space, zip_size,
+			       flst_get_first(inode + FSEG_FREE, mtr), mtr));
+	}
+
+	/* Segment free list was empty, allocate from space */
+	descr = fsp_alloc_free_extent(space, zip_size, 0, mtr);
+
+	if (descr == NULL) {
+
+		return(NULL);
+	}
+
+	seg_id = mtr_read_dulint(inode + FSEG_ID, mtr);
+
+	/* Assign the extent to this segment and put it on the segment
+	free list */
+	xdes_set_state(descr, XDES_FSEG, mtr);
+	mlog_write_dulint(descr + XDES_ID, seg_id, mtr);
+	flst_add_last(inode + FSEG_FREE, descr + XDES_FLST_NODE, mtr);
+
+	/* Try to fill the segment free list, starting from the extent
+	that follows the one just allocated */
+	fseg_fill_free_list(inode, space, zip_size,
+			    xdes_get_offset(descr) + FSP_EXTENT_SIZE,
+			    mtr);
+
+	return(descr);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. The candidate page is chosen by the first matching case of
+the numbered if-else chain below (cases 1 to 7); the page is then
+initialized in the buffer pool and marked used in the segment, except for
+individually allocated fragment pages (case 6), which are only recorded in
+the fragment page array.
+@return	the allocated page number, FIL_NULL if no page could be allocated */
+static
+ulint
+fseg_alloc_free_page_low(
+/*=====================*/
+	ulint		space,	/*!< in: space */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction, /*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	fsp_header_t*	space_header;
+	ulint		space_size;
+	dulint		seg_id;
+	ulint		used;
+	ulint		reserved;
+	xdes_t*		descr;		/*!< extent of the hinted page */
+	ulint		ret_page;	/*!< the allocated page offset, FIL_NULL
+					if could not be allocated */
+	xdes_t*		ret_descr;	/*!< the extent of the allocated page */
+	ibool		frag_page_allocated = FALSE;
+	ibool		success;
+	ulint		n;
+
+	ut_ad(mtr);
+	ut_ad((direction >= FSP_UP) && (direction <= FSP_NO_DIR));
+	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+	seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr);
+
+	ut_ad(!ut_dulint_is_zero(seg_id));
+
+	reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr);
+
+	space_header = fsp_get_space_header(space, zip_size, mtr);
+
+	descr = xdes_get_descriptor_with_space_hdr(space_header, space,
+						   hint, mtr);
+	if (descr == NULL) {
+		/* Hint outside space or too high above free limit: reset
+		hint */
+		hint = 0;
+		descr = xdes_get_descriptor(space, zip_size, hint, mtr);
+	}
+
+	/* In the big if-else below we look for ret_page and ret_descr */
+	/*-------------------------------------------------------------*/
+	if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+	    && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID,
+						   mtr), seg_id))
+	    && (xdes_get_bit(descr, XDES_FREE_BIT,
+			     hint % FSP_EXTENT_SIZE, mtr) == TRUE)) {
+
+		/* 1. We can take the hinted page
+		=================================*/
+		ret_descr = descr;
+		ret_page = hint;
+		/*-----------------------------------------------------------*/
+	} else if ((xdes_get_state(descr, mtr) == XDES_FREE)
+		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+		   && (used >= FSEG_FRAG_LIMIT)) {
+
+		/* 2. We allocate the free extent from space and can take
+		=========================================================
+		the hinted page
+		===============*/
+		ret_descr = fsp_alloc_free_extent(space, zip_size, hint, mtr);
+
+		ut_a(ret_descr == descr);
+
+		xdes_set_state(ret_descr, XDES_FSEG, mtr);
+		mlog_write_dulint(ret_descr + XDES_ID, seg_id, mtr);
+		flst_add_last(seg_inode + FSEG_FREE,
+			      ret_descr + XDES_FLST_NODE, mtr);
+
+		/* Try to fill the segment free list */
+		fseg_fill_free_list(seg_inode, space, zip_size,
+				    hint + FSP_EXTENT_SIZE, mtr);
+		ret_page = hint;
+		/*-----------------------------------------------------------*/
+	} else if ((direction != FSP_NO_DIR)
+		   && ((reserved - used) < reserved / FSEG_FILLFACTOR)
+		   && (used >= FSEG_FRAG_LIMIT)
+		   && (!!(ret_descr
+			  = fseg_alloc_free_extent(seg_inode,
+						   space, zip_size, mtr)))) {
+
+		/* 3. We take any free extent (which was already assigned above
+		===============================================================
+		in the if-condition to ret_descr) and take the lowest or
+		========================================================
+		highest page in it, depending on the direction
+		==============================================*/
+		ret_page = xdes_get_offset(ret_descr);
+
+		if (direction == FSP_DOWN) {
+			ret_page += FSP_EXTENT_SIZE - 1;
+		}
+		/*-----------------------------------------------------------*/
+	} else if ((xdes_get_state(descr, mtr) == XDES_FSEG)
+		   && (0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID,
+							  mtr), seg_id))
+		   && (!xdes_is_full(descr, mtr))) {
+
+		/* 4. We can take the page from the same extent as the
+		======================================================
+		hinted page (and the extent already belongs to the
+		==================================================
+		segment)
+		========*/
+		ret_descr = descr;
+		ret_page = xdes_get_offset(ret_descr)
+			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+					hint % FSP_EXTENT_SIZE, mtr);
+		/*-----------------------------------------------------------*/
+	} else if (reserved - used > 0) {
+		/* 5. We take any unused page from the segment
+		==============================================*/
+		fil_addr_t	first;
+
+		if (flst_get_len(seg_inode + FSEG_NOT_FULL, mtr) > 0) {
+			first = flst_get_first(seg_inode + FSEG_NOT_FULL,
+					       mtr);
+		} else if (flst_get_len(seg_inode + FSEG_FREE, mtr) > 0) {
+			first = flst_get_first(seg_inode + FSEG_FREE, mtr);
+		} else {
+			/* reserved > used, yet neither list has an extent
+			with room: the inode is inconsistent */
+			ut_error;
+			return(FIL_NULL);
+		}
+
+		ret_descr = xdes_lst_get_descriptor(space, zip_size,
+						    first, mtr);
+		ret_page = xdes_get_offset(ret_descr)
+			+ xdes_find_bit(ret_descr, XDES_FREE_BIT, TRUE,
+					0, mtr);
+		/*-----------------------------------------------------------*/
+	} else if (used < FSEG_FRAG_LIMIT) {
+		/* 6. We allocate an individual page from the space
+		===================================================*/
+		ret_page = fsp_alloc_free_page(space, zip_size, hint, mtr);
+		ret_descr = NULL;
+
+		frag_page_allocated = TRUE;
+
+		if (ret_page != FIL_NULL) {
+			/* Put the page in the fragment page array of the
+			segment */
+			n = fseg_find_free_frag_page_slot(seg_inode, mtr);
+
+			/* The "not found" value of the slot search is
+			ULINT_UNDEFINED, not FIL_NULL; testing the wrong
+			sentinel would let an invalid slot index reach
+			fseg_set_nth_frag_page_no() */
+			ut_a(n != ULINT_UNDEFINED);
+
+			fseg_set_nth_frag_page_no(seg_inode, n, ret_page,
+						  mtr);
+		}
+		/*-----------------------------------------------------------*/
+	} else {
+		/* 7. We allocate a new extent and take its first page
+		======================================================*/
+		ret_descr = fseg_alloc_free_extent(seg_inode,
+						   space, zip_size, mtr);
+
+		if (ret_descr == NULL) {
+			ret_page = FIL_NULL;
+		} else {
+			ret_page = xdes_get_offset(ret_descr);
+		}
+	}
+
+	if (ret_page == FIL_NULL) {
+		/* Page could not be allocated */
+
+		return(FIL_NULL);
+	}
+
+	if (space != 0) {
+		space_size = fil_space_get_size(space);
+
+		if (space_size <= ret_page) {
+			/* It must be that we are extending a single-table
+			tablespace whose size is still < 64 pages */
+
+			if (ret_page >= FSP_EXTENT_SIZE) {
+				fprintf(stderr,
+					"InnoDB: Error (2): trying to extend"
+					" a single-table tablespace %lu\n"
+					"InnoDB: by single page(s) though"
+					" the space size %lu. Page no %lu.\n",
+					(ulong) space, (ulong) space_size,
+					(ulong) ret_page);
+				return(FIL_NULL);
+			}
+
+			success = fsp_try_extend_data_file_with_pages(
+				space, ret_page, space_header, mtr);
+			if (!success) {
+				/* No disk space left */
+				return(FIL_NULL);
+			}
+		}
+	}
+
+	if (!frag_page_allocated) {
+		/* Initialize the allocated page to buffer pool, so that it
+		can be obtained immediately with buf_page_get without need
+		for a disk read */
+		buf_block_t*	block;
+		/* NOTE(review): this local shadows the zip_size parameter
+		and is recomputed from the flags in the space header;
+		presumably both hold the same value -- confirm */
+		ulint		zip_size = dict_table_flags_to_zip_size(
+			mach_read_from_4(FSP_SPACE_FLAGS + space_header));
+
+		block = buf_page_create(space, ret_page, zip_size, mtr);
+		buf_block_dbg_add_level(block, SYNC_FSP_PAGE);
+
+		if (UNIV_UNLIKELY(block != buf_page_get(space, zip_size,
+							ret_page, RW_X_LATCH,
+							mtr))) {
+			ut_error;
+		}
+
+		/* The prior contents of the page should be ignored */
+		fsp_init_file_page(block, mtr);
+
+		/* At this point we know the extent and the page offset.
+		The extent is still in the appropriate list (FSEG_NOT_FULL
+		or FSEG_FREE), and the page is not yet marked as used. */
+
+		ut_ad(xdes_get_descriptor(space, zip_size, ret_page, mtr)
+		      == ret_descr);
+		ut_ad(xdes_get_bit(ret_descr, XDES_FREE_BIT,
+				   ret_page % FSP_EXTENT_SIZE, mtr) == TRUE);
+
+		fseg_mark_page_used(seg_inode, space, zip_size, ret_page, mtr);
+	}
+
+	buf_reset_check_index_page_at_flush(space, ret_page);
+
+	return(ret_page);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation. It x-locks the tablespace latch, resolves the segment inode,
+reserves 2 free extents unless the caller has already done so, and then
+delegates to fseg_alloc_free_page_low().
+@return	allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+	fseg_header_t*	seg_header,/*!< in: segment header */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction,/*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	ibool		has_done_reservation, /*!< in: TRUE if the caller has
+				already done the reservation for the page
+				with fsp_reserve_free_extents, then there
+				is no need to do the check for this individual
+				page */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint		space;
+	ulint		flags;
+	ulint		zip_size;
+	rw_lock_t*	latch;
+	fseg_inode_t*	inode;
+	ulint		n_reserved;
+	ulint		page_no;
+
+	space = page_get_space_id(page_align(seg_header));
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	if (rw_lock_get_x_lock_count(latch) == 1
+	    && space == IBUF_SPACE_ID) {
+		/* This thread did not own the latch before this call: free
+		excess pages from the insert buffer free list */
+
+		ibuf_free_excess_pages();
+	}
+
+	inode = fseg_inode_get(seg_header, space, zip_size, mtr);
+
+	if (!has_done_reservation
+	    && !fsp_reserve_free_extents(&n_reserved, space, 2,
+					 FSP_NORMAL, mtr)) {
+
+		return(FIL_NULL);
+	}
+
+	page_no = fseg_alloc_free_page_low(space, zip_size,
+					   inode, hint, direction, mtr);
+
+	if (!has_done_reservation) {
+		/* Give back the reservation made above */
+		fil_space_release_free_extents(space, n_reserved);
+	}
+
+	return(page_no);
+}
+
+/**********************************************************************//**
+Allocates a single free page from a segment. This is
+fseg_alloc_free_page_general() with has_done_reservation = FALSE, so the
+extent reservation is made by the callee.
+@return	allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+	fseg_header_t*	seg_header,/*!< in: segment header */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction,/*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	ulint	page_no;
+
+	page_no = fseg_alloc_free_page_general(seg_header, hint, direction,
+					       FALSE, mtr);
+
+	return(page_no);
+}
+
+/**********************************************************************//**
+Checks that the data file of a small single-table tablespace is at least 2
+pages larger than the number of pages used in its first extent, i.e. that
+size >= n_used + 2. If not, tries to extend the data file so that page
+number n_used + 1 is inside it.
+@return	TRUE if size >= n_used + 2 already held, or we were able to extend */
+static
+ibool
+fsp_reserve_free_pages(
+/*===================*/
+	ulint		space,		/*!< in: space id, must be != 0 */
+	fsp_header_t*	space_header,	/*!< in: header of that space,
+					x-latched */
+	ulint		size,		/*!< in: size of the tablespace in pages,
+					must be < FSP_EXTENT_SIZE / 2 */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	xdes_t*	first_extent;
+	ulint	n_used;
+
+	ut_a(space != 0);
+	ut_a(size < FSP_EXTENT_SIZE / 2);
+
+	/* Page 0 lives in the first extent, whose descriptor holds the
+	used-page count we need */
+	first_extent = xdes_get_descriptor_with_space_hdr(space_header,
+							  space, 0, mtr);
+	n_used = xdes_get_n_used(first_extent, mtr);
+
+	ut_a(n_used <= size);
+
+	if (size < n_used + 2) {
+		/* Fewer than 2 spare pages in the file: extend it */
+
+		return(fsp_try_extend_data_file_with_pages(
+			       space, n_used + 1, space_header, mtr));
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Tablespaces whose size is < FSP_EXTENT_SIZE / 2 pages are a special case. In
+this function we would liberally reserve several whole extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the
+table only occupies a few pages. That is why we apply different rules in that
+special case: no extents are reserved and the check is delegated to
+fsp_reserve_free_pages().
+@return	TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+	ulint*	n_reserved,/*!< out: number of extents actually reserved;
+			this is 0 when the tablespace size is
+			< FSP_EXTENT_SIZE / 2 pages, otherwise it is n_ext */
+	ulint	space,	/*!< in: space id */
+	ulint	n_ext,	/*!< in: number of extents to reserve */
+	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	fsp_header_t*	space_header;
+	rw_lock_t*	latch;
+	ulint		n_free_list_ext;
+	ulint		free_limit;
+	ulint		size;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		n_free;
+	ulint		n_free_up;
+	ulint		reserve;
+	ibool		success;
+	ulint		n_pages_added;
+
+	ut_ad(mtr);
+	*n_reserved = n_ext;
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	space_header = fsp_get_space_header(space, zip_size, mtr);
+try_again:
+	/* Re-read the size on every pass: we jump back here after the data
+	file has been extended */
+	size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr);
+
+	if (size < FSP_EXTENT_SIZE / 2) {
+		/* Use different rules for small single-table tablespaces */
+		*n_reserved = 0;
+		return(fsp_reserve_free_pages(space, space_header, size, mtr));
+	}
+
+	n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr);
+
+	free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, mtr);
+
+	/* Below we play safe when counting free extents above the free limit:
+	some of them will contain extent descriptor pages, and therefore
+	will not be free extents */
+
+	n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+	if (n_free_up > 0) {
+		/* Discount one extent outright, then one more for every
+		(page size / FSP_EXTENT_SIZE) extents counted */
+		n_free_up--;
+		if (!zip_size) {
+			n_free_up -= n_free_up
+				/ (UNIV_PAGE_SIZE / FSP_EXTENT_SIZE);
+		} else {
+			n_free_up -= n_free_up
+				/ (zip_size / FSP_EXTENT_SIZE);
+		}
+	}
+
+	n_free = n_free_list_ext + n_free_up;
+
+	if (alloc_type == FSP_NORMAL) {
+		/* We reserve 1 extent + 0.5 % of the space size to undo logs
+		and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+		code is duplicated in the function below! */
+
+		reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else if (alloc_type == FSP_UNDO) {
+		/* We reserve 1 extent + 0.5 % of the space size to cleaning
+		operations */
+
+		reserve = 1 + ((size / FSP_EXTENT_SIZE) * 1) / 200;
+
+		if (n_free <= reserve + n_ext) {
+
+			goto try_to_extend;
+		}
+	} else {
+		/* Cleaning operations get no safety margin subtracted */
+		ut_a(alloc_type == FSP_CLEANING);
+	}
+
+	success = fil_space_reserve_free_extents(space, n_free, n_ext);
+
+	if (success) {
+		return(TRUE);
+	}
+try_to_extend:
+	/* Not enough free extents: try to grow the data file, and redo the
+	whole calculation only if pages were actually added */
+	success = fsp_try_extend_data_file(&n_pages_added, space,
+					   space_header, mtr);
+	if (success && n_pages_added > 0) {
+
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Estimates how much new data can still be inserted into the tablespace
+before it runs out of space. Only free extents are taken into account and
+the safety margin that fsp_reserve_free_extents applies to FSP_NORMAL
+allocations is subtracted. The space header fields are read inside a
+private mini-transaction which is committed before the calculation.
+@return	available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+	ulint	space)	/*!< in: space id */
+{
+	fsp_header_t*	space_header;
+	rw_lock_t*	latch;
+	mtr_t		mtr;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		page_size;	/* zip_size, or UNIV_PAGE_SIZE for
+					uncompressed pages */
+	ulint		size;
+	ulint		free_limit;
+	ulint		n_free_list_ext;
+	ulint		n_free_up;
+	ulint		n_free;
+	ulint		reserve;
+
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	mtr_start(&mtr);
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	mtr_x_lock(latch, &mtr);
+
+	space_header = fsp_get_space_header(space, zip_size, &mtr);
+
+	size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+	n_free_list_ext = flst_get_len(space_header + FSP_FREE, &mtr);
+
+	free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	if (size < FSP_EXTENT_SIZE) {
+		ut_a(space != 0);	/* This must be a single-table
+					tablespace */
+
+		return(0);		/* TODO: count free frag pages and
+					return a value based on that */
+	}
+
+	page_size = zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+	/* Below we play safe when counting free extents above the free limit:
+	some of them will contain extent descriptor pages, and therefore
+	will not be free extents */
+
+	n_free_up = (size - free_limit) / FSP_EXTENT_SIZE;
+
+	if (n_free_up > 0) {
+		n_free_up--;
+		n_free_up -= n_free_up / (page_size / FSP_EXTENT_SIZE);
+	}
+
+	n_free = n_free_list_ext + n_free_up;
+
+	/* We reserve 1 extent + 0.5 % of the space size to undo logs
+	and 1 extent + 0.5 % to cleaning operations; NOTE: this source
+	code is duplicated in the function above! */
+
+	reserve = 2 + ((size / FSP_EXTENT_SIZE) * 2) / 200;
+
+	if (reserve > n_free) {
+		return(0);
+	}
+
+	/* extents * pages per extent * kB per page */
+	return((ullint) (n_free - reserve)
+	       * FSP_EXTENT_SIZE
+	       * (page_size / 1024));
+}
+
+/********************************************************************//**
+Flags one page of a segment as used in its extent descriptor and keeps the
+segment's extent lists and its FSEG_NOT_FULL_N_USED counter in step. The
+page must reside within the extents of the given segment. */
+static
+void
+fseg_mark_page_used(
+/*================*/
+	fseg_inode_t*	seg_inode,/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	xdes_t*	descr;
+	ulint	n_used;
+	ulint	pos;
+
+	ut_ad(seg_inode && mtr);
+	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+
+	descr = xdes_get_descriptor(space, zip_size, page, mtr);
+	pos = page % FSP_EXTENT_SIZE;
+
+	ut_ad(mtr_read_ulint(seg_inode + FSEG_ID, MLOG_4BYTES, mtr)
+	      == mtr_read_ulint(descr + XDES_ID, MLOG_4BYTES, mtr));
+
+	if (xdes_is_free(descr, mtr)) {
+		/* First page taken from this extent: it leaves the
+		segment's FREE list and joins NOT_FULL */
+		flst_remove(seg_inode + FSEG_FREE,
+			    descr + XDES_FLST_NODE, mtr);
+		flst_add_last(seg_inode + FSEG_NOT_FULL,
+			      descr + XDES_FLST_NODE, mtr);
+	}
+
+	ut_ad(xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)
+	      == TRUE);
+
+	/* Clear the free bit: the page is now in use */
+	xdes_set_bit(descr, XDES_FREE_BIT, pos, FALSE, mtr);
+
+	n_used = 1 + mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+				    MLOG_4BYTES, mtr);
+	mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED, n_used,
+			 MLOG_4BYTES, mtr);
+
+	if (!xdes_is_full(descr, mtr)) {
+
+		return;
+	}
+
+	/* The extent just filled up: move it from NOT_FULL to FULL and
+	take its pages out of the NOT_FULL usage counter */
+	flst_remove(seg_inode + FSEG_NOT_FULL, descr + XDES_FLST_NODE, mtr);
+	flst_add_last(seg_inode + FSEG_FULL, descr + XDES_FLST_NODE, mtr);
+
+	mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+			 n_used - FSP_EXTENT_SIZE,
+			 MLOG_4BYTES, mtr);
+}
+
+/**********************************************************************//**
+Frees a single page of a segment.
+
+If the extent holding the page is not in state XDES_FSEG, the page is looked
+up in the inode's fragment page array, its slot is reset to FIL_NULL and the
+page is released with fsp_free_page(). Otherwise the page's bits in the extent
+descriptor are reset, FSEG_NOT_FULL_N_USED is adjusted, and the extent is
+moved between the segment's lists or handed back with fsp_free_extent() once
+it is completely free.
+
+The function hits ut_error if the page is already marked free in the
+descriptor, or if the extent descriptor names a different segment than the
+inode. When srv_pass_corrupt_table is set and no descriptor can be obtained,
+it returns without doing anything. */
+static
+void
+fseg_free_page_low(
+/*===============*/
+	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	xdes_t*	descr;
+	ulint	not_full_n_used;
+	ulint	state;
+	dulint	descr_id;	/* segment id stored in the extent descriptor */
+	dulint	seg_id;		/* segment id stored in the inode */
+	ulint	i;
+
+	ut_ad(seg_inode && mtr);
+	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+	ut_ad(!((page_offset(seg_inode) - FSEG_ARR_OFFSET) % FSEG_INODE_SIZE));
+
+	/* Drop search system page hash index if the page is found in
+	the pool and is hashed */
+
+	btr_search_drop_page_hash_when_freed(space, zip_size, page);
+
+	descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+	if (srv_pass_corrupt_table && !descr) {
+		/* The page may be corrupt. pass it. */
+		return;
+	}
+
+	ut_a(descr);
+	if (xdes_get_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, mtr)) {
+		fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+		      stderr);
+		ut_print_buf(stderr, descr, 40);
+
+		fprintf(stderr, "\n"
+			"InnoDB: Serious error! InnoDB is trying to"
+			" free page %lu\n"
+			"InnoDB: though it is already marked as free"
+			" in the tablespace!\n"
+			"InnoDB: The tablespace free space info is corrupt.\n"
+			"InnoDB: You may need to dump your"
+			" InnoDB tables and recreate the whole\n"
+			"InnoDB: database!\n", (ulong) page);
+crash:
+		/* Shared tail of both fatal paths; the segment mismatch
+		check below jumps here */
+		fputs("InnoDB: Please refer to\n"
+		      "InnoDB: " REFMAN "forcing-recovery.html\n"
+		      "InnoDB: about forcing recovery.\n", stderr);
+		ut_error;
+	}
+
+	state = xdes_get_state(descr, mtr);
+
+	if (state != XDES_FSEG) {
+		/* The page is in the fragment pages of the segment */
+
+		/* NOTE(review): this search has no upper bound on i; it
+		relies on the page being present in the fragment array.
+		Confirm whether a bound on the slot count should be
+		enforced here. */
+		for (i = 0;; i++) {
+			if (fseg_get_nth_frag_page_no(seg_inode, i, mtr)
+			    == page) {
+
+				fseg_set_nth_frag_page_no(seg_inode, i,
+							  FIL_NULL, mtr);
+				break;
+			}
+		}
+
+		fsp_free_page(space, zip_size, page, mtr);
+
+		return;
+	}
+
+	/* If we get here, the page is in some extent of the segment */
+
+	descr_id = mtr_read_dulint(descr + XDES_ID, mtr);
+	seg_id = mtr_read_dulint(seg_inode + FSEG_ID, mtr);
+#if 0
+	fprintf(stderr,
+		"InnoDB: InnoDB is freeing space %lu page %lu,\n"
+		"InnoDB: which belongs to descr seg %lu %lu\n"
+		"InnoDB: segment %lu %lu.\n",
+		(ulong) space, (ulong) page,
+		(ulong) ut_dulint_get_high(descr_id),
+		(ulong) ut_dulint_get_low(descr_id),
+		(ulong) ut_dulint_get_high(seg_id),
+		(ulong) ut_dulint_get_low(seg_id));
+#endif /* 0 */
+	if (0 != ut_dulint_cmp(descr_id, seg_id)) {
+		fputs("InnoDB: Dump of the tablespace extent descriptor: ",
+		      stderr);
+		ut_print_buf(stderr, descr, 40);
+		fputs("\nInnoDB: Dump of the segment inode: ", stderr);
+		ut_print_buf(stderr, seg_inode, 40);
+		putc('\n', stderr);
+
+		/* The caller is freeing from segment seg_id, but the
+		extent descriptor says the extent is owned by descr_id:
+		report them in that order */
+		fprintf(stderr,
+			"InnoDB: Serious error: InnoDB is trying to"
+			" free space %lu page %lu,\n"
+			"InnoDB: which does not belong to"
+			" segment %lu %lu but belongs\n"
+			"InnoDB: to segment %lu %lu.\n",
+			(ulong) space, (ulong) page,
+			(ulong) ut_dulint_get_high(seg_id),
+			(ulong) ut_dulint_get_low(seg_id),
+			(ulong) ut_dulint_get_high(descr_id),
+			(ulong) ut_dulint_get_low(descr_id));
+		goto crash;
+	}
+
+	not_full_n_used = mtr_read_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+					 MLOG_4BYTES, mtr);
+	if (xdes_is_full(descr, mtr)) {
+		/* The extent is full: move it to the NOT_FULL list. Its
+		remaining FSP_EXTENT_SIZE - 1 used pages now count towards
+		FSEG_NOT_FULL_N_USED */
+		flst_remove(seg_inode + FSEG_FULL,
+			    descr + XDES_FLST_NODE, mtr);
+		flst_add_last(seg_inode + FSEG_NOT_FULL,
+			      descr + XDES_FLST_NODE, mtr);
+		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+				 not_full_n_used + FSP_EXTENT_SIZE - 1,
+				 MLOG_4BYTES, mtr);
+	} else {
+		ut_a(not_full_n_used > 0);
+		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+				 not_full_n_used - 1, MLOG_4BYTES, mtr);
+	}
+
+	/* Mark the page free and clean in the extent descriptor */
+	xdes_set_bit(descr, XDES_FREE_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+	xdes_set_bit(descr, XDES_CLEAN_BIT, page % FSP_EXTENT_SIZE, TRUE, mtr);
+
+	if (xdes_is_free(descr, mtr)) {
+		/* The extent has become free: free it to space */
+		flst_remove(seg_inode + FSEG_NOT_FULL,
+			    descr + XDES_FLST_NODE, mtr);
+		fsp_free_extent(space, zip_size, page, mtr);
+	}
+}
+
+/**********************************************************************//**
+Frees a single page of a segment. X-latches the tablespace in the given
+mini-transaction, resolves the segment inode from the segment header and
+delegates the work to fseg_free_page_low(). */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	rw_lock_t*	latch;
+	ulint		space_flags;
+	ulint		zip_size;
+
+	latch = fil_space_get_latch(space, &space_flags);
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	fseg_free_page_low(fseg_inode_get(seg_header, space, zip_size, mtr),
+			   space, zip_size, page, mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	buf_page_set_file_page_was_freed(space, page);
+#endif
+}
+
+/**********************************************************************//**
+Releases a whole extent owned by a segment: the extent is unlinked from
+whichever segment list currently holds it and is then given back to the
+space with fsp_free_extent(). */
+static
+void
+fseg_free_extent(
+/*=============*/
+	fseg_inode_t*	seg_inode, /*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		page,	/*!< in: a page in the extent */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	xdes_t*	descr;
+	ulint	extent_start;
+	ulint	pos;
+
+	ut_ad(seg_inode && mtr);
+
+	descr = xdes_get_descriptor(space, zip_size, page, mtr);
+
+	ut_a(xdes_get_state(descr, mtr) == XDES_FSEG);
+	ut_a(0 == ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, mtr),
+				mtr_read_dulint(seg_inode + FSEG_ID, mtr)));
+	ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N)
+	      == FSEG_MAGIC_N_VALUE);
+
+	extent_start = page - (page % FSP_EXTENT_SIZE);
+
+	for (pos = 0; pos < FSP_EXTENT_SIZE; pos++) {
+		if (xdes_get_bit(descr, XDES_FREE_BIT, pos, mtr)) {
+			/* Page is not in use: nothing to drop */
+			continue;
+		}
+
+		/* The page is in use: drop its search system page hash
+		index if the page is in the pool and is hashed */
+		btr_search_drop_page_hash_when_freed(
+			space, zip_size, extent_start + pos);
+	}
+
+	/* Unlink the extent from the segment list it is on */
+	if (xdes_is_full(descr, mtr)) {
+		flst_remove(seg_inode + FSEG_FULL,
+			    descr + XDES_FLST_NODE, mtr);
+	} else if (xdes_is_free(descr, mtr)) {
+		flst_remove(seg_inode + FSEG_FREE,
+			    descr + XDES_FLST_NODE, mtr);
+	} else {
+		ulint	not_full_n_used;
+		ulint	descr_n_used;
+
+		flst_remove(seg_inode + FSEG_NOT_FULL,
+			    descr + XDES_FLST_NODE, mtr);
+
+		/* The used pages of a NOT_FULL extent were counted in
+		FSEG_NOT_FULL_N_USED: take them out again */
+		not_full_n_used = mtr_read_ulint(
+			seg_inode + FSEG_NOT_FULL_N_USED, MLOG_4BYTES, mtr);
+		descr_n_used = xdes_get_n_used(descr, mtr);
+
+		ut_a(not_full_n_used >= descr_n_used);
+		mlog_write_ulint(seg_inode + FSEG_NOT_FULL_N_USED,
+				 not_full_n_used - descr_n_used,
+				 MLOG_4BYTES, mtr);
+	}
+
+	fsp_free_extent(space, zip_size, page, mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	for (pos = 0; pos < FSP_EXTENT_SIZE; pos++) {
+
+		buf_page_set_file_page_was_freed(space, extent_start + pos);
+	}
+#endif
+}
+
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment by
+repeatedly calling this function in different mini-transactions. Doing
+the freeing in a single mini-transaction might result in too big a
+mini-transaction.
+
+Each call frees at most one extent or one fragment page. When neither is
+left, the segment inode itself is freed. The function also returns TRUE
+without freeing anything when srv_pass_corrupt_table is set and the extent
+descriptor of the header page cannot be obtained, or when the inode cannot
+be found (reported on stderr as a double free).
+@return	TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+	fseg_header_t*	header,	/*!< in, own: segment header; NOTE: if the header
+				resides on the first page of the frag list
+				of the segment, this pointer becomes obsolete
+				after the last freeing step */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		n;
+	ulint		page;
+	xdes_t*		descr;
+	fseg_inode_t*	inode;
+	ulint		space;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		header_page;
+	rw_lock_t*	latch;
+
+	/* Space id and page number are taken from the page that holds
+	the segment header */
+	space = page_get_space_id(page_align(header));
+	header_page = page_get_page_no(page_align(header));
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	descr = xdes_get_descriptor(space, zip_size, header_page, mtr);
+
+	if (srv_pass_corrupt_table && !descr) {
+		/* The page may be corrupt. pass it. */
+		return(TRUE);
+	}
+
+	/* Check that the header resides on a page which has not been
+	freed yet */
+
+	ut_a(descr);
+	ut_a(xdes_get_bit(descr, XDES_FREE_BIT,
+			  header_page % FSP_EXTENT_SIZE, mtr) == FALSE);
+	inode = fseg_inode_try_get(header, space, zip_size, mtr);
+
+	if (UNIV_UNLIKELY(inode == NULL)) {
+		/* No inode for this header: report it and treat the
+		segment as already freed */
+		fprintf(stderr, "double free of inode from %u:%u\n",
+			(unsigned) space, (unsigned) header_page);
+		return(TRUE);
+	}
+
+	/* From here on descr refers to an extent of the segment, not to
+	the extent of the header page */
+	descr = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+	if (descr != NULL) {
+		/* Free the extent held by the segment */
+		page = xdes_get_offset(descr);
+
+		fseg_free_extent(inode, space, zip_size, page, mtr);
+
+		return(FALSE);
+	}
+
+	/* Free a frag page */
+	n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+	if (n == ULINT_UNDEFINED) {
+		/* Freeing completed: free the segment inode */
+		fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+		return(TRUE);
+	}
+
+	fseg_free_page_low(inode, space, zip_size,
+			   fseg_get_nth_frag_page_no(inode, n, mtr), mtr);
+
+	/* Look again: if that was the last used fragment slot, the inode
+	can be freed within this same step */
+	n = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+	if (n == ULINT_UNDEFINED) {
+		/* Freeing completed: free the segment inode */
+		fsp_free_seg_inode(space, zip_size, inode, mtr);
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+Frees part of a segment, one extent or one fragment page per call. Unlike
+fseg_free_step, the page that carries the segment header is never freed.
+@return	TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+	fseg_header_t*	header,	/*!< in: segment header which must reside on
+				the first fragment page of the segment */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	fseg_inode_t*	inode;
+	rw_lock_t*	latch;
+	xdes_t*		first_extent;
+	ulint		space;
+	ulint		space_flags;
+	ulint		zip_size;
+	ulint		slot;
+	ulint		frag_page;
+
+	space = page_get_space_id(page_align(header));
+
+	latch = fil_space_get_latch(space, &space_flags);
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+
+	ut_ad(!mutex_own(&kernel_mutex)
+	      || mtr_memo_contains(mtr, latch, MTR_MEMO_X_LOCK));
+
+	mtr_x_lock(latch, mtr);
+
+	inode = fseg_inode_get(header, space, zip_size, mtr);
+
+	if (srv_pass_corrupt_table && !inode) {
+		/* ignore the corruption */
+		return(TRUE);
+	}
+	ut_a(inode);
+
+	first_extent = fseg_get_first_extent(inode, space, zip_size, mtr);
+
+	if (first_extent != NULL) {
+		/* The segment still holds an extent: free that one */
+		fseg_free_extent(inode, space, zip_size,
+				 xdes_get_offset(first_extent), mtr);
+
+		return(FALSE);
+	}
+
+	/* No extents left: continue with the fragment pages, starting
+	from the last used slot */
+	slot = fseg_find_last_used_frag_page_slot(inode, mtr);
+
+	if (slot == ULINT_UNDEFINED) {
+		ut_error;
+	}
+
+	frag_page = fseg_get_nth_frag_page_no(inode, slot, mtr);
+
+	if (frag_page != page_get_page_no(page_align(header))) {
+
+		fseg_free_page_low(inode, space, zip_size, frag_page, mtr);
+
+		return(FALSE);
+	}
+
+	/* Only the header page remains, and it is left in place */
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Returns the first extent descriptor of a segment, treating the segment's
+extent lists as one sequence in the order FSEG_FULL -> FSEG_NOT_FULL
+-> FSEG_FREE: the head of the first non-empty list is returned.
+@return	the first extent descriptor, or NULL if none */
+static
+xdes_t*
+fseg_get_first_extent(
+/*==================*/
+	fseg_inode_t*	inode,	/*!< in: segment inode */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	fil_addr_t	head	= fil_addr_null;
+
+	ut_ad(inode && mtr);
+
+	ut_ad(space == page_get_space_id(page_align(inode)));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	if (flst_get_len(inode + FSEG_FULL, mtr) > 0) {
+		head = flst_get_first(inode + FSEG_FULL, mtr);
+	} else if (flst_get_len(inode + FSEG_NOT_FULL, mtr) > 0) {
+		head = flst_get_first(inode + FSEG_NOT_FULL, mtr);
+	} else if (flst_get_len(inode + FSEG_FREE, mtr) > 0) {
+		head = flst_get_first(inode + FSEG_FREE, mtr);
+	}
+
+	/* A null address means no list supplied an extent */
+	return(head.page == FIL_NULL
+	       ? NULL
+	       : xdes_lst_get_descriptor(space, zip_size, head, mtr));
+}
+
+/*******************************************************************//**
+Validates a segment. Walks the FSEG_FREE, FSEG_NOT_FULL and FSEG_FULL
+extent lists of the inode and asserts, for every extent, that its used page
+count fits the list it is on, that its state is XDES_FSEG and that its
+XDES_ID equals the segment id. Finally asserts that FSEG_NOT_FULL_N_USED
+equals the number of used pages summed over the NOT_FULL list. Any violation
+fails a ut_a assertion, so the function only ever returns TRUE.
+@return	TRUE if ok */
+static
+ibool
+fseg_validate_low(
+/*==============*/
+	fseg_inode_t*	inode, /*!< in: segment inode */
+	mtr_t*		mtr2)	/*!< in: mtr */
+{
+	ulint		space;
+	dulint		seg_id;
+	mtr_t		mtr;	/* short-lived mtr, restarted for each
+				extent visited */
+	xdes_t*		descr;
+	fil_addr_t	node_addr;
+	ulint		n_used		= 0;	/* FSEG_NOT_FULL_N_USED as
+						stored in the inode */
+	ulint		n_used2		= 0;	/* used pages counted from
+						the NOT_FULL list */
+
+	ut_ad(mtr_memo_contains_page(mtr2, inode, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+
+	space = page_get_space_id(page_align(inode));
+
+	seg_id = mtr_read_dulint(inode + FSEG_ID, mtr2);
+	n_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+				MLOG_4BYTES, mtr2);
+	flst_validate(inode + FSEG_FREE, mtr2);
+	flst_validate(inode + FSEG_NOT_FULL, mtr2);
+	flst_validate(inode + FSEG_FULL, mtr2);
+
+	/* Validate FSEG_FREE list */
+	node_addr = flst_get_first(inode + FSEG_FREE, mtr2);
+
+	while (!fil_addr_is_null(node_addr)) {
+		ulint	flags;
+		ulint	zip_size;
+
+		mtr_start(&mtr);
+		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+		zip_size = dict_table_flags_to_zip_size(flags);
+
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		/* An extent on the FREE list has no used pages */
+		ut_a(xdes_get_n_used(descr, &mtr) == 0);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+		ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+				    seg_id));
+
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+		mtr_commit(&mtr);
+	}
+
+	/* Validate FSEG_NOT_FULL list */
+
+	node_addr = flst_get_first(inode + FSEG_NOT_FULL, mtr2);
+
+	while (!fil_addr_is_null(node_addr)) {
+		ulint	flags;
+		ulint	zip_size;
+
+		mtr_start(&mtr);
+		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+		zip_size = dict_table_flags_to_zip_size(flags);
+
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		/* An extent on the NOT_FULL list is partly used: more
+		than zero pages but fewer than a whole extent */
+		ut_a(xdes_get_n_used(descr, &mtr) > 0);
+		ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+		ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+				    seg_id));
+
+		n_used2 += xdes_get_n_used(descr, &mtr);
+
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+		mtr_commit(&mtr);
+	}
+
+	/* Validate FSEG_FULL list */
+
+	node_addr = flst_get_first(inode + FSEG_FULL, mtr2);
+
+	while (!fil_addr_is_null(node_addr)) {
+		ulint	flags;
+		ulint	zip_size;
+
+		mtr_start(&mtr);
+		mtr_x_lock(fil_space_get_latch(space, &flags), &mtr);
+		zip_size = dict_table_flags_to_zip_size(flags);
+
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		/* An extent on the FULL list has every page in use */
+		ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FSEG);
+		ut_a(!ut_dulint_cmp(mtr_read_dulint(descr + XDES_ID, &mtr),
+				    seg_id));
+
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+		mtr_commit(&mtr);
+	}
+
+	/* The stored counter must match what the NOT_FULL list holds */
+	ut_a(n_used == n_used2);
+
+	return(TRUE);
+}
+
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment, given its header. X-latches the tablespace in the
+caller's mini-transaction, looks up the segment inode and runs
+fseg_validate_low() on it.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+	fseg_header_t*	header, /*!< in: segment header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	space;
+	ulint	space_flags;
+	ulint	zip_size;
+
+	space = page_get_space_id(page_align(header));
+
+	mtr_x_lock(fil_space_get_latch(space, &space_flags), mtr);
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+
+	return(fseg_validate_low(
+		       fseg_inode_get(header, space, zip_size, mtr), mtr));
+}
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Writes info of a segment to stderr: its id, the location of its inode,
+reserved and used page counts, and the lengths of its extent lists. */
+static
+void
+fseg_print_low(
+/*===========*/
+	fseg_inode_t*	inode, /*!< in: segment inode */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dulint	seg_id;
+	ulint	space;
+	ulint	inode_page_no;
+	ulint	n_reserved;
+	ulint	n_pages_used;
+	ulint	not_full_used;
+	ulint	n_frag;
+	ulint	len_free;
+	ulint	len_not_full;
+	ulint	len_full;
+
+	ut_ad(mtr_memo_contains_page(mtr, inode, MTR_MEMO_PAGE_X_FIX));
+
+	/* Where the inode lives */
+	space = page_get_space_id(page_align(inode));
+	inode_page_no = page_get_page_no(page_align(inode));
+
+	n_reserved = fseg_n_reserved_pages_low(inode, &n_pages_used, mtr);
+
+	seg_id = mtr_read_dulint(inode + FSEG_ID, mtr);
+
+	not_full_used = mtr_read_ulint(inode + FSEG_NOT_FULL_N_USED,
+				       MLOG_4BYTES, mtr);
+	n_frag = fseg_get_n_frag_pages(inode, mtr);
+
+	/* Lengths of the three extent lists */
+	len_free = flst_get_len(inode + FSEG_FREE, mtr);
+	len_not_full = flst_get_len(inode + FSEG_NOT_FULL, mtr);
+	len_full = flst_get_len(inode + FSEG_FULL, mtr);
+
+	fprintf(stderr,
+		"SEGMENT id %lu %lu space %lu; page %lu;"
+		" res %lu used %lu; full ext %lu\n"
+		"fragm pages %lu; free extents %lu;"
+		" not full extents %lu: pages %lu\n",
+		(ulong) ut_dulint_get_high(seg_id),
+		(ulong) ut_dulint_get_low(seg_id),
+		(ulong) space, (ulong) inode_page_no,
+		(ulong) n_reserved, (ulong) n_pages_used, (ulong) len_full,
+		(ulong) n_frag, (ulong) len_free, (ulong) len_not_full,
+		(ulong) not_full_used);
+	ut_ad(mach_read_from_4(inode + FSEG_MAGIC_N) == FSEG_MAGIC_N_VALUE);
+}
+
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment, given its header. X-latches the tablespace in the
+caller's mini-transaction, looks up the segment inode and prints it with
+fseg_print_low(). */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+	fseg_header_t*	header, /*!< in: segment header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	space;
+	ulint	space_flags;
+	ulint	zip_size;
+
+	space = page_get_space_id(page_align(header));
+
+	mtr_x_lock(fil_space_get_latch(space, &space_flags), mtr);
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+
+	fseg_print_low(fseg_inode_get(header, space, zip_size, mtr), mtr);
+}
+#endif /* UNIV_BTR_PRINT */
+
+/*******************************************************************//**
+Validates the file space system and its segments. Checks the space-level
+extent lists (FSP_FREE, FSP_FREE_FRAG, FSP_FULL_FRAG), then validates every
+segment inode found on the FSP_SEG_INODES_FULL and FSP_SEG_INODES_FREE
+lists, and finally cross-checks the extent and page counts gathered on the
+way against the values stored in the space header. Any inconsistency fails
+a ut_a assertion, so the function only ever returns TRUE.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+	ulint	space)	/*!< in: space id */
+{
+	fsp_header_t*	header;
+	fseg_inode_t*	seg_inode;
+	page_t*		seg_inode_page;
+	rw_lock_t*	latch;
+	ulint		size;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		free_limit;
+	ulint		frag_n_used;
+	mtr_t		mtr;
+	mtr_t		mtr2;
+	xdes_t*		descr;
+	fil_addr_t	node_addr;
+	fil_addr_t	next_node_addr;
+	ulint		descr_count	= 0;	/* extents seen on all lists,
+						space-level and per-segment */
+	ulint		n_used		= 0;	/* used pages summed over the
+						FSP_FREE_FRAG extents */
+	ulint		n_used2		= 0;	/* fragment pages summed over
+						all segments */
+	ulint		n_full_frag_pages;
+	ulint		n;
+	ulint		seg_inode_len_free;
+	ulint		seg_inode_len_full;
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+	/* zip_size must be a power of two no larger than a page, and
+	either zero or at least PAGE_ZIP_MIN_SIZE */
+	ut_a(ut_is_2pow(zip_size));
+	ut_a(zip_size <= UNIV_PAGE_SIZE);
+	ut_a(!zip_size || zip_size >= PAGE_ZIP_MIN_SIZE);
+
+	/* Start first a mini-transaction mtr2 to lock out all other threads
+	from the fsp system */
+	mtr_start(&mtr2);
+	mtr_x_lock(latch, &mtr2);
+
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+	free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT,
+				    MLOG_4BYTES, &mtr);
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED,
+				     MLOG_4BYTES, &mtr);
+
+	n_full_frag_pages = FSP_EXTENT_SIZE
+		* flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+	if (UNIV_UNLIKELY(free_limit > size)) {
+
+		/* Tolerated only for a space other than 0 that is
+		smaller than one extent */
+		ut_a(space != 0);
+		ut_a(size < FSP_EXTENT_SIZE);
+	}
+
+	flst_validate(header + FSP_FREE, &mtr);
+	flst_validate(header + FSP_FREE_FRAG, &mtr);
+	flst_validate(header + FSP_FULL_FRAG, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Validate FSP_FREE list */
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+	node_addr = flst_get_first(header + FSP_FREE, &mtr);
+
+	mtr_commit(&mtr);
+
+	while (!fil_addr_is_null(node_addr)) {
+		/* One mini-transaction per extent visited */
+		mtr_start(&mtr);
+		mtr_x_lock(latch, &mtr);
+
+		descr_count++;
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		ut_a(xdes_get_n_used(descr, &mtr) == 0);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FREE);
+
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+		mtr_commit(&mtr);
+	}
+
+	/* Validate FSP_FREE_FRAG list */
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+	node_addr = flst_get_first(header + FSP_FREE_FRAG, &mtr);
+
+	mtr_commit(&mtr);
+
+	while (!fil_addr_is_null(node_addr)) {
+		mtr_start(&mtr);
+		mtr_x_lock(latch, &mtr);
+
+		descr_count++;
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		/* Partly used: more than zero pages, fewer than a
+		whole extent */
+		ut_a(xdes_get_n_used(descr, &mtr) > 0);
+		ut_a(xdes_get_n_used(descr, &mtr) < FSP_EXTENT_SIZE);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FREE_FRAG);
+
+		n_used += xdes_get_n_used(descr, &mtr);
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+
+		mtr_commit(&mtr);
+	}
+
+	/* Validate FSP_FULL_FRAG list */
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+	node_addr = flst_get_first(header + FSP_FULL_FRAG, &mtr);
+
+	mtr_commit(&mtr);
+
+	while (!fil_addr_is_null(node_addr)) {
+		mtr_start(&mtr);
+		mtr_x_lock(latch, &mtr);
+
+		descr_count++;
+		descr = xdes_lst_get_descriptor(space, zip_size,
+						node_addr, &mtr);
+
+		ut_a(xdes_get_n_used(descr, &mtr) == FSP_EXTENT_SIZE);
+		ut_a(xdes_get_state(descr, &mtr) == XDES_FULL_FRAG);
+
+		node_addr = flst_get_next_addr(descr + XDES_FLST_NODE, &mtr);
+		mtr_commit(&mtr);
+	}
+
+	/* Validate segments */
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+	seg_inode_len_full = flst_get_len(header + FSP_SEG_INODES_FULL, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Inode pages on the FULL list: every inode slot on such a page
+	must be in use (nonzero FSEG_ID) */
+	while (!fil_addr_is_null(node_addr)) {
+
+		n = 0;
+		do {
+			mtr_start(&mtr);
+			mtr_x_lock(latch, &mtr);
+
+			/* The list node lies FSEG_INODE_PAGE_NODE bytes
+			into the inode page; step back to the page start */
+			seg_inode_page = fut_get_ptr(
+				space, zip_size, node_addr, RW_X_LATCH, &mtr)
+				- FSEG_INODE_PAGE_NODE;
+
+			seg_inode = fsp_seg_inode_page_get_nth_inode(
+				seg_inode_page, n, zip_size, &mtr);
+			ut_a(!ut_dulint_is_zero(
+				     mach_read_from_8(seg_inode + FSEG_ID)));
+			fseg_validate_low(seg_inode, &mtr);
+
+			descr_count += flst_get_len(seg_inode + FSEG_FREE,
+						    &mtr);
+			descr_count += flst_get_len(seg_inode + FSEG_FULL,
+						    &mtr);
+			descr_count += flst_get_len(seg_inode + FSEG_NOT_FULL,
+						    &mtr);
+
+			n_used2 += fseg_get_n_frag_pages(seg_inode, &mtr);
+
+			next_node_addr = flst_get_next_addr(
+				seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+			mtr_commit(&mtr);
+		} while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+		node_addr = next_node_addr;
+	}
+
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+	seg_inode_len_free = flst_get_len(header + FSP_SEG_INODES_FREE, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Inode pages on the FREE list: slots with a zero FSEG_ID are
+	unused and are skipped */
+	while (!fil_addr_is_null(node_addr)) {
+
+		n = 0;
+
+		do {
+			mtr_start(&mtr);
+			mtr_x_lock(latch, &mtr);
+
+			seg_inode_page = fut_get_ptr(
+				space, zip_size, node_addr, RW_X_LATCH, &mtr)
+				- FSEG_INODE_PAGE_NODE;
+
+			seg_inode = fsp_seg_inode_page_get_nth_inode(
+				seg_inode_page, n, zip_size, &mtr);
+			if (!ut_dulint_is_zero(
+				    mach_read_from_8(seg_inode + FSEG_ID))) {
+				fseg_validate_low(seg_inode, &mtr);
+
+				descr_count += flst_get_len(
+					seg_inode + FSEG_FREE, &mtr);
+				descr_count += flst_get_len(
+					seg_inode + FSEG_FULL, &mtr);
+				descr_count += flst_get_len(
+					seg_inode + FSEG_NOT_FULL, &mtr);
+				n_used2 += fseg_get_n_frag_pages(
+					seg_inode, &mtr);
+			}
+
+			next_node_addr = flst_get_next_addr(
+				seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+			mtr_commit(&mtr);
+		} while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+		node_addr = next_node_addr;
+	}
+
+	/* Every extent below the free limit must have been seen on
+	exactly one of the lists walked above */
+	ut_a(descr_count * FSP_EXTENT_SIZE == free_limit);
+	/* Pages used in fragment extents must equal the segments' fragment
+	pages, plus two pages per page-size-many pages below the free limit
+	(rounded up), plus one page per inode page on either inode list.
+	NOTE(review): the two pages are presumably the extent descriptor
+	page and its companion bitmap page -- confirm. */
+	if (!zip_size) {
+		ut_a(n_used + n_full_frag_pages
+		     == n_used2 + 2 * ((free_limit + (UNIV_PAGE_SIZE - 1))
+				       / UNIV_PAGE_SIZE)
+		     + seg_inode_len_full + seg_inode_len_free);
+	} else {
+		ut_a(n_used + n_full_frag_pages
+		     == n_used2 + 2 * ((free_limit + (zip_size - 1))
+				       / zip_size)
+		     + seg_inode_len_full + seg_inode_len_free);
+	}
+	/* The header's FSP_FRAG_N_USED must match the FREE_FRAG count */
+	ut_a(frag_n_used == n_used);
+
+	mtr_commit(&mtr2);
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Prints info of a file space to stderr: first a summary read from the space
+header, then one entry per segment inode found on the FSP_SEG_INODES_FULL
+and FSP_SEG_INODES_FREE lists (printed with fseg_print_low), and finally
+the number of segments printed. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+	ulint	space)	/*!< in: space id */
+{
+	fsp_header_t*	header;
+	fseg_inode_t*	seg_inode;
+	page_t*		seg_inode_page;
+	rw_lock_t*	latch;
+	ulint		flags;
+	ulint		zip_size;
+	ulint		size;
+	ulint		free_limit;
+	ulint		frag_n_used;
+	fil_addr_t	node_addr;
+	fil_addr_t	next_node_addr;
+	ulint		n_free;
+	ulint		n_free_frag;
+	ulint		n_full_frag;
+	ulint		seg_id_low;
+	ulint		seg_id_high;
+	ulint		n;
+	ulint		n_segs		= 0;	/* segments printed so far */
+	dulint		d_var;
+	mtr_t		mtr;
+	mtr_t		mtr2;
+
+	latch = fil_space_get_latch(space, &flags);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	/* Start first a mini-transaction mtr2 to lock out all other threads
+	from the fsp system */
+
+	mtr_start(&mtr2);
+
+	mtr_x_lock(latch, &mtr2);
+
+	mtr_start(&mtr);
+
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, &mtr);
+
+	free_limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES,
+				    &mtr);
+	frag_n_used = mtr_read_ulint(header + FSP_FRAG_N_USED, MLOG_4BYTES,
+				     &mtr);
+	n_free = flst_get_len(header + FSP_FREE, &mtr);
+	n_free_frag = flst_get_len(header + FSP_FREE_FRAG, &mtr);
+	n_full_frag = flst_get_len(header + FSP_FULL_FRAG, &mtr);
+
+	/* FSP_SEG_ID is printed below as the first segment id not used */
+	d_var = mtr_read_dulint(header + FSP_SEG_ID, &mtr);
+
+	seg_id_low = ut_dulint_get_low(d_var);
+	seg_id_high = ut_dulint_get_high(d_var);
+
+	fprintf(stderr,
+		"FILE SPACE INFO: id %lu\n"
+		"size %lu, free limit %lu, free extents %lu\n"
+		"not full frag extents %lu: used pages %lu,"
+		" full frag extents %lu\n"
+		"first seg id not used %lu %lu\n",
+		(ulong) space,
+		(ulong) size, (ulong) free_limit, (ulong) n_free,
+		(ulong) n_free_frag, (ulong) frag_n_used, (ulong) n_full_frag,
+		(ulong) seg_id_high, (ulong) seg_id_low);
+
+	mtr_commit(&mtr);
+
+	/* Print segments */
+
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	node_addr = flst_get_first(header + FSP_SEG_INODES_FULL, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Inode pages on the FULL list: every inode slot on such a page
+	must be in use (nonzero FSEG_ID) */
+	while (!fil_addr_is_null(node_addr)) {
+
+		n = 0;
+
+		do {
+
+			/* One mini-transaction per inode slot */
+			mtr_start(&mtr);
+			mtr_x_lock(latch, &mtr);
+
+			/* The list node lies FSEG_INODE_PAGE_NODE bytes
+			into the inode page; step back to the page start */
+			seg_inode_page = fut_get_ptr(
+				space, zip_size, node_addr, RW_X_LATCH, &mtr)
+				- FSEG_INODE_PAGE_NODE;
+
+			seg_inode = fsp_seg_inode_page_get_nth_inode(
+				seg_inode_page, n, zip_size, &mtr);
+			ut_a(!ut_dulint_is_zero(
+				     mach_read_from_8(seg_inode + FSEG_ID)));
+			fseg_print_low(seg_inode, &mtr);
+
+			n_segs++;
+
+			next_node_addr = flst_get_next_addr(
+				seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+			mtr_commit(&mtr);
+		} while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+		node_addr = next_node_addr;
+	}
+
+	mtr_start(&mtr);
+	mtr_x_lock(latch, &mtr);
+
+	header = fsp_get_space_header(space, zip_size, &mtr);
+
+	node_addr = flst_get_first(header + FSP_SEG_INODES_FREE, &mtr);
+
+	mtr_commit(&mtr);
+
+	/* Inode pages on the FREE list: slots with a zero FSEG_ID are
+	unused and are neither printed nor counted */
+	while (!fil_addr_is_null(node_addr)) {
+
+		n = 0;
+
+		do {
+
+			mtr_start(&mtr);
+			mtr_x_lock(latch, &mtr);
+
+			seg_inode_page = fut_get_ptr(
+				space, zip_size, node_addr, RW_X_LATCH, &mtr)
+				- FSEG_INODE_PAGE_NODE;
+
+			seg_inode = fsp_seg_inode_page_get_nth_inode(
+				seg_inode_page, n, zip_size, &mtr);
+			if (!ut_dulint_is_zero(
+				    mach_read_from_8(seg_inode + FSEG_ID))) {
+
+				fseg_print_low(seg_inode, &mtr);
+				n_segs++;
+			}
+
+			next_node_addr = flst_get_next_addr(
+				seg_inode_page + FSEG_INODE_PAGE_NODE, &mtr);
+			mtr_commit(&mtr);
+		} while (++n < FSP_SEG_INODES_PER_PAGE(zip_size));
+
+		node_addr = next_node_addr;
+	}
+
+	mtr_commit(&mtr2);
+
+	fprintf(stderr, "NUMBER of file segments: %lu\n", (ulong) n_segs);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/fut/fut0fut.c b/storage/xtradb/fut/fut0fut.c
new file mode 100644
index 00000000000..20b45a575e6
--- /dev/null
+++ b/storage/xtradb/fut/fut0fut.c
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0fut.c
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+
+#ifdef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
diff --git a/storage/xtradb/fut/fut0lst.c b/storage/xtradb/fut/fut0lst.c
new file mode 100644
index 00000000000..a1e21c22725
--- /dev/null
+++ b/storage/xtradb/fut/fut0lst.c
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fut/fut0lst.c
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0lst.h"
+
+#ifdef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0page.h"
+
+/********************************************************************//**
+Adds a node to an empty list. On return the node is both the first and the
+last node of the list, its prev and next links are null, and the length
+stored in the base node is 1. */
+static
+void
+flst_add_to_empty(
+/*==============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of
+					empty list */
+	flst_node_t*		node,	/*!< in: node to add */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	fil_addr_t	node_addr;
+	ulint		len;
+
+	ut_ad(mtr && base && node);
+	ut_ad(base != node);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+	len = flst_get_len(base, mtr);
+	/* This function must only be called for a list whose stored
+	length is zero */
+	ut_a(len == 0);
+
+	/* Get the file address of the node; the space id returned in
+	space is not used further in this function */
+	buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+	/* Update first and last fields of base node */
+	flst_write_addr(base + FLST_FIRST, node_addr, mtr);
+	flst_write_addr(base + FLST_LAST, node_addr, mtr);
+
+	/* Set prev and next fields of node to add */
+	flst_write_addr(node + FLST_PREV, fil_addr_null, mtr);
+	flst_write_addr(node + FLST_NEXT, fil_addr_null, mtr);
+
+	/* Update len of base node */
+	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Adds a node as the last node in a list. If the list is empty, the node is
+added with flst_add_to_empty(); otherwise it is inserted after the current
+last node with flst_insert_after(). */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node,	/*!< in: node to add */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	fil_addr_t	node_addr;
+	ulint		len;
+	fil_addr_t	last_addr;
+	flst_node_t*	last_node;
+
+	ut_ad(mtr && base && node);
+	ut_ad(base != node);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+	len = flst_get_len(base, mtr);
+	last_addr = flst_get_last(base, mtr);
+
+	/* Get the space id and the file address of the node to add */
+	buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+	/* If the list is not empty, call flst_insert_after */
+	if (len != 0) {
+		if (last_addr.page == node_addr.page) {
+			/* The last node has the same page number as
+			node: compute its pointer inside the page frame
+			of node instead of fetching the page again */
+			last_node = page_align(node) + last_addr.boffset;
+		} else {
+			ulint	zip_size = fil_space_get_zip_size(space);
+
+			/* The last node is on another page: get a
+			pointer to it, requesting RW_X_LATCH in mtr */
+			last_node = fut_get_ptr(space, zip_size, last_addr,
+						RW_X_LATCH, mtr);
+		}
+
+		flst_insert_after(base, last_node, node, mtr);
+	} else {
+		/* else call flst_add_to_empty */
+		flst_add_to_empty(base, node, mtr);
+	}
+}
+
+/********************************************************************//**
+Adds a node as the first node in a list. If the list is empty, the node is
+added with flst_add_to_empty(); otherwise it is inserted before the current
+first node with flst_insert_before(). */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node,	/*!< in: node to add */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	fil_addr_t	node_addr;
+	ulint		len;
+	fil_addr_t	first_addr;
+	flst_node_t*	first_node;
+
+	ut_ad(mtr && base && node);
+	ut_ad(base != node);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node, MTR_MEMO_PAGE_X_FIX));
+	len = flst_get_len(base, mtr);
+	first_addr = flst_get_first(base, mtr);
+
+	/* Get the space id and the file address of the node to add */
+	buf_ptr_get_fsp_addr(node, &space, &node_addr);
+
+	/* If the list is not empty, call flst_insert_before */
+	if (len != 0) {
+		if (first_addr.page == node_addr.page) {
+			/* The first node has the same page number as
+			node: compute its pointer inside the page frame
+			of node instead of fetching the page again */
+			first_node = page_align(node) + first_addr.boffset;
+		} else {
+			ulint	zip_size = fil_space_get_zip_size(space);
+
+			/* The first node is on another page: get a
+			pointer to it, requesting RW_X_LATCH in mtr */
+			first_node = fut_get_ptr(space, zip_size, first_addr,
+						 RW_X_LATCH, mtr);
+		}
+
+		flst_insert_before(base, node, first_node, mtr);
+	} else {
+		/* else call flst_add_to_empty */
+		flst_add_to_empty(base, node, mtr);
+	}
+}
+
+/********************************************************************//**
+Inserts a node after another in a list. Links node2 between node1 and the
+node that currently follows node1 (if any), updates the last field of the
+base node when node1 had no successor, and increments the list length. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node1,	/*!< in: node to insert after */
+	flst_node_t*		node2,	/*!< in: node to add */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	fil_addr_t	node1_addr;
+	fil_addr_t	node2_addr;
+	flst_node_t*	node3;
+	fil_addr_t	node3_addr;
+	ulint		len;
+
+	ut_ad(mtr && node1 && node2 && base);
+	ut_ad(base != node1);
+	ut_ad(base != node2);
+	ut_ad(node2 != node1);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node1, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+	/* Both calls store into the same space variable, so the value
+	used below is the one reported for node2.
+	NOTE(review): presumably all nodes of a list are in the same
+	tablespace, which would make the two values equal - confirm. */
+	buf_ptr_get_fsp_addr(node1, &space, &node1_addr);
+	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+	/* node3 is the current successor of node1, if there is one */
+	node3_addr = flst_get_next_addr(node1, mtr);
+
+	/* Set prev and next fields of node2 */
+	flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+	flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+	if (!fil_addr_is_null(node3_addr)) {
+		/* Update prev field of node3 */
+		ulint	zip_size = fil_space_get_zip_size(space);
+
+		node3 = fut_get_ptr(space, zip_size,
+				    node3_addr, RW_X_LATCH, mtr);
+		flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+	} else {
+		/* node1 was last in list: update last field in base */
+		flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+	}
+
+	/* Set next field of node1 */
+	flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+
+	/* Update len of base node */
+	len = flst_get_len(base, mtr);
+	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Inserts a node before another in a list. Links node2 between node3 and the
+node that currently precedes node3 (if any), updates the first field of the
+base node when node3 had no predecessor, and increments the list length. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: node to insert */
+	flst_node_t*		node3,	/*!< in: node to insert before */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	flst_node_t*	node1;
+	fil_addr_t	node1_addr;
+	fil_addr_t	node2_addr;
+	fil_addr_t	node3_addr;
+	ulint		len;
+
+	ut_ad(mtr && node2 && node3 && base);
+	ut_ad(base != node2);
+	ut_ad(base != node3);
+	ut_ad(node2 != node3);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node3, MTR_MEMO_PAGE_X_FIX));
+
+	/* Both calls store into the same space variable, so the value
+	used below is the one reported for node3.
+	NOTE(review): presumably all nodes of a list are in the same
+	tablespace, which would make the two values equal - confirm. */
+	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+	buf_ptr_get_fsp_addr(node3, &space, &node3_addr);
+
+	/* node1 is the current predecessor of node3, if there is one */
+	node1_addr = flst_get_prev_addr(node3, mtr);
+
+	/* Set prev and next fields of node2 */
+	flst_write_addr(node2 + FLST_PREV, node1_addr, mtr);
+	flst_write_addr(node2 + FLST_NEXT, node3_addr, mtr);
+
+	if (!fil_addr_is_null(node1_addr)) {
+		ulint	zip_size = fil_space_get_zip_size(space);
+		/* Update next field of node1 */
+		node1 = fut_get_ptr(space, zip_size, node1_addr,
+				    RW_X_LATCH, mtr);
+		flst_write_addr(node1 + FLST_NEXT, node2_addr, mtr);
+	} else {
+		/* node3 was first in list: update first field in base */
+		flst_write_addr(base + FLST_FIRST, node2_addr, mtr);
+	}
+
+	/* Set prev field of node3 */
+	flst_write_addr(node3 + FLST_PREV, node2_addr, mtr);
+
+	/* Update len of base node */
+	len = flst_get_len(base, mtr);
+	mlog_write_ulint(base + FLST_LEN, len + 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Removes a node. The neighbours of node2 (or the first/last fields of the
+base node, where node2 has no neighbour on that side) are relinked around
+it and the list length is decremented. The prev and next fields of node2
+itself are not modified by this function. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: node to remove */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	ulint		zip_size;
+	flst_node_t*	node1;
+	fil_addr_t	node1_addr;
+	fil_addr_t	node2_addr;
+	flst_node_t*	node3;
+	fil_addr_t	node3_addr;
+	ulint		len;
+
+	ut_ad(mtr && node2 && base);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+
+	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+	zip_size = fil_space_get_zip_size(space);
+
+	/* node1 is the predecessor and node3 the successor of node2;
+	either address may be null */
+	node1_addr = flst_get_prev_addr(node2, mtr);
+	node3_addr = flst_get_next_addr(node2, mtr);
+
+	if (!fil_addr_is_null(node1_addr)) {
+
+		/* Update next field of node1 */
+
+		if (node1_addr.page == node2_addr.page) {
+
+			/* Same page number as node2: compute the pointer
+			inside the page frame of node2 */
+			node1 = page_align(node2) + node1_addr.boffset;
+		} else {
+			node1 = fut_get_ptr(space, zip_size,
+					    node1_addr, RW_X_LATCH, mtr);
+		}
+
+		ut_ad(node1 != node2);
+
+		flst_write_addr(node1 + FLST_NEXT, node3_addr, mtr);
+	} else {
+		/* node2 was first in list: update first field in base */
+		flst_write_addr(base + FLST_FIRST, node3_addr, mtr);
+	}
+
+	if (!fil_addr_is_null(node3_addr)) {
+		/* Update prev field of node3 */
+
+		if (node3_addr.page == node2_addr.page) {
+
+			/* Same page number as node2: compute the pointer
+			inside the page frame of node2 */
+			node3 = page_align(node2) + node3_addr.boffset;
+		} else {
+			node3 = fut_get_ptr(space, zip_size,
+					    node3_addr, RW_X_LATCH, mtr);
+		}
+
+		ut_ad(node2 != node3);
+
+		flst_write_addr(node3 + FLST_PREV, node1_addr, mtr);
+	} else {
+		/* node2 was last in list: update last field in base */
+		flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+	}
+
+	/* Update len of base node */
+	len = flst_get_len(base, mtr);
+	ut_ad(len > 0);
+
+	mlog_write_ulint(base + FLST_LEN, len - 1, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. Only the predecessor of node2 (or
+the first field of the base node), the last field of the base node and the
+stored list length are written; the removed nodes themselves are not
+modified. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: first node to remove */
+	ulint			n_nodes,/*!< in: number of nodes to remove,
+					must be >= 1 */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ulint		space;
+	flst_node_t*	node1;
+	fil_addr_t	node1_addr;
+	fil_addr_t	node2_addr;
+	ulint		len;
+
+	ut_ad(mtr && node2 && base);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(n_nodes > 0);
+
+	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+	/* node1 is the predecessor of node2; it becomes the new last
+	node, if it exists */
+	node1_addr = flst_get_prev_addr(node2, mtr);
+
+	if (!fil_addr_is_null(node1_addr)) {
+
+		/* Update next field of node1 */
+
+		if (node1_addr.page == node2_addr.page) {
+
+			/* Same page number as node2: compute the pointer
+			inside the page frame of node2 */
+			node1 = page_align(node2) + node1_addr.boffset;
+		} else {
+			node1 = fut_get_ptr(space,
+					    fil_space_get_zip_size(space),
+					    node1_addr, RW_X_LATCH, mtr);
+		}
+
+		flst_write_addr(node1 + FLST_NEXT, fil_addr_null, mtr);
+	} else {
+		/* node2 was first in list: update the field in base */
+		flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+	}
+
+	/* node1_addr is the null address here if node2 was the first
+	node, in which case the last field is set to null as well */
+	flst_write_addr(base + FLST_LAST, node1_addr, mtr);
+
+	/* Update len of base node */
+	len = flst_get_len(base, mtr);
+	ut_ad(len >= n_nodes);
+
+	mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. If n_nodes is 0 nothing is written.
+Otherwise node2 becomes the last node of the list; the removed nodes
+themselves are not modified. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: first node not to remove */
+	ulint			n_nodes,/*!< in: number of nodes to remove */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	fil_addr_t	node2_addr;
+	ulint		len;
+	ulint		space;
+
+	ut_ad(mtr && node2 && base);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, node2, MTR_MEMO_PAGE_X_FIX));
+	if (n_nodes == 0) {
+
+		/* Nothing to remove: node2 is then expected to have no
+		successor already */
+		ut_ad(fil_addr_is_null(flst_get_next_addr(node2, mtr)));
+
+		return;
+	}
+
+	/* Get the file address of node2; the space id returned in
+	space is not used further in this function */
+	buf_ptr_get_fsp_addr(node2, &space, &node2_addr);
+
+	/* Update next field of node2 */
+	flst_write_addr(node2 + FLST_NEXT, fil_addr_null, mtr);
+
+	/* node2 is the new last node of the list */
+	flst_write_addr(base + FLST_LAST, node2_addr, mtr);
+
+	/* Update len of base node */
+	len = flst_get_len(base, mtr);
+	ut_ad(len >= n_nodes);
+
+	mlog_write_ulint(base + FLST_LEN, len - n_nodes, MLOG_4BYTES, mtr);
+}
+
+/********************************************************************//**
+Validates a file-based list. The list is walked len steps forward from the
+first node and len steps backward from the last node, where len is the
+length stored in the base node; after each walk the address reached must be
+the null address, otherwise an assertion fails.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	mtr_t*			mtr1)	/*!< in: mtr */
+{
+	mtr_t			walk_mtr;
+	fil_addr_t		base_addr;
+	fil_addr_t		cur_addr;
+	const flst_node_t*	cur_node;
+	ulint			space;
+	ulint			zip_size;
+	ulint			len;
+	ulint			n_left;
+
+	ut_ad(base);
+	ut_ad(mtr_memo_contains_page(mtr1, base, MTR_MEMO_PAGE_X_FIX));
+
+	/* Two mini-transaction handles are in play: mtr1 locks the base
+	node and so keeps other threads from modifying the list, while
+	walk_mtr is used to traverse the list. walk_mtr has to be
+	committed at times, because on a long list the x-locked pages
+	could otherwise fill the buffer, resulting in a deadlock. */
+
+	/* The space id is needed for fetching the nodes */
+	buf_ptr_get_fsp_addr(base, &space, &base_addr);
+	zip_size = fil_space_get_zip_size(space);
+
+	len = flst_get_len(base, mtr1);
+
+	/* Forward pass: follow the next links len times */
+	cur_addr = flst_get_first(base, mtr1);
+	n_left = len;
+
+	while (n_left > 0) {
+		mtr_start(&walk_mtr);
+
+		cur_node = fut_get_ptr(space, zip_size,
+				       cur_addr, RW_X_LATCH, &walk_mtr);
+		cur_addr = flst_get_next_addr(cur_node, &walk_mtr);
+
+		/* Commit on every round so that the buffer does not
+		become full */
+		mtr_commit(&walk_mtr);
+
+		n_left--;
+	}
+
+	ut_a(fil_addr_is_null(cur_addr));
+
+	/* Backward pass: follow the prev links len times */
+	cur_addr = flst_get_last(base, mtr1);
+	n_left = len;
+
+	while (n_left > 0) {
+		mtr_start(&walk_mtr);
+
+		cur_node = fut_get_ptr(space, zip_size,
+				       cur_addr, RW_X_LATCH, &walk_mtr);
+		cur_addr = flst_get_prev_addr(cur_node, &walk_mtr);
+
+		/* Commit on every round so that the buffer does not
+		become full */
+		mtr_commit(&walk_mtr);
+
+		n_left--;
+	}
+
+	ut_a(fil_addr_is_null(cur_addr));
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Prints info of a file-based list to stderr: the space id, page number and
+byte offset of the base node, and the list length stored in it. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	mtr_t*			mtr)	/*!< in: mtr */
+{
+	const buf_frame_t*	base_frame;
+	ulint			n_nodes;
+	ulong			space_id;
+	ulong			page_no;
+	ulong			byte_offset;
+
+	ut_ad(base && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+
+	/* The frame of the page that contains the base node */
+	base_frame = page_align((byte*) base);
+
+	n_nodes = flst_get_len(base, mtr);
+
+	space_id = (ulong) page_get_space_id(base_frame);
+	page_no = (ulong) page_get_page_no(base_frame);
+	byte_offset = (ulong) page_offset(base);
+
+	fprintf(stderr,
+		"FILE-BASED LIST:\n"
+		"Base node in space %lu page %lu byte offset %lu; len %lu\n",
+		space_id, page_no, byte_offset, (ulong) n_nodes);
+}
diff --git a/storage/xtradb/ha/ha0ha.c b/storage/xtradb/ha/ha0ha.c
new file mode 100644
index 00000000000..7f11917de0a
--- /dev/null
+++ b/storage/xtradb/ha/ha0ha.c
@@ -0,0 +1,464 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ha/ha0ha.c
+The hash table with external chains
+
+Created 8/22/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ha0ha.h"
+#ifdef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#ifdef UNIV_DEBUG
+# include "buf0buf.h"
+#endif /* UNIV_DEBUG */
+#include "btr0sea.h"
+#include "page0page.h"
+
+/*************************************************************//**
+Creates a hash table with at least n array cells.  The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+If n_mutexes is 0, a single memory heap is created and stored in
+table->heap. Otherwise, unless UNIV_HOTBACKUP is defined, n_mutexes mutexes
+are created and one memory heap per mutex is stored in table->heaps[].
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+	ulint	n,		/*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
+				order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint	n_mutexes)	/*!< in: number of mutexes to protect the
+				hash table: must be a power of 2, or 0 */
+{
+	hash_table_t*	table;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_ad(ut_is_2pow(n_mutexes));
+	table = hash_create(n);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	/* table->adaptive is tested by the n_pointers checks in the
+	other functions of this file */
+	table->adaptive = TRUE;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	/* Creating MEM_HEAP_BTR_SEARCH type heaps can potentially fail,
+	but in practice it never should in this case, hence the asserts. */
+
+	if (n_mutexes == 0) {
+		/* No mutexes requested: use one heap for the whole table */
+		table->heap = mem_heap_create_in_btr_search(
+			ut_min(4096, MEM_MAX_ALLOC_IN_BUF));
+		ut_a(table->heap);
+
+		return(table);
+	}
+
+#ifndef UNIV_HOTBACKUP
+	/* NOTE(review): mutex_level is a parameter only when
+	UNIV_SYNC_DEBUG is defined, yet it is named here unconditionally;
+	presumably hash_create_mutexes is a macro that drops the argument
+	in other builds - confirm against its definition. */
+	hash_create_mutexes(table, n_mutexes, mutex_level);
+
+	/* One heap pointer per mutex */
+	table->heaps = mem_alloc(n_mutexes * sizeof(void*));
+
+	for (i = 0; i < n_mutexes; i++) {
+		table->heaps[i] = mem_heap_create_in_btr_search(4096);
+		ut_a(table->heaps[i]);
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	return(table);
+}
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. Unless UNIV_HOTBACKUP is
+defined, each of the table->n_mutexes heaps in table->heaps[] is freed;
+then the node pointer of every cell is set to NULL. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ulint	n_items;
+	ulint	pos;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+	/* Release the heaps, one per mutex. */
+	n_items = table->n_mutexes;
+	pos = 0;
+
+	while (pos < n_items) {
+		mem_heap_free(table->heaps[pos]);
+		pos++;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Reset every cell so that its chain is empty. */
+	n_items = hash_get_n_cells(table);
+	pos = 0;
+
+	while (pos < n_items) {
+		hash_get_nth_cell(table, pos)->node = NULL;
+		pos++;
+	}
+}
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted. If btr_search_enabled is set to FALSE, we will only allow
+updating existing nodes, but no new node is allowed to be added; in that
+case TRUE is returned even though nothing was inserted.
+@return	TRUE if succeed, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: folded value of data; if a node with
+				the same fold value already exists, it is
+				updated to point to the same data, and no new
+				node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block,	/*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		data)	/*!< in: data, must not be NULL */
+{
+	hash_cell_t*	cell;
+	ha_node_t*	node;
+	ha_node_t*	prev_node;
+	ulint		hash;
+
+	ut_ad(data);
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	/* data must point into the frame of the given block */
+	ut_a(block->frame == page_align(data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	hash = hash_calc_hash(fold, table);
+
+	cell = hash_get_nth_cell(table, hash);
+
+	prev_node = cell->node;
+
+	/* First pass over the chain: look for a node with the same fold
+	value and, if one exists, redirect it to the new data */
+	while (prev_node != NULL) {
+		if (prev_node->fold == fold) {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+			if (table->adaptive) {
+				/* Move one pointer count from the block
+				the node referred to so far to the new
+				block */
+				buf_block_t* prev_block = prev_node->block;
+				ut_a(prev_block->frame
+				     == page_align(prev_node->data));
+				ut_a(prev_block->n_pointers > 0);
+				prev_block->n_pointers--;
+				block->n_pointers++;
+			}
+			ut_ad(!btr_search_fully_disabled);
+# endif /* !UNIV_HOTBACKUP */
+
+			prev_node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+			prev_node->data = data;
+
+			return(TRUE);
+		}
+
+		prev_node = prev_node->next;
+	}
+
+	/* We are in the process of disabling hash index, do not add
+	new chain node */
+	if (!btr_search_enabled) {
+		ut_ad(!btr_search_fully_disabled);
+		return(TRUE);
+	}
+
+	/* We have to allocate a new chain node */
+
+	node = mem_heap_alloc(hash_get_heap(table, fold), sizeof(ha_node_t));
+
+	if (node == NULL) {
+		/* It was a btr search type memory heap and at the moment
+		no more memory could be allocated: return */
+
+		ut_ad(hash_get_heap(table, fold)->type & MEM_HEAP_BTR_SEARCH);
+
+		return(FALSE);
+	}
+
+	/* NOTE(review): block is a parameter only when UNIV_AHI_DEBUG or
+	UNIV_DEBUG is defined, yet it is named here unconditionally;
+	presumably ha_node_set_data is a macro that drops the argument in
+	other builds - confirm against its definition. */
+	ha_node_set_data(node, block, data);
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	if (table->adaptive) {
+		block->n_pointers++;
+	}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	node->fold = fold;
+
+	node->next = NULL;
+
+	/* Second pass over the chain: append the new node at its end */
+	prev_node = cell->node;
+
+	if (prev_node == NULL) {
+
+		/* The chain was empty: the new node is its only node */
+		cell->node = node;
+
+		return(TRUE);
+	}
+
+	while (prev_node->next != NULL) {
+
+		prev_node = prev_node->next;
+	}
+
+	prev_node->next = node;
+
+	return(TRUE);
+}
+
+/***********************************************************//**
+Deletes a hash node. In builds with UNIV_AHI_DEBUG or UNIV_DEBUG (and
+without UNIV_HOTBACKUP), if table->adaptive is set, the node is first
+checked for consistency with its block and the n_pointers count of the
+block is decremented. The node is then removed from the table with
+HASH_DELETE_AND_COMPACT.
+NOTE(review): the macro name suggests that other nodes may be moved by the
+deletion, so callers should presumably not keep pointers to other nodes
+across this call - confirm against the macro definition. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	ha_node_t*	del_node)	/*!< in: node to be deleted */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	if (table->adaptive) {
+		/* This must be a comparison, not an assignment: the data
+		pointer of the node has to lie within the frame of the
+		block recorded in the node. An assignment here would
+		overwrite block->frame and could not detect a mismatch. */
+		ut_a(del_node->block->frame == page_align(del_node->data));
+		ut_a(del_node->block->n_pointers > 0);
+		del_node->block->n_pointers--;
+	}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	HASH_DELETE_AND_COMPACT(ha_node_t, next, table, del_node);
+}
+
+/*********************************************************//**
+Searches the chain selected by fold for the node whose data pointer is
+data and, if such a node exists, makes it point to new_data instead.
+Nothing is changed when no such node is found. */
+UNIV_INTERN
+void
+ha_search_and_update_if_found_func(
+/*===============================*/
+	hash_table_t*	table,	/*!< in/out: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	void*		data,	/*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		new_data)/*!< in: new pointer to the data */
+{
+	ha_node_t*	found;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	ut_a(new_block->frame == page_align(new_data));
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+	found = ha_search_with_data(table, fold, data);
+
+	if (!found) {
+		/* No node refers to data: nothing to update */
+
+		return;
+	}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	if (table->adaptive) {
+		/* Move one pointer count from the old block to the
+		new one */
+		ut_a(found->block->n_pointers > 0);
+		found->block->n_pointers--;
+		new_block->n_pointers++;
+	}
+# endif /* !UNIV_HOTBACKUP */
+
+	found->block = new_block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	found->data = new_data;
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Deletes, from the chain selected by fold, every node whose data pointer
+lies on the given page. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: fold value */
+	const page_t*	page)	/*!< in: buffer page */
+{
+	ha_node_t*	cur;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	cur = ha_chain_get_first(table, fold);
+
+	while (cur != NULL) {
+		if (page_align(ha_node_get_data(cur)) != page) {
+			/* Not on the page: keep it and move on */
+
+			cur = ha_chain_get_next(cur);
+
+			continue;
+		}
+
+		/* The node points to the page: delete it */
+
+		ha_delete_hash_node(table, cur);
+
+		/* The deletion may compact the heap of nodes and
+		move other nodes, so the scan is restarted from the
+		first node in the chain */
+
+		cur = ha_chain_get_first(table, fold);
+	}
+#ifdef UNIV_DEBUG
+	/* Verify that no node pointing to the page is left */
+
+	for (cur = ha_chain_get_first(table, fold);
+	     cur != NULL;
+	     cur = ha_chain_get_next(cur)) {
+
+		ut_a(page_align(ha_node_get_data(cur)) != page);
+	}
+#endif
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table: for every node in the
+cells start_index..end_index (both inclusive) the hash computed from the
+fold value of the node must equal the number of the cell that holds it.
+Each mismatch is reported to stderr.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+	hash_table_t*	table,		/*!< in: hash table */
+	ulint		start_index,	/*!< in: start index */
+	ulint		end_index)	/*!< in: end index */
+{
+	ha_node_t*	cur;
+	ulint		cell_no;
+	ibool		all_ok	= TRUE;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_a(start_index <= end_index);
+	ut_a(start_index < hash_get_n_cells(table));
+	ut_a(end_index < hash_get_n_cells(table));
+
+	for (cell_no = start_index; cell_no <= end_index; cell_no++) {
+
+		/* Walk the chain that hangs off this cell */
+		for (cur = hash_get_nth_cell(table, cell_no)->node;
+		     cur;
+		     cur = cur->next) {
+
+			if (hash_calc_hash(cur->fold, table) == cell_no) {
+				/* The node is in the right cell */
+
+				continue;
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"InnoDB: Error: hash table node"
+				" fold value %lu does not\n"
+				"InnoDB: match the cell number %lu.\n",
+				(ulong) cur->fold, (ulong) cell_no);
+
+			all_ok = FALSE;
+		}
+	}
+
+	return(all_ok);
+}
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/*************************************************************//**
+Prints info of a hash table: the number of cells, in UNIV_DEBUG builds also
+the number of non-empty cells, and, when the table uses the single heap
+table->heap rather than table->heaps[], the number of buffers in that heap.
+NOTE(review): the terminating newline is printed only together with the
+heap info, so in the other case the output line is left unterminated -
+confirm that callers expect this. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	hash_table_t*	table)	/*!< in: hash table */
+{
+#ifdef UNIV_DEBUG
+/* Some of the code here is disabled for performance reasons in production
+builds, see http://bugs.mysql.com/36941 */
+#define PRINT_USED_CELLS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_USED_CELLS
+	hash_cell_t*	cell;
+	ulint		cells	= 0;
+	ulint		i;
+#endif /* PRINT_USED_CELLS */
+	ulint		n_bufs;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifdef PRINT_USED_CELLS
+	/* Count the cells that have at least one node; this scans the
+	whole cell array */
+	for (i = 0; i < hash_get_n_cells(table); i++) {
+
+		cell = hash_get_nth_cell(table, i);
+
+		if (cell->node) {
+
+			cells++;
+		}
+	}
+#endif /* PRINT_USED_CELLS */
+
+	fprintf(file, "Hash table size %lu",
+		(ulong) hash_get_n_cells(table));
+
+#ifdef PRINT_USED_CELLS
+	fprintf(file, ", used cells %lu", (ulong) cells);
+#endif /* PRINT_USED_CELLS */
+
+	if (table->heaps == NULL && table->heap != NULL) {
+
+		/* This calculation is intended for the adaptive hash
+		index: how many buffer frames we have reserved? */
+
+		/* Length of the heap's block list minus one, plus one
+		if the heap holds a free block */
+		n_bufs = UT_LIST_GET_LEN(table->heap->base) - 1;
+
+		if (table->heap->free_block) {
+			n_bufs++;
+		}
+
+		fprintf(file, ", node heap has %lu buffer(s)\n",
+			(ulong) n_bufs);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ha/ha0storage.c b/storage/xtradb/ha/ha0storage.c
new file mode 100644
index 00000000000..698e34f1166
--- /dev/null
+++ b/storage/xtradb/ha/ha0storage.c
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/ha0storage.c
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+/*******************************************************************//**
+Looks up a chunk of data in a storage.
+@return	pointer to the stored copy of an equal chunk (same length, same
+bytes), or NULL if the storage holds no such chunk */
+static
+const void*
+ha_storage_get(
+/*===========*/
+	ha_storage_t*	storage,	/*!< in: hash storage */
+	const void*	data,		/*!< in: data to check for */
+	ulint		data_len)	/*!< in: data length */
+{
+	ha_storage_node_t*	found;
+	ulint			key;
+
+	/* fold once up front, to avoid repetitive calls to
+	ut_fold_binary() in the HASH_SEARCH macro */
+	key = ut_fold_binary(data, data_len);
+
+#define IS_FOUND	\
+	found->data_len == data_len && memcmp(found->data, data, data_len) == 0
+
+	HASH_SEARCH(
+		next,			/* node->"next" */
+		storage->hash,		/* the hash table */
+		key,			/* key */
+		ha_storage_node_t*,	/* type of node->next */
+		found,			/* auxiliary variable */
+		,			/* assertion */
+		IS_FOUND);		/* search criteria */
+
+	if (found != NULL) {
+
+		return(found->data);
+	}
+
+	return(NULL);
+}
+
+/*******************************************************************//**
+Stores a copy of a data chunk and returns a pointer to the stored copy.
+If an equal chunk is already stored, a pointer to that existing copy is
+returned and nothing is allocated. Chunks are equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. Otherwise, if "memlim" is nonzero and
+the current size of the storage plus data_len exceeds "memlim", nothing
+is added and NULL is returned. A "memlim" of 0 stands for "no limit".
+@return	pointer to the stored copy, or NULL if "memlim" would be
+exceeded */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+	ha_storage_t*	storage,	/*!< in/out: hash storage */
+	const void*	data,		/*!< in: data to store */
+	ulint		data_len,	/*!< in: data length */
+	ulint		memlim)		/*!< in: memory limit to obey */
+{
+	ha_storage_node_t*	new_node;
+	byte*			payload;
+	const void*		existing;
+	ulint			key;
+
+	/* hand back the stored copy if the chunk is already present */
+	existing = ha_storage_get(storage, data, data_len);
+	if (existing != NULL) {
+
+		return(existing);
+	}
+
+	/* not present */
+
+	/* refuse to grow past the limit, if there is one */
+	if (memlim > 0
+	    && ha_storage_get_size(storage) + data_len > memlim) {
+
+		return(NULL);
+	}
+
+	/* the auxiliary node struct and the data itself are put in one
+	continuous block: the copy of the data starts right after the
+	node struct */
+	new_node = (ha_storage_node_t*) mem_heap_alloc(
+		storage->heap, sizeof(ha_storage_node_t) + data_len);
+
+	payload = (byte*) new_node + sizeof(*new_node);
+
+	memcpy(payload, data, data_len);
+
+	new_node->data_len = data_len;
+	new_node->data = payload;
+
+	/* avoid repetitive calls to ut_fold_binary() in the HASH_INSERT
+	macro */
+	key = ut_fold_binary(data, data_len);
+
+	HASH_INSERT(
+		ha_storage_node_t,	/* type used in the hash chain */
+		next,			/* node->"next" */
+		storage->hash,		/* the hash table */
+		key,			/* key */
+		new_node);		/* add this data to the hash */
+
+	/* the output should not be changed because it will spoil the
+	hash table */
+	return(payload);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Self-test: puts 256 distinct 1024-byte chunks into a fresh storage, puts
+them again and reports on stderr whether the same pointers came back. */
+void
+test_ha_storage()
+{
+	ha_storage_t*	storage;
+	char		buf[1024];
+	int		i;
+	const void*	stored[256];
+	const void*	p;
+
+	storage = ha_storage_create(0, 0);
+
+	for (i = 0; i < 256; i++) {
+		memset(buf, i, sizeof(buf));
+		stored[i] = ha_storage_put(storage, buf, sizeof(buf));
+	}
+
+	for (i = 255; i >= 0; i--) {
+		memset(buf, i, sizeof(buf));
+		p = ha_storage_put(storage, buf, sizeof(buf));
+
+		if (p != stored[i]) {
+			fprintf(stderr, "ha_storage_put() returned %p "
+				"instead of %p, i=%d\n", p, stored[i], i);
+			/* do not leak the storage on the failure path */
+			ha_storage_free(storage);
+			return;
+		}
+	}
+
+	fprintf(stderr, "all ok\n");
+
+	ha_storage_free(storage);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/ha/hash0hash.c b/storage/xtradb/ha/hash0hash.c
new file mode 100644
index 00000000000..0f4fc55d895
--- /dev/null
+++ b/storage/xtradb/ha/hash0hash.c
@@ -0,0 +1,242 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ha/hash0hash.c
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "hash0hash.h"
+#ifdef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#include "mem0mem.h"
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Reserves the mutex that hash_get_mutex() returns for a fold value. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value selecting the mutex */
+{
+	mutex_enter(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Releases the mutex that hash_get_mutex() returns for a fold value. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value selecting the mutex */
+{
+	mutex_exit(hash_get_mutex(table, fold));
+}
+
+/************************************************************//**
+Acquires every mutex of a hash table, from index 0 upwards. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	n = 0;
+
+	while (n < table->n_mutexes) {
+		mutex_enter(&table->mutexes[n]);
+		n++;
+	}
+}
+
+/************************************************************//**
+Releases every mutex of a hash table, walking the mutex array from
+index 0 upwards. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ulint	n;
+
+	for (n = 0; n != table->n_mutexes; n++) {
+		mutex_exit(&table->mutexes[n]);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number of cells is
+chosen to be a prime number slightly bigger than n. The table struct and
+the cell array are two separate allocations.
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+	ulint	n)	/*!< in: number of array cells */
+{
+	hash_table_t*	new_table;
+	ulint		n_cells;
+
+	n_cells = ut_find_prime(n);
+
+	new_table = mem_alloc(sizeof(hash_table_t));
+
+	/* geometry first, then everything that is not attached yet */
+	new_table->n_cells = n_cells;
+	new_table->array = ut_malloc(sizeof(hash_cell_t) * n_cells);
+
+	new_table->heap = NULL;
+#ifndef UNIV_HOTBACKUP
+	new_table->heaps = NULL;
+	new_table->mutexes = NULL;
+	new_table->n_mutexes = 0;
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	new_table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+	ut_d(new_table->magic_n = HASH_TABLE_MAGIC_N);
+
+	/* Initialize the cell array */
+	hash_table_clear(new_table);
+
+	return(new_table);
+}
+
+/*************************************************************//**
+Computes the size of a single block that holds a hash table struct,
+padded to a multiple of 8 bytes, followed by ut_find_prime(n) cells.
+@return	size of the block in bytes */
+UNIV_INTERN
+ulint
+hash_create_needed(
+/*===============*/
+	ulint	n)	/*!< in: number passed to ut_find_prime() */
+{
+	ulint	struct_size;
+	ulint	n_cells;
+
+	struct_size = ((sizeof(hash_table_t) + 7) / 8) * 8;
+	n_cells = ut_find_prime(n);
+
+	return(struct_size + sizeof(hash_cell_t) * n_cells);
+}
+
+/*************************************************************//**
+Sets up a hash table in caller-provided memory: ut_find_prime(n) cells are
+placed after the table struct, which is padded to a multiple of 8 bytes. */
+UNIV_INTERN
+void
+hash_create_init(
+/*=============*/
+	hash_table_t*	table,	/*!< in/out: start of the memory block */
+	ulint		n)	/*!< in: number passed to ut_find_prime() */
+{
+	byte*	cells;
+
+	/* skip the table struct, rounded up to a multiple of 8 bytes */
+	cells = (byte*) table + ((sizeof(hash_table_t) + 7) / 8) * 8;
+
+	table->n_cells = ut_find_prime(n);
+	table->array = (hash_cell_t*) cells;
+	table->heap = NULL;
+	table->heaps = NULL;
+	table->mutexes = NULL;
+	table->n_mutexes = 0;
+# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	table->adaptive = FALSE;
+# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	ut_d(table->magic_n = HASH_TABLE_MAGIC_N);
+
+	/* Initialize the cell array */
+	hash_table_clear(table);
+}
+
+/*************************************************************//**
+Re-points table->array at the cells that follow the padded table struct. */
+UNIV_INTERN
+void
+hash_create_reuse(
+/*==============*/
+	hash_table_t*	table)	/*!< in/out: hash table */
+{
+	byte*	cells;
+
+	cells = (byte*) table + ((sizeof(hash_table_t) + 7) / 8) * 8;
+	table->array = (hash_cell_t*) cells;
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+}
+
+/*************************************************************//**
+Frees a hash table: its cell array (ut_free) and the struct (mem_free). */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+	hash_table_t*	table)	/*!< in, own: hash table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+#ifndef UNIV_HOTBACKUP
+	ut_a(table->mutexes == NULL);
+#endif /* !UNIV_HOTBACKUP */
+	/* the mutex array is not freed here, hence the assertion above */
+	ut_free(table->array);
+	mem_free(table);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Allocates an array of mutexes for a hash table and creates each of them. */
+UNIV_INTERN
+void
+hash_create_mutexes_func(
+/*=====================*/
+	hash_table_t*	table,		/*!< in: hash table */
+#ifdef UNIV_SYNC_DEBUG
+	ulint		sync_level,	/*!< in: latching order level of the
+					mutexes: used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint		n_mutexes)	/*!< in: number of mutexes, must be a
+					power of 2 */
+{
+	ulint	i;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_a(n_mutexes > 0);
+	ut_a(ut_is_2pow(n_mutexes));
+	/* one allocation holds all n_mutexes mutex_t objects */
+	table->mutexes = mem_alloc(n_mutexes * sizeof(mutex_t));
+	/* NOTE(review): sync_level is declared only under UNIV_SYNC_DEBUG */
+	for (i = 0; i < n_mutexes; i++) {
+		mutex_create(table->mutexes + i, sync_level);
+	}
+
+	table->n_mutexes = n_mutexes;
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ha_innodb.def b/storage/xtradb/ha_innodb.def
new file mode 100644
index 00000000000..e0faa62deb1
--- /dev/null
+++ b/storage/xtradb/ha_innodb.def
@@ -0,0 +1,4 @@
+EXPORTS
+	_mysql_plugin_interface_version_
+	_mysql_sizeof_struct_st_plugin_
+	_mysql_plugin_declarations_
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
new file mode 100644
index 00000000000..f511918e845
--- /dev/null
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -0,0 +1,12109 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* TODO list for the InnoDB handler in 5.0:
+  - Remove the flag trx->active_trans and look at trx->conc_state
+  - fix savepoint functions to use savepoint storage area
+  - Find out what kind of problems the OS X case-insensitivity causes to
+    table and database names; should we 'normalize' the names like we do
+    in Windows?
+*/
+
+#ifdef USE_PRAGMA_IMPLEMENTATION
+#pragma implementation				// gcc: Class implementation
+#endif
+
+#ifndef MYSQL_SERVER
+#define MYSQL_SERVER
+#endif
+
+#include <mysql_priv.h>
+#ifdef MYSQL_SERVER
+#include <log_event.h>
+#endif /* MYSQL_SERVER */
+
+#include <m_ctype.h>
+#include <mysys_err.h>
+#include <mysql/plugin.h>
+
+/** @file ha_innodb.cc */
+
+/* Include necessary InnoDB headers */
+extern "C" {
+#include "univ.i"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#include "trx0roll.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "mtr0mtr.h"
+#include "row0ins.h"
+#include "row0mysql.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "log0log.h"
+#include "lock0lock.h"
+#include "dict0crea.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "fsp0fsp.h"
+#include "sync0sync.h"
+#include "fil0fil.h"
+#include "trx0xa.h"
+#include "row0merge.h"
+#include "thr0loc.h"
+#include "dict0boot.h"
+#include "ha_prototypes.h"
+#include "ut0mem.h"
+#include "ibuf0ibuf.h"
+}
+
+#include "ha_innodb.h"
+#include "i_s.h"
+
+#ifdef MYSQL_SERVER
+// Defined in trx0sys.c
+extern char		trx_sys_mysql_master_log_name[];
+extern ib_int64_t	trx_sys_mysql_master_log_pos;
+extern char		trx_sys_mysql_relay_log_name[];
+extern ib_int64_t	trx_sys_mysql_relay_log_pos;
+#endif /* MYSQL_SERVER */
+
+#ifndef MYSQL_SERVER
+# ifndef MYSQL_PLUGIN_IMPORT
+#  define MYSQL_PLUGIN_IMPORT /* nothing */
+# endif /* MYSQL_PLUGIN_IMPORT */
+
+#if MYSQL_VERSION_ID < 50124
+/* this is defined in mysql_priv.h inside #ifdef MYSQL_SERVER
+but we need it here */
+bool check_global_access(THD *thd, ulong want_access);
+#endif /* MYSQL_VERSION_ID < 50124 */
+#endif /* MYSQL_SERVER */
+
+/** to protect innobase_open_files */
+static pthread_mutex_t innobase_share_mutex;
+/** to force correct commit order in binlog */
+static pthread_mutex_t prepare_commit_mutex;
+static ulong commit_threads = 0;
+static pthread_mutex_t commit_threads_m;
+static pthread_cond_t commit_cond;
+static pthread_mutex_t commit_cond_m;
+static bool innodb_inited = 0;
+
+C_MODE_START
+static int index_cond_func_innodb(void *arg);
+C_MODE_END
+
+
+
+#define INSIDE_HA_INNOBASE_CC
+
+/* Tests whether thd equals current_thd. NOTE(review): an older note said
+current_thd is mapped to NULL in the Windows plugin; the macro does not. */
+
+#define EQ_CURRENT_THD(thd) ((thd) == current_thd)
+
+
+static struct handlerton* innodb_hton_ptr;
+
+static const long AUTOINC_OLD_STYLE_LOCKING = 0;
+static const long AUTOINC_NEW_STYLE_LOCKING = 1;
+static const long AUTOINC_NO_LOCKING = 2;
+
+static long innobase_mirrored_log_groups, innobase_log_files_in_group,
+	innobase_log_buffer_size,
+	innobase_additional_mem_pool_size, innobase_file_io_threads,
+	innobase_force_recovery, innobase_open_files,
+	innobase_autoinc_lock_mode;
+static ulong innobase_commit_concurrency = 0;
+static ulong innobase_read_io_threads;
+static ulong innobase_write_io_threads;
+
+static ulong innobase_page_size;
+
+static my_bool innobase_thread_concurrency_timer_based;
+static long long innobase_buffer_pool_size, innobase_log_file_size;
+
+/** Percentage of the buffer pool to reserve for 'old' blocks.
+Connected to buf_LRU_old_ratio. */
+static uint innobase_old_blocks_pct;
+
+/* The default values for the following char* start-up parameters
+are determined in innobase_init below: */
+
+static char*	innobase_data_home_dir			= NULL;
+static char*	innobase_data_file_path			= NULL;
+static char*	innobase_log_group_home_dir		= NULL;
+static char*	innobase_file_format_name		= NULL;
+static char*	innobase_change_buffering		= NULL;
+static char*	innobase_doublewrite_file		= NULL;
+
+/* Note: This variable can be set to on/off and any of the supported
+file formats in the configuration file, but can only be set to any
+of the supported file formats during runtime. */
+static char*	innobase_file_format_check		= NULL;
+
+static char*	innobase_file_flush_method		= NULL;
+
+/* Below we have boolean-valued start-up parameters, and their default
+values */
+
+static ulong	innobase_fast_shutdown			= 1;
+#ifdef UNIV_LOG_ARCHIVE
+static my_bool	innobase_log_archive			= FALSE;
+static char*	innobase_log_arch_dir			= NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+static my_bool	innobase_use_doublewrite		= TRUE;
+static my_bool	innobase_use_checksums			= TRUE;
+static my_bool	innobase_fast_checksum			= FALSE;
+static my_bool	innobase_extra_undoslots		= FALSE;
+static my_bool	innobase_fast_recovery			= FALSE;
+static my_bool	innobase_recovery_stats			= TRUE;
+static my_bool	innobase_locks_unsafe_for_binlog	= FALSE;
+static my_bool	innobase_overwrite_relay_log_info	= FALSE;
+static my_bool	innobase_rollback_on_timeout		= FALSE;
+static my_bool	innobase_create_status_file		= FALSE;
+static my_bool	innobase_stats_on_metadata		= TRUE;
+static my_bool	innobase_use_sys_stats_table		= FALSE;
+static my_bool	innobase_buffer_pool_shm_checksum	= TRUE;
+
+static char*	internal_innobase_data_file_path	= NULL;
+
+static char*	innodb_version_str = (char*) INNODB_VERSION_STR;
+
+/* The following counter is used to convey information to InnoDB
+about server activity: in selects it is not sensible to call
+srv_active_wake_master_thread after each fetch or search, we only do
+it every INNOBASE_WAKE_INTERVAL'th step. */
+
+#define INNOBASE_WAKE_INTERVAL	32
+static ulong	innobase_active_counter	= 0;
+
+static hash_table_t*	innobase_open_tables;
+
+#ifdef __NETWARE__	/* some special cleanup for NetWare */
+bool nw_panic = FALSE;
+#endif
+
+/** Allowed values of innodb_change_buffering */
+static const char* innobase_change_buffering_values[IBUF_USE_COUNT] = {
+	"none",		/* IBUF_USE_NONE */
+	"inserts"	/* IBUF_USE_INSERT */
+};
+
+static INNOBASE_SHARE *get_share(const char *table_name);
+static void free_share(INNOBASE_SHARE *share);
+static int innobase_close_connection(handlerton *hton, THD* thd);
+static int innobase_commit(handlerton *hton, THD* thd, bool all);
+static int innobase_rollback(handlerton *hton, THD* thd, bool all);
+static int innobase_rollback_to_savepoint(handlerton *hton, THD* thd,
+           void *savepoint);
+static int innobase_savepoint(handlerton *hton, THD* thd, void *savepoint);
+static int innobase_release_savepoint(handlerton *hton, THD* thd,
+           void *savepoint);
+static handler *innobase_create_handler(handlerton *hton,
+                                        TABLE_SHARE *table,
+                                        MEM_ROOT *mem_root);
+/* "GEN_CLUST_INDEX" is the name reserved for Innodb default
+system primary index. */
+static const char innobase_index_reserve_name[]= "GEN_CLUST_INDEX";
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default(void);
+/*==========================================*/
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+@return	valid file format id */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+	const char*	format_name);		/*!< in: pointer to file format
+						name */
+/************************************************************//**
+Validate the file format check config parameters, as a side effect it
+sets the srv_check_file_format_at_startup variable.
+@return	true if format_check is one of "on" or "off" */
+static
+bool
+innobase_file_format_check_on_off(
+/*==============================*/
+	const char*	format_check);		/*!< in: parameter value */
+/************************************************************//**
+Validate the file format check config parameters, as a side effect it
+sets the srv_check_file_format_at_startup variable.
+@return	the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*================================*/
+	const char*	format_check);		/*!< in: parameter value */
+/****************************************************************//**
+Return alter table flags supported in an InnoDB database. */
+static
+uint
+innobase_alter_table_flags(
+/*=======================*/
+	uint	flags);
+
+static const char innobase_hton_name[]= "InnoDB";
+
+/*************************************************************//**
+Checks a new value for innodb_commit_concurrency. The value is accepted
+only if it does not switch the setting between zero and nonzero.
+@return	0 for valid innodb_commit_concurrency */
+static
+int
+innobase_commit_concurrency_validate(
+/*=================================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	long long	requested;
+	ulong		new_value;
+
+	DBUG_ENTER("innobase_commit_concurrency_validate");
+
+	if (value->val_int(value, &requested) != 0) {
+		/* The value is NULL. That is invalid. */
+		DBUG_RETURN(1);
+	}
+
+	new_value = static_cast<ulong>(requested);
+	*reinterpret_cast<ulong*>(save) = new_value;
+
+	/* Valid only if zero stays zero and nonzero stays nonzero. */
+	DBUG_RETURN((new_value == 0) != (innobase_commit_concurrency == 0));
+}
+
+static MYSQL_THDVAR_BOOL(support_xa, PLUGIN_VAR_OPCMDARG,
+  "Enable InnoDB support for the XA two-phase commit",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(table_locks, PLUGIN_VAR_OPCMDARG,
+  "Enable InnoDB locking in LOCK TABLES",
+  /* check_func */ NULL, /* update_func */ NULL,
+  /* default */ TRUE);
+
+static MYSQL_THDVAR_BOOL(strict_mode, PLUGIN_VAR_OPCMDARG,
+  "Use strict mode when evaluating create options.",
+  NULL, NULL, FALSE);
+
+static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
+  "Timeout in seconds an InnoDB transaction may wait for a lock before being rolled back. Values above 100000000 disable the timeout.",
+  NULL, NULL, 50, 1, 1024 * 1024 * 1024, 0);
+
+static MYSQL_THDVAR_ULONG(flush_log_at_trx_commit_session, PLUGIN_VAR_RQCMDARG,
+  "Control innodb_flush_log_at_trx_commit for each sessions. "
+  "The value 0~2 are same meanings to innodb_flush_log_at_trx_commit. "
+  "The value 3 regards innodb_flush_log_at_trx_commit (default).",
+  NULL, NULL, 3, 0, 3, 0);
+
+
+/** Creates a ha_innobase handler object with placement new on mem_root. */
+static handler *innobase_create_handler(handlerton *hton, TABLE_SHARE *table,
+                                        MEM_ROOT *mem_root)
+{
+  return(new (mem_root) ha_innobase(hton, table));
+}
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+@return	0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+        handlerton*	hton,	/*!< in: InnoDB handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread of
+				the user whose XA transaction should
+				be prepared */
+	bool		all);	/*!< in: TRUE - commit transaction
+				FALSE - the current SQL statement
+				ended */
+/*******************************************************************//**
+This function is used to recover X/Open XA distributed transactions.
+@return	number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	XID*		xid_list,/*!< in/out: prepared transactions */
+	uint		len);	/*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to commit one X/Open XA distributed transaction
+which is in the prepared state
+@return	0 or error number */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+	handlerton* hton,
+	XID*	xid);	/*!< in: X/Open XA transaction identification */
+/*******************************************************************//**
+This function is used to rollback one X/Open XA distributed transaction
+which is in the prepared state
+@return	0 or error number */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	XID*		xid);	/*!< in: X/Open XA transaction
+				identification */
+/*******************************************************************//**
+Create a consistent view for a cursor based on current transaction
+which is created if the corresponding MySQL thread still lacks one.
+This consistent view is then used inside of MySQL when accessing records
+using a cursor.
+@return	pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+	handlerton*	hton,	/*!< in: innobase hton */
+	THD*		thd);	/*!< in: user thread handle */
+/*******************************************************************//**
+Set the given consistent cursor view to a transaction which is created
+if the corresponding MySQL thread still lacks one. If the given
+consistent cursor view is NULL global read view of a transaction is
+restored to a transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+	handlerton* hton,
+	THD*	thd,	/*!< in: user thread handle */
+	void*	curview);/*!< in: Consistent cursor view to be set */
+/*******************************************************************//**
+Close the given consistent cursor view of a transaction and restore
+global read view to a transaction read view. Transaction is created if the
+corresponding MySQL thread still lacks one. */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+	handlerton* hton,
+	THD*	thd,	/*!< in: user thread handle */
+	void*	curview);/*!< in: Consistent read view to be closed */
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+	handlerton* hton, /*!< in: handlerton of Innodb */
+	char*	path);	/*!< in: database path; inside InnoDB the name
+			of the last directory in the path is used as
+			the database name: for example, in 'mysql/data/test'
+			the database name is 'test' */
+/*******************************************************************//**
+Closes an InnoDB database. */
+static
+int
+innobase_end(handlerton *hton, ha_panic_function type);
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one,
+starts a new InnoDB transaction if a transaction is not yet started, and
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one.
+@return	0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+			/* out: 0 */
+	handlerton* hton, /*!< in: InnoDB handlerton */
+	THD*	thd);	/*!< in: MySQL thread handle of the user for whom
+			the transaction should be started */
+/****************************************************************//**
+Flushes InnoDB logs to disk and makes a checkpoint. Really, a commit flushes
+the logs, and the name of this function should be innobase_checkpoint.
+@return	TRUE if error */
+static
+bool
+innobase_flush_logs(
+/*================*/
+	handlerton*	hton);	/*!< in: InnoDB handlerton */
+
+/************************************************************************//**
+Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
+Monitor to the client. */
+static
+bool
+innodb_show_status(
+/*===============*/
+	handlerton*	hton,	/*!< in: the innodb handlerton */
+	THD*	thd,	/*!< in: the MySQL query thread of the caller */
+	stat_print_fn *stat_print);
+static
+bool innobase_show_status(handlerton *hton, THD* thd, 
+                          stat_print_fn* stat_print,
+                          enum ha_stat_type stat_type);
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. */
+static
+void
+innobase_commit_low(
+/*================*/
+	trx_t*	trx);	/*!< in: transaction handle */
+
+static SHOW_VAR innodb_status_variables[]= {
+  {"buffer_pool_pages_data",
+  (char*) &export_vars.innodb_buffer_pool_pages_data,	  SHOW_LONG},
+  {"buffer_pool_pages_dirty",
+  (char*) &export_vars.innodb_buffer_pool_pages_dirty,	  SHOW_LONG},
+  {"buffer_pool_pages_flushed",
+  (char*) &export_vars.innodb_buffer_pool_pages_flushed,  SHOW_LONG},
+  {"buffer_pool_pages_free",
+  (char*) &export_vars.innodb_buffer_pool_pages_free,	  SHOW_LONG},
+#ifdef UNIV_DEBUG
+  {"buffer_pool_pages_latched",
+  (char*) &export_vars.innodb_buffer_pool_pages_latched,  SHOW_LONG},
+#endif /* UNIV_DEBUG */
+  {"buffer_pool_pages_misc",
+  (char*) &export_vars.innodb_buffer_pool_pages_misc,	  SHOW_LONG},
+  {"buffer_pool_pages_total",
+  (char*) &export_vars.innodb_buffer_pool_pages_total,	  SHOW_LONG},
+  {"buffer_pool_read_ahead",
+  (char*) &export_vars.innodb_buffer_pool_read_ahead,	  SHOW_LONG},
+  {"buffer_pool_read_ahead_evicted",
+  (char*) &export_vars.innodb_buffer_pool_read_ahead_evicted, SHOW_LONG},
+  {"buffer_pool_read_requests",
+  (char*) &export_vars.innodb_buffer_pool_read_requests,  SHOW_LONG},
+  {"buffer_pool_reads",
+  (char*) &export_vars.innodb_buffer_pool_reads,	  SHOW_LONG},
+  {"buffer_pool_wait_free",
+  (char*) &export_vars.innodb_buffer_pool_wait_free,	  SHOW_LONG},
+  {"buffer_pool_write_requests",
+  (char*) &export_vars.innodb_buffer_pool_write_requests, SHOW_LONG},
+  {"data_fsyncs",
+  (char*) &export_vars.innodb_data_fsyncs,		  SHOW_LONG},
+  {"data_pending_fsyncs",
+  (char*) &export_vars.innodb_data_pending_fsyncs,	  SHOW_LONG},
+  {"data_pending_reads",
+  (char*) &export_vars.innodb_data_pending_reads,	  SHOW_LONG},
+  {"data_pending_writes",
+  (char*) &export_vars.innodb_data_pending_writes,	  SHOW_LONG},
+  {"data_read",
+  (char*) &export_vars.innodb_data_read,		  SHOW_LONG},
+  {"data_reads",
+  (char*) &export_vars.innodb_data_reads,		  SHOW_LONG},
+  {"data_writes",
+  (char*) &export_vars.innodb_data_writes,		  SHOW_LONG},
+  {"data_written",
+  (char*) &export_vars.innodb_data_written,		  SHOW_LONG},
+  {"dblwr_pages_written",
+  (char*) &export_vars.innodb_dblwr_pages_written,	  SHOW_LONG},
+  {"deadlocks",
+  (char*) &export_vars.innodb_deadlocks,                  SHOW_LONG}, 
+  {"dblwr_writes",
+  (char*) &export_vars.innodb_dblwr_writes,		  SHOW_LONG},
+  {"dict_tables",
+  (char*) &export_vars.innodb_dict_tables,		  SHOW_LONG},
+  {"have_atomic_builtins",
+  (char*) &export_vars.innodb_have_atomic_builtins,	  SHOW_BOOL},
+  {"log_waits",
+  (char*) &export_vars.innodb_log_waits,		  SHOW_LONG},
+  {"log_write_requests",
+  (char*) &export_vars.innodb_log_write_requests,	  SHOW_LONG},
+  {"log_writes",
+  (char*) &export_vars.innodb_log_writes,		  SHOW_LONG},
+  {"os_log_fsyncs",
+  (char*) &export_vars.innodb_os_log_fsyncs,		  SHOW_LONG},
+  {"os_log_pending_fsyncs",
+  (char*) &export_vars.innodb_os_log_pending_fsyncs,	  SHOW_LONG},
+  {"os_log_pending_writes",
+  (char*) &export_vars.innodb_os_log_pending_writes,	  SHOW_LONG},
+  {"os_log_written",
+  (char*) &export_vars.innodb_os_log_written,		  SHOW_LONG},
+  {"page_size",
+  (char*) &export_vars.innodb_page_size,		  SHOW_LONG},
+  {"pages_created",
+  (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
+  {"pages_read",
+  (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
+  {"pages_written",
+  (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
+  {"row_lock_current_waits",
+  (char*) &export_vars.innodb_row_lock_current_waits,	  SHOW_LONG},
+  {"row_lock_time",
+  (char*) &export_vars.innodb_row_lock_time,		  SHOW_LONGLONG},
+  {"row_lock_time_avg",
+  (char*) &export_vars.innodb_row_lock_time_avg,	  SHOW_LONG},
+  {"row_lock_time_max",
+  (char*) &export_vars.innodb_row_lock_time_max,	  SHOW_LONG},
+  {"row_lock_waits",
+  (char*) &export_vars.innodb_row_lock_waits,		  SHOW_LONG},
+  {"rows_deleted",
+  (char*) &export_vars.innodb_rows_deleted,		  SHOW_LONG},
+  {"rows_inserted",
+  (char*) &export_vars.innodb_rows_inserted,		  SHOW_LONG},
+  {"rows_read",
+  (char*) &export_vars.innodb_rows_read,		  SHOW_LONG},
+  {"rows_updated",
+  (char*) &export_vars.innodb_rows_updated,		  SHOW_LONG},
+  {NullS, NullS, SHOW_LONG}
+};
+
+/* General functions */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return	true if thd is the replication thread */
+extern "C" UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+	void*	thd)	/*!< in: thread handle (THD*) */
+{
+	return((ibool) thd_slave_thread((THD*) thd));
+}
+
+/******************************************************************//**
+Save some CPU by testing the value of srv_thread_concurrency in inline
+functions. */
+static inline
+void
+innodb_srv_conc_enter_innodb(
+/*=========================*/
+	trx_t*	trx)	/*!< in: transaction handle */
+{
+	if (UNIV_LIKELY(!srv_thread_concurrency)) {
+
+		return;
+	}
+
+	srv_conc_enter_innodb(trx);
+}
+
+/******************************************************************//**
+Save some CPU by testing trx->declared_to_be_inside_innodb inline and only
+calling srv_conc_exit_innodb() when that flag is set. */
+static inline
+void
+innodb_srv_conc_exit_innodb(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction handle */
+{
+	if (UNIV_LIKELY(!trx->declared_to_be_inside_innodb)) {
+
+		return;
+	}
+
+	srv_conc_exit_innodb(trx);
+}
+
+/******************************************************************//**
+Releases possible search latch and InnoDB thread FIFO ticket. These should
+be released at each SQL statement end, and also when mysqld passes the
+control to the client. It does no harm to release these also in the middle
+of an SQL statement. */
+static inline
+void
+innobase_release_stat_resources(
+/*============================*/
+	trx_t*	trx)	/*!< in: transaction object */
+{
+	if (trx->has_search_latch) {
+		trx_search_latch_release_if_reserved(trx);
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+		/* Release our possible ticket in the FIFO */
+
+		srv_conc_force_exit_innodb(trx);
+	}
+}
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return	true if non-transactional tables have been edited */
+extern "C" UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+	void*	thd)	/*!< in: thread handle (THD*) */
+{
+	return((ibool) thd_non_transactional_update((THD*) thd));
+}
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return	true if thd is executing SELECT */
+extern "C" UNIV_INTERN
+ibool
+thd_is_select(
+/*==========*/
+	const void*	thd)	/*!< in: thread handle (THD*) */
+{
+	return(thd_sql_command((const THD*) thd) == SQLCOM_SELECT);
+}
+
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return	true if thd has XA support */
+extern "C" UNIV_INTERN
+ibool
+thd_supports_xa(
+/*============*/
+	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
+			the global innodb_supports_xa */
+{
+	return(THDVAR((THD*) thd, support_xa));
+}
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return	the lock wait timeout, in seconds */
+extern "C" UNIV_INTERN
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+	void*	thd)	/*!< in: thread handle (THD*), or NULL to query
+			the global innodb_lock_wait_timeout */
+{
+	/* According to <mysql/plugin.h>, passing thd == NULL
+	returns the global value of the session variable. */
+	return(THDVAR((THD*) thd, lock_wait_timeout));
+}
+
+/******************************************************************//**
+Returns the flush_log_at_trx_commit_session THDVAR value read for thd. */
+extern "C" UNIV_INTERN
+ulong
+thd_flush_log_at_trx_commit_session(
+/*================================*/
+	void*	thd)	/*!< in: thread handle (THD*) */
+{
+	return(THDVAR((THD*) thd, flush_log_at_trx_commit_session));
+}
+
+/********************************************************************//**
+Obtain the InnoDB transaction of a MySQL thread.
+@return	reference to transaction pointer */
+static inline
+trx_t*&
+thd_to_trx(
+/*=======*/
+	THD*	thd)	/*!< in: MySQL thread */
+{
+	return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr));
+}
+
+/********************************************************************//**
+Call this function when mysqld passes control to the client. That is to
+avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
+documentation, see handler.cc.
+@return	0 */
+static
+int
+innobase_release_temporary_latches(
+/*===============================*/
+	handlerton*	hton,	/*!< in: handlerton */
+	THD*		thd)	/*!< in: MySQL thread */
+{
+	trx_t*	trx;
+
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	if (!innodb_inited) {
+
+		return(0);
+	}
+
+	trx = thd_to_trx(thd);
+
+	if (trx) {
+		innobase_release_stat_resources(trx);
+	}
+	return(0);
+}
+
+/********************************************************************//**
+Increments innobase_active_counter and every INNOBASE_WAKE_INTERVALth
+time calls srv_active_wake_master_thread. This function should be used
+when a single database operation may introduce a small need for
+server utility activity, like checkpointing. */
+static inline
+void
+innobase_active_small(void)
+/*=======================*/
+{
+	innobase_active_counter++;
+
+	if ((innobase_active_counter % INNOBASE_WAKE_INTERVAL) == 0) {
+		srv_active_wake_master_thread();
+	}
+}
+
+/********************************************************************//**
+Converts an InnoDB error code to a MySQL error code and also tells to MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return	MySQL error code */
+extern "C" UNIV_INTERN
+int
+convert_error_code_to_mysql(
+/*========================*/
+	int	error,	/*!< in: InnoDB error code */
+	ulint	flags,	/*!< in: InnoDB table flags, or 0 */
+	THD*	thd)	/*!< in: user thread handle or NULL */
+{
+	switch (error) {
+	case DB_SUCCESS:
+		return(0);
+
+	case DB_INTERRUPTED:
+		my_error(ER_QUERY_INTERRUPTED, MYF(0));
+		return(-1); /* must not fall into the FK warning below */
+
+	case DB_FOREIGN_EXCEED_MAX_CASCADE:
+		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				    HA_ERR_ROW_IS_REFERENCED,
+				    "InnoDB: Cannot delete/update "
+				    "rows with cascading foreign key "
+				    "constraints that exceed max "
+				    "depth of %d. Please "
+				    "drop extra constraints and try "
+				    "again", DICT_FK_MAX_RECURSIVE_LOAD);
+
+		/* fall through */
+
+	case DB_ERROR:
+	default:
+		return(-1); /* unspecified error */
+
+	case DB_DUPLICATE_KEY:
+		/* Be cautious with returning this error, since
+		mysql could re-enter the storage layer to get
+		duplicated key info, the operation requires a
+		valid table handle and/or transaction information,
+		which might not always be available in the error
+		handling stage. */
+		return(HA_ERR_FOUND_DUPP_KEY);
+
+	case DB_FOREIGN_DUPLICATE_KEY:
+		return(HA_ERR_FOREIGN_DUPLICATE_KEY);
+
+	case DB_MISSING_HISTORY:
+		return(HA_ERR_TABLE_DEF_CHANGED);
+
+	case DB_RECORD_NOT_FOUND:
+		return(HA_ERR_NO_ACTIVE_RECORD);
+
+	case DB_DEADLOCK:
+		/* Since we rolled back the whole transaction, we must
+		tell it also to MySQL so that MySQL knows to empty the
+		cached binlog for this transaction */
+
+		if (thd) {
+			thd_mark_transaction_to_rollback(thd, TRUE);
+		}
+
+		return(HA_ERR_LOCK_DEADLOCK);
+
+	case DB_LOCK_WAIT_TIMEOUT:
+		/* Starting from 5.0.13, we let MySQL just roll back the
+		latest SQL statement in a lock wait timeout. Previously, we
+		rolled back the whole transaction. */
+
+		if (thd) {
+			thd_mark_transaction_to_rollback(
+				thd, (bool)row_rollback_on_timeout);
+		}
+
+		return(HA_ERR_LOCK_WAIT_TIMEOUT);
+
+	case DB_NO_REFERENCED_ROW:
+		return(HA_ERR_NO_REFERENCED_ROW);
+
+	case DB_ROW_IS_REFERENCED:
+		return(HA_ERR_ROW_IS_REFERENCED);
+
+	case DB_CANNOT_ADD_CONSTRAINT:
+		return(HA_ERR_CANNOT_ADD_FOREIGN);
+
+	case DB_CANNOT_DROP_CONSTRAINT:
+
+		return(HA_ERR_ROW_IS_REFERENCED); /* TODO: This is a bit
+						misleading, a new MySQL error
+						code should be introduced */
+
+	case DB_COL_APPEARS_TWICE_IN_INDEX:
+	case DB_CORRUPTION:
+		return(HA_ERR_CRASHED);
+
+	case DB_OUT_OF_FILE_SPACE:
+		return(HA_ERR_RECORD_FILE_FULL);
+
+	case DB_TABLE_IS_BEING_USED:
+		return(HA_ERR_WRONG_COMMAND);
+
+	case DB_TABLE_NOT_FOUND:
+		return(HA_ERR_NO_SUCH_TABLE);
+
+	case DB_TOO_BIG_RECORD:
+		my_error(ER_TOO_BIG_ROWSIZE, MYF(0),
+			 page_get_free_space_of_empty(flags
+						      & DICT_TF_COMPACT) / 2);
+		return(HA_ERR_TO_BIG_ROW);
+
+	case DB_NO_SAVEPOINT:
+		return(HA_ERR_NO_SAVEPOINT);
+
+	case DB_LOCK_TABLE_FULL:
+		/* Since we rolled back the whole transaction, we must
+		tell it also to MySQL so that MySQL knows to empty the
+		cached binlog for this transaction */
+
+		if (thd) {
+			thd_mark_transaction_to_rollback(thd, TRUE);
+		}
+
+		return(HA_ERR_LOCK_TABLE_FULL);
+
+	case DB_PRIMARY_KEY_IS_NULL:
+		return(ER_PRIMARY_CANT_HAVE_NULL);
+
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* New error code HA_ERR_TOO_MANY_CONCURRENT_TRXS is only
+		available in 5.1.38 and later, but the plugin should still
+		work with previous versions of MySQL. */
+#ifdef HA_ERR_TOO_MANY_CONCURRENT_TRXS
+		return(HA_ERR_TOO_MANY_CONCURRENT_TRXS);
+#else /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
+		return(HA_ERR_RECORD_FILE_FULL);
+#endif /* HA_ERR_TOO_MANY_CONCURRENT_TRXS */
+	case DB_UNSUPPORTED:
+		return(HA_ERR_UNSUPPORTED);
+	}
+}
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+extern "C" UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+	FILE*	f,		/*!< in: output stream */
+	void*	thd,		/*!< in: pointer to a MySQL THD object */
+	uint	max_query_len)	/*!< in: max query length to print, or 0 to
+				   use the default max length */
+{
+	char	buffer[1024];
+
+	fputs(thd_security_context((THD*) thd, buffer, sizeof buffer,
+				   max_query_len), f);
+	putc('\n', f);
+}
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+extern "C" UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+	ulint	cset,		/*!< in: MySQL charset-collation code */
+	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
+	ulint*	mbmaxlen)	/*!< out: maximum length of a char (in bytes) */
+{
+	CHARSET_INFO*	cs;
+	ut_ad(cset < 256);
+	ut_ad(mbminlen);
+	ut_ad(mbmaxlen);
+
+	cs = all_charsets[cset];
+	if (cs) {
+		*mbminlen = cs->mbminlen;
+		*mbmaxlen = cs->mbmaxlen;
+	} else {
+		THD*	thd = current_thd;
+
+		if (thd && thd_sql_command(thd) == SQLCOM_DROP_TABLE) {
+
+			/* Fix bug#46256: allow tables to be dropped if the
+			collation is not found, but issue a warning. */
+			if ((global_system_variables.log_warnings)
+			    && (cset != 0)){
+
+				sql_print_warning(
+					"Unknown collation #%lu.", cset);
+			}
+		} else {
+
+			ut_a(cset == 0);
+		}
+
+		*mbminlen = *mbmaxlen = 0;
+	}
+}
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+extern "C" UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len)	/*!< in: length of 'to', in bytes */
+{
+	uint	errors;
+
+	strconvert(cs, from, &my_charset_filename, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+extern "C" UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len)	/*!< in: length of 'to', in bytes */
+{
+	uint	errors;
+
+	strconvert(cs, from, system_charset_info, to, (uint) len, &errors);
+}
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return	0 if a=b, <0 if a<b, >0 if a>b */
+extern "C" UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+	const char*	a,	/*!< in: first string to compare */
+	const char*	b)	/*!< in: second string to compare */
+{
+	return(my_strcasecmp(system_charset_info, a, b));
+}
+
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+extern "C" UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+	char*	a)	/*!< in/out: string to put in lower case */
+{
+	my_casedn_str(system_charset_info, a);
+}
+
+/**********************************************************************//**
+Determines the connection character set.
+@return	connection character set */
+extern "C" UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+	void*	mysql_thd)	/*!< in: MySQL thread handle */
+{
+	return(thd_charset((THD*) mysql_thd));
+}
+
+/**********************************************************************//**
+Determines the current SQL statement.
+@return	SQL statement string */
+extern "C" UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+	void*	mysql_thd,	/*!< in: MySQL thread handle */
+	size_t*	length)		/*!< out: length of the SQL statement */
+{
+#if MYSQL_VERSION_ID >= 50142
+	LEX_STRING* stmt;
+
+	stmt = thd_query_string((THD*) mysql_thd);
+	*length = stmt->length;
+	return(stmt->str);
+#else
+	const char*	stmt_str = thd_query((THD*) mysql_thd);
+	*length = strlen(stmt_str);
+	return(stmt_str);
+#endif
+}
+
+#if defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN)
+extern MYSQL_PLUGIN_IMPORT MY_TMPDIR mysql_tmpdir_list;
+/*******************************************************************//**
+Map an OS error to an errno value. The OS error number is stored in
+_doserrno and the mapped value is stored in errno. */
+extern "C"
+void __cdecl
+_dosmaperr(
+	unsigned long);	/*!< in: OS error value */
+
+/*********************************************************************//**
+Creates a temporary file.
+@return	temporary file descriptor, or < 0 on error */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_tmpfile(void)
+/*========================*/
+{
+	int	fd;				/* handle of opened file */
+	HANDLE	osfh;				/* OS handle of opened file */
+	char*	tmpdir;				/* point to the directory
+						where to create file */
+	TCHAR	path_buf[MAX_PATH - 14];	/* buffer for tmp file path.
+						The length cannot be longer
+						than MAX_PATH - 14, or
+						GetTempFileName will fail. */
+	char	filename[MAX_PATH];		/* name of the tmpfile */
+	DWORD	fileaccess = GENERIC_READ	/* OS file access */
+			     | GENERIC_WRITE
+			     | DELETE;
+	DWORD	fileshare = FILE_SHARE_READ	/* OS file sharing mode */
+			    | FILE_SHARE_WRITE
+			    | FILE_SHARE_DELETE;
+	DWORD	filecreate = CREATE_ALWAYS;	/* OS method of open/create */
+	DWORD	fileattrib =			/* OS file attribute flags */
+			     FILE_ATTRIBUTE_NORMAL
+			     | FILE_FLAG_DELETE_ON_CLOSE
+			     | FILE_ATTRIBUTE_TEMPORARY
+			     | FILE_FLAG_SEQUENTIAL_SCAN;
+
+	DBUG_ENTER("innobase_mysql_tmpfile");
+
+	tmpdir = my_tmpdir(&mysql_tmpdir_list);
+
+	/* The tmpdir parameter can not be NULL for GetTempFileName. */
+	if (!tmpdir) {
+		uint	ret;
+
+		/* Use GetTempPath to determine path for temporary files. */
+		ret = GetTempPath(sizeof(path_buf), path_buf);
+		if (ret > sizeof(path_buf) || (ret == 0)) {
+
+			_dosmaperr(GetLastError());	/* map error */
+			DBUG_RETURN(-1);
+		}
+
+		tmpdir = path_buf;
+	}
+
+	/* Use GetTempFileName to generate a unique filename. */
+	if (!GetTempFileName(tmpdir, "ib", 0, filename)) {
+
+		_dosmaperr(GetLastError());	/* map error */
+		DBUG_RETURN(-1);
+	}
+
+	DBUG_PRINT("info", ("filename: %s", filename));
+
+	/* Open/Create the file. */
+	osfh = CreateFile(filename, fileaccess, fileshare, NULL,
+			  filecreate, fileattrib, NULL);
+	if (osfh == INVALID_HANDLE_VALUE) {
+
+		/* open/create file failed! */
+		_dosmaperr(GetLastError());	/* map error */
+		DBUG_RETURN(-1);
+	}
+
+	do {
+		/* Associates a CRT file descriptor with the OS file handle. */
+		fd = _open_osfhandle((intptr_t) osfh, 0);
+	} while (fd == -1 && errno == EINTR);
+
+	if (fd == -1) {
+		/* Open failed, close the file handle. */
+
+		_dosmaperr(GetLastError());	/* map error */
+		CloseHandle(osfh);		/* no need to check if
+						CloseHandle fails */
+	}
+
+	DBUG_RETURN(fd);
+}
+#else
+/*********************************************************************//**
+Creates a temporary file.
+@return	temporary file descriptor, or < 0 on error */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_tmpfile(void)
+/*========================*/
+{
+	int	fd2 = -1;
+	File	fd = mysql_tmpfile("ib");
+	if (fd >= 0) {
+		/* Copy the file descriptor, so that the additional resources
+		allocated by create_temp_file() can be freed by invoking
+		my_close().
+
+		Because the file descriptor returned by this function
+		will be passed to fdopen(), it will be closed by invoking
+		fclose(), which in turn will invoke close() instead of
+		my_close(). */
+		fd2 = dup(fd);
+		if (fd2 < 0) {
+			my_errno=errno;	/* save before any other call */
+			DBUG_PRINT("error",("Got error %d on dup",my_errno));
+			my_error(EE_OUT_OF_FILERESOURCES,
+				 MYF(ME_BELL+ME_WAITTANG),
+				 "ib*", my_errno);
+		}
+		my_close(fd, MYF(MY_WME));
+	}
+	return(fd2);
+}
+#endif /* defined (__WIN__) && defined (MYSQL_DYNAMIC_PLUGIN) */
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return	number of bytes copied to 'to' */
+extern "C" UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+	void*		to,		/*!< out: converted string */
+	ulint		to_length,	/*!< in: number of bytes reserved
+					for the converted string */
+	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
+	const void*	from,		/*!< in: string to convert */
+	ulint		from_length,	/*!< in: number of bytes to convert */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	uint*		errors)		/*!< out: number of errors encountered
+					during the conversion */
+{
+  return(copy_and_convert((char*)to, (uint32) to_length, to_cs,
+                          (const char*)from, (uint32) from_length, from_cs,
+                          errors));
+}
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return	number of bytes that were written */
+extern "C" UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		charset_coll,	/*!< in: charset collation */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size)	/*!< in: output buffer size
+					in bytes */
+{
+	/* XXX we use a hard limit instead of allocating
+	buf_size bytes from the heap */
+	CHARSET_INFO*	data_cs;
+	char		buf_tmp[8192];
+	ulint		buf_tmp_used;
+	uint		num_errors;	/* filled in below, never checked */
+
+	data_cs = all_charsets[charset_coll];
+
+	buf_tmp_used = innobase_convert_string(buf_tmp, sizeof(buf_tmp),
+					       system_charset_info,
+					       data, data_len, data_cs,
+					       &num_errors);
+
+	return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size));
+}
+
+/*********************************************************************//**
+Compute the next autoinc value.
+
+For MySQL replication the autoincrement values can be partitioned among
+the nodes. The offset is the start or origin of the autoincrement value
+for a particular node. For n nodes the increment will be n and the offset
+will be in the interval [1, n]. The formula tries to allocate the next
+value for a particular node.
+
+Note: This function is also called with increment set to the number of
+values we want to reserve for multi-value inserts e.g.,
+
+	INSERT INTO T VALUES(), (), ();
+
+innobase_next_autoinc() will be called with increment set to
+n * 3 where autoinc_lock_mode != TRADITIONAL because we want
+to reserve 3 values for the multi-value INSERT above.
+@return	the next value */
+static
+ulonglong
+innobase_next_autoinc(
+/*==================*/
+	ulonglong	current,	/*!< in: Current value */
+	ulonglong	increment,	/*!< in: increment current by */
+	ulonglong	offset,		/*!< in: AUTOINC offset */
+	ulonglong	max_value)	/*!< in: max value for type */
+{
+	ulonglong	next_value;
+
+	/* Should never be 0. */
+	ut_a(increment > 0);
+
+	/* According to MySQL documentation, if the offset is greater than
+	the increment then the offset is ignored. */
+	if (offset > increment) {
+		offset = 0;
+	}
+
+	if (max_value <= current) {
+		next_value = max_value;
+	} else if (offset <= 1) {
+		/* Offset 0 and 1 are the same, because there must be at
+		least one node in the system. */
+		if (max_value - current <= increment) {
+			next_value = max_value;
+		} else {
+			next_value = current + increment;
+		}
+	} else if (max_value > current) {
+		if (current > offset) {
+			next_value = ((current - offset) / increment) + 1;
+		} else {
+			next_value = ((offset - current) / increment) + 1;
+		}
+
+		ut_a(increment > 0);
+		ut_a(next_value > 0);
+
+		/* Check for multiplication overflow. */
+		if (increment > (max_value / next_value)) {
+
+			next_value = max_value;
+		} else {
+			next_value *= increment;
+
+			ut_a(max_value >= next_value);
+
+			/* Check for overflow. */
+			if (max_value - next_value <= offset) {
+				next_value = max_value;
+			} else {
+				next_value += offset;
+			}
+		}
+	} else {
+		next_value = max_value;
+	}
+
+	ut_a(next_value <= max_value);
+
+	return(next_value);
+}
+
+/*********************************************************************//**
+Initializes some fields in an InnoDB transaction object. */
+static
+void
+innobase_trx_init(
+/*==============*/
+	THD*	thd,	/*!< in: user thread handle */
+	trx_t*	trx)	/*!< in/out: InnoDB transaction handle */
+{
+	DBUG_ENTER("innobase_trx_init");
+	DBUG_ASSERT(EQ_CURRENT_THD(thd));
+	DBUG_ASSERT(thd == trx->mysql_thd);
+
+	trx->check_foreigns = !thd_test_options(
+		thd, OPTION_NO_FOREIGN_KEY_CHECKS);
+
+	trx->check_unique_secondary = !thd_test_options(
+		thd, OPTION_RELAXED_UNIQUE_CHECKS);
+
+#ifdef EXTENDED_SLOWLOG
+	if (thd_log_slow_verbosity(thd) & SLOG_V_INNODB) {
+		trx->take_stats = TRUE;
+	} else {
+		trx->take_stats = FALSE;
+	}
+#else
+	trx->take_stats = FALSE;
+#endif
+
+	DBUG_VOID_RETURN;
+}
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return	InnoDB transaction handle */
+extern "C" UNIV_INTERN
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	trx_t*	trx;
+
+	DBUG_ENTER("innobase_trx_allocate");
+	DBUG_ASSERT(thd != NULL);
+	DBUG_ASSERT(EQ_CURRENT_THD(thd));
+
+	trx = trx_allocate_for_mysql();
+
+	trx->mysql_thd = thd;
+
+	innobase_trx_init(thd, trx);
+
+	DBUG_RETURN(trx);
+}
+
+/*********************************************************************//**
+Gets the InnoDB transaction handle for a MySQL handler object, creates
+an InnoDB transaction struct if the corresponding MySQL thread struct still
+lacks one.
+@return	InnoDB transaction handle */
+static
+trx_t*
+check_trx_exists(
+/*=============*/
+	THD*	thd)	/*!< in: user thread handle */
+{
+	trx_t*&	trx = thd_to_trx(thd);
+
+	ut_ad(EQ_CURRENT_THD(thd));
+
+	if (trx == NULL) {
+		trx = innobase_trx_allocate(thd);
+	} else if (UNIV_UNLIKELY(trx->magic_n != TRX_MAGIC_N)) {
+		mem_analyze_corruption(trx);
+		ut_error;
+	}
+
+	innobase_trx_init(thd, trx);
+
+	return(trx);
+}
+
+
+/*********************************************************************//**
+Gets the trx of current_thd. @return trx_t* (may be NULL); NULL if no thd */
+extern "C"
+trx_t*
+innobase_get_trx()
+{
+	THD *thd=current_thd;
+	if (likely(thd != 0)) {
+		trx_t*& trx = thd_to_trx(thd);
+		return(trx);
+	} else {
+		return(NULL);
+	}
+}
+
+extern "C"
+ibool
+innobase_get_slow_log()
+{
+#ifdef EXTENDED_SLOWLOG
+	return((ibool) thd_opt_slow_log());
+#else
+	return(FALSE);
+#endif
+}
+
+/*********************************************************************//**
+Construct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::ha_innobase(handlerton *hton, TABLE_SHARE *table_arg)
+  :handler(hton, table_arg),
+  int_table_flags(HA_REC_NOT_IN_SEQ |
+		  HA_NULL_IN_KEY |
+		  HA_CAN_INDEX_BLOBS |
+		  HA_CAN_SQL_HANDLER |
+		  HA_PRIMARY_KEY_REQUIRED_FOR_POSITION |
+		  HA_PRIMARY_KEY_IN_READ_INDEX |
+		  HA_BINLOG_ROW_CAPABLE |
+		  HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ |
+		  HA_TABLE_SCAN_ON_INDEX),
+  start_of_scan(0),
+  num_write_row(0)
+{}
+
+/*********************************************************************//**
+Destruct ha_innobase handler. */
+UNIV_INTERN
+ha_innobase::~ha_innobase()
+{
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN inline
+void
+ha_innobase::update_thd(
+/*====================*/
+	THD*	thd)	/*!< in: thd to use the handle */
+{
+	trx_t*		trx;
+
+	trx = check_trx_exists(thd);
+
+	if (prebuilt->trx != trx) {
+
+		row_update_prebuilt_trx(prebuilt, trx);
+	}
+
+	user_thd = thd;
+}
+
+/*********************************************************************//**
+Updates the user_thd field in a handle and also allocates a new InnoDB
+transaction handle if needed, and updates the transaction fields in the
+prebuilt struct. */
+UNIV_INTERN
+void
+ha_innobase::update_thd()
+/*=====================*/
+{
+	THD*	thd = ha_thd();
+	ut_ad(EQ_CURRENT_THD(thd));
+	update_thd(thd);
+}
+
+/*********************************************************************//**
+Registers that InnoDB takes part in an SQL statement, so that MySQL knows to
+roll back the statement if the statement results in an error. This MUST be
+called for every SQL statement that may be rolled back by MySQL. Calling this
+several times to register the same statement is allowed, too. */
+static inline
+void
+innobase_register_stmt(
+/*===================*/
+        handlerton*	hton,	/*!< in: Innobase hton */
+	THD*	thd)	/*!< in: MySQL thd (connection) object */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+	/* Register the statement */
+	trans_register_ha(thd, FALSE, hton);
+}
+
+/*********************************************************************//**
+Registers an InnoDB transaction in MySQL, so that the MySQL XA code knows
+to call the InnoDB prepare and commit, or rollback for the transaction. This
+MUST be called for every transaction for which the user may call commit or
+rollback. Calling this several times to register the same transaction is
+allowed, too.
+This function also registers the current SQL statement. */
+static inline
+void
+innobase_register_trx_and_stmt(
+/*===========================*/
+        handlerton *hton, /*!< in: Innobase handlerton */
+	THD*	thd)	/*!< in: MySQL thd (connection) object */
+{
+	/* NOTE that actually innobase_register_stmt() registers also
+	the transaction in the AUTOCOMMIT=1 mode. */
+
+	innobase_register_stmt(hton, thd);
+
+	if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+		/* No autocommit mode, register for a transaction */
+		trans_register_ha(thd, TRUE, hton);
+	}
+}
+
+/*   BACKGROUND INFO: HOW THE MYSQL QUERY CACHE WORKS WITH INNODB
+     ------------------------------------------------------------
+
+1) The use of the query cache for TBL is disabled when there is an
+uncommitted change to TBL.
+
+2) When a change to TBL commits, InnoDB stores the current value of
+its global trx id counter, let us denote it by INV_TRX_ID, to the table object
+in the InnoDB data dictionary, and does only allow such transactions whose
+id <= INV_TRX_ID to use the query cache.
+
+3) When InnoDB does an INSERT/DELETE/UPDATE to a table TBL, or an implicit
+modification because of an ON DELETE CASCADE, we invalidate the MySQL query
+cache of TBL immediately.
+
+How this is implemented inside InnoDB:
+
+1) Since every modification always sets an IX type table lock on the InnoDB
+table, it is easy to check if there can be uncommitted modifications for a
+table: just check if there are locks in the lock list of the table.
+
+2) When a transaction inside InnoDB commits, it reads the global trx id
+counter and stores the value INV_TRX_ID to the tables on which it had a lock.
+
+3) If there is an implicit table change from ON DELETE CASCADE or SET NULL,
+InnoDB calls an invalidate method for the MySQL query cache for that table.
+
+How this is implemented inside sql_cache.cc:
+
+1) The query cache for an InnoDB table TBL is invalidated immediately at an
+INSERT/UPDATE/DELETE, just like in the case of MyISAM. No need to delay
+invalidation to the transaction commit.
+
+2) To store or retrieve a value from the query cache of an InnoDB table TBL,
+any query must first ask InnoDB's permission. We must pass the thd as a
+parameter because InnoDB will look at the trx id, if any, associated with
+that thd.
+
+3) Use of the query cache for InnoDB tables is now allowed also when
+AUTOCOMMIT==0 or we are inside BEGIN ... COMMIT. Thus transactions no longer
+put restrictions on the use of the query cache.
+*/
+
+/******************************************************************//**
+Callback used by the MySQL query cache to ask InnoDB whether the cache may
+currently operate on an InnoDB table. The SQL query must be a non-locking
+SELECT.
+
+The query cache may be used for a query only if this function returns TRUE
+for every table in the query.
+
+If thd is not in the autocommit state, this function also starts a new
+transaction for thd if there is no active trx yet, and assigns a consistent
+read view to it if there is no read view yet.
+
+Why a deadlock of threads is not possible: the query cache calls this function
+at the start of a SELECT processing. Then the calling thread cannot be
+holding any InnoDB semaphores. The calling thread is holding the
+query cache mutex, and this function will reserve the InnoDB kernel mutex.
+Thus, the 'rank' in sync0sync.h of the MySQL query cache mutex is above
+the InnoDB kernel mutex.
+@return TRUE if permitted, FALSE if not; note that the value FALSE
+does not mean we should invalidate the query cache: invalidation is
+called explicitly */
+static
+my_bool
+innobase_query_caching_of_table_permitted(
+/*======================================*/
+	THD*	thd,		/*!< in: thd of the user who is trying to
+				store a result to the query cache or
+				retrieve it */
+	char*	full_name,	/*!< in: concatenation of database name,
+				the null character NUL, and the table
+				name */
+	uint	full_name_len,	/*!< in: length of the full name, i.e.
+				len(dbname) + len(tablename) + 1 */
+	ulonglong *unused)	/*!< unused for this engine */
+{
+	char	norm_name[1000];
+	trx_t*	trx;
+	ibool	is_autocommit;
+	size_t	db_name_len;
+
+	/* The normalized name plus its terminating NUL must fit in
+	norm_name */
+	ut_a(full_name_len < 999);
+
+	trx = check_trx_exists(thd);
+
+	/* In the SERIALIZABLE mode we add LOCK IN SHARE MODE to every
+	plain SELECT if AUTOCOMMIT is not on, so the cache is never used. */
+	if (trx->isolation_level == TRX_ISO_SERIALIZABLE) {
+
+		return((my_bool)FALSE);
+	}
+
+	if (trx->has_search_latch) {
+		/* Unexpected here: report it and dump the transaction */
+		sql_print_error("The calling thread is holding the adaptive "
+				"search, latch though calling "
+				"innobase_query_caching_of_table_permitted.");
+
+		mutex_enter(&kernel_mutex);
+		trx_print(stderr, trx, 1024);
+		mutex_exit(&kernel_mutex);
+	}
+
+	innobase_release_stat_resources(trx);
+
+	is_autocommit = thd_test_options(thd,
+					 OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)
+		? FALSE : TRUE;
+
+	if (is_autocommit && trx->n_mysql_tables_in_use == 0) {
+		/* This must be a retrieval from the query cache: for a
+		store operation MySQL would already have locks on tables.
+
+		TODO: if the user has used LOCK TABLES to lock the table,
+		then we open a transaction in the call of row_.. below.
+		That trx can stay open until UNLOCK TABLES. The same problem
+		exists even if we do not use the query cache. MySQL should be
+		modified so that it ALWAYS calls some cleanup function when
+		the processing of a query ends!
+
+		We can imagine we instantaneously serialize this consistent
+		read trx to the current trx id counter. If trx2 would have
+		changed the tables of a query result stored in the cache, and
+		trx2 would have already committed, making the result obsolete,
+		then trx2 would have already invalidated the cache. Thus we
+		can trust the result in the cache is ok for this query. */
+
+		return((my_bool)TRUE);
+	}
+
+	/* Normalize the table name to InnoDB format: the NUL that separates
+	the database name from the table name is replaced by '/', which is
+	the separator InnoDB uses, and the result is NUL-terminated. */
+
+	memcpy(norm_name, full_name, full_name_len);
+
+	db_name_len = strlen(norm_name);
+	norm_name[db_name_len] = '/';
+	norm_name[full_name_len] = '\0';
+#ifdef __WIN__
+	innobase_casedn_str(norm_name);
+#endif
+	/* The call of row_search_.. will start a new transaction if it is
+	not yet started */
+
+	if (!trx->active_trans) {
+
+		innobase_register_trx_and_stmt(innodb_hton_ptr, thd);
+		trx->active_trans = 1;
+	}
+
+	return((my_bool) (row_search_check_if_query_cache_permitted(
+				  trx, norm_name)
+			  ? TRUE : FALSE));
+}
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. Does nothing when the
+server is built without HAVE_QUERY_CACHE. */
+extern "C" UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction which
+					modifies the table */
+	const char*	full_name,	/*!< in: concatenation of
+					database name, null char NUL,
+					table name, null char NUL;
+					NOTE that in Windows this is
+					always in LOWER CASE! */
+	ulint		full_name_len)	/*!< in: full name length where
+					also the null chars count */
+{
+	/* Note that the sync0sync.h rank of the query cache mutex is just
+	above the InnoDB kernel mutex. The caller of this function must not
+	have latches of a lower rank. */
+
+#ifdef HAVE_QUERY_CACHE
+	THD*	thd = (THD*) trx->mysql_thd;
+
+	/* The last argument TRUE means we are using transactions */
+	mysql_query_cache_invalidate4(thd, full_name,
+				      (uint32) full_name_len, TRUE);
+#endif
+}
+
+/*****************************************************************//**
+Convert an SQL identifier to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+
+At most buflen bytes are written to buf; an identifier that does not fit
+is truncated. The output is NOT NUL-terminated: the caller must use the
+returned pointer to find the end of the converted text.
+@return	pointer to the end of buf, i.e. to the byte after the last
+byte written */
+static
+char*
+innobase_convert_identifier(
+/*========================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	id,	/*!< in: identifier to convert */
+	ulint		idlen,	/*!< in: length of id, in bytes */
+	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	ibool		file_id)/*!< in: TRUE=id is a table or database name;
+				FALSE=id is an UTF-8 string */
+{
+	/* nz holds a NUL-terminated copy of id; nz2 receives the decoded
+	name. Its size depends on which decoding function is available. */
+	char nz[NAME_LEN + 1];
+#if MYSQL_VERSION_ID >= 50141
+	char nz2[NAME_LEN + 1 + EXPLAIN_FILENAME_MAX_EXTRA_LENGTH];
+#else /* MYSQL_VERSION_ID >= 50141 */
+	char nz2[NAME_LEN + 1 + sizeof srv_mysql50_table_name_prefix];
+#endif /* MYSQL_VERSION_ID >= 50141 */
+
+	/* s points to the text that is finally copied to buf: either the
+	original id, or the decoded name in nz2 */
+	const char*	s	= id;
+	/* Quote character to use, or EOF if no quoting is needed */
+	int		q;
+
+	if (file_id) {
+		/* Decode the table name.  The MySQL function expects
+		a NUL-terminated string.  The input and output strings
+		buffers must not be shared. */
+
+		/* Silently truncate names that do not fit in nz */
+		if (UNIV_UNLIKELY(idlen > (sizeof nz) - 1)) {
+			idlen = (sizeof nz) - 1;
+		}
+
+		memcpy(nz, id, idlen);
+		nz[idlen] = 0;
+
+		s = nz2;
+#if MYSQL_VERSION_ID >= 50141
+		/* From here on idlen is the length of the decoded name.
+		The result of explain_filename() is copied out as is,
+		skipping the quoting logic below. */
+		idlen = explain_filename((THD*) thd, nz, nz2, sizeof nz2,
+					 EXPLAIN_PARTITIONS_AS_COMMENT);
+		goto no_quote;
+#else /* MYSQL_VERSION_ID >= 50141 */
+		idlen = filename_to_tablename(nz, nz2, sizeof nz2);
+#endif /* MYSQL_VERSION_ID >= 50141 */
+	}
+
+	/* See if the identifier needs to be quoted. */
+	if (UNIV_UNLIKELY(!thd)) {
+		/* No connection to ask: always quote with double quotes */
+		q = '"';
+	} else {
+		q = get_quote_char_for_identifier((THD*) thd, s, (int) idlen);
+	}
+
+	if (q == EOF) {
+#if MYSQL_VERSION_ID >= 50141
+no_quote:
+#endif /* MYSQL_VERSION_ID >= 50141 */
+		/* Plain copy, truncated to the size of the output buffer */
+		if (UNIV_UNLIKELY(idlen > buflen)) {
+			idlen = buflen;
+		}
+		memcpy(buf, s, idlen);
+		return(buf + idlen);
+	}
+
+	/* Quote the identifier. */
+	if (buflen < 2) {
+		/* No room even for the two quote characters:
+		write nothing */
+		return(buf);
+	}
+
+	*buf++ = q;
+	buflen--;
+
+	/* Copy the identifier, doubling every embedded quote character.
+	Each check below leaves at least one byte of buf unused, so the
+	closing quote written after the loop always fits. */
+	for (; idlen; idlen--) {
+		int	c = *s++;
+		if (UNIV_UNLIKELY(c == q)) {
+			/* Needs two bytes plus the closing quote */
+			if (UNIV_UNLIKELY(buflen < 3)) {
+				break;
+			}
+
+			*buf++ = c;
+			*buf++ = c;
+			buflen -= 2;
+		} else {
+			/* Needs one byte plus the closing quote */
+			if (UNIV_UNLIKELY(buflen < 2)) {
+				break;
+			}
+
+			*buf++ = c;
+			buflen--;
+		}
+	}
+
+	*buf++ = q;
+	return(buf);
+}
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed. A table name of the form "db/table" is printed
+as the converted database name, a '.', and the converted table name.
+A temporary index name (one starting with TEMP_INDEX_PREFIX) is printed
+without the prefix and with "--temporary--" appended if it fits.
+@return	pointer to the end of buf */
+extern "C" UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	id,	/*!< in: identifier to convert */
+	ulint		idlen,	/*!< in: length of id, in bytes */
+	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	ibool		table_id)/*!< in: TRUE=id is a table or database name;
+				FALSE=id is an index name */
+{
+	const char*	bufend	= buf + buflen;
+	const char*	slash	= NULL;
+	char*		end;
+
+	if (table_id) {
+		slash = (const char*) memchr(id, '/', idlen);
+	}
+
+	if (slash) {
+		/* Print the database name and table name separately. */
+		end = innobase_convert_identifier(buf, bufend - buf,
+						  id, slash - id,
+						  thd, TRUE);
+		if (UNIV_LIKELY(end < bufend)) {
+			*end++ = '.';
+			end = innobase_convert_identifier(end, bufend - end,
+							  slash + 1, idlen
+							  - (slash - id) - 1,
+							  thd, TRUE);
+		}
+
+		return(end);
+	}
+
+	if (!table_id && UNIV_UNLIKELY(*id == TEMP_INDEX_PREFIX)) {
+		/* Temporary index name (smart ALTER TABLE) */
+		const char	temp_index_suffix[]= "--temporary--";
+		const size_t	suffix_len = sizeof temp_index_suffix - 1;
+
+		end = innobase_convert_identifier(buf, buflen,
+						  id + 1, idlen - 1,
+						  thd, FALSE);
+
+		/* Append the suffix only if it fits completely */
+		if (end - buf + suffix_len < buflen) {
+			memcpy(end, temp_index_suffix, suffix_len);
+			end += suffix_len;
+		}
+
+		return(end);
+	}
+
+	/* A table name without a database part, or an ordinary
+	index name */
+	return(innobase_convert_identifier(buf, buflen, id, idlen,
+					   thd, table_id));
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted,
+i.e. whether thd_killed() reports the MySQL thread of the transaction as
+killed. A NULL transaction, or one without a MySQL thread, is never
+considered interrupted.
+@return	TRUE if interrupted */
+extern "C" UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	if (!trx || !trx->mysql_thd) {
+
+		return(FALSE);
+	}
+
+	return(thd_killed((THD*) trx->mysql_thd) ? TRUE : FALSE);
+}
+
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode, as
+given by the strict_mode session variable of its MySQL thread. A NULL
+transaction, or one without a MySQL thread, is never strict.
+@return	TRUE if strict */
+extern "C" UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	if (!trx || !trx->mysql_thd) {
+
+		return(FALSE);
+	}
+
+	return(THDVAR((THD*) trx->mysql_thd, strict_mode) ? TRUE : FALSE);
+}
+
+/**************************************************************//**
+Clears the key-read related fields of a prebuilt struct. The template is
+used in fast retrieval of just those column values MySQL needs in its
+processing. */
+static
+void
+reset_template(
+/*===========*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct */
+{
+	prebuilt->read_just_key = 0;
+	prebuilt->keep_other_fields_on_keyread = 0;
+}
+
+/*****************************************************************//**
+Prepares a table handle that was opened with HANDLER for use. Call this
+after opening the handle and before index_read_idx() etc. Because the cursor
+may stay open even over a transaction commit, it should also be called
+before every operation (fetch next etc.): it redoes the necessary
+initialization even after a transaction commit. */
+UNIV_INTERN
+void
+ha_innobase::init_table_handle_for_HANDLER(void)
+/*============================================*/
+{
+	/* Normally done in external_lock: create a trx struct for the
+	current thd and a prebuilt struct for this handle if they do not
+	exist yet, and update the trx pointers in the prebuilt struct. */
+
+	update_thd(ha_thd());
+
+	/* The rest initializes the prebuilt struct much like
+	external_lock would */
+
+	innobase_release_stat_resources(prebuilt->trx);
+
+	/* Make sure the transaction is started ... */
+
+	trx_start_if_not_started(prebuilt->trx);
+
+	/* ... and that it has a read view */
+
+	trx_assign_read_view(prebuilt->trx);
+
+	/* Tell MySQL that there is an active transaction, unless that
+	has been done already */
+
+	if (!prebuilt->trx->active_trans) {
+
+		innobase_register_trx_and_stmt(ht, user_thd);
+
+		prebuilt->trx->active_trans = 1;
+	}
+
+	prebuilt->used_in_HANDLER = TRUE;
+
+	/* HANDLER always reads with consistent reads, even if the trx
+	isolation level would have been specified as SERIALIZABLE */
+
+	prebuilt->stored_select_lock_type = LOCK_NONE;
+	prebuilt->select_lock_type = LOCK_NONE;
+
+	/* Always fetch all columns in the index record.
+	(We want always to fetch all columns in the whole row? Or do
+	we????) */
+
+	prebuilt->hint_need_to_fetch_extra_cols = ROW_RETRIEVE_ALL_COLS;
+
+	/* The necessary inits were done above, so row_search_for_mysql
+	does not need to repeat them */
+
+	prebuilt->sql_stat_start = FALSE;
+
+	reset_template(prebuilt);
+}
+
+/*********************************************************************//**
+Opens an InnoDB database.
+
+In order, this function: fills in the handlerton callbacks; validates
+innodb_page_size; optionally (innodb_overwrite_relay_log_info) reads the
+replication positions from relay-log.info; checks and copies the startup
+parameters into the srv_* variables; starts InnoDB with
+innobase_start_or_create_for_mysql(); optionally rewrites relay-log.info
+if the positions known to InnoDB differ from the ones read from the file;
+and finally creates the table share hash and the mutexes used by this
+module.
+@return	0 on success, error code on failure */
+static
+int
+innobase_init(
+/*==========*/
+	void	*p)	/*!< in: InnoDB handlerton */
+{
+	static char	current_dir[3];		/*!< Set if using current lib */
+	int		err;
+	bool		ret;
+	char		*default_path;
+	uint		format_id;
+
+	DBUG_ENTER("innobase_init");
+        handlerton *innobase_hton= (handlerton *)p;
+        innodb_hton_ptr = innobase_hton;
+
+        innobase_hton->state = SHOW_OPTION_YES;
+        innobase_hton->db_type= DB_TYPE_INNODB;
+        innobase_hton->savepoint_offset=sizeof(trx_named_savept_t);
+        innobase_hton->close_connection=innobase_close_connection;
+        innobase_hton->savepoint_set=innobase_savepoint;
+        innobase_hton->savepoint_rollback=innobase_rollback_to_savepoint;
+        innobase_hton->savepoint_release=innobase_release_savepoint;
+        innobase_hton->commit=innobase_commit;
+        innobase_hton->rollback=innobase_rollback;
+        innobase_hton->prepare=innobase_xa_prepare;
+        innobase_hton->recover=innobase_xa_recover;
+        innobase_hton->commit_by_xid=innobase_commit_by_xid;
+        innobase_hton->rollback_by_xid=innobase_rollback_by_xid;
+        innobase_hton->create_cursor_read_view=innobase_create_cursor_view;
+        innobase_hton->set_cursor_read_view=innobase_set_cursor_view;
+        innobase_hton->close_cursor_read_view=innobase_close_cursor_view;
+        innobase_hton->create=innobase_create_handler;
+        innobase_hton->drop_database=innobase_drop_database;
+        innobase_hton->panic=innobase_end;
+        innobase_hton->start_consistent_snapshot=innobase_start_trx_and_assign_read_view;
+        innobase_hton->flush_logs=innobase_flush_logs;
+        innobase_hton->show_status=innobase_show_status;
+        innobase_hton->flags=HTON_NO_FLAGS;
+        innobase_hton->release_temporary_latches=innobase_release_temporary_latches;
+	innobase_hton->alter_table_flags = innobase_alter_table_flags;
+
+	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
+
+#ifdef UNIV_DEBUG
+	/* Debug self-test: check that filename_to_tablename() still
+	encodes an unconvertible file name the way this module expects */
+	static const char	test_filename[] = "-@";
+	char			test_tablename[sizeof test_filename
+				+ sizeof srv_mysql50_table_name_prefix];
+	if ((sizeof test_tablename) - 1
+			!= filename_to_tablename(test_filename, test_tablename,
+			sizeof test_tablename)
+			|| strncmp(test_tablename,
+			srv_mysql50_table_name_prefix,
+			sizeof srv_mysql50_table_name_prefix)
+			|| strcmp(test_tablename
+			+ sizeof srv_mysql50_table_name_prefix,
+			test_filename)) {
+		sql_print_error("tablename encoding has been changed");
+		goto error;
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Derive srv_page_size and srv_page_size_shift from
+	innodb_page_size. A zero shift after this block means that the
+	configured value was not an accepted power of two. */
+	srv_page_size = 0;
+	srv_page_size_shift = 0;
+
+	if (innobase_page_size != (1 << 14)) {
+		uint n_shift;
+
+		fprintf(stderr,
+			"InnoDB: Warning: innodb_page_size has been changed from default value 16384. (###EXPERIMENTAL### operation)\n");
+		for (n_shift = 12; n_shift <= UNIV_PAGE_SIZE_SHIFT_MAX; n_shift++) {
+			if (innobase_page_size == ((ulong)1 << n_shift)) {
+				srv_page_size_shift = n_shift;
+				srv_page_size = (1 << srv_page_size_shift);
+				fprintf(stderr,
+					"InnoDB: The universal page size of the database is set to %lu.\n",
+					srv_page_size);
+				break;
+			}
+		}
+	} else {
+		srv_page_size_shift = 14;
+		srv_page_size = (1 << srv_page_size_shift);
+	}
+
+	if (!srv_page_size_shift) {
+		fprintf(stderr,
+			"InnoDB: Error: %lu is not valid value for innodb_page_size.\n",
+			innobase_page_size);
+		goto error;
+	}
+
+#ifndef MYSQL_SERVER
+	innodb_overwrite_relay_log_info = FALSE;
+#endif
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+	/* read master log position from relay-log.info if exists */
+	char fname[FN_REFLEN+128];
+	int pos;
+	int info_fd;
+	IO_CACHE info_file;
+
+	/* An empty fname means "relay-log.info could not be read"; this is
+	tested again after InnoDB has been started. */
+	fname[0] = '\0';
+
+	if(innobase_overwrite_relay_log_info) {
+
+	fprintf(stderr,
+		"InnoDB: Warning: innodb_overwrite_relay_log_info is enabled."
+		" Updates in other storage engines may have problem with consistency.\n");
+
+	bzero((char*) &info_file, sizeof(info_file));
+	fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32);
+
+	int error=0;
+
+	if (!access(fname,F_OK)) {
+		/* exist */
+		if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) {
+			error=1;
+		} else if (init_io_cache(&info_file, info_fd, IO_SIZE*2,
+					READ_CACHE, 0L, 0, MYF(MY_WME))) {
+			error=1;
+		}
+
+		if (error) {
+relay_info_error:
+			/* Also the target of the read failures below:
+			close the file and give up on relay-log.info */
+			if (info_fd >= 0)
+				my_close(info_fd, MYF(0));
+			fname[0] = '\0';
+			goto skip_relay;
+		}
+	} else {
+		fname[0] = '\0';
+		goto skip_relay;
+	}
+
+	if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") || /* dummy (it is relay-log) */
+	    init_intvar_from_file(&pos, &info_file, BIN_LOG_HEADER_SIZE)) {
+		end_io_cache(&info_file);
+		error=1;
+		goto relay_info_error;
+	}
+
+	fprintf(stderr,
+		"InnoDB: relay-log.info is detected.\n"
+		"InnoDB: relay log: position %u, file name %s\n",
+		pos, fname);
+
+	strncpy(trx_sys_mysql_relay_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+	/* strncpy() leaves the destination unterminated when fname is at
+	least TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN bytes long. The name is
+	read as a C string further below, so terminate it explicitly. */
+	trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN - 1] = '\0';
+	trx_sys_mysql_relay_log_pos = (ib_int64_t) pos;
+
+	if (init_strvar_from_file(fname, sizeof(fname), &info_file, "") ||
+	    init_intvar_from_file(&pos, &info_file, 0)) {
+		end_io_cache(&info_file);
+		error=1;
+		goto relay_info_error;
+	}
+
+	fprintf(stderr,
+		"InnoDB: master log: position %u, file name %s\n",
+		pos, fname);
+
+	strncpy(trx_sys_mysql_master_log_name, fname, TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+	/* See above: make sure the copy is always NUL-terminated */
+	trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN - 1] = '\0';
+	trx_sys_mysql_master_log_pos = (ib_int64_t) pos;
+
+	end_io_cache(&info_file);
+	if (info_fd >= 0)
+		my_close(info_fd, MYF(0));
+	}
+skip_relay:
+#endif /* MYSQL_SERVER */
+#endif /* HAVE_REPLICATION */
+
+	/* Check that values don't overflow on 32-bit systems. */
+	if (sizeof(ulint) == 4) {
+		if (innobase_buffer_pool_size > UINT_MAX32) {
+			sql_print_error(
+				"innobase_buffer_pool_size can't be over 4GB"
+				" on 32-bit systems");
+
+			goto error;
+		}
+
+		if (innobase_log_file_size > UINT_MAX32) {
+			sql_print_error(
+				"innobase_log_file_size can't be over 4GB"
+				" on 32-bit systems");
+
+			goto error;
+		}
+	}
+
+	os_innodb_umask = (ulint)my_umask;
+
+	/* First calculate the default path for innodb_data_home_dir etc.,
+	in case the user has not given any value.
+
+	Note that when using the embedded server, the datadirectory is not
+	necessarily the current directory of this program. */
+
+	if (mysqld_embedded) {
+		default_path = mysql_real_data_home;
+		fil_path_to_mysql_datadir = mysql_real_data_home;
+	} else {
+		/* It's better to use current lib, to keep paths short */
+		current_dir[0] = FN_CURLIB;
+		current_dir[1] = FN_LIBCHAR;
+		current_dir[2] = 0;
+		default_path = current_dir;
+	}
+
+	ut_a(default_path);
+
+	if (specialflag & SPECIAL_NO_PRIOR) {
+		srv_set_thread_priorities = FALSE;
+	} else {
+		srv_set_thread_priorities = TRUE;
+		srv_query_thread_priority = QUERY_PRIOR;
+	}
+
+	/* Set InnoDB initialization parameters according to the values
+	read from MySQL .cnf file */
+
+	/*--------------- Data files -------------------------*/
+
+	/* The default dir for data files is the datadir of MySQL */
+
+	srv_data_home = (innobase_data_home_dir ? innobase_data_home_dir :
+			 default_path);
+
+	/* Set default InnoDB data file size to 10 MB and let it be
+	auto-extending. Thus users can use InnoDB in >= 4.0 without having
+	to specify any startup options. */
+
+	if (!innobase_data_file_path) {
+		innobase_data_file_path = (char*) "ibdata1:10M:autoextend";
+	}
+
+	/* Since InnoDB edits the argument in the next call, we make another
+	copy of it: */
+
+	internal_innobase_data_file_path = my_strdup(innobase_data_file_path,
+						   MYF(MY_FAE));
+
+	ret = (bool) srv_parse_data_file_paths_and_sizes(
+		internal_innobase_data_file_path);
+	if (ret == FALSE) {
+		sql_print_error(
+			"InnoDB: syntax error in innodb_data_file_path");
+mem_free_and_error:
+		/* Common error exit for every failure after the data file
+		path copy has been allocated */
+		srv_free_paths_and_sizes();
+		my_free(internal_innobase_data_file_path,
+						MYF(MY_ALLOW_ZERO_PTR));
+		goto error;
+	}
+
+	srv_doublewrite_file = innobase_doublewrite_file;
+
+	srv_extra_undoslots = (ibool) innobase_extra_undoslots;
+
+	srv_use_sys_stats_table = (ibool) innobase_use_sys_stats_table;
+
+	/* -------------- Log files ---------------------------*/
+
+	/* The default dir for log files is the datadir of MySQL */
+
+	if (!innobase_log_group_home_dir) {
+		innobase_log_group_home_dir = default_path;
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* Since innodb_log_arch_dir has no relevance under MySQL,
+	starting from 4.0.6 we always set it the same as
+	innodb_log_group_home_dir: */
+
+	innobase_log_arch_dir = innobase_log_group_home_dir;
+
+	srv_arch_dir = innobase_log_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	ret = (bool)
+		srv_parse_log_group_home_dirs(innobase_log_group_home_dir);
+
+	if (ret == FALSE || innobase_mirrored_log_groups != 1) {
+	  sql_print_error("syntax error in innodb_log_group_home_dir, or a "
+			  "wrong number of mirrored log groups");
+
+		goto mem_free_and_error;
+	}
+
+	/* Validate the file format by animal name */
+	if (innobase_file_format_name != NULL) {
+
+		format_id = innobase_file_format_name_lookup(
+			innobase_file_format_name);
+
+		if (format_id > DICT_TF_FORMAT_MAX) {
+
+			sql_print_error("InnoDB: wrong innodb_file_format.");
+
+			goto mem_free_and_error;
+		}
+	} else {
+		/* Set it to the default file format id. Though this
+		should never happen. */
+		format_id = 0;
+	}
+
+	srv_file_format = format_id;
+
+	/* Given the type of innobase_file_format_name we have little
+	choice but to cast away the constness from the returned name.
+	innobase_file_format_name is used in the MySQL set variable
+	interface and so can't be const. */
+
+	innobase_file_format_name =
+		(char*) trx_sys_file_format_id_to_name(format_id);
+
+	/* Process innobase_file_format_check variable */
+	ut_a(innobase_file_format_check != NULL);
+
+	/* As a side effect it will set srv_check_file_format_at_startup
+	on valid input. First we check for "on"/"off". */
+	if (!innobase_file_format_check_on_off(innobase_file_format_check)) {
+
+		/* Did the user specify a format name that we support ?
+		As a side effect it will update the variable
+		srv_check_file_format_at_startup */
+		if (innobase_file_format_validate_and_set(
+				innobase_file_format_check) < 0) {
+
+			sql_print_error("InnoDB: invalid "
+					"innodb_file_format_check value: "
+					"should be either 'on' or 'off' or "
+					"any value up to %s or its "
+					"equivalent numeric id",
+					trx_sys_file_format_id_to_name(
+						DICT_TF_FORMAT_MAX));
+
+			goto mem_free_and_error;
+		}
+	}
+
+	if (innobase_change_buffering) {
+		ulint	use;
+
+		/* Map the configured name to its index in
+		innobase_change_buffering_values */
+		for (use = 0;
+		     use < UT_ARR_SIZE(innobase_change_buffering_values);
+		     use++) {
+			if (!innobase_strcasecmp(
+				    innobase_change_buffering,
+				    innobase_change_buffering_values[use])) {
+				ibuf_use = (ibuf_use_t) use;
+				goto innobase_change_buffering_inited_ok;
+			}
+		}
+
+		sql_print_error("InnoDB: invalid value "
+				"innodb_change_buffering=%s",
+				innobase_change_buffering);
+		goto mem_free_and_error;
+	}
+
+innobase_change_buffering_inited_ok:
+	ut_a((ulint) ibuf_use < UT_ARR_SIZE(innobase_change_buffering_values));
+	innobase_change_buffering = (char*)
+		innobase_change_buffering_values[ibuf_use];
+
+	/* --------------------------------------------------*/
+
+	srv_file_flush_method_str = innobase_file_flush_method;
+
+	srv_n_log_groups = (ulint) innobase_mirrored_log_groups;
+	srv_n_log_files = (ulint) innobase_log_files_in_group;
+	srv_log_file_size = (ulint) innobase_log_file_size;
+
+	srv_thread_concurrency_timer_based =
+		(ibool) innobase_thread_concurrency_timer_based;
+
+#ifdef UNIV_LOG_ARCHIVE
+	srv_log_archive_on = (ulint) innobase_log_archive;
+#endif /* UNIV_LOG_ARCHIVE */
+	srv_log_buffer_size = (ulint) innobase_log_buffer_size;
+
+	srv_buf_pool_size = (ulint) innobase_buffer_pool_size;
+
+	srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
+	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+	srv_read_ahead &= 3;
+	srv_adaptive_checkpoint %= 3;
+
+	srv_force_recovery = (ulint) innobase_force_recovery;
+
+	srv_recovery_stats = (ibool) innobase_recovery_stats;
+
+	srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+	srv_use_checksums = (ibool) innobase_use_checksums;
+	srv_fast_checksum = (ibool) innobase_fast_checksum;
+	srv_buffer_pool_shm_checksum = (ibool) innobase_buffer_pool_shm_checksum;
+
+#ifdef HAVE_LARGE_PAGES
+        if ((os_use_large_pages = (ibool) my_use_large_pages))
+		os_large_page_size = (ulint) opt_large_page_size;
+#endif
+
+	row_rollback_on_timeout = (ibool) innobase_rollback_on_timeout;
+
+	srv_locks_unsafe_for_binlog = (ibool) innobase_locks_unsafe_for_binlog;
+
+	srv_max_n_open_files = (ulint) innobase_open_files;
+	srv_innodb_status = (ibool) innobase_create_status_file;
+
+	srv_print_verbose_log = mysqld_embedded ? 0 : 1;
+
+	/* Store the default charset-collation number of this MySQL
+	installation */
+
+	data_mysql_default_charset_coll = (ulint)default_charset_info->number;
+
+	ut_a(DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL ==
+					my_charset_latin1.number);
+	ut_a(DATA_MYSQL_BINARY_CHARSET_COLL == my_charset_bin.number);
+
+	/* Store the latin1_swedish_ci character ordering table to InnoDB. For
+	non-latin1_swedish_ci charsets we use the MySQL comparison functions,
+	and consequently we do not need to know the ordering internally in
+	InnoDB. */
+
+	ut_a(0 == strcmp(my_charset_latin1.name, "latin1_swedish_ci"));
+	srv_latin1_ordering = my_charset_latin1.sort_order;
+
+	innobase_old_blocks_pct = buf_LRU_old_ratio_update(
+		innobase_old_blocks_pct, FALSE);
+
+	innobase_commit_concurrency_init_default();
+
+	/* Since we in this module access directly the fields of a trx
+	struct, and due to different headers and flags it might happen that
+	mutex_t has a different size in this module and in InnoDB
+	modules, we check at run time that the size is the same in
+	these compilation modules. */
+
+	err = innobase_start_or_create_for_mysql();
+
+	if (err != DB_SUCCESS) {
+		goto mem_free_and_error;
+	}
+
+#ifdef HAVE_REPLICATION
+#ifdef MYSQL_SERVER
+	if(innobase_overwrite_relay_log_info) {
+	/* If InnoDB progressed from relay-log.info, overwrite it */
+	if (fname[0] == '\0') {
+		fprintf(stderr,
+			"InnoDB: something wrong with relay-info.log. InnoDB will not overwrite it.\n");
+	} else if (0 != strcmp(fname, trx_sys_mysql_master_log_name)
+		   || pos != trx_sys_mysql_master_log_pos) {
+		/* Overwrite relay-log.info */
+		bzero((char*) &info_file, sizeof(info_file));
+		fn_format(fname, relay_log_info_file, mysql_data_home, "", 4+32);
+
+		int error = 0;
+
+		if (!access(fname,F_OK)) {
+			/* exist */
+			if ((info_fd = my_open(fname, O_RDWR|O_BINARY, MYF(MY_WME))) < 0) {
+				error = 1;
+			} else if (init_io_cache(&info_file, info_fd, IO_SIZE*2,
+						WRITE_CACHE, 0L, 0, MYF(MY_WME))) {
+				error = 1;
+			}
+
+			if (error) {
+				if (info_fd >= 0)
+					my_close(info_fd, MYF(0));
+				goto skip_overwrite;
+			}
+		} else {
+			error = 1;
+			goto skip_overwrite;
+		}
+
+		/* NOTE: this pos is a write cursor into buff and shadows
+		the integer log position pos declared above */
+		char buff[FN_REFLEN*2+22*2+4], *pos;
+
+		/* Write four lines: relay log name, relay log position,
+		master log name, master log position */
+		my_b_seek(&info_file, 0L);
+		pos=strmov(buff, trx_sys_mysql_relay_log_name);
+		*pos++='\n';
+		pos=longlong10_to_str(trx_sys_mysql_relay_log_pos, pos, 10);
+		*pos++='\n';
+		pos=strmov(pos, trx_sys_mysql_master_log_name);
+		*pos++='\n';
+		pos=longlong10_to_str(trx_sys_mysql_master_log_pos, pos, 10);
+		*pos='\n';
+
+		if (my_b_write(&info_file, (uchar*) buff, (size_t) (pos-buff)+1))
+			error = 1;
+		if (flush_io_cache(&info_file))
+			error = 1;
+
+		end_io_cache(&info_file);
+		if (info_fd >= 0)
+			my_close(info_fd, MYF(0));
+skip_overwrite:
+		if (error) {
+			fprintf(stderr,
+				"InnoDB: ERROR: error occured during overwriting relay-log.info.\n");
+		} else {
+			fprintf(stderr,
+				"InnoDB: relay-log.info was overwritten.\n");
+		}
+	} else {
+		fprintf(stderr,
+			"InnoDB: InnoDB and relay-log.info are synchronized. InnoDB will not overwrite it.\n");
+	}
+	}
+#endif /* MYSQL_SERVER */
+#endif /* HAVE_REPLICATION */
+
+	innobase_open_tables = hash_create(200);
+	pthread_mutex_init(&innobase_share_mutex, MY_MUTEX_INIT_FAST);
+	pthread_mutex_init(&prepare_commit_mutex, MY_MUTEX_INIT_FAST);
+	pthread_mutex_init(&commit_threads_m, MY_MUTEX_INIT_FAST);
+	pthread_mutex_init(&commit_cond_m, MY_MUTEX_INIT_FAST);
+	pthread_cond_init(&commit_cond, NULL);
+	innodb_inited= 1;
+#ifdef MYSQL_DYNAMIC_PLUGIN
+	if (innobase_hton != p) {
+		innobase_hton = reinterpret_cast<handlerton*>(p);
+		*innobase_hton = *innodb_hton_ptr;
+	}
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+	/* Get the current high water mark format. */
+	innobase_file_format_check = (char*) trx_sys_file_format_max_get();
+
+	btr_search_fully_disabled = (!btr_search_enabled);
+	DBUG_RETURN(FALSE);
+error:
+	DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+Closes an InnoDB database. Does nothing if InnoDB was not initialized.
+@return	0 on success, 1 if the InnoDB shutdown reported an error */
+static
+int
+innobase_end(
+/*=========*/
+	handlerton*		hton,	/*!< in/out: InnoDB handlerton */
+	ha_panic_function	type __attribute__((unused)))
+					/*!< in: ha_panic() parameter */
+{
+	int	err = 0;
+
+	DBUG_ENTER("innobase_end");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+#ifdef __NETWARE__	/* some special cleanup for NetWare */
+	if (nw_panic) {
+		set_panic_flag_for_netware();
+	}
+#endif
+	if (!innodb_inited) {
+		/* Nothing was set up, so there is nothing to tear down */
+		DBUG_RETURN(err);
+	}
+
+	srv_fast_shutdown = (ulint) innobase_fast_shutdown;
+	innodb_inited = 0;
+
+	hash_table_free(innobase_open_tables);
+	innobase_open_tables = NULL;
+
+	if (innobase_shutdown_for_mysql() != DB_SUCCESS) {
+		err = 1;
+	}
+
+	/* Release the parsed paths and our private copy of
+	innodb_data_file_path */
+	srv_free_paths_and_sizes();
+	my_free(internal_innobase_data_file_path,
+					MYF(MY_ALLOW_ZERO_PTR));
+
+	/* Destroy the synchronization objects of this module */
+	pthread_mutex_destroy(&innobase_share_mutex);
+	pthread_mutex_destroy(&prepare_commit_mutex);
+	pthread_mutex_destroy(&commit_threads_m);
+	pthread_mutex_destroy(&commit_cond_m);
+	pthread_cond_destroy(&commit_cond);
+
+	DBUG_RETURN(err);
+}
+
+/****************************************************************//**
+Flushes the InnoDB log buffer to disk by calling
+log_buffer_flush_to_disk().
+@return	TRUE if error; this implementation always returns FALSE */
+static
+bool
+innobase_flush_logs(
+/*================*/
+	handlerton*	hton)	/*!< in/out: InnoDB handlerton */
+{
+	DBUG_ENTER("innobase_flush_logs");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	log_buffer_flush_to_disk();
+
+	/* No error is ever reported from here */
+	DBUG_RETURN(false);
+}
+
+/****************************************************************//**
+Return alter table flags supported in an InnoDB database. The result is
+a fixed set of flags that does not depend on the argument.
+@return	bitwise OR of the supported HA_ONLINE_* flags */
+static
+uint
+innobase_alter_table_flags(
+/*=======================*/
+	uint	flags)	/*!< in: not used */
+{
+	const uint	supported =
+		(HA_ONLINE_ADD_INDEX_NO_WRITES
+		 | HA_ONLINE_DROP_INDEX_NO_WRITES
+		 | HA_ONLINE_ADD_UNIQUE_INDEX_NO_WRITES
+		 | HA_ONLINE_DROP_UNIQUE_INDEX_NO_WRITES
+		 | HA_ONLINE_ADD_PK_INDEX_NO_WRITES);
+
+	return(supported);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database. A transaction that has not
+been started is left untouched. When compiled into the server with
+replication support and executed by a slave thread, the position reported
+by rpl_get_position_info() is stored in the trx before the commit. */
+static
+void
+innobase_commit_low(
+/*================*/
+	trx_t*	trx)	/*!< in: transaction handle */
+{
+	if (trx->conc_state == TRX_NOT_STARTED) {
+		/* Nothing to commit */
+		return;
+	}
+
+#if defined(HAVE_REPLICATION) && defined(MYSQL_SERVER)
+	THD*	cur_thd = current_thd;
+
+	if (cur_thd != NULL && cur_thd->slave_thread) {
+		const char*	master_log_name;
+		const char*	relay_log_name;
+		ulonglong	master_log_pos;
+		ulonglong	relay_log_pos;
+
+		/* Update the replication position info inside InnoDB.
+		In embedded server, does nothing. */
+		if (rpl_get_position_info(&master_log_name, &master_log_pos,
+					  &relay_log_name, &relay_log_pos)) {
+			trx->mysql_master_log_file_name = master_log_name;
+			trx->mysql_master_log_pos
+				= (ib_int64_t) master_log_pos;
+			trx->mysql_relay_log_file_name = relay_log_name;
+			trx->mysql_relay_log_pos
+				= (ib_int64_t) relay_log_pos;
+		}
+	}
+#endif /* HAVE_REPLICATION && MYSQL_SERVER */
+
+	trx_commit_for_mysql(trx);
+}
+
+/*****************************************************************//**
+Creates an InnoDB transaction struct for the thd if it does not yet have one.
+Starts a new InnoDB transaction if a transaction is not yet started. And
+assigns a new snapshot for a consistent read if the transaction does not yet
+have one. If trx->active_trans is still 0, the transaction is additionally
+registered through innobase_register_trx_and_stmt() and active_trans is
+set to 1.
+@return	0 */
+static
+int
+innobase_start_trx_and_assign_read_view(
+/*====================================*/
+        handlerton *hton, /*!< in: Innodb handlerton */ 
+	THD*	thd)	/*!< in: MySQL thread handle of the user for whom
+			the transaction is started and the read view is
+			assigned */
+{
+	trx_t*	trx;
+
+	DBUG_ENTER("innobase_start_trx_and_assign_read_view");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* Create a new trx struct for thd, if it does not yet have one */
+
+	trx = check_trx_exists(thd);
+
+	/* This is just to play safe: release a possible FIFO ticket and
+	search latch. Since we will reserve the kernel mutex, we have to
+	release the search system latch first to obey the latching order. */
+
+	innobase_release_stat_resources(trx);
+
+	/* If the transaction is not started yet, start it */
+
+	trx_start_if_not_started(trx);
+
+	/* Assign a read view if the transaction does not have it yet */
+
+	trx_assign_read_view(trx);
+
+	/* Set the MySQL flag to mark that there is an active transaction */
+
+	if (trx->active_trans == 0) {
+		innobase_register_trx_and_stmt(hton, thd);
+		trx->active_trans = 1;
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Commits a transaction in an InnoDB database or marks an SQL statement
+ended. The whole transaction is committed when 'all' is TRUE, or when
+neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN is set for the thd;
+otherwise only the end of the current SQL statement is recorded. In both
+cases the AUTO-INC row counter is reset, a possible FIFO ticket is released
+and the master thread is woken up.
+@return	0 */
+static
+int
+innobase_commit(
+/*============*/
+        handlerton *hton, /*!< in: Innodb handlerton */ 
+	THD* 	thd,	/*!< in: MySQL thread handle of the user for whom
+			the transaction should be committed */
+	bool	all)	/*!< in:	TRUE - commit transaction
+				FALSE - the current SQL statement ended */
+{
+	trx_t*		trx;
+
+	DBUG_ENTER("innobase_commit");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+	DBUG_PRINT("trans", ("ending transaction"));
+
+	trx = check_trx_exists(thd);
+
+	/* Since we will reserve the kernel mutex, we have to release
+	the search system latch first to obey the latching order. */
+
+	if (trx->has_search_latch) {
+		trx_search_latch_release_if_reserved(trx);
+	}
+
+	/* The flag trx->active_trans is set to 1 in
+
+	1. ::external_lock(),
+	2. ::start_stmt(),
+	3. innobase_query_caching_of_table_permitted(),
+	4. innobase_savepoint(),
+	5. ::init_table_handle_for_HANDLER(),
+	6. innobase_start_trx_and_assign_read_view(),
+	7. ::transactional_table_lock()
+
+	and it is only set to 0 in a commit or a rollback. If it is 0 we know
+	there cannot be resources to be freed and we could return immediately.
+	For the time being, we play safe and do the cleanup though there should
+	be nothing to clean up. */
+
+	if (trx->active_trans == 0
+		&& trx->conc_state != TRX_NOT_STARTED) {
+
+		sql_print_error("trx->active_trans == 0, but"
+			" trx->conc_state != TRX_NOT_STARTED");
+	}
+	if (all
+		|| (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+		/* We were instructed to commit the whole transaction, or
+		this is an SQL statement end and autocommit is on */
+
+		/* We need current binlog position for ibbackup to work.
+		Note, the position is current because of
+		prepare_commit_mutex
+
+		When innobase_commit_concurrency is > 0, the loop at the
+		retry label admits at most that many threads at a time:
+		a thread that would exceed the limit takes back its
+		increment of commit_threads, waits on commit_cond and
+		then re-checks from the top. */
+retry:
+		if (innobase_commit_concurrency > 0) {
+			pthread_mutex_lock(&commit_cond_m);
+			commit_threads++;
+
+			if (commit_threads > innobase_commit_concurrency) {
+				commit_threads--;
+				pthread_cond_wait(&commit_cond,
+					&commit_cond_m);
+				pthread_mutex_unlock(&commit_cond_m);
+				goto retry;
+			}
+			else {
+				pthread_mutex_unlock(&commit_cond_m);
+			}
+		}
+
+		/* The following calls to read the MySQL binary log
+		file name and the position return consistent results:
+		1) Other InnoDB transactions cannot intervene between
+		these calls as we are holding prepare_commit_mutex.
+		2) Binary logging of other engines is not relevant
+		to InnoDB as all InnoDB requires is that committing
+		InnoDB transactions appear in the same order in the
+		MySQL binary log as they appear in InnoDB logs.
+		3) A MySQL log file rotation cannot happen because
+		MySQL protects against this by having a counter of
+		transactions in prepared state and it only allows
+		a rotation when the counter drops to zero. See
+		LOCK_prep_xids and COND_prep_xids in log.cc. */
+		trx->mysql_log_file_name = mysql_bin_log_file_name();
+		trx->mysql_log_offset = (ib_int64_t) mysql_bin_log_file_pos();
+
+		/* Don't do write + flush right now. For group commit
+		to work we want to do the flush after releasing the
+		prepare_commit_mutex. */
+		trx->flush_log_later = TRUE;
+		innobase_commit_low(trx);
+		trx->flush_log_later = FALSE;
+
+		if (innobase_commit_concurrency > 0) {
+			pthread_mutex_lock(&commit_cond_m);
+			commit_threads--;
+			pthread_cond_signal(&commit_cond);
+			pthread_mutex_unlock(&commit_cond_m);
+		}
+
+		if (trx->active_trans == 2) { /* NOTE(review): presumably 2
+					      marks that this trx holds
+					      prepare_commit_mutex - confirm
+					      where active_trans is set
+					      to 2 */
+
+			pthread_mutex_unlock(&prepare_commit_mutex);
+		}
+
+		/* Now do a write + flush of logs. */
+		trx_commit_complete_for_mysql(trx);
+		trx->active_trans = 0;
+
+	} else {
+		/* We just mark the SQL statement ended and do not do a
+		transaction commit */
+
+		/* If we had reserved the auto-inc lock for some
+		table in this SQL statement we release it now */
+
+		row_unlock_table_autoinc_for_mysql(trx);
+
+		/* Store the current undo_no of the transaction so that we
+		know where to roll back if we have to roll back the next
+		SQL statement */
+
+		trx_mark_sql_stat_end(trx);
+	}
+
+	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
+
+	if (trx->declared_to_be_inside_innodb) {
+		/* Release our possible ticket in the FIFO */
+
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	/* Tell the InnoDB server that there might be work for utility
+	threads: */
+	srv_active_wake_master_thread();
+
+	DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Rolls back a transaction or the latest SQL statement. The whole transaction
+is rolled back (and trx->active_trans reset to 0) when 'all' is TRUE, or
+when neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN is set for the thd;
+otherwise only the latest SQL statement is rolled back.
+@return	0 or error number, as translated by convert_error_code_to_mysql() */
+static
+int
+innobase_rollback(
+/*==============*/
+        handlerton *hton, /*!< in: Innodb handlerton */ 
+	THD*	thd,	/*!< in: handle to the MySQL thread of the user
+			whose transaction should be rolled back */
+	bool	all)	/*!< in:	TRUE - roll back the whole transaction
+				FALSE - roll back the latest SQL statement
+				(the whole transaction if autocommit is on) */
+{
+	int	error = 0;
+	trx_t*	trx;
+
+	DBUG_ENTER("innobase_rollback");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+	DBUG_PRINT("trans", ("aborting transaction"));
+
+	trx = check_trx_exists(thd);
+
+	/* Release a possible FIFO ticket and search latch. Since we will
+	reserve the kernel mutex, we have to release the search system latch
+	first to obey the latching order. */
+
+	innobase_release_stat_resources(trx);
+
+	trx->n_autoinc_rows = 0; /* Reset the number AUTO-INC rows required */
+
+	/* If we had reserved the auto-inc lock for some table (if
+	we come here to roll back the latest SQL statement) we
+	release it now before a possibly lengthy rollback */
+
+	row_unlock_table_autoinc_for_mysql(trx);
+
+	if (all
+		|| !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+		error = trx_rollback_for_mysql(trx);
+		trx->active_trans = 0;
+	} else {
+		error = trx_rollback_last_sql_stat_for_mysql(trx);
+	}
+
+	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back the whole of the given transaction.
+@return	0 or error number, as translated by convert_error_code_to_mysql() */
+static
+int
+innobase_rollback_trx(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	DBUG_ENTER("innobase_rollback_trx");
+	DBUG_PRINT("trans", ("aborting transaction"));
+
+	/* The kernel mutex will be reserved, so a possible FIFO ticket
+	and the search system latch have to be given up first to obey
+	the latching order. */
+	innobase_release_stat_resources(trx);
+
+	/* Give up a possibly held auto-inc lock before what may be a
+	lengthy rollback. */
+	row_unlock_table_autoinc_for_mysql(trx);
+
+	const int	rollback_err = trx_rollback_for_mysql(trx);
+
+	DBUG_RETURN(convert_error_code_to_mysql(rollback_err, 0, NULL));
+}
+
+/*****************************************************************//**
+Rolls back a transaction to a savepoint. The InnoDB savepoint name is the
+text that longlong2str() produces from the address of the savepoint data
+area.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_rollback_to_savepoint(
+/*===========================*/
+        handlerton *hton,       /*!< in: Innodb handlerton */
+	THD*	thd,		/*!< in: handle to the MySQL thread of the user
+				whose transaction should be rolled back */
+	void*	savepoint)	/*!< in: savepoint data */
+{
+	DBUG_ENTER("innobase_rollback_to_savepoint");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	trx = check_trx_exists(thd);
+
+	/* The kernel mutex will be reserved, so a possible FIFO ticket
+	and the search system latch have to be given up first to obey
+	the latching order. */
+	innobase_release_stat_resources(trx);
+
+	/* TODO: use provided savepoint data area to store savepoint data */
+	char	savepoint_name[64];
+
+	longlong2str((ulint)savepoint, savepoint_name, 36, 1);
+
+	/* Filled in by the call below; not used afterwards */
+	ib_int64_t	binlog_cache_pos;
+
+	const int	err = (int) trx_rollback_to_savepoint_for_mysql(
+		trx, savepoint_name, &binlog_cache_pos);
+
+	DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+}
+
+/*****************************************************************//**
+Releases a transaction savepoint. The InnoDB savepoint name is the text
+that longlong2str() produces from the address of the savepoint data area.
+@return 0 if success, HA_ERR_NO_SAVEPOINT if no savepoint with the
+given name */
+static
+int
+innobase_release_savepoint(
+/*=======================*/
+        handlerton*	hton,	/*!< in: handlerton for Innodb */
+	THD*	thd,		/*!< in: handle to the MySQL thread of the user
+				whose savepoint should be released */
+	void*	savepoint)	/*!< in: savepoint data */
+{
+	DBUG_ENTER("innobase_release_savepoint");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	trx = check_trx_exists(thd);
+
+	/* TODO: use provided savepoint data area to store savepoint data */
+	char	savepoint_name[64];
+
+	longlong2str((ulint)savepoint, savepoint_name, 36, 1);
+
+	const int	err = (int) trx_release_savepoint_for_mysql(
+		trx, savepoint_name);
+
+	DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL));
+}
+
+/*****************************************************************//**
+Sets a transaction savepoint. The InnoDB savepoint name is the text that
+longlong2str() produces from the address of the savepoint data area.
+@return	the result of trx_savepoint_for_mysql() translated by
+convert_error_code_to_mysql(); the original note on this function states
+that this is always 0, that is, always succeeds */
+static
+int
+innobase_savepoint(
+/*===============*/
+	handlerton*	hton,   /*!< in: handle to the Innodb handlerton */
+	THD*	thd,		/*!< in: handle to the MySQL thread */
+	void*	savepoint)	/*!< in: savepoint data; only its address
+				is used, to derive the savepoint name */
+{
+	int	error = 0;
+	trx_t*	trx;
+
+	DBUG_ENTER("innobase_savepoint");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/*
+	  In the autocommit mode there is no sense to set a savepoint
+	  (unless we are in sub-statement), so SQL layer ensures that
+	  this method is never called in such situation.
+	*/
+#ifdef MYSQL_SERVER /* plugins cannot access thd->in_sub_stmt */
+	DBUG_ASSERT(thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN) ||
+		thd->in_sub_stmt);
+#endif /* MYSQL_SERVER */
+
+	trx = check_trx_exists(thd);
+
+	/* Release a possible FIFO ticket and search latch. Since we will
+	reserve the kernel mutex, we have to release the search system latch
+	first to obey the latching order. */
+
+	innobase_release_stat_resources(trx);
+
+	/* cannot happen outside of transaction */
+	DBUG_ASSERT(trx->active_trans);
+
+	/* TODO: use provided savepoint data area to store savepoint data */
+	char name[64];
+	longlong2str((ulint)savepoint,name,36,1);
+
+	error = (int) trx_savepoint_for_mysql(trx, name, (ib_int64_t)0);
+
+	DBUG_RETURN(convert_error_code_to_mysql(error, 0, NULL));
+}
+
+/*****************************************************************//**
+Frees a possible InnoDB trx object associated with the current THD. The
+transaction is rolled back first; a warning is logged when it was still
+active and log_warnings is enabled.
+@return	0 */
+static
+int
+innobase_close_connection(
+/*======================*/
+        handlerton*	hton,	/*!< in:  innobase handlerton */
+	THD*	thd)	/*!< in: handle to the MySQL thread of the user
+			whose resources should be free'd */
+{
+	DBUG_ENTER("innobase_close_connection");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	trx = thd_to_trx(thd);
+
+	ut_a(trx);
+
+	const bool	trx_started = (trx->conc_state != TRX_NOT_STARTED);
+
+	/* A started transaction should always have active_trans set */
+	if (trx_started && trx->active_trans == 0) {
+		sql_print_error("trx->active_trans == 0, but"
+			" trx->conc_state != TRX_NOT_STARTED");
+	}
+
+	if (trx_started && global_system_variables.log_warnings) {
+		sql_print_warning(
+			"MySQL is closing a connection that has an active "
+			"InnoDB transaction.  %lu row modifications will "
+			"roll back.",
+			(ulong) trx->undo_no.low);
+	}
+
+	innobase_rollback_trx(trx);
+
+	thr_local_free(trx->mysql_thread_id);
+	trx_free_for_mysql(trx);
+
+	DBUG_RETURN(0);
+}
+
+
+/*************************************************************************//**
+** InnoDB database tables
+*****************************************************************************/
+
+/****************************************************************//**
+Derives the row format from the flags of the table in the data dictionary.
+Without a prebuilt struct or table, or with an unrecognized format, a debug
+assertion fires and ROW_TYPE_NOT_USED is returned.
+@return one of ROW_TYPE_REDUNDANT, ROW_TYPE_COMPACT,
+ROW_TYPE_COMPRESSED, ROW_TYPE_DYNAMIC */
+UNIV_INTERN
+enum row_type
+ha_innobase::get_row_type() const
+/*=============================*/
+{
+	if (!prebuilt || !prebuilt->table) {
+		ut_ad(0);
+		return(ROW_TYPE_NOT_USED);
+	}
+
+	const ulint	tbl_flags = prebuilt->table->flags;
+
+	/* No flags at all means the redundant format */
+	if (UNIV_UNLIKELY(tbl_flags == 0)) {
+		return(ROW_TYPE_REDUNDANT);
+	}
+
+	ut_ad(tbl_flags & DICT_TF_COMPACT);
+
+#if DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX
+# error "DICT_TF_FORMAT_ZIP != DICT_TF_FORMAT_MAX"
+#endif
+	switch (tbl_flags & DICT_TF_FORMAT_MASK) {
+	case DICT_TF_FORMAT_51 << DICT_TF_FORMAT_SHIFT:
+		return(ROW_TYPE_COMPACT);
+	case DICT_TF_FORMAT_ZIP << DICT_TF_FORMAT_SHIFT:
+		/* A compressed page size distinguishes COMPRESSED
+		from DYNAMIC */
+		return((tbl_flags & DICT_TF_ZSSIZE_MASK)
+		       ? ROW_TYPE_COMPRESSED
+		       : ROW_TYPE_DYNAMIC);
+	}
+
+	ut_ad(0);
+	return(ROW_TYPE_NOT_USED);
+}
+
+
+
+/****************************************************************//**
+Get the table flags to use for the statement. HA_BINLOG_STMT_CAPABLE is
+added to int_table_flags only when the isolation level of the thread is
+above ISO_READ_COMMITTED.
+@return	table flags */
+UNIV_INTERN
+handler::Table_flags
+ha_innobase::table_flags() const
+/*============================*/
+{
+	/* The isolation level is taken from the thread because this
+	method is (also) called before prebuilt is inited. */
+	const ulong	iso_level = thd_tx_isolation(ha_thd());
+
+	if (iso_level > ISO_READ_COMMITTED) {
+		return(int_table_flags | HA_BINLOG_STMT_CAPABLE);
+	}
+
+	return(int_table_flags);
+}
+
+/****************************************************************//**
+NullS-terminated list of the file name extensions of an InnoDB
+single-table tablespace. Returned by ha_innobase::bas_ext(). */
+static const char* ha_innobase_exts[] = {
+  ".ibd",
+  NullS
+};
+
+/****************************************************************//**
+Returns the table type, which is the storage engine name.
+@return	innobase_hton_name */
+UNIV_INTERN
+const char*
+ha_innobase::table_type() const
+/*===========================*/
+{
+	const char*	engine_name = innobase_hton_name;
+
+	return(engine_name);
+}
+
+/****************************************************************//**
+Returns the index type, which is the same for every index.
+@return	the string "BTREE" */
+UNIV_INTERN
+const char*
+ha_innobase::index_type(
+/*====================*/
+	uint)	/*!< in: not used */
+{
+	const char*	type_name = "BTREE";
+
+	return(type_name);
+}
+
+/****************************************************************//**
+Returns the table file name extensions.
+@return	ha_innobase_exts, a NullS-terminated array of extension strings */
+UNIV_INTERN
+const char**
+ha_innobase::bas_ext() const
+/*========================*/
+{
+	const char**	extensions = ha_innobase_exts;
+
+	return(extensions);
+}
+
+/****************************************************************//**
+Returns the operations supported for indexes. The answer is the same
+whatever arguments are passed.
+@return	flags of supported operations */
+UNIV_INTERN
+ulong
+ha_innobase::index_flags(
+/*=====================*/
+	uint,	/*!< in: not used */
+	uint,	/*!< in: not used */
+	bool)	/*!< in: not used */
+const
+{
+	/* Supported ways of reading through an index */
+	const ulong	read_ops = HA_READ_NEXT | HA_READ_PREV
+		| HA_READ_ORDER | HA_READ_RANGE;
+
+	/* Index-only reads and index condition pushdown */
+	const ulong	extra_ops = HA_KEYREAD_ONLY
+		| HA_DO_INDEX_COND_PUSHDOWN;
+
+	return(read_ops | extra_ops);
+}
+
+/****************************************************************//**
+Returns the maximum number of keys a table may have.
+@return	MAX_KEY */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_keys() const
+/*===================================*/
+{
+	const uint	key_limit = MAX_KEY;
+
+	return(key_limit);
+}
+
+/****************************************************************//**
+Returns the maximum key length.
+@return	maximum supported key length, in bytes (3500) */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_length() const
+/*=========================================*/
+{
+	/* An InnoDB page must store >= 2 keys, and a secondary key record
+	must also contain the primary key value. The limit is therefore
+	set to slightly less than 1 / 4 of the page size, which is 16 kB.
+	Currently MySQL does not work with keys whose size is
+	> MAX_KEY_LENGTH. */
+	const uint	max_key_bytes = 3500;
+
+	return(max_key_bytes);
+}
+
+/****************************************************************//**
+Returns the key map of keys that are usable for scanning.
+@return	pointer to key_map_full */
+UNIV_INTERN
+const key_map*
+ha_innobase::keys_to_use_for_scanning()
+{
+	const key_map*	usable_keys = &key_map_full;
+
+	return(usable_keys);
+}
+
+/****************************************************************//**
+Reports the table caching mode of InnoDB tables.
+@return	HA_CACHE_TBL_ASKTRANSACT */
+UNIV_INTERN
+uint8
+ha_innobase::table_cache_type()
+{
+	const uint8	cache_mode = HA_CACHE_TBL_ASKTRANSACT;
+
+	return(cache_mode);
+}
+
+/****************************************************************//**
+Tells whether the primary key is the clustered index.
+@return	always true */
+UNIV_INTERN
+bool
+ha_innobase::primary_key_is_clustered()
+{
+	const bool	is_clustered = true;
+
+	return(is_clustered);
+}
+
+/*****************************************************************//**
+Normalizes a table name string. The last two path components of name (the
+database name and the table name) are copied to norm_name and joined with
+'/', for example test/mytable. Both '/' and '\\' are accepted as
+separators in name. On Windows the result is also converted to lower case.
+No output size is passed in, so norm_name must be large enough for the
+result. */
+static
+void
+normalize_table_name(
+/*=================*/
+	char*		norm_name,	/*!< out: normalized name as a
+					null-terminated string */
+	const char*	name)		/*!< in: table name string */
+{
+	const char*	scan;
+	const char*	table_part;
+	const char*	db_part;
+
+	/* Walk backwards from the last character to the separator in
+	front of the table name */
+	for (scan = strend(name) - 1;
+	     scan >= name && *scan != '\\' && *scan != '/';
+	     scan--) {
+	}
+
+	table_part = scan + 1;
+
+	DBUG_ASSERT(scan > name);
+
+	/* Keep walking backwards to the separator in front of the
+	database name, or to the start of the string */
+	for (scan--;
+	     scan >= name && *scan != '\\' && *scan != '/';
+	     scan--) {
+	}
+
+	db_part = scan + 1;
+
+	/* Copy "db<sep>table" including the terminating NUL, then force
+	the separator to '/' */
+	memcpy(norm_name, db_part, strlen(name) + 1 - (db_part - name));
+
+	norm_name[table_part - db_part - 1] = '/';
+
+#ifdef __WIN__
+	innobase_casedn_str(norm_name);
+#endif
+}
+
+/********************************************************************//**
+Get the upper limit of the MySQL integral and floating-point type, chosen
+by the key type of the field. Any other key type triggers ut_error.
+@return maximum allowed value for the field */
+static
+ulonglong
+innobase_get_int_col_max_value(
+/*===========================*/
+	const Field*	field)	/*!< in: MySQL field */
+{
+	switch (field->key_type()) {
+	/* TINY */
+	case HA_KEYTYPE_BINARY:
+		return(0xFFULL);
+	case HA_KEYTYPE_INT8:
+		return(0x7FULL);
+	/* SHORT */
+	case HA_KEYTYPE_USHORT_INT:
+		return(0xFFFFULL);
+	case HA_KEYTYPE_SHORT_INT:
+		return(0x7FFFULL);
+	/* MEDIUM */
+	case HA_KEYTYPE_UINT24:
+		return(0xFFFFFFULL);
+	case HA_KEYTYPE_INT24:
+		return(0x7FFFFFULL);
+	/* LONG */
+	case HA_KEYTYPE_ULONG_INT:
+		return(0xFFFFFFFFULL);
+	case HA_KEYTYPE_LONG_INT:
+		return(0x7FFFFFFFULL);
+	/* BIG */
+	case HA_KEYTYPE_ULONGLONG:
+		return(0xFFFFFFFFFFFFFFFFULL);
+	case HA_KEYTYPE_LONGLONG:
+		return(0x7FFFFFFFFFFFFFFFULL);
+	case HA_KEYTYPE_FLOAT:
+		/* We use the maximum as per IEEE754-2008 standard, 2^24 */
+		return(0x1000000ULL);
+	case HA_KEYTYPE_DOUBLE:
+		/* We use the maximum as per IEEE754-2008 standard, 2^53 */
+		return(0x20000000000000ULL);
+	default:
+		ut_error;
+	}
+
+	/* Only reached if ut_error returns */
+	return(0);
+}
+
+/*******************************************************************//**
+This function checks whether the index column information
+is consistent between KEY info from mysql and that from innodb index.
+The number of user defined columns must be equal, and the InnoDB type of
+every MySQL key part must equal the type of the corresponding InnoDB index
+field. InnoDB system columns (DATA_SYS) are skipped when pairing the fields.
+@return TRUE if all column types match. */
+static
+ibool
+innobase_match_index_columns(
+/*=========================*/
+	const KEY*		key_info,	/*!< in: Index info
+						from mysql */
+	const dict_index_t*	index_info)	/*!< in: Index info
+						from Innodb */
+{
+	const KEY_PART_INFO*	key_part;
+	const KEY_PART_INFO*	key_end;
+	const dict_field_t*	innodb_idx_fld;
+	const dict_field_t*	innodb_idx_fld_end;
+
+	DBUG_ENTER("innobase_match_index_columns");
+
+	/* Check whether user defined index column count matches */
+	if (key_info->key_parts != index_info->n_user_defined_cols) {
+		DBUG_RETURN(FALSE);
+	}
+
+	key_part = key_info->key_part;
+	key_end = key_part + key_info->key_parts;
+	innodb_idx_fld = index_info->fields;
+	innodb_idx_fld_end = index_info->fields + index_info->n_fields;
+
+	/* Check each index column's datatype. We do not check
+	column name because there exists case that index
+	column name got modified in mysql but such change does not
+	propagate to InnoDB.
+	One hidden assumption here is that the index column sequences
+	are matched up between those in mysql and Innodb. */
+	for (; key_part != key_end; ++key_part) {
+		ulint	col_type;
+		ibool	is_unsigned;
+		ulint	mtype;
+
+		/* Every MySQL key part needs an InnoDB field to be
+		compared with. Skipping system columns below can use up
+		fields, so check before each dereference. */
+		if (innodb_idx_fld >= innodb_idx_fld_end) {
+			DBUG_RETURN(FALSE);
+		}
+
+		mtype = innodb_idx_fld->col->mtype;
+
+		/* Need to translate to InnoDB column type before
+		comparison. */
+		col_type = get_innobase_type_from_mysql_type(&is_unsigned,
+							     key_part->field);
+
+		/* Ignore Innodb specific system columns. */
+		while (mtype == DATA_SYS) {
+			innodb_idx_fld++;
+
+			if (innodb_idx_fld >= innodb_idx_fld_end) {
+				DBUG_RETURN(FALSE);
+			}
+
+			/* Re-read the type of the field now pointed to.
+			Without this the loop could only end by running
+			off the end of the field array. */
+			mtype = innodb_idx_fld->col->mtype;
+		}
+
+		if (col_type != mtype) {
+			/* Column Type mismatches */
+			DBUG_RETURN(FALSE);
+		}
+
+		innodb_idx_fld++;
+	}
+
+	DBUG_RETURN(TRUE);
+}
+
+/*******************************************************************//**
+This function builds a translation table in INNOBASE_SHARE
+structure for fast index location with mysql array number from its
+table->key_info structure. This also provides the necessary translation
+between the key order in mysql key_info and Innodb ib_table->indexes if
+they are not fully matched with each other.
+Note we do not have any mutex protecting the translation table
+building based on the assumption that there is no concurrent
+index creation/drop and DMLs that requires index lookup. All table
+handle will be closed before the index creation/drop.
+On any failure the mapping array held by the share is freed and the
+share's array_size and index_count are reset to 0.
+@return TRUE if index translation table built successfully */
+static
+ibool
+innobase_build_index_translation(
+/*=============================*/
+	const TABLE*		table,	  /*!< in: table in MySQL data
+					  dictionary */
+	dict_table_t*		ib_table, /*!< in: table in Innodb data
+					  dictionary */
+	INNOBASE_SHARE*		share)	  /*!< in/out: share structure
+					  where index translation table
+					  will be constructed in. */
+{
+	ulint		mysql_num_index;
+	ulint		ib_num_index;
+	dict_index_t**	index_mapping;
+	ibool		ret = TRUE;
+
+	DBUG_ENTER("innobase_build_index_translation");
+
+	mysql_num_index = table->s->keys;
+	ib_num_index = UT_LIST_GET_LEN(ib_table->indexes);
+
+	index_mapping = share->idx_trans_tbl.index_mapping;
+
+	/* If there exists inconsistency between MySQL and InnoDB dictionary
+	(metadata) information, the number of index defined in MySQL
+	could exceed that in InnoDB, do not build index translation
+	table in such case */
+	if (UNIV_UNLIKELY(ib_num_index < mysql_num_index)) {
+		ret = FALSE;
+		goto func_exit;
+	}
+
+	/* If index entry count is non-zero, nothing has
+	changed since last update, directly return TRUE */
+	if (share->idx_trans_tbl.index_count) {
+		/* Index entry count should still match mysql_num_index */
+		ut_a(share->idx_trans_tbl.index_count == mysql_num_index);
+		goto func_exit;
+	}
+
+	/* The number of index increased, rebuild the mapping table */
+	if (mysql_num_index > share->idx_trans_tbl.array_size) {
+		dict_index_t**	new_mapping;
+
+		/* Keep the result in a temporary. If the reallocation
+		fails, index_mapping must still point to the old array
+		so that func_exit can free it instead of leaking it. */
+		new_mapping = (dict_index_t**) my_realloc(index_mapping,
+							mysql_num_index *
+							sizeof(*index_mapping),
+							MYF(MY_ALLOW_ZERO_PTR));
+
+		if (!new_mapping) {
+			ret = FALSE;
+			goto func_exit;
+		}
+
+		index_mapping = new_mapping;
+		share->idx_trans_tbl.array_size = mysql_num_index;
+	}
+
+
+	/* For each index in the mysql key_info array, fetch its
+	corresponding InnoDB index pointer into index_mapping
+	array. */
+	for (ulint count = 0; count < mysql_num_index; count++) {
+
+		/* Fetch index pointers into index_mapping according to mysql
+		index sequence */
+		index_mapping[count] = dict_table_get_index_on_name(
+			ib_table, table->key_info[count].name);
+
+		if (!index_mapping[count]) {
+			sql_print_error("Cannot find index %s in InnoDB "
+					"index dictionary.",
+					table->key_info[count].name);
+			ret = FALSE;
+			goto func_exit;
+		}
+
+		/* Double check fetched index has the same
+		column info as those in mysql key_info. */
+		if (!innobase_match_index_columns(&table->key_info[count],
+					          index_mapping[count])) {
+			sql_print_error("Found index %s whose column info "
+					"does not match that of MySQL.",
+					table->key_info[count].name);
+			ret = FALSE;
+			goto func_exit;
+		}
+	}
+
+	/* Successfully built the translation table */
+	share->idx_trans_tbl.index_count = mysql_num_index;
+
+func_exit:
+	if (!ret) {
+		/* Build translation table failed. */
+		my_free(index_mapping, MYF(MY_ALLOW_ZERO_PTR));
+
+		share->idx_trans_tbl.array_size = 0;
+		share->idx_trans_tbl.index_count = 0;
+		index_mapping = NULL;
+	}
+
+	/* Publish the (possibly reallocated or freed) array pointer */
+	share->idx_trans_tbl.index_mapping = index_mapping;
+
+	DBUG_RETURN(ret);
+}
+
+/*******************************************************************//**
+This function uses the index translation table to quickly locate the
+requested index structure.
+Note we do not have mutex protection for the index translation table
+access; it is based on the assumption that there is no concurrent
+translation table rebuild (after create/drop index) and DMLs that
+require index lookup.
+@return dict_index_t structure for requested index. NULL if the share
+has no translation table or keynr is not below its index count. */
+static
+dict_index_t*
+innobase_index_lookup(
+/*==================*/
+	INNOBASE_SHARE*	share,	/*!< in: share structure for index
+				translation table. */
+	uint		keynr)	/*!< in: index number for the requested
+				index */
+{
+	dict_index_t**	mapping = share->idx_trans_tbl.index_mapping;
+
+	if (mapping != NULL && keynr < share->idx_trans_tbl.index_count) {
+		return(mapping[keynr]);
+	}
+
+	return(NULL);
+}
+
+/************************************************************************
+Set the autoinc column max value. This should only be called once from
+ha_innobase::open(). Therefore there's no need for a covering lock.
+
+The counter passed to dict_table_autoinc_initialize() is:
+- 0 when srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE, when there is no
+  AUTOINC field, or when row_search_max_autoinc() reports
+  DB_RECORD_NOT_FOUND;
+- otherwise the value innobase_next_autoinc() computes from the maximum
+  read from the index, with increment 1 and offset 1, capped by the
+  maximum value of the column type.
+
+NOTE(review): when the field is NULL and force recovery is below
+SRV_FORCE_NO_IBUF_MERGE, my_error() is raised but this function, being
+void, still goes on to initialize the counter to 0; whether the open then
+fails depends on the caller - confirm. */
+UNIV_INTERN
+void
+ha_innobase::innobase_initialize_autoinc()
+/*======================================*/
+{
+	ulonglong	auto_inc;
+	const Field*	field = table->found_next_number_field;
+
+	if (field != NULL) {
+		auto_inc = innobase_get_int_col_max_value(field);
+	} else {
+		/* We have no idea what's been passed in to us as the
+		autoinc column. We set it to the 0, effectively disabling
+		updates to the table. */
+		auto_inc = 0;
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Unable to determine the AUTOINC "
+				"column name\n");
+	}
+
+	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+		/* If the recovery level is set so high that writes
+		are disabled we force the AUTOINC counter to 0
+		value effectively disabling writes to the table.
+		Secondly, we avoid reading the table in case the read
+		results in failure due to a corrupted table/index.
+
+		We will not return an error to the client, so that the
+		tables can be dumped with minimal hassle.  If an error
+		were returned in this case, the first attempt to read
+		the table would fail and subsequent SELECTs would succeed. */
+		auto_inc = 0;
+	} else if (field == NULL) {
+		/* This is a far more serious error, best to avoid
+		opening the table and return failure. */
+		my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+	} else {
+		dict_index_t*	index;
+		const char*	col_name;
+		ulonglong	read_auto_inc;
+		ulint		err;
+
+		update_thd(ha_thd());
+
+		ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+		col_name = field->field_name;
+		index = innobase_get_index(table->s->next_number_index);
+
+		/* Execute SELECT MAX(col_name) FROM TABLE; */
+		err = row_search_max_autoinc(index, col_name, &read_auto_inc);
+
+		switch (err) {
+		case DB_SUCCESS: {
+			ulonglong	col_max_value;
+
+			col_max_value = innobase_get_int_col_max_value(field);
+
+			/* At the this stage we do not know the increment
+			nor the offset, so use a default increment of 1. */
+
+			auto_inc = innobase_next_autoinc(
+				read_auto_inc, 1, 1, col_max_value);
+
+			break;
+		}
+		case DB_RECORD_NOT_FOUND:
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: MySQL and InnoDB data "
+				"dictionaries are out of sync.\n"
+				"InnoDB: Unable to find the AUTOINC column "
+				"%s in the InnoDB table %s.\n"
+				"InnoDB: We set the next AUTOINC column "
+				"value to 0,\n"
+				"InnoDB: in effect disabling the AUTOINC "
+				"next value generation.\n"
+				"InnoDB: You can either set the next "
+				"AUTOINC value explicitly using ALTER TABLE\n"
+				"InnoDB: or fix the data dictionary by "
+				"recreating the table.\n",
+				col_name, index->table->name);
+
+			/* This will disable the AUTOINC generation. */
+			auto_inc = 0;
+
+			/* We want the open to succeed, so that the user can
+			take corrective action. ie. reads should succeed but
+			updates should fail. */
+			err = DB_SUCCESS; /* NOTE(review): err is not read
+					  again after this switch */
+			break;
+		default:
+			/* row_search_max_autoinc() should only return
+			one of DB_SUCCESS or DB_RECORD_NOT_FOUND. */
+			ut_error;
+		}
+	}
+
+	dict_table_autoinc_initialize(prebuilt->table, auto_inc);
+}
+
+/*****************************************************************//**
+Creates and opens a handle to a table which already exists in an InnoDB
+database. Acquires the table share, allocates the update/key-value
+buffers, looks the table up in the InnoDB dictionary cache (retrying for
+partitioned tables), creates the prebuilt struct, determines ref_length
+and initializes the AUTOINC counter if the table has such a column.
+@return	0 if success; 1 if the share or the buffers could not be
+obtained; HA_ERR_CRASHED_ON_USAGE if the table is flagged corrupt;
+HA_ERR_NO_SUCH_TABLE if the table or its .ibd file cannot be found */
+UNIV_INTERN
+int
+ha_innobase::open(
+/*==============*/
+	const char*	name,		/*!< in: table name */
+	int		mode,		/*!< in: not used */
+	uint		test_if_locked)	/*!< in: not used */
+{
+	dict_table_t*	ib_table;
+	char		norm_name[1000];
+	THD*		thd;
+	ulint		retries = 0;
+	char*		is_part = NULL;
+
+	DBUG_ENTER("ha_innobase::open");
+
+	UT_NOT_USED(mode);
+	UT_NOT_USED(test_if_locked);
+
+	thd = ha_thd();
+
+	/* Under some cases MySQL seems to call this function while
+	holding btr_search_latch. This breaks the latching order as
+	we acquire dict_sys->mutex below and leads to a deadlock. */
+	if (thd != NULL) {
+		innobase_release_temporary_latches(ht, thd);
+	}
+
+	normalize_table_name(norm_name, name);
+
+	user_thd = NULL;
+
+	if (!(share=get_share(name))) {
+
+		DBUG_RETURN(1);
+	}
+
+	/* The share may already carry a dictionary object from an earlier
+	open; refuse to open it again if it has been flagged corrupt. No
+	dictionary handle has been taken yet at this point. */
+	if (share->ib_table && share->ib_table->is_corrupt) {
+		free_share(share);
+
+		DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+	}
+
+	/* Create buffers for packing the fields of a record. Why
+	table->stored_rec_length did not work here? Obviously, because char
+	fields when packed actually became 1 byte longer, when we also
+	stored the string length as the first byte. */
+
+	upd_and_key_val_buff_len =
+				table->s->stored_rec_length + table->s->max_key_length
+							+ MAX_REF_PARTS * 3;
+	/* upd_buff and key_val_buff come from one my_multi_malloc() call;
+	the error paths below release them by freeing upd_buff only. */
+	if (!(uchar*) my_multi_malloc(MYF(MY_WME),
+			&upd_buff, upd_and_key_val_buff_len,
+			&key_val_buff, upd_and_key_val_buff_len,
+			NullS)) {
+		free_share(share);
+
+		DBUG_RETURN(1);
+	}
+
+	/* We look for pattern #P# to see if the table is partitioned
+	MySQL table. The retry logic for partitioned tables is a
+	workaround for http://bugs.mysql.com/bug.php?id=33349. Look
+	at support issue https://support.mysql.com/view.php?id=21080
+	for more details. */
+	is_part = strstr(norm_name, "#P#");
+retry:
+	/* Get pointer to a table object in InnoDB dictionary cache */
+	ib_table = dict_table_get(norm_name, TRUE);
+
+	if (ib_table && ib_table->is_corrupt) {
+		free_share(share);
+		my_free(upd_buff, MYF(0));
+
+		/* dict_table_get(..., TRUE) above registered a MySQL handle
+		on the table; release it on this error path as well, exactly
+		as the missing-.ibd path below does. */
+		dict_table_decrement_handle_count(ib_table, FALSE);
+
+		DBUG_RETURN(HA_ERR_CRASHED_ON_USAGE);
+	}
+
+	if (share->ib_table) {
+		ut_a(share->ib_table == ib_table);
+	} else {
+		share->ib_table = ib_table;
+	}
+
+	if (NULL == ib_table) {
+		if (is_part && retries < 10) {
+			++retries;
+			os_thread_sleep(100000);
+			goto retry;
+		}
+
+		if (is_part) {
+			/* retries is an ulint; cast it so that the argument
+			matches the %lu conversion on every platform. */
+			sql_print_error("Failed to open table %s after "
+					"%lu attempts.\n", norm_name,
+					(ulong) retries);
+		}
+
+		sql_print_error("Cannot find or open table %s from\n"
+				"the internal data dictionary of InnoDB "
+				"though the .frm file for the\n"
+				"table exists. Maybe you have deleted and "
+				"recreated InnoDB data\n"
+				"files but have forgotten to delete the "
+				"corresponding .frm files\n"
+				"of InnoDB tables, or you have moved .frm "
+				"files to another database?\n"
+				"or, the table contains indexes that this "
+				"version of the engine\n"
+				"doesn't support.\n"
+				"See " REFMAN "innodb-troubleshooting.html\n"
+				"how you can resolve the problem.\n",
+				norm_name);
+		free_share(share);
+		my_free(upd_buff, MYF(0));
+		my_errno = ENOENT;
+
+		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+	}
+
+	if (ib_table->ibd_file_missing && !thd_tablespace_op(thd)) {
+		sql_print_error("MySQL is trying to open a table handle but "
+				"the .ibd file for\ntable %s does not exist.\n"
+				"Have you deleted the .ibd file from the "
+				"database directory under\nthe MySQL datadir, "
+				"or have you used DISCARD TABLESPACE?\n"
+				"See " REFMAN "innodb-troubleshooting.html\n"
+				"how you can resolve the problem.\n",
+				norm_name);
+		free_share(share);
+		my_free(upd_buff, MYF(0));
+		my_errno = ENOENT;
+
+		dict_table_decrement_handle_count(ib_table, FALSE);
+		DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
+	}
+
+	prebuilt = row_create_prebuilt(ib_table);
+
+	prebuilt->mysql_row_len = table->s->stored_rec_length;
+	prebuilt->default_rec = table->s->default_values;
+	ut_ad(prebuilt->default_rec);
+
+	/* Looks like MySQL-3.23 sometimes has primary key number != 0 */
+
+	primary_key = table->s->primary_key;
+	key_used_on_scan = primary_key;
+
+	/* A failure to build the translation table is only logged; the
+	open continues. */
+	if (!innobase_build_index_translation(table, ib_table, share)) {
+		  sql_print_error("Build InnoDB index translation table for"
+				  " Table %s failed", name);
+	}
+
+	/* Allocate a buffer for a 'row reference'. A row reference is
+	a string of bytes of length ref_length which uniquely specifies
+	a row in our table. Note that MySQL may also compare two row
+	references for equality by doing a simple memcmp on the strings
+	of length ref_length! */
+
+	if (!row_table_got_default_clust_index(ib_table)) {
+		prebuilt->clust_index_was_generated = FALSE;
+
+		if (UNIV_UNLIKELY(primary_key >= MAX_KEY)) {
+			sql_print_error("Table %s has a primary key in "
+					"InnoDB data dictionary, but not "
+					"in MySQL!", name);
+
+			/* This mismatch could cause further problems
+			if not attended, bring this to the user's attention
+			by printing a warning in addition to log a message
+			in the errorlog */
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_NO_SUCH_INDEX,
+					    "InnoDB: Table %s has a "
+					    "primary key in InnoDB data "
+					    "dictionary, but not in "
+					    "MySQL!", name);
+
+			/* If primary_key >= MAX_KEY, its (primary_key)
+			value could be out of bound if continue to index
+			into key_info[] array. Find InnoDB primary index,
+			and assign its key_length to ref_length.
+			In addition, since MySQL indexes are sorted starting
+			with primary index, unique index etc., initialize
+			ref_length to the first index key length in
+			case we fail to find InnoDB cluster index.
+
+			Please note, this will not resolve the primary
+			index mismatch problem, other side effects are
+			possible if users continue to use the table.
+			However, we allow this table to be opened so
+			that user can adopt necessary measures for the
+			mismatch while still being accessible to the table
+			data. */
+			ref_length = table->key_info[0].key_length;
+
+			/* Find corresponding cluster index
+			key length in MySQL's key_info[] array */
+			for (ulint i = 0; i < table->s->keys; i++) {
+				dict_index_t*	index;
+				index = innobase_get_index(i);
+				if (dict_index_is_clust(index)) {
+					ref_length =
+						 table->key_info[i].key_length;
+				}
+			}
+		} else {
+			/* MySQL allocates the buffer for ref.
+			key_info->key_length includes space for all key
+			columns + one byte for each column that may be
+			NULL. ref_length must be as exact as possible to
+			save space, because all row reference buffers are
+			allocated based on ref_length. */
+
+			ref_length = table->key_info[primary_key].key_length;
+		}
+	} else {
+		if (primary_key != MAX_KEY) {
+			sql_print_error(
+				"Table %s has no primary key in InnoDB data "
+				"dictionary, but has one in MySQL! If you "
+				"created the table with a MySQL version < "
+				"3.23.54 and did not define a primary key, "
+				"but defined a unique key with all non-NULL "
+				"columns, then MySQL internally treats that "
+				"key as the primary key. You can fix this "
+				"error by dump + DROP + CREATE + reimport "
+				"of the table.", name);
+
+			/* This mismatch could cause further problems
+			if not attended, bring this to the user attention
+			by printing a warning in addition to log a message
+			in the errorlog */
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_NO_SUCH_INDEX,
+					    "InnoDB: Table %s has no "
+					    "primary key in InnoDB data "
+					    "dictionary, but has one in "
+					    "MySQL!", name);
+		}
+
+		prebuilt->clust_index_was_generated = TRUE;
+
+		ref_length = DATA_ROW_ID_LEN;
+
+		/* If we automatically created the clustered index, then
+		MySQL does not know about it, and MySQL must NOT be aware
+		of the index used on scan, to make it avoid checking if we
+		update the column of the index. That is why we warn below
+		if key_used_on_scan is not the undefined value MAX_KEY.
+		The column is the row id in the automatical generation case,
+		and it will never be updated anyway. */
+
+		if (key_used_on_scan != MAX_KEY) {
+			sql_print_warning(
+				"Table %s key_used_on_scan is %lu even "
+				"though there is no primary key inside "
+				"InnoDB.", name, (ulong) key_used_on_scan);
+		}
+	}
+
+	/* Index block size in InnoDB: used by MySQL in query optimization */
+	stats.block_size = 16 * 1024;
+
+	/* Init table lock structure */
+	thr_lock_data_init(&share->lock,&lock,(void*) 0);
+
+	if (prebuilt->table) {
+		/* We update the highest file format in the system table
+		space, if this table has higher file format setting. */
+
+		trx_sys_file_format_max_upgrade(
+			(const char**) &innobase_file_format_check,
+			dict_table_get_format(prebuilt->table));
+	}
+
+	/* Only if the table has an AUTOINC column. */
+	if (prebuilt->table != NULL && table->found_next_number_field != NULL) {
+		dict_table_autoinc_lock(prebuilt->table);
+
+		/* Since a table can already be "open" in InnoDB's internal
+		data dictionary, we only init the autoinc counter once, the
+		first time the table is loaded. We can safely reuse the
+		autoinc value from a previous MySQL open. */
+		if (dict_table_autoinc_read(prebuilt->table) == 0) {
+
+			innobase_initialize_autoinc();
+		}
+
+		dict_table_autoinc_unlock(prebuilt->table);
+	}
+
+	info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
+
+	DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+Reports the longest key part length this handler accepts.
+@return	DICT_MAX_INDEX_COL_LEN - 1 */
+UNIV_INTERN
+uint
+ha_innobase::max_supported_key_part_length() const
+/*==============================================*/
+{
+	const uint	longest_key_part = DICT_MAX_INDEX_COL_LEN - 1;
+
+	return(longest_key_part);
+}
+
+/******************************************************************//**
+Closes a handle to an InnoDB table: releases any temporary latches of
+the current thread, frees the prebuilt struct, the update buffer and the
+table share, and finally wakes the InnoDB master thread.
+@return	0 */
+UNIV_INTERN
+int
+ha_innobase::close(void)
+/*====================*/
+{
+	DBUG_ENTER("ha_innobase::close");
+
+	THD* const	cur_thd = ha_thd();
+
+	if (NULL != cur_thd) {
+		innobase_release_temporary_latches(ht, cur_thd);
+	}
+
+	/* Release everything that the handler acquired for this table. */
+	row_prebuilt_free(prebuilt, FALSE);
+	my_free(upd_buff, MYF(0));
+	free_share(share);
+
+	/* Closing a table may leave work for the utility threads, so
+	nudge the master thread. */
+	srv_active_wake_master_thread();
+
+	DBUG_RETURN(0);
+}
+
+/* The following accessor functions should really be inside MySQL code! */
+
+/**************************************************************//**
+Computes where a field starts inside a MySQL row, as the distance of
+field->ptr from the start of table->record[0].
+@return	offset */
+static inline
+uint
+get_field_offset(
+/*=============*/
+	TABLE*	table,	/*!< in: MySQL table object */
+	Field*	field)	/*!< in: MySQL field object */
+{
+	return(static_cast<uint>(field->ptr - table->record[0]));
+}
+
+/**************************************************************//**
+Tests whether a field is SQL NULL in the given record. The position of
+the null bit is derived from the record format information in table.
+@return	1 if NULL, 0 otherwise */
+static inline
+uint
+field_in_record_is_null(
+/*====================*/
+	TABLE*	table,	/*!< in: MySQL table object */
+	Field*	field,	/*!< in: MySQL field object */
+	char*	record)	/*!< in: a row in MySQL format */
+{
+	/* A field without a null pointer can never be NULL. */
+	if (field->null_ptr == NULL) {
+
+		return(0);
+	}
+
+	/* Distance of the null byte from the start of record[0]; the same
+	distance applies inside 'record'. */
+	const int	null_byte_pos = (uint) ((char*) field->null_ptr
+					- (char*) table->record[0]);
+
+	return((record[null_byte_pos] & field->null_bit) ? 1 : 0);
+}
+
+/**************************************************************//**
+Marks a field as SQL NULL in the given record by setting its null bit.
+The position of the null bit is derived from the record format
+information in table. */
+static inline
+void
+set_field_in_record_to_null(
+/*========================*/
+	TABLE*	table,	/*!< in: MySQL table object */
+	Field*	field,	/*!< in: MySQL field object */
+	char*	record)	/*!< in: a row in MySQL format */
+{
+	/* Distance of the null byte from the start of record[0]; the same
+	distance applies inside 'record'. */
+	const int	null_byte_pos = (uint) ((char*) field->null_ptr
+					- (char*) table->record[0]);
+
+	record[null_byte_pos] |= field->null_bit;
+}
+
+/*************************************************************//**
+Compares two data fields whose data type requires the MySQL collation
+code. NOTE that the prototype of this function is in rem0cmp.c in InnoDB
+source code! If you change this function, remember to update the
+prototype there! Only BIT, string and BLOB types are accepted; any other
+type triggers ut_error.
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+extern "C" UNIV_INTERN
+int
+innobase_mysql_cmp(
+/*===============*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+{
+	DBUG_ASSERT(a_length != UNIV_SQL_NULL);
+	DBUG_ASSERT(b_length != UNIV_SQL_NULL);
+
+	/* Reject every type that is not compared through a charset. */
+	switch ((enum_field_types) mysql_type) {
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+	case MYSQL_TYPE_VAR_STRING:
+	case MYSQL_TYPE_VARCHAR:
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+		break;
+	default:
+		ut_error;
+		return(0);
+	}
+
+	/* Map the charset number to its charset struct. get_charset() may
+	be slow because of a mutex operation, so the two most common
+	charsets are checked directly first. */
+	CHARSET_INFO*	cs;
+
+	if (default_charset_info->number == charset_number) {
+		cs = default_charset_info;
+	} else if (my_charset_latin1.number == charset_number) {
+		cs = &my_charset_latin1;
+	} else {
+		cs = get_charset(charset_number, MYF(MY_WME));
+
+		if (cs == NULL) {
+			sql_print_error("InnoDB needs charset %lu for doing "
+					"a comparison, but MySQL cannot "
+					"find that charset.",
+					(ulong) charset_number);
+			ut_a(0);
+		}
+	}
+
+	/* Starting from 4.1.3, strnncollsp() is used in comparisons of
+	non-latin1_swedish_ci strings. NOTE that the collation order
+	changes then: 'b\0\0...' is ordered BEFORE 'b  ...'. Users
+	having indexes on such data need to rebuild their tables! */
+	const int	cmp = cs->coll->strnncollsp(cs,
+						    a, a_length,
+						    b, b_length, 0);
+
+	/* Normalize the collation result to -1, 0 or 1. */
+	return((cmp > 0) - (cmp < 0));
+}
+
+/**************************************************************//**
+Maps a MySQL field type to the InnoDB 'mtype'. InnoDB tells MySQL's old
+<= 4.1 VARCHAR apart from the new true VARCHAR in >= 5.0.3 by the
+'prtype', not by the value returned here.
+@return	DATA_BINARY, DATA_VARCHAR, ...; 0 for MYSQL_TYPE_NULL */
+extern "C" UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+	ulint*		unsigned_flag,	/*!< out: DATA_UNSIGNED if an
+					'unsigned type';
+					at least ENUM and SET,
+					and unsigned integer
+					types are 'unsigned types' */
+	const void*	f)		/*!< in: MySQL Field */
+{
+	const class Field* field = reinterpret_cast<const class Field*>(f);
+
+	/* The MySQL type code must fit in 8 bits: this is relied upon in
+	ibuf and also when DATA_NOT_NULL is ORed to the type */
+
+	DBUG_ASSERT((ulint)MYSQL_TYPE_STRING < 256);
+	DBUG_ASSERT((ulint)MYSQL_TYPE_VAR_STRING < 256);
+	DBUG_ASSERT((ulint)MYSQL_TYPE_DOUBLE < 256);
+	DBUG_ASSERT((ulint)MYSQL_TYPE_FLOAT < 256);
+	DBUG_ASSERT((ulint)MYSQL_TYPE_DECIMAL < 256);
+
+	*unsigned_flag = (field->flags & UNSIGNED_FLAG) ? DATA_UNSIGNED : 0;
+
+	const enum_field_types	real_tp = field->real_type();
+
+	if (real_tp == MYSQL_TYPE_ENUM || real_tp == MYSQL_TYPE_SET) {
+
+		/* field->type() reports a string type for these, but the
+		data is internally stored as an unsigned integer code, and
+		MySQL leaves its own unsigned flag at zero. */
+
+		*unsigned_flag = DATA_UNSIGNED;
+
+		return(DATA_INT);
+	}
+
+	switch (field->type()) {
+		/* NOTE that we only allow string types in DATA_MYSQL and
+		DATA_VARMYSQL */
+	case MYSQL_TYPE_VAR_STRING: /* old <= 4.1 VARCHAR */
+	case MYSQL_TYPE_VARCHAR:    /* new >= 5.0.3 true VARCHAR */
+		if (field->binary()) {
+			return(DATA_BINARY);
+		}
+
+		return(strcmp(field->charset()->name,
+			      "latin1_swedish_ci") == 0
+		       ? DATA_VARCHAR
+		       : DATA_VARMYSQL);
+	case MYSQL_TYPE_BIT:
+	case MYSQL_TYPE_STRING:
+		if (field->binary()) {
+			return(DATA_FIXBINARY);
+		}
+
+		return(strcmp(field->charset()->name,
+			      "latin1_swedish_ci") == 0
+		       ? DATA_CHAR
+		       : DATA_MYSQL);
+	case MYSQL_TYPE_NEWDECIMAL:
+		return(DATA_FIXBINARY);
+	case MYSQL_TYPE_TINY:
+	case MYSQL_TYPE_SHORT:
+	case MYSQL_TYPE_INT24:
+	case MYSQL_TYPE_LONG:
+	case MYSQL_TYPE_LONGLONG:
+	case MYSQL_TYPE_YEAR:
+	case MYSQL_TYPE_DATE:
+	case MYSQL_TYPE_NEWDATE:
+	case MYSQL_TYPE_TIME:
+	case MYSQL_TYPE_DATETIME:
+	case MYSQL_TYPE_TIMESTAMP:
+		return(DATA_INT);
+	case MYSQL_TYPE_FLOAT:
+		return(DATA_FLOAT);
+	case MYSQL_TYPE_DOUBLE:
+		return(DATA_DOUBLE);
+	case MYSQL_TYPE_DECIMAL:
+		return(DATA_DECIMAL);
+	case MYSQL_TYPE_TINY_BLOB:
+	case MYSQL_TYPE_BLOB:
+	case MYSQL_TYPE_MEDIUM_BLOB:
+	case MYSQL_TYPE_LONG_BLOB:
+	case MYSQL_TYPE_GEOMETRY:
+		return(DATA_BLOB);
+	case MYSQL_TYPE_NULL:
+		/* MySQL currently accepts "NULL" datatype, but will
+		reject such datatype in the next release. We will cope
+		with it and not trigger assertion failure in 5.1 */
+		break;
+	default:
+		ut_error;
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Stores an unsigned integer value < 64k into 2 bytes, least significant
+byte first (little-endian storage format). */
+static inline
+void
+innobase_write_to_2_little_endian(
+/*==============================*/
+	byte*	buf,	/*!< in: where to store */
+	ulint	val)	/*!< in: value to write, must be < 64k */
+{
+	ut_a(val < 256 * 256);
+
+	const byte	low_byte  = (byte) val;
+	const byte	high_byte = (byte)(val >> 8);
+
+	buf[0] = low_byte;
+	buf[1] = high_byte;
+}
+
+/*******************************************************************//**
+Loads an unsigned integer value < 64k from 2 bytes stored least
+significant byte first (little-endian storage format).
+@return	value */
+static inline
+uint
+innobase_read_from_2_little_endian(
+/*===============================*/
+	const uchar*	buf)	/*!< in: from where to read */
+{
+	const ulint	low_byte  = buf[0];
+	const ulint	high_byte = buf[1];
+
+	return((uint) (low_byte | (high_byte << 8)));
+}
+
+/*******************************************************************//**
+Stores a key value for a row to a buffer. The buffer is zero-filled
+first and every key part of index 'keynr' is then appended in the MySQL
+key value format described in the body. The function asserts (after the
+copy) that it did not advance past buff + buff_len.
+@return	key value length as stored in buff */
+UNIV_INTERN
+uint
+ha_innobase::store_key_val_for_row(
+/*===============================*/
+	uint		keynr,	/*!< in: key number */
+	char*		buff,	/*!< in/out: buffer for the key value (in MySQL
+				format) */
+	uint		buff_len,/*!< in: buffer length */
+	const uchar*	record)/*!< in: row in MySQL format */
+{
+	KEY*		key_info	= table->key_info + keynr;
+	KEY_PART_INFO*	key_part	= key_info->key_part;
+	KEY_PART_INFO*	end		= key_part + key_info->key_parts;
+	char*		buff_start	= buff;
+	enum_field_types mysql_type;
+	Field*		field;
+	ibool		is_null;
+
+	DBUG_ENTER("store_key_val_for_row");
+
+	/* The format for storing a key field in MySQL is the following:
+
+	1. If the column can be NULL, then in the first byte we put 1 if the
+	field value is NULL, 0 otherwise.
+
+	2. If the column is of a BLOB type (it must be a column prefix field
+	in this case), then we put the length of the data in the field to the
+	next 2 bytes, in the little-endian format. If the field is SQL NULL,
+	then these 2 bytes are set to 0. Note that the length of data in the
+	field is <= column prefix length.
+
+	3. In a column prefix field, prefix_len next bytes are reserved for
+	data. In a normal field the max field length next bytes are reserved
+	for data. For a VARCHAR(n) the max field length is n. If the stored
+	value is the SQL NULL then these data bytes are set to 0.
+
+	4. We always use a 2 byte length for a true >= 5.0.3 VARCHAR. Note that
+	in the MySQL row format, the length is stored in 1 or 2 bytes,
+	depending on the maximum allowed length. But in the MySQL key value
+	format, the length always takes 2 bytes.
+
+	We have to zero-fill the buffer so that MySQL is able to use a
+	simple memcmp to compare two key values to determine if they are
+	equal. MySQL does this to compare contents of two 'ref' values. */
+
+	bzero(buff, buff_len);
+
+	for (; key_part != end; key_part++) {
+		is_null = FALSE;
+
+		/* Nullable key part: emit the NULL indicator byte first
+		(rule 1 above). */
+		if (key_part->null_bit) {
+			if (record[key_part->null_offset]
+						& key_part->null_bit) {
+				*buff = 1;
+				is_null = TRUE;
+			} else {
+				*buff = 0;
+			}
+			buff++;
+		}
+
+		field = key_part->field;
+		mysql_type = field->type();
+
+		if (mysql_type == MYSQL_TYPE_VARCHAR) {
+						/* >= 5.0.3 true VARCHAR */
+			ulint		lenlen;
+			ulint		len;
+			const byte*	data;
+			ulint		key_len;
+			ulint		true_len;
+			CHARSET_INFO*	cs;
+			int		error=0;
+
+			key_len = key_part->length;
+
+			/* SQL NULL: skip the 2 length bytes and the data
+			bytes, which stay zero from the bzero() above. */
+			if (is_null) {
+				buff += key_len + 2;
+
+				continue;
+			}
+			cs = field->charset();
+
+			lenlen = (ulint)
+				(((Field_varstring*)field)->length_bytes);
+
+			data = row_mysql_read_true_varchar(&len,
+				(byte*) (record
+				+ (ulint)get_field_offset(table, field)),
+				lenlen);
+
+			true_len = len;
+
+			/* For multi byte character sets we need to calculate
+			the true length of the key */
+
+			if (len > 0 && cs->mbmaxlen > 1) {
+				true_len = (ulint) cs->cset->well_formed_len(cs,
+						(const char *) data,
+						(const char *) data + len,
+                                                (uint) (key_len /
+                                                        cs->mbmaxlen),
+						&error);
+			}
+
+			/* In a column prefix index, we may need to truncate
+			the stored value: */
+
+			if (true_len > key_len) {
+				true_len = key_len;
+			}
+
+			/* The length in a key value is always stored in 2
+			bytes */
+
+			row_mysql_store_true_var_len((byte*)buff, true_len, 2);
+			buff += 2;
+
+			memcpy(buff, data, true_len);
+
+			/* Note that we always reserve the maximum possible
+			length of the true VARCHAR in the key value, though
+			only len first bytes after the 2 length bytes contain
+			actual data. The rest of the space was reset to zero
+			in the bzero() call above. */
+
+			buff += key_len;
+
+		} else if (mysql_type == MYSQL_TYPE_TINY_BLOB
+			|| mysql_type == MYSQL_TYPE_MEDIUM_BLOB
+			|| mysql_type == MYSQL_TYPE_BLOB
+			|| mysql_type == MYSQL_TYPE_LONG_BLOB
+			/* MYSQL_TYPE_GEOMETRY data is treated
+			as BLOB data in innodb. */
+			|| mysql_type == MYSQL_TYPE_GEOMETRY) {
+
+			CHARSET_INFO*	cs;
+			ulint		key_len;
+			ulint		true_len;
+			int		error=0;
+			ulint		blob_len;
+			const byte*	blob_data;
+
+			/* A BLOB key part must be a column prefix. */
+			ut_a(key_part->key_part_flag & HA_PART_KEY_SEG);
+
+			key_len = key_part->length;
+
+			/* SQL NULL: skip the 2 length bytes and the data
+			bytes, which stay zero from the bzero() above. */
+			if (is_null) {
+				buff += key_len + 2;
+
+				continue;
+			}
+
+			cs = field->charset();
+
+			blob_data = row_mysql_read_blob_ref(&blob_len,
+				(byte*) (record
+				+ (ulint)get_field_offset(table, field)),
+					(ulint) field->pack_length());
+
+			true_len = blob_len;
+
+			ut_a(get_field_offset(table, field)
+				== key_part->offset);
+
+			/* For multi byte character sets we need to calculate
+			the true length of the key */
+
+			if (blob_len > 0 && cs->mbmaxlen > 1) {
+				true_len = (ulint) cs->cset->well_formed_len(cs,
+						(const char *) blob_data,
+						(const char *) blob_data
+							+ blob_len,
+                                                (uint) (key_len /
+                                                        cs->mbmaxlen),
+						&error);
+			}
+
+			/* All indexes on BLOB and TEXT are column prefix
+			indexes, and we may need to truncate the data to be
+			stored in the key value: */
+
+			if (true_len > key_len) {
+				true_len = key_len;
+			}
+
+			/* MySQL reserves 2 bytes for the length and the
+			storage of the number is little-endian */
+
+			innobase_write_to_2_little_endian(
+					(byte*)buff, true_len);
+			buff += 2;
+
+			memcpy(buff, blob_data, true_len);
+
+			/* Note that we always reserve the maximum possible
+			length of the BLOB prefix in the key value. */
+
+			buff += key_len;
+		} else {
+			/* Here we handle all other data types except the
+			true VARCHAR, BLOB and TEXT. Note that the column
+			value we store may be also in a column prefix
+			index. */
+
+			CHARSET_INFO*		cs;
+			ulint			true_len;
+			ulint			key_len;
+			const uchar*		src_start;
+			int			error=0;
+			enum_field_types	real_type;
+
+			key_len = key_part->length;
+
+			/* SQL NULL: no length bytes in this format, only
+			the data bytes are skipped and stay zero. */
+			if (is_null) {
+				 buff += key_len;
+
+				 continue;
+			}
+
+			src_start = record + key_part->offset;
+			real_type = field->real_type();
+			true_len = key_len;
+
+			/* Character set for the field is defined only
+			to fields whose type is string and real field
+			type is not enum or set. For these fields check
+			if character set is multi byte. */
+
+			if (real_type != MYSQL_TYPE_ENUM
+				&& real_type != MYSQL_TYPE_SET
+				&& ( mysql_type == MYSQL_TYPE_VAR_STRING
+					|| mysql_type == MYSQL_TYPE_STRING)) {
+
+				cs = field->charset();
+
+				/* For multi byte character sets we need to
+				calculate the true length of the key */
+
+				if (key_len > 0 && cs->mbmaxlen > 1) {
+
+					true_len = (ulint)
+						cs->cset->well_formed_len(cs,
+							(const char *)src_start,
+							(const char *)src_start
+								+ key_len,
+                                                        (uint) (key_len /
+                                                                cs->mbmaxlen),
+							&error);
+				}
+			}
+
+			memcpy(buff, src_start, true_len);
+			buff += true_len;
+
+			/* Pad the unused space with spaces. Note that no
+			padding is ever needed for UCS-2 because in MySQL,
+			all UCS2 characters are 2 bytes, as MySQL does not
+			support surrogate pairs, which are needed to represent
+			characters in the range U+10000 to U+10FFFF. */
+
+			if (true_len < key_len) {
+				ulint pad_len = key_len - true_len;
+				memset(buff, ' ', pad_len);
+				buff += pad_len;
+			}
+		}
+	}
+
+	/* Sanity check, made after the writes: the cursor must not have
+	moved past the end of the caller's buffer. */
+	ut_a(buff <= buff_start + buff_len);
+
+	DBUG_RETURN((uint)(buff - buff_start));
+}
+
+/**************************************************************//**
+Builds a 'template' to the prebuilt struct. The template is used in fast
+retrieval of just those column values MySQL needs in its processing.
+
+When an index condition has been pushed down for the active index, the
+MySQL fields are walked twice: the first pass collects the requested
+fields covered by the active index (prebuilt->n_index_fields of them),
+the second pass appends the remaining requested fields. Only stored
+columns have an InnoDB column number, so the InnoDB column counter
+(innodb_idx) advances only for fields with stored_in_db set. */
+static
+void
+build_template(
+/*===========*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct */
+	THD*		thd,		/*!< in: current user thread, used
+					only if templ_type is
+					ROW_MYSQL_REC_FIELDS */
+	TABLE*		table,		/*!< in: MySQL table */
+	ha_innobase*	file,		/*!< in: ha_innobase handler */
+	uint		templ_type)	/*!< in: ROW_MYSQL_WHOLE_ROW or
+					ROW_MYSQL_REC_FIELDS */
+{
+	dict_index_t*	index;
+	dict_index_t*	clust_index;
+	mysql_row_templ_t* templ;
+	Field*		field;
+	ulint		n_fields, n_stored_fields;
+	ulint		n_requested_fields	= 0;
+	ibool		fetch_all_in_key	= FALSE;
+	ibool		fetch_primary_key_cols	= FALSE;
+	ulint		sql_idx, innodb_idx=0;
+	/* byte offset of the end of last requested column */
+	ulint		mysql_prefix_len	= 0;
+	ibool		do_idx_cond_push	= FALSE;
+	ibool		need_second_pass	= FALSE;
+
+	if (prebuilt->select_lock_type == LOCK_X) {
+		/* We always retrieve the whole clustered index record if we
+		use exclusive row level locks, for example, if the read is
+		done in an UPDATE statement. */
+
+		templ_type = ROW_MYSQL_WHOLE_ROW;
+	}
+
+	if (templ_type == ROW_MYSQL_REC_FIELDS) {
+		if (prebuilt->hint_need_to_fetch_extra_cols
+			== ROW_RETRIEVE_ALL_COLS) {
+
+			/* We know we must at least fetch all columns in the
+			key, or all columns in the table */
+
+			if (prebuilt->read_just_key) {
+				/* MySQL has instructed us that it is enough
+				to fetch the columns in the key; looks like
+				MySQL can set this flag also when there is
+				only a prefix of the column in the key: in
+				that case we retrieve the whole column from
+				the clustered index */
+
+				fetch_all_in_key = TRUE;
+			} else {
+				templ_type = ROW_MYSQL_WHOLE_ROW;
+			}
+		} else if (prebuilt->hint_need_to_fetch_extra_cols
+			== ROW_RETRIEVE_PRIMARY_KEY) {
+			/* We must at least fetch all primary key cols. Note
+			   that if the clustered index was internally generated
+			   by InnoDB on the row id (no primary key was
+			   defined), then row_search_for_mysql() will always
+			   retrieve the row id to a special buffer in the
+			   prebuilt struct. */
+
+			fetch_primary_key_cols = TRUE;
+		}
+	}
+
+	clust_index = dict_table_get_first_index(prebuilt->table);
+
+	if (templ_type == ROW_MYSQL_REC_FIELDS) {
+		index = prebuilt->index;
+	} else {
+		index = clust_index;
+	}
+
+	if (index == clust_index) {
+		prebuilt->need_to_access_clustered = TRUE;
+	} else {
+		prebuilt->need_to_access_clustered = FALSE;
+		/* Below we check column by column if we need to access
+		the clustered index */
+	}
+
+	n_fields = (ulint)table->s->fields; /* number of columns */
+	n_stored_fields= (ulint)table->s->stored_fields; /* number of stored columns */
+
+	if (!prebuilt->mysql_template) {
+		prebuilt->mysql_template = (mysql_row_templ_t*)
+			mem_alloc(n_stored_fields * sizeof(mysql_row_templ_t));
+	}
+
+	prebuilt->template_type = templ_type;
+	prebuilt->null_bitmap_len = table->s->null_bytes;
+
+	prebuilt->templ_contains_blob = FALSE;
+
+	/* Setup index condition pushdown (note: we don't need to check if
+	this is a scan on primary key as that is checked in idx_cond_push) */
+	if (file->active_index == file->pushed_idx_cond_keyno
+	    && file->active_index != MAX_KEY
+	    && templ_type == ROW_MYSQL_REC_FIELDS) {
+		do_idx_cond_push = need_second_pass = TRUE;
+	}
+
+	/* Note that in InnoDB, innodb_idx is the column number. MySQL calls
+	columns 'fields'; sql_idx is the MySQL field number. */
+	for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+		templ = prebuilt->mysql_template + n_requested_fields;
+		field = table->field[sql_idx];
+		if (!field->stored_in_db)
+		  goto skip_field;
+
+		if (UNIV_LIKELY(templ_type == ROW_MYSQL_REC_FIELDS)) {
+			/* Decide which columns we should fetch
+			and which we can skip. */
+			register const ibool	index_contains_field =
+				dict_index_contains_col_or_prefix(index, innodb_idx);
+			register const ibool	index_covers_field =
+				field->part_of_key.is_set(file->active_index);
+
+			if (!index_contains_field && prebuilt->read_just_key) {
+				/* If this is a 'key read', we do not need
+				columns that are not in the key */
+
+				goto skip_field;
+			}
+
+			if (index_contains_field && fetch_all_in_key) {
+				/* This field is needed in the query */
+
+				goto include_field;
+			}
+
+			if (bitmap_is_set(table->read_set, sql_idx) ||
+			    bitmap_is_set(table->write_set, sql_idx)) {
+				/* This field is needed in the query */
+
+				goto include_field;
+			}
+
+			if (fetch_primary_key_cols
+				&& dict_table_col_in_clustered_key(
+					index->table, innodb_idx)) {
+				/* This field is needed in the query */
+
+				goto include_field;
+			}
+
+			/* This field is not needed in the query, skip it */
+
+			goto skip_field;
+include_field:
+			/* With a pushed index condition, the first pass
+			takes only the fields covered by the active index
+			and the second pass takes only the others. */
+			if (do_idx_cond_push &&
+			    ((need_second_pass && !index_covers_field) ||
+			     (!need_second_pass && index_covers_field)))
+			  goto skip_field;
+		}
+		n_requested_fields++;
+
+		templ->col_no = innodb_idx;
+
+		if (index == clust_index) {
+			templ->rec_field_no = dict_col_get_clust_pos(
+				&index->table->cols[innodb_idx], index);
+		} else {
+			templ->rec_field_no = dict_index_get_nth_col_pos(
+								index, innodb_idx);
+		}
+
+		if (templ->rec_field_no == ULINT_UNDEFINED) {
+			prebuilt->need_to_access_clustered = TRUE;
+		}
+
+		if (field->null_ptr) {
+			templ->mysql_null_byte_offset =
+				(ulint) ((char*) field->null_ptr
+					- (char*) table->record[0]);
+
+			templ->mysql_null_bit_mask = (ulint) field->null_bit;
+		} else {
+			templ->mysql_null_bit_mask = 0;
+		}
+
+		templ->mysql_col_offset = (ulint)
+					get_field_offset(table, field);
+
+		templ->mysql_col_len = (ulint) field->pack_length();
+		if (mysql_prefix_len < templ->mysql_col_offset
+				+ templ->mysql_col_len) {
+			mysql_prefix_len = templ->mysql_col_offset
+				+ templ->mysql_col_len;
+		}
+		templ->type = index->table->cols[innodb_idx].mtype;
+		templ->mysql_type = (ulint)field->type();
+
+		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+			templ->mysql_length_bytes = (ulint)
+				(((Field_varstring*)field)->length_bytes);
+		}
+
+		templ->charset = dtype_get_charset_coll(
+				index->table->cols[innodb_idx].prtype);
+		templ->mbminlen = index->table->cols[innodb_idx].mbminlen;
+		templ->mbmaxlen = index->table->cols[innodb_idx].mbmaxlen;
+		templ->is_unsigned = index->table->cols[innodb_idx].prtype
+							& DATA_UNSIGNED;
+		if (templ->type == DATA_BLOB) {
+			prebuilt->templ_contains_blob = TRUE;
+		}
+skip_field:
+		if (need_second_pass && (sql_idx + 1 == n_fields)) {
+			/* End of the first pass: remember how many template
+			entries belong to the index and restart from the
+			first field. sql_idx wraps around to 0 through the
+			loop increment. innodb_idx is reset to 0 explicitly:
+			relying on the increment below would leave it at
+			~0 whenever the last MySQL field is not stored in
+			the database, shifting every column number of the
+			second pass by one. */
+			prebuilt->n_index_fields = n_requested_fields;
+			need_second_pass = FALSE;
+			sql_idx = (~(ulint)0);
+			innodb_idx = 0;
+			continue;
+		}
+		if (field->stored_in_db) {
+			innodb_idx++;
+		}
+	}
+
+	prebuilt->n_template = n_requested_fields;
+	prebuilt->mysql_prefix_len = mysql_prefix_len;
+
+	if (do_idx_cond_push) {
+		prebuilt->idx_cond_func = index_cond_func_innodb;
+		prebuilt->idx_cond_func_arg = file;
+	} else {
+		prebuilt->idx_cond_func = NULL;
+		prebuilt->n_index_fields = n_requested_fields;
+	}
+
+	if (index != clust_index && prebuilt->need_to_access_clustered) {
+		/* Change rec_field_no's to correspond to the clustered index
+		record. With a pushed index condition the first
+		n_index_fields entries keep their secondary index positions. */
+		for (ulint i = do_idx_cond_push ? prebuilt->n_index_fields : 0;
+		     i < n_requested_fields; i++) {
+			templ = prebuilt->mysql_template + i;
+			templ->rec_field_no = dict_col_get_clust_pos(
+				&index->table->cols[templ->col_no],
+				clust_index);
+		}
+	}
+}
+
+/********************************************************************//**
+Acquires the AUTOINC protection selected by innobase_autoinc_lock_mode
+for prebuilt->table.
+This special handling exists to overcome the limitations of MySQL's
+binlogging: the non-determinism that would arise in INSERT ... SELECT
+type of statements has to be eliminated, because the MySQL binlog only
+stores the min value of the autoinc interval. Once that is fixed the
+special lock handling can be removed.
+On every path that returns DB_SUCCESS, dict_table_autoinc_lock() has been
+called on prebuilt->table and not yet undone; the caller is expected to
+call dict_table_autoinc_unlock().
+@return	DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_lock_autoinc(void)
+/*====================================*/
+{
+	ulint		err = DB_SUCCESS;
+
+	switch (innobase_autoinc_lock_mode) {
+	case AUTOINC_NO_LOCKING:
+		/* The AUTOINC mutex alone is enough in this mode. */
+		dict_table_autoinc_lock(prebuilt->table);
+		break;
+
+	case AUTOINC_NEW_STYLE_LOCKING:
+		/* For simple (single/multi) row INSERTs, we fall back to the
+		old style only if another transaction has already acquired
+		the AUTOINC lock on behalf of a LOAD FILE or INSERT ... SELECT
+		etc. type of statement. */
+		if (thd_sql_command(user_thd) == SQLCOM_INSERT
+		    || thd_sql_command(user_thd) == SQLCOM_REPLACE) {
+			dict_table_t*	ib_table = prebuilt->table;
+
+			/* Take the AUTOINC mutex first ... */
+			dict_table_autoinc_lock(ib_table);
+
+			/* ... then check whether some other transaction is
+			waiting for, or already holds, the AUTOINC lock on
+			the table. */
+			if (!ib_table->n_waiting_or_granted_auto_inc_locks) {
+				/* Nobody is: the mutex is all we need. */
+				break;
+			}
+
+			/* Give the mutex back to avoid deadlocks, and use
+			the old style locking below instead. */
+			dict_table_autoinc_unlock(ib_table);
+		}
+		/* Fall through to old style locking. */
+
+	case AUTOINC_OLD_STYLE_LOCKING:
+		err = row_lock_table_autoinc_for_mysql(prebuilt);
+
+		if (err == DB_SUCCESS) {
+
+			/* The table level lock was obtained; now take the
+			AUTOINC mutex as well. */
+			dict_table_autoinc_lock(prebuilt->table);
+		}
+		break;
+
+	default:
+		ut_error;
+	}
+
+	return(ulong(err));
+}
+
+/********************************************************************//**
+Reset the autoinc value in the table: take the AUTOINC protection via
+innobase_lock_autoinc(), (re)initialize the counter of prebuilt->table
+to the given value and release the AUTOINC mutex again. Nothing is
+changed if the locking step fails.
+@return	DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_reset_autoinc(
+/*================================*/
+	ulonglong	autoinc)	/*!< in: value to store */
+{
+	const ulint	lock_err = innobase_lock_autoinc();
+
+	if (lock_err != DB_SUCCESS) {
+		/* Could not lock: leave the counter untouched. */
+		return(ulong(lock_err));
+	}
+
+	dict_table_autoinc_initialize(prebuilt->table, autoinc);
+	dict_table_autoinc_unlock(prebuilt->table);
+
+	return(ulong(lock_err));
+}
+
+/********************************************************************//**
+Store the autoinc value in the table. The value is handed to
+dict_table_autoinc_update_if_greater(), i.e. it is only meant to take
+effect if it is greater than the existing autoinc value in the table.
+Nothing is changed if innobase_lock_autoinc() fails.
+@return	DB_SUCCESS if all went well else error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_set_max_autoinc(
+/*==================================*/
+	ulonglong	auto_inc)	/*!< in: value to store */
+{
+	const ulint	lock_err = innobase_lock_autoinc();
+
+	if (lock_err != DB_SUCCESS) {
+		/* Could not lock: leave the counter untouched. */
+		return(ulong(lock_err));
+	}
+
+	dict_table_autoinc_update_if_greater(prebuilt->table, auto_inc);
+	dict_table_autoinc_unlock(prebuilt->table);
+
+	return(ulong(lock_err));
+}
+
+/********************************************************************//**
+Stores a row in an InnoDB database, to the table specified in this
+handle.
+Besides the insert itself (row_insert_for_mysql()) this function:
+- re-issues locks after the intermediate commit that is done every 10000
+  rows copied by ALTER TABLE / OPTIMIZE / CREATE INDEX / DROP INDEX,
+- lets update_auto_increment() fill in the auto-increment column, and
+- afterwards pushes the table autoinc counter forward when needed.
+@return	error code: HA_ERR_CRASHED if share->ib_table->is_corrupt is set
+(checked both before and after the insert), ER_AUTOINC_READ_FAILED or the
+error returned by update_auto_increment() if that step fails, otherwise
+the result of convert_error_code_to_mysql() */
+UNIV_INTERN
+int
+ha_innobase::write_row(
+/*===================*/
+	uchar*	record)	/*!< in: a row in MySQL format */
+{
+	ulint		error = 0;
+        int             error_result= 0;
+	ibool		auto_inc_used= FALSE;
+	ulint		sql_command;
+	trx_t*		trx = thd_to_trx(user_thd);
+
+	DBUG_ENTER("ha_innobase::write_row");
+
+	/* Sanity check: the transaction cached in the prebuilt struct must
+	be the transaction of the current thread. On a mismatch, dump the
+	memory around both objects for diagnostics and hit ut_error. */
+	if (prebuilt->trx != trx) {
+	  sql_print_error("The transaction object for the table handle is at "
+			  "%p, but for the current thread it is at %p",
+			  (const void*) prebuilt->trx, (const void*) trx);
+
+		fputs("InnoDB: Dump of 200 bytes around prebuilt: ", stderr);
+		ut_print_buf(stderr, ((const byte*)prebuilt) - 100, 200);
+		fputs("\n"
+			"InnoDB: Dump of 200 bytes around ha_data: ",
+			stderr);
+		ut_print_buf(stderr, ((const byte*) trx) - 100, 200);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	ha_statistic_increment(&SSV::ha_write_count);
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+		table->timestamp_field->set_time();
+
+	sql_command = thd_sql_command(user_thd);
+
+	if ((sql_command == SQLCOM_ALTER_TABLE
+	     || sql_command == SQLCOM_OPTIMIZE
+	     || sql_command == SQLCOM_CREATE_INDEX
+	     || sql_command == SQLCOM_DROP_INDEX)
+	    && num_write_row >= 10000) {
+		/* ALTER TABLE is COMMITted at every 10000 copied rows.
+		The IX table lock for the original table has to be re-issued.
+		As this method will be called on a temporary table where the
+		contents of the original table is being copied to, it is
+		a bit tricky to determine the source table.  The cursor
+		position in the source table need not be adjusted after the
+		intermediate COMMIT, since writes by other transactions are
+		being blocked by a MySQL table lock TL_WRITE_ALLOW_READ. */
+
+		dict_table_t*	src_table;
+		enum lock_mode	mode;
+
+		num_write_row = 0;
+
+		/* Commit the transaction.  This will release the table
+		locks, so they have to be acquired again. */
+
+		/* Altering an InnoDB table */
+		/* Get the source table. */
+		src_table = lock_get_src_table(
+				prebuilt->trx, prebuilt->table, &mode);
+		if (!src_table) {
+no_commit:
+			/* Unknown situation: do not commit */
+			/*
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ALTER TABLE is holding lock"
+				" on %lu tables!\n",
+				prebuilt->trx->mysql_n_tables_locked);
+			*/
+			;
+		} else if (src_table == prebuilt->table) {
+			/* Source table is not in InnoDB format:
+			no need to re-acquire locks on it. */
+
+			/* Altering to InnoDB format */
+			innobase_commit(ht, user_thd, 1);
+			/* Note that this transaction is still active. */
+			prebuilt->trx->active_trans = 1;
+			/* We will need an IX lock on the destination table. */
+			prebuilt->sql_stat_start = TRUE;
+		} else {
+			/* Ensure that there are no other table locks than
+			LOCK_IX and LOCK_AUTO_INC on the destination table. */
+
+			if (!lock_is_table_exclusive(prebuilt->table,
+							prebuilt->trx)) {
+				goto no_commit;
+			}
+
+			/* Commit the transaction.  This will release the table
+			locks, so they have to be acquired again. */
+			innobase_commit(ht, user_thd, 1);
+			/* Note that this transaction is still active. */
+			prebuilt->trx->active_trans = 1;
+			/* Re-acquire the table lock on the source table. */
+			row_lock_table_for_mysql(prebuilt, src_table, mode);
+			/* We will need an IX lock on the destination table. */
+			prebuilt->sql_stat_start = TRUE;
+		}
+	}
+
+	num_write_row++;
+
+	/* This is the case where the table has an auto-increment column */
+	if (table->next_number_field && record == table->record[0]) {
+
+		/* Reset the error code before calling
+		innobase_get_auto_increment(). */
+		prebuilt->autoinc_error = DB_SUCCESS;
+
+		if ((error = update_auto_increment())) {
+			/* We don't want to mask autoinc overflow errors. */
+
+			/* Handle the case where the AUTOINC sub-system
+			failed during initialization. */
+			if (prebuilt->autoinc_error == DB_UNSUPPORTED) {
+				error_result = ER_AUTOINC_READ_FAILED;
+				/* Set the error message to report too. */
+				my_error(ER_AUTOINC_READ_FAILED, MYF(0));
+				goto func_exit;
+			} else if (prebuilt->autoinc_error != DB_SUCCESS) {
+				/* An InnoDB error code: it still has to be
+				converted at report_error. The jump skips
+				innodb_srv_conc_exit_innodb(), which is
+				balanced because the matching enter call
+				below has not been made yet. */
+				error = (int) prebuilt->autoinc_error;
+				goto report_error;
+			}
+
+			/* MySQL errors are passed straight back. */
+			error_result = (int) error;
+			goto func_exit;
+		}
+
+		auto_inc_used = TRUE;
+	}
+
+	if (prebuilt->mysql_template == NULL
+	    || prebuilt->template_type != ROW_MYSQL_WHOLE_ROW) {
+
+		/* Build the template used in converting quickly between
+		the two database formats */
+
+		build_template(prebuilt, NULL, table, this, ROW_MYSQL_WHOLE_ROW);
+	}
+
+	innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+	error = row_insert_for_mysql((byte*) record, prebuilt);
+
+#ifdef EXTENDED_FOR_USERSTAT
+	if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+	/* Handle duplicate key errors */
+	if (auto_inc_used) {
+		ulint		err;
+		ulonglong	auto_inc;
+		ulonglong	col_max_value;
+
+		/* Note the number of rows processed for this statement, used
+		by get_auto_increment() to determine the number of AUTO-INC
+		values to reserve. This is only useful for a multi-value INSERT
+		and is a statement level counter.*/
+		if (trx->n_autoinc_rows > 0) {
+			--trx->n_autoinc_rows;
+		}
+
+		/* We need the upper limit of the col type to check for
+		whether we update the table autoinc counter or not. */
+		col_max_value = innobase_get_int_col_max_value(
+			table->next_number_field);
+
+		/* Get the value that MySQL attempted to store in the table.*/
+		auto_inc = table->next_number_field->val_int();
+
+		/* Only DB_DUPLICATE_KEY and DB_SUCCESS can lead to an update
+		of the table autoinc counter; any other result of the insert
+		leaves the counter alone. */
+		switch (error) {
+		case DB_DUPLICATE_KEY:
+
+			/* A REPLACE command and LOAD DATA INFILE REPLACE
+			handle a duplicate key error themselves, but we
+			must update the autoinc counter if we are performing
+			those statements. */
+
+			switch (sql_command) {
+			case SQLCOM_LOAD:
+				if ((trx->duplicates
+				    & (TRX_DUP_IGNORE | TRX_DUP_REPLACE))) {
+
+					goto set_max_autoinc;
+				}
+				break;
+
+			case SQLCOM_REPLACE:
+			case SQLCOM_INSERT_SELECT:
+			case SQLCOM_REPLACE_SELECT:
+				goto set_max_autoinc;
+
+			default:
+				break;
+			}
+
+			break;
+
+		case DB_SUCCESS:
+			/* If the actual value inserted is greater than
+			the upper limit of the interval, then we try and
+			update the table upper limit. Note: last_value
+			will be 0 if get_auto_increment() was not called.*/
+
+			if (auto_inc >= prebuilt->autoinc_last_value) {
+set_max_autoinc:
+				/* Also reached by goto from the
+				DB_DUPLICATE_KEY case above; 'error' then
+				stays DB_DUPLICATE_KEY unless storing the new
+				counter value fails. */
+
+				/* This should filter out the negative
+				values set explicitly by the user. */
+				if (auto_inc <= col_max_value) {
+					ut_a(prebuilt->autoinc_increment > 0);
+
+					ulonglong	need;
+					ulonglong	offset;
+
+					offset = prebuilt->autoinc_offset;
+					need = prebuilt->autoinc_increment;
+
+					auto_inc = innobase_next_autoinc(
+						auto_inc,
+						need, offset, col_max_value);
+
+					err = innobase_set_max_autoinc(
+						auto_inc);
+
+					if (err != DB_SUCCESS) {
+						error = err;
+					}
+				}
+			}
+			break;
+		}
+	}
+
+	innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+report_error:
+	/* Translate the InnoDB error code into the code returned to MySQL. */
+	error_result = convert_error_code_to_mysql((int) error,
+						   prebuilt->table->flags,
+						   user_thd);
+
+func_exit:
+	/* Common exit: error_result already holds the code to return. */
+	innobase_active_small();
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	DBUG_RETURN(error_result);
+}
+
+/**********************************************************************//**
+Checks which fields have changed in a row and stores information
+of them to an update vector.
+Only fields with stored_in_db set are compared. Two indexes are kept while
+walking the fields: sql_idx runs over table->field[], while innodb_idx runs
+over prebuilt->table->cols[] and is advanced only for stored fields.
+For every changed field one entry of uvect->fields is filled in; the new
+value is converted to InnoDB format into upd_buff, or set to SQL NULL.
+@return	error number or 0; the visible code always returns 0 */
+static
+int
+calc_row_difference(
+/*================*/
+	upd_t*		uvect,		/*!< in/out: update vector */
+	uchar*		old_row,	/*!< in: old row in MySQL format */
+	uchar*		new_row,	/*!< in: new row in MySQL format */
+	struct st_table* table,		/*!< in: table in MySQL data
+					dictionary */
+	uchar*		upd_buff,	/*!< in: buffer to use */
+	ulint		buff_len,	/*!< in: buffer length */
+	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
+	THD*		thd)		/*!< in: user thread; not referenced
+					in this function */
+{
+	/* Remembered for the final buffer-overrun assertion. */
+	uchar*		original_upd_buff = upd_buff;
+	Field*		field;
+	enum_field_types field_mysql_type;
+	uint		n_fields;
+	ulint		o_len;
+	ulint		n_len;
+	ulint		col_pack_len;
+	const byte*	new_mysql_row_col;
+	const byte*	o_ptr;
+	const byte*	n_ptr;
+	byte*		buf;
+	upd_field_t*	ufield;
+	ulint		col_type;
+	ulint		n_changed = 0;
+	dfield_t	dfield;
+	dict_index_t*	clust_index;
+	uint		sql_idx, innodb_idx= 0;
+
+	n_fields = table->s->fields;
+	clust_index = dict_table_get_first_index(prebuilt->table);
+
+	/* We use upd_buff to convert changed fields */
+	buf = (byte*) upd_buff;
+
+	for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
+		field = table->field[sql_idx];
+		/* Fields that are not stored in the database are skipped;
+		innodb_idx is deliberately not advanced for them. */
+		if (!field->stored_in_db)
+		  continue;
+
+		o_ptr = (const byte*) old_row + get_field_offset(table, field);
+		n_ptr = (const byte*) new_row + get_field_offset(table, field);
+
+		/* Use new_mysql_row_col and col_pack_len save the values */
+
+		new_mysql_row_col = n_ptr;
+		col_pack_len = field->pack_length();
+
+		o_len = col_pack_len;
+		n_len = col_pack_len;
+
+		/* We use o_ptr and n_ptr to dig up the actual data for
+		comparison. */
+
+		field_mysql_type = field->type();
+
+		col_type = prebuilt->table->cols[innodb_idx].mtype;
+
+		switch (col_type) {
+
+		case DATA_BLOB:
+			o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
+			n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
+
+			break;
+
+		case DATA_VARCHAR:
+		case DATA_BINARY:
+		case DATA_VARMYSQL:
+			if (field_mysql_type == MYSQL_TYPE_VARCHAR) {
+				/* This is a >= 5.0.3 type true VARCHAR where
+				the real payload data length is stored in
+				1 or 2 bytes */
+
+				o_ptr = row_mysql_read_true_varchar(
+					&o_len, o_ptr,
+					(ulint)
+					(((Field_varstring*)field)->length_bytes));
+
+				n_ptr = row_mysql_read_true_varchar(
+					&n_len, n_ptr,
+					(ulint)
+					(((Field_varstring*)field)->length_bytes));
+			}
+
+			break;
+		default:
+			;
+		}
+
+		/* For nullable fields a NULL value is represented by the
+		length UNIV_SQL_NULL, overriding the length found above. */
+		if (field->null_ptr) {
+			if (field_in_record_is_null(table, field,
+							(char*) old_row)) {
+				o_len = UNIV_SQL_NULL;
+			}
+
+			if (field_in_record_is_null(table, field,
+							(char*) new_row)) {
+				n_len = UNIV_SQL_NULL;
+			}
+		}
+
+		/* Changed if the lengths differ (this also covers NULL versus
+		non-NULL), or if both values are non-NULL with equal length
+		but different bytes. */
+		if (o_len != n_len || (o_len != UNIV_SQL_NULL &&
+					0 != memcmp(o_ptr, n_ptr, o_len))) {
+			/* The field has changed */
+
+			ufield = uvect->fields + n_changed;
+
+			/* Let us use a dummy dfield to make the conversion
+			from the MySQL column format to the InnoDB format */
+
+			dict_col_copy_type(prebuilt->table->cols + innodb_idx,
+					   dfield_get_type(&dfield));
+
+			if (n_len != UNIV_SQL_NULL) {
+				/* The returned pointer becomes the new write
+				position in the conversion buffer. */
+				buf = row_mysql_store_col_in_innobase_format(
+					&dfield,
+					(byte*)buf,
+					TRUE,
+					new_mysql_row_col,
+					col_pack_len,
+					dict_table_is_comp(prebuilt->table));
+				dfield_copy_data(&ufield->new_val, &dfield);
+			} else {
+				dfield_set_null(&ufield->new_val);
+			}
+
+			ufield->exp = NULL;
+			ufield->orig_len = 0;
+			ufield->field_no = dict_col_get_clust_pos(
+				&prebuilt->table->cols[innodb_idx], clust_index);
+			n_changed++;
+		}
+		/* Always true here, because fields that are not stored were
+		skipped by the 'continue' at the top of the loop. */
+                if (field->stored_in_db)
+                  innodb_idx++;
+	}
+
+	uvect->n_fields = n_changed;
+	uvect->info_bits = 0;
+
+	/* The conversions must not have written past the end of upd_buff. */
+	ut_a(buf <= (byte*)original_upd_buff + buff_len);
+
+	return(0);
+}
+
+/**********************************************************************//**
+Updates a row given as a parameter to a new value. Note that we are given
+whole rows, not just the fields which are updated: this incurs some
+overhead for CPU when we check which fields are actually updated.
+TODO: currently InnoDB does not prevent the 'Halloween problem':
+in a searched update a single row can get updated several times
+if its index columns are updated!
+@return	error number or 0; HA_ERR_RECORD_IS_THE_SAME if the update
+succeeded but no column value differed; HA_ERR_CRASHED if
+share->ib_table->is_corrupt is set (checked before and after the update) */
+UNIV_INTERN
+int
+ha_innobase::update_row(
+/*====================*/
+	const uchar*	old_row,	/*!< in: old row in MySQL format */
+	uchar*		new_row)	/*!< in: new row in MySQL format */
+{
+	upd_t*		uvect;
+	int		error = 0;
+	trx_t*		trx = thd_to_trx(user_thd);
+
+	DBUG_ENTER("ha_innobase::update_row");
+
+	ut_a(prebuilt->trx == trx);
+
+	ha_statistic_increment(&SSV::ha_update_count);
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+		table->timestamp_field->set_time();
+
+	/* Reuse the update vector of the existing update node if there is
+	one, otherwise ask for the prebuilt update vector. */
+	if (prebuilt->upd_node) {
+		uvect = prebuilt->upd_node->update;
+	} else {
+		uvect = row_get_prebuilt_update_vector(prebuilt);
+	}
+
+	/* Build an update vector from the modified fields in the rows
+	(uses upd_buff of the handle) */
+
+	/* The const is cast away from old_row only because
+	calc_row_difference() takes a non-const uchar*. */
+	calc_row_difference(uvect, (uchar*) old_row, new_row, table,
+			upd_buff, (ulint)upd_and_key_val_buff_len,
+			prebuilt, user_thd);
+
+	/* This is not a delete */
+	/* NOTE(review): this dereference presumably relies on
+	row_get_prebuilt_update_vector() having created prebuilt->upd_node
+	when it was NULL above - confirm in row0mysql. */
+	prebuilt->upd_node->is_delete = FALSE;
+
+	ut_a(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+
+	innodb_srv_conc_enter_innodb(trx);
+
+	error = row_update_for_mysql((byte*) old_row, prebuilt);
+
+	/* We need to do some special AUTOINC handling for the following case:
+
+	INSERT INTO t (c1,c2) VALUES(x,y) ON DUPLICATE KEY UPDATE ...
+
+	We need to use the AUTOINC counter that was actually used by
+	MySQL in the UPDATE statement, which can be different from the
+	value used in the INSERT statement.*/
+
+	if (error == DB_SUCCESS
+	    && table->next_number_field
+	    && new_row == table->record[0]
+	    && thd_sql_command(user_thd) == SQLCOM_INSERT
+	    && (trx->duplicates & (TRX_DUP_IGNORE | TRX_DUP_REPLACE))
+		== TRX_DUP_IGNORE)  {
+
+		ulonglong	auto_inc;
+		ulonglong	col_max_value;
+
+		auto_inc = table->next_number_field->val_int();
+
+		/* We need the upper limit of the col type to check for
+		whether we update the table autoinc counter or not. */
+		col_max_value = innobase_get_int_col_max_value(
+			table->next_number_field);
+
+		if (auto_inc <= col_max_value && auto_inc != 0) {
+
+			ulonglong	need;
+			ulonglong	offset;
+
+			offset = prebuilt->autoinc_offset;
+			need = prebuilt->autoinc_increment;
+
+			auto_inc = innobase_next_autoinc(
+				auto_inc, need, offset, col_max_value);
+
+			/* A failure to store the new counter value replaces
+			the (successful) result of the update itself. */
+			error = innobase_set_max_autoinc(auto_inc);
+		}
+	}
+
+#ifdef EXTENDED_FOR_USERSTAT
+	if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+	innodb_srv_conc_exit_innodb(trx);
+
+	/* From here on 'error' holds the converted code returned to MySQL,
+	no longer an InnoDB DB_* code. */
+	error = convert_error_code_to_mysql(error,
+					    prebuilt->table->flags, user_thd);
+
+	if (error == 0 /* success */
+	    && uvect->n_fields == 0 /* no columns were updated */) {
+
+		/* This is the same as success, but instructs
+		MySQL that the row is not really updated and it
+		should not increase the count of updated rows.
+		This is fix for http://bugs.mysql.com/29157 */
+		error = HA_ERR_RECORD_IS_THE_SAME;
+	}
+
+	/* Tell InnoDB server that there might be work for
+	utility threads: */
+
+	innobase_active_small();
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Deletes a row given as the parameter. The delete is carried out by
+row_update_for_mysql() with the update node of the prebuilt struct
+flagged as a delete.
+@return	error number or 0; HA_ERR_CRASHED if share->ib_table->is_corrupt
+is set (checked before and after the operation) */
+UNIV_INTERN
+int
+ha_innobase::delete_row(
+/*====================*/
+	const uchar*	record)	/*!< in: a row in MySQL format */
+{
+	int		error = 0;
+	trx_t*		trx = thd_to_trx(user_thd);
+
+	DBUG_ENTER("ha_innobase::delete_row");
+
+	ut_a(prebuilt->trx == trx);
+
+	ha_statistic_increment(&SSV::ha_delete_count);
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	/* Make sure an update node exists before it is flagged below;
+	the returned update vector itself is not needed for a delete. */
+	if (prebuilt->upd_node == NULL) {
+		row_get_prebuilt_update_vector(prebuilt);
+	}
+
+	/* Mark the operation as a delete rather than an update. */
+	prebuilt->upd_node->is_delete = TRUE;
+
+	innodb_srv_conc_enter_innodb(trx);
+	error = row_update_for_mysql((byte*) record, prebuilt);
+
+#ifdef EXTENDED_FOR_USERSTAT
+	if (error == DB_SUCCESS) rows_changed++;
+#endif
+
+	innodb_srv_conc_exit_innodb(trx);
+
+	/* Translate the InnoDB result into the code returned to MySQL. */
+	error = convert_error_code_to_mysql(
+		error, prebuilt->table->flags, user_thd);
+
+	/* Let the InnoDB server know that the utility threads might
+	have work to do. */
+	innobase_active_small();
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	DBUG_RETURN(error);
+}
+
+/**********************************************************************//**
+Removes a new lock set on a row, if it was not read optimistically. This can
+be called after a row has been read in the processing of an UPDATE or a DELETE
+query, if the option innodb_locks_unsafe_for_binlog is set.
+What happens depends on prebuilt->row_read_type:
+- ROW_READ_WITH_LOCKS: the row is unlocked only if
+  srv_locks_unsafe_for_binlog is set or the isolation level of the
+  transaction is at most TRX_ISO_READ_COMMITTED;
+- ROW_READ_TRY_SEMI_CONSISTENT: the row is always unlocked;
+- ROW_READ_DID_SEMI_CONSISTENT: nothing is unlocked, the read type is
+  only switched back to ROW_READ_TRY_SEMI_CONSISTENT. */
+UNIV_INTERN
+void
+ha_innobase::unlock_row(void)
+/*=========================*/
+{
+	DBUG_ENTER("ha_innobase::unlock_row");
+
+	if (prebuilt->select_lock_type == LOCK_NONE) {
+		/* A consistent read takes no locks, so there is
+		nothing to release. */
+		DBUG_VOID_RETURN;
+	}
+
+	switch (prebuilt->row_read_type) {
+	case ROW_READ_DID_SEMI_CONSISTENT:
+		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+		break;
+	case ROW_READ_WITH_LOCKS:
+		if (srv_locks_unsafe_for_binlog
+		    || prebuilt->trx->isolation_level
+		    <= TRX_ISO_READ_COMMITTED) {
+			row_unlock_for_mysql(prebuilt, FALSE);
+		}
+		break;
+	case ROW_READ_TRY_SEMI_CONSISTENT:
+		row_unlock_for_mysql(prebuilt, FALSE);
+		break;
+	}
+
+	DBUG_VOID_RETURN;
+}
+
+/* See handler.h and row0mysql.h for docs on this function.
+Returns true exactly when prebuilt->row_read_type is
+ROW_READ_DID_SEMI_CONSISTENT. */
+UNIV_INTERN
+bool
+ha_innobase::was_semi_consistent_read(void)
+/*=======================================*/
+{
+	return(prebuilt->row_read_type == ROW_READ_DID_SEMI_CONSISTENT);
+}
+
+/* See handler.h and row0mysql.h for docs on this function.
+Selects the row read type of the prebuilt struct: semi consistent read
+is tried only if MySQL asked for it ('yes') and either the
+innodb_locks_unsafe_for_binlog option is used or the transaction runs at
+READ COMMITTED isolation level or below; in every other case rows are
+read with locks. */
+UNIV_INTERN
+void
+ha_innobase::try_semi_consistent_read(bool yes)
+/*===========================================*/
+{
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	const bool	use_semi_consistent = yes
+		&& (srv_locks_unsafe_for_binlog
+		    || prebuilt->trx->isolation_level
+		    <= TRX_ISO_READ_COMMITTED);
+
+	prebuilt->row_read_type = use_semi_consistent
+		? ROW_READ_TRY_SEMI_CONSISTENT
+		: ROW_READ_WITH_LOCKS;
+}
+
+/******************************************************************//**
+Initializes a handle to use an index. All the work is delegated to
+change_active_index(); the 'sorted' argument is not used by this
+implementation.
+@return	0 or error number, as returned by change_active_index() */
+UNIV_INTERN
+int
+ha_innobase::index_init(
+/*====================*/
+	uint	keynr,	/*!< in: key (index) number */
+	bool sorted)	/*!< in: 1 if result MUST be sorted according to index;
+			ignored here */
+{
+	DBUG_ENTER("index_init");
+
+	DBUG_RETURN(change_active_index(keynr));
+}
+
+/******************************************************************//**
+Ends the use of an index on this handle: resets active_index to MAX_KEY,
+clears the in_range_check_pushed_down flag and closes the ds_mrr object
+via dsmrr_close().
+@return	0 */
+UNIV_INTERN
+int
+ha_innobase::index_end(void)
+/*========================*/
+{
+	DBUG_ENTER("index_end");
+
+	active_index = MAX_KEY;
+	in_range_check_pushed_down = FALSE;
+	ds_mrr.dsmrr_close();
+
+	DBUG_RETURN(0);
+}
+
+/*********************************************************************//**
+Converts a search mode flag understood by MySQL to a flag understood
+by InnoDB.
+@return	PAGE_CUR_GE, PAGE_CUR_LE, PAGE_CUR_G or PAGE_CUR_L for the
+supported modes; PAGE_CUR_UNSUPP for the HA_READ_MBR_* modes and, after
+raising ER_CHECK_NOT_IMPLEMENTED, for any value not handled by the switch */
+static inline
+ulint
+convert_search_mode_to_innobase(
+/*============================*/
+	enum ha_rkey_function	find_flag)
+{
+	/* In MySQL-4.0 HA_READ_PREFIX and HA_READ_PREFIX_LAST always
+	pass a complete-field prefix of a key value as the search
+	tuple. I.e., it is not allowed that the last field would
+	just contain n first bytes of the full field value.
+	MySQL uses a 'padding' trick to convert LIKE 'abc%'
+	type queries so that it can use as a search tuple
+	a complete-field-prefix of a key value. Thus, the InnoDB
+	search mode PAGE_CUR_LE_OR_EXTENDS is never used.
+	TODO: when/if MySQL starts to use also partial-field
+	prefixes, we have to deal with stripping of spaces
+	and comparison of non-latin1 char type fields in
+	innobase_mysql_cmp() to get PAGE_CUR_LE_OR_EXTENDS to
+	work correctly. */
+
+	switch (find_flag) {
+	case HA_READ_KEY_EXACT:
+		/* an exact match does not require the index to be UNIQUE */
+	case HA_READ_KEY_OR_NEXT:
+	case HA_READ_PREFIX:
+		return(PAGE_CUR_GE);
+
+	case HA_READ_KEY_OR_PREV:
+	case HA_READ_PREFIX_LAST:
+	case HA_READ_PREFIX_LAST_OR_PREV:
+		return(PAGE_CUR_LE);
+
+	case HA_READ_AFTER_KEY:
+		return(PAGE_CUR_G);
+
+	case HA_READ_BEFORE_KEY:
+		return(PAGE_CUR_L);
+
+	case HA_READ_MBR_CONTAIN:
+	case HA_READ_MBR_INTERSECT:
+	case HA_READ_MBR_WITHIN:
+	case HA_READ_MBR_DISJOINT:
+	case HA_READ_MBR_EQUAL:
+		return(PAGE_CUR_UNSUPP);
+
+	/* There is intentionally no "default:" label, so that gcc warns
+	"enumeration value '...' not handled in switch" (with -Wswitch
+	or -Wall) whenever a new search mode is added. */
+	}
+
+	/* Only reached for a value that none of the cases above covers. */
+	my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "this functionality");
+
+	return(PAGE_CUR_UNSUPP);
+}
+
+/*
+   BACKGROUND INFO: HOW A SELECT SQL QUERY IS EXECUTED
+   ---------------------------------------------------
+The following does not cover all the details, but explains how we determine
+the start of a new SQL statement, and what is associated with it.
+
+For each table in the database the MySQL interpreter may have several
+table handle instances in use, also in a single SQL query. For each table
+handle instance there is an InnoDB  'prebuilt' struct which contains most
+of the InnoDB data associated with this table handle instance.
+
+  A) if the user has not explicitly set any MySQL table level locks:
+
+  1) MySQL calls ::external_lock to set an 'intention' table level lock on
+the table of the handle instance. There we set
+prebuilt->sql_stat_start = TRUE. The flag sql_stat_start should be set
+true if we are taking this table handle instance to use in a new SQL
+statement issued by the user. We also increment trx->n_mysql_tables_in_use.
+
+  2) If prebuilt->sql_stat_start == TRUE we 'pre-compile' the MySQL search
+instructions to prebuilt->template of the table handle instance in
+::index_read. The template is used to save CPU time in large joins.
+
+  3) In row_search_for_mysql, if prebuilt->sql_stat_start is true, we
+allocate a new consistent read view for the trx if it does not yet have one,
+or in the case of a locking read, set an InnoDB 'intention' table level
+lock on the table.
+
+  4) We do the SELECT. MySQL may repeatedly call ::index_read for the
+same table handle instance, if it is a join.
+
+  5) When the SELECT ends, MySQL removes its intention table level locks
+in ::external_lock. When trx->n_mysql_tables_in_use drops to zero,
+ (a) we execute a COMMIT there if the autocommit is on,
+ (b) we also release possible 'SQL statement level resources' InnoDB may
+have for this SQL statement. The MySQL interpreter does NOT execute
+autocommit for pure read transactions, though it should. That is why the
+table handler in that case has to execute the COMMIT in ::external_lock.
+
+  B) If the user has explicitly set MySQL table level locks, then MySQL
+does NOT call ::external_lock at the start of the statement. To determine
+when we are at the start of a new SQL statement we at the start of
+::index_read also compare the query id to the latest query id where the
+table handle instance was used. If it has changed, we know we are at the
+start of a new SQL statement. Since the query id can theoretically
+wrap around, we use this test only as a secondary way of determining the
+start of a new SQL statement. */
+
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in the handle. Fetches the
+row if any.
+Also sets table->status (0 on success, STATUS_NOT_FOUND otherwise) and
+remembers the match mode in last_match_mode.
+@return	0, HA_ERR_KEY_NOT_FOUND, or error number; in particular
+HA_ERR_CRASHED if the table is flagged corrupt or prebuilt->index is NULL,
+and HA_ERR_TABLE_DEF_CHANGED if prebuilt->index_usable is not set */
+UNIV_INTERN
+int
+ha_innobase::index_read(
+/*====================*/
+	uchar*		buf,		/*!< in/out: buffer for the returned
+					row */
+	const uchar*	key_ptr,	/*!< in: key value; if this is NULL
+					we position the cursor at the
+					start or end of index; this can
+					also contain an InnoDB row id, in
+					which case key_len is the InnoDB
+					row id length; the key value can
+					also be a prefix of a full key value,
+					and the last column can be a prefix
+					of a full column */
+	uint			key_len,/*!< in: key value length */
+	enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+	ulint		mode;
+	dict_index_t*	index;
+	ulint		match_mode	= 0;
+	int		error;
+	ulint		ret;
+
+	DBUG_ENTER("index_read");
+
+	ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+	ha_statistic_increment(&SSV::ha_read_key_count);
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	index = prebuilt->index;
+
+	if (UNIV_UNLIKELY(index == NULL)) {
+		prebuilt->index_usable = FALSE;
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+		DBUG_RETURN(HA_ERR_TABLE_DEF_CHANGED);
+	}
+
+	/* Note that the index for which the search template is built is not
+	necessarily prebuilt->index, but can also be the clustered index */
+
+	if (prebuilt->sql_stat_start) {
+		build_template(prebuilt, user_thd, table, this,
+                               ROW_MYSQL_REC_FIELDS);
+	}
+
+	if (key_ptr) {
+		/* Convert the search key value to InnoDB format into
+		prebuilt->search_tuple */
+
+		row_sel_convert_mysql_key_to_innobase(
+			prebuilt->search_tuple,
+			(byte*) key_val_buff,
+			(ulint)upd_and_key_val_buff_len,
+			index,
+			(byte*) key_ptr,
+			(ulint) key_len,
+			prebuilt->trx);
+	} else {
+		/* We position the cursor to the last or the first entry
+		in the index */
+
+		dtuple_set_n_fields(prebuilt->search_tuple, 0);
+	}
+
+	mode = convert_search_mode_to_innobase(find_flag);
+
+	/* Redundant with the initializer at the declaration; every search
+	flag not listed below keeps match mode 0. */
+	match_mode = 0;
+
+	if (find_flag == HA_READ_KEY_EXACT) {
+
+		match_mode = ROW_SEL_EXACT;
+
+	} else if (find_flag == HA_READ_PREFIX
+		   || find_flag == HA_READ_PREFIX_LAST) {
+
+		match_mode = ROW_SEL_EXACT_PREFIX;
+	}
+
+	last_match_mode = (uint) match_mode;
+
+	if (mode != PAGE_CUR_UNSUPP) {
+
+		innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+		ret = row_search_for_mysql((byte*) buf, mode, prebuilt,
+					   match_mode, 0);
+
+		innodb_srv_conc_exit_innodb(prebuilt->trx);
+	} else {
+
+		/* Unsupported search mode: no search is attempted. */
+		ret = DB_UNSUPPORTED;
+	}
+
+	/* The corruption flag is checked again after the search. */
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	switch (ret) {
+	case DB_SUCCESS:
+		error = 0;
+		table->status = 0;
+		break;
+	case DB_RECORD_NOT_FOUND:
+		error = HA_ERR_KEY_NOT_FOUND;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	case DB_END_OF_INDEX:
+		/* Reported to MySQL in the same way as a missing record. */
+		error = HA_ERR_KEY_NOT_FOUND;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	default:
+		error = convert_error_code_to_mysql((int) ret,
+						    prebuilt->table->flags,
+						    user_thd);
+		table->status = STATUS_NOT_FOUND;
+		break;
+	}
+
+	DBUG_RETURN(error);
+}
+
+/*******************************************************************//**
+The following function works like index_read, but it finds the last
+row with the current key value or prefix. It simply calls index_read()
+with the search flag HA_READ_PREFIX_LAST.
+@return	0, HA_ERR_KEY_NOT_FOUND, or an error code */
+UNIV_INTERN
+int
+ha_innobase::index_read_last(
+/*=========================*/
+	uchar*		buf,	/*!< out: fetched row */
+	const uchar*	key_ptr,/*!< in: key value, or a prefix of a full
+				key value */
+	uint		key_len)/*!< in: length of the key val or prefix
+				in bytes */
+{
+	return(index_read(buf, key_ptr, key_len, HA_READ_PREFIX_LAST));
+}
+
+/********************************************************************//**
+Get the index for key number keynr of a handle. Does not change active index.
+@return	NULL (after logging an error) or index instance. */
+UNIV_INTERN
+dict_index_t*
+ha_innobase::innobase_get_index(
+/*============================*/
+	uint		keynr)	/*!< in: use this index; MAX_KEY means always
+				clustered index, even if it was internally
+				generated by InnoDB */
+{
+	KEY*		key = 0;
+	dict_index_t*	index = 0;
+
+	DBUG_ENTER("innobase_get_index");
+	ha_statistic_increment(&SSV::ha_read_key_count);
+
+	if (keynr != MAX_KEY && table->s->keys > 0) {
+		key = table->key_info + keynr; /* MySQL-side definition of the key */
+
+		index = innobase_index_lookup(share, keynr);
+
+		if (index) {
+			ut_a(ut_strcmp(index->name, key->name) == 0); /* both sides must agree on the name */
+		} else {
+			/* Can't find index with keynr in the translation
+			table. Only print message if the index translation
+			table exists */
+			if (share->idx_trans_tbl.index_mapping) {
+				sql_print_error("InnoDB could not find "
+						"index %s key no %u for "
+						"table %s through its "
+						"index translation table",
+						key ? key->name : "NULL",
+						keynr,
+						prebuilt->table->name);
+			}
+
+			index = dict_table_get_index_on_name(prebuilt->table,
+							     key->name); /* fall back to a lookup by name */
+		}
+	} else {
+		index = dict_table_get_first_index(prebuilt->table); /* MAX_KEY, or the table has no MySQL keys */
+	}
+
+	if (!index) {
+		sql_print_error(
+			"Innodb could not find key n:o %u with name %s "
+			"from dict cache for table %s",
+			keynr, key ? key->name : "NULL", /* key stays NULL when the else branch above ran */
+			prebuilt->table->name);
+	}
+
+	DBUG_RETURN(index);
+}
+
+/********************************************************************//**
+Changes the active index of a handle and rebuilds the search tuple and row template for it.
+@return	0 or error code (HA_ERR_CRASHED, 1, or the converted DB_MISSING_HISTORY) */
+UNIV_INTERN
+int
+ha_innobase::change_active_index(
+/*=============================*/
+	uint	keynr)	/*!< in: use this index; MAX_KEY means always clustered
+			index, even if it was internally generated by
+			InnoDB */
+{
+	DBUG_ENTER("change_active_index");
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	ut_ad(user_thd == ha_thd());
+	ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+	active_index = keynr; /* recorded before the lookup, so it is set even when the lookup fails */
+
+	prebuilt->index = innobase_get_index(keynr);
+
+	if (UNIV_UNLIKELY(!prebuilt->index)) {
+		sql_print_warning("InnoDB: change_active_index(%u) failed",
+				  keynr);
+		prebuilt->index_usable = FALSE;
+		DBUG_RETURN(1); /* plain nonzero value, not a HA_ERR_* constant */
+	}
+
+	prebuilt->index_usable = row_merge_is_index_usable(prebuilt->trx,
+							   prebuilt->index);
+
+	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+		push_warning_printf(user_thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				    HA_ERR_TABLE_DEF_CHANGED,
+				    "InnoDB: insufficient history for index %u",
+				    keynr);
+		/* The caller seems to ignore this.  Thus, we must check
+		this again in row_search_for_mysql(). */
+		DBUG_RETURN(convert_error_code_to_mysql(DB_MISSING_HISTORY,
+                                                        0, NULL));
+	}
+
+	ut_a(prebuilt->search_tuple != 0);
+
+	dtuple_set_n_fields(prebuilt->search_tuple, prebuilt->index->n_fields); /* one tuple field per index field */
+
+	dict_index_copy_types(prebuilt->search_tuple, prebuilt->index,
+			      prebuilt->index->n_fields);
+
+	/* MySQL changes the active index for a handle also during some
+	queries, for example SELECT MAX(a), SUM(a) first retrieves the MAX()
+	and then calculates the sum. Previously we played safe and used
+	the flag ROW_MYSQL_WHOLE_ROW below, but that caused unnecessary
+	copying. Starting from MySQL-4.1 we use a more efficient flag here. */
+
+	build_template(prebuilt, user_thd, table, this, ROW_MYSQL_REC_FIELDS);
+
+	DBUG_RETURN(0);
+}
+
+/**********************************************************************//**
+Positions an index cursor to the index specified in keynr. Fetches the
+row if any. A failure to switch to keynr is returned unchanged.
+??? This is only used to read whole keys ???
+@return	0, or the error number of change_active_index() or index_read() */
+UNIV_INTERN
+int
+ha_innobase::index_read_idx(
+/*========================*/
+	uchar*		buf,		/*!< in/out: buffer for the returned
+					row */
+	uint		keynr,		/*!< in: use this index */
+	const uchar*	key,		/*!< in: key value; if this is NULL
+					we position the cursor at the
+					start or end of index */
+	uint		key_len,	/*!< in: key value length */
+	enum ha_rkey_function find_flag)/*!< in: search flags from my_base.h */
+{
+	int	error = change_active_index(keynr);
+
+	if (error) {
+		return(error);	/* keep e.g. HA_ERR_CRASHED visible instead of a bare 1 */
+	}
+	return(index_read(buf, key, key_len, find_flag));
+}
+
+/***********************************************************************//**
+Reads the next or previous row from a cursor, which must have previously been
+positioned using index_read. Also sets table->status (0 or STATUS_NOT_FOUND).
+@return	0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::general_fetch(
+/*=======================*/
+	uchar*	buf,		/*!< in/out: buffer for next row in MySQL
+				format */
+	uint	direction,	/*!< in: ROW_SEL_NEXT or ROW_SEL_PREV */
+	uint	match_mode)	/*!< in: 0, ROW_SEL_EXACT, or
+				ROW_SEL_EXACT_PREFIX */
+{
+	ulint		ret;
+	int		error	= 0;
+
+	DBUG_ENTER("general_fetch");
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	ut_a(prebuilt->trx == thd_to_trx(user_thd));
+
+	innodb_srv_conc_enter_innodb(prebuilt->trx);
+
+	ret = row_search_for_mysql(
+		(byte*)buf, 0, prebuilt, match_mode, direction); /* search mode 0 plus a direction, unlike index_read() */
+
+	innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+	if (share->ib_table->is_corrupt) { /* checked again after the search; takes precedence over ret */
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	switch (ret) {
+	case DB_SUCCESS:
+		error = 0;
+		table->status = 0;
+#ifdef EXTENDED_FOR_USERSTAT
+		rows_read++;
+		if (active_index >= 0 && active_index < MAX_KEY) /* NOTE(review): ">= 0" is always true if active_index is unsigned -- confirm */
+			index_rows_read[active_index]++;
+#endif
+		break;
+	case DB_RECORD_NOT_FOUND: /* reported as end of file here, not as key-not-found */
+		error = HA_ERR_END_OF_FILE;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	case DB_END_OF_INDEX:
+		error = HA_ERR_END_OF_FILE;
+		table->status = STATUS_NOT_FOUND;
+		break;
+	default:
+		error = convert_error_code_to_mysql(
+			(int) ret, prebuilt->table->flags, user_thd);
+		table->status = STATUS_NOT_FOUND;
+		break;
+	}
+
+	DBUG_RETURN(error);
+}
+
+/***********************************************************************//**
+Reads the next row from a cursor, which must have previously been
+positioned using index_read. No key matching is applied (match mode 0).
+@return	0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next(
+/*====================*/
+	uchar*		buf)	/*!< in/out: buffer for next row in MySQL
+				format */
+{
+	ha_statistic_increment(&SSV::ha_read_next_count);
+
+	return(general_fetch(buf, ROW_SEL_NEXT, 0));
+}
+
+/*******************************************************************//**
+Reads the next row matching the key of the preceding index_read(); the match
+mode saved there in last_match_mode is reused, key and keylen are not read.
+@return	0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_next_same(
+/*=========================*/
+	uchar*		buf,	/*!< in/out: buffer for the row */
+	const uchar*	key,	/*!< in: key value (unused here) */
+	uint		keylen)	/*!< in: key value length (unused here) */
+{
+	ha_statistic_increment(&SSV::ha_read_next_count);
+
+	return(general_fetch(buf, ROW_SEL_NEXT, last_match_mode));
+}
+
+/***********************************************************************//**
+Reads the previous row from a cursor, which must have previously been
+positioned using index_read. No key matching is applied (match mode 0).
+@return	0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::index_prev(
+/*====================*/
+	uchar*	buf)	/*!< in/out: buffer for previous row in MySQL format */
+{
+	ha_statistic_increment(&SSV::ha_read_prev_count);
+
+	return(general_fetch(buf, ROW_SEL_PREV, 0));
+}
+
+/********************************************************************//**
+Moves the cursor to the first record in an index and copies that row
+into buf.
+@return	0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_first(
+/*=====================*/
+	uchar*	buf)	/*!< in/out: buffer for the row */
+{
+	int	rc;
+
+	DBUG_ENTER("index_first");
+	ha_statistic_increment(&SSV::ha_read_first_count);
+
+	/* No key value: index_read() positions at one end of the index */
+	rc = index_read(buf, NULL, 0, HA_READ_AFTER_KEY);
+
+	/* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND,
+	so that code is reported as end of file instead */
+
+	DBUG_RETURN(rc == HA_ERR_KEY_NOT_FOUND
+		    ? HA_ERR_END_OF_FILE
+		    : rc);
+}
+
+/********************************************************************//**
+Moves the cursor to the last record in an index and copies that row
+into buf.
+@return	0, HA_ERR_END_OF_FILE, or error code */
+UNIV_INTERN
+int
+ha_innobase::index_last(
+/*====================*/
+	uchar*	buf)	/*!< in/out: buffer for the row */
+{
+	int	rc;
+
+	DBUG_ENTER("index_last");
+	ha_statistic_increment(&SSV::ha_read_last_count);
+
+	/* No key value: index_read() positions at one end of the index */
+	rc = index_read(buf, NULL, 0, HA_READ_BEFORE_KEY);
+
+	/* MySQL does not seem to allow this to return HA_ERR_KEY_NOT_FOUND,
+	so that code is reported as end of file instead */
+
+	DBUG_RETURN(rc == HA_ERR_KEY_NOT_FOUND
+		    ? HA_ERR_END_OF_FILE
+		    : rc);
+}
+
+/****************************************************************//**
+Prepares the handle for a table scan or for row reads by position.
+@return	0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_init(
+/*==================*/
+	bool	scan)	/*!< in: TRUE if table/index scan FALSE otherwise */
+{
+	/* Switch to MAX_KEY when InnoDB generated the clustered index
+	itself, otherwise to the primary key known to MySQL */
+
+	const uint	scan_keynr = prebuilt->clust_index_was_generated
+		? MAX_KEY
+		: primary_key;
+
+	const int	rc = change_active_index(scan_keynr);
+
+	/* Don't use semi-consistent read in random row reads (by position).
+	This means we must disable semi_consistent_read if scan is false */
+
+	if (!scan) {
+		try_semi_consistent_read(0);
+	}
+
+	/* Make the next rnd_next() call fetch the first record */
+
+	start_of_scan = 1;
+
+	return(rc);
+}
+
+/*****************************************************************//**
+Ends a table scan; simply delegates to index_end().
+@return	0 or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_end(void)
+/*======================*/
+{
+	return(index_end());
+}
+
+/*****************************************************************//**
+Fetches the next row of a table scan; while start_of_scan is set (as
+rnd_init() leaves it) the first row is fetched instead.
+@return	0, HA_ERR_END_OF_FILE, or error number */
+UNIV_INTERN
+int
+ha_innobase::rnd_next(
+/*==================*/
+	uchar*	buf)	/*!< in/out: returns the row in this buffer,
+			in MySQL format */
+{
+	int	rc;
+
+	DBUG_ENTER("rnd_next");
+	ha_statistic_increment(&SSV::ha_read_rnd_next_count);
+
+	if (!start_of_scan) {
+		/* Scan is under way: step forward without key matching */
+		rc = general_fetch(buf, ROW_SEL_NEXT, 0);
+	} else {
+		rc = index_first(buf);
+		if (rc == HA_ERR_KEY_NOT_FOUND) {
+			rc = HA_ERR_END_OF_FILE;
+		}
+
+		start_of_scan = 0;
+	}
+
+	DBUG_RETURN(rc);
+}
+
+/**********************************************************************//**
+Fetches a row from the table based on a row reference. Temporarily switches to the clustered index and restores the previous one.
+@return	0, HA_ERR_KEY_NOT_FOUND, or error code */
+UNIV_INTERN
+int
+ha_innobase::rnd_pos(
+/*=================*/
+	uchar*	buf,	/*!< in/out: buffer for the row */
+	uchar*	pos)	/*!< in: primary key value of the row in the
+			MySQL format, or the row id if the clustered
+			index was internally generated by InnoDB; the
+			length of data in pos has to be ref_length */
+{
+	int		error;
+	uint		keynr	= active_index; /* remembered so that it can be restored at the end */
+	DBUG_ENTER("rnd_pos");
+	DBUG_DUMP("key", pos, ref_length);
+
+	ha_statistic_increment(&SSV::ha_read_rnd_count);
+
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	if (prebuilt->clust_index_was_generated) {
+		/* No primary key was defined for the table and we
+		generated the clustered index from the row id: the
+		row reference is the row id, not any key value
+		that MySQL knows of */
+
+		error = change_active_index(MAX_KEY);
+	} else {
+		error = change_active_index(primary_key);
+	}
+
+	if (error) {
+		DBUG_PRINT("error", ("Got error: %d", error));
+		DBUG_RETURN(error); /* NOTE: this exit does not switch back to keynr */
+	}
+
+	/* Note that we assume the length of the row reference is fixed
+	for the table, and it is == ref_length */
+
+	error = index_read(buf, pos, ref_length, HA_READ_KEY_EXACT);
+
+	if (error) {
+		DBUG_PRINT("error", ("Got error: %d", error));
+	}
+
+	change_active_index(keynr); /* restore the previous index; its return value is ignored */
+
+	DBUG_RETURN(error);
+}
+
+/*********************************************************************//**
+Saves a reference to the current row in the 'ref' field of the handle.
+With a primary key known to MySQL, the reference is the key value
+built from 'record'. When InnoDB generated the clustered index, the
+reference is the row id, which 'record' does not contain:
+prebuilt->row_id is copied instead, so we MUST ASSUME that 'record' is
+the row where the handle was positioned the last time. A stored length
+other than ref_length is only reported with sql_print_error(). */
+UNIV_INTERN
+void
+ha_innobase::position(
+/*==================*/
+	const uchar*	record)	/*!< in: row in MySQL format */
+{
+	uint		stored_len;
+
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	if (!prebuilt->clust_index_was_generated) {
+		/* The reference is the primary key value of 'record' */
+		stored_len = store_key_val_for_row(
+			primary_key, (char*) ref, ref_length, record);
+	} else {
+		/* No primary key was defined for the table and we
+		generated the clustered index from row id: the
+		row reference will be the row id, not any key value
+		that MySQL knows of */
+
+		stored_len = DATA_ROW_ID_LEN;
+
+		memcpy(ref, prebuilt->row_id, stored_len);
+	}
+
+	/* The 'ref' value length is expected to be fixed for a table */
+
+	if (stored_len != ref_length) {
+	  sql_print_error("Stored ref len is %lu, but table ref len is %lu",
+			  (ulong) stored_len, (ulong) ref_length);
+	}
+}
+
+/* Limit InnoDB monitor access to users with the PROCESS privilege: true when table_name is a magic monitor table and check_global_access() is nonzero.
+See http://bugs.mysql.com/32710 for an explanation of why PROCESS was chosen. */
+#define IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name, thd) \
+	(row_is_magic_monitor_table(table_name) \
+	 && check_global_access(thd, PROCESS_ACL))
+
+/*****************************************************************//**
+Creates a table definition to an InnoDB database.
+@return	converted InnoDB status, HA_ERR_GENERIC or ER_CANT_CREATE_TABLE */
+static
+int
+create_table_def(
+/*=============*/
+	trx_t*		trx,		/*!< in: InnoDB transaction handle */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	const char*	table_name,	/*!< in: table name */
+	const char*	path_of_temp_table,/*!< in: if this is a table explicitly
+					created by the user with the
+					TEMPORARY keyword, then this
+					parameter is the dir path where the
+					table should be placed if we create
+					an .ibd file for it (no .ibd extension
+					in the path, though); otherwise this
+					is NULL */
+	ulint		flags)		/*!< in: table flags */
+{
+	Field*		field;
+	dict_table_t*	table;
+	ulint		n_cols;
+	int		error;
+	ulint		col_type;
+	ulint		col_len;
+	ulint		nulls_allowed;
+	ulint		unsigned_type;
+	ulint		binary_type;
+	ulint		long_true_varchar;
+	ulint		charset_no;
+	ulint		i;
+
+	DBUG_ENTER("create_table_def");
+	DBUG_PRINT("enter", ("table_name: %s", table_name));
+
+	ut_a(trx->mysql_thd != NULL);
+	if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(table_name,
+						  (THD*) trx->mysql_thd)) {
+		DBUG_RETURN(HA_ERR_GENERIC);	/* nothing allocated yet */
+	}
+
+	n_cols = form->s->fields;
+
+	/* We pass 0 as the space id, and determine at a lower level the space
+	id where to store the table */
+
+	table = dict_mem_table_create(table_name, 0, form->s->stored_fields, flags);
+
+	if (path_of_temp_table) {
+		table->dir_path_of_temp_table =
+			mem_heap_strdup(table->heap, path_of_temp_table);
+	}
+
+	for (i = 0; i < n_cols; i++) {
+		field = form->field[i];
+		if (!field->stored_in_db)
+		  continue;	/* only fields flagged stored_in_db get an InnoDB column */
+
+		col_type = get_innobase_type_from_mysql_type(&unsigned_type,
+							     field);
+
+		if (!col_type) {
+			push_warning_printf(
+				(THD*) trx->mysql_thd,
+				MYSQL_ERROR::WARN_LEVEL_WARN,
+				ER_CANT_CREATE_TABLE,
+				"Error creating table '%s' with "
+				"column '%s'. Please check its "
+				"column type and try to re-create "
+				"the table with an appropriate "
+				"column type.",
+				table->name, (char*) field->field_name);
+			goto err_col;
+		}
+
+		if (field->null_ptr) {
+			nulls_allowed = 0;
+		} else {
+			nulls_allowed = DATA_NOT_NULL;
+		}
+
+		if (field->binary()) {
+			binary_type = DATA_BINARY_TYPE;
+		} else {
+			binary_type = 0;
+		}
+
+		charset_no = 0;
+
+		if (dtype_is_string_type(col_type)) {
+			charset_no = (ulint)field->charset()->number;
+
+			if (UNIV_UNLIKELY(charset_no >= 256)) {
+				/* data0type.h assumes the number fits in one byte in prtype */
+				push_warning_printf(
+					(THD*) trx->mysql_thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_CANT_CREATE_TABLE,
+					"In InnoDB, charset-collation codes"
+					" must be below 256."
+					" Unsupported code %lu.",
+					(ulong) charset_no);
+				dict_mem_table_free(table); /* as at err_col: this exit used to leak the table object */
+				DBUG_RETURN(ER_CANT_CREATE_TABLE);
+			}
+		}
+
+		ut_a(field->type() < 256); /* we assume in dtype_form_prtype()
+					   that this fits in one byte */
+		col_len = field->pack_length();
+
+		/* The MySQL pack length contains 1 or 2 bytes length field
+		for a true VARCHAR. Let us subtract that, so that the InnoDB
+		column length in the InnoDB data dictionary is the real
+		maximum byte length of the actual data. */
+
+		long_true_varchar = 0;
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			col_len -= ((Field_varstring*)field)->length_bytes;
+
+			if (((Field_varstring*)field)->length_bytes == 2) {
+				long_true_varchar = DATA_LONG_TRUE_VARCHAR;
+			}
+		}
+
+		/* First check whether the column to be added has a
+		system reserved name. */
+		if (dict_col_name_is_reserved(field->field_name)){
+			my_error(ER_WRONG_COLUMN_NAME, MYF(0),
+				 field->field_name);
+err_col:
+			dict_mem_table_free(table);
+			trx_commit_for_mysql(trx);
+
+			error = DB_ERROR;
+			goto error_ret;
+		}
+
+		dict_mem_table_add_col(table, table->heap,
+			(char*) field->field_name,
+			col_type,
+			dtype_form_prtype(
+				(ulint)field->type()
+				| nulls_allowed | unsigned_type
+				| binary_type | long_true_varchar,
+				charset_no),
+			col_len);
+	}
+
+	error = row_create_table_for_mysql(table, trx);
+
+	if (error == DB_DUPLICATE_KEY) {
+		char buf[100];
+		char* buf_end = innobase_convert_identifier(
+			buf, sizeof buf - 1, table_name, strlen(table_name),
+			trx->mysql_thd, TRUE);	/* one byte is kept back for the terminator below */
+
+		*buf_end = '\0';
+		my_error(ER_TABLE_EXISTS_ERROR, MYF(0), buf);
+	}
+
+error_ret:
+	error = convert_error_code_to_mysql(error, flags, NULL);
+
+	DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Creates an index in an InnoDB database. @return the row_create_index_for_mysql() status converted to a MySQL error code */
+static
+int
+create_index(
+/*=========*/
+	trx_t*		trx,		/*!< in: InnoDB transaction handle */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	ulint		flags,		/*!< in: InnoDB table flags */
+	const char*	table_name,	/*!< in: table name */
+	uint		key_num)	/*!< in: index number */
+{
+	Field*		field;
+	dict_index_t*	index;
+	int		error;
+	ulint		n_fields;
+	KEY*		key;
+	KEY_PART_INFO*	key_part;
+	ulint		ind_type;
+	ulint		col_type;
+	ulint		prefix_len;
+	ulint		is_unsigned;
+	ulint		i;
+	ulint		j;
+	ulint*		field_lengths;
+
+	DBUG_ENTER("create_index");
+
+	key = form->key_info + key_num;
+
+	n_fields = key->key_parts;
+
+	/* Assert that "GEN_CLUST_INDEX" cannot be used as non-primary index */
+	ut_a(innobase_strcasecmp(key->name, innobase_index_reserve_name) != 0);
+
+	ind_type = 0;
+
+	if (key_num == form->s->primary_key) {
+		ind_type = ind_type | DICT_CLUSTERED; /* the MySQL primary key becomes the clustered index */
+	}
+
+	if (key->flags & HA_NOSAME ) {
+		ind_type = ind_type | DICT_UNIQUE;
+	}
+
+	/* We pass 0 as the space id, and determine at a lower level the space
+	id where to store the table */
+
+	index = dict_mem_index_create(table_name, key->name, 0,
+				      ind_type, n_fields);
+
+	field_lengths = (ulint*) my_malloc(sizeof(ulint) * n_fields,
+		MYF(MY_FAE)); /* result is used unchecked -- presumably MY_FAE makes failure fatal; confirm */
+
+	for (i = 0; i < n_fields; i++) {
+		key_part = key->key_part + i;
+
+		/* (The flag HA_PART_KEY_SEG denotes in MySQL a column prefix
+		field in an index: we only store a specified number of first
+		bytes of the column to the index field.) The flag does not
+		seem to be properly set by MySQL. Let us fall back on testing
+		the length of the key part versus the column. */
+
+		field = NULL;
+		for (j = 0; j < form->s->fields; j++) { /* find the table column named like the key part */
+
+			field = form->field[j];
+
+			if (0 == innobase_strcasecmp(
+					field->field_name,
+					key_part->field->field_name)) {
+				/* Found the corresponding column */
+
+				break;
+			}
+		}
+
+		ut_a(j < form->s->fields); /* the loop must have ended through the break */
+
+		col_type = get_innobase_type_from_mysql_type(
+					&is_unsigned, key_part->field);
+
+		if (DATA_BLOB == col_type
+			|| (key_part->length < field->pack_length()
+				&& field->type() != MYSQL_TYPE_VARCHAR)
+			|| (field->type() == MYSQL_TYPE_VARCHAR
+				&& key_part->length < field->pack_length()
+				- ((Field_varstring*)field)->length_bytes)) {
+
+			prefix_len = key_part->length; /* BLOB, or key part shorter than the column: prefix index */
+
+			if (col_type == DATA_INT
+				|| col_type == DATA_FLOAT
+				|| col_type == DATA_DOUBLE
+				|| col_type == DATA_DECIMAL) {
+				sql_print_error(
+					"MySQL is trying to create a column "
+					"prefix index field, on an "
+					"inappropriate data type. Table "
+					"name %s, column name %s.",
+					table_name,
+					key_part->field->field_name);
+
+				prefix_len = 0; /* logged only; the field is indexed in full */
+			}
+		} else {
+			prefix_len = 0;
+		}
+
+		field_lengths[i] = key_part->length;
+
+		dict_mem_index_add_field(index,
+			(char*) key_part->field->field_name, prefix_len);
+	}
+
+	/* Even though we've defined max_supported_key_part_length, we
+	still do our own checking using field_lengths to be absolutely
+	sure we don't create too long indexes. */
+	error = row_create_index_for_mysql(index, trx, field_lengths);
+
+	error = convert_error_code_to_mysql(error, flags, NULL);
+
+	my_free(field_lengths, MYF(0));
+
+	DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Builds the clustered index of an InnoDB table for which the user has
+defined no primary index. @return converted MySQL error code */
+static
+int
+create_clustered_index_when_no_primary(
+/*===================================*/
+	trx_t*		trx,		/*!< in: InnoDB transaction handle */
+	ulint		flags,		/*!< in: InnoDB table flags */
+	const char*	table_name)	/*!< in: table name */
+{
+	/* The index gets the reserved name and is created with a field
+	count of 0.
+	We pass 0 as the space id, and determine at a lower level the space
+	id where to store the table */
+
+	dict_index_t*	generated_index = dict_mem_index_create(
+		table_name, innobase_index_reserve_name,
+		0, DICT_CLUSTERED, 0);
+
+	/* NULL is passed in place of a field length array */
+	const int	innodb_err = row_create_index_for_mysql(
+		generated_index, trx, NULL);
+
+	return(convert_error_code_to_mysql(innodb_err, flags, NULL));
+}
+
+/*****************************************************************//**
+Validates the create options. We may build on this function
+in future. For now, it checks two specifiers: KEY_BLOCK_SIZE and
+ROW_FORMAT. Each problem found is pushed as a warning to thd.
+If innodb_strict_mode is not set then this function is a no-op.
+@return	TRUE if valid. */
+static
+ibool
+create_options_are_valid(
+/*=====================*/
+	THD*		thd,		/*!< in: connection thread. */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	HA_CREATE_INFO*	create_info)	/*!< in: create info. */
+{
+	ibool 	kbs_specified	= FALSE;
+	ibool	ret		= TRUE; /* cleared by each failed check; checking continues to the end */
+
+
+	ut_ad(thd != NULL);
+
+	/* If innodb_strict_mode is not set don't do any validation. */
+	if (!(THDVAR(thd, strict_mode))) {
+		return(TRUE);
+	}
+
+	ut_ad(form != NULL);
+	ut_ad(create_info != NULL);
+
+	/* First check if KEY_BLOCK_SIZE was specified. */
+	if (create_info->key_block_size
+	    || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+
+		kbs_specified = TRUE;
+		switch (create_info->key_block_size) {
+		case 1:
+		case 2:
+		case 4:
+		case 8:
+		case 16:
+			/* Valid value. */
+			break;
+		default: /* also reached for KEY_BLOCK_SIZE=0 given explicitly */
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_ILLEGAL_HA_CREATE_OPTION,
+					    "InnoDB: invalid"
+					    " KEY_BLOCK_SIZE = %lu."
+					    " Valid values are"
+					    " [1, 2, 4, 8, 16]",
+					    create_info->key_block_size);
+			ret = FALSE;
+		}
+	}
+	
+	/* If KEY_BLOCK_SIZE was specified, check for its
+	dependencies. */
+	if (kbs_specified && !srv_file_per_table) {
+		push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			     ER_ILLEGAL_HA_CREATE_OPTION,
+			     "InnoDB: KEY_BLOCK_SIZE"
+			     " requires innodb_file_per_table.");
+		ret = FALSE;
+	}
+
+	if (kbs_specified && srv_file_format < DICT_TF_FORMAT_ZIP) {
+		push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			     ER_ILLEGAL_HA_CREATE_OPTION,
+			     "InnoDB: KEY_BLOCK_SIZE"
+			     " requires innodb_file_format >"
+			     " Antelope.");
+		ret = FALSE;
+	}
+
+	/* Now check for ROW_FORMAT specifier. */
+	if (create_info->used_fields & HA_CREATE_USED_ROW_FORMAT) {
+		switch (form->s->row_type) {
+			const char* row_format_name; /* assigned in each case that uses it */
+		case ROW_TYPE_COMPRESSED:
+		case ROW_TYPE_DYNAMIC:
+			row_format_name
+				= form->s->row_type == ROW_TYPE_COMPRESSED
+				? "COMPRESSED"
+				: "DYNAMIC";
+
+			/* These two ROW_FORMATs require
+			srv_file_per_table and srv_file_format */
+			if (!srv_file_per_table) {
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: ROW_FORMAT=%s"
+					" requires innodb_file_per_table.",
+					row_format_name);
+					ret = FALSE;
+
+			}
+
+			if (srv_file_format < DICT_TF_FORMAT_ZIP) {
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: ROW_FORMAT=%s"
+					" requires innodb_file_format >"
+					" Antelope.",
+					row_format_name);
+					ret = FALSE;
+			}
+
+			/* Cannot specify KEY_BLOCK_SIZE with
+			ROW_FORMAT = DYNAMIC.
+			However, we do allow COMPRESSED to be
+			specified with KEY_BLOCK_SIZE. */
+			if (kbs_specified
+			    && form->s->row_type == ROW_TYPE_DYNAMIC) {
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: cannot specify"
+					" ROW_FORMAT = DYNAMIC with"
+					" KEY_BLOCK_SIZE.");
+					ret = FALSE;
+			}
+
+			break;
+
+		case ROW_TYPE_REDUNDANT:
+		case ROW_TYPE_COMPACT:
+		case ROW_TYPE_DEFAULT:
+			/* Default is COMPACT. */
+			row_format_name
+				= form->s->row_type == ROW_TYPE_REDUNDANT
+				? "REDUNDANT"
+				: "COMPACT";
+
+			/* Cannot specify KEY_BLOCK_SIZE with these
+			format specifiers. */
+			if (kbs_specified) {
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_ILLEGAL_HA_CREATE_OPTION,
+					"InnoDB: cannot specify"
+					" ROW_FORMAT = %s with"
+					" KEY_BLOCK_SIZE.",
+					row_format_name);
+					ret = FALSE;
+			}
+
+			break;
+
+		default:
+			push_warning(thd,
+				     MYSQL_ERROR::WARN_LEVEL_WARN,
+				     ER_ILLEGAL_HA_CREATE_OPTION,
+				     "InnoDB: invalid ROW_FORMAT specifier.");
+			ret = FALSE;
+
+		}
+	}
+
+	return(ret);
+}
+
+/*****************************************************************//**
+Copies the table's auto-increment value into create_info unless HA_CREATE_USED_AUTO is set.  Used in SHOW CREATE TABLE et al. */
+UNIV_INTERN
+void
+ha_innobase::update_create_info(
+/*============================*/
+	HA_CREATE_INFO* create_info)	/*!< in/out: create info */
+{
+	if (create_info->used_fields & HA_CREATE_USED_AUTO) { return; }
+
+	ha_innobase::info(HA_STATUS_AUTO);	/* presumably refreshes stats.auto_increment_value */
+	create_info->auto_increment_value = stats.auto_increment_value;
+}
+
+/*****************************************************************//**
+Creates a new table to an InnoDB database.
+@return	error number */
+UNIV_INTERN
+int
+ha_innobase::create(
+/*================*/
+	const char*	name,		/*!< in: table name */
+	TABLE*		form,		/*!< in: information on table
+					columns and indexes */
+	HA_CREATE_INFO*	create_info)	/*!< in: more information of the
+					created table, contains also the
+					create statement string */
+{
+	int		error;
+	dict_table_t*	innobase_table;
+	trx_t*		parent_trx;
+	trx_t*		trx;
+	int		primary_key_no;
+	uint		i;
+	char		name2[FN_REFLEN];
+	char		norm_name[FN_REFLEN];
+	THD*		thd = ha_thd();
+	ib_int64_t	auto_inc_value;
+	ulint		flags;
+	/* Cache the value of innodb_file_format, in case it is
+	modified by another thread while the table is being created. */
+	const ulint	file_format = srv_file_format;
+	const char*	stmt;
+	size_t		stmt_len;
+	enum row_type	row_type;
+
+	DBUG_ENTER("ha_innobase::create");
+
+	DBUG_ASSERT(thd != NULL);
+	DBUG_ASSERT(create_info != NULL);
+
+#ifdef __WIN__
+	/* Names passed in from server are in two formats:
+	1. <database_name>/<table_name>: for normal table creation
+	2. full path: for temp table creation, or sym link
+
+	When srv_file_per_table is on and mysqld_embedded is off,
+	check for full path pattern, i.e.
+	X:\dir\...,		X is a driver letter, or
+	\\dir1\dir2\...,	UNC path
+	returns error if it is in full path format, but not creating a temp.
+	table. Currently InnoDB does not support symbolic link on Windows. */
+
+	if (srv_file_per_table
+	    && !mysqld_embedded
+	    && (!create_info->options & HA_LEX_CREATE_TMP_TABLE)) {
+
+		if ((name[1] == ':')
+		    || (name[0] == '\\' && name[1] == '\\')) {
+			sql_print_error("Cannot create table %s\n", name);
+			DBUG_RETURN(HA_ERR_GENERIC);
+		}
+	}
+#endif
+
+	if (form->s->stored_fields > 1000) {
+		/* The limit probably should be REC_MAX_N_FIELDS - 3 = 1020,
+		but we play safe here */
+
+		DBUG_RETURN(HA_ERR_TO_BIG_ROW);
+	}
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created */
+
+	parent_trx = check_trx_exists(thd);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(parent_trx);
+
+	trx = innobase_trx_allocate(thd);
+
+	if (lower_case_table_names) {
+		srv_lower_case_table_names = TRUE;
+	} else {
+		srv_lower_case_table_names = FALSE;
+	}
+
+	strcpy(name2, name);
+
+	normalize_table_name(norm_name, name2);
+
+	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
+	or lock waits can happen in it during a table create operation.
+	Drop table etc. do this latching in row0mysql.c. */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Create the table definition in InnoDB */
+
+	flags = 0;
+
+	/* Validate create options if innodb_strict_mode is set. */
+	if (!create_options_are_valid(thd, form, create_info)) {
+		error = ER_ILLEGAL_HA_CREATE_OPTION;
+		goto cleanup;
+	}
+
+	if (create_info->key_block_size
+	    || (create_info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE)) {
+		/* Determine the page_zip.ssize corresponding to the
+		requested page size (key_block_size) in kilobytes. */
+
+		ulint	ssize, ksize;
+		ulint	key_block_size = create_info->key_block_size;
+
+		for (ssize = ksize = 1; ssize <= DICT_TF_ZSSIZE_MAX;
+		     ssize++, ksize <<= 1) {
+			if (key_block_size == ksize) {
+				flags = ssize << DICT_TF_ZSSIZE_SHIFT
+					| DICT_TF_COMPACT
+					| DICT_TF_FORMAT_ZIP
+					  << DICT_TF_FORMAT_SHIFT;
+				break;
+			}
+		}
+
+		if (!srv_file_per_table) {
+			push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				     ER_ILLEGAL_HA_CREATE_OPTION,
+				     "InnoDB: KEY_BLOCK_SIZE"
+				     " requires innodb_file_per_table.");
+			flags = 0;
+		}
+
+		if (file_format < DICT_TF_FORMAT_ZIP) {
+			push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				     ER_ILLEGAL_HA_CREATE_OPTION,
+				     "InnoDB: KEY_BLOCK_SIZE"
+				     " requires innodb_file_format >"
+				     " Antelope.");
+			flags = 0;
+		}
+
+		if (!flags) {
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_ILLEGAL_HA_CREATE_OPTION,
+					    "InnoDB: ignoring"
+					    " KEY_BLOCK_SIZE=%lu.",
+					    create_info->key_block_size);
+		}
+	}
+
+	row_type = form->s->row_type;
+
+	if (flags) {
+		/* KEY_BLOCK_SIZE was specified. */
+		if (!(create_info->used_fields & HA_CREATE_USED_ROW_FORMAT)) {
+			/* ROW_FORMAT was not specified;
+			default to ROW_FORMAT=COMPRESSED */
+			row_type = ROW_TYPE_COMPRESSED;
+		} else if (row_type != ROW_TYPE_COMPRESSED) {
+			/* ROW_FORMAT other than COMPRESSED
+			ignores KEY_BLOCK_SIZE.  It does not
+			make sense to reject conflicting
+			KEY_BLOCK_SIZE and ROW_FORMAT, because
+			such combinations can be obtained
+			with ALTER TABLE anyway. */
+			push_warning_printf(
+				thd,
+				MYSQL_ERROR::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: ignoring KEY_BLOCK_SIZE=%lu"
+				" unless ROW_FORMAT=COMPRESSED.",
+				create_info->key_block_size);
+			flags = 0;
+		}
+	} else {
+		/* No KEY_BLOCK_SIZE */
+		if (row_type == ROW_TYPE_COMPRESSED) {
+			/* ROW_FORMAT=COMPRESSED without
+			KEY_BLOCK_SIZE implies half the
+			maximum KEY_BLOCK_SIZE. */
+			flags = (DICT_TF_ZSSIZE_MAX - 1)
+				<< DICT_TF_ZSSIZE_SHIFT
+				| DICT_TF_COMPACT
+				| DICT_TF_FORMAT_ZIP
+				<< DICT_TF_FORMAT_SHIFT;
+//#if DICT_TF_ZSSIZE_MAX < 1
+//# error "DICT_TF_ZSSIZE_MAX < 1"
+//#endif
+		}
+	}
+
+	switch (row_type) {
+		const char* row_format_name;
+	case ROW_TYPE_REDUNDANT:
+		break;
+	case ROW_TYPE_COMPRESSED:
+	case ROW_TYPE_DYNAMIC:
+		row_format_name
+			= row_type == ROW_TYPE_COMPRESSED
+			? "COMPRESSED"
+			: "DYNAMIC";
+
+		if (!srv_file_per_table) {
+			push_warning_printf(
+				thd,
+				MYSQL_ERROR::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: ROW_FORMAT=%s"
+				" requires innodb_file_per_table.",
+				row_format_name);
+		} else if (file_format < DICT_TF_FORMAT_ZIP) {
+			push_warning_printf(
+				thd,
+				MYSQL_ERROR::WARN_LEVEL_WARN,
+				ER_ILLEGAL_HA_CREATE_OPTION,
+				"InnoDB: ROW_FORMAT=%s"
+				" requires innodb_file_format >"
+				" Antelope.",
+				row_format_name);
+		} else {
+			flags |= DICT_TF_COMPACT
+				| (DICT_TF_FORMAT_ZIP
+				   << DICT_TF_FORMAT_SHIFT);
+			break;
+		}
+
+		/* fall through */
+	case ROW_TYPE_NOT_USED:
+	case ROW_TYPE_FIXED:
+	default:
+		push_warning(thd,
+			     MYSQL_ERROR::WARN_LEVEL_WARN,
+			     ER_ILLEGAL_HA_CREATE_OPTION,
+			     "InnoDB: assuming ROW_FORMAT=COMPACT.");
+	case ROW_TYPE_DEFAULT:
+	case ROW_TYPE_COMPACT:
+		flags = DICT_TF_COMPACT;
+		break;
+	}
+
+	/* Look for a primary key */
+
+	primary_key_no= (form->s->primary_key != MAX_KEY ?
+			 (int) form->s->primary_key :
+			 -1);
+
+	/* Our function innobase_get_mysql_key_number_for_index assumes
+	the primary key is always number 0, if it exists */
+
+	ut_a(primary_key_no == -1 || primary_key_no == 0);
+
+	/* Check for name conflicts (with reserved name) for
+	any user indices to be created. */
+	if (innobase_index_name_is_reserved(trx, form->key_info,
+					    form->s->keys)) {
+		error = -1;
+		goto cleanup;
+	}
+
+	if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
+		flags |= DICT_TF2_TEMPORARY << DICT_TF2_SHIFT;
+	}
+
+	error = create_table_def(trx, form, norm_name,
+		create_info->options & HA_LEX_CREATE_TMP_TABLE ? name2 : NULL,
+		flags);
+
+	if (error) {
+		goto cleanup;
+	}
+
+
+	/* Create the keys */
+
+	if (form->s->keys == 0 || primary_key_no == -1) {
+		/* Create an index which is used as the clustered index;
+		order the rows by their row id which is internally generated
+		by InnoDB */
+
+		error = create_clustered_index_when_no_primary(
+			trx, flags, norm_name);
+		if (error) {
+			goto cleanup;
+		}
+	}
+
+	if (primary_key_no != -1) {
+		/* In InnoDB the clustered index must always be created
+		first */
+		if ((error = create_index(trx, form, flags, norm_name,
+					  (uint) primary_key_no))) {
+			goto cleanup;
+		}
+	}
+
+	for (i = 0; i < form->s->keys; i++) {
+
+		if (i != (uint) primary_key_no) {
+
+			if ((error = create_index(trx, form, flags, norm_name,
+						  i))) {
+				goto cleanup;
+			}
+		}
+	}
+
+	stmt = innobase_get_stmt(thd, &stmt_len);
+
+	if (stmt) {
+		error = row_table_add_foreign_constraints(
+			trx, stmt, stmt_len, norm_name,
+			create_info->options & HA_LEX_CREATE_TMP_TABLE);
+
+		error = convert_error_code_to_mysql(error, flags, NULL);
+
+		if (error) {
+			goto cleanup;
+		}
+	}
+
+	innobase_commit_low(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	innobase_table = dict_table_get(norm_name, FALSE);
+
+	DBUG_ASSERT(innobase_table != 0);
+
+	if (innobase_table) {
+		/* We update the highest file format in the system table
+		space, if this table has higher file format setting. */
+
+		trx_sys_file_format_max_upgrade(
+			(const char**) &innobase_file_format_check,
+			dict_table_get_format(innobase_table));
+	}
+
+	/* Note: We can't call update_thd() as prebuilt will not be
+	set up at this stage and so we use thd. */
+
+	/* We need to copy the AUTOINC value from the old table if
+	this is an ALTER TABLE or CREATE INDEX because CREATE INDEX
+	does a table copy too. */
+
+	if (((create_info->used_fields & HA_CREATE_USED_AUTO)
+	    || thd_sql_command(thd) == SQLCOM_ALTER_TABLE
+	    || thd_sql_command(thd) == SQLCOM_CREATE_INDEX)
+	    && create_info->auto_increment_value > 0) {
+
+		/* Query was one of :
+		CREATE TABLE ...AUTO_INCREMENT = x; or
+		ALTER TABLE...AUTO_INCREMENT = x;   or
+		CREATE INDEX x on t(...);
+		Find out a table definition from the dictionary and get
+		the current value of the auto increment field. Set a new
+		value to the auto increment field if the value is greater
+		than the maximum value in the column. */
+
+		auto_inc_value = create_info->auto_increment_value;
+
+		dict_table_autoinc_lock(innobase_table);
+		dict_table_autoinc_initialize(innobase_table, auto_inc_value);
+		dict_table_autoinc_unlock(innobase_table);
+	}
+
+	/* Tell the InnoDB server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	trx_free_for_mysql(trx);
+
+	DBUG_RETURN(0);
+
+cleanup:
+	innobase_commit_low(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_mysql(trx);
+
+	DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Discards the tablespace of the table open in this handle, or imports
+a tablespace for it, depending on the argument.
+@return	status of the operation as translated by
+convert_error_code_to_mysql() */
+UNIV_INTERN
+int
+ha_innobase::discard_or_import_tablespace(
+/*======================================*/
+	my_bool discard)	/*!< in: TRUE if discard, else import */
+{
+	DBUG_ENTER("ha_innobase::discard_or_import_tablespace");
+
+	/* The handle must already be bound to a valid transaction, and
+	it must be the transaction of the calling connection. */
+	ut_a(prebuilt->trx);
+	ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	trx_t*		cur_trx = prebuilt->trx;
+	dict_table_t*	tbl = prebuilt->table;
+
+	int	ret = discard
+		? row_discard_tablespace_for_mysql(tbl->name, cur_trx)
+		: row_import_tablespace_for_mysql(tbl->name, cur_trx);
+
+	/* Translate the InnoDB status into a MySQL handler error. */
+	ret = convert_error_code_to_mysql(ret, tbl->flags, NULL);
+
+	DBUG_RETURN(ret);
+}
+
+/*****************************************************************//**
+Deletes all rows of an InnoDB table on behalf of TRUNCATE TABLE. For any
+other SQL command, or when InnoDB answers the truncate with DB_ERROR, the
+call is refused with HA_ERR_WRONG_COMMAND.
+@return	error number */
+UNIV_INTERN
+int
+ha_innobase::delete_all_rows(void)
+/*==============================*/
+{
+	DBUG_ENTER("ha_innobase::delete_all_rows");
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created, and update prebuilt->trx */
+
+	update_thd(ha_thd());
+
+	/* We only handle TRUNCATE TABLE t as a special case.
+	DELETE FROM t will have to use ha_innobase::delete_row(),
+	because DELETE is transactional while TRUNCATE is not. */
+	if (thd_sql_command(user_thd) != SQLCOM_TRUNCATE) {
+		DBUG_RETURN(my_errno=HA_ERR_WRONG_COMMAND);
+	}
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	/* Truncate the table in InnoDB */
+
+	int	ret = row_truncate_table_for_mysql(prebuilt->table,
+						   prebuilt->trx);
+
+	if (ret == DB_ERROR) {
+		/* Cannot truncate; resort to ha_innobase::delete_row() */
+		DBUG_RETURN(my_errno=HA_ERR_WRONG_COMMAND);
+	}
+
+	/* The corruption flag is looked at again after the truncate. */
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	ret = convert_error_code_to_mysql(ret, prebuilt->table->flags,
+					  NULL);
+
+	DBUG_RETURN(ret);
+}
+
+/*****************************************************************//**
+Drops a table from an InnoDB database. Before calling this function,
+MySQL calls innobase_commit to commit the transaction of the current user.
+Then the current user cannot have locks set on the table. Drop table
+operation inside InnoDB will remove all locks any user has on the table
+inside InnoDB.
+@return	error number */
+UNIV_INTERN
+int
+ha_innobase::delete_table(
+/*======================*/
+	const char*	name)	/*!< in: table name */
+{
+	ulint	name_len;
+	int	error;
+	trx_t*	parent_trx;
+	trx_t*	trx;
+	THD	*thd = ha_thd();
+	char	norm_name[1000];
+
+	DBUG_ENTER("ha_innobase::delete_table");
+
+	/* Verify that the name fits into norm_name BEFORE
+	normalize_table_name() writes to that buffer, so that an overlong
+	name trips the assertion instead of overrunning the stack first.
+	NOTE(review): as before, the raw name is what is measured; this
+	presumes the normalized name is never longer than the raw one. */
+
+	name_len = strlen(name);
+
+	ut_a(name_len < sizeof(norm_name));
+
+	/* Strangely, MySQL passes the table name without the '.frm'
+	extension, in contrast to ::create */
+	normalize_table_name(norm_name, name);
+
+	if (IS_MAGIC_TABLE_AND_USER_DENIED_ACCESS(norm_name, thd)) {
+		DBUG_RETURN(HA_ERR_GENERIC);
+	}
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created */
+
+	parent_trx = check_trx_exists(thd);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(parent_trx);
+
+	/* The drop runs in a transaction of its own, which is committed
+	and freed below. */
+
+	trx = innobase_trx_allocate(thd);
+
+	if (lower_case_table_names) {
+		srv_lower_case_table_names = TRUE;
+	} else {
+		srv_lower_case_table_names = FALSE;
+	}
+
+	/* Drop the table in InnoDB; the last argument tells whether the
+	current SQL command is DROP DATABASE */
+
+	error = row_drop_table_for_mysql(norm_name, trx,
+					 thd_sql_command(thd)
+					 == SQLCOM_DROP_DB);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	/* Tell the InnoDB server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	innobase_commit_low(trx);
+
+	trx_free_for_mysql(trx);
+
+	error = convert_error_code_to_mysql(error, 0, NULL);
+
+	DBUG_RETURN(error);
+}
+
+/*****************************************************************//**
+Removes all tables in the named database inside InnoDB. */
+static
+void
+innobase_drop_database(
+/*===================*/
+	handlerton *hton, /*!< in: handlerton of Innodb */
+	char*	path)	/*!< in: database path; inside InnoDB the name
+			of the last directory in the path is used as
+			the database name: for example, in 'mysql/data/test'
+			the database name is 'test' */
+{
+	THD*	thd		= current_thd;
+	trx_t*	trx;
+	char*	name_start;
+	char*	db_prefix;
+	ulint	name_len;
+
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* In the Windows plugin, thd = current_thd is always NULL */
+	if (thd) {
+		/* Get the transaction associated with the current thd,
+		or create one if not yet created */
+
+		trx_t*	parent_trx = check_trx_exists(thd);
+
+		/* In case MySQL calls this in the middle of a SELECT
+		query, release possible adaptive hash latch to avoid
+		deadlocks of threads */
+
+		trx_search_latch_release_if_reserved(parent_trx);
+	}
+
+	/* Walk backwards from the last-but-one character of path until a
+	path separator (or the start of path) is passed, counting the
+	characters of the final directory name on the way.
+	NOTE(review): starting two characters before the terminator
+	presumably skips a trailing separator - confirm with the caller. */
+
+	name_len = 0;
+
+	for (name_start = strend(path) - 2;
+	     name_start >= path
+	     && *name_start != '\\' && *name_start != '/';
+	     name_start--) {
+
+		name_len++;
+	}
+
+	name_start++;
+
+	/* Build "<directory name>/" in a buffer of our own */
+
+	db_prefix = (char*) my_malloc((uint) name_len + 2, MYF(0));
+
+	memcpy(db_prefix, name_start, name_len);
+	db_prefix[name_len] = '/';
+	db_prefix[name_len + 1] = '\0';
+#ifdef	__WIN__
+	innobase_casedn_str(db_prefix);
+#endif
+#if defined __WIN__ && !defined MYSQL_SERVER
+	/* In the Windows plugin, thd = current_thd is always NULL */
+	trx = trx_allocate_for_mysql();
+	trx->mysql_thd = NULL;
+#else
+	trx = innobase_trx_allocate(thd);
+#endif
+	/* The status returned by the drop is not examined */
+	row_drop_database_for_mysql(db_prefix, trx);
+	my_free(db_prefix, MYF(0));
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	/* Tell the InnoDB server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	innobase_commit_low(trx);
+	trx_free_for_mysql(trx);
+}
+/*********************************************************************//**
+Renames an InnoDB table.
+@return	status of row_rename_table_for_mysql() (DB_SUCCESS when the
+rename worked), or DB_OUT_OF_MEMORY if the buffers for the normalized
+names could not be allocated; the caller converts the code for MySQL */
+static
+int
+innobase_rename_table(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	const char*	from,	/*!< in: old name of the table */
+	const char*	to,	/*!< in: new name of the table */
+	ibool		lock_and_commit)
+				/*!< in: TRUE=lock data dictionary and commit */
+{
+	int	error;
+	char*	norm_to;
+	char*	norm_from;
+	DBUG_ENTER("innobase_rename_table");
+
+	if (lower_case_table_names) {
+		srv_lower_case_table_names = TRUE;
+	} else {
+		srv_lower_case_table_names = FALSE;
+	}
+
+	// Magic number 64 arbitrary
+	norm_to = (char*) my_malloc(strlen(to) + 64, MYF(0));
+	norm_from = (char*) my_malloc(strlen(from) + 64, MYF(0));
+
+	if (norm_to == NULL || norm_from == NULL) {
+		/* With MYF(0) an allocation failure is reported only
+		through a NULL result. Give up before the buffers are
+		written to and before any lock is taken, releasing the
+		buffer that was obtained, if any. */
+
+		if (norm_to != NULL) {
+			my_free(norm_to, MYF(0));
+		}
+
+		if (norm_from != NULL) {
+			my_free(norm_from, MYF(0));
+		}
+
+		DBUG_RETURN((int) DB_OUT_OF_MEMORY);
+	}
+
+	normalize_table_name(norm_to, to);
+	normalize_table_name(norm_from, from);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	if (lock_and_commit) {
+		row_mysql_lock_data_dictionary(trx);
+	}
+
+	error = row_rename_table_for_mysql(
+		norm_from, norm_to, trx, lock_and_commit);
+
+	if (error != DB_SUCCESS) {
+		/* Leave a trace of the failed rename in the foreign key
+		error file */
+		FILE* ef = dict_foreign_err_file;
+
+		DBUG_PRINT("info", ("rename failed: %d", error));
+		fputs("InnoDB: Renaming table ", ef);
+		ut_print_name(ef, trx, TRUE, norm_from);
+		fputs(" to ", ef);
+		ut_print_name(ef, trx, TRUE, norm_to);
+		fputs(" failed!\n", ef);
+	}
+
+	if (lock_and_commit) {
+		row_mysql_unlock_data_dictionary(trx);
+
+		/* Flush the log to reduce probability that the .frm
+		files and the InnoDB data dictionary get out-of-sync
+		if the user runs with innodb_flush_log_at_trx_commit = 0 */
+
+		log_buffer_flush_to_disk();
+	}
+
+	my_free(norm_to, MYF(0));
+	my_free(norm_from, MYF(0));
+
+	DBUG_RETURN(error);
+}
+/*********************************************************************//**
+Renames an InnoDB table. The rename runs in a transaction allocated for
+it here, which is committed and freed before the result is reported.
+@return	error number as translated by convert_error_code_to_mysql() */
+UNIV_INTERN
+int
+ha_innobase::rename_table(
+/*======================*/
+	const char*	from,	/*!< in: old name of the table */
+	const char*	to)	/*!< in: new name of the table */
+{
+	THD*	thd		= ha_thd();
+
+	DBUG_ENTER("ha_innobase::rename_table");
+
+	/* Get the transaction associated with the current thd, or create one
+	if not yet created */
+
+	trx_t*	caller_trx = check_trx_exists(thd);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(caller_trx);
+
+	trx_t*	rename_trx = innobase_trx_allocate(thd);
+
+	int	ret = innobase_rename_table(rename_trx, from, to, TRUE);
+
+	/* Tell the InnoDB server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	innobase_commit_low(rename_trx);
+	trx_free_for_mysql(rename_trx);
+
+	if (ret == (int) DB_DUPLICATE_KEY) {
+		/* A duplicate key here means that a table with the
+		target name already exists. MySQL handles a duplicate
+		key error by re-entering the storage layer and calling
+		get_dup_key(), which requires a valid table handle
+		('row_prebuilt_t' structure) that may no longer be
+		available at that stage and could lead to a SIGSEGV.
+		So report 'table exists' ourselves and hand back the
+		generic error code instead. */
+
+		my_error(ER_TABLE_EXISTS_ERROR, MYF(0), to);
+
+		ret = DB_ERROR;
+	}
+
+	ret = convert_error_code_to_mysql(ret, 0, NULL);
+
+	DBUG_RETURN(ret);
+}
+
+/*********************************************************************//**
+Estimates the number of index records in a range.
+@return	estimated number of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::records_in_range(
+/*==========================*/
+	uint			keynr,		/*!< in: index number */
+	key_range		*min_key,	/*!< in: start key value of the
+						   range, may also be 0 */
+	key_range		*max_key)	/*!< in: range end key val, may
+						   also be 0 */
+{
+	/* Scratch buffer for the converted end-of-range key; the
+	start-of-range key is converted into key_val_buff */
+	ulint		buff2_len = table->s->stored_rec_length
+					+ table->s->max_key_length + 100;
+	uchar*		key_val_buff2	= (uchar*) my_malloc(buff2_len,
+							     MYF(MY_FAE));
+	KEY*		key;
+	dict_index_t*	index;
+	ib_int64_t	n_rows;
+
+	DBUG_ENTER("records_in_range");
+
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	prebuilt->trx->op_info = (char*)"estimating records in index range";
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	active_index = keynr;
+
+	key = table->key_info + active_index;
+
+	index = innobase_get_index(keynr);
+
+	if (UNIV_UNLIKELY(!index)) {
+		/* The requested index may be missing because of an
+		inconsistency between the MySQL and InnoDB dictionary
+		info. Necessary message should have been printed in
+		innobase_get_index() */
+
+		n_rows = HA_POS_ERROR;
+
+	} else if (UNIV_UNLIKELY(!row_merge_is_index_usable(prebuilt->trx,
+							     index))) {
+		/* NOTE(review): a handler error code is stored in the
+		row count here and is returned as the estimate - confirm
+		that callers expect this. */
+
+		n_rows = HA_ERR_TABLE_DEF_CHANGED;
+
+	} else {
+		/* Both range bounds are built as tuples in one heap
+		that lives only for the duration of the estimate */
+
+		mem_heap_t*	heap = mem_heap_create(
+			2 * (key->key_parts * sizeof(dfield_t)
+			     + sizeof(dtuple_t)));
+
+		dtuple_t*	range_start = dtuple_create(heap,
+							    key->key_parts);
+		dict_index_copy_types(range_start, index, key->key_parts);
+
+		dtuple_t*	range_end = dtuple_create(heap,
+							  key->key_parts);
+		dict_index_copy_types(range_end, index, key->key_parts);
+
+		row_sel_convert_mysql_key_to_innobase(
+				range_start, (byte*) key_val_buff,
+				(ulint)upd_and_key_val_buff_len,
+				index,
+				(byte*) (min_key ? min_key->key :
+					 (const uchar*) 0),
+				(ulint) (min_key ? min_key->length : 0),
+				prebuilt->trx);
+
+		row_sel_convert_mysql_key_to_innobase(
+				range_end, (byte*) key_val_buff2,
+				buff2_len, index,
+				(byte*) (max_key ? max_key->key :
+					 (const uchar*) 0),
+				(ulint) (max_key ? max_key->length : 0),
+				prebuilt->trx);
+
+		/* A missing bound is treated as HA_READ_KEY_EXACT */
+
+		ulint	mode1 = convert_search_mode_to_innobase(
+			min_key ? min_key->flag : HA_READ_KEY_EXACT);
+		ulint	mode2 = convert_search_mode_to_innobase(
+			max_key ? max_key->flag : HA_READ_KEY_EXACT);
+
+		if (mode1 == PAGE_CUR_UNSUPP || mode2 == PAGE_CUR_UNSUPP) {
+
+			n_rows = HA_POS_ERROR;
+		} else {
+
+			n_rows = btr_estimate_n_rows_in_range(
+				index, range_start, mode1,
+				range_end, mode2);
+		}
+
+		mem_heap_free(heap);
+	}
+
+	my_free(key_val_buff2, MYF(0));
+
+	prebuilt->trx->op_info = (char*)"";
+
+	/* The MySQL optimizer seems to believe an estimate of 0 rows is
+	always accurate and may return the result 'Empty set' based on that.
+	The accuracy is not guaranteed, and even if it were, for a locking
+	read we should anyway perform the search to set the next-key lock.
+	Add 1 to the value to make sure MySQL does not make the assumption! */
+
+	if (n_rows == 0) {
+		n_rows = 1;
+	}
+
+	DBUG_RETURN((ha_rows) n_rows);
+}
+
+/*********************************************************************//**
+Gives an UPPER BOUND to the number of rows in a table. This is used in
+filesort.cc.
+@return	upper bound of rows */
+UNIV_INTERN
+ha_rows
+ha_innobase::estimate_rows_upper_bound(void)
+/*======================================*/
+{
+	DBUG_ENTER("estimate_rows_upper_bound");
+
+	/* We do not know if MySQL can call this function before calling
+	external_lock(). To be safe, update the thd of the current table
+	handle. */
+
+	update_thd(ha_thd());
+
+	prebuilt->trx->op_info = (char*)
+				 "calculating upper bound for table rows";
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	dict_index_t*	first_index
+		= dict_table_get_first_index(prebuilt->table);
+
+	ut_a(first_index->stat_n_leaf_pages > 0);
+
+	/* Size in bytes of the leaf pages of that index */
+
+	ulonglong	leaf_bytes
+		= ((ulonglong) first_index->stat_n_leaf_pages)
+		* UNIV_PAGE_SIZE;
+
+	/* Calculate a minimum length for a clustered index record and from
+	that an upper bound for the number of rows. Since we only calculate
+	new statistics in row0mysql.c when a table has grown by a threshold
+	factor, we must add a safety factor 2 in front of the formula below. */
+
+	ulonglong	upper_bound
+		= 2 * leaf_bytes / dict_index_calc_min_rec_len(first_index);
+
+	prebuilt->trx->op_info = (char*)"";
+
+	DBUG_RETURN((ha_rows) upper_bound);
+}
+
+/*********************************************************************//**
+How many seeks it will take to read through the table. This is to be
+comparable to the number returned by records_in_range so that we can
+decide if we should scan the table or use keys.
+@return	estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::scan_time()
+/*====================*/
+{
+	/* The clustered index size statistic is reported as it is: we
+	pretend that a sequential read takes the same time as a random
+	disk read, because MySQL seems to favor table scans too much over
+	index searches. Dividing by 10 would be physically realistic,
+	but we do not do that. */
+
+	const double	clust_size
+		= (double) (prebuilt->table->stat_clustered_index_size);
+
+	return(clust_size);
+}
+
+/******************************************************************//**
+Calculate the time it takes to read a set of ranges through an index
+This enables us to optimise reads for clustered indexes.
+@return	estimated time measured in disk seeks */
+UNIV_INTERN
+double
+ha_innobase::read_time(
+/*===================*/
+	uint	index,	/*!< in: key number */
+	uint	ranges,	/*!< in: how many ranges */
+	ha_rows rows)	/*!< in: estimated number of rows in the ranges */
+{
+	/* Anything but the primary key is not clustered: leave the
+	estimate to the generic handler code */
+	if (index != table->s->primary_key) {
+
+		return(handler::read_time(index, ranges, rows));
+	}
+
+	/* For up to two rows the row count itself is the estimate */
+	if (rows <= 2) {
+
+		return((double) rows);
+	}
+
+	/* Assume that the read time is proportional to the scan time for all
+	rows + at most one seek per range. */
+
+	const double	full_scan_time = scan_time();
+	const ha_rows	rows_upper_bound = estimate_rows_upper_bound();
+
+	/* More rows requested than the upper bound for the table: the
+	cost is that of a full scan */
+	if (rows_upper_bound < rows) {
+
+		return(full_scan_time);
+	}
+
+	return(ranges
+	       + (double) rows / (double) rows_upper_bound * full_scan_time);
+}
+
+/******************************************************************//**
+Tells whether the InnoDB table attached to the share of this handle
+carries the is_corrupt flag.
+@return	the flag of share->ib_table, FALSE if the share has no table */
+UNIV_INTERN
+bool
+ha_innobase::is_corrupt() const
+{
+	if (!share->ib_table) {
+		return (FALSE);
+	}
+
+	return ((bool)share->ib_table->is_corrupt);
+}
+
+/*********************************************************************//**
+Calculates the key number used inside MySQL for an Innobase index. We will
+first check the "index translation table" for a match of the index to get
+the index number. If there does not exist an "index translation table",
+or not able to find the index in the translation table, then we will fall back
+to the traditional way of looping through dict_index_t list to find a
+match. In this case, we have to take into account if we generated a
+default clustered index for the table
+@return the key number used inside MySQL; 0 is also returned, after an
+error message is printed, when no match is found at all */
+static
+unsigned int
+innobase_get_mysql_key_number_for_index(
+/*====================================*/
+	INNOBASE_SHARE*		share,	/*!< in: share structure for index
+					translation table. */
+	const TABLE*		table,	/*!< in: table in MySQL data
+					dictionary */
+	dict_table_t*		ib_table,/*!< in: table in Innodb data
+					dictionary */
+	const dict_index_t*	index)	/*!< in: index */
+{
+	const dict_index_t*	ind;
+	unsigned int		i;
+
+	ut_ad(index);
+	ut_ad(ib_table);
+	ut_ad(table);
+	ut_ad(share);
+
+	if (index->table != ib_table) {
+		/* The index does not belong to the table of the share
+		structure: its number is its position in the index list
+		of index->table */
+
+		unsigned int	pos = 0;
+
+		for (ind = dict_table_get_first_index(index->table);
+		     ind != index;
+		     ind = dict_table_get_next_index(ind)) {
+
+			pos++;
+		}
+
+		/* Do not count a default clustered index, if the table
+		got one */
+		if (row_table_got_default_clust_index(index->table)) {
+			ut_a(pos > 0);
+			pos--;
+		}
+
+		return(pos);
+	}
+
+	/* If index translation table exists, we will first check
+	the index through index translation table for a match. */
+	if (share->idx_trans_tbl.index_mapping) {
+
+		for (i = 0; i < share->idx_trans_tbl.index_count; i++) {
+
+			if (index == share->idx_trans_tbl.index_mapping[i]) {
+
+				return(i);
+			}
+		}
+
+		/* Print an error message if we cannot find the index
+		in the "index translation table". */
+		sql_print_error("Cannot find index %s in InnoDB index "
+				"translation table.", index->name);
+	}
+
+	/* If we do not have an "index translation table", or not able
+	to find the index in the translation table, we'll directly find
+	matching index in the dict_index_t list */
+	for (i = 0; i < table->s->keys; i++) {
+
+		ind = dict_table_get_index_on_name(
+			ib_table, table->key_info[i].name);
+
+		if (ind == index) {
+
+			return(i);
+		}
+	}
+
+	sql_print_error("Cannot find matching index number for index %s "
+			 "in InnoDB index list.", index->name);
+
+	return(0);
+}
+/*********************************************************************//**
+Returns statistics information of the table to the MySQL interpreter,
+in various fields of the handle object. */
+UNIV_INTERN
+int
+ha_innobase::info(
+/*==============*/
+	uint flag)	/*!< in: what information MySQL requests */
+{
+	dict_table_t*	ib_table;
+	dict_index_t*	index;
+	ha_rows		rec_per_key;
+	ib_int64_t	n_rows;
+	char		path[FN_REFLEN];
+	os_file_stat_t	stat_info;
+
+	DBUG_ENTER("info");
+
+	/* If we are forcing recovery at a high level, we will suppress
+	statistics calculation on tables, because that may crash the
+	server if an index is badly corrupted. */
+
+	/* We do not know if MySQL can call this function before calling
+	external_lock(). To be safe, update the thd of the current table
+	handle. */
+
+	update_thd(ha_thd());
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	prebuilt->trx->op_info = (char*)"returning various info to MySQL";
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	ib_table = prebuilt->table;
+
+	if (flag & HA_STATUS_TIME) {
+		if ((innobase_stats_on_metadata
+		     || thd_sql_command(user_thd) == SQLCOM_ANALYZE)
+		    && !share->ib_table->is_corrupt) {
+			/* In sql_show we call with this flag: update
+			the statistics then so that they are up-to-date */
+
+			if (srv_use_sys_stats_table && !((ib_table->flags >> DICT_TF2_SHIFT) & DICT_TF2_TEMPORARY)
+			    && thd_sql_command(user_thd) == SQLCOM_ANALYZE) {
+				/* If the indexes on the table don't have enough rows in SYS_STATS system table, */
+				/* they need to be created. */
+				dict_index_t*	index;
+
+				prebuilt->trx->op_info = "confirming rows of SYS_STATS to store statistics";
+
+				ut_a(prebuilt->trx->conc_state == TRX_NOT_STARTED);
+
+				for (index = dict_table_get_first_index(ib_table);
+				     index != NULL;
+				     index = dict_table_get_next_index(index)) {
+					row_insert_stats_for_mysql(index, prebuilt->trx);
+					innobase_commit_low(prebuilt->trx);
+				}
+
+				ut_a(prebuilt->trx->conc_state == TRX_NOT_STARTED);
+			}
+
+			prebuilt->trx->op_info = "updating table statistics";
+
+			dict_update_statistics(ib_table,
+				(thd_sql_command(user_thd) == SQLCOM_ANALYZE)?TRUE:FALSE);
+
+			prebuilt->trx->op_info = "returning various info to MySQL";
+		}
+
+		my_snprintf(path, sizeof(path), "%s/%s%s",
+				mysql_data_home, ib_table->name, reg_ext);
+
+		unpack_filename(path,path);
+
+		/* Note that we do not know the access time of the table,
+		nor the CHECK TABLE time, nor the UPDATE or INSERT time. */
+
+		if (os_file_get_status(path,&stat_info)) {
+			stats.create_time = (ulong) stat_info.ctime;
+		}
+	}
+
+	if (flag & HA_STATUS_VARIABLE) {
+		n_rows = ib_table->stat_n_rows;
+
+		/* Because we do not protect stat_n_rows by any mutex in a
+		delete, it is theoretically possible that the value can be
+		smaller than zero! TODO: fix this race.
+
+		The MySQL optimizer seems to assume in a left join that n_rows
+		is an accurate estimate if it is zero. Of course, it is not,
+		since we do not have any locks on the rows yet at this phase.
+		Since SHOW TABLE STATUS seems to call this function with the
+		HA_STATUS_TIME flag set, while the left join optimizer does not
+		set that flag, we add one to a zero value if the flag is not
+		set. That way SHOW TABLE STATUS will show the best estimate,
+		while the optimizer never sees the table empty. */
+
+		if (n_rows < 0) {
+			n_rows = 0;
+		}
+
+		if (n_rows == 0 && !(flag & HA_STATUS_TIME)) {
+			n_rows++;
+		}
+
+		/* Fix bug#40386: Not flushing query cache after truncate.
+		n_rows can not be 0 unless the table is empty, set to 1
+		instead. The original problem of bug#29507 is actually
+		fixed in the server code. */
+		if (thd_sql_command(user_thd) == SQLCOM_TRUNCATE) {
+
+			n_rows = 1;
+
+			/* We need to reset the prebuilt value too, otherwise
+			checks for values greater than the last value written
+			to the table will fail and the autoinc counter will
+			not be updated. This will force write_row() into
+			attempting an update of the table's AUTOINC counter. */
+
+			prebuilt->autoinc_last_value = 0;
+		}
+
+		stats.records = (ha_rows)n_rows;
+		stats.deleted = 0;
+		stats.data_file_length = ((ulonglong)
+				ib_table->stat_clustered_index_size)
+					* UNIV_PAGE_SIZE;
+		stats.index_file_length = ((ulonglong)
+				ib_table->stat_sum_of_other_index_sizes)
+					* UNIV_PAGE_SIZE;
+
+		/* Since fsp_get_available_space_in_free_extents() is
+		acquiring latches inside InnoDB, we do not call it if we
+		are asked by MySQL to avoid locking. Another reason to
+		avoid the call is that it uses quite a lot of CPU.
+		See Bug#38185. */
+		if (flag & HA_STATUS_NO_LOCK) {
+			/* We do not update delete_length if no
+			locking is requested so the "old" value can
+			remain. delete_length is initialized to 0 in
+			the ha_statistics' constructor. */
+		} else if (UNIV_UNLIKELY
+			   (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE)) {
+			/* Avoid accessing the tablespace if
+			innodb_crash_recovery is set to a high value. */
+			stats.delete_length = 0;
+		} else if (srv_stats_update_need_lock) {
+
+			/* lock the data dictionary to avoid races with
+			ibd_file_missing and tablespace_discarded */
+			row_mysql_lock_data_dictionary(prebuilt->trx);
+
+			/* ib_table->space must be an existent tablespace */
+			if (!ib_table->ibd_file_missing
+			    && !ib_table->tablespace_discarded) {
+
+				stats.delete_length =
+					fsp_get_available_space_in_free_extents(
+						ib_table->space) * 1024;
+			} else {
+
+				THD*	thd;
+
+				thd = ha_thd();
+
+				push_warning_printf(
+					thd,
+					MYSQL_ERROR::WARN_LEVEL_WARN,
+					ER_CANT_GET_STAT,
+					"InnoDB: Trying to get the free "
+					"space for table %s but its "
+					"tablespace has been discarded or "
+					"the .ibd file is missing. Setting "
+					"the free space to zero.",
+					ib_table->name);
+
+				stats.delete_length = 0;
+			}
+
+			row_mysql_unlock_data_dictionary(prebuilt->trx);
+		}
+
+		stats.check_time = 0;
+		stats.mrr_length_per_rec= ref_length +  8; // 8 = max(sizeof(void *));
+
+		if (stats.records == 0) {
+			stats.mean_rec_length = 0;
+		} else {
+			stats.mean_rec_length = (ulong) (stats.data_file_length / stats.records);
+		}
+	}
+
+	if (flag & HA_STATUS_CONST) {
+		ulong	i;
+		/* Verify the number of index in InnoDB and MySQL
+		matches up. If prebuilt->clust_index_was_generated
+		holds, InnoDB defines GEN_CLUST_INDEX internally */
+		ulint	num_innodb_index = UT_LIST_GET_LEN(ib_table->indexes)
+					- prebuilt->clust_index_was_generated;
+
+		if (table->s->keys != num_innodb_index) {
+			sql_print_error("Table %s contains %lu "
+					"indexes inside InnoDB, which "
+					"is different from the number of "
+					"indexes %u defined in the MySQL ",
+					ib_table->name, num_innodb_index,
+					table->s->keys);
+		}
+
+		for (i = 0; i < table->s->keys; i++) {
+			ulong	j;
+			/* We could get index quickly through internal
+			index mapping with the index translation table.
+			The identity of index (match up index name with
+			that of table->key_info[i]) is already verified in
+			innobase_get_index().  */
+			index = innobase_get_index(i);
+
+			if (index == NULL) {
+				sql_print_error("Table %s contains fewer "
+						"indexes inside InnoDB than "
+						"are defined in the MySQL "
+						".frm file. Have you mixed up "
+						".frm files from different "
+						"installations? See "
+						REFMAN
+						"innodb-troubleshooting.html\n",
+						ib_table->name);
+				break;
+			}
+
+			for (j = 0; j < table->key_info[i].key_parts; j++) {
+
+				if (j + 1 > index->n_uniq) {
+					sql_print_error(
+"Index %s of %s has %lu columns unique inside InnoDB, but MySQL is asking "
+"statistics for %lu columns. Have you mixed up .frm files from different "
+"installations? "
+"See " REFMAN "innodb-troubleshooting.html\n",
+							index->name,
+							ib_table->name,
+							(unsigned long)
+							index->n_uniq, j + 1);
+					break;
+				}
+
+				dict_index_stat_mutex_enter(index);
+
+				if (index->stat_n_diff_key_vals[j + 1] == 0) {
+
+					rec_per_key = stats.records;
+				} else {
+					rec_per_key = (ha_rows)(stats.records /
+					 index->stat_n_diff_key_vals[j + 1]);
+				}
+
+				dict_index_stat_mutex_exit(index);
+
+				/* Since MySQL seems to favor table scans
+				too much over index searches, we pretend
+				index selectivity is 2 times better than
+				our estimate: */
+
+				rec_per_key = rec_per_key / 2;
+
+				if (rec_per_key == 0) {
+					rec_per_key = 1;
+				}
+
+				table->key_info[i].rec_per_key[j]=
+				  rec_per_key >= ~(ulong) 0 ? ~(ulong) 0 :
+				  (ulong) rec_per_key;
+			}
+		}
+	}
+
+	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
+
+		goto func_exit;
+	}
+
+	if (flag & HA_STATUS_ERRKEY) {
+		const dict_index_t*	err_index;
+
+		ut_a(prebuilt->trx);
+		ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+
+		err_index = trx_get_error_info(prebuilt->trx);
+
+		if (err_index) {
+			errkey = innobase_get_mysql_key_number_for_index(
+					share, table, ib_table, err_index);
+		} else {
+			errkey = (unsigned int) prebuilt->trx->error_key_num;
+		}
+	}
+
+	if ((flag & HA_STATUS_AUTO) && table->found_next_number_field) {
+		stats.auto_increment_value = innobase_peek_autoinc();
+	}
+
+func_exit:
+	prebuilt->trx->op_info = (char*)"";
+
+	DBUG_RETURN(0);
+}
+
+/**********************************************************************//**
+Refreshes the statistics of the table by delegating to ::info() with the
+time, constant and variable status flags. The resulting figures are
+estimates; this does NOT calculate exact statistics on the table.
+@return	HA_ADMIN_CORRUPT if the table is flagged as corrupt before or
+after the refresh, 0 otherwise */
+UNIV_INTERN
+int
+ha_innobase::analyze(
+/*=================*/
+	THD*		thd,		/*!< in: connection thread handle
+					(not used by this function) */
+	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
+{
+	/* Do not touch a table that is already marked as corrupt. */
+	if (share->ib_table->is_corrupt) {
+
+		return(HA_ADMIN_CORRUPT);
+	}
+
+	/* All of the actual work is done by ::info(). */
+	info(HA_STATUS_TIME | HA_STATUS_CONST | HA_STATUS_VARIABLE);
+
+	/* Look at the corruption flag once more after the refresh. */
+	return(share->ib_table->is_corrupt ? HA_ADMIN_CORRUPT : 0);
+}
+
+/**********************************************************************//**
+Handler entry point for OPTIMIZE TABLE. No work is performed here: the
+function unconditionally answers HA_ADMIN_TRY_ALTER. This is mapped to
+"ALTER TABLE tablename ENGINE=InnoDB", which rebuilds the table in MySQL.
+@return	always HA_ADMIN_TRY_ALTER */
+UNIV_INTERN
+int
+ha_innobase::optimize(
+/*==================*/
+	THD*		thd,		/*!< in: connection thread handle
+					(not used by this function) */
+	HA_CHECK_OPT*	check_opt)	/*!< in: currently ignored */
+{
+	const int	admin_code = HA_ADMIN_TRY_ALTER;
+
+	return(admin_code);
+}
+
+/*******************************************************************//**
+Tries to check that an InnoDB table is not corrupted. Each index of the
+table is validated with btr_validate_index() and then scanned with
+row_check_index_for_mysql() to count its records; the count of every
+further index is compared against the count of the first index of the
+table. The adaptive hash index is validated as well. Problems are pushed
+to the client as warnings. In case of corruption may also assert a failure
+and crash the server.
+@return	HA_ADMIN_CORRUPT or HA_ADMIN_OK */
+UNIV_INTERN
+int
+ha_innobase::check(
+/*===============*/
+	THD*		thd,		/*!< in: user thread handle */
+	HA_CHECK_OPT*	check_opt)	/*!< in: check options, currently
+					ignored */
+{
+	dict_index_t*	index;
+	ulint		n_rows;
+	ulint		n_rows_in_table	= ULINT_UNDEFINED;
+	ibool		is_ok		= TRUE;
+	ulint		old_isolation_level;
+
+	DBUG_ENTER("ha_innobase::check");
+	DBUG_ASSERT(thd == ha_thd());
+	ut_a(prebuilt->trx);
+	ut_a(prebuilt->trx->magic_n == TRX_MAGIC_N);
+	ut_a(prebuilt->trx == thd_to_trx(thd));
+
+	if (prebuilt->mysql_template == NULL) {
+		/* Build the template; we will use a dummy template
+		in index scans done in checking */
+
+		build_template(prebuilt, NULL, table, this, ROW_MYSQL_WHOLE_ROW);
+	}
+
+	if (prebuilt->table->ibd_file_missing) {
+		sql_print_error("InnoDB: Error:\n"
+			"InnoDB: MySQL is trying to use a table handle"
+			" but the .ibd file for\n"
+			"InnoDB: table %s does not exist.\n"
+			"InnoDB: Have you deleted the .ibd file"
+			" from the database directory under\n"
+			"InnoDB: the MySQL datadir, or have you"
+			" used DISCARD TABLESPACE?\n"
+			"InnoDB: Please refer to\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			"InnoDB: how you can resolve the problem.\n",
+			prebuilt->table->name);
+		DBUG_RETURN(HA_ADMIN_CORRUPT);
+	}
+
+	prebuilt->trx->op_info = "checking table";
+
+	old_isolation_level = prebuilt->trx->isolation_level;
+
+	/* We must run the index record counts at an isolation level
+	>= READ COMMITTED, because a dirty read can see a wrong number
+	of records in some index; to play safe, we use always
+	REPEATABLE READ here */
+
+	prebuilt->trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+	/* Enlarge the fatal lock wait timeout during CHECK TABLE. */
+	mutex_enter(&kernel_mutex);
+	srv_fatal_semaphore_wait_threshold += 7200; /* 2 hours */
+	mutex_exit(&kernel_mutex);
+
+	for (index = dict_table_get_first_index(prebuilt->table);
+	     index != NULL;
+	     index = dict_table_get_next_index(index)) {
+#if 0
+		fputs("Validating index ", stderr);
+		ut_print_name(stderr, trx, FALSE, index->name);
+		putc('\n', stderr);
+#endif
+
+		if (!btr_validate_index(index, prebuilt->trx)) {
+			is_ok = FALSE;
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_NOT_KEYFILE,
+					    "InnoDB: The B-tree of"
+					    " index '%-.200s' is corrupted.",
+					    index->name);
+			continue;
+		}
+
+		/* Instead of invoking change_active_index(), set up
+		a dummy template for non-locking reads, disabling
+		access to the clustered index. */
+		prebuilt->index = index;
+
+		prebuilt->index_usable = row_merge_is_index_usable(
+			prebuilt->trx, prebuilt->index);
+
+		if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    HA_ERR_TABLE_DEF_CHANGED,
+					    "InnoDB: Insufficient history for"
+					    " index '%-.200s'",
+					    index->name);
+			continue;
+		}
+
+		prebuilt->sql_stat_start = TRUE;
+		prebuilt->template_type = ROW_MYSQL_DUMMY_TEMPLATE;
+		prebuilt->n_template = 0;
+		prebuilt->need_to_access_clustered = FALSE;
+
+		dtuple_set_n_fields(prebuilt->search_tuple, 0);
+
+		prebuilt->select_lock_type = LOCK_NONE;
+
+		if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_NOT_KEYFILE,
+					    "InnoDB: The B-tree of"
+					    " index '%-.200s' is corrupted.",
+					    index->name);
+			is_ok = FALSE;
+		}
+
+		/* Stop scanning further indexes once the statement has
+		been killed; the clean-up below is still performed. */
+		if (thd_killed(user_thd)) {
+			break;
+		}
+
+#if 0
+		fprintf(stderr, "%lu entries in index %s\n", n_rows,
+			index->name);
+#endif
+
+		/* The record count of the first index is the reference
+		that every other index is compared against. */
+		if (index == dict_table_get_first_index(prebuilt->table)) {
+			n_rows_in_table = n_rows;
+		} else if (n_rows != n_rows_in_table) {
+			push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+					    ER_NOT_KEYFILE,
+					    "InnoDB: Index '%-.200s'"
+					    " contains %lu entries,"
+					    " should be %lu.",
+					    index->name,
+					    (ulong) n_rows,
+					    (ulong) n_rows_in_table);
+			is_ok = FALSE;
+		}
+	}
+
+	/* Restore the original isolation level */
+	prebuilt->trx->isolation_level = old_isolation_level;
+
+	/* We validate also the whole adaptive hash index for all tables
+	at every CHECK TABLE */
+
+	if (!btr_search_validate()) {
+		push_warning(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+			     ER_NOT_KEYFILE,
+			     "InnoDB: The adaptive hash index is corrupted.");
+		is_ok = FALSE;
+	}
+
+	/* Restore the fatal lock wait timeout after CHECK TABLE. */
+	mutex_enter(&kernel_mutex);
+	srv_fatal_semaphore_wait_threshold -= 7200; /* 2 hours */
+	mutex_exit(&kernel_mutex);
+
+	prebuilt->trx->op_info = "";
+	if (thd_killed(user_thd)) {
+		my_error(ER_QUERY_INTERRUPTED, MYF(0));
+	}
+
+	if (share->ib_table->is_corrupt) {
+		/* Leave through DBUG_RETURN, not a plain return, so that
+		the DBUG_ENTER at the top of the function stays balanced
+		in debug builds. */
+		DBUG_RETURN(HA_ADMIN_CORRUPT);
+	}
+
+	DBUG_RETURN(is_ok ? HA_ADMIN_OK : HA_ADMIN_CORRUPT);
+}
+
+/*************************************************************//**
+Adds information about free space in the InnoDB tablespace to a table comment
+which is printed out when a user calls SHOW TABLE STATUS. Adds also info on
+foreign keys. The generated text is first written to the shared temporary
+file srv_dict_tmpfile (protected by srv_dict_tmpfile_mutex) and then read
+back into a buffer obtained from my_malloc(). The result is capped so that
+user comment + "; " + generated text + terminating NUL fit in 64000 bytes.
+@return	table comment + InnoDB free space + info on foreign keys in a
+newly allocated buffer; the original comment pointer is returned instead
+if the comment is too long or the allocation fails */
+UNIV_INTERN
+char*
+ha_innobase::update_table_comment(
+/*==============================*/
+	const char*	comment)/*!< in: table comment defined by user */
+{
+	uint	length = (uint) strlen(comment);
+	char*	str;
+	long	flen;
+
+	/* A comment that already fills the 64000 byte budget is handed
+	back untouched; nothing has been modified at this point. */
+
+	if (length > 64000 - 3) {
+		return((char*)comment); /* string too long */
+	}
+
+	/* We do not know if MySQL can call this function before calling
+	external_lock(). To be safe, update the thd of the current table
+	handle. */
+
+	update_thd(ha_thd());
+
+	prebuilt->trx->op_info = (char*)"returning table comment";
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads */
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+	str = NULL;
+
+	/* output the data to a temporary file */
+
+	mutex_enter(&srv_dict_tmpfile_mutex);
+	rewind(srv_dict_tmpfile);
+
+	fprintf(srv_dict_tmpfile, "InnoDB free: %llu kB",
+		fsp_get_available_space_in_free_extents(
+			prebuilt->table->space));
+
+	dict_print_info_on_foreign_keys(FALSE, srv_dict_tmpfile,
+				prebuilt->trx, prebuilt->table);
+
+	/* The file position after writing is the number of bytes produced;
+	a failing ftell() is treated as "nothing written", and the amount
+	is clamped to what still fits next to the user comment. */
+	flen = ftell(srv_dict_tmpfile);
+	if (flen < 0) {
+		flen = 0;
+	} else if (length + flen + 3 > 64000) {
+		flen = 64000 - 3 - length;
+	}
+
+	/* allocate buffer for the full string, and
+	read the contents of the temporary file */
+
+	str = (char*) my_malloc(length + flen + 3, MYF(0));
+
+	if (str) {
+		char* pos	= str + length;
+		if (length) {
+			/* Keep the user comment first, separated from the
+			generated text by "; ". */
+			memcpy(str, comment, length);
+			*pos++ = ';';
+			*pos++ = ' ';
+		}
+		rewind(srv_dict_tmpfile);
+		/* Terminate after the bytes actually read, which may be
+		fewer than requested. */
+		flen = (uint) fread(pos, 1, flen, srv_dict_tmpfile);
+		pos[flen] = 0;
+	}
+
+	mutex_exit(&srv_dict_tmpfile_mutex);
+
+	prebuilt->trx->op_info = (char*)"";
+
+	/* Fall back to the caller's comment if the allocation failed. */
+	return(str ? str : (char*) comment);
+}
+
+/*******************************************************************//**
+Gets the foreign key create info for a table stored in InnoDB. The text is
+produced by dict_print_info_on_foreign_keys() into the shared temporary
+file srv_dict_tmpfile and then copied into a freshly allocated buffer; at
+most 64000 - 1 bytes of it are kept.
+@return own: character string in the form which can be inserted to the
+CREATE TABLE statement, MUST be freed with
+ha_innobase::free_foreign_key_create_info; NULL if the buffer could not
+be allocated */
+UNIV_INTERN
+char*
+ha_innobase::get_foreign_key_create_info(void)
+/*==========================================*/
+{
+	char*	fk_info	= NULL;
+	long	n_bytes;
+
+	ut_a(prebuilt != NULL);
+
+	/* It is not known whether MySQL calls external_lock() before
+	this function, so refresh the thd of this table handle first. */
+
+	update_thd(ha_thd());
+
+	prebuilt->trx->op_info = (char*)"getting info on foreign keys";
+
+	/* This may be called in the middle of a SELECT query: give up a
+	possibly held adaptive hash latch so that threads cannot
+	deadlock. */
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+
+	mutex_enter(&srv_dict_tmpfile_mutex);
+
+	/* Write the description to the start of the temporary file. */
+	rewind(srv_dict_tmpfile);
+	dict_print_info_on_foreign_keys(TRUE, srv_dict_tmpfile,
+					prebuilt->trx, prebuilt->table);
+
+	prebuilt->trx->op_info = (char*)"";
+
+	/* The file position tells how much was written; clamp it to the
+	range [0, 64000 - 1]. */
+	n_bytes = ftell(srv_dict_tmpfile);
+
+	if (n_bytes < 0) {
+		n_bytes = 0;
+	} else if (n_bytes > 64000 - 1) {
+		n_bytes = 64000 - 1;
+	}
+
+	/* Reserve room for the text and its terminating NUL, then read
+	the text back from the temporary file. */
+
+	fk_info = (char*) my_malloc(n_bytes + 1, MYF(0));
+
+	if (fk_info != NULL) {
+		rewind(srv_dict_tmpfile);
+		n_bytes = (uint) fread(fk_info, 1, n_bytes, srv_dict_tmpfile);
+		fk_info[n_bytes] = 0;
+	}
+
+	mutex_exit(&srv_dict_tmpfile_mutex);
+
+	return(fk_info);
+}
+
+
+/*******************************************************************//**
+Collects the foreign key constraints found in prebuilt->table->foreign_list
+and appends one FOREIGN_KEY_INFO per constraint to f_key_list. For each
+constraint the id, the referenced database and table, the column name
+pairs, the ON DELETE / ON UPDATE methods and the referenced key name are
+filled in. The list is walked while holding dict_sys->mutex.
+@return	always 0 */
+UNIV_INTERN
+int
+ha_innobase::get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list)
+{
+  dict_foreign_t* foreign;
+
+  DBUG_ENTER("get_foreign_key_list");
+  ut_a(prebuilt != NULL);
+  update_thd(ha_thd());
+  prebuilt->trx->op_info = (char*)"getting list of foreign keys";
+  trx_search_latch_release_if_reserved(prebuilt->trx);
+  mutex_enter(&(dict_sys->mutex));
+  foreign = UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
+
+  while (foreign != NULL) {
+	  uint i;
+	  FOREIGN_KEY_INFO f_key_info;
+	  LEX_STRING *name= 0;
+          uint ulen;
+          char uname[NAME_LEN+1];           /* Unencoded name */
+          char db_name[NAME_LEN+1];
+	  const char *tmp_buff;
+
+	  /* The constraint id reported to MySQL is the part of
+	  foreign->id that follows the first '/'.
+	  NOTE(review): the scan has no NUL check; presumably ids always
+	  contain a '/' -- confirm. */
+	  tmp_buff= foreign->id;
+	  i= 0;
+	  while (tmp_buff[i] != '/')
+		  i++;
+	  tmp_buff+= i + 1;
+	  f_key_info.forein_id = thd_make_lex_string(thd, 0,
+		  tmp_buff, (uint) strlen(tmp_buff), 1);
+	  tmp_buff= foreign->referenced_table_name;
+
+          /* Database name: the part of referenced_table_name before the
+          first '/', converted with filename_to_tablename().
+          NOTE(review): the copy into db_name is not bounds-checked;
+          presumably the prefix never exceeds NAME_LEN -- confirm. */
+	  i= 0;
+	  while (tmp_buff[i] != '/')
+          {
+            db_name[i]= tmp_buff[i];
+            i++;
+          }
+          db_name[i]= 0;
+          ulen= filename_to_tablename(db_name, uname, sizeof(uname));
+	  f_key_info.referenced_db = thd_make_lex_string(thd, 0,
+		  uname, ulen, 1);
+
+          /* Table name: the part after the '/', converted likewise */
+	  tmp_buff+= i + 1;
+          ulen= filename_to_tablename(tmp_buff, uname, sizeof(uname));
+	  f_key_info.referenced_table = thd_make_lex_string(thd, 0,
+		  uname, ulen, 1);
+
+	  /* Column name pairs. The loop body runs before n_fields is
+	  tested, so the first pair is always read. */
+	  for (i= 0;;) {
+		  tmp_buff= foreign->foreign_col_names[i];
+		  name = thd_make_lex_string(thd, name,
+			  tmp_buff, (uint) strlen(tmp_buff), 1);
+		  f_key_info.foreign_fields.push_back(name);
+		  tmp_buff= foreign->referenced_col_names[i];
+		  name = thd_make_lex_string(thd, name,
+			tmp_buff, (uint) strlen(tmp_buff), 1);
+		  f_key_info.referenced_fields.push_back(name);
+		  if (++i >= foreign->n_fields)
+			  break;
+	  }
+
+          /* ON DELETE method; "RESTRICT" is used when none of the
+          ON_DELETE flags is set in foreign->type */
+          ulong length;
+          if (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)
+          {
+            length=7;
+            tmp_buff= "CASCADE";
+          }
+          else if (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL)
+          {
+            length=8;
+            tmp_buff= "SET NULL";
+          }
+          else if (foreign->type & DICT_FOREIGN_ON_DELETE_NO_ACTION)
+          {
+            length=9;
+            tmp_buff= "NO ACTION";
+          }
+          else
+          {
+            length=8;
+            tmp_buff= "RESTRICT";
+          }
+	  /* NOTE(review): f_key_info.delete_method is passed here before
+	  it has been assigned; presumably it is ignored because the last
+	  argument asks for a new LEX_STRING -- confirm. */
+	  f_key_info.delete_method = thd_make_lex_string(
+		  thd, f_key_info.delete_method, tmp_buff, length, 1);
+
+
+          /* ON UPDATE method; same scheme as for ON DELETE above */
+          if (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)
+          {
+            length=7;
+            tmp_buff= "CASCADE";
+          }
+          else if (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL)
+          {
+            length=8;
+            tmp_buff= "SET NULL";
+          }
+          else if (foreign->type & DICT_FOREIGN_ON_UPDATE_NO_ACTION)
+          {
+            length=9;
+            tmp_buff= "NO ACTION";
+          }
+          else
+          {
+            length=8;
+            tmp_buff= "RESTRICT";
+          }
+	  f_key_info.update_method = thd_make_lex_string(
+		  thd, f_key_info.update_method, tmp_buff, length, 1);
+
+          /* Name of the referenced index, or 0 if it is not available */
+          if (foreign->referenced_index &&
+              foreign->referenced_index->name)
+          {
+	    f_key_info.referenced_key_name = thd_make_lex_string(
+		    thd, f_key_info.referenced_key_name,
+		    foreign->referenced_index->name,
+		    (uint) strlen(foreign->referenced_index->name), 1);
+          }
+          else
+            f_key_info.referenced_key_name= 0;
+
+	  /* f_key_info lives on the stack: hand a copy made with
+	  thd_memdup() to the result list. */
+	  FOREIGN_KEY_INFO *pf_key_info = (FOREIGN_KEY_INFO *)
+		  thd_memdup(thd, &f_key_info, sizeof(FOREIGN_KEY_INFO));
+	  f_key_list->push_back(pf_key_info);
+	  foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
+  }
+  mutex_exit(&(dict_sys->mutex));
+  prebuilt->trx->op_info = (char*)"";
+
+  DBUG_RETURN(0);
+}
+
+/*****************************************************************//**
+Checks if ALTER TABLE may change the storage engine of the table.
+Changing storage engines is not allowed for tables for which there
+are foreign key constraints (parent or child tables). Both constraint
+lists of the table are inspected while the data dictionary is locked.
+@return	TRUE if can switch engines */
+UNIV_INTERN
+bool
+ha_innobase::can_switch_engines(void)
+/*=================================*/
+{
+	bool	no_fk_constraints;
+
+	DBUG_ENTER("ha_innobase::can_switch_engines");
+
+	ut_a(prebuilt->trx == thd_to_trx(ha_thd()));
+
+	prebuilt->trx->op_info =
+			"determining if there are foreign key constraints";
+
+	row_mysql_lock_data_dictionary(prebuilt->trx);
+
+	/* Switching is possible only if the table is neither referenced
+	by a constraint nor owns one itself. */
+	no_fk_constraints =
+		!(UT_LIST_GET_FIRST(prebuilt->table->referenced_list)
+		  || UT_LIST_GET_FIRST(prebuilt->table->foreign_list));
+
+	row_mysql_unlock_data_dictionary(prebuilt->trx);
+
+	prebuilt->trx->op_info = "";
+
+	DBUG_RETURN(no_fk_constraints);
+}
+
+/*******************************************************************//**
+Checks if a table is referenced by a foreign key. The MySQL manual states that
+a REPLACE is either equivalent to an INSERT, or DELETE(s) + INSERT. Only a
+delete is then allowed internally to resolve a duplicate key conflict in
+REPLACE, not an update. The answer comes from
+dict_table_is_referenced_by_foreign_key().
+@return	1 if referenced by a FOREIGN KEY, 0 otherwise */
+UNIV_INTERN
+uint
+ha_innobase::referenced_by_foreign_key(void)
+/*========================================*/
+{
+	return(dict_table_is_referenced_by_foreign_key(prebuilt->table)
+	       ? 1 : 0);
+}
+
+/*******************************************************************//**
+Releases a string returned by ha_innobase::get_foreign_key_create_info.
+A NULL pointer is accepted and ignored. */
+UNIV_INTERN
+void
+ha_innobase::free_foreign_key_create_info(
+/*======================================*/
+	char*	str)	/*!< in, own: create info string to free */
+{
+	if (str == NULL) {
+		/* Nothing was allocated, so nothing to release. */
+		return;
+	}
+
+	my_free(str, MYF(0));
+}
+
+/*******************************************************************//**
+Tells something additional to the handler about how to do things.
+Operations that are not listed in the switch below are ignored.
+@return	0 (the code below never returns an error number) */
+UNIV_INTERN
+int
+ha_innobase::extra(
+/*===============*/
+	enum ha_extra_function operation)
+			   /*!< in: HA_EXTRA_FLUSH or some other flag */
+{
+	/* Warning: since it is not sure that MySQL calls external_lock
+	before calling this function, the trx field in prebuilt can be
+	obsolete! */
+
+	switch (operation) {
+		case HA_EXTRA_FLUSH:
+			if (prebuilt->blob_heap) {
+				row_mysql_prebuilt_free_blob_heap(prebuilt);
+			}
+			break;
+		case HA_EXTRA_RESET_STATE:
+			reset_template(prebuilt);
+			/* Reset index condition pushdown state. The
+			pushed condition is cleared with NULL, the same
+			way ::reset() clears it, instead of the boolean
+			constant FALSE. */
+			pushed_idx_cond= NULL;
+			pushed_idx_cond_keyno= MAX_KEY;
+			prebuilt->idx_cond_func= NULL;
+			break;
+		case HA_EXTRA_NO_KEYREAD:
+			prebuilt->read_just_key = 0;
+			break;
+		case HA_EXTRA_KEYREAD:
+			prebuilt->read_just_key = 1;
+			break;
+		case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+			prebuilt->keep_other_fields_on_keyread = 1;
+			break;
+
+			/* IMPORTANT: prebuilt->trx can be obsolete in
+			this method, because it is not sure that MySQL
+			calls external_lock before this method with the
+			parameters below.  We must not invoke update_thd()
+			either, because the calling threads may change.
+			CAREFUL HERE, OR MEMORY CORRUPTION MAY OCCUR!
+			For that reason the cases below look the
+			transaction up through thd_to_trx(ha_thd())
+			instead of using prebuilt->trx. */
+		case HA_EXTRA_IGNORE_DUP_KEY:
+			thd_to_trx(ha_thd())->duplicates |= TRX_DUP_IGNORE;
+			break;
+		case HA_EXTRA_WRITE_CAN_REPLACE:
+			thd_to_trx(ha_thd())->duplicates |= TRX_DUP_REPLACE;
+			break;
+		case HA_EXTRA_WRITE_CANNOT_REPLACE:
+			thd_to_trx(ha_thd())->duplicates &= ~TRX_DUP_REPLACE;
+			break;
+		case HA_EXTRA_NO_IGNORE_DUP_KEY:
+			thd_to_trx(ha_thd())->duplicates &=
+				~(TRX_DUP_IGNORE | TRX_DUP_REPLACE);
+			break;
+		default:/* Do nothing */
+			;
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+Brings the handle back to a clean per-statement state: frees the BLOB
+heap if there is one, resets the row template, clears the index condition
+pushdown state, closes the DS-MRR helper and zeroes the statement level
+AUTOINC counter.
+@return	always 0 */
+UNIV_INTERN
+int
+ha_innobase::reset()
+{
+	if (prebuilt->blob_heap != NULL) {
+		row_mysql_prebuilt_free_blob_heap(prebuilt);
+	}
+
+	reset_template(prebuilt);
+
+	/* Forget any pushed index condition. */
+	pushed_idx_cond= NULL;
+	pushed_idx_cond_keyno= MAX_KEY;
+	ds_mrr.dsmrr_close();
+	prebuilt->idx_cond_func= NULL;
+
+	/* The AUTOINC value below is a statement level counter.
+	TODO: it should really be cleared in reset_template(), but for
+	now it is safer to do it explicitly here. */
+	prebuilt->autoinc_last_value = 0;
+
+	return(0);
+}
+
+/******************************************************************//**
+MySQL calls this function at the start of each SQL statement inside LOCK
+TABLES. Inside LOCK TABLES the ::external_lock method does not work to
+mark SQL statement borders. Note also a special case: if a temporary table
+is created inside LOCK TABLES, MySQL has not called external_lock() at all
+on that table.
+MySQL-5.0 also calls this before each statement in an execution of a stored
+procedure. To make the execution more deterministic for binlogging, MySQL-5.0
+locks all tables involved in a stored procedure with full explicit table
+locks (thd_in_lock_tables(thd) holds in store_lock()) before executing the
+procedure.
+@return	0 (the code below never returns an error code) */
+UNIV_INTERN
+int
+ha_innobase::start_stmt(
+/*====================*/
+	THD*		thd,	/*!< in: handle to the user thread */
+	thr_lock_type	lock_type)
+				/*!< in: lock type; only compared
+				against TL_READ below */
+{
+	trx_t*		trx;
+
+	update_thd(thd);
+
+	trx = prebuilt->trx;
+
+	/* The search latch and the InnoDB thread FIFO ticket should have
+	been given up at the end of the previous statement. Inside LOCK
+	TABLES the lock count method cannot mark the end of a SELECT,
+	so that may not have happened; release them now. The search latch
+	MUST be released before an INSERT, for example. */
+
+	innobase_release_stat_resources(trx);
+
+	/* A new statement starts: clear the per-statement state. The
+	AUTOINC counter is the statement level one used for multi-row
+	INSERTs. */
+	trx->n_autoinc_rows = 0;
+
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->hint_need_to_fetch_extra_cols = 0;
+	reset_template(prebuilt);
+
+	if (!prebuilt->mysql_has_locked) {
+		/* The handle belongs to a temporary table created inside
+		this same LOCK TABLES. MySQL does NOT call external_lock
+		for it, so x-row locks are used inside InnoDB to be
+		prepared for an update of a row */
+
+		prebuilt->select_lock_type = LOCK_X;
+
+	} else if (trx->isolation_level != TRX_ISO_SERIALIZABLE
+		   && thd_sql_command(thd) == SQLCOM_SELECT
+		   && lock_type == TL_READ) {
+
+		/* Plain SELECT on a non-temporary table: a consistent
+		read, for which no lock is obtained. */
+
+		prebuilt->select_lock_type = LOCK_NONE;
+
+	} else {
+		/* Not a consistent read: go back to the stored
+		select_lock_type. stored_select_lock_type was decided in
+		1) ::store_lock(),
+		2) ::external_lock(),
+		3) ::init_table_handle_for_HANDLER(), and
+		4) ::transactional_table_lock(). */
+
+		prebuilt->select_lock_type =
+			prebuilt->stored_select_lock_type;
+	}
+
+	trx->detailed_error[0] = '\0';
+
+	/* Tell MySQL that there is an active transaction; only the
+	statement needs registering if the transaction already is. */
+	if (trx->active_trans != 0) {
+
+		innobase_register_stmt(ht, thd);
+	} else {
+
+		innobase_register_trx_and_stmt(ht, thd);
+		trx->active_trans = 1;
+	}
+
+	return(0);
+}
+
+/******************************************************************//**
+Translates a MySQL transaction isolation level code into the matching
+InnoDB TRX_ISO_ constant. A code that is none of the four known levels
+trips the ut_a(0) assertion.
+@return	InnoDB isolation level */
+static inline
+ulint
+innobase_map_isolation_level(
+/*=========================*/
+	enum_tx_isolation	iso)	/*!< in: MySQL isolation level code */
+{
+	switch (iso) {
+	case ISO_READ_UNCOMMITTED:
+		return(TRX_ISO_READ_UNCOMMITTED);
+	case ISO_READ_COMMITTED:
+		return(TRX_ISO_READ_COMMITTED);
+	case ISO_REPEATABLE_READ:
+		return(TRX_ISO_REPEATABLE_READ);
+	case ISO_SERIALIZABLE:
+		return(TRX_ISO_SERIALIZABLE);
+	default:
+		/* Unknown isolation level code */
+		ut_a(0);
+		return(0);
+	}
+}
+
+/******************************************************************//**
+As MySQL will execute an external lock for every new table it uses when it
+starts to process an SQL statement (an exception is when MySQL calls
+start_stmt for the handle) we can use this function to store the pointer to
+the THD in the handle. We will also use this function to communicate
+to InnoDB that a new SQL statement has started and that we must store a
+savepoint to our transaction handle, so that we are able to roll back
+the SQL statement in case of an error.
+A lock_type other than F_UNLCK takes the "setting a lock" path, F_UNLCK
+takes the "releasing a lock" path; the latter treats the statement as
+ended once trx->n_mysql_tables_in_use has dropped to zero.
+@return	0, HA_ERR_LOGGING_IMPOSSIBLE if statement based binlogging is
+attempted at an unsuitable isolation level, or the converted error code of
+a failed row_lock_table_for_mysql() */
+UNIV_INTERN
+int
+ha_innobase::external_lock(
+/*=======================*/
+	THD*	thd,		/*!< in: handle to the user thread */
+	int	lock_type)	/*!< in: lock type */
+{
+	trx_t*		trx;
+
+	DBUG_ENTER("ha_innobase::external_lock");
+	DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+	update_thd(thd);
+
+	/* Statement based binlogging does not work in isolation level
+	READ UNCOMMITTED and READ COMMITTED since the necessary
+	locks cannot be taken. In this case, we print an
+	informative error message and return with an error. */
+	if (lock_type == F_WRLCK)
+	{
+		ulong const binlog_format= thd_binlog_format(thd);
+		ulong const tx_isolation = thd_tx_isolation(ha_thd());
+		if (tx_isolation <= ISO_READ_COMMITTED
+                   && binlog_format == BINLOG_FORMAT_STMT
+#if MYSQL_VERSION_ID > 50140
+                   && thd_binlog_filter_ok(thd)
+#endif /* MYSQL_VERSION_ID > 50140 */
+		   )
+		{
+			char buf[256];
+			my_snprintf(buf, sizeof(buf),
+				    "Transaction level '%s' in"
+				    " InnoDB is not safe for binlog mode '%s'",
+				    tx_isolation_names[tx_isolation],
+				    binlog_format_names[binlog_format]);
+			my_error(ER_BINLOG_LOGGING_IMPOSSIBLE, MYF(0), buf);
+			DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE);
+		}
+	}
+
+
+	trx = prebuilt->trx;
+
+	/* Mark the start of a new SQL statement on this handle */
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+	reset_template(prebuilt);
+
+	if (lock_type == F_WRLCK) {
+
+		/* If this is a SELECT, then it is in UPDATE TABLE ...
+		or SELECT ... FOR UPDATE */
+		prebuilt->select_lock_type = LOCK_X;
+		prebuilt->stored_select_lock_type = LOCK_X;
+	}
+
+	if (lock_type != F_UNLCK) {
+		/* MySQL is setting a new table lock */
+
+		trx->detailed_error[0] = '\0';
+
+		/* Set the MySQL flag to mark that there is an active
+		transaction */
+		if (trx->active_trans == 0) {
+
+			innobase_register_trx_and_stmt(ht, thd);
+			trx->active_trans = 1;
+		} else if (trx->n_mysql_tables_in_use == 0) {
+			/* First table of a statement in an already
+			registered transaction: register the statement */
+			innobase_register_stmt(ht, thd);
+		}
+
+		if (trx->isolation_level == TRX_ISO_SERIALIZABLE
+			&& prebuilt->select_lock_type == LOCK_NONE
+			&& thd_test_options(thd,
+				OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+			/* To get serializable execution, we let InnoDB
+			conceptually add 'LOCK IN SHARE MODE' to all SELECTs
+			which otherwise would have been consistent reads. An
+			exception is consistent reads in the AUTOCOMMIT=1 mode:
+			we know that they are read-only transactions, and they
+			can be serialized also if performed as consistent
+			reads. */
+
+			prebuilt->select_lock_type = LOCK_S;
+			prebuilt->stored_select_lock_type = LOCK_S;
+		}
+
+		/* Starting from 4.1.9, no InnoDB table lock is taken in LOCK
+		TABLES if AUTOCOMMIT=1. It does not make much sense to acquire
+		an InnoDB table lock if it is released immediately at the end
+		of LOCK TABLES, and InnoDB's table locks in that case cause
+		VERY easily deadlocks.
+
+		We do not set InnoDB table locks if user has not explicitly
+		requested a table lock. Note that thd_in_lock_tables(thd)
+		can hold in some cases, e.g., at the start of a stored
+		procedure call (SQLCOM_CALL). */
+
+		if (prebuilt->select_lock_type != LOCK_NONE) {
+
+			if (thd_sql_command(thd) == SQLCOM_LOCK_TABLES
+			    && THDVAR(thd, table_locks)
+			    && thd_test_options(thd, OPTION_NOT_AUTOCOMMIT)
+			    && thd_in_lock_tables(thd)) {
+
+				ulint	error = row_lock_table_for_mysql(
+					prebuilt, NULL, 0);
+
+				/* On failure return before any of the
+				counters below has been touched */
+				if (error != DB_SUCCESS) {
+					error = convert_error_code_to_mysql(
+						(int) error, 0, thd);
+					DBUG_RETURN((int) error);
+				}
+			}
+
+			trx->mysql_n_tables_locked++;
+		}
+
+		trx->n_mysql_tables_in_use++;
+		prebuilt->mysql_has_locked = TRUE;
+
+		DBUG_RETURN(0);
+	}
+
+	/* MySQL is releasing a table lock */
+
+	trx->n_mysql_tables_in_use--;
+	prebuilt->mysql_has_locked = FALSE;
+
+	/* Release a possible FIFO ticket and search latch. Since we
+	may reserve the kernel mutex, we have to release the search
+	system latch first to obey the latching order. */
+
+	innobase_release_stat_resources(trx);
+
+	/* If the MySQL lock count drops to zero we know that the current SQL
+	statement has ended */
+
+	if (trx->n_mysql_tables_in_use == 0) {
+#ifdef EXTENDED_SLOWLOG
+		/* Hand the per-transaction I/O and wait counters over to
+		the thd, then clear them */
+		increment_thd_innodb_stats(thd,
+					(unsigned long long) ut_conv_dulint_to_longlong(trx->id),
+					trx->io_reads,
+					trx->io_read,
+					trx->io_reads_wait_timer,
+					trx->lock_que_wait_timer,
+					trx->innodb_que_wait_timer,
+					trx->distinct_page_access);
+
+		trx->io_reads = 0;
+		trx->io_read = 0;
+		trx->io_reads_wait_timer = 0;
+		trx->lock_que_wait_timer = 0;
+		trx->innodb_que_wait_timer = 0;
+		trx->distinct_page_access = 0;
+		if (trx->distinct_page_access_hash)
+			memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
+#endif
+
+		trx->mysql_n_tables_locked = 0;
+		prebuilt->used_in_HANDLER = FALSE;
+
+		if (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+			/* Neither OPTION_NOT_AUTOCOMMIT nor OPTION_BEGIN
+			is set: commit the active transaction, if any */
+			if (trx->active_trans != 0) {
+				innobase_commit(ht, thd, TRUE);
+			}
+		} else {
+			if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+						&& trx->global_read_view) {
+
+				/* At low transaction isolation levels we let
+				each consistent read set its own snapshot */
+
+				read_view_close_for_mysql(trx);
+			}
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
+/******************************************************************//**
+With this function MySQL request a transactional lock to a table when
+user issued query LOCK TABLES..WHERE ENGINE = InnoDB.
+F_WRLCK selects LOCK_X and F_RDLCK selects LOCK_S as the select lock type;
+any other lock type is reported as an error.
+@return	0 on success, HA_ERR_CRASHED if the table is flagged corrupt, its
+.ibd file is missing or the lock type is unknown, or the converted error
+code of a failed row_lock_table_for_mysql() */
+UNIV_INTERN
+int
+ha_innobase::transactional_table_lock(
+/*==================================*/
+	THD*	thd,		/*!< in: handle to the user thread */
+	int	lock_type)	/*!< in: lock type */
+{
+	trx_t*		trx;
+
+	DBUG_ENTER("ha_innobase::transactional_table_lock");
+	DBUG_PRINT("enter",("lock_type: %d", lock_type));
+
+	/* We do not know if MySQL can call this function before calling
+	external_lock(). To be safe, update the thd of the current table
+	handle. */
+
+	update_thd(thd);
+
+	if (share->ib_table->is_corrupt) {
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	if (prebuilt->table->ibd_file_missing && !thd_tablespace_op(thd)) {
+		ut_print_timestamp(stderr);
+		/* Each line of the message ends in a newline, so every
+		"InnoDB: " prefix starts a line of its own. */
+		fprintf(stderr,
+			"  InnoDB: MySQL is trying to use a table handle"
+			" but the .ibd file for\n"
+			"InnoDB: table %s does not exist.\n"
+			"InnoDB: Have you deleted the .ibd file"
+			" from the database directory under\n"
+			"InnoDB: the MySQL datadir?\n"
+			"InnoDB: See " REFMAN
+			"innodb-troubleshooting.html\n"
+			"InnoDB: how you can resolve the problem.\n",
+			prebuilt->table->name);
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	trx = prebuilt->trx;
+
+	/* Mark the start of a new SQL statement on this handle */
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->hint_need_to_fetch_extra_cols = 0;
+
+	reset_template(prebuilt);
+
+	if (lock_type == F_WRLCK) {
+		prebuilt->select_lock_type = LOCK_X;
+		prebuilt->stored_select_lock_type = LOCK_X;
+	} else if (lock_type == F_RDLCK) {
+		prebuilt->select_lock_type = LOCK_S;
+		prebuilt->stored_select_lock_type = LOCK_S;
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB error:\n"
+"MySQL is trying to set transactional table lock with corrupted lock type\n"
+"to table %s, lock type %d does not exist.\n",
+				prebuilt->table->name, lock_type);
+		DBUG_RETURN(HA_ERR_CRASHED);
+	}
+
+	/* MySQL is setting a new transactional table lock */
+
+	/* Set the MySQL flag to mark that there is an active transaction */
+	if (trx->active_trans == 0) {
+
+		innobase_register_trx_and_stmt(ht, thd);
+		trx->active_trans = 1;
+	}
+
+	if (THDVAR(thd, table_locks) && thd_in_lock_tables(thd)) {
+		ulint	error = row_lock_table_for_mysql(prebuilt, NULL, 0);
+
+		if (error != DB_SUCCESS) {
+			error = convert_error_code_to_mysql(
+				(int) error, prebuilt->table->flags, thd);
+			DBUG_RETURN((int) error);
+		}
+
+		if (thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
+
+			/* Store the current undo_no of the transaction
+			so that we know where to roll back if we have
+			to roll back the next SQL statement */
+
+			trx_mark_sql_stat_end(trx);
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
+/************************************************************************//**
+Refreshes the InnoDB status variables that are exported to MySQL. Nothing
+is done while innodb_inited is not set. */
+static
+void
+innodb_export_status(void)
+/*======================*/
+{
+	if (!innodb_inited) {
+
+		return;
+	}
+
+	srv_export_innodb_status();
+}
+
+/************************************************************************//**
+Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
+Monitor to the client. At most MAX_STATUS_SIZE - 1 bytes are sent; when the
+monitor output is longer, either the beginning of the list of active
+transactions or the end of the output is omitted.
+@return	TRUE on failure (out of memory, or stat_print() failed),
+FALSE on success */
+static
+bool
+innodb_show_status(
+/*===============*/
+	handlerton*	hton,	/*!< in: the innodb handlerton */
+	THD*	thd,	/*!< in: the MySQL query thread of the caller */
+	stat_print_fn *stat_print)	/*!< in: function used to send the
+					status text to the client */
+{
+	trx_t*			trx;
+	static const char	truncated_msg[] = "... truncated...\n";
+	const long		MAX_STATUS_SIZE = 64000;
+	ulint			trx_list_start = ULINT_UNDEFINED;
+	ulint			trx_list_end = ULINT_UNDEFINED;
+
+	DBUG_ENTER("innodb_show_status");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx = check_trx_exists(thd);
+
+	innobase_release_stat_resources(trx);
+
+	/* We let the InnoDB Monitor to output at most MAX_STATUS_SIZE
+	bytes of text. */
+
+	long	flen, usable_len;
+	char*	str;
+
+	/* The monitor output is written to the shared temporary file
+	srv_monitor_file; the mutex is held until we have read it back. */
+	mutex_enter(&srv_monitor_file_mutex);
+	rewind(srv_monitor_file);
+	srv_printf_innodb_monitor(srv_monitor_file, FALSE,
+				  &trx_list_start, &trx_list_end);
+	flen = ftell(srv_monitor_file);
+	os_file_set_eof(srv_monitor_file);
+
+	if (flen < 0) {
+		flen = 0;
+	}
+
+	if (flen > MAX_STATUS_SIZE) {
+		usable_len = MAX_STATUS_SIZE;
+	} else {
+		usable_len = flen;
+	}
+
+	/* allocate buffer for the string, and
+	read the contents of the temporary file */
+
+	if (!(str = (char*) my_malloc(usable_len + 1, MYF(0)))) {
+	  mutex_exit(&srv_monitor_file_mutex);
+	  DBUG_RETURN(TRUE);
+	}
+
+	rewind(srv_monitor_file);
+	if (flen < MAX_STATUS_SIZE) {
+		/* Display the entire output. */
+		flen = (long) fread(str, 1, flen, srv_monitor_file);
+	} else if (trx_list_end < (ulint) flen
+			&& trx_list_start < trx_list_end
+			&& trx_list_start + (flen - trx_list_end)
+			< MAX_STATUS_SIZE - sizeof truncated_msg - 1) {
+		/* Omit the beginning of the list of active transactions. */
+		long len = (long) fread(str, 1, trx_list_start, srv_monitor_file);
+		memcpy(str + len, truncated_msg, sizeof truncated_msg - 1);
+		len += sizeof truncated_msg - 1;
+		usable_len = (MAX_STATUS_SIZE - 1) - len;
+		fseek(srv_monitor_file, flen - usable_len, SEEK_SET);
+		len += (long) fread(str + len, 1, usable_len, srv_monitor_file);
+		flen = len;
+	} else {
+		/* Omit the end of the output. */
+		flen = (long) fread(str, 1, MAX_STATUS_SIZE - 1, srv_monitor_file);
+	}
+
+	mutex_exit(&srv_monitor_file_mutex);
+
+	bool result = FALSE;
+
+	if (stat_print(thd, innobase_hton_name, (uint) strlen(innobase_hton_name),
+			STRING_WITH_LEN(""), str, flen)) {
+		result= TRUE;
+	}
+	my_free(str, MYF(0));
+
+	/* Propagate a stat_print() failure to the caller instead of
+	unconditionally reporting success. */
+	DBUG_RETURN(result);
+}
+
+/************************************************************************//**
+Implements the SHOW MUTEX STATUS command. Walks mutex_list and then
+rw_lock_list, each under its own list mutex, and passes one row per entry
+to stat_print. Entries whose count_os_wait is 0 are skipped. Entries that
+buf_pool_is_block_mutex() / buf_pool_is_block_lock() recognise are not
+printed individually; their os-wait counts are summed and printed as a
+single "combined" row per list.
+@return TRUE on failure, FALSE on success. */
+static
+bool
+innodb_mutex_show_status(
+/*=====================*/
+	handlerton*	hton,		/*!< in: the innodb handlerton */
+	THD*		thd,		/*!< in: the MySQL query thread of the
+					caller */
+	stat_print_fn*	stat_print)	/*!< in: function for printing
+					statistics */
+{
+	char buf1[IO_SIZE], buf2[IO_SIZE];
+	mutex_t*	mutex;
+	rw_lock_t*	lock;
+	/* Running totals for the "combined" rows; block_mutex and
+	block_lock remember the last matching entry seen and are used
+	only to obtain a name for the row. */
+	ulint		block_mutex_oswait_count = 0;
+	ulint		block_lock_oswait_count = 0;
+	mutex_t*	block_mutex = NULL;
+	rw_lock_t*	block_lock = NULL;
+#ifdef UNIV_DEBUG
+	/* Totals over all mutexes whose mutex_type is 1; printed as the
+	"rw_lock_mutexes" row at the end of the function. */
+	ulint	  rw_lock_count= 0;
+	ulint	  rw_lock_count_spin_loop= 0;
+	ulint	  rw_lock_count_spin_rounds= 0;
+	ulint	  rw_lock_count_os_wait= 0;
+	ulint	  rw_lock_count_os_yield= 0;
+	ulonglong rw_lock_wait_time= 0;
+#endif /* UNIV_DEBUG */
+	uint	  hton_name_len= (uint) strlen(innobase_hton_name), buf1len, buf2len;
+	DBUG_ENTER("innodb_mutex_show_status");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* Pass 1: mutexes. mutex_list_mutex is held for the whole walk
+	and must be released on every early return below. */
+	mutex_enter(&mutex_list_mutex);
+
+	for (mutex = UT_LIST_GET_FIRST(mutex_list); mutex != NULL;
+	     mutex = UT_LIST_GET_NEXT(list, mutex)) {
+		if (mutex->count_os_wait == 0) {
+			continue;
+		}
+
+		if (buf_pool_is_block_mutex(mutex)) {
+			block_mutex = mutex;
+			block_mutex_oswait_count += mutex->count_os_wait;
+			continue;
+		}
+#ifdef UNIV_DEBUG
+		/* Debug builds print the detailed counters, but only for
+		mutexes that have been used (count_using > 0); mutexes of
+		type 1 are accumulated instead of printed. */
+		if (mutex->mutex_type != 1) {
+			if (mutex->count_using > 0) {
+				buf1len= my_snprintf(buf1, sizeof(buf1),
+					"%s:%s",
+					mutex->cmutex_name, mutex->cfile_name);
+				buf2len= my_snprintf(buf2, sizeof(buf2),
+					"count=%lu, spin_waits=%lu,"
+					" spin_rounds=%lu, "
+					"os_waits=%lu, os_yields=%lu,"
+					" os_wait_times=%lu",
+					mutex->count_using,
+					mutex->count_spin_loop,
+					mutex->count_spin_rounds,
+					mutex->count_os_wait,
+					mutex->count_os_yield,
+					(ulong) (mutex->lspent_time/1000));
+
+				if (stat_print(thd, innobase_hton_name,
+						hton_name_len, buf1, buf1len,
+						buf2, buf2len)) {
+					mutex_exit(&mutex_list_mutex);
+					DBUG_RETURN(1);
+				}
+			}
+		} else {
+			rw_lock_count += mutex->count_using;
+			rw_lock_count_spin_loop += mutex->count_spin_loop;
+			rw_lock_count_spin_rounds += mutex->count_spin_rounds;
+			rw_lock_count_os_wait += mutex->count_os_wait;
+			rw_lock_count_os_yield += mutex->count_os_yield;
+			rw_lock_wait_time += mutex->lspent_time;
+		}
+#else /* UNIV_DEBUG */
+		/* Non-debug builds print only the name and the os-wait
+		count of each mutex. */
+		buf1len= (uint) my_snprintf(buf1, sizeof(buf1), "%s",
+				     mutex->cmutex_name);
+		buf2len= (uint) my_snprintf(buf2, sizeof(buf2), "os_waits=%lu",
+				     (ulong) mutex->count_os_wait);
+
+		if (stat_print(thd, innobase_hton_name,
+			       hton_name_len, buf1, buf1len,
+			       buf2, buf2len)) {
+			mutex_exit(&mutex_list_mutex);
+			DBUG_RETURN(1);
+		}
+#endif /* UNIV_DEBUG */
+	}
+
+	/* One summary row for all buffer pool block mutexes, if any
+	of them had os waits. */
+	if (block_mutex) {
+		buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+					     "combined %s",
+					     block_mutex->cmutex_name);
+		buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+					     "os_waits=%lu",
+					     (ulong) block_mutex_oswait_count);
+
+		if (stat_print(thd, innobase_hton_name,
+			       hton_name_len, buf1, buf1len,
+			       buf2, buf2len)) {
+			mutex_exit(&mutex_list_mutex);
+			DBUG_RETURN(1);
+		}
+	}
+
+	mutex_exit(&mutex_list_mutex);
+
+	/* Pass 2: rw-locks, with the same skip / combine rules.
+	rw_lock_list_mutex is held for the whole walk. */
+	mutex_enter(&rw_lock_list_mutex);
+
+	for (lock = UT_LIST_GET_FIRST(rw_lock_list); lock != NULL;
+	     lock = UT_LIST_GET_NEXT(list, lock)) {
+		if (lock->count_os_wait == 0) {
+			continue;
+		}
+
+		if (buf_pool_is_block_lock(lock)) {
+			block_lock = lock;
+			block_lock_oswait_count += lock->count_os_wait;
+			continue;
+		}
+
+		buf1len = my_snprintf(buf1, sizeof buf1, "%s",
+				     lock->lock_name);
+		buf2len = my_snprintf(buf2, sizeof buf2, "os_waits=%lu",
+				      (ulong) lock->count_os_wait);
+
+		if (stat_print(thd, innobase_hton_name,
+			       hton_name_len, buf1, buf1len,
+			       buf2, buf2len)) {
+			mutex_exit(&rw_lock_list_mutex);
+			DBUG_RETURN(1);
+		}
+	}
+
+	/* One summary row for all buffer pool block rw-locks. */
+	if (block_lock) {
+		buf1len = (uint) my_snprintf(buf1, sizeof buf1,
+					     "combined %s",
+					     block_lock->lock_name);
+		buf2len = (uint) my_snprintf(buf2, sizeof buf2,
+					     "os_waits=%lu",
+					     (ulong) block_lock_oswait_count);
+
+		if (stat_print(thd, innobase_hton_name,
+			       hton_name_len, buf1, buf1len,
+			       buf2, buf2len)) {
+			mutex_exit(&rw_lock_list_mutex);
+			DBUG_RETURN(1);
+		}
+	}
+
+	mutex_exit(&rw_lock_list_mutex);
+
+#ifdef UNIV_DEBUG
+	/* Debug builds finish with the totals accumulated in pass 1 for
+	mutexes of type 1. No list mutex is held at this point. */
+	buf2len = my_snprintf(buf2, sizeof buf2,
+			     "count=%lu, spin_waits=%lu, spin_rounds=%lu, "
+			     "os_waits=%lu, os_yields=%lu, os_wait_times=%lu",
+			      (ulong) rw_lock_count,
+			      (ulong) rw_lock_count_spin_loop,
+			      (ulong) rw_lock_count_spin_rounds,
+			      (ulong) rw_lock_count_os_wait,
+			      (ulong) rw_lock_count_os_yield,
+			      (ulong) (rw_lock_wait_time / 1000));
+
+	if (stat_print(thd, innobase_hton_name, hton_name_len,
+			STRING_WITH_LEN("rw_lock_mutexes"), buf2, buf2len)) {
+		DBUG_RETURN(1);
+	}
+#endif /* UNIV_DEBUG */
+
+	DBUG_RETURN(FALSE);
+}
+
+/************************************************************************//**
+Dispatches an engine status request to the routine that handles the
+requested statistics type.
+@return	the value returned by the selected routine; FALSE for any
+statistics type other than HA_ENGINE_STATUS and HA_ENGINE_MUTEX */
+static
+bool innobase_show_status(handlerton *hton, THD* thd,
+                          stat_print_fn* stat_print,
+                          enum ha_stat_type stat_type)
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	if (stat_type == HA_ENGINE_STATUS) {
+
+		return(innodb_show_status(hton, thd, stat_print));
+	}
+
+	if (stat_type == HA_ENGINE_MUTEX) {
+
+		return(innodb_mutex_show_status(hton, thd, stat_print));
+	}
+
+	return(FALSE);
+}
+
+/************************************************************************//**
+ Handling the shared INNOBASE_SHARE structure that is needed to provide table
+ locking.
+****************************************************************************/
+
+/** Looks up the INNOBASE_SHARE registered under table_name in
+innobase_open_tables, creating and registering a new one if none exists,
+and increments its use count. The whole operation runs under
+innobase_share_mutex.
+@return	the share for table_name */
+static INNOBASE_SHARE* get_share(const char* table_name)
+{
+	INNOBASE_SHARE*	entry;
+
+	pthread_mutex_lock(&innobase_share_mutex);
+
+	ulint	name_fold = ut_fold_string(table_name);
+
+	HASH_SEARCH(table_name_hash, innobase_open_tables, name_fold,
+		    INNOBASE_SHARE*, entry,
+		    ut_ad(entry->use_count > 0),
+		    !strcmp(entry->table_name, table_name));
+
+	if (entry == NULL) {
+		/* First user of this table: build a new share. */
+
+		uint	name_len = (uint) strlen(table_name);
+
+		/* TODO: invoke HASH_MIGRATE if innobase_open_tables
+		grows too big */
+
+		/* The struct and the copy of the name live in a single
+		zero-filled allocation; the name follows the struct. */
+		entry = (INNOBASE_SHARE *) my_malloc(
+			sizeof(*entry) + name_len + 1,
+			MYF(MY_FAE | MY_ZEROFILL));
+
+		entry->table_name = (char*) memcpy(entry + 1,
+						   table_name, name_len + 1);
+
+		HASH_INSERT(INNOBASE_SHARE, table_name_hash,
+			    innobase_open_tables, name_fold, entry);
+
+		thr_lock_init(&entry->lock);
+
+		/* Start with an empty index translation table */
+		entry->idx_trans_tbl.index_mapping = NULL;
+		entry->idx_trans_tbl.index_count = 0;
+		entry->idx_trans_tbl.array_size = 0;
+	}
+
+	entry->use_count++;
+
+	pthread_mutex_unlock(&innobase_share_mutex);
+
+	return(entry);
+}
+
+/** Drops one reference to a share obtained from get_share(). When the use
+count reaches zero the share is removed from innobase_open_tables and its
+memory, including the index translation table, is freed. Runs under
+innobase_share_mutex. */
+static void free_share(INNOBASE_SHARE* share)
+{
+	pthread_mutex_lock(&innobase_share_mutex);
+
+#ifdef UNIV_DEBUG
+	/* Check that the share being released is the one registered
+	under its table name. */
+	INNOBASE_SHARE* registered;
+	ulint	debug_fold = ut_fold_string(share->table_name);
+
+	HASH_SEARCH(table_name_hash, innobase_open_tables, debug_fold,
+		    INNOBASE_SHARE*, registered,
+		    ut_ad(share->use_count > 0),
+		    !strcmp(share->table_name, registered->table_name));
+
+	ut_a(registered == share);
+#endif /* UNIV_DEBUG */
+
+	--share->use_count;
+
+	if (share->use_count == 0) {
+		/* Last user is gone: unregister and destroy the share. */
+		ulint	name_fold = ut_fold_string(share->table_name);
+
+		HASH_DELETE(INNOBASE_SHARE, table_name_hash,
+			    innobase_open_tables, name_fold, share);
+		thr_lock_delete(&share->lock);
+
+		/* index_mapping may be NULL, hence MY_ALLOW_ZERO_PTR */
+		my_free(share->idx_trans_tbl.index_mapping,
+			MYF(MY_ALLOW_ZERO_PTR));
+
+		my_free(share, MYF(0));
+
+		/* TODO: invoke HASH_MIGRATE if innobase_open_tables
+		shrinks too much */
+	}
+
+	pthread_mutex_unlock(&innobase_share_mutex);
+}
+
+/*****************************************************************//**
+Converts a MySQL table lock stored in the 'lock' field of the handle to
+a proper type before storing pointer to the lock into an array of pointers.
+MySQL also calls this if it wants to reset some table locks to a not-locked
+state during the processing of an SQL query. An example is that during a
+SELECT the read lock is released early on the 'const' tables where we only
+fetch one row. MySQL does not call this when it releases all locks at the
+end of an SQL statement.
+
+The function works in three steps:
+1) if the transaction has no MySQL tables in use, refresh its isolation
+level from the session;
+2) choose the InnoDB row lock mode for reads (prebuilt->select_lock_type
+and prebuilt->stored_select_lock_type), except for DROP TABLE;
+3) if the handle's lock is currently TL_UNLOCK, possibly adjust lock_type
+and store it in lock.type.
+Nothing but storing the pointer into 'to' is done for TL_IGNORE.
+@return	pointer to the next element in the 'to' array */
+UNIV_INTERN
+THR_LOCK_DATA**
+ha_innobase::store_lock(
+/*====================*/
+	THD*			thd,		/*!< in: user thread handle */
+	THR_LOCK_DATA**		to,		/*!< in: pointer to an array
+						of pointers to lock structs;
+						pointer to the 'lock' field
+						of current handle is stored
+						next to this array */
+	enum thr_lock_type	lock_type)	/*!< in: lock type to store in
+						'lock'; this may also be
+						TL_IGNORE */
+{
+	trx_t*		trx;
+
+	/* Note that trx in this function is NOT necessarily prebuilt->trx
+	because we call update_thd() later, in ::external_lock()! Failure to
+	understand this caused a serious memory corruption bug in 5.1.11. */
+
+	trx = check_trx_exists(thd);
+
+	/* NOTE: MySQL can call this function with lock 'type' TL_IGNORE!
+	Be careful to ignore TL_IGNORE if we are going to do something with
+	only 'real' locks! */
+
+	/* If no MySQL table is in use, we need to set the isolation level
+	of the transaction. */
+
+	if (lock_type != TL_IGNORE
+	    && trx->n_mysql_tables_in_use == 0) {
+		trx->isolation_level = innobase_map_isolation_level(
+			(enum_tx_isolation) thd_tx_isolation(thd));
+
+		if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+		    && trx->global_read_view) {
+
+			/* At low transaction isolation levels we let
+			each consistent read set its own snapshot */
+
+			read_view_close_for_mysql(trx);
+		}
+	}
+
+	DBUG_ASSERT(EQ_CURRENT_THD(thd));
+	/* Both values are read once here and reused in steps 2 and 3. */
+	const bool in_lock_tables = thd_in_lock_tables(thd);
+	const uint sql_command = thd_sql_command(thd);
+
+	/* Step 2: choose the row lock mode used by reads on this handle. */
+
+	if (sql_command == SQLCOM_DROP_TABLE) {
+
+		/* MySQL calls this function in DROP TABLE though this table
+		handle may belong to another thd that is running a query. Let
+		us in that case skip any changes to the prebuilt struct. */ 
+
+	} else if ((lock_type == TL_READ && in_lock_tables)
+		   || (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables)
+		   || lock_type == TL_READ_WITH_SHARED_LOCKS
+		   || lock_type == TL_READ_NO_INSERT
+		   || (lock_type != TL_IGNORE
+		       && sql_command != SQLCOM_SELECT)) {
+
+		/* The OR cases above are in this order:
+		1) MySQL is doing LOCK TABLES ... READ LOCAL, or we
+		are processing a stored procedure or function, or
+		2) (we do not know when TL_READ_HIGH_PRIORITY is used), or
+		3) this is a SELECT ... IN SHARE MODE, or
+		4) we are doing a complex SQL statement like
+		INSERT INTO ... SELECT ... and the logical logging (MySQL
+		binlog) requires the use of a locking read, or
+		MySQL is doing LOCK TABLES ... READ.
+		5) we let InnoDB do locking reads for all SQL statements that
+		are not simple SELECTs; note that select_lock_type in this
+		case may get strengthened in ::external_lock() to LOCK_X.
+		Note that we MUST use a locking read in all data modifying
+		SQL statements, because otherwise the execution would not be
+		serializable, and also the results from the update could be
+		unexpected if an obsolete consistent read view would be
+		used. */
+
+		ulint	isolation_level;
+
+		isolation_level = trx->isolation_level;
+
+		if ((srv_locks_unsafe_for_binlog
+		     || isolation_level <= TRX_ISO_READ_COMMITTED)
+		    && isolation_level != TRX_ISO_SERIALIZABLE
+		    && (lock_type == TL_READ || lock_type == TL_READ_NO_INSERT)
+		    && (sql_command == SQLCOM_INSERT_SELECT
+			|| sql_command == SQLCOM_REPLACE_SELECT
+			|| sql_command == SQLCOM_UPDATE
+			|| sql_command == SQLCOM_CREATE_TABLE
+			|| sql_command == SQLCOM_SET_OPTION)) {
+
+			/* If we either have innobase_locks_unsafe_for_binlog
+			option set or this session is using READ COMMITTED
+			isolation level and isolation level of the transaction
+			is not set to serializable and MySQL is doing
+			INSERT INTO...SELECT or REPLACE INTO...SELECT
+			or UPDATE ... = (SELECT ...) or CREATE  ...
+			SELECT... or SET ... = (SELECT ...) without
+			FOR UPDATE or IN SHARE MODE in select,
+			then we use consistent read for select. */
+
+			prebuilt->select_lock_type = LOCK_NONE;
+			prebuilt->stored_select_lock_type = LOCK_NONE;
+		} else if (sql_command == SQLCOM_CHECKSUM) {
+			/* Use consistent read for checksum table */
+
+			prebuilt->select_lock_type = LOCK_NONE;
+			prebuilt->stored_select_lock_type = LOCK_NONE;
+		} else {
+			/* All remaining cases use a shared locking read. */
+			prebuilt->select_lock_type = LOCK_S;
+			prebuilt->stored_select_lock_type = LOCK_S;
+		}
+
+	} else if (lock_type != TL_IGNORE) {
+
+		/* We set possible LOCK_X value in external_lock, not yet
+		here even if this would be SELECT ... FOR UPDATE */
+
+		prebuilt->select_lock_type = LOCK_NONE;
+		prebuilt->stored_select_lock_type = LOCK_NONE;
+	}
+
+	/* Step 3: decide the MySQL table lock type. This is done only
+	when the handle's lock is currently TL_UNLOCK; otherwise lock.type
+	is left as it is. */
+
+	if (lock_type != TL_IGNORE && lock.type == TL_UNLOCK) {
+
+		/* Starting from 5.0.7, we weaken also the table locks
+		set at the start of a MySQL stored procedure call, just like
+		we weaken the locks set at the start of an SQL statement.
+		MySQL does set in_lock_tables TRUE there, but in reality
+		we do not need table locks to make the execution of a
+		single transaction stored procedure call deterministic
+		(if it does not use a consistent read). */
+
+		if (lock_type == TL_READ
+		    && sql_command == SQLCOM_LOCK_TABLES) {
+			/* We come here if MySQL is processing LOCK TABLES
+			... READ LOCAL. MyISAM under that table lock type
+			reads the table as it was at the time the lock was
+			granted (new inserts are allowed, but not seen by the
+			reader). To get a similar effect on an InnoDB table,
+			we must use LOCK TABLES ... READ. We convert the lock
+			type here, so that for InnoDB, READ LOCAL is
+			equivalent to READ. This will change the InnoDB
+			behavior in mysqldump, so that dumps of InnoDB tables
+			are consistent with dumps of MyISAM tables. */
+
+			lock_type = TL_READ_NO_INSERT;
+		}
+
+		/* If we are not doing a LOCK TABLE, DISCARD/IMPORT
+		TABLESPACE or TRUNCATE TABLE then allow multiple
+		writers. Note that ALTER TABLE uses a TL_WRITE_ALLOW_READ
+		< TL_WRITE_CONCURRENT_INSERT.
+
+		We especially allow multiple writers if MySQL is at the
+		start of a stored procedure call (SQLCOM_CALL) or a
+		stored function call (MySQL does have in_lock_tables
+		TRUE there).
+
+		The condition below additionally keeps the original write
+		lock for OPTIMIZE and CREATE TABLE. */
+
+		if ((lock_type >= TL_WRITE_CONCURRENT_INSERT
+		     && lock_type <= TL_WRITE)
+		    && !(in_lock_tables
+			 && sql_command == SQLCOM_LOCK_TABLES)
+		    && !thd_tablespace_op(thd)
+		    && sql_command != SQLCOM_TRUNCATE
+		    && sql_command != SQLCOM_OPTIMIZE
+		    && sql_command != SQLCOM_CREATE_TABLE) {
+
+			lock_type = TL_WRITE_ALLOW_WRITE;
+		}
+
+		/* In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
+		MySQL would use the lock TL_READ_NO_INSERT on t2, and that
+		would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
+		to t2. Convert the lock to a normal read lock to allow
+		concurrent inserts to t2.
+
+		We especially allow concurrent inserts if MySQL is at the
+		start of a stored procedure call (SQLCOM_CALL)
+		(MySQL does have thd_in_lock_tables() TRUE there). */
+
+		if (lock_type == TL_READ_NO_INSERT
+		    && sql_command != SQLCOM_LOCK_TABLES) {
+
+			lock_type = TL_READ;
+		}
+
+		lock.type = lock_type;
+	}
+
+	/* Always hand the handle's lock struct to the caller, including
+	for TL_IGNORE. */
+	*to++= &lock;
+
+	return(to);
+}
+
+/*********************************************************************//**
+Reads the next autoinc value of the table after acquiring the relevant
+locks through innobase_lock_autoinc(). The outcome is also recorded in
+prebuilt->autoinc_error. On DB_SUCCESS the autoinc lock is still held when
+the function returns. If the counter reads as 0 the lock is released again
+and DB_UNSUPPORTED is reported.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+ha_innobase::innobase_get_autoinc(
+/*==============================*/
+	ulonglong*	value)		/*!< out: autoinc value; 0 unless
+					DB_SUCCESS is returned */
+{
+	*value = 0;
+
+	prebuilt->autoinc_error = innobase_lock_autoinc();
+
+	if (prebuilt->autoinc_error != DB_SUCCESS) {
+
+		return(prebuilt->autoinc_error);
+	}
+
+	/* Determine the first value of the interval */
+	*value = dict_table_autoinc_read(prebuilt->table);
+
+	if (*value == 0) {
+		/* The counter should have been initialized during open. */
+		prebuilt->autoinc_error = DB_UNSUPPORTED;
+		dict_table_autoinc_unlock(prebuilt->table);
+	}
+
+	return(prebuilt->autoinc_error);
+}
+
+/*******************************************************************//**
+Reads the global auto-inc counter of the table without going through
+innobase_lock_autoinc(), i.e. it doesn't use the AUTOINC lock even if the
+lock mode is set to TRADITIONAL. The read itself is bracketed by
+dict_table_autoinc_lock() / dict_table_autoinc_unlock(). The counter is
+asserted to be non-zero.
+@return	the autoinc value */
+UNIV_INTERN
+ulonglong
+ha_innobase::innobase_peek_autoinc(void)
+/*====================================*/
+{
+	ut_a(prebuilt != NULL);
+	ut_a(prebuilt->table != NULL);
+
+	dict_table_t*	ib_table = prebuilt->table;
+
+	dict_table_autoinc_lock(ib_table);
+
+	ulonglong	counter = dict_table_autoinc_read(ib_table);
+
+	ut_a(counter > 0);
+
+	dict_table_autoinc_unlock(ib_table);
+
+	return(counter);
+}
+
+/*********************************************************************//**
+Reserves auto-increment values for the current statement. The table's
+auto-inc counter is read through innobase_get_autoinc(). The first value
+to use is returned in *first_value and the number of values reserved
+(trx->n_autoinc_rows) in *nb_reserved_values. Unless old-style AUTOINC
+locking is configured, offset and increment are used to compute the end of
+the reserved interval and the table counter is advanced to it. offset and
+increment are also remembered in prebuilt for write_row() and update_row().
+If innobase_get_autoinc() fails (e.g. deadlock or lock wait timeout),
+*first_value is set to ~0 (all bits set, i.e. -1) and *nb_reserved_values
+is not written. */
+UNIV_INTERN
+void
+ha_innobase::get_auto_increment(
+/*============================*/
+        ulonglong	offset,              /*!< in: table autoinc offset */
+        ulonglong	increment,           /*!< in: table autoinc increment */
+        ulonglong	nb_desired_values,   /*!< in: number of values reqd */
+        ulonglong	*first_value,        /*!< out: the autoinc value */
+        ulonglong	*nb_reserved_values) /*!< out: count of reserved values */
+{
+	trx_t*		trx;
+	ulint		error;
+	ulonglong	autoinc = 0;
+
+	/* Prepare prebuilt->trx in the table handle */
+	update_thd(ha_thd());
+
+	error = innobase_get_autoinc(&autoinc);
+
+	if (error != DB_SUCCESS) {
+		/* Report the failure through the all-ones marker. */
+		*first_value = (~(ulonglong) 0);
+		return;
+	}
+
+	/* This is a hack, since nb_desired_values seems to be accurate only
+	for the first call to get_auto_increment() for multi-row INSERT and
+	meaningless for other statements e.g, LOAD etc. Subsequent calls to
+	this method for the same statement results in different values which
+	don't make sense. Therefore we store the value the first time we are
+	called and count down from that as rows are written (see write_row()).
+	*/
+
+	trx = prebuilt->trx;
+
+	/* Note: We can't rely on *first_value since some MySQL engines,
+	in particular the partition engine, don't initialize it to 0 when
+	invoking this method. So we are not sure if it's guaranteed to
+	be 0 or not. */
+
+	/* We need the upper limit of the col type to check for
+	whether we update the table autoinc counter or not. */
+	ulonglong	col_max_value = innobase_get_int_col_max_value(
+		table->next_number_field);
+
+	/* Called for the first time ? */
+	if (trx->n_autoinc_rows == 0) {
+
+		trx->n_autoinc_rows = (ulint) nb_desired_values;
+
+		/* It's possible for nb_desired_values to be 0:
+		e.g., INSERT INTO T1(C) SELECT C FROM T2; */
+		if (nb_desired_values == 0) {
+
+			trx->n_autoinc_rows = 1;
+		}
+
+		set_if_bigger(*first_value, autoinc);
+	/* Not in the middle of a multi-row INSERT. */
+	} else if (prebuilt->autoinc_last_value == 0) {
+		set_if_bigger(*first_value, autoinc);
+	/* Check for -ve values: a *first_value above the column maximum
+	is treated as a negative number stored in an unsigned variable. */
+	} else if (*first_value > col_max_value && trx->n_autoinc_rows > 0) {
+		/* Set to next logical value. */
+		ut_a(autoinc > trx->n_autoinc_rows);
+		*first_value = (autoinc - trx->n_autoinc_rows) - 1;
+	}
+
+	*nb_reserved_values = trx->n_autoinc_rows;
+
+	/* With old style AUTOINC locking we only update the table's
+	AUTOINC counter after attempting to insert the row. */
+	if (innobase_autoinc_lock_mode != AUTOINC_OLD_STYLE_LOCKING) {
+		ulonglong	need;
+		ulonglong	current;
+		ulonglong	next_value;
+
+		/* Start from the table counter when *first_value is out
+		of the column's range. */
+		current = *first_value > col_max_value ? autoinc : *first_value;
+		need = *nb_reserved_values * increment;
+
+		/* Compute the last value in the interval */
+		next_value = innobase_next_autoinc(
+			current, need, offset, col_max_value);
+
+		prebuilt->autoinc_last_value = next_value;
+
+		if (prebuilt->autoinc_last_value < *first_value) {
+			/* The end of the interval lies below its start:
+			report failure with the same all-ones marker that
+			is used when the counter cannot be read. */
+			*first_value = (~(ulonglong) 0);
+		} else {
+			/* Update the table autoinc variable */
+			dict_table_autoinc_update_if_greater(
+				prebuilt->table, prebuilt->autoinc_last_value);
+		}
+	} else {
+		/* This will force write_row() into attempting an update
+		of the table's AUTOINC counter. */
+		prebuilt->autoinc_last_value = 0;
+	}
+
+	/* The increment to be used to increase the AUTOINC value, we use
+	this in write_row() and update_row() to increase the autoinc counter
+	for columns that are filled by the user. We need the offset and
+	the increment. */
+	prebuilt->autoinc_offset = offset;
+	prebuilt->autoinc_increment = increment;
+
+	/* Release the autoinc lock that innobase_get_autoinc() left held
+	on success. */
+	dict_table_autoinc_unlock(prebuilt->table);
+}
+
+/*******************************************************************//**
+Sets the auto-increment counter of the table to the given value, so that
+the next inserted row gets that value. This is called e.g. after TRUNCATE
+is emulated by doing a 'DELETE FROM t'. Storage engines that don't support
+this operation return HA_ERR_WRONG_COMMAND. A requested value of 0 is
+replaced by 1.
+@return	0 or error code */
+UNIV_INTERN
+int
+ha_innobase::reset_auto_increment(
+/*==============================*/
+	ulonglong	value)		/*!< in: new value for table autoinc */
+{
+	DBUG_ENTER("ha_innobase::reset_auto_increment");
+
+	update_thd(ha_thd());
+
+	int	lock_err = row_lock_table_autoinc_for_mysql(prebuilt);
+
+	if (lock_err != DB_SUCCESS) {
+		/* Could not lock the table: translate the InnoDB error
+		for MySQL and give up. */
+		lock_err = convert_error_code_to_mysql(lock_err,
+						       prebuilt->table->flags,
+						       user_thd);
+
+		DBUG_RETURN(lock_err);
+	}
+
+	/* The next value can never be 0. */
+	innobase_reset_autoinc(value == 0 ? 1 : value);
+
+	DBUG_RETURN(0);
+}
+
+/* See comment in handler.cc */
+UNIV_INTERN
+bool
+ha_innobase::get_error_message(int error, String *buf)
+{
+	trx_t*	trx = check_trx_exists(ha_thd());
+
+	buf->copy(trx->detailed_error, (uint) strlen(trx->detailed_error),
+		system_charset_info);
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Compares two 'refs'. A 'ref' is the (internal) primary key value of the row.
+When InnoDB generated the clustered index itself, the ref is a row id and
+the two are compared bytewise; otherwise the primary key columns are
+compared one by one using their MySQL field types.
+@return	< 0 if ref1 < ref2, 0 if equal, else > 0 */
+UNIV_INTERN
+int
+ha_innobase::cmp_ref(
+/*=================*/
+	const uchar*	ref1,	/*!< in: an (internal) primary key value in the
+				MySQL key value format */
+	const uchar*	ref2)	/*!< in: an (internal) primary key value in the
+				MySQL key value format */
+{
+	if (prebuilt->clust_index_was_generated) {
+		/* The 'ref' is an InnoDB row id */
+
+		return(memcmp(ref1, ref2, DATA_ROW_ID_LEN));
+	}
+
+	/* Do a type-aware comparison of primary key fields. PK fields
+	are always NOT NULL, so no checks for NULL are performed. */
+
+	KEY_PART_INFO*	part
+		= table->key_info[table->s->primary_key].key_part;
+	KEY_PART_INFO*	parts_end
+		= part + table->key_info[table->s->primary_key].key_parts;
+
+	while (part != parts_end) {
+		Field*	pk_field = part->field;
+		int	cmp;
+
+		switch (pk_field->type()) {
+		case MYSQL_TYPE_TINY_BLOB:
+		case MYSQL_TYPE_MEDIUM_BLOB:
+		case MYSQL_TYPE_BLOB:
+		case MYSQL_TYPE_LONG_BLOB: {
+			/* In the MySQL key value format, a column prefix of
+			a BLOB is preceded by a 2-byte length field */
+
+			uint	blob_len1
+				= innobase_read_from_2_little_endian(ref1);
+			uint	blob_len2
+				= innobase_read_from_2_little_endian(ref2);
+
+			ref1 += 2;
+			ref2 += 2;
+
+			cmp = ((Field_blob*) pk_field)->cmp(ref1, blob_len1,
+							    ref2, blob_len2);
+			break;
+		}
+		default:
+			cmp = pk_field->key_cmp(ref1, ref2);
+			break;
+		}
+
+		if (cmp != 0) {
+
+			return(cmp);
+		}
+
+		/* Step over this key part in both refs. */
+		ref1 += part->store_length;
+		ref2 += part->store_length;
+
+		++part;
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Asks InnoDB whether a query on a table may be cached. The function also
+hands back, in *call_back, the routine that makes this decision, and
+resets *engine_data to 0 before consulting it.
+@return	TRUE if query caching of the table is permitted */
+UNIV_INTERN
+my_bool
+ha_innobase::register_query_cache_table(
+/*====================================*/
+	THD*		thd,		/*!< in: user thread handle */
+	char*		table_key,	/*!< in: concatenation of database name,
+					the null character NUL,
+					and the table name */
+	uint		key_length,	/*!< in: length of the full name, i.e.
+					len(dbname) + len(tablename) + 1 */
+	qc_engine_callback*
+			call_back,	/*!< out: pointer to function for
+					checking if query caching
+					is permitted */
+	ulonglong	*engine_data)	/*!< in/out: data to call_back */
+{
+	*engine_data = 0;
+	*call_back = innobase_query_caching_of_table_permitted;
+
+	my_bool	permitted = innobase_query_caching_of_table_permitted(
+		thd, table_key, key_length, engine_data);
+
+	return(permitted);
+}
+
+/** Returns the value of trx_sys_mysql_bin_log_name.
+@return	the MySQL binlog file name kept by InnoDB */
+UNIV_INTERN
+char*
+ha_innobase::get_mysql_bin_log_name()
+{
+	char*	log_name = trx_sys_mysql_bin_log_name;
+
+	return(log_name);
+}
+
+/** Returns the value of trx_sys_mysql_bin_log_pos.
+@return	the MySQL binlog position kept by InnoDB */
+UNIV_INTERN
+ulonglong
+ha_innobase::get_mysql_bin_log_pos()
+{
+	/* The variable is an ib_int64_t, a typedef for a 64-bit integer
+	(__int64 or longlong), so converting it to ulonglong is ok. */
+	ulonglong	log_pos = trx_sys_mysql_bin_log_pos;
+
+	return(log_pos);
+}
+
+/******************************************************************//**
+Finds the storage length in bytes of the first n characters of a string,
+for prefix indexes on columns that use a multibyte character set. The
+charset is looked up by id, n is derived from prefix_len and the charset's
+mbmaxlen, and the result never exceeds data_len.
+@return	number of bytes occupied by the first n characters */
+extern "C" UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+	ulint charset_id,	/*!< in: character set id */
+	ulint prefix_len,	/*!< in: prefix length in bytes of the index
+				(this has to be divided by mbmaxlen to get the
+				number of CHARACTERS n in the prefix) */
+	ulint data_len,		/*!< in: length of the string in bytes */
+	const char* str)	/*!< in: character string */
+{
+	CHARSET_INFO*	cs = get_charset((uint) charset_id, MYF(MY_WME));
+
+	ut_ad(cs);
+	ut_ad(cs->mbmaxlen);
+
+	/* The prefix holds at most this many characters */
+	ulint	max_chars = prefix_len / cs->mbmaxlen;
+
+	if (!(cs->mbmaxlen > 1)) {
+		/* Single-byte charset: bytes and characters coincide, so
+		the answer is the shorter of the data and the prefix. */
+
+		return(data_len < prefix_len ? data_len : prefix_len);
+	}
+
+	/* Multi-byte charset: we must find the byte length of the first
+	max_chars characters of the string.
+
+	my_charpos() returns the byte length of the first max_chars
+	characters, or a value bigger than the length of str, if
+	there were not enough full characters in str.
+
+	Why this works, taking UTF-8 as the example:
+
+	1) If the string is long enough, then the prefix contains at
+	least n complete UTF-8 characters + maybe some extra
+	characters + an incomplete UTF-8 character. No problem in
+	this case. The function returns the pointer to the
+	end of the nth character.
+
+	2) If the string is not long enough, then the string contains
+	the complete value of a column, that is, only complete UTF-8
+	characters, and we can store in the column prefix index the
+	whole string. */
+
+	ulint	byte_len = my_charpos(cs, str,
+				      str + data_len, (int) max_chars);
+
+	if (byte_len > data_len) {
+		byte_len = data_len;
+	}
+
+	return(byte_len);
+}
+
+/*******************************************************************//**
+This function is used to prepare an X/Open XA distributed transaction.
+Depending on 'all' and the autocommit state of the thread it either
+prepares the whole InnoDB transaction or only marks the end of the current
+SQL statement. For a whole-transaction prepare that is not an explicit
+XA PREPARE statement it also acquires prepare_commit_mutex (unless unsafe
+group commit was chosen) and returns with that mutex still held.
+@return	0 or error number */
+static
+int
+innobase_xa_prepare(
+/*================*/
+        handlerton*	hton,	/*!< in: InnoDB handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread of
+				the user whose XA transaction should
+				be prepared */
+	bool		all)	/*!< in: TRUE - prepare the whole
+				transaction
+				FALSE - the current SQL statement
+				ended */
+{
+	int error = 0;
+	trx_t* trx = check_trx_exists(thd);
+
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* we use support_xa value as it was seen at transaction start
+	time, not the current session variable value. Any possible changes
+	to the session variable take effect only in the next transaction */
+	if (!trx->support_xa) {
+
+		/* XA is off for this transaction: nothing is prepared and
+		success is reported. */
+		return(0);
+	}
+
+	/* NOTE(review): presumably copies the XID of the MySQL thread into
+	trx->xid - confirm against thd_get_xid(). */
+	thd_get_xid(thd, (MYSQL_XID*) &trx->xid);
+
+	/* Release a possible FIFO ticket and search latch. Since we will
+	reserve the kernel mutex, we have to release the search system latch
+	first to obey the latching order. */
+
+	innobase_release_stat_resources(trx);
+
+	/* An inconsistent state is only logged here; execution continues. */
+	if (trx->active_trans == 0 && trx->conc_state != TRX_NOT_STARTED) {
+
+	  sql_print_error("trx->active_trans == 0, but trx->conc_state != "
+			  "TRX_NOT_STARTED");
+	}
+
+	if (all
+		|| (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+
+		/* We were instructed to prepare the whole transaction, or
+		this is an SQL statement end and autocommit is on */
+
+		ut_ad(trx->active_trans);
+
+		error = (int) trx_prepare_for_mysql(trx);
+	} else {
+		/* We just mark the SQL statement ended and do not do a
+		transaction prepare */
+
+		/* If we had reserved the auto-inc lock for some
+		table in this SQL statement we release it now */
+
+		row_unlock_table_autoinc_for_mysql(trx);
+
+		/* Store the current undo_no of the transaction so that we
+		know where to roll back if we have to roll back the next
+		SQL statement */
+
+		trx_mark_sql_stat_end(trx);
+	}
+
+	/* Tell the InnoDB server that there might be work for utility
+	threads: */
+
+	srv_active_wake_master_thread();
+
+	/* Same "whole transaction" condition as above, but an explicit
+	XA PREPARE statement is excluded (see the note at the end of the
+	comment below). */
+	if (thd_sql_command(thd) != SQLCOM_XA_PREPARE &&
+	    (all || !thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)))
+	{
+		if (srv_enable_unsafe_group_commit && !THDVAR(thd, support_xa)) {
+			/* choose group commit rather than binlog order */
+			return(error);
+		}
+
+		/* For ibbackup to work the order of transactions in binlog
+		and InnoDB must be the same. Consider the situation
+
+		  thread1> prepare; write to binlog; ...
+			  <context switch>
+		  thread2> prepare; write to binlog; commit
+		  thread1>			     ... commit
+
+		To ensure this will not happen we're taking the mutex on
+		prepare, and releasing it on commit.
+
+		Note: only do it for normal commits, done via ha_commit_trans.
+		If 2pc protocol is executed by external transaction
+		coordinator, it will be just a regular MySQL client
+		executing XA PREPARE and XA COMMIT commands.
+		In this case we cannot know how many minutes or hours
+		will be between XA PREPARE and XA COMMIT, and we don't want
+		to block for undefined period of time. */
+		pthread_mutex_lock(&prepare_commit_mutex);
+		/* NOTE(review): the value 2 presumably tells the commit path
+		that prepare_commit_mutex is held and must be released there -
+		confirm against the commit code. The mutex is taken even when
+		'error' is non-zero. */
+		trx->active_trans = 2;
+	}
+
+	return(error);
+}
+
+/*******************************************************************//**
+Collects the X/Open XA transactions that are in the prepared state.
+The work is delegated to trx_recover_for_mysql(); a missing or zero-sized
+output array short-circuits to 0.
+@return	number of prepared transactions stored in xid_list */
+static
+int
+innobase_xa_recover(
+/*================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	XID*		xid_list,/*!< in/out: prepared transactions */
+	uint		len)	/*!< in: number of slots in xid_list */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* Only ask InnoDB when there is room to store an answer. */
+	if (len != 0 && xid_list != NULL) {
+
+		return(trx_recover_for_mysql(xid_list, len));
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Commits the X/Open XA distributed transaction identified by xid. The
+transaction is looked up with trx_get_trx_by_xid() and, if found, committed
+with innobase_commit_low().
+@return	XA_OK if the transaction was found, XAER_NOTA otherwise */
+static
+int
+innobase_commit_by_xid(
+/*===================*/
+        handlerton *hton,
+	XID*	xid)	/*!< in: X/Open XA transaction identification */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	prepared_trx = trx_get_trx_by_xid(xid);
+
+	if (prepared_trx == NULL) {
+		/* No transaction is known under this XID. */
+		return(XAER_NOTA);
+	}
+
+	innobase_commit_low(prepared_trx);
+
+	return(XA_OK);
+}
+
+/*******************************************************************//**
+Rolls back the X/Open XA distributed transaction identified by xid. The
+transaction is looked up with trx_get_trx_by_xid() and, if found, handed to
+innobase_rollback_trx().
+@return	result of innobase_rollback_trx(), or XAER_NOTA if no transaction
+matches xid */
+static
+int
+innobase_rollback_by_xid(
+/*=====================*/
+	handlerton*	hton,	/*!< in: InnoDB handlerton */
+	XID*		xid)	/*!< in: X/Open XA transaction
+				identification */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	prepared_trx = trx_get_trx_by_xid(xid);
+
+	if (prepared_trx == NULL) {
+		/* No transaction is known under this XID. */
+		return(XAER_NOTA);
+	}
+
+	return(innobase_rollback_trx(prepared_trx));
+}
+
+/*******************************************************************//**
+Creates a consistent read view for a cursor, based on the transaction of
+the given MySQL thread. check_trx_exists() supplies the transaction (the
+original description states that it is created if the thread still lacks
+one). The view is then used inside MySQL when accessing records through a
+cursor.
+@return	pointer to cursor view or NULL */
+static
+void*
+innobase_create_cursor_view(
+/*========================*/
+        handlerton *hton, /*!< in: innobase hton */
+	THD* thd)	  /*!< in: user thread handle */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*	owner_trx = check_trx_exists(thd);
+
+	return(read_cursor_view_create_for_mysql(owner_trx));
+}
+
+/*******************************************************************//**
+Closes a consistent cursor view of the transaction that belongs to the
+given MySQL thread and restores the global read view to the transaction
+read view. check_trx_exists() supplies the transaction (the original
+description states that it is created if the thread still lacks one). */
+static
+void
+innobase_close_cursor_view(
+/*=======================*/
+        handlerton *hton,
+	THD*	thd,	/*!< in: user thread handle */
+	void*	curview)/*!< in: Consistent read view to be closed */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*		owner_trx = check_trx_exists(thd);
+	cursor_view_t*	view = (cursor_view_t*) curview;
+
+	read_cursor_view_close_for_mysql(owner_trx, view);
+}
+
+/*******************************************************************//**
+Installs the given consistent cursor view in the transaction that belongs
+to the given MySQL thread. check_trx_exists() supplies the transaction (the
+original description states that it is created if the thread still lacks
+one). When curview is NULL, the global read view of the transaction is
+restored to the transaction read view. */
+static
+void
+innobase_set_cursor_view(
+/*=====================*/
+        handlerton *hton,
+	THD*	thd,	/*!< in: user thread handle */
+	void*	curview)/*!< in: Consistent cursor view to be set */
+{
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	trx_t*		owner_trx = check_trx_exists(thd);
+	cursor_view_t*	view = (cursor_view_t*) curview;
+
+	read_cursor_set_for_mysql(owner_trx, view);
+}
+
+/*******************************************************************//**
+Scans the fields of a MySQL table for the FIELD_IS_RENAMED flag.
+With col_name == NULL any flagged field counts as a hit; otherwise a
+flagged field counts only if its name equals col_name, compared with
+innobase_strcasecmp().
+@return true if the column is being renamed */
+static
+bool
+check_column_being_renamed(
+/*=======================*/
+	const TABLE*	table,		/*!< in: MySQL table */
+	const char*	col_name)	/*!< in: name of the column, or NULL
+					to match any renamed column */
+{
+	for (uint i = 0; i < table->s->fields; i++) {
+		const Field*	col = table->field[i];
+
+		if (!(col->flags & FIELD_IS_RENAMED)) {
+			/* Not being renamed: of no interest. */
+			continue;
+		}
+
+		/* A renamed field matches when no particular name was
+		asked for, or when it carries the requested name. */
+		if (col_name == NULL
+		    || innobase_strcasecmp(col->field_name, col_name) == 0) {
+
+			return(true);
+		}
+	}
+
+	return(false);
+}
+
+/*******************************************************************//**
+Tests each of the given column names with check_column_being_renamed() and
+stops at the first one that is being renamed in the table.
+@return true if any of col_names is being renamed in table */
+static
+bool
+column_is_being_renamed(
+/*====================*/
+	TABLE*		table,		/*!< in: MySQL table */
+	uint		n_cols,		/*!< in: number of columns */
+	const char**	col_names)	/*!< in: names of the columns */
+{
+	uint	idx = 0;
+
+	while (idx < n_cols) {
+		if (check_column_being_renamed(table, col_names[idx])) {
+
+			return(true);
+		}
+
+		idx++;
+	}
+
+	return(false);
+}
+
+/*******************************************************************//**
+Checks whether a column that is being renamed in the MySQL table takes part
+in a foreign key constraint of prebuilt->table. Both directions are
+checked: constraints of other tables that reference this table
+(referenced_list) and constraints of this table that reference another
+table (foreign_list). The data dictionary is locked while the two lists are
+walked.
+@return true if a column that participates in a foreign key definition
+is being renamed */
+static
+bool
+foreign_key_column_is_being_renamed(
+/*================================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
+	TABLE*		table)		/*!< in: MySQL table */
+{
+	dict_foreign_t*	fk;
+	bool		renamed = false;
+
+	/* Without any foreign key there is nothing to check, and the
+	dictionary does not have to be locked. */
+	if (UT_LIST_GET_LEN(prebuilt->table->foreign_list) == 0
+	    && UT_LIST_GET_LEN(prebuilt->table->referenced_list) == 0) {
+
+		return(false);
+	}
+
+	row_mysql_lock_data_dictionary(prebuilt->trx);
+
+	/* First the constraints that refer to this table ... */
+	for (fk = UT_LIST_GET_FIRST(prebuilt->table->referenced_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(referenced_list, fk)) {
+
+		if (column_is_being_renamed(table, fk->n_fields,
+					    fk->referenced_col_names)) {
+			renamed = true;
+			break;
+		}
+	}
+
+	/* ... then, unless a hit was already found, the constraints
+	defined in this table. */
+	if (!renamed) {
+		for (fk = UT_LIST_GET_FIRST(prebuilt->table->foreign_list);
+		     fk != NULL;
+		     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+
+			if (column_is_being_renamed(table, fk->n_fields,
+						    fk->foreign_col_names)) {
+				renamed = true;
+				break;
+			}
+		}
+	}
+
+	row_mysql_unlock_data_dictionary(prebuilt->trx);
+
+	return(renamed);
+}
+
+/*******************************************************************//**
+Decides whether the existing table data can be kept for the requested
+change, or whether a table rebuild/copy has to be requested. The data is
+reported incompatible if table_changes is not IS_EQUAL_YES, if a non-zero
+auto_increment value was specified, if any column is being renamed, if the
+row format was specified and differs from the current one, or if
+KEY_BLOCK_SIZE was specified.
+@return	COMPATIBLE_DATA_YES if none of the above applies,
+COMPATIBLE_DATA_NO otherwise */
+UNIV_INTERN
+bool
+ha_innobase::check_if_incompatible_data(
+/*====================================*/
+	HA_CREATE_INFO*	info,		/*!< in: create info; used_fields,
+					auto_increment_value and row_type
+					are read */
+	uint		table_changes)	/*!< in: compared with
+					IS_EQUAL_YES */
+{
+	enum row_type row_type, info_row_type;
+	DBUG_ENTER("ha_innobase::check_if_incompatible_data");
+
+	if (table_changes != IS_EQUAL_YES) {
+
+		DBUG_PRINT("info", ("table_changes != IS_EQUAL_YES "
+				    "-> COMPATIBLE_DATA_NO"));
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	/* Check that auto_increment value was not changed */
+	if ((info->used_fields & HA_CREATE_USED_AUTO) &&
+		info->auto_increment_value != 0) {
+
+		DBUG_PRINT("info", ("auto_increment_value changed -> "
+				    "COMPATIBLE_DATA_NO"));
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	/* For column rename operation, MySQL does not supply enough
+	information (new column name etc.) for InnoDB to make appropriate
+	system metadata change. To avoid system metadata inconsistency,
+	currently we can just request a table rebuild/copy by returning
+	COMPATIBLE_DATA_NO */
+	if (check_column_being_renamed(table, NULL)) {
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	/* Check if a column participating in a foreign key is being renamed.
+	There is no mechanism for updating InnoDB foreign key definitions. */
+	if (foreign_key_column_is_being_renamed(prebuilt, table)) {
+
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	/* Check that row format didn't change. The requested format is
+	normalised once (default is compact) and the current format is
+	fetched once; both locals are then used for the comparison instead
+	of repeating the call and the normalisation inline. */
+	row_type = get_row_type();
+	info_row_type = info->row_type;
+
+	if (info_row_type == ROW_TYPE_DEFAULT) {
+		info_row_type = ROW_TYPE_COMPACT;
+	}
+
+	if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT)
+	    && row_type != info_row_type) {
+
+		DBUG_PRINT("info", ("get_row_type()=%d != info->row_type=%d -> "
+				    "COMPATIBLE_DATA_NO",
+				    row_type, info->row_type));
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	/* Specifying KEY_BLOCK_SIZE requests a rebuild of the table. */
+	if (info->used_fields & HA_CREATE_USED_KEY_BLOCK_SIZE) {
+		DBUG_PRINT("info", ("HA_CREATE_USED_KEY_BLOCK_SIZE -> "
+				    "COMPATIBLE_DATA_NO"));
+		DBUG_RETURN(COMPATIBLE_DATA_NO);
+	}
+
+	DBUG_PRINT("info", (" -> COMPATIBLE_DATA_YES"));
+	DBUG_RETURN(COMPATIBLE_DATA_YES);
+}
+
+/************************************************************//**
+Validate the file format name and return its corresponding id.
+The input may be either a decimal format id or a format name; names are
+compared with innobase_strcasecmp() against
+trx_sys_file_format_id_to_name() for every id up to DICT_TF_FORMAT_MAX.
+@return	valid file format id, or DICT_TF_FORMAT_MAX + 1 if the input is
+neither a known name nor an id in range */
+static
+uint
+innobase_file_format_name_lookup(
+/*=============================*/
+	const char*	format_name)	/*!< in: pointer to file format name */
+{
+	char*		endp;
+	unsigned long	parsed_id;
+
+	ut_a(format_name != NULL);
+
+	/* The format name can contain the format id itself instead of
+	the name and we check for that. The result is kept in the full
+	width returned by strtoul(): narrowing it to uint before the range
+	check would let a value such as 4294967296 wrap around to 0 and be
+	accepted as a valid id where unsigned long is wider than uint. */
+	parsed_id = strtoul(format_name, &endp, 10);
+
+	/* Check for valid parse. */
+	if (*endp == '\0' && *format_name != '\0') {
+
+		if (parsed_id <= DICT_TF_FORMAT_MAX) {
+
+			return((uint) parsed_id);
+		}
+	} else {
+		uint	format_id;
+
+		for (format_id = 0; format_id <= DICT_TF_FORMAT_MAX;
+		     format_id++) {
+			const char*	name;
+
+			name = trx_sys_file_format_id_to_name(format_id);
+
+			if (!innobase_strcasecmp(format_name, name)) {
+
+				return(format_id);
+			}
+		}
+	}
+
+	/* Not a known name, or a number out of range. */
+	return(DICT_TF_FORMAT_MAX + 1);
+}
+
+/************************************************************//**
+Recognises the values "on" and "off" (compared with innobase_strcasecmp())
+for the file format check setting. As a side effect
+srv_check_file_format_at_startup is set: to DICT_TF_FORMAT_MAX + 1 for
+"off", to DICT_TF_FORMAT_51 for "on". Any other value leaves the variable
+untouched.
+@return	true if config value one of "on" or  "off" */
+static
+bool
+innobase_file_format_check_on_off(
+/*==============================*/
+	const char*	format_check)	/*!< in: parameter value */
+{
+	if (innobase_strcasecmp(format_check, "off") == 0) {
+
+		/* Set the value to disable checking. */
+		srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX + 1;
+
+		return(true);
+	}
+
+	if (innobase_strcasecmp(format_check, "on") == 0) {
+
+		/* Set the value to the lowest supported format. */
+		srv_check_file_format_at_startup = DICT_TF_FORMAT_51;
+
+		return(true);
+	}
+
+	return(false);
+}
+
+/************************************************************//**
+Resolves a file format check value (name or numeric id) with
+innobase_file_format_name_lookup() and, if it is valid, stores the id in
+srv_check_file_format_at_startup.
+@return the format_id if valid config value, otherwise, return -1 */
+static
+int
+innobase_file_format_validate_and_set(
+/*================================*/
+	const char*	format_check)	/*!< in: parameter value */
+{
+	const uint	id = innobase_file_format_name_lookup(format_check);
+
+	if (id >= DICT_TF_FORMAT_MAX + 1) {
+		/* The lookup did not recognise the value. */
+		return(-1);
+	}
+
+	srv_check_file_format_at_startup = id;
+
+	return((int) id);
+}
+
+/*************************************************************//**
+Check callback for the file format system variable; registered with MySQL.
+The incoming string is resolved with innobase_file_format_name_lookup().
+On success *save receives the canonical name returned by
+trx_sys_file_format_id_to_name(); on failure *save is set to NULL.
+@return	0 for valid file format, 1 otherwise */
+static
+int
+innodb_file_format_name_validate(
+/*=============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	const char**	result = static_cast<const char**>(save);
+	const char*	input = value->val_str(value, buff, &len);
+
+	if (input != NULL) {
+		const uint	id = innobase_file_format_name_lookup(input);
+
+		if (id <= DICT_TF_FORMAT_MAX) {
+
+			/* Hand over a pointer to the name in the
+			'file_format_name_map' constant array. */
+			*result = trx_sys_file_format_id_to_name(id);
+
+			return(0);
+		}
+	}
+
+	/* No string was supplied, or it names no known format. */
+	*result = NULL;
+
+	return(1);
+}
+
+/****************************************************************//**
+Update callback for the system variable innodb_file_format; registered
+with MySQL. If the "saved" value resolves to a valid format id,
+srv_file_format is set to it. In every case *var_ptr is then pointed at
+the name of the (possibly unchanged) srv_file_format. */
+static
+void
+innodb_file_format_name_update(
+/*===========================*/
+	THD*				thd,		/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,		/*!< in: pointer to
+							system variable */
+	void*				var_ptr,	/*!< out: where the
+							formal string goes */
+	const void*			save)		/*!< in: immediate result
+							from check function */
+{
+	ut_a(var_ptr != NULL);
+	ut_a(save != NULL);
+
+	const char*	requested = *static_cast<const char*const*>(save);
+
+	if (requested != NULL) {
+		const uint	id = innobase_file_format_name_lookup(
+			requested);
+
+		if (id <= DICT_TF_FORMAT_MAX) {
+			srv_file_format = id;
+		}
+	}
+
+	/* Report the format that is in effect now. */
+	const char**	shown = static_cast<const char**>(var_ptr);
+
+	*shown = trx_sys_file_format_id_to_name(srv_file_format);
+}
+
+/*************************************************************//**
+Check if valid argument to innodb_file_format_check. This
+function is registered as a callback with MySQL.
+The values "on" and "off" are rejected with a warning, because they can
+only be given at startup. Any other value is passed to
+innobase_file_format_validate_and_set(); if it is a valid format, *save
+receives the canonical format name. On every failure a warning is pushed
+(when a string was supplied) and *save is set to NULL.
+@return	0 for valid file format, 1 otherwise */
+static
+int
+innodb_file_format_check_validate(
+/*==============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	const char*	file_format_input;
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+	int		format_id;
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	file_format_input = value->val_str(value, buff, &len);
+
+	if (file_format_input != NULL) {
+
+		/* Check if user set on/off, we want to print a suitable
+		message if they did so. The strings are compared directly
+		instead of calling innobase_file_format_check_on_off():
+		that function overwrites srv_check_file_format_at_startup
+		as a side effect, which must not happen for a value that
+		is rejected here. */
+
+		if (!innobase_strcasecmp(file_format_input, "on")
+		    || !innobase_strcasecmp(file_format_input, "off")) {
+			push_warning_printf(thd,
+				MYSQL_ERROR::WARN_LEVEL_WARN,
+				ER_WRONG_ARGUMENTS,
+				"InnoDB: invalid innodb_file_format_check "
+				"value; on/off can only be set at startup or "
+				"in the configuration file");
+		} else {
+			format_id = innobase_file_format_validate_and_set(
+							file_format_input);
+
+			if (format_id >= 0) {
+				/* Save a pointer to the name in the
+				'file_format_name_map' constant array. */
+				*static_cast<const char**>(save) =
+				    trx_sys_file_format_id_to_name(
+							(uint)format_id);
+
+				return(0);
+
+			} else {
+				push_warning_printf(thd,
+				  MYSQL_ERROR::WARN_LEVEL_WARN,
+				  ER_WRONG_ARGUMENTS,
+				  "InnoDB: invalid innodb_file_format_check "
+				  "value; can be any format up to %s "
+				  "or its equivalent numeric id",
+				  trx_sys_file_format_id_to_name(
+						DICT_TF_FORMAT_MAX));
+			}
+		}
+	}
+
+	*static_cast<const char**>(save) = NULL;
+	return(1);
+}
+
+/****************************************************************//**
+Update callback for the system variable innodb_file_format_check;
+registered with MySQL. A NULL "saved" value is ignored silently. A value
+that does not resolve to a valid format id is ignored with a warning.
+Otherwise the id is passed to trx_sys_file_format_max_set() together with
+var_ptr, and a message is printed to stderr if that call reports a change. */
+static
+void
+innodb_file_format_check_update(
+/*============================*/
+	THD*				thd,		/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,		/*!< in: pointer to
+							system variable */
+	void*				var_ptr,	/*!< out: where the
+							formal string goes */
+	const void*			save)		/*!< in: immediate result
+							from check function */
+{
+	ut_a(save != NULL);
+	ut_a(var_ptr != NULL);
+
+	const char*	requested = *static_cast<const char*const*>(save);
+
+	if (requested == NULL) {
+
+		return;
+	}
+
+	const uint	id = innobase_file_format_name_lookup(requested);
+
+	if (id > DICT_TF_FORMAT_MAX) {
+		/* DEFAULT is "on", which is invalid at runtime.
+		NOTE(review): the message names innodb_file_format although
+		this callback serves innodb_file_format_check; the text is
+		left as it is because it is user-visible output. */
+		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
+				    ER_WRONG_ARGUMENTS,
+				    "Ignoring SET innodb_file_format=%s",
+				    requested);
+		return;
+	}
+
+	const char**	name_out = static_cast<const char**>(var_ptr);
+
+	/* Update the max format id in the system tablespace. */
+	if (!trx_sys_file_format_max_set(id, name_out)) {
+
+		return;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" [Info] InnoDB: the file format in the system "
+		"tablespace is now set to %s.\n", *name_out);
+}
+
+/****************************************************************//**
+Update callback for the system variable innodb_adaptive_hash_index;
+registered with MySQL. The "saved" value is read as a my_bool: a non-zero
+value calls btr_search_enable(), zero calls btr_search_disable(). */
+static
+void
+innodb_adaptive_hash_index_update(
+/*==============================*/
+	THD*				thd,		/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,		/*!< in: pointer to
+							system variable */
+	void*				var_ptr,	/*!< out: where the
+							formal string goes */
+	const void*			save)		/*!< in: immediate result
+							from check function */
+{
+	const my_bool	enable = *static_cast<const my_bool*>(save);
+
+	if (!enable) {
+		btr_search_disable();
+		return;
+	}
+
+	btr_search_enable();
+}
+
+/****************************************************************//**
+Update callback for the system variable innodb_old_blocks_pct; registered
+with MySQL. The "saved" value is read as a uint and passed to
+buf_LRU_old_ratio_update(); the value returned by that call is stored in
+innobase_old_blocks_pct. */
+static
+void
+innodb_old_blocks_pct_update(
+/*=========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	const uint	requested_pct = *static_cast<const uint*>(save);
+
+	innobase_old_blocks_pct = buf_LRU_old_ratio_update(requested_pct,
+							   TRUE);
+}
+
+/*************************************************************//**
+Looks up a change buffering option name in the
+innobase_change_buffering_values[] array, comparing with
+innobase_strcasecmp(). The index of the matching entry is the
+ibuf_use_t value.
+@return	corresponding IBUF_USE_* value for the input variable
+name, or IBUF_USE_COUNT if not able to find a match */
+static
+ibuf_use_t
+innodb_find_change_buffering_value(
+/*===============================*/
+	const char*	input_name)	/*!< in: input change buffering
+					option name */
+{
+	const ulint	n_values = UT_ARR_SIZE(
+		innobase_change_buffering_values);
+	ulint		idx = 0;
+
+	while (idx < n_values) {
+		if (innobase_strcasecmp(
+			input_name,
+			innobase_change_buffering_values[idx]) == 0) {
+
+			/* The array index doubles as the enum value. */
+			return((ibuf_use_t) idx);
+		}
+
+		idx++;
+	}
+
+	/* No entry carries this name. */
+	return(IBUF_USE_COUNT);
+}
+
+/*************************************************************//**
+Check callback for innodb_change_buffering; registered with MySQL.
+The incoming string is looked up with
+innodb_find_change_buffering_value(). On success *save receives the
+matching entry of innobase_change_buffering_values[]; on failure *save
+is not written.
+@return	0 for valid innodb_change_buffering, 1 otherwise */
+static
+int
+innodb_change_buffering_validate(
+/*=============================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to system
+						variable */
+	void*				save,	/*!< out: immediate result
+						for update function */
+	struct st_mysql_value*		value)	/*!< in: incoming string */
+{
+	char		buff[STRING_BUFFER_USUAL_SIZE];
+	int		len = sizeof(buff);
+
+	ut_a(save != NULL);
+	ut_a(value != NULL);
+
+	const char*	input = value->val_str(value, buff, &len);
+
+	if (input == NULL) {
+		/* No string value was supplied. */
+		return(1);
+	}
+
+	const ibuf_use_t	use = innodb_find_change_buffering_value(
+		input);
+
+	if (use == IBUF_USE_COUNT) {
+		/* No corresponding change buffering option for the user
+		supplied value. */
+		return(1);
+	}
+
+	/* Hand over the matching change_buffering option value. */
+	*static_cast<const char**>(save) =
+		innobase_change_buffering_values[use];
+
+	return(0);
+}
+
+/****************************************************************//**
+Update callback for the system variable innodb_change_buffering;
+registered with MySQL. The "saved" name is resolved with
+innodb_find_change_buffering_value() (it is asserted to be a valid
+option), ibuf_use is set accordingly, and *var_ptr is pointed at the
+saved name. */
+static
+void
+innodb_change_buffering_update(
+/*===========================*/
+	THD*				thd,	/*!< in: thread handle */
+	struct st_mysql_sys_var*	var,	/*!< in: pointer to
+						system variable */
+	void*				var_ptr,/*!< out: where the
+						formal string goes */
+	const void*			save)	/*!< in: immediate result
+						from check function */
+{
+	ut_a(var_ptr != NULL);
+	ut_a(save != NULL);
+
+	const char*		option_name =
+		*static_cast<const char*const*>(save);
+	const ibuf_use_t	use =
+		innodb_find_change_buffering_value(option_name);
+
+	ut_a(use < IBUF_USE_COUNT);
+
+	ibuf_use = use;
+	*static_cast<const char**>(var_ptr) = option_name;
+}
+
+/****************************************************************//**
+Callback that fills in a SHOW_VAR entry for the InnoDB status variables.
+It calls innodb_export_status() first and then makes the entry a
+SHOW_ARRAY that points at innodb_status_variables.
+@return	always 0 */
+static int show_innodb_vars(THD *thd, SHOW_VAR *var, char *buff)
+{
+  innodb_export_status();
+
+  var->value= (char *) &innodb_status_variables;
+  var->type= SHOW_ARRAY;
+
+  return 0;
+}
+
+/*******************************************************************//**
+Compares the name of each index to be created with the reserved name of
+the system default primary index (innobase_index_reserve_name, i.e.
+'GEN_CLUST_INDEX' according to the original description), using
+innobase_strcasecmp(). On the first match a warning is pushed to the
+client, my_error() is raised with ER_WRONG_NAME_FOR_INDEX, and the search
+stops.
+@return true if an index name matches the reserved name */
+extern "C" UNIV_INTERN
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+	const trx_t*	trx,		/*!< in: InnoDB transaction handle */
+	const KEY*	key_info,	/*!< in: Indexes to be created */
+	ulint		num_of_keys)	/*!< in: Number of indexes to
+					be created. */
+{
+	uint	idx = 0;	/* index number */
+
+	while (idx < num_of_keys) {
+		const KEY*	candidate = &key_info[idx];
+
+		idx++;
+
+		if (innobase_strcasecmp(candidate->name,
+					innobase_index_reserve_name) != 0) {
+			/* An ordinary name; look at the next index. */
+			continue;
+		}
+
+		/* Push warning to mysql */
+		push_warning_printf((THD*) trx->mysql_thd,
+				    MYSQL_ERROR::WARN_LEVEL_WARN,
+				    ER_WRONG_NAME_FOR_INDEX,
+				    "Cannot Create Index with name "
+				    "'%s'. The name is reserved "
+				    "for the system default primary "
+				    "index.",
+				    innobase_index_reserve_name);
+
+		my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+			 innobase_index_reserve_name);
+
+		return(true);
+	}
+
+	return(false);
+}
+
+/* Status variable table of this file: a single SHOW_FUNC entry named
+"Innodb" whose value is the show_innodb_vars() callback, followed by the
+NullS terminator entry. */
+static SHOW_VAR innodb_status_variables_export[]= {
+  {"Innodb",                   (char*) &show_innodb_vars, SHOW_FUNC},
+  {NullS, NullS, SHOW_LONG}
+};
+
+/* Storage engine descriptor; its first member is initialised with
+MYSQL_HANDLERTON_INTERFACE_VERSION. */
+static struct st_mysql_storage_engine innobase_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+/* plugin options */
+
+/* Each declaration below passes, in this order: the option name, the
+variable that backs it, the PLUGIN_VAR_* flags, the help text, two callback
+slots (NULL where none is registered) and the default value. Numeric
+options have three more trailing arguments.
+NOTE(review): per the MySQL plugin API the two callbacks are the check and
+update functions, and the numeric tail is minimum, maximum and block size -
+confirm against include/mysql/plugin.h of this tree. */
+
+/* Boolean, default TRUE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(checksums, innobase_use_checksums,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable InnoDB checksums validation (enabled by default). "
+  "Disable with --skip-innodb-checksums.",
+  NULL, NULL, TRUE);
+
+/* Boolean, default FALSE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(fast_checksum, innobase_fast_checksum,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Change the algorithm of checksum for the whole of datapage to 4-bytes word based. "
+  "The original checksum is checked after the new one. It may be slow for reading page"
+  " which has orginal checksum. Overwrite the page or recreate the InnoDB database, "
+  "if you want the entire benefit for performance at once. "
+  "#### Attention: The checksum is not compatible for normal or disabled version! ####",
+  NULL, NULL, FALSE);
+
+/* Numeric tail: 1 << 14, 1 << 12, 1 << UNIV_PAGE_SIZE_SHIFT_MAX, 0. */
+static MYSQL_SYSVAR_ULONG(page_size, innobase_page_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "###EXPERIMENTAL###: The universal page size of the database. Changing for created database is not supported. Use on your own risk!",
+  NULL, NULL, (1 << 14), (1 << 12), (1 << UNIV_PAGE_SIZE_SHIFT_MAX), 0);
+
+/* String, default NULL, no callbacks. */
+static MYSQL_SYSVAR_STR(data_home_dir, innobase_data_home_dir,
+  PLUGIN_VAR_READONLY,
+  "The common part for InnoDB table spaces.",
+  NULL, NULL, NULL);
+
+/* Boolean, default FALSE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(extra_undoslots, innobase_extra_undoslots,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable to use about 4000 undo slots instead of default 1024. "
+  "#### Attention: Once you enable this parameter, "
+  "don't use the datafile for normal mysqld or ibbackup! ####",
+  NULL, NULL, FALSE);
+
+/* Kept although the help text declares the option obsolete. */
+static MYSQL_SYSVAR_BOOL(fast_recovery, innobase_fast_recovery,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "obsolete option. affects nothing.",
+  NULL, NULL, FALSE);
+
+/* Boolean, default FALSE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(recovery_stats, innobase_recovery_stats,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Output statistics of recovery process after it.",
+  NULL, NULL, FALSE);
+
+/* Numeric tail: 1, 0, 64, 0. Backed directly by srv_use_purge_thread. */
+static MYSQL_SYSVAR_ULINT(use_purge_thread, srv_use_purge_thread,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of purge devoted threads. #### over 1 is EXPERIMENTAL ####",
+  NULL, NULL, 1, 0, 64, 0);
+
+/* Boolean, default FALSE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(overwrite_relay_log_info, innobase_overwrite_relay_log_info,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "During InnoDB crash recovery on slave overwrite relay-log.info "
+  "to align master log file position if information in InnoDB and relay-log.info is different.",
+  NULL, NULL, FALSE);
+
+/* Boolean, default TRUE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(doublewrite, innobase_use_doublewrite,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable InnoDB doublewrite buffer (enabled by default). "
+  "Disable with --skip-innodb-doublewrite.",
+  NULL, NULL, TRUE);
+
+/* The declarations below do not carry PLUGIN_VAR_READONLY, except
+flush_method and locks_unsafe_for_binlog. */
+
+/* Numeric tail: 200, 100, ~0L, 0. */
+static MYSQL_SYSVAR_ULONG(io_capacity, srv_io_capacity,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of IOPs the server can do. Tunes the background IO rate",
+  NULL, NULL, 200, 100, ~0L, 0);
+
+/* The help text and the fourth numeric argument depend on IF_NETWARE():
+the value 2 is offered only when the second alternative is selected. */
+static MYSQL_SYSVAR_ULONG(fast_shutdown, innobase_fast_shutdown,
+  PLUGIN_VAR_OPCMDARG,
+  "Speeds up the shutdown process of the InnoDB storage engine. Possible "
+  "values are 0, 1 (faster)"
+  /*
+    NetWare can't close unclosed files, can't automatically kill remaining
+    threads, etc, so on this OS we disable the crash-like InnoDB shutdown.
+  */
+  IF_NETWARE("", " or 2 (fastest - crash-like)")
+  ".",
+  NULL, NULL, 1, 0, IF_NETWARE(1,2), 0);
+
+/* Boolean, default FALSE, backed directly by srv_file_per_table. */
+static MYSQL_SYSVAR_BOOL(file_per_table, srv_file_per_table,
+  PLUGIN_VAR_NOCMDARG,
+  "Stores each InnoDB table to an .ibd file in the database dir.",
+  NULL, NULL, FALSE);
+
+/* Registers innodb_file_format_name_validate() and
+innodb_file_format_name_update() as callbacks; default "Antelope". */
+static MYSQL_SYSVAR_STR(file_format, innobase_file_format_name,
+  PLUGIN_VAR_RQCMDARG,
+  "File format to use for new tables in .ibd files.",
+  innodb_file_format_name_validate,
+  innodb_file_format_name_update, "Antelope");
+
+/* If a new file format is introduced, the file format
+name needs to be updated accordingly. Please refer to
+file_format_name_map[] defined in trx0sys.c for the next
+file format name. */
+/* Registers innodb_file_format_check_validate() and
+innodb_file_format_check_update() as callbacks; default "Barracuda". */
+static MYSQL_SYSVAR_STR(file_format_check, innobase_file_format_check,
+  PLUGIN_VAR_OPCMDARG,
+  "The highest file format in the tablespace.",
+  innodb_file_format_check_validate,
+  innodb_file_format_check_update, "Barracuda");
+
+/* Numeric tail: 1, 0, 2, 0; the three values are explained in the help
+text. */
+static MYSQL_SYSVAR_ULONG(flush_log_at_trx_commit, srv_flush_log_at_trx_commit,
+  PLUGIN_VAR_OPCMDARG,
+  "Set to 0 (write and flush once per second),"
+  " 1 (write and flush at each commit)"
+  " or 2 (write at commit, flush once per second).",
+  NULL, NULL, 1, 0, 2, 0);
+
+/* String, default NULL, no callbacks. */
+static MYSQL_SYSVAR_STR(flush_method, innobase_file_flush_method,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "With which method to flush data.", NULL, NULL, NULL);
+
+/* Boolean, default FALSE, no callbacks. */
+static MYSQL_SYSVAR_BOOL(locks_unsafe_for_binlog, innobase_locks_unsafe_for_binlog,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Force InnoDB to not use next-key locking, to use only row-level locking.",
+  NULL, NULL, FALSE);
+
+/* Declared as ULONG although the numeric tail (0, 0, 1, 0) only admits
+0 and 1. */
+static MYSQL_SYSVAR_ULONG(show_verbose_locks, srv_show_verbose_locks,
+  PLUGIN_VAR_OPCMDARG,
+  "Whether to show records locked in SHOW INNODB STATUS.",
+  NULL, NULL, 0, 0, 1, 0);
+
+/* Numeric tail: 10, 0, 1000, 0. */
+static MYSQL_SYSVAR_ULONG(show_locks_held, srv_show_locks_held,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.",
+  NULL, NULL, 10, 0, 1000, 0);
+
+#ifdef UNIV_LOG_ARCHIVE
+static MYSQL_SYSVAR_STR(log_arch_dir, innobase_log_arch_dir,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Where full logs should be archived.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_BOOL(log_archive, innobase_log_archive,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Set to 1 if you want to have logs archived.", NULL, NULL, FALSE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+static MYSQL_SYSVAR_STR(log_group_home_dir, innobase_log_group_home_dir,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Path to InnoDB log files.", NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_ULONG(max_dirty_pages_pct, srv_max_buf_pool_modified_pct,
+  PLUGIN_VAR_RQCMDARG,
+  "Percentage of dirty pages allowed in bufferpool.",
+  NULL, NULL, 75, 0, 99, 0);
+
+static MYSQL_SYSVAR_BOOL(adaptive_flushing, srv_adaptive_flushing,
+  PLUGIN_VAR_NOCMDARG,
+  "Attempt flushing dirty pages to avoid IO bursts at checkpoints.",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(max_purge_lag, srv_max_purge_lag,
+  PLUGIN_VAR_RQCMDARG,
+  "Desired maximum length of the purge queue (0 = no limit)",
+  NULL, NULL, 0, 0, ~0L, 0);
+
+static MYSQL_SYSVAR_BOOL(rollback_on_timeout, innobase_rollback_on_timeout,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
+  "Roll back the complete transaction on lock wait timeout, for 4.x compatibility (disabled by default)",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(status_file, innobase_create_status_file,
+  PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_NOSYSVAR,
+  "Enable SHOW INNODB STATUS output in the innodb_status.<pid> file",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(stats_on_metadata, innobase_stats_on_metadata,
+  PLUGIN_VAR_OPCMDARG,
+  "Enable statistics gathering for metadata commands such as SHOW TABLE STATUS (on by default)",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONGLONG(stats_sample_pages, srv_stats_sample_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "The number of index pages to sample when calculating statistics (default 8)",
+  NULL, NULL, 8, 1, ~0ULL, 0);
+
+const char *innobase_stats_method_names[]=
+{
+  "nulls_equal",
+  "nulls_unequal",
+  "nulls_ignored",
+  NullS
+};
+TYPELIB innobase_stats_method_typelib=
+{
+  array_elements(innobase_stats_method_names) - 1, "innobase_stats_method_typelib",
+  innobase_stats_method_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(stats_method, srv_stats_method,
+  PLUGIN_VAR_RQCMDARG, /* accepted names come from innobase_stats_method_typelib */
+  "Specifies how InnoDB index statistics collection code should treat NULLs. "
+  "Possible values of name are the same as for 'myisam_stats_method'. "
+  "This is startup parameter.",
+  NULL, NULL, 0, &innobase_stats_method_typelib);
+
+static MYSQL_SYSVAR_ULONG(stats_auto_update, srv_stats_auto_update,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable InnoDB's auto update statistics of indexes. "
+  "(except for ANALYZE TABLE command) 0:disable 1:enable",
+  NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULINT(stats_update_need_lock, srv_stats_update_need_lock,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable InnoDB's update statistics which needs to lock dictionary. "
+  "e.g. Data_free.",
+  NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_BOOL(use_sys_stats_table, innobase_use_sys_stats_table,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable to use SYS_STATS system table to store statistics statically, "
+  "And avoids to calculate statistics at every first open of the tables. "
+  "This option may make the opportunities of update statistics less. "
+  "So you should use ANALYZE TABLE command intentionally.",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(adaptive_hash_index, btr_search_enabled,
+  PLUGIN_VAR_OPCMDARG,
+  "Enable InnoDB adaptive hash index (enabled by default).  "
+  "Disable with --skip-innodb-adaptive-hash-index.",
+  NULL, innodb_adaptive_hash_index_update, TRUE);
+
+static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
+  PLUGIN_VAR_RQCMDARG,
+  "Replication thread delay (ms) on the slave server if "
+  "innodb_thread_concurrency is reached (0 by default)",
+  NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Size of a memory pool InnoDB uses to store data dictionary information and other internal data structures.",
+  NULL, NULL, 8*1024*1024L, 512*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_ULONG(autoextend_increment, srv_auto_extend_increment,
+  PLUGIN_VAR_RQCMDARG,
+  "Data file autoextend increment in megabytes",
+  NULL, NULL, 8L, 1L, 1000L, 0);
+
+static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
+  NULL, NULL, 128*1024*1024L, 32*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+static MYSQL_SYSVAR_UINT(buffer_pool_shm_key, srv_buffer_pool_shm_key,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "[experimental] The key value of shared memory segment for the buffer pool. 0 (default) disables the feature.",
+  NULL, NULL, 0, 0, INT_MAX32, 0);
+
+static MYSQL_SYSVAR_BOOL(buffer_pool_shm_checksum, innobase_buffer_pool_shm_checksum,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Enable buffer_pool_shm checksum validation (enabled by default).",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_ULONG(commit_concurrency, innobase_commit_concurrency,
+  PLUGIN_VAR_RQCMDARG,
+  "Helps in performance tuning in heavily concurrent environments.",
+  innobase_commit_concurrency_validate, NULL, 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(concurrency_tickets, srv_n_free_tickets_to_enter,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket",
+  NULL, NULL, 500L, 1L, ~0L, 0);
+
+static MYSQL_SYSVAR_LONG(file_io_threads, innobase_file_io_threads,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
+  "Number of file I/O threads in InnoDB.",
+  NULL, NULL, 4, 4, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(read_io_threads, innobase_read_io_threads,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of background read I/O threads in InnoDB.",
+  NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_ULONG(write_io_threads, innobase_write_io_threads,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of background write I/O threads in InnoDB.",
+  NULL, NULL, 4, 1, 64, 0);
+
+static MYSQL_SYSVAR_LONG(force_recovery, innobase_force_recovery,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Helps to save your data in case the disk image of the database becomes corrupt.",
+  NULL, NULL, 0, 0, 6, 0);
+
+static MYSQL_SYSVAR_LONG(log_buffer_size, innobase_log_buffer_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "The size of the buffer which InnoDB uses to write log to the log files on disk.",
+  NULL, NULL, 8*1024*1024L, 256*1024L, LONG_MAX, 1024);
+
+static MYSQL_SYSVAR_LONGLONG(log_file_size, innobase_log_file_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Size of each log file in a log group.",
+  NULL, NULL, 5*1024*1024L, 1*1024*1024L, LONGLONG_MAX, 1024*1024L);
+
+static MYSQL_SYSVAR_LONG(log_files_in_group, innobase_log_files_in_group,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of log files in the log group. InnoDB writes to the files in a circular fashion. Value 3 is recommended here.",
+  NULL, NULL, 2, 2, 100, 0);
+
+static MYSQL_SYSVAR_LONG(mirrored_log_groups, innobase_mirrored_log_groups,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of identical copies of log groups we keep for the database. Currently this should be set to 1.",
+  NULL, NULL, 1, 1, 10, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_pct, innobase_old_blocks_pct,
+  PLUGIN_VAR_RQCMDARG,
+  "Percentage of the buffer pool to reserve for 'old' blocks.",
+  NULL, innodb_old_blocks_pct_update, 100 * 3 / 8, 5, 95, 0);
+
+static MYSQL_SYSVAR_UINT(old_blocks_time, buf_LRU_old_threshold_ms,
+  PLUGIN_VAR_RQCMDARG,
+  "Move blocks to the 'new' end of the buffer pool if the first access"
+  " was at least this many milliseconds ago."
+  " The timeout is disabled if 0 (the default).",
+  NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static MYSQL_SYSVAR_LONG(open_files, innobase_open_files,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "How many files at the maximum InnoDB keeps open at the same time.",
+  NULL, NULL, 300L, 10L, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(sync_spin_loops, srv_n_spin_wait_rounds,
+  PLUGIN_VAR_RQCMDARG,
+  "Count of spin-loop rounds in InnoDB mutexes (30 by default)",
+  NULL, NULL, 30L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_ULONG(spin_wait_delay, srv_spin_wait_delay,
+  PLUGIN_VAR_OPCMDARG,
+  "Maximum delay between polling for a spin lock (6 by default)",
+  NULL, NULL, 6L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_BOOL(thread_concurrency_timer_based,
+  innobase_thread_concurrency_timer_based,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Use InnoDB timer based concurrency throttling. ",
+  NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_ULONG(thread_concurrency, srv_thread_concurrency,
+  PLUGIN_VAR_RQCMDARG,
+  "Helps in performance tuning in heavily concurrent environments. Sets the maximum number of threads allowed inside InnoDB. Value 0 will disable the thread throttling.",
+  NULL, NULL, 0, 0, 1000, 0);
+
+static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay,
+  PLUGIN_VAR_RQCMDARG,
+  "Time of innodb thread sleeping before joining InnoDB queue (usec). Value 0 disable a sleep",
+  NULL, NULL, 10000L, 0L, ~0L, 0);
+
+static MYSQL_SYSVAR_STR(data_file_path, innobase_data_file_path,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Path to individual files and their sizes.",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_STR(doublewrite_file, innobase_doublewrite_file,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, /* quotes in the help text are escaped; a bare "" would split the literal and drop them */
+  "Path to special datafile for doublewrite buffer. (default is \"\": not used) ### ONLY FOR EXPERTS!!! ###",
+  NULL, NULL, NULL);
+
+static MYSQL_SYSVAR_LONG(autoinc_lock_mode, innobase_autoinc_lock_mode,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "The AUTOINC lock modes supported by InnoDB:               "
+  "0 => Old style AUTOINC locking (for backward"
+  " compatibility)                                           "
+  "1 => New style AUTOINC locking                            "
+  "2 => No AUTOINC locking (unsafe for SBR)",
+  NULL, NULL,
+  AUTOINC_NEW_STYLE_LOCKING,	/* Default setting */
+  AUTOINC_OLD_STYLE_LOCKING,	/* Minimum value */
+  AUTOINC_NO_LOCKING, 0);	/* Maximum value */
+
+static MYSQL_SYSVAR_STR(version, innodb_version_str,
+  PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
+  "Percona-InnoDB-plugin version", NULL, NULL, INNODB_VERSION_STR);
+
+static MYSQL_SYSVAR_BOOL(use_sys_malloc, srv_use_sys_malloc,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Use OS memory allocator instead of InnoDB's internal memory allocator",
+  NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_STR(change_buffering, innobase_change_buffering,
+  PLUGIN_VAR_RQCMDARG,
+  "Buffer changes to reduce random access: "
+  "OFF, ON, none, inserts.",
+  innodb_change_buffering_validate,
+  innodb_change_buffering_update, "inserts"); 
+
+static MYSQL_SYSVAR_ULONG(read_ahead_threshold, srv_read_ahead_threshold,
+  PLUGIN_VAR_RQCMDARG,
+  "Number of pages that must be accessed sequentially for InnoDB to "
+  "trigger a readahead.",
+  NULL, NULL, 56, 0, 64, 0);
+
+static MYSQL_SYSVAR_LONGLONG(ibuf_max_size, srv_ibuf_max_size,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "The maximum size of the insert buffer. (in bytes)",
+  NULL, NULL, LONGLONG_MAX, 0, LONGLONG_MAX, 0);
+
+static MYSQL_SYSVAR_ULONG(ibuf_active_contract, srv_ibuf_active_contract,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
+  NULL, NULL, 1, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(ibuf_accel_rate, srv_ibuf_accel_rate,
+  PLUGIN_VAR_RQCMDARG,
+  "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
+  NULL, NULL, 100, 100, 999999999, 0);
+
+static MYSQL_SYSVAR_ULINT(checkpoint_age_target, srv_checkpoint_age_target,
+  PLUGIN_VAR_RQCMDARG,
+  "Control soft limit of checkpoint age. (0 : not control)",
+  NULL, NULL, 0, 0, ~0UL, 0);
+
+static MYSQL_SYSVAR_ULONG(flush_neighbor_pages, srv_flush_neighbor_pages,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
+  NULL, NULL, 1, 0, 1, 0);
+
+/* Update callback of the read_ahead sysvar: stores the saved value masked
+with 3, so the compatibility names "0".."3" (4..7) land on 0..3. */
+static void
+innodb_read_ahead_update(
+  THD* thd, struct st_mysql_sys_var* var,
+  void* var_ptr, const void* save)
+{
+  const long requested= *(long *)save;
+  *(long *)var_ptr= requested & 3;
+}
+const char *read_ahead_names[]=
+{
+  "none", /* 0 */
+  "random",
+  "linear",
+  "both", /* 3 */
+  /* For compatibility of the older patch */
+  "0", /* 4 ("none" + 4) */
+  "1",
+  "2",
+  "3", /* 7 ("both" + 4) */
+  NullS
+};
+TYPELIB read_ahead_typelib=
+{
+  array_elements(read_ahead_names) - 1, "read_ahead_typelib",
+  read_ahead_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(read_ahead, srv_read_ahead,
+  PLUGIN_VAR_RQCMDARG,
+  "Control read ahead activity (none, random, [linear], both). [from 1.0.5: random read ahead is ignored]",
+  NULL, innodb_read_ahead_update, 2, &read_ahead_typelib);
+
+/* Update callback of the adaptive_checkpoint sysvar: stores the saved value
+modulo 3, so the compatibility names "0".."2" (3..5) land on 0..2. */
+static void
+innodb_adaptive_checkpoint_update(
+  THD* thd, struct st_mysql_sys_var* var,
+  void* var_ptr, const void* save)
+{
+  const long requested= *(long *)save;
+  *(long *)var_ptr= requested % 3;
+}
+const char *adaptive_checkpoint_names[]=
+{
+  "none", /* 0 */
+  "reflex", /* 1 */
+  "estimate", /* 2 */
+  /* For compatibility of the older patch */
+  "0", /* 3 ("none" + 3) */
+  "1", /* 4 ("reflex" + 3) */
+  "2", /* 5 ("estimate" + 3) */
+  NullS
+};
+TYPELIB adaptive_checkpoint_typelib=
+{
+  array_elements(adaptive_checkpoint_names) - 1, "adaptive_checkpoint_typelib",
+  adaptive_checkpoint_names, NULL
+};
+static MYSQL_SYSVAR_ENUM(adaptive_checkpoint, srv_adaptive_checkpoint,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable flushing along modified age. (none, reflex, [estimate])",
+  NULL, innodb_adaptive_checkpoint_update, 2, &adaptive_checkpoint_typelib);
+
+static MYSQL_SYSVAR_ULONG(enable_unsafe_group_commit, srv_enable_unsafe_group_commit,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
+  NULL, NULL, 0, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(expand_import, srv_expand_import,
+  PLUGIN_VAR_RQCMDARG,
+  "Enable/Disable converting automatically *.ibd files when import tablespace.",
+  NULL, NULL, 0, 0, 1, 0);
+
+static MYSQL_SYSVAR_ULONG(extra_rsegments, srv_extra_rsegments,
+  PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
+  "Number of extra user rollback segments when create new database.",
+  NULL, NULL, 0, 0, 126, 0);
+
+static MYSQL_SYSVAR_ULONG(dict_size_limit, srv_dict_size_limit,
+  PLUGIN_VAR_RQCMDARG,
+  "Limit the allocated memory for dictionary cache. (0: unlimited)",
+  NULL, NULL, 0, 0, LONG_MAX, 0);
+
+static MYSQL_SYSVAR_UINT(auto_lru_dump, srv_auto_lru_dump,
+  PLUGIN_VAR_RQCMDARG,
+  "Time in seconds between automatic buffer pool dumps. "
+  "0 (the default) disables automatic dumps.",
+  NULL, NULL, 0, 0, UINT_MAX32, 0);
+
+static	MYSQL_SYSVAR_ULINT(pass_corrupt_table, srv_pass_corrupt_table,
+  PLUGIN_VAR_RQCMDARG, /* default 0, allowed range 0..1 (see last line) */
+  "Pass corruptions of user tables as 'corrupt table' instead of crashing itself, "
+  "when used with file_per_table. "
+  "All file io for the datafile after detected as corrupt are disabled, "
+  "except for the deletion.",
+  NULL, NULL, 0, 0, 1, 0);
+
+static struct st_mysql_sys_var* innobase_system_variables[]= {
+  MYSQL_SYSVAR(page_size),
+  MYSQL_SYSVAR(additional_mem_pool_size),
+  MYSQL_SYSVAR(autoextend_increment),
+  MYSQL_SYSVAR(buffer_pool_size),
+  MYSQL_SYSVAR(buffer_pool_shm_key),
+  MYSQL_SYSVAR(buffer_pool_shm_checksum),
+  MYSQL_SYSVAR(checksums),
+  MYSQL_SYSVAR(fast_checksum),
+  MYSQL_SYSVAR(commit_concurrency),
+  MYSQL_SYSVAR(concurrency_tickets),
+  MYSQL_SYSVAR(data_file_path),
+  MYSQL_SYSVAR(doublewrite_file),
+  MYSQL_SYSVAR(data_home_dir),
+  MYSQL_SYSVAR(doublewrite),
+  MYSQL_SYSVAR(extra_undoslots),
+  MYSQL_SYSVAR(fast_recovery),
+  MYSQL_SYSVAR(recovery_stats),
+  MYSQL_SYSVAR(fast_shutdown),
+  MYSQL_SYSVAR(file_io_threads),
+  MYSQL_SYSVAR(read_io_threads),
+  MYSQL_SYSVAR(write_io_threads),
+  MYSQL_SYSVAR(file_per_table),
+  MYSQL_SYSVAR(file_format),
+  MYSQL_SYSVAR(file_format_check),
+  MYSQL_SYSVAR(flush_log_at_trx_commit),
+  MYSQL_SYSVAR(flush_method),
+  MYSQL_SYSVAR(force_recovery),
+  MYSQL_SYSVAR(locks_unsafe_for_binlog),
+  MYSQL_SYSVAR(lock_wait_timeout),
+#ifdef UNIV_LOG_ARCHIVE
+  MYSQL_SYSVAR(log_arch_dir),
+  MYSQL_SYSVAR(log_archive),
+#endif /* UNIV_LOG_ARCHIVE */
+  MYSQL_SYSVAR(log_buffer_size),
+  MYSQL_SYSVAR(log_file_size),
+  MYSQL_SYSVAR(log_files_in_group),
+  MYSQL_SYSVAR(log_group_home_dir),
+  MYSQL_SYSVAR(max_dirty_pages_pct),
+  MYSQL_SYSVAR(adaptive_flushing),
+  MYSQL_SYSVAR(max_purge_lag),
+  MYSQL_SYSVAR(mirrored_log_groups),
+  MYSQL_SYSVAR(old_blocks_pct),
+  MYSQL_SYSVAR(old_blocks_time),
+  MYSQL_SYSVAR(open_files),
+  MYSQL_SYSVAR(overwrite_relay_log_info),
+  MYSQL_SYSVAR(rollback_on_timeout),
+  MYSQL_SYSVAR(stats_on_metadata),
+  MYSQL_SYSVAR(stats_method),
+  MYSQL_SYSVAR(stats_auto_update),
+  MYSQL_SYSVAR(stats_update_need_lock),
+  MYSQL_SYSVAR(use_sys_stats_table),
+  MYSQL_SYSVAR(stats_sample_pages),
+  MYSQL_SYSVAR(adaptive_hash_index),
+  MYSQL_SYSVAR(replication_delay),
+  MYSQL_SYSVAR(status_file),
+  MYSQL_SYSVAR(strict_mode),
+  MYSQL_SYSVAR(support_xa),
+  MYSQL_SYSVAR(sync_spin_loops),
+  MYSQL_SYSVAR(spin_wait_delay),
+  MYSQL_SYSVAR(table_locks),
+  MYSQL_SYSVAR(thread_concurrency),
+  MYSQL_SYSVAR(thread_concurrency_timer_based),
+  MYSQL_SYSVAR(thread_sleep_delay),
+  MYSQL_SYSVAR(autoinc_lock_mode),
+  MYSQL_SYSVAR(show_verbose_locks),
+  MYSQL_SYSVAR(show_locks_held),
+  MYSQL_SYSVAR(version),
+  MYSQL_SYSVAR(ibuf_max_size),
+  MYSQL_SYSVAR(ibuf_active_contract),
+  MYSQL_SYSVAR(ibuf_accel_rate),
+  MYSQL_SYSVAR(checkpoint_age_target),
+  MYSQL_SYSVAR(flush_neighbor_pages),
+  MYSQL_SYSVAR(read_ahead),
+  MYSQL_SYSVAR(adaptive_checkpoint),
+  MYSQL_SYSVAR(flush_log_at_trx_commit_session),
+  MYSQL_SYSVAR(enable_unsafe_group_commit),
+  MYSQL_SYSVAR(expand_import),
+  MYSQL_SYSVAR(extra_rsegments),
+  MYSQL_SYSVAR(dict_size_limit),
+  MYSQL_SYSVAR(use_sys_malloc),
+  MYSQL_SYSVAR(change_buffering),
+  MYSQL_SYSVAR(read_ahead_threshold),
+  MYSQL_SYSVAR(io_capacity),
+  MYSQL_SYSVAR(auto_lru_dump),
+  MYSQL_SYSVAR(use_purge_thread),
+  MYSQL_SYSVAR(pass_corrupt_table),
+  NULL
+};
+
+mysql_declare_plugin(xtradb)
+{
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &innobase_storage_engine,
+  innobase_hton_name,
+  "Percona",
+  "Percona-XtraDB, Supports transactions, row-level locking, and foreign keys",
+  PLUGIN_LICENSE_GPL,
+  innobase_init, /* Plugin Init */
+  NULL, /* Plugin Deinit */
+  INNODB_VERSION_SHORT,
+  innodb_status_variables_export,/* status variables             */
+  innobase_system_variables, /* system variables */
+  NULL /* reserved */
+},
+i_s_innodb_rseg,
+i_s_innodb_buffer_pool_pages,
+i_s_innodb_buffer_pool_pages_index,
+i_s_innodb_buffer_pool_pages_blob,
+i_s_innodb_trx,
+i_s_innodb_locks,
+i_s_innodb_lock_waits,
+i_s_innodb_cmp,
+i_s_innodb_cmp_reset,
+i_s_innodb_cmpmem,
+i_s_innodb_cmpmem_reset,
+i_s_innodb_table_stats,
+i_s_innodb_index_stats,
+i_s_innodb_admin_command,
+i_s_innodb_sys_tables,
+i_s_innodb_sys_indexes,
+i_s_innodb_sys_stats,
+i_s_innodb_patches
+mysql_declare_plugin_end;
+maria_declare_plugin(xtradb)
+{ /* InnoDB */
+  MYSQL_STORAGE_ENGINE_PLUGIN,
+  &innobase_storage_engine,
+  innobase_hton_name,
+  "Percona",
+  "XtraDB engine based on InnoDB plugin. Supports transactions, row-level locking, and foreign keys",
+  PLUGIN_LICENSE_GPL,
+  innobase_init, /* Plugin Init */
+  NULL, /* Plugin Deinit */
+  INNODB_VERSION_SHORT,
+  innodb_status_variables_export,/* status variables             */
+  innobase_system_variables, /* system variables */
+  INNODB_VERSION_STR,         /* string version */
+  MariaDB_PLUGIN_MATURITY_STABLE /* maturity */
+},
+i_s_innodb_rseg_maria,
+i_s_innodb_buffer_pool_pages_maria,
+i_s_innodb_buffer_pool_pages_index_maria,
+i_s_innodb_buffer_pool_pages_blob_maria,
+i_s_innodb_trx_maria,
+i_s_innodb_locks_maria,
+i_s_innodb_lock_waits_maria,
+i_s_innodb_cmp_maria,
+i_s_innodb_cmp_reset_maria,
+i_s_innodb_cmpmem_maria,
+i_s_innodb_cmpmem_reset_maria,
+i_s_innodb_table_stats_maria,
+i_s_innodb_index_stats_maria,
+i_s_innodb_admin_command_maria,
+i_s_innodb_sys_tables_maria,
+i_s_innodb_sys_indexes_maria,
+i_s_innodb_sys_stats_maria,
+i_s_innodb_patches_maria
+maria_declare_plugin_end;
+
+
+/** @brief Initialize the default value of innodb_commit_concurrency.
+
+Once InnoDB is running, the innodb_commit_concurrency must not change
+from zero to nonzero. (Bug #42101)
+
+The initial default value is 0, and without this extra initialization,
+SET GLOBAL innodb_commit_concurrency=DEFAULT would set the parameter
+to 0, even if it was initially set to nonzero at the command line
+or configuration file. */
+static
+void
+innobase_commit_concurrency_init_default(void)
+/*==========================================*/
+{
+	MYSQL_SYSVAR_NAME(commit_concurrency).def_val
+		= innobase_commit_concurrency;	/* current value becomes the sysvar default */
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+typedef struct innobase_convert_name_test_struct {
+	char*		buf;		/*!< output buffer passed to innobase_convert_name() */
+	ulint		buflen;		/*!< buffer size passed for this case */
+	const char*	id;		/*!< input identifier */
+	ulint		idlen;		/*!< length passed for id */
+	void*		thd;		/*!< thd argument (NULL in all test rows) */
+	ibool		file_id;	/*!< file_id argument (TRUE in all test rows) */
+
+	const char*	expected;	/*!< text the output is compared against */
+} innobase_convert_name_test_t;
+
+void
+test_innobase_convert_name()
+{	/* Self-test for innobase_convert_name(); all output goes to stderr. */
+	char	buf[1024];
+	ulint	i;
+	/* Test rows: {buf, buflen, id, idlen, thd, file_id, expected}. */
+	innobase_convert_name_test_t test_input[] = {
+		{buf, sizeof(buf), "abcd", 4, NULL, TRUE, "\"abcd\""},
+		{buf, 7, "abcd", 4, NULL, TRUE, "\"abcd\""},
+		{buf, 6, "abcd", 4, NULL, TRUE, "\"abcd\""},
+		{buf, 5, "abcd", 4, NULL, TRUE, "\"abc\""},
+		{buf, 4, "abcd", 4, NULL, TRUE, "\"ab\""},
+
+		{buf, sizeof(buf), "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+		{buf, 9, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+		{buf, 8, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+		{buf, 7, "ab@0060cd", 9, NULL, TRUE, "\"ab`cd\""},
+		{buf, 6, "ab@0060cd", 9, NULL, TRUE, "\"ab`c\""},
+		{buf, 5, "ab@0060cd", 9, NULL, TRUE, "\"ab`\""},
+		{buf, 4, "ab@0060cd", 9, NULL, TRUE, "\"ab\""},
+
+		{buf, sizeof(buf), "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\"\"cd\""},
+		{buf, 17, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\"\"cd\""},
+		{buf, 16, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\"\"c\""},
+		{buf, 15, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\"\"\""},
+		{buf, 14, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\""},
+		{buf, 13, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#ab\""},
+		{buf, 12, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#a\""},
+		{buf, 11, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50#\""},
+		{buf, 10, "ab\"cd", 5, NULL, TRUE,
+			"\"#mysql50\""},
+
+		{buf, sizeof(buf), "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+		{buf, 9, "ab/cd", 5, NULL, TRUE, "\"ab\".\"cd\""},
+		{buf, 8, "ab/cd", 5, NULL, TRUE, "\"ab\".\"c\""},
+		{buf, 7, "ab/cd", 5, NULL, TRUE, "\"ab\".\"\""},
+		{buf, 6, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+		{buf, 5, "ab/cd", 5, NULL, TRUE, "\"ab\"."},
+		{buf, 4, "ab/cd", 5, NULL, TRUE, "\"ab\""},
+		{buf, 3, "ab/cd", 5, NULL, TRUE, "\"a\""},
+		{buf, 2, "ab/cd", 5, NULL, TRUE, "\"\""},
+		/* XXX probably "" is a better result in this case
+		{buf, 1, "ab/cd", 5, NULL, TRUE, "."},
+		*/
+		{buf, 0, "ab/cd", 5, NULL, TRUE, ""},
+	};
+	/* Run the rows in order; the first failing row ends the test. */
+	for (i = 0; i < sizeof(test_input) / sizeof(test_input[0]); i++) {
+
+		char*	end;
+		ibool	ok = TRUE;
+		size_t	res_len;
+		/* %lu expects unsigned long, so the ulint fields are cast. */
+		fprintf(stderr, "TESTING %lu, %s, %lu, %s\n",
+			(unsigned long) test_input[i].buflen,
+			test_input[i].id,
+			(unsigned long) test_input[i].idlen,
+			test_input[i].expected);
+
+		end = innobase_convert_name(
+			test_input[i].buf,
+			test_input[i].buflen,
+			test_input[i].id,
+			test_input[i].idlen,
+			test_input[i].thd,
+			test_input[i].file_id);
+
+		res_len = (size_t) (end - test_input[i].buf);
+
+		if (res_len != strlen(test_input[i].expected)) {
+
+			fprintf(stderr, "unexpected len of the result: %u, "
+				"expected: %u\n", (unsigned) res_len,
+				(unsigned) strlen(test_input[i].expected));
+			ok = FALSE;
+		}
+
+		if (memcmp(test_input[i].buf,
+			   test_input[i].expected,
+			   strlen(test_input[i].expected)) != 0
+		    || !ok) {
+
+			fprintf(stderr, "unexpected result: %.*s, "
+				"expected: %s\n", (int) res_len,
+				test_input[i].buf,
+				test_input[i].expected);
+			ok = FALSE;
+		}
+
+		if (ok) {
+			fprintf(stderr, "OK: res: %.*s\n\n", (int) res_len,
+				buf);
+		} else {
+			fprintf(stderr, "FAILED\n\n");
+			return;
+		}
+	}
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+
+/****************************************************************************
+ * DS-MRR implementation 
+ ***************************************************************************/
+
+/**
+ * Multi Range Read interface, DS-MRR calls
+ */
+
+int ha_innobase::multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                          uint n_ranges, uint mode, HANDLER_BUFFER *buf)
+{ /* Forwards every argument, plus this handler, to ds_mrr.dsmrr_init(). */
+  return ds_mrr.dsmrr_init(this, seq, seq_init_param, n_ranges, mode, buf);
+}
+
+int ha_innobase::multi_range_read_next(char **range_info)
+{ /* Returns whatever ds_mrr.dsmrr_next() returns for range_info. */
+  return ds_mrr.dsmrr_next(range_info);
+}
+
+ha_rows ha_innobase::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                                 void *seq_init_param,
+                                                 uint n_ranges, uint *bufsz,
+                                                 uint *flags,
+                                                 COST_VECT *cost)
+{
+  /* Bind DS-MRR first; see comments in ha_myisam::multi_range_read_info_const */
+  ds_mrr.init(this, table);
+
+  /* Any select lock type other than LOCK_NONE sets HA_MRR_USE_DEFAULT_IMPL. */
+  const bool lock_requested= (prebuilt->select_lock_type != LOCK_NONE);
+  if (lock_requested) { *flags |= HA_MRR_USE_DEFAULT_IMPL; }
+
+  return ds_mrr.dsmrr_info_const(keyno, seq, seq_init_param, n_ranges,
+                                 bufsz, flags, cost);
+}
+
+ha_rows ha_innobase::multi_range_read_info(uint keyno, uint n_ranges,
+                                           uint keys, uint *bufsz,
+                                           uint *flags, COST_VECT *cost)
+{
+  /* Bind the DS-MRR helper to this handler and table, then delegate to it. */
+  ds_mrr.init(this, table);
+  return ds_mrr.dsmrr_info(keyno, n_ranges, keys, bufsz, flags, cost);
+}
+
+
+
+/**
+ * Index Condition Pushdown interface implementation
+ */
+
+C_MODE_START
+
+/*
+  Index condition check function to be called from within Innobase.
+  See note on ICP_RESULT for return values description.
+*/
+
+static int index_cond_func_innodb(void *arg)
+{
+  ha_innobase *handler= static_cast<ha_innobase*>(arg);
+
+  /* Beyond end_range: caller should return HA_ERR_END_OF_FILE already */
+  if (handler->end_range && handler->compare_key2(handler->end_range) > 0)
+    return ICP_OUT_OF_RANGE;
+
+  return handler->pushed_idx_cond->val_int() ? ICP_MATCH : ICP_NO_MATCH;
+}
+
+C_MODE_END
+
+
+Item *ha_innobase::idx_cond_push(uint keyno_arg, Item* idx_cond_arg)
+{
+  /* Not pushed for the primary key or when the select lock type is not
+     LOCK_NONE: the condition is handed back unchanged. */
+  if (keyno_arg == primary_key || prebuilt->select_lock_type != LOCK_NONE)
+    return idx_cond_arg; /* Table handler will not make any checks */
+
+  in_range_check_pushed_down= TRUE;
+  pushed_idx_cond= idx_cond_arg;
+  pushed_idx_cond_keyno= keyno_arg;
+  return NULL; /* Table handler will check the entire condition */
+}
+
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
new file mode 100644
index 00000000000..50a43aaebed
--- /dev/null
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -0,0 +1,349 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, MySQL AB & Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*
+  This file is based on ha_berkeley.h of MySQL distribution
+
+  This file defines the Innodb handler: the interface between MySQL and
+  Innodb
+*/
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface			/* gcc class implementation */
+#endif
+
+/* Forward declarations of the InnoDB types named by the structures and
+the handler class below.  They are placed first so that every use that
+follows sees an already declared type name. */
+/** InnoDB B-tree index */
+typedef struct dict_index_struct dict_index_t;
+/** InnoDB table */
+typedef struct dict_table_struct dict_table_t;
+/** Prebuilt structures in an Innobase table handle used within MySQL */
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/* Structure defines translation table between mysql index and innodb
+index structures */
+typedef struct innodb_idx_translate_struct {
+	ulint		index_count;	/*!< number of valid index entries
+					in the index_mapping array */
+	ulint		array_size;	/*!< array size of index_mapping */
+	dict_index_t**	index_mapping;	/*!< index pointer array directly
+					maps to index in Innodb from MySQL
+					array index */
+} innodb_idx_translate_t;
+
+
+/** InnoDB table share */
+typedef struct st_innobase_share {
+	THR_LOCK		lock;		/*!< MySQL lock protecting
+						this structure */
+	const char*		table_name;	/*!< InnoDB table name */
+	uint			use_count;	/*!< reference count,
+						incremented in get_share()
+						and decremented in
+						free_share() */
+	void*			table_name_hash;/*!< hash table chain node */
+	innodb_idx_translate_t	idx_trans_tbl;	/*!< index translation
+						table between MySQL and
+						Innodb */
+	dict_table_t*		ib_table;	/*!< InnoDB table; NOTE(review):
+						confirm who sets and reads it */
+} INNOBASE_SHARE;
+
+/** The class defining a handle to an Innodb table */
+class ha_innobase: public handler
+{
+	row_prebuilt_t*	prebuilt;	/*!< prebuilt struct in InnoDB, used
+					to save CPU time with prebuilt data
+					structures*/
+	THD*		user_thd;	/*!< the thread handle of the user
+					currently using the handle; this is
+					set in external_lock function */
+	THR_LOCK_DATA	lock;
+	INNOBASE_SHARE*	share;		/*!< information for MySQL
+					table locking */
+
+	uchar*		upd_buff;	/*!< buffer used in updates */
+	uchar*		key_val_buff;	/*!< buffer used in converting
+					search key values from MySQL format
+					to Innodb format */
+	ulong		upd_and_key_val_buff_len;
+					/*!< the length of each of the previous
+					two buffers (upd_buff, key_val_buff) */
+	Table_flags	int_table_flags;
+	uint		primary_key;
+	ulong		start_of_scan;	/*!< this is set to 1 when we are
+					starting a table scan but have not
+					yet fetched any row, else 0 */
+	uint		last_match_mode;/*!< match mode of the latest search:
+					ROW_SEL_EXACT, ROW_SEL_EXACT_PREFIX,
+					or undefined */
+	uint		num_write_row;	/*!< number of write_row() calls */
+
+	uint store_key_val_for_row(uint keynr, char* buff, uint buff_len,
+                                   const uchar* record);
+	inline void update_thd(THD* thd);
+	void update_thd();
+	int change_active_index(uint keynr);
+	int general_fetch(uchar* buf, uint direction, uint match_mode);
+	ulint innobase_lock_autoinc();
+	ulonglong innobase_peek_autoinc();
+	ulint innobase_set_max_autoinc(ulonglong auto_inc);
+	ulint innobase_reset_autoinc(ulonglong auto_inc);
+	ulint innobase_get_autoinc(ulonglong* value);
+	ulint innobase_update_autoinc(ulonglong	auto_inc);
+	void innobase_initialize_autoinc();
+	dict_index_t* innobase_get_index(uint keynr);
+
+	/* Everything above is private; the public interface follows. */
+ public:
+	ha_innobase(handlerton *hton, TABLE_SHARE *table_arg);
+	~ha_innobase();
+	/*
+	  Get the row type from the storage engine.  If this method returns
+	  ROW_TYPE_NOT_USED, the information in HA_CREATE_INFO should be used.
+	*/
+	enum row_type get_row_type() const;
+
+	const char* table_type() const;
+	const char* index_type(uint key_number);
+	const char** bas_ext() const;
+	Table_flags table_flags() const;
+	ulong index_flags(uint idx, uint part, bool all_parts) const;
+	uint max_supported_keys() const;
+	uint max_supported_key_length() const;
+	uint max_supported_key_part_length() const;
+	const key_map* keys_to_use_for_scanning();
+
+	int open(const char *name, int mode, uint test_if_locked);
+	int close(void);
+	double scan_time();
+	double read_time(uint index, uint ranges, ha_rows rows);
+	bool is_corrupt() const;
+
+	int write_row(uchar * buf);
+	int update_row(const uchar * old_data, uchar * new_data);
+	int delete_row(const uchar * buf);
+	bool was_semi_consistent_read();
+	void try_semi_consistent_read(bool yes);
+	void unlock_row();
+
+	int index_init(uint index, bool sorted);
+	int index_end();
+	int index_read(uchar * buf, const uchar * key,
+		uint key_len, enum ha_rkey_function find_flag);
+	int index_read_idx(uchar * buf, uint index, const uchar * key,
+			   uint key_len, enum ha_rkey_function find_flag);
+	int index_read_last(uchar * buf, const uchar * key, uint key_len);
+	int index_next(uchar * buf);
+	int index_next_same(uchar * buf, const uchar *key, uint keylen);
+	int index_prev(uchar * buf);
+	int index_first(uchar * buf);
+	int index_last(uchar * buf);
+
+	int rnd_init(bool scan);
+	int rnd_end();
+	int rnd_next(uchar *buf);
+	int rnd_pos(uchar * buf, uchar *pos);
+
+	void position(const uchar *record);
+	int info(uint);
+	int analyze(THD* thd,HA_CHECK_OPT* check_opt);
+	int optimize(THD* thd,HA_CHECK_OPT* check_opt);
+	int discard_or_import_tablespace(my_bool discard);
+	int extra(enum ha_extra_function operation);
+        int reset();
+	int external_lock(THD *thd, int lock_type);
+	int transactional_table_lock(THD *thd, int lock_type);
+	int start_stmt(THD *thd, thr_lock_type lock_type);
+	void position(uchar *record); /* non-const overload of position() */
+	ha_rows records_in_range(uint inx, key_range *min_key, key_range
+								*max_key);
+	ha_rows estimate_rows_upper_bound();
+
+	void update_create_info(HA_CREATE_INFO* create_info);
+	int create(const char *name, register TABLE *form,
+					HA_CREATE_INFO *create_info);
+	int delete_all_rows();
+	int delete_table(const char *name);
+	int rename_table(const char* from, const char* to);
+	int check(THD* thd, HA_CHECK_OPT* check_opt);
+	char* update_table_comment(const char* comment);
+	char* get_foreign_key_create_info();
+	int get_foreign_key_list(THD *thd, List<FOREIGN_KEY_INFO> *f_key_list);
+	bool can_switch_engines();
+	uint referenced_by_foreign_key();
+	void free_foreign_key_create_info(char* str);
+	THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to,
+					enum thr_lock_type lock_type);
+	void init_table_handle_for_HANDLER();
+        virtual void get_auto_increment(ulonglong offset, ulonglong increment,
+                                        ulonglong nb_desired_values,
+                                        ulonglong *first_value,
+                                        ulonglong *nb_reserved_values);
+	int reset_auto_increment(ulonglong value);
+
+	virtual bool get_error_message(int error, String *buf);
+
+	uint8 table_cache_type();
+	/*
+	  ask handler about permission to cache table during query registration
+	*/
+	my_bool register_query_cache_table(THD *thd, char *table_key,
+					   uint key_length,
+					   qc_engine_callback *call_back,
+					   ulonglong *engine_data);
+	static char *get_mysql_bin_log_name();
+	static ulonglong get_mysql_bin_log_pos();
+	bool primary_key_is_clustered();
+	int cmp_ref(const uchar *ref1, const uchar *ref2);
+	/** Fast index creation (smart ALTER TABLE) @see handler0alter.cc @{ */
+	int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys);
+	int prepare_drop_index(TABLE *table_arg, uint *key_num,
+			       uint num_of_keys);
+	int final_drop_index(TABLE *table_arg);
+	/** @} */
+	bool check_if_incompatible_data(HA_CREATE_INFO *info,
+					uint table_changes);
+	bool check_if_supported_virtual_columns(void) { return TRUE; } /* always TRUE */
+public: /* access is already public at this point */
+  /**
+   * Multi Range Read interface
+   */
+  int multi_range_read_init(RANGE_SEQ_IF *seq, void *seq_init_param,
+                            uint n_ranges, uint mode, HANDLER_BUFFER *buf);
+  int multi_range_read_next(char **range_info);
+  ha_rows multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq,
+                                      void *seq_init_param, 
+                                      uint n_ranges, uint *bufsz,
+                                      uint *flags, COST_VECT *cost);
+  ha_rows multi_range_read_info(uint keyno, uint n_ranges, uint keys,
+                                uint *bufsz, uint *flags, COST_VECT *cost);
+  DsMrr_impl ds_mrr; /* helper that the multi_range_read_info*() methods call */
+
+  Item *idx_cond_push(uint keyno, Item* idx_cond); /* Index Condition Pushdown */
+};
+
+/* Some accessor functions which the InnoDB plugin needs, but which
+can not be added to mysql/plugin.h as part of the public interface;
+the definitions are bracketed with #ifdef INNODB_COMPATIBILITY_HOOKS */
+
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+extern "C" {
+struct charset_info_st *thd_charset(MYSQL_THD thd);
+#if MYSQL_VERSION_ID >= 50142
+LEX_STRING *thd_query_string(MYSQL_THD thd);
+#else
+char **thd_query(MYSQL_THD thd);
+#endif
+
+/** Get the file name of the MySQL binlog.
+ * @return the name of the binlog file
+ */
+const char* mysql_bin_log_file_name(void);
+
+/** Get the current position of the MySQL binlog.
+ * @return byte offset from the beginning of the binlog
+ */
+ulonglong mysql_bin_log_file_pos(void);
+
+/**
+  Check if a user thread is a replication slave thread
+  @param thd  user thread
+  @retval 0 the user thread is not a replication slave thread
+  @retval 1 the user thread is a replication slave thread
+*/
+int thd_slave_thread(const MYSQL_THD thd);
+
+/**
+  Check if a user thread is running a non-transactional update
+  @param thd  user thread
+  @retval 0 the user thread is not running a non-transactional update
+  @retval 1 the user thread is running a non-transactional update
+*/
+int thd_non_transactional_update(const MYSQL_THD thd);
+
+/**
+  Get the user thread's binary logging format
+  @param thd  user thread
+  @return Value to be used as index into the binlog_format_names array
+*/
+int thd_binlog_format(const MYSQL_THD thd);
+
+/**
+  Mark transaction to rollback and mark error as fatal to a sub-statement.
+  @param  thd   Thread handle
+  @param  all   TRUE <=> rollback main transaction.
+*/
+void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
+
+#if MYSQL_VERSION_ID > 50140
+/**
+  Check if binary logging is filtered for thread's current db.
+  @param  thd   Thread handle
+  @retval 1 the query is not filtered, 0 otherwise.
+*/
+bool thd_binlog_filter_ok(const MYSQL_THD thd);
+#endif /* MYSQL_VERSION_ID > 50140 */
+}
+
+typedef struct trx_struct trx_t;
+/********************************************************************//**
+@file handler/ha_innodb.h
+Converts an InnoDB error code to a MySQL error code and also tells to MySQL
+about a possible transaction rollback inside InnoDB caused by a lock wait
+timeout or a deadlock.
+@return	MySQL error code */
+extern "C"
+int
+convert_error_code_to_mysql(
+/*========================*/
+	int		error,	/*!< in: InnoDB error code */
+	ulint		flags,	/*!< in: InnoDB table flags, or 0 */
+	MYSQL_THD	thd);	/*!< in: user thread handle or NULL */
+
+/*********************************************************************//**
+Allocates an InnoDB transaction for a MySQL handler object.
+@return	InnoDB transaction handle */
+extern "C"
+trx_t*
+innobase_trx_allocate(
+/*==================*/
+	MYSQL_THD	thd);	/*!< in: user thread handle */
+
+
+/*********************************************************************//**
+This function checks each index name for a table against reserved
+system default primary index name 'GEN_CLUST_INDEX'. If a name
+matches, this function pushes a warning message to the client,
+and returns true. */
+extern "C"
+bool
+innobase_index_name_is_reserved(
+/*============================*/
+					/* out: true if the index name
+					matches the reserved name */
+	const trx_t*	trx,		/* in: InnoDB transaction handle */
+	const KEY*	key_info,	/* in: Indexes to be created */
+	ulint		num_of_keys);	/* in: Number of indexes to
+					be created. */
+
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
new file mode 100644
index 00000000000..3a32ed9cf36
--- /dev/null
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -0,0 +1,1243 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/handler0alter.cc
+Smart ALTER TABLE
+*******************************************************/
+
+#include <mysql_priv.h>
+#include <mysqld_error.h>
+
+extern "C" {
+#include "log0log.h"
+#include "row0merge.h"
+#include "srv0srv.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "ha_prototypes.h"
+#include "handler0alter.h"
+}
+
+#include "ha_innodb.h"
+
+/*************************************************************//**
+Copies an InnoDB column to a MySQL field.  This function is
+adapted from row_sel_field_store_in_mysql_format(). */
+static
+void
+innobase_col_to_mysql(
+/*==================*/
+	const dict_col_t*	col,	/*!< in: InnoDB column */
+	const uchar*		data,	/*!< in: InnoDB column data */
+	ulint			len,	/*!< in: length of data, in bytes */
+	Field*			field)	/*!< in/out: MySQL field */
+{
+	uchar*	ptr;
+	uchar*	dest	= field->ptr;
+	ulint	flen	= field->pack_length();
+
+	switch (col->mtype) {
+	case DATA_INT:
+		ut_ad(len == flen);
+
+		/* Convert integer data from Innobase to little-endian
+		format, sign bit restored to normal */
+
+		for (ptr = dest + len; ptr != dest; ) { /* reverse the bytes */
+			*--ptr = *data++;
+		}
+
+		if (!(field->flags & UNSIGNED_FLAG)) {
+			((byte*) dest)[len - 1] ^= 0x80; /* flip the sign bit */
+		}
+
+		break;
+
+	case DATA_VARCHAR:
+	case DATA_VARMYSQL:
+	case DATA_BINARY:
+		field->reset();
+
+		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			/* This is a >= 5.0.3 type true VARCHAR. Store the
+			length of the data to the first byte or the first
+			two bytes of dest. */
+
+			dest = row_mysql_store_true_var_len(
+				dest, len, flen - field->key_length());
+		}
+
+		/* Copy the actual data */
+		memcpy(dest, data, len);
+		break;
+
+	case DATA_BLOB:
+		/* Store a pointer to the BLOB buffer to dest: the BLOB was
+		already copied to the buffer in row_sel_store_mysql_rec */
+
+		row_mysql_store_blob_ref(dest, flen, data, len);
+		break;
+
+#ifdef UNIV_DEBUG
+	case DATA_MYSQL:
+		ut_ad(flen >= len);
+		ut_ad(col->mbmaxlen >= col->mbminlen);
+		ut_ad(col->mbmaxlen > col->mbminlen || flen == len);
+		memcpy(dest, data, len);
+		break;
+
+	default:
+	case DATA_SYS_CHILD:
+	case DATA_SYS:
+		/* These column types should never be shipped to MySQL. */
+		ut_ad(0); /* no break: continues into the cases below */
+
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+	case DATA_DECIMAL:
+		/* Above are the valid column types for MySQL data. */
+		ut_ad(flen == len); /* no break: shares the memcpy below */
+#else /* UNIV_DEBUG */
+	default:
+#endif /* UNIV_DEBUG */
+		memcpy(dest, data, len);
+	}
+}
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+extern "C" UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+	TABLE*			table,		/*!< in/out: MySQL table */
+	const rec_t*		rec,		/*!< in: record */
+	const dict_index_t*	index,		/*!< in: index */
+	const ulint*		offsets)	/*!< in: rec_get_offsets(
+						rec, index, ...) */
+{
+	const uint	n_cols	= table->s->fields;
+
+	ut_ad(n_cols == dict_table_get_n_user_cols(index->table));
+
+	for (uint col_no = 0; col_no < n_cols; col_no++) {
+		Field*	mysql_field	= table->field[col_no];
+
+		mysql_field->reset();
+
+		/* Find where the column is stored in the index record. */
+		const ulint	pos = dict_index_get_nth_col_pos(index, col_no);
+
+		if (UNIV_UNLIKELY(pos == ULINT_UNDEFINED)) {
+			/* The column is not part of the index. */
+			mysql_field->set_null();
+			continue;
+		}
+
+		ulint		data_len;
+		const uchar*	col_data
+			= rec_get_nth_field(rec, offsets, pos, &data_len);
+
+		if (data_len == UNIV_SQL_NULL) {
+			/* Assign the NULL flag */
+			ut_ad(mysql_field->real_maybe_null());
+			mysql_field->set_null();
+			continue;
+		}
+
+		mysql_field->set_notnull();
+
+		innobase_col_to_mysql(
+			dict_field_get_col(
+				dict_index_get_nth_field(index, pos)),
+			col_data, data_len, mysql_field);
+	}
+}
+
+/*************************************************************//**
+Resets table->record[0]. */
+extern "C" UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+	TABLE*			table)		/*!< in/out: MySQL table */
+{
+	const uint	n_cols	= table->s->fields;
+
+	/* Call set_default() on each of the table's fields in turn. */
+	for (uint col_no = 0; col_no < n_cols; col_no++) {
+		table->field[col_no]->set_default();
+	}
+}
+
+/******************************************************************//**
+Removes the filename encoding of a database and table name. */
+static
+void
+innobase_convert_tablename(
+/*=======================*/
+	char*	s)	/*!< in: identifier; out: decoded identifier */
+{
+	uint	errors;	/* set by strconvert(); never examined here */
+
+	char*	slash = strchr(s, '/');
+
+	if (slash) {
+		char*	t;
+		/* Temporarily replace the '/' with NUL. */
+		*slash = 0;
+		/* Convert the database name (source and target are both s). */
+		strconvert(&my_charset_filename, s, system_charset_info,
+			   s, slash - s + 1, &errors);
+
+		t = s + strlen(s);
+		ut_ad(slash >= t); /* the converted name must not be longer */
+		/* Append a  '.' after the database name. */
+		*t++ = '.';
+		slash++;
+		/* Convert the table name. */
+		strconvert(&my_charset_filename, slash, system_charset_info,
+			   t, slash - t + strlen(slash), &errors);
+	} else {
+		strconvert(&my_charset_filename, s,
+			   system_charset_info, s, strlen(s), &errors);
+	}
+}
+
+/*******************************************************************//**
+This function checks that index keys are sensible.
+@return	0 or error number */
+static
+int
+innobase_check_index_keys(
+/*======================*/
+	const KEY*		key_info,	/*!< in: Indexes to be
+						created */
+	ulint			num_of_keys,	/*!< in: Number of
+						indexes to be created */
+	const dict_table_t*	table)		/*!< in: Existing indexes */
+{
+	ulint		key_num;
+
+	ut_ad(key_info);
+	ut_ad(num_of_keys);
+
+	for (key_num = 0; key_num < num_of_keys; key_num++) {
+		const KEY&	key = key_info[key_num];
+
+		/* Check that the same index name does not appear
+		twice in indexes to be created. */
+
+		for (ulint i = 0; i < key_num; i++) {
+			const KEY&	key2 = key_info[i];
+
+			if (0 == strcmp(key.name, key2.name)) {
+				my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+					 key.name);
+
+				return(ER_WRONG_NAME_FOR_INDEX);
+			}
+		}
+
+		/* Check that the same index name does not already exist. */
+
+		for (const dict_index_t* index
+			     = dict_table_get_first_index(table);
+		     index; index = dict_table_get_next_index(index)) {
+
+			if (0 == strcmp(key.name, index->name)) {
+				my_error(ER_WRONG_NAME_FOR_INDEX, MYF(0),
+					 key.name);
+
+				return(ER_WRONG_NAME_FOR_INDEX);
+			}
+		}
+
+		/* Check that MySQL does not try to create a column
+		prefix index field on an inappropriate data type and
+		that the same column does not appear twice in the index. */
+
+		for (ulint i = 0; i < key.key_parts; i++) {
+			const KEY_PART_INFO&	key_part1
+				= key.key_part[i];
+			const Field*		field
+				= key_part1.field;
+			ibool			is_unsigned;
+
+			switch (get_innobase_type_from_mysql_type(
+					&is_unsigned, field)) {
+			default:
+				break; /* no length check for other types */
+			case DATA_INT:
+			case DATA_FLOAT:
+			case DATA_DOUBLE:
+			case DATA_DECIMAL:
+				if (field->type() == MYSQL_TYPE_VARCHAR) {
+					if (key_part1.length
+					    >= field->pack_length()
+					    - ((Field_varstring*) field)
+					    ->length_bytes) {
+						break; /* not a prefix */
+					}
+				} else {
+					if (key_part1.length
+					    >= field->pack_length()) {
+						break; /* not a prefix */
+					}
+				}
+
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+					 field->field_name);
+				return(ER_WRONG_KEY_COLUMN);
+			}
+
+			for (ulint j = 0; j < i; j++) {
+				const KEY_PART_INFO&	key_part2
+					= key.key_part[j];
+
+				if (strcmp(key_part1.field->field_name,
+					   key_part2.field->field_name)) {
+					continue; /* the names differ */
+				}
+
+				my_error(ER_WRONG_KEY_COLUMN, MYF(0),
+					 key_part1.field->field_name);
+				return(ER_WRONG_KEY_COLUMN);
+			}
+		}
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Create index field definition for key part */
+static
+void
+innobase_create_index_field_def(
+/*============================*/
+	KEY_PART_INFO*		key_part,	/*!< in: MySQL key definition */
+	mem_heap_t*		heap,		/*!< in: memory heap */
+	merge_index_field_t*	index_field)	/*!< out: index field
+						definition for key_part */
+{
+	DBUG_ENTER("innobase_create_index_field_def");
+
+	ut_ad(key_part);
+	ut_ad(index_field);
+
+	Field* const	field = key_part->field;
+	ut_a(field);
+
+	ibool		is_unsigned;
+	const ulint	col_type
+		= get_innobase_type_from_mysql_type(&is_unsigned, field);
+
+	/* A BLOB always gets a prefix.  For a true VARCHAR the length
+	bytes are left out when comparing against the key part length. */
+	bool	is_prefix = (DATA_BLOB == col_type);
+
+	if (!is_prefix && field->type() == MYSQL_TYPE_VARCHAR) {
+		is_prefix = key_part->length < field->pack_length()
+			- ((Field_varstring*) field)->length_bytes;
+	} else if (!is_prefix) {
+		is_prefix = key_part->length < field->pack_length();
+	}
+
+	/* Zero is stored when the key part is not a prefix. */
+	index_field->prefix_len = is_prefix ? key_part->length : 0;
+
+	index_field->field_name = mem_heap_strdup(heap, field->field_name);
+
+	DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Create index definition for key */
+static
+void
+innobase_create_index_def(
+/*======================*/
+	KEY*			key,		/*!< in: key definition */
+	bool			new_primary,	/*!< in: TRUE=generating
+						a new primary key
+						on the table */
+	bool			key_primary,	/*!< in: TRUE if this key
+						is a primary key */
+	merge_index_def_t*	index,		/*!< out: index definition */
+	mem_heap_t*		heap)		/*!< in: heap where memory
+						is allocated */
+{
+	const ulint	n_parts = key->key_parts;
+	const ulint	name_size = strlen(key->name) + 1; /* with NUL */
+
+	DBUG_ENTER("innobase_create_index_def");
+
+	index->fields = (merge_index_field_t*) mem_heap_alloc(
+		heap, n_parts * sizeof *index->fields);
+	index->n_fields = n_parts;
+
+	/* Unless a new primary key is being generated, the name is
+	preceded by TEMP_INDEX_PREFIX, which takes one more byte. */
+	char*	name_pos = (char*) mem_heap_alloc(
+		heap, name_size + !new_primary);
+	index->name = name_pos;
+
+	if (UNIV_LIKELY(!new_primary)) {
+		*name_pos++ = TEMP_INDEX_PREFIX;
+	}
+
+	memcpy(name_pos, key->name, name_size);
+
+	index->ind_type = 0;
+	if (key->flags & HA_NOSAME) {
+		index->ind_type |= DICT_UNIQUE;
+	}
+
+	if (key_primary) {
+		index->ind_type |= DICT_CLUSTERED;
+	}
+
+	for (ulint part = 0; part < n_parts; part++) {
+		innobase_create_index_field_def(&key->key_part[part], heap,
+						&index->fields[part]);
+	}
+
+	DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Copy index field definition */
+static
+void
+innobase_copy_index_field_def(
+/*==========================*/
+	const dict_field_t*	field,		/*!< in: definition to copy */
+	merge_index_field_t*	index_field)	/*!< out: copied definition */
+{
+	DBUG_ENTER("innobase_copy_index_field_def");
+	DBUG_ASSERT(field != NULL);
+	DBUG_ASSERT(index_field != NULL);
+
+	/* The name pointer is taken over as is; it is not duplicated. */
+	index_field->prefix_len = field->prefix_len;
+	index_field->field_name = field->name;
+	DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Copy index definition for the index */
+static
+void
+innobase_copy_index_def(
+/*====================*/
+	const dict_index_t*	index,	/*!< in: index definition to copy */
+	merge_index_def_t*	new_index,/*!< out: Index definition */
+	mem_heap_t*		heap)	/*!< in: heap where allocated */
+{
+	DBUG_ENTER("innobase_copy_index_def");
+
+	/* Only the columns that the user defined for the index are
+	copied; the columns that were added to the internal
+	representation are left out. */
+	const ulint	n_user_cols = index->n_user_defined_cols;
+
+	merge_index_field_t*	copies;
+
+	copies = (merge_index_field_t*) mem_heap_alloc(
+		heap, n_user_cols * sizeof *copies);
+
+	new_index->fields = copies;
+	new_index->n_fields = n_user_cols;
+	new_index->name = index->name;
+
+	/* When adding a PRIMARY KEY, we may convert a previous
+	clustered index to a secondary index (UNIQUE NOT NULL). */
+	new_index->ind_type = index->type & ~DICT_CLUSTERED;
+
+	for (ulint col_no = 0; col_no < n_user_cols; col_no++) {
+		innobase_copy_index_field_def(&index->fields[col_no],
+					      &copies[col_no]);
+	}
+
+	DBUG_VOID_RETURN;
+}
+
+/*******************************************************************//**
+Create an index table where indexes are ordered as follows:
+
+IF a new primary key is defined for the table THEN
+
+	1) New primary key
+	2) Original secondary indexes
+	3) New secondary indexes
+
+ELSE
+
+	1) All new indexes in the order they arrive from MySQL
+
+ENDIF
+
+
+@return	key definitions or NULL */
+static
+merge_index_def_t*
+innobase_create_key_def(
+/*====================*/
+	trx_t*		trx,		/*!< in: trx */
+	const dict_table_t*table,		/*!< in: table definition */
+	mem_heap_t*	heap,		/*!< in: heap where space for key
+					definitions are allocated */
+	KEY*		key_info,	/*!< in: Indexes to be created */
+	ulint&		n_keys)		/*!< in/out: Number of indexes to
+					be created */
+{
+	ulint			i = 0;	/* next key_info[] entry to convert */
+	merge_index_def_t*	indexdef;	/* next free output slot */
+	merge_index_def_t*	indexdefs;	/* start of the output array */
+	bool			new_primary;
+
+	DBUG_ENTER("innobase_create_key_def");
+
+	/* Reserve one slot per new key plus one per existing index. */
+	indexdef = indexdefs = (merge_index_def_t*)
+		mem_heap_alloc(heap, sizeof *indexdef
+			       * (n_keys + UT_LIST_GET_LEN(table->indexes)));
+
+	/* If there is a primary key, it is always the first index
+	defined for the table. */
+
+	new_primary = !my_strcasecmp(system_charset_info,
+				     key_info->name, "PRIMARY");
+
+	/* If there is a UNIQUE INDEX consisting entirely of NOT NULL
+	columns and if the index does not contain column prefix(es)
+	(only prefix/part of the column is indexed), MySQL will treat the
+	index as a PRIMARY KEY unless the table already has one. */
+
+	if (!new_primary && (key_info->flags & HA_NOSAME)
+	    && (!(key_info->flags & HA_KEY_HAS_PART_KEY_SEG))
+	    && row_table_got_default_clust_index(table)) {
+		uint    key_part = key_info->key_parts;
+
+		new_primary = TRUE;
+
+		while (key_part--) {
+			if (key_info->key_part[key_part].key_type
+			    & FIELDFLAG_MAYBE_NULL) {
+				new_primary = FALSE; /* a nullable key part */
+				break;
+			}
+		}
+	}
+
+	if (new_primary) {
+		const dict_index_t*	index;
+
+		/* Create the PRIMARY key index definition */
+		innobase_create_index_def(&key_info[i++], TRUE, TRUE,
+					  indexdef++, heap);
+
+		row_mysql_lock_data_dictionary(trx);
+
+		index = dict_table_get_first_index(table);
+
+		/* Copy the index definitions of the old table.  Skip
+		the old clustered index if it is a generated clustered
+		index or a PRIMARY KEY.  If the clustered index is a
+		UNIQUE INDEX, it must be converted to a secondary index. */
+
+		if (dict_index_get_nth_col(index, 0)->mtype == DATA_SYS
+		    || !my_strcasecmp(system_charset_info,
+				      index->name, "PRIMARY")) {
+			index = dict_table_get_next_index(index);
+		}
+
+		while (index) {
+			innobase_copy_index_def(index, indexdef++, heap);
+			index = dict_table_get_next_index(index);
+		}
+
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	/* Create definitions for added secondary indexes. */
+
+	while (i < n_keys) {
+		innobase_create_index_def(&key_info[i++], new_primary, FALSE,
+					  indexdef++, heap);
+	}
+
+	n_keys = indexdef - indexdefs; /* number of definitions written */
+
+	DBUG_RETURN(indexdefs);
+}
+
+/*******************************************************************//**
+Create a temporary tablename by appending "@0023" (an encoded "#") and
+the identifier id to table_name.
+@return	temporary tablename, allocated from heap */
+static
+char*
+innobase_create_temporary_tablename(
+/*================================*/
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	char		id,		/*!< in: identifier [0-9a-zA-Z] */
+	const char*     table_name)	/*!< in: table name */
+{
+	static const char	suffix[] = "@0023 "; /* "# " */
+	const ulint		base_len = strlen(table_name);
+	const ulint		id_pos = base_len + (sizeof suffix - 2);
+
+	char*	tmp_name = (char*) mem_heap_alloc(heap, base_len + sizeof suffix);
+	memcpy(tmp_name, table_name, base_len);
+	/* Copies the terminating NUL too; the blank is replaced by id. */
+	memcpy(tmp_name + base_len, suffix, sizeof suffix);
+	tmp_name[id_pos] = id;
+
+	return(tmp_name);
+}
+
+/*******************************************************************//**
+Create indexes.
+@return	0 or error number */
+UNIV_INTERN
+int
+ha_innobase::add_index(
+/*===================*/
+	TABLE*	table,		/*!< in: Table where indexes are created */
+	KEY*	key_info,	/*!< in: Indexes to be created */
+	uint	num_of_keys)	/*!< in: Number of indexes to be created */
+{
+	dict_index_t**	index;		/*!< Index to be created */
+	dict_table_t*	innodb_table;	/*!< InnoDB table in dictionary */
+	dict_table_t*	indexed_table;	/*!< Table where indexes are created */
+	merge_index_def_t* index_defs;	/*!< Index definitions */
+	mem_heap_t*     heap;		/*!< Heap for index definitions */
+	trx_t*		trx;		/*!< Transaction */
+	ulint		num_of_idx;
+	ulint		num_created	= 0;
+	ibool		dict_locked	= FALSE;
+	ulint		new_primary;
+	int		error;
+
+	DBUG_ENTER("ha_innobase::add_index");
+	ut_a(table);
+	ut_a(key_info);
+	ut_a(num_of_keys);
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
+	update_thd();
+
+	heap = mem_heap_create(1024);
+
+	/* In case MySQL calls this in the middle of a SELECT query, release
+	possible adaptive hash latch to avoid deadlocks of threads. */
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+	trx_start_if_not_started(prebuilt->trx);
+
+	/* Create a background transaction for the operations on
+	the data dictionary tables. */
+	trx = innobase_trx_allocate(user_thd);
+	trx_start_if_not_started(trx);
+
+	innodb_table = indexed_table
+		= dict_table_get(prebuilt->table->name, FALSE);
+
+	if (UNIV_UNLIKELY(!innodb_table)) {
+		error = HA_ERR_NO_SUCH_TABLE;
+		goto err_exit;
+	}
+
+	/* Check if the index name is reserved. */
+	if (innobase_index_name_is_reserved(trx, key_info, num_of_keys)) {
+                error = ER_WRONG_NAME_FOR_INDEX;
+	} else {
+		/* Check that index keys are sensible */
+		error = innobase_check_index_keys(key_info, num_of_keys,
+						  innodb_table);
+	}
+
+	if (UNIV_UNLIKELY(error)) {
+err_exit:
+		mem_heap_free(heap);
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx_free_for_mysql(trx);
+		trx_commit_for_mysql(prebuilt->trx);
+		DBUG_RETURN(error);
+	}
+
+	/* Create table containing all indexes to be built in this
+	alter table add index so that they are in the correct order
+	in the table. */
+
+	num_of_idx = num_of_keys;
+
+	index_defs = innobase_create_key_def(
+		trx, innodb_table, heap, key_info, num_of_idx);
+
+	new_primary = DICT_CLUSTERED & index_defs[0].ind_type;
+
+	/* Allocate memory for dictionary index definitions */
+
+	index = (dict_index_t**) mem_heap_alloc(
+		heap, num_of_idx * sizeof *index);
+
+	/* Flag this transaction as a dictionary operation, so that
+	the data dictionary will be locked in crash recovery. */
+	trx_set_dict_operation(trx, TRX_DICT_OP_INDEX);
+
+	/* Acquire a lock on the table before creating any indexes. */
+	error = row_merge_lock_table(prebuilt->trx, innodb_table,
+				     new_primary ? LOCK_X : LOCK_S);
+
+	if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+		goto error_handling;
+	}
+
+	/* Latch the InnoDB data dictionary exclusively so that no deadlocks
+	or lock waits can happen in it during an index create operation. */
+
+	row_mysql_lock_data_dictionary(trx);
+	dict_locked = TRUE;
+
+	ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+
+	/* If a new primary key is defined for the table we need
+	to drop the original table and rebuild all indexes. */
+
+	if (UNIV_UNLIKELY(new_primary)) {
+		/* This transaction should be the only one
+		operating on the table. */
+		ut_a(innodb_table->n_mysql_handles_opened == 1);
+
+		char*	new_table_name = innobase_create_temporary_tablename(
+			heap, '1', innodb_table->name);
+
+		/* Clone the table. */
+		trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+		indexed_table = row_merge_create_temporary_table(
+			new_table_name, index_defs, innodb_table, trx);
+
+		if (!indexed_table) {
+
+			switch (trx->error_state) {
+			case DB_TABLESPACE_ALREADY_EXISTS:
+			case DB_DUPLICATE_KEY:
+				innobase_convert_tablename(new_table_name);
+				my_error(HA_ERR_TABLE_EXIST, MYF(0),
+					 new_table_name);
+				error = HA_ERR_TABLE_EXIST;
+				break;
+			default:
+				error = convert_error_code_to_mysql(
+					trx->error_state, innodb_table->flags,
+					user_thd);
+			}
+
+			ut_d(dict_table_check_for_dup_indexes(innodb_table,
+							      FALSE));
+			row_mysql_unlock_data_dictionary(trx);
+			goto err_exit;
+		}
+
+		trx->table_id = indexed_table->id;
+	}
+
+	/* Create the indexes in SYS_INDEXES and load into dictionary. */
+
+	for (ulint i = 0; i < num_of_idx; i++) {
+
+		index[i] = row_merge_create_index(trx, indexed_table,
+						  &index_defs[i]);
+
+		if (!index[i]) {
+			error = trx->error_state;
+			goto error_handling;
+		}
+
+		num_created++;
+	}
+
+	ut_ad(error == DB_SUCCESS);
+
+	/* We will need to rebuild index translation table. Set
+	valid index entry count in the translation table to zero */
+	share->idx_trans_tbl.index_count = 0;
+
+	/* Commit the data dictionary transaction in order to release
+	the table locks on the system tables.  This means that if
+	MySQL crashes while creating a new primary key inside
+	row_merge_build_indexes(), indexed_table will not be dropped
+	by trx_rollback_active().  It will have to be recovered or
+	dropped by the database administrator. */
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+	dict_locked = FALSE;
+
+	ut_a(trx->n_active_thrs == 0);
+	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+
+	if (UNIV_UNLIKELY(new_primary)) {
+		/* A primary key is to be built.  Acquire an exclusive
+		table lock also on the table that is being created. */
+		ut_ad(indexed_table != innodb_table);
+
+		error = row_merge_lock_table(prebuilt->trx, indexed_table,
+					     LOCK_X);
+
+		if (UNIV_UNLIKELY(error != DB_SUCCESS)) {
+
+			goto error_handling;
+		}
+	}
+
+	/* Read the clustered index of the table and build indexes
+	based on this information using temporary files and merge sort. */
+	error = row_merge_build_indexes(prebuilt->trx,
+					innodb_table, indexed_table,
+					index, num_of_idx, table);
+
+error_handling:
+	/* After an error, remove all those index definitions from the
+	dictionary which were defined. */
+
+	switch (error) {
+		const char*	old_name;
+		char*		tmp_name;
+	case DB_SUCCESS:
+		ut_a(!dict_locked);
+		row_mysql_lock_data_dictionary(trx);
+		dict_locked = TRUE;
+
+		ut_d(dict_table_check_for_dup_indexes(prebuilt->table, TRUE));
+
+		if (!new_primary) {
+			error = row_merge_rename_indexes(trx, indexed_table);
+
+			if (error != DB_SUCCESS) {
+				row_merge_drop_indexes(trx, indexed_table,
+						       index, num_created);
+			}
+
+			goto convert_error;
+		}
+
+		/* If a new primary key was defined for the table and
+		there was no error at this point, we can now rename
+		the old table as a temporary table, rename the new
+		temporary table as the old table and drop the old table. */
+		old_name = innodb_table->name;
+		tmp_name = innobase_create_temporary_tablename(heap, '2',
+							       old_name);
+
+		error = row_merge_rename_tables(innodb_table, indexed_table,
+						tmp_name, trx);
+
+		if (error != DB_SUCCESS) {
+
+			row_merge_drop_table(trx, indexed_table);
+
+			switch (error) {
+			case DB_TABLESPACE_ALREADY_EXISTS:
+			case DB_DUPLICATE_KEY:
+				innobase_convert_tablename(tmp_name);
+				my_error(HA_ERR_TABLE_EXIST, MYF(0), tmp_name);
+				error = HA_ERR_TABLE_EXIST;
+				break;
+			default:
+				goto convert_error;
+			}
+			break;
+		}
+
+		trx_commit_for_mysql(prebuilt->trx);
+		row_prebuilt_free(prebuilt, TRUE);
+		prebuilt = row_create_prebuilt(indexed_table);
+
+		indexed_table->n_mysql_handles_opened++;
+
+		error = row_merge_drop_table(trx, innodb_table);
+		innodb_table = indexed_table;
+		goto convert_error;
+
+	case DB_TOO_BIG_RECORD:
+		my_error(HA_ERR_TO_BIG_ROW, MYF(0));
+		goto error;
+	case DB_PRIMARY_KEY_IS_NULL:
+		my_error(ER_PRIMARY_CANT_HAVE_NULL, MYF(0));
+		/* fall through */
+	case DB_DUPLICATE_KEY:
+error:
+		prebuilt->trx->error_info = NULL;
+		/* fall through */
+	default:
+		trx->error_state = DB_SUCCESS;
+
+		if (new_primary) {
+			if (indexed_table != innodb_table) {
+				row_merge_drop_table(trx, indexed_table);
+			}
+		} else {
+			if (!dict_locked) {
+				row_mysql_lock_data_dictionary(trx);
+				dict_locked = TRUE;
+			}
+
+			row_merge_drop_indexes(trx, indexed_table,
+					       index, num_created);
+		}
+
+convert_error:
+		error = convert_error_code_to_mysql(error,
+						    innodb_table->flags,
+						    user_thd);
+	}
+
+	mem_heap_free(heap);
+	trx_commit_for_mysql(trx);
+	if (prebuilt->trx) {
+		trx_commit_for_mysql(prebuilt->trx);
+	}
+
+	if (dict_locked) {
+		ut_d(dict_table_check_for_dup_indexes(innodb_table, FALSE));
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	trx_free_for_mysql(trx);
+
+	/* There might be work for utility threads.*/
+	srv_active_wake_master_thread();
+
+	DBUG_RETURN(error);
+}
+
+/*******************************************************************//**
+Prepare to drop some indexes of a table.
+
+While holding the data dictionary lock, flags every requested index as
+to_be_dropped and then, depending on trx->check_foreigns and on the SQL
+command being executed, verifies that the flagged indexes are not needed
+by a foreign key constraint.  If any check fails, the to_be_dropped flag
+is cleared again on all indexes of the table before returning.
+@return	0 on success; HA_ERR_WRONG_COMMAND, HA_ERR_KEY_NOT_FOUND,
+HA_ERR_DROP_INDEX_FK, or -1 (attempt to drop the clustered index)
+on failure */
+UNIV_INTERN
+int
+ha_innobase::prepare_drop_index(
+/*============================*/
+	TABLE*	table,		/*!< in: Table where indexes are dropped */
+	uint*	key_num,	/*!< in: Key nums to be dropped */
+	uint	num_of_keys)	/*!< in: Number of keys to be dropped */
+{
+	trx_t*		trx;
+	int		err = 0;
+	uint 		n_key;
+
+	DBUG_ENTER("ha_innobase::prepare_drop_index");
+	ut_ad(table);
+	ut_ad(key_num);
+	ut_ad(num_of_keys);
+	if (srv_created_new_raw || srv_force_recovery) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
+	update_thd();
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+	/* No separate transaction is allocated here: the handler's own
+	transaction is the one used to lock the data dictionary. */
+	trx = prebuilt->trx;
+
+	/* Test and mark all the indexes to be dropped */
+
+	row_mysql_lock_data_dictionary(trx);
+	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+	/* Check that none of the indexes have previously been flagged
+	for deletion. */
+	{
+		const dict_index_t*	index
+			= dict_table_get_first_index(prebuilt->table);
+		do {
+			ut_a(!index->to_be_dropped);
+			index = dict_table_get_next_index(index);
+		} while (index);
+	}
+
+	/* Resolve each requested key number to an InnoDB index by the
+	key's name and flag that index. */
+	for (n_key = 0; n_key < num_of_keys; n_key++) {
+		const KEY*	key;
+		dict_index_t*	index;
+
+		key = table->key_info + key_num[n_key];
+		index = dict_table_get_index_on_name_and_min_id(
+			prebuilt->table, key->name);
+
+		if (!index) {
+			/* NOTE(review): the "key ?" test below looks
+			redundant, because key->name has already been
+			read in the lookup above. */
+			sql_print_error("InnoDB could not find key n:o %u "
+					"with name %s for table %s",
+					key_num[n_key],
+					key ? key->name : "NULL",
+					prebuilt->table->name);
+
+			err = HA_ERR_KEY_NOT_FOUND;
+			goto func_exit;
+		}
+
+		/* Refuse to drop the clustered index.  It would be
+		better to automatically generate a clustered index,
+		but mysql_alter_table() will call this method only
+		after ha_innobase::add_index(). */
+
+		if (dict_index_is_clust(index)) {
+			my_error(ER_REQUIRES_PRIMARY_KEY, MYF(0));
+			err = -1;
+			goto func_exit;
+		}
+
+		index->to_be_dropped = TRUE;
+	}
+
+	/* If FOREIGN_KEY_CHECK = 1 you may not drop an index defined
+	for a foreign key constraint because InnoDB requires that both
+	tables contain indexes for the constraint.  Note that CREATE
+	INDEX id ON table does a CREATE INDEX and DROP INDEX, and we
+	can ignore here foreign keys because a new index for the
+	foreign key has already been created.
+
+	We check for the foreign key constraints after marking the
+	candidate indexes for deletion, because when we check for an
+	equivalent foreign index we don't want to select an index that
+	is later deleted. */
+
+	if (trx->check_foreigns
+	    && thd_sql_command(user_thd) != SQLCOM_CREATE_INDEX) {
+		dict_index_t*	index;
+
+		for (index = dict_table_get_first_index(prebuilt->table);
+		     index;
+		     index = dict_table_get_next_index(index)) {
+			dict_foreign_t*	foreign;
+
+			if (!index->to_be_dropped) {
+
+				continue;
+			}
+
+			/* Check if the index is referenced. */
+			foreign = dict_table_get_referenced_constraint(
+				prebuilt->table, index);
+
+			if (foreign) {
+				/* This label is also reached by the goto
+				in the else branch below, when no
+				equivalent index exists for a constraint
+				of this table.  The loop stops at the
+				first offending index. */
+index_needed:
+				trx_set_detailed_error(
+					trx,
+					"Index needed in foreign key "
+					"constraint");
+
+				trx->error_info = index;
+
+				err = HA_ERR_DROP_INDEX_FK;
+				break;
+			} else {
+				/* Check if this index references some
+				other table */
+				foreign = dict_table_get_foreign_constraint(
+					prebuilt->table, index);
+
+				if (foreign) {
+					ut_a(foreign->foreign_index == index);
+
+					/* Search for an equivalent index that
+					the foreign key constraint could use
+					if this index were to be deleted. */
+					if (!dict_foreign_find_equiv_index(
+						foreign)) {
+
+						goto index_needed;
+					}
+				}
+			}
+		}
+	} else if (thd_sql_command(user_thd) == SQLCOM_CREATE_INDEX) {
+		/* This is a drop of a foreign key constraint index that
+		was created by MySQL when the constraint was added.  MySQL
+		does this when the user creates an index explicitly which
+		can be used in place of the automatically generated index. */
+
+		/* This branch runs for SQLCOM_CREATE_INDEX regardless of
+		trx->check_foreigns.  Unlike the branch above, it does not
+		call dict_table_get_referenced_constraint(). */
+
+		dict_index_t*	index;
+
+		for (index = dict_table_get_first_index(prebuilt->table);
+		     index;
+		     index = dict_table_get_next_index(index)) {
+			dict_foreign_t*	foreign;
+
+			if (!index->to_be_dropped) {
+
+				continue;
+			}
+
+			/* Check if this index references some other table */
+			foreign = dict_table_get_foreign_constraint(
+				prebuilt->table, index);
+
+			if (foreign == NULL) {
+
+				continue;
+			}
+
+			ut_a(foreign->foreign_index == index);
+
+			/* Search for an equivalent index that the
+			foreign key constraint could use if this index
+			were to be deleted. */
+
+			if (!dict_foreign_find_equiv_index(foreign)) {
+				trx_set_detailed_error(
+					trx,
+					"Index needed in foreign key "
+					"constraint");
+
+				trx->error_info = foreign->foreign_index;
+
+				err = HA_ERR_DROP_INDEX_FK;
+				break;
+			}
+		}
+	}
+
+func_exit:
+	if (err) {
+		/* Undo our changes since there was some sort of error. */
+		/* The flag is cleared on every index of the table, not
+		only on those flagged by this call; the assertion loop at
+		the top established that none was flagged on entry. */
+		dict_index_t*	index
+			= dict_table_get_first_index(prebuilt->table);
+
+		do {
+			index->to_be_dropped = FALSE;
+			index = dict_table_get_next_index(index);
+		} while (index);
+	}
+
+	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+	row_mysql_unlock_data_dictionary(trx);
+
+	DBUG_RETURN(err);
+}
+
+/*******************************************************************//**
+Remove the indexes that a preceding, successful prepare_drop_index()
+flagged as to_be_dropped.  If the exclusive table lock cannot be
+obtained, nothing is dropped and all the flags are cleared instead.
+@return	0 or error number */
+UNIV_INTERN
+int
+ha_innobase::final_drop_index(
+/*==========================*/
+	TABLE*	table)		/*!< in: Table where indexes are dropped */
+{
+	dict_index_t*	idx;		/*!< Cursor over the table's indexes */
+	dict_index_t*	following;	/*!< Successor of idx */
+	trx_t*		dict_trx;	/*!< Dictionary transaction */
+	ulint		lock_err;	/*!< Result of the table lock request */
+	int		rc;		/*!< MySQL error code to return */
+
+	DBUG_ENTER("ha_innobase::final_drop_index");
+	ut_ad(table);
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+	}
+
+	update_thd();
+
+	trx_search_latch_release_if_reserved(prebuilt->trx);
+	trx_start_if_not_started(prebuilt->trx);
+
+	/* The data dictionary tables are modified in a background
+	transaction of their own.  It is flagged as a dictionary
+	operation, so that the data dictionary will be locked in crash
+	recovery. */
+	dict_trx = innobase_trx_allocate(user_thd);
+	trx_start_if_not_started(dict_trx);
+	trx_set_dict_operation(dict_trx, TRX_DICT_OP_INDEX);
+
+	/* Take an exclusive lock on the table, so that no active
+	transaction can depend on an index that is about to go away. */
+	lock_err = row_merge_lock_table(prebuilt->trx, prebuilt->table,
+					LOCK_X);
+	rc = convert_error_code_to_mysql(lock_err, prebuilt->table->flags,
+					 user_thd);
+
+	row_mysql_lock_data_dictionary(dict_trx);
+	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+
+	if (UNIV_UNLIKELY(rc)) {
+
+		/* The table lock failed: withdraw the drop request
+		from every index and drop nothing. */
+		idx = dict_table_get_first_index(prebuilt->table);
+
+		while (idx) {
+			idx->to_be_dropped = FALSE;
+			idx = dict_table_get_next_index(idx);
+		}
+	} else {
+		/* Drop the flagged indexes.  The successor is fetched
+		before the drop, presumably because
+		row_merge_drop_index() invalidates idx. */
+		for (idx = dict_table_get_first_index(prebuilt->table);
+		     idx; idx = following) {
+
+			following = dict_table_get_next_index(idx);
+
+			if (idx->to_be_dropped) {
+
+				row_merge_drop_index(idx, prebuilt->table,
+						     dict_trx);
+			}
+		}
+
+		/* No flagged index may be left in the table. */
+		idx = dict_table_get_first_index(prebuilt->table);
+
+		while (idx) {
+			ut_a(!idx->to_be_dropped);
+			idx = dict_table_get_next_index(idx);
+		}
+
+		/* The index translation table has to be rebuilt: mark
+		it as holding zero valid entries. */
+		share->idx_trans_tbl.index_count = 0;
+	}
+
+	ut_d(dict_table_check_for_dup_indexes(prebuilt->table, FALSE));
+	trx_commit_for_mysql(dict_trx);
+	trx_commit_for_mysql(prebuilt->trx);
+	row_mysql_unlock_data_dictionary(dict_trx);
+
+	/* Flush the log, to make it less likely that the .frm files and
+	the InnoDB data dictionary get out-of-sync when the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	trx_free_for_mysql(dict_trx);
+
+	/* Let the InnoDB server know that the utility threads may
+	have work to do. */
+
+	srv_active_wake_master_thread();
+
+	DBUG_RETURN(rc);
+}
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
new file mode 100644
index 00000000000..0f656528315
--- /dev/null
+++ b/storage/xtradb/handler/i_s.cc
@@ -0,0 +1,4516 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.cc
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#include <mysql_priv.h>
+#include <mysqld_error.h>
+
+#include <m_ctype.h>
+#include <hash.h>
+#include <myisampack.h>
+#include <mysys_err.h>
+#include <my_sys.h>
+#include "i_s.h"
+#include "innodb_patch_info.h"
+#include <mysql/plugin.h>
+
+extern "C" {
+#include "trx0i_s.h"
+#include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
+#include "buf0buddy.h" /* for i_s_cmpmem */
+#include "buf0buf.h" /* for buf_pool and PAGE_ZIP_MIN_SIZE */
+#include "ha_prototypes.h" /* for innobase_convert_name() */
+#include "srv0start.h" /* for srv_was_started */
+#include "btr0btr.h" /* for btr_page_get_index_id */
+#include "trx0rseg.h" /* for trx_rseg_struct */
+#include "trx0sys.h" /* for trx_sys */
+#include "dict0dict.h" /* for dict_sys */
+#include "btr0pcur.h"
+#include "buf0lru.h" /* for XTRA_LRU_[DUMP/RESTORE] */
+}
+
+static const char plugin_author[] = "Innobase Oy";
+
+#define OK(expr)		\
+	if ((expr) != 0) {	\
+		DBUG_RETURN(1);	\
+	}
+
+/* If srv_was_started is not set, push a warning that names the
+INFORMATION_SCHEMA table being read (plugin_name) and leave the calling
+function through DBUG_RETURN(0).  The expansion refers to a variable
+called thd, which must be visible at the point of use. */
+#define RETURN_IF_INNODB_NOT_STARTED(plugin_name)			\
+do {									\
+	if (!srv_was_started) {						\
+		push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,	\
+				    ER_CANT_FIND_SYSTEM_REC,		\
+				    "InnoDB: SELECTing from "		\
+				    "INFORMATION_SCHEMA.%s but "	\
+				    "the InnoDB storage engine "	\
+				    "is not installed", plugin_name);	\
+		DBUG_RETURN(0);						\
+	}								\
+} while (0)
+
+/* Initializer for one structure member.  With GCC newer than 2.x (not
+in strict ANSI mode, and not the Intel compiler) this expands to the
+labelled form "name: value"; everywhere else only the value is emitted,
+so the initializers are positional and have to be written in the order
+in which the members are declared. */
+#if !defined __STRICT_ANSI__ && defined __GNUC__ && (__GNUC__) > 2 && !defined __INTEL_COMPILER
+#define STRUCT_FLD(name, value)	name: value
+#else
+#define STRUCT_FLD(name, value)	value
+#endif
+
+/* Closing entry of the ST_FIELD_INFO arrays in this file: an element
+whose field_name is NULL and whose field_type is MYSQL_TYPE_NULL.
+
+Don't use a static const variable here, as some C++ compilers (notably
+HPUX aCC: HP ANSI C++ B3910B A.03.65) can't handle it. */
+#define END_OF_ST_FIELD_INFO \
+	{STRUCT_FLD(field_name,		NULL), \
+	 STRUCT_FLD(field_length,	0), \
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_NULL), \
+	 STRUCT_FLD(value,		0), \
+	 STRUCT_FLD(field_flags,	0), \
+	 STRUCT_FLD(old_name,		""), \
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)}
+
+/*
+Use the following types mapping:
+
+C type	ST_FIELD_INFO::field_type
+---------------------------------
+long			MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS)
+
+long unsigned		MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+char*			MYSQL_TYPE_STRING
+(field_length=n)
+
+float			MYSQL_TYPE_FLOAT
+(field_length=0 is ignored)
+
+void*			MYSQL_TYPE_LONGLONG
+(field_length=MY_INT64_NUM_DECIMAL_DIGITS, field_flags=MY_I_S_UNSIGNED)
+
+boolean (if else)	MYSQL_TYPE_LONG
+(field_length=1)
+
+time_t			MYSQL_TYPE_DATETIME
+(field_length=0 ignored)
+---------------------------------
+*/
+
+/* XXX these are defined in mysql_priv.h inside #ifdef MYSQL_SERVER */
+bool schema_table_store_record(THD *thd, TABLE *table);
+void localtime_to_TIME(MYSQL_TIME *to, struct tm *from);
+bool check_global_access(THD *thd, ulong want_access);
+
+/*******************************************************************//**
+Common function to fill any of the dynamic tables:
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+@return	0 on success */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond);	/*!< in: condition (not used) */
+
+/*******************************************************************//**
+Unbind a dynamic INFORMATION_SCHEMA table.
+@return	0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+	void*	p);	/*!< in/out: table schema object */
+
+/*******************************************************************//**
+Auxiliary function to store time_t value in MYSQL_TYPE_DATETIME
+field.  The value is broken down with localtime_r(), i.e. in the local
+time zone of the process.
+@return	0 on success, non-zero if the value could not be converted
+or stored */
+static
+int
+field_store_time_t(
+/*===============*/
+	Field*	field,	/*!< in/out: target field for storage */
+	time_t	time)	/*!< in: value to store */
+{
+	MYSQL_TIME	my_time;
+	struct tm	tm_time;
+
+#if 0
+	/* use this if you are sure that `variables' and `time_zone'
+	are always initialized */
+	thd->variables.time_zone->gmt_sec_to_TIME(
+		&my_time, (my_time_t) time);
+#else
+	/* localtime_r() returns NULL when the value cannot be
+	represented as broken-down time.  tm_time is then left
+	unset and must not be read, so report a failure instead
+	of storing garbage. */
+	if (localtime_r(&time, &tm_time) == NULL) {
+
+		return(1);
+	}
+
+	localtime_to_TIME(&my_time, &tm_time);
+	my_time.time_type = MYSQL_TIMESTAMP_DATETIME;
+#endif
+
+	return(field->store_time(&my_time, MYSQL_TIMESTAMP_DATETIME));
+}
+
+/*******************************************************************//**
+Auxiliary function to store char* value in MYSQL_TYPE_STRING field.
+A NULL pointer sets the field to SQL NULL; any other pointer is stored
+as a system_charset_info string and the field is marked not-NULL.
+@return	0 on success */
+static
+int
+field_store_string(
+/*===============*/
+	Field*		field,	/*!< in/out: target field for storage */
+	const char*	str)	/*!< in: NUL-terminated utf-8 string,
+				or NULL */
+{
+	if (str == NULL) {
+
+		/* Nothing to copy: setting NULL always succeeds. */
+		field->set_null();
+
+		return(0);
+	}
+
+	const int	result = field->store(str, strlen(str),
+					      system_charset_info);
+
+	field->set_notnull();
+
+	return(result);
+}
+
+/*******************************************************************//**
+Auxiliary function to store ulint value in MYSQL_TYPE_LONGLONG field.
+If the value is ULINT_UNDEFINED then the field is set to NULL.
+@return	0 on success */
+static
+int
+field_store_ulint(
+/*==============*/
+	Field*	field,	/*!< in/out: target field for storage */
+	ulint	n)	/*!< in: value to store */
+{
+	if (n == ULINT_UNDEFINED) {
+
+		/* The "no value" marker maps to SQL NULL, which
+		always succeeds. */
+		field->set_null();
+
+		return(0);
+	}
+
+	const int	result = field->store(n);
+
+	field->set_notnull();
+
+	return(result);
+}
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_patches.
+Every column is a MYSQL_TYPE_STRING without flags; only the column name
+and the maximum length differ, so each row is produced by a helper
+macro that is local to this definition. */
+#define PATCH_STRING_FIELD(col_name, col_len)			\
+	{STRUCT_FLD(field_name,		col_name),		\
+	 STRUCT_FLD(field_length,	col_len),		\
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),	\
+	 STRUCT_FLD(value,		0),			\
+	 STRUCT_FLD(field_flags,	0),			\
+	 STRUCT_FLD(old_name,		""),			\
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)}
+
+/* Positions of the columns within the array below */
+#define IDX_PATCH_NAME		0
+#define IDX_PATCH_DESCR		1
+#define IDX_PATCH_COMMENT		2
+#define IDX_PATCH_LINK			3
+
+static ST_FIELD_INFO	innodb_patches_fields_info[] =
+{
+	PATCH_STRING_FIELD("name",		255),	/* IDX_PATCH_NAME */
+	PATCH_STRING_FIELD("description",	255),	/* IDX_PATCH_DESCR */
+	PATCH_STRING_FIELD("comment",		100),	/* IDX_PATCH_COMMENT */
+	PATCH_STRING_FIELD("link",		255),	/* IDX_PATCH_LINK */
+
+	END_OF_ST_FIELD_INFO
+};
+
+#undef PATCH_STRING_FIELD
+
+/* Type-specific descriptor referenced by the INFORMATION_SCHEMA plugin
+declarations below; the only member initialized is the interface
+version. */
+static struct st_mysql_information_schema	i_s_info =
+	{ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION };
+
+/*******************************************************************//**
+Fill the dynamic table information_schema.innodb_patches with one row
+for each entry of innodb_enhancements[], up to the first entry whose
+file member is NULL.  Nothing is stored, and 0 is still returned, when
+check_global_access(thd, PROCESS_ACL) returns non-zero or when InnoDB
+has not been started.
+@return	0 on success, 1 on failure */
+static
+int
+innodb_patches_fill(
+/*================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	TABLE*	table	= (TABLE *) tables->table;
+	Field**	fields	= table->field;
+	int	status	= 0;
+	int	i;
+
+	DBUG_ENTER("innodb_patches_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	for (i = 0; innodb_enhancements[i].file; i++) {
+
+		/* The columns are addressed through the IDX_PATCH_*
+		positions declared with innodb_patches_fields_info[],
+		so that this code follows the column layout.
+		NOTE(review): the results of field_store_string() are
+		not checked here; confirm whether they should be. */
+		field_store_string(fields[IDX_PATCH_NAME],
+				   innodb_enhancements[i].file);
+		field_store_string(fields[IDX_PATCH_DESCR],
+				   innodb_enhancements[i].name);
+		field_store_string(fields[IDX_PATCH_COMMENT],
+				   innodb_enhancements[i].comment);
+		field_store_string(fields[IDX_PATCH_LINK],
+				   innodb_enhancements[i].link);
+
+		if (schema_table_store_record(thd, table)) {
+			status = 1;
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Bind the dynamic table information_schema.innodb_patches: install its
+column definitions and its fill function in the table schema object.
+@return	0 on success */
+static
+int
+innodb_patches_init(
+/*================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table
+		= static_cast<ST_SCHEMA_TABLE*>(p);
+
+	DBUG_ENTER("innodb_patches_init");
+
+	schema_table->fill_table = innodb_patches_fill;
+	schema_table->fields_info = innodb_patches_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+
+/* Plugin declaration (struct st_mysql_plugin) of the INFORMATION_SCHEMA
+table XTRADB_ENHANCEMENTS.  The comment in front of each member gives
+its type and its meaning. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_patches =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "XTRADB_ENHANCEMENTS"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, "Percona"),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Enhancements applied to InnoDB plugin"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, innodb_patches_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+	/* void*: reserved for dependency checking */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* Plugin declaration (struct st_maria_plugin) of the INFORMATION_SCHEMA
+table XTRADB_ENHANCEMENTS.  The comment in front of each member gives
+its type and its meaning. */
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_patches_maria =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "XTRADB_ENHANCEMENTS"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, "Percona"),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Enhancements applied to InnoDB plugin"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, innodb_patches_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+	/* const char*: string version */
+	STRUCT_FLD(version_info, "1.0"),
+	/* int: maturity */
+	STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+
+static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_fields_info[] =
+{
+	{STRUCT_FLD(field_name,		"page_type"),
+	 STRUCT_FLD(field_length,	64),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"space_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"page_no"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"lru_position"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"fix_count"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"flush_type"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_index_fields_info[] =
+{
+	{STRUCT_FLD(field_name,		"index_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"space_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"page_no"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"n_recs"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"data_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"hashed"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"access_time"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"modified"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"dirty"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"old"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"lru_position"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"fix_count"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"flush_type"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/* Column definitions of the dynamic table
+INFORMATION_SCHEMA.innodb_buffer_pool_pages_blob.  Every column is a
+MYSQL_TYPE_LONGLONG flagged MY_I_S_UNSIGNED.  The fill function
+i_s_innodb_buffer_pool_pages_blob_fill() addresses the columns by their
+zero-based position in this array, so the order must stay in sync:
+0 space_id, 1 page_no, 2 compressed, 3 part_len, 4 next_page_no,
+5 lru_position, 6 fix_count, 7 flush_type. */
+static ST_FIELD_INFO	i_s_innodb_buffer_pool_pages_blob_fields_info[] =
+{
+	{STRUCT_FLD(field_name,		"space_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"page_no"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"compressed"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"part_len"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"next_page_no"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"lru_position"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"fix_count"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"flush_type"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages.
+While holding the buffer pool mutex, walks every block of every buffer
+pool chunk and stores one row per block:
+  field 0  textual page type derived from fil_page_get_type(frame)
+  field 1  block->page.space
+  field 2  block->page.offset
+  field 3  always 0 (presumably lru_position -- the column list is not
+           visible next to this function, confirm against fields_info)
+  field 4  block->page.buf_fix_count
+  field 5  block->page.flush_type
+If check_global_access(thd, PROCESS_ACL) reports a failure the function
+returns 0 without adding any row.  The scan stops at the first row that
+cannot be stored. */
+static
+int
+i_s_innodb_buffer_pool_pages_fill(
+/*================*/
+				/* out: 0 on success, 1 on failure */
+	THD*		thd,	/* in: thread */
+	TABLE_LIST*	tables,	/* in/out: tables to fill */
+	COND*		cond)	/* in: condition (ignored) */
+{
+	TABLE*	table	= (TABLE *) tables->table;
+	int	status	= 0;
+
+	ulint		n_chunks, n_blocks;
+
+	buf_chunk_t*	chunk;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	buf_pool_mutex_enter();
+
+	chunk = buf_pool->chunks;
+
+	for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+		buf_block_t*	block		= chunk->blocks;
+
+		for (n_blocks = chunk->size; n_blocks--; block++) {
+			const buf_frame_t*	frame = block->frame;
+			const ulint		fil_type
+				= fil_page_get_type(frame);
+			const char*		type_name;
+			/* only used for page types without a fixed name */
+			char			unknown_type[64];
+
+			switch (fil_type) {
+			case FIL_PAGE_INDEX:
+				type_name = "index";
+				break;
+			case FIL_PAGE_UNDO_LOG:
+				type_name = "undo_log";
+				break;
+			case FIL_PAGE_INODE:
+				type_name = "inode";
+				break;
+			case FIL_PAGE_IBUF_FREE_LIST:
+				type_name = "ibuf_free_list";
+				break;
+			case FIL_PAGE_TYPE_ALLOCATED:
+				type_name = "allocated";
+				break;
+			case FIL_PAGE_IBUF_BITMAP:
+				type_name = "bitmap";
+				break;
+			case FIL_PAGE_TYPE_SYS:
+				type_name = "sys";
+				break;
+			case FIL_PAGE_TYPE_TRX_SYS:
+				type_name = "trx_sys";
+				break;
+			case FIL_PAGE_TYPE_FSP_HDR:
+				type_name = "fsp_hdr";
+				break;
+			case FIL_PAGE_TYPE_XDES:
+				type_name = "xdes";
+				break;
+			case FIL_PAGE_TYPE_BLOB:
+				type_name = "blob";
+				break;
+			case FIL_PAGE_TYPE_ZBLOB:
+				type_name = "zblob";
+				break;
+			case FIL_PAGE_TYPE_ZBLOB2:
+				type_name = "zblob2";
+				break;
+			default:
+				/* bounded formatting; the page type is
+				unsigned, so print it as such */
+				ut_snprintf(unknown_type,
+					    sizeof(unknown_type),
+					    "unknown (type=%lu)",
+					    (unsigned long) fil_type);
+				type_name = unknown_type;
+			}
+
+			field_store_string(table->field[0], type_name);
+			table->field[1]->store(block->page.space);
+			table->field[2]->store(block->page.offset);
+			table->field[3]->store(0);
+			table->field[4]->store(block->page.buf_fix_count);
+			table->field[5]->store(block->page.flush_type);
+
+			if (schema_table_store_record(thd, table)) {
+				status = 1;
+				break;
+			}
+		}
+
+		if (status) {
+			/* the break above only left the inner loop; do
+			not keep scanning the remaining chunks after a
+			failed store */
+			break;
+		}
+	}
+
+	buf_pool_mutex_exit();
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages_index.
+While holding the buffer pool mutex, walks every block of every buffer
+pool chunk and stores one row for each block whose frame has page type
+FIL_PAGE_INDEX:
+  field 0   index id read with btr_page_get_index_id()
+  field 1   block->page.space
+  field 2   block->page.offset
+  field 3   page_get_n_recs(frame)
+  field 4   page_get_data_size(frame)
+  field 5   block->is_hashed
+  field 6   block->page.access_time
+  field 7   1 if block->page.newest_modification is non-zero
+  field 8   1 if block->page.oldest_modification is non-zero
+  field 9   block->page.old
+  field 10  always 0
+  field 11  block->page.buf_fix_count
+  field 12  block->page.flush_type
+If check_global_access(thd, PROCESS_ACL) reports a failure the function
+returns 0 without adding any row.  The scan stops at the first row that
+cannot be stored. */
+static
+int
+i_s_innodb_buffer_pool_pages_index_fill(
+/*================*/
+				/* out: 0 on success, 1 on failure */
+	THD*		thd,	/* in: thread */
+	TABLE_LIST*	tables,	/* in/out: tables to fill */
+	COND*		cond)	/* in: condition (ignored) */
+{
+	TABLE*	table	= (TABLE *) tables->table;
+	int	status	= 0;
+
+	ulint		n_chunks, n_blocks;
+	dulint		index_id;
+
+	buf_chunk_t*	chunk;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	buf_pool_mutex_enter();
+
+	chunk = buf_pool->chunks;
+
+	for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+		buf_block_t*	block		= chunk->blocks;
+
+		for (n_blocks = chunk->size; n_blocks--; block++) {
+			const buf_frame_t*	frame = block->frame;
+
+			if (fil_page_get_type(frame) != FIL_PAGE_INDEX) {
+				/* only index pages are reported here */
+				continue;
+			}
+
+			index_id = btr_page_get_index_id(frame);
+			table->field[0]->store(
+				ut_conv_dulint_to_longlong(index_id));
+			table->field[1]->store(block->page.space);
+			table->field[2]->store(block->page.offset);
+			table->field[3]->store(page_get_n_recs(frame));
+			table->field[4]->store(page_get_data_size(frame));
+			table->field[5]->store(block->is_hashed);
+			table->field[6]->store(block->page.access_time);
+			table->field[7]->store(
+				block->page.newest_modification != 0);
+			table->field[8]->store(
+				block->page.oldest_modification != 0);
+			table->field[9]->store(block->page.old);
+			table->field[10]->store(0);
+			table->field[11]->store(block->page.buf_fix_count);
+			table->field[12]->store(block->page.flush_type);
+
+			if (schema_table_store_record(thd, table)) {
+				status = 1;
+				break;
+			}
+		}
+
+		if (status) {
+			/* the break above only left the inner loop; do
+			not keep scanning the remaining chunks after a
+			failed store */
+			break;
+		}
+	}
+
+	buf_pool_mutex_exit();
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Fill the dynamic table information_schema.innodb_buffer_pool_pages_blob.
+While holding the buffer pool mutex, walks every block of every buffer
+pool chunk and stores one row for each block whose frame has page type
+FIL_PAGE_TYPE_BLOB:
+  field 0  block->page.space
+  field 1  block->page.offset
+  field 2  1 if the block has a compressed page descriptor, else 0
+  field 3  part_len (0 when the block is compressed)
+  field 4  next_page_no, or 0 when it equals FIL_NULL
+  field 5  always 0
+  field 6  block->page.buf_fix_count
+  field 7  block->page.flush_type
+If check_global_access(thd, PROCESS_ACL) reports a failure the function
+returns 0 without adding any row.  The scan stops at the first row that
+cannot be stored. */
+static
+int
+i_s_innodb_buffer_pool_pages_blob_fill(
+/*================*/
+				/* out: 0 on success, 1 on failure */
+	THD*		thd,	/* in: thread */
+	TABLE_LIST*	tables,	/* in/out: tables to fill */
+	COND*		cond)	/* in: condition (ignored) */
+{
+	TABLE*	table	= (TABLE *) tables->table;
+	int	status	= 0;
+
+	ulint		n_chunks, n_blocks;
+	buf_chunk_t*	chunk;
+	page_zip_des_t*	block_page_zip;
+
+	ulint		part_len;
+	ulint		next_page_no;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	buf_pool_mutex_enter();
+
+	chunk = buf_pool->chunks;
+
+	for (n_chunks = buf_pool->n_chunks; n_chunks--; chunk++) {
+		buf_block_t*	block		= chunk->blocks;
+
+		for (n_blocks = chunk->size; n_blocks--; block++) {
+			const buf_frame_t*	frame = block->frame;
+
+			if (fil_page_get_type(frame) != FIL_PAGE_TYPE_BLOB) {
+				/* only BLOB pages are reported here */
+				continue;
+			}
+
+			/* Look the descriptor up for this very block;
+			taking it once from the first block of the chunk
+			would report that block's state for all others. */
+			block_page_zip = buf_block_get_page_zip(block);
+
+			if (UNIV_LIKELY_NULL(block_page_zip)) {
+				part_len = 0; /* hmm, can't figure it out */
+
+				next_page_no = mach_read_from_4(
+					buf_block_get_frame(block)
+					+ FIL_PAGE_NEXT);
+			} else {
+				part_len = mach_read_from_4(
+					buf_block_get_frame(block)
+					+ FIL_PAGE_DATA
+					+ 0 /*BTR_BLOB_HDR_PART_LEN*/);
+
+				next_page_no = mach_read_from_4(
+					buf_block_get_frame(block)
+					+ FIL_PAGE_DATA
+					+ 4 /*BTR_BLOB_HDR_NEXT_PAGE_NO*/);
+			}
+
+			table->field[0]->store(block->page.space);
+			table->field[1]->store(block->page.offset);
+			table->field[2]->store(block_page_zip != NULL);
+			table->field[3]->store(part_len);
+
+			if (next_page_no == FIL_NULL) {
+				table->field[4]->store(0);
+			} else {
+				/* report the successor page, not this
+				page's own number */
+				table->field[4]->store(next_page_no);
+			}
+
+			table->field[5]->store(0);
+			table->field[6]->store(block->page.buf_fix_count);
+			table->field[7]->store(block->page.flush_type);
+
+			if (schema_table_store_record(thd, table)) {
+				status = 1;
+				break;
+			}
+		}
+
+		if (status) {
+			/* the break above only left the inner loop; do
+			not keep scanning the remaining chunks after a
+			failed store */
+			break;
+		}
+	}
+
+	buf_pool_mutex_exit();
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Plugin init callback for information_schema.innodb_buffer_pool_pages:
+attaches the column definitions and the fill function to the schema
+table object passed in as p. */
+static
+int
+i_s_innodb_buffer_pool_pages_init(
+/*==============================*/
+			/* out: always 0 */
+	void*	p)	/* in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	i_s_table;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_init");
+
+	i_s_table = (ST_SCHEMA_TABLE*) p;
+
+	i_s_table->fill_table = i_s_innodb_buffer_pool_pages_fill;
+	i_s_table->fields_info = i_s_innodb_buffer_pool_pages_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Plugin init callback for
+information_schema.innodb_buffer_pool_pages_index: attaches the column
+definitions and the fill function to the schema table object passed in
+as p. */
+static
+int
+i_s_innodb_buffer_pool_pages_index_init(
+/*====================================*/
+			/* out: always 0 */
+	void*	p)	/* in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	i_s_table;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_index_init");
+
+	i_s_table = (ST_SCHEMA_TABLE*) p;
+
+	i_s_table->fill_table = i_s_innodb_buffer_pool_pages_index_fill;
+	i_s_table->fields_info =
+		i_s_innodb_buffer_pool_pages_index_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Plugin init callback for
+information_schema.innodb_buffer_pool_pages_blob: attaches the column
+definitions and the fill function to the schema table object passed in
+as p. */
+static
+int
+i_s_innodb_buffer_pool_pages_blob_init(
+/*===================================*/
+			/* out: always 0 */
+	void*	p)	/* in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	i_s_table;
+
+	DBUG_ENTER("i_s_innodb_buffer_pool_pages_blob_init");
+
+	i_s_table = (ST_SCHEMA_TABLE*) p;
+
+	i_s_table->fill_table = i_s_innodb_buffer_pool_pages_blob_fill;
+	i_s_table->fields_info =
+		i_s_innodb_buffer_pool_pages_blob_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES.  Its init hook is
+i_s_innodb_buffer_pool_pages_init() and its deinit hook is
+i_s_common_deinit(). */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_buffer_pool_pages =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB buffer pool pages"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES.  It names the same
+init/deinit hooks as the st_mysql_plugin descriptor
+i_s_innodb_buffer_pool_pages and ends with a version string and a
+maturity level instead of the reserved field. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_buffer_pool_pages_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB buffer pool pages"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES_INDEX.  Its init hook
+is i_s_innodb_buffer_pool_pages_index_init() and its deinit hook is
+i_s_common_deinit(). */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_buffer_pool_pages_index =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB buffer pool index pages"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES_INDEX.  It names the
+same init/deinit hooks as the st_mysql_plugin descriptor
+i_s_innodb_buffer_pool_pages_index and ends with a version string and a
+maturity level instead of the reserved field. */
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_index_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_INDEX"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB buffer pool index pages"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_index_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES_BLOB.  Its init hook
+is i_s_innodb_buffer_pool_pages_blob_init() and its deinit hook is
+i_s_common_deinit(). */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_buffer_pool_pages_blob =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB buffer pool blob pages"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA table INNODB_BUFFER_POOL_PAGES_BLOB.  It names the
+same init/deinit hooks as the st_mysql_plugin descriptor
+i_s_innodb_buffer_pool_pages_blob and ends with a version string and a
+maturity level instead of the reserved field. */
+UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_BUFFER_POOL_PAGES_BLOB"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB buffer pool blob pages"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, i_s_innodb_buffer_pool_pages_blob_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_trx.
+Each IDX_TRX_* macro defined next to an entry is that entry's zero-based
+position in this array; fill_innodb_trx_from_cache() uses these macros to
+index table->field, so a macro and the position of its entry must stay in
+sync.  The columns flagged MY_I_S_MAYBE_NULL are trx_requested_lock_id,
+trx_wait_started and trx_query. */
+static ST_FIELD_INFO	innodb_trx_fields_info[] =
+{
+#define IDX_TRX_ID		0
+	{STRUCT_FLD(field_name,		"trx_id"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STATE		1
+	{STRUCT_FLD(field_name,		"trx_state"),
+	 STRUCT_FLD(field_length,	TRX_QUE_STATE_STR_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_STARTED		2
+	{STRUCT_FLD(field_name,		"trx_started"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_REQUESTED_LOCK_ID	3
+	{STRUCT_FLD(field_name,		"trx_requested_lock_id"),
+	 STRUCT_FLD(field_length,	TRX_I_S_LOCK_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WAIT_STARTED	4
+	{STRUCT_FLD(field_name,		"trx_wait_started"),
+	 STRUCT_FLD(field_length,	0),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_DATETIME),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_WEIGHT		5
+	{STRUCT_FLD(field_name,		"trx_weight"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_MYSQL_THREAD_ID	6
+	{STRUCT_FLD(field_name,		"trx_mysql_thread_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_TRX_QUERY		7
+	{STRUCT_FLD(field_name,		"trx_query"),
+	 STRUCT_FLD(field_length,	TRX_I_S_TRX_QUERY_MAX_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Copy every row the cache holds for I_S_INNODB_TRX into the
+INFORMATION_SCHEMA.innodb_trx table, storing one schema record per
+cached row.  Each store is wrapped in OK(), which is defined elsewhere
+in this file (presumably it returns early on a non-zero result).
+@return	0 on success */
+static
+int
+fill_innodb_trx_from_cache(
+/*=======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache to read from */
+	THD*			thd,	/*!< in: used to call
+					schema_table_store_record() */
+	TABLE*			table)	/*!< in/out: fill this table */
+{
+	Field**	cols = table->field;
+	char	lock_id_buf[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+	ulint	n_rows;
+	ulint	row_no;
+
+	DBUG_ENTER("fill_innodb_trx_from_cache");
+
+	n_rows = trx_i_s_cache_get_rows_used(cache, I_S_INNODB_TRX);
+
+	for (row_no = 0; row_no < n_rows; row_no++) {
+
+		char		trx_id_buf[TRX_ID_MAX_LEN + 1];
+		i_s_trx_row_t*	row = (i_s_trx_row_t*)
+			trx_i_s_cache_get_nth_row(
+				cache, I_S_INNODB_TRX, row_no);
+
+		/* trx_id, formatted as text */
+		ut_snprintf(trx_id_buf, sizeof(trx_id_buf),
+			    TRX_ID_FMT, row->trx_id);
+		OK(field_store_string(cols[IDX_TRX_ID], trx_id_buf));
+
+		/* trx_state */
+		OK(field_store_string(cols[IDX_TRX_STATE], row->trx_state));
+
+		/* trx_started */
+		OK(field_store_time_t(cols[IDX_TRX_STARTED],
+				      (time_t) row->trx_started));
+
+		/* trx_requested_lock_id and trx_wait_started are only
+		filled in when a wait start time is recorded */
+		if (row->trx_wait_started == 0) {
+
+			cols[IDX_TRX_REQUESTED_LOCK_ID]->set_null();
+			cols[IDX_TRX_WAIT_STARTED]->set_null();
+		} else {
+			const char*	requested_lock_id
+				= trx_i_s_create_lock_id(
+					row->requested_lock_row,
+					lock_id_buf, sizeof(lock_id_buf));
+
+			OK(field_store_string(
+				   cols[IDX_TRX_REQUESTED_LOCK_ID],
+				   requested_lock_id));
+			/* field_store_string() sets it to notnull */
+
+			OK(field_store_time_t(
+				   cols[IDX_TRX_WAIT_STARTED],
+				   (time_t) row->trx_wait_started));
+			cols[IDX_TRX_WAIT_STARTED]->set_notnull();
+		}
+
+		/* trx_weight, stored as an unsigned value */
+		OK(cols[IDX_TRX_WEIGHT]->store((longlong) row->trx_weight,
+					       true));
+
+		/* trx_mysql_thread_id */
+		OK(cols[IDX_TRX_MYSQL_THREAD_ID]->store(
+			   row->trx_mysql_thread_id));
+
+		/* trx_query */
+		OK(field_store_string(cols[IDX_TRX_QUERY], row->trx_query));
+
+		OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin init callback for INFORMATION_SCHEMA.innodb_trx: attaches the
+column definitions and the common fill function to the schema table
+object passed in as p.
+@return	0 on success */
+static
+int
+innodb_trx_init(
+/*============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	i_s_table = (ST_SCHEMA_TABLE*) p;
+
+	DBUG_ENTER("innodb_trx_init");
+
+	i_s_table->fill_table = trx_i_s_common_fill_table;
+	i_s_table->fields_info = innodb_trx_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA table INNODB_TRX.  Its init hook is innodb_trx_init()
+and its deinit hook is i_s_common_deinit(); the plugin version is taken
+from INNODB_VERSION_SHORT. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_trx =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_TRX"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB transactions"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_trx_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA table INNODB_TRX.  It names the same init/deinit
+hooks as the st_mysql_plugin descriptor i_s_innodb_trx and ends with a
+version string and a maturity level instead of the reserved field.
+NOTE(review): version is INNODB_VERSION_SHORT while version_info is the
+literal "1.0" -- confirm the two are meant to differ. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_trx_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_TRX"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB transactions"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, innodb_trx_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_locks.
+Each IDX_LOCK_* macro defined next to an entry is that entry's zero-based
+position in this array (presumably used by the fill function to index
+table->field -- confirm against fill_innodb_locks_from_cache()).
+The columns flagged MY_I_S_MAYBE_NULL are lock_index, lock_space,
+lock_page, lock_rec and lock_data. */
+static ST_FIELD_INFO	innodb_locks_fields_info[] =
+{
+#define IDX_LOCK_ID		0
+	{STRUCT_FLD(field_name,		"lock_id"),
+	 STRUCT_FLD(field_length,	TRX_I_S_LOCK_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TRX_ID		1
+	{STRUCT_FLD(field_name,		"lock_trx_id"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_MODE		2
+	{STRUCT_FLD(field_name,		"lock_mode"),
+	 /* S[,GAP] X[,GAP] IS[,GAP] IX[,GAP] AUTO_INC UNKNOWN */
+	 STRUCT_FLD(field_length,	32),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TYPE		3
+	{STRUCT_FLD(field_name,		"lock_type"),
+	 STRUCT_FLD(field_length,	32 /* RECORD|TABLE|UNKNOWN */),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_TABLE		4
+	{STRUCT_FLD(field_name,		"lock_table"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_INDEX		5
+	{STRUCT_FLD(field_name,		"lock_index"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_SPACE		6
+	{STRUCT_FLD(field_name,		"lock_space"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_PAGE		7
+	{STRUCT_FLD(field_name,		"lock_page"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_REC		8
+	{STRUCT_FLD(field_name,		"lock_rec"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED | MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_LOCK_DATA		9
+	{STRUCT_FLD(field_name,		"lock_data"),
+	 STRUCT_FLD(field_length,	TRX_I_S_LOCK_DATA_MAX_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_MAYBE_NULL),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Walk the rows cached for INFORMATION_SCHEMA.innodb_locks and emit one
+record into the given table for each of them.
+Every store goes through the OK() macro (defined elsewhere in this
+file), which presumably bails out with a non-zero value on failure.
+@return	0 on success */
+static
+int
+fill_innodb_locks_from_cache(
+/*=========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache to read from */
+	THD*			thd,	/*!< in: MySQL client connection */
+	TABLE*			table)	/*!< in/out: fill this table */
+{
+	DBUG_ENTER("fill_innodb_locks_from_cache");
+
+	Field** const	cols = table->field;
+	const ulint	n_rows = trx_i_s_cache_get_rows_used(
+		cache, I_S_INNODB_LOCKS);
+
+	/* scratch space for the textual lock id, reused by every row */
+	char		id_text[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+
+	for (ulint n = 0; n < n_rows; n++) {
+
+		i_s_locks_row_t* const	lock_row = (i_s_locks_row_t*)
+			trx_i_s_cache_get_nth_row(
+				cache, I_S_INNODB_LOCKS, n);
+
+		/* column lock_id */
+		trx_i_s_create_lock_id(lock_row, id_text, sizeof(id_text));
+		OK(field_store_string(cols[IDX_LOCK_ID], id_text));
+
+		/* column lock_trx_id */
+		char	trx_id_text[TRX_ID_MAX_LEN + 1];
+
+		ut_snprintf(trx_id_text, sizeof(trx_id_text),
+			    TRX_ID_FMT, lock_row->lock_trx_id);
+		OK(field_store_string(cols[IDX_LOCK_TRX_ID], trx_id_text));
+
+		/* column lock_mode */
+		OK(field_store_string(cols[IDX_LOCK_MODE],
+				      lock_row->lock_mode));
+
+		/* column lock_type */
+		OK(field_store_string(cols[IDX_LOCK_TYPE],
+				      lock_row->lock_type));
+
+		/* The converted name is not expected to outgrow this
+		buffer, which is sized as:
+		  NAME_LEN  database name
+		  2         quotes around the database name
+		  NAME_LEN  table name
+		  2         quotes around the table name
+		  1         the separating dot (.)
+		  9         the #mysql50# prefix
+		The same buffer is reused for the index name below. */
+		char		name_text[2 * NAME_LEN + 14];
+		const char*	name_end;
+
+		/* column lock_table */
+		name_end = innobase_convert_name(name_text,
+						 sizeof(name_text),
+						 lock_row->lock_table,
+						 strlen(lock_row->lock_table),
+						 thd, TRUE);
+		OK(cols[IDX_LOCK_TABLE]->store(name_text,
+					       name_end - name_text,
+					       system_charset_info));
+
+		/* column lock_index: NULL when the row has no index name */
+		if (lock_row->lock_index == NULL) {
+
+			cols[IDX_LOCK_INDEX]->set_null();
+		} else {
+
+			name_end = innobase_convert_name(
+				name_text, sizeof(name_text),
+				lock_row->lock_index,
+				strlen(lock_row->lock_index),
+				thd, FALSE);
+			OK(cols[IDX_LOCK_INDEX]->store(name_text,
+						       name_end - name_text,
+						       system_charset_info));
+			cols[IDX_LOCK_INDEX]->set_notnull();
+		}
+
+		/* columns lock_space, lock_page, lock_rec */
+		OK(field_store_ulint(cols[IDX_LOCK_SPACE],
+				     lock_row->lock_space));
+		OK(field_store_ulint(cols[IDX_LOCK_PAGE],
+				     lock_row->lock_page));
+		OK(field_store_ulint(cols[IDX_LOCK_REC],
+				     lock_row->lock_rec));
+
+		/* column lock_data */
+		OK(field_store_string(cols[IDX_LOCK_DATA],
+				      lock_row->lock_data));
+
+		/* the row is complete, hand it over to the server */
+		OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin load hook for INFORMATION_SCHEMA.innodb_locks: attaches the
+column list and the fill callback to the schema table object.
+@return	0 on success */
+static
+int
+innodb_locks_init(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("innodb_locks_init");
+
+	ST_SCHEMA_TABLE* const	i_s_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_table->fill_table = trx_i_s_common_fill_table;
+	i_s_table->fields_info = innodb_locks_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_LOCKS". innodb_locks_init() is
+registered as the load hook and i_s_common_deinit() as the unload hook.
+The entries are listed in the member order given by the per-field
+comments; keep that order when editing. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_locks =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_LOCKS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_locks_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* no status variables are exported by this plugin */
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* no system variables are exported by this plugin */
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_LOCKS". It repeats the values
+of i_s_innodb_locks up to system_vars and then supplies a version
+string and a maturity level instead of the reserved pointer. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_locks_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_LOCKS"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB conflicting locks"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, innodb_locks_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+        /* no status variables are exported by this plugin */
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* no system variables are exported by this plugin */
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_lock_waits.
+Each IDX_* macro carries the array position of the entry that follows
+it; fill_innodb_lock_waits_from_cache() indexes the table's Field array
+with these macros, so macro values and entry order must stay in sync.
+All four columns are non-nullable strings. */
+static ST_FIELD_INFO	innodb_lock_waits_fields_info[] =
+{
+#define IDX_REQUESTING_TRX_ID	0
+	{STRUCT_FLD(field_name,		"requesting_trx_id"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_REQUESTED_LOCK_ID	1
+	{STRUCT_FLD(field_name,		"requested_lock_id"),
+	 STRUCT_FLD(field_length,	TRX_I_S_LOCK_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_TRX_ID	2
+	{STRUCT_FLD(field_name,		"blocking_trx_id"),
+	 STRUCT_FLD(field_length,	TRX_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+#define IDX_BLOCKING_LOCK_ID	3
+	{STRUCT_FLD(field_name,		"blocking_lock_id"),
+	 STRUCT_FLD(field_length,	TRX_I_S_LOCK_ID_MAX_LEN + 1),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* terminator entry; must stay last */
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Walk the rows cached for INFORMATION_SCHEMA.innodb_lock_waits and emit
+one record into the given table for each of them.
+Every store goes through the OK() macro (defined elsewhere in this
+file), which presumably bails out with a non-zero value on failure.
+@return	0 on success */
+static
+int
+fill_innodb_lock_waits_from_cache(
+/*==============================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache to read from */
+	THD*			thd,	/*!< in: used to call
+					schema_table_store_record() */
+	TABLE*			table)	/*!< in/out: fill this table */
+{
+	DBUG_ENTER("fill_innodb_lock_waits_from_cache");
+
+	Field** const	cols = table->field;
+	const ulint	n_rows = trx_i_s_cache_get_rows_used(
+		cache, I_S_INNODB_LOCK_WAITS);
+
+	/* scratch buffers for the two lock ids, reused by every row */
+	char		requested_id_text[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+	char		blocking_id_text[TRX_I_S_LOCK_ID_MAX_LEN + 1];
+
+	for (ulint n = 0; n < n_rows; n++) {
+
+		i_s_lock_waits_row_t* const	wait_row
+			= (i_s_lock_waits_row_t*)
+			trx_i_s_cache_get_nth_row(
+				cache, I_S_INNODB_LOCK_WAITS, n);
+
+		/* column requesting_trx_id */
+		char	requesting_trx_text[TRX_ID_MAX_LEN + 1];
+
+		ut_snprintf(requesting_trx_text,
+			    sizeof(requesting_trx_text),
+			    TRX_ID_FMT,
+			    wait_row->requested_lock_row->lock_trx_id);
+		OK(field_store_string(cols[IDX_REQUESTING_TRX_ID],
+				      requesting_trx_text));
+
+		/* column requested_lock_id */
+		const char*	requested_id = trx_i_s_create_lock_id(
+			wait_row->requested_lock_row,
+			requested_id_text,
+			sizeof(requested_id_text));
+
+		OK(field_store_string(cols[IDX_REQUESTED_LOCK_ID],
+				      requested_id));
+
+		/* column blocking_trx_id */
+		char	blocking_trx_text[TRX_ID_MAX_LEN + 1];
+
+		ut_snprintf(blocking_trx_text,
+			    sizeof(blocking_trx_text),
+			    TRX_ID_FMT,
+			    wait_row->blocking_lock_row->lock_trx_id);
+		OK(field_store_string(cols[IDX_BLOCKING_TRX_ID],
+				      blocking_trx_text));
+
+		/* column blocking_lock_id */
+		const char*	blocking_id = trx_i_s_create_lock_id(
+			wait_row->blocking_lock_row,
+			blocking_id_text,
+			sizeof(blocking_id_text));
+
+		OK(field_store_string(cols[IDX_BLOCKING_LOCK_ID],
+				      blocking_id));
+
+		/* the row is complete, hand it over to the server */
+		OK(schema_table_store_record(thd, table));
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin load hook for INFORMATION_SCHEMA.innodb_lock_waits: attaches the
+column list and the fill callback to the schema table object.
+@return	0 on success */
+static
+int
+innodb_lock_waits_init(
+/*===================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("innodb_lock_waits_init");
+
+	ST_SCHEMA_TABLE* const	i_s_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_table->fill_table = trx_i_s_common_fill_table;
+	i_s_table->fields_info = innodb_lock_waits_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_LOCK_WAITS".
+innodb_lock_waits_init() is registered as the load hook and
+i_s_common_deinit() as the unload hook. The entries are listed in the
+member order given by the per-field comments; keep that order when
+editing. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_lock_waits =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* NOTE(review): the sibling descriptors in this file pass
+	plugin_author here instead of a literal -- confirm whether the
+	literal is intentional */
+	/* const char* */
+	STRUCT_FLD(author, "Innobase Oy"),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, innodb_lock_waits_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* no status variables are exported by this plugin */
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* no system variables are exported by this plugin */
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_LOCK_WAITS". It repeats the
+values of i_s_innodb_lock_waits up to system_vars and then supplies a
+version string and a maturity level instead of the reserved pointer. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_lock_waits_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_LOCK_WAITS"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* NOTE(review): the sibling descriptors in this file pass
+        plugin_author here instead of a literal -- confirm whether the
+        literal is intentional */
+        /* const char* */
+        STRUCT_FLD(author, "Innobase Oy"),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "InnoDB which lock is blocking which"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, innodb_lock_waits_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+        /* no status variables are exported by this plugin */
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* no system variables are exported by this plugin */
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/*******************************************************************//**
+Shared fill callback for the dynamic tables
+INFORMATION_SCHEMA.innodb_trx
+INFORMATION_SCHEMA.innodb_locks
+INFORMATION_SCHEMA.innodb_lock_waits
+It refreshes the shared cache, then dispatches on the schema table name
+to the matching fill_*_from_cache() routine.
+@return	always 0 at present, see the note before the final return */
+static
+int
+trx_i_s_common_fill_table(
+/*======================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (not used) */
+{
+	DBUG_ENTER("trx_i_s_common_fill_table");
+
+	/* deny access to callers that fail the PROCESS_ACL check; they
+	get success with no rows */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	/* read the global cache pointer once so that the rest of the
+	function does not touch the global variable again */
+	trx_i_s_cache_t* const	cache = trx_i_s_cache;
+
+	/* the table being filled is identified by its name
+	(tables->schema_table->table_name would be an alternative) */
+	const char* const	table_name = tables->schema_table_name;
+
+	RETURN_IF_INNODB_NOT_STARTED(table_name);
+
+	/* refresh the cache inside a start_write/end_write bracket */
+	trx_i_s_cache_start_write(cache);
+	trx_i_s_possibly_fetch_data_into_cache(cache);
+	trx_i_s_cache_end_write(cache);
+
+	if (trx_i_s_cache_is_truncated(cache)) {
+
+		/* XXX show warning to user if possible */
+		fprintf(stderr, "Warning: data in %s truncated due to "
+			"memory limit of %d bytes\n", table_name,
+			TRX_I_S_MEM_LIMIT);
+	}
+
+	int	ret;
+
+	trx_i_s_cache_start_read(cache);
+
+	if (innobase_strcasecmp(table_name, "innodb_trx") == 0) {
+
+		ret = (fill_innodb_trx_from_cache(
+			       cache, thd, tables->table) != 0) ? 1 : 0;
+
+	} else if (innobase_strcasecmp(table_name, "innodb_locks") == 0) {
+
+		ret = (fill_innodb_locks_from_cache(
+			       cache, thd, tables->table) != 0) ? 1 : 0;
+
+	} else if (innobase_strcasecmp(table_name, "innodb_lock_waits") == 0) {
+
+		ret = (fill_innodb_lock_waits_from_cache(
+			       cache, thd, tables->table) != 0) ? 1 : 0;
+
+	} else {
+
+		/* none of the three known names matched */
+		fprintf(stderr,
+			"InnoDB: trx_i_s_common_fill_table() was "
+			"called to fill unknown table: %s.\n"
+			"This function only knows how to fill "
+			"innodb_trx, innodb_locks and "
+			"innodb_lock_waits tables.\n", table_name);
+
+		ret = 1;
+	}
+
+	trx_i_s_cache_end_read(cache);
+
+#if 0
+	DBUG_RETURN(ret);
+#else
+	/* Returning anything but 0 from here makes the mysqld server
+	and the mysql client deadlock, see http://bugs.mysql.com/29900 ;
+	once that bug is resolved the DBUG_RETURN(ret) above can be
+	enabled. Until then ret is computed but not reported. */
+	DBUG_RETURN(0);
+#endif
+}
+
+/* Fields of the dynamic table information_schema.innodb_cmp.
+The same list is installed for innodb_cmp_reset by
+i_s_cmp_reset_init(). i_s_cmp_fill_low() addresses these columns by
+literal position (field[0] .. field[5]), so the entry order here must
+match that function. */
+static ST_FIELD_INFO	i_s_cmp_fields_info[] =
+{
+	/* position 0 */
+	{STRUCT_FLD(field_name,		"page_size"),
+	 STRUCT_FLD(field_length,	5),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Compressed Page Size"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 1 */
+	{STRUCT_FLD(field_name,		"compress_ops"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Number of Compressions"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 2 */
+	{STRUCT_FLD(field_name,		"compress_ops_ok"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Number of"
+					" Successful Compressions"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 3 */
+	{STRUCT_FLD(field_name,		"compress_time"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Duration of Compressions,"
+		    " in Seconds"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 4 */
+	{STRUCT_FLD(field_name,		"uncompress_ops"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Number of Decompressions"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 5 */
+	{STRUCT_FLD(field_name,		"uncompress_time"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Duration of Decompressions,"
+		    " in Seconds"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* terminator entry; must stay last */
+	END_OF_ST_FIELD_INFO
+};
+
+
+/*******************************************************************//**
+Worker behind information_schema.innodb_cmp and innodb_cmp_reset:
+emits one record per entry of page_zip_stat[] and, when asked to,
+zeroes each entry right after it has been read.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill_low(
+/*=============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond,	/*!< in: condition (ignored) */
+	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
+{
+	DBUG_ENTER("i_s_cmp_fill_low");
+
+	/* deny access to callers that fail the PROCESS_ACL check; they
+	get success with no rows */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	TABLE* const	dest = tables->table;
+
+	for (uint ssize = 0; ssize < PAGE_ZIP_NUM_SSIZE - 1; ssize++) {
+		page_zip_stat_t* const	stat = &page_zip_stat[ssize];
+		Field** const		col = dest->field;
+
+		/* page_size */
+		col[0]->store(PAGE_ZIP_MIN_SIZE << ssize);
+
+		/* No mutex guards the cumulated counts, so page0zip.c
+		may bump a counter between our read and the reset below.
+		Mutex protection could be added, but it might cost
+		measurable performance in page0zip.c. */
+		col[1]->store(stat->compressed);
+		col[2]->store(stat->compressed_ok);
+		/* the *_usec counters are reported in whole seconds */
+		col[3]->store(
+			(ulong) (stat->compressed_usec / 1000000));
+		col[4]->store(stat->decompressed);
+		col[5]->store(
+			(ulong) (stat->decompressed_usec / 1000000));
+
+		/* the reset happens before the record is stored */
+		if (reset) {
+			memset(stat, 0, sizeof *stat);
+		}
+
+		if (schema_table_store_record(thd, dest)) {
+			/* stop at the first record that cannot be stored */
+			DBUG_RETURN(1);
+		}
+	}
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Fill callback of information_schema.innodb_cmp: reports the counters
+through i_s_cmp_fill_low() without asking for a reset.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_fill(
+/*=========*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	/* FALSE: leave the cumulated counts untouched */
+	const int	rc = i_s_cmp_fill_low(thd, tables, cond, FALSE);
+
+	return(rc);
+}
+
+/*******************************************************************//**
+Fill callback of information_schema.innodb_cmp_reset: reports the
+counters through i_s_cmp_fill_low() and asks it to reset them.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmp_reset_fill(
+/*===============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	/* TRUE: reset the cumulated counts as they are reported */
+	const int	rc = i_s_cmp_fill_low(thd, tables, cond, TRUE);
+
+	return(rc);
+}
+
+/*******************************************************************//**
+Plugin load hook for information_schema.innodb_cmp: attaches the
+column list and the fill callback to the schema table object.
+@return	0 on success */
+static
+int
+i_s_cmp_init(
+/*=========*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	i_s_table;
+
+	DBUG_ENTER("i_s_cmp_init");
+
+	i_s_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_table->fill_table = i_s_cmp_fill;
+	i_s_table->fields_info = i_s_cmp_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin load hook for information_schema.innodb_cmp_reset: attaches the
+innodb_cmp column list and the resetting fill callback to the schema
+table object.
+@return	0 on success */
+static
+int
+i_s_cmp_reset_init(
+/*===============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	i_s_table;
+
+	DBUG_ENTER("i_s_cmp_reset_init");
+
+	i_s_table = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	i_s_table->fill_table = i_s_cmp_reset_fill;
+	i_s_table->fields_info = i_s_cmp_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_CMP". i_s_cmp_init() is
+registered as the load hook and i_s_common_deinit() as the unload hook.
+The entries are listed in the member order given by the per-field
+comments; keep that order when editing. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_CMP"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_cmp_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* no status variables are exported by this plugin */
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* no system variables are exported by this plugin */
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_CMP". It repeats the values of
+i_s_innodb_cmp up to system_vars and then supplies a version string and
+a maturity level instead of the reserved pointer. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_cmp_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_CMP"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "Statistics for the InnoDB compression"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, i_s_cmp_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+        /* no status variables are exported by this plugin */
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* no system variables are exported by this plugin */
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* MySQL-style plugin descriptor (struct st_mysql_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_CMP_RESET".
+i_s_cmp_reset_init() is registered as the load hook and
+i_s_common_deinit() as the unload hook. The entries are listed in the
+member order given by the per-field comments; keep that order when
+editing. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmp_reset =
+{
+	/* the plugin type (a MYSQL_XXX_PLUGIN value) */
+	/* int */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+	/* pointer to type-specific plugin descriptor */
+	/* void* */
+	STRUCT_FLD(info, &i_s_info),
+
+	/* plugin name */
+	/* const char* */
+	STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+	/* plugin author (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(author, plugin_author),
+
+	/* general descriptive text (for SHOW PLUGINS) */
+	/* const char* */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+		   " reset cumulated counts"),
+
+	/* the plugin license (PLUGIN_LICENSE_XXX) */
+	/* int */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+	/* the function to invoke when plugin is loaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(init, i_s_cmp_reset_init),
+
+	/* the function to invoke when plugin is unloaded */
+	/* int (*)(void*); */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+
+	/* plugin version (for SHOW PLUGINS) */
+	/* unsigned int */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+	/* no status variables are exported by this plugin */
+	/* struct st_mysql_show_var* */
+	STRUCT_FLD(status_vars, NULL),
+
+	/* no system variables are exported by this plugin */
+	/* struct st_mysql_sys_var** */
+	STRUCT_FLD(system_vars, NULL),
+
+	/* reserved for dependency checking */
+	/* void* */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB-style plugin descriptor (struct st_maria_plugin) for the
+INFORMATION_SCHEMA plugin named "INNODB_CMP_RESET". It repeats the
+values of i_s_innodb_cmp_reset up to system_vars and then supplies a
+version string and a maturity level instead of the reserved pointer. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_cmp_reset_maria =
+{
+        /* the plugin type (a MYSQL_XXX_PLUGIN value) */
+        /* int */
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+
+        /* pointer to type-specific plugin descriptor */
+        /* void* */
+        STRUCT_FLD(info, &i_s_info),
+
+        /* plugin name */
+        /* const char* */
+        STRUCT_FLD(name, "INNODB_CMP_RESET"),
+
+        /* plugin author (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(author, plugin_author),
+
+        /* general descriptive text (for SHOW PLUGINS) */
+        /* const char* */
+        STRUCT_FLD(descr, "Statistics for the InnoDB compression;"
+                   " reset cumulated counts"),
+
+        /* the plugin license (PLUGIN_LICENSE_XXX) */
+        /* int */
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+
+        /* the function to invoke when plugin is loaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(init, i_s_cmp_reset_init),
+
+        /* the function to invoke when plugin is unloaded */
+        /* int (*)(void*); */
+        STRUCT_FLD(deinit, i_s_common_deinit),
+
+        /* plugin version (for SHOW PLUGINS) */
+        /* unsigned int */
+        STRUCT_FLD(version, INNODB_VERSION_SHORT),
+
+        /* no status variables are exported by this plugin */
+        /* struct st_mysql_show_var* */
+        STRUCT_FLD(status_vars, NULL),
+
+        /* no system variables are exported by this plugin */
+        /* struct st_mysql_sys_var** */
+        STRUCT_FLD(system_vars, NULL),
+
+        /* string version */
+        /* const char * */
+        STRUCT_FLD(version_info, "1.0"),
+
+        /* Maturity */
+        /* int */
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+/* Fields of the dynamic table information_schema.innodb_cmpmem.
+The same list is installed for innodb_cmpmem_reset by
+i_s_cmpmem_reset_init(). i_s_cmpmem_fill_low() addresses these columns
+by literal position (field[0] .. field[4]), so the entry order here
+must match that function. */
+static ST_FIELD_INFO	i_s_cmpmem_fields_info[] =
+{
+	/* position 0 */
+	{STRUCT_FLD(field_name,		"page_size"),
+	 STRUCT_FLD(field_length,	5),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Buddy Block Size"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 1 */
+	{STRUCT_FLD(field_name,		"pages_used"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Currently in Use"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 2 */
+	{STRUCT_FLD(field_name,		"pages_free"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Currently Available"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 3; the only 64-bit column of this table */
+	{STRUCT_FLD(field_name,		"relocation_ops"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Number of Relocations"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* position 4 */
+	{STRUCT_FLD(field_name,		"relocation_time"),
+	 STRUCT_FLD(field_length,	MY_INT32_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		"Total Duration of Relocations,"
+		    " in Seconds"),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* terminator entry; must stay last */
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Worker behind information_schema.innodb_cmpmem and
+innodb_cmpmem_reset: emits one record per entry of buf_buddy_stat[]
+(indexes 0 .. BUF_BUDDY_SIZES inclusive) and, when asked to, zeroes the
+relocation counters of each entry right after reading them.
+zip_free_mutex is held for the whole scan.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill_low(
+/*================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond,	/*!< in: condition (ignored) */
+	ibool		reset)	/*!< in: TRUE=reset cumulated counts */
+{
+	DBUG_ENTER("i_s_cmpmem_fill_low");
+
+	/* deny access to callers that fail the PROCESS_ACL check; they
+	get success with no rows */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	TABLE* const	dest = tables->table;
+	int		status = 0;
+
+	/* this code takes zip_free_mutex where a buf_pool_mutex_enter()
+	call used to be */
+	mutex_enter(&zip_free_mutex);
+
+	/* the loop ends after the first record that cannot be stored */
+	for (uint x = 0; status == 0 && x <= BUF_BUDDY_SIZES; x++) {
+		buf_buddy_stat_t* const	stat = &buf_buddy_stat[x];
+		Field** const		col = dest->field;
+
+		col[0]->store(BUF_BUDDY_LOW << x);
+		col[1]->store(stat->used);
+		/* there is no zip_free list for x == BUF_BUDDY_SIZES,
+		report 0 free blocks for that last size */
+		col[2]->store(UNIV_LIKELY(x < BUF_BUDDY_SIZES)
+			      ? UT_LIST_GET_LEN(buf_pool->zip_free[x])
+			      : 0);
+		col[3]->store((longlong) stat->relocated, true);
+		/* relocated_usec is reported in whole seconds */
+		col[4]->store(
+			(ulong) (stat->relocated_usec / 1000000));
+
+		if (reset) {
+			/* NOTE(review): the previous comment named
+			buf_pool_mutex as the guard of these counters,
+			while this function holds zip_free_mutex --
+			confirm which mutex protects buf_buddy_stat. */
+			stat->relocated = 0;
+			stat->relocated_usec = 0;
+		}
+
+		if (schema_table_store_record(thd, dest)) {
+			status = 1;
+		}
+	}
+
+	mutex_exit(&zip_free_mutex);
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Fill callback of information_schema.innodb_cmpmem: reports the
+statistics through i_s_cmpmem_fill_low() without asking for a reset.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_fill(
+/*============*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	/* FALSE: leave the cumulated counts untouched */
+	const int	rc = i_s_cmpmem_fill_low(thd, tables, cond, FALSE);
+
+	return(rc);
+}
+
+/*******************************************************************//**
+fill_table callback for information_schema.innodb_cmpmem_reset; reports
+the statistics and resets the cumulated counters.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_cmpmem_reset_fill(
+/*==================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	const int	err = i_s_cmpmem_fill_low(thd, tables, cond, TRUE);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Plugin init callback for information_schema.innodb_cmpmem: installs the
+column definitions and the fill function on the schema table object.
+@return	0 on success */
+static
+int
+i_s_cmpmem_init(
+/*============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_cmpmem_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_cmpmem_fill;
+	schema_table->fields_info = i_s_cmpmem_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin init callback for information_schema.innodb_cmpmem_reset: installs
+the column definitions and the resetting fill function on the schema
+table object.
+@return	0 on success */
+static
+int
+i_s_cmpmem_reset_init(
+/*==================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_cmpmem_reset_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_cmpmem_reset_fill;
+	schema_table->fields_info = i_s_cmpmem_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.INNODB_CMPMEM. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_CMPMEM"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_cmpmem_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* void*: reserved for dependency checking */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.INNODB_CMPMEM. */
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem_maria =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_CMPMEM"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_cmpmem_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* const char*: string version */
+	STRUCT_FLD(version_info, "1.0"),
+	/* int: maturity */
+	STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.INNODB_CMPMEM_RESET. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_cmpmem_reset =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+		   " reset cumulated counts"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_cmpmem_reset_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* void*: reserved for dependency checking */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.INNODB_CMPMEM_RESET. */
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_cmpmem_reset_maria =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_CMPMEM_RESET"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "Statistics for the InnoDB compressed buffer pool;"
+		   " reset cumulated counts"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_cmpmem_reset_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* const char*: string version */
+	STRUCT_FLD(version_info, "1.0"),
+	/* int: maturity */
+	STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/*******************************************************************//**
+Shared plugin deinit callback for the dynamic INFORMATION_SCHEMA tables
+in this file. It performs no work.
+@return	0 on success */
+static
+int
+i_s_common_deinit(
+/*==============*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	DBUG_ENTER("i_s_common_deinit");
+
+	/* Intentionally empty: there is nothing to undo here, and the
+	schema object passed in is left untouched. */
+	(void) p;
+
+	DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Column definitions of the INFORMATION_SCHEMA table served by the
+INNODB_RSEG plugin. All six columns are unsigned 64-bit integers. The
+order must stay in sync with the field[] indexes written by
+i_s_innodb_rseg_fill().
+*/
+static ST_FIELD_INFO	i_s_innodb_rseg_fields_info[] =
+{
+	/* field[0]: filled from rseg->id */
+	{STRUCT_FLD(field_name,		"rseg_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[1]: filled from rseg->space */
+	{STRUCT_FLD(field_name,		"space_id"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[2]: filled from rseg->zip_size */
+	{STRUCT_FLD(field_name,		"zip_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[3]: filled from rseg->page_no */
+	{STRUCT_FLD(field_name,		"page_no"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[4]: filled from rseg->max_size */
+	{STRUCT_FLD(field_name,		"max_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[5]: filled from rseg->curr_size */
+	{STRUCT_FLD(field_name,		"curr_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/***********************************************************************
+fill_table callback of the INNODB_RSEG plugin: emits one row for every
+rollback segment on trx_sys->rseg_list.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_rseg_fill(
+/*=================*/
+	THD*		thd,	/* in: thread */
+	TABLE_LIST*	tables,	/* in/out: tables to fill */
+	COND*		cond)	/* in: condition (ignored) */
+{
+	trx_rseg_t*	rseg;
+	int		status	= 0;
+	TABLE*		table	= (TABLE *) tables->table;
+
+	DBUG_ENTER("i_s_innodb_rseg_fill");
+
+	/* A caller without the PROCESS privilege gets an empty result
+	set (return value 0), not an error. */
+	if (check_global_access(thd, PROCESS_ACL)) {
+
+		DBUG_RETURN(0);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	for (rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+	     rseg != NULL;
+	     rseg = UT_LIST_GET_NEXT(rseg_list, rseg)) {
+
+		/* Column order follows i_s_innodb_rseg_fields_info[]. */
+		table->field[0]->store(rseg->id);
+		table->field[1]->store(rseg->space);
+		table->field[2]->store(rseg->zip_size);
+		table->field[3]->store(rseg->page_no);
+		table->field[4]->store(rseg->max_size);
+		table->field[5]->store(rseg->curr_size);
+
+		if (schema_table_store_record(thd, table)) {
+			status = 1;
+			break;
+		}
+	}
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Plugin init callback of the INNODB_RSEG plugin: installs the column
+definitions and the fill function on the schema table object. */
+static
+int
+i_s_innodb_rseg_init(
+/*=================*/
+			/* out: 0 on success */
+	void*	p)	/* in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_innodb_rseg_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_innodb_rseg_fill;
+	schema_table->fields_info = i_s_innodb_rseg_fields_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.INNODB_RSEG. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_rseg =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_RSEG"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "InnoDB rollback segment information"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_innodb_rseg_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* void*: reserved for dependency checking */
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.INNODB_RSEG. */
+UNIV_INTERN struct st_maria_plugin	i_s_innodb_rseg_maria =
+{
+	/* int: the plugin type (a MYSQL_XXX_PLUGIN value) */
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	/* void*: pointer to type-specific plugin descriptor */
+	STRUCT_FLD(info, &i_s_info),
+	/* const char*: plugin name */
+	STRUCT_FLD(name, "INNODB_RSEG"),
+	/* const char*: plugin author (for SHOW PLUGINS) */
+	STRUCT_FLD(author, plugin_author),
+	/* const char*: general descriptive text (for SHOW PLUGINS) */
+	STRUCT_FLD(descr, "InnoDB rollback segment information"),
+	/* int: the plugin license (PLUGIN_LICENSE_XXX) */
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* int (*)(void*): invoked when the plugin is loaded */
+	STRUCT_FLD(init, i_s_innodb_rseg_init),
+	/* int (*)(void*): invoked when the plugin is unloaded */
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	/* unsigned int: plugin version (for SHOW PLUGINS) */
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	/* struct st_mysql_show_var*: none */
+	STRUCT_FLD(status_vars, NULL),
+	/* struct st_mysql_sys_var**: none */
+	STRUCT_FLD(system_vars, NULL),
+	/* const char*: string version */
+	STRUCT_FLD(version_info, "1.0"),
+	/* int: maturity */
+	STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/***********************************************************************
+Column definitions of the INFORMATION_SCHEMA table served by the
+INNODB_TABLE_STATS plugin. The order must stay in sync with the field[]
+indexes written by i_s_innodb_table_stats_fill().
+*/
+static ST_FIELD_INFO	i_s_innodb_table_stats_info[] =
+{
+	/* field[0]: the part of dict_table_t::name before the '/' */
+	{STRUCT_FLD(field_name,		"table_schema"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[1]: the part of dict_table_t::name after the '/' */
+	{STRUCT_FLD(field_name,		"table_name"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[2]: filled from table->stat_n_rows */
+	{STRUCT_FLD(field_name,		"rows"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[3]: filled from table->stat_clustered_index_size */
+	{STRUCT_FLD(field_name,		"clust_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[4]: filled from table->stat_sum_of_other_index_sizes */
+	{STRUCT_FLD(field_name,		"other_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[5]: filled from table->stat_modified_counter */
+	{STRUCT_FLD(field_name,		"modified"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/* Column definitions of the INFORMATION_SCHEMA table served by the
+INNODB_INDEX_STATS plugin. The order must stay in sync with the field[]
+indexes written by i_s_innodb_index_stats_fill(). */
+static ST_FIELD_INFO	i_s_innodb_index_stats_info[] =
+{
+	/* field[0]: the part of dict_table_t::name before the '/' */
+	{STRUCT_FLD(field_name,		"table_schema"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[1]: the part of dict_table_t::name after the '/' */
+	{STRUCT_FLD(field_name,		"table_name"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[2]: filled from index->name */
+	{STRUCT_FLD(field_name,		"index_name"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[3]: filled from index->n_uniq */
+	{STRUCT_FLD(field_name,		"fields"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[4]: ", "-separated list with one rows-per-key estimate
+	for each of the first 1..n_uniq key columns, built as text by the
+	fill function (hence a string column of 256 characters) */
+	{STRUCT_FLD(field_name,		"row_per_keys"),
+	 STRUCT_FLD(field_length,	256),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[5]: filled from index->stat_index_size */
+	{STRUCT_FLD(field_name,		"index_size"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* field[6]: filled from index->stat_n_leaf_pages */
+	{STRUCT_FLD(field_name,		"leaf_pages"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/***********************************************************************
+fill_table callback of the INNODB_TABLE_STATS plugin: emits one row of
+in-memory statistics for every table on dict_sys->table_LRU whose
+stat_clustered_index_size is non-zero. The dictionary mutex is held
+while the list is walked.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_table_stats_fill(
+/*========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	TABLE*	i_s_table	= (TABLE *) tables->table;
+	int	status	= 0;
+	dict_table_t*	table;
+
+	DBUG_ENTER("i_s_innodb_table_stats_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* dict_sys is dereferenced below; bail out the same way the other
+	fill functions in this file do when InnoDB is not running. */
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		char	buf[NAME_LEN * 2 + 2];
+		char*	ptr;
+
+		/* NOTE(review): a zero clustered index size presumably
+		means the statistics were never calculated - confirm. */
+		if (table->stat_clustered_index_size == 0) {
+			continue;
+		}
+
+		/* Split "schema/table" in place. The terminator is set
+		first because strncpy() does not add one on truncation. */
+		buf[NAME_LEN * 2 + 1] = 0;
+		strncpy(buf, table->name, NAME_LEN * 2 + 1);
+		ptr = strchr(buf, '/');
+		if (ptr) {
+			*ptr = '\0';
+			++ptr;
+		} else {
+			/* no separator: the whole name is reported in
+			both columns */
+			ptr = buf;
+		}
+
+		field_store_string(i_s_table->field[0], buf);
+		field_store_string(i_s_table->field[1], ptr);
+		i_s_table->field[2]->store(table->stat_n_rows, 1);
+		i_s_table->field[3]->store(table->stat_clustered_index_size);
+		i_s_table->field[4]->store(table->stat_sum_of_other_index_sizes);
+		i_s_table->field[5]->store(table->stat_modified_counter);
+
+		if (schema_table_store_record(thd, i_s_table)) {
+			status = 1;
+			break;
+		}
+	}
+
+	mutex_exit(&(dict_sys->mutex));
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+fill_table callback of the INNODB_INDEX_STATS plugin: for every table on
+dict_sys->table_LRU whose stat_clustered_index_size is non-zero, emits
+one row of in-memory statistics per index. The dictionary mutex is held
+while the lists are walked.
+@return	0 on success, 1 on failure */
+static
+int
+i_s_innodb_index_stats_fill(
+/*========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	TABLE*	i_s_table	= (TABLE *) tables->table;
+	int	status	= 0;
+	dict_table_t*	table;
+	dict_index_t*	index;
+
+	DBUG_ENTER("i_s_innodb_index_stats_fill");
+
+	/* deny access to non-superusers */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* dict_sys is dereferenced below; bail out the same way the other
+	fill functions in this file do when InnoDB is not running. */
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	for (table = UT_LIST_GET_FIRST(dict_sys->table_LRU);
+	     table != NULL;
+	     table = UT_LIST_GET_NEXT(table_LRU, table)) {
+		char		buf[NAME_LEN * 2 + 2];
+		char*		ptr;
+		ib_int64_t	n_rows;
+
+		if (table->stat_clustered_index_size == 0) {
+			continue;
+		}
+
+		/* a negative row estimate is treated as zero */
+		n_rows = table->stat_n_rows;
+
+		if (n_rows < 0) {
+			n_rows = 0;
+		}
+
+		/* Split "schema/table" in place, once per table rather
+		than once per index. The terminator is set first because
+		strncpy() does not add one on truncation. */
+		buf[NAME_LEN * 2 + 1] = 0;
+		strncpy(buf, table->name, NAME_LEN * 2 + 1);
+		ptr = strchr(buf, '/');
+		if (ptr) {
+			*ptr = '\0';
+			++ptr;
+		} else {
+			ptr = buf;
+		}
+
+		for (index = dict_table_get_first_index(table);
+		     index != NULL;
+		     index = dict_table_get_next_index(index)) {
+			char	buff[256+1];
+			char	row_per_keys[256+1];
+			ulint	i;
+
+			field_store_string(i_s_table->field[0], buf);
+			field_store_string(i_s_table->field[1], ptr);
+			field_store_string(i_s_table->field[2], index->name);
+			i_s_table->field[3]->store(index->n_uniq);
+
+			row_per_keys[0] = '\0';
+
+			/* The per-index statistics are read without any
+			index-specific lock ("optimistic"), so the values
+			may be concurrently updated. */
+			if (index->stat_n_diff_key_vals) {
+				for (i = 1; i <= index->n_uniq; i++) {
+					ib_int64_t	rec_per_key;
+					if (index->stat_n_diff_key_vals[i]) {
+						rec_per_key = n_rows / index->stat_n_diff_key_vals[i];
+					} else {
+						/* no distinct-value estimate:
+						avoid dividing by zero */
+						rec_per_key = n_rows;
+					}
+					ut_snprintf(buff, 256, (i == index->n_uniq)?"%llu":"%llu, ",
+						 rec_per_key);
+					/* append at most what still fits
+					in the 256 usable bytes; surplus
+					text is silently truncated */
+					strncat(row_per_keys, buff, 256 - strlen(row_per_keys));
+				}
+			}
+
+			field_store_string(i_s_table->field[4], row_per_keys);
+
+			i_s_table->field[5]->store(index->stat_index_size);
+			i_s_table->field[6]->store(index->stat_n_leaf_pages);
+
+			if (schema_table_store_record(thd, i_s_table)) {
+				status = 1;
+				break;
+			}
+		}
+
+		if (status == 1) {
+			break;
+		}
+	}
+
+	mutex_exit(&(dict_sys->mutex));
+
+	DBUG_RETURN(status);
+}
+
+/***********************************************************************
+Plugin init callback of the INNODB_TABLE_STATS plugin: installs the
+column definitions and the fill function on the schema table object.
+@return	0 on success */
+static
+int
+i_s_innodb_table_stats_init(
+/*========================*/
+	void*   p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_innodb_table_stats_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_innodb_table_stats_fill;
+	schema_table->fields_info = i_s_innodb_table_stats_info;
+
+	DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Plugin init callback of the INNODB_INDEX_STATS plugin: installs the
+column definitions and the fill function on the schema table object.
+@return	0 on success */
+static
+int
+i_s_innodb_index_stats_init(
+/*========================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_innodb_index_stats_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_innodb_index_stats_fill;
+	schema_table->fields_info = i_s_innodb_index_stats_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.INNODB_TABLE_STATS.
+The initializers are positional in the order of the struct's fields. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_table_stats =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_TABLE_STATS"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB table statistics in memory"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* called when the plugin is loaded / unloaded */
+	STRUCT_FLD(init, i_s_innodb_table_stats_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	/* no status or system variables */
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.INNODB_TABLE_STATS;
+same content as i_s_innodb_table_stats plus version string and maturity. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_table_stats_maria =
+{
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+        STRUCT_FLD(info, &i_s_info),
+        STRUCT_FLD(name, "INNODB_TABLE_STATS"),
+        STRUCT_FLD(author, plugin_author),
+        STRUCT_FLD(descr, "InnoDB table statistics in memory"),
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+        /* called when the plugin is loaded / unloaded */
+        STRUCT_FLD(init, i_s_innodb_table_stats_init),
+        STRUCT_FLD(deinit, i_s_common_deinit),
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+        /* no status or system variables */
+        STRUCT_FLD(status_vars, NULL),
+        STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.INNODB_INDEX_STATS.
+The initializers are positional in the order of the struct's fields. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_index_stats =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_INDEX_STATS"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB index statistics in memory"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* called when the plugin is loaded / unloaded */
+	STRUCT_FLD(init, i_s_innodb_index_stats_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	/* no status or system variables */
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.INNODB_INDEX_STATS;
+same content as i_s_innodb_index_stats plus version string and maturity. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_index_stats_maria =
+{
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+        STRUCT_FLD(info, &i_s_info),
+        STRUCT_FLD(name, "INNODB_INDEX_STATS"),
+        STRUCT_FLD(author, plugin_author),
+        STRUCT_FLD(descr, "InnoDB index statistics in memory"),
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+        /* called when the plugin is loaded / unloaded */
+        STRUCT_FLD(init, i_s_innodb_index_stats_init),
+        STRUCT_FLD(deinit, i_s_common_deinit),
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+        /* no status or system variables */
+        STRUCT_FLD(status_vars, NULL),
+        STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/***********************************************************************
+Column definitions of the INFORMATION_SCHEMA table served by the
+XTRADB_ADMIN_COMMAND plugin: a single string column that carries the
+outcome message written by i_s_innodb_admin_command_fill().
+*/
+static ST_FIELD_INFO	i_s_innodb_admin_command_info[] =
+{
+	/* field[0]: human-readable result text, up to 1024 characters */
+	{STRUCT_FLD(field_name,		"result_message"),
+	 STRUCT_FLD(field_length,	1024),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/* Fail the build early if the server was not configured with the InnoDB
+compatibility hooks. */
+#ifndef INNODB_COMPATIBILITY_HOOKS
+#error InnoDB needs MySQL to be built with #define INNODB_COMPATIBILITY_HOOKS
+#endif
+
+/* Declared here with C linkage. i_s_innodb_admin_command_fill() calls it
+and dereferences the result to scan the text of the current statement. */
+extern "C" {
+char **thd_query(MYSQL_THD thd);
+}
+
+/***********************************************************************
+fill_table callback of the XTRADB_ADMIN_COMMAND plugin. Looks for an
+"XTRA_" token (case-insensitive, outside of `...` and "..." quoting) in
+the text of the running SELECT statement, executes the command that the
+token names, and returns a single row holding a result message.
+Recognised commands: XTRA_HELLO, XTRA_LRU_DUMP, XTRA_LRU_RESTORE.
+@return	0 on success, 1 if the result row could not be stored */
+static
+int
+i_s_innodb_admin_command_fill(
+/*==========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (ignored) */
+{
+	TABLE*		i_s_table	= (TABLE *) tables->table;
+	const char*	command_head	= "XTRA_";
+	const char*	message;
+	char*		pos;
+	char		open_quote	= '\0';
+
+	DBUG_ENTER("i_s_innodb_admin_command_fill");
+
+	/* A caller without the PROCESS privilege gets an empty result
+	set (return value 0), not an error. */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	if (thd_sql_command(thd) != SQLCOM_SELECT) {
+		message = "SELECT command is only accepted.";
+	} else {
+		/* Advance pos to the first unquoted occurrence of
+		command_head, or to the terminating NUL if none exists. */
+		for (pos = *thd_query(thd); *pos; pos++) {
+			long	matched;
+
+			if (*pos == open_quote) {
+				/* closing quote character */
+				open_quote = '\0';
+				continue;
+			}
+
+			if (open_quote) {
+				/* inside a quoted section: skip */
+				continue;
+			}
+
+			if (*pos == '`' || *pos == '"') {
+				open_quote = *pos;
+				continue;
+			}
+
+			/* Count the leading characters that agree with
+			command_head; a NUL in the statement never matches,
+			so the comparison stops at the end of the text. */
+			matched = 0;
+			while (command_head[matched]
+			       && toupper((int)(unsigned char)(pos[matched]))
+			       == toupper((int)(unsigned char)
+				 (command_head[matched]))) {
+				matched++;
+			}
+
+			if (!command_head[matched]) {
+				/* the whole prefix matched */
+				break;
+			}
+		}
+
+		if (!*pos) {
+			message = "No XTRA_* command in the SQL statement."
+				" Please add /*!XTRA_xxxx*/ to the SQL.";
+		} else if (!strncasecmp("XTRA_HELLO", pos, 10)) {
+			/* This is example command XTRA_HELLO */
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: administration command test for XtraDB"
+					" 'XTRA_HELLO' was detected.\n");
+
+			message = "Hello!";
+		} else if (!strncasecmp("XTRA_LRU_DUMP", pos, 13)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_DUMP'"
+					" was detected.\n");
+
+			if (buf_LRU_file_dump()) {
+				message = "XTRA_LRU_DUMP was succeeded.";
+			} else {
+				message = "XTRA_LRU_DUMP was failed.";
+			}
+		} else if (!strncasecmp("XTRA_LRU_RESTORE", pos, 16)) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, " InnoDB: administration command 'XTRA_LRU_RESTORE'"
+					" was detected.\n");
+
+			if (buf_LRU_file_restore()) {
+				message = "XTRA_LRU_RESTORE was succeeded.";
+			} else {
+				message = "XTRA_LRU_RESTORE was failed.";
+			}
+		} else {
+			message = "Undefined XTRA_* command.";
+		}
+	}
+
+	/* Exactly one row is produced, whatever the outcome. */
+	field_store_string(i_s_table->field[0], message);
+
+	if (schema_table_store_record(thd, i_s_table)) {
+		DBUG_RETURN(1);
+	}
+
+	DBUG_RETURN(0);
+}
+
+/***********************************************************************
+Plugin init callback of the XTRADB_ADMIN_COMMAND plugin: installs the
+column definitions and the fill function on the schema table object.
+@return	0 on success */
+static
+int
+i_s_innodb_admin_command_init(
+/*==========================*/
+	void*	p)	/*!< in/out: table schema object */
+{
+	ST_SCHEMA_TABLE*	schema_table;
+
+	DBUG_ENTER("i_s_innodb_admin_command_init");
+
+	schema_table = static_cast<ST_SCHEMA_TABLE*>(p);
+	schema_table->fill_table = i_s_innodb_admin_command_fill;
+	schema_table->fields_info = i_s_innodb_admin_command_info;
+
+	DBUG_RETURN(0);
+}
+
+/* MySQL plugin descriptor for INFORMATION_SCHEMA.XTRADB_ADMIN_COMMAND.
+The initializers are positional in the order of the struct's fields. */
+UNIV_INTERN struct st_mysql_plugin	i_s_innodb_admin_command =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	/* called when the plugin is loaded / unloaded */
+	STRUCT_FLD(init, i_s_innodb_admin_command_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	/* no status or system variables */
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* MariaDB plugin descriptor for INFORMATION_SCHEMA.XTRADB_ADMIN_COMMAND;
+same content as i_s_innodb_admin_command plus version string and maturity. */
+UNIV_INTERN struct st_maria_plugin      i_s_innodb_admin_command_maria =
+{
+        STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+        STRUCT_FLD(info, &i_s_info),
+        STRUCT_FLD(name, "XTRADB_ADMIN_COMMAND"),
+        STRUCT_FLD(author, plugin_author),
+        STRUCT_FLD(descr, "XtraDB specific command acceptor"),
+        STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+        /* called when the plugin is loaded / unloaded */
+        STRUCT_FLD(init, i_s_innodb_admin_command_init),
+        STRUCT_FLD(deinit, i_s_common_deinit),
+        STRUCT_FLD(version, 0x0100 /* 1.0 */),
+        /* no status or system variables */
+        STRUCT_FLD(status_vars, NULL),
+        STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE)
+};
+
+/* Column definitions of an INFORMATION_SCHEMA table with nine columns:
+three strings (SCHEMA, NAME, CLUSTER_NAME) and six unsigned 64-bit
+integers.
+NOTE(review): judging by the array name this presumably exposes InnoDB's
+SYS_TABLES dictionary table; the init/fill functions that use it are not
+in this part of the file - confirm the field[] order against them before
+reordering anything here. */
+static ST_FIELD_INFO	i_s_innodb_sys_tables_info[] =
+{
+	/* column 0: string */
+	{STRUCT_FLD(field_name,		"SCHEMA"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 1: string */
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 2: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 3: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"N_COLS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 4: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 5: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"MIX_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 6: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"MIX_LEN"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 7: string */
+	{STRUCT_FLD(field_name,		"CLUSTER_NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	/* column 8: unsigned 64-bit integer */
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	 END_OF_ST_FIELD_INFO
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_indexes.
+The position of an entry in this array is the table->field[] index that
+copy_sys_indexes_rec() stores the column into (TABLE_ID = 0 ... PAGE_NO = 6),
+so the order here and the indexes used there must be kept in sync. */
+static ST_FIELD_INFO	i_s_innodb_sys_indexes_info[] =
+{
+	{STRUCT_FLD(field_name,		"TABLE_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"NAME"),
+	 STRUCT_FLD(field_length,	NAME_LEN),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_STRING),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	0),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"N_FIELDS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"TYPE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"SPACE"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"PAGE_NO"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	 END_OF_ST_FIELD_INFO
+};
+
+/* Fields of the dynamic table INFORMATION_SCHEMA.innodb_sys_stats.
+The position of an entry in this array is the table->field[] index that
+copy_sys_stats_rec() stores the column into (INDEX_ID = 0, KEY_COLS = 1,
+DIFF_VALS = 2), so the order here and the indexes used there must be kept
+in sync. */
+static ST_FIELD_INFO	i_s_innodb_sys_stats_info[] =
+{
+	{STRUCT_FLD(field_name,		"INDEX_ID"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"KEY_COLS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	{STRUCT_FLD(field_name,		"DIFF_VALS"),
+	 STRUCT_FLD(field_length,	MY_INT64_NUM_DECIMAL_DIGITS),
+	 STRUCT_FLD(field_type,		MYSQL_TYPE_LONGLONG),
+	 STRUCT_FLD(value,		0),
+	 STRUCT_FLD(field_flags,	MY_I_S_UNSIGNED),
+	 STRUCT_FLD(old_name,		""),
+	 STRUCT_FLD(open_method,	SKIP_OPEN_TABLE)},
+
+	END_OF_ST_FIELD_INFO
+};
+
+/*******************************************************************//**
+Copies one column of an old-style record into a field of an I_S table,
+storing the raw bytes as a string in system_charset_info. An SQL NULL
+column sets the field to NULL.
+@return	0 for an SQL NULL column, otherwise the value of Field::store() */
+static
+int
+copy_string_field(
+/*==============*/
+	TABLE*			table,		/*!< in/out: I_S table being filled */
+	int			table_field,	/*!< in: index into table->field[] */
+	const rec_t*		rec,		/*!< in: record to read from */
+	int			rec_field)	/*!< in: position of the column in rec */
+{
+	ulint		len;
+	const byte*	data	= rec_get_nth_field_old(rec, rec_field, &len);
+	Field*		target	= table->field[table_field];
+
+	if (len == UNIV_SQL_NULL) {
+		target->set_null();
+		return 0; /* success */
+	}
+
+	target->set_notnull();
+	return target->store((char *) data, len, system_charset_info);
+}
+
+/*******************************************************************//**
+Reads a string column of an old-style record, splits it at its first '/'
+and stores the two parts into two consecutive fields of an I_S table:
+the text before the '/' goes to table->field[table_field_1], the text
+after it to table->field[table_field_1 + 1]. Special cases:
+- SQL NULL column: both fields are set to NULL;
+- no '/' in the value: the first field is NULL, the second gets the value;
+- value longer than NAME_LEN * 2 + 1 bytes: the first field is NULL and
+  the second gets the text "###TOO LONG NAME###".
+@return	0 on success, otherwise the (OR-ed) status returned by
+field_store_string() */
+static
+int
+copy_name_fields(
+/*=============*/
+	TABLE*			table,		/*!< in/out: I_S table being filled */
+	int			table_field_1,	/*!< in: index of the first of the
+						two target table->field[] entries */
+	const rec_t*		rec,		/*!< in: record to read from */
+	int			rec_field)	/*!< in: position of the column in rec */
+{
+	char		name_buf[NAME_LEN * 2 + 2];
+	ulint		len;
+	const byte*	data	= rec_get_nth_field_old(rec, rec_field, &len);
+	Field*		first	= table->field[table_field_1];
+	Field*		second	= table->field[table_field_1 + 1];
+
+	if (len == UNIV_SQL_NULL) {
+		first->set_null();
+		second->set_null();
+		return 0; /* success */
+	}
+
+	if (len > NAME_LEN * 2 + 1) {
+		/* would not fit into name_buf together with the terminator */
+		first->set_null();
+		return field_store_string(second, "###TOO LONG NAME###");
+	}
+
+	/* the column data carries no terminator of its own: add one */
+	strncpy(name_buf, (char*)data, len);
+	name_buf[len] = '\0';
+
+	char*	slash = strchr(name_buf, '/');
+
+	if (slash == NULL) {
+		first->set_null();
+		return field_store_string(second, name_buf);
+	}
+
+	/* cut the buffer in two at the separator */
+	*slash = '\0';
+
+	int	status = field_store_string(first, name_buf);
+
+	status |= field_store_string(second, slash + 1);
+
+	return status;
+}
+
+/*******************************************************************//**
+Copies one column of an old-style record into a field of an I_S table,
+decoding it with mach_read_from_4() and storing it as an unsigned number.
+An SQL NULL column sets the field to NULL.
+@return	0 for an SQL NULL column, otherwise the value of Field::store() */
+static
+int
+copy_int_field(
+/*===========*/
+	TABLE*			table,		/*!< in/out: I_S table being filled */
+	int			table_field,	/*!< in: index into table->field[] */
+	const rec_t*		rec,		/*!< in: record to read from */
+	int			rec_field)	/*!< in: position of the column in rec */
+{
+	ulint		len;
+	const byte*	data	= rec_get_nth_field_old(rec, rec_field, &len);
+	Field*		target	= table->field[table_field];
+
+	if (len == UNIV_SQL_NULL) {
+		target->set_null();
+		return 0; /* success */
+	}
+
+	target->set_notnull();
+	return target->store(mach_read_from_4(data), true);
+}
+
+/*******************************************************************//**
+Copies one column of an old-style record into a field of an I_S table,
+decoding it with mach_read_from_8(), converting the result with
+ut_conv_dulint_to_longlong() and storing it as an unsigned number.
+An SQL NULL column sets the field to NULL.
+@return	0 for an SQL NULL column, otherwise the value of Field::store() */
+static
+int
+copy_id_field(
+/*==========*/
+	TABLE*			table,		/*!< in/out: I_S table being filled */
+	int			table_field,	/*!< in: index into table->field[] */
+	const rec_t*		rec,		/*!< in: record to read from */
+	int			rec_field)	/*!< in: position of the column in rec */
+{
+	ulint		len;
+	const byte*	data	= rec_get_nth_field_old(rec, rec_field, &len);
+	Field*		target	= table->field[table_field];
+
+	if (len == UNIV_SQL_NULL) {
+		target->set_null();
+		return 0; /* success */
+	}
+
+	target->set_notnull();
+	return target->store(
+		ut_conv_dulint_to_longlong(mach_read_from_8(data)), true);
+}
+
+/*******************************************************************//**
+Copies the columns of one SYS_TABLES record into the current row of the
+I_S table. Columns are processed in order and the first failing copy
+aborts the rest.
+@return	0 on success, otherwise the status of the copy that failed */
+static
+int
+copy_sys_tables_rec(
+/*================*/
+	TABLE*			table,	/*!< in/out: I_S table being filled */
+	const dict_index_t*	index,	/*!< in: index used to map column
+					numbers to field positions in rec */
+	const rec_t*		rec	/*!< in: record to copy */
+)
+{
+	typedef int	(*copy_col_t)(TABLE*, int, const rec_t*, int);
+
+	/* Entry n handles the column whose position in rec is
+	dict_index_get_nth_col_pos(index, n). */
+	static const struct {
+		copy_col_t	copy;		/* conversion routine */
+		int		table_field;	/* target table->field[] index */
+	} columns[] = {
+		{copy_name_fields,	0},	/* NAME (fills fields 0 and 1) */
+		{copy_id_field,		2},	/* ID */
+		{copy_int_field,	3},	/* N_COLS */
+		{copy_int_field,	4},	/* TYPE */
+		{copy_id_field,		5},	/* MIX_ID */
+		{copy_int_field,	6},	/* MIX_LEN */
+		{copy_string_field,	7},	/* CLUSTER_NAME */
+		{copy_int_field,	8}	/* SPACE */
+	};
+
+	for (ulint i = 0; i < sizeof columns / sizeof columns[0]; i++) {
+		int	pos = dict_index_get_nth_col_pos(index, i);
+		int	status = columns[i].copy(
+			table, columns[i].table_field, rec, pos);
+
+		if (status) {
+			return status;
+		}
+	}
+
+	return 0;
+}
+
+/*******************************************************************//**
+Copies the columns of one SYS_INDEXES record into the current row of the
+I_S table. Columns are processed in order and the first failing copy
+aborts the rest.
+@return	0 on success, otherwise the status of the copy that failed */
+static
+int
+copy_sys_indexes_rec(
+/*=================*/
+	TABLE*			table,	/*!< in/out: I_S table being filled */
+	const dict_index_t*	index,	/*!< in: index used to map column
+					numbers to field positions in rec */
+	const rec_t*		rec	/*!< in: record to copy */
+)
+{
+	typedef int	(*copy_col_t)(TABLE*, int, const rec_t*, int);
+
+	/* Entry n handles the column whose position in rec is
+	dict_index_get_nth_col_pos(index, n). */
+	static const struct {
+		copy_col_t	copy;		/* conversion routine */
+		int		table_field;	/* target table->field[] index */
+	} columns[] = {
+		{copy_id_field,		0},	/* TABLE_ID */
+		{copy_id_field,		1},	/* ID */
+		{copy_string_field,	2},	/* NAME */
+		{copy_int_field,	3},	/* N_FIELDS */
+		{copy_int_field,	4},	/* TYPE */
+		{copy_int_field,	5},	/* SPACE */
+		{copy_int_field,	6}	/* PAGE_NO */
+	};
+
+	for (ulint i = 0; i < sizeof columns / sizeof columns[0]; i++) {
+		int	pos = dict_index_get_nth_col_pos(index, i);
+		int	status = columns[i].copy(
+			table, columns[i].table_field, rec, pos);
+
+		if (status) {
+			return status;
+		}
+	}
+
+	return 0;
+}
+
+/*******************************************************************//**
+Copies the columns of one SYS_STATS record into the current row of the
+I_S table. Columns are processed in order and the first failing copy
+aborts the rest.
+@return	0 on success, otherwise the status of the copy that failed */
+static
+int
+copy_sys_stats_rec(
+/*===============*/
+	TABLE*			table,	/*!< in/out: I_S table being filled */
+	const dict_index_t*	index,	/*!< in: index used to map column
+					numbers to field positions in rec */
+	const rec_t*		rec	/*!< in: record to copy */
+)
+{
+	typedef int	(*copy_col_t)(TABLE*, int, const rec_t*, int);
+
+	/* Entry n handles the column whose position in rec is
+	dict_index_get_nth_col_pos(index, n). */
+	static const struct {
+		copy_col_t	copy;		/* conversion routine */
+		int		table_field;	/* target table->field[] index */
+	} columns[] = {
+		{copy_id_field,		0},	/* INDEX_ID */
+		{copy_int_field,	1},	/* KEY_COLS */
+		{copy_id_field,		2}	/* DIFF_VALS */
+	};
+
+	for (ulint i = 0; i < sizeof columns / sizeof columns[0]; i++) {
+		int	pos = dict_index_get_nth_col_pos(index, i);
+		int	status = columns[i].copy(
+			table, columns[i].table_field, rec, pos);
+
+		if (status) {
+			return status;
+		}
+	}
+
+	return 0;
+}
+
+/*******************************************************************//**
+Common fill function of the INFORMATION_SCHEMA tables innodb_sys_tables,
+innodb_sys_indexes and innodb_sys_stats. Depending on the schema table
+name it scans the first index of the InnoDB dictionary table SYS_TABLES,
+SYS_INDEXES or SYS_STATS from its first record and stores one I_S row for
+every record that is not delete-marked.
+
+dict_sys->mutex is held and a single mini-transaction stays open for the
+whole scan, including the calls to schema_table_store_record().
+@return	0 on success; also 0, with no rows produced, when
+check_global_access() denies PROCESS_ACL or the dictionary table cannot
+be found; 1 when the schema table name is not one of the three handled
+here; otherwise the non-zero status of the copy or store that failed */
+static
+int
+i_s_innodb_schema_table_fill(
+/*=========================*/
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	COND*		cond)	/*!< in: condition (not used) */
+{
+	typedef int	(*copy_rec_t)(TABLE*, const dict_index_t*,
+				      const rec_t*);
+
+	int		status	= 0;
+	TABLE*		table	= (TABLE *) tables->table;
+	const char*	table_name = tables->schema_table_name;
+	const char*	sys_table_name;
+	copy_rec_t	copy_rec;
+	dict_table_t*	innodb_table;
+	dict_index_t*	index;
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mtr_t		mtr;
+
+	DBUG_ENTER("i_s_innodb_schema_table_fill");
+
+	/* deny access to users without the PROCESS privilege */
+	if (check_global_access(thd, PROCESS_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	/* pick the dictionary table to scan and the matching row converter */
+	if (innobase_strcasecmp(table_name, "innodb_sys_tables") == 0) {
+		sys_table_name = "SYS_TABLES";
+		copy_rec = copy_sys_tables_rec;
+	} else if (innobase_strcasecmp(table_name, "innodb_sys_indexes") == 0) {
+		sys_table_name = "SYS_INDEXES";
+		copy_rec = copy_sys_indexes_rec;
+	} else if (innobase_strcasecmp(table_name, "innodb_sys_stats") == 0) {
+		sys_table_name = "SYS_STATS";
+		copy_rec = copy_sys_stats_rec;
+	} else {
+		DBUG_RETURN(1);
+	}
+
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	mtr_start(&mtr);
+
+	innodb_table = dict_table_get_low(sys_table_name);
+	if (innodb_table == NULL) {
+		/* the dictionary table does not exist: report an empty
+		result instead of dereferencing a NULL pointer */
+		mtr_commit(&mtr);
+		mutex_exit(&(dict_sys->mutex));
+		DBUG_RETURN(0);
+	}
+	index = UT_LIST_GET_FIRST(innodb_table->indexes);
+
+	btr_pcur_open_at_index_side(TRUE, index, BTR_SEARCH_LEAF, &pcur,
+				    TRUE, &mtr);
+	for (;;) {
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+		rec = btr_pcur_get_rec(&pcur);
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			/* end of index */
+			break;
+		}
+		if (rec_get_deleted_flag(rec, 0)) {
+			/* Record marked as deleted: skip it. The cursor and
+			the mini-transaction must stay open here, because the
+			next iteration moves the cursor on within them. */
+			continue;
+		}
+
+		status = copy_rec(table, index, rec);
+		if (status) {
+			break;
+		}
+
+		status = schema_table_store_record(thd, table);
+		if (status) {
+			break;
+		}
+	}
+
+	/* single clean-up point for every way out of the scan loop */
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	DBUG_RETURN(status);
+}
+
+/*******************************************************************//**
+Plugin init callback of INNODB_SYS_TABLES: points the given schema table
+at its column definitions and at the common fill function.
+@return	0 (always) */
+static
+int
+i_s_innodb_sys_tables_init(
+/*=======================*/
+	void*   p)	/*!< in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_sys_tables_init");
+
+	schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fill_table = i_s_innodb_schema_table_fill;
+	schema->fields_info = i_s_innodb_sys_tables_info;
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin init callback of INNODB_SYS_INDEXES: points the given schema table
+at its column definitions and at the common fill function.
+@return	0 (always) */
+static
+int
+i_s_innodb_sys_indexes_init(
+/*========================*/
+	void*   p)	/*!< in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_sys_indexes_init");
+
+	schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fill_table = i_s_innodb_schema_table_fill;
+	schema->fields_info = i_s_innodb_sys_indexes_info;
+
+	DBUG_RETURN(0);
+}
+
+/*******************************************************************//**
+Plugin init callback of INNODB_SYS_STATS: points the given schema table
+at its column definitions and at the common fill function.
+@return	0 (always) */
+static
+int
+i_s_innodb_sys_stats_init(
+/*======================*/
+	void*	p)	/*!< in/out: ST_SCHEMA_TABLE to set up */
+{
+	ST_SCHEMA_TABLE*	schema;
+
+	DBUG_ENTER("i_s_innodb_sys_stats_init");
+
+	schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fill_table = i_s_innodb_schema_table_fill;
+	schema->fields_info = i_s_innodb_sys_stats_info;
+
+	DBUG_RETURN(0);
+}
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_TABLES in the
+struct st_mysql_plugin layout; set up by i_s_innodb_sys_tables_init(). */
+UNIV_INTERN struct st_mysql_plugin   i_s_innodb_sys_tables =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_tables_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_TABLES in the
+struct st_maria_plugin layout. It repeats the values of
+i_s_innodb_sys_tables and ends with version_info and maturity instead of
+__reserved1. */
+UNIV_INTERN struct st_maria_plugin   i_s_innodb_sys_tables_maria =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_TABLES"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_TABLES table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_tables_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_INDEXES in the
+struct st_mysql_plugin layout; set up by i_s_innodb_sys_indexes_init(). */
+UNIV_INTERN struct st_mysql_plugin   i_s_innodb_sys_indexes =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_INDEXES in the
+struct st_maria_plugin layout. It repeats the values of
+i_s_innodb_sys_indexes and ends with version_info and maturity instead of
+__reserved1. */
+UNIV_INTERN struct st_maria_plugin   i_s_innodb_sys_indexes_maria =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_INDEXES"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_INDEXES table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_indexes_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_STATS in the
+struct st_mysql_plugin layout; set up by i_s_innodb_sys_stats_init(). */
+UNIV_INTERN struct st_mysql_plugin   i_s_innodb_sys_stats =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_STATS"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_stats_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL)
+};
+
+/* INFORMATION_SCHEMA plugin descriptor for INNODB_SYS_STATS in the
+struct st_maria_plugin layout. It repeats the values of
+i_s_innodb_sys_stats and ends with version_info and maturity instead of
+__reserved1. */
+UNIV_INTERN struct st_maria_plugin   i_s_innodb_sys_stats_maria =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "INNODB_SYS_STATS"),
+	STRUCT_FLD(author, plugin_author),
+	STRUCT_FLD(descr, "InnoDB SYS_STATS table"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_innodb_sys_stats_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, 0x0100 /* 1.0 */),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+        STRUCT_FLD(version_info, "1.0"),
+        STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_BETA)
+};
+
diff --git a/storage/xtradb/handler/i_s.h b/storage/xtradb/handler/i_s.h
new file mode 100644
index 00000000000..7a5c3ead5ed
--- /dev/null
+++ b/storage/xtradb/handler/i_s.h
@@ -0,0 +1,67 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/i_s.h
+InnoDB INFORMATION SCHEMA tables interface to MySQL.
+
+Created July 18, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef i_s_h
+#define i_s_h
+/* INFORMATION_SCHEMA plugin descriptors declared as struct st_mysql_plugin */
+extern struct st_mysql_plugin	i_s_innodb_buffer_pool_pages;
+extern struct st_mysql_plugin	i_s_innodb_buffer_pool_pages_index;
+extern struct st_mysql_plugin	i_s_innodb_buffer_pool_pages_blob;
+extern struct st_mysql_plugin	i_s_innodb_trx;
+extern struct st_mysql_plugin	i_s_innodb_locks;
+extern struct st_mysql_plugin	i_s_innodb_lock_waits;
+extern struct st_mysql_plugin	i_s_innodb_cmp;
+extern struct st_mysql_plugin	i_s_innodb_cmp_reset;
+extern struct st_mysql_plugin	i_s_innodb_cmpmem;
+extern struct st_mysql_plugin	i_s_innodb_cmpmem_reset;
+extern struct st_mysql_plugin	i_s_innodb_patches;
+extern struct st_mysql_plugin	i_s_innodb_rseg;
+extern struct st_mysql_plugin	i_s_innodb_table_stats;
+extern struct st_mysql_plugin	i_s_innodb_index_stats;
+extern struct st_mysql_plugin	i_s_innodb_admin_command;
+extern struct st_mysql_plugin   i_s_innodb_sys_tables;
+extern struct st_mysql_plugin   i_s_innodb_sys_indexes;
+extern struct st_mysql_plugin	i_s_innodb_sys_stats;
+/* One "_maria" counterpart per name above, declared as struct st_maria_plugin */
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_maria;
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_index_maria;
+extern struct st_maria_plugin i_s_innodb_buffer_pool_pages_blob_maria;
+extern struct st_maria_plugin i_s_innodb_trx_maria;
+extern struct st_maria_plugin i_s_innodb_locks_maria;
+extern struct st_maria_plugin i_s_innodb_lock_waits_maria;
+extern struct st_maria_plugin i_s_innodb_cmp_maria;
+extern struct st_maria_plugin i_s_innodb_cmp_reset_maria;
+extern struct st_maria_plugin i_s_innodb_cmpmem_maria;
+extern struct st_maria_plugin i_s_innodb_cmpmem_reset_maria;
+extern struct st_maria_plugin i_s_innodb_patches_maria;
+extern struct st_maria_plugin i_s_innodb_rseg_maria;
+extern struct st_maria_plugin i_s_innodb_table_stats_maria;
+extern struct st_maria_plugin i_s_innodb_index_stats_maria;
+extern struct st_maria_plugin i_s_innodb_admin_command_maria;
+extern struct st_maria_plugin i_s_innodb_sys_tables_maria;
+extern struct st_maria_plugin i_s_innodb_sys_indexes_maria;
+extern struct st_maria_plugin i_s_innodb_sys_stats_maria;
+
+#endif /* i_s_h */
diff --git a/storage/xtradb/handler/innodb_patch_info.h b/storage/xtradb/handler/innodb_patch_info.h
new file mode 100644
index 00000000000..e68f12d0fec
--- /dev/null
+++ b/storage/xtradb/handler/innodb_patch_info.h
@@ -0,0 +1,52 @@
+/* Copyright (C) 2002-2006 MySQL AB
+  
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+  
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+#ifdef USE_PRAGMA_INTERFACE
+#pragma interface                      /* gcc class implementation */
+#endif
+
+struct innodb_enhancement {
+       const char *file;	/* short identifier of the patch, e.g. "innodb_io" */
+       const char *name;	/* human-readable title of the patch */
+       const char *comment;	/* additional remark; "" when there is none */
+       const char *link;	/* documentation URL; "" when there is none */
+}innodb_enhancements[] = {
+{"xtradb_show_enhancements","I_S.XTRADB_ENHANCEMENTS","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_show_status","Improvements to SHOW INNODB STATUS","Memory information and lock info fixes","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_io","Improvements to InnoDB IO","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_opt_lru_count","Fix of buffer_pool mutex","Decreases contention on buffer_pool mutex on LRU operations","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_buffer_pool_pages","Information of buffer pool content","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_expand_undo_slots","expandable maximum number of undo slots","from 1024 (default) to about 4000","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_extra_rseg","allow to create extra rollback segments","When create new db, the new parameter allows to create more rollback segments","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_overwrite_relay_log_info","overwrite relay-log.info when slave recovery","Building as plugin, it is not used.","http://www.percona.com/docs/wiki/percona-xtradb:innodb_overwrite_relay_log_info"},
+{"innodb_thread_concurrency_timer_based","use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0)","",""},
+{"innodb_expand_import","convert .ibd file automatically when import tablespace","the files are generated by xtrabackup export mode.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_dict_size_limit","Limit dictionary cache size","Variable innodb_dict_size_limit in bytes","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_split_buf_pool_mutex","More fix of buffer_pool mutex","Spliting buf_pool_mutex and optimizing based on innodb_opt_lru_count","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_stats","Additional features about InnoDB statistics/optimizer","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_recovery_patches","Bugfixes and adjustments about recovery process","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_purge_thread","Enable to use purge devoted thread","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_admin_command_base","XtraDB specific command interface through i_s","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_show_lock_name","Show mutex/lock name instead of crated file/line","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_extend_slow","Extended statistics in slow.log","It is InnoDB-part only. It needs to patch also to mysqld.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_lru_dump_restore","Dump and restore command for content of buffer pool","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_separate_doublewrite","Add option 'innodb_doublewrite_file' to separate doublewrite dedicated tablespace","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_pass_corrupt_table","Treat tables as corrupt instead of crash, when meet corrupt blocks","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_fast_checksum","Using the checksum on 32bit-unit calculation","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_files_extend","allow >4GB transaction log files, and can vary universal page size of datafiles","incompatible for unpatched ver.","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_sys_tables_sys_indexes","Expose InnoDB SYS_TABLES and SYS_INDEXES schema tables","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{"innodb_buffer_pool_shm","Put buffer pool contents to shared memory segment and reuse it at clean restart [experimental]","","http://www.percona.com/docs/wiki/percona-xtradb"},
+{NULL, NULL, NULL, NULL}	/* all-NULL last entry; presumably the end-of-list marker -- confirm against the reader */
+};
diff --git a/storage/xtradb/handler/mysql_addons.cc b/storage/xtradb/handler/mysql_addons.cc
new file mode 100644
index 00000000000..eae1fe9fbc2
--- /dev/null
+++ b/storage/xtradb/handler/mysql_addons.cc
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file handler/mysql_addons.cc
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef MYSQL_SERVER
+#define MYSQL_SERVER
+#endif /* MYSQL_SERVER */
+
+#include <mysql_priv.h>
+
+#include "mysql_addons.h"
+#include "univ.i"
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.c b/storage/xtradb/ibuf/ibuf0ibuf.c
new file mode 100644
index 00000000000..e01c2d6b800
--- /dev/null
+++ b/storage/xtradb/ibuf/ibuf0ibuf.c
@@ -0,0 +1,3646 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file ibuf/ibuf0ibuf.c
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ibuf0ibuf.h"
+
+/** Number of bits describing a single page */
+#define IBUF_BITS_PER_PAGE	4
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE must be an even number!"
+#endif
+/** The start address for an insert buffer bitmap page bitmap */
+#define IBUF_BITMAP		PAGE_DATA
+
+#ifdef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+
+#include "buf0buf.h"
+#include "buf0rea.h"
+#include "fsp0fsp.h"
+#include "trx0sys.h"
+#include "fil0fil.h"
+#include "thr0loc.h"
+#include "rem0rec.h"
+#include "btr0cur.h"
+#include "btr0pcur.h"
+#include "btr0btr.h"
+#include "sync0sync.h"
+#include "dict0boot.h"
+#include "fut0lst.h"
+#include "lock0lock.h"
+#include "log0recv.h"
+#include "que0que.h"
+
+/*	STRUCTURE OF AN INSERT BUFFER RECORD
+
+In versions < 4.1.x:
+
+1. The first field is the page number.
+2. The second field is an array which stores type info for each subsequent
+   field. We store the information which affects the ordering of records, and
+   also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
+   is 10 bytes.
+3. Next we have the fields of the actual index record.
+
+In versions >= 4.1.x:
+
+Note that contrary to what we planned in the 1990's, there will only be one
+insert buffer tree, and that is in the system tablespace of InnoDB.
+
+1. The first field is the space id.
+2. The second field is a one-byte marker (0) which differentiates records from
+   the < 4.1.x storage format.
+3. The third field is the page number.
+4. The fourth field contains the type info, where we have also added 2 bytes to
+   store the charset. In the compressed table format of 5.0.x we must add more
+   information here so that we can build a dummy 'index' struct which 5.0.x
+   can use in the binary search on the index page in the ibuf merge phase.
+5. The rest of the fields contain the fields of the actual index record.
+
+In versions >= 5.0.3:
+
+The first byte of the fourth field is an additional marker (0) if the record
+is in the compact format.  The presence of this marker can be detected by
+looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
+
+The high-order bit of the character set field in the type info is the
+"nullable" flag for the field. */
+
+
+/*	PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
+
+If an OS thread performs any operation that brings in disk pages from
+non-system tablespaces into the buffer pool, or creates such a page there,
+then the operation may have as a side effect an insert buffer index tree
+compression. Thus, the tree latch of the insert buffer tree may be acquired
+in the x-mode, and also the file space latch of the system tablespace may
+be acquired in the x-mode.
+
+Also, an insert to an index in a non-system tablespace can have the same
+effect. How do we know this cannot lead to a deadlock of OS threads? There
+is a problem with the i/o-handler threads: they break the latching order
+because they own x-latches to pages which are on a lower level than the
+insert buffer tree latch, its page latches, and the tablespace latch an
+insert buffer operation can reserve.
+
+The solution is the following: Let all the tree and page latches connected
+with the insert buffer be later in the latching order than the fsp latch and
+fsp page latches.
+
+Insert buffer pages must be such that the insert buffer is never invoked
+when these pages are accessed as this would result in a recursion violating
+the latching order. We let a special i/o-handler thread take care of i/o to
+the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
+pages and the first inode page, which contains the inode of the ibuf tree: let
+us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
+access both non-ibuf and ibuf pages.
+
+Then an i/o-handler for the insert buffer never needs to access recursively the
+insert buffer tree and thus obeys the latching order. On the other hand, other
+i/o-handlers for other tablespaces may require access to the insert buffer,
+but because all kinds of latches they need to access there are later in the
+latching order, no violation of the latching order occurs in this case,
+either.
+
+A problem is how to grow and contract an insert buffer tree. As it is later
+in the latching order than the fsp management, we have to reserve the fsp
+latch first, before adding or removing pages from the insert buffer tree.
+We let the insert buffer tree have its own file space management: a free
+list of pages linked to the tree root. To prevent recursive using of the
+insert buffer when adding pages to the tree, we must first load these pages
+to memory, obtaining a latch on them, and only after that add them to the
+free list of the insert buffer tree. More difficult is removing of pages
+from the free list. If there is an excess of pages in the free list of the
+ibuf tree, they might be needed if some thread reserves the fsp latch,
+intending to allocate more file space. So we do the following: if a thread
+reserves the fsp latch, we check the writer count field of the latch. If
+this field has value 1, it means that the thread did not own the latch
+before entering the fsp system, and the mtr of the thread contains no
+modifications to the fsp pages. Now we are free to reserve the ibuf latch,
+and check if there is an excess of pages in the free list. We can then, in a
+separate mini-transaction, take them out of the free list and free them to
+the fsp system.
+
+To avoid deadlocks in the ibuf system, we divide file pages into three levels:
+
+(1) non-ibuf pages,
+(2) ibuf tree pages and the pages in the ibuf tree free list, and
+(3) ibuf bitmap pages.
+
+No OS thread is allowed to access higher level pages if it has latches to
+lower level pages; even if the thread owns a B-tree latch it must not access
+the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
+is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
+exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
+level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
+it uses synchronous aio, it can access any pages, as long as it obeys the
+access order rules. */
+
+/** Buffer pool size per the maximum insert buffer size */
+#define IBUF_POOL_SIZE_PER_MAX_SIZE	2
+
+/** Table name for the insert buffer. */
+#define IBUF_TABLE_NAME		"SYS_IBUF_TABLE"
+
+/** Operations that can currently be buffered. */
+UNIV_INTERN ibuf_use_t	ibuf_use		= IBUF_USE_INSERT;
+
+/** The insert buffer control structure */
+UNIV_INTERN ibuf_t*	ibuf			= NULL;
+
+/** Counter for ibuf_should_try() */
+UNIV_INTERN ulint	ibuf_flush_count	= 0;
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/** Number of tablespaces in the ibuf_counts array */
+#define IBUF_COUNT_N_SPACES	4
+/** Number of pages within each tablespace in the ibuf_counts array */
+#define IBUF_COUNT_N_PAGES	130000
+
+/** Buffered entry counts for file pages, used in debugging */
+static ulint	ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES];
+
+/******************************************************************//**
+Checks that the indexes to ibuf_counts[][] are within limits. */
+UNIV_INLINE
+void
+ibuf_count_check(
+/*=============*/
+	ulint	space_id,	/*!< in: space identifier */
+	ulint	page_no)	/*!< in: page number */
+{
+	if (space_id < IBUF_COUNT_N_SPACES && page_no < IBUF_COUNT_N_PAGES) {
+		return;
+	}
+	/* Out of range: report, then ut_error. Casts match the "%lu"s. */
+	fprintf(stderr,
+		"InnoDB: UNIV_IBUF_COUNT_DEBUG limits space_id and page_no\n"
+		"InnoDB: and breaks crash recovery.\n"
+		"InnoDB: space_id=%lu, should be 0<=space_id<%lu\n"
+		"InnoDB: page_no=%lu, should be 0<=page_no<%lu\n",
+		(unsigned long) space_id, (unsigned long) IBUF_COUNT_N_SPACES,
+		(unsigned long) page_no, (unsigned long) IBUF_COUNT_N_PAGES);
+	ut_error;
+}
+#endif
+
+/** @name Offsets to the per-page bits in the insert buffer bitmap */
+/* @{ */
+#define	IBUF_BITMAP_FREE	0	/*!< Bits indicating the
+					amount of free space */
+#define IBUF_BITMAP_BUFFERED	2	/*!< TRUE if there are buffered
+					changes for the page */
+#define IBUF_BITMAP_IBUF	3	/*!< TRUE if page is a part of
+					the ibuf tree, excluding the
+					root page, or is in the free
+					list of the ibuf */
+/* @} */
+
+/** The mutex used to block pessimistic inserts to ibuf trees */
+static mutex_t	ibuf_pessimistic_insert_mutex;
+
+/** The mutex protecting the insert buffer structs */
+static mutex_t	ibuf_mutex;
+
+/** The mutex protecting the insert buffer bitmaps */
+static mutex_t	ibuf_bitmap_mutex;
+
+/** The area in pages from which contract looks for page numbers for merge */
+#define	IBUF_MERGE_AREA			8
+
+/** Inside the merge area, pages which have at most 1 per this number less
+buffered entries compared to maximum volume that can be buffered for a single
+page are merged along with the page whose buffer became full */
+#define IBUF_MERGE_THRESHOLD		4
+
+/** In ibuf_contract at most this number of pages is read to memory in one
+batch, in order to merge the entries for them in the insert buffer */
+#define	IBUF_MAX_N_PAGES_MERGED		IBUF_MERGE_AREA
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection to inserts there, using
+non-synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_NON_SYNC	0
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
+many pages, we start to contract it in connection to inserts there, using
+synchronous contract */
+#define IBUF_CONTRACT_ON_INSERT_SYNC		5
+
+/** If the combined size of the ibuf trees exceeds ibuf->max_size by
+this many pages, we start to contract it using synchronous contract, but do
+not insert */
+#define IBUF_CONTRACT_DO_NOT_INSERT		10
+
+/* TODO: how to cope with drop table if there are records in the insert
+buffer for the indexes of the table? Is there actually any problem,
+because ibuf merge is done to a page when it is read in, and it is
+still physically like the index page even if the index would have been
+dropped! So, there seems to be no problem. */
+
+/******************************************************************//**
+Marks the current OS thread as being inside an insert buffer routine by
+setting the in-ibuf flag kept in its thread local storage. */
+UNIV_INLINE
+void
+ibuf_enter(void)
+/*============*/
+{
+	ibool*	in_ibuf = thr_local_get_in_ibuf_field();
+
+	/* The flag must still be clear here: the debug assertion below
+	fires if the thread is already marked as being inside. */
+	ut_ad(*in_ibuf == FALSE);
+
+	*in_ibuf = TRUE;
+}
+
+/******************************************************************//**
+Marks the current OS thread as having left an insert buffer routine by
+clearing the in-ibuf flag kept in its thread local storage. */
+UNIV_INLINE
+void
+ibuf_exit(void)
+/*===========*/
+{
+	ibool*	in_ibuf = thr_local_get_in_ibuf_field();
+
+	/* The flag must be set here: the debug assertion below fires if
+	the thread was not marked as being inside. */
+	ut_ad(*in_ibuf == TRUE);
+
+	*in_ibuf = FALSE;
+}
+
+/******************************************************************//**
+Tells whether the current OS thread is executing an insert buffer routine.
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INTERN
+ibool
+ibuf_inside(void)
+/*=============*/
+{
+	const ibool*	in_ibuf = thr_local_get_in_ibuf_field();
+
+	return(*in_ibuf);
+}
+
+/******************************************************************//**
+Fetches the insert buffer header page, x-latching it via the given mtr.
+@return	insert buffer header page */
+static
+page_t*
+ibuf_header_page_get(
+/*=================*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	hdr_block;
+	page_t*		hdr_page;
+
+	ut_ad(!ibuf_inside());
+	hdr_block = buf_page_get(IBUF_SPACE_ID, 0, FSP_IBUF_HEADER_PAGE_NO,
+				 RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(hdr_block, SYNC_IBUF_HEADER);
+	hdr_page = buf_block_get_frame(hdr_block);
+	return(hdr_page);
+}
+
+/******************************************************************//**
+Fetches the root page of the insert buffer tree, x-latching it.
+@return	insert buffer tree root page */
+static
+page_t*
+ibuf_tree_root_get(
+/*===============*/
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	buf_block_t*	root_block;
+
+	ut_ad(ibuf_inside());
+
+	/* The ibuf index lock is x-locked before the root page is latched */
+	mtr_x_lock(dict_index_get_lock(ibuf->index), mtr);
+
+	root_block = buf_page_get(IBUF_SPACE_ID, 0,
+				  FSP_IBUF_TREE_ROOT_PAGE_NO, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(root_block, SYNC_TREE_NODE);
+
+	return(buf_block_get_frame(root_block));
+}
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page from the ibuf_counts[][] debug array.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no)/*!< in: page number */
+{
+	ibuf_count_check(space, page_no);
+	/* ibuf_count_check() has validated both array indexes */
+	return(ibuf_counts[space][page_no]);
+}
+
+/******************************************************************//**
+Sets the ibuf count for a given page; val must be below UNIV_PAGE_SIZE. */
+static
+void
+ibuf_count_set(
+/*===========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no,/*!< in: page number */
+	ulint	val)	/*!< in: value to set */
+{
+	ibuf_count_check(space, page_no);
+	ut_a(val < UNIV_PAGE_SIZE);
+	/* both indexes were validated by ibuf_count_check() above */
+	ibuf_counts[space][page_no] = val;
+}
+#endif
+
+/******************************************************************//**
+Closes the insert buffer: frees its mutexes and the ibuf control struct. */
+UNIV_INTERN
+void
+ibuf_close(void)
+/*============*/
+{
+	mutex_free(&ibuf_pessimistic_insert_mutex);
+	memset(&ibuf_pessimistic_insert_mutex,
+	       0x0, sizeof(ibuf_pessimistic_insert_mutex));
+
+	mutex_free(&ibuf_mutex);
+	memset(&ibuf_mutex, 0x0, sizeof(ibuf_mutex));
+
+	mutex_free(&ibuf_bitmap_mutex);
+	memset(&ibuf_bitmap_mutex, 0x0, sizeof(ibuf_bitmap_mutex));
+
+	mem_free(ibuf);
+	ibuf = NULL;
+}
+
+/******************************************************************//**
+Refreshes the cached size information of the ibuf from the root page,
+assuming the segment size (ibuf->seg_size) has not changed. */
+static
+void
+ibuf_size_update(
+/*=============*/
+	const page_t*	root,	/*!< in: ibuf tree root */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const page_t*	free_list_base;
+
+	ut_ad(mutex_own(&ibuf_mutex));
+
+	free_list_base = root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST;
+	ibuf->free_list_len = flst_get_len(free_list_base, mtr);
+	ibuf->height = btr_page_get_level(root, mtr) + 1;
+	ibuf->empty = (0 == page_get_n_recs(root));
+
+	/* the ibuf header page (the '1 +') is not counted in the size */
+	ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
+}
+
+/******************************************************************//**
+Allocates the insert buffer control structure at a database startup,
+initializes it and the ibuf mutexes, and adds the SYS_IBUF_TABLE table
+and its CLUST_IND index to the dictionary cache. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void)
+/*=======================*/
+{
+	page_t*		root;
+	mtr_t		mtr;
+	dict_table_t*	table;
+	mem_heap_t*	heap;
+	dict_index_t*	index;
+	ulint		n_used;
+	page_t*		header_page;
+	ulint		error;
+
+	ibuf = mem_alloc(sizeof(ibuf_t));
+	memset(ibuf, 0, sizeof(*ibuf));
+
+	/* Note that also a pessimistic delete can sometimes make a B-tree
+	grow in size, as the references on the upper levels of the tree can
+	change */
+
+	/* Divide before narrowing to ulint: casting the byte count first
+	could truncate it if ulint is narrower than srv_ibuf_max_size. */
+	ibuf->max_size = ut_min(buf_pool_get_curr_size() / UNIV_PAGE_SIZE
+				/ IBUF_POOL_SIZE_PER_MAX_SIZE,
+				(ulint) (srv_ibuf_max_size / UNIV_PAGE_SIZE));
+
+	srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
+
+	mutex_create(&ibuf_pessimistic_insert_mutex,
+		     SYNC_IBUF_PESS_INSERT_MUTEX);
+	mutex_create(&ibuf_mutex, SYNC_IBUF_MUTEX);
+	mutex_create(&ibuf_bitmap_mutex, SYNC_IBUF_BITMAP_MUTEX);
+
+	mtr_start(&mtr);
+	mutex_enter(&ibuf_mutex);
+
+	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, NULL), &mtr);
+	/* must precede ibuf_enter(): ibuf_header_page_get() asserts !inside */
+	header_page = ibuf_header_page_get(&mtr);
+
+	fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+			      &n_used, &mtr);
+	ibuf_enter();
+
+	ut_ad(n_used >= 2);
+
+	ibuf->seg_size = n_used;
+
+	{
+		buf_block_t*	block;
+
+		block = buf_page_get(
+			IBUF_SPACE_ID, 0, FSP_IBUF_TREE_ROOT_PAGE_NO,
+			RW_X_LATCH, &mtr);
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+		root = buf_block_get_frame(block);
+	}
+
+	ibuf_size_update(root, &mtr);
+	mutex_exit(&ibuf_mutex);
+
+	mtr_commit(&mtr);
+
+	ibuf_exit();
+
+	heap = mem_heap_create(450);
+
+	/* Use old-style record format for the insert buffer. */
+	table = dict_mem_table_create(IBUF_TABLE_NAME, IBUF_SPACE_ID, 1, 0);
+	table->n_mysql_handles_opened = 1; /* for pin */
+
+	dict_mem_table_add_col(table, heap, "DUMMY_COLUMN", DATA_BINARY, 0, 0);
+
+	table->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
+
+	dict_table_add_to_cache(table, heap);
+	mem_heap_free(heap);
+
+	index = dict_mem_index_create(
+		IBUF_TABLE_NAME, "CLUST_IND",
+		IBUF_SPACE_ID, DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, 1);
+
+	dict_mem_index_add_field(index, "DUMMY_COLUMN", 0);
+
+	index->id = ut_dulint_add(DICT_IBUF_ID_MIN, IBUF_SPACE_ID);
+
+	error = dict_index_add_to_cache(table, index,
+					FSP_IBUF_TREE_ROOT_PAGE_NO, FALSE);
+	ut_a(error == DB_SUCCESS);
+
+	ibuf->index = dict_table_get_first_index(table);
+}
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Initializes an ibuf bitmap page: sets the page type and zero-fills the
+bitmap, sized for the compressed or the uncompressed page size. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+	buf_block_t*	block,	/*!< in: bitmap page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	zip_size	= buf_block_get_zip_size(block);
+	page_t*	page;
+	ulint	n_pages;
+	ulint	bitmap_bytes;
+
+	ut_a(ut_is_2pow(zip_size));
+
+	page = buf_block_get_frame(block);
+	fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP);
+
+	/* The bitmap holds IBUF_BITS_PER_PAGE bits for each of zip_size
+	pages, or for each of UNIV_PAGE_SIZE pages when the page is
+	uncompressed (zip_size == 0). */
+	n_pages = zip_size ? zip_size : UNIV_PAGE_SIZE;
+	bitmap_bytes = UT_BITS_IN_BYTES(n_pages * IBUF_BITS_PER_PAGE);
+
+	/* Write all zeros to the bitmap */
+	memset(page + IBUF_BITMAP, 0, bitmap_bytes);
+
+	/* The remaining area (up to the page trailer) is uninitialized. */
+
+#ifndef UNIV_HOTBACKUP
+	mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init, applying it to block.
+@return	end of log record: always ptr, as nothing is read from the buffer */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr __attribute__((unused)), /*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: block or NULL */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (block == NULL) {
+		return(ptr);
+	}
+	ibuf_bitmap_page_init(block, mtr);
+	return(ptr);
+}
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Reads the bits describing a given page from an ibuf bitmap page.
+@return	value of bits */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_get_bits(
+/*======================*/
+	const page_t*	page,	/*!< in: bitmap page */
+	ulint		page_no,/*!< in: page whose bits to get */
+	ulint		zip_size,/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint		bit,	/*!< in: IBUF_BITMAP_FREE,
+				IBUF_BITMAP_BUFFERED, ... */
+	mtr_t*		mtr __attribute__((unused)))
+				/*!< in: mtr containing an
+				x-latch to the bitmap page */
+{
+	ulint	slot;
+	ulint	pos;
+	ulint	byte_offset;
+	ulint	bit_in_byte;
+	ulint	map_byte;
+	ulint	value;
+
+	ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+
+	/* Index of the page's descriptor within this bitmap page, then
+	the position of the requested bit counted from the bitmap start */
+	slot = zip_size
+		? (page_no & (zip_size - 1))
+		: (page_no % UNIV_PAGE_SIZE);
+	pos = slot * IBUF_BITS_PER_PAGE + bit;
+
+	byte_offset = pos / 8;
+	bit_in_byte = pos % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);
+	value = ut_bit_get_nth(map_byte, bit_in_byte);
+
+	if (bit != IBUF_BITMAP_FREE) {
+		return(value);
+	}
+
+	/* The free space value is two bits wide: most significant first */
+	ut_ad(bit_in_byte + 1 < 8);
+	return(value * 2 + ut_bit_get_nth(map_byte, bit_in_byte + 1));
+}
+
+/********************************************************************//**
+Sets the desired bit for a given page in a bitmap page; the modified map
+byte is written back with mlog_write_ulint(). */
+static
+void
+ibuf_bitmap_page_set_bits(
+/*======================*/
+	page_t*	page,	/*!< in: bitmap page */
+	ulint	page_no,/*!< in: page whose bits to set */
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	bit,	/*!< in: IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ... */
+	ulint	val,	/*!< in: value to set */
+	mtr_t*	mtr)	/*!< in: mtr containing an x-latch to the bitmap page */
+{
+	ulint	slot;
+	ulint	pos;
+	ulint	byte_offset;
+	ulint	bit_in_byte;
+	ulint	map_byte;
+	page_t*	map_ptr;
+
+	ut_ad(bit < IBUF_BITS_PER_PAGE);
+#if IBUF_BITS_PER_PAGE % 2
+# error "IBUF_BITS_PER_PAGE % 2 != 0"
+#endif
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE)
+	     || (0 == ibuf_count_get(page_get_space_id(page),
+				     page_no)));
+#endif
+	slot = zip_size
+		? (page_no & (zip_size - 1))
+		: (page_no % UNIV_PAGE_SIZE);
+	pos = slot * IBUF_BITS_PER_PAGE + bit;
+	byte_offset = pos / 8;
+	bit_in_byte = pos % 8;
+
+	ut_ad(byte_offset + IBUF_BITMAP < UNIV_PAGE_SIZE);
+
+	map_ptr = page + IBUF_BITMAP + byte_offset;
+	map_byte = mach_read_from_1(map_ptr);
+
+	if (bit != IBUF_BITMAP_FREE) {
+		ut_ad(val <= 1);
+		map_byte = ut_bit_set_nth(map_byte, bit_in_byte, val);
+	} else {
+		/* two-bit free space value, most significant bit first */
+		ut_ad(bit_in_byte + 1 < 8);
+		ut_ad(val <= 3);
+		map_byte = ut_bit_set_nth(map_byte, bit_in_byte, val / 2);
+		map_byte = ut_bit_set_nth(map_byte, bit_in_byte + 1, val % 2);
+	}
+
+	mlog_write_ulint(map_ptr, map_byte, MLOG_1BYTE, mtr);
+}
+
+/********************************************************************//**
+Maps a page number to the number of the ibuf bitmap page describing it.
+@return	the bitmap page number where the file page is mapped */
+UNIV_INLINE
+ulint
+ibuf_bitmap_page_no_calc(
+/*=====================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	page_no)	/*!< in: tablespace page number */
+{
+	ulint	group_start;
+
+	ut_ad(ut_is_2pow(zip_size));
+	if (zip_size) {
+		group_start = page_no & ~(zip_size - 1);
+	} else {
+		group_start = page_no & ~(UNIV_PAGE_SIZE - 1);
+	}
+	return(FSP_IBUF_BITMAP_OFFSET + group_start);
+}
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored, i.e. the bitmap page holding the descriptor bits of the file page.
+@return bitmap page where the file page is mapped; the bitmap page is
+x-latched */
+static
+page_t*
+ibuf_bitmap_get_map_page_func(
+/*==========================*/
+	ulint		space,	/*!< in: space id of the file page */
+	ulint		page_no,/*!< in: page number of the file page */
+	ulint		zip_size,/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		bitmap_page_no;
+	buf_block_t*	bitmap_block;
+
+	bitmap_page_no = ibuf_bitmap_page_no_calc(zip_size, page_no);
+	bitmap_block = buf_page_get_gen(space, zip_size, bitmap_page_no,
+					RW_X_LATCH, NULL, BUF_GET,
+					file, line, mtr);
+	buf_block_dbg_add_level(bitmap_block, SYNC_IBUF_BITMAP);
+
+	return(buf_block_get_frame(bitmap_block));
+}
+
+/********************************************************************//**
+Gets the ibuf bitmap page where the bits describing a given file page are
+stored.
+@return bitmap page where the file page is mapped, that is, the bitmap
+page containing the descriptor bits for the file page; the bitmap page
+is x-latched
+@param space	in: space id of the file page
+@param page_no	in: page number of the file page
+@param zip_size	in: compressed page size in bytes; 0 for uncompressed pages
+@param mtr	in: mini-transaction */
+#define ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr)		\
+	ibuf_bitmap_get_map_page_func(space, page_no, zip_size,		\
+				      __FILE__, __LINE__, mtr)
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. Unlike
+ibuf_set_free_bits_func(), this works in the mini-transaction given by the
+caller: the bitmap page is x-latched in mtr. Nothing is done unless the
+page is a leaf page. */
+UNIV_INLINE
+void
+ibuf_set_free_bits_low(
+/*===================*/
+	ulint			zip_size,/*!< in: compressed page size in bytes;
+					0 for uncompressed pages */
+	const buf_block_t*	block,	/*!< in: index page; free bits are set if
+					the index is non-clustered and page
+					level is 0 */
+	ulint			val,	/*!< in: value to set: < 4 */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+{
+	page_t*	bitmap_page;
+	ulint	space;
+	ulint	page_no;
+
+	if (!page_is_leaf(buf_block_get_frame(block))) {
+
+		return;
+	}
+
+	space = buf_block_get_space(block);
+	page_no = buf_block_get_page_no(block);
+	bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+#ifdef UNIV_IBUF_DEBUG
+# if 0
+	fprintf(stderr,
+		"Setting space %lu page %lu free bits to %lu should be %lu\n",
+		space, page_no, val,
+		ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+	ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+	ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+				  IBUF_BITMAP_FREE, val, mtr);
+}
+
+/************************************************************************//**
+Sets the free bit of the page in the ibuf bitmap. This is done in a separate
+mini-transaction (started and committed here), hence this operation does not
+restrict further work to only ibuf bitmap operations, which would result if
+the latch to the bitmap page were kept. Non-leaf pages are left untouched. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val)	/*!< in: value to set: < 4 */
+{
+	mtr_t	mtr;
+	page_t*	page;
+	page_t*	bitmap_page;
+	ulint	space;
+	ulint	page_no;
+	ulint	zip_size;
+
+	page = buf_block_get_frame(block);
+
+	if (!page_is_leaf(page)) {
+
+		return;
+	}
+
+	mtr_start(&mtr);
+
+	space = buf_block_get_space(block);
+	page_no = buf_block_get_page_no(block);
+	zip_size = buf_block_get_zip_size(block);
+	bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, &mtr);
+
+#ifdef UNIV_IBUF_DEBUG
+	if (max_val != ULINT_UNDEFINED) {
+		ulint	old_val;
+
+		old_val = ibuf_bitmap_page_get_bits(
+			bitmap_page, page_no, zip_size,
+			IBUF_BITMAP_FREE, &mtr);
+# if 0
+		if (old_val != max_val) {
+			fprintf(stderr,
+				"Ibuf: page %lu old val %lu max val %lu\n",
+				page_get_page_no(page),
+				old_val, max_val);
+		}
+# endif
+
+		ut_a(old_val <= max_val);
+	}
+# if 0
+	fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n",
+		page_get_page_no(page), val,
+		ibuf_index_page_calc_free(zip_size, block));
+# endif
+
+	ut_a(val <= ibuf_index_page_calc_free(zip_size, block));
+#endif /* UNIV_IBUF_DEBUG */
+	ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+				  IBUF_BITMAP_FREE, val, &mtr);
+	mtr_commit(&mtr);
+}
+
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap to 0. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block)	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+{
+	ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Brings the free bits of an uncompressed page in the ibuf bitmap up to
+date with the present state of the page.  The given mtr is used, so the
+latching order rules virtually prevent any further operations for this
+OS thread until mtr is committed.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr)		/*!< in/out: mtr */
+{
+	ulint	old_bits;
+	ulint	new_bits;
+
+	ut_a(!buf_block_get_page_zip(block));
+
+	/* This approach cannot be used on compressed pages, since the
+	computed value of "old_bits" often does not match the current
+	state of the bitmap.  This is because the free space may
+	increase or decrease when a compressed page is reorganized. */
+	old_bits = ibuf_index_page_calc_free_bits(0, max_ins_size);
+	new_bits = ibuf_index_page_calc_free(0, block);
+
+	if (old_bits == new_bits) {
+		return;
+	}
+	ibuf_set_free_bits_low(0, block, new_bits, mtr);
+}
+
+/**********************************************************************//**
+Brings the free bits of a compressed leaf page in the ibuf bitmap up to
+date with the present state of the page; a page left without free space
+is additionally made young in the buffer pool LRU list.  The given mtr
+is used, so the latching order rules virtually prevent any further
+operations for this OS thread until mtr is committed.  NOTE: The free
+bits in the insert buffer bitmap must never exceed the free space on a
+page.  It is safe to set the free bits in the same mini-transaction
+that updated the page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr)	/*!< in/out: mtr */
+{
+	ulint	space_id	= buf_block_get_space(block);
+	ulint	page_no		= buf_block_get_page_no(block);
+	ulint	zip_size	= buf_block_get_zip_size(block);
+	page_t*	bitmap_page;
+	ulint	free_bits;
+
+	/* Only leaf pages with a compressed page size are accepted. */
+	ut_a(page_is_leaf(buf_block_get_frame(block)));
+	ut_a(zip_size);
+
+	/* Fetch the bitmap page describing this page; it gets x-latched
+	in the mini-transaction of the caller. */
+	bitmap_page = ibuf_bitmap_get_map_page(space_id, page_no, zip_size, mtr);
+
+	free_bits = ibuf_index_page_calc_free_zip(zip_size, block);
+
+	if (free_bits == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent those pages to which we
+		cannot make inserts using the insert buffer from slipping
+		out of the buffer pool */
+
+		buf_page_make_young(&block->page);
+	}
+
+	ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+				  IBUF_BITMAP_FREE, free_bits, mtr);
+}
+
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state,
+holding ibuf_bitmap_mutex while both bitmap pages are being latched.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page.  It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+	ulint		zip_size,/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	buf_block_t*	block1,	/*!< in: index page */
+	buf_block_t*	block2,	/*!< in: index page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	free_bits1;
+	ulint	free_bits2;
+
+	/* As we have to x-latch two random bitmap pages, we have to acquire
+	the bitmap mutex to prevent a deadlock with a similar operation
+	performed by another OS thread. */
+
+	mutex_enter(&ibuf_bitmap_mutex);
+
+	free_bits1 = ibuf_index_page_calc_free(zip_size, block1);
+	ibuf_set_free_bits_low(zip_size, block1, free_bits1, mtr);
+
+	free_bits2 = ibuf_index_page_calc_free(zip_size, block2);
+	ibuf_set_free_bits_low(zip_size, block2, free_bits2, mtr);
+
+	mutex_exit(&ibuf_bitmap_mutex);
+}
+
+/**********************************************************************//**
+Tells whether a page is one of the fixed address ibuf pages, that is,
+either the ibuf tree root page or an ibuf bitmap page.
+@return	TRUE if a fixed address ibuf i/o page */
+UNIV_INLINE
+ibool
+ibuf_fixed_addr_page(
+/*=================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no)/*!< in: page number */
+{
+	if (space == IBUF_SPACE_ID && page_no == IBUF_TREE_ROOT_PAGE_NO) {
+		/* The root page of the ibuf tree */
+
+		return(TRUE);
+	}
+
+	/* Otherwise only an ibuf bitmap page qualifies */
+
+	return(ibuf_bitmap_page(zip_size, page_no) != 0);
+}
+
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return	TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page(
+/*======*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	page_no,/*!< in: page number */
+	mtr_t*	mtr)	/*!< in: mtr which will contain an x-latch to the
+			bitmap page if the page is not one of the fixed
+			address ibuf pages, or NULL, in which case a local
+			mini-transaction is started and committed inside
+			this function. */
+{
+	ibool	is_ibuf_page;
+	ibool	own_mtr;
+	mtr_t	local_mtr;
+	page_t*	bitmap_page;
+
+	ut_ad(!recv_no_ibuf_operations);
+
+	if (ibuf_fixed_addr_page(space, zip_size, page_no)) {
+
+		return(TRUE);
+	}
+
+	if (space != IBUF_SPACE_ID) {
+		/* Pages outside the ibuf tablespace are never reported
+		as ibuf pages, apart from the fixed address ones that
+		were handled above. */
+
+		return(FALSE);
+	}
+
+	ut_ad(fil_space_get_type(IBUF_SPACE_ID) == FIL_TABLESPACE);
+
+	/* Fall back to a mini-transaction of our own if the caller did
+	not supply one. */
+	own_mtr = (mtr == NULL);
+
+	if (own_mtr) {
+		mtr = &local_mtr;
+		mtr_start(mtr);
+	}
+
+	bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
+
+	is_ibuf_page = ibuf_bitmap_page_get_bits(bitmap_page, page_no,
+						 zip_size, IBUF_BITMAP_IBUF,
+						 mtr);
+
+	if (own_mtr) {
+		mtr_commit(mtr);
+	}
+
+	return(is_ibuf_page);
+}
+
+/********************************************************************//**
+Reads the page number stored in an ibuf record.  The length of field 1
+tells the record format: in a >= 4.1.x format record the page number is
+in field 2, in an older record it is in field 0.
+@return	page number */
+static
+ulint
+ibuf_rec_get_page_no(
+/*=================*/
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	const byte*	field;
+	ulint		field_no;
+	ulint		len;
+
+	ut_ad(ibuf_inside());
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	/* Only the length of field 1 is of interest at this point */
+	field = rec_get_nth_field_old(rec, 1, &len);
+
+	if (len != 1) {
+		/* A < 4.1.x format record */
+		ut_a(trx_doublewrite_must_reset_space_ids);
+		ut_a(!trx_sys_multiple_tablespace_format);
+
+		field_no = 0;
+	} else {
+		/* A >= 4.1.x format record */
+		ut_a(trx_sys_multiple_tablespace_format);
+
+		field_no = 2;
+	}
+
+	field = rec_get_nth_field_old(rec, field_no, &len);
+
+	ut_a(len == 4);
+
+	return(mach_read_from_4(field));
+}
+
+/********************************************************************//**
+Reads the space id stored in an ibuf record.  A < 4.1.x format record
+carries no space id, and 0 is returned for it.
+@return	space id */
+static
+ulint
+ibuf_rec_get_space(
+/*===============*/
+	const rec_t*	rec)	/*!< in: ibuf record */
+{
+	const byte*	field;
+	ulint		len;
+
+	ut_ad(ibuf_inside());
+	ut_ad(rec_get_n_fields_old(rec) > 2);
+
+	/* The length of field 1 tells the record format */
+	field = rec_get_nth_field_old(rec, 1, &len);
+
+	if (len != 1) {
+		/* A < 4.1.x format record: there is no space id field */
+
+		ut_a(trx_doublewrite_must_reset_space_ids);
+		ut_a(!trx_sys_multiple_tablespace_format);
+
+		return(0);
+	}
+
+	/* A >= 4.1.x format record: the space id is in field 0 */
+
+	ut_a(trx_sys_multiple_tablespace_format);
+	field = rec_get_nth_field_old(rec, 0, &len);
+	ut_a(len == 4);
+
+	return(mach_read_from_4(field));
+}
+
+/********************************************************************//**
+Creates a dummy table and a dummy index on it, both named "IBUF_DUMMY",
+for inserting a record to a non-clustered index.
+
+@return	dummy index */
+static
+dict_index_t*
+ibuf_dummy_index_create(
+/*====================*/
+	ulint		n,	/*!< in: number of fields */
+	ibool		comp)	/*!< in: TRUE=use compact record format */
+{
+	const ulint	table_flags	= comp ? DICT_TF_COMPACT : 0;
+	dict_table_t*	dummy_table;
+	dict_index_t*	dummy_index;
+
+	dummy_table = dict_mem_table_create("IBUF_DUMMY", DICT_HDR_SPACE,
+					    n, table_flags);
+
+	dummy_index = dict_mem_index_create("IBUF_DUMMY", "IBUF_DUMMY",
+					    DICT_HDR_SPACE, 0, n);
+
+	dummy_index->table = dummy_table;
+
+	/* Mark the index as cached in order to avoid
+	ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+	dummy_index->cached = TRUE;
+
+	return(dummy_index);
+}
+/********************************************************************//**
+Adds a column to the dummy index.  The column is first appended to the
+table of the dummy index and then added to the index itself. */
+static
+void
+ibuf_dummy_index_add_col(
+/*=====================*/
+	dict_index_t*	index,	/*!< in: dummy index */
+	const dtype_t*	type,	/*!< in: the data type of the column */
+	ulint		len)	/*!< in: length of the column */
+{
+	dict_table_t*	table	= index->table;
+	/* The new column gets the position that n_def has before
+	the column is added to the table */
+	const ulint	col_no	= table->n_def;
+
+	dict_mem_table_add_col(table, NULL, NULL,
+			       dtype_get_mtype(type),
+			       dtype_get_prtype(type),
+			       dtype_get_len(type));
+
+	dict_index_add_col(index, table,
+			   dict_table_get_nth_col(table, col_no), len);
+}
+/********************************************************************//**
+Frees a dummy index for inserting a record to a non-clustered index,
+together with the dummy table that the index points to. */
+static
+void
+ibuf_dummy_index_free(
+/*==================*/
+	dict_index_t*	index)	/*!< in, own: dummy index */
+{
+	/* Fetch the table pointer before the index object is freed */
+	dict_table_t*	dummy_table = index->table;
+
+	dict_mem_index_free(index);
+	dict_mem_table_free(dummy_table);
+}
+
+/*********************************************************************//**
+Builds the entry to insert into a non-clustered index from the
+corresponding < 4.1.x format record in an ibuf index.
+
+NOTE that the entry only holds pointers to the fields of ibuf_rec: the
+caller must hold a latch to the ibuf_rec page as long as the entry is
+used!
+
+@return own: entry to insert to a non-clustered index */
+UNIV_INLINE
+dtuple_t*
+ibuf_build_entry_pre_4_1_x(
+/*=======================*/
+	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
+	mem_heap_t*	heap,		/*!< in: heap where built */
+	dict_index_t**	pindex)		/*!< out, own: dummy index that
+					describes the entry */
+{
+	dtuple_t*	tuple;
+	const byte*	types;
+	ulint		n_fields;
+	ulint		len;
+	ulint		i;
+
+	ut_a(trx_doublewrite_must_reset_space_ids);
+	ut_a(!trx_sys_multiple_tablespace_format);
+
+	/* The first two fields of the record are not user data; field 1
+	holds the type information of the user fields */
+	n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
+	tuple = dtuple_create(heap, n_fields);
+	types = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+	ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+
+	for (i = 0; i < n_fields;
+	     i++, types += DATA_ORDER_NULL_TYPE_BUF_SIZE) {
+
+		dfield_t*	field	= dtuple_get_nth_field(tuple, i);
+		const byte*	data	= rec_get_nth_field_old(
+			ibuf_rec, i + 2, &len);
+
+		/* Point the tuple field to the data inside ibuf_rec */
+		dfield_set_data(field, data, len);
+
+		dtype_read_for_order_and_null_size(
+			dfield_get_type(field), types);
+	}
+
+	*pindex = ibuf_dummy_index_create(n_fields, FALSE);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Builds the entry to insert into a non-clustered index from the
+corresponding record in an ibuf index.  A < 4.1.x format record is
+passed on to ibuf_build_entry_pre_4_1_x().
+
+NOTE that the entry only holds pointers to the fields of ibuf_rec: the
+caller must hold a latch to the ibuf_rec page as long as the entry is
+used!
+
+@return own: entry to insert to a non-clustered index */
+static
+dtuple_t*
+ibuf_build_entry_from_ibuf_rec(
+/*===========================*/
+	const rec_t*	ibuf_rec,	/*!< in: record in an insert buffer */
+	mem_heap_t*	heap,		/*!< in: heap where built */
+	dict_index_t**	pindex)		/*!< out, own: dummy index that
+					describes the entry */
+{
+	dtuple_t*	tuple;
+	dict_index_t*	index;
+	const byte*	types;
+	const byte*	data;
+	ulint		n_fields;
+	ulint		comp;
+	ulint		len;
+	ulint		i;
+
+	data = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+	if (len > 1) {
+		/* This is a < 4.1.x format record */
+
+		return(ibuf_build_entry_pre_4_1_x(ibuf_rec, heap, pindex));
+	}
+
+	/* This is a >= 4.1.x format record: field 1 is a marker byte
+	and the user fields start at field 4 */
+
+	ut_a(trx_sys_multiple_tablespace_format);
+	ut_a(*data == 0);
+	ut_a(rec_get_n_fields_old(ibuf_rec) > 4);
+
+	n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
+
+	tuple = dtuple_create(heap, n_fields);
+
+	types = rec_get_nth_field_old(ibuf_rec, 3, &len);
+
+	/* One extra byte in the type info field means that the record
+	is for a compact format b-tree */
+	ut_a(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE <= 1);
+	comp = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
+	index = ibuf_dummy_index_create(n_fields, comp);
+
+	if (comp) {
+		/* compact record format: skip the indicator byte */
+		len--;
+		ut_a(*types == 0);
+		types++;
+	}
+
+	ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+	for (i = 0; i < n_fields; i++) {
+		dfield_t*	field = dtuple_get_nth_field(tuple, i);
+
+		data = rec_get_nth_field_old(ibuf_rec, i + 4, &len);
+
+		dfield_set_data(field, data, len);
+
+		dtype_new_read_for_order_and_null_size(
+			dfield_get_type(field),
+			types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+
+		ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
+	}
+
+	/* In debug builds, add system columns to the dummy table of
+	the dummy secondary index, in order to prevent an ut_ad()
+	failure in page_zip_write_rec().  The insert buffer is only
+	used for secondary indexes, whose records never contain any
+	system columns, such as DB_TRX_ID. */
+	ut_d(dict_table_add_system_columns(index->table, index->table->heap));
+
+	*pindex = index;
+
+	return(tuple);
+}
+
+/********************************************************************//**
+Computes the space that a stored non-clustered index entry would take
+if it were converted to an index record.
+@return size of index record in bytes + an upper limit of the space
+taken in the page directory */
+static
+ulint
+ibuf_rec_get_volume(
+/*================*/
+	const rec_t*	ibuf_rec)/*!< in: ibuf record */
+{
+	dtype_t		dtype;
+	const byte*	types;
+	const byte*	data;
+	ibool		new_format;
+	ulint		first_field;
+	ulint		data_size	= 0;
+	ulint		n_fields;
+	ulint		comp;
+	ulint		len;
+	ulint		i;
+
+	ut_ad(ibuf_inside());
+	ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
+
+	/* The length of field 1 tells the record format */
+	data = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+	new_format = (len <= 1);
+
+	if (!new_format) {
+		/* < 4.1.x format record: the type info is in field 1
+		and the user fields start at field 2 */
+
+		ut_a(trx_doublewrite_must_reset_space_ids);
+		ut_a(!trx_sys_multiple_tablespace_format);
+
+		n_fields = rec_get_n_fields_old(ibuf_rec) - 2;
+
+		types = rec_get_nth_field_old(ibuf_rec, 1, &len);
+
+		ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE);
+		comp = 0;
+	} else {
+		/* >= 4.1.x format record: the type info is in field 3
+		and the user fields start at field 4 */
+
+		ut_a(trx_sys_multiple_tablespace_format);
+		ut_a(*data == 0);
+
+		types = rec_get_nth_field_old(ibuf_rec, 3, &len);
+
+		comp = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
+
+		ut_a(comp <= 1);
+		if (comp) {
+			/* compact record format: build the entry with a
+			dummy index and let rec_get_converted_size()
+			compute the size */
+			mem_heap_t*	heap = mem_heap_create(500);
+			dict_index_t*	dummy_index;
+			dtuple_t*	entry;
+			ulint		volume;
+
+			entry = ibuf_build_entry_from_ibuf_rec(
+				ibuf_rec, heap, &dummy_index);
+			volume = rec_get_converted_size(dummy_index, entry, 0);
+			ibuf_dummy_index_free(dummy_index);
+			mem_heap_free(heap);
+
+			return(volume + page_dir_calc_reserved_space(1));
+		}
+
+		n_fields = rec_get_n_fields_old(ibuf_rec) - 4;
+	}
+
+	first_field = new_format ? 4 : 2;
+
+	/* Sum up the sizes of the user fields; an SQL NULL field is
+	counted with the size that its data type gives for it */
+
+	for (i = 0; i < n_fields; i++) {
+		data = rec_get_nth_field_old(ibuf_rec, i + first_field, &len);
+
+		if (new_format) {
+			dtype_new_read_for_order_and_null_size(
+				&dtype, types + i
+				* DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
+		} else {
+			dtype_read_for_order_and_null_size(
+				&dtype, types + i
+				* DATA_ORDER_NULL_TYPE_BUF_SIZE);
+		}
+
+		data_size += (len == UNIV_SQL_NULL)
+			? dtype_get_sql_null_size(&dtype, comp)
+			: len;
+	}
+
+	return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
+	       + page_dir_calc_reserved_space(1));
+}
+
+/*********************************************************************//**
+Builds the tuple to insert to an ibuf tree from an entry for a
+non-clustered index.
+
+NOTE that the original entry must be kept, because the data fields of
+the tuple are copies of the entry fields and point to the same data.
+
+@return	own: entry to insert into an ibuf index tree */
+static
+dtuple_t*
+ibuf_entry_build(
+/*=============*/
+	dict_index_t*	index,	/*!< in: non-clustered index */
+	const dtuple_t*	entry,	/*!< in: entry for a non-clustered index */
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number where entry should
+				be inserted */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	const dfield_t*	entry_field;
+	ulint		n_fields;
+	ulint		comp;
+	ulint		type_info_len;
+	byte*		buf;
+	byte*		type_info;
+	byte*		types;
+	ulint		i;
+
+	/* Layout of the tuple, starting from 4.1.x:
+	field 0: the space id (4 bytes);
+	field 1: a single marker byte (0), which tells that this is a
+	new format record;
+	field 2: the page number (4 bytes);
+	field 3: the relevant type information of each data field; the
+	length of this field % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE is
+	(a) 0 for b-trees in the old format, and
+	(b) 1 for b-trees in the compact format, the first byte of the
+	field being the marker (0);
+	fields 4..: copies of the fields of entry.
+	All fields in the tuple are ordered like the type binary in our
+	insert buffer tree. */
+
+	n_fields = dtuple_get_n_fields(entry);
+
+	tuple = dtuple_create(heap, n_fields + 4);
+
+	/* Field 0: the space id */
+
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, space);
+
+	field = dtuple_get_nth_field(tuple, 0);
+	dfield_set_data(field, buf, 4);
+
+	/* Field 1: the marker byte, which is set to zero */
+
+	buf = mem_heap_alloc(heap, 1);
+	mach_write_to_1(buf, 0);
+
+	field = dtuple_get_nth_field(tuple, 1);
+	dfield_set_data(field, buf, 1);
+
+	/* Field 2: the page number */
+
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, page_no);
+
+	field = dtuple_get_nth_field(tuple, 2);
+	dfield_set_data(field, buf, 4);
+
+	/* Field 3: the type info.  type_info points to the start of the
+	buffer, and types to the place where the per-field type data
+	starts, which is after the compact format indicator if there is
+	one. */
+
+	comp = dict_table_is_comp(index->table);
+	type_info_len = n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE + comp;
+
+	type_info = mem_heap_alloc(heap, type_info_len);
+	types = type_info;
+
+	if (comp) {
+		*types++ = 0; /* write the compact format indicator */
+	}
+
+	/* Copy the fields of entry to the tuple and store the type of
+	each of them */
+
+	for (i = 0; i < n_fields; i++) {
+		ulint			fixed_len;
+		const dict_field_t*	ifield;
+
+		/* The 4 extra fields at the start of an ibuf record
+		shift the position of the data fields by 4 */
+
+		field = dtuple_get_nth_field(tuple, i + 4);
+		entry_field = dtuple_get_nth_field(entry, i);
+		dfield_copy(field, entry_field);
+
+		ifield = dict_index_get_nth_field(index, i);
+		/* Prefix index columns of fixed-length columns are of
+		fixed length.  However, in the function call below,
+		dfield_get_type(entry_field) contains the fixed length
+		of the column in the clustered index.  Replace it with
+		the fixed length of the secondary index column. */
+		fixed_len = ifield->fixed_len;
+
+#ifdef UNIV_DEBUG
+		if (fixed_len) {
+			/* dict_index_add_col() should guarantee these */
+			ut_ad(fixed_len <= (ulint)
+			      dfield_get_type(entry_field)->len);
+			if (ifield->prefix_len) {
+				ut_ad(ifield->prefix_len == fixed_len);
+			} else {
+				ut_ad(fixed_len == (ulint)
+				      dfield_get_type(entry_field)->len);
+			}
+		}
+#endif /* UNIV_DEBUG */
+
+		dtype_new_store_for_order_and_null_size(
+			types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE,
+			dfield_get_type(entry_field), fixed_len);
+	}
+
+	/* Attach the whole type info buffer, the compact format
+	indicator included, to field 3 of the tuple */
+
+	field = dtuple_get_nth_field(tuple, 3);
+
+	dfield_set_data(field, type_info, type_info_len);
+
+	/* Set all the types in the new tuple binary */
+
+	dtuple_set_types_binary(tuple, n_fields + 4);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for < 4.1.x format records: the tuple consists of the page
+number alone, and the space id must be 0.
+@return	own: search tuple */
+static
+dtuple_t*
+ibuf_search_tuple_build(
+/*====================*/
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	byte*		page_no_buf;
+
+	ut_a(space == 0);
+	ut_a(trx_doublewrite_must_reset_space_ids);
+	ut_a(!trx_sys_multiple_tablespace_format);
+
+	tuple = dtuple_create(heap, 1);
+
+	/* The only field is the page number in 4 bytes */
+
+	page_no_buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(page_no_buf, page_no);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0), page_no_buf, 4);
+
+	dtuple_set_types_binary(tuple, 1);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Builds a search tuple used to search buffered inserts for an index page.
+This is for >= 4.1.x format records: the tuple consists of the space id,
+the new format marker byte and the page number.
+@return	own: search tuple */
+static
+dtuple_t*
+ibuf_new_search_tuple_build(
+/*========================*/
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number */
+	mem_heap_t*	heap)	/*!< in: heap into which to build */
+{
+	dtuple_t*	tuple;
+	byte*		space_buf;
+	byte*		marker_buf;
+	byte*		page_no_buf;
+
+	ut_a(trx_sys_multiple_tablespace_format);
+
+	tuple = dtuple_create(heap, 3);
+
+	/* Field 0: the space id in 4 bytes */
+
+	space_buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(space_buf, space);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 0), space_buf, 4);
+
+	/* Field 1: the new format record marker byte (0) */
+
+	marker_buf = mem_heap_alloc(heap, 1);
+	mach_write_to_1(marker_buf, 0);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 1), marker_buf, 1);
+
+	/* Field 2: the page number in 4 bytes */
+
+	page_no_buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(page_no_buf, page_no);
+
+	dfield_set_data(dtuple_get_nth_field(tuple, 2), page_no_buf, 4);
+
+	dtuple_set_types_binary(tuple, 3);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Checks if the free list of the ibuf tree is long enough that we dare to
+start a pessimistic insert to the insert buffer.  The caller must own
+ibuf_mutex.
+@return	TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_enough_free_for_insert(void)
+/*==================================*/
+{
+	ut_ad(mutex_own(&ibuf_mutex));
+
+	/* A big margin of free pages is required.  A B-tree can
+	sometimes grow in size also if records are deleted from it, as
+	the node pointers can change, and we must be able to delete the
+	inserts buffered for pages that we read to the buffer pool,
+	without any risk of running out of free space in the insert
+	buffer. */
+
+	if (ibuf->free_list_len < (ibuf->size / 2) + 3 * ibuf->height) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Checks if the free list of the ibuf tree is so long that pages should
+be removed from it and freed to the file space management.  The caller
+must own ibuf_mutex.
+@return	TRUE if enough free pages in list */
+UNIV_INLINE
+ibool
+ibuf_data_too_much_free(void)
+/*=========================*/
+{
+	ut_ad(mutex_own(&ibuf_mutex));
+
+	if (ibuf->free_list_len < 3 + (ibuf->size / 2) + 3 * ibuf->height) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Allocates a new page from the ibuf file segment and adds it to the free
+list.  On success the page type is set to FIL_PAGE_IBUF_FREE_LIST, the
+page is appended to the free list kept in the ibuf tree root page,
+ibuf->seg_size and ibuf->free_list_len are incremented, and the
+IBUF_BITMAP_IBUF bit of the page is set in the ibuf bitmap.  All page
+changes are made in one mini-transaction that is local to this function.
+@return	DB_SUCCESS, or DB_STRONG_FAIL if no space left */
+static
+ulint
+ibuf_add_free_page(void)
+/*====================*/
+{
+	mtr_t	mtr;
+	page_t*	header_page;
+	ulint	flags;
+	ulint	zip_size;
+	ulint	page_no;
+	page_t*	page;
+	page_t*	root;
+	page_t*	bitmap_page;
+
+	mtr_start(&mtr);
+
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order.  The tablespace flags returned by the same call give the
+	zip_size that is used for the bitmap page access further below. */
+	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	header_page = ibuf_header_page_get(&mtr);
+
+	/* Allocate a new page: NOTE that if the page has been a part of a
+	non-clustered index which has subsequently been dropped, then the
+	page may have buffered inserts in the insert buffer, and these
+	should be deleted from there. These get deleted when the page
+	allocation creates the page in buffer. Thus the call below may end
+	up calling the insert buffer routines and, as we yet have no latches
+	to insert buffer tree pages, these routines can run without a risk
+	of a deadlock. This is the reason why we created a special ibuf
+	header page apart from the ibuf tree.
+
+	If the allocation fails (FIL_NULL), the mini-transaction is
+	committed and DB_STRONG_FAIL is returned; neither ibuf_enter() nor
+	ibuf_mutex has been taken at that point. */
+
+	page_no = fseg_alloc_free_page(
+		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
+		&mtr);
+
+	if (page_no == FIL_NULL) {
+		mtr_commit(&mtr);
+
+		return(DB_STRONG_FAIL);
+	}
+
+	{
+		buf_block_t*	block;
+
+		block = buf_page_get(
+			IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
+
+
+		page = buf_block_get_frame(block);
+	}
+
+	ibuf_enter();
+
+	mutex_enter(&ibuf_mutex);
+
+	root = ibuf_tree_root_get(&mtr);
+
+	/* Add the page to the free list and update the ibuf size data.
+	The list base node is in the root page and the list node in the
+	new page.  The in-memory counters are changed while ibuf_mutex
+	is held. */
+
+	flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		      page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
+			 MLOG_2BYTES, &mtr);
+
+	ibuf->seg_size++;
+	ibuf->free_list_len++;
+
+	/* Set the bit indicating that this page is now an ibuf tree page
+	(level 2 page).  After this the mini-transaction is committed,
+	and only then are ibuf_mutex released and ibuf_exit() called. */
+
+	bitmap_page = ibuf_bitmap_get_map_page(
+		IBUF_SPACE_ID, page_no, zip_size, &mtr);
+
+	ibuf_bitmap_page_set_bits(
+		bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, TRUE, &mtr);
+
+	mtr_commit(&mtr);
+
+	mutex_exit(&ibuf_mutex);
+
+	ibuf_exit();
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Removes a page from the free list and frees it to the fsp system.
+Nothing is done if ibuf_data_too_much_free() does not hold when checked
+under ibuf_mutex.  Two mini-transactions are used: mtr2 only reads the
+number of the last page of the free list and is committed at once, while
+mtr covers freeing the page, unlinking it from the free list and
+clearing its IBUF_BITMAP_IBUF bit in the ibuf bitmap. */
+static
+void
+ibuf_remove_free_page(void)
+/*=======================*/
+{
+	mtr_t	mtr;
+	mtr_t	mtr2;
+	page_t*	header_page;
+	ulint	flags;
+	ulint	zip_size;
+	ulint	page_no;
+	page_t*	page;
+	page_t*	root;
+	page_t*	bitmap_page;
+
+	mtr_start(&mtr);
+
+	/* Acquire the fsp latch before the ibuf header, obeying the latching
+	order.  The tablespace flags returned by the same call give the
+	zip_size that is used for the bitmap page access further below. */
+	mtr_x_lock(fil_space_get_latch(IBUF_SPACE_ID, &flags), &mtr);
+	zip_size = dict_table_flags_to_zip_size(flags);
+
+	header_page = ibuf_header_page_get(&mtr);
+
+	/* Prevent pessimistic inserts to insert buffer trees for a while.
+	This mutex is held until the page has been unlinked from the free
+	list and the ibuf size data has been updated. */
+	mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+	ibuf_enter();
+
+	mutex_enter(&ibuf_mutex);
+
+	if (!ibuf_data_too_much_free()) {
+
+		mutex_exit(&ibuf_mutex);
+
+		ibuf_exit();
+
+		mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	mtr_start(&mtr2);
+
+	root = ibuf_tree_root_get(&mtr2);
+
+	page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+				&mtr2).page;
+
+	/* NOTE that we must release the latch on the ibuf tree root
+	because in fseg_free_page we access level 1 pages, and the root
+	is a level 2 page.  This is why the root is accessed in mtr2,
+	which is committed here, and why ibuf_mutex is released and
+	ibuf_exit() is called before fseg_free_page(). */
+
+	mtr_commit(&mtr2);
+	mutex_exit(&ibuf_mutex);
+
+	ibuf_exit();
+
+	/* Since pessimistic inserts were prevented, we know that the
+	page is still in the free list. NOTE that also deletes may take
+	pages from the free list, but they take them from the start, and
+	the free list was so long that they cannot have taken the last
+	page from it. */
+
+	fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
+		       IBUF_SPACE_ID, page_no, &mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	buf_page_reset_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif
+
+	ibuf_enter();
+
+	mutex_enter(&ibuf_mutex);
+
+	root = ibuf_tree_root_get(&mtr);
+
+	ut_ad(page_no == flst_get_last(root + PAGE_HEADER
+				       + PAGE_BTR_IBUF_FREE_LIST, &mtr).page);
+
+	{
+		buf_block_t*	block;
+
+		block = buf_page_get(
+			IBUF_SPACE_ID, 0, page_no, RW_X_LATCH, &mtr);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+
+		page = buf_block_get_frame(block);
+	}
+
+	/* Remove the page from the free list and update the ibuf size data.
+	The root is accessed in mtr this time, and the in-memory counters
+	are changed while ibuf_mutex is held. */
+
+	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
+		    page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);
+
+	ibuf->seg_size--;
+	ibuf->free_list_len--;
+
+	mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+	/* Set the bit indicating that this page is no more an ibuf tree page
+	(level 2 page).  After this the mini-transaction is committed,
+	and only then are ibuf_mutex released and ibuf_exit() called. */
+
+	bitmap_page = ibuf_bitmap_get_map_page(
+		IBUF_SPACE_ID, page_no, zip_size, &mtr);
+
+	ibuf_bitmap_page_set_bits(
+		bitmap_page, page_no, zip_size, IBUF_BITMAP_IBUF, FALSE, &mtr);
+
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+	buf_page_set_file_page_was_freed(IBUF_SPACE_ID, page_no);
+#endif
+	mtr_commit(&mtr);
+
+	mutex_exit(&ibuf_mutex);
+
+	ibuf_exit();
+}
+
+/***********************************************************************//**
+Frees excess pages from the ibuf free list, at most 4 pages per call.
+This function is called when an OS thread calls fsp services to allocate
+a new file segment, or a new page to a file segment, and the thread did
+not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void)
+/*========================*/
+{
+	ulint		i;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(fil_space_get_latch(IBUF_SPACE_ID, NULL),
+			  RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rw_lock_get_x_lock_count(
+		fil_space_get_latch(IBUF_SPACE_ID, NULL)) == 1);
+
+	ut_ad(!ibuf_inside());
+
+	/* NOTE: The thread is required not to have owned the latch
+	before, because then we know that we can obey the correct
+	latching order for ibuf latches */
+
+	if (!ibuf) {
+		/* The insert buffer is not yet initialized; it is not
+		certain that this can happen, but the check is harmless. */
+
+		return;
+	}
+
+	/* Only a few pages are freed at a time, so that the requested
+	service is not delayed too much */
+
+	for (i = 0; i < 4; i++) {
+		ibool	too_much_free;
+
+		/* The length of the free list is checked under
+		ibuf_mutex, which is released before a page is removed */
+
+		mutex_enter(&ibuf_mutex);
+
+		too_much_free = ibuf_data_too_much_free();
+
+		mutex_exit(&ibuf_mutex);
+
+		if (!too_much_free) {
+
+			return;
+		}
+
+		ibuf_remove_free_page();
+	}
+}
+
+/*********************************************************************//**
+Reads page numbers from a leaf in an ibuf tree.  The records of the leaf
+are scanned in two passes: first backwards from rec, to find where the
+'merge area' of the page of rec starts on this leaf, and then forwards
+from there, storing to space_ids, space_versions and page_nos the pages
+that are chosen for a merge.  At most
+ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4) pages are
+stored.  The volumes of the stored pages are summed up for the return
+value.
+@return a lower limit for the combined volume of records which will be
+merged */
+static
+ulint
+ibuf_get_merge_page_nos(
+/*====================*/
+	ibool		contract,/*!< in: TRUE if this function is called to
+				contract the tree, FALSE if this is called
+				when a single page becomes full and we look
+				if it pays to read also nearby pages */
+	rec_t*		rec,	/*!< in: record from which we read up and down
+				in the chain of records */
+	ulint*		space_ids,/*!< in/out: space id's of the pages */
+	ib_int64_t*	space_versions,/*!< in/out: tablespace version
+				timestamps; used to prevent reading in old
+				pages after DISCARD + IMPORT tablespace */
+	ulint*		page_nos,/*!< in/out: buffer for at least
+				IBUF_MAX_N_PAGES_MERGED many page numbers;
+				the page numbers are in an ascending order */
+	ulint*		n_stored)/*!< out: number of page numbers stored to
+				page_nos in this function */
+{
+	ulint	prev_page_no;
+	ulint	prev_space_id;
+	ulint	first_page_no;
+	ulint	first_space_id;
+	ulint	rec_page_no;
+	ulint	rec_space_id;
+	ulint	sum_volumes;
+	ulint	volume_for_page;
+	ulint	rec_volume;
+	ulint	limit;
+	ulint	n_pages;
+
+	*n_stored = 0;
+
+	limit = ut_min(IBUF_MAX_N_PAGES_MERGED, buf_pool->curr_size / 4);
+
+	if (page_rec_is_supremum(rec)) {
+
+		rec = page_rec_get_prev(rec);
+	}
+
+	if (page_rec_is_infimum(rec)) {
+
+		rec = page_rec_get_next(rec);
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		return(0);
+	}
+
+	first_page_no = ibuf_rec_get_page_no(rec);
+	first_space_id = ibuf_rec_get_space(rec);
+	n_pages = 0;
+	prev_page_no = 0;
+	prev_space_id = 0;
+
+	/* Go backwards from the first rec until we reach the border of the
+	'merge area', or the page start or the limit of storeable pages is
+	reached.  A record is outside the merge area if its space id
+	differs from that of the first rec, or if its page number divided
+	by IBUF_MERGE_AREA differs from that of the first rec.  n_pages
+	counts the distinct (space id, page no) pairs seen in this pass;
+	it is only used for bounding this pass by limit. */
+
+	while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
+
+		rec_page_no = ibuf_rec_get_page_no(rec);
+		rec_space_id = ibuf_rec_get_space(rec);
+
+		if (rec_space_id != first_space_id
+		    || (rec_page_no / IBUF_MERGE_AREA)
+		    != (first_page_no / IBUF_MERGE_AREA)) {
+
+			break;
+		}
+
+		if (rec_page_no != prev_page_no
+		    || rec_space_id != prev_space_id) {
+			n_pages++;
+		}
+
+		prev_page_no = rec_page_no;
+		prev_space_id = rec_space_id;
+
+		rec = page_rec_get_prev(rec);
+	}
+
+	rec = page_rec_get_next(rec);
+
+	/* At the loop start there is no prev page; we mark this with a pair
+	of space id, page no (0, 0) for which there can never be entries in
+	the insert buffer.
+
+	The loop below goes forwards and sums up in volume_for_page the
+	volume of the successive records that belong to the same page.
+	When the (space id, page no) pair changes, the previous page is
+	stored if it is the page of the first rec, or if contract is TRUE,
+	or if volume_for_page exceeds the threshold computed from
+	IBUF_MERGE_THRESHOLD, UNIV_PAGE_SIZE and
+	IBUF_PAGE_SIZE_PER_FREE_SPACE.  Only the volumes of stored pages
+	are added to sum_volumes.  The loop ends when limit pages have
+	been stored, when a record outside the merge area is met, or when
+	the supremum record is reached. */
+
+	prev_page_no = 0;
+	prev_space_id = 0;
+	sum_volumes = 0;
+	volume_for_page = 0;
+
+	while (*n_stored < limit) {
+		if (page_rec_is_supremum(rec)) {
+			/* When no more records available, mark this with
+			another 'impossible' pair of space id, page no.
+			As the pair differs from the previous one, the
+			last page gets handled by the code below before
+			the loop is left. */
+			rec_page_no = 1;
+			rec_space_id = 0;
+		} else {
+			rec_page_no = ibuf_rec_get_page_no(rec);
+			rec_space_id = ibuf_rec_get_space(rec);
+			ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO);
+		}
+
+#ifdef UNIV_IBUF_DEBUG
+		ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
+#endif
+		if ((rec_space_id != prev_space_id
+		     || rec_page_no != prev_page_no)
+		    && (prev_space_id != 0 || prev_page_no != 0)) {
+
+			if ((prev_page_no == first_page_no
+			     && prev_space_id == first_space_id)
+			    || contract
+			    || (volume_for_page
+				> ((IBUF_MERGE_THRESHOLD - 1)
+				   * 4 * UNIV_PAGE_SIZE
+				   / IBUF_PAGE_SIZE_PER_FREE_SPACE)
+				/ IBUF_MERGE_THRESHOLD)) {
+
+				space_ids[*n_stored] = prev_space_id;
+				space_versions[*n_stored]
+					= fil_space_get_version(prev_space_id);
+				page_nos[*n_stored] = prev_page_no;
+
+				(*n_stored)++;
+
+				sum_volumes += volume_for_page;
+			}
+
+			if (rec_space_id != first_space_id
+			    || rec_page_no / IBUF_MERGE_AREA
+			    != first_page_no / IBUF_MERGE_AREA) {
+
+				break;
+			}
+
+			volume_for_page = 0;
+		}
+
+		if (rec_page_no == 1 && rec_space_id == 0) {
+			/* Supremum record: this is the 'impossible' pair
+			that was set at the start of this iteration */
+
+			break;
+		}
+
+		rec_volume = ibuf_rec_get_volume(rec);
+
+		volume_for_page += rec_volume;
+
+		prev_page_no = rec_page_no;
+		prev_space_id = rec_space_id;
+
+		rec = page_rec_get_next(rec);
+	}
+
+#ifdef UNIV_IBUF_DEBUG
+	ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+#if 0
+	fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
+		*n_stored, sum_volumes);
+#endif
+	return(sum_volumes);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.  A
+cursor is opened at a random position of the ibuf tree, the pages to
+merge are picked with ibuf_get_merge_page_nos() from the leaf page that
+the cursor landed on, and reads of those pages are issued with
+buf_read_ibuf_merge_pages().  If the ibuf is not empty, 1 is added to
+the volume estimate, so that the return value is nonzero.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+static
+ulint
+ibuf_contract_ext(
+/*==============*/
+	ulint*	n_pages,/*!< out: number of pages to which merged */
+	ibool	sync)	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+{
+	btr_pcur_t	pcur;
+	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
+	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
+	ib_int64_t	space_versions[IBUF_MAX_N_PAGES_MERGED];
+	ulint		n_stored;
+	ulint		sum_sizes;
+	mtr_t		mtr;
+
+	*n_pages = 0;
+	ut_ad(!ibuf_inside());
+
+	mutex_enter(&ibuf_mutex);
+
+	if (ibuf->empty) {
+ibuf_is_empty: /* also reached by the goto below, with ibuf_mutex held */
+		mutex_exit(&ibuf_mutex);
+
+		return(0);
+	}
+
+	mtr_start(&mtr);
+
+	ibuf_enter();
+
+	/* Open a cursor to a randomly chosen leaf of the tree, at a random
+	position within the leaf */
+
+	btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	if (page_get_n_recs(btr_pcur_get_page(&pcur)) == 0) {
+		/* When the ibuf tree is emptied completely, the last record
+		is removed using an optimistic delete and ibuf_size_update
+		is not called, causing ibuf->empty to remain FALSE. If we do
+		not reset it to TRUE here then database shutdown will hang
+		in the loop in ibuf_contract_for_n_pages.
+
+		ibuf_mutex is still held here; it is released at the
+		ibuf_is_empty label after the mini-transaction has been
+		committed and the cursor closed. */
+
+		ibuf->empty = TRUE;
+
+		ibuf_exit();
+
+		mtr_commit(&mtr);
+		btr_pcur_close(&pcur);
+
+		goto ibuf_is_empty;
+	}
+
+	mutex_exit(&ibuf_mutex);
+
+	sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur),
+					    space_ids, space_versions,
+					    page_nos, &n_stored);
+#if 0 /* defined UNIV_IBUF_DEBUG */
+	fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
+		sync, n_stored, sum_sizes);
+#endif
+	ibuf_exit();
+
+	mtr_commit(&mtr);
+	btr_pcur_close(&pcur);
+
+	buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos,
+				  n_stored);
+	*n_pages = n_stored;
+
+	return(sum_sizes + 1);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool. This is
+ibuf_contract_ext() for callers that are not interested in the number of
+pages that were read.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract(
+/*==========*/
+	ibool	sync)	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+{
+	ulint	n_pages_ignored;
+	ulint	n_bytes;
+
+	n_bytes = ibuf_contract_ext(&n_pages_ignored, sync);
+
+	return(n_bytes);
+}
+
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool, calling
+ibuf_contract_ext() repeatedly until at least n_pages pages have been
+requested or the insert buffer is reported empty.
+@return the sum of the values returned by the ibuf_contract_ext() calls
+that were made; 0 if ibuf was empty on the first call or n_pages is 0 */
+UNIV_INTERN
+ulint
+ibuf_contract_for_n_pages(
+/*======================*/
+	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+	ulint	n_pages)/*!< in: try to read at least this many pages to
+			the buffer pool and merge the ibuf contents to
+			them */
+{
+	ulint	total_bytes	= 0;
+	ulint	total_pages	= 0;
+
+	while (total_pages < n_pages) {
+		ulint	batch_pages;
+		ulint	batch_bytes = ibuf_contract_ext(&batch_pages, sync);
+
+		if (batch_bytes == 0) {
+			/* The insert buffer is empty: nothing more to do */
+			break;
+		}
+
+		total_bytes += batch_bytes;
+		total_pages += batch_pages;
+	}
+
+	return(total_bytes);
+}
+
+/*********************************************************************//**
+Contract insert buffer trees after insert if they are too big. When
+srv_ibuf_active_contract is set, the contraction is done regardless of
+the current insert buffer size. */
+UNIV_INLINE
+void
+ibuf_contract_after_insert(
+/*=======================*/
+	ulint	entry_size)	/*!< in: size of a record which was inserted
+				into an ibuf tree */
+{
+	ibool	sync;
+	ulint	contracted	= 0;
+	ulint	n_bytes;
+
+	mutex_enter(&ibuf_mutex);
+
+	if (!srv_ibuf_active_contract
+	    && ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+		/* The insert buffer is still small enough */
+		mutex_exit(&ibuf_mutex);
+
+		return;
+	}
+
+	/* Wait for the reads only when the insert buffer has grown past
+	the higher threshold */
+	sync = (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_ON_INSERT_SYNC)
+		? TRUE : FALSE;
+
+	mutex_exit(&ibuf_mutex);
+
+	/* Contract at least entry_size many bytes, or until
+	ibuf_contract() returns 0 */
+	while (contracted < entry_size) {
+
+		n_bytes = ibuf_contract(sync);
+
+		if (n_bytes == 0) {
+
+			break;
+		}
+
+		contracted += n_bytes;
+	}
+}
+
+/*********************************************************************//**
+Gets an upper limit for the combined size of entries buffered in the insert
+buffer for a given page. Starting from the cursor position, the function
+first sums ibuf_rec_get_volume() over the matching records that precede the
+cursor (on the cursor page and, if the start of that page is reached, on
+the previous page), and then over the matching records that follow the
+cursor (on the cursor page and, if the end of that page is reached, on the
+next page). It never looks further than one neighbor page in either
+direction.
+@return upper limit for the volume of buffered inserts for the index
+page, in bytes; UNIV_PAGE_SIZE, if the entries for the index page span
+several pages in the insert buffer */
+static
+ulint
+ibuf_get_volume_buffered(
+/*=====================*/
+	btr_pcur_t*	pcur,	/*!< in: pcur positioned at a place in an
+				insert buffer tree where we would insert an
+				entry for the index page whose number is
+				page_no, latch mode has to be BTR_MODIFY_PREV
+				or BTR_MODIFY_TREE */
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: page number of an index page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	volume;
+	rec_t*	rec;
+	page_t*	page;
+	ulint	prev_page_no;
+	page_t*	prev_page;
+	ulint	next_page_no;
+	page_t*	next_page;
+
+	ut_a(trx_sys_multiple_tablespace_format);
+
+	ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
+	      || (pcur->latch_mode == BTR_MODIFY_TREE));
+
+	/* Count the volume of records earlier in the alphabetical order than
+	pcur */
+
+	volume = 0;
+
+	rec = btr_pcur_get_rec(pcur);
+	page = page_align(rec);
+	/* If the cursor is on the page supremum, start the backward scan
+	from the record before it */
+	if (page_rec_is_supremum(rec)) {
+		rec = page_rec_get_prev(rec);
+	}
+
+	for (;;) {
+		if (page_rec_is_infimum(rec)) {
+			/* Start of the cursor page reached: the scan
+			continues on the previous page below */
+			break;
+		}
+
+		if (page_no != ibuf_rec_get_page_no(rec)
+		    || space != ibuf_rec_get_space(rec)) {
+			/* A record for another index page ends the
+			backward scan */
+			goto count_later;
+		}
+
+		volume += ibuf_rec_get_volume(rec);
+
+		rec = page_rec_get_prev(rec);
+	}
+
+	/* Look at the previous page */
+
+	prev_page_no = btr_page_get_prev(page, mtr);
+
+	if (prev_page_no == FIL_NULL) {
+		/* The cursor page has no left neighbor */
+		goto count_later;
+	}
+
+	{
+		buf_block_t*	block;
+
+		block = buf_page_get(
+			IBUF_SPACE_ID, 0, prev_page_no, RW_X_LATCH, mtr);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+
+		prev_page = buf_block_get_frame(block);
+	}
+
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_page_get_next(prev_page, mtr)
+	     == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* Scan the previous page backwards, starting from its last record */
+	rec = page_get_supremum_rec(prev_page);
+	rec = page_rec_get_prev(rec);
+
+	for (;;) {
+		if (page_rec_is_infimum(rec)) {
+
+			/* We cannot go to yet a previous page, because we
+			do not have the x-latch on it, and cannot acquire one
+			because of the latching order: we have to give up */
+
+			return(UNIV_PAGE_SIZE);
+		}
+
+		if (page_no != ibuf_rec_get_page_no(rec)
+		    || space != ibuf_rec_get_space(rec)) {
+
+			goto count_later;
+		}
+
+		volume += ibuf_rec_get_volume(rec);
+
+		rec = page_rec_get_prev(rec);
+	}
+
+count_later:
+	/* Count the volume of the records that follow the cursor position;
+	the record at the cursor itself was handled by the backward scan */
+	rec = btr_pcur_get_rec(pcur);
+
+	if (!page_rec_is_supremum(rec)) {
+		rec = page_rec_get_next(rec);
+	}
+
+	for (;;) {
+		if (page_rec_is_supremum(rec)) {
+			/* End of the cursor page reached: the scan
+			continues on the next page below */
+			break;
+		}
+
+		if (page_no != ibuf_rec_get_page_no(rec)
+		    || space != ibuf_rec_get_space(rec)) {
+			/* A record for another index page: the sum is
+			complete */
+			return(volume);
+		}
+
+		volume += ibuf_rec_get_volume(rec);
+
+		rec = page_rec_get_next(rec);
+	}
+
+	/* Look at the next page */
+
+	next_page_no = btr_page_get_next(page, mtr);
+
+	if (next_page_no == FIL_NULL) {
+		/* The cursor page has no right neighbor */
+		return(volume);
+	}
+
+	{
+		buf_block_t*	block;
+
+		block = buf_page_get(
+			IBUF_SPACE_ID, 0, next_page_no, RW_X_LATCH, mtr);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+
+
+		next_page = buf_block_get_frame(block);
+	}
+
+#ifdef UNIV_BTR_DEBUG
+	ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page));
+#endif /* UNIV_BTR_DEBUG */
+
+	/* Scan the next page forwards, starting from its first record */
+	rec = page_get_infimum_rec(next_page);
+	rec = page_rec_get_next(rec);
+
+	for (;;) {
+		if (page_rec_is_supremum(rec)) {
+
+			/* We give up: every record on the next page matched,
+			so the entries may continue on a further page that is
+			not examined here */
+
+			return(UNIV_PAGE_SIZE);
+		}
+
+		if (page_no != ibuf_rec_get_page_no(rec)
+		    || space != ibuf_rec_get_space(rec)) {
+
+			return(volume);
+		}
+
+		volume += ibuf_rec_get_volume(rec);
+
+		rec = page_rec_get_next(rec);
+	}
+}
+
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and passes it to fil_set_max_space_id_if_bigger(). If the tree is
+empty, 0 is passed. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void)
+/*===============================*/
+{
+	ulint		max_space_id	= 0;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+
+	ut_a(!dict_table_is_comp(ibuf->index->table));
+
+	ibuf_enter();
+
+	mtr_start(&mtr);
+
+	/* Open the cursor at the high end of the index and step back to
+	the last record */
+	btr_pcur_open_at_index_side(
+		FALSE, ibuf->index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	btr_pcur_move_to_prev(&pcur, &mtr);
+
+	if (!btr_pcur_is_before_first_on_page(&pcur)) {
+		/* The tree is not empty: the first field of the last
+		record holds the space id as 4 bytes */
+		const rec_t*	last_rec;
+		const byte*	space_id_field;
+		ulint		field_len;
+
+		last_rec = btr_pcur_get_rec(&pcur);
+
+		space_id_field = rec_get_nth_field_old(last_rec, 0,
+						       &field_len);
+
+		ut_a(field_len == 4);
+
+		max_space_id = mach_read_from_4(space_id_field);
+	}
+
+	mtr_commit(&mtr);
+	ibuf_exit();
+
+	/* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
+
+	fil_set_max_space_id_if_bigger(max_space_id);
+}
+
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible.
+DB_STRONG_FAIL is returned, without inserting, when the insert buffer is
+too big (after a synchronous ibuf_contract()), when ibuf_add_free_page()
+fails, when the index page is in the buffer pool or has explicit record
+locks on it, and when the entry might not fit on the index page. In the
+last case reads of the pages picked by ibuf_get_merge_page_nos() are
+requested before returning. Otherwise the result of the B-tree insert is
+returned.
+@return	DB_SUCCESS, DB_FAIL, DB_STRONG_FAIL */
+static
+ulint
+ibuf_insert_low(
+/*============*/
+	ulint		mode,	/*!< in: BTR_MODIFY_PREV or BTR_MODIFY_TREE */
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	ulint		entry_size,
+				/*!< in: rec_get_converted_size(index, entry) */
+	dict_index_t*	index,	/*!< in: index where to insert; must not be
+				unique or clustered */
+	ulint		space,	/*!< in: space id where to insert */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint		page_no,/*!< in: page number where to insert */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	big_rec_t*	dummy_big_rec;
+	btr_pcur_t	pcur;
+	btr_cur_t*	cursor;
+	dtuple_t*	ibuf_entry;
+	mem_heap_t*	heap;
+	ulint		buffered;
+	rec_t*		ins_rec;
+	ibool		old_bit_value;
+	page_t*		bitmap_page;
+	page_t*		root;
+	ulint		err;
+	ibool		do_merge;
+	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];
+	ib_int64_t	space_versions[IBUF_MAX_N_PAGES_MERGED];
+	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
+	ulint		n_stored;
+	ulint		bits;
+	mtr_t		mtr;
+	mtr_t		bitmap_mtr;
+
+	ut_a(!dict_index_is_clust(index));
+	ut_ad(dtuple_check_typed(entry));
+	ut_ad(ut_is_2pow(zip_size));
+
+	ut_a(trx_sys_multiple_tablespace_format);
+
+	do_merge = FALSE;
+
+	mutex_enter(&ibuf_mutex);
+
+	if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
+		/* Insert buffer is now too big, contract it but do not try
+		to insert */
+
+		mutex_exit(&ibuf_mutex);
+
+#ifdef UNIV_IBUF_DEBUG
+		fputs("Ibuf too big\n", stderr);
+#endif
+		/* Use synchronous contract (== TRUE) */
+		ibuf_contract(TRUE);
+
+		return(DB_STRONG_FAIL);
+	}
+
+	mutex_exit(&ibuf_mutex);
+	/* In BTR_MODIFY_TREE mode ibuf_pessimistic_insert_mutex and
+	ibuf_mutex are acquired here and kept until function_exit. While
+	ibuf_data_enough_free_for_insert() does not hold, both are released
+	(and ibuf_exit() is called) around each ibuf_add_free_page() call. */
+	if (mode == BTR_MODIFY_TREE) {
+		mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+		ibuf_enter();
+
+		mutex_enter(&ibuf_mutex);
+
+		while (!ibuf_data_enough_free_for_insert()) {
+
+			mutex_exit(&ibuf_mutex);
+
+			ibuf_exit();
+
+			mutex_exit(&ibuf_pessimistic_insert_mutex);
+
+			err = ibuf_add_free_page();
+
+			if (err == DB_STRONG_FAIL) {
+				/* Both mutexes were released above and the
+				heap is not created yet: nothing to undo */
+				return(err);
+			}
+
+			mutex_enter(&ibuf_pessimistic_insert_mutex);
+
+			ibuf_enter();
+
+			mutex_enter(&ibuf_mutex);
+		}
+	} else {
+		ibuf_enter();
+	}
+
+	heap = mem_heap_create(512);
+
+	/* Build the entry which contains the space id and the page number as
+	the first fields and the type information for other fields, and which
+	will be inserted to the insert buffer. */
+
+	ibuf_entry = ibuf_entry_build(index, entry, space, page_no, heap);
+
+	/* Open a cursor to the insert buffer tree to calculate if we can add
+	the new entry to it without exceeding the free space limit for the
+	page. */
+
+	mtr_start(&mtr);
+
+	btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
+
+	/* Find out the volume of already buffered inserts for the same index
+	page */
+	buffered = ibuf_get_volume_buffered(&pcur, space, page_no, &mtr);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a((buffered == 0) || ibuf_count_get(space, page_no));
+#endif
+	/* The bitmap page is accessed in a mini-transaction of its own,
+	which is committed before the B-tree insert below */
+	mtr_start(&bitmap_mtr);
+
+	bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+					       zip_size, &bitmap_mtr);
+
+	/* We check if the index page is suitable for buffered entries */
+
+	if (buf_page_peek(space, page_no)
+	    || lock_rec_expl_exist_on_page(space, page_no)) {
+		err = DB_STRONG_FAIL;
+
+		mtr_commit(&bitmap_mtr);
+
+		goto function_exit;
+	}
+
+	bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, zip_size,
+					 IBUF_BITMAP_FREE, &bitmap_mtr);
+
+	if (buffered + entry_size + page_dir_calc_reserved_space(1)
+	    > ibuf_index_page_calc_free_from_bits(zip_size, bits)) {
+		mtr_commit(&bitmap_mtr);
+
+		/* It may not fit */
+		err = DB_STRONG_FAIL;
+
+		do_merge = TRUE;
+
+		/* Collect the pages to read; the reads are requested at
+		the end of the function, after the mtr has been committed */
+		ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur),
+					space_ids, space_versions,
+					page_nos, &n_stored);
+		goto function_exit;
+	}
+
+	/* Set the bitmap bit denoting that the insert buffer contains
+	buffered entries for this index page, if the bit is not set yet */
+
+	old_bit_value = ibuf_bitmap_page_get_bits(
+		bitmap_page, page_no, zip_size,
+		IBUF_BITMAP_BUFFERED, &bitmap_mtr);
+
+	if (!old_bit_value) {
+		ibuf_bitmap_page_set_bits(bitmap_page, page_no, zip_size,
+					  IBUF_BITMAP_BUFFERED, TRUE,
+					  &bitmap_mtr);
+	}
+
+	mtr_commit(&bitmap_mtr);
+
+	cursor = btr_pcur_get_btr_cur(&pcur);
+
+	if (mode == BTR_MODIFY_PREV) {
+		err = btr_cur_optimistic_insert(BTR_NO_LOCKING_FLAG, cursor,
+						ibuf_entry, &ins_rec,
+						&dummy_big_rec, 0, thr, &mtr);
+		if (err == DB_SUCCESS) {
+			/* Update the page max trx id field */
+			page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
+					       thr_get_trx(thr)->id, &mtr);
+		}
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* We acquire an x-latch to the root page before the insert,
+		because a pessimistic insert releases the tree x-latch,
+		which would cause the x-latching of the root after that to
+		break the latching order. */
+
+		root = ibuf_tree_root_get(&mtr);
+
+		err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG
+						 | BTR_NO_UNDO_LOG_FLAG,
+						 cursor,
+						 ibuf_entry, &ins_rec,
+						 &dummy_big_rec, 0, thr, &mtr);
+		if (err == DB_SUCCESS) {
+			/* Update the page max trx id field */
+			page_update_max_trx_id(btr_cur_get_block(cursor), NULL,
+					       thr_get_trx(thr)->id, &mtr);
+		}
+
+		ibuf_size_update(root, &mtr);
+	}
+	/* Every path that reaches this label has set err */
+function_exit:
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	if (err == DB_SUCCESS) {
+		fprintf(stderr,
+			"Incrementing ibuf count of space %lu page %lu\n"
+			"from %lu by 1\n", space, page_no,
+			ibuf_count_get(space, page_no));
+
+		ibuf_count_set(space, page_no,
+			       ibuf_count_get(space, page_no) + 1);
+	}
+#endif
+	if (mode == BTR_MODIFY_TREE) {
+		/* Release the two mutexes taken for the pessimistic mode */
+		mutex_exit(&ibuf_mutex);
+		mutex_exit(&ibuf_pessimistic_insert_mutex);
+	}
+
+	mtr_commit(&mtr);
+	btr_pcur_close(&pcur);
+	ibuf_exit();
+
+	mem_heap_free(heap);
+	/* The statistics are updated under ibuf_mutex, which is no longer
+	held at this point in either mode */
+	if (err == DB_SUCCESS) {
+		mutex_enter(&ibuf_mutex);
+
+		ibuf->empty = FALSE;
+		ibuf->n_inserts++;
+
+		mutex_exit(&ibuf_mutex);
+
+		if (mode == BTR_MODIFY_TREE) {
+			ibuf_contract_after_insert(entry_size);
+		}
+	}
+	/* do_merge is set only on the "may not fit" path above, which also
+	fills in the arrays and n_stored; sync is passed as FALSE */
+	if (do_merge) {
+#ifdef UNIV_IBUF_DEBUG
+		ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
+#endif
+		buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions,
+					  page_nos, n_stored);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. An optimistic insert (BTR_MODIFY_PREV) is tried
+first; if it returns DB_FAIL, a pessimistic one (BTR_MODIFY_TREE) follows.
+Nothing is buffered if ibuf_use is IBUF_USE_NONE or if the converted entry
+is at least half the free space of an empty page. The index must not be
+clustered.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	dict_index_t*	index,	/*!< in: index where to insert */
+	ulint		space,	/*!< in: space id where to insert */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint		page_no,/*!< in: page number where to insert */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	entry_size;
+	ulint	size_limit;
+	ulint	err;
+
+	ut_a(trx_sys_multiple_tablespace_format);
+	ut_ad(dtuple_check_typed(entry));
+	ut_ad(ut_is_2pow(zip_size));
+
+	ut_a(!dict_index_is_clust(index));
+
+	switch (UNIV_EXPECT(ibuf_use, IBUF_USE_INSERT)) {
+	case IBUF_USE_NONE:
+		return(FALSE);
+	case IBUF_USE_INSERT:
+		break;
+	case IBUF_USE_COUNT:
+	default:
+		ut_error; /* unknown value of ibuf_use */
+	}
+
+	entry_size = rec_get_converted_size(index, entry, 0);
+
+	/* Entries that take half of an empty page or more are not buffered */
+	size_limit = page_get_free_space_of_empty(
+		dict_table_is_comp(index->table)) / 2;
+
+	if (entry_size >= size_limit) {
+		return(FALSE);
+	}
+
+	err = ibuf_insert_low(BTR_MODIFY_PREV, entry, entry_size,
+			      index, space, zip_size, page_no, thr);
+
+	if (err == DB_FAIL) {
+		/* The optimistic attempt failed: retry pessimistically */
+		err = ibuf_insert_low(BTR_MODIFY_TREE, entry, entry_size,
+				      index, space, zip_size, page_no, thr);
+	}
+
+	if (err != DB_SUCCESS) {
+		ut_a(err == DB_STRONG_FAIL);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+During merge, inserts to an index page a secondary index entry extracted
+from the insert buffer.
+If page_cur_search() reports a match on every field of the entry, the
+matching record is passed to btr_cur_del_unmark_for_ibuf() instead of
+inserting a new one. If the insert does not fit, the page is reorganized
+and the insert is retried once; a second failure is only reported to
+stderr.
+Nothing is inserted, and the page and the entry are dumped to stderr, if
+the 'compact' flag of the page and the index disagree, if the page holds
+no user records, or if the number of fields in the first user record
+differs from that of the entry. */
+static
+void
+ibuf_insert_to_index_page(
+/*======================*/
+	dtuple_t*	entry,	/*!< in: buffered entry to insert */
+	buf_block_t*	block,	/*!< in/out: index page where the buffered entry
+				should be placed */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t	page_cur;
+	ulint		low_match;
+	page_t*		page		= buf_block_get_frame(block);
+	rec_t*		rec;
+	page_t*		bitmap_page;
+	ulint		old_bits;
+
+	ut_ad(ibuf_inside());
+	ut_ad(dtuple_check_typed(entry));
+
+	if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
+			  != (ibool)!!page_is_comp(page))) {
+		fputs("InnoDB: Trying to insert a record from"
+		      " the insert buffer to an index page\n"
+		      "InnoDB: but the 'compact' flag does not match!\n",
+		      stderr);
+		goto dump;
+	}
+
+	rec = page_rec_get_next(page_get_infimum_rec(page));
+
+	/* On a page without user records the record after the infimum is
+	the supremum, which must not be handed to rec_get_n_fields() as if
+	it were a user record */
+	if (UNIV_UNLIKELY(page_rec_is_supremum(rec))) {
+		fputs("InnoDB: Trying to insert a record from"
+		      " the insert buffer to an index page\n"
+		      "InnoDB: but the index page is empty!\n",
+		      stderr);
+		goto dump;
+	}
+
+	if (UNIV_UNLIKELY(rec_get_n_fields(rec, index)
+			  != dtuple_get_n_fields(entry))) {
+		fputs("InnoDB: Trying to insert a record from"
+		      " the insert buffer to an index page\n"
+		      "InnoDB: but the number of fields does not match!\n",
+		      stderr);
+dump:
+		buf_page_print(page, 0);
+
+		dtuple_print(stderr, entry);
+
+		fputs("InnoDB: The table where"
+		      " this index record belongs\n"
+		      "InnoDB: is now probably corrupt."
+		      " Please run CHECK TABLE on\n"
+		      "InnoDB: your tables.\n"
+		      "InnoDB: Submit a detailed bug report to"
+		      " http://bugs.mysql.com!\n", stderr);
+
+		return;
+	}
+
+	low_match = page_cur_search(block, index, entry,
+				    PAGE_CUR_LE, &page_cur);
+
+	if (low_match == dtuple_get_n_fields(entry)) {
+		/* A record matching the entry on all fields is already on
+		the page: clear its delete mark instead of inserting */
+		page_zip_des_t*	page_zip;
+
+		rec = page_cur_get_rec(&page_cur);
+		page_zip = buf_block_get_page_zip(block);
+
+		btr_cur_del_unmark_for_ibuf(rec, page_zip, mtr);
+	} else {
+		rec = page_cur_tuple_insert(&page_cur, entry, index, 0, mtr);
+
+		if (UNIV_LIKELY(rec != NULL)) {
+			return;
+		}
+
+		/* If the record did not fit, reorganize */
+
+		btr_page_reorganize(block, index, mtr);
+		page_cur_search(block, index, entry, PAGE_CUR_LE, &page_cur);
+
+		/* This time the record must fit */
+		if (UNIV_UNLIKELY
+		    (!page_cur_tuple_insert(&page_cur, entry, index,
+					    0, mtr))) {
+			ulint	space;
+			ulint	page_no;
+			ulint	zip_size;
+
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: Error: Insert buffer insert"
+				" fails; page free %lu,"
+				" dtuple size %lu\n",
+				(ulong) page_get_max_insert_size(
+					page, 1),
+				(ulong) rec_get_converted_size(
+					index, entry, 0));
+			fputs("InnoDB: Cannot insert index record ",
+			      stderr);
+			dtuple_print(stderr, entry);
+			fputs("\nInnoDB: The table where"
+			      " this index record belongs\n"
+			      "InnoDB: is now probably corrupt."
+			      " Please run CHECK TABLE on\n"
+			      "InnoDB: that table.\n", stderr);
+
+			/* Report the free bits that the ibuf bitmap holds
+			for this page, to help diagnose the failure */
+			space = page_get_space_id(page);
+			zip_size = buf_block_get_zip_size(block);
+			page_no = page_get_page_no(page);
+
+			bitmap_page = ibuf_bitmap_get_map_page(
+				space, page_no, zip_size, mtr);
+			old_bits = ibuf_bitmap_page_get_bits(
+				bitmap_page, page_no, zip_size,
+				IBUF_BITMAP_FREE, mtr);
+
+			fprintf(stderr,
+				"InnoDB: space %lu, page %lu,"
+				" zip_size %lu, bitmap bits %lu\n",
+				(ulong) space, (ulong) page_no,
+				(ulong) zip_size, (ulong) old_bits);
+
+			fputs("InnoDB: Submit a detailed bug report"
+			      " to http://bugs.mysql.com\n", stderr);
+		}
+	}
+}
+
+/*********************************************************************//**
+Deletes from ibuf the record on which pcur is positioned. If we have to
+resort to a pessimistic delete, this function commits mtr and closes
+the cursor. An optimistic delete is tried first; if it succeeds, mtr and
+pcur are left as they are and FALSE is returned. Otherwise the cursor
+position is stored, mtr is committed and restarted, and the delete is
+redone in BTR_MODIFY_TREE mode while holding ibuf_mutex.
+@return	TRUE if mtr was committed and pcur closed in this operation */
+static
+ibool
+ibuf_delete_rec(
+/*============*/
+	ulint		space,	/*!< in: space id */
+	ulint		page_no,/*!< in: index page number where the record
+				should belong */
+	btr_pcur_t*	pcur,	/*!< in: pcur positioned on the record to
+				delete, having latch mode BTR_MODIFY_LEAF */
+	const dtuple_t*	search_tuple,
+				/*!< in: search tuple for entries of page_no;
+				only printed in the error report below */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ibool		success;
+	page_t*		root;
+	ulint		err;
+
+	ut_ad(ibuf_inside());
+	ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+	ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no);
+	ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space);
+
+	success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr);
+
+	if (success) {
+#ifdef UNIV_IBUF_COUNT_DEBUG
+		fprintf(stderr,
+			"Decrementing ibuf count of space %lu page %lu\n"
+			"from %lu by 1\n", space, page_no,
+			ibuf_count_get(space, page_no));
+		ibuf_count_set(space, page_no,
+			       ibuf_count_get(space, page_no) - 1);
+#endif
+		return(FALSE);
+	}
+
+	ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
+	ut_ad(ibuf_rec_get_page_no(btr_pcur_get_rec(pcur)) == page_no);
+	ut_ad(ibuf_rec_get_space(btr_pcur_get_rec(pcur)) == space);
+
+	/* We have to resort to a pessimistic delete from ibuf */
+	btr_pcur_store_position(pcur, mtr);
+
+	btr_pcur_commit_specify_mtr(pcur, mtr);
+	/* ibuf_mutex is held from here until func_exit */
+	mutex_enter(&ibuf_mutex);
+
+	mtr_start(mtr);
+
+	success = btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr);
+
+	if (!success) {
+		if (fil_space_get_flags(space) == ULINT_UNDEFINED) {
+			/* The tablespace has been dropped.  It is possible
+			that another thread has deleted the insert buffer
+			entry.  Do not complain. */
+			goto commit_and_exit;
+		}
+
+		fprintf(stderr,
+			"InnoDB: ERROR: Submit the output to"
+			" http://bugs.mysql.com\n"
+			"InnoDB: ibuf cursor restoration fails!\n"
+			"InnoDB: ibuf record inserted to page %lu\n",
+			(ulong) page_no);
+		fflush(stderr);
+
+		rec_print_old(stderr, btr_pcur_get_rec(pcur));
+		rec_print_old(stderr, pcur->old_rec);
+		dtuple_print(stderr, search_tuple);
+
+		rec_print_old(stderr,
+			      page_rec_get_next(btr_pcur_get_rec(pcur)));
+		fflush(stderr);
+
+		btr_pcur_commit_specify_mtr(pcur, mtr);
+
+		fputs("InnoDB: Validating insert buffer tree:\n", stderr);
+		if (!btr_validate_index(ibuf->index, NULL)) {
+			ut_error;
+		}
+
+		fprintf(stderr, "InnoDB: ibuf tree ok\n");
+		fflush(stderr);
+		/* mtr was already committed above, so skip commit_and_exit */
+		goto func_exit;
+	}
+	/* The root page is fetched before the pessimistic delete and is
+	passed to ibuf_size_update() after it */
+	root = ibuf_tree_root_get(mtr);
+
+	btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur),
+				   RB_NONE, mtr);
+	ut_a(err == DB_SUCCESS);
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1);
+#endif
+	ibuf_size_update(root, mtr);
+
+commit_and_exit:
+	btr_pcur_commit_specify_mtr(pcur, mtr);
+
+func_exit:
+	btr_pcur_close(pcur);
+
+	mutex_exit(&ibuf_mutex);
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped.
+The function returns without doing anything if srv_force_recovery is at
+least SRV_FORCE_NO_IBUF_MERGE, if the page is one of the fixed-address or
+descriptor pages tested below, or if the ibuf bitmap shows no buffered
+entries for the page. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: if page has been read from
+				disk, pointer to the page x-latched,
+				else NULL */
+	ulint		space,	/*!< in: space id of the index page */
+	ulint		page_no,/*!< in: page number of the index page */
+	ulint		zip_size,/*!< in: compressed page size in bytes,
+				or 0 */
+	ibool		update_ibuf_bitmap)/*!< in: normally this is set
+				to TRUE, but if we have deleted or are
+				deleting the tablespace, then we
+				naturally do not want to update a
+				non-existent bitmap page */
+{
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	dtuple_t*	search_tuple;
+	ulint		n_inserts;
+#ifdef UNIV_IBUF_DEBUG
+	ulint		volume;
+#endif
+	page_zip_des_t*	page_zip		= NULL;
+	ibool		tablespace_being_deleted = FALSE;
+	ibool		corruption_noticed	= FALSE;
+	mtr_t		mtr;
+
+	ut_ad(!block || buf_block_get_space(block) == space);
+	ut_ad(!block || buf_block_get_page_no(block) == page_no);
+	ut_ad(!block || buf_block_get_zip_size(block) == zip_size);
+
+	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
+	    || trx_sys_hdr_page(space, page_no)) {
+		return;
+	}
+
+	/* We cannot refer to zip_size in the following, because
+	zip_size is passed as ULINT_UNDEFINED (it is unknown) when
+	buf_read_ibuf_merge_pages() is merging (discarding) changes
+	for a dropped tablespace.  When block != NULL or
+	update_ibuf_bitmap is specified, the zip_size must be known.
+	That is why we will repeat the check below, with zip_size in
+	place of 0.  Passing zip_size as 0 assumes that the
+	uncompressed page size always is a power-of-2 multiple of the
+	compressed page size. */
+
+	if (ibuf_fixed_addr_page(space, 0, page_no)
+	    || fsp_descr_page(0, page_no)) {
+		return;
+	}
+
+	if (UNIV_LIKELY(update_ibuf_bitmap)) {
+		ut_a(ut_is_2pow(zip_size));
+
+		if (ibuf_fixed_addr_page(space, zip_size, page_no)
+		    || fsp_descr_page(zip_size, page_no)) {
+			return;
+		}
+
+		/* If the following returns FALSE, we get the counter
+		incremented, and must decrement it when we leave this
+		function. When the counter is > 0, that prevents tablespace
+		from being dropped. */
+
+		tablespace_being_deleted = fil_inc_pending_ibuf_merges(space);
+
+		if (UNIV_UNLIKELY(tablespace_being_deleted)) {
+			/* Do not try to read the bitmap page from space;
+			just delete the ibuf records for the page */
+
+			block = NULL;
+			update_ibuf_bitmap = FALSE;
+		} else {
+			page_t*	bitmap_page;
+
+			mtr_start(&mtr);
+
+			bitmap_page = ibuf_bitmap_get_map_page(
+				space, page_no, zip_size, &mtr);
+
+			if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no,
+						       zip_size,
+						       IBUF_BITMAP_BUFFERED,
+						       &mtr)) {
+				/* No inserts buffered for this page */
+				mtr_commit(&mtr);
+				/* tablespace_being_deleted is always FALSE
+				in this branch, so the counter is decremented */
+				if (!tablespace_being_deleted) {
+					fil_decr_pending_ibuf_merges(space);
+				}
+
+				return;
+			}
+			mtr_commit(&mtr);
+		}
+	} else if (block
+		   && (ibuf_fixed_addr_page(space, zip_size, page_no)
+		      || fsp_descr_page(zip_size, page_no))) {
+
+		return;
+	}
+
+	ibuf_enter();
+
+	heap = mem_heap_create(512);
+
+	if (!trx_sys_multiple_tablespace_format) {
+		ut_a(trx_doublewrite_must_reset_space_ids);
+		search_tuple = ibuf_search_tuple_build(space, page_no, heap);
+	} else {
+		search_tuple = ibuf_new_search_tuple_build(space, page_no,
+							   heap);
+	}
+
+	if (block) {
+		/* Move the ownership of the x-latch on the page to this OS
+		thread, so that we can acquire a second x-latch on it. This
+		is needed for the insert operations to the index page to pass
+		the debug checks. */
+
+		rw_lock_x_lock_move_ownership(&(block->lock));
+		page_zip = buf_block_get_page_zip(block);
+
+		if (UNIV_UNLIKELY(fil_page_get_type(block->frame)
+				  != FIL_PAGE_INDEX)
+		    || UNIV_UNLIKELY(!page_is_leaf(block->frame))) {
+
+			page_t*	bitmap_page;
+			/* From now on the buffered records for this page
+			are only printed and deleted, never merged */
+			corruption_noticed = TRUE;
+
+			ut_print_timestamp(stderr);
+
+			mtr_start(&mtr);
+
+			fputs("  InnoDB: Dump of the ibuf bitmap page:\n",
+			      stderr);
+
+			bitmap_page = ibuf_bitmap_get_map_page(space, page_no,
+							       zip_size, &mtr);
+			buf_page_print(bitmap_page, 0);
+
+			mtr_commit(&mtr);
+
+			fputs("\nInnoDB: Dump of the page:\n", stderr);
+
+			buf_page_print(block->frame, 0);
+
+			fprintf(stderr,
+				"InnoDB: Error: corruption in the tablespace."
+				" Bitmap shows insert\n"
+				"InnoDB: buffer records to page n:o %lu"
+				" though the page\n"
+				"InnoDB: type is %lu, which is"
+				" not an index leaf page!\n"
+				"InnoDB: We try to resolve the problem"
+				" by skipping the insert buffer\n"
+				"InnoDB: merge for this page."
+				" Please run CHECK TABLE on your tables\n"
+				"InnoDB: to determine if they are corrupt"
+				" after this.\n\n"
+				"InnoDB: Please submit a detailed bug report"
+				" to http://bugs.mysql.com\n\n",
+				(ulong) page_no,
+				(ulong)
+				fil_page_get_type(block->frame));
+		}
+	}
+
+	n_inserts = 0;
+#ifdef UNIV_IBUF_DEBUG
+	volume = 0;
+#endif
+loop:
+	/* Each pass through this label starts a fresh mtr and repositions
+	the cursor with the search tuple */
+	mtr_start(&mtr);
+
+	if (block) {
+		ibool success;
+
+		success = buf_page_get_known_nowait(
+			RW_X_LATCH, block,
+			BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);
+
+		ut_a(success);
+
+		buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+	}
+
+	/* Position pcur in the insert buffer at the first entry for this
+	index page */
+	btr_pcur_open_on_user_rec(
+		ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+		&pcur, &mtr);
+
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+		goto reset_bit;
+	}
+
+	for (;;) {
+		rec_t*	rec;
+
+		ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		/* Check if the entry is for this index page */
+		if (ibuf_rec_get_page_no(rec) != page_no
+		    || ibuf_rec_get_space(rec) != space) {
+			/* No more entries for this page */
+			if (block) {
+				page_header_reset_last_insert(
+					block->frame, page_zip, &mtr);
+			}
+
+			goto reset_bit;
+		}
+
+		if (UNIV_UNLIKELY(corruption_noticed)) {
+			fputs("InnoDB: Discarding record\n ", stderr);
+			rec_print_old(stderr, rec);
+			fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
+		} else if (block) {
+			/* Now we have at pcur a record which should be
+			inserted to the index page; NOTE that the call below
+			copies pointers to fields in rec, and we must
+			keep the latch to the rec page until the
+			insertion is finished! */
+			dtuple_t*	entry;
+			trx_id_t	max_trx_id;
+			dict_index_t*	dummy_index;
+
+			max_trx_id = page_get_max_trx_id(page_align(rec));
+			page_update_max_trx_id(block, page_zip, max_trx_id,
+					       &mtr);
+
+			entry = ibuf_build_entry_from_ibuf_rec(
+				rec, heap, &dummy_index);
+#ifdef UNIV_IBUF_DEBUG
+			volume += rec_get_converted_size(dummy_index, entry, 0)
+				+ page_dir_calc_reserved_space(1);
+			ut_a(volume <= 4 * UNIV_PAGE_SIZE
+			     / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+#endif
+			ibuf_insert_to_index_page(entry, block,
+						  dummy_index, &mtr);
+			ibuf_dummy_index_free(dummy_index);
+		}
+		/* Counted for every record, merged or merely discarded */
+		n_inserts++;
+
+		/* Delete the record from ibuf */
+		if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+				    &mtr)) {
+			/* Deletion was pessimistic and mtr was committed:
+			we start from the beginning again */
+
+			goto loop;
+		} else if (btr_pcur_is_after_last_on_page(&pcur)) {
+			/* End of the ibuf leaf page reached: commit and
+			start again from the beginning */
+			mtr_commit(&mtr);
+			btr_pcur_close(&pcur);
+
+			goto loop;
+		}
+	}
+
+reset_bit:
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	if (ibuf_count_get(space, page_no) > 0) {
+		/* btr_print_tree(ibuf_data->index->tree, 100);
+		ibuf_print(); */
+	}
+#endif
+	if (UNIV_LIKELY(update_ibuf_bitmap)) {
+		page_t*	bitmap_page;
+
+		bitmap_page = ibuf_bitmap_get_map_page(
+			space, page_no, zip_size, &mtr);
+		/* Clear the "buffered" bit for this page */
+		ibuf_bitmap_page_set_bits(
+			bitmap_page, page_no, zip_size,
+			IBUF_BITMAP_BUFFERED, FALSE, &mtr);
+
+		if (block) {
+			/* Bring the free bits in line with the page, if
+			they changed */
+			ulint old_bits = ibuf_bitmap_page_get_bits(
+				bitmap_page, page_no, zip_size,
+				IBUF_BITMAP_FREE, &mtr);
+
+			ulint new_bits = ibuf_index_page_calc_free(
+				zip_size, block);
+
+			if (old_bits != new_bits) {
+				ibuf_bitmap_page_set_bits(
+					bitmap_page, page_no, zip_size,
+					IBUF_BITMAP_FREE, new_bits, &mtr);
+			}
+		}
+	}
+
+	mtr_commit(&mtr);
+	btr_pcur_close(&pcur);
+	mem_heap_free(heap);
+
+	/* Protect our statistics keeping from race conditions */
+	mutex_enter(&ibuf_mutex);
+
+	ibuf->n_merges++;
+	ibuf->n_merged_recs += n_inserts;
+
+	mutex_exit(&ibuf_mutex);
+
+	if (update_ibuf_bitmap && !tablespace_being_deleted) {
+		/* Undo the increment made by fil_inc_pending_ibuf_merges() */
+		fil_decr_pending_ibuf_merges(space);
+	}
+
+	ibuf_exit();
+
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ut_a(ibuf_count_get(space, page_no) == 0);
+#endif
+}
+
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id, and adds the
+deleted records to the ibuf merge statistics. This is used in DISCARD
+TABLESPACE and IMPORT TABLESPACE. NOTE: this does not update the page free
+bitmaps in the space. The space will become CORRUPT when you call this! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+	ulint	space)	/*!< in: space id */
+{
+	mem_heap_t*	heap;		/* memory for search_tuple */
+	btr_pcur_t	pcur;
+	dtuple_t*	search_tuple;
+	rec_t*		ibuf_rec;
+	ulint		page_no;
+	ibool		closed;		/* TRUE if ibuf_delete_rec() committed mtr */
+	ulint		n_inserts;	/* records passed to ibuf_delete_rec() */
+	mtr_t		mtr;
+
+	heap = mem_heap_create(512);
+
+	/* Build the search tuple with page number 0 so that the cursor is
+	positioned at the first entry for this space id */
+
+	search_tuple = ibuf_new_search_tuple_build(space, 0, heap);
+
+	n_inserts = 0;
+loop:
+	ibuf_enter();
+	/* Every restart from 'loop' begins a fresh mini-transaction */
+	mtr_start(&mtr);
+
+	/* Position pcur in the insert buffer on the first user record that
+	is greater than or equal to the search tuple (PAGE_CUR_GE) */
+	btr_pcur_open_on_user_rec(
+		ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
+		&pcur, &mtr);
+	/* No user record at or after the search tuple: nothing to delete */
+	if (!btr_pcur_is_on_user_rec(&pcur)) {
+		ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr));
+
+		goto leave_loop;
+	}
+
+	for (;;) {
+		ut_ad(btr_pcur_is_on_user_rec(&pcur));
+
+		ibuf_rec = btr_pcur_get_rec(&pcur);
+
+		/* Stop at the first record that is not for this space */
+		if (ibuf_rec_get_space(ibuf_rec) != space) {
+
+			goto leave_loop;
+		}
+
+		page_no = ibuf_rec_get_page_no(ibuf_rec);
+
+		n_inserts++;
+
+		/* Delete the record from ibuf without applying it anywhere */
+		closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple,
+					 &mtr);
+		if (closed) {
+			/* Deletion was pessimistic and mtr was committed:
+			leave the ibuf and start from the beginning again */
+
+			ibuf_exit();
+
+			goto loop;
+		}
+		/* End of the leaf page: commit and reposition from 'loop' */
+		if (btr_pcur_is_after_last_on_page(&pcur)) {
+			mtr_commit(&mtr);
+			btr_pcur_close(&pcur);
+
+			ibuf_exit();
+
+			goto loop;
+		}
+	}
+
+leave_loop:
+	mtr_commit(&mtr);
+	btr_pcur_close(&pcur);
+
+	/* Update the merge statistics under ibuf_mutex */
+	mutex_enter(&ibuf_mutex);
+
+	ibuf->n_merges++;
+	ibuf->n_merged_recs += n_inserts;
+
+	mutex_exit(&ibuf_mutex);
+
+	ibuf_exit();
+
+	mem_heap_free(heap);
+}
+
+/******************************************************************//**
+Checks whether the insert buffer tree is empty by looking at the number of
+records on its root page. Prints a warning if the tree is empty while
+ibuf->empty is still FALSE.
+@return	TRUE if empty */
+UNIV_INTERN
+ibool
+ibuf_is_empty(void)
+/*===============*/
+{
+	const page_t*	root_page;
+	ibool		tree_is_empty;
+	mtr_t		mtr;
+
+	ibuf_enter();
+
+	mutex_enter(&ibuf_mutex);
+
+	mtr_start(&mtr);
+
+	root_page = ibuf_tree_root_get(&mtr);
+
+	tree_is_empty = (page_get_n_recs(root_page) == 0) ? TRUE : FALSE;
+
+	if (!tree_is_empty) {
+		/* Records exist, so the cached flag must not say empty */
+		ut_a(ibuf->empty == FALSE);
+
+	} else if (ibuf->empty == FALSE) {
+		/* The cached flag has not caught up yet: warn only */
+		fputs("InnoDB: Warning: insert buffer tree is empty"
+		      " but the data struct does not\n"
+		      "InnoDB: know it. This condition is legal"
+		      " if the master thread has not yet\n"
+		      "InnoDB: run to completion.\n", stderr);
+	}
+
+	/* Release in the reverse order of the acquisitions above */
+	mtr_commit(&mtr);
+
+	mutex_exit(&ibuf_mutex);
+
+	ibuf_exit();
+
+	return(tree_is_empty);
+}
+
+/******************************************************************//**
+Prints the ibuf size and merge statistics to file while holding ibuf_mutex.
+With UNIV_IBUF_COUNT_DEBUG, nonzero per-page counts go to the same file. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+	FILE*	file)	/*!< in: file where to print */
+{
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	ulint		i;
+	ulint		j;
+#endif
+
+	mutex_enter(&ibuf_mutex);
+	fprintf(file,
+		"Ibuf: size %lu, free list len %lu, seg size %lu,\n"
+		"%lu inserts, %lu merged recs, %lu merges\n",
+		(ulong) ibuf->size,
+		(ulong) ibuf->free_list_len,
+		(ulong) ibuf->seg_size,
+		(ulong) ibuf->n_inserts,
+		(ulong) ibuf->n_merged_recs,
+		(ulong) ibuf->n_merges);
+#ifdef UNIV_IBUF_COUNT_DEBUG
+	for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
+		for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
+			ulint	count = ibuf_count_get(i, j);
+			/* Print to file, like the summary above, not stderr */
+			if (count > 0) {
+				fprintf(file,
+					"Ibuf count for space/page %lu/%lu"
+					" is %lu\n",
+					(ulong) i, (ulong) j, (ulong) count);
+			}
+		}
+	}
+#endif /* UNIV_IBUF_COUNT_DEBUG */
+
+	mutex_exit(&ibuf_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
new file mode 100644
index 00000000000..5e6a76c7d21
--- /dev/null
+++ b/storage/xtradb/include/btr0btr.h
@@ -0,0 +1,517 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.h
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0btr_h
+#define btr0btr_h
+
+#include "univ.i"
+
+#include "dict0dict.h"
+#include "data0data.h"
+#include "page0cur.h"
+#include "mtr0mtr.h"
+#include "btr0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/** Maximum size of a record that can be stored on a page without using the
+special big record storage structure: 200 bytes less than half a page */
+#define	BTR_PAGE_MAX_REC_SIZE	(UNIV_PAGE_SIZE / 2 - 200)
+
+/** @brief Maximum depth of a B-tree in InnoDB.
+
+Note that this isn't a maximum as such; none of the tree operations
+avoid producing trees bigger than this. It is instead a "max depth
+that other code must work with", useful for e.g. fixed-size arrays
+that must store some information about each level in a tree. In other
+words: if a B-tree with bigger depth than this is encountered, it is
+not acceptable for it to lead to mysterious memory corruption, but it
+is acceptable for the program to die with a clear assert failure. */
+#define BTR_MAX_LEVELS		100
+
+/** Latching modes for btr_cur_search_to_nth_level(); BTR_* flags may be ORed. */
+enum btr_latch_mode {
+	/** Search a record on a leaf page and S-latch it (RW_S_LATCH). */
+	BTR_SEARCH_LEAF = RW_S_LATCH,
+	/** (Prepare to) modify a record on a leaf page and X-latch it. */
+	BTR_MODIFY_LEAF	= RW_X_LATCH,
+	/** Obtain no latches (RW_NO_LATCH). */
+	BTR_NO_LATCHES = RW_NO_LATCH,
+	/** Start modifying the entire B-tree. */
+	BTR_MODIFY_TREE = 33,
+	/** Continue modifying the entire B-tree. */
+	BTR_CONT_MODIFY_TREE = 34,
+	/** Search the previous record. */
+	BTR_SEARCH_PREV = 35,
+	/** Modify the previous record. */
+	BTR_MODIFY_PREV = 36
+};
+
+/** If this flag is ORed into btr_latch_mode, the search tuple will be
+inserted into the index at the searched position */
+#define BTR_INSERT		512
+
+/** This flag ORed into btr_latch_mode says that the search is done for
+query optimization */
+#define BTR_ESTIMATE		1024
+
+/** This flag ORed into btr_latch_mode says that we can ignore a possible
+UNIQUE definition on secondary indexes when we decide if we can use
+the insert buffer to speed up inserts */
+#define BTR_IGNORE_SEC_UNIQUE	2048
+
+/**************************************************************//**
+Gets the root node of a tree and x-latches it.
+@return	root page, x-latched */
+UNIV_INTERN
+page_t*
+btr_root_get(
+/*=========*/
+	dict_index_t*	index,	/*!< in: index tree */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	ulint	mode,		/*!< in: latch mode */
+	mtr_t*	mtr);		/*!< in: mtr */
+/**************************************************************//**
+Gets a buffer page and declares its latching order level. */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	ulint	mode,		/*!< in: latch mode */
+	mtr_t*	mtr);		/*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/**************************************************************//**
+Gets the index id field of a page.
+@return	index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+	const page_t*	page);	/*!< in: index page */
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Gets the node level field in an index page.
+@return	level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+	const page_t*	page);	/*!< in: index page */
+/********************************************************//**
+Gets the node level field in an index page.
+@return	level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the next index page number.
+@return	next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Gets the previous index page number.
+@return	prev page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/*************************************************************//**
+Gets pointer to the previous user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return	previous user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_prev_user_rec(
+/*==================*/
+	rec_t*	rec,	/*!< in: record on leaf level */
+	mtr_t*	mtr);	/*!< in: mtr holding a latch on the page, and if
+			needed, also to the previous page */
+/*************************************************************//**
+Gets pointer to the next user record in the tree. It is assumed
+that the caller has appropriate latches on the page and its neighbor.
+@return	next user record, NULL if there is none */
+UNIV_INTERN
+rec_t*
+btr_get_next_user_rec(
+/*==================*/
+	rec_t*	rec,	/*!< in: record on leaf level */
+	mtr_t*	mtr);	/*!< in: mtr holding a latch on the page, and if
+			needed, also to the next page */
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	mtr_t*		mtr);		/*!< in: mtr */
+/**************************************************************//**
+Gets the child node file address in a node pointer.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return	child node address */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/************************************************************//**
+Creates the root node for a new index tree.
+@return	page number of the created root, FIL_NULL if did not succeed */
+UNIV_INTERN
+ulint
+btr_create(
+/*=======*/
+	ulint		type,	/*!< in: type of the index */
+	ulint		space,	/*!< in: space where created */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	dulint		index_id,/*!< in: index id */
+	dict_index_t*	index,	/*!< in: index */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/************************************************************//**
+Frees a B-tree except the root page, which MUST be freed after this
+by calling btr_free_root. */
+UNIV_INTERN
+void
+btr_free_but_not_root(
+/*==================*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no);	/*!< in: root page number */
+/************************************************************//**
+Frees the B-tree root page. The rest of the tree MUST already be freed. */
+UNIV_INTERN
+void
+btr_free_root(
+/*==========*/
+	ulint	space,		/*!< in: space where created */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	root_page_no,	/*!< in: root page number */
+	mtr_t*	mtr);		/*!< in: a mini-transaction which has already
+				been started */
+/*************************************************************//**
+Makes tree one level higher by splitting the root, and inserts
+the tuple. It is assumed that mtr contains an x-latch on the tree.
+NOTE that the operation of this function must always succeed,
+we cannot reverse it: therefore enough free disk space must be
+guaranteed to be available before this function is called.
+@return	inserted record */
+UNIV_INTERN
+rec_t*
+btr_root_raise_and_insert(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert: must be
+				on the root page; when the function returns,
+				the cursor is positioned on the predecessor
+				of the inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*************************************************************//**
+Reorganizes an index page.
+IMPORTANT: if btr_page_reorganize() is invoked on a compressed leaf
+page of a non-clustered index, the caller must update the insert
+buffer free bits in the same mini-transaction in such a way that the
+modification will be redo-logged.
+@return	TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+btr_page_reorganize(
+/*================*/
+	buf_block_t*	block,	/*!< in: page to be reorganized */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to left.
+@return	TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_left(
+/*===========================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
+	rec_t**		split_rec);/*!< out: if split recommended,
+				the first record on upper half page,
+				or NULL if tuple should be first */
+/*************************************************************//**
+Decides if the page should be split at the convergence point of
+inserts converging to right.
+@return	TRUE if split recommended */
+UNIV_INTERN
+ibool
+btr_page_get_split_rec_to_right(
+/*============================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert */
+	rec_t**		split_rec);/*!< out: if split recommended,
+				the first record on upper half page,
+				or NULL if tuple should be first */
+/*************************************************************//**
+Splits an index page into halves and inserts the tuple. It is assumed
+that mtr holds an x-latch to the index tree. NOTE: the tree x-latch is
+released within this function! NOTE that the operation of this
+function must always succeed, we cannot reverse it: therefore enough
+free disk space (2 pages) must be guaranteed to be available before
+this function is called.
+
+@return inserted record */
+UNIV_INTERN
+rec_t*
+btr_page_split_and_insert(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor at which to insert; when the
+				function returns, the cursor is positioned
+				on the predecessor of the inserted record */
+	const dtuple_t*	tuple,	/*!< in: tuple to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*******************************************************//**
+Inserts a data tuple to a tree on a non-leaf level. It is assumed
+that mtr holds an x-latch on the tree. */
+UNIV_INTERN
+void
+btr_insert_on_non_leaf_level_func(
+/*==============================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: level, must be > 0 */
+	dtuple_t*	tuple,	/*!< in: the record to be inserted */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+# define btr_insert_on_non_leaf_level(i,l,t,m)				\
+	btr_insert_on_non_leaf_level_func(i,l,t,__FILE__,__LINE__,m)
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Sets a record as the predefined minimum record. */
+UNIV_INTERN
+void
+btr_set_min_rec_mark(
+/*=================*/
+	rec_t*	rec,	/*!< in/out: record */
+	mtr_t*	mtr);	/*!< in: mtr */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Deletes on the upper level the node pointer to a page. */
+UNIV_INTERN
+void
+btr_node_ptr_delete(
+/*================*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: page whose node pointer is deleted */
+	mtr_t*		mtr);	/*!< in: mtr */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Checks that the node pointer to a page is appropriate.
+@return	TRUE */
+UNIV_INTERN
+ibool
+btr_check_node_ptr(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: index page */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Tries to merge the page first to the left immediate brother if such a
+brother exists, and the node pointers to the current page and to the
+brother reside on the same page. If the left brother does not satisfy these
+conditions, looks at the right brother. If the page is the only one on that
+level, lifts the records of the page to the father page, thus reducing the
+tree height. It is assumed that mtr holds an x-latch on the tree and on the
+page. If cursor is on the leaf level, mtr must also hold x-latches to
+the brothers, if they exist.
+@return	TRUE on success */
+UNIV_INTERN
+ibool
+btr_compress(
+/*=========*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to merge or lift;
+				the page must not be empty: in record delete
+				use btr_discard_page if the page would become
+				empty */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*************************************************************//**
+Discards a page from a B-tree. This is used to remove the last record from
+a B-tree page: the whole page must be removed at the same time. This cannot
+be used for the root page, which is allowed to be empty. */
+UNIV_INTERN
+void
+btr_discard_page(
+/*=============*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
+				the root page */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/****************************************************************//**
+Parses the redo log record for setting an index record as the predefined
+minimum record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_set_min_rec_mark(
+/*=======================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	ulint	comp,	/*!< in: nonzero=compact page format */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr);	/*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of reorganizing a page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_parse_page_reorganize(
+/*======================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	buf_block_t*	block,	/*!< in: page to be reorganized, or NULL */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Gets the number of pages in a B-tree.
+@return	number of pages */
+UNIV_INTERN
+ulint
+btr_get_size(
+/*=========*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		flag);	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
+/**************************************************************//**
+Allocates a new file page to be used in an index tree. NOTE: we assume
+that the caller has made the reservation for free extents!
+@return	new allocated block, x-latched; NULL if out of space */
+UNIV_INTERN
+buf_block_t*
+btr_page_alloc(
+/*===========*/
+	dict_index_t*	index,		/*!< in: index tree */
+	ulint		hint_page_no,	/*!< in: hint of a good page */
+	byte		file_direction,	/*!< in: direction where a possible
+					page split is made */
+	ulint		level,		/*!< in: level where the page is placed
+					in the tree */
+	mtr_t*		mtr);		/*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. NOTE: cannot free field external
+storage pages because the page must contain info on its level. */
+UNIV_INTERN
+void
+btr_page_free(
+/*==========*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Frees a file page used in an index tree. Can also be used for BLOB
+external storage pages, because the page level 0 can be given as an
+argument. */
+UNIV_INTERN
+void
+btr_page_free_low(
+/*==============*/
+	dict_index_t*	index,	/*!< in: index tree */
+	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
+	ulint		level,	/*!< in: page level */
+	mtr_t*		mtr);	/*!< in: mtr */
+#ifdef UNIV_BTR_PRINT
+/*************************************************************//**
+Prints size info of a B-tree. */
+UNIV_INTERN
+void
+btr_print_size(
+/*===========*/
+	dict_index_t*	index);	/*!< in: index tree */
+/**************************************************************//**
+Prints directories and other info of all nodes in the index. */
+UNIV_INTERN
+void
+btr_print_index(
+/*============*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		width);	/*!< in: print this many entries from start
+				and end */
+#endif /* UNIV_BTR_PRINT */
+/************************************************************//**
+Checks the size and number of fields in a record based on the definition of
+the index.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+btr_index_rec_validate(
+/*===================*/
+	const rec_t*		rec,		/*!< in: index record */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool			dump_on_error);	/*!< in: TRUE if the function
+						should print hex dump of record
+						and page on error */
+/**************************************************************//**
+Checks the consistency of an index tree.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+btr_validate_index(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx);	/*!< in: transaction or NULL */
+
+#define BTR_N_LEAF_PAGES	1
+#define BTR_TOTAL_SIZE		2
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0btr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
new file mode 100644
index 00000000000..c9c38f3c3b3
--- /dev/null
+++ b/storage/xtradb/include/btr0btr.ic
@@ -0,0 +1,316 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0btr.ic
+The B-tree
+
+Created 6/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0zip.h"
+#include "srv0srv.h"
+#define BTR_MAX_NODE_LEVEL	50	/*!< Maximum B-tree page level
+					(not really a hard limit).
+					Used in debug assertions
+					in btr_page_set_level and
+					btr_page_get_level_low */
+
+/**************************************************************//**
+Gets a buffer page and, if it was latched, declares its latching order level.
+@return	block; ut_a() requires non-NULL unless srv_pass_corrupt_table */
+UNIV_INLINE
+buf_block_t*
+btr_block_get(
+/*==========*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes,
+				or 0 if the page is uncompressed */
+	ulint	page_no,	/*!< in: page number */
+	ulint	mode,		/*!< in: latch mode */
+	mtr_t*	mtr)		/*!< in: mini-transaction */
+{
+	buf_block_t*	block;
+
+	block = buf_page_get(space, zip_size, page_no, mode, mtr);
+	ut_a(srv_pass_corrupt_table || block);
+
+	if (!block || mode == RW_NO_LATCH) {
+		/* No block, or no latch taken: nothing to declare */
+		return(block);
+	}
+	buf_block_dbg_add_level(block, SYNC_TREE_NODE);
+	return(block);
+}
+
+/**************************************************************//**
+Gets the frame of a buffer page fetched and latched via btr_block_get(). */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes, or 0 */
+	ulint	page_no,	/*!< in: page number */
+	ulint	mode,		/*!< in: latch mode */
+	mtr_t*	mtr)		/*!< in: mini-transaction */
+{
+	buf_block_t*	block;
+	block = btr_block_get(space, zip_size, page_no, mode, mtr);
+	return(buf_block_get_frame(block));
+}
+
+/**************************************************************//**
+Writes the 8-byte index id field into the header of a page. */
+UNIV_INLINE
+void
+btr_page_set_index_id(
+/*==================*/
+	page_t*		page,	/*!< in: page to be created */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	dulint		id,	/*!< in: index id */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	page_t*	field = page + (PAGE_HEADER + PAGE_INDEX_ID);
+
+	if (!UNIV_LIKELY_NULL(page_zip)) {
+		/* No compressed page to keep in sync */
+		mlog_write_dulint(field, id, mtr);
+		return;
+	}
+	mach_write_to_8(field, id);
+	page_zip_write_header(page_zip, field, 8, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Reads the 8-byte index id field from the header of an index page.
+@return	index id */
+UNIV_INLINE
+dulint
+btr_page_get_index_id(
+/*==================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	return(mach_read_from_8(page + (PAGE_HEADER + PAGE_INDEX_ID)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Reads the node level field from the header of an index page.
+@return	level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level_low(
+/*===================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	level;
+
+	ut_ad(page);
+
+	/* PAGE_LEVEL is read as a 2-byte field of the page header */
+	level = mach_read_from_2(page + (PAGE_HEADER + PAGE_LEVEL));
+	ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+	return(level);
+}
+
+/********************************************************//**
+Gets the node level of an index page via btr_page_get_level_low().
+@return	level, leaf level == 0 */
+UNIV_INLINE
+ulint
+btr_page_get_level(
+/*===============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*	mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+	ulint	level;
+	ut_ad(page && mtr);
+	level = btr_page_get_level_low(page);
+	return(level);
+}
+
+/********************************************************//**
+Writes the 2-byte node level field into the header of an index page. */
+UNIV_INLINE
+void
+btr_page_set_level(
+/*===============*/
+	page_t*		page,	/*!< in: index page */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	ulint		level,	/*!< in: level, leaf level == 0 */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	page_t*	field;
+
+	ut_ad(page && mtr);
+	ut_ad(level <= BTR_MAX_NODE_LEVEL);
+
+	field = page + (PAGE_HEADER + PAGE_LEVEL);
+	if (!UNIV_LIKELY_NULL(page_zip)) {
+		mlog_write_ulint(field, level, MLOG_2BYTES, mtr);
+		return;
+	}
+	mach_write_to_2(field, level);
+	page_zip_write_header(page_zip, field, 2, mtr);
+}
+
+/********************************************************//**
+Reads the next-page number (FIL_PAGE_NEXT) of an index page.
+@return	next page number */
+UNIV_INLINE
+ulint
+btr_page_get_next(
+/*==============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*	mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+	ulint	next_page_no;
+	ut_ad(page && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX));
+	next_page_no = mach_read_from_4(page + FIL_PAGE_NEXT);
+	return(next_page_no);
+}
+
+/********************************************************//**
+Stores the page number of the next index page in FIL_PAGE_NEXT. */
+UNIV_INLINE
+void
+btr_page_set_next(
+/*==============*/
+	page_t*		page,	/*!< in: index page */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	ulint		next,	/*!< in: next page number */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(page && mtr);
+
+	if (!UNIV_LIKELY_NULL(page_zip)) {
+		mlog_write_ulint(page + FIL_PAGE_NEXT, next, MLOG_4BYTES, mtr);
+		return;
+	}
+	mach_write_to_4(page + FIL_PAGE_NEXT, next);
+	page_zip_write_header(page_zip, page + FIL_PAGE_NEXT, 4, mtr);
+}
+
+/********************************************************//**
+Reads the previous-page number (FIL_PAGE_PREV) of an index page.
+@return	previous page number */
+UNIV_INLINE
+ulint
+btr_page_get_prev(
+/*==============*/
+	const page_t*	page,	/*!< in: index page */
+	mtr_t*	mtr __attribute__((unused))) /*!< in: mini-transaction handle */
+{
+	ut_ad(page && mtr);
+	/* Unlike btr_page_get_next(), no latch ownership is asserted here */
+	return(mach_read_from_4(page + FIL_PAGE_PREV));
+}
+
+/********************************************************//**
+Stores the page number of the previous index page in FIL_PAGE_PREV. */
+UNIV_INLINE
+void
+btr_page_set_prev(
+/*==============*/
+	page_t*		page,	/*!< in: index page */
+	page_zip_des_t*	page_zip,/*!< in: compressed page whose uncompressed
+				part will be updated, or NULL */
+	ulint		prev,	/*!< in: previous page number */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(page && mtr);
+
+	if (!UNIV_LIKELY_NULL(page_zip)) {
+		mlog_write_ulint(page + FIL_PAGE_PREV, prev, MLOG_4BYTES, mtr);
+		return;
+	}
+	mach_write_to_4(page + FIL_PAGE_PREV, prev);
+	page_zip_write_header(page_zip, page + FIL_PAGE_PREV, 4, mtr);
+}
+
+/**************************************************************//**
+Gets the child page number stored in a node pointer record.
+NOTE: the offsets array must contain all offsets for the record since
+we read the last field according to offsets and assume that it contains
+the child page number. In other words offsets must have been retrieved
+with rec_get_offsets(n_fields=ULINT_UNDEFINED).
+@return	child page number */
+UNIV_INLINE
+ulint
+btr_node_ptr_get_child_page_no(
+/*===========================*/
+	const rec_t*	rec,	/*!< in: node pointer record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	const byte*	field;
+	ulint		last;
+	ulint		len;
+	ulint		page_no;
+
+	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));
+
+	/* The child page number is stored in the last field */
+	last = rec_offs_n_fields(offsets) - 1;
+	field = rec_get_nth_field(rec, offsets, last, &len);
+	ut_ad(len == 4);
+	page_no = mach_read_from_4(field);
+	if (UNIV_LIKELY(page_no != 0)) {
+		return(page_no);
+	}
+
+	/* A child page number of 0 is nonsensical: report it */
+	fprintf(stderr,
+		"InnoDB: a nonsensical page number 0"
+		" in a node ptr record at offset %lu\n",
+		(ulong) page_offset(rec));
+	buf_page_print(page_align(rec), 0);
+	return(page_no);
+}
+
+/**************************************************************//**
+Releases the latch on a leaf page and buffer-unfixes it. */
+UNIV_INLINE
+void
+btr_leaf_page_release(
+/*==================*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	mtr_t*		mtr)		/*!< in: mini-transaction */
+{
+	ulint	fix_type;
+	ut_ad(latch_mode == BTR_SEARCH_LEAF || latch_mode == BTR_MODIFY_LEAF);
+	ut_ad(!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY));
+
+	fix_type = (latch_mode == BTR_SEARCH_LEAF)
+		? MTR_MEMO_PAGE_S_FIX : MTR_MEMO_PAGE_X_FIX;
+	mtr_memo_release(mtr, block, fix_type);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0cur.h b/storage/xtradb/include/btr0cur.h
new file mode 100644
index 00000000000..e151fdcb563
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.h
@@ -0,0 +1,764 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.h
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0cur_h
+#define btr0cur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "page0cur.h"
+#include "btr0types.h"
+
+/* Mode flags for btr_cur operations; these can be ORed */
+#define BTR_NO_UNDO_LOG_FLAG	1	/* do no undo logging */
+#define BTR_NO_LOCKING_FLAG	2	/* do no record lock checking */
+#define BTR_KEEP_SYS_FLAG	4	/* sys fields will be found from the
+					update vector or inserted entry */
+
+#ifndef UNIV_HOTBACKUP
+#include "que0types.h"
+#include "row0types.h"
+#include "ha0ha.h"
+
+#define BTR_CUR_ADAPT
+#define BTR_CUR_HASH_ADAPT
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the page cursor component of a tree cursor.
+@return	pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+	const btr_cur_t*	cursor);/*!< in: tree cursor */
+#else /* UNIV_DEBUG */
+# define btr_cur_get_page_cur(cursor) (&(cursor)->page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return	pointer to buffer block */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the record pointer of a tree cursor.
+@return	pointer to record */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the compressed page on which the tree cursor is positioned.
+@return	pointer to compressed page, or NULL if the page is not compressed */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Invalidates a tree cursor by setting record pointer to NULL. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the page of a tree cursor.
+@return	pointer to page */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+	btr_cur_t*	cursor);/*!< in: tree cursor */
+/*********************************************************//**
+Returns the index of a cursor.
+@return	index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+	btr_cur_t*	cursor);/*!< in: B-tree cursor */
+/*********************************************************//**
+Positions a tree cursor at a given record. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index */
+	rec_t*		rec,	/*!< in: record in tree */
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	btr_cur_t*	cursor);/*!< in: cursor */
+/********************************************************************//**
+Searches an index tree and positions a tree cursor on a given level.
+NOTE: n_fields_cmp in tuple must be set so that it cannot be compared
+to node pointer page number fields on the upper levels of the tree!
+Note that if mode is PAGE_CUR_LE, which is used in inserts, then
+cursor->up_match and cursor->low_match both will have sensible values.
+If mode is PAGE_CUR_GE, then up_match will have a sensible value. */
+UNIV_INTERN
+void
+btr_cur_search_to_nth_level(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	ulint		level,	/*!< in: the tree level of search */
+	const dtuple_t*	tuple,	/*!< in: data tuple; NOTE: n_fields_cmp in
+				tuple must be set so that it cannot get
+				compared to the node ptr page number field! */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be PAGE_CUR_LE,
+				not PAGE_CUR_GE, as the latter may end up on
+				the previous page of the record! Inserts
+				should always be made using PAGE_CUR_LE to
+				search the position! */
+	ulint		latch_mode, /*!< in: BTR_SEARCH_LEAF, ..., ORed with
+				BTR_INSERT and BTR_ESTIMATE;
+				cursor->left_block is used to store a pointer
+				to the left neighbor page, in the cases
+				BTR_SEARCH_PREV and BTR_MODIFY_PREV;
+				NOTE that if has_search_latch
+				is != 0, we may not have a latch set
+				on the cursor page, we assume
+				the caller uses his search latch
+				to protect the record! */
+	btr_cur_t*	cursor, /*!< in/out: tree cursor; the cursor page is
+				s- or x-latched, but see also above! */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+				currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*****************************************************************//**
+Opens a cursor at either end of an index. */
+UNIV_INTERN
+void
+btr_cur_open_at_index_side_func(
+/*============================*/
+	ibool		from_left,	/*!< in: TRUE if open to the low end,
+					FALSE if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode */
+	btr_cur_t*	cursor,		/*!< in: cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_cur_open_at_index_side(f,i,l,c,m)				\
+	btr_cur_open_at_index_side_func(f,i,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INTERN
+void
+btr_cur_open_at_rnd_pos_func(
+/*=========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_cur_t*	cursor,		/*!< in/out: B-tree cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_cur_open_at_rnd_pos(i,l,c,m)				\
+	btr_cur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/*************************************************************//**
+Tries to perform an insert to a page in an index tree, next to cursor.
+It is assumed that mtr holds an x-latch on the page. The operation does
+not succeed if there is too little space on the page. If there is just
+one record on the page, the insert will always succeed; this is to
+prevent trying to split a page with just one record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL, or error number */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_insert(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameters index and thr should be
+				specified */
+	btr_cur_t*	cursor,	/*!< in: cursor on page after which to insert;
+				cursor stays valid */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr);	/*!< in: mtr; if this function returns
+				DB_SUCCESS on a leaf page of a secondary
+				index in a compressed tablespace, the
+				mtr must be committed before latching
+				any further pages */
+/*************************************************************//**
+Performs an insert on a page of an index tree. It is assumed that mtr
+holds an x-latch on the tree and on the cursor page. If the insert is
+made on the leaf level, to avoid deadlocks, mtr must also own x-latches
+to brothers of page, if those brothers exist.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_insert(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags: if not
+				zero, the parameter thr should be
+				specified; if no undo logging is specified,
+				then the caller must have reserved enough
+				free extents in the file space so that the
+				insertion will certainly succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor after which to insert;
+				cursor stays valid */
+	dtuple_t*	entry,	/*!< in/out: entry to insert */
+	rec_t**		rec,	/*!< out: pointer to inserted record if
+				succeed */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or
+				NULL */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr,	/*!< in: query thread or NULL */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*************************************************************//**
+Updates a record when the update causes no size changes in its fields.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+btr_cur_update_in_place(
+/*====================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in: mtr; must be committed before
+				latching any further pages */
+/*************************************************************//**
+Tries to update a record on a page in an index tree. It is assumed that mtr
+holds an x-latch on the page. The operation does not succeed if there is too
+little space on the page or if the update would result in too empty a page,
+so that tree compression is recommended.
+@return DB_SUCCESS, or DB_OVERFLOW if the updated record does not fit,
+DB_UNDERFLOW if the page would become too empty, or DB_ZIP_OVERFLOW if
+there is not enough space left on the compressed page */
+UNIV_INTERN
+ulint
+btr_cur_optimistic_update(
+/*======================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update;
+				cursor stays valid and positioned on the
+				same record */
+	const upd_t*	update,	/*!< in: update vector; this must also
+				contain trx id and roll ptr fields */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in: mtr; must be committed before
+				latching any further pages */
+/*************************************************************//**
+Performs an update of a record on a page of a tree. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. If the
+update is made on the leaf level, to avoid deadlocks, mtr must also
+own x-latches to brothers of page, if those brothers exist.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+btr_cur_pessimistic_update(
+/*=======================*/
+	ulint		flags,	/*!< in: undo logging, locking, and rollback
+				flags */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to update */
+	mem_heap_t**	heap,	/*!< in/out: pointer to memory heap, or NULL */
+	big_rec_t**	big_rec,/*!< out: big rec vector whose fields have to
+				be stored externally by the caller, or NULL */
+	const upd_t*	update,	/*!< in: update vector; this is allowed also
+				to contain trx id and roll ptr fields, but
+				the values in update vector have no effect */
+	ulint		cmpl_info,/*!< in: compiler info on secondary index
+				updates */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in: mtr; must be committed before
+				latching any further pages */
+/***********************************************************//**
+Marks a clustered index record deleted. Writes an undo log record to
+undo log on this delete marking. Writes in the trx id field the id
+of the deleting transaction, and in the roll ptr field pointer to the
+undo log record created.
+@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_clust_rec(
+/*===========================*/
+	ulint		flags,	/*!< in: undo logging and locking flags */
+	btr_cur_t*	cursor,	/*!< in: cursor */
+	ibool		val,	/*!< in: value to set */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***********************************************************//**
+Sets a secondary index record delete mark to TRUE or FALSE.
+@return	DB_SUCCESS, DB_LOCK_WAIT, or error number */
+UNIV_INTERN
+ulint
+btr_cur_del_mark_set_sec_rec(
+/*=========================*/
+	ulint		flags,	/*!< in: locking flag */
+	btr_cur_t*	cursor,	/*!< in: cursor */
+	ibool		val,	/*!< in: value to set */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***********************************************************//**
+Clear a secondary index record's delete mark.  This function is only
+used by the insert buffer insert merge mechanism. */
+UNIV_INTERN
+void
+btr_cur_del_unmark_for_ibuf(
+/*========================*/
+	rec_t*		rec,		/*!< in/out: record to delete unmark */
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page
+					corresponding to rec, or NULL
+					when the tablespace is
+					uncompressed */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*************************************************************//**
+Tries to compress a page of the tree if it seems useful. It is assumed
+that mtr holds an x-latch on the tree and on the cursor page. To avoid
+deadlocks, mtr must also own x-latches to brothers of page, if those
+brothers exist. NOTE: it is assumed that the caller has reserved enough
+free extents so that the compression will always succeed if done!
+@return	TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_compress_if_useful(
+/*=======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the page to compress;
+				cursor does not stay valid if compression
+				occurs */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*******************************************************//**
+Removes the record on which the tree cursor is positioned. It is assumed
+that the mtr has an x-latch on the page where the cursor is positioned,
+but no latch on the whole tree.
+@return	TRUE if success, i.e., the page did not become too empty */
+UNIV_INTERN
+ibool
+btr_cur_optimistic_delete(
+/*======================*/
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				cursor stays valid: if deletion succeeds,
+				on function exit it points to the successor
+				of the deleted record */
+	mtr_t*		mtr);	/*!< in: mtr; if this function returns
+				TRUE on a leaf page of a secondary
+				index, the mtr must be committed
+				before latching any further pages */
+/*************************************************************//**
+Removes the record on which the tree cursor is positioned. Tries
+to compress the page if its fillfactor drops below a threshold
+or if it is the only page on the level. It is assumed that mtr holds
+an x-latch on the tree and on the cursor page. To avoid deadlocks,
+mtr must also own x-latches to brothers of page, if those brothers
+exist.
+@return	TRUE if compression occurred */
+UNIV_INTERN
+ibool
+btr_cur_pessimistic_delete(
+/*=======================*/
+	ulint*		err,	/*!< out: DB_SUCCESS or DB_OUT_OF_FILE_SPACE;
+				the latter may occur because we may have
+				to update node pointers on upper levels,
+				and in the case of variable length keys
+				these may actually grow in size */
+	ibool		has_reserved_extents, /*!< in: TRUE if the
+				caller has already reserved enough free
+				extents so that he knows that the operation
+				will succeed */
+	btr_cur_t*	cursor,	/*!< in: cursor on the record to delete;
+				if compression does not occur, the cursor
+				stays valid: it points to successor of
+				deleted record on function exit */
+	enum trx_rb_ctx	rb_ctx,	/*!< in: rollback context */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of updating a record in-place.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_update_in_place(
+/*==========================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a clustered
+index record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_clust_rec(
+/*=================================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	dict_index_t*	index);	/*!< in: index corresponding to page */
+/****************************************************************//**
+Parses the redo log record for delete marking or unmarking of a secondary
+index record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+btr_cur_parse_del_mark_set_sec_rec(
+/*===============================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< in/out: page or NULL */
+	page_zip_des_t*	page_zip);/*!< in/out: compressed page, or NULL */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Estimates the number of rows in a given index range.
+@return	estimated number of rows */
+UNIV_INTERN
+ib_int64_t
+btr_estimate_n_rows_in_range(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple1,	/*!< in: range start, may also be empty tuple */
+	ulint		mode1,	/*!< in: search mode for range start */
+	const dtuple_t*	tuple2,	/*!< in: range end, may also be empty tuple */
+	ulint		mode2);	/*!< in: search mode for range end */
+/*******************************************************************//**
+Estimates the number of different key values in a given index, for
+each n-column prefix of the index where n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals. */
+UNIV_INTERN
+void
+btr_estimate_number_of_different_key_vals(
+/*======================================*/
+	dict_index_t*	index);	/*!< in: index */
+/*******************************************************************//**
+Marks not updated extern fields as not-owned by this record. The ownership
+is transferred to the updated record which is inserted elsewhere in the
+index tree. In purge only the owner of externally stored field is allowed
+to free the field.
+@return TRUE if BLOB ownership was transferred */
+UNIV_INTERN
+ibool
+btr_cur_mark_extern_inherited_fields(
+/*=================================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
+				part will be updated, or NULL */
+	rec_t*		rec,	/*!< in/out: record in a clustered index */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	mtr_t*		mtr);	/*!< in: mtr, or NULL if not logged */
+/*******************************************************************//**
+The complement of the previous function: in an update, an entry may inherit
+some externally stored fields from a record. We must mark them as inherited
+in entry, so that they are not freed in a rollback. */
+UNIV_INTERN
+void
+btr_cur_mark_dtuple_inherited_extern(
+/*=================================*/
+	dtuple_t*	entry,		/*!< in/out: updated entry to be
+					inserted to clustered index */
+	const upd_t*	update);	/*!< in: update vector */
+/*******************************************************************//**
+Marks all extern fields in a dtuple as owned by the record. */
+UNIV_INTERN
+void
+btr_cur_unmark_dtuple_extern_fields(
+/*================================*/
+	dtuple_t*	entry);		/*!< in/out: clustered index entry */
+/*******************************************************************//**
+Stores the fields in big_rec_vec to the tablespace and puts pointers to
+them in rec.  The extern flags in rec will have to be set beforehand.
+The fields are stored on pages allocated from leaf node
+file segment of the index tree.
+@return	DB_SUCCESS or error */
+UNIV_INTERN
+ulint
+btr_store_big_rec_extern_fields(
+/*============================*/
+	dict_index_t*	index,		/*!< in: index of rec; the index tree
+					MUST be X-latched */
+	buf_block_t*	rec_block,	/*!< in/out: block containing rec */
+	rec_t*		rec,		/*!< in: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index);
+					the "external storage" flags in offsets
+					will not correspond to rec when
+					this function returns */
+	big_rec_t*	big_rec_vec,	/*!< in: vector containing fields
+					to be stored externally */
+	mtr_t*		local_mtr);	/*!< in: mtr containing the latch to
+					rec and to the tree */
+/*******************************************************************//**
+Frees the space in an externally stored field to the file space
+management if the field in data owns the externally stored field;
+in a rollback we may have the additional condition that the field must
+not be inherited. */
+UNIV_INTERN
+void
+btr_free_externally_stored_field(
+/*=============================*/
+	dict_index_t*	index,		/*!< in: index of the data, the index
+					tree MUST be X-latched; if the tree
+					height is 1, then also the root page
+					must be X-latched! (this is relevant
+					in the case this function is called
+					from purge where 'data' is located on
+					an undo log page, not an index
+					page) */
+	byte*		field_ref,	/*!< in/out: field reference */
+	const rec_t*	rec,		/*!< in: record containing field_ref, for
+					page_zip_write_blob_ptr(), or NULL */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					or NULL */
+	page_zip_des_t*	page_zip,	/*!< in: compressed page corresponding
+					to rec, or NULL if rec == NULL */
+	ulint		i,		/*!< in: field number of field_ref;
+					ignored if rec == NULL */
+	enum trx_rb_ctx	rb_ctx,		/*!< in: rollback context */
+	mtr_t*		local_mtr);	/*!< in: mtr containing the latch to
+					data and an X-latch to the index
+					tree */
+/*******************************************************************//**
+Copies the prefix of an externally stored field of a record.  The
+clustered index record must be protected by a lock or a page latch.
+@return the length of the copied field, or 0 if the column was being
+or has been deleted */
+UNIV_INTERN
+ulint
+btr_copy_externally_stored_field_prefix(
+/*====================================*/
+	byte*		buf,	/*!< out: the field, or a prefix of it */
+	ulint		len,	/*!< in: length of buf, in bytes */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	const byte*	data,	/*!< in: 'internally' stored part of the
+				field containing also the reference to
+				the external part; must be protected by
+				a lock or a page latch */
+	ulint		local_len);/*!< in: length of data, in bytes */
+/*******************************************************************//**
+Copies an externally stored field of a record to mem heap.
+@return	the field copied to heap, or NULL if the field is incomplete */
+UNIV_INTERN
+byte*
+btr_rec_copy_externally_stored_field(
+/*=================================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index;
+				must be protected by a lock or a page latch */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		zip_size,/*!< in: nonzero=compressed BLOB page size,
+				zero for uncompressed BLOBs */
+	ulint		no,	/*!< in: field number */
+	ulint*		len,	/*!< out: length of the field */
+	mem_heap_t*	heap);	/*!< in: mem heap */
+/*******************************************************************//**
+Flags the data tuple fields that are marked as extern storage in the
+update vector.  We use this function to remember which fields we must
+mark as extern storage in a record inserted for an update.
+@return	number of flagged external columns */
+UNIV_INTERN
+ulint
+btr_push_update_extern_fields(
+/*==========================*/
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	const upd_t*	update,	/*!< in: update vector */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+	__attribute__((nonnull));
+
+/*######################################################################*/
+
+/** In the pessimistic delete, if the page data size drops below this
+limit, merging it to a neighbor is tried */
+#define BTR_CUR_PAGE_COMPRESS_LIMIT	(UNIV_PAGE_SIZE / 2)
+
+/** A slot in the path array (btr_cur_struct::path_arr), which describes a
+search path down the tree. Each slot contains data on a single level. */
+
+typedef struct btr_path_struct	btr_path_t;
+struct btr_path_struct{
+	ulint	nth_rec;	/*!< index of the record
+				where the page cursor stopped on
+				this level (index in alphabetical
+				order); value ULINT_UNDEFINED
+				denotes array end */
+	ulint	n_recs;		/*!< number of records on the page */
+};
+
+#define BTR_PATH_ARRAY_N_SLOTS	250	/*!< size of path array (in slots) */
+
+/** Values of btr_cur_struct::flag, documenting the search method used */
+enum btr_cur_method {
+	BTR_CUR_HASH = 1,	/*!< successful shortcut using
+				the hash index */
+	BTR_CUR_HASH_FAIL,	/*!< failure using hash, success using
+				binary search: the misleading hash
+				reference is stored in the field
+				hash_node, and may need to be
+				updated */
+	BTR_CUR_BINARY,		/*!< success using the binary search */
+	BTR_CUR_INSERT_TO_IBUF	/*!< performed the intended insert to
+				the insert buffer */
+};
+
+/** The tree cursor: the definition appears here only for the compiler
+to know struct size! */
+struct btr_cur_struct {
+	dict_index_t*	index;		/*!< index where positioned */
+	page_cur_t	page_cur;	/*!< page cursor */
+	buf_block_t*	left_block;	/*!< this field is used to store
+					a pointer to the left neighbor
+					page, in the cases
+					BTR_SEARCH_PREV and
+					BTR_MODIFY_PREV */
+	/*------------------------------*/
+	que_thr_t*	thr;		/*!< this field is only used
+					when btr_cur_search_to_nth_level
+					is called for an index entry
+					insertion: the calling query
+					thread is passed here to be
+					used in the insert buffer */
+	/*------------------------------*/
+	/** The following fields are used in
+	btr_cur_search_to_nth_level to pass information: */
+	/* @{ */
+	enum btr_cur_method	flag;	/*!< Search method used */
+	ulint		tree_height;	/*!< Tree height if the search is done
+					for a pessimistic insert or update
+					operation */
+	ulint		up_match;	/*!< If the search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record to the right of
+					the cursor record after
+					btr_cur_search_to_nth_level;
+					for the mode PAGE_CUR_GE, the matched
+					fields to the first user record AT THE
+					CURSOR or to the right of it;
+					NOTE that the up_match and low_match
+					values may exceed the correct values
+					for comparison to the adjacent user
+					record if that record is on a
+					different leaf page! (See the note in
+					row_ins_duplicate_key.) */
+	ulint		up_bytes;	/*!< number of matched bytes to the
+					right at the time cursor positioned;
+					only used internally in searches: not
+					defined after the search */
+	ulint		low_match;	/*!< if search mode was PAGE_CUR_LE,
+					the number of matched fields to the
+					first user record AT THE CURSOR or
+					to the left of it after
+					btr_cur_search_to_nth_level;
+					NOT defined for PAGE_CUR_GE or any
+					other search modes; see also the NOTE
+					in up_match! */
+	ulint		low_bytes;	/*!< number of matched bytes to the
+					left (presumably; cf. low_match) at
+					the time cursor positioned; internal
+					to searches: not defined afterwards */
+	ulint		n_fields;	/*!< prefix length used in a hash
+					search if hash_node != NULL */
+	ulint		n_bytes;	/*!< hash prefix bytes if hash_node !=
+					NULL */
+	ulint		fold;		/*!< fold value used in the search if
+					flag is BTR_CUR_HASH */
+	/*------------------------------*/
+	/* @} */
+	btr_path_t*	path_arr;	/*!< in estimating the number of
+					rows in range, we store in this array
+					information of the path through
+					the tree */
+};
+
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later.  Try this many
+times. */
+#define BTR_CUR_RETRY_DELETE_N_TIMES	100
+/** If pessimistic delete fails because of lack of file space, there
+is still a good chance of success a little later.  Sleep this many
+microseconds between retries. */
+#define BTR_CUR_RETRY_SLEEP_TIME	50000
+
+/** The reference in a field for which data is stored on a different page.
+The reference is at the end of the 'locally' stored part of the field.
+'Locally' means storage in the index record.
+We store locally a long enough prefix of each column so that we can determine
+the ordering parts of each index record without looking into the externally
+stored part. */
+/*-------------------------------------- @{ */
+#define BTR_EXTERN_SPACE_ID		0	/*!< space id where stored */
+#define BTR_EXTERN_PAGE_NO		4	/*!< page no where stored */
+#define BTR_EXTERN_OFFSET		8	/*!< offset of BLOB header
+						on that page */
+#define BTR_EXTERN_LEN			12	/*!< 8 bytes containing the
+						length of the externally
+						stored part of the BLOB.
+						The 2 highest bits are
+						reserved to the flags below. */
+/*-------------------------------------- @} */
+/* #define BTR_EXTERN_FIELD_REF_SIZE	20 // moved to btr0types.h */
+
+/** The most significant bit of BTR_EXTERN_LEN (i.e., the most
+significant bit of the byte at smallest address) is set to 1 if this
+field does not 'own' the externally stored field; only the owner field
+is allowed to free the field in purge! */
+#define BTR_EXTERN_OWNER_FLAG		128
+/** If the second most significant bit of BTR_EXTERN_LEN (i.e., the
+second most significant bit of the byte at smallest address) is 1 then
+it means that the externally stored field was inherited from an
+earlier version of the row.  In rollback we are not allowed to free an
+inherited external field. */
+#define BTR_EXTERN_INHERITED_FLAG	64
+
+/** Number of searches down the B-tree in btr_cur_search_to_nth_level(). */
+extern ulint	btr_cur_n_non_sea;
+/** Number of successful adaptive hash index lookups in
+btr_cur_search_to_nth_level(). */
+extern ulint	btr_cur_n_sea;
+/** Old value of btr_cur_n_non_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_non_sea_old;
+/** Old value of btr_cur_n_sea.  Copied by
+srv_refresh_innodb_monitor_stats().  Referenced by
+srv_printf_innodb_monitor(). */
+extern ulint	btr_cur_n_sea_old;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "btr0cur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0cur.ic b/storage/xtradb/include/btr0cur.ic
new file mode 100644
index 00000000000..280583f6ccf
--- /dev/null
+++ b/storage/xtradb/include/btr0cur.ic
@@ -0,0 +1,200 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0cur.ic
+The index tree cursor
+
+Created 10/16/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "btr0btr.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets the page cursor embedded in a tree cursor (UNIV_DEBUG builds only).
+@return	non-const pointer to the page_cur member of the tree cursor */
+UNIV_INLINE
+page_cur_t*
+btr_cur_get_page_cur(
+/*=================*/
+	const btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	return((page_cur_t*) &cursor->page_cur);
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the buffer block on which the tree cursor is positioned.
+@return	pointer to the buffer block of the embedded page cursor */
+UNIV_INLINE
+buf_block_t*
+btr_cur_get_block(
+/*==============*/
+	btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	return(page_cur_get_block(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Gets the record on which a tree cursor is currently positioned.
+@return	pointer to the record of the embedded page cursor */
+UNIV_INLINE
+rec_t*
+btr_cur_get_rec(
+/*============*/
+	btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	return(page_cur_get_rec(btr_cur_get_page_cur(cursor)));
+}
+
+/*********************************************************//**
+Returns the compressed page (page_zip_des_t) of the buffer block on which
+the tree cursor is positioned; NULL if the page is not compressed.
+@return	pointer to compressed page, or NULL */
+UNIV_INLINE
+page_zip_des_t*
+btr_cur_get_page_zip(
+/*=================*/
+	btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	return(buf_block_get_page_zip(btr_cur_get_block(cursor)));
+}
+
+/*********************************************************//**
+Invalidates a tree cursor by invalidating its embedded page cursor. */
+UNIV_INLINE
+void
+btr_cur_invalidate(
+/*===============*/
+	btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	page_cur_invalidate(btr_cur_get_page_cur(cursor));
+}
+
+/*********************************************************//**
+Gets the page that contains the record a tree cursor is positioned on.
+@return	pointer to page, i.e. the page-aligned address of the record */
+UNIV_INLINE
+page_t*
+btr_cur_get_page(
+/*=============*/
+	btr_cur_t*	cursor)	/*!< in: tree cursor */
+{
+	return(page_align(btr_cur_get_rec(cursor)));
+}
+
+/*********************************************************//**
+Returns the index of a cursor, i.e. the value of its index field.
+@return	index */
+UNIV_INLINE
+dict_index_t*
+btr_cur_get_index(
+/*==============*/
+	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
+{
+	return(cursor->index);
+}
+
+/*********************************************************//**
+Makes a tree cursor point to the given record of the given index. */
+UNIV_INLINE
+void
+btr_cur_position(
+/*=============*/
+	dict_index_t*	index,	/*!< in: index */
+	rec_t*		rec,	/*!< in: record in tree */
+	buf_block_t*	block,	/*!< in: buffer block of rec */
+	btr_cur_t*	cursor)	/*!< out: cursor */
+{
+	ut_ad(page_align(rec) == block->frame);
+
+	cursor->index = index;
+
+	page_cur_position(rec, block, btr_cur_get_page_cur(cursor));
+}
+
+/*********************************************************************//**
+Tells whether it makes sense to try compressing the index page on which
+a btr cursor is placed.
+@return	TRUE if compression is recommended */
+UNIV_INLINE
+ibool
+btr_cur_compress_recommendation(
+/*============================*/
+	btr_cur_t*	cursor,	/*!< in: btr cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page;
+
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+
+	page = btr_cur_get_page(cursor);
+
+	if ((page_get_data_size(page) >= BTR_CUR_PAGE_COMPRESS_LIMIT)
+	    && ((btr_page_get_next(page, mtr) != FIL_NULL)
+		|| (btr_page_get_prev(page, mtr) != FIL_NULL))) {
+		/* The page is filled at least up to the predefined
+		minimum value AND it is not the only page on its
+		level of the B-tree: no reason to compress. */
+
+		return(FALSE);
+	}
+
+	/* The page fillfactor has dropped below the limit OR the level
+	contains just one page: we recommend compression if this is not
+	the root page. */
+	return(dict_index_get_page(cursor->index) != page_get_page_no(page));
+}
+
+/*********************************************************************//**
+Tells whether the record under the cursor can be removed without the
+deletion making tree compression necessary (or, recommended).
+@return	TRUE if can be deleted without recommended compression */
+UNIV_INLINE
+ibool
+btr_cur_can_delete_without_compress(
+/*================================*/
+	btr_cur_t*	cursor,	/*!< in: btr cursor */
+	ulint		rec_size,/*!< in: rec_get_size(btr_cur_get_rec(cursor))*/
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		page;
+
+	ut_ad(mtr_memo_contains(mtr, btr_cur_get_block(cursor),
+				MTR_MEMO_PAGE_X_FIX));
+
+	page = btr_cur_get_page(cursor);
+
+	if ((page_get_data_size(page) - rec_size >= BTR_CUR_PAGE_COMPRESS_LIMIT)
+	    && ((btr_page_get_next(page, mtr) != FIL_NULL)
+		|| (btr_page_get_prev(page, mtr) != FIL_NULL))
+	    && (page_get_n_recs(page) >= 2)) {
+
+		/* The page fillfactor stays at or above the predefined
+		minimum value, the level in the B-tree contains more than
+		one page, AND the page will not become empty. */
+
+		return(TRUE);
+	}
+
+	/* Otherwise compression would be recommended, unless this is the
+	root page: only on the root page can we delete freely. */
+	return(dict_index_get_page(cursor->index) == page_get_page_no(page));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/btr0pcur.h b/storage/xtradb/include/btr0pcur.h
new file mode 100644
index 00000000000..2334a266280
--- /dev/null
+++ b/storage/xtradb/include/btr0pcur.h
@@ -0,0 +1,551 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.h
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef btr0pcur_h
+#define btr0pcur_h
+
+#include "univ.i"
+#include "dict0dict.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+#include "page0cur.h"
+#include "btr0cur.h"
+#include "btr0btr.h"
+#include "btr0types.h"
+
+/* Relative positions for a stored cursor position */
+#define BTR_PCUR_ON			1
+#define BTR_PCUR_BEFORE			2
+#define BTR_PCUR_AFTER			3
+/* Note that if the tree is not empty, btr_pcur_store_position does not
+use the following, but only uses the above three alternatives, where the
+position is stored relative to a specific record: this makes implementation
+of a scroll cursor easier */
+#define BTR_PCUR_BEFORE_FIRST_IN_TREE	4	/* in an empty tree */
+#define BTR_PCUR_AFTER_LAST_IN_TREE	5	/* in an empty tree */
+
+/**************************************************************//**
+Allocates memory for a persistent cursor object and initializes the cursor.
+@return	own: persistent cursor */
+UNIV_INTERN
+btr_pcur_t*
+btr_pcur_create_for_mysql(void);
+/*============================*/
+/**************************************************************//**
+Frees the memory for a persistent cursor object. */
+UNIV_INTERN
+void
+btr_pcur_free_for_mysql(
+/*====================*/
+	btr_pcur_t*	cursor);	/*!< in, own: persistent cursor */
+/**************************************************************//**
+Copies the stored position of a pcur to another pcur. */
+UNIV_INTERN
+void
+btr_pcur_copy_stored_position(
+/*==========================*/
+	btr_pcur_t*	pcur_receive,	/*!< in: pcur which will receive the
+					position info */
+	btr_pcur_t*	pcur_donate);	/*!< in: pcur from which the info is
+					copied */
+/**************************************************************//**
+Sets the old_rec_buf field to NULL. */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur);	/*!< in: persistent cursor */
+/**************************************************************//**
+Initializes and opens a persistent cursor to an index tree. It should be
+closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_func(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page from the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+#define btr_pcur_open(i,t,md,l,c,m)				\
+	btr_pcur_open_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page of the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+				NOTE that if has_search_latch != 0 then
+				we maybe do not acquire a latch on the cursor
+				page, but assume that the caller uses his
+				btr search latch to protect the record! */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+				currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mtr */
+#define btr_pcur_open_with_no_init(ix,t,md,l,cur,has,m)			\
+	btr_pcur_open_with_no_init_func(ix,t,md,l,cur,has,__FILE__,__LINE__,m)
+
+/*****************************************************************//**
+Opens a persistent cursor at either end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+	ibool		from_left,	/*!< in: TRUE if open to the low end,
+					FALSE if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode */
+	btr_pcur_t*	pcur,		/*!< in: cursor */
+	ibool		do_init,	/*!< in: TRUE if should be initialized */
+	mtr_t*		mtr);		/*!< in: mtr */
+/**************************************************************//**
+Gets the up_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	btr_pcur_t*	cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+Gets the low_match value for a pcur after a search.
+@return number of matched fields at the cursor or to the left if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	btr_pcur_t*	cursor); /*!< in: memory buffer for persistent cursor */
+/**************************************************************//**
+If mode is PAGE_CUR_G or PAGE_CUR_GE, opens a persistent cursor on the first
+user record satisfying the search condition, in the case PAGE_CUR_L or
+PAGE_CUR_LE, on the last user record. If no such user record exists, then
+in the first case sets the cursor after last in tree, and in the latter case
+before first in tree. The latching mode must be BTR_SEARCH_LEAF or
+BTR_MODIFY_LEAF. */
+UNIV_INTERN
+void
+btr_pcur_open_on_user_rec_func(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index */
+	const dtuple_t*	tuple,		/*!< in: tuple on which search done */
+	ulint		mode,		/*!< in: PAGE_CUR_L, ... */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF or
+					BTR_MODIFY_LEAF */
+	btr_pcur_t*	cursor,		/*!< in: memory buffer for persistent
+					cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_pcur_open_on_user_rec(i,t,md,l,c,m)				\
+	btr_pcur_open_on_user_rec_func(i,t,md,l,c,__FILE__,__LINE__,m)
+/**********************************************************************//**
+Positions a cursor at a randomly chosen position within a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,		/*!< in/out: B-tree pcur */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_pcur_open_at_rnd_pos(i,l,c,m)				\
+	btr_pcur_open_at_rnd_pos_func(i,l,c,__FILE__,__LINE__,m)
+/**************************************************************//**
+Frees the possible old_rec_buf buffer of a persistent cursor and sets the
+latch mode of the persistent cursor to BTR_NO_LATCHES. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+The position of the cursor is stored by taking an initial segment of the
+record the cursor is positioned on, before, or after, and copying it to the
+cursor data structure, or just setting a flag if the cursor is before the
+first in an EMPTY tree, or after the last in an EMPTY tree. NOTE that the
+page where the cursor is positioned must not be empty if the index tree is
+not totally empty! */
+UNIV_INTERN
+void
+btr_pcur_store_position(
+/*====================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**************************************************************//**
+Restores the stored position of a persistent cursor bufferfixing the page and
+obtaining the specified latches. If the cursor position was saved when the
+(1) cursor was positioned on a user record: this function restores the position
+to the last record LESS OR EQUAL to the stored record;
+(2) cursor was positioned on a page infimum record: restores the position to
+the last record LESS than the user record which was the successor of the page
+infimum;
+(3) cursor was positioned on the page supremum: restores to the first record
+GREATER than the user record which was the predecessor of the supremum.
+(4) cursor was positioned before the first or after the last in an empty tree:
+restores to before first or after the last in the tree.
+@return TRUE if the cursor position was stored when it was on a user
+record and it can be restored on a user record whose ordering fields
+are identical to the ones of the original user record */
+UNIV_INTERN
+ibool
+btr_pcur_restore_position_func(
+/*===========================*/
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,		/*!< in: detached persistent cursor */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr);		/*!< in: mtr */
+#define btr_pcur_restore_position(l,cur,mtr)				\
+	btr_pcur_restore_position_func(l,cur,__FILE__,__LINE__,mtr)
+/**************************************************************//**
+If the latch mode of the cursor is BTR_SEARCH_LEAF or BTR_MODIFY_LEAF,
+releases the page latch and bufferfix reserved by the cursor.
+NOTE! In the case of BTR_MODIFY_LEAF, there should not exist changes
+made by the current mini-transaction to the data protected by the
+cursor latch, as then the latch must not be released until mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_release_leaf(
+/*==================*/
+	btr_pcur_t*	cursor, /*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Gets the rel_pos field for a cursor whose position has been stored.
+@return	BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Sets the mtr field for a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in, own: mtr */
+/*********************************************************//**
+Gets the mtr field for a pcur.
+@return	mtr */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+	btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/**************************************************************//**
+Commits the mtr and sets the pcur latch mode to BTR_NO_LATCHES,
+that is, the cursor becomes detached. If there have been modifications
+to the page where pcur is positioned, this can be used instead of
+btr_pcur_release_leaf. Function btr_pcur_store_position should be used
+before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr to commit */
+/**************************************************************//**
+Tests if a cursor is detached: that is the latch mode is BTR_NO_LATCHES.
+@return	TRUE if detached */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+	btr_pcur_t*	pcur);	/*!< in: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the next record in the tree. If no records are
+left, the cursor stays 'after last in tree'.
+@return	TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the previous record in the tree. If no records
+are left, the cursor stays 'before first in tree'.
+@return	TRUE if the cursor was not before first in tree */
+UNIV_INTERN
+ibool
+btr_pcur_move_to_prev(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the last record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next user record in the tree. If no user
+records are left, the cursor ends up 'after last in tree'.
+@return	TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the first record on the next page.
+Releases the latch on the current page, and bufferunfixes it.
+Note that there must not be modifications on the current page,
+as then the x-latch can be released only in mtr_commit. */
+UNIV_INTERN
+void
+btr_pcur_move_to_next_page(
+/*=======================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; must be on the
+				last record of the current page */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor backward if it is on the first record
+of the page. Releases the latch on the current page, and bufferunfixes
+it. Note that to prevent a possible deadlock, the operation first
+stores the position of the cursor, releases the leaf latch, acquires
+necessary latches and restores the cursor position again before returning.
+The alphabetical position of the cursor is guaranteed to be sensible
+on return, but it may happen that the cursor is not positioned on the
+last record of any page, because the structure of the tree may have
+changed while the cursor had no latches. */
+UNIV_INTERN
+void
+btr_pcur_move_backward_from_page(
+/*=============================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor, must be on the
+				first record of the current page */
+	mtr_t*		mtr);	/*!< in: mtr */
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Returns the btr cursor component of a persistent cursor.
+@return	pointer to btr cursor component */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+	const btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the page cursor component of a persistent cursor.
+@return	pointer to page cursor component */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+	const btr_pcur_t*	cursor);	/*!< in: persistent cursor */
+#else /* UNIV_DEBUG */
+# define btr_pcur_get_btr_cur(cursor) (&(cursor)->btr_cur)
+# define btr_pcur_get_page_cur(cursor) (&(cursor)->btr_cur.page_cur)
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page of a persistent cursor.
+@return	pointer to the page */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+	btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the buffer block of a persistent cursor.
+@return	pointer to the block */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+	btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Returns the record of a persistent cursor.
+@return	pointer to the record */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+	btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record on
+a page. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor);/*!< in: persistent cursor */
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in
+the index tree. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************//**
+Moves the persistent cursor to the next record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+/*********************************************************//**
+Moves the persistent cursor to the previous record on the same page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor);/*!< in/out: persistent cursor */
+
+
+/* The persistent B-tree cursor structure. This is used mainly for SQL
+selects, updates, and deletes. */
+
+struct btr_pcur_struct{
+	btr_cur_t	btr_cur;	/*!< a B-tree cursor */
+	ulint		latch_mode;	/*!< see TODO note below!
+					BTR_SEARCH_LEAF, BTR_MODIFY_LEAF,
+					BTR_MODIFY_TREE, or BTR_NO_LATCHES,
+					depending on the latching state of
+					the page and tree where the cursor is
+					positioned; the last value means that
+					the cursor is not currently positioned:
+					we say then that the cursor is
+					detached; it can be restored to
+					attached if the old position was
+					stored in old_rec */
+	ulint		old_stored;	/*!< BTR_PCUR_OLD_STORED
+					or BTR_PCUR_OLD_NOT_STORED */
+	rec_t*		old_rec;	/*!< if cursor position is stored,
+					contains an initial segment of the
+					latest record cursor was positioned
+					either on, before, or after */
+	ulint		old_n_fields;	/*!< number of fields in old_rec */
+	ulint		rel_pos;	/*!< BTR_PCUR_ON, BTR_PCUR_BEFORE, or
+					BTR_PCUR_AFTER, depending on whether
+					cursor was on, before, or after the
+					old_rec record */
+	buf_block_t*	block_when_stored;/*!< buffer block when the position
+					was stored */
+	ib_uint64_t	modify_clock;	/*!< the modify clock value of the
+					buffer block when the cursor position
+					was stored */
+	ulint		pos_state;	/*!< see TODO note below!
+					BTR_PCUR_IS_POSITIONED,
+					BTR_PCUR_WAS_POSITIONED,
+					BTR_PCUR_NOT_POSITIONED */
+	ulint		search_mode;	/*!< PAGE_CUR_G, ... */
+	trx_t*		trx_if_known;	/*!< the transaction, if we know it;
+					otherwise this field is not defined;
+					can ONLY BE USED in error prints in
+					fatal assertion failures! */
+	/*-----------------------------*/
+	/* NOTE that the following fields may possess dynamically allocated
+	memory which should be freed if not needed anymore! */
+
+	mtr_t*		mtr;		/*!< NULL, or this field may contain
+					a mini-transaction which holds the
+					latch on the cursor page */
+	byte*		old_rec_buf;	/*!< NULL, or a dynamically allocated
+					buffer for old_rec */
+	ulint		buf_size;	/*!< old_rec_buf size if old_rec_buf
+					is not NULL */
+};
+
+#define BTR_PCUR_IS_POSITIONED	1997660512	/* TODO: currently, the state
+						can be BTR_PCUR_IS_POSITIONED,
+						though it really should be
+						BTR_PCUR_WAS_POSITIONED,
+						because we have no obligation
+						to commit the cursor with
+						mtr; similarly latch_mode may
+						be out of date. This can
+						lead to problems if btr_pcur
+						is not used the right way;
+						all current code should be
+						ok. */
+#define BTR_PCUR_WAS_POSITIONED	1187549791
+#define BTR_PCUR_NOT_POSITIONED 1328997689
+
+#define BTR_PCUR_OLD_STORED	908467085
+#define BTR_PCUR_OLD_NOT_STORED	122766467
+
+#ifndef UNIV_NONINL
+#include "btr0pcur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0pcur.ic b/storage/xtradb/include/btr0pcur.ic
new file mode 100644
index 00000000000..0c38797e6c5
--- /dev/null
+++ b/storage/xtradb/include/btr0pcur.ic
@@ -0,0 +1,642 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/btr0pcur.ic
+The index tree persistent cursor
+
+Created 2/23/1996 Heikki Tuuri
+*******************************************************/
+
+
+/*********************************************************//**
+Reads the relative position saved alongside the stored cursor position.
+@return	the stored rel_pos value: BTR_PCUR_ON, ... */
+UNIV_INLINE
+ulint
+btr_pcur_get_rel_pos(
+/*=================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor != NULL);
+	ut_ad(cursor->old_rec != NULL);
+	ut_ad(BTR_PCUR_OLD_STORED == cursor->old_stored);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED
+	      || cursor->pos_state == BTR_PCUR_WAS_POSITIONED);
+
+	return(cursor->rel_pos);
+}
+
+/*********************************************************//**
+Stores a mini-transaction pointer in the mtr field of a pcur. */
+UNIV_INLINE
+void
+btr_pcur_set_mtr(
+/*=============*/
+	btr_pcur_t*	cursor,	/*!< in/out: persistent cursor */
+	mtr_t*		mtr)	/*!< in, own: mtr */
+{
+	ut_ad(cursor != NULL);
+
+	cursor->mtr = mtr;
+}
+
+/*********************************************************//**
+Reads the mtr field of a pcur.
+@return	current value of cursor->mtr; may be NULL */
+UNIV_INLINE
+mtr_t*
+btr_pcur_get_mtr(
+/*=============*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor != NULL);
+
+	return(cursor->mtr);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gives access to the tree cursor embedded in a persistent cursor.
+@return	non-const pointer to cursor->btr_cur */
+UNIV_INLINE
+btr_cur_t*
+btr_pcur_get_btr_cur(
+/*=================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	/* The const qualifier of the argument is dropped by the cast. */
+	return((btr_cur_t*) &cursor->btr_cur);
+}
+
+/*********************************************************//**
+@return	pointer to the page cursor inside the pcur's tree cursor */
+UNIV_INLINE
+page_cur_t*
+btr_pcur_get_page_cur(
+/*==================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	btr_cur_t*	tree_cur = btr_pcur_get_btr_cur(cursor);
+	return(btr_cur_get_page_cur(tree_cur));
+}
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Returns the page the persistent cursor is positioned on.
+@return	pointer to the page, via btr_cur_get_page() */
+UNIV_INLINE
+page_t*
+btr_pcur_get_page(
+/*==============*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	btr_cur_t*	tree_cur = btr_pcur_get_btr_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	return(btr_cur_get_page(tree_cur));
+}
+
+/*********************************************************//**
+Returns the buffer block the persistent cursor is positioned on.
+@return	pointer to the block, via btr_cur_get_block() */
+UNIV_INLINE
+buf_block_t*
+btr_pcur_get_block(
+/*===============*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	btr_cur_t*	tree_cur = btr_pcur_get_btr_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	return(btr_cur_get_block(tree_cur));
+}
+
+/*********************************************************//**
+Returns the record the persistent cursor is positioned on.
+@return	pointer to the record, via btr_cur_get_rec() */
+UNIV_INLINE
+rec_t*
+btr_pcur_get_rec(
+/*=============*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	btr_cur_t*	tree_cur = btr_pcur_get_btr_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	return(btr_cur_get_rec(tree_cur));
+}
+
+/**************************************************************//**
+Reads the up_match value the embedded tree cursor holds after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_GE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_up_match(
+/*==================*/
+	btr_pcur_t*	cursor) /*!< in: memory buffer for persistent cursor */
+{
+	ulint	n_matched;
+
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED
+	      || cursor->pos_state == BTR_PCUR_WAS_POSITIONED);
+
+	n_matched = btr_pcur_get_btr_cur(cursor)->up_match;
+
+	ut_ad(n_matched != ULINT_UNDEFINED);
+
+	return(n_matched);
+}
+
+/**************************************************************//**
+Reads the low_match value the embedded tree cursor holds after a search.
+@return number of matched fields at the cursor or to the right if
+search mode was PAGE_CUR_LE, otherwise undefined */
+UNIV_INLINE
+ulint
+btr_pcur_get_low_match(
+/*===================*/
+	btr_pcur_t*	cursor) /*!< in: memory buffer for persistent cursor */
+{
+	ulint	n_matched;
+
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED
+	      || cursor->pos_state == BTR_PCUR_WAS_POSITIONED);
+
+	n_matched = btr_pcur_get_btr_cur(cursor)->low_match;
+	ut_ad(n_matched != ULINT_UNDEFINED);
+
+	return(n_matched);
+}
+
+/*********************************************************//**
+Tests whether the persistent cursor stands after the last user record
+on its page; the answer comes from page_cur_is_after_last(). */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_on_page(
+/*===========================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	const page_cur_t*	page_cur = btr_pcur_get_page_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	return(page_cur_is_after_last(page_cur));
+}
+
+/*********************************************************//**
+Tests whether the persistent cursor stands before the first user record
+on its page; the answer comes from page_cur_is_before_first(). */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_on_page(
+/*=============================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	const page_cur_t*	page_cur = btr_pcur_get_page_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	return(page_cur_is_before_first(page_cur));
+}
+
+/*********************************************************//**
+Tells whether the persistent cursor is positioned on a user record. */
+UNIV_INLINE
+ibool
+btr_pcur_is_on_user_rec(
+/*====================*/
+	const btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	/* Neither before the first nor after the last record of the page */
+	if (!btr_pcur_is_before_first_on_page(cursor)
+	    && !btr_pcur_is_after_last_on_page(cursor)) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is before the first user record in the
+index tree: its page has no previous page and it is before first there. */
+UNIV_INLINE
+ibool
+btr_pcur_is_before_first_in_tree(
+/*=============================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	if (btr_page_get_prev(btr_pcur_get_page(cursor), mtr) == FIL_NULL) {
+		return(page_cur_is_before_first(
+			       btr_pcur_get_page_cur(cursor)));
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************//**
+Checks if the persistent cursor is after the last user record in the
+index tree: its page has no next page and it is after last there. */
+UNIV_INLINE
+ibool
+btr_pcur_is_after_last_in_tree(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	if (btr_page_get_next(btr_pcur_get_page(cursor), mtr) == FIL_NULL) {
+		return(page_cur_is_after_last(
+			       btr_pcur_get_page_cur(cursor)));
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************//**
+Advances the persistent cursor by one record within its current page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_next_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	page_cur_t*	page_cur = btr_pcur_get_page_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	page_cur_move_to_next(page_cur);
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Steps the persistent cursor back by one record within its current page. */
+UNIV_INLINE
+void
+btr_pcur_move_to_prev_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor)	/*!< in/out: persistent cursor */
+{
+	page_cur_t*	page_cur = btr_pcur_get_page_cur(cursor);
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	page_cur_move_to_prev(page_cur);
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Sets the pcur's page cursor with page_cur_set_after_last() on its block. */
+UNIV_INLINE
+void
+btr_pcur_move_to_last_on_page(
+/*==========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr (not used) */
+{
+	buf_block_t*	block;
+
+	UT_NOT_USED(mtr);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	block = btr_pcur_get_block(cursor);
+	page_cur_set_after_last(block, btr_pcur_get_page_cur(cursor));
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/*********************************************************//**
+Advances the persistent cursor until it rests on a user record. If the tree
+has no further user records, the cursor ends up 'after last in tree'.
+@return	TRUE if the cursor moved forward, ending on a user record */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next_user_rec(
+/*===========================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	/* Each round takes one step: to the next record of the page, or
+	over to the next page when the page is exhausted. The loop ends
+	on the first user record or at the end of the tree. */
+	for (;;) {
+		if (!btr_pcur_is_after_last_on_page(cursor)) {
+			btr_pcur_move_to_next_on_page(cursor);
+		} else if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+			return(FALSE);
+		} else {
+			btr_pcur_move_to_next_page(cursor, mtr);
+		}
+
+		if (btr_pcur_is_on_user_rec(cursor)) {
+
+			return(TRUE);
+		}
+	}
+}
+
+/*********************************************************//**
+Advances the persistent cursor by one record, crossing over to the next page
+when needed. A cursor that is already 'after last in tree' is not moved.
+@return	TRUE if the cursor was not after last in tree */
+UNIV_INLINE
+ibool
+btr_pcur_move_to_next(
+/*==================*/
+	btr_pcur_t*	cursor,	/*!< in: persistent cursor; NOTE that the
+				function may release the page latch */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(cursor->pos_state == BTR_PCUR_IS_POSITIONED);
+	ut_ad(cursor->latch_mode != BTR_NO_LATCHES);
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	if (!btr_pcur_is_after_last_on_page(cursor)) {
+
+		btr_pcur_move_to_next_on_page(cursor);
+
+		return(TRUE);
+	}
+
+	if (btr_pcur_is_after_last_in_tree(cursor, mtr)) {
+
+		return(FALSE);
+	}
+
+	btr_pcur_move_to_next_page(cursor, mtr);
+
+	return(TRUE);
+}
+
+/**************************************************************//**
+Commits the mtr, sets the pcur latch mode to BTR_NO_LATCHES and pos_state to
+BTR_PCUR_WAS_POSITIONED, that is, the cursor becomes detached. If there have
+been modifications to the page where pcur is positioned, this can be used
+instead of btr_pcur_release_leaf. Function btr_pcur_store_position should be
+used before calling this, if restoration of cursor is wanted later. */
+UNIV_INLINE
+void
+btr_pcur_commit_specify_mtr(
+/*========================*/
+	btr_pcur_t*	pcur,	/*!< in: persistent cursor */
+	mtr_t*		mtr)	/*!< in: mtr to commit */
+{
+	ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	pcur->latch_mode = BTR_NO_LATCHES;
+
+	mtr_commit(mtr);
+
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+}
+
+/**************************************************************//**
+Detaches a pcur: latch mode BTR_NO_LATCHES, state BTR_PCUR_WAS_POSITIONED. */
+UNIV_INLINE
+void
+btr_pcur_detach(
+/*============*/
+	btr_pcur_t*	pcur)	/*!< in/out: persistent cursor */
+{
+	ut_a(pcur->pos_state == BTR_PCUR_IS_POSITIONED);
+
+	/* No mtr is committed here; only the pcur bookkeeping changes */
+	pcur->pos_state = BTR_PCUR_WAS_POSITIONED;
+	pcur->latch_mode = BTR_NO_LATCHES;
+}
+
+/**************************************************************//**
+Reports whether a cursor is detached, i.e. its latch mode is BTR_NO_LATCHES.
+@return	TRUE if detached, FALSE otherwise */
+UNIV_INLINE
+ibool
+btr_pcur_is_detached(
+/*=================*/
+	btr_pcur_t*	pcur)	/*!< in: persistent cursor */
+{
+	ibool	detached = FALSE;
+
+	if (pcur->latch_mode == BTR_NO_LATCHES) {
+		detached = TRUE;
+	}
+	return(detached);
+}
+
+/**************************************************************//**
+Marks no position as stored; NULLs old_rec, old_rec_buf (frees nothing). */
+UNIV_INLINE
+void
+btr_pcur_init(
+/*==========*/
+	btr_pcur_t*	pcur)	/*!< in/out: persistent cursor */
+{
+	pcur->old_rec = NULL;
+	pcur->old_rec_buf = NULL;
+	pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+}
+
+/**************************************************************//**
+Initializes a persistent cursor and positions it in an index tree by a
+search on tuple. The cursor should be closed with btr_pcur_close. */
+UNIV_INLINE
+void
+btr_pcur_open_func(
+/*===============*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page from the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	/* Start from a cursor with no stored position */
+
+	btr_pcur_init(cursor);
+
+	cursor->search_mode = mode;
+	cursor->latch_mode = latch_mode;
+
+	/* Search with the embedded tree cursor at level 0; the
+	has_search_latch argument is passed as 0 */
+
+	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+				    btr_pcur_get_btr_cur(cursor), 0,
+				    file, line, mtr);
+
+	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+	/* A freshly opened cursor has no known transaction */
+	cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Opens a persistent cursor to an index tree without initializing the
+cursor first, i.e. without calling btr_pcur_init on it. */
+UNIV_INLINE
+void
+btr_pcur_open_with_no_init_func(
+/*============================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	tuple,	/*!< in: tuple on which search done */
+	ulint		mode,	/*!< in: PAGE_CUR_L, ...;
+				NOTE that if the search is made using a unique
+				prefix of a record, mode should be
+				PAGE_CUR_LE, not PAGE_CUR_GE, as the latter
+				may end up on the previous page of the
+				record! */
+	ulint		latch_mode,/*!< in: BTR_SEARCH_LEAF, ...;
+				NOTE that if has_search_latch != 0 then
+				we maybe do not acquire a latch on the cursor
+				page, but assume that the caller uses his
+				btr search latch to protect the record! */
+	btr_pcur_t*	cursor, /*!< in: memory buffer for persistent cursor */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+				currently has on btr_search_latch:
+				RW_S_LATCH, or 0 */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	cursor->search_mode = mode;
+	cursor->latch_mode = latch_mode;
+
+	/* Search with the embedded tree cursor at level 0, passing on
+	the caller's search latch state */
+
+	btr_cur_search_to_nth_level(index, 0, tuple, mode, latch_mode,
+				    btr_pcur_get_btr_cur(cursor),
+				    has_search_latch, file, line, mtr);
+
+	/* old_rec and old_rec_buf are not touched by this function */
+
+	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	cursor->trx_if_known = NULL;
+}
+
+/*****************************************************************//**
+Opens a persistent cursor at the low or the high end of an index. */
+UNIV_INLINE
+void
+btr_pcur_open_at_index_side(
+/*========================*/
+	ibool		from_left,	/*!< in: TRUE if open to the low end,
+					FALSE if to the high end */
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: latch mode */
+	btr_pcur_t*	pcur,		/*!< in: cursor */
+	ibool		do_init,	/*!< in: TRUE if should be initialized */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	/* search_mode is PAGE_CUR_G when opening at the low end and
+	PAGE_CUR_L when opening at the high end */
+
+	pcur->latch_mode = latch_mode;
+	pcur->search_mode = from_left ? PAGE_CUR_G : PAGE_CUR_L;
+
+	if (do_init) {
+		/* Reset the stored-position fields */
+		btr_pcur_init(pcur);
+	}
+
+	/* Position the embedded tree cursor at the requested end */
+	btr_cur_open_at_index_side(from_left, index, latch_mode,
+				   btr_pcur_get_btr_cur(pcur), mtr);
+	pcur->pos_state = BTR_PCUR_IS_POSITIONED;
+
+	pcur->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+	pcur->trx_if_known = NULL;
+}
+
+/**********************************************************************//**
+Initializes a pcur and positions it at a randomly chosen place in a B-tree. */
+UNIV_INLINE
+void
+btr_pcur_open_at_rnd_pos_func(
+/*==========================*/
+	dict_index_t*	index,		/*!< in: index */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_pcur_t*	cursor,		/*!< in/out: B-tree pcur */
+	const char*	file,		/*!< in: file name */
+	ulint		line,		/*!< in: line where called */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	btr_cur_t*	btr_cursor = btr_pcur_get_btr_cur(cursor);
+
+	/* Set up the pcur fields before positioning */
+	btr_pcur_init(cursor);
+	cursor->search_mode = PAGE_CUR_G;
+	cursor->latch_mode = latch_mode;
+
+	btr_cur_open_at_rnd_pos_func(index, latch_mode, btr_cursor,
+				     file, line, mtr);
+
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+	cursor->pos_state = BTR_PCUR_IS_POSITIONED;
+
+	cursor->trx_if_known = NULL;
+}
+
+/**************************************************************//**
+Frees the old record buffer of a persistent cursor, if any, and resets the
+cursor: no latches (BTR_NO_LATCHES), not positioned, no stored position. */
+UNIV_INLINE
+void
+btr_pcur_close(
+/*===========*/
+	btr_pcur_t*	cursor)	/*!< in: persistent cursor */
+{
+	/* Release the buffer for old_rec, if one was allocated */
+	if (cursor->old_rec_buf != NULL) {
+		mem_free(cursor->old_rec_buf);
+
+		cursor->old_rec_buf = NULL;
+	}
+
+	/* Forget both the stored and the current position */
+	cursor->old_rec = NULL;
+	cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+	cursor->btr_cur.page_cur.rec = NULL;
+	cursor->btr_cur.page_cur.block = NULL;
+
+	cursor->pos_state = BTR_PCUR_NOT_POSITIONED;
+	cursor->latch_mode = BTR_NO_LATCHES;
+
+	cursor->trx_if_known = NULL;
+}
diff --git a/storage/xtradb/include/btr0sea.h b/storage/xtradb/include/btr0sea.h
new file mode 100644
index 00000000000..f6d194319ae
--- /dev/null
+++ b/storage/xtradb/include/btr0sea.h
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.h
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0sea_h
+#define btr0sea_h
+
+#include "univ.i"
+
+#include "rem0rec.h"
+#include "dict0dict.h"
+#include "btr0types.h"
+#include "mtr0mtr.h"
+#include "ha0ha.h"
+
+/*****************************************************************//**
+Creates and initializes the adaptive search system at a database start. */
+UNIV_INTERN
+void
+btr_search_sys_create(
+/*==================*/
+	ulint	hash_size);	/*!< in: hash index hash table size */
+/*****************************************************************//**
+Frees the adaptive search system at a database shutdown. */
+UNIV_INTERN
+void
+btr_search_sys_free(void);
+/*=====================*/
+
+/********************************************************************//**
+Disable the adaptive hash search system and empty the index. */
+UNIV_INTERN
+void
+btr_search_disable(void);
+/*====================*/
+/********************************************************************//**
+Enable the adaptive hash search system. */
+UNIV_INTERN
+void
+btr_search_enable(void);
+/*====================*/
+
+/********************************************************************//**
+Returns search info for an index.
+@return	search info; search mutex reserved */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
+/*****************************************************************//**
+Creates and initializes a search info struct.
+@return	own: search info struct */
+UNIV_INTERN
+btr_search_t*
+btr_search_info_create(
+/*===================*/
+	mem_heap_t*	heap);	/*!< in: heap where created */
+/*****************************************************************//**
+Returns the value of ref_count. The value is protected by
+btr_search_latch.
+@return	ref_count value. */
+UNIV_INTERN
+ulint
+btr_search_info_get_ref_count(
+/*==========================*/
+	btr_search_t*   info);	/*!< in: search info. */
+/*********************************************************************//**
+Updates the search info. */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	btr_cur_t*	cursor);/*!< in: cursor which was just positioned */
+/******************************************************************//**
+Tries to guess the right search position based on the hash search info
+of the index. Note that if mode is PAGE_CUR_LE, which is used in inserts,
+and the function returns TRUE, then cursor->up_match and cursor->low_match
+both have sensible values.
+@return	TRUE if succeeded */
+UNIV_INTERN
+ibool
+btr_search_guess_on_hash(
+/*=====================*/
+	dict_index_t*	index,		/*!< in: index */
+	btr_search_t*	info,		/*!< in: index search info */
+	const dtuple_t*	tuple,		/*!< in: logical record */
+	ulint		mode,		/*!< in: PAGE_CUR_L, ... */
+	ulint		latch_mode,	/*!< in: BTR_SEARCH_LEAF, ... */
+	btr_cur_t*	cursor,		/*!< out: tree cursor */
+	ulint		has_search_latch,/*!< in: latch mode the caller
+					currently has on btr_search_latch:
+					RW_S_LATCH, RW_X_LATCH, or 0 */
+	mtr_t*		mtr);		/*!< in: mtr */
+/********************************************************************//**
+Moves or deletes hash entries for moved records. If new_page is already hashed,
+then the hash index for page, if any, is dropped. If new_page is not hashed,
+and page is hashed, then a new hash index is built to new_page with the same
+parameters as page (this often happens when a page is split). */
+UNIV_INTERN
+void
+btr_search_move_or_delete_hash_entries(
+/*===================================*/
+	buf_block_t*	new_block,	/*!< in: records are copied
+					to this page */
+	buf_block_t*	block,		/*!< in: index page from which
+					records were copied, and the
+					copied records will be deleted
+					from this page */
+	dict_index_t*	index);		/*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index(
+/*============================*/
+	buf_block_t*	block);	/*!< in: block containing index page,
+				s- or x-latched, or an index page
+				for which we know that
+				block->buf_fix_count == 0 */
+/********************************************************************//**
+Drops a page hash index based on index. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_index_on_index(
+/*=====================================*/
+	dict_index_t*	index);		/*!< in: record descriptor */
+/********************************************************************//**
+Drops a page hash index when a page is freed from a fseg to the file system.
+Drops possible hash index if the page happens to be in the buffer pool. */
+UNIV_INTERN
+void
+btr_search_drop_page_hash_when_freed(
+/*=================================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no);	/*!< in: page number */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_node_on_insert(
+/*==================================*/
+	btr_cur_t*	cursor);/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is inserted on a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_insert(
+/*=============================*/
+	btr_cur_t*	cursor);/*!< in: cursor which was positioned to the
+				place to insert using btr_cur_search_...,
+				and the new record has been inserted next
+				to the cursor */
+/********************************************************************//**
+Updates the page hash index when a single record is deleted from a page. */
+UNIV_INTERN
+void
+btr_search_update_hash_on_delete(
+/*=============================*/
+	btr_cur_t*	cursor);/*!< in: cursor which was positioned on the
+				record to delete using btr_cur_search_...,
+				the record is not yet deleted */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/********************************************************************//**
+Validates the search system.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+btr_search_validate(void);
+/*======================*/
+#else
+# define btr_search_validate()	TRUE
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+
+/** Flag: has the search system been enabled?
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+extern char	btr_search_enabled;
+
+/** Flag: whether the search system has completed its disabling process,
+It is set to TRUE right after buf_pool_drop_hash_index() in
+btr_search_disable(), indicating hash index entries are cleaned up.
+Protected by btr_search_latch and btr_search_enabled_mutex. */
+extern ibool	btr_search_fully_disabled;
+
+/** The search info struct in an index */
+struct btr_search_struct{
+	ulint	ref_count;	/*!< Number of blocks in this index tree
+				that have search index built
+				i.e. block->index points to this index.
+				Protected by btr_search_latch except
+				during initialization in
+				btr_search_info_create(). */
+
+	/* @{ The following fields are not protected by any latch.
+	Unfortunately, this means that they must be aligned to
+	the machine word, i.e., they cannot be turned into bit-fields. */
+	buf_block_t* root_guess;/*!< the root page frame when it was last time
+				fetched, or NULL */
+	ulint	hash_analysis;	/*!< when this exceeds
+				BTR_SEARCH_HASH_ANALYSIS, the hash
+				analysis starts; this is reset if no
+				success noticed */
+	ibool	last_hash_succ;	/*!< TRUE if the last search would have
+				succeeded, or did succeed, using the hash
+				index; NOTE that the value here is not exact:
+				it is not calculated for every search, and the
+				calculation itself is not always accurate! */
+	ulint	n_hash_potential;
+				/*!< number of consecutive searches
+				which would have succeeded, or did succeed,
+				using the hash index;
+				the range is 0 .. BTR_SEARCH_BUILD_LIMIT + 5 */
+	/* @} */
+	/*---------------------- @{ */
+	ulint	n_fields;	/*!< recommended prefix length for hash search:
+				number of full fields */
+	ulint	n_bytes;	/*!< recommended prefix: number of bytes in
+				an incomplete field
+				@see BTR_PAGE_MAX_REC_SIZE */
+	ibool	left_side;	/*!< TRUE or FALSE, depending on whether
+				the leftmost record of several records with
+				the same prefix should be indexed in the
+				hash index */
+	/*---------------------- @} */
+#ifdef UNIV_SEARCH_PERF_STAT
+	ulint	n_hash_succ;	/*!< number of successful hash searches thus
+				far */
+	ulint	n_hash_fail;	/*!< number of failed hash searches */
+	ulint	n_patt_succ;	/*!< number of successful pattern searches thus
+				far */
+	ulint	n_searches;	/*!< number of searches */
+#endif /* UNIV_SEARCH_PERF_STAT */
+#ifdef UNIV_DEBUG
+	ulint	magic_n;	/*!< magic number @see BTR_SEARCH_MAGIC_N */
+/** value of btr_search_struct::magic_n, used in assertions */
+# define BTR_SEARCH_MAGIC_N	1112765
+#endif /* UNIV_DEBUG */
+};
+
+/** The hash index system */
+typedef struct btr_search_sys_struct	btr_search_sys_t;
+
+/** The hash index system */
+struct btr_search_sys_struct{
+	hash_table_t*	hash_index;	/*!< the adaptive hash index,
+					mapping dtuple_fold values
+					to rec_t pointers on index pages */
+};
+
+/** The adaptive hash index */
+extern btr_search_sys_t*	btr_search_sys;
+
+/** @brief The latch protecting the adaptive search system
+
+This latch protects the
+(1) hash index;
+(2) columns of a record to which we have a pointer in the hash index;
+
+but does NOT protect:
+
+(3) next record offset field in a record;
+(4) next or previous records on the same page.
+
+Bear in mind (3) and (4) when using the hash index.
+*/
+extern rw_lock_t*	btr_search_latch_temp;
+
+/** The latch protecting the adaptive search system */
+#define btr_search_latch	(*btr_search_latch_temp)
+
+#ifdef UNIV_SEARCH_PERF_STAT
+/** Number of successful adaptive hash index lookups */
+extern ulint	btr_search_n_succ;
+/** Number of failed adaptive hash index lookups */
+extern ulint	btr_search_n_hash_fail;
+#endif /* UNIV_SEARCH_PERF_STAT */
+
+/** After change in n_fields or n_bytes in info, this many rounds are waited
+before starting the hash analysis again: this is to save CPU time when there
+is no hope in building a hash index. */
+#define BTR_SEARCH_HASH_ANALYSIS	17
+
+/** Limit of consecutive searches for trying a search shortcut on the search
+pattern */
+#define BTR_SEARCH_ON_PATTERN_LIMIT	3
+
+/** Limit of consecutive searches for trying a search shortcut using
+the hash index */
+#define BTR_SEARCH_ON_HASH_LIMIT	3
+
+/** We do this many searches before trying to keep the search latch
+over calls from MySQL. If we notice someone waiting for the latch, we
+again set this much timeout. This is to reduce contention. */
+#define BTR_SEA_TIMEOUT			10000
+
+#ifndef UNIV_NONINL
+#include "btr0sea.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/btr0sea.ic b/storage/xtradb/include/btr0sea.ic
new file mode 100644
index 00000000000..beadeeb8d02
--- /dev/null
+++ b/storage/xtradb/include/btr0sea.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0sea.ic
+The index tree adaptive search
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "dict0mem.h"
+#include "btr0cur.h"
+#include "buf0buf.h"
+
+/*********************************************************************//**
+Updates the search info.  Out-of-line path of btr_search_info_update(). */
+UNIV_INTERN
+void
+btr_search_info_update_slow(
+/*========================*/
+	btr_search_t*	info,	/*!< in/out: search info */
+	btr_cur_t*	cursor);/*!< in: cursor which was just positioned */
+
+/********************************************************************//**
+Looks up the adaptive hash search info attached to an index.
+@return	index->search_info (NOTE(review): no latch is acquired here) */
+UNIV_INLINE
+btr_search_t*
+btr_search_get_info(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, asserted non-NULL in debug */
+{
+	btr_search_t*	info;
+	ut_ad(index);
+	info = index->search_info;
+	return(info);
+}
+/*********************************************************************//**
+Counts one more search on the index and, once the per-index counter has
+reached BTR_SEARCH_HASH_ANALYSIS, hands over to the slow-path update.
+The caller must not hold btr_search_latch in either mode (checked only
+when UNIV_SYNC_DEBUG is defined). */
+UNIV_INLINE
+void
+btr_search_info_update(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	btr_cur_t*	cursor)	/*!< in: cursor which was just positioned */
+{
+	btr_search_t*	search_info;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+	ut_ad(!rw_lock_own(&btr_search_latch, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	search_info = btr_search_get_info(index);
+
+	/* Bump the counter first, then compare the updated value. */
+	++search_info->hash_analysis;
+
+	if (search_info->hash_analysis >= BTR_SEARCH_HASH_ANALYSIS) {
+
+		/* Threshold reached: let the slow path do the analysis. */
+		ut_ad(cursor->flag != BTR_CUR_HASH);
+
+		btr_search_info_update_slow(search_info, cursor);
+	}
+}
diff --git a/storage/xtradb/include/btr0types.h b/storage/xtradb/include/btr0types.h
new file mode 100644
index 00000000000..ef4a6b04b34
--- /dev/null
+++ b/storage/xtradb/include/btr0types.h
@@ -0,0 +1,51 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/btr0types.h
+The index tree general types
+
+Created 2/17/1996 Heikki Tuuri
+*************************************************************************/
+
+#ifndef btr0types_h
+#define btr0types_h
+
+#include "univ.i"
+
+#include "rem0types.h"
+#include "page0types.h"
+
+/** Persistent cursor */
+typedef struct btr_pcur_struct		btr_pcur_t;
+/** B-tree cursor */
+typedef struct btr_cur_struct		btr_cur_t;
+/** B-tree search information for the adaptive hash index */
+typedef struct btr_search_struct	btr_search_t;
+
+/** The size of a reference to data stored on a different page.
+The reference is stored at the end of the prefix of the field
+in the index record. */
+#define BTR_EXTERN_FIELD_REF_SIZE	20
+
+/** A BLOB field reference full of zero, for use in assertions and tests.
+Initially, BLOB field references are set to zero, in
+dtuple_convert_big_rec(). */
+extern const byte field_ref_zero[BTR_EXTERN_FIELD_REF_SIZE];
+
+#endif
diff --git a/storage/xtradb/include/buf0buddy.h b/storage/xtradb/include/buf0buddy.h
new file mode 100644
index 00000000000..3a35f8e46e9
--- /dev/null
+++ b/storage/xtradb/include/buf0buddy.h
@@ -0,0 +1,92 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.h
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifndef buf0buddy_h
+#define buf0buddy_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "univ.i"
+#include "buf0types.h"
+
+/**********************************************************************//**
+Allocate a block.  The thread calling this function must hold
+buf_pool_mutex and must not hold buf_pool_zip_mutex or any
+block->mutex.  The buf_pool_mutex may only be released and reacquired
+if lru != NULL.  Only use this for compressed page frames or control
+blocks (buf_page_t); control blocks must be initialized immediately
+after buf_buddy_alloc() returns, before releasing buf_pool_mutex.
+NOTE(review): the inline definition has its buf_pool_mutex_own() assertion
+commented out -- confirm which mutexes the caller must actually hold.
+@return	allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
+	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
+			TRUE if storage was allocated from the LRU list
+			and buf_pool_mutex was temporarily released,
+			or NULL if the LRU list should not be used */
+	ibool	have_page_hash_mutex) /*!< in: forwarded to buf_buddy_alloc_low() */
+	__attribute__((malloc));
+
+/**********************************************************************//**
+Release a block.  Locks LRU_list_mutex and page_hash_latch itself if needed. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+	void*	buf,	/*!< in: block to be freed, must not be
+			pointed to by the buffer pool */
+	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
+	ibool	have_page_hash_mutex) /*!< in: FALSE if those must be taken here */
+	__attribute__((nonnull));
+
+/** Statistics of buddy blocks of a given size; element of buf_buddy_stat[]. */
+struct buf_buddy_stat_struct {
+	/** Number of blocks allocated from the buddy system. */
+	ulint		used;
+	/** Number of blocks relocated by the buddy system. */
+	ib_uint64_t	relocated;
+	/** Total duration of block relocations, in microseconds. */
+	ib_uint64_t	relocated_usec;
+};
+
+/** Statistics of buddy blocks of a given size. */
+typedef struct buf_buddy_stat_struct buf_buddy_stat_t;
+
+/** Statistics of the buddy system, indexed by block size.
+Protected by buf_pool_mutex. */
+extern buf_buddy_stat_t buf_buddy_stat[BUF_BUDDY_SIZES_MAX + 1];
+
+#ifndef UNIV_NONINL
+# include "buf0buddy.ic"
+#endif
+
+#endif /* buf0buddy_h */
diff --git a/storage/xtradb/include/buf0buddy.ic b/storage/xtradb/include/buf0buddy.ic
new file mode 100644
index 00000000000..69659fb69d6
--- /dev/null
+++ b/storage/xtradb/include/buf0buddy.ic
@@ -0,0 +1,143 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buddy.ic
+Binary buddy allocator for compressed pages
+
+Created December 2006 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "buf0buf.h"
+#include "buf0buddy.h"
+#include "ut0ut.h"
+#include "sync0sync.h"
+
+/**********************************************************************//**
+Allocate a block; buf_buddy_alloc() is the inline wrapper.  The caller must
+hold buf_pool_mutex and must not hold buf_pool_zip_mutex or any block->mutex.
+The buf_pool_mutex may only be released and reacquired if lru != NULL.
+@return	allocated block, possibly NULL if lru==NULL */
+UNIV_INTERN
+void*
+buf_buddy_alloc_low(
+/*================*/
+	ulint	i,	/*!< in: index of buf_pool->zip_free[],
+			or BUF_BUDDY_SIZES */
+	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
+			TRUE if storage was allocated from the LRU list
+			and buf_pool_mutex was temporarily released,
+			or NULL if the LRU list should not be used */
+	ibool	have_page_hash_mutex) /*!< in: presumably TRUE if held; TODO confirm */
+	__attribute__((malloc));
+
+/**********************************************************************//**
+Deallocate a block; buf_buddy_free() calls this under zip_free_mutex. */
+UNIV_INTERN
+void
+buf_buddy_free_low(
+/*===============*/
+	void*	buf,	/*!< in: block to be freed, must not be
+			pointed to by the buffer pool */
+	ulint	i,	/*!< in: index of buf_pool->zip_free[],
+			or BUF_BUDDY_SIZES */
+	ibool	have_page_hash_mutex) /*!< in: buf_buddy_free() passes TRUE */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Count how many times BUF_BUDDY_LOW must be doubled to reach at least size.
+@return	index of buf_pool->zip_free[], or BUF_BUDDY_SIZES */
+UNIV_INLINE
+ulint
+buf_buddy_get_slot(
+/*===============*/
+	ulint	size)	/*!< in: block size */
+{
+	ulint	i		= 0;
+	ulint	block_size	= BUF_BUDDY_LOW;
+	while (block_size < size) {
+		block_size <<= 1;
+		i++;
+	}
+	ut_ad(i <= BUF_BUDDY_SIZES);
+	return(i);
+}
+
+/**********************************************************************//**
+Allocate a block of the buddy system by translating size into a
+buf_pool->zip_free[] slot and delegating to buf_buddy_alloc_low().
+The original contract reads: the caller holds buf_pool_mutex and holds
+neither buf_pool_zip_mutex nor any block->mutex; buf_pool_mutex may only
+be released and reacquired if lru != NULL; allocated control blocks
+(buf_page_t) must be initialized right after the call returns.
+NOTE(review): the ut_ad(buf_pool_mutex_own()) check that stood here was
+commented out -- confirm which mutexes the caller must actually hold.
+@return	allocated block, possibly NULL if lru == NULL */
+UNIV_INLINE
+void*
+buf_buddy_alloc(
+/*============*/
+	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
+	ibool*	lru,	/*!< in: pointer to a variable that will be assigned
+			TRUE if storage was allocated from the LRU list
+			and buf_pool_mutex was temporarily released,
+			or NULL if the LRU list should not be used */
+	ibool	have_page_hash_mutex) /*!< in: forwarded to buf_buddy_alloc_low() */
+{
+	const ulint	slot = buf_buddy_get_slot(size);
+
+	return(buf_buddy_alloc_low(slot, lru, have_page_hash_mutex));
+}
+
+/**********************************************************************//**
+Deallocate a block, taking the locks the caller does not hold yet. */
+UNIV_INLINE
+void
+buf_buddy_free(
+/*===========*/
+	void*	buf,	/*!< in: block to be freed, must not be
+			pointed to by the buffer pool */
+	ulint	size,	/*!< in: block size, up to UNIV_PAGE_SIZE */
+	ibool	have_page_hash_mutex) /*!< in: FALSE = take the locks here */
+{
+	const ibool	lock_here = !have_page_hash_mutex;
+
+	/* NOTE(review): ut_ad(buf_pool_mutex_own()) was commented out here. */
+	if (lock_here) {
+		mutex_enter(&LRU_list_mutex);
+		rw_lock_x_lock(&page_hash_latch);
+	}
+	mutex_enter(&zip_free_mutex);
+	/* TRUE: presumably because page_hash_latch is held by now. */
+	buf_buddy_free_low(buf, buf_buddy_get_slot(size), TRUE);
+	mutex_exit(&zip_free_mutex);
+	if (lock_here) {
+		mutex_exit(&LRU_list_mutex);
+		rw_lock_x_unlock(&page_hash_latch);
+	}
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE	UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
new file mode 100644
index 00000000000..e06927f42f0
--- /dev/null
+++ b/storage/xtradb/include/buf0buf.h
@@ -0,0 +1,1574 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.h
+The database buffer pool high-level routines
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0buf_h
+#define buf0buf_h
+
+#include "univ.i"
+#include "fil0fil.h"
+#include "mtr0types.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "ut0rbt.h"
+#ifndef UNIV_HOTBACKUP
+#include "os0proc.h"
+#include "srv0srv.h"
+
+/** @name Modes for buf_page_get_gen */
+/* @{ */
+#define BUF_GET			10	/*!< get always */
+#define	BUF_GET_IF_IN_POOL	11	/*!< get if in pool */
+#define BUF_GET_NO_LATCH	14	/*!< get and bufferfix, but
+					set no latch; we have
+					separated this case, because
+					it is error-prone programming
+					not to set a latch, and it
+					should be used with care */
+/* @} */
+/** @name Modes for buf_page_get_known_nowait */
+/* @{ */
+#define BUF_MAKE_YOUNG	51		/*!< Move the block to the
+					start of the LRU list if there
+					is a danger that the block
+					would drift out of the buffer
+					pool*/
+#define BUF_KEEP_OLD	52		/*!< Preserve the current LRU
+					position of the block. */
+/* @} */
+
+extern buf_pool_t*	buf_pool;	/*!< The buffer pool of the database */
+#ifdef UNIV_DEBUG
+extern ibool		buf_debug_prints;/*!< If this is set TRUE, the program
+					prints info whenever read or flush
+					occurs */
+#endif /* UNIV_DEBUG */
+extern ulint srv_buf_pool_write_requests; /*!< variable to count write request
+					  issued */
+#else /* !UNIV_HOTBACKUP */
+extern buf_block_t*	back_block1;	/*!< first block, for --apply-log */
+extern buf_block_t*	back_block2;	/*!< second block, for page reorganize */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Magic value to use instead of checksums when they are disabled */
+#define BUF_NO_CHECKSUM_MAGIC 0xDEADBEEFUL
+
+/** @brief States of a control block
+@see buf_page_struct
+
+The enumeration values must be 0..7; they are spelled out explicitly below. */
+enum buf_page_state {
+	BUF_BLOCK_ZIP_FREE	= 0,	/*!< contains a free
+					compressed page */
+	BUF_BLOCK_ZIP_PAGE	= 1,	/*!< contains a clean
+					compressed page */
+	BUF_BLOCK_ZIP_DIRTY	= 2,	/*!< contains a compressed
+					page that is in the
+					buf_pool->flush_list */
+
+	BUF_BLOCK_NOT_USED	= 3,	/*!< is in the free list;
+					must be after the BUF_BLOCK_ZIP_
+					constants for compressed-only pages
+					@see buf_block_state_valid() */
+	BUF_BLOCK_READY_FOR_USE	= 4,	/*!< when buf_LRU_get_free_block
+					returns a block, it is in this state */
+	BUF_BLOCK_FILE_PAGE	= 5,	/*!< contains a buffered file page */
+	BUF_BLOCK_MEMORY	= 6,	/*!< contains some main memory
+					object */
+	BUF_BLOCK_REMOVE_HASH	= 7	/*!< hash index should be removed
+					before putting to the free list */
+};
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates the buffer pool.
+@return	own: buf_pool object, NULL if not enough memory or error */
+UNIV_INTERN
+buf_pool_t*
+buf_pool_init(void);
+/*===============*/
+/********************************************************************//**
+Frees the buffer pool at shutdown.  This must not be invoked before
+freeing all mutexes. */
+UNIV_INTERN
+void
+buf_pool_free(void);
+/*===============*/
+
+/********************************************************************//**
+Drops the adaptive hash index.  To prevent a livelock, this function
+is only to be called while holding btr_search_latch and while
+btr_search_enabled == FALSE. */
+UNIV_INTERN
+void
+buf_pool_drop_hash_index(void);
+/*==========================*/
+
+/********************************************************************//**
+Relocate a buffer control block.  Relocates the block on the LRU list
+and in buf_pool->page_hash.  Does not relocate bpage->list.
+The caller must take care of relocating bpage->list. */
+UNIV_INTERN
+void
+buf_relocate(
+/*=========*/
+	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
+				buf_page_get_state(bpage) must be
+				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
+	buf_page_t*	dpage)	/*!< in/out: destination control block */
+	__attribute__((nonnull));
+/********************************************************************//**
+Resizes the buffer pool. */
+UNIV_INTERN
+void
+buf_pool_resize(void);
+/*=================*/
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void);
+/*========================*/
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return	oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void);
+/*==================================*/
+/********************************************************************//**
+Allocates a buffer block.
+@return	own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+	ulint	zip_size);	/*!< in: compressed page size in bytes,
+				or 0 if uncompressed tablespace */
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block);	/*!< in, own: block to be freed */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return	buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+	byte*			buf,	/*!< in: buffer to copy to */
+	const buf_frame_t*	frame);	/*!< in: buffer frame */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+NOTE! The following macros should be used instead of buf_page_get_gen,
+to improve debugging. Only values RW_S_LATCH and RW_X_LATCH are allowed
+in LA! */
+#define buf_page_get(SP, ZS, OF, LA, MTR)	 buf_page_get_gen(\
+				SP, ZS, OF, LA, NULL,\
+				BUF_GET, __FILE__, __LINE__, MTR)
+/**************************************************************//**
+Use these macros to bufferfix a page with no latching. Remember not to
+read the contents of the page unless you know it is safe. Do not modify
+the contents of the page! We have separated this case, because it is
+error-prone programming not to set a latch, and it should be used
+with care. */
+#define buf_page_get_with_no_latch(SP, ZS, OF, MTR)	   buf_page_get_gen(\
+				SP, ZS, OF, RW_NO_LATCH, NULL,\
+				BUF_GET_NO_LATCH, __FILE__, __LINE__, MTR)
+/********************************************************************//**
+This is the general function used to get optimistic access to a database
+page.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_optimistic_get(
+/*====================*/
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+	buf_block_t*	block,	/*!< in: guessed block */
+	ib_uint64_t	modify_clock,/*!< in: modify clock value; NOTE(review):
+				no "mode ..._GUESS_ON_CLOCK" parameter exists here */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mini-transaction */
+/********************************************************************//**
+This is used to get access to a known database page, when no waiting can be
+done.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+buf_page_get_known_nowait(
+/*======================*/
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
+	buf_block_t*	block,	/*!< in: the known page */
+	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mini-transaction */
+
+/*******************************************************************//**
+Given a tablespace id and page number tries to get that page. If the
+page is not in the buffer pool it is not loaded and NULL is returned.
+Suitable for using when holding the kernel mutex. */
+UNIV_INTERN
+const buf_block_t*
+buf_page_try_get_func(
+/*==================*/
+	ulint		space_id,/*!< in: tablespace id */
+	ulint		page_no,/*!< in: page number */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mini-transaction */
+
+/** Tries to get a page. If the page is not in the buffer pool it is
+not loaded.  Usable as an expression; OK while holding the kernel mutex.
+@param space_id	in: tablespace id
+@param page_no	in: page number
+@param mtr	in: mini-transaction
+@return		the page if in buffer pool, NULL if not */
+#define buf_page_try_get(space_id, page_no, mtr)	\
+	buf_page_try_get_func(space_id, page_no, __FILE__, __LINE__, mtr)
+
+/********************************************************************//**
+Get read access to a compressed page (usually of type
+FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
+The page must be released with buf_page_release_zip().
+NOTE: the page is not protected by any latch.  Mutual exclusion has to
+be implemented at a higher level.  In other words, all possible
+accesses to a given page through this function must be protected by
+the same set of mutexes or latches.
+@return	pointer to the block, or NULL if not compressed */
+UNIV_INTERN
+buf_page_t*
+buf_page_get_zip(
+/*=============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size */
+	ulint		offset);/*!< in: page number */
+/********************************************************************//**
+This is the general function used to get access to a database page.
+@return	pointer to the block or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_page_get_gen(
+/*=============*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint		offset,	/*!< in: page number */
+	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH */
+	buf_block_t*	guess,	/*!< in: guessed block or NULL */
+	ulint		mode,	/*!< in: BUF_GET, BUF_GET_IF_IN_POOL,
+				BUF_GET_NO_LATCH */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line where called */
+	mtr_t*		mtr);	/*!< in: mini-transaction */
+/********************************************************************//**
+Initializes a page in the buffer pool. The page is usually not read
+from a file even if it cannot be found in the buffer pool. This is one
+of the functions that perform the block state transition NOT_USED =>
+FILE_PAGE (the other is buf_page_get_gen).
+@return	pointer to the block, page bufferfixed */
+UNIV_INTERN
+buf_block_t*
+buf_page_create(
+/*============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset,	/*!< in: offset of the page within space in units of
+			a page */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Inits a page to the buffer buf_pool, for use in ibbackup --restore. */
+UNIV_INTERN
+void
+buf_page_init_for_backup_restore(
+/*=============================*/
+	ulint		space,	/*!< in: space id */
+	ulint		offset,	/*!< in: offset of the page within space
+				in units of a page */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	buf_block_t*	block);	/*!< in: block to init */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+	buf_page_t*	bpage);		/*!< in: buffer block */
+/********************************************************************//**
+Decrements the bufferfix count of a buffer control block and releases
+a latch, if specified. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		rw_latch,	/*!< in: RW_S_LATCH, RW_X_LATCH,
+					RW_NO_LATCH */
+	mtr_t*		mtr);		/*!< in: mtr */
+/********************************************************************//**
+Moves a page to the start of the buffer pool LRU list. This high-level
+function can be used to prevent an important page from slipping out of
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_make_young(
+/*================*/
+	buf_page_t*	bpage);	/*!< in: buffer block of a file page */
+/********************************************************************//**
+Returns TRUE if the page can be found in the buffer pool hash table.
+
+NOTE that it is possible that the page is not yet read from disk,
+though.
+
+@return	TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+/********************************************************************//**
+Resets the check_index_page_at_flush field of a page if found in the buffer
+pool. */
+UNIV_INTERN
+void
+buf_reset_check_index_page_at_flush(
+/*================================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+#ifdef UNIV_DEBUG_FILE_ACCESSES
+/********************************************************************//**
+Sets file_page_was_freed TRUE if the page is found in the buffer pool.
+This function should be called when we free a file page and want the
+debug version to check that it is not accessed any more unless
+reallocated.
+@return	control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_set_file_page_was_freed(
+/*=============================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+/********************************************************************//**
+Sets file_page_was_freed FALSE if the page is found in the buffer pool.
+NOTE(review): the usage note that stood here was copied verbatim from
+buf_page_set_file_page_was_freed() ("call when we free a file page");
+presumably this variant is for a page being allocated again -- confirm.
+@return	control block if found in page hash table, otherwise NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_reset_file_page_was_freed(
+/*===============================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);	/*!< in: page number */
+#endif /* UNIV_DEBUG_FILE_ACCESSES */
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return	freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+	const buf_page_t*	bpage)	/*!< in: block */
+	__attribute__((pure));
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return	freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+	const buf_block_t*	block)	/*!< in: block */
+	__attribute__((pure));
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return	TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+	const buf_page_t*	bpage);	/*!< in: block to make younger */
+/********************************************************************//**
+Returns the current state of is_hashed of a page. FALSE if the page is
+not in the pool. NOTE that this operation does not fix the page in the
+pool if it is found there.
+@return	TRUE if page hash index is built in search system */
+UNIV_INTERN
+ibool
+buf_page_peek_if_search_hashed(
+/*===========================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: page number */
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return	newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+	const buf_page_t*	bpage);	/*!< in: block containing the
+					page frame */
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must either (1) own
+the buf_pool mutex while the block's bufferfix count is zero, or (2) own an
+x-lock on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return	value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block);	/*!< in: block */
+#else /* !UNIV_HOTBACKUP */
+# define buf_block_modify_clock_inc(block) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Calculates a page checksum which is stored to the page when it is written
+to a file. Note that we must be careful to calculate the same value
+on 32-bit and 64-bit architectures.
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum(
+/*=======================*/
+	const byte*	page);	/*!< in: buffer page */
+UNIV_INTERN
+ulint
+buf_calc_page_new_checksum_32(
+/*==========================*/
+	const byte*	page);	/*!< in: buffer page */
+/********************************************************************//**
+In versions < 4.0.14 and < 4.1.1 there was a bug that the checksum only
+looked at the first few bytes of the page. This calculates that old
+checksum.
+NOTE: we must first store the new formula checksum to
+FIL_PAGE_SPACE_OR_CHKSUM before calculating and storing this old checksum
+because this takes that field as an input!
+@return	checksum */
+UNIV_INTERN
+ulint
+buf_calc_page_old_checksum(
+/*=======================*/
+	const byte*	 page);	/*!< in: buffer page */
+/********************************************************************//**
+Checks if a page is corrupt.
+@return	TRUE if corrupted */
+UNIV_INTERN
+ibool
+buf_page_is_corrupted(
+/*==================*/
+	const byte*	read_buf,	/*!< in: a database page */
+	ulint		zip_size);	/*!< in: size of compressed page;
+					0 for uncompressed pages */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+	const void*	ptr,	/*!< in: pointer to a buffer frame */
+	ulint*		space,	/*!< out: space id */
+	fil_addr_t*	addr);	/*!< out: page offset and byte offset */
+/**********************************************************************//**
+Gets the hash value of a block. This can be used in searches in the
+lock hash table.
+@return	lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+	const buf_block_t*	block)	/*!< in: block */
+	__attribute__((pure));
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Finds a block in the buffer pool that points to a
+given compressed page.
+@return	buffer block pointing to the compressed page, or NULL */
+UNIV_INTERN
+buf_block_t*
+buf_pool_contains_zip(
+/*==================*/
+	const void*	data);	/*!< in: pointer to compressed page */
+#endif /* UNIV_DEBUG */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Validates the buffer pool data structure.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_validate(void);
+/*==============*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/*********************************************************************//**
+Prints info of the buffer pool data structure. */
+UNIV_INTERN
+void
+buf_print(void);
+/*============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Prints a page to stderr. */
+UNIV_INTERN
+void
+buf_page_print(
+/*===========*/
+	const byte*	read_buf,	/*!< in: a database page */
+	ulint		zip_size);	/*!< in: compressed page size, or
+					0 for uncompressed pages */
+/********************************************************************//**
+Decompress a block.
+@return	TRUE if successful */
+UNIV_INTERN
+ibool
+buf_zip_decompress(
+/*===============*/
+	buf_block_t*	block,	/*!< in/out: block */
+	ibool		check);	/*!< in: TRUE=verify the page checksum */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the number of latched pages in the buffer pool.
+@return	number of latched pages */
+UNIV_INTERN
+ulint
+buf_get_latched_pages_number(void);
+/*==============================*/
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Returns the number of pending buf pool ios.
+@return	number of pending I/O operations */
+UNIV_INTERN
+ulint
+buf_get_n_pending_ios(void);
+/*=======================*/
+/*********************************************************************//**
+Prints info of the buffer i/o. */
+UNIV_INTERN
+void
+buf_print_io(
+/*=========*/
+	FILE*	file);	/*!< in: file where to print */
+/*********************************************************************//**
+Returns the ratio in percents of modified pages in the buffer pool /
+database pages in the buffer pool.
+@return	modified page percentage ratio */
+UNIV_INTERN
+ulint
+buf_get_modified_ratio_pct(void);
+/*============================*/
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+buf_refresh_io_stats(void);
+/*======================*/
+/*********************************************************************//**
+Asserts that all file pages in the buffer are in a replaceable state.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_all_freed(void);
+/*===============*/
+/*********************************************************************//**
+Checks that there currently are no pending i/o-operations for the buffer
+pool.
+@return	TRUE if there is no pending i/o */
+UNIV_INTERN
+ibool
+buf_pool_check_no_pending_io(void);
+/*==============================*/
+/*********************************************************************//**
+Invalidates the file pages in the buffer pool when an archive recovery is
+completed. All the file pages buffered must be in a replaceable state when
+this function is called: not latched and not modified. */
+UNIV_INTERN
+void
+buf_pool_invalidate(void);
+/*=====================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/*========================================================================
+--------------------------- LOWER LEVEL ROUTINES -------------------------
+=========================================================================*/
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+	buf_block_t*	block,	/*!< in: buffer page
+				where we have acquired latch */
+	ulint		level);	/*!< in: latching order level */
+#else /* UNIV_SYNC_DEBUG */
+# define buf_block_dbg_add_level(block, level) /* nothing */
+#endif /* UNIV_SYNC_DEBUG */
+/*********************************************************************//**
+Gets the state of a block.
+@return	state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+	const buf_page_t*	bpage);	/*!< in: pointer to the control block */
+/*********************************************************************//**
+Gets the state of a block.
+@return	state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+	buf_page_t*		bpage,	/*!< in/out: pointer to control block */
+	enum buf_page_state	state);	/*!< in: state */
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+	buf_block_t*		block,	/*!< in/out: pointer to control block */
+	enum buf_page_state	state);	/*!< in: state */
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return	TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+	__attribute__((pure));
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return	TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+	__attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return	pointer to mutex protecting bpage */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+	__attribute__((pure));
+
+/*********************************************************************//**
+Gets the mutex of a block and enters it; not pure, it acquires the mutex.
+@return	the mutex of bpage; NOTE(review): confirm whether NULL is possible */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex_enter(
+/*=========================*/
+	const buf_page_t*	bpage);	/*!< in: pointer to control block */
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return	flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+	const buf_page_t*	bpage)	/*!< in: buffer page */
+	__attribute__((pure));
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+	buf_page_t*	bpage,		/*!< in: buffer page */
+	enum buf_flush	flush_type);	/*!< in: flush type */
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+	buf_block_t*		block,	/*!< in/out: pointer to control block */
+	ulint			space,	/*!< in: tablespace id */
+	ulint			page_no);/*!< in: page number */
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return	io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return	io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+	buf_page_t*	bpage,	/*!< in/out: control block */
+	enum buf_io_fix	io_fix);/*!< in: io_fix state */
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+	buf_block_t*	block,	/*!< in/out: control block */
+	enum buf_io_fix	io_fix);/*!< in: io_fix state */
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory.  The block
+can be dirty, but it must not be I/O-fixed or bufferfixed. */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+	const buf_page_t*	bpage)	/*!< control block being relocated */
+	__attribute__((pure));
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return	TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+	const buf_page_t*	bpage)	/*!< in: control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+	buf_page_t*	bpage,	/*!< in/out: control block */
+	ibool		old);	/*!< in: old */
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return	ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+	const buf_page_t*	bpage)	/*!< in: control block */
+	__attribute__((nonnull, pure));
+/*********************************************************************//**
+Flag a block accessed. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+	buf_page_t*	bpage,		/*!< in/out: control block */
+	ulint		time_ms)	/*!< in: ut_time_ms() */
+	__attribute__((nonnull));
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return	control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+	buf_page_t*	bpage)	/*!< in: control block, or NULL */
+	__attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return	pointer to the frame (the non-debug macro yields 0 if block is NULL) */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+#else /* UNIV_DEBUG */
+# define buf_block_get_frame(block) ((block) ? (block)->frame : 0)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets the space id of a block.
+@return	space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the space id of a block.
+@return	space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return	page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the page number of a block.
+@return	page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return	compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return	compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	__attribute__((pure));
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable. */
+#define buf_block_get_page_zip(block) \
+	(UNIV_LIKELY_NULL((block)->page.zip.data) ? &(block)->page.zip : NULL)
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Gets the block to whose frame the pointer is pointing to.
+@return	pointer to block, never NULL */
+UNIV_INTERN
+buf_block_t*
+buf_block_align(
+/*============*/
+	const byte*	ptr);	/*!< in: pointer to a frame */
+/********************************************************************//**
+Find out if a pointer belongs to a buf_block_t. It can be a pointer to
+the buf_block_t itself or a member of it
+@return	TRUE if ptr belongs to a buf_block_t struct */
+UNIV_INTERN
+ibool
+buf_pointer_is_block_field(
+/*=======================*/
+	const void*		ptr);	/*!< in: pointer not
+					dereferenced */
+/** Find out if a pointer corresponds to a buf_block_t::mutex.
+@param m	in: mutex candidate
+@return		TRUE if m is a buf_block_t::mutex */
+#define buf_pool_is_block_mutex(m)			\
+	buf_pointer_is_block_field((const void*)(m))
+/** Find out if a pointer corresponds to a buf_block_t::lock.
+@param l	in: rw-lock candidate
+@return		TRUE if l is a buf_block_t::lock */
+#define buf_pool_is_block_lock(l)			\
+	buf_pointer_is_block_field((const void*)(l))
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return	compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+	const byte*	ptr);	/*!< in: pointer to the page */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/********************************************************************//**
+Function which inits a page for read to the buffer buf_pool. If the page is
+(1) already in buf_pool, or
+(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
+(3) if the space is deleted or being deleted,
+then this function does nothing.
+Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
+on the buffer frame. The io-handler must take care that the flag is cleared
+and the lock released later.
+@return	pointer to the block or NULL */
+UNIV_INTERN
+buf_page_t*
+buf_page_init_for_read(
+/*===================*/
+	ulint*		err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED */
+	ulint		mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ... */
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size, or 0 */
+	ibool		unzip,	/*!< in: TRUE=request uncompressed page */
+	ib_int64_t	tablespace_version,/*!< in: prevents reading from a wrong
+				version of the tablespace in case we have done
+				DISCARD + IMPORT */
+	ulint		offset);/*!< in: page number */
+/********************************************************************//**
+Completes an asynchronous read or write request of a file page to or from
+the buffer pool. */
+UNIV_INTERN
+void
+buf_page_io_complete(
+/*=================*/
+	buf_page_t*	bpage,	/*!< in: pointer to the block in question */
+	trx_t*		trx);	/*!< NOTE(review): undocumented; confirm use */
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return	the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: offset of the page within space */
+	__attribute__((const));
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found.
+@return	block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: offset of the page within space */
+/******************************************************************//**
+Returns the control block of a file page, NULL if not found
+or an uncompressed page frame does not exist.
+@return	block, NULL if not found */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset);/*!< in: offset of the page within space */
+/*********************************************************************//**
+Gets the current length of the free list of buffer blocks.
+@return	length of the free list */
+UNIV_INTERN
+ulint
+buf_get_free_list_len(void);
+/*=======================*/
+#endif /* !UNIV_HOTBACKUP */
+
+
+/** The common buffer control block structure
+for compressed and uncompressed frames */
+
+struct buf_page_struct{
+	/** @name General fields
+	These bit-fields must not be modified without holding
+	buf_page_get_mutex() [buf_block_struct::mutex or
+	buf_pool_zip_mutex], since they can be stored in the same
+	machine word.  Some of these fields are additionally protected
+	by buf_pool_mutex. */
+	/* @{ */
+
+	unsigned	space:32;	/*!< tablespace id; also protected
+					by buf_pool_mutex. */
+	unsigned	offset:32;	/*!< page number; also protected
+					by buf_pool_mutex. */
+
+	unsigned	state:3;	/*!< state of the control block; also
+					protected by buf_pool_mutex.
+					State transitions from
+					BUF_BLOCK_READY_FOR_USE to
+					BUF_BLOCK_MEMORY need not be
+					protected by buf_page_get_mutex().
+					@see enum buf_page_state */
+#ifndef UNIV_HOTBACKUP
+	unsigned	flush_type:2;	/*!< if this block is currently being
+					flushed to disk, this tells the
+					flush_type.
+					@see enum buf_flush */
+	unsigned	io_fix:2;	/*!< type of pending I/O operation;
+					also protected by buf_pool_mutex
+					@see enum buf_io_fix */
+	unsigned	buf_fix_count:25;/*!< number of times this block
+					is currently bufferfixed */
+	/* @} */
+#endif /* !UNIV_HOTBACKUP */
+	page_zip_des_t	zip;		/*!< compressed page; zip.data
+					(but not the data it points to) is
+					also protected by buf_pool_mutex */
+#ifndef UNIV_HOTBACKUP
+	buf_page_t*	hash;		/*!< node used in chaining to
+					buf_pool->page_hash or
+					buf_pool->zip_hash */
+#ifdef UNIV_DEBUG
+	ibool		in_page_hash;	/*!< TRUE if in buf_pool->page_hash */
+	ibool		in_zip_hash;	/*!< TRUE if in buf_pool->zip_hash */
+#endif /* UNIV_DEBUG */
+
+	/** @name Page flushing fields
+	All these are protected by buf_pool_mutex. */
+	/* @{ */
+
+	/* UT_LIST_NODE_T(buf_page_t) list; */
+					/* based on state, this is a
+					list node, protected only by
+					buf_pool_mutex, in one of the
+					following lists in buf_pool:
+
+					- BUF_BLOCK_NOT_USED:	free
+					- BUF_BLOCK_FILE_PAGE:	flush_list
+					- BUF_BLOCK_ZIP_DIRTY:	flush_list
+					- BUF_BLOCK_ZIP_PAGE:	zip_clean
+					- BUF_BLOCK_ZIP_FREE:	zip_free[]
+
+					The contents of the list node
+					is undefined if !in_flush_list
+					&& state == BUF_BLOCK_FILE_PAGE,
+					or if state is one of
+					BUF_BLOCK_MEMORY,
+					BUF_BLOCK_REMOVE_HASH or
+					BUF_BLOCK_READY_FOR_USE. */
+
+	/* the single 'list' node above, resplit for optimistic use */
+	UT_LIST_NODE_T(buf_page_t) free;
+	UT_LIST_NODE_T(buf_page_t) flush_list;
+	UT_LIST_NODE_T(buf_page_t) zip_list; /* zip_clean or zip_free[] */
+#ifdef UNIV_DEBUG
+	ibool		in_flush_list;	/*!< TRUE if in buf_pool->flush_list;
+					when buf_pool_mutex is free, the
+					following should hold: in_flush_list
+					== (state == BUF_BLOCK_FILE_PAGE
+					    || state == BUF_BLOCK_ZIP_DIRTY) */
+	ibool		in_free_list;	/*!< TRUE if in buf_pool->free; when
+					buf_pool_mutex is free, the following
+					should hold: in_free_list
+					== (state == BUF_BLOCK_NOT_USED) */
+#endif /* UNIV_DEBUG */
+	ib_uint64_t	newest_modification;
+					/*!< log sequence number of
+					the youngest modification to
+					this block, zero if not
+					modified */
+	ib_uint64_t	oldest_modification;
+					/*!< log sequence number of
+					the START of the log entry
+					written of the oldest
+					modification to this block
+					which has not yet been flushed
+					on disk; zero if all
+					modifications are on disk */
+	/* @} */
+	/** @name LRU replacement algorithm fields
+	These fields are protected by buf_pool_mutex only (not
+	buf_pool_zip_mutex or buf_block_struct::mutex). */
+	/* @{ */
+
+	UT_LIST_NODE_T(buf_page_t) LRU;
+					/*!< node of the LRU list */
+/* #ifdef UNIV_DEBUG -- guard disabled, field always present */
+	ibool		in_LRU_list;	/*!< TRUE if the page is in
+					the LRU list; used in
+					debugging */
+/* #endif -- UNIV_DEBUG (guard disabled) */
+	unsigned	old:1;		/*!< TRUE if the block is in the old
+					blocks in buf_pool->LRU_old */
+	unsigned	freed_page_clock:31;/*!< the value of
+					buf_pool->freed_page_clock
+					when this block was last
+					put to the head of the
+					LRU list; a thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+	unsigned	access_time:32;	/*!< time of first access, or
+					0 if the block was never accessed
+					in the buffer pool */
+	/* @} */
+	ibool		is_corrupt;	/*!< presumably: page is corrupt; TODO confirm */
+# ifdef UNIV_DEBUG_FILE_ACCESSES
+	ibool		file_page_was_freed;
+					/*!< this is set to TRUE when fsp
+					frees a page in buffer pool */
+# endif /* UNIV_DEBUG_FILE_ACCESSES */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** The buffer control block structure */
+
+struct buf_block_struct{
+
+	/** @name General fields */
+	/* @{ */
+
+	buf_page_t	page;		/*!< page information; this must
+					be the first field, so that
+					buf_pool->page_hash can point
+					to buf_page_t or buf_block_t */
+	byte*		frame;		/*!< pointer to buffer frame which
+					is of size UNIV_PAGE_SIZE, and
+					aligned to an address divisible by
+					UNIV_PAGE_SIZE */
+#ifndef UNIV_HOTBACKUP
+	UT_LIST_NODE_T(buf_block_t) unzip_LRU;
+					/*!< node of the decompressed LRU list;
+					a block is in the unzip_LRU list
+					if page.state == BUF_BLOCK_FILE_PAGE
+					and page.zip.data != NULL */
+/* #ifdef UNIV_DEBUG -- guard disabled, field always present */
+	ibool		in_unzip_LRU_list;/*!< TRUE if the page is in the
+					decompressed LRU list;
+					used in debugging */
+/* #endif -- UNIV_DEBUG (guard disabled) */
+	mutex_t		mutex;		/*!< mutex protecting this block:
+					state (also protected by the buffer
+					pool mutex), io_fix, buf_fix_count, and
+					accessed (presumably page.access_time);
+					introduced in InnoDB-5.1 to relieve
+					contention on the buffer pool mutex */
+	rw_lock_t	lock;		/*!< read-write lock of the buffer
+					frame */
+	unsigned	lock_hash_val:32;/*!< hashed value of the page address
+					in the record lock hash table;
+					protected by buf_block_t::lock
+					(or buf_block_t::mutex, buf_pool_mutex
+					in buf_page_get_gen(),
+					buf_page_init_for_read()
+					and buf_page_create()) */
+	ibool		check_index_page_at_flush;
+					/*!< TRUE if we know that this is
+					an index page, and want the database
+					to check its consistency before flush;
+					note that there may be pages in the
+					buffer pool which are index pages,
+					but this flag is not set because
+					we do not keep track of all pages;
+					NOT protected by any mutex */
+	/* @} */
+	/** @name Optimistic search field */
+	/* @{ */
+
+	ib_uint64_t	modify_clock;	/*!< this clock is incremented every
+					time a pointer to a record on the
+					page may become obsolete; this is
+					used in the optimistic cursor
+					positioning: if the modify clock has
+					not changed, we know that the pointer
+					is still valid; this field may be
+					changed if the thread (1) owns the
+					pool mutex and the page is not
+					bufferfixed, or (2) the thread has an
+					x-latch on the block */
+	/* @} */
+	/** @name Hash search fields (unprotected)
+	NOTE that these fields are NOT protected by any semaphore! */
+	/* @{ */
+
+	ulint		n_hash_helps;	/*!< counter which controls building
+					of a new hash index for the page */
+	ulint		n_fields;	/*!< recommended prefix length for hash
+					search: number of full fields */
+	ulint		n_bytes;	/*!< recommended prefix: number of bytes
+					in an incomplete field */
+	ibool		left_side;	/*!< TRUE or FALSE, depending on
+					whether the leftmost record of several
+					records with the same prefix should be
+					indexed in the hash index */
+	/* @} */
+
+	/** @name Hash search fields
+	These 6 fields may only be modified when we have
+	an x-latch on btr_search_latch AND
+	- we are holding an s-latch or x-latch on buf_block_struct::lock or
+	- we know that buf_block_struct::buf_fix_count == 0.
+
+	An exception to this is when we init or create a page
+	in the buffer pool in buf0buf.c. */
+
+	/* @{ */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	ulint		n_pointers;	/*!< used in debugging: the number of
+					pointers in the adaptive hash index
+					pointing to this frame */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	unsigned	is_hashed:1;	/*!< TRUE if hash index has
+					already been built on this
+					page; note that it does not
+					guarantee that the index is
+					complete, though: there may
+					have been hash collisions,
+					record deletions, etc. */
+	unsigned	curr_n_fields:10;/*!< prefix length for hash indexing:
+					number of full fields */
+	unsigned	curr_n_bytes:15;/*!< number of bytes in hash
+					indexing */
+	unsigned	curr_left_side:1;/*!< TRUE or FALSE in hash indexing */
+	dict_index_t*	index;		/*!< Index for which the adaptive
+					hash index has been created. */
+	/* @} */
+# ifdef UNIV_SYNC_DEBUG
+	/** @name Debug fields */
+	/* @{ */
+	rw_lock_t	debug_latch;	/*!< in the debug version, each thread
+					which bufferfixes the block acquires
+					an s-latch here; so we can use the
+					debug utilities in sync0rw */
+	/* @} */
+# endif
+#endif /* !UNIV_HOTBACKUP */
+};
+
+/** Check if a buf_block_t object is in a valid state
+@param block	buffer block (the macro evaluates this argument twice)
+@return		TRUE if state is in BUF_BLOCK_NOT_USED..BUF_BLOCK_REMOVE_HASH */
+#define buf_block_state_valid(block)				\
+(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED		\
+ && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH))
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Compute the hash fold value for blocks in buf_pool->zip_hash. */
+/* @{ */
+/* with srv_buffer_pool_shm_key set, fold relative to the first frame */
+#define BUF_POOL_ZIP_FOLD_PTR(ptr) (!srv_buffer_pool_shm_key\
+					?((ulint) (ptr) / UNIV_PAGE_SIZE)\
+					:((ulint) ((byte*)(ptr) - (byte*)(buf_pool->chunks->blocks->frame)) / UNIV_PAGE_SIZE))
+#define BUF_POOL_ZIP_FOLD(b) BUF_POOL_ZIP_FOLD_PTR((b)->frame)
+#define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b))
+/* @} */
+
+/** A chunk of buffers.  The buffer pool is allocated in chunks. */
+struct buf_chunk_struct{
+	ulint		mem_size;	/*!< allocated size of the chunk */
+	ulint		size;		/*!< size of blocks[] and of the frames in mem */
+	void*		mem;		/*!< pointer to the memory area which
+					was allocated for the frames */
+	buf_block_t*	blocks;		/*!< array of buffer control blocks */
+};
+
+/** @brief The buffer pool statistics structure. */
+struct buf_pool_stat_struct{
+	ulint	n_page_gets;	/*!< number of page gets performed;
+				successful searches through
+				the adaptive hash index are also
+				counted as page gets; this field
+				is NOT protected by the buffer
+				pool mutex */
+	ulint	n_pages_read;	/*!< number of read operations */
+	ulint	n_pages_written;/*!< number of write operations */
+	ulint	n_pages_created;/*!< number of pages created
+				in the pool with no read */
+	ulint	n_ra_pages_read;/*!< number of pages read in
+				as part of read ahead */
+	ulint	n_ra_pages_evicted;/*!< number of read ahead
+				pages that are evicted without
+				being accessed */
+	ulint	n_pages_made_young; /*!< number of pages made young, in
+				calls to buf_LRU_make_block_young() */
+	ulint	n_pages_not_made_young; /*!< number of pages not made
+				young because the first access
+				was not long enough ago, in
+				buf_page_peek_if_too_old() */
+};
+
+/** @brief The buffer pool structure.
+
+NOTE! The definition appears here only for other modules of this
+directory (buf) to see it. Do not use from outside! */
+
+struct buf_pool_struct{
+
+	/** @name General fields */
+	/* @{ */
+
+	ulint		n_chunks;	/*!< number of buffer pool chunks */
+	buf_chunk_t*	chunks;		/*!< buffer pool chunks */
+	ulint		curr_size;	/*!< current pool size in pages */
+	hash_table_t*	page_hash;	/*!< hash table of buf_page_t or
+					buf_block_t file pages,
+					buf_page_in_file() == TRUE,
+					indexed by (space_id, offset) */
+	hash_table_t*	zip_hash;	/*!< hash table of buf_block_t blocks
+					whose frames are allocated to the
+					zip buddy system,
+					indexed by block->frame */
+	ulint		n_pend_reads;	/*!< number of pending read operations */
+	ulint		n_pend_unzip;	/*!< number of pending decompressions */
+
+	time_t		last_printout_time;
+					/*!< time when buf_print_io was last
+					called */
+	buf_pool_stat_t	stat;		/*!< current statistics */
+	buf_pool_stat_t	old_stat;	/*!< old statistics */
+
+	/* @} */
+
+	/** @name Page flushing algorithm fields */
+
+	/* @{ */
+
+	UT_LIST_BASE_NODE_T(buf_page_t) flush_list;
+					/*!< base node of the modified block
+					list */
+	ibool		init_flush[BUF_FLUSH_N_TYPES];
+					/*!< this is TRUE when a flush of the
+					given type is being initialized */
+	ulint		n_flush[BUF_FLUSH_N_TYPES];
+					/*!< this is the number of pending
+					writes in the given flush type */
+	os_event_t	no_flush[BUF_FLUSH_N_TYPES];
+					/*!< this is in the set state
+					when there is no flush batch
+					of the given type running */
+	ib_rbt_t*	flush_rbt;	/*!< a red-black tree is used
+					exclusively during recovery to
+					speed up insertions in the
+					flush_list. This tree contains
+					blocks in order of
+					oldest_modification LSN and is
+					kept in sync with the
+					flush_list.
+					Each member of the tree MUST
+					also be on the flush_list.
+					This tree is relevant only in
+					recovery and is set to NULL
+					once the recovery is over. */
+	ulint		freed_page_clock;/*!< a sequence number used
+					to count the number of buffer
+					blocks removed from the end of
+					the LRU list; NOTE that this
+					counter may wrap around at 4
+					billion! A thread is allowed
+					to read this for heuristic
+					purposes without holding any
+					mutex or latch */
+	ulint		LRU_flush_ended;/*!< when an LRU flush ends for a page,
+					this is incremented by one; this is
+					set to zero when a buffer block is
+					allocated */
+
+	/* @} */
+	/** @name LRU replacement algorithm fields */
+	/* @{ */
+
+	UT_LIST_BASE_NODE_T(buf_page_t) free;
+					/*!< base node of the free
+					block list */
+	UT_LIST_BASE_NODE_T(buf_page_t) LRU;
+					/*!< base node of the LRU list */
+	buf_page_t*	LRU_old;	/*!< pointer to the about
+					buf_LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
+					oldest blocks in the LRU list;
+					NULL if LRU length less than
+					BUF_LRU_OLD_MIN_LEN;
+					NOTE: when LRU_old != NULL, its length
+					should always equal LRU_old_len */
+	ulint		LRU_old_len;	/*!< length of the LRU list from
+					the block to which LRU_old points
+					onward, including that block;
+					see buf0lru.c for the restrictions
+					on this value; 0 if LRU_old == NULL;
+					NOTE: LRU_old_len must be adjusted
+					whenever LRU_old shrinks or grows! */
+
+	UT_LIST_BASE_NODE_T(buf_block_t) unzip_LRU;
+					/*!< base node of the
+					unzip_LRU list */
+
+	/* @} */
+	/** @name Buddy allocator fields
+	The buddy allocator is used for allocating compressed page
+	frames and buf_page_t descriptors of blocks that exist
+	in the buffer pool only in compressed form. */
+	/* @{ */
+	UT_LIST_BASE_NODE_T(buf_page_t)	zip_clean;
+					/*!< unmodified compressed pages */
+	UT_LIST_BASE_NODE_T(buf_page_t) zip_free[BUF_BUDDY_SIZES_MAX];
+					/*!< buddy free lists */
+//#if BUF_BUDDY_HIGH != UNIV_PAGE_SIZE
+//# error "BUF_BUDDY_HIGH != UNIV_PAGE_SIZE"
+//#endif
+#if BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE
+# error "BUF_BUDDY_LOW > PAGE_ZIP_MIN_SIZE"
+#endif
+	/* @} */
+};
+
+/** mutex protecting the buffer pool struct and control blocks, except the
+read-write lock in them */
+extern mutex_t	buf_pool_mutex;
+extern mutex_t	LRU_list_mutex;
+extern mutex_t	flush_list_mutex;
+extern rw_lock_t	page_hash_latch;
+extern mutex_t	free_list_mutex;
+extern mutex_t	zip_free_mutex;
+extern mutex_t	zip_hash_mutex;
+/** mutex protecting the control blocks of compressed-only pages
+(of type buf_page_t, not buf_block_t) */
+extern mutex_t	buf_pool_zip_mutex;
+
+/** @name Accessors for buf_pool_mutex.
+Use these instead of accessing buf_pool_mutex directly. */
+/* @{ */
+
+/** Test if buf_pool_mutex is owned. */
+#define buf_pool_mutex_own() mutex_own(&buf_pool_mutex)
+/** Acquire the buffer pool mutex. */
+#define buf_pool_mutex_enter() do {		\
+	ut_ad(!mutex_own(&buf_pool_zip_mutex));	\
+	mutex_enter(&buf_pool_mutex);		\
+} while (0)
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/** Flag to forbid the release of the buffer pool mutex.
+Protected by buf_pool_mutex. */
+extern ulint	buf_pool_mutex_exit_forbidden;
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() do {	\
+	ut_ad(buf_pool_mutex_own());		\
+	buf_pool_mutex_exit_forbidden++;	\
+} while (0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() do {	\
+	ut_ad(buf_pool_mutex_own());		\
+	ut_a(buf_pool_mutex_exit_forbidden);	\
+	buf_pool_mutex_exit_forbidden--;	\
+} while (0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() do {		\
+	ut_a(!buf_pool_mutex_exit_forbidden);	\
+	mutex_exit(&buf_pool_mutex);		\
+} while (0)
+#else
+/** Forbid the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_forbid() ((void) 0)
+/** Allow the release of the buffer pool mutex. */
+# define buf_pool_mutex_exit_allow() ((void) 0)
+/** Release the buffer pool mutex. */
+# define buf_pool_mutex_exit() mutex_exit(&buf_pool_mutex)
+#endif
+#endif /* !UNIV_HOTBACKUP */
+/* @} */
+
+/**********************************************************************
+Let us list the consistency conditions for different control block states.
+
+NOT_USED:	is in free list, not in LRU list, not in flush list, nor
+		page hash table
+READY_FOR_USE:	is not in free list, LRU list, or flush list, nor page
+		hash table
+MEMORY:		is not in free list, LRU list, or flush list, nor page
+		hash table
+FILE_PAGE:	space and offset are defined, is in page hash table
+		if io_fix == BUF_IO_WRITE,
+			pool: no_flush[flush_type] is in reset state,
+			pool: n_flush[flush_type] > 0
+
+		(1) if buf_fix_count == 0, then
+			is in LRU list, not in free list
+			is in flush list,
+				if and only if oldest_modification > 0
+			is x-locked,
+				if and only if io_fix == BUF_IO_READ
+			is s-locked,
+				if and only if io_fix == BUF_IO_WRITE
+
+		(2) if buf_fix_count > 0, then
+			is not in LRU list, not in free list
+			is in flush list,
+				if and only if oldest_modification > 0
+			if io_fix == BUF_IO_READ,
+				is x-locked
+			if io_fix == BUF_IO_WRITE,
+				is s-locked
+
+State transitions:
+
+NOT_USED => READY_FOR_USE
+READY_FOR_USE => MEMORY
+READY_FOR_USE => FILE_PAGE
+MEMORY => NOT_USED
+FILE_PAGE => NOT_USED	NOTE: This transition is allowed if and only if
+				(1) buf_fix_count == 0,
+				(2) oldest_modification == 0, and
+				(3) io_fix == 0.
+*/
+
+#ifndef UNIV_NONINL
+#include "buf0buf.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
new file mode 100644
index 00000000000..93cc68e7fc9
--- /dev/null
+++ b/storage/xtradb/include/buf0buf.ic
@@ -0,0 +1,1126 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0buf.ic
+The database buffer buf_pool
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "buf0rea.h"
+#include "srv0srv.h"
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return	freed_page_clock */
+UNIV_INLINE
+ulint
+buf_page_get_freed_page_clock(
+/*==========================*/
+	const buf_page_t*	bpage)	/*!< in: block */
+{
+	/* This is sometimes read without holding buf_pool_mutex. */
+	return(bpage->freed_page_clock);
+}
+
+/********************************************************************//**
+Reads the freed_page_clock of a buffer block.
+@return	freed_page_clock */
+UNIV_INLINE
+ulint
+buf_block_get_freed_page_clock(
+/*===========================*/
+	const buf_block_t*	block)	/*!< in: block */
+{
+	return(buf_page_get_freed_page_clock(&block->page));
+}
+
+/********************************************************************//**
+Recommends a move of a block to the start of the LRU list if there is danger
+of dropping from the buffer pool. NOTE: does not reserve the buffer pool
+mutex.
+@return	TRUE if should be made younger */
+UNIV_INLINE
+ibool
+buf_page_peek_if_too_old(
+/*=====================*/
+	const buf_page_t*	bpage)	/*!< in: block to make younger */
+{
+	if (UNIV_UNLIKELY(buf_pool->freed_page_clock == 0)) {
+		/* If eviction has not started yet, do not update the
+		statistics or move blocks in the LRU list.  This is
+		either the warm-up phase or an in-memory workload. */
+		return(FALSE);
+	} else if (buf_LRU_old_threshold_ms && bpage->old) {
+		unsigned	access_time = buf_page_is_accessed(bpage);
+
+		if (access_time > 0
+		    && ((ib_uint32_t) (ut_time_ms() - access_time))
+		    >= buf_LRU_old_threshold_ms) {
+			return(TRUE);
+		}
+
+		buf_pool->stat.n_pages_not_made_young++;
+		return(FALSE);
+	} else {
+		/* FIXME: bpage->freed_page_clock is 31 bits */
+		return((buf_pool->freed_page_clock & ((1UL << 31) - 1))
+		       > ((ulint) bpage->freed_page_clock
+			  + (buf_pool->curr_size
+			     * (BUF_LRU_OLD_RATIO_DIV - buf_LRU_old_ratio)
+			     / (BUF_LRU_OLD_RATIO_DIV * 4))));
+	}
+}
+
+/*********************************************************************//**
+Gets the current size of buffer buf_pool in bytes.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+buf_pool_get_curr_size(void)
+/*========================*/
+{
+	return(buf_pool->curr_size * UNIV_PAGE_SIZE);
+}
+
+/********************************************************************//**
+Gets the smallest oldest_modification lsn for any page in the pool. Returns
+zero if all modified pages have been flushed to disk.
+@return	oldest modification in pool, zero if none */
+UNIV_INLINE
+ib_uint64_t
+buf_pool_get_oldest_modification(void)
+/*==================================*/
+{
+	buf_page_t*	bpage;
+	ib_uint64_t	lsn;
+
+try_again:
+	/* flush_list_mutex is taken here in place of buf_pool_mutex. */
+	mutex_enter(&flush_list_mutex);
+
+	bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+	if (bpage == NULL) {
+		lsn = 0;
+	} else {
+		ut_ad(bpage->in_flush_list);
+		lsn = bpage->oldest_modification;
+		if (lsn == 0) {
+			mutex_exit(&flush_list_mutex);
+			goto try_again;	/* retry until the last page's lsn is nonzero */
+		}
+	}
+
+	/* Only flush_list_mutex is held at this point; release it. */
+	mutex_exit(&flush_list_mutex);
+
+	/* The returned answer may be out of date: the flush_list can
+	change after the mutex has been released. */
+
+	return(lsn);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Gets the state of a block.
+@return	state */
+UNIV_INLINE
+enum buf_page_state
+buf_page_get_state(
+/*===============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+{
+	enum buf_page_state	state = (enum buf_page_state) bpage->state;
+
+#ifdef UNIV_DEBUG
+	switch (state) {
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_FILE_PAGE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break;
+	default:
+		ut_error;
+	}
+#endif /* UNIV_DEBUG */
+
+	return(state);
+}
+/*********************************************************************//**
+Gets the state of a block.
+@return	state */
+UNIV_INLINE
+enum buf_page_state
+buf_block_get_state(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	return(buf_page_get_state(&block->page));
+}
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_page_set_state(
+/*===============*/
+	buf_page_t*		bpage,	/*!< in/out: pointer to control block */
+	enum buf_page_state	state)	/*!< in: state */
+{
+#ifdef UNIV_DEBUG
+	enum buf_page_state	old_state	= buf_page_get_state(bpage);
+
+	switch (old_state) {
+	case BUF_BLOCK_ZIP_FREE:
+		ut_error;
+		break;
+	case BUF_BLOCK_ZIP_PAGE:
+		ut_a(state == BUF_BLOCK_ZIP_DIRTY);
+		break;
+	case BUF_BLOCK_ZIP_DIRTY:
+		ut_a(state == BUF_BLOCK_ZIP_PAGE);
+		break;
+	case BUF_BLOCK_NOT_USED:
+		ut_a(state == BUF_BLOCK_READY_FOR_USE);
+		break;
+	case BUF_BLOCK_READY_FOR_USE:
+		ut_a(state == BUF_BLOCK_MEMORY
+		     || state == BUF_BLOCK_FILE_PAGE
+		     || state == BUF_BLOCK_NOT_USED);
+		break;
+	case BUF_BLOCK_MEMORY:
+		ut_a(state == BUF_BLOCK_NOT_USED);
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+		ut_a(state == BUF_BLOCK_NOT_USED
+		     || state == BUF_BLOCK_REMOVE_HASH);
+		break;
+	case BUF_BLOCK_REMOVE_HASH:
+		ut_a(state == BUF_BLOCK_MEMORY);
+		break;
+	}
+#endif /* UNIV_DEBUG */
+	bpage->state = state;
+	ut_ad(buf_page_get_state(bpage) == state);
+}
+
+/*********************************************************************//**
+Sets the state of a block. */
+UNIV_INLINE
+void
+buf_block_set_state(
+/*================*/
+	buf_block_t*		block,	/*!< in/out: pointer to control block */
+	enum buf_page_state	state)	/*!< in: state */
+{
+	buf_page_set_state(&block->page, state);
+}
+
+/*********************************************************************//**
+Determines if a block is mapped to a tablespace.
+@return	TRUE if mapped */
+UNIV_INLINE
+ibool
+buf_page_in_file(
+/*=============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+{
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_FREE:
+		/* This is a free page in buf_pool->zip_free[].
+		Such pages should only be accessed by the buddy allocator. */
+		/* ut_error; */ /* optimistic */
+		break;
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_FILE_PAGE:
+		return(TRUE);
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break;
+	}
+
+	return(FALSE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Determines if a block should be on unzip_LRU list.
+@return	TRUE if block belongs to unzip_LRU */
+UNIV_INLINE
+ibool
+buf_page_belongs_to_unzip_LRU(
+/*==========================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+{
+	ut_ad(buf_page_in_file(bpage));
+
+	return(bpage->zip.data
+	       && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
+}
+
+/*********************************************************************//**
+Gets the mutex of a block.
+@return	pointer to mutex protecting bpage, NULL if BUF_BLOCK_ZIP_FREE */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex(
+/*===============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+{
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_FREE:
+		/* ut_error; */ /* optimistic */
+		return(NULL);
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+		return(&buf_pool_zip_mutex);
+	default:
+		return(&((buf_block_t*) bpage)->mutex);
+	}
+}
+
+/*********************************************************************//**
+Gets and enters the mutex of a block. @return mutex entered, or NULL */
+UNIV_INLINE
+mutex_t*
+buf_page_get_mutex_enter(
+/*=========================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to control block */
+{
+	mutex_t*	block_mutex;
+
+	while(1) {
+		block_mutex = buf_page_get_mutex(bpage);
+		if (!block_mutex)
+			return block_mutex;	/* buf_page_get_mutex() found none */
+
+		mutex_enter(block_mutex);
+		if (block_mutex == buf_page_get_mutex(bpage))
+			return block_mutex;	/* returned with the mutex held */
+		mutex_exit(block_mutex);	/* mutex changed while waiting: retry */
+	}
+}
+
+/*********************************************************************//**
+Get the flush type of a page.
+@return	flush type */
+UNIV_INLINE
+enum buf_flush
+buf_page_get_flush_type(
+/*====================*/
+	const buf_page_t*	bpage)	/*!< in: buffer page */
+{
+	enum buf_flush	flush_type = (enum buf_flush) bpage->flush_type;
+
+#ifdef UNIV_DEBUG
+	switch (flush_type) {
+	case BUF_FLUSH_LRU:
+	case BUF_FLUSH_SINGLE_PAGE:
+	case BUF_FLUSH_LIST:
+		return(flush_type);
+	case BUF_FLUSH_N_TYPES:
+		break;
+	}
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(flush_type);
+}
+/*********************************************************************//**
+Set the flush type of a page. */
+UNIV_INLINE
+void
+buf_page_set_flush_type(
+/*====================*/
+	buf_page_t*	bpage,		/*!< in/out: buffer page */
+	enum buf_flush	flush_type)	/*!< in: flush type */
+{
+	bpage->flush_type = flush_type;
+	ut_ad(buf_page_get_flush_type(bpage) == flush_type);
+}
+
+/*********************************************************************//**
+Map a block to a file page. */
+UNIV_INLINE
+void
+buf_block_set_file_page(
+/*====================*/
+	buf_block_t*		block,	/*!< in/out: pointer to control block */
+	ulint			space,	/*!< in: tablespace id */
+	ulint			page_no)/*!< in: page number */
+{
+	buf_block_set_state(block, BUF_BLOCK_FILE_PAGE);
+	block->page.space = space;
+	block->page.offset = page_no;
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return	io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_page_get_io_fix(
+/*================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+{
+	enum buf_io_fix	io_fix = (enum buf_io_fix) bpage->io_fix;
+#ifdef UNIV_DEBUG
+	switch (io_fix) {
+	case BUF_IO_NONE:
+	case BUF_IO_READ:
+	case BUF_IO_WRITE:
+		return(io_fix);
+	}
+	ut_error;
+#endif /* UNIV_DEBUG */
+	return(io_fix);
+}
+
+/*********************************************************************//**
+Gets the io_fix state of a block.
+@return	io_fix state */
+UNIV_INLINE
+enum buf_io_fix
+buf_block_get_io_fix(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	return(buf_page_get_io_fix(&block->page));
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_page_set_io_fix(
+/*================*/
+	buf_page_t*	bpage,	/*!< in/out: control block */
+	enum buf_io_fix	io_fix)	/*!< in: io_fix state */
+{
+	/* The mutex of the block is asserted here, not buf_pool_mutex. */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	bpage->io_fix = io_fix;
+	ut_ad(buf_page_get_io_fix(bpage) == io_fix);
+}
+
+/*********************************************************************//**
+Sets the io_fix state of a block. */
+UNIV_INLINE
+void
+buf_block_set_io_fix(
+/*=================*/
+	buf_block_t*	block,	/*!< in/out: control block */
+	enum buf_io_fix	io_fix)	/*!< in: io_fix state */
+{
+	buf_page_set_io_fix(&block->page, io_fix);
+}
+
+/********************************************************************//**
+Determine if a buffer block can be relocated in memory.  The block can be
+dirty, but it must not be I/O-fixed or bufferfixed.
+@return	TRUE if in the LRU list and neither I/O-fixed nor bufferfixed */
+UNIV_INLINE
+ibool
+buf_page_can_relocate(
+/*==================*/
+	const buf_page_t*	bpage)	/*!< in: control block being relocated */
+{
+	/* The mutex of the block is asserted here, not buf_pool_mutex. */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+	ut_ad(buf_page_in_file(bpage));
+	/* Optimistic: in_LRU_list is not asserted; it is tested as part
+	of the returned condition instead. */
+
+	return(bpage->in_LRU_list && bpage->io_fix == BUF_IO_NONE
+	       && bpage->buf_fix_count == 0);
+}
+
+/*********************************************************************//**
+Determine if a block has been flagged old.
+@return	TRUE if old */
+UNIV_INLINE
+ibool
+buf_page_is_old(
+/*============*/
+	const buf_page_t*	bpage)	/*!< in: control block */
+{
+	ut_ad(buf_page_in_file(bpage));
+	/* No mutex ownership is asserted: this is also used optimistically. */
+
+	return(bpage->old);
+}
+
+/*********************************************************************//**
+Flag a block old. */
+UNIV_INLINE
+void
+buf_page_set_old(
+/*=============*/
+	buf_page_t*	bpage,	/*!< in/out: control block */
+	ibool		old)	/*!< in: old */
+{
+	ut_a(buf_page_in_file(bpage));
+	/* LRU_list_mutex is asserted here in place of buf_pool_mutex. */
+	ut_ad(mutex_own(&LRU_list_mutex));
+	ut_ad(bpage->in_LRU_list);
+
+#ifdef UNIV_LRU_DEBUG
+	ut_a((buf_pool->LRU_old_len == 0) == (buf_pool->LRU_old == NULL));
+	/* If a block is flagged "old", the LRU_old list must exist. */
+	ut_a(!old || buf_pool->LRU_old);
+
+	if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) {
+		const buf_page_t*	prev = UT_LIST_GET_PREV(LRU, bpage);
+		const buf_page_t*	next = UT_LIST_GET_NEXT(LRU, bpage);
+		if (prev->old == next->old) {
+			ut_a(prev->old == old);
+		} else {
+			ut_a(!prev->old);
+			ut_a(buf_pool->LRU_old == (old ? bpage : next));
+		}
+	}
+#endif /* UNIV_LRU_DEBUG */
+
+	bpage->old = old;
+}
+
+/*********************************************************************//**
+Determine the time of first access of a block in the buffer pool.
+@return	ut_time_ms() at the time of first access, 0 if not accessed */
+UNIV_INLINE
+unsigned
+buf_page_is_accessed(
+/*=================*/
+	const buf_page_t*	bpage)	/*!< in: control block */
+{
+	ut_ad(buf_page_in_file(bpage));
+
+	return(bpage->access_time);
+}
+
+/*********************************************************************//**
+Flag a block accessed; only the time of the first access is recorded. */
+UNIV_INLINE
+void
+buf_page_set_accessed(
+/*==================*/
+	buf_page_t*	bpage,		/*!< in/out: control block */
+	ulint		time_ms)	/*!< in: ut_time_ms() */
+{
+	ut_a(buf_page_in_file(bpage));
+	/* The mutex of the block is asserted here, not buf_pool_mutex. */
+	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
+
+	if (!bpage->access_time) {
+		/* Make this the time of the first access. */
+		bpage->access_time = time_ms;
+	}
+}
+
+/*********************************************************************//**
+Gets the buf_block_t handle of a buffered file block if an uncompressed
+page frame exists, or NULL.
+@return	control block, or NULL */
+UNIV_INLINE
+buf_block_t*
+buf_page_get_block(
+/*===============*/
+	buf_page_t*	bpage)	/*!< in: control block, or NULL */
+{
+	if (UNIV_LIKELY(bpage != NULL)) {
+		ut_ad(buf_page_in_file(bpage));
+
+		if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
+			return((buf_block_t*) bpage);
+		}
+	}
+
+	return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block.
+@return	pointer to the frame; NULL if !block and srv_pass_corrupt_table */
+UNIV_INLINE
+buf_frame_t*
+buf_block_get_frame(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	ut_a(srv_pass_corrupt_table || block);
+
+	if (srv_pass_corrupt_table && !block) {
+		return(0);
+	}
+
+	ut_ad(block);
+
+	switch (buf_block_get_state(block)) {
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+	case BUF_BLOCK_NOT_USED:
+		ut_error;
+		break;
+	case BUF_BLOCK_FILE_PAGE:
+# ifndef UNIV_HOTBACKUP
+		ut_a(block->page.buf_fix_count > 0);
+# endif /* !UNIV_HOTBACKUP */
+		/* fall through */
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		goto ok;
+	}
+	ut_error;
+ok:
+	return((buf_frame_t*) block->frame);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return	space id */
+UNIV_INLINE
+ulint
+buf_page_get_space(
+/*===============*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+{
+	ut_ad(bpage);
+	ut_a(buf_page_in_file(bpage));
+
+	return(bpage->space);
+}
+
+/*********************************************************************//**
+Gets the space id of a block.
+@return	space id */
+UNIV_INLINE
+ulint
+buf_block_get_space(
+/*================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	ut_ad(block);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	return(block->page.space);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return	page number */
+UNIV_INLINE
+ulint
+buf_page_get_page_no(
+/*=================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+{
+	ut_ad(bpage);
+	ut_a(buf_page_in_file(bpage));
+
+	return(bpage->offset);
+}
+
+/*********************************************************************//**
+Gets the page number of a block.
+@return	page number */
+UNIV_INLINE
+ulint
+buf_block_get_page_no(
+/*==================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	ut_ad(block);
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+
+	return(block->page.offset);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return	compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_page_get_zip_size(
+/*==================*/
+	const buf_page_t*	bpage)	/*!< in: pointer to the control block */
+{
+	return(bpage->zip.ssize ? 512 << bpage->zip.ssize : 0);
+}
+
+/*********************************************************************//**
+Gets the compressed page size of a block.
+@return	compressed page size, or 0 */
+UNIV_INLINE
+ulint
+buf_block_get_zip_size(
+/*===================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
+	return(block->page.zip.ssize ? 512 << block->page.zip.ssize : 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/*********************************************************************//**
+Gets the compressed page descriptor corresponding to an uncompressed page
+if applicable.
+@return	compressed page descriptor, or NULL */
+UNIV_INLINE
+const page_zip_des_t*
+buf_frame_get_page_zip(
+/*===================*/
+	const byte*	ptr)	/*!< in: pointer to the page */
+{
+	return(buf_block_get_page_zip(buf_block_align(ptr)));
+}
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Gets the space id, page offset, and byte offset within page of a
+pointer pointing to a buffer frame containing a file page. */
+UNIV_INLINE
+void
+buf_ptr_get_fsp_addr(
+/*=================*/
+	const void*	ptr,	/*!< in: pointer to a buffer frame */
+	ulint*		space,	/*!< out: space id */
+	fil_addr_t*	addr)	/*!< out: page offset and byte offset */
+{
+	const page_t*	page = (const page_t*) ut_align_down(ptr,
+							     UNIV_PAGE_SIZE);
+
+	*space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	addr->page = mach_read_from_4(page + FIL_PAGE_OFFSET);
+	addr->boffset = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Gets the hash value of the page the pointer is pointing to. This can be used
+in searches in the lock hash table.
+@return	lock hash value */
+UNIV_INLINE
+ulint
+buf_block_get_lock_hash_val(
+/*========================*/
+	const buf_block_t*	block)	/*!< in: block */
+{
+	ut_ad(block);
+	ut_ad(buf_page_in_file(&block->page));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_EXCLUSIVE)
+	      || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	return(block->lock_hash_val);
+}
+
+/********************************************************************//**
+Allocates a buffer block.
+@return	own: the allocated block, in state BUF_BLOCK_MEMORY */
+UNIV_INLINE
+buf_block_t*
+buf_block_alloc(
+/*============*/
+	ulint	zip_size)	/*!< in: compressed page size in bytes,
+				or 0 if uncompressed tablespace */
+{
+	buf_block_t*	block;
+
+	block = buf_LRU_get_free_block(zip_size);
+
+	buf_block_set_state(block, BUF_BLOCK_MEMORY);
+
+	return(block);
+}
+
+/********************************************************************//**
+Frees a buffer block which does not contain a file page. */
+UNIV_INLINE
+void
+buf_block_free(
+/*===========*/
+	buf_block_t*	block)	/*!< in, own: block to be freed */
+{
+	/* buf_pool_mutex is not taken by this function; see below. */
+
+	mutex_enter(&block->mutex);
+
+	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);
+
+	buf_LRU_block_free_non_file_page(block, FALSE);
+
+	mutex_exit(&block->mutex);
+
+	/* Only block->mutex was entered above, so nothing else to release. */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Copies contents of a buffer frame to a given buffer.
+@return	buf */
+UNIV_INLINE
+byte*
+buf_frame_copy(
+/*===========*/
+	byte*			buf,	/*!< in: buffer to copy to */
+	const buf_frame_t*	frame)	/*!< in: buffer frame */
+{
+	ut_ad(buf && frame);
+
+	ut_memcpy(buf, frame, UNIV_PAGE_SIZE);
+
+	return(buf);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Calculates a folded value of a file page address to use in the page hash
+table.
+@return	the folded value */
+UNIV_INLINE
+ulint
+buf_page_address_fold(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: offset of the page within space */
+{
+	return((space << 20) + space + offset);
+}
+
+/********************************************************************//**
+Gets the youngest modification log sequence number for a frame.
+Returns zero if not file page or no modification occurred yet.
+@return	newest modification to page */
+UNIV_INLINE
+ib_uint64_t
+buf_page_get_newest_modification(
+/*=============================*/
+	const buf_page_t*	bpage)	/*!< in: block containing the
+					page frame */
+{
+	ib_uint64_t	lsn;
+	mutex_t*	block_mutex = buf_page_get_mutex_enter(bpage);
+
+	if (block_mutex && buf_page_in_file(bpage)) {
+		lsn = bpage->newest_modification;
+	} else {
+		lsn = 0;
+	}
+
+	if (block_mutex) {
+		mutex_exit(block_mutex);
+	}
+
+	return(lsn);
+}
+
+/********************************************************************//**
+Increments the modify clock of a frame by 1. The caller must (1) own the
+LRU_list_mutex and block bufferfix count has to be zero, (2) or own an x-lock
+on the block. */
+UNIV_INLINE
+void
+buf_block_modify_clock_inc(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad((mutex_own(&LRU_list_mutex)
+	       && (block->page.buf_fix_count == 0))
+	      || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+	block->modify_clock++;
+}
+
+/********************************************************************//**
+Returns the value of the modify clock. The caller must have an s-lock
+or x-lock on the block.
+@return	value */
+UNIV_INLINE
+ib_uint64_t
+buf_block_get_modify_clock(
+/*=======================*/
+	buf_block_t*	block)	/*!< in: block */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_SHARED)
+	      || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+#endif /* UNIV_SYNC_DEBUG */
+
+	return(block->modify_clock);
+}
+
+/*******************************************************************//**
+Adds one to the bufferfix count; the block mutex must already be held. */
+UNIV_INLINE
+void
+buf_block_buf_fix_inc_func(
+/*=======================*/
+#ifdef UNIV_SYNC_DEBUG
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line */
+#endif /* UNIV_SYNC_DEBUG */
+	buf_block_t*	block)	/*!< in/out: block to bufferfix */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ibool	latched = rw_lock_s_lock_nowait(&block->debug_latch,
+						file, line);
+
+	ut_a(latched);
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&block->mutex));
+
+	block->page.buf_fix_count++;
+}
+#ifdef UNIV_SYNC_DEBUG
+/** Increments the bufferfix count.
+@param b	in/out: block to bufferfix
+@param f	in: file name where requested
+@param l	in: line number where requested */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(f,l,b)
+#else /* UNIV_SYNC_DEBUG */
+/** Increments the bufferfix count.
+@param b	in/out: block to bufferfix
+@param f	in: file name where requested (ignored in this build)
+@param l	in: line number where requested (ignored in this build) */
+# define buf_block_buf_fix_inc(b,f,l) buf_block_buf_fix_inc_func(b)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*******************************************************************//**
+Decrements the bufferfix count. The caller must hold the block mutex. */
+UNIV_INLINE
+void
+buf_block_buf_fix_dec(
+/*==================*/
+	buf_block_t*	block)	/*!< in/out: block to bufferunfix */
+{
+	ut_ad(mutex_own(&block->mutex));
+
+	block->page.buf_fix_count--;
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_s_unlock(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/******************************************************************//**
+Looks up the control block of a file page; caller holds page_hash_latch.
+@return	block, NULL if not found */
+UNIV_INLINE
+buf_page_t*
+buf_page_hash_get(
+/*==============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: offset of the page within space */
+{
+	buf_page_t*	bpage;
+	ulint		fold;
+
+	ut_ad(buf_pool);
+	/* page_hash_latch (S or X) is asserted here, not buf_pool_mutex. */
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&page_hash_latch, RW_LOCK_EX)
+	      || rw_lock_own(&page_hash_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Look for the page in the hash table */
+
+	fold = buf_page_address_fold(space, offset);
+
+	HASH_SEARCH(hash, buf_pool->page_hash, fold, buf_page_t*, bpage,
+		    ut_ad(bpage->in_page_hash && !bpage->in_zip_hash
+			  && buf_page_in_file(bpage)),
+		    bpage->space == space && bpage->offset == offset);
+	if (bpage) {
+		ut_a(buf_page_in_file(bpage));
+		ut_ad(bpage->in_page_hash);
+		ut_ad(!bpage->in_zip_hash);
+#if UNIV_WORD_SIZE == 4
+		/* On 32-bit systems, there is no padding in
+		buf_page_t.  On other systems, Valgrind could complain
+		about uninitialized pad bytes. */
+		UNIV_MEM_ASSERT_RW(bpage, sizeof *bpage);
+#endif /* UNIV_WORD_SIZE == 4 */
+	}
+
+	return(bpage);
+}
+
+/******************************************************************//**
+Looks up the uncompressed control block of a file page in the page hash.
+@return	block, NULL if not found or no uncompressed frame exists */
+UNIV_INLINE
+buf_block_t*
+buf_block_hash_get(
+/*===============*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: offset of the page within space */
+{
+	buf_page_t*	bpage = buf_page_hash_get(space, offset);
+	return(buf_page_get_block(bpage));
+}
+
+/********************************************************************//**
+Checks whether the page hash currently contains an entry for the page.
+
+NOTE that a TRUE result does not mean the page has been read from disk
+yet.
+
+@return	TRUE if found in the page hash table */
+UNIV_INLINE
+ibool
+buf_page_peek(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	offset)	/*!< in: page number */
+{
+	ibool	found;
+
+	/* A shared page_hash_latch covers the hash lookup. */
+	rw_lock_s_lock(&page_hash_latch);
+
+	found = (buf_page_hash_get(space, offset) != NULL);
+
+	/* Only the NULL-ness of the result is kept past the unlock. */
+	rw_lock_s_unlock(&page_hash_latch);
+
+	return(found);
+}
+
+/********************************************************************//**
+Releases a compressed-only page acquired with buf_page_get_zip(). */
+UNIV_INLINE
+void
+buf_page_release_zip(
+/*=================*/
+	buf_page_t*	bpage)		/*!< in/out: buffer-fixed page */
+{
+	buf_block_t*	block;
+
+	ut_ad(bpage);
+	ut_a(bpage->buf_fix_count > 0);
+
+	switch (buf_page_get_state(bpage)) {
+	case BUF_BLOCK_ZIP_PAGE:
+	case BUF_BLOCK_ZIP_DIRTY:
+		mutex_enter(&buf_pool_zip_mutex);
+		bpage->buf_fix_count--;
+		mutex_exit(&buf_pool_zip_mutex);
+		return;
+	case BUF_BLOCK_FILE_PAGE:
+		block = (buf_block_t*) bpage;
+		mutex_enter(&block->mutex);
+#ifdef UNIV_SYNC_DEBUG
+		rw_lock_s_unlock(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+		bpage->buf_fix_count--;
+		mutex_exit(&block->mutex);
+		return;
+	case BUF_BLOCK_ZIP_FREE:
+	case BUF_BLOCK_NOT_USED:
+	case BUF_BLOCK_READY_FOR_USE:
+	case BUF_BLOCK_MEMORY:
+	case BUF_BLOCK_REMOVE_HASH:
+		break; /* unexpected state: reaches ut_error below */
+	}
+
+	ut_error;
+}
+
+/********************************************************************//**
+Drops one buffer-fix from a block and then releases the page latch that
+the caller names in rw_latch, if any. */
+UNIV_INLINE
+void
+buf_page_release(
+/*=============*/
+	buf_block_t*	block,		/*!< in: buffer block */
+	ulint		rw_latch,	/*!< in: RW_S_LATCH, RW_X_LATCH,
+					RW_NO_LATCH */
+	mtr_t*		mtr __attribute__((unused)))		/*!< in: mtr */
+{
+	ut_ad(block);
+
+	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_a(block->page.buf_fix_count > 0);
+
+	/* Modified pages are not put on the flush list here:
+	buf_flush_note_modification() should be called before this function. */
+
+	/* The fix count is decremented under the block mutex. */
+	mutex_enter(&block->mutex);
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_s_unlock(&block->debug_latch);
+#endif /* UNIV_SYNC_DEBUG */
+	block->page.buf_fix_count--;
+	mutex_exit(&block->mutex);
+
+	/* Any value other than RW_S_LATCH or RW_X_LATCH releases nothing. */
+	switch (rw_latch) {
+	case RW_S_LATCH:
+		rw_lock_s_unlock(&block->lock);
+		break;
+	case RW_X_LATCH:
+		rw_lock_x_unlock(&block->lock);
+		break;
+	default:
+		break;
+	}
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/*********************************************************************//**
+Adds latch level info for the rw-lock protecting the buffer frame. This
+should be called in the debug version after a successful latching of a
+page if we know the latching order level of the acquired latch. */
+UNIV_INLINE
+void
+buf_block_dbg_add_level(
+/*====================*/
+	buf_block_t*	block,	/*!< in: buffer page
+				where we have acquired latch */
+	ulint		level)	/*!< in: latching order level */
+{
+	sync_thread_add_level(&block->lock, level); /* registers block->lock */
+}
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h
new file mode 100644
index 00000000000..2f7108fda1b
--- /dev/null
+++ b/storage/xtradb/include/buf0flu.h
@@ -0,0 +1,218 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.h
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0flu_h
+#define buf0flu_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0types.h"
+#include "buf0types.h"
+
+/********************************************************************//**
+Remove a block from the flush list of modified blocks. */
+UNIV_INTERN
+void
+buf_flush_remove(
+/*=============*/
+	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
+/********************************************************************//**
+Relocates a buffer control block on the flush_list.
+Note that it is assumed that the contents of bpage has already been
+copied to dpage. */
+UNIV_INTERN
+void
+buf_flush_relocate_on_flush_list(
+/*=============================*/
+	buf_page_t*	bpage,	/*!< in/out: control block being moved */
+	buf_page_t*	dpage);	/*!< in/out: destination block */
+/********************************************************************//**
+Updates the flush system data structures when a write is completed. */
+UNIV_INTERN
+void
+buf_flush_write_complete(
+/*=====================*/
+	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
+/*********************************************************************//**
+Flushes pages from the end of the LRU list if there is too small
+a margin of replaceable pages there. */
+UNIV_INTERN
+void
+buf_flush_free_margin(
+/*=======================*/
+	ibool	wait);
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Initializes a page for writing to the tablespace. */
+UNIV_INTERN
+void
+buf_flush_init_for_writing(
+/*=======================*/
+	byte*		page,		/*!< in/out: page */
+	void*		page_zip_,	/*!< in/out: compressed page, or NULL */
+	ib_uint64_t	newest_lsn);	/*!< in: newest modification lsn
+					to the page */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+This utility flushes dirty blocks from the end of the LRU list or flush_list.
+NOTE 1: in the case of an LRU flush the calling thread may own latches to
+pages: to avoid deadlocks, this function must be written so that it cannot
+end up waiting for these latches! NOTE 2: in the case of a flush list flush,
+the calling thread is not allowed to own any latches on pages!
+@return number of blocks for which the write request was queued;
+ULINT_UNDEFINED if there was a flush of the same type already running */
+UNIV_INTERN
+ulint
+buf_flush_batch(
+/*============*/
+	enum buf_flush	flush_type,	/*!< in: BUF_FLUSH_LRU or
+					BUF_FLUSH_LIST; if BUF_FLUSH_LIST,
+					then the caller must not own any
+					latches on pages */
+	ulint		min_n,		/*!< in: wished minimum number of blocks
+					flushed (it is not guaranteed that the
+					actual number is that big, though) */
+	ib_uint64_t	lsn_limit);	/*!< in the case BUF_FLUSH_LIST all
+					blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+/******************************************************************//**
+Waits until a flush batch of the given type ends */
+UNIV_INTERN
+void
+buf_flush_wait_batch_end(
+/*=====================*/
+	enum buf_flush	type);	/*!< in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Puts the block to the list of modified blocks, if it is not
+already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+	buf_block_t*	block,	/*!< in: block which is modified */
+	mtr_t*		mtr);	/*!< in: mtr */
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+	buf_block_t*	block,		/*!< in: block which is modified */
+	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+					set of mtr's */
+	ib_uint64_t	end_lsn);	/*!< in: end lsn of the last mtr in the
+					set of mtr's */
+/********************************************************************//**
+Returns TRUE if the file page block is immediately suitable for replacement,
+i.e., transition FILE_PAGE => NOT_USED allowed.
+@return	TRUE if can replace immediately */
+UNIV_INTERN
+ibool
+buf_flush_ready_for_replace(
+/*========================*/
+	buf_page_t*	bpage);	/*!< in: buffer control block, must be
+				buf_page_in_file(bpage) and in the LRU list */
+
+/** @brief Statistics for selecting flush rate based on redo log
+generation speed.
+
+These statistics are generated for heuristics used in estimating the
+rate at which we should flush the dirty blocks to avoid bursty IO
+activity. Note that the rate of flushing not only depends on how many
+dirty pages we have in the buffer pool but it is also a function of
+how much redo the workload is generating and at what rate. */
+
+struct buf_flush_stat_struct
+{
+	ib_uint64_t	redo;		/*!< amount of redo generated. */
+	ulint		n_flushed;	/*!< number of pages flushed. */
+};
+
+/** Statistics for selecting flush rate of dirty pages. */
+typedef struct buf_flush_stat_struct buf_flush_stat_t;
+/*********************************************************************//**
+Update the historical stats that we are collecting for flush rate
+heuristics at the end of each interval. */
+UNIV_INTERN
+void
+buf_flush_stat_update(void);
+/*=======================*/
+/*********************************************************************//**
+Determines the fraction of dirty pages that need to be flushed based
+on the speed at which we generate redo log. Note that if redo log
+is generated at significant rate without a corresponding increase
+in the number of dirty pages (for example, an in-memory workload)
+it can cause IO bursts of flushing. This function implements heuristics
+to avoid this burstiness.
+@return	number of dirty pages to be flushed / second */
+UNIV_INTERN
+ulint
+buf_flush_get_desired_flush_rate(void);
+/*==================================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/******************************************************************//**
+Validates the flush list.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+buf_flush_validate(void);
+/*====================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/******************************************************************//**
+Initialize the red-black tree to speed up insertions into the flush_list
+during recovery process. Should be called at the start of recovery
+process before any page has been read/written. */
+UNIV_INTERN
+void
+buf_flush_init_flush_rbt(void);
+/*==========================*/
+
+/******************************************************************//**
+Frees up the red-black tree. */
+UNIV_INTERN
+void
+buf_flush_free_flush_rbt(void);
+/*==========================*/
+
+/** When buf_flush_free_margin is called, it tries to make this many blocks
+available to replacement in the free list and at the end of the LRU list (to
+make sure that a read-ahead batch can be read efficiently in a single
+sweep). */
+#define BUF_FLUSH_FREE_BLOCK_MARGIN	(5 + BUF_READ_AHEAD_AREA)
+/** Extra margin to apply above BUF_FLUSH_FREE_BLOCK_MARGIN */
+#define BUF_FLUSH_EXTRA_MARGIN		(BUF_FLUSH_FREE_BLOCK_MARGIN / 4 + 100)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "buf0flu.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/buf0flu.ic b/storage/xtradb/include/buf0flu.ic
new file mode 100644
index 00000000000..4ad0814f344
--- /dev/null
+++ b/storage/xtradb/include/buf0flu.ic
@@ -0,0 +1,155 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0flu.ic
+The database buffer pool flush algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+#include "buf0buf.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Inserts a modified block into the flush list. */
+UNIV_INTERN
+void
+buf_flush_insert_into_flush_list(
+/*=============================*/
+	buf_block_t*	block);	/*!< in/out: block which is modified */
+/********************************************************************//**
+Inserts a modified block into the flush list in the right sorted position.
+This function is used by recovery, because there the modifications do not
+necessarily come in the order of lsn's. */
+UNIV_INTERN
+void
+buf_flush_insert_sorted_into_flush_list(
+/*====================================*/
+	buf_block_t*	block);	/*!< in/out: block which is modified */
+
+/********************************************************************//**
+This function should be called at a mini-transaction commit, if a page was
+modified in it. Records mtr->end_lsn as the newest modification and puts the
+block on the list of modified blocks, if it is not already in it. */
+UNIV_INLINE
+void
+buf_flush_note_modification(
+/*========================*/
+	buf_block_t*	block,	/*!< in: block which is modified */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ibool	use_LRU_mutex;
+
+	/* Validate the argument before it is dereferenced below. */
+	ut_ad(block);
+
+	use_LRU_mutex = (UT_LIST_GET_LEN(buf_pool->unzip_LRU) != 0);
+	if (use_LRU_mutex) {
+		mutex_enter(&LRU_list_mutex);
+	}
+
+	mutex_enter(&block->mutex);
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mtr->start_lsn != 0);
+	ut_ad(mtr->modifications);
+	ut_ad(block->page.newest_modification <= mtr->end_lsn);
+
+	block->page.newest_modification = mtr->end_lsn;
+
+	if (!block->page.oldest_modification) {
+		mutex_enter(&flush_list_mutex);
+
+		block->page.oldest_modification = mtr->start_lsn;
+		ut_ad(block->page.oldest_modification != 0);
+
+		buf_flush_insert_into_flush_list(block);
+		mutex_exit(&flush_list_mutex);
+	} else {
+		ut_ad(block->page.oldest_modification <= mtr->start_lsn);
+	}
+
+	mutex_exit(&block->mutex);
+
+	++srv_buf_pool_write_requests;
+
+	if (use_LRU_mutex) {
+		mutex_exit(&LRU_list_mutex);
+	}
+}
+
+/********************************************************************//**
+This function should be called when recovery has modified a buffer page. */
+UNIV_INLINE
+void
+buf_flush_recv_note_modification(
+/*=============================*/
+	buf_block_t*	block,		/*!< in: block which is modified */
+	ib_uint64_t	start_lsn,	/*!< in: start lsn of the first mtr in a
+					set of mtr's */
+	ib_uint64_t	end_lsn)	/*!< in: end lsn of the last mtr in the
+					set of mtr's */
+{
+	ibool	use_LRU_mutex;
+
+	/* Validate the argument before it is dereferenced below. */
+	ut_ad(block);
+
+	use_LRU_mutex = (UT_LIST_GET_LEN(buf_pool->unzip_LRU) != 0);
+	if (use_LRU_mutex) {
+		mutex_enter(&LRU_list_mutex);
+	}
+
+	mutex_enter(&(block->mutex));
+
+	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
+	ut_ad(block->page.buf_fix_count > 0);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(block->page.newest_modification <= end_lsn);
+
+	block->page.newest_modification = end_lsn;
+
+	if (!block->page.oldest_modification) {
+		mutex_enter(&flush_list_mutex);
+
+		block->page.oldest_modification = start_lsn;
+		ut_ad(block->page.oldest_modification != 0);
+
+		buf_flush_insert_sorted_into_flush_list(block);
+		mutex_exit(&flush_list_mutex);
+	} else {
+		ut_ad(block->page.oldest_modification <= start_lsn);
+	}
+
+	/* Release in the reverse order of acquisition, as
+	buf_flush_note_modification() does. */
+	mutex_exit(&(block->mutex));
+
+	if (use_LRU_mutex) {
+		mutex_exit(&LRU_list_mutex);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/buf0lru.h b/storage/xtradb/include/buf0lru.h
new file mode 100644
index 00000000000..d3b59e8b579
--- /dev/null
+++ b/storage/xtradb/include/buf0lru.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.h
+The database buffer pool LRU replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0lru_h
+#define buf0lru_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+
+/** Status codes returned by buf_LRU_free_block() */
+enum buf_lru_free_block_status {
+	/** the block was freed */
+	BUF_LRU_FREED = 0,
+	/** not freed because the caller asked to remove the
+	uncompressed frame but the control block cannot be
+	relocated */
+	BUF_LRU_CANNOT_RELOCATE,
+	/** not freed because of some other reason */
+	BUF_LRU_NOT_FREED
+};
+
+/******************************************************************//**
+Tries to remove LRU flushed blocks from the end of the LRU list and put them
+to the free list. This is beneficial for the efficiency of the insert buffer
+operation, as flushed pages from non-unique non-clustered indexes are here
+taken out of the buffer pool, and their inserts redirected to the insert
+buffer. Otherwise, the flushed blocks could get modified again before read
+operations need new buffer blocks, and the i/o work done in flushing would be
+wasted. */
+UNIV_INTERN
+void
+buf_LRU_try_free_flushed_blocks(void);
+/*==================================*/
+/******************************************************************//**
+Returns TRUE if less than 25 % of the buffer pool is available. This can be
+used in heuristics to prevent huge transactions eating up the whole buffer
+pool for their locks.
+@return	TRUE if less than 25 % of buffer pool left */
+UNIV_INTERN
+ibool
+buf_LRU_buf_pool_running_out(void);
+/*==============================*/
+
+/*#######################################################################
+These are low-level functions
+#########################################################################*/
+
+/** Minimum LRU list length for which the LRU_old pointer is defined */
+#define BUF_LRU_OLD_MIN_LEN	512	/* 8 megabytes of 16k pages */
+
+/** Maximum LRU list search length in buf_flush_LRU_recommendation() */
+#define BUF_LRU_FREE_SEARCH_LEN		(5 + 2 * BUF_READ_AHEAD_AREA)
+
+/******************************************************************//**
+Invalidates all pages belonging to a given tablespace when we are deleting
+the data file(s) of that tablespace. A PROBLEM: if readahead is being started,
+what guarantees that it will not try to read in pages after this operation has
+completed? */
+UNIV_INTERN
+void
+buf_LRU_invalidate_tablespace(
+/*==========================*/
+	ulint	id);	/*!< in: space id */
+/********************************************************************//**
+Insert a compressed block into buf_pool->zip_clean in the LRU order. */
+UNIV_INTERN
+void
+buf_LRU_insert_zip_clean(
+/*=====================*/
+	buf_page_t*	bpage);	/*!< in: pointer to the block in question */
+
+/******************************************************************//**
+Try to free a block.  If bpage is a descriptor of a compressed-only
+page, the descriptor object will be freed as well.
+
+NOTE: If this function returns BUF_LRU_FREED, it will temporarily
+release buf_pool_mutex.  Furthermore, the page frame will no longer be
+accessible via bpage.
+
+The caller must hold buf_pool_mutex and buf_page_get_mutex(bpage) and
+release these two mutexes after the call.  No other
+buf_page_get_mutex() may be held when calling this function.
+@return BUF_LRU_FREED if freed, BUF_LRU_CANNOT_RELOCATE or
+BUF_LRU_NOT_FREED otherwise. */
+UNIV_INTERN
+enum buf_lru_free_block_status
+buf_LRU_free_block(
+/*===============*/
+	buf_page_t*	bpage,	/*!< in: block to be freed */
+	ibool		zip,	/*!< in: TRUE if should remove also the
+				compressed page of an uncompressed page */
+	ibool*		buf_pool_mutex_released,
+				/*!< in: pointer to a variable that will
+				be assigned TRUE if buf_pool_mutex
+				was temporarily released, or NULL */
+	ibool		have_LRU_mutex);
+/******************************************************************//**
+Try to free a replaceable block.
+@return	TRUE if found and freed */
+UNIV_INTERN
+ibool
+buf_LRU_search_and_free_block(
+/*==========================*/
+	ulint	n_iterations);	/*!< in: how many times this has been called
+				repeatedly without result: a high value means
+				that we should search farther; if
+				n_iterations < 10, then we search
+				n_iterations / 10 * buf_pool->curr_size
+				pages from the end of the LRU list; if
+				n_iterations < 5, then we will also search
+				n_iterations / 5 of the unzip_LRU list. */
+/******************************************************************//**
+Returns a free block from the buf_pool.  The block is taken off the
+free list.  If it is empty, returns NULL.
+@return	a free control block, or NULL if the buf_block->free list is empty */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_only(void);
+/*=======================*/
+/******************************************************************//**
+Returns a free block from the buf_pool. The block is taken off the
+free list. If it is empty, blocks are moved from the end of the
+LRU list to the free list.
+@return	the free control block, in state BUF_BLOCK_READY_FOR_USE */
+UNIV_INTERN
+buf_block_t*
+buf_LRU_get_free_block(
+/*===================*/
+	ulint	zip_size);	/*!< in: compressed page size in bytes,
+				or 0 if uncompressed tablespace */
+
+/******************************************************************//**
+Puts a block back to the free list. */
+UNIV_INTERN
+void
+buf_LRU_block_free_non_file_page(
+/*=============================*/
+	buf_block_t*	block,	/*!< in: block, must not contain a file page */
+	ibool		have_page_hash_mutex);
+/******************************************************************//**
+Adds a block to the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_add_block(
+/*==============*/
+	buf_page_t*	bpage,	/*!< in: control block */
+	ibool		old);	/*!< in: TRUE if should be put to the old
+				blocks in the LRU list, else put to the
+				start; if the LRU list is very short, added to
+				the start regardless of this parameter */
+/******************************************************************//**
+Adds a block to the LRU list of decompressed zip pages. */
+UNIV_INTERN
+void
+buf_unzip_LRU_add_block(
+/*====================*/
+	buf_block_t*	block,	/*!< in: control block */
+	ibool		old);	/*!< in: TRUE if should be put to the end
+				of the list, else put to the start */
+/******************************************************************//**
+Moves a block to the start of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_young(
+/*=====================*/
+	buf_page_t*	bpage);	/*!< in: control block */
+/******************************************************************//**
+Moves a block to the end of the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_make_block_old(
+/*===================*/
+	buf_page_t*	bpage);	/*!< in: control block */
+/**********************************************************************//**
+Updates buf_LRU_old_ratio.
+@return	updated old_pct */
+UNIV_INTERN
+uint
+buf_LRU_old_ratio_update(
+/*=====================*/
+	uint	old_pct,/*!< in: Reserve this percentage of
+			the buffer pool for "old" blocks. */
+	ibool	adjust);/*!< in: TRUE=adjust the LRU list;
+			FALSE=just assign buf_LRU_old_ratio
+			during the initialization of InnoDB */
+/********************************************************************//**
+Update the historical stats that we are collecting for LRU eviction
+policy at the end of each interval. */
+UNIV_INTERN
+void
+buf_LRU_stat_update(void);
+/*=====================*/
+/********************************************************************//**
+Dump the LRU page list to the specific file. */
+UNIV_INTERN
+ibool
+buf_LRU_file_dump(void);
+/*===================*/
+/********************************************************************//**
+Read the pages based on the specific file.*/
+UNIV_INTERN
+ibool
+buf_LRU_file_restore(void);
+/*======================*/
+
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Validates the LRU list.
+@return	TRUE */
+UNIV_INTERN
+ibool
+buf_LRU_validate(void);
+/*==================*/
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Prints the LRU list. */
+UNIV_INTERN
+void
+buf_LRU_print(void);
+/*===============*/
+#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
+
+/** @name Heuristics for detecting index scan @{ */
+/** Reserve this much/BUF_LRU_OLD_RATIO_DIV of the buffer pool for
+"old" blocks.  Protected by buf_pool_mutex. */
+extern uint	buf_LRU_old_ratio;
+/** The denominator of buf_LRU_old_ratio. */
+#define BUF_LRU_OLD_RATIO_DIV	1024
+/** Maximum value of buf_LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update */
+#define BUF_LRU_OLD_RATIO_MAX	BUF_LRU_OLD_RATIO_DIV
+/** Minimum value of buf_LRU_old_ratio.
+@see buf_LRU_old_adjust_len
+@see buf_LRU_old_ratio_update
+The minimum must exceed
+(BUF_LRU_OLD_TOLERANCE + 5) * BUF_LRU_OLD_RATIO_DIV / BUF_LRU_OLD_MIN_LEN. */
+#define BUF_LRU_OLD_RATIO_MIN	51
+
+#if BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX
+# error "BUF_LRU_OLD_RATIO_MIN >= BUF_LRU_OLD_RATIO_MAX"
+#endif
+#if BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV
+# error "BUF_LRU_OLD_RATIO_MAX > BUF_LRU_OLD_RATIO_DIV"
+#endif
+
+/** Move blocks to "new" LRU list only if the first access was at
+least this many milliseconds ago.  Not protected by any mutex or latch. */
+extern uint	buf_LRU_old_threshold_ms;
+/* @} */
+
+/** @brief Statistics for selecting the LRU list for eviction.
+
+These statistics are not 'of' LRU but 'for' LRU.  We keep count of I/O
+and page_zip_decompress() operations.  Based on the statistics we decide
+whether to evict from buf_pool->unzip_LRU or from buf_pool->LRU. */
+struct buf_LRU_stat_struct
+{
+	ulint	io;	/**< Counter of buffer pool I/O operations. */
+	ulint	unzip;	/**< Counter of page_zip_decompress operations. */
+};
+
+/** Statistics for selecting the LRU list for eviction. */
+typedef struct buf_LRU_stat_struct buf_LRU_stat_t;
+
+/** Current operation counters.  Not protected by any mutex.
+Cleared by buf_LRU_stat_update(). */
+extern buf_LRU_stat_t	buf_LRU_stat_cur;
+
+/** Running sum of past values of buf_LRU_stat_cur.
+Updated by buf_LRU_stat_update().  Protected by buf_pool_mutex. */
+extern buf_LRU_stat_t	buf_LRU_stat_sum;
+
+/********************************************************************//**
+Increments the I/O counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_io() buf_LRU_stat_cur.io++
+/********************************************************************//**
+Increments the page_zip_decompress() counter in buf_LRU_stat_cur. */
+#define buf_LRU_stat_inc_unzip() buf_LRU_stat_cur.unzip++
+
+#ifndef UNIV_NONINL
+#include "buf0lru.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/buf0lru.ic b/storage/xtradb/include/buf0lru.ic
new file mode 100644
index 00000000000..556f45d987f
--- /dev/null
+++ b/storage/xtradb/include/buf0lru.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0lru.ic
+The database buffer replacement algorithm
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/buf0rea.h b/storage/xtradb/include/buf0rea.h
new file mode 100644
index 00000000000..56d3d24a3b7
--- /dev/null
+++ b/storage/xtradb/include/buf0rea.h
@@ -0,0 +1,170 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0rea.h
+The database buffer read
+
+Created 11/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0rea_h
+#define buf0rea_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "buf0types.h"
+
+/********************************************************************//**
+Low-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there, in which case does nothing.
+Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
+flag is cleared and the x-lock released by an i/o-handler thread.
+@return 1 if a read request was queued; 0 if the page already resided
+in buf_pool, or if the page is in the doublewrite buffer blocks (in
+which case it is never read into the pool), or if the tablespace does
+not exist or is being dropped; in short, 1 if a read request was
+issued and 0 if it was not */
+UNIV_INTERN
+ulint
+buf_read_page_low(
+/*==============*/
+	ulint*	err,	/*!< out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are
+			trying to read from a non-existent tablespace, or a
+			tablespace which is just now being dropped */
+	ibool	sync,	/*!< in: TRUE if synchronous aio is desired */
+	ulint	mode,	/*!< in: BUF_READ_IBUF_PAGES_ONLY, ...,
+			ORed to OS_AIO_SIMULATED_WAKE_LATER (see below
+			at read-ahead functions) */
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size, or 0 */
+	ibool	unzip,	/*!< in: TRUE=request uncompressed page */
+	ib_int64_t tablespace_version, /*!< in: if the space memory object has
+			this timestamp different from what we are giving here,
+			treat the tablespace as dropped; this is a timestamp we
+			use to stop dangling page reads from a tablespace
+			which we have DISCARDed + IMPORTed back */
+	ulint	offset,	/*!< in: page number */
+	trx_t*	trx);
+/********************************************************************//**
+High-level function which reads a page asynchronously from a file to the
+buffer buf_pool if it is not already there. Sets the io_fix flag and sets
+an exclusive lock on the buffer frame. The flag is cleared and the x-lock
+released by the i/o-handler thread.
+@return TRUE if page has been read in, FALSE in case of failure */
+UNIV_INTERN
+ibool
+buf_read_page(
+/*==========*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset, /*!< in: page number */
+	trx_t*	trx);
+/********************************************************************//**
+Applies linear read-ahead if in the buf_pool the page is a border page of
+a linear read-ahead area and all the pages in the area have been accessed.
+Does not read any page if the read-ahead mechanism is not activated. Note
+that the algorithm looks at the 'natural' adjacent successor and
+predecessor of the page, which on the leaf level of a B-tree are the next
+and previous page in the chain of leaves. To know these, the page specified
+in (space, offset) must already be present in the buf_pool. Thus, the
+natural way to use this function is to call it when a page in the buf_pool
+is accessed the first time, calling this function just after it has been
+bufferfixed.
+NOTE 1: as this function looks at the natural predecessor and successor
+fields on the page, what happens, if these are not initialized to any
+sensible value? No problem, before applying read-ahead we check that the
+area to read is within the span of the space, if not, read-ahead is not
+applied. An uninitialized value may result in a useless read operation, but
+only very improbably.
+NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
+function must be written such that it cannot end up waiting for these
+latches!
+NOTE 3: the calling thread must want access to the page given: this rule is
+set to prevent unintended read-aheads performed by ibuf routines, a situation
+which could result in a deadlock if the OS does not support asynchronous io.
+@return	number of page read requests issued */
+UNIV_INTERN
+ulint
+buf_read_ahead_linear(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	offset, /*!< in: page number of a page; NOTE: the current thread
+			must want access to this page (see NOTE 3 above) */
+	trx_t*	trx);
+/********************************************************************//**
+Issues read requests for pages which the ibuf module wants to read in, in
+order to contract the insert buffer tree. Technically, this function is like
+a read-ahead function. */
+UNIV_INTERN
+void
+buf_read_ibuf_merge_pages(
+/*======================*/
+	ibool		sync,		/*!< in: TRUE if the caller
+					wants this function to wait
+					for the highest address page
+					to get read in, before this
+					function returns */
+	const ulint*	space_ids,	/*!< in: array of space ids */
+	const ib_int64_t* space_versions,/*!< in: the spaces must have
+					this version number
+					(timestamp), otherwise we
+					discard the read; we use this
+					to cancel reads if DISCARD +
+					IMPORT may have changed the
+					tablespace size */
+	const ulint*	page_nos,	/*!< in: array of page numbers
+					to read, with the highest page
+					number the last in the
+					array */
+	ulint		n_stored);	/*!< in: number of elements
+					in the arrays */
+/********************************************************************//**
+Issues read requests for pages which recovery wants to read in. */
+UNIV_INTERN
+void
+buf_read_recv_pages(
+/*================*/
+	ibool		sync,		/*!< in: TRUE if the caller
+					wants this function to wait
+					for the highest address page
+					to get read in, before this
+					function returns */
+	ulint		space,		/*!< in: space id */
+	ulint		zip_size,	/*!< in: compressed page size in
+					bytes, or 0 */
+	const ulint*	page_nos,	/*!< in: array of page numbers
+					to read, with the highest page
+					number the last in the
+					array */
+	ulint		n_stored);	/*!< in: number of page numbers
+					in the array */
+
+/** The size in pages of the area which the read-ahead algorithms read if
+invoked */
+#define	BUF_READ_AHEAD_AREA		64
+
+/** @name Modes used in read-ahead @{ */
+/** read only pages belonging to the insert buffer tree */
+#define BUF_READ_IBUF_PAGES_ONLY	131
+/** read any page */
+#define BUF_READ_ANY_PAGE		132
+/* @} */
+
+#endif
diff --git a/storage/xtradb/include/buf0types.h b/storage/xtradb/include/buf0types.h
new file mode 100644
index 00000000000..507f1543bbb
--- /dev/null
+++ b/storage/xtradb/include/buf0types.h
@@ -0,0 +1,83 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/buf0types.h
+The database buffer pool global types for the directory
+
+Created 11/17/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef buf0types_h
+#define buf0types_h
+
+/** Buffer page (uncompressed or compressed) */
+typedef	struct buf_page_struct		buf_page_t;
+/** Buffer block for which an uncompressed page exists */
+typedef	struct buf_block_struct		buf_block_t;
+/** Buffer pool chunk comprising buf_block_t */
+typedef struct buf_chunk_struct		buf_chunk_t;
+/** Buffer pool comprising buf_chunk_t */
+typedef	struct buf_pool_struct		buf_pool_t;
+/** Buffer pool statistics struct */
+typedef	struct buf_pool_stat_struct	buf_pool_stat_t;
+
+/** A buffer frame. @see page_t */
+typedef	byte	buf_frame_t;
+
+/** Flags for flush types */
+enum buf_flush {
+	BUF_FLUSH_LRU = 0,		/*!< flush via the LRU list */
+	BUF_FLUSH_SINGLE_PAGE,		/*!< flush a single page */
+	BUF_FLUSH_LIST,			/*!< flush via the flush list
+					of dirty blocks */
+	BUF_FLUSH_N_TYPES		/*!< index of last element + 1  */
+};
+
+/** Flags for io_fix types */
+enum buf_io_fix {
+	BUF_IO_NONE = 0,		/**< no pending I/O */
+	BUF_IO_READ,			/**< read pending */
+	BUF_IO_WRITE			/**< write pending */
+};
+
+/** Parameters of binary buddy system for compressed pages (buf0buddy.h) */
+/* @{ */
+#if UNIV_WORD_SIZE <= 4 /* 32-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT	6
+#else /* 64-bit system */
+/** Base-2 logarithm of the smallest buddy block size */
+# define BUF_BUDDY_LOW_SHIFT	7
+#endif
+#define BUF_BUDDY_LOW		(1 << BUF_BUDDY_LOW_SHIFT)
+					/*!< minimum block size in the binary
+					buddy system; must be at least
+					sizeof(buf_page_t) */
+#define BUF_BUDDY_SIZES		(UNIV_PAGE_SIZE_SHIFT - BUF_BUDDY_LOW_SHIFT)
+#define BUF_BUDDY_SIZES_MAX	(UNIV_PAGE_SIZE_SHIFT_MAX - BUF_BUDDY_LOW_SHIFT)
+					/*!< number of buddy sizes */
+
+/** twice the maximum block size of the buddy system;
+the underlying memory is aligned by this amount:
+this must be equal to UNIV_PAGE_SIZE */
+#define BUF_BUDDY_HIGH	(BUF_BUDDY_LOW << BUF_BUDDY_SIZES)
+/* @} */
+
+#endif
+
diff --git a/storage/xtradb/include/data0data.h b/storage/xtradb/include/data0data.h
new file mode 100644
index 00000000000..f9fce3f3657
--- /dev/null
+++ b/storage/xtradb/include/data0data.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.h
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0data_h
+#define data0data_h
+
+#include "univ.i"
+
+#include "data0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "dict0types.h"
+
+/** Storage for overflow data in a big record, that is, a clustered
+index record which needs external storage of data fields */
+typedef struct big_rec_struct		big_rec_t;
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets pointer to the type struct of SQL data field.
+@return	pointer to the type struct */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+	const dfield_t*	field);	/*!< in: SQL data field */
+/*********************************************************************//**
+Gets pointer to the data in a field.
+@return	pointer to data */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+	const dfield_t* field);	/*!< in: field */
+#else /* UNIV_DEBUG */
+# define dfield_get_type(field) (&(field)->type)
+# define dfield_get_data(field) ((field)->data)
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Sets the type struct of SQL data field. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in: SQL data field */
+	dtype_t*	type);	/*!< in: pointer to data type struct */
+/*********************************************************************//**
+Gets length of field data.
+@return	length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+	const dfield_t* field);	/*!< in: field */
+/*********************************************************************//**
+Sets length in a field. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in: field */
+	ulint		len);	/*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Determines if a field is SQL NULL
+@return	nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+	const dfield_t* field);	/*!< in: field */
+/*********************************************************************//**
+Determines if a field is externally stored
+@return	nonzero if externally stored */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+	const dfield_t* field);	/*!< in: field */
+/*********************************************************************//**
+Sets the "external storage" flag */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+	dfield_t*	field);	/*!< in/out: field */
+/*********************************************************************//**
+Sets pointer to the data and length in a field. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+	dfield_t*	field,	/*!< in: field */
+	const void*	data,	/*!< in: data */
+	ulint		len);	/*!< in: length or UNIV_SQL_NULL */
+/*********************************************************************//**
+Sets a data field to SQL NULL. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+	dfield_t*	field);	/*!< in/out: field */
+/**********************************************************************//**
+Writes an SQL null field full of zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/*!< out: buffer of size len to fill with zeros */
+	ulint	len);	/*!< in: SQL null size in bytes */
+/*********************************************************************//**
+Copies the data and len fields. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies a data field to another. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2);/*!< in: field to copy from */
+/*********************************************************************//**
+Copies the data pointed to by a data field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/*!< in/out: data field */
+	mem_heap_t*	heap);	/*!< in: memory heap where allocated */
+/*********************************************************************//**
+Tests if data length and content is equal for two dfields.
+@return	TRUE if equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+	const dfield_t*	field1,	/*!< in: field */
+	const dfield_t*	field2);/*!< in: field */
+/*********************************************************************//**
+Tests if dfield data length and content is equal to the given.
+@return	TRUE if equal */
+UNIV_INTERN
+ibool
+dfield_data_is_binary_equal(
+/*========================*/
+	const dfield_t*	field,	/*!< in: field */
+	ulint		len,	/*!< in: data length or UNIV_SQL_NULL */
+	const byte*	data);	/*!< in: data */
+/*********************************************************************//**
+Gets number of fields in a data tuple.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return	nth field */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: tuple */
+	ulint		n);	/*!< in: index of field */
+#else /* UNIV_DEBUG */
+# define dtuple_get_nth_field(tuple, n) ((tuple)->fields + (n))
+#endif /* UNIV_DEBUG */
+/*********************************************************************//**
+Gets info bits in a data tuple.
+@return	info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+/*********************************************************************//**
+Sets info bits in a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		info_bits);	/*!< in: info bits */
+/*********************************************************************//**
+Gets number of fields used in record comparisons.
+@return	number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields_cmp);	/*!< in: number of fields used in
+					comparisons in rem0cmp.* */
+/**********************************************************//**
+Creates a data tuple in a memory heap. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return	own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created */
+	ulint		n_fields); /*!< in: number of fields */
+
+/**********************************************************//**
+Wrap data fields in a tuple. The default value for number
+of fields used in record comparisons for this tuple is n_fields.
+@return	data tuple */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+	dtuple_t*	tuple,		/*!< in: storage for data tuple */
+	const dfield_t*	fields,		/*!< in: fields */
+	ulint		n_fields);	/*!< in: number of fields */
+
+/*********************************************************************//**
+Sets number of fields used in a tuple. Normally this is set in
+dtuple_create, but if you want later to set it smaller, you can use this. */
+UNIV_INTERN
+void
+dtuple_set_n_fields(
+/*================*/
+	dtuple_t*	tuple,		/*!< in: tuple */
+	ulint		n_fields);	/*!< in: number of fields */
+/*********************************************************************//**
+Copies a data tuple to another.  This is a shallow copy; if a deep copy
+is desired, dfield_dup() will have to be invoked on each field.
+@return	own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap);	/*!< in: memory heap
+				where the tuple is created */
+/**********************************************************//**
+The following function returns the sum of data lengths of a tuple. The space
+occupied by the field structs or the tuple struct is not counted.
+@return	sum of data lens */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+/************************************************************//**
+Compare two data tuples, respecting the collation of character fields.
+@return 1, 0, -1 if tuple1 is greater, equal, less, respectively,
+than tuple2 */
+UNIV_INTERN
+int
+dtuple_coll_cmp(
+/*============*/
+	const dtuple_t*	tuple1,	/*!< in: tuple 1 */
+	const dtuple_t*	tuple2);/*!< in: tuple 2 */
+/************************************************************//**
+Folds a prefix given as the number of fields of a tuple.
+@return	the folded value */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: the tuple */
+	ulint		n_fields,/*!< in: number of complete fields to fold */
+	ulint		n_bytes,/*!< in: number of bytes to fold in an
+				incomplete last field */
+	dulint		tree_id)/*!< in: index tree id */
+	__attribute__((pure));
+/*******************************************************************//**
+Sets types of fields binary in a tuple. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in: data tuple */
+	ulint		n);	/*!< in: number of fields to set */
+/**********************************************************************//**
+Checks if a dtuple contains an SQL null value.
+@return	TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple);	/*!< in: dtuple */
+/**********************************************************//**
+Checks that a data field is typed. Asserts an error if not.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dfield_check_typed(
+/*===============*/
+	const dfield_t*	field);	/*!< in: data field */
+/**********************************************************//**
+Checks that a data tuple is typed. Asserts an error if not.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed(
+/*===============*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+/**********************************************************//**
+Checks that a data tuple is typed.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_check_typed_no_assert(
+/*=========================*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+#ifdef UNIV_DEBUG
+/**********************************************************//**
+Validates the consistency of a tuple which must be complete, i.e.,
+all fields must have been set.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtuple_validate(
+/*============*/
+	const dtuple_t*	tuple);	/*!< in: tuple */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. */
+UNIV_INTERN
+void
+dfield_print(
+/*=========*/
+	const dfield_t*	dfield);/*!< in: dfield */
+/*************************************************************//**
+Pretty prints a dfield value according to its data type. Also the hex string
+is printed if a string contains non-printable characters. */
+UNIV_INTERN
+void
+dfield_print_also_hex(
+/*==================*/
+	const dfield_t*	dfield);	 /*!< in: dfield */
+/**********************************************************//**
+The following function prints the contents of a tuple. */
+UNIV_INTERN
+void
+dtuple_print(
+/*=========*/
+	FILE*		f,	/*!< in: output stream */
+	const dtuple_t*	tuple);	/*!< in: tuple */
+/**************************************************************//**
+Moves parts of long fields in entry to the big record vector so that
+the size of tuple drops below the maximum record size allowed in the
+database. Moves data only from those fields which are not necessary
+to determine uniquely the insertion place of the tuple in the index.
+@return own: created big record vector, NULL if we are not able to
+shorten the entry enough, i.e., if there are too many fixed-length or
+short fields in entry or the index is clustered */
+UNIV_INTERN
+big_rec_t*
+dtuple_convert_big_rec(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in/out: index entry */
+	ulint*		n_ext);	/*!< in/out: number of
+				externally stored columns */
+/**************************************************************//**
+Puts back to entry the data stored in vector. Note that to ensure the
+fields in entry can accommodate the data, vector must have been created
+from entry with dtuple_convert_big_rec. */
+UNIV_INTERN
+void
+dtuple_convert_back_big_rec(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: entry whose data was put to vector */
+	big_rec_t*	vector);/*!< in, own: big rec vector; it is
+				freed in this function */
+/**************************************************************//**
+Frees the memory in a big rec vector. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+	big_rec_t*	vector);	/*!< in, own: big rec vector; it is
+				freed in this function */
+
+/*######################################################################*/
+
+/** Structure for an SQL data field */
+struct dfield_struct{
+	void*		data;	/*!< pointer to data */
+	unsigned	ext:1;	/*!< TRUE=externally stored, FALSE=local */
+	unsigned	len:32;	/*!< data length; UNIV_SQL_NULL if SQL null */
+	dtype_t		type;	/*!< type of data */
+};
+
+/** Structure for an SQL data tuple of fields (logical record) */
+struct dtuple_struct {
+	ulint		info_bits;	/*!< info bits of an index record:
+					the default is 0; this field is used
+					if an index record is built from
+					a data tuple */
+	ulint		n_fields;	/*!< number of fields in dtuple */
+	ulint		n_fields_cmp;	/*!< number of fields which should
+					be used in comparison services
+					of rem0cmp.*; the index search
+					is performed by comparing only these
+					fields, others are ignored; the
+					default value in dtuple creation is
+					the same value as n_fields */
+	dfield_t*	fields;		/*!< fields */
+	UT_LIST_NODE_T(dtuple_t) tuple_list;
+					/*!< data tuples can be linked into a
+					list using this field */
+#ifdef UNIV_DEBUG
+	ulint		magic_n;	/*!< magic number, used in
+					debug assertions */
+/** Value of dtuple_struct::magic_n */
+# define		DATA_TUPLE_MAGIC_N	65478679
+#endif /* UNIV_DEBUG */
+};
+
+/** A slot for a field in a big rec vector */
+typedef struct big_rec_field_struct	big_rec_field_t;
+/** A slot for a field in a big rec vector */
+struct big_rec_field_struct {
+	ulint		field_no;	/*!< field number in record */
+	ulint		len;		/*!< stored data length, in bytes */
+	const void*	data;		/*!< stored data */
+};
+
+/** Storage format for overflow data in a big record, that is, a
+clustered index record which needs external storage of data fields */
+struct big_rec_struct {
+	mem_heap_t*	heap;		/*!< memory heap from which
+					allocated */
+	ulint		n_fields;	/*!< number of stored fields */
+	big_rec_field_t*fields;		/*!< stored fields */
+};
+
+#ifndef UNIV_NONINL
+#include "data0data.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/data0data.ic b/storage/xtradb/include/data0data.ic
new file mode 100644
index 00000000000..da79aa33702
--- /dev/null
+++ b/storage/xtradb/include/data0data.ic
@@ -0,0 +1,612 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0data.ic
+SQL data field and tuple
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#include "ut0rnd.h"
+
+#ifdef UNIV_DEBUG
+/** Dummy variable to catch access to uninitialized fields.  In the
+debug version, dtuple_create() will make all fields of dtuple_t point
+to data_error. */
+extern byte data_error;
+
+/*********************************************************************//**
+Gets a pointer to the type struct embedded in an SQL data field.
+@return	pointer to field->type, with the const qualifier cast away */
+UNIV_INLINE
+dtype_t*
+dfield_get_type(
+/*============*/
+	const dfield_t*	field)	/*!< in: SQL data field */
+{
+	ut_ad(field);
+
+	return((dtype_t*) &(field->type));
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Sets the type of an SQL data field by copying *type into field->type. */
+UNIV_INLINE
+void
+dfield_set_type(
+/*============*/
+	dfield_t*	field,	/*!< in/out: SQL data field */
+	dtype_t*	type)	/*!< in: data type struct to copy from */
+{
+	ut_ad(field && type);
+
+	field->type = *type;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets a pointer to the data in a field.
+@return	pointer to data, with the const qualifier cast away */
+UNIV_INLINE
+void*
+dfield_get_data(
+/*============*/
+	const dfield_t* field)	/*!< in: field */
+{
+	ut_ad(field);
+	ut_ad((field->len == UNIV_SQL_NULL)
+	      || (field->data != &data_error));
+
+	return((void*) field->data);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Gets the length of the data in a field, in bytes.
+@return	length of data; UNIV_SQL_NULL if SQL null data */
+UNIV_INLINE
+ulint
+dfield_get_len(
+/*===========*/
+	const dfield_t*	field)	/*!< in: field */
+{
+	ut_ad(field);
+	ut_ad((field->len == UNIV_SQL_NULL)
+	      || (field->data != &data_error));
+
+	return(field->len);
+}
+
+/*********************************************************************//**
+Sets the length of a field and clears its "externally stored" flag. */
+UNIV_INLINE
+void
+dfield_set_len(
+/*===========*/
+	dfield_t*	field,	/*!< in/out: field */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+{
+	ut_ad(field);
+#ifdef UNIV_VALGRIND_DEBUG
+	if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(field->data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+
+	field->ext = 0;
+	field->len = len;
+}
+
+/*********************************************************************//**
+Determines if a field is SQL NULL, i.e. its length is UNIV_SQL_NULL.
+@return	nonzero if SQL null data */
+UNIV_INLINE
+ulint
+dfield_is_null(
+/*===========*/
+	const dfield_t* field)	/*!< in: field */
+{
+	ut_ad(field);
+
+	return(field->len == UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Determines if a field is externally stored.
+@return	nonzero if the ext flag of the field is set */
+UNIV_INLINE
+ulint
+dfield_is_ext(
+/*==========*/
+	const dfield_t* field)	/*!< in: field */
+{
+	ut_ad(field);
+
+	return(UNIV_UNLIKELY(field->ext));
+}
+
+/*********************************************************************//**
+Sets the "external storage" flag (ext) of a field to 1. */
+UNIV_INLINE
+void
+dfield_set_ext(
+/*===========*/
+	dfield_t*	field)	/*!< in/out: field */
+{
+	ut_ad(field);
+
+	field->ext = 1;
+}
+
+/*********************************************************************//**
+Sets the data pointer and length of a field, and clears its ext flag. */
+UNIV_INLINE
+void
+dfield_set_data(
+/*============*/
+	dfield_t*	field,	/*!< in/out: field */
+	const void*	data,	/*!< in: data; stored with const cast away */
+	ulint		len)	/*!< in: length or UNIV_SQL_NULL */
+{
+	ut_ad(field);
+
+#ifdef UNIV_VALGRIND_DEBUG
+	if (len != UNIV_SQL_NULL) UNIV_MEM_ASSERT_RW(data, len);
+#endif /* UNIV_VALGRIND_DEBUG */
+	field->data = (void*) data;
+	field->ext = 0;
+	field->len = len;
+}
+
+/*********************************************************************//**
+Sets a data field to SQL NULL: data = NULL, len = UNIV_SQL_NULL, ext = 0. */
+UNIV_INLINE
+void
+dfield_set_null(
+/*============*/
+	dfield_t*	field)	/*!< in/out: field */
+{
+	dfield_set_data(field, NULL, UNIV_SQL_NULL);
+}
+
+/*********************************************************************//**
+Copies the data pointer, len and ext fields; the type is left untouched. */
+UNIV_INLINE
+void
+dfield_copy_data(
+/*=============*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+{
+	ut_ad(field1 && field2);
+
+	field1->ext = field2->ext;
+	field1->len = field2->len;
+	field1->data = field2->data;
+}
+
+/*********************************************************************//**
+Copies a whole data field, including its type, by struct assignment. */
+UNIV_INLINE
+void
+dfield_copy(
+/*========*/
+	dfield_t*	field1,	/*!< out: field to copy to */
+	const dfield_t*	field2)	/*!< in: field to copy from */
+{
+	*field1 = *field2;
+}
+
+/*********************************************************************//**
+Duplicates the data of a non-NULL field in heap and repoints the field. */
+UNIV_INLINE
+void
+dfield_dup(
+/*=======*/
+	dfield_t*	field,	/*!< in/out: data field */
+	mem_heap_t*	heap)	/*!< in: memory heap where allocated */
+{
+	if (!dfield_is_null(field)) {
+		UNIV_MEM_ASSERT_RW(field->data, field->len);
+		field->data = mem_heap_dup(heap, field->data, field->len);
+	}
+}
+
+/*********************************************************************//**
+Compares the length and the data bytes of two data fields.
+@return	TRUE if equal; two SQL NULL fields compare equal */
+UNIV_INLINE
+ibool
+dfield_datas_are_binary_equal(
+/*==========================*/
+	const dfield_t*	field1,	/*!< in: field */
+	const dfield_t*	field2)	/*!< in: field */
+{
+	const ulint	len = field1->len;
+
+	if (len != field2->len) {
+		return(FALSE);
+	}
+	return(len == UNIV_SQL_NULL
+	       || !memcmp(field1->data, field2->data, len));
+}
+
+/*********************************************************************//**
+Returns the info_bits member of a data tuple.
+@return	info bits */
+UNIV_INLINE
+ulint
+dtuple_get_info_bits(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ut_ad(tuple);
+
+	return(tuple->info_bits);
+}
+
+/*********************************************************************//**
+Overwrites the info_bits member of a data tuple. */
+UNIV_INLINE
+void
+dtuple_set_info_bits(
+/*=================*/
+	dtuple_t*	tuple,		/*!< in/out: tuple */
+	ulint		info_bits)	/*!< in: info bits */
+{
+	ut_ad(tuple);
+
+	tuple->info_bits = info_bits;
+}
+
+/*********************************************************************//**
+Returns the n_fields_cmp member: the number of fields used in comparisons.
+@return	number of fields used in comparisons in rem0cmp.* */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields_cmp(
+/*====================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ut_ad(tuple);
+
+	return(tuple->n_fields_cmp);
+}
+
+/*********************************************************************//**
+Sets number of fields used in record comparisons. */
+UNIV_INLINE
+void
+dtuple_set_n_fields_cmp(
+/*====================*/
+	dtuple_t*	tuple,		/*!< in/out: tuple */
+	ulint		n_fields_cmp)	/*!< in: number of fields used in
+					rem0cmp.* comparisons; <= n_fields */
+{
+	ut_ad(tuple);
+	ut_ad(n_fields_cmp <= tuple->n_fields);
+
+	tuple->n_fields_cmp = n_fields_cmp;
+}
+
+/*********************************************************************//**
+Returns the n_fields member of a data tuple.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_fields(
+/*================*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ut_ad(tuple);
+
+	return(tuple->n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Gets nth field of a tuple.
+@return	pointer to the nth field, with the const qualifier cast away */
+UNIV_INLINE
+dfield_t*
+dtuple_get_nth_field(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: tuple */
+	ulint		n)	/*!< in: index of field, < n_fields */
+{
+	ut_ad(tuple);
+	ut_ad(n < tuple->n_fields);
+
+	return((dfield_t*) tuple->fields + n);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************//**
+Creates a data tuple and its field array in one allocation from a memory
+heap. The number of fields used in record comparisons defaults to n_fields.
+@return	own: created tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_create(
+/*==========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where the tuple
+				is created */
+	ulint		n_fields) /*!< in: number of fields */
+{
+	dtuple_t*	tuple;
+
+	ut_ad(heap);
+
+	tuple = (dtuple_t*) mem_heap_alloc(heap, sizeof(dtuple_t)
+					   + n_fields * sizeof(dfield_t));
+	tuple->info_bits = 0;
+	tuple->n_fields = n_fields;
+	tuple->n_fields_cmp = n_fields;
+	tuple->fields = (dfield_t*) &tuple[1];
+
+#ifdef UNIV_DEBUG
+	tuple->magic_n = DATA_TUPLE_MAGIC_N;
+
+	{	/* Debug builds only: make every field an SQL NULL of type
+		DATA_ERROR whose data points to data_error */
+		ulint	i;
+
+		for (i = 0; i < n_fields; i++) {
+			dfield_t*       field;
+
+			field = dtuple_get_nth_field(tuple, i);
+
+			dfield_set_len(field, UNIV_SQL_NULL);
+			field->data = &data_error;
+			dfield_get_type(field)->mtype = DATA_ERROR;
+		}
+	}
+
+	UNIV_MEM_INVALID(tuple->fields, n_fields * sizeof *tuple->fields);
+#endif
+	return(tuple);
+}
+
+/**********************************************************//**
+Wraps an existing array of data fields in a tuple. The number of fields
+used in record comparisons for this tuple defaults to n_fields.
+@return	data tuple, i.e. the tuple argument itself */
+UNIV_INLINE
+const dtuple_t*
+dtuple_from_fields(
+/*===============*/
+	dtuple_t*	tuple,		/*!< out: storage for data tuple */
+	const dfield_t*	fields,		/*!< in: fields; not copied */
+	ulint		n_fields)	/*!< in: number of fields */
+{
+	tuple->info_bits = 0;
+	tuple->n_fields = tuple->n_fields_cmp = n_fields;
+	tuple->fields = (dfield_t*) fields;
+	ut_d(tuple->magic_n = DATA_TUPLE_MAGIC_N);
+
+	return(tuple);
+}
+
+/*********************************************************************//**
+Makes a shallow copy of a data tuple: the field structs are copied but the
+data they point to is not (use dfield_dup() on each field for a deep copy).
+info_bits and n_fields_cmp are NOT copied; they get dtuple_create() defaults.
+@return	own: copy of tuple */
+UNIV_INLINE
+dtuple_t*
+dtuple_copy(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: tuple to copy from */
+	mem_heap_t*	heap)	/*!< in: heap where the copy is created */
+{
+	const ulint	n	= dtuple_get_n_fields(tuple);
+	dtuple_t*	copy	= dtuple_create(heap, n);
+	ulint		i;
+
+	for (i = 0; i < n; i++) {
+		dfield_copy(dtuple_get_nth_field(copy, i),
+			    dtuple_get_nth_field(tuple, i));
+	}
+
+	return(copy);
+}
+
+/**********************************************************//**
+Sums the data lengths of all fields of a tuple. The tuple struct and the
+field structs themselves are not counted, and neither is any space taken
+by externally stored parts of a field.
+@return	sum of data lengths */
+UNIV_INLINE
+ulint
+dtuple_get_data_size(
+/*=================*/
+	const dtuple_t*	tuple,	/*!< in: typed data tuple */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	ulint	total	= 0;
+	ulint	n;
+	ulint	i;
+
+	ut_ad(tuple);
+	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+	n = tuple->n_fields;
+
+	for (i = 0; i < n; i++) {
+		const dfield_t*	field	= dtuple_get_nth_field(tuple,  i);
+		ulint		len	= dfield_get_len(field);
+
+		if (len == UNIV_SQL_NULL) {
+			/* an SQL NULL is counted with the NULL size
+			of its type */
+			len = dtype_get_sql_null_size(dfield_get_type(field),
+						      comp);
+		}
+
+		total += len;
+	}
+
+	return(total);
+}
+
+/*********************************************************************//**
+Computes the number of externally stored fields in a data tuple.
+@return	number of externally stored fields */
+UNIV_INLINE
+ulint
+dtuple_get_n_ext(
+/*=============*/
+	const dtuple_t*	tuple)	/*!< in: tuple */
+{
+	ulint	n_ext	= 0;
+	ulint	i;
+
+	ut_ad(tuple);
+	ut_ad(dtuple_check_typed(tuple));
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+
+	/* tuple is dereferenced only after the assertions above */
+	for (i = 0; i < tuple->n_fields; i++) {
+		n_ext += dtuple_get_nth_field(tuple, i)->ext;
+	}
+
+	return(n_ext);
+}
+
+/*******************************************************************//**
+Sets the type of the first n fields of a tuple to DATA_BINARY. */
+UNIV_INLINE
+void
+dtuple_set_types_binary(
+/*====================*/
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	ulint		n)	/*!< in: number of fields to set */
+{
+	ulint	i;
+
+	for (i = 0; i < n; i++) {
+		dfield_t*	field = dtuple_get_nth_field(tuple, i);
+
+		dtype_set(dfield_get_type(field), DATA_BINARY, 0, 0);
+	}
+}
+
+/************************************************************//**
+Folds a tuple prefix of n_fields whole fields and n_bytes of the next field.
+@return	the folded value; SQL NULL fields do not contribute to it */
+UNIV_INLINE
+ulint
+dtuple_fold(
+/*========*/
+	const dtuple_t*	tuple,	/*!< in: the tuple */
+	ulint		n_fields,/*!< in: number of complete fields to fold */
+	ulint		n_bytes,/*!< in: number of bytes to fold in an
+				incomplete last field */
+	dulint		tree_id)/*!< in: index tree id; seeds the fold */
+{
+	const dfield_t*	field;
+	ulint		i;
+	const byte*	data;
+	ulint		len;
+	ulint		fold;
+
+	ut_ad(tuple);
+	ut_ad(tuple->magic_n == DATA_TUPLE_MAGIC_N);
+	ut_ad(dtuple_check_typed(tuple));
+
+	fold = ut_fold_dulint(tree_id);
+
+	for (i = 0; i < n_fields; i++) {
+		field = dtuple_get_nth_field(tuple, i);
+
+		data = (const byte*) dfield_get_data(field);
+		len = dfield_get_len(field);
+
+		if (len != UNIV_SQL_NULL) {
+			fold = ut_fold_ulint_pair(fold,
+						  ut_fold_binary(data, len));
+		}
+	}
+
+	if (n_bytes > 0) {
+		field = dtuple_get_nth_field(tuple, i);
+
+		data = (const byte*) dfield_get_data(field);
+		len = dfield_get_len(field);
+
+		if (len != UNIV_SQL_NULL) {
+			if (len > n_bytes) {
+				len = n_bytes;
+			}
+
+			fold = ut_fold_ulint_pair(fold,
+						  ut_fold_binary(data, len));
+		}
+	}
+
+	return(fold);
+}
+
+/**********************************************************************//**
+Writes an SQL null field by filling len bytes of data with zeros. */
+UNIV_INLINE
+void
+data_write_sql_null(
+/*================*/
+	byte*	data,	/*!< out: pointer to a buffer of size len */
+	ulint	len)	/*!< in: SQL null size in bytes */
+{
+	memset(data, 0, len);
+}
+
+/**********************************************************************//**
+Tells whether at least one field of a dtuple is SQL NULL.
+@return	TRUE if some field is SQL null */
+UNIV_INLINE
+ibool
+dtuple_contains_null(
+/*=================*/
+	const dtuple_t*	tuple)	/*!< in: dtuple */
+{
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint		i;
+
+	for (i = 0; i < n_fields; i++) {
+		const dfield_t*	field = dtuple_get_nth_field(tuple, i);
+
+		if (dfield_is_null(field)) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/**************************************************************//**
+Frees a big rec vector by freeing the memory heap it points to. */
+UNIV_INLINE
+void
+dtuple_big_rec_free(
+/*================*/
+	big_rec_t*	vector)	/*!< in, own: big rec vector; it is
+				freed in this function */
+{
+	mem_heap_free(vector->heap);
+}
diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h
new file mode 100644
index 00000000000..a73bed3a9f5
--- /dev/null
+++ b/storage/xtradb/include/data0type.h
@@ -0,0 +1,486 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.h
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef data0type_h
+#define data0type_h
+
+#include "univ.i"
+
+extern ulint	data_mysql_default_charset_coll;
+#define DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL 8
+#define DATA_MYSQL_BINARY_CHARSET_COLL 63
+
+/* SQL data type struct */
+typedef struct dtype_struct		dtype_t;
+
+/*-------------------------------------------*/
+/* The 'MAIN TYPE' of a column */
+#define	DATA_VARCHAR	1	/* character varying of the
+				latin1_swedish_ci charset-collation; note
+				that the MySQL format for this, DATA_BINARY,
+				DATA_VARMYSQL, is also affected by whether the
+				'precise type' contains
+				DATA_MYSQL_TRUE_VARCHAR */
+#define DATA_CHAR	2	/* fixed length character of the
+				latin1_swedish_ci charset-collation */
+#define DATA_FIXBINARY	3	/* binary string of fixed length */
+#define DATA_BINARY	4	/* binary string */
+#define DATA_BLOB	5	/* binary large object, or a TEXT type;
+				if prtype & DATA_BINARY_TYPE == 0, then this is
+				actually a TEXT column (or a BLOB created
+				with < 4.0.14; since column prefix indexes
+				came only in 4.0.14, the missing flag in BLOBs
+				created before that does not cause any harm) */
+#define	DATA_INT	6	/* integer: can be any size 1 - 8 bytes */
+#define	DATA_SYS_CHILD	7	/* address of the child page in node pointer */
+#define	DATA_SYS	8	/* system column */
+
+/* Data types >= DATA_FLOAT must be compared using the whole field, not as
+binary strings */
+
+#define DATA_FLOAT	9
+#define DATA_DOUBLE	10
+#define DATA_DECIMAL	11	/* decimal number stored as an ASCII string */
+#define	DATA_VARMYSQL	12	/* any charset varying length char */
+#define	DATA_MYSQL	13	/* any charset fixed length char */
+				/* NOTE that 4.1.1 used DATA_MYSQL and
+				DATA_VARMYSQL for all character sets, and the
+				charset-collation for tables created with it
+				can also be latin1_swedish_ci */
+#define DATA_MTYPE_MAX	63	/* dtype_store_for_order_and_null_size()
+				requires the values are <= 63 */
+/*-------------------------------------------*/
+/* The 'PRECISE TYPE' of a column */
+/*
+Tables created by a MySQL user have the following convention:
+
+- In the least significant byte in the precise type we store the MySQL type
+code (not applicable for system columns).
+
+- In the second least significant byte we OR flags DATA_NOT_NULL,
+DATA_UNSIGNED, DATA_BINARY_TYPE.
+
+- In the third least significant byte of the precise type of string types we
+store the MySQL charset-collation code. In DATA_BLOB columns created with
+< 4.0.14 we do not actually know if it is a BLOB or a TEXT column. Since there
+are no indexes on prefixes of BLOB or TEXT columns in < 4.0.14, this is no
+problem, though.
+
+Note that versions < 4.1.2 or < 5.0.1 did not store the charset code to the
+precise type, since the charset was always the default charset of the MySQL
+installation. If the stored charset code is 0 in the system table SYS_COLUMNS
+of InnoDB, that means that the default charset of this MySQL installation
+should be used.
+
+When loading a table definition from the system tables to the InnoDB data
+dictionary cache in main memory, InnoDB versions >= 4.1.2 and >= 5.0.1 check
+if the stored charset-collation is 0, and if that is the case and the type is
+a non-binary string, replace that 0 by the default charset-collation code of
+this MySQL installation. In short, in old tables, the charset-collation code
+in the system tables on disk can be 0, but in in-memory data structures
+(dtype_t), the charset-collation code is always != 0 for non-binary string
+types.
+
+In new tables, in binary string types, the charset-collation code is the
+MySQL code for the 'binary charset', that is, != 0.
+
+For binary string types and for DATA_CHAR, DATA_VARCHAR, and for those
+DATA_BLOB which are binary or have the charset-collation latin1_swedish_ci,
+InnoDB performs all comparisons internally, without resorting to the MySQL
+comparison functions. This is to save CPU time.
+
+InnoDB's own internal system tables have different precise types for their
+columns, and for them the precise type is usually not used at all.
+*/
+
+#define DATA_ENGLISH	4	/* English language character string: this
+				is a relic from pre-MySQL time and only used
+				for InnoDB's own system tables */
+#define DATA_ERROR	111	/* another relic from pre-MySQL time */
+
+#define DATA_MYSQL_TYPE_MASK 255 /* AND with this mask to extract the MySQL
+				 type from the precise type */
+#define DATA_MYSQL_TRUE_VARCHAR 15 /* MySQL type code for the >= 5.0.3
+				   format true VARCHAR */
+
+/* Precise data types for system columns and the length of those columns;
+NOTE: the values must run from 0 up in the order given! All codes must
+be less than 256 */
+#define	DATA_ROW_ID	0	/* row id: a dulint */
+#define DATA_ROW_ID_LEN	6	/* stored length for row id */
+
+#define DATA_TRX_ID	1	/* transaction id: 6 bytes */
+#define DATA_TRX_ID_LEN	6
+
+#define	DATA_ROLL_PTR	2	/* rollback data pointer: 7 bytes */
+#define DATA_ROLL_PTR_LEN 7
+
+#define	DATA_N_SYS_COLS 3	/* number of system columns defined above */
+
+#define DATA_SYS_PRTYPE_MASK 0xF /* mask to extract the above from prtype */
+
+/* Flags ORed to the precise data type */
+#define DATA_NOT_NULL	256	/* this is ORed to the precise type when
+				the column is declared as NOT NULL */
+#define DATA_UNSIGNED	512	/* this is ORed to the precise type when
+				we have an unsigned integer type */
+#define	DATA_BINARY_TYPE 1024	/* if the data type is a binary character
+				string, this is ORed to the precise type:
+				this only holds for tables created with
+				>= MySQL-4.0.14 */
+/* #define	DATA_NONLATIN1	2048 This is a relic from < 4.1.2 and < 5.0.1.
+				In earlier versions this was set for some
+				BLOB columns.
+*/
+#define	DATA_LONG_TRUE_VARCHAR 4096	/* this is ORed to the precise data
+				type when the column is true VARCHAR where
+				MySQL uses 2 bytes to store the data len;
+				for shorter VARCHARs MySQL uses only 1 byte */
+/*-------------------------------------------*/
+
+/* This many bytes we need to store the type information affecting the
+alphabetical order for a single field and decide the storage size of an
+SQL null*/
+#define DATA_ORDER_NULL_TYPE_BUF_SIZE		4
+/* In the >= 4.1.x storage format we add 2 bytes more so that we can also
+store the charset-collation number; one byte is left unused, though */
+#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE	6
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the MySQL type code from a dtype.
+@return	MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+	const dtype_t*	type);	/*!< in: type struct */
+/*********************************************************************//**
+Determine how many bytes the first n characters of the given string occupy.
+If the string is shorter than n characters, returns the number of bytes
+the characters in the string occupy.
+@return	length of the prefix, in bytes */
+UNIV_INTERN
+ulint
+dtype_get_at_most_n_mbchars(
+/*========================*/
+	ulint		prtype,		/*!< in: precise type */
+	ulint		mbminlen,	/*!< in: minimum length of a
+					multi-byte character */
+	ulint		mbmaxlen,	/*!< in: maximum length of a
+					multi-byte character */
+	ulint		prefix_len,	/*!< in: length of the requested
+					prefix, in characters, multiplied by
+					dtype_get_mbmaxlen(dtype) */
+	ulint		data_len,	/*!< in: length of str (in bytes) */
+	const char*	str);		/*!< in: the string whose prefix
+					length is being determined */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Checks if a data main type is a string type. Also a BLOB is considered a
+string type.
+@return	TRUE if string type */
+UNIV_INTERN
+ibool
+dtype_is_string_type(
+/*=================*/
+	ulint	mtype);	/*!< in: InnoDB main data type code: DATA_CHAR, ... */
+/*********************************************************************//**
+Checks if a type is a binary string type. Note that for tables created with
+< 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column. For
+those DATA_BLOB columns this function currently returns FALSE.
+@return	TRUE if binary string type */
+UNIV_INTERN
+ibool
+dtype_is_binary_string_type(
+/*========================*/
+	ulint	mtype,	/*!< in: main data type */
+	ulint	prtype);/*!< in: precise type */
+/*********************************************************************//**
+Checks if a type is a non-binary string type. That is, dtype_is_string_type is
+TRUE and dtype_is_binary_string_type is FALSE. Note that for tables created
+with < 4.0.14, we do not know if a DATA_BLOB column is a BLOB or a TEXT column.
+For those DATA_BLOB columns this function currently returns TRUE.
+@return	TRUE if non-binary string type */
+UNIV_INTERN
+ibool
+dtype_is_non_binary_string_type(
+/*============================*/
+	ulint	mtype,	/*!< in: main data type */
+	ulint	prtype);/*!< in: precise type */
+/*********************************************************************//**
+Sets a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+	dtype_t*	type,	/*!< in/out: type struct to init */
+	ulint		mtype,	/*!< in: main data type */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len);	/*!< in: precision of type */
+/*********************************************************************//**
+Copies a data type structure. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+	dtype_t*	type1,	/*!< out: type struct to copy to */
+	const dtype_t*	type2);	/*!< in: type struct to copy from */
+/*********************************************************************//**
+Gets the SQL main data type.
+@return	SQL main data type */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+	const dtype_t*	type);	/*!< in: data type */
+/*********************************************************************//**
+Gets the precise data type.
+@return	precise data type */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+	const dtype_t*	type);	/*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Compute the mbminlen and mbmaxlen members of a data type structure. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type (and collation) */
+	ulint*	mbminlen,	/*!< out: minimum length of a
+				multi-byte character */
+	ulint*	mbmaxlen);	/*!< out: maximum length of a
+				multi-byte character */
+/*********************************************************************//**
+Gets the MySQL charset-collation code for MySQL string types.
+@return	MySQL charset-collation code */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+	ulint	prtype);/*!< in: precise data type */
+/*********************************************************************//**
+Forms a precise type from the < 4.1.2 format precise type plus the
+charset-collation code.
+@return precise type, including the charset-collation code */
+UNIV_INTERN
+ulint
+dtype_form_prtype(
+/*==============*/
+	ulint	old_prtype,	/*!< in: the MySQL type code and the flags
+				DATA_BINARY_TYPE etc. */
+	ulint	charset_coll);	/*!< in: MySQL charset-collation code */
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8.  This function
+may return false negatives, in case further character-set collation
+codes are introduced in MySQL later.
+@return	TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+	ulint	prtype);/*!< in: precise data type */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the type length.
+@return	fixed length of the type, in bytes, or 0 if variable-length */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+	const dtype_t*	type);	/*!< in: data type */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+	const dtype_t*	type);	/*!< in: type */
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes, or 0 if this is not a
+character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+	const dtype_t*	type);	/*!< in: type */
+/*********************************************************************//**
+Gets the padding character code for the type.
+@return	padding character code, or ULINT_UNDEFINED if no padding specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype);	/*!< in: precise type */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return	fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a multibyte char */
+	ulint	mbmaxlen,	/*!< in: maximum length of a multibyte char */
+	ulint	comp);		/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return	minimum size */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a multibyte char */
+	ulint	mbmaxlen);	/*!< in: maximum length of a multibyte char */
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return	maximum size */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	len);		/*!< in: length */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return	SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+	const dtype_t*	type,	/*!< in: type */
+	ulint		comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/*!< in/out: type struct */
+	const byte*	buf);	/*!< in: buffer for the stored order info */
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+	byte*		buf,	/*!< out: buffer for
+				DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+				bytes where we store the info */
+	const dtype_t*	type,	/*!< in: type struct */
+	ulint		prefix_len);/*!< in: prefix length to
+				replace type->len, or 0 */
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+	dtype_t*	type,	/*!< in/out: type struct */
+	const byte*	buf);	/*!< in: buffer for stored type order info */
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Validates a data type structure.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dtype_validate(
+/*===========*/
+	const dtype_t*	type);	/*!< in: type struct to validate */
+/*********************************************************************//**
+Prints a data type structure. */
+UNIV_INTERN
+void
+dtype_print(
+/*========*/
+	const dtype_t*	type);	/*!< in: type */
+
+/* Structure for an SQL data type.
+If you add fields to this structure, be sure to initialize them everywhere.
+This structure is initialized in the following functions:
+dtype_set()
+dtype_read_for_order_and_null_size()
+dtype_new_read_for_order_and_null_size()
+sym_tab_add_null_lit() */
+
+struct dtype_struct{
+	unsigned	mtype:8;	/*!< main data type */
+	unsigned	prtype:24;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+#ifndef UNIV_HOTBACKUP
+	unsigned	mbminlen:2;	/*!< minimum length of a
+					character, in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a
+					character, in bytes */
+#endif /* !UNIV_HOTBACKUP */
+};
+
+#ifndef UNIV_NONINL
+#include "data0type.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
new file mode 100644
index 00000000000..2bf67a941bd
--- /dev/null
+++ b/storage/xtradb/include/data0type.ic
@@ -0,0 +1,603 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/data0type.ic
+Data types
+
+Created 1/16/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifndef UNIV_HOTBACKUP
+# include "ha_prototypes.h"
+
+/*********************************************************************//**
+Gets the charset-collation code (bits 16..23 of prtype) of MySQL string types.
+@return	MySQL charset-collation code (0..255) */
+UNIV_INLINE
+ulint
+dtype_get_charset_coll(
+/*===================*/
+	ulint	prtype)	/*!< in: precise data type */
+{
+	return((prtype >> 16) & 0xFFUL);
+}
+
+/*********************************************************************//**
+Determines if a MySQL string type is a subset of UTF-8, by checking its
+charset-collation code against a fixed list.  May return false negatives
+if further character-set collation codes are introduced in MySQL later.
+@return	TRUE if a subset of UTF-8 */
+UNIV_INLINE
+ibool
+dtype_is_utf8(
+/*==========*/
+	ulint	prtype)	/*!< in: precise data type */
+{
+	/* The collation codes below were copied from
+	strings/ctype-extra.c and strings/ctype-utf8.c. */
+	const ulint	coll = dtype_get_charset_coll(prtype);
+
+	if (coll == 11		/* ascii_general_ci */
+	    || coll == 65	/* ascii_bin */
+	    || coll == 33	/* utf8_general_ci */
+	    || coll == 83	/* utf8_bin */
+	    || coll == 254) {	/* utf8_general_cs */
+		return(TRUE);
+	}
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the MySQL type code from a dtype; this is the low 8 bits of prtype.
+@return	MySQL type code; this is NOT an InnoDB type code! */
+UNIV_INLINE
+ulint
+dtype_get_mysql_type(
+/*=================*/
+	const dtype_t*	type)	/*!< in: type struct */
+{
+	return(type->prtype & 0xFFUL);
+}
+
+/*********************************************************************//**
+Computes mbminlen and mbmaxlen for a type; both are 0 for non-string types. */
+UNIV_INLINE
+void
+dtype_get_mblen(
+/*============*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type (and collation) */
+	ulint*	mbminlen,	/*!< out: minimum length of a
+				multi-byte character */
+	ulint*	mbmaxlen)	/*!< out: maximum length of a
+				multi-byte character */
+{
+	if (dtype_is_string_type(mtype)) {
+		innobase_get_cset_width(dtype_get_charset_coll(prtype),
+					mbminlen, mbmaxlen);
+		ut_ad(*mbminlen <= *mbmaxlen);
+		ut_ad(*mbminlen <= 2); /* mbminlen in dtype_t is 0..3 */
+		ut_ad(*mbmaxlen < 1 << 3); /* mbmaxlen in dtype_t is 0..7 */
+	} else {
+		*mbminlen = *mbmaxlen = 0;
+	}
+}
+
+/*********************************************************************//**
+Sets the mbminlen and mbmaxlen members of a type from its mtype and prtype. */
+UNIV_INLINE
+void
+dtype_set_mblen(
+/*============*/
+	dtype_t*	type)	/*!< in/out: type */
+{
+	ulint	min_len;
+	ulint	max_len;
+
+	dtype_get_mblen(type->mtype, type->prtype, &min_len, &max_len);
+	type->mbmaxlen = max_len;
+	type->mbminlen = min_len;
+
+	ut_ad(dtype_validate(type));
+}
+#else /* !UNIV_HOTBACKUP */
+# define dtype_set_mblen(type) (void) 0
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Sets all members of a data type structure. */
+UNIV_INLINE
+void
+dtype_set(
+/*======*/
+	dtype_t*	type,	/*!< out: type struct to init */
+	ulint		mtype,	/*!< in: main data type */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len)	/*!< in: length of type */
+{
+	ut_ad(type);
+	ut_ad(mtype <= DATA_MTYPE_MAX);
+
+	type->mtype = mtype;
+	type->prtype = prtype;
+	type->len = len;
+
+	dtype_set_mblen(type);
+}
+
+/*********************************************************************//**
+Copies a data type structure by plain struct assignment. */
+UNIV_INLINE
+void
+dtype_copy(
+/*=======*/
+	dtype_t*	type1,	/*!< out: type struct to copy to */
+	const dtype_t*	type2)	/*!< in: type struct to copy from */
+{
+	*type1 = *type2;
+
+	ut_ad(dtype_validate(type1));
+}
+
+/*********************************************************************//**
+Gets the SQL main data type.
+@return	SQL main data type (the mtype member) */
+UNIV_INLINE
+ulint
+dtype_get_mtype(
+/*============*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+
+	return(type->mtype);
+}
+
+/*********************************************************************//**
+Gets the precise data type.
+@return	precise data type (the prtype member) */
+UNIV_INLINE
+ulint
+dtype_get_prtype(
+/*=============*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+
+	return(type->prtype);
+}
+
+/*********************************************************************//**
+Gets the type length.
+@return	length of the type as stored in type->len */
+UNIV_INLINE
+ulint
+dtype_get_len(
+/*==========*/
+	const dtype_t*	type)	/*!< in: data type */
+{
+	ut_ad(type);
+
+	return(type->len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Gets the minimum length of a character, in bytes.
+@return minimum length of a char, in bytes (type->mbminlen), or 0 if this
+is not a character type */
+UNIV_INLINE
+ulint
+dtype_get_mbminlen(
+/*===============*/
+	const dtype_t*	type)	/*!< in: type */
+{
+	ut_ad(type);
+	return(type->mbminlen);
+}
+/*********************************************************************//**
+Gets the maximum length of a character, in bytes.
+@return maximum length of a char, in bytes (type->mbmaxlen), or 0 if this
+is not a character type */
+UNIV_INLINE
+ulint
+dtype_get_mbmaxlen(
+/*===============*/
+	const dtype_t*	type)	/*!< in: type */
+{
+	ut_ad(type);
+	return(type->mbmaxlen);
+}
+
+/*********************************************************************//**
+Gets the padding character code for a type.
+@return	space (0x20), or ULINT_UNDEFINED if no padding is specified */
+UNIV_INLINE
+ulint
+dtype_get_pad_char(
+/*===============*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype)		/*!< in: precise type */
+{
+	switch (mtype) {
+	case DATA_FIXBINARY:
+	case DATA_BINARY:
+		if (UNIV_UNLIKELY(dtype_get_charset_coll(prtype)
+				  == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+			/* Starting from 5.0.18, do not pad
+			VARBINARY or BINARY columns. */
+			return(ULINT_UNDEFINED);
+		}
+		/* Fall through: other collations are space-padded */
+	case DATA_CHAR:
+	case DATA_VARCHAR:
+	case DATA_MYSQL:
+	case DATA_VARMYSQL:
+		/* Space is the padding character for all char and binary
+		strings, and starting from 5.0.3, also for TEXT strings. */
+
+		return(0x20);
+	case DATA_BLOB:
+		if (!(prtype & DATA_BINARY_TYPE)) {
+			return(0x20);
+		}
+		/* Fall through: binary BLOBs are not padded */
+	default:
+		/* No padding specified */
+		return(ULINT_UNDEFINED);
+	}
+}
+
+/**********************************************************************//**
+Stores for a type the information which determines its alphabetical ordering
+and the storage size of an SQL NULL value. This is the >= 4.1.x storage
+format. */
+UNIV_INLINE
+void
+dtype_new_store_for_order_and_null_size(
+/*====================================*/
+	byte*		buf,	/*!< out: buffer of
+				DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+				bytes where we store the info */
+	const dtype_t*	type,	/*!< in: type struct */
+	ulint		prefix_len)/*!< in: prefix length to
+				replace type->len, or 0 */
+{
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+	ulint	len;
+
+	ut_ad(type);
+	ut_ad(type->mtype >= DATA_VARCHAR);
+	ut_ad(type->mtype <= DATA_MYSQL);
+
+	buf[0] = (byte)(type->mtype & 0xFFUL);
+
+	if (type->prtype & DATA_BINARY_TYPE) {
+		buf[0] = buf[0] | 128;
+	}
+
+	/* In versions < 4.1.2 we had:	if (type->prtype & DATA_NONLATIN1) {
+	buf[0] = buf[0] | 64;
+	}
+	*/
+
+	buf[1] = (byte)(type->prtype & 0xFFUL);
+
+	len = prefix_len ? prefix_len : type->len;
+
+	mach_write_to_2(buf + 2, len & 0xFFFFUL);
+
+	ut_ad(dtype_get_charset_coll(type->prtype) < 256);
+	mach_write_to_2(buf + 4, dtype_get_charset_coll(type->prtype));
+
+	if (type->prtype & DATA_NOT_NULL) {
+		buf[4] |= 128;
+	}
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the < 4.1.x
+storage format; the default charset-collation code is filled in. */
+UNIV_INLINE
+void
+dtype_read_for_order_and_null_size(
+/*===============================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf)	/*!< in: buffer for stored type order info */
+{
+#if 4 != DATA_ORDER_NULL_TYPE_BUF_SIZE
+# error "4 != DATA_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+	type->mtype = buf[0] & 63;
+	type->prtype = buf[1];
+
+	if (buf[0] & 128) {
+		type->prtype = type->prtype | DATA_BINARY_TYPE;
+	}
+
+	type->len = mach_read_from_2(buf + 2);
+
+	type->prtype = dtype_form_prtype(type->prtype,
+					 data_mysql_default_charset_coll);
+	dtype_set_mblen(type);
+}
+
+/**********************************************************************//**
+Reads to a type the stored information which determines its alphabetical
+ordering and the storage size of an SQL NULL value. This is the >= 4.1.x
+storage format. */
+UNIV_INLINE
+void
+dtype_new_read_for_order_and_null_size(
+/*===================================*/
+	dtype_t*	type,	/*!< out: type struct */
+	const byte*	buf)	/*!< in: buffer for stored type order info */
+{
+	ulint	charset_coll;
+
+#if 6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE
+#error "6 != DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE"
+#endif
+
+	type->mtype = buf[0] & 63;
+	type->prtype = buf[1];
+
+	if (buf[0] & 128) {
+		type->prtype |= DATA_BINARY_TYPE;
+	}
+
+	if (buf[4] & 128) {
+		type->prtype |= DATA_NOT_NULL;
+	}
+
+	type->len = mach_read_from_2(buf + 2);
+
+	charset_coll = mach_read_from_2(buf + 4) & 0x7fff;
+
+	if (dtype_is_string_type(type->mtype)) {
+		ut_a(charset_coll < 256);
+
+		if (charset_coll == 0) {
+			/* This insert buffer record was inserted with MySQL
+			version < 4.1.2, and the charset-collation code was not
+			explicitly stored to dtype->prtype at that time. It
+			must be the default charset-collation of this MySQL
+			installation. */
+
+			charset_coll = data_mysql_default_charset_coll;
+		}
+
+		type->prtype = dtype_form_prtype(type->prtype, charset_coll);
+	}
+	dtype_set_mblen(type);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the size of a fixed size data type, 0 if not a fixed size type.
+@return	fixed size, or 0 */
+UNIV_INLINE
+ulint
+dtype_get_fixed_size_low(
+/*=====================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a multibyte char */
+	ulint	mbmaxlen,	/*!< in: maximum length of a multibyte char */
+	ulint	comp)		/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	switch (mtype) {
+	case DATA_SYS:
+#ifdef UNIV_DEBUG
+		switch (prtype & DATA_MYSQL_TYPE_MASK) {
+		case DATA_ROW_ID:
+			ut_ad(len == DATA_ROW_ID_LEN);
+			break;
+		case DATA_TRX_ID:
+			ut_ad(len == DATA_TRX_ID_LEN);
+			break;
+		case DATA_ROLL_PTR:
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+			break;
+		default:
+			ut_ad(0);
+			return(0);
+		}
+#endif /* UNIV_DEBUG */
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+		return(len);
+	case DATA_MYSQL:
+#ifndef UNIV_HOTBACKUP
+		if (prtype & DATA_BINARY_TYPE) {
+			return(len);
+		} else if (!comp) {
+			return(len);
+		} else {
+			/* We play it safe here and ask MySQL for
+			mbminlen and mbmaxlen, and print a
+			diagnostic if they differ from the
+			arguments.  The arguments are initialized
+			together with prtype (in one of the 3
+			functions in this file), but it could be that
+			none of these functions has been called. */
+
+			ulint	i_mbminlen, i_mbmaxlen;
+
+			innobase_get_cset_width(
+				dtype_get_charset_coll(prtype),
+				&i_mbminlen, &i_mbmaxlen);
+
+			if (UNIV_UNLIKELY(mbminlen != i_mbminlen)
+			    || UNIV_UNLIKELY(mbmaxlen != i_mbmaxlen)) {
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr, "  InnoDB: "
+					"mbminlen=%lu, "
+					"mbmaxlen=%lu, "
+					"type->mbminlen=%lu, "
+					"type->mbmaxlen=%lu\n",
+					(ulong) i_mbminlen,
+					(ulong) i_mbmaxlen,
+					(ulong) mbminlen,
+					(ulong) mbmaxlen);
+			}
+			if (mbminlen == mbmaxlen) {
+				return(len);
+			}
+		}
+#else /* !UNIV_HOTBACKUP */
+		return(len);
+#endif /* !UNIV_HOTBACKUP */
+		/* fall through for variable-length charsets */
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+	case DATA_BLOB:
+		return(0);
+	default:
+		ut_error;
+	}
+
+	return(0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of a data type.
+@return	minimum size; 0 for DATA_VARCHAR, DATA_BLOB and the like */
+UNIV_INLINE
+ulint
+dtype_get_min_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	prtype,		/*!< in: precise type */
+	ulint	len,		/*!< in: length */
+	ulint	mbminlen,	/*!< in: minimum length of a multibyte char */
+	ulint	mbmaxlen)	/*!< in: maximum length of a multibyte char */
+{
+	switch (mtype) {
+	case DATA_SYS:
+#ifdef UNIV_DEBUG
+		switch (prtype & DATA_MYSQL_TYPE_MASK) {
+		case DATA_ROW_ID:
+			ut_ad(len == DATA_ROW_ID_LEN);
+			break;
+		case DATA_TRX_ID:
+			ut_ad(len == DATA_TRX_ID_LEN);
+			break;
+		case DATA_ROLL_PTR:
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+			break;
+		default:
+			ut_ad(0);
+			return(0);
+		}
+#endif /* UNIV_DEBUG */
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+		return(len);
+	case DATA_MYSQL:
+		if ((prtype & DATA_BINARY_TYPE) || mbminlen == mbmaxlen) {
+			return(len);
+		}
+		/* variable-length character set: scale len down to mbminlen */
+		ut_a(mbminlen > 0);
+		ut_a(mbmaxlen > mbminlen);
+		ut_a(len % mbmaxlen == 0);
+		return(len * mbminlen / mbmaxlen);
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+	case DATA_BLOB:
+		return(0);
+	default:
+		ut_error;
+	}
+
+	return(0);
+}
+
+/***********************************************************************//**
+Returns the maximum size of a data type. Note: types in system tables may be
+incomplete and return incorrect information.
+@return	maximum size: len, or ULINT_MAX for DATA_BLOB */
+UNIV_INLINE
+ulint
+dtype_get_max_size_low(
+/*===================*/
+	ulint	mtype,		/*!< in: main type */
+	ulint	len)		/*!< in: length */
+{
+	switch (mtype) {
+	case DATA_SYS:
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_INT:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+	case DATA_MYSQL:
+	case DATA_VARCHAR:
+	case DATA_BINARY:
+	case DATA_DECIMAL:
+	case DATA_VARMYSQL:
+		return(len);
+	case DATA_BLOB:
+		break;
+	default:
+		ut_error;
+	}
+
+	return(ULINT_MAX);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a type: the fixed
+length for fixed length types, otherwise 0.  comp is ignored if UNIV_HOTBACKUP.
+@return	SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dtype_get_sql_null_size(
+/*====================*/
+	const dtype_t*	type,	/*!< in: type */
+	ulint		comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+#ifndef UNIV_HOTBACKUP
+	return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+					type->mbminlen, type->mbmaxlen, comp));
+#else /* !UNIV_HOTBACKUP */
+	return(dtype_get_fixed_size_low(type->mtype, type->prtype, type->len,
+					0, 0, 0));
+#endif /* !UNIV_HOTBACKUP */
+}
diff --git a/storage/xtradb/include/data0types.h b/storage/xtradb/include/data0types.h
new file mode 100644
index 00000000000..04e835bc401
--- /dev/null
+++ b/storage/xtradb/include/data0types.h
@@ -0,0 +1,36 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/data0types.h
+Some type definitions
+
+Created 9/21/2000 Heikki Tuuri
+*************************************************************************/
+
+#ifndef data0types_h
+#define data0types_h
+
+/* SQL data field struct */
+typedef struct dfield_struct	dfield_t;
+
+/* SQL data tuple struct */
+typedef struct dtuple_struct	dtuple_t;
+
+#endif
+
diff --git a/storage/xtradb/include/db0err.h b/storage/xtradb/include/db0err.h
new file mode 100644
index 00000000000..c7fa6d2a444
--- /dev/null
+++ b/storage/xtradb/include/db0err.h
@@ -0,0 +1,111 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/db0err.h
+Global error codes for the database
+
+Created 5/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef db0err_h
+#define db0err_h
+
+
+enum db_err {
+	DB_SUCCESS_LOCKED_REC = 9,	/*!< like DB_SUCCESS, but a new
+					explicit record lock was created */
+	DB_SUCCESS = 10,
+
+	/* The following are error codes */
+	DB_ERROR,
+	DB_INTERRUPTED,
+	DB_OUT_OF_MEMORY,
+	DB_OUT_OF_FILE_SPACE,
+	DB_LOCK_WAIT,
+	DB_DEADLOCK,
+	DB_ROLLBACK,
+	DB_DUPLICATE_KEY,
+	DB_QUE_THR_SUSPENDED,
+	DB_MISSING_HISTORY,		/* required history data has been
+					deleted due to lack of space in
+					rollback segment */
+	DB_CLUSTER_NOT_FOUND = 30,
+	DB_TABLE_NOT_FOUND,
+	DB_MUST_GET_MORE_FILE_SPACE,	/* the database has to be stopped
+					and restarted with more file space */
+	DB_TABLE_IS_BEING_USED,
+	DB_TOO_BIG_RECORD,		/* a record in an index would not fit
+					on a compressed page, or it would
+					become bigger than 1/2 free space in
+					an uncompressed page frame */
+	DB_LOCK_WAIT_TIMEOUT,		/* lock wait lasted too long */
+	DB_NO_REFERENCED_ROW,		/* referenced key value not found
+					for a foreign key in an insert or
+					update of a row */
+	DB_ROW_IS_REFERENCED,		/* cannot delete or update a row
+					because it contains a key value
+					which is referenced */
+	DB_CANNOT_ADD_CONSTRAINT,	/* adding a foreign key constraint
+					to a table failed */
+	DB_CORRUPTION,			/* data structure corruption noticed */
+	DB_COL_APPEARS_TWICE_IN_INDEX,	/* InnoDB cannot handle an index
+					where same column appears twice */
+	DB_CANNOT_DROP_CONSTRAINT,	/* dropping a foreign key constraint
+					from a table failed */
+	DB_NO_SAVEPOINT,		/* no savepoint exists with the given
+					name */
+	DB_TABLESPACE_ALREADY_EXISTS,	/* we cannot create a new single-table
+					tablespace because a file of the same
+					name already exists */
+	DB_TABLESPACE_DELETED,		/* tablespace does not exist or is
+					being dropped right now */
+	DB_LOCK_TABLE_FULL,		/* lock structs have exhausted the
+					buffer pool (for big transactions,
+					InnoDB stores the lock structs in the
+					buffer pool) */
+	DB_FOREIGN_DUPLICATE_KEY,	/* foreign key constraints
+					activated by the operation would
+					lead to a duplicate key in some
+					table */
+	DB_TOO_MANY_CONCURRENT_TRXS,	/* when InnoDB runs out of the
+					preconfigured undo slots, this can
+					only happen when there are too many
+					concurrent transactions */
+	DB_UNSUPPORTED,			/* when InnoDB sees any artefact or
+					a feature that it can't recognize or
+					work with e.g., FT indexes created by
+					a later version of the engine. */
+
+	DB_PRIMARY_KEY_IS_NULL,		/* a column in the PRIMARY KEY
+					was found to be NULL */
+	DB_FOREIGN_EXCEED_MAX_CASCADE,	/* Foreign key constraint related
+					cascading delete/update exceeds
+					maximum allowed depth */
+
+	/* The following are partial failure codes */
+	DB_FAIL = 1000,
+	DB_OVERFLOW,
+	DB_UNDERFLOW,
+	DB_STRONG_FAIL,
+	DB_ZIP_OVERFLOW,
+	DB_RECORD_NOT_FOUND = 1500,
+	DB_END_OF_INDEX
+};
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
new file mode 100644
index 00000000000..9239e031a7f
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.h
@@ -0,0 +1,161 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.h
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0boot_h
+#define dict0boot_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "ut0byte.h"
+#include "buf0buf.h"
+#include "fsp0fsp.h"
+#include "dict0dict.h"
+
+typedef	byte	dict_hdr_t;
+
+/**********************************************************************//**
+Gets a pointer to the dictionary header and x-latches its page.
+@return	pointer to the dictionary header, page x-latched */
+UNIV_INTERN
+dict_hdr_t*
+dict_hdr_get(
+/*=========*/
+	mtr_t*	mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Returns a new table, index, or space id. */
+UNIV_INTERN
+void
+dict_hdr_get_new_id(
+/*================*/
+	dulint*	table_id,	/*!< out: table id (not assigned if NULL) */
+	dulint*	index_id,	/*!< out: index id (not assigned if NULL) */
+	ulint*	space_id);	/*!< out: space id (not assigned if NULL) */
+/**********************************************************************//**
+Returns a new row id.
+@return	the new id */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void);
+/*=========================*/
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return	row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+	byte*	field);	/*!< in: record field */
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+	byte*	field,	/*!< out: record field */
+	dulint	row_id);/*!< in: row id */
+/*****************************************************************//**
+Initializes the data dictionary memory structures when the database is
+started. This function is also called when the data dictionary is created. */
+UNIV_INTERN
+void
+dict_boot(void);
+/*===========*/
+/*****************************************************************//**
+Creates and initializes the data dictionary at the database creation. */
+UNIV_INTERN
+void
+dict_create(void);
+/*=============*/
+
+
+/* Space id and page no where the dictionary header resides */
+#define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
+#define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
+
+/* The ids for the basic system tables and their indexes */
+#define DICT_TABLES_ID		ut_dulint_create(0, 1)
+#define DICT_COLUMNS_ID		ut_dulint_create(0, 2)
+#define DICT_INDEXES_ID		ut_dulint_create(0, 3)
+#define DICT_FIELDS_ID		ut_dulint_create(0, 4)
+#define DICT_STATS_ID		ut_dulint_create(0, 6)
+/* The following is a secondary index on SYS_TABLES */
+#define DICT_TABLE_IDS_ID	ut_dulint_create(0, 5)
+
+#define	DICT_HDR_FIRST_ID	10	/* the ids for tables etc. start
+					from this number, except for basic
+					system tables and their above defined
+					indexes; ibuf tables and indexes are
+					assigned as the id the number
+					DICT_IBUF_ID_MIN plus the space id */
+#define DICT_IBUF_ID_MIN	ut_dulint_create(0xFFFFFFFFUL, 0)
+
+/* The offset of the dictionary header on the page */
+#define	DICT_HDR		FSEG_PAGE_DATA
+
+/*-------------------------------------------------------------*/
+/* Dictionary header offsets */
+#define DICT_HDR_ROW_ID		0	/* The latest assigned row id */
+#define	DICT_HDR_TABLE_ID	8	/* The latest assigned table id */
+#define	DICT_HDR_INDEX_ID	16	/* The latest assigned index id */
+#define DICT_HDR_MAX_SPACE_ID	24	/* The latest assigned space id, or 0 */
+#define	DICT_HDR_MIX_ID_LOW	28	/* Obsolete, always DICT_HDR_FIRST_ID */
+#define	DICT_HDR_TABLES		32	/* Root of the table index tree */
+#define	DICT_HDR_TABLE_IDS	36	/* Root of the table id index tree */
+#define	DICT_HDR_COLUMNS	40	/* Root of the column index tree */
+#define	DICT_HDR_INDEXES	44	/* Root of the index index tree */
+#define	DICT_HDR_FIELDS		48	/* Root of the index field
+					index tree */
+#define	DICT_HDR_STATS		52	/* Root of the stats tree */
+
+#define DICT_HDR_FSEG_HEADER	56	/* Segment header for the tablespace
+					segment into which the dictionary
+					header is created */
+
+#define	DICT_HDR_XTRADB_MARK	256	/* Flag to distinguish expansion of XtraDB */
+/*-------------------------------------------------------------*/
+
+/* The field number of the page number field in the sys_indexes table
+clustered index */
+#define DICT_SYS_INDEXES_PAGE_NO_FIELD	 8
+#define DICT_SYS_INDEXES_SPACE_NO_FIELD	 7
+#define DICT_SYS_INDEXES_TYPE_FIELD	 6
+#define DICT_SYS_INDEXES_NAME_FIELD	 4
+
+#define DICT_SYS_STATS_DIFF_VALS_FIELD	 4
+
+/* When a row id which is zero modulo this number (which must be a power of
+two) is assigned, the field DICT_HDR_ROW_ID on the dictionary header page is
+updated */
+#define DICT_HDR_ROW_ID_WRITE_MARGIN	256
+
+#define DICT_HDR_XTRADB_FLAG		ut_dulint_create(0x58545241UL,0x44425F31UL)	/* "XTRADB_1" */
+
+#ifndef UNIV_NONINL
+#include "dict0boot.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0boot.ic b/storage/xtradb/include/dict0boot.ic
new file mode 100644
index 00000000000..d5f372e38c4
--- /dev/null
+++ b/storage/xtradb/include/dict0boot.ic
@@ -0,0 +1,93 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0boot.ic
+Data dictionary creation and booting
+
+Created 4/18/1996 Heikki Tuuri
+*******************************************************/
+
+/**********************************************************************//**
+Writes the current value of the row id counter to the dictionary header file
+page. */
+UNIV_INTERN
+void
+dict_hdr_flush_row_id(void);
+/*=======================*/
+
+
+/**********************************************************************//**
+Returns a new row id, incrementing dict_sys->row_id under dict_sys->mutex.
+@return	the new id (the counter value before the increment) */
+UNIV_INLINE
+dulint
+dict_sys_get_new_row_id(void)
+/*=========================*/
+{
+	dulint	id;
+
+	mutex_enter(&(dict_sys->mutex));
+
+	id = dict_sys->row_id;
+
+	if (0 == (ut_dulint_get_low(id) % DICT_HDR_ROW_ID_WRITE_MARGIN)) {
+
+		dict_hdr_flush_row_id();
+	}
+
+	UT_DULINT_INC(dict_sys->row_id);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	return(id);
+}
+
+/**********************************************************************//**
+Reads a row id from a record or other 6-byte stored form.
+@return	row id */
+UNIV_INLINE
+dulint
+dict_sys_read_row_id(
+/*=================*/
+	byte*	field)	/*!< in: record field */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+	return(mach_read_from_6(field));
+}
+
+/**********************************************************************//**
+Writes a row id to a record or other 6-byte stored form. */
+UNIV_INLINE
+void
+dict_sys_write_row_id(
+/*==================*/
+	byte*	field,	/*!< out: record field */
+	dulint	row_id)	/*!< in: row id */
+{
+#if DATA_ROW_ID_LEN != 6
+# error "DATA_ROW_ID_LEN != 6"
+#endif
+
+	mach_write_to_6(field, row_id);
+}
+
+
diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h
new file mode 100644
index 00000000000..0249091a195
--- /dev/null
+++ b/storage/xtradb/include/dict0crea.h
@@ -0,0 +1,215 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.h
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0crea_h
+#define dict0crea_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0dict.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/*********************************************************************//**
+Creates a table create graph.
+@return	own: table create node */
+UNIV_INTERN
+tab_node_t*
+tab_create_graph_create(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table to create, built as a memory data
+				structure */
+	mem_heap_t*	heap);	/*!< in: heap where created */
+/*********************************************************************//**
+Creates an index create graph.
+@return	own: index create node */
+UNIV_INTERN
+ind_node_t*
+ind_create_graph_create(
+/*====================*/
+	dict_index_t*	index,	/*!< in: index to create, built as a memory data
+				structure */
+	mem_heap_t*	heap);	/*!< in: heap where created */
+/*********************************************************************//**
+NOTE(review): undocumented; presumably builds the graph for index statistics inserts. */
+UNIV_INTERN
+ind_node_t*
+ind_insert_stats_graph_create(
+/*==========================*/
+	dict_index_t*	index,
+	mem_heap_t*	heap);
+/***********************************************************//**
+Creates a table. This is a high-level function used in SQL execution graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_table_step(
+/*===================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************//**
+NOTE(review): undocumented; presumably the execution step that inserts statistics rows. */
+UNIV_INTERN
+que_thr_t*
+dict_insert_stats_step(
+/*===================*/
+	que_thr_t*	thr);
+/***********************************************************//**
+Creates an index. This is a high-level function used in SQL execution
+graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+dict_create_index_step(
+/*===================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/*******************************************************************//**
+Truncates the index tree associated with a row in SYS_INDEXES table.
+@return	new root page number, or FIL_NULL on failure */
+UNIV_INTERN
+ulint
+dict_truncate_index_tree(
+/*=====================*/
+	dict_table_t*	table,	/*!< in: the table the index belongs to */
+	ulint		space,	/*!< in: 0=truncate,
+				nonzero=create the index tree in the
+				given tablespace */
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor pointing to
+				record in the clustered index of
+				SYS_INDEXES table. The cursor may be
+				repositioned in this call. */
+	mtr_t*		mtr);	/*!< in: mtr having the latch
+				on the record page. The mtr may be
+				committed and restarted in this call. */
+/*******************************************************************//**
+Drops the index tree associated with a row in SYS_INDEXES table. */
+UNIV_INTERN
+void
+dict_drop_index_tree(
+/*=================*/
+	rec_t*	rec,	/*!< in/out: record in the clustered index
+			of SYS_INDEXES table */
+	mtr_t*	mtr);	/*!< in: mtr having the latch on the record page */
+/****************************************************************//**
+Creates the foreign key constraints system tables inside InnoDB
+at database creation or database start if they are not found or are
+not of the right form.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_create_or_check_foreign_constraint_tables(void);
+/*================================================*/
+/********************************************************************//**
+Adds foreign key definitions to data dictionary tables in the database. We
+look at table->foreign_list, and also generate names to constraints that were
+not named by the user. A generated constraint has a name of the format
+databasename/tablename_ibfk_NUMBER, where the numbers start from 1, and are
+given locally for this table, that is, the number is not global, as in the
+old format constraints < 4.0.18 it used to be.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_add_foreigns_to_dictionary(
+/*===================================*/
+	ulint		start_id,/*!< in: if we are actually doing ALTER TABLE
+				ADD CONSTRAINT, we want to generate constraint
+				numbers which are bigger than in the table so
+				far; we number the constraints from
+				start_id + 1 up; start_id should be set to 0 if
+				we are creating a new table, or if the table
+				so far has no constraints for which the name
+				was generated here */
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx);	/*!< in: transaction */
+
+/** Table create node structure; 'state' holds one of the TABLE_* states */
+
+struct tab_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_TABLE_CREATE */
+	dict_table_t*	table;	/*!< table to create, built as a memory data
+				structure with dict_mem_... functions */
+	ins_node_t*	tab_def; /*!< child node which does the insert of
+				the table definition; the row to be inserted
+				is built by the parent node */
+	ins_node_t*	col_def; /*!< child node which does the inserts of
+				the column definitions; the row to be inserted
+				is built by the parent node */
+	commit_node_t*	commit_node;
+				/*!< child node which performs a commit after
+				a successful table creation */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	ulint		col_no;	/*!< next column definition to insert */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage */
+};
+
+/* Table create node states */
+#define	TABLE_BUILD_TABLE_DEF	1
+#define	TABLE_BUILD_COL_DEF	2
+#define	TABLE_COMMIT_WORK	3
+#define	TABLE_ADD_TO_CACHE	4
+#define	TABLE_COMPLETED		5
+
+/** Index create node structure; 'state' holds one of the INDEX_* states */
+
+struct ind_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_INDEX_CREATE */
+	dict_index_t*	index;	/*!< index to create, built as a memory data
+				structure with dict_mem_... functions */
+	ins_node_t*	ind_def; /*!< child node which does the insert of
+				the index definition; the row to be inserted
+				is built by the parent node */
+	ins_node_t*	field_def; /*!< child node which does the inserts of
+				the field definitions; the row to be inserted
+				is built by the parent node */
+	ins_node_t*	stats_def; /* NOTE(review): undocumented; presumably child node for the statistics inserts -- confirm */
+	commit_node_t*	commit_node;
+				/*!< child node which performs a commit after
+				a successful index creation */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	ulint		page_no;/*!< root page number of the index */
+	dict_table_t*	table;	/*!< table which owns the index */
+	dtuple_t*	ind_row;/*!< index definition row built */
+	ulint		field_no;/*!< next field definition to insert */
+	ulint		stats_no; /* NOTE(review): undocumented; presumably next statistics row to insert -- confirm */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage */
+};
+
+/* Index create node states */
+#define	INDEX_BUILD_INDEX_DEF	1
+#define	INDEX_BUILD_FIELD_DEF	2
+#define	INDEX_CREATE_INDEX_TREE	3
+#define	INDEX_COMMIT_WORK	4
+#define	INDEX_ADD_TO_CACHE	5
+#define	INDEX_BUILD_STATS_COLS	6
+
+#ifndef UNIV_NONINL
+#include "dict0crea.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0crea.ic b/storage/xtradb/include/dict0crea.ic
new file mode 100644
index 00000000000..c5365ce7489
--- /dev/null
+++ b/storage/xtradb/include/dict0crea.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0crea.ic
+Database object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
new file mode 100644
index 00000000000..d18b3ecb1b0
--- /dev/null
+++ b/storage/xtradb/include/dict0dict.h
@@ -0,0 +1,1217 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0dict.h
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0dict_h
+#define dict0dict_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "dict0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "hash0hash.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+dict_casedn_str(
+/*============*/
+	char*	a);	/*!< in/out: string to put in lower case */
+/********************************************************************//**
+Get the database name length in a table name.
+@return	database name length */
+UNIV_INTERN
+ulint
+dict_get_db_name_len(
+/*=================*/
+	const char*	name);	/*!< in: table name in the form
+				dbname '/' tablename */
+/********************************************************************//**
+Return the end of table name where we have removed dbname and '/'.
+@return	table name */
+
+const char*
+dict_remove_db_name(
+/*================*/
+	const char*	name);	/*!< in: table name in the form
+				dbname '/' tablename */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get_on_id(
+/*=================*/
+        dulint  table_id,       /*!< in: table id */
+        trx_t*  trx);           /*!< in: transaction handle */
+/********************************************************************//**
+Decrements the count of open MySQL handles to a table. */
+UNIV_INTERN
+void
+dict_table_decrement_handle_count(
+/*==============================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
+/**********************************************************************//**
+Inits the data dictionary module. */
+UNIV_INTERN
+void
+dict_init(void);
+/*===========*/
+/********************************************************************//**
+Gets the space id of every table of the data dictionary and makes a linear
+list and a hash table of them to the data dictionary cache. This function
+can be called at database startup if we did not need to do a crash recovery.
+In crash recovery we must scan the space id's from the .ibd files in MySQL
+database directories. */
+UNIV_INTERN
+void
+dict_load_space_id_list(void);
+/*=========================*/
+/*********************************************************************//**
+Gets the column data type. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+	const dict_col_t*	col,	/*!< in: column */
+	dtype_t*		type);	/*!< out: data type */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match.
+@return	TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	const dtype_t*		type);	/*!< in: data type */
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Returns the minimum size of the column.
+@return	minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+	const dict_col_t*	col);	/*!< in: column */
+/***********************************************************************//**
+Returns the maximum size of the column.
+@return	maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+	const dict_col_t*	col);	/*!< in: column */
+/***********************************************************************//**
+Returns the size of a fixed size column, 0 if not a fixed size column.
+@return	fixed size, or 0 */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column.
+For fixed length types it is the fixed length of the type, otherwise 0.
+@return	SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp);	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+
+/*********************************************************************//**
+Gets the column number.
+@return	col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+	const dict_col_t*	col);	/*!< in: column */
+/*********************************************************************//**
+Gets the column position in the clustered index. */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+	const dict_col_t*	col,		/*!< in: table column */
+	const dict_index_t*	clust_index);	/*!< in: clustered index */
+/****************************************************************//**
+If the given column name is reserved for InnoDB system columns, return
+TRUE.
+@return	TRUE if name is reserved */
+UNIV_INTERN
+ibool
+dict_col_name_is_reserved(
+/*======================*/
+	const char*	name);	/*!< in: column name */
+/********************************************************************//**
+Acquire the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_lock(
+/*====================*/
+	dict_table_t*	table);	/*!< in/out: table */
+/********************************************************************//**
+Unconditionally set the autoinc counter. */
+UNIV_INTERN
+void
+dict_table_autoinc_initialize(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ib_uint64_t	value);	/*!< in: next value to assign to a row */
+/********************************************************************//**
+Reads the next autoinc value (== autoinc counter value), 0 if not yet
+initialized.
+@return	value for a new row, or 0 */
+UNIV_INTERN
+ib_uint64_t
+dict_table_autoinc_read(
+/*====================*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Updates the autoinc counter if the value supplied is greater than the
+current value. */
+UNIV_INTERN
+void
+dict_table_autoinc_update_if_greater(
+/*=================================*/
+
+	dict_table_t*	table,	/*!< in/out: table */
+	ib_uint64_t	value);	/*!< in: value which was assigned to a row */
+/********************************************************************//**
+Release the autoinc lock. */
+UNIV_INTERN
+void
+dict_table_autoinc_unlock(
+/*======================*/
+	dict_table_t*	table);	/*!< in/out: table */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Adds system columns to a table object. */
+UNIV_INTERN
+void
+dict_table_add_system_columns(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	mem_heap_t*	heap);	/*!< in: temporary heap */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Adds a table object to the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_add_to_cache(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap);	/*!< in: temporary heap */
+/**********************************************************************//**
+Removes a table object from the dictionary cache. */
+UNIV_INTERN
+void
+dict_table_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table);	/*!< in, own: table */
+/**********************************************************************//**
+Renames a table object.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+dict_table_rename_in_cache(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	const char*	new_name,	/*!< in: new name */
+	ibool		rename_also_foreigns);/*!< in: in ALTER TABLE we want
+					to preserve the original table name
+					in constraints which reference it */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index);	/*!< in, own: index */
+/**********************************************************************//**
+Change the id of a table object in the dictionary cache. This is used in
+DISCARD TABLESPACE. */
+UNIV_INTERN
+void
+dict_table_change_id_in_cache(
+/*==========================*/
+	dict_table_t*	table,	/*!< in/out: table object already in cache */
+	dulint		new_id);/*!< in: new id to set */
+/**********************************************************************//**
+Adds a foreign key constraint object to the dictionary cache. May free
+the object if an object with the same identifier is already in the cache.
+At least one of foreign table or referenced table must already be in
+the dictionary cache!
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_foreign_add_to_cache(
+/*======================*/
+	dict_foreign_t*	foreign,	/*!< in, own: foreign key constraint */
+	ibool		check_charsets);/*!< in: TRUE=check charset
+					compatibility */
+/*********************************************************************//**
+Check if the index is referenced by a foreign key, if TRUE return the
+matching instance, NULL otherwise.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_referenced_constraint(
+/*=================================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index);	/*!< in: InnoDB index */
+/*********************************************************************//**
+Checks if a table is referenced by foreign keys.
+@return	TRUE if table is referenced by a foreign key */
+UNIV_INTERN
+ibool
+dict_table_is_referenced_by_foreign_key(
+/*====================================*/
+	const dict_table_t*	table);	/*!< in: InnoDB table */
+/**********************************************************************//**
+Replace the index in the foreign key list that matches this index's
+definition with an equivalent index. */
+UNIV_INTERN
+void
+dict_table_replace_index_in_foreign_list(
+/*=====================================*/
+	dict_table_t*	table,  /*!< in/out: table */
+	dict_index_t*	index);	/*!< in: index to be replaced */
+/*********************************************************************//**
+Checks if an index is defined for a foreign key constraint. Index is a part
+of a foreign key constraint if the index is referenced by foreign key
+or index is a foreign key index.
+@return pointer to foreign key struct if index is defined for foreign
+key, otherwise NULL */
+UNIV_INTERN
+dict_foreign_t*
+dict_table_get_foreign_constraint(
+/*==============================*/
+	dict_table_t*	table,	/*!< in: InnoDB table */
+	dict_index_t*	index);	/*!< in: InnoDB index */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+dict_create_foreign_constraints(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+					FOREIGN KEY (a, b) REFERENCES
+					table2(c, d), table2 can be written
+					also with the database
+					name before it: test.table2; the
+					default database is the database of
+					parameter name */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks);	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+/**********************************************************************//**
+Parses the CONSTRAINT id's to be dropped in an ALTER TABLE statement.
+@return DB_SUCCESS or DB_CANNOT_DROP_CONSTRAINT if syntax error or the
+constraint id does not match */
+UNIV_INTERN
+ulint
+dict_foreign_parse_drop_constraints(
+/*================================*/
+	mem_heap_t*	heap,			/*!< in: heap from which we can
+						allocate memory */
+	trx_t*		trx,			/*!< in: transaction */
+	dict_table_t*	table,			/*!< in: table */
+	ulint*		n,			/*!< out: number of constraints
+						to drop */
+	const char***	constraints_to_drop);	/*!< out: id's of the
+						constraints to drop */
+/**********************************************************************//**
+Returns a table object and optionally increment its MySQL open handle count.
+NOTE! This is a high-level function to be used mainly from outside the
+'dict' directory. Inside this directory dict_table_get_low is usually the
+appropriate function.
+@return	table, NULL if does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_table_get(
+/*===========*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		inc_mysql_count);
+					/*!< in: whether to increment the open
+					handle count on the table */
+/**********************************************************************//**
+Returns an index object, based on table and index id, and memoryfixes it.
+@return	index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_on_id_low(
+/*=====================*/
+	dict_table_t*	table,		/*!< in: table */
+	dulint		index_id);	/*!< in: index id */
+/**********************************************************************//**
+Checks if a table is in the dictionary cache.
+@return	table, NULL if not found */
+
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name);	/*!< in: table name */
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name);	/*!< in: table name */
+/**********************************************************************//**
+Returns a table object based on table id.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+	dulint	table_id);	/*!< in: table id */
+/**********************************************************************//**
+Find an index that is equivalent to the one passed in and is not marked
+for deletion.
+@return	index equivalent to foreign->foreign_index, or NULL */
+UNIV_INTERN
+dict_index_t*
+dict_foreign_find_equiv_index(
+/*==========================*/
+	dict_foreign_t*	foreign);/*!< in: foreign key */
+/**********************************************************************//**
+Returns an index object by matching on the name and column names and
+if more than one index matches return the index with the max id
+@return	matching index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_by_max_id(
+/*===========================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name,	/*!< in: the index name to find */
+	const char**	columns,/*!< in: array of column names */
+	ulint		n_cols);/*!< in: number of columns */
+/**********************************************************************//**
+Returns a column's name.
+@return column name. NOTE: not guaranteed to stay valid if table is
+modified in any way (columns added, etc.). */
+UNIV_INTERN
+const char*
+dict_table_get_col_name(
+/*====================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			col_nr);/*!< in: column number */
+
+/**********************************************************************//**
+Prints a table definition. */
+UNIV_INTERN
+void
+dict_table_print(
+/*=============*/
+	dict_table_t*	table);	/*!< in: table */
+/**********************************************************************//**
+Prints the data of a table. */
+UNIV_INTERN
+void
+dict_table_print_low(
+/*=================*/
+	dict_table_t*	table);	/*!< in: table */
+/**********************************************************************//**
+Prints the data of a table when we know the table name. */
+UNIV_INTERN
+void
+dict_table_print_by_name(
+/*=====================*/
+	const char*	name);	/*!< in: table name */
+/**********************************************************************//**
+Outputs info on foreign keys of a table. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_keys(
+/*============================*/
+	ibool		create_table_format, /*!< in: if TRUE then print in
+				a format suitable to be inserted into
+				a CREATE TABLE, otherwise in the format
+				of SHOW TABLE STATUS */
+	FILE*		file,	/*!< in: file where to print */
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	table);	/*!< in: table */
+/**********************************************************************//**
+Outputs info on a foreign key of a table in a format suitable for
+CREATE TABLE. */
+UNIV_INTERN
+void
+dict_print_info_on_foreign_key_in_create_format(
+/*============================================*/
+	FILE*		file,		/*!< in: file where to print */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	ibool		add_newline);	/*!< in: whether to add a newline */
+/********************************************************************//**
+Displays the names of the index and the table. */
+UNIV_INTERN
+void
+dict_index_name_print(
+/*==================*/
+	FILE*			file,	/*!< in: output stream */
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index);	/*!< in: index to print */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index).
+@return	index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Gets the next index on the table.
+@return	index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+	const dict_index_t*	index);	/*!< in: index */
+#else /* UNIV_DEBUG */
+# define dict_table_get_first_index(table) UT_LIST_GET_FIRST((table)->indexes)
+# define dict_table_get_next_index(index) UT_LIST_GET_NEXT(indexes, index)
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the index is the clustered index.
+@return	nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((pure));
+/********************************************************************//**
+Check whether the index is unique.
+@return	nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((pure));
+/********************************************************************//**
+Check whether the index is the insert buffer tree.
+@return	nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((pure));
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return	nonzero for secondary index or insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+	__attribute__((pure));
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache.
+@return	number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+@return	number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache.
+@return	number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+	const dict_table_t*	table);	/*!< in: table */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table.
+@return	pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			pos);	/*!< in: position of column */
+/********************************************************************//**
+Gets the given system column of a table.
+@return	pointer to column object */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			sys);	/*!< in: DATA_ROW_ID, ... */
+#else /* UNIV_DEBUG */
+#define dict_table_get_nth_col(table, pos) \
+((table)->cols + (pos))
+#define dict_table_get_sys_col(table, sys) \
+((table)->cols + (table)->n_cols + (sys) - DATA_N_SYS_COLS)
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets the given system column number of a table.
+@return	column number */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			sys);	/*!< in: DATA_ROW_ID, ... */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Returns the minimum data size of an index record.
+@return	minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+	const dict_index_t*	index);	/*!< in: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return	TRUE if table uses the compact page format */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Determine the file format of a table.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+	const dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Set the file format of a table. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ulint		format);/*!< in: file format version */
+/********************************************************************//**
+Extract the compressed page size from table flags.
+@return	compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+	__attribute__((const));
+/********************************************************************//**
+Check whether the table uses the compressed compact page format.
+@return	compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+	const dict_table_t*	table);	/*!< in: table */
+/*********************************************************************//**
+Obtain exclusive locks on all index trees of the table. This is to prevent
+accessing index trees while InnoDB is updating internal metadata for
+operations such as truncate tables. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+	dict_table_t*	table);	/*!< in: table */
+/*********************************************************************//**
+Releases the exclusive locks on all index trees of the table. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+	dict_table_t*	table);	/*!< in: table */
+/********************************************************************//**
+Checks if a column is in the ordering columns of the clustered index of a
+table. Column prefixes are treated like whole columns.
+@return	TRUE if the column, or its prefix, is in the clustered key */
+UNIV_INTERN
+ibool
+dict_table_col_in_clustered_key(
+/*============================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n);	/*!< in: column number */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of columns contained in table to tuple and sets all
+fields of the tuple to the SQL NULL value.  This function should
+be called right after dtuple_create(). */
+UNIV_INTERN
+void
+dict_table_copy_types(
+/*==================*/
+	dtuple_t*		tuple,	/*!< in/out: data tuple */
+	const dict_table_t*	table);	/*!< in: table */
+/**********************************************************************//**
+Looks for an index with the given id. NOTE that we do not reserve
+the dictionary mutex: this function is for emergency purposes like
+printing info of a corrupt database page!
+@return	index or NULL if not found from cache */
+UNIV_INTERN
+dict_index_t*
+dict_index_find_on_id_low(
+/*======================*/
+	dulint	id);	/*!< in: index id */
+/**********************************************************************//**
+Adds an index to the dictionary cache.
+@return	DB_SUCCESS, DB_TOO_BIG_RECORD, or DB_CORRUPTION */
+UNIV_INTERN
+ulint
+dict_index_add_to_cache(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table on which the index is */
+	dict_index_t*	index,	/*!< in, own: index; NOTE! The index memory
+				object is freed in this function! */
+	ulint		page_no,/*!< in: root page number of the index */
+	ibool		strict);/*!< in: TRUE=refuse to create the index
+				if records could be too big to fit in
+				a B-tree page */
+/**********************************************************************//**
+Removes an index from the dictionary cache. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index);	/*!< in, own: index */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+	const dict_index_t*	index);	/*!< in: an internal
+					representation of index (in
+					the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+	const dict_index_t*	index);	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+which uniquely determine the position of an index entry in the index, if
+we also take multiversioning into account.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+	const dict_index_t*	index);	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation we add the row id to the ordering fields to make all indexes
+unique, but this function returns the number of fields the user defined
+in the index as ordering fields.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+	const dict_index_t*	index);	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index.
+@return	pointer to field object */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos);	/*!< in: position of field */
+#else /* UNIV_DEBUG */
+# define dict_index_get_nth_field(index, pos) ((index)->fields + (pos))
+#endif /* UNIV_DEBUG */
+/********************************************************************//**
+Gets pointer to the nth column in an index.
+@return	column */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos);	/*!< in: position of the field */
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return	column number */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos);	/*!< in: position of the field */
+/********************************************************************//**
+Looks for column n in an index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n);	/*!< in: column number */
+/********************************************************************//**
+Returns TRUE if the index contains a column or a prefix of that column.
+@return	TRUE if contains the column or its prefix */
+UNIV_INTERN
+ibool
+dict_index_contains_col_or_prefix(
+/*==============================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			n);	/*!< in: column number */
+/********************************************************************//**
+Looks for a matching field in an index. The column has to be the same. The
+column in index must be complete, or must contain a prefix longer than the
+column in index2. That is, we must be able to construct the prefix in index2
+from the prefix in index.
+@return position in internal representation of the index;
+ULINT_UNDEFINED if not contained */
+UNIV_INTERN
+ulint
+dict_index_get_nth_field_pos(
+/*=========================*/
+	const dict_index_t*	index,	/*!< in: index from which to search */
+	const dict_index_t*	index2,	/*!< in: index */
+	ulint			n);	/*!< in: field number in index2 */
+/********************************************************************//**
+Looks for column n position in the clustered index.
+@return	position in internal representation of the clustered index */
+UNIV_INTERN
+ulint
+dict_table_get_nth_col_pos(
+/*=======================*/
+	const dict_table_t*	table,	/*!< in: table */
+	ulint			n);	/*!< in: column number */
+/********************************************************************//**
+Returns the position of a system column in an index.
+@return	position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			type);	/*!< in: DATA_ROW_ID, ... */
+/*******************************************************************//**
+Adds a column to index. */
+UNIV_INTERN
+void
+dict_index_add_col(
+/*===============*/
+	dict_index_t*		index,		/*!< in/out: index */
+	const dict_table_t*	table,		/*!< in: table */
+	dict_col_t*		col,		/*!< in: column */
+	ulint			prefix_len);	/*!< in: column prefix length */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Copies types of fields contained in index to tuple. */
+UNIV_INTERN
+void
+dict_index_copy_types(
+/*==================*/
+	dtuple_t*		tuple,		/*!< in/out: data tuple */
+	const dict_index_t*	index,		/*!< in: index */
+	ulint			n_fields);	/*!< in: number of
+						field types to copy */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Gets the field column.
+@return	field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+	const dict_field_t*	field);	/*!< in: index field */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+Assumes that dict_sys->mutex is already being held.
+@return	index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache_low(
+/*===========================*/
+	dulint	index_id);	/*!< in: index id */
+#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
+/**********************************************************************//**
+Returns an index object if it is found in the dictionary cache.
+@return	index, NULL if not found */
+UNIV_INTERN
+dict_index_t*
+dict_index_get_if_in_cache(
+/*=======================*/
+	dulint	index_id);	/*!< in: index id */
+#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that a tuple has n_fields_cmp value in a sensible range, so that
+no comparison can occur with the page number field in a node pointer.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+dict_index_check_search_tuple(
+/*==========================*/
+	const dict_index_t*	index,	/*!< in: index tree */
+	const dtuple_t*		tuple);	/*!< in: tuple used in a search */
+/**********************************************************************//**
+Check for duplicate index entries in a table [using the index name] */
+UNIV_INTERN
+void
+dict_table_check_for_dup_indexes(
+/*=============================*/
+	const dict_table_t*	table,	/*!< in: Check for dup indexes
+					in this table */
+	ibool			tmp_ok);/*!< in: TRUE=allow temporary
+					index names */
+#endif /* UNIV_DEBUG */
+/**********************************************************************//**
+Builds a node pointer out of a physical record and a page number.
+@return	own: node pointer */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_node_ptr(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record for which to build node
+					pointer */
+	ulint			page_no,/*!< in: page number to put in node
+					pointer */
+	mem_heap_t*		heap,	/*!< in: memory heap where pointer
+					created */
+	ulint			level);	/*!< in: level of rec in tree:
+					0 means leaf level */
+/**********************************************************************//**
+Copies an initial segment of a physical record, long enough to specify an
+index entry uniquely.
+@return	pointer to the prefix record */
+UNIV_INTERN
+rec_t*
+dict_index_copy_rec_order_prefix(
+/*=============================*/
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record for which to
+					copy prefix */
+	ulint*			n_fields,/*!< out: number of fields copied */
+	byte**			buf,	/*!< in/out: memory buffer for the
+					copied prefix, or NULL */
+	ulint*			buf_size);/*!< in/out: buffer size */
+/**********************************************************************//**
+Builds a typed data tuple out of a physical record.
+@return	own: data tuple */
+UNIV_INTERN
+dtuple_t*
+dict_index_build_data_tuple(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	rec_t*		rec,	/*!< in: record for which to build data tuple */
+	ulint		n_fields,/*!< in: number of data fields */
+	mem_heap_t*	heap);	/*!< in: memory heap where tuple created */
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return	space id */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+	const dict_index_t*	index);	/*!< in: index */
+/*********************************************************************//**
+Sets the space id of the root of the index tree. */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+	dict_index_t*	index,	/*!< in/out: index */
+	ulint		space);	/*!< in: space id */
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return	page number */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+	const dict_index_t*	tree);	/*!< in: index */
+/*********************************************************************//**
+Sets the page number of the root of index tree. */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+	dict_index_t*	index,	/*!< in/out: index */
+	ulint		page);	/*!< in: page number */
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return	read-write lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return	number of free bytes on page, reserved for updates */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void);
+/*==============================*/
+/*********************************************************************//**
+Calculates the minimum record length in an index. */
+UNIV_INTERN
+ulint
+dict_index_calc_min_rec_len(
+/*========================*/
+	const dict_index_t*	index);	/*!< in: index */
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics_low(
+/*=======================*/
+	dict_table_t*	table,		/*!< in/out: table */
+	ibool		has_dict_mutex,	/*!< in: TRUE if the caller has the
+					dictionary mutex */
+	ibool		sync);		/*!< in: NOTE(review): undocumented
+					flag; presumably selects whether the
+					estimates are synchronized with the
+					SYS_STATS table -- confirm against
+					the definition */
+/*********************************************************************//**
+Calculates new estimates for table and index statistics. The statistics
+are used in query optimization. */
+UNIV_INTERN
+void
+dict_update_statistics(
+/*===================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ibool		sync);	/*!< in: NOTE(review): undocumented flag;
+				presumably selects whether the estimates are
+				synchronized with the SYS_STATS table --
+				confirm against the definition */
+/********************************************************************//**
+Reserves the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_enter_for_mysql(void);
+/*============================*/
+/********************************************************************//**
+Releases the dictionary system mutex for MySQL. */
+UNIV_INTERN
+void
+dict_mutex_exit_for_mysql(void);
+/*===========================*/
+/**********************************************************************//**
+Lock the appropriate mutex to protect index->stat_n_diff_key_vals[].
+index->id is used to pick the right mutex and it should not change
+before dict_index_stat_mutex_exit() is called on this index. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_enter(
+/*========================*/
+	const dict_index_t*	index);	/*!< in: index */
+/**********************************************************************//**
+Unlock the appropriate mutex that protects index->stat_n_diff_key_vals[]. */
+UNIV_INTERN
+void
+dict_index_stat_mutex_exit(
+/*=======================*/
+	const dict_index_t*	index);	/*!< in: index */
+/********************************************************************//**
+Checks if the database name in two table names is the same.
+@return	TRUE if same db name */
+UNIV_INTERN
+ibool
+dict_tables_have_same_db(
+/*=====================*/
+	const char*	name1,	/*!< in: table name in the form
+				dbname '/' tablename */
+	const char*	name2);	/*!< in: table name in the form
+				dbname '/' tablename */
+/*********************************************************************//**
+Removes an index from the cache.
+NOTE(review): this repeats the dict_index_remove_from_cache() declaration
+that already appears earlier in this header (after
+dict_index_add_to_cache()); one of the two is redundant. */
+UNIV_INTERN
+void
+dict_index_remove_from_cache(
+/*=========================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	dict_index_t*	index);	/*!< in, own: index */
+/**********************************************************************//**
+Get index by name
+@return	index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name(
+/*=========================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name);	/*!< in: name of the index to find */
+/**********************************************************************//**
+In case there is more than one index with the same name return the index
+with the min(id).
+@return	index, NULL if does not exist */
+UNIV_INTERN
+dict_index_t*
+dict_table_get_index_on_name_and_min_id(
+/*====================================*/
+	dict_table_t*	table,	/*!< in: table */
+	const char*	name);	/*!< in: name of the index to find */
+
+UNIV_INTERN
+void
+dict_table_LRU_trim(
+/*================*/
+	dict_table_t*	self);	/*!< in: NOTE(review): this function has no
+				documentation in the header; presumably a
+				table that must be kept while entries are
+				trimmed from dict_sys->table_LRU -- confirm
+				against the definition */
+/* Buffers for storing detailed information about the latest foreign key
+and unique key errors */
+extern FILE*	dict_foreign_err_file;
+extern mutex_t	dict_foreign_err_mutex; /* mutex protecting the buffers */
+
+/** the dictionary system */
+extern dict_sys_t*	dict_sys;
+/** the data dictionary rw-latch protecting dict_sys */
+extern rw_lock_t	dict_operation_lock;
+
+/** Dictionary system struct. Holds the dictionary mutex, the next row id
+to assign, the name- and id-based hash tables and the LRU list of tables,
+the size accounting of the cached objects, and pointers to the SYS_*
+system tables. */
+struct dict_sys_struct{
+	mutex_t		mutex;		/*!< mutex protecting the data
+					dictionary; protects also the
+					disk-based dictionary system tables;
+					this mutex serializes CREATE TABLE
+					and DROP TABLE, as well as reading
+					the dictionary data for a table from
+					system tables */
+	dulint		row_id;		/*!< the next row id to assign;
+					NOTE that at a checkpoint this
+					must be written to the dict system
+					header and flushed to a file; in
+					recovery this must be derived from
+					the log records */
+	hash_table_t*	table_hash;	/*!< hash table of the tables, based
+					on name */
+	hash_table_t*	table_id_hash;	/*!< hash table of the tables, based
+					on id */
+	UT_LIST_BASE_NODE_T(dict_table_t)
+			table_LRU;	/*!< LRU list of tables */
+	ulint		size;		/*!< varying space in bytes occupied
+					by the data dictionary table and
+					index objects */
+	dict_table_t*	sys_tables;	/*!< SYS_TABLES table */
+	dict_table_t*	sys_columns;	/*!< SYS_COLUMNS table */
+	dict_table_t*	sys_indexes;	/*!< SYS_INDEXES table */
+	dict_table_t*	sys_fields;	/*!< SYS_FIELDS table */
+	dict_table_t*	sys_stats;	/*!< SYS_STATS table */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** dummy index for ROW_FORMAT=REDUNDANT supremum and infimum records */
+extern dict_index_t*	dict_ind_redundant;
+/** dummy index for ROW_FORMAT=COMPACT supremum and infimum records */
+extern dict_index_t*	dict_ind_compact;
+
+/**********************************************************************//**
+Inits dict_ind_redundant and dict_ind_compact. */
+UNIV_INTERN
+void
+dict_ind_init(void);
+/*===============*/
+
+/**********************************************************************//**
+Closes the data dictionary module. */
+UNIV_INTERN
+void
+dict_close(void);
+/*============*/
+
+/*************************************************************************
+Sets the is_corrupt flag by space id (presumably on the table or tables
+stored in that tablespace -- confirm against the definition). */
+
+void
+dict_table_set_corrupt_by_space(
+/*============================*/
+	ulint	space_id,	/*!< in: space id */
+	ibool	need_mutex);	/*!< in: NOTE(review): undocumented;
+				presumably TRUE if this function must
+				acquire the dictionary mutex itself --
+				confirm against the definition */
+
+#ifndef UNIV_NONINL
+#include "dict0dict.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic
new file mode 100644
index 00000000000..bd7534dc7e2
--- /dev/null
+++ b/storage/xtradb/include/dict0dict.ic
@@ -0,0 +1,861 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0dict.ic
+Data dictionary system
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0load.h"
+#include "rem0types.h"
+
+/*********************************************************************//**
+Copies the data type attributes of a column (mtype, prtype, len, mbminlen
+and mbmaxlen) into a dtype_t object. */
+UNIV_INLINE
+void
+dict_col_copy_type(
+/*===============*/
+	const dict_col_t*	col,	/*!< in: column */
+	dtype_t*		type)	/*!< out: data type */
+{
+	ut_ad(col && type);
+
+	/* The mbminlen/mbmaxlen pair */
+	type->mbmaxlen = col->mbmaxlen;
+	type->mbminlen = col->mbminlen;
+
+	/* Length, precise type and main type */
+	type->len = col->len;
+	type->prtype = col->prtype;
+	type->mtype = col->mtype;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Assert that a column and a data type match: debug-asserts that mtype,
+prtype and len are equal and, unless UNIV_HOTBACKUP is defined, that
+mbminlen and mbmaxlen are equal as well.
+@return	TRUE */
+UNIV_INLINE
+ibool
+dict_col_type_assert_equal(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	const dtype_t*		type)	/*!< in: data type */
+{
+	ut_ad(col);
+	ut_ad(type);
+
+	ut_ad(col->mtype == type->mtype);
+	ut_ad(col->prtype == type->prtype);
+	ut_ad(col->len == type->len);
+# ifndef UNIV_HOTBACKUP
+	ut_ad(col->mbminlen == type->mbminlen);
+	ut_ad(col->mbmaxlen == type->mbmaxlen);
+# endif /* !UNIV_HOTBACKUP */
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Computes the minimum size of a column by handing its type attributes to
+dtype_get_min_size_low().
+@return	minimum size */
+UNIV_INLINE
+ulint
+dict_col_get_min_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	ulint	min_size;
+
+	min_size = dtype_get_min_size_low(col->mtype, col->prtype, col->len,
+					  col->mbminlen, col->mbmaxlen);
+
+	return(min_size);
+}
+/***********************************************************************//**
+Computes the maximum size of a column from its main type and length via
+dtype_get_max_size_low().
+@return	maximum size */
+UNIV_INLINE
+ulint
+dict_col_get_max_size(
+/*==================*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	ulint	max_size;
+
+	max_size = dtype_get_max_size_low(col->mtype, col->len);
+
+	return(max_size);
+}
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+Computes the size of a fixed size column by handing its type attributes
+and the row format flag to dtype_get_fixed_size_low().
+@return	fixed size, or 0 if the column is not of fixed size */
+UNIV_INLINE
+ulint
+dict_col_get_fixed_size(
+/*====================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	ulint	fixed_size;
+
+	fixed_size = dtype_get_fixed_size_low(col->mtype, col->prtype,
+					      col->len, col->mbminlen,
+					      col->mbmaxlen, comp);
+
+	return(fixed_size);
+}
+/***********************************************************************//**
+Returns the ROW_FORMAT=REDUNDANT stored SQL NULL size of a column. This is
+the value of dict_col_get_fixed_size(): the fixed length for fixed length
+types, otherwise 0.
+@return	SQL null storage size in ROW_FORMAT=REDUNDANT */
+UNIV_INLINE
+ulint
+dict_col_get_sql_null_size(
+/*=======================*/
+	const dict_col_t*	col,	/*!< in: column */
+	ulint			comp)	/*!< in: nonzero=ROW_FORMAT=COMPACT  */
+{
+	ulint	null_size;
+
+	/* The SQL NULL size is defined to be the fixed size of the column */
+	null_size = dict_col_get_fixed_size(col, comp);
+
+	return(null_size);
+}
+
+/*********************************************************************//**
+Reads the column number stored in the column object.
+@return	col->ind, table column position (starting from 0) */
+UNIV_INLINE
+ulint
+dict_col_get_no(
+/*============*/
+	const dict_col_t*	col)	/*!< in: column */
+{
+	ulint	col_no;
+
+	ut_ad(col);
+
+	col_no = col->ind;
+
+	return(col_no);
+}
+
+/*********************************************************************//**
+Gets the column position in the clustered index. Only fields that hold
+the whole column (prefix_len == 0) are considered a match.
+@return	position of the first matching field among the n_def defined
+fields, or ULINT_UNDEFINED if there is none */
+UNIV_INLINE
+ulint
+dict_col_get_clust_pos(
+/*===================*/
+	const dict_col_t*	col,		/*!< in: table column */
+	const dict_index_t*	clust_index)	/*!< in: clustered index */
+{
+	const dict_field_t*	field;
+	ulint			pos;
+
+	ut_ad(col);
+	ut_ad(clust_index);
+	ut_ad(dict_index_is_clust(clust_index));
+
+	/* Walk the defined fields with a cursor that advances together
+	with the position counter. */
+	field = clust_index->fields;
+
+	for (pos = 0; pos < clust_index->n_def; pos++, field++) {
+		if (field->col == col && !field->prefix_len) {
+
+			return(pos);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the first index on the table (the clustered index), i.e. the first
+element of the table's list of indexes.
+@return	index, NULL if none exists */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_first_index(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	dict_table_t*	tab;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	/* Non-const view of the table for the list macro */
+	tab = (dict_table_t*) table;
+
+	return(UT_LIST_GET_FIRST(tab->indexes));
+}
+
+/********************************************************************//**
+Gets the index that follows the given one in the table's list of indexes.
+@return	index, NULL if none left */
+UNIV_INLINE
+dict_index_t*
+dict_table_get_next_index(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	dict_index_t*	cur;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	/* Non-const view of the index for the list macro */
+	cur = (dict_index_t*) index;
+
+	return(UT_LIST_GET_NEXT(indexes, cur));
+}
+#endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Check whether the index is the clustered index, by testing the
+DICT_CLUSTERED bit of index->type.
+@return	nonzero for clustered index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_clust(
+/*================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	clust_bit;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	clust_bit = index->type & DICT_CLUSTERED;
+
+	return(UNIV_UNLIKELY(clust_bit));
+}
+/********************************************************************//**
+Check whether the index is unique, by testing the DICT_UNIQUE bit of
+index->type.
+@return	nonzero for unique index, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_unique(
+/*=================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	unique_bit;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	unique_bit = index->type & DICT_UNIQUE;
+
+	return(UNIV_UNLIKELY(unique_bit));
+}
+
+/********************************************************************//**
+Check whether the index is the insert buffer tree, by testing the
+DICT_IBUF bit of index->type.
+@return	nonzero for insert buffer, zero for other indexes */
+UNIV_INLINE
+ulint
+dict_index_is_ibuf(
+/*===============*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	ibuf_bit;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	ibuf_bit = index->type & DICT_IBUF;
+
+	return(UNIV_UNLIKELY(ibuf_bit));
+}
+
+/********************************************************************//**
+Check whether the index is a secondary index or the insert buffer tree.
+@return	nonzero if the DICT_CLUSTERED bit is not set in index->type or
+the DICT_IBUF bit is set (secondary index or insert buffer tree), zero
+otherwise */
+UNIV_INLINE
+ulint
+dict_index_is_sec_or_ibuf(
+/*======================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	type;
+
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	type = index->type;
+
+	return(UNIV_LIKELY(!(type & DICT_CLUSTERED) || (type & DICT_IBUF)));
+}
+
+/********************************************************************//**
+Gets the number of user-defined columns in a table in the dictionary
+cache, computed as the total column count minus DATA_N_SYS_COLS.
+@return	number of user-defined (e.g., not ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_user_cols(
+/*=======================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ulint	n_user_cols;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	n_user_cols = table->n_cols - DATA_N_SYS_COLS;
+
+	return(n_user_cols);
+}
+
+/********************************************************************//**
+Gets the number of system columns in a table in the dictionary cache.
+The table argument is only examined by the debug assertions (hence the
+unused attribute); the result is always the constant DATA_N_SYS_COLS.
+@return	number of system (e.g., ROW_ID) columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_sys_cols(
+/*======================*/
+	const dict_table_t*	table __attribute__((unused)))	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+	ut_ad(table->cached);
+
+	return(DATA_N_SYS_COLS);
+}
+
+/********************************************************************//**
+Gets the number of all columns (also system) in a table in the dictionary
+cache, as stored in table->n_cols.
+@return	number of columns of a table */
+UNIV_INLINE
+ulint
+dict_table_get_n_cols(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table, must not be NULL */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth column of a table; pos must be less than table->n_def.
+@return	pointer to column object, i.e. table->cols + pos */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_nth_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table, must not be NULL */
+	ulint			pos)	/*!< in: position of column */
+{
+	ut_ad(table);
+	ut_ad(pos < table->n_def);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return((dict_col_t*) (table->cols) + pos);
+}
+
+/********************************************************************//**
+Gets the given system column of a table (one of its last DATA_N_SYS_COLS).
+@return	pointer to column object at n_cols - DATA_N_SYS_COLS + sys */
+UNIV_INLINE
+dict_col_t*
+dict_table_get_sys_col(
+/*===================*/
+	const dict_table_t*	table,	/*!< in: table, must not be NULL */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+{
+	dict_col_t*	col;
+
+	ut_ad(table);
+	ut_ad(sys < DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	col = dict_table_get_nth_col(table, table->n_cols
+				     - DATA_N_SYS_COLS + sys);
+	ut_ad(col->mtype == DATA_SYS);
+	ut_ad(col->prtype == (sys | DATA_NOT_NULL));
+
+	return(col);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Gets the given system column number of a table.
+@return	column number, i.e. table->n_cols - DATA_N_SYS_COLS + sys */
+UNIV_INLINE
+ulint
+dict_table_get_sys_col_no(
+/*======================*/
+	const dict_table_t*	table,	/*!< in: table, must not be NULL */
+	ulint			sys)	/*!< in: DATA_ROW_ID, ... */
+{
+	ut_ad(table);
+	ut_ad(sys < DATA_N_SYS_COLS);
+	ut_ad(table->magic_n == DICT_TABLE_MAGIC_N);
+
+	return(table->n_cols - DATA_N_SYS_COLS + sys);
+}
+
+/********************************************************************//**
+Check whether the table uses the compact page format.
+@return	TRUE if table uses the compact page format, zero otherwise */
+UNIV_INLINE
+ibool
+dict_table_is_comp(
+/*===============*/
+	const dict_table_t*	table)	/*!< in: table, must not be NULL */
+{
+	ut_ad(table);
+	/* The masked flag bit is returned as an ibool, so it must be TRUE */
+#if DICT_TF_COMPACT != TRUE
+# error "DICT_TF_COMPACT must be equal to TRUE"
+#endif
+
+	return(UNIV_LIKELY(table->flags & DICT_TF_COMPACT));
+}
+
+/********************************************************************//**
+Determine the file format of a table from the format bits of table->flags.
+@return	file format version */
+UNIV_INLINE
+ulint
+dict_table_get_format(
+/*==================*/
+	const dict_table_t*	table)	/*!< in: table, must not be NULL */
+{
+	ut_ad(table);
+
+	return((table->flags & DICT_TF_FORMAT_MASK) >> DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Set the file format bits of a table in table->flags. */
+UNIV_INLINE
+void
+dict_table_set_format(
+/*==================*/
+	dict_table_t*	table,	/*!< in/out: table */
+	ulint		format)	/*!< in: file format version; not range-checked here */
+{
+	ut_ad(table);
+
+	table->flags = (table->flags & ~DICT_TF_FORMAT_MASK)
+		| (format << DICT_TF_FORMAT_SHIFT);
+}
+
+/********************************************************************//**
+Decode the compressed page size stored in the DICT_TF_ZSSIZE bits of flags.
+@return	compressed page size, or 0 if the DICT_TF_ZSSIZE bits are all zero */
+UNIV_INLINE
+ulint
+dict_table_flags_to_zip_size(
+/*=========================*/
+	ulint	flags)	/*!< in: table flags */
+{
+	ulint	zip_size = flags & DICT_TF_ZSSIZE_MASK;
+
+	if (UNIV_LIKELY(!zip_size)) {
+		return(0);
+	}
+
+	zip_size = (PAGE_ZIP_MIN_SIZE >> 1)
+		<< (zip_size >> DICT_TF_ZSSIZE_SHIFT);
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	return(zip_size);
+}
+
+/********************************************************************//**
+Determine the compressed page size of a table from its table->flags.
+@return	compressed page size, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_zip_size(
+/*================*/
+	const dict_table_t*	table)	/*!< in: table, must not be NULL */
+{
+	ut_ad(table);
+
+	return(dict_table_flags_to_zip_size(table->flags));
+}
+
+/*********************************************************************//**
+Acquire an exclusive lock on every index tree of the table, so that the
+index trees are not accessed while InnoDB updates internal metadata for
+operations such as truncate table. */
+UNIV_INLINE
+void
+dict_table_x_lock_indexes(
+/*======================*/
+	dict_table_t*	table)	/*!< in: table whose index trees to lock */
+{
+	dict_index_t*	cur;
+
+	ut_a(table);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Walk the index list of the table and X-lock each index tree */
+	cur = dict_table_get_first_index(table);
+	for (; cur != NULL;
+	     cur = dict_table_get_next_index(cur)) {
+		rw_lock_x_lock(dict_index_get_lock(cur));
+	}
+}
+
+/*********************************************************************//**
+Release the exclusive locks on all index trees of the table. */
+UNIV_INLINE
+void
+dict_table_x_unlock_indexes(
+/*========================*/
+	dict_table_t*	table)	/*!< in: table whose index trees to unlock */
+{
+	dict_index_t*	cur;
+
+	ut_a(table);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	cur = dict_table_get_first_index(table);
+	for (; cur != NULL;
+	     cur = dict_table_get_next_index(cur)) {
+		rw_lock_x_unlock(dict_index_get_lock(cur));
+	}
+}
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index,
+including fields added by the dictionary system.
+@return	number of fields, read from index->n_fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_fields(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal
+					representation of index (in
+					the dictionary cache) */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->n_fields);
+}
+
+/********************************************************************//**
+Gets the number of fields in the internal representation of an index
+that uniquely determine the position of an index entry in the index, if
+we do not take multiversioning into account: in the B-tree use the value
+returned by dict_index_get_n_unique_in_tree.
+@return	number of fields, read from index->n_uniq */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+
+	return(index->n_uniq);
+}
+
+/********************************************************************//**
+Gets how many fields of the internal representation of an index
+uniquely determine the position of an index entry in the index tree,
+when multiversioning is also taken into account.
+@return	number of fields */
+UNIV_INLINE
+ulint
+dict_index_get_n_unique_in_tree(
+/*============================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(index->cached);
+
+	/* For a clustered index the unique fields are enough; for any
+	other index all fields of the index are counted. */
+
+	return(dict_index_is_clust(index)
+	       ? dict_index_get_n_unique(index)
+	       : dict_index_get_n_fields(index));
+}
+
+/********************************************************************//**
+Gets the number of user-defined ordering fields in the index. In the internal
+representation of clustered indexes we add the row id to the ordering fields
+to make a clustered index unique, but this function returns the number of
+fields the user defined in the index as ordering fields.
+@return	number of fields, read from index->n_user_defined_cols */
+UNIV_INLINE
+ulint
+dict_index_get_n_ordering_defined_by_user(
+/*======================================*/
+	const dict_index_t*	index)	/*!< in: an internal representation
+					of index (in the dictionary cache) */
+{
+	return(index->n_user_defined_cols);
+}
+
+#ifdef UNIV_DEBUG
+/********************************************************************//**
+Gets the nth field of an index; pos must be less than index->n_def.
+@return	pointer to field object, i.e. index->fields + pos */
+UNIV_INLINE
+dict_field_t*
+dict_index_get_nth_field(
+/*=====================*/
+	const dict_index_t*	index,	/*!< in: index, must not be NULL */
+	ulint			pos)	/*!< in: position of field */
+{
+	ut_ad(index);
+	ut_ad(pos < index->n_def);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return((dict_field_t*) (index->fields) + pos);
+}
+#endif /* UNIV_DEBUG */
+
+/********************************************************************//**
+Finds the position of a system column among the fields of an index.
+@return	position, ULINT_UNDEFINED if not contained */
+UNIV_INLINE
+ulint
+dict_index_get_sys_col_pos(
+/*=======================*/
+	const dict_index_t*	index,	/*!< in: index, not DICT_UNIVERSAL */
+	ulint			type)	/*!< in: DATA_ROW_ID, ... */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+	ut_ad(!(index->type & DICT_UNIVERSAL));
+
+	if (!dict_index_is_clust(index)) {
+		/* Not clustered: look the column up by its column number */
+		return(dict_index_get_nth_col_pos(
+			       index,
+			       dict_table_get_sys_col_no(index->table, type)));
+	}
+
+	return(dict_col_get_clust_pos(
+		       dict_table_get_sys_col(index->table, type), index));
+}
+
+/*********************************************************************//**
+Gets the table column that an index field refers to.
+@return	field->col, pointer to the table column */
+UNIV_INLINE
+const dict_col_t*
+dict_field_get_col(
+/*===============*/
+	const dict_field_t*	field)	/*!< in: index field, must not be NULL */
+{
+	ut_ad(field);
+
+	return(field->col);
+}
+
+/********************************************************************//**
+Gets pointer to the column of the nth field in an index.
+@return	column of the field at position pos */
+UNIV_INLINE
+const dict_col_t*
+dict_index_get_nth_col(
+/*===================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+{
+	return(dict_field_get_col(dict_index_get_nth_field(index, pos)));
+}
+
+/********************************************************************//**
+Gets the column number of the nth field in an index.
+@return	column number, as returned by dict_col_get_no() for that field */
+UNIV_INLINE
+ulint
+dict_index_get_nth_col_no(
+/*======================*/
+	const dict_index_t*	index,	/*!< in: index */
+	ulint			pos)	/*!< in: position of the field */
+{
+	return(dict_col_get_no(dict_index_get_nth_col(index, pos)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Sums dict_col_get_min_size() over the columns of all fields of an index.
+@return	minimum data size in bytes */
+UNIV_INLINE
+ulint
+dict_index_get_min_size(
+/*====================*/
+	const dict_index_t*	index)	/*!< in: index */
+{
+	ulint	min_size	= 0;
+	ulint	i;
+
+	for (i = dict_index_get_n_fields(index); i > 0; i--) {
+		min_size += dict_col_get_min_size(
+			dict_index_get_nth_col(index, i - 1));
+	}
+
+	return(min_size);
+}
+
+/*********************************************************************//**
+Gets the space id of the root of the index tree.
+@return	space id, read from index->space */
+UNIV_INLINE
+ulint
+dict_index_get_space(
+/*=================*/
+	const dict_index_t*	index)	/*!< in: index, must not be NULL */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->space);
+}
+
+/*********************************************************************//**
+Sets the space id of the root of the index tree (stored in index->space). */
+UNIV_INLINE
+void
+dict_index_set_space(
+/*=================*/
+	dict_index_t*	index,	/*!< in/out: index, must not be NULL */
+	ulint		space)	/*!< in: space id */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	index->space = space;
+}
+
+/*********************************************************************//**
+Gets the page number of the root of the index tree.
+@return	page number, read from index->page */
+UNIV_INLINE
+ulint
+dict_index_get_page(
+/*================*/
+	const dict_index_t*	index)	/*!< in: index, must not be NULL */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(index->page);
+}
+
+/*********************************************************************//**
+Sets the page number of the root of index tree (stored in index->page). */
+UNIV_INLINE
+void
+dict_index_set_page(
+/*================*/
+	dict_index_t*	index,	/*!< in/out: index, must not be NULL */
+	ulint		page)	/*!< in: page number */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	index->page = page;
+}
+
+/*********************************************************************//**
+Gets the read-write lock of the index tree.
+@return	read-write lock, the address of index->lock */
+UNIV_INLINE
+rw_lock_t*
+dict_index_get_lock(
+/*================*/
+	dict_index_t*	index)	/*!< in: index, must not be NULL */
+{
+	ut_ad(index);
+	ut_ad(index->magic_n == DICT_INDEX_MAGIC_N);
+
+	return(&(index->lock));
+}
+
+/********************************************************************//**
+Returns free space reserved for future updates of records. This is
+relevant only in the case of many consecutive inserts, as updates
+which make the records bigger might fragment the index.
+@return	number of free bytes on page, reserved for updates (UNIV_PAGE_SIZE / 16) */
+UNIV_INLINE
+ulint
+dict_index_get_space_reserve(void)
+/*==============================*/
+{
+	return(UNIV_PAGE_SIZE / 16);
+}
+
+/**********************************************************************//**
+Checks if a table is in the dictionary cache and makes a found table young.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_check_if_in_cache_low(
+/*=============================*/
+	const char*	table_name)	/*!< in: table name, must not be NULL */
+{
+	dict_table_t*	table;
+	ulint		table_fold;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table name in the hash table dict_sys->table_hash */
+	table_fold = ut_fold_string(table_name);
+
+	HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    !strcmp(table->name, table_name));
+
+	/* make young in table_LRU: move the table to the head of the list */
+	if (table) {
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+	}
+
+	return(table);
+}
+
+/**********************************************************************//**
+Gets a table; loads it to the dictionary cache if necessary. A low-level
+function: the cache is searched first, then dict_load_table() is tried.
+@return	table, NULL if not found */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_low(
+/*===============*/
+	const char*	table_name)	/*!< in: table name, must not be NULL */
+{
+	dict_table_t*	table;
+
+	ut_ad(table_name);
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	table = dict_table_check_if_in_cache_low(table_name);
+
+	if (table == NULL) {
+		table = dict_load_table(table_name);
+	}
+
+	ut_ad(!table || table->cached);
+
+	return(table);
+}
+
+/**********************************************************************//**
+Returns a table object based on table id; loads it if it is not cached.
+@return	table, NULL if does not exist */
+UNIV_INLINE
+dict_table_t*
+dict_table_get_on_id_low(
+/*=====================*/
+	dulint	table_id)	/*!< in: table id */
+{
+	dict_table_t*	table;
+	ulint		fold;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* Look for the table id in the hash table dict_sys->table_id_hash */
+	fold = ut_fold_dulint(table_id);
+
+	HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold,
+		    dict_table_t*, table, ut_ad(table->cached),
+		    !ut_dulint_cmp(table->id, table_id));
+	if (table == NULL) {
+		table = dict_load_table_on_id(table_id);
+	}
+
+	/* make young in table_LRU: move the table to the head of the list */
+	if (table) {
+		UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
+		UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
+	}
+
+	ut_ad(!table || table->cached);
+
+	/* TODO: should get the type information from MySQL */
+
+	return(table);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h
new file mode 100644
index 00000000000..f41882019d5
--- /dev/null
+++ b/storage/xtradb/include/dict0load.h
@@ -0,0 +1,117 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.h
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0load_h
+#define dict0load_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+In a crash recovery we already have all the tablespace objects created.
+This function compares the space id information in the InnoDB data dictionary
+to what we already read with fil_load_single_table_tablespaces().
+
+In a normal startup, we create the tablespace objects for every table in
+InnoDB's data dictionary, if the corresponding .ibd file exists.
+We also scan for the biggest space id, and store it in fil_system. */
+UNIV_INTERN
+void
+dict_check_tablespaces_and_store_max_id(
+/*====================================*/
+	ibool	in_crash_recovery);	/*!< in: are we doing a crash recovery */
+/********************************************************************//**
+Finds the first table name in the given database.
+@return own: table name, NULL if none exists; the caller must free
+the memory in the string! */
+UNIV_INTERN
+char*
+dict_get_first_table_name_in_db(
+/*============================*/
+	const char*	name);	/*!< in: database name which ends in '/' */
+/********************************************************************//**
+Loads a table definition and also all its index definitions, and also
+the cluster definition if the table is a member in a cluster. Also loads
+all foreign key constraints where the foreign key is in the table or where
+a foreign key references columns in this table.
+@return table, NULL if does not exist; if the table is stored in an
+.ibd file, but the file does not exist, then we set the
+ibd_file_missing flag TRUE in the table object we return */
+UNIV_INTERN
+dict_table_t*
+dict_load_table(
+/*============*/
+	const char*	name);	/*!< in: table name in the
+				databasename/tablename format */
+/***********************************************************************//**
+Loads a table object based on the table id.
+@return	table; NULL if table does not exist */
+UNIV_INTERN
+dict_table_t*
+dict_load_table_on_id(
+/*==================*/
+	dulint	table_id);	/*!< in: table id */
+/********************************************************************//**
+This function is called when the database is booted.
+Loads system table index definitions except for the clustered index which
+is added to the dictionary cache at booting before calling this function. */
+UNIV_INTERN
+void
+dict_load_sys_table(
+/*================*/
+	dict_table_t*	table);	/*!< in: system table */
+/***********************************************************************//**
+Loads foreign key constraints where the table is either the foreign key
+holder or where the table is referenced by a foreign key. Adds these
+constraints to the data dictionary. Note that we know that the dictionary
+cache already contains all constraints where the other relevant table is
+already in the dictionary cache.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+dict_load_foreigns(
+/*===============*/
+	const char*	table_name,	/*!< in: table name */
+	ibool		check_recursive,/*!< in: Whether to check recursive
+					load of tables chained by FK */
+	ibool		check_charsets);/*!< in: TRUE=check charsets
+					compatibility */
+/********************************************************************//**
+Prints to the standard output information on all tables found in the data
+dictionary system table. */
+UNIV_INTERN
+void
+dict_print(void);
+/*============*/
+
+
+#ifndef UNIV_NONINL
+#include "dict0load.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0load.ic b/storage/xtradb/include/dict0load.ic
new file mode 100644
index 00000000000..ccc16db165b
--- /dev/null
+++ b/storage/xtradb/include/dict0load.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0load.ic
+Loads to the memory cache database object definitions
+from dictionary tables
+
+Created 4/24/1996 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
new file mode 100644
index 00000000000..6736c2a3a36
--- /dev/null
+++ b/storage/xtradb/include/dict0mem.h
@@ -0,0 +1,577 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0mem.h
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0mem_h
+#define dict0mem_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "data0type.h"
+#include "mem0mem.h"
+#include "rem0types.h"
+#include "btr0types.h"
+#ifndef UNIV_HOTBACKUP
+# include "lock0types.h"
+# include "que0types.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0rnd.h"
+#include "ut0byte.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+
+/** Type flags of an index: OR'ing of the flags is allowed to define a
+combination of types */
+/* @{ */
+#define DICT_CLUSTERED	1	/*!< clustered index */
+#define DICT_UNIQUE	2	/*!< unique index */
+#define	DICT_UNIVERSAL	4	/*!< index which can contain records from any
+				other index */
+#define	DICT_IBUF 	8	/*!< insert buffer tree */
+/* @} */
+
+/** Types for a table object */
+#define DICT_TABLE_ORDINARY		1 /*!< ordinary table */
+#if 0 /* not implemented */
+#define	DICT_TABLE_CLUSTER_MEMBER	2
+#define	DICT_TABLE_CLUSTER		3 /* this means that the table is
+					  really a cluster definition */
+#endif
+
+/** Table flags.  All unused bits must be 0. */
+/* @{ */
+#define DICT_TF_COMPACT			1	/*!< Compact page format.
+						This must be set for
+						new file formats
+						(later than
+						DICT_TF_FORMAT_51). */
+
+/** Compressed page size (0=uncompressed, up to 15 compressed sizes) */
+/* @{ */
+#define DICT_TF_ZSSIZE_SHIFT		1
+#define DICT_TF_ZSSIZE_MASK		(15 << DICT_TF_ZSSIZE_SHIFT)
+#define DICT_TF_ZSSIZE_MAX (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 1)
+/* @} */
+
+/** File format */
+/* @{ */
+#define DICT_TF_FORMAT_SHIFT		5	/*!< bit position of the file format */
+#define DICT_TF_FORMAT_MASK		\
+((~(~0 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT))) << DICT_TF_FORMAT_SHIFT)
+#define DICT_TF_FORMAT_51		0	/*!< InnoDB/MySQL up to 5.1 */
+#define DICT_TF_FORMAT_ZIP		1	/*!< InnoDB plugin for 5.1:
+						compressed tables,
+						new BLOB treatment */
+/** Maximum supported file format */
+#define DICT_TF_FORMAT_MAX		DICT_TF_FORMAT_ZIP
+/* @} */
+#define DICT_TF_BITS			6	/*!< number of flag bits */
+#if (1 << (DICT_TF_BITS - DICT_TF_FORMAT_SHIFT)) <= DICT_TF_FORMAT_MAX
+# error "DICT_TF_BITS is insufficient for DICT_TF_FORMAT_MAX"
+#endif
+/* @} */
+
+/** @brief Additional table flags.
+
+These flags will be stored in SYS_TABLES.MIX_LEN.  All unused flags
+will be written as 0.  The column may contain garbage for tables
+created with old versions of InnoDB that only implemented
+ROW_FORMAT=REDUNDANT. */
+/* @{ */
+#define DICT_TF2_SHIFT			DICT_TF_BITS
+						/*!< Shift value for
+						table->flags. */
+#define DICT_TF2_TEMPORARY		1	/*!< TRUE for tables from
+						CREATE TEMPORARY TABLE. */
+#define DICT_TF2_BITS			(DICT_TF2_SHIFT + 1)
+						/*!< Total number of bits
+						in table->flags. */
+/* @} */
+
+/** Tables can be chained together by foreign key constraints. When we
+first load the parent table, we also load all of its descendants.
+This could result in recursive calls and eventually a stack overflow.
+DICT_FK_MAX_RECURSIVE_LOAD defines the maximum number of recursive loads;
+when it is exceeded, the child table will not be loaded. It will be loaded
+when the foreign key constraint check needs to be run. */
+#define DICT_FK_MAX_RECURSIVE_LOAD	250
+
+/** Similarly, when tables are chained together by foreign key constraints
+with an ON DELETE/UPDATE CASCADE clause, a delete from the parent table could
+result in recursive cascading calls. This defines the maximum number of
+such cascading deletes/updates allowed. When it is exceeded, the delete from
+the parent table will fail, and the user has to drop the excessive foreign
+key constraints before proceeding. */
+#define FK_MAX_CASCADE_DEL		300
+
+/**********************************************************************//**
+Creates a table memory object.
+@return	own: table object */
+UNIV_INTERN
+dict_table_t*
+dict_mem_table_create(
+/*==================*/
+	const char*	name,		/*!< in: table name */
+	ulint		space,		/*!< in: space where the clustered index
+					of the table is placed; this parameter
+					is ignored if the table is made
+					a member of a cluster */
+	ulint		n_cols,		/*!< in: number of columns */
+	ulint		flags);		/*!< in: table flags */
+/****************************************************************//**
+Free a table memory object. */
+UNIV_INTERN
+void
+dict_mem_table_free(
+/*================*/
+	dict_table_t*	table);		/*!< in: table */
+/**********************************************************************//**
+Adds a column definition to a table. */
+UNIV_INTERN
+void
+dict_mem_table_add_col(
+/*===================*/
+	dict_table_t*	table,	/*!< in: table */
+	mem_heap_t*	heap,	/*!< in: temporary memory heap, or NULL */
+	const char*	name,	/*!< in: column name, or NULL */
+	ulint		mtype,	/*!< in: main datatype */
+	ulint		prtype,	/*!< in: precise type */
+	ulint		len);	/*!< in: precision */
+/**********************************************************************//**
+Creates an index memory object.
+@return	own: index object */
+UNIV_INTERN
+dict_index_t*
+dict_mem_index_create(
+/*==================*/
+	const char*	table_name,	/*!< in: table name */
+	const char*	index_name,	/*!< in: index name */
+	ulint		space,		/*!< in: space where the index tree is
+					placed, ignored if the index is of
+					the clustered type */
+	ulint		type,		/*!< in: DICT_UNIQUE,
+					DICT_CLUSTERED, ... ORed */
+	ulint		n_fields);	/*!< in: number of fields */
+/**********************************************************************//**
+Adds a field definition to an index. NOTE: does not take a copy
+of the column name if the field is a column. The memory occupied
+by the column name may be released only after publishing the index. */
+UNIV_INTERN
+void
+dict_mem_index_add_field(
+/*=====================*/
+	dict_index_t*	index,		/*!< in: index */
+	const char*	name,		/*!< in: column name */
+	ulint		prefix_len);	/*!< in: 0 or the column prefix length
+					in a MySQL index like
+					INDEX (textcol(25)) */
+/**********************************************************************//**
+Frees an index memory object. */
+UNIV_INTERN
+void
+dict_mem_index_free(
+/*================*/
+	dict_index_t*	index);	/*!< in: index */
+/**********************************************************************//**
+Creates and initializes a foreign constraint memory object.
+@return	own: foreign constraint struct */
+UNIV_INTERN
+dict_foreign_t*
+dict_mem_foreign_create(void);
+/*=========================*/
+
+/** Data structure for a column in a table; all members are bit-fields */
+struct dict_col_struct{
+	/*----------------------*/
+	/** The following are copied from dtype_t,
+	so that all bit-fields can be packed tightly. */
+	/* @{ */
+	unsigned	mtype:8;	/*!< main data type */
+	unsigned	prtype:24;	/*!< precise type; MySQL data
+					type, charset code, flags to
+					indicate nullability,
+					signedness, whether this is a
+					binary string, whether this is
+					a true VARCHAR where MySQL
+					uses 2 bytes to store the length */
+
+	/* the remaining fields do not affect alphabetical ordering: */
+
+	unsigned	len:16;		/*!< length; for MySQL data this
+					is field->pack_length(),
+					except that for a >= 5.0.3
+					type true VARCHAR this is the
+					maximum byte length of the
+					string data (in addition to
+					the string, MySQL uses 1 or 2
+					bytes to store the string length) */
+
+	unsigned	mbminlen:2;	/*!< minimum length of a
+					character, in bytes */
+	unsigned	mbmaxlen:3;	/*!< maximum length of a
+					character, in bytes */
+	/*----------------------*/
+	/* End of definitions copied from dtype_t */
+	/* @} */
+
+	unsigned	ind:10;		/*!< table column position
+					(starting from 0); a 10-bit field */
+	unsigned	ord_part:1;	/*!< nonzero if this column
+					appears in the ordering fields
+					of an index */
+};
+
+/** @brief DICT_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length).
+
+It is set to 3*256, so that one can create a column prefix index on
+256 characters of a TEXT or VARCHAR column also in the UTF-8
+charset. In that charset, a character may take at most 3 bytes.  This
+constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define DICT_MAX_INDEX_COL_LEN		REC_MAX_INDEX_COL_LEN
+
+/** Data structure for a field in an index */
+struct dict_field_struct{
+	dict_col_t*	col;		/*!< pointer to the table column */
+	const char*	name;		/*!< name of the column */
+	unsigned	prefix_len:10;	/*!< 0 or the length of the column
+					prefix in bytes in a MySQL index of
+					type, e.g., INDEX (textcol(25));
+					must be smaller than
+					DICT_MAX_INDEX_COL_LEN; NOTE that
+					in the UTF-8 charset, MySQL sets this
+					to 3 * the prefix len in UTF-8 chars */
+	unsigned	fixed_len:10;	/*!< 0 or the fixed length of the
+					column in bytes, if smaller than
+					DICT_MAX_INDEX_COL_LEN */
+};
+
+/** Data structure for an index.  Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_index_create(). */
+struct dict_index_struct{
+	dulint		id;	/*!< id of the index */
+	mem_heap_t*	heap;	/*!< memory heap */
+	const char*	name;	/*!< index name */
+	const char*	table_name;/*!< table name */
+	dict_table_t*	table;	/*!< back pointer to table */
+#ifndef UNIV_HOTBACKUP
+	unsigned	space:32;
+				/*!< space where the index tree is placed */
+	unsigned	page:32;/*!< index tree root page number */
+#endif /* !UNIV_HOTBACKUP */
+	unsigned	type:4;	/*!< index type (DICT_CLUSTERED, DICT_UNIQUE,
+				DICT_UNIVERSAL, DICT_IBUF) */
+	unsigned	trx_id_offset:10;/*!< position of the trx id column
+				in a clustered index record, if the fields
+				before it are known to be of a fixed size,
+				0 otherwise */
+	unsigned	n_user_defined_cols:10;
+				/*!< number of columns the user defined to
+				be in the index: in the internal
+				representation we add more columns */
+	unsigned	n_uniq:10;/*!< number of fields from the beginning
+				which are enough to determine an index
+				entry uniquely */
+	unsigned	n_def:10;/*!< number of fields defined so far */
+	unsigned	n_fields:10;/*!< number of fields in the index */
+	unsigned	n_nullable:10;/*!< number of nullable fields */
+	unsigned	cached:1;/*!< TRUE if the index object is in the
+				dictionary cache */
+	unsigned	to_be_dropped:1;
+				/*!< TRUE if this index is marked to be
+				dropped in ha_innobase::prepare_drop_index(),
+				otherwise FALSE */
+	dict_field_t*	fields;	/*!< array of field descriptions */
+#ifndef UNIV_HOTBACKUP
+	UT_LIST_NODE_T(dict_index_t)
+			indexes;/*!< list of indexes of the table */
+	btr_search_t*	search_info; /*!< info used in optimistic searches */
+	/*----------------------*/
+	/** Statistics for query optimization */
+	/* @{ */
+	ib_int64_t*	stat_n_diff_key_vals;
+				/*!< approximate number of different
+				key values for this index, for each
+				n-column prefix where n <=
+				dict_get_n_unique(index); we
+				periodically calculate new
+				estimates */
+	ulint		stat_index_size;
+				/*!< approximate index size in
+				database pages */
+	ulint		stat_n_leaf_pages;
+				/*!< approximate number of leaf pages in the
+				index tree */
+	/* @} */
+	rw_lock_t	lock;	/*!< read-write lock protecting the
+				upper levels of the index tree */
+	ib_uint64_t	trx_id; /*!< id of the transaction that created this
+				index, or 0 if the index existed
+				when InnoDB was started up */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef UNIV_DEBUG
+	ulint		magic_n;/*!< magic number */
+/** Value of dict_index_struct::magic_n */
+# define DICT_INDEX_MAGIC_N	76789786
+#endif
+};
+
+/** Data structure for a foreign key constraint; an example:
+FOREIGN KEY (A, B) REFERENCES TABLE2 (C, D).  Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_foreign_create(). */
+struct dict_foreign_struct{
+	mem_heap_t*	heap;		/*!< this object is allocated from
+					this memory heap */
+	char*		id;		/*!< id of the constraint as a
+					null-terminated string */
+	unsigned	n_fields:10;	/*!< number of indexes' first fields
+					for which the foreign key
+					constraint is defined: we allow the
+					indexes to contain more fields than
+					mentioned in the constraint, as long
+					as the first fields are as mentioned */
+	unsigned	type:6;		/*!< 0 or a bitwise OR of the
+					DICT_FOREIGN_ON_... flags */
+	char*		foreign_table_name;/*!< foreign table name */
+	dict_table_t*	foreign_table;	/*!< table where the foreign key is */
+	const char**	foreign_col_names;/*!< names of the columns in the
+					foreign key */
+	char*		referenced_table_name;/*!< referenced table name */
+	dict_table_t*	referenced_table;/*!< table where the referenced key
+					is */
+	const char**	referenced_col_names;/*!< names of the referenced
+					columns in the referenced table */
+	dict_index_t*	foreign_index;	/*!< foreign index; we require that
+					both tables contain explicitly defined
+					indexes for the constraint: InnoDB
+					does not generate new indexes
+					implicitly */
+	dict_index_t*	referenced_index;/*!< referenced index */
+	UT_LIST_NODE_T(dict_foreign_t)
+			foreign_list;	/*!< list node for foreign keys of the
+					table */
+	UT_LIST_NODE_T(dict_foreign_t)
+			referenced_list;/*!< list node for referenced
+					keys of the table */
+};
+
+/** The flags for ON_UPDATE and ON_DELETE can be ORed; the default is that
+a foreign key constraint is enforced, therefore RESTRICT just means no flag */
+/* @{ */
+#define DICT_FOREIGN_ON_DELETE_CASCADE	1	/*!< ON DELETE CASCADE */
+#define DICT_FOREIGN_ON_DELETE_SET_NULL	2	/*!< ON DELETE SET NULL */
+#define DICT_FOREIGN_ON_UPDATE_CASCADE	4	/*!< ON UPDATE CASCADE */
+#define DICT_FOREIGN_ON_UPDATE_SET_NULL	8	/*!< ON UPDATE SET NULL */
+#define DICT_FOREIGN_ON_DELETE_NO_ACTION 16	/*!< ON DELETE NO ACTION */
+#define DICT_FOREIGN_ON_UPDATE_NO_ACTION 32	/*!< ON UPDATE NO ACTION */
+/* @} */
+
+
+/** Data structure for a database table.  Most fields will be
+initialized to 0, NULL or FALSE in dict_mem_table_create(). */
+struct dict_table_struct{
+	dulint		id;	/*!< id of the table */
+	mem_heap_t*	heap;	/*!< memory heap */
+	char*		name;	/*!< table name */
+	const char*	dir_path_of_temp_table;/*!< NULL or the directory path
+				where a TEMPORARY table that was explicitly
+				created by a user should be placed if
+				innodb_file_per_table is defined in my.cnf;
+				in Unix this is usually /tmp/..., in Windows
+				temp\... */
+	unsigned	space:32;
+				/*!< space where the clustered index of the
+				table is placed */
+	unsigned	flags:DICT_TF2_BITS;/*!< DICT_TF_COMPACT, ... */
+	unsigned	ibd_file_missing:1;
+				/*!< TRUE if this is in a single-table
+				tablespace and the .ibd file is missing; then
+				we must return in ha_innodb.cc an error if the
+				user tries to query such an orphaned table */
+	unsigned	tablespace_discarded:1;
+				/*!< this flag is set TRUE when the user
+				calls DISCARD TABLESPACE on this
+				table, and reset to FALSE in IMPORT
+				TABLESPACE */
+	unsigned	cached:1;/*!< TRUE if the table object has been added
+				to the dictionary cache */
+	unsigned	n_def:10;/*!< number of columns defined so far */
+	unsigned	n_cols:10;/*!< number of columns */
+	dict_col_t*	cols;	/*!< array of column descriptions */
+	const char*	col_names;
+				/*!< Column names packed in a character string
+				"name1\0name2\0...nameN\0".  Until
+				the string contains n_cols, it will be
+				allocated from a temporary heap.  The final
+				string will be allocated from table->heap. */
+#ifndef UNIV_HOTBACKUP
+	hash_node_t	name_hash; /*!< hash chain node */
+	hash_node_t	id_hash; /*!< hash chain node */
+	UT_LIST_BASE_NODE_T(dict_index_t)
+			indexes; /*!< list of indexes of the table */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)
+			foreign_list;/*!< list of foreign key constraints
+				in the table; these refer to columns
+				in other tables */
+	UT_LIST_BASE_NODE_T(dict_foreign_t)
+			referenced_list;/*!< list of foreign key constraints
+				which refer to this table */
+	UT_LIST_NODE_T(dict_table_t)
+			table_LRU; /*!< node of the LRU list of tables */
+	ulint		n_mysql_handles_opened;
+				/*!< count of how many handles MySQL has opened
+				to this table; dropping of the table is
+				NOT allowed until this count gets to zero;
+				MySQL does NOT itself check the number of
+				open handles at drop */
+	unsigned	fk_max_recusive_level:8;
+				/*!< maximum recursive level we support when
+				loading tables chained together with FK
+				constraints. If exceeds this level, we will
+				stop loading child table into memory along with
+				its parent table */
+	ulint		n_foreign_key_checks_running;
+				/*!< count of how many foreign key check
+				operations are currently being performed
+				on the table: we cannot drop the table while
+				there are foreign key checks running on
+				it! */
+	trx_id_t	query_cache_inv_trx_id;
+				/*!< transactions whose trx id is
+				smaller than this number are not
+				allowed to store to the MySQL query
+				cache or retrieve from it; when a trx
+				with undo logs commits, it sets this
+				to the value of the trx id counter for
+				the tables it had an IX lock on */
+	UT_LIST_BASE_NODE_T(lock_t)
+			locks; /*!< list of locks on the table */
+#ifdef UNIV_DEBUG
+	/*----------------------*/
+	ibool		does_not_fit_in_memory;
+				/*!< this field is used to specify in
+				simulations tables which are so big
+				that disk should be accessed: disk
+				access is simulated by putting the
+				thread to sleep for a while; NOTE that
+				this flag is not stored to the data
+				dictionary on disk, and the database
+				will forget about value TRUE if it has
+				to reload the table definition from
+				disk */
+#endif /* UNIV_DEBUG */
+	/*----------------------*/
+	unsigned	big_rows:1;
+				/*!< flag: TRUE if the maximum length of
+				a single row exceeds BIG_ROW_SIZE;
+				initialized in dict_table_add_to_cache() */
+				/** Statistics for query optimization */
+				/* @{ */
+	unsigned	stat_initialized:1; /*!< TRUE if statistics have
+				been calculated the first time
+				after database startup or table creation */
+	ib_int64_t	stat_n_rows;
+				/*!< approximate number of rows in the table;
+				we periodically calculate new estimates */
+	ulint		stat_clustered_index_size;
+				/*!< approximate clustered index size in
+				database pages */
+	ulint		stat_sum_of_other_index_sizes;
+				/*!< other indexes in database pages */
+	ulint		stat_modified_counter;
+				/*!< when a row is inserted, updated,
+				or deleted,
+				we add 1 to this number; we calculate new
+				estimates for the stat_... values for the
+				table and the indexes at an interval of 2 GB
+				or when about 1 / 16 of table has been
+				modified; also when the estimate operation is
+				called for MySQL SHOW TABLE STATUS; the
+				counter is reset to zero at statistics
+				calculation; this counter is not protected by
+				any latch, because this is only used for
+				heuristics */
+				/* @} */
+	/*----------------------*/
+				/** The following fields are used by the
+				AUTOINC code.  The actual collection of
+				tables locked during AUTOINC read/write is
+				kept in trx_t. In order to quickly determine
+				whether a transaction has locked the AUTOINC
+				lock we keep a pointer to the transaction
+				here in the autoinc_trx variable. This is to
+				avoid acquiring the kernel mutex and scanning
+				the vector in trx_t.
+
+				When an AUTOINC lock has to wait, the
+				corresponding lock instance is created on
+				the trx lock heap rather than use the
+				pre-allocated instance in autoinc_lock below.*/
+				/* @{ */
+	lock_t*		autoinc_lock;
+				/*!< a buffer for an AUTOINC lock
+				for this table: we allocate the memory here
+				so that individual transactions can get it
+				and release it without a need to allocate
+				space from the lock heap of the trx:
+				otherwise the lock heap would grow rapidly
+				if we do a large insert from a select */
+	mutex_t		autoinc_mutex;
+				/*!< mutex protecting the autoincrement
+				counter */
+	ib_uint64_t	autoinc;/*!< autoinc counter value to give to the
+				next inserted row */
+	ulong		n_waiting_or_granted_auto_inc_locks;
+				/*!< This counter is used to track the number
+				of granted and pending autoinc locks on this
+				table. This value is set after acquiring the
+				kernel mutex but we peek the contents to
+				determine whether other transactions have
+				acquired the AUTOINC lock or not. Of course
+				only one transaction can be granted the
+				lock but there can be multiple waiters. */
+	const trx_t*		autoinc_trx;
+				/*!< The transaction that currently holds
+				the AUTOINC lock on this table. */
+				/* @} */
+	/*----------------------*/
+	ibool		is_corrupt;
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+	ulint		magic_n;/*!< magic number */
+/** Value of dict_table_struct::magic_n */
+# define DICT_TABLE_MAGIC_N	76333786
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "dict0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0mem.ic b/storage/xtradb/include/dict0mem.ic
new file mode 100644
index 00000000000..c36adb07a18
--- /dev/null
+++ b/storage/xtradb/include/dict0mem.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0mem.ic
+Data dictionary memory object creation
+
+Created 1/8/1996 Heikki Tuuri
+***********************************************************************/
+
+
diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h
new file mode 100644
index 00000000000..7ad69193cc9
--- /dev/null
+++ b/storage/xtradb/include/dict0types.h
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dict0types.h
+Data dictionary global types
+
+Created 1/8/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dict0types_h
+#define dict0types_h
+
+typedef struct dict_sys_struct		dict_sys_t;
+typedef struct dict_col_struct		dict_col_t;
+typedef struct dict_field_struct	dict_field_t;
+typedef struct dict_index_struct	dict_index_t;
+typedef struct dict_table_struct	dict_table_t;
+typedef struct dict_foreign_struct	dict_foreign_t;
+
+/* A cluster object is a table object with the type field set to
+DICT_CLUSTERED */
+
+typedef dict_table_t			dict_cluster_t;
+
+typedef struct ind_node_struct		ind_node_t;
+typedef struct tab_node_struct		tab_node_t;
+
+/* Space id and page no where the dictionary header resides */
+#define	DICT_HDR_SPACE		0	/* the SYSTEM tablespace */
+#define	DICT_HDR_PAGE_NO	FSP_DICT_HDR_PAGE_NO
+
+#endif
diff --git a/storage/xtradb/include/dyn0dyn.h b/storage/xtradb/include/dyn0dyn.h
new file mode 100644
index 00000000000..121a5946ac7
--- /dev/null
+++ b/storage/xtradb/include/dyn0dyn.h
@@ -0,0 +1,188 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.h
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef dyn0dyn_h
+#define dyn0dyn_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "mem0mem.h"
+
+/** A block in a dynamically allocated array */
+typedef struct dyn_block_struct		dyn_block_t;
+/** Dynamically allocated array */
+typedef dyn_block_t			dyn_array_t;
+
+
+/** This is the initial 'payload' size of a dynamic array;
+this must be > MLOG_BUF_MARGIN + 30! */
+#define	DYN_ARRAY_DATA_SIZE	512
+
+/*********************************************************************//**
+Initializes a dynamic array.
+@return	initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+	dyn_array_t*	arr);	/*!< in: pointer to a memory buffer of
+				size sizeof(dyn_array_t) */
+/************************************************************//**
+Frees a dynamic array. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+	dyn_array_t*	arr);	/*!< in: dyn array */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to a buffer in it.
+After copying the elements, the caller must close the buffer using
+dyn_array_close.
+@return	pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	ulint		size);	/*!< in: size in bytes of the buffer; MUST be
+				smaller than DYN_ARRAY_DATA_SIZE! */
+/*********************************************************************//**
+Closes the buffer returned by dyn_array_open. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	byte*		ptr);	/*!< in: buffer space from ptr up was not used */
+/*********************************************************************//**
+Makes room on top of a dyn array and returns a pointer to
+the added element. The caller must copy the element to
+the pointer returned.
+@return	pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	ulint		size);	/*!< in: size in bytes of the element */
+/************************************************************//**
+Returns pointer to an element in dyn array.
+@return	pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	ulint		pos);	/*!< in: position of element as bytes
+				from array start */
+/************************************************************//**
+Returns the size of stored data in a dyn array.
+@return	data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+	dyn_array_t*	arr);	/*!< in: dyn array */
+/************************************************************//**
+Gets the first block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+	dyn_array_t*	arr);	/*!< in: dyn array */
+/************************************************************//**
+Gets the last block in a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+	dyn_array_t*	arr);	/*!< in: dyn array */
+/********************************************************************//**
+Gets the next block in a dyn array.
+@return	pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	dyn_block_t*	block);	/*!< in: dyn array block */
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return	number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+	dyn_block_t*	block);	/*!< in: dyn array block */
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return	pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+	dyn_block_t*	block);	/*!< in: dyn array block */
+/********************************************************//**
+Pushes n bytes to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len);	/*!< in: string length */
+
+/*#################################################################*/
+
+/** @brief A block in a dynamically allocated array.
+NOTE! Do not access the fields of the struct directly: the definition
+appears here only for the compiler to know its size! */
+struct dyn_block_struct{
+	mem_heap_t*	heap;	/*!< in the first block this is != NULL
+				if dynamic allocation has been needed */
+	ulint		used;	/*!< number of data bytes used in this block;
+				DYN_BLOCK_FULL_FLAG is set when the block
+				becomes full */
+	byte		data[DYN_ARRAY_DATA_SIZE];
+				/*!< storage for array elements */
+	UT_LIST_BASE_NODE_T(dyn_block_t) base;
+				/*!< linear list of dyn blocks: this node is
+				used only in the first block */
+	UT_LIST_NODE_T(dyn_block_t) list;
+				/*!< linear list node: used in all blocks */
+#ifdef UNIV_DEBUG
+	ulint		buf_end;/*!< only in the debug version: if dyn
+				array is opened, this is the buffer
+				end offset, else this is 0 */
+	ulint		magic_n;/*!< magic number (DYN_BLOCK_MAGIC_N) */
+#endif
+};
+
+
+#ifndef UNIV_NONINL
+#include "dyn0dyn.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dyn0dyn.ic b/storage/xtradb/include/dyn0dyn.ic
new file mode 100644
index 00000000000..110e674abff
--- /dev/null
+++ b/storage/xtradb/include/dyn0dyn.ic
@@ -0,0 +1,365 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/dyn0dyn.ic
+The dynamically allocated array
+
+Created 2/5/1996 Heikki Tuuri
+*******************************************************/
+
+/** Value of dyn_block_struct::magic_n */
+#define DYN_BLOCK_MAGIC_N	375767
+/** Flag for dyn_block_struct::used that indicates a full block */
+#define DYN_BLOCK_FULL_FLAG	0x1000000UL
+
+/************************************************************//**
+Adds a new block to a dyn array.
+@return	created block */
+UNIV_INTERN
+dyn_block_t*
+dyn_array_add_block(
+/*================*/
+	dyn_array_t*	arr);	/*!< in: dyn array */
+
+
+/************************************************************//**
+Gets the first block in a dyn array; the array object is that block. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_first_block(
+/*======================*/
+	dyn_array_t*	arr)	/*!< in: dyn array */
+{
+	return(arr);
+}
+
+/************************************************************//**
+Returns the block at the tail of a dyn array. */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_last_block(
+/*=====================*/
+	dyn_array_t*	arr)	/*!< in: dyn array */
+{
+	/* With a heap attached the tail of the arr->base list is the
+	last block; without one the array itself is the only block. */
+	if (arr->heap != NULL) {
+		return(UT_LIST_GET_LAST(arr->base));
+	}
+	return(arr);
+}
+
+/********************************************************************//**
+Returns the block that follows the given block in a dyn array.
+@return	pointer to next, NULL if end of list */
+UNIV_INLINE
+dyn_block_t*
+dyn_array_get_next_block(
+/*=====================*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	dyn_block_t*	block)	/*!< in: dyn array block */
+{
+	ut_ad(arr && block);
+	if (arr->heap != NULL) {
+		/* A heap is attached: follow the block list. */
+		return(UT_LIST_GET_NEXT(list, block));
+	}
+
+	/* No heap: the array itself is the one and only block. */
+	ut_ad(arr == block);
+
+	return(NULL);
+}
+
+/********************************************************************//**
+Gets the number of used bytes in a dyn array block.
+@return	number of bytes used */
+UNIV_INLINE
+ulint
+dyn_block_get_used(
+/*===============*/
+	dyn_block_t*	block)	/*!< in: dyn array block */
+{
+	ut_ad(block);
+	/* Strip DYN_BLOCK_FULL_FLAG, which shares the word with the count */
+	return((block->used) & ~DYN_BLOCK_FULL_FLAG);
+}
+
+/********************************************************************//**
+Gets pointer to the start of data in a dyn array block.
+@return	pointer to data */
+UNIV_INLINE
+byte*
+dyn_block_get_data(
+/*===============*/
+	dyn_block_t*	block)	/*!< in: dyn array block */
+{
+	ut_ad(block);
+	/* The returned pointer is the block's own data member. */
+	return(block->data);
+}
+
+/*********************************************************************//**
+Sets up a caller-provided buffer as an empty dynamic array.
+@return	initialized dyn array */
+UNIV_INLINE
+dyn_array_t*
+dyn_array_create(
+/*=============*/
+	dyn_array_t*	arr)	/*!< in: pointer to a memory buffer of
+				size sizeof(dyn_array_t) */
+{
+#if DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG
+# error "DYN_ARRAY_DATA_SIZE >= DYN_BLOCK_FULL_FLAG"
+#endif
+	ut_ad(arr);
+
+	/* Start out as a single empty block with no heap attached. */
+	arr->used = 0;
+	arr->heap = NULL;
+#ifdef UNIV_DEBUG
+	arr->magic_n = DYN_BLOCK_MAGIC_N;
+	arr->buf_end = 0;
+#endif
+	return(arr);
+}
+
+/************************************************************//**
+Frees a dynamic array: releases its heap, if one is attached. */
+UNIV_INLINE
+void
+dyn_array_free(
+/*===========*/
+	dyn_array_t*	arr)	/*!< in: dyn array */
+{
+	if (arr->heap != NULL) {
+		mem_heap_free(arr->heap);
+	}
+	/* Clear the magic number so debug assertions reject a freed array. */
+#ifdef UNIV_DEBUG
+	arr->magic_n = 0;
+#endif
+}
+
+/*********************************************************************//**
+Reserves size bytes at the end of a dyn array for one new element.
+The caller must copy the element to the pointer returned.
+@return	pointer to the element */
+UNIV_INLINE
+void*
+dyn_array_push(
+/*===========*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	ulint		size)	/*!< in: size in bytes of the element */
+{
+	dyn_block_t*	target	= arr;
+	ulint		offset;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+	ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+	ut_ad(size);
+
+	offset = target->used;
+
+	if (offset + size > DYN_ARRAY_DATA_SIZE) {
+		/* The first block cannot take the element: try the
+		last block of the array. */
+		target = dyn_array_get_last_block(arr);
+		offset = target->used;
+	}
+
+	if (offset + size > DYN_ARRAY_DATA_SIZE) {
+		/* No room there either: append a fresh block. */
+		target = dyn_array_add_block(arr);
+		offset = target->used;
+	}
+
+	target->used = offset + size;
+	ut_ad(target->used <= DYN_ARRAY_DATA_SIZE);
+
+	return(target->data + offset);
+}
+
+/*********************************************************************//**
+Reserves a write buffer of size bytes at the end of a dyn array without
+yet marking any of it as used. After copying the elements, the caller
+must close the buffer using dyn_array_close.
+@return	pointer to the buffer */
+UNIV_INLINE
+byte*
+dyn_array_open(
+/*===========*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	ulint		size)	/*!< in: size in bytes of the buffer; must
+				not exceed DYN_ARRAY_DATA_SIZE! */
+{
+	dyn_block_t*	target	= arr;
+	ulint		offset;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+	ut_ad(size <= DYN_ARRAY_DATA_SIZE);
+	ut_ad(size);
+
+	offset = target->used;
+
+	if (offset + size > DYN_ARRAY_DATA_SIZE) {
+		/* The first block cannot hold the buffer: try the
+		last block of the array. */
+		target = dyn_array_get_last_block(arr);
+		offset = target->used;
+	}
+
+	if (offset + size > DYN_ARRAY_DATA_SIZE) {
+		/* No room there either: append a fresh block. */
+		target = dyn_array_add_block(arr);
+		offset = target->used;
+		ut_a(size <= DYN_ARRAY_DATA_SIZE);
+	}
+
+	ut_ad(target->used <= DYN_ARRAY_DATA_SIZE);
+#ifdef UNIV_DEBUG
+	/* Record the end offset of the opened buffer; 0 means closed. */
+	ut_ad(arr->buf_end == 0);
+	arr->buf_end = offset + size;
+#endif
+	return(target->data + offset);
+}
+
+/*********************************************************************//**
+Ends a write started with dyn_array_open and records the bytes used. */
+UNIV_INLINE
+void
+dyn_array_close(
+/*============*/
+	dyn_array_t*	arr,	/*!< in: dynamic array */
+	byte*		ptr)	/*!< in: buffer space from ptr up was not used */
+{
+	dyn_block_t*	last;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+	/* The used count is updated on the last block of the array. */
+	last = dyn_array_get_last_block(arr);
+	ut_ad(ptr <= last->data + arr->buf_end);
+
+	/* Everything in the block below ptr now counts as used. */
+	last->used = ptr - last->data;
+	ut_ad(last->used <= DYN_ARRAY_DATA_SIZE);
+#ifdef UNIV_DEBUG
+	/* Mark the array as no longer open. */
+	arr->buf_end = 0;
+#endif
+}
+
+/************************************************************//**
+Locates the byte at a given offset within the data of a dyn array.
+@return	pointer to element */
+UNIV_INLINE
+void*
+dyn_array_get_element(
+/*==================*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	ulint		pos)	/*!< in: position of element as bytes
+				from array start */
+{
+	dyn_block_t*	cur;
+	ulint		n_used;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+	cur = dyn_array_get_first_block(arr);
+
+	if (arr->heap != NULL) {
+		/* Walk the block list, deducting each block's used
+		byte count from pos, until pos falls inside a block. */
+		for (n_used = dyn_block_get_used(cur);
+		     pos >= n_used;
+		     n_used = dyn_block_get_used(cur)) {
+
+			pos -= n_used;
+			cur = UT_LIST_GET_NEXT(list, cur);
+			ut_ad(cur);
+		}
+	}
+
+	ut_ad(cur);
+	ut_ad(dyn_block_get_used(cur) >= pos);
+
+	return(cur->data + pos);
+}
+
+/************************************************************//**
+Computes the total number of data bytes stored in a dyn array.
+@return	data size in bytes */
+UNIV_INLINE
+ulint
+dyn_array_get_data_size(
+/*====================*/
+	dyn_array_t*	arr)	/*!< in: dyn array */
+{
+	dyn_block_t*	cur;
+	ulint		total	= 0;
+
+	ut_ad(arr);
+	ut_ad(arr->magic_n == DYN_BLOCK_MAGIC_N);
+
+	if (arr->heap == NULL) {
+		/* Single block: its used count is the whole size. */
+		return(arr->used);
+	}
+
+	/* Several blocks: add up the used bytes of every block,
+	starting from the first one. */
+	for (cur = dyn_array_get_first_block(arr);
+	     cur != NULL;
+	     cur = dyn_array_get_next_block(arr, cur)) {
+
+		total += dyn_block_get_used(cur);
+	}
+	return(total);
+}
+
+/********************************************************//**
+Appends len bytes starting at str to a dyn array. */
+UNIV_INLINE
+void
+dyn_push_string(
+/*============*/
+	dyn_array_t*	arr,	/*!< in: dyn array */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len)	/*!< in: string length */
+{
+	ulint	chunk;
+	void*	dest;
+
+	for (; len > 0; str += chunk, len -= chunk) {
+		/* Push at most DYN_ARRAY_DATA_SIZE bytes per round,
+		so longer strings are copied in several chunks. */
+		chunk = (len > DYN_ARRAY_DATA_SIZE)
+			? DYN_ARRAY_DATA_SIZE
+			: len;
+
+		dest = dyn_array_push(arr, chunk);
+
+		memcpy(dest, str, chunk);
+	}
+}
diff --git a/storage/xtradb/include/eval0eval.h b/storage/xtradb/include/eval0eval.h
new file mode 100644
index 00000000000..60aefd8d453
--- /dev/null
+++ b/storage/xtradb/include/eval0eval.h
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.h
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0eval_h
+#define eval0eval_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/*****************************************************************//**
+Free the buffer from global dynamic memory for a value of a que_node,
+if it has been allocated by eval_node_alloc_val_buf(). The freeing for pushed
+column values is done in sel_col_prefetch_buf_free. */
+UNIV_INTERN
+void
+eval_node_free_val_buf(
+/*===================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a symbol table symbol. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+	sym_node_t*	sym_node);	/*!< in: symbol table node */
+/*****************************************************************//**
+Evaluates an expression. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+	que_node_t*	exp_node);	/*!< in: expression */
+/*****************************************************************//**
+Sets an integer value as the value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+	que_node_t*	node,	/*!< in: expression node */
+	lint		val);	/*!< in: value to set */
+/*****************************************************************//**
+Gets an integer value from an expression node.
+@return	integer value */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+	que_node_t*	node);	/*!< in: expression node */
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node. Allocates a
+new buffer if necessary. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+	que_node_t*	node,	/*!< in: query graph node */
+	const byte*	str,	/*!< in: binary string */
+	ulint		len);	/*!< in: string length or UNIV_SQL_NULL */
+/*****************************************************************//**
+Copies a query node value to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+	que_node_t*	node1,	/*!< in: node to copy to */
+	que_node_t*	node2);	/*!< in: node to copy from */
+/*****************************************************************//**
+Gets an ibool value from a query node.
+@return	ibool value */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/*****************************************************************//**
+Evaluates a comparison node.
+@return	the result of the comparison */
+UNIV_INTERN
+ibool
+eval_cmp(
+/*=====*/
+	func_node_t*	cmp_node);	/*!< in: comparison node */
+
+
+#ifndef UNIV_NONINL
+#include "eval0eval.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/eval0eval.ic b/storage/xtradb/include/eval0eval.ic
new file mode 100644
index 00000000000..fe767f39b00
--- /dev/null
+++ b/storage/xtradb/include/eval0eval.ic
@@ -0,0 +1,251 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0eval.ic
+SQL evaluator: evaluates simple data structures, like expressions, in
+a query graph
+
+Created 12/29/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "pars0grm.h"
+
+/*****************************************************************//**
+Evaluates a function node. */
+UNIV_INTERN
+void
+eval_func(
+/*======*/
+	func_node_t*	func_node);	/*!< in: function node */
+/*****************************************************************//**
+Allocate a buffer from global dynamic memory for a value of a que_node.
+NOTE that this memory must be explicitly freed when the query graph is
+freed. If the node already has allocated buffer, that buffer is freed
+here. NOTE that this is the only function where dynamic memory should be
+allocated for a query node val field.
+@return	pointer to allocated buffer */
+UNIV_INTERN
+byte*
+eval_node_alloc_val_buf(
+/*====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size);	/*!< in: buffer size */
+
+
+/*****************************************************************//**
+Sets the value length of a node to size and makes sure it has a buffer.
+@return	pointer to buffer, newly allocated if the old one was unusable */
+UNIV_INLINE
+byte*
+eval_node_ensure_val_buf(
+/*=====================*/
+	que_node_t*	node,	/*!< in: query graph node; sets the val field
+				data field to point to the new buffer, and
+				len field equal to size */
+	ulint		size)	/*!< in: buffer size */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	dfield = que_node_get_val(node);
+	dfield_set_len(dfield, size);
+
+	data = dfield_get_data(dfield);
+
+	if (!data || que_node_get_val_buf_size(node) < size) {
+		/* No buffer yet, or the recorded buffer size is too small */
+		data = eval_node_alloc_val_buf(node, size);
+	}
+
+	return(data);
+}
+
+/*****************************************************************//**
+Evaluates a symbol table symbol; does nothing unless it has an indirection. */
+UNIV_INLINE
+void
+eval_sym(
+/*=====*/
+	sym_node_t*	sym_node)	/*!< in: symbol table node */
+{
+	/* Assert that the node really is a symbol node */
+	ut_ad(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+	if (sym_node->indirection) {
+		/* The symbol table node is an alias for a variable or a
+		column: dfield_copy_data() refreshes its value field from
+		the value field of the aliased node */
+		dfield_copy_data(que_node_get_val(sym_node),
+				 que_node_get_val(sym_node->indirection));
+	}
+}
+
+/*****************************************************************//**
+Evaluates an expression node by dispatching on its node type. */
+UNIV_INLINE
+void
+eval_exp(
+/*=====*/
+	que_node_t*	exp_node)	/*!< in: expression */
+{
+	if (que_node_get_type(exp_node) == QUE_NODE_SYMBOL) {
+		/* Symbol nodes are evaluated inline by eval_sym() */
+		eval_sym((sym_node_t*)exp_node);
+
+		return;
+	}
+	/* Every other node type is handed to eval_func() */
+	eval_func(exp_node);
+}
+
+/*****************************************************************//**
+Sets an integer value as the 4-byte value of an expression node. */
+UNIV_INLINE
+void
+eval_node_set_int_val(
+/*==================*/
+	que_node_t*	node,	/*!< in: expression node */
+	lint		val)	/*!< in: value to set */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+	/* No value buffer yet: allocate one of 4 bytes */
+	if (data == NULL) {
+		data = eval_node_alloc_val_buf(node, 4);
+	}
+
+	ut_ad(dfield_get_len(dfield) == 4);
+	/* Store val, cast to ulint, through mach_write_to_4() */
+	mach_write_to_4(data, (ulint)val);
+}
+
+/*****************************************************************//**
+Gets an integer non-SQL null value from an expression node.
+@return	integer value: the result of mach_read_from_4() cast to int */
+UNIV_INLINE
+lint
+eval_node_get_int_val(
+/*==================*/
+	que_node_t*	node)	/*!< in: expression node */
+{
+	dfield_t*	dfield;
+
+	dfield = que_node_get_val(node);
+	/* The stored value is expected to be exactly 4 bytes long */
+	ut_ad(dfield_get_len(dfield) == 4);
+	/* NOTE(review): the (int) cast presumably restores the sign - confirm */
+	return((int)mach_read_from_4(dfield_get_data(dfield)));
+}
+
+/*****************************************************************//**
+Gets an ibool value from a query node.
+@return	ibool value, read from the value buffer with mach_read_from_1() */
+UNIV_INLINE
+ibool
+eval_node_get_ibool_val(
+/*====================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	dfield = que_node_get_val(node);
+
+	data = dfield_get_data(dfield);
+	/* The node must already have a value buffer */
+	ut_ad(data != NULL);
+
+	return(mach_read_from_1(data));
+}
+
+/*****************************************************************//**
+Sets an ibool value as the 1-byte value of a function node. */
+UNIV_INLINE
+void
+eval_node_set_ibool_val(
+/*====================*/
+	func_node_t*	func_node,	/*!< in: function node */
+	ibool		val)		/*!< in: value to set */
+{
+	dfield_t*	dfield;
+	byte*		data;
+
+	dfield = que_node_get_val(func_node);
+
+	data = dfield_get_data(dfield);
+
+	if (data == NULL) {
+		/* No value buffer yet: allocate 1 byte to hold the value */
+
+		data = eval_node_alloc_val_buf(func_node, 1);
+	}
+
+	ut_ad(dfield_get_len(dfield) == 1);
+	/* Store val through mach_write_to_1() */
+	mach_write_to_1(data, val);
+}
+
+/*****************************************************************//**
+Copies a binary string value as the value of a query graph node, allocating
+a new buffer if necessary. For UNIV_SQL_NULL only the length is set. */
+UNIV_INLINE
+void
+eval_node_copy_and_alloc_val(
+/*=========================*/
+	que_node_t*	node,	/*!< in: query graph node */
+	const byte*	str,	/*!< in: binary string */
+	ulint		len)	/*!< in: string length or UNIV_SQL_NULL */
+{
+	byte*		data;
+
+	if (len == UNIV_SQL_NULL) {
+		dfield_set_len(que_node_get_val(node), len);
+		/* SQL NULL: str is not read and no buffer is touched */
+		return;
+	}
+	/* Sets the value length to len and gets a buffer for len bytes */
+	data = eval_node_ensure_val_buf(node, len);
+
+	ut_memcpy(data, str, len);
+}
+
+/*****************************************************************//**
+Copies the value (data and length) of one query node to another node. */
+UNIV_INLINE
+void
+eval_node_copy_val(
+/*===============*/
+	que_node_t*	node1,	/*!< in: node to copy to */
+	que_node_t*	node2)	/*!< in: node to copy from */
+{
+	dfield_t*	dfield2;
+
+	dfield2 = que_node_get_val(node2);
+	/* Hand the source data and length on; the callee handles SQL NULL */
+	eval_node_copy_and_alloc_val(node1, dfield_get_data(dfield2),
+				     dfield_get_len(dfield2));
+}
diff --git a/storage/xtradb/include/eval0proc.h b/storage/xtradb/include/eval0proc.h
new file mode 100644
index 00000000000..13e2e365320
--- /dev/null
+++ b/storage/xtradb/include/eval0proc.h
@@ -0,0 +1,104 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.h
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef eval0proc_h
+#define eval0proc_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return	query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an if-statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+if_step(
+/*====*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a while-statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+while_step(
+/*=======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a for-loop node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+for_step(
+/*=====*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an assignment statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+assign_step(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return	query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an exit statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+exit_step(
+/*======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of a return-statement node.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+return_step(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "eval0proc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/eval0proc.ic b/storage/xtradb/include/eval0proc.ic
new file mode 100644
index 00000000000..c602af0a694
--- /dev/null
+++ b/storage/xtradb/include/eval0proc.ic
@@ -0,0 +1,88 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/eval0proc.ic
+Executes SQL stored procedures and their control structures
+
+Created 1/20/1998 Heikki Tuuri
+*******************************************************/
+
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+
+/**********************************************************************//**
+Performs an execution step of a procedure node.
+@return	query thread to run next; this implementation always returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	proc_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_PROC);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		/* Entered from the parent node: start execution from the
+		first statement in the statement list */
+
+		thr->run_node = node->stat_list;
+	} else {
+		/* Came back from a statement, asserted to have no successor */
+		ut_ad(que_node_get_next(thr->prev_node) == NULL);
+		/* NULL makes the check below hand control to the parent */
+		thr->run_node = NULL;
+	}
+	/* Empty statement list or list finished: continue in the parent */
+	if (thr->run_node == NULL) {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs an execution step of a procedure call node.
+@return	query thread to run next; this implementation always returns thr */
+UNIV_INLINE
+que_thr_t*
+proc_eval_step(
+/*===========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	func_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	/* Evaluate the procedure: the run node is asserted to be a function
+	node, so eval_exp() passes it on to eval_func() */
+	eval_exp(node);
+	/* The call is evaluated in this one step: continue in the parent */
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
new file mode 100644
index 00000000000..07c80ef8609
--- /dev/null
+++ b/storage/xtradb/include/fil0fil.h
@@ -0,0 +1,766 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fil0fil.h
+The low-level file system
+
+Created 10/25/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fil0fil_h
+#define fil0fil_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "dict0types.h"
+#include "ut0byte.h"
+#include "os0file.h"
+
+/** When mysqld is run, the default directory "." is the mysqld datadir,
+but in the MySQL Embedded Server Library and ibbackup it is not the default
+directory, and we must set the base file path explicitly */
+extern const char*	fil_path_to_mysql_datadir;
+
+/** Initial size of a single-table tablespace in pages */
+#define FIL_IBD_FILE_INITIAL_SIZE	4
+
+/** 'null' (undefined) page offset in the context of file spaces */
+#define	FIL_NULL	ULINT32_UNDEFINED
+
+/* Space address data type; this is intended to be used when
+addresses accurate to a byte are stored in file pages. If the page part
+of the address is FIL_NULL, the address is considered undefined. */
+
+typedef	byte	fil_faddr_t;	/*!< 'type' definition in C: an address
+				stored in a file page is a string of bytes */
+#define FIL_ADDR_PAGE	0	/* first in address is the page offset */
+#define	FIL_ADDR_BYTE	4	/* then comes 2-byte byte offset within page*/
+
+#define	FIL_ADDR_SIZE	6	/* address size is 6 bytes */
+
+/** A struct for storing a space address FIL_ADDR, when it is used
+in C program data structures. */
+
+typedef struct fil_addr_struct	fil_addr_t;
+/** File space address */
+struct fil_addr_struct{
+	ulint	page;		/*!< page number within a space */
+	ulint	boffset;	/*!< byte offset within the page */
+};
+
+/** The null file address */
+extern fil_addr_t	fil_addr_null;
+
+/** The byte offsets on a file page for various variables @{ */
+#define FIL_PAGE_SPACE_OR_CHKSUM 0	/*!< in < MySQL-4.0.14 space id the
+					page belongs to (== 0) but in later
+					versions the 'new' checksum of the
+					page */
+#define FIL_PAGE_OFFSET		4	/*!< page offset inside space */
+#define FIL_PAGE_PREV		8	/*!< if there is a 'natural'
+					predecessor of the page, its
+					offset.  Otherwise FIL_NULL.
+					This field is not set on BLOB
+					pages, which are stored as a
+					singly-linked list.  See also
+					FIL_PAGE_NEXT. */
+#define FIL_PAGE_NEXT		12	/*!< if there is a 'natural' successor
+					of the page, its offset.
+					Otherwise FIL_NULL.
+					B-tree index pages
+					(FIL_PAGE_TYPE contains FIL_PAGE_INDEX)
+					on the same PAGE_LEVEL are maintained
+					as a doubly linked list via
+					FIL_PAGE_PREV and FIL_PAGE_NEXT
+					in the collation order of the
+					smallest user record on each page. */
+#define FIL_PAGE_LSN		16	/*!< lsn of the end of the newest
+					modification log record to the page */
+#define	FIL_PAGE_TYPE		24	/*!< file page type: FIL_PAGE_INDEX,...,
+					2 bytes.
+
+					The contents of this field can only
+					be trusted in the following case:
+					if the page is an uncompressed
+					B-tree index page, then it is
+					guaranteed that the value is
+					FIL_PAGE_INDEX.
+					The opposite does not hold.
+
+					In tablespaces created by
+					MySQL/InnoDB 5.1.7 or later, the
+					contents of this field is valid
+					for all uncompressed pages. */
+#define FIL_PAGE_FILE_FLUSH_LSN	26	/*!< this is only defined for the
+					first page in a system tablespace
+					data file (ibdata*, not *.ibd):
+					the file has been flushed to disk
+					at least up to this lsn */
+#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID  34 /*!< starting from 4.1.x this
+					contains the space id of the page */
+#define FIL_PAGE_DATA		38	/*!< start of the data on the page */
+#define FIL_PAGE_DATA_ALIGN_32	40
+/* @} */
+/** File page trailer @{ */
+#define FIL_PAGE_END_LSN_OLD_CHKSUM 8	/*!< the low 4 bytes of this are used
+					to store the page checksum, the
+					last 4 bytes should be identical
+					to the last 4 bytes of FIL_PAGE_LSN */
+#define FIL_PAGE_DATA_END	8	/*!< size of the page trailer */
+/* @} */
+
+/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_INDEX		17855	/*!< B-tree node */
+#define FIL_PAGE_UNDO_LOG	2	/*!< Undo log page */
+#define FIL_PAGE_INODE		3	/*!< Index node */
+#define FIL_PAGE_IBUF_FREE_LIST	4	/*!< Insert buffer free list */
+/* File page types introduced in MySQL/InnoDB 5.1.7 */
+#define FIL_PAGE_TYPE_ALLOCATED	0	/*!< Freshly allocated page */
+#define FIL_PAGE_IBUF_BITMAP	5	/*!< Insert buffer bitmap */
+#define FIL_PAGE_TYPE_SYS	6	/*!< System page */
+#define FIL_PAGE_TYPE_TRX_SYS	7	/*!< Transaction system data */
+#define FIL_PAGE_TYPE_FSP_HDR	8	/*!< File space header */
+#define FIL_PAGE_TYPE_XDES	9	/*!< Extent descriptor page */
+#define FIL_PAGE_TYPE_BLOB	10	/*!< Uncompressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB	11	/*!< First compressed BLOB page */
+#define FIL_PAGE_TYPE_ZBLOB2	12	/*!< Subsequent compressed BLOB page */
+/* @} */
+
+/** Space types @{ */
+#define FIL_TABLESPACE		501	/*!< tablespace */
+#define FIL_LOG			502	/*!< redo log */
+/* @} */
+
+/** The number of fsyncs done to the log */
+extern ulint	fil_n_log_flushes;
+
+/** Number of pending redo log flushes */
+extern ulint	fil_n_pending_log_flushes;
+/** Number of pending tablespace flushes */
+extern ulint	fil_n_pending_tablespace_flushes;
+
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns the version number of a tablespace, -1 if not found.
+@return version number, -1 if the tablespace does not exist in the
+memory cache */
+UNIV_INTERN
+ib_int64_t
+fil_space_get_version(
+/*==================*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
+Returns the latch of a file space.
+@return	latch protecting storage allocation */
+UNIV_INTERN
+rw_lock_t*
+fil_space_get_latch(
+/*================*/
+	ulint	id,	/*!< in: space id */
+	ulint*	zip_size);/*!< out: compressed page size, or
+			0 for uncompressed tablespaces */
+/*******************************************************************//**
+Returns the type of a file space.
+@return	FIL_TABLESPACE or FIL_LOG */
+UNIV_INTERN
+ulint
+fil_space_get_type(
+/*===============*/
+	ulint	id);	/*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Appends a new file to the chain of files of a space. File must be closed. */
+UNIV_INTERN
+void
+fil_node_create(
+/*============*/
+	const char*	name,	/*!< in: file name (file must be closed) */
+	ulint		size,	/*!< in: file size in database blocks, rounded
+				downwards to an integer */
+	ulint		id,	/*!< in: space id where to append */
+	ibool		is_raw);/*!< in: TRUE if a raw device or
+				a raw disk partition */
+#ifdef UNIV_LOG_ARCHIVE
+/****************************************************************//**
+Drops files from the start of a file space, so that its size is cut by
+the amount given. */
+UNIV_INTERN
+void
+fil_space_truncate_start(
+/*=====================*/
+	ulint	id,		/*!< in: space id */
+	ulint	trunc_len);	/*!< in: truncate by this much; it is an error
+				if this does not equal the combined size of
+				some initial files in the space */
+#endif /* UNIV_LOG_ARCHIVE */
+/*******************************************************************//**
+Creates a space memory object and puts it to the 'fil system' hash table. If
+there is an error, prints an error message to the .err log.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_space_create(
+/*=============*/
+	const char*	name,	/*!< in: space name */
+	ulint		id,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size, or
+				0 for uncompressed tablespaces */
+	ulint		purpose);/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
+/*******************************************************************//**
+Assigns a new space id for a new single-table tablespace. This works simply by
+incrementing the global counter. If 4 billion ids are not enough, we may need
+to recycle id's.
+@return	TRUE if assigned, FALSE if not */
+UNIV_INTERN
+ibool
+fil_assign_new_space_id(
+/*====================*/
+	ulint*	space_id);	/*!< in/out: space id */
+/*******************************************************************//**
+Returns the size of the space in pages. The tablespace must be cached in the
+memory cache.
+@return	space size, 0 if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_size(
+/*===============*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
+Returns the flags of the space. The tablespace must be cached
+in the memory cache.
+@return	flags, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_flags(
+/*================*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
+Returns the compressed page size of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return	compressed page size, ULINT_UNDEFINED if space not found */
+UNIV_INTERN
+ulint
+fil_space_get_zip_size(
+/*===================*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
+Checks if the pair space, page_no refers to an existing page in a tablespace
+file space. The tablespace must be cached in the memory cache.
+@return	TRUE if the address is meaningful */
+UNIV_INTERN
+ibool
+fil_check_adress_in_tablespace(
+/*===========================*/
+	ulint	id,	/*!< in: space id */
+	ulint	page_no);/*!< in: page number */
+/****************************************************************//**
+Initializes the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_init(
+/*=====*/
+	ulint	hash_size,	/*!< in: hash table size */
+	ulint	max_n_open);	/*!< in: max number of open files */
+/*******************************************************************//**
+Frees the tablespace memory cache. */
+UNIV_INTERN
+void
+fil_close(void);
+/*===========*/
+/*******************************************************************//**
+Opens all log files and system tablespace data files. They stay open until the
+database server shutdown. This should be called at a server startup after the
+space objects for the log and the system tablespace have been created. The
+purpose of this operation is to make sure we never run out of file descriptors
+if we need to read from the insert buffer or to write to the log. */
+UNIV_INTERN
+void
+fil_open_log_and_system_tablespace_files(void);
+/*==========================================*/
+/*******************************************************************//**
+Closes all open files. There must not be any pending i/o's or not flushed
+modifications in the files. */
+UNIV_INTERN
+void
+fil_close_all_files(void);
+/*=====================*/
+/*******************************************************************//**
+Sets the max tablespace id counter if the given number is bigger than the
+previous value. */
+UNIV_INTERN
+void
+fil_set_max_space_id_if_bigger(
+/*===========================*/
+	ulint	max_id);/*!< in: maximum known id */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Writes the flushed lsn and the latest archived log number to the page
+header of the first page of each data file in the system tablespace.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_write_flushed_lsn_to_data_files(
+/*================================*/
+	ib_uint64_t	lsn,		/*!< in: lsn to write */
+	ulint		arch_log_no);	/*!< in: latest archived log
+					file number */
+/*******************************************************************//**
+Reads the flushed lsn and arch no fields from a data file at database
+startup. */
+UNIV_INTERN
+void
+fil_read_flushed_lsn_and_arch_log_no(
+/*=================================*/
+	os_file_t	data_file,		/*!< in: open data file */
+	ibool		one_read_already,	/*!< in: TRUE if min and max
+						parameters below already
+						contain sensible data */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint*		min_arch_log_no,	/*!< in/out: */
+	ulint*		max_arch_log_no,	/*!< in/out: */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t*	min_flushed_lsn,	/*!< in/out: */
+	ib_uint64_t*	max_flushed_lsn);	/*!< in/out: */
+/*******************************************************************//**
+Increments the count of pending insert buffer page merges, if space is not
+being deleted.
+@return	TRUE if being deleted, and ibuf merges should be skipped */
+UNIV_INTERN
+ibool
+fil_inc_pending_ibuf_merges(
+/*========================*/
+	ulint	id);	/*!< in: space id */
+/*******************************************************************//**
+Decrements the count of pending insert buffer page merges. */
+UNIV_INTERN
+void
+fil_decr_pending_ibuf_merges(
+/*=========================*/
+	ulint	id);	/*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Parses the body of a log record written about an .ibd file operation. That is,
+the log record part after the standard (type, space id, page no) header of the
+log record.
+
+If desired, also replays the delete or rename operation if the .ibd file
+exists and the space id in it matches. Replays the create operation if a file
+at that path does not exist yet. If the database directory for the file to be
+created does not exist, then we create the directory, too.
+
+Note that ibbackup --apply-log sets fil_path_to_mysql_datadir to point to the
+datadir that we should use in replaying the file operations.
+@return end of log record, or NULL if the record was not completely
+contained between ptr and end_ptr */
+UNIV_INTERN
+byte*
+fil_op_log_parse_or_replay(
+/*=======================*/
+	byte*	ptr,		/*!< in: buffer containing the log record body,
+				or an initial segment of it, if the record does
+				not fit completely between ptr and end_ptr */
+	byte*	end_ptr,	/*!< in: buffer end */
+	ulint	type,		/*!< in: the type of this log record */
+	ulint	space_id,	/*!< in: the space id of the tablespace in
+				question, or 0 if the log record should
+				only be parsed but not replayed */
+	ulint	log_flags);	/*!< in: redo log flags
+				(stored in the page number parameter) */
+/*******************************************************************//**
+Deletes a single-table tablespace. The tablespace must be cached in the
+memory cache.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_delete_tablespace(
+/*==================*/
+	ulint	id);	/*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Discards a single-table tablespace. The tablespace must be cached in the
+memory cache. Discarding is like deleting a tablespace, but
+1) we do not drop the table from the data dictionary;
+2) we remove all insert buffer entries for the tablespace immediately; in DROP
+TABLE they are only removed gradually in the background;
+3) when the user does IMPORT TABLESPACE, the tablespace will have the same id
+as it originally had.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_discard_tablespace(
+/*===================*/
+	ulint	id);	/*!< in: space id */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Renames a single-table tablespace. The tablespace must be cached in the
+tablespace memory cache.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_rename_tablespace(
+/*==================*/
+	const char*	old_name,	/*!< in: old table name in the standard
+					databasename/tablename format of
+					InnoDB, or NULL if we do the rename
+					based on the space id only */
+	ulint		id,		/*!< in: space id */
+	const char*	new_name);	/*!< in: new table name in the standard
+					databasename/tablename format
+					of InnoDB */
+
+/*******************************************************************//**
+Creates a new single-table tablespace to a database directory of MySQL.
+Database directories are under the 'datadir' of MySQL. The datadir is the
+directory of a running mysqld program. We can refer to it by simply the
+path '.'. Tables created with CREATE TEMPORARY TABLE we place in the temp
+dir of the mysqld server.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+fil_create_new_single_table_tablespace(
+/*===================================*/
+	ulint		space_id,	/*!< in: space id */
+	const char*	tablename,	/*!< in: the table name in the usual
+					databasename/tablename format
+					of InnoDB, or a dir path to a temp
+					table */
+	ibool		is_temp,	/*!< in: TRUE if a table created with
+					CREATE TEMPORARY TABLE */
+	ulint		flags,		/*!< in: tablespace flags */
+	ulint		size);		/*!< in: the initial size of the
+					tablespace file in pages,
+					must be >= FIL_IBD_FILE_INITIAL_SIZE */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to open a single-table tablespace and optionally checks the space id is
+right in it. If does not succeed, prints an error message to the .err log. This
+function is used to open a tablespace when we start up mysqld, and also in
+IMPORT TABLESPACE.
+NOTE that we assume this operation is used either at the database startup
+or under the protection of the dictionary mutex, so that two users cannot
+race here. This operation does not leave the file associated with the
+tablespace open, but closes it after we have looked at the space id in it.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_open_single_table_tablespace(
+/*=============================*/
+	ibool		check_space_id,	/*!< in: should we check that the space
+					id in the file is right; we assume
+					that this function runs much faster
+					if no check is made, since accessing
+					the file inode probably is much
+					faster (the OS caches them) than
+					accessing the first page of the file */
+	ulint		id,		/*!< in: space id */
+	ulint		flags,		/*!< in: tablespace flags */
+	const char*	name);		/*!< in: table name in the
+					databasename/tablename format */
+/********************************************************************//**
+It is possible, though very improbable, that the lsn's in the tablespace to be
+imported have risen above the current system lsn, if a lengthy purge, ibuf
+merge, or rollback was performed on a backup taken with ibbackup. If that is
+the case, reset page lsn's in the file. We assume that mysqld was shut down
+after it performed these cleanup operations on the .ibd file, so that it at
+the shutdown stamped the latest lsn to the FIL_PAGE_FILE_FLUSH_LSN in the
+first page of the .ibd file, and we can determine whether we need to reset the
+lsn's just by looking at that flush lsn.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_reset_too_high_lsns(
+/*====================*/
+	const char*	name,		/*!< in: table name in the
+					databasename/tablename format */
+	ib_uint64_t	current_lsn);	/*!< in: reset lsn's if the lsn stamped
+					to FIL_PAGE_FILE_FLUSH_LSN in the
+					first page is too high */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+At the server startup, if we need crash recovery, scans the database
+directories under the MySQL datadir, looking for .ibd files. Those files are
+single-table tablespaces. We need to know the space id in each of them so that
+we know into which file we should look to check the contents of a page stored
+in the doublewrite buffer, also to know where to apply log records where the
+space id is != 0.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+fil_load_single_table_tablespaces(void);
+/*===================================*/
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace does not exist in the memory cache,
+or is being deleted there.
+@return	TRUE if does not exist or is being deleted */
+UNIV_INTERN
+ibool
+fil_tablespace_deleted_or_being_deleted_in_mem(
+/*===========================================*/
+	ulint		id,	/*!< in: space id */
+	ib_int64_t	version);/*!< in: tablespace_version should be this; if
+				you pass -1 as the value of this, then this
+				parameter is ignored */
+/*******************************************************************//**
+Returns TRUE if a single-table tablespace exists in the memory cache.
+@return	TRUE if exists */
+UNIV_INTERN
+ibool
+fil_tablespace_exists_in_mem(
+/*=========================*/
+	ulint	id);	/*!< in: space id */
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory
+cache. Note that if we have not done a crash recovery at the database startup,
+there may be many tablespaces which are not yet in the memory cache.
+@return	TRUE if a matching tablespace exists in the memory cache */
+UNIV_INTERN
+ibool
+fil_space_for_table_exists_in_mem(
+/*==============================*/
+	ulint		id,		/*!< in: space id */
+	const char*	name,		/*!< in: table name in the standard
+					'databasename/tablename' format or
+					the dir path to a temp table */
+	ibool		is_temp,	/*!< in: TRUE if created with CREATE
+					TEMPORARY TABLE */
+	ibool		mark_space,	/*!< in: in crash recovery, at database
+					startup we mark all spaces which have
+					an associated table in the InnoDB
+					data dictionary, so that
+					we can print a warning about orphaned
+					tablespaces */
+	ibool		print_error_if_does_not_exist);
+					/*!< in: print detailed error
+					information to the .err log if a
+					matching tablespace is not found from
+					memory */
+#else /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Extends all tablespaces to the size stored in the space header. During the
+ibbackup --apply-log phase we extended the spaces on-demand so that log records
+could be applied, but that may have left spaces still too small compared to
+the size stored in the space header. */
+UNIV_INTERN
+void
+fil_extend_tablespaces_to_stored_len(void);
+/*======================================*/
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Tries to extend a data file so that it would accommodate the number of pages
+given. The tablespace must be cached in the memory cache. If the space is big
+enough already, does nothing.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+fil_extend_space_to_desired_size(
+/*=============================*/
+	ulint*	actual_size,	/*!< out: size of the space after extension;
+				if we ran out of disk space this may be lower
+				than the desired size */
+	ulint	space_id,	/*!< in: space id */
+	ulint	size_after_extend);/*!< in: desired size in pages after the
+				extension; if the current space size is bigger
+				than this already, the function does nothing */
+/*******************************************************************//**
+Tries to reserve free extents in a file space.
+@return	TRUE if succeed */
+UNIV_INTERN
+ibool
+fil_space_reserve_free_extents(
+/*===========================*/
+	ulint	id,		/*!< in: space id */
+	ulint	n_free_now,	/*!< in: number of free extents now */
+	ulint	n_to_reserve);	/*!< in: how many one wants to reserve */
+/*******************************************************************//**
+Releases free extents in a file space. */
+UNIV_INTERN
+void
+fil_space_release_free_extents(
+/*===========================*/
+	ulint	id,		/*!< in: space id */
+	ulint	n_reserved);	/*!< in: how many one reserved */
+/*******************************************************************//**
+Gets the number of reserved extents. If the database is silent, this number
+should be zero. */
+UNIV_INTERN
+ulint
+fil_space_get_n_reserved_extents(
+/*=============================*/
+	ulint	id);		/*!< in: space id */
+/********************************************************************//**
+Reads or writes data. This operation is asynchronous (aio).
+@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
+i/o on a tablespace which does not exist */
+#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \
+	_fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL)
+
+UNIV_INTERN
+ulint
+_fil_io(
+/*===*/
+	ulint	type,		/*!< in: OS_FILE_READ or OS_FILE_WRITE,
+				ORed to OS_FILE_LOG, if a log i/o
+				and ORed to OS_AIO_SIMULATED_WAKE_LATER
+				if simulated aio and we want to post a
+				batch of i/os; NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	ibool	sync,		/*!< in: TRUE if synchronous aio is desired */
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in
+				aio this must be divisible by the OS block
+				size */
+	ulint	len,		/*!< in: how many bytes to read or write; this
+				must not cross a file boundary; in aio this
+				must be a block size multiple */
+	void*	buf,		/*!< in/out: buffer where to store read data
+				or from where to write; in aio this must be
+				appropriately aligned */
+	void*	message,	/*!< in: message for aio handler if non-sync
+				aio used, else ignored */
+	trx_t*	trx);
+/********************************************************************//**
+Confirm whether the parameters are valid or not */
+UNIV_INTERN
+ibool
+fil_area_is_exist(
+/*==============*/
+	ulint	space_id,	/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	block_offset,	/*!< in: offset in number of blocks */
+	ulint	byte_offset,	/*!< in: remainder of offset in bytes; in
+				aio this must be divisible by the OS block
+				size */
+	ulint	len);		/*!< in: how many bytes to read or write; this
+				must not cross a file boundary; in aio this
+				must be a block size multiple */
+/**********************************************************************//**
+Waits for an aio operation to complete. This function is used to write the
+handler for completed requests. The aio array of pending requests is divided
+into segments (see os0file.c for more info). The thread specifies which
+segment it wants to wait for. */
+UNIV_INTERN
+void
+fil_aio_wait(
+/*=========*/
+	ulint	segment);	/*!< in: the number of the segment in the aio
+				array to wait for */
+/**********************************************************************//**
+Flushes to disk possible writes cached by the OS. If the space does not exist
+or is being dropped, does not do anything. */
+UNIV_INTERN
+void
+fil_flush(
+/*======*/
+	ulint	space_id);	/*!< in: file space id (this can be a group of
+				log files or a tablespace of the database) */
+/**********************************************************************//**
+Flushes to disk writes in file spaces of the given type possibly cached by
+the OS. */
+UNIV_INTERN
+void
+fil_flush_file_spaces(
+/*==================*/
+	ulint	purpose);	/*!< in: FIL_TABLESPACE, FIL_LOG */
+/******************************************************************//**
+Checks the consistency of the tablespace cache.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fil_validate(void);
+/*==============*/
+/********************************************************************//**
+Returns TRUE if file address is undefined.
+@return	TRUE if undefined */
+UNIV_INTERN
+ibool
+fil_addr_is_null(
+/*=============*/
+	fil_addr_t	addr);	/*!< in: address */
+/********************************************************************//**
+Get the predecessor of a file page.
+@return	FIL_PAGE_PREV */
+UNIV_INTERN
+ulint
+fil_page_get_prev(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+/********************************************************************//**
+Get the successor of a file page.
+@return	FIL_PAGE_NEXT */
+UNIV_INTERN
+ulint
+fil_page_get_next(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+/*********************************************************************//**
+Sets the file page type. */
+UNIV_INTERN
+void
+fil_page_set_type(
+/*==============*/
+	byte*	page,	/*!< in/out: file page */
+	ulint	type);	/*!< in: type */
+/*********************************************************************//**
+Gets the file page type.
+@return type; NOTE that if the type has not been written to page, the
+return value is not defined */
+UNIV_INTERN
+ulint
+fil_page_get_type(
+/*==============*/
+	const byte*	page);	/*!< in: file page */
+
+/*************************************************************************
+Return local hash table information. */
+
+ulint
+fil_system_hash_cells(void);
+/*========================*/
+
+ulint
+fil_system_hash_nodes(void);
+/*========================*/
+
+/*************************************************************************
+Functions to access the is_corrupt flag of fil_space_t. */
+
+ibool
+fil_space_is_corrupt(
+/*=================*/
+	ulint	space_id);
+
+void
+fil_space_set_corrupt(
+/*==================*/
+	ulint	space_id);
+
+typedef	struct fil_space_struct	fil_space_t;
+
+#endif
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
new file mode 100644
index 00000000000..7abd3914eda
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -0,0 +1,359 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.h
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef fsp0fsp_h
+#define fsp0fsp_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "fut0lst.h"
+#include "ut0byte.h"
+#include "page0types.h"
+#include "fsp0types.h"
+
+/**********************************************************************//**
+Initializes the file space system. */
+UNIV_INTERN
+void
+fsp_init(void);
+/*==========*/
+/**********************************************************************//**
+Gets the current free limit of the system tablespace.  The free limit
+means the place of the first page which has never been put to the
+free list for allocation.  The space above that address is initialized
+to zero.  Sets also the global variable log_fsp_current_free_limit.
+@return	free limit in megabytes */
+UNIV_INTERN
+ulint
+fsp_header_get_free_limit(void);
+/*===========================*/
+/**********************************************************************//**
+Gets the size of the system tablespace from the tablespace header.  If
+we do not have an auto-extending data file, this should be equal to
+the size of the data files.  If there is an auto-extending data file,
+this can be smaller.
+@return	size in pages */
+UNIV_INTERN
+ulint
+fsp_header_get_tablespace_size(void);
+/*================================*/
+/**********************************************************************//**
+Reads the file space size stored in the header page.
+@return	tablespace size stored in the space header */
+UNIV_INTERN
+ulint
+fsp_get_size_low(
+/*=============*/
+	page_t*	page);	/*!< in: header page (page 0 in the tablespace) */
+/**********************************************************************//**
+Reads the space id from the first page of a tablespace.
+@return	space id, ULINT_UNDEFINED if error */
+UNIV_INTERN
+ulint
+fsp_header_get_space_id(
+/*====================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the space flags from the first page of a tablespace.
+@return	flags */
+UNIV_INTERN
+ulint
+fsp_header_get_flags(
+/*=================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Reads the compressed page size from the first page of a tablespace.
+@return	compressed page size in bytes, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_zip_size(
+/*====================*/
+	const page_t*	page);	/*!< in: first page of a tablespace */
+/**********************************************************************//**
+Writes the space id and compressed page size to a tablespace header.
+This function is used past the buffer pool when we in fil0fil.c create
+a new single-table tablespace. */
+UNIV_INTERN
+void
+fsp_header_init_fields(
+/*===================*/
+	page_t*	page,		/*!< in/out: first page in the space */
+	ulint	space_id,	/*!< in: space id */
+	ulint	flags);		/*!< in: tablespace flags (FSP_SPACE_FLAGS):
+				0, or table->flags if newer than COMPACT */
+/**********************************************************************//**
+Initializes the space header of a newly created space and also creates the
+insert buffer tree root if space == 0. */
+UNIV_INTERN
+void
+fsp_header_init(
+/*============*/
+	ulint	space,		/*!< in: space id */
+	ulint	size,		/*!< in: current size in blocks */
+	mtr_t*	mtr);		/*!< in: mini-transaction handle */
+/**********************************************************************//**
+Increases the space size field of a space. */
+UNIV_INTERN
+void
+fsp_header_inc_size(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	size_inc,/*!< in: size increment in pages */
+	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create(
+/*========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	mtr_t*	mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Creates a new segment.
+@return the block where the segment header is placed, x-latched, NULL
+if could not create segment because of lack of space */
+UNIV_INTERN
+buf_block_t*
+fseg_create_general(
+/*================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page,	/*!< in: page where the segment header is placed: if
+			this is != 0, the page must belong to another segment,
+			if this is 0, a new page will be allocated and it
+			will belong to the created segment */
+	ulint	byte_offset, /*!< in: byte offset of the created segment header
+			on the page */
+	ibool	has_done_reservation, /*!< in: TRUE if the caller has already
+			done the reservation for the pages with
+			fsp_reserve_free_extents (at least 2 extents: one for
+			the inode and the other for the segment) then there is
+			no need to do the check for this individual
+			operation */
+	mtr_t*	mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Calculates the number of pages reserved by a segment, and how many pages are
+currently used.
+@return	number of reserved pages */
+UNIV_INTERN
+ulint
+fseg_n_reserved_pages(
+/*==================*/
+	fseg_header_t*	header,	/*!< in: segment header */
+	ulint*		used,	/*!< out: number of pages used (<= reserved) */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize
+file space fragmentation.
+@return	the allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page(
+/*=================*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction, /*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/**********************************************************************//**
+Allocates a single free page from a segment. This function implements
+the intelligent allocation strategy which tries to minimize file space
+fragmentation.
+@return	allocated page offset, FIL_NULL if no page could be allocated */
+UNIV_INTERN
+ulint
+fseg_alloc_free_page_general(
+/*=========================*/
+	fseg_header_t*	seg_header,/*!< in: segment header */
+	ulint		hint,	/*!< in: hint of which page would be desirable */
+	byte		direction,/*!< in: if the new page is needed because
+				of an index page split, and records are
+				inserted there in order, into which
+				direction they go alphabetically: FSP_DOWN,
+				FSP_UP, FSP_NO_DIR */
+	ibool		has_done_reservation, /*!< in: TRUE if the caller has
+				already done the reservation for the page
+				with fsp_reserve_free_extents, then there
+				is no need to do the check for this individual
+				page */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/**********************************************************************//**
+Reserves free pages from a tablespace. All mini-transactions which may
+use several pages from the tablespace should call this function beforehand
+and reserve enough free extents so that they certainly will be able
+to do their operation, like a B-tree page split, fully. Reservations
+must be released with function fil_space_release_free_extents!
+
+The alloc_type below has the following meaning: FSP_NORMAL means an
+operation which will probably result in more space usage, like an
+insert in a B-tree; FSP_UNDO means allocation to undo logs: if we are
+deleting rows, then this allocation will in the long run result in
+less space usage (after a purge); FSP_CLEANING means allocation done
+in a physical record delete (like in a purge) or other cleaning operation
+which will result in less space usage in the long run. We prefer the latter
+two types of allocation: when space is scarce, FSP_NORMAL allocations
+will not succeed, but the latter two allocations will succeed, if possible.
+The purpose is to avoid dead end where the database is full but the
+user cannot free any space because these freeing operations temporarily
+reserve some space.
+
+Single-table tablespaces whose size is < 32 pages are a special case. In this
+function we would liberally reserve several 64 page extents for every page
+split or merge in a B-tree. But we do not want to waste disk space if the table
+only occupies < 32 pages. That is why we apply different rules in that special
+case, just ensuring that there are 3 free pages available.
+@return	TRUE if we were able to make the reservation */
+UNIV_INTERN
+ibool
+fsp_reserve_free_extents(
+/*=====================*/
+	ulint*	n_reserved,/*!< out: number of extents actually reserved; if we
+			return TRUE and the tablespace size is < 64 pages,
+			then this can be 0, otherwise it is n_ext */
+	ulint	space,	/*!< in: space id */
+	ulint	n_ext,	/*!< in: number of extents to reserve */
+	ulint	alloc_type,/*!< in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */
+	mtr_t*	mtr);	/*!< in: mtr */
+/**********************************************************************//**
+This function should be used to get information on how much we still
+will be able to insert new data to the database without running out of the
+tablespace. Only free extents are taken into account and we also subtract
+the safety margin required by the above function fsp_reserve_free_extents.
+@return	available space in kB */
+UNIV_INTERN
+ullint
+fsp_get_available_space_in_free_extents(
+/*====================================*/
+	ulint	space);	/*!< in: space id */
+/**********************************************************************//**
+Frees a single page of a segment. */
+UNIV_INTERN
+void
+fseg_free_page(
+/*===========*/
+	fseg_header_t*	seg_header, /*!< in: segment header */
+	ulint		space,	/*!< in: space id */
+	ulint		page,	/*!< in: page offset */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+/**********************************************************************//**
+Frees part of a segment. This function can be used to free a segment
+by repeatedly calling this function in different mini-transactions.
+Doing the freeing in a single mini-transaction might result in
+too big a mini-transaction.
+@return	TRUE if freeing completed */
+UNIV_INTERN
+ibool
+fseg_free_step(
+/*===========*/
+	fseg_header_t*	header,	/*!< in, own: segment header; NOTE: if the header
+				resides on the first page of the frag list
+				of the segment, this pointer becomes obsolete
+				after the last freeing step */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+Frees part of a segment. Differs from fseg_free_step because this function
+leaves the header page unfreed.
+@return	TRUE if freeing completed, except the header page */
+UNIV_INTERN
+ibool
+fseg_free_step_not_header(
+/*======================*/
+	fseg_header_t*	header,	/*!< in: segment header which must reside on
+				the first fragment page of the segment */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***********************************************************************//**
+Checks if a page address is an extent descriptor page address.
+@return	TRUE if a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no);/*!< in: page number */
+/***********************************************************//**
+Parses a redo log record of a file page init.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+fsp_parse_init_file_page(
+/*=====================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr, /*!< in: buffer end */
+	buf_block_t*	block);	/*!< in: block or NULL */
+/*******************************************************************//**
+Validates the file space system and its segments.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fsp_validate(
+/*=========*/
+	ulint	space);	/*!< in: space id */
+/*******************************************************************//**
+Prints info of a file space. */
+UNIV_INTERN
+void
+fsp_print(
+/*======*/
+	ulint	space);	/*!< in: space id */
+#ifdef UNIV_DEBUG
+/*******************************************************************//**
+Validates a segment.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+fseg_validate(
+/*==========*/
+	fseg_header_t*	header, /*!< in: segment header */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_BTR_PRINT
+/*******************************************************************//**
+Writes info of a segment. */
+UNIV_INTERN
+void
+fseg_print(
+/*=======*/
+	fseg_header_t*	header, /*!< in: segment header */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* UNIV_BTR_PRINT */
+
+#ifndef UNIV_NONINL
+#include "fsp0fsp.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic
new file mode 100644
index 00000000000..434c370b527
--- /dev/null
+++ b/storage/xtradb/include/fsp0fsp.ic
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0fsp.ic
+File space management
+
+Created 12/18/1995 Heikki Tuuri
+*******************************************************/
+
+/***********************************************************************//**
+Tests whether a page number, masked with (page size - 1), equals
+FSP_XDES_OFFSET, which marks the page as an extent descriptor page.
+@return	TRUE if page_no is a descriptor page */
+UNIV_INLINE
+ibool
+fsp_descr_page(
+/*===========*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no)/*!< in: page number */
+{
+	ulint	mask_base;	/* zip_size, or UNIV_PAGE_SIZE if it is 0 */
+
+	ut_ad(ut_is_2pow(zip_size));
+
+	mask_base = zip_size ? zip_size : UNIV_PAGE_SIZE;
+
+	return(UNIV_UNLIKELY((page_no & (mask_base - 1)) == FSP_XDES_OFFSET));
+}
diff --git a/storage/xtradb/include/fsp0types.h b/storage/xtradb/include/fsp0types.h
new file mode 100644
index 00000000000..2dd2deca671
--- /dev/null
+++ b/storage/xtradb/include/fsp0types.h
@@ -0,0 +1,110 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/fsp0types.h
+File space management types
+
+Created May 26, 2009 Vasil Dimov
+*******************************************************/
+
+#ifndef fsp0types_h
+#define fsp0types_h
+
+#include "univ.i"
+
+#include "fil0fil.h" /* for FIL_PAGE_DATA */
+
+/** @name Flags for inserting records in order
+If records are inserted in order, there are the following
+flags to tell this (their type is made byte for the compiler
+to warn if direction and hint parameters are switched in
+fseg_alloc_free_page) */
+/* @{ */
+#define	FSP_UP		((byte)111)	/*!< alphabetically upwards */
+#define	FSP_DOWN	((byte)112)	/*!< alphabetically downwards */
+#define	FSP_NO_DIR	((byte)113)	/*!< no order */
+/* @} */
+
+/** File space extent size (one megabyte) in pages */
+#define	FSP_EXTENT_SIZE		(1u << (20 - UNIV_PAGE_SIZE_SHIFT))
+
+/** On a page of any file segment, data may be put starting from this
+offset */
+#define FSEG_PAGE_DATA		FIL_PAGE_DATA
+
+/** @name File segment header
+The file segment header points to the inode describing the file segment. */
+/* @{ */
+/** Data type for file segment header */
+typedef	byte	fseg_header_t;
+
+#define FSEG_HDR_SPACE		0	/*!< space id of the inode */
+#define FSEG_HDR_PAGE_NO	4	/*!< page number of the inode */
+#define FSEG_HDR_OFFSET		8	/*!< byte offset of the inode */
+
+#define FSEG_HEADER_SIZE	10	/*!< Length of the file segment
+					header, in bytes */
+/* @} */
+
+/** Flags for fsp_reserve_free_extents @{ */
+#define FSP_NORMAL	1000000
+#define	FSP_UNDO	2000000
+#define FSP_CLEANING	3000000
+/* @} */
+
+/* Number of pages described in a single descriptor page: currently each page
+description takes less than 1 byte; a descriptor page is repeated every
+this many file pages */
+/* #define XDES_DESCRIBED_PER_PAGE		UNIV_PAGE_SIZE */
+/* This has been replaced with either UNIV_PAGE_SIZE or page_zip->size. */
+
+/** @name The space low address page map
+The pages at FSP_XDES_OFFSET and FSP_IBUF_BITMAP_OFFSET are repeated
+every XDES_DESCRIBED_PER_PAGE pages in every tablespace. */
+/* @{ */
+/*--------------------------------------*/
+#define FSP_XDES_OFFSET			0	/*!< extent descriptor */
+#define FSP_IBUF_BITMAP_OFFSET		1	/*!< insert buffer bitmap */
+				/* The ibuf bitmap pages are the ones whose
+				page number is the number above plus a
+				multiple of XDES_DESCRIBED_PER_PAGE */
+
+#define FSP_FIRST_INODE_PAGE_NO		2	/*!< in every tablespace */
+				/* The following pages exist
+				in the system tablespace (space 0). */
+#define FSP_IBUF_HEADER_PAGE_NO		3	/*!< insert buffer
+						header page, in
+						tablespace 0 */
+#define FSP_IBUF_TREE_ROOT_PAGE_NO	4	/*!< insert buffer
+						B-tree root page in
+						tablespace 0 */
+				/* The ibuf tree root page number in
+				tablespace 0; its fseg inode is on the page
+				number FSP_FIRST_INODE_PAGE_NO */
+#define FSP_TRX_SYS_PAGE_NO		5	/*!< transaction
+						system header, in
+						tablespace 0 */
+#define	FSP_FIRST_RSEG_PAGE_NO		6	/*!< first rollback segment
+						page, in tablespace 0 */
+#define FSP_DICT_HDR_PAGE_NO		7	/*!< data dictionary header
+						page, in tablespace 0 */
+/*--------------------------------------*/
+/* @} */
+
+#endif /* fsp0types_h */
diff --git a/storage/xtradb/include/fut0fut.h b/storage/xtradb/include/fut0fut.h
new file mode 100644
index 00000000000..dce20b3bad6
--- /dev/null
+++ b/storage/xtradb/include/fut0fut.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.h
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+
+#ifndef fut0fut_h
+#define fut0fut_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame, with the file page bufferfixed and
+latched; NULL if srv_pass_corrupt_table is set and no block was obtained */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fil_addr_t	addr,	/*!< in: file address */
+	ulint		rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+	mtr_t*		mtr);	/*!< in: mtr handle */
+
+#ifndef UNIV_NONINL
+#include "fut0fut.ic"
+#endif
+
+#endif
+
diff --git a/storage/xtradb/include/fut0fut.ic b/storage/xtradb/include/fut0fut.ic
new file mode 100644
index 00000000000..529f2a516d3
--- /dev/null
+++ b/storage/xtradb/include/fut0fut.ic
@@ -0,0 +1,63 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0fut.ic
+File-based utilities
+
+Created 12/13/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "buf0buf.h"
+
+/********************************************************************//**
+Gets a pointer to a file address and latches the page.
+@return pointer to a byte in a frame, with the file page bufferfixed and
+latched; NULL if srv_pass_corrupt_table is set and no block was obtained */
+UNIV_INLINE
+byte*
+fut_get_ptr(
+/*========*/
+	ulint		space,	/*!< in: space id */
+	ulint		zip_size,/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	fil_addr_t	addr,	/*!< in: file address */
+	ulint		rw_latch, /*!< in: RW_S_LATCH, RW_X_LATCH */
+	mtr_t*		mtr)	/*!< in: mtr handle */
+{
+	buf_block_t*	block;
+	byte*		ptr;
+
+	ut_ad(addr.boffset < UNIV_PAGE_SIZE);
+	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+
+	block = buf_page_get(space, zip_size, addr.page, rw_latch, mtr);
+
+	if (srv_pass_corrupt_table && !block) {
+		return(NULL);	/* tolerated failure: caller must check */
+	}
+	ut_a(block);
+
+	ptr = buf_block_get_frame(block) + addr.boffset;
+
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	return(ptr);
+}
diff --git a/storage/xtradb/include/fut0lst.h b/storage/xtradb/include/fut0lst.h
new file mode 100644
index 00000000000..fe024c2498f
--- /dev/null
+++ b/storage/xtradb/include/fut0lst.h
@@ -0,0 +1,217 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.h
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef fut0lst_h
+#define fut0lst_h
+
+#include "univ.i"
+
+#include "fil0fil.h"
+#include "mtr0mtr.h"
+
+
+/* The C 'types' of base node and list node: these should be used to
+write self-documenting code. Of course, the sizeof operator cannot be
+applied to these types! */
+
+typedef	byte	flst_base_node_t;
+typedef	byte	flst_node_t;
+
+/* The physical size of a list base node in bytes */
+#define	FLST_BASE_NODE_SIZE	(4 + 2 * FIL_ADDR_SIZE)
+
+/* The physical size of a list node in bytes */
+#define	FLST_NODE_SIZE		(2 * FIL_ADDR_SIZE)
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Initializes a list base node. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the last node in a list. */
+UNIV_INTERN
+void
+flst_add_last(
+/*==========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node,	/*!< in: node to add */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Adds a node as the first node in a list. */
+UNIV_INTERN
+void
+flst_add_first(
+/*===========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node,	/*!< in: node to add */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node after another in a list. */
+UNIV_INTERN
+void
+flst_insert_after(
+/*==============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node1,	/*!< in: node to insert after */
+	flst_node_t*		node2,	/*!< in: node to add */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Inserts a node before another in a list. */
+UNIV_INTERN
+void
+flst_insert_before(
+/*===============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: node to insert */
+	flst_node_t*		node3,	/*!< in: node to insert before */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Removes a node. */
+UNIV_INTERN
+void
+flst_remove(
+/*========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: node to remove */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, including the node given. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_cut_end(
+/*=========*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: first node to remove */
+	ulint			n_nodes,/*!< in: number of nodes to remove,
+					must be >= 1 */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Cuts off the tail of the list, not including the given node. The number of
+nodes which will be removed must be provided by the caller, as this function
+does not measure the length of the tail. */
+UNIV_INTERN
+void
+flst_truncate_end(
+/*==============*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	flst_node_t*		node2,	/*!< in: first node not to remove */
+	ulint			n_nodes,/*!< in: number of nodes to remove */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list length.
+@return	length */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list first node address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list last node address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list next node address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+	const flst_node_t*	node,	/*!< in: pointer to node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Gets list prev node address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+	const flst_node_t*	node,	/*!< in: pointer to node */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Writes a file address. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+	fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
+	fil_addr_t	addr,	/*!< in: file address */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Reads a file address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+	const fil_faddr_t*	faddr,	/*!< in: pointer to file faddress */
+	mtr_t*			mtr);	/*!< in: mini-transaction handle */
+/********************************************************************//**
+Validates a file-based list.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+flst_validate(
+/*==========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	mtr_t*			mtr1);	/*!< in: mtr */
+/********************************************************************//**
+Prints info of a file-based list. */
+UNIV_INTERN
+void
+flst_print(
+/*=======*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node of list */
+	mtr_t*			mtr);	/*!< in: mtr */
+
+
+#ifndef UNIV_NONINL
+#include "fut0lst.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/fut0lst.ic b/storage/xtradb/include/fut0lst.ic
new file mode 100644
index 00000000000..dcd13c61871
--- /dev/null
+++ b/storage/xtradb/include/fut0lst.ic
@@ -0,0 +1,167 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fut0lst.ic
+File-based list utilities
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "fut0fut.h"
+#include "mtr0log.h"
+#include "buf0buf.h"
+
+/* We define the field offsets of a node for the list */
+#define FLST_PREV	0	/* 6-byte address of the previous list element;
+				the page part of address is FIL_NULL, if no
+				previous element */
+#define FLST_NEXT	FIL_ADDR_SIZE	/* 6-byte address of the next
+				list element; the page part of address
+				is FIL_NULL, if no next element */
+
+/* We define the field offsets of a base node for the list */
+#define FLST_LEN	0	/* 32-bit list length field */
+#define	FLST_FIRST	4	/* 6-byte address of the first element
+				of the list; undefined if empty list */
+#define	FLST_LAST	(4 + FIL_ADDR_SIZE) /* 6-byte address of the
+				last element of the list; undefined
+				if empty list */
+
+/********************************************************************//**
+Writes a file address: a 4-byte page number and a 2-byte byte offset. */
+UNIV_INLINE
+void
+flst_write_addr(
+/*============*/
+	fil_faddr_t*	faddr,	/*!< in: where to write the file address */
+	fil_addr_t	addr,	/*!< in: file address to write */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(faddr && mtr);
+	ut_ad(mtr_memo_contains_page(mtr, faddr, MTR_MEMO_PAGE_X_FIX));
+	ut_a(addr.page == FIL_NULL || addr.boffset >= FIL_PAGE_DATA);
+	ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+
+	mlog_write_ulint(faddr + FIL_ADDR_PAGE, addr.page, MLOG_4BYTES, mtr);
+	mlog_write_ulint(faddr + FIL_ADDR_BYTE, addr.boffset,
+			 MLOG_2BYTES, mtr);
+}
+
+/********************************************************************//**
+Reads the page number and byte offset of a stored file address.
+@return	file address */
+UNIV_INLINE
+fil_addr_t
+flst_read_addr(
+/*===========*/
+	const fil_faddr_t*	faddr,	/*!< in: stored file address to read */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	fil_addr_t	result;
+
+	ut_ad(faddr && mtr);
+
+	result.page = mtr_read_ulint(&faddr[FIL_ADDR_PAGE], MLOG_4BYTES, mtr);
+	result.boffset = mtr_read_ulint(&faddr[FIL_ADDR_BYTE], MLOG_2BYTES,
+					mtr);
+	ut_a(result.page == FIL_NULL || result.boffset >= FIL_PAGE_DATA);
+	ut_a(ut_align_offset(faddr, UNIV_PAGE_SIZE) >= FIL_PAGE_DATA);
+	return(result);
+}
+
+/********************************************************************//**
+Initializes a list base node as empty: length 0, null first/last address. */
+UNIV_INLINE
+void
+flst_init(
+/*======*/
+	flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(mtr_memo_contains_page(mtr, base, MTR_MEMO_PAGE_X_FIX));
+
+	mlog_write_ulint(base + FLST_LEN, 0, MLOG_4BYTES, mtr);
+	flst_write_addr(base + FLST_FIRST, fil_addr_null, mtr);
+	flst_write_addr(base + FLST_LAST, fil_addr_null, mtr);
+}
+
+/********************************************************************//**
+Reads the length field of a list base node.
+@return	list length stored in the base node */
+UNIV_INLINE
+ulint
+flst_get_len(
+/*=========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	return(mtr_read_ulint(&base[FLST_LEN], MLOG_4BYTES, mtr));
+}
+
+/********************************************************************//**
+Reads the first-node address field of a list base node.
+@return	file address of the first node */
+UNIV_INLINE
+fil_addr_t
+flst_get_first(
+/*===========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	return(flst_read_addr(&base[FLST_FIRST], mtr));
+}
+
+/********************************************************************//**
+Reads the last-node address field of a list base node.
+@return	file address of the last node */
+UNIV_INLINE
+fil_addr_t
+flst_get_last(
+/*==========*/
+	const flst_base_node_t*	base,	/*!< in: pointer to base node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	return(flst_read_addr(&base[FLST_LAST], mtr));
+}
+
+/********************************************************************//**
+Reads the next-element address field of a list node.
+@return	file address of the next node */
+UNIV_INLINE
+fil_addr_t
+flst_get_next_addr(
+/*===============*/
+	const flst_node_t*	node,	/*!< in: pointer to node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	return(flst_read_addr(&node[FLST_NEXT], mtr));
+}
+
+/********************************************************************//**
+Reads the previous-element address field of a list node.
+@return	file address of the previous node */
+UNIV_INLINE
+fil_addr_t
+flst_get_prev_addr(
+/*===============*/
+	const flst_node_t*	node,	/*!< in: pointer to node */
+	mtr_t*			mtr)	/*!< in: mini-transaction handle */
+{
+	return(flst_read_addr(&node[FLST_PREV], mtr));
+}
diff --git a/storage/xtradb/include/ha0ha.h b/storage/xtradb/include/ha0ha.h
new file mode 100644
index 00000000000..3299000bf3c
--- /dev/null
+++ b/storage/xtradb/include/ha0ha.h
@@ -0,0 +1,243 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0ha.h
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef ha0ha_h
+#define ha0ha_h
+
+#include "univ.i"
+
+#include "hash0hash.h"
+#include "page0types.h"
+#include "buf0types.h"
+
+/*************************************************************//**
+Looks for an element in a hash table.
+@return pointer to the data of the first hash table node in chain
+having the fold number, NULL if not found */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: folded value of the searched data */
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and updates
+the pointer to data if found. */
+UNIV_INTERN
+void
+ha_search_and_update_if_found_func(
+/*===============================*/
+	hash_table_t*	table,	/*!< in/out: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	void*		data,	/*!< in: pointer to the data */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	new_block,/*!< in: block containing new_data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		new_data);/*!< in: new pointer to the data */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table		in/out: hash table
+@param fold		in: folded value of the searched data
+@param data		in: pointer to the data
+@param new_block	in: block containing new_data
+@param new_data		in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+	ha_search_and_update_if_found_func(table,fold,data,new_block,new_data)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Looks for an element when we know the pointer to the data and
+updates the pointer to data if found.
+@param table		in/out: hash table
+@param fold		in: folded value of the searched data
+@param data		in: pointer to the data
+@param new_block	ignored: block containing new_data
+@param new_data		in: new pointer to the data */
+# define ha_search_and_update_if_found(table,fold,data,new_block,new_data) \
+	ha_search_and_update_if_found_func(table,fold,data,new_data)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/*************************************************************//**
+Creates a hash table with at least n array cells.  The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+ha_create_func(
+/*===========*/
+	ulint	n,		/*!< in: number of array cells */
+#ifdef UNIV_SYNC_DEBUG
+	ulint	mutex_level,	/*!< in: level of the mutexes in the latching
+				order: this is used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint	n_mutexes);	/*!< in: number of mutexes to protect the
+				hash table: must be a power of 2, or 0 */
+#ifdef UNIV_SYNC_DEBUG
+/** Creates a hash table.
+@return		own: created table
+@param n_c	in: number of array cells.  The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level	in: level of the mutexes in the latching order
+@param n_m	in: number of mutexes to protect the hash table;
+		must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,level,n_m)
+#else /* UNIV_SYNC_DEBUG */
+/** Creates a hash table.
+@return		own: created table
+@param n_c	in: number of array cells.  The actual number of cells is
+chosen to be a slightly bigger prime number.
+@param level	ignored: level of the mutexes in the latching order
+@param n_m	in: number of mutexes to protect the hash table;
+		must be a power of 2, or 0 */
+# define ha_create(n_c,n_m,level) ha_create_func(n_c,n_m)
+#endif /* UNIV_SYNC_DEBUG */
+
+/*************************************************************//**
+Empties a hash table and frees the memory heaps. */
+UNIV_INTERN
+void
+ha_clear(
+/*=====*/
+	hash_table_t*	table);	/*!< in, own: hash table */
+
+/*************************************************************//**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return	TRUE if succeed, FALSE if no more memory could be allocated */
+UNIV_INTERN
+ibool
+ha_insert_for_fold_func(
+/*====================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: folded value of data; if a node with
+				the same fold value already exists, it is
+				updated to point to the same data, and no new
+				node is created! */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block,	/*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		data);	/*!< in: data, must not be NULL */
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return	TRUE if succeed, FALSE if no more memory could be allocated
+@param t	in: hash table
+@param f	in: folded value of data
+@param b	in: buffer block containing the data
+@param d	in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/**
+Inserts an entry into a hash table. If an entry with the same fold number
+is found, its node is updated to point to the new data, and no new node
+is inserted.
+@return	TRUE if succeed, FALSE if no more memory could be allocated
+@param t	in: hash table
+@param f	in: folded value of data
+@param b	ignored: buffer block containing the data
+@param d	in: data, must not be NULL */
+# define ha_insert_for_fold(t,f,b,d) ha_insert_for_fold_func(t,f,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/*********************************************************//**
+Looks for an element when we know the pointer to the data and deletes
+it from the hash table if found.
+@return	TRUE if found */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	void*		data);	/*!< in: pointer to the data */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Removes from the chain determined by fold all nodes whose data pointer
+points to the page given. */
+UNIV_INTERN
+void
+ha_remove_all_nodes_to_page(
+/*========================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: fold value */
+	const page_t*	page);	/*!< in: buffer page */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/*************************************************************//**
+Validates a given range of the cells in hash table.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+ha_validate(
+/*========*/
+	hash_table_t*	table,		/*!< in: hash table */
+	ulint		start_index,	/*!< in: start index */
+	ulint		end_index);	/*!< in: end index */
+#endif /* defined UNIV_AHI_DEBUG || defined UNIV_DEBUG */
+/*************************************************************//**
+Prints info of a hash table. */
+UNIV_INTERN
+void
+ha_print_info(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	hash_table_t*	table);	/*!< in: hash table */
+#endif /* !UNIV_HOTBACKUP */
+
+/** Type name of the hash table external chain node */
+typedef struct ha_node_struct ha_node_t;
+
+/** Hash table external chain node (block: UNIV_AHI_DEBUG/UNIV_DEBUG only) */
+struct ha_node_struct {
+	ha_node_t*	next;	/*!< next chain node or NULL if none */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block;	/*!< buffer block containing the data, or NULL */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		data;	/*!< pointer to the data */
+	ulint		fold;	/*!< fold value for the data */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table	in: hash table
+@param fold	in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold)				\
+	ut_ad(!(table)->mutexes || mutex_own(hash_get_mutex(table, fold)))
+#else /* !UNIV_HOTBACKUP */
+/** Assert that the current thread is holding the mutex protecting a
+hash bucket corresponding to a fold value.
+@param table	in: hash table
+@param fold	in: fold value */
+# define ASSERT_HASH_MUTEX_OWN(table, fold) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "ha0ha.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ha0ha.ic b/storage/xtradb/include/ha0ha.ic
new file mode 100644
index 00000000000..734403c4cd9
--- /dev/null
+++ b/storage/xtradb/include/ha0ha.ic
@@ -0,0 +1,220 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/ha0ha.ic
+The hash table with external chains
+
+Created 8/18/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0rnd.h"
+#include "mem0mem.h"
+
+/***********************************************************//**
+Deletes a hash node. */
+UNIV_INTERN
+void
+ha_delete_hash_node(
+/*================*/
+	hash_table_t*	table,		/*!< in: hash table */
+	ha_node_t*	del_node);	/*!< in: node to be deleted */
+
+/******************************************************************//**
+Reads the data pointer stored in a hash chain node.
+@return	the data pointer held by the node */
+UNIV_INLINE
+void*
+ha_node_get_data(
+/*=============*/
+	ha_node_t*	node)	/*!< in: hash chain node */
+{
+	return node->data;
+}
+
+/******************************************************************//**
+Stores a data pointer (and, in debug builds, its block) in a hash node. */
+UNIV_INLINE
+void
+ha_node_set_data_func(
+/*==================*/
+	ha_node_t*	node,	/*!< in/out: hash chain node to update */
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	buf_block_t*	block,	/*!< in: buffer block containing the data */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	void*		data)	/*!< in: pointer to the data */
+{
+	node->data = data;
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+	node->block = block;
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+}
+
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+/** Sets hash node data; in this build the block is forwarded as well.
+@param n	in: hash chain node
+@param b	in: buffer block containing the data
+@param d	in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,b,d)
+#else /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+/** Sets hash node data; in this build the block argument is dropped.
+@param n	in: hash chain node
+@param b	in: buffer block containing the data (not expanded, unused)
+@param d	in: pointer to the data */
+# define ha_node_set_data(n,b,d) ha_node_set_data_func(n,d)
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+
+/******************************************************************//**
+Follows the chain link of a hash node.
+@return	the successor of node in its chain, NULL if node is the last */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_next(
+/*==============*/
+	ha_node_t*	node)	/*!< in: hash chain node */
+{
+	return node->next;
+}
+
+/******************************************************************//**
+Returns the head of the chain stored in the cell that a fold value maps to.
+@return	first node of that chain, NULL if the chain is empty */
+UNIV_INLINE
+ha_node_t*
+ha_chain_get_first(
+/*===============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold value determining the chain */
+{
+	ulint	cell_no = hash_calc_hash(fold, table);
+	return((ha_node_t*) hash_get_nth_cell(table, cell_no)->node);
+}
+
+/*************************************************************//**
+Finds the first node in the chain for a fold value that carries that fold.
+@return the first node of the chain whose fold equals the given one,
+NULL when the chain holds no such node */
+UNIV_INLINE
+ha_node_t*
+ha_search(
+/*======*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: folded value of the searched data */
+{
+	ha_node_t*	cur;
+
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	/* Walk the chain selected by fold; leave the loop at the first
+	node with an equal fold, or with cur == NULL at the chain end. */
+	for (cur = ha_chain_get_first(table, fold);
+	     cur != NULL;
+	     cur = ha_chain_get_next(cur)) {
+		if (cur->fold == fold) {
+
+			break;
+		}
+	}
+
+	return(cur);
+}
+
+/*************************************************************//**
+Finds the data of the first node in a chain that carries the given fold.
+@return the data pointer of the first node in the chain whose fold equals
+the given one, NULL when the chain holds no such node */
+UNIV_INLINE
+void*
+ha_search_and_get_data(
+/*===================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: folded value of the searched data */
+{
+	ha_node_t*	cur;
+
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	cur = ha_chain_get_first(table, fold);
+
+	/* Advance past every node whose fold value differs from the
+	one being looked for. */
+	while (cur != NULL && cur->fold != fold) {
+
+		cur = ha_chain_get_next(cur);
+	}
+
+	/* cur is now the first match, or NULL at the chain end. */
+	return(cur != NULL ? cur->data : NULL);
+}
+
+/*********************************************************//**
+Finds the node that points to a known data pointer.
+@return	the node holding data, NULL if the chain for fold has no such node */
+UNIV_INLINE
+ha_node_t*
+ha_search_with_data(
+/*================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	void*		data)	/*!< in: pointer to the data */
+{
+	ha_node_t*	cur;
+
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	/* fold only selects the chain; a node matches on identity of
+	its data pointer, node->fold is not compared here. */
+	for (cur = ha_chain_get_first(table, fold);
+	     cur != NULL;
+	     cur = ha_chain_get_next(cur)) {
+		if (cur->data == data) {
+
+			return(cur);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************//**
+Searches the chain for fold for the node pointing to data and, when such
+a node exists, removes it with ha_delete_hash_node().
+@return	TRUE if a node was found and deleted, FALSE otherwise */
+UNIV_INLINE
+ibool
+ha_search_and_delete_if_found(
+/*==========================*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold,	/*!< in: folded value of the searched data */
+	void*		data)	/*!< in: pointer to the data */
+{
+	ha_node_t*	victim;
+
+	ASSERT_HASH_MUTEX_OWN(table, fold);
+
+	victim = ha_search_with_data(table, fold, data);
+
+	if (victim == NULL) {
+
+		return(FALSE);
+	}
+
+	ha_delete_hash_node(table, victim);
+	return(TRUE);
+}
diff --git a/storage/xtradb/include/ha0storage.h b/storage/xtradb/include/ha0storage.h
new file mode 100644
index 00000000000..c30bd840579
--- /dev/null
+++ b/storage/xtradb/include/ha0storage.h
@@ -0,0 +1,140 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.h
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 22, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef ha0storage_h
+#define ha0storage_h
+
+#include "univ.i"
+
+/** This value is used by default by ha_storage_create(). More memory
+is allocated later when/if it is needed. */
+#define HA_STORAGE_DEFAULT_HEAP_BYTES	1024
+
+/** This value is used by default by ha_storage_create(). It is a
+constant per ha_storage's lifetime. */
+#define HA_STORAGE_DEFAULT_HASH_CELLS	4096
+
+/** Hash storage */
+typedef struct ha_storage_struct	ha_storage_t;
+
+/*******************************************************************//**
+Creates a hash storage. If any of the parameters is 0, then a default
+value is used.
+@return	own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+	ulint	initial_heap_bytes,	/*!< in: initial heap's size */
+	ulint	initial_hash_cells);	/*!< in: initial number of cells
+					in the hash table */
+
+/*******************************************************************//**
+Copies data into the storage and returns a pointer to the copy. If the
+same data chunk is already present, then pointer to it is returned.
+Data chunks are considered to be equal if len1 == len2 and
+memcmp(data1, data2, len1) == 0. If "data" is not present (and thus
+data_len bytes need to be allocated) and the size of storage is going to
+become more than "memlim" then "data" is not added and NULL is returned.
+To disable this behavior "memlim" can be set to 0, which stands for
+"no limit".
+@return	pointer to the copy */
+UNIV_INTERN
+const void*
+ha_storage_put_memlim(
+/*==================*/
+	ha_storage_t*	storage,	/*!< in/out: hash storage */
+	const void*	data,		/*!< in: data to store */
+	ulint		data_len,	/*!< in: data length */
+	ulint		memlim);	/*!< in: memory limit to obey */
+
+/*******************************************************************//**
+Same as ha_storage_put_memlim() but without memory limit.
+@param storage	in/out: hash storage
+@param data	in: data to store
+@param data_len	in: data length
+@return		pointer to the copy of the data */
+#define ha_storage_put(storage, data, data_len)	\
+	ha_storage_put_memlim((storage), (data), (data_len), 0)
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy. If the
+same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage	in/out: hash storage
+@param str	in: string to put; the macro evaluates it twice
+@return		pointer to the copy of the string */
+#define ha_storage_put_str(storage, str)	\
+	((const char*) ha_storage_put((storage), (str), strlen(str) + 1))
+
+/*******************************************************************//**
+Copies string into the storage and returns a pointer to the copy obeying
+a memory limit.
+If the same string is already present, then pointer to it is returned.
+Strings are considered to be equal if strcmp(str1, str2) == 0.
+@param storage	in/out: hash storage
+@param str	in: string to put; the macro evaluates it twice
+@param memlim	in: memory limit to obey
+@return		pointer to the copy of the string */
+#define ha_storage_put_str_memlim(storage, str, memlim)	\
+	((const char*) ha_storage_put_memlim((storage), (str),	\
+					     strlen(str) + 1, (memlim)))
+
+/*******************************************************************//**
+Empties a hash storage, freeing memory occupied by data chunks.
+This invalidates any pointers previously returned by ha_storage_put().
+The hash storage is not invalidated itself and can be used again. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+	ha_storage_t**	storage);	/*!< in/out: hash storage */
+
+/*******************************************************************//**
+Frees a hash storage and everything it contains, it cannot be used after
+this call.
+This invalidates any pointers previously returned by ha_storage_put(). */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+	ha_storage_t*	storage);	/*!< in, own: hash storage */
+
+/*******************************************************************//**
+Gets the size of the memory used by a storage.
+@return	bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+	const ha_storage_t*	storage);	/*!< in: hash storage */
+
+#ifndef UNIV_NONINL
+#include "ha0storage.ic"
+#endif
+
+#endif /* ha0storage_h */
diff --git a/storage/xtradb/include/ha0storage.ic b/storage/xtradb/include/ha0storage.ic
new file mode 100644
index 00000000000..5acbf82f005
--- /dev/null
+++ b/storage/xtradb/include/ha0storage.ic
@@ -0,0 +1,148 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ha0storage.ic
+Hash storage.
+Provides a data structure that stores chunks of data in
+its own storage, avoiding duplicates.
+
+Created September 24, 2007 Vasil Dimov
+*******************************************************/
+
+#include "univ.i"
+#include "ha0storage.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+
+/** Hash storage: keeps chunks of data in its own heap, avoiding duplicates */
+struct ha_storage_struct {
+	mem_heap_t*	heap;	/*!< memory heap from which memory is
+				allocated; this struct lives in it too */
+	hash_table_t*	hash;	/*!< hash table used to avoid
+				duplicates */
+};
+
+/** Objects of this type are stored in ha_storage_t */
+typedef struct ha_storage_node_struct ha_storage_node_t;
+/** One entry of a hash storage: a data pointer, its length, a chain link */
+struct ha_storage_node_struct {
+	ulint			data_len;/*!< length of the data */
+	const void*		data;	/*!< pointer to data */
+	ha_storage_node_t*	next;	/*!< next node in hash chain */
+};
+
+/*******************************************************************//**
+Allocates a new hash storage. A parameter passed as 0 is replaced by the
+matching HA_STORAGE_DEFAULT_* constant.
+@return	own: hash storage */
+UNIV_INLINE
+ha_storage_t*
+ha_storage_create(
+/*==============*/
+	ulint	initial_heap_bytes,	/*!< in: initial heap's size */
+	ulint	initial_hash_cells)	/*!< in: initial number of cells
+					in the hash table */
+{
+	ha_storage_t*	new_storage;
+	mem_heap_t*	backing_heap;
+	ulint		heap_bytes;
+	ulint		n_cells;
+
+	/* A zero argument selects the corresponding default. */
+	heap_bytes = initial_heap_bytes != 0
+		? initial_heap_bytes
+		: HA_STORAGE_DEFAULT_HEAP_BYTES;
+
+	n_cells = initial_hash_cells != 0
+		? initial_hash_cells
+		: HA_STORAGE_DEFAULT_HASH_CELLS;
+
+	/* The storage object is carved out of its own heap, so the
+	heap is sized to hold it in addition to the requested bytes. */
+	backing_heap = mem_heap_create(sizeof(*new_storage) + heap_bytes);
+
+	new_storage = (ha_storage_t*) mem_heap_alloc(backing_heap,
+						     sizeof(*new_storage));
+
+	new_storage->heap = backing_heap;
+	new_storage->hash = hash_create(n_cells);
+
+	return(new_storage);
+}
+
+/*******************************************************************//**
+Clears the hash table and empties the heap of a hash storage, then
+re-creates the storage object in the same heap and stores it in *storage.
+Pointers previously returned by ha_storage_put() are no longer valid. */
+UNIV_INLINE
+void
+ha_storage_empty(
+/*=============*/
+	ha_storage_t**	storage)	/*!< in/out: hash storage */
+{
+	mem_heap_t*	heap = (*storage)->heap;
+	hash_table_t*	hash = (*storage)->hash;
+	ha_storage_t*	fresh;
+
+	/* The storage object lives in the heap being emptied, so its
+	fields are saved first and the object is re-created afterwards. */
+	hash_table_clear(hash);
+	mem_heap_empty(heap);
+
+	fresh = (ha_storage_t*) mem_heap_alloc(heap, sizeof(*fresh));
+	fresh->heap = heap;
+	fresh->hash = hash;
+	*storage = fresh;
+}
+
+/*******************************************************************//**
+Destroys a hash storage together with all data it holds; the storage and
+every pointer previously returned by ha_storage_put() become invalid.
+The hash table is freed before the heap that contains the storage. */
+UNIV_INLINE
+void
+ha_storage_free(
+/*============*/
+	ha_storage_t*	storage)	/*!< in, own: hash storage */
+{
+	hash_table_t*	hash = storage->hash;
+	mem_heap_t*	heap = storage->heap;
+
+	hash_table_free(hash);
+	mem_heap_free(heap);
+}
+
+/*******************************************************************//**
+Computes the memory footprint of a storage: its heap plus its hash table.
+@return	bytes used */
+UNIV_INLINE
+ulint
+ha_storage_get_size(
+/*================*/
+	const ha_storage_t*	storage)	/*!< in: hash storage */
+{
+	ulint	heap_bytes = mem_heap_get_size(storage->heap);
+	ulint	n_cells = hash_get_n_cells(storage->hash);
+
+	/* The hash table contributes its header plus its cell array;
+	this assumes hash->heap and hash->heaps are NULL. */
+
+	return(heap_bytes
+	       + sizeof(hash_table_t)
+	       + sizeof(hash_cell_t) * n_cells);
+}
diff --git a/storage/xtradb/include/ha_prototypes.h b/storage/xtradb/include/ha_prototypes.h
new file mode 100644
index 00000000000..445d94eeabb
--- /dev/null
+++ b/storage/xtradb/include/ha_prototypes.h
@@ -0,0 +1,279 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ha_prototypes.h
+Prototypes for global functions in ha_innodb.cc that are called by
+InnoDB C code
+
+Created 5/11/2006 Osku Salerma
+************************************************************************/
+
+#ifndef HA_INNODB_PROTOTYPES_H
+#define HA_INNODB_PROTOTYPES_H
+
+#include "trx0types.h"
+#include "m_ctype.h" /* CHARSET_INFO */
+
+/*********************************************************************//**
+Wrapper around MySQL's copy_and_convert function.
+@return	number of bytes copied to 'to' */
+UNIV_INTERN
+ulint
+innobase_convert_string(
+/*====================*/
+	void*		to,		/*!< out: converted string */
+	ulint		to_length,	/*!< in: number of bytes reserved
+					for the converted string */
+	CHARSET_INFO*	to_cs,		/*!< in: character set to convert to */
+	const void*	from,		/*!< in: string to convert */
+	ulint		from_length,	/*!< in: number of bytes to convert */
+	CHARSET_INFO*	from_cs,	/*!< in: character set to convert from */
+	uint*		errors);	/*!< out: number of errors encountered
+					during the conversion */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) that is of
+type DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) using "charset_coll" and writes
+the result to "buf". The result is converted to "system_charset_info".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size > 0) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return	number of bytes that were written */
+UNIV_INTERN
+ulint
+innobase_raw_format(
+/*================*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		charset_coll,	/*!< in: charset collation */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
+
+/*****************************************************************//**
+Invalidates the MySQL query cache for the table. */
+UNIV_INTERN
+void
+innobase_invalidate_query_cache(
+/*============================*/
+	trx_t*		trx,		/*!< in: transaction which
+					modifies the table */
+	const char*	full_name,	/*!< in: concatenation of
+					database name, null char NUL,
+					table name, null char NUL;
+					NOTE that in Windows this is
+					always in LOWER CASE! */
+	ulint		full_name_len);	/*!< in: full name length where
+					also the null chars count */
+
+/*****************************************************************//**
+Convert a table or index name to the MySQL system_charset_info (UTF-8)
+and quote it if needed.
+@return	pointer to the end of buf */
+UNIV_INTERN
+char*
+innobase_convert_name(
+/*==================*/
+	char*		buf,	/*!< out: buffer for converted identifier */
+	ulint		buflen,	/*!< in: length of buf, in bytes */
+	const char*	id,	/*!< in: identifier to convert */
+	ulint		idlen,	/*!< in: length of id, in bytes */
+	void*		thd,	/*!< in: MySQL connection thread, or NULL */
+	ibool		table_id);/*!< in: TRUE=id is a table or database name;
+				FALSE=id is an index name */
+
+/******************************************************************//**
+Returns true if the thread is the replication thread on the slave
+server. Used in srv_conc_enter_innodb() to determine if the thread
+should be allowed to enter InnoDB - the replication thread is treated
+differently than other threads. Also used in
+srv_conc_force_exit_innodb().
+@return	true if thd is the replication thread */
+UNIV_INTERN
+ibool
+thd_is_replication_slave_thread(
+/*============================*/
+	void*	thd);	/*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Returns true if the transaction this thread is processing has edited
+non-transactional tables. Used by the deadlock detector when deciding
+which transaction to rollback in case of a deadlock - we try to avoid
+rolling back transactions that have edited non-transactional tables.
+@return	true if non-transactional tables have been edited */
+UNIV_INTERN
+ibool
+thd_has_edited_nontrans_tables(
+/*===========================*/
+	void*	thd);	/*!< in: thread handle (THD*) */
+
+/*************************************************************//**
+Prints info of a THD object (== user session thread) to the given file. */
+UNIV_INTERN
+void
+innobase_mysql_print_thd(
+/*=====================*/
+	FILE*	f,		/*!< in: output stream */
+	void*	thd,		/*!< in: pointer to a MySQL THD object */
+	uint	max_query_len);	/*!< in: max query length to print, or 0 to
+				   use the default max length */
+
+/**************************************************************//**
+Converts a MySQL type to an InnoDB type. Note that this function returns
+the 'mtype' of InnoDB. InnoDB differentiates between MySQL's old <= 4.1
+VARCHAR and the new true VARCHAR in >= 5.0.3 by the 'prtype'.
+@return	DATA_BINARY, DATA_VARCHAR, ... */
+UNIV_INTERN
+ulint
+get_innobase_type_from_mysql_type(
+/*==============================*/
+	ulint*		unsigned_flag,	/*!< out: DATA_UNSIGNED if an
+					'unsigned type';
+					at least ENUM and SET,
+					and unsigned integer
+					types are 'unsigned types' */
+	const void*	field)		/*!< in: MySQL Field */
+	__attribute__((nonnull));
+
+/******************************************************************//**
+Get the variable length bounds of the given character set. */
+UNIV_INTERN
+void
+innobase_get_cset_width(
+/*====================*/
+	ulint	cset,		/*!< in: MySQL charset-collation code */
+	ulint*	mbminlen,	/*!< out: minimum length of a char (in bytes) */
+	ulint*	mbmaxlen);	/*!< out: maximum length of a char (in bytes) */
+
+/******************************************************************//**
+Compares NUL-terminated UTF-8 strings case insensitively.
+@return	0 if a=b, <0 if a<b, >0 if a>b */
+UNIV_INTERN
+int
+innobase_strcasecmp(
+/*================*/
+	const char*	a,	/*!< in: first string to compare */
+	const char*	b);	/*!< in: second string to compare */
+
+/******************************************************************//**
+Returns true if the thread is executing a SELECT statement.
+@return	true if thd is executing SELECT */
+
+ibool
+thd_is_select(
+/*==========*/
+	const void*	thd);	/*!< in: thread handle (THD*) */
+
+/******************************************************************//**
+Converts an identifier to a table name. */
+UNIV_INTERN
+void
+innobase_convert_from_table_id(
+/*===========================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len);	/*!< in: length of 'to', in bytes; should
+					be at least 5 * strlen(from) + 1 */
+/******************************************************************//**
+Converts an identifier to UTF-8. */
+UNIV_INTERN
+void
+innobase_convert_from_id(
+/*=====================*/
+	struct charset_info_st*	cs,	/*!< in: the 'from' character set */
+	char*			to,	/*!< out: converted identifier */
+	const char*		from,	/*!< in: identifier to convert */
+	ulint			len);	/*!< in: length of 'to', in bytes; should
+					be at least 3 * strlen(from) + 1 */
+/******************************************************************//**
+Makes all characters in a NUL-terminated UTF-8 string lower case. */
+UNIV_INTERN
+void
+innobase_casedn_str(
+/*================*/
+	char*	a);	/*!< in/out: string to put in lower case */
+
+/**********************************************************************//**
+Determines the connection character set.
+@return	connection character set */
+UNIV_INTERN
+struct charset_info_st*
+innobase_get_charset(
+/*=================*/
+	void*	mysql_thd);	/*!< in: MySQL thread handle */
+/**********************************************************************//**
+Determines the current SQL statement.
+@return	SQL statement string */
+UNIV_INTERN
+const char*
+innobase_get_stmt(
+/*==============*/
+	void*	mysql_thd,	/*!< in: MySQL thread handle */
+	size_t*	length)		/*!< out: length of the SQL statement */
+	__attribute__((nonnull));
+/******************************************************************//**
+This function is used to find the storage length in bytes of the first n
+characters for prefix indexes using a multibyte character set. The function
+finds charset information and returns length of prefix_len characters in the
+index field in bytes.
+@return	number of bytes occupied by the first n characters */
+UNIV_INTERN
+ulint
+innobase_get_at_most_n_mbchars(
+/*===========================*/
+	ulint charset_id,	/*!< in: character set id */
+	ulint prefix_len,	/*!< in: prefix length in bytes of the index
+				(this has to be divided by mbmaxlen to get the
+				number of CHARACTERS n in the prefix) */
+	ulint data_len,		/*!< in: length of the string in bytes */
+	const char* str);	/*!< in: character string */
+
+/******************************************************************//**
+Returns true if the thread supports XA,
+global value of innodb_supports_xa if thd is NULL.
+@return	true if thd supports XA */
+
+ibool
+thd_supports_xa(
+/*============*/
+	void*	thd);	/*!< in: thread handle (THD*), or NULL to query
+			the global innodb_supports_xa */
+
+/******************************************************************//**
+Returns the lock wait timeout for the current connection.
+@return	the lock wait timeout, in seconds */
+
+ulong
+thd_lock_wait_timeout(
+/*==================*/
+	void*	thd);	/*!< in: thread handle (THD*), or NULL to query
+			the global innodb_lock_wait_timeout */
+
+/******************************************************************//**
+TODO(review): undocumented; by its name, a per-session flush-at-commit value */
+
+ulong
+thd_flush_log_at_trx_commit_session(
+/*================================*/
+	void*	thd);
+
+#endif
diff --git a/storage/xtradb/include/handler0alter.h b/storage/xtradb/include/handler0alter.h
new file mode 100644
index 00000000000..985b76f4f50
--- /dev/null
+++ b/storage/xtradb/include/handler0alter.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/handler0alter.h
+Smart ALTER TABLE
+*******************************************************/
+
+/*************************************************************//**
+Copies an InnoDB record to table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_to_mysql(
+/*==================*/
+	TABLE*			table,		/*!< in/out: MySQL table */
+	const rec_t*		rec,		/*!< in: record */
+	const dict_index_t*	index,		/*!< in: index */
+	const ulint*		offsets);	/*!< in: rec_get_offsets(
+						rec, index, ...) */
+
+/*************************************************************//**
+Resets table->record[0]. */
+UNIV_INTERN
+void
+innobase_rec_reset(
+/*===============*/
+	TABLE*			table);		/*!< in/out: MySQL table */
diff --git a/storage/xtradb/include/hash0hash.h b/storage/xtradb/include/hash0hash.h
new file mode 100644
index 00000000000..492c767acc4
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.h
@@ -0,0 +1,496 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.h
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef hash0hash_h
+#define hash0hash_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* !UNIV_HOTBACKUP */
+
+typedef struct hash_table_struct hash_table_t;
+typedef struct hash_cell_struct hash_cell_t;
+
+typedef void*	hash_node_t;
+
+/* Fix Bug #13859: symbol collision between imap/mysql */
+#define hash_create hash0_create
+
+/*************************************************************//**
+Creates a hash table with >= n array cells. The actual number
+of cells is chosen to be a prime number slightly bigger than n.
+@return	own: created table */
+UNIV_INTERN
+hash_table_t*
+hash_create(
+/*========*/
+	ulint	n);	/*!< in: number of array cells */
+
+/*************************************************************//**
+NOTE(review): undocumented here; presumably the memory needed for n cells. */
+UNIV_INTERN
+ulint
+hash_create_needed(
+/*===============*/
+	ulint	n);	/*!< in: presumably number of array cells; confirm */
+
+UNIV_INTERN
+void
+hash_create_init(
+/*=============*/
+	hash_table_t*	table,	/*!< NOTE(review): undocumented; confirm use */
+	ulint		n);	/*!< presumably number of array cells; confirm */
+
+UNIV_INTERN
+void
+hash_create_reuse(
+/*==============*/
+	hash_table_t*	table);	/*!< NOTE(review): undocumented; confirm use */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Creates a mutex array to protect a hash table. */
+UNIV_INTERN
+void
+hash_create_mutexes_func(
+/*=====================*/
+	hash_table_t*	table,		/*!< in: hash table */
+#ifdef UNIV_SYNC_DEBUG
+	ulint		sync_level,	/*!< in: latching order level of the
+					mutexes: used in the debug version */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint		n_mutexes);	/*!< in: number of mutexes */
+#ifdef UNIV_SYNC_DEBUG
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,level,n)
+#else /* UNIV_SYNC_DEBUG */
+# define hash_create_mutexes(t,n,level) hash_create_mutexes_func(t,n)
+#endif /* UNIV_SYNC_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Frees a hash table. */
+UNIV_INTERN
+void
+hash_table_free(
+/*============*/
+	hash_table_t*	table);	/*!< in, own: hash table */
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return	hashed value */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+	ulint		fold,	/*!< in: folded value */
+	hash_table_t*	table);	/*!< in: hash table */
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Asserts FOLD's mutex is owned if TABLE has mutexes; expands with its own ';'. */
+# define HASH_ASSERT_OWNED(TABLE, FOLD)					\
+ut_ad(!(TABLE)->mutexes || mutex_own(hash_get_mutex(TABLE, FOLD)));
+#else /* !UNIV_HOTBACKUP */
+# define HASH_ASSERT_OWNED(TABLE, FOLD)
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Appends struct DATA to the end of the hash chain of the cell FOLD maps to.
+NAME is the next-pointer field of TYPE; DATA->NAME is reset to NULL. */
+#define HASH_INSERT(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+	hash_cell_t*	cell3333;\
+	TYPE*		struct3333;\
+	/* no ';' below: HASH_ASSERT_OWNED expands with its own */\
+	HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+	(DATA)->NAME = NULL;\
+\
+	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+	if (cell3333->node == NULL) {\
+		cell3333->node = DATA;\
+	} else {\
+		struct3333 = (TYPE*) cell3333->node;\
+		/* walk to the last node of the chain */\
+		while (struct3333->NAME != NULL) {\
+\
+			struct3333 = (TYPE*) struct3333->NAME;\
+		}\
+\
+		struct3333->NAME = DATA;\
+	}\
+} while (0)
+
+#ifdef UNIV_HASH_DEBUG /* (void*) -1 marks the link of an unlinked node */
+# define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1)
+# define HASH_INVALIDATE(DATA, NAME) ((DATA)->NAME = (void*) -1)
+#else /* UNIV_HASH_DEBUG */
+# define HASH_ASSERT_VALID(DATA) do {} while (0)
+# define HASH_INVALIDATE(DATA, NAME) do {} while (0)
+#endif /* UNIV_HASH_DEBUG */
+
+/*******************************************************************//**
+Unlinks struct DATA from the hash chain of the cell FOLD maps to. DATA must
+be in that chain. NAME is the next-pointer field of TYPE. */
+#define HASH_DELETE(TYPE, NAME, TABLE, FOLD, DATA)\
+do {\
+	hash_cell_t*	cell3333;\
+	TYPE*		struct3333;\
+	/* no ';' below: HASH_ASSERT_OWNED expands with its own */\
+	HASH_ASSERT_OWNED(TABLE, FOLD)\
+\
+	cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\
+\
+	if (cell3333->node == (DATA)) {\
+		HASH_ASSERT_VALID((DATA)->NAME);\
+		cell3333->node = (DATA)->NAME;\
+	} else {\
+		struct3333 = (TYPE*) cell3333->node;\
+		/* find the predecessor of DATA in the chain */\
+		while (struct3333->NAME != (DATA)) {\
+\
+			struct3333 = (TYPE*) struct3333->NAME;\
+			ut_a(struct3333);\
+		}\
+\
+		struct3333->NAME = (DATA)->NAME;\
+	}\
+	HASH_INVALIDATE(DATA, NAME);\
+} while (0)
+
+/*******************************************************************//**
+Gets the first struct in the chain of cell number HASH_VAL, NULL if none.
+HASH_VAL is a cell index as returned by hash_calc_hash(), not a fold. */
+#define HASH_GET_FIRST(TABLE, HASH_VAL)\
+	(hash_get_nth_cell(TABLE, HASH_VAL)->node)
+
+/*******************************************************************//**
+Gets the next struct in a hash chain, NULL if none. */
+
+#define HASH_GET_NEXT(NAME, DATA)	((DATA)->NAME)
+
+/********************************************************************//**
+Walks the chain of FOLD until TEST holds; DATA ends as the match or NULL. */
+#define HASH_SEARCH(NAME, TABLE, FOLD, TYPE, DATA, ASSERTION, TEST)\
+{\
+	/* NOTE: the expansion is a bare { } block, not do { } while (0) */\
+	HASH_ASSERT_OWNED(TABLE, FOLD)\
+	/* TYPE is the pointer type of DATA; ASSERTION runs once per node */\
+	(DATA) = (TYPE) HASH_GET_FIRST(TABLE, hash_calc_hash(FOLD, TABLE));\
+	HASH_ASSERT_VALID(DATA);\
+\
+	while ((DATA) != NULL) {\
+		ASSERTION;\
+		if (TEST) {\
+			break;\
+		} else {\
+			HASH_ASSERT_VALID(HASH_GET_NEXT(NAME, DATA));\
+			(DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);\
+		}\
+	}\
+}
+
+/********************************************************************//**
+Scans all cells, last to first, until TEST holds; DATA is the match or NULL. */
+#define HASH_SEARCH_ALL(NAME, TABLE, TYPE, DATA, ASSERTION, TEST)	\
+do {									\
+	ulint	i3333;							\
+	/* NOTE: with n_cells == 0 DATA is never assigned */		\
+	for (i3333 = (TABLE)->n_cells; i3333--; ) {			\
+		(DATA) = (TYPE) HASH_GET_FIRST(TABLE, i3333);		\
+									\
+		while ((DATA) != NULL) {				\
+			HASH_ASSERT_VALID(DATA);			\
+			ASSERTION;					\
+									\
+			if (TEST) {					\
+				break;					\
+			}						\
+									\
+			(DATA) = (TYPE) HASH_GET_NEXT(NAME, DATA);	\
+		}							\
+									\
+		if ((DATA) != NULL) {					\
+			break;						\
+		}							\
+	}								\
+} while (0)
+
+/************************************************************//**
+Gets the nth cell in a hash table.
+@return	pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		n);	/*!< in: cell index */
+
+/*************************************************************//**
+Clears a hash table so that all the cells become empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+	hash_table_t*	table);	/*!< in/out: hash table */
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return	number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+	hash_table_t*	table);	/*!< in: table */
+/*******************************************************************//**
+Deletes a struct which is stored in the heap of the hash table, and compacts
+the heap. The fold value must be stored in the struct NODE in a field named
+'fold'. NODE is expanded several times: pass a pointer expression without
+side effects. */
+#define HASH_DELETE_AND_COMPACT(TYPE, NAME, TABLE, NODE)\
+do {\
+	TYPE*		node111;\
+	TYPE*		top_node111;\
+	hash_cell_t*	cell111;\
+	ulint		fold111;\
+\
+	fold111 = (NODE)->fold;\
+\
+	HASH_DELETE(TYPE, NAME, TABLE, fold111, NODE);\
+\
+	top_node111 = (TYPE*)mem_heap_get_top(\
+				hash_get_heap(TABLE, fold111),\
+							sizeof(TYPE));\
+\
+	/* If the node to remove is not the top node in the heap, compact the\
+	heap of nodes by moving the top node in the place of NODE. */\
+\
+	if ((NODE) != top_node111) {\
+\
+		/* Copy the top node in place of NODE */\
+\
+		*(NODE) = *top_node111;\
+\
+		cell111 = hash_get_nth_cell(TABLE,\
+				hash_calc_hash(top_node111->fold, TABLE));\
+\
+		/* Look for the pointer to the top node, to update it */\
+\
+		if (cell111->node == top_node111) {\
+			/* The top node is the first in the chain */\
+\
+			cell111->node = (NODE);\
+		} else {\
+			/* We have to look for the predecessor of the top\
+			node */\
+			node111 = (TYPE*) cell111->node;\
+\
+			while (top_node111 != HASH_GET_NEXT(NAME, node111)) {\
+\
+				node111 = HASH_GET_NEXT(NAME, node111);\
+			}\
+\
+			/* Now we have the predecessor node */\
+\
+			node111->NAME = (NODE);\
+		}\
+	}\
+\
+	/* Free the space occupied by the top node */\
+\
+	mem_heap_free_top(hash_get_heap(TABLE, fold111), sizeof(TYPE));\
+} while (0)
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Re-inserts every node of OLD_TABLE into NEW_TABLE, using FOLD_FUNC(node) as
+the fold value. The cells of OLD_TABLE are not reset here. */
+#define HASH_MIGRATE(OLD_TABLE, NEW_TABLE, NODE_TYPE, PTR_NAME, FOLD_FUNC) \
+do {\
+	ulint		i2222;\
+	ulint		cell_count2222;\
+\
+	cell_count2222 = hash_get_n_cells(OLD_TABLE);\
+\
+	for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+		NODE_TYPE*	node2222 = HASH_GET_FIRST((OLD_TABLE), i2222);\
+\
+		while (node2222) {\
+			NODE_TYPE*	next2222 = node2222->PTR_NAME;\
+			ulint		fold2222 = FOLD_FUNC(node2222);\
+			/* HASH_INSERT overwrites PTR_NAME: next2222 was saved */\
+			HASH_INSERT(NODE_TYPE, PTR_NAME, (NEW_TABLE),\
+				fold2222, node2222);\
+\
+			node2222 = next2222;\
+		}\
+	}\
+} while (0)
+
+/********************************************************************//**
+Shifts chain pointers above FADDR by FOFFSET bytes, the others by BOFFSET. */
+#define HASH_OFFSET(TABLE, NODE_TYPE, PTR_NAME, FADDR, FOFFSET, BOFFSET) \
+do {\
+	ulint		i2222;\
+	ulint		cell_count2222;\
+	/* NULL pointers (empty cells, chain ends) are left untouched */\
+	cell_count2222 = hash_get_n_cells(TABLE);\
+\
+	for (i2222 = 0; i2222 < cell_count2222; i2222++) {\
+		NODE_TYPE*	node2222;\
+		/* relocate the cell's head pointer first, then follow it */\
+		if ((TABLE)->array[i2222].node) \
+			(TABLE)->array[i2222].node = (void*)((byte*)(TABLE)->array[i2222].node \
+			+ (((TABLE)->array[i2222].node > (void*)(FADDR))?(FOFFSET):(BOFFSET)));\
+		node2222 = HASH_GET_FIRST((TABLE), i2222);\
+\
+		while (node2222) {\
+			if (node2222->PTR_NAME) \
+				node2222->PTR_NAME = (void*)((byte*)(node2222->PTR_NAME) \
+				+ ((((void*)node2222->PTR_NAME) > (void*)(FADDR))?(FOFFSET):(BOFFSET)));\
+			/* step through the already relocated link */\
+			node2222 = node2222->PTR_NAME;\
+		}\
+	}\
+} while (0)
+
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return	mutex number */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Gets the nth heap in a hash table.
+@return	mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i);	/*!< in: index of the heap */
+/************************************************************//**
+Gets the heap for a fold value in a hash table.
+@return	mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Gets the nth mutex in a hash table.
+@return	mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i);	/*!< in: index of the mutex */
+/************************************************************//**
+Gets the mutex for a fold value in a hash table.
+@return	mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Reserves the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_enter(
+/*=============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Releases the mutex for a fold value in a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit(
+/*============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold);	/*!< in: fold */
+/************************************************************//**
+Reserves all the mutexes of a hash table, in an ascending order. */
+UNIV_INTERN
+void
+hash_mutex_enter_all(
+/*=================*/
+	hash_table_t*	table);	/*!< in: hash table */
+/************************************************************//**
+Releases all the mutexes of a hash table. */
+UNIV_INTERN
+void
+hash_mutex_exit_all(
+/*================*/
+	hash_table_t*	table);	/*!< in: hash table */
+#else /* !UNIV_HOTBACKUP */
+# define hash_get_heap(table, fold)	((table)->heap)
+# define hash_mutex_enter(table, fold)	((void) 0)
+# define hash_mutex_exit(table, fold)	((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+struct hash_cell_struct{
+	void*	node;	/*!< hash chain node, NULL if none */
+};
+
+/** The hash table structure */
+struct hash_table_struct {
+#if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+	ibool		adaptive;/*!< TRUE if this is the hash table of the
+				adaptive hash index */
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
+	ulint		n_cells;/*!< number of cells in the hash table */
+	hash_cell_t*	array;	/*!< pointer to cell array */
+#ifndef UNIV_HOTBACKUP
+	ulint		n_mutexes;/*!< if mutexes != NULL, then the number of
+				mutexes, must be a power of 2 */
+	mutex_t*	mutexes;/*!< NULL, or an array of mutexes used to
+				protect segments of the hash table */
+	mem_heap_t**	heaps;	/*!< if this is non-NULL, hash chain nodes for
+				external chaining can be allocated from these
+				memory heaps; there are then n_mutexes many of
+				these heaps */
+#endif /* !UNIV_HOTBACKUP */
+	mem_heap_t*	heap;	/*!< if non-NULL, hash_get_heap() yields it */
+#ifdef UNIV_DEBUG
+	ulint		magic_n;/*!< compared with HASH_TABLE_MAGIC_N */
+# define HASH_TABLE_MAGIC_N	76561114
+#endif /* UNIV_DEBUG */
+};
+
+#ifndef UNIV_NONINL
+#include "hash0hash.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/hash0hash.ic b/storage/xtradb/include/hash0hash.ic
new file mode 100644
index 00000000000..0b437894e2e
--- /dev/null
+++ b/storage/xtradb/include/hash0hash.ic
@@ -0,0 +1,183 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/hash0hash.ic
+The simple hash table utility
+
+Created 5/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "ut0rnd.h"
+
+/************************************************************//**
+Returns the address of cell number n in the cell array of a hash table.
+@return	pointer to cell */
+UNIV_INLINE
+hash_cell_t*
+hash_get_nth_cell(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		n)	/*!< in: cell index, < n_cells */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(n < table->n_cells);
+
+	return(&table->array[n]);
+}
+
+/*************************************************************//**
+Zero-fills the whole cell array, which makes every cell's chain empty. */
+UNIV_INLINE
+void
+hash_table_clear(
+/*=============*/
+	hash_table_t*	table)	/*!< in/out: hash table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	memset(table->array, 0,
+	       sizeof(hash_cell_t) * table->n_cells);
+}
+
+/*************************************************************//**
+Returns the number of cells in a hash table.
+@return	number of cells */
+UNIV_INLINE
+ulint
+hash_get_n_cells(
+/*=============*/
+	hash_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	return(table->n_cells);
+}
+
+/**************************************************************//**
+Calculates the hash value from a folded value.
+@return	ut_hash_ulint(fold, n_cells); callers use it as a cell index */
+UNIV_INLINE
+ulint
+hash_calc_hash(
+/*===========*/
+	ulint		fold,	/*!< in: folded value */
+	hash_table_t*	table)	/*!< in: hash table */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	return(ut_hash_ulint(fold, table->n_cells));
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Gets the mutex index for a fold value in a hash table.
+@return	ut_2pow_remainder() of hash_calc_hash(fold, table) and n_mutexes */
+UNIV_INLINE
+ulint
+hash_get_mutex_no(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(ut_is_2pow(table->n_mutexes));
+	return(ut_2pow_remainder(hash_calc_hash(fold, table),
+				 table->n_mutexes));
+}
+
+/************************************************************//**
+Returns element i of the heaps array of a hash table.
+@return	mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_nth_heap(
+/*==============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i)	/*!< in: index of the heap, < n_mutexes */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(i < table->n_mutexes);
+
+	return(*(table->heaps + i));
+}
+
+/************************************************************//**
+Returns table->heap when it is set; otherwise the heap whose index is the
+mutex number of the fold value.
+@return	mem heap */
+UNIV_INLINE
+mem_heap_t*
+hash_get_heap(
+/*==========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	if (table->heap != NULL) {
+		/* The table-wide heap is used for every fold value. */
+		return(table->heap);
+	}
+
+	/* Otherwise the heap index equals the mutex number of fold. */
+
+	return(hash_get_nth_heap(table, hash_get_mutex_no(table, fold)));
+}
+
+/************************************************************//**
+Returns the address of element i of the mutex array of a hash table.
+@return	mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_nth_mutex(
+/*===============*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		i)	/*!< in: index of the mutex, < n_mutexes */
+{
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+	ut_ad(i < table->n_mutexes);
+
+	return(&table->mutexes[i]);
+}
+
+/************************************************************//**
+Returns the mutex whose index is the mutex number of the fold value.
+@return	mutex */
+UNIV_INLINE
+mutex_t*
+hash_get_mutex(
+/*===========*/
+	hash_table_t*	table,	/*!< in: hash table */
+	ulint		fold)	/*!< in: fold */
+{
+	ulint	mutex_no;
+
+	ut_ad(table);
+	ut_ad(table->magic_n == HASH_TABLE_MAGIC_N);
+
+	mutex_no = hash_get_mutex_no(table, fold);
+
+	return(hash_get_nth_mutex(table, mutex_no));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/ibuf0ibuf.h b/storage/xtradb/include/ibuf0ibuf.h
new file mode 100644
index 00000000000..8aa21fb9d95
--- /dev/null
+++ b/storage/xtradb/include/ibuf0ibuf.h
@@ -0,0 +1,383 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.h
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0ibuf_h
+#define ibuf0ibuf_h
+
+#include "univ.i"
+
+#include "mtr0mtr.h"
+#include "dict0mem.h"
+#include "fsp0fsp.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "ibuf0types.h"
+
+/** Combinations of operations that can be buffered.  Because the enum
+values are used for indexing innobase_change_buffering_values[], they
+should start at 0 and there should not be any gaps. */
+typedef enum {
+	IBUF_USE_NONE = 0,
+	IBUF_USE_INSERT,	/* insert */
+
+	IBUF_USE_COUNT		/* number of entries in ibuf_use_t */
+} ibuf_use_t;
+
+/** Operations that can currently be buffered. */
+extern ibuf_use_t	ibuf_use;
+
+/** The insert buffer control structure */
+extern ibuf_t*		ibuf;
+
+/* The purpose of the insert buffer is to reduce random disk access.
+When we wish to insert a record into a non-unique secondary index and
+the B-tree leaf page where the record belongs to is not in the buffer
+pool, we insert the record into the insert buffer B-tree, indexed by
+(space_id, page_no).  When the page is eventually read into the buffer
+pool, we look up the insert buffer B-tree for any modifications to the
+page, and apply these upon the completion of the read operation.  This
+is called the insert buffer merge. */
+
+/* The insert buffer merge must always succeed.  To guarantee this,
+the insert buffer subsystem keeps track of the free space in pages for
+which it can buffer operations.  Two bits per page in the insert
+buffer bitmap indicate the available space in coarse increments.  The
+free bits in the insert buffer bitmap must never exceed the free space
+on a page.  It is safe to decrement or reset the bits in the bitmap in
+a mini-transaction that is committed before the mini-transaction that
+affects the free space.  It is unsafe to increment the bits in a
+separately committed mini-transaction, because in crash recovery, the
+free bits could momentarily be set too high. */
+
+/******************************************************************//**
+Creates the insert buffer data structure at a database startup and
+initializes the data structures for the insert buffer of each tablespace. */
+UNIV_INTERN
+void
+ibuf_init_at_db_start(void);
+/*=======================*/
+/*********************************************************************//**
+Reads the biggest tablespace id from the high end of the insert buffer
+tree and updates the counter in fil_system. */
+UNIV_INTERN
+void
+ibuf_update_max_tablespace_id(void);
+/*===============================*/
+/*********************************************************************//**
+Initializes an ibuf bitmap page. */
+UNIV_INTERN
+void
+ibuf_bitmap_page_init(
+/*==================*/
+	buf_block_t*	block,	/*!< in: bitmap page */
+	mtr_t*		mtr);	/*!< in: mtr */
+/************************************************************************//**
+Resets the free bits of the page in the ibuf bitmap. This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to decrement or reset the bits in the bitmap in a mini-transaction
+that is committed before the mini-transaction that affects the free
+space. */
+UNIV_INTERN
+void
+ibuf_reset_free_bits(
+/*=================*/
+	buf_block_t*	block);	/*!< in: index page; free bits are set to 0
+				if the index is a non-clustered
+				non-unique, and page level is 0 */
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more.  This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high.  It is only safe to use this function for
+decrementing the free bits.  Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: index page to which we have added new
+				records; the free bits are updated if the
+				index is non-clustered and non-unique and
+				the page level is 0, and the page becomes
+				fuller */
+	ulint		max_ins_size,/*!< in: value of maximum insert size with
+				reorganize before the latest operation
+				performed to the page */
+	ulint		increase);/*!< in: upper limit for the additional space
+				used in the latest operation, if known, or
+				ULINT_UNDEFINED */
+/**********************************************************************//**
+Updates the free bits for an uncompressed page to reflect the present
+state.  Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_low(
+/*======================*/
+	const buf_block_t*	block,		/*!< in: index page */
+	ulint			max_ins_size,	/*!< in: value of
+						maximum insert size
+						with reorganize before
+						the latest operation
+						performed to the page */
+	mtr_t*			mtr);		/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for a compressed page to reflect the present
+state.  Does this in the mtr given, which means that the latching
+order rules virtually prevent any further operations for this OS
+thread until mtr is committed.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is safe
+to set the free bits in the same mini-transaction that updated the
+page. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_zip(
+/*======================*/
+	buf_block_t*	block,	/*!< in/out: index page */
+	mtr_t*		mtr);	/*!< in/out: mtr */
+/**********************************************************************//**
+Updates the free bits for the two pages to reflect the present state.
+Does this in the mtr given, which means that the latching order rules
+virtually prevent any further operations until mtr is committed.
+NOTE: The free bits in the insert buffer bitmap must never exceed the
+free space on a page.  It is safe to set the free bits in the same
+mini-transaction that updated the pages. */
+UNIV_INTERN
+void
+ibuf_update_free_bits_for_two_pages_low(
+/*====================================*/
+	ulint		zip_size,/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	buf_block_t*	block1,	/*!< in: index page */
+	buf_block_t*	block2,	/*!< in: index page */
+	mtr_t*		mtr);	/*!< in: mtr */
+/**********************************************************************//**
+A basic partial test if an insert to the insert buffer could be possible and
+recommended. */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+	dict_index_t*	index,			/*!< in: index where to insert */
+	ulint		ignore_sec_unique);	/*!< in: if != 0, we should
+						ignore UNIQUE constraint on
+						a secondary index when we
+						decide */
+/******************************************************************//**
+Returns TRUE if the current OS thread is performing an insert buffer
+routine.
+
+For instance, a read-ahead of non-ibuf pages is forbidden by threads
+that are executing an insert buffer routine.
+@return TRUE if inside an insert buffer routine */
+UNIV_INTERN
+ibool
+ibuf_inside(void);
+/*=============*/
+/***********************************************************************//**
+Checks if a page address is an ibuf bitmap page (level 3 page) address.
+@return	TRUE if a bitmap page */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no);/*!< in: page number */
+/***********************************************************************//**
+Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
+Must not be called when recv_no_ibuf_operations==TRUE.
+@return	TRUE if level 2 or level 3 page */
+UNIV_INTERN
+ibool
+ibuf_page(
+/*======*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	page_no,/*!< in: page number */
+	mtr_t*	mtr);	/*!< in: mtr which will contain an x-latch to the
+			bitmap page if the page is not one of the fixed
+			address ibuf pages, or NULL, in which case a new
+			transaction is created. */
+/***********************************************************************//**
+Frees excess pages from the ibuf free list. This function is called when an OS
+thread calls fsp services to allocate a new file segment, or a new page to a
+file segment, and the thread did not own the fsp latch before this call. */
+UNIV_INTERN
+void
+ibuf_free_excess_pages(void);
+/*========================*/
+/*********************************************************************//**
+Makes an index insert to the insert buffer, instead of directly to the disk
+page, if this is possible. Does not do insert if the index is clustered
+or unique.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+ibuf_insert(
+/*========*/
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	dict_index_t*	index,	/*!< in: index where to insert */
+	ulint		space,	/*!< in: space id where to insert */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint		page_no,/*!< in: page number where to insert */
+	que_thr_t*	thr);	/*!< in: query thread */
+/*********************************************************************//**
+When an index page is read from a disk to the buffer pool, this function
+inserts to the page the possible index entries buffered in the insert buffer.
+The entries are deleted from the insert buffer. If the page is not read, but
+created in the buffer pool, this function deletes its buffered entries from
+the insert buffer; there can exist entries for such a page if the page
+belonged to an index which subsequently was dropped. */
+UNIV_INTERN
+void
+ibuf_merge_or_delete_for_page(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: if page has been read from
+				disk, pointer to the page x-latched,
+				else NULL */
+	ulint		space,	/*!< in: space id of the index page */
+	ulint		page_no,/*!< in: page number of the index page */
+	ulint		zip_size,/*!< in: compressed page size in bytes,
+				or 0 */
+	ibool		update_ibuf_bitmap);/*!< in: normally this is set
+				to TRUE, but if we have deleted or are
+				deleting the tablespace, then we
+				naturally do not want to update a
+				non-existent bitmap page */
+/*********************************************************************//**
+Deletes all entries in the insert buffer for a given space id. This is used
+in DISCARD TABLESPACE and IMPORT TABLESPACE.
+NOTE: this does not update the page free bitmaps in the space. The space will
+become CORRUPT when you call this function! */
+UNIV_INTERN
+void
+ibuf_delete_for_discarded_space(
+/*============================*/
+	ulint	space);	/*!< in: space id */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract(
+/*==========*/
+	ibool	sync);	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+/*********************************************************************//**
+Contracts insert buffer trees by reading pages to the buffer pool.
+@return a lower limit for the combined size in bytes of entries which
+will be merged from ibuf trees to the pages read, 0 if ibuf is
+empty */
+UNIV_INTERN
+ulint
+ibuf_contract_for_n_pages(
+/*======================*/
+	ibool	sync,	/*!< in: TRUE if the caller wants to wait for the
+			issued read with the highest tablespace address
+			to complete */
+	ulint	n_pages);/*!< in: try to read at least this many pages to
+			the buffer pool and merge the ibuf contents to
+			them */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses a redo log record of an ibuf bitmap page init.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+ibuf_parse_bitmap_init(
+/*===================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: block or NULL */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+#ifndef UNIV_HOTBACKUP
+#ifdef UNIV_IBUF_COUNT_DEBUG
+/******************************************************************//**
+Gets the ibuf count for a given page.
+@return number of entries in the insert buffer currently buffered for
+this page */
+UNIV_INTERN
+ulint
+ibuf_count_get(
+/*===========*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no);/*!< in: page number */
+#endif
+/******************************************************************//**
+Looks if the insert buffer is empty.
+@return	TRUE if empty */
+UNIV_INTERN
+ibool
+ibuf_is_empty(void);
+/*===============*/
+/******************************************************************//**
+Prints info of ibuf. */
+UNIV_INTERN
+void
+ibuf_print(
+/*=======*/
+	FILE*	file);	/*!< in: file where to print */
+/******************************************************************//**
+Closes insert buffer and frees the data structures. */
+UNIV_INTERN
+void
+ibuf_close(void);
+/*============*/
+
+#define IBUF_HEADER_PAGE_NO	FSP_IBUF_HEADER_PAGE_NO
+#define IBUF_TREE_ROOT_PAGE_NO	FSP_IBUF_TREE_ROOT_PAGE_NO
+
+#endif /* !UNIV_HOTBACKUP */
+
+/* The ibuf header page currently contains only the file segment header
+for the file segment from which the pages for the ibuf tree are allocated */
+#define IBUF_HEADER		PAGE_DATA
+#define	IBUF_TREE_SEG_HEADER	0	/* fseg header for ibuf tree */
+
+/* The insert buffer tree itself is always located in space 0. */
+#define IBUF_SPACE_ID		0
+
+#ifndef UNIV_NONINL
+#include "ibuf0ibuf.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ibuf0ibuf.ic b/storage/xtradb/include/ibuf0ibuf.ic
new file mode 100644
index 00000000000..15bbe61ab30
--- /dev/null
+++ b/storage/xtradb/include/ibuf0ibuf.ic
@@ -0,0 +1,327 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0ibuf.ic
+Insert buffer
+
+Created 7/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "page0page.h"
+#include "page0zip.h"
+#ifndef UNIV_HOTBACKUP
+#include "buf0lru.h"
+
+/** Counter for ibuf_should_try() */
+extern ulint	ibuf_flush_count;
+
+/** An index page must contain at least UNIV_PAGE_SIZE /
+IBUF_PAGE_SIZE_PER_FREE_SPACE bytes of free space for ibuf to try to
+buffer inserts to this page.  If there is this much of free space, the
+corresponding bits are set in the ibuf bitmap. */
+#define IBUF_PAGE_SIZE_PER_FREE_SPACE	32
+
+/** Insert buffer control struct: size bookkeeping and operation counters */
+struct ibuf_struct{
+	ulint		size;		/*!< current size of the ibuf index
+					tree, in pages */
+	ulint		max_size;	/*!< recommended maximum size of the
+					ibuf index tree, in pages */
+	ulint		seg_size;	/*!< number of pages allocated to the
+					file segment containing the ibuf
+					header and tree */
+	ibool		empty;		/*!< emptiness flag: set to FALSE
+					after an insert to the ibuf tree
+					is performed, and set to TRUE
+					when a contract operation finds
+					the tree empty */
+	ulint		free_list_len;	/*!< length of the free list */
+	ulint		height;		/*!< tree height */
+	dict_index_t*	index;		/*!< insert buffer index */
+
+	ulint		n_inserts;	/*!< number of inserts made to
+					the insert buffer */
+	ulint		n_merges;	/*!< number of pages merged */
+	ulint		n_merged_recs;	/*!< number of records merged */
+};
+
+/************************************************************************//**
+Sets the free bits of the page in the ibuf bitmap. This is done in a separate
+mini-transaction, hence this operation does not restrict further work to only
+ibuf bitmap operations, which would result if the latch to the bitmap page
+were kept. */
+UNIV_INTERN
+void
+ibuf_set_free_bits_func(
+/*====================*/
+	buf_block_t*	block,	/*!< in: index page of a non-clustered index;
+				free bit is reset if page level is 0 */
+#ifdef UNIV_IBUF_DEBUG
+	ulint		max_val,/*!< in: ULINT_UNDEFINED or a maximum
+				value which the bits must have before
+				setting; this is for debugging */
+#endif /* UNIV_IBUF_DEBUG */
+	ulint		val);	/*!< in: value to set: < 4 */
+#ifdef UNIV_IBUF_DEBUG
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,max,v)
+#else /* UNIV_IBUF_DEBUG */
+# define ibuf_set_free_bits(b,v,max) ibuf_set_free_bits_func(b,v)
+#endif /* UNIV_IBUF_DEBUG */
+
+/**********************************************************************//**
+A basic partial test of whether an insert to the insert buffer could be
+possible and recommended. @return TRUE if buffering may be tried */
+UNIV_INLINE
+ibool
+ibuf_should_try(
+/*============*/
+	dict_index_t*	index,			/*!< in: index where to insert */
+	ulint		ignore_sec_unique)	/*!< in: if != 0, we should
+						ignore UNIQUE constraint on
+						a secondary index when we
+						decide */
+{
+	if (ibuf_use != IBUF_USE_NONE
+	    && !dict_index_is_clust(index)
+	    && (ignore_sec_unique || !dict_index_is_unique(index))) {
+		/* Count each positive answer in the global counter. */
+		ibuf_flush_count++;
+		/* When the counter is a multiple of 4, also try the LRU. */
+		if (ibuf_flush_count % 4 == 0) {
+
+			buf_LRU_try_free_flushed_blocks();
+		}
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/***********************************************************************//**
+Checks whether a page number is the number of an ibuf bitmap page.
+@return	TRUE if (page_no & (page size - 1)) == FSP_IBUF_BITMAP_OFFSET */
+UNIV_INLINE
+ibool
+ibuf_bitmap_page(
+/*=============*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	page_no)/*!< in: page number */
+{
+	ut_ad(ut_is_2pow(zip_size));
+	/* zip_size == 0 selects UNIV_PAGE_SIZE as the mask base. */
+	if (!zip_size) {
+		return(UNIV_UNLIKELY((page_no & (UNIV_PAGE_SIZE - 1))
+				     == FSP_IBUF_BITMAP_OFFSET));
+	}
+	/* Same test with the compressed page size as the mask base. */
+	return(UNIV_UNLIKELY((page_no & (zip_size - 1))
+			     == FSP_IBUF_BITMAP_OFFSET));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return	value for ibuf bitmap bits, in the range 0..3 */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_bits(
+/*===========================*/
+	ulint	zip_size,	/*!< in: compressed page size in bytes;
+				0 for uncompressed pages */
+	ulint	max_ins_size)	/*!< in: maximum insert size after reorganize
+				for the page */
+{
+	ulint	n;
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	/* Count whole (page size / IBUF_PAGE_SIZE_PER_FREE_SPACE) units. */
+	if (zip_size) {
+		n = max_ins_size
+			/ (zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	} else {
+		n = max_ins_size
+			/ (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	}
+	/* Exactly 3 units is reported as 2 ... */
+	if (n == 3) {
+		n = 2;
+	}
+	/* ... and 4 or more units is reported as 3. */
+	if (n > 3) {
+		n = 3;
+	}
+
+	return(n);
+}
+
+/*********************************************************************//**
+Translates the ibuf free bits to the free space on a page in bytes.
+@return	maximum insert size after reorganize for the page */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_from_bits(
+/*================================*/
+	ulint	zip_size,/*!< in: compressed page size in bytes;
+			0 for uncompressed pages */
+	ulint	bits)	/*!< in: value for ibuf bitmap bits; must be < 4 */
+{
+	ut_ad(bits < 4);
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(!zip_size || zip_size > IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+	/* One unit is (page size / IBUF_PAGE_SIZE_PER_FREE_SPACE) bytes. */
+	if (zip_size) {
+		if (bits == 3) {
+			return(4 * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+		}
+		/* bits < 3 here: that many units (bits == 3 gave 4). */
+		return(bits * zip_size / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	}
+
+	if (bits == 3) {
+		return(4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE);
+	}
+
+	return(bits * (UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE));
+}
+
+/*********************************************************************//**
+Translates the free space on a compressed page to a value in the ibuf bitmap.
+@return	value for ibuf bitmap bits; 0 if the zip insert limit is negative */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free_zip(
+/*==========================*/
+	ulint			zip_size,
+					/*!< in: compressed page size in bytes */
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	ulint			max_ins_size;
+	const page_zip_des_t*	page_zip;
+	lint			zip_max_ins;
+
+	ut_ad(zip_size == buf_block_get_zip_size(block));
+	ut_ad(zip_size);
+
+	max_ins_size = page_get_max_insert_size_after_reorganize(
+		buf_block_get_frame(block), 1);
+
+	page_zip = buf_block_get_page_zip(block);
+	zip_max_ins = page_zip_max_ins_size(page_zip,
+					    FALSE/* not clustered */);
+	/* A negative zip limit yields 0; else use the smaller limit. */
+	if (UNIV_UNLIKELY(zip_max_ins < 0)) {
+		return(0);
+	} else if (UNIV_LIKELY(max_ins_size > (ulint) zip_max_ins)) {
+		max_ins_size = (ulint) zip_max_ins;
+	}
+
+	return(ibuf_index_page_calc_free_bits(zip_size, max_ins_size));
+}
+
+/*********************************************************************//**
+Translates the free space on a page to a value in the ibuf bitmap.
+@return	value for ibuf bitmap bits */
+UNIV_INLINE
+ulint
+ibuf_index_page_calc_free(
+/*======================*/
+	ulint			zip_size,/*!< in: compressed page size in bytes;
+					0 for uncompressed pages */
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	ut_ad(zip_size == buf_block_get_zip_size(block));
+	/* zip_size == 0: compute here; else use the _zip variant. */
+	if (!zip_size) {
+		ulint	max_ins_size;
+
+		max_ins_size = page_get_max_insert_size_after_reorganize(
+			buf_block_get_frame(block), 1);
+
+		return(ibuf_index_page_calc_free_bits(0, max_ins_size));
+	} else {
+		return(ibuf_index_page_calc_free_zip(zip_size, block));
+	}
+}
+
+/************************************************************************//**
+Updates the free bits of an uncompressed page in the ibuf bitmap if
+there is not enough free space on the page any more.  This is done in a
+separate mini-transaction, hence this operation does not restrict
+further work to only ibuf bitmap operations, which would result if the
+latch to the bitmap page were kept.  NOTE: The free bits in the insert
+buffer bitmap must never exceed the free space on a page.  It is
+unsafe to increment the bits in a separately committed
+mini-transaction, because in crash recovery, the free bits could
+momentarily be set too high.  It is only safe to use this function for
+decrementing the free bits.  Should more free space become available,
+we must not update the free bits here, because that would break crash
+recovery. */
+UNIV_INLINE
+void
+ibuf_update_free_bits_if_full(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: index page to which we have added new
+				records; the free bits are updated if the
+				index is non-clustered and non-unique and
+				the page level is 0, and the page becomes
+				fuller */
+	ulint		max_ins_size,/*!< in: value of maximum insert size with
+				reorganize before the latest operation
+				performed to the page */
+	ulint		increase)/*!< in: upper limit for the additional space
+				used in the latest operation, if known, or
+				ULINT_UNDEFINED */
+{
+	ulint	before;
+	ulint	after;
+
+	ut_ad(!buf_block_get_page_zip(block));
+
+	before = ibuf_index_page_calc_free_bits(0, max_ins_size);
+	/* If increase exceeds max_ins_size, recompute from the page. */
+	if (max_ins_size >= increase) {
+#if ULINT32_UNDEFINED <= UNIV_PAGE_SIZE
+# error "ULINT32_UNDEFINED <= UNIV_PAGE_SIZE"
+#endif
+		after = ibuf_index_page_calc_free_bits(0, max_ins_size
+						       - increase);
+#ifdef UNIV_IBUF_DEBUG
+		ut_a(after <= ibuf_index_page_calc_free(0, block));
+#endif
+	} else {
+		after = ibuf_index_page_calc_free(0, block);
+	}
+
+	if (after == 0) {
+		/* We move the page to the front of the buffer pool LRU list:
+		the purpose of this is to prevent those pages to which we
+		cannot make inserts using the insert buffer from slipping
+		out of the buffer pool */
+
+		buf_page_make_young(&block->page);
+	}
+	/* The bits are only ever lowered here, never raised. */
+	if (before > after) {
+		ibuf_set_free_bits(block, after, before);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/ibuf0types.h b/storage/xtradb/include/ibuf0types.h
new file mode 100644
index 00000000000..55944f879b2
--- /dev/null
+++ b/storage/xtradb/include/ibuf0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/ibuf0types.h
+Insert buffer global types
+
+Created 7/29/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef ibuf0types_h
+#define ibuf0types_h
+
+typedef	struct ibuf_struct	ibuf_t;
+
+#endif
diff --git a/storage/xtradb/include/lock0iter.h b/storage/xtradb/include/lock0iter.h
new file mode 100644
index 00000000000..25a57c9740c
--- /dev/null
+++ b/storage/xtradb/include/lock0iter.h
@@ -0,0 +1,69 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0iter.h
+Lock queue iterator type and function prototypes.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0iter_h
+#define lock0iter_h
+
+#include "univ.i"
+#include "lock0types.h"
+
+typedef struct lock_queue_iterator_struct {
+	const lock_t*	current_lock;
+	/* For a record lock queue (as opposed to a table lock queue),
+	bit_no is the number, within the heap, of the record that the
+	queue belongs to. */
+	ulint		bit_no;
+} lock_queue_iterator_t;
+
+/*******************************************************************//**
+Initialize lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+   bit_no is calculated in this function by using
+   lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+   of a wait lock. */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+	lock_queue_iterator_t*	iter,	/*!< out: iterator */
+	const lock_t*		lock,	/*!< in: lock to start from */
+	ulint			bit_no);/*!< in: record number in the
+					heap */
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, returns NULL if there are no
+more locks (i.e. the current lock is the first one). The iterator is
+moved back only if a non-NULL lock is returned.
+@return	previous lock or NULL */
+UNIV_INTERN
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+	lock_queue_iterator_t*	iter);	/*!< in/out: iterator */
+
+#endif /* lock0iter_h */
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
new file mode 100644
index 00000000000..73f885ecf04
--- /dev/null
+++ b/storage/xtradb/include/lock0lock.h
@@ -0,0 +1,829 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.h
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0lock_h
+#define lock0lock_h
+
+#include "univ.i"
+#include "buf0types.h"
+#include "trx0types.h"
+#include "mtr0types.h"
+#include "rem0types.h"
+#include "dict0types.h"
+#include "que0types.h"
+#include "lock0types.h"
+#include "read0types.h"
+#include "hash0hash.h"
+#include "ut0vec.h"
+
+#ifdef UNIV_DEBUG
+extern ibool	lock_print_waits;
+#endif /* UNIV_DEBUG */
+/* Buffer for storing information about the most recent deadlock error */
+extern FILE*	lock_latest_err_file;
+extern ulint    srv_n_lock_deadlock_count;
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return	size in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void);
+/*===============*/
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+	ulint	n_cells);	/*!< in: number of slots in lock hash table */
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void);
+/*================*/
+/*********************************************************************//**
+Checks if some transaction has an implicit x-lock on a record in a clustered
+index.
+@return	transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: user record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Gets the heap_no of the smallest user record on a page.
+@return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+	const buf_block_t*	block);	/*!< in: buffer block */
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. NOTE: we also
+copy the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and the
+record's locks were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: old index page, now
+					reorganized */
+	const buf_block_t*	oblock);/*!< in: copy of the old, not
+					reorganized page */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec);		/*!< in: record on page: this
+						is the first record moved */
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec,		/*!< in: record on page:
+						this is the first
+						record NOT copied */
+	const rec_t*		old_end);	/*!< in: old
+						previous-to-last
+						record on new_page
+						before the records
+						were copied */
+/*************************************************************//**
+Updates the lock table when a page is split to the right. */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the right. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page to
+						which merged */
+	const rec_t*		orig_succ,	/*!< in: original
+						successor of infimum
+						on the right page
+						before merge */
+	const buf_block_t*	left_block);	/*!< in: merged index
+						page which will be
+						discarded */
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert. Note that we leave lock structs on the
+root page, even though they do not make sense on other than leaf
+pages: the reason is that in a pessimistic update the infimum record
+of the root page will act as a dummy carrier of the locks of the record
+to be updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+	const buf_block_t*	block,	/*!< in: index page to which copied */
+	const buf_block_t*	root);	/*!< in: root page */
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original page
+is removed from the chain of leaf pages, except if page is the root! */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+	const buf_block_t*	new_block,	/*!< in: index page to
+						which copied */
+	const buf_block_t*	block);		/*!< in: index page;
+						NOT the root! */
+/*************************************************************//**
+Updates the lock table when a page is split to the left. */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block);	/*!< in: left page */
+/*************************************************************//**
+Updates the lock table when a page is merged to the left. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+	const buf_block_t*	left_block,	/*!< in: left page to
+						which merged */
+	const rec_t*		orig_pred,	/*!< in: original predecessor
+						of supremum on the left page
+						before merge */
+	const buf_block_t*	right_block);	/*!< in: merged index page
+						which will be discarded */
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+	const buf_block_t*	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const buf_block_t*	block,		/*!< in: block containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no);	/*!< in: heap_no of the
+						donating record */
+/*************************************************************//**
+Updates the lock table when a page is discarded. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+	const buf_block_t*	heir_block,	/*!< in: index page
+						which will inherit the locks */
+	ulint			heir_heap_no,	/*!< in: heap_no of the record
+						which will inherit the locks */
+	const buf_block_t*	block);		/*!< in: index page
+						which will be discarded */
+/*************************************************************//**
+Updates the lock table when a new user record is inserted. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: the inserted record */
+/*************************************************************//**
+Updates the lock table when a record is removed. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: the record to be removed */
+/*********************************************************************//**
+Stores on the page infimum record the explicit locks of another record.
+This function is used to store the lock state of a record when it is
+updated and the size of the record changes in the update. The record
+is in such an update moved, perhaps to another page. The infimum record
+acts as a dummy carrier record, taking care of lock releases while the
+actual record is being moved. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec);	/*!< in: record whose lock state
+					is stored on the infimum
+					record of the same page; lock
+					bits are reset on the
+					record */
+/*********************************************************************//**
+Restores the state of explicit lock requests on a single record, where the
+state was stored on the infimum of the page. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec,	/*!< in: record whose lock state
+					is restored */
+	const buf_block_t*	donator);/*!< in: page (rec is not
+					necessarily on this page)
+					whose infimum stored the lock
+					state; lock bits are reset on
+					the infimum */
+/*********************************************************************//**
+Returns TRUE if there are explicit record locks on a page.
+@return	TRUE if there are explicit record locks on the page */
+UNIV_INTERN
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no);/*!< in: page number */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate insert of
+a record. If they do, first tests if the query thread should anyway
+be suspended for some reason; if not, then puts the transaction and
+the query thread to the lock wait state and inserts a waiting request
+for a gap x-lock to the lock queue.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is
+				set, does nothing */
+	const rec_t*	rec,	/*!< in: record after which to insert */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	dict_index_t*	index,	/*!< in: index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool*		inherit);/*!< out: set to TRUE if the newly
+				inserted record may need to inherit
+				LOCK_GAP type locks from the successor
+				record */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify (update,
+delete mark, or delete unmark) of a clustered index record. If they do,
+first tests if the query thread should anyway be suspended for some
+reason; if not, then puts the transaction and the query thread to the
+lock wait state and inserts a waiting request for a record x-lock to the
+lock queue.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record which should be
+					modified */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate modify
+(delete mark or delete unmark) of a secondary index record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+				bit is set, does nothing */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	const rec_t*	rec,	/*!< in: record which should be
+				modified; NOTE: as this is a secondary
+				index, we always have to modify the
+				clustered index record first: see the
+				comment below */
+	dict_index_t*	index,	/*!< in: secondary index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+/*********************************************************************//**
+Like lock_clust_rec_read_check_and_lock(), but reads a
+secondary index record.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: secondary index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks if locks of other transactions prevent an immediate read, or passing
+over by a read cursor, of a clustered index record. If they do, first tests
+if the query thread should anyway be suspended for some reason; if not, then
+puts the transaction and the query thread to the lock wait state and inserts a
+waiting request for a record lock to the lock queue. Sets the requested mode
+lock on the record. This is an alternative version of
+lock_clust_rec_read_check_and_lock() that does not require the parameter
+"offsets".
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr);	/*!< in: query thread */
+/*********************************************************************//**
+Checks that a record is seen in a consistent read.
+@return TRUE if sees, or FALSE if an earlier version of the record
+should be retrieved */
+UNIV_INTERN
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: user record which should be read or
+				passed over by a read cursor */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	read_view_t*	view);	/*!< in: consistent read view */
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that even when FALSE is returned, the present version
+of rec may be the right one; this must be checked from the clustered
+index record.
+
+@return TRUE if certainly sees, or FALSE if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+	const rec_t*		rec,	/*!< in: user record which
+					should be read or passed over
+					by a read cursor */
+	const read_view_t*	view);	/*!< in: consistent read view */
+/*********************************************************************//**
+Locks the specified database table in the mode given. If the lock cannot
+be granted immediately, the query thread is put to wait.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_table(
+/*=======*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+				does nothing */
+	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	enum lock_mode	mode,	/*!< in: lock mode */
+	que_thr_t*	thr);	/*!< in: query thread */
+/*************************************************************//**
+Removes a granted record lock of a transaction from the queue and grants
+locks to other transactions waiting in the queue if they now are entitled
+to a lock. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+	trx_t*			trx,	/*!< in: transaction that has
+					set a record lock */
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec,	/*!< in: record */
+	enum lock_mode		lock_mode);/*!< in: LOCK_S or LOCK_X */
+/*********************************************************************//**
+Releases transaction locks, and releases possible other transactions waiting
+because of these locks. */
+UNIV_INTERN
+void
+lock_release_off_kernel(
+/*====================*/
+	trx_t*	trx);	/*!< in: transaction */
+/*********************************************************************//**
+Cancels a waiting lock request and releases possible other transactions
+waiting behind it. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock);	/*!< in: waiting lock request */
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock, that is going to be removed, is allowed to be a wait lock. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+	dict_table_t*	table,			/*!< in: table to be dropped
+						or truncated */
+	ibool		remove_also_table_sx_locks);/*!< in: also removes
+						table S and X locks */
+
+/*********************************************************************//**
+Calculates the fold value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return	folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+	__attribute__((const));
+/*********************************************************************//**
+Calculates the hash value of a page file address: used in inserting or
+searching for a lock in the hash table.
+@return	hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no);/*!< in: page number */
+
+/**********************************************************************//**
+Looks for a set bit in a record lock bitmap. Returns ULINT_UNDEFINED,
+if none found.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+none found */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+	const lock_t*	lock);	/*!< in: record lock with at least one
+				bit set */
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction.  The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	dest,	/*!< in: destination of ALTER TABLE */
+	enum lock_mode*	mode);	/*!< out: lock mode of the source table */
+/*********************************************************************//**
+Determine if the given table is exclusively "owned" by the given
+transaction, i.e., transaction holds LOCK_IX and possibly LOCK_AUTO_INC
+on the table.
+@return TRUE if table is only locked by trx, with LOCK_IX, and
+possibly LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx);	/*!< in: transaction */
+/*********************************************************************//**
+Checks if a lock request lock1 has to wait for request lock2.
+@return	TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+	const lock_t*	lock1,	/*!< in: waiting lock */
+	const lock_t*	lock2);	/*!< in: another lock; NOTE that it is
+				assumed that this has a lock bit set
+				on the same record as in lock1 if the
+				locks are record locks */
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	ibool		has_kernel_mutex);/*!< in: TRUE if the caller owns the
+					kernel mutex */
+/*********************************************************************//**
+Prints info of a table lock. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+	FILE*		file,	/*!< in: file where to print */
+	const lock_t*	lock);	/*!< in: table type lock */
+/*********************************************************************//**
+Prints info of a record lock. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+	FILE*		file,	/*!< in: file where to print */
+	const lock_t*	lock);	/*!< in: record type lock */
+/*********************************************************************//**
+Prints info of locks for all transactions.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool   nowait);/*!< in: whether to wait for the kernel mutex */
+/*********************************************************************//**
+Prints info of locks for each transaction. */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file);	/*!< in: file where to print */
+/*********************************************************************//**
+Return approximate number of record locks (bits set in the bitmap) for
+this transaction. Since delete-marked records may be removed, the
+record count will not be precise. */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+	trx_t*	trx);	/*!< in: transaction */
+/*******************************************************************//**
+Check if a transaction holds any autoinc locks.
+@return TRUE if the transaction holds any AUTOINC locks. */
+UNIV_INTERN
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+	const trx_t*	trx);		/*!< in: transaction */
+/*******************************************************************//**
+Release all the transaction's autoinc locks. */
+UNIV_INTERN
+void
+lock_release_autoinc_locks(
+/*=======================*/
+	trx_t*		trx);		/*!< in/out: transaction */
+
+/*******************************************************************//**
+Gets the type of a lock. Non-inline version for using outside of the
+lock module.
+@return	LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock.
+@return	transaction id */
+UNIV_INTERN
+ullint
+lock_get_trx_id(
+/*============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+Gets the mode of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return	lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+Gets the type of a lock in a human readable string.
+The string should not be free()'d or modified.
+@return	lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+Gets the id of the table on which the lock is.
+@return	id of the table */
+UNIV_INTERN
+ullint
+lock_get_table_id(
+/*==============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+Gets the name of the table on which the lock is.
+The string should not be free()'d or modified.
+@return	name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the index on which the lock is.
+@return	index */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is.
+The string should not be free()'d or modified.
+@return	name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the tablespace number on which the lock is.
+@return	tablespace number */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*******************************************************************//**
+For a record lock, gets the page number on which the lock is.
+@return	page number */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/** Lock modes and types */
+/* @{ */
+#define LOCK_MODE_MASK	0xFUL	/*!< mask used to extract mode from the
+				type_mode field in a lock */
+/** Lock types */
+/* @{ */
+#define LOCK_TABLE	16	/*!< table lock */
+#define	LOCK_REC	32	/*!< record lock */
+#define LOCK_TYPE_MASK	0xF0UL	/*!< mask used to extract lock type from the
+				type_mode field in a lock */
+#if LOCK_MODE_MASK & LOCK_TYPE_MASK
+# error "LOCK_MODE_MASK & LOCK_TYPE_MASK"
+#endif
+
+#define LOCK_WAIT	256	/*!< Waiting lock flag; when set, it
+				means that the lock has not yet been
+				granted, it is just waiting for its
+				turn in the wait queue */
+/* Precise modes */
+#define LOCK_ORDINARY	0	/*!< this flag denotes an ordinary
+				next-key lock in contrast to LOCK_GAP
+				or LOCK_REC_NOT_GAP */
+#define LOCK_GAP	512	/*!< when this bit is set, it means that the
+				lock holds only on the gap before the record;
+				for instance, an x-lock on the gap does not
+				give permission to modify the record on which
+				the bit is set; locks of this type are created
+				when records are removed from the index chain
+				of records */
+#define LOCK_REC_NOT_GAP 1024	/*!< this bit means that the lock is only on
+				the index record and does NOT block inserts
+				to the gap before the index record; this is
+				used in the case when we retrieve a record
+				with a unique key, and is also used in
+				locking plain SELECTs (not part of UPDATE
+				or DELETE) when the user has set the READ
+				COMMITTED isolation level */
+#define LOCK_INSERT_INTENTION 2048 /*!< this bit is set when we place a waiting
+				gap type record lock request in order to let
+				an insert of an index record to wait until
+				there are no conflicting locks by other
+				transactions on the gap; note that this flag
+				remains set when the waiting lock is granted,
+				or if the lock is inherited to a neighboring
+				record */
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_MODE_MASK
+# error
+#endif
+#if (LOCK_WAIT|LOCK_GAP|LOCK_REC_NOT_GAP|LOCK_INSERT_INTENTION)&LOCK_TYPE_MASK
+# error
+#endif
+/* @} */
+
+/** Lock operation struct: a table paired with the mode to lock it in */
+typedef struct lock_op_struct	lock_op_t;
+/** Lock operation struct: a table paired with the mode to lock it in */
+struct lock_op_struct{
+	dict_table_t*	table;	/*!< table to be locked */
+	enum lock_mode	mode;	/*!< lock mode to request on the table */
+};
+
+/** The lock system struct; see the global instance lock_sys */
+struct lock_sys_struct{
+	hash_table_t*	rec_hash;	/*!< hash table of the record locks */
+};
+
+/** The lock system */
+extern lock_sys_t*	lock_sys;
+
+
+#ifndef UNIV_NONINL
+#include "lock0lock.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/lock0lock.ic b/storage/xtradb/include/lock0lock.ic
new file mode 100644
index 00000000000..014722f51c4
--- /dev/null
+++ b/storage/xtradb/include/lock0lock.ic
@@ -0,0 +1,121 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0lock.ic
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#include "srv0srv.h"
+#include "dict0dict.h"
+#include "row0row.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "buf0buf.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "row0vers.h"
+#include "que0que.h"
+#include "btr0cur.h"
+#include "read0read.h"
+#include "log0recv.h"
+
+/*********************************************************************//**
+Folds a page file address (space, page number) into a single value that is
+used when inserting or searching for a lock in the hash table.
+@return	folded value */
+UNIV_INLINE
+ulint
+lock_rec_fold(
+/*==========*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	const ulint	fold = ut_fold_ulint_pair(space, page_no); return(fold);
+}
+
+/*********************************************************************//**
+Hashes a page file address for lookups in lock_sys->rec_hash: the address
+is first folded with lock_rec_fold() and the fold is then hashed.
+@return	hashed value */
+UNIV_INLINE
+ulint
+lock_rec_hash(
+/*==========*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	const ulint	fold = lock_rec_fold(space, page_no);
+	return(hash_calc_hash(fold, lock_sys->rec_hash));
+}
+
+/*********************************************************************//**
+Looks up the transaction id stored in a clustered index record and reports
+the owning transaction if it is still active (an implicit x-lock holder).
+@return	transaction which has the x-lock, or NULL */
+UNIV_INLINE
+trx_t*
+lock_clust_rec_some_has_impl(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: user record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	trx_id_t	modifier_id;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(page_rec_is_user_rec(rec));
+
+	modifier_id = row_get_rec_trx_id(rec, index, offsets);
+
+	if (!trx_is_active(modifier_id)) {
+		/* The modifying or inserting transaction is not active */
+
+		return(NULL);
+	}
+
+	return(trx_get_on_id(modifier_id));
+}
+
+/*********************************************************************//**
+Gets the heap_no of the record that follows the page infimum.
+@return	heap_no of smallest user record, or PAGE_HEAP_NO_SUPREMUM */
+UNIV_INLINE
+ulint
+lock_get_min_heap_no(
+/*=================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	const page_t*	frame	= block->frame;
+	const page_t*	next;
+
+	if (!page_is_comp(frame)) {
+		next = frame + rec_get_next_offs(frame + PAGE_OLD_INFIMUM,
+						 FALSE);
+
+		return(rec_get_heap_no_old(next));
+	}
+
+	next = frame + rec_get_next_offs(frame + PAGE_NEW_INFIMUM, TRUE);
+
+	return(rec_get_heap_no_new(next));
+}
diff --git a/storage/xtradb/include/lock0priv.h b/storage/xtradb/include/lock0priv.h
new file mode 100644
index 00000000000..287c151b19f
--- /dev/null
+++ b/storage/xtradb/include/lock0priv.h
@@ -0,0 +1,108 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.h
+Lock module internal structures and methods.
+
+Created July 12, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef lock0priv_h
+#define lock0priv_h
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+/* If you need to access members of the structures defined in this
+file, please write appropriate functions that retrieve them and put
+those functions in lock/ */
+#error Do not include lock0priv.h outside of the lock/ module
+#endif
+
+#include "univ.i"
+#include "dict0types.h"
+#include "hash0hash.h"
+#include "trx0types.h"
+#include "ut0lst.h"
+
+/** A table lock; stored as un_member.tab_lock inside struct lock_struct */
+typedef struct lock_table_struct	lock_table_t;
+/** A table lock; stored as un_member.tab_lock inside struct lock_struct */
+struct lock_table_struct {
+	dict_table_t*	table;		/*!< database table in dictionary
+					cache */
+	UT_LIST_NODE_T(lock_t)
+			locks;		/*!< list node linking the locks
+					set on the same table */
+};
+
+/** Record lock for a page; stored as un_member.rec_lock in lock_struct */
+typedef struct lock_rec_struct		lock_rec_t;
+/** Record lock for a page; stored as un_member.rec_lock in lock_struct */
+struct lock_rec_struct {
+	ulint	space;			/*!< space id of the page */
+	ulint	page_no;		/*!< page number of the page */
+	ulint	n_bits;			/*!< number of bits in the lock
+					bitmap; NOTE: the lock bitmap is
+					placed immediately after the
+					lock struct */
+};
+
+/** Lock struct: common header of a table lock or a record lock */
+struct lock_struct {
+	trx_t*		trx;		/*!< transaction owning the
+					lock */
+	UT_LIST_NODE_T(lock_t)
+			trx_locks;	/*!< list node linking the locks
+					of the transaction */
+	ulint		type_mode;	/*!< lock type (LOCK_TYPE_MASK),
+					mode (LOCK_MODE_MASK), LOCK_GAP or
+					LOCK_REC_NOT_GAP, LOCK_INSERT_INTENTION
+					and the LOCK_WAIT flag, ORed */
+	hash_node_t	hash;		/*!< hash chain node for a record
+					lock */
+	dict_index_t*	index;		/*!< index for a record lock */
+	union {
+		lock_table_t	tab_lock;/*!< table lock */
+		lock_rec_t	rec_lock;/*!< record lock */
+	} un_member;			/*!< lock details */
+};
+
+/*********************************************************************//**
+Gets the type of a lock.
+@return	LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+	const lock_t*	lock);	/*!< in: lock */
+
+/*********************************************************************//**
+Gets the previous record lock set on a record.
+@return	previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+	const lock_t*	in_lock,/*!< in: record lock */
+	ulint		heap_no);/*!< in: heap number of the record */
+
+#ifndef UNIV_NONINL
+#include "lock0priv.ic"
+#endif
+
+#endif /* lock0priv_h */
diff --git a/storage/xtradb/include/lock0priv.ic b/storage/xtradb/include/lock0priv.ic
new file mode 100644
index 00000000000..30447c99848
--- /dev/null
+++ b/storage/xtradb/include/lock0priv.ic
@@ -0,0 +1,49 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0priv.ic
+Lock module internal inline methods.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+/* This file contains only methods which are used in
+lock/lock0* files, other than lock/lock0lock.c.
+I.e. lock/lock0lock.c contains more internal inline
+methods but they are used only in that file. */
+
+#ifndef LOCK_MODULE_IMPLEMENTATION
+#error Do not include lock0priv.ic outside of the lock/ module
+#endif
+
+/*********************************************************************//**
+Extracts the lock type bits (LOCK_TYPE_MASK) from the type_mode of a lock.
+@return	LOCK_TABLE or LOCK_REC */
+UNIV_INLINE
+ulint
+lock_get_type_low(
+/*==============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_ad(lock);
+
+	return(LOCK_TYPE_MASK & lock->type_mode);
+}
+
+/* vim: set filetype=c: */
diff --git a/storage/xtradb/include/lock0types.h b/storage/xtradb/include/lock0types.h
new file mode 100644
index 00000000000..45f29e90fe9
--- /dev/null
+++ b/storage/xtradb/include/lock0types.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/lock0types.h
+The transaction lock system global types
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef lock0types_h
+#define lock0types_h
+
+#define lock_t ib_lock_t
+typedef struct lock_struct	lock_t;
+typedef struct lock_sys_struct	lock_sys_t;
+
+/** Basic lock modes */
+enum lock_mode {
+	LOCK_IS = 0,	/*!< intention shared */
+	LOCK_IX,	/*!< intention exclusive */
+	LOCK_S,		/*!< shared */
+	LOCK_X,		/*!< exclusive */
+	LOCK_AUTO_INC,	/*!< locks the auto-inc counter of a table
+			in an exclusive mode */
+	LOCK_NONE,	/*!< this is used elsewhere to note consistent read */
+	LOCK_NUM = LOCK_NONE/*!< number of lock modes (LOCK_NONE excluded) */
+};
+
+#endif
diff --git a/storage/xtradb/include/log0log.h b/storage/xtradb/include/log0log.h
new file mode 100644
index 00000000000..8fce4ef96bc
--- /dev/null
+++ b/storage/xtradb/include/log0log.h
@@ -0,0 +1,969 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.h
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0log_h
+#define log0log_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "ut0lst.h"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/** Redo log buffer */
+typedef struct log_struct	log_t;
+/** Redo log group */
+typedef struct log_group_struct	log_group_t;
+
+#ifdef UNIV_DEBUG
+/** Flag: write to log file? */
+extern	ibool	log_do_write;
+/** Flag: enable debug output when writing to the log? */
+extern	ibool	log_debug_writes;
+#else /* UNIV_DEBUG */
+/** Write to log */
+# define log_do_write TRUE
+#endif /* UNIV_DEBUG */
+
+/** Wait modes for log_write_up_to @{ */
+#define LOG_NO_WAIT		91
+#define LOG_WAIT_ONE_GROUP	92
+#define	LOG_WAIT_ALL_GROUPS	93
+/* @} */
+/** Maximum number of log groups in log_group_struct::checkpoint_buf */
+#define LOG_MAX_N_GROUPS	32
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Sets the global variable log_fsp_current_free_limit. Also makes a checkpoint,
+so that we know that the limit has been written to a log checkpoint field
+on disk. */
+UNIV_INTERN
+void
+log_fsp_current_free_limit_set_and_checkpoint(
+/*==========================================*/
+	ulint	limit);	/*!< in: limit to set */
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Calculates where in log files we find a specified lsn.
+@return	log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+	ib_int64_t*	log_file_offset,	/*!< out: offset in that file
+						(including the header) */
+	ib_uint64_t	first_header_lsn,	/*!< in: first log file start
+						lsn */
+	ib_uint64_t	lsn,			/*!< in: lsn whose position to
+						determine */
+	ulint		n_log_files,		/*!< in: total number of log
+						files */
+	ib_int64_t	log_file_size);		/*!< in: log file size
+						(including the header) */
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes to the log the string given. The log must be released with
+log_release.
+@return	end lsn of the log record, zero if did not succeed */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+	const void*	str,	/*!< in: string */
+	ulint		len,	/*!< in: string length */
+	ib_uint64_t*	start_lsn);/*!< out: start lsn of the log record */
+/***********************************************************************//**
+Releases the log mutex. */
+UNIV_INLINE
+void
+log_release(void);
+/*=============*/
+/***********************************************************************//**
+Checks if there is need for a log buffer flush or a new checkpoint, and does
+this if yes. Any database operation should call this when it has modified
+more than about 4 pages. NOTE that this function may only be called when the
+OS thread owns no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void);
+/*================*/
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release.
+@return	start lsn of the log record */
+UNIV_INTERN
+ib_uint64_t
+log_reserve_and_open(
+/*=================*/
+	ulint	len);	/*!< in: length of data to be catenated */
+/************************************************************//**
+Writes to the log the string given. It is assumed that the caller holds the
+log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+	byte*	str,		/*!< in: string */
+	ulint	str_len);	/*!< in: string length */
+/************************************************************//**
+Closes the log.
+@return	lsn */
+UNIV_INTERN
+ib_uint64_t
+log_close(void);
+/*===========*/
+/************************************************************//**
+Gets the current lsn.
+@return	current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void);
+/*=============*/
+/****************************************************************//**
+Gets the log group capacity. It is OK to read the value without
+holding log_sys->mutex because it is constant.
+@return	log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void);
+/*==================*/
+/******************************************************//**
+Initializes the log. */
+UNIV_INTERN
+void
+log_init(void);
+/*==========*/
+/******************************************************************//**
+Inits a log group to the log system. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+	ulint	id,			/*!< in: group id */
+	ulint	n_files,		/*!< in: number of log files */
+	ulint	file_size,		/*!< in: log file size in bytes */
+	ulint	space_id,		/*!< in: space id of the file space
+					which contains the log files of this
+					group */
+	ulint	archive_space_id);	/*!< in: space id of the file space
+					which contains some archived log
+					files for this group; currently, only
+					for the first log group this is
+					used */
+/******************************************************//**
+Completes an i/o to a log file. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+	log_group_t*	group);	/*!< in: log group */
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
+				the log should be written,
+				IB_ULONGLONG_MAX if not specified */
+	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+				or LOG_WAIT_ALL_GROUPS */
+	ibool		flush_to_disk);
+				/*!< in: TRUE if we want the written log
+				also to be flushed to disk */
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void);
+/*==========================*/
+/****************************************************************//**
+This function writes the log buffer to the log file and if 'flush'
+is set it forces a flush of the log file as well. This is meant to be
+called from background master thread only as it does not wait for
+the write (+ possible flush) to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+	ibool	flush);	/*!< in: flush the logs to disk */
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool and also may make a new checkpoint. NOTE: this function may only
+be called if the calling thread owns no synchronization objects!
+@return FALSE if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+	ib_uint64_t	new_oldest,	/*!< in: try to advance
+					oldest_modified_lsn at least
+					to this lsn */
+	ibool		sync);		/*!< in: TRUE if synchronous
+					operation is desired */
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what is lsn of the oldest
+modification in the pool, and writes information about the lsn in
+log files. Use log_make_checkpoint_at to flush also the pool.
+@return	TRUE if success, FALSE if a checkpoint write was already running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+	ibool	sync,		/*!< in: TRUE if synchronous operation is
+				desired */
+	ibool	write_always);	/*!< in: the function normally checks if the
+				new checkpoint would have a greater
+				lsn than the previous one: if not, then no
+				physical write is done; by setting this
+				parameter TRUE, a physical write will always be
+				made to log files */
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
+					later lsn, if IB_ULONGLONG_MAX, makes
+					a checkpoint at the latest lsn */
+	ibool		write_always);	/*!< in: the function normally checks if
+					the new checkpoint would have a
+					greater lsn than the previous one: if
+					not, then no physical write is done;
+					by setting this parameter TRUE, a
+					physical write will always be made to
+					log files */
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void);
+/*=======================================*/
+/******************************************************//**
+Reads a checkpoint info from a log group header to log_sys->checkpoint_buf. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+	log_group_t*	group,	/*!< in: log group */
+	ulint		field);	/*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+/*******************************************************************//**
+Gets info from a checkpoint about a log group. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+	const byte*	buf,	/*!< in: buffer containing checkpoint info */
+	ulint		n,	/*!< in: nth slot */
+	ulint*		file_no,/*!< out: archived file number */
+	ulint*		offset);/*!< out: archived file offset */
+/******************************************************//**
+Writes checkpoint info to groups. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void);
+/*==================================*/
+/********************************************************************//**
+Starts an archiving operation.
+@return	TRUE if succeed, FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+	ibool	sync,	/*!< in: TRUE if synchronous operation is desired */
+	ulint*	n_bytes);/*!< out: archive log buffer size, 0 if nothing to
+			archive */
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from a number one higher, so that the archiving will
+not write again to the archived log files which exist when this function
+returns.
+@return	DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_stop(void);
+/*==================*/
+/****************************************************************//**
+Starts again archiving which has been stopped.
+@return	DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_start(void);
+/*===================*/
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+@return	DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void);
+/*==========================*/
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files.
+@return	DB_SUCCESS or DB_ERROR */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void);
+/*========================*/
+/******************************************************//**
+Generates an archived log file name. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+	char*	buf,	/*!< in: buffer where to write */
+	ulint	id,	/*!< in: group id */
+	ulint	file_no);/*!< in: file number */
+#else /* !UNIV_HOTBACKUP */
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+	byte*		hdr_buf,/*!< in: buffer which will be written to the
+				start of the first log file */
+	ib_uint64_t	start);	/*!< in: lsn of the start of the first log file;
+				we pretend that there is a checkpoint at
+				start + LOG_BLOCK_HDR_SIZE */
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. NOTE: this
+function may only be called if the calling thread owns no synchronization
+objects! */
+UNIV_INTERN
+void
+log_check_margins(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads a specified log segment to a buffer. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
+	byte*		buf,		/*!< in: buffer where to read */
+	log_group_t*	group,		/*!< in: log group */
+	ib_uint64_t	start_lsn,	/*!< in: read area start */
+	ib_uint64_t	end_lsn);	/*!< in: read area end */
+/******************************************************//**
+Writes a buffer to a log file group. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+	log_group_t*	group,		/*!< in: log group */
+	byte*		buf,		/*!< in: buffer */
+	ulint		len,		/*!< in: buffer len; must be divisible
+					by OS_FILE_LOG_BLOCK_SIZE */
+	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+					be divisible by
+					OS_FILE_LOG_BLOCK_SIZE */
+	ulint		new_data_offset);/*!< in: start offset of new data in
+					buf: this parameter is used to decide
+					if we have to write a new log file
+					header */
+/********************************************************//**
+Sets the field values in group to correspond to a given lsn. For this function
+to work, the values must already be correctly initialized to correspond to
+some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+	log_group_t*	group,	/*!< in/out: group */
+	ib_uint64_t	lsn);	/*!< in: lsn for which the values should be
+				set */
+/******************************************************//**
+Calculates the data capacity of a log group, when the log file headers are not
+included.
+@return	capacity in bytes */
+UNIV_INTERN
+ulint
+log_group_get_capacity(
+/*===================*/
+	const log_group_t*	group);	/*!< in: log group */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets a log block flush bit.
+@return	TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Gets a log block number stored in the header.
+@return	log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Gets a log block data length.
+@return	log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Sets the log block data length. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	len);		/*!< in: data length */
+/************************************************************//**
+Calculates the checksum for a log block.
+@return	checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+	const byte*	block);	/*!< in: log block */
+/************************************************************//**
+Gets a log block checksum field value.
+@return	checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Sets a log block checksum field value. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	checksum);	/*!< in: checksum */
+/************************************************************//**
+Gets a log block first mtr log record group offset.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Sets the log block first mtr log record group offset. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	offset);	/*!< in: offset, 0 if none */
+/************************************************************//**
+Gets a log block checkpoint number field (4 lowest bytes).
+@return	checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+	const byte*	log_block);	/*!< in: log block */
+/************************************************************//**
+Initializes a log block in the log buffer. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+	byte*		log_block,	/*!< in: pointer to the log buffer */
+	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+/************************************************************//**
+Initializes a log block in the log buffer in the old, < 3.23.52 format, where
+there was no checksum yet. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+	byte*		log_block,	/*!< in: pointer to the log buffer */
+	ib_uint64_t	lsn);		/*!< in: lsn within the log block */
+/************************************************************//**
+Converts a lsn to a log block number.
+@return	log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+	ib_uint64_t	lsn);	/*!< in: lsn of a byte within the block */
+/******************************************************//**
+Prints info of the log. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+	FILE*	file);	/*!< in: file where to print */
+/******************************************************//**
+Peeks the current lsn.
+@return	TRUE if success, FALSE if could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+	ib_uint64_t*	lsn);	/*!< out: if returns TRUE, current lsn is here */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void);
+/*===================*/
+/**********************************************************//**
+Shutdown the log system but do not release all the memory. */
+UNIV_INTERN
+void
+log_shutdown(void);
+/*==============*/
+/**********************************************************//**
+Free the log system data structures. */
+UNIV_INTERN
+void
+log_mem_free(void);
+/*==============*/
+
+extern log_t*	log_sys;
+
+/* Values used as flags */
+#define LOG_FLUSH	7652559
+#define LOG_CHECKPOINT	78656949
+#ifdef UNIV_LOG_ARCHIVE
+# define LOG_ARCHIVE	11122331
+#endif /* UNIV_LOG_ARCHIVE */
+#define LOG_RECOVER	98887331
+
+/* The counting of lsn's starts from this value: this must be non-zero */
+#define LOG_START_LSN		((ib_uint64_t) (16 * OS_FILE_LOG_BLOCK_SIZE))
+
+#define LOG_BUFFER_SIZE		(srv_log_buffer_size * UNIV_PAGE_SIZE)
+#define LOG_ARCHIVE_BUF_SIZE	(srv_log_buffer_size * UNIV_PAGE_SIZE / 4)
+
+/* Offsets of a log block header */
+#define	LOG_BLOCK_HDR_NO	0	/* block number which must be > 0 and
+					is allowed to wrap around at 2G; the
+					highest bit is set to 1 if this is the
+					first log block in a log flush write
+					segment */
+#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL
+					/* mask used to get the highest bit in
+					the preceding field */
+#define	LOG_BLOCK_HDR_DATA_LEN	4	/* number of bytes of log written to
+					this block */
+#define	LOG_BLOCK_FIRST_REC_GROUP 6	/* offset of the first start of an
+					mtr log record group in this log block,
+					0 if none; if the value is the same
+					as LOG_BLOCK_HDR_DATA_LEN, it means
+					that the first rec group has not yet
+					been catenated to this log block, but
+					if it will, it will start at this
+					offset; an archive recovery can
+					start parsing the log records starting
+					from this offset in this log block,
+					if value not 0 */
+#define LOG_BLOCK_CHECKPOINT_NO	8	/* 4 lower bytes of the value of
+					log_sys->next_checkpoint_no when the
+					log block was last written to: if the
+					block has not yet been written full,
+					this value is only updated before a
+					log buffer flush */
+#define LOG_BLOCK_HDR_SIZE	12	/* size of the log block header in
+					bytes */
+
+/* Offsets of a log block trailer from the end of the block */
+#define	LOG_BLOCK_CHECKSUM	4	/* 4 byte checksum of the log block
+					contents; in InnoDB versions
+					< 3.23.52 this did not contain the
+					checksum but the same value as
+					.._HDR_NO */
+#define	LOG_BLOCK_TRL_SIZE	4	/* trailer size in bytes */
+
+/* Offsets for a checkpoint field */
+#define LOG_CHECKPOINT_NO		0
+#define LOG_CHECKPOINT_LSN		8
+#define LOG_CHECKPOINT_OFFSET		16
+#define LOG_CHECKPOINT_LOG_BUF_SIZE	20
+#define	LOG_CHECKPOINT_ARCHIVED_LSN	24
+#define	LOG_CHECKPOINT_GROUP_ARRAY	32
+
+/* For each value smaller than LOG_MAX_N_GROUPS the following 8 bytes: */
+
+#define LOG_CHECKPOINT_ARCHIVED_FILE_NO	0
+#define LOG_CHECKPOINT_ARCHIVED_OFFSET	4
+
+#define	LOG_CHECKPOINT_ARRAY_END	(LOG_CHECKPOINT_GROUP_ARRAY\
+							+ LOG_MAX_N_GROUPS * 8)
+#define LOG_CHECKPOINT_CHECKSUM_1	LOG_CHECKPOINT_ARRAY_END
+#define LOG_CHECKPOINT_CHECKSUM_2	(4 + LOG_CHECKPOINT_ARRAY_END)
+#define LOG_CHECKPOINT_FSP_FREE_LIMIT	(8 + LOG_CHECKPOINT_ARRAY_END)
+					/* current fsp free limit in
+					tablespace 0, in units of one
+					megabyte; this information is only used
+					by ibbackup to decide if it can
+					truncate unused ends of
+					non-auto-extending data files in space
+					0 */
+#define LOG_CHECKPOINT_FSP_MAGIC_N	(12 + LOG_CHECKPOINT_ARRAY_END)
+					/* this magic number tells if the
+					checkpoint contains the above field:
+					the field was added to
+					InnoDB-3.23.50 */
+#define LOG_CHECKPOINT_SIZE		(16 + LOG_CHECKPOINT_ARRAY_END)
+
+#define LOG_CHECKPOINT_FSP_MAGIC_N_VAL	1441231243
+
+/* Offsets of a log file header */
+#define LOG_GROUP_ID		0	/* log group number */
+#define LOG_FILE_START_LSN	4	/* lsn of the start of data in this
+					log file */
+#define LOG_FILE_NO		12	/* 4-byte archived log file number;
+					this field is only defined in an
+					archived log file */
+#define LOG_FILE_WAS_CREATED_BY_HOT_BACKUP 16
+					/* a 32-byte field which contains
+					the string 'ibbackup' and the
+					creation time if the log file was
+					created by ibbackup --restore;
+					when mysqld is first time started
+					on the restored database, it can
+					print helpful info for the user */
+#define	LOG_FILE_ARCH_COMPLETED	OS_FILE_LOG_BLOCK_SIZE
+					/* this 4-byte field is TRUE when
+					the writing of an archived log file
+					has been completed; this field is
+					only defined in an archived log file */
+#define LOG_FILE_END_LSN	(OS_FILE_LOG_BLOCK_SIZE + 4)
+					/* lsn where the archived log file
+					at least extends: actually the
+					archived log file may extend to a
+					later lsn, as long as it is within the
+					same log block as this lsn; this field
+					is defined only when an archived log
+					file has been completely written */
+#define LOG_CHECKPOINT_1	OS_FILE_LOG_BLOCK_SIZE
+					/* first checkpoint field in the log
+					header; we write alternately to the
+					checkpoint fields when we make new
+					checkpoints; this field is only defined
+					in the first log file of a log group */
+#define LOG_CHECKPOINT_2	(3 * OS_FILE_LOG_BLOCK_SIZE)
+					/* second checkpoint field in the log
+					header */
+#define LOG_FILE_HDR_SIZE	(4 * OS_FILE_LOG_BLOCK_SIZE)
+
+#define LOG_GROUP_OK		301
+#define LOG_GROUP_CORRUPTED	302
+
+/** Log group consists of a number of log files, each of the same size; a log
+group is implemented as a space in the sense of the module fil0fil. */
+struct log_group_struct{
+	/* The following fields are protected by log_sys->mutex */
+	ulint		id;		/*!< log group id */
+	ulint		n_files;	/*!< number of files in the group */
+	ulint		file_size;	/*!< individual log file size in bytes,
+					including the log file header */
+	ulint		space_id;	/*!< file space which implements the log
+					group */
+	ulint		state;		/*!< LOG_GROUP_OK or
+					LOG_GROUP_CORRUPTED */
+	ib_uint64_t	lsn;		/*!< lsn used to fix coordinates within
+					the log group */
+	ulint		lsn_offset;	/*!< the offset of the above lsn */
+	ulint		n_pending_writes;/*!< number of currently pending flush
+					writes for this log group */
+	byte**		file_header_bufs_ptr;/*!< unaligned buffers */
+	byte**		file_header_bufs;/*!< buffers for each file
+					header in the group */
+#ifdef UNIV_LOG_ARCHIVE
+	/*-----------------------------*/
+	byte**		archive_file_header_bufs_ptr;/*!< unaligned buffers */
+	byte**		archive_file_header_bufs;/*!< buffers for each file
+					header in the group */
+	ulint		archive_space_id;/*!< file space which
+					implements the log group
+					archive */
+	ulint		archived_file_no;/*!< file number corresponding to
+					log_sys->archived_lsn */
+	ulint		archived_offset;/*!< file offset corresponding to
+					log_sys->archived_lsn, 0 if we have
+					not yet written to the archive file
+					number archived_file_no */
+	ulint		next_archived_file_no;/*!< during an archive write,
+					until the write is completed, we
+					store the next value for
+					archived_file_no here: the write
+					completion function then sets the new
+					value to ..._file_no */
+	ulint		next_archived_offset; /*!< like the preceding field */
+#endif /* UNIV_LOG_ARCHIVE */
+	/*-----------------------------*/
+	ib_uint64_t	scanned_lsn;	/*!< used only in recovery: recovery scan
+					succeeded up to this lsn in this log
+					group */
+	byte*		checkpoint_buf_ptr;/*!< unaligned checkpoint header */
+	byte*		checkpoint_buf;	/*!< checkpoint header is written from
+					this buffer to the group */
+	UT_LIST_NODE_T(log_group_t)
+			log_groups;	/*!< list of log groups */
+};
+
+/** Redo log buffer */
+struct log_struct{
+	byte		pad[64];	/*!< padding to prevent other memory
+					update hotspots from residing on the
+					same memory cache line */
+	ib_uint64_t	lsn;		/*!< log sequence number */
+	ulint		buf_free;	/*!< first free offset within the log
+					buffer */
+#ifndef UNIV_HOTBACKUP
+	mutex_t		mutex;		/*!< mutex protecting the log */
+#endif /* !UNIV_HOTBACKUP */
+	byte*		buf_ptr;	/*!< unaligned log buffer */
+	byte*		buf;		/*!< log buffer */
+	ulint		buf_size;	/*!< log buffer size in bytes */
+	ulint		max_buf_free;	/*!< recommended maximum value of
+					buf_free, after which the buffer is
+					flushed */
+	ulint		old_buf_free;	/*!< value of buf_free when log was
+					last time opened; only in the debug
+					version */
+	ib_uint64_t	old_lsn;	/*!< value of lsn when log was
+					last time opened; only in the
+					debug version */
+	ibool		check_flush_or_checkpoint;
+					/*!< this is set to TRUE when there may
+					be need to flush the log buffer, or
+					preflush buffer pool pages, or make
+					a checkpoint; this MUST be TRUE when
+					lsn - last_checkpoint_lsn >
+					max_checkpoint_age; this flag is
+					peeked at by log_free_check(), which
+					does not reserve the log mutex */
+	UT_LIST_BASE_NODE_T(log_group_t)
+			log_groups;	/*!< log groups */
+
+#ifndef UNIV_HOTBACKUP
+	/** The fields involved in the log buffer flush @{ */
+
+	ulint		buf_next_to_write;/*!< first offset in the log buffer
+					where the byte content may not yet be
+					written to file, e.g., the start
+					offset of a log record catenated
+					later; this is advanced when a flush
+					operation is completed to all the log
+					groups */
+	ib_uint64_t	written_to_some_lsn;
+					/*!< first log sequence number not yet
+					written to any log group; for this to
+					be advanced, it is enough that the
+					write i/o has been completed for any
+					one log group */
+	ib_uint64_t	written_to_all_lsn;
+					/*!< first log sequence number not yet
+					written to some log group; for this to
+					be advanced, it is enough that the
+					write i/o has been completed for all
+					log groups.
+					Note that since InnoDB currently
+					has only one log group therefore
+					this value is redundant. Also it
+					is possible that this value
+					falls behind the
+					flushed_to_disk_lsn transiently.
+					It is appropriate to use either
+					flushed_to_disk_lsn or
+					write_lsn which are always
+					up-to-date and accurate. */
+	ib_uint64_t	write_lsn;	/*!< end lsn for the current running
+					write */
+	ulint		write_end_offset;/*!< the data in buffer has
+					been written up to this offset
+					when the current write ends:
+					this field will then be copied
+					to buf_next_to_write */
+	ib_uint64_t	current_flush_lsn;/*!< end lsn for the current running
+					write + flush operation */
+	ib_uint64_t	flushed_to_disk_lsn;
+					/*!< how far we have written the log
+					AND flushed to disk */
+	ulint		n_pending_writes;/*!< number of currently
+					pending flushes or writes */
+	/* NOTE on the 'flush' in names of the fields below: starting from
+	4.0.14, we separate the write of the log file and the actual fsync()
+	or other method to flush it to disk. The names below should really
+	be 'flush_or_write'! */
+	os_event_t	no_flush_event;	/*!< this event is in the reset state
+					when a flush or a write is running;
+					a thread should wait for this without
+					owning the log mutex, but NOTE that
+					to set or reset this event, the
+					thread MUST own the log mutex! */
+	ibool		one_flushed;	/*!< during a flush, this is
+					first FALSE and becomes TRUE
+					when one log group has been
+					written or flushed */
+	os_event_t	one_flushed_event;/*!< this event is reset when the
+					flush or write has not yet completed
+					for any log group; e.g., this means
+					that a transaction has been committed
+					when this is set; a thread should wait
+					for this without owning the log mutex,
+					but NOTE that to set or reset this
+					event, the thread MUST own the log
+					mutex! */
+	ulint		n_log_ios;	/*!< number of log i/os initiated thus
+					far */
+	ulint		n_log_ios_old;	/*!< number of log i/o's at the
+					previous printout */
+	time_t		last_printout_time;/*!< when log_print was last time
+					called */
+	/* @} */
+
+	/** Fields involved in checkpoints @{ */
+	ulint		log_group_capacity; /*!< capacity of the log group; if
+					the checkpoint age exceeds this, it is
+					a serious error because it is possible
+					we will then overwrite log and spoil
+					crash recovery */
+	ulint		max_modified_age_async;
+					/*!< when this recommended
+					value for lsn -
+					buf_pool_get_oldest_modification()
+					is exceeded, we start an
+					asynchronous preflush of pool pages */
+	ulint		max_modified_age_sync;
+					/*!< when this recommended
+					value for lsn -
+					buf_pool_get_oldest_modification()
+					is exceeded, we start a
+					synchronous preflush of pool pages */
+	ulint		adm_checkpoint_interval;
+					/*!< administrator-specified checkpoint
+					interval in terms of log growth in
+					bytes; the interval actually used by
+					the database can be smaller */
+	ulint		max_checkpoint_age_async;
+					/*!< when this checkpoint age
+					is exceeded we start an
+					asynchronous writing of a new
+					checkpoint */
+	ulint		max_checkpoint_age;
+					/*!< this is the maximum allowed value
+					for lsn - last_checkpoint_lsn when a
+					new query step is started */
+	ib_uint64_t	next_checkpoint_no;
+					/*!< next checkpoint number */
+	ib_uint64_t	last_checkpoint_lsn;
+					/*!< latest checkpoint lsn */
+	ib_uint64_t	next_checkpoint_lsn;
+					/*!< next checkpoint lsn */
+	ulint		n_pending_checkpoint_writes;
+					/*!< number of currently pending
+					checkpoint writes */
+	rw_lock_t	checkpoint_lock;/*!< this latch is x-locked when a
+					checkpoint write is running; a thread
+					should wait for this without owning
+					the log mutex */
+#endif /* !UNIV_HOTBACKUP */
+	byte*		checkpoint_buf_ptr;/* unaligned checkpoint header */
+	byte*		checkpoint_buf;	/*!< checkpoint header is read to this
+					buffer */
+	/* @} */
+#ifdef UNIV_LOG_ARCHIVE
+	/** Fields involved in archiving @{ */
+	ulint		archiving_state;/*!< LOG_ARCH_ON, LOG_ARCH_STOPPING
+					LOG_ARCH_STOPPED, LOG_ARCH_OFF */
+	ib_uint64_t	archived_lsn;	/*!< archiving has advanced to this
+					lsn */
+	ulint		max_archived_lsn_age_async;
+					/*!< recommended maximum age of
+					archived_lsn, before we start
+					asynchronous copying to the archive */
+	ulint		max_archived_lsn_age;
+					/*!< maximum allowed age for
+					archived_lsn */
+	ib_uint64_t	next_archived_lsn;/*!< during an archive write,
+					until the write is completed, we
+					store the next value for
+					archived_lsn here: the write
+					completion function then sets the new
+					value to archived_lsn */
+	ulint		archiving_phase;/*!< LOG_ARCHIVE_READ or
+					LOG_ARCHIVE_WRITE */
+	ulint		n_pending_archive_ios;
+					/*!< number of currently pending reads
+					or writes in archiving */
+	rw_lock_t	archive_lock;	/*!< this latch is x-locked when an
+					archive write is running; a thread
+					should wait for this without owning
+					the log mutex */
+	ulint		archive_buf_size;/*!< size of archive_buf */
+	byte*		archive_buf;	/*!< log segment is written to the
+					archive from this buffer */
+	os_event_t	archiving_on;	/*!< if archiving has been stopped,
+					a thread can wait for this event to
+					become signaled */
+	/* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+};
+
+#ifdef UNIV_LOG_ARCHIVE
+/** Archiving state @{ */
+#define LOG_ARCH_ON		71
+#define LOG_ARCH_STOPPING	72
+#define LOG_ARCH_STOPPING2	73
+#define LOG_ARCH_STOPPED	74
+#define LOG_ARCH_OFF		75
+/* @} */
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/log0log.ic b/storage/xtradb/include/log0log.ic
new file mode 100644
index 00000000000..1ce00fd7313
--- /dev/null
+++ b/storage/xtradb/include/log0log.ic
@@ -0,0 +1,446 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0log.ic
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "mach0data.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+	const byte*	buf,		/*!< in: pointer to the start of
+					the log segment in the
+					log_sys->buf log buffer */
+	ulint		len,		/*!< in: segment length in bytes */
+	ib_uint64_t	buf_start_lsn);	/*!< in: buffer start lsn */
+#endif /* UNIV_LOG_DEBUG */
+
+/************************************************************//**
+Tests whether the flush bit of a log block is set.
+@return	TRUE if this block was the first to be written in a log flush */
+UNIV_INLINE
+ibool
+log_block_get_flush_bit(
+/*====================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	ulint	hdr_field;
+
+	/* The flush bit is kept in the same 4-byte header field as the
+	log block number. */
+	hdr_field = mach_read_from_4(&log_block[LOG_BLOCK_HDR_NO]);
+
+	return((hdr_field & LOG_BLOCK_FLUSH_BIT_MASK) ? TRUE : FALSE);
+}
+
+/************************************************************//**
+Sets or clears the flush bit of a log block. */
+UNIV_INLINE
+void
+log_block_set_flush_bit(
+/*====================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ibool	val)		/*!< in: value to set */
+{
+	byte*	hdr_ptr;
+	ulint	hdr_val;
+
+	hdr_ptr = log_block + LOG_BLOCK_HDR_NO;
+	hdr_val = mach_read_from_4(hdr_ptr);
+
+	/* Only the flush bit changes; all other bits are written back. */
+	hdr_val = val
+		? (hdr_val | LOG_BLOCK_FLUSH_BIT_MASK)
+		: (hdr_val & ~LOG_BLOCK_FLUSH_BIT_MASK);
+	mach_write_to_4(hdr_ptr, hdr_val);
+}
+
+/************************************************************//**
+Reads the log block number from the block header, flush bit excluded.
+@return	log block number stored in the block header */
+UNIV_INLINE
+ulint
+log_block_get_hdr_no(
+/*=================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	const ulint	no_mask = ~LOG_BLOCK_FLUSH_BIT_MASK;
+	return(mach_read_from_4(&log_block[LOG_BLOCK_HDR_NO]) & no_mask);
+}
+
+/************************************************************//**
+Sets the block number in the header. NOTE: set it before the flush bit! */
+UNIV_INLINE
+void
+log_block_set_hdr_no(
+/*=================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	n)		/*!< in: log block number: must be > 0 and
+				< LOG_BLOCK_FLUSH_BIT_MASK */
+{
+	byte*	hdr_ptr = log_block + LOG_BLOCK_HDR_NO;
+
+	ut_ad(n > 0);
+	ut_ad(n < LOG_BLOCK_FLUSH_BIT_MASK);
+	mach_write_to_4(hdr_ptr, n);
+}
+
+/************************************************************//**
+Reads the data length field of a log block.
+@return	log block data length measured as a byte offset from the block start */
+UNIV_INLINE
+ulint
+log_block_get_data_len(
+/*===================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	return(mach_read_from_2(&log_block[LOG_BLOCK_HDR_DATA_LEN]));
+}
+
+/************************************************************//**
+Writes the data length field of a log block. */
+UNIV_INLINE
+void
+log_block_set_data_len(
+/*===================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	len)		/*!< in: data length */
+{
+	mach_write_to_2(&log_block[LOG_BLOCK_HDR_DATA_LEN], len);
+}
+
+/************************************************************//**
+Reads the offset of the first mtr log record group of a log block.
+@return first mtr log record group byte offset from the block start, 0
+if none */
+UNIV_INLINE
+ulint
+log_block_get_first_rec_group(
+/*==========================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	return(mach_read_from_2(&log_block[LOG_BLOCK_FIRST_REC_GROUP]));
+}
+
+/************************************************************//**
+Writes the offset of the first mtr log record group of a log block. */
+UNIV_INLINE
+void
+log_block_set_first_rec_group(
+/*==========================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	offset)		/*!< in: offset, 0 if none */
+{
+	mach_write_to_2(&log_block[LOG_BLOCK_FIRST_REC_GROUP], offset);
+}
+
+/************************************************************//**
+Reads the checkpoint number field of a log block (4 lowest bytes).
+@return	checkpoint no (4 lowest bytes) */
+UNIV_INLINE
+ulint
+log_block_get_checkpoint_no(
+/*========================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	return(mach_read_from_4(&log_block[LOG_BLOCK_CHECKPOINT_NO]));
+}
+
+/************************************************************//**
+Writes the checkpoint number field of a log block (4 lowest bytes). */
+UNIV_INLINE
+void
+log_block_set_checkpoint_no(
+/*========================*/
+	byte*		log_block,	/*!< in/out: log block */
+	ib_uint64_t	no)		/*!< in: checkpoint no */
+{
+	mach_write_to_4(&log_block[LOG_BLOCK_CHECKPOINT_NO], (ulint) no);
+}
+
+/************************************************************//**
+Maps a lsn to the number of the log block that contains it.
+@return	log block number, it is > 0 and <= 1G */
+UNIV_INLINE
+ulint
+log_block_convert_lsn_to_no(
+/*========================*/
+	ib_uint64_t	lsn)	/*!< in: lsn of a byte within the block */
+{
+	return(1 + (0x3FFFFFFFUL & (ulint) (lsn / OS_FILE_LOG_BLOCK_SIZE)));
+}
+
+/************************************************************//**
+Calculates the checksum for a log block. The last LOG_BLOCK_TRL_SIZE bytes
+of the block are left out of the sum.
+@return	checksum */
+UNIV_INLINE
+ulint
+log_block_calc_checksum(
+/*====================*/
+	const byte*	block)	/*!< in: log block */
+{
+	const ulint	n_bytes = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+	ulint		checksum = 1;
+	ulint		shift = 0;
+	ulint		pos;
+
+	for (pos = 0; pos < n_bytes; pos++) {
+		ulint	octet = (ulint) block[pos];
+
+		/* Cut the running sum to 31 bits, then add the byte
+		once as such and once shifted left. */
+		checksum = (checksum & 0x7FFFFFFFUL) + octet
+			+ (octet << shift);
+
+		/* The shift amount cycles through 0..24. */
+		shift = (shift < 24) ? shift + 1 : 0;
+	}
+
+	return(checksum);
+}
+
+/************************************************************//**
+Reads the checksum field stored near the end of a log block.
+@return	checksum */
+UNIV_INLINE
+ulint
+log_block_get_checksum(
+/*===================*/
+	const byte*	log_block)	/*!< in: log block */
+{
+	return(mach_read_from_4(
+		&log_block[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM]));
+}
+
+/************************************************************//**
+Writes the checksum field stored near the end of a log block. */
+UNIV_INLINE
+void
+log_block_set_checksum(
+/*===================*/
+	byte*	log_block,	/*!< in/out: log block */
+	ulint	checksum)	/*!< in: checksum */
+{
+	byte*	dest;
+	dest = &log_block[OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_CHECKSUM];
+	mach_write_to_4(dest, checksum);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer: sets the block number and
+marks the block as holding no log data yet. */
+UNIV_INLINE
+void
+log_block_init(
+/*===========*/
+	byte*		log_block,	/*!< in: pointer to the log buffer */
+	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+{
+	ut_ad(mutex_own(&log_sys->mutex));
+
+	/* The block number is derived from the lsn; storing it
+	overwrites the whole 4-byte header field. */
+	log_block_set_hdr_no(log_block, log_block_convert_lsn_to_no(lsn));
+
+	/* Only the header is in use; no mtr record group starts here. */
+	log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+	log_block_set_first_rec_group(log_block, 0);
+}
+
+/************************************************************//**
+Initializes a log block in the log buffer in the old format, which had
+no checksum yet: the block number is stored in the checksum field. */
+UNIV_INLINE
+void
+log_block_init_in_old_format(
+/*=========================*/
+	byte*		log_block,	/*!< in: pointer to the log buffer */
+	ib_uint64_t	lsn)		/*!< in: lsn within the log block */
+{
+	ulint	block_no;
+
+	ut_ad(mutex_own(&log_sys->mutex));
+
+	block_no = log_block_convert_lsn_to_no(lsn);
+
+	log_block_set_hdr_no(log_block, block_no);
+	/* The checksum field holds the block number in this format. */
+	log_block_set_checksum(log_block, block_no);
+	log_block_set_data_len(log_block, LOG_BLOCK_HDR_SIZE);
+	log_block_set_first_rec_group(log_block, 0);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Writes the given string to the log if it fits in the current log block.
+On success the log mutex is left held: release it with log_release.
+@return	end lsn of the log record; zero, with the mutex released, on failure */
+UNIV_INLINE
+ib_uint64_t
+log_reserve_and_write_fast(
+/*=======================*/
+	const void*	str,	/*!< in: string */
+	ulint		len,	/*!< in: string length */
+	ib_uint64_t*	start_lsn)/*!< out: start lsn, set only on success */
+{
+	ulint		data_len;
+#ifdef UNIV_LOG_LSN_DEBUG
+	/* length of the LSN pseudo-record */
+	ulint		lsn_len;
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+	mutex_enter(&log_sys->mutex);
+#ifdef UNIV_LOG_LSN_DEBUG
+	lsn_len = 1
+		+ mach_get_compressed_size(log_sys->lsn >> 32)
+		+ mach_get_compressed_size(log_sys->lsn & 0xFFFFFFFFUL);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+	data_len = len
+#ifdef UNIV_LOG_LSN_DEBUG
+		+ lsn_len
+#endif /* UNIV_LOG_LSN_DEBUG */
+		+ log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+
+	if (data_len >= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+		/* The string does not fit within the current log block
+		or the log block would become full: give up */
+
+		mutex_exit(&log_sys->mutex);
+
+		return(0);
+	}
+
+	*start_lsn = log_sys->lsn;
+
+#ifdef UNIV_LOG_LSN_DEBUG
+	{
+		/* Write the LSN pseudo-record. */
+		byte* b = &log_sys->buf[log_sys->buf_free];
+		*b++ = MLOG_LSN | (MLOG_SINGLE_REC_FLAG & *(const byte*) str);
+		/* Write the LSN in two parts,
+		as a pseudo page number and space id. */
+		b += mach_write_compressed(b, log_sys->lsn >> 32);
+		b += mach_write_compressed(b, log_sys->lsn & 0xFFFFFFFFUL);
+		ut_a(b - lsn_len == &log_sys->buf[log_sys->buf_free]);
+
+		memcpy(b, str, len);
+		len += lsn_len;
+	}
+#else /* UNIV_LOG_LSN_DEBUG */
+	memcpy(log_sys->buf + log_sys->buf_free, str, len);
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+	log_block_set_data_len((byte*) ut_align_down(log_sys->buf
+						     + log_sys->buf_free,
+						     OS_FILE_LOG_BLOCK_SIZE),
+			       data_len);
+#ifdef UNIV_LOG_DEBUG
+	log_sys->old_buf_free = log_sys->buf_free;
+	log_sys->old_lsn = log_sys->lsn;
+#endif /* UNIV_LOG_DEBUG */
+	log_sys->buf_free += len;
+
+	ut_ad(log_sys->buf_free <= log_sys->buf_size);
+
+	log_sys->lsn += len;
+
+#ifdef UNIV_LOG_DEBUG
+	log_check_log_recs(log_sys->buf + log_sys->old_buf_free,
+			   log_sys->buf_free - log_sys->old_buf_free,
+			   log_sys->old_lsn);
+#endif /* UNIV_LOG_DEBUG */
+	return(log_sys->lsn);
+}
+
+/***********************************************************************//**
+Exits the log mutex. */
+UNIV_INLINE
+void
+log_release(void)
+/*=============*/
+{
+	mutex_exit(&log_sys->mutex);
+}
+
+/************************************************************//**
+Reads the current lsn under the log mutex.
+@return	current lsn */
+UNIV_INLINE
+ib_uint64_t
+log_get_lsn(void)
+/*=============*/
+{
+	ib_uint64_t	current_lsn;
+
+	/* The log mutex is held only while log_sys->lsn is
+	copied to the local variable. */
+	mutex_enter(&log_sys->mutex);
+	current_lsn = log_sys->lsn;
+	mutex_exit(&log_sys->mutex);
+
+	return(current_lsn);
+}
+
+/************************************************************//**
+Gets the log group capacity; constant, so it is read without log_sys->mutex.
+@return	log group capacity */
+UNIV_INLINE
+ulint
+log_get_capacity(void)
+/*==================*/
+{
+	ulint	capacity = log_sys->log_group_capacity;
+	return(capacity);
+}
+
+/***********************************************************************//**
+Checks whether a log buffer flush or a new checkpoint may be needed and, if
+so, calls log_check_margins(). Any database operation should call this when
+it has modified more than about 4 pages. NOTE: the calling OS thread may own
+no synchronization objects except the dictionary mutex. */
+UNIV_INLINE
+void
+log_free_check(void)
+/*================*/
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (!log_sys->check_flush_or_checkpoint) {
+		return;
+	}
+
+	log_check_margins();
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/log0recv.h b/storage/xtradb/include/log0recv.h
new file mode 100644
index 00000000000..15065267250
--- /dev/null
+++ b/storage/xtradb/include/log0recv.h
@@ -0,0 +1,530 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.h
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef log0recv_h
+#define log0recv_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "buf0types.h"
+#include "hash0hash.h"
+#include "log0log.h"
+
+#ifdef UNIV_HOTBACKUP
+extern ibool	recv_replay_file_ops;
+
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+recv_read_cp_info_for_backup(
+/*=========================*/
+	const byte*	hdr,	/*!< in: buffer containing the log group
+				header */
+	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
+	ulint*		offset,	/*!< out: checkpoint offset in the log group */
+	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
+				1000000000 if the database is running
+				with < version 3.23.50 of InnoDB */
+	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
+	ib_uint64_t*	first_header_lsn);
+				/*!< out: lsn of the start of the
+				first log file */
+/*******************************************************************//**
+Scans the log segment and n_bytes_scanned is set to the length of valid
+log scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+	byte*		buf,		/*!< in: buffer containing log data */
+	ulint		buf_len,	/*!< in: data length in that buffer */
+	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+					we return scanned lsn */
+	ulint*		scanned_checkpoint_no,
+					/*!< in/out: 4 lowest bytes of the
+					highest scanned checkpoint number so
+					far */
+	ulint*		n_bytes_scanned);/*!< out: how much we were able to
+					scan, smaller than buf_len if log
+					data ended here */
+#endif /* UNIV_HOTBACKUP */
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return	recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void);
+/*=====================*/
+#ifdef UNIV_LOG_ARCHIVE
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return	recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void);
+/*=================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+	ibool		just_read_in,
+				/*!< in: TRUE if the i/o handler calls
+				this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+	buf_block_t*	block);	/*!< in/out: buffer block */
+#ifndef UNIV_HOTBACKUP
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri	in: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block	in/out: the buffer block
+*/
+# define recv_recover_page(jri, block)	recv_recover_page_func(jri, block)
+#else /* !UNIV_HOTBACKUP */
+/** Wrapper for recv_recover_page_func().
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+@param jri	ignored: TRUE if just read in (the i/o handler calls this for
+a freshly read page)
+@param block	in/out: the buffer block
+*/
+# define recv_recover_page(jri, block)	recv_recover_page_func(block)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+	ulint		type,		/*!< in: LOG_CHECKPOINT or
+					LOG_ARCHIVE */
+	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
+					if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
+					data files */
+	ib_uint64_t	max_flushed_lsn);/*!< in: max flushed lsn from
+					 data files */
+#ifdef UNIV_LOG_ARCHIVE
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type	in: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim	in: recover up to this log sequence number if possible
+@param min	in: minimum flushed log sequence number from data files
+@param max	in: maximum flushed log sequence number from data files
+@return	error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max)		\
+	recv_recovery_from_checkpoint_start_func(type,lim,min,max)
+#else /* UNIV_LOG_ARCHIVE */
+/** Wrapper for recv_recovery_from_checkpoint_start_func().
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@param type	ignored: LOG_CHECKPOINT or LOG_ARCHIVE
+@param lim	ignored: recover up to this log sequence number if possible
+@param min	in: minimum flushed log sequence number from data files
+@param max	in: maximum flushed log sequence number from data files
+@return	error code or DB_SUCCESS */
+# define recv_recovery_from_checkpoint_start(type,lim,min,max)		\
+	recv_recovery_from_checkpoint_start_func(min,max)
+#endif /* UNIV_LOG_ARCHIVE */
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void);
+/*======================================*/
+/********************************************************//**
+Initiates the rollback of active transactions. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void);
+/*===============================*/
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data found.  Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+	ulint		available_memory,/*!< in: we let the hash table of recs
+					to grow to this size, at the maximum */
+	ibool		store_to_hash,	/*!< in: TRUE if the records should be
+					stored to the hash table; this is set
+					to FALSE if just debug checking is
+					needed */
+	const byte*	buf,		/*!< in: buffer containing a log
+					segment or garbage */
+	ulint		len,		/*!< in: buffer length */
+	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
+	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+					groups contain contiguous log data up
+					to this lsn */
+	ib_uint64_t*	group_scanned_lsn);/*!< out: scanning succeeded up to
+					this lsn */
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+	ib_uint64_t	lsn,		/*!< in: reset to this lsn
+					rounded up to be divisible by
+					OS_FILE_LOG_BLOCK_SIZE, after
+					which we add
+					LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint		arch_log_no,	/*!< in: next archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+	ibool		new_logs_created);/*!< in: TRUE if resetting logs
+					is done at the log creation;
+					FALSE if it is done after
+					archive recovery */
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+	const char*	log_dir,	/*!< in: log file directory path */
+	ulint		n_log_files,	/*!< in: number of log files */
+	ulint		log_file_size,	/*!< in: log file size */
+	ib_uint64_t	lsn);		/*!< in: new start lsn, must be
+					divisible by OS_FILE_LOG_BLOCK_SIZE */
+#endif /* UNIV_HOTBACKUP */
+/********************************************************//**
+Creates the recovery system. */
+UNIV_INTERN
+void
+recv_sys_create(void);
+/*=================*/
+/**********************************************************//**
+Release recovery system mutexes. */
+UNIV_INTERN
+void
+recv_sys_close(void);
+/*================*/
+/********************************************************//**
+Frees the recovery system memory. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void);
+/*===================*/
+/********************************************************//**
+Inits the recovery system for a recovery operation. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+	ulint	available_memory);	/*!< in: available memory in bytes */
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Reset the state of the recovery system variables. */
+UNIV_INTERN
+void
+recv_sys_var_init(void);
+/*===================*/
+#endif /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+	ibool	allow_ibuf);	/*!< in: if TRUE, also ibuf operations are
+				allowed during the application; if FALSE,
+				no ibuf operations are allowed, and after
+				the application all file pages are flushed to
+				disk and invalidated in buffer pool: this
+				alternative means that no new log records
+				can be generated during the application */
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Applies log records in the hash table to a backup. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void);
+/*================================*/
+#endif
+#ifdef UNIV_LOG_ARCHIVE
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn field from the
+					data files */
+	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn if
+					possible */
+	ulint		first_log_no);	/*!< in: number of the first archived
+					log file to use in the recovery; the
+					file will be searched from
+					INNOBASE_LOG_ARCH_DIR specified in
+					server config file */
+/********************************************************//**
+Completes recovery from archive. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void);
+/*===================================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** Block of log record data */
+typedef struct recv_data_struct	recv_data_t;
+/** Block of log record data */
+struct recv_data_struct{
+	recv_data_t*	next;	/*!< pointer to the next block or NULL */
+				/* the log record data is stored physically
+				immediately after this struct, max amount
+				RECV_DATA_BLOCK_SIZE bytes of it */
+};
+
+/** Stored log record struct */
+typedef struct recv_struct	recv_t;
+/** Stored log record struct */
+struct recv_struct{
+	byte		type;	/*!< log record type */
+	ulint		len;	/*!< log record body length in bytes */
+	recv_data_t*	data;	/*!< chain of blocks containing the log record
+				body */
+	ib_uint64_t	start_lsn;/*!< start lsn of the log segment written by
+				the mtr which generated this log record: NOTE
+				that this is not necessarily the start lsn of
+				this log record */
+	ib_uint64_t	end_lsn;/*!< end lsn of the log segment written by
+				the mtr which generated this log record: NOTE
+				that this is not necessarily the end lsn of
+				this log record */
+	UT_LIST_NODE_T(recv_t)
+			rec_list;/*!< node in this page's log record list */
+};
+
+/** Recovery states of a page (recv_addr_struct::state) */
+enum recv_addr_state {
+	/** not yet processed */
+	RECV_NOT_PROCESSED,
+	/** page is being read */
+	RECV_BEING_READ,
+	/** log records are being applied on the page */
+	RECV_BEING_PROCESSED,
+	/** log records have been applied on the page, or they have
+	been discarded because the tablespace does not exist */
+	RECV_PROCESSED
+};
+
+/** Hashed page file address struct */
+typedef struct recv_addr_struct	recv_addr_t;
+/** Hashed page file address struct */
+struct recv_addr_struct{
+	enum recv_addr_state state;
+				/*!< recovery state of the page */
+	unsigned	space:32;/*!< space id */
+	unsigned	page_no:32;/*!< page number */
+	UT_LIST_BASE_NODE_T(recv_t)
+			rec_list;/*!< base of this page's log record list */
+	hash_node_t	addr_hash;/*!< hash node in the hash bucket chain */
+};
+
+/** Recovery system data structure */
+typedef struct recv_sys_struct	recv_sys_t;
+/** Recovery system data structure */
+struct recv_sys_struct{
+#ifndef UNIV_HOTBACKUP
+	mutex_t		mutex;	/*!< mutex protecting the fields apply_log_recs,
+				n_addrs, and the state field in each recv_addr
+				struct */
+#endif /* !UNIV_HOTBACKUP */
+	ibool		apply_log_recs;
+				/*!< this is TRUE when log rec application to
+				pages is allowed; this flag tells the
+				i/o-handler if it should do log record
+				application */
+	ibool		apply_batch_on;
+				/*!< this is TRUE when a log rec application
+				batch is running */
+	ib_uint64_t	lsn;	/*!< log sequence number */
+	ulint		last_log_buf_size;
+				/*!< size of the log buffer when the database
+				last time wrote to the log */
+	byte*		last_block;
+				/*!< possible incomplete last recovered log
+				block */
+	byte*		last_block_buf_start;
+				/*!< the nonaligned start address of the
+				preceding buffer */
+	byte*		buf;	/*!< buffer for parsing log records */
+	ulint		len;	/*!< amount of data in buf */
+	ib_uint64_t	parse_start_lsn;
+				/*!< this is the lsn from which we were able to
+				start parsing log records and adding them to
+				the hash table; zero if a suitable
+				start point not found yet */
+	ib_uint64_t	scanned_lsn;
+				/*!< the log data has been scanned up to this
+				lsn */
+	ulint		scanned_checkpoint_no;
+				/*!< the log data has been scanned up to this
+				checkpoint number (lowest 4 bytes) */
+	ulint		recovered_offset;
+				/*!< start offset of non-parsed log records in
+				buf */
+	ib_uint64_t	recovered_lsn;
+				/*!< the log records have been parsed up to
+				this lsn */
+	ib_uint64_t	limit_lsn;/*!< recovery should be made at most
+				up to this lsn */
+	ibool		found_corrupt_log;
+				/*!< this is set to TRUE if we during log
+				scan find a corrupt log block, or a corrupt
+				log record, or there is a log parsing
+				buffer overflow */
+#ifdef UNIV_LOG_ARCHIVE
+	log_group_t*	archive_group;
+				/*!< in archive recovery: the log group whose
+				archive is read */
+#endif /* UNIV_LOG_ARCHIVE */
+	mem_heap_t*	heap;	/*!< memory heap of log records and file
+				addresses*/
+	hash_table_t*	addr_hash;/*!< hash table of file addresses of pages */
+	ulint		n_addrs;/*!< number of not processed hashed file
+				addresses in the hash table */
+
+/* The following defines are copies: if you modify the originals in the
+   files named below, you must also modify these copies to match. */
+/* defined in os0file.c */
+#define OS_AIO_MERGE_N_CONSECUTIVE	64
+/* defined in log0recv.c */
+#define RECV_READ_AHEAD_AREA	32
+	time_t		stats_recv_start_time;
+	ulint		stats_recv_turns;
+
+	ulint		stats_read_requested_pages;
+	ulint		stats_read_in_area[RECV_READ_AHEAD_AREA];
+
+	ulint		stats_read_io_pages;
+	ulint		stats_read_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
+	ulint		stats_write_io_pages;
+	ulint		stats_write_io_consecutive[OS_AIO_MERGE_N_CONSECUTIVE];
+
+	ulint		stats_doublewrite_check_pages;
+	ulint		stats_doublewrite_overwrite_pages;
+
+	ulint		stats_recover_pages_with_read;
+	ulint		stats_recover_pages_without_read;
+
+	ulint		stats_log_recs;
+	ulint		stats_log_len_sum;
+
+	ulint		stats_applied_log_recs;
+	ulint		stats_applied_log_len_sum;
+	ulint		stats_pages_already_new;
+
+	ib_uint64_t	stats_oldest_modified_lsn;
+	ib_uint64_t	stats_newest_modified_lsn;
+};
+
+/** The recovery system */
+extern recv_sys_t*	recv_sys;
+
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise.  Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+extern ibool		recv_recovery_on;
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+extern ibool		recv_no_ibuf_operations;
+/** TRUE when recv_init_crash_recovery() has been called. */
+extern ibool		recv_needed_recovery;
+#ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+extern ibool		recv_no_log_write;
+#endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future.  Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+extern ibool		recv_lsn_checks_on;
+#ifdef UNIV_HOTBACKUP
+/** TRUE when the redo log is being backed up */
+extern ibool		recv_is_making_a_backup;
+#endif /* UNIV_HOTBACKUP */
+/** Maximum page number encountered in the redo log */
+extern ulint		recv_max_parsed_page_no;
+
+/** Size of the parsing buffer; it must accommodate RECV_SCAN_SIZE many
+times! */
+#define RECV_PARSING_BUF_SIZE	(2 * 1024 * 1024)
+
+/** Size of block reads when the log groups are scanned forward to do a
+roll-forward */
+#define RECV_SCAN_SIZE		(4 * UNIV_PAGE_SIZE)
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database. */
+extern ulint	recv_n_pool_free_frames;
+
+#ifndef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/log0recv.ic b/storage/xtradb/include/log0recv.ic
new file mode 100644
index 00000000000..0a8e55b96fa
--- /dev/null
+++ b/storage/xtradb/include/log0recv.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/log0recv.ic
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "univ.i"
+
+/*******************************************************************//**
+Returns TRUE if recovery is currently running.
+@return	recv_recovery_on */
+UNIV_INLINE
+ibool
+recv_recovery_is_on(void)
+/*=====================*/
+{
+	return(UNIV_UNLIKELY(recv_recovery_on));
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+extern ibool	recv_recovery_from_backup_on;
+
+/*******************************************************************//**
+Returns TRUE if recovery from backup is currently running.
+@return	recv_recovery_from_backup_on */
+UNIV_INLINE
+ibool
+recv_recovery_from_backup_is_on(void)
+/*=================================*/
+{
+	return(recv_recovery_from_backup_on);
+}
+#endif /* UNIV_LOG_ARCHIVE */
diff --git a/storage/xtradb/include/mach0data.h b/storage/xtradb/include/mach0data.h
new file mode 100644
index 00000000000..44ee3df22ce
--- /dev/null
+++ b/storage/xtradb/include/mach0data.h
@@ -0,0 +1,400 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.h
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef mach0data_h
+#define mach0data_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/* The data and all fields are always stored in a database file
+in the same format: ascii, big-endian, ... .
+All data in the files MUST be accessed using the functions in this
+module. */
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+	byte*	b,	/*!< in: pointer to byte where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored, >= 0, < 256 */
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return	ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to byte */
+	__attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+	byte*	b,	/*!< in: pointer to two bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored, >= 0, < 64k */
+/********************************************************//**
+The following function is used to fetch data from two consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer, >= 0, < 64k */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to two bytes */
+	__attribute__((nonnull, pure));
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return	16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+	ulint	n)	/*!< in: integer in machine-dependent format */
+	__attribute__((const));
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return	integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+	uint16	n)	/*!< in: 16-bit integer in canonical format */
+	__attribute__((const));
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+	byte*	b,	/*!< in: pointer to 3 bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 3 bytes */
+	__attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+	byte*	b,	/*!< in: pointer to four bytes where to store */
+	ulint	n);	 /*!< in: ulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to four bytes */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in a compressed form (1..5 bytes).
+@return	stored size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	ulint	n);	/*!< in: ulint integer to be stored */
+/*********************************************************//**
+Returns the size of an ulint when written in the compressed form.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+	ulint	n)	/*!< in: ulint integer to be stored */
+	__attribute__((const));
+/*********************************************************//**
+Reads a ulint in a compressed form.
+@return	read integer */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	__attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 6 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+	byte*	b,	/*!< in: pointer to 6 bytes where to store */
+	dulint	n);	 /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 6 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 6 bytes */
+	__attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 7 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+	byte*	b,	/*!< in: pointer to 7 bytes where to store */
+	dulint	n);	 /*!< in: dulint integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 7 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 7 bytes */
+	__attribute__((nonnull, pure));
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+	byte*	b,	/*!< in: pointer to 8 bytes where to store */
+	dulint	n);	/*!< in: dulint integer to be stored */
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	ib_uint64_t	n);	/*!< in: 64-bit integer to be stored */
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+	__attribute__((nonnull, pure));
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes).
+@return	size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	dulint	n);	/*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+	dulint	 n);	/*!< in: dulint integer to be stored */
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return	read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes).
+@return	size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	dulint	n);	/*!< in: dulint integer to be stored */
+/*********************************************************//**
+Returns the size of a dulint when written in the compressed form.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+	dulint	 n)	 /*!< in: dulint integer to be stored */
+	__attribute__((const));
+/*********************************************************//**
+Reads a dulint in a compressed form.
+@return	read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it.
+@return	pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+	byte*	ptr,	/*!< in: pointer to buffer from where to read */
+	byte*	end_ptr,/*!< in: pointer to end of the buffer */
+	ulint*	val);	/*!< out: read value */
+/*********************************************************//**
+Reads a dulint in a compressed form if the log record fully contains it.
+@return	pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+	byte*	ptr,	/*!< in: pointer to buffer from where to read */
+	byte*	end_ptr,/*!< in: pointer to end of the buffer */
+	dulint*	val);	/*!< out: read value */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format.
+@return	double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	double	d);	/*!< in: double */
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format.
+@return	float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	float	d);	/*!< in: float */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return	unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+	const byte*	buf,		/*!< in: from where to read */
+	ulint		buf_size)	/*!< in: from how many bytes to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	dest_size,	/*!< in: into how many bytes to write */
+	ulint	n);		/*!< in: unsigned long int to write */
+/*********************************************************//**
+Reads a ulint stored in the little-endian format.
+@return	unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+	const byte*	buf)		/*!< in: from where to read */
+	__attribute__((nonnull, pure));
+/*********************************************************//**
+Writes a ulint in the little-endian format. */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	n);		/*!< in: unsigned long int to write */
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+@return	integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	ibool		unsigned_type);	/*!< in: signed or unsigned flag */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mach0data.ic b/storage/xtradb/include/mach0data.ic
new file mode 100644
index 00000000000..96d2417ac81
--- /dev/null
+++ b/storage/xtradb/include/mach0data.ic
@@ -0,0 +1,783 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/mach0data.ic
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "ut0mem.h"
+
+/*******************************************************//**
+The following function is used to store data in one byte. */
+UNIV_INLINE
+void
+mach_write_to_1(
+/*============*/
+	byte*	b,	/*!< in: pointer to byte where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, >= 0, < 256 */
+{
+	ut_ad(b);
+	ut_ad((n | 0xFFUL) <= 0xFFUL);
+
+	b[0] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from one byte.
+@return	ulint integer, >= 0, < 256 */
+UNIV_INLINE
+ulint
+mach_read_from_1(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to byte */
+{
+	ut_ad(b);
+	return((ulint)(b[0]));
+}
+
+/*******************************************************//**
+The following function is used to store data in two consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_2(
+/*============*/
+	byte*	b,	/*!< in: pointer to two bytes where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, >= 0, < 64k */
+{
+	ut_ad(b);
+	ut_ad((n | 0xFFFFUL) <= 0xFFFFUL);	/* n must fit in 16 bits */
+
+	b[0] = (byte)(n >> 8);
+	b[1] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 2 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer, >= 0, < 64k */
+UNIV_INLINE
+ulint
+mach_read_from_2(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 2 bytes */
+{
+	return(((ulint)(b[0]) << 8) | (ulint)(b[1]));
+}
+
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+to the canonical format, for fast bytewise equality test
+against memory.
+@return	16-bit integer in canonical format */
+UNIV_INLINE
+uint16
+mach_encode_2(
+/*==========*/
+	ulint	n)	/*!< in: integer in machine-dependent format */
+{
+	uint16	ret;
+	ut_ad(2 == sizeof ret);
+	mach_write_to_2((byte*) &ret, n);
+	return(ret);
+}
+/********************************************************//**
+The following function is used to convert a 16-bit data item
+from the canonical format, for fast bytewise equality test
+against memory.
+@return	integer in machine-dependent format */
+UNIV_INLINE
+ulint
+mach_decode_2(
+/*==========*/
+	uint16	n)	/*!< in: 16-bit integer in canonical format */
+{
+	ut_ad(2 == sizeof n);
+	return(mach_read_from_2((const byte*) &n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 3 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_3(
+/*============*/
+	byte*	b,	/*!< in: pointer to 3 bytes where to store */
+	ulint	n)	/*!< in: ulint integer to be stored, >= 0, < 2^24 */
+{
+	ut_ad(b);
+	ut_ad((n | 0xFFFFFFUL) <= 0xFFFFFFUL);	/* n must fit in 24 bits */
+
+	b[0] = (byte)(n >> 16);
+	b[1] = (byte)(n >> 8);
+	b[2] = (byte)(n);
+}
+
+/********************************************************//**
+The following function is used to fetch data from 3 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer, >= 0, < 2^24 */
+UNIV_INLINE
+ulint
+mach_read_from_3(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 3 bytes */
+{
+	ut_ad(b);
+	return( ((ulint)(b[0]) << 16)
+		| ((ulint)(b[1]) << 8)
+		| (ulint)(b[2])
+		);
+}
+
+/*******************************************************//**
+The following function is used to store data in four consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_4(
+/*============*/
+	byte*	b,	/*!< in: pointer to four bytes where to store */
+	ulint	n)	/*!< in: ulint integer; its low 32 bits are stored */
+{
+	ut_ad(b);	/* no range assertion on n, unlike the 1..3 byte writers */
+
+	b[0] = (byte)(n >> 24);
+	b[1] = (byte)(n >> 16);
+	b[2] = (byte)(n >> 8);
+	b[3] = (byte)n;
+}
+
+/********************************************************//**
+The following function is used to fetch data from 4 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	ulint integer */
+UNIV_INLINE
+ulint
+mach_read_from_4(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to four bytes */
+{
+	ut_ad(b);
+	return( ((ulint)(b[0]) << 24)
+		| ((ulint)(b[1]) << 16)
+		| ((ulint)(b[2]) << 8)
+		| (ulint)(b[3])
+		);
+}
+
+/*********************************************************//**
+Writes a ulint in a compressed form where the first byte codes the
+length of the stored ulint. We look at the most significant bits of
+the byte. If the most significant bit is zero, it means 1-byte storage,
+else if the 2nd bit is 0, it means 2-byte storage, else if 3rd is 0,
+it means 3-byte storage, else if 4th is 0, it means 4-byte storage,
+else the storage is 5-byte.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_write_compressed(
+/*==================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	ulint	n)	/*!< in: ulint integer (< 2^32) to be stored */
+{
+	ut_ad(b);
+
+	if (n < 0x80UL) {
+		mach_write_to_1(b, n);
+		return(1);
+	} else if (n < 0x4000UL) {
+		mach_write_to_2(b, n | 0x8000UL);
+		return(2);
+	} else if (n < 0x200000UL) {
+		mach_write_to_3(b, n | 0xC00000UL);
+		return(3);
+	} else if (n < 0x10000000UL) {
+		mach_write_to_4(b, n | 0xE0000000UL);
+		return(4);
+	} else {
+		mach_write_to_1(b, 0xF0UL);
+		mach_write_to_4(b + 1, n);
+		return(5);
+	}
+}
+
+/*********************************************************//**
+Returns the size of a ulint when written in the compressed form.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_get_compressed_size(
+/*=====================*/
+	ulint	n)	/*!< in: ulint integer (< 2^32) to be stored */
+{
+	if (n < 0x80UL) {
+		return(1);
+	} else if (n < 0x4000UL) {
+		return(2);
+	} else if (n < 0x200000UL) {
+		return(3);
+	} else if (n < 0x10000000UL) {
+		return(4);
+	} else {
+		return(5);
+	}
+}
+
+/*********************************************************//**
+Reads a ulint in a compressed form; the first byte codes the stored length.
+@return	read integer (< 2^32) */
+UNIV_INLINE
+ulint
+mach_read_compressed(
+/*=================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	ulint	flag;	/* first byte; its leading bits select the length */
+
+	ut_ad(b);
+
+	flag = mach_read_from_1(b);
+
+	if (flag < 0x80UL) {	/* 0xxxxxxx: the byte is the value itself */
+		return(flag);
+	} else if (flag < 0xC0UL) {	/* 10xxxxxx: 2 bytes, drop marker bit */
+		return(mach_read_from_2(b) & 0x7FFFUL);
+	} else if (flag < 0xE0UL) {	/* 110xxxxx: 3 bytes, drop marker bits */
+		return(mach_read_from_3(b) & 0x3FFFFFUL);
+	} else if (flag < 0xF0UL) {	/* 1110xxxx: 4 bytes, drop marker bits */
+		return(mach_read_from_4(b) & 0x1FFFFFFFUL);
+	} else {	/* 0xF0 marker byte followed by a plain 4-byte value */
+		ut_ad(flag == 0xF0UL);
+		return(mach_read_from_4(b + 1));
+	}
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_to_8(
+/*============*/
+	byte*	b,	/*!< in: pointer to 8 bytes where to store */
+	dulint	n)	/*!< in: dulint integer to be stored */
+{
+	ut_ad(b);
+
+	mach_write_to_4(b, ut_dulint_get_high(n));
+	mach_write_to_4(b + 4, ut_dulint_get_low(n));
+}
+
+/*******************************************************//**
+The following function is used to store data in 8 consecutive
+bytes. We store the most significant byte to the lowest address. */
+UNIV_INLINE
+void
+mach_write_ull(
+/*===========*/
+	byte*		b,	/*!< in: pointer to 8 bytes where to store */
+	ib_uint64_t	n)	/*!< in: 64-bit integer to be stored */
+{
+	ut_ad(b);
+
+	mach_write_to_4(b, (ulint) (n >> 32));	/* high 32 bits go first */
+	mach_write_to_4(b + 4, (ulint) n);	/* callee stores the low 32 bits */
+}
+
+/********************************************************//**
+The following function is used to fetch data from 8 consecutive
+bytes. The most significant byte is at the lowest address.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_8(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+{
+	ulint	high;
+	ulint	low;
+
+	ut_ad(b);
+
+	high = mach_read_from_4(b);
+	low = mach_read_from_4(b + 4);
+
+	return(ut_dulint_create(high, low));
+}
+
+/********************************************************//**
+Fetches a 64-bit integer from 8 consecutive bytes. The most significant
+byte is at the lowest address.
+@return	64-bit integer */
+UNIV_INLINE
+ib_uint64_t
+mach_read_ull(
+/*==========*/
+	const byte*	b)	/*!< in: pointer to 8 bytes */
+{
+	const ib_uint64_t	upper = (ib_uint64_t) mach_read_from_4(b);
+	const ib_uint64_t	lower = (ib_uint64_t) mach_read_from_4(b + 4);
+
+	return((upper << 32) | lower);
+}
+
+/*******************************************************//**
+Stores a dulint into 7 consecutive bytes, most significant byte at the
+lowest address: the high word is written with mach_write_to_3() at b and
+the low word with mach_write_to_4() at b + 3. */
+UNIV_INLINE
+void
+mach_write_to_7(
+/*============*/
+	byte*	b,	/*!< in: pointer to 7 bytes where to store */
+	dulint	n)	/*!< in: dulint integer to be stored */
+{
+	const ulint	hi_word = ut_dulint_get_high(n);
+	const ulint	lo_word = ut_dulint_get_low(n);
+
+	ut_ad(b);
+
+	mach_write_to_3(b, hi_word);
+	mach_write_to_4(b + 3, lo_word);
+}
+
+/********************************************************//**
+Fetches a dulint from 7 consecutive bytes. The most significant byte is
+at the lowest address: b[0..2] supplies the high word and b[3..6] the
+low word.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_7(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 7 bytes */
+{
+	ut_ad(b);
+
+	return(ut_dulint_create(mach_read_from_3(b),
+				mach_read_from_4(b + 3)));
+}
+
+/*******************************************************//**
+Stores a dulint into 6 consecutive bytes, most significant byte at the
+lowest address: the high word is written with mach_write_to_2() at b and
+the low word with mach_write_to_4() at b + 2. */
+UNIV_INLINE
+void
+mach_write_to_6(
+/*============*/
+	byte*	b,	/*!< in: pointer to 6 bytes where to store */
+	dulint	n)	/*!< in: dulint integer to be stored */
+{
+	const ulint	hi_word = ut_dulint_get_high(n);
+	const ulint	lo_word = ut_dulint_get_low(n);
+
+	ut_ad(b);
+
+	mach_write_to_2(b, hi_word);
+	mach_write_to_4(b + 2, lo_word);
+}
+
+/********************************************************//**
+Fetches a dulint from 6 consecutive bytes. The most significant byte is
+at the lowest address: b[0..1] supplies the high word and b[2..5] the
+low word.
+@return	dulint integer */
+UNIV_INLINE
+dulint
+mach_read_from_6(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to 6 bytes */
+{
+	ut_ad(b);
+
+	return(ut_dulint_create(mach_read_from_2(b),
+				mach_read_from_4(b + 2)));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (5..9 bytes): the high word is
+written with mach_write_compressed(), immediately followed by the low
+word as a plain 4-byte value.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_compressed(
+/*=========================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	dulint	n)	/*!< in: dulint integer to be stored */
+{
+	byte*	low_dest;
+
+	ut_ad(b);
+
+	/* The compressed high word decides where the low word starts. */
+	low_dest = b + mach_write_compressed(b, ut_dulint_get_high(n));
+
+	mach_write_to_4(low_dest, ut_dulint_get_low(n));
+
+	return((ulint) (low_dest - b) + 4);
+}
+
+/*********************************************************//**
+Computes how many bytes mach_dulint_write_compressed() needs for a dulint:
+the compressed size of the high word plus 4 bytes for the low word.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_compressed_size(
+/*============================*/
+	dulint	 n)	/*!< in: dulint integer to be stored */
+{
+	const ulint	high_size
+		= mach_get_compressed_size(ut_dulint_get_high(n));
+
+	return(high_size + 4);
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form: a compressed high word followed
+directly by a 4-byte low word.
+@return	read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_compressed(
+/*========================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	ulint		high;
+	const byte*	low_src;
+
+	ut_ad(b);
+
+	high = mach_read_compressed(b);
+
+	/* Skip over as many bytes as the compressed high word occupies. */
+	low_src = b + mach_get_compressed_size(high);
+
+	return(ut_dulint_create(high, mach_read_from_4(low_src)));
+}
+
+/*********************************************************//**
+Writes a dulint in a compressed form (1..11 bytes). When the high word
+is zero only the compressed low word is written; otherwise a 0xFF marker
+byte is written first, then the compressed high word, then the compressed
+low word.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_write_much_compressed(
+/*==============================*/
+	byte*	b,	/*!< in: pointer to memory where to store */
+	dulint	n)	/*!< in: dulint integer to be stored */
+{
+	const ulint	high	= ut_dulint_get_high(n);
+	const ulint	low	= ut_dulint_get_low(n);
+	byte*		dest	= b;
+
+	ut_ad(b);
+
+	if (high != 0) {
+		/* Marker byte, then the compressed high word */
+		*dest++ = (byte)0xFF;
+		dest += mach_write_compressed(dest, high);
+	}
+
+	dest += mach_write_compressed(dest, low);
+
+	return((ulint) (dest - b));
+}
+
+/*********************************************************//**
+Computes how many bytes mach_dulint_write_much_compressed() needs for a
+dulint: the compressed size of the low word, plus, when the high word is
+nonzero, one marker byte and the compressed size of the high word.
+@return	compressed size in bytes */
+UNIV_INLINE
+ulint
+mach_dulint_get_much_compressed_size(
+/*=================================*/
+	dulint	 n)	/*!< in: dulint integer to be stored */
+{
+	const ulint	high	= ut_dulint_get_high(n);
+	ulint		size	= mach_get_compressed_size(
+		ut_dulint_get_low(n));
+
+	if (high != 0) {
+		size += 1 + mach_get_compressed_size(high);
+	}
+
+	return(size);
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form. If the first byte is the 0xFF
+marker, a compressed high word follows it and the compressed low word
+comes after that; otherwise the high word is 0 and the compressed low
+word starts at b.
+@return	read dulint */
+UNIV_INLINE
+dulint
+mach_dulint_read_much_compressed(
+/*=============================*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	ulint		high	= 0;
+	const byte*	low_src	= b;
+
+	ut_ad(b);
+
+	if (*b == (byte)0xFF) {
+		high = mach_read_compressed(b + 1);
+
+		/* Step over the marker and the compressed high word. */
+		low_src = b + 1 + mach_get_compressed_size(high);
+	}
+
+	return(ut_dulint_create(high, mach_read_compressed(low_src)));
+}
+#ifndef UNIV_HOTBACKUP
+/*********************************************************//**
+Reads a double. It is stored in a little-endian format, so on a
+WORDS_BIGENDIAN host the bytes are reversed while copying; otherwise
+they are copied in order.
+@return	double read */
+UNIV_INLINE
+double
+mach_double_read(
+/*=============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	double	value;
+	byte*	dst = (byte*) &value;
+	ulint	pos;
+
+	for (pos = 0; pos != sizeof(value); pos++) {
+#ifdef WORDS_BIGENDIAN
+		dst[sizeof(value) - 1 - pos] = b[pos];
+#else
+		dst[pos] = b[pos];
+#endif
+	}
+
+	return(value);
+}
+
+/*********************************************************//**
+Writes a double. It is stored in a little-endian format, so on a
+WORDS_BIGENDIAN host the bytes are reversed while copying; otherwise
+they are copied in order. */
+UNIV_INLINE
+void
+mach_double_write(
+/*==============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	double	d)	/*!< in: double */
+{
+	const byte*	src = (const byte*) &d;
+	ulint		pos;
+
+	for (pos = 0; pos != sizeof(d); pos++) {
+#ifdef WORDS_BIGENDIAN
+		b[pos] = src[sizeof(d) - 1 - pos];
+#else
+		b[pos] = src[pos];
+#endif
+	}
+}
+
+/*********************************************************//**
+Reads a float. It is stored in a little-endian format, so on a
+WORDS_BIGENDIAN host the bytes are reversed while copying; otherwise
+they are copied in order.
+@return	float read */
+UNIV_INLINE
+float
+mach_float_read(
+/*============*/
+	const byte*	b)	/*!< in: pointer to memory from where to read */
+{
+	float	value;
+	byte*	dst = (byte*) &value;
+	ulint	pos;
+
+	for (pos = 0; pos != sizeof(value); pos++) {
+#ifdef WORDS_BIGENDIAN
+		dst[sizeof(value) - 1 - pos] = b[pos];
+#else
+		dst[pos] = b[pos];
+#endif
+	}
+
+	return(value);
+}
+
+/*********************************************************//**
+Writes a float. It is stored in a little-endian format, so on a
+WORDS_BIGENDIAN host the bytes are reversed while copying; otherwise
+they are copied in order. */
+UNIV_INLINE
+void
+mach_float_write(
+/*=============*/
+	byte*	b,	/*!< in: pointer to memory where to write */
+	float	d)	/*!< in: float */
+{
+	const byte*	src = (const byte*) &d;
+	ulint		pos;
+
+	for (pos = 0; pos != sizeof(d); pos++) {
+#ifdef WORDS_BIGENDIAN
+		b[pos] = src[sizeof(d) - 1 - pos];
+#else
+		b[pos] = src[pos];
+#endif
+	}
+}
+
+/*********************************************************//**
+Reads a ulint stored in the little-endian format: buf[0] is the least
+significant byte and buf[buf_size - 1] the most significant one.
+
+buf_size must be in 1..sizeof(ulint); this is checked with ut_ad().
+Should that check be compiled out (ut_ad() is presumably a debug-only
+assertion -- confirm in ut0dbg.h), a zero buf_size now yields 0 without
+touching memory, instead of stepping below buf and never terminating.
+@return	unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_n_little_endian(
+/*===========================*/
+	const byte*	buf,		/*!< in: from where to read */
+	ulint		buf_size)	/*!< in: from how many bytes to read */
+{
+	ulint		n	= 0;
+	const byte*	ptr	= buf + buf_size;
+
+	ut_ad(buf_size <= sizeof(ulint));
+	ut_ad(buf_size > 0);
+
+	/* Walk from the most significant (last) byte down to buf[0],
+	testing the bound before each decrement. */
+	while (ptr != buf) {
+		ptr--;
+
+		n = (n << 8) + (ulint)(*ptr);
+	}
+
+	return(n);
+}
+
+/*********************************************************//**
+Writes a ulint in the little-endian format: the least significant byte
+of n goes to dest[0], the next one to dest[1], and so on for dest_size
+bytes.
+
+dest_size must be in 1..sizeof(ulint) and n must fit in dest_size bytes;
+both are checked with ut_ad(). Should those checks be compiled out
+(ut_ad() is presumably a debug-only assertion -- confirm in ut0dbg.h),
+a zero dest_size now writes nothing, instead of writing past dest
+without bound. */
+UNIV_INLINE
+void
+mach_write_to_n_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	dest_size,	/*!< in: into how many bytes to write */
+	ulint	n)		/*!< in: unsigned long int to write */
+{
+	byte*	end	= dest + dest_size;
+
+	ut_ad(dest_size <= sizeof(ulint));
+	ut_ad(dest_size > 0);
+
+	/* Emit the low-order byte, then shift the next one into place;
+	the bound is tested before each store. */
+	while (dest != end) {
+		*dest = (byte)(n & 0xFF);
+
+		n = n >> 8;
+
+		dest++;
+	}
+
+	/* Anything left over means n did not fit in dest_size bytes. */
+	ut_ad(n == 0);
+}
+
+/*********************************************************//**
+Reads a 2-byte ulint stored in the little-endian format: buf[0] is the
+low-order byte and buf[1] the high-order byte.
+@return	unsigned long int */
+UNIV_INLINE
+ulint
+mach_read_from_2_little_endian(
+/*===========================*/
+	const byte*	buf)		/*!< in: from where to read */
+{
+	const ulint	low_byte	= (ulint)(buf[0]);
+	const ulint	high_byte	= (ulint)(buf[1]);
+
+	return(low_byte | (high_byte << 8));
+}
+
+/*********************************************************//**
+Writes a ulint into 2 bytes in the little-endian format: the low-order
+byte goes to dest[0] and the next byte to dest[1]. The value must be
+below 256 * 256 (asserted with ut_ad()). */
+UNIV_INLINE
+void
+mach_write_to_2_little_endian(
+/*==========================*/
+	byte*	dest,		/*!< in: where to write */
+	ulint	n)		/*!< in: unsigned long int to write */
+{
+	ut_ad(n < 256 * 256);
+
+	dest[0] = (byte)(n & 0xFFUL);
+	dest[1] = (byte)((n >> 8) & 0xFFUL);
+}
+
+/*********************************************************//**
+Convert integral type from storage byte order (big endian) to
+host byte order.
+
+For an unsigned type the len bytes are simply assembled most significant
+byte first. For a signed type the top bit of the first stored byte is
+inverted (XOR 0x80) while reading, and if that stored bit was clear the
+bits above the first byte are filled with ones before the remaining
+bytes are shifted in.
+@return	integer value */
+UNIV_INLINE
+ullint
+mach_read_int_type(
+/*===============*/
+	const byte*	src,		/*!< in: where to read from */
+	ulint		len,		/*!< in: length of src */
+	ibool		unsigned_type)	/*!< in: signed or unsigned flag */
+{
+	/* XXX this can be optimized on big-endian machines */
+
+	ullint	ret;
+	uint	i;
+
+	if (unsigned_type) {
+		/* Plain value: no sign handling at all */
+		ret = src[0];
+	} else if (src[0] & 0x80) {
+		/* Stored top bit set: upper bytes start out as zeros */
+		ret = src[0] ^ 0x80;
+	} else {
+		/* Stored top bit clear: upper bytes start out as ones */
+		ret = 0xFFFFFFFFFFFFFF00ULL | (src[0] ^ 0x80);
+	}
+
+	for (i = 1; i < len; i++) {
+		ret = (ret << 8) | src[i];
+	}
+
+	return(ret);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/mem0dbg.h b/storage/xtradb/include/mem0dbg.h
new file mode 100644
index 00000000000..d81e1418b2b
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0dbg.h
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+/* In the debug version each allocated field is surrounded with
+check fields whose sizes are given below */
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables in mem0dbg.c. */
+extern mutex_t	mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
+#define MEM_FIELD_HEADER_SIZE	ut_calc_align(2 * sizeof(ulint),\
+						UNIV_MEM_ALIGNMENT)
+#define MEM_FIELD_TRAILER_SIZE	sizeof(ulint)
+#else
+#define MEM_FIELD_HEADER_SIZE	0
+#endif
+
+
+/* Space needed when allocating for a user a field of
+length N. The space is allocated only in multiples of
+UNIV_MEM_ALIGNMENT. In the debug version there are also
+check fields at the both ends of the field. */
+#ifdef UNIV_MEM_DEBUG
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N) + MEM_FIELD_HEADER_SIZE\
+		 + MEM_FIELD_TRAILER_SIZE, UNIV_MEM_ALIGNMENT)
+#else
+#define MEM_SPACE_NEEDED(N) ut_calc_align((N), UNIV_MEM_ALIGNMENT)
+#endif
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	byte*		top,	/*!< in: calculate and validate only until
+				this top pointer in the heap is reached,
+				if this pointer is NULL, ignored */
+	ibool		 print,	 /*!< in: if TRUE, prints the contents
+				of the heap; works only in
+				the debug version */
+	ibool*		 error,	 /*!< out: TRUE if error */
+	ulint*		us_size,/*!< out: allocated memory
+				(for the user) in the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored; in the
+				non-debug version this is always -1 */
+	ulint*		ph_size,/*!< out: physical size of the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored */
+	ulint*		n_blocks); /*!< out: number of blocks in the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored */
+/**************************************************************//**
+Validates the contents of a memory heap.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+	mem_heap_t*   heap);	/*!< in: memory heap */
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it)
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+	mem_heap_t*   heap);	/*!< in: memory heap */
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+TRUE if no memory is currently allocated.
+@return	TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void);
+/*===============*/
+/*****************************************************************//**
+Validates the dynamic memory
+@return	TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void);
+/*=========================*/
+/************************************************************//**
+Validates the dynamic memory
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void);
+/*===============*/
+#endif /* UNIV_MEM_DEBUG */
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+	void*	ptr);	/*!< in: pointer to place of possible corruption */
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers. Can only be used in the debug version. */
+UNIV_INTERN
+void
+mem_print_info(void);
+/*================*/
+/*****************************************************************//**
+Prints information of dynamic memory usage and currently allocated memory
+heaps or buffers since the last ..._print_info or ..._print_new_info. */
+UNIV_INTERN
+void
+mem_print_new_info(void);
+/*====================*/
diff --git a/storage/xtradb/include/mem0dbg.ic b/storage/xtradb/include/mem0dbg.ic
new file mode 100644
index 00000000000..b0c8178a623
--- /dev/null
+++ b/storage/xtradb/include/mem0dbg.ic
@@ -0,0 +1,109 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0dbg.ic
+The memory management: the debug code. This is not an independent
+compilation module but is included in mem0mem.*.
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+extern ulint	mem_current_allocated_memory;
+
+/******************************************************************//**
+Initializes an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+	byte*	buf,	/*!< in: memory field */
+	ulint	n);	/*!< in: how many bytes the user requested */
+/******************************************************************//**
+Erases an allocated memory field in the debug version. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+	byte*	buf,	/*!< in: memory field */
+	ulint	n);	/*!< in: how many bytes the user requested */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex BA and BE.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+	byte*	buf,	/*!< in: pointer to buffer */
+	ulint	 n);	 /*!< in: length of buffer */
+/***************************************************************//**
+Initializes a buffer to a random combination of hex DE and AD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+	byte*	buf,	/*!< in: pointer to buffer */
+	ulint	n);	/*!< in: length of buffer */
+/***************************************************************//**
+Inserts a created memory heap to the hash table of
+current allocated memory heaps.
+Initializes the hash table when first called. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+	mem_heap_t*	heap,	   /*!< in: the created heap */
+	const char*	file_name, /*!< in: file name of creation */
+	ulint		line);	   /*!< in: line where created */
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. Returns the size of the heap
+in terms of how much memory in bytes was allocated for the user of
+the heap (not the total space occupied by the heap).
+Also validates the heap.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+	mem_heap_t*	heap,	   /*!< in: the heap to be freed */
+	const char*	file_name, /*!< in: file name of freeing */
+	ulint		line);	   /*!< in: line where freed */
+
+
+void
+mem_field_header_set_len(byte* field, ulint len);
+
+ulint
+mem_field_header_get_len(byte* field);
+
+void
+mem_field_header_set_check(byte* field, ulint check);
+
+ulint
+mem_field_header_get_check(byte* field);
+
+void
+mem_field_trailer_set_check(byte* field, ulint check);
+
+ulint
+mem_field_trailer_get_check(byte* field);
+#endif /* UNIV_MEM_DEBUG */
diff --git a/storage/xtradb/include/mem0mem.h b/storage/xtradb/include/mem0mem.h
new file mode 100644
index 00000000000..ee28cf7b225
--- /dev/null
+++ b/storage/xtradb/include/mem0mem.h
@@ -0,0 +1,402 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0mem.h
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0mem_h
+#define mem0mem_h
+
+#include "univ.i"
+#include "ut0mem.h"
+#include "ut0byte.h"
+#include "ut0rnd.h"
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+#endif /* UNIV_HOTBACKUP */
+#include "ut0lst.h"
+#include "mach0data.h"
+
+/* -------------------- MEMORY HEAPS ----------------------------- */
+
+/* The info structure stored at the beginning of a heap block */
+typedef struct mem_block_info_struct mem_block_info_t;
+
+/* A block of a memory heap consists of the info structure
+followed by an area of memory */
+typedef mem_block_info_t	mem_block_t;
+
+/* A memory heap is a nonempty linear list of memory blocks */
+typedef mem_block_t	mem_heap_t;
+
+/* Types of allocation for memory heaps: DYNAMIC means allocation from the
+dynamic memory pool of the C compiler, BUFFER means allocation from the
+buffer pool; the latter method is used for very big heaps */
+
+#define MEM_HEAP_DYNAMIC	0	/* the most common type */
+#define MEM_HEAP_BUFFER		1
+#define MEM_HEAP_BTR_SEARCH	2	/* this flag can optionally be
+					ORed to MEM_HEAP_BUFFER, in which
+					case heap->free_block is used in
+					some cases for memory allocations,
+					and if it's NULL, the memory
+					allocation functions can return
+					NULL. */
+
+/* The following start size is used for the first block in the memory heap if
+the size is not specified, i.e., 0 is given as the parameter in the call of
+create. The standard size is the maximum (payload) size of the blocks used for
+allocations of small buffers. */
+
+#define MEM_BLOCK_START_SIZE		64
+#define MEM_BLOCK_STANDARD_SIZE		\
+	(UNIV_PAGE_SIZE >= 16384 ? 8000 : MEM_MAX_ALLOC_IN_BUF)
+
+/* If a memory heap is allowed to grow into the buffer pool, the following
+is the maximum size for a single allocated buffer: */
+#define MEM_MAX_ALLOC_IN_BUF		(UNIV_PAGE_SIZE - 200)
+
+/******************************************************************//**
+Initializes the memory system. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+	ulint	size);	/*!< in: common pool size in bytes */
+/******************************************************************//**
+Closes the memory system. */
+UNIV_INTERN
+void
+mem_close(void);
+/*===========*/
+
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create(N)	mem_heap_create_func(\
+		(N), MEM_HEAP_DYNAMIC, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_buffer(N)	mem_heap_create_func(\
+		(N), MEM_HEAP_BUFFER, __FILE__, __LINE__)
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap creation. */
+
+#define mem_heap_create_in_btr_search(N)	mem_heap_create_func(\
+		(N), MEM_HEAP_BTR_SEARCH | MEM_HEAP_BUFFER,\
+		__FILE__, __LINE__)
+
+/**************************************************************//**
+Use this macro instead of the corresponding function! Macro for memory
+heap freeing. */
+
+#define mem_heap_free(heap) mem_heap_free_func(\
+					  (heap), __FILE__, __LINE__)
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap. For debugging purposes, takes also the file name and line as
+arguments.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+	ulint		n,		/*!< in: desired start block size,
+					this means that a single user buffer
+					of size n will fit in the block,
+					0 creates a default size block */
+	ulint		type,		/*!< in: heap type */
+	const char*	file_name,	/*!< in: file name where created */
+	ulint		line);		/*!< in: line where created */
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+	mem_heap_t*	heap,		/*!< in, own: heap to be freed */
+	const char*	file_name,	/*!< in: file name where freed */
+	ulint		line);		/*!< in: line where freed */
+/***************************************************************//**
+Allocates and zero-fills n bytes of memory from a memory heap.
+@return	allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: number of bytes; if the heap is allowed
+				to grow into the buffer pool, this must be
+				<= MEM_MAX_ALLOC_IN_BUF */
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap.
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: number of bytes; if the heap is allowed
+				to grow into the buffer pool, this must be
+				<= MEM_MAX_ALLOC_IN_BUF */
+/*****************************************************************//**
+Returns a pointer to the heap top.
+@return	pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+	mem_heap_t*	heap);	/*!< in: memory heap */
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+	mem_heap_t*	heap,	/*!< in: heap from which to free */
+	byte*		old_top);/*!< in: pointer to old top of heap */
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed. */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+	mem_heap_t*	heap);	/*!< in: heap to empty */
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap.
+The size of the element must be given.
+@return	pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: size of the topmost element */
+/*****************************************************************//**
+Frees the topmost element in a memory heap.
+The size of the element must be given. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: size of the topmost element */
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap. */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+	mem_heap_t*	heap);		/*!< in: heap */
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer allocation */
+
+/* Allocates N bytes with mem_alloc() and zero-fills them with memset().
+NOTE: N is expanded twice, so it must not have side effects. The expansion
+is a plain expression without a trailing semicolon; the caller supplies
+the statement terminator, which keeps the macro usable inside larger
+expressions and in if/else bodies. */
+#define mem_zalloc(N)	memset(mem_alloc(N), 0, (N))
+
+#define mem_alloc(N)	mem_alloc_func((N), NULL, __FILE__, __LINE__)
+#define mem_alloc2(N,S)	mem_alloc_func((N), (S), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+@return	own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+	ulint		n,		/*!< in: requested size in bytes */
+	ulint*		size,		/*!< out: allocated size in bytes,
+					or NULL */
+	const char*	file_name,	/*!< in: file name where created */
+	ulint		line);		/*!< in: line where created */
+
+/**************************************************************//**
+Use this macro instead of the corresponding function!
+Macro for memory buffer freeing */
+
+#define mem_free(PTR)	mem_free_func((PTR), __FILE__, __LINE__)
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Frees a single buffer of storage from
+the dynamic memory of C compiler. Similar to free of C.
+The mem_free() macro passes __FILE__ and __LINE__ of the freeing call
+site as file_name and line. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+	void*		ptr,		/*!< in, own: buffer to be freed */
+	const char*	file_name,	/*!< in: file name where freed */
+	ulint		line);		/*!< in: line where freed */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string.
+@return	own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+	const char*	str);	/*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string.
+@return	own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len);	/*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string, allocated from a memory heap.
+@return	own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	str);	/*!< in: string to be copied */
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap.
+@return	own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len);	/*!< in: length of str, in bytes */
+
+/**********************************************************************//**
+Concatenate two strings and return the result, using a memory heap.
+@return	own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	s1,	/*!< in: string 1 */
+	const char*	s2);	/*!< in: string 2 */
+
+/**********************************************************************//**
+Duplicate a block of data, allocated from a memory heap.
+@return	own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where copy is allocated */
+	const void*	data,	/*!< in: data to be copied */
+	ulint		len);	/*!< in: length of data, in bytes */
+
+/****************************************************************//**
+A simple (s)printf replacement that dynamically allocates the space for the
+formatted string from the given heap. This supports a very limited set of
+the printf syntax: types 's' and 'u' and length modifier 'l' (which is
+required for the 'u' type).
+@return	heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	format,	/*!< in: format string */
+	...) __attribute__ ((format (printf, 2, 3)));
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Goes through the list of all allocated mem blocks, checks their magic
+numbers, and reports possible corruption. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void);
+/*=========================*/
+#endif
+
+/*#######################################################################*/
+
+/* The info header of a block in a memory heap */
+
+struct mem_block_info_struct {
+	ulint	magic_n;/* magic number for debugging */
+	char	file_name[8];/* file name where the mem heap was created;
+			the array holds only 8 bytes, so presumably a
+			shortened form of the name is kept here -- confirm
+			against the code that fills it in */
+	ulint	line;	/*!< line number where the mem heap was created */
+	UT_LIST_BASE_NODE_T(mem_block_t) base; /* In the first block in
+			the list this is the base node of the list of blocks;
+			in subsequent blocks this is undefined */
+	UT_LIST_NODE_T(mem_block_t) list; /* This contains pointers to next
+			and prev in the list. The first block allocated
+			to the heap is also the first block in this list,
+			though it also contains the base node of the list. */
+	ulint	len;	/*!< physical length of this block in bytes */
+	ulint	total_size; /* physical length in bytes of all blocks
+			in the heap. This is defined only in the base
+			node and is set to ULINT_UNDEFINED in others. */
+	ulint	type;	/*!< type of heap: MEM_HEAP_DYNAMIC, or
+			MEM_HEAP_BUFFER possibly ORed to
+			MEM_HEAP_BTR_SEARCH */
+	ulint	free;	/*!< offset in bytes of the first free position for
+			user data in the block */
+	ulint	start;	/*!< the value of the struct field 'free' at the
+			creation of the block */
+#ifndef UNIV_HOTBACKUP
+	void*	free_block;
+			/* if the MEM_HEAP_BTR_SEARCH bit is set in type,
+			and this is the heap root, this can contain an
+			allocated buffer frame, which can be appended as a
+			free block to the heap, if we need more space;
+			otherwise, this is NULL */
+	void*	buf_block;
+			/* if this block has been allocated from the buffer
+			pool, this contains the buf_block_t handle;
+			otherwise, this is NULL */
+#endif /* !UNIV_HOTBACKUP */
+#ifdef MEM_PERIODIC_CHECK
+	UT_LIST_NODE_T(mem_block_t) mem_block_list;
+			/* List of all mem blocks allocated; protected
+			by the mem_comm_pool mutex */
+#endif
+};
+
+#define MEM_BLOCK_MAGIC_N	764741555
+#define MEM_FREED_BLOCK_MAGIC_N	547711122
+
+/* Header size for a memory heap block */
+#define MEM_BLOCK_HEADER_SIZE	ut_calc_align(sizeof(mem_block_info_t),\
+							UNIV_MEM_ALIGNMENT)
+#include "mem0dbg.h"
+
+#ifndef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mem0mem.ic b/storage/xtradb/include/mem0mem.ic
new file mode 100644
index 00000000000..cbce2edc661
--- /dev/null
+++ b/storage/xtradb/include/mem0mem.ic
@@ -0,0 +1,640 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0mem.ic
+The memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0dbg.ic"
+#ifndef UNIV_HOTBACKUP
+# include "mem0pool.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+	ulint		type,	/*!< in: type of heap: MEM_HEAP_DYNAMIC or
+				MEM_HEAP_BUFFER */
+	const char*	file_name,/*!< in: file name where created */
+	ulint		line);	/*!< in: line where created */
+/******************************************************************//**
+Frees a block from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block);	/*!< in: block to free */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Frees the free_block field from a memory heap. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+	mem_heap_t*	heap);	/*!< in: heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Adds a new block to a memory heap.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n);	/*!< in: number of bytes user needs */
+
+/***************************************************************//**
+Sets the physical length of a memory block. */
+UNIV_INLINE
+void
+mem_block_set_len(
+/*==============*/
+	mem_block_t*	block,	/*!< in/out: memory block */
+	ulint		len)	/*!< in: length in bytes; asserted to be
+				nonzero */
+{
+	ut_ad(len > 0);
+	block->len = len;
+}
+
+/***************************************************************//**
+Gets the physical length of a memory block.
+@return	value of the len field of the block */
+UNIV_INLINE
+ulint
+mem_block_get_len(
+/*==============*/
+	mem_block_t*	block)	/*!< in: memory block */
+{
+	return(block->len);
+}
+
+/***************************************************************//**
+Sets the heap type of a memory block. The assertion accepts only
+MEM_HEAP_DYNAMIC, MEM_HEAP_BUFFER, or
+MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH. */
+UNIV_INLINE
+void
+mem_block_set_type(
+/*===============*/
+	mem_block_t*	block,	/*!< in/out: memory block */
+	ulint		type)	/*!< in: heap type */
+{
+	ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+	      || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+	block->type = type;
+}
+
+/***************************************************************//**
+Gets the heap type stored in a memory block.
+@return	value of the type field of the block */
+UNIV_INLINE
+ulint
+mem_block_get_type(
+/*===============*/
+	mem_block_t*	block)	/*!< in: memory block */
+{
+	return(block->type);
+}
+
+/***************************************************************//**
+Sets the offset of the first free position in a memory block. The offset
+is asserted to be nonzero and not to exceed the block length. */
+UNIV_INLINE
+void
+mem_block_set_free(
+/*===============*/
+	mem_block_t*	block,	/*!< in/out: memory block */
+	ulint		free)	/*!< in: offset in bytes from the start of
+				the block */
+{
+	ut_ad(free > 0);
+	ut_ad(free <= mem_block_get_len(block));
+	block->free = free;
+}
+
+/***************************************************************//**
+Gets the offset of the first free position in a memory block.
+@return	value of the free field of the block */
+UNIV_INLINE
+ulint
+mem_block_get_free(
+/*===============*/
+	mem_block_t*	block)	/*!< in: memory block */
+{
+	return(block->free);
+}
+
+/***************************************************************//**
+Sets the start offset of a memory block, i.e. the value the free field
+had when the block was created. The offset is asserted to be nonzero. */
+UNIV_INLINE
+void
+mem_block_set_start(
+/*================*/
+	mem_block_t*	block,	/*!< in/out: memory block */
+	ulint		start)	/*!< in: offset in bytes from the start of
+				the block */
+{
+	ut_ad(start > 0);
+	block->start = start;
+}
+
+/***************************************************************//**
+Gets the start offset of a memory block.
+@return	value of the start field of the block */
+UNIV_INLINE
+ulint
+mem_block_get_start(
+/*================*/
+	mem_block_t*	block)	/*!< in: memory block */
+{
+	return(block->start);
+}
+
+/***************************************************************//**
+Allocates n bytes from a memory heap with mem_heap_alloc() and clears
+them to zero. The heap is asserted not to have the MEM_HEAP_BTR_SEARCH
+bit set, which is the only case in which mem_heap_alloc() is documented
+to return NULL.
+@return	allocated, zero-filled storage */
+UNIV_INLINE
+void*
+mem_heap_zalloc(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: number of bytes; if the heap is allowed
+				to grow into the buffer pool, this must be
+				<= MEM_MAX_ALLOC_IN_BUF */
+{
+	void*	buf;
+
+	ut_ad(heap);
+	ut_ad(!(heap->type & MEM_HEAP_BTR_SEARCH));
+
+	buf = mem_heap_alloc(heap, n);
+
+	return(memset(buf, 0, n));
+}
+
+/***************************************************************//**
+Allocates n bytes of memory from a memory heap. The space is carved out
+of the last block in the heap's block list; when that block does not have
+MEM_SPACE_NEEDED(n) bytes left, a new block is first appended with
+mem_heap_add_block().
+@return allocated storage, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+void*
+mem_heap_alloc(
+/*===========*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: number of bytes; if the heap is allowed
+				to grow into the buffer pool, this must be
+				<= MEM_MAX_ALLOC_IN_BUF */
+{
+	mem_block_t*	block;
+	void*		buf;
+	ulint		free;
+
+	ut_ad(mem_heap_check(heap));
+
+	/* Allocations are always served from the last block of the heap */
+	block = UT_LIST_GET_LAST(heap->base);
+
+	ut_ad(!(block->type & MEM_HEAP_BUFFER) || (n <= MEM_MAX_ALLOC_IN_BUF));
+
+	/* Check if there is enough space in block. If not, create a new
+	block to the heap */
+
+	if (mem_block_get_len(block)
+	    < mem_block_get_free(block) + MEM_SPACE_NEEDED(n)) {
+
+		block = mem_heap_add_block(heap, n);
+
+		if (block == NULL) {
+
+			return(NULL);
+		}
+	}
+
+	/* The new buffer starts at the current free offset of the block;
+	the free offset is then advanced past the reserved space */
+	free = mem_block_get_free(block);
+
+	buf = (byte*)block + free;
+
+	mem_block_set_free(block, free + MEM_SPACE_NEEDED(n));
+
+#ifdef UNIV_MEM_DEBUG
+	/* NOTE(review): UNIV_MEM_ALLOC is defined outside this file;
+	presumably a memory-checker annotation -- confirm in univ.i */
+	UNIV_MEM_ALLOC(buf,
+		       n + MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE);
+
+	/* In the debug version write debugging info to the field */
+	mem_field_init((byte*)buf, n);
+
+	/* Advance buf to point at the storage which will be given to the
+	caller */
+	buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+#endif
+#ifdef UNIV_SET_MEM_TO_ZERO
+	/* Optionally zero-fill the n bytes handed to the caller */
+	UNIV_MEM_ALLOC(buf, n);
+	memset(buf, '\0', n);
+#endif
+	UNIV_MEM_ALLOC(buf, n);
+	return(buf);
+}
+
+/*****************************************************************//**
+Returns a pointer to the heap top, i.e. the first free position in the
+last block of the heap.
+@return	pointer to the heap top */
+UNIV_INLINE
+byte*
+mem_heap_get_heap_top(
+/*==================*/
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	mem_block_t*	last_block;
+
+	ut_ad(mem_heap_check(heap));
+
+	last_block = UT_LIST_GET_LAST(heap->base);
+
+	return((byte*) last_block + mem_block_get_free(last_block));
+}
+
+/*****************************************************************//**
+Frees the space in a memory heap exceeding the pointer given. The
+pointer must have been acquired from mem_heap_get_heap_top. The first
+memory block of the heap is not freed.
+
+Blocks are freed from the end of the block list backwards until the
+block containing old_top is found; that block's free offset is then
+reset to old_top. That such a block exists is checked only by
+ut_ad(block). */
+UNIV_INLINE
+void
+mem_heap_free_heap_top(
+/*===================*/
+	mem_heap_t*	heap,	/*!< in: heap from which to free */
+	byte*		old_top)/*!< in: pointer to old top of heap */
+{
+	mem_block_t*	block;
+	mem_block_t*	prev_block;
+#ifdef UNIV_MEM_DEBUG
+	ibool		error;
+	ulint		total_size;
+	ulint		size;
+#endif
+
+	ut_ad(mem_heap_check(heap));
+
+#ifdef UNIV_MEM_DEBUG
+
+	/* Validate the heap and get its total allocated size */
+	mem_heap_validate_or_print(heap, NULL, FALSE, &error, &total_size,
+				   NULL, NULL);
+	ut_a(!error);
+
+	/* Get the size below top pointer */
+	mem_heap_validate_or_print(heap, old_top, FALSE, &error, &size, NULL,
+				   NULL);
+	ut_a(!error);
+
+#endif
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	/* Walk from the last block towards the first one, freeing every
+	block whose used range [block, block + free] does not contain
+	old_top */
+	while (block != NULL) {
+		if (((byte*)block + mem_block_get_free(block) >= old_top)
+		    && ((byte*)block <= old_top)) {
+			/* Found the right block */
+
+			break;
+		}
+
+		/* Store prev_block value before freeing the current block
+		(the current block will be erased in freeing) */
+
+		prev_block = UT_LIST_GET_PREV(list, block);
+
+		mem_heap_block_free(heap, block);
+
+		block = prev_block;
+	}
+
+	ut_ad(block);
+
+	/* Set the free field of block */
+	mem_block_set_free(block, old_top - (byte*)block);
+
+#ifdef UNIV_MEM_DEBUG
+	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+	/* In the debug version erase block from top up */
+	mem_erase_buf(old_top, (byte*)block + block->len - old_top);
+
+	/* Update allocated memory count */
+	mutex_enter(&mem_hash_mutex);
+	mem_current_allocated_memory -= (total_size - size);
+	mutex_exit(&mem_hash_mutex);
+#else /* UNIV_MEM_DEBUG */
+	UNIV_MEM_ASSERT_W(old_top, (byte*)block + block->len - old_top);
+#endif /* UNIV_MEM_DEBUG */
+	UNIV_MEM_ALLOC(old_top, (byte*)block + block->len - old_top);
+
+	/* If free == start, we may free the block if it is not the first
+	one */
+
+	if ((heap != block) && (mem_block_get_free(block)
+				== mem_block_get_start(block))) {
+		mem_heap_block_free(heap, block);
+	}
+}
+
+/*****************************************************************//**
+Empties a memory heap. The first memory block of the heap is not freed:
+the heap top is rewound to the start offset of the first block. Unless
+UNIV_HOTBACKUP is defined, a non-NULL free_block of the heap is released
+with mem_heap_free_block_free(). */
+UNIV_INLINE
+void
+mem_heap_empty(
+/*===========*/
+	mem_heap_t*	heap)	/*!< in: heap to empty */
+{
+	byte*	initial_top = (byte*) heap + mem_block_get_start(heap);
+
+	mem_heap_free_heap_top(heap, initial_top);
+#ifndef UNIV_HOTBACKUP
+	if (heap->free_block != NULL) {
+		mem_heap_free_block_free(heap);
+	}
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/*****************************************************************//**
+Returns a pointer to the topmost element in a memory heap. The size of the
+element must be given. The element is located by stepping back
+MEM_SPACE_NEEDED(n) bytes from the free offset of the last block.
+@return	pointer to the topmost element */
+UNIV_INLINE
+void*
+mem_heap_get_top(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: size of the topmost element */
+{
+	mem_block_t*	block;
+	void*		buf;
+
+	ut_ad(mem_heap_check(heap));
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	buf = (byte*)block + mem_block_get_free(block) - MEM_SPACE_NEEDED(n);
+
+#ifdef UNIV_MEM_DEBUG
+	/* The element must not begin before the start offset of the block */
+	ut_ad(mem_block_get_start(block) <=(ulint)((byte*)buf - (byte*)block));
+
+	/* In the debug version, advance buf to point at the storage which
+	was given to the caller in the allocation*/
+
+	buf = (byte*)buf + MEM_FIELD_HEADER_SIZE;
+
+	/* Check that the field lengths agree */
+	ut_ad(n == (ulint)mem_field_header_get_len(buf));
+#endif
+
+	return(buf);
+}
+
+/*****************************************************************//**
+Frees the topmost element in a memory heap. The size of the element must be
+given. The free offset of the last block is moved back by
+MEM_SPACE_NEEDED(n); if the block then holds no user data and is not the
+first block of the heap, the block itself is freed. */
+UNIV_INLINE
+void
+mem_heap_free_top(
+/*==============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: size of the topmost element */
+{
+	mem_block_t*	block;
+
+	ut_ad(mem_heap_check(heap));
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+	/* Subtract the free field of block */
+	mem_block_set_free(block, mem_block_get_free(block)
+			   - MEM_SPACE_NEEDED(n));
+	/* NOTE(review): UNIV_MEM_ASSERT_W is defined outside this file;
+	presumably a memory-checker writability check -- confirm in univ.i */
+	UNIV_MEM_ASSERT_W((byte*) block + mem_block_get_free(block), n);
+#ifdef UNIV_MEM_DEBUG
+
+	ut_ad(mem_block_get_start(block) <= mem_block_get_free(block));
+
+	/* In the debug version check the consistency, and erase field */
+	mem_field_erase((byte*)block + mem_block_get_free(block), n);
+#endif
+
+	/* If free == start, we may free the block if it is not the first
+	one */
+
+	if ((heap != block) && (mem_block_get_free(block)
+				== mem_block_get_start(block))) {
+		mem_heap_block_free(heap, block);
+	} else {
+		/* Avoid a bogus UNIV_MEM_ASSERT_W() warning in a
+		subsequent invocation of mem_heap_free_top().
+		Originally, this was UNIV_MEM_FREE(), to catch writes
+		to freed memory. */
+		UNIV_MEM_ALLOC((byte*) block + mem_block_get_free(block), n);
+	}
+}
+
+/*****************************************************************//**
+NOTE: Use the corresponding macros instead of this function. Creates a
+memory heap: allocates the first block with mem_heap_create_block() and
+makes it the base node and first member of the heap's block list. For
+debugging purposes, takes also the file name and line as argument.
+@return own: memory heap, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INLINE
+mem_heap_t*
+mem_heap_create_func(
+/*=================*/
+	ulint		n,		/*!< in: desired start block size,
+					this means that a single user buffer
+					of size n will fit in the block,
+					0 creates a default size block */
+	ulint		type,		/*!< in: heap type */
+	const char*	file_name,	/*!< in: file name where created */
+	ulint		line)		/*!< in: line where created */
+{
+	mem_block_t*	first_block;
+
+	/* A zero size request selects the default start block size */
+	if (n == 0) {
+		n = MEM_BLOCK_START_SIZE;
+	}
+
+	first_block = mem_heap_create_block(NULL, n, type, file_name, line);
+
+	if (first_block == NULL) {
+		return(NULL);
+	}
+
+	/* The first block carries the base node of the block list and is
+	also the first element of that list */
+	UT_LIST_INIT(first_block->base);
+	UT_LIST_ADD_FIRST(list, first_block->base, first_block);
+
+#ifdef UNIV_MEM_DEBUG
+	mem_hash_insert(first_block, file_name, line);
+#endif
+
+	return(first_block);
+}
+
+/*****************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees the space
+occupied by a memory heap. In the debug version erases the heap memory
+blocks. All blocks are freed, starting from the last block of the list
+and ending with the first one. */
+UNIV_INLINE
+void
+mem_heap_free_func(
+/*===============*/
+	mem_heap_t*	heap,		/*!< in, own: heap to be freed */
+	const char*	file_name __attribute__((unused)),
+					/*!< in: file name where freed */
+	ulint		line  __attribute__((unused)))
+					/*!< in: line where freed */
+{
+	mem_block_t*	block;
+	mem_block_t*	prev_block;
+
+	ut_ad(mem_heap_check(heap));
+
+	block = UT_LIST_GET_LAST(heap->base);
+
+#ifdef UNIV_MEM_DEBUG
+
+	/* In the debug version remove the heap from the hash table of heaps
+	and check its consistency */
+
+	mem_hash_remove(heap, file_name, line);
+
+#endif
+#ifndef UNIV_HOTBACKUP
+	/* Release the spare free_block, if the heap holds one */
+	if (heap->free_block) {
+		mem_heap_free_block_free(heap);
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	while (block != NULL) {
+		/* Store the contents of info before freeing current block
+		(it is erased in freeing) */
+
+		prev_block = UT_LIST_GET_PREV(list, block);
+
+		mem_heap_block_free(heap, block);
+
+		block = prev_block;
+	}
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function.
+Allocates a single buffer of memory from the dynamic memory of
+the C compiler. Is like malloc of C. The buffer must be freed
+with mem_free.
+
+The buffer is the only allocation in a private MEM_HEAP_DYNAMIC heap
+created for it, and it is placed directly after the block header (and the
+debug field header); mem_free_func() relies on this layout to find the
+heap from the buffer pointer.
+@return	own: free storage */
+UNIV_INLINE
+void*
+mem_alloc_func(
+/*===========*/
+	ulint		n,		/*!< in: desired number of bytes */
+	ulint*		size,		/*!< out: allocated size in bytes,
+					or NULL */
+	const char*	file_name,	/*!< in: file name where created */
+	ulint		line)		/*!< in: line where created */
+{
+	mem_heap_t*	heap;
+	void*		buf;
+
+	heap = mem_heap_create_func(n, MEM_HEAP_DYNAMIC, file_name, line);
+
+	/* Note that as we created the first block in the heap big enough
+	for the buffer requested by the caller, the buffer will be in the
+	first block and thus we can calculate the pointer to the heap from
+	the pointer to the buffer when we free the memory buffer. */
+
+	if (UNIV_LIKELY_NULL(size)) {
+		/* Adjust the allocation to the actual size of the
+		memory block. */
+		ulint	m = mem_block_get_len(heap)
+			- mem_block_get_free(heap);
+#ifdef UNIV_MEM_DEBUG
+		m -= MEM_FIELD_HEADER_SIZE + MEM_FIELD_TRAILER_SIZE;
+#endif /* UNIV_MEM_DEBUG */
+		ut_ad(m >= n);
+		*size = n = m;
+	}
+
+	buf = mem_heap_alloc(heap, n);
+
+	/* Verify the layout that mem_free_func() depends on */
+	ut_a((byte*)heap == (byte*)buf - MEM_BLOCK_HEADER_SIZE
+	     - MEM_FIELD_HEADER_SIZE);
+	return(buf);
+}
+
+/***************************************************************//**
+NOTE: Use the corresponding macro instead of this function. Frees a single
+buffer of storage from the dynamic memory of the C compiler. Similar to the
+free of C. The owning heap is found by stepping back from the buffer over
+the field header and the block header, and is then freed as a whole. */
+UNIV_INLINE
+void
+mem_free_func(
+/*==========*/
+	void*		ptr,		/*!< in, own: buffer to be freed */
+	const char*	file_name,	/*!< in: file name where freed;
+					passed on to mem_heap_free_func() */
+	ulint		line)		/*!< in: line where freed; passed on
+					to mem_heap_free_func() */
+{
+	byte*	heap_start;
+
+	heap_start = (byte*) ptr - MEM_BLOCK_HEADER_SIZE
+		- MEM_FIELD_HEADER_SIZE;
+
+	mem_heap_free_func((mem_heap_t*) heap_start, file_name, line);
+}
+
+/*****************************************************************//**
+Returns the space in bytes occupied by a memory heap: the total_size of
+the heap, plus UNIV_PAGE_SIZE when the heap holds a free_block (not
+counted if UNIV_HOTBACKUP is defined).
+@return	size of the heap in bytes */
+UNIV_INLINE
+ulint
+mem_heap_get_size(
+/*==============*/
+	mem_heap_t*	heap)	/*!< in: heap */
+{
+	ulint	size;
+
+	ut_ad(mem_heap_check(heap));
+
+	size = heap->total_size;
+
+#ifndef UNIV_HOTBACKUP
+	if (heap->free_block != NULL) {
+		size += UNIV_PAGE_SIZE;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	return(size);
+}
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string into a buffer obtained with mem_alloc.
+The terminating NUL is copied along with the string.
+@return	own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdup(
+/*=======*/
+	const char*	str)	/*!< in: string to be copied */
+{
+	ulint	size = strlen(str) + 1;
+	char*	copy = (char*) mem_alloc(size);
+
+	memcpy(copy, str, size);
+
+	return(copy);
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string. The copy occupies
+len + 1 bytes obtained with mem_alloc.
+@return	own: a copy of the string, must be deallocated with mem_free */
+UNIV_INLINE
+char*
+mem_strdupl(
+/*========*/
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len)	/*!< in: length of str, in bytes */
+{
+	char*	copy = (char*) mem_alloc(len + 1);
+
+	memcpy(copy, str, len);
+	copy[len] = 0;
+
+	return(copy);
+}
+
+/**********************************************************************//**
+Makes a NUL-terminated copy of a nonterminated string,
+allocated from a memory heap. The copy occupies len + 1 bytes obtained
+with mem_heap_alloc.
+@return	own: a copy of the string */
+UNIV_INLINE
+char*
+mem_heap_strdupl(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	str,	/*!< in: string to be copied */
+	ulint		len)	/*!< in: length of str, in bytes */
+{
+	char*	copy = (char*) mem_heap_alloc(heap, len + 1);
+
+	memcpy(copy, str, len);
+	copy[len] = 0;
+
+	return(copy);
+}
diff --git a/storage/xtradb/include/mem0pool.h b/storage/xtradb/include/mem0pool.h
new file mode 100644
index 00000000000..fa8be296ec9
--- /dev/null
+++ b/storage/xtradb/include/mem0pool.h
@@ -0,0 +1,124 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mem0pool.h
+The lowest-level memory management
+
+Created 6/9/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef mem0pool_h
+#define mem0pool_h
+
+#include "univ.i"
+#include "os0file.h"
+#include "ut0lst.h"
+
+/** Memory area header */
+typedef struct mem_area_struct	mem_area_t;
+/** Memory pool */
+typedef struct mem_pool_struct	mem_pool_t;
+
+/** The common memory pool */
+extern mem_pool_t*	mem_comm_pool;
+
+/** Memory area header */
+
+struct mem_area_struct{
+	ulint		size_and_free;	/*!< memory area size is obtained by
+					ANDing with ~MEM_AREA_FREE; the area
+					is in a free list if ANDing with
+					MEM_AREA_FREE results in nonzero */
+	UT_LIST_NODE_T(mem_area_t)
+			free_list;	/*!< free list node */
+};
+
+/** Each memory area takes this many extra bytes for control information */
+#define MEM_AREA_EXTRA_SIZE	(ut_calc_align(sizeof(struct mem_area_struct),\
+			UNIV_MEM_ALIGNMENT))
+
+/********************************************************************//**
+Creates a memory pool.
+@return	memory pool */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+	ulint	size);	/*!< in: pool size in bytes */
+/********************************************************************//**
+Frees a memory pool. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+	mem_pool_t*	pool);	/*!< in, own: memory pool */
+/********************************************************************//**
+Allocates memory from a pool. NOTE: This low-level function should only be
+used in mem0mem.*!
+@return	own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+	ulint*		psize,	/*!< in: requested size in bytes; for optimum
+				space usage, the size should be a power of 2
+				minus MEM_AREA_EXTRA_SIZE;
+				out: allocated size in bytes (greater than
+				or equal to the requested size) */
+	mem_pool_t*	pool);	/*!< in: memory pool */
+/********************************************************************//**
+Frees memory to a pool. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+	void*		ptr,	/*!< in, own: pointer to allocated memory
+				buffer */
+	mem_pool_t*	pool);	/*!< in: memory pool */
+/********************************************************************//**
+Returns the amount of reserved memory.
+@return	reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+	mem_pool_t*	pool);	/*!< in: memory pool */
+/********************************************************************//**
+Validates a memory pool.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+	mem_pool_t*	pool);	/*!< in: memory pool */
+/********************************************************************//**
+Prints info of a memory pool. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+	FILE*		outfile,/*!< in: output file to write to */
+	mem_pool_t*	pool);	/*!< in: memory pool */
+
+
+#ifndef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mem0pool.ic b/storage/xtradb/include/mem0pool.ic
new file mode 100644
index 00000000000..b891dd6dea0
--- /dev/null
+++ b/storage/xtradb/include/mem0pool.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/mem0pool.ic
+The lowest-level memory management
+
+Created 6/8/1994 Heikki Tuuri
+*************************************************************************/
diff --git a/storage/xtradb/include/mtr0log.h b/storage/xtradb/include/mtr0log.h
new file mode 100644
index 00000000000..6322af2a569
--- /dev/null
+++ b/storage/xtradb/include/mtr0log.h
@@ -0,0 +1,250 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.h
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0log_h
+#define mtr0log_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Writes 1 - 4 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+	byte*	ptr,	/*!< in: pointer where to write */
+	ulint	val,	/*!< in: value to write */
+	byte	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Writes 8 bytes to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+	byte*	ptr,	/*!< in: pointer where to write */
+	dulint	val,	/*!< in: value to write */
+	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a string to a file page buffered in the buffer pool. Writes the
+corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+	byte*		ptr,	/*!< in: pointer where to write */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len,	/*!< in: string length */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Logs a write of a string to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+	byte*	ptr,	/*!< in: pointer written to */
+	ulint	len,	/*!< in: string length */
+	mtr_t*	mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Writes initial part of a log record consisting of one-byte item
+type and four-byte space and page numbers. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
+				frame holding the file page where
+				modification is made */
+	byte		type,	/*!< in: log item type: MLOG_1BYTE, ... */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Writes a log record about an .ibd file create/delete/rename.
+@return	new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+	ulint	type,	/*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+			MLOG_FILE_RENAME */
+	ulint	space_id,/*!< in: space id, if applicable */
+	ulint	page_no,/*!< in: page number (not relevant currently) */
+	byte*	log_ptr,/*!< in: pointer to mtr log which has been opened */
+	mtr_t*	mtr);	/*!< in: mtr */
+/********************************************************//**
+Catenates 1 - 4 bytes to the mtr log. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	val,	/*!< in: value to write */
+	ulint	type);	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+/********************************************************//**
+Catenates n bytes to the mtr log. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len);	/*!< in: string length */
+/********************************************************//**
+Catenates a compressed ulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	val);	/*!< in: value to write */
+/********************************************************//**
+Catenates a compressed dulint to mlog. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	dulint	val);	/*!< in: value to write */
+/********************************************************//**
+Opens a buffer to mlog. It must be closed with mlog_close.
+@return	buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	size);	/*!< in: buffer size in bytes; MUST be
+			smaller than DYN_ARRAY_DATA_SIZE! */
+/********************************************************//**
+Closes a buffer opened to mlog. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	byte*	ptr);	/*!< in: buffer space from ptr up was not used */
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes).
+If the implementation of this function is changed, all
+size parameters to mlog_open() should be adjusted accordingly!
+@return	new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
+				frame holding the file page where
+				modification is made */
+	byte		type,	/*!< in: log item type: MLOG_1BYTE, ... */
+	byte*		log_ptr,/*!< in: pointer to mtr log which has
+				been opened */
+	mtr_t*		mtr);	/*!< in: mtr */
+#else /* !UNIV_HOTBACKUP */
+# define mlog_write_initial_log_record(ptr,type,mtr) ((void) 0)
+# define mlog_write_initial_log_record_fast(ptr,type,log_ptr,mtr) ((byte *) 0)
+#endif /* !UNIV_HOTBACKUP */
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	type,	/*!< out: log record type: MLOG_1BYTE, ... */
+	ulint*	space,	/*!< out: space id */
+	ulint*	page_no);/*!< out: page number */
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_dulint.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+	ulint	type,	/*!< in: log record type: MLOG_1BYTE, ... */
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	page,	/*!< in: page where to apply the log record, or NULL */
+	void*	page_zip);/*!< in/out: compressed page, or NULL */
+/********************************************************//**
+Parses a log record written by mlog_write_string.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	page,	/*!< in: page where to apply the log record, or NULL */
+	void*	page_zip);/*!< in/out: compressed page, or NULL */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and,
+if needed, the field lengths of an index.  Reserves space
+for further log entries.  The log entry must be closed with
+mlog_close().
+@return	buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	rec,	/*!< in: index record or page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	byte		type,	/*!< in: log item type */
+	ulint		size);	/*!< in: requested buffer size in bytes
+				(if 0, calls mlog_close() and returns NULL) */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_open_and_write_index.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+	byte*		ptr,	/*!< in: buffer */
+	const byte*	end_ptr,/*!< in: buffer end */
+	ibool		comp,	/*!< in: TRUE=compact record format */
+	dict_index_t**	index);	/*!< out, own: dummy index */
+
+#ifndef UNIV_HOTBACKUP
+/* Insert, update, and maybe other functions may use this value to define an
+extra mlog buffer size for variable size data */
+#define MLOG_BUF_MARGIN	256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mtr0log.ic b/storage/xtradb/include/mtr0log.ic
new file mode 100644
index 00000000000..63af02ba409
--- /dev/null
+++ b/storage/xtradb/include/mtr0log.ic
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0log.ic
+Mini-transaction logging routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#include "ut0lst.h"
+#include "buf0buf.h"
+#include "fsp0types.h"
+#include "srv0srv.h"
+#include "trx0sys.h"
+/********************************************************//**
+Reserves write space at the end of the mtr log and marks the mtr as
+having made modifications. The space must be released with mlog_close().
+@return	buffer, NULL if log mode MTR_LOG_NONE */
+UNIV_INLINE
+byte*
+mlog_open(
+/*======*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	size)	/*!< in: buffer size in bytes; MUST be
+			smaller than DYN_ARRAY_DATA_SIZE! */
+{
+	/* The modification flag is raised before the log mode is
+	looked at, so it is set even when nothing will be logged. */
+	mtr->modifications = TRUE;
+
+	if (mtr_get_log_mode(mtr) != MTR_LOG_NONE) {
+
+		/* Logging is enabled: open the log array itself. */
+		return(dyn_array_open(&(mtr->log), size));
+	}
+
+	return(NULL);
+}
+
+/********************************************************//**
+Ends a write started with mlog_open(), giving back the unused tail. */
+UNIV_INLINE
+void
+mlog_close(
+/*=======*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	byte*	ptr)	/*!< in: buffer space from ptr up was not used */
+{
+	/* mlog_open() hands out no buffer in MTR_LOG_NONE mode, so a
+	close in that mode is a caller error (asserted in debug builds
+	only). */
+	ut_ad(mtr_get_log_mode(mtr) != MTR_LOG_NONE);
+
+	/* Everything from ptr onwards is returned to the log array. */
+	dyn_array_close(&(mtr->log), ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Appends a 1, 2 or 4 byte value to the mtr log without compressing it. */
+UNIV_INLINE
+void
+mlog_catenate_ulint(
+/*================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	val,	/*!< in: value to write */
+	ulint	type)	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+{
+	byte*	dest;
+
+	if (mtr_get_log_mode(mtr) == MTR_LOG_NONE) {
+		/* Logging is off: there is nothing to append. */
+		return;
+	}
+
+	/* type doubles as the pushed byte count: each code must equal its width. */
+#if MLOG_1BYTE != 1
+# error "MLOG_1BYTE != 1"
+#endif
+#if MLOG_2BYTES != 2
+# error "MLOG_2BYTES != 2"
+#endif
+#if MLOG_4BYTES != 4
+# error "MLOG_4BYTES != 4"
+#endif
+#if MLOG_8BYTES != 8
+# error "MLOG_8BYTES != 8"
+#endif
+	dest = (byte*) dyn_array_push(&(mtr->log), type);
+	switch (type) {
+	case MLOG_4BYTES:
+		mach_write_to_4(dest, val);
+		break;
+	case MLOG_2BYTES:
+		mach_write_to_2(dest, val);
+		break;
+	default:
+		ut_ad(type == MLOG_1BYTE);
+		mach_write_to_1(dest, val);
+	}
+}
+
+/********************************************************//**
+Appends a ulint to the mtr log in compressed form. */
+UNIV_INLINE
+void
+mlog_catenate_ulint_compressed(
+/*===========================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	val)	/*!< in: value to write */
+{
+	/* Ask for 10 bytes; the part that is not needed is handed
+	back by mlog_close() below. */
+	byte*	cursor = mlog_open(mtr, 10);
+
+	if (cursor != NULL) {
+
+		/* Advance by the value the writer returns. */
+		cursor += mach_write_compressed(cursor, val);
+
+		mlog_close(mtr, cursor);
+	}
+
+	/* cursor == NULL: no logging is requested, nothing to do. */
+}
+
+/********************************************************//**
+Appends a dulint to the mtr log in compressed form. */
+UNIV_INLINE
+void
+mlog_catenate_dulint_compressed(
+/*============================*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	dulint	val)	/*!< in: value to write */
+{
+	/* Ask for 15 bytes; the part that is not needed is handed
+	back by mlog_close() below. */
+	byte*	cursor = mlog_open(mtr, 15);
+
+	if (cursor != NULL) {
+
+		/* Advance by the value the writer returns. */
+		cursor += mach_dulint_write_compressed(cursor, val);
+
+		mlog_close(mtr, cursor);
+	}
+
+	/* cursor == NULL: no logging is requested, nothing to do. */
+}
+
+/********************************************************//**
+Writes the initial part of a log record (3..11 bytes): the type byte and
+the compressed space id and page number read from the page that ptr lies in.
+If this function is changed, adjust all size parameters to mlog_open()!
+@return	new value of log_ptr (unchanged if nothing was written) */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_fast(
+/*===============================*/
+	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
+				frame holding the file page where
+				modification is made */
+	byte		type,	/*!< in: log item type: MLOG_1BYTE, ... */
+	byte*		log_ptr,/*!< in: pointer to mtr log which has
+				been opened */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+#ifdef UNIV_DEBUG
+	buf_block_t*	block;
+#endif
+	const byte*	page;
+	ulint		space;
+	ulint		offset;
+
+	ut_ad(ptr && log_ptr);
+	ut_ad(type <= MLOG_BIGGEST_TYPE);
+	ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+
+	page = (const byte*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+	space = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	offset = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+	/* check whether the page is in the doublewrite buffer;
+	the doublewrite buffer is located in pages
+	FSP_EXTENT_SIZE, ..., 3 * FSP_EXTENT_SIZE - 1 in the
+	system tablespace */
+	if ((space == TRX_SYS_SPACE
+	     || (srv_doublewrite_file && space == TRX_DOUBLEWRITE_SPACE))
+	    && offset >= (ulint)FSP_EXTENT_SIZE && offset < 3 * (ulint)FSP_EXTENT_SIZE) {
+		if (trx_doublewrite_buf_is_being_created) {
+			/* Do nothing: we only come to this branch in an
+			InnoDB database creation. We do not redo log
+			anything for the doublewrite buffer pages. */
+			return(log_ptr);
+		} else {
+			fprintf(stderr,
+				"Error: trying to redo log a record of type "
+				"%d on page %lu of space %lu in the "
+				"doublewrite buffer, continuing anyway.\n"
+				"Please post a bug report to "
+				"bugs.mysql.com.\n",
+				type, (ulong) offset, (ulong) space);
+		}
+	}
+	/* Record header: type byte, compressed space id, compressed page no. */
+	mach_write_to_1(log_ptr, type);
+	log_ptr++;
+	log_ptr += mach_write_compressed(log_ptr, space);
+	log_ptr += mach_write_compressed(log_ptr, offset);
+
+	mtr->n_log_recs++;
+	/* %lu takes unsigned long, so ulint values are cast explicitly. */
+#ifdef UNIV_LOG_DEBUG
+	fprintf(stderr,
+		"Adding to mtr log record type %lu space %lu page no %lu\n",
+		(ulong) type, (ulong) space, (ulong) offset);
+#endif
+
+#ifdef UNIV_DEBUG
+	/* We now assume that all x-latched pages have been modified! */
+	block = (buf_block_t*) buf_block_align(ptr);
+
+	if (!mtr_memo_contains(mtr, block, MTR_MEMO_MODIFY)) {
+
+		mtr_memo_push(mtr, block, MTR_MEMO_MODIFY);
+	}
+#endif
+	return(log_ptr);
+}
+
+/********************************************************//**
+Writes the initial part of a log record about an .ibd file create, delete or rename.
+@return	new value of log_ptr */
+UNIV_INLINE
+byte*
+mlog_write_initial_log_record_for_file_op(
+/*======================================*/
+	ulint	type,	/*!< in: MLOG_FILE_CREATE, MLOG_FILE_DELETE, or
+			MLOG_FILE_RENAME */
+	ulint	space_id,/*!< in: space id, if applicable */
+	ulint	page_no,/*!< in: page number (not relevant currently) */
+	byte*	log_ptr,/*!< in: pointer to mtr log which has been opened */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	byte*	cursor;
+
+	ut_ad(log_ptr);
+
+	/* Layout: one type byte, then space_id and page_no, each in
+	compressed form. */
+	mach_write_to_1(log_ptr, type);
+	cursor = log_ptr + 1;
+	cursor += mach_write_compressed(cursor, space_id);
+	cursor += mach_write_compressed(cursor, page_no);
+	mtr->n_log_recs++;
+	return(cursor);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/mtr0mtr.h b/storage/xtradb/include/mtr0mtr.h
new file mode 100644
index 00000000000..bc3f1951be9
--- /dev/null
+++ b/storage/xtradb/include/mtr0mtr.h
@@ -0,0 +1,419 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.h
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0mtr_h
+#define mtr0mtr_h
+
+#include "univ.i"
+#include "mem0mem.h"
+#include "dyn0dyn.h"
+#include "buf0types.h"
+#include "sync0rw.h"
+#include "ut0byte.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Logging modes for a mini-transaction (valid range MTR_LOG_ALL..MTR_LOG_SHORT_INSERTS is asserted in mtr0mtr.ic) */
+#define MTR_LOG_ALL		21	/* default mode: log all operations
+					modifying disk-based data */
+#define	MTR_LOG_NONE		22	/* log no operations; mlog_open() returns NULL */
+/*#define	MTR_LOG_SPACE	23 */	/* log only operations modifying
+					file space page allocation data
+					(operations in fsp0fsp.* ) */
+#define	MTR_LOG_SHORT_INSERTS	24	/* inserts are logged in a shorter
+					form */
+
+/* Types of the objects stored in the mtr memo; NOTE that the first 3 values must be
+RW_S_LATCH, RW_X_LATCH, RW_NO_LATCH; mtr_memo_push() asserts MTR_MEMO_PAGE_S_FIX <= type <= MTR_MEMO_X_LOCK */
+#define	MTR_MEMO_PAGE_S_FIX	RW_S_LATCH
+#define	MTR_MEMO_PAGE_X_FIX	RW_X_LATCH
+#define	MTR_MEMO_BUF_FIX	RW_NO_LATCH
+#define MTR_MEMO_MODIFY		54
+#define	MTR_MEMO_S_LOCK		55
+#define	MTR_MEMO_X_LOCK		56
+
+/** @name Log item types
+The log items from MLOG_REC_INSERT on are declared 'byte' so that the compiler
+can warn if val and type parameters are switched in a call to mlog_write_ulint.
+NOTE! The 1 - 8 byte size codes are plain integers whose value must give the length also; mtr0log.ic verifies this with #if. @{ */
+#define	MLOG_SINGLE_REC_FLAG	128		/*!< if the mtr contains only
+						one log record for one page,
+						i.e., write_initial_log_record
+						has been called only once,
+						this flag is ORed to the type
+						of that first log record */
+#define	MLOG_1BYTE		(1)		/*!< one byte is written */
+#define	MLOG_2BYTES		(2)		/*!< 2 bytes are written */
+#define	MLOG_4BYTES		(4)		/*!< 4 bytes are written */
+#define	MLOG_8BYTES		(8)		/*!< 8 bytes are written */
+#define	MLOG_REC_INSERT		((byte)9)	/*!< record insert */
+#define	MLOG_REC_CLUST_DELETE_MARK ((byte)10)	/*!< mark clustered index record
+						deleted */
+#define	MLOG_REC_SEC_DELETE_MARK ((byte)11)	/*!< mark secondary index record
+						deleted */
+#define MLOG_REC_UPDATE_IN_PLACE ((byte)13)	/*!< update of a record,
+						preserves record field sizes */
+#define MLOG_REC_DELETE		((byte)14)	/*!< delete a record from a
+						page */
+#define	MLOG_LIST_END_DELETE	((byte)15)	/*!< delete record list end on
+						index page */
+#define	MLOG_LIST_START_DELETE	((byte)16)	/*!< delete record list start on
+						index page */
+#define	MLOG_LIST_END_COPY_CREATED ((byte)17)	/*!< copy record list end to a
+						new created index page */
+#define	MLOG_PAGE_REORGANIZE	((byte)18)	/*!< reorganize an
+						index page in
+						ROW_FORMAT=REDUNDANT */
+#define MLOG_PAGE_CREATE	((byte)19)	/*!< create an index page */
+#define	MLOG_UNDO_INSERT	((byte)20)	/*!< insert entry in an undo
+						log */
+#define MLOG_UNDO_ERASE_END	((byte)21)	/*!< erase an undo log
+						page end */
+#define	MLOG_UNDO_INIT		((byte)22)	/*!< initialize a page in an
+						undo log */
+#define MLOG_UNDO_HDR_DISCARD	((byte)23)	/*!< discard an update undo log
+						header */
+#define	MLOG_UNDO_HDR_REUSE	((byte)24)	/*!< reuse an insert undo log
+						header */
+#define MLOG_UNDO_HDR_CREATE	((byte)25)	/*!< create an undo
+						log header */
+#define MLOG_REC_MIN_MARK	((byte)26)	/*!< mark an index
+						record as the
+						predefined minimum
+						record */
+#define MLOG_IBUF_BITMAP_INIT	((byte)27)	/*!< initialize an
+						ibuf bitmap page */
+/*#define	MLOG_FULL_PAGE	((byte)28)	full contents of a page */
+#ifdef UNIV_LOG_LSN_DEBUG
+# define MLOG_LSN		((byte)28)	/* current LSN */
+#endif
+#define MLOG_INIT_FILE_PAGE	((byte)29)	/*!< this means that a
+						file page is taken
+						into use and the prior
+						contents of the page
+						should be ignored: in
+						recovery we must not
+						trust the lsn values
+						stored to the file
+						page */
+#define MLOG_WRITE_STRING	((byte)30)	/*!< write a string to
+						a page */
+#define	MLOG_MULTI_REC_END	((byte)31)	/*!< if a single mtr writes
+						several log records,
+						this log record ends the
+						sequence of these records */
+#define MLOG_DUMMY_RECORD	((byte)32)	/*!< dummy log record used to
+						pad a log block full */
+#define MLOG_FILE_CREATE	((byte)33)	/*!< log record about an .ibd
+						file creation */
+#define MLOG_FILE_RENAME	((byte)34)	/*!< log record about an .ibd
+						file rename */
+#define MLOG_FILE_DELETE	((byte)35)	/*!< log record about an .ibd
+						file deletion */
+#define MLOG_COMP_REC_MIN_MARK	((byte)36)	/*!< mark a compact
+						index record as the
+						predefined minimum
+						record */
+#define MLOG_COMP_PAGE_CREATE	((byte)37)	/*!< create a compact
+						index page */
+#define MLOG_COMP_REC_INSERT	((byte)38)	/*!< compact record insert */
+#define MLOG_COMP_REC_CLUST_DELETE_MARK ((byte)39)
+						/*!< mark compact
+						clustered index record
+						deleted */
+#define MLOG_COMP_REC_SEC_DELETE_MARK ((byte)40)/*!< mark compact
+						secondary index record
+						deleted; this log
+						record type is
+						redundant, as
+						MLOG_REC_SEC_DELETE_MARK
+						is independent of the
+						record format. */
+#define MLOG_COMP_REC_UPDATE_IN_PLACE ((byte)41)/*!< update of a
+						compact record,
+						preserves record field
+						sizes */
+#define MLOG_COMP_REC_DELETE	((byte)42)	/*!< delete a compact record
+						from a page */
+#define MLOG_COMP_LIST_END_DELETE ((byte)43)	/*!< delete compact record list
+						end on index page */
+#define MLOG_COMP_LIST_START_DELETE ((byte)44)	/*!< delete compact record list
+						start on index page */
+#define MLOG_COMP_LIST_END_COPY_CREATED ((byte)45)
+						/*!< copy compact
+						record list end to a
+						new created index
+						page */
+#define MLOG_COMP_PAGE_REORGANIZE ((byte)46)	/*!< reorganize an index page */
+#define MLOG_FILE_CREATE2	((byte)47)	/*!< log record about creating
+						an .ibd file, with format */
+#define MLOG_ZIP_WRITE_NODE_PTR	((byte)48)	/*!< write the node pointer of
+						a record on a compressed
+						non-leaf B-tree page */
+#define MLOG_ZIP_WRITE_BLOB_PTR	((byte)49)	/*!< write the BLOB pointer
+						of an externally stored column
+						on a compressed page */
+#define MLOG_ZIP_WRITE_HEADER	((byte)50)	/*!< write to compressed page
+						header */
+#define MLOG_ZIP_PAGE_COMPRESS	((byte)51)	/*!< compress an index page */
+#define MLOG_BIGGEST_TYPE	((byte)51)	/*!< biggest value (used in
+						assertions) */
+/* @} */
+
+/** @name Flags for MLOG_FILE operations
+(stored in the page number parameter, called log_flags in the
+functions).  The page number parameter was originally written as 0. @{ */
+#define MLOG_FILE_FLAG_TEMP	1	/*!< identifies TEMPORARY TABLE in
+					MLOG_FILE_CREATE, MLOG_FILE_CREATE2 */
+/* @} */
+
+/***************************************************************//**
+Starts a mini-transaction and creates a mini-transaction handle
+and buffer in the memory buffer given by the caller. The log mode starts out as MTR_LOG_ALL.
+@return	the mtr argument itself, which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+	mtr_t*	mtr);	/*!< in: memory buffer for the mtr buffer */
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+	mtr_t*	mtr);	/*!< in: mini-transaction */
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return	savepoint, i.e. the current data size of the memo */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+	mtr_t*	mtr);	/*!< in: mtr */
+/**********************************************************//**
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+UNIV_INTERN
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+	mtr_t*	mtr,		/*!< in: mtr */
+	ulint	savepoint);	/*!< in: savepoint */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch stored in an mtr memo after a
+savepoint. The memo slot is kept, but its object pointer is set to NULL. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+	mtr_t*		mtr,		/*!< in: mtr */
+	ulint		savepoint,	/*!< in: savepoint; the slot at this memo offset must hold lock as MTR_MEMO_S_LOCK */
+	rw_lock_t*	lock);		/*!< in: latch to release */
+#else /* !UNIV_HOTBACKUP */
+# define mtr_release_s_latch_at_savepoint(mtr,savepoint,lock) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Gets the logging mode of a mini-transaction.
+@return	logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+	mtr_t*	mtr);	/*!< in: mtr */
+/***************************************************************//**
+Changes the logging mode of a mini-transaction (a switch from MTR_LOG_NONE to MTR_LOG_SHORT_INSERTS is ignored).
+@return	old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	mode);	/*!< in: logging mode: MTR_LOG_NONE, ... */
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return	value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	ulint		type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+/********************************************************//**
+Reads 8 bytes from a file page buffered in the buffer pool.
+@return	value read */
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+This macro locks an rw-lock in s-mode. */
+#define mtr_s_lock(B, MTR)	mtr_s_lock_func((B), __FILE__, __LINE__,\
+						(MTR))
+/*********************************************************************//**
+This macro locks an rw-lock in x-mode. */
+#define mtr_x_lock(B, MTR)	mtr_x_lock_func((B), __FILE__, __LINE__,\
+						(MTR))
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in s-mode and pushes it to the mtr memo as MTR_MEMO_S_LOCK. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+	rw_lock_t*	lock,	/*!< in: rw-lock */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line number */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************************//**
+NOTE! Use the macro above!
+Locks a lock in x-mode and pushes it to the mtr memo as MTR_MEMO_X_LOCK. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+	rw_lock_t*	lock,	/*!< in: rw-lock */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line number */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************//**
+Releases an object in the memo stack. */
+UNIV_INTERN
+void
+mtr_memo_release(
+/*=============*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	void*	object,	/*!< in: object */
+	ulint	type);	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the given item.
+@return	TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const void*	object,	/*!< in: object to search */
+	ulint		type);	/*!< in: type of object */
+
+/**********************************************************//**
+Checks if memo contains the given page.
+@return	TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	ptr,	/*!< in: pointer to buffer frame */
+	ulint		type);	/*!< in: type of object */
+/*********************************************************//**
+Prints info of an mtr handle. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+	mtr_t*	mtr);	/*!< in: mtr */
+# else /* !UNIV_HOTBACKUP */
+#  define mtr_memo_contains(mtr, object, type)		TRUE
+#  define mtr_memo_contains_page(mtr, ptr, type)	TRUE
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+/*######################################################################*/
+
+#define	MTR_BUF_MEMO_SIZE	200	/* number of slots in memo */
+
+/***************************************************************//**
+Returns the log object of a mini-transaction buffer.
+@return	log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+	mtr_t*	mtr);	/*!< in: mini-transaction */
+/***************************************************//**
+Pushes an object to an mtr memo stack. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	void*	object,	/*!< in: object */
+	ulint	type);	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
+
+
+/* Type definition of a mini-transaction memo stack slot; slots are appended by mtr_memo_push(). */
+typedef	struct mtr_memo_slot_struct	mtr_memo_slot_t;
+struct mtr_memo_slot_struct{
+	ulint	type;	/*!< type of the stored object (MTR_MEMO_S_LOCK, ...) */
+	void*	object;	/*!< pointer to the object; set to NULL by mtr_release_s_latch_at_savepoint() */
+};
+
+/* Mini-transaction handle and buffer; set up by mtr_start() */
+struct mtr_struct{
+#ifdef UNIV_DEBUG
+	ulint		state;	/*!< MTR_ACTIVE, MTR_COMMITTING, MTR_COMMITTED (debug builds only) */
+#endif
+	dyn_array_t	memo;	/*!< memo stack for locks etc. */
+	dyn_array_t	log;	/*!< mini-transaction log */
+	ibool		modifications;
+				/*!< TRUE if the mtr made modifications to
+				buffer pool pages; set by every mlog_open() call */
+	ulint		n_log_recs;
+				/*!< count of how many page initial log records
+				have been written to the mtr log */
+	ulint		log_mode; /*!< specifies which operations should be
+				logged; default value MTR_LOG_ALL */
+	ib_uint64_t	start_lsn;/*!< start lsn of the possible log entry for
+				this mtr; not initialized by mtr_start() */
+	ib_uint64_t	end_lsn;/*!< end lsn of the possible log entry for
+				this mtr; not initialized by mtr_start() */
+#ifdef UNIV_DEBUG
+	ulint		magic_n;	/*!< set to MTR_MAGIC_N by mtr_start() (debug builds only) */
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_DEBUG
+# define MTR_MAGIC_N		54551	/*!< value stored in mtr_struct::magic_n by mtr_start() */
+#endif /* UNIV_DEBUG */
+
+#define MTR_ACTIVE		12231	/*!< mtr_struct::state value set by mtr_start() */
+#define MTR_COMMITTING		56456	/*!< mtr_struct::state value */
+#define MTR_COMMITTED		34676	/*!< mtr_struct::state value */
+
+#ifndef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/mtr0mtr.ic b/storage/xtradb/include/mtr0mtr.ic
new file mode 100644
index 00000000000..18f8e87b3cf
--- /dev/null
+++ b/storage/xtradb/include/mtr0mtr.ic
@@ -0,0 +1,275 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0mtr.ic
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+# include "sync0sync.h"
+# include "sync0rw.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mach0data.h"
+
+/***************************************************************//**
+Starts a mini-transaction: sets up the memo and log arrays and resets
+the bookkeeping fields of the mtr buffer supplied by the caller.
+@return	mtr buffer which also acts as the mtr handle */
+UNIV_INLINE
+mtr_t*
+mtr_start(
+/*======*/
+	mtr_t*	mtr)	/*!< in: memory buffer for the mtr buffer */
+{
+	dyn_array_create(&mtr->memo);
+	dyn_array_create(&mtr->log);
+
+	/* Nothing modified or logged yet; everything gets logged. */
+	mtr->n_log_recs = 0;
+	mtr->modifications = FALSE;
+	mtr->log_mode = MTR_LOG_ALL;
+
+	ut_d(mtr->magic_n = MTR_MAGIC_N);
+	ut_d(mtr->state = MTR_ACTIVE);
+	return(mtr);
+}
+
+/***************************************************//**
+Appends an (object, type) pair as a new slot at the end of the mtr memo. */
+UNIV_INLINE
+void
+mtr_memo_push(
+/*==========*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	void*	object,	/*!< in: object */
+	ulint	type)	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+	mtr_memo_slot_t*	top;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(object);
+	ut_ad(type >= MTR_MEMO_PAGE_S_FIX);
+	ut_ad(type <= MTR_MEMO_X_LOCK);
+
+	/* Grow the memo by exactly one slot; both of its fields are
+	filled in right below. */
+	top = (mtr_memo_slot_t*) dyn_array_push(&mtr->memo,
+						sizeof(mtr_memo_slot_t));
+
+	top->type = type;
+	top->object = object;
+}
+
+/**********************************************************//**
+Sets and returns a savepoint in mtr.
+@return	savepoint, i.e. the current data size of the memo */
+UNIV_INLINE
+ulint
+mtr_set_savepoint(
+/*==============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	/* Nothing is stored in the mtr: the savepoint is simply the
+	data size of the memo at this moment, as reported by the
+	dynamic array. */
+
+	return(dyn_array_get_data_size(&mtr->memo));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the (index tree) s-latch whose memo slot lies at the given
+savepoint, and clears that slot's object pointer. */
+UNIV_INLINE
+void
+mtr_release_s_latch_at_savepoint(
+/*=============================*/
+	mtr_t*		mtr,		/*!< in: mtr */
+	ulint		savepoint,	/*!< in: savepoint */
+	rw_lock_t*	lock)		/*!< in: latch to release */
+{
+	mtr_memo_slot_t*	latch_slot;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(dyn_array_get_data_size(&mtr->memo) > savepoint);
+
+	/* The savepoint is used directly as the memo offset of the
+	slot that holds the latch. */
+	latch_slot = (mtr_memo_slot_t*) dyn_array_get_element(&mtr->memo,
+							       savepoint);
+
+	ut_ad(latch_slot->object == lock);
+	ut_ad(latch_slot->type == MTR_MEMO_S_LOCK);
+
+	rw_lock_s_unlock(lock);
+
+	/* The slot stays in the memo, but no longer points at the latch. */
+	latch_slot->object = NULL;
+}
+
+# ifdef UNIV_DEBUG
+/**********************************************************//**
+Checks if memo contains the given (object, type) pair, scanning from the last slot backwards.
+@return	TRUE if contains */
+UNIV_INLINE
+ibool
+mtr_memo_contains(
+/*==============*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const void*	object,	/*!< in: object to search */
+	ulint		type)	/*!< in: type of object */
+{
+	mtr_memo_slot_t* slot;
+	dyn_array_t*	memo;
+	ulint		offset;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE || mtr->state == MTR_COMMITTING);
+
+	memo = &(mtr->memo);
+
+	offset = dyn_array_get_data_size(memo);
+	/* offset starts one past the last slot and steps down a slot at a time. */
+	while (offset > 0) {
+		offset -= sizeof(mtr_memo_slot_t);
+		/* Cast explicitly, like the other memo accessors in this file. */
+		slot = (mtr_memo_slot_t*) dyn_array_get_element(memo, offset);
+
+		if ((object == slot->object) && (type == slot->type)) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+# endif /* UNIV_DEBUG */
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Gives access to the log array embedded in a mini-transaction.
+@return	log */
+UNIV_INLINE
+dyn_array_t*
+mtr_get_log(
+/*========*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	/* No state check here, unlike in the memo functions. */
+	return(&mtr->log);
+}
+
+/***************************************************************//**
+Reads the current logging mode of a mini-transaction.
+@return	logging mode: MTR_LOG_NONE, ... */
+UNIV_INLINE
+ulint
+mtr_get_log_mode(
+/*=============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ut_ad(mtr);
+	/* The stored mode has to lie within the MTR_LOG_* range. */
+	ut_ad(mtr->log_mode >= MTR_LOG_ALL);
+	ut_ad(mtr->log_mode <= MTR_LOG_SHORT_INSERTS);
+	return(mtr->log_mode);
+}
+
+/***************************************************************//**
+Switches a mini-transaction to another logging mode; a request for
+MTR_LOG_SHORT_INSERTS is ignored while the mode is MTR_LOG_NONE.
+@return	old mode */
+UNIV_INLINE
+ulint
+mtr_set_log_mode(
+/*=============*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	ulint	mode)	/*!< in: logging mode: MTR_LOG_NONE, ... */
+{
+	ulint	previous;
+
+	ut_ad(mtr);
+	ut_ad(mode >= MTR_LOG_ALL);
+	ut_ad(mode <= MTR_LOG_SHORT_INSERTS);
+
+	previous = mtr->log_mode;
+	/* Logging that is switched off stays off when only the
+	shorter insert format is asked for. */
+	if (mode != MTR_LOG_SHORT_INSERTS || previous != MTR_LOG_NONE) {
+		mtr->log_mode = mode;
+	}
+
+	ut_ad(previous >= MTR_LOG_ALL);
+	ut_ad(previous <= MTR_LOG_SHORT_INSERTS);
+
+	return(previous);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Acquires an rw-lock in s-mode and records it in the mtr memo. */
+UNIV_INLINE
+void
+mtr_s_lock_func(
+/*============*/
+	rw_lock_t*	lock,	/*!< in: rw-lock */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line number */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(lock);
+	ut_ad(mtr);
+	/* file and line are passed through to the rw-lock code. */
+	rw_lock_s_lock_func(lock, 0, file, line);
+	/* Remember the latch in the memo as MTR_MEMO_S_LOCK. */
+	mtr_memo_push(mtr, lock, MTR_MEMO_S_LOCK);
+}
+
+/*********************************************************************//**
+Acquires an rw-lock in x-mode and records it in the mtr memo. */
+UNIV_INLINE
+void
+mtr_x_lock_func(
+/*============*/
+	rw_lock_t*	lock,	/*!< in: rw-lock */
+	const char*	file,	/*!< in: file name */
+	ulint		line,	/*!< in: line number */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ut_ad(lock);
+	ut_ad(mtr);
+	/* file and line are passed through to the rw-lock code. */
+	rw_lock_x_lock_func(lock, 0, file, line);
+	/* Remember the latch in the memo as MTR_MEMO_X_LOCK. */
+	mtr_memo_push(mtr, lock, MTR_MEMO_X_LOCK);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/mtr0types.h b/storage/xtradb/include/mtr0types.h
new file mode 100644
index 00000000000..83a7aaf3839
--- /dev/null
+++ b/storage/xtradb/include/mtr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mtr0types.h
+Mini-transaction buffer global types
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef mtr0types_h
+#define mtr0types_h
+
+typedef struct mtr_struct	mtr_t;
+
+#endif
diff --git a/storage/xtradb/include/mysql_addons.h b/storage/xtradb/include/mysql_addons.h
new file mode 100644
index 00000000000..17660c18710
--- /dev/null
+++ b/storage/xtradb/include/mysql_addons.h
@@ -0,0 +1,33 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/mysql_addons.h
+This file contains functions that need to be added to
+MySQL code but have not been added yet.
+
+Whenever you add a function here submit a MySQL bug
+report (feature request) with the implementation. Then
+write the bug number in the comment before the
+function in this file.
+
+When MySQL commits the function it can be deleted from
+here. In a perfect world this file exists but is empty.
+
+Created November 07, 2007 Vasil Dimov
+*******************************************************/
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
new file mode 100644
index 00000000000..eeab8a2b5d9
--- /dev/null
+++ b/storage/xtradb/include/os0file.h
@@ -0,0 +1,794 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file include/os0file.h
+The interface to the operating system file io
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0file_h
+#define os0file_h
+
+#include "univ.i"
+#include "trx0types.h"
+
+#ifndef __WIN__
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#endif
+
+/** File node of a tablespace or the log data space */
+typedef	struct fil_node_struct	fil_node_t;
+
+#ifdef UNIV_DO_FLUSH
+extern ibool	os_do_not_call_flush_at_each_write;
+#endif /* UNIV_DO_FLUSH */
+extern ibool	os_has_said_disk_full;
+/** Flag: enable debug printout for asynchronous i/o */
+extern ibool	os_aio_print_debug;
+
+/** Number of pending os_file_pread() operations */
+extern ulint	os_file_n_pending_preads;
+/** Number of pending os_file_pwrite() operations */
+extern ulint	os_file_n_pending_pwrites;
+
+/** Number of pending read operations */
+extern ulint	os_n_pending_reads;
+/** Number of pending write operations */
+extern ulint	os_n_pending_writes;
+
+#ifdef __WIN__
+
+/** We always define WIN_ASYNC_IO, and check at run-time whether
+   the OS actually supports it: Win 95 does not, NT does. */
+#define WIN_ASYNC_IO
+
+/** Use unbuffered I/O */
+#define UNIV_NON_BUFFERED_IO
+
+#endif
+
+#ifdef __WIN__
+/** File handle */
+#define os_file_t	HANDLE
+/** Convert a C file descriptor to a native file handle
+@param fd	file descriptor
+@return		native file handle */
+#define OS_FILE_FROM_FD(fd) (HANDLE) _get_osfhandle(fd)
+#else
+/** File handle */
+typedef int	os_file_t;
+/** Convert a C file descriptor to a native file handle
+@param fd	file descriptor
+@return		native file handle */
+#define OS_FILE_FROM_FD(fd) fd
+#endif
+
+/** Umask for creating files */
+extern ulint	os_innodb_umask;
+
+/** If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads */
+
+extern ibool	os_aio_use_native_aio;
+
+/** The next value should be smaller or equal to the smallest sector size used
+on any disk. A log block is required to be a portion of disk which is written
+so that if the start and the end of a block get written to disk, then the
+whole block gets written. This should be true even in most cases of a crash:
+if this fails for a log block, then it is equivalent to a media failure in the
+log. */
+
+#define OS_FILE_LOG_BLOCK_SIZE		512
+
+/** Options for file_create @{ */
+#define	OS_FILE_OPEN			51
+#define	OS_FILE_CREATE			52
+#define OS_FILE_OVERWRITE		53
+#define OS_FILE_OPEN_RAW		54
+#define	OS_FILE_CREATE_PATH		55
+#define	OS_FILE_OPEN_RETRY		56	/* for os_file_create() on
+						the first ibdata file */
+
+#define OS_FILE_READ_ONLY		333
+#define	OS_FILE_READ_WRITE		444
+#define	OS_FILE_READ_ALLOW_DELETE	555	/* for ibbackup */
+
+/* Options for os_file_create(): values of the 'purpose' argument */
+#define	OS_FILE_AIO			61
+#define	OS_FILE_NORMAL			62
+/* @} */
+
+/** Types for file create @{ */
+#define	OS_DATA_FILE			100
+#define OS_LOG_FILE			101
+/* @} */
+
+/** Error codes from os_file_get_last_error @{ */
+#define	OS_FILE_NOT_FOUND		71
+#define	OS_FILE_DISK_FULL		72
+#define	OS_FILE_ALREADY_EXISTS		73
+#define	OS_FILE_PATH_ERROR		74
+#define	OS_FILE_AIO_RESOURCES_RESERVED	75	/* wait for OS aio resources
+						to become available again */
+#define	OS_FILE_SHARING_VIOLATION	76
+#define	OS_FILE_ERROR_NOT_SPECIFIED	77
+#define	OS_FILE_INSUFFICIENT_RESOURCE	78
+#define	OS_FILE_OPERATION_ABORTED	79
+/* @} */
+
+/** Types for aio operations @{ */
+#define OS_FILE_READ	10
+#define OS_FILE_WRITE	11
+
+#define OS_FILE_LOG	256	/* This can be ORed to type */
+/* @} */
+
+#define OS_AIO_N_PENDING_IOS_PER_THREAD 32	/*!< Win NT does not allow more
+						than 64 */
+
+/** Modes for aio operations @{ */
+#define OS_AIO_NORMAL	21	/*!< Normal asynchronous i/o not for ibuf
+				pages or ibuf bitmap pages */
+#define OS_AIO_IBUF	22	/*!< Asynchronous i/o for ibuf pages or ibuf
+				bitmap pages */
+#define OS_AIO_LOG	23	/*!< Asynchronous i/o for the log */
+#define OS_AIO_SYNC	24	/*!< Asynchronous i/o where the calling thread
+				will itself wait for the i/o to complete,
+				doing also the job of the i/o-handler thread;
+				can be used for any pages, ibuf or non-ibuf.
+				This is used to save CPU time, as we can do
+				with fewer thread switches. Plain synchronous
+				i/o is not as good, because it must serialize
+				the file seek and read or write, causing a
+				bottleneck for parallelism. */
+
+#define OS_AIO_SIMULATED_WAKE_LATER	512 /*!< This can be ORed to mode
+				in the call of os_aio(...),
+				if the caller wants to post several i/o
+				requests in a batch, and only after that
+				wake the i/o-handler thread; this has
+				effect only in simulated aio */
+/* @} */
+
+#define OS_WIN31	1	/*!< Microsoft Windows 3.x */
+#define OS_WIN95	2	/*!< Microsoft Windows 95 */
+#define OS_WINNT	3	/*!< Microsoft Windows NT 3.x */
+#define OS_WIN2000	4	/*!< Microsoft Windows 2000 */
+
+extern ulint	os_n_file_reads;
+extern ulint	os_n_file_writes;
+extern ulint	os_n_fsyncs;
+
+/* File types for directory entry data type */
+
+enum os_file_type_enum{
+	OS_FILE_TYPE_UNKNOWN = 0,
+	OS_FILE_TYPE_FILE,			/* regular file */
+	OS_FILE_TYPE_DIR,			/* directory */
+	OS_FILE_TYPE_LINK			/* symbolic link */
+};
+typedef enum os_file_type_enum	  os_file_type_t;
+
+/* Maximum path string length in bytes when referring to tables in the
+'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers
+of this size from the thread stack; that is why this should not be made much
+bigger than 4000 bytes */
+#define OS_FILE_MAX_PATH	4000
+
+/* Struct used in fetching information of a file in a directory */
+struct os_file_stat_struct{
+	char		name[OS_FILE_MAX_PATH];	/*!< path to a file */
+	os_file_type_t	type;			/*!< file type */
+	ib_int64_t	size;			/*!< file size */
+	time_t		ctime;			/*!< creation time */
+	time_t		mtime;			/*!< modification time */
+	time_t		atime;			/*!< access time */
+};
+typedef struct os_file_stat_struct	os_file_stat_t;
+
+#ifdef __WIN__
+typedef HANDLE	os_file_dir_t;	/*!< directory stream */
+#else
+typedef DIR*	os_file_dir_t;	/*!< directory stream */
+#endif
+
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return	OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */
+UNIV_INTERN
+ulint
+os_get_os_version(void);
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates the seek mutexes used in positioned reads and writes. */
+UNIV_INTERN
+void
+os_io_init_simple(void);
+/*===================*/
+/***********************************************************************//**
+Creates a temporary file.  This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function is like tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag.
+@return	temporary file handle, or NULL on error */
+
+FILE*
+os_file_create_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return	directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+	const char*	dirname,	/*!< in: directory name; it must not
+					contain a trailing '\' or '/' */
+	ibool		error_is_fatal);/*!< in: TRUE if we should treat an
+					error as a fatal error; if we try to
+					open symlinks then we do not wish a
+					fatal error if it happens not to be
+					a directory */
+/***********************************************************************//**
+Closes a directory stream.
+@return	0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+	os_file_dir_t	dir);	/*!< in: directory stream */
+/***********************************************************************//**
+This function returns information of the next file in the directory. We jump
+over the '.' and '..' entries in the directory.
+@return	0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+	const char*	dirname,/*!< in: directory name or path */
+	os_file_dir_t	dir,	/*!< in: directory stream */
+	os_file_stat_t*	info);	/*!< in/out: buffer where the info is returned */
+/*****************************************************************//**
+This function attempts to create a directory named pathname. The new directory
+gets default permissions. On Unix, the permissions are (0770 & ~umask). If the
+directory exists already, nothing is done and the call succeeds, unless the
+fail_if_exists argument is true.
+@return	TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+	const char*	pathname,	/*!< in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists);/*!< in: if TRUE, pre-existing directory
+					is treated as an error. */
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+				opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error), or
+				OS_FILE_CREATE_PATH if new file
+				(if exists, error) and subdirectories along
+				its path are created (if needed)*/
+	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
+				OS_FILE_READ_WRITE */
+	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+A simple function to open or create a file.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
+				is opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error) */
+	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
+				OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file */
+	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+	int		fd,		/*!< in: file descriptor to alter */
+	const char*	file_name,	/*!< in: file name, used in the
+					diagnostic message */
+	const char*	operation_name);/*!< in: "open" or "create"; used in the
+					diagnostic message */
+/****************************************************************//**
+Opens an existing file or creates a new.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create(
+/*===========*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
+				is opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error),
+				OS_FILE_OVERWRITE if a new file is created
+				or an old overwritten;
+				OS_FILE_OPEN_RAW, if a raw device or disk
+				partition should be opened */
+	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+				non-buffered i/o is desired,
+				OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async i/o or unbuffered i/o: look in the
+				function source code for the exact rules */
+	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
+	ibool*		success);/*!< out: TRUE if succeed, FALSE if error */
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete(
+/*===========*/
+	const char*	name);	/*!< in: file path as a null-terminated string */
+
+/***********************************************************************//**
+Deletes a file if it exists. The file has to be closed before calling this.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+	const char*	name);	/*!< in: file path as a null-terminated string */
+/***********************************************************************//**
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename(
+/*===========*/
+	const char*	oldpath,	/*!< in: old file path as a
+					null-terminated string */
+	const char*	newpath);	/*!< in: new file path */
+/***********************************************************************//**
+Closes a file handle. In case of error, error number can be retrieved with
+os_file_get_last_error.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close(
+/*==========*/
+	os_file_t	file);	/*!< in, own: handle to a file */
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+	os_file_t	file);	/*!< in, own: handle to a file */
+#endif /* UNIV_HOTBACKUP */
+/***********************************************************************//**
+Gets a file size.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_get_size(
+/*=============*/
+	os_file_t	file,	/*!< in: handle to a file */
+	ulint*		size,	/*!< out: least significant 32 bits of file
+				size */
+	ulint*		size_high);/*!< out: most significant 32 bits of size */
+/***********************************************************************//**
+Gets file size as a 64-bit integer ib_int64_t.
+@return	size in bytes, -1 if error */
+UNIV_INTERN
+ib_int64_t
+os_file_get_size_as_iblonglong(
+/*===========================*/
+	os_file_t	file);	/*!< in: handle to a file */
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	ulint		size,	/*!< in: least significant 32 bits of file
+				size */
+	ulint		size_high);/*!< in: most significant 32 bits of size */
+/***********************************************************************//**
+Truncates a file at its current position.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+	FILE*		file);	/*!< in: file to be truncated */
+/***********************************************************************//**
+Flushes the write buffers of a given file to the disk.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush(
+/*==========*/
+	os_file_t	file);	/*!< in, own: handle to a file */
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return	error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	ibool	report_all_errors);	/*!< in: TRUE if we want an error message
+					printed of all errors */
+/*******************************************************************//**
+Requests a synchronous read operation.
+@return	TRUE if request was successful, FALSE if fail */
+#define os_file_read(file, buf, offset, offset_high, n)         \
+		_os_file_read(file, buf, offset, offset_high, n, NULL)
+
+UNIV_INTERN
+ibool
+_os_file_read(
+/*=========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read */
+	ulint		offset_high,/*!< in: most significant 32 bits of
+				offset */
+	ulint		n,	/*!< in: number of bytes to read */
+	trx_t*		trx);
+/*******************************************************************//**
+Rewind file to its start, read at most size - 1 bytes from it to str, and
+NUL-terminate str. All errors are silently ignored. This function is
+mostly meant to be used with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+	FILE*	file,	/*!< in: file to read from */
+	char*	str,	/*!< in: buffer where to read */
+	ulint	size);	/*!< in: size of buffer */
+/*******************************************************************//**
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read */
+	ulint		offset_high,/*!< in: most significant 32 bits of
+				offset */
+	ulint		n);	/*!< in: number of bytes to read */
+
+/*******************************************************************//**
+Requests a synchronous write operation.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from which to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to write */
+	ulint		offset_high,/*!< in: most significant 32 bits of
+				offset */
+	ulint		n);	/*!< in: number of bytes to write */
+/*******************************************************************//**
+Check the existence and type of the given file.
+@return	TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+	const char*	path,	/*!< in:	pathname of the file */
+	ibool*		exists,	/*!< out: TRUE if file exists */
+	os_file_type_t* type);	/*!< out: type of the file (if it exists) */
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string.  In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'.  Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+       path	      dirname	     basename
+       "/usr/lib"     "/usr"	     "lib"
+       "/usr/"	      "/"	     "usr"
+       "usr"	      "."	     "usr"
+       "/"	      "/"	     "/"
+       "."	      "."	     "."
+       ".."	      "."	     ".."
+
+@return	own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+	const char*	path);	/*!< in: pathname */
+/****************************************************************//**
+Creates all missing subdirectories along the given path.
+@return	TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+	const char*	path);	/*!< in: path name */
+/***********************************************************************//**
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync);	/*!< in: number of slots in the sync aio
+				array */
+/***********************************************************************//**
+Frees the asynchronous io system. */
+UNIV_INTERN
+void
+os_aio_free(void);
+/*=============*/
+
+/*******************************************************************//**
+Requests an asynchronous i/o operation.
+@return	TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
+	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
+				to OS_AIO_SIMULATED_WAKE_LATER: the
+				last flag advises this function not to wake
+				i/o-handler threads, but the caller will
+				do the waking explicitly later, in this
+				way the caller can post several requests in
+				a batch; NOTE that the batch must not be
+				so big that it exhausts the slots in aio
+				arrays! NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read or from which
+				to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read or write */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		n,	/*!< in: number of bytes to read or write */
+	fil_node_t*	message1,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	void*		message2,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	trx_t*		trx);
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void);
+/*=====================================*/
+/************************************************************************//**
+Waits until there are no pending writes in os_aio_write_array. There can
+be other, synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void);
+/*=====================================*/
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void);
+/*=======================================*/
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void);
+/*============================================*/
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+@return	TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads; if
+				this is ULINT_UNDEFINED, then it means that
+				sync aio is used, and this parameter is
+				ignored */
+	ulint	pos,		/*!< this parameter is used only in sync aio:
+				wait for the aio slot at this position */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,
+	ulint*	type);		/*!< out: OS_FILE_WRITE or ..._READ */
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread.
+@return	TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,
+	ulint*	type);		/*!< out: OS_FILE_WRITE or ..._READ */
+/**********************************************************************//**
+Validates the consistency of the aio system.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void);
+/*=================*/
+/**********************************************************************//**
+Prints info of the aio arrays. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+	FILE*	file);	/*!< in: file where to print */
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void);
+/*======================*/
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void);
+/*=======================*/
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+This function returns information about the specified file
+@return	TRUE if stat information found */
+UNIV_INTERN
+ibool
+os_file_get_status(
+/*===============*/
+	const char*	path,		/*!< in:	pathname of the file */
+	os_file_stat_t* stat_info);	/*!< information of a file in a
+					directory */
+
+#if !defined(UNIV_HOTBACKUP) && !defined(__NETWARE__)
+/*********************************************************************//**
+Creates a temporary file that will be deleted on close.
+This function is defined in ha_innodb.cc.
+@return	temporary file descriptor, or < 0 on error */
+UNIV_INTERN
+int
+innobase_mysql_tmpfile(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP && !__NETWARE__ */
+
+#endif
diff --git a/storage/xtradb/include/os0proc.h b/storage/xtradb/include/os0proc.h
new file mode 100644
index 00000000000..582cef6f803
--- /dev/null
+++ b/storage/xtradb/include/os0proc.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.h
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0proc_h
+#define os0proc_h
+
+#include "univ.i"
+
+#ifdef UNIV_LINUX
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#else /* UNIV_LINUX */
+# if defined HAVE_SYS_IPC_H && defined HAVE_SYS_SHM_H /* need both headers */
+#include <sys/ipc.h>
+#include <sys/shm.h>
+# endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+#endif /* UNIV_LINUX */
+
+typedef void*			os_process_t;
+typedef unsigned long int	os_process_id_t;
+
+extern ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+extern ulint os_large_page_size;
+
+/****************************************************************//**
+Converts the current process id to a number. It is not guaranteed that the
+number is unique. On Linux this is the 'process number' of the current
+thread, i.e. the number that 'top' shows; the Linux thread id is a
+different value and is not the one shown by 'top'.
+@return	process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void);
+/*====================*/
+/****************************************************************//**
+Allocates large pages memory.
+@return	allocated memory */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+	ulint*	n);			/*!< in/out: number of bytes */
+/****************************************************************//**
+Frees large pages memory. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+	void	*ptr,			/*!< in: pointer returned by
+					os_mem_alloc_large() */
+	ulint	size);			/*!< in: size returned by
+					os_mem_alloc_large() */
+
+
+/****************************************************************//**
+Allocates a shared memory segment, or attaches to and reuses an existing one.
+The content is not cleared automatically.
+@return	allocated memory */
+UNIV_INTERN
+void*
+os_shm_alloc(
+/*=========*/
+	ulint*	n,			/*!< in/out: number of bytes */
+	uint	key,			/*!< in: segment key (TODO confirm) */
+	ibool*	is_new);		/*!< presumably out: TRUE if created anew; confirm */
+
+/****************************************************************//**
+Detach shared memory segment. */
+UNIV_INTERN
+void
+os_shm_free(
+/*========*/
+	void	*ptr,			/*!< in: pointer returned by
+					os_shm_alloc() */
+	ulint	size);			/*!< in: size returned by
+					os_shm_alloc() */
+#ifndef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0proc.ic b/storage/xtradb/include/os0proc.ic
new file mode 100644
index 00000000000..c9641644525
--- /dev/null
+++ b/storage/xtradb/include/os0proc.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0proc.ic
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/xtradb/include/os0sync.h b/storage/xtradb/include/os0sync.h
new file mode 100644
index 00000000000..c230a03b6db
--- /dev/null
+++ b/storage/xtradb/include/os0sync.h
@@ -0,0 +1,445 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.h
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0sync_h
+#define os0sync_h
+
+#include "univ.i"
+#include "ut0lst.h"
+
+#ifdef __WIN__
+
+/** Native mutex */
+#define os_fast_mutex_t CRITICAL_SECTION
+
+/** Native event */
+typedef HANDLE		os_native_event_t;
+
+/** Operating system event */
+typedef struct os_event_struct	os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t*	os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+	os_native_event_t		  handle;
+					/*!< Windows event */
+	UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+					/*!< list of all created events */
+};
+#else
+/** Native mutex */
+typedef pthread_mutex_t	os_fast_mutex_t;
+
+/** Operating system event */
+typedef struct os_event_struct	os_event_struct_t;
+/** Operating system event handle */
+typedef os_event_struct_t*	os_event_t;
+
+/** An asynchronous signal sent between threads */
+struct os_event_struct {
+	os_fast_mutex_t	os_mutex;	/*!< this mutex protects the next
+					fields */
+	ibool		is_set;		/*!< this is TRUE when the event is
+					in the signaled state, i.e., a thread
+					does not stop if it tries to wait for
+					this event */
+	ib_int64_t	signal_count;	/*!< this is incremented each time
+					the event becomes signaled */
+	pthread_cond_t	cond_var;	/*!< condition variable is used in
+					waiting for the event */
+	UT_LIST_NODE_T(os_event_struct_t) os_event_list;
+					/*!< list of all created events */
+};
+#endif
+
+/** Operating system mutex */
+typedef struct os_mutex_struct	os_mutex_str_t;
+/** Operating system mutex handle */
+typedef os_mutex_str_t*		os_mutex_t;
+
+/** Denotes an infinite delay for os_event_wait_time() */
+#define OS_SYNC_INFINITE_TIME	((ulint)(-1))
+
+/** Return value of os_event_wait_time() when the time is exceeded */
+#define OS_SYNC_TIME_EXCEEDED	1
+
+/** Mutex protecting counts and the event and OS 'slow' mutex lists */
+extern os_mutex_t	os_sync_mutex;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+extern ulint		os_thread_count;
+
+extern ulint		os_event_count;
+extern ulint		os_mutex_count;
+extern ulint		os_fast_mutex_count;
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists. */
+UNIV_INTERN
+void
+os_sync_init(void);
+/*==============*/
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. */
+UNIV_INTERN
+void
+os_sync_free(void);
+/*==============*/
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two states:
+signaled and nonsignaled. The created event is manual reset: it must be reset
+explicitly by calling os_event_reset().
+@return	the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(
+/*============*/
+	const char*	name);	/*!< in: the name of the event, if NULL
+				the event is created without a name */
+/**********************************************************//**
+Sets an event semaphore to the signaled state: lets waiting threads
+proceed. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+	os_event_t	event);	/*!< in: event to set */
+/**********************************************************//**
+Resets an event semaphore to the nonsignaled state. Threads that wait for
+the event after this call will stop until it is signaled again.
+The return value should be passed to os_event_wait_low() if it is desired
+that this thread should not wait in case of an intervening call to
+os_event_set() between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low(). */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+	os_event_t	event);	/*!< in: event to reset */
+/**********************************************************//**
+Frees an event object. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+	os_event_t	event);	/*!< in: event to free */
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set()   [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait()  [infinite wait!]
+thread C calls os_event_wait()  [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+	os_event_t	event,		/*!< in: event to wait */
+	ib_int64_t	reset_sig_count);/*!< in: zero or the value
+					returned by previous call of
+					os_event_reset(). */
+
+#define os_event_wait(event) os_event_wait_low(event, 0)
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded.
+@return	0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+	os_event_t	event,	/*!< in: event to wait */
+	ulint		time);	/*!< in: timeout in microseconds, or
+				OS_SYNC_INFINITE_TIME */
+#ifdef __WIN__
+/**********************************************************//**
+Waits for any event in an OS native event array. Returns if even a single
+one is signaled or becomes signaled.
+@return	index of the event which was signaled */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+	ulint			n,	/*!< in: number of events in the
+					array */
+	os_native_event_t*	native_event_array);
+					/*!< in: pointer to an array of event
+					handles */
+#endif
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+@return	the mutex handle */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+	const char*	name);	/*!< in: the name of the mutex, if NULL
+				the mutex is created without a name */
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+	os_mutex_t	mutex);	/*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+	os_mutex_t	mutex);	/*!< in: mutex to release */
+/**********************************************************//**
+Frees a mutex object. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+	os_mutex_t	mutex);	/*!< in: mutex to free */
+/**********************************************************//**
+Tries to acquire ownership of a fast mutex. Platform-specific behavior is
+documented at the inline definition in os0sync.ic.
+@return	0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+/**********************************************************//**
+Releases ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to release */
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in: fast mutex */
+/**********************************************************//**
+Acquires ownership of a fast mutex. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to acquire */
+/**********************************************************//**
+Frees a fast mutex object. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex);	/*!< in: mutex to free */
+
+/**********************************************************//**
+Atomic compare-and-swap and increment for InnoDB. */
+
+#if defined(HAVE_IB_GCC_ATOMIC_BUILTINS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/**********************************************************//**
+Returns true if swapped, ptr is pointer to target, old_val is value to
+compare to, new_val is the value to swap in. */
+
+# define os_compare_and_swap(ptr, old_val, new_val) \
+	__sync_bool_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+	os_compare_and_swap(ptr, old_val, new_val)
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+	os_compare_and_swap(ptr, old_val, new_val)
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_GCC
+#  define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+	os_compare_and_swap(ptr, old_val, new_val)
+#  define INNODB_RW_LOCKS_USE_ATOMICS
+#  define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes and rw_locks use GCC atomic builtins"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+#  define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes use GCC atomic builtins, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_GCC */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment(ptr, amount) \
+	__sync_add_and_fetch(ptr, amount)
+
+# define os_atomic_increment_lint(ptr, amount) \
+	os_atomic_increment(ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+	os_atomic_increment(ptr, amount)
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+	__sync_lock_test_and_set(ptr, new_val)
+
+#elif defined(HAVE_IB_SOLARIS_ATOMICS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/* If not compiling with GCC or GCC doesn't support the atomic
+intrinsics and running on Solaris >= 10 use Solaris atomics */
+
+#include <atomic.h>
+
+/**********************************************************//**
+Returns true if swapped. ptr is pointer to target, new_val is the value to
+swap in, old_val is the value to compare to; old_val is expanded twice. */
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+	(atomic_cas_ulong(ptr, old_val, new_val) == (old_val))
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+	((lint)atomic_cas_ulong((ulong_t*) (ptr), old_val, new_val) == (old_val))
+
+# ifdef HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS
+#  if SIZEOF_PTHREAD_T == 4
+#   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+	((pthread_t)atomic_cas_32(ptr, old_val, new_val) == old_val)
+#  elif SIZEOF_PTHREAD_T == 8
+#   define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+	((pthread_t)atomic_cas_64(ptr, old_val, new_val) == old_val)
+#  else
+#   error "SIZEOF_PTHREAD_T != 4 or 8"
+#  endif /* SIZEOF_PTHREAD_T CHECK */
+#  define INNODB_RW_LOCKS_USE_ATOMICS
+#  define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes and rw_locks use Solaris atomic functions"
+# else /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+#  define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes use Solaris atomic functions, rw_locks do not"
+# endif /* HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS */
+
+/**********************************************************//**
+Returns the resulting value, ptr is pointer to target, amount is the
+amount of increment. */
+
+# define os_atomic_increment_lint(ptr, amount) \
+	atomic_add_long_nv((ulong_t*) ptr, amount)
+
+# define os_atomic_increment_ulint(ptr, amount) \
+	atomic_add_long_nv(ptr, amount)
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+	atomic_swap_uchar(ptr, new_val)
+
+#elif defined(HAVE_WINDOWS_ATOMICS)
+
+#define HAVE_ATOMIC_BUILTINS
+
+/* On Windows, use Windows atomics / interlocked */
+# ifdef _WIN64
+#  define win_cmp_and_xchg InterlockedCompareExchange64
+#  define win_xchg_and_add InterlockedExchangeAdd64
+# else /* _WIN64 */
+#  define win_cmp_and_xchg InterlockedCompareExchange
+#  define win_xchg_and_add InterlockedExchangeAdd
+# endif
+
+/**********************************************************//**
+Returns true if swapped. ptr is pointer to target, new_val is the value to
+swap in, old_val is the value to compare to; old_val is expanded twice. */
+
+# define os_compare_and_swap_ulint(ptr, old_val, new_val) \
+	(win_cmp_and_xchg(ptr, new_val, old_val) == (old_val))
+
+# define os_compare_and_swap_lint(ptr, old_val, new_val) \
+	(win_cmp_and_xchg(ptr, new_val, old_val) == (old_val))
+
+/* windows thread objects can always be passed to windows atomic functions */
+# define os_compare_and_swap_thread_id(ptr, old_val, new_val) \
+	(InterlockedCompareExchange(ptr, new_val, old_val) == old_val)
+# define INNODB_RW_LOCKS_USE_ATOMICS
+# define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes and rw_locks use Windows interlocked functions"
+
+/**********************************************************//**
+Returns the resulting value. ptr is pointer to target, amount is the amount
+of increment; amount is expanded twice, so it must have no side effects. */
+
+# define os_atomic_increment_lint(ptr, amount) \
+	(win_xchg_and_add(ptr, amount) + (amount))
+
+# define os_atomic_increment_ulint(ptr, amount) \
+	((ulint) (win_xchg_and_add(ptr, amount) + (amount)))
+
+/**********************************************************//**
+Returns the old value of *ptr, atomically sets *ptr to new_val.
+InterlockedExchange() operates on LONG, and the LONG will be
+clobbered */
+
+# define os_atomic_test_and_set_byte(ptr, new_val) \
+	((byte) InterlockedExchange(ptr, new_val))
+
+#else
+# define IB_ATOMICS_STARTUP_MSG \
+	"Mutexes and rw_locks use InnoDB's own implementation"
+#endif
+
+#ifndef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0sync.ic b/storage/xtradb/include/os0sync.ic
new file mode 100644
index 00000000000..1f3ce38fa65
--- /dev/null
+++ b/storage/xtradb/include/os0sync.ic
@@ -0,0 +1,53 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0sync.ic
+The interface to the operating system synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#ifdef __WIN__
+#include <winbase.h>
+#endif
+
+/**********************************************************//**
+Tries to acquire ownership of a fast mutex without blocking. If the mutex
+is held by another thread, returns immediately with a nonzero value.
+@return	0 if success, != 0 if was reserved by another thread */
+UNIV_INLINE
+ulint
+os_fast_mutex_trylock(
+/*==================*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+	/* TryEnterCriticalSection() returns nonzero when the critical
+	section was entered; negate it so that 0 means success. */
+	return((ulint) !TryEnterCriticalSection(fast_mutex));
+#else
+	/* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock
+	so that it returns 0 on success. In the operating system
+	libraries, HP-UX-10.20 follows the old Posix 1003.4a Draft 4 and
+	returns 1 on success (but MySQL remaps that to 0), while Linux,
+	FreeBSD, Solaris, AIX, Tru64 Unix, HP-UX-11.0 return 0 on success. */
+
+	return((ulint) pthread_mutex_trylock(fast_mutex));
+#endif
+}
diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h
new file mode 100644
index 00000000000..6583de0005f
--- /dev/null
+++ b/storage/xtradb/include/os0thread.h
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.h
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef os0thread_h
+#define os0thread_h
+
+#include "univ.i"
+
+/* Maximum number of threads which can be created in the program;
+this is also the size of the wait slot array for MySQL threads which
+can wait inside InnoDB */
+
+#define	OS_THREAD_MAX_N		srv_max_n_threads
+
+
+/* Possible fixed priorities for threads */
+#define OS_THREAD_PRIORITY_NONE		100
+#define OS_THREAD_PRIORITY_BACKGROUND	1
+#define OS_THREAD_PRIORITY_NORMAL	2
+#define OS_THREAD_PRIORITY_ABOVE_NORMAL	3
+
+#ifdef __WIN__
+typedef void*			os_thread_t;
+typedef unsigned long		os_thread_id_t;	/*!< In Windows the thread id
+						is an unsigned long int */
+#else
+typedef pthread_t		os_thread_t;
+typedef os_thread_t		os_thread_id_t;	/*!< In Unix we use the thread
+						handle itself as the id of
+						the thread */
+#endif
+
+/* Define a function pointer type to use in a typecast */
+typedef void* (*os_posix_f_t) (void*);
+
+/***************************************************************//**
+Compares two thread ids for equality.
+@return	TRUE if equal */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+	os_thread_id_t	a,	/*!< in: OS thread or thread id */
+	os_thread_id_t	b);	/*!< in: OS thread or thread id */
+/****************************************************************//**
+Converts an OS thread id to a ulint. It is NOT guaranteed that the ulint is
+unique for the thread though!
+@return	thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+	os_thread_id_t	a);	/*!< in: OS thread identifier */
+/****************************************************************//**
+Creates a new thread of execution. The execution starts from
+the function given. The start function takes a void* parameter and
+returns a ulint on Windows or a void* elsewhere (see os_posix_f_t).
+NOTE: We count the number of threads in os_thread_exit(). A created
+thread should always use that to exit and not use return() to exit.
+@return	handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create(
+/*=============*/
+#ifndef __WIN__
+		 os_posix_f_t		 start_f,	/*!< in: start function */
+#else
+	ulint (*start_f)(void*),		/*!< in: pointer to function
+						from which to start */
+#endif
+	void*			arg,		/*!< in: argument to start
+						function */
+	os_thread_id_t*		thread_id);	/*!< out: id of the created
+						thread, or NULL */
+
+/*****************************************************************//**
+Exits the current thread. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+	void*	exit_value);	/*!< in: exit value; in Windows this void*
+				is cast as a DWORD */
+/*****************************************************************//**
+Returns the thread identifier of current thread.
+@return	current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void);
+/*========================*/
+/*****************************************************************//**
+Returns handle to the current thread.
+@return	current thread handle */
+UNIV_INTERN
+os_thread_t
+os_thread_get_curr(void);
+/*====================*/
+/*****************************************************************//**
+Advises the os to give up remainder of the thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void);
+/*=================*/
+/*****************************************************************//**
+The thread sleeps at least the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+	ulint	tm);	/*!< in: time in microseconds */
+/******************************************************************//**
+Gets a thread priority.
+@return	priority */
+UNIV_INTERN
+ulint
+os_thread_get_priority(
+/*===================*/
+	os_thread_t	handle);/*!< in: OS handle to the thread */
+/******************************************************************//**
+Sets a thread priority. */
+UNIV_INTERN
+void
+os_thread_set_priority(
+/*===================*/
+	os_thread_t	handle,	/*!< in: OS handle to the thread */
+	ulint		pri);	/*!< in: priority: one of OS_THREAD_PRIORITY_... */
+/******************************************************************//**
+Gets the last operating system error code for the calling thread.
+@return	last error on Windows, 0 otherwise */
+UNIV_INTERN
+ulint
+os_thread_get_last_error(void);
+/*==========================*/
+
+#ifndef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/os0thread.ic b/storage/xtradb/include/os0thread.ic
new file mode 100644
index 00000000000..f89bc40b4fa
--- /dev/null
+++ b/storage/xtradb/include/os0thread.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/os0thread.ic
+The interface to the operating system
+process and thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/page0cur.h b/storage/xtradb/include/page0cur.h
new file mode 100644
index 00000000000..6b444b3dd96
--- /dev/null
+++ b/storage/xtradb/include/page0cur.h
@@ -0,0 +1,362 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.h
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef page0cur_h
+#define page0cur_h
+
+#include "univ.i"
+
+#include "buf0types.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "data0data.h"
+#include "mtr0mtr.h"
+
+
+#define PAGE_CUR_ADAPT
+
+/* Page cursor search modes; the values must be in this order! */
+
+#define	PAGE_CUR_UNSUPP	0
+#define	PAGE_CUR_G	1
+#define	PAGE_CUR_GE	2
+#define	PAGE_CUR_L	3
+#define	PAGE_CUR_LE	4
+/*#define PAGE_CUR_LE_OR_EXTENDS 5*/ /* This is a search mode used in
+				 "column LIKE 'abc%' ORDER BY column DESC";
+				 we have to find strings which are <= 'abc' or
+				 which extend it */
+#ifdef UNIV_SEARCH_DEBUG
+# define PAGE_CUR_DBG	6	/* As PAGE_CUR_LE, but skips search shortcut */
+#endif /* UNIV_SEARCH_DEBUG */
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return	page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return	buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets the compressed page descriptor of the block the cursor is positioned on.
+@return	compressed page descriptor of the block (may be NULL) */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return	record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+	page_cur_t*	cur);	/*!< in: page cursor */
+#else /* UNIV_DEBUG */
+# define page_cur_get_page(cur)		page_align((cur)->rec)
+# define page_cur_get_block(cur)	(cur)->block
+# define page_cur_get_page_zip(cur)	buf_block_get_page_zip((cur)->block)
+# define page_cur_get_rec(cur)		(cur)->rec
+#endif /* UNIV_DEBUG */
+/*********************************************************//**
+Sets the cursor object to point before the first user record on the
+page, i.e., on the page infimum record. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< out: cursor */
+/*********************************************************//**
+Sets the cursor object to point after the last user record on the
+page, i.e., on the page supremum record. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur);	/*!< out: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return	TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return	TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur);	/*!< in: cursor */
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur);	/*!< out: page cursor */
+/**********************************************************//**
+Invalidates a page cursor by setting its record and block pointers to NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+	page_cur_t*	cur);	/*!< out: page cursor */
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+	page_cur_t*	cur);	/*!< in/out: cursor; must not be after last */
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+	page_cur_t*	cur);	/*!< in/out: cursor; not before first */
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const rec_t*	rec,	/*!< in: record to insert */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns pointer to inserted record if succeed, i.e., enough
+space available, NULL otherwise. The cursor stays at the same position.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	rec_t*		current_rec,/*!< in: pointer to current record after
+				which the new record is inserted */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page. Returns pointer to inserted record if succeed, i.e.,
+enough space available, NULL otherwise.
+The cursor stays at the same position.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	rec_t**		current_rec,/*!< in/out: pointer to current record after
+				which the new record is inserted */
+	buf_block_t*	block,	/*!< in: buffer block of *current_rec */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle, or NULL */
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+	page_t*		new_page,	/*!< in/out: index page to copy to */
+	rec_t*		rec,		/*!< in: first record to copy */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr);		/*!< in: mtr */
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the
+next record after the deleted one. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const ulint*	offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
+	mtr_t*		mtr);	/*!< in: mini-transaction handle */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return	number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	ulint			mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	page_cur_t*		cursor);/*!< out: page cursor */
+/****************************************************************//**
+Searches the right position for a page cursor. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	ulint			mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	ulint*			iup_matched_fields,
+					/*!< in/out: already matched
+					fields in upper limit record */
+	ulint*			iup_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	ulint*			ilow_matched_fields,
+					/*!< in/out: already matched
+					fields in lower limit record */
+	ulint*			ilow_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	page_cur_t*		cursor);/*!< out: page cursor */
+/***********************************************************//**
+Positions a page cursor on a randomly chosen user record on a page. If there
+are no user records, sets the cursor on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor);/*!< out: page cursor */
+
+UNIV_INTERN
+void
+page_cur_open_on_nth_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor,	/*!< out: page cursor */
+	ulint		nth);
+
+UNIV_INTERN
+ibool
+page_cur_open_on_rnd_user_rec_after_nth(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor,	/*!< out: page cursor */
+	ulint		nth);
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a log record of a record insert on a page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+	ibool		is_short,/*!< in: TRUE if short inserts */
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+/**********************************************************//**
+Parses a log record of copying a record list end to a new created page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+/***********************************************************//**
+Parses log record of a record delete on a page.
+@return	pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+
+/** Index page cursor */
+
+struct page_cur_struct{
+	byte*		rec;	/*!< pointer to a record on page */
+	buf_block_t*	block;	/*!< pointer to the block containing rec */
+};
+
+#ifndef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/page0cur.ic b/storage/xtradb/include/page0cur.ic
new file mode 100644
index 00000000000..3520677dfb3
--- /dev/null
+++ b/storage/xtradb/include/page0cur.ic
@@ -0,0 +1,299 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/page0cur.ic
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0page.h"
+#include "buf0types.h"
+
+#ifdef UNIV_DEBUG
+/*********************************************************//**
+Gets pointer to the page frame where the cursor is positioned.
+@return	page */
+UNIV_INLINE
+page_t*
+page_cur_get_page(
+/*==============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->frame);
+
+	return(page_align(cur->rec));
+}
+
+/*********************************************************//**
+Gets pointer to the buffer block where the cursor is positioned.
+@return	buffer block */
+UNIV_INLINE
+buf_block_t*
+page_cur_get_block(
+/*===============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->frame);
+	return(cur->block);
+}
+
+/*********************************************************//**
+Gets the compressed page descriptor of the block the cursor is positioned on.
+@return	compressed page descriptor of the block (may be NULL) */
+UNIV_INLINE
+page_zip_des_t*
+page_cur_get_page_zip(
+/*==================*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	return(buf_block_get_page_zip(page_cur_get_block(cur)));
+}
+
+/*********************************************************//**
+Gets the record where the cursor is positioned.
+@return	record */
+UNIV_INLINE
+rec_t*
+page_cur_get_rec(
+/*=============*/
+	page_cur_t*	cur)	/*!< in: page cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->frame);
+
+	return(cur->rec);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************//**
+Sets the cursor object to point before the first user record on the
+page, i.e., on the page infimum record. */
+UNIV_INLINE
+void
+page_cur_set_before_first(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< out: cursor */
+{
+	cur->block = (buf_block_t*) block;	/* casts away const */
+	cur->rec = page_get_infimum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Sets the cursor object to point after the last user record on the
+page, i.e., on the page supremum record. */
+UNIV_INLINE
+void
+page_cur_set_after_last(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	page_cur_t*		cur)	/*!< out: cursor */
+{
+	cur->block = (buf_block_t*) block;	/* casts away const */
+	cur->rec = page_get_supremum_rec(buf_block_get_frame(cur->block));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is before first user record on page.
+@return	TRUE if at start */
+UNIV_INLINE
+ibool
+page_cur_is_before_first(
+/*=====================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->frame);
+	return(page_rec_is_infimum(cur->rec));
+}
+
+/*********************************************************//**
+Returns TRUE if the cursor is after last user record.
+@return	TRUE if at end */
+UNIV_INLINE
+ibool
+page_cur_is_after_last(
+/*===================*/
+	const page_cur_t*	cur)	/*!< in: cursor */
+{
+	ut_ad(cur);
+	ut_ad(page_align(cur->rec) == cur->block->frame);
+	return(page_rec_is_supremum(cur->rec));
+}
+
+/**********************************************************//**
+Positions the cursor on the given record. */
+UNIV_INLINE
+void
+page_cur_position(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record on a page */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	page_cur_t*		cur)	/*!< out: page cursor */
+{
+	ut_ad(rec && block && cur);
+	ut_ad(page_align(rec) == block->frame);
+
+	cur->rec = (rec_t*) rec;
+	cur->block = (buf_block_t*) block;
+}
+
+/**********************************************************//**
+Invalidates a page cursor by setting its record and block pointers to NULL. */
+UNIV_INLINE
+void
+page_cur_invalidate(
+/*================*/
+	page_cur_t*	cur)	/*!< out: page cursor */
+{
+	ut_ad(cur);
+
+	cur->rec = NULL;
+	cur->block = NULL;
+}
+
+/**********************************************************//**
+Moves the cursor to the next record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_next(
+/*==================*/
+	page_cur_t*	cur)	/*!< in/out: cursor; must not be after last */
+{
+	ut_ad(!page_cur_is_after_last(cur));
+
+	cur->rec = page_rec_get_next(cur->rec);
+}
+
+/**********************************************************//**
+Moves the cursor to the previous record on page. */
+UNIV_INLINE
+void
+page_cur_move_to_prev(
+/*==================*/
+	page_cur_t*	cur)	/*!< in/out: page cursor, not before first */
+{
+	ut_ad(!page_cur_is_before_first(cur));
+
+	cur->rec = page_rec_get_prev(cur->rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Searches the right position for a page cursor.
+@return	number of matched fields on the left */
+UNIV_INLINE
+ulint
+page_cur_search(
+/*============*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	ulint			mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	page_cur_t*		cursor)	/*!< out: page cursor */
+{
+	ulint		low_matched_fields = 0;
+	ulint		low_matched_bytes = 0;
+	ulint		up_matched_fields = 0;
+	ulint		up_matched_bytes = 0;
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	page_cur_search_with_match(block, index, tuple, mode,
+				   &up_matched_fields,
+				   &up_matched_bytes,
+				   &low_matched_fields,
+				   &low_matched_bytes,
+				   cursor);
+	return(low_matched_fields);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_tuple_insert(
+/*==================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const dtuple_t*	tuple,	/*!< in: pointer to a data tuple */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+{
+	mem_heap_t*	heap;
+	ulint*		offsets;
+	ulint		size
+		= rec_get_converted_size(index, tuple, n_ext);
+	rec_t*		rec;
+
+	heap = mem_heap_create(size
+			       + (4 + REC_OFFS_HEADER_SIZE
+				  + dtuple_get_n_fields(tuple))
+			       * sizeof *offsets);
+	rec = rec_convert_dtuple_to_rec((byte*) mem_heap_alloc(heap, size),
+					index, tuple, n_ext);
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+	if (buf_block_get_page_zip(cursor->block)) {
+		rec = page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+					      index, rec, offsets, mtr);
+	} else {
+		rec = page_cur_insert_rec_low(cursor->rec,
+					      index, rec, offsets, mtr);
+	}
+
+	mem_heap_free(heap);
+	return(rec);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Inserts a record next to page cursor. Returns pointer to inserted record if
+succeed, i.e., enough space available, NULL otherwise. The cursor stays at
+the same logical position, but the physical position may change if it is
+pointing to a compressed page that was reorganized.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INLINE
+rec_t*
+page_cur_rec_insert(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	const rec_t*	rec,	/*!< in: record to insert */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+{
+	if (buf_block_get_page_zip(cursor->block)) {
+		return(page_cur_insert_rec_zip(&cursor->rec, cursor->block,
+					       index, rec, offsets, mtr));
+	} else {
+		return(page_cur_insert_rec_low(cursor->rec,
+					       index, rec, offsets, mtr));
+	}
+}
diff --git a/storage/xtradb/include/page0page.h b/storage/xtradb/include/page0page.h
new file mode 100644
index 00000000000..5b2bcf7c054
--- /dev/null
+++ b/storage/xtradb/include/page0page.h
@@ -0,0 +1,1015 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0page_h
+#define page0page_h
+
+#include "univ.i"
+
+#include "page0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "data0data.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "fsp0fsp.h"
+#include "mtr0mtr.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/*			PAGE HEADER
+			===========
+
+Index page header starts at the first offset left free by the FIL-module */
+
+typedef	byte		page_header_t;
+
+#define	PAGE_HEADER	FSEG_PAGE_DATA	/* index page header starts at this
+				offset */
+/*-----------------------------*/
+#define PAGE_N_DIR_SLOTS 0	/* number of slots in page directory */
+#define	PAGE_HEAP_TOP	 2	/* pointer to record heap top */
+#define	PAGE_N_HEAP	 4	/* number of records in the heap,
+				bit 15=flag: new-style compact page format */
+#define	PAGE_FREE	 6	/* pointer to start of page free record list */
+#define	PAGE_GARBAGE	 8	/* number of bytes in deleted records */
+#define	PAGE_LAST_INSERT 10	/* pointer to the last inserted record, or
+				NULL if this info has been reset by a delete,
+				for example */
+#define	PAGE_DIRECTION	 12	/* last insert direction: PAGE_LEFT, ... */
+#define	PAGE_N_DIRECTION 14	/* number of consecutive inserts to the same
+				direction */
+#define	PAGE_N_RECS	 16	/* number of user records on the page */
+#define PAGE_MAX_TRX_ID	 18	/* highest id of a trx which may have modified
+				a record on the page; a dulint; defined only
+				in secondary indexes and in the insert buffer
+				tree; NOTE: this may be modified only
+				when the thread has an x-latch to the page,
+				and ALSO an x-latch to btr_search_latch
+				if there is a hash index to the page! */
+#define PAGE_HEADER_PRIV_END 26	/* end of private data structure of the page
+				header which are set in a page create */
+/*----*/
+#define	PAGE_LEVEL	 26	/* level of the node in an index tree; the
+				leaf level is the level 0.  This field should
+				not be written to after page creation. */
+#define	PAGE_INDEX_ID	 28	/* index id where the page belongs.
+				This field should not be written to after
+				page creation. */
+#define PAGE_BTR_SEG_LEAF 36	/* file segment header for the leaf pages in
+				a B-tree: defined only on the root page of a
+				B-tree, but not in the root of an ibuf tree */
+#define PAGE_BTR_IBUF_FREE_LIST	PAGE_BTR_SEG_LEAF
+#define PAGE_BTR_IBUF_FREE_LIST_NODE PAGE_BTR_SEG_LEAF
+				/* in the place of PAGE_BTR_SEG_LEAF and _TOP
+				there is a free list base node if the page is
+				the root page of an ibuf tree, and at the same
+				place is the free list node if the page is in
+				a free list */
+#define PAGE_BTR_SEG_TOP (36 + FSEG_HEADER_SIZE)
+				/* file segment header for the non-leaf pages
+				in a B-tree: defined only on the root page of
+				a B-tree, but not in the root of an ibuf
+				tree */
+/*----*/
+#define PAGE_DATA	(PAGE_HEADER + 36 + 2 * FSEG_HEADER_SIZE)
+				/* start of data on the page */
+
+#define PAGE_OLD_INFIMUM	(PAGE_DATA + 1 + REC_N_OLD_EXTRA_BYTES)
+				/* offset of the page infimum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM	(PAGE_DATA + 2 + 2 * REC_N_OLD_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on an
+				old-style page */
+#define PAGE_OLD_SUPREMUM_END (PAGE_OLD_SUPREMUM + 9)
+				/* offset of the page supremum record end on
+				an old-style page */
+#define PAGE_NEW_INFIMUM	(PAGE_DATA + REC_N_NEW_EXTRA_BYTES)
+				/* offset of the page infimum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM	(PAGE_DATA + 2 * REC_N_NEW_EXTRA_BYTES + 8)
+				/* offset of the page supremum record on a
+				new-style compact page */
+#define PAGE_NEW_SUPREMUM_END (PAGE_NEW_SUPREMUM + 8)
+				/* offset of the page supremum record end on
+				a new-style compact page */
+/*-----------------------------*/
+
+/* Heap numbers */
+#define PAGE_HEAP_NO_INFIMUM	0	/* page infimum */
+#define PAGE_HEAP_NO_SUPREMUM	1	/* page supremum */
+#define PAGE_HEAP_NO_USER_LOW	2	/* first user record in
+					creation (insertion) order,
+					not necessarily collation order;
+					this record may have been deleted */
+
+/* Directions of cursor movement */
+#define	PAGE_LEFT		1
+#define	PAGE_RIGHT		2
+#define	PAGE_SAME_REC		3
+#define	PAGE_SAME_PAGE		4
+#define	PAGE_NO_DIRECTION	5
+
+/*			PAGE DIRECTORY
+			==============
+*/
+
+typedef	byte			page_dir_slot_t;
+typedef page_dir_slot_t		page_dir_t;
+
+/* Offset of the directory start down from the page end. We call the
+slot with the highest file address directory start, as it points to
+the first record in the list of records. */
+#define	PAGE_DIR		FIL_PAGE_DATA_END
+
+/* We define a slot in the page directory as two bytes */
+#define	PAGE_DIR_SLOT_SIZE	2
+
+/* The offset of the physically lower end of the directory, counted from
+page end, when the page is empty */
+#define PAGE_EMPTY_DIR_START	(PAGE_DIR + 2 * PAGE_DIR_SLOT_SIZE)
+
+/* The maximum and minimum number of records owned by a directory slot. The
+number may drop below the minimum in the first and the last slot in the
+directory. */
+#define PAGE_DIR_SLOT_MAX_N_OWNED	8
+#define	PAGE_DIR_SLOT_MIN_N_OWNED	4
+
+/************************************************************//**
+Gets the start of a page.
+@return	start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+	const void*	ptr)	/*!< in: pointer to page frame */
+		__attribute__((const));
+/************************************************************//**
+Gets the offset within a page.
+@return	offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+	const void*	ptr)	/*!< in: pointer to page frame */
+		__attribute__((const));
+/*************************************************************//**
+Returns the max trx id field value. */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Sets the max trx id field value. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction, or NULL */
+/*************************************************************//**
+Sets the max trx id field value if trx_id is bigger than the previous
+value. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr);	/*!< in/out: mini-transaction */
+/*************************************************************//**
+Reads the given header field. */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field);	/*!< in: PAGE_N_DIR_SLOTS, ... */
+/*************************************************************//**
+Sets the given header field. */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		field,	/*!< in: PAGE_N_DIR_SLOTS, ... */
+	ulint		val);	/*!< in: value */
+/*************************************************************//**
+Returns the offset stored in the given header field.
+@return	offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+	__attribute__((nonnull, pure));
+
+/*************************************************************//**
+Returns the header field's pointer, or NULL. Arguments are evaluated twice. */
+#define page_header_get_ptr(page, field)			\
+	(page_header_get_offs(page, field)			\
+	 ? (page) + page_header_get_offs(page, field) : NULL)
+/*************************************************************//**
+Sets the pointer stored in the given header field. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		field,	/*!< in: PAGE_FREE, ... */
+	const byte*	ptr);	/*!< in: pointer or NULL */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field in the page header. Writes to mlog
+about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	mtr_t*		mtr);	/*!< in: mtr */
+#endif /* !UNIV_HOTBACKUP */
+/************************************************************//**
+Gets the offset of the first record on the page.
+@return	offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+	const page_t*	page);	/*!< in: page which must have record(s) */
+/************************************************************//**
+Gets the offset of the last record on the page.
+@return	offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+	const page_t*	page);	/*!< in: page which must have record(s) */
+#define page_get_infimum_rec(page) ((page) + page_get_infimum_offset(page))
+#define page_get_supremum_rec(page) ((page) + page_get_supremum_offset(page))
+/************************************************************//**
+Returns the middle record of record list. If there are an even number
+of records in the list, returns the first record of upper half-list.
+@return	middle record */
+UNIV_INTERN
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page);	/*!< in: page */
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record. Differs from the function
+cmp_dtuple_rec_with_match in the way that the record must reside on an
+index page, and also page infimum and supremum records can be given in
+the parameter rec. These are considered as the negative infinity and
+the positive infinity in the alphabetical order.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record on a page; may also
+				be page infimum or supremum, in which case
+				matched-parameter values below are not
+				affected */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when function returns
+				contains the value for current comparison */
+	ulint*		matched_bytes); /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when function returns contains the
+				value for current comparison */
+#endif /* !UNIV_HOTBACKUP */
+/*************************************************************//**
+Gets the page number.
+@return	page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Gets the tablespace identifier.
+@return	space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+	const page_t*	page);	/*!< in: page */
+/*************************************************************//**
+Gets the number of user records on page (the infimum and supremum records
+are not user records).
+@return	number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+	const page_t*	page);	/*!< in: index page */
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records.
+@return	number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/*************************************************************//**
+Gets the number of records in the heap.
+@return	number of records in the heap */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page);	/*!< in: index page */
+/*************************************************************//**
+Sets the number of records in the heap. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL.
+				Note that the size of the dense page directory
+				in the compressed page trailer is
+				n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+	ulint		n_heap);/*!< in: number of records */
+/*************************************************************//**
+Gets the number of dir slots in directory.
+@return	number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page);	/*!< in: index page */
+/*************************************************************//**
+Sets the number of dir slots in directory. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		n_slots);/*!< in: number of slots */
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot (a macro unless UNIV_DEBUG is defined).
+@return	pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n);	/*!< in: position */
+#else /* UNIV_DEBUG */
+# define page_dir_get_nth_slot(page, n)		\
+	((page) + UNIV_PAGE_SIZE - PAGE_DIR	\
+	 - ((n) + 1) * PAGE_DIR_SLOT_SIZE)
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+Used to check the consistency of a record on a page.
+@return	TRUE if the check succeeds */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec);	/*!< in: record */
+/***************************************************************//**
+Gets the record pointed to by a directory slot.
+@return	pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+	const page_dir_slot_t*	slot);	/*!< in: directory slot */
+/***************************************************************//**
+This is used to set the record offset in a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+	page_dir_slot_t* slot,	/*!< in: directory slot */
+	rec_t*		 rec);	/*!< in: record on the page */
+/***************************************************************//**
+Gets the number of records owned by a directory slot.
+@return	number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot);	/*!< in: page directory slot */
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+	page_dir_slot_t*slot,	/*!< in/out: directory slot */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n);	/*!< in: number of records owned by the slot */
+/************************************************************//**
+Calculates the space reserved for directory slots of a given
+number of records. The exact value is the fraction
+n * PAGE_DIR_SLOT_SIZE / PAGE_DIR_SLOT_MIN_N_OWNED, and it is
+rounded upwards to an integer. */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs);	/*!< in: number of records */
+/***************************************************************//**
+Looks for the directory slot which owns the given record.
+@return	the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is in new-style compact format.
+@return nonzero if the page is in compact format, zero if it is in
+old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+	const page_t*	page);	/*!< in: index page */
+/************************************************************//**
+TRUE if the record is on a page in compact format.
+@return	nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+	const rec_t*	rec);	/*!< in: record */
+/***************************************************************//**
+Returns the heap number of a record.
+@return	heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec);	/*!< in: the physical record */
+/************************************************************//**
+Determine whether the page is a B-tree leaf.
+@return	TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+	const page_t*	page)	/*!< in: page */
+	__attribute__((pure));
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return	pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp);	/*!< in: nonzero=compact page layout */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return	pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Gets the pointer to the next record on the page.
+@return	pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record */
+/************************************************************//**
+Sets the pointer to the next record on the page. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+	rec_t*	rec,	/*!< in: pointer to record,
+			must not be page supremum */
+	rec_t*	next);	/*!< in: pointer to next record,
+			must not be page infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return	pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+	const rec_t*	rec);	/*!< in: pointer to record, must not be page
+				infimum */
+/************************************************************//**
+Gets the pointer to the previous record.
+@return	pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+	rec_t*		rec);	/*!< in: pointer to record,
+				must not be page infimum */
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return	TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+	ulint	offset)	/*!< in: record offset on page */
+	__attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return	TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+	ulint	offset)	/*!< in: record offset on page */
+	__attribute__((const));
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return	TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+	ulint	offset)	/*!< in: record offset on page */
+	__attribute__((const));
+
+/************************************************************//**
+TRUE if the record is a user record on the page.
+@return	TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+	const rec_t*	rec)	/*!< in: record */
+	__attribute__((const));
+/************************************************************//**
+TRUE if the record is the supremum record on a page.
+@return	TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+	const rec_t*	rec)	/*!< in: record */
+	__attribute__((const));
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page.
+@return	TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+	const rec_t*	rec)	/*!< in: record */
+	__attribute__((const));
+/***************************************************************//**
+Looks for the record which owns the given record.
+@return	the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+	rec_t*	rec);	/*!< in: the physical record */
+/***********************************************************************//**
+This is a low-level operation which is used in a database index creation
+to update the page number of a created B-tree to a data dictionary
+record. */
+UNIV_INTERN
+void
+page_rec_write_index_page_no(
+/*=========================*/
+	rec_t*	rec,	/*!< in: record to update */
+	ulint	i,	/*!< in: index of the field to update */
+	ulint	page_no,/*!< in: value to write */
+	mtr_t*	mtr);	/*!< in: mtr */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap.
+@return	maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs);/*!< in: number of records */
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of record heap if page is first reorganized.
+@return	maximum combined size for inserted records */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs);/*!< in: number of records */
+/*************************************************************//**
+Calculates free space if a page is emptied.
+@return	free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)	/*!< in: nonzero=compact page format */
+		__attribute__((const));
+/**********************************************************//**
+Returns the base extra size of a physical record.  This is the
+size of the fixed header, independent of the record size.
+@return	REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list
+excluding the infimum and supremum records.
+@return	data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+	const page_t*	page);	/*!< in: index page */
+/************************************************************//**
+Allocates a block of memory from the head of the free list
+of an index page. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
+				space available for inserting the record,
+				or NULL */
+	rec_t*		next_rec,/*!< in: pointer to the new head of the
+				free record list */
+	ulint		need);	/*!< in: number of bytes allocated */
+/************************************************************//**
+Allocates a block of memory from the heap of an index page.
+@return	pointer to start of allocated buffer, or NULL if allocation fails */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
+				space available for inserting the record,
+				or NULL */
+	ulint		need,	/*!< in: total number of bytes needed */
+	ulint*		heap_no);/*!< out: this contains the heap number
+				of the allocated record
+				if allocation succeeds */
+/************************************************************//**
+Puts a record to free list. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	rec_t*		rec,	/*!< in: pointer to the (origin of) record */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Create an uncompressed B-tree index page.
+@return	pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+	buf_block_t*	block,		/*!< in: a buffer block where the
+					page is created */
+	mtr_t*		mtr,		/*!< in: mini-transaction handle */
+	ulint		comp);		/*!< in: nonzero=compact page format */
+/**********************************************************//**
+Create a compressed B-tree index page.
+@return	pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+	buf_block_t*	block,		/*!< in/out: a buffer block where the
+					page is created */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint		level,		/*!< in: the B-tree level of the page */
+	mtr_t*		mtr);		/*!< in: mini-transaction handle */
+
+/*************************************************************//**
+Differs from page_copy_rec_list_end, because this function does not
+touch the lock table and max trx id on page or compress the page. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*************************************************************//**
+Copies records from page to new_page, from the given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull));
+/*************************************************************//**
+Copies records from page to new_page, up to the given record, NOT
+including that record. Infimum and supremum records are not copied.
+The records are copied to the end of the record list on new_page.
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull));
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+	rec_t*		rec,	/*!< in: pointer to record on page */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n_recs,	/*!< in: number of records to delete,
+				or ULINT_UNDEFINED if not known */
+	ulint		size,	/*!< in: the sum of the sizes of the
+				records in the end of the chain to
+				delete, or ULINT_UNDEFINED if not known */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+/*************************************************************//**
+Deletes records from page, up to the given record, NOT including
+that record. Infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+	rec_t*		rec,	/*!< in: record on page */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+	__attribute__((nonnull));
+/*************************************************************//**
+Moves record list end to another page. Moved records include
+split_rec.
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in: index page from where to move */
+	rec_t*		split_rec,	/*!< in: first record to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull(1, 2, 4, 5)));
+/*************************************************************//**
+Moves record list start to another page. Moved records do not include
+split_rec.
+@return	TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in/out: page containing split_rec */
+	rec_t*		split_rec,	/*!< in: first record not to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+	__attribute__((nonnull(1, 2, 4, 5)));
+/****************************************************************//**
+Splits a directory slot which owns too many records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+	page_t*		page,	/*!< in: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be written, or NULL */
+	ulint		slot_no)/*!< in: the directory slot */
+	__attribute__((nonnull(1)));
+/*************************************************************//**
+Tries to balance the given directory slot with too few records
+with the upper neighbor, so that there are at least the minimum number
+of records owned by the slot; this may result in the merging of
+two slots. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		slot_no)/*!< in: the directory slot */
+	__attribute__((nonnull(1)));
+/**********************************************************//**
+Parses a log record of a record list end or start deletion.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+	byte		type,	/*!< in: MLOG_LIST_END_DELETE,
+				MLOG_LIST_START_DELETE,
+				MLOG_COMP_LIST_END_DELETE or
+				MLOG_COMP_LIST_START_DELETE */
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in/out: buffer block or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+/***********************************************************//**
+Parses a redo log record of creating a page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	ulint		comp,	/*!< in: nonzero=compact page format */
+	buf_block_t*	block,	/*!< in: block or NULL */
+	mtr_t*		mtr);	/*!< in: mtr or NULL */
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context. */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: record descriptor */
+/***************************************************************//**
+This is used to print the contents of the directory for
+debugging purposes. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+	page_t*	page,	/*!< in: index page */
+	ulint	pr_n);	/*!< in: print n first and n last entries */
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		pr_n);	/*!< in: print n first and n last entries */
+/***************************************************************//**
+Prints the info in a page header. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+	const page_t*	page);	/*!< in: index page */
+/***************************************************************//**
+This is used to print the contents of the page for
+debugging purposes. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		dn,	/*!< in: print dn first and last entries
+				in directory */
+	ulint		rn);	/*!< in: print rn first and last records
+				in directory */
+/***************************************************************//**
+The following is used to validate a record on a page. This function
+differs from rec_validate as it can also check the n_owned field and
+the heap_no field.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+	rec_t*		rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+	const page_t*	page);	/*!< in: index page */
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+	page_t*	page);	/*!< in: old-style index page */
+/***************************************************************//**
+This function checks the consistency of a new-style index page when we
+do not know the index. This is also resilient so that this should never
+crash even if the page is total garbage.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+	page_t*	page);	/*!< in: new-style index page */
+/***************************************************************//**
+This function checks the consistency of an index page.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+	page_t*		page,	/*!< in: index page */
+	dict_index_t*	index);	/*!< in: data dictionary index containing
+				the page record type definition */
+/***************************************************************//**
+Looks in the page record list for a record with the given heap number.
+@return	record, NULL if not found */
+
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		heap_no);/*!< in: heap number */
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE  UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+#include "page0page.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/page0page.ic b/storage/xtradb/include/page0page.ic
new file mode 100644
index 00000000000..dab9dc742e4
--- /dev/null
+++ b/storage/xtradb/include/page0page.ic
@@ -0,0 +1,1076 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0page.ic
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#include "mach0data.h"
+#ifdef UNIV_DEBUG
+# include "log0recv.h"
+#endif /* UNIV_DEBUG */
+#ifndef UNIV_HOTBACKUP
+# include "rem0cmp.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "mtr0log.h"
+#include "page0zip.h"
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE
+#endif
+
+/************************************************************//**
+Gets the start of the page frame that contains the given pointer, by
+aligning the pointer down to UNIV_PAGE_SIZE.
+@return	start of the page */
+UNIV_INLINE
+page_t*
+page_align(
+/*=======*/
+	const void*	ptr)	/*!< in: pointer to page frame */
+{
+	page_t*	frame_start;
+
+	frame_start = (page_t*) ut_align_down(ptr, UNIV_PAGE_SIZE);
+
+	return(frame_start);
+}
+/************************************************************//**
+Gets the offset of a pointer within its page frame, i.e. its alignment
+offset with respect to UNIV_PAGE_SIZE.
+@return	offset from the start of the page */
+UNIV_INLINE
+ulint
+page_offset(
+/*========*/
+	const void*	ptr)	/*!< in: pointer to page frame */
+{
+	ulint	offs_in_page;
+
+	offs_in_page = ut_align_offset(ptr, UNIV_PAGE_SIZE);
+
+	return(offs_in_page);
+}
+/*************************************************************//**
+Reads the PAGE_MAX_TRX_ID field of the page header.
+@return	the max trx id field value */
+UNIV_INLINE
+trx_id_t
+page_get_max_trx_id(
+/*================*/
+	const page_t*	page)	/*!< in: page */
+{
+	const byte*	field;
+
+	ut_ad(page);
+
+	field = page + PAGE_HEADER + PAGE_MAX_TRX_ID;
+
+	return(mach_read_from_8(field));
+}
+
+/*************************************************************//**
+Raises the max trx id field of a page to trx_id. The stored value is
+left untouched unless it compares smaller than trx_id. */
+UNIV_INLINE
+void
+page_update_max_trx_id(
+/*===================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	trx_id_t	cur_max_trx_id;
+
+	ut_ad(block);
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	/* A zero trx_id is tolerated only while crash recovery is on:
+	redo log application may call this function on pages other than
+	a leaf page of a secondary index or of the insert buffer index
+	tree (dict_index_is_sec_or_ibuf() returns TRUE for the dummy
+	indexes constructed during redo log application).  For such
+	pages PAGE_MAX_TRX_ID is unused, and trx_id is usually zero. */
+	ut_ad(!ut_dulint_is_zero(trx_id) || recv_recovery_is_on());
+	ut_ad(page_is_leaf(buf_block_get_frame(block)));
+
+	cur_max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
+
+	if (ut_dulint_cmp(cur_max_trx_id, trx_id) >= 0) {
+		/* The page already stores an id at least as big. */
+
+		return;
+	}
+
+	page_set_max_trx_id(block, page_zip, trx_id, mtr);
+}
+
+/*************************************************************//**
+Reads the given page header field with mach_read_from_2().
+@return	value of the field */
+UNIV_INLINE
+ulint
+page_header_get_field(
+/*==================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_LEVEL, ... */
+{
+	const byte*	field_ptr;
+
+	ut_ad(page);
+	ut_ad(field <= PAGE_INDEX_ID);
+
+	field_ptr = page + PAGE_HEADER + field;
+
+	return(mach_read_from_2(field_ptr));
+}
+
+/*************************************************************//**
+Writes the given page header field with mach_write_to_2() and, when a
+compressed page descriptor is supplied, passes the same two bytes to
+page_zip_write_header(). */
+UNIV_INLINE
+void
+page_header_set_field(
+/*==================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		field,	/*!< in: PAGE_N_DIR_SLOTS, ... */
+	ulint		val)	/*!< in: value */
+{
+	byte*	field_ptr;
+
+	ut_ad(page);
+	ut_ad(field <= PAGE_N_RECS);
+	ut_ad(field == PAGE_N_HEAP || val < UNIV_PAGE_SIZE);
+	ut_ad(field != PAGE_N_HEAP || (val & 0x7fff) < UNIV_PAGE_SIZE);
+
+	field_ptr = page + PAGE_HEADER + field;
+
+	mach_write_to_2(field_ptr, val);
+
+	if (UNIV_LIKELY(!page_zip)) {
+		/* No compressed copy to keep up to date. */
+
+		return;
+	}
+
+	page_zip_write_header(page_zip, field_ptr, 2, NULL);
+}
+
+/*************************************************************//**
+Returns the page offset stored in one of the offset-valued header
+fields: PAGE_FREE, PAGE_LAST_INSERT or PAGE_HEAP_TOP.
+@return	offset from the start of the page, or 0 */
+UNIV_INLINE
+ulint
+page_header_get_offs(
+/*=================*/
+	const page_t*	page,	/*!< in: page */
+	ulint		field)	/*!< in: PAGE_FREE, ... */
+{
+	ulint	offs;
+
+	ut_ad(page);
+	ut_ad((field == PAGE_FREE)
+	      || (field == PAGE_LAST_INSERT)
+	      || (field == PAGE_HEAP_TOP));
+
+	offs = page_header_get_field(page, field);
+
+	/* Debug check: PAGE_HEAP_TOP must never be stored as zero. */
+	ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+	return(offs);
+}
+
+/*************************************************************//**
+Stores a pointer in the given header field as its offset from the start
+of the page. A NULL pointer is stored as offset 0. */
+UNIV_INLINE
+void
+page_header_set_ptr(
+/*================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		field,	/*!< in: PAGE_FREE, ... */
+	const byte*	ptr)	/*!< in: pointer or NULL*/
+{
+	ulint	offs;
+
+	ut_ad(page);
+	ut_ad((field == PAGE_FREE)
+	      || (field == PAGE_LAST_INSERT)
+	      || (field == PAGE_HEAP_TOP));
+
+	offs = (ptr != NULL) ? (ulint) (ptr - page) : 0;
+
+	ut_ad((field != PAGE_HEAP_TOP) || offs);
+
+	page_header_set_field(page, page_zip, field, offs);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Resets the last insert info field (PAGE_LAST_INSERT) in the page header
+to 0. Writes to mlog about this operation. */
+UNIV_INLINE
+void
+page_header_reset_last_insert(
+/*==========================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	last_insert;
+
+	ut_ad(page && mtr);
+
+	last_insert = page + (PAGE_HEADER + PAGE_LAST_INSERT);
+
+	if (UNIV_LIKELY(!page_zip)) {
+		/* Uncompressed page: mlog_write_ulint() is handed the
+		field address, the new value and the mtr. */
+		mlog_write_ulint(last_insert, 0, MLOG_2BYTES, mtr);
+
+		return;
+	}
+
+	/* Compressed page: clear the field on the uncompressed frame
+	first, then hand the two bytes and the mtr to
+	page_zip_write_header(). */
+	mach_write_to_2(last_insert, 0);
+	page_zip_write_header(page_zip, last_insert, 2, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************//**
+Determine whether the page is in new-style compact format. The flag is
+the 0x8000 bit of the PAGE_N_HEAP header field.
+@return nonzero (0x8000) if the page is in compact format, zero if it
+is in old-style format */
+UNIV_INLINE
+ulint
+page_is_comp(
+/*=========*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	comp_flag;
+
+	comp_flag = page_header_get_field(page, PAGE_N_HEAP) & 0x8000;
+
+	return(UNIV_EXPECT(comp_flag, 0x8000));
+}
+
+/************************************************************//**
+Tells whether the page that holds the record is in compact format. The
+page is located by aligning the record pointer with page_align().
+@return	nonzero if in compact format */
+UNIV_INLINE
+ulint
+page_rec_is_comp(
+/*=============*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	const page_t*	page = page_align(rec);
+
+	return(page_is_comp(page));
+}
+
+/***************************************************************//**
+Returns the heap number of a record, read with the accessor that
+matches the format (compact or old-style) of the page holding it.
+@return	heap number */
+UNIV_INLINE
+ulint
+page_rec_get_heap_no(
+/*=================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	return(page_rec_is_comp(rec)
+	       ? rec_get_heap_no_new(rec)
+	       : rec_get_heap_no_old(rec));
+}
+
+/************************************************************//**
+Determine whether the page is a B-tree leaf, i.e. whether the
+PAGE_LEVEL field of its header is zero. A NULL page is reported as
+not being a leaf.
+@return	TRUE if the page is a B-tree leaf */
+UNIV_INLINE
+ibool
+page_is_leaf(
+/*=========*/
+	const page_t*	page)	/*!< in: page, or NULL */
+{
+	const uint16*	level;
+
+	if (page == NULL) {
+		return(FALSE);
+	}
+
+	/* Only a comparison against zero is made, so the two bytes are
+	read directly without any byte-order conversion. */
+	level = (const uint16*) (page + (PAGE_HEADER + PAGE_LEVEL));
+
+	return(*level == 0);
+}
+
+/************************************************************//**
+Gets the offset of the first record (the infimum) on the page. The
+offset depends on whether the page is in compact format.
+@return	offset of the first record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_infimum_offset(
+/*====================*/
+	const page_t*	page)	/*!< in: page which must have record(s) */
+{
+	ut_ad(page);
+	ut_ad(!page_offset(page));
+
+	return(page_is_comp(page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM);
+}
+
+/************************************************************//**
+Gets the offset of the last record (the supremum) on the page. The
+offset depends on whether the page is in compact format.
+@return	offset of the last record in record list, relative from page */
+UNIV_INLINE
+ulint
+page_get_supremum_offset(
+/*=====================*/
+	const page_t*	page)	/*!< in: page which must have record(s) */
+{
+	ut_ad(page);
+	ut_ad(!page_offset(page));
+
+	return(page_is_comp(page) ? PAGE_NEW_SUPREMUM : PAGE_OLD_SUPREMUM);
+}
+
+/************************************************************//**
+TRUE if the record at the given offset is a user record, that is, if
+the offset is none of the infimum or supremum offsets of either page
+format.
+@return	TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec_low(
+/*=====================*/
+	ulint	offset)	/*!< in: record offset on page */
+{
+	/* Compile-time checks on the relative order of the infimum and
+	supremum offsets of the two page formats. */
+#if PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM
+# error "PAGE_OLD_INFIMUM < PAGE_NEW_INFIMUM"
+#endif
+#if PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_SUPREMUM < PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM
+# error "PAGE_NEW_INFIMUM > PAGE_OLD_SUPREMUM"
+#endif
+#if PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM
+# error "PAGE_OLD_INFIMUM > PAGE_NEW_SUPREMUM"
+#endif
+#if PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END
+# error "PAGE_NEW_SUPREMUM > PAGE_OLD_SUPREMUM_END"
+#endif
+#if PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END
+# error "PAGE_OLD_SUPREMUM > PAGE_NEW_SUPREMUM_END"
+#endif
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+	if (UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)
+	    || UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)
+	    || UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM)
+	    || UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM)) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/************************************************************//**
+TRUE if the given offset is the supremum offset of either the
+new-style or the old-style page format.
+@return	TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum_low(
+/*=====================*/
+	ulint	offset)	/*!< in: record offset on page */
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+	if (UNIV_UNLIKELY(offset == PAGE_NEW_SUPREMUM)) {
+
+		return(TRUE);
+	}
+
+	return(UNIV_UNLIKELY(offset == PAGE_OLD_SUPREMUM));
+}
+
+/************************************************************//**
+TRUE if the given offset is the infimum offset of either the
+new-style or the old-style page format.
+@return	TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum_low(
+/*====================*/
+	ulint	offset)	/*!< in: record offset on page */
+{
+	ut_ad(offset >= PAGE_NEW_INFIMUM);
+	ut_ad(offset <= UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START);
+
+	if (UNIV_UNLIKELY(offset == PAGE_NEW_INFIMUM)) {
+
+		return(TRUE);
+	}
+
+	return(UNIV_UNLIKELY(offset == PAGE_OLD_INFIMUM));
+}
+
+/************************************************************//**
+TRUE if the record is a user record on the page. The decision is made
+from the offset of the record within its page.
+@return	TRUE if a user record */
+UNIV_INLINE
+ibool
+page_rec_is_user_rec(
+/*=================*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	ulint	offs = page_offset(rec);
+
+	return(page_rec_is_user_rec_low(offs));
+}
+
+/************************************************************//**
+TRUE if the record is the supremum record on a page. The decision is
+made from the offset of the record within its page.
+@return	TRUE if the supremum record */
+UNIV_INLINE
+ibool
+page_rec_is_supremum(
+/*=================*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	ulint	offs = page_offset(rec);
+
+	return(page_rec_is_supremum_low(offs));
+}
+
+/************************************************************//**
+TRUE if the record is the infimum record on a page. The decision is
+made from the offset of the record within its page.
+@return	TRUE if the infimum record */
+UNIV_INLINE
+ibool
+page_rec_is_infimum(
+/*================*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	ulint	offs = page_offset(rec);
+
+	return(page_rec_is_infimum_low(offs));
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Compares a data tuple to a physical record that resides on an index
+page. Unlike cmp_dtuple_rec_with_match, this function also accepts the
+page infimum and supremum records in the parameter rec: they are
+treated as the negative and the positive infinity in the alphabetical
+order, and are answered without calling cmp_dtuple_rec_with_match.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+UNIV_INLINE
+int
+page_cmp_dtuple_rec_with_match(
+/*===========================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record on a page; may also
+				be page infimum or supremum, in which case
+				matched-parameter values below are not
+				affected */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when function returns
+				contains the value for current comparison */
+	ulint*		matched_bytes) /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when function returns contains the
+				value for current comparison */
+{
+	ulint	rec_offset;
+	int	cmp;
+
+	ut_ad(dtuple_check_typed(dtuple));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(!rec_offs_comp(offsets) == !page_rec_is_comp(rec));
+
+	rec_offset = page_offset(rec);
+
+	if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_INFIMUM)
+	    || UNIV_UNLIKELY(rec_offset == PAGE_OLD_INFIMUM)) {
+		/* Every tuple is greater than the infimum. */
+		cmp = 1;
+	} else if (UNIV_UNLIKELY(rec_offset == PAGE_NEW_SUPREMUM)
+		   || UNIV_UNLIKELY(rec_offset == PAGE_OLD_SUPREMUM)) {
+		/* Every tuple is less than the supremum. */
+		cmp = -1;
+	} else {
+		cmp = cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+						matched_fields,
+						matched_bytes);
+	}
+
+	return(cmp);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Gets the page number, read from the FIL_PAGE_OFFSET field at the given
+page frame.
+@return	page number */
+UNIV_INLINE
+ulint
+page_get_page_no(
+/*=============*/
+	const page_t*	page)	/*!< in: page; must point to the start
+				of the page frame */
+{
+	const byte*	field;
+
+	ut_ad(page == page_align((page_t*) page));
+
+	field = page + FIL_PAGE_OFFSET;
+
+	return(mach_read_from_4(field));
+}
+
+/*************************************************************//**
+Gets the tablespace identifier, read from the
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID field at the given page frame.
+@return	space id */
+UNIV_INLINE
+ulint
+page_get_space_id(
+/*==============*/
+	const page_t*	page)	/*!< in: page; must point to the start
+				of the page frame */
+{
+	const byte*	field;
+
+	ut_ad(page == page_align((page_t*) page));
+
+	field = page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID;
+
+	return(mach_read_from_4(field));
+}
+
+/*************************************************************//**
+Gets the number of user records on page, as stored in the PAGE_N_RECS
+header field (infimum and supremum records are not user records).
+@return	number of user records */
+UNIV_INLINE
+ulint
+page_get_n_recs(
+/*============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	n_recs = page_header_get_field(page, PAGE_N_RECS);
+
+	return(n_recs);
+}
+
+/*************************************************************//**
+Gets the number of dir slots in directory, as stored in the
+PAGE_N_DIR_SLOTS header field.
+@return	number of slots */
+UNIV_INLINE
+ulint
+page_dir_get_n_slots(
+/*=================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	n_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS);
+
+	return(n_slots);
+}
+/*************************************************************//**
+Sets the number of dir slots in directory by writing the
+PAGE_N_DIR_SLOTS header field. */
+UNIV_INLINE
+void
+page_dir_set_n_slots(
+/*=================*/
+	page_t*		page,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	ulint		n_slots)/*!< in: number of slots */
+{
+	page_header_set_field(page,
+			      page_zip,
+			      PAGE_N_DIR_SLOTS,
+			      n_slots);
+}
+
+/*************************************************************//**
+Gets the number of records in the heap: the low 15 bits of the
+PAGE_N_HEAP header field. The 0x8000 bit of that field is masked off
+here; page_is_comp() reads it as the compact format flag.
+@return	number of records in the heap */
+UNIV_INLINE
+ulint
+page_dir_get_n_heap(
+/*================*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	n_heap_field;
+
+	n_heap_field = page_header_get_field(page, PAGE_N_HEAP);
+
+	return(n_heap_field & 0x7fff);
+}
+
+/*************************************************************//**
+Sets the number of records in the heap. Only the low 15 bits of the
+PAGE_N_HEAP field are replaced; the 0x8000 bit already stored in the
+field is carried over unchanged. */
+UNIV_INLINE
+void
+page_dir_set_n_heap(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL.
+				Note that the size of the dense page directory
+				in the compressed page trailer is
+				n_heap * PAGE_ZIP_DIR_SLOT_SIZE. */
+	ulint		n_heap)	/*!< in: number of records */
+{
+	ulint	old_field;
+
+	ut_ad(n_heap < 0x8000);
+	/* Debug check: with a compressed page, n_heap may only grow by
+	exactly one at a time. */
+	ut_ad(!page_zip || n_heap
+	      == (page_header_get_field(page, PAGE_N_HEAP) & 0x7fff) + 1);
+
+	old_field = page_header_get_field(page, PAGE_N_HEAP);
+
+	page_header_set_field(page, page_zip, PAGE_N_HEAP,
+			      n_heap | (0x8000 & old_field));
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Gets pointer to nth directory slot. The address is computed backwards
+from the end of the page: UNIV_PAGE_SIZE - PAGE_DIR, minus (n + 1)
+slots of PAGE_DIR_SLOT_SIZE each.
+@return	pointer to dir slot */
+UNIV_INLINE
+page_dir_slot_t*
+page_dir_get_nth_slot(
+/*==================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n)	/*!< in: position */
+{
+	page_dir_slot_t*	dir_end;
+
+	ut_ad(page_dir_get_n_slots(page) > n);
+
+	dir_end = (page_dir_slot_t*) page + UNIV_PAGE_SIZE - PAGE_DIR;
+
+	return(dir_end - (n + 1) * PAGE_DIR_SLOT_SIZE);
+}
+#endif /* UNIV_DEBUG */
+
+/**************************************************************//**
+Used to check the consistency of a record on a page. The record
+pointer must be non-NULL and its page offset must lie between
+PAGE_DATA and the PAGE_HEAP_TOP value of the page, both inclusive.
+The checks are ut_a() assertions, so the function only returns when
+all of them hold.
+@return	TRUE if succeed */
+UNIV_INLINE
+ibool
+page_rec_check(
+/*===========*/
+	const rec_t*	rec)	/*!< in: record */
+{
+	const page_t*	page;
+
+	page = page_align(rec);
+
+	ut_a(rec);
+
+	ut_a(page_offset(rec) <= page_header_get_field(page, PAGE_HEAP_TOP));
+	ut_a(page_offset(rec) >= PAGE_DATA);
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Gets the record pointed to by a directory slot. The slot stores a
+2-byte offset, which is added to the start of the page that contains
+the slot.
+@return	pointer to record */
+UNIV_INLINE
+const rec_t*
+page_dir_slot_get_rec(
+/*==================*/
+	const page_dir_slot_t*	slot)	/*!< in: directory slot */
+{
+	const page_t*	page	 = page_align(slot);
+	ulint		rec_offs = mach_read_from_2(slot);
+
+	return(page + rec_offs);
+}
+
+/***************************************************************//**
+This is used to set the record offset in a directory slot: the page
+offset of rec is written to the slot as a 2-byte value. */
+UNIV_INLINE
+void
+page_dir_slot_set_rec(
+/*==================*/
+	page_dir_slot_t* slot,	/*!< in: directory slot */
+	rec_t*		 rec)	/*!< in: record on the page */
+{
+	ulint	rec_offs;
+
+	ut_ad(page_rec_check(rec));
+
+	rec_offs = page_offset(rec);
+
+	mach_write_to_2(slot, rec_offs);
+}
+
+/***************************************************************//**
+Gets the number of records owned by a directory slot, read from the
+record the slot points to.
+@return	number of records */
+UNIV_INLINE
+ulint
+page_dir_slot_get_n_owned(
+/*======================*/
+	const page_dir_slot_t*	slot)	/*!< in: page directory slot */
+{
+	const rec_t*	owner_rec = page_dir_slot_get_rec(slot);
+
+	/* page_rec_is_comp() only aligns its argument to the page
+	start, so the slot pointer can stand in for a record here. */
+	return(page_rec_is_comp(slot)
+	       ? rec_get_n_owned_new(owner_rec)
+	       : rec_get_n_owned_old(owner_rec));
+}
+
+/***************************************************************//**
+This is used to set the owned records field of a directory slot. The
+value is written to the record the slot points to. */
+UNIV_INLINE
+void
+page_dir_slot_set_n_owned(
+/*======================*/
+	page_dir_slot_t*slot,	/*!< in/out: directory slot */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n)	/*!< in: number of records owned by the slot */
+{
+	rec_t*	owner_rec = (rec_t*) page_dir_slot_get_rec(slot);
+
+	if (!page_rec_is_comp(slot)) {
+		/* Old-style page: no compressed page is expected. */
+		ut_ad(!page_zip);
+		rec_set_n_owned_old(owner_rec, n);
+
+		return;
+	}
+
+	rec_set_n_owned_new(owner_rec, page_zip, n);
+}
+
+/************************************************************//**
+Calculates the space reserved for directory slots of a given number of
+records. The exact value is a fraction number n * PAGE_DIR_SLOT_SIZE /
+PAGE_DIR_SLOT_MIN_N_OWNED, and it is rounded upwards to an integer.
+@return	reserved space, rounded upwards */
+UNIV_INLINE
+ulint
+page_dir_calc_reserved_space(
+/*=========================*/
+	ulint	n_recs)		/*!< in: number of records */
+{
+	ulint	slot_bytes;
+
+	slot_bytes = PAGE_DIR_SLOT_SIZE * n_recs;
+
+	/* Adding (divisor - 1) before dividing rounds upwards. */
+	return((slot_bytes + PAGE_DIR_SLOT_MIN_N_OWNED - 1)
+	       / PAGE_DIR_SLOT_MIN_N_OWNED);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page, following the next
+record offset stored in rec. An offset that is not smaller than
+UNIV_PAGE_SIZE is reported on stderr, the page is printed, and
+ut_error is raised.
+@return	pointer to next record, or NULL if the stored offset is 0 */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_low(
+/*==================*/
+	const rec_t*	rec,	/*!< in: pointer to record */
+	ulint		comp)	/*!< in: nonzero=compact page layout */
+{
+	const page_t*	page;
+	ulint		next_offs;
+
+	ut_ad(page_rec_check(rec));
+
+	page = page_align(rec);
+	next_offs = rec_get_next_offs(rec, comp);
+
+	if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE)) {
+		fprintf(stderr,
+			"InnoDB: Next record offset is nonsensical %lu"
+			" in record at offset %lu\n"
+			"InnoDB: rec address %p, space id %lu, page %lu\n",
+			(ulong) next_offs,
+			(ulong) page_offset(rec),
+			(void*) rec,
+			(ulong) page_get_space_id(page),
+			(ulong) page_get_page_no(page));
+		buf_page_print(page, 0);
+
+		ut_error;
+	}
+
+	return(UNIV_UNLIKELY(next_offs == 0) ? NULL : page + next_offs);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page. This is the
+non-const variant: the result of page_rec_get_next_low() is returned
+with the const qualifier cast away.
+@return	pointer to next record */
+UNIV_INLINE
+rec_t*
+page_rec_get_next(
+/*==============*/
+	const rec_t*	rec)	/*!< in: pointer to record */
+{
+	ulint		comp = page_rec_is_comp(rec);
+	const rec_t*	next = page_rec_get_next_low(rec, comp);
+
+	return((rec_t*) next);
+}
+
+/************************************************************//**
+Gets the pointer to the next record on the page. This is the const
+variant: the result of page_rec_get_next_low() is returned as is.
+@return	pointer to next record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_next_const(
+/*====================*/
+	const rec_t*	rec)	/*!< in: pointer to record */
+{
+	ulint	comp = page_rec_is_comp(rec);
+
+	return(page_rec_get_next_low(rec, comp));
+}
+
+/************************************************************//**
+Sets the pointer to the next record on the page. The page offset of
+next is stored in rec; a NULL next is stored as offset 0. */
+UNIV_INLINE
+void
+page_rec_set_next(
+/*==============*/
+	rec_t*	rec,		/*!< in: pointer to record,
+				must not be page supremum */
+	rec_t*	next)		/*!< in: pointer to next record,
+				must not be page infimum */
+{
+	ulint	offs;
+
+	ut_ad(page_rec_check(rec));
+	ut_ad(!page_rec_is_supremum(rec));
+	ut_ad(rec != next);
+
+	ut_ad(!next || !page_rec_is_infimum(next));
+	ut_ad(!next || page_align(rec) == page_align(next));
+
+	offs = UNIV_LIKELY(next != NULL) ? page_offset(next) : 0;
+
+	if (!page_rec_is_comp(rec)) {
+		rec_set_next_offs_old(rec, offs);
+
+		return;
+	}
+
+	rec_set_next_offs_new(rec, offs);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record. The record list is walked
+forward from the record of the directory slot that precedes the owner
+slot of rec, until rec is reached; the record visited just before it
+is the result.
+@return	pointer to previous record */
+UNIV_INLINE
+const rec_t*
+page_rec_get_prev_const(
+/*====================*/
+	const rec_t*	rec)	/*!< in: pointer to record, must not be page
+				infimum */
+{
+	const page_dir_slot_t*	slot;
+	ulint			slot_no;
+	const rec_t*		rec2;
+	const rec_t*		prev_rec = NULL;
+	const page_t*		page;
+
+	ut_ad(page_rec_check(rec));
+
+	page = page_align(rec);
+
+	ut_ad(!page_rec_is_infimum(rec));
+
+	slot_no = page_dir_find_owner_slot(rec);
+
+	ut_a(slot_no != 0);
+
+	slot = page_dir_get_nth_slot(page, slot_no - 1);
+
+	/* The two loops differ only in the constant comp argument. */
+	if (page_is_comp(page)) {
+		for (rec2 = page_dir_slot_get_rec(slot);
+		     rec != rec2;
+		     rec2 = page_rec_get_next_low(rec2, TRUE)) {
+
+			prev_rec = rec2;
+		}
+	} else {
+		for (rec2 = page_dir_slot_get_rec(slot);
+		     rec != rec2;
+		     rec2 = page_rec_get_next_low(rec2, FALSE)) {
+
+			prev_rec = rec2;
+		}
+	}
+
+	ut_a(prev_rec);
+
+	return(prev_rec);
+}
+
+/************************************************************//**
+Gets the pointer to the previous record. This is the non-const
+variant: the result of page_rec_get_prev_const() is returned with the
+const qualifier cast away.
+@return	pointer to previous record */
+UNIV_INLINE
+rec_t*
+page_rec_get_prev(
+/*==============*/
+	rec_t*	rec)	/*!< in: pointer to record, must not be page
+			infimum */
+{
+	const rec_t*	prev = page_rec_get_prev_const(rec);
+
+	return((rec_t*) prev);
+}
+
+/***************************************************************//**
+Looks for the record which owns the given record: starting from rec,
+the record list is followed until a record with a nonzero n_owned
+field is found.
+@return	the owner record */
+UNIV_INLINE
+rec_t*
+page_rec_find_owner_rec(
+/*====================*/
+	rec_t*	rec)	/*!< in: the physical record */
+{
+	ut_ad(page_rec_check(rec));
+
+	if (!page_rec_is_comp(rec)) {
+		while (!rec_get_n_owned_old(rec)) {
+			rec = page_rec_get_next(rec);
+		}
+
+		return(rec);
+	}
+
+	while (!rec_get_n_owned_new(rec)) {
+		rec = page_rec_get_next(rec);
+	}
+
+	return(rec);
+}
+
+/**********************************************************//**
+Returns the base extra size of a physical record.  This is the
+size of the fixed header, independent of the record size.
+@return	REC_N_NEW_EXTRA_BYTES or REC_N_OLD_EXTRA_BYTES */
+UNIV_INLINE
+ulint
+page_rec_get_base_extra_size(
+/*=========================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	/* The old-style header is exactly one byte longer. */
+#if REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES
+# error "REC_N_NEW_EXTRA_BYTES + 1 != REC_N_OLD_EXTRA_BYTES"
+#endif
+	if (page_rec_is_comp(rec)) {
+
+		return(REC_N_NEW_EXTRA_BYTES);
+	}
+
+	return(REC_N_OLD_EXTRA_BYTES);
+}
+
+/************************************************************//**
+Returns the sum of the sizes of the records in the record list, excluding
+the infimum and supremum records. It is computed as PAGE_HEAP_TOP minus
+the end of the supremum record for the page format, minus PAGE_GARBAGE.
+@return	data in bytes */
+UNIV_INLINE
+ulint
+page_get_data_size(
+/*===============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	heap_top;
+	ulint	supremum_end;
+	ulint	garbage;
+	ulint	ret;
+
+	heap_top = page_header_get_field(page, PAGE_HEAP_TOP);
+
+	supremum_end = page_is_comp(page)
+		? PAGE_NEW_SUPREMUM_END
+		: PAGE_OLD_SUPREMUM_END;
+
+	garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+	ret = heap_top - supremum_end - garbage;
+
+	ut_ad(ret < UNIV_PAGE_SIZE);
+
+	return(ret);
+}
+
+
+/************************************************************//**
+Allocates a block of memory from the free list of an index page: the
+PAGE_FREE header field is pointed at next_rec, and PAGE_GARBAGE is
+decreased by need. */
+UNIV_INLINE
+void
+page_mem_alloc_free(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
+				space available for inserting the record,
+				or NULL */
+	rec_t*		next_rec,/*!< in: pointer to the new head of the
+				free record list */
+	ulint		need)	/*!< in: number of bytes allocated */
+{
+	ulint		garbage;
+	ulint		remaining;
+
+#ifdef UNIV_DEBUG
+	/* Debug check: next_rec must be the successor of the current
+	head of the free list. */
+	const rec_t*	old_rec	= page_header_get_ptr(page, PAGE_FREE);
+	ulint		next_offs;
+
+	ut_ad(old_rec);
+	next_offs = rec_get_next_offs(old_rec, page_is_comp(page));
+	ut_ad(next_rec == (next_offs ? page + next_offs : NULL));
+#endif
+
+	page_header_set_ptr(page, page_zip, PAGE_FREE, next_rec);
+
+	garbage = page_header_get_field(page, PAGE_GARBAGE);
+	ut_ad(garbage >= need);
+
+	remaining = garbage - need;
+
+	page_header_set_field(page, page_zip, PAGE_GARBAGE, remaining);
+}
+
+/*************************************************************//**
+Calculates free space if a page is emptied: the page size minus the
+end of the supremum record for the page format, minus PAGE_DIR and two
+directory slots.
+@return	free space */
+UNIV_INLINE
+ulint
+page_get_free_space_of_empty(
+/*=========================*/
+	ulint	comp)		/*!< in: nonzero=compact page layout */
+{
+	if (UNIV_UNLIKELY(!comp)) {
+		return((ulint)(UNIV_PAGE_SIZE
+			       - PAGE_OLD_SUPREMUM_END
+			       - PAGE_DIR
+			       - 2 * PAGE_DIR_SLOT_SIZE));
+	}
+
+	return((ulint)(UNIV_PAGE_SIZE
+		       - PAGE_NEW_SUPREMUM_END
+		       - PAGE_DIR
+		       - 2 * PAGE_DIR_SLOT_SIZE));
+}
+
+/************************************************************//**
+Each user record on a page, and also the deleted user records in the heap
+takes its size plus the fraction of the dir cell size /
+PAGE_DIR_SLOT_MIN_N_OWNED bytes for it. If the sum of these exceeds the
+value of page_get_free_space_of_empty, the insert is impossible, otherwise
+it is allowed. This function returns the maximum combined size of records
+which can be inserted on top of the record heap.
+@return	maximum combined size for inserted records, 0 if the occupied
+space exceeds the free space of an empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size(
+/*=====================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	heap_used;
+	ulint	dir_reserved;
+	ulint	occupied;
+	ulint	free_space;
+
+	if (page_is_comp(page)) {
+		heap_used = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_NEW_SUPREMUM_END;
+
+		free_space = page_get_free_space_of_empty(TRUE);
+	} else {
+		heap_used = page_header_get_field(page, PAGE_HEAP_TOP)
+			- PAGE_OLD_SUPREMUM_END;
+
+		free_space = page_get_free_space_of_empty(FALSE);
+	}
+
+	/* The 'n_recs +' part reserves directory space for the new
+	inserted records; the '- 2' excludes page infimum and supremum
+	records */
+	dir_reserved = page_dir_calc_reserved_space(
+		n_recs + page_dir_get_n_heap(page) - 2);
+
+	occupied = heap_used + dir_reserved;
+
+	return(occupied > free_space ? 0 : free_space - occupied);
+}
+
+/************************************************************//**
+Returns the maximum combined size of records which can be inserted on top
+of the record heap if a page is first reorganized. The occupied space is
+the data size of the page plus the directory space reserved for the
+existing user records and the n_recs new ones.
+@return	maximum combined size for inserted records, 0 if the occupied
+space exceeds the free space of an empty page */
+UNIV_INLINE
+ulint
+page_get_max_insert_size_after_reorganize(
+/*======================================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		n_recs)	/*!< in: number of records */
+{
+	ulint	data_size;
+	ulint	dir_reserved;
+	ulint	occupied;
+	ulint	free_space;
+
+	data_size = page_get_data_size(page);
+	dir_reserved = page_dir_calc_reserved_space(
+		n_recs + page_get_n_recs(page));
+
+	occupied = data_size + dir_reserved;
+
+	free_space = page_get_free_space_of_empty(page_is_comp(page));
+
+	return(occupied > free_space ? 0 : free_space - occupied);
+}
+
+/************************************************************//**
+Puts a record to free list: rec becomes the new head of the PAGE_FREE
+list, PAGE_GARBAGE grows by the size of the record, and the record is
+either deleted from the compressed page directory or, when there is no
+compressed page, PAGE_N_RECS is decremented. */
+UNIV_INLINE
+void
+page_mem_free(
+/*==========*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	rec_t*		rec,	/*!< in: pointer to the (origin of) record */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	rec_t*		old_free_head;
+	ulint		garbage;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	/* Link rec in front of the current free list. */
+	old_free_head = page_header_get_ptr(page, PAGE_FREE);
+
+	page_rec_set_next(rec, old_free_head);
+	page_header_set_ptr(page, page_zip, PAGE_FREE, rec);
+
+	garbage = page_header_get_field(page, PAGE_GARBAGE);
+
+	page_header_set_field(page, page_zip, PAGE_GARBAGE,
+			      garbage + rec_offs_size(offsets));
+
+	if (UNIV_LIKELY(!page_zip)) {
+		page_header_set_field(page, page_zip, PAGE_N_RECS,
+				      page_get_n_recs(page) - 1);
+
+		return;
+	}
+
+	page_zip_dir_delete(page_zip, rec, index, offsets, old_free_head);
+}
+
+#ifdef UNIV_MATERIALIZE
+#undef UNIV_INLINE
+#define UNIV_INLINE	UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/page0types.h b/storage/xtradb/include/page0types.h
new file mode 100644
index 00000000000..49fe9d6abbe
--- /dev/null
+++ b/storage/xtradb/include/page0types.h
@@ -0,0 +1,151 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0types.h
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#ifndef page0types_h
+#define page0types_h
+
+#include "univ.i"
+#include "dict0types.h"
+#include "mtr0types.h"
+
+/** Eliminates a name collision on HP-UX */
+#define page_t	   ib_page_t
+/** Type of the index page */
+typedef	byte		page_t;
+/** Index page cursor */
+typedef struct page_cur_struct	page_cur_t;
+
+/** Compressed index page */
+typedef byte				page_zip_t;
+/** Compressed page descriptor */
+typedef struct page_zip_des_struct	page_zip_des_t;
+
+/* The following definitions would better belong to page0zip.h,
+but we cannot include page0zip.h from rem0rec.ic, because
+page0*.h includes rem0rec.h and may include rem0rec.ic. */
+
+/** Number of bits needed for representing different compressed page sizes */
+#define PAGE_ZIP_SSIZE_BITS 3
+
+/** log2 of smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE_SHIFT	10
+/** Smallest compressed page size */
+#define PAGE_ZIP_MIN_SIZE	(1 << PAGE_ZIP_MIN_SIZE_SHIFT)
+
+/** Number of supported compressed page sizes */
+#define PAGE_ZIP_NUM_SSIZE (UNIV_PAGE_SIZE_SHIFT - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
+#define PAGE_ZIP_NUM_SSIZE_MAX (UNIV_PAGE_SIZE_SHIFT_MAX - PAGE_ZIP_MIN_SIZE_SHIFT + 2)
+#if PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)
+# error "PAGE_ZIP_NUM_SSIZE_MAX > (1 << PAGE_ZIP_SSIZE_BITS)"
+#endif
+
+/** Compressed page descriptor (data pointer and packed bit-fields) */
+struct page_zip_des_struct
+{
+	page_zip_t*	data;		/*!< compressed page data */
+
+#ifdef UNIV_DEBUG
+	unsigned	m_start:16;	/*!< start offset of modification log */
+#endif /* UNIV_DEBUG */
+	unsigned	m_end:16;	/*!< end offset of modification log */
+	unsigned	m_nonempty:1;	/*!< TRUE if the modification log
+					is not empty */
+	unsigned	n_blobs:12;	/*!< number of externally stored
+					columns on the page; the maximum
+					is 744 on a 16 KiB page */
+	unsigned	ssize:PAGE_ZIP_SSIZE_BITS;
+					/*!< 0 (page_zip_get_size() == 0)
+					or size code: the size in bytes is
+					PAGE_ZIP_MIN_SIZE << (ssize - 1). */
+};
+
+/** Compression statistics for one compressed page size (see page_zip_stat) */
+struct page_zip_stat_struct {
+	/** Number of page compressions */
+	ulint		compressed;
+	/** Number of successful page compressions */
+	ulint		compressed_ok;
+	/** Number of page decompressions */
+	ulint		decompressed;
+	/** Duration of page compressions in microseconds */
+	ib_uint64_t	compressed_usec;
+	/** Duration of page decompressions in microseconds */
+	ib_uint64_t	decompressed_usec;
+};
+
+/** Compression statistics */
+typedef struct page_zip_stat_struct page_zip_stat_t;
+
+/** Statistics on compression, indexed by page_zip_des_struct::ssize - 1 */
+extern page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page.  The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page.  The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Shift the dense page directory and BLOB pointer array on record deletion. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in: deleted record */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
+	const byte*	free)	/*!< in: previous start of the free list */
+	__attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		is_clustered)	/*!< in: nonzero for clustered index,
+					zero for others */
+	__attribute__((nonnull));
+#endif
diff --git a/storage/xtradb/include/page0zip.h b/storage/xtradb/include/page0zip.h
new file mode 100644
index 00000000000..4d37302ed20
--- /dev/null
+++ b/storage/xtradb/include/page0zip.h
@@ -0,0 +1,475 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.h
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifndef page0zip_h
+#define page0zip_h
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "mtr0types.h"
+#include "page0types.h"
+#include "buf0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "mem0mem.h"
+
+/**********************************************************************//**
+Determine the size of a compressed page in bytes.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+	__attribute__((nonnull, pure));
+/**********************************************************************//**
+Set the size of a compressed page in bytes. */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size);		/*!< in: size in bytes */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return	FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+	ulint	rec_size,	/*!< in: length of the record in bytes */
+	ulint	comp,		/*!< in: nonzero=compact format */
+	ulint	n_fields,	/*!< in: number of fields in the record;
+				ignored if zip_size == 0 */
+	ulint	zip_size)	/*!< in: compressed page size in bytes, or 0 */
+	__attribute__((const));
+
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page.
+@return	minimum payload size on the page */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+	ulint	n_fields,	/*!< in: number of columns in the index */
+	ulint	zip_size)	/*!< in: compressed page size in bytes */
+	__attribute__((const));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Initialize a compressed page descriptor. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+	page_zip_des_t*	page_zip);	/*!< in/out: compressed page
+					descriptor */
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+	void*		stream,		/*!< in/out: zlib stream */
+	mem_heap_t*	heap);		/*!< in: memory heap to use */
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+	page_zip_des_t*	page_zip,/*!< in: size; out: data, n_blobs,
+				m_start, m_end, m_nonempty */
+	const page_t*	page,	/*!< in: uncompressed page */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+	__attribute__((nonnull(1,3)));
+
+/**********************************************************************//**
+Decompress a page.  This function should tolerate errors on the compressed
+page.  Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return	TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in: data, ssize;
+				out: m_start, m_end, m_nonempty, n_blobs */
+	page_t*		page,	/*!< out: uncompressed page, may be trashed */
+	ibool		all)	/*!< in: TRUE=decompress the whole page;
+				FALSE=verify but do not copy some
+				page header fields that should not change
+				after page creation */
+	__attribute__((nonnull(1,2)));
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor.
+@return	TRUE if ok */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip);	/*!< in: compressed page
+						descriptor */
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+@return	TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	ibool			sloppy)	/*!< in: FALSE=strict,
+					TRUE=ignore the MIN_REC_FLAG */
+	__attribute__((nonnull));
+/**********************************************************************//**
+Check that the compressed and decompressed pages match. */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page)	/*!< in: uncompressed page */
+	__attribute__((nonnull));
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+	__attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+@return	TRUE if page_zip_write_rec() will succeed */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+	__attribute__((nonnull, pure));
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page.  The data must
+already have been written to the uncompressed page. */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	str,	/*!< in: address on the uncompressed page */
+	ulint		length,	/*!< in: length of the data */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+	__attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write an entire record on the compressed page.  The data must already
+have been written to the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record being written */
+	dict_index_t*	index,	/*!< in: the index the record belongs to */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		create)	/*!< in: nonzero=insert, zero=update */
+	__attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record whose data is being
+				written */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		n,	/*!< in: column index */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle,
+				or NULL if no logging is needed */
+	__attribute__((nonnull(1,2,3,4)));
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	ulint		size,	/*!< in: data size of rec */
+	ulint		ptr,	/*!< in: node pointer */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+	__attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		trx_id_col,/*!< in: column number of TRX_ID in rec */
+	trx_id_t	trx_id,	/*!< in: transaction identifier */
+	roll_ptr_t	roll_ptr)/*!< in: roll_ptr */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "deleted" flag of a record on a compressed page.  The flag must
+already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Write the "owned" flag of a record on a compressed page.  The n_owned field
+must already have been written on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
+	__attribute__((nonnull));
+
+/**********************************************************************//**
+Insert a record to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	prev_rec,/*!< in: record after which to insert */
+	const byte*	free_rec,/*!< in: record from which rec was
+				allocated, or NULL */
+	byte*		rec);	/*!< in: record to insert */
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted. */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in: deleted record */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
+	const byte*	free)	/*!< in: previous start of the free list */
+	__attribute__((nonnull(1,2,3,4)));
+
+/**********************************************************************//**
+Add a slot to the dense page directory. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		is_clustered)	/*!< in: nonzero for clustered index,
+					zero for others */
+	__attribute__((nonnull));
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip);/*!< in/out: compressed page */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page.  The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_low(). */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	str,	/*!< in: address on the uncompressed page */
+	ulint		length,	/*!< in: length of the data */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+	__attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Reorganize and compress a page.  This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page with compressed page;
+				on the compressed page, in: size;
+				out: data, n_blobs,
+				m_start, m_end, m_nonempty */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+	__attribute__((nonnull));
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte.  Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+	page_zip_des_t*		page_zip,	/*!< out: copy of src_zip
+						(n_blobs, m_start, m_end,
+						m_nonempty, data[0..size-1]) */
+	page_t*			page,		/*!< out: copy of src */
+	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
+	const page_t*		src,		/*!< in: page */
+	dict_index_t*		index,		/*!< in: index of the B-tree */
+	mtr_t*			mtr)		/*!< in: mini-transaction */
+	__attribute__((nonnull(1,2,3,4)));
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< out: uncompressed page */
+	page_zip_des_t*	page_zip)/*!< out: compressed page */
+	__attribute__((nonnull(1,2)));
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+@return	page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+        const void*     data,   /*!< in: compressed page */
+        ulint           size)   /*!< in: size of compressed page */
+	__attribute__((nonnull));
+
+#ifndef UNIV_HOTBACKUP
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr	pointer to an uncompressed page frame
+@param page_zip	compressed page descriptor
+@return		TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip)			\
+	(buf_frame_get_page_zip(ptr) == (page_zip))
+#else /* !UNIV_HOTBACKUP */
+/** Check if a pointer to an uncompressed page matches a compressed page.
+@param ptr	pointer to an uncompressed page frame
+@param page_zip	compressed page descriptor
+@return		TRUE if ptr and page_zip refer to the same block */
+# define PAGE_ZIP_MATCH(ptr, page_zip)				\
+	(page_align(ptr) + UNIV_PAGE_SIZE == (page_zip)->data)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE	UNIV_INLINE_ORIGINAL
+#endif
+
+#ifndef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+
+#endif /* page0zip_h */
diff --git a/storage/xtradb/include/page0zip.ic b/storage/xtradb/include/page0zip.ic
new file mode 100644
index 00000000000..75cc7a9fcc4
--- /dev/null
+++ b/storage/xtradb/include/page0zip.ic
@@ -0,0 +1,397 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/page0zip.ic
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE
+#endif
+
+#include "page0zip.h"
+#include "page0page.h"
+
+/* The format of compressed pages is as follows.
+
+The header and trailer of the uncompressed pages, excluding the page
+directory in the trailer, are copied as is to the header and trailer
+of the compressed page.
+
+At the end of the compressed page, there is a dense page directory
+pointing to every user record contained on the page, including deleted
+records on the free list.  The dense directory is indexed in the
+collation order, i.e., in the order in which the record list is
+linked on the uncompressed page.  The infimum and supremum records are
+excluded.  The two most significant bits of the entries are allocated
+for the delete-mark and an n_owned flag indicating the last record in
+a chain of records pointed to from the sparse page directory on the
+uncompressed page.
+
+The data between PAGE_ZIP_START and the last page directory entry will
+be written in compressed format, starting at offset PAGE_DATA.
+Infimum and supremum records are not stored.  We exclude the
+REC_N_NEW_EXTRA_BYTES in every record header.  These can be recovered
+from the dense page directory stored at the end of the compressed
+page.
+
+The fields node_ptr (in non-leaf B-tree nodes; level>0), trx_id and
+roll_ptr (in leaf B-tree nodes; level=0), and BLOB pointers of
+externally stored columns are stored separately, in ascending order of
+heap_no and column index, starting backwards from the dense page
+directory.
+
+The compressed data stream may be followed by a modification log
+covering the compressed portion of the page, as follows.
+
+MODIFICATION LOG ENTRY FORMAT
+- write record:
+  - (heap_no - 1) << 1 (1..2 bytes)
+  - extra bytes backwards
+  - data bytes
+- clear record:
+  - (heap_no - 1) << 1 | 1 (1..2 bytes)
+
+The integer values are stored in a variable-length format:
+- 0xxxxxxx: 0..127
+- 1xxxxxxx xxxxxxxx: 0..32767
+
+The end of the modification log is marked by a 0 byte.
+
+In summary, the compressed page looks like this:
+
+(1) Uncompressed page header (PAGE_DATA bytes)
+(2) Compressed index information
+(3) Compressed page data
+(4) Page modification log (page_zip->m_start..page_zip->m_end)
+(5) Empty zero-filled space
+(6) BLOB pointers (on leaf pages)
+  - BTR_EXTERN_FIELD_REF_SIZE for each externally stored column
+  - in descending collation order
+(7) Uncompressed columns of user records, n_dense * uncompressed_size bytes,
+  - indexed by heap_no
+  - DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN for leaf pages of clustered indexes
+  - REC_NODE_PTR_SIZE for non-leaf pages
+  - 0 otherwise
+(8) dense page directory, stored backwards
+  - n_dense = n_heap - 2
+  - existing records in ascending collation order
+  - deleted records (free list) in link order
+*/
+
+/** Start offset of the area that will be compressed */
+#define PAGE_ZIP_START		PAGE_NEW_SUPREMUM_END
+/** Size of a compressed page directory entry */
+#define PAGE_ZIP_DIR_SLOT_SIZE	2
+/** Mask of record offsets */
+#define PAGE_ZIP_DIR_SLOT_MASK	0x3fff
+/** 'owned' flag */
+#define PAGE_ZIP_DIR_SLOT_OWNED	0x4000
+/** 'deleted' flag */
+#define PAGE_ZIP_DIR_SLOT_DEL	0x8000
+
+/**********************************************************************//**
+Decode page_zip->ssize into the size of the compressed page in bytes.
+@return	size in bytes, or 0 if page_zip->ssize is 0 */
+UNIV_INLINE
+ulint
+page_zip_get_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	ulint	size;
+
+	if (UNIV_UNLIKELY(page_zip->ssize == 0)) {
+		return(0);
+	}
+
+	/* ssize 1 yields PAGE_ZIP_MIN_SIZE; each further step doubles it. */
+	size = (PAGE_ZIP_MIN_SIZE >> 1) << page_zip->ssize;
+	ut_ad(size >= PAGE_ZIP_MIN_SIZE);
+	ut_ad(size <= UNIV_PAGE_SIZE);
+
+	return(size);
+}
+/**********************************************************************//**
+Set the size of a compressed page in bytes (stored as page_zip->ssize). */
+UNIV_INLINE
+void
+page_zip_set_size(
+/*==============*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		size)		/*!< in: size in bytes */
+{
+	int	ssize = 0;
+
+	if (size) {
+		ut_ad(ut_is_2pow(size));
+		/* Pick the smallest ssize whose decoded size reaches size. */
+		for (ssize = 1;
+		     size > (ulint) ((PAGE_ZIP_MIN_SIZE >> 1) << ssize);
+		     ssize++) {
+		}
+	}
+
+	page_zip->ssize = ssize;
+
+	ut_ad(page_zip_get_size(page_zip) == size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine if a record is so big that it needs to be stored externally.
+@return	FALSE if the entire record can be stored locally on the page */
+UNIV_INLINE
+ibool
+page_zip_rec_needs_ext(
+/*===================*/
+	ulint	rec_size,	/*!< in: length of the record in bytes */
+	ulint	comp,		/*!< in: nonzero=compact format */
+	ulint	n_fields,	/*!< in: number of fields in the record;
+				ignored if zip_size == 0 */
+	ulint	zip_size)	/*!< in: compressed page size in bytes, or 0 */
+{
+	ut_ad(rec_size
+	      > (comp ? REC_N_NEW_EXTRA_BYTES : REC_N_OLD_EXTRA_BYTES));
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(comp || !zip_size);
+
+#if UNIV_PAGE_SIZE > REC_MAX_DATA_SIZE
+	if (UNIV_UNLIKELY(rec_size >= REC_MAX_DATA_SIZE)) {
+		return(TRUE);
+	}
+#endif
+
+	if (UNIV_UNLIKELY(zip_size)) {
+		ut_ad(comp);
+		/* On a compressed page, there is a two-byte entry in the dense
+		page directory for every record.  But there is no record
+		header.  There should be enough room for one record on an empty
+		leaf page.  Subtract 1 byte for the encoded heap number.  Check
+		also the available space on the uncompressed page. */
+		return(rec_size - (REC_N_NEW_EXTRA_BYTES - 2)
+		       >= (page_zip_empty_size(n_fields, zip_size) - 1)
+		       || rec_size >= page_get_free_space_of_empty(TRUE) / 2);
+	}
+
+	return(rec_size >= page_get_free_space_of_empty(comp) / 2);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Validate a compressed page descriptor by means of ut_ad() assertions.
+@return	TRUE (the only return; each check is a ut_ad() assertion) */
+UNIV_INLINE
+ibool
+page_zip_simple_validate(
+/*=====================*/
+	const page_zip_des_t*	page_zip)/*!< in: compressed page descriptor */
+{
+	ut_ad(page_zip);
+	ut_ad(page_zip->data);
+	ut_ad(page_zip->ssize < PAGE_ZIP_NUM_SSIZE);
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + PAGE_ZIP_DIR_SLOT_SIZE);
+	ut_ad(page_zip->m_start <= page_zip->m_end);
+	ut_ad(page_zip->m_end < page_zip_get_size(page_zip));
+	ut_ad(page_zip->n_blobs
+	      < page_zip_get_size(page_zip) / BTR_EXTERN_FIELD_REF_SIZE);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Determine the length of the page trailer.
+@return length of the page trailer, in bytes, not including the
+terminating zero byte of the modification log */
+UNIV_INLINE
+ibool
+page_zip_get_trailer_len(
+/*=====================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint*			entry_size)/*!< out: size of the uncompressed
+					portion of a user record, or NULL */
+{
+	/* Every record has a slot in the dense page directory. */
+	ulint	uncompressed_size = PAGE_ZIP_DIR_SLOT_SIZE;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	if (UNIV_UNLIKELY(!page_is_leaf(page_zip->data))) {
+		/* non-leaf page: the node pointer is kept uncompressed */
+		uncompressed_size += REC_NODE_PTR_SIZE;
+		ut_ad(!page_zip->n_blobs);
+	} else if (UNIV_UNLIKELY(is_clust)) {
+		/* clustered index leaf page: trx_id and roll_ptr */
+		uncompressed_size += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		ut_ad(!page_zip->n_blobs);
+	}
+
+	if (entry_size) {
+		*entry_size = uncompressed_size;
+	}
+
+	/* n_heap - 2 entries (n_dense), plus one reference per BLOB */
+	return((page_dir_get_n_heap(page_zip->data) - 2) * uncompressed_size
+	       + page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE);
+}
+
+/**********************************************************************//**
+Determine how big a record can be inserted without recompressing the page.
+The result is the compressed page size minus the trailer length
+(as returned by page_zip_get_trailer_len(), increased by the
+uncompressed per-record size that the new record itself would need),
+minus page_zip->m_end, minus (REC_N_NEW_EXTRA_BYTES - 2).
+@return a positive number indicating the maximum size of a record
+whose insertion is guaranteed to succeed, or zero or negative */
+UNIV_INLINE
+lint
+page_zip_max_ins_size(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust)/*!< in: TRUE if clustered index */
+{
+	ulint	uncompressed_size;
+	ulint	trailer_len;
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+					       &uncompressed_size);
+
+	/* When a record is created, a pointer may be added to
+	the dense directory.
+	Likewise, space for the columns that will not be
+	compressed will be allocated from the page trailer.
+	Also the BLOB pointers will be allocated from there, but
+	we may as well count them in the length of the record. */
+
+	trailer_len += uncompressed_size;
+
+	return((lint) page_zip_get_size(page_zip)
+	       - trailer_len - page_zip->m_end
+	       - (REC_N_NEW_EXTRA_BYTES - 2));
+}
+
+/**********************************************************************//**
+Determine if enough space is available in the modification log.
+The check is: adjusted record length + trailer length + page_zip->m_end
+must be strictly less than the compressed page size.  The record length
+is adjusted by subtracting (REC_N_NEW_EXTRA_BYTES - 2); when create is
+nonzero the trailer length additionally includes the uncompressed
+per-record size reported by page_zip_get_trailer_len().
+A debug assertion requires length > REC_N_NEW_EXTRA_BYTES.
+@return	TRUE if enough space is available */
+UNIV_INLINE
+ibool
+page_zip_available(
+/*===============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	ibool			is_clust,/*!< in: TRUE if clustered index */
+	ulint			length,	/*!< in: combined size of the record */
+	ulint			create)	/*!< in: nonzero=add the record to
+					the heap */
+{
+	ulint	uncompressed_size;
+	ulint	trailer_len;
+
+	ut_ad(length > REC_N_NEW_EXTRA_BYTES);
+
+	trailer_len = page_zip_get_trailer_len(page_zip, is_clust,
+					       &uncompressed_size);
+
+	/* Subtract the fixed extra bytes and add the maximum
+	space needed for identifying the record (encoded heap_no). */
+	length -= REC_N_NEW_EXTRA_BYTES - 2;
+
+	if (UNIV_UNLIKELY(create)) {
+		/* When a record is created, a pointer may be added to
+		the dense directory.
+		Likewise, space for the columns that will not be
+		compressed will be allocated from the page trailer.
+		Also the BLOB pointers will be allocated from there, but
+		we may as well count them in the length of the record. */
+
+		trailer_len += uncompressed_size;
+	}
+
+	return(UNIV_LIKELY(length
+			   + trailer_len
+			   + page_zip->m_end
+			   < page_zip_get_size(page_zip)));
+}
+
+/**********************************************************************//**
+Initialize a compressed page descriptor.
+Every byte of the descriptor structure is set to zero with memset(),
+so all of its fields start out as all-bits-zero. */
+UNIV_INLINE
+void
+page_zip_des_init(
+/*==============*/
+	page_zip_des_t*	page_zip)	/*!< in/out: compressed page
+					descriptor */
+{
+	memset(page_zip, 0, sizeof *page_zip);
+}
+
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+	const byte*	data,/*!< in: data on the uncompressed page */
+	ulint		length,	/*!< in: length of the data */
+	mtr_t*		mtr);	/*!< in: mini-transaction */
+
+/**********************************************************************//**
+Write data to the uncompressed header portion of a page.  The data must
+already have been written to the uncompressed page.
+However, the data portion of the uncompressed page may differ from
+the compressed page when a record is being inserted in
+page_cur_insert_rec_zip().
+The function copies length bytes from str to page_zip->data at the
+offset given by page_offset(str).  If mtr is non-NULL and
+UNIV_HOTBACKUP is not defined, the write is also logged through
+page_zip_write_header_log().
+NOTE(review): only the start offset is asserted to be below PAGE_DATA;
+the end of the range (offset + length) is not checked here - confirm
+that all callers stay within the header. */
+UNIV_INLINE
+void
+page_zip_write_header(
+/*==================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	str,	/*!< in: address on the uncompressed page */
+	ulint		length,	/*!< in: length of the data */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+{
+	ulint	pos;
+
+	ut_ad(PAGE_ZIP_MATCH(str, page_zip));
+	ut_ad(page_zip_simple_validate(page_zip));
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	pos = page_offset(str);
+
+	ut_ad(pos < PAGE_DATA);
+
+	memcpy(page_zip->data + pos, str, length);
+
+	/* The following would fail in page_cur_insert_rec_zip(). */
+	/* ut_ad(page_zip_validate(page_zip, str - pos)); */
+
+	if (UNIV_LIKELY_NULL(mtr)) {
+#ifndef UNIV_HOTBACKUP
+		page_zip_write_header_log(str, length, mtr);
+#endif /* !UNIV_HOTBACKUP */
+	}
+}
+
+#ifdef UNIV_MATERIALIZE
+# undef UNIV_INLINE
+# define UNIV_INLINE	UNIV_INLINE_ORIGINAL
+#endif
diff --git a/storage/xtradb/include/pars0grm.h b/storage/xtradb/include/pars0grm.h
new file mode 100644
index 00000000000..3de233eed3a
--- /dev/null
+++ b/storage/xtradb/include/pars0grm.h
@@ -0,0 +1,236 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
+
+As a special exception, when this file is copied by Bison into a
+Bison output file, you may use that output file without restriction.
+This special exception was added by the Free Software Foundation
+in version 1.24 of Bison.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* A Bison parser, made by GNU Bison 1.875d.  */
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     PARS_INT_LIT = 258,
+     PARS_FLOAT_LIT = 259,
+     PARS_STR_LIT = 260,
+     PARS_FIXBINARY_LIT = 261,
+     PARS_BLOB_LIT = 262,
+     PARS_NULL_LIT = 263,
+     PARS_ID_TOKEN = 264,
+     PARS_AND_TOKEN = 265,
+     PARS_OR_TOKEN = 266,
+     PARS_NOT_TOKEN = 267,
+     PARS_GE_TOKEN = 268,
+     PARS_LE_TOKEN = 269,
+     PARS_NE_TOKEN = 270,
+     PARS_PROCEDURE_TOKEN = 271,
+     PARS_IN_TOKEN = 272,
+     PARS_OUT_TOKEN = 273,
+     PARS_BINARY_TOKEN = 274,
+     PARS_BLOB_TOKEN = 275,
+     PARS_INT_TOKEN = 276,
+     PARS_INTEGER_TOKEN = 277,
+     PARS_FLOAT_TOKEN = 278,
+     PARS_CHAR_TOKEN = 279,
+     PARS_IS_TOKEN = 280,
+     PARS_BEGIN_TOKEN = 281,
+     PARS_END_TOKEN = 282,
+     PARS_IF_TOKEN = 283,
+     PARS_THEN_TOKEN = 284,
+     PARS_ELSE_TOKEN = 285,
+     PARS_ELSIF_TOKEN = 286,
+     PARS_LOOP_TOKEN = 287,
+     PARS_WHILE_TOKEN = 288,
+     PARS_RETURN_TOKEN = 289,
+     PARS_SELECT_TOKEN = 290,
+     PARS_SUM_TOKEN = 291,
+     PARS_COUNT_TOKEN = 292,
+     PARS_DISTINCT_TOKEN = 293,
+     PARS_FROM_TOKEN = 294,
+     PARS_WHERE_TOKEN = 295,
+     PARS_FOR_TOKEN = 296,
+     PARS_DDOT_TOKEN = 297,
+     PARS_READ_TOKEN = 298,
+     PARS_ORDER_TOKEN = 299,
+     PARS_BY_TOKEN = 300,
+     PARS_ASC_TOKEN = 301,
+     PARS_DESC_TOKEN = 302,
+     PARS_INSERT_TOKEN = 303,
+     PARS_INTO_TOKEN = 304,
+     PARS_VALUES_TOKEN = 305,
+     PARS_UPDATE_TOKEN = 306,
+     PARS_SET_TOKEN = 307,
+     PARS_DELETE_TOKEN = 308,
+     PARS_CURRENT_TOKEN = 309,
+     PARS_OF_TOKEN = 310,
+     PARS_CREATE_TOKEN = 311,
+     PARS_TABLE_TOKEN = 312,
+     PARS_INDEX_TOKEN = 313,
+     PARS_UNIQUE_TOKEN = 314,
+     PARS_CLUSTERED_TOKEN = 315,
+     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+     PARS_ON_TOKEN = 317,
+     PARS_ASSIGN_TOKEN = 318,
+     PARS_DECLARE_TOKEN = 319,
+     PARS_CURSOR_TOKEN = 320,
+     PARS_SQL_TOKEN = 321,
+     PARS_OPEN_TOKEN = 322,
+     PARS_FETCH_TOKEN = 323,
+     PARS_CLOSE_TOKEN = 324,
+     PARS_NOTFOUND_TOKEN = 325,
+     PARS_TO_CHAR_TOKEN = 326,
+     PARS_TO_NUMBER_TOKEN = 327,
+     PARS_TO_BINARY_TOKEN = 328,
+     PARS_BINARY_TO_NUMBER_TOKEN = 329,
+     PARS_SUBSTR_TOKEN = 330,
+     PARS_REPLSTR_TOKEN = 331,
+     PARS_CONCAT_TOKEN = 332,
+     PARS_INSTR_TOKEN = 333,
+     PARS_LENGTH_TOKEN = 334,
+     PARS_SYSDATE_TOKEN = 335,
+     PARS_PRINTF_TOKEN = 336,
+     PARS_ASSERT_TOKEN = 337,
+     PARS_RND_TOKEN = 338,
+     PARS_RND_STR_TOKEN = 339,
+     PARS_ROW_PRINTF_TOKEN = 340,
+     PARS_COMMIT_TOKEN = 341,
+     PARS_ROLLBACK_TOKEN = 342,
+     PARS_WORK_TOKEN = 343,
+     PARS_UNSIGNED_TOKEN = 344,
+     PARS_EXIT_TOKEN = 345,
+     PARS_FUNCTION_TOKEN = 346,
+     PARS_LOCK_TOKEN = 347,
+     PARS_SHARE_TOKEN = 348,
+     PARS_MODE_TOKEN = 349,
+     NEG = 350
+   };
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+extern YYSTYPE yylval;
+
+
+
diff --git a/storage/xtradb/include/pars0opt.h b/storage/xtradb/include/pars0opt.h
new file mode 100644
index 00000000000..42d956068f8
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.h
@@ -0,0 +1,75 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.h
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0opt_h
+#define pars0opt_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0sym.h"
+#include "dict0types.h"
+#include "row0sel.h"
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to use for the tables. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node);	/*!< in: parsed select node */
+/*******************************************************************//**
+Looks for occurrences of the columns of the table in the query subgraph and
+adds them to the list of columns if an occurrence of the same column does not
+already exist in the list. If the column is already in the list, puts a value
+indirection to point to the occurrence in the column list, except if the
+column occurrence we are looking at is in the column list, in which case
+nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp);		/*!< in: expression or condition */
+/********************************************************************//**
+Prints info of a query plan. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+
+#ifndef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0opt.ic b/storage/xtradb/include/pars0opt.ic
new file mode 100644
index 00000000000..e0bb6bf1af2
--- /dev/null
+++ b/storage/xtradb/include/pars0opt.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0opt.ic
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0pars.h b/storage/xtradb/include/pars0pars.h
new file mode 100644
index 00000000000..fe5d76ebbb0
--- /dev/null
+++ b/storage/xtradb/include/pars0pars.h
@@ -0,0 +1,748 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.h
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0pars_h
+#define pars0pars_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+#include "trx0types.h"
+#include "ut0vec.h"
+
+/** Type of the user functions. The first argument is always InnoDB-supplied
+and varies in type, while 'user_arg' is a user-supplied argument. The
+meaning of the return type also varies. See the individual use cases, e.g.
+the FETCH statement, for details on them. */
+typedef void* (*pars_user_func_cb_t)(void* arg, void* user_arg);
+
+/** If the following is set TRUE, the parser will emit debugging
+information */
+extern int	yydebug;
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+extern ibool	pars_print_lexed;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query : the code is
+NOT re-entrant */
+extern sym_tab_t*	pars_sym_tab_global;
+
+extern pars_res_word_t	pars_to_char_token;
+extern pars_res_word_t	pars_to_number_token;
+extern pars_res_word_t	pars_to_binary_token;
+extern pars_res_word_t	pars_binary_to_number_token;
+extern pars_res_word_t	pars_substr_token;
+extern pars_res_word_t	pars_replstr_token;
+extern pars_res_word_t	pars_concat_token;
+extern pars_res_word_t	pars_length_token;
+extern pars_res_word_t	pars_instr_token;
+extern pars_res_word_t	pars_sysdate_token;
+extern pars_res_word_t	pars_printf_token;
+extern pars_res_word_t	pars_assert_token;
+extern pars_res_word_t	pars_rnd_token;
+extern pars_res_word_t	pars_rnd_str_token;
+extern pars_res_word_t	pars_count_token;
+extern pars_res_word_t	pars_sum_token;
+extern pars_res_word_t	pars_distinct_token;
+extern pars_res_word_t	pars_binary_token;
+extern pars_res_word_t	pars_blob_token;
+extern pars_res_word_t	pars_int_token;
+extern pars_res_word_t	pars_char_token;
+extern pars_res_word_t	pars_float_token;
+extern pars_res_word_t	pars_update_token;
+extern pars_res_word_t	pars_asc_token;
+extern pars_res_word_t	pars_desc_token;
+extern pars_res_word_t	pars_open_token;
+extern pars_res_word_t	pars_close_token;
+extern pars_res_word_t	pars_share_token;
+extern pars_res_word_t	pars_unique_token;
+extern pars_res_word_t	pars_clustered_token;
+
+extern ulint		pars_star_denoter;
+
+/* Procedure parameter types */
+#define PARS_INPUT	0
+#define PARS_OUTPUT	1
+#define PARS_NOT_PARAM	2
+
+int
+yyparse(void);
+
+/*************************************************************//**
+Parses an SQL string returning the query graph.
+@return	own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+	pars_info_t*	info,	/*!< in: extra information, or NULL */
+	const char*	str);	/*!< in: SQL string */
+/*************************************************************//**
+Retrieves characters to the lexical analyzer. */
+UNIV_INTERN
+void
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer where to copy */
+	int*	result,		/*!< out: number of characters copied or EOF */
+	int	max_size);	/*!< in: maximum number of characters which fit
+				in the buffer */
+/*************************************************************//**
+Called by yyparse on error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+	const char*	s);	/*!< in: error message string */
+/*********************************************************************//**
+Parses a variable declaration.
+@return	own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+	sym_node_t*	node,	/*!< in: symbol table node allocated for the
+				id of the variable */
+	pars_res_word_t* type);	/*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses a function expression.
+@return	own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg);	/*!< in: first argument in the argument list */
+/*********************************************************************//**
+Parses an operator expression.
+@return	own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2);	/*!< in: second argument or NULL for an unary
+				operator */
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+@return	own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*	column,	/*!< in: column name */
+	pars_res_word_t* asc);	/*!< in: &pars_asc_token or &pars_desc_token */
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement.
+@return	own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list);	/*!< in: variables list or NULL */
+/*********************************************************************//**
+Parses a cursor declaration.
+@return	sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a function declaration.
+@return	sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node);	/*!< in: function id node in the symbol
+					table */
+/*********************************************************************//**
+Parses a select statement.
+@return	own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+	sel_node_t*	select_node,	/*!< in: select node already containing
+					the select list */
+	sym_node_t*	table_list,	/*!< in: table list */
+	que_node_t*	search_cond,	/*!< in: search condition or NULL */
+	pars_res_word_t* for_update,	/*!< in: NULL or &pars_update_token */
+	pars_res_word_t* consistent_read,/*!< in: NULL or
+						&pars_consistent_token */
+	order_node_t*	order_by);	/*!< in: NULL or an order-by node */
+/*********************************************************************//**
+Parses a column assignment in an update.
+@return	column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+	sym_node_t*	column,	/*!< in: column to assign */
+	que_node_t*	exp);	/*!< in: value to assign */
+/*********************************************************************//**
+Parses a delete or update statement start.
+@return	own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+	ibool		is_delete,	/*!< in: TRUE if delete */
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	col_assign_node_t* col_assign_list);/*!< in: column assignment list, NULL
+					if delete */
+/*********************************************************************//**
+Parses an update or delete statement.
+@return	own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+	upd_node_t*	node,		/*!< in: update node */
+	sym_node_t*	cursor_sym,	/*!< in: pointer to a cursor entry in
+					the symbol table or NULL */
+	que_node_t*	search_cond);	/*!< in: search condition or NULL */
+/*********************************************************************//**
+Parses an insert statement.
+@return	own: insert node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	que_node_t*	values_list,	/*!< in: value expression list or NULL */
+	sel_node_t*	select);	/*!< in: select condition or NULL */
+/*********************************************************************//**
+Parses a procedure parameter declaration.
+@return	own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+	sym_node_t*	node,	/*!< in: symbol table node allocated for the
+				id of the parameter */
+	ulint		param_type,
+				/*!< in: PARS_INPUT or PARS_OUTPUT */
+	pars_res_word_t* type);	/*!< in: pointer to a type token */
+/*********************************************************************//**
+Parses an elsif element.
+@return	elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses an if-statement.
+@return	if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list,	/*!< in: statement list */
+	que_node_t*	else_part);	/*!< in: else-part statement list */
+/*********************************************************************//**
+Parses a for-loop-statement.
+@return	for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+	sym_node_t*	loop_var,	/*!< in: loop variable */
+	que_node_t*	loop_start_limit,/*!< in: loop start expression */
+	que_node_t*	loop_end_limit,	/*!< in: loop end expression */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses a while-statement.
+@return	while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+	que_node_t*	cond,		/*!< in: while-condition */
+	que_node_t*	stat_list);	/*!< in: statement list */
+/*********************************************************************//**
+Parses an exit statement.
+@return	exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void);
+/*=====================*/
+/*********************************************************************//**
+Parses a return-statement.
+@return	return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a procedure call.
+@return	function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+	que_node_t*	res_word,/*!< in: procedure name reserved word */
+	que_node_t*	args);	/*!< in: argument list */
+/*********************************************************************//**
+Parses an assignment statement.
+@return	assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+	sym_node_t*	var,	/*!< in: variable to assign */
+	que_node_t*	val);	/*!< in: value to assign */
+/*********************************************************************//**
+Parses a fetch statement. into_list or user_func (but not both) must be
+non-NULL.
+@return	fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+	sym_node_t*	cursor,		/*!< in: cursor node */
+	sym_node_t*	into_list,	/*!< in: variables to set, or NULL */
+	sym_node_t*	user_func);	/*!< in: user function name, or NULL */
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return	open or close cursor statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+	ulint		type,	/*!< in: ROW_SEL_OPEN_CURSOR
+				or ROW_SEL_CLOSE_CURSOR */
+	sym_node_t*	cursor);	/*!< in: cursor node */
+/*********************************************************************//**
+Parses a row_printf-statement.
+@return	row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+	sel_node_t*	sel_node);	/*!< in: select node */
+/*********************************************************************//**
+Parses a commit statement.
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void);
+/*=======================*/
+/*********************************************************************//**
+Parses a rollback statement.
+@return	own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void);
+/*=========================*/
+/*********************************************************************//**
+Parses a column definition at a table creation.
+@return	column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+	sym_node_t*		sym_node,	/*!< in: column node in the
+						symbol table */
+	pars_res_word_t*	type,		/*!< in: data type */
+	sym_node_t*		len,		/*!< in: length of column, or
+						NULL */
+	void*			is_unsigned,	/*!< in: if not NULL, column
+						is of type UNSIGNED. */
+	void*			is_not_null);	/*!< in: if not NULL, column
+						is of type NOT NULL. */
+/*********************************************************************//**
+Parses a table creation operation.
+@return	table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_defs,	/*!< in: list of column names */
+	void*		not_fit_in_memory);/*!< in: a non-NULL pointer means that
+					this is a table which in simulations
+					should be simulated as not fitting
+					in memory; thread is put to sleep
+					to simulate disk accesses; NOTE that
+					this flag is not stored to the data
+					dictionary on disk, and the database
+					will forget about non-NULL value if
+					it has to reload the table definition
+					from disk */
+/*********************************************************************//**
+Parses an index creation operation.
+@return	index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+	pars_res_word_t* unique_def,	/*!< in: not NULL if a unique index */
+	pars_res_word_t* clustered_def,	/*!< in: not NULL if a clustered index */
+	sym_node_t*	index_sym,	/*!< in: index name node in the symbol
+					table */
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_list);	/*!< in: list of column names */
+/*********************************************************************//**
+Parses a procedure definition.
+@return	query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+	sym_node_t*	sym_node,	/*!< in: procedure id node in the symbol
+					table */
+	sym_node_t*	param_list,	/*!< in: parameter declaration list */
+	que_node_t*	stat_list);	/*!< in: statement list */
+
+/*************************************************************//**
+Parses a stored procedure call, when this is not within another stored
+procedure, that is, the client issues a procedure call directly.
+In MySQL/InnoDB, stored InnoDB procedures are invoked via the
+parsed procedure tree, not via InnoDB SQL, so this function is not used.
+@return	query graph */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+	sym_node_t*	sym_node);	/*!< in: stored procedure name */
+/******************************************************************//**
+Completes a query graph by adding query thread and fork nodes
+above it and prepares the graph for running. The fork created is of
+type QUE_FORK_MYSQL_INTERFACE.
+@return	query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+	que_node_t*	node,	/*!< in: root node for an incomplete
+				query graph */
+	trx_t*		trx,	/*!< in: transaction handle */
+	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+
+/****************************************************************//**
+Create parser info struct.
+@return	own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void);
+/*==================*/
+
+/****************************************************************//**
+Free info struct and everything it contains. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+	pars_info_t*	info);	/*!< in, own: info struct */
+
+/****************************************************************//**
+Add bound literal. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const void*	address,	/*!< in: address */
+	ulint		length,		/*!< in: length of data */
+	ulint		type,		/*!< in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype);	/*!< in: precise type, e.g.
+					DATA_UNSIGNED */
+
+/****************************************************************//**
+Equivalent to pars_info_add_literal(info, name, str, strlen(str),
+DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const char*	str);		/*!< in: string */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	lint		val);		/*!< in: value */
+
+/****************************************************************//**
+Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_BINARY, 0);
+
+except that the buffer is dynamically allocated from the info struct's
+heap. */
+UNIV_INTERN
+void
+pars_info_add_dulint_literal(
+/*=========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	dulint		val);		/*!< in: value */
+/****************************************************************//**
+Add user function. */
+UNIV_INTERN
+void
+pars_info_add_function(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: function name */
+	pars_user_func_cb_t	func,	/*!< in: function address */
+	void*			arg);	/*!< in: user-supplied argument */
+
+/****************************************************************//**
+Add bound id. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const char*	id);		/*!< in: id */
+
+/****************************************************************//**
+Get user function with the given name.
+@return	user func, or NULL if not found */
+UNIV_INTERN
+pars_user_func_t*
+pars_info_get_user_func(
+/*====================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name);	/*!< in: function name to find */
+
+/****************************************************************//**
+Get bound literal with the given name.
+@return	bound literal, or NULL if not found */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name);	/*!< in: bound literal name to find */
+
+/****************************************************************//**
+Get bound id with the given name.
+@return	bound id, or NULL if not found */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name);	/*!< in: bound id name to find */
+
+/******************************************************************//**
+Release any resources used by the lexer. */
+UNIV_INTERN
+void
+pars_lexer_close(void);
+/*==================*/
+
+/** Extra information supplied for pars_sql(). */
+struct pars_info_struct {
+	mem_heap_t*	heap;		/*!< our own memory heap */
+
+	ib_vector_t*	funcs;		/*!< user functions, or NULL
+					(pars_user_func_t*) */
+	ib_vector_t*	bound_lits;	/*!< bound literals, or NULL
+					(pars_bound_lit_t*) */
+	ib_vector_t*	bound_ids;	/*!< bound ids, or NULL
+					(pars_bound_id_t*) */
+
+	ibool		graph_owns_us;	/*!< if TRUE (which is the default),
+					que_graph_free() will free us */
+};
+
+/** User-supplied function and argument. */
+struct pars_user_func_struct {
+	const char*		name;	/*!< function name */
+	pars_user_func_cb_t	func;	/*!< function address */
+	void*			arg;	/*!< user-supplied argument */
+};
+
+/** Bound literal. */
+struct pars_bound_lit_struct {
+	const char*	name;		/*!< name */
+	const void*	address;	/*!< address */
+	ulint		length;		/*!< length of data */
+	ulint		type;		/*!< type, e.g. DATA_FIXBINARY */
+	ulint		prtype;		/*!< precise type, e.g. DATA_UNSIGNED */
+};
+
+/** Bound identifier. */
+struct pars_bound_id_struct {
+	const char*	name;		/*!< name */
+	const char*	id;		/*!< identifier */
+};
+
+/** Struct used to denote a reserved word in a parsing tree */
+struct pars_res_word_struct{
+	int	code;	/*!< the token code for the reserved word from
+			pars0grm.h */
+};
+
+/** A predefined function or operator node in a parsing tree; this construct
+is also used for some non-functions like the assignment ':=' */
+struct func_node_struct{
+	que_common_t	common;	/*!< type: QUE_NODE_FUNC */
+	int		func;	/*!< token code of the function name */
+	ulint		class;	/*!< class of the function */
+	que_node_t*	args;	/*!< argument(s) of the function */
+	UT_LIST_NODE_T(func_node_t) cond_list;
+				/*!< list of comparison conditions; defined
+				only for comparison operator nodes except,
+				presently, for OPT_SCROLL_TYPE ones */
+	UT_LIST_NODE_T(func_node_t) func_node_list;
+				/*!< list of function nodes in a parsed
+				query graph */
+};
+
+/** An order-by node in a select */
+struct order_node_struct{
+	que_common_t	common;	/*!< type: QUE_NODE_ORDER */
+	sym_node_t*	column;	/*!< order-by column */
+	ibool		asc;	/*!< TRUE if ascending, FALSE if descending */
+};
+
+/** Procedure definition node */
+struct proc_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_PROC */
+	sym_node_t*	proc_id;	/*!< procedure name symbol in the symbol
+					table of this same procedure */
+	sym_node_t*	param_list;	/*!< input and output parameters */
+	que_node_t*	stat_list;	/*!< statement list */
+	sym_tab_t*	sym_tab;	/*!< symbol table of this procedure */
+};
+
+/** elsif-element node */
+struct elsif_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_ELSIF */
+	que_node_t*	cond;		/*!< if condition */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** if-statement node */
+struct if_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_IF */
+	que_node_t*	cond;		/*!< if condition */
+	que_node_t*	stat_list;	/*!< statement list */
+	que_node_t*	else_part;	/*!< else-part statement list */
+	elsif_node_t*	elsif_list;	/*!< elsif element list */
+};
+
+/** while-statement node */
+struct while_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_WHILE */
+	que_node_t*	cond;		/*!< while condition */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** for-loop-statement node */
+struct for_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_FOR */
+	sym_node_t*	loop_var;	/*!< loop variable: this is the
+					dereferenced symbol from the
+					variable declarations, not the
+					symbol occurrence in the for loop
+					definition */
+	que_node_t*	loop_start_limit;/*!< initial value of loop variable */
+	que_node_t*	loop_end_limit;	/*!< end value of loop variable */
+	lint		loop_end_value;	/*!< evaluated value for the end value:
+					it is calculated only when the loop
+					is entered, and will not change within
+					the loop */
+	que_node_t*	stat_list;	/*!< statement list */
+};
+
+/** exit statement node */
+struct exit_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_EXIT */
+};
+
+/** return-statement node */
+struct return_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_RETURN */
+};
+
+/** Assignment statement node */
+struct assign_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_ASSIGNMENT */
+	sym_node_t*	var;		/*!< variable to set */
+	que_node_t*	val;		/*!< value to assign */
+};
+
+/** Column assignment node */
+struct col_assign_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_COL_ASSIGN */
+	sym_node_t*	col;		/*!< column to set */
+	que_node_t*	val;		/*!< value to assign */
+};
+
+/** Classes of functions */
+/* @{ */
+#define PARS_FUNC_ARITH		1	/*!< +, -, *, / */
+#define	PARS_FUNC_LOGICAL	2	/*!< AND, OR, NOT */
+#define PARS_FUNC_CMP		3	/*!< comparison operators */
+#define	PARS_FUNC_PREDEFINED	4	/*!< TO_NUMBER, SUBSTR, ... */
+#define	PARS_FUNC_AGGREGATE	5	/*!< COUNT, DISTINCT, SUM */
+#define	PARS_FUNC_OTHER		6	/*!< these are not real functions,
+					e.g., := */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0pars.ic b/storage/xtradb/include/pars0pars.ic
new file mode 100644
index 00000000000..ae6c13cd671
--- /dev/null
+++ b/storage/xtradb/include/pars0pars.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0pars.ic
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0sym.h b/storage/xtradb/include/pars0sym.h
new file mode 100644
index 00000000000..6d1a4b82414
--- /dev/null
+++ b/storage/xtradb/include/pars0sym.h
@@ -0,0 +1,244 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.h
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0sym_h
+#define pars0sym_h
+
+#include "univ.i"
+#include "que0types.h"
+#include "usr0types.h"
+#include "dict0types.h"
+#include "pars0types.h"
+#include "row0types.h"
+
+/******************************************************************//**
+Creates a symbol table for a single stored procedure or query.
+@return	own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+	mem_heap_t*	heap);	/*!< in: memory heap where to create */
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in, own: symbol table */
+/******************************************************************//**
+Adds an integer literal to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	ulint		val);		/*!< in: integer value */
+/******************************************************************//**
+Adds a string literal to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		str,		/*!< in: string with no quotes around
+					it */
+	ulint		len);		/*!< in: string length */
+/******************************************************************//**
+Add a bound literal to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name,		/*!< in: name of bound literal */
+	ulint*		lit_type);	/*!< out: type of literal (PARS_*_LIT) */
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+	sym_tab_t*	sym_tab);	/*!< in: symbol table */
+/******************************************************************//**
+Adds an identifier to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		name,		/*!< in: identifier name */
+	ulint		len);		/*!< in: identifier length */
+
+/******************************************************************//**
+Add a bound identifier to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name);		/*!< in: name of bound id */
+
+/** Index of sym_node_struct::field_nos corresponding to the clustered index */
+#define	SYM_CLUST_FIELD_NO	0
+/** Index of sym_node_struct::field_nos corresponding to a secondary index */
+#define	SYM_SEC_FIELD_NO	1
+
+/** Types of a symbol table node */
+enum sym_tab_entry {
+	SYM_VAR = 91,		/*!< declared parameter or local
+				variable of a procedure */
+	SYM_IMPLICIT_VAR,	/*!< storage for an intermediate result
+				of a calculation */
+	SYM_LIT,		/*!< literal */
+	SYM_TABLE,		/*!< database table name */
+	SYM_COLUMN,		/*!< database table column name */
+	SYM_CURSOR,		/*!< named cursor */
+	SYM_PROCEDURE_NAME,	/*!< stored procedure name */
+	SYM_INDEX,		/*!< database index name */
+	SYM_FUNCTION		/*!< user function name */
+};
+
+/** Symbol table node */
+struct sym_node_struct{
+	que_common_t			common;		/*!< node type:
+							QUE_NODE_SYMBOL */
+	/* NOTE: if the data field in 'common.val' is not NULL and the symbol
+	table node is not for a temporary column, the memory for the value has
+	been allocated from dynamic memory and it should be freed when the
+	symbol table is discarded */
+
+	/* 'alias' and 'indirection' are almost the same, but not quite.
+	'alias' always points to the primary instance of the variable, while
+	'indirection' does the same only if we should use the primary
+	instance's values for the node's data. This is usually the case, but
+	when initializing a cursor (e.g., "DECLARE CURSOR c IS SELECT * FROM
+	t WHERE id = x;"), we copy the values from the primary instance to
+	the cursor's instance so that they are fixed for the duration of the
+	cursor, and set 'indirection' to NULL. If we did not, the value of
+	'x' could change between fetches and things would break horribly.
+
+	TODO: It would be cleaner to make 'indirection' a boolean field and
+	always use 'alias' to refer to the primary node. */
+
+	sym_node_t*			indirection;	/*!< pointer to
+							another symbol table
+							node which contains
+							the value for this
+							node, NULL otherwise */
+	sym_node_t*			alias;		/*!< pointer to
+							another symbol table
+							node for which this
+							node is an alias,
+							NULL otherwise */
+	UT_LIST_NODE_T(sym_node_t)	col_var_list;	/*!< list of table
+							columns or a list of
+							input variables for an
+							explicit cursor */
+	ibool				copy_val;	/*!< TRUE if a column
+							and its value should
+							be copied to dynamic
+							memory when fetched */
+	ulint				field_nos[2];	/*!< if a column, in
+							the position
+							SYM_CLUST_FIELD_NO is
+							the field number in the
+							clustered index; in
+							the position
+							SYM_SEC_FIELD_NO
+							the field number in the
+							non-clustered index to
+							use first; if not found
+							from the index, then
+							ULINT_UNDEFINED */
+	ibool				resolved;	/*!< TRUE if the
+							meaning of a variable
+							or a column has been
+							resolved; for literals
+							this is always TRUE */
+	enum sym_tab_entry		token_type;	/*!< type of the
+							parsed token */
+	const char*			name;		/*!< name of an id */
+	ulint				name_len;	/*!< id name length */
+	dict_table_t*			table;		/*!< table definition
+							if a table id or a
+							column id */
+	ulint				col_no;		/*!< column number if a
+							column */
+	sel_buf_t*			prefetch_buf;	/*!< NULL, or a buffer
+							for cached column
+							values for prefetched
+							rows */
+	sel_node_t*			cursor_def;	/*!< cursor definition
+							select node if a
+							named cursor */
+	ulint				param_type;	/*!< PARS_INPUT,
+							PARS_OUTPUT, or
+							PARS_NOT_PARAM if not a
+							procedure parameter */
+	sym_tab_t*			sym_table;	/*!< back pointer to
+							the symbol table */
+	UT_LIST_NODE_T(sym_node_t)	sym_list;	/*!< list of symbol
+							nodes */
+};
+
+/** Symbol table */
+struct sym_tab_struct{
+	que_t*			query_graph;
+					/*!< query graph generated by the
+					parser */
+	const char*		sql_string;
+					/*!< SQL string to parse */
+	size_t			string_len;
+					/*!< SQL string length */
+	int			next_char_pos;
+					/*!< position of the next character in
+					sql_string to give to the lexical
+					analyzer */
+	pars_info_t*		info;	/*!< extra information, or NULL */
+	sym_node_list_t		sym_list;
+					/*!< list of symbol nodes in the symbol
+					table */
+	UT_LIST_BASE_NODE_T(func_node_t)
+				func_node_list;
+					/*!< list of function nodes in the
+					parsed query graph */
+	mem_heap_t*		heap;	/*!< memory heap from which we can
+					allocate space */
+};
+
+#ifndef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/pars0sym.ic b/storage/xtradb/include/pars0sym.ic
new file mode 100644
index 00000000000..9eb09db3a47
--- /dev/null
+++ b/storage/xtradb/include/pars0sym.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0sym.ic
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/pars0types.h b/storage/xtradb/include/pars0types.h
new file mode 100644
index 00000000000..e0a8a86bf07
--- /dev/null
+++ b/storage/xtradb/include/pars0types.h
@@ -0,0 +1,50 @@
+/*****************************************************************************
+
+Copyright (c) 1998, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/pars0types.h
+SQL parser global types
+
+Created 1/11/1998 Heikki Tuuri
+*******************************************************/
+
+#ifndef pars0types_h
+#define pars0types_h
+
+typedef struct pars_info_struct		pars_info_t;
+typedef struct pars_user_func_struct	pars_user_func_t;
+typedef struct pars_bound_lit_struct	pars_bound_lit_t;
+typedef struct pars_bound_id_struct	pars_bound_id_t;
+typedef struct sym_node_struct		sym_node_t;
+typedef struct sym_tab_struct		sym_tab_t;
+typedef struct pars_res_word_struct	pars_res_word_t;
+typedef struct func_node_struct		func_node_t;
+typedef struct order_node_struct	order_node_t;
+typedef struct proc_node_struct		proc_node_t;
+typedef struct elsif_node_struct	elsif_node_t;
+typedef struct if_node_struct		if_node_t;
+typedef struct while_node_struct	while_node_t;
+typedef struct for_node_struct		for_node_t;
+typedef struct exit_node_struct		exit_node_t;
+typedef struct return_node_struct	return_node_t;
+typedef struct assign_node_struct	assign_node_t;
+typedef struct col_assign_node_struct	col_assign_node_t;
+
+typedef UT_LIST_BASE_NODE_T(sym_node_t)	sym_node_list_t;
+
+#endif
diff --git a/storage/xtradb/include/que0que.h b/storage/xtradb/include/que0que.h
new file mode 100644
index 00000000000..ed48f980294
--- /dev/null
+++ b/storage/xtradb/include/que0que.h
@@ -0,0 +1,529 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.h
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0que_h
+#define que0que_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "srv0srv.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "pars0types.h"
+
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version */
+extern ibool	que_trace_on;
+
+/***********************************************************************//**
+Adds a query graph to the session's list of graphs. */
+UNIV_INTERN
+void
+que_graph_publish(
+/*==============*/
+	que_t*	graph,	/*!< in: graph */
+	sess_t*	sess);	/*!< in: session */
+/***********************************************************************//**
+Creates a query graph fork node.
+@return	own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+	que_t*		graph,		/*!< in: graph, if NULL then this
+					fork node is assumed to be the
+					graph root */
+	que_node_t*	parent,		/*!< in: parent node */
+	ulint		fork_type,	/*!< in: fork type */
+	mem_heap_t*	heap);		/*!< in: memory heap where created */
+/***********************************************************************//**
+Gets the first thr in a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+	que_fork_t*	fork);	/*!< in: query fork */
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+	que_fork_t*	fork);	/*!< in: query fork */
+/***********************************************************************//**
+Sets the parent of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+	que_node_t*	node,	/*!< in: graph node */
+	que_node_t*	parent);/*!< in: parent */
+/***********************************************************************//**
+Creates a query graph thread node.
+@return	own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+	que_fork_t*	parent,	/*!< in: parent node, i.e., a fork node */
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/**********************************************************************//**
+Frees a query graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+	que_t*	graph);	/*!< in: query graph; we assume that the memory
+			heap where this graph was created is private
+			to this graph: if not, then use
+			que_graph_free_recursive and free the heap
+			afterwards! */
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved.
+@return	TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+	que_thr_t*	thr,	/*!< in: a query thread */
+	trx_t*		trx);	/*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_t*		trx);	/*!< in: transaction */
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put to the lock wait state in lock0lock.c, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Run a query thread. Handles lock waits. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns the control to the root of the
+graph so that the graph can communicate an error message to the client.) */
+UNIV_INTERN
+void
+que_fork_error_handle(
+/*==================*/
+	trx_t*	trx,	/*!< in: trx */
+	que_t*	fork);	/*!< in: query graph which was run before signal
+			handling started, NULL not allowed */
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and releases
+a single worker thread to execute it. This function should be used to end
+the wait state of a query thread waiting for a lock or a stored procedure
+completion. */
+UNIV_INTERN
+void
+que_thr_end_wait(
+/*=============*/
+	que_thr_t*	thr,		/*!< in: query thread in the
+					QUE_THR_LOCK_WAIT,
+					or QUE_THR_PROCEDURE_WAIT, or
+					QUE_THR_SIG_REPLY_WAIT state */
+	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+/**********************************************************************//**
+Same as que_thr_end_wait, but no parameter next_thr available. */
+UNIV_INTERN
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+	que_thr_t*	thr);		/*!< in: query thread in the
+					QUE_THR_LOCK_WAIT,
+					or QUE_THR_PROCEDURE_WAIT, or
+					QUE_THR_SIG_REPLY_WAIT state */
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+	que_fork_t*	fork);	/*!< in: a query fork */
+/***********************************************************************//**
+Gets the trx of a query thread. */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if thr is rolling back an incomplete transaction in crash
+recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+	const que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************************//**
+Gets the type of a graph node. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value data type field of a graph node. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets pointer to the value dfield of a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Gets the value buffer size of a graph node.
+@return	val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+	que_node_t*	node);	/*!< in: graph node */
+/***********************************************************************//**
+Sets the value buffer size of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+	que_node_t*	node,	/*!< in: graph node */
+	ulint		size);	/*!< in: size */
+/*********************************************************************//**
+Gets the next list node in a list of query graph nodes. */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+	que_node_t*	node);	/*!< in: node in a list */
+/*********************************************************************//**
+Gets the parent node of a query graph node.
+@return	parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+	que_node_t*	node);	/*!< in: node */
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop.
+@return	containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+	que_node_t*	node);	/*!< in: node */
+/*********************************************************************//**
+Catenates a query graph node to a possibly empty list of them.
+@return	one-way list of nodes */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+	que_node_t*	node_list,	/*!< in: node list, or NULL */
+	que_node_t*	node);		/*!< in: node */
+/*********************************************************************//**
+Gets a query graph node list length.
+@return	length, for NULL list 0 */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+	que_node_t*	node_list);	/*!< in: node list, or NULL */
+/**********************************************************************//**
+Checks if graph, trx, or session is in a state where the query thread should
+be stopped.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************************//**
+Returns TRUE if the query graph is for a SELECT statement.
+@return	TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+	que_t*		graph);		/*!< in: graph */
+/**********************************************************************//**
+Prints info of an SQL query graph node. */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+	que_node_t*	node);	/*!< in: query graph node */
+/*********************************************************************//**
+Evaluate the given SQL
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+que_eval_sql(
+/*=========*/
+	pars_info_t*	info,	/*!< in: info struct, or NULL */
+	const char*	sql,	/*!< in: SQL string */
+	ibool		reserve_dict_mutex,
+				/*!< in: if TRUE, acquire/release
+				dict_sys->mutex around call to pars_sql. */
+	trx_t*		trx);	/*!< in: trx */
+
+/* Query graph query thread node: the fields are protected by the kernel
+mutex with the exceptions named below */
+
+struct que_thr_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_THR */
+	ulint		magic_n;	/*!< magic number to catch memory
+					corruption */
+	que_node_t*	child;		/*!< graph child node */
+	que_t*		graph;		/*!< graph where this node belongs */
+	ibool		is_active;	/*!< TRUE if the thread has been set
+					to the run state in
+					que_thr_move_to_run_state, but not
+					deactivated in
+					que_thr_dec_refer_count */
+	ulint		state;		/*!< state of the query thread */
+	UT_LIST_NODE_T(que_thr_t)
+			thrs;		/*!< list of thread nodes of the fork
+					node */
+	UT_LIST_NODE_T(que_thr_t)
+			trx_thrs;	/*!< lists of threads in wait list of
+					the trx */
+	UT_LIST_NODE_T(que_thr_t)
+			queue;		/*!< list of runnable thread nodes in
+					the server task queue */
+	/*------------------------------*/
+	/* The following fields are private to the OS thread executing the
+	query thread, and are not protected by the kernel mutex: */
+
+	que_node_t*	run_node;	/*!< pointer to the node where the
+					subgraph down from this node is
+					currently executed */
+	que_node_t*	prev_node;	/*!< pointer to the node from which
+					the control came */
+	ulint		resource;	/*!< resource usage of the query thread
+					thus far */
+	ulint		lock_state;	/*!< lock state of thread (table or
+					row): one of QUE_THR_LOCK_... */
+	ulint		fk_cascade_depth; /*!< maximum cascading call depth
+					supported for foreign key constraint
+					related delete/updates */
+};
+
+#define QUE_THR_MAGIC_N		8476583
+#define QUE_THR_MAGIC_FREED	123461526
+
+/* Query graph fork node: its fields are protected by the kernel mutex */
+struct que_fork_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_FORK */
+	que_t*		graph;		/*!< query graph of this node */
+	ulint		fork_type;	/*!< fork type: one of QUE_FORK_... */
+	ulint		n_active_thrs;	/*!< if this is the root of a graph, the
+					number of query threads that have been
+					started in que_thr_move_to_run_state
+					but for which que_thr_dec_refer_count
+					has not yet been called */
+	trx_t*		trx;		/*!< transaction: this is set only in
+					the root node */
+	ulint		state;		/*!< state of the fork node */
+	que_thr_t*	caller;		/*!< pointer to a possible calling query
+					thread */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			thrs;		/*!< list of query threads */
+	/*------------------------------*/
+	/* The fields in this section are defined only in the root node */
+	sym_tab_t*	sym_tab;	/*!< symbol table of the query,
+					generated by the parser, or NULL
+					if the graph was created 'by hand' */
+	pars_info_t*	info;		/*!< info struct, or NULL */
+	/* The following cur_... fields are relevant only in a select graph */
+
+	ulint		cur_end;	/*!< QUE_CUR_NOT_DEFINED, QUE_CUR_START,
+					QUE_CUR_END */
+	ulint		cur_pos;	/*!< if there are n rows in the result
+					set, values 0 and n + 1 mean before
+					first row, or after last row, depending
+					on cur_end; values 1...n mean a row
+					index */
+	ibool		cur_on_row;	/*!< TRUE if cursor is on a row, i.e.,
+					it is not before the first row or
+					after the last row */
+	dulint		n_inserts;	/*!< number of rows inserted */
+	dulint		n_updates;	/*!< number of rows updated */
+	dulint		n_deletes;	/*!< number of rows deleted */
+	sel_node_t*	last_sel_node;	/*!< last executed select node, or NULL
+					if none */
+	UT_LIST_NODE_T(que_fork_t)
+			graphs;		/*!< list of query graphs of a session
+					or a stored procedure */
+	/*------------------------------*/
+	mem_heap_t*	heap;		/*!< memory heap where the fork was
+					created */
+
+};
+
+/* Query fork (or graph) types */
+#define QUE_FORK_SELECT_NON_SCROLL	1	/* forward-only cursor */
+#define QUE_FORK_SELECT_SCROLL		2	/* scrollable cursor */
+#define QUE_FORK_INSERT			3
+#define QUE_FORK_UPDATE			4
+#define QUE_FORK_ROLLBACK		5
+			/* This is really the undo graph used in rollback,
+			no signal-sending roll_node in this graph */
+#define QUE_FORK_PURGE			6
+#define	QUE_FORK_EXECUTE		7
+#define QUE_FORK_PROCEDURE		8
+#define QUE_FORK_PROCEDURE_CALL		9
+#define QUE_FORK_MYSQL_INTERFACE	10
+#define	QUE_FORK_RECOVERY		11
+
+/* Query fork (or graph) states */
+#define QUE_FORK_ACTIVE		1
+#define QUE_FORK_COMMAND_WAIT	2
+#define QUE_FORK_INVALID	3
+#define QUE_FORK_BEING_FREED	4
+
+/* Flag which is ORed to control structure statement node types */
+#define QUE_NODE_CONTROL_STAT	1024
+
+/* Query graph node types */
+#define	QUE_NODE_LOCK		1
+#define	QUE_NODE_INSERT		2
+#define QUE_NODE_UPDATE		4
+#define	QUE_NODE_CURSOR		5
+#define	QUE_NODE_SELECT		6
+#define	QUE_NODE_AGGREGATE	7
+#define QUE_NODE_FORK		8
+#define QUE_NODE_THR		9
+#define QUE_NODE_UNDO		10
+#define QUE_NODE_COMMIT		11
+#define QUE_NODE_ROLLBACK	12
+#define QUE_NODE_PURGE		13
+#define QUE_NODE_CREATE_TABLE	14
+#define QUE_NODE_CREATE_INDEX	15
+#define QUE_NODE_SYMBOL		16
+#define QUE_NODE_RES_WORD	17
+#define QUE_NODE_FUNC		18
+#define QUE_NODE_ORDER		19
+#define QUE_NODE_PROC		(20 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_IF		(21 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_WHILE		(22 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_ASSIGNMENT	23
+#define QUE_NODE_FETCH		24
+#define QUE_NODE_OPEN		25
+#define QUE_NODE_COL_ASSIGNMENT	26
+#define QUE_NODE_FOR		(27 + QUE_NODE_CONTROL_STAT)
+#define QUE_NODE_RETURN		28
+#define QUE_NODE_ROW_PRINTF	29
+#define QUE_NODE_ELSIF		30
+#define QUE_NODE_CALL		31
+#define QUE_NODE_EXIT		32
+
+#define QUE_NODE_INSERT_STATS	34
+
+/* Query thread states */
+#define QUE_THR_RUNNING		1
+#define QUE_THR_PROCEDURE_WAIT	2
+#define	QUE_THR_COMPLETED	3	/* in selects this means that the
+					thread is at the end of its result set
+					(or start, in case of a scroll cursor);
+					in other statements, this means the
+					thread has done its task */
+#define QUE_THR_COMMAND_WAIT	4
+#define QUE_THR_LOCK_WAIT	5
+#define QUE_THR_SIG_REPLY_WAIT	6
+#define QUE_THR_SUSPENDED	7
+#define QUE_THR_ERROR		8
+
+/* Query thread lock states */
+#define QUE_THR_LOCK_NOLOCK	0
+#define QUE_THR_LOCK_ROW	1
+#define QUE_THR_LOCK_TABLE	2
+
+/* From where the cursor position is counted */
+#define QUE_CUR_NOT_DEFINED	1
+#define QUE_CUR_START		2
+#define	QUE_CUR_END		3
+
+
+#ifndef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/que0que.ic b/storage/xtradb/include/que0que.ic
new file mode 100644
index 00000000000..bd936670e1e
--- /dev/null
+++ b/storage/xtradb/include/que0que.ic
@@ -0,0 +1,287 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0que.ic
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+/***********************************************************************//**
+Gets the transaction of the query graph to which a query thread belongs.
+@return	thr->graph->trx */
+UNIV_INLINE
+trx_t*
+thr_get_trx(
+/*========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(thr);
+	return(thr->graph->trx);
+}
+
+/*******************************************************************//**
+Determines if this thread is rolling back an incomplete transaction
+in crash recovery. The answer is trx_is_recv() of thr->graph->trx.
+@return TRUE if the trx of thr is being rolled back in crash recovery */
+UNIV_INLINE
+ibool
+thr_is_recv(
+/*========*/
+	const que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad(thr);	/* thr is dereferenced below; as in thr_get_trx() */
+	return(trx_is_recv(thr->graph->trx));
+}
+
+/***********************************************************************//**
+Gets the first query thread (thr) in the thrs list of a fork. */
+UNIV_INLINE
+que_thr_t*
+que_fork_get_first_thr(
+/*===================*/
+	que_fork_t*	fork)	/*!< in: query fork */
+{
+	return(UT_LIST_GET_FIRST(fork->thrs));
+}
+
+/***********************************************************************//**
+Gets the child node of the first thr in a fork. The first thr is
+dereferenced without a NULL check.
+@return	child node of the first query thread in fork->thrs */
+UNIV_INLINE
+que_node_t*
+que_fork_get_child(
+/*===============*/
+	que_fork_t*	fork)	/*!< in: query fork */
+{
+	que_thr_t*	first_thr = UT_LIST_GET_FIRST(fork->thrs);
+
+	return(first_thr->child);
+}
+
+/***********************************************************************//**
+Reads the type field of a graph node, viewing the node as que_common_t. */
+UNIV_INLINE
+ulint
+que_node_get_type(
+/*==============*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	const que_common_t*	common = (const que_common_t*) node;
+	ut_ad(node);
+	return(common->type);
+}
+
+/***********************************************************************//**
+Returns the address of the val dfield embedded in a graph node. */
+UNIV_INLINE
+dfield_t*
+que_node_get_val(
+/*=============*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	que_common_t*	common = (que_common_t*) node;
+	ut_ad(node);
+	return(&common->val);
+}
+
+/***********************************************************************//**
+Reads the val_buf_size field of a graph node.
+@return	val buffer size, not defined if val.data == NULL in node */
+UNIV_INLINE
+ulint
+que_node_get_val_buf_size(
+/*======================*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	const que_common_t*	common = (const que_common_t*) node;
+	ut_ad(node);
+	return(common->val_buf_size);
+}
+
+/***********************************************************************//**
+Stores a new value in the val_buf_size field of a graph node. */
+UNIV_INLINE
+void
+que_node_set_val_buf_size(
+/*======================*/
+	que_node_t*	node,	/*!< in: graph node */
+	ulint		size)	/*!< in: size */
+{
+	que_common_t*	common = (que_common_t*) node;
+	ut_ad(node);
+	common->val_buf_size = size;
+}
+
+/***********************************************************************//**
+Stores the given parent pointer in the parent field of a graph node. */
+UNIV_INLINE
+void
+que_node_set_parent(
+/*================*/
+	que_node_t*	node,	/*!< in: graph node */
+	que_node_t*	parent)	/*!< in: parent */
+{
+	que_common_t*	common = (que_common_t*) node;
+	ut_ad(node);
+	common->parent = parent;
+}
+
+/***********************************************************************//**
+Gets the data type of the val dfield of a graph node via dfield_get_type. */
+UNIV_INLINE
+dtype_t*
+que_node_get_data_type(
+/*===================*/
+	que_node_t*	node)	/*!< in: graph node */
+{
+	que_common_t*	common = (que_common_t*) node;
+	ut_ad(node);
+	return(dfield_get_type(&common->val));
+}
+
+/*********************************************************************//**
+Appends a query graph node to the end of a list of nodes; the list may
+be empty (NULL). The brother pointer of node is reset to NULL.
+@return	one-way list of nodes: node_list, or node if the list was NULL */
+UNIV_INLINE
+que_node_t*
+que_node_list_add_last(
+/*===================*/
+	que_node_t*	node_list,	/*!< in: node list, or NULL */
+	que_node_t*	node)		/*!< in: node */
+{
+	que_common_t*	new_tail = (que_common_t*) node;
+	que_common_t*	tail;
+
+	new_tail->brother = NULL;
+
+	if (node_list == NULL) {
+		/* Empty list: node alone is the list */
+		return(node);
+	}
+
+	/* Walk the brother chain to the current last node */
+	for (tail = (que_common_t*) node_list;
+	     tail->brother != NULL;
+	     tail = (que_common_t*) tail->brother) {
+		/* No body: the loop only advances tail */
+	}
+
+	tail->brother = node;
+
+	return(node_list);
+}
+
+/*********************************************************************//**
+Gets the node that follows the given one in a list of query graph nodes.
+@return	the brother pointer of node, i.e. the next node in the list */
+UNIV_INLINE
+que_node_t*
+que_node_get_next(
+/*==============*/
+	que_node_t*	node)	/*!< in: node in a list */
+{
+	return(((const que_common_t*) node)->brother);
+}
+
+/*********************************************************************//**
+Counts the nodes of a query graph node list by following brother pointers.
+@return	number of nodes, 0 for a NULL list */
+UNIV_INLINE
+ulint
+que_node_list_get_len(
+/*==================*/
+	que_node_t*	node_list)	/*!< in: node list, or NULL */
+{
+	const que_common_t*	cur;
+	ulint			n_nodes = 0;
+
+	/* Follow the brother chain until its NULL end */
+	for (cur = (const que_common_t*) node_list;
+	     cur != NULL;
+	     cur = (const que_common_t*) cur->brother) {
+
+		n_nodes++;
+	}
+
+	return(n_nodes);
+}
+
+/*********************************************************************//**
+Reads the parent back pointer stored in a query graph node.
+@return	parent node or NULL */
+UNIV_INLINE
+que_node_t*
+que_node_get_parent(
+/*================*/
+	que_node_t*	node)	/*!< in: node */
+{
+	return(((const que_common_t*) node)->parent);
+}
+
+/**********************************************************************//**
+Peeks whether the state of the graph or of its trx requires stopping the
+query thread.
+@return TRUE if should be stopped; NOTE that if the peek is made
+without reserving the kernel mutex, then another peek with the mutex
+reserved is necessary before deciding the actual stopping */
+UNIV_INLINE
+ibool
+que_thr_peek_stop(
+/*==============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_t*	graph = thr->graph;
+	trx_t*	trx = graph->trx;
+
+	if (graph->state != QUE_FORK_ACTIVE) {
+		return(TRUE);	/* graph is not active */
+	}
+	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+		return(TRUE);	/* trx is in the lock wait state */
+	}
+	if (UT_LIST_GET_LEN(trx->signals) > 0
+	    && trx->que_state == TRX_QUE_RUNNING) {
+		return(TRUE);	/* running trx has signals in its list */
+	}
+
+	return(FALSE);
+}
+
+/***********************************************************************//**
+Tells whether the fork type of a query graph is one of the SELECT types.
+@return	TRUE if a select */
+UNIV_INLINE
+ibool
+que_graph_is_select(
+/*================*/
+	que_t*		graph)		/*!< in: graph */
+{
+	switch (graph->fork_type) {
+	case QUE_FORK_SELECT_SCROLL:
+	case QUE_FORK_SELECT_NON_SCROLL:
+		return(TRUE);
+	default:
+		return(FALSE);
+	}
+}
diff --git a/storage/xtradb/include/que0types.h b/storage/xtradb/include/que0types.h
new file mode 100644
index 00000000000..ea976074768
--- /dev/null
+++ b/storage/xtradb/include/que0types.h
@@ -0,0 +1,60 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/que0types.h
+Query graph global types
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef que0types_h
+#define que0types_h
+
+#include "data0data.h"
+#include "dict0types.h"
+
+/* Pseudotype for all graph nodes */
+typedef void	que_node_t;
+
+typedef struct que_fork_struct	que_fork_t;
+
+/* Query graph root is a fork node */
+typedef	que_fork_t	que_t;
+
+typedef struct que_thr_struct		que_thr_t;
+typedef struct que_common_struct	que_common_t;
+
+/* Common struct at the beginning of each query graph node; the name of this
+substruct must be 'common' */
+
+struct que_common_struct{
+	ulint		type;	/*!< query node type */
+	que_node_t*	parent;	/*!< back pointer to parent node, or NULL */
+	que_node_t*	brother;/*!< pointer to a possible brother node */
+	dfield_t	val;	/*!< evaluated value for an expression */
+	ulint		val_buf_size;
+				/*!< buffer size for the evaluated value data,
+				if the buffer has been allocated dynamically:
+				if this field is != 0, and the node is a
+				symbol node or a function node, then we
+				have to free the data field in val
+				explicitly */
+};
+
+#endif
diff --git a/storage/xtradb/include/read0read.h b/storage/xtradb/include/read0read.h
new file mode 100644
index 00000000000..4d9a9fade36
--- /dev/null
+++ b/storage/xtradb/include/read0read.h
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0read_h
+#define read0read_h
+
+#include "univ.i"
+
+
+#include "ut0byte.h"
+#include "ut0lst.h"
+#include "trx0trx.h"
+#include "read0types.h"
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return	own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or ut_dulint_zero
+					used in purge */
+	mem_heap_t*	heap);		/*!< in: memory heap from which
+					allocated */
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, or opens a new one. The view
+must be closed with ..._close.
+@return	own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or ut_dulint_zero
+					used in purge */
+	mem_heap_t*	heap);		/*!< in: memory heap from which
+					allocated */
+/*********************************************************************//**
+Closes a read view. */
+UNIV_INTERN
+void
+read_view_close(
+/*============*/
+	read_view_t*	view);	/*!< in: read view */
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+	trx_t*	trx);	/*!< in: trx which has a read view */
+/*********************************************************************//**
+Checks if a read view sees the specified transaction.
+@return	TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+	const read_view_t*	view,	/*!< in: read view */
+	trx_id_t		trx_id);/*!< in: trx id */
+/*********************************************************************//**
+Prints a read view to stderr. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+	const read_view_t*	view);	/*!< in: read view */
+/*********************************************************************//**
+Create a consistent cursor view for mysql to be used in cursors. In this
+consistent read view modifications done by the creating transaction or future
+transactions are not visible. */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+	trx_t*		cr_trx);/*!< in: trx where cursor view is created */
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+	trx_t*		trx,		/*!< in: trx */
+	cursor_view_t*	curview);	/*!< in: cursor view to be closed */
+/*********************************************************************//**
+This function sets a given consistent cursor view to a transaction
+read view if given consistent cursor view is not NULL. Otherwise, function
+restores a global read view to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+	trx_t*		trx,	/*!< in: transaction where cursor is set */
+	cursor_view_t*	curview);/*!< in: consistent cursor view to be set */
+
+/** Read view lists the trx ids of those transactions for which a consistent
+read should not see the modifications to the database. */
+
+struct read_view_struct{
+	ulint		type;	/*!< VIEW_NORMAL, VIEW_HIGH_GRANULARITY */
+	undo_no_t	undo_no;/*!< ut_dulint_zero or, if type is
+				VIEW_HIGH_GRANULARITY, the transaction
+				undo_no at the time this high-granularity
+				consistent read view was created */
+	trx_id_t	low_limit_no;
+				/*!< The view does not need to see the undo
+				logs for transactions whose transaction number
+				is strictly smaller (<) than this value: they
+				can be removed in purge if not needed by other
+				views */
+	trx_id_t	low_limit_id;
+				/*!< The read should not see any transaction
+				with trx id >= this value. In other words,
+				this is the "high water mark". */
+	trx_id_t	up_limit_id;
+				/*!< The read should see all trx ids which
+				are strictly smaller (<) than this value.
+				In other words,
+				this is the "low water mark". */
+	ulint		n_trx_ids;
+				/*!< Number of cells in the trx_ids array */
+	trx_id_t*	trx_ids;/*!< Additional trx ids which the read should
+				not see: typically, these are the active
+				transactions at the time when the read is
+				serialized, except the reading transaction
+				itself; the trx ids in this array are in a
+				descending order. These trx_ids should be
+				between the "low" and "high" water marks,
+				that is, up_limit_id and low_limit_id. */
+	trx_id_t	creator_trx_id;
+				/*!< trx id of creating transaction, or
+				ut_dulint_zero used in purge */
+	UT_LIST_NODE_T(read_view_t) view_list;
+				/*!< List of read views in trx_sys */
+};
+
+/** Read view types @{ */
+#define VIEW_NORMAL		1	/*!< Normal consistent read view
+					where transaction does not see changes
+					made by active transactions except
+					creating transaction. */
+#define VIEW_HIGH_GRANULARITY	2	/*!< High-granularity read view where
+					transaction does not see changes
+					made by active transactions and own
+					changes after a point in time when this
+					read view was created. */
+/* @} */
+
+/** Implements the InnoDB framework for supporting consistent read views
+in cursors. This struct holds both the heap where the consistent read view
+is allocated and a pointer to the read view. */
+
+struct cursor_view_struct{
+	mem_heap_t*	heap;
+				/*!< Memory heap for the cursor view */
+	read_view_t*	read_view;
+				/*!< Consistent read view of the cursor */
+	ulint		n_mysql_tables_in_use;
+				/*!< number of Innobase tables used in the
+				processing of this cursor */
+};
+
+#ifndef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/read0read.ic b/storage/xtradb/include/read0read.ic
new file mode 100644
index 00000000000..9924967cc2d
--- /dev/null
+++ b/storage/xtradb/include/read0read.ic
@@ -0,0 +1,98 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0read.ic
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+/*********************************************************************//**
+Reads the trx id stored in cell n of the trx_ids array of a read view.
+@return	trx id in cell n */
+UNIV_INLINE
+trx_id_t
+read_view_get_nth_trx_id(
+/*=====================*/
+	const read_view_t*	view,	/*!< in: read view */
+	ulint			n)	/*!< in: position, < n_trx_ids */
+{
+	ut_ad(n < view->n_trx_ids);
+
+	return(view->trx_ids[n]);
+}
+
+/*********************************************************************//**
+Stores a trx id into cell n of the trx_ids array of a read view. */
+UNIV_INLINE
+void
+read_view_set_nth_trx_id(
+/*=====================*/
+	read_view_t*	view,	/*!< in/out: read view */
+	ulint		n,	/*!< in: position, < n_trx_ids */
+	trx_id_t	trx_id)	/*!< in: trx id to set */
+{
+	ut_ad(n < view->n_trx_ids);
+
+	view->trx_ids[n] = trx_id;
+}
+
+/*********************************************************************//**
+Tells whether the transaction with the given id is seen by a read view,
+using the view's two limit ids and its trx_ids array.
+@return	TRUE if sees */
+UNIV_INLINE
+ibool
+read_view_sees_trx_id(
+/*==================*/
+	const read_view_t*	view,	/*!< in: read view */
+	trx_id_t		trx_id)	/*!< in: trx id */
+{
+	ulint	pos;
+	int	order;
+
+	/* Strictly below up_limit_id (the "low water mark"): seen. */
+	if (ut_dulint_cmp(trx_id, view->up_limit_id) < 0) {
+		return(TRUE);
+	}
+
+	/* At or above low_limit_id (the "high water mark"): not seen. */
+	if (ut_dulint_cmp(trx_id, view->low_limit_id) >= 0) {
+		return(FALSE);
+	}
+
+	/* Walk trx_ids from its last cell down to its first. The array is
+	kept in descending order, so the smallest id is compared first; if a
+	very long running transaction is in the array, its id is looked at
+	right away and may well decide the visibility of trx_id. */
+
+	for (pos = view->n_trx_ids; pos > 0; pos--) {
+
+		order = ut_dulint_cmp(
+			trx_id,
+			read_view_get_nth_trx_id(view, pos - 1));
+
+		if (order <= 0) {
+			/* Equal: trx_id is in the array, so not seen. */
+			return(order < 0);
+		}
+	}
+
+	return(TRUE);
+}
diff --git a/storage/xtradb/include/read0types.h b/storage/xtradb/include/read0types.h
new file mode 100644
index 00000000000..caf69e3fb51
--- /dev/null
+++ b/storage/xtradb/include/read0types.h
@@ -0,0 +1,32 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/read0types.h
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef read0types_h
+#define read0types_h
+
+typedef struct read_view_struct	read_view_t;
+typedef struct cursor_view_struct	cursor_view_t;
+
+#endif
diff --git a/storage/xtradb/include/rem0cmp.h b/storage/xtradb/include/rem0cmp.h
new file mode 100644
index 00000000000..fcea62ad486
--- /dev/null
+++ b/storage/xtradb/include/rem0cmp.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.h
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef rem0cmp_h
+#define rem0cmp_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "rem0rec.h"
+
+/*************************************************************//**
+Returns TRUE if two columns are equal for comparison purposes.
+@return	TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+	const dict_col_t*	col1,	/*!< in: column 1 */
+	const dict_col_t*	col2,	/*!< in: column 2 */
+	ibool			check_charsets);
+					/*!< in: whether to check charsets */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+	ulint		mtype,	/*!< in: main type */
+	ulint		prtype,	/*!< in: precise type */
+	const byte*	data1,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/*!< in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+	ulint		mtype,	/*!< in: main type */
+	ulint		prtype,	/*!< in: precise type */
+	const byte*	data1,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/*!< in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2);	/*!< in: data field length or UNIV_SQL_NULL */
+/*************************************************************//**
+This function is used to compare two dfields where at least the first
+has its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+	const dfield_t*	dfield1,/*!< in: data field; must have type field set */
+	const dfield_t*	dfield2);/*!< in: data field */
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only dtuple->n_fields_cmp first fields are taken into account for
+the data tuple! If we denote by n = n_fields_cmp, then rec must
+have either m >= n fields, or it must differ from dtuple in some of
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when function returns,
+				contains the value for current comparison */
+	ulint*		matched_bytes); /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when function returns, contains the
+				value for current comparison */
+/**************************************************************//**
+Compares a data tuple to a physical record.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return	TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/*************************************************************//**
+Compare two physical records that contain the same number of columns,
+none of which are stored externally.
+@return	1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const ulint*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const ulint*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	ibool*			null_eq);/*!< out: set to TRUE if
+					found matching null values */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+	const rec_t*	rec1,	/*!< in: physical record */
+	const rec_t*	rec2,	/*!< in: physical record */
+	const ulint*	offsets1,/*!< in: rec_get_offsets(rec1, index) */
+	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
+	dict_index_t*	index,	/*!< in: data dictionary index */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+	ulint*		matched_bytes, /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+	ulint		stats_method);/*!< in: cmp_rec_rec() passes 0; TODO document */
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+	const rec_t*	rec1,	/*!< in: physical record */
+	const rec_t*	rec2,	/*!< in: physical record */
+	const ulint*	offsets1,/*!< in: rec_get_offsets(rec1, index) */
+	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
+	dict_index_t*	index);	/*!< in: data dictionary index */
+
+
+#ifndef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/rem0cmp.ic b/storage/xtradb/include/rem0cmp.ic
new file mode 100644
index 00000000000..d5185ec94af
--- /dev/null
+++ b/storage/xtradb/include/rem0cmp.ic
@@ -0,0 +1,91 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/rem0cmp.ic
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+/*************************************************************//**
+Compares two data fields whose data type is known. All arguments are
+handed on unchanged to cmp_data_data_slow().
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INLINE
+int
+cmp_data_data(
+/*==========*/
+	ulint		mtype,	/*!< in: main type */
+	ulint		prtype,	/*!< in: precise type */
+	const byte*	data1,	/*!< in: data field (pointer to a buffer) */
+	ulint		len1,	/*!< in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/*!< in: data field (pointer to a buffer) */
+	ulint		len2)	/*!< in: data field length or UNIV_SQL_NULL */
+{
+	/* Inline front end; the comparison itself is done by the callee. */
+	return(cmp_data_data_slow(mtype, prtype,
+				  data1, len1, data2, len2));
+}
+
+/*************************************************************//**
+Compares two dfields using the main type and precise type taken from
+the first one, which therefore must have its data type field set.
+@return 1, 0, -1, if dfield1 is greater, equal, less than dfield2,
+respectively */
+UNIV_INLINE
+int
+cmp_dfield_dfield(
+/*==============*/
+	const dfield_t*	dfield1,/*!< in: data field; must have type field set */
+	const dfield_t*	dfield2)/*!< in: data field */
+{
+	const dtype_t*	dtype;
+
+	ut_ad(dfield_check_typed(dfield1));
+	/* Only the type of dfield1 is consulted for the comparison. */
+	dtype = dfield_get_type(dfield1);
+
+	return(cmp_data_data(dtype->mtype, dtype->prtype,
+			     (const byte*) dfield_get_data(dfield1),
+			     dfield_get_len(dfield1),
+			     (const byte*) dfield_get_data(dfield2),
+			     dfield_get_len(dfield2)));
+}
+
+/*************************************************************//**
+Compares two physical records on their common first fields by calling
+cmp_rec_rec_with_match() with zeroed match counters and stats_method 0.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively, than
+rec2; only the common first fields are compared */
+UNIV_INLINE
+int
+cmp_rec_rec(
+/*========*/
+	const rec_t*	rec1,	/*!< in: physical record */
+	const rec_t*	rec2,	/*!< in: physical record */
+	const ulint*	offsets1,/*!< in: rec_get_offsets(rec1, index) */
+	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
+	dict_index_t*	index)	/*!< in: data dictionary index */
+{
+	ulint	n_fields	= 0;
+	ulint	n_bytes		= 0;
+
+	return(cmp_rec_rec_with_match(rec1, rec2, offsets1, offsets2, index,
+				      &n_fields, &n_bytes, 0));
+}
diff --git a/storage/xtradb/include/rem0rec.h b/storage/xtradb/include/rem0rec.h
new file mode 100644
index 00000000000..17d08afabb9
--- /dev/null
+++ b/storage/xtradb/include/rem0rec.h
@@ -0,0 +1,824 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.h
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0rec_h
+#define rem0rec_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "rem0types.h"
+#include "mtr0types.h"
+#include "page0types.h"
+
+/* Info bit denoting the predefined minimum record: this bit is set
+if and only if the record is the first user record on a non-leaf
+B-tree page that is the leftmost page on its level
+(PAGE_LEVEL is nonzero and FIL_PAGE_PREV is FIL_NULL). */
+#define REC_INFO_MIN_REC_FLAG	0x10UL
+/* The deleted flag in info bits */
+#define REC_INFO_DELETED_FLAG	0x20UL	/* when bit is set to 1, it means the
+					record has been delete marked */
+
+/* Number of extra bytes in an old-style record,
+in addition to the data and the offsets */
+#define REC_N_OLD_EXTRA_BYTES	6
+/* Number of extra bytes in a new-style record,
+in addition to the data and the offsets */
+#define REC_N_NEW_EXTRA_BYTES	5
+
+/* Record status values */
+#define REC_STATUS_ORDINARY	0
+#define REC_STATUS_NODE_PTR	1
+#define REC_STATUS_INFIMUM	2
+#define REC_STATUS_SUPREMUM	3
+
+/* The following four constants are needed in page0zip.c in order to
+efficiently compress and decompress pages. */
+
+/* The offset of heap_no in a compact record */
+#define REC_NEW_HEAP_NO		4
+/* The shift of heap_no in a compact record.
+The status is stored in the low-order bits. */
+#define	REC_HEAP_NO_SHIFT	3
+
+/* Length of a B-tree node pointer, in bytes */
+#define REC_NODE_PTR_SIZE	4
+
+#ifdef UNIV_DEBUG
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE	4
+#else /* UNIV_DEBUG */
+/* Length of the rec_get_offsets() header */
+# define REC_OFFS_HEADER_SIZE	2
+#endif /* UNIV_DEBUG */
+
+/* Number of elements that should be initially allocated for the
+offsets[] array, first passed to rec_get_offsets() */
+#define REC_OFFS_NORMAL_SIZE	100
+#define REC_OFFS_SMALL_SIZE	10
+
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return	pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the pointer of the next chained record
+on the same page.
+@return	pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+	rec_t*	rec,	/*!< in: physical record */
+	ulint	comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to get the offset of the
+next chained record on the same page.
+@return	the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the next record offset field
+of an old-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	next);	/*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to set the next record offset field
+of a new-style record. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next);	/*!< in: offset of the next record */
+/******************************************************//**
+The following function is used to get the number of fields
+in an old-style record.
+@return	number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/******************************************************//**
+The following function is used to get the number of fields
+in a record.
+@return	number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index);	/*!< in: record descriptor */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return	number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec);	/*!< in: old-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+	rec_t*	rec,		/*!< in: old-style physical record */
+	ulint	n_owned);	/*!< in: the number of owned */
+/******************************************************//**
+The following function is used to get the number of records owned by the
+previous directory record.
+@return	number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec);	/*!< in: new-style physical record */
+/******************************************************//**
+The following function is used to set the number of owned records. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+	rec_t*		rec,	/*!< in/out: new-style physical record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n_owned);/*!< in: the number of owned */
+/******************************************************//**
+The following function is used to retrieve the info bits of
+a record.
+@return	info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	bits);	/*!< in: info bits */
+/******************************************************//**
+The following function is used to set the info bits of a record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	bits);	/*!< in: info bits */
+/******************************************************//**
+The following function retrieves the status bits of a new-style record.
+@return	status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+	const rec_t*	rec);	/*!< in: physical record */
+
+/******************************************************//**
+The following function is used to set the status bits of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits);	/*!< in: status bits */
+
+/******************************************************//**
+The following function is used to retrieve the info and status
+bits of a record.  (Only compact records have status bits.)
+@return	info and status bits */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the info and status
+bits of a record.  (Only compact records have status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: compact physical record */
+	ulint	bits);	/*!< in: info and status bits */
+
+/******************************************************//**
+The following function tells if record is delete marked.
+@return	nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp);	/*!< in: nonzero=compact page format */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+	rec_t*	rec,	/*!< in/out: old-style physical record */
+	ulint	flag);	/*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function is used to set the deleted bit. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+	rec_t*		rec,	/*!< in/out: new-style physical record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		flag);	/*!< in: nonzero if delete marked */
+/******************************************************//**
+The following function tells if a new-style record is a node pointer.
+@return	TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/******************************************************//**
+The following function is used to get the order number
+of an old-style record in the heap of the index page.
+@return	heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to get the order number
+of a new-style record in the heap of the index page.
+@return	heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/******************************************************//**
+The following function is used to set the heap number
+field in a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	heap_no);/*!< in: the heap number */
+/******************************************************//**
+The following function is used to test whether the data offsets
+in the record are stored in one-byte or two-byte format.
+@return	TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+	const rec_t*	rec);	/*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return	number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+	const rec_t*	rec,	/*!< in: compact physical record */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n);	/*!< in: number of columns to scan */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record.	It can reuse a previously allocated array.
+@return	the new offsets */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets,/*!< in/out: array consisting of
+					offsets[0] allocated elements,
+					or an array from rec_get_offsets(),
+					or NULL */
+	ulint			n_fields,/*!< in: maximum number of
+					initialized fields
+					 (ULINT_UNDEFINED if all fields) */
+	mem_heap_t**		heap,	/*!< in/out: memory heap */
+	const char*		file,	/*!< in: file name where called */
+	ulint			line);	/*!< in: line number where called */
+
+#define rec_get_offsets(rec,index,offsets,n,heap)	\
+	rec_get_offsets_func(rec,index,offsets,n,heap,__FILE__,__LINE__)
+
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT.  This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INTERN
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+	const rec_t*		rec,	/*!< in: physical record in
+					ROW_FORMAT=COMPACT */
+	ulint			extra,	/*!< in: number of bytes to reserve
+					between the record header and
+					the data payload
+					(usually REC_N_NEW_EXTRA_BYTES) */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets);/*!< in/out: array of offsets;
+					in: n=rec_offs_n_fields(offsets) */
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record.  It can reuse a previously allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+	const byte*		extra,	/*!< in: the extra bytes of a
+					compact record in reverse order,
+					excluding the fixed-size
+					REC_N_NEW_EXTRA_BYTES */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			node_ptr,/*!< in: nonzero=node pointer,
+					0=leaf node */
+	ulint*			offsets);/*!< in/out: array consisting of
+					offsets[0] allocated elements */
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+@return	TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record or NULL */
+	const dict_index_t*	index,	/*!< in: record descriptor or NULL */
+	const ulint*		offsets);/*!< in: array returned by
+					rec_get_offsets() */
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Updates debug data in offsets, in order to avoid bogus
+rec_offs_validate() failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets);/*!< in: array returned by
+					rec_get_offsets() */
+#else
+# define rec_offs_make_valid(rec, index, offsets) ((void) 0)
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return	offset to the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len);	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null */
+#define rec_get_nth_field_old(rec, n, len) \
+((rec) + rec_get_nth_field_offs_old(rec, n, len))
+/************************************************************//**
+Gets the physical size of an old-style field.
+Also an SQL null may have a field of size > 0,
+if the data type is of a fixed size.
+@return	field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n);	/*!< in: index of the field */
+/************************************************************//**
+The following function is used to get an offset to the nth
+data field in a record.
+@return	offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len);	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null */
+#define rec_get_nth_field(rec, offsets, n, len) \
+((rec) + rec_get_nth_field_offs(offsets, n, len))
+/******************************************************//**
+Determine if the offsets are for a record in the new
+compact format.
+@return	nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Determine if the offsets are for a record containing
+externally stored columns.
+@return	nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/******************************************************//**
+Returns nonzero if the extern bit is set in nth field of rec.
+@return	nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n);	/*!< in: nth field */
+/******************************************************//**
+Returns nonzero if the SQL NULL bit is set in nth field of rec.
+@return	nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n);	/*!< in: nth field */
+/******************************************************//**
+Gets the physical size of a field.
+@return	length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n);	/*!< in: nth field */
+
+/******************************************************//**
+Returns the number of extern bits set in a record.
+@return	number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+	rec_t*		rec,	/*!< in: record */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index number of the field */
+	const void*	data,	/*!< in: pointer to the data if not SQL null */
+	ulint		len);	/*!< in: length of the data or UNIV_SQL_NULL */
+/**********************************************************//**
+The following function returns the data size of an old-style physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return	size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec);	/*!< in: physical record */
+/**********************************************************//**
+The following function returns the number of allocated elements
+for an array of offsets.
+@return	number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+	const ulint*	offsets);/*!< in: array for rec_get_offsets() */
+/**********************************************************//**
+The following function sets the number of allocated elements
+for an array of offsets. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	ulint*	offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc);	/*!< in: number of elements */
+#define rec_offs_init(offsets) \
+	rec_offs_set_n_alloc(offsets, (sizeof offsets) / sizeof *offsets)
+/**********************************************************//**
+The following function returns the number of fields in a record.
+@return	number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+The following function returns the data size of a physical
+record, that is the sum of field lengths. SQL null fields
+are counted as length 0 fields. The value returned by the function
+is the distance from record origin to record end in bytes.
+@return	size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of record minus data size of record.
+The value returned by the function is the distance from record
+start to record origin in bytes.
+@return	size */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns the total size of a physical record.
+@return	size */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the start of the record.
+@return	pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	rec_t*		rec,	/*!< in: pointer to record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/**********************************************************//**
+Returns a pointer to the end of the record.
+@return	pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	rec_t*		rec,	/*!< in: pointer to record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Copies a physical record to a buffer.
+@return	pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+	void*		buf,	/*!< in: buffer */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a new physical record in
+a buffer.
+@return	own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	byte**			buf,		/*!< in/out: memory buffer
+						for the copied prefix,
+						or NULL */
+	ulint*			buf_size);	/*!< in/out: buffer size */
+/************************************************************//**
+Folds a prefix of a physical record to a ulint.
+@return	the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+	const rec_t*	rec,		/*!< in: the physical record */
+	const ulint*	offsets,	/*!< in: array returned by
+					rec_get_offsets() */
+	ulint		n_fields,	/*!< in: number of complete
+					fields to fold */
+	ulint		n_bytes,	/*!< in: number of bytes to fold
+					in an incomplete last field */
+	dulint		tree_id)	/*!< in: index tree id */
+	__attribute__((pure));
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+	rec_t*			rec,	/*!< in: origin of record */
+	ulint			extra,	/*!< in: number of bytes to
+					reserve between the record
+					header and the data payload
+					(normally REC_N_NEW_EXTRA_BYTES) */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			status,	/*!< in: status bits of the record */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields);/*!< in: number of data fields */
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it into the given buffer.
+@return	pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+	byte*			buf,	/*!< in: start address of the
+					physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple,	/*!< in: data tuple */
+	ulint			n_ext);	/*!< in: number of
+					externally stored columns */
+/**********************************************************//**
+Returns the extra size of an old-style physical record if we know its
+data size and number of fields.
+@return	extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+		__attribute__((const));
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return	total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+	const dict_index_t*	index,	/*!< in: record descriptor;
+					dict_table_is_comp() is
+					assumed to hold, even if
+					it does not */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra);	/*!< out: extra size */
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return	total size */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+	const dict_index_t*	index,	/*!< in: record descriptor;
+					dict_table_is_comp() is
+					assumed to hold, even if
+					it does not */
+	ulint			status,	/*!< in: status bits of the record */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra);	/*!< out: extra size */
+/**********************************************************//**
+The following function returns the size of a data tuple when converted to
+a physical record.
+@return	size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext);	/*!< in: number of externally stored columns */
+#ifndef UNIV_HOTBACKUP
+/**************************************************************//**
+Copies the first n fields of a physical record to a data tuple.
+The fields are copied to the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+	dtuple_t*		tuple,		/*!< out: data tuple */
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	mem_heap_t*		heap);		/*!< in: memory heap */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Validates the consistency of a physical record.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints an old-style physical record. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec);	/*!< in: physical record */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints a physical record in ROW_FORMAT=COMPACT.  Ignores the
+record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Prints a physical record. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	dict_index_t*	index);	/*!< in: record descriptor */
+#endif /* !UNIV_HOTBACKUP */
+
+#define REC_INFO_BITS		6	/* This is single byte bit-field */
+
+/* Maximum lengths for the data in a physical record if the offsets
+are given in one byte (resp. two byte) format. */
+#define REC_1BYTE_OFFS_LIMIT	0x7FUL
+#define REC_2BYTE_OFFS_LIMIT	0x7FFFUL
+
+/* The data size of record must be smaller than this because we reserve
+two upmost bits in a two byte offset for special purposes */
+#define REC_MAX_DATA_SIZE	(16 * 1024)
+
+#ifndef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/rem0rec.ic b/storage/xtradb/include/rem0rec.ic
new file mode 100644
index 00000000000..fa96c97f95e
--- /dev/null
+++ b/storage/xtradb/include/rem0rec.ic
@@ -0,0 +1,1647 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0rec.ic
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mach0data.h"
+#include "ut0byte.h"
+#include "dict0dict.h"
+
+/* Compact flag ORed to the extra size returned by rec_get_offsets() */
+#define REC_OFFS_COMPACT	((ulint) 1 << 31)
+/* SQL NULL flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_SQL_NULL	((ulint) 1 << 31)
+/* External flag in offsets returned by rec_get_offsets() */
+#define REC_OFFS_EXTERNAL	((ulint) 1 << 30)
+/* Mask for offsets returned by rec_get_offsets() */
+#define REC_OFFS_MASK		(REC_OFFS_EXTERNAL - 1)
+
+/* Offsets of the bit-fields in an old-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits pointer to next record
+			2	8 bits pointer to next record
+			3	1 bit short flag
+				7 bits number of fields
+			4	3 bits number of fields
+				5 bits heap number
+			5	8 bits heap number
+			6	4 bits n_owned
+				4 bits info bits
+*/
+
+/* Offsets of the bit-fields in a new-style record. NOTE! In the table the
+most significant bytes and bits are written below less significant.
+
+	(1) byte offset		(2) bit usage within byte
+	downward from
+	origin ->	1	8 bits relative offset of next record
+			2	8 bits relative offset of next record
+				  the relative offset is an unsigned 16-bit
+				  integer:
+				  (offset_of_next_record
+				   - offset_of_this_record) mod 64Ki,
+				  where mod is the modulo as a non-negative
+				  number;
+				  we can calculate the offset of the next
+				  record with the formula:
+				  relative_offset + offset_of_this_record
+				  mod UNIV_PAGE_SIZE
+			3	3 bits status:
+					000=conventional record
+					001=node pointer record (inside B-tree)
+					010=infimum record
+					011=supremum record
+					1xx=reserved
+				5 bits heap number
+			4	8 bits heap number
+			5	4 bits n_owned
+				4 bits info bits
+*/
+
+/* We list the byte offsets from the origin of the record, the mask,
+and the shift needed to obtain each bit-field of the record. */
+
+#define REC_NEXT		2
+#define REC_NEXT_MASK		0xFFFFUL
+#define REC_NEXT_SHIFT		0
+
+#define REC_OLD_SHORT		3	/* This is single byte bit-field */
+#define REC_OLD_SHORT_MASK	0x1UL
+#define REC_OLD_SHORT_SHIFT	0
+
+#define REC_OLD_N_FIELDS	4
+#define REC_OLD_N_FIELDS_MASK	0x7FEUL
+#define REC_OLD_N_FIELDS_SHIFT	1
+
+#define REC_NEW_STATUS		3	/* This is single byte bit-field */
+#define REC_NEW_STATUS_MASK	0x7UL
+#define REC_NEW_STATUS_SHIFT	0
+
+#define REC_OLD_HEAP_NO		5
+#define REC_HEAP_NO_MASK	0xFFF8UL
+#if 0 /* defined in rem0rec.h for use of page0zip.c */
+#define REC_NEW_HEAP_NO		4
+#define	REC_HEAP_NO_SHIFT	3
+#endif
+
+#define REC_OLD_N_OWNED		6	/* This is single byte bit-field */
+#define REC_NEW_N_OWNED		5	/* This is single byte bit-field */
+#define	REC_N_OWNED_MASK	0xFUL
+#define REC_N_OWNED_SHIFT	0
+
+#define REC_OLD_INFO_BITS	6	/* This is single byte bit-field */
+#define REC_NEW_INFO_BITS	5	/* This is single byte bit-field */
+#define	REC_INFO_BITS_MASK	0xF0UL
+#define REC_INFO_BITS_SHIFT	0
+
+/* The following masks are used to filter the SQL null bit from
+one-byte and two-byte offsets */
+
+#define REC_1BYTE_SQL_NULL_MASK	0x80UL
+#define REC_2BYTE_SQL_NULL_MASK	0x8000UL
+
+/* In a 2-byte offset the second most significant bit denotes
+a field stored to another page: */
+
+#define REC_2BYTE_EXTERN_MASK	0x4000UL
+
+#if REC_OLD_SHORT_MASK << (8 * (REC_OLD_SHORT - 3)) \
+		^ REC_OLD_N_FIELDS_MASK << (8 * (REC_OLD_N_FIELDS - 4)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_OLD_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_OLD_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_OLD_INFO_BITS - 3)) \
+		^ 0xFFFFFFFFUL
+# error "sum of old-style masks != 0xFFFFFFFFUL"
+#endif
+#if REC_NEW_STATUS_MASK << (8 * (REC_NEW_STATUS - 3)) \
+		^ REC_HEAP_NO_MASK << (8 * (REC_NEW_HEAP_NO - 4)) \
+		^ REC_N_OWNED_MASK << (8 * (REC_NEW_N_OWNED - 3)) \
+		^ REC_INFO_BITS_MASK << (8 * (REC_NEW_INFO_BITS - 3)) \
+		^ 0xFFFFFFUL
+# error "sum of new-style masks != 0xFFFFFFUL"
+#endif
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+	rec_t*	rec,	/*!< in: record */
+	ulint	i,	/*!< in: ith field */
+	ibool	val);	/*!< in: value to set */
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+	rec_t*	rec,	/*!< in: record */
+	ulint	n);	/*!< in: index of the field */
+
+/******************************************************//**
+Gets a bit field from the byte at rec - offs: (byte & mask) >> shift. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_1(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_1(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within the byte at rec - offs; other bits are preserved. */
+UNIV_INLINE
+void
+rec_set_bit_field_1(
+/*================*/
+	rec_t*	rec,	/*!< in/out: pointer to record origin */
+	ulint	val,	/*!< in: unshifted value; must fit in mask */
+	ulint	offs,	/*!< in: offset from the origin down */
+	ulint	mask,	/*!< in: mask selecting the bits to replace */
+	ulint	shift)	/*!< in: shift left applied to val before storing */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask);
+	ut_ad(mask <= 0xFFUL);
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_1(rec - offs,
+			(mach_read_from_1(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Gets a bit field from the 2 bytes at rec - offs: (value & mask) >> shift. */
+UNIV_INLINE
+ulint
+rec_get_bit_field_2(
+/*================*/
+	const rec_t*	rec,	/*!< in: pointer to record origin */
+	ulint		offs,	/*!< in: offset from the origin down */
+	ulint		mask,	/*!< in: mask used to filter bits */
+	ulint		shift)	/*!< in: shift right applied after masking */
+{
+	ut_ad(rec);
+
+	return((mach_read_from_2(rec - offs) & mask) >> shift);
+}
+
+/******************************************************//**
+Sets a bit field within the 2 bytes at rec - offs; other bits are preserved. */
+UNIV_INLINE
+void
+rec_set_bit_field_2(
+/*================*/
+	rec_t*	rec,	/*!< in/out: pointer to record origin */
+	ulint	val,	/*!< in: unshifted value; must fit in mask */
+	ulint	offs,	/*!< in: offset from the origin down */
+	ulint	mask,	/*!< in: mask selecting the bits to replace */
+	ulint	shift)	/*!< in: shift left applied to val before storing */
+{
+	ut_ad(rec);
+	ut_ad(offs <= REC_N_OLD_EXTRA_BYTES);
+	ut_ad(mask > 0xFFUL);
+	ut_ad(mask <= 0xFFFFUL);
+	ut_ad((mask >> shift) & 1);
+	ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1)));
+	ut_ad(((mask >> shift) << shift) == mask);
+	ut_ad(((val << shift) & mask) == (val << shift));
+
+	mach_write_to_2(rec - offs,
+			(mach_read_from_2(rec - offs) & ~mask)
+			| (val << shift));
+}
+
+/******************************************************//**
+Gets a pointer to the next chained record on the same page. The 2-byte next
+field is a page offset (old-style) or an offset relative to rec (compact).
+@return	pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+const rec_t*
+rec_get_next_ptr_const(
+/*===================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	field_value;
+
+	ut_ad(REC_NEXT_MASK == 0xFFFFUL);
+	ut_ad(REC_NEXT_SHIFT == 0);
+
+	field_value = mach_read_from_2(rec - REC_NEXT);
+
+	if (UNIV_UNLIKELY(field_value == 0)) {
+
+		return(NULL);
+	}
+
+	if (UNIV_LIKELY(comp != 0)) {
+#if UNIV_PAGE_SIZE <= 32768
+		/* Note that for 64 KiB pages, field_value can 'wrap around'
+		and the debug assertion is not valid */
+
+		/* In the following assertion, field_value is interpreted
+		as a signed 16-bit integer in 2's complement arithmetic.
+		If all platforms defined int16_t in the standard headers,
+		the expression could be written more simply as
+		(int16_t) field_value + ut_align_offset(...) < UNIV_PAGE_SIZE
+		*/
+		ut_ad((field_value >= 32768
+		       ? field_value - 65536
+		       : field_value)
+		      + ut_align_offset(rec, UNIV_PAGE_SIZE)
+		      < UNIV_PAGE_SIZE);
+#endif
+		/* Need at least REC_N_NEW_EXTRA_BYTES + 1 between records.
+		NOTE(review): 2nd clause lacks a lower bound -- confirm. */
+		ut_ad((field_value > REC_N_NEW_EXTRA_BYTES
+		       && field_value < 32768)
+		      || field_value < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+		return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+		       + ut_align_offset(rec + field_value, UNIV_PAGE_SIZE));
+	} else {
+		ut_ad(field_value < UNIV_PAGE_SIZE);
+
+		return((byte*) ut_align_down(rec, UNIV_PAGE_SIZE)
+		       + field_value);
+	}
+}
+
+/******************************************************//**
+Non-const variant of rec_get_next_ptr_const(): looks up the next chained
+record on the same page and returns it as a writable pointer.
+@return	pointer to the next chained record, or NULL if none */
+UNIV_INLINE
+rec_t*
+rec_get_next_ptr(
+/*=============*/
+	rec_t*	rec,	/*!< in: physical record */
+	ulint	comp)	/*!< in: nonzero=compact page format */
+{
+	const rec_t*	next_rec = rec_get_next_ptr_const(rec, comp);
+
+	/* The caller handed in a non-const record, so dropping the
+	const qualifier of the result is intended here. */
+	return((rec_t*) next_rec);
+}
+
+/******************************************************//**
+Reads the "next record" field of a record and converts it to an offset
+within the page.  In the old format the field is returned as is; in the
+compact format it is added to rec and reduced to a page offset.
+@return	the page offset of the next chained record, or 0 if none */
+UNIV_INLINE
+ulint
+rec_get_next_offs(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	next;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+
+	next = mach_read_from_2(rec - REC_NEXT);
+
+	if (UNIV_UNLIKELY(comp == 0)) {
+		/* Old-style record: the stored value is the result. */
+		ut_ad(next < UNIV_PAGE_SIZE);
+
+		return(next);
+	}
+
+#if UNIV_PAGE_SIZE <= 32768
+	/* For 64 KiB pages the value can 'wrap around', so this debug
+	check only applies to smaller pages.  The stored value is treated
+	as a signed 16-bit integer in 2's complement arithmetics; with
+	int16_t available everywhere this could be spelled
+	(int16_t) next + ut_align_offset(...) < UNIV_PAGE_SIZE */
+	ut_ad((next >= 32768
+	       ? next - 65536
+	       : next)
+	      + ut_align_offset(rec, UNIV_PAGE_SIZE)
+	      < UNIV_PAGE_SIZE);
+#endif
+	if (UNIV_UNLIKELY(next == 0)) {
+		/* No next record. */
+
+		return(0);
+	}
+
+	/* Consecutive records are at least REC_N_NEW_EXTRA_BYTES + 1
+	bytes apart. */
+	ut_ad((next > REC_N_NEW_EXTRA_BYTES
+	       && next < 32768)
+	      || next < (uint16) -REC_N_NEW_EXTRA_BYTES);
+
+	return(ut_align_offset(rec + next, UNIV_PAGE_SIZE));
+}
+
+/******************************************************//**
+Stores the offset of the next record in the "next" field of an
+old-style record.  The value is written unchanged. */
+UNIV_INLINE
+void
+rec_set_next_offs_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+{
+	rec_t*	next_field;
+#if REC_NEXT_MASK != 0xFFFFUL
+# error "REC_NEXT_MASK != 0xFFFFUL"
+#endif
+#if REC_NEXT_SHIFT
+# error "REC_NEXT_SHIFT != 0"
+#endif
+	ut_ad(rec);
+	ut_ad(UNIV_PAGE_SIZE > next);
+
+	/* The field lives REC_NEXT bytes before the record origin. */
+	next_field = rec - REC_NEXT;
+
+	mach_write_to_2(next_field, next);
+}
+
+/******************************************************//**
+Stores the position of the next record in the "next" field of a
+new-style record.  A zero offset is stored as zero; any other offset is
+stored as (next - offset of rec within the page) masked with
+REC_NEXT_MASK. */
+UNIV_INLINE
+void
+rec_set_next_offs_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	next)	/*!< in: offset of the next record */
+{
+	ulint	field_value	= 0;
+
+	ut_ad(rec);
+	ut_ad(UNIV_PAGE_SIZE > next);
+
+	if (UNIV_LIKELY(next != 0)) {
+		/* Compute next - offset_of_rec mod 64Ki, where mod is
+		the modulo as a non-negative number: take the signed
+		difference first, then keep the low bits. */
+		lint	diff = (lint) next
+			- (lint) ut_align_offset(rec, UNIV_PAGE_SIZE);
+
+		field_value = ((ulint) diff) & REC_NEXT_MASK;
+	}
+
+	mach_write_to_2(rec - REC_NEXT, field_value);
+}
+
+/******************************************************//**
+Reads the field count stored in the header of an old-style record.
+@return	number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields_old(
+/*=================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	n_fields;
+
+	ut_ad(rec);
+
+	n_fields = rec_get_bit_field_2(
+		rec, REC_OLD_N_FIELDS,
+		REC_OLD_N_FIELDS_MASK, REC_OLD_N_FIELDS_SHIFT);
+
+	/* Debug check: the count must be in 1..REC_MAX_N_FIELDS. */
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields > 0);
+
+	return(n_fields);
+}
+
+/******************************************************//**
+Writes the field count into the header of an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_fields_old(
+/*=================*/
+	rec_t*	rec,		/*!< in: physical record */
+	ulint	n_fields)	/*!< in: the number of fields */
+{
+	/* Debug checks: valid record, count in 1..REC_MAX_N_FIELDS. */
+	ut_ad(rec);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields > 0);
+
+	rec_set_bit_field_2(
+		rec, n_fields,
+		REC_OLD_N_FIELDS,
+		REC_OLD_N_FIELDS_MASK,
+		REC_OLD_N_FIELDS_SHIFT);
+}
+
+/******************************************************//**
+Reads the status bits from the header of a new-style record.
+@return	status bits */
+UNIV_INLINE
+ulint
+rec_get_status(
+/*===========*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	status;
+
+	ut_ad(rec);
+
+	status = rec_get_bit_field_1(
+		rec, REC_NEW_STATUS,
+		REC_NEW_STATUS_MASK, REC_NEW_STATUS_SHIFT);
+
+	/* Debug check: no bits outside the status mask may be set. */
+	ut_ad((status & ~REC_NEW_STATUS_MASK) == 0);
+
+	return(status);
+}
+
+/******************************************************//**
+Determines the number of fields in a record.  Old-style records carry
+the count in their header; for compact records it is derived from the
+record status and the index.
+@return	number of data fields */
+UNIV_INLINE
+ulint
+rec_get_n_fields(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index)	/*!< in: record descriptor */
+{
+	ulint	n_fields;
+
+	ut_ad(rec);
+	ut_ad(index);
+
+	if (!dict_table_is_comp(index->table)) {
+		/* Not a compact table: read the count from the record. */
+		return(rec_get_n_fields_old(rec));
+	}
+
+	switch (rec_get_status(rec)) {
+	case REC_STATUS_INFIMUM:
+	case REC_STATUS_SUPREMUM:
+		n_fields = 1;
+		break;
+	case REC_STATUS_NODE_PTR:
+		n_fields = dict_index_get_n_unique_in_tree(index) + 1;
+		break;
+	case REC_STATUS_ORDINARY:
+		n_fields = dict_index_get_n_fields(index);
+		break;
+	default:
+		/* Unknown status value. */
+		ut_error;
+		n_fields = ULINT_UNDEFINED;
+	}
+
+	return(n_fields);
+}
+
+/******************************************************//**
+Reads the n_owned field from the header of an old-style record.
+@return	number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: old-style physical record */
+{
+	ulint	n_owned;
+
+	n_owned = rec_get_bit_field_1(
+		rec, REC_OLD_N_OWNED,
+		REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+	return(n_owned);
+}
+
+/******************************************************//**
+Writes the n_owned field in the header of an old-style record. */
+UNIV_INLINE
+void
+rec_set_n_owned_old(
+/*================*/
+	rec_t*	rec,		/*!< in: old-style physical record */
+	ulint	n_owned)	/*!< in: the number of owned */
+{
+	rec_set_bit_field_1(
+		rec, n_owned,
+		REC_OLD_N_OWNED,
+		REC_N_OWNED_MASK,
+		REC_N_OWNED_SHIFT);
+}
+
+/******************************************************//**
+Reads the n_owned field from the header of a new-style record.
+@return	number of owned records */
+UNIV_INLINE
+ulint
+rec_get_n_owned_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: new-style physical record */
+{
+	ulint	n_owned;
+
+	n_owned = rec_get_bit_field_1(
+		rec, REC_NEW_N_OWNED,
+		REC_N_OWNED_MASK, REC_N_OWNED_SHIFT);
+
+	return(n_owned);
+}
+
+/******************************************************//**
+Writes the n_owned field in the header of a new-style record and, when
+a compressed page is given, forwards the change to it. */
+UNIV_INLINE
+void
+rec_set_n_owned_new(
+/*================*/
+	rec_t*		rec,	/*!< in/out: new-style physical record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		n_owned)/*!< in: the number of owned */
+{
+	rec_set_bit_field_1(
+		rec, n_owned,
+		REC_NEW_N_OWNED,
+		REC_N_OWNED_MASK,
+		REC_N_OWNED_SHIFT);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		/* The supremum record is not passed on to the
+		compressed page. */
+		if (UNIV_LIKELY(rec_get_status(rec)
+				!= REC_STATUS_SUPREMUM)) {
+			page_zip_rec_set_owned(page_zip, rec, n_owned);
+		}
+	}
+}
+
+/******************************************************//**
+Reads the info bits of a record from the header position that matches
+its format.
+@return	info bits */
+UNIV_INLINE
+ulint
+rec_get_info_bits(
+/*==============*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	offs;
+
+	/* Only the position of the byte depends on the format; mask
+	and shift are shared. */
+	if (comp) {
+		offs = REC_NEW_INFO_BITS;
+	} else {
+		offs = REC_OLD_INFO_BITS;
+	}
+
+	return(rec_get_bit_field_1(rec, offs,
+				   REC_INFO_BITS_MASK,
+				   REC_INFO_BITS_SHIFT));
+}
+
+/******************************************************//**
+Writes the info bits in the header of an old-style record. */
+UNIV_INLINE
+void
+rec_set_info_bits_old(
+/*==================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	bits)	/*!< in: info bits */
+{
+	rec_set_bit_field_1(
+		rec, bits,
+		REC_OLD_INFO_BITS,
+		REC_INFO_BITS_MASK,
+		REC_INFO_BITS_SHIFT);
+}
+/******************************************************//**
+Writes the info bits in the header of a new-style record. */
+UNIV_INLINE
+void
+rec_set_info_bits_new(
+/*==================*/
+	rec_t*	rec,	/*!< in/out: new-style physical record */
+	ulint	bits)	/*!< in: info bits */
+{
+	rec_set_bit_field_1(
+		rec, bits,
+		REC_NEW_INFO_BITS,
+		REC_INFO_BITS_MASK,
+		REC_INFO_BITS_SHIFT);
+}
+
+/******************************************************//**
+Writes the status bits in the header of a new-style record. */
+UNIV_INLINE
+void
+rec_set_status(
+/*===========*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits)	/*!< in: status bits */
+{
+	rec_set_bit_field_1(
+		rec, bits,
+		REC_NEW_STATUS,
+		REC_NEW_STATUS_MASK,
+		REC_NEW_STATUS_SHIFT);
+}
+
+/******************************************************//**
+Reads the info bits of a record and, for compact records, ORs in the
+status bits.  (Only compact records have status bits.)
+@return	info bits, combined with the status bits if compact */
+UNIV_INLINE
+ulint
+rec_get_info_and_status_bits(
+/*=========================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	info;
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+	if (UNIV_UNLIKELY(comp == 0)) {
+		info = rec_get_info_bits(rec, FALSE);
+		/* Debug check: nothing outside the info bits is set. */
+		ut_ad(!(info & ~(REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)));
+
+		return(info);
+	}
+
+	/* The two masks do not overlap (checked above), so the values
+	can simply be ORed together. */
+	return(rec_get_info_bits(rec, TRUE) | rec_get_status(rec));
+}
+/******************************************************//**
+Splits a combined value into status bits and info bits and writes both
+into the header of a new-style record.  (Only compact records have
+status bits.) */
+UNIV_INLINE
+void
+rec_set_info_and_status_bits(
+/*=========================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	bits)	/*!< in: info and status bits */
+{
+	ulint	status_part	= bits & REC_NEW_STATUS_MASK;
+	ulint	info_part	= bits & ~REC_NEW_STATUS_MASK;
+#if (REC_NEW_STATUS_MASK >> REC_NEW_STATUS_SHIFT) \
+& (REC_INFO_BITS_MASK >> REC_INFO_BITS_SHIFT)
+# error "REC_NEW_STATUS_MASK and REC_INFO_BITS_MASK overlap"
+#endif
+	rec_set_status(rec, status_part);
+	rec_set_info_bits_new(rec, info_part);
+}
+
+/******************************************************//**
+Tests the delete-mark flag in the info bits of a record.
+@return	nonzero if delete marked */
+UNIV_INLINE
+ulint
+rec_get_deleted_flag(
+/*=================*/
+	const rec_t*	rec,	/*!< in: physical record */
+	ulint		comp)	/*!< in: nonzero=compact page format */
+{
+	ulint	offs;
+
+	/* Both formats use the same flag and shift; only the header
+	position of the info bits differs. */
+	if (UNIV_LIKELY(comp != 0)) {
+		offs = REC_NEW_INFO_BITS;
+	} else {
+		offs = REC_OLD_INFO_BITS;
+	}
+
+	return(UNIV_UNLIKELY(
+		       rec_get_bit_field_1(rec, offs,
+					   REC_INFO_DELETED_FLAG,
+					   REC_INFO_BITS_SHIFT)));
+}
+
+/******************************************************//**
+Sets or clears the delete-mark flag of an old-style record, leaving the
+other info bits as they were. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_old(
+/*=====================*/
+	rec_t*	rec,	/*!< in: old-style physical record */
+	ulint	flag)	/*!< in: nonzero if delete marked */
+{
+	ulint	info = rec_get_info_bits(rec, FALSE);
+
+	info = flag
+		? (info | REC_INFO_DELETED_FLAG)
+		: (info & ~REC_INFO_DELETED_FLAG);
+
+	rec_set_info_bits_old(rec, info);
+}
+
+/******************************************************//**
+Sets or clears the delete-mark flag of a new-style record, leaving the
+other info bits as they were.  When a compressed page is given, the
+change is forwarded to it as well. */
+UNIV_INLINE
+void
+rec_set_deleted_flag_new(
+/*=====================*/
+	rec_t*		rec,	/*!< in/out: new-style physical record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		flag)	/*!< in: nonzero if delete marked */
+{
+	ulint	info = rec_get_info_bits(rec, TRUE);
+
+	info = flag
+		? (info | REC_INFO_DELETED_FLAG)
+		: (info & ~REC_INFO_DELETED_FLAG);
+
+	rec_set_info_bits_new(rec, info);
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page_zip_rec_set_deleted(page_zip, rec, flag);
+	}
+}
+
+/******************************************************//**
+Checks whether the status of a new-style record is REC_STATUS_NODE_PTR.
+@return	TRUE if node pointer */
+UNIV_INLINE
+ibool
+rec_get_node_ptr_flag(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	status = rec_get_status(rec);
+
+	return(status == REC_STATUS_NODE_PTR);
+}
+
+/******************************************************//**
+Reads the heap number (order number in the heap of the index page)
+from the header of an old-style record.
+@return	heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_old(
+/*================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	heap_no;
+
+	heap_no = rec_get_bit_field_2(
+		rec, REC_OLD_HEAP_NO,
+		REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+	return(heap_no);
+}
+
+/******************************************************//**
+Writes the heap number field in the header of an old-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_old(
+/*================*/
+	rec_t*	rec,	/*!< in: physical record */
+	ulint	heap_no)/*!< in: the heap number */
+{
+	rec_set_bit_field_2(
+		rec, heap_no,
+		REC_OLD_HEAP_NO,
+		REC_HEAP_NO_MASK,
+		REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+Reads the heap number (order number in the heap of the index page)
+from the header of a new-style record.
+@return	heap order number */
+UNIV_INLINE
+ulint
+rec_get_heap_no_new(
+/*================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	heap_no;
+
+	heap_no = rec_get_bit_field_2(
+		rec, REC_NEW_HEAP_NO,
+		REC_HEAP_NO_MASK, REC_HEAP_NO_SHIFT);
+
+	return(heap_no);
+}
+
+/******************************************************//**
+Writes the heap number field in the header of a new-style record. */
+UNIV_INLINE
+void
+rec_set_heap_no_new(
+/*================*/
+	rec_t*	rec,	/*!< in/out: physical record */
+	ulint	heap_no)/*!< in: the heap number */
+{
+	rec_set_bit_field_2(
+		rec, heap_no,
+		REC_NEW_HEAP_NO,
+		REC_HEAP_NO_MASK,
+		REC_HEAP_NO_SHIFT);
+}
+
+/******************************************************//**
+Tells whether the data offsets in the record are stored in the
+one-byte or the two-byte format.
+@return	TRUE if 1-byte form */
+UNIV_INLINE
+ibool
+rec_get_1byte_offs_flag(
+/*====================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ibool	is_1byte;
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+
+	/* The stored bit field is returned directly as the ibool
+	result; hence the TRUE == 1 check above. */
+	is_1byte = rec_get_bit_field_1(
+		rec, REC_OLD_SHORT,
+		REC_OLD_SHORT_MASK, REC_OLD_SHORT_SHIFT);
+
+	return(is_1byte);
+}
+
+/******************************************************//**
+Writes the flag that says whether the record uses 1-byte offsets. */
+UNIV_INLINE
+void
+rec_set_1byte_offs_flag(
+/*====================*/
+	rec_t*	rec,	/*!< in: physical record */
+	ibool	flag)	/*!< in: TRUE if 1byte form */
+{
+#if TRUE != 1
+#error "TRUE != 1"
+#endif
+	/* Debug check: the flag is stored as given, so it has to be
+	0 or 1. */
+	ut_ad(flag <= TRUE);
+
+	rec_set_bit_field_1(
+		rec, flag,
+		REC_OLD_SHORT,
+		REC_OLD_SHORT_MASK,
+		REC_OLD_SHORT_SHIFT);
+}
+
+/******************************************************//**
+Reads the end info of the nth field of a record stored in the 1-byte
+offsets form.  If the field is SQL null, the flag is ORed in the
+returned value.
+@return	offset of the end of the field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	back;
+
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	/* One byte per field, stored before the REC_N_OLD_EXTRA_BYTES
+	header bytes, counting backwards from the record origin. */
+	back = REC_N_OLD_EXTRA_BYTES + n + 1;
+
+	return(mach_read_from_1(rec - back));
+}
+
+/******************************************************//**
+Reads the end info of the nth field of a record stored in the 2-byte
+offsets form.  If the field is SQL null, the flag is ORed in the
+returned value.
+@return offset of the end of the field, SQL null flag and extern
+storage flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_field_end_info(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	back;
+
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	/* Two bytes per field, stored before the REC_N_OLD_EXTRA_BYTES
+	header bytes, counting backwards from the record origin. */
+	back = REC_N_OLD_EXTRA_BYTES + 2 * n + 2;
+
+	return(mach_read_from_2(rec - back));
+}
+
+/* Get the base address of offsets.  The extra_size is stored at
+this position, and following positions hold the end offsets of
+the fields.  The argument is parenthesized so that the macro also
+expands correctly when it is passed an expression. */
+#define rec_offs_base(offsets) ((offsets) + REC_OFFS_HEADER_SIZE)
+
+/**********************************************************//**
+Reads the number of allocated elements of an offsets array, which is
+kept in its first element.
+@return	number of elements */
+UNIV_INLINE
+ulint
+rec_offs_get_n_alloc(
+/*=================*/
+	const ulint*	offsets)/*!< in: array for rec_get_offsets() */
+{
+	ulint	n_elems;
+
+	ut_ad(offsets);
+
+	n_elems = offsets[0];
+
+	/* Debug check: room for more than just the header. */
+	ut_ad(n_elems > REC_OFFS_HEADER_SIZE);
+	UNIV_MEM_ASSERT_W(offsets, n_elems * sizeof(ulint));
+
+	return(n_elems);
+}
+
+/**********************************************************//**
+Records the number of allocated elements of an offsets array in its
+first element. */
+UNIV_INLINE
+void
+rec_offs_set_n_alloc(
+/*=================*/
+	ulint*	offsets,	/*!< out: array for rec_get_offsets(),
+				must be allocated */
+	ulint	n_alloc)	/*!< in: number of elements */
+{
+	/* Debug checks: valid array, room for more than the header. */
+	ut_ad(offsets);
+	ut_ad(n_alloc > REC_OFFS_HEADER_SIZE);
+	UNIV_MEM_ASSERT_AND_ALLOC(offsets, n_alloc * sizeof(ulint));
+
+	offsets[0] = n_alloc;
+}
+
+/**********************************************************//**
+Reads the number of fields of a record from an offsets array, where it
+is kept in the second element.
+@return	number of fields */
+UNIV_INLINE
+ulint
+rec_offs_n_fields(
+/*==============*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n;
+
+	ut_ad(offsets);
+
+	n = offsets[1];
+
+	/* Debug checks: the count is in 1..REC_MAX_N_FIELDS and fits,
+	together with the header, in the allocated array. */
+	ut_ad(n > 0);
+	ut_ad(n <= REC_MAX_N_FIELDS);
+	ut_ad(n + REC_OFFS_HEADER_SIZE
+	      <= rec_offs_get_n_alloc(offsets));
+
+	return(n);
+}
+
+/************************************************************//**
+Validates offsets returned by rec_get_offsets().
+Inconsistencies are reported through ut_a(), ut_ad() and ut_error;
+the only value this function itself returns is TRUE.
+@return	TRUE if valid */
+UNIV_INLINE
+ibool
+rec_offs_validate(
+/*==============*/
+	const rec_t*		rec,	/*!< in: record or NULL */
+	const dict_index_t*	index,	/*!< in: record descriptor or NULL */
+	const ulint*		offsets)/*!< in: array returned by
+					rec_get_offsets() */
+{
+	ulint	i	= rec_offs_n_fields(offsets);
+	ulint	last	= ULINT_MAX;
+	ulint	comp	= *rec_offs_base(offsets) & REC_OFFS_COMPACT;
+
+	if (rec) {
+		/* offsets[2] is expected to hold the address of the
+		record (see rec_offs_make_valid()). */
+		ut_ad((ulint) rec == offsets[2]);
+		if (!comp) {
+			/* An old-style record must have at least as many
+			fields as the offsets describe. */
+			ut_a(rec_get_n_fields_old(rec) >= i);
+		}
+	}
+	if (index) {
+		ulint max_n_fields;
+		/* offsets[3] is expected to hold the address of the
+		index (see rec_offs_make_valid()). */
+		ut_ad((ulint) index == offsets[3]);
+		/* Upper bound when the record status is not consulted:
+		the larger of the two possible field counts. */
+		max_n_fields = ut_max(
+			dict_index_get_n_fields(index),
+			dict_index_get_n_unique_in_tree(index) + 1);
+		if (comp && rec) {
+			/* For a compact record the status bits allow a
+			tighter bound. */
+			switch (rec_get_status(rec)) {
+			case REC_STATUS_ORDINARY:
+				break;
+			case REC_STATUS_NODE_PTR:
+				max_n_fields = dict_index_get_n_unique_in_tree(
+					index) + 1;
+				break;
+			case REC_STATUS_INFIMUM:
+			case REC_STATUS_SUPREMUM:
+				max_n_fields = 1;
+				break;
+			default:
+				ut_error;
+			}
+		}
+		/* index->n_def == 0 for dummy indexes if !comp */
+		ut_a(!comp || index->n_def);
+		ut_a(!index->n_def || i <= max_n_fields);
+	}
+	/* Walk the fields from the last to the first and check that the
+	end offsets never increase in that direction. */
+	while (i--) {
+		ulint	curr = rec_offs_base(offsets)[1 + i] & REC_OFFS_MASK;
+		ut_a(curr <= last);
+		last = curr;
+	}
+	return(TRUE);
+}
+#ifdef UNIV_DEBUG
+/************************************************************//**
+Stores the addresses of the record and the index in the debug slots of
+offsets (elements 2 and 3), in order to avoid bogus rec_offs_validate()
+failures. */
+UNIV_INLINE
+void
+rec_offs_make_valid(
+/*================*/
+	const rec_t*		rec,	/*!< in: record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets)/*!< in: array returned by
+					rec_get_offsets() */
+{
+	ut_ad(rec);
+	ut_ad(index);
+	ut_ad(offsets);
+	/* The record must have at least as many fields as the offsets
+	describe. */
+	ut_ad(rec_get_n_fields(rec, index) >= rec_offs_n_fields(offsets));
+
+	offsets[2] = (ulint) rec;
+	offsets[3] = (ulint) index;
+}
+#endif /* UNIV_DEBUG */
+
+/************************************************************//**
+Looks up the start offset and the length of the nth data field of a
+record in an offsets array.
+@return	offset from the origin of rec */
+UNIV_INLINE
+ulint
+rec_get_nth_field_offs(
+/*===================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field; UNIV_SQL_NULL
+				if SQL null */
+{
+	ulint	start	= 0;
+	ulint	end_info;
+
+	ut_ad(n < rec_offs_n_fields(offsets));
+	ut_ad(len);
+
+	if (UNIV_LIKELY(n != 0)) {
+		/* A field starts where the previous one ends; the
+		first field starts at offset 0. */
+		start = rec_offs_base(offsets)[n] & REC_OFFS_MASK;
+	}
+
+	end_info = rec_offs_base(offsets)[1 + n];
+
+	if (end_info & REC_OFFS_SQL_NULL) {
+		*len = UNIV_SQL_NULL;
+	} else {
+		*len = (end_info & REC_OFFS_MASK) - start;
+	}
+
+	return(start);
+}
+
+/******************************************************//**
+Tests the REC_OFFS_COMPACT flag of an offsets array, i.e. whether the
+offsets are for a record in the new compact format.
+@return	nonzero if compact format */
+UNIV_INLINE
+ulint
+rec_offs_comp(
+/*==========*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	head;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+
+	/* The flag is kept in the first element after the header. */
+	head = rec_offs_base(offsets)[0];
+
+	return(head & REC_OFFS_COMPACT);
+}
+
+/******************************************************//**
+Tests the REC_OFFS_EXTERNAL flag of an offsets array, i.e. whether the
+offsets are for a record containing externally stored columns.
+@return	nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_any_extern(
+/*================*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	head;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+
+	/* The flag is kept in the first element after the header. */
+	head = rec_offs_base(offsets)[0];
+
+	return(UNIV_UNLIKELY(head & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Tests whether the extern bit is set for the nth field in an offsets
+array.
+@return	nonzero if externally stored */
+UNIV_INLINE
+ulint
+rec_offs_nth_extern(
+/*================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ulint	end_info;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	end_info = rec_offs_base(offsets)[1 + n];
+
+	return(UNIV_UNLIKELY(end_info & REC_OFFS_EXTERNAL));
+}
+
+/******************************************************//**
+Tests whether the SQL NULL bit is set for the nth field in an offsets
+array.
+@return	nonzero if SQL NULL */
+UNIV_INLINE
+ulint
+rec_offs_nth_sql_null(
+/*==================*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ulint	end_info;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	end_info = rec_offs_base(offsets)[1 + n];
+
+	return(UNIV_UNLIKELY(end_info & REC_OFFS_SQL_NULL));
+}
+
+/******************************************************//**
+Computes the physical size of the nth field from an offsets array, as
+the difference between its end offset and that of the previous field.
+@return	length of field */
+UNIV_INLINE
+ulint
+rec_offs_nth_size(
+/*==============*/
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: nth field */
+{
+	ulint	size;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	ut_ad(n < rec_offs_n_fields(offsets));
+
+	size = rec_offs_base(offsets)[1 + n];
+
+	if (n != 0) {
+		/* Subtract the end of the previous field; the first
+		field has no predecessor. */
+		size -= rec_offs_base(offsets)[n];
+	}
+
+	/* Masking last strips the flag bits in either case. */
+	return(size & REC_OFFS_MASK);
+}
+
+/******************************************************//**
+Counts the fields that have the extern bit set in an offsets array.
+@return	number of externally stored fields */
+UNIV_INLINE
+ulint
+rec_offs_n_extern(
+/*==============*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n_ext	= 0;
+	ulint	i;
+
+	if (!rec_offs_any_extern(offsets)) {
+		/* The record-level flag is clear: skip the scan. */
+		return(0);
+	}
+
+	/* Visit the fields from the last to the first. */
+	i = rec_offs_n_fields(offsets);
+
+	while (i > 0) {
+		i--;
+
+		if (rec_offs_nth_extern(offsets, i)) {
+			n_ext++;
+		}
+	}
+
+	return(n_ext);
+}
+
+/******************************************************//**
+Reads the end info of field n - 1 of a record stored in the 1-byte
+offsets form.  If that field is SQL null, the flag is ORed in the
+returned value.  This function and the 2-byte counterpart are defined
+here because the C-compiler was not able to sum negative and positive
+constant offsets, and warned of constant arithmetic overflow within
+the compiler.
+@return	offset of the end of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_1_get_prev_field_end_info(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	back;
+
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	/* Distance from the record origin back to the byte read. */
+	back = REC_N_OLD_EXTRA_BYTES + n;
+
+	return(mach_read_from_1(rec - back));
+}
+
+/******************************************************//**
+Reads the end info of field n - 1 of a record stored in the 2-byte
+offsets form.  If that field is SQL null, the flag is ORed in the
+returned value.
+@return	offset of the end of the PREVIOUS field, SQL null flag ORed */
+UNIV_INLINE
+ulint
+rec_2_get_prev_field_end_info(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	back;
+
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	/* Distance from the record origin back to the 2 bytes read. */
+	back = REC_N_OLD_EXTRA_BYTES + 2 * n;
+
+	return(mach_read_from_2(rec - back));
+}
+
+/******************************************************//**
+Writes the end info of the nth field of a record stored in the 1-byte
+format. */
+UNIV_INLINE
+void
+rec_1_set_field_end_info(
+/*=====================*/
+	rec_t*	rec,	/*!< in: record */
+	ulint	n,	/*!< in: field index */
+	ulint	info)	/*!< in: value to set */
+{
+	ulint	back;
+
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	/* Same position as read by rec_1_get_field_end_info(). */
+	back = REC_N_OLD_EXTRA_BYTES + n + 1;
+
+	mach_write_to_1(rec - back, info);
+}
+
+/******************************************************//**
+Writes the end info of the nth field of a record stored in the 2-byte
+format. */
+UNIV_INLINE
+void
+rec_2_set_field_end_info(
+/*=====================*/
+	rec_t*	rec,	/*!< in: record */
+	ulint	n,	/*!< in: field index */
+	ulint	info)	/*!< in: value to set */
+{
+	ulint	back;
+
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n < rec_get_n_fields_old(rec));
+
+	/* Same position as read by rec_2_get_field_end_info(). */
+	back = REC_N_OLD_EXTRA_BYTES + 2 * n + 2;
+
+	mach_write_to_2(rec - back, info);
+}
+
+/******************************************************//**
+Computes the start offset of the nth field of a record stored in the
+1-byte offsets form: 0 for the first field, otherwise the end info of
+the previous field with the SQL null flag masked out.
+@return	offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_1_get_field_start_offs(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	prev_end;
+
+	ut_ad(rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	prev_end = rec_1_get_prev_field_end_info(rec, n);
+
+	return(prev_end & ~REC_1BYTE_SQL_NULL_MASK);
+}
+
+/******************************************************//**
+Computes the start offset of the nth field of a record stored in the
+2-byte offsets form: 0 for the first field, otherwise the end info of
+the previous field with the SQL null and extern flags masked out.
+@return	offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_2_get_field_start_offs(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ulint	prev_end;
+
+	ut_ad(!rec_get_1byte_offs_flag(rec));
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+
+		return(0);
+	}
+
+	prev_end = rec_2_get_prev_field_end_info(rec, n);
+
+	return(prev_end
+	       & ~(REC_2BYTE_SQL_NULL_MASK | REC_2BYTE_EXTERN_MASK));
+}
+
+/******************************************************//**
+Reads the offset of the start of a data field in an old-style record,
+dispatching on the 1-byte/2-byte offsets flag.  The start of an SQL
+null field is the end offset of the previous non-null field, or 0, if
+none exists.  If n is the number of the last field + 1, then the end
+offset of the last field is returned.
+@return	offset of the start of the field */
+UNIV_INLINE
+ulint
+rec_get_field_start_offs(
+/*=====================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: field index */
+{
+	ut_ad(rec);
+	ut_ad(n <= rec_get_n_fields_old(rec));
+
+	if (n == 0) {
+		/* The first field always starts at the origin. */
+
+		return(0);
+	}
+
+	return(rec_get_1byte_offs_flag(rec)
+	       ? rec_1_get_field_start_offs(rec, n)
+	       : rec_2_get_field_start_offs(rec, n));
+}
+
+/************************************************************//**
+Computes the physical size of a field in an old-style record as the
+distance between its start offset and that of the following field.
+Also an SQL null may have a field of size > 0, if the data type is of
+a fixed size.
+@return	field size in bytes */
+UNIV_INLINE
+ulint
+rec_get_nth_field_size(
+/*===================*/
+	const rec_t*	rec,	/*!< in: record */
+	ulint		n)	/*!< in: index of the field */
+{
+	ulint	start	= rec_get_field_start_offs(rec, n);
+	ulint	end	= rec_get_field_start_offs(rec, n + 1);
+	ulint	size	= end - start;
+
+	ut_ad(size < UNIV_PAGE_SIZE);
+
+	return(size);
+}
+
+/***********************************************************//**
+This is used to modify the value of an already existing field in a record.
+The previous value must have exactly the same size as the new value. If len
+is UNIV_SQL_NULL then the field is treated as an SQL null.
+For records in ROW_FORMAT=COMPACT (new-style records), len must not be
+UNIV_SQL_NULL unless the field already is SQL null. */
+UNIV_INLINE
+void
+rec_set_nth_field(
+/*==============*/
+	rec_t*		rec,	/*!< in: record */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n,	/*!< in: index number of the field */
+	const void*	data,	/*!< in: pointer to the data
+				if not SQL null */
+	ulint		len)	/*!< in: length of the data or UNIV_SQL_NULL */
+{
+	byte*	data2;
+	ulint	len2;
+
+	ut_ad(rec);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+		/* Setting the field to SQL NULL: nothing to do if it is
+		already NULL; otherwise only allowed for old-style
+		records (enforced by ut_a below). */
+		if (!rec_offs_nth_sql_null(offsets, n)) {
+			ut_a(!rec_offs_comp(offsets));
+			rec_set_nth_field_sql_null(rec, n);
+		}
+
+		return;
+	}
+
+	/* data2 is where the new value is copied to; len2 is the current
+	length of the field, or UNIV_SQL_NULL. */
+	data2 = rec_get_nth_field(rec, offsets, n, &len2);
+	if (len2 == UNIV_SQL_NULL) {
+		/* The field is currently SQL NULL and gets a value: only
+		expected for old-style records.  The null bit is cleared
+		first, then the new length is checked against the size
+		of the field in the record. */
+		ut_ad(!rec_offs_comp(offsets));
+		rec_set_nth_field_null_bit(rec, n, FALSE);
+		ut_ad(len == rec_get_nth_field_size(rec, n));
+	} else {
+		/* The new value must have the same length as the old. */
+		ut_ad(len2 == len);
+	}
+
+	ut_memcpy(data2, data, len);
+}
+
+/**********************************************************//**
+Computes the data size of an old-style physical record, that is the
+sum of field lengths.  SQL null fields are counted as length 0 fields.
+The result is the distance from record origin to record end in bytes,
+obtained as the start offset of the position one past the last field.
+@return	size */
+UNIV_INLINE
+ulint
+rec_get_data_size_old(
+/*==================*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	ulint	n_fields;
+
+	ut_ad(rec);
+
+	n_fields = rec_get_n_fields_old(rec);
+
+	return(rec_get_field_start_offs(rec, n_fields));
+}
+
+/**********************************************************//**
+Records the number of fields in the second element of an offsets
+array. */
+UNIV_INLINE
+void
+rec_offs_set_n_fields(
+/*==================*/
+	ulint*	offsets,	/*!< in/out: array returned by
+				rec_get_offsets() */
+	ulint	n_fields)	/*!< in: number of fields */
+{
+	/* Debug checks: valid array, count in 1..REC_MAX_N_FIELDS, and
+	count plus header fits in the allocated array. */
+	ut_ad(offsets);
+	ut_ad(n_fields > 0);
+	ut_ad(n_fields <= REC_MAX_N_FIELDS);
+	ut_ad(n_fields + REC_OFFS_HEADER_SIZE
+	      <= rec_offs_get_n_alloc(offsets));
+
+	offsets[1] = n_fields;
+}
+
+/**********************************************************//**
+Reads the data size of a physical record from an offsets array, that
+is the sum of field lengths.  SQL null fields are counted as length 0
+fields.  The result is the distance from record origin to record end
+in bytes, taken from the end offset of the last field.
+@return	size */
+UNIV_INLINE
+ulint
+rec_offs_data_size(
+/*===============*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n_fields;
+	ulint	data_size;
+
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+
+	n_fields = rec_offs_n_fields(offsets);
+	data_size = rec_offs_base(offsets)[n_fields] & REC_OFFS_MASK;
+
+	ut_ad(data_size < UNIV_PAGE_SIZE);
+
+	return(data_size);
+}
+
+/**********************************************************//**
+Returns the total size of a record minus its data size, that is, the
+distance in bytes from record start to record origin.
+@return	extra size in bytes */
+UNIV_INLINE
+ulint
+rec_offs_extra_size(
+/*================*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	extra;
+	ut_ad(rec_offs_validate(NULL, NULL, offsets));
+	extra = *rec_offs_base(offsets);
+	extra &= ~(REC_OFFS_COMPACT | REC_OFFS_EXTERNAL);
+	ut_ad(extra < UNIV_PAGE_SIZE);
+	return(extra);
+}
+
+/**********************************************************//**
+Returns the total size of a physical record: data size plus extra size.
+@return	size in bytes */
+UNIV_INLINE
+ulint
+rec_offs_size(
+/*==========*/
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	return(rec_offs_data_size(offsets) + rec_offs_extra_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the end of the record: rec plus its data size.
+@return	pointer to end */
+UNIV_INLINE
+byte*
+rec_get_end(
+/*========*/
+	rec_t*		rec,	/*!< in: pointer to record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(rec + rec_offs_data_size(offsets));
+}
+
+/**********************************************************//**
+Returns a pointer to the start of the record: rec minus its extra size.
+@return	pointer to start */
+UNIV_INLINE
+byte*
+rec_get_start(
+/*==========*/
+	rec_t*		rec,	/*!< in: pointer to record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(rec - rec_offs_extra_size(offsets));
+}
+
+/***************************************************************//**
+Copies a physical record, extra bytes included, to a buffer.
+@return	pointer to the origin of the copy */
+UNIV_INLINE
+rec_t*
+rec_copy(
+/*=====*/
+	void*		buf,	/*!< out: buffer */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	prefix;
+	ulint	total;
+
+	ut_ad(rec && buf);
+	ut_ad(rec_offs_validate((rec_t*) rec, NULL, offsets));
+	ut_ad(rec_validate(rec, offsets));
+
+	prefix = rec_offs_extra_size(offsets);
+	total = prefix + rec_offs_data_size(offsets);
+
+	ut_memcpy(buf, rec - prefix, total);
+
+	return((byte*)buf + prefix);
+}
+
+/**********************************************************//**
+Returns the extra size of an old-style physical record, given its data
+size, number of fields and number of externally stored columns.
+@return	extra size */
+UNIV_INLINE
+ulint
+rec_get_converted_extra_size(
+/*=========================*/
+	ulint	data_size,	/*!< in: data size */
+	ulint	n_fields,	/*!< in: number of fields */
+	ulint	n_ext)		/*!< in: number of externally stored columns */
+{
+	ulint	per_field = 2;	/* extra bytes added per field */
+
+	if (n_ext == 0 && data_size <= REC_1BYTE_OFFS_LIMIT) {
+		per_field = 1;
+	}
+	return(REC_N_OLD_EXTRA_BYTES + per_field * n_fields);
+}
+
+/**********************************************************//**
+Returns the size of a data tuple when converted to a physical record:
+compact tables use rec_get_converted_size_comp(), others data + extra size.
+@return	size */
+UNIV_INLINE
+ulint
+rec_get_converted_size(
+/*===================*/
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	ulint	data_size;
+	ulint	extra_size;
+
+	ut_ad(index);
+	ut_ad(dtuple);
+	ut_ad(dtuple_check_typed(dtuple));
+
+	ut_ad(index->type & DICT_UNIVERSAL
+	      || dtuple_get_n_fields(dtuple)
+	      == (((dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK)
+		   == REC_STATUS_NODE_PTR)
+		  ? dict_index_get_n_unique_in_tree(index) + 1
+		  : dict_index_get_n_fields(index)));
+
+	if (dict_table_is_comp(index->table)) {
+		return(rec_get_converted_size_comp(index,
+						   dtuple_get_info_bits(dtuple)
+						   & REC_NEW_STATUS_MASK,
+						   dtuple->fields,
+						   dtuple->n_fields, NULL));
+	}
+
+	data_size = dtuple_get_data_size(dtuple, 0);
+
+	extra_size = rec_get_converted_extra_size(
+		data_size, dtuple_get_n_fields(dtuple), n_ext);
+
+	return(data_size + extra_size);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Folds a prefix of a physical record to a ulint. Only fields that exist
+in the record are folded, so the fold never runs past the record.
+@return	the folded value */
+UNIV_INLINE
+ulint
+rec_fold(
+/*=====*/
+	const rec_t*	rec,		/*!< in: the physical record */
+	const ulint*	offsets,	/*!< in: array returned by
+					rec_get_offsets() */
+	ulint		n_fields,	/*!< in: number of complete
+					fields to fold */
+	ulint		n_bytes,	/*!< in: number of bytes to fold
+					in an incomplete last field */
+	dulint		tree_id)	/*!< in: index tree id */
+{
+	ulint		pos;
+	const byte*	field;
+	ulint		field_len;
+	ulint		hash;
+	ulint		n_rec;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_validate(rec, offsets));
+	ut_ad(n_fields + n_bytes > 0);
+
+	n_rec = rec_offs_n_fields(offsets);
+	ut_ad(n_fields <= n_rec);
+	ut_ad(n_fields < n_rec || n_bytes == 0);
+
+	/* Never fold past the fields that exist in the record; when
+	all of them are folded there is no partial field left. */
+	if (n_fields >= n_rec) {
+		n_fields = n_rec;
+		n_bytes = 0;
+	}
+
+	hash = ut_fold_dulint(tree_id);
+
+	/* Fold the complete fields; SQL NULL fields are skipped. */
+	for (pos = 0; pos < n_fields; pos++) {
+		field = rec_get_nth_field(rec, offsets, pos, &field_len);
+
+		if (field_len == UNIV_SQL_NULL) {
+			continue;
+		}
+		hash = ut_fold_ulint_pair(hash,
+					  ut_fold_binary(field, field_len));
+	}
+
+	if (n_bytes == 0) {
+		return(hash);
+	}
+
+	/* Fold at most n_bytes of the field after the last complete one. */
+	field = rec_get_nth_field(rec, offsets, pos, &field_len);
+
+	if (field_len != UNIV_SQL_NULL) {
+		hash = ut_fold_ulint_pair(hash, ut_fold_binary(
+			field, field_len > n_bytes ? n_bytes : field_len));
+	}
+
+	return(hash);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h
new file mode 100644
index 00000000000..8b84d4af233
--- /dev/null
+++ b/storage/xtradb/include/rem0types.h
@@ -0,0 +1,46 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file include/rem0types.h
+Record manager global types
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifndef rem0types_h
+#define rem0types_h
+
+/* We define the physical record simply as an array of bytes */
+typedef byte	rec_t;
+
+/* Maximum values for various fields (for non-blob tuples) */
+#define REC_MAX_N_FIELDS	(1024 - 1)	/* = 1023 */
+#define REC_MAX_HEAP_NO		(2 * 8192 - 1)	/* = 16383 */
+#define REC_MAX_N_OWNED		(16 - 1)	/* = 15 */
+
+/* REC_MAX_INDEX_COL_LEN is measured in bytes and is the maximum
+indexed column length (or indexed prefix length). It is set to
+3 * 256 = 768, so that one can create a column prefix index on 256
+characters of a TEXT or VARCHAR column also in the UTF-8 charset.
+In that charset, a character may take at most 3 bytes.
+This constant MUST NOT BE CHANGED, or the compatibility of InnoDB data
+files would be at risk! */
+#define REC_MAX_INDEX_COL_LEN	768
+
+#endif
diff --git a/storage/xtradb/include/row0ext.h b/storage/xtradb/include/row0ext.h
new file mode 100644
index 00000000000..43d82d644e6
--- /dev/null
+++ b/storage/xtradb/include/row0ext.h
@@ -0,0 +1,95 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.h
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#ifndef row0ext_h
+#define row0ext_h
+
+#include "univ.i"
+#include "row0types.h"
+#include "data0types.h"
+#include "mem0mem.h"
+
+/********************************************************************//**
+Creates a cache of column prefixes of externally stored columns.
+@return	own: column prefix cache */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	const ulint*	ext,	/*!< in: col_no's of externally stored columns
+				in the InnoDB table object, as reported by
+				dict_col_get_no(); NOT relative to the records
+				in the clustered index */
+	const dtuple_t*	tuple,	/*!< in: data tuple containing the field
+				references of the externally stored
+				columns; must be indexed by col_no;
+				the clustered index record must be
+				covered by a lock or a page latch
+				to prevent deletion (rollback or purge). */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	mem_heap_t*	heap);	/*!< in: heap where created */
+
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or pointer to field_ref_zero if no prefix is
+cached for the column (the stored length is 0) */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			i,	/*!< in: index of ext->ext[] */
+	ulint*			len);	/*!< out: length of prefix, in bytes,
+					at most REC_MAX_INDEX_COL_LEN */
+/********************************************************************//**
+Looks up a column prefix of an externally stored column.
+@return column prefix, or NULL if the column is not stored externally,
+or pointer to field_ref_zero if the BLOB pointer is unset */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			col,	/*!< in: column number in the InnoDB
+					table object, as reported by
+					dict_col_get_no(); NOT relative to the
+					records in the clustered index */
+	ulint*			len);	/*!< out: length of prefix, in bytes,
+					at most REC_MAX_INDEX_COL_LEN */
+
+/** Prefixes of externally stored columns */
+struct row_ext_struct{
+	ulint		n_ext;	/*!< number of externally stored columns */
+	const ulint*	ext;	/*!< col_no's of externally stored columns */
+	byte*		buf;	/*!< backing store of the column prefix cache; prefix i starts at buf + i * REC_MAX_INDEX_COL_LEN */
+	ulint		len[1];	/*!< prefix lengths, indexed by i < n_ext; 0 if not cached */
+};
+
+#ifndef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0ext.ic b/storage/xtradb/include/row0ext.ic
new file mode 100644
index 00000000000..82771a9312a
--- /dev/null
+++ b/storage/xtradb/include/row0ext.ic
@@ -0,0 +1,84 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ext.ic
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "rem0types.h"
+#include "btr0types.h"
+
+/********************************************************************//**
+Looks up the cached prefix of the i'th externally stored column.
+@return column prefix, or pointer to field_ref_zero if no prefix is
+cached for the column (the stored length is 0) */
+UNIV_INLINE
+const byte*
+row_ext_lookup_ith(
+/*===============*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			i,	/*!< in: index of ext->ext[] */
+	ulint*			len)	/*!< out: length of prefix, in bytes,
+					at most REC_MAX_INDEX_COL_LEN */
+{
+	ulint	cached_len;
+
+	ut_ad(ext);
+	ut_ad(len);
+	ut_ad(i < ext->n_ext);
+
+	cached_len = ext->len[i];
+	*len = cached_len;
+
+	/* A zero length means the BLOB could not be fetched to the cache. */
+	return(UNIV_UNLIKELY(cached_len == 0)
+	       ? field_ref_zero : ext->buf + i * REC_MAX_INDEX_COL_LEN);
+}
+
+/********************************************************************//**
+Looks up the cached prefix of an externally stored column by column number.
+@return column prefix, or NULL if col is not listed in ext->ext[],
+or pointer to field_ref_zero if no prefix is cached for the column */
+UNIV_INLINE
+const byte*
+row_ext_lookup(
+/*===========*/
+	const row_ext_t*	ext,	/*!< in: column prefix cache */
+	ulint			col,	/*!< in: column number in the InnoDB
+					table object, as reported by
+					dict_col_get_no(); NOT relative to the
+					records in the clustered index */
+	ulint*			len)	/*!< out: length of prefix, in bytes,
+					at most REC_MAX_INDEX_COL_LEN */
+{
+	ulint	pos = 0;
+
+	ut_ad(ext);
+	ut_ad(len);
+
+	/* Linear search for col among the cached column numbers. */
+	while (pos < ext->n_ext && ext->ext[pos] != col) {
+		pos++;
+	}
+
+	/* pos == ext->n_ext means col is not in ext->ext[]. */
+	return(pos < ext->n_ext ? row_ext_lookup_ith(ext, pos, len) : NULL);
+}
diff --git a/storage/xtradb/include/row0ins.h b/storage/xtradb/include/row0ins.h
new file mode 100644
index 00000000000..9f93565ddb7
--- /dev/null
+++ b/storage/xtradb/include/row0ins.h
@@ -0,0 +1,156 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.h
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0ins_h
+#define row0ins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_foreign_key_check_lock.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_NO_REFERENCED_ROW, or
+DB_ROW_IS_REFERENCED */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+	ibool		check_ref,/*!< in: TRUE if we want to check that
+				the referenced table is ok, FALSE if we
+				want to check the foreign key table */
+	dict_foreign_t*	foreign,/*!< in: foreign constraint; NOTE that the
+				tables mentioned in it must be in the
+				dictionary cache if they exist at all */
+	dict_table_t*	table,	/*!< in: if check_ref is TRUE, then the foreign
+				table, else the referenced table */
+	dtuple_t*	entry,	/*!< in: index entry for index */
+	que_thr_t*	thr);	/*!< in: query thread */
+/*********************************************************************//**
+Creates an insert node struct.
+@return	own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+	ulint		ins_type,	/*!< in: INS_VALUES, ... */
+	dict_table_t*	table,		/*!< in: table where to insert */
+	mem_heap_t*	heap);		/*!< in: mem heap where created */
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+	ins_node_t*	node,	/*!< in: insert node */
+	dtuple_t*	row);	/*!< in: new row (or first row) for the node */
+/***************************************************************//**
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	ibool		foreign,/*!< in: TRUE=check foreign key constraints */
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in
+SQL execution graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/***********************************************************//**
+Creates an entry template for each index of a table. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+	ins_node_t*	node);	/*!< in: row insert node */
+
+/** Insert node structure */
+
+struct ins_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_INSERT */
+	ulint		ins_type;/*!< INS_VALUES, INS_SEARCHED, or INS_DIRECT */
+	dtuple_t*	row;	/*!< row to insert */
+	dict_table_t*	table;	/*!< table where to insert */
+	sel_node_t*	select;	/*!< select in searched insert */
+	que_node_t*	values_list;/*!< list of expressions to evaluate and
+				insert in an INS_VALUES insert */
+	ulint		state;	/*!< node execution state (INS_NODE_...) */
+	dict_index_t*	index;	/*!< NULL, or the next index where the index
+				entry should be inserted */
+	dtuple_t*	entry;	/*!< NULL, or entry to insert in the index;
+				after a successful insert of the entry,
+				this should be reset to NULL */
+	UT_LIST_BASE_NODE_T(dtuple_t)
+			entry_list;/*!< list of entries, one for each index */
+	byte*		row_id_buf;/*!< buffer for the row id sys field in row */
+	trx_id_t	trx_id;	/*!< trx id of the last trx which executed the
+				node */
+	byte*		trx_id_buf;/*!< buffer for the trx id sys field in row */
+	mem_heap_t*	entry_sys_heap;
+				/*!< memory heap used as auxiliary storage;
+				entry_list and sys fields are stored here;
+				if this is NULL, entry list should be created
+				and buffers for sys fields in row allocated */
+	ulint		magic_n;/*!< presumably INS_NODE_MAGIC_N; TODO confirm */
+};
+
+#define	INS_NODE_MAGIC_N	15849075
+
+/* Insert node types */
+#define INS_SEARCHED	0	/* INSERT INTO ... SELECT ... */
+#define INS_VALUES	1	/* INSERT INTO ... VALUES ... */
+#define INS_DIRECT	2	/* this is for internal use in dict0crea:
+				insert the row directly */
+
+/* Node execution states */
+#define	INS_NODE_SET_IX_LOCK	1	/* we should set an IX lock on table */
+#define INS_NODE_ALLOC_ROW_ID	2	/* row id should be allocated */
+#define	INS_NODE_INSERT_ENTRIES 3	/* index entries should be built and
+					inserted */
+
+#ifndef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0ins.ic b/storage/xtradb/include/row0ins.ic
new file mode 100644
index 00000000000..84f6da255bf
--- /dev/null
+++ b/storage/xtradb/include/row0ins.ic
@@ -0,0 +1,26 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0ins.ic
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+
diff --git a/storage/xtradb/include/row0merge.h b/storage/xtradb/include/row0merge.h
new file mode 100644
index 00000000000..62a5efd11f7
--- /dev/null
+++ b/storage/xtradb/include/row0merge.h
@@ -0,0 +1,197 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0merge.h
+Index build routines using a merge sort
+
+Created 13/06/2005 Jan Lindstrom
+*******************************************************/
+
+#ifndef row0merge_h
+#define row0merge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "rem0rec.h"
+#include "read0types.h"
+#include "btr0types.h"
+#include "row0mysql.h"
+#include "lock0types.h"
+
+/** Index field definition */
+struct merge_index_field_struct {
+	ulint		prefix_len;	/*!< column prefix length, or 0
+					if indexing the whole column */
+	const char*	field_name;	/*!< field name */
+};
+
+/** Index field definition */
+typedef struct merge_index_field_struct merge_index_field_t;
+
+/** Definition of an index being created */
+struct merge_index_def_struct {
+	const char*		name;		/*!< index name */
+	ulint			ind_type;	/*!< 0, DICT_UNIQUE,
+						or DICT_CLUSTERED */
+	ulint			n_fields;	/*!< number of fields
+						in index */
+	merge_index_field_t*	fields;		/*!< field definitions */
+};
+
+/** Definition of an index being created */
+typedef struct merge_index_def_struct merge_index_def_t;
+
+/*********************************************************************//**
+Sets a lock of the given mode on a table, for the duration of creating indexes.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode);		/*!< in: LOCK_X or LOCK_S */
+/*********************************************************************//**
+Drop an index from the InnoDB system tables.  The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+	dict_index_t*	index,	/*!< in: index to be removed */
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx);	/*!< in: transaction handle */
+/*********************************************************************//**
+Drop those indexes which were created before an error occurred when
+building an index.  The data dictionary must have been locked
+exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table,		/*!< in: table containing the indexes */
+	dict_index_t**	index,		/*!< in: indexes to drop */
+	ulint		num_created);	/*!< in: number of elements in index[] */
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void);
+/*=============================*/
+/*********************************************************************//**
+Rename the tables in the data dictionary.  The data dictionary must
+have been locked exclusively by the caller, because the transaction
+will not be committed.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
+					tmp_name */
+	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
+					old_table->name */
+	const char*	tmp_name,	/*!< in: new name for old_table */
+	trx_t*		trx);		/*!< in: transaction handle */
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key, using the definition
+of an existing table.
+@return	table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+	const char*		table_name,	/*!< in: new table name */
+	const merge_index_def_t*index_def,	/*!< in: the index definition
+						of the primary key */
+	const dict_table_t*	table,		/*!< in: old table definition */
+	trx_t*			trx);		/*!< in/out: transaction
+						(sets error_state) */
+/*********************************************************************//**
+Rename the temporary indexes in the dictionary to permanent ones.  The
+data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed.
+@return	DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table);		/*!< in/out: table with new indexes */
+/*********************************************************************//**
+Create the index and load it into the dictionary.
+@return	index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
+	dict_table_t*		table,	/*!< in: the index is on this table */
+	const merge_index_def_t*index_def);
+					/*!< in: the index definition */
+/*********************************************************************//**
+Check if a transaction can use an index.
+@return	TRUE if index can be used by the transaction else FALSE */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index);	/*!< in: index to check */
+/*********************************************************************//**
+If there are views that refer to the old table name then we "attach" to
+the new instance of the table else we drop it immediately.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table);		/*!< in: table instance to drop */
+
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index,
+creating a temporary file containing index entries, merge sorting
+these index entries and inserting sorted index entries to indexes.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	old_table,	/*!< in: table where rows are
+					read from */
+	dict_table_t*	new_table,	/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	dict_index_t**	indexes,	/*!< in: indexes to be created */
+	ulint		n_indexes,	/*!< in: size of indexes[] */
+	TABLE*		table);		/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+#endif /* row0merge.h */
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
new file mode 100644
index 00000000000..9090e476bfd
--- /dev/null
+++ b/storage/xtradb/include/row0mysql.h
@@ -0,0 +1,807 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.h
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0mysql_h
+#define row0mysql_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "btr0pcur.h"
+#include "trx0types.h"
+
+extern ibool row_rollback_on_timeout;
+
+typedef struct row_prebuilt_struct row_prebuilt_t;
+
+/*******************************************************************//**
+Frees the blob heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct of a
+					ha_innobase:: table handle */
+/*******************************************************************//**
+Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
+format.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+	byte*	dest,	/*!< in: where to store */
+	ulint	len,	/*!< in: length, must fit in two bytes */
+	ulint	lenlen);/*!< in: storage length of len: either 1 or 2 bytes */
+/*******************************************************************//**
+Reads a >= 5.0.3 format true VARCHAR length, in the MySQL row format, and
+returns a pointer to the data.
+@return pointer to the data, we skip the 1 or 2 bytes at the start
+that are used to store the len */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+	ulint*		len,	/*!< out: variable-length field length */
+	const byte*	field,	/*!< in: field in the MySQL format */
+	ulint		lenlen);/*!< in: storage length of len: either 1
+				or 2 bytes */
+/*******************************************************************//**
+Stores a reference to a BLOB in the MySQL format. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+	byte*		dest,	/*!< in: where to store */
+	ulint		col_len,/*!< in: dest buffer size: determines into
+				how many bytes the BLOB length is stored,
+				the space for the length may vary from 1
+				to 4 bytes */
+	const void*	data,	/*!< in: BLOB data; if the value to store
+				is SQL NULL this should be NULL pointer */
+	ulint		len);	/*!< in: BLOB length; if the value to store
+				is SQL NULL this should be 0; remember
+				also to set the NULL bit in the MySQL record
+				header! */
+/*******************************************************************//**
+Reads a reference to a BLOB in the MySQL format.
+@return	pointer to BLOB data */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+	ulint*		len,		/*!< out: BLOB length */
+	const byte*	ref,		/*!< in: BLOB reference in the
+					MySQL format */
+	ulint		col_len);	/*!< in: BLOB reference length
+					(not BLOB length) */
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return	up to which byte we used buf in the conversion */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+	dfield_t*	dfield,		/*!< in/out: dfield where dtype
+					information must be already set when
+					this function is called! */
+	byte*		buf,		/*!< in/out: buffer for a converted
+					integer value; this must be at least
+					col_len long then! */
+	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
+					a MySQL row, FALSE if from a MySQL
+					key value;
+					in MySQL, a true VARCHAR storage
+					format differs in a row and in a
+					key value: in a key value the length
+					is always stored in 2 bytes! */
+	const byte*	mysql_data,	/*!< in: MySQL column value, not
+					SQL NULL; NOTE that dfield may also
+					get a pointer to mysql_data,
+					therefore do not discard this as long
+					as dfield is used! */
+	ulint		col_len,	/*!< in: MySQL column length; NOTE that
+					this is the storage length of the
+					column in the MySQL format row, not
+					necessarily the length of the actual
+					payload data; if the column is a true
+					VARCHAR then this is irrelevant */
+	ulint		comp);		/*!< in: nonzero=compact format */
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+	ulint*		new_err,/*!< out: possible new error encountered in
+				rollback, or the old error that was in
+				effect at function entry */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_savept_t*	savept);/*!< in: savepoint */
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return	own: a prebuilt struct */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+	dict_table_t*	table);	/*!< in: Innobase table handle */
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+	row_prebuilt_t*	prebuilt,	/*!< in, own: prebuilt struct */
+	ibool		dict_locked);	/*!< in: TRUE=data dictionary locked */
+/*********************************************************************//**
+Updates the transaction pointers in query graphs stored in the prebuilt
+struct. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct
+					in MySQL handle */
+	trx_t*		trx);		/*!< in: transaction handle */
+/*********************************************************************//**
+Unlocks AUTO_INC type locks that were possibly reserved by a trx. This
+function should be called at the end of an SQL statement, by the
+connection thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+	trx_t*	trx);			/*!< in/out: transaction */
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in the MySQL
+					table handle */
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct in the MySQL
+					table handle */
+	dict_table_t*	table,		/*!< in: table to lock, or NULL
+					if prebuilt->table should be
+					locked as
+					prebuilt->select_lock_type */
+	ulint		mode);		/*!< in: lock mode of table
+					(ignored if table==NULL) */
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+	byte*		mysql_rec,	/*!< in: row in the MySQL format */
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/*********************************************************************//**
+Builds a dummy query graph used in selects. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/*********************************************************************//**
+Gets pointer to a prebuilt update vector used in updates. If the update
+graph has not yet been built in the prebuilt struct, then this function
+first builds it.
+@return	prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id).
+@return	TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+	const dict_table_t*	table);	/*!< in: table */
+/*********************************************************************//**
+Does an update or delete of a row for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+	byte*		mysql_rec,	/*!< in: the row to be updated, in
+					the MySQL format */
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct in MySQL
+					handle */
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur.  Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+@return error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct in MySQL
+					handle */
+	ibool		has_latches_on_recs);/*!< in: TRUE if called
+					so that we have the latches on
+					the records under pcur and
+					clust_pcur, and we do not need
+					to reposition the cursors. */
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface.
+@return	own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+	dict_table_t*	table,	/*!< in: table to update */
+	mem_heap_t*	heap);	/*!< in: mem heap from which allocated */
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	upd_node_t*	node,	/*!< in: update node used in the cascade
+				or set null operation */
+	dict_table_t*	table);	/*!< in: table where we do the operation */
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	file,	/*!< in: file name */
+	ulint		line);	/*!< in: line number */
+#define row_mysql_lock_data_dictionary(trx)				\
+	row_mysql_lock_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	file,	/*!< in: file name */
+	ulint		line);	/*!< in: line number */
+#define row_mysql_freeze_data_dictionary(trx)				\
+	row_mysql_freeze_data_dictionary_func(trx, __FILE__, __LINE__)
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+	dict_table_t*	table,		/*!< in, own: table definition
+					(will be freed) */
+	trx_t*		trx);		/*!< in: transaction handle */
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+@return	error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+	dict_index_t*	index,		/*!< in, own: index definition
+					(will be freed) */
+	trx_t*		trx,		/*!< in: transaction handle */
+	const ulint*	field_lengths); /*!< in: if not NULL, must contain
+					dict_index_get_n_fields(index)
+					actual field lengths for the
+					index columns, which are
+					then checked for not being too
+					large. */
+/*********************************************************************//**
+NOTE(review): undocumented; presumably stores statistics of an index. */
+UNIV_INTERN
+int
+row_insert_stats_for_mysql(
+/*=======================*/
+	dict_index_t*	index,	/*!< index (role undocumented; confirm) */
+	trx_t*		trx);	/*!< transaction handle */
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+				FOREIGN KEY (a, b) REFERENCES table2(c, d),
+					table2 can be written also with the
+					database name before it: test.table2 */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks);	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+@return	how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void);
+/*=========================================*/
+/*********************************************************************//**
+Get the background drop list length. NOTE: the caller must own the kernel
+mutex!
+@return	how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void);
+/*======================================*/
+/*********************************************************************//**
+Truncates a table for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+	dict_table_t*	table,	/*!< in: table handle */
+	trx_t*		trx);	/*!< in: transaction handle */
+/*********************************************************************//**
+Drops a table for MySQL.  If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread.  If the data dictionary was not already locked
+by the transaction, the transaction will be committed.  Otherwise, the
+data dictionary will remain locked.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx,	/*!< in: transaction handle */
+	ibool		drop_db);/*!< in: TRUE=dropping whole database */
+/*********************************************************************//**
+Drop all temporary tables during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void);
+/*============================*/
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flag table->ibd_file_missing is set TRUE.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx);	/*!< in: transaction handle */
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx);	/*!< in: transaction handle */
+/*********************************************************************//**
+Drops a database for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends in '/' */
+	trx_t*		trx);	/*!< in: transaction handle */
+/*********************************************************************//**
+Renames a table for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+	const char*	old_name,	/*!< in: old table name */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx,		/*!< in: transaction handle */
+	ibool		commit);	/*!< in: if TRUE then commit trx */
+/*********************************************************************//**
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return	DB_SUCCESS if ok */
+UNIV_INTERN
+ulint
+row_check_index_for_mysql(
+/*======================*/
+	row_prebuilt_t*		prebuilt,	/*!< in: prebuilt struct
+						in MySQL handle */
+	const dict_index_t*	index,		/*!< in: index */
+	ulint*			n_rows);	/*!< out: number of entries
+						seen in the consistent read */
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return	TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+	const char*	table_name);	/*!< in: name of the table, in the
+					form database/table_name */
+
+/* A struct describing a place for an individual column in the MySQL
+row format which is presented to the table handler in ha_innobase.
+This template struct is used to speed up row transformations between
+Innobase and MySQL. */
+
+typedef struct mysql_row_templ_struct mysql_row_templ_t;
+struct mysql_row_templ_struct {
+	ulint	col_no;			/*!< column number of the column */
+	ulint	rec_field_no;		/*!< field number of the column in an
+					Innobase record in the current index;
+					not defined if template_type is
+					ROW_MYSQL_WHOLE_ROW */
+	ulint	mysql_col_offset;	/*!< offset of the column in the MySQL
+					row format */
+	ulint	mysql_col_len;		/*!< length of the column in the MySQL
+					row format */
+	ulint	mysql_null_byte_offset;	/*!< MySQL NULL bit byte offset in a
+					MySQL record */
+	ulint	mysql_null_bit_mask;	/*!< bit mask to get the NULL bit,
+					zero if column cannot be NULL */
+	ulint	type;			/*!< column type in Innobase mtype
+					numbers DATA_CHAR... */
+	ulint	mysql_type;		/*!< MySQL type code; this is always
+					< 256 */
+	ulint	mysql_length_bytes;	/*!< if mysql_type
+					== DATA_MYSQL_TRUE_VARCHAR, this tells
+					whether we should use 1 or 2 bytes to
+					store the MySQL true VARCHAR data
+					length at the start of row in the MySQL
+					format (NOTE that the MySQL key value
+					format always uses 2 bytes for the data
+					len) */
+	ulint	charset;		/*!< MySQL charset-collation code
+					of the column, or zero */
+	ulint	mbminlen;		/*!< minimum length of a char, in bytes,
+					or zero if not a char type */
+	ulint	mbmaxlen;		/*!< maximum length of a char, in bytes,
+					or zero if not a char type */
+	ulint	is_unsigned;		/*!< if a column type is an integer
+					type and this field is != 0, then
+					it is an unsigned integer type */
+};
+
+#define MYSQL_FETCH_CACHE_SIZE		8
+/* After fetching this many rows, we start caching them in fetch_cache */
+#define MYSQL_FETCH_CACHE_THRESHOLD	4
+
+#define ROW_PREBUILT_ALLOCATED	78540783
+#define ROW_PREBUILT_FREED	26423527
+
+typedef int (*index_cond_func_t)(void *param);
+/** A struct for (sometimes lazily) prebuilt structures in an Innobase table
+handle used within MySQL; these are used to save CPU time. */
+
+
+struct row_prebuilt_struct {
+	ulint		magic_n;	/*!< this magic number is set to
+					ROW_PREBUILT_ALLOCATED when created,
+					or ROW_PREBUILT_FREED when the
+					struct has been freed */
+	dict_table_t*	table;		/*!< Innobase table handle */
+	dict_index_t*	index;		/*!< current index for a search, if
+					any */
+	trx_t*		trx;		/*!< current transaction handle */
+	unsigned	sql_stat_start:1;/*!< TRUE when we start processing of
+					an SQL statement: we may have to set
+					an intention lock on the table,
+					create a consistent read view etc. */
+	unsigned	mysql_has_locked:1;/*!< this is set TRUE when MySQL
+					calls external_lock on this handle
+					with a lock flag, and set FALSE when
+					with the F_UNLOCK flag */
+	unsigned	clust_index_was_generated:1;
+					/*!< if the user did not define a
+					primary key in MySQL, then Innobase
+					automatically generated a clustered
+					index where the ordering column is
+					the row id: in this case this flag
+					is set to TRUE */
+	unsigned	index_usable:1;	/*!< caches the value of
+					row_merge_is_index_usable(trx,index) */
+	unsigned	read_just_key:1;/*!< set to 1 when MySQL calls
+					ha_innobase::extra with the
+					argument HA_EXTRA_KEYREAD; it is enough
+					to read just columns defined in
+					the index (i.e., no read of the
+					clustered index record necessary) */
+	unsigned	used_in_HANDLER:1;/*!< TRUE if we have been using this
+					handle in a MySQL HANDLER low level
+					index cursor command: then we must
+					store the pcur position even in a
+					unique search from a clustered index,
+					because HANDLER allows NEXT and PREV
+					in such a situation */
+	unsigned	template_type:2;/*!< ROW_MYSQL_WHOLE_ROW,
+					ROW_MYSQL_REC_FIELDS,
+					ROW_MYSQL_DUMMY_TEMPLATE, or
+					ROW_MYSQL_NO_TEMPLATE */
+	unsigned	n_template:10;	/*!< number of elements in the
+					template */
+	unsigned	null_bitmap_len:10;/*!< number of bytes in the SQL NULL
+					bitmap at the start of a row in the
+					MySQL format */
+	unsigned	need_to_access_clustered:1; /*!< if we are fetching
+					columns through a secondary index
+					and at least one column is not in
+					the secondary index, then this is
+					set to TRUE */
+	unsigned	templ_contains_blob:1;/*!< TRUE if the template contains
+					a column with DATA_BLOB ==
+					get_innobase_type_from_mysql_type();
+					not to be confused with InnoDB
+					externally stored columns
+					(VARCHAR can be off-page too) */
+	mysql_row_templ_t* mysql_template;/*!< template used to transform
+					rows fast between MySQL and Innobase
+					formats; memory for this template
+					is not allocated from 'heap' */
+	mem_heap_t*	heap;		/*!< memory heap from which
+					these auxiliary structures are
+					allocated when needed */
+	ins_node_t*	ins_node;	/*!< Innobase SQL insert node
+					used to perform inserts
+					to the table */
+	byte*		ins_upd_rec_buff;/*!< buffer for storing data converted
+					to the Innobase format from the MySQL
+					format */
+	const byte*	default_rec;	/*!< the default values of all columns
+					(a "default row") in MySQL format */
+	ulint		hint_need_to_fetch_extra_cols;
+					/*!< normally this is set to 0; if this
+					is set to ROW_RETRIEVE_PRIMARY_KEY,
+					then we should at least retrieve all
+					columns in the primary key; if this
+					is set to ROW_RETRIEVE_ALL_COLS, then
+					we must retrieve all columns in the
+					key (if read_just_key == 1), or all
+					columns in the table */
+	upd_node_t*	upd_node;	/*!< Innobase SQL update node used
+					to perform updates and deletes */
+	que_fork_t*	ins_graph;	/*!< Innobase SQL query graph used
+					in inserts */
+	que_fork_t*	upd_graph;	/*!< Innobase SQL query graph used
+					in updates or deletes */
+	btr_pcur_t*	pcur;		/*!< persistent cursor used in selects
+					and updates */
+	btr_pcur_t*	clust_pcur;	/*!< persistent cursor used in
+					some selects and updates */
+	que_fork_t*	sel_graph;	/*!< dummy query graph used in
+					selects */
+	dtuple_t*	search_tuple;	/*!< prebuilt dtuple used in selects */
+	byte		row_id[DATA_ROW_ID_LEN];
+					/*!< if the clustered index was
+					generated, the row id of the
+					last row fetched is stored
+					here */
+	dtuple_t*	clust_ref;	/*!< prebuilt dtuple used in
+					sel/upd/del */
+	ulint		select_lock_type;/*!< LOCK_NONE, LOCK_S, or LOCK_X */
+	ulint		stored_select_lock_type;/*!< this field is used to
+					remember the original select_lock_type
+					that was decided in ha_innodb.cc,
+					::store_lock(), ::external_lock(),
+					etc. */
+	ulint		row_read_type;	/*!< ROW_READ_WITH_LOCKS if row locks
+					should be obtained for records
+					under an UPDATE or DELETE cursor.
+					If innodb_locks_unsafe_for_binlog
+					is TRUE, this can be set to
+					ROW_READ_TRY_SEMI_CONSISTENT, so that
+					if the row under an UPDATE or DELETE
+					cursor was locked by another
+					transaction, InnoDB will resort
+					to reading the last committed value
+					('semi-consistent read').  Then,
+					this field will be set to
+					ROW_READ_DID_SEMI_CONSISTENT to
+					indicate that.	If the row does not
+					match the WHERE condition, MySQL will
+					invoke handler::unlock_row() to
+					clear the flag back to
+					ROW_READ_TRY_SEMI_CONSISTENT and
+					to simply skip the row.	 If
+					the row matches, the next call to
+					row_search_for_mysql() will lock
+					the row.
+					This eliminates lock waits in some
+					cases; note that this breaks
+					serializability. */
+	ulint		new_rec_locks;	/*!< normally 0; if
+					srv_locks_unsafe_for_binlog is
+					TRUE or session is using READ
+					COMMITTED or READ UNCOMMITTED
+					isolation level, set in
+					row_search_for_mysql() if we set a new
+					record lock on the secondary
+					or clustered index; this is
+					used in row_unlock_for_mysql()
+					when releasing the lock under
+					the cursor if we determine
+					after retrieving the row that
+					it does not need to be locked
+					('mini-rollback') */
+	ulint		mysql_prefix_len;/*!< byte offset of the end of
+					the last requested column */
+	ulint		mysql_row_len;	/*!< length in bytes of a row in the
+					MySQL format */
+	ulint		n_rows_fetched;	/*!< number of rows fetched after
+					positioning the current cursor */
+	ulint		fetch_direction;/*!< ROW_SEL_NEXT or ROW_SEL_PREV */
+	byte*		fetch_cache[MYSQL_FETCH_CACHE_SIZE];
+					/*!< a cache for fetched rows if we
+					fetch many rows from the same cursor:
+					it saves CPU time to fetch them in a
+					batch; we reserve mysql_row_len
+					bytes for each such row; these
+					pointers point 4 bytes past the
+					allocated mem buf start, because
+					there is a 4 byte magic number at the
+					start and at the end */
+	ibool		keep_other_fields_on_keyread; /*!< when using fetch
+					cache with HA_EXTRA_KEYREAD, don't
+					overwrite other fields in the mysql
+					row buffer. */
+	ulint		fetch_cache_first;/*!< position of the first not yet
+					fetched row in fetch_cache */
+	ulint		n_fetch_cached;	/*!< number of not yet fetched rows
+					in fetch_cache */
+	mem_heap_t*	blob_heap;	/*!< in SELECTS BLOB fields are copied
+					to this heap */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
+					version is built in consistent read */
+	/*----------------------*/
+	ulonglong	autoinc_last_value;
+					/*!< last value of AUTO-INC interval */
+	ulonglong	autoinc_increment;/*!< The increment step of the auto
+					increment column. Value must be
+					greater than or equal to 1. Required to
+					calculate the next value */
+	ulonglong	autoinc_offset; /*!< The offset passed to
+					get_auto_increment() by MySQL. Required
+					to calculate the next value */
+	ulint		autoinc_error;	/*!< The actual error code encountered
+					while trying to init or read the
+					autoinc value from the table. We
+					store it here so that we can return
+					it to MySQL */
+	/*----------------------*/
+	ulint		magic_n2;	/*!< this should be the same as
+					magic_n */
+	/*----------------------*/
+        index_cond_func_t idx_cond_func;/*!< Index Condition Pushdown function,
+                                        or NULL if there is none set */
+        void*           idx_cond_func_arg;/*!< ICP function argument */
+        ulint           n_index_fields; /*!< Number of fields at the start of
+                                        mysql_template. Valid only when using
+                                        ICP. */
+	/*----------------------*/
+};
+
+#define ROW_PREBUILT_FETCH_MAGIC_N	465765687
+
+#define ROW_MYSQL_WHOLE_ROW	0
+#define ROW_MYSQL_REC_FIELDS	1
+#define ROW_MYSQL_NO_TEMPLATE	2
+#define ROW_MYSQL_DUMMY_TEMPLATE 3	/* dummy template used in
+					row_scan_and_check_index */
+
+/* Values for hint_need_to_fetch_extra_cols */
+#define ROW_RETRIEVE_PRIMARY_KEY	1
+#define ROW_RETRIEVE_ALL_COLS		2
+
+/* Values for row_read_type */
+#define ROW_READ_WITH_LOCKS		0
+#define ROW_READ_TRY_SEMI_CONSISTENT	1
+#define ROW_READ_DID_SEMI_CONSISTENT	2
+
+#ifndef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0mysql.ic b/storage/xtradb/include/row0mysql.ic
new file mode 100644
index 00000000000..35033aa2ad1
--- /dev/null
+++ b/storage/xtradb/include/row0mysql.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 2001, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0mysql.ic
+MySQL interface for Innobase
+
+Created 1/23/2001 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0purge.h b/storage/xtradb/include/row0purge.h
new file mode 100644
index 00000000000..89ec54fb54a
--- /dev/null
+++ b/storage/xtradb/include/row0purge.h
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0purge.h
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0purge_h
+#define row0purge_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a purge node to a query graph.
+@return	own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/** Purge node structure: a query graph node of type QUE_NODE_PURGE */
+
+struct purge_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_PURGE */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	roll_ptr_t	roll_ptr;/*!< roll pointer to undo log record */
+	trx_undo_rec_t*	undo_rec;/*!< undo log record */
+	trx_undo_inf_t*	reservation;/*!< reservation for the undo log record
+				in the purge array */
+	undo_no_t	undo_no;/*!< undo number of the record */
+	ulint		rec_type;/*!< undo log record type:
+				TRX_UNDO_INSERT_REC, ... */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	ibool		found_clust;/*!< TRUE if the clustered index record
+				determined by ref was found in the clustered
+				index, and we were able to position pcur on
+				it */
+	dict_table_t*	table;	/*!< table where purge is done */
+	ulint		cmpl_info;/*!< compiler analysis info of an update */
+	upd_t*		update;	/*!< update vector for a clustered index
+				record */
+	dtuple_t*	ref;	/*!< NULL, or row reference to the next row to
+				handle */
+	dtuple_t*	row;	/*!< NULL, or a copy (also fields copied to
+				heap) of the indexed fields of the row to
+				handle */
+	dict_index_t*	index;	/*!< NULL, or the next index whose record should
+				be handled */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
+				row; this must be emptied after a successful
+				purge of a row */
+};
+
+#ifndef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0purge.ic b/storage/xtradb/include/row0purge.ic
new file mode 100644
index 00000000000..23d7d3845a4
--- /dev/null
+++ b/storage/xtradb/include/row0purge.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+
+/**************************************************//**
+@file include/row0purge.ic
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0row.h b/storage/xtradb/include/row0row.h
new file mode 100644
index 00000000000..723b7b53395
--- /dev/null
+++ b/storage/xtradb/include/row0row.h
@@ -0,0 +1,310 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.h
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0row_h
+#define row0row_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "mtr0mtr.h"
+#include "rem0types.h"
+#include "read0types.h"
+#include "row0types.h"
+#include "btr0types.h"
+
+/*********************************************************************//**
+Gets the offset of the trx id field, in bytes relative to the origin of
+a clustered index record.
+@return	offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+	const rec_t*	rec,	/*!< in: record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the trx id field from a clustered index record.
+@return	value of the field */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+	const rec_t*	rec,	/*!< in: record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
+/*********************************************************************//**
+Reads the roll pointer field from a clustered index record.
+@return	value of the field */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+	const rec_t*	rec,	/*!< in: record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+	const dtuple_t*	row,	/*!< in: row which should be
+				inserted or purged */
+	row_ext_t*	ext,	/*!< in: externally stored column prefixes,
+				or NULL */
+	dict_index_t*	index,	/*!< in: index on the table */
+	mem_heap_t*	heap);	/*!< in: memory heap from which the memory for
+				the index entry is allocated */
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return	own: row built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+	ulint			type,	/*!< in: ROW_COPY_POINTERS or
+					ROW_COPY_DATA; the latter
+					copies also the data fields to
+					heap while the first only
+					places pointers to data fields
+					on the index page, and thus is
+					more efficient */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_t*		rec,	/*!< in: record in the clustered
+					index; NOTE: in the case
+					ROW_COPY_POINTERS the data
+					fields in the row will point
+					directly into this record,
+					therefore, the buffer page of
+					this record must be at least
+					s-latched and the latch held
+					as long as the row dtuple is used! */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec,index)
+					or NULL, in which case this function
+					will invoke rec_get_offsets() */
+	const dict_table_t*	col_table,
+					/*!< in: table, to check which
+					externally stored columns
+					occur in the ordering columns
+					of an index, or NULL if
+					index->table should be
+					consulted instead; the user
+					columns in this table should be
+					the same columns as in index->table */
+	row_ext_t**		ext,	/*!< out, own: cache of
+					externally stored column
+					prefixes, or NULL */
+	mem_heap_t*		heap);	/*!< in: memory heap from which
+					the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+	const rec_t*		rec,	/*!< in: record in the index */
+	const dict_index_t*	index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint*			n_ext,	/*!< out: number of externally
+					stored columns */
+	mem_heap_t*		heap);	/*!< in: memory heap from which
+					the memory needed is allocated */
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return	own: index entry built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+	ulint			type,	/*!< in: ROW_COPY_DATA, or
+					ROW_COPY_POINTERS: the former
+					copies also the data fields to
+					heap, whereas the latter only
+					places pointers to data fields
+					on the index page */
+	const rec_t*		rec,	/*!< in: record in the index;
+					NOTE: in the case
+					ROW_COPY_POINTERS the data
+					fields in the row will point
+					directly into this record,
+					therefore, the buffer page of
+					this record must be at least
+					s-latched and the latch held
+					as long as the dtuple is used! */
+	const dict_index_t*	index,	/*!< in: index */
+	ulint*			offsets,/*!< in/out: rec_get_offsets(rec) */
+	ulint*			n_ext,	/*!< out: number of externally
+					stored columns */
+	mem_heap_t*		heap);	/*!< in: memory heap from which
+					the memory needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return	own: row reference built; see the NOTE below! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+	ulint		type,	/*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former copies also the data fields to
+				heap, whereas the latter only places pointers
+				to data fields on the index page */
+	dict_index_t*	index,	/*!< in: secondary index */
+	const rec_t*	rec,	/*!< in: record in the index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row reference is used! */
+	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+				needed is allocated */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+	dtuple_t*		ref,	/*!< in/out: row reference built;
+					see the NOTE below! */
+	const rec_t*		rec,	/*!< in: record in the index;
+					NOTE: the data fields in ref
+					will point directly into this
+					record, therefore, the buffer
+					page of this record must be at
+					least s-latched and the latch
+					held as long as the row
+					reference is used! */
+	const dict_index_t*	index,	/*!< in: secondary index */
+	ulint*			offsets,/*!< in: rec_get_offsets(rec, index)
+					or NULL */
+	trx_t*			trx);	/*!< in: transaction */
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+	dtuple_t*	ref,	/*!< in/out: typed data tuple where the
+				reference is built */
+	const ulint*	map,	/*!< in: array of field numbers in rec
+				telling how ref should be built from
+				the fields of rec */
+	const rec_t*	rec,	/*!< in: record in the index; must be
+				preserved while ref is used, as we do
+				not copy field values to heap */
+	const ulint*	offsets);/*!< in: array returned by rec_get_offsets() */
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row
+reference.
+@return	TRUE if found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+	btr_pcur_t*		pcur,	/*!< out: persistent cursor, which must
+					be closed by the caller */
+	ulint			mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const dict_table_t*	table,	/*!< in: table */
+	const dtuple_t*		ref,	/*!< in: row reference */
+	mtr_t*			mtr);	/*!< in/out: mtr */
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return	record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: secondary index */
+	dict_index_t**	clust_index,/*!< out: clustered index */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***************************************************************//**
+Searches an index record.
+@return	TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry,	/*!< in: index entry */
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor, which must
+				be closed by the caller */
+	mtr_t*		mtr);	/*!< in: mtr */
+
+
+#define ROW_COPY_DATA		1
+#define ROW_COPY_POINTERS	2
+
+/* The allowed latching order of index records is the following:
+(1) a secondary index record ->
+(2) the clustered index record ->
+(3) rollback segment data for the clustered index record.
+
+No new latches may be obtained while the kernel mutex is reserved.
+However, the kernel mutex can be reserved while latches are owned. */
+
+/*******************************************************************//**
+Formats the raw data in "data" (in InnoDB on-disk format) using
+"dict_field" and writes the result to "buf".
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return	number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+	const char*		data,		/*!< in: raw data */
+	ulint			data_len,	/*!< in: raw data length
+						in bytes */
+	const dict_field_t*	dict_field,	/*!< in: index field */
+	char*			buf,		/*!< out: output buffer */
+	ulint			buf_size);	/*!< in: output buffer size
+						in bytes */
+
+#ifndef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0row.ic b/storage/xtradb/include/row0row.ic
new file mode 100644
index 00000000000..05c007641af
--- /dev/null
+++ b/storage/xtradb/include/row0row.ic
@@ -0,0 +1,120 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0row.ic
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "dict0dict.h"
+#include "rem0rec.h"
+#include "trx0undo.h"
+
+/*********************************************************************//**
+Returns the transaction id stored in a clustered index record.
+@return	trx id read from the record */
+UNIV_INLINE
+trx_id_t
+row_get_rec_trx_id(
+/*===============*/
+	const rec_t*	rec,	/*!< in: clustered index record */
+	dict_index_t*	index,	/*!< in: clustered index of rec */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ulint	trx_id_pos;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	trx_id_pos = index->trx_id_offset;
+
+	if (trx_id_pos == 0) {
+		/* zero in index: obtain the offset by a lookup */
+		trx_id_pos = row_get_trx_id_offset(rec, index, offsets);
+	}
+	return(trx_read_trx_id(rec + trx_id_pos));
+}
+
+/*********************************************************************//**
+Returns the roll pointer stored in a clustered index record.
+@return	roll pointer read from the record */
+UNIV_INLINE
+roll_ptr_t
+row_get_rec_roll_ptr(
+/*=================*/
+	const rec_t*	rec,	/*!< in: clustered index record */
+	dict_index_t*	index,	/*!< in: clustered index of rec */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ulint	trx_id_pos;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	trx_id_pos = index->trx_id_offset;
+
+	if (trx_id_pos == 0) {
+		/* zero in index: obtain the offset by a lookup */
+		trx_id_pos = row_get_trx_id_offset(rec, index, offsets);
+	}
+	return(trx_read_roll_ptr(rec + trx_id_pos + DATA_TRX_ID_LEN));
+}
+
+/*******************************************************************//**
+Fills in a row reference from a secondary index record, so that the
+matching clustered index record can be searched for. */
+UNIV_INLINE
+void
+row_build_row_ref_fast(
+/*===================*/
+	dtuple_t*	ref,	/*!< in/out: typed data tuple that receives
+				the reference */
+	const ulint*	map,	/*!< in: for each field of ref, the number
+				of the field in rec to take it from;
+				ULINT_UNDEFINED entries are skipped */
+	const rec_t*	rec,	/*!< in: record in the index; must stay
+				valid while ref is used, because field
+				values are not copied to heap */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	dfield_t*	ref_field;
+	const byte*	data;
+	ulint		data_len;
+	ulint		n_ref_fields;
+	ulint		src_no;
+	ulint		pos;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(!rec_offs_any_extern(offsets));
+	n_ref_fields = dtuple_get_n_fields(ref);
+
+	for (pos = 0; pos < n_ref_fields; pos++) {
+		ref_field = dtuple_get_nth_field(ref, pos);
+		src_no = map[pos];
+
+		if (src_no == ULINT_UNDEFINED) {
+			/* no source field: leave this ref field as is */
+			continue;
+		}
+
+		data = rec_get_nth_field(rec, offsets, src_no, &data_len);
+		dfield_set_data(ref_field, data, data_len);
+	}
+}
diff --git a/storage/xtradb/include/row0sel.h b/storage/xtradb/include/row0sel.h
new file mode 100644
index 00000000000..8544b9d08ba
--- /dev/null
+++ b/storage/xtradb/include/row0sel.h
@@ -0,0 +1,402 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.h
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0sel_h
+#define row0sel_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "que0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "row0types.h"
+#include "que0types.h"
+#include "pars0sym.h"
+#include "btr0pcur.h"
+#include "read0read.h"
+#include "row0mysql.h"
+
+/*********************************************************************//**
+Creates a select node struct.
+@return	own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/*********************************************************************//**
+Frees the memory private to a select node when a query graph is freed,
+does not free the heap where the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+	sel_node_t*	node);	/*!< in: select node struct */
+/*********************************************************************//**
+Frees a prefetch buffer for a column, including the dynamically allocated
+memory for data stored there. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+	sel_buf_t*	prefetch_buf);	/*!< in, own: prefetch buffer */
+/*********************************************************************//**
+Gets the plan node for the nth table in a join.
+@return	plan node */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+	sel_node_t*	node,	/*!< in: select node */
+	ulint		i);	/*!< in: get ith plan node */
+/**********************************************************************//**
+Performs a select step. This is a high-level function used in SQL execution
+graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return	query thread to run next or NULL */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/**********************************************************************//**
+Performs a fetch for a cursor.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/****************************************************************//**
+Sample callback function for fetch that prints each row.
+@return	always returns non-NULL */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+	void*	row,		/*!< in:  sel_node_t* */
+	void*	user_arg);	/*!< in:  not used */
+/***********************************************************//**
+Prints a row in a select result.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple. The last
+field of the key value may be just a prefix of a fixed length field: hence
+the parameter key_len. But currently we do not allow search keys where the
+last field is only a prefix of the full key field length, and we print a
+warning if such a key appears. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+	dtuple_t*	tuple,		/*!< in/out: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	byte*		buf,		/*!< in: buffer to use in field
+					conversions */
+	ulint		buf_len,	/*!< in: buffer length */
+	dict_index_t*	index,		/*!< in: index of the key value */
+	const byte*	key_ptr,	/*!< in: MySQL key value */
+	ulint		key_len,	/*!< in: MySQL key value length */
+	trx_t*		trx);		/*!< in: transaction */
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried to the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+	byte*		buf,		/*!< in/out: buffer for the fetched
+					row in the MySQL format */
+	ulint		mode,		/*!< in: search mode PAGE_CUR_L, ... */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct for the
+					table handle; this contains the info
+					of search_tuple, index; if search
+					tuple contains 0 fields then we
+					position the cursor at the start or
+					the end of the index, depending on
+					'mode' */
+	ulint		match_mode,	/*!< in: 0 or ROW_SEL_EXACT or
+					ROW_SEL_EXACT_PREFIX */
+	ulint		direction);	/*!< in: 0 or ROW_SEL_NEXT or
+					ROW_SEL_PREV; NOTE: if this is != 0,
+					then prebuilt must have a pcur
+					with stored position! In opening of a
+					cursor 'direction' should be 0. */
+/*******************************************************************//**
+Checks if MySQL at the moment is allowed for this table to retrieve a
+consistent read result, or store it to the query cache.
+@return	TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+	trx_t*		trx,		/*!< in: transaction object */
+	const char*	norm_name);	/*!< in: concatenation of database name,
+					'/' char, table name */
+/*******************************************************************//**
+Read the max AUTOINC value from an index.
+@return	DB_SUCCESS if all OK else error code */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+	dict_index_t*	index,		/*!< in: index to search */
+	const char*	col_name,	/*!< in: autoinc column name */
+	ib_uint64_t*	value);		/*!< out: AUTOINC value read */
+
+/** A structure for caching column values for prefetched rows */
+struct sel_buf_struct{
+	byte*		data;	/*!< data, or NULL; if not NULL, this field
+				points to allocated memory which must be
+				explicitly freed; can be != NULL even when
+				len is UNIV_SQL_NULL */
+	ulint		len;	/*!< data length or UNIV_SQL_NULL */
+	ulint		val_buf_size;
+				/*!< size of memory buffer allocated for data:
+				this can be more than len; this is defined
+				when data != NULL */
+};
+
+/** Query plan for one table in a select or join */
+struct plan_struct{
+	dict_table_t*	table;		/*!< table struct in the dictionary
+					cache */
+	dict_index_t*	index;		/*!< table index used in the search */
+	btr_pcur_t	pcur;		/*!< persistent cursor used to search
+					the index */
+	ibool		asc;		/*!< TRUE if cursor traveling upwards */
+	ibool		pcur_is_open;	/*!< TRUE if pcur has been positioned
+					and we can try to fetch new rows */
+	ibool		cursor_at_end;	/*!< TRUE if the cursor is open but
+					we know that there are no more
+					qualifying rows left to retrieve from
+					the index tree; NOTE though, that
+					there may still be unprocessed rows in
+					the prefetch stack; always FALSE when
+					pcur_is_open is FALSE */
+	ibool		stored_cursor_rec_processed;
+					/*!< TRUE if the pcur position has been
+					stored and the record it is positioned
+					on has already been processed */
+	que_node_t**	tuple_exps;	/*!< array of expressions
+					which are used to calculate
+					the field values in the search
+					tuple: there is one expression
+					for each field in the search
+					tuple */
+	dtuple_t*	tuple;		/*!< search tuple */
+	ulint		mode;		/*!< search mode: PAGE_CUR_G, ... */
+	ulint		n_exact_match;	/*!< number of first fields in
+					the search tuple which must be
+					exactly matched */
+	ibool		unique_search;	/*!< TRUE if we are searching an
+					index record with a unique key */
+	ulint		n_rows_fetched;	/*!< number of rows fetched using pcur
+					after it was opened */
+	ulint		n_rows_prefetched;/*!< number of prefetched rows cached
+					for fetch: fetching several rows in
+					the same mtr saves CPU time */
+	ulint		first_prefetched;/*!< index of the first cached row in
+					select buffer arrays for each column */
+	ibool		no_prefetch;	/*!< no prefetch for this table */
+	sym_node_list_t	columns;	/*!< symbol table nodes for the columns
+					to retrieve from the table */
+	UT_LIST_BASE_NODE_T(func_node_t)
+			end_conds;	/*!< conditions which determine the
+					fetch limit of the index segment we
+					have to look at: when one of these
+					fails, the result set has been
+					exhausted for the cursor in this
+					index; these conditions are normalized
+					so that in a comparison the column
+					for this table is the first argument */
+	UT_LIST_BASE_NODE_T(func_node_t)
+			other_conds;	/*!< the rest of the search conditions
+					we can test at this table in a join */
+	ibool		must_get_clust;	/*!< TRUE if index is a non-clustered
+					index and we must also fetch the
+					clustered index record; this is the
+					case if the non-clustered record does
+					not contain all the needed columns, or
+					if this is a single-table explicit
+					cursor, or a searched update or
+					delete */
+	ulint*		clust_map;	/*!< map telling how clust_ref is built
+					from the fields of a non-clustered
+					record */
+	dtuple_t*	clust_ref;	/*!< the reference to the clustered
+					index entry is built here if index is
+					a non-clustered index */
+	btr_pcur_t	clust_pcur;	/*!< if index is non-clustered, we use
+					this pcur to search the clustered
+					index */
+	mem_heap_t*	old_vers_heap;	/*!< memory heap used in building an old
+					version of a row, or NULL */
+};
+
+/** Select node states; the current one is kept in sel_node_struct::state */
+enum sel_node_state {
+	SEL_NODE_CLOSED,	/*!< it is a declared cursor which is not
+				currently open */
+	SEL_NODE_OPEN,		/*!< intention locks not yet set on tables */
+	SEL_NODE_FETCH,		/*!< intention locks have been set */
+	SEL_NODE_NO_MORE_ROWS	/*!< cursor has reached the result set end */
+};
+
+/** Select statement node */
+struct sel_node_struct{
+	que_common_t	common;		/*!< node type: QUE_NODE_SELECT */
+	enum sel_node_state
+			state;	/*!< node state: SEL_NODE_CLOSED, ... */
+	que_node_t*	select_list;	/*!< select list */
+	sym_node_t*	into_list;	/*!< variables list or NULL */
+	sym_node_t*	table_list;	/*!< table list */
+	ibool		asc;		/*!< TRUE if the rows should be fetched
+					in an ascending order */
+	ibool		set_x_locks;	/*!< TRUE if the cursor is for update or
+					delete, which means that a row x-lock
+					should be placed on the cursor row */
+	ulint		row_lock_mode;	/*!< LOCK_X or LOCK_S */
+	ulint		n_tables;	/*!< number of tables */
+	ulint		fetch_table;	/*!< number of the next table to access
+					in the join */
+	plan_t*		plans;		/*!< array of n_tables many plan nodes
+					containing the search plan and the
+					search data structures */
+	que_node_t*	search_cond;	/*!< search condition */
+	read_view_t*	read_view;	/*!< if the query is a non-locking
+					consistent read, its read view is
+					placed here, otherwise NULL */
+	ibool		consistent_read;/*!< TRUE if the select is a consistent,
+					non-locking read */
+	order_node_t*	order_by;	/*!< order by column definition, or
+					NULL */
+	ibool		is_aggregate;	/*!< TRUE if the select list consists of
+					aggregate functions */
+	ibool		aggregate_already_fetched;
+					/*!< TRUE if the aggregate row has
+					already been fetched for the current
+					cursor */
+	ibool		can_get_updated;/*!< this is TRUE if the select
+					is in a single-table explicit
+					cursor which can get updated
+					within the stored procedure,
+					or in a searched update or
+					delete; NOTE that to determine
+					whether an explicit cursor
+					can get updated, the parser
+					checks whether the stored
+					procedure contains positioned
+					update or delete statements */
+	sym_node_t*	explicit_cursor;/*!< not NULL if an explicit cursor */
+	UT_LIST_BASE_NODE_T(sym_node_t)
+			copy_variables; /*!< variables whose values we have to
+					copy when an explicit cursor is opened,
+					so that they do not change between
+					fetches */
+};
+
+/** Fetch statement node */
+struct fetch_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_FETCH */
+	sel_node_t*	cursor_def;	/*!< cursor definition */
+	sym_node_t*	into_list;	/*!< variables to set */
+
+	pars_user_func_t*
+			func;		/*!< User callback function or NULL.
+					The first argument to the function
+					is a sel_node_t*, containing the
+					results of the SELECT operation for
+					one row. If the function returns
+					NULL, it is not interested in
+					further rows and the cursor is
+					modified so that (cursor % NOTFOUND)
+					is true. If it returns non-NULL,
+					fetching continues normally. See
+					row_fetch_print() for an example
+					(and a useful debugging tool). */
+};
+
+/** Open or close cursor operation type (open_node_struct::op_type) */
+enum open_node_op {
+	ROW_SEL_OPEN_CURSOR,	/*!< open cursor */
+	ROW_SEL_CLOSE_CURSOR	/*!< close cursor */
+};
+
+/** Open or close cursor statement node */
+struct open_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_OPEN */
+	enum open_node_op
+			op_type;	/*!< ROW_SEL_OPEN_CURSOR or
+					ROW_SEL_CLOSE_CURSOR */
+	sel_node_t*	cursor_def;	/*!< select node of the cursor */
+};
+
+/** Row printf statement node */
+struct row_printf_node_struct{
+	que_common_t	common;		/*!< type: QUE_NODE_ROW_PRINTF */
+	sel_node_t*	sel_node;	/*!< select statement node */
+};
+
+/** Search direction for the MySQL interface (enumerators are nonzero) */
+enum row_sel_direction {
+	ROW_SEL_NEXT = 1,	/*!< ascending direction */
+	ROW_SEL_PREV = 2	/*!< descending direction */
+};
+
+/** Match mode for the MySQL interface (enumerators are nonzero) */
+enum row_sel_match_mode {
+	ROW_SEL_EXACT = 1,	/*!< search using a complete key value */
+	ROW_SEL_EXACT_PREFIX	/*!< search using a key prefix which
+				must match rows: the prefix may
+				contain an incomplete field (the last
+				field in prefix may be just a prefix
+				of a fixed length column) */
+};
+
+#ifndef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0sel.ic b/storage/xtradb/include/row0sel.ic
new file mode 100644
index 00000000000..5907f9913da
--- /dev/null
+++ b/storage/xtradb/include/row0sel.ic
@@ -0,0 +1,105 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0sel.ic
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+/*********************************************************************//**
+Gets the plan node for the ith table (counting from 0) in a join.
+@return	pointer to element i of node->plans */
+UNIV_INLINE
+plan_t*
+sel_node_get_nth_plan(
+/*==================*/
+	sel_node_t*	node,	/*!< in: select node */
+	ulint		i)	/*!< in: plan index, < node->n_tables */
+{
+	ut_ad(i < node->n_tables);
+
+	return(node->plans + i);
+}
+
+/*********************************************************************//**
+Resets the cursor defined by sel_node to the SEL_NODE_OPEN state, which means
+that it will start fetching from the start of the result set again, regardless
+of where it was before, and it will set intention locks on the tables. */
+UNIV_INLINE
+void
+sel_node_reset_cursor(
+/*==================*/
+	sel_node_t*	node)	/*!< in/out: select node; state is reset */
+{
+	node->state = SEL_NODE_OPEN;
+}
+
+/**********************************************************************//**
+Performs an execution step of an open or close cursor statement node.
+@return	thr, after setting thr->run_node to que_node_get_parent(node) */
+UNIV_INLINE
+que_thr_t*
+open_step(
+/*======*/
+	que_thr_t*	thr)	/*!< in/out: query thread */
+{
+	sel_node_t*	sel_node;
+	open_node_t*	node;
+	ulint		err;
+
+	ut_ad(thr);
+
+	node = (open_node_t*) thr->run_node;
+	ut_ad(que_node_get_type(node) == QUE_NODE_OPEN);
+
+	sel_node = node->cursor_def;
+
+	err = DB_SUCCESS;
+
+	if (node->op_type == ROW_SEL_OPEN_CURSOR) {
+
+		/* Disabled check: if (sel_node->state == SEL_NODE_CLOSED) { */
+
+		sel_node_reset_cursor(sel_node);
+		/* Disabled: } else { err = DB_ERROR; }
+		Because the check is commented out, opening a cursor that is
+		already open just resets it instead of raising an error. */
+	} else {
+		if (sel_node->state != SEL_NODE_CLOSED) {
+
+			sel_node->state = SEL_NODE_CLOSED;
+		} else {
+			err = DB_ERROR;
+		}
+	}
+
+	if (UNIV_EXPECT(err, DB_SUCCESS) != DB_SUCCESS) {
+		/* SQL error: close of a cursor that is already closed */
+		fprintf(stderr, "SQL error %lu\n", (ulong) err);
+
+		ut_error;
+	}
+
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
diff --git a/storage/xtradb/include/row0types.h b/storage/xtradb/include/row0types.h
new file mode 100644
index 00000000000..7920fd75061
--- /dev/null
+++ b/storage/xtradb/include/row0types.h
@@ -0,0 +1,59 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0types.h
+Row operation global types
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0types_h
+#define row0types_h
+
+typedef struct plan_struct plan_t;	/* per-table select plan node */
+
+typedef	struct upd_struct upd_t;	/* update vector */
+
+typedef struct upd_field_struct upd_field_t;
+
+typedef	struct upd_node_struct upd_node_t;
+
+typedef	struct del_node_struct del_node_t;
+
+typedef	struct ins_node_struct ins_node_t;
+
+typedef struct sel_node_struct	sel_node_t; /* select statement node */
+
+typedef struct open_node_struct	open_node_t; /* open/close cursor node */
+
+typedef struct fetch_node_struct fetch_node_t; /* fetch statement node */
+
+typedef struct row_printf_node_struct	row_printf_node_t;
+typedef struct sel_buf_struct	sel_buf_t;
+
+typedef	struct undo_node_struct undo_node_t; /* row undo node */
+
+typedef	struct purge_node_struct purge_node_t;
+
+typedef struct row_ext_struct row_ext_t;
+
+/* MySQL data types */
+typedef struct st_table TABLE;
+
+#endif
diff --git a/storage/xtradb/include/row0uins.h b/storage/xtradb/include/row0uins.h
new file mode 100644
index 00000000000..77b071c3a6b
--- /dev/null
+++ b/storage/xtradb/include/row0uins.h
@@ -0,0 +1,54 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.h
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0uins_h
+#define row0uins_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert.  InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return	DB_SUCCESS (NOTE(review): confirm no error code is possible) */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+	undo_node_t*	node);	/*!< in: row undo node */
+
+#ifndef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0uins.ic b/storage/xtradb/include/row0uins.ic
new file mode 100644
index 00000000000..27606150d8e
--- /dev/null
+++ b/storage/xtradb/include/row0uins.ic
@@ -0,0 +1,25 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0uins.ic
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/row0umod.h b/storage/xtradb/include/row0umod.h
new file mode 100644
index 00000000000..ed44cc8d601
--- /dev/null
+++ b/storage/xtradb/include/row0umod.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.h
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0umod_h
+#define row0umod_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr);	/*!< in: query thread */
+
+
+#ifndef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0umod.ic b/storage/xtradb/include/row0umod.ic
new file mode 100644
index 00000000000..ea3fd3b43c7
--- /dev/null
+++ b/storage/xtradb/include/row0umod.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0umod.ic
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0undo.h b/storage/xtradb/include/row0undo.h
new file mode 100644
index 00000000000..6eb4ca448b3
--- /dev/null
+++ b/storage/xtradb/include/row0undo.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.h
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0undo_h
+#define row0undo_h
+
+#include "univ.i"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "btr0types.h"
+#include "btr0pcur.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "row0types.h"
+
+/********************************************************************//**
+Creates a row undo node for a query graph.
+@return	own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
+	mem_heap_t*	heap);	/*!< in: memory heap where created */
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If found, stores the row to node,
+and stores the position of pcur, and detaches it. The pcur must be closed
+by the caller in any case.
+@return TRUE if found; NOTE the node->pcur must be closed by the
+caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+	undo_node_t*	node);	/*!< in/out: row undo node */
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/* A single query thread will try to perform the undo for all successive
+versions of a clustered index record, if the transaction has modified it
+several times during the execution which is rolled back. It may happen
+that the task is transferred to another query thread, if the other thread
+is assigned to handle an undo log record in the chain of different versions
+of the record, and the other thread happens to get the x-latch to the
+clustered index record at the right time.
+	If a query thread notices that the clustered index record it is looking
+for is missing, or the roll ptr field in the record does not point to the
+undo log record the thread was assigned to handle, then it gives up the undo
+task for that undo log record, and fetches the next. This situation can occur
+just in the case where the transaction modified the same record several times
+and another thread is currently doing the undo for successive versions of
+that index record. */
+
+/** Execution state of an undo node (undo_node_struct::state) */
+enum undo_exec {
+	UNDO_NODE_FETCH_NEXT = 1,	/*!< we should fetch the next
+					undo log record */
+	UNDO_NODE_PREV_VERS,		/*!< the roll ptr to previous
+					version of a row is stored in
+					node, and undo should be done
+					based on it */
+	UNDO_NODE_INSERT,		/*!< undo a fresh insert of a
+					row to a table */
+	UNDO_NODE_MODIFY		/*!< undo a modify operation
+					(DELETE or UPDATE) on a row
+					of a table */
+};
+
+/** Undo node structure */
+struct undo_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_UNDO */
+	enum undo_exec	state;	/*!< node execution state */
+	trx_t*		trx;	/*!< trx for which undo is done */
+	roll_ptr_t	roll_ptr;/*!< roll pointer to undo log record */
+	trx_undo_rec_t*	undo_rec;/*!< undo log record */
+	undo_no_t	undo_no;/*!< undo number of the record */
+	ulint		rec_type;/*!< undo log record type: TRX_UNDO_INSERT_REC,
+				... */
+	roll_ptr_t	new_roll_ptr;
+				/*!< roll ptr to restore to clustered index
+				record */
+	trx_id_t	new_trx_id; /*!< trx id to restore to clustered index
+				record */
+	btr_pcur_t	pcur;	/*!< persistent cursor used in searching the
+				clustered index record */
+	dict_table_t*	table;	/*!< table where undo is done */
+	ulint		cmpl_info;/*!< compiler analysis of an update */
+	upd_t*		update;	/*!< update vector for a clustered index
+				record */
+	dtuple_t*	ref;	/*!< row reference to the next row to handle */
+	dtuple_t*	row;	/*!< a copy of the row to handle; the field
+				data is also copied to heap */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns of the row */
+	dtuple_t*	undo_row;/*!< NULL, or the row after undo */
+	row_ext_t*	undo_ext;/*!< NULL, or prefixes of the externally
+				stored columns of undo_row */
+	dict_index_t*	index;	/*!< the next index whose record should be
+				handled */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage for
+				row; this must be emptied after undo is tried
+				on a row */
+};
+
+
+#ifndef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0undo.ic b/storage/xtradb/include/row0undo.ic
new file mode 100644
index 00000000000..dc788debc14
--- /dev/null
+++ b/storage/xtradb/include/row0undo.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0undo.ic
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/row0upd.h b/storage/xtradb/include/row0upd.h
new file mode 100644
index 00000000000..635d746d5a1
--- /dev/null
+++ b/storage/xtradb/include/row0upd.h
@@ -0,0 +1,483 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.h
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0upd_h
+#define row0upd_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "row0types.h"
+#include "btr0types.h"
+#include "dict0types.h"
+#include "trx0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "btr0pcur.h"
+# include "que0types.h"
+# include "pars0types.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Creates an update vector object.
+@return	own: update vector object */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap);	/*!< in: heap from which memory allocated */
+/*********************************************************************//**
+Returns the number of fields in the update vector == number of columns
+to be updated by an update vector.
+@return	number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update);	/*!< in: update vector */
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector.
+@return	update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n);	/*!< in: field position in update vector */
+#else
+# define upd_get_nth_field(update, n) ((update)->fields + (n))
+#endif
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets an index field number to be updated by an update vector field. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	ulint		field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index,		/*!< in: index */
+	trx_t*		trx);		/*!< in: transaction */
+/*********************************************************************//**
+Returns a field of an update vector by field_no.
+@return	update vector field, or NULL */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		no)	/*!< in: field_no */
+	__attribute__((nonnull, pure));
+/*********************************************************************//**
+Writes into the redo log the values of trx id and roll ptr and enough info
+to determine their positions within a clustered index record.
+@return	new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	trx_t*		trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr,/*!< in: roll ptr of the undo log record */
+	byte*		log_ptr,/*!< pointer to a buffer of size > 20 opened
+				in mlog */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record when
+a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	trx_t*		trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+	const dtuple_t*	entry,	/*!< in: index entry, where the memory buffers
+				for sys fields are already allocated:
+				the function just copies the new values to
+				them */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+	dulint		val);	/*!< in: value to write */
+/*********************************************************************//**
+Creates an update node for a query graph.
+@return	own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Writes to the redo log the new values of the fields occurring in the index. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+	const upd_t*	update,	/*!< in: update vector */
+	byte*		log_ptr,/*!< in: pointer to mlog buffer: must
+				contain at least MLOG_BUF_MARGIN bytes
+				of free space; the buffer is closed
+				within this function */
+	mtr_t*		mtr);	/*!< in: mtr into whose log to write */
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update);/*!< in: update vector */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the record
+given. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+	rec_t*		rec,	/*!< in/out: record where replaced */
+	dict_index_t*	index,	/*!< in: the index the record belongs to */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	page_zip_des_t*	page_zip);/*!< in: compressed page with enough space
+				available, or NULL */
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return	own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	const rec_t*	rec,	/*!< in: secondary index record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	const rec_t*	rec,	/*!< in: clustered index record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap);	/*!< in: memory heap from which allocated */
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+	dtuple_t*	entry,	/*!< in/out: index entry where replaced;
+				the clustered index record must be
+				covered by a lock or a page latch to
+				prevent deletion (rollback or purge) */
+	dict_index_t*	index,	/*!< in: index; NOTE that this may also be a
+				non-clustered index */
+	const upd_t*	update,	/*!< in: an update vector built for the index so
+				that the field number in an upd_field is the
+				index position */
+	ibool		order_only,
+				/*!< in: if TRUE, limit the replacement to
+				ordering fields of index; note that this
+				does not work for non-clustered indexes. */
+	mem_heap_t*	heap)	/*!< in: memory heap for allocating and
+				copying the new values */
+	__attribute__((nonnull));
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+	dtuple_t*	entry,	/*!< in/out: index entry where replaced;
+				the clustered index record must be
+				covered by a lock or a page latch to
+				prevent deletion (rollback or purge) */
+	dict_index_t*	index,	/*!< in: index; NOTE that this may also be a
+				non-clustered index */
+	const upd_t*	update,	/*!< in: an update vector built for the
+				CLUSTERED index so that the field number in
+				an upd_field is the clustered index position */
+	mem_heap_t*	heap)	/*!< in: memory heap for allocating and
+				copying the new values */
+	__attribute__((nonnull));
+/***********************************************************//**
+Replaces the new column values stored in the update vector. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+	dtuple_t*		row,	/*!< in/out: row where replaced,
+					indexed by col_no;
+					the clustered index record must be
+					covered by a lock or a page latch to
+					prevent deletion (rollback or purge) */
+	row_ext_t**		ext,	/*!< out, own: NULL, or externally
+					stored column prefixes */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const upd_t*		update,	/*!< in: an update vector built for the
+					clustered index */
+	mem_heap_t*		heap);	/*!< in: memory heap */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+	const dtuple_t*	row,	/*!< in: old value of row, or NULL if the
+				row and the data values in update are not
+				known when this function is called, e.g., at
+				compile time */
+	dict_index_t*	index,	/*!< in: index of the record */
+	const upd_t*	update);/*!< in: update vector for the row; NOTE: the
+				field numbers in this MUST be clustered index
+				positions! */
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const upd_t*		update);/*!< in: update vector for the row */
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used
+in SQL execution graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+	que_thr_t*	thr);	/*!< in: query thread */
+#endif /* !UNIV_HOTBACKUP */
+/*********************************************************************//**
+Parses the log data of system field values.
+@return	log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	ulint*		pos,	/*!< out: TRX_ID position in record */
+	trx_id_t*	trx_id,	/*!< out: trx id */
+	roll_ptr_t*	roll_ptr);/*!< out: roll ptr */
+/*********************************************************************//**
+Updates the trx id and roll ptr field in a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		pos,	/*!< in: TRX_ID position in rec */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	roll_ptr_t	roll_ptr);/*!< in: roll ptr of the undo log record */
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+@return	log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	mem_heap_t*	heap,	/*!< in: memory heap where update vector is
+				built */
+	upd_t**		update_out);/*!< out: update vector */
+
+
+/** Update vector field: which index field to update and its new value */
+struct upd_field_struct{
+	unsigned	field_no:16;	/*!< field number in an index, usually
+					the clustered index, but in updating
+					a secondary index record in btr0cur.c
+					this is the position in the secondary
+					index */
+#ifndef UNIV_HOTBACKUP
+	unsigned	orig_len:16;	/*!< original length of the locally
+					stored part of an externally stored
+					column, or 0 */
+	que_node_t*	exp;		/*!< expression for calculating a new
+					value: it refers to column values and
+					constants in the symbol table of the
+					query graph */
+#endif /* !UNIV_HOTBACKUP */
+	dfield_t	new_val;	/*!< new value for the column */
+};
+
+/** Update vector structure: new info bits plus an array of update fields */
+struct upd_struct{
+	ulint		info_bits;	/*!< new value of info bits to record;
+					default is 0 */
+	ulint		n_fields;	/*!< number of update fields */
+	upd_field_t*	fields;		/*!< array of update fields */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Update node structure which also implements the delete operation
+of a row */
+
+struct upd_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_UPDATE */
+	ibool		is_delete;/*!< TRUE if delete, FALSE if update */
+	ibool		searched_update;
+				/*!< TRUE if searched update, FALSE if
+				positioned */
+	ibool		in_mysql_interface;
+				/*!< TRUE if the update node was created
+				for the MySQL interface */
+	dict_foreign_t*	foreign;/*!< NULL or pointer to a foreign key
+				constraint if this update node is used in
+				doing an ON DELETE or ON UPDATE operation */
+	upd_node_t*	cascade_node;/*!< NULL or an update node template which
+				is used to implement ON DELETE/UPDATE CASCADE
+				or ... SET NULL for foreign keys */
+	mem_heap_t*	cascade_heap;/*!< NULL or a mem heap where the cascade
+				node is created */
+	sel_node_t*	select;	/*!< query graph subtree implementing a base
+				table cursor: the rows returned will be
+				updated */
+	btr_pcur_t*	pcur;	/*!< persistent cursor placed on the clustered
+				index record which should be updated or
+				deleted; the cursor is stored in the graph
+				of 'select' field above, except in the case
+				of the MySQL interface */
+	dict_table_t*	table;	/*!< table where updated */
+	upd_t*		update;	/*!< update vector for the row */
+	ulint		update_n_fields;
+				/*!< when this struct is used to implement
+				a cascade operation for foreign keys, we store
+				here the size of the buffer allocated for use
+				as the update vector */
+	sym_node_list_t	columns;/*!< symbol table nodes for the columns
+				to retrieve from the table */
+	ibool		has_clust_rec_x_lock;
+				/*!< TRUE if the select which retrieves the
+				records to update already sets an x-lock on
+				the clustered record; note that it must always
+				set at least an s-lock */
+	ulint		cmpl_info;/*!< information extracted during query
+				compilation; speeds up execution:
+				UPD_NODE_NO_ORD_CHANGE and
+				UPD_NODE_NO_SIZE_CHANGE, ORed */
+	/*----------------------*/
+	/* Local storage for this graph node */
+	ulint		state;	/*!< node execution state */
+	dict_index_t*	index;	/*!< NULL, or the next index whose record should
+				be updated */
+	dtuple_t*	row;	/*!< NULL, or a copy (also fields copied to
+				heap) of the row to update; this must be reset
+				to NULL after a successful update */
+	row_ext_t*	ext;	/*!< NULL, or prefixes of the externally
+				stored columns in the old row */
+	dtuple_t*	upd_row;/*!< NULL, or a copy of the updated row */
+	row_ext_t*	upd_ext;/*!< NULL, or prefixes of the externally
+				stored columns in upd_row */
+	mem_heap_t*	heap;	/*!< memory heap used as auxiliary storage;
+				this must be emptied after a successful
+				update */
+	/*----------------------*/
+	sym_node_t*	table_sym;/*!< table node in symbol table */
+	que_node_t*	col_assign_list;
+				/*!< column assignment list */
+	ulint		magic_n;/*!< presumably UPD_NODE_MAGIC_N (verify) */
+};
+
+#define	UPD_NODE_MAGIC_N	1579975
+
+/* Node execution states */
+#define UPD_NODE_SET_IX_LOCK	   1	/*!< execution came to the node from
+					a node above and if the field
+					has_clust_rec_x_lock is FALSE, we
+					should set an intention x-lock on
+					the table */
+#define UPD_NODE_UPDATE_CLUSTERED  2	/*!< clustered index record should be
+					updated */
+#define UPD_NODE_INSERT_CLUSTERED  3	/*!< clustered index record should be
+					inserted, old record is already delete
+					marked */
+#define UPD_NODE_UPDATE_ALL_SEC	   4	/*!< an ordering field of the clustered
+					index record was changed, or this is
+					a delete operation: should update
+					all the secondary index records */
+#define	UPD_NODE_UPDATE_SOME_SEC   5	/*!< secondary index entries should be
+					looked at and updated if an ordering
+					field changed */
+
+/* Compilation info flags: these must fit within 3 bits; see trx0rec.h */
+#define UPD_NODE_NO_ORD_CHANGE	1	/*!< no secondary index record will be
+					changed in the update and no ordering
+					field of the clustered index */
+#define UPD_NODE_NO_SIZE_CHANGE	2	/*!< no record field size will be
+					changed in the update */
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0upd.ic b/storage/xtradb/include/row0upd.ic
new file mode 100644
index 00000000000..18e22f1eca9
--- /dev/null
+++ b/storage/xtradb/include/row0upd.ic
@@ -0,0 +1,184 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0upd.ic
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "trx0undo.h"
+# include "row0row.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+#include "page0zip.h"
+
+/*********************************************************************//**
+Allocates an update vector with room for n fields from the given heap.
+@return	own: update vector object; the field array is not filled in here */
+UNIV_INLINE
+upd_t*
+upd_create(
+/*=======*/
+	ulint		n,	/*!< in: number of fields */
+	mem_heap_t*	heap)	/*!< in: heap from which memory allocated */
+{
+	upd_t*		vec;
+	upd_field_t*	field_arr;
+
+	vec = (upd_t*) mem_heap_alloc(heap, sizeof *vec);
+	field_arr = (upd_field_t*) mem_heap_alloc(heap, n * sizeof *field_arr);
+
+	vec->fields = field_arr;
+	vec->n_fields = n;
+	vec->info_bits = 0;
+	return(vec);
+}
+
+/*********************************************************************//**
+Gets the number of update fields (columns to be updated) in an update vector.
+@return	number of fields */
+UNIV_INLINE
+ulint
+upd_get_n_fields(
+/*=============*/
+	const upd_t*	update)	/*!< in: update vector */
+{
+	ulint	n_fields;
+	ut_ad(update);
+	n_fields = update->n_fields;
+	return(n_fields);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Returns the nth field of an update vector. This UNIV_DEBUG variant asserts
+that n is within the vector.
+@return	update vector field */
+UNIV_INLINE
+upd_field_t*
+upd_get_nth_field(
+/*==============*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		n)	/*!< in: field position in update vector */
+{
+	ut_ad(update);
+	ut_ad(n < update->n_fields);
+	return(&update->fields[n]);
+}
+#endif /* UNIV_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Stores in an update vector field the number of the index field it updates. */
+UNIV_INLINE
+void
+upd_field_set_field_no(
+/*===================*/
+	upd_field_t*	upd_field,	/*!< in: update vector field */
+	ulint		field_no,	/*!< in: field number in a clustered
+					index */
+	dict_index_t*	index,		/*!< in: index */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	ulint	n_index_fields;
+
+	upd_field->orig_len = 0;
+	upd_field->field_no = field_no;
+	n_index_fields = dict_index_get_n_fields(index);
+	if (UNIV_UNLIKELY(field_no >= n_index_fields)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to access field %lu in ",
+			(ulong) field_no);
+		dict_index_name_print(stderr, trx, index);
+		fprintf(stderr, "\nInnoDB: but index only has %lu fields\n",
+			(ulong) n_index_fields);
+	}
+	dict_col_copy_type(dict_index_get_nth_col(index, field_no),
+			   dfield_get_type(&upd_field->new_val));
+}
+
+/*********************************************************************//**
+Looks up the update vector field whose field_no equals the given number.
+@return	first matching update vector field, or NULL if none matches */
+UNIV_INLINE
+const upd_field_t*
+upd_get_field_by_field_no(
+/*======================*/
+	const upd_t*	update,	/*!< in: update vector */
+	ulint		no)	/*!< in: field_no */
+{
+	const ulint	n_fields = upd_get_n_fields(update);
+	ulint		pos;
+
+	for (pos = 0; pos < n_fields; pos++) {
+		const upd_field_t*	cand = upd_get_nth_field(update, pos);
+
+		if (cand->field_no == no) {
+			return(cand);
+		}
+	}
+	return(NULL);
+}
+
+/*********************************************************************//**
+Writes the transaction id and roll pointer into a clustered index record
+when a row is updated or marked deleted. */
+UNIV_INLINE
+void
+row_upd_rec_sys_fields(
+/*===================*/
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be updated, or NULL */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	trx_t*		trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+	ulint	trx_id_offs;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+#ifdef UNIV_SYNC_DEBUG
+	if (!rw_lock_own(&btr_search_latch, RW_LOCK_EX)) {
+		ut_ad(!buf_block_align(rec)->is_hashed);
+	}
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page_zip_write_trx_id_and_roll_ptr(
+			page_zip, rec, offsets,
+			dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+			trx->id, roll_ptr);
+		return;
+	}
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+	trx_id_offs = index->trx_id_offset
+		? index->trx_id_offset
+		: row_get_trx_id_offset(rec, index, offsets);
+	trx_write_trx_id(rec + trx_id_offs, trx->id);
+	trx_write_roll_ptr(rec + trx_id_offs + DATA_TRX_ID_LEN, roll_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/row0vers.h b/storage/xtradb/include/row0vers.h
new file mode 100644
index 00000000000..5a2e38230d5
--- /dev/null
+++ b/storage/xtradb/include/row0vers.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.h
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#ifndef row0vers_h
+#define row0vers_h
+
+#include "univ.i"
+#include "data0data.h"
+#include "dict0types.h"
+#include "trx0types.h"
+#include "que0types.h"
+#include "rem0types.h"
+#include "mtr0mtr.h"
+#include "read0types.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function!
+@return NULL if committed, else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const ulint*	offsets);/*!< in: rec_get_offsets(rec, index) */
+/*****************************************************************//**
+Finds out if we must preserve a delete marked earlier version of a clustered
+index record, because it is >= the purge view.
+@return	TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+	trx_id_t	trx_id,	/*!< in: transaction id in the version */
+	mtr_t*		mtr);	/*!< in: mtr holding the latch on the
+				clustered index record; it will also
+				hold the latch on purge_view */
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry == ientry; exactly in
+this case we return TRUE.
+@return	TRUE if some searched version has ientry as its index entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+	ibool		also_curr,/*!< in: TRUE if also rec is included in the
+				versions to search; otherwise only versions
+				prior to it are searched */
+	const rec_t*	rec,	/*!< in: record in the clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const dtuple_t*	ientry);/*!< in: the secondary index entry */
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+@return	DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	ulint**		offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	read_view_t*	view,	/*!< in: the consistent read view */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers);/*!< out, own: old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read.
+@return	DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	ulint**		offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers);/*!< out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+
+
+#ifndef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/row0vers.ic b/storage/xtradb/include/row0vers.ic
new file mode 100644
index 00000000000..8bb3a5c0cb3
--- /dev/null
+++ b/storage/xtradb/include/row0vers.ic
@@ -0,0 +1,30 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/row0vers.ic
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+#include "dict0dict.h"
+#include "read0read.h"
+#include "page0page.h"
+#include "log0recv.h"
diff --git a/storage/xtradb/include/srv0que.h b/storage/xtradb/include/srv0que.h
new file mode 100644
index 00000000000..82ee7739ef7
--- /dev/null
+++ b/storage/xtradb/include/srv0que.h
@@ -0,0 +1,42 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0que.h
+Server query execution
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0que_h
+#define srv0que_h
+
+#include "univ.i"
+#include "que0types.h"
+
+/**********************************************************************//**
+Enqueues a task to server task queue and releases a worker thread, if there
+is a suspended one. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+#endif
+
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
new file mode 100644
index 00000000000..8c64d5cee71
--- /dev/null
+++ b/storage/xtradb/include/srv0srv.h
@@ -0,0 +1,733 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.h
+The server main program
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0srv_h
+#define srv0srv_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "sync0sync.h"
+#include "os0sync.h"
+#include "que0types.h"
+#include "trx0types.h"
+
+extern const char*	srv_main_thread_op_info;
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+extern const char	srv_mysql50_table_name_prefix[9];
+
+/* When this event is set the lock timeout and InnoDB monitor
+thread starts running */
+extern os_event_t	srv_lock_timeout_thread_event;
+
+/* This event is set to tell the purge thread to shut down */
+extern os_event_t	srv_purge_thread_event;
+
+/* If the last data file is auto-extended, we add this many pages to it
+at a time */
+#define SRV_AUTO_EXTEND_INCREMENT	\
+	(srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
+
+/* prototype (takes no arguments) for a function added to ha_innodb.cc */
+ibool	innobase_get_slow_log(void);
+
+/* This is set to TRUE if the MySQL user has set it in MySQL */
+extern ibool	srv_lower_case_table_names;
+
+/* Mutex for locking srv_monitor_file */
+extern mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+extern FILE*	srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+extern mutex_t	srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+extern FILE*	srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+extern mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+extern FILE*	srv_misc_tmpfile;
+
+/* Server parameters which are read from the initfile */
+
+extern char*	srv_data_home;
+#ifdef UNIV_LOG_ARCHIVE
+extern char*	srv_arch_dir;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store to its own file each table created by a user; data
+dictionary tables are in the system tablespace 0 */
+#ifndef UNIV_HOTBACKUP
+extern my_bool	srv_file_per_table;
+#else
+extern ibool	srv_file_per_table;
+#endif /* UNIV_HOTBACKUP */
+/** The file format to use on new *.ibd files. */
+extern ulint	srv_file_format;
+/** Whether to check file format during startup.  A value of
+DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+set it to the highest format we support. */
+extern ulint	srv_check_file_format_at_startup;
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+extern ibool	srv_locks_unsafe_for_binlog;
+#endif /* !UNIV_HOTBACKUP */
+
+extern ulint	srv_n_data_files;
+extern char**	srv_data_file_names;
+extern ulint*	srv_data_file_sizes;
+extern ulint*	srv_data_file_is_raw_partition;
+
+extern char*	srv_doublewrite_file;
+
+extern ibool	srv_extra_undoslots;
+
+extern ibool	srv_recovery_stats;
+
+extern ulint	srv_use_purge_thread;
+
+extern ibool	srv_auto_extend_last_data_file;
+extern ulint	srv_last_file_size_max;
+extern char**	srv_log_group_home_dirs;
+#ifndef UNIV_HOTBACKUP
+extern ulong	srv_auto_extend_increment;
+
+extern ibool	srv_created_new_raw;
+
+extern ulint	srv_n_log_groups;
+extern ulint	srv_n_log_files;
+extern ulint	srv_log_file_size;
+extern ulint	srv_log_buffer_size;
+extern ulong	srv_flush_log_at_trx_commit;
+extern char	srv_adaptive_flushing;
+
+
+extern ulong    srv_show_locks_held;
+extern ulong    srv_show_verbose_locks;
+
+/* The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+extern const byte*	srv_latin1_ordering;
+#ifndef UNIV_HOTBACKUP
+extern my_bool	srv_use_sys_malloc;
+#else
+extern ibool	srv_use_sys_malloc;
+#endif /* UNIV_HOTBACKUP */
+extern ulint	srv_buf_pool_size;	/*!< requested size in bytes */
+extern ulint	srv_buf_pool_old_size;	/*!< previously requested size */
+extern ulint	srv_buf_pool_curr_size;	/*!< current size in bytes */
+extern ulint	srv_mem_pool_size;
+extern ulint	srv_lock_table_size;
+
+extern uint	srv_buffer_pool_shm_key;
+extern ibool	srv_buffer_pool_shm_is_reused;
+extern ibool	srv_buffer_pool_shm_checksum;
+
+extern ibool	srv_thread_concurrency_timer_based;
+
+extern ulint	srv_n_file_io_threads;
+extern ulong	srv_read_ahead_threshold;
+extern ulint	srv_n_read_io_threads;
+extern ulint	srv_n_write_io_threads;
+
+/* Number of IO operations per second the server can do */
+extern ulong    srv_io_capacity;
+/* Returns the number of IO operations that is p percent of srv_io_capacity,
+e.g. PCT_IO(5) is 5% of the capacity. The argument is parenthesized so
+that an expression argument such as PCT_IO(a + b) is scaled as a whole. */
+#define PCT_IO(p) ((ulong) (srv_io_capacity * ((double) (p) / 100.0)))
+
+#ifdef UNIV_LOG_ARCHIVE
+extern ibool	srv_log_archive_on;
+extern ibool	srv_archive_recovery;
+extern dulint	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+extern char*	srv_file_flush_method_str;
+extern ulint	srv_unix_file_flush_method;
+extern ulint	srv_win_file_flush_method;
+
+extern ulint	srv_max_n_open_files;
+
+extern ulint	srv_max_dirty_pages_pct;
+
+extern ulint	srv_force_recovery;
+extern ulong	srv_thread_concurrency;
+
+extern ulint	srv_max_n_threads;
+
+extern lint	srv_conc_n_threads;
+
+extern ulint	srv_fast_shutdown;	 /* If this is 1, do not do a
+					 purge and index buffer merge.
+					 If this is 2, do not even flush the
+					 buffer pool to data files at the
+					 shutdown: we effectively 'crash'
+					 InnoDB (but lose no committed
+					 transactions). */
+extern ibool	srv_innodb_status;
+
+extern unsigned long long	srv_stats_sample_pages;
+extern ulong	srv_stats_method;
+#define SRV_STATS_METHOD_NULLS_EQUAL     0
+#define SRV_STATS_METHOD_NULLS_NOT_EQUAL 1
+#define SRV_STATS_METHOD_IGNORE_NULLS    2
+extern ulong	srv_stats_auto_update;
+extern ulint	srv_stats_update_need_lock;
+extern ibool	srv_use_sys_stats_table;
+
+extern ibool	srv_use_doublewrite_buf;
+extern ibool	srv_use_checksums;
+extern ibool	srv_fast_checksum;
+
+extern ibool	srv_set_thread_priorities;
+extern int	srv_query_thread_priority;
+
+extern ulong	srv_max_buf_pool_modified_pct;
+extern ulong	srv_max_purge_lag;
+
+extern ulong	srv_replication_delay;
+
+extern long long	srv_ibuf_max_size;
+extern ulong	srv_ibuf_active_contract;
+extern ulong	srv_ibuf_accel_rate;
+extern ulint	srv_checkpoint_age_target;
+extern ulong	srv_flush_neighbor_pages;
+extern ulong	srv_enable_unsafe_group_commit;
+extern ulong	srv_read_ahead;
+extern ulong	srv_adaptive_checkpoint;
+
+extern ulong	srv_expand_import;
+extern ulint	srv_pass_corrupt_table;
+
+extern ulong	srv_extra_rsegments;
+extern ulong	srv_dict_size_limit;
+/*-------------------------------------------*/
+
+extern ulint	srv_n_rows_inserted;
+extern ulint	srv_n_rows_updated;
+extern ulint	srv_n_rows_deleted;
+extern ulint	srv_n_rows_read;
+
+extern ibool	srv_print_innodb_monitor;
+extern ibool	srv_print_innodb_lock_monitor;
+extern ibool	srv_print_innodb_tablespace_monitor;
+extern ibool	srv_print_verbose_log;
+extern ibool	srv_print_innodb_table_monitor;
+
+extern ibool	srv_lock_timeout_active;
+extern ibool	srv_monitor_active;
+extern ibool	srv_error_monitor_active;
+
+extern ulong	srv_n_spin_wait_rounds;
+extern ulong	srv_n_free_tickets_to_enter;
+extern ulong	srv_thread_sleep_delay;
+extern ulong	srv_spin_wait_delay;
+extern ibool	srv_priority_boost;
+
+extern	ulint	srv_mem_pool_size;
+extern	ulint	srv_lock_table_size;
+
+#ifdef UNIV_DEBUG
+extern	ibool	srv_print_thread_releases;
+extern	ibool	srv_print_lock_waits;
+extern	ibool	srv_print_buf_io;
+extern	ibool	srv_print_log_io;
+extern	ibool	srv_print_latch_waits;
+#else /* UNIV_DEBUG */
+# define srv_print_thread_releases	FALSE
+# define srv_print_lock_waits		FALSE
+# define srv_print_buf_io		FALSE
+# define srv_print_log_io		FALSE
+# define srv_print_latch_waits		FALSE
+#endif /* UNIV_DEBUG */
+
+extern ulint	srv_activity_count;
+extern ulint	srv_fatal_semaphore_wait_threshold;
+extern ulint	srv_dml_needed_delay;
+
+extern mutex_t*	kernel_mutex_temp;/* mutex protecting the server, trx structs,
+				query threads, and lock table: we allocate
+				it from dynamic memory to get it to the
+				same DRAM page as other hotspot semaphores */
+#define kernel_mutex (*kernel_mutex_temp)
+
+#define SRV_MAX_N_IO_THREADS	130
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+extern const char* srv_io_thread_op_info[];
+extern const char* srv_io_thread_function[];
+
+/* the number of the log write requests done */
+extern ulint srv_log_write_requests;
+
+/* the number of physical writes to the log performed */
+extern ulint srv_log_writes;
+
+/* amount of data written to the log files in bytes */
+extern ulint srv_os_log_written;
+
+/* amount of writes being done to the log files */
+extern ulint srv_os_log_pending_writes;
+
+/* we increase this counter when there is not enough space in the
+log buffer and we have to flush it */
+extern ulint srv_log_waits;
+
+/* variable that counts amount of data read in total (in bytes) */
+extern ulint srv_data_read;
+
+/* here we count the amount of data written in total (in bytes) */
+extern ulint srv_data_written;
+
+/* this variable counts the amount of times, when the doublewrite buffer
+was flushed */
+extern ulint srv_dblwr_writes;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+extern ulint srv_dblwr_pages_written;
+
+/* in this variable we store the number of write requests issued */
+extern ulint srv_buf_pool_write_requests;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+extern ulint srv_buf_pool_wait_free;
+
+/* variable to count the number of pages that were written from the
+buffer pool to disk */
+extern ulint srv_buf_pool_flushed;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+extern ulint srv_buf_pool_reads;
+
+/** Time in seconds between automatic buffer pool dumps */
+extern uint srv_auto_lru_dump;
+
+/** Status variables to be passed to MySQL */
+typedef struct export_var_struct export_struc;
+
+/** Status variables to be passed to MySQL */
+extern export_struc export_vars;
+
+/** The server system */
+typedef struct srv_sys_struct	srv_sys_t;
+
+/** The server system */
+extern srv_sys_t*	srv_sys;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Types of raw partitions in innodb_data_file_path */
+enum {
+	SRV_NOT_RAW = 0,	/*!< Not a raw partition */
+	SRV_NEW_RAW,		/*!< A 'newraw' partition, only to be
+				initialized */
+	SRV_OLD_RAW		/*!< An initialized raw partition */
+};
+
+/** Alternatives for the file flush option in Unix; see the InnoDB manual
+about what these mean */
+enum {
+	SRV_UNIX_FSYNC = 1,	/*!< fsync, the default */
+	SRV_UNIX_O_DSYNC,	/*!< open log files in O_SYNC mode */
+	SRV_UNIX_LITTLESYNC,	/*!< do not call os_file_flush()
+				when writing data files, but do flush
+				after writing to log files */
+	SRV_UNIX_NOSYNC,	/*!< do not flush after writing */
+	SRV_UNIX_O_DIRECT,	/*!< invoke os_file_set_nocache() on
+				data files */
+	SRV_UNIX_ALL_O_DIRECT  /*!< experimental method: the log files are also opened with O_DIRECT */
+};
+
+/** Alternatives for file i/o in Windows */
+enum {
+	SRV_WIN_IO_NORMAL = 1,	/*!< buffered I/O */
+	SRV_WIN_IO_UNBUFFERED	/*!< unbuffered I/O; this is the default */
+};
+
+/** Alternatives for srv_force_recovery. Non-zero values are intended
+to help the user get a damaged database up so that he can dump intact
+tables and rows with SELECT INTO OUTFILE. The database must not otherwise
+be used with these options! A bigger number below means that all precautions
+of lower numbers are included. */
+enum {
+	SRV_FORCE_IGNORE_CORRUPT = 1,	/*!< let the server run even if it
+					detects a corrupt page */
+	SRV_FORCE_NO_BACKGROUND	= 2,	/*!< prevent the main thread from
+					running: if a crash would occur
+					in purge, this prevents it */
+	SRV_FORCE_NO_TRX_UNDO = 3,	/*!< do not run trx rollback after
+					recovery */
+	SRV_FORCE_NO_IBUF_MERGE = 4,	/*!< prevent also ibuf operations:
+					if they would cause a crash, better
+					not do them */
+	SRV_FORCE_NO_UNDO_LOG_SCAN = 5,	/*!< do not look at undo logs when
+					starting the database: InnoDB will
+					treat even incomplete transactions
+					as committed */
+	SRV_FORCE_NO_LOG_REDO = 6	/*!< do not do the log roll-forward
+					in connection with recovery */
+};
+
+#ifndef UNIV_HOTBACKUP
+/** Types of threads existing in the system. */
+enum srv_thread_type {
+	SRV_COM = 1,	/**< threads serving communication and queries */
+	SRV_CONSOLE,	/**< thread serving console */
+	SRV_WORKER,	/**< threads serving parallelized queries and
+			queries released from lock wait */
+#if 0
+	/* Utility threads */
+	SRV_BUFFER,	/**< thread flushing dirty buffer blocks */
+	SRV_RECOVERY,	/**< threads finishing a recovery */
+	SRV_INSERT,	/**< thread flushing the insert buffer to disk */
+#endif
+	SRV_PURGE,	/**< thread purging undo records */
+	SRV_PURGE_WORKER,	/**< worker thread for undo purge (presumably; confirm) */
+	SRV_MASTER	/**< the master thread (whose type number must
+			be the biggest) */
+};
+
+/*********************************************************************//**
+Boots Innobase server.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the server. */
+UNIV_INTERN
+void
+srv_init(void);
+/*==========*/
+/*********************************************************************//**
+Frees the data structures created in srv_init(). */
+UNIV_INTERN
+void
+srv_free(void);
+/*==========*/
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. */
+UNIV_INTERN
+void
+srv_general_init(void);
+/*==================*/
+/*********************************************************************//**
+Gets the number of threads in the system.
+@return	sum of srv_n_threads[] */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void);
+/*===================*/
+/*********************************************************************//**
+Returns the calling thread type.
+@return	SRV_COM, ... */
+
+enum srv_thread_type
+srv_get_thread_type(void);
+/*=====================*/
+/*********************************************************************//**
+Sets the info describing an i/o thread current state. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str);	/*!< in: constant char string describing the
+				state */
+/*********************************************************************//**
+Releases threads of the type given from suspension in the thread table.
+NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+	enum srv_thread_type	type,	/*!< in: thread type */
+	ulint			n);	/*!< in: number of threads to release */
+/*********************************************************************//**
+The master thread controlling the server.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/*************************************************************************
+The undo purge thread. */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+	void*	arg);	/* in: a dummy parameter required by
+			os_thread_create */
+/*************************************************************************
+An undo purge worker thread (role inferred from the name; confirm). */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_worker_thread(
+/*====================*/
+	void*	arg);	/* in: thread argument (usage not shown here) */
+/*******************************************************************//**
+Tells the Innobase server that there has been activity in the database
+and wakes up the master thread if it is suspended (not sleeping). Used
+in the MySQL interface. Note that there is a small chance that the master
+thread stays suspended (we do not protect our operation with the kernel
+mutex, for performance reasons). */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void);
+/*===============================*/
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void);
+/*========================*/
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx);	/*!< in: transaction object associated with the
+			thread */
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB. This must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx);	/*!< in: transaction object associated with the
+			thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx);	/*!< in: transaction object associated with the
+			thread */
+/*********************************************************************//**
+This must be called when a thread exits InnoDB. */
+UNIV_INTERN
+void
+srv_conc_exit_innodb(
+/*=================*/
+	trx_t*	trx);	/*!< in: transaction object associated with the
+			thread */
+/***************************************************************//**
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return. DB_LOCK_WAIT_TIMEOUT and DB_DEADLOCK
+are possible errors. DB_DEADLOCK is returned if selective deadlock
+resolution chose this transaction as a victim. */
+UNIV_INTERN
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the MySQL
+				OS thread */
+/********************************************************************//**
+Releases a MySQL OS thread waiting for a lock to be released, if the
+thread is already suspended. */
+UNIV_INTERN
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr);	/*!< in: query thread associated with the
+				MySQL OS thread	 */
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/*************************************************************************
+A thread which prints warnings about semaphore waits which have lasted
+too long. These can be used to track bugs which cause hangs.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/*********************************************************************//**
+A thread which restores the buffer pool from a dump file on startup and does
+periodic buffer pool dumps.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_LRU_dump_restore_thread(
+/*====================*/
+	void*	arg);	/*!< in: a dummy parameter required by
+			os_thread_create */
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor.
+@return FALSE if not all information printed
+due to failure to obtain necessary mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end);	/*!< out: file position of the end of
+				the list of active transactions */
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL */
+UNIV_INTERN
+void
+srv_export_innodb_status(void);
+/*==========================*/
+
+/** Thread slot in the thread table */
+typedef struct srv_slot_struct	srv_slot_t;
+
+/** Thread table is an array of slots */
+typedef srv_slot_t	srv_table_t;
+
+/** Status variables to be passed to MySQL */
+struct export_var_struct{
+	ulint innodb_data_pending_reads;	/*!< Pending reads */
+	ulint innodb_data_pending_writes;	/*!< Pending writes */
+	ulint innodb_data_pending_fsyncs;	/*!< Pending fsyncs */
+	ulint innodb_data_fsyncs;		/*!< Number of fsyncs so far */
+	ulint innodb_data_read;			/*!< Data bytes read */
+	ulint innodb_data_writes;		/*!< I/O write requests */
+	ulint innodb_data_written;		/*!< Data bytes written */
+	ulint innodb_data_reads;		/*!< I/O read requests */
+	ulint innodb_dict_tables;		/*!< NOTE(review): undocumented; presumably a dictionary table count -- confirm */
+	ulint innodb_buffer_pool_pages_total;	/*!< Buffer pool size */
+	ulint innodb_buffer_pool_pages_data;	/*!< Data pages */
+	ulint innodb_buffer_pool_pages_dirty;	/*!< Dirty data pages */
+	ulint innodb_buffer_pool_pages_misc;	/*!< Miscellaneous pages */
+	ulint innodb_buffer_pool_pages_free;	/*!< Free pages */
+#ifdef UNIV_DEBUG
+	ulint innodb_buffer_pool_pages_latched;	/*!< Latched pages */
+#endif /* UNIV_DEBUG */
+	ulint innodb_buffer_pool_read_requests;	/*!< buf_pool->stat.n_page_gets */
+	ulint innodb_buffer_pool_reads;		/*!< srv_buf_pool_reads */
+	ulint innodb_buffer_pool_wait_free;	/*!< srv_buf_pool_wait_free */
+	ulint innodb_buffer_pool_pages_flushed;	/*!< srv_buf_pool_flushed */
+	ulint innodb_buffer_pool_write_requests;/*!< srv_buf_pool_write_requests */
+	ulint innodb_buffer_pool_read_ahead;	/*!< srv_read_ahead */
+	ulint innodb_buffer_pool_read_ahead_evicted;/*!< srv_read_ahead evicted*/
+        ulint innodb_deadlocks;                 /*!< NOTE(review): presumably the number of deadlocks -- confirm */
+	ulint innodb_dblwr_pages_written;	/*!< srv_dblwr_pages_written */
+	ulint innodb_dblwr_writes;		/*!< srv_dblwr_writes */
+	ibool innodb_have_atomic_builtins;	/*!< HAVE_ATOMIC_BUILTINS */
+	ulint innodb_log_waits;			/*!< srv_log_waits */
+	ulint innodb_log_write_requests;	/*!< srv_log_write_requests */
+	ulint innodb_log_writes;		/*!< srv_log_writes */
+	ulint innodb_os_log_written;		/*!< srv_os_log_written */
+	ulint innodb_os_log_fsyncs;		/*!< fil_n_log_flushes */
+	ulint innodb_os_log_pending_writes;	/*!< srv_os_log_pending_writes */
+	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
+	ulint innodb_page_size;			/*!< UNIV_PAGE_SIZE */
+	ulint innodb_pages_created;		/*!< buf_pool->stat.n_pages_created */
+	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read */
+	ulint innodb_pages_written;		/*!< buf_pool->stat.n_pages_written */
+	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
+	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
+	ib_int64_t innodb_row_lock_time;	/*!< srv_n_lock_wait_time
+						/ 1000 */
+	ulint innodb_row_lock_time_avg;		/*!< srv_n_lock_wait_time
+						/ 1000
+						/ srv_n_lock_wait_count */
+	ulint innodb_row_lock_time_max;		/*!< srv_n_lock_max_wait_time
+						/ 1000 */
+	ulint innodb_rows_read;			/*!< srv_n_rows_read */
+	ulint innodb_rows_inserted;		/*!< srv_n_rows_inserted */
+	ulint innodb_rows_updated;		/*!< srv_n_rows_updated */
+	ulint innodb_rows_deleted;		/*!< srv_n_rows_deleted */
+};
+
+/** The server system struct */
+struct srv_sys_struct{
+	srv_table_t*	threads;	/*!< server thread table */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			tasks;		/*!< task queue */
+};
+
+extern ulint	srv_n_threads_active[];
+#else /* !UNIV_HOTBACKUP */
+# define srv_use_checksums			TRUE
+# define srv_use_adaptive_hash_indexes		FALSE
+# define srv_force_recovery			0UL
+# define srv_set_io_thread_op_info(t,info)	((void) 0)
+# define srv_is_being_started			0
+# define srv_win_file_flush_method		SRV_WIN_IO_UNBUFFERED
+# define srv_unix_file_flush_method		SRV_UNIX_O_DSYNC
+# define srv_start_raw_disk_in_use		0
+# define srv_file_per_table			1
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/srv0srv.ic b/storage/xtradb/include/srv0srv.ic
new file mode 100644
index 00000000000..8a1a678a016
--- /dev/null
+++ b/storage/xtradb/include/srv0srv.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0srv.ic
+Server main program
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/srv0start.h b/storage/xtradb/include/srv0start.h
new file mode 100644
index 00000000000..8abf15da9c1
--- /dev/null
+++ b/storage/xtradb/include/srv0start.h
@@ -0,0 +1,134 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/srv0start.h
+Starts the Innobase database server
+
+Created 10/10/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef srv0start_h
+#define srv0start_h
+
+#include "univ.i"
+#include "ut0byte.h"
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: converts slashes to backslashes. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+	char*	str);	/*!< in/out: null-terminated character string */
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file.
+@return	TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+	char*	str);	/*!< in/out: the data file path string */
+/*********************************************************************//**
+Reads log group home directories from a character string given in
+the .cnf file.
+@return	TRUE if ok, FALSE on parse error */
+UNIV_INTERN
+ibool
+srv_parse_log_group_home_dirs(
+/*==========================*/
+	char*	str);	/*!< in/out: character string */
+/*********************************************************************//**
+Frees the memory allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs(). */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void);
+/*==========================*/
+/*********************************************************************//**
+Adds a slash or a backslash to the end of a string if it is missing
+and the string is not empty.
+@return	string which has the separator if the string is not empty */
+UNIV_INTERN
+char*
+srv_add_path_separator_if_needed(
+/*=============================*/
+	char*	str);	/*!< in: null-terminated character string */
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Starts Innobase and creates a new database if database files
+are not found and the user wants.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void);
+/*====================================*/
+/****************************************************************//**
+Shuts down the Innobase database.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void);
+/*=============================*/
+/** Log sequence number at shutdown */
+extern	ib_uint64_t	srv_shutdown_lsn;
+/** Log sequence number immediately after startup */
+extern	ib_uint64_t	srv_start_lsn;
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware(void);
+#endif
+
+#ifdef HAVE_DARWIN_THREADS
+/** TRUE if the F_FULLFSYNC option is available */
+extern	ibool	srv_have_fullfsync;
+#endif
+
+/** TRUE if the server is being started */
+extern	ibool	srv_is_being_started;
+/** TRUE if the server was successfully started */
+extern	ibool	srv_was_started;
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+extern	ibool	srv_startup_is_before_trx_rollback_phase;
+
+/** TRUE if a raw partition is in use */
+extern	ibool	srv_start_raw_disk_in_use;
+
+
+/** Shutdown state */
+enum srv_shutdown_state {
+	SRV_SHUTDOWN_NONE = 0,	/*!< Database running normally */
+	SRV_SHUTDOWN_CLEANUP,	/*!< Cleaning up in
+				logs_empty_and_mark_files_at_shutdown() */
+	SRV_SHUTDOWN_LAST_PHASE,/*!< Last phase after ensuring that
+				the buffer pool can be freed: flush
+				all file spaces and close all files */
+	SRV_SHUTDOWN_EXIT_THREADS/*!< Exit all threads */
+};
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+extern	enum srv_shutdown_state	srv_shutdown_state;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log 'spaces' have id's >= this */
+#define SRV_LOG_SPACE_FIRST_ID		0xFFFFFFF0UL
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.h b/storage/xtradb/include/sync0arr.h
new file mode 100644
index 00000000000..5f1280f5e28
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.h
@@ -0,0 +1,142 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.h
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0arr_h
+#define sync0arr_h
+
+#include "univ.i"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+
+/** Synchronization wait array cell */
+typedef struct sync_cell_struct		sync_cell_t;
+/** Synchronization wait array */
+typedef struct sync_array_struct	sync_array_t;
+
+/** Parameters for sync_array_create() @{ */
+#define SYNC_ARRAY_OS_MUTEX	1	/*!< protected by os_mutex_t */
+#define SYNC_ARRAY_MUTEX	2	/*!< protected by mutex_t */
+/* @} */
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return	own: created wait array */
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+	ulint	n_cells,	/*!< in: number of cells in the array
+				to create */
+	ulint	protection);	/*!< in: either SYNC_ARRAY_OS_MUTEX or
+				SYNC_ARRAY_MUTEX: determines the type
+				of mutex protecting the data structure */
+/******************************************************************//**
+Frees the resources in a wait array. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+	sync_array_t*	arr);	/*!< in, own: sync wait array */
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	void*		object, /*!< in: pointer to the object to wait for */
+	ulint		type,	/*!< in: lock request type */
+	const char*	file,	/*!< in: file where requested */
+	ulint		line,	/*!< in: line where requested */
+	ulint*		index); /*!< out: index of the reserved cell */
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index);	 /*!< in: index of the reserved cell */
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index);	/*!< in: index of the cell in array */
+/**********************************************************************//**
+Note that one of the wait objects was signalled. */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+	sync_array_t*	arr);	/*!< in: wait array */
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void);
+/*====================================*/
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return	TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void);
+/*=============================*/
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+	sync_array_t*	arr);	/*!< in: sync wait array */
+/**********************************************************************//**
+Prints info of the wait array. */
+UNIV_INTERN
+void
+sync_array_print_info(
+/*==================*/
+	FILE*		file,	/*!< in: file where to print */
+	sync_array_t*	arr);	/*!< in: wait array */
+
+
+#ifndef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/sync0arr.ic b/storage/xtradb/include/sync0arr.ic
new file mode 100644
index 00000000000..bf57f5b2dc2
--- /dev/null
+++ b/storage/xtradb/include/sync0arr.ic
@@ -0,0 +1,27 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0arr.ic
+The wait array for synchronization primitives
+
+Inline code
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
diff --git a/storage/xtradb/include/sync0rw.h b/storage/xtradb/include/sync0rw.h
new file mode 100644
index 00000000000..4edf93f4042
--- /dev/null
+++ b/storage/xtradb/include/sync0rw.h
@@ -0,0 +1,588 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.h
+The read-write lock (for threads, not for database transactions)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0rw_h
+#define sync0rw_h
+
+#include "univ.i"
+#ifndef UNIV_HOTBACKUP
+#include "ut0lst.h"
+#include "sync0sync.h"
+#include "os0sync.h"
+
+/* The following undef is to prevent a name conflict with a macro
+in MySQL: */
+#undef rw_lock_t
+#endif /* !UNIV_HOTBACKUP */
+
+/* Latch types; these are used also in btr0btr.h: keep the numerical values
+smaller than 30 and the order of the numerical values like below! */
+#define RW_S_LATCH	1
+#define	RW_X_LATCH	2
+#define	RW_NO_LATCH	3
+
+#ifndef UNIV_HOTBACKUP
+/* We decrement lock_word by this amount for each x_lock. It is also the
+start value for the lock_word, meaning that it limits the maximum number
+of concurrent read locks before the rw_lock breaks. The current value of
+0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
+#define X_LOCK_DECR		0x00100000
+
+typedef struct rw_lock_struct		rw_lock_t;
+#ifdef UNIV_SYNC_DEBUG
+typedef struct rw_lock_debug_struct	rw_lock_debug_t;
+#endif /* UNIV_SYNC_DEBUG */
+
+typedef UT_LIST_BASE_NODE_T(rw_lock_t)	rw_lock_list_t;
+
+extern rw_lock_list_t	rw_lock_list;
+extern mutex_t		rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+
+acquired in addition to the mutex protecting the lock. */
+extern mutex_t		rw_lock_debug_mutex;
+extern os_event_t	rw_lock_debug_event;	/*!< If deadlock detection does
+					not get immediately the mutex it
+					may wait for this event */
+extern ibool		rw_lock_debug_waiters;	/*!< This is set to TRUE, if
+					there may be waiters for the event */
+#endif /* UNIV_SYNC_DEBUG */
+
+/** number of spin waits on rw-latches,
+resulted during shared (read) locks */
+extern	ib_int64_t	rw_s_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during shared (read) locks */
+extern	ib_int64_t	rw_s_spin_round_count;
+/** number of unlocks (that unlock shared locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern	ib_int64_t	rw_s_exit_count;
+/** number of OS waits on rw-latches,
+resulted during shared (read) locks */
+extern	ib_int64_t	rw_s_os_wait_count;
+/** number of spin waits on rw-latches,
+resulted during exclusive (write) locks */
+extern	ib_int64_t	rw_x_spin_wait_count;
+/** number of spin loop rounds on rw-latches,
+resulted during exclusive (write) locks */
+extern	ib_int64_t	rw_x_spin_round_count;
+/** number of OS waits on rw-latches,
+resulted during exclusive (write) locks */
+extern	ib_int64_t	rw_x_os_wait_count;
+/** number of unlocks (that unlock exclusive locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+extern	ib_int64_t	rw_x_exit_count;
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+#  define rw_lock_create(L, level) 					\
+	rw_lock_create_func((L), (level), #L, __FILE__, __LINE__)
+# else /* UNIV_SYNC_DEBUG */
+#  define rw_lock_create(L, level) 					\
+	rw_lock_create_func((L), #L, __FILE__, __LINE__)
+# endif /* UNIV_SYNC_DEBUG */
+#else /* UNIV_DEBUG */
+# define rw_lock_create(L, level) 					\
+	rw_lock_create_func((L), #L, NULL, 0)
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Creates, or rather, initializes an rw-lock object in a specified memory
+location (which must be appropriately aligned). The rw-lock is initialized
+to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free
+is necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+	rw_lock_t*	lock,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name, 	/*!< in: mutex name */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint 		cline);		/*!< in: file line where created */
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Removes an rw-lock object from the global list. The
+rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free(
+/*=========*/
+	rw_lock_t*	lock);	/*!< in: rw-lock */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized and that there are no
+simultaneous shared and exclusive locks.
+@return	TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+	rw_lock_t*	lock);	/*!< in: rw-lock */
+#endif /* UNIV_DEBUG */
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock(M)	rw_lock_s_lock_func(\
+		(M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_gen(M, P)	rw_lock_s_lock_func(\
+		(M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw s-locking, not the
+corresponding function. */
+
+#define rw_lock_s_lock_nowait(M, F, L)    rw_lock_s_lock_low(\
+					  (M), 0, (F), (L))
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return	TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass __attribute__((unused)),
+				/*!< in: pass value; != 0, if the lock will be
+				passed to another thread to unlock */
+	const char*	file_name, /*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function, except if
+you supply the file name and line number. Lock an rw-lock in shared mode
+for the current thread. If the rw-lock is locked in exclusive mode, or
+there is an exclusive lock request waiting, the function spins a preset
+time (controlled by SYNC_SPIN_ROUNDS), waiting for the lock, before
+suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return	TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
+Releases a shared mode lock. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock);	/*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_s_unlock_gen(L, P)	rw_lock_s_unlock_func(P, L)
+#else
+# define rw_lock_s_unlock_gen(L, P)	rw_lock_s_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases a shared mode lock. */
+#define rw_lock_s_unlock(L)		rw_lock_s_unlock_gen(L, 0)
+
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock(M)	rw_lock_x_lock_func(\
+		(M), 0, __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_gen(M, P)	rw_lock_x_lock_func(\
+		(M), (P), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macros should be used in rw x-locking, not the
+corresponding function. */
+
+#define rw_lock_x_lock_nowait(M)	rw_lock_x_lock_func_nowait(\
+		(M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeeds, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+/******************************************************************//**
+Releases an exclusive mode lock. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock);	/*!< in/out: rw-lock */
+
+#ifdef UNIV_SYNC_DEBUG
+# define rw_lock_x_unlock_gen(L, P)	rw_lock_x_unlock_func(P, L)
+#else
+# define rw_lock_x_unlock_gen(L, P)	rw_lock_x_unlock_func(L)
+#endif
+/*******************************************************************//**
+Releases an exclusive mode lock. */
+#define rw_lock_x_unlock(L)		rw_lock_x_unlock_gen(L, 0)
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	const char*	file_name,	/*!< in: file name where requested */
+	ulint		line);		/*!< in: line where lock requested */
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	const char*	file_name,	/*!< in: file name where requested */
+	ulint		line);		/*!< in: line where lock requested */
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want that the current
+thread is able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+	rw_lock_t*	lock);	/*!< in: lock which was x-locked in the
+				buffer read */
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock);	/*!< in/out: rw-lock */
+/******************************************************************//**
+Returns the x-lock (writer) count of the lock. Does not reserve the lock
+mutex, so the caller must be sure it is not changed during the call.
+@return	number of x-locks held on the lock */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return	1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Returns the write-status of the lock - this function made more sense
+with the old rw_lock implementation.
+@return	RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Returns the number of readers.
+@return	number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	const rw_lock_t*	lock);	/*!< in: rw-lock */
+/******************************************************************//**
+Decrements lock_word the specified amount if it is greater than 0.
+This is used by both s_lock and x_lock operations.
+@return	TRUE if decr occurs */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	ulint		amount);	/*!< in: amount to decrement */
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return	lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	ulint		amount);	/*!< in: amount to increment */
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+	rw_lock_t*	lock,		/*!< in/out: lock to work on */
+	ibool		recursive);	/*!< in: TRUE if recursion
+					allowed */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0. */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+	__attribute__((warn_unused_result));
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Checks if somebody has locked the rw-lock in the specified mode. */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type);	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+	rw_lock_t*	lock);	/*!< in: rw-lock */
+/***************************************************************//**
+Prints debug info of currently locked rw-locks. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+	FILE*	file);		/*!< in: file where to print */
+/***************************************************************//**
+Returns the number of currently locked rw-locks.
+Works only in the debug version.
+@return	number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void);
+/*==================*/
+
+/*#####################################################################*/
+
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void);
+/*==========================*/
+/******************************************************************//**
+Releases the debug mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void);
+/*==========================*/
+/*********************************************************************//**
+Prints info of a debug struct. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+	rw_lock_debug_t*	info);	/*!< in: debug struct */
+#endif /* UNIV_SYNC_DEBUG */
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! */
+
+/** The structure used in the spin lock implementation of a read-write
+lock. Several threads may have a shared lock simultaneously in this
+lock, but only one writer may have an exclusive lock, in which case no
+shared locks are allowed. To prevent starving of a writer blocked by
+readers, a writer may queue for x-lock by decrementing lock_word: no
+new readers will be let in while the thread waits for readers to
+exit. */
+struct rw_lock_struct {
+	volatile lint	lock_word;
+				/*!< Holds the state of the lock. */
+	volatile ulint	waiters;/*!< 1: there are waiters */
+	volatile ibool	recursive;/*!< Default value FALSE which means the lock
+				is non-recursive. The value is typically set
+				to TRUE making normal rw_locks recursive. In
+				case of asynchronous IO, when a non-zero
+				value of 'pass' is passed then we keep the
+				lock non-recursive.
+				This flag also tells us about the state of
+				writer_thread field. If this flag is set
+				then writer_thread MUST contain the thread
+				id of the current x-holder or wait-x thread.
+				This flag must be reset in x_unlock
+				functions before incrementing the lock_word */
+	volatile os_thread_id_t	writer_thread;
+				/*!< Thread id of writer thread. Is only
+				guaranteed to have sane and non-stale
+				value iff recursive flag is set. */
+	os_event_t	event;	/*!< Used by sync0arr.c for thread queueing */
+	os_event_t	wait_ex_event;
+				/*!< Event for next-writer to wait on. A thread
+				must decrement lock_word before waiting. */
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	mutex_t	mutex;		/*!< The mutex protecting rw_lock_struct */
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	UT_LIST_NODE_T(rw_lock_t) list;
+				/*!< All allocated rw locks are put into a
+				list */
+#ifdef UNIV_SYNC_DEBUG
+	UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+				/*!< In the debug version: pointer to the debug
+				info list of the lock */
+	ulint	level;		/*!< Level in the global latching order. */
+#endif /* UNIV_SYNC_DEBUG */
+	ulint count_os_wait;	/*!< Count of os_waits. May not be accurate */
+	//const char*	cfile_name;/*!< File name where lock created */
+	const char*	lock_name;/*!< lock name */
+        /* last s-lock file/line is not guaranteed to be correct */
+	const char*	last_s_file_name;/*!< File name where last s-locked */
+	const char*	last_x_file_name;/*!< File name where last x-locked */
+	ibool		writer_is_wait_ex;
+				/*!< This is TRUE if the writer field is
+				RW_LOCK_WAIT_EX; this field is located far
+				from the memory update hotspot fields which
+				are at the start of this struct, thus we can
+				peek this field without causing much memory
+				bus traffic */
+	//unsigned	cline:14;	/*!< Line where created */
+	unsigned	last_s_line:14;	/*!< Line number where last time s-locked */
+	unsigned	last_x_line:14;	/*!< Line number where last time x-locked */
+#ifdef UNIV_DEBUG
+	ulint	magic_n;	/*!< RW_LOCK_MAGIC_N */
+/** Value of rw_lock_struct::magic_n */
+#define	RW_LOCK_MAGIC_N	22643
+#endif /* UNIV_DEBUG */
+};
+
+#ifdef UNIV_SYNC_DEBUG
+/** The structure for storing debug info of an rw-lock */
+struct	rw_lock_debug_struct {
+
+	os_thread_id_t thread_id;  /*!< The thread id of the thread which
+				locked the rw-lock */
+	ulint	pass;		/*!< Pass value given in the lock operation */
+	ulint	lock_type;	/*!< Type of the lock: RW_LOCK_EX,
+				RW_LOCK_SHARED, RW_LOCK_WAIT_EX */
+	const char*	file_name;/*!< File name where the lock was obtained */
+	ulint	line;		/*!< Line where the rw-lock was locked */
+	UT_LIST_NODE_T(rw_lock_debug_t) list;
+				/*!< Debug structs are linked in a two-way
+				list */
+};
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifndef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/sync0rw.ic b/storage/xtradb/include/sync0rw.ic
new file mode 100644
index 00000000000..7116f1b7c9b
--- /dev/null
+++ b/storage/xtradb/include/sync0rw.ic
@@ -0,0 +1,624 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0rw.ic
+The read-write lock (for threads)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS),
+waiting for the lock before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line);	/*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Inserts the debug information for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		pass,		/*!< in: pass value */
+	ulint		lock_type,	/*!< in: lock type */
+	const char*	file_name,	/*!< in: file where requested */
+	ulint		line);		/*!< in: line where requested */
+/******************************************************************//**
+Removes a debug information struct for an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		pass,		/*!< in: pass value */
+	ulint		lock_type);	/*!< in: lock type */
+#endif /* UNIV_SYNC_DEBUG */
+
+/********************************************************************//**
+Check if there are threads waiting for the rw-lock.
+@return	1 if waiters, 0 otherwise */
+UNIV_INLINE
+ulint
+rw_lock_get_waiters(
+/*================*/
+	const rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return(lock->waiters);
+}
+
+/********************************************************************//**
+Sets lock->waiters to 1. It is not an error if lock->waiters is already
+1. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_set_waiter_flag(
+/*====================*/
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	os_compare_and_swap_ulint(&lock->waiters, 0, 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+	lock->waiters = 1;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/********************************************************************//**
+Resets lock->waiters to 0. It is not an error if lock->waiters is already
+0. On platforms where ATOMIC builtins are used this function enforces a
+memory barrier. */
+UNIV_INLINE
+void
+rw_lock_reset_waiter_flag(
+/*======================*/
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	os_compare_and_swap_ulint(&lock->waiters, 1, 0);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+	lock->waiters = 0;
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Returns the write-status of the lock, decoded from one read of lock_word
+(this function made more sense with the old rw_lock implementation).
+@return	RW_LOCK_NOT_LOCKED, RW_LOCK_EX, RW_LOCK_WAIT_EX */
+UNIV_INLINE
+ulint
+rw_lock_get_writer(
+/*===============*/
+	const rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	lint lock_word = lock->lock_word;
+	if (lock_word > 0) {
+		/* Positive lock_word: report NOT_LOCKED even if s-locked,
+		like the writer member of the old lock implementation. */
+		return(RW_LOCK_NOT_LOCKED);
+	} else if (((-lock_word) % X_LOCK_DECR) == 0) {
+		return(RW_LOCK_EX);	/* lock_word is 0 or -k * X_LOCK_DECR */
+	} else {
+                ut_ad(lock_word > -X_LOCK_DECR);
+		return(RW_LOCK_WAIT_EX);
+	}
+}
+
+/******************************************************************//**
+Returns the number of readers, decoded from a single read of lock_word.
+@return	number of readers */
+UNIV_INLINE
+ulint
+rw_lock_get_reader_count(
+/*=====================*/
+	const rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	lint lock_word = lock->lock_word;
+	if (lock_word > 0) {
+		/* s-locked (or free), no x-waiters */
+		return(X_LOCK_DECR - lock_word);
+	} else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
+		/* s-locked, with x-waiters */
+		return((ulint)(-lock_word));
+	}
+	return(0);	/* lock_word == 0 or lock_word <= -X_LOCK_DECR */
+}
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS /* accessor for mutex-based builds only */
+UNIV_INLINE
+mutex_t*
+rw_lock_get_mutex(
+/*==============*/
+	rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	return(&(lock->mutex));	/* address of the lock's mutex member */
+}
+#endif /* !INNODB_RW_LOCKS_USE_ATOMICS */
+
+/******************************************************************//**
+Returns the x-lock count of the lock, derived from lock_word. Does not reserve
+the lock mutex, so the caller must be sure it is not changed during the call.
+@return	0 if lock_word > 0 or not a multiple of X_LOCK_DECR, else the count */
+UNIV_INLINE
+ulint
+rw_lock_get_x_lock_count(
+/*=====================*/
+	const rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	lint lock_copy = lock->lock_word;
+	/* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
+	if (lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
+		return(0);
+	}
+	return(((-lock_copy) / X_LOCK_DECR) + 1); /* 0 => 1, -X_LOCK_DECR => 2 */
+}
+
+/******************************************************************//**
+Two different implementations for decrementing the lock_word of a rw_lock:
+one for systems supporting atomic operations, one for others. This
+does not support recursive x-locks: they should be handled by the caller and
+need not be atomic since they are performed by the current lock holder.
+The decrement is made only while lock_word is positive.
+@return	TRUE if decr occurs, FALSE if lock_word was not positive */
+UNIV_INLINE
+ibool
+rw_lock_lock_word_decr(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	ulint		amount)		/*!< in: amount to decrement */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+        lint local_lock_word = lock->lock_word;
+	while (local_lock_word > 0) {
+		if (os_compare_and_swap_lint(&lock->lock_word,
+					     local_lock_word,
+					     local_lock_word - amount)) {
+			return(TRUE);
+		}
+		local_lock_word = lock->lock_word; /* CAS failed: re-read, retry */
+	}
+	return(FALSE);	/* lock_word <= 0: nothing was decremented */
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+	ibool success = FALSE;
+	mutex_enter(&(lock->mutex));
+	if (lock->lock_word > 0) {
+		lock->lock_word -= amount;
+		success = TRUE;
+	}
+	mutex_exit(&(lock->mutex));
+	return(success);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Increments lock_word the specified amount and returns new value.
+@return	lock->lock_word after increment */
+UNIV_INLINE
+lint
+rw_lock_lock_word_incr(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	ulint		amount)		/*!< in: amount of increment */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	return(os_atomic_increment_lint(&lock->lock_word, amount));
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+	lint local_lock_word;
+
+	mutex_enter(&(lock->mutex));
+
+	lock->lock_word += amount;
+	local_lock_word = lock->lock_word;
+
+	mutex_exit(&(lock->mutex));
+
+        return(local_lock_word);
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+This function sets the lock->writer_thread and lock->recursive fields.
+For platforms where we are using atomic builtins instead of lock->mutex
+it sets the lock->writer_thread field using atomics to ensure memory
+ordering. Note that it is assumed that the caller of this function
+effectively owns the lock i.e.: nobody else is allowed to modify
+lock->writer_thread at this point in time.
+The protocol is that lock->writer_thread MUST be updated BEFORE the
+lock->recursive flag is set. */
+UNIV_INLINE
+void
+rw_lock_set_writer_id_and_recursion_flag(
+/*=====================================*/
+	rw_lock_t*	lock,		/*!< in/out: lock to work on */
+	ibool		recursive)	/*!< in: TRUE if recursion
+					allowed */
+{
+	os_thread_id_t	curr_thread	= os_thread_get_curr_id();
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	os_thread_id_t	local_thread;
+	ibool		success;
+
+	/* Prevent Valgrind warnings about writer_thread being
+	uninitialized.  It does not matter if writer_thread is
+	uninitialized, because we are comparing writer_thread against
+	itself, and the operation should always succeed. */
+	UNIV_MEM_VALID(&lock->writer_thread, sizeof lock->writer_thread);
+
+	local_thread = lock->writer_thread;
+	success = os_compare_and_swap_thread_id(
+		&lock->writer_thread, local_thread, curr_thread);
+	ut_a(success);
+	lock->recursive = recursive;
+
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	mutex_enter(&lock->mutex);
+	lock->writer_thread = curr_thread;
+	lock->recursive = recursive;
+	mutex_exit(&lock->mutex);
+
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+}
+
+/******************************************************************//**
+Low-level function which tries to lock an rw-lock in s-mode. Performs no
+spinning.
+@return	TRUE if success */
+UNIV_INLINE
+ibool
+rw_lock_s_lock_low(
+/*===============*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass __attribute__((unused)),
+				/*!< in: pass value; != 0, if the lock will be
+				passed to another thread to unlock */
+	const char*	file_name, /*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+	if (!rw_lock_lock_word_decr(lock, 1)) {
+		/* Locking did not succeed */
+		return(FALSE);
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
+#endif
+	/* These debugging values are not set safely: they may be incorrect
+        or even refer to a line that is invalid for the file name. */
+	lock->last_s_file_name = file_name;
+	lock->last_s_line = line;
+
+	return(TRUE);	/* locking succeeded */
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in s-mode when we know that it
+is possible and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_s_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	const char*	file_name,	/*!< in: file name where requested */
+	ulint		line)		/*!< in: line where lock requested */
+{
+	ut_ad(lock->lock_word == X_LOCK_DECR);
+
+	/* New reader: plain decrement of lock_word, no mutex or atomics */
+	lock->lock_word--;
+
+	lock->last_s_file_name = file_name;
+	lock->last_s_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+Low-level function which locks an rw-lock in x-mode when we know that it
+is not locked and no one else is currently accessing the rw-lock structure.
+Then we can do the locking without reserving the mutex. */
+UNIV_INLINE
+void
+rw_lock_x_lock_direct(
+/*==================*/
+	rw_lock_t*	lock,		/*!< in/out: rw-lock */
+	const char*	file_name,	/*!< in: file name where requested */
+	ulint		line)		/*!< in: line where lock requested */
+{
+	ut_ad(rw_lock_validate(lock));
+	ut_ad(lock->lock_word == X_LOCK_DECR);
+
+	lock->lock_word -= X_LOCK_DECR;
+	lock->writer_thread = os_thread_get_curr_id();
+	lock->recursive = TRUE;
+
+	lock->last_x_file_name = file_name;
+	lock->last_x_line = line;
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in shared mode for the current thread. If the rw-lock is locked
+in exclusive mode, or there is an exclusive lock request waiting, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for
+the lock, before suspending the thread. */
+UNIV_INLINE
+void
+rw_lock_s_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	/* NOTE: As we do not know the thread ids for threads which have
+	s-locked a latch, and s-lockers will be served only after waiting
+	x-lock requests have been fulfilled, then if this thread already
+	owns an s-lock here, it may end up in a deadlock with another thread
+	which requests an x-lock here. Therefore, we will forbid recursive
+	s-locking of a latch: the following assert will warn the programmer
+	of the possibility of this kind of a deadlock. If we want to implement
+	safe recursive s-locking, we should keep in a list the thread ids of
+	the threads which have s-locked a latch. This would use some CPU
+	time. */
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* TODO: study performance of UNIV_LIKELY branch prediction hints. */
+	if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+		return; /* Success */
+	} else {
+		/* Did not succeed, try spin wait */
+
+		rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+		return;
+	}
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread if the lock can be
+obtained immediately.
+@return	TRUE if success (lock was free, or recursive relock by the writer) */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_func_nowait(
+/*=======================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	os_thread_id_t	curr_thread	= os_thread_get_curr_id();
+
+	ibool success;
+
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	success = os_compare_and_swap_lint(&lock->lock_word, X_LOCK_DECR, 0);
+#else
+	/* Same X_LOCK_DECR -> 0 transition, made under lock->mutex */
+	success = FALSE;
+	mutex_enter(&(lock->mutex));
+	if (lock->lock_word == X_LOCK_DECR) {
+		lock->lock_word = 0;
+		success = TRUE;
+	}
+	mutex_exit(&(lock->mutex));
+
+#endif
+	if (success) {
+		rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+
+	} else if (lock->recursive
+		   && os_thread_eq(lock->writer_thread, curr_thread)) {
+		/* Relock: this lock_word modification is safe since no other
+		threads can modify (lock, unlock, or reserve) lock_word while
+		there is an exclusive writer and this is the writer thread. */
+		lock->lock_word -= X_LOCK_DECR;
+
+		ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
+
+	} else {
+		/* Failure */
+		return(FALSE);
+	}
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+#endif
+
+	lock->last_x_file_name = file_name;
+	lock->last_x_line = line;
+
+	ut_ad(rw_lock_validate(lock));
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Releases a shared mode lock; signals wait_ex_event if lock_word becomes 0. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	ut_ad((lock->lock_word % X_LOCK_DECR) != 0);
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+#endif
+
+	/* Increment lock_word to indicate 1 less reader */
+	if (rw_lock_lock_word_incr(lock, 1) == 0) {
+		/* lock_word became 0: a wait_ex waiter exists. It may not
+		be asleep, but we signal anyway. We do not wake other
+		waiters, because they cannot exist without a wait_ex waiter
+		and the wait_ex waiter goes first. */
+		os_event_set(lock->wait_ex_event);
+		sync_array_object_signalled(sync_primary_wait_array);
+
+	}
+
+	ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+	rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases a shared mode lock when we know there are no waiters and no one
+else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_s_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	ut_ad(lock->lock_word < X_LOCK_DECR);
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+#endif
+
+	/* Decrease reader count by incrementing lock_word */
+	lock->lock_word++;
+
+	ut_ad(!lock->waiters);
+	ut_ad(rw_lock_validate(lock));
+#ifdef UNIV_SYNC_PERF_STAT
+	rw_s_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock; signals lock->event if freed with waiters. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_func(
+/*==================*/
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock may have
+				been passed to another thread to unlock */
+#endif
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+	/* lock->recursive flag also indicates if lock->writer_thread is
+	valid or stale. If we are the last of the recursive callers
+	then we must unset lock->recursive flag to indicate that the
+	lock->writer_thread is now stale.
+	Note that since we still hold the x-lock we can safely read the
+	lock_word. */
+	if (lock->lock_word == 0) {
+		/* Last caller in a possible recursive chain. */
+		lock->recursive = FALSE;
+		UNIV_MEM_INVALID(&lock->writer_thread,
+				 sizeof lock->writer_thread);
+	}
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+#endif
+
+	if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
+		/* Lock is now free. May have to signal read/write waiters.
+		We do not need to signal wait_ex waiters, since they cannot
+		exist when there is a writer. */
+		if (lock->waiters) {
+			rw_lock_reset_waiter_flag(lock);
+			os_event_set(lock->event);
+			sync_array_object_signalled(sync_primary_wait_array);
+		}
+	}
+
+	ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+	rw_x_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Releases an exclusive mode lock when we know there are no waiters, and
+no one else will access the lock during the time this function is executed. */
+UNIV_INLINE
+void
+rw_lock_x_unlock_direct(
+/*====================*/
+	rw_lock_t*	lock)	/*!< in/out: rw-lock */
+{
+	/* Reset the exclusive lock if this thread no longer has an x-mode
+	lock */
+
+	ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+#endif
+
+	if (lock->lock_word == 0) {
+		lock->recursive = FALSE;
+		UNIV_MEM_INVALID(&lock->writer_thread,
+				 sizeof lock->writer_thread);
+	}
+
+	lock->lock_word += X_LOCK_DECR;
+
+	ut_ad(!lock->waiters);
+	ut_ad(rw_lock_validate(lock));
+
+#ifdef UNIV_SYNC_PERF_STAT
+	rw_x_exit_count++;
+#endif
+}
diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h
new file mode 100644
index 00000000000..a500cf1da45
--- /dev/null
+++ b/storage/xtradb/include/sync0sync.h
@@ -0,0 +1,596 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.h
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0sync_h
+#define sync0sync_h
+
+#include "univ.i"
+#include "sync0types.h"
+#include "ut0lst.h"
+#include "ut0mem.h"
+#include "os0thread.h"
+#include "os0sync.h"
+#include "sync0arr.h"
+
+#if  defined(UNIV_DEBUG) && !defined(UNIV_HOTBACKUP)
+extern my_bool	timed_mutexes;
+#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+#ifdef HAVE_WINDOWS_ATOMICS
+typedef LONG lock_word_t;	/*!< On Windows, InterlockedExchange operates
+				on LONG variable */
+#else
+typedef byte lock_word_t;
+#endif
+
+/******************************************************************//**
+Initializes the synchronization data structures. */
+UNIV_INTERN
+void
+sync_init(void);
+/*===========*/
+/******************************************************************//**
+Frees the resources in synchronization data structures. */
+UNIV_INTERN
+void
+sync_close(void);
+/*===========*/
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+#  define mutex_create(M, level)					\
+	mutex_create_func((M), #M, (level), __FILE__, __LINE__)
+# else
+#  define mutex_create(M, level)					\
+	mutex_create_func((M), #M, __FILE__, __LINE__)
+# endif
+#else
+# define mutex_create(M, level)					\
+	mutex_create_func((M), #M, NULL, 0)
+#endif
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+	mutex_t*	mutex,		/*!< in: pointer to memory */
+	const char*	cmutex_name,	/*!< in: mutex name */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline);		/*!< in: file line where created */
+
+#undef mutex_free			/* Fix for MacOS X */
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free(
+/*=======*/
+	mutex_t*	mutex);	/*!< in: mutex */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter(M)	  mutex_enter_func((M), __FILE__, __LINE__)
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+/* NOTE! currently same as mutex_enter! */
+
+#define mutex_enter_fast(M)	mutex_enter_func((M), __FILE__, __LINE__)
+/******************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Locks a mutex for the current thread. If the mutex is reserved
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name,	/*!< in: file name where locked */
+	ulint		line);		/*!< in: line where locked */
+/**************************************************************//**
+NOTE! The following macro should be used in mutex locking, not the
+corresponding function. */
+
+#define mutex_enter_nowait(M)	\
+	mutex_enter_nowait_func((M), __FILE__, __LINE__)
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Tries to lock the mutex for the current thread. If the lock is not
+acquired immediately, returns with return value 1.
+@return	0 if succeed, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name,	/*!< in: file name where mutex
+					requested */
+	ulint		line);		/*!< in: line where requested */
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+	mutex_t*	mutex);	/*!< in: pointer to mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Returns TRUE if no mutex or rw-lock is currently locked.
+Works only in the debug version.
+@return	TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void);
+/*================*/
+#endif /* UNIV_SYNC_DEBUG */
+/*#####################################################################
+FUNCTION PROTOTYPES FOR DEBUGGING */
+/*******************************************************************//**
+Prints wait info of the sync system. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+	FILE*	file);		/*!< in: file where to print */
+/*******************************************************************//**
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+	FILE*	file);		/*!< in: file where to print */
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex has been initialized.
+@return	TRUE */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+	const mutex_t*	mutex);	/*!< in: mutex */
+/******************************************************************//**
+Checks that the current thread owns the mutex. Works only
+in the debug version.
+@return	TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+	const mutex_t*	mutex)	/*!< in: mutex */
+	__attribute__((warn_unused_result));
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Adds a latch and its level in the thread level array. Allocates the memory
+for the array if called first time for this OS thread. Makes the checks
+against other latch levels stored in the array for this thread. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+	void*	latch,	/*!< in: pointer to a mutex or an rw-lock */
+	ulint	level);	/*!< in: level in the latching order; if
+			SYNC_LEVEL_VARYING, nothing is done */
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array; it is no error if the latch is
+not found, as we presently are not able to determine the level for
+every latch reservation the program does */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+	void*	latch);	/*!< in: pointer to a mutex or an rw-lock */
+/******************************************************************//**
+Checks that the level array for the current thread is empty.
+@return	TRUE if empty */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty(void);
+/*==========================*/
+/******************************************************************//**
+Checks if the level array for the current thread contains a
+mutex or rw-latch at the specified level.
+@return	a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+	ulint	level);			/*!< in: latching order level
+					(SYNC_DICT, ...)*/
+/******************************************************************//**
+Checks if the level array for the current thread is empty.
+@return	a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+	ibool	dict_mutex_allowed);	/*!< in: TRUE if dictionary mutex is
+					allowed to be owned by the thread,
+					also purge_is_running mutex is
+					allowed */
+#define sync_thread_levels_empty_gen(d) (!sync_thread_levels_nonempty_gen(d))
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+	mutex_t*	mutex,		/*!< in: mutex */
+	const char**	file_name,	/*!< out: file where requested */
+	ulint*		line,		/*!< out: line where requested */
+	os_thread_id_t* thread_id);	/*!< out: id of the thread which owns
+					the mutex */
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return	number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void);
+/*==================*/
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the value
+of the lock word. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+	const mutex_t*	mutex);	/*!< in: mutex */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+NOT to be used outside this module except in debugging! Gets the waiters
+field in a mutex.
+@return	value of the waiters field */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+	const mutex_t*	mutex);	/*!< in: mutex */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*
+		LATCHING ORDER WITHIN THE DATABASE
+		==================================
+
+The mutex or latch in the central memory object, for instance, a rollback
+segment object, must be acquired before acquiring the latch or latches to
+the corresponding file data structure. In the latching order below, these
+file page object latches are placed immediately below the corresponding
+central memory object latch or mutex.
+
+Synchronization object			Notes
+----------------------			-----
+
+Dictionary mutex			If we have a pointer to a dictionary
+|					object, e.g., a table, it can be
+|					accessed without reserving the
+|					dictionary mutex. We must have a
+|					reservation, a memoryfix, to the
+|					appropriate table object in this case,
+|					and the table must be explicitly
+|					released later.
+V
+Dictionary header
+|
+V
+Secondary index tree latch		The tree latch protects also all
+|					the B-tree non-leaf pages. These
+V					can be read with the page only
+Secondary index non-leaf		bufferfixed to save CPU time,
+|					no s-latch is needed on the page.
+|					Modification of a page requires an
+|					x-latch on the page, however. If a
+|					thread owns an x-latch to the tree,
+|					it is allowed to latch non-leaf pages
+|					even after it has acquired the fsp
+|					latch.
+V
+Secondary index leaf			The latch on the secondary index leaf
+|					can be kept while accessing the
+|					clustered index, to save CPU time.
+V
+Clustered index tree latch		To increase concurrency, the tree
+|					latch is usually released when the
+|					leaf page latch has been acquired.
+V
+Clustered index non-leaf
+|
+V
+Clustered index leaf
+|
+V
+Transaction system header
+|
+V
+Transaction undo mutex			The undo log entry must be written
+|					before any index page is modified.
+|					Transaction undo mutex is for the undo
+|					logs the analogue of the tree latch
+|					for a B-tree. If a thread has the
+|					trx undo mutex reserved, it is allowed
+|					to latch the undo log pages in any
+|					order, and also after it has acquired
+|					the fsp latch.
+V
+Rollback segment mutex			The rollback segment mutex must be
+|					reserved, if, e.g., a new page must
+|					be added to an undo log. The rollback
+|					segment and the undo logs in its
+|					history list can be seen as an
+|					analogue of a B-tree, and the latches
+|					reserved similarly, using a version of
+|					lock-coupling. If an undo log must be
+|					extended by a page when inserting an
+|					undo log record, this corresponds to
+|					a pessimistic insert in a B-tree.
+V
+Rollback segment header
+|
+V
+Purge system latch
+|
+V
+Undo log pages				If a thread owns the trx undo mutex,
+|					or for a log in the history list, the
+|					rseg mutex, it is allowed to latch
+|					undo log pages in any order, and even
+|					after it has acquired the fsp latch.
+|					If a thread does not have the
+|					appropriate mutex, it is allowed to
+|					latch only a single undo log page in
+|					a mini-transaction.
+V
+File space management latch		If a mini-transaction must allocate
+|					several file pages, it can do that,
+|					because it keeps the x-latch to the
+|					file space management in its memo.
+V
+File system pages
+|
+V
+Kernel mutex				If a kernel operation needs a file
+|					page allocation, it must reserve the
+|					fsp x-latch before acquiring the kernel
+|					mutex.
+V
+Search system mutex
+|
+V
+Buffer pool mutex
+|
+V
+Log mutex
+|
+Any other latch
+|
+V
+Memory pool mutex */
+
+/* Latching order levels */
+
+/* User transaction locks are higher than any of the latch levels below:
+no latches are allowed when a thread goes to wait for a normal table
+or row lock! */
+#define SYNC_USER_TRX_LOCK	9999
+#define SYNC_NO_ORDER_CHECK	3000	/* this can be used to suppress
+					latching order checking */
+#define	SYNC_LEVEL_VARYING	2000	/* Level is varying. Only used with
+					buffer pool page locks, which do not
+					have a fixed level, but instead have
+					their level set after the page is
+					locked; see e.g.
+					ibuf_bitmap_get_map_page(). */
+#define SYNC_TRX_I_S_RWLOCK	1910	/* Used for
+					trx_i_s_cache_t::rw_lock */
+#define SYNC_TRX_I_S_LAST_READ	1900	/* Used for
+					trx_i_s_cache_t::last_read_mutex */
+#define SYNC_FILE_FORMAT_TAG	1200	/* Used to serialize access to the
+					file format tag */
+#define	SYNC_DICT_OPERATION	1001	/* table create, drop, etc. reserve
+					this in X-mode; implicit or background
+					operations purge, rollback, foreign
+					key checks reserve this in S-mode */
+#define SYNC_DICT		1000
+#define SYNC_DICT_AUTOINC_MUTEX	999
+#define SYNC_DICT_HEADER	995
+#define SYNC_IBUF_HEADER	914
+#define SYNC_IBUF_PESS_INSERT_MUTEX 912
+#define SYNC_IBUF_MUTEX		910	/* ibuf mutex is really below
+					SYNC_FSP_PAGE: we assign a value this
+					high only to make the program pass
+					the debug checks */
+/*-------------------------------*/
+#define	SYNC_INDEX_TREE		900
+#define SYNC_TREE_NODE_NEW	892
+#define SYNC_TREE_NODE_FROM_HASH 891
+#define SYNC_TREE_NODE		890
+#define	SYNC_PURGE_SYS		810
+#define	SYNC_PURGE_LATCH	800
+#define	SYNC_TRX_UNDO		700
+#define SYNC_RSEG		600
+#define SYNC_RSEG_HEADER_NEW	591
+#define SYNC_RSEG_HEADER	590
+#define SYNC_TRX_UNDO_PAGE	570
+#define SYNC_EXTERN_STORAGE	500
+#define	SYNC_FSP		400
+#define	SYNC_FSP_PAGE		395
+/*------------------------------------- Insert buffer headers */
+/*------------------------------------- ibuf_mutex */
+/*------------------------------------- Insert buffer tree */
+#define	SYNC_IBUF_BITMAP_MUTEX	351
+#define	SYNC_IBUF_BITMAP	350
+/*------------------------------------- MySQL query cache mutex */
+/*------------------------------------- MySQL binlog mutex */
+/*-------------------------------*/
+#define	SYNC_KERNEL		300
+#define SYNC_REC_LOCK		299
+#define	SYNC_TRX_LOCK_HEAP	298
+#define SYNC_TRX_SYS_HEADER	290
+#define SYNC_LOG		170
+#define SYNC_RECV		168
+#define	SYNC_WORK_QUEUE		162
+#define	SYNC_SEARCH_SYS_CONF	161	/* for assigning btr_search_enabled */
+#define	SYNC_SEARCH_SYS		160	/* NOTE that if we have a memory
+					heap that can be extended to the
+					buffer pool, its logical level is
+					SYNC_SEARCH_SYS, as memory allocation
+					can call routines there! Otherwise
+					the level is SYNC_MEM_HASH. */
+#define SYNC_BUF_LRU_LIST	157
+#define SYNC_BUF_PAGE_HASH	156
+#define	SYNC_BUF_BLOCK		155
+#define SYNC_BUF_FREE_LIST	153
+#define SYNC_BUF_ZIP_FREE	152
+#define SYNC_BUF_ZIP_HASH	151
+#define	SYNC_BUF_POOL		150
+#define SYNC_BUF_FLUSH_LIST	149
+#define SYNC_DOUBLEWRITE	140
+#define	SYNC_ANY_LATCH		135
+#define SYNC_THR_LOCAL		133
+#define	SYNC_MEM_HASH		131
+#define	SYNC_MEM_POOL		130
+
+/* Codes used to designate lock operations */
+#define RW_LOCK_NOT_LOCKED	350
+#define RW_LOCK_EX		351
+#define RW_LOCK_EXCLUSIVE	351
+#define RW_LOCK_SHARED		352
+#define RW_LOCK_WAIT_EX		353
+#define SYNC_MUTEX		354
+
+/* NOTE! The structure appears here only for the compiler to know its size.
+Do not use its fields directly! This is the structure used in the spin lock
+implementation of a mutual exclusion semaphore. */
+
+/** InnoDB mutex */
+struct mutex_struct {
+	os_event_t	event;	/*!< Used by sync0arr.c for the wait queue */
+	volatile lock_word_t	lock_word;	/*!< lock_word is the target
+				of the atomic test-and-set instruction when
+				atomic operations are enabled. */
+
+#if !defined(HAVE_ATOMIC_BUILTINS)
+	os_fast_mutex_t
+		os_fast_mutex;	/*!< This OS mutex is taken around updates of
+				lock_word when atomic operations are not enabled */
+#endif
+	volatile ulint	waiters;	/*!< This ulint is set to 1 if there are (or
+				may be) threads waiting in the global wait
+				array for this mutex to be released.
+				Otherwise, this is 0. */
+	UT_LIST_NODE_T(mutex_t)	list; /*!< All allocated mutexes are put into
+				a list. Pointers to the next and prev. */
+#ifdef UNIV_SYNC_DEBUG
+	const char*	file_name;	/*!< File where the mutex was locked */
+	ulint	line;		/*!< Line where the mutex was locked */
+	ulint	level;		/*!< Level in the global latching order */
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+	const char*	cfile_name;/*!< File name where mutex created */
+	ulint		cline;	/*!< Line where created */
+	os_thread_id_t thread_id; /*!< The thread id of the thread
+				which locked the mutex. */
+	ulint		magic_n;	/*!< MUTEX_MAGIC_N */
+/** Value of mutex_struct::magic_n */
+# define MUTEX_MAGIC_N	(ulint)979585
+#endif /* UNIV_DEBUG */
+	ulong		count_os_wait;	/*!< count of os_wait */
+#ifdef UNIV_DEBUG
+	ulong		count_using;	/*!< count of times mutex used */
+	ulong		count_spin_loop; /*!< count of spin loops */
+	ulong		count_spin_rounds;/*!< count of spin rounds */
+	ulong		count_os_yield;	/*!< count of os_yield */
+	ulonglong	lspent_time;	/*!< mutex os_wait timer msec */
+	ulonglong	lmax_spent_time;/*!< presumably max os_wait timer msec */
+	ulint		mutex_type;	/*!< 0=usual mutex, 1=rw_lock mutex */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name;	/*!< mutex name */
+};
+
+/** The global array of wait cells for implementation of the databases own
+mutexes and read-write locks. */
+extern sync_array_t*	sync_primary_wait_array;/* Appears here for
+						debugging purposes only! */
+
+/** Constant determining how long spin wait is continued before suspending
+the thread. A value of 600 rounds on a 1995 100 MHz Pentium seems to correspond
+to 20 microseconds. */
+
+#define	SYNC_SPIN_ROUNDS	srv_n_spin_wait_rounds
+
+/** The number of mutex_exit calls. Intended for performance monitoring. */
+extern	ib_int64_t	mutex_exit_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+extern ibool	sync_order_checks_on;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** This variable is set to TRUE when sync_init is called */
+extern ibool	sync_initialized;
+
+/** Type of the base node of the global list of database mutexes. */
+typedef UT_LIST_BASE_NODE_T(mutex_t)  ut_list_base_node_t;
+/** Global list of database mutexes (not OS mutexes) created. */
+extern ut_list_base_node_t  mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+extern mutex_t mutex_list_mutex;
+
+
+#ifndef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/sync0sync.ic b/storage/xtradb/include/sync0sync.ic
new file mode 100644
index 00000000000..b05020b5660
--- /dev/null
+++ b/storage/xtradb/include/sync0sync.ic
@@ -0,0 +1,222 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0sync.ic
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+	mutex_t*	mutex,	/*!< in: mutex */
+	ulint		n);	/*!< in: value to set */
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS) waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name,	/*!< in: file name where mutex
+					requested */
+	ulint		line);		/*!< in: line where requested */
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+	mutex_t*	mutex,		/*!< in: mutex */
+	const char*	file_name,	/*!< in: file where requested */
+	ulint		line);		/*!< in: line where requested */
+#endif /* UNIV_SYNC_DEBUG */
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+	mutex_t*	mutex);	/*!< in: mutex */
+
+/******************************************************************//**
+Performs an atomic test-and-set instruction to the lock_word field of a
+mutex.
+@return	the previous value of lock_word: 0 or 1 */
+UNIV_INLINE
+byte
+mutex_test_and_set(
+/*===============*/
+	mutex_t*	mutex)	/*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+	return(os_atomic_test_and_set_byte(&mutex->lock_word, 1));
+#else
+	ibool	ret;
+
+	ret = os_fast_mutex_trylock(&(mutex->os_fast_mutex));
+
+	if (ret == 0) {
+		/* trylock returned 0, i.e. we got the OS mutex: assert that
+		lock_word is still clear before marking the mutex as taken */
+		ut_a(mutex->lock_word == 0);
+
+		mutex->lock_word = 1;
+	}
+
+	return((byte)ret);
+#endif
+}
+
+/******************************************************************//**
+Performs a reset instruction to the lock_word field of a mutex. This
+instruction also serializes memory operations to the program order. */
+UNIV_INLINE
+void
+mutex_reset_lock_word(
+/*==================*/
+	mutex_t*	mutex)	/*!< in: mutex */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+	/* In theory __sync_lock_release should be used to release the lock.
+	Unfortunately, it does not work properly alone. The workaround is
+	to use the more conservative __sync_lock_test_and_set instead. */
+	os_atomic_test_and_set_byte(&mutex->lock_word, 0);
+#else
+	mutex->lock_word = 0;
+
+	os_fast_mutex_unlock(&(mutex->os_fast_mutex));
+#endif
+}
+
+/******************************************************************//**
+Gets the value of the lock word, i.e. returns mutex->lock_word as is. */
+UNIV_INLINE
+lock_word_t
+mutex_get_lock_word(
+/*================*/
+	const mutex_t*	mutex)	/*!< in: mutex */
+{
+	ut_ad(mutex);
+
+	return(mutex->lock_word);
+}
+
+/******************************************************************//**
+Gets the waiters field in a mutex.
+@return	value of the waiters field */
+UNIV_INLINE
+ulint
+mutex_get_waiters(
+/*==============*/
+	const mutex_t*	mutex)	/*!< in: mutex */
+{
+	const volatile ulint*	ptr;	/*!< declared volatile to ensure that
+					the value is read from memory */
+	ut_ad(mutex);
+
+	ptr = &(mutex->waiters);
+
+	return(*ptr);		/* Here we assume that the read of a single
+				word from memory is atomic */
+}
+
+/******************************************************************//**
+Unlocks a mutex owned by the current thread. */
+UNIV_INLINE
+void
+mutex_exit(
+/*=======*/
+	mutex_t*	mutex)	/*!< in: pointer to mutex */
+{
+	ut_ad(mutex_own(mutex));
+
+	ut_d(mutex->thread_id = (os_thread_id_t) ULINT_UNDEFINED);
+
+#ifdef UNIV_SYNC_DEBUG
+	sync_thread_reset_level(mutex);
+#endif
+	mutex_reset_lock_word(mutex);
+
+	/* A problem: we assume that mutex_reset_lock_word
+	is a memory barrier, that is when we read the waiters
+	field next, the read must be serialized in memory
+	after the reset. A speculative processor might
+	perform the read first, which could leave a waiting
+	thread hanging indefinitely.
+
+	Our current solution is to call
+	sync_arr_wake_threads_if_sema_free()
+	every second to wake up possibly hanging threads
+	that were missed in mutex_signal_object. */
+
+	if (mutex_get_waiters(mutex) != 0) {
+
+		mutex_signal_object(mutex);
+	}
+
+#ifdef UNIV_SYNC_PERF_STAT
+	mutex_exit_count++;
+#endif
+}
+
+/******************************************************************//**
+Locks a mutex for the current thread. If the mutex is reserved, the function
+spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting for the mutex
+before suspending the thread. */
+UNIV_INLINE
+void
+mutex_enter_func(
+/*=============*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name,	/*!< in: file name where locked */
+	ulint		line)		/*!< in: line where locked */
+{
+	ut_ad(mutex_validate(mutex));
+	ut_ad(!mutex_own(mutex));
+
+	/* Fast path: we go straight for the test_and_set without peeking at
+	lock_word first; we could peek, and possibly save time. */
+
+	ut_d(mutex->count_using++);
+
+	if (!mutex_test_and_set(mutex)) {
+		ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+		mutex_set_debug_info(mutex, file_name, line);
+#endif
+		return;	/* Succeeded: the previous lock_word was 0 */
+	}
+
+	mutex_spin_wait(mutex, file_name, line);
+}
diff --git a/storage/xtradb/include/sync0types.h b/storage/xtradb/include/sync0types.h
new file mode 100644
index 00000000000..1911bbac7fd
--- /dev/null
+++ b/storage/xtradb/include/sync0types.h
@@ -0,0 +1,34 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/sync0types.h
+Global types for sync
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#ifndef sync0types_h
+#define sync0types_h
+
+/** Rename mutex_t to avoid name space collision on some systems */
+#define mutex_t ib_mutex_t
+/** InnoDB mutex */
+typedef struct mutex_struct		mutex_t;
+
+#endif
diff --git a/storage/xtradb/include/thr0loc.h b/storage/xtradb/include/thr0loc.h
new file mode 100644
index 00000000000..293d1ebd57f
--- /dev/null
+++ b/storage/xtradb/include/thr0loc.h
@@ -0,0 +1,101 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.h
+The thread local storage
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+/* This module implements storage private to each thread,
+a capability useful in some situations like storing the
+OS handle to the current thread, or its priority. */
+
+#ifndef thr0loc_h
+#define thr0loc_h
+
+#include "univ.i"
+#include "os0thread.h"
+
+/****************************************************************//**
+Initializes the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_init(void);
+/*================*/
+/****************************************************************//**
+Close the thread local storage module. */
+UNIV_INTERN
+void
+thr_local_close(void);
+/*=================*/
+/*******************************************************************//**
+Creates a local storage struct for the calling new thread. */
+UNIV_INTERN
+void
+thr_local_create(void);
+/*==================*/
+/*******************************************************************//**
+Frees the local storage struct for the specified thread. */
+UNIV_INTERN
+void
+thr_local_free(
+/*===========*/
+	os_thread_id_t	id);	/*!< in: thread id */
+/*******************************************************************//**
+Gets the slot number in the thread table of a thread.
+@return	slot number */
+UNIV_INTERN
+ulint
+thr_local_get_slot_no(
+/*==================*/
+	os_thread_id_t	id);	/*!< in: thread id of the thread */
+/*******************************************************************//**
+Sets in the local storage the slot number in the thread table of a thread. */
+UNIV_INTERN
+void
+thr_local_set_slot_no(
+/*==================*/
+	os_thread_id_t	id,	/*!< in: thread id of the thread */
+	ulint		slot_no);/*!< in: slot number */
+/*******************************************************************//**
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage.
+@return	pointer to the in_ibuf field */
+UNIV_INTERN
+ibool*
+thr_local_get_in_ibuf_field(void);
+/*=============================*/
+
+/*************************************************************************
+Return local hash table information. */
+
+ulint
+thr_local_hash_cells(void);
+/*=======================*/
+
+ulint
+thr_local_hash_nodes(void);
+/*=======================*/
+
+#ifndef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/thr0loc.ic b/storage/xtradb/include/thr0loc.ic
new file mode 100644
index 00000000000..ce44e512320
--- /dev/null
+++ b/storage/xtradb/include/thr0loc.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/thr0loc.ic
+Thread local storage
+
+Created 10/4/1995 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/trx0i_s.h b/storage/xtradb/include/trx0i_s.h
new file mode 100644
index 00000000000..7bd4e1b88c8
--- /dev/null
+++ b/storage/xtradb/include/trx0i_s.h
@@ -0,0 +1,247 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0i_s.h
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables cache structures and public
+functions.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+#ifndef trx0i_s_h
+#define trx0i_s_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "ut0ut.h"
+
+/** The maximum amount of memory that can be consumed by innodb_trx,
+innodb_locks and innodb_lock_waits information schema tables. */
+#define TRX_I_S_MEM_LIMIT		16777216 /* 16 MiB */
+
+/** The maximum length of a string that can be stored in
+i_s_locks_row_t::lock_data */
+#define TRX_I_S_LOCK_DATA_MAX_LEN	8192
+
+/** The maximum length of a string that can be stored in
+i_s_trx_row_t::trx_query */
+#define TRX_I_S_TRX_QUERY_MAX_LEN	1024
+
+/** A row of INFORMATION_SCHEMA.innodb_locks */
+typedef struct i_s_locks_row_struct	i_s_locks_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_trx */
+typedef struct i_s_trx_row_struct i_s_trx_row_t;
+/** A row of INFORMATION_SCHEMA.innodb_lock_waits */
+typedef struct i_s_lock_waits_row_struct i_s_lock_waits_row_t;
+
+/** Objects of trx_i_s_cache_t::locks_hash */
+typedef struct i_s_hash_chain_struct	i_s_hash_chain_t;
+
+/** Objects of this type are added to the hash table
+trx_i_s_cache_t::locks_hash */
+struct i_s_hash_chain_struct {
+	i_s_locks_row_t*	value;	/*!< pointer to a row of
+					INFORMATION_SCHEMA.innodb_locks */
+	i_s_hash_chain_t*	next;	/*!< next item in the hash chain */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_locks row */
+struct i_s_locks_row_struct {
+	ullint		lock_trx_id;	/*!< transaction identifier */
+	const char*	lock_mode;	/*!< lock mode from
+					lock_get_mode_str() */
+	const char*	lock_type;	/*!< lock type from
+					lock_get_type_str() */
+	const char*	lock_table;	/*!< table name from
+					lock_get_table_name() */
+	const char*	lock_index;	/*!< index name from
+					lock_rec_get_index_name() */
+	/** Information for record locks.  All these are
+	ULINT_UNDEFINED for table locks. */
+	/* @{ */
+	ulint		lock_space;	/*!< tablespace identifier */
+	ulint		lock_page;	/*!< page number within the tablespace */
+	ulint		lock_rec;	/*!< heap number of the record
+					on the page */
+	const char*	lock_data;	/*!< (some) content of the record */
+	/* @} */
+
+	/** The following are auxiliary and not included in the table */
+	/* @{ */
+	ullint		lock_table_id;
+					/*!< table identifier from
+					lock_get_table_id() */
+	i_s_hash_chain_t hash_chain;	/*!< hash table chain node for
+					trx_i_s_cache_t::locks_hash */
+	/* @} */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_trx row */
+struct i_s_trx_row_struct {
+	ullint			trx_id;		/*!< transaction identifier */
+	const char*		trx_state;	/*!< transaction state from
+						trx_get_que_state_str() */
+	ib_time_t		trx_started;	/*!< trx_struct::start_time */
+	const i_s_locks_row_t*	requested_lock_row;
+						/*!< pointer to a row
+						in innodb_locks if trx
+						is waiting, or NULL */
+	ib_time_t		trx_wait_started;
+						/*!< trx_struct::wait_started */
+	ullint			trx_weight;	/*!< TRX_WEIGHT() */
+	ulint			trx_mysql_thread_id;
+						/*!< thd_get_thread_id() */
+	const char*		trx_query;	/*!< MySQL statement being
+						executed in the transaction */
+};
+
+/** This structure represents INFORMATION_SCHEMA.innodb_lock_waits row */
+struct i_s_lock_waits_row_struct {
+	const i_s_locks_row_t*	requested_lock_row;	/*!< requested lock */
+	const i_s_locks_row_t*	blocking_lock_row;	/*!< blocking lock */
+};
+
+/** Cache of INFORMATION_SCHEMA table data */
+typedef struct trx_i_s_cache_struct	trx_i_s_cache_t;
+
+/** Auxiliary enum used by functions that need to select one of the
+INFORMATION_SCHEMA tables */
+enum i_s_table {
+	I_S_INNODB_TRX,		/*!< INFORMATION_SCHEMA.innodb_trx */
+	I_S_INNODB_LOCKS,	/*!< INFORMATION_SCHEMA.innodb_locks */
+	I_S_INNODB_LOCK_WAITS	/*!< INFORMATION_SCHEMA.innodb_lock_waits */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+extern trx_i_s_cache_t*	trx_i_s_cache;
+
+/*******************************************************************//**
+Initialize INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< out: cache to init */
+/*******************************************************************//**
+Free the INFORMATION SCHEMA trx related cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache to free */
+
+/*******************************************************************//**
+Issue a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release a shared/read lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Issue an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/*******************************************************************//**
+Release an exclusive/write lock on the tables cache. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+
+/*******************************************************************//**
+Retrieves the number of used rows in the cache for a given
+INFORMATION SCHEMA table.
+@return	number of rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table);	/*!< in: which table */
+
+/*******************************************************************//**
+Retrieves the nth row in the cache for a given INFORMATION SCHEMA
+table.
+@return	row */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n);	/*!< in: row number */
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time.
+@return	0 - fetched, 1 - not */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache);	/*!< in/out: cache */
+
+/*******************************************************************//**
+Returns TRUE if the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return	TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache);	/*!< in: cache */
+
+/** The maximum length of a resulting lock_id in
+trx_i_s_create_lock_id(), not including the terminating NUL.
+":%lu:%lu:%lu" -> 63 chars */
+#define TRX_I_S_LOCK_ID_MAX_LEN	(TRX_ID_MAX_LEN + 63)
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return	resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	char*			lock_id,/*!< out: resulting lock_id */
+	ulint			lock_id_size);/*!< in: size of the lock id
+					buffer */
+
+#endif /* trx0i_s_h */
diff --git a/storage/xtradb/include/trx0purge.h b/storage/xtradb/include/trx0purge.h
new file mode 100644
index 00000000000..ae5bc6f90be
--- /dev/null
+++ b/storage/xtradb/include/trx0purge.h
@@ -0,0 +1,213 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.h
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0purge_h
+#define trx0purge_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "que0types.h"
+#include "page0page.h"
+#include "usr0sess.h"
+#include "fil0fil.h"
+
+/** The global data structure coordinating a purge */
+extern trx_purge_t*	purge_sys;
+
+/** A dummy undo record used as a return value when we have a whole undo log
+which needs no purge */
+extern trx_undo_rec_t	trx_purge_dummy_rec;
+
+/********************************************************************//**
+Calculates the file address of an undo log header when we have the file
+address of its history list node.
+@return	file address of the log */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+	fil_addr_t	node_addr);	/*!< in: file address of the history
+					list node of the log */
+/*****************************************************************//**
+Checks if trx_id is >= purge_view: then it is guaranteed that its update
+undo log still exists in the system.
+@return TRUE if it is sure that the undo log is preserved; note that even
+if the function returns FALSE, it is possible that the undo log still
+exists in the system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+	trx_id_t	trx_id);/*!< in: transaction id */
+/********************************************************************//**
+Creates the global purge system control structure and inits the history
+mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(void);
+/*======================*/
+/********************************************************************//**
+Frees the global purge system control structure. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void);
+/*======================*/
+/************************************************************************
+Adds the update undo log as the first log in the history list. Removes the
+update undo log segment from the rseg slot if it is too big for reuse. */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+	trx_t*	trx,		/*!< in: transaction */
+	page_t*	undo_page,	/*!< in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr);		/*!< in: mtr */
+/********************************************************************//**
+Fetches the next undo log record from the history list to purge. It must be
+released with the corresponding release function.
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
+	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
+				purge array */
+	mem_heap_t*	heap);	/*!< in: memory heap where copied */
+/*******************************************************************//**
+Releases a reserved purge undo record. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+	trx_undo_inf_t*	cell);	/*!< in: storage cell */
+/*******************************************************************//**
+This function runs a purge batch.
+@return	number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(void);
+/*===========*/
+/******************************************************************//**
+This function runs a purge worker batch. */
+UNIV_INTERN
+void
+trx_purge_worker(
+/*=============*/
+	ulint	worker_id);	/*!< in: id of the purge worker */
+/**********************************************************************
+This function waits for the event for the worker batch */
+UNIV_INTERN
+void
+trx_purge_worker_wait(void);
+/*========================*/
+/**********************************************************************
+This function wakes the waiting worker batch */
+UNIV_INTERN
+void
+trx_purge_worker_wake(void);
+/*========================*/
+/******************************************************************//**
+Prints information of the purge system to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void);
+/*======================*/
+
+/** The control structure used in the purge operation */
+struct trx_purge_struct{
+	ulint		state;		/*!< Purge system state */
+	sess_t*		sess;		/*!< System session running the purge
+					query */
+	trx_t*		trx;		/*!< System transaction running the purge
+					query: this trx is not in the trx list
+					of the trx system and it never ends */
+	que_t*		query;		/*!< The query graph which will do the
+					parallelized purge operation */
+	ulint		n_worker;	/*!< presumably the number of purge workers -- TODO confirm */
+	os_event_t	worker_event;	/*!< presumably the event purge workers wait on -- TODO confirm */
+	sess_t**	sess_arr;	/*!< presumably per-worker sessions -- TODO confirm */
+	trx_t**		trx_arr;	/*!< presumably per-worker transactions -- TODO confirm */
+	que_t**		query_arr;	/*!< presumably per-worker query graphs -- TODO confirm */
+	rw_lock_t	latch;		/*!< The latch protecting the purge view.
+					A purge operation must acquire an
+					x-latch here for the instant at which
+					it changes the purge view: an undo
+					log operation can prevent this by
+					obtaining an s-latch here. */
+	read_view_t*	view;		/*!< The purge will not remove undo logs
+					which are >= this view (purge view) */
+	mutex_t		mutex;		/*!< Mutex protecting the fields below */
+	ulint		n_pages_handled;/*!< Approximate number of undo log
+					pages processed in purge */
+	ulint		handle_limit;	/*!< Target of how many pages to get
+					processed in the current purge */
+	/*------------------------------*/
+	/* The following two fields form the 'purge pointer' which advances
+	during a purge, and which is used in history list truncation */
+
+	trx_id_t	purge_trx_no;	/*!< Purge has advanced past all
+					transactions whose number is less
+					than this */
+	undo_no_t	purge_undo_no;	/*!< Purge has advanced past all records
+					whose undo number is less than this */
+	/*-----------------------------*/
+	ibool		next_stored;	/*!< TRUE if the info of the next record
+					to purge is stored below: if yes, then
+					the transaction number and the undo
+					number of the record are stored in
+					purge_trx_no and purge_undo_no above */
+	trx_rseg_t*	rseg;		/*!< Rollback segment for the next undo
+					record to purge */
+	ulint		page_no;	/*!< Page number for the next undo
+					record to purge, page number of the
+					log header, if dummy record */
+	ulint		offset;		/*!< Page offset for the next undo
+					record to purge, 0 if the dummy
+					record */
+	ulint		hdr_page_no;	/*!< Header page of the undo log where
+					the next record to purge belongs */
+	ulint		hdr_offset;	/*!< Header byte offset on the page */
+	/*-----------------------------*/
+	trx_undo_arr_t*	arr;		/*!< Array of transaction numbers and
+					undo numbers of the undo records
+					currently under processing in purge */
+	mem_heap_t*	heap;		/*!< Temporary storage used during a
+					purge: can be emptied after purge
+					completes */
+};
+
+#define TRX_PURGE_ON		1	/* purge operation is running */
+#define TRX_STOP_PURGE		2	/* purge operation is stopped, or
+					it should be stopped */
+#ifndef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0purge.ic b/storage/xtradb/include/trx0purge.ic
new file mode 100644
index 00000000000..de09e393654
--- /dev/null
+++ b/storage/xtradb/include/trx0purge.ic
@@ -0,0 +1,43 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0purge.ic
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+/********************************************************************//**
+Converts the file address of a history list node into the file address of
+the undo log header: the node lies TRX_UNDO_HISTORY_NODE bytes past it.
+@return	file address of the undo log header */
+UNIV_INLINE
+fil_addr_t
+trx_purge_get_log_from_hist(
+/*========================*/
+	fil_addr_t	node_addr)	/*!< in: file address of the history
+					list node of the log */
+{
+	fil_addr_t	log_addr = node_addr;
+	log_addr.boffset -= TRX_UNDO_HISTORY_NODE; /* step back to header */
+	return(log_addr);
+}
+
diff --git a/storage/xtradb/include/trx0rec.h b/storage/xtradb/include/trx0rec.h
new file mode 100644
index 00000000000..a6e56e963c6
--- /dev/null
+++ b/storage/xtradb/include/trx0rec.h
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.h
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rec_h
+#define trx0rec_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "row0types.h"
+#include "mtr0mtr.h"
+#include "dict0types.h"
+#include "data0data.h"
+#include "rem0types.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "que0types.h"
+
+/***********************************************************************//**
+Copies the undo record to the heap.
+@return	own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	mem_heap_t*		heap);		/*!< in: heap where copied */
+/**********************************************************************//**
+Reads the undo log record type.
+@return	record type */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Reads from an undo log record the record compiler info.
+@return	compiler info */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Returns TRUE if an undo log record contains an extern storage field.
+@return	TRUE if extern */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Reads the undo log record number.
+@return	undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+	const trx_undo_rec_t*	undo_rec);	/*!< in: undo log record */
+/**********************************************************************//**
+Returns the start of the undo record data area.
+@return	offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+	undo_no_t	undo_no)	/*!< in: undo no read from node */
+	__attribute__((const));
+
+/**********************************************************************//**
+Returns the start of the undo record data area. */
+#define trx_undo_rec_get_ptr(undo_rec, undo_no)		\
+	((undo_rec) + trx_undo_rec_get_offset(undo_no))
+
+/**********************************************************************//**
+Reads from an undo log record the general parameters.
+@return	remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	ulint*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	ibool*		updated_extern,	/*!< out: TRUE if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	dulint*		table_id);	/*!< out: table id */
+/*******************************************************************//**
+Builds a row reference from an undo log record.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	byte*		ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+				needed is allocated */
+/*******************************************************************//**
+Skips a row reference from an undo log record.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, at the start of the row reference */
+	dict_index_t*	index);	/*!< in: clustered index */
+/**********************************************************************//**
+Reads from an undo log update record the system field values of the old
+version.
+@return	remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	byte*		ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	ulint*		info_bits);	/*!< out: info bits state */
+/*******************************************************************//**
+Builds an update vector based on a remaining part of an undo log record.
+@return remaining part of the record, NULL if an error detected, which
+means that the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	ulint		info_bits,/*!< in: info bits from this undo record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd);	/*!< out, own: update vector */
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record of a suitable type, at the start of
+				the stored index columns;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the partial row is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	row,	/*!< out, own: partial row */
+	ibool		ignore_prefix, /*!< in: flag to indicate if we
+				expect blob prefixes in undo. Used
+				only in the assertion. */
+	mem_heap_t*	heap);	/*!< in: memory heap from which the memory
+				needed is allocated */
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+	ulint		flags,		/*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+					set, does nothing */
+	ulint		op_type,	/*!< in: TRX_UNDO_INSERT_OP or
+					TRX_UNDO_MODIFY_OP */
+	que_thr_t*	thr,		/*!< in: query thread */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
+					index entry to insert into the
+					clustered index, otherwise NULL */
+	const upd_t*	update,		/*!< in: in the case of an update,
+					the update vector, otherwise NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	const rec_t*	rec,		/*!< in: case of an update or delete
+					marking, the record in the clustered
+					index, otherwise NULL */
+	roll_ptr_t*	roll_ptr);	/*!< out: rollback pointer to the
+					inserted undo log record,
+					ut_dulint_zero if BTR_NO_UNDO_LOG
+					flag was specified */
+/******************************************************************//**
+Copies an undo record to heap. This function can be called if we know that
+the undo log record exists.
+@return	own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
+	mem_heap_t*	heap);		/*!< in: memory heap where copied */
+/******************************************************************//**
+Copies an undo record to heap.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
+	trx_id_t	trx_id,		/*!< in: id of the trx that generated
+					the roll pointer: it points to an
+					undo log of this transaction */
+	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
+	mem_heap_t*	heap);		/*!< in: memory heap where copied */
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+	const rec_t*	index_rec,/*!< in: clustered index record in the
+				index tree */
+	mtr_t*		index_mtr,/*!< in: mtr which contains the latch to
+				index_rec page and purge_view */
+	const rec_t*	rec,	/*!< in: version of a clustered index record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	rec_t**		old_vers);/*!< out, own: previous version, or NULL if
+				rec is the first inserted version, or if
+				history data has been deleted */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses a redo log record of adding an undo log record.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page);	/*!< in: page or NULL */
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr);	/*!< in: mtr or NULL */
+
+#ifndef UNIV_HOTBACKUP
+
+/* Types of an undo log record: these have to be smaller than 16, as the
+compilation info multiplied by 16 is ORed to this value in an undo log
+record */
+
+#define	TRX_UNDO_INSERT_REC	11	/* fresh insert into clustered index */
+#define	TRX_UNDO_UPD_EXIST_REC	12	/* update of a non-delete-marked
+					record */
+#define	TRX_UNDO_UPD_DEL_REC	13	/* update of a delete marked record to
+					a not delete marked record; also the
+					fields of the record can change */
+#define	TRX_UNDO_DEL_MARK_REC	14	/* delete marking of a record; fields
+					do not change */
+#define	TRX_UNDO_CMPL_INFO_MULT	16	/* compilation info is multiplied by
+					this and ORed to the type above */
+#define	TRX_UNDO_UPD_EXTERN	128	/* This bit can be ORed to type_cmpl
+					to denote that we updated external
+					storage fields: used by purge to
+					free the external storage */
+
+/* Operation type flags used in trx_undo_report_row_operation */
+#define	TRX_UNDO_INSERT_OP		1
+#define	TRX_UNDO_MODIFY_OP		2
+
+#ifndef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#endif /* !UNIV_HOTBACKUP */
+
+#endif /* trx0rec_h */
diff --git a/storage/xtradb/include/trx0rec.ic b/storage/xtradb/include/trx0rec.ic
new file mode 100644
index 00000000000..e7e41d6d9f6
--- /dev/null
+++ b/storage/xtradb/include/trx0rec.ic
@@ -0,0 +1,112 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rec.ic
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Reads the record type from the byte at offset 2 of an undo log record.
+@return	record type: that byte masked with TRX_UNDO_CMPL_INFO_MULT - 1 */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_type(
+/*==================*/
+	const trx_undo_rec_t*	undo_rec)	/*!< in: undo log record */
+{
+	return(mach_read_from_1(undo_rec + 2) & (TRX_UNDO_CMPL_INFO_MULT - 1));
+}
+
+/**********************************************************************//**
+Reads the compiler info: the byte at offset 2 / TRX_UNDO_CMPL_INFO_MULT.
+@return	compiler info; NOTE(review): TRX_UNDO_UPD_EXTERN bit not masked */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_cmpl_info(
+/*=======================*/
+	const trx_undo_rec_t*	undo_rec)	/*!< in: undo log record */
+{
+	return(mach_read_from_1(undo_rec + 2) / TRX_UNDO_CMPL_INFO_MULT);
+}
+
+/**********************************************************************//**
+Tests the TRX_UNDO_UPD_EXTERN bit in the byte at offset 2 of the record.
+@return	TRUE if the bit is set, FALSE otherwise */
+UNIV_INLINE
+ibool
+trx_undo_rec_get_extern_storage(
+/*============================*/
+	const trx_undo_rec_t*	undo_rec)	/*!< in: undo log record */
+{
+	/* the flag shares its byte with the record type and compiler info */
+	ulint	type_cmpl = mach_read_from_1(undo_rec + 2);
+	if (!(type_cmpl & TRX_UNDO_UPD_EXTERN)) {
+		return(FALSE);
+	}
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Reads the much-compressed undo number stored at offset 3 of the record.
+@return	undo no */
+UNIV_INLINE
+undo_no_t
+trx_undo_rec_get_undo_no(
+/*=====================*/
+	const trx_undo_rec_t*	undo_rec)	/*!< in: undo log record */
+{
+	/* The undo number is read in much-compressed form, starting right
+	after the 3 leading bytes of the record. */
+	const byte*	undo_no_ptr = undo_rec + 3;
+
+	return(mach_dulint_read_much_compressed(undo_no_ptr));
+}
+
+/**********************************************************************//**
+Returns the start of the data area: 3 bytes + much-compressed undo_no size.
+@return	offset to the data area */
+UNIV_INLINE
+ulint
+trx_undo_rec_get_offset(
+/*====================*/
+	undo_no_t	undo_no)	/*!< in: undo no read from node */
+{
+	return (3 + mach_dulint_get_much_compressed_size(undo_no));
+}
+
+/***********************************************************************//**
+Duplicates an undo log record into the given memory heap.
+@return	own: copy of undo log record */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_rec_copy(
+/*==============*/
+	const trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	mem_heap_t*		heap)		/*!< in: heap where copied */
+{
+	/* The 2 bytes at undo_rec are read as a page offset; its distance
+	from the record's own page offset is the number of bytes copied. */
+	ulint	end_offs = mach_read_from_2(undo_rec);
+	ulint	start_offs = ut_align_offset(undo_rec, UNIV_PAGE_SIZE);
+	return(mem_heap_dup(heap, undo_rec, end_offs - start_offs));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0roll.h b/storage/xtradb/include/trx0roll.h
new file mode 100644
index 00000000000..1dee5655c8c
--- /dev/null
+++ b/storage/xtradb/include/trx0roll.h
@@ -0,0 +1,352 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.h
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0roll_h
+#define trx0roll_h
+
+#include "univ.i"
+#include "trx0trx.h"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+
+#define trx_roll_free_all_savepoints(s) trx_roll_savepoints_free((s), NULL)
+
+/*******************************************************************//**
+Determines if this transaction is rolling back an incomplete transaction
+in crash recovery.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+	const trx_t*	trx);	/*!< in: transaction */
+/*******************************************************************//**
+Returns a transaction savepoint taken at this point in time.
+@return	savepoint */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+	trx_t*	trx);	/*!< in: transaction */
+/*******************************************************************//**
+Creates an undo number array. */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void);
+/*=====================*/
+/*******************************************************************//**
+Frees an undo number array. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+	trx_undo_arr_t*	arr);	/*!< in: undo number array */
+/*******************************************************************//**
+Returns pointer to nth element in an undo number array.
+@return	pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+	trx_undo_arr_t*	arr,	/*!< in: undo number array */
+	ulint		n);	/*!< in: position */
+/***********************************************************************//**
+Tries to truncate the undo logs. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+	trx_t*	trx);	/*!< in/out: transaction */
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+	trx_t*		trx,	/*!< in: transaction */
+	undo_no_t	limit,	/*!< in: least undo number we need */
+	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
+	mem_heap_t*	heap);	/*!< in: memory heap where copied */
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread gets the undo log record not using the pop
+function above.
+@return	TRUE if succeeded */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	undo_no_t	undo_no);/*!< in: undo number of the record */
+/*******************************************************************//**
+Releases a reserved undo record. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	undo_no_t	undo_no);/*!< in: undo number */
+/*********************************************************************//**
+Starts a rollback operation. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
+	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery.  If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+	ibool	all);	/*!< in: FALSE=roll back dictionary transactions;
+			TRUE=roll back all non-PREPARED transactions */
+/*******************************************************************//**
+Rollback or clean up any incomplete transactions which were
+encountered in crash recovery.  If the transaction already was
+committed, then we clean up a possible insert undo log. If the
+transaction was not yet committed, then we roll it back.
+Note: this is done in a background thread.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+	void*	arg __attribute__((unused)));
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+/****************************************************************//**
+Finishes a transaction rollback. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+	que_t*		graph,	/*!< in: undo graph which can now be freed */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t**	next_thr);/*!< in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread; if this parameter is
+				NULL, it is ignored */
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction. The actual rollback is
+performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this
+graph.
+@return	own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/*********************************************************************//**
+Creates a rollback command node struct.
+@return	own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+/*******************************************************************//**
+Rollback a transaction used in MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+	trx_t*	trx);	/*!< in: transaction handle */
+/*******************************************************************//**
+Rollback the latest SQL statement for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+	trx_t*	trx);	/*!< in: transaction handle */
+/*******************************************************************//**
+Rollback a transaction used in MySQL, either completely or to a savepoint.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+	trx_t*		trx,	/*!< in: transaction handle */
+	trx_savept_t*	savept);/*!< in: pointer to savepoint undo number, if
+				partial rollback requested, or NULL for
+				complete rollback */
+/*******************************************************************//**
+Rolls a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a new inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	ib_int64_t*	mysql_binlog_cache_pos);/*!< out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one. Savepoints are deleted in a
+transaction commit or rollback.
+@return	always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	ib_int64_t	binlog_cache_pos);	/*!< in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+
+/*******************************************************************//**
+Releases a named savepoint. Savepoints which
+were set after this savepoint are deleted.
+@return if no savepoint of the name found then DB_NO_SAVEPOINT,
+otherwise DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name);	/*!< in: savepoint name */
+
+/*******************************************************************//**
+Frees a single savepoint struct. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+	trx_t*			trx,	/*!< in: transaction handle */
+	trx_named_savept_t*	savep);	/*!< in: savepoint to free */
+
+/*******************************************************************//**
+Frees savepoint structs starting from savep, if savep == NULL then
+free all savepoints. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+	trx_t*			trx,	/*!< in: transaction handle */
+	trx_named_savept_t*	savep);	/*!< in: free all savepoints > this one;
+					if this is NULL, free all savepoints
+					of trx */
+
+/** A cell of trx_undo_arr_struct; used during a rollback and a purge.
+Cells are stored in trx_undo_arr_struct::infos and are reached through
+trx_undo_arr_get_nth_info(). */
+struct	trx_undo_inf_struct{
+	trx_id_t	trx_no;	/*!< transaction number: not defined during
+				a rollback */
+	undo_no_t	undo_no;/*!< undo number of an undo record */
+	ibool		in_use;	/*!< TRUE if the cell is in use */
+};
+
+/** During a rollback and a purge, undo numbers of undo records currently being
+processed are stored in this array. The array is created with
+trx_undo_arr_create() and freed with trx_undo_arr_free(). */
+
+struct trx_undo_arr_struct{
+	ulint		n_cells;	/*!< number of cells in the array;
+					trx_undo_arr_get_nth_info() asserts
+					that the requested position is below
+					this value */
+	ulint		n_used;		/*!< number of cells currently in use */
+	trx_undo_inf_t*	infos;		/*!< the array of undo infos */
+	mem_heap_t*	heap;		/*!< memory heap from which allocated */
+};
+
+/** Rollback node states; this is the type of roll_node_struct::state */
+enum roll_node_state {
+	ROLL_NODE_SEND = 1,	/*!< about to send a rollback signal to
+				the transaction */
+	ROLL_NODE_WAIT		/*!< rollback signal sent to the transaction,
+				waiting for completion (no explicit value:
+				it is ROLL_NODE_SEND + 1, i.e. 2) */
+};
+
+/** Rollback command node in a query graph. A node is created with
+roll_node_create() and an execution step for it is performed by
+trx_rollback_step(). */
+struct roll_node_struct{
+	que_common_t		common;	/*!< node type: QUE_NODE_ROLLBACK */
+	enum roll_node_state	state;	/*!< node execution state */
+	ibool			partial;/*!< TRUE if we want a partial
+					rollback */
+	trx_savept_t		savept;	/*!< savepoint to which to
+					roll back, in the case of a
+					partial rollback */
+};
+
+/** A savepoint set with SQL's "SAVEPOINT savepoint_id" command. A single
+savepoint struct is freed with trx_roll_savepoint_free(); a run of them
+(or all of them) with trx_roll_savepoints_free(). */
+struct trx_named_savept_struct{
+	char*		name;		/*!< savepoint name */
+	trx_savept_t	savept;		/*!< the undo number corresponding to
+					the savepoint */
+	ib_int64_t	mysql_binlog_cache_pos;
+					/*!< the MySQL binlog cache position
+					corresponding to this savepoint, not
+					defined if the MySQL binlogging is not
+					enabled */
+	UT_LIST_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< the list of savepoints of a
+					transaction */
+};
+
+#ifndef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0roll.ic b/storage/xtradb/include/trx0roll.ic
new file mode 100644
index 00000000000..3460832b18c
--- /dev/null
+++ b/storage/xtradb/include/trx0roll.ic
@@ -0,0 +1,40 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0roll.ic
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*******************************************************************//**
+Looks up a cell of an undo number array by position. The array pointer
+and the position are checked with debug assertions only.
+@return	pointer to the nth element */
+UNIV_INLINE
+trx_undo_inf_t*
+trx_undo_arr_get_nth_info(
+/*======================*/
+	trx_undo_arr_t*	arr,	/*!< in: undo number array */
+	ulint		n)	/*!< in: position */
+{
+	ut_ad(arr);
+	ut_ad(n < arr->n_cells);
+
+	return(&arr->infos[n]);
+}
diff --git a/storage/xtradb/include/trx0rseg.h b/storage/xtradb/include/trx0rseg.h
new file mode 100644
index 00000000000..303188f09f2
--- /dev/null
+++ b/storage/xtradb/include/trx0rseg.h
@@ -0,0 +1,223 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.h
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0rseg_h
+#define trx0rseg_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "trx0sys.h"
+
+/******************************************************************//**
+Gets a rollback segment header.
+@return	rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number of the header */
+	mtr_t*	mtr);		/*!< in: mtr */
+/******************************************************************//**
+Gets a newly created rollback segment header.
+@return	rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number of the header */
+	mtr_t*	mtr);		/*!< in: mtr */
+/***************************************************************//**
+Gets the file page number of the nth undo log slot.
+@return	page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	ulint		n,	/*!< in: index of slot */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***************************************************************//**
+Sets the file page number of the nth undo log slot. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	ulint		n,	/*!< in: index of slot */
+	ulint		page_no,/*!< in: page number of the undo log segment */
+	mtr_t*		mtr);	/*!< in: mtr */
+/****************************************************************//**
+Looks for a free slot for an undo log segment.
+@return	slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	mtr_t*		mtr);	/*!< in: mtr */
+/******************************************************************//**
+Looks for a rollback segment, based on the rollback segment id.
+@return	rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+	ulint	id);	/*!< in: rollback segment id */
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database.
+@return	page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	max_size,	/*!< in: max size in pages */
+	ulint*	slot_no,	/*!< out: rseg id == slot number in trx sys */
+	mtr_t*	mtr);		/*!< in: mtr */
+/*********************************************************************//**
+Creates the memory copies for rollback segments and initializes the
+rseg list and array in trx_sys at a database startup. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+	mtr_t*		mtr);		/*!< in: mtr */
+/****************************************************************//**
+Creates a new rollback segment to the database.
+@return	the created segment object, NULL if fail */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+	ulint	space,		/*!< in: space id */
+	ulint	max_size,	/*!< in: max size in pages */
+	ulint*	id,		/*!< out: rseg id */
+	mtr_t*	mtr);		/*!< in: mtr */
+/***************************************************************************
+Frees an instance of the rollback segment in memory. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+	trx_rseg_t*	rseg);		/* in, own: instance to free */
+
+
+/* The real maximum number of slots is normally 4076, but 4 slots are kept in
+reserve for safety. */
+#define TRX_RSEG_N_EXTRA_SLOTS	(((UNIV_PAGE_SIZE - (FIL_PAGE_DATA + FIL_PAGE_DATA_END + TRX_RSEG_UNDO_SLOTS)) / TRX_RSEG_SLOT_SIZE) - 4)
+
+/* Number of undo log slots in a rollback segment file copy */
+#define TRX_RSEG_N_SLOTS	(srv_extra_undoslots ? TRX_RSEG_N_EXTRA_SLOTS : (UNIV_PAGE_SIZE / 16))
+
+/* Maximum number of transactions supported by a single rollback segment */
+#define TRX_RSEG_MAX_N_TRXS	(TRX_RSEG_N_SLOTS / 2)
+
+/** The rollback segment memory object */
+struct trx_rseg_struct{
+	/*--------------------------------------------------------*/
+	ulint		id;	/*!< rollback segment id == the index of
+				its slot in the trx system file copy */
+	mutex_t		mutex;	/*!< mutex protecting the fields in this
+				struct except id; NOTE that the latching
+				order must always be kernel mutex ->
+				rseg mutex */
+	ulint		space;	/*!< space where the rollback segment
+				header is placed */
+	ulint		zip_size;/*!< compressed page size of space
+				in bytes, or 0 for uncompressed spaces */
+	ulint		page_no;/*!< page number of the rollback segment
+				header */
+	ulint		max_size;/*!< maximum allowed size in pages */
+	ulint		curr_size;/*!< current size in pages */
+	/*--------------------------------------------------------*/
+	/* Fields for update undo logs */
+	UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_list;
+					/*!< List of update undo logs */
+	UT_LIST_BASE_NODE_T(trx_undo_t) update_undo_cached;
+					/*!< List of update undo log segments
+					cached for fast reuse */
+	/*--------------------------------------------------------*/
+	/* Fields for insert undo logs */
+	UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_list;
+					/*!< List of insert undo logs */
+	UT_LIST_BASE_NODE_T(trx_undo_t) insert_undo_cached;
+					/*!< List of insert undo log segments
+					cached for fast reuse */
+	/*--------------------------------------------------------*/
+	ulint		last_page_no;	/*!< Page number of the last not yet
+					purged log header in the history list;
+					FIL_NULL if all list purged */
+	ulint		last_offset;	/*!< Byte offset of the last not yet
+					purged log header */
+	trx_id_t	last_trx_no;	/*!< Transaction number of the last not
+					yet purged log */
+	ibool		last_del_marks;	/*!< TRUE if the last not yet purged log
+					needs purging */
+	/*--------------------------------------------------------*/
+	UT_LIST_NODE_T(trx_rseg_t) rseg_list;
+					/*!< the list of the rollback segment
+					memory objects */
+};
+
+/* Undo log segment slot in a rollback segment header */
+/*-------------------------------------------------------------*/
+#define	TRX_RSEG_SLOT_PAGE_NO	0	/* Page number of the header page of
+					an undo log segment */
+/*-------------------------------------------------------------*/
+/* Slot size */
+#define TRX_RSEG_SLOT_SIZE	4
+
+/* The offset of the rollback segment header on its page */
+#define	TRX_RSEG		FSEG_PAGE_DATA
+
+/* Transaction rollback segment header */
+/*-------------------------------------------------------------*/
+#define	TRX_RSEG_MAX_SIZE	0	/* Maximum allowed size for rollback
+					segment in pages */
+#define	TRX_RSEG_HISTORY_SIZE	4	/* Number of file pages occupied
+					by the logs in the history list */
+#define	TRX_RSEG_HISTORY	8	/* The update undo logs for committed
+					transactions */
+#define	TRX_RSEG_FSEG_HEADER	(8 + FLST_BASE_NODE_SIZE)
+					/* Header for the file segment where
+					this page is placed */
+#define TRX_RSEG_UNDO_SLOTS	(8 + FLST_BASE_NODE_SIZE + FSEG_HEADER_SIZE)
+					/* Undo log segment slots */
+/*-------------------------------------------------------------*/
+
+#ifndef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0rseg.ic b/storage/xtradb/include/trx0rseg.ic
new file mode 100644
index 00000000000..daffa92fc7d
--- /dev/null
+++ b/storage/xtradb/include/trx0rseg.ic
@@ -0,0 +1,145 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0rseg.ic
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0srv.h"
+#include "mtr0log.h"
+
+/******************************************************************//**
+Fetches the page that holds a rollback segment header with an x-latch
+and returns the address of the header inside that page.
+@return	rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get(
+/*==========*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number of the header */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	buf_block_t*	hdr_block;
+
+	hdr_block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(hdr_block, SYNC_RSEG_HEADER);
+
+	/* The header starts TRX_RSEG bytes into the page frame. */
+	return(buf_block_get_frame(hdr_block) + TRX_RSEG);
+}
+
+/******************************************************************//**
+Fetches the page that holds a newly created rollback segment header
+with an x-latch and returns the address of the header inside that page.
+Identical to trx_rsegf_get() except that the block is registered at the
+SYNC_RSEG_HEADER_NEW level.
+@return	rollback segment header, page x-latched */
+UNIV_INLINE
+trx_rsegf_t*
+trx_rsegf_get_new(
+/*==============*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number of the header */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	buf_block_t*	hdr_block;
+
+	hdr_block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(hdr_block, SYNC_RSEG_HEADER_NEW);
+
+	/* The header starts TRX_RSEG bytes into the page frame. */
+	return(buf_block_get_frame(hdr_block) + TRX_RSEG);
+}
+
+/***************************************************************//**
+Reads the 4-byte file page number stored in the nth undo log slot of a
+rollback segment header. A slot index at or beyond TRX_RSEG_N_SLOTS is
+reported on stderr and then hits ut_error.
+@return	page number of the undo log segment */
+UNIV_INLINE
+ulint
+trx_rsegf_get_nth_undo(
+/*===================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	ulint		n,	/*!< in: index of slot */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_rsegf_t*	slot;
+
+	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to get slot %lu of rseg\n",
+			(ulong) n);
+		ut_error;
+	}
+
+	/* Slots form an array of TRX_RSEG_SLOT_SIZE-byte entries that
+	begins TRX_RSEG_UNDO_SLOTS bytes after the header start. */
+	slot = rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE;
+
+	return(mtr_read_ulint(slot, MLOG_4BYTES, mtr));
+}
+
+/***************************************************************//**
+Writes a 4-byte file page number into the nth undo log slot of a
+rollback segment header through mlog_write_ulint(). A slot index at or
+beyond TRX_RSEG_N_SLOTS is reported on stderr and then hits ut_error. */
+UNIV_INLINE
+void
+trx_rsegf_set_nth_undo(
+/*===================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	ulint		n,	/*!< in: index of slot */
+	ulint		page_no,/*!< in: page number of the undo log segment */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_rsegf_t*	slot;
+
+	if (UNIV_UNLIKELY(n >= TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to set slot %lu of rseg\n",
+			(ulong) n);
+		ut_error;
+	}
+
+	/* Slots form an array of TRX_RSEG_SLOT_SIZE-byte entries that
+	begins TRX_RSEG_UNDO_SLOTS bytes after the header start. */
+	slot = rsegf + TRX_RSEG_UNDO_SLOTS + n * TRX_RSEG_SLOT_SIZE;
+
+	mlog_write_ulint(slot, page_no, MLOG_4BYTES, mtr);
+}
+
+/****************************************************************//**
+Scans the undo log slots of a rollback segment header in ascending
+order and stops at the first one whose page number is FIL_NULL.
+@return	slot index or ULINT_UNDEFINED if not found */
+UNIV_INLINE
+ulint
+trx_rsegf_undo_find_free(
+/*=====================*/
+	trx_rsegf_t*	rsegf,	/*!< in: rollback segment header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		slot_no = 0;
+
+	while (slot_no < TRX_RSEG_N_SLOTS) {
+
+		if (trx_rsegf_get_nth_undo(rsegf, slot_no, mtr)
+		    == FIL_NULL) {
+
+			return(slot_no);
+		}
+
+		slot_no++;
+	}
+
+	return(ULINT_UNDEFINED);
+}
diff --git a/storage/xtradb/include/trx0sys.h b/storage/xtradb/include/trx0sys.h
new file mode 100644
index 00000000000..9ef9485b611
--- /dev/null
+++ b/storage/xtradb/include/trx0sys.h
@@ -0,0 +1,664 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.h
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0sys_h
+#define trx0sys_h
+
+#include "univ.i"
+
+#include "trx0types.h"
+#include "fsp0types.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#ifndef UNIV_HOTBACKUP
+#include "mtr0mtr.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "sync0sync.h"
+#include "ut0lst.h"
+#include "read0types.h"
+#include "page0types.h"
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+extern char		trx_sys_mysql_master_log_name[];
+/** Master binlog file position.  We have successfully got the updates
+up to this position.  -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+extern ib_int64_t	trx_sys_mysql_master_log_pos;
+/* @} */
+
+extern char		trx_sys_mysql_relay_log_name[];
+extern ib_int64_t	trx_sys_mysql_relay_log_pos;
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+extern char		trx_sys_mysql_bin_log_name[];
+/** Binlog file position, or -1 if unknown */
+extern ib_int64_t	trx_sys_mysql_bin_log_pos;
+/* @} */
+
+/** The transaction system */
+extern trx_sys_t*	trx_sys;
+
+/** Doublewrite system */
+extern trx_doublewrite_t*	trx_doublewrite;
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+extern ibool			trx_doublewrite_must_reset_space_ids;
+/** Set to TRUE when the doublewrite buffer is being created */
+extern ibool			trx_doublewrite_buf_is_being_created;
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+extern ibool			trx_sys_multiple_tablespace_format;
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void);
+/*================================*/
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+	ibool	restore_corrupt_pages);	/*!< in: TRUE=restore pages */
+/****************************************************************//**
+Marks the trx sys header when we have successfully upgraded to the >= 4.1.x
+multiple tablespace format. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void);
+/*===============================================*/
+/****************************************************************//**
+Determines if a page number is located inside the doublewrite buffer.
+@return TRUE if the location is inside the two blocks of the
+doublewrite buffer */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+	ulint	page_no);	/*!< in: page number */
+/***************************************************************//**
+Checks if a page address is the trx sys header page.
+@return	TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no);/*!< in: page number */
+/***************************************************************//**
+Checks if a space is a system tablespace.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+	ulint	space);	/*!< in: space */
+/***************************************************************//**
+Checks if a space is the doublewrite tablespace.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+	ulint	space);	/*!< in: space */
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void);
+/*==========================*/
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void);
+/*================*/
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for a tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+	ulint	space);	/*!< in: space id of the tablespace */
+/*****************************************************************//**
+Creates extra rollback segments when create_new_db. */
+UNIV_INTERN
+void
+trx_sys_create_extra_rseg(
+/*======================*/
+	ulint	num);	/*!< in: number of extra user rollback segments */
+/****************************************************************//**
+Looks for a free slot for a rollback segment in the trx system file copy.
+@return	slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+	mtr_t*		mtr);		/*!< in: mtr */
+/***************************************************************//**
+Gets the pointer in the nth slot of the rseg array.
+@return	pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+	trx_sys_t*	sys,	/*!< in: trx system */
+	ulint		n);	/*!< in: index of slot */
+/***************************************************************//**
+Sets the pointer in the nth slot of the rseg array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+	trx_sys_t*	sys,	/*!< in: trx system */
+	ulint		n,	/*!< in: index of slot */
+	trx_rseg_t*	rseg);	/*!< in: pointer to rseg object, NULL if slot
+				not in use */
+/**********************************************************************//**
+Gets a pointer to the transaction system file copy and x-locks its page.
+@return	pointer to system file copy, page x-locked */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+	mtr_t*	mtr);	/*!< in: mtr */
+/*****************************************************************//**
+Gets the space of the nth rollback segment slot in the trx system
+file copy.
+@return	space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
+	ulint		i,		/*!< in: slot index == rseg id */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*****************************************************************//**
+Gets the page number of the nth rollback segment slot in the trx system
+file copy.
+@return	page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
+	ulint		i,		/*!< in: slot index == rseg id */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*****************************************************************//**
+Sets the space id of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
+	ulint		i,		/*!< in: slot index == rseg id */
+	ulint		space,		/*!< in: space id */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*****************************************************************//**
+Sets the page number of the nth rollback segment slot in the trx system
+file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
+	ulint		i,		/*!< in: slot index == rseg id */
+	ulint		page_no,	/*!< in: page number, FIL_NULL if
+					the slot is reset to unused */
+	mtr_t*		mtr);		/*!< in: mtr */
+/*****************************************************************//**
+Allocates a new transaction id.
+@return	new, allocated trx id */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void);
+/*========================*/
+/*****************************************************************//**
+Allocates a new transaction number.
+@return	new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void);
+/*========================*/
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Writes a trx id to an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+	byte*		ptr,	/*!< in: pointer to memory where written */
+	trx_id_t	id);	/*!< in: id */
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page. In case that the id size changes in
+some future version, this function should be used instead of
+mach_read_...
+@return	id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+	const byte*	ptr);	/*!< in: pointer to memory from where to read */
+/****************************************************************//**
+Looks for the trx handle with the given id in trx_list.
+@return	the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+	trx_id_t	trx_id);/*!< in: trx id to search for */
+/****************************************************************//**
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void);
+/*=========================*/
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return	TRUE if active */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+	trx_id_t	trx_id);/*!< in: trx id of the transaction */
+/****************************************************************//**
+Checks that trx is in the trx list.
+@return	TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+	trx_t*	in_trx);/*!< in: trx */
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave, updates the latest master binlog position up to which
+replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	trx_sysf_t*	sys_header,/*!< in: trx sys header */
+	const char*	file_name_in,/*!< in: MySQL log file name */
+	ib_int64_t	offset,	/*!< in: position in that log file */
+	ulint		field,	/*!< in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr);	/*!< in: mtr */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void);
+/*===================================*/
+/*****************************************************************//**
+Prints to stderr the MySQL master log offset info in the trx system header if
+the magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void);
+/*====================================*/
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void);
+/*==========================*/
+/*****************************************************************//**
+Closes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void);
+/*===========================*/
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void);
+/*==============================*/
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Shutdown/Close the transaction system. */
+UNIV_INTERN
+void
+trx_sys_close(void);
+/*===============*/
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id);		/*!< in: id of the file format */
+/*****************************************************************//**
+Set the file format id unconditionally except if it's already the
+same value.
+@return	TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+	ulint		format_id,	/*!< in: file format id */
+	const char**	name);		/*!< out: max file format name or
+					NULL if not needed. */
+/*****************************************************************//**
+Get the name representation of the max file format.
+@return	pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void);
+/*=============================*/
+/*****************************************************************//**
+Check for the max file format tag stored on disk.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+	ulint		max_format_id);	/*!< in: the max format id to check */
+/********************************************************************//**
+Update the file format tag in the system tablespace only if the given
+format id is greater than the known max id.
+@return	TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+	const char**	name,		/*!< out: max file format name */
+	ulint		format_id);	/*!< in: file format identifier */
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog info in the system header if the
+magic number shows it valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+	const byte*	page);	/*!< in: buffer containing the trx
+				system header page, i.e., page number
+				TRX_SYS_PAGE_NO in the tablespace */
+/*****************************************************************//**
+Reads the file format id from the first system table space file.
+Even if the call succeeds and returns TRUE, the returned format id
+may be ULINT_UNDEFINED signalling that the format id was not present
+in the data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+	const char *pathname,	/*!< in: pathname of the first system
+				table space file */
+	ulint *format_id);	/*!< out: file format of the system table
+				space */
+/*****************************************************************//**
+Reads the file format id from the given per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+	const char *pathname,	/*!< in: pathname of a per-table
+				datafile */
+	ulint *format_id);	/*!< out: file format of the per-table
+				data file */
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id);	/*!< in: id of the file format */
+
+#endif /* !UNIV_HOTBACKUP */
+/* The automatically created system rollback segment has this id */
+#define TRX_SYS_SYSTEM_RSEG_ID	0
+
+/* Space id and page no where the trx system file copy resides */
+#define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
+#define	TRX_DOUBLEWRITE_SPACE	1	/* the doublewrite buffer tablespace if used */
+#define	TRX_SYS_SPACE_MAX	9	/* reserved max space id for system tablespaces */
+#include "fsp0fsp.h"
+#define	TRX_SYS_PAGE_NO	FSP_TRX_SYS_PAGE_NO
+
+/* The offset of the transaction system header on the page */
+#define	TRX_SYS		FSEG_PAGE_DATA
+
+/** Transaction system header */
+/*------------------------------------------------------------- @{ */
+#define	TRX_SYS_TRX_ID_STORE	0	/*!< the maximum trx id or trx
+					number modulo
+					TRX_SYS_TRX_ID_WRITE_MARGIN
+					written to a file page by any
+					transaction; the assignment of
+					transaction ids continues from
+					this number rounded up by
+					TRX_SYS_TRX_ID_WRITE_MARGIN
+					plus
+					TRX_SYS_TRX_ID_WRITE_MARGIN
+					when the database is
+					started */
+#define TRX_SYS_FSEG_HEADER	8	/*!< segment header for the
+					tablespace segment the trx
+					system is created into */
+#define	TRX_SYS_RSEGS		(8 + FSEG_HEADER_SIZE)
+					/*!< the start of the array of
+					rollback segment specification
+					slots */
+/*------------------------------------------------------------- @} */
+
+/** Maximum number of rollback segments: the number of segment
+specification slots in the transaction system array; rollback segment
+id must fit in one byte, therefore 256; each slot is currently 8 bytes
+in size */
+#define	TRX_SYS_N_RSEGS		256
+
+/** Maximum length of MySQL binlog file name, in bytes.
+@see trx_sys_mysql_master_log_name
+@see trx_sys_mysql_bin_log_name */
+#define TRX_SYS_MYSQL_LOG_NAME_LEN	512
+#define TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN	480	/* upper limit is (500 - 12): the log info areas are 500 bytes apart and the name starts at offset 12 */
+/** Contents of TRX_SYS_MYSQL_LOG_MAGIC_N_FLD */
+#define TRX_SYS_MYSQL_LOG_MAGIC_N	873422344
+
+//#if UNIV_PAGE_SIZE < 4096
+//# error "UNIV_PAGE_SIZE < 4096"
+//#endif
+/** The offset of the MySQL replication info in the trx system header;
+this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */
+#define TRX_SYS_MYSQL_MASTER_LOG_INFO	(UNIV_PAGE_SIZE - 2000)
+#define TRX_SYS_MYSQL_RELAY_LOG_INFO	(UNIV_PAGE_SIZE - 1500)
+
+/** The offset of the MySQL binlog offset info in the trx system header */
+#define TRX_SYS_MYSQL_LOG_INFO		(UNIV_PAGE_SIZE - 1000)
+#define	TRX_SYS_MYSQL_LOG_MAGIC_N_FLD	0	/*!< magic number which is
+						TRX_SYS_MYSQL_LOG_MAGIC_N
+						if we have valid data in the
+						MySQL binlog info */
+#define TRX_SYS_MYSQL_LOG_OFFSET_HIGH	4	/*!< high 4 bytes of the offset
+						within that file */
+#define TRX_SYS_MYSQL_LOG_OFFSET_LOW	8	/*!< low 4 bytes of the offset
+						within that file */
+#define TRX_SYS_MYSQL_LOG_NAME		12	/*!< MySQL log file name */
+
+/** Doublewrite buffer */
+/* @{ */
+/** The offset of the doublewrite buffer header on the trx system header page */
+#define TRX_SYS_DOUBLEWRITE		(UNIV_PAGE_SIZE - 200)
+/*-------------------------------------------------------------*/
+#define TRX_SYS_DOUBLEWRITE_FSEG	0	/*!< fseg header of the fseg
+						containing the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_MAGIC	FSEG_HEADER_SIZE
+						/*!< 4-byte magic number which
+						shows if we already have
+						created the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK1	(4 + FSEG_HEADER_SIZE)
+						/*!< page number of the
+						first page in the first
+						sequence of 64
+						(= FSP_EXTENT_SIZE) consecutive
+						pages in the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_BLOCK2	(8 + FSEG_HEADER_SIZE)
+						/*!< page number of the
+						first page in the second
+						sequence of 64 consecutive
+						pages in the doublewrite
+						buffer */
+#define TRX_SYS_DOUBLEWRITE_REPEAT	12	/*!< we repeat
+						TRX_SYS_DOUBLEWRITE_MAGIC,
+						TRX_SYS_DOUBLEWRITE_BLOCK1,
+						TRX_SYS_DOUBLEWRITE_BLOCK2
+						so that if the trx sys
+						header is half-written
+						to disk, we still may
+						be able to recover the
+						information */
+/** If this is not yet set to TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+we must reset the doublewrite buffer, because starting from 4.1.x the
+space id of a data page is stored into
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO. */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE)
+
+/*-------------------------------------------------------------*/
+/** Contents of TRX_SYS_DOUBLEWRITE_MAGIC */
+#define TRX_SYS_DOUBLEWRITE_MAGIC_N	536853855
+/** Contents of TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED */
+#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386
+
+/** Size of the doublewrite block in pages */
+#define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE	FSP_EXTENT_SIZE
+/* @} */
+
+#ifndef UNIV_HOTBACKUP
+/** File format tag */
+/* @{ */
+/** The offset of the file format tag on the trx system header page
+(TRX_SYS_PAGE_NO of TRX_SYS_SPACE) */
+#define TRX_SYS_FILE_FORMAT_TAG		(UNIV_PAGE_SIZE - 16)
+
+/** Contents of TRX_SYS_FILE_FORMAT_TAG when valid.  The file format
+identifier is added to this constant. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW	3645922177UL
+/** Contents of TRX_SYS_FILE_FORMAT_TAG+4 when valid */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH	2745987765UL
+/* @} */
+
+/** Doublewrite control struct: bookkeeping for the doublewrite buffer */
+struct trx_doublewrite_struct{
+	mutex_t	mutex;		/*!< mutex protecting the first_free field and
+				write_buf */
+	ulint	block1;		/*!< page number of the first doublewrite
+				block; TRX_SYS_DOUBLEWRITE_BLOCK_SIZE pages */
+	ulint	block2;		/*!< page number of the second block */
+	ulint	first_free;	/*!< first free position in write_buf measured
+				in units of UNIV_PAGE_SIZE */
+	byte*	write_buf;	/*!< write buffer used in writing to the
+				doublewrite buffer, aligned to an
+				address divisible by UNIV_PAGE_SIZE
+				(which is required by Windows aio) */
+	byte*	write_buf_unaligned;
+				/*!< pointer to write_buf, but unaligned */
+	buf_page_t**
+		buf_block_arr;	/*!< array to store pointers to the buffer
+				blocks which have been cached to write_buf */
+};
+
+/** The transaction system central memory data structure; protected by the
+kernel mutex */
+struct trx_sys_struct{
+	trx_id_t	max_trx_id;	/*!< The smallest number not yet
+					assigned as a transaction id or
+					transaction number */
+	UT_LIST_BASE_NODE_T(trx_t) trx_list;
+					/*!< List of active and committed in
+					memory transactions, sorted on trx id,
+					biggest first */
+	UT_LIST_BASE_NODE_T(trx_t) mysql_trx_list;
+					/*!< List of transactions created
+					for MySQL */
+	UT_LIST_BASE_NODE_T(trx_rseg_t) rseg_list;
+					/*!< List of rollback segment
+					objects */
+	trx_rseg_t*	latest_rseg;	/*!< Latest rollback segment in the
+					round-robin assignment of rollback
+					segments to transactions */
+	trx_rseg_t*	rseg_array[TRX_SYS_N_RSEGS];
+					/*!< Rollback segment pointers, NULL if
+					unused; see trx_sys_get_nth_rseg() */
+	ulint		rseg_history_len;/*!< Length of the TRX_RSEG_HISTORY
+					list (update undo logs for committed
+					transactions), protected by
+					rseg->mutex */
+	UT_LIST_BASE_NODE_T(read_view_t) view_list;
+					/*!< List of read views sorted
+					on trx no, biggest first */
+};
+
+/** When a trx id which is zero modulo this number (which must be a power of
+two) is assigned, the field TRX_SYS_TRX_ID_STORE on the transaction system
+page is updated */
+#define TRX_SYS_TRX_ID_WRITE_MARGIN	256
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0sys.ic b/storage/xtradb/include/trx0sys.ic
new file mode 100644
index 00000000000..c7b09d4aec2
--- /dev/null
+++ b/storage/xtradb/include/trx0sys.ic
@@ -0,0 +1,421 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0sys.ic
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+#include "data0type.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "mtr0log.h"
+
+/* The typedef for rseg slot in the file copy */
+typedef byte	trx_sysf_rseg_t;
+
+/* Rollback segment specification slot offsets */
+/*-------------------------------------------------------------*/
+#define	TRX_SYS_RSEG_SPACE	0	/* space where the segment
+					header is placed; starting with
+					MySQL/InnoDB 5.1.7, this is
+					UNIV_UNDEFINED if the slot is unused */
+#define	TRX_SYS_RSEG_PAGE_NO	4	/*  page number where the segment
+					header is placed; this is FIL_NULL
+					if the slot is unused */
+/*-------------------------------------------------------------*/
+/* Size of a rollback segment specification slot */
+#define TRX_SYS_RSEG_SLOT_SIZE	8
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void);
+/*==========================*/
+
+/***************************************************************//**
+Tests whether (space, page_no) addresses the trx sys header page.
+@return	TRUE if trx sys header page */
+UNIV_INLINE
+ibool
+trx_sys_hdr_page(
+/*=============*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	/* A mismatch in either coordinate rules the page out. */
+	if ((space != TRX_SYS_SPACE) || (page_no != TRX_SYS_PAGE_NO)) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Checks if a space id belongs to a system tablespace.
+@return TRUE if system tablespace */
+UNIV_INLINE
+ibool
+trx_sys_sys_space(
+/*==============*/
+	ulint	space)	/*!< in: space */
+{
+	if (!srv_doublewrite_file) {
+		/* only TRX_SYS_SPACE itself counts */
+		return((ibool)(space == TRX_SYS_SPACE));
+	}
+	/* ids up to TRX_SYS_SPACE_MAX are reserved for system tablespaces */
+	return((ibool)(space <= TRX_SYS_SPACE_MAX));
+}
+
+/***************************************************************//**
+Checks if a space id is the one that holds the doublewrite buffer.
+@return TRUE if doublewrite tablespace */
+UNIV_INLINE
+ibool
+trx_sys_doublewrite_space(
+/*======================*/
+	ulint	space)	/*!< in: space */
+{
+	/* when srv_doublewrite_file is set, the doublewrite buffer is
+	separated from the system tablespace */
+	ulint	dblwr_space = srv_doublewrite_file
+		? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE;
+
+	return((ibool)(space == dblwr_space));
+}
+
+/***************************************************************//**
+Fetches the rseg pointer stored in slot n of sys->rseg_array.
+@return	pointer to rseg object, NULL if slot not in use */
+UNIV_INLINE
+trx_rseg_t*
+trx_sys_get_nth_rseg(
+/*=================*/
+	trx_sys_t*	sys,	/*!< in: trx system */
+	ulint		n)	/*!< in: index of slot */
+{
+	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(n < TRX_SYS_N_RSEGS);
+	/* asserted above: caller owns kernel_mutex, n is within range */
+	return(*(sys->rseg_array + n));
+}
+
+/***************************************************************//**
+Stores an rseg pointer into slot n of sys->rseg_array. */
+UNIV_INLINE
+void
+trx_sys_set_nth_rseg(
+/*=================*/
+	trx_sys_t*	sys,	/*!< in: trx system */
+	ulint		n,	/*!< in: index of slot */
+	trx_rseg_t*	rseg)	/*!< in: pointer to rseg object, NULL if slot
+				not in use */
+{
+	ut_ad(n < TRX_SYS_N_RSEGS);
+	/* NULL marks the slot as not in use */
+	*(sys->rseg_array + n) = rseg;
+}
+
+/**********************************************************************//**
+Returns the transaction system header, x-latching the page that holds it.
+@return	pointer to system header, page x-latched. */
+UNIV_INLINE
+trx_sysf_t*
+trx_sysf_get(
+/*=========*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	sys_block;
+
+	ut_ad(mtr);
+
+	/* Fetch page TRX_SYS_PAGE_NO of TRX_SYS_SPACE with RW_X_LATCH
+	under the given mtr. */
+	sys_block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+				 RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(sys_block, SYNC_TRX_SYS_HEADER);
+
+	/* The header lies TRX_SYS bytes into the page frame. */
+	return(TRX_SYS + buf_block_get_frame(sys_block));
+}
+
+/*****************************************************************//**
+Reads the space id field of the ith rollback segment slot in the trx
+system file copy.
+@return	space id */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_space(
+/*====================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys header */
+	ulint		i,		/*!< in: slot index == rseg id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	trx_sysf_t*	slot;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(sys_header);
+	ut_ad(i < TRX_SYS_N_RSEGS);
+	slot = sys_header + TRX_SYS_RSEGS + i * TRX_SYS_RSEG_SLOT_SIZE;
+	return(mtr_read_ulint(slot + TRX_SYS_RSEG_SPACE, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Reads the page number field of the ith rollback segment slot in the trx
+system header.
+@return	page number, FIL_NULL if slot unused */
+UNIV_INLINE
+ulint
+trx_sysf_rseg_get_page_no(
+/*======================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+	ulint		i,		/*!< in: slot index == rseg id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	trx_sysf_t*	slot;
+
+	ut_ad(sys_header);
+	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(i < TRX_SYS_N_RSEGS);
+	slot = sys_header + TRX_SYS_RSEGS + i * TRX_SYS_RSEG_SLOT_SIZE;
+	return(mtr_read_ulint(slot + TRX_SYS_RSEG_PAGE_NO, MLOG_4BYTES, mtr));
+}
+
+/*****************************************************************//**
+Writes the space id field of the ith rollback segment slot in the trx
+system file copy. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_space(
+/*====================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys file copy */
+	ulint		i,		/*!< in: slot index == rseg id */
+	ulint		space,		/*!< in: space id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	trx_sysf_t*	slot;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(sys_header);
+	ut_ad(i < TRX_SYS_N_RSEGS);
+
+	/* locate slot i within the rseg slot array of the header */
+	slot = sys_header + TRX_SYS_RSEGS + i * TRX_SYS_RSEG_SLOT_SIZE;
+	mlog_write_ulint(slot + TRX_SYS_RSEG_SPACE, space, MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Writes the page number field of the i'th rollback segment slot in the trx
+system header through mlog_write_ulint() as an MLOG_4BYTES value. */
+UNIV_INLINE
+void
+trx_sysf_rseg_set_page_no(
+/*======================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx sys header */
+	ulint		i,		/*!< in: slot index == rseg id */
+	ulint		page_no,	/*!< in: page number, FIL_NULL if the
+					slot is reset to unused */
+	mtr_t*		mtr)		/*!< in: mtr passed to the write */
+{
+	ut_ad(mutex_own(&(kernel_mutex)));
+	ut_ad(sys_header);
+	ut_ad(i < TRX_SYS_N_RSEGS);
+	/* Field address: slot array + i * slot size + page no field offset */
+	mlog_write_ulint(sys_header + TRX_SYS_RSEGS
+			 + i * TRX_SYS_RSEG_SLOT_SIZE
+			 + TRX_SYS_RSEG_PAGE_NO,
+			 page_no,
+			 MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a trx id to an index page using mach_write_to_6(). If the id size
+changes in some future version, callers should still go through this
+function instead of calling mach_write_... directly. */
+UNIV_INLINE
+void
+trx_write_trx_id(
+/*=============*/
+	byte*		ptr,	/*!< in: pointer to memory where written */
+	trx_id_t	id)	/*!< in: id */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif /* compile-time guard: the write below is hard-coded to 6 */
+	mach_write_to_6(ptr, id);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Reads a trx id from an index page using mach_read_from_6(). If the id size
+changes in some future version, callers should still go through this
+function instead of calling mach_read_... directly.
+@return	id */
+UNIV_INLINE
+trx_id_t
+trx_read_trx_id(
+/*============*/
+	const byte*	ptr)	/*!< in: pointer to memory from where to read */
+{
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif /* compile-time guard: the read below is hard-coded to 6 */
+	return(mach_read_from_6(ptr));
+}
+
+/****************************************************************//**
+Looks for the trx handle with the given id by scanning trx_sys->trx_list.
+@return	the trx handle or NULL if not found */
+UNIV_INLINE
+trx_t*
+trx_get_on_id(
+/*==========*/
+	trx_id_t	trx_id)	/*!< in: trx id to search for */
+{
+	trx_t*	trx;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+	/* Walk the list from its first element, comparing ids */
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	while (trx != NULL) {
+		if (0 == ut_dulint_cmp(trx_id, trx->id)) {
+
+			return(trx);
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+	/* Reached the end of the list without a match */
+	return(NULL);
+}
+
+/****************************************************************//**
+Returns the minimum trx id in trx list. This is the smallest id for which
+the trx can possibly be active. (But, you must look at the trx->conc_state to
+find out if the minimum trx id transaction itself is active, or already
+committed.)
+@return	the minimum trx id, or trx_sys->max_trx_id if the trx list is empty */
+UNIV_INLINE
+trx_id_t
+trx_list_get_min_trx_id(void)
+/*=========================*/
+{
+	trx_t*	trx;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+	/* The last element of trx_list supplies the minimum id */
+	trx = UT_LIST_GET_LAST(trx_sys->trx_list);
+
+	if (trx == NULL) {
+		/* Empty list */
+		return(trx_sys->max_trx_id);
+	}
+
+	return(trx->id);
+}
+
+/****************************************************************//**
+Checks if a transaction with the given id is active.
+@return	TRUE if active or prepared, or if trx_id >= trx_sys->max_trx_id */
+UNIV_INLINE
+ibool
+trx_is_active(
+/*==========*/
+	trx_id_t	trx_id)	/*!< in: trx id of the transaction */
+{
+	trx_t*	trx;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+	/* An id below the minimum id in trx_list is reported as not active */
+	if (ut_dulint_cmp(trx_id, trx_list_get_min_trx_id()) < 0) {
+
+		return(FALSE);
+	}
+
+	if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) {
+
+		/* There must be corruption: we return TRUE because this
+		function is only called by lock_clust_rec_some_has_impl()
+		and row_vers_impl_x_locked_off_kernel() and they have
+		diagnostic prints in this case */
+
+		return(TRUE);
+	}
+	/* Otherwise the state of the trx found in trx_list decides */
+	trx = trx_get_on_id(trx_id);
+	if (trx && (trx->conc_state == TRX_ACTIVE
+		    || trx->conc_state == TRX_PREPARED)) {
+
+		return(TRUE);
+	}
+	/* Not found in trx_list, or found in some other state */
+	return(FALSE);
+}
+
+/*****************************************************************//**
+Allocates a new transaction id and increments trx_sys->max_trx_id.
+@return	new trx id == max_trx_id before the increment */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_id(void)
+/*========================*/
+{
+	trx_id_t	id;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* VERY important: after the database is started, max_trx_id value is
+	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if
+	will evaluate to TRUE when this function is called for the first
+	time, and the value for trx id will be written to disk-based header!
+	Thus trx id values will not overlap when the database is
+	repeatedly started! */
+
+	if (ut_dulint_get_low(trx_sys->max_trx_id)
+	    % TRX_SYS_TRX_ID_WRITE_MARGIN == 0) {
+
+		trx_sys_flush_max_trx_id();
+	}
+	/* Hand out the current value, then advance the counter */
+	id = trx_sys->max_trx_id;
+
+	UT_DULINT_INC(trx_sys->max_trx_id);
+
+	return(id);
+}
+
+/*****************************************************************//**
+Allocates a new transaction number from the same counter as trx ids.
+@return	new, allocated trx number */
+UNIV_INLINE
+trx_id_t
+trx_sys_get_new_trx_no(void)
+/*========================*/
+{
+	ut_ad(mutex_own(&kernel_mutex));
+	/* Delegates to the trx id allocator: both share one sequence */
+	return(trx_sys_get_new_trx_id());
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
new file mode 100644
index 00000000000..4c0ce392bcd
--- /dev/null
+++ b/storage/xtradb/include/trx0trx.h
@@ -0,0 +1,849 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.h
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0trx_h
+#define trx0trx_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "dict0types.h"
+#ifndef UNIV_HOTBACKUP
+#include "lock0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "mem0mem.h"
+#include "read0types.h"
+#include "trx0xa.h"
+#include "ut0vec.h"
+
+/** Dummy session used currently in MySQL interface */
+extern sess_t*	trx_dummy_sess;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex */
+extern ulint	trx_n_mysql_transactions;
+
+/********************************************************************//**
+Releases the search latch if trx has reserved it. */
+UNIV_INTERN
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+	trx_t*	   trx); /*!< in: transaction */
+/******************************************************************//**
+Set detailed error message for the transaction. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg);	/*!< in: detailed error message */
+/*************************************************************//**
+Set detailed error message for the transaction from a file. Note that the
+file is rewound before reading from it. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file);	/*!< in: file to read message from */
+/****************************************************************//**
+Retrieves the error_info field from a trx.
+@return	the error info */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+	const trx_t*	trx);	/*!< in: trx object */
+/****************************************************************//**
+Creates and initializes a transaction object.
+@return	own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+	sess_t*	sess)	/*!< in: session */
+	__attribute__((nonnull));
+/********************************************************************//**
+Creates a transaction object for MySQL.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void);
+/*========================*/
+/********************************************************************//**
+Creates a transaction object for background operations by the master thread.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void);
+/*=============================*/
+/********************************************************************//**
+Frees a transaction object. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+	trx_t*	trx);	/*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object for MySQL. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+	trx_t*	trx);	/*!< in, own: trx object */
+/********************************************************************//**
+Frees a transaction object of a background operation of the master thread. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+	trx_t*	trx);	/*!< in, own: trx object */
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void);
+/*============================*/
+/****************************************************************//**
+Starts a new transaction.
+@return TRUE if success, FALSE if the rollback segment could not
+support this many transactions */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+	trx_t*	trx,	/*!< in: transaction */
+	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+/****************************************************************//**
+Starts a new transaction.
+@return	TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+	trx_t*	trx,	/*!< in: transaction */
+	ulint	rseg_id);/*!< in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+/*************************************************************//**
+Starts the transaction if it is not yet started. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+	trx_t*	trx);	/*!< in: transaction */
+/*************************************************************//**
+Starts the transaction if it is not yet started. Assumes we have reserved
+the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx);	/*!< in: transaction */
+/****************************************************************//**
+Commits a transaction. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+	trx_t*	trx);	/*!< in: transaction */
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx);	/*!< in: transaction */
+/**********************************************************************//**
+Does the transaction commit for MySQL.
+@return	DB_SUCCESS or error number */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/**********************************************************************//**
+Does the transaction prepare for MySQL.
+@return	0 or error number */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery.
+@return	number of prepared transactions */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+	XID*	xid_list,	/*!< in/out: prepared transactions */
+	ulint	len);		/*!< in: number of slots in xid_list */
+/*******************************************************************//**
+This function is used to find one X/Open XA distributed transaction
+which is in the prepared state
+@return	trx or NULL */
+UNIV_INTERN
+trx_t *
+trx_get_trx_by_xid(
+/*===============*/
+	XID*	xid);	/*!< in: X/Open XA transaction identification */
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+@return	0 or error number */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/**********************************************************************//**
+Marks the latest SQL statement ended. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+	trx_t*	trx);	/*!< in: trx handle */
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction.
+@return	consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+	trx_t*	trx);	/*!< in: active transaction */
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+	trx_t*	trx);	/*!< in: transaction */
+/****************************************************************//**
+Sends a signal to a trx object. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+	trx_t*		trx,		/*!< in: trx handle */
+	ulint		type,		/*!< in: signal type */
+	ulint		sender,		/*!< in: TRX_SIG_SELF or
+					TRX_SIG_OTHER_SESS */
+	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
+					reply, or NULL; if type is
+					TRX_SIG_END_WAIT, this must be NULL */
+	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
+					NULL */
+	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread; if the parameter
+					is NULL, it is ignored */
+/****************************************************************//**
+Send the reply message when a signal in the queue of the trx has
+been handled. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+	trx_sig_t*	sig,		/*!< in: signal */
+	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+/****************************************************************//**
+Removes the signal object from a trx signal queue. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+	trx_t*		trx,	/*!< in: trx handle */
+	trx_sig_t*	sig);	/*!< in, own: signal */
+/****************************************************************//**
+Starts handling of a trx signal. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+	trx_t*		trx,		/*!< in: trx handle */
+	que_thr_t**	next_thr);	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+/****************************************************************//**
+Ends signal handling. If the session is in the error state, and
+trx->graph_before_signal_handling != NULL, returns control to the error
+handling routine of the graph (currently only returns the control to the
+graph root which then sends an error message to the client). */
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+	trx_t*	trx);	/*!< in: trx */
+/*********************************************************************//**
+Creates a commit command node struct.
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+	mem_heap_t*	heap);	/*!< in: mem heap where created */
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+	que_thr_t*	thr);	/*!< in: query thread */
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+	FILE*	f,		/*!< in: output stream */
+	trx_t*	trx,		/*!< in: transaction */
+	ulint	max_query_len);	/*!< in: max query length to print, or 0 to
+				   use the default max length */
+
+/** Type of data dictionary operation (see trx_struct::dict_operation) */
+typedef enum trx_dict_op {
+	/** The transaction is not modifying the data dictionary. */
+	TRX_DICT_OP_NONE = 0,
+	/** The transaction is creating a table or an index, or
+	dropping a table.  The table must be dropped in crash
+	recovery.  This and TRX_DICT_OP_NONE are the only possible
+	operation modes in crash recovery. */
+	TRX_DICT_OP_TABLE = 1,
+	/** The transaction is creating or dropping an index in an
+	existing table.  In crash recovery, the data dictionary
+	must be locked, but the table must not be dropped. */
+	TRX_DICT_OP_INDEX = 2
+} trx_dict_op_t;
+
+/**********************************************************************//**
+Determine if a transaction is a dictionary operation.
+@return	dictionary operation mode */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+	const trx_t*	trx)	/*!< in: transaction */
+	__attribute__((pure));
+/**********************************************************************//**
+Flag a transaction as a dictionary operation. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	enum trx_dict_op	op);	/*!< in: operation, not
+					TRX_DICT_OP_NONE */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determines if the currently running transaction has been interrupted.
+@return	TRUE if interrupted */
+UNIV_INTERN
+ibool
+trx_is_interrupted(
+/*===============*/
+	trx_t*	trx);	/*!< in: transaction */
+/**********************************************************************//**
+Determines if the currently running transaction is in strict mode.
+@return	TRUE if strict */
+UNIV_INTERN
+ibool
+trx_is_strict(
+/*==========*/
+	trx_t*	trx);	/*!< in: transaction */
+#else /* !UNIV_HOTBACKUP */
+#define trx_is_interrupted(trx) FALSE
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Calculates the "weight" of a transaction. The weight of one transaction
+is estimated as the number of altered rows + the number of locked rows.
+@param t	transaction; expanded twice, so it must have no side effects
+@return		transaction weight: undo_no plus the length of trx_locks */
+#define TRX_WEIGHT(t)	\
+	ut_dulint_add((t)->undo_no, UT_LIST_GET_LEN((t)->trx_locks))
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. Transactions that
+have edited non-transactional tables are considered heavier than ones
+that have not.
+@return	<0, 0 or >0; similar to strcmp(3) */
+UNIV_INTERN
+int
+trx_weight_cmp(
+/*===========*/
+	const trx_t*	a,	/*!< in: the first transaction to be compared */
+	const trx_t*	b);	/*!< in: the second transaction to be compared */
+
+/*******************************************************************//**
+Retrieves the transaction's id, represented as unsigned long long.
+@return	transaction's id */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+	const trx_t*	trx);	/*!< in: transaction */
+
+/* Maximum length of a string that can be returned by
+trx_get_que_state_str(). */
+#define TRX_QUE_STATE_STR_MAX_LEN	12 /* "ROLLING BACK" */
+
+/*******************************************************************//**
+Retrieves transaction's que state in a human readable string. The string
+should not be free()'d or modified.
+@return	string in the data segment */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+	const trx_t*	trx);	/*!< in: transaction */
+
+/* Signal to a transaction (see trx_struct::signals, ::reply_signals) */
+struct trx_sig_struct{
+	unsigned	type:3;		/*!< signal type */
+	unsigned	sender:1;	/*!< TRX_SIG_SELF or
+					TRX_SIG_OTHER_SESS */
+	que_thr_t*	receiver;	/*!< non-NULL if the sender of the signal
+					wants reply after the operation induced
+					by the signal is completed */
+	trx_savept_t	savept;		/*!< possible rollback savepoint */
+	UT_LIST_NODE_T(trx_sig_t)
+			signals;	/*!< queue of pending signals to the
+					transaction */
+	UT_LIST_NODE_T(trx_sig_t)
+			reply_signals;	/*!< list of signals for which the sender
+					transaction is waiting for a reply */
+};
+
+#define TRX_MAGIC_N	91118598
+
+/* The transaction handle; every session has a trx object which is freed only
+when the session is freed; in addition there may be session-less transactions
+rolling back after a database recovery */
+
+struct trx_struct{
+	ulint		magic_n;
+
+	/* These fields are not protected by any mutex. */
+	const char*	op_info;	/*!< English text describing the
+					current operation, or an empty
+					string */
+	ulint		conc_state;	/*!< state of the trx from the point
+					of view of concurrency control:
+					TRX_ACTIVE, TRX_COMMITTED_IN_MEMORY,
+					... */
+	ulint		isolation_level;/* TRX_ISO_REPEATABLE_READ, ... */
+	ulint		check_foreigns;	/* normally TRUE, but if the user
+					wants to suppress foreign key checks,
+					(in table imports, for example) we
+					set this FALSE */
+	ulint		check_unique_secondary;
+					/* normally TRUE, but if the user
+					wants to speed up inserts by
+					suppressing unique key checks
+					for secondary indexes when we decide
+					if we can use the insert buffer for
+					them, we set this FALSE */
+	ulint		support_xa;	/*!< normally we do the XA two-phase
+					commit steps, but by setting this to
+					FALSE, one can save CPU time and about
+					150 bytes in the undo log size as then
+					we skip XA steps */
+	ulint		flush_log_at_trx_commit_session;
+	ulint		flush_log_later;/* In 2PC, we hold the
+					prepare_commit mutex across
+					both phases. In that case, we
+					defer flush of the logs to disk
+					until after we release the
+					mutex. */
+	ulint		must_flush_log_later;/* this flag is set to TRUE in
+					trx_commit_off_kernel() if
+					flush_log_later was TRUE, and there
+					were modifications by the transaction;
+					in that case we must flush the log
+					in trx_commit_complete_for_mysql() */
+	ulint		duplicates;	/*!< TRX_DUP_IGNORE | TRX_DUP_REPLACE */
+	ulint		active_trans;	/*!< 1 - if a transaction in MySQL
+					is active. 2 - if prepare_commit_mutex
+					was taken */
+	ulint		has_search_latch;
+					/* TRUE if this trx has latched the
+					search system latch in S-mode */
+	ulint		deadlock_mark;	/*!< a mark field used in deadlock
+					checking algorithm.  */
+	trx_dict_op_t	dict_operation;	/**< @see enum trx_dict_op */
+
+	/* Fields protected by the srv_conc_mutex. */
+	ulint		declared_to_be_inside_innodb;
+					/* this is TRUE if we have declared
+					this transaction in
+					srv_conc_enter_innodb to be inside the
+					InnoDB engine */
+
+	/* Fields protected by dict_operation_lock. The very latch
+	it is used to track. */
+	ulint		dict_operation_lock_mode;
+					/*!< 0, RW_S_LATCH, or RW_X_LATCH:
+					the latch mode trx currently holds
+					on dict_operation_lock */
+
+	/* All the next fields are protected by the kernel mutex, except the
+	undo logs which are protected by undo_mutex */
+	ulint		is_purge;	/*!< 0=user transaction, 1=purge */
+	ulint		is_recovered;	/*!< 0=normal transaction,
+					1=recovered, must be rolled back */
+	ulint		que_state;	/*!< valid when conc_state
+					== TRX_ACTIVE: TRX_QUE_RUNNING,
+					TRX_QUE_LOCK_WAIT, ... */
+	ulint		handling_signals;/* this is TRUE as long as the trx
+					is handling signals */
+	time_t		start_time;	/*!< time the trx object was created
+					or the state last time became
+					TRX_ACTIVE */
+	trx_id_t	id;		/*!< transaction id */
+	XID		xid;		/*!< X/Open XA transaction
+					identification to identify a
+					transaction branch */
+	trx_id_t	no;		/*!< transaction serialization number ==
+					max trx id when the transaction is
+					moved to COMMITTED_IN_MEMORY state */
+	ib_uint64_t	commit_lsn;	/*!< lsn at the time of the commit */
+	trx_id_t	table_id;	/*!< Table to drop iff dict_operation
+					is TRUE, or ut_dulint_zero. */
+	/*------------------------------*/
+	void*		mysql_thd;	/*!< MySQL thread handle corresponding
+					to this trx, or NULL */
+	const char*	mysql_log_file_name;
+					/* if MySQL binlog is used, this field
+					contains a pointer to the latest file
+					name; this is NULL if binlog is not
+					used */
+	ib_int64_t	mysql_log_offset;/* if MySQL binlog is used, this field
+					contains the end offset of the binlog
+					entry */
+	const char*	mysql_master_log_file_name;
+					/* if the database server is a MySQL
+					replication slave, we have here the
+					master binlog name up to which
+					replication has processed; otherwise
+					this is a pointer to a null
+					character */
+	ib_int64_t	mysql_master_log_pos;
+					/* if the database server is a MySQL
+					replication slave, this is the
+					position in the log file up to which
+					replication has processed */
+	const char*	mysql_relay_log_file_name;
+	ib_int64_t	mysql_relay_log_pos;
+
+	os_thread_id_t	mysql_thread_id;/* id of the MySQL thread associated
+					with this transaction object */
+	ulint		mysql_process_no;/* since in Linux, 'top' reports
+					process id's and not thread id's, we
+					store the process number too */
+	/*------------------------------*/
+	ulint		n_mysql_tables_in_use; /* number of Innobase tables
+					used in the processing of the current
+					SQL statement in MySQL */
+	ulint		mysql_n_tables_locked;
+					/* how many tables the current SQL
+					statement uses, except those
+					in consistent read */
+	ulint		search_latch_timeout;
+					/* If we notice that someone is
+					waiting for our S-lock on the search
+					latch to be released, we wait in
+					row0sel.c for BTR_SEA_TIMEOUT new
+					searches until we try to keep
+					the search latch again over
+					calls from MySQL; this is intended
+					to reduce contention on the search
+					latch */
+	/*------------------------------*/
+	ulint		n_tickets_to_enter_innodb;
+					/* this can be > 0 only when
+					declared_to_... is TRUE; when we come
+					to srv_conc_innodb_enter, if the value
+					here is > 0, we decrement this by 1 */
+	/*------------------------------*/
+	UT_LIST_NODE_T(trx_t)
+			trx_list;	/*!< list of transactions */
+	UT_LIST_NODE_T(trx_t)
+			mysql_trx_list;	/*!< list of transactions created for
+					MySQL */
+	/*------------------------------*/
+	ulint		error_state;	/*!< 0 if no error, otherwise error
+					number; NOTE That ONLY the thread
+					doing the transaction is allowed to
+					set this field: this is NOT protected
+					by the kernel mutex */
+	const dict_index_t*error_info;	/*!< if the error number indicates a
+					duplicate key error, a pointer to
+					the problematic index is stored here */
+	ulint		error_key_num;	/*!< if the index creation fails to a
+					duplicate key error, a mysql key
+					number of that index is stored here */
+	sess_t*		sess;		/*!< session of the trx, NULL if none */
+	que_t*		graph;		/*!< query currently run in the session,
+					or NULL if none; NOTE that the query
+					belongs to the session, and it can
+					survive over a transaction commit, if
+					it is a stored procedure with a COMMIT
+					WORK statement, for instance */
+	ulint		n_active_thrs;	/*!< number of active query threads */
+	que_t*		graph_before_signal_handling;
+					/* value of graph when signal handling
+					for this trx started: this is used to
+					return control to the original query
+					graph for error processing */
+	trx_sig_t	sig;		/*!< one signal object can be allocated
+					in this space, avoiding mem_alloc */
+	UT_LIST_BASE_NODE_T(trx_sig_t)
+			signals;	/*!< queue of processed or pending
+					signals to the trx */
+	UT_LIST_BASE_NODE_T(trx_sig_t)
+			reply_signals;	/*!< list of signals sent by the query
+					threads of this trx for which a thread
+					is waiting for a reply; if this trx is
+					killed, the reply requests in the list
+					must be canceled */
+	/*------------------------------*/
+	lock_t*		wait_lock;	/*!< if trx execution state is
+					TRX_QUE_LOCK_WAIT, this points to
+					the lock request, otherwise this is
+					NULL */
+	ibool		was_chosen_as_deadlock_victim;
+					/* when the transaction decides to wait
+					for a lock, it sets this to FALSE;
+					if another transaction chooses this
+					transaction as a victim in deadlock
+					resolution, it sets this to TRUE */
+	time_t		wait_started;	/*!< lock wait started at this time */
+	UT_LIST_BASE_NODE_T(que_thr_t)
+			wait_thrs;	/*!< query threads belonging to this
+					trx that are in the QUE_THR_LOCK_WAIT
+					state */
+	/*------------------------------*/
+	mem_heap_t*	lock_heap;	/*!< memory heap for the locks of the
+					transaction */
+	UT_LIST_BASE_NODE_T(lock_t)
+			trx_locks;	/*!< locks reserved by the transaction */
+	/*------------------------------*/
+	mem_heap_t*	global_read_view_heap;
+					/* memory heap for the global read
+					view */
+	read_view_t*	global_read_view;
+					/* consistent read view associated
+					to a transaction or NULL */
+	read_view_t*	read_view;	/*!< consistent read view used in the
+					transaction or NULL, this read view
+					if defined can be normal read view
+					associated to a transaction (i.e.
+					same as global_read_view) or read view
+					associated to a cursor */
+	/*------------------------------*/
+	UT_LIST_BASE_NODE_T(trx_named_savept_t)
+			trx_savepoints;	/*!< savepoints set with SAVEPOINT ...,
+					oldest first */
+	/*------------------------------*/
+	mutex_t		undo_mutex;	/*!< mutex protecting the fields in this
+					section (down to undo_no_arr), EXCEPT
+					last_sql_stat_start, which can be
+					accessed only when we know that there
+					cannot be any activity in the undo
+					logs! */
+	undo_no_t	undo_no;	/*!< next undo log record number to
+					assign; since the undo log is
+					private for a transaction, this
+					is a simple ascending sequence
+					with no gaps; thus it represents
+					the number of modified/inserted
+					rows in a transaction */
+	trx_savept_t	last_sql_stat_start;
+					/* undo_no when the last sql statement
+					was started: in case of an error, trx
+					is rolled back down to this undo
+					number; see note at undo_mutex! */
+	trx_rseg_t*	rseg;		/*!< rollback segment assigned to the
+					transaction, or NULL if not assigned
+					yet */
+	trx_undo_t*	insert_undo;	/*!< pointer to the insert undo log, or
+					NULL if no inserts performed yet */
+	trx_undo_t*	update_undo;	/*!< pointer to the update undo log, or
+					NULL if no update performed yet */
+	undo_no_t	roll_limit;	/*!< least undo number to undo during
+					a rollback */
+	ulint		pages_undone;	/*!< number of undo log pages undone
+					since the last undo log truncation */
+	trx_undo_arr_t*	undo_no_arr;	/*!< array of undo numbers of undo log
+					records which are currently processed
+					by a rollback operation */
+	/*------------------------------*/
+	ulint		n_autoinc_rows;	/*!< no. of AUTO-INC rows required for
+					an SQL statement. This is useful for
+					multi-row INSERTs */
+	ib_vector_t*    autoinc_locks;  /* AUTOINC locks held by this
+					transaction. Note that these are
+					also in the lock list trx_locks. This
+					vector needs to be freed explicitly
+					when the trx_t instance is destroyed */
+	/*------------------------------*/
+	char detailed_error[256];	/*!< detailed error message for last
+					error, or empty. */
+	/*------------------------------*/
+	ulint		io_reads;
+	ib_uint64_t	io_read;
+	ulint		io_reads_wait_timer;
+	ib_uint64_t	lock_que_wait_ustarted;
+	ulint           lock_que_wait_timer;
+	ulint           innodb_que_wait_timer;
+	ulint           distinct_page_access;
+#define	DPAH_SIZE	8192
+	byte*		distinct_page_access_hash;
+	ibool		take_stats;
+};
+
+#define TRX_MAX_N_THREADS	32	/* maximum number of
+					concurrent threads running a
+					single operation of a
+					transaction, e.g., a parallel
+					query */
+/* Transaction concurrency states (trx->conc_state) */
+#define	TRX_NOT_STARTED		0
+#define	TRX_ACTIVE		1
+#define	TRX_COMMITTED_IN_MEMORY	2
+#define	TRX_PREPARED		3	/* Support for 2PC/XA */
+
+/* Transaction execution states when trx->conc_state == TRX_ACTIVE */
+#define TRX_QUE_RUNNING		0	/* transaction is running */
+#define TRX_QUE_LOCK_WAIT	1	/* transaction is waiting for a lock */
+#define TRX_QUE_ROLLING_BACK	2	/* transaction is rolling back */
+#define TRX_QUE_COMMITTING	3	/* transaction is committing */
+
+/* Transaction isolation levels (trx->isolation_level) */
+#define TRX_ISO_READ_UNCOMMITTED	0	/* dirty read: non-locking
+						SELECTs are performed so that
+						we do not look at a possible
+						earlier version of a record;
+						thus they are not 'consistent'
+						reads under this isolation
+						level; otherwise like level
+						2 */
+
+#define TRX_ISO_READ_COMMITTED		1	/* somewhat Oracle-like
+						isolation, except that in
+						range UPDATE and DELETE we
+						must block phantom rows
+						with next-key locks;
+						SELECT ... FOR UPDATE and ...
+						LOCK IN SHARE MODE only lock
+						the index records, NOT the
+						gaps before them, and thus
+						allow free inserting;
+						each consistent read reads its
+						own snapshot */
+
+#define TRX_ISO_REPEATABLE_READ		2	/* this is the default;
+						all consistent reads in the
+						same trx read the same
+						snapshot;
+						full next-key locking used
+						in locking reads to block
+						insertions into gaps */
+
+#define TRX_ISO_SERIALIZABLE		3	/* all plain SELECTs are
+						converted to LOCK IN SHARE
+						MODE reads */
+
+/* Treatment of duplicate values (trx->duplicates; for example, in inserts).
+Multiple flags can be combined with bitwise OR. */
+#define TRX_DUP_IGNORE	1	/* duplicate rows are to be updated */
+#define TRX_DUP_REPLACE	2	/* duplicate rows are to be replaced */
+
+
+/* Types of a trx signal */
+#define TRX_SIG_NO_SIGNAL		0
+#define TRX_SIG_TOTAL_ROLLBACK		1
+#define TRX_SIG_ROLLBACK_TO_SAVEPT	2
+#define TRX_SIG_COMMIT			3
+#define	TRX_SIG_ERROR_OCCURRED		4
+#define TRX_SIG_BREAK_EXECUTION		5
+
+/* Sender types of a signal */
+#define TRX_SIG_SELF		0	/* sent by the session itself, or
+					by an error occurring within this
+					session */
+#define TRX_SIG_OTHER_SESS	1	/* sent by another session (which
+					must hold rights to this) */
+
+/** Commit node states (commit_node_struct::state); numbering starts at 1 */
+enum commit_node_state {
+	COMMIT_NODE_SEND = 1,	/*!< about to send a commit signal to
+				the transaction */
+	COMMIT_NODE_WAIT	/*!< commit signal sent to the transaction,
+				waiting for completion */
+};
+
+/** Commit command node in a query graph (typedef'd as commit_node_t) */
+struct commit_node_struct{
+	que_common_t	common;	/*!< node type: QUE_NODE_COMMIT */
+	enum commit_node_state
+			state;	/*!< COMMIT_NODE_SEND or COMMIT_NODE_WAIT */
+};
+
+
+
+#ifndef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+#endif /* !UNIV_HOTBACKUP */
+
+#endif
diff --git a/storage/xtradb/include/trx0trx.ic b/storage/xtradb/include/trx0trx.ic
new file mode 100644
index 00000000000..7332eeece85
--- /dev/null
+++ b/storage/xtradb/include/trx0trx.ic
@@ -0,0 +1,164 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0trx.ic
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+/*************************************************************//**
+Starts the transaction with trx_start() if it is still TRX_NOT_STARTED. */
+UNIV_INLINE
+void
+trx_start_if_not_started(
+/*=====================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+	if (trx->conc_state == TRX_NOT_STARTED) {
+		/* NOTE(review): ULINT_UNDEFINED's meaning is set by trx_start(). */
+		trx_start(trx, ULINT_UNDEFINED);
+	}
+}
+
+/*************************************************************//**
+Starts the transaction with trx_start_low() if it is still TRX_NOT_STARTED.
+Assumes the caller has reserved the kernel mutex! */
+UNIV_INLINE
+void
+trx_start_if_not_started_low(
+/*=========================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ut_ad(trx->conc_state != TRX_COMMITTED_IN_MEMORY);
+
+	if (trx->conc_state == TRX_NOT_STARTED) {
+		/* NOTE(review): ULINT_UNDEFINED's meaning is set by trx_start_low(). */
+		trx_start_low(trx, ULINT_UNDEFINED);
+	}
+}
+
+/****************************************************************//**
+Reads the error_info field of a transaction object without modifying it.
+@return	the value of trx->error_info, as a pointer to const dict_index_t */
+UNIV_INLINE
+const dict_index_t*
+trx_get_error_info(
+/*===============*/
+	const trx_t*	trx)	/*!< in: trx object */
+{
+	return(trx->error_info);
+}
+
+/*******************************************************************//**
+Retrieves the transaction's id (trx->id) as an unsigned long long.
+@return	transaction's id, converted with ut_conv_dulint_to_longlong() */
+UNIV_INLINE
+ullint
+trx_get_id(
+/*=======*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	return((ullint)ut_conv_dulint_to_longlong(trx->id));
+}
+
+/*******************************************************************//**
+Maps the transaction's que_state to a human readable string literal. The
+string should not be free()'d or modified.
+@return	one of the literals below; "UNKNOWN" for any unlisted que_state */
+UNIV_INLINE
+const char*
+trx_get_que_state_str(
+/*==================*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	/* be sure to adjust TRX_QUE_STATE_STR_MAX_LEN if you change this */
+	switch (trx->que_state) {
+	case TRX_QUE_RUNNING:
+		return("RUNNING");
+	case TRX_QUE_LOCK_WAIT:
+		return("LOCK WAIT");
+	case TRX_QUE_ROLLING_BACK:
+		return("ROLLING BACK");
+	case TRX_QUE_COMMITTING:
+		return("COMMITTING");
+	default:
+		return("UNKNOWN");
+	}
+}
+
+/**********************************************************************//**
+Reads the dictionary operation mode stored in trx->dict_operation.
+@return	dictionary operation mode, i.e. the field cast to enum trx_dict_op */
+UNIV_INLINE
+enum trx_dict_op
+trx_get_dict_operation(
+/*===================*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	enum trx_dict_op op = (enum trx_dict_op) trx->dict_operation;
+
+#ifdef UNIV_DEBUG
+	switch (op) {	/* UNIV_DEBUG only: accept the three known modes */
+	case TRX_DICT_OP_NONE:
+	case TRX_DICT_OP_TABLE:
+	case TRX_DICT_OP_INDEX:
+		return(op);
+	}
+	ut_error;	/* reached only when op is none of the above */
+#endif /* UNIV_DEBUG */
+	return((enum trx_dict_op) UNIV_EXPECT(op, TRX_DICT_OP_NONE));
+}
+/**********************************************************************//**
+Flags a transaction as a dictionary operation by storing op in trx. */
+UNIV_INLINE
+void
+trx_set_dict_operation(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: transaction */
+	enum trx_dict_op	op)	/*!< in: operation, not
+					TRX_DICT_OP_NONE */
+{
+#ifdef UNIV_DEBUG
+	enum trx_dict_op	old_op = trx_get_dict_operation(trx);
+
+	switch (op) {	/* UNIV_DEBUG only: check the old_op -> op change */
+	case TRX_DICT_OP_NONE:
+		ut_error;	/* NONE must never be passed as op */
+		break;
+	case TRX_DICT_OP_TABLE:	/* accepted after any known old_op */
+		switch (old_op) {
+		case TRX_DICT_OP_NONE:
+		case TRX_DICT_OP_INDEX:
+		case TRX_DICT_OP_TABLE:
+			goto ok;
+		}
+		ut_error;	/* old_op was none of the three known values */
+		break;
+	case TRX_DICT_OP_INDEX:	/* accepted only if nothing was set before */
+		ut_ad(old_op == TRX_DICT_OP_NONE);
+		break;
+	}
+ok:
+#endif /* UNIV_DEBUG */
+
+	trx->dict_operation = op;
+}
diff --git a/storage/xtradb/include/trx0types.h b/storage/xtradb/include/trx0types.h
new file mode 100644
index 00000000000..40a7256cbfd
--- /dev/null
+++ b/storage/xtradb/include/trx0types.h
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0types.h
+Transaction system global type definitions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0types_h
+#define trx0types_h
+
+#include "ut0byte.h"
+
+/** prepare trx_t::id for being printed via printf(3) with TRX_ID_FMT */
+#define TRX_ID_PREP_PRINTF(id)	(ullint) ut_conv_dulint_to_longlong(id)
+
+/** printf(3) format used for printing the value of TRX_ID_PREP_PRINTF() */
+#define TRX_ID_FMT		"%llX"
+
+/** maximum length that a formatted trx_t::id could take, not including
+the terminating NUL character. */
+#define TRX_ID_MAX_LEN		17
+
+/** Memory objects */
+/* @{ */
+/** Transaction */
+typedef struct trx_struct	trx_t;
+/** Transaction system */
+typedef struct trx_sys_struct	trx_sys_t;
+/** Doublewrite information */
+typedef struct trx_doublewrite_struct	trx_doublewrite_t;
+/** Signal */
+typedef struct trx_sig_struct	trx_sig_t;
+/** Rollback segment */
+typedef struct trx_rseg_struct	trx_rseg_t;
+/** Transaction undo log */
+typedef struct trx_undo_struct	trx_undo_t;
+/** Array of undo numbers of undo records being rolled back or purged */
+typedef struct trx_undo_arr_struct trx_undo_arr_t;
+/** A cell of trx_undo_arr_t */
+typedef struct trx_undo_inf_struct trx_undo_inf_t;
+/** The control structure used in the purge operation */
+typedef struct trx_purge_struct	trx_purge_t;
+/** Rollback command node in a query graph */
+typedef struct roll_node_struct	roll_node_t;
+/** Commit command node in a query graph */
+typedef struct commit_node_struct commit_node_t;
+/** SAVEPOINT command node in a query graph */
+typedef struct trx_named_savept_struct trx_named_savept_t;
+/* @} */
+
+/** Rollback contexts */
+enum trx_rb_ctx {
+	RB_NONE = 0,	/*!< no rollback */
+	RB_NORMAL,	/*!< normal rollback */
+	RB_RECOVERY_PURGE_REC,
+			/*!< rolling back an incomplete transaction,
+			in crash recovery, rolling back an
+			INSERT that was performed by updating a
+			delete-marked record; if the delete-marked record
+			no longer exists in an active read view, it will
+			be purged */
+	RB_RECOVERY	/*!< rolling back an incomplete transaction,
+			in crash recovery */
+};
+
+/** Transaction identifier (DB_TRX_ID, DATA_TRX_ID) */
+typedef dulint	trx_id_t;
+/** Rollback pointer (DB_ROLL_PTR, DATA_ROLL_PTR) */
+typedef dulint	roll_ptr_t;
+/** Undo number */
+typedef dulint	undo_no_t;
+
+/** Transaction savepoint */
+typedef struct trx_savept_struct trx_savept_t;
+/** Transaction savepoint */
+struct trx_savept_struct{
+	undo_no_t	least_undo_no;	/*!< least undo number to undo */
+};
+
+/** File objects */
+/* @{ */
+/** Transaction system header */
+typedef byte	trx_sysf_t;
+/** Rollback segment header */
+typedef byte	trx_rsegf_t;
+/** Undo segment header */
+typedef byte	trx_usegf_t;
+/** Undo log header */
+typedef byte	trx_ulogf_t;
+/** Undo log page header */
+typedef byte	trx_upagef_t;
+
+/** Undo log record */
+typedef	byte	trx_undo_rec_t;
+/* @} */
+
+#endif
diff --git a/storage/xtradb/include/trx0undo.h b/storage/xtradb/include/trx0undo.h
new file mode 100644
index 00000000000..a084f2394b5
--- /dev/null
+++ b/storage/xtradb/include/trx0undo.h
@@ -0,0 +1,551 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.h
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef trx0undo_h
+#define trx0undo_h
+
+#include "univ.i"
+#include "trx0types.h"
+#include "mtr0mtr.h"
+#include "trx0sys.h"
+#include "page0types.h"
+#include "trx0xa.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer.
+@return	roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+	ibool	is_insert,	/*!< in: TRUE if insert undo log */
+	ulint	rseg_id,	/*!< in: rollback segment id */
+	ulint	page_no,	/*!< in: page number */
+	ulint	offset);	/*!< in: offset of the undo entry within page */
+/***********************************************************************//**
+Decodes a roll pointer. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
+	ibool*		is_insert,	/*!< out: TRUE if insert undo log */
+	ulint*		rseg_id,	/*!< out: rollback segment id */
+	ulint*		page_no,	/*!< out: page number */
+	ulint*		offset);	/*!< out: offset of the undo
+					entry within page */
+/***********************************************************************//**
+Returns TRUE if the roll pointer is of the insert type.
+@return	TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+	roll_ptr_t	roll_ptr);	/*!< in: roll pointer */
+#endif /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Writes a roll ptr to an index page. In case that the size changes in
+some future version, this function should be used instead of
+mach_write_... */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+	byte*		ptr,		/*!< in: pointer to memory where
+					written */
+	roll_ptr_t	roll_ptr);	/*!< in: roll ptr */
+/*****************************************************************//**
+Reads a roll ptr from an index page. In case that the roll ptr size
+changes in some future version, this function should be used instead of
+mach_read_...
+@return	roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+	const byte*	ptr);	/*!< in: pointer to memory from where to read */
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Gets an undo log page and x-latches it.
+@return	pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	mtr_t*	mtr);		/*!< in: mtr */
+/******************************************************************//**
+Gets an undo log page and s-latches it.
+@return	pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	mtr_t*	mtr);		/*!< in: mtr */
+/******************************************************************//**
+Returns the previous undo record on the page in the specified log, or
+NULL if none exists.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo log record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the next undo log record on the page in the specified log, or
+NULL if none exists.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo log record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset);/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the last undo record on the page in the specified undo log, or
+NULL if none exists.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset);	/*!< in: undo log header offset on page */
+/******************************************************************//**
+Returns the first undo record on the page in the specified undo log, or
+NULL if none exists.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset);/*!< in: undo log header offset on page */
+/***********************************************************************//**
+Gets the previous record in an undo log.
+@return	undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset,	/*!< in: undo log header offset on page */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***********************************************************************//**
+Gets the next record in an undo log.
+@return	undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset,	/*!< in: undo log header offset on page */
+	mtr_t*		mtr);	/*!< in: mtr */
+/***********************************************************************//**
+Gets the first record in an undo log.
+@return	undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+	ulint	space,	/*!< in: undo log header space */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset,	/*!< in: undo log header offset on page */
+	ulint	mode,	/*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+	mtr_t*	mtr);	/*!< in: mtr */
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return	page number if success, else FIL_NULL */
+UNIV_INTERN
+ulint
+trx_undo_add_page(
+/*==============*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log memory object */
+	mtr_t*		mtr);	/*!< in: mtr which does not have a latch to any
+				undo log page; the caller must have reserved
+				the rollback segment mutex */
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction whose undo log it is */
+	trx_undo_t*	undo,	/*!< in: undo log */
+	undo_no_t	limit);	/*!< in: all undo records with undo number
+				>= this value should be truncated */
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	ulint		space,		/*!< in: space id of the log */
+	ulint		hdr_page_no,	/*!< in: header page number */
+	ulint		hdr_offset,	/*!< in: header offset on the page */
+	undo_no_t	limit);		/*!< in: all undo pages with
+					undo numbers < this value
+					should be truncated; NOTE that
+					the function only frees whole
+					pages; the header page is not
+					freed, but emptied, if all the
+					records there are < limit */
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy.
+This function is only called when the database is started or a new
+rollback segment created.
+@return	the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+	trx_rseg_t*	rseg);	/*!< in: rollback segment memory object */
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if undo log assign successful, possible error codes
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+	trx_t*		trx,	/*!< in: transaction */
+	ulint		type);	/*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return	undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+	trx_rseg_t*	rseg,	/*!< in: rollback segment memory object */
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log memory copy */
+	mtr_t*		mtr);	/*!< in: mtr */
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return	undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log memory copy */
+	mtr_t*		mtr);	/*!< in: mtr */
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+	trx_t*	trx,		/*!< in: trx owning the update undo log */
+	page_t*	undo_page,	/*!< in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr);		/*!< in: mtr */
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+	trx_t*	trx);	/*!< in: transaction handle */
+#endif /* !UNIV_HOTBACKUP */
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr);	/*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+	ulint	type,	/*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr);	/*!< in: mtr or NULL */
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr);	/*!< in: mtr or NULL */
+/********************************************************************//**
+Frees an undo log memory copy. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+	trx_undo_t*	undo);		/*!< in: the undo object to be freed */
+
+/* Types of an undo log segment (see trx_undo_struct::type) */
+#define	TRX_UNDO_INSERT		1	/* contains undo entries for inserts */
+#define	TRX_UNDO_UPDATE		2	/* contains undo entries for updates
+					and delete markings: in short,
+					modifications (the name 'UPDATE' is a
+					historical relic) */
+/* States of an undo log segment (see trx_undo_struct::state) */
+#define TRX_UNDO_ACTIVE		1	/* contains an undo log of an active
+					transaction */
+#define	TRX_UNDO_CACHED		2	/* cached for quick reuse */
+#define	TRX_UNDO_TO_FREE	3	/* insert undo segment can be freed */
+#define	TRX_UNDO_TO_PURGE	4	/* update undo segment will not be
+					reused: it can be freed in purge when
+					all undo data in it is removed */
+#define	TRX_UNDO_PREPARED	5	/* contains an undo log of a
+					prepared transaction */
+
+#ifndef UNIV_HOTBACKUP
+/** Transaction undo log memory object; this is protected by the undo_mutex
+in the corresponding transaction object */
+
+struct trx_undo_struct{
+	/*-----------------------------*/
+	ulint		id;		/*!< undo log slot number within the
+					rollback segment */
+	ulint		type;		/*!< TRX_UNDO_INSERT or
+					TRX_UNDO_UPDATE */
+	ulint		state;		/*!< state of the corresponding undo log
+					segment */
+	ibool		del_marks;	/*!< relevant only in an update undo log:
+					this is TRUE if the transaction may
+					have delete marked records, because of
+					a delete of a row or an update of an
+					indexed field; purge is then
+					necessary; also TRUE if the transaction
+					has updated an externally stored
+					field */
+	trx_id_t	trx_id;		/*!< id of the trx assigned to the undo
+					log */
+	XID		xid;		/*!< X/Open XA transaction
+					identification */
+	ibool		dict_operation;	/*!< TRUE if a dict operation trx */
+	dulint		table_id;	/*!< if a dict operation, then the table
+					id */
+	trx_rseg_t*	rseg;		/*!< rseg where the undo log belongs */
+	/*-----------------------------*/
+	ulint		space;		/*!< space id where the undo log
+					placed */
+	ulint		zip_size;	/*!< compressed page size of space
+					in bytes, or 0 for uncompressed */
+	ulint		hdr_page_no;	/*!< page number of the header page in
+					the undo log */
+	ulint		hdr_offset;	/*!< header offset of the undo log on the
+					page */
+	ulint		last_page_no;	/*!< page number of the last page in the
+					undo log; this may differ from
+					top_page_no during a rollback */
+	ulint		size;		/*!< current size in pages */
+	/*-----------------------------*/
+	ulint		empty;		/*!< TRUE if the stack of undo log
+					records is currently empty */
+	ulint		top_page_no;	/*!< page number where the latest undo
+					log record was catenated; during
+					rollback the page from which the latest
+					undo record was chosen */
+	ulint		top_offset;	/*!< offset of the latest undo record,
+					i.e., the topmost element in the undo
+					log if we think of it as a stack */
+	undo_no_t	top_undo_no;	/*!< undo number of the latest record */
+	buf_block_t*	guess_block;	/*!< guess for the buffer block where
+					the top page might reside */
+	/*-----------------------------*/
+	UT_LIST_NODE_T(trx_undo_t) undo_list;
+					/*!< undo log objects in the rollback
+					segment are chained into lists */
+};
+#endif /* !UNIV_HOTBACKUP */
+
+/** The offset of the undo log page header on pages of the undo log */
+#define	TRX_UNDO_PAGE_HDR	FSEG_PAGE_DATA
+/*-------------------------------------------------------------*/
+/** Transaction undo log page header offsets */
+/* @{ */
+#define	TRX_UNDO_PAGE_TYPE	0	/*!< TRX_UNDO_INSERT or
+					TRX_UNDO_UPDATE */
+#define	TRX_UNDO_PAGE_START	2	/*!< Byte offset where the undo log
+					records for the LATEST transaction
+					start on this page (remember that
+					in an update undo log, the first page
+					can contain several undo logs) */
+#define	TRX_UNDO_PAGE_FREE	4	/*!< On each page of the undo log this
+					field contains the byte offset of the
+					first free byte on the page */
+#define TRX_UNDO_PAGE_NODE	6	/*!< The file list node in the chain
+					of undo log pages */
+/*-------------------------------------------------------------*/
+#define TRX_UNDO_PAGE_HDR_SIZE	(6 + FLST_NODE_SIZE)
+					/*!< Size of the transaction undo
+					log page header, in bytes */
+/* @} */
+
+/** An update undo segment with just one page can be reused if it has
+at most this many bytes used; we must leave space at least for one new undo
+log header on the page */
+
+#define TRX_UNDO_PAGE_REUSE_LIMIT	(3 * UNIV_PAGE_SIZE / 4)
+
+/* An update undo log segment may contain several undo logs on its first page
+if the undo logs took so little space that the segment could be cached and
+reused. All the undo log headers are then on the first page, and the last one
+owns the undo log records on subsequent pages if the segment is bigger than
+one page. If an undo log is stored in a segment, then on the first page it is
+allowed to have zero undo records, but if the segment extends to several
+pages, then all the rest of the pages must contain at least one undo log
+record. */
+
+/** The offset of the undo log segment header on the first page of the undo
+log segment */
+
+#define	TRX_UNDO_SEG_HDR	(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE)
+/** Undo log segment header */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define	TRX_UNDO_STATE		0	/*!< TRX_UNDO_ACTIVE, ... */
+#define	TRX_UNDO_LAST_LOG	2	/*!< Offset of the last undo log header
+					on the segment header page, 0 if
+					none */
+#define	TRX_UNDO_FSEG_HEADER	4	/*!< Header for the file segment which
+					the undo log segment occupies */
+#define	TRX_UNDO_PAGE_LIST	(4 + FSEG_HEADER_SIZE)
+					/*!< Base node for the list of pages in
+					the undo log segment; defined only on
+					the undo log segment's first page */
+/*-------------------------------------------------------------*/
+/** Size of the undo log segment header */
+#define TRX_UNDO_SEG_HDR_SIZE	(4 + FSEG_HEADER_SIZE + FLST_BASE_NODE_SIZE)
+/* @} */
+
+
+/** The undo log header. There can be several undo log headers on the first
+page of an update undo log segment. */
+/* @{ */
+/*-------------------------------------------------------------*/
+#define	TRX_UNDO_TRX_ID		0	/*!< Transaction id */
+#define	TRX_UNDO_TRX_NO		8	/*!< Transaction number of the
+					transaction; defined only if the log
+					is in a history list */
+#define TRX_UNDO_DEL_MARKS	16	/*!< Defined only in an update undo
+					log: TRUE if the transaction may have
+					done delete markings of records, and
+					thus purge is necessary */
+#define	TRX_UNDO_LOG_START	18	/*!< Offset of the first undo log record
+					of this log on the header page; purge
+					may remove undo log records from the
+					log start, and therefore this is not
+					necessarily the same as this log
+					header end offset */
+#define	TRX_UNDO_XID_EXISTS	20	/*!< TRUE if undo log header includes
+					X/Open XA transaction identification
+					XID */
+#define	TRX_UNDO_DICT_TRANS	21	/*!< TRUE if the transaction is a table
+					create, index create, or drop
+					transaction: in recovery
+					the transaction cannot be rolled back
+					in the usual way: a 'rollback' rather
+					means dropping the created or dropped
+					table, if it still exists */
+#define TRX_UNDO_TABLE_ID	22	/*!< Id of the table if the preceding
+					field is TRUE */
+#define	TRX_UNDO_NEXT_LOG	30	/*!< Offset of the next undo log header
+					on this page, 0 if none */
+#define	TRX_UNDO_PREV_LOG	32	/*!< Offset of the previous undo log
+					header on this page, 0 if none */
+#define TRX_UNDO_HISTORY_NODE	34	/*!< If the log is put to the history
+					list, the file list node is here */
+/*-------------------------------------------------------------*/
+/** Size of the undo log header without XID information */
+#define TRX_UNDO_LOG_OLD_HDR_SIZE (34 + FLST_NODE_SIZE)
+
+/* Note: the writing of the undo log old header is coded by a log record
+MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE. The appending of an XID to the
+header is logged separately. In this sense, the XID is not really a member
+of the undo log header. TODO: do not append the XID to the log header if XA
+is not needed by the user. The XID wastes about 150 bytes of space in every
+undo log. In the history list we may have millions of undo logs, which means
+quite a large overhead. */
+
+/** X/Open XA Transaction Identification (XID) */
+/* @{ */
+/** xid_t::formatID */
+#define	TRX_UNDO_XA_FORMAT	(TRX_UNDO_LOG_OLD_HDR_SIZE)
+/** xid_t::gtrid_length */
+#define	TRX_UNDO_XA_TRID_LEN	(TRX_UNDO_XA_FORMAT + 4)
+/** xid_t::bqual_length */
+#define	TRX_UNDO_XA_BQUAL_LEN	(TRX_UNDO_XA_TRID_LEN + 4)
+/** Distributed transaction identifier data */
+#define	TRX_UNDO_XA_XID		(TRX_UNDO_XA_BQUAL_LEN + 4)
+/*--------------------------------------------------------------*/
+#define TRX_UNDO_LOG_XA_HDR_SIZE (TRX_UNDO_XA_XID + XIDDATASIZE)
+				/*!< Total size of the undo log header
+				with the XA XID */
+/* @} */
+
+#ifndef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/trx0undo.ic b/storage/xtradb/include/trx0undo.ic
new file mode 100644
index 00000000000..2d289b34ef1
--- /dev/null
+++ b/storage/xtradb/include/trx0undo.ic
@@ -0,0 +1,351 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/trx0undo.ic
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "data0type.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Builds a roll pointer from its parts; inputs are range-checked in debug builds.
+@return	roll pointer */
+UNIV_INLINE
+roll_ptr_t
+trx_undo_build_roll_ptr(
+/*====================*/
+	ibool	is_insert,	/*!< in: TRUE if insert undo log */
+	ulint	rseg_id,	/*!< in: rollback segment id, must be < 128 */
+	ulint	page_no,	/*!< in: page number */
+	ulint	offset)		/*!< in: offset of the undo entry within page */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+	/* each field must fit its slot or it corrupts the neighbouring one */
+	ut_ad(is_insert == TRUE || is_insert == FALSE);
+	ut_ad(rseg_id < 128);
+	ut_ad(offset < 256 * 256);
+	return(ut_dulint_create(is_insert * 128 * 256 * 256
+				+ rseg_id * 256 * 256 + (page_no / 256) / 256,
+				(page_no % (256 * 256)) * 256 * 256 + offset));
+}
+
+/***********************************************************************//**
+Splits a roll pointer into the four fields that are packed into it. */
+UNIV_INLINE
+void
+trx_undo_decode_roll_ptr(
+/*=====================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer */
+	ibool*		is_insert,	/*!< out: TRUE if insert undo log */
+	ulint*		rseg_id,	/*!< out: rollback segment id */
+	ulint*		page_no,	/*!< out: page number */
+	ulint*		offset)		/*!< out: offset of the undo
+					entry within page */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+	const ulint	hi_word = ut_dulint_get_high(roll_ptr);
+	const ulint	lo_word = ut_dulint_get_low(roll_ptr);
+
+	/* low word: 16-bit offset at the bottom, page_no bits above it */
+	*offset = lo_word % (256 * 256);
+
+	/* high word: insert flag, 7-bit rseg id, remaining page_no bits */
+	*is_insert = hi_word / (128 * 256 * 256);	/* TRUE == 1 */
+	*rseg_id = (hi_word / (256 * 256)) % 128;
+
+	*page_no = (lo_word / 256) / 256
+		+ (hi_word % (256 * 256)) * 256 * 256;
+}
+
+/***********************************************************************//**
+Tells whether a roll pointer refers to an insert undo log.
+@return	TRUE if insert undo log */
+UNIV_INLINE
+ibool
+trx_undo_roll_ptr_is_insert(
+/*========================*/
+	roll_ptr_t	roll_ptr)	/*!< in: roll pointer */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+	/* the flag is what remains of the high word above its low 23 bits */
+	const ulint	flag_unit = 128 * 256 * 256;
+
+	return(ut_dulint_get_high(roll_ptr) / flag_unit);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Writes a roll ptr to an index page. Call this wrapper rather than
+mach_write_to_7() directly, so that a future change of the stored
+roll ptr size (DATA_ROLL_PTR_LEN) only has to be handled here. */
+UNIV_INLINE
+void
+trx_write_roll_ptr(
+/*===============*/
+	byte*		ptr,		/*!< in: pointer to the memory the
+					roll ptr is written to */
+	roll_ptr_t	roll_ptr)	/*!< in: roll ptr */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+	mach_write_to_7(ptr, roll_ptr);
+}
+
+/*****************************************************************//**
+Reads a roll ptr from an index page. Call this wrapper rather than
+mach_read_from_7() directly, so that a future change of the stored
+roll ptr size (DATA_ROLL_PTR_LEN) only has to be handled here.
+@return	roll ptr */
+UNIV_INLINE
+roll_ptr_t
+trx_read_roll_ptr(
+/*==============*/
+	const byte*	ptr)	/*!< in: pointer to memory from where to read */
+{
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+	return(mach_read_from_7(ptr));
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Fetches an undo log page with buf_page_get(), requesting RW_X_LATCH.
+@return	pointer to page x-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get(
+/*==============*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	buf_block_t*	undo_block;
+
+	undo_block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, mtr);
+	buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+	return(buf_block_get_frame(undo_block));
+}
+
+/******************************************************************//**
+Fetches an undo log page with buf_page_get(), requesting RW_S_LATCH.
+@return	pointer to page s-latched */
+UNIV_INLINE
+page_t*
+trx_undo_page_get_s_latched(
+/*========================*/
+	ulint	space,		/*!< in: space where placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	buf_block_t*	undo_block;
+
+	undo_block = buf_page_get(space, zip_size, page_no, RW_S_LATCH, mtr);
+	buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+	return(buf_block_get_frame(undo_block));
+}
+
+/******************************************************************//**
+Finds the page offset at which the records of the specified undo log
+begin on the given page.
+@return	start offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_start(
+/*====================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset)	/*!< in: undo log header offset on page */
+{
+	if (page_no != page_get_page_no(undo_page)) {
+		/* not the page holding the log header: the records
+		start right behind the undo page header */
+		return(TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+	}
+
+	/* on the header page the log header itself stores the offset
+	of its first record (the TRX_UNDO_LOG_START field) */
+
+	return(mach_read_from_2(undo_page + offset
+				+ TRX_UNDO_LOG_START));
+}
+
+/******************************************************************//**
+Finds the page offset at which the records of the specified undo log
+end on the given page.
+@return	end offset */
+UNIV_INLINE
+ulint
+trx_undo_page_get_end(
+/*==================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset)	/*!< in: undo log header offset on page */
+{
+	trx_ulogf_t*	hdr;
+	ulint		next_log = 0;
+
+	if (page_no == page_get_page_no(undo_page)) {
+		/* on the header page the records of this log stop where
+		the next undo log header, if there is one, begins */
+		hdr = undo_page + offset;
+
+		next_log = mach_read_from_2(hdr + TRX_UNDO_NEXT_LOG);
+	}
+
+	if (next_log != 0) {
+
+		return(next_log);
+	}
+
+	/* otherwise they extend up to the first free byte of the page */
+	return(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				+ TRX_UNDO_PAGE_FREE));
+}
+
+/******************************************************************//**
+Steps back to the undo record that precedes rec on the same page within
+the specified log.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_prev_rec(
+/*=======================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo log record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset)	/*!< in: undo log header offset on page */
+{
+	page_t*	page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+	ulint	first_rec_offs;
+
+	first_rec_offs = trx_undo_page_get_start(page, page_no, offset);
+
+	if (rec != page + first_rec_offs) {
+		/* the two bytes just before a record hold the page
+		offset of the record preceding it */
+		return(page + mach_read_from_2(rec - 2));
+	}
+
+	/* rec is already the first record of this log on the page */
+	return(NULL);
+}
+
+/******************************************************************//**
+Steps forward to the undo record that follows rec on the same page within
+the specified log.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_next_rec(
+/*=======================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo log record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset)	/*!< in: undo log header offset on page */
+{
+	page_t*	page = (page_t*) ut_align_down(rec, UNIV_PAGE_SIZE);
+	ulint	log_end;
+	ulint	next_offs;
+
+	log_end = trx_undo_page_get_end(page, page_no, offset);
+
+	/* a record begins with the 2-byte page offset of its successor */
+	next_offs = mach_read_from_2(rec);
+
+	if (next_offs != log_end) {
+
+		return(page + next_offs);
+	}
+
+	/* the successor offset is the log end: rec was the last one */
+	return(NULL);
+}
+
+/******************************************************************//**
+Locates the final undo record that the specified undo log has on the
+given page.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_last_rec(
+/*=======================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset)	/*!< in: undo log header offset on page */
+{
+	ulint	rec_start;
+	ulint	rec_end;
+
+	rec_start = trx_undo_page_get_start(undo_page, page_no, offset);
+	rec_end = trx_undo_page_get_end(undo_page, page_no, offset);
+
+	if (rec_start != rec_end) {
+		/* the 2 bytes below the end hold the last record's offset */
+		return(undo_page + mach_read_from_2(undo_page + rec_end - 2));
+	}
+
+	return(NULL);	/* empty range: no records of this log here */
+}
+
+/******************************************************************//**
+Locates the first undo record that the specified undo log has on the
+given page.
+@return	pointer to record, NULL if none */
+UNIV_INLINE
+trx_undo_rec_t*
+trx_undo_page_get_first_rec(
+/*========================*/
+	page_t*	undo_page,/*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset)	/*!< in: undo log header offset on page */
+{
+	ulint	first_offs;
+	ulint	limit_offs;
+
+	first_offs = trx_undo_page_get_start(undo_page, page_no, offset);
+	limit_offs = trx_undo_page_get_end(undo_page, page_no, offset);
+
+	/* equal offsets mean this log has no records on the page */
+	if (first_offs != limit_offs) {
+		return(undo_page + first_offs);
+	}
+
+	return(NULL);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/include/trx0xa.h b/storage/xtradb/include/trx0xa.h
new file mode 100644
index 00000000000..e0dd8a1af5b
--- /dev/null
+++ b/storage/xtradb/include/trx0xa.h
@@ -0,0 +1,70 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*
+ * Start of xa.h header
+ *
+ * Define a symbol to prevent multiple inclusions of this header file
+ */
+#ifndef	XA_H
+#define	XA_H
+
+/*
+ * Transaction branch identification: XID and NULLXID:
+ */
+#ifndef XIDDATASIZE
+
+/** Sizes of transaction identifier */
+#define	XIDDATASIZE	128		/*!< maximum size of a transaction
+					identifier, in bytes */
+#define	MAXGTRIDSIZE	 64		/*!< maximum size in bytes of gtrid */
+#define	MAXBQUALSIZE	 64		/*!< maximum size in bytes of bqual */
+
+/** X/Open XA distributed transaction identifier */
+struct xid_t {
+	long formatID;			/*!< format identifier; -1
+					means that the XID is null */
+	long gtrid_length;		/*!< 1 through 64 (MAXGTRIDSIZE) */
+	long bqual_length;		/*!< 1 through 64 (MAXBQUALSIZE) */
+	char data[XIDDATASIZE];		/*!< distributed transaction
+					identifier bytes */
+};
+/** X/Open XA distributed transaction identifier */
+typedef	struct xid_t XID;
+#endif
+/** X/Open XA distributed transaction status codes */
+/* @{ */
+#define	XA_OK		0		/*!< normal execution */
+#define	XAER_ASYNC	-2		/*!< asynchronous operation already
+					outstanding */
+#define	XAER_RMERR	-3		/*!< a resource manager error
+					occurred in the transaction
+					branch */
+#define	XAER_NOTA	-4		/*!< the XID is not valid */
+#define	XAER_INVAL	-5		/*!< invalid arguments were given */
+#define	XAER_PROTO	-6		/*!< routine invoked in an improper
+					context */
+#define	XAER_RMFAIL	-7		/*!< resource manager unavailable */
+#define	XAER_DUPID	-8		/*!< the XID already exists */
+#define	XAER_OUTSIDE	-9		/*!< resource manager doing
+					work outside transaction */
+/* @} */
+#endif /* ifndef XA_H */
+/*
+ * End of xa.h header
+ */
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
new file mode 100644
index 00000000000..8691e3cf337
--- /dev/null
+++ b/storage/xtradb/include/univ.i
@@ -0,0 +1,501 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***********************************************************************//**
+@file include/univ.i
+Version control for database, common definitions, and include files
+
+Created 1/20/1994 Heikki Tuuri
+****************************************************************************/
+
+#ifndef univ_i
+#define univ_i
+
+#ifdef UNIV_HOTBACKUP
+#include "hb_univ.i"
+#endif /* UNIV_HOTBACKUP */
+
+#define INNODB_VERSION_MAJOR	1
+#define INNODB_VERSION_MINOR	0
+#define INNODB_VERSION_BUGFIX	12
+#define PERCONA_INNODB_VERSION 12.1
+
+/* The following is the InnoDB version as shown in
+SELECT plugin_version FROM information_schema.plugins;
+calculated in make_version_string() in sql/sql_show.cc like this:
+"version >> 8" . "version & 0xff"
+because the version is shown with only one dot, we skip the last
+component, i.e. we show M.N.P as M.N */
+#define INNODB_VERSION_SHORT	\
+	(INNODB_VERSION_MAJOR << 8 | INNODB_VERSION_MINOR)
+
+/* auxiliary macros to help creating the version as string */
+#define __INNODB_VERSION(a, b, c, d)   (#a "." #b "." #c "-" #d)
+#define _INNODB_VERSION(a, b, c, d)    __INNODB_VERSION(a, b, c, d)
+
+
+#define INNODB_VERSION_STR			\
+	_INNODB_VERSION(INNODB_VERSION_MAJOR,	\
+			INNODB_VERSION_MINOR,	\
+			INNODB_VERSION_BUGFIX,  \
+			PERCONA_INNODB_VERSION)
+
+#define REFMAN "http://dev.mysql.com/doc/refman/5.1/en/"
+
+#ifdef MYSQL_DYNAMIC_PLUGIN
+/* In the dynamic plugin, redefine some externally visible symbols
+in order not to conflict with the symbols of a builtin InnoDB. */
+
+/* Rename all C++ classes that contain virtual functions, because we
+have not figured out how to apply the visibility=hidden attribute to
+the virtual method table (vtable) in GCC 3. */
+# define ha_innobase ha_innodb
+#endif /* MYSQL_DYNAMIC_PLUGIN */
+
+/* if any of the following macros is defined at this point this means
+that the code from the "right" plug.in was executed and we do not
+need to include ut0auxconf.h which would either define the same macros
+or will be empty */
+#if !defined(HAVE_IB_GCC_ATOMIC_BUILTINS) \
+ && !defined(HAVE_IB_ATOMIC_PTHREAD_T_GCC) \
+ && !defined(HAVE_IB_SOLARIS_ATOMICS) \
+ && !defined(HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS) \
+ && !defined(SIZEOF_PTHREAD_T) \
+ && !defined(HAVE_IB_PAUSE_INSTRUCTION)
+# include "ut0auxconf.h"
+#endif
+
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && !defined(MYSQL_SERVER) && !defined(__WIN__)
+# undef __WIN__
+# define __WIN__
+
+# include <windows.h>
+
+# ifdef _NT_
+#  define __NT__
+# endif
+
+#else
+/* The defines used with MySQL */
+
+/* Include two header files from MySQL to make the Unix flavor used
+in compiling more Posix-compatible. These headers also define __WIN__
+if we are compiling on Windows. */
+
+#ifndef UNIV_HOTBACKUP
+# include <my_global.h>
+# include <my_pthread.h>
+#endif /* UNIV_HOTBACKUP */
+
+/* Include <sys/stat.h> to get S_I... macros defined for os0file.c */
+# include <sys/stat.h>
+# if !defined(__NETWARE__) && !defined(__WIN__)
+#  include <sys/mman.h> /* mmap() for os0proc.c */
+# endif
+
+/* Include the header file generated by GNU autoconf */
+# ifndef __WIN__
+#  ifndef UNIV_HOTBACKUP
+#   include "config.h"
+#  endif /* UNIV_HOTBACKUP */
+# endif
+
+# ifdef HAVE_SCHED_H
+#  include <sched.h>
+# endif
+
+/* We only try to do explicit inlining of functions with gcc and
+Sun Studio */
+
+# if !defined(__GNUC__) && !(defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+#  undef  UNIV_MUST_NOT_INLINE			/* Remove compiler warning */
+#  define UNIV_MUST_NOT_INLINE
+# endif
+
+# ifdef HAVE_PREAD
+#  define HAVE_PWRITE
+# endif
+
+#endif /* #if (defined(WIN32) || ... */
+
+/*			DEBUG VERSION CONTROL
+			===================== */
+
+/* The following flag makes InnoDB initialize
+all memory it allocates to zero. It hides Purify
+warnings about reading unallocated memory unless
+memory is read outside the allocated blocks. */
+/*
+#define UNIV_INIT_MEM_TO_ZERO
+*/
+
+/* When this macro is defined then additional test functions will be
+compiled. These functions live at the end of each relevant source file
+and have "test_" prefix. These functions are not called from anywhere in
+the code, they can be called from gdb after
+innobase_start_or_create_for_mysql() has executed using the call
+command. Not tested on Windows. */
+/*
+#define UNIV_COMPILE_TEST_FUNCS
+*/
+
+#if defined(HAVE_valgrind)&& defined(HAVE_VALGRIND_MEMCHECK_H)
+# define UNIV_DEBUG_VALGRIND
+#endif
+#if 0
+#define UNIV_DEBUG_VALGRIND			/* Enable extra
+						Valgrind instrumentation */
+#define UNIV_DEBUG_PRINT			/* Enable the compilation of
+						some debug print functions */
+#define UNIV_AHI_DEBUG				/* Enable adaptive hash index
+						debugging without UNIV_DEBUG */
+#define UNIV_BUF_DEBUG				/* Enable buffer pool
+						debugging without UNIV_DEBUG */
+#define UNIV_DEBUG				/* Enable ut_ad() assertions
+						and disable UNIV_INLINE */
+#define UNIV_DEBUG_LOCK_VALIDATE		/* Enable
+						ut_ad(lock_rec_validate_page())
+						assertions. */
+#define UNIV_DEBUG_FILE_ACCESSES		/* Debug .ibd file access
+						(field file_page_was_freed
+						in buf_page_t) */
+#define UNIV_LRU_DEBUG				/* debug the buffer pool LRU */
+#define UNIV_HASH_DEBUG				/* debug HASH_ macros */
+#define UNIV_LIST_DEBUG				/* debug UT_LIST_ macros */
+#define UNIV_LOG_LSN_DEBUG			/* write LSN to the redo log;
+this will break redo log file compatibility, but it may be useful when
+debugging redo log application problems. */
+#define UNIV_MEM_DEBUG				/* detect memory leaks etc */
+#define UNIV_IBUF_DEBUG				/* debug the insert buffer */
+#define UNIV_IBUF_COUNT_DEBUG			/* debug the insert buffer;
+this limits the database to IBUF_COUNT_N_SPACES and IBUF_COUNT_N_PAGES,
+and the insert buffer must be empty when the database is started */
+#define UNIV_SYNC_DEBUG				/* debug mutex and latch
+operations (very slow); also UNIV_DEBUG must be defined */
+#define UNIV_SEARCH_DEBUG			/* debug B-tree comparisons */
+#define UNIV_SYNC_PERF_STAT			/* operation counts for
+						rw-locks and mutexes */
+#define UNIV_SEARCH_PERF_STAT			/* statistics for the
+						adaptive hash index */
+#define UNIV_SRV_PRINT_LATCH_WAITS		/* enable diagnostic output
+						in sync0sync.c */
+#define UNIV_BTR_PRINT				/* enable functions for
+						printing B-trees */
+#define UNIV_ZIP_DEBUG				/* extensive consistency checks
+						for compressed pages */
+#define UNIV_ZIP_COPY				/* call page_zip_copy_recs()
+						more often */
+#endif
+
+#define UNIV_BTR_DEBUG				/* check B-tree links */
+#define UNIV_LIGHT_MEM_DEBUG			/* light memory debugging */
+
+#ifdef HAVE_valgrind
+/* The following sets all new allocated memory to zero before use:
+this can be used to eliminate unnecessary Purify warnings, but note that
+it also masks many bugs Purify could detect. For detailed Purify analysis it
+is best to remove the define below and look through the warnings one
+by one. */
+#define UNIV_SET_MEM_TO_ZERO
+#endif
+
+/*
+#define UNIV_SQL_DEBUG
+#define UNIV_LOG_DEBUG
+*/
+			/* the above option prevents forcing of log to disk
+			at a buffer page write: it should be tested with this
+			option off; also some ibuf tests are suppressed */
+
+/* Linkage specifier for non-static InnoDB symbols (variables and functions)
+that are only referenced from within InnoDB, not from MySQL */
+#if defined(__GNUC__) && (__GNUC__ >= 4) || defined(__INTEL_COMPILER)
+# define UNIV_INTERN __attribute__((visibility ("hidden")))
+#else
+# define UNIV_INTERN
+#endif
+
+#if (!defined(UNIV_DEBUG) && !defined(UNIV_MUST_NOT_INLINE))
+/* Definition for inline version */
+
+#ifdef __WIN__
+# define UNIV_INLINE	__inline
+#elif defined(__SUNPRO_CC) || defined(__SUNPRO_C)
+# define UNIV_INLINE static inline
+#else
+# define UNIV_INLINE static __inline__
+#endif
+
+#else
+/* If we want to compile a noninlined version we use the following macro
+definitions: */
+
+#define UNIV_NONINL
+#define UNIV_INLINE	UNIV_INTERN
+
+#endif	/* !UNIV_DEBUG && !UNIV_MUST_NOT_INLINE */
+
+#ifdef _WIN64	/* test first: 64-bit Windows compilers define _WIN32 too */
+#define UNIV_WORD_SIZE		8
+#elif defined(_WIN32)
+#define UNIV_WORD_SIZE		4
+#else
+/* MySQL config.h generated by GNU autoconf will define SIZEOF_LONG in Posix */
+#define UNIV_WORD_SIZE		SIZEOF_LONG
+#endif
+
+/* The following alignment is used in memory allocations in memory heap
+management to ensure correct alignment for doubles etc. */
+#define UNIV_MEM_ALIGNMENT      8
+
+/* The following alignment is used in aligning lints etc. */
+#define UNIV_WORD_ALIGNMENT	UNIV_WORD_SIZE
+
+/*
+			DATABASE VERSION CONTROL
+			========================
+*/
+
+/* The 2-logarithm of UNIV_PAGE_SIZE: */
+/* #define UNIV_PAGE_SIZE_SHIFT	14 */
+#define UNIV_PAGE_SIZE_SHIFT_MAX	14
+#define UNIV_PAGE_SIZE_SHIFT	srv_page_size_shift
+/* The universal page size of the database */
+/* #define UNIV_PAGE_SIZE		(1u << UNIV_PAGE_SIZE_SHIFT) */
+#define UNIV_PAGE_SIZE		srv_page_size
+#define UNIV_PAGE_SIZE_MAX	(1u << UNIV_PAGE_SIZE_SHIFT_MAX)
+
+/* Maximum number of parallel threads in a parallelized operation */
+#define UNIV_MAX_PARALLELISM	32
+
+/* The maximum length of a table name. This is the MySQL limit and is
+defined in mysql_com.h like NAME_CHAR_LEN*SYSTEM_CHARSET_MBMAXLEN, the
+number does not include a terminating '\0'. InnoDB probably can handle
+longer names internally */
+#define MAX_TABLE_NAME_LEN	192
+
+/*
+			UNIVERSAL TYPE DEFINITIONS
+			==========================
+*/
+
+/* Note that inside MySQL 'byte' is defined as char on Linux! */
+#define byte			unsigned char
+
+/* Define an unsigned integer type that is exactly 32 bits. */
+
+#if SIZEOF_INT == 4
+typedef unsigned int		ib_uint32_t;
+#elif SIZEOF_LONG == 4
+typedef unsigned long		ib_uint32_t;
+#else
+#error "Neither int or long is 4 bytes"
+#endif
+
+/* Another basic type we use is unsigned long integer which should be equal to
+the word size of the machine, that is on a 32-bit platform 32 bits, and on a
+64-bit platform 64 bits. We also give the printf format for the type as a
+macro ULINTPF. */
+
+#ifdef _WIN64
+typedef unsigned __int64	ulint;
+#define ULINTPF			"%I64u"
+typedef __int64			lint;
+#define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONGLONG
+#else
+typedef unsigned long int	ulint;
+#define ULINTPF			"%lu"
+typedef long int		lint;
+#define MYSQL_SYSVAR_ULINT MYSQL_SYSVAR_ULONG
+#endif
+
+#ifdef __WIN__
+typedef __int64			ib_int64_t;
+typedef unsigned __int64	ib_uint64_t;
+#elif !defined(UNIV_HOTBACKUP)
+/* Note: longlong and ulonglong come from MySQL headers. */
+typedef longlong		ib_int64_t;
+typedef ulonglong		ib_uint64_t;
+#endif
+
+#ifndef UNIV_HOTBACKUP
+typedef unsigned long long int	ullint;
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef __WIN__
+#if SIZEOF_LONG != SIZEOF_VOIDP
+#error "Error: InnoDB's ulint must be of the same size as void*"
+#endif
+#endif
+
+/* The 'undefined' value for a ulint */
+#define ULINT_UNDEFINED		((ulint)(-1))
+
+/* The undefined 32-bit unsigned integer */
+#define	ULINT32_UNDEFINED	0xFFFFFFFF
+
+/* Maximum value for a ulint; (ulint) -1 is taken by ULINT_UNDEFINED */
+#define ULINT_MAX		((ulint)(-2))
+
+/* Maximum value for ib_uint64_t */
+#define IB_ULONGLONG_MAX	((ib_uint64_t) (~0ULL))
+
+/* This 'ibool' type is used within Innobase. Remember that different included
+headers may define 'bool' differently. Do not assume that 'bool' is a ulint! */
+#define ibool			ulint
+
+#ifndef TRUE
+
+#define TRUE    1
+#define FALSE   0
+
+#endif
+
+/* The following number as the length of a logical field means that the field
+has the SQL NULL as its value. NOTE that because we assume that the length
+of a field is a 32-bit integer when we store it, for example, to an undo log
+on disk, we must have also this number fit in 32 bits, also in 64-bit
+computers! */
+
+#define UNIV_SQL_NULL ULINT32_UNDEFINED
+
+/* Lengths which are not UNIV_SQL_NULL, but bigger than the following
+number indicate that a field contains a reference to an externally
+stored part of the field in the tablespace. The length field then
+contains the sum of the following flag and the locally stored len. */
+
+#define UNIV_EXTERN_STORAGE_FIELD (UNIV_SQL_NULL - UNIV_PAGE_SIZE_MAX)
+
+/* Some macros to improve branch prediction and reduce cache misses */
+#if defined(__GNUC__) && (__GNUC__ > 2) && ! defined(__INTEL_COMPILER)
+/* Tell the compiler that 'expr' probably evaluates to 'constant'. */
+# define UNIV_EXPECT(expr,constant) __builtin_expect(expr, constant)
+/* Tell the compiler that a pointer is likely to be NULL */
+# define UNIV_LIKELY_NULL(ptr) __builtin_expect((ulint) (ptr), 0)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read. */
+# define UNIV_PREFETCH_R(addr) __builtin_prefetch(addr, 0, 3)
+/* Minimize cache-miss latency by moving data at addr into a cache before
+it is read or written. */
+# define UNIV_PREFETCH_RW(addr) __builtin_prefetch(addr, 1, 3)
+/* Sun Studio includes sun_prefetch.h as of version 5.9 */
+#elif (defined(__SUNPRO_C) && __SUNPRO_C >= 0x590) \
+       || (defined(__SUNPRO_CC) && __SUNPRO_CC >= 0x590)
+# include <sun_prefetch.h>
+#if __SUNPRO_C >= 0x550
+# undef UNIV_INTERN
+# define UNIV_INTERN __hidden
+#endif /* __SUNPRO_C >= 0x550 */
+/* Use sun_prefetch when compile with Sun Studio */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) sun_prefetch_read_many(addr)
+# define UNIV_PREFETCH_RW(addr) sun_prefetch_write_many(addr)
+#else
+/* Dummy versions of the macros */
+# define UNIV_EXPECT(expr,value) (expr)
+# define UNIV_LIKELY_NULL(expr) (expr)
+# define UNIV_PREFETCH_R(addr) ((void) 0)
+# define UNIV_PREFETCH_RW(addr) ((void) 0)
+#endif
+/* Tell the compiler that cond is likely to hold */
+#define UNIV_LIKELY(cond) UNIV_EXPECT(cond, TRUE)
+/* Tell the compiler that cond is unlikely to hold */
+#define UNIV_UNLIKELY(cond) UNIV_EXPECT(cond, FALSE)
+
+/* Compile-time constant of the given array's size. */
+#define UT_ARR_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/* The return type from a thread's start function differs between Unix and
+Windows, so define a typedef for it and a macro to use at the end of such
+functions. */
+
+#ifdef __WIN__
+typedef ulint os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(0)
+#else
+typedef void* os_thread_ret_t;
+#define OS_THREAD_DUMMY_RETURN return(NULL)
+#endif
+
+#include <stdio.h>
+#include "ut0dbg.h"
+#include "ut0ut.h"
+#include "db0err.h"
+#ifdef UNIV_DEBUG_VALGRIND
+# include <valgrind/memcheck.h>
+# define UNIV_MEM_VALID(addr, size) VALGRIND_MAKE_MEM_DEFINED(addr, size)
+# define UNIV_MEM_INVALID(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_FREE(addr, size) VALGRIND_MAKE_MEM_NOACCESS(addr, size)
+# define UNIV_MEM_ALLOC(addr, size) VALGRIND_MAKE_MEM_UNDEFINED(addr, size)
+# define UNIV_MEM_DESC(addr, size, b) VALGRIND_CREATE_BLOCK(addr, size, b)
+# define UNIV_MEM_UNDESC(b) VALGRIND_DISCARD(b)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {				\
+	const void* _p = (const void*) (ulint)				\
+		VALGRIND_CHECK_MEM_IS_DEFINED(addr, size);		\
+	if (UNIV_LIKELY_NULL(_p))					\
+		fprintf(stderr, "%s:%d: %p[%u] undefined at %ld\n",	\
+			__FILE__, __LINE__,				\
+			(const void*) (addr), (unsigned) (size), (long)	\
+			(((const char*) _p) - ((const char*) (addr))));	\
+	} while (0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {				\
+	const void* _p = (const void*) (ulint)				\
+		VALGRIND_CHECK_MEM_IS_ADDRESSABLE(addr, size);		\
+	if (UNIV_LIKELY_NULL(_p))					\
+		fprintf(stderr, "%s:%d: %p[%u] unwritable at %ld\n",	\
+			__FILE__, __LINE__,				\
+			(const void*) (addr), (unsigned) (size), (long)	\
+			(((const char*) _p) - ((const char*) (addr))));	\
+	} while (0)
+#else
+# define UNIV_MEM_VALID(addr, size) do {} while(0)
+# define UNIV_MEM_INVALID(addr, size) do {} while(0)
+# define UNIV_MEM_FREE(addr, size) do {} while(0)
+# define UNIV_MEM_ALLOC(addr, size) do {} while(0)
+# define UNIV_MEM_DESC(addr, size, b) do {} while(0)
+# define UNIV_MEM_UNDESC(b) do {} while(0)
+# define UNIV_MEM_ASSERT_RW(addr, size) do {} while(0)
+# define UNIV_MEM_ASSERT_W(addr, size) do {} while(0)
+#endif
+#define UNIV_MEM_ASSERT_AND_FREE(addr, size) do {	\
+	UNIV_MEM_ASSERT_W(addr, size);			\
+	UNIV_MEM_FREE(addr, size);			\
+} while (0)
+#define UNIV_MEM_ASSERT_AND_ALLOC(addr, size) do {	\
+	UNIV_MEM_ASSERT_W(addr, size);			\
+	UNIV_MEM_ALLOC(addr, size);			\
+} while (0)
+
+extern ulint	srv_page_size_shift;
+extern ulint	srv_page_size;
+#endif
diff --git a/storage/xtradb/include/usr0sess.h b/storage/xtradb/include/usr0sess.h
new file mode 100644
index 00000000000..2c288f7d455
--- /dev/null
+++ b/storage/xtradb/include/usr0sess.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.h
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0sess_h
+#define usr0sess_h
+
+#include "univ.i"
+#include "ut0byte.h"
+#include "trx0types.h"
+#include "srv0srv.h"
+#include "trx0types.h"
+#include "usr0types.h"
+#include "que0types.h"
+#include "data0data.h"
+#include "rem0rec.h"
+
+/*********************************************************************//**
+Opens a session.
+@return	own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void);
+/*============*/
+/*********************************************************************//**
+Closes a session, freeing the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+	sess_t*		sess);		/* in, own: session object */
+
+/* The session handle. All fields are protected by the kernel mutex */
+struct sess_struct{
+	ulint		state;		/*!< state of the session */
+	trx_t*		trx;		/*!< transaction object permanently
+					assigned for the session: the
+					transaction instance designated by the
+					trx id changes, but the memory
+					structure is preserved */
+	UT_LIST_BASE_NODE_T(que_t)
+			graphs;		/*!< query graphs belonging to this
+					session */
+};
+
+/* Session states */
+#define SESS_ACTIVE		1
+#define SESS_ERROR		2	/* session contains an error message
+					which has not yet been communicated
+					to the client */
+#ifndef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/usr0sess.ic b/storage/xtradb/include/usr0sess.ic
new file mode 100644
index 00000000000..35a75d75acc
--- /dev/null
+++ b/storage/xtradb/include/usr0sess.ic
@@ -0,0 +1,24 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0sess.ic
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
diff --git a/storage/xtradb/include/usr0types.h b/storage/xtradb/include/usr0types.h
new file mode 100644
index 00000000000..6cc6f015613
--- /dev/null
+++ b/storage/xtradb/include/usr0types.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file include/usr0types.h
+Users and sessions global types
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#ifndef usr0types_h
+#define usr0types_h
+
+typedef struct sess_struct	sess_t;
+
+#endif
diff --git a/storage/xtradb/include/ut0auxconf.h b/storage/xtradb/include/ut0auxconf.h
new file mode 100644
index 00000000000..16bcc308392
--- /dev/null
+++ b/storage/xtradb/include/ut0auxconf.h
@@ -0,0 +1,14 @@
+/* Do not remove this file even though it is empty.
+This file is included in univ.i and will cause compilation failure
+if not present.
+Custom checks have been added in the generated
+storage/innobase/Makefile.in that is shipped with the InnoDB Plugin
+source archive. These checks eventually define some macros and put
+them in this file.
+This is a hack that has been developed in order to deploy new compile
+time checks without the need to regenerate the ./configure script that is
+distributed in the MySQL 5.1 official source archives.
+If by any chance Makefile.in and ./configure are regenerated and thus
+the hack from Makefile.in wiped away then the "real" checks from plug.in
+will take over.
+*/
diff --git a/storage/xtradb/include/ut0byte.h b/storage/xtradb/include/ut0byte.h
new file mode 100644
index 00000000000..f55e2888c60
--- /dev/null
+++ b/storage/xtradb/include/ut0byte.h
@@ -0,0 +1,270 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0byte.h
+Utilities for byte operations
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0byte_h
+#define ut0byte_h
+
+
+#include "univ.i"
+
+/** Pair of ulint integers. */
+typedef	struct dulint_struct	dulint;
+/** Type definition for a 64-bit unsigned integer, which works also
+in 32-bit machines. NOTE! Access the fields only with the accessor
+functions. This definition appears here only for the compiler to
+know the size of a dulint. */
+struct dulint_struct{
+	ulint	high;	/*!< most significant 32 bits */
+	ulint	low;	/*!< least significant 32 bits */
+};
+
+/** Zero value for a dulint */
+extern const dulint	ut_dulint_zero;
+
+/** Maximum value for a dulint */
+extern const dulint	ut_dulint_max;
+
+/*******************************************************//**
+Creates a 64-bit dulint out of two ulints.
+@return	created dulint */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+	ulint	high,	/*!< in: high-order 32 bits */
+	ulint	low);	/*!< in: low-order 32 bits */
+/*******************************************************//**
+Gets the high-order 32 bits of a dulint.
+@return	32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+	dulint	d);	/*!< in: dulint */
+/*******************************************************//**
+Gets the low-order 32 bits of a dulint.
+@return	32 bits in ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+	dulint	d);	/*!< in: dulint */
+/*******************************************************//**
+Converts a dulint (a struct of 2 ulints) to ib_int64_t, which is a 64-bit
+integer type.
+@return	value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+	dulint	d);	/*!< in: dulint */
+/*******************************************************//**
+Tests if a dulint is zero.
+@return	TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+	dulint	a);	/*!< in: dulint */
+/*******************************************************//**
+Compares two dulints.
+@return	-1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b);	/*!< in: dulint */
+/*******************************************************//**
+Calculates the max of two dulints.
+@return	max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b);	/*!< in: dulint */
+/*******************************************************//**
+Calculates the min of two dulints.
+@return	min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b);	/*!< in: dulint */
+/*******************************************************//**
+Adds a ulint to a dulint.
+@return	sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+	dulint	a,	/*!< in: dulint */
+	ulint	b);	/*!< in: ulint */
+/*******************************************************//**
+Subtracts a ulint from a dulint.
+@return	a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+	dulint	a,	/*!< in: dulint */
+	ulint	b);	/*!< in: ulint, b <= a */
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must be positive
+and smaller than 4G.
+@return	a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+	dulint	a,	/*!< in: dulint; NOTE a must be >= b and at most
+			2 to power 32 - 1 greater */
+	dulint	b);	/*!< in: dulint */
+/********************************************************//**
+Rounds a dulint downward to a multiple of a power of 2.
+@return	rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+	dulint	 n,		/*!< in: number to be rounded */
+	ulint	 align_no);	/*!< in: align by this number which must be a
+				power of 2 */
+/********************************************************//**
+Rounds a dulint upward to a multiple of a power of 2.
+@return	rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+	dulint	 n,		/*!< in: number to be rounded */
+	ulint	 align_no);	/*!< in: align by this number which must be a
+				power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t downward to a multiple of a power of 2.
+@return	rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no);	/*!< in: align by this number
+					which must be a power of 2 */
+/********************************************************//**
+Rounds ib_uint64_t upward to a multiple of a power of 2.
+@return	rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no);	/*!< in: align by this number
+					which must be a power of 2 */
+/*******************************************************//**
+Increments a dulint variable by 1. */
+#define UT_DULINT_INC(D)\
+{\
+	if ((D).low == 0xFFFFFFFFUL) {\
+		(D).high = (D).high + 1;\
+		(D).low = 0;\
+	} else {\
+		(D).low = (D).low + 1;\
+	}\
+}
+/*******************************************************//**
+Tests if two dulints are equal. */
+#define UT_DULINT_EQ(D1, D2)	(((D1).low == (D2).low)\
+						&& ((D1).high == (D2).high))
+#ifdef notdefined
+/************************************************************//**
+Sort function for dulint arrays. */
+UNIV_INTERN
+void
+ut_dulint_sort(
+/*===========*/
+	dulint*	arr,	/*!< in/out: array to be sorted */
+	dulint*	aux_arr,/*!< in/out: auxiliary array (same size as arr) */
+	ulint	low,	/*!< in: low bound of sort interval, inclusive */
+	ulint	high);	/*!< in: high bound of sort interval, noninclusive */
+#endif /* notdefined */
+
+/*********************************************************//**
+The following function rounds up a pointer to the nearest aligned address.
+@return	aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no);	/*!< in: align by this number */
+/*********************************************************//**
+The following function rounds down a pointer to the nearest
+aligned address.
+@return	aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no)	/*!< in: align by this number */
+		__attribute__((const));
+/*********************************************************//**
+The following function computes the offset of a pointer from the nearest
+aligned address.
+@return	distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no)	/*!< in: align by this number */
+			__attribute__((const));
+/*****************************************************************//**
+Gets the nth bit of a ulint.
+@return	TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n);	/*!< in: nth bit requested */
+/*****************************************************************//**
+Sets the nth bit of a ulint.
+@return	the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n,	/*!< in: nth bit requested */
+	ibool	val);	/*!< in: value for the bit to set */
+
+#ifndef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0byte.ic b/storage/xtradb/include/ut0byte.ic
new file mode 100644
index 00000000000..3dd51890cb4
--- /dev/null
+++ b/storage/xtradb/include/ut0byte.ic
@@ -0,0 +1,411 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0byte.ic
+Utilities for byte operations
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/*******************************************************//**
+Builds a dulint from its high-order and low-order 32-bit halves.
+@return	dulint holding the two halves */
+UNIV_INLINE
+dulint
+ut_dulint_create(
+/*=============*/
+	ulint	high,	/*!< in: high-order 32 bits */
+	ulint	low)	/*!< in: low-order 32 bits */
+{
+	dulint	pair;
+
+	/* Assert that neither half exceeds 32 bits. */
+	ut_ad(high <= 0xFFFFFFFF);
+	ut_ad(low <= 0xFFFFFFFF);
+	pair.low = low;
+	pair.high = high;
+
+	return(pair);
+}
+
+/*******************************************************//**
+Reads the high field of a dulint, i.e. its most significant 32 bits.
+@return	high-order 32 bits as a ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_high(
+/*===============*/
+	dulint	d)	/*!< in: dulint */
+{
+	return(d.high);
+}
+
+/*******************************************************//**
+Reads the low field of a dulint, i.e. its least significant 32 bits.
+@return	low-order 32 bits as a ulint */
+UNIV_INLINE
+ulint
+ut_dulint_get_low(
+/*==============*/
+	dulint	d)	/*!< in: dulint */
+{
+	return(d.low);
+}
+
+/*******************************************************//**
+Converts a dulint to ib_int64_t. The halves are combined as unsigned
+64-bit values, so that a set top bit of d.high is never shifted as signed.
+@return	value in ib_int64_t type */
+UNIV_INLINE
+ib_int64_t
+ut_conv_dulint_to_longlong(
+/*=======================*/
+	dulint	d)	/*!< in: dulint */
+{
+	return((ib_int64_t) ((ib_uint64_t) d.low
+			     + (((ib_uint64_t) d.high) << 32)));
+}
+
+/*******************************************************//**
+Checks whether both halves of a dulint are zero.
+@return	TRUE if zero */
+UNIV_INLINE
+ibool
+ut_dulint_is_zero(
+/*==============*/
+	dulint	a)	/*!< in: dulint */
+{
+	/* A single nonzero half makes the whole value nonzero. */
+	if ((a.high != 0) || (a.low != 0)) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************//**
+Three-way comparison of two dulints, most significant half first.
+@return	-1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_dulint_cmp(
+/*==========*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b)	/*!< in: dulint */
+{
+	/* The high halves decide unless they are equal. */
+	if (a.high != b.high) {
+		return(a.high > b.high ? 1 : -1);
+	}
+
+	/* Equal high halves: the low halves decide. */
+	if (a.low != b.low) {
+		return(a.low > b.low ? 1 : -1);
+	}
+
+	return(0);
+}
+
+/*******************************************************//**
+Picks the larger of two dulints, as ordered by ut_dulint_cmp().
+@return	max(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_max(
+/*==============*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b)	/*!< in: dulint */
+{
+	/* b is returned unless a is strictly greater. */
+	if (ut_dulint_cmp(a, b) <= 0) {
+		return(b);
+	}
+
+	return(a);
+}
+
+/*******************************************************//**
+Picks the smaller of two dulints, as ordered by ut_dulint_cmp().
+@return	min(a, b) */
+UNIV_INLINE
+dulint
+ut_dulint_get_min(
+/*==============*/
+	dulint	a,	/*!< in: dulint */
+	dulint	b)	/*!< in: dulint */
+{
+	/* a is returned unless it is strictly greater than b. */
+	if (ut_dulint_cmp(a, b) <= 0) {
+		return(a);
+	}
+
+	return(b);
+}
+
+/*******************************************************//**
+Adds a ulint to a dulint, carrying from the low half into the high half.
+@return	sum a + b */
+UNIV_INLINE
+dulint
+ut_dulint_add(
+/*==========*/
+	dulint	a,	/*!< in: dulint */
+	ulint	b)	/*!< in: ulint */
+{
+	const ulint	room = 0xFFFFFFFFUL - b;
+
+	if (a.low > room) {
+		/* The low half wraps around: carry into the high half. */
+		a.low -= room + 1;
+		a.high++;
+	} else {
+		a.low += b;
+	}
+
+	return(a);
+}
+
+/*******************************************************//**
+Subtracts a ulint from a dulint, borrowing from the high half if needed.
+@return	a - b */
+UNIV_INLINE
+dulint
+ut_dulint_subtract(
+/*===============*/
+	dulint	a,	/*!< in: dulint */
+	ulint	b)	/*!< in: ulint, b <= a */
+{
+	if (a.low < b) {
+		/* The low half is too small: borrow from the high half.
+		Afterwards b holds the amount still to be taken off the
+		all-ones low half. */
+		b -= a.low + 1;
+
+		a.low = 0xFFFFFFFFUL - b;
+
+		ut_ad(a.high > 0);
+		a.high--;
+	} else {
+		a.low -= b;
+	}
+
+	return(a);
+}
+
+/*******************************************************//**
+Subtracts a dulint from another. NOTE that the difference must not be
+negative and must be smaller than 4G.
+@return	a - b */
+UNIV_INLINE
+ulint
+ut_dulint_minus(
+/*============*/
+	dulint	a,	/*!< in: dulint; NOTE a must be >= b and at most
+			2 to power 32 - 1 greater */
+	dulint	b)	/*!< in: dulint */
+{
+	ulint	diff;
+
+	if (a.high != b.high) {
+		/* The values lie on different sides of a 32-bit boundary:
+		add the distance from b up to it to the low half of a. */
+		ut_ad(a.high == b.high + 1);
+
+		diff = (ulint)(0xFFFFFFFFUL - b.low);
+		diff += 1 + a.low;
+
+		ut_ad(diff > a.low);
+		return(diff);
+	}
+
+	ut_ad(a.low >= b.low);
+	return(a.low - b.low);
+}
+
+/********************************************************//**
+Rounds a dulint down to a multiple of align_no by masking its low half.
+@return	rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_down(
+/*=================*/
+	dulint	 n,		/*!< in: number to be rounded */
+	ulint	 align_no)	/*!< in: align by this number which must be a
+				power of 2 */
+{
+	ulint	mask;
+
+	ut_ad(align_no > 0);
+	ut_ad(((align_no - 1) & align_no) == 0);
+
+	/* Bits to keep in the low half; the high half is passed on
+	unchanged. */
+	mask = ~(align_no - 1);
+
+	return(ut_dulint_create(ut_dulint_get_high(n),
+				ut_dulint_get_low(n) & mask));
+}
+
+/********************************************************//**
+Rounds a dulint up to a multiple of align_no: adds align_no - 1, rounds down.
+@return	rounded value */
+UNIV_INLINE
+dulint
+ut_dulint_align_up(
+/*===============*/
+	dulint	 n,		/*!< in: number to be rounded */
+	ulint	 align_no)	/*!< in: align by this number which must be a
+				power of 2 */
+{
+	return(ut_dulint_align_down(ut_dulint_add(n, align_no - 1), align_no));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t down to a multiple of align_no by removing its remainder.
+@return	rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_down(
+/*=================*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no)	/*!< in: align by this number
+					which must be a power of 2 */
+{
+	ut_ad(align_no > 0);
+	ut_ad(ut_is_2pow(align_no));
+
+	return(n - (n & ((ib_uint64_t) align_no - 1)));
+}
+
+/********************************************************//**
+Rounds ib_uint64_t up to a multiple of align_no.
+@return	rounded value */
+UNIV_INLINE
+ib_uint64_t
+ut_uint64_align_up(
+/*===============*/
+	ib_uint64_t	 n,		/*!< in: number to be rounded */
+	ulint		 align_no)	/*!< in: align by this number
+					which must be a power of 2 */
+{
+	ut_ad(align_no > 0);
+	ut_ad(ut_is_2pow(align_no));
+	/* Bump n by align_no - 1, then clear the bits below align_no. */
+	n += (ib_uint64_t) align_no - 1;
+
+	return(n & ~((ib_uint64_t) align_no - 1));
+}
+
+/*********************************************************//**
+Rounds a pointer up to the nearest address that is a multiple of align_no.
+@return	aligned pointer */
+UNIV_INLINE
+void*
+ut_align(
+/*=====*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no)	/*!< in: align by this number */
+{
+	const ulint	bumped = ((ulint) ptr) + align_no - 1;
+
+	ut_ad(align_no > 0);
+	ut_ad(((align_no - 1) & align_no) == 0);
+	ut_ad(ptr);
+	ut_ad(sizeof(void*) == sizeof(ulint));
+	return((void*) (bumped & ~(align_no - 1)));
+}
+
+/*********************************************************//**
+Rounds a pointer down to the nearest address that is a multiple of
+align_no.
+@return	aligned pointer */
+UNIV_INLINE
+void*
+ut_align_down(
+/*==========*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no)	/*!< in: align by this number */
+{
+	const ulint	low_mask = align_no - 1;
+
+	ut_ad(align_no > 0);
+	ut_ad(((align_no - 1) & align_no) == 0);
+	ut_ad(ptr);
+	ut_ad(sizeof(void*) == sizeof(ulint));
+	return((void*) (((ulint) ptr) & ~low_mask));
+}
+
+/*********************************************************//**
+Computes how far a pointer lies above the nearest lower address that is
+a multiple of align_no.
+@return	distance from aligned pointer */
+UNIV_INLINE
+ulint
+ut_align_offset(
+/*============*/
+	const void*	ptr,		/*!< in: pointer */
+	ulint		align_no)	/*!< in: align by this number */
+{
+	const ulint	low_mask = align_no - 1;
+
+	ut_ad(align_no > 0);
+	ut_ad(((align_no - 1) & align_no) == 0);
+	ut_ad(ptr);
+	ut_ad(sizeof(void*) == sizeof(ulint));
+	return(((ulint) ptr) & low_mask);
+}
+
+/*****************************************************************//**
+Extracts bit n of a ulint; bit 0 is the least significant one.
+@return	TRUE if nth bit is 1; 0th bit is defined to be the least significant */
+UNIV_INLINE
+ibool
+ut_bit_get_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n)	/*!< in: nth bit requested */
+{
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+	ut_ad(n < 8 * sizeof(ulint));
+	return((a >> n) & 1);
+}
+
+/*****************************************************************//**
+Sets or clears bit n of a ulint, depending on whether val is nonzero.
+@return	the ulint with the bit set as requested */
+UNIV_INLINE
+ulint
+ut_bit_set_nth(
+/*===========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	n,	/*!< in: nth bit requested */
+	ibool	val)	/*!< in: value for the bit to set */
+{
+	ulint	bit;
+
+#if TRUE != 1
+# error "TRUE != 1"
+#endif
+	ut_ad(n < 8 * sizeof(ulint));
+	bit = (ulint) 1 << n;
+
+	return(val ? (a | bit) : (a & ~bit));
+}
diff --git a/storage/xtradb/include/ut0dbg.h b/storage/xtradb/include/ut0dbg.h
new file mode 100644
index 00000000000..78b525c38ab
--- /dev/null
+++ b/storage/xtradb/include/ut0dbg.h
@@ -0,0 +1,175 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file include/ut0dbg.h
+Debug utilities for Innobase
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#ifndef ut0dbg_h
+#define ut0dbg_h
+
+#include "univ.i"
+#include <stdlib.h>
+#include "os0thread.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+/** Test if an assertion fails.
+@param EXPR	assertion expression
+@return		nonzero if EXPR fails (evaluates to zero), zero if it holds */
+# define UT_DBG_FAIL(EXPR) UNIV_UNLIKELY(!((ulint)(EXPR)))
+#else
+/** This is used to eliminate compiler warnings */
+extern ulint	ut_dbg_zero;
+/** Test if an assertion fails.
+@param EXPR	assertion expression
+@return		nonzero if EXPR fails (evaluates to zero), zero if it holds */
+# define UT_DBG_FAIL(EXPR) !((ulint)(EXPR) + ut_dbg_zero)
+#endif
+
+/*************************************************************//**
+Report a failed assertion. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+	const char* expr,	/*!< in: the failed assertion */
+	const char* file,	/*!< in: source file containing the assertion */
+	ulint line);		/*!< in: line number of the assertion */
+
+#ifdef __NETWARE__
+/** Flag for ignoring further assertion failures.  This is set to TRUE
+when on NetWare there happens an InnoDB assertion failure or other
+fatal error condition that requires an immediate shutdown. */
+extern ibool	panic_shutdown;
+/* Abort the execution. */
+void ut_dbg_panic(void);
+# define UT_DBG_PANIC ut_dbg_panic()
+/* Stop threads in ut_a(). */
+# define UT_DBG_STOP	do {} while (0)	/* We do not do this on NetWare */
+#else /* __NETWARE__ */
+# if defined(__WIN__) || defined(__INTEL_COMPILER)
+#  undef UT_DBG_USE_ABORT
+# elif defined(__GNUC__) && (__GNUC__ > 2)
+#  define UT_DBG_USE_ABORT
+# endif
+
+# ifndef UT_DBG_USE_ABORT
+/** A null pointer that will be dereferenced to trigger a memory trap */
+extern ulint*	ut_dbg_null_ptr;
+# endif
+
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/** If this is set to TRUE by ut_dbg_assertion_failed(), all threads
+will stop at the next ut_a() or ut_ad(). */
+extern ibool	ut_dbg_stop_threads;
+
+/*************************************************************//**
+Stop a thread after assertion failure. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+	const char*	file,
+	ulint		line);
+# endif
+
+# ifdef UT_DBG_USE_ABORT
+/** Abort the execution. */
+#  define UT_DBG_PANIC abort()
+/** Stop threads (null operation) */
+#  define UT_DBG_STOP do {} while (0)
+# else /* UT_DBG_USE_ABORT */
+/** Abort the execution. */
+#  define UT_DBG_PANIC					\
+	if (*(ut_dbg_null_ptr)) ut_dbg_null_ptr = NULL
+/** Stop threads in ut_a(). */
+#  define UT_DBG_STOP do						\
+	if (UNIV_UNLIKELY(ut_dbg_stop_threads)) {		\
+		ut_dbg_stop_thread(__FILE__, (ulint) __LINE__);	\
+	} while (0)
+# endif /* UT_DBG_USE_ABORT */
+#endif /* __NETWARE__ */
+
+/** Abort execution if EXPR does not evaluate to nonzero.
+@param EXPR	assertion expression that should hold */
+#define ut_a(EXPR) do {						\
+	if (UT_DBG_FAIL(EXPR)) {				\
+		ut_dbg_assertion_failed(#EXPR,			\
+				__FILE__, (ulint) __LINE__);	\
+		UT_DBG_PANIC;					\
+	}							\
+	UT_DBG_STOP;						\
+} while (0)
+
+/** Abort execution. */
+#define ut_error do {						\
+	ut_dbg_assertion_failed(0, __FILE__, (ulint) __LINE__);	\
+	UT_DBG_PANIC;						\
+} while (0)
+
+#ifdef UNIV_DEBUG
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR)	ut_a(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)	do {EXPR;} while (0)
+#else
+/** Debug assertion. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_ad(EXPR)
+/** Debug statement. Does nothing unless UNIV_DEBUG is defined. */
+#define ut_d(EXPR)
+#endif
+
+/** Silence warnings about an unused variable by doing a null assignment.
+@param A	the unused variable */
+#define UT_NOT_USED(A)	A = A
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+/** structure used for recording usage statistics */
+typedef struct speedo_struct {
+	struct rusage	ru;	/*!< getrusage() result */
+	struct timeval	tv;	/*!< gettimeofday() result */
+} speedo_t;
+
+/*******************************************************************//**
+Resets a speedo (records the current time in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+	speedo_t*	speedo);	/*!< out: speedo */
+
+/*******************************************************************//**
+Shows the time elapsed and usage statistics since the last reset of a
+speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+	const speedo_t*	speedo);	/*!< in: speedo */
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+
+#endif
diff --git a/storage/xtradb/include/ut0list.h b/storage/xtradb/include/ut0list.h
new file mode 100644
index 00000000000..ec67f4e2a0f
--- /dev/null
+++ b/storage/xtradb/include/ut0list.h
@@ -0,0 +1,172 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.h
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A double-linked list. This differs from the one in ut0lst.h in that in this
+one, each list node contains a pointer to the data, whereas the one in
+ut0lst.h uses a strategy where the list pointers are embedded in the data
+items themselves.
+
+Use this one when you need to store arbitrary data in the list where you
+can't embed the list pointers in the data, if a data item needs to be
+stored in multiple lists, etc.
+
+Note about the memory management: ib_list_t is a fixed-size struct whose
+allocation/deallocation is done through ib_list_create/ib_list_free, but the
+memory for the list nodes is allocated through a user-given memory heap,
+which can either be the same for all nodes or vary per node. Most users will
+probably want to create a memory heap to store the item-specific data, and
+pass in this same heap to the list node creation functions, thus
+automatically freeing the list node when the item's heap is freed.
+
+************************************************************************/
+
+#ifndef IB_LIST_H
+#define IB_LIST_H
+
+#include "mem0mem.h"
+
+typedef struct ib_list_struct ib_list_t;
+typedef struct ib_list_node_struct ib_list_node_t;
+typedef struct ib_list_helper_struct ib_list_helper_t;
+
+/****************************************************************//**
+Create a new list using mem_alloc. Lists created with this function must be
+freed with ib_list_free.
+@return	list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void);
+/*=================*/
+
+
+/****************************************************************//**
+Create a new list using the given heap. ib_list_free MUST NOT BE CALLED for
+lists created with this function.
+@return	list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+	mem_heap_t*	heap);	/*!< in: memory heap to use */
+
+/****************************************************************//**
+Free a list. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+	ib_list_t*	list);	/*!< in: list */
+
+/****************************************************************//**
+Add the data to the start of the list.
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+	ib_list_t*	list,	/*!< in: list */
+	void*		data,	/*!< in: data */
+	mem_heap_t*	heap);	/*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data to the end of the list.
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+	ib_list_t*	list,	/*!< in: list */
+	void*		data,	/*!< in: data */
+	mem_heap_t*	heap);	/*!< in: memory heap to use */
+
+/****************************************************************//**
+Add the data after the indicated node.
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+	ib_list_t*	list,		/*!< in: list */
+	ib_list_node_t*	prev_node,	/*!< in: node preceding new node (can
+					be NULL) */
+	void*		data,		/*!< in: data */
+	mem_heap_t*	heap);		/*!< in: memory heap to use */
+
+/****************************************************************//**
+Remove the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+	ib_list_t*	list,	/*!< in: list */
+	ib_list_node_t*	node);	/*!< in: node to remove */
+
+/****************************************************************//**
+Get the first node in the list.
+@return	first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+	ib_list_t*	list);	/*!< in: list */
+
+/****************************************************************//**
+Get the last node in the list.
+@return	last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+	ib_list_t*	list);	/*!< in: list */
+
+/** List base: tracks both ends of the list and how the list was allocated. */
+struct ib_list_struct {
+	ib_list_node_t*		first;		/*!< first node */
+	ib_list_node_t*		last;		/*!< last node */
+	ibool			is_heap_list;	/*!< TRUE if this list was
+						allocated through a heap */
+};
+
+/** A list node; it points to the user data instead of embedding it. */
+struct ib_list_node_struct {
+	ib_list_node_t*		prev;		/*!< previous node */
+	ib_list_node_t*		next;		/*!< next node */
+	void*			data;		/*!< user data */
+};
+
+/** Generic pairing of a data pointer with a memory heap. Quite often the
+only additional piece of data you need is the per-item memory heap, so this
+struct is available to use in those cases. */
+struct ib_list_helper_struct {
+	mem_heap_t*	heap;		/*!< memory heap */
+	void*		data;		/*!< user data */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0list.ic b/storage/xtradb/include/ut0list.ic
new file mode 100644
index 00000000000..eb5c62796e8
--- /dev/null
+++ b/storage/xtradb/include/ut0list.ic
@@ -0,0 +1,48 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0list.ic
+A double-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Fetch the node at the head of the list. @return	first node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_first(
+/*==============*/
+	ib_list_t*	list)	/*!< in: list */
+{
+	ib_list_node_t*	head = list->first;
+	return(head);
+}
+
+/****************************************************************//**
+Fetch the node at the tail of the list. @return	last node, or NULL */
+UNIV_INLINE
+ib_list_node_t*
+ib_list_get_last(
+/*=============*/
+	ib_list_t*	list)	/*!< in: list */
+{
+	ib_list_node_t*	tail = list->last;
+	return(tail);
+}
diff --git a/storage/xtradb/include/ut0lst.h b/storage/xtradb/include/ut0lst.h
new file mode 100644
index 00000000000..245dfc226c3
--- /dev/null
+++ b/storage/xtradb/include/ut0lst.h
@@ -0,0 +1,304 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0lst.h
+List utilities
+
+Created 9/10/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0lst_h
+#define ut0lst_h
+
+#include "univ.i"
+
+/* This module implements the two-way linear list which should be used
+if a list is used in the database. Note that a single struct may belong
+to two or more lists, provided that the lists are given different names.
+An example of the usage of the lists can be found in fil0fil.c. */
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which acts
+as the two-way list base node. The base node contains pointers
+to both ends of the list and a count of nodes in the list (excluding
+the base node from the count).
+@param TYPE	the name of the list node data type */
+#define UT_LIST_BASE_NODE_T(TYPE)\
+struct {\
+	ulint	count;	/*!< count of nodes in list */\
+	TYPE *	start;	/*!< pointer to list start, NULL if empty */\
+	TYPE *	end;	/*!< pointer to list end, NULL if empty */\
+}\
+
+/*******************************************************************//**
+This macro expands to the unnamed type definition of a struct which
+should be embedded in the nodes of the list, the node type must be a struct.
+This struct contains the pointers to next and previous nodes in the list.
+The name of the field in the node struct should be the name given
+to the list.
+@param TYPE	the list node type name */
+/* Example:
+typedef struct LRU_node_struct	LRU_node_t;
+struct LRU_node_struct {
+	UT_LIST_NODE_T(LRU_node_t)	LRU_list;
+	...
+}
+The example implements an LRU list of name LRU_list. Its nodes are of type
+LRU_node_t. */
+
+#define UT_LIST_NODE_T(TYPE)\
+struct {\
+	TYPE *	prev;	/*!< pointer to the previous node,\
+			NULL if start of list */\
+	TYPE *	next;	/*!< pointer to next node, NULL if end of list */\
+}\
+
+/*******************************************************************//**
+Initializes the base node of a two-way list.
+@param BASE	the list base node
+*/
+#define UT_LIST_INIT(BASE)\
+{\
+	(BASE).count = 0;\
+	(BASE).start = NULL;\
+	(BASE).end   = NULL;\
+}\
+
+/*******************************************************************//**
+Adds the node as the first element in a two-way linked list.
+@param NAME	list name
+@param BASE	the base node (not a pointer to it)
+@param N	pointer to the node to be added to the list.
+*/
+#define UT_LIST_ADD_FIRST(NAME, BASE, N)\
+{\
+	ut_ad(N);\
+	((BASE).count)++;\
+	((N)->NAME).next = (BASE).start;\
+	((N)->NAME).prev = NULL;\
+	if (UNIV_LIKELY((BASE).start != NULL)) {\
+		ut_ad((BASE).start != (N));\
+		(((BASE).start)->NAME).prev = (N);\
+	}\
+	(BASE).start = (N);\
+	if (UNIV_UNLIKELY((BASE).end == NULL)) {\
+		(BASE).end = (N);\
+	}\
+}\
+
+/*******************************************************************//**
+Adds the node as the last element in a two-way linked list.
+@param NAME	list name
+@param BASE	the base node (not a pointer to it)
+@param N	pointer to the node to be added to the list
+*/
+#define UT_LIST_ADD_LAST(NAME, BASE, N)\
+{\
+	ut_ad(N);\
+	((BASE).count)++;\
+	((N)->NAME).prev = (BASE).end;\
+	((N)->NAME).next = NULL;\
+	if ((BASE).end != NULL) {\
+		ut_ad((BASE).end != (N));\
+		(((BASE).end)->NAME).next = (N);\
+	}\
+	(BASE).end = (N);\
+	if ((BASE).start == NULL) {\
+		(BASE).start = (N);\
+	}\
+}\
+
+/*******************************************************************//**
+Inserts a NODE2 after NODE1 in a list.
+@param NAME	list name
+@param BASE	the base node (not a pointer to it)
+@param NODE1	pointer to node after which NODE2 is inserted
+@param NODE2	pointer to node being inserted after NODE1
+*/
+#define UT_LIST_INSERT_AFTER(NAME, BASE, NODE1, NODE2)\
+{\
+	ut_ad(NODE1);\
+	ut_ad(NODE2);\
+	ut_ad((NODE1) != (NODE2));\
+	((BASE).count)++;\
+	((NODE2)->NAME).prev = (NODE1);\
+	((NODE2)->NAME).next = ((NODE1)->NAME).next;\
+	if (((NODE1)->NAME).next != NULL) {\
+		((((NODE1)->NAME).next)->NAME).prev = (NODE2);\
+	}\
+	((NODE1)->NAME).next = (NODE2);\
+	if ((BASE).end == (NODE1)) {\
+		(BASE).end = (NODE2);\
+	}\
+}\
+
+#ifdef UNIV_LIST_DEBUG
+/** Invalidate the pointers in a list node.
+@param NAME	list name
+@param N	pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N)		\
+((N)->NAME.prev = (N)->NAME.next = (void*) -1)
+#else
+/** Invalidate the pointers in a list node.
+@param NAME	list name
+@param N	pointer to the node that was removed */
+# define UT_LIST_REMOVE_CLEAR(NAME, N) do {} while (0)
+#endif
+
+/*******************************************************************//**
+Removes a node from a two-way linked list.
+@param NAME	list name
+@param BASE	the base node (not a pointer to it)
+@param N	pointer to the node to be removed from the list
+*/
+#define UT_LIST_REMOVE(NAME, BASE, N)					\
+do {									\
+	ut_ad(N);							\
+	ut_a((BASE).count > 0);						\
+	((BASE).count)--;						\
+	if (((N)->NAME).next != NULL) {					\
+		((((N)->NAME).next)->NAME).prev = ((N)->NAME).prev;	\
+	} else {							\
+		(BASE).end = ((N)->NAME).prev;				\
+	}								\
+	if (((N)->NAME).prev != NULL) {					\
+		((((N)->NAME).prev)->NAME).next = ((N)->NAME).next;	\
+	} else {							\
+		(BASE).start = ((N)->NAME).next;			\
+	}								\
+	UT_LIST_REMOVE_CLEAR(NAME, N);					\
+} while (0)
+
+/********************************************************************//**
+Gets the next node in a two-way list.
+@param NAME	list name
+@param N	pointer to a node
+@return		the successor of N in NAME, or NULL */
+#define UT_LIST_GET_NEXT(NAME, N)\
+	(((N)->NAME).next)
+
+/********************************************************************//**
+Gets the previous node in a two-way list.
+@param NAME	list name
+@param N	pointer to a node
+@return		the predecessor of N in NAME, or NULL */
+#define UT_LIST_GET_PREV(NAME, N)\
+	(((N)->NAME).prev)
+
+/********************************************************************//**
+Alternative macro to get the number of nodes in a two-way list, i.e.,
+its length.
+@param BASE	the base node (not a pointer to it).
+@return		the number of nodes in the list */
+#define UT_LIST_GET_LEN(BASE)\
+	(BASE).count
+
+/********************************************************************//**
+Gets the first node in a two-way list.
+@param BASE	the base node (not a pointer to it)
+@return		first node, or NULL if the list is empty */
+#define UT_LIST_GET_FIRST(BASE)\
+	(BASE).start
+
+/********************************************************************//**
+Gets the last node in a two-way list.
+@param BASE	the base node (not a pointer to it)
+@return		last node, or NULL if the list is empty */
+#define UT_LIST_GET_LAST(BASE)\
+	(BASE).end
+
+/********************************************************************//**
+Checks the consistency of a two-way list by walking it in both directions.
+@param NAME		the name of the list
+@param TYPE		node type
+@param BASE		base node (not a pointer to it)
+@param ASSERTION	check run on every visited node, ut_list_node_313 */
+#define UT_LIST_VALIDATE(NAME, TYPE, BASE, ASSERTION)			\
+do {									\
+	ulint	ut_list_i_313;						\
+	TYPE*	ut_list_node_313;					\
+	/* Forward pass: exactly (BASE).count nodes via next. */	\
+	ut_list_node_313 = (BASE).start;				\
+									\
+	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
+		ut_a(ut_list_node_313);					\
+		ASSERTION;						\
+		ut_ad((ut_list_node_313->NAME).next || !ut_list_i_313);	\
+		ut_list_node_313 = (ut_list_node_313->NAME).next;	\
+	}								\
+	/* After count steps the walk must have run off the end. */	\
+	ut_a(ut_list_node_313 == NULL);					\
+	/* Backward pass: the same check from the end via prev. */	\
+	ut_list_node_313 = (BASE).end;					\
+									\
+	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
+		ut_a(ut_list_node_313);					\
+		ASSERTION;						\
+		ut_ad((ut_list_node_313->NAME).prev || !ut_list_i_313);	\
+		ut_list_node_313 = (ut_list_node_313->NAME).prev;	\
+	}								\
+	/* Likewise, count steps back must reach NULL. */		\
+	ut_a(ut_list_node_313 == NULL);					\
+} while (0)
+
+/********************************************************************//**
+Relocates every pointer of a two-way list: each non-NULL start, end, prev
+and next pointer P becomes P + (P > FADDR ? FOFFSET : BOFFSET), in bytes.
+@param NAME, TYPE, BASE	list name, node type, base node (not a pointer)
+@param FADDR		address that each pointer is compared against
+@param FOFFSET, BOFFSET	byte offset if pointer > FADDR, otherwise */
+#define UT_LIST_OFFSET(NAME, TYPE, BASE, FADDR, FOFFSET, BOFFSET)	\
+do {									\
+	ulint	ut_list_i_313;						\
+	TYPE*	ut_list_node_313;					\
+	/* Shift the base node's own pointers first. */			\
+	if ((BASE).start)						\
+		(BASE).start = (void*)((byte*)((BASE).start)			\
+			+ (((void*)((BASE).start) > (void*)(FADDR)) ? (FOFFSET) : (BOFFSET)));\
+	if ((BASE).end)							\
+		(BASE).end   = (void*)((byte*)((BASE).end)			\
+			+ (((void*)((BASE).end) > (void*)(FADDR)) ? (FOFFSET) : (BOFFSET)));\
+	/* Walk forward from the shifted start, shifting each link. */	\
+	ut_list_node_313 = (BASE).start;				\
+									\
+	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
+		ut_a(ut_list_node_313);					\
+		if ((ut_list_node_313->NAME).prev)			\
+			(ut_list_node_313->NAME).prev = (void*)((byte*)((ut_list_node_313->NAME).prev)\
+				+ (((void*)((ut_list_node_313->NAME).prev) > (void*)(FADDR)) ? (FOFFSET) : (BOFFSET)));\
+		if ((ut_list_node_313->NAME).next)			\
+			(ut_list_node_313->NAME).next = (void*)((byte*)((ut_list_node_313->NAME).next)\
+				+ (((void*)((ut_list_node_313->NAME).next) > (void*)(FADDR)) ? (FOFFSET) : (BOFFSET)));\
+		ut_list_node_313 = (ut_list_node_313->NAME).next;	\
+	}								\
+									\
+	ut_a(ut_list_node_313 == NULL);					\
+	/* Sanity check: the backward walk must also cover count nodes. */\
+	ut_list_node_313 = (BASE).end;					\
+									\
+	for (ut_list_i_313 = (BASE).count; ut_list_i_313--; ) {		\
+		ut_a(ut_list_node_313);					\
+		ut_list_node_313 = (ut_list_node_313->NAME).prev;	\
+	}								\
+									\
+	ut_a(ut_list_node_313 == NULL);					\
+} while (0)
+
+#endif
+
diff --git a/storage/xtradb/include/ut0mem.h b/storage/xtradb/include/ut0mem.h
new file mode 100644
index 00000000000..f14606be966
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.h
@@ -0,0 +1,307 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.h
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#ifndef ut0mem_h
+#define ut0mem_h
+
+#include "univ.i"
+#include <string.h>
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc().  Does not count malloc()
+if srv_use_sys_malloc is set.  Protected by ut_list_mutex. */
+extern ulint		ut_total_allocated_memory;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+extern os_fast_mutex_t	ut_list_mutex;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for memcpy(3).  Copy memory area when the source and
+target are not overlapping.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return	dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memmove(3).  Copy memory area when the source and
+target are overlapping.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return	dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n);
+
+/** Wrapper for memcmp(3).  Compare memory areas.
+* @param str1	in: first memory block to compare
+* @param str2	in: second memory block to compare
+* @param n	in: number of bytes to compare
+* @return	negative, 0, or positive if str1 is smaller, equal,
+		or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n);
+
+/**********************************************************************//**
+Initializes the mem block list at database startup. */
+UNIV_INTERN
+void
+ut_mem_init(void);
+/*=============*/
+
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE.
+@return	own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+	ulint	n,			/*!< in: number of bytes to allocate */
+	ibool	set_to_zero,		/*!< in: TRUE if allocated memory
+					should be set to zero if
+					UNIV_SET_MEM_TO_ZERO is defined */
+	ibool	assert_on_error);	/*!< in: if TRUE, we crash mysqld if
+					the memory cannot be allocated */
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined.
+@return	own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+	ulint	n);	/*!< in: number of bytes to allocate */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Tests if malloc of n bytes would succeed. ut_malloc() asserts if memory runs
+out. It cannot be used if we want to return an error message. Prints to
+stderr a message if it fails.
+@return	TRUE if succeeded */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+	ulint	n);	/*!< in: try to allocate this many bytes */
+#endif /* !UNIV_HOTBACKUP */
+/**********************************************************************//**
+Frees a memory block allocated with ut_malloc. Freeing a NULL pointer is
+a nop. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+	void* ptr);  /*!< in, own: memory block */
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+       realloc()  changes the size of the memory block pointed to
+       by ptr to size bytes.  The contents will be  unchanged  to
+       the minimum of the old and new sizes; newly allocated mem-
+       ory will be uninitialized.  If ptr is NULL,  the	 call  is
+       equivalent  to malloc(size); if size is equal to zero, the
+       call is equivalent to free(ptr).	 Unless ptr is	NULL,  it
+       must  have  been	 returned by an earlier call to malloc(),
+       calloc() or realloc().
+
+RETURN VALUE
+       realloc() returns a pointer to the newly allocated memory,
+       which is suitably aligned for any kind of variable and may
+       be different from ptr, or NULL if the  request  fails.  If
+       size  was equal to 0, either NULL or a pointer suitable to
+       be passed to free() is returned.	 If realloc()  fails  the
+       original	 block	is  left  untouched  - it is not freed or
+       moved.
+@return	own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+	void*	ptr,	/*!< in: pointer to old block or NULL */
+	ulint	size);	/*!< in: desired size */
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void);
+/*=================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/** Wrapper for strcpy(3).  Copy a NUL-terminated string.
+* @param dest	in: copy to
+* @param sour	in: copy from
+* @return	dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour);
+
+/** Wrapper for strlen(3).  Determine the length of a NUL-terminated string.
+* @param str	in: string
+* @return	length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str);
+
+/** Wrapper for strcmp(3).  Compare NUL-terminated strings.
+* @param str1	in: first string to compare
+* @param str2	in: second string to compare
+* @return	negative, 0, or positive if str1 is smaller, equal,
+		or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2);
+
+/**********************************************************************//**
+Copies up to size - 1 characters from the NUL-terminated string src to
+dst, NUL-terminating the result. Returns strlen(src), so truncation
+occurred if the return value >= size.
+@return	strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+	char*		dst,	/*!< in: destination buffer */
+	const char*	src,	/*!< in: source buffer */
+	ulint		size);	/*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Like ut_strlcpy, but if src doesn't fit in dst completely, copies the last
+(size - 1) bytes of src, not the first.
+@return	strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+	char*		dst,	/*!< in: destination buffer */
+	const char*	src,	/*!< in: source buffer */
+	ulint		size);	/*!< in: size of destination buffer */
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)).
+@return	length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+	const char*	str,	/*!< in: null-terminated string */
+	char		q);	/*!< in: the quote character */
+
+/**********************************************************************//**
+Make a quoted copy of a NUL-terminated string.	Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_memcpyq().
+@return	pointer to end of dest */
+UNIV_INTERN
+char*
+ut_strcpyq(
+/*=======*/
+	char*		dest,	/*!< in: output buffer */
+	char		q,	/*!< in: the quote character */
+	const char*	src);	/*!< in: null-terminated string */
+
+/**********************************************************************//**
+Make a quoted copy of a fixed-length string.  Leading and trailing
+quotes will not be included; only embedded quotes will be escaped.
+See also ut_strlenq() and ut_strcpyq().
+@return	pointer to end of dest */
+UNIV_INTERN
+char*
+ut_memcpyq(
+/*=======*/
+	char*		dest,	/*!< in: output buffer */
+	char		q,	/*!< in: the quote character */
+	const char*	src,	/*!< in: string to be quoted */
+	ulint		len);	/*!< in: length of src */
+
+/**********************************************************************//**
+Return the number of times s2 occurs in s1. Overlapping instances of s2
+are only counted once.
+@return	the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+	const char*	s1,	/*!< in: string to search in */
+	const char*	s2);	/*!< in: string to search for */
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once.
+@return	own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+	const char*	str,	/*!< in: string to operate on */
+	const char*	s1,	/*!< in: string to replace */
+	const char*	s2);	/*!< in: string to replace s1 with */
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated hex string. The output is
+truncated if there is not enough space in "hex", make sure "hex_size" is at
+least (2 * raw_size + 1) if you do not want this to happen. Returns the
+actual number of characters written to "hex" (including the NUL).
+@return	number of chars written */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size);	/*!< in: "hex" size in bytes */
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string and escapes any quotes
+by doubling them. Returns the number of bytes that were written to "buf"
+(including the terminating NUL). If buf_size is too small then the
+trailing bytes from "str" are discarded.
+@return	number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size);	/*!< in: output buffer size
+					in bytes */
+
+#ifndef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0mem.ic b/storage/xtradb/include/ut0mem.ic
new file mode 100644
index 00000000000..f36c28f1989
--- /dev/null
+++ b/storage/xtradb/include/ut0mem.ic
@@ -0,0 +1,338 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0mem.ic
+Memory primitives
+
+Created 5/30/1994 Heikki Tuuri
+************************************************************************/
+
+#include "ut0byte.h"
+#include "mach0data.h"
+
+/** Wrapper for memcpy(3).  Copy n bytes from sour to dest; the source
+and target areas must not overlap.
+* @param dest	out: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return	dest */
+UNIV_INLINE
+void*
+ut_memcpy(void* dest, const void* sour, ulint n)
+{
+	return(memcpy(dest, sour, n));
+}
+
+/** Wrapper for memmove(3).  Copy n bytes from sour to dest; the source
+and target areas may overlap.
+* @param dest	out: copy to
+* @param sour	in: copy from
+* @param n	in: number of bytes to copy
+* @return	dest */
+UNIV_INLINE
+void*
+ut_memmove(void* dest, const void* sour, ulint n)
+{
+	return(memmove(dest, sour, n));
+}
+
+/** Wrapper for memcmp(3).  Compare the first n bytes of two memory areas.
+* @param str1	in: first memory block to compare
+* @param str2	in: second memory block to compare
+* @param n	in: number of bytes to compare
+* @return	negative, 0, or positive if str1 is smaller, equal,
+		or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_memcmp(const void* str1, const void* str2, ulint n)
+{
+	return(memcmp(str1, str2, n));
+}
+
+/** Wrapper for strcpy(3).  Copy a NUL-terminated string; no bounds check.
+* @param dest	out: copy to; must have room for sour and its NUL
+* @param sour	in: NUL-terminated string to copy from
+* @return	dest */
+UNIV_INLINE
+char*
+ut_strcpy(char* dest, const char* sour)
+{
+	return(strcpy(dest, sour));
+}
+
+/** Wrapper for strlen(3).  Determine the length of a NUL-terminated string.
+* @param str	in: NUL-terminated string; must not be NULL
+* @return	length of the string in bytes, excluding the terminating NUL */
+UNIV_INLINE
+ulint
+ut_strlen(const char* str)
+{
+	return(strlen(str));
+}
+
+/** Wrapper for strcmp(3).  Compare NUL-terminated strings.
+* @param str1	in: first NUL-terminated string to compare
+* @param str2	in: second NUL-terminated string to compare
+* @return	negative, 0, or positive if str1 is smaller, equal,
+		or greater than str2, respectively. */
+UNIV_INLINE
+int
+ut_strcmp(const char* str1, const char* str2)
+{
+	return(strcmp(str1, str2));
+}
+
+/**********************************************************************//**
+Compute strlen(ut_strcpyq(str, q)): every occurrence of q counts twice.
+@return	length of the string when quoted */
+UNIV_INLINE
+ulint
+ut_strlenq(
+/*=======*/
+	const char*	str,	/*!< in: null-terminated string */
+	char		q)	/*!< in: the quote character */
+{
+	ulint len;
+
+	for (len = 0; *str; len++, str++) {
+		if (*str == q) {
+			len++;	/* count q a second time */
+		}
+	}
+
+	return(len);
+}
+
+/**********************************************************************//**
+Converts raw binary data to a NUL-terminated uppercase hex string. The output
+is truncated if there is not enough space in "hex"; make sure "hex_size" is
+at least (2 * raw_size + 1) if you do not want this to happen. Nothing is
+written if "hex_size" is 0.
+@return	number of chars written to "hex", including the NUL */
+UNIV_INLINE
+ulint
+ut_raw_to_hex(
+/*==========*/
+	const void*	raw,		/*!< in: raw data */
+	ulint		raw_size,	/*!< in: "raw" length in bytes */
+	char*		hex,		/*!< out: hex string */
+	ulint		hex_size)	/*!< in: "hex" size in bytes */
+{
+	/* MK_UINT16(a, b) packs two chars so that "a" comes first in memory. */
+#ifdef WORDS_BIGENDIAN
+
+#define MK_UINT16(a, b) (((uint16) (a)) << 8 | (uint16) (b))
+
+#define UINT16_GET_A(u)	((unsigned char) ((u) >> 8))
+#define UINT16_GET_B(u)	((unsigned char) ((u) & 0xFF))
+
+#else /* WORDS_BIGENDIAN */
+
+#define MK_UINT16(a, b) (((uint16) (b)) << 8 | (uint16) (a))
+
+#define UINT16_GET_A(u)	((unsigned char) ((u) & 0xFF))
+#define UINT16_GET_B(u)	((unsigned char) ((u) >> 8))
+
+#endif /* WORDS_BIGENDIAN */
+	/* The 16 two-digit entries whose first hex digit is "a". */
+#define MK_ALL_UINT16_WITH_A(a)	\
+	MK_UINT16(a, '0'),	\
+	MK_UINT16(a, '1'),	\
+	MK_UINT16(a, '2'),	\
+	MK_UINT16(a, '3'),	\
+	MK_UINT16(a, '4'),	\
+	MK_UINT16(a, '5'),	\
+	MK_UINT16(a, '6'),	\
+	MK_UINT16(a, '7'),	\
+	MK_UINT16(a, '8'),	\
+	MK_UINT16(a, '9'),	\
+	MK_UINT16(a, 'A'),	\
+	MK_UINT16(a, 'B'),	\
+	MK_UINT16(a, 'C'),	\
+	MK_UINT16(a, 'D'),	\
+	MK_UINT16(a, 'E'),	\
+	MK_UINT16(a, 'F')
+	/* hex_map[b] holds the two hex digits of the byte value b. */
+	static const uint16	hex_map[256] = {
+		MK_ALL_UINT16_WITH_A('0'),
+		MK_ALL_UINT16_WITH_A('1'),
+		MK_ALL_UINT16_WITH_A('2'),
+		MK_ALL_UINT16_WITH_A('3'),
+		MK_ALL_UINT16_WITH_A('4'),
+		MK_ALL_UINT16_WITH_A('5'),
+		MK_ALL_UINT16_WITH_A('6'),
+		MK_ALL_UINT16_WITH_A('7'),
+		MK_ALL_UINT16_WITH_A('8'),
+		MK_ALL_UINT16_WITH_A('9'),
+		MK_ALL_UINT16_WITH_A('A'),
+		MK_ALL_UINT16_WITH_A('B'),
+		MK_ALL_UINT16_WITH_A('C'),
+		MK_ALL_UINT16_WITH_A('D'),
+		MK_ALL_UINT16_WITH_A('E'),
+		MK_ALL_UINT16_WITH_A('F')
+	};
+	const unsigned char*	rawc;
+	ulint			read_bytes;
+	ulint			write_bytes;
+	ulint			i;
+
+	rawc = (const unsigned char*) raw;
+
+	if (hex_size == 0) {
+
+		return(0);
+	}
+
+	if (hex_size <= 2 * raw_size) {
+		/* Truncate: the full output needs 2 * raw_size + 1 bytes. */
+		read_bytes = hex_size / 2;
+		write_bytes = hex_size;
+	} else {
+
+		read_bytes = raw_size;
+		write_bytes = 2 * raw_size + 1;
+	}
+
+#define LOOP_READ_BYTES(ASSIGN)			\
+	for (i = 0; i < read_bytes; i++) {	\
+		ASSIGN;				\
+		hex += 2;			\
+		rawc++;				\
+	}
+	/* One 16-bit store per input byte if ut_align_offset() reports 0. */
+	if (ut_align_offset(hex, 2) == 0) {
+
+		LOOP_READ_BYTES(
+			*(uint16*) hex = hex_map[*rawc]
+		);
+	} else {
+
+		LOOP_READ_BYTES(
+			*hex       = UINT16_GET_A(hex_map[*rawc]);
+			*(hex + 1) = UINT16_GET_B(hex_map[*rawc])
+		);
+	}
+	/* Truncated with an even hex_size: the NUL replaces the last digit. */
+	if (hex_size <= 2 * raw_size && hex_size % 2 == 0) {
+
+		hex--;
+	}
+
+	*hex = '\0';
+
+	return(write_bytes);
+}
+
+/*******************************************************************//**
+Adds single quotes to the start and end of string, doubles any quotes and
+backslashes and writes NUL bytes as \0. Returns the number of bytes written
+to "buf" (including the terminating NUL). If buf_size is too small then
+the trailing bytes from "str" are discarded.
+@return	number of bytes that were written */
+UNIV_INLINE
+ulint
+ut_str_sql_format(
+/*==============*/
+	const char*	str,		/*!< in: string */
+	ulint		str_len,	/*!< in: string length in bytes */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size)	/*!< in: output buffer size
+					in bytes */
+{
+	ulint	str_i;
+	ulint	buf_i;
+
+	buf_i = 0;
+
+	switch (buf_size) {
+	case 3:
+		/* Room for '' and NUL only: quote just the empty string. */
+		if (str_len == 0) {
+
+			buf[buf_i] = '\'';
+			buf_i++;
+			buf[buf_i] = '\'';
+			buf_i++;
+		}
+		/* FALLTHROUGH */
+	case 2:
+	case 1:
+		/* Nothing else fits: terminate what was written so far. */
+		buf[buf_i] = '\0';
+		buf_i++;
+		/* FALLTHROUGH */
+	case 0:
+
+		return(buf_i);
+	}
+
+	/* buf_size >= 4 */
+
+	buf[0] = '\'';
+	buf_i = 1;
+
+	for (str_i = 0; str_i < str_len; str_i++) {
+
+		char	ch;
+		/* Stop while the closing quote and the NUL still fit. */
+		if (buf_size - buf_i == 2) {
+
+			break;
+		}
+
+		ch = str[str_i];
+
+		switch (ch) {
+		case '\0':
+			/* NUL becomes \0: 2 bytes here + 2 trailing bytes. */
+			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+				goto func_exit;
+			}
+			buf[buf_i] = '\\';
+			buf_i++;
+			buf[buf_i] = '0';
+			buf_i++;
+			break;
+		case '\'':
+		case '\\':
+			/* Doubled: 2 bytes here + 2 trailing bytes. */
+			if (UNIV_UNLIKELY(buf_size - buf_i < 4)) {
+
+				goto func_exit;
+			}
+			buf[buf_i] = ch;
+			buf_i++;
+			/* FALLTHROUGH */
+		default:
+
+			buf[buf_i] = ch;
+			buf_i++;
+		}
+	}
+
+func_exit:
+
+	buf[buf_i] = '\'';
+	buf_i++;
+	buf[buf_i] = '\0';
+	buf_i++;
+
+	return(buf_i);
+}
diff --git a/storage/xtradb/include/ut0rbt.h b/storage/xtradb/include/ut0rbt.h
new file mode 100644
index 00000000000..6fd050acfe7
--- /dev/null
+++ b/storage/xtradb/include/ut0rbt.h
@@ -0,0 +1,309 @@
+/*****************************************************************************
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0rbt.h
+Red-Black tree implementation.
+
+Created 2007-03-20 Sunny Bains
+************************************************************************/
+
+#ifndef INNOBASE_UT0RBT_H
+#define INNOBASE_UT0RBT_H
+
+#if !defined(IB_RBT_TESTING)
+#include "univ.i"
+#include "ut0mem.h"
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define	ut_malloc	malloc
+#define	ut_free		free
+#define	ulint		unsigned long
+#define	ut_a(c)		assert(c)
+#define ut_error	assert(0)
+#define	ibool		unsigned int
+#define	TRUE		1
+#define	FALSE		0
+#endif
+
+/* Red black tree typedefs */
+typedef struct ib_rbt_struct ib_rbt_t;
+typedef struct ib_rbt_node_struct ib_rbt_node_t;
+/* FIXME: Iterator is a better name than _bound_ */
+typedef struct ib_rbt_bound_struct ib_rbt_bound_t;
+typedef void (*ib_rbt_print_node)(const ib_rbt_node_t* node);
+typedef int (*ib_rbt_compare)(const void* p1, const void* p2);
+
+/* Node colors of the red black tree, see ib_rbt_node_struct::color */
+enum ib_rbt_color_enum {
+	IB_RBT_RED,
+	IB_RBT_BLACK
+};
+
+typedef enum ib_rbt_color_enum ib_rbt_color_t;
+
+/* Red black tree node; rbt_value() casts "value" to the stored data type */
+struct ib_rbt_node_struct {
+	ib_rbt_color_t	color;			/* color of this node */
+
+	ib_rbt_node_t*	left;			/* points to left child */
+	ib_rbt_node_t*	right;			/* points to right child */
+	ib_rbt_node_t*	parent;			/* points to parent node */
+
+	char		value[1];		/* Start of the data value */
+};
+
+/* Red black tree instance. */
+struct	ib_rbt_struct {
+	ib_rbt_node_t*	nil;			/* Black colored node that is
+						used as a sentinel. This is
+						pre-allocated too. */
+
+	ib_rbt_node_t*	root;			/* Root of the tree, this is
+						pre-allocated and the first
+						data node is the left child. */
+
+	ulint		n_nodes;		/* Total number of data nodes */
+
+	ib_rbt_compare	compare;		/* Fn. to use for comparison */
+	ulint		sizeof_value;		/* Size of a value in bytes */
+};
+
+/* The result of searching for a key in the tree; this allows a speedy
+lookup followed by an insert if the key doesn't exist. */
+struct ib_rbt_bound_struct {
+	const ib_rbt_node_t*
+			last;			/* Last node visited */
+
+	int		result;			/* Result of comparing with
+						the last non-nil node that
+						was visited */
+};
+
+/* Size in elements (t is a pointer to an rb tree instance) */
+#define rbt_size(t)	((t)->n_nodes)
+
+/* Check whether the rb tree is empty (t is a pointer to an rb tree) */
+#define rbt_empty(t)	(rbt_size(t) == 0)
+
+/* Get data value (t is the data type, n is a pointer to an rb tree node) */
+#define rbt_value(t, n) ((t*) &(n)->value[0])
+
+/* Compare a key with the node value (t is tree, k is key, n is node) */
+#define rbt_compare(t, k, n) ((t)->compare((k), (n)->value))
+
+/****************************************************************//**
+Free an instance of a red black tree */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+	ib_rbt_t*	tree);		/*!< in: rb tree to free */
+/****************************************************************//**
+Create an instance of a red black tree
+@return	rb tree instance */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+	size_t		sizeof_value,	/*!< in: size in bytes */
+	ib_rbt_compare	compare);	/*!< in: comparator */
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key);		/*!< in: key to delete */
+/****************************************************************//**
+Remove a node from the rb tree, the node is not free'd, that is the
+callers responsibility.
+@return	the removed node, with the const qualifier dropped */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*
+			node);		/*!< in: node to delete, this
+					is a fudge and declared const
+					because the caller has access
+					only to const nodes.*/
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return	node if found else return NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree to search */
+	const void*	key);		/*!< in: key to lookup */
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return	inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key,		/*!< in: key for ordering */
+	const void*	value);		/*!< in: data that will be
+					copied to the node.*/
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return	appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< in: parent */
+	const void*	value);		/*!< in: this value is copied
+					to the node */
+/****************************************************************//**
+Return the left most data node in the tree
+@return	left most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+	const ib_rbt_t*	tree);		/*!< in: rb tree */
+/****************************************************************//**
+Return the right most data node in the tree
+@return	right most node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+	const ib_rbt_t*	tree);		/*!< in: rb tree */
+/****************************************************************//**
+Return the next node from current.
+@return	successor node to current that is passed in. */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*		/*!< in: current node */
+			current);
+/****************************************************************//**
+Return the prev node from current.
+@return	predecessor node to current that is passed in */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*		/*!< in: current node */
+			current);
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return	node that satisfies the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key);		/*!< in: key to search */
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return	node that satisfies the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key);		/*!< in: key to search */
+/****************************************************************//**
+Search for the key, a node will be returned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return	result of last comparison */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< in: search bounds */
+	const void*	key);		/*!< in: key to search */
+/****************************************************************//**
+Search for the key, a node will be returned in parent.last, whether it
+was found or not. If not found then parent.last will contain the
+parent node for the possibly new key otherwise the matching node.
+@return	result of last comparison */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< in: search bounds */
+	const void*	key,		/*!< in: key to search */
+	ib_rbt_compare	compare);	/*!< in: comparator */
+/****************************************************************//**
+Clear the tree, deletes (and free's) all the nodes. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+	ib_rbt_t*	tree);		/*!< in: rb tree */
+/****************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+	ib_rbt_t*	dst,		/*!< in: dst rb tree */
+	const ib_rbt_t*	src);		/*!< in: src rb tree */
+/****************************************************************//**
+Merge the nodes from src into dst. Return the number of nodes merged.
+Delete the nodes from src after copying node to dst. As a side effect
+the duplicates will be left untouched in the src, since we don't support
+duplicates (yet). NOTE: src and dst must be similar, the function doesn't
+check for this condition (yet).
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+	ib_rbt_t*	dst,		/*!< in: dst rb tree */
+	ib_rbt_t*	src);		/*!< in: src rb tree */
+/****************************************************************//**
+Verify the integrity of the RB tree. For debugging. 0 failure else height
+of tree (in count of black nodes).
+@return	TRUE if OK FALSE if tree invalid. */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+	const ib_rbt_t*	tree);		/*!< in: tree to validate */
+/****************************************************************//**
+Iterate over the tree in depth first order. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+	const ib_rbt_t*		tree,	/*!< in: tree to traverse */
+	ib_rbt_print_node	print);	/*!< in: print function */
+
+#endif /* INNOBASE_UT0RBT_H */
diff --git a/storage/xtradb/include/ut0rnd.h b/storage/xtradb/include/ut0rnd.h
new file mode 100644
index 00000000000..ad55df40abc
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.h
@@ -0,0 +1,150 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0rnd.h
+Random numbers and hashing
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0rnd_h
+#define ut0rnd_h
+
+#include "univ.i"
+
+#include "ut0byte.h"
+
+/** The 'character code' for end of field or string (used
+in folding records) */
+#define UT_END_OF_FIELD		257
+
+/********************************************************//**
+This is used to set the random number seed. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+	ulint	 seed);		 /*!< in: seed */
+/********************************************************//**
+The following function generates a series of 'random' ulint integers.
+@return	the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+	ulint	rnd);	/*!< in: the previous random number value */
+/*********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space (let there be N of them) of ulint integers
+in a pseudo-random fashion. Note that the same integer is repeated
+always after N calls to the generator.
+@return	the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void);
+/*==================*/
+/********************************************************//**
+Generates a random integer from a given interval.
+@return	the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+	ulint	low,	/*!< in: low limit; can generate also this value */
+	ulint	high);	/*!< in: high limit; can generate also this value */
+/*********************************************************//**
+Generates a random ibool value.
+@return	the random value */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void);
+/*=================*/
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime or some
+random number to work reliably.
+@return	hash value */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+	ulint	 key,		/*!< in: value to be hashed */
+	ulint	 table_size);	/*!< in: hash table size */
+/*************************************************************//**
+Folds a pair of ulints.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+	__attribute__((const));
+/*************************************************************//**
+Folds a dulint.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+	dulint	d)	/*!< in: dulint */
+	__attribute__((const));
+/*************************************************************//**
+Folds a character string ending in the null character.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+	const char*	str)	/*!< in: null-terminated string */
+	__attribute__((pure));
+/*************************************************************//**
+Folds a binary string.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+	__attribute__((pure));
+UNIV_INLINE
+ulint
+ut_fold_binary_32(
+/*==============*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length */
+	__attribute__((pure));
+/***********************************************************//**
+Looks for a prime number slightly greater than the given argument.
+The prime is chosen so that it is not near any power of 2.
+@return	prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+	ulint	n)	/*!< in: positive number > 100 */
+	__attribute__((const));
+
+
+#ifndef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0rnd.ic b/storage/xtradb/include/ut0rnd.ic
new file mode 100644
index 00000000000..c2043660efd
--- /dev/null
+++ b/storage/xtradb/include/ut0rnd.ic
@@ -0,0 +1,256 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0rnd.ic
+Random numbers and hashing
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+#define UT_HASH_RANDOM_MASK	1463735687	/* xor mask, fold */
+#define UT_HASH_RANDOM_MASK2	1653893711	/* xor mask, hash and fold */
+#define UT_RND1			151117737	/* multiplier */
+#define UT_RND2			119785373	/* multiplier and increment */
+#define UT_RND3			 85689495	/* multiplier */
+#define UT_RND4			 76595339	/* not used in this file */
+#define UT_SUM_RND2		 98781234	/* increment */
+#define UT_SUM_RND3		126792457	/* increment */
+#define UT_SUM_RND4		 63498502	/* increment */
+#define UT_XOR_RND1		187678878	/* xor mask */
+#define UT_XOR_RND2		143537923	/* xor mask */
+
+/** Seed value of ut_rnd_gen_ulint() */
+extern	ulint	 ut_rnd_ulint_counter;
+
+/********************************************************//**
+Sets the random number seed by overwriting ut_rnd_ulint_counter. */
+UNIV_INLINE
+void
+ut_rnd_set_seed(
+/*============*/
+	ulint	 seed)		 /*!< in: seed */
+{
+	ut_rnd_ulint_counter = seed;
+}
+
+/********************************************************//**
+Generates the next 'random' ulint of a series from the previous value only.
+@return	the next 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_next_ulint(
+/*==================*/
+	ulint	rnd)	/*!< in: the previous random number value */
+{
+	ulint	n_bits;
+
+	n_bits = 8 * sizeof(ulint);
+	/* Multiply-add rounds with xor and 20-bit rotate-left in between. */
+	rnd = UT_RND2 * rnd + UT_SUM_RND3;
+	rnd = UT_XOR_RND1 ^ rnd;
+	rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+	rnd = UT_RND3 * rnd + UT_SUM_RND4;
+	rnd = UT_XOR_RND2 ^ rnd;
+	rnd = (rnd << 20) + (rnd >> (n_bits - 20));
+	rnd = UT_RND1 * rnd + UT_SUM_RND2;
+
+	return(rnd);
+}
+
+/********************************************************//**
+The following function generates 'random' ulint integers which
+enumerate the value space of ulint integers in a pseudo random
+fashion. Note that the same integer is repeated always after
+2 to power 32 calls to the generator (if ulint is 32-bit).
+@return	the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_gen_ulint(void)
+/*==================*/
+{
+	ulint	rnd;
+
+	/* Step the global seed first, so that consecutive calls start
+	from different values; the result is then derived from the new
+	seed by ut_rnd_gen_next_ulint(). */
+	ut_rnd_ulint_counter = UT_RND1 * ut_rnd_ulint_counter + UT_RND2;
+
+	rnd = ut_rnd_gen_next_ulint(ut_rnd_ulint_counter);
+
+	return(rnd);
+}
+
+/********************************************************//**
+Generates a random integer from the closed interval [low, high].
+@return	the 'random' number */
+UNIV_INLINE
+ulint
+ut_rnd_interval(
+/*============*/
+	ulint	low,	/*!< in: low limit; can generate also this value */
+	ulint	high)	/*!< in: high limit; can generate also this value */
+{
+	ulint	rnd;
+	ulint	span;
+
+	ut_ad(high >= low);
+
+	if (low == high) {
+		return(low);
+	}
+
+	rnd = ut_rnd_gen_ulint();
+	span = high - low + 1;	/* 0 if the interval covers all of ulint */
+	return(span ? low + (rnd % span) : rnd);	/* avoid rnd % 0 */
+}
+
+/*********************************************************//**
+Generates a random ibool value from one call to ut_rnd_gen_ulint().
+@return	the random value, TRUE or FALSE */
+UNIV_INLINE
+ibool
+ut_rnd_gen_ibool(void)
+/*=================*/
+{
+	ulint	 x;
+
+	x = ut_rnd_gen_ulint();
+	/* Decide on the lowest bit of the sum of two shifted copies of x. */
+	if (((x >> 20) + (x >> 15)) & 1) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************//**
+The following function generates a hash value for a ulint integer
+to a hash table of size table_size, which should be a prime
+or some random number for the hash table to work reliably.
+@return	hash value, always less than table_size */
+UNIV_INLINE
+ulint
+ut_hash_ulint(
+/*==========*/
+	ulint	 key,		/*!< in: value to be hashed */
+	ulint	 table_size)	/*!< in: hash table size, must be > 0 */
+{
+	ut_ad(table_size);
+	key = key ^ UT_HASH_RANDOM_MASK2;
+
+	return(key % table_size);
+}
+
+/*************************************************************//**
+Folds a pair of ulints into a single ulint using xor, shift and add.
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_ulint_pair(
+/*===============*/
+	ulint	n1,	/*!< in: ulint */
+	ulint	n2)	/*!< in: ulint */
+{
+	return(((((n1 ^ n2 ^ UT_HASH_RANDOM_MASK2) << 8) + n1)
+		^ UT_HASH_RANDOM_MASK) + n2);
+}
+
+/*************************************************************//**
+Folds a dulint: its low and high parts go to ut_fold_ulint_pair().
+@return	folded value */
+UNIV_INLINE
+ulint
+ut_fold_dulint(
+/*===========*/
+	dulint	d)	/*!< in: dulint */
+{
+	return(ut_fold_ulint_pair(ut_dulint_get_low(d),
+				  ut_dulint_get_high(d)));
+}
+
+/*************************************************************//**
+Folds a character string ending in the null character, one char at a time.
+@return	folded value; 0 for the empty string */
+UNIV_INLINE
+ulint
+ut_fold_string(
+/*===========*/
+	const char*	str)	/*!< in: null-terminated string */
+{
+	ulint	fold = 0;
+
+	ut_ad(str);
+	/* NOTE(review): plain char is sign-extended if char is signed. */
+	while (*str != '\0') {
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+		str++;
+	}
+
+	return(fold);
+}
+
+/*************************************************************//**
+Folds a binary string, one byte at a time.
+@return	folded value; 0 if len is 0 */
+UNIV_INLINE
+ulint
+ut_fold_binary(
+/*===========*/
+	const byte*	str,	/*!< in: string of bytes */
+	ulint		len)	/*!< in: length in bytes */
+{
+	const byte*	str_end	= str + len;
+	ulint		fold = 0;
+
+	ut_ad(str || !len);
+
+	while (str < str_end) {
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str));
+
+		str++;
+	}
+
+	return(fold);
+}
+
+UNIV_INLINE
+ulint
+ut_fold_binary_32(
+/*==============*/
+	const byte*	str,	/*!< in: string of bytes, 4-byte aligned */
+	ulint		len)	/*!< in: length in bytes, multiple of 4 */
+{
+	const ib_uint32_t*	str_end = (const ib_uint32_t*) (str + len);
+	const ib_uint32_t*	str_32 = (const ib_uint32_t*) str;
+	ulint			fold = 0;
+	/* The input is folded one 32-bit word at a time. */
+	ut_ad(str);
+	/* This function is only for word-aligned data */
+	ut_ad(len % 4 == 0);
+	ut_ad((ulint)str % 4 == 0);
+
+	while (str_32 < str_end) {
+		fold = ut_fold_ulint_pair(fold, (ulint)(*str_32));
+
+		str_32++;
+	}
+
+	return(fold);
+}
diff --git a/storage/xtradb/include/ut0sort.h b/storage/xtradb/include/ut0sort.h
new file mode 100644
index 00000000000..5c6647dda9e
--- /dev/null
+++ b/storage/xtradb/include/ut0sort.h
@@ -0,0 +1,106 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0sort.h
+Sort utility
+
+Created 11/9/1995 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0sort_h
+#define ut0sort_h
+
+#include "univ.i"
+
+/* This module gives a macro definition of the body of
+a standard sort function for an array of elements of any
+type. The comparison function is given as a parameter to
+the macro. The sort algorithm is mergesort, whose worst-case
+running time is O(n log n).
+*/
+
+/*******************************************************************//**
+This macro expands to the body of a standard sort function.
+The sort function uses mergesort and must be defined separately
+for each type of array. SORT_FUN is the sort function name.
+The function takes the array to be sorted (ARR),
+the array of auxiliary space (AUX_ARR) of same size,
+and the low (LOW), inclusive, and high (HIGH), noninclusive,
+limits for the sort interval as arguments; LOW < HIGH is asserted.
+The macro arguments are expanded more than once, so they must be
+free of side effects.
+CMP_FUN is the comparison function name, defined individually for
+each array cell type. It takes two elements from the array and
+returns 1 if the first is bigger, 0 if equal, -1 if the second is bigger. */
+
+#define UT_SORT_FUNCTION_BODY(SORT_FUN, ARR, AUX_ARR, LOW, HIGH, CMP_FUN)\
+{\
+	ulint		ut_sort_mid77;\
+	ulint		ut_sort_i77;\
+	ulint		ut_sort_low77;\
+	ulint		ut_sort_high77;\
+\
+	ut_ad((LOW) < (HIGH));\
+	ut_ad(ARR);\
+	ut_ad(AUX_ARR);\
+\
+	if ((LOW) == (HIGH) - 1) {\
+		return;\
+	} else if ((LOW) == (HIGH) - 2) {\
+		if (CMP_FUN((ARR)[LOW], (ARR)[(HIGH) - 1]) > 0) {\
+			(AUX_ARR)[LOW] = (ARR)[LOW];\
+			(ARR)[LOW] = (ARR)[(HIGH) - 1];\
+			(ARR)[(HIGH) - 1] = (AUX_ARR)[LOW];\
+		}\
+		return;\
+	}\
+\
+	ut_sort_mid77 = ((LOW) + (HIGH)) / 2;\
+\
+	SORT_FUN((ARR), (AUX_ARR), (LOW), ut_sort_mid77);\
+	SORT_FUN((ARR), (AUX_ARR), ut_sort_mid77, (HIGH));\
+\
+	ut_sort_low77 = (LOW);\
+	ut_sort_high77 = ut_sort_mid77;\
+\
+	for (ut_sort_i77 = (LOW); ut_sort_i77 < (HIGH); ut_sort_i77++) {\
+\
+		if (ut_sort_low77 >= ut_sort_mid77) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+			ut_sort_high77++;\
+		} else if (ut_sort_high77 >= (HIGH)) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+			ut_sort_low77++;\
+		} else if (CMP_FUN((ARR)[ut_sort_low77],\
+				   (ARR)[ut_sort_high77]) > 0) {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_high77];\
+			ut_sort_high77++;\
+		} else {\
+			(AUX_ARR)[ut_sort_i77] = (ARR)[ut_sort_low77];\
+			ut_sort_low77++;\
+		}\
+	}\
+\
+	memcpy((void*) ((ARR) + (LOW)), (AUX_ARR) + (LOW),\
+	       ((HIGH) - (LOW)) * sizeof *(ARR));\
+}\
+
+
+#endif
+
diff --git a/storage/xtradb/include/ut0ut.h b/storage/xtradb/include/ut0ut.h
new file mode 100644
index 00000000000..197b8401428
--- /dev/null
+++ b/storage/xtradb/include/ut0ut.h
@@ -0,0 +1,403 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/ut0ut.h
+Various utilities
+
+Created 1/20/1994 Heikki Tuuri
+***********************************************************************/
+
+#ifndef ut0ut_h
+#define ut0ut_h
+
+#include "univ.i"
+
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+#endif /* UNIV_HOTBACKUP */
+
+#include <time.h>
+#ifndef MYSQL_SERVER
+#include <ctype.h>
+#endif
+
+/** Index name prefix in fast index creation */
+#define	TEMP_INDEX_PREFIX	'\377'
+/** Index name prefix in fast index creation, as a string constant */
+#define TEMP_INDEX_PREFIX_STR	"\377"
+
+/** Time stamp */
+typedef time_t	ib_time_t;
+
+#ifndef UNIV_HOTBACKUP
+#if defined(HAVE_IB_PAUSE_INSTRUCTION)
+#  ifdef WIN32
+     /* In the Win32 API, the x86 PAUSE instruction is executed by calling
+     the YieldProcessor macro defined in WinNT.h. It is a CPU architecture-
+     independent way by using YieldProcessor.*/
+#    define UT_RELAX_CPU() YieldProcessor()
+#  else
+     /* According to the gcc info page, asm volatile means that the
+     instruction has important side-effects and must not be removed.
+     Also asm volatile may trigger a memory barrier (spilling all registers
+     to memory). */
+#    define UT_RELAX_CPU() __asm__ __volatile__ ("pause")
+#  endif
+#elif defined(HAVE_ATOMIC_BUILTINS)
+#  define UT_RELAX_CPU() do { \
+     volatile lint	volatile_var = 0; /* defined value for the CAS */ \
+     os_compare_and_swap_lint(&volatile_var, 0, 1); \
+   } while (0)
+#else
+#  define UT_RELAX_CPU() ((void)0) /* avoid warning for an empty statement */
+#endif
+
+/*********************************************************************//**
+Delays execution for at most max_wait_us microseconds or returns earlier
+if cond becomes true. cond must not refer to a variable named start_us.
+@param cond		in: condition to wait for; evaluated every 2 ms
+@param max_wait_us	in: maximum delay to wait, in microseconds */
+#define UT_WAIT_FOR(cond, max_wait_us)				\
+do {								\
+	ullint	start_us;					\
+	start_us = ut_time_us(NULL);				\
+	while (!(cond) 						\
+	       && ut_time_us(NULL) - start_us < (max_wait_us)) {\
+								\
+		os_thread_sleep(2000 /* 2 ms */);		\
+	}							\
+} while (0)
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Gets the high 32 bits in a ulint. That is makes a shift >> 32,
+but since there seem to be compiler bugs in both gcc and Visual C++,
+we do this by a special conversion.
+@return	a >> 32 */
+UNIV_INTERN
+ulint
+ut_get_high32(
+/*==========*/
+	ulint	a);	/*!< in: ulint */
+/******************************************************//**
+Calculates the minimum of two ulints.
+@return	minimum */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+	ulint	 n1,	/*!< in: first number */
+	ulint	 n2);	/*!< in: second number */
+/******************************************************//**
+Calculates the maximum of two ulints.
+@return	maximum */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+	ulint	 n1,	/*!< in: first number */
+	ulint	 n2);	/*!< in: second number */
+/****************************************************************//**
+Calculates minimum of two ulint-pairs. */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+	ulint*	a,	/*!< out: more significant part of minimum */
+	ulint*	b,	/*!< out: less significant part of minimum */
+	ulint	a1,	/*!< in: more significant part of first pair */
+	ulint	b1,	/*!< in: less significant part of first pair */
+	ulint	a2,	/*!< in: more significant part of second pair */
+	ulint	b2);	/*!< in: less significant part of second pair */
+/******************************************************//**
+Compares two ulints.
+@return	1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	b);	/*!< in: ulint */
+/*******************************************************//**
+Compares two pairs of ulints.
+@return	-1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+	ulint	a1,	/*!< in: more significant part of first pair */
+	ulint	a2,	/*!< in: less significant part of first pair */
+	ulint	b1,	/*!< in: more significant part of second pair */
+	ulint	b2);	/*!< in: less significant part of second pair */
+/*************************************************************//**
+Determines if a number is zero or a power of two.
+@param n	in: number
+@return		nonzero if n is zero or a power of two; zero otherwise */
+#define ut_is_2pow(n) UNIV_LIKELY(!((n) & ((n) - 1)))
+/*************************************************************//**
+Calculates fast the remainder of n/m when m is a power of two.
+@param n	in: numerator
+@param m	in: denominator, must be a power of two
+@return		the remainder of n/m */
+#define ut_2pow_remainder(n, m) ((n) & ((m) - 1))
+/*************************************************************//**
+Calculates the biggest multiple of m that is not bigger than n
+when m is a power of two.  In other words, rounds n down to m * k.
+@param n	in: number to round down
+@param m	in: alignment, must be a power of two
+@return		n rounded down to the biggest possible integer multiple of m */
+#define ut_2pow_round(n, m) ((n) & ~((m) - 1))
+/** Align a number down to a multiple of a power of two.
+@param n	in: number to round down
+@param m	in: alignment, must be a power of two
+@return		n rounded down to the biggest possible integer multiple of m */
+#define ut_calc_align_down(n, m) ut_2pow_round(n, m)
+/********************************************************//**
+Calculates the smallest multiple of m that is not smaller than n
+when m is a power of two.  In other words, rounds n up to m * k.
+@param n	in: number to round up
+@param m	in: alignment, must be a power of two
+@return		n rounded up to the smallest possible integer multiple of m */
+#define ut_calc_align(n, m) (((n) + ((m) - 1)) & ~((m) - 1))
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return	logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+	ulint	n);	/*!< in: number != 0 */
+/*************************************************************//**
+Calculates 2 to power n.
+@return	2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+	ulint	n);	/*!< in: number */
+/*************************************************************//**
+Calculates fast the number rounded up to the nearest power of 2.
+@return	first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+	ulint	n)	/*!< in: number != 0 */
+	__attribute__((const));
+
+/** Determine how many bytes (groups of 8 bits) are needed to
+store the given number of bits.
+@param b	in: bits
+@return		number of bytes (octets) needed to represent b */
+#define UT_BITS_IN_BYTES(b) (((b) + 7) / 8)
+
+/**********************************************************//**
+Returns system time. We do not specify the format of the time returned:
+the only way to manipulate it is to use the function ut_difftime.
+@return	system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void);
+/*=========*/
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Returns system time.
+Upon successful completion, the value 0 is returned; otherwise the
+value -1 is returned and the global variable errno is set to indicate the
+error.
+@return	0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+	ulint*	sec,	/*!< out: seconds since the Epoch */
+	ulint*	ms);	/*!< out: microseconds since the Epoch+*sec */
+
+/**********************************************************//**
+Returns the number of microseconds since epoch. Similar to
+time(3), the return value is also stored in *tloc, provided
+that tloc is non-NULL.
+@return	us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+	ullint*	tloc);	/*!< out: us since epoch, if non-NULL */
+/**********************************************************//**
+Returns the number of milliseconds since some epoch.  The
+value may wrap around.  It should only be used for heuristic
+purposes.
+@return	ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void);
+/*============*/
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Returns the difference of two times in seconds.
+@return	time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+	ib_time_t	time2,	/*!< in: time */
+	ib_time_t	time1);	/*!< in: time */
+/**********************************************************//**
+Prints a timestamp to a file. */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+	FILE*  file); /*!< in: file where to print */
+/**********************************************************//**
+Sprintfs a timestamp to a buffer, 13..14 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+	char*	buf); /*!< in: buffer where to sprintf */
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces and with ':' characters
+replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+	char*	buf); /*!< in: buffer where to sprintf */
+/**********************************************************//**
+Returns current year, month, day. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+	ulint*	year,	/*!< out: current year */
+	ulint*	month,	/*!< out: month */
+	ulint*	day);	/*!< out: day */
+#else /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Runs an idle loop on CPU. The argument gives the desired delay
+in microseconds on 100 MHz Pentium + Visual C++.
+@return	dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+	ulint	delay);	/*!< in: delay in microseconds on 100 MHz Pentium */
+#endif /* UNIV_HOTBACKUP */
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+	FILE*		file,	/*!< in: file where to print */
+	const void*	buf,	/*!< in: memory buffer */
+	ulint		len);	/*!< in: length of the buffer */
+
+/**********************************************************************//**
+Outputs a NUL-terminated file name, quoted with apostrophes. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+	FILE*		f,	/*!< in: output stream */
+	const char*	name);	/*!< in: name to print */
+
+#ifndef UNIV_HOTBACKUP
+/* Forward declaration of transaction handle */
+struct trx_struct;
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+	FILE*		f,	/*!< in: output stream */
+	struct trx_struct*trx,	/*!< in: transaction */
+	ibool		table_id,/*!< in: TRUE=print a table name,
+				FALSE=print other identifier */
+	const char*	name);	/*!< in: name to print */
+
+/**********************************************************************//**
+Outputs a fixed-length string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+	FILE*		f,	/*!< in: output stream */
+	struct trx_struct*trx,	/*!< in: transaction (NULL=no quotes) */
+	ibool		table_id,/*!< in: TRUE=print a table name,
+				FALSE=print other identifier */
+	const char*	name,	/*!< in: name to print */
+	ulint		namelen);/*!< in: length of name */
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+	FILE*	dest,	/*!< in: output file */
+	FILE*	src);	/*!< in: input file to be appended to output */
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+/**********************************************************************//**
+A substitute for snprintf(3), formatted output conversion into
+a limited buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	...);			/*!< in: format values */
+#else
+/**********************************************************************//**
+A wrapper for snprintf(3), formatted output conversion into
+a limited buffer. */
+# define ut_snprintf	snprintf
+#endif /* __WIN__ */
+
+#ifndef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#endif
+
diff --git a/storage/xtradb/include/ut0ut.ic b/storage/xtradb/include/ut0ut.ic
new file mode 100644
index 00000000000..6f55c7e410e
--- /dev/null
+++ b/storage/xtradb/include/ut0ut.ic
@@ -0,0 +1,162 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************************//**
+@file include/ut0ut.ic
+Various utilities
+
+Created 5/30/1994 Heikki Tuuri
+*******************************************************************/
+
+/******************************************************//**
+Returns the smaller of two ulints.
+@return	n2 if it is strictly smaller than n1, otherwise n1 */
+UNIV_INLINE
+ulint
+ut_min(
+/*===*/
+	ulint	 n1,	/*!< in: first number */
+	ulint	 n2)	/*!< in: second number */
+{
+	return((n2 < n1) ? n2 : n1);
+}
+
+/******************************************************//**
+Returns the larger of two ulints.
+@return	n1 if it is strictly larger than n2, otherwise n2 */
+UNIV_INLINE
+ulint
+ut_max(
+/*===*/
+	ulint	 n1,	/*!< in: first number */
+	ulint	 n2)	/*!< in: second number */
+{
+	return((n2 < n1) ? n1 : n2);
+}
+
+/****************************************************************//**
+Stores in (*a, *b) the smaller of the pairs (a1, b1) and (a2, b2). */
+UNIV_INLINE
+void
+ut_pair_min(
+/*========*/
+	ulint*	a,	/*!< out: more significant part of minimum */
+	ulint*	b,	/*!< out: less significant part of minimum */
+	ulint	a1,	/*!< in: more significant part of first pair */
+	ulint	b1,	/*!< in: less significant part of first pair */
+	ulint	a2,	/*!< in: more significant part of second pair */
+	ulint	b2)	/*!< in: less significant part of second pair */
+{
+	if (a1 < a2) {
+		*a = a1;
+		*b = b1;
+	} else if (a2 < a1) {
+		*a = a2;
+		*b = b2;
+	} else {
+		*a = a1;
+		*b = ut_min(b1, b2);
+	}
+}
+
+/******************************************************//**
+Three-way comparison of two ulints.
+@return	1 if a > b, 0 if a == b, -1 if a < b */
+UNIV_INLINE
+int
+ut_ulint_cmp(
+/*=========*/
+	ulint	a,	/*!< in: ulint */
+	ulint	b)	/*!< in: ulint */
+{
+	if (a > b) {
+		return(1);
+	}
+	if (a < b) {
+		return(-1);
+	}
+	return(0);
+}
+
+/*******************************************************//**
+Compares the pairs (a1, a2) and (b1, b2), first part first.
+@return	-1 if a < b, 0 if a == b, 1 if a > b */
+UNIV_INLINE
+int
+ut_pair_cmp(
+/*========*/
+	ulint	a1,	/*!< in: more significant part of first pair */
+	ulint	a2,	/*!< in: less significant part of first pair */
+	ulint	b1,	/*!< in: more significant part of second pair */
+	ulint	b2)	/*!< in: less significant part of second pair */
+{
+	/* The more significant parts decide unless they are equal. */
+	if (a1 != b1) {
+		return((a1 > b1) ? 1 : -1);
+	}
+
+	if (a2 != b2) {
+		return((a2 > b2) ? 1 : -1);
+	}
+
+	/* Both parts are equal. */
+	return(0);
+}
+
+/*************************************************************//**
+Calculates fast the 2-logarithm of a number, rounded upward to an
+integer.
+@return	logarithm in the base 2, rounded upward */
+UNIV_INLINE
+ulint
+ut_2_log(
+/*=====*/
+	ulint	n)	/*!< in: number != 0 */
+{
+	ulint	res;
+
+	ut_ad(n > 0);
+
+	/* Count how many times n - 1 can be halved before it reaches
+	zero. The first halving is always counted, so the result is at
+	least 1; in particular ut_2_log(1) returns 1, not 0. For larger
+	n this is the base-2 logarithm rounded upward. */
+
+	res = 1;
+	n = (n - 1) / 2;
+
+	while (n != 0) {
+		n = n / 2;
+
+		res++;
+	}
+
+	return(res);
+}
+
+/*************************************************************//**
+Calculates 2 to power n by shifting 1 left by n bit positions.
+@return	2 to power n */
+UNIV_INLINE
+ulint
+ut_2_exp(
+/*=====*/
+	ulint	n)	/*!< in: exponent */
+{
+	return(((ulint) 1) << n);
+}
diff --git a/storage/xtradb/include/ut0vec.h b/storage/xtradb/include/ut0vec.h
new file mode 100644
index 00000000000..a770f671cfc
--- /dev/null
+++ b/storage/xtradb/include/ut0vec.h
@@ -0,0 +1,125 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.h
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#ifndef IB_VECTOR_H
+#define IB_VECTOR_H
+
+#include "univ.i"
+#include "mem0mem.h"
+
+/** An automatically resizing vector data type. */
+typedef struct ib_vector_struct ib_vector_t;
+
+/* An automatically resizing vector datatype with the following properties:
+
+ -Contains void* items.
+
+ -The items are owned by the caller.
+
+ -All memory allocation is done through a heap owned by the caller, who is
+ responsible for freeing it when done with the vector.
+
+ -When the vector is resized, the old memory area is left allocated since it
+ uses the same heap as the new memory area, so this is best used for
+ relatively small or short-lived uses.
+*/
+
+/****************************************************************//**
+Create a new vector with the given initial size.
+@return	vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	ulint		size);	/*!< in: initial size */
+
+/****************************************************************//**
+Push a new element to the vector, increasing its size if necessary. */
+UNIV_INTERN
+void
+ib_vector_push(
+/*===========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	void*		elem);	/*!< in: data element */
+
+/****************************************************************//**
+Get the number of elements in the vector.
+@return	number of elements in vector */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+	const ib_vector_t*	vec);	/*!< in: vector */
+
+/****************************************************************//**
+Test whether a vector is empty or not.
+@return	TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+	const ib_vector_t*	vec);	/*!< in: vector */
+
+/****************************************************************//**
+Get the n'th element.
+@return	n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n);	/*!< in: element index to get */
+
+/****************************************************************//**
+Remove the last element from the vector and return it. */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+	ib_vector_t*	vec);	/*!< in/out: vector */
+
+/****************************************************************//**
+Free the underlying heap of the vector. Note that vec is invalid
+after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*	vec);	/*!< in,own: vector */
+
+/** An automatically resizing vector of void* items. */
+struct ib_vector_struct {
+	mem_heap_t*	heap;	/*!< heap; freed by ib_vector_free() */
+	void**		data;	/*!< data elements; [0, used) are valid */
+	ulint		used;	/*!< number of elements currently used */
+	ulint		total;	/*!< number of elements allocated */
+};
+
+#ifndef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/ut0vec.ic b/storage/xtradb/include/ut0vec.ic
new file mode 100644
index 00000000000..02e881f9bca
--- /dev/null
+++ b/storage/xtradb/include/ut0vec.ic
@@ -0,0 +1,96 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0vec.ic
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Get number of elements in vector.
+@return	number of elements in vector (vec->used) */
+UNIV_INLINE
+ulint
+ib_vector_size(
+/*===========*/
+	const ib_vector_t*	vec)	/*!< in: vector */
+{
+	return(vec->used);
+}
+
+/****************************************************************//**
+Get n'th element.
+@return	n'th element */
+UNIV_INLINE
+void*
+ib_vector_get(
+/*==========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	ulint		n)	/*!< in: element index to get */
+{
+	/* Indexes at or beyond vec->used fail the ut_a() assertion. */
+	ut_a(n < vec->used);
+	return(*(vec->data + n));
+}
+
+/****************************************************************//**
+Remove the last element from the vector.
+@return	last vector element */
+UNIV_INLINE
+void*
+ib_vector_pop(
+/*==========*/
+	ib_vector_t*    vec)    /*!< in/out: vector */
+{
+	void**	slot;
+	void*	popped;
+
+	ut_a(vec->used > 0);
+	slot = &vec->data[--vec->used];
+	popped = *slot;
+
+	ut_d(*slot = NULL);
+	UNIV_MEM_INVALID(slot, sizeof(*slot));
+	return(popped);
+}
+
+/****************************************************************//**
+Free the underlying heap of the vector, vec->heap, with mem_heap_free().
+Note that vec is invalid after this call. */
+UNIV_INLINE
+void
+ib_vector_free(
+/*===========*/
+	ib_vector_t*    vec)    /*!< in, own: vector */
+{
+	mem_heap_free(vec->heap);
+}
+
+/****************************************************************//**
+Test whether a vector holds no elements.
+@return	TRUE if empty */
+UNIV_INLINE
+ibool
+ib_vector_is_empty(
+/*===============*/
+	const ib_vector_t*	vec)	/*!< in: vector */
+{
+	return(vec->used == 0);
+}
diff --git a/storage/xtradb/include/ut0wqueue.h b/storage/xtradb/include/ut0wqueue.h
new file mode 100644
index 00000000000..2ec0f16ab05
--- /dev/null
+++ b/storage/xtradb/include/ut0wqueue.h
@@ -0,0 +1,85 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file include/ut0wqueue.h
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/*******************************************************************//**
+A Work queue. Threads can add work items to the queue and other threads can
+wait for work items to be available and take them off the queue for
+processing.
+************************************************************************/
+
+#ifndef IB_WORK_QUEUE_H
+#define IB_WORK_QUEUE_H
+
+#include "ut0list.h"
+#include "mem0mem.h"
+#include "os0sync.h"
+#include "sync0types.h"
+
+typedef struct ib_wqueue_struct ib_wqueue_t;
+
+/****************************************************************//**
+Create a new work queue.
+@return	work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void);
+/*===================*/
+
+/****************************************************************//**
+Free a work queue. */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+	ib_wqueue_t*	wq);	/*!< in: work queue */
+
+/****************************************************************//**
+Add a work item to the queue. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+	ib_wqueue_t*	wq,	/*!< in: work queue */
+	void*		item,	/*!< in: work item */
+	mem_heap_t*	heap);	/*!< in: memory heap to use for allocating the
+				list node */
+
+/****************************************************************//**
+Wait for a work item to appear in the queue.
+@return	work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+	ib_wqueue_t*	wq);	/*!< in: work queue */
+
+/** Work queue: a list of work items, a mutex, and an event. */
+struct ib_wqueue_struct {
+	mutex_t		mutex;	/*!< mutex protecting everything */
+	ib_list_t*	items;	/*!< work item list */
+	os_event_t	event;	/*!< event we use to signal additions to list */
+};
+
+#endif
diff --git a/storage/xtradb/lock/lock0iter.c b/storage/xtradb/lock/lock0iter.c
new file mode 100644
index 00000000000..51d1802ccde
--- /dev/null
+++ b/storage/xtradb/lock/lock0iter.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0iter.c
+Lock queue iterator. Can iterate over table and record
+lock queues.
+
+Created July 16, 2007 Vasil Dimov
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "univ.i"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "lock0priv.h"
+#include "ut0dbg.h"
+#include "ut0lst.h"
+#ifdef UNIV_DEBUG
+# include "srv0srv.h" /* kernel_mutex */
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Initialize a lock queue iterator so that it starts to iterate from
+"lock". bit_no specifies the record number within the heap where the
+record is stored. It can be undefined (ULINT_UNDEFINED) in two cases:
+1. If the lock is a table lock, thus we have a table lock queue;
+2. If the lock is a record lock and it is a wait lock. In this case
+   bit_no is calculated in this function by using
+   lock_rec_find_set_bit(). There is exactly one bit set in the bitmap
+   of a wait lock. The kernel mutex must be held (checked by ut_ad()). */
+UNIV_INTERN
+void
+lock_queue_iterator_reset(
+/*======================*/
+	lock_queue_iterator_t*	iter,	/*!< out: iterator */
+	const lock_t*		lock,	/*!< in: lock to start from */
+	ulint			bit_no)	/*!< in: record number in the
+					heap, or ULINT_UNDEFINED */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	iter->current_lock = lock;
+
+	if (bit_no != ULINT_UNDEFINED) {
+		/* the caller supplied the record number explicitly */
+		iter->bit_no = bit_no;
+	} else {
+		/* derive bit_no from the type of the lock itself */
+		switch (lock_get_type_low(lock)) {
+		case LOCK_TABLE:
+			iter->bit_no = ULINT_UNDEFINED;
+			break;
+		case LOCK_REC:
+			iter->bit_no = lock_rec_find_set_bit(lock);
+			ut_a(iter->bit_no != ULINT_UNDEFINED);
+			break;
+		default:
+			ut_error;
+		}
+	}
+}
+
+/*******************************************************************//**
+Gets the previous lock in the lock queue, or NULL if there are no more
+locks (i.e. the current lock is the first one). The iterator is moved
+back to that lock only if a non-NULL lock is returned.
+@return	previous lock or NULL */
+UNIV_INTERN
+const lock_t*
+lock_queue_iterator_get_prev(
+/*=========================*/
+	lock_queue_iterator_t*	iter)	/*!< in/out: iterator */
+{
+	const lock_t*	prev_lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	switch (lock_get_type_low(iter->current_lock)) {
+	case LOCK_REC:
+		prev_lock = lock_rec_get_prev(
+			iter->current_lock, iter->bit_no);
+		break;
+	case LOCK_TABLE:
+		prev_lock = UT_LIST_GET_PREV(
+			un_member.tab_lock.locks, iter->current_lock);
+		break;
+	default:
+		ut_error;
+	}
+
+	if (prev_lock != NULL) {
+		/* on NULL, iter->current_lock is left unchanged */
+		iter->current_lock = prev_lock;
+	}
+
+	return(prev_lock);
+}
diff --git a/storage/xtradb/lock/lock0lock.c b/storage/xtradb/lock/lock0lock.c
new file mode 100644
index 00000000000..1ded67d9147
--- /dev/null
+++ b/storage/xtradb/lock/lock0lock.c
@@ -0,0 +1,5773 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file lock/lock0lock.c
+The transaction lock system
+
+Created 5/7/1996 Heikki Tuuri
+*******************************************************/
+
+#define LOCK_MODULE_IMPLEMENTATION
+
+#include "lock0lock.h"
+#include "lock0priv.h"
+
+#ifdef UNIV_NONINL
+#include "lock0lock.ic"
+#include "lock0priv.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "usr0sess.h"
+#include "trx0purge.h"
+#include "dict0mem.h"
+#include "trx0sys.h"
+
+/* Restricts the length of the search we will do in the waits-for
+graph of transactions; see the cost parameter of lock_deadlock_recursive() */
+#define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000
+
+/* Restricts the recursion depth of the search we will do in the waits-for
+graph of transactions; see the depth parameter of lock_deadlock_recursive() */
+#define LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK 200
+
+/* When releasing transaction locks, this specifies how often we release
+the kernel mutex for a moment to give others access to it, too */
+
+#define LOCK_RELEASE_KERNEL_INTERVAL	1000
+
+/* Safety margin when creating a new record lock: this many extra records
+can be inserted into the page without the need to create a lock with a
+bigger bitmap */
+
+#define LOCK_PAGE_BITMAP_MARGIN		64
+
+/* An explicit record lock affects both the record and the gap before it.
+An implicit x-lock does not affect the gap, it only locks the index
+record from read or update.
+
+If a transaction has modified or inserted an index record, then
+it owns an implicit x-lock on the record. On a secondary index record,
+a transaction has an implicit x-lock also if it has modified the
+clustered index record, the max trx id of the page where the secondary
+index record resides is >= trx id of the transaction (or database recovery
+is running), and there are no explicit non-gap lock requests on the
+secondary index record.
+
+This complicated definition for a secondary index comes from the
+implementation: we want to be able to determine if a secondary index
+record has an implicit x-lock, just by looking at the present clustered
+index record, not at the historical versions of the record. The
+complicated definition can be explained to the user so that there is
+nondeterminism in the access path when a query is answered: we may,
+or may not, access the clustered index record and thus may, or may not,
+bump into an x-lock set there.
+
+Different transactions can have conflicting locks set on the gap at the
+same time. The locks on the gap are purely inhibitive: an insert cannot
+be made, or a select cursor may have to wait if a different transaction
+has a conflicting lock on the gap. An x-lock on the gap does not give
+the right to insert into the gap.
+
+An explicit lock can be placed on a user record or the supremum record of
+a page. The locks on the supremum record are always thought to be of the gap
+type, though the gap bit is not set. When we perform an update of a record
+where the size of the record changes, we may temporarily store its explicit
+locks on the infimum record of the page, though the infimum otherwise never
+carries locks.
+
+A waiting record lock can also be of the gap type. A waiting lock request
+can be granted when there is no conflicting mode lock request by another
+transaction ahead of it in the explicit lock queue.
+
+In version 4.0.5 we added yet another explicit lock type: LOCK_REC_NOT_GAP.
+It only locks the record it is placed on, not the gap before the record.
+This lock type is necessary to emulate an Oracle-like READ COMMITTED isolation
+level.
+
+-------------------------------------------------------------------------
+RULE 1: If there is an implicit x-lock on a record, and there are non-gap
+-------
+lock requests waiting in the queue, then the transaction holding the implicit
+x-lock also has an explicit non-gap record x-lock. Therefore, as locks are
+released, we can grant locks to waiting lock requests purely by looking at
+the explicit lock requests in the queue.
+
+RULE 3: Different transactions cannot have conflicting granted non-gap locks
+-------
+on a record at the same time. However, they can have conflicting granted gap
+locks.
+RULE 4: If there is a waiting lock request in a queue, no lock request,
+-------
+gap or not, can be inserted ahead of it in the queue. In record deletes
+and page splits new gap type locks can be created by the database manager
+for a transaction, and without rule 4, the waits-for graph of transactions
+might become cyclic without the database noticing it, as the deadlock check
+is only performed when a transaction itself requests a lock!
+-------------------------------------------------------------------------
+
+An insert is allowed to a gap if there are no explicit lock requests by
+other transactions on the next record. It does not matter if these lock
+requests are granted or waiting, gap bit set or not, with the exception
+that a gap type request set by another transaction to wait for
+its turn to do an insert is ignored. On the other hand, an
+implicit x-lock by another transaction does not prevent an insert, which
+allows for more concurrency when using an Oracle-style sequence number
+generator for the primary key with many transactions doing inserts
+concurrently.
+
+A modify of a record is allowed if the transaction has an x-lock on the
+record, or if other transactions do not have any non-gap lock requests on the
+record.
+
+A read of a single user record with a cursor is allowed if the transaction
+has a non-gap explicit, or an implicit lock on the record, or if the other
+transactions have no x-lock requests on the record. At a page supremum a
+read is always allowed.
+
+In summary, an implicit lock is seen as a granted x-lock only on the
+record, not on the gap. An explicit lock with no gap bit set is a lock
+both on the record and the gap. If the gap bit is set, the lock is only
+on the gap. Different transactions cannot own conflicting locks on the
+record at the same time, but they may own conflicting locks on the gap.
+Granted locks on a record give an access right to the record, but gap type
+locks just inhibit operations.
+
+NOTE: Finding out if some transaction has an implicit x-lock on a secondary
+index record can be cumbersome. We may have to look at previous versions of
+the corresponding clustered index record to find out if a delete marked
+secondary index record was delete marked by an active transaction, not by
+a committed one.
+
+FACT A: If a transaction has inserted a row, it can delete it any time
+without need to wait for locks.
+
+PROOF: The transaction has an implicit x-lock on every index record inserted
+for the row, and can thus modify each record without the need to wait. Q.E.D.
+
+FACT B: If a transaction has read some result set with a cursor, it can read
+it again, and retrieves the same result set, if it has not modified the
+result set in the meantime. Hence, there is no phantom problem. If the
+biggest record, in the alphabetical order, touched by the cursor is removed,
+a lock wait may occur, otherwise not.
+
+PROOF: When a read cursor proceeds, it sets an s-lock on each user record
+it passes, and a gap type s-lock on each page supremum. The cursor must
+wait until it has these locks granted. Then no other transaction can
+have a granted x-lock on any of the user records, and therefore cannot
+modify the user records. Neither can any other transaction insert into
+the gaps which were passed over by the cursor. Page splits and merges,
+and removal of obsolete versions of records do not affect this, because
+when a user record or a page supremum is removed, the next record inherits
+its locks as gap type locks, and therefore blocks inserts to the same gap.
+Also, if a page supremum is inserted, it inherits its locks from the successor
+record. When the cursor is positioned again at the start of the result set,
+the records it will touch on its course are either records it touched
+during the last pass or new inserted page supremums. It can immediately
+access all these records, and when it arrives at the biggest record, it
+notices that the result set is complete. If the biggest record was removed,
+lock wait can occur because the next record only inherits a gap type lock,
+and a wait may be needed. Q.E.D. */
+
+/* If an index record should be changed or a new one inserted, we must check
+the lock on the record or the next. When a read cursor starts reading,
+we will set a record level s-lock on each record it passes, except on the
+initial record on which the cursor is positioned before we start to fetch
+records. Our index tree search has the convention that the B-tree
+cursor is positioned BEFORE the first possibly matching record in
+the search. Optimizations are possible here: if the record is searched
+on an equality condition to a unique key, we could actually set a special
+lock on the record, a lock which would not prevent any insert before
+this record. In the next key locking an x-lock set on a record also
+prevents inserts just before that record.
+	There are special infimum and supremum records on each page.
+A supremum record can be locked by a read cursor. This record cannot be
+updated but the lock prevents insert of a user record to the end of
+the page.
+	Next key locks will prevent the phantom problem where new rows
+could appear to SELECT result sets after the select operation has been
+performed. Prevention of phantoms ensures the serializability of
+transactions.
+	What should we check if an insert of a new record is wanted?
+Only the lock on the next record on the same page, because also the
+supremum record can carry a lock. An s-lock prevents insertion, but
+what about an x-lock? If it was set by a searched update, then there
+is implicitly an s-lock, too, and the insert should be prevented.
+What if our transaction owns an x-lock to the next record, but there is
+a waiting s-lock request on the next record? If this s-lock was placed
+by a read cursor moving in the ascending order in the index, we cannot
+do the insert immediately, because when we finally commit our transaction,
+the read cursor should see also the new inserted record. So we should
+move the read cursor backward from the next record for it to pass over
+the new inserted record. This move backward may be too cumbersome to
+implement. If we in this situation just enqueue a second x-lock request
+for our transaction on the next record, then the deadlock mechanism
+notices a deadlock between our transaction and the s-lock request
+transaction. This seems to be an ok solution.
+	We could have the convention that granted explicit record locks,
+lock the corresponding records from changing, and also lock the gaps
+before them from inserting. A waiting explicit lock request locks the gap
+before from inserting. Implicit record x-locks, which we derive from the
+transaction id in the clustered index record, only lock the record itself
+from modification, not the gap before it from inserting.
+	How should we store update locks? If the search is done by a unique
+key, we could just modify the record trx id. Otherwise, we could put a record
+x-lock on the record. If the update changes ordering fields of the
+clustered index record, the inserted new record needs no record lock in
+lock table, the trx id is enough. The same holds for a secondary index
+record. Searched delete is similar to update.
+
+PROBLEM:
+What about waiting lock requests? If a transaction is waiting to make an
+update to a record which another modified, how does the other transaction
+know to send the end-lock-wait signal to the waiting transaction? If we have
+the convention that a transaction may wait for just one lock at a time, how
+do we preserve it if lock wait ends?
+
+PROBLEM:
+Checking the trx id label of a secondary index record. In the case of a
+modification, not an insert, is this necessary? A secondary index record
+is modified only by setting or resetting its deleted flag. A secondary index
+record contains fields to uniquely determine the corresponding clustered
+index record. A secondary index record is therefore only modified if we
+also modify the clustered index record, and the trx id checking is done
+on the clustered index record, before we come to modify the secondary index
+record. So, in the case of delete marking or unmarking a secondary index
+record, we do not have to care about trx ids, only the locks in the lock
+table must be checked. In the case of a select from a secondary index, the
+trx id is relevant, and in this case we may have to search the clustered
+index record.
+
+PROBLEM: How to update record locks when page is split or merged, or
+--------------------------------------------------------------------
+a record is deleted or updated?
+If the size of fields in a record changes, we perform the update by
+a delete followed by an insert. How can we retain the locks set or
+waiting on the record? Because a record lock is indexed in the bitmap
+by the heap number of the record, when we remove the record from the
+record list, it is possible still to keep the lock bits. If the page
+is reorganized, we could make a table of old and new heap numbers,
+and permute the bitmaps in the locks accordingly. We can add to the
+table a row telling where the updated record ended. If the update does
+not require a reorganization of the page, we can simply move the lock
+bits for the updated record to the position determined by its new heap
+number (we may have to allocate a new lock, if we run out of the bitmap
+in the old one).
+	A more complicated case is the one where the reinsertion of the
+updated record is done pessimistically, because the structure of the
+tree may change.
+
+PROBLEM: If a supremum record is removed in a page merge, or a record
+---------------------------------------------------------------------
+removed in a purge, what to do to the waiting lock requests? In a split to
+the right, we just move the lock requests to the new supremum. If a record
+is removed, we could move the waiting lock request to its inheritor, the
+next record in the index. But, the next record may already have lock
+requests on its own queue. A new deadlock check should be made then. Maybe
+it is easier just to release the waiting transactions. They can then enqueue
+new lock requests on appropriate records.
+
+PROBLEM: When a record is inserted, what locks should it inherit from the
+-------------------------------------------------------------------------
+upper neighbor? An insert of a new supremum record in a page split is
+always possible, but an insert of a new user record requires that the upper
+neighbor does not have any lock requests by other transactions, granted or
+waiting, in its lock queue. Solution: We can copy the locks as gap type
+locks, so that also the waiting locks are transformed to granted gap type
+locks on the inserted record. */
+
+/* LOCK COMPATIBILITY MATRIX
+ *    IS IX S  X  AI
+ * IS +	 +  +  -  +
+ * IX +	 +  -  -  +
+ * S  +	 -  +  -  -
+ * X  -	 -  -  -  -
+ * AI +	 +  -  -  -
+ *
+ * Note that for rows, InnoDB only acquires S or X locks.
+ * For tables, InnoDB normally acquires IS or IX locks.
+ * S or X table locks are only acquired for LOCK TABLES.
+ * Auto-increment (AI) locks are needed because of
+ * statement-level MySQL binlog.
+ * See also lock_mode_compatible().
+ */
+#define LK(a,b) (1 << ((a) * LOCK_NUM + (b)))	/* bit of the pair (a,b) */
+#define LKS(a,b) (LK(a,b) | LK(b,a))	/* bits of (a,b) and (b,a) */
+
+/* Define the lock compatibility matrix in a ulint.  The first line below
+defines the diagonal entries.  The following lines define the compatibility
+for LOCK_IX, LOCK_S, and LOCK_AUTO_INC using LKS(), since the matrix
+is symmetric.  The expansion is parenthesized so it is a single operand. */
+#define LOCK_MODE_COMPATIBILITY (0					\
+ | LK(LOCK_IS, LOCK_IS) | LK(LOCK_IX, LOCK_IX) | LK(LOCK_S, LOCK_S)	\
+ | LKS(LOCK_IX, LOCK_IS) | LKS(LOCK_IS, LOCK_AUTO_INC)			\
+ | LKS(LOCK_S, LOCK_IS)							\
+ | LKS(LOCK_AUTO_INC, LOCK_IS) | LKS(LOCK_AUTO_INC, LOCK_IX))
+
+/* STRONGER-OR-EQUAL RELATION (mode1=row, mode2=column)
+ *    IS IX S  X  AI
+ * IS +  -  -  -  -
+ * IX +  +  -  -  -
+ * S  +  -  +  -  -
+ * X  +  +  +  +  +
+ * AI -  -  -  -  +
+ * See lock_mode_stronger_or_eq().
+ */
+
+/* Define the stronger-or-equal lock relation in a ulint.  This relation
+contains all pairs LK(mode1, mode2) where mode1 is stronger than or
+equal to mode2.  The expansion is parenthesized so it is a single operand. */
+#define LOCK_MODE_STRONGER_OR_EQ (0					\
+ | LK(LOCK_IS, LOCK_IS)							\
+ | LK(LOCK_IX, LOCK_IS) | LK(LOCK_IX, LOCK_IX)				\
+ | LK(LOCK_S, LOCK_IS) | LK(LOCK_S, LOCK_S)				\
+ | LK(LOCK_AUTO_INC, LOCK_AUTO_INC)					\
+ | LK(LOCK_X, LOCK_IS) | LK(LOCK_X, LOCK_IX) | LK(LOCK_X, LOCK_S)	\
+ | LK(LOCK_X, LOCK_AUTO_INC) | LK(LOCK_X, LOCK_X))
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	lock_print_waits	= FALSE; /* NOTE(review): presumably enables printing of lock waits; confirm */
+
+/*********************************************************************//**
+Validates the lock system. Declared only when UNIV_DEBUG is defined.
+@return	TRUE if ok */
+static
+ibool
+lock_validate(void);
+/*===============*/
+
+/*********************************************************************//**
+Validates the record lock queues on a page. Declared only in UNIV_DEBUG.
+@return	TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page_no);/*!< in: page number */
+#endif /* UNIV_DEBUG */
+
+/* The lock system; set up in lock_sys_create(), freed in lock_sys_close() */
+UNIV_INTERN lock_sys_t*	lock_sys	= NULL;
+
+/* We store info on the latest deadlock error to this file (a temporary
+file opened in lock_sys_create()). InnoDB Monitor will then fetch it and print */
+UNIV_INTERN ibool	lock_deadlock_found = FALSE;
+UNIV_INTERN FILE*	lock_latest_err_file;
+
+/* Flags for recursive deadlock search: see lock_deadlock_recursive() */
+#define LOCK_VICTIM_IS_START	1
+#define LOCK_VICTIM_IS_OTHER	2
+#define LOCK_EXCEED_MAX_DEPTH	3
+
+/********************************************************************//**
+Checks if a lock request results in a deadlock (forward declaration).
+@return TRUE if a deadlock was detected and we chose trx as a victim;
+FALSE if no deadlock, or there was a deadlock, but we chose other
+transaction(s) as victim(s) */
+static
+ibool
+lock_deadlock_occurs(
+/*=================*/
+	lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	trx_t*	trx);	/*!< in: transaction */
+/********************************************************************//**
+Looks recursively for a deadlock (forward declaration).
+@return 0 if no deadlock found; LOCK_VICTIM_IS_START if there was a
+deadlock and we chose 'start' as the victim; LOCK_VICTIM_IS_OTHER if a
+deadlock was found and we chose some other trx as a victim: we must do
+the search again in this last case because there may be another
+deadlock;
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+static
+ulint
+lock_deadlock_recursive(
+/*====================*/
+	trx_t*	start,		/*!< in: recursion starting point */
+	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
+	lock_t*	wait_lock,	/*!< in: lock that is waiting to be granted */
+	ulint*	cost,		/*!< in/out: number of calculation steps thus
+				far: if this exceeds LOCK_MAX_N_STEPS_...
+				we return LOCK_EXCEED_MAX_DEPTH */
+	ulint	depth);		/*!< in: recursion depth: if this exceeds
+				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
+				return LOCK_EXCEED_MAX_DEPTH */
+
+/*********************************************************************//**
+Gets the nth bit of a record lock.
+@return	TRUE if the bit is set; FALSE if it is clear or if i >= n_bits */
+UNIV_INLINE
+ibool
+lock_rec_get_nth_bit(
+/*=================*/
+	const lock_t*	lock,	/*!< in: record lock */
+	ulint		i)	/*!< in: index of the bit */
+{
+	ulint	byte_index;
+	ulint	bit_index;
+
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	if (i >= lock->un_member.rec_lock.n_bits) {
+		/* out of range of the bitmap: report the bit as not set */
+		return(FALSE);
+	}
+
+	byte_index = i / 8;
+	bit_index = i % 8;
+	/* the bitmap bytes are read from &lock[1]; >> binds tighter than & */
+	return(1 & ((const byte*) &lock[1])[byte_index] >> bit_index);
+}
+
+/*************************************************************************/
+/* Shorthands for entering and exiting kernel_mutex */
+#define lock_mutex_enter_kernel()	mutex_enter(&kernel_mutex)
+#define lock_mutex_exit_kernel()	mutex_exit(&kernel_mutex)
+
+/*********************************************************************//**
+Checks that a transaction id is sensible, i.e., not in the future.
+@return	TRUE if ok; FALSE, after printing an error to stderr, if not */
+UNIV_INTERN
+ibool
+lock_check_trx_id_sanity(
+/*=====================*/
+	trx_id_t	trx_id,		/*!< in: trx id to check */
+	const rec_t*	rec,		/*!< in: user record */
+	dict_index_t*	index,		/*!< in: index */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	ibool		has_kernel_mutex)/*!< in: TRUE if the caller owns the
+					kernel mutex */
+{
+	ibool	is_ok		= TRUE;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	/* enter the kernel mutex here unless the caller already owns it */
+	if (!has_kernel_mutex) {
+		mutex_enter(&kernel_mutex);
+	}
+
+	/* A sanity check: the trx_id in rec must be smaller than the global
+	trx id counter */
+
+	if (ut_dulint_cmp(trx_id, trx_sys->max_trx_id) >= 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: transaction id associated"
+		      " with record\n",
+		      stderr);
+		rec_print_new(stderr, rec, offsets);
+		fputs("InnoDB: in ", stderr);
+		dict_index_name_print(stderr, NULL, index);
+		fprintf(stderr, "\n"
+			"InnoDB: is " TRX_ID_FMT " which is higher than the"
+			" global trx id counter " TRX_ID_FMT "!\n"
+			"InnoDB: The table is corrupt. You have to do"
+			" dump + drop + reimport.\n",
+			TRX_ID_PREP_PRINTF(trx_id),
+			TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+
+		is_ok = FALSE;
+	}
+	/* exit the kernel mutex only if it was entered in this function */
+	if (!has_kernel_mutex) {
+		mutex_exit(&kernel_mutex);
+	}
+
+	return(is_ok);
+}
+
+/*********************************************************************//**
+Checks that a clustered index record is seen in a consistent read.
+@return TRUE if sees, or FALSE if an earlier version of the record
+should be retrieved */
+UNIV_INTERN
+ibool
+lock_clust_rec_cons_read_sees(
+/*==========================*/
+	const rec_t*	rec,	/*!< in: user record which should be read or
+				passed over by a read cursor */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	read_view_t*	view)	/*!< in: consistent read view */
+{
+	trx_id_t	trx_id;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(page_rec_is_user_rec(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	/* NOTE that we call this function while holding the search
+	system latch. To obey the latching order we must NOT reserve the
+	kernel mutex here! */
+	/* the answer depends only on the trx id read from the record */
+	trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+	return(read_view_sees_trx_id(view, trx_id));
+}
+
+/*********************************************************************//**
+Checks that a non-clustered index record is seen in a consistent read.
+
+NOTE that a non-clustered index page contains so little information on
+its modifications that also in the case FALSE, the present version of
+rec may be the right, but we must check this from the clustered index
+record.
+
+@return TRUE if certainly sees, or FALSE if an earlier version of the
+clustered index record might be needed */
+UNIV_INTERN
+ulint
+lock_sec_rec_cons_read_sees(
+/*========================*/
+	const rec_t*		rec,	/*!< in: user record which
+					should be read or passed over
+					by a read cursor */
+	const read_view_t*	view)	/*!< in: consistent read view */
+{
+	trx_id_t	max_trx_id;
+
+	ut_ad(page_rec_is_user_rec(rec));
+
+	/* NOTE that we might call this function while holding the search
+	system latch. To obey the latching order we must NOT reserve the
+	kernel mutex here! */
+
+	if (recv_recovery_is_on()) {
+		/* while recovery is on we never answer "certainly sees" */
+		return(FALSE);
+	}
+	/* compare the max trx id of the page of rec with view->up_limit_id */
+	max_trx_id = page_get_max_trx_id(page_align(rec));
+	ut_ad(!ut_dulint_is_zero(max_trx_id));
+
+	return(ut_dulint_cmp(max_trx_id, view->up_limit_id) < 0);
+}
+
+/*********************************************************************//**
+Creates the lock system at database start. */
+UNIV_INTERN
+void
+lock_sys_create(
+/*============*/
+	ulint	n_cells)	/*!< in: number of slots in lock hash table */
+{
+	lock_sys = mem_alloc(sizeof(lock_sys_t));
+	/* the lock hash table is created with n_cells slots */
+	lock_sys->rec_hash = hash_create(n_cells);
+
+	/* hash_create_mutexes(lock_sys->rec_hash, 2, SYNC_REC_LOCK); */
+	/* temporary file for lock_latest_err_file; ut_a() checks it is set */
+	lock_latest_err_file = os_file_create_tmpfile();
+	ut_a(lock_latest_err_file);
+}
+
+/*********************************************************************//**
+Closes the lock system at database shutdown. */
+UNIV_INTERN
+void
+lock_sys_close(void)
+/*================*/
+{
+	if (lock_latest_err_file != NULL) {
+		fclose(lock_latest_err_file);
+		lock_latest_err_file = NULL;
+	}
+	/* free the hash table first, then the lock_sys struct that owns it */
+	hash_table_free(lock_sys->rec_hash);
+	mem_free(lock_sys);
+	lock_sys = NULL;
+}
+
+/*********************************************************************//**
+Gets the size of a lock struct.
+@return	sizeof(lock_t), in bytes */
+UNIV_INTERN
+ulint
+lock_get_size(void)
+/*===============*/
+{
+	return((ulint)sizeof(lock_t));
+}
+
+/*********************************************************************//**
+Gets the mode of a lock.
+@return	mode: the LOCK_MODE_MASK bits of lock->type_mode */
+UNIV_INLINE
+enum lock_mode
+lock_get_mode(
+/*==========*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_ad(lock);
+
+	return(lock->type_mode & LOCK_MODE_MASK);
+}
+
+/*********************************************************************//**
+Gets the wait flag of a lock.
+@return	TRUE if waiting, i.e. LOCK_WAIT is set in lock->type_mode */
+UNIV_INLINE
+ibool
+lock_get_wait(
+/*==========*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ut_ad(lock);
+
+	if (UNIV_UNLIKELY(lock->type_mode & LOCK_WAIT)) {
+		/* normalize the masked flag bit to TRUE */
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Gets the source table of an ALTER TABLE transaction.  The table must be
+covered by an IX or IS table lock.
+@return the source table of transaction, if it is covered by an IX or
+IS table lock; dest if there is no source table, and NULL if the
+transaction is locking more than two tables or an inconsistency is
+found */
+UNIV_INTERN
+dict_table_t*
+lock_get_src_table(
+/*===============*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	dest,	/*!< in: destination of ALTER TABLE */
+	enum lock_mode*	mode)	/*!< out: LOCK_IX, LOCK_IS or LOCK_NONE */
+{
+	dict_table_t*	src;
+	lock_t*		lock;
+
+	src = NULL;
+	*mode = LOCK_NONE;
+	/* scan every lock in trx->trx_locks */
+	for (lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	     lock;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+		lock_table_t*	tab_lock;
+		enum lock_mode	lock_mode;
+		if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+			/* We are only interested in table locks. */
+			continue;
+		}
+		tab_lock = &lock->un_member.tab_lock;
+		if (dest == tab_lock->table) {
+			/* We are not interested in the destination table. */
+			continue;
+		} else if (!src) {
+			/* This presumably is the source table. */
+			src = tab_lock->table;
+			if (UT_LIST_GET_LEN(src->locks) != 1
+			    || UT_LIST_GET_FIRST(src->locks) != lock) {
+				/* We only support the case when
+				there is only one lock on this table. */
+				return(NULL);
+			}
+		} else if (src != tab_lock->table) {
+			/* The transaction is locking more than
+			two tables (src and dest): abort */
+			return(NULL);
+		}
+
+		/* Check that the source table is locked by
+		LOCK_IX or LOCK_IS. */
+		lock_mode = lock_get_mode(lock);
+		if (lock_mode == LOCK_IX || lock_mode == LOCK_IS) {
+			if (*mode != LOCK_NONE && *mode != lock_mode) {
+				/* There are multiple locks on src. */
+				return(NULL);
+			}
+			*mode = lock_mode;
+		}
+	}
+
+	if (!src) {
+		/* No source table lock found: flag the situation to caller */
+		src = dest;
+	}
+
+	return(src);
+}
+
+/*********************************************************************//**
+Checks whether trx is the only transaction with lock requests on the
+table, and whether those requests consist of at least one LOCK_IX,
+optionally accompanied by LOCK_AUTO_INC. The kernel mutex is acquired
+for the duration of the scan.
+@return TRUE if every lock on the table belongs to trx, at least one of
+them is LOCK_IX, and the rest are LOCK_IX or LOCK_AUTO_INC */
+UNIV_INTERN
+ibool
+lock_is_table_exclusive(
+/*====================*/
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	const lock_t*	lock;
+	ibool		exclusive	= FALSE;
+
+	ut_ad(table);
+	ut_ad(trx);
+
+	lock_mutex_enter_kernel();
+
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock;
+	     lock = UT_LIST_GET_NEXT(locks, &lock->un_member.tab_lock)) {
+		enum lock_mode	held_mode;
+
+		if (lock->trx != trx) {
+			/* Another transaction has a lock request on
+			the table. */
+			exclusive = FALSE;
+			break;
+		}
+
+		if (!(lock_get_type_low(lock) & LOCK_TABLE)) {
+			/* Only table locks matter. */
+			continue;
+		}
+
+		held_mode = lock_get_mode(lock);
+
+		if (held_mode == LOCK_IX) {
+			exclusive = TRUE;
+		} else if (held_mode != LOCK_AUTO_INC) {
+			/* Any mode other than LOCK_IX or
+			LOCK_AUTO_INC disqualifies the table. */
+			exclusive = FALSE;
+			break;
+		}
+	}
+
+	lock_mutex_exit_kernel();
+
+	return(exclusive);
+}
+
+/*********************************************************************//**
+Marks a lock request as waiting: points trx->wait_lock at the lock and
+sets the LOCK_WAIT flag in the lock's type_mode. */
+UNIV_INLINE
+void
+lock_set_lock_and_trx_wait(
+/*=======================*/
+	lock_t*	lock,	/*!< in: lock */
+	trx_t*	trx)	/*!< in: trx; must not already have a wait_lock */
+{
+	ut_ad(lock);
+	ut_ad(trx->wait_lock == NULL);
+
+	trx->wait_lock = lock;
+	lock->type_mode = lock->type_mode | LOCK_WAIT;
+}
+
+/**********************************************************************//**
+Clears the waiting state of a lock request: the wait_lock back pointer of
+the owning transaction is set to NULL and LOCK_WAIT is removed from the
+lock's type_mode. */
+UNIV_INLINE
+void
+lock_reset_lock_and_trx_wait(
+/*=========================*/
+	lock_t*	lock)	/*!< in: waiting lock which is the wait_lock of
+			its transaction */
+{
+	trx_t*	owner	= lock->trx;
+
+	ut_ad(owner->wait_lock == lock);
+	ut_ad(lock_get_wait(lock));
+
+	/* The transaction no longer waits for this request */
+	owner->wait_lock = NULL;
+
+	lock->type_mode = lock->type_mode & ~LOCK_WAIT;
+}
+
+/*********************************************************************//**
+Tests whether the LOCK_GAP flag is set in a record lock.
+@return	TRUE if LOCK_GAP is set in lock->type_mode */
+UNIV_INLINE
+ibool
+lock_rec_get_gap(
+/*=============*/
+	const lock_t*	lock)	/*!< in: record lock */
+{
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	return((lock->type_mode & LOCK_GAP) ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Tests whether the LOCK_REC_NOT_GAP flag is set in a record lock.
+@return	TRUE if LOCK_REC_NOT_GAP is set in lock->type_mode */
+UNIV_INLINE
+ibool
+lock_rec_get_rec_not_gap(
+/*=====================*/
+	const lock_t*	lock)	/*!< in: record lock */
+{
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	return((lock->type_mode & LOCK_REC_NOT_GAP) ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Tests whether the LOCK_INSERT_INTENTION flag is set in a record lock.
+@return	TRUE if LOCK_INSERT_INTENTION is set in lock->type_mode */
+UNIV_INLINE
+ibool
+lock_rec_get_insert_intention(
+/*==========================*/
+	const lock_t*	lock)	/*!< in: record lock */
+{
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	return((lock->type_mode & LOCK_INSERT_INTENTION) ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Looks up the pair (mode1, mode2) in the LOCK_MODE_STRONGER_OR_EQ bit
+mask to decide whether mode1 is at least as strong as mode2.
+@return	nonzero if mode1 stronger or equal to mode2, zero otherwise */
+UNIV_INLINE
+ulint
+lock_mode_stronger_or_eq(
+/*=====================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2)	/*!< in: lock mode */
+{
+	/* Both arguments must be plain lock modes without flag bits */
+	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
+	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
+	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
+	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+
+	return(LK(mode1, mode2) & (LOCK_MODE_STRONGER_OR_EQ));
+}
+
+/*********************************************************************//**
+Looks up the pair (mode1, mode2) in the LOCK_MODE_COMPATIBILITY bit mask
+to decide whether the two lock modes can be held simultaneously.
+@return	nonzero if mode1 compatible with mode2, zero otherwise */
+UNIV_INLINE
+ulint
+lock_mode_compatible(
+/*=================*/
+	enum lock_mode	mode1,	/*!< in: lock mode */
+	enum lock_mode	mode2)	/*!< in: lock mode */
+{
+	/* Both arguments must be plain lock modes without flag bits */
+	ut_ad(mode1 == LOCK_X || mode1 == LOCK_S || mode1 == LOCK_IX
+	      || mode1 == LOCK_IS || mode1 == LOCK_AUTO_INC);
+	ut_ad(mode2 == LOCK_X || mode2 == LOCK_S || mode2 == LOCK_IX
+	      || mode2 == LOCK_IS || mode2 == LOCK_AUTO_INC);
+
+	return(LK(mode1, mode2) & (LOCK_MODE_COMPATIBILITY));
+}
+
+/*********************************************************************//**
+Decides whether a new record lock request must wait for an existing
+record lock lock2 on the same record. Requests of the same transaction
+and requests with compatible basic modes never wait; for the rest, the
+gap / not-gap / insert-intention flags of both sides are consulted.
+@return	TRUE if new lock has to wait for lock2 to be removed */
+UNIV_INLINE
+ibool
+lock_rec_has_to_wait(
+/*=================*/
+	const trx_t*	trx,	/*!< in: trx of new lock */
+	ulint		type_mode,/*!< in: precise mode of the new lock
+				to set: LOCK_S or LOCK_X, possibly
+				ORed to LOCK_GAP or LOCK_REC_NOT_GAP,
+				LOCK_INSERT_INTENTION */
+	const lock_t*	lock2,	/*!< in: another record lock; NOTE that
+				it is assumed that this has a lock bit
+				set on the same record as in the new
+				lock we are setting */
+	ibool lock_is_on_supremum)  /*!< in: TRUE if we are setting the
+				lock on the 'supremum' record of an
+				index page: we know then that the lock
+				request is really for a 'gap' type lock */
+{
+	ut_ad(trx && lock2);
+	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+	/* No wait within one transaction, nor between compatible
+	basic lock modes */
+
+	if (trx == lock2->trx
+	    || lock_mode_compatible(LOCK_MODE_MASK & type_mode,
+				    lock_get_mode(lock2))) {
+
+		return(FALSE);
+	}
+
+	/* The modes conflict; the rules below determine when a gap
+	type lock is involved in such a way that no wait is needed */
+
+	if (!(type_mode & LOCK_INSERT_INTENTION)) {
+
+		if (lock_is_on_supremum || (type_mode & LOCK_GAP)) {
+
+			/* A gap type request without
+			LOCK_INSERT_INTENTION waits for nothing:
+			different users may hold conflicting lock
+			types on gaps. */
+
+			return(FALSE);
+		}
+
+		if (lock_rec_get_gap(lock2)) {
+
+			/* A record lock (LOCK_ORDINARY or
+			LOCK_REC_NOT_GAP) does not wait for a gap type
+			lock */
+
+			return(FALSE);
+		}
+	}
+
+	if ((type_mode & LOCK_GAP) && lock_rec_get_rec_not_gap(lock2)) {
+
+		/* A request on the gap does not wait for a
+		LOCK_REC_NOT_GAP type lock */
+
+		return(FALSE);
+	}
+
+	if (lock_rec_get_insert_intention(lock2)) {
+
+		/* Nothing waits for an insert intention lock to be
+		removed, which is fine because conflicting locks on
+		gaps are allowed. This removes a spurious deadlock in
+		which a next-key lock waited for an insert intention
+		lock and, once the insert intention lock was granted,
+		the insert deadlocked on the waiting next-key lock.
+
+		Insert intention locks do not disturb each other
+		either. */
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Decides whether lock request lock1 must wait for lock request lock2.
+Conflicting record locks are judged by lock_rec_has_to_wait(); all other
+conflicting requests of different transactions have to wait.
+@return	TRUE if lock1 has to wait for lock2 to be removed */
+UNIV_INTERN
+ibool
+lock_has_to_wait(
+/*=============*/
+	const lock_t*	lock1,	/*!< in: waiting lock */
+	const lock_t*	lock2)	/*!< in: another lock; NOTE that it is
+				assumed that this has a lock bit set
+				on the same record as in lock1 if the
+				locks are record locks */
+{
+	ut_ad(lock1 && lock2);
+
+	if (lock1->trx == lock2->trx
+	    || lock_mode_compatible(lock_get_mode(lock1),
+				    lock_get_mode(lock2))) {
+
+		/* Same transaction, or compatible modes */
+		return(FALSE);
+	}
+
+	if (lock_get_type_low(lock1) != LOCK_REC) {
+
+		/* Conflicting requests that are not record locks */
+		return(TRUE);
+	}
+
+	ut_ad(lock_get_type_low(lock2) == LOCK_REC);
+
+	/* Bit 1 of the lock bitmap is set when the request is for
+	the supremum record */
+
+	return(lock_rec_has_to_wait(lock1->trx, lock1->type_mode, lock2,
+				    lock_rec_get_nth_bit(lock1, 1)));
+}
+
+/*============== RECORD LOCK BASIC FUNCTIONS ============================*/
+
+/*********************************************************************//**
+Reads the size, in bits, of the bitmap of a record lock.
+@return	value of the n_bits field of the record lock */
+UNIV_INLINE
+ulint
+lock_rec_get_n_bits(
+/*================*/
+	const lock_t*	lock)	/*!< in: record lock */
+{
+	ulint	n_bits	= lock->un_member.rec_lock.n_bits;
+
+	return(n_bits);
+}
+
+/**********************************************************************//**
+Turns on bit i in the bitmap of a record lock. */
+UNIV_INLINE
+void
+lock_rec_set_nth_bit(
+/*=================*/
+	lock_t*	lock,	/*!< in: record lock */
+	ulint	i)	/*!< in: index of the bit; must be less than
+			the n_bits of the lock */
+{
+	byte*	bitmap;
+
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+	/* The bitmap is addressed as the bytes directly following the
+	lock struct: 8 bits per byte, lowest bit first */
+	bitmap = (byte*) &lock[1];
+
+	bitmap[i / 8] |= 1 << (i % 8);
+}
+
+/**********************************************************************//**
+Scans the bitmap of a record lock from bit 0 upwards for the first bit
+that is set.
+@return bit index == heap number of the record, or ULINT_UNDEFINED if
+no bit is set */
+UNIV_INTERN
+ulint
+lock_rec_find_set_bit(
+/*==================*/
+	const lock_t*	lock)	/*!< in: record lock with at least one bit set */
+{
+	const ulint	n_bits	= lock_rec_get_n_bits(lock);
+	ulint		bit	= 0;
+
+	while (bit < n_bits) {
+
+		if (lock_rec_get_nth_bit(lock, bit)) {
+
+			return(bit);
+		}
+
+		bit++;
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/**********************************************************************//**
+Turns off bit i in the bitmap of a record lock. */
+UNIV_INLINE
+void
+lock_rec_reset_nth_bit(
+/*===================*/
+	lock_t*	lock,	/*!< in: record lock */
+	ulint	i)	/*!< in: index of the bit which must be set to TRUE
+			when this function is called */
+{
+	byte*	bitmap;
+
+	ut_ad(lock);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+	ut_ad(i < lock->un_member.rec_lock.n_bits);
+
+	/* The bitmap is addressed as the bytes directly following the
+	lock struct: 8 bits per byte, lowest bit first */
+	bitmap = (byte*) &lock[1];
+
+	bitmap[i / 8] &= ~(1 << (i % 8));
+}
+
+/*********************************************************************//**
+Walks forward in the hash chain from the given record lock until a lock
+with the same space id and page number is reached.
+@return	next lock on the same page, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next_on_page(
+/*======================*/
+	lock_t*	lock)	/*!< in: a record lock */
+{
+	ulint	space;
+	ulint	page_no;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	space = lock->un_member.rec_lock.space;
+	page_no = lock->un_member.rec_lock.page_no;
+
+	/* Locks of other pages may be chained in between: skip them */
+
+	do {
+		lock = HASH_GET_NEXT(hash, lock);
+	} while (lock != NULL
+		 && (lock->un_member.rec_lock.space != space
+		     || lock->un_member.rec_lock.page_no != page_no));
+
+	return(lock);
+}
+
+/*********************************************************************//**
+Looks up the first record lock on the page identified by (space, page_no)
+in the record lock hash table.
+@return	first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page_addr(
+/*============================*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (lock = HASH_GET_FIRST(lock_sys->rec_hash,
+				   lock_rec_hash(space, page_no));
+	     lock != NULL;
+	     lock = HASH_GET_NEXT(hash, lock)) {
+
+		if (lock->un_member.rec_lock.space == space
+		    && lock->un_member.rec_lock.page_no == page_no) {
+
+			/* Found a lock on the requested page */
+			break;
+		}
+	}
+
+	return(lock);
+}
+
+/*********************************************************************//**
+Tells whether any explicit record lock exists on a page. The kernel
+mutex is acquired for the lookup.
+@return	TRUE if there are explicit record locks on the page */
+UNIV_INTERN
+ibool
+lock_rec_expl_exist_on_page(
+/*========================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no)/*!< in: page number */
+{
+	lock_t*	first;
+
+	mutex_enter(&kernel_mutex);
+
+	first = lock_rec_get_first_on_page_addr(space, page_no);
+
+	mutex_exit(&kernel_mutex);
+
+	/* Only the existence of a lock is reported; the pointer is
+	not dereferenced after the mutex has been released */
+
+	return(first != NULL ? TRUE : FALSE);
+}
+
+/*********************************************************************//**
+Looks up the first record lock on the page held in the given buffer
+block, using the lock hash value obtained from the block.
+@return	first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first_on_page(
+/*=======================*/
+	const buf_block_t*	block)	/*!< in: buffer block */
+{
+	ulint	hash_val;
+	lock_t*	lock;
+	ulint	space	= buf_block_get_space(block);
+	ulint	page_no	= buf_block_get_page_no(block);
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	hash_val = buf_block_get_lock_hash_val(block);
+
+	for (lock = HASH_GET_FIRST(lock_sys->rec_hash, hash_val);
+	     lock != NULL;
+	     lock = HASH_GET_NEXT(hash, lock)) {
+
+		if (lock->un_member.rec_lock.space == space
+		    && lock->un_member.rec_lock.page_no == page_no) {
+
+			/* Found a lock on the page of the block */
+			break;
+		}
+	}
+
+	return(lock);
+}
+
+/*********************************************************************//**
+Advances from the given record lock to the next lock on the same page
+whose bitmap has the bit for heap_no set, as reported by
+lock_rec_get_nth_bit().
+@return	next lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_next(
+/*==============*/
+	ulint	heap_no,/*!< in: heap number of the record */
+	lock_t*	lock)	/*!< in: lock */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (;;) {
+		ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+		lock = lock_rec_get_next_on_page(lock);
+
+		if (lock == NULL || lock_rec_get_nth_bit(lock, heap_no)) {
+
+			return(lock);
+		}
+	}
+}
+
+/*********************************************************************//**
+Finds the first lock on the page of the block whose bitmap has the bit
+for heap_no set.
+@return	first lock, NULL if none exists */
+UNIV_INLINE
+lock_t*
+lock_rec_get_first(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: block containing the record */
+	ulint			heap_no)/*!< in: heap number of the record */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	lock = lock_rec_get_first_on_page(block);
+
+	while (lock != NULL && !lock_rec_get_nth_bit(lock, heap_no)) {
+
+		lock = lock_rec_get_next_on_page(lock);
+	}
+
+	return(lock);
+}
+
+/*********************************************************************//**
+Clears every bit of the bitmap of a record lock. NOTE: the wait_lock
+pointer in the transaction is left untouched! Used when a lock object is
+created and when it is reset. */
+static
+void
+lock_rec_bitmap_reset(
+/*==================*/
+	lock_t*	lock)	/*!< in: record lock */
+{
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	/* The bitmap size is expected to be a whole number of bytes */
+	ut_ad((lock_rec_get_n_bits(lock) % 8) == 0);
+
+	/* The bitmap is located immediately after the lock struct */
+	memset(&lock[1], 0, lock_rec_get_n_bits(lock) / 8);
+}
+
+/*********************************************************************//**
+Duplicates a record lock, i.e. the lock struct together with the bitmap
+that follows it, into the given memory heap.
+@return	copy of lock */
+static
+lock_t*
+lock_rec_copy(
+/*==========*/
+	const lock_t*	lock,	/*!< in: record lock */
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	ulint	bitmap_bytes;
+
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	bitmap_bytes = lock_rec_get_n_bits(lock) / 8;
+
+	return(mem_heap_dup(heap, lock, sizeof(lock_t) + bitmap_bytes));
+}
+
+/*********************************************************************//**
+Finds the lock request preceding in_lock on the same record: the locks
+on the page are walked from the first one up to in_lock, remembering the
+last one whose bitmap has the bit for heap_no set. in_lock is expected
+to be among the locks of its page.
+@return	previous lock on the same record, NULL if none exists */
+UNIV_INTERN
+const lock_t*
+lock_rec_get_prev(
+/*==============*/
+	const lock_t*	in_lock,/*!< in: record lock */
+	ulint		heap_no)/*!< in: heap number of the record */
+{
+	lock_t*	lock;
+	lock_t*	prev_on_rec	= NULL;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+	for (lock = lock_rec_get_first_on_page_addr(
+		     in_lock->un_member.rec_lock.space,
+		     in_lock->un_member.rec_lock.page_no);
+	     lock != in_lock;
+	     lock = lock_rec_get_next_on_page(lock)) {
+
+		ut_ad(lock);
+
+		if (lock_rec_get_nth_bit(lock, heap_no)) {
+
+			prev_on_rec = lock;
+		}
+	}
+
+	return(prev_on_rec);
+}
+
+/*============= FUNCTIONS FOR ANALYZING TABLE LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Searches the lock list of a table, from the last lock backwards, for a
+lock of the given transaction whose mode is stronger than or equal to
+the given mode.
+@return	lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_has(
+/*===========*/
+	trx_t*		trx,	/*!< in: transaction */
+	dict_table_t*	table,	/*!< in: table */
+	enum lock_mode	mode)	/*!< in: lock mode */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (lock = UT_LIST_GET_LAST(table->locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_PREV(un_member.tab_lock.locks, lock)) {
+
+		if (lock->trx != trx) {
+
+			continue;
+		}
+
+		if (lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+			/* trx already holds a table lock that is at
+			least as strong as the requested mode; such a
+			lock is not expected to be a waiting one */
+
+			ut_ad(!lock_get_wait(lock));
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*============= FUNCTIONS FOR ANALYZING RECORD LOCK QUEUE ================*/
+
+/*********************************************************************//**
+Searches the lock queue of a record for a granted (non-waiting) explicit
+lock of the given transaction that covers precise_mode: its mode must be
+stronger than or equal to the requested one, its gap / not-gap flags
+must not restrict it more than requested (on the supremum record these
+flags are disregarded), and it must not be an insert intention lock.
+@return	lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_has_expl(
+/*==============*/
+	ulint			precise_mode,/*!< in: LOCK_S or LOCK_X
+					possibly ORed to LOCK_GAP or
+					LOCK_REC_NOT_GAP, for a
+					supremum record we regard this
+					always a gap type request */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	trx_t*			trx)	/*!< in: transaction */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad((precise_mode & LOCK_MODE_MASK) == LOCK_S
+	      || (precise_mode & LOCK_MODE_MASK) == LOCK_X);
+	ut_ad(!(precise_mode & LOCK_INSERT_INTENTION));
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
+		if (lock->trx != trx) {
+			/* Lock of another transaction */
+			continue;
+		}
+
+		if (!lock_mode_stronger_or_eq(
+			    lock_get_mode(lock),
+			    precise_mode & LOCK_MODE_MASK)) {
+			/* Too weak a mode */
+			continue;
+		}
+
+		if (lock_get_wait(lock)) {
+			/* Not granted yet */
+			continue;
+		}
+
+		if (lock_rec_get_rec_not_gap(lock)
+		    && !(precise_mode & LOCK_REC_NOT_GAP)
+		    && heap_no != PAGE_HEAP_NO_SUPREMUM) {
+			/* The lock excludes the gap, but the request
+			is not restricted to the record only */
+			continue;
+		}
+
+		if (lock_rec_get_gap(lock)
+		    && !(precise_mode & LOCK_GAP)
+		    && heap_no != PAGE_HEAP_NO_SUPREMUM) {
+			/* The lock covers only the gap, but the
+			request is not restricted to the gap */
+			continue;
+		}
+
+		if (lock_rec_get_insert_intention(lock)) {
+			/* Insert intention locks never qualify */
+			continue;
+		}
+
+		return(lock);
+	}
+
+	return(NULL);
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Searches the lock queue of a record for a request made by a transaction
+other than trx, with a mode stronger than or equal to the given mode.
+Gap type requests and waiting requests are taken into account only when
+asked for through the gap and wait arguments.
+@return	lock or NULL */
+static
+lock_t*
+lock_rec_other_has_expl_req(
+/*========================*/
+	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X */
+	ulint			gap,	/*!< in: LOCK_GAP if also gap
+					locks are taken into account,
+					or 0 if not */
+	ulint			wait,	/*!< in: LOCK_WAIT if also
+					waiting locks are taken into
+					account, or 0 if not */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	const trx_t*		trx)	/*!< in: transaction, or NULL if
+					requests by all transactions
+					are taken into account */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mode == LOCK_X || mode == LOCK_S);
+	ut_ad(gap == 0 || gap == LOCK_GAP);
+	ut_ad(wait == 0 || wait == LOCK_WAIT);
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
+		if (lock->trx == trx) {
+			/* Request of the excluded transaction */
+			continue;
+		}
+
+		if (!gap
+		    && (lock_rec_get_gap(lock)
+			|| heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+			/* Gap type request, and those were not
+			asked for */
+			continue;
+		}
+
+		if (!wait && lock_get_wait(lock)) {
+			/* Waiting request, and those were not
+			asked for */
+			continue;
+		}
+
+		if (lock_mode_stronger_or_eq(lock_get_mode(lock), mode)) {
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+#endif /* UNIV_DEBUG */
+
+/*********************************************************************//**
+Searches the lock queue of a record for an explicit lock request that
+the given request of trx would have to wait for, as decided by
+lock_rec_has_to_wait(). A request on the supremum record is passed on as
+a request on the supremum.
+@return	lock or NULL */
+static
+lock_t*
+lock_rec_other_has_conflicting(
+/*===========================*/
+	enum lock_mode		mode,	/*!< in: LOCK_S or LOCK_X,
+					possibly ORed to LOCK_GAP or
+					LOCK_REC_NOT_GAP,
+					LOCK_INSERT_INTENTION */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	trx_t*			trx)	/*!< in: our transaction */
+{
+	lock_t*		lock;
+	const ibool	on_supremum
+		= (heap_no == PAGE_HEAP_NO_SUPREMUM) ? TRUE : FALSE;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (lock = lock_rec_get_first(block, heap_no);
+	     lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
+		if (lock_rec_has_to_wait(trx, mode, lock, on_supremum)) {
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Searches the record locks of a page, starting from the given lock, for
+a lock struct of the same transaction with exactly the same type_mode
+and with a bitmap large enough to hold heap_no. Reusing such a struct
+saves the allocation of a new one when another record lock is set on
+the page.
+@return	lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_rec_find_similar_on_page(
+/*==========================*/
+	ulint		type_mode,	/*!< in: lock type_mode field */
+	ulint		heap_no,	/*!< in: heap number of the record */
+	lock_t*		lock,		/*!< in: lock_rec_get_first_on_page() */
+	const trx_t*	trx)		/*!< in: transaction */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (; lock != NULL; lock = lock_rec_get_next_on_page(lock)) {
+
+		if (lock->trx != trx || lock->type_mode != type_mode) {
+
+			continue;
+		}
+
+		if (lock_rec_get_n_bits(lock) > heap_no) {
+
+			return(lock);
+		}
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Determines whether some transaction holds an implicit x-lock on a record
+of a secondary index.
+@return	transaction which has the x-lock, or NULL */
+static
+trx_t*
+lock_sec_rec_some_has_impl_off_kernel(
+/*==================================*/
+	const rec_t*	rec,	/*!< in: user record */
+	dict_index_t*	index,	/*!< in: secondary index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	const page_t*	page = page_align(rec);
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(page_rec_is_user_rec(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	/* An implicit x-lock is possible only if the max trx id of the
+	page >= the min trx id of the trx list, or if database recovery
+	is running. Changes of the page max trx id are not written to
+	the log, so during recovery the value on a page may be
+	incorrect. */
+
+	if (ut_dulint_cmp(page_get_max_trx_id(page),
+			  trx_list_get_min_trx_id()) < 0
+	    && !recv_recovery_is_on()) {
+
+		return(NULL);
+	}
+
+	/* An implicit x-lock cannot be ruled out here: after a sanity
+	check of the page max trx id, the answer is looked up through
+	the clustered index. */
+
+	if (lock_check_trx_id_sanity(page_get_max_trx_id(page),
+				     rec, index, offsets, TRUE)) {
+
+		return(row_vers_impl_x_locked_off_kernel(rec, index,
+							 offsets));
+	}
+
+	/* The page is corrupt: print it and try to avoid a crash by
+	returning NULL */
+	buf_page_print(page, 0);
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Returns the approximate number of record locks held by a transaction,
+computed as the number of bits set in the bitmaps of all its record
+locks. Since delete-marked records may be removed, the record count
+will not be precise.
+@return	number of bits set in the record lock bitmaps of trx */
+UNIV_INTERN
+ulint
+lock_number_of_rows_locked(
+/*=======================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	lock_t*	lock;
+	ulint	n_set	= 0;
+
+	for (lock = UT_LIST_GET_FIRST(trx->trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+		ulint	bitmap_size;
+		ulint	i;
+
+		if (lock_get_type_low(lock) != LOCK_REC) {
+			/* Table locks carry no bitmap */
+			continue;
+		}
+
+		bitmap_size = lock_rec_get_n_bits(lock);
+
+		for (i = 0; i < bitmap_size; i++) {
+			if (lock_rec_get_nth_bit(lock, i)) {
+				n_set++;
+			}
+		}
+	}
+
+	return (n_set);
+}
+
+/*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/
+
+/*********************************************************************//**
+Creates a new record lock and inserts it to the lock queue. Does NOT check
+for deadlocks or lock compatibility!
+The lock struct and its bitmap are allocated from trx->lock_heap; the lock
+is appended to trx->trx_locks and inserted into lock_sys->rec_hash. If
+LOCK_WAIT is set in type_mode, the lock also becomes the wait_lock of trx.
+@return	created lock */
+static
+lock_t*
+lock_rec_create(
+/*============*/
+	ulint			type_mode,/*!< in: lock mode and wait
+					flag, type is ignored and
+					replaced by LOCK_REC */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	dict_index_t*		index,	/*!< in: index of record */
+	trx_t*			trx)	/*!< in: transaction */
+{
+	lock_t*		lock;
+	ulint		page_no;
+	ulint		space;
+	ulint		n_bits;
+	ulint		n_bytes;
+	const page_t*	page;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	space = buf_block_get_space(block);
+	page_no	= buf_block_get_page_no(block);
+	page = block->frame;
+
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	/* If rec is the supremum record, then we reset the gap and
+	LOCK_REC_NOT_GAP bits, as all locks on the supremum are
+	automatically of the gap type */
+
+	if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+		ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+		type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+	}
+
+	/* Make lock bitmap bigger by a safety margin */
+	n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
+	/* Round up to whole bytes (always at least one spare bit) */
+	n_bytes = 1 + n_bits / 8;
+
+	/* The bitmap is allocated in the same chunk, right after the
+	lock struct */
+	lock = mem_heap_alloc(trx->lock_heap, sizeof(lock_t) + n_bytes);
+
+	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, lock);
+
+	lock->trx = trx;
+
+	/* Whatever lock type the caller passed is replaced by LOCK_REC */
+	lock->type_mode = (type_mode & ~LOCK_TYPE_MASK) | LOCK_REC;
+	lock->index = index;
+
+	lock->un_member.rec_lock.space = space;
+	lock->un_member.rec_lock.page_no = page_no;
+	/* The recorded bitmap size is a multiple of 8 bits, as
+	lock_rec_bitmap_reset() asserts */
+	lock->un_member.rec_lock.n_bits = n_bytes * 8;
+
+	/* Reset to zero the bitmap which resides immediately after the
+	lock struct */
+
+	lock_rec_bitmap_reset(lock);
+
+	/* Set the bit corresponding to rec */
+	lock_rec_set_nth_bit(lock, heap_no);
+
+	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
+		    lock_rec_fold(space, page_no), lock);
+	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+		/* A waiting request: set the wait flag of the lock and
+		the back pointer in trx */
+		lock_set_lock_and_trx_wait(lock, trx);
+	}
+
+	return(lock);
+}
+
+/*********************************************************************//**
+Enqueues a waiting request for a lock which cannot be granted immediately.
+Checks for deadlocks.
+On the DB_LOCK_WAIT path the transaction is put to the TRX_QUE_LOCK_WAIT
+state and the start time of the wait is recorded.
+@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS_LOCKED_REC; DB_SUCCESS_LOCKED_REC means that
+there was a deadlock, but another transaction was chosen as a victim,
+and we got the lock immediately: no need to wait then */
+static
+enum db_err
+lock_rec_enqueue_waiting(
+/*=====================*/
+	ulint			type_mode,/*!< in: lock mode this
+					transaction is requesting:
+					LOCK_S or LOCK_X, possibly
+					ORed with LOCK_GAP or
+					LOCK_REC_NOT_GAP, ORed with
+					LOCK_INSERT_INTENTION if this
+					waiting lock request is set
+					when performing an insert of
+					an index record */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	dict_index_t*		index,	/*!< in: index of record */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	lock_t*	lock;
+	trx_t*	trx;
+	ulint   sec;
+	ulint   ms;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Test if there already is some other reason to suspend thread:
+	we do not enqueue a lock request if the query thread should be
+	stopped anyway */
+
+	if (UNIV_UNLIKELY(que_thr_stop(thr))) {
+
+		/* This situation is treated as a fatal error; the
+		return below is a fallback after ut_error */
+		ut_error;
+
+		return(DB_QUE_THR_SUSPENDED);
+	}
+
+	trx = thr_get_trx(thr);
+
+	/* A record lock wait inside a dictionary operation is only
+	reported to stderr; the request is still enqueued below */
+	switch (trx_get_dict_operation(trx)) {
+	case TRX_DICT_OP_NONE:
+		break;
+	case TRX_DICT_OP_TABLE:
+	case TRX_DICT_OP_INDEX:
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: a record lock wait happens"
+		      " in a dictionary operation!\n"
+		      "InnoDB: ", stderr);
+		dict_index_name_print(stderr, trx, index);
+		fputs(".\n"
+		      "InnoDB: Submit a detailed bug report"
+		      " to http://bugs.mysql.com\n",
+		      stderr);
+	}
+
+	/* Enqueue the lock request that will wait to be granted */
+	lock = lock_rec_create(type_mode | LOCK_WAIT,
+			       block, heap_no, index, trx);
+
+	/* Check if a deadlock occurs: if yes, remove the lock request and
+	return an error code */
+
+	if (UNIV_UNLIKELY(lock_deadlock_occurs(lock, trx))) {
+
+		/* The lock struct itself stays allocated: only its wait
+		state and the bit of this record are cleared */
+		lock_reset_lock_and_trx_wait(lock);
+		lock_rec_reset_nth_bit(lock, heap_no);
+
+		return(DB_DEADLOCK);
+	}
+
+	/* If there was a deadlock but we chose another transaction as a
+	victim, it is possible that we already have the lock now granted! */
+
+	if (trx->wait_lock == NULL) {
+
+		return(DB_SUCCESS_LOCKED_REC);
+	}
+
+	trx->que_state = TRX_QUE_LOCK_WAIT;
+	trx->was_chosen_as_deadlock_victim = FALSE;
+	trx->wait_started = time(NULL);
+	if (innobase_get_slow_log() && trx->take_stats) {
+		/* Record the start of the wait in microseconds.
+		NOTE(review): the return value of ut_usectime() is ignored
+		here; if it can fail without writing sec and ms, they
+		would be read uninitialized - confirm against
+		ut_usectime() */
+		ut_usectime(&sec, &ms);
+		trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
+	}
+
+	/* The query thread is expected to be stoppable at this point */
+	ut_a(que_thr_stop(thr));
+
+#ifdef UNIV_DEBUG
+	if (lock_print_waits) {
+		fprintf(stderr, "Lock wait for trx %lu in index ",
+			(ulong) ut_dulint_get_low(trx->id));
+		ut_print_name(stderr, trx, FALSE, index->name);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Adds a record lock request in the record queue. The request is normally
+added as the last in the queue, but if there are no waiting lock requests
+on the record, and the request to be added is not a waiting request, we
+can reuse a suitable record lock object already existing on the same page,
+just setting the appropriate bit in its bitmap. This is a low-level function
+which does NOT check for deadlocks or lock compatibility!
+@return	lock where the bit was set */
+static
+lock_t*
+lock_rec_add_to_queue(
+/*==================*/
+	ulint			type_mode,/*!< in: lock mode, wait, gap
+					etc. flags; type is ignored
+					and replaced by LOCK_REC */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of the record */
+	dict_index_t*		index,	/*!< in: index of record */
+	trx_t*			trx)	/*!< in: transaction */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+#ifdef UNIV_DEBUG
+	/* Debug check: the basic mode must be LOCK_X or LOCK_S */
+	switch (type_mode & LOCK_MODE_MASK) {
+	case LOCK_X:
+	case LOCK_S:
+		break;
+	default:
+		ut_error;
+	}
+
+	/* Debug check for a granted request that is not of the gap type:
+	no other transaction may have a request, waiting ones included,
+	that is at least as strong as the opposite mode (LOCK_X for an
+	S request, LOCK_S for an X request) */
+	if (!(type_mode & (LOCK_WAIT | LOCK_GAP))) {
+		enum lock_mode	mode = (type_mode & LOCK_MODE_MASK) == LOCK_S
+			? LOCK_X
+			: LOCK_S;
+		lock_t*		other_lock
+			= lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT,
+						      block, heap_no, trx);
+		ut_a(!other_lock);
+	}
+#endif /* UNIV_DEBUG */
+
+	type_mode |= LOCK_REC;
+
+	/* If rec is the supremum record, then we can reset the gap bit, as
+	all locks on the supremum are automatically of the gap type, and we
+	try to avoid unnecessary memory consumption of a new record lock
+	struct for a gap type lock */
+
+	if (UNIV_UNLIKELY(heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+		ut_ad(!(type_mode & LOCK_REC_NOT_GAP));
+
+		/* There should never be LOCK_REC_NOT_GAP on a supremum
+		record, but let us play safe */
+
+		type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
+	}
+
+	/* Look for a waiting lock request on the same record or on a gap */
+
+	lock = lock_rec_get_first_on_page(block);
+
+	while (lock != NULL) {
+		if (lock_get_wait(lock)
+		    && (lock_rec_get_nth_bit(lock, heap_no))) {
+
+			/* Somebody waits on this record: an existing
+			lock struct is not reused, a new one is created */
+			goto somebody_waits;
+		}
+
+		lock = lock_rec_get_next_on_page(lock);
+	}
+
+	if (UNIV_LIKELY(!(type_mode & LOCK_WAIT))) {
+
+		/* Look for a similar record lock on the same page:
+		if one is found and there are no waiting lock requests,
+		we can just set the bit */
+
+		lock = lock_rec_find_similar_on_page(
+			type_mode, heap_no,
+			lock_rec_get_first_on_page(block), trx);
+
+		if (lock) {
+
+			lock_rec_set_nth_bit(lock, heap_no);
+
+			return(lock);
+		}
+	}
+
+	/* Reached by the goto above, for a waiting request, and when no
+	reusable lock struct was found */
+somebody_waits:
+	return(lock_rec_create(type_mode, block, heap_no, index, trx));
+}
+
+/** Record locking request status: the outcome of the fast-path attempt
+made by lock_rec_lock_fast(), which lock_rec_lock() maps to a DB_ code or
+to a retry through lock_rec_lock_slow() */
+enum lock_rec_req_status {
+	/** Failed to acquire a lock on the fast path; lock_rec_lock()
+	falls back to lock_rec_lock_slow() */
+	LOCK_REC_FAIL,
+	/** Succeeded in acquiring a lock (implicit or already acquired);
+	lock_rec_lock() returns DB_SUCCESS */
+	LOCK_REC_SUCCESS,
+	/** Explicitly created a new lock; lock_rec_lock() returns
+	DB_SUCCESS_LOCKED_REC */
+	LOCK_REC_SUCCESS_CREATED
+};
+
+/*********************************************************************//**
+Fast path for locking a record. It only succeeds in the most common cases:
+either there are no explicit locks on the page at all, or there is exactly
+one lock struct on the page, owned by this transaction, whose type_mode
+equals the requested one and whose bitmap is large enough for heap_no.
+In every other case LOCK_REC_FAIL is returned. This is a low-level
+function which does NOT look at implicit locks! This function sets a
+normal next-key lock, or in the case of a page supremum record, a gap
+type lock.
+NOTE: when the page carries no lock structs, LOCK_REC_SUCCESS_CREATED is
+returned also when impl is TRUE, although no lock struct is created then.
+@return whether the locking succeeded */
+UNIV_INLINE
+enum lock_rec_req_status
+lock_rec_lock_fast(
+/*===============*/
+	ibool			impl,	/*!< in: if TRUE, no lock is set
+					if no wait is necessary: we
+					assume that the caller will
+					set an implicit lock */
+	ulint			mode,	/*!< in: lock mode: LOCK_X or
+					LOCK_S possibly ORed to either
+					LOCK_GAP or LOCK_REC_NOT_GAP */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of record */
+	dict_index_t*		index,	/*!< in: index of record */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	trx_t*	trx;
+	lock_t*	first;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+	      || (LOCK_MODE_MASK & mode) == LOCK_X);
+	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+	      || mode - (LOCK_MODE_MASK & mode) == 0
+	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
+	first = lock_rec_get_first_on_page(block);
+
+	trx = thr_get_trx(thr);
+
+	if (first == NULL) {
+		/* No explicit locks on the page */
+
+		if (!impl) {
+			lock_rec_create(mode, block, heap_no, index, trx);
+		}
+
+		return(LOCK_REC_SUCCESS_CREATED);
+	}
+
+	/* Give up unless the page has a single lock struct which is ours,
+	has exactly the requested type_mode, and has a bit for heap_no */
+
+	if (lock_rec_get_next_on_page(first) != NULL
+	    || first->trx != trx
+	    || first->type_mode != (mode | LOCK_REC)
+	    || lock_rec_get_n_bits(first) <= heap_no) {
+
+		return(LOCK_REC_FAIL);
+	}
+
+	if (impl || lock_rec_get_nth_bit(first, heap_no)) {
+		/* Either the caller relies on an implicit lock, or the
+		bit is already set: nothing to do */
+
+		return(LOCK_REC_SUCCESS);
+	}
+
+	lock_rec_set_nth_bit(first, heap_no);
+
+	return(LOCK_REC_SUCCESS_CREATED);
+}
+
+/*********************************************************************//**
+General, slower routine for locking a record, used when the fast path
+gives up. This is a low-level function which does NOT look at implicit
+locks! Checks lock compatibility within explicit locks. This function sets
+a normal next-key lock, or in the case of a page supremum record, a gap
+type lock.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+enum db_err
+lock_rec_lock_slow(
+/*===============*/
+	ibool			impl,	/*!< in: if TRUE, no lock is set
+					if no wait is necessary: we
+					assume that the caller will
+					set an implicit lock */
+	ulint			mode,	/*!< in: lock mode: LOCK_X or
+					LOCK_S possibly ORed to either
+					LOCK_GAP or LOCK_REC_NOT_GAP */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of record */
+	dict_index_t*		index,	/*!< in: index of record */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	trx_t*	requester;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+	      || (LOCK_MODE_MASK & mode) == LOCK_X);
+	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+	      || mode - (LOCK_MODE_MASK & mode) == 0
+	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP);
+
+	requester = thr_get_trx(thr);
+
+	if (lock_rec_has_expl(mode, block, heap_no, requester)) {
+		/* The trx already has a strong enough lock on rec:
+		nothing to do */
+
+		return(DB_SUCCESS);
+	}
+
+	if (lock_rec_other_has_conflicting(mode, block, heap_no,
+					   requester)) {
+
+		/* Another transaction has a conflicting request in the
+		queue and we do not hold a strong enough granted lock
+		ourselves: we have to wait. */
+
+		return(lock_rec_enqueue_waiting(mode, block, heap_no,
+						index, thr));
+	}
+
+	if (impl) {
+		/* No wait is needed and the caller will rely on an
+		implicit lock: do not create an explicit one */
+
+		return(DB_SUCCESS);
+	}
+
+	/* Set the requested lock on the record */
+
+	lock_rec_add_to_queue(LOCK_REC | mode, block,
+			      heap_no, index, requester);
+
+	return(DB_SUCCESS_LOCKED_REC);
+}
+
+/*********************************************************************//**
+Tries to lock the specified record in the mode requested: first through
+lock_rec_lock_fast(), and if that fails, through lock_rec_lock_slow(),
+which enqueues a waiting lock request when the lock cannot be granted
+immediately. This is a low-level function which does NOT look at implicit
+locks! Checks lock compatibility within explicit locks. This function sets
+a normal next-key lock, or in the case of a page supremum record, a gap
+type lock.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+static
+enum db_err
+lock_rec_lock(
+/*==========*/
+	ibool			impl,	/*!< in: if TRUE, no lock is set
+					if no wait is necessary: we
+					assume that the caller will
+					set an implicit lock */
+	ulint			mode,	/*!< in: lock mode: LOCK_X or
+					LOCK_S possibly ORed to either
+					LOCK_GAP or LOCK_REC_NOT_GAP */
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no,/*!< in: heap number of record */
+	dict_index_t*		index,	/*!< in: index of record */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	enum lock_rec_req_status	fast_status;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+	ut_ad((LOCK_MODE_MASK & mode) != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad((LOCK_MODE_MASK & mode) == LOCK_S
+	      || (LOCK_MODE_MASK & mode) == LOCK_X);
+	ut_ad(mode - (LOCK_MODE_MASK & mode) == LOCK_GAP
+	      || mode - (LOCK_MODE_MASK & mode) == LOCK_REC_NOT_GAP
+	      || mode - (LOCK_MODE_MASK & mode) == 0);
+
+	/* The simplified subroutine handles the most common cases */
+	fast_status = lock_rec_lock_fast(impl, mode, block,
+					 heap_no, index, thr);
+
+	if (fast_status == LOCK_REC_SUCCESS) {
+
+		return(DB_SUCCESS);
+	}
+
+	if (fast_status == LOCK_REC_SUCCESS_CREATED) {
+
+		return(DB_SUCCESS_LOCKED_REC);
+	}
+
+	if (fast_status == LOCK_REC_FAIL) {
+		/* Not a common case: take the general route */
+
+		return(lock_rec_lock_slow(impl, mode, block,
+					  heap_no, index, thr));
+	}
+
+	/* Not reached with a valid status value */
+	ut_error;
+	return(DB_ERROR);
+}
+
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue:
+it does if some lock struct which precedes it on the page has the same
+record bit set and lock_has_to_wait() says the request must wait for it.
+@return	TRUE if still has to wait */
+static
+ibool
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+	lock_t*	wait_lock)	/*!< in: waiting record lock */
+{
+	lock_t*	ahead;
+	ulint	rec_bit;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_wait(wait_lock));
+	ut_ad(lock_get_type_low(wait_lock) == LOCK_REC);
+
+	rec_bit = lock_rec_find_set_bit(wait_lock);
+
+	/* Walk the lock structs of the page up to, but not including,
+	wait_lock itself */
+
+	for (ahead = lock_rec_get_first_on_page_addr(
+		     wait_lock->un_member.rec_lock.space,
+		     wait_lock->un_member.rec_lock.page_no);
+	     ahead != wait_lock;
+	     ahead = lock_rec_get_next_on_page(ahead)) {
+
+		if (lock_rec_get_nth_bit(ahead, rec_bit)
+		    && lock_has_to_wait(wait_lock, ahead)) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*************************************************************//**
+Grants a lock to a waiting lock request and releases the waiting
+transaction. For a lock of mode LOCK_AUTO_INC the transaction is also
+registered as the AUTO-INC lock holder of the table, and the lock is
+pushed to the autoinc_locks vector of the transaction. */
+static
+void
+lock_grant(
+/*=======*/
+	lock_t*	lock)	/*!< in/out: waiting lock request */
+{
+	trx_t*	grantee;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	grantee = lock->trx;
+
+	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+		dict_table_t*	table = lock->un_member.tab_lock.table;
+
+		if (table->autoinc_trx != grantee) {
+			table->autoinc_trx = grantee;
+
+			ib_vector_push(grantee->autoinc_locks, lock);
+		} else {
+			fprintf(stderr,
+				"InnoDB: Error: trx already had"
+				" an AUTO-INC lock!\n");
+		}
+	}
+
+#ifdef UNIV_DEBUG
+	if (lock_print_waits) {
+		fprintf(stderr, "Lock wait for trx %lu ends\n",
+			(ulong) ut_dulint_get_low(grantee->id));
+	}
+#endif /* UNIV_DEBUG */
+
+	/* When a deadlock is resolved by choosing another transaction as
+	the victim, the transaction being granted the lock may not be in
+	the TRX_QUE_LOCK_WAIT state; then there is no lock wait to end */
+
+	if (grantee->que_state == TRX_QUE_LOCK_WAIT) {
+		trx_end_lock_wait(grantee);
+	}
+}
+
+/*************************************************************//**
+Cancels a waiting record lock request and releases the transaction which
+was waiting for it. NOTE: this does NOT check whether waiting lock
+requests behind this one can now be granted! */
+static
+void
+lock_rec_cancel(
+/*============*/
+	lock_t*	lock)	/*!< in: waiting record lock request */
+{
+	ulint	waited_bit;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
+
+	/* A waiting request has only one bit set in its bitmap: find it
+	and clear it */
+	waited_bit = lock_rec_find_set_bit(lock);
+
+	lock_rec_reset_nth_bit(lock, waited_bit);
+
+	/* Clear the wait flag of the lock and the back pointer to the
+	lock in the trx */
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* Release the trx from the lock wait */
+	trx_end_lock_wait(lock->trx);
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue: the
+lock struct is unlinked both from the record lock hash table and from the
+lock list of its transaction. After that, every waiting request on the
+same page which no longer has to wait is granted. NOTE: all record locks
+contained in in_lock are removed. */
+static
+void
+lock_rec_dequeue_from_page(
+/*=======================*/
+	lock_t*	in_lock)/*!< in: record lock object: all record locks which
+			are contained in this lock object are removed;
+			transactions waiting behind will get their lock
+			requests granted, if they are now qualified to it */
+{
+	trx_t*	owner;
+	lock_t*	queued;
+	ulint	page_no;
+	ulint	space;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+	owner = in_lock->trx;
+	space = in_lock->un_member.rec_lock.space;
+	page_no = in_lock->un_member.rec_lock.page_no;
+
+	HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+		    lock_rec_fold(space, page_no), in_lock);
+
+	UT_LIST_REMOVE(trx_locks, owner->trx_locks, in_lock);
+
+	/* Now that in_lock is gone, grant every waiting request on the
+	page which has no conflicting lock ahead of it any more */
+
+	for (queued = lock_rec_get_first_on_page_addr(space, page_no);
+	     queued != NULL;
+	     queued = lock_rec_get_next_on_page(queued)) {
+
+		if (lock_get_wait(queued)
+		    && !lock_rec_has_to_wait_in_queue(queued)) {
+
+			lock_grant(queued);
+		}
+	}
+}
+
+/*************************************************************//**
+Removes a record lock request, waiting or granted, from the queue: the
+lock struct is unlinked from the record lock hash table and from the lock
+list of its transaction. Unlike lock_rec_dequeue_from_page(), this does
+not grant any waiting requests. */
+static
+void
+lock_rec_discard(
+/*=============*/
+	lock_t*	in_lock)/*!< in: record lock object: all record locks which
+			are contained in this lock object are removed */
+{
+	trx_t*	owner;
+	ulint	fold;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_type_low(in_lock) == LOCK_REC);
+
+	owner = in_lock->trx;
+
+	/* The hash key of a record lock is derived from its page address */
+	fold = lock_rec_fold(in_lock->un_member.rec_lock.space,
+			     in_lock->un_member.rec_lock.page_no);
+
+	HASH_DELETE(lock_t, hash, lock_sys->rec_hash, fold, in_lock);
+
+	UT_LIST_REMOVE(trx_locks, owner->trx_locks, in_lock);
+}
+
+/*************************************************************//**
+Removes all record lock objects set on an index page which is discarded.
+Locks are neither moved nor checked for waiters here, so the caller must
+have reset the lock bitmaps before calling this function. */
+static
+void
+lock_rec_free_all_from_discard_page(
+/*================================*/
+	const buf_block_t*	block)	/*!< in: page to be discarded */
+{
+	lock_t*	cur;
+	lock_t*	following;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (cur = lock_rec_get_first_on_page_addr(
+		     buf_block_get_space(block),
+		     buf_block_get_page_no(block));
+	     cur != NULL;
+	     cur = following) {
+
+		ut_ad(lock_rec_find_set_bit(cur) == ULINT_UNDEFINED);
+		ut_ad(!lock_get_wait(cur));
+
+		/* Fetch the successor first: cur is unlinked from the
+		hash chain by the discard */
+		following = lock_rec_get_next_on_page(cur);
+
+		lock_rec_discard(cur);
+	}
+}
+
+/*============= RECORD LOCK MOVING AND INHERITING ===================*/
+
+/*************************************************************//**
+Resets the lock bits for a single record. Waiting lock requests on the
+record are cancelled, which releases the transactions waiting for them;
+for granted locks only the bit of the record is cleared. */
+static
+void
+lock_rec_reset_and_release_wait(
+/*============================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing
+					the record */
+	ulint			heap_no)/*!< in: heap number of record */
+{
+	lock_t*	cur;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (cur = lock_rec_get_first(block, heap_no);
+	     cur != NULL;
+	     cur = lock_rec_get_next(heap_no, cur)) {
+
+		if (!lock_get_wait(cur)) {
+			lock_rec_reset_nth_bit(cur, heap_no);
+		} else {
+			lock_rec_cancel(cur);
+		}
+	}
+}
+
+/*************************************************************//**
+Makes a record to inherit the locks (except LOCK_INSERT_INTENTION type)
+of another record as gap type locks, but does not reset the lock bits of
+the other record. The inherited requests are added without the wait flag,
+so also waiting lock requests on rec are inherited as GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap(
+/*====================*/
+	const buf_block_t*	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const buf_block_t*	block,		/*!< in: block containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no)	/*!< in: heap_no of the
+						donating record */
+{
+	lock_t*	donor;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (donor = lock_rec_get_first(block, heap_no);
+	     donor != NULL;
+	     donor = lock_rec_get_next(heap_no, donor)) {
+
+		if (lock_rec_get_insert_intention(donor)) {
+			/* Insert intention locks are never inherited */
+
+			continue;
+		}
+
+		/* If srv_locks_unsafe_for_binlog is TRUE or the session
+		is using READ COMMITTED isolation level, we do not want
+		locks set by an UPDATE or a DELETE to be inherited as gap
+		type locks: skip X-locks then. S-locks set by a
+		consistency constraint are still inherited. */
+
+		if ((srv_locks_unsafe_for_binlog
+		     || donor->trx->isolation_level
+		     <= TRX_ISO_READ_COMMITTED)
+		    && lock_get_mode(donor) == LOCK_X) {
+
+			continue;
+		}
+
+		lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
+				      | lock_get_mode(donor),
+				      heir_block, heir_heap_no,
+				      donor->index, donor->trx);
+	}
+}
+
+/*************************************************************//**
+Makes a record to inherit the gap locks (except LOCK_INSERT_INTENTION type)
+of another record on the same page as gap type locks, but does not reset
+the lock bits of the other record. A lock is inherited if the donating
+record is the page supremum or the lock is not of the LOCK_REC_NOT_GAP
+type. The inherited requests are added without the wait flag, so also
+waiting lock requests are inherited as GRANTED gap locks. */
+static
+void
+lock_rec_inherit_to_gap_if_gap_lock(
+/*================================*/
+	const buf_block_t*	block,		/*!< in: buffer block */
+	ulint			heir_heap_no,	/*!< in: heap_no of
+						record which inherits */
+	ulint			heap_no)	/*!< in: heap_no of record
+						from which inherited;
+						does NOT reset the locks
+						on this record */
+{
+	lock_t*	donor;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (donor = lock_rec_get_first(block, heap_no);
+	     donor != NULL;
+	     donor = lock_rec_get_next(heap_no, donor)) {
+
+		if (lock_rec_get_insert_intention(donor)) {
+
+			continue;
+		}
+
+		if (heap_no != PAGE_HEAP_NO_SUPREMUM
+		    && lock_rec_get_rec_not_gap(donor)) {
+			/* A record-only lock on a user record does not
+			cover the gap: nothing to inherit */
+
+			continue;
+		}
+
+		lock_rec_add_to_queue(LOCK_REC | LOCK_GAP
+				      | lock_get_mode(donor),
+				      block, heir_heap_no,
+				      donor->index, donor->trx);
+	}
+}
+
+/*************************************************************//**
+Moves the locks of a record to another record and resets the lock bits of
+the donating record. The locks are re-added with their original type_mode,
+so a waiting request stays a waiting request on the receiving record. */
+static
+void
+lock_rec_move(
+/*==========*/
+	const buf_block_t*	receiver,	/*!< in: buffer block containing
+						the receiving record */
+	const buf_block_t*	donator,	/*!< in: buffer block containing
+						the donating record */
+	ulint			receiver_heap_no,/*!< in: heap_no of the record
+						which gets the locks; there
+						must be no lock requests
+						on it! */
+	ulint			donator_heap_no)/*!< in: heap_no of the record
+						which gives the locks */
+{
+	lock_t*	donor_lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	donor_lock = lock_rec_get_first(donator, donator_heap_no);
+
+	ut_ad(lock_rec_get_first(receiver, receiver_heap_no) == NULL);
+
+	for (; donor_lock != NULL;
+	     donor_lock = lock_rec_get_next(donator_heap_no, donor_lock)) {
+
+		/* Remember the flags before the lock is touched */
+		const ulint	moved_type_mode = donor_lock->type_mode;
+
+		/* The bit is reset FIRST and the lock is set only after
+		that: this way the function works also if
+		donator == receiver */
+
+		lock_rec_reset_nth_bit(donor_lock, donator_heap_no);
+
+		if (UNIV_UNLIKELY(moved_type_mode & LOCK_WAIT)) {
+			lock_reset_lock_and_trx_wait(donor_lock);
+		}
+
+		lock_rec_add_to_queue(moved_type_mode, receiver,
+				      receiver_heap_no,
+				      donor_lock->index, donor_lock->trx);
+	}
+
+	ut_ad(lock_rec_get_first(donator, donator_heap_no) == NULL);
+}
+
+/*************************************************************//**
+Updates the lock table when we have reorganized a page. The function works
+in two phases: first every lock struct on the page is copied to a private
+heap and the bitmap of the original is cleared; then, for each copy, the
+records of the reorganized page and of the old page copy are walked in
+lockstep and every bit set in the copy is re-added under the new heap
+number of the record. NOTE: we copy
+also the locks set on the infimum of the page; the infimum may carry
+locks if an update of a record is occurring on the page, and its locks
+were temporarily stored on the infimum. */
+UNIV_INTERN
+void
+lock_move_reorganize_page(
+/*======================*/
+	const buf_block_t*	block,	/*!< in: old index page, now
+					reorganized */
+	const buf_block_t*	oblock)	/*!< in: copy of the old, not
+					reorganized page */
+{
+	lock_t*		lock;
+	UT_LIST_BASE_NODE_T(lock_t)	old_locks;
+	mem_heap_t*	heap		= NULL;
+	ulint		comp;
+
+	lock_mutex_enter_kernel();
+
+	lock = lock_rec_get_first_on_page(block);
+
+	if (lock == NULL) {
+		/* No record locks on the page: nothing to update */
+		lock_mutex_exit_kernel();
+
+		return;
+	}
+
+	heap = mem_heap_create(256);
+
+	/* Copy first all the locks on the page to heap and reset the
+	bitmaps in the original locks; chain the copies of the locks
+	using the trx_locks field in them. */
+
+	UT_LIST_INIT(old_locks);
+
+	do {
+		/* Make a copy of the lock */
+		lock_t*	old_lock = lock_rec_copy(lock, heap);
+
+		UT_LIST_ADD_LAST(trx_locks, old_locks, old_lock);
+
+		/* Reset bitmap of lock */
+		lock_rec_bitmap_reset(lock);
+
+		/* The copy keeps the original type_mode, wait flag
+		included; the original stops being a waiting request */
+		if (lock_get_wait(lock)) {
+			lock_reset_lock_and_trx_wait(lock);
+		}
+
+		lock = lock_rec_get_next_on_page(lock);
+	} while (lock != NULL);
+
+	comp = page_is_comp(block->frame);
+	ut_ad(comp == page_is_comp(oblock->frame));
+
+	/* From here on, lock iterates over the COPIES made above */
+	for (lock = UT_LIST_GET_FIRST(old_locks); lock;
+	     lock = UT_LIST_GET_NEXT(trx_locks, lock)) {
+		/* NOTE: we copy also the locks set on the infimum and
+		supremum of the page; the infimum may carry locks if an
+		update of a record is occurring on the page, and its locks
+		were temporarily stored on the infimum */
+		page_cur_t	cur1;
+		page_cur_t	cur2;
+
+		/* cur1 walks the reorganized page, cur2 the old copy;
+		both start before the first user record */
+		page_cur_set_before_first(block, &cur1);
+		page_cur_set_before_first(oblock, &cur2);
+
+		/* Set locks according to old locks */
+		for (;;) {
+			ulint	old_heap_no;
+			ulint	new_heap_no;
+
+			ut_ad(comp || !memcmp(page_cur_get_rec(&cur1),
+					      page_cur_get_rec(&cur2),
+					      rec_get_data_size_old(
+						      page_cur_get_rec(
+							      &cur2))));
+			if (UNIV_LIKELY(comp)) {
+				old_heap_no = rec_get_heap_no_new(
+					page_cur_get_rec(&cur2));
+				new_heap_no = rec_get_heap_no_new(
+					page_cur_get_rec(&cur1));
+			} else {
+				old_heap_no = rec_get_heap_no_old(
+					page_cur_get_rec(&cur2));
+				new_heap_no = rec_get_heap_no_old(
+					page_cur_get_rec(&cur1));
+			}
+
+			if (lock_rec_get_nth_bit(lock, old_heap_no)) {
+
+				/* Clear the bit in old_lock. This is done
+				in debug builds only, for the check below
+				that all locks were moved. */
+				ut_d(lock_rec_reset_nth_bit(lock,
+							    old_heap_no));
+
+				/* NOTE that the old lock bitmap could be too
+				small for the new heap number! */
+
+				lock_rec_add_to_queue(lock->type_mode, block,
+						      new_heap_no,
+						      lock->index, lock->trx);
+
+				/* if (new_heap_no == PAGE_HEAP_NO_SUPREMUM
+				&& lock_get_wait(lock)) {
+				fprintf(stderr,
+				"---\n--\n!!!Lock reorg: supr type %lu\n",
+				lock->type_mode);
+				} */
+			}
+
+			/* The supremum is the last record processed */
+			if (UNIV_UNLIKELY
+			    (new_heap_no == PAGE_HEAP_NO_SUPREMUM)) {
+
+				ut_ad(old_heap_no == PAGE_HEAP_NO_SUPREMUM);
+				break;
+			}
+
+			page_cur_move_to_next(&cur1);
+			page_cur_move_to_next(&cur2);
+		}
+
+#ifdef UNIV_DEBUG
+		{
+			ulint	i = lock_rec_find_set_bit(lock);
+
+			/* Check that all locks were moved. */
+			if (UNIV_UNLIKELY(i != ULINT_UNDEFINED)) {
+				fprintf(stderr,
+					"lock_move_reorganize_page():"
+					" %lu not moved in %p\n",
+					(ulong) i, (void*) lock);
+				ut_error;
+			}
+		}
+#endif /* UNIV_DEBUG */
+	}
+
+	lock_mutex_exit_kernel();
+
+	/* The lock copies are freed after the kernel mutex is released */
+	mem_heap_free(heap);
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+	ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+				     buf_block_get_zip_size(block),
+				     buf_block_get_page_no(block)));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list end is moved to another page. For every lock struct on block, the
+records of block starting from rec (or from the first user record, if rec
+is the infimum) are walked in lockstep with the records of new_block
+starting from its first user record; each lock bit found set on block is
+reset there and re-added, with the original type_mode, on the
+corresponding record of new_block. */
+UNIV_INTERN
+void
+lock_move_rec_list_end(
+/*===================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec)		/*!< in: record on page: this
+						is the first record moved */
+{
+	lock_t*		lock;
+	const ulint	comp	= page_rec_is_comp(rec);
+
+	lock_mutex_enter_kernel();
+
+	/* Note: when we move locks from record to record, waiting locks
+	and possible granted gap type locks behind them are enqueued in
+	the original order, because new elements are inserted to a hash
+	table to the end of the hash chain, and lock_rec_add_to_queue
+	does not reuse locks if there are waiters in the queue. */
+
+	for (lock = lock_rec_get_first_on_page(block); lock;
+	     lock = lock_rec_get_next_on_page(lock)) {
+		page_cur_t	cur1;
+		page_cur_t	cur2;
+		/* Remember the flags before the wait flag is possibly
+		reset below */
+		const ulint	type_mode = lock->type_mode;
+
+		/* cur1 walks the donating page from rec onwards, skipping
+		the infimum; cur2 walks the receiving page from its first
+		user record */
+		page_cur_position(rec, block, &cur1);
+
+		if (page_cur_is_before_first(&cur1)) {
+			page_cur_move_to_next(&cur1);
+		}
+
+		page_cur_set_before_first(new_block, &cur2);
+		page_cur_move_to_next(&cur2);
+
+		/* Copy lock requests on user records to new page and
+		reset the lock bits on the old */
+
+		while (!page_cur_is_after_last(&cur1)) {
+			ulint	heap_no;
+
+			if (comp) {
+				heap_no = rec_get_heap_no_new(
+					page_cur_get_rec(&cur1));
+			} else {
+				heap_no = rec_get_heap_no_old(
+					page_cur_get_rec(&cur1));
+				ut_ad(!memcmp(page_cur_get_rec(&cur1),
+					 page_cur_get_rec(&cur2),
+					 rec_get_data_size_old(
+						 page_cur_get_rec(&cur2))));
+			}
+
+			if (lock_rec_get_nth_bit(lock, heap_no)) {
+				lock_rec_reset_nth_bit(lock, heap_no);
+
+				if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+					lock_reset_lock_and_trx_wait(lock);
+				}
+
+				/* From here on heap_no is the heap number
+				of the record on the receiving page */
+				if (comp) {
+					heap_no = rec_get_heap_no_new(
+						page_cur_get_rec(&cur2));
+				} else {
+					heap_no = rec_get_heap_no_old(
+						page_cur_get_rec(&cur2));
+				}
+
+				lock_rec_add_to_queue(type_mode,
+						      new_block, heap_no,
+						      lock->index, lock->trx);
+			}
+
+			page_cur_move_to_next(&cur1);
+			page_cur_move_to_next(&cur2);
+		}
+	}
+
+	lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+	ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+				     buf_block_get_zip_size(block),
+				     buf_block_get_page_no(block)));
+	/* Validate new_block with its own space, zip size and page number */
+	ut_ad(lock_rec_validate_page(buf_block_get_space(new_block),
+				     buf_block_get_zip_size(new_block),
+				     buf_block_get_page_no(new_block)));
+#endif
+}
+
+/*************************************************************//**
+Moves the explicit locks on user records to another page if a record
+list start is moved to another page. For every lock struct on block, the
+records of block from the first user record up to, but not including, rec
+are walked in lockstep with the records of new_block following old_end;
+each lock bit found set on block is reset there and re-added, with the
+original type_mode, on the corresponding record of new_block. */
+UNIV_INTERN
+void
+lock_move_rec_list_start(
+/*=====================*/
+	const buf_block_t*	new_block,	/*!< in: index page to move to */
+	const buf_block_t*	block,		/*!< in: index page */
+	const rec_t*		rec,		/*!< in: record on page:
+						this is the first
+						record NOT copied */
+	const rec_t*		old_end)	/*!< in: old
+						previous-to-last
+						record on new_block
+						before the records
+						were copied */
+{
+	lock_t*		lock;
+	const ulint	comp	= page_rec_is_comp(rec);
+
+	ut_ad(block->frame == page_align(rec));
+	ut_ad(new_block->frame == page_align(old_end));
+
+	lock_mutex_enter_kernel();
+
+	for (lock = lock_rec_get_first_on_page(block); lock;
+	     lock = lock_rec_get_next_on_page(lock)) {
+		page_cur_t	cur1;
+		page_cur_t	cur2;
+		/* Remember the flags before the wait flag is possibly
+		reset below */
+		const ulint	type_mode = lock->type_mode;
+
+		/* cur1 starts at the first user record of the donating
+		page */
+		page_cur_set_before_first(block, &cur1);
+		page_cur_move_to_next(&cur1);
+
+		/* cur2 starts at the record following old_end on the
+		receiving page */
+		page_cur_position(old_end, new_block, &cur2);
+		page_cur_move_to_next(&cur2);
+
+		/* Copy lock requests on user records to new page and
+		reset the lock bits on the old */
+
+		while (page_cur_get_rec(&cur1) != rec) {
+			ulint	heap_no;
+
+			if (comp) {
+				heap_no = rec_get_heap_no_new(
+					page_cur_get_rec(&cur1));
+			} else {
+				heap_no = rec_get_heap_no_old(
+					page_cur_get_rec(&cur1));
+				ut_ad(!memcmp(page_cur_get_rec(&cur1),
+					      page_cur_get_rec(&cur2),
+					      rec_get_data_size_old(
+						      page_cur_get_rec(
+							      &cur2))));
+			}
+
+			if (lock_rec_get_nth_bit(lock, heap_no)) {
+				lock_rec_reset_nth_bit(lock, heap_no);
+
+				if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+					lock_reset_lock_and_trx_wait(lock);
+				}
+
+				/* From here on heap_no is the heap number
+				of the record on the receiving page */
+				if (comp) {
+					heap_no = rec_get_heap_no_new(
+						page_cur_get_rec(&cur2));
+				} else {
+					heap_no = rec_get_heap_no_old(
+						page_cur_get_rec(&cur2));
+				}
+
+				lock_rec_add_to_queue(type_mode,
+						      new_block, heap_no,
+						      lock->index, lock->trx);
+			}
+
+			page_cur_move_to_next(&cur1);
+			page_cur_move_to_next(&cur2);
+		}
+
+#ifdef UNIV_DEBUG
+		/* If rec is the supremum, all user records were moved:
+		check that no user record bit is left set in the lock */
+		if (page_rec_is_supremum(rec)) {
+			ulint	i;
+
+			for (i = PAGE_HEAP_NO_USER_LOW;
+			     i < lock_rec_get_n_bits(lock); i++) {
+				if (UNIV_UNLIKELY
+				    (lock_rec_get_nth_bit(lock, i))) {
+
+					fprintf(stderr,
+						"lock_move_rec_list_start():"
+						" %lu not moved in %p\n",
+						(ulong) i, (void*) lock);
+					ut_error;
+				}
+			}
+		}
+#endif /* UNIV_DEBUG */
+	}
+
+	lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG_LOCK_VALIDATE
+	ut_ad(lock_rec_validate_page(buf_block_get_space(block),
+				     buf_block_get_zip_size(block),
+				     buf_block_get_page_no(block)));
+#endif
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the right: the locks on
+the supremum of the left page are moved to the supremum of the right page,
+and the supremum of the left page then inherits, as gap locks, the locks
+of the record at the heap number given by
+lock_get_min_heap_no(right_block). */
+UNIV_INTERN
+void
+lock_update_split_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block)	/*!< in: left page */
+{
+	const ulint	first_right_heap_no
+		= lock_get_min_heap_no(right_block);
+
+	lock_mutex_enter_kernel();
+
+	/* Supremum of the left page -> supremum of the right page */
+
+	lock_rec_move(right_block, left_block,
+		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+	/* The successor of the infimum on the right page donates its
+	locks, as gap locks, to the supremum of the left page */
+
+	lock_rec_inherit_to_gap(left_block, right_block,
+				PAGE_HEAP_NO_SUPREMUM,
+				first_right_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is merged to the right: the original
+successor of the infimum on the right page inherits the locks of the
+supremum of the left page as gap locks, after which the locks on the left
+page supremum are reset and the lock structs of the left page are freed. */
+UNIV_INTERN
+void
+lock_update_merge_right(
+/*====================*/
+	const buf_block_t*	right_block,	/*!< in: right page to
+						which merged */
+	const rec_t*		orig_succ,	/*!< in: original
+						successor of infimum
+						on the right page
+						before merge */
+	const buf_block_t*	left_block)	/*!< in: merged index
+						page which will be
+						discarded */
+{
+	ulint	heir_heap_no;
+
+	lock_mutex_enter_kernel();
+
+	/* The heir is the record which used to follow the infimum on the
+	right page before the left page was merged to it */
+
+	heir_heap_no = page_rec_get_heap_no(orig_succ);
+
+	lock_rec_inherit_to_gap(right_block, left_block,
+				heir_heap_no, PAGE_HEAP_NO_SUPREMUM);
+
+	/* Clear the supremum of the left page; transactions waiting
+	there are released */
+
+	lock_rec_reset_and_release_wait(left_block,
+					PAGE_HEAP_NO_SUPREMUM);
+
+	/* The left page is going away: drop its lock structs */
+
+	lock_rec_free_all_from_discard_page(left_block);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when the root page is copied to another in
+btr_root_raise_and_insert: the locks on the supremum of the root are moved
+to the supremum of block. Note that we leave lock structs on the root
+page, even though they do not make sense on other than leaf pages: the
+reason is that in a pessimistic update the infimum record of the root
+page will act as a dummy carrier of the locks of the record to be
+updated. */
+UNIV_INTERN
+void
+lock_update_root_raise(
+/*===================*/
+	const buf_block_t*	block,	/*!< in: index page to which copied */
+	const buf_block_t*	root)	/*!< in: root page */
+{
+	const ulint	supremum_heap_no = PAGE_HEAP_NO_SUPREMUM;
+
+	lock_mutex_enter_kernel();
+
+	/* Supremum of the root -> supremum of block */
+
+	lock_rec_move(block, root, supremum_heap_no, supremum_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is copied to another and the original
+page is removed from the chain of leaf pages, except if page is the root!
+The locks on the supremum of the old page are moved to the supremum of
+new_block, and the lock structs of the old page are then freed. */
+UNIV_INTERN
+void
+lock_update_copy_and_discard(
+/*=========================*/
+	const buf_block_t*	new_block,	/*!< in: index page to
+						which copied */
+	const buf_block_t*	block)		/*!< in: index page;
+						NOT the root! */
+{
+	const ulint	supremum_heap_no = PAGE_HEAP_NO_SUPREMUM;
+
+	lock_mutex_enter_kernel();
+
+	/* Supremum of the old page -> supremum of new_block */
+
+	lock_rec_move(new_block, block,
+		      supremum_heap_no, supremum_heap_no);
+
+	/* The old page is going away: drop its lock structs */
+
+	lock_rec_free_all_from_discard_page(block);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a page is split to the left: the supremum of
+the left page inherits, as gap locks, the locks of the record at the heap
+number given by lock_get_min_heap_no(right_block). */
+UNIV_INTERN
+void
+lock_update_split_left(
+/*===================*/
+	const buf_block_t*	right_block,	/*!< in: right page */
+	const buf_block_t*	left_block)	/*!< in: left page */
+{
+	const ulint	first_right_heap_no
+		= lock_get_min_heap_no(right_block);
+
+	lock_mutex_enter_kernel();
+
+	/* The successor of the infimum on the right page donates its
+	locks, as gap locks, to the supremum of the left page */
+
+	lock_rec_inherit_to_gap(left_block, right_block,
+				PAGE_HEAP_NO_SUPREMUM,
+				first_right_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table after the right page has been merged into the
+left page and is about to be discarded. */
+UNIV_INTERN
+void
+lock_update_merge_left(
+/*===================*/
+	const buf_block_t*	left_block,	/*!< in: left page to
+						which merged */
+	const rec_t*		orig_pred,	/*!< in: original predecessor
+						of supremum on the left page
+						before merge */
+	const buf_block_t*	right_block)	/*!< in: merged index page
+						which will be discarded */
+{
+	const rec_t*	moved_rec;
+
+	ut_ad(left_block->frame == page_align(orig_pred));
+
+	lock_mutex_enter_kernel();
+
+	/* The record that now follows orig_pred; it is the supremum only
+	if nothing follows orig_pred on the left page. */
+	moved_rec = page_rec_get_next_const(orig_pred);
+
+	if (!page_rec_is_supremum(moved_rec)) {
+		ulint	moved_heap_no = page_rec_get_heap_no(moved_rec);
+
+		/* The first record that came from the right page
+		inherits, in gap mode, the locks of the left page's
+		supremum ... */
+		lock_rec_inherit_to_gap(left_block, left_block,
+					moved_heap_no,
+					PAGE_HEAP_NO_SUPREMUM);
+
+		/* ... whose own locks are then reset, releasing the
+		transactions waiting on them. */
+		lock_rec_reset_and_release_wait(left_block,
+						PAGE_HEAP_NO_SUPREMUM);
+	}
+
+	/* Supremum of the left page <- supremum of the right page. */
+	lock_rec_move(left_block, right_block,
+		      PAGE_HEAP_NO_SUPREMUM, PAGE_HEAP_NO_SUPREMUM);
+
+	lock_rec_free_all_from_discard_page(right_block);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Resets the original locks on heir and replaces them with gap type locks
+inherited from rec. Both steps are performed under a single hold of the
+kernel mutex, which this function acquires and releases itself. */
+UNIV_INTERN
+void
+lock_rec_reset_and_inherit_gap_locks(
+/*=================================*/
+	const buf_block_t*	heir_block,	/*!< in: block containing the
+						record which inherits */
+	const buf_block_t*	block,		/*!< in: block containing the
+						record from which inherited;
+						does NOT reset the locks on
+						this record */
+	ulint			heir_heap_no,	/*!< in: heap_no of the
+						inheriting record */
+	ulint			heap_no)	/*!< in: heap_no of the
+						donating record */
+{
+	mutex_enter(&kernel_mutex);
+
+	/* First clear whatever is currently set on the heir ... */
+	lock_rec_reset_and_release_wait(heir_block, heir_heap_no);
+
+	/* ... then let it inherit from the donating record, which itself
+	is left untouched. */
+	lock_rec_inherit_to_gap(heir_block, block, heir_heap_no, heap_no);
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*************************************************************//**
+Updates the lock table when a page is discarded: every record of the
+page, from the infimum up to and including the supremum, donates its
+locks in gap mode to the heir record and has its own locks reset. */
+UNIV_INTERN
+void
+lock_update_discard(
+/*================*/
+	const buf_block_t*	heir_block,	/*!< in: index page
+						which will inherit the locks */
+	ulint			heir_heap_no,	/*!< in: heap_no of the record
+						which will inherit the locks */
+	const buf_block_t*	block)		/*!< in: index page
+						which will be discarded */
+{
+	const page_t*	page = block->frame;
+	const rec_t*	rec;
+	ulint		heap_no;
+	ulint		comp;
+
+	lock_mutex_enter_kernel();
+
+	/* With no lock on the page there is nothing to inherit or free. */
+	if (lock_rec_get_first_on_page(block) != NULL) {
+
+		/* The record format decides where the infimum lives and
+		which accessor reads the heap number. */
+		comp = page_is_comp(page) ? TRUE : FALSE;
+
+		rec = page + (comp ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM);
+
+		do {
+			heap_no = comp
+				? rec_get_heap_no_new(rec)
+				: rec_get_heap_no_old(rec);
+
+			lock_rec_inherit_to_gap(heir_block, block,
+						heir_heap_no, heap_no);
+
+			lock_rec_reset_and_release_wait(block, heap_no);
+
+			rec = page + rec_get_next_offs(rec, comp);
+
+			/* The supremum is processed too; the walk stops
+			right after it. */
+		} while (heap_no != PAGE_HEAP_NO_SUPREMUM);
+
+		lock_rec_free_all_from_discard_page(block);
+	}
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a new user record is inserted: the new
+record inherits the gap-type locks of its successor on the page. */
+UNIV_INTERN
+void
+lock_update_insert(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec)	/*!< in: the inserted record */
+{
+	const rec_t*	next_rec;
+	ulint		inserted_heap_no;
+	ulint		next_heap_no;
+
+	ut_ad(block->frame == page_align(rec));
+
+	/* The heap numbers are read with the accessor matching the record
+	format, before the kernel mutex is taken. */
+
+	if (page_rec_is_comp(rec)) {
+		inserted_heap_no = rec_get_heap_no_new(rec);
+		next_rec = page_rec_get_next_low(rec, TRUE);
+		next_heap_no = rec_get_heap_no_new(next_rec);
+	} else {
+		inserted_heap_no = rec_get_heap_no_old(rec);
+		next_rec = page_rec_get_next_low(rec, FALSE);
+		next_heap_no = rec_get_heap_no_old(next_rec);
+	}
+
+	lock_mutex_enter_kernel();
+
+	/* Receiver: the inserted record; donator: the next record. */
+	lock_rec_inherit_to_gap_if_gap_lock(block,
+					    inserted_heap_no, next_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*************************************************************//**
+Updates the lock table when a record is removed: its successor inherits
+the locks in gap mode and the locks on the removed record are reset. */
+UNIV_INTERN
+void
+lock_update_delete(
+/*===============*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec)	/*!< in: the record to be removed */
+{
+	const page_t*	page = block->frame;
+	const rec_t*	next_rec;
+	ulint		removed_heap_no;
+	ulint		heir_heap_no;
+
+	ut_ad(page == page_align(rec));
+
+	/* Both heap numbers are read before the kernel mutex is taken. */
+
+	if (page_is_comp(page)) {
+		removed_heap_no = rec_get_heap_no_new(rec);
+		next_rec = page + rec_get_next_offs(rec, TRUE);
+		heir_heap_no = rec_get_heap_no_new(next_rec);
+	} else {
+		removed_heap_no = rec_get_heap_no_old(rec);
+		next_rec = page + rec_get_next_offs(rec, FALSE);
+		heir_heap_no = rec_get_heap_no_old(next_rec);
+	}
+
+	lock_mutex_enter_kernel();
+
+	/* The successor inherits from rec, in gap mode ... */
+	lock_rec_inherit_to_gap(block, block, heir_heap_no, removed_heap_no);
+
+	/* ... and rec gets its lock bits reset, which also releases the
+	transactions waiting on it. */
+	lock_rec_reset_and_release_wait(block, removed_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*********************************************************************//**
+Parks the explicit locks of a record on the infimum record of the same
+page. Used when an update changes the size of a record: the record is
+then moved, perhaps to another page, and meanwhile the infimum serves as
+a dummy carrier record that takes care of lock releases. */
+UNIV_INTERN
+void
+lock_rec_store_on_page_infimum(
+/*===========================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec)	/*!< in: record whose lock state
+					is stored on the infimum
+					record of the same page; lock
+					bits are reset on the
+					record */
+{
+	ulint	rec_heap_no;
+
+	rec_heap_no = page_rec_get_heap_no(rec);
+
+	ut_ad(block->frame == page_align(rec));
+
+	lock_mutex_enter_kernel();
+
+	/* Same page on both sides: infimum <- rec. */
+	lock_rec_move(block, block, PAGE_HEAP_NO_INFIMUM, rec_heap_no);
+
+	lock_mutex_exit_kernel();
+}
+
+/*********************************************************************//**
+Moves explicit lock requests that were parked on the infimum of a page
+back onto a single record. */
+UNIV_INTERN
+void
+lock_rec_restore_from_page_infimum(
+/*===============================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec,	/*!< in: record whose lock state
+					is restored */
+	const buf_block_t*	donator)/*!< in: page (rec is not
+					necessarily on this page)
+					whose infimum stored the lock
+					state; lock bits are reset on
+					the infimum */
+{
+	ulint	rec_heap_no;
+
+	rec_heap_no = page_rec_get_heap_no(rec);
+
+	lock_mutex_enter_kernel();
+
+	/* rec on block <- infimum of the donator page. */
+	lock_rec_move(block, donator, rec_heap_no, PAGE_HEAP_NO_INFIMUM);
+
+	lock_mutex_exit_kernel();
+}
+
+/*=========== DEADLOCK CHECKING ======================================*/
+
+/********************************************************************//**
+Checks if a lock request results in a deadlock. The search is repeated
+for as long as it ends with some other transaction being chosen as the
+victim; the step counter is carried over between the repetitions.
+@return TRUE if a deadlock was detected and we chose trx as a victim;
+FALSE if no deadlock, or there was a deadlock, but we chose other
+transaction(s) as victim(s) */
+static
+ibool
+lock_deadlock_occurs(
+/*=================*/
+	lock_t*	lock,	/*!< in: lock the transaction is requesting */
+	trx_t*	trx)	/*!< in: transaction */
+{
+	trx_t*		t;
+	ulint		result;
+	ulint		cost	= 0;
+
+	ut_ad(trx);
+	ut_ad(lock);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	do {
+		/* Adding this trx to the waits-for graph must not
+		produce a cycle. Every search starts with the mark of
+		each transaction in trx_sys->trx_list cleared. */
+
+		for (t = UT_LIST_GET_FIRST(trx_sys->trx_list);
+		     t != NULL;
+		     t = UT_LIST_GET_NEXT(trx_list, t)) {
+
+			t->deadlock_mark = 0;
+		}
+
+		result = lock_deadlock_recursive(trx, trx, lock, &cost, 0);
+
+		/* If some other trx was picked as the victim, search
+		again: there may still be a deadlock. */
+	} while (result == LOCK_VICTIM_IS_OTHER);
+
+	if (result == LOCK_EXCEED_MAX_DEPTH) {
+		/* The search went over the max step count or the max
+		depth: the current trx becomes the victim. Print its
+		information. */
+		rewind(lock_latest_err_file);
+		ut_print_timestamp(lock_latest_err_file);
+
+		fputs("TOO DEEP OR LONG SEARCH IN THE LOCK TABLE"
+		      " WAITS-FOR GRAPH, WE WILL ROLL BACK"
+		      " FOLLOWING TRANSACTION \n",
+		      lock_latest_err_file);
+
+		fputs("\n*** TRANSACTION:\n", lock_latest_err_file);
+		trx_print(lock_latest_err_file, trx, 3000);
+
+		fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n",
+		      lock_latest_err_file);
+
+		if (lock_get_type(lock) == LOCK_REC) {
+			lock_rec_print(lock_latest_err_file, lock);
+		} else {
+			lock_table_print(lock_latest_err_file, lock);
+		}
+
+	} else if (result == LOCK_VICTIM_IS_START) {
+		srv_n_lock_deadlock_count++;
+		fputs("*** WE ROLL BACK TRANSACTION (2)\n",
+		      lock_latest_err_file);
+
+	} else {
+		/* No deadlock detected */
+		return(FALSE);
+	}
+
+	/* Both remaining outcomes make trx the victim. */
+	lock_deadlock_found = TRUE;
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Looks recursively for a deadlock. The lock requests ahead of wait_lock in
+its queue are examined one by one; for each request that wait_lock has to
+wait for and whose transaction is itself in a lock wait, the search
+recurses into that transaction's wait_lock. The caller must hold the
+kernel mutex.
+@return 0 if no deadlock found, LOCK_VICTIM_IS_START if there was a
+deadlock and we chose 'start' as the victim, LOCK_VICTIM_IS_OTHER if a
+deadlock was found and we chose some other trx as a victim: we must do
+the search again in this last case because there may be another
+deadlock!
+LOCK_EXCEED_MAX_DEPTH if the lock search exceeds max steps or max depth. */
+static
+ulint
+lock_deadlock_recursive(
+/*====================*/
+	trx_t*	start,		/*!< in: recursion starting point */
+	trx_t*	trx,		/*!< in: a transaction waiting for a lock */
+	lock_t*	wait_lock,	/*!< in: lock that is waiting to be granted */
+	ulint*	cost,		/*!< in/out: number of calculation steps thus
+				far: if this exceeds LOCK_MAX_N_STEPS_...
+				we return LOCK_EXCEED_MAX_DEPTH */
+	ulint	depth)		/*!< in: recursion depth: if this exceeds
+				LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK, we
+				return LOCK_EXCEED_MAX_DEPTH */
+{
+	ulint	ret;
+	lock_t*	lock;
+	trx_t*	lock_trx;
+	/* Stays ULINT_UNDEFINED for a table lock; below it doubles as the
+	"wait_lock is a record lock" flag. */
+	ulint	heap_no		= ULINT_UNDEFINED;
+
+	ut_a(trx);
+	ut_a(start);
+	ut_a(wait_lock);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (trx->deadlock_mark == 1) {
+		/* We have already exhaustively searched the subtree starting
+		from this trx */
+
+		return(0);
+	}
+
+	/* Each call that actually searches counts as one step. */
+	*cost = *cost + 1;
+
+	if (lock_get_type_low(wait_lock) == LOCK_REC) {
+		ulint		space;
+		ulint		page_no;
+
+		/* The record that wait_lock is waiting for. */
+		heap_no = lock_rec_find_set_bit(wait_lock);
+		ut_a(heap_no != ULINT_UNDEFINED);
+
+		space = wait_lock->un_member.rec_lock.space;
+		page_no = wait_lock->un_member.rec_lock.page_no;
+
+		lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+		/* Position the iterator on the first matching record lock. */
+		while (lock != NULL
+		       && lock != wait_lock
+		       && !lock_rec_get_nth_bit(lock, heap_no)) {
+
+			lock = lock_rec_get_next_on_page(lock);
+		}
+
+		/* Reaching wait_lock itself means that no lock on this
+		record is ahead of it. */
+		if (lock == wait_lock) {
+			lock = NULL;
+		}
+
+		ut_ad(lock == NULL || lock_rec_get_nth_bit(lock, heap_no));
+
+	} else {
+		/* Table lock: the loop below steps backwards in the table
+		lock list, starting from wait_lock. */
+		lock = wait_lock;
+	}
+
+	/* Look at the locks ahead of wait_lock in the lock queue */
+
+	for (;;) {
+		/* Get previous table lock. */
+		if (heap_no == ULINT_UNDEFINED) {
+
+			lock = UT_LIST_GET_PREV(
+				un_member.tab_lock.locks, lock);
+		}
+
+		if (lock == NULL) {
+			/* We can mark this subtree as searched */
+			trx->deadlock_mark = 1;
+
+			/* NOTE(review): FALSE stands in here for the
+			documented 0 ("no deadlock found") return value;
+			this assumes FALSE == 0. */
+			return(FALSE);
+		}
+
+		if (lock_has_to_wait(wait_lock, lock)) {
+
+			/* Evaluated up front, but acted upon only after
+			the check for a closed cycle below. */
+			ibool	too_far
+				= depth > LOCK_MAX_DEPTH_IN_DEADLOCK_CHECK
+				|| *cost > LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK;
+
+			lock_trx = lock->trx;
+
+			if (lock_trx == start) {
+
+				/* We came back to the recursion starting
+				point: a deadlock detected. (A search that
+				went too far is handled separately below.) */
+
+				FILE*	ef = lock_latest_err_file;
+
+				rewind(ef);
+				ut_print_timestamp(ef);
+
+				fputs("\n*** (1) TRANSACTION:\n", ef);
+
+				trx_print(ef, wait_lock->trx, 3000);
+
+				fputs("*** (1) WAITING FOR THIS LOCK"
+				      " TO BE GRANTED:\n", ef);
+
+				if (lock_get_type_low(wait_lock) == LOCK_REC) {
+					lock_rec_print(ef, wait_lock);
+				} else {
+					lock_table_print(ef, wait_lock);
+				}
+
+				fputs("*** (2) TRANSACTION:\n", ef);
+
+				trx_print(ef, lock->trx, 3000);
+
+				fputs("*** (2) HOLDS THE LOCK(S):\n", ef);
+
+				if (lock_get_type_low(lock) == LOCK_REC) {
+					lock_rec_print(ef, lock);
+				} else {
+					lock_table_print(ef, lock);
+				}
+
+				fputs("*** (2) WAITING FOR THIS LOCK"
+				      " TO BE GRANTED:\n", ef);
+
+				if (lock_get_type_low(start->wait_lock)
+				    == LOCK_REC) {
+					lock_rec_print(ef, start->wait_lock);
+				} else {
+					lock_table_print(ef, start->wait_lock);
+				}
+#ifdef UNIV_DEBUG
+				if (lock_print_waits) {
+					fputs("Deadlock detected\n",
+					      stderr);
+				}
+#endif /* UNIV_DEBUG */
+
+				if (trx_weight_cmp(wait_lock->trx,
+						   start) >= 0) {
+					/* Our recursion starting point
+					transaction is 'smaller', let us
+					choose 'start' as the victim and roll
+					back it */
+
+					return(LOCK_VICTIM_IS_START);
+				}
+
+				lock_deadlock_found = TRUE;
+
+				/* Let us choose the transaction of wait_lock
+				as a victim to try to avoid deadlocking our
+				recursion starting point transaction */
+
+				fputs("*** WE ROLL BACK TRANSACTION (1)\n",
+				      ef);
+
+				wait_lock->trx->was_chosen_as_deadlock_victim
+					= TRUE;
+
+				lock_cancel_waiting_and_release(wait_lock);
+
+				/* trx and wait_lock are no longer in the
+				waits-for graph. We report
+				LOCK_VICTIM_IS_OTHER so that the caller
+				searches again; note that our selective
+				algorithm can choose several transactions as
+				victims, but still we may end up rolling back
+				also the recursion starting point
+				transaction! */
+
+				return(LOCK_VICTIM_IS_OTHER);
+			}
+
+			if (too_far) {
+
+#ifdef UNIV_DEBUG
+				if (lock_print_waits) {
+					fputs("Deadlock search exceeds"
+					      " max steps or depth.\n",
+					      stderr);
+				}
+#endif /* UNIV_DEBUG */
+				/* The information about transaction/lock
+				to be rolled back is available in the top
+				level. Do not print anything here. */
+				return(LOCK_EXCEED_MAX_DEPTH);
+			}
+
+			if (lock_trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+				/* Another trx ahead has requested lock in an
+				incompatible mode, and is itself waiting for
+				a lock */
+
+				ret = lock_deadlock_recursive(
+					start, lock_trx,
+					lock_trx->wait_lock, cost, depth + 1);
+
+				/* Any non-zero result ends the whole search
+				and is passed up unchanged. */
+				if (ret != 0) {
+
+					return(ret);
+				}
+			}
+		}
+		/* Get the next record lock to check. */
+		if (heap_no != ULINT_UNDEFINED) {
+
+			ut_a(lock != NULL);
+
+			do {
+				lock = lock_rec_get_next_on_page(lock);
+			} while (lock != NULL
+				&& lock != wait_lock
+				&& !lock_rec_get_nth_bit(lock, heap_no));
+
+			/* As above: arriving at wait_lock ends the scan. */
+			if (lock == wait_lock) {
+				lock = NULL;
+			}
+		}
+	}/* end of the 'for (;;)'-loop */
+}
+
+/*========================= TABLE LOCKS ==============================*/
+
+/*********************************************************************//**
+Creates a table lock object and appends it both to the lock list of the
+transaction and to the lock queue of the table. Does NOT check for
+deadlocks or lock compatibility.
+@return	own: new lock object */
+UNIV_INLINE
+lock_t*
+lock_table_create(
+/*==============*/
+	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	ulint		type_mode,/*!< in: lock mode possibly ORed with
+				LOCK_WAIT */
+	trx_t*		trx)	/*!< in: trx */
+{
+	lock_t*	new_lock;
+
+	ut_ad(table && trx);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* AUTOINC requests are counted whether they are granted or
+	waiting. */
+	if ((type_mode & LOCK_MODE_MASK) == LOCK_AUTO_INC) {
+		++table->n_waiting_or_granted_auto_inc_locks;
+	}
+
+	if (type_mode != LOCK_AUTO_INC) {
+		/* Everything except a plain AUTOINC request, a waiting
+		AUTOINC request included, is allocated from the lock heap
+		of the transaction. */
+		new_lock = mem_heap_alloc(trx->lock_heap, sizeof(*new_lock));
+	} else {
+		/* An AUTOINC request without any other flag reuses the
+		lock instance stored in the table and is recorded in the
+		AUTOINC vector of the transaction. */
+		new_lock = table->autoinc_lock;
+
+		table->autoinc_trx = trx;
+
+		ib_vector_push(trx->autoinc_locks, new_lock);
+	}
+
+	UT_LIST_ADD_LAST(trx_locks, trx->trx_locks, new_lock);
+
+	new_lock->type_mode = type_mode | LOCK_TABLE;
+	new_lock->trx = trx;
+
+	new_lock->un_member.tab_lock.table = table;
+
+	UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, new_lock);
+
+	if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) {
+
+		lock_set_lock_and_trx_wait(new_lock, trx);
+	}
+
+	return(new_lock);
+}
+
+/*************************************************************//**
+Unlinks a table lock request from the lock queue of the table and from
+the lock list of its transaction. This is a low-level function which
+does NOT check if waiting requests can now be granted. */
+UNIV_INLINE
+void
+lock_table_remove_low(
+/*==================*/
+	lock_t*	lock)	/*!< in: table lock */
+{
+	trx_t*		owner;
+	dict_table_t*	table;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	owner = lock->trx;
+	table = lock->un_member.tab_lock.table;
+
+	/* An AUTOINC lock needs extra bookkeeping in the table and in the
+	AUTOINC vector of the transaction. */
+	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
+
+		/* The AUTOINC lock of the table may already have been
+		transferred to another transaction by now. */
+		if (table->autoinc_trx == owner) {
+			table->autoinc_trx = NULL;
+		}
+
+		/* Locks have to be freed in the reverse of the order in
+		which they were acquired, so that the AUTOINC vector never
+		needs to be searched.
+
+		Only granted locks are stored in the trx->autoinc_locks
+		vector (see lock_table_create() and lock_grant()), so
+		the vector may be empty and that has to be checked. */
+
+		if (!lock_get_wait(lock)) {
+			if (!ib_vector_is_empty(owner->autoinc_locks)) {
+				lock_t*	popped;
+
+				popped = ib_vector_pop(owner->autoinc_locks);
+				ut_a(popped == lock);
+			}
+		}
+
+		ut_a(table->n_waiting_or_granted_auto_inc_locks > 0);
+		--table->n_waiting_or_granted_auto_inc_locks;
+	}
+
+	UT_LIST_REMOVE(trx_locks, owner->trx_locks, lock);
+	UT_LIST_REMOVE(un_member.tab_lock.locks, table->locks, lock);
+}
+
+/*********************************************************************//**
+Puts a table lock request that cannot be granted immediately into the
+queue as a waiting request. Checks for deadlocks.
+@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or
+DB_SUCCESS; DB_SUCCESS means that there was a deadlock, but another
+transaction was chosen as a victim, and we got the lock immediately:
+no need to wait then */
+static
+ulint
+lock_table_enqueue_waiting(
+/*=======================*/
+	ulint		mode,	/*!< in: lock mode this transaction is
+				requesting */
+	dict_table_t*	table,	/*!< in: table */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	lock_t*	waiting_lock;
+	trx_t*	trx;
+	ulint   sec;
+	ulint   usec;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* If the query thread has to be stopped for some other reason
+	anyway, no lock request is enqueued. */
+
+	if (que_thr_stop(thr)) {
+		ut_error;
+
+		return(DB_QUE_THR_SUSPENDED);
+	}
+
+	trx = thr_get_trx(thr);
+
+	/* A table lock wait inside a dictionary operation is reported
+	on stderr; the request is enqueued all the same. */
+	switch (trx_get_dict_operation(trx)) {
+	case TRX_DICT_OP_TABLE:
+	case TRX_DICT_OP_INDEX:
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: a table lock wait happens"
+		      " in a dictionary operation!\n"
+		      "InnoDB: Table name ", stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs(".\n"
+		      "InnoDB: Submit a detailed bug report"
+		      " to http://bugs.mysql.com\n",
+		      stderr);
+		break;
+	case TRX_DICT_OP_NONE:
+		break;
+	}
+
+	/* Create the request that will wait to be granted. */
+
+	waiting_lock = lock_table_create(table, mode | LOCK_WAIT, trx);
+
+	/* On a deadlock with trx as the victim the request is taken out
+	again and an error code is returned. */
+
+	if (lock_deadlock_occurs(waiting_lock, trx)) {
+
+		/* The order matters: the state of the lock must still be
+		intact when it is removed. */
+		lock_table_remove_low(waiting_lock);
+		lock_reset_lock_and_trx_wait(waiting_lock);
+
+		return(DB_DEADLOCK);
+	}
+
+	if (trx->wait_lock == NULL) {
+		/* Deadlock resolution picked another transaction as the
+		victim and, as a side effect, our lock got granted. */
+
+		return(DB_SUCCESS);
+	}
+
+	if (innobase_get_slow_log() && trx->take_stats) {
+		/* The second value is added to sec * 1000000, i.e. it is
+		treated as a count of microseconds. */
+		ut_usectime(&sec, &usec);
+		trx->lock_que_wait_ustarted
+			= (ib_uint64_t)sec * 1000000 + usec;
+	}
+
+	trx->que_state = TRX_QUE_LOCK_WAIT;
+	trx->was_chosen_as_deadlock_victim = FALSE;
+	trx->wait_started = time(NULL);
+
+	ut_a(que_thr_stop(thr));
+
+	return(DB_LOCK_WAIT);
+}
+
+/*********************************************************************//**
+Searches the lock queue of a table, from the last request towards the
+first, for a request of some other transaction whose mode is
+incompatible with the given mode.
+@return	lock or NULL */
+UNIV_INLINE
+lock_t*
+lock_table_other_has_incompatible(
+/*==============================*/
+	trx_t*		trx,	/*!< in: transaction, or NULL if all
+				transactions should be included */
+	ulint		wait,	/*!< in: LOCK_WAIT if also waiting locks are
+				taken into account, or 0 if not */
+	dict_table_t*	table,	/*!< in: table */
+	enum lock_mode	mode)	/*!< in: lock mode */
+{
+	lock_t*	other;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (other = UT_LIST_GET_LAST(table->locks);
+	     other != NULL;
+	     other = UT_LIST_GET_PREV(un_member.tab_lock.locks, other)) {
+
+		if (other->trx == trx) {
+			/* Requests of trx itself are ignored. */
+			continue;
+		}
+
+		if (lock_mode_compatible(lock_get_mode(other), mode)) {
+			continue;
+		}
+
+		if (!wait && lock_get_wait(other)) {
+			/* Waiting requests count only if asked for. */
+			continue;
+		}
+
+		return(other);
+	}
+
+	return(NULL);
+}
+
+/*********************************************************************//**
+Locks the specified database table in the given mode. If the lock cannot
+be granted at once, a waiting request is enqueued for the query thread.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_table(
+/*=======*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is set,
+				does nothing */
+	dict_table_t*	table,	/*!< in: database table in dictionary cache */
+	enum lock_mode	mode,	/*!< in: lock mode */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	trx_t*	trx;
+	ulint	err;
+
+	ut_ad(table && thr);
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	ut_a(flags == 0);
+
+	trx = thr_get_trx(thr);
+
+	lock_mutex_enter_kernel();
+
+	if (lock_table_has(trx, table, mode)) {
+		/* trx already holds a lock on the table that is at least
+		as strong: nothing to do. */
+
+		err = DB_SUCCESS;
+
+	} else if (lock_table_other_has_incompatible(trx, LOCK_WAIT,
+						     table, mode)) {
+		/* Some other trx has a request, granted or waiting, on
+		the table in an incompatible mode: this trx may have to
+		wait. */
+
+		err = lock_table_enqueue_waiting(mode | flags, table, thr);
+
+	} else {
+		/* Nothing is in the way: the lock is created as granted. */
+
+		lock_table_create(table, mode | flags, trx);
+
+		ut_a(!flags || mode == LOCK_S || mode == LOCK_X);
+
+		err = DB_SUCCESS;
+	}
+
+	lock_mutex_exit_kernel();
+
+	return(err);
+}
+
+/*********************************************************************//**
+Checks whether a waiting table lock request must go on waiting, i.e.
+whether it has to wait for any request that precedes it in the lock
+queue of the table.
+@return	TRUE if still has to wait */
+static
+ibool
+lock_table_has_to_wait_in_queue(
+/*============================*/
+	lock_t*	wait_lock)	/*!< in: waiting table lock */
+{
+	dict_table_t*	table;
+	lock_t*		ahead;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(lock_get_wait(wait_lock));
+
+	table = wait_lock->un_member.tab_lock.table;
+
+	/* Only the requests in front of wait_lock are looked at. */
+	for (ahead = UT_LIST_GET_FIRST(table->locks);
+	     ahead != wait_lock;
+	     ahead = UT_LIST_GET_NEXT(un_member.tab_lock.locks, ahead)) {
+
+		if (lock_has_to_wait(wait_lock, ahead)) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*************************************************************//**
+Takes a table lock request, waiting or granted, out of the queue and
+grants the requests queued behind it that no longer have to wait. */
+static
+void
+lock_table_dequeue(
+/*===============*/
+	lock_t*	in_lock)/*!< in: table lock object; transactions waiting
+			behind will get their lock requests granted, if
+			they are now qualified to it */
+{
+	lock_t*	behind;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(lock_get_type_low(in_lock) == LOCK_TABLE);
+
+	/* The successor is looked up while in_lock is still linked. */
+	behind = UT_LIST_GET_NEXT(un_member.tab_lock.locks, in_lock);
+
+	lock_table_remove_low(in_lock);
+
+	/* Grant every waiting request behind the removed one that has no
+	conflicting request ahead of it any more. */
+
+	for (;
+	     behind != NULL;
+	     behind = UT_LIST_GET_NEXT(un_member.tab_lock.locks, behind)) {
+
+		if (!lock_get_wait(behind)) {
+			continue;
+		}
+
+		if (lock_table_has_to_wait_in_queue(behind)) {
+			continue;
+		}
+
+		lock_grant(behind);
+	}
+}
+
+/*=========================== LOCK RELEASE ==============================*/
+
+/*************************************************************//**
+Releases a granted record lock of a transaction on one record and grants
+the requests waiting in the queue of that record that are now entitled
+to a lock. If the transaction has no lock of the given mode on the
+record, an error is printed to stderr and nothing else is done. */
+UNIV_INTERN
+void
+lock_rec_unlock(
+/*============*/
+	trx_t*			trx,	/*!< in: transaction that has
+					set a record lock */
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec,	/*!< in: record */
+	enum lock_mode		lock_mode)/*!< in: LOCK_S or LOCK_X */
+{
+	lock_t*	first;
+	lock_t*	lock;
+	ulint	heap_no;
+
+	ut_ad(trx && rec);
+	ut_ad(block->frame == page_align(rec));
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	mutex_enter(&kernel_mutex);
+
+	first = lock_rec_get_first(block, heap_no);
+
+	/* Find the first lock on the record that belongs to trx and has
+	the requested mode. */
+
+	lock = first;
+
+	while (lock != NULL
+	       && !(lock->trx == trx && lock_get_mode(lock) == lock_mode)) {
+
+		lock = lock_rec_get_next(heap_no, lock);
+	}
+
+	if (lock == NULL) {
+		/* No such lock: report it after leaving the mutex. */
+
+		mutex_exit(&kernel_mutex);
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: unlock row could not"
+			" find a %lu mode lock on the record\n",
+			(ulong) lock_mode);
+
+		return;
+	}
+
+	ut_a(!lock_get_wait(lock));
+	lock_rec_reset_nth_bit(lock, heap_no);
+
+	/* See which waiting requests on the record can be granted now. */
+
+	for (lock = first; lock != NULL;
+	     lock = lock_rec_get_next(heap_no, lock)) {
+
+		if (!lock_get_wait(lock)) {
+			continue;
+		}
+
+		if (!lock_rec_has_to_wait_in_queue(lock)) {
+
+			lock_grant(lock);
+		}
+	}
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Releases all locks of a transaction, letting other transactions that
+were waiting because of them proceed. The kernel mutex must be held by
+the caller; it is released and reacquired at regular intervals while
+the locks are processed. */
+UNIV_INTERN
+void
+lock_release_off_kernel(
+/*====================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	ulint	n_since_yield = 0;
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Always take the last lock of the transaction; each round
+	dequeues it, so the list shrinks until it is empty. */
+
+	for (lock = UT_LIST_GET_LAST(trx->trx_locks);
+	     lock != NULL;
+	     lock = UT_LIST_GET_LAST(trx->trx_locks)) {
+
+		n_since_yield++;
+
+		if (lock_get_type_low(lock) != LOCK_REC) {
+			ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+			if (lock_get_mode(lock) != LOCK_IS
+			    && !ut_dulint_is_zero(trx->undo_no)) {
+
+				/* The trx may have modified the table:
+				the MySQL query cache is blocked for all
+				currently active transactions. */
+
+				dict_table_t*	table
+					= lock->un_member.tab_lock.table;
+
+				table->query_cache_inv_trx_id
+					= trx_sys->max_trx_id;
+			}
+
+			lock_table_dequeue(lock);
+		} else {
+			lock_rec_dequeue_from_page(lock);
+		}
+
+		if (n_since_yield == LOCK_RELEASE_KERNEL_INTERVAL) {
+			/* Let go of the kernel mutex for a moment so
+			that it is not monopolized. */
+
+			lock_mutex_exit_kernel();
+
+			lock_mutex_enter_kernel();
+
+			n_since_yield = 0;
+		}
+	}
+
+	ut_a(ib_vector_size(trx->autoinc_locks) == 0);
+
+	mem_heap_empty(trx->lock_heap);
+}
+
+/*********************************************************************//**
+Cancels a waiting lock request, releases possible other transactions
+waiting behind it, and ends the lock wait of the transaction that owns
+the request. */
+UNIV_INTERN
+void
+lock_cancel_waiting_and_release(
+/*============================*/
+	lock_t*	lock)	/*!< in: waiting lock request */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (lock_get_type_low(lock) != LOCK_REC) {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		if (lock->trx->autoinc_locks != NULL) {
+			/* Release the AUTOINC locks of the transaction. */
+			lock_release_autoinc_locks(lock->trx);
+		}
+
+		lock_table_dequeue(lock);
+	} else {
+		lock_rec_dequeue_from_page(lock);
+	}
+
+	/* Clear the wait flag of the lock and the pointer from the trx
+	back to the lock. */
+
+	lock_reset_lock_and_trx_wait(lock);
+
+	/* This call releases the trx from its lock wait. */
+
+	trx_end_lock_wait(lock->trx);
+}
+
+/* True if the mode of a lock, as returned by lock_get_mode(), is LOCK_S
+or LOCK_X. The argument appears twice in the expansion, so it should be
+an expression without side effects. */
+#define IS_LOCK_S_OR_X(lock) \
+	(lock_get_mode(lock) == LOCK_S \
+	 || lock_get_mode(lock) == LOCK_X)
+
+
+/*********************************************************************//**
+Removes the locks that one transaction holds on a table to be dropped.
+Record locks and table-level locks other than S and X are always
+removed; table-level S and X locks are removed only if
+remove_also_table_sx_locks is TRUE.
+No lock, that is going to be removed, is allowed to be a wait lock. */
+static
+void
+lock_remove_all_on_table_for_trx(
+/*=============================*/
+	dict_table_t*	table,			/*!< in: table to be dropped */
+	trx_t*		trx,			/*!< in: a transaction */
+	ibool		remove_also_table_sx_locks)/*!< in: also removes
+						table S and X locks */
+{
+	lock_t*	lock;
+	lock_t*	older;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* The list is walked from its end; the predecessor is saved
+	before the current lock is possibly unlinked. */
+
+	for (lock = UT_LIST_GET_LAST(trx->trx_locks);
+	     lock != NULL;
+	     lock = older) {
+
+		older = UT_LIST_GET_PREV(trx_locks, lock);
+
+		if (lock_get_type_low(lock) == LOCK_REC
+		    && lock->index->table == table) {
+
+			ut_a(!lock_get_wait(lock));
+
+			lock_rec_discard(lock);
+
+		} else if ((lock_get_type_low(lock) & LOCK_TABLE)
+			   && lock->un_member.tab_lock.table == table
+			   && (remove_also_table_sx_locks
+			       || !IS_LOCK_S_OR_X(lock))) {
+
+			ut_a(!lock_get_wait(lock));
+
+			lock_table_remove_low(lock);
+		}
+	}
+}
+
+/*********************************************************************//**
+Removes locks on a table to be dropped or truncated.
+If remove_also_table_sx_locks is TRUE then table-level S and X locks are
+also removed in addition to other table-level and record-level locks.
+No lock, that is going to be removed, is allowed to be a wait lock.
+The table lock queue is scanned from its first element; for the owner of
+each lock visited, lock_remove_all_on_table_for_trx() is called. That
+call may unlink the current lock from the queue, so the position to
+continue from is re-derived after every call. */
+UNIV_INTERN
+void
+lock_remove_all_on_table(
+/*=====================*/
+	dict_table_t*	table,			/*!< in: table to be dropped
+						or truncated */
+	ibool		remove_also_table_sx_locks)/*!< in: also removes
+						table S and X locks */
+{
+	lock_t*	lock;
+	lock_t*	prev_lock;
+
+	mutex_enter(&kernel_mutex);
+
+	lock = UT_LIST_GET_FIRST(table->locks);
+
+	while (lock != NULL) {
+
+		/* Remembered so that we can tell afterwards whether lock
+		is still in the queue. */
+		prev_lock = UT_LIST_GET_PREV(un_member.tab_lock.locks,
+					     lock);
+
+		/* If we should remove all locks (remove_also_table_sx_locks
+		is TRUE), or if the lock is not table-level S or X lock,
+		then check we are not going to remove a wait lock. */
+		if (remove_also_table_sx_locks
+		    || !(lock_get_type(lock) == LOCK_TABLE
+			 && IS_LOCK_S_OR_X(lock))) {
+
+			ut_a(!lock_get_wait(lock));
+		}
+
+		lock_remove_all_on_table_for_trx(table, lock->trx,
+						 remove_also_table_sx_locks);
+
+		if (prev_lock == NULL) {
+			/* lock was the head of the queue before the call */
+			if (lock == UT_LIST_GET_FIRST(table->locks)) {
+				/* lock was not removed, pick its successor */
+				lock = UT_LIST_GET_NEXT(
+					un_member.tab_lock.locks, lock);
+			} else {
+				/* lock was removed, pick the first one */
+				lock = UT_LIST_GET_FIRST(table->locks);
+			}
+		} else if (UT_LIST_GET_NEXT(un_member.tab_lock.locks,
+					    prev_lock) != lock) {
+			/* If lock was removed by
+			lock_remove_all_on_table_for_trx() then pick the
+			successor of prev_lock ... */
+			lock = UT_LIST_GET_NEXT(
+				un_member.tab_lock.locks, prev_lock);
+		} else {
+			/* ... otherwise pick the successor of lock. */
+			lock = UT_LIST_GET_NEXT(
+				un_member.tab_lock.locks, lock);
+		}
+	}
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*===================== VALIDATION AND DEBUGGING  ====================*/
+
+/*********************************************************************//**
+Writes a one-line description of a table lock: the table name, the id
+of the owning transaction, the lock mode and, for a waiting request, the
+word "waiting". The line is terminated with a newline. */
+UNIV_INTERN
+void
+lock_table_print(
+/*=============*/
+	FILE*		file,	/*!< in: file where to print */
+	const lock_t*	lock)	/*!< in: table type lock */
+{
+	const char*	mode_text	= NULL;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(lock_get_type_low(lock) == LOCK_TABLE);
+
+	fputs("TABLE LOCK table ", file);
+	ut_print_name(file, lock->trx, TRUE,
+		      lock->un_member.tab_lock.table->name);
+	fprintf(file, " trx id " TRX_ID_FMT,
+		TRX_ID_PREP_PRINTF(lock->trx->id));
+
+	/* Map each known mode to its fixed text; any other value is
+	reported numerically below. */
+	switch (lock_get_mode(lock)) {
+	case LOCK_S:
+		mode_text = " lock mode S";
+		break;
+	case LOCK_X:
+		mode_text = " lock mode X";
+		break;
+	case LOCK_IS:
+		mode_text = " lock mode IS";
+		break;
+	case LOCK_IX:
+		mode_text = " lock mode IX";
+		break;
+	case LOCK_AUTO_INC:
+		mode_text = " lock mode AUTO-INC";
+		break;
+	default:
+		break;
+	}
+
+	if (mode_text != NULL) {
+		fputs(mode_text, file);
+	} else {
+		fprintf(file, " unknown lock mode %lu",
+			(ulong) lock_get_mode(lock));
+	}
+
+	if (lock_get_wait(lock)) {
+		fputs(" waiting", file);
+	}
+
+	putc('\n', file);
+}
+
+/*********************************************************************//**
+Prints info of a record lock: a summary line (space id, page number,
+number of bits, index name, trx id, lock mode and flags) and, when
+srv_show_verbose_locks is set, one line per heap number whose bit is set
+in the lock bitmap. The record contents are appended to such a line only
+if buf_page_try_get() returned a block for the page. */
+UNIV_INTERN
+void
+lock_rec_print(
+/*===========*/
+	FILE*		file,	/*!< in: file where to print */
+	const lock_t*	lock)	/*!< in: record type lock */
+{
+	const buf_block_t*	block;
+	ulint			space;
+	ulint			page_no;
+	ulint			i;
+	mtr_t			mtr;
+	mem_heap_t*		heap		= NULL;
+	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*			offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+
+	space = lock->un_member.rec_lock.space;
+	page_no = lock->un_member.rec_lock.page_no;
+
+	fprintf(file, "RECORD LOCKS space id %lu page no %lu n bits %lu ",
+		(ulong) space, (ulong) page_no,
+		(ulong) lock_rec_get_n_bits(lock));
+	dict_index_name_print(file, lock->trx, lock->index);
+	fprintf(file, " trx id " TRX_ID_FMT,
+		TRX_ID_PREP_PRINTF(lock->trx->id));
+
+	/* Only S and X are handled for a record lock; any other mode
+	ends in ut_error. */
+	if (lock_get_mode(lock) == LOCK_S) {
+		fputs(" lock mode S", file);
+	} else if (lock_get_mode(lock) == LOCK_X) {
+		fputs(" lock_mode X", file);
+	} else {
+		ut_error;
+	}
+
+	if (lock_rec_get_gap(lock)) {
+		fputs(" locks gap before rec", file);
+	}
+
+	if (lock_rec_get_rec_not_gap(lock)) {
+		fputs(" locks rec but not gap", file);
+	}
+
+	if (lock_rec_get_insert_intention(lock)) {
+		fputs(" insert intention", file);
+	}
+
+	if (lock_get_wait(lock)) {
+		fputs(" waiting", file);
+	}
+
+	/* The mini-transaction is started unconditionally; it is used
+	only by buf_page_try_get() in the verbose branch and is committed
+	below in either case. */
+	mtr_start(&mtr);
+
+	putc('\n', file);
+
+	/* Per-record detail is printed only in verbose mode. */
+	if ( srv_show_verbose_locks ) {
+	block = buf_page_try_get(space, page_no, &mtr);
+
+	for (i = 0; i < lock_rec_get_n_bits(lock); ++i) {
+
+		/* Skip heap numbers that this lock does not cover. */
+		if (!lock_rec_get_nth_bit(lock, i)) {
+			continue;
+		}
+
+		fprintf(file, "Record lock, heap no %lu", (ulong) i);
+
+		/* Without a block only the heap number is printed. */
+		if (block) {
+			const rec_t*	rec;
+
+			rec = page_find_rec_with_heap_no(
+				buf_block_get_frame(block), i);
+
+			offsets = rec_get_offsets(
+				rec, lock->index, offsets,
+				ULINT_UNDEFINED, &heap);
+
+			putc(' ', file);
+			rec_print_new(file, rec, offsets);
+		}
+
+		putc('\n', file);
+	}
+	}
+
+	mtr_commit(&mtr);
+	/* heap is non-NULL only if rec_get_offsets() set it through
+	&heap. */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+#ifdef UNIV_DEBUG
+/* Print the number of lock structs from lock_print_info_summary() only
+in non-production builds for performance reasons, see
+http://bugs.mysql.com/36942 */
+#define PRINT_NUM_OF_LOCK_STRUCTS
+#endif /* UNIV_DEBUG */
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+/*********************************************************************//**
+Counts the record lock structs stored in lock_sys->rec_hash by walking
+the chain of every hash cell.
+@return	number of record locks */
+static
+ulint
+lock_get_n_rec_locks(void)
+/*======================*/
+{
+	ulint	cell;
+	ulint	n_locks	= 0;
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (cell = 0; cell < hash_get_n_cells(lock_sys->rec_hash); cell++) {
+
+		/* Follow the chain of this cell to its end */
+		for (lock = HASH_GET_FIRST(lock_sys->rec_hash, cell);
+		     lock;
+		     lock = HASH_GET_NEXT(hash, lock)) {
+
+			n_locks++;
+		}
+	}
+
+	return(n_locks);
+}
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+/*********************************************************************//**
+Prints the header part of the lock/transaction report: the latest
+detected deadlock (if any), the trx id counter, the purge limits and the
+history list length.
+When TRUE is returned the kernel mutex acquired here is still held; this
+function does not release it.
+@return FALSE if not able to obtain kernel mutex
+and exits without printing info */
+UNIV_INTERN
+ibool
+lock_print_info_summary(
+/*====================*/
+	FILE*	file,	/*!< in: file where to print */
+	ibool   nowait)	/*!< in: whether to wait for the kernel mutex */
+{
+	if (nowait) {
+		/* Do not block: give up at once if the mutex is busy */
+		if (mutex_enter_nowait(&kernel_mutex)) {
+			fputs("FAIL TO OBTAIN KERNEL MUTEX, "
+			      "SKIP LOCK INFO PRINTING\n", file);
+
+			return(FALSE);
+		}
+	} else {
+		/* Wait for the mutex as long as it takes */
+		lock_mutex_enter_kernel();
+	}
+
+	if (lock_deadlock_found) {
+		fputs("------------------------\n"
+		      "LATEST DETECTED DEADLOCK\n"
+		      "------------------------\n", file);
+
+		ut_copy_file(file, lock_latest_err_file);
+	}
+
+	fputs("------------\n"
+	      "TRANSACTIONS\n"
+	      "------------\n", file);
+
+	fprintf(file, "Trx id counter " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+
+	fprintf(file,
+		"Purge done for trx's n:o < " TRX_ID_FMT
+		" undo n:o < " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no),
+		TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no));
+
+	fprintf(file,
+		"History list length %lu\n",
+		(ulong) trx_sys->rseg_history_len);
+
+#ifdef PRINT_NUM_OF_LOCK_STRUCTS
+	/* Compiled in only when PRINT_NUM_OF_LOCK_STRUCTS is defined */
+	fprintf(file,
+		"Total number of lock structs in row lock hash table %lu\n",
+		(ulong) lock_get_n_rec_locks());
+#endif /* PRINT_NUM_OF_LOCK_STRUCTS */
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Prints info of locks for each transaction.
+This function never acquires the kernel mutex at its start but releases
+it right before returning, so it expects to be entered with the mutex
+held (lock_print_info_summary() returns TRUE with the mutex still
+held). */
+UNIV_INTERN
+void
+lock_print_info_all_transactions(
+/*=============================*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	lock_t*	lock;
+	ibool	load_page_first = TRUE;	/* whether the page of a record
+					lock still has to be requested
+					before the lock is printed */
+	ulint	nth_trx		= 0;	/* position in trx_sys->trx_list of
+					the transaction being printed */
+	ulint	nth_lock	= 0;	/* position in trx->trx_locks of
+					the lock to print next */
+	ulint	i;
+	mtr_t	mtr;
+	trx_t*	trx;
+
+	fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+	/* First print info on non-active transactions */
+
+	trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+
+	while (trx) {
+		if (trx->conc_state == TRX_NOT_STARTED) {
+			fputs("---", file);
+			trx_print(file, trx, 600);
+		}
+
+		trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+	}
+
+	/* Every pass through `loop` re-locates the transaction and the
+	lock by their positions (nth_trx, nth_lock) instead of keeping
+	pointers across iterations. */
+loop:
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	i = 0;
+
+	/* Since we temporarily release the kernel mutex when
+	reading a database page in below, variable trx may be
+	obsolete now and we must loop through the trx list to
+	get probably the same trx, or some other trx. */
+
+	while (trx && (i < nth_trx)) {
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+		i++;
+	}
+
+	if (trx == NULL) {
+		/* All transactions have been handled: this is the only
+		exit of the function, and it releases the kernel mutex. */
+		lock_mutex_exit_kernel();
+
+		ut_ad(lock_validate());
+
+		return;
+	}
+
+	/* The transaction header is printed once, before its first lock. */
+	if (nth_lock == 0) {
+		fputs("---", file);
+		trx_print(file, trx, 600);
+
+		if (trx->read_view) {
+			fprintf(file,
+				"Trx read view will not see trx with"
+				" id >= " TRX_ID_FMT
+				", sees < " TRX_ID_FMT "\n",
+				TRX_ID_PREP_PRINTF(
+					trx->read_view->low_limit_id),
+				TRX_ID_PREP_PRINTF(
+					trx->read_view->up_limit_id));
+		}
+
+		if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+			fprintf(file,
+				"------- TRX HAS BEEN WAITING %lu SEC"
+				" FOR THIS LOCK TO BE GRANTED:\n",
+				(ulong) difftime(time(NULL),
+						 trx->wait_started));
+
+			if (lock_get_type_low(trx->wait_lock) == LOCK_REC) {
+				lock_rec_print(file, trx->wait_lock);
+			} else {
+				lock_table_print(file, trx->wait_lock);
+			}
+
+			fputs("------------------\n", file);
+		}
+	}
+
+	/* The individual locks are listed only if the lock monitor is
+	enabled or srv_show_locks_held is non-zero; otherwise go on to
+	the next transaction. */
+        if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) {
+		nth_trx++;
+		goto loop;
+	}
+
+	i = 0;
+
+	/* Look at the note about the trx loop above why we loop here:
+	lock may be an obsolete pointer now. */
+
+	lock = UT_LIST_GET_FIRST(trx->trx_locks);
+
+	while (lock && (i < nth_lock)) {
+		lock = UT_LIST_GET_NEXT(trx_locks, lock);
+		i++;
+	}
+
+	if (lock == NULL) {
+		/* No more locks in this transaction: next transaction */
+		nth_trx++;
+		nth_lock = 0;
+
+		goto loop;
+	}
+
+	if (lock_get_type_low(lock) == LOCK_REC) {
+		if (load_page_first) {
+			ulint	space	= lock->un_member.rec_lock.space;
+			ulint	zip_size= fil_space_get_zip_size(space);
+			ulint	page_no = lock->un_member.rec_lock.page_no;
+
+			if (UNIV_UNLIKELY(zip_size == ULINT_UNDEFINED)) {
+
+				/* It is a single table tablespace and
+				the .ibd file is missing (TRUNCATE
+				TABLE probably stole the locks): just
+				print the lock without attempting to
+				load the page in the buffer pool. */
+
+				fprintf(file, "RECORD LOCKS on"
+					" non-existing space %lu\n",
+					(ulong) space);
+				goto print_rec;
+			}
+
+			/* The page is requested with the kernel mutex
+			released. Afterwards trx and lock may be stale, so
+			the search restarts at `loop` with load_page_first
+			cleared; the same position is then printed without
+			another page request. */
+			lock_mutex_exit_kernel();
+
+			mtr_start(&mtr);
+
+			buf_page_get_with_no_latch(space, zip_size,
+						   page_no, &mtr);
+
+			mtr_commit(&mtr);
+
+			load_page_first = FALSE;
+
+			lock_mutex_enter_kernel();
+
+			goto loop;
+		}
+
+print_rec:
+		lock_rec_print(file, lock);
+	} else {
+		ut_ad(lock_get_type_low(lock) & LOCK_TABLE);
+
+		lock_table_print(file, lock);
+	}
+
+	/* The next record lock needs its own page request again. */
+	load_page_first = TRUE;
+
+	nth_lock++;
+
+	/* Stop listing the locks of this transaction once
+	srv_show_locks_held of them have been printed.
+	NOTE(review): if the lock monitor is enabled while
+	srv_show_locks_held is 0, this condition already holds after the
+	first printed lock -- confirm that this is intended. */
+	if (nth_lock >= srv_show_locks_held) {
+		fputs("TOO MANY LOCKS PRINTED FOR THIS TRX:"
+		      " SUPPRESSING FURTHER PRINTS\n",
+		      file);
+
+		nth_trx++;
+		nth_lock = 0;
+
+		goto loop;
+	}
+
+	goto loop;
+}
+
+#ifdef UNIV_DEBUG
+/*********************************************************************//**
+Checks every lock in the lock queue of a table: the owning transaction
+must be active, prepared or committed in memory; a granted lock must not
+coexist with an incompatible lock of another transaction; a waiting lock
+must really have to wait. Any violation trips a ut_a() assertion.
+@return	TRUE if ok */
+static
+ibool
+lock_table_queue_validate(
+/*======================*/
+	dict_table_t*	table)	/*!< in: table */
+{
+	lock_t*	lock;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (lock = UT_LIST_GET_FIRST(table->locks);
+	     lock;
+	     lock = UT_LIST_GET_NEXT(un_member.tab_lock.locks, lock)) {
+
+		ut_a(((lock->trx)->conc_state == TRX_ACTIVE)
+		     || ((lock->trx)->conc_state == TRX_PREPARED)
+		     || ((lock->trx)->conc_state == TRX_COMMITTED_IN_MEMORY));
+
+		if (lock_get_wait(lock)) {
+			/* Waiting request */
+			ut_a(lock_table_has_to_wait_in_queue(lock));
+		} else {
+			/* Granted lock */
+			ut_a(!lock_table_other_has_incompatible(
+				     lock->trx, 0, table,
+				     lock_get_mode(lock)));
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock queue on a single record.
+The kernel mutex is acquired and released inside this function. All
+checks are ut_a() assertions or ut_error, so the function returns only
+when no check failed.
+@return	TRUE if ok */
+static
+ibool
+lock_rec_queue_validate(
+/*====================*/
+	const buf_block_t*	block,	/*!< in: buffer block containing rec */
+	const rec_t*		rec,	/*!< in: record to look at */
+	dict_index_t*		index,	/*!< in: index, or NULL if not known */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	trx_t*	impl_trx;
+	lock_t*	lock;
+	ulint	heap_no;
+
+	ut_a(rec);
+	ut_a(block->frame == page_align(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	lock_mutex_enter_kernel();
+
+	/* Records that are not user records get a reduced set of
+	checks: transaction state, membership in the trx list, wait
+	status and index identity. */
+	if (!page_rec_is_user_rec(rec)) {
+
+		lock = lock_rec_get_first(block, heap_no);
+
+		while (lock) {
+			switch(lock->trx->conc_state) {
+			case TRX_ACTIVE:
+			case TRX_PREPARED:
+			case TRX_COMMITTED_IN_MEMORY:
+				break;
+			default:
+				ut_error;
+			}
+
+			ut_a(trx_in_trx_list(lock->trx));
+
+			if (lock_get_wait(lock)) {
+				ut_a(lock_rec_has_to_wait_in_queue(lock));
+			}
+
+			if (index) {
+				ut_a(lock->index == index);
+			}
+
+			lock = lock_rec_get_next(heap_no, lock);
+		}
+
+		lock_mutex_exit_kernel();
+
+		return(TRUE);
+	}
+
+	/* The empty statement after `if (!index)` is deliberate: with an
+	unknown index the implicit-lock check is skipped. With the
+	"#if 0" region compiled out, only the clustered index branch
+	remains. */
+	if (!index);
+	else if (dict_index_is_clust(index)) {
+
+		impl_trx = lock_clust_rec_some_has_impl(rec, index, offsets);
+
+		/* If some transaction holds an implicit lock and another
+		transaction has an explicit lock request on the record,
+		the implicit lock holder must also have an explicit
+		X-lock on the record itself. */
+		if (impl_trx
+		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
+						   block, heap_no, impl_trx)) {
+
+			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+					       block, heap_no, impl_trx));
+		}
+#if 0
+	} else {
+
+		/* The kernel mutex may get released temporarily in the
+		next function call: we have to release lock table mutex
+		to obey the latching order */
+
+		/* If this thread is holding the file space latch
+		(fil_space_t::latch), the following check WILL break
+		latching order and may cause a deadlock of threads. */
+
+		/* NOTE: This is a bogus check that would fail in the
+		following case: Our transaction is updating a
+		row. After it has updated the clustered index record,
+		it goes to a secondary index record and finds someone
+		else holding an explicit S- or X-lock on that
+		secondary index record, presumably from a locking
+		read. Our transaction cannot update the secondary
+		index immediately, but places a waiting X-lock request
+		on the secondary index record. There is nothing
+		illegal in this. The assertion is simply too strong. */
+
+		/* From the locking point of view, each secondary
+		index is a separate table. A lock that is held on
+		secondary index rec does not give any rights to modify
+		or read the clustered index rec. Therefore, we can
+		think of the sec index as a separate 'table' from the
+		clust index 'table'. Conversely, a transaction that
+		has acquired a lock on and modified a clustered index
+		record may need to wait for a lock on the
+		corresponding record in a secondary index. */
+
+		impl_trx = lock_sec_rec_some_has_impl_off_kernel(
+			rec, index, offsets);
+
+		if (impl_trx
+		    && lock_rec_other_has_expl_req(LOCK_S, 0, LOCK_WAIT,
+						   block, heap_no, impl_trx)) {
+
+			ut_a(lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP,
+					       block, heap_no, impl_trx));
+		}
+#endif
+	}
+
+	/* Check every explicit lock in the queue of the record. */
+	lock = lock_rec_get_first(block, heap_no);
+
+	while (lock) {
+		ut_a(lock->trx->conc_state == TRX_ACTIVE
+		     || lock->trx->conc_state == TRX_PREPARED
+		     || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+		ut_a(trx_in_trx_list(lock->trx));
+
+		if (index) {
+			ut_a(lock->index == index);
+		}
+
+		if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
+
+			/* Granted lock that is not a gap lock: search for
+			a request by another transaction, asking for X if
+			this lock is S and for S otherwise; none may
+			exist. */
+			enum lock_mode	mode;
+
+			if (lock_get_mode(lock) == LOCK_S) {
+				mode = LOCK_X;
+			} else {
+				mode = LOCK_S;
+			}
+			ut_a(!lock_rec_other_has_expl_req(
+				     mode, 0, 0, block, heap_no, lock->trx));
+
+		} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
+
+			/* Waiting request that is not a gap lock: it must
+			really have to wait. */
+			ut_a(lock_rec_has_to_wait_in_queue(lock));
+		}
+
+		lock = lock_rec_get_next(heap_no, lock);
+	}
+
+	lock_mutex_exit_kernel();
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the record lock queues on a page.
+Must be called without the kernel mutex; the mutex is acquired here and
+released again around each call of lock_rec_queue_validate(). The page
+is fetched X-latched inside a mini-transaction that is committed at the
+end.
+@return	TRUE if ok */
+static
+ibool
+lock_rec_validate_page(
+/*===================*/
+	ulint	space,	/*!< in: space id */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page_no)/*!< in: page number */
+{
+	dict_index_t*	index;
+	buf_block_t*	block;
+	const page_t*	page;
+	lock_t*		lock;
+	const rec_t*	rec;
+	ulint		nth_lock	= 0;	/* position of the current
+						lock among the page's locks */
+	ulint		nth_bit		= 0;	/* first bit of that lock
+						still to be examined */
+	ulint		i;
+	mtr_t		mtr;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	mtr_start(&mtr);
+
+	ut_ad(zip_size != ULINT_UNDEFINED);
+	block = buf_page_get(space, zip_size, page_no, RW_X_LATCH, &mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	page = block->frame;
+
+	lock_mutex_enter_kernel();
+	/* Because the kernel mutex is dropped while a record queue is
+	validated, each pass finds the lock again by its position
+	(nth_lock) instead of keeping a pointer. */
+loop:
+	lock = lock_rec_get_first_on_page_addr(space, page_no);
+
+	if (!lock) {
+		goto function_exit;
+	}
+
+	for (i = 0; i < nth_lock; i++) {
+
+		lock = lock_rec_get_next_on_page(lock);
+
+		if (!lock) {
+			goto function_exit;
+		}
+	}
+
+	ut_a(trx_in_trx_list(lock->trx));
+	ut_a(lock->trx->conc_state == TRX_ACTIVE
+	     || lock->trx->conc_state == TRX_PREPARED
+	     || lock->trx->conc_state == TRX_COMMITTED_IN_MEMORY);
+
+	/* With UNIV_SYNC_DEBUG the `if` below guards the whole `for`
+	statement that follows it; without it the loop always runs. */
+# ifdef UNIV_SYNC_DEBUG
+	/* Only validate the record queues when this thread is not
+	holding a space->latch.  Deadlocks are possible due to
+	latching order violation when UNIV_DEBUG is defined while
+	UNIV_SYNC_DEBUG is not. */
+	if (!sync_thread_levels_contains(SYNC_FSP))
+# endif /* UNIV_SYNC_DEBUG */
+	for (i = nth_bit; i < lock_rec_get_n_bits(lock); i++) {
+
+		/* NOTE(review): heap number 1 is validated even if its
+		bit is not set; presumably it is the page supremum
+		(PAGE_HEAP_NO_SUPREMUM) -- confirm. */
+		if (i == 1 || lock_rec_get_nth_bit(lock, i)) {
+
+			index = lock->index;
+			rec = page_find_rec_with_heap_no(page, i);
+			ut_a(rec);
+			offsets = rec_get_offsets(rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+
+			fprintf(stderr,
+				"Validating %lu %lu\n",
+				(ulong) space, (ulong) page_no);
+
+			/* lock_rec_queue_validate() acquires the kernel
+			mutex itself, so it is released around the call. */
+			lock_mutex_exit_kernel();
+
+			/* If this thread is holding the file space
+			latch (fil_space_t::latch), the following
+			check WILL break the latching order and may
+			cause a deadlock of threads. */
+
+			lock_rec_queue_validate(block, rec, index, offsets);
+
+			lock_mutex_enter_kernel();
+
+			/* Resume with the next bit of the same lock. */
+			nth_bit = i + 1;
+
+			goto loop;
+		}
+	}
+
+	/* All bits of this lock are done: move on to the next lock. */
+	nth_bit = 0;
+	nth_lock++;
+
+	goto loop;
+
+function_exit:
+	lock_mutex_exit_kernel();
+
+	mtr_commit(&mtr);
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Validates the lock system.
+First the table lock queue of every table lock of every transaction in
+trx_sys->trx_list is validated, then the record locks of every page that
+has a lock in lock_sys->rec_hash. The kernel mutex is acquired here and
+released before returning.
+@return	TRUE if ok */
+static
+ibool
+lock_validate(void)
+/*===============*/
+{
+	lock_t*	lock;
+	trx_t*	trx;
+	dulint	limit;
+	ulint	space;
+	ulint	page_no;
+	ulint	i;
+
+	lock_mutex_enter_kernel();
+
+	/* Pass 1: table locks */
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	while (trx) {
+		lock = UT_LIST_GET_FIRST(trx->trx_locks);
+
+		while (lock) {
+			if (lock_get_type_low(lock) & LOCK_TABLE) {
+
+				lock_table_queue_validate(
+					lock->un_member.tab_lock.table);
+			}
+
+			lock = UT_LIST_GET_NEXT(trx_locks, lock);
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	/* Pass 2: record locks, one hash cell at a time */
+	for (i = 0; i < hash_get_n_cells(lock_sys->rec_hash); i++) {
+
+		/* limit is the smallest (space, page_no) pair not yet
+		validated in this cell. */
+		limit = ut_dulint_zero;
+
+		for (;;) {
+			/* The cell is rescanned from its start on each
+			round, because the kernel mutex is released
+			below. */
+			lock = HASH_GET_FIRST(lock_sys->rec_hash, i);
+
+			/* Find the first lock whose (space, page_no) is
+			not below limit. */
+			while (lock) {
+				ut_a(trx_in_trx_list(lock->trx));
+
+				space = lock->un_member.rec_lock.space;
+				page_no = lock->un_member.rec_lock.page_no;
+
+				if (ut_dulint_cmp(
+					    ut_dulint_create(space, page_no),
+					    limit) >= 0) {
+					break;
+				}
+
+				lock = HASH_GET_NEXT(hash, lock);
+			}
+
+			if (!lock) {
+
+				break;
+			}
+
+			/* lock_rec_validate_page() asserts that the kernel
+			mutex is not held on entry. */
+			lock_mutex_exit_kernel();
+
+			lock_rec_validate_page(space,
+					       fil_space_get_zip_size(space),
+					       page_no);
+
+			lock_mutex_enter_kernel();
+
+			/* Skip the page just validated on the next round. */
+			limit = ut_dulint_create(space, page_no + 1);
+		}
+	}
+
+	lock_mutex_exit_kernel();
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+/*============ RECORD LOCK CHECKS FOR ROW OPERATIONS ====================*/
+
+/*********************************************************************//**
+Decides whether a record can be inserted right after rec, judging by
+the locks other transactions hold or request on the successor of rec.
+If a conflicting lock exists, a waiting gap X-lock request with the
+insert intention flag is enqueued on the successor. On success for a
+secondary index the page max trx id field is updated with the id of the
+inserting transaction.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_rec_insert_check_and_lock(
+/*===========================*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG bit is
+				set, does nothing */
+	const rec_t*	rec,	/*!< in: record after which to insert */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	dict_index_t*	index,	/*!< in: index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in/out: mini-transaction */
+	ibool*		inherit)/*!< out: set to TRUE if the new
+				inserted record maybe should inherit
+				LOCK_GAP type locks from the successor
+				record */
+{
+	const rec_t*	next_rec;
+	trx_t*		trx;
+	ulint		err;
+	ulint		next_rec_heap_no;
+
+	ut_ad(block->frame == page_align(rec));
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx = thr_get_trx(thr);
+	next_rec = page_rec_get_next_const(rec);
+	next_rec_heap_no = page_rec_get_heap_no(next_rec);
+
+	lock_mutex_enter_kernel();
+
+	/* When inserting a record into an index, the table must be at
+	least IX-locked or we must be building an index, in which case
+	the table must be at least S-locked. */
+	ut_ad(lock_table_has(trx, index->table, LOCK_IX)
+	      || (*index->name == TEMP_INDEX_PREFIX
+		  && lock_table_has(trx, index->table, LOCK_S)));
+
+	if (UNIV_LIKELY(lock_rec_get_first(block, next_rec_heap_no)
+			== NULL)) {
+		/* Fast path: no lock at all on the successor record */
+
+		lock_mutex_exit_kernel();
+
+		if (!dict_index_is_clust(index)) {
+			/* Update the page max trx id field */
+			page_update_max_trx_id(block,
+					       buf_block_get_page_zip(block),
+					       trx->id, mtr);
+		}
+
+		*inherit = FALSE;
+
+		return(DB_SUCCESS);
+	}
+
+	*inherit = TRUE;
+
+	/* The insert must wait if another transaction has an explicit
+	lock request, granted or waiting, that locks the gap before the
+	successor.
+
+	Gap locks that other transactions placed only to wait for their
+	own insert are not treated as conflicting: otherwise two
+	transactions waiting to insert into the same gap would each hold
+	a waiting gap lock on the successor and deadlock needlessly. */
+
+	err = DB_SUCCESS;
+
+	if (lock_rec_other_has_conflicting(
+		    LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION,
+		    block, next_rec_heap_no, trx)) {
+
+		/* Note that we may get DB_SUCCESS also here! */
+		err = lock_rec_enqueue_waiting(LOCK_X | LOCK_GAP
+					       | LOCK_INSERT_INTENTION,
+					       block, next_rec_heap_no,
+					       index, thr);
+	}
+
+	lock_mutex_exit_kernel();
+
+	/* Callers are not told about DB_SUCCESS_LOCKED_REC */
+	if (err == DB_SUCCESS_LOCKED_REC) {
+		err = DB_SUCCESS;
+	}
+
+	if (err == DB_SUCCESS && !dict_index_is_clust(index)) {
+		/* Update the page max trx id field */
+		page_update_max_trx_id(block,
+				       buf_block_get_page_zip(block),
+				       trx->id, mtr);
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		const ulint*	offsets;
+		rec_offs_init(offsets_);
+
+		/* Debug builds validate the lock queue of the successor */
+		offsets = rec_get_offsets(next_rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+		ut_ad(lock_rec_queue_validate(block,
+					      next_rec, index, offsets));
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+#endif /* UNIV_DEBUG */
+
+	return(err);
+}
+
+/*********************************************************************//**
+Looks up the transaction, if any, that holds an implicit x-lock on a
+record and, unless that transaction already has an explicit
+LOCK_X | LOCK_REC_NOT_GAP lock on the record, adds such a lock to the
+queue on its behalf. NOTE that in the case of a secondary index, the
+kernel mutex may get temporarily released. */
+static
+void
+lock_rec_convert_impl_to_expl(
+/*==========================*/
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record on page */
+	dict_index_t*		index,	/*!< in: index of record */
+	const ulint*		offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	trx_t*	impl_trx;
+	ulint	heap_no;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(page_rec_is_user_rec(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+
+	impl_trx = dict_index_is_clust(index)
+		? lock_clust_rec_some_has_impl(rec, index, offsets)
+		: lock_sec_rec_some_has_impl_off_kernel(rec, index, offsets);
+
+	if (!impl_trx) {
+		/* Nobody holds an implicit lock: nothing to convert */
+		return;
+	}
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	if (lock_rec_has_expl(LOCK_X | LOCK_REC_NOT_GAP, block,
+			      heap_no, impl_trx)) {
+		/* The explicit x-lock is already there */
+		return;
+	}
+
+	lock_rec_add_to_queue(LOCK_REC | LOCK_X | LOCK_REC_NOT_GAP,
+			      block, heap_no, index, impl_trx);
+}
+
+/*********************************************************************//**
+Requests an X-lock (LOCK_X | LOCK_REC_NOT_GAP) on a clustered index
+record that is about to be modified (update, delete mark, or delete
+unmark). An implicit x-lock of another transaction on the record is
+first converted to an explicit one. If the lock cannot be granted at
+once, the outcome of lock_rec_lock() is returned as is, except that
+DB_SUCCESS_LOCKED_REC is reported as DB_SUCCESS.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_modify_check_and_lock(
+/*=================================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record which should be
+					modified */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	ulint	err;
+	ulint	heap_no;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(block->frame == page_align(rec));
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* The heap number accessor depends on the record format */
+	if (rec_offs_comp(offsets)) {
+		heap_no = rec_get_heap_no_new(rec);
+	} else {
+		heap_no = rec_get_heap_no_old(rec);
+	}
+
+	lock_mutex_enter_kernel();
+
+	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+	/* Make an implicit x-lock on the record explicit before
+	requesting our own lock */
+
+	lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+
+	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+			    block, heap_no, index, thr);
+
+	lock_mutex_exit_kernel();
+
+	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+	if (UNIV_UNLIKELY(err == DB_SUCCESS_LOCKED_REC)) {
+
+		return(DB_SUCCESS);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Requests an X-lock (LOCK_X | LOCK_REC_NOT_GAP) on a secondary index
+record that is about to be modified (delete mark or delete unmark). On
+success the page max trx id field is updated with the id of the
+modifying transaction, and DB_SUCCESS_LOCKED_REC is reported as
+DB_SUCCESS.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_sec_rec_modify_check_and_lock(
+/*===============================*/
+	ulint		flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+				bit is set, does nothing */
+	buf_block_t*	block,	/*!< in/out: buffer block of rec */
+	const rec_t*	rec,	/*!< in: record which should be
+				modified; NOTE: as this is a secondary
+				index, we always have to modify the
+				clustered index record first: see the
+				comment below */
+	dict_index_t*	index,	/*!< in: secondary index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction */
+{
+	ulint	err;
+	ulint	heap_no;
+
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(block->frame == page_align(rec));
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	/* No implicit lock of another transaction can exist on the
+	record here: the clustered index record has already been modified
+	by us, which would have been impossible had another active
+	transaction modified this secondary index record. */
+
+	lock_mutex_enter_kernel();
+
+	ut_ad(lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+
+	err = lock_rec_lock(TRUE, LOCK_X | LOCK_REC_NOT_GAP,
+			    block, heap_no, index, thr);
+
+	lock_mutex_exit_kernel();
+
+#ifdef UNIV_DEBUG
+	{
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		const ulint*	offsets;
+		rec_offs_init(offsets_);
+
+		/* Debug builds validate the lock queue of the record */
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+		ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+#endif /* UNIV_DEBUG */
+
+	switch (err) {
+	case DB_SUCCESS:
+	case DB_SUCCESS_LOCKED_REC:
+		/* Update the page max trx id field */
+		/* For plain DB_SUCCESS (no new lock created) the update
+		might be unnecessary, but it should not cost too much
+		performance. */
+		page_update_max_trx_id(block,
+				       buf_block_get_page_zip(block),
+				       thr_get_trx(thr)->id, mtr);
+		err = DB_SUCCESS;
+		break;
+	default:
+		break;
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Secondary index counterpart of lock_clust_rec_read_check_and_lock():
+requests a lock of type mode | gap_mode on a secondary index record
+that is read or passed over by a read cursor. An implicit x-lock on the
+record is converted to an explicit one first, but only when one can
+exist and the record is not the page supremum.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_sec_rec_read_check_and_lock(
+/*=============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: secondary index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	enum db_err	err;
+	ulint		heap_no;
+	ibool		impl_lock_possible;
+
+	ut_ad(!dict_index_is_clust(index));
+	ut_ad(block->frame == page_align(rec));
+	ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	lock_mutex_enter_kernel();
+
+	ut_ad(mode != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad(mode != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+	/* An implicit x-lock on the record is possible only if the max
+	trx id of the page is not below the min trx id of the trx list,
+	or if a database recovery is running. */
+
+	impl_lock_possible
+		= ut_dulint_cmp(page_get_max_trx_id(block->frame),
+				trx_list_get_min_trx_id()) >= 0
+		|| recv_recovery_is_on();
+
+	if (impl_lock_possible && !page_rec_is_supremum(rec)) {
+
+		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+	}
+
+	err = lock_rec_lock(FALSE, mode | gap_mode,
+			    block, heap_no, index, thr);
+
+	lock_mutex_exit_kernel();
+
+	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+	return(err);
+}
+
+/*********************************************************************//**
+Requests a lock of type mode | gap_mode on a clustered index record
+that is read or passed over by a read cursor. Unless the record is the
+page supremum, an implicit x-lock on it is first converted to an
+explicit one. If other transactions' locks prevent an immediate grant,
+lock_rec_lock() decides whether the query thread is suspended or a
+waiting request is enqueued; its result is returned unchanged.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, DB_LOCK_WAIT, DB_DEADLOCK,
+or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+enum db_err
+lock_clust_rec_read_check_and_lock(
+/*===============================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	ulint		heap_no;
+	enum db_err	result;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(block->frame == page_align(rec));
+	ut_ad(page_rec_is_user_rec(rec) || page_rec_is_supremum(rec));
+	ut_ad(gap_mode == LOCK_ORDINARY || gap_mode == LOCK_GAP
+	      || gap_mode == LOCK_REC_NOT_GAP);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (flags & BTR_NO_LOCKING_FLAG) {
+
+		return(DB_SUCCESS);
+	}
+
+	heap_no = page_rec_get_heap_no(rec);
+
+	lock_mutex_enter_kernel();
+
+	ut_ad(mode != LOCK_X
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IX));
+	ut_ad(mode != LOCK_S
+	      || lock_table_has(thr_get_trx(thr), index->table, LOCK_IS));
+
+	/* The supremum record is excluded from the implicit-to-explicit
+	conversion */
+	if (UNIV_LIKELY(heap_no != PAGE_HEAP_NO_SUPREMUM)) {
+
+		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
+	}
+
+	result = lock_rec_lock(FALSE, mode | gap_mode,
+			       block, heap_no, index, thr);
+
+	lock_mutex_exit_kernel();
+
+	ut_ad(lock_rec_queue_validate(block, rec, index, offsets));
+
+	return(result);
+}
+/*********************************************************************//**
+Variant of lock_clust_rec_read_check_and_lock() for callers that have no
+"offsets" at hand: the offsets of rec are computed here with
+rec_get_offsets() and the request is forwarded to that function. Like it,
+this checks if locks of other transactions prevent an immediate read, or
+passing over by a read cursor, of a clustered index record, may put the
+query thread to a lock wait, and sets the requested mode lock on the
+record. A DB_SUCCESS_LOCKED_REC result of the forwarded call is reported
+as plain DB_SUCCESS.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED */
+UNIV_INTERN
+ulint
+lock_clust_rec_read_check_and_lock_alt(
+/*===================================*/
+	ulint			flags,	/*!< in: if BTR_NO_LOCKING_FLAG
+					bit is set, does nothing */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: user record or page
+					supremum record which should
+					be read or passed over by a
+					read cursor */
+	dict_index_t*		index,	/*!< in: clustered index */
+	enum lock_mode		mode,	/*!< in: mode of the lock which
+					the read cursor should set on
+					records: LOCK_S or LOCK_X; the
+					latter is possible in
+					SELECT FOR UPDATE */
+	ulint			gap_mode,/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	ulint		stack_offsets[REC_OFFS_NORMAL_SIZE];
+	ulint*		rec_offsets;
+	mem_heap_t*	heap		= NULL;
+	ulint		result;
+
+	rec_offs_init(stack_offsets);
+	rec_offsets = rec_get_offsets(rec, index, stack_offsets,
+				      ULINT_UNDEFINED, &heap);
+
+	result = lock_clust_rec_read_check_and_lock(
+		flags, block, rec, index, rec_offsets, mode, gap_mode, thr);
+
+	/* Free the heap only if rec_get_offsets() created one */
+	if (heap != NULL) {
+		mem_heap_free(heap);
+	}
+
+	return(UNIV_UNLIKELY(result == DB_SUCCESS_LOCKED_REC)
+	       ? DB_SUCCESS : result);
+}
+
+/*******************************************************************//**
+Dequeues the AUTOINC lock stored last in a transaction's autoinc vector. */
+UNIV_INLINE
+void
+lock_release_autoinc_last_lock(
+/*===========================*/
+	ib_vector_t*	autoinc_locks)	/*!< in/out: vector of AUTOINC locks */
+{
+	lock_t*		lock;
+	ulint		tail_pos;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(!ib_vector_is_empty(autoinc_locks));
+
+	/* The lock to be released must be the last lock acquired. */
+	tail_pos = ib_vector_size(autoinc_locks) - 1;
+	lock = ib_vector_get(autoinc_locks, tail_pos);
+
+	/* The vector may hold nothing but table-level AUTOINC locks
+	that point to a table. */
+	ut_a(lock_get_mode(lock) == LOCK_AUTO_INC);
+	ut_a(lock_get_type(lock) == LOCK_TABLE);
+	ut_a(lock->un_member.tab_lock.table != NULL);
+
+	/* Dequeuing also drops the lock from the trx autoinc_locks. */
+	lock_table_dequeue(lock);
+}
+
+/*******************************************************************//**
+@return	TRUE if the transaction holds any AUTOINC locks, else FALSE */
+UNIV_INTERN
+ibool
+lock_trx_holds_autoinc_locks(
+/*=========================*/
+	const trx_t*	trx)		/*!< in: transaction */
+{
+	ibool	is_empty;
+	ut_a(trx->autoinc_locks != NULL);
+	is_empty = ib_vector_is_empty(trx->autoinc_locks);
+	return(!is_empty);
+}
+
+/*******************************************************************//**
+Releases every AUTOINC lock that the transaction holds. */
+UNIV_INTERN
+void
+lock_release_autoinc_locks(
+/*=======================*/
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(trx->autoinc_locks != NULL);
+
+	/* Drain the vector from its tail. Releasing in reverse order
+	spares the lower level a search for the element to delete;
+	see lock_table_remove_low() for details. */
+	for (;;) {
+		if (ib_vector_is_empty(trx->autoinc_locks)) {
+			break;
+		}
+		/* lock_table_remove_low() will also remove the lock
+		from the transaction's autoinc_locks vector. */
+		lock_release_autoinc_last_lock(trx->autoinc_locks);
+	}
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+}
+
+/*******************************************************************//**
+Out-of-line wrapper of lock_get_type_low() for use outside the lock module.
+@return	LOCK_TABLE or LOCK_REC */
+UNIV_INTERN
+ulint
+lock_get_type(
+/*==========*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ulint	type = lock_get_type_low(lock);
+	return(type);
+}
+
+/*******************************************************************//**
+Gets the id of the transaction owning a lock. @return transaction id */
+UNIV_INTERN
+ullint
+lock_get_trx_id(
+/*============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	trx_t*	owner = lock->trx;
+	return(trx_get_id(owner));
+}
+
+/*******************************************************************//**
+Maps the mode of a lock to a human readable string constant.
+The caller must neither free() nor modify the returned string.
+@return	lock mode */
+UNIV_INTERN
+const char*
+lock_get_mode_str(
+/*==============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const char*	plain;
+	const char*	with_gap;
+	ibool		gap	= FALSE;
+
+	/* lock_rec_get_gap() is consulted for record locks only; for
+	any other lock type the gap variant is never chosen. */
+	if (lock_get_type_low(lock) == LOCK_REC) {
+		gap = lock_rec_get_gap(lock);
+	}
+
+	switch (lock_get_mode(lock)) {
+	case LOCK_S:
+		plain = "S";
+		with_gap = "S,GAP";
+		break;
+	case LOCK_X:
+		plain = "X";
+		with_gap = "X,GAP";
+		break;
+	case LOCK_IS:
+		plain = "IS";
+		with_gap = "IS,GAP";
+		break;
+	case LOCK_IX:
+		plain = "IX";
+		with_gap = "IX,GAP";
+		break;
+	case LOCK_AUTO_INC:
+		/* Reported the same with or without the gap flag. */
+		return("AUTO_INC");
+	default:
+		return("UNKNOWN");
+	}
+
+	return(gap ? with_gap : plain);
+}
+
+/*******************************************************************//**
+Maps the type of a lock to a human readable string constant.
+The caller must neither free() nor modify the returned string.
+@return	lock type */
+UNIV_INTERN
+const char*
+lock_get_type_str(
+/*==============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const ulint	type = lock_get_type_low(lock);
+
+	if (type == LOCK_REC) {
+		return("RECORD");
+	} else if (type == LOCK_TABLE) {
+		return("TABLE");
+	}
+	return("UNKNOWN");
+}
+
+/*******************************************************************//**
+Finds the table that a record lock or a table lock refers to.
+@return	table */
+UNIV_INLINE
+dict_table_t*
+lock_get_table(
+/*===========*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const ulint	type = lock_get_type_low(lock);
+
+	if (type == LOCK_REC) {
+		return(lock->index->table);
+	} else if (type == LOCK_TABLE) {
+		return(lock->un_member.tab_lock.table);
+	}
+	ut_error;	/* neither a record lock nor a table lock */
+	return(NULL);
+}
+
+/*******************************************************************//**
+Looks up the table of a lock and returns its id as a 64-bit integer.
+@return	id of the table */
+UNIV_INTERN
+ullint
+lock_get_table_id(
+/*==============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const dict_table_t*	locked_table = lock_get_table(lock);
+
+	/* The id goes through ut_conv_dulint_to_longlong() first */
+	return((ullint)
+	       ut_conv_dulint_to_longlong(locked_table->id));
+}
+
+/*******************************************************************//**
+Looks up the table of a lock and returns the name stored in it.
+The caller must neither free() nor modify the returned string.
+@return	name of the table */
+UNIV_INTERN
+const char*
+lock_get_table_name(
+/*================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const dict_table_t*	locked_table;
+	const char*		name;
+
+	locked_table = lock_get_table(lock);
+	name = locked_table->name;
+	return(name);
+}
+
+/*******************************************************************//**
+@return	index on which the record lock is; the lock must be a LOCK_REC */
+UNIV_INTERN
+const dict_index_t*
+lock_rec_get_index(
+/*===============*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const dict_index_t*	rec_index;
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	rec_index = lock->index;
+	return(rec_index);
+}
+
+/*******************************************************************//**
+For a record lock, gets the name of the index on which the lock is. The
+string should not be free()'d or modified. @return name of the index */
+UNIV_INTERN
+const char*
+lock_rec_get_index_name(
+/*====================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	const char*	index_name;
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	index_name = lock->index->name;
+	return(index_name);
+}
+
+/*******************************************************************//**
+@return	tablespace number of a record lock; the lock must be a LOCK_REC */
+UNIV_INTERN
+ulint
+lock_rec_get_space_id(
+/*==================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ulint	space_id;
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	space_id = lock->un_member.rec_lock.space;
+	return(space_id);
+}
+
+/*******************************************************************//**
+@return	page number of a record lock; the lock must be a LOCK_REC */
+UNIV_INTERN
+ulint
+lock_rec_get_page_no(
+/*=================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ulint	page_no;
+	ut_a(lock_get_type_low(lock) == LOCK_REC);
+	page_no = lock->un_member.rec_lock.page_no;
+	return(page_no);
+}
diff --git a/storage/xtradb/log/log0log.c b/storage/xtradb/log/log0log.c
new file mode 100644
index 00000000000..b9f19aeff31
--- /dev/null
+++ b/storage/xtradb/log/log0log.c
@@ -0,0 +1,3507 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0log.c
+Database log
+
+Created 12/9/1995 Heikki Tuuri
+*******************************************************/
+
+#include "log0log.h"
+
+#ifdef UNIV_NONINL
+#include "log0log.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "srv0srv.h"
+#include "log0recv.h"
+#include "fil0fil.h"
+#include "dict0boot.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+
+/*
+General philosophy of InnoDB redo-logs:
+
+1) Every change to the contents of a data page must be done
+through mtr, which in mtr_commit() writes log records
+to the InnoDB redo log.
+
+2) Normally these changes are performed using a mlog_write_ulint()
+or similar function.
+
+3) In some page level operations only a code number of a
+c-function and its parameters are written to the log to
+reduce the size of the log.
+
+  3a) You should not add parameters to this kind of function
+  (e.g. trx_undo_header_create(), trx_undo_insert_header_reuse()).
+
+  3b) You should not add functionality that either changes the
+  behavior compared with the old version or depends on data
+  outside of the page. This kind of function should implement a
+  self-contained page transformation, and it should stay unchanged
+  unless you have very compelling reasons to change the log
+  semantics or format.
+
+*/
+
+/* Current free limit of space 0; protected by the log sys mutex; 0 means
+uninitialized */
+UNIV_INTERN ulint	log_fsp_current_free_limit		= 0;
+
+/* Global log system variable; NULL until log_init() allocates it */
+UNIV_INTERN log_t*	log_sys	= NULL;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+
+/* These throttle the checkpoint age error message of log_close(): once shown,
+it is printed again only after more than 15 seconds have passed */
+UNIV_INTERN ibool	log_has_printed_chkp_warning = FALSE;
+UNIV_INTERN time_t	log_last_warning_time;
+
+#ifdef UNIV_LOG_ARCHIVE
+/* Pointer to this variable is used as the i/o-message when we do i/o to an
+archive */
+UNIV_INTERN byte	log_archive_io;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* A margin for free space in the log buffer before a log entry is catenated */
+#define LOG_BUF_WRITE_MARGIN	(4 * OS_FILE_LOG_BLOCK_SIZE)
+
+/* Margins for free space in the log buffer after a log entry is catenated */
+#define LOG_BUF_FLUSH_RATIO	2
+#define LOG_BUF_FLUSH_MARGIN	(LOG_BUF_WRITE_MARGIN + 4 * UNIV_PAGE_SIZE)
+
+/* Margin for the free space in the smallest log group, before a new query
+step which modifies the database, is started */
+/* Combined in log_calc_max_ages() with srv_thread_concurrency */
+#define LOG_CHECKPOINT_FREE_PER_THREAD	(4 * UNIV_PAGE_SIZE)
+#define LOG_CHECKPOINT_EXTRA_FREE	(8 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous making of a new checkpoint; the value
+should be bigger than LOG_POOL_PREFLUSH_RATIO_SYNC */
+/* log_calc_max_ages(): max_checkpoint_age_async = margin - margin / ratio */
+#define LOG_POOL_CHECKPOINT_RATIO_ASYNC	32
+
+/* This parameter controls synchronous preflushing of modified buffer pages */
+#define LOG_POOL_PREFLUSH_RATIO_SYNC	16
+
+/* The same ratio for asynchronous preflushing; this value should be less than
+the previous */
+#define LOG_POOL_PREFLUSH_RATIO_ASYNC	8
+
+/* Extra margin, in addition to one log file, used in archiving */
+#define LOG_ARCHIVE_EXTRA_MARGIN	(4 * UNIV_PAGE_SIZE)
+
+/* This parameter controls asynchronous writing to the archive */
+#define LOG_ARCHIVE_RATIO_ASYNC		16
+
+/* Codes used in unlocking flush latches */
+#define LOG_UNLOCK_NONE_FLUSHED_LOCK	1
+#define LOG_UNLOCK_FLUSH_LOCK		2
+
+/* States of an archiving operation */
+#define	LOG_ARCHIVE_READ	1
+#define	LOG_ARCHIVE_WRITE	2
+
+/******************************************************//**
+Completes a checkpoint write i/o to a log file. Forward declaration only. */
+static
+void
+log_io_complete_checkpoint(void);
+/*============================*/
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Completes an archiving i/o. Forward declaration only. */
+static
+void
+log_io_complete_archive(void);
+/*=========================*/
+#endif /* UNIV_LOG_ARCHIVE */
+
+/****************************************************************//**
+Stores a new value in the global variable log_fsp_current_free_limit and
+then makes a checkpoint, so that we know that the limit has been written
+to a log checkpoint field on disk. */
+UNIV_INTERN
+void
+log_fsp_current_free_limit_set_and_checkpoint(
+/*==========================================*/
+	ulint	limit)	/*!< in: limit to set */
+{
+	ibool	checkpoint_done;
+
+	/* The limit is protected by the log sys mutex, so publish the
+	new value under it. */
+
+	mutex_enter(&(log_sys->mutex));
+	log_fsp_current_free_limit = limit;
+	mutex_exit(&(log_sys->mutex));
+
+	/* Ask for a synchronous checkpoint over and over again until
+	log_checkpoint() reports success. */
+
+	do {
+		checkpoint_done = log_checkpoint(TRUE, TRUE);
+	} while (!checkpoint_done);
+}
+
+/****************************************************************//**
+Asks the buffer pool for its oldest modified block lsn and substitutes
+log_sys->lsn when the pool reports none. The caller must own the log mutex.
+@return	LSN of oldest modification */
+static
+ib_uint64_t
+log_buf_pool_get_oldest_modification(void)
+/*======================================*/
+{
+	ib_uint64_t	oldest;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	oldest = buf_pool_get_oldest_modification();
+
+	if (oldest != 0) {
+		return(oldest);
+	}
+
+	/* The buffer pool reported 0: use the current lsn instead */
+	return(log_sys->lsn);
+}
+
+/************************************************************//**
+Opens the log for log_write_low. The log must be closed with log_close and
+released with log_release. The log mutex is still held on return.
+@return	start lsn of the log record */
+UNIV_INTERN
+ib_uint64_t
+log_reserve_and_open(
+/*=================*/
+	ulint	len)	/*!< in: length of data to be catenated */
+{
+	log_t*	log			= log_sys;
+	ulint	len_upper_limit;
+#ifdef UNIV_LOG_ARCHIVE
+	ulint	archived_lsn_age;
+	ulint	dummy;
+#endif /* UNIV_LOG_ARCHIVE */
+#ifdef UNIV_DEBUG
+	ulint	count			= 0;
+#endif /* UNIV_DEBUG */
+	/* The data must be shorter than half of the log buffer */
+	ut_a(len < log->buf_size / 2);
+loop:
+	mutex_enter(&(log->mutex));
+	ut_ad(!recv_no_log_write);
+
+	/* Calculate an upper limit for the space the string may take in the
+	log buffer */
+
+	len_upper_limit = LOG_BUF_WRITE_MARGIN + (5 * len) / 4;
+
+	if (log->buf_free + len_upper_limit > log->buf_size) {
+
+		mutex_exit(&(log->mutex));
+
+		/* Not enough free space, do a synchronous flush of the log
+		buffer */
+
+		log_buffer_flush_to_disk();
+
+		srv_log_waits++;
+
+		ut_ad(++count < 50);	/* debug builds only */
+		/* Retry from the start, taking the mutex again */
+		goto loop;
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	if (log->archiving_state != LOG_ARCH_OFF) {
+
+		archived_lsn_age = log->lsn - log->archived_lsn;
+		if (archived_lsn_age + len_upper_limit
+		    > log->max_archived_lsn_age) {
+			/* Not enough free archived space in log groups: do a
+			synchronous archive write batch: */
+
+			mutex_exit(&(log->mutex));
+
+			ut_ad(len_upper_limit <= log->max_archived_lsn_age);
+
+			log_archive_do(TRUE, &dummy);
+
+			ut_ad(++count < 50);	/* debug builds only */
+
+			goto loop;
+		}
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+	/* Remember the start of this record for the check in log_close() */
+#ifdef UNIV_LOG_DEBUG
+	log->old_buf_free = log->buf_free;
+	log->old_lsn = log->lsn;
+#endif
+	return(log->lsn);
+}
+
+/************************************************************//**
+Writes to the log the string given, splitting it at log block boundaries.
+It is assumed that the caller holds the log mutex. */
+UNIV_INTERN
+void
+log_write_low(
+/*==========*/
+	byte*	str,		/*!< in: string */
+	ulint	str_len)	/*!< in: string length */
+{
+	log_t*	log	= log_sys;
+	ulint	len;
+	ulint	data_len;
+	byte*	log_block;
+
+	ut_ad(mutex_own(&(log->mutex)));
+part_loop:
+	ut_ad(!recv_no_log_write);
+	/* Calculate a part length */
+	/* data_len: fill of the current block if all of str were appended */
+	data_len = (log->buf_free % OS_FILE_LOG_BLOCK_SIZE) + str_len;
+
+	if (data_len <= OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+
+		/* The string fits within the current log block */
+
+		len = str_len;
+	} else {
+		data_len = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+		/* The part fills the rest of the block up to the trailer */
+		len = OS_FILE_LOG_BLOCK_SIZE
+			- (log->buf_free % OS_FILE_LOG_BLOCK_SIZE)
+			- LOG_BLOCK_TRL_SIZE;
+	}
+	/* Copy this part, then advance the source past it */
+	ut_memcpy(log->buf + log->buf_free, str, len);
+
+	str_len -= len;
+	str = str + len;
+	/* Record the fill of the block that the part was written to */
+	log_block = ut_align_down(log->buf + log->buf_free,
+				  OS_FILE_LOG_BLOCK_SIZE);
+	log_block_set_data_len(log_block, data_len);
+
+	if (data_len == OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+		/* This block became full */
+		log_block_set_data_len(log_block, OS_FILE_LOG_BLOCK_SIZE);
+		log_block_set_checkpoint_no(log_block,
+					    log_sys->next_checkpoint_no);
+		len += LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
+		/* len now also spans this trailer and the next block header */
+		log->lsn += len;
+
+		/* Initialize the next block header */
+		log_block_init(log_block + OS_FILE_LOG_BLOCK_SIZE, log->lsn);
+	} else {
+		log->lsn += len;
+	}
+
+	log->buf_free += len;
+
+	ut_ad(log->buf_free <= log->buf_size);
+
+	if (str_len > 0) {
+		goto part_loop;
+	}
+
+	srv_log_write_requests++;
+}
+
+/************************************************************//**
+Gets log_sys->max_modified_age_async, limited to srv_checkpoint_age_target
+minus one eighth of it if that target is non-zero. @return the limit */
+UNIV_INLINE
+ulint
+log_max_modified_age_async(void)
+{
+	ulint	limit = log_sys->max_modified_age_async;
+	if (srv_checkpoint_age_target) {
+		limit = ut_min(limit, srv_checkpoint_age_target
+			       - srv_checkpoint_age_target / 8);
+	}
+	return(limit);
+}
+
+/************************************************************//**
+Gets log_sys->max_checkpoint_age_async, limited to
+srv_checkpoint_age_target if that is non-zero. @return the limit */
+UNIV_INLINE
+ulint
+log_max_checkpoint_age_async(void)
+{
+	ulint	limit = log_sys->max_checkpoint_age_async;
+	return(srv_checkpoint_age_target
+	       ? ut_min(limit, srv_checkpoint_age_target) : limit);
+}
+
+/************************************************************//**
+Closes the log after a write; may set log->check_flush_or_checkpoint.
+@return	lsn, that is, log->lsn as read at the start of the call */
+UNIV_INTERN
+ib_uint64_t
+log_close(void)
+/*===========*/
+{
+	byte*		log_block;
+	ulint		first_rec_group;
+	ib_uint64_t	oldest_lsn;
+	ib_uint64_t	lsn;
+	log_t*		log	= log_sys;
+	ib_uint64_t	checkpoint_age;
+
+	ut_ad(mutex_own(&(log->mutex)));
+	ut_ad(!recv_no_log_write);
+
+	lsn = log->lsn;
+	/* Locate the block that holds the current end of the log buffer */
+	log_block = ut_align_down(log->buf + log->buf_free,
+				  OS_FILE_LOG_BLOCK_SIZE);
+	first_rec_group = log_block_get_first_rec_group(log_block);
+
+	if (first_rec_group == 0) {
+		/* We initialized a new log block which was not written
+		full by the current mtr: the next mtr log record group
+		will start within this block at the offset data_len */
+
+		log_block_set_first_rec_group(
+			log_block, log_block_get_data_len(log_block));
+	}
+	/* Past max_buf_free: raise the flush-or-checkpoint check flag */
+	if (log->buf_free > log->max_buf_free) {
+
+		log->check_flush_or_checkpoint = TRUE;
+	}
+
+	checkpoint_age = lsn - log->last_checkpoint_lsn;
+
+	if (checkpoint_age >= log->log_group_capacity) {
+		/* TODO: split btr_store_big_rec_extern_fields() into small
+		steps so that we can release all latches in the middle, and
+		call log_free_check() to ensure we never write over log written
+		after the latest checkpoint. In principle, we should split all
+		big_rec operations, but other operations are smaller. */
+
+		if (!log_has_printed_chkp_warning
+		    || difftime(time(NULL), log_last_warning_time) > 15) {
+
+			log_has_printed_chkp_warning = TRUE;
+			log_last_warning_time = time(NULL);
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: ERROR: the age of the last"
+				" checkpoint is %lu,\n"
+				"InnoDB: which exceeds the log group"
+				" capacity %lu.\n"
+				"InnoDB: If you are using big"
+				" BLOB or TEXT rows, you must set the\n"
+				"InnoDB: combined size of log files"
+				" at least 10 times bigger than the\n"
+				"InnoDB: largest such row.\n",
+				(ulong) checkpoint_age, /* NOTE(review): 64-bit value, truncated if ulong is 32-bit */
+				(ulong) log->log_group_capacity);
+		}
+	}
+	/* Checkpoint age within the async limit: nothing more to check */
+	if (checkpoint_age <= log_max_modified_age_async()) {
+
+		goto function_exit;
+	}
+
+	oldest_lsn = buf_pool_get_oldest_modification();
+	/* Flag it if oldest_lsn is 0 or either age exceeds its async limit */
+	if (!oldest_lsn
+	    || lsn - oldest_lsn > log_max_modified_age_async()
+	    || checkpoint_age > log_max_checkpoint_age_async()) {
+
+		log->check_flush_or_checkpoint = TRUE;
+	}
+function_exit:
+
+#ifdef UNIV_LOG_DEBUG
+	log_check_log_recs(log->buf + log->old_buf_free,
+			   log->buf_free - log->old_buf_free, log->old_lsn);
+#endif
+
+	return(lsn);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Fills the rest of the current log block with dummy log records, one byte
+at a time. Used in producing consistent archived log files. */
+static
+void
+log_pad_current_log_block(void)
+/*===========================*/
+{
+	ib_uint64_t	lsn;
+	ulint		used_in_block;
+	ulint		n_left;
+	byte		dummy_rec	= MLOG_DUMMY_RECORD;
+
+	/* The returned lsn is not needed as such; it is stored only
+	because otherwise gcc crashed on HP-UX */
+	lsn = log_reserve_and_open(OS_FILE_LOG_BLOCK_SIZE);
+
+	/* Bytes that still fit in the current block before its trailer */
+	used_in_block = log_sys->buf_free % OS_FILE_LOG_BLOCK_SIZE;
+	n_left = OS_FILE_LOG_BLOCK_SIZE - used_in_block - LOG_BLOCK_TRL_SIZE;
+
+	while (n_left > 0) {
+		log_write_low(&dummy_rec, 1);
+		n_left--;
+	}
+
+	lsn = log_sys->lsn;
+	log_close();
+	log_release();
+	ut_a(lsn % OS_FILE_LOG_BLOCK_SIZE == LOG_BLOCK_HDR_SIZE);
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/******************************************************//**
+Calculates how much log data a log group can hold, that is, its size
+without the log file headers. @return capacity in bytes */
+UNIV_INTERN
+ulint
+log_group_get_capacity(
+/*===================*/
+	const log_group_t*	group)	/*!< in: log group */
+{
+	ulint	payload_per_file;
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	payload_per_file = group->file_size - LOG_FILE_HDR_SIZE;
+	return(payload_per_file * group->n_files);
+}
+
+/******************************************************//**
+Converts a real offset within a log group to a size offset, that is, one
+where the log file headers are not counted. @return size offset (<= offset) */
+UNIV_INLINE
+ulint
+log_group_calc_size_offset(
+/*=======================*/
+	ulint			offset,	/*!< in: real offset within the
+					log group */
+	const log_group_t*	group)	/*!< in: log group */
+{
+	ulint	n_headers;
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	n_headers = 1 + offset / group->file_size;
+	return(offset - LOG_FILE_HDR_SIZE * n_headers);
+}
+
+/******************************************************//**
+Converts a size offset within a log group to a real offset, that is, one
+where the log file headers are counted as well.
+@return	real offset (>= offset) */
+UNIV_INLINE
+ulint
+log_group_calc_real_offset(
+/*=======================*/
+	ulint			offset,	/*!< in: size offset within the
+					log group */
+	const log_group_t*	group)	/*!< in: log group */
+{
+	ulint	n_headers;
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	n_headers = 1 + offset / (group->file_size - LOG_FILE_HDR_SIZE);
+	return(offset + LOG_FILE_HDR_SIZE * n_headers);
+}
+
+/******************************************************//**
+Finds the real offset within a log group that corresponds to an lsn.
+@return	offset within the log group */
+static
+ulint
+log_group_calc_lsn_offset(
+/*======================*/
+	ib_uint64_t		lsn,	/*!< in: lsn, must be within 4 GB of
+					group->lsn */
+	const log_group_t*	group)	/*!< in: log group */
+{
+	ib_uint64_t	base_lsn;
+	ib_int64_t	base_size_offset;
+	ib_int64_t	capacity;
+	ib_int64_t	delta;
+	ib_int64_t	offset;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	/* Everything below is computed in 64 bits: if the total log
+	file size is > 2 GB, 32-bit integers overflow easily. */
+
+	base_lsn = group->lsn;
+	base_size_offset = (ib_int64_t)
+		log_group_calc_size_offset(group->lsn_offset, group);
+	capacity = (ib_int64_t) log_group_get_capacity(group);
+
+	if (lsn < base_lsn) {
+		/* lsn is behind group->lsn. Going back by delta
+		equals going forward by the complement of delta
+		modulo the capacity of the group. */
+
+		delta = (ib_int64_t) (base_lsn - lsn);
+		delta = capacity - delta % capacity;
+	} else {
+		/* lsn is at or ahead of group->lsn. */
+
+		delta = (ib_int64_t) (lsn - base_lsn);
+	}
+
+	/* Wrap around at the end of the group. */
+	offset = (base_size_offset + delta) % capacity;
+
+	if (sizeof(ulint) == 4) {
+		/* offset must be < 4 GB to fit in a 32-bit ulint */
+		ut_a(offset < (((ib_int64_t) 1) << 32));
+	}
+
+	/* So far the log file headers were left out of the count;
+	convert to a real offset that includes them. */
+
+	return(log_group_calc_real_offset((ulint)offset, group));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	log_debug_writes = FALSE;	/* debug builds only; starts as FALSE */
+#endif /* UNIV_DEBUG */
+
+/*******************************************************************//**
+Finds the log file, and the offset within it, where a given lsn is stored.
+@return	log file number */
+UNIV_INTERN
+ulint
+log_calc_where_lsn_is(
+/*==================*/
+	ib_int64_t*	log_file_offset,	/*!< out: offset in that file
+						(including the header) */
+	ib_uint64_t	first_header_lsn,	/*!< in: first log file start
+						lsn */
+	ib_uint64_t	lsn,			/*!< in: lsn whose position to
+						determine */
+	ulint		n_log_files,		/*!< in: total number of log
+						files */
+	ib_int64_t	log_file_size)		/*!< in: log file size
+						(including the header) */
+{
+	const ib_int64_t	capacity = log_file_size - LOG_FILE_HDR_SIZE;
+	ib_uint64_t		distance;
+
+	if (lsn < first_header_lsn) {
+		/* Lift lsn by whole rounds of the log files until it
+		is no longer below the start lsn of the first file. */
+		const ib_int64_t	add_this_many
+			= 1 + (first_header_lsn - lsn)
+			/ (capacity * (ib_int64_t)n_log_files);
+
+		lsn += add_this_many
+			* capacity * (ib_int64_t)n_log_files;
+	}
+
+	ut_a(lsn >= first_header_lsn);
+	distance = lsn - first_header_lsn;
+
+	*log_file_offset = (ib_int64_t) (distance % capacity)
+		+ LOG_FILE_HDR_SIZE;
+	return(((ulint)(distance / capacity)) % n_log_files);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Moves group->lsn and group->lsn_offset to a given lsn. The fields must
+already correspond to some lsn, for instance, a checkpoint lsn. */
+UNIV_INTERN
+void
+log_group_set_fields(
+/*=================*/
+	log_group_t*	group,	/*!< in/out: group */
+	ib_uint64_t	lsn)	/*!< in: lsn for which the values should be
+				set */
+{
+	const ulint	new_offset = log_group_calc_lsn_offset(lsn, group);
+	group->lsn_offset = new_offset;
+	group->lsn = lsn;
+}
+
+/*****************************************************************//**
+Calculates the recommended highest values for lsn - last_checkpoint_lsn,
+lsn - buf_get_oldest_modification(), and lsn - max_archive_lsn_age.
+@return TRUE; if the smallest log group is too small to accommodate the
+number of OS threads, prints an error and calls exit(1) instead */
+static
+ibool
+log_calc_max_ages(void)
+/*===================*/
+{
+	log_group_t*	group;
+	ulint		margin;
+	ulint		free;	/* NOTE(review): hides free() of <stdlib.h> here */
+	ibool		success		= TRUE;
+	ulint		smallest_capacity;
+	ulint		archive_margin;
+	ulint		smallest_archive_margin;
+
+	mutex_enter(&(log_sys->mutex));
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	ut_ad(group);
+	/* ULINT_MAX is the start value of both running minima */
+	smallest_capacity = ULINT_MAX;
+	smallest_archive_margin = ULINT_MAX;
+
+	while (group) {
+		if (log_group_get_capacity(group) < smallest_capacity) {
+
+			smallest_capacity = log_group_get_capacity(group);
+		}
+		/* Capacity minus one file's data and the extra margin */
+		archive_margin = log_group_get_capacity(group)
+			- (group->file_size - LOG_FILE_HDR_SIZE)
+			- LOG_ARCHIVE_EXTRA_MARGIN;
+
+		if (archive_margin < smallest_archive_margin) {
+
+			smallest_archive_margin = archive_margin;
+		}
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	/* Add extra safety: take one tenth off the smallest capacity */
+	smallest_capacity = smallest_capacity - smallest_capacity / 10;
+
+	/* For each OS thread we must reserve so much free space in the
+	smallest log group that it can accommodate the log entries produced
+	by single query steps: running out of free log space is a serious
+	system error which requires rebooting the database. */
+
+	free = LOG_CHECKPOINT_FREE_PER_THREAD * (10 + srv_thread_concurrency)
+		+ LOG_CHECKPOINT_EXTRA_FREE;
+	if (free >= smallest_capacity / 2) {
+		success = FALSE;
+
+		goto failure;
+	} else {
+		margin = smallest_capacity - free;
+	}
+
+	margin = ut_min(margin, log_sys->adm_checkpoint_interval);
+
+	margin = margin - margin / 10;	/* Add still some extra safety */
+
+	log_sys->log_group_capacity = smallest_capacity;
+	/* Each limit below is margin minus margin divided by its ratio */
+	log_sys->max_modified_age_async = margin
+		- margin / LOG_POOL_PREFLUSH_RATIO_ASYNC;
+	log_sys->max_modified_age_sync = margin
+		- margin / LOG_POOL_PREFLUSH_RATIO_SYNC;
+
+	log_sys->max_checkpoint_age_async = margin - margin
+		/ LOG_POOL_CHECKPOINT_RATIO_ASYNC;
+	log_sys->max_checkpoint_age = margin;
+
+#ifdef UNIV_LOG_ARCHIVE
+	log_sys->max_archived_lsn_age = smallest_archive_margin;
+
+	log_sys->max_archived_lsn_age_async = smallest_archive_margin
+		- smallest_archive_margin / LOG_ARCHIVE_RATIO_ASYNC;
+#endif /* UNIV_LOG_ARCHIVE */
+failure:
+	mutex_exit(&(log_sys->mutex));
+	/* The error is reported only after the log mutex is released */
+	if (!success) {
+		fprintf(stderr,
+			"InnoDB: Error: ib_logfiles are too small"
+			" for innodb_thread_concurrency %lu.\n"
+			"InnoDB: The combined size of ib_logfiles"
+			" should be bigger than\n"
+			"InnoDB: 200 kB * innodb_thread_concurrency.\n"
+			"InnoDB: To get mysqld to start up, set"
+			" innodb_thread_concurrency in my.cnf\n"
+			"InnoDB: to a lower value, for example, to 8."
+			" After an ERROR-FREE shutdown\n"
+			"InnoDB: of mysqld you can adjust the size of"
+			" ib_logfiles, as explained in\n"
+			"InnoDB: " REFMAN "adding-and-removing.html\n"
+			"InnoDB: Cannot continue operation."
+			" Calling exit(1).\n",
+			(ulong)srv_thread_concurrency);
+
+		exit(1);
+	}
+
+	return(success);
+}
+
+/******************************************************//**
+Initializes the log system. Allocates the global log_sys object and
+creates its mutex, events and rw-locks; allocates the log buffer and the
+checkpoint buffer, both aligned to OS_FILE_LOG_BLOCK_SIZE and zero-filled;
+and initializes the first log block so that log_sys->lsn and
+log_sys->buf_free point just past the block header. */
+UNIV_INTERN
+void
+log_init(void)
+/*==========*/
+{
+	log_sys = mem_alloc(sizeof(log_t));
+
+	mutex_create(&log_sys->mutex, SYNC_LOG);
+
+	mutex_enter(&(log_sys->mutex));
+
+	/* Start the lsn from one log block from zero: this way every
+	log record has a start lsn != zero, a fact which we will use */
+
+	log_sys->lsn = LOG_START_LSN;
+
+	ut_a(LOG_BUFFER_SIZE >= 16 * OS_FILE_LOG_BLOCK_SIZE);
+	ut_a(LOG_BUFFER_SIZE >= 4 * UNIV_PAGE_SIZE);
+
+	log_sys->buf_ptr = mem_alloc(LOG_BUFFER_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->buf = ut_align(log_sys->buf_ptr, OS_FILE_LOG_BLOCK_SIZE);
+
+	log_sys->buf_size = LOG_BUFFER_SIZE;	/* excludes the extra
+						OS_FILE_LOG_BLOCK_SIZE bytes
+						allocated for alignment */
+
+	memset(log_sys->buf, '\0', LOG_BUFFER_SIZE);
+
+	log_sys->max_buf_free = log_sys->buf_size / LOG_BUF_FLUSH_RATIO
+		- LOG_BUF_FLUSH_MARGIN;
+	log_sys->check_flush_or_checkpoint = TRUE;
+	UT_LIST_INIT(log_sys->log_groups);
+
+	log_sys->n_log_ios = 0;
+
+	log_sys->n_log_ios_old = log_sys->n_log_ios;
+	log_sys->last_printout_time = time(NULL);
+	/*---------------------------- Write and flush bookkeeping. Both
+	events below are set right after they are created. */
+
+	log_sys->buf_next_to_write = 0;
+
+	log_sys->write_lsn = 0;
+	log_sys->current_flush_lsn = 0;
+	log_sys->flushed_to_disk_lsn = 0;
+
+	log_sys->written_to_some_lsn = log_sys->lsn;
+	log_sys->written_to_all_lsn = log_sys->lsn;
+
+	log_sys->n_pending_writes = 0;
+
+	log_sys->no_flush_event = os_event_create(NULL);
+
+	os_event_set(log_sys->no_flush_event);
+
+	log_sys->one_flushed_event = os_event_create(NULL);
+
+	os_event_set(log_sys->one_flushed_event);
+
+	/*---------------------------- Checkpoint state and the aligned,
+	zero-filled checkpoint buffer. */
+	log_sys->adm_checkpoint_interval = ULINT_MAX;
+
+	log_sys->next_checkpoint_no = 0;
+	log_sys->last_checkpoint_lsn = log_sys->lsn;
+	log_sys->n_pending_checkpoint_writes = 0;
+
+	rw_lock_create(&log_sys->checkpoint_lock, SYNC_NO_ORDER_CHECK);
+
+	log_sys->checkpoint_buf_ptr = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+	log_sys->checkpoint_buf = ut_align(log_sys->checkpoint_buf_ptr,
+					   OS_FILE_LOG_BLOCK_SIZE);
+	memset(log_sys->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+	/*----------------------------*/
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* Under MySQL, log archiving is always off */
+	log_sys->archiving_state = LOG_ARCH_OFF;
+	log_sys->archived_lsn = log_sys->lsn;
+	log_sys->next_archived_lsn = 0;
+
+	log_sys->n_pending_archive_ios = 0;
+
+	rw_lock_create(&log_sys->archive_lock, SYNC_NO_ORDER_CHECK);
+
+	log_sys->archive_buf = NULL;
+
+	/* ut_align(
+	ut_malloc(LOG_ARCHIVE_BUF_SIZE
+	+ OS_FILE_LOG_BLOCK_SIZE),
+	OS_FILE_LOG_BLOCK_SIZE); */
+	log_sys->archive_buf_size = 0;
+
+	/* memset(log_sys->archive_buf, '\0', LOG_ARCHIVE_BUF_SIZE); */
+
+	log_sys->archiving_on = os_event_create(NULL);
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/*---------------------------- Initialize the first block of the log
+	buffer; buf_free and the lsn are then moved past the block header. */
+
+	log_block_init(log_sys->buf, log_sys->lsn);
+	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+	log_sys->lsn = LOG_START_LSN + LOG_BLOCK_HDR_SIZE;
+
+	mutex_exit(&(log_sys->mutex));
+
+#ifdef UNIV_LOG_DEBUG
+	recv_sys_create();
+	recv_sys_init(buf_pool_get_curr_size());
+
+	recv_sys->parse_start_lsn = log_sys->lsn;
+	recv_sys->scanned_lsn = log_sys->lsn;
+	recv_sys->scanned_checkpoint_no = 0;
+	recv_sys->recovered_lsn = log_sys->lsn;
+	recv_sys->limit_lsn = IB_ULONGLONG_MAX;
+#endif /* UNIV_LOG_DEBUG */
+}
+
+/******************************************************************//**
+Creates a log group object and adds it to the log system. Allocates one
+block-aligned, zero-filled header buffer per log file and a checkpoint
+buffer, appends the group to log_sys->log_groups, and finally asserts
+that log_calc_max_ages() succeeds. */
+UNIV_INTERN
+void
+log_group_init(
+/*===========*/
+	ulint	id,			/*!< in: group id */
+	ulint	n_files,		/*!< in: number of log files */
+	ulint	file_size,		/*!< in: log file size in bytes */
+	ulint	space_id,		/*!< in: space id of the file space
+					which contains the log files of this
+					group */
+	ulint	archive_space_id __attribute__((unused)))
+					/*!< in: space id of the file space
+					which contains some archived log
+					files for this group; currently, only
+					for the first log group this is
+					used */
+{
+	log_group_t*	new_group;
+	ulint		file_idx;
+
+	new_group = mem_alloc(sizeof(log_group_t));
+
+	/* Plain scalar fields */
+	new_group->id = id;
+	new_group->n_files = n_files;
+	new_group->file_size = file_size;
+	new_group->space_id = space_id;
+	new_group->state = LOG_GROUP_OK;
+	new_group->lsn = LOG_START_LSN;
+	new_group->lsn_offset = LOG_FILE_HDR_SIZE;
+	new_group->n_pending_writes = 0;
+
+	/* Pointer arrays: the *_ptr arrays keep the raw allocations, the
+	other arrays keep the block-aligned addresses inside them */
+	new_group->file_header_bufs_ptr = mem_alloc(n_files * sizeof(byte*));
+	new_group->file_header_bufs = mem_alloc(n_files * sizeof(byte*));
+#ifdef UNIV_LOG_ARCHIVE
+	new_group->archive_file_header_bufs_ptr
+		= mem_alloc(n_files * sizeof(byte*));
+	new_group->archive_file_header_bufs
+		= mem_alloc(n_files * sizeof(byte*));
+#endif /* UNIV_LOG_ARCHIVE */
+
+	for (file_idx = 0; file_idx < n_files; file_idx++) {
+		byte*	raw_hdr;
+
+		/* One extra log block is allocated so that the header
+		can be aligned to a log block boundary */
+		raw_hdr = mem_alloc(LOG_FILE_HDR_SIZE
+				    + OS_FILE_LOG_BLOCK_SIZE);
+
+		new_group->file_header_bufs_ptr[file_idx] = raw_hdr;
+		new_group->file_header_bufs[file_idx]
+			= ut_align(raw_hdr, OS_FILE_LOG_BLOCK_SIZE);
+
+		memset(new_group->file_header_bufs[file_idx], '\0',
+		       LOG_FILE_HDR_SIZE);
+
+#ifdef UNIV_LOG_ARCHIVE
+		raw_hdr = mem_alloc(LOG_FILE_HDR_SIZE
+				    + OS_FILE_LOG_BLOCK_SIZE);
+
+		new_group->archive_file_header_bufs_ptr[file_idx] = raw_hdr;
+		new_group->archive_file_header_bufs[file_idx]
+			= ut_align(raw_hdr, OS_FILE_LOG_BLOCK_SIZE);
+
+		memset(new_group->archive_file_header_bufs[file_idx], '\0',
+		       LOG_FILE_HDR_SIZE);
+#endif /* UNIV_LOG_ARCHIVE */
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	new_group->archive_space_id = archive_space_id;
+
+	new_group->archived_file_no = 0;
+	new_group->archived_offset = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/* Checkpoint buffer: aligned and zero-filled like the headers */
+	new_group->checkpoint_buf_ptr
+		= mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+	new_group->checkpoint_buf = ut_align(new_group->checkpoint_buf_ptr,
+					     OS_FILE_LOG_BLOCK_SIZE);
+
+	memset(new_group->checkpoint_buf, '\0', OS_FILE_LOG_BLOCK_SIZE);
+
+	UT_LIST_ADD_LAST(log_groups, log_sys->log_groups, new_group);
+
+	ut_a(log_calc_max_ages());
+}
+
+/******************************************************************//**
+Sets the events that waiters of a log flush are waiting for, as selected
+by the bits in 'code'. The caller must own the log mutex. */
+UNIV_INLINE
+void
+log_flush_do_unlocks(
+/*=================*/
+	ulint	code)	/*!< in: any ORed combination of LOG_UNLOCK_FLUSH_LOCK
+			and LOG_UNLOCK_NONE_FLUSHED_LOCK */
+{
+	const ibool	signal_one_flushed
+		= (code & LOG_UNLOCK_NONE_FLUSHED_LOCK) != 0;
+	const ibool	signal_no_flush
+		= (code & LOG_UNLOCK_FLUSH_LOCK) != 0;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	/* NOTE that the log mutex must be owned while the events are set.
+	Transactions wait for these events, and at the moment an event is
+	set the log flush they were waiting for must have ended. Without
+	the mutex, the i/o-thread calling this function might be preempted
+	for a while; when it resumed, a new flush might already have been
+	started, and this function would erroneously signal the NEW flush
+	as completed. Thus, the state of these events is changed atomically
+	together with the state of log_sys->n_pending_writes etc. */
+
+	if (signal_one_flushed) {
+		os_event_set(log_sys->one_flushed_event);
+	}
+
+	if (signal_no_flush) {
+		os_event_set(log_sys->no_flush_event);
+	}
+}
+
+/******************************************************************//**
+Checks if a flush is completed for a log group and does the completion
+routine if yes. The first group found with no pending writes after a
+flush was started records log_sys->written_to_some_lsn and sets
+log_sys->one_flushed.
+@return	LOG_UNLOCK_NONE_FLUSHED_LOCK or 0 */
+UNIV_INLINE
+ulint
+log_group_check_flush_completion(
+/*=============================*/
+	log_group_t*	group)	/*!< in: log group */
+{
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	if (group->n_pending_writes != 0) {
+		/* This group still has writes in progress */
+
+		return(0);
+	}
+
+	if (log_sys->one_flushed) {
+		/* Some group was already reported as flushed */
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+
+			fprintf(stderr, "Log flushed to group %lu\n",
+				(ulong) group->id);
+		}
+#endif /* UNIV_DEBUG */
+		return(0);
+	}
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"Log flushed first to group %lu\n",
+			(ulong) group->id);
+	}
+#endif /* UNIV_DEBUG */
+	log_sys->written_to_some_lsn = log_sys->write_lsn;
+	log_sys->one_flushed = TRUE;
+
+	return(LOG_UNLOCK_NONE_FLUSHED_LOCK);
+}
+
+/******************************************************//**
+Checks if a flush is completed and does the completion routine if yes:
+records the written lsn and the next offset to write, and moves the
+remaining log buffer content to the start of the buffer when the written
+part exceeds half of log_sys->max_buf_free.
+@return	LOG_UNLOCK_FLUSH_LOCK or 0 */
+static
+ulint
+log_sys_check_flush_completion(void)
+/*================================*/
+{
+	ulint	shift;
+	ulint	copy_end;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	if (log_sys->n_pending_writes != 0) {
+		/* Some write is still in progress */
+
+		return(0);
+	}
+
+	log_sys->written_to_all_lsn = log_sys->write_lsn;
+	log_sys->buf_next_to_write = log_sys->write_end_offset;
+
+	if (log_sys->write_end_offset > log_sys->max_buf_free / 2) {
+		/* Move the log buffer content to the start of the
+		buffer; both ends of the moved area are aligned to a
+		log block boundary */
+
+		shift = ut_calc_align_down(log_sys->write_end_offset,
+					   OS_FILE_LOG_BLOCK_SIZE);
+		copy_end = ut_calc_align(log_sys->buf_free,
+					 OS_FILE_LOG_BLOCK_SIZE);
+
+		ut_memmove(log_sys->buf, log_sys->buf + shift,
+			   copy_end - shift);
+
+		log_sys->buf_free -= shift;
+		log_sys->buf_next_to_write -= shift;
+	}
+
+	return(LOG_UNLOCK_FLUSH_LOCK);
+}
+
+/******************************************************//**
+Completes an i/o to a log file. The kind of i/o is encoded in the 'group'
+argument: the address of log_archive_io marks an archive write (only when
+UNIV_LOG_ARCHIVE is defined), and a pointer with its lowest bit set marks
+a checkpoint write to the group at (pointer - 1). Any other value would
+mean a normal log write, which must not end up here because the logs are
+currently written synchronously. */
+UNIV_INTERN
+void
+log_io_complete(
+/*============*/
+	log_group_t*	group)	/*!< in: log group or a dummy pointer */
+{
+	ulint	unlock;
+
+#ifdef UNIV_LOG_ARCHIVE
+	if ((byte*)group == &log_archive_io) {
+		/* It was an archive write */
+
+		log_io_complete_archive();
+
+		return;
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	if ((ulint)group & 0x1UL) {
+		/* It was a checkpoint write: strip the tag bit to get the
+		real group pointer back */
+		group = (log_group_t*)((ulint)group - 1);
+
+		if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+		    && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
+		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
+			fil_flush(group->space_id);
+		}
+
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+			/* group->id is a ulint: cast it to match the %lu
+			conversion, as the other debug printouts do */
+			fprintf(stderr,
+				"Checkpoint info written to group %lu\n",
+				(ulong) group->id);
+		}
+#endif /* UNIV_DEBUG */
+		log_io_complete_checkpoint();
+
+		return;
+	}
+
+	ut_error;	/*!< We currently use synchronous writing of the
+			logs and cannot end up here! */
+
+	/* The code below is the completion routine of a normal log write;
+	it is kept after ut_error for the case of asynchronous writes */
+
+	if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+	    && srv_unix_file_flush_method != SRV_UNIX_ALL_O_DIRECT
+	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+	    && srv_flush_log_at_trx_commit != 2) {
+
+		fil_flush(group->space_id);
+	}
+
+	mutex_enter(&(log_sys->mutex));
+	ut_ad(!recv_no_log_write);
+
+	ut_a(group->n_pending_writes > 0);
+	ut_a(log_sys->n_pending_writes > 0);
+
+	group->n_pending_writes--;
+	log_sys->n_pending_writes--;
+
+	unlock = log_group_check_flush_completion(group);
+	unlock = unlock | log_sys_check_flush_completion();
+
+	log_flush_do_unlocks(unlock);
+
+	mutex_exit(&(log_sys->mutex));
+}
+
+/******************************************************//**
+Fills in the header buffer of the nth log file of a group (group id and
+start lsn, and blanks the ibbackup label field) and, if log_do_write is
+set, writes the first log block of that buffer to the start of the file. */
+static
+void
+log_group_file_header_flush(
+/*========================*/
+	log_group_t*	group,		/*!< in: log group */
+	ulint		nth_file,	/*!< in: header to the nth file in the
+					log file space */
+	ib_uint64_t	start_lsn)	/*!< in: log file data starts at this
+					lsn */
+{
+	byte*	hdr_buf;
+	ulint	file_start;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	ut_ad(!recv_no_log_write);
+	ut_a(nth_file < group->n_files);
+
+	hdr_buf = group->file_header_bufs[nth_file];
+
+	mach_write_to_4(hdr_buf + LOG_GROUP_ID, group->id);
+	mach_write_ull(hdr_buf + LOG_FILE_START_LSN, start_lsn);
+
+	/* Wipe over possible label of ibbackup --restore */
+	memcpy(hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, "    ", 4);
+
+	/* Byte offset of the nth file within the log file space */
+	file_start = nth_file * group->file_size;
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"Writing log file header to group %lu file %lu\n",
+			(ulong) group->id, (ulong) nth_file);
+	}
+#endif /* UNIV_DEBUG */
+	if (!log_do_write) {
+
+		return;
+	}
+
+	log_sys->n_log_ios++;
+
+	srv_os_log_pending_writes++;
+
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
+	       file_start / UNIV_PAGE_SIZE,
+	       file_start % UNIV_PAGE_SIZE,
+	       OS_FILE_LOG_BLOCK_SIZE,
+	       hdr_buf, group);
+
+	srv_os_log_pending_writes--;
+}
+
+/******************************************************//**
+Calculates the checksum of a log block and stores it to the checksum
+field of the block before the block is written to a log file. This
+checksum is used in recovery to check the consistency of a log block. */
+static
+void
+log_block_store_checksum(
+/*=====================*/
+	byte*	block)	/*!< in/out: pointer to a log block */
+{
+	ulint	checksum;
+
+	checksum = log_block_calc_checksum(block);
+
+	log_block_set_checksum(block, checksum);
+}
+
+/******************************************************//**
+Writes a buffer to a log file group. The checksum of every log block is
+stored into the buffer before the block is written. If the data does not
+fit into the current log file, only the part that fits is written and the
+function loops to write the remainder, with write_header forced to TRUE
+so that a log file header is flushed when the next piece starts at
+LOG_FILE_HDR_SIZE within a file. Nothing is written to the files if
+log_do_write is not set. */
+UNIV_INTERN
+void
+log_group_write_buf(
+/*================*/
+	log_group_t*	group,		/*!< in: log group */
+	byte*		buf,		/*!< in: buffer */
+	ulint		len,		/*!< in: buffer len; must be divisible
+					by OS_FILE_LOG_BLOCK_SIZE */
+	ib_uint64_t	start_lsn,	/*!< in: start lsn of the buffer; must
+					be divisible by
+					OS_FILE_LOG_BLOCK_SIZE */
+	ulint		new_data_offset)/*!< in: start offset of new data in
+					buf: this parameter is used to decide
+					if we have to write a new log file
+					header */
+{
+	ulint	write_len;
+	ibool	write_header;
+	ulint	next_offset;
+	ulint	i;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	ut_ad(!recv_no_log_write);
+	ut_a(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_a(((ulint) start_lsn) % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+	if (new_data_offset == 0) {
+		write_header = TRUE;
+	} else {
+		write_header = FALSE;
+	}
+loop:
+	if (len == 0) {
+
+		return;
+	}
+
+	next_offset = log_group_calc_lsn_offset(start_lsn, group);
+
+	if ((next_offset % group->file_size == LOG_FILE_HDR_SIZE)
+	    && write_header) {
+		/* We start to write a new log file instance in the group */
+
+		log_group_file_header_flush(group,
+					    next_offset / group->file_size,
+					    start_lsn);
+		srv_os_log_written+= OS_FILE_LOG_BLOCK_SIZE;
+		srv_log_writes++;
+	}
+
+	if ((next_offset % group->file_size) + len > group->file_size) {
+
+		write_len = group->file_size
+			- (next_offset % group->file_size); /* only up to
+							the end of this
+							log file */
+	} else {
+		write_len = len;
+	}
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+
+		fprintf(stderr,
+			"Writing log file segment to group %lu"
+			" offset %lu len %lu\n"
+			"start lsn %llu\n"
+			"First block n:o %lu last block n:o %lu\n",
+			(ulong) group->id, (ulong) next_offset,
+			(ulong) write_len,
+			start_lsn,
+			(ulong) log_block_get_hdr_no(buf),
+			(ulong) log_block_get_hdr_no(
+				buf + write_len - OS_FILE_LOG_BLOCK_SIZE));
+		ut_a(log_block_get_hdr_no(buf)
+		     == log_block_convert_lsn_to_no(start_lsn));
+
+		for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+
+			ut_a(log_block_get_hdr_no(buf) + i
+			     == log_block_get_hdr_no(
+				     buf + i * OS_FILE_LOG_BLOCK_SIZE));
+		}
+	}
+#endif /* UNIV_DEBUG */
+	/* Calculate the checksums for each log block and write them to
+	the trailer fields of the log blocks */
+
+	for (i = 0; i < write_len / OS_FILE_LOG_BLOCK_SIZE; i++) {
+		log_block_store_checksum(buf + i * OS_FILE_LOG_BLOCK_SIZE);
+	}
+
+	if (log_do_write) {
+		log_sys->n_log_ios++;
+
+		srv_os_log_pending_writes++;
+
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->space_id, 0,
+		       next_offset / UNIV_PAGE_SIZE,
+		       next_offset % UNIV_PAGE_SIZE, write_len, buf, group);
+
+		srv_os_log_pending_writes--;
+
+		srv_os_log_written+= write_len;
+		srv_log_writes++;
+	}
+
+	if (write_len < len) {
+		start_lsn += write_len;
+		len -= write_len;
+		buf += write_len;
+
+		write_header = TRUE;	/* the remainder did not fit into
+					this log file */
+
+		goto loop;
+	}
+}
+
+/******************************************************//**
+This function is called, e.g., when a transaction wants to commit. It checks
+that the log has been written to the log file up to the last log entry written
+by the transaction. If there is a flush running, it waits and checks if the
+flush flushed enough. If not, starts a new flush.
+The function returns at once, without writing anything, while
+recv_no_ibuf_operations is set. When this call does the write itself, it
+always writes the whole unwritten part of the log buffer (up to
+log_sys->lsn, not only up to the requested lsn), and it increments and
+decrements the pending write counters and releases the waiters within
+this same call. */
+UNIV_INTERN
+void
+log_write_up_to(
+/*============*/
+	ib_uint64_t	lsn,	/*!< in: log sequence number up to which
+				the log should be written,
+				IB_ULONGLONG_MAX if not specified */
+	ulint		wait,	/*!< in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+				or LOG_WAIT_ALL_GROUPS */
+	ibool		flush_to_disk)
+				/*!< in: TRUE if we want the written log
+				also to be flushed to disk */
+{
+	log_group_t*	group;
+	ulint		start_offset;
+	ulint		end_offset;
+	ulint		area_start;
+	ulint		area_end;
+#ifdef UNIV_DEBUG
+	ulint		loop_count	= 0;
+#endif /* UNIV_DEBUG */
+	ulint		unlock;
+
+	if (recv_no_ibuf_operations) {
+		/* Recovery is running and no operations on the log files are
+		allowed yet (the variable name .._no_ibuf_.. is misleading) */
+
+		return;
+	}
+
+loop:
+#ifdef UNIV_DEBUG
+	loop_count++;
+
+	ut_ad(loop_count < 5);
+
+# if 0
+	if (loop_count > 2) {
+		fprintf(stderr, "Log loop count %lu\n", loop_count);
+	}
+# endif
+#endif
+
+	mutex_enter(&(log_sys->mutex));
+	ut_ad(!recv_no_log_write);
+
+	if (flush_to_disk
+	    && log_sys->flushed_to_disk_lsn >= lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		return;
+	}
+
+	if (!flush_to_disk
+	    && (log_sys->written_to_all_lsn >= lsn
+		|| (log_sys->written_to_some_lsn >= lsn
+		    && wait != LOG_WAIT_ALL_GROUPS))) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		return;
+	}
+
+	if (log_sys->n_pending_writes > 0) {
+		/* A write (+ possibly flush to disk) is running */
+
+		if (flush_to_disk
+		    && log_sys->current_flush_lsn >= lsn) {
+			/* The write + flush will write enough: wait for it to
+			complete  */
+
+			goto do_waits;
+		}
+
+		if (!flush_to_disk
+		    && log_sys->write_lsn >= lsn) {
+			/* The write will write enough: wait for it to
+			complete  */
+
+			goto do_waits;
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		/* Wait for the write to complete and try to start a new
+		write */
+
+		os_event_wait(log_sys->no_flush_event);
+
+		goto loop;
+	}
+
+	if (!flush_to_disk
+	    && log_sys->buf_free == log_sys->buf_next_to_write) {
+		/* Nothing to write and no flush to disk requested */
+
+		mutex_exit(&(log_sys->mutex));
+
+		return;
+	}
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"Writing log from %llu up to lsn %llu\n",
+			log_sys->written_to_all_lsn,
+			log_sys->lsn);
+	}
+#endif /* UNIV_DEBUG */
+	log_sys->n_pending_writes++;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	group->n_pending_writes++;	/*!< We assume here that we have only
+					one log group! */
+
+	os_event_reset(log_sys->no_flush_event);
+	os_event_reset(log_sys->one_flushed_event);
+
+	start_offset = log_sys->buf_next_to_write;
+	end_offset = log_sys->buf_free;
+
+	area_start = ut_calc_align_down(start_offset, OS_FILE_LOG_BLOCK_SIZE);
+	area_end = ut_calc_align(end_offset, OS_FILE_LOG_BLOCK_SIZE);
+
+	ut_ad(area_end - area_start > 0);
+
+	log_sys->write_lsn = log_sys->lsn;
+
+	if (flush_to_disk) {
+		log_sys->current_flush_lsn = log_sys->lsn;
+	}
+
+	log_sys->one_flushed = FALSE;
+
+	log_block_set_flush_bit(log_sys->buf + area_start, TRUE);
+	log_block_set_checkpoint_no(
+		log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+		log_sys->next_checkpoint_no);
+
+	/* Copy the last, incompletely written, log block a log block length
+	up, so that when the flush operation writes from the log buffer, the
+	segment to write will not be changed by writers to the log */
+
+	ut_memcpy(log_sys->buf + area_end,
+		  log_sys->buf + area_end - OS_FILE_LOG_BLOCK_SIZE,
+		  OS_FILE_LOG_BLOCK_SIZE);
+
+	log_sys->buf_free += OS_FILE_LOG_BLOCK_SIZE;
+	log_sys->write_end_offset = log_sys->buf_free;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	/* Do the write to the log files */
+
+	while (group) {
+		log_group_write_buf(
+			group, log_sys->buf + area_start,
+			area_end - area_start,
+			ut_uint64_align_down(log_sys->written_to_all_lsn,
+					     OS_FILE_LOG_BLOCK_SIZE),
+			start_offset - area_start);
+
+		log_group_set_fields(group, log_sys->write_lsn);
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (srv_unix_file_flush_method == SRV_UNIX_O_DSYNC
+	    || srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+		/* O_DSYNC means the OS did not buffer the log file at all:
+		so we have also flushed to disk what we have written.
+		NOTE(review): the log mutex was released above, so this
+		assignment and the one in the branch below are done without
+		holding log_sys->mutex; presumably n_pending_writes > 0
+		keeps other writers out meanwhile - confirm */
+
+		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+
+	} else if (flush_to_disk) {
+
+		group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+		fil_flush(group->space_id);
+		log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
+	}
+
+	mutex_enter(&(log_sys->mutex));
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	ut_a(group->n_pending_writes == 1);
+	ut_a(log_sys->n_pending_writes == 1);
+
+	group->n_pending_writes--;
+	log_sys->n_pending_writes--;
+
+	unlock = log_group_check_flush_completion(group);
+	unlock = unlock | log_sys_check_flush_completion();
+
+	log_flush_do_unlocks(unlock);
+
+	mutex_exit(&(log_sys->mutex));
+
+	return;
+
+do_waits:
+	mutex_exit(&(log_sys->mutex));
+
+	switch (wait) {
+	case LOG_WAIT_ONE_GROUP:
+		os_event_wait(log_sys->one_flushed_event);
+		break;
+	case LOG_WAIT_ALL_GROUPS:
+		os_event_wait(log_sys->no_flush_event);
+		break;
+#ifdef UNIV_DEBUG
+	case LOG_NO_WAIT:
+		break;
+	default:
+		ut_error;
+#endif /* UNIV_DEBUG */
+	}
+}
+
+/****************************************************************//**
+Does a synchronous flush of the log buffer to disk: reads the current
+lsn under the log mutex and then writes and flushes the log up to it,
+waiting for all log groups. */
+UNIV_INTERN
+void
+log_buffer_flush_to_disk(void)
+/*==========================*/
+{
+	log_t*		log	= log_sys;
+	ib_uint64_t	current_lsn;
+
+	mutex_enter(&(log->mutex));
+	current_lsn = log->lsn;
+	mutex_exit(&(log->mutex));
+
+	log_write_up_to(current_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+}
+
+/****************************************************************//**
+Writes the log buffer to the log file and, if 'flush' is set, also forces
+a flush of the log file. This is meant to be called from the background
+master thread only, as it does not wait for the write (+ possible flush)
+to finish. */
+UNIV_INTERN
+void
+log_buffer_sync_in_background(
+/*==========================*/
+	ibool	flush)	/*!< in: flush the logs to disk */
+{
+	log_t*		log	= log_sys;
+	ib_uint64_t	current_lsn;
+
+	/* Take a snapshot of the current lsn under the log mutex */
+	mutex_enter(&(log->mutex));
+	current_lsn = log->lsn;
+	mutex_exit(&(log->mutex));
+
+	log_write_up_to(current_lsn, LOG_NO_WAIT, flush);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log buffer, such
+that a new log entry can be catenated without an immediate need for a flush.
+A write is started only if the buffer is filled beyond max_buf_free and no
+write is already pending. */
+static
+void
+log_flush_margin(void)
+/*==================*/
+{
+	ib_uint64_t	target_lsn	= 0;
+
+	mutex_enter(&(log_sys->mutex));
+
+	/* If a flush is already running, we hope that it will provide
+	enough free space and start nothing new */
+
+	if (log_sys->buf_free > log_sys->max_buf_free
+	    && log_sys->n_pending_writes == 0) {
+
+		target_lsn = log_sys->lsn;
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (target_lsn != 0) {
+		log_write_up_to(target_lsn, LOG_NO_WAIT, FALSE);
+	}
+}
+
+/****************************************************************//**
+Advances the smallest lsn for which there are unflushed dirty blocks in the
+buffer pool, by running a BUF_FLUSH_LIST flush batch. NOTE: this function
+may only be called if the calling thread owns no synchronization objects!
+@return FALSE if there was a flush batch of the same type running,
+which means that we could not start this flush batch */
+UNIV_INTERN
+ibool
+log_preflush_pool_modified_pages(
+/*=============================*/
+	ib_uint64_t	new_oldest,	/*!< in: try to advance
+					oldest_modified_lsn at least
+					to this lsn */
+	ibool		sync)		/*!< in: TRUE if synchronous
+					operation is desired */
+{
+	ulint	n_flushed;
+
+	if (recv_recovery_on) {
+		/* While recovery is running, all log records must first be
+		applied to their respective file pages, so that the pages
+		get the right modify lsn values. Otherwise there might be
+		pages on disk which are not yet recovered to the current
+		lsn; even after calling this function we could then not
+		know how up-to-date the disk version of the database is,
+		and could not make a new checkpoint on the basis of the
+		info on the buffer pool only. */
+
+		recv_apply_hashed_log_recs(TRUE);
+	}
+
+	n_flushed = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX, new_oldest);
+
+	if (sync) {
+		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+	}
+
+	/* ULINT_UNDEFINED tells that the batch could not be started */
+	return(n_flushed == ULINT_UNDEFINED ? FALSE : TRUE);
+}
+
+/******************************************************//**
+Completes a checkpoint: advances the checkpoint number, records the lsn
+of the checkpoint just written as the last checkpoint lsn, and releases
+the x-lock on the checkpoint lock. The caller must own the log mutex and
+there must be no pending checkpoint writes. */
+static
+void
+log_complete_checkpoint(void)
+/*=========================*/
+{
+	log_t*	log	= log_sys;
+
+	ut_ad(mutex_own(&(log->mutex)));
+	ut_ad(log->n_pending_checkpoint_writes == 0);
+
+	log->next_checkpoint_no += 1;
+	log->last_checkpoint_lsn = log->next_checkpoint_lsn;
+
+	rw_lock_x_unlock_gen(&(log->checkpoint_lock), LOG_CHECKPOINT);
+}
+
+/******************************************************//**
+Completes an asynchronous checkpoint info write i/o to a log file.
+Decrements the count of pending checkpoint writes under the log mutex
+and completes the checkpoint when the count drops to zero. */
+static
+void
+log_io_complete_checkpoint(void)
+/*============================*/
+{
+	mutex_enter(&(log_sys->mutex));
+
+	ut_ad(log_sys->n_pending_checkpoint_writes > 0);
+
+	if (--log_sys->n_pending_checkpoint_writes == 0) {
+		/* This was the last pending checkpoint write */
+		log_complete_checkpoint();
+	}
+
+	mutex_exit(&(log_sys->mutex));
+}
+
+/*******************************************************************//**
+Writes info to a checkpoint about a log group: stores the archived file
+number and offset as two 4-byte values into the nth 8-byte slot of the
+checkpoint group array. */
+static
+void
+log_checkpoint_set_nth_group_info(
+/*==============================*/
+	byte*	buf,	/*!< in: buffer for checkpoint info */
+	ulint	n,	/*!< in: nth slot */
+	ulint	file_no,/*!< in: archived file number */
+	ulint	offset)	/*!< in: archived file offset */
+{
+	byte*	slot;
+
+	ut_ad(n < LOG_MAX_N_GROUPS);
+
+	/* Each group occupies 8 bytes in the array */
+	slot = buf + LOG_CHECKPOINT_GROUP_ARRAY + 8 * n;
+
+	mach_write_to_4(slot + LOG_CHECKPOINT_ARCHIVED_FILE_NO, file_no);
+	mach_write_to_4(slot + LOG_CHECKPOINT_ARCHIVED_OFFSET, offset);
+}
+
+/*******************************************************************//**
+Gets info from a checkpoint about a log group: reads the archived file
+number and offset as two 4-byte values from the nth 8-byte slot of the
+checkpoint group array. */
+UNIV_INTERN
+void
+log_checkpoint_get_nth_group_info(
+/*==============================*/
+	const byte*	buf,	/*!< in: buffer containing checkpoint info */
+	ulint		n,	/*!< in: nth slot */
+	ulint*		file_no,/*!< out: archived file number */
+	ulint*		offset)	/*!< out: archived file offset */
+{
+	const byte*	slot;
+
+	ut_ad(n < LOG_MAX_N_GROUPS);
+
+	/* Each group occupies 8 bytes in the array */
+	slot = buf + LOG_CHECKPOINT_GROUP_ARRAY + 8 * n;
+
+	*file_no = mach_read_from_4(slot + LOG_CHECKPOINT_ARCHIVED_FILE_NO);
+	*offset = mach_read_from_4(slot + LOG_CHECKPOINT_ARCHIVED_OFFSET);
+}
+
+/******************************************************//**
+Writes the checkpoint info to a log group header. The info is built in
+group->checkpoint_buf: checkpoint number and lsn, the offset of that lsn
+within the group, the log buffer size, the per-group archive info and two
+checksums. When UNIV_LOG_ARCHIVE is not defined, the
+LOG_CHECKPOINT_ARCHIVED_LSN field is filled with the lsn offset written
+as a 64-bit value instead of an archived lsn. If log_do_write is set, one
+log block of the buffer is then written to LOG_CHECKPOINT_1 or
+LOG_CHECKPOINT_2, with the group pointer + 1 passed as the i/o message. */
+static
+void
+log_group_checkpoint(
+/*=================*/
+	log_group_t*	group)	/*!< in: log group */
+{
+	log_group_t*	group2;
+#ifdef UNIV_LOG_ARCHIVE
+	ib_uint64_t	archived_lsn;
+	ib_uint64_t	next_archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+	ulint		write_offset;
+	ulint		fold;
+	byte*		buf;
+	ulint		i;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+#if LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE
+# error "LOG_CHECKPOINT_SIZE > OS_FILE_LOG_BLOCK_SIZE"
+#endif
+
+	buf = group->checkpoint_buf;
+
+	mach_write_ull(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
+	mach_write_ull(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
+
+	mach_write_to_4(buf + LOG_CHECKPOINT_OFFSET,
+			log_group_calc_lsn_offset(
+				log_sys->next_checkpoint_lsn, group));
+
+	mach_write_to_4(buf + LOG_CHECKPOINT_LOG_BUF_SIZE, log_sys->buf_size);
+
+#ifdef UNIV_LOG_ARCHIVE
+#error "UNIV_LOG_ARCHIVE could not be enabled"
+	if (log_sys->archiving_state == LOG_ARCH_OFF) {
+		archived_lsn = IB_ULONGLONG_MAX;
+	} else {
+		archived_lsn = log_sys->archived_lsn;
+
+		if (archived_lsn != log_sys->next_archived_lsn) {
+			next_archived_lsn = log_sys->next_archived_lsn;
+			/* For debugging only */
+		}
+	}
+
+	mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN, archived_lsn);
+#else /* UNIV_LOG_ARCHIVE */
+	mach_write_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN,
+			(ib_uint64_t)log_group_calc_lsn_offset(
+				log_sys->next_checkpoint_lsn, group));
+#endif /* UNIV_LOG_ARCHIVE */
+
+	for (i = 0; i < LOG_MAX_N_GROUPS; i++) {
+		log_checkpoint_set_nth_group_info(buf, i, 0, 0);
+	}
+
+	group2 = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	while (group2) {
+		log_checkpoint_set_nth_group_info(buf, group2->id,
+#ifdef UNIV_LOG_ARCHIVE
+						  group2->archived_file_no,
+						  group2->archived_offset
+#else /* UNIV_LOG_ARCHIVE */
+						  0, 0
+#endif /* UNIV_LOG_ARCHIVE */
+						  );
+
+		group2 = UT_LIST_GET_NEXT(log_groups, group2);
+	}
+
+	fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_1, fold);
+
+	fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+	mach_write_to_4(buf + LOG_CHECKPOINT_CHECKSUM_2, fold);
+
+	/* Starting from InnoDB-3.23.50, we also write info on allocated
+	size in the tablespace.
+	NOTE(review): these two fields are written after both checksums
+	were computed; presumably they lie outside the checksummed ranges -
+	confirm against the LOG_CHECKPOINT_* offsets */
+
+	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_FREE_LIMIT,
+			log_fsp_current_free_limit);
+
+	mach_write_to_4(buf + LOG_CHECKPOINT_FSP_MAGIC_N,
+			LOG_CHECKPOINT_FSP_MAGIC_N_VAL);
+
+	/* We alternate the physical place of the checkpoint info in the first
+	log file: even checkpoint numbers go to LOG_CHECKPOINT_1, odd ones
+	to LOG_CHECKPOINT_2 */
+
+	if ((log_sys->next_checkpoint_no & 1) == 0) {
+		write_offset = LOG_CHECKPOINT_1;
+	} else {
+		write_offset = LOG_CHECKPOINT_2;
+	}
+
+	if (log_do_write) {
+		if (log_sys->n_pending_checkpoint_writes == 0) {
+
+			rw_lock_x_lock_gen(&(log_sys->checkpoint_lock),
+					   LOG_CHECKPOINT);
+		}
+
+		log_sys->n_pending_checkpoint_writes++;
+
+		log_sys->n_log_ios++;
+
+		/* We send as the last parameter the group machine address
+		added with 1, as we want to distinguish between a normal log
+		file write and a checkpoint field write */
+
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->space_id, 0,
+		       write_offset / UNIV_PAGE_SIZE,
+		       write_offset % UNIV_PAGE_SIZE,
+		       OS_FILE_LOG_BLOCK_SIZE,
+		       buf, ((byte*)group + 1));
+
+		ut_ad(((ulint)group & 0x1UL) == 0);	/* the + 1 tag above
+							needs the lowest bit
+							of the group address
+							to be clear */
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Writes info to a buffer of a log group when log files are created in
+backup restoration: the log file header with the "ibbackup " label and a
+timestamp, and a first checkpoint at LOG_CHECKPOINT_1 with its two
+checksums. */
+UNIV_INTERN
+void
+log_reset_first_header_and_checkpoint(
+/*==================================*/
+	byte*		hdr_buf,/*!< in: buffer which will be written to the
+				start of the first log file */
+	ib_uint64_t	start)	/*!< in: lsn of the start of the first log file;
+				we pretend that there is a checkpoint at
+				start + LOG_BLOCK_HDR_SIZE */
+{
+	static const char	ibbackup_label[] = "ibbackup ";
+
+	char*			label_pos;
+	byte*			cp_buf;
+	ulint			checksum;
+	const ib_uint64_t	cp_lsn = start + LOG_BLOCK_HDR_SIZE;
+
+	mach_write_to_4(hdr_buf + LOG_GROUP_ID, 0);
+	mach_write_ull(hdr_buf + LOG_FILE_START_LSN, start);
+
+	/* Write the label of ibbackup --restore, followed by a timestamp
+	which overwrites the terminating NUL of the label */
+	label_pos = (char*) hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP;
+
+	strcpy(label_pos, ibbackup_label);
+	ut_sprintf_timestamp(label_pos + (sizeof ibbackup_label) - 1);
+
+	cp_buf = hdr_buf + LOG_CHECKPOINT_1;
+
+	mach_write_ull(cp_buf + LOG_CHECKPOINT_NO, 0);
+	mach_write_ull(cp_buf + LOG_CHECKPOINT_LSN, cp_lsn);
+
+	mach_write_to_4(cp_buf + LOG_CHECKPOINT_OFFSET,
+			LOG_FILE_HDR_SIZE + LOG_BLOCK_HDR_SIZE);
+
+	mach_write_to_4(cp_buf + LOG_CHECKPOINT_LOG_BUF_SIZE,
+			2 * 1024 * 1024);
+
+	mach_write_ull(cp_buf + LOG_CHECKPOINT_ARCHIVED_LSN,
+		       IB_ULONGLONG_MAX);
+
+	checksum = ut_fold_binary(cp_buf, LOG_CHECKPOINT_CHECKSUM_1);
+	mach_write_to_4(cp_buf + LOG_CHECKPOINT_CHECKSUM_1, checksum);
+
+	checksum = ut_fold_binary(cp_buf + LOG_CHECKPOINT_LSN,
+				  LOG_CHECKPOINT_CHECKSUM_2
+				  - LOG_CHECKPOINT_LSN);
+	mach_write_to_4(cp_buf + LOG_CHECKPOINT_CHECKSUM_2, checksum);
+
+	/* Starting from InnoDB-3.23.50, we should also write info on
+	allocated size in the tablespace, but unfortunately we do not
+	know it here */
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************//**
+Reads one checkpoint field of a log group header into
+log_sys->checkpoint_buf. The caller must own the log_sys mutex. */
+UNIV_INTERN
+void
+log_group_read_checkpoint_info(
+/*===========================*/
+	log_group_t*	group,	/*!< in: log group */
+	ulint		field)	/*!< in: LOG_CHECKPOINT_1 or LOG_CHECKPOINT_2 */
+{
+	const ulint	page_no		= field / UNIV_PAGE_SIZE;
+	const ulint	page_offset	= field % UNIV_PAGE_SIZE;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	log_sys->n_log_ios++;
+
+	/* One log block is read, starting at byte 'field' of the group */
+	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->space_id, 0,
+	       page_no, page_offset,
+	       OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
+}
+
+/******************************************************//**
+Writes the checkpoint info to every log group in log_sys->log_groups.
+The caller must own the log_sys mutex. */
+UNIV_INTERN
+void
+log_groups_write_checkpoint_info(void)
+/*==================================*/
+{
+	log_group_t*	cur;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	for (cur = UT_LIST_GET_FIRST(log_sys->log_groups);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(log_groups, cur)) {
+
+		log_group_checkpoint(cur);
+	}
+}
+
+/******************************************************//**
+Makes a checkpoint. Note that this function does not flush dirty
+blocks from the buffer pool: it only checks what is the lsn of the oldest
+modification in the pool, and writes information about that lsn in the
+log files. Use log_make_checkpoint_at to flush also the pool.
+
+Before the checkpoint is written, pending hashed log records are applied
+if recovery is on, the tablespace files are flushed unless the flush method
+is SRV_UNIX_NOSYNC, and the log is written up to the oldest modification
+lsn. The log_sys mutex is acquired and released inside this function.
+@return	TRUE if the checkpoint write was started (and, if sync, waited
+for) or if no write was needed; FALSE if a checkpoint write was already
+running */
+UNIV_INTERN
+ibool
+log_checkpoint(
+/*===========*/
+	ibool	sync,		/*!< in: TRUE if synchronous operation is
+				desired: the function then waits on
+				log_sys->checkpoint_lock before returning */
+	ibool	write_always)	/*!< in: the function normally checks if
+				the new checkpoint would have a greater
+				lsn than the previous one: if not, then no
+				physical write is done; by setting this
+				parameter TRUE, a physical write will always be
+				made to log files */
+{
+	ib_uint64_t	oldest_lsn;
+
+	if (recv_recovery_is_on()) {
+		recv_apply_hashed_log_recs(TRUE);
+	}
+
+	if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+		fil_flush_file_spaces(FIL_TABLESPACE);
+	}
+
+	mutex_enter(&(log_sys->mutex));
+
+	ut_ad(!recv_no_log_write);
+	oldest_lsn = log_buf_pool_get_oldest_modification();
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* Because log also contains headers and dummy log records,
+	if the buffer pool contains no dirty buffers, oldest_lsn
+	gets the value log_sys->lsn from the previous function,
+	and we must make sure that the log is flushed up to that
+	lsn. If there are dirty buffers in the buffer pool, then our
+	write-ahead-logging algorithm ensures that the log has been flushed
+	up to oldest_lsn. The log_sys mutex is not held during this call. */
+
+	log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (!write_always
+	    && log_sys->last_checkpoint_lsn >= oldest_lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		return(TRUE);	/* the previous checkpoint is recent enough */
+	}
+
+	ut_ad(log_sys->flushed_to_disk_lsn >= oldest_lsn);
+
+	if (log_sys->n_pending_checkpoint_writes > 0) {
+		/* A checkpoint write is running: do not start another one.
+		The caller gets FALSE and may retry. */
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (sync) {
+			/* Wait for the checkpoint write to complete: the
+			S-lock is requested and released at once.
+			NOTE(review): this presumably blocks because the
+			lock is X-held while a checkpoint write is pending;
+			that is not visible in this function, confirm. */
+			rw_lock_s_lock(&(log_sys->checkpoint_lock));
+			rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+		}
+
+		return(FALSE);
+	}
+
+	log_sys->next_checkpoint_lsn = oldest_lsn;
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr, "Making checkpoint no %lu at lsn %llu\n",
+			(ulong) log_sys->next_checkpoint_no,
+			oldest_lsn);
+	}
+#endif /* UNIV_DEBUG */
+
+	log_groups_write_checkpoint_info();
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (sync) {
+		/* Wait for the checkpoint write to complete, in the same
+		way as in the already-running case above */
+		rw_lock_s_lock(&(log_sys->checkpoint_lock));
+		rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+	}
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Makes a checkpoint at a given lsn or later. Both the preflush and the
+checkpoint are requested synchronously and are retried for as long as
+they report failure. */
+UNIV_INTERN
+void
+log_make_checkpoint_at(
+/*===================*/
+	ib_uint64_t	lsn,		/*!< in: make a checkpoint at this or a
+					later lsn, if IB_ULONGLONG_MAX, makes
+					a checkpoint at the latest lsn */
+	ibool		write_always)	/*!< in: the function normally checks if
+					the new checkpoint would have a
+					greater lsn than the previous one: if
+					not, then no physical write is done;
+					by setting this parameter TRUE, a
+					physical write will always be made to
+					log files */
+{
+	/* Preflush pages synchronously until the flush succeeds */
+	while (!log_preflush_pool_modified_pages(lsn, TRUE)) {
+		/* try again */
+	}
+
+	/* log_checkpoint() returns FALSE while another checkpoint write
+	is running: keep trying */
+	while (!log_checkpoint(TRUE, write_always)) {
+		/* try again */
+	}
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for a
+checkpoint. Depending on the age of the oldest modification and on the
+age of the last checkpoint, the function preflushes buffer pool pages
+and/or makes a checkpoint, synchronously when the respective age exceeds
+its sync limit. The function returns at once if
+log->check_flush_or_checkpoint is FALSE. NOTE: this function may only be
+called if the calling thread owns no synchronization objects! */
+static
+void
+log_checkpoint_margin(void)
+/*=======================*/
+{
+	log_t*		log		= log_sys;
+	ib_uint64_t	age;
+	ib_uint64_t	checkpoint_age;
+	ib_uint64_t	advance;
+	ib_uint64_t	oldest_lsn;
+	ibool		sync;
+	ibool		checkpoint_sync;
+	ibool		do_checkpoint;
+	ibool		success;
+loop:
+	sync = FALSE;
+	checkpoint_sync = FALSE;
+	do_checkpoint = FALSE;
+
+	mutex_enter(&(log->mutex));
+	ut_ad(!recv_no_log_write);
+
+	if (log->check_flush_or_checkpoint == FALSE) {
+		mutex_exit(&(log->mutex));
+
+		return;
+	}
+
+	oldest_lsn = log_buf_pool_get_oldest_modification();
+
+	age = log->lsn - oldest_lsn;
+
+	if (age > log->max_modified_age_sync) {
+
+		/* A flush is urgent: we have to do a synchronous preflush.
+		The oldest modification is asked to advance by twice the
+		amount by which the sync limit is exceeded. */
+
+		sync = TRUE;
+		advance = 2 * (age - log->max_modified_age_sync);
+	} else if (age > log_max_modified_age_async()) {
+
+		/* A flush is not urgent: we do an asynchronous preflush
+		which covers just the excess over the async limit */
+		advance = age - log_max_modified_age_async();
+	} else {
+		advance = 0;
+	}
+
+	checkpoint_age = log->lsn - log->last_checkpoint_lsn;
+
+	if (checkpoint_age > log->max_checkpoint_age) {
+		/* A checkpoint is urgent: we do it synchronously. The
+		check_flush_or_checkpoint flag is left set in this branch. */
+
+		checkpoint_sync = TRUE;
+
+		do_checkpoint = TRUE;
+
+	} else if (checkpoint_age > log_max_checkpoint_age_async()) {
+		/* A checkpoint is not urgent: do it asynchronously and
+		clear the flag */
+
+		do_checkpoint = TRUE;
+
+		log->check_flush_or_checkpoint = FALSE;
+	} else {
+		log->check_flush_or_checkpoint = FALSE;
+	}
+
+	mutex_exit(&(log->mutex));
+
+	if (advance) {
+		ib_uint64_t	new_oldest = oldest_lsn + advance;
+
+		success = log_preflush_pool_modified_pages(new_oldest, sync);
+
+		/* If the flush succeeded, this thread has done its part
+		and can proceed. If it did not succeed, there was another
+		thread doing a flush at the same time. If sync was FALSE,
+		the flush was not urgent, and we let this thread proceed.
+		Otherwise, we set the flag again, so that the check at the
+		top does not return early, and start from the beginning. */
+
+		if (sync && !success) {
+			mutex_enter(&(log->mutex));
+
+			log->check_flush_or_checkpoint = TRUE;
+
+			mutex_exit(&(log->mutex));
+			goto loop;
+		}
+	}
+
+	if (do_checkpoint) {
+		log_checkpoint(checkpoint_sync, FALSE);
+
+		if (checkpoint_sync) {
+
+			goto loop;	/* re-evaluate both ages */
+		}
+	}
+}
+
+/******************************************************//**
+Reads a specified log segment to a buffer. The read is split into
+pieces so that no single i/o request crosses a log file boundary of
+the group. The caller must own the log_sys mutex. */
+UNIV_INTERN
+void
+log_group_read_log_seg(
+/*===================*/
+	ulint		type,		/*!< in: LOG_ARCHIVE or LOG_RECOVER */
+	byte*		buf,		/*!< in: buffer where to read */
+	log_group_t*	group,		/*!< in: log group */
+	ib_uint64_t	start_lsn,	/*!< in: read area start */
+	ib_uint64_t	end_lsn)	/*!< in: read area end */
+{
+	const ibool	sync = (type == LOG_RECOVER);
+	ulint		offset;
+	ulint		offset_in_file;
+	ulint		chunk_len;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	do {
+		offset = log_group_calc_lsn_offset(start_lsn, group);
+
+		chunk_len = (ulint) (end_lsn - start_lsn);
+
+		ut_ad(chunk_len != 0);
+
+		offset_in_file = offset % group->file_size;
+
+		if (offset_in_file + chunk_len > group->file_size) {
+			/* Stop this piece at the end of the log file */
+
+			chunk_len = group->file_size - offset_in_file;
+		}
+
+#ifdef UNIV_LOG_ARCHIVE
+		if (type == LOG_ARCHIVE) {
+
+			log_sys->n_pending_archive_ios++;
+		}
+#endif /* UNIV_LOG_ARCHIVE */
+
+		log_sys->n_log_ios++;
+
+		fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
+		       offset / UNIV_PAGE_SIZE, offset % UNIV_PAGE_SIZE,
+		       chunk_len, buf, NULL);
+
+		start_lsn += chunk_len;
+		buf += chunk_len;
+
+	} while (start_lsn != end_lsn);
+}
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Generates an archived log file name: srv_arch_dir followed by
+"ib_arch_log_" and the file number zero-padded to at least 10 digits.
+The size of the output buffer is not checked here. */
+UNIV_INTERN
+void
+log_archived_file_name_gen(
+/*=======================*/
+	char*	buf,	/*!< in: buffer where to write */
+	ulint	id __attribute__((unused)),
+			/*!< in: group id; unused, because
+			currently we only archive the first group */
+	ulint	file_no)/*!< in: file number */
+{
+	const ulong	printable_no = (ulong) file_no;
+
+	sprintf(buf, "%sib_arch_log_%010lu", srv_arch_dir, printable_no);
+}
+
+/******************************************************//**
+Writes a log file header to a log file space. The header is built in
+the group's archive header buffer for the nth file and then written
+to the start of that file in the archive space. The caller must own the
+log_sys mutex. */
+static
+void
+log_group_archive_file_header_write(
+/*================================*/
+	log_group_t*	group,		/*!< in: log group */
+	ulint		nth_file,	/*!< in: header to the nth file in the
+					archive log file space */
+	ulint		file_no,	/*!< in: archived file number */
+	ib_uint64_t	start_lsn)	/*!< in: log file data starts at this
+					lsn */
+{
+	byte*	buf;
+	ulint	dest_offset;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	ut_a(nth_file < group->n_files);
+
+	buf = *(group->archive_file_header_bufs + nth_file);
+
+	mach_write_to_4(buf + LOG_GROUP_ID, group->id);
+	mach_write_ull(buf + LOG_FILE_START_LSN, start_lsn);
+	mach_write_to_4(buf + LOG_FILE_NO, file_no);
+
+	/* The file is marked completed later by
+	log_group_archive_completed_header_write() */
+	mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, FALSE);
+
+	dest_offset = nth_file * group->file_size;
+
+	log_sys->n_log_ios++;
+
+	/* The 4th argument is 0, as in every other fil_io() call on log
+	files in this file (presumably the compressed page size: logs are
+	not compressed); without it the call has too few arguments */
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id, 0,
+	       dest_offset / UNIV_PAGE_SIZE,
+	       dest_offset % UNIV_PAGE_SIZE,
+	       2 * OS_FILE_LOG_BLOCK_SIZE,
+	       buf, &log_archive_io);
+}
+
+/******************************************************//**
+Writes a log file header to a completed archived log file: sets the
+completed flag and the end lsn in the header buffer of the nth file and
+writes one log block starting at the LOG_FILE_ARCH_COMPLETED field. The
+caller must own the log_sys mutex. */
+static
+void
+log_group_archive_completed_header_write(
+/*=====================================*/
+	log_group_t*	group,		/*!< in: log group */
+	ulint		nth_file,	/*!< in: header to the nth file in the
+					archive log file space */
+	ib_uint64_t	end_lsn)	/*!< in: end lsn of the file */
+{
+	byte*	buf;
+	ulint	dest_offset;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	ut_a(nth_file < group->n_files);
+
+	buf = *(group->archive_file_header_bufs + nth_file);
+
+	mach_write_to_4(buf + LOG_FILE_ARCH_COMPLETED, TRUE);
+	mach_write_ull(buf + LOG_FILE_END_LSN, end_lsn);
+
+	dest_offset = nth_file * group->file_size + LOG_FILE_ARCH_COMPLETED;
+
+	log_sys->n_log_ios++;
+
+	/* The 4th argument is 0, as in every other fil_io() call on log
+	files in this file (presumably the compressed page size: logs are
+	not compressed); without it the call has too few arguments */
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE, group->archive_space_id, 0,
+	       dest_offset / UNIV_PAGE_SIZE,
+	       dest_offset % UNIV_PAGE_SIZE,
+	       OS_FILE_LOG_BLOCK_SIZE,
+	       buf + LOG_FILE_ARCH_COMPLETED,
+	       &log_archive_io);
+}
+
+/******************************************************//**
+Does the archive writes for a single log group: the contents of
+log_sys->archive_buf between log_sys->archived_lsn and
+log_sys->next_archived_lsn are written to the archive file space of the
+group, one piece per archive file. New archive files are created (or
+existing ones opened) and added to the archive space as needed. On
+return, group->next_archived_file_no and group->next_archived_offset
+describe the position after the written data. The process exits if an
+archive file can neither be created nor opened. The caller must own the
+log_sys mutex. */
+static
+void
+log_group_archive(
+/*==============*/
+	log_group_t*	group)	/*!< in: log group */
+{
+	os_file_t	file_handle;
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	end_lsn;
+	char		name[1024];
+	byte*		buf;
+	ulint		len;
+	ibool		ret;
+	ulint		next_offset;
+	ulint		n_files;
+	ulint		open_mode;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	start_lsn = log_sys->archived_lsn;
+
+	ut_a(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+	end_lsn = log_sys->next_archived_lsn;
+
+	ut_a(end_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+
+	buf = log_sys->archive_buf;
+
+	n_files = 0;
+
+	next_offset = group->archived_offset;
+loop:
+	if ((next_offset % group->file_size == 0)
+	    || (fil_space_get_size(group->archive_space_id) == 0)) {
+
+		/* Add the file to the archive file space; create or open the
+		file. A new file is created when we are at a file boundary;
+		otherwise the archive space is empty and we continue in a
+		partially written file, which is opened. */
+
+		if (next_offset % group->file_size == 0) {
+			open_mode = OS_FILE_CREATE;
+		} else {
+			open_mode = OS_FILE_OPEN;
+		}
+
+		log_archived_file_name_gen(name, group->id,
+					   group->archived_file_no + n_files);
+
+		file_handle = os_file_create(name, open_mode, OS_FILE_AIO,
+					     OS_DATA_FILE, &ret);
+
+		if (!ret && (open_mode == OS_FILE_CREATE)) {
+			/* The create failed: fall back to opening the
+			file */
+			file_handle = os_file_create(
+				name, OS_FILE_OPEN, OS_FILE_AIO,
+				OS_DATA_FILE, &ret);
+		}
+
+		if (!ret) {
+			fprintf(stderr,
+				"InnoDB: Cannot create or open"
+				" archive log file %s.\n"
+				"InnoDB: Cannot continue operation.\n"
+				"InnoDB: Check that the log archive"
+				" directory exists,\n"
+				"InnoDB: you have access rights to it, and\n"
+				"InnoDB: there is space available.\n", name);
+			exit(1);
+		}
+
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+			fprintf(stderr, "Created archive file %s\n", name);
+		}
+#endif /* UNIV_DEBUG */
+
+		ret = os_file_close(file_handle);
+
+		ut_a(ret);
+
+		/* Add the archive file as a node to the space */
+
+		fil_node_create(name, group->file_size / UNIV_PAGE_SIZE,
+				group->archive_space_id, FALSE);
+
+		if (next_offset % group->file_size == 0) {
+			log_group_archive_file_header_write(
+				group, n_files,
+				group->archived_file_no + n_files,
+				start_lsn);
+
+			next_offset += LOG_FILE_HDR_SIZE;
+		}
+	}
+
+	/* The explicit narrowing matches log_group_read_log_seg(); the
+	difference is ulint-sized there as well */
+	len = (ulint) (end_lsn - start_lsn);
+
+	if (group->file_size < (next_offset % group->file_size) + len) {
+		/* Write only up to the end of the current archive file */
+
+		len = group->file_size - (next_offset % group->file_size);
+	}
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"Archiving starting at lsn %llu, len %lu"
+			" to group %lu\n",
+			start_lsn,
+			(ulong) len, (ulong) group->id);
+	}
+#endif /* UNIV_DEBUG */
+
+	log_sys->n_pending_archive_ios++;
+
+	log_sys->n_log_ios++;
+
+	/* The 4th argument is 0, as in every other fil_io() call on log
+	files in this file (presumably the compressed page size: logs are
+	not compressed); without it the call has too few arguments */
+	fil_io(OS_FILE_WRITE | OS_FILE_LOG, FALSE, group->archive_space_id, 0,
+	       next_offset / UNIV_PAGE_SIZE, next_offset % UNIV_PAGE_SIZE,
+	       ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
+	       &log_archive_io);
+
+	start_lsn += len;
+	next_offset += len;
+	buf += len;
+
+	if (next_offset % group->file_size == 0) {
+		n_files++;
+	}
+
+	if (end_lsn != start_lsn) {
+
+		goto loop;
+	}
+
+	group->next_archived_file_no = group->archived_file_no + n_files;
+	group->next_archived_offset = next_offset % group->file_size;
+
+	ut_a(group->next_archived_offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+}
+
+/*****************************************************//**
+Does the archive writes. Only the first log group in
+log_sys->log_groups is archived. The caller must own the log_sys
+mutex. */
+static
+void
+log_archive_groups(void)
+/*====================*/
+{
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	log_group_archive(UT_LIST_GET_FIRST(log_sys->log_groups));
+}
+
+/*****************************************************//**
+Completes the archiving write phase for the first log group: makes the
+'next' archive position of the group the current one, writes the
+completed header to every archive file which is finished, and truncates
+those files from the start of the archive file space. The caller must
+own the log_sys mutex. */
+static
+void
+log_archive_write_complete_groups(void)
+/*===================================*/
+{
+	log_group_t*	first_group;
+	ulint		tail_offset;
+	ulint		files_in_space;
+	ulint		files_done;
+	ib_uint64_t	space_start_lsn;
+	ib_uint64_t	file_end_lsn;
+	ulint		nth;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	first_group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	first_group->archived_file_no = first_group->next_archived_file_no;
+	first_group->archived_offset = first_group->next_archived_offset;
+
+	/* All files but the last one are truncated from the archive file
+	space; if the last one has been written full, it goes as well */
+
+	files_in_space = (UNIV_PAGE_SIZE
+			  * fil_space_get_size(first_group->archive_space_id))
+		/ first_group->file_size;
+	ut_ad(files_in_space > 0);
+
+	tail_offset = first_group->archived_offset;
+
+	files_done = (tail_offset % first_group->file_size == 0)
+		? files_in_space
+		: files_in_space - 1;
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes && files_done) {
+		fprintf(stderr,
+			"Complete file(s) archived to group %lu\n",
+			(ulong) first_group->id);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Calculate the archive file space start lsn */
+	space_start_lsn = log_sys->next_archived_lsn
+		- (tail_offset - LOG_FILE_HDR_SIZE + files_done
+		   * (first_group->file_size - LOG_FILE_HDR_SIZE));
+
+	file_end_lsn = space_start_lsn;
+
+	for (nth = 0; nth < files_done; nth++) {
+
+		/* Each file holds file_size minus the header size bytes
+		of log */
+		file_end_lsn += first_group->file_size - LOG_FILE_HDR_SIZE;
+
+		/* Mark in the header of the archived log file that the
+		file write has been completed */
+
+		log_group_archive_completed_header_write(first_group, nth,
+							 file_end_lsn);
+	}
+
+	fil_space_truncate_start(first_group->archive_space_id,
+				 files_done * first_group->file_size);
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fputs("Archiving writes completed\n", stderr);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Advances the archiving state machine when no archive i/o is pending:
+after the read phase the archive writes are started, and after the write
+phase the archived lsn is advanced and the archive lock is released. The
+caller must own the log_sys mutex. */
+static
+void
+log_archive_check_completion_low(void)
+/*==================================*/
+{
+	log_t*	log	= log_sys;
+
+	ut_ad(mutex_own(&(log->mutex)));
+
+	if (log->n_pending_archive_ios == 0
+	    && log->archiving_phase == LOG_ARCHIVE_READ) {
+
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+			fputs("Archiving read completed\n", stderr);
+		}
+#endif /* UNIV_DEBUG */
+
+		/* The archive buffer is filled: move on to the write
+		phase */
+
+		log->archiving_phase = LOG_ARCHIVE_WRITE;
+
+		log_archive_groups();
+	}
+
+	/* Checked separately from the above, with the state as it is
+	after a possible start of the write phase */
+
+	if (log->n_pending_archive_ios == 0
+	    && log->archiving_phase == LOG_ARCHIVE_WRITE) {
+
+		log_archive_write_complete_groups();
+
+		log->archived_lsn = log->next_archived_lsn;
+
+		rw_lock_x_unlock_gen(&(log->archive_lock), LOG_ARCHIVE);
+	}
+}
+
+/******************************************************//**
+Completes an archiving i/o: flushes the archive file space of the first
+log group, decrements the count of pending archive i/os and checks if
+the current archiving phase is thereby completed. */
+static
+void
+log_io_complete_archive(void)
+/*=========================*/
+{
+	log_group_t*	first_group;
+
+	/* The log_sys mutex is held only for looking up the group: it is
+	released for the duration of the flush */
+	mutex_enter(&(log_sys->mutex));
+	first_group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	mutex_exit(&(log_sys->mutex));
+
+	fil_flush(first_group->archive_space_id);
+
+	mutex_enter(&(log_sys->mutex));
+
+	ut_ad(log_sys->n_pending_archive_ios > 0);
+	log_sys->n_pending_archive_ios -= 1;
+
+	log_archive_check_completion_low();
+
+	mutex_exit(&(log_sys->mutex));
+}
+
+/********************************************************************//**
+Starts an archiving operation: reads the log from log_sys->archived_lsn
+up to a limit lsn into log_sys->archive_buf. The limit is at most one
+archive buffer size ahead and never beyond log_sys->lsn aligned down to
+a log block boundary. If archiving is stopped or stopping (state
+LOG_ARCH_STOPPED or LOG_ARCH_STOPPING2), the function waits for the
+log_sys->archiving_on event and retries. If the log has not been written
+up to the limit lsn yet, it is written first and the function retries
+with the same limit.
+@return	TRUE if succeed (also when archiving is off or there is nothing
+to archive), FALSE if an archiving operation was already running */
+UNIV_INTERN
+ibool
+log_archive_do(
+/*===========*/
+	ibool	sync,	/*!< in: TRUE if synchronous operation is desired:
+			the function then waits on log_sys->archive_lock
+			before returning */
+	ulint*	n_bytes)/*!< out: archive log buffer size, 0 if nothing to
+			archive */
+{
+	ibool		calc_new_limit;
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	limit_lsn;
+
+	calc_new_limit = TRUE;
+loop:
+	mutex_enter(&(log_sys->mutex));
+
+	switch (log_sys->archiving_state) {
+	case LOG_ARCH_OFF:
+arch_none:
+		mutex_exit(&(log_sys->mutex));
+
+		*n_bytes = 0;
+
+		return(TRUE);
+	case LOG_ARCH_STOPPED:
+	case LOG_ARCH_STOPPING2:
+		mutex_exit(&(log_sys->mutex));
+
+		os_event_wait(log_sys->archiving_on);
+
+		goto loop;
+	}
+
+	start_lsn = log_sys->archived_lsn;
+
+	if (calc_new_limit) {
+		ut_a(log_sys->archive_buf_size % OS_FILE_LOG_BLOCK_SIZE == 0);
+		limit_lsn = start_lsn + log_sys->archive_buf_size;
+
+		*n_bytes = log_sys->archive_buf_size;
+
+		if (limit_lsn >= log_sys->lsn) {
+
+			limit_lsn = ut_uint64_align_down(
+				log_sys->lsn, OS_FILE_LOG_BLOCK_SIZE);
+		}
+	}
+
+	if (log_sys->archived_lsn >= limit_lsn) {
+
+		goto arch_none;	/* the mutex is released at the label */
+	}
+
+	if (log_sys->written_to_all_lsn < limit_lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
+
+		calc_new_limit = FALSE;	/* keep limit_lsn on the retry */
+
+		goto loop;
+	}
+
+	if (log_sys->n_pending_archive_ios > 0) {
+		/* An archiving operation is running: do not start another
+		one. If sync, wait by requesting and at once releasing an
+		S-lock on the archive lock, which is X-locked below when
+		an operation is started. */
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (sync) {
+			rw_lock_s_lock(&(log_sys->archive_lock));
+			rw_lock_s_unlock(&(log_sys->archive_lock));
+		}
+
+		*n_bytes = log_sys->archive_buf_size;
+
+		return(FALSE);
+	}
+
+	rw_lock_x_lock_gen(&(log_sys->archive_lock), LOG_ARCHIVE);
+
+	log_sys->archiving_phase = LOG_ARCHIVE_READ;
+
+	log_sys->next_archived_lsn = limit_lsn;
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"Archiving from lsn %llu to lsn %llu\n",
+			log_sys->archived_lsn, limit_lsn);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Read the log segment to the archive buffer; only the first
+	log group is read */
+
+	log_group_read_log_seg(LOG_ARCHIVE, log_sys->archive_buf,
+			       UT_LIST_GET_FIRST(log_sys->log_groups),
+			       start_lsn, limit_lsn);
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (sync) {
+		rw_lock_s_lock(&(log_sys->archive_lock));
+		rw_lock_s_unlock(&(log_sys->archive_lock));
+	}
+
+	*n_bytes = log_sys->archive_buf_size;
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Writes the log contents to the archive at least up to the lsn when this
+function was called. Does nothing if archiving is off. */
+static
+void
+log_archive_all(void)
+/*=================*/
+{
+	ib_uint64_t	target_lsn;
+	ibool		archiving_off;
+	ibool		caught_up;
+	ulint		unused_n_bytes;
+
+	/* Take a snapshot of the state under the log_sys mutex */
+	mutex_enter(&(log_sys->mutex));
+
+	archiving_off = (log_sys->archiving_state == LOG_ARCH_OFF);
+	target_lsn = log_sys->lsn;
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (archiving_off) {
+
+		return;
+	}
+
+	log_pad_current_log_block();
+
+	/* Run synchronous archiving operations until the archived lsn
+	reaches the snapshot */
+	for (;;) {
+		mutex_enter(&(log_sys->mutex));
+
+		caught_up = (target_lsn <= log_sys->archived_lsn);
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (caught_up) {
+
+			return;
+		}
+
+		log_archive_do(TRUE, &unused_n_bytes);
+	}
+}
+
+/*****************************************************//**
+Closes the possible open archive log file of the first log group: writes
+the completed header to it and truncates it from the archive file space.
+If the file was open and increment_file_count is TRUE, the archived file
+number of the group is incremented by 2 and the archived offset is reset
+to 0. Does nothing if archiving is off. The caller must own the log_sys
+mutex. */
+static
+void
+log_archive_close_groups(
+/*=====================*/
+	ibool	increment_file_count)	/*!< in: TRUE if we want to increment
+					the file count */
+{
+	log_group_t*	group;
+	ulint		trunc_len;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	if (log_sys->archiving_state == LOG_ARCH_OFF) {
+
+		return;
+	}
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	trunc_len = UNIV_PAGE_SIZE
+		* fil_space_get_size(group->archive_space_id);
+	if (trunc_len > 0) {
+		/* At most one archive file may be open at this point */
+		ut_a(trunc_len == group->file_size);
+
+		/* Write a notice to the headers of archived log
+		files that the file write has been completed */
+
+		log_group_archive_completed_header_write(
+			group, 0, log_sys->archived_lsn);
+
+		fil_space_truncate_start(group->archive_space_id,
+					 trunc_len);
+		if (increment_file_count) {
+			group->archived_offset = 0;
+			group->archived_file_no += 2;
+
+#ifdef UNIV_DEBUG
+			/* Report the file number only when it really was
+			incremented, and report the value it has now: the
+			increment has already been applied above */
+			if (log_debug_writes) {
+				fprintf(stderr,
+					"Incrementing arch file no to %lu"
+					" in log group %lu\n",
+					(ulong) group->archived_file_no,
+					(ulong) group->id);
+			}
+#endif /* UNIV_DEBUG */
+		}
+	}
+}
+
+/****************************************************************//**
+Writes the log contents to the archive up to the lsn when this function was
+called, and stops the archiving. When archiving is started again, the archived
+log file numbers start from 2 higher, so that the archiving will not write
+again to the archived log files which exist when this function returns.
+@return	DB_SUCCESS, or DB_ERROR if the archiving state was not LOG_ARCH_ON */
+UNIV_INTERN
+ulint
+log_archive_stop(void)
+/*==================*/
+{
+	ibool	was_on;
+
+	/* Phase 1: LOG_ARCH_ON -> LOG_ARCH_STOPPING */
+	mutex_enter(&(log_sys->mutex));
+
+	was_on = (log_sys->archiving_state == LOG_ARCH_ON);
+
+	if (was_on) {
+		log_sys->archiving_state = LOG_ARCH_STOPPING;
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (!was_on) {
+
+		return(DB_ERROR);
+	}
+
+	log_archive_all();
+
+	/* Phase 2: LOG_ARCH_STOPPING2; the archiving_on event is reset */
+	mutex_enter(&(log_sys->mutex));
+
+	log_sys->archiving_state = LOG_ARCH_STOPPING2;
+	os_event_reset(log_sys->archiving_on);
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* A possible archiving operation must end before we go on */
+
+	rw_lock_s_lock(&(log_sys->archive_lock));
+	rw_lock_s_unlock(&(log_sys->archive_lock));
+
+	/* Close all archived log files, incrementing the file count by 2,
+	if appropriate */
+
+	mutex_enter(&(log_sys->mutex));
+	log_archive_close_groups(TRUE);
+	mutex_exit(&(log_sys->mutex));
+
+	/* Make a checkpoint, so that if recovery is needed, the file numbers
+	of new archived log files will start from the right value; retried
+	until log_checkpoint() returns TRUE */
+
+	while (!log_checkpoint(TRUE, TRUE)) {
+		/* try again */
+	}
+
+	/* Phase 3: LOG_ARCH_STOPPED */
+	mutex_enter(&(log_sys->mutex));
+	log_sys->archiving_state = LOG_ARCH_STOPPED;
+	mutex_exit(&(log_sys->mutex));
+
+	return(DB_SUCCESS);
+}
+
+/****************************************************************//**
+Starts again archiving which has been stopped: sets the state to
+LOG_ARCH_ON and sets the archiving_on event.
+@return	DB_SUCCESS, or DB_ERROR if the archiving state was not
+LOG_ARCH_STOPPED */
+UNIV_INTERN
+ulint
+log_archive_start(void)
+/*===================*/
+{
+	ulint	err;
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (log_sys->archiving_state == LOG_ARCH_STOPPED) {
+
+		log_sys->archiving_state = LOG_ARCH_ON;
+
+		os_event_set(log_sys->archiving_on);
+
+		err = DB_SUCCESS;
+	} else {
+		err = DB_ERROR;
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	return(err);
+}
+
+/****************************************************************//**
+Stop archiving the log so that a gap may occur in the archived log files.
+As long as the archiving state is neither LOG_ARCH_STOPPED nor
+LOG_ARCH_OFF, log_archive_stop() is called and the state is checked
+again after a sleep.
+@return	DB_SUCCESS (no other value is returned by this function) */
+UNIV_INTERN
+ulint
+log_archive_noarchivelog(void)
+/*==========================*/
+{
+	for (;;) {
+		mutex_enter(&(log_sys->mutex));
+
+		if (log_sys->archiving_state == LOG_ARCH_STOPPED
+		    || log_sys->archiving_state == LOG_ARCH_OFF) {
+
+			/* Switch archiving off and set the archiving_on
+			event */
+
+			log_sys->archiving_state = LOG_ARCH_OFF;
+
+			os_event_set(log_sys->archiving_on);
+
+			mutex_exit(&(log_sys->mutex));
+
+			return(DB_SUCCESS);
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		/* The return value is ignored: the state is checked again
+		on the next round */
+		log_archive_stop();
+
+		os_thread_sleep(500000);
+	}
+}
+
+/****************************************************************//**
+Start archiving the log so that a gap may occur in the archived log files:
+archiving starts from the current lsn aligned down to a log block
+boundary.
+@return	DB_SUCCESS, or DB_ERROR if the archiving state was not
+LOG_ARCH_OFF */
+UNIV_INTERN
+ulint
+log_archive_archivelog(void)
+/*========================*/
+{
+	ulint	err;
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (log_sys->archiving_state != LOG_ARCH_OFF) {
+
+		err = DB_ERROR;
+	} else {
+		log_sys->archiving_state = LOG_ARCH_ON;
+
+		log_sys->archived_lsn
+			= ut_uint64_align_down(log_sys->lsn,
+					       OS_FILE_LOG_BLOCK_SIZE);
+		err = DB_SUCCESS;
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	return(err);
+}
+
+/****************************************************************//**
+Tries to establish a big enough margin of free space in the log groups, such
+that a new log entry can be catenated without an immediate need for
+archiving. Archiving is done synchronously, and the check is repeated,
+if the unarchived log is longer than log->max_archived_lsn_age;
+asynchronously, if it is longer than log->max_archived_lsn_age_async.
+Does nothing if archiving is off. */
+static
+void
+log_archive_margin(void)
+/*====================*/
+{
+	log_t*		log		= log_sys;
+	ib_uint64_t	age;	/* a difference of two 64-bit lsns: a ulint
+				would truncate it where ulint is 32 bits;
+				log_checkpoint_margin() uses the same type
+				for its ages */
+	ibool		sync;
+	ulint		dummy;
+loop:
+	mutex_enter(&(log->mutex));
+
+	if (log->archiving_state == LOG_ARCH_OFF) {
+		mutex_exit(&(log->mutex));
+
+		return;
+	}
+
+	age = log->lsn - log->archived_lsn;
+
+	if (age > log->max_archived_lsn_age) {
+
+		/* An archiving is urgent: we have to do synchronous i/o */
+
+		sync = TRUE;
+
+	} else if (age > log->max_archived_lsn_age_async) {
+
+		/* An archiving is not urgent: we do asynchronous i/o */
+
+		sync = FALSE;
+	} else {
+		/* No archiving required yet */
+
+		mutex_exit(&(log->mutex));
+
+		return;
+	}
+
+	mutex_exit(&(log->mutex));
+
+	log_archive_do(sync, &dummy);
+
+	if (sync == TRUE) {
+		/* Check again that enough was written to the archive */
+
+		goto loop;
+	}
+}
+#endif /* UNIV_LOG_ARCHIVE */
+
+/********************************************************************//**
+Checks that there is enough free space in the log to start a new query step.
+Flushes the log buffer or makes a new checkpoint if necessary. The margin
+checks are repeated for as long as log_sys->check_flush_or_checkpoint is
+set after them. NOTE: this function may only be called if the calling
+thread owns no synchronization objects! */
+UNIV_INTERN
+void
+log_check_margins(void)
+/*===================*/
+{
+	ibool	check_again;
+
+	do {
+		log_flush_margin();
+
+		log_checkpoint_margin();
+
+#ifdef UNIV_LOG_ARCHIVE
+		log_archive_margin();
+#endif /* UNIV_LOG_ARCHIVE */
+
+		/* The flag is read under the log_sys mutex */
+		mutex_enter(&(log_sys->mutex));
+		ut_ad(!recv_no_log_write);
+
+		check_again = log_sys->check_flush_or_checkpoint;
+
+		mutex_exit(&(log_sys->mutex));
+
+	} while (check_again);
+}
+
+/****************************************************************//**
+Makes a checkpoint at the latest lsn and writes it to first page of each
+data file in the database, so that we know that the file spaces contain
+all modifications up to that lsn. This can only be called at database
+shutdown. This function also writes all log in log files to the log archive.
+
+The function polls, sleeping 100000 microseconds between the rounds, until
+the server is idle: no monitor threads (unless srv_fast_shutdown is 2),
+no transactions, no active master or purge threads, no pending log or
+buffer pool i/o. If srv_fast_shutdown is 2, only the log buffer is flushed
+to disk and the rest is skipped. */
+UNIV_INTERN
+void
+logs_empty_and_mark_files_at_shutdown(void)
+/*=======================================*/
+{
+	ib_uint64_t	lsn;
+	ulint		arch_log_no;
+
+	if (srv_print_verbose_log) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Starting shutdown...\n");
+	}
+	/* Wait until the master thread and all other operations are idle: our
+	algorithm only works if the server is idle at shutdown */
+
+	srv_shutdown_state = SRV_SHUTDOWN_CLEANUP;
+	os_event_set(srv_purge_thread_event);
+loop:
+	os_thread_sleep(100000);
+
+	mutex_enter(&kernel_mutex);
+
+	/* We need the monitor threads to stop before we proceed with a
+	normal shutdown. In case of very fast shutdown, however, we can
+	proceed without waiting for monitor threads. */
+
+	if (srv_fast_shutdown < 2
+	   && (srv_error_monitor_active
+	      || srv_lock_timeout_active || srv_monitor_active)) {
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+
+	/* Check that there are no longer transactions. We need this wait even
+	for the 'very fast' shutdown, because the InnoDB layer may have
+	committed or prepared transactions and we don't want to lose them. */
+
+	if (trx_n_mysql_transactions > 0
+	    || UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+
+	if (srv_fast_shutdown == 2) {
+		/* In this fastest shutdown we do not flush the buffer pool:
+		it is essentially a 'crash' of the InnoDB server. Make sure
+		that the log is all flushed to disk, so that we can recover
+		all committed transactions in a crash recovery. We must not
+		write the lsn stamps to the data files, since at a startup
+		InnoDB deduces from the stamps if the previous shutdown was
+		clean. */
+
+		log_buffer_flush_to_disk();
+
+		/* The kernel mutex was acquired at the top of this round
+		and every other exit path releases it: do not return while
+		still holding it */
+		mutex_exit(&kernel_mutex);
+
+		return; /* We SKIP ALL THE REST !! */
+	}
+
+	/* Check that the master thread is suspended */
+
+	if (srv_n_threads_active[SRV_MASTER] != 0) {
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+
+	/* Check that the purge threads ended */
+	if (srv_use_purge_thread
+	    && (srv_n_threads_active[SRV_PURGE] != 0
+		|| srv_n_threads_active[SRV_PURGE_WORKER] != 0)) {
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	mutex_enter(&(log_sys->mutex));
+
+	if (log_sys->n_pending_checkpoint_writes
+#ifdef UNIV_LOG_ARCHIVE
+	    || log_sys->n_pending_archive_ios
+#endif /* UNIV_LOG_ARCHIVE */
+	    || log_sys->n_pending_writes) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		goto loop;
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	if (!buf_pool_check_no_pending_io()) {
+
+		goto loop;
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	log_archive_all();
+#endif /* UNIV_LOG_ARCHIVE */
+
+	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+	mutex_enter(&(log_sys->mutex));
+
+	lsn = log_sys->lsn;
+
+	/* If the lsn has moved past the checkpoint (or past the archived
+	lsn), something was still written to the log: start over */
+
+	if (lsn != log_sys->last_checkpoint_lsn
+#ifdef UNIV_LOG_ARCHIVE
+	    || (srv_log_archive_on
+		&& lsn != log_sys->archived_lsn + LOG_BLOCK_HDR_SIZE)
+#endif /* UNIV_LOG_ARCHIVE */
+	    ) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		goto loop;
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* The number of the latest archived log file of the first group.
+	This value must really be assigned: a bare expression statement
+	here would leave arch_log_no at 0 and let the decrement below
+	wrap around. */
+	arch_log_no
+		= UT_LIST_GET_FIRST(log_sys->log_groups)->archived_file_no;
+
+	if (0 == UT_LIST_GET_FIRST(log_sys->log_groups)->archived_offset) {
+
+		/* Nothing has been written to the current file yet */
+		arch_log_no--;
+	}
+
+	log_archive_close_groups(TRUE);
+#else /* UNIV_LOG_ARCHIVE */
+	arch_log_no = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	mutex_exit(&(log_sys->mutex));
+
+	mutex_enter(&kernel_mutex);
+	/* Check that the master thread has stayed suspended */
+	if (srv_n_threads_active[SRV_MASTER] != 0) {
+		fprintf(stderr,
+			"InnoDB: Warning: the master thread woke up"
+			" during shutdown\n");
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+	mutex_exit(&kernel_mutex);
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+	fil_flush_file_spaces(FIL_LOG);
+
+	/* The call fil_write_flushed_lsn_to_data_files() will pass the buffer
+	pool: therefore it is essential that the buffer pool has been
+	completely flushed to disk! (We do not call fil_write... if the
+	'very fast' shutdown is enabled.) */
+
+	if (!buf_all_freed()) {
+
+		goto loop;
+	}
+
+	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
+
+	/* Make some checks that the server really is quiet */
+	ut_a(srv_n_threads_active[SRV_MASTER] == 0);
+	ut_a(buf_all_freed());
+	ut_a(lsn == log_sys->lsn);
+
+	if (lsn < srv_start_lsn) {
+		fprintf(stderr,
+			"InnoDB: Error: log sequence number"
+			" at shutdown %llu\n"
+			"InnoDB: is lower than at startup %llu!\n",
+			lsn, srv_start_lsn);
+	}
+
+	srv_shutdown_lsn = lsn;
+
+	fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+	fil_close_all_files();
+
+	/* Make some checks that the server really is quiet */
+	ut_a(srv_n_threads_active[SRV_MASTER] == 0);
+	ut_a(buf_all_freed());
+	ut_a(lsn == log_sys->lsn);
+}
+
+#ifdef UNIV_LOG_DEBUG
+/******************************************************//**
+Checks by parsing that the catenated log segment for a single mtr is
+consistent. The caller must own log_sys->mutex. @return always TRUE */
+UNIV_INTERN
+ibool
+log_check_log_recs(
+/*===============*/
+	const byte*	buf,		/*!< in: pointer to the start of
+					the log segment in the
+					log_sys->buf log buffer */
+	ulint		len,		/*!< in: segment length in bytes */
+	ib_uint64_t	buf_start_lsn)	/*!< in: buffer start lsn */
+{
+	ib_uint64_t	contiguous_lsn;
+	ib_uint64_t	scanned_lsn;
+	const byte*	start;
+	const byte*	end;
+	byte*		buf1;
+	byte*		scan_buf;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+
+	if (len == 0) {
+
+		return(TRUE);
+	}
+
+	start = ut_align_down(buf, OS_FILE_LOG_BLOCK_SIZE);
+	end = ut_align(buf + len, OS_FILE_LOG_BLOCK_SIZE);
+
+	buf1 = mem_alloc((end - start) + OS_FILE_LOG_BLOCK_SIZE);
+	scan_buf = ut_align(buf1, OS_FILE_LOG_BLOCK_SIZE);
+
+	ut_memcpy(scan_buf, start, end - start);
+
+	recv_scan_log_recs((buf_pool->curr_size
+			    - recv_n_pool_free_frames) * UNIV_PAGE_SIZE,
+			   FALSE, scan_buf, end - start,
+			   ut_uint64_align_down(buf_start_lsn,
+						OS_FILE_LOG_BLOCK_SIZE),
+			   &contiguous_lsn, &scanned_lsn);
+
+	ut_a(scanned_lsn == buf_start_lsn + len);
+	ut_a(recv_sys->recovered_lsn == scanned_lsn);
+
+	mem_free(buf1);
+
+	return(TRUE);
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/******************************************************//**
+Peeks the current lsn; log_sys->mutex is only try-locked, never waited for.
+@return	TRUE if success, FALSE if could not get the log system mutex */
+UNIV_INTERN
+ibool
+log_peek_lsn(
+/*=========*/
+	ib_uint64_t*	lsn)	/*!< out: current lsn; set only if TRUE returned */
+{
+	if (0 == mutex_enter_nowait(&(log_sys->mutex))) {
+		*lsn = log_sys->lsn;
+
+		mutex_exit(&(log_sys->mutex));
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/******************************************************//**
+Prints info of the log and resets the baseline used for the i/o rate. */
+UNIV_INTERN
+void
+log_print(
+/*======*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	double	time_elapsed;
+	time_t	current_time;
+
+	mutex_enter(&(log_sys->mutex));
+
+	fprintf(file,
+		"Log sequence number %llu\n"
+		"Log flushed up to   %llu\n"
+		"Last checkpoint at  %llu\n",
+		log_sys->lsn,
+		log_sys->flushed_to_disk_lsn,
+		log_sys->last_checkpoint_lsn);
+
+	fprintf(file,
+		"Max checkpoint age    %lu\n"
+		"Checkpoint age target %lu\n"
+		"Modified age          %lu\n"
+		"Checkpoint age        %lu\n",
+			(ulong) log_sys->max_checkpoint_age,
+			(ulong) log_max_checkpoint_age_async(),
+			(ulong) (log_sys->lsn -
+					log_buf_pool_get_oldest_modification()),
+			(ulong) (log_sys->lsn - log_sys->last_checkpoint_lsn));
+
+	current_time = time(NULL);
+
+	time_elapsed = 0.001 + difftime(current_time,
+					log_sys->last_printout_time);
+	fprintf(file,
+		"%lu pending log writes, %lu pending chkp writes\n"
+		"%lu log i/o's done, %.2f log i/o's/second\n",
+		(ulong) log_sys->n_pending_writes,
+		(ulong) log_sys->n_pending_checkpoint_writes,
+		(ulong) log_sys->n_log_ios,
+		((log_sys->n_log_ios - log_sys->n_log_ios_old)
+		 / time_elapsed));
+
+	log_sys->n_log_ios_old = log_sys->n_log_ios;
+	log_sys->last_printout_time = current_time;
+
+	mutex_exit(&(log_sys->mutex));
+}
+
+/**********************************************************************//**
+Resets the baseline (i/o count and timestamp) of the per-second averages. */
+UNIV_INTERN
+void
+log_refresh_stats(void)
+/*===================*/
+{
+	log_sys->n_log_ios_old = log_sys->n_log_ios;
+	log_sys->last_printout_time = time(NULL);
+}
+
+/**********************************************************************
+Frees the buffers owned by a log group and then the group struct itself. */
+static
+void
+log_group_close(
+/*===========*/
+	log_group_t*	group)		/*!< in,own: log group to close */
+{
+	ulint	i;
+
+	for (i = 0; i < group->n_files; i++) {
+		mem_free(group->file_header_bufs_ptr[i]);
+#ifdef UNIV_LOG_ARCHIVE
+		mem_free(group->archive_file_header_bufs_ptr[i]);
+#endif /* UNIV_LOG_ARCHIVE */
+	}
+
+	mem_free(group->file_header_bufs_ptr);
+	mem_free(group->file_header_bufs);
+
+#ifdef UNIV_LOG_ARCHIVE
+	mem_free(group->archive_file_header_bufs_ptr);
+	mem_free(group->archive_file_header_bufs);
+#endif /* UNIV_LOG_ARCHIVE */
+
+	mem_free(group->checkpoint_buf_ptr);
+
+	mem_free(group);
+}
+
+/**********************************************************
+Shuts down the log system; the log_sys struct is left for log_mem_free(). */
+UNIV_INTERN
+void
+log_shutdown(void)
+/*==============*/
+{
+	log_group_t*	group;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	while (UT_LIST_GET_LEN(log_sys->log_groups) > 0) {
+		log_group_t*	prev_group = group;
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+		UT_LIST_REMOVE(log_groups, log_sys->log_groups, prev_group);
+
+		log_group_close(prev_group);
+	}
+
+	mem_free(log_sys->buf_ptr);
+	log_sys->buf_ptr = NULL;
+	log_sys->buf = NULL;
+	mem_free(log_sys->checkpoint_buf_ptr);
+	log_sys->checkpoint_buf_ptr = NULL;
+	log_sys->checkpoint_buf = NULL;
+
+	os_event_free(log_sys->no_flush_event);
+	os_event_free(log_sys->one_flushed_event);
+
+	rw_lock_free(&log_sys->checkpoint_lock);
+
+	mutex_free(&log_sys->mutex);
+
+#ifdef UNIV_LOG_ARCHIVE
+	rw_lock_free(&log_sys->archive_lock);
+	os_event_free(log_sys->archiving_on);
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifdef UNIV_LOG_DEBUG
+	recv_sys_debug_free();
+#endif
+
+	recv_sys_close();
+}
+
+/**********************************************************
+Frees the recovery system memory and the log_sys struct; no-op if NULL. */
+UNIV_INTERN
+void
+log_mem_free(void)
+/*==============*/
+{
+	if (log_sys != NULL) {
+		recv_sys_mem_free();
+		mem_free(log_sys);
+
+		log_sys = NULL;
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/log/log0recv.c b/storage/xtradb/log/log0recv.c
new file mode 100644
index 00000000000..200b3b088a7
--- /dev/null
+++ b/storage/xtradb/log/log0recv.c
@@ -0,0 +1,3955 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file log/log0recv.c
+Recovery
+
+Created 9/20/1997 Heikki Tuuri
+*******************************************************/
+
+#include "log0recv.h"
+
+#ifdef UNIV_NONINL
+#include "log0recv.ic"
+#endif
+
+#include "mem0mem.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "page0cur.h"
+#include "page0zip.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "ibuf0ibuf.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "fil0fil.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0rea.h"
+# include "srv0srv.h"
+# include "srv0start.h"
+# include "trx0roll.h"
+# include "row0merge.h"
+# include "sync0sync.h"
+#else /* !UNIV_HOTBACKUP */
+
+/** This is set to FALSE if the backup was originally taken with the
+ibbackup --include regexp option: then we do not want to create tables in
+directories which were not included */
+UNIV_INTERN ibool	recv_replay_file_ops	= TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Log records are stored in the hash table in chunks at most of this size;
+this must be less than UNIV_PAGE_SIZE as it is stored in the buffer pool */
+#define RECV_DATA_BLOCK_SIZE	(MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t))
+
+/** Read-ahead area in applying log records to file pages */
+#define RECV_READ_AHEAD_AREA	32
+
+/** The recovery system */
+UNIV_INTERN recv_sys_t*	recv_sys = NULL;
+/** TRUE when applying redo log records during crash recovery; FALSE
+otherwise.  Note that this is FALSE while a background thread is
+rolling back incomplete transactions. */
+UNIV_INTERN ibool	recv_recovery_on;
+#ifdef UNIV_LOG_ARCHIVE
+/** TRUE when applying redo log records from an archived log file */
+UNIV_INTERN ibool	recv_recovery_from_backup_on;
+#endif /* UNIV_LOG_ARCHIVE */
+
+#ifndef UNIV_HOTBACKUP
+/** TRUE when recv_init_crash_recovery() has been called. */
+UNIV_INTERN ibool	recv_needed_recovery;
+# ifdef UNIV_DEBUG
+/** TRUE if writing to the redo log (mtr_commit) is forbidden.
+Protected by log_sys->mutex. */
+UNIV_INTERN ibool	recv_no_log_write = FALSE;
+# endif /* UNIV_DEBUG */
+
+/** TRUE if buf_page_is_corrupted() should check if the log sequence
+number (FIL_PAGE_LSN) is in the future.  Initially FALSE, and set by
+recv_recovery_from_checkpoint_start_func(). */
+UNIV_INTERN ibool	recv_lsn_checks_on;
+
+/** There are two conditions under which we scan the logs, the first
+is normal startup and the second is when we do a recovery from an
+archive.
+This flag is set if we are doing a scan from the last checkpoint during
+startup. If we find log entries that were written after the last checkpoint
+we know that the server was not cleanly shutdown. We must then initialize
+the crash recovery environment before attempting to store these entries in
+the log hash table. */
+static ibool		recv_log_scan_is_startup_type;
+
+/** If the following is TRUE, the buffer pool file pages must be invalidated
+after recovery and no ibuf operations are allowed; this becomes TRUE if
+the log record hash table becomes too full, and log records must be merged
+to file pages already before the recovery is finished: in this case no
+ibuf operations are allowed, as they could modify the pages read in the
+buffer pool before the pages have been recovered to the up-to-date state.
+
+TRUE means that recovery is running and no operations on the log files
+are allowed yet: the variable name is misleading. */
+UNIV_INTERN ibool	recv_no_ibuf_operations;
+/** TRUE when the redo log is being backed up */
+# define recv_is_making_a_backup		FALSE
+/** TRUE when recovering from a backed up redo log file */
+# define recv_is_from_backup			FALSE
+#else /* !UNIV_HOTBACKUP */
+# define recv_needed_recovery			FALSE
+/** TRUE when the redo log is being backed up */
+UNIV_INTERN ibool	recv_is_making_a_backup	= FALSE;
+/** TRUE when recovering from a backed up redo log file */
+UNIV_INTERN ibool	recv_is_from_backup	= FALSE;
+# define buf_pool_get_curr_size() (5 * 1024 * 1024)
+#endif /* !UNIV_HOTBACKUP */
+/** The following counter is used to decide when to print info on
+log scan */
+static ulint	recv_scan_print_counter;
+
+/** The type of the previous parsed redo log record */
+static ulint	recv_previous_parsed_rec_type;
+/** The offset of the previous parsed redo log record */
+static ulint	recv_previous_parsed_rec_offset;
+/** The 'multi' flag of the previous parsed redo log record */
+static ulint	recv_previous_parsed_rec_is_multi;
+
+/** Maximum page number encountered in the redo log */
+UNIV_INTERN ulint	recv_max_parsed_page_no;
+
+/** This many frames must be left free in the buffer pool when we scan
+the log and store the scanned log records in the buffer pool: we will
+use these free frames to read in pages when we start applying the
+log records to the database.
+recv_sys_var_init() sets the default of 256; recv_sys_init() raises it to
+512 for a buffer pool of >= 10 MB and to 1024 for one of >= 32 MB. */
+UNIV_INTERN ulint	recv_n_pool_free_frames;
+
+/** The maximum lsn we see for a page during the recovery process. If this
+is bigger than the lsn we are able to scan up to, that is an indication that
+the recovery failed and the database may be corrupt. */
+UNIV_INTERN ib_uint64_t	recv_max_page_lsn;
+
+/* prototypes */
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Initialize crash recovery environment. Can be called iff
+recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void);
+/*===========================*/
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Creates the zeroed recovery system struct and its mutex, unless it exists. */
+UNIV_INTERN
+void
+recv_sys_create(void)
+/*=================*/
+{
+	if (recv_sys != NULL) {
+
+		return;
+	}
+
+	recv_sys = mem_alloc(sizeof(*recv_sys));
+	memset(recv_sys, 0x0, sizeof(*recv_sys));
+
+	mutex_create(&recv_sys->mutex, SYNC_RECV);
+
+	recv_sys->heap = NULL;
+	recv_sys->addr_hash = NULL;
+
+	recv_sys->stats_recv_start_time = time(NULL);
+	recv_sys->stats_oldest_modified_lsn = IB_ULONGLONG_MAX;
+}
+
+/********************************************************//**
+Frees the recovery system: hash table, heap, buffers, mutex and the struct. */
+UNIV_INTERN
+void
+recv_sys_close(void)
+/*================*/
+{
+	if (recv_sys != NULL) {
+		if (recv_sys->addr_hash != NULL) {
+			hash_table_free(recv_sys->addr_hash);
+		}
+
+		if (recv_sys->heap != NULL) {
+			mem_heap_free(recv_sys->heap);
+		}
+
+		if (recv_sys->buf != NULL) {
+			ut_free(recv_sys->buf);
+		}
+
+		if (recv_sys->last_block_buf_start != NULL) {
+			mem_free(recv_sys->last_block_buf_start);
+		}
+
+		mutex_free(&recv_sys->mutex);
+
+		mem_free(recv_sys);
+		recv_sys = NULL;
+	}
+}
+
+/********************************************************//**
+Frees the recovery system memory; the recv_sys mutex is not freed here. */
+UNIV_INTERN
+void
+recv_sys_mem_free(void)
+/*===================*/
+{
+	if (recv_sys != NULL) {
+		if (recv_sys->addr_hash != NULL) {
+			hash_table_free(recv_sys->addr_hash);
+		}
+
+		if (recv_sys->heap != NULL) {
+			mem_heap_free(recv_sys->heap);
+		}
+
+		if (recv_sys->buf != NULL) {
+			ut_free(recv_sys->buf);
+		}
+
+		if (recv_sys->last_block_buf_start != NULL) {
+			mem_free(recv_sys->last_block_buf_start);
+		}
+
+		mem_free(recv_sys);
+		recv_sys = NULL;
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************
+Resets the file-scope recovery state variables to their initial values. */
+UNIV_INTERN
+void
+recv_sys_var_init(void)
+/*===================*/
+{
+	recv_lsn_checks_on = FALSE;
+
+	recv_n_pool_free_frames = 256;
+
+	recv_recovery_on = FALSE;
+
+#ifdef UNIV_LOG_ARCHIVE
+	recv_recovery_from_backup_on = FALSE;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	recv_needed_recovery = FALSE;
+
+	recv_lsn_checks_on = FALSE;
+
+	recv_log_scan_is_startup_type = FALSE;
+
+	recv_no_ibuf_operations = FALSE;
+
+	recv_scan_print_counter	= 0;
+
+	recv_previous_parsed_rec_type	= 999999;
+
+	recv_previous_parsed_rec_offset	= 0;
+
+	recv_previous_parsed_rec_is_multi = 0;
+
+	recv_max_parsed_page_no	= 0;
+
+	recv_n_pool_free_frames	= 256;
+
+	recv_max_page_lsn = 0;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/************************************************************
+Inits the recovery system for a recovery operation; no-op if heap exists. */
+UNIV_INTERN
+void
+recv_sys_init(
+/*==========*/
+	ulint	available_memory)	/*!< in: available memory in bytes */
+{
+	if (recv_sys->heap != NULL) {
+
+		return;
+	}
+
+	/* Initialize red-black tree for fast insertions into the
+	flush_list during recovery process.
+	As this initialization is done while holding the buffer pool
+	mutex we perform it before acquiring recv_sys->mutex. */
+#ifndef UNIV_HOTBACKUP
+	buf_flush_init_flush_rbt();
+
+	mutex_enter(&(recv_sys->mutex));
+
+	recv_sys->heap = mem_heap_create_in_buffer(256);
+#else /* !UNIV_HOTBACKUP */
+	recv_sys->heap = mem_heap_create(256);
+	recv_is_from_backup = TRUE;
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Set appropriate value of recv_n_pool_free_frames. */
+	if (buf_pool_get_curr_size() >= (10 * 1024 * 1024)) {
+		/* Buffer pool of at least 10 MB. */
+		recv_n_pool_free_frames = 512;
+	}
+
+	if (buf_pool_get_curr_size() >= (32 * 1024 * 1024)) {
+		/* Buffer pool of at least 32 MB: overrides the value above. */
+		recv_n_pool_free_frames = 1024;
+	}
+
+	recv_sys->buf = ut_malloc(RECV_PARSING_BUF_SIZE);
+	recv_sys->len = 0;
+	recv_sys->recovered_offset = 0;
+
+	recv_sys->addr_hash = hash_create(available_memory / 512);
+	recv_sys->n_addrs = 0;
+
+	recv_sys->apply_log_recs = FALSE;
+	recv_sys->apply_batch_on = FALSE;
+
+	recv_sys->last_block_buf_start = mem_alloc(2 * OS_FILE_LOG_BLOCK_SIZE);
+
+	recv_sys->last_block = ut_align(recv_sys->last_block_buf_start,
+					OS_FILE_LOG_BLOCK_SIZE);
+	recv_sys->found_corrupt_log = FALSE;
+
+	recv_max_page_lsn = 0;
+
+	mutex_exit(&(recv_sys->mutex));
+}
+
+/********************************************************//**
+Replaces the fully processed hash table with a fresh one; caller owns mutex. */
+static
+void
+recv_sys_empty_hash(void)
+/*=====================*/
+{
+	ut_ad(mutex_own(&(recv_sys->mutex)));
+
+	if (recv_sys->n_addrs != 0) {
+		fprintf(stderr,
+			"InnoDB: Error: %lu pages with log records"
+			" were left unprocessed!\n"
+			"InnoDB: Maximum page number with"
+			" log records on it %lu\n",
+			(ulong) recv_sys->n_addrs,
+			(ulong) recv_max_parsed_page_no);
+		ut_error;
+	}
+
+	hash_table_free(recv_sys->addr_hash);
+	mem_heap_empty(recv_sys->heap);
+
+	recv_sys->addr_hash = hash_create(buf_pool_get_curr_size() / 512);
+}
+
+#ifndef UNIV_HOTBACKUP
+# ifndef UNIV_LOG_DEBUG
+/********************************************************//**
+Frees the recovery system buffers (not recv_sys itself) and the flush_rbt. */
+static
+void
+recv_sys_debug_free(void)
+/*=====================*/
+{
+	mutex_enter(&(recv_sys->mutex));
+
+	hash_table_free(recv_sys->addr_hash);
+	mem_heap_free(recv_sys->heap);
+	ut_free(recv_sys->buf);
+	mem_free(recv_sys->last_block_buf_start);
+
+	recv_sys->buf = NULL;
+	recv_sys->heap = NULL;
+	recv_sys->addr_hash = NULL;
+	recv_sys->last_block_buf_start = NULL;
+
+	mutex_exit(&(recv_sys->mutex));
+
+	/* Free up the flush_rbt. */
+	buf_flush_free_flush_rbt();
+}
+# endif /* UNIV_LOG_DEBUG */
+
+/********************************************************//**
+Truncates possible corrupted or extra records from a log group. */
+static
+void
+recv_truncate_group(
+/*================*/
+	log_group_t*	group,		/*!< in: log group */
+	ib_uint64_t	recovered_lsn,	/*!< in: recovery succeeded up to this
+					lsn */
+	ib_uint64_t	limit_lsn,	/*!< in: this was the limit for
+					recovery */
+	ib_uint64_t	checkpoint_lsn,	/*!< in: recovery was started from this
+					checkpoint */
+	ib_uint64_t	archived_lsn)	/*!< in: the log has been archived up to
+					this lsn */
+{
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	end_lsn;
+	ib_uint64_t	finish_lsn1;
+	ib_uint64_t	finish_lsn2;
+	ib_uint64_t	finish_lsn;
+	ulint		len;
+	ulint		i;
+
+	if (archived_lsn == IB_ULONGLONG_MAX) {
+		/* Checkpoint was taken in the NOARCHIVELOG mode */
+		archived_lsn = checkpoint_lsn;
+	}
+
+	finish_lsn1 = ut_uint64_align_down(archived_lsn,
+					   OS_FILE_LOG_BLOCK_SIZE)
+		+ log_group_get_capacity(group);
+
+	finish_lsn2 = ut_uint64_align_up(recovered_lsn,
+					 OS_FILE_LOG_BLOCK_SIZE)
+		+ recv_sys->last_log_buf_size;
+
+	if (limit_lsn != IB_ULONGLONG_MAX) {
+		/* We do not know how far we should erase log records: erase
+		as much as possible */
+
+		finish_lsn = finish_lsn1;
+	} else {
+		/* It is enough to erase the length of the log buffer */
+		finish_lsn = finish_lsn1 < finish_lsn2
+			? finish_lsn1 : finish_lsn2;
+	}
+
+	ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+	/* Fill the first RECV_SCAN_SIZE bytes of the log buffer with zeros */
+	for (i = 0; i < RECV_SCAN_SIZE; i++) {
+
+		*(log_sys->buf + i) = '\0';
+	}
+
+	start_lsn = ut_uint64_align_down(recovered_lsn,
+					 OS_FILE_LOG_BLOCK_SIZE);
+
+	if (start_lsn != recovered_lsn) {
+		/* Copy the last incomplete log block to the log buffer and
+		edit its data length: */
+
+		ut_memcpy(log_sys->buf, recv_sys->last_block,
+			  OS_FILE_LOG_BLOCK_SIZE);
+		log_block_set_data_len(log_sys->buf,
+				       (ulint) (recovered_lsn - start_lsn));
+	}
+
+	if (start_lsn >= finish_lsn) {
+
+		return;
+	}
+
+	for (;;) {
+		end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+		if (end_lsn > finish_lsn) {
+
+			end_lsn = finish_lsn;
+		}
+
+		len = (ulint) (end_lsn - start_lsn);
+
+		log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+		if (end_lsn >= finish_lsn) {
+
+			return;
+		}
+
+		/* Zero the buffer again: later chunks must be all zeros */
+		for (i = 0; i < RECV_SCAN_SIZE; i++) {
+
+			*(log_sys->buf + i) = '\0';
+		}
+
+		start_lsn = end_lsn;
+	}
+}
+
+/********************************************************//**
+Copies the log segment between group->scanned_lsn and recovered_lsn from the
+most up-to-date log group to group, so that it contains the latest log data. */
+static
+void
+recv_copy_group(
+/*============*/
+	log_group_t*	up_to_date_group,	/*!< in: the most up-to-date log
+						group */
+	log_group_t*	group,			/*!< in: copy to this log
+						group */
+	ib_uint64_t	recovered_lsn)		/*!< in: recovery succeeded up
+						to this lsn */
+{
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	end_lsn;
+	ulint		len;
+
+	if (group->scanned_lsn >= recovered_lsn) {
+
+		return;
+	}
+
+	ut_a(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+	start_lsn = ut_uint64_align_down(group->scanned_lsn,
+					 OS_FILE_LOG_BLOCK_SIZE);
+	for (;;) {
+		end_lsn = start_lsn + RECV_SCAN_SIZE;
+
+		if (end_lsn > recovered_lsn) {
+			end_lsn = ut_uint64_align_up(recovered_lsn,
+						     OS_FILE_LOG_BLOCK_SIZE);
+		}
+
+		log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+				       up_to_date_group, start_lsn, end_lsn);
+
+		len = (ulint) (end_lsn - start_lsn);
+
+		log_group_write_buf(group, log_sys->buf, len, start_lsn, 0);
+
+		if (end_lsn >= recovered_lsn) {
+
+			return;
+		}
+
+		start_lsn = end_lsn;
+	}
+}
+
+/********************************************************//**
+Copies a log segment from the most up-to-date log group to the other log
+groups, so that they all contain the latest log data. Also writes the latest
+checkpoint info to the groups and updates the group memory structs. NOTE:
+log_sys->mutex is released and re-acquired while waiting for that write. */
+static
+void
+recv_synchronize_groups(
+/*====================*/
+	log_group_t*	up_to_date_group)	/*!< in: the most up-to-date
+						log group */
+{
+	log_group_t*	group;
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	end_lsn;
+	ib_uint64_t	recovered_lsn;
+	ib_uint64_t	limit_lsn;
+
+	recovered_lsn = recv_sys->recovered_lsn;
+	limit_lsn = recv_sys->limit_lsn;
+
+	/* Read the last recovered log block to the recovery system buffer:
+	the block is always incomplete */
+
+	start_lsn = ut_uint64_align_down(recovered_lsn,
+					 OS_FILE_LOG_BLOCK_SIZE);
+	end_lsn = ut_uint64_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+	ut_a(start_lsn != end_lsn);
+
+	log_group_read_log_seg(LOG_RECOVER, recv_sys->last_block,
+			       up_to_date_group, start_lsn, end_lsn);
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	while (group) {
+		if (group != up_to_date_group) {
+
+			/* Copy log data if needed: source group goes first */
+
+			recv_copy_group(up_to_date_group, group,
+					recovered_lsn);
+		}
+
+		/* Update the fields in the group struct to correspond to
+		recovered_lsn */
+
+		log_group_set_fields(group, recovered_lsn);
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	/* Copy the checkpoint info to the groups; remember that we have
+	incremented checkpoint_no by one, and the info will not be written
+	over the max checkpoint info, thus making the preservation of max
+	checkpoint info on disk certain */
+
+	log_groups_write_checkpoint_info();
+
+	mutex_exit(&(log_sys->mutex));
+
+	/* Wait for the checkpoint write to complete */
+	rw_lock_s_lock(&(log_sys->checkpoint_lock));
+	rw_lock_s_unlock(&(log_sys->checkpoint_lock));
+
+	mutex_enter(&(log_sys->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Checks the two checksums stored in the checkpoint info
+@return	TRUE if both checksums match */
+static
+ibool
+recv_check_cp_is_consistent(
+/*========================*/
+	const byte*	buf)	/*!< in: buffer containing checkpoint info */
+{
+	ulint	fold;
+
+	fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1);
+
+	if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+		    buf + LOG_CHECKPOINT_CHECKSUM_1)) {
+		return(FALSE);
+	}
+
+	fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN,
+			      LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN);
+
+	if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(
+		    buf + LOG_CHECKPOINT_CHECKSUM_2)) {
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Looks for the maximum consistent checkpoint from the log groups.
+@return	DB_SUCCESS, or DB_ERROR if no consistent checkpoint was found */
+static
+ulint
+recv_find_max_checkpoint(
+/*=====================*/
+	log_group_t**	max_group,	/*!< out: max group */
+	ulint*		max_field)	/*!< out: LOG_CHECKPOINT_1 or
+					LOG_CHECKPOINT_2 */
+{
+	log_group_t*	group;
+	ib_uint64_t	max_no;
+	ib_uint64_t	checkpoint_no;
+	ulint		field;
+	byte*		buf;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	max_no = 0;
+	*max_group = NULL;
+	*max_field = 0;
+
+	buf = log_sys->checkpoint_buf;
+
+	while (group) {
+		group->state = LOG_GROUP_CORRUPTED;
+
+		for (field = LOG_CHECKPOINT_1; field <= LOG_CHECKPOINT_2;
+		     field += LOG_CHECKPOINT_2 - LOG_CHECKPOINT_1) {
+
+			log_group_read_checkpoint_info(group, field);
+
+			if (!recv_check_cp_is_consistent(buf)) {
+#ifdef UNIV_DEBUG
+				if (log_debug_writes) {
+					fprintf(stderr,
+						"InnoDB: Checkpoint in group"
+						" %lu at %lu invalid, %lu\n",
+						(ulong) group->id,
+						(ulong) field,
+						(ulong) mach_read_from_4(
+							buf
+							+ LOG_CHECKPOINT_CHECKSUM_1));
+
+				}
+#endif /* UNIV_DEBUG */
+				goto not_consistent;
+			}
+
+			group->state = LOG_GROUP_OK;
+
+			group->lsn = mach_read_ull(
+				buf + LOG_CHECKPOINT_LSN);
+
+#ifdef UNIV_LOG_ARCHIVE
+#error "UNIV_LOG_ARCHIVE could not be enabled"
+#endif
+			{
+			ib_uint64_t tmp_lsn_offset = mach_read_ull(
+					buf + LOG_CHECKPOINT_ARCHIVED_LSN);
+				if (sizeof(ulint) != 4
+				    && tmp_lsn_offset != IB_ULONGLONG_MAX) {
+					group->lsn_offset = (ulint) tmp_lsn_offset;
+				} else {
+			group->lsn_offset = mach_read_from_4(
+				buf + LOG_CHECKPOINT_OFFSET);
+				}
+			}
+
+			checkpoint_no = mach_read_ull(
+				buf + LOG_CHECKPOINT_NO);
+
+#ifdef UNIV_DEBUG
+			if (log_debug_writes) {
+				fprintf(stderr,
+					"InnoDB: Checkpoint number %lu"
+					" found in group %lu\n",
+					(ulong) checkpoint_no,
+					(ulong) group->id);
+			}
+#endif /* UNIV_DEBUG */
+
+			if (checkpoint_no >= max_no) {
+				*max_group = group;
+				*max_field = field;
+				max_no = checkpoint_no;
+			}
+
+not_consistent:
+			;
+		}
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	if (*max_group == NULL) {
+
+		fprintf(stderr,
+			"InnoDB: No valid checkpoint found.\n"
+			"InnoDB: If this error appears when you are"
+			" creating an InnoDB database,\n"
+			"InnoDB: the problem may be that during"
+			" an earlier attempt you managed\n"
+			"InnoDB: to create the InnoDB data files,"
+			" but log file creation failed.\n"
+			"InnoDB: If that is the case, please refer to\n"
+			"InnoDB: " REFMAN "error-creating-innodb.html\n");
+		return(DB_ERROR);
+	}
+
+	return(DB_SUCCESS);
+}
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Reads the checkpoint info needed in hot backup.
+@return	TRUE if success, FALSE if no usable checkpoint was selected */
+UNIV_INTERN
+ibool
+recv_read_cp_info_for_backup(
+/*=========================*/
+	const byte*	hdr,	/*!< in: buffer containing the log group
+				header */
+	ib_uint64_t*	lsn,	/*!< out: checkpoint lsn */
+	ulint*		offset,	/*!< out: checkpoint offset in the log group */
+	ulint*		fsp_limit,/*!< out: fsp limit of space 0,
+				1000000000 if the database is running
+				with < version 3.23.50 of InnoDB */
+	ib_uint64_t*	cp_no,	/*!< out: checkpoint number */
+	ib_uint64_t*	first_header_lsn)
+				/*!< out: lsn of the start of the
+				first log file */
+{
+	ulint		max_cp		= 0;
+	ib_uint64_t	max_cp_no	= 0;
+	const byte*	cp_buf;
+
+	cp_buf = hdr + LOG_CHECKPOINT_1;
+
+	if (recv_check_cp_is_consistent(cp_buf)) {
+		max_cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO);
+		max_cp = LOG_CHECKPOINT_1;
+	}
+
+	cp_buf = hdr + LOG_CHECKPOINT_2;
+
+	if (recv_check_cp_is_consistent(cp_buf)) {
+		if (mach_read_ull(cp_buf + LOG_CHECKPOINT_NO) > max_cp_no) {
+			max_cp = LOG_CHECKPOINT_2;
+		}
+	}
+
+	if (max_cp == 0) {
+		return(FALSE);
+	}
+
+	cp_buf = hdr + max_cp;
+
+	*lsn = mach_read_ull(cp_buf + LOG_CHECKPOINT_LSN);
+	*offset = mach_read_from_4(cp_buf + LOG_CHECKPOINT_OFFSET);
+
+	/* If the user is running a pre-3.23.50 version of InnoDB, its
+	checkpoint data does not contain the fsp limit info */
+	if (mach_read_from_4(cp_buf + LOG_CHECKPOINT_FSP_MAGIC_N)
+	    == LOG_CHECKPOINT_FSP_MAGIC_N_VAL) {
+
+		*fsp_limit = mach_read_from_4(
+			cp_buf + LOG_CHECKPOINT_FSP_FREE_LIMIT);
+
+		if (*fsp_limit == 0) {
+			*fsp_limit = 1000000000;
+		}
+	} else {
+		*fsp_limit = 1000000000;
+	}
+
+	/*	fprintf(stderr, "fsp limit %lu MB\n", *fsp_limit); */
+
+	*cp_no = mach_read_ull(cp_buf + LOG_CHECKPOINT_NO);
+
+	*first_header_lsn = mach_read_ull(hdr + LOG_FILE_START_LSN);
+
+	return(TRUE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Checks the calculated 4-byte checksum against the checksum field of a log
+block.  We also accept a log block in the old format before
+InnoDB-3.23.52 where the checksum field contains the log block number.
+@return TRUE if ok, or if the log block may be in the format of InnoDB
+version predating 3.23.52; always TRUE if UNIV_LOG_DEBUG is defined */
+static
+ibool
+log_block_checksum_is_ok_or_old_format(
+/*===================================*/
+	const byte*	block)	/*!< in: pointer to a log block */
+{
+#ifdef UNIV_LOG_DEBUG
+	return(TRUE);
+#endif /* UNIV_LOG_DEBUG */
+	if (log_block_calc_checksum(block) == log_block_get_checksum(block)) {
+
+		return(TRUE);
+	}
+
+	if (log_block_get_hdr_no(block) == log_block_get_checksum(block)) {
+
+		/* We assume the log block is in the format of
+		InnoDB version < 3.23.52 and the block is ok */
+#if 0
+		fprintf(stderr,
+			"InnoDB: Scanned old format < InnoDB-3.23.52"
+			" log block number %lu\n",
+			log_block_get_hdr_no(block));
+#endif
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+#ifdef UNIV_HOTBACKUP
+/*******************************************************************//**
+Scans the log segment block by block; *n_bytes_scanned is set to the length
+of valid log scanned. */
+UNIV_INTERN
+void
+recv_scan_log_seg_for_backup(
+/*=========================*/
+	byte*		buf,		/*!< in: buffer containing log data */
+	ulint		buf_len,	/*!< in: data length in that buffer */
+	ib_uint64_t*	scanned_lsn,	/*!< in/out: lsn of buffer start,
+					we return scanned lsn */
+	ulint*		scanned_checkpoint_no,
+					/*!< in/out: 4 lowest bytes of the
+					highest scanned checkpoint number so
+					far */
+	ulint*		n_bytes_scanned)/*!< out: how much we were able to
+					scan, smaller than buf_len if log
+					data ended here */
+{
+	ulint	data_len;
+	byte*	log_block;
+	ulint	no;
+
+	*n_bytes_scanned = 0;
+
+	for (log_block = buf; log_block < buf + buf_len;
+	     log_block += OS_FILE_LOG_BLOCK_SIZE) {
+
+		no = log_block_get_hdr_no(log_block);
+
+#if 0
+		fprintf(stderr, "Log block header no %lu\n", no);
+#endif
+
+		if (no != log_block_convert_lsn_to_no(*scanned_lsn)
+		    || !log_block_checksum_is_ok_or_old_format(log_block)) {
+#if 0
+			fprintf(stderr,
+				"Log block n:o %lu, scanned lsn n:o %lu\n",
+				no, log_block_convert_lsn_to_no(*scanned_lsn));
+#endif
+			/* Garbage or an incompletely written log block */
+
+			log_block += OS_FILE_LOG_BLOCK_SIZE;
+#if 0
+			fprintf(stderr,
+				"Next log block n:o %lu\n",
+				log_block_get_hdr_no(log_block));
+#endif
+			break;
+		}
+
+		if (*scanned_checkpoint_no > 0
+		    && log_block_get_checkpoint_no(log_block)
+		    < *scanned_checkpoint_no
+		    && *scanned_checkpoint_no
+		    - log_block_get_checkpoint_no(log_block)
+		    > 0x80000000UL) {
+
+			/* Garbage from a log buffer flush which was made
+			before the most recent database recovery */
+#if 0
+			fprintf(stderr,
+				"Scanned cp n:o %lu, block cp n:o %lu\n",
+				*scanned_checkpoint_no,
+				log_block_get_checkpoint_no(log_block));
+#endif
+			break;
+		}
+
+		data_len = log_block_get_data_len(log_block);
+
+		*scanned_checkpoint_no
+			= log_block_get_checkpoint_no(log_block);
+		*scanned_lsn += data_len;
+
+		*n_bytes_scanned += data_len;
+
+		if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+			/* Log data ends here */
+
+#if 0
+			fprintf(stderr, "Log block data len %lu\n",
+				data_len);
+#endif
+			break;
+		}
+	}
+}
+#endif /* UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Tries to parse a single log record body and also applies it to a page if
+specified. File ops are parsed, but not applied in this function.
+The record type selects which parser is called.  When block is NULL the
+parsers are called with NULL page, page_zip and block arguments.  An
+unrecognized type sets recv_sys->found_corrupt_log and yields NULL.  Any
+index object handed back by mlog_parse_index() is freed, together with
+its table object, before this function returns.
+@return	log record end, NULL if not a complete record */
+static
+byte*
+recv_parse_or_apply_log_rec_body(
+/*=============================*/
+	byte		type,	/*!< in: log record type, one of the MLOG_
+				values handled in the switch below */
+	byte*		ptr,	/*!< in: pointer to a buffer */
+	byte*		end_ptr,/*!< in: pointer to the buffer end */
+	buf_block_t*	block,	/*!< in/out: buffer block or NULL; if
+				not NULL, then the log record is
+				applied to the page, and the log
+				record should be complete then */
+	mtr_t*		mtr)	/*!< in: mtr or NULL; should be non-NULL
+				if and only if block is non-NULL */
+{
+	dict_index_t*	index	= NULL;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+#ifdef UNIV_DEBUG
+	ulint		page_type;
+#endif /* UNIV_DEBUG */
+
+	ut_ad(!block == !mtr);
+
+	if (block) {
+		page = block->frame;
+		page_zip = buf_block_get_page_zip(block);
+		ut_d(page_type = fil_page_get_type(page));
+	} else {
+		page = NULL;
+		page_zip = NULL;
+		ut_d(page_type = FIL_PAGE_TYPE_ALLOCATED);
+	}
+
+	switch (type) {
+#ifdef UNIV_LOG_LSN_DEBUG
+	case MLOG_LSN:
+		/* The LSN is checked in recv_parse_log_rec(). */
+		break;
+#endif /* UNIV_LOG_LSN_DEBUG */
+	case MLOG_1BYTE: case MLOG_2BYTES: case MLOG_4BYTES: case MLOG_8BYTES:
+#ifdef UNIV_DEBUG
+		if (page && page_type == FIL_PAGE_TYPE_ALLOCATED
+		    && end_ptr >= ptr + 2) {
+			/* It is OK to set FIL_PAGE_TYPE and certain
+			list node fields on an empty page.  Any other
+			write is not OK. */
+
+			/* NOTE: There may be bogus assertion failures for
+			dict_hdr_create(), trx_rseg_header_create(),
+			trx_sys_create_doublewrite_buf(), and
+			trx_sysf_create().
+			These are only called during database creation. */
+			ulint	offs = mach_read_from_2(ptr);
+
+			switch (type) {
+			default:
+				ut_error;
+			case MLOG_2BYTES:
+				/* Note that this can fail when the
+				redo log has been written with something
+				older than InnoDB Plugin 1.0.4. */
+				ut_ad(offs == FIL_PAGE_TYPE
+				      || offs == IBUF_TREE_SEG_HEADER
+				      + IBUF_HEADER + FSEG_HDR_OFFSET
+				      || offs == PAGE_BTR_IBUF_FREE_LIST
+				      + PAGE_HEADER + FIL_ADDR_BYTE
+				      || offs == PAGE_BTR_IBUF_FREE_LIST
+				      + PAGE_HEADER + FIL_ADDR_BYTE
+				      + FIL_ADDR_SIZE
+				      || offs == PAGE_BTR_SEG_LEAF
+				      + PAGE_HEADER + FSEG_HDR_OFFSET
+				      || offs == PAGE_BTR_SEG_TOP
+				      + PAGE_HEADER + FSEG_HDR_OFFSET
+				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+				      + PAGE_HEADER + FIL_ADDR_BYTE
+				      + 0 /*FLST_PREV*/
+				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+				      + PAGE_HEADER + FIL_ADDR_BYTE
+				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
+				break;
+			case MLOG_4BYTES:
+				/* Note that this can fail when the
+				redo log has been written with something
+				older than InnoDB Plugin 1.0.4. */
+				ut_ad(0
+				      || offs == IBUF_TREE_SEG_HEADER
+				      + IBUF_HEADER + FSEG_HDR_SPACE
+				      || offs == IBUF_TREE_SEG_HEADER
+				      + IBUF_HEADER + FSEG_HDR_PAGE_NO
+				      || offs == PAGE_BTR_IBUF_FREE_LIST
+				      + PAGE_HEADER/* flst_init */
+				      || offs == PAGE_BTR_IBUF_FREE_LIST
+				      + PAGE_HEADER + FIL_ADDR_PAGE
+				      || offs == PAGE_BTR_IBUF_FREE_LIST
+				      + PAGE_HEADER + FIL_ADDR_PAGE
+				      + FIL_ADDR_SIZE
+				      || offs == PAGE_BTR_SEG_LEAF
+				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
+				      || offs == PAGE_BTR_SEG_LEAF
+				      + PAGE_HEADER + FSEG_HDR_SPACE
+				      || offs == PAGE_BTR_SEG_TOP
+				      + PAGE_HEADER + FSEG_HDR_PAGE_NO
+				      || offs == PAGE_BTR_SEG_TOP
+				      + PAGE_HEADER + FSEG_HDR_SPACE
+				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+				      + PAGE_HEADER + FIL_ADDR_PAGE
+				      + 0 /*FLST_PREV*/
+				      || offs == PAGE_BTR_IBUF_FREE_LIST_NODE
+				      + PAGE_HEADER + FIL_ADDR_PAGE
+				      + FIL_ADDR_SIZE /*FLST_NEXT*/);
+				break;
+			}
+		}
+#endif /* UNIV_DEBUG */
+		ptr = mlog_parse_nbytes(type, ptr, end_ptr, page, page_zip);
+		break;
+	case MLOG_REC_INSERT: case MLOG_COMP_REC_INSERT:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_REC_INSERT,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = page_cur_parse_insert_rec(FALSE, ptr, end_ptr,
+							block, index, mtr);
+		}
+		break;
+	case MLOG_REC_CLUST_DELETE_MARK: case MLOG_COMP_REC_CLUST_DELETE_MARK:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_REC_CLUST_DELETE_MARK,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = btr_cur_parse_del_mark_set_clust_rec(
+				ptr, end_ptr, page, page_zip, index);
+		}
+		break;
+	case MLOG_COMP_REC_SEC_DELETE_MARK:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		/* This log record type is obsolete, but we process it for
+		backward compatibility with MySQL 5.0.3 and 5.0.4. */
+		ut_a(!page || page_is_comp(page));
+		ut_a(!page_zip);
+		ptr = mlog_parse_index(ptr, end_ptr, TRUE, &index);
+		if (!ptr) {
+			break;
+		}
+		/* Fall through: the rest of the record is parsed like
+		MLOG_REC_SEC_DELETE_MARK */
+	case MLOG_REC_SEC_DELETE_MARK:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		ptr = btr_cur_parse_del_mark_set_sec_rec(ptr, end_ptr,
+							 page, page_zip);
+		break;
+	case MLOG_REC_UPDATE_IN_PLACE: case MLOG_COMP_REC_UPDATE_IN_PLACE:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_REC_UPDATE_IN_PLACE,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = btr_cur_parse_update_in_place(ptr, end_ptr, page,
+							    page_zip, index);
+		}
+		break;
+	case MLOG_LIST_END_DELETE: case MLOG_COMP_LIST_END_DELETE:
+	case MLOG_LIST_START_DELETE: case MLOG_COMP_LIST_START_DELETE:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_LIST_END_DELETE
+				     || type == MLOG_COMP_LIST_START_DELETE,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = page_parse_delete_rec_list(type, ptr, end_ptr,
+							 block, index, mtr);
+		}
+		break;
+	case MLOG_LIST_END_COPY_CREATED: case MLOG_COMP_LIST_END_COPY_CREATED:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_LIST_END_COPY_CREATED,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = page_parse_copy_rec_list_to_created_page(
+				ptr, end_ptr, block, index, mtr);
+		}
+		break;
+	case MLOG_PAGE_REORGANIZE: case MLOG_COMP_PAGE_REORGANIZE:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_PAGE_REORGANIZE,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = btr_parse_page_reorganize(ptr, end_ptr, index,
+							block, mtr);
+		}
+		break;
+	case MLOG_PAGE_CREATE: case MLOG_COMP_PAGE_CREATE:
+		/* Allow anything in page_type when creating a page. */
+		ut_a(!page_zip);
+		ptr = page_parse_create(ptr, end_ptr,
+					type == MLOG_COMP_PAGE_CREATE,
+					block, mtr);
+		break;
+	case MLOG_UNDO_INSERT:
+		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+		ptr = trx_undo_parse_add_undo_rec(ptr, end_ptr, page);
+		break;
+	case MLOG_UNDO_ERASE_END:
+		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+		ptr = trx_undo_parse_erase_page_end(ptr, end_ptr, page, mtr);
+		break;
+	case MLOG_UNDO_INIT:
+		/* Allow anything in page_type when creating a page. */
+		ptr = trx_undo_parse_page_init(ptr, end_ptr, page, mtr);
+		break;
+	case MLOG_UNDO_HDR_DISCARD:
+		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+		ptr = trx_undo_parse_discard_latest(ptr, end_ptr, page, mtr);
+		break;
+	case MLOG_UNDO_HDR_CREATE:
+	case MLOG_UNDO_HDR_REUSE:
+		ut_ad(!page || page_type == FIL_PAGE_UNDO_LOG);
+		ptr = trx_undo_parse_page_header(type, ptr, end_ptr,
+						 page, mtr);
+		break;
+	case MLOG_REC_MIN_MARK: case MLOG_COMP_REC_MIN_MARK:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		/* On a compressed page, MLOG_COMP_REC_MIN_MARK
+		will be followed by MLOG_COMP_REC_DELETE
+		or MLOG_ZIP_WRITE_HEADER(FIL_PAGE_PREV, FIL_NULL)
+		in the same mini-transaction. */
+		ut_a(type == MLOG_COMP_REC_MIN_MARK || !page_zip);
+		ptr = btr_parse_set_min_rec_mark(
+			ptr, end_ptr, type == MLOG_COMP_REC_MIN_MARK,
+			page, mtr);
+		break;
+	case MLOG_REC_DELETE: case MLOG_COMP_REC_DELETE:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+
+		if (NULL != (ptr = mlog_parse_index(
+				     ptr, end_ptr,
+				     type == MLOG_COMP_REC_DELETE,
+				     &index))) {
+			ut_a(!page
+			     || (ibool)!!page_is_comp(page)
+			     == dict_table_is_comp(index->table));
+			ptr = page_cur_parse_delete_rec(ptr, end_ptr,
+							block, index, mtr);
+		}
+		break;
+	case MLOG_IBUF_BITMAP_INIT:
+		/* Allow anything in page_type when creating a page. */
+		ptr = ibuf_parse_bitmap_init(ptr, end_ptr, block, mtr);
+		break;
+	case MLOG_INIT_FILE_PAGE:
+		/* Allow anything in page_type when creating a page. */
+		ptr = fsp_parse_init_file_page(ptr, end_ptr, block);
+		break;
+	case MLOG_WRITE_STRING:
+		ut_ad(!page || page_type != FIL_PAGE_TYPE_ALLOCATED);
+		ptr = mlog_parse_string(ptr, end_ptr, page, page_zip);
+		break;
+	case MLOG_FILE_CREATE:
+	case MLOG_FILE_RENAME:
+	case MLOG_FILE_DELETE:
+	case MLOG_FILE_CREATE2:
+		ptr = fil_op_log_parse_or_replay(ptr, end_ptr, type, 0, 0);
+		break;
+	case MLOG_ZIP_WRITE_NODE_PTR:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		ptr = page_zip_parse_write_node_ptr(ptr, end_ptr,
+						    page, page_zip);
+		break;
+	case MLOG_ZIP_WRITE_BLOB_PTR:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		ptr = page_zip_parse_write_blob_ptr(ptr, end_ptr,
+						    page, page_zip);
+		break;
+	case MLOG_ZIP_WRITE_HEADER:
+		ut_ad(!page || page_type == FIL_PAGE_INDEX);
+		ptr = page_zip_parse_write_header(ptr, end_ptr,
+						  page, page_zip);
+		break;
+	case MLOG_ZIP_PAGE_COMPRESS:
+		/* Allow anything in page_type when creating a page. */
+		ptr = page_zip_parse_compress(ptr, end_ptr,
+					      page, page_zip);
+		break;
+	default:
+		ptr = NULL;	/* unrecognized record type */
+		recv_sys->found_corrupt_log = TRUE;
+	}
+
+	if (index) {
+		dict_table_t*	table = index->table;
+
+		dict_mem_index_free(index);
+		dict_mem_table_free(table);
+	}
+
+	return(ptr);
+}
+
+/*********************************************************************//**
+Folds a page file address (space id, page number) into a single value,
+for use when inserting into or searching the hash table of log records.
+@return	folded value */
+UNIV_INLINE
+ulint
+recv_fold(
+/*======*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	const ulint	fold = ut_fold_ulint_pair(space, page_no);
+
+	return(fold);
+}
+
+/*********************************************************************//**
+Maps a page file address (space id, page number) to its hash value in
+recv_sys->addr_hash, for use when inserting into or searching the hash
+table of log records.
+@return	hash value calculated from the folded address */
+UNIV_INLINE
+ulint
+recv_hash(
+/*======*/
+	ulint	space,	/*!< in: space */
+	ulint	page_no)/*!< in: page number */
+{
+	const ulint	fold = recv_fold(space, page_no);
+
+	return(hash_calc_hash(fold, recv_sys->addr_hash));
+}
+
+/*********************************************************************//**
+Looks up the hashed file address struct of a page by walking the chain of
+the hash cell that the (space, page_no) pair maps to.
+@return	file address struct, NULL if not found from the hash table */
+static
+recv_addr_t*
+recv_get_fil_addr_struct(
+/*=====================*/
+	ulint	space,	/*!< in: space id */
+	ulint	page_no)/*!< in: page number */
+{
+	recv_addr_t*	addr;
+
+	for (addr = HASH_GET_FIRST(recv_sys->addr_hash,
+				   recv_hash(space, page_no));
+	     addr != NULL;
+	     addr = HASH_GET_NEXT(addr_hash, addr)) {
+
+		if (addr->space == space && addr->page_no == page_no) {
+
+			return(addr);
+		}
+	}
+
+	/* The chain holds no entry for this page */
+
+	return(NULL);
+}
+
+/*******************************************************************//**
+Stores a copy of a log record in the hash table of log records, under the
+file address of the page it belongs to.  Nothing is stored if the
+tablespace is reported as deleted or being deleted. */
+static
+void
+recv_add_to_hash_table(
+/*===================*/
+	byte		type,		/*!< in: log record type */
+	ulint		space,		/*!< in: space id */
+	ulint		page_no,	/*!< in: page number */
+	byte*		body,		/*!< in: log record body */
+	byte*		rec_end,	/*!< in: log record end */
+	ib_uint64_t	start_lsn,	/*!< in: start lsn of the mtr */
+	ib_uint64_t	end_lsn)	/*!< in: end lsn of the mtr */
+{
+	recv_t*		rec;
+	recv_addr_t*	addr;
+	recv_data_t**	link;
+	ulint		body_len;
+
+	if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) {
+		/* The tablespace does not exist any more: do not store the
+		log record */
+
+		return;
+	}
+
+	body_len = rec_end - body;
+
+	if (srv_recovery_stats) {
+		recv_sys->stats_log_recs++;
+		recv_sys->stats_log_len_sum += body_len;
+	}
+
+	/* The record descriptor */
+
+	rec = mem_heap_alloc(recv_sys->heap, sizeof(recv_t));
+	rec->type = type;
+	rec->len = body_len;
+	rec->start_lsn = start_lsn;
+	rec->end_lsn = end_lsn;
+
+	/* The per-page entry: created on the first record for the page */
+
+	addr = recv_get_fil_addr_struct(space, page_no);
+
+	if (addr == NULL) {
+		addr = mem_heap_alloc(recv_sys->heap, sizeof(recv_addr_t));
+		addr->space = space;
+		addr->page_no = page_no;
+		addr->state = RECV_NOT_PROCESSED;
+
+		UT_LIST_INIT(addr->rec_list);
+
+		HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash,
+			    recv_fold(space, page_no), addr);
+		recv_sys->n_addrs++;
+	}
+
+	UT_LIST_ADD_LAST(rec_list, addr->rec_list, rec);
+
+	/* Store the log record body in chunks of less than UNIV_PAGE_SIZE:
+	recv_sys->heap grows into the buffer pool, and bigger chunks could not
+	be allocated.  Each chunk is a recv_data_t header immediately
+	followed by the copied bytes; the chunks form a singly linked
+	list that starts at rec->data and ends with NULL. */
+
+	link = &(rec->data);
+
+	while (body < rec_end) {
+		recv_data_t*	chunk;
+		ulint		chunk_len = rec_end - body;
+
+		if (chunk_len > RECV_DATA_BLOCK_SIZE) {
+			chunk_len = RECV_DATA_BLOCK_SIZE;
+		}
+
+		chunk = mem_heap_alloc(recv_sys->heap,
+				       sizeof(recv_data_t) + chunk_len);
+		*link = chunk;
+
+		memcpy(chunk + 1, body, chunk_len);
+
+		link = &(chunk->next);
+		body += chunk_len;
+	}
+
+	*link = NULL;
+}
+
+/*********************************************************************//**
+Gathers the body of a log record, which is stored as a linked list of
+chunks of at most RECV_DATA_BLOCK_SIZE bytes each, into one contiguous
+buffer. */
+static
+void
+recv_data_copy_to_buf(
+/*==================*/
+	byte*	buf,	/*!< in: buffer of length at least recv->len */
+	recv_t*	recv)	/*!< in: log record */
+{
+	recv_data_t*	chunk	= recv->data;
+	ulint		left	= recv->len;
+
+	while (left > 0) {
+		/* Every chunk except possibly the last one is full */
+		const ulint	n = (left > RECV_DATA_BLOCK_SIZE)
+			? RECV_DATA_BLOCK_SIZE
+			: left;
+
+		/* The payload starts right after the chunk header */
+		ut_memcpy(buf, (byte*) (chunk + 1), n);
+
+		buf += n;
+		left -= n;
+		chunk = chunk->next;
+	}
+}
+
+/************************************************************************//**
+Applies the hashed log records to the page, if the page lsn is less than the
+lsn of a log record. This can be called when a buffer page has just been
+read in, or also for a page already in the buffer pool.
+
+Outline:
+1. Under recv_sys->mutex, look up the file address struct of the page.
+   Return at once if log application is switched off, if no records are
+   hashed for the page, or if the page is already being handled
+   elsewhere.  Otherwise mark the page RECV_BEING_PROCESSED.
+2. Start a mini-transaction with logging disabled and (outside hot
+   backup) x-latch the page.
+3. Walk the record list in order and apply every record whose start lsn
+   is not below the page lsn, stamping the page lsn fields after each
+   applied record.  An MLOG_INIT_FILE_PAGE record resets the page lsn.
+4. Under recv_sys->mutex, mark the page RECV_PROCESSED and decrement
+   recv_sys->n_addrs; then report the modification range to the flush
+   code if anything was applied. */
+UNIV_INTERN
+void
+recv_recover_page_func(
+/*===================*/
+#ifndef UNIV_HOTBACKUP
+	ibool		just_read_in,
+				/*!< in: TRUE if the i/o handler calls
+				this for a freshly read page */
+#endif /* !UNIV_HOTBACKUP */
+	buf_block_t*	block)	/*!< in/out: buffer block */
+{
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	recv_addr_t*	recv_addr;
+	recv_t*		recv;
+	byte*		buf;
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	end_lsn;
+	ib_uint64_t	page_lsn;
+	ib_uint64_t	page_lsn_orig;
+	ib_uint64_t	page_newest_lsn;
+	ibool		modification_to_page;
+#ifndef UNIV_HOTBACKUP
+	ibool		success;
+#else /* !UNIV_HOTBACKUP */
+	/* The just_read_in parameter does not exist in a hot backup
+	build, but the code below tests it.  Supply a constant so that
+	the function still compiles; FALSE is what the backup caller in
+	this file passes through recv_recover_page(). */
+	const ibool	just_read_in = FALSE;
+#endif /* !UNIV_HOTBACKUP */
+	mtr_t		mtr;
+
+	mutex_enter(&(recv_sys->mutex));
+
+	if (recv_sys->apply_log_recs == FALSE) {
+
+		/* Log records should not be applied now */
+
+		mutex_exit(&(recv_sys->mutex));
+
+		return;
+	}
+
+	recv_addr = recv_get_fil_addr_struct(buf_block_get_space(block),
+					     buf_block_get_page_no(block));
+
+	if ((recv_addr == NULL)
+		/* bugfix: http://bugs.mysql.com/bug.php?id=44140 */
+	    || (recv_addr->state == RECV_BEING_READ && !just_read_in)
+	    || (recv_addr->state == RECV_BEING_PROCESSED)
+	    || (recv_addr->state == RECV_PROCESSED)) {
+
+		/* Nothing to apply, or the page is taken care of
+		elsewhere */
+
+		mutex_exit(&(recv_sys->mutex));
+
+		return;
+	}
+
+#if 0
+	fprintf(stderr, "Recovering space %lu, page %lu\n",
+		buf_block_get_space(block), buf_block_get_page_no(block));
+#endif
+
+	recv_addr->state = RECV_BEING_PROCESSED;
+
+	if (srv_recovery_stats) {
+		if (just_read_in) {
+			recv_sys->stats_recover_pages_with_read++;
+		} else {
+			recv_sys->stats_recover_pages_without_read++;
+		}
+	}
+
+	mutex_exit(&(recv_sys->mutex));
+
+	mtr_start(&mtr);
+	mtr_set_log_mode(&mtr, MTR_LOG_NONE);
+
+	page = block->frame;
+	page_zip = buf_block_get_page_zip(block);
+
+#ifndef UNIV_HOTBACKUP
+	if (just_read_in) {
+		/* Move the ownership of the x-latch on the page to
+		this OS thread, so that we can acquire a second
+		x-latch on it.  This is needed for the operations to
+		the page to pass the debug checks. */
+
+		rw_lock_x_lock_move_ownership(&block->lock);
+	}
+
+	success = buf_page_get_known_nowait(RW_X_LATCH, block,
+					    BUF_KEEP_OLD,
+					    __FILE__, __LINE__,
+					    &mtr);
+	ut_a(success);
+
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Read the newest modification lsn from the page */
+	page_lsn = mach_read_ull(page + FIL_PAGE_LSN);
+	page_lsn_orig = page_lsn;
+
+#ifndef UNIV_HOTBACKUP
+	/* It may be that the page has been modified in the buffer
+	pool: read the newest modification lsn there */
+
+	page_newest_lsn = buf_page_get_newest_modification(&block->page);
+
+	if (page_newest_lsn) {
+
+		page_lsn = page_newest_lsn;
+	}
+#else /* !UNIV_HOTBACKUP */
+	/* In recovery from a backup we do not really use the buffer pool */
+	page_newest_lsn = 0;
+#endif /* !UNIV_HOTBACKUP */
+
+	modification_to_page = FALSE;
+	start_lsn = end_lsn = 0;
+
+	if (srv_recovery_stats) {
+		/* Statistics are based on the lsn found in the page
+		frame, not on the buffer pool modification lsn */
+		mutex_enter(&(recv_sys->mutex));
+		if (page_lsn_orig && recv_sys->stats_oldest_modified_lsn > page_lsn_orig) {
+			recv_sys->stats_oldest_modified_lsn = page_lsn_orig;
+		}
+		if (page_lsn_orig && recv_sys->stats_newest_modified_lsn < page_lsn_orig) {
+			recv_sys->stats_newest_modified_lsn = page_lsn_orig;
+		}
+		if (UT_LIST_GET_LAST(recv_addr->rec_list)->start_lsn
+		    < page_lsn_orig) {
+			recv_sys->stats_pages_already_new++;
+		}
+		mutex_exit(&(recv_sys->mutex));
+	}
+
+	recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
+
+	while (recv) {
+		/* Tracks the end lsn of the last record visited; it is
+		passed to buf_flush_recv_note_modification() below */
+		end_lsn = recv->end_lsn;
+
+		if (recv->len > RECV_DATA_BLOCK_SIZE) {
+			/* We have to copy the record body to a separate
+			buffer */
+
+			buf = mem_alloc(recv->len);
+
+			recv_data_copy_to_buf(buf, recv);
+		} else {
+			/* The body fits in one chunk: use it in place */
+			buf = ((byte*)(recv->data)) + sizeof(recv_data_t);
+		}
+
+		if (recv->type == MLOG_INIT_FILE_PAGE) {
+			/* The page is initialized anew: forget the lsn
+			that was read from the page frame and clear the
+			lsn fields stored in the page */
+			page_lsn = page_newest_lsn;
+
+			memset(FIL_PAGE_LSN + page, 0, 8);
+			memset(UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM
+			       + page, 0, 8);
+
+			if (page_zip) {
+				memset(FIL_PAGE_LSN + page_zip->data, 0, 8);
+			}
+		}
+
+		if (recv->start_lsn >= page_lsn) {
+
+			/* The lsn stamped on the page after applying this
+			record.  Named so as not to shadow the
+			function-level end_lsn. */
+			ib_uint64_t	page_end_lsn;
+
+			if (!modification_to_page) {
+
+				modification_to_page = TRUE;
+				start_lsn = recv->start_lsn;
+			}
+
+#ifdef UNIV_DEBUG
+			if (log_debug_writes) {
+				fprintf(stderr,
+					"InnoDB: Applying log rec"
+					" type %lu len %lu"
+					" to space %lu page no %lu\n",
+					(ulong) recv->type, (ulong) recv->len,
+					(ulong) recv_addr->space,
+					(ulong) recv_addr->page_no);
+			}
+#endif /* UNIV_DEBUG */
+
+			recv_parse_or_apply_log_rec_body(recv->type, buf,
+							 buf + recv->len,
+							 block, &mtr);
+
+			if (srv_recovery_stats) {
+				mutex_enter(&(recv_sys->mutex));
+				recv_sys->stats_applied_log_recs++;
+				recv_sys->stats_applied_log_len_sum += recv->len;
+				mutex_exit(&(recv_sys->mutex));
+			}
+
+			page_end_lsn = recv->start_lsn + recv->len;
+			mach_write_ull(FIL_PAGE_LSN + page, page_end_lsn);
+			mach_write_ull(UNIV_PAGE_SIZE
+				       - FIL_PAGE_END_LSN_OLD_CHKSUM
+				       + page, page_end_lsn);
+
+			if (page_zip) {
+				mach_write_ull(FIL_PAGE_LSN
+					       + page_zip->data, page_end_lsn);
+			}
+		}
+
+		if (recv->len > RECV_DATA_BLOCK_SIZE) {
+			/* Release the temporary copy made above */
+			mem_free(buf);
+		}
+
+		recv = UT_LIST_GET_NEXT(rec_list, recv);
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	if (fil_page_get_type(page) == FIL_PAGE_INDEX) {
+		page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
+
+		if (page_zip) {
+			ut_a(page_zip_validate_low(page_zip, page, FALSE));
+		}
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	mutex_enter(&(recv_sys->mutex));
+
+	if (recv_max_page_lsn < page_lsn) {
+		recv_max_page_lsn = page_lsn;
+	}
+
+	recv_addr->state = RECV_PROCESSED;
+
+	ut_a(recv_sys->n_addrs);
+	recv_sys->n_addrs--;
+
+	mutex_exit(&(recv_sys->mutex));
+
+#ifndef UNIV_HOTBACKUP
+	if (modification_to_page) {
+		ut_a(block);
+
+		buf_flush_recv_note_modification(block, start_lsn, end_lsn);
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Make sure that committing mtr does not change the modification
+	lsn values of page */
+
+	mtr.modifications = FALSE;
+
+	mtr_commit(&mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************************//**
+Requests a read of those pages which have hashed log records, are not
+found by buf_page_peek() and are still in state RECV_NOT_PROCESSED, within
+the RECV_READ_AHEAD_AREA-aligned range of page numbers that contains the
+given page.  Each selected page is moved to state RECV_BEING_READ.
+@return	number of pages found */
+static
+ulint
+recv_read_in_area(
+/*==============*/
+	ulint	space,	/*!< in: space */
+	ulint	zip_size,/*!< in: compressed page size in bytes, or 0 */
+	ulint	page_no)/*!< in: page number */
+{
+	ulint		page_nos[RECV_READ_AHEAD_AREA];
+	ulint		n_found	= 0;
+	const ulint	first	= page_no - (page_no % RECV_READ_AHEAD_AREA);
+	const ulint	limit	= first + RECV_READ_AHEAD_AREA;
+	ulint		cur;
+
+	for (cur = first; cur < limit; cur++) {
+		recv_addr_t*	addr = recv_get_fil_addr_struct(space, cur);
+
+		if (addr == NULL || buf_page_peek(space, cur)) {
+			/* No log records for this page, or
+			buf_page_peek() reports the page */
+			continue;
+		}
+
+		mutex_enter(&(recv_sys->mutex));
+
+		if (addr->state == RECV_NOT_PROCESSED) {
+			addr->state = RECV_BEING_READ;
+
+			page_nos[n_found++] = cur;
+		}
+
+		mutex_exit(&(recv_sys->mutex));
+	}
+
+	if (srv_recovery_stats && n_found) {
+		mutex_enter(&(recv_sys->mutex));
+		recv_sys->stats_read_requested_pages += n_found;
+		recv_sys->stats_read_in_area[n_found - 1]++;
+		mutex_exit(&(recv_sys->mutex));
+	}
+
+	buf_read_recv_pages(FALSE, space, zip_size, page_nos, n_found);
+
+	return(n_found);
+}
+
+/*******************************************************************//**
+Empties the hash table of stored log records, applying them to appropriate
+pages.
+If another apply batch is on (recv_sys->apply_batch_on), this function
+sleeps and retries until that batch has ended.  It then visits every cell
+of recv_sys->addr_hash: a page in state RECV_NOT_PROCESSED is recovered
+directly when buf_page_peek() reports it, otherwise a read of the
+surrounding area is requested with recv_read_in_area().  After the scan
+the function waits for recv_sys->n_addrs to reach 0 and empties the hash
+table.  Progress is printed to stderr once the first unprocessed page
+has been seen. */
+UNIV_INTERN
+void
+recv_apply_hashed_log_recs(
+/*=======================*/
+	ibool	allow_ibuf)	/*!< in: if TRUE, also ibuf operations are
+				allowed during the application; if FALSE,
+				no ibuf operations are allowed, and after
+				the application all file pages are flushed to
+				disk and invalidated in buffer pool: this
+				alternative means that no new log records
+				can be generated during the application;
+				the caller must in this case own the log
+				mutex */
+{
+	recv_addr_t* recv_addr;
+	ulint	i;
+	ulint	n_pages;
+	ibool	has_printed	= FALSE;
+	mtr_t	mtr;
+loop:
+	mutex_enter(&(recv_sys->mutex));
+
+	if (recv_sys->apply_batch_on) {
+
+		mutex_exit(&(recv_sys->mutex));
+
+		os_thread_sleep(500000);	/* another batch is on */
+
+		goto loop;
+	}
+
+	ut_ad(!allow_ibuf == mutex_own(&log_sys->mutex));
+
+	if (!allow_ibuf) {
+		recv_no_ibuf_operations = TRUE;
+	}
+
+	recv_sys->apply_log_recs = TRUE;
+	recv_sys->apply_batch_on = TRUE;
+
+	for (i = 0; i < hash_get_n_cells(recv_sys->addr_hash); i++) {
+
+		recv_addr = HASH_GET_FIRST(recv_sys->addr_hash, i);
+
+		while (recv_addr) {
+			ulint	space = recv_addr->space;
+			ulint	zip_size = fil_space_get_zip_size(space);
+			ulint	page_no = recv_addr->page_no;
+
+			if (recv_addr->state == RECV_NOT_PROCESSED) {
+				if (!has_printed) {
+					ut_print_timestamp(stderr);
+					fputs("  InnoDB: Starting an"
+					      " apply batch of log records"
+					      " to the database...\n"
+					      "InnoDB: Progress in percents: ",
+					      stderr);
+					has_printed = TRUE;
+				}
+
+				mutex_exit(&(recv_sys->mutex));
+
+				if (buf_page_peek(space, page_no)) {
+					buf_block_t*	block;
+
+					mtr_start(&mtr);
+
+					block = buf_page_get(
+						space, zip_size, page_no,
+						RW_X_LATCH, &mtr);
+					buf_block_dbg_add_level(
+						block, SYNC_NO_ORDER_CHECK);
+
+					recv_recover_page(FALSE, block);
+					mtr_commit(&mtr);
+				} else {
+					recv_read_in_area(space, zip_size,
+							  page_no);
+				}
+
+				mutex_enter(&(recv_sys->mutex));
+			}
+
+			recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+		}
+
+		if (has_printed
+		    && (i * 100) / hash_get_n_cells(recv_sys->addr_hash)
+		    != ((i + 1) * 100)
+		    / hash_get_n_cells(recv_sys->addr_hash)) {
+
+			fprintf(stderr, "%lu ", (ulong)
+				((i * 100)
+				 / hash_get_n_cells(recv_sys->addr_hash)));
+		}
+	}
+
+	/* Wait until all the pages have been processed:
+	recv_sys->n_addrs is decremented in recv_recover_page_func()
+	each time a page has been handled.  The mutex is released
+	while sleeping. */
+
+	while (recv_sys->n_addrs != 0) {
+
+		mutex_exit(&(recv_sys->mutex));
+
+		os_thread_sleep(500000);
+
+		mutex_enter(&(recv_sys->mutex));
+	}
+
+	if (has_printed) {
+
+		fprintf(stderr, "\n");
+	}
+
+	if (!allow_ibuf) {
+		/* Flush all the file pages to disk and invalidate them in
+		the buffer pool.  Both mutexes are released for the
+		duration of the flush and re-acquired afterwards, the log
+		mutex first. */
+
+		ut_d(recv_no_log_write = TRUE);
+		mutex_exit(&(recv_sys->mutex));
+		mutex_exit(&(log_sys->mutex));
+
+		n_pages = buf_flush_batch(BUF_FLUSH_LIST, ULINT_MAX,
+					  IB_ULONGLONG_MAX);
+		ut_a(n_pages != ULINT_UNDEFINED);
+
+		buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+		buf_pool_invalidate();
+
+		mutex_enter(&(log_sys->mutex));
+		mutex_enter(&(recv_sys->mutex));
+		ut_d(recv_no_log_write = FALSE);
+
+		recv_no_ibuf_operations = FALSE;
+	}
+
+	recv_sys->apply_log_recs = FALSE;
+	recv_sys->apply_batch_on = FALSE;
+
+	recv_sys_empty_hash();
+
+	if (has_printed) {
+		fprintf(stderr, "InnoDB: Apply batch completed\n");
+
+		if (srv_recovery_stats) {
+			recv_sys->stats_recv_turns++;
+		}
+	}
+
+	mutex_exit(&(recv_sys->mutex));
+}
+#else /* !UNIV_HOTBACKUP */
+/*******************************************************************//**
+Applies log records in the hash table to a backup.
+Every hashed page address is handled in turn, reusing the single block
+back_block1: the tablespace is extended if needed, the page is read from
+the tablespace file, the hashed log records are applied to it, and the
+page is written back.  Addresses whose tablespace does not exist are
+marked processed and skipped.  A failure to extend, read, decompress or
+write terminates the process with exit(1).  Progress is printed to
+stderr, and the hash table is emptied at the end. */
+UNIV_INTERN
+void
+recv_apply_log_recs_for_backup(void)
+/*================================*/
+{
+	recv_addr_t*	recv_addr;
+	ulint		n_hash_cells;
+	buf_block_t*	block;
+	ulint		actual_size;
+	ibool		success;
+	ulint		error;
+	ulint		i;
+
+	recv_sys->apply_log_recs = TRUE;
+	recv_sys->apply_batch_on = TRUE;
+
+	block = back_block1;
+
+	fputs("InnoDB: Starting an apply batch of log records"
+	      " to the database...\n"
+	      "InnoDB: Progress in percents: ", stderr);
+
+	n_hash_cells = hash_get_n_cells(recv_sys->addr_hash);
+
+	for (i = 0; i < n_hash_cells; i++) {
+		/* The address hash table is externally chained */
+		recv_addr = hash_get_nth_cell(recv_sys->addr_hash, i)->node;
+
+		while (recv_addr != NULL) {
+
+			ulint	zip_size
+				= fil_space_get_zip_size(recv_addr->space);
+
+			if (zip_size == ULINT_UNDEFINED) {
+#if 0
+				fprintf(stderr,
+					"InnoDB: Warning: cannot apply"
+					" log record to"
+					" tablespace %lu page %lu,\n"
+					"InnoDB: because tablespace with"
+					" that id does not exist.\n",
+					recv_addr->space, recv_addr->page_no);
+#endif
+				/* The tablespace does not exist: account
+				for the address as processed and move on */
+				recv_addr->state = RECV_PROCESSED;
+
+				ut_a(recv_sys->n_addrs);
+				recv_sys->n_addrs--;
+
+				goto skip_this_recv_addr;
+			}
+
+			/* We simulate a page read made by the buffer pool, to
+			make sure the recovery apparatus works ok. We must init
+			the block. */
+
+			buf_page_init_for_backup_restore(
+				recv_addr->space, recv_addr->page_no,
+				zip_size, block);
+
+			/* Extend the tablespace's last file if the page_no
+			does not fall inside its bounds; we assume the last
+			file is auto-extending, and ibbackup copied the file
+			when it still was smaller */
+
+			success = fil_extend_space_to_desired_size(
+				&actual_size,
+				recv_addr->space, recv_addr->page_no + 1);
+			if (!success) {
+				/* Report the size that was requested above,
+				and cast for %lu like the other messages in
+				this function do */
+				fprintf(stderr,
+					"InnoDB: Fatal error: cannot extend"
+					" tablespace %lu to hold %lu pages\n",
+					(ulong) recv_addr->space,
+					(ulong) (recv_addr->page_no + 1));
+
+				exit(1);
+			}
+
+			/* Read the page from the tablespace file using the
+			fil0fil.c routines */
+
+			if (zip_size) {
+				error = fil_io(OS_FILE_READ, TRUE,
+					       recv_addr->space, zip_size,
+					       recv_addr->page_no, 0, zip_size,
+					       block->page.zip.data, NULL);
+				if (error == DB_SUCCESS
+				    && !buf_zip_decompress(block, TRUE)) {
+					exit(1);
+				}
+			} else {
+				error = fil_io(OS_FILE_READ, TRUE,
+					       recv_addr->space, 0,
+					       recv_addr->page_no, 0,
+					       UNIV_PAGE_SIZE,
+					       block->frame, NULL);
+			}
+
+			if (error != DB_SUCCESS) {
+				fprintf(stderr,
+					"InnoDB: Fatal error: cannot read"
+					" from tablespace"
+					" %lu page number %lu\n",
+					(ulong) recv_addr->space,
+					(ulong) recv_addr->page_no);
+
+				exit(1);
+			}
+
+			/* Apply the log records to this page */
+			recv_recover_page(FALSE, block);
+
+			/* Write the page back to the tablespace file using the
+			fil0fil.c routines */
+
+			buf_flush_init_for_writing(
+				block->frame, buf_block_get_page_zip(block),
+				mach_read_ull(block->frame + FIL_PAGE_LSN));
+
+			if (zip_size) {
+				error = fil_io(OS_FILE_WRITE, TRUE,
+					       recv_addr->space, zip_size,
+					       recv_addr->page_no, 0,
+					       zip_size,
+					       block->page.zip.data, NULL);
+			} else {
+				error = fil_io(OS_FILE_WRITE, TRUE,
+					       recv_addr->space, 0,
+					       recv_addr->page_no, 0,
+					       UNIV_PAGE_SIZE,
+					       block->frame, NULL);
+			}
+
+			if (error != DB_SUCCESS) {
+				/* A page that could not be written back
+				would silently lack the applied log records:
+				treat it like a failed read */
+				fprintf(stderr,
+					"InnoDB: Fatal error: cannot write"
+					" to tablespace"
+					" %lu page number %lu\n",
+					(ulong) recv_addr->space,
+					(ulong) recv_addr->page_no);
+
+				exit(1);
+			}
+skip_this_recv_addr:
+			recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
+		}
+
+		if ((100 * i) / n_hash_cells
+		    != (100 * (i + 1)) / n_hash_cells) {
+			fprintf(stderr, "%lu ",
+				(ulong) ((100 * i) / n_hash_cells));
+			fflush(stderr);
+		}
+	}
+
+	recv_sys_empty_hash();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*******************************************************************//**
+Parses the log record that starts at ptr, if it is wholly inside the
+buffer [ptr, end_ptr). The one-byte markers MLOG_MULTI_REC_END and
+MLOG_DUMMY_RECORD are recognized directly and have the length 1.
+@return	length of the record, or 0 if the record was not complete */
+static
+ulint
+recv_parse_log_rec(
+/*===============*/
+	byte*	ptr,	/*!< in: pointer to a buffer */
+	byte*	end_ptr,/*!< in: pointer to the buffer end */
+	byte*	type,	/*!< out: type */
+	ulint*	space,	/*!< out: space id */
+	ulint*	page_no,/*!< out: page number */
+	byte**	body)	/*!< out: log record body start */
+{
+	byte*	rec_end;
+
+	*body = NULL;
+
+	if (ptr == end_ptr) {
+		/* Nothing left in the buffer */
+		return(0);
+	}
+
+	if (*ptr == MLOG_MULTI_REC_END || *ptr == MLOG_DUMMY_RECORD) {
+		const ibool	is_dummy = (*ptr == MLOG_DUMMY_RECORD);
+
+		/* Both markers consist of the type byte alone */
+		*type = *ptr;
+
+		if (is_dummy) {
+			*space = ULINT_UNDEFINED - 1; /* For debugging */
+		}
+
+		return(1);
+	}
+
+	rec_end = mlog_parse_initial_log_record(ptr, end_ptr, type, space,
+						page_no);
+	/* The body starts where the initial part ends (NULL on failure) */
+	*body = rec_end;
+
+	if (UNIV_UNLIKELY(rec_end == NULL)) {
+		return(0);
+	}
+
+#ifdef UNIV_LOG_LSN_DEBUG
+	if (*type == MLOG_LSN) {
+		ib_uint64_t	lsn = (ib_uint64_t) *space << 32 | *page_no;
+# ifdef UNIV_LOG_DEBUG
+		ut_a(lsn == log_sys->old_lsn);
+# else /* UNIV_LOG_DEBUG */
+		ut_a(lsn == recv_sys->recovered_lsn);
+# endif /* UNIV_LOG_DEBUG */
+	}
+#endif /* UNIV_LOG_LSN_DEBUG */
+
+	rec_end = recv_parse_or_apply_log_rec_body(*type, rec_end, end_ptr,
+						   NULL, NULL);
+	if (UNIV_UNLIKELY(rec_end == NULL)) {
+		return(0);
+	}
+
+	/* Track the highest page number seen in a fully parsed record */
+	if (recv_max_parsed_page_no < *page_no) {
+		recv_max_parsed_page_no = *page_no;
+	}
+
+	return((ulint) (rec_end - ptr));
+}
+
+/*******************************************************//**
+Advances an lsn by a number of log data bytes, adding the block header and
+trailer bytes of every log block boundary that the data crosses.
+@return	the new lsn */
+static
+ib_uint64_t
+recv_calc_lsn_on_data_add(
+/*======================*/
+	ib_uint64_t	lsn,	/*!< in: old lsn */
+	ib_uint64_t	len)	/*!< in: this many bytes of data is
+				added, log block headers not included */
+{
+	const ulint	payload = OS_FILE_LOG_BLOCK_SIZE
+		- LOG_BLOCK_HDR_SIZE - LOG_BLOCK_TRL_SIZE;
+	const ulint	framing = LOG_BLOCK_HDR_SIZE + LOG_BLOCK_TRL_SIZE;
+	ulint		used;	/* data bytes already in lsn's block */
+	ulint		advance;
+
+	used = (((ulint) lsn) % OS_FILE_LOG_BLOCK_SIZE) - LOG_BLOCK_HDR_SIZE;
+	ut_ad(used < payload);
+
+	advance = (ulint) len;
+	advance += (advance + used) / payload * framing;
+	return(lsn + advance);
+}
+
+#ifdef UNIV_LOG_DEBUG
+/*******************************************************//**
+Asserts that every proper prefix of a complete log record is reported as
+incomplete (length 0) by recv_parse_log_rec(). */
+static
+void
+recv_check_incomplete_log_recs(
+/*===========================*/
+	byte*	ptr,	/*!< in: pointer to a complete log record */
+	ulint	len)	/*!< in: length of the log record */
+{
+	byte*	prefix_end;	/* one past the end of the truncated rec */
+	byte	type;
+	ulint	space;
+	ulint	page_no;
+	byte*	body;
+
+	for (prefix_end = ptr; prefix_end < ptr + len; prefix_end++) {
+		ut_a(0 == recv_parse_log_rec(ptr, prefix_end, &type, &space,
+					     &page_no, &body));
+	}
+}
+#endif /* UNIV_LOG_DEBUG */
+
+/*******************************************************//**
+Prints diagnostic info of corrupt log, including a hex dump that is
+clamped to the valid part [0, recv_sys->len) of the parsing buffer. */
+static
+void
+recv_report_corrupt_log(
+/*====================*/
+	byte*	ptr,	/*!< in: pointer to corrupt log record */
+	byte	type,	/*!< in: type of the record */
+	ulint	space,	/*!< in: space id, this may also be garbage */
+	ulint	page_no)/*!< in: page number, this may also be garbage */
+{
+	const ulint	offset = (ulint) (ptr - recv_sys->buf);
+	const ulint	prev = recv_previous_parsed_rec_offset;
+	/* Never dump bytes in front of the buffer or past its valid data */
+	const ulint	before = prev < 100 ? prev : 100;
+	const ulint	after = recv_sys->len - offset < 100
+		? recv_sys->len - offset : 100;
+
+	fprintf(stderr,
+		"InnoDB: ############### CORRUPT LOG RECORD FOUND\n"
+		"InnoDB: Log record type %lu, space id %lu, page number %lu\n"
+		"InnoDB: Log parsing proceeded successfully up to %llu\n"
+		"InnoDB: Previous log record type %lu, is multi %lu\n"
+		"InnoDB: Recv offset %lu, prev %lu\n",
+		(ulong) type, (ulong) space, (ulong) page_no,
+		recv_sys->recovered_lsn,
+		(ulong) recv_previous_parsed_rec_type,
+		(ulong) recv_previous_parsed_rec_is_multi,
+		(ulong) offset, (ulong) prev);
+
+	if (offset + 100 > prev && offset + 100 - prev < 200000) {
+		fprintf(stderr,
+			"InnoDB: Hex dump of corrupt log starting"
+			" %lu bytes before the start\n"
+			"InnoDB: of the previous log rec,\n"
+			"InnoDB: and ending %lu bytes after the start"
+			" of the corrupt rec:\n",
+			(ulong) before, (ulong) after);
+		ut_print_buf(stderr, recv_sys->buf + prev - before,
+			     offset + before + after - prev);
+		putc('\n', stderr);
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (!srv_force_recovery) {
+		fputs("InnoDB: Set innodb_force_recovery"
+		      " to ignore this error.\n", stderr);
+		ut_error;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	fputs("InnoDB: WARNING: the log file may have been corrupt and it\n"
+	      "InnoDB: is possible that the log scan did not proceed\n"
+	      "InnoDB: far enough in recovery! Please run CHECK TABLE\n"
+	      "InnoDB: on your InnoDB tables to check that they are ok!\n"
+	      "InnoDB: If mysqld crashes after this recovery, look at\n"
+	      "InnoDB: " REFMAN "forcing-recovery.html\n"
+	      "InnoDB: about forcing recovery.\n", stderr);
+
+	fflush(stderr);
+}
+
+/*******************************************************//**
+Parses log records from recv_sys->buf, starting at recovered_offset, and
+stores them to a hash table to wait merging to file pages.
+@return	currently always returns FALSE */
+static
+ibool
+recv_parse_log_recs(
+/*================*/
+	ibool	store_to_hash)	/*!< in: TRUE if the records should be stored
+				to the hash table; this is set to FALSE if just
+				debug checking is needed */
+{
+	byte*		ptr;
+	byte*		end_ptr;
+	ulint		single_rec;
+	ulint		len;
+	ulint		total_len;
+	ib_uint64_t	new_recovered_lsn;
+	ib_uint64_t	old_lsn;
+	byte		type;
+	ulint		space;
+	ulint		page_no;
+	byte*		body;
+	ulint		n_recs;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	ut_ad(recv_sys->parse_start_lsn != 0);
+loop:
+	ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+	end_ptr = recv_sys->buf + recv_sys->len;
+
+	if (ptr == end_ptr) {
+		/* All buffered data has been parsed */
+		return(FALSE);
+	}
+
+	single_rec = (ulint)*ptr & MLOG_SINGLE_REC_FLAG;
+
+	if (single_rec || *ptr == MLOG_DUMMY_RECORD) {
+		/* The mtr only modified a single page, or this is a file op */
+
+		old_lsn = recv_sys->recovered_lsn;
+
+		/* Try to parse a log record, fetching its type, space id,
+		page no, and a pointer to the body of the log record */
+
+		len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+					 &page_no, &body);
+
+		if (len == 0 || recv_sys->found_corrupt_log) {
+			if (recv_sys->found_corrupt_log) {
+
+				recv_report_corrupt_log(ptr,
+							type, space, page_no);
+			}
+
+			return(FALSE);
+		}
+
+		new_recovered_lsn = recv_calc_lsn_on_data_add(old_lsn, len);
+
+		if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that also the next log block should have been scanned
+			in */
+
+			return(FALSE);
+		}
+
+		recv_previous_parsed_rec_type = (ulint)type;
+		recv_previous_parsed_rec_offset = recv_sys->recovered_offset;
+		recv_previous_parsed_rec_is_multi = 0;
+
+		recv_sys->recovered_offset += len;
+		recv_sys->recovered_lsn = new_recovered_lsn;
+
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+			fprintf(stderr,
+				"InnoDB: Parsed a single log rec"
+				" type %lu len %lu space %lu page no %lu\n",
+				(ulong) type, (ulong) len, (ulong) space,
+				(ulong) page_no);
+		}
+#endif /* UNIV_DEBUG */
+
+		if (type == MLOG_DUMMY_RECORD) {
+			/* Do nothing */
+
+		} else if (!store_to_hash) {
+			/* Debug checking only: nothing is stored. With
+			UNIV_LOG_DEBUG, check that truncated prefixes of
+			the record are recognized as incomplete */
+#ifdef UNIV_LOG_DEBUG
+			recv_check_incomplete_log_recs(ptr, len);
+#endif/* UNIV_LOG_DEBUG */
+
+		} else if (type == MLOG_FILE_CREATE
+			   || type == MLOG_FILE_CREATE2
+			   || type == MLOG_FILE_RENAME
+			   || type == MLOG_FILE_DELETE) {
+			ut_a(space);
+#ifdef UNIV_HOTBACKUP
+			if (recv_replay_file_ops) {
+
+				/* In ibbackup --apply-log, replay an .ibd file
+				operation, if possible; note that
+				fil_path_to_mysql_datadir is set in ibbackup to
+				point to the datadir we should use there */
+
+				if (NULL == fil_op_log_parse_or_replay(
+					    body, end_ptr, type,
+					    space, page_no)) {
+					fprintf(stderr,
+						"InnoDB: Error: file op"
+						" log record of type %lu"
+						" space %lu not complete in\n"
+						"InnoDB: the replay phase."
+						" Path %s\n",
+						(ulint)type, space,
+						(char*)(body + 2));
+
+					ut_error;
+				}
+			}
+#endif
+			/* In normal mysqld crash recovery we do not try to
+			replay file operations */
+#ifdef UNIV_LOG_LSN_DEBUG
+		} else if (type == MLOG_LSN) {
+			/* Do not add these records to the hash table.
+			The page number and space id fields are misused
+			for something else. */
+#endif /* UNIV_LOG_LSN_DEBUG */
+		} else {
+			recv_add_to_hash_table(type, space, page_no, body,
+					       ptr + len, old_lsn,
+					       recv_sys->recovered_lsn);
+		}
+	} else {
+		/* Check that all the records associated with the single mtr
+		are included within the buffer */
+
+		total_len = 0;
+		n_recs = 0;
+
+		for (;;) {
+			len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+						 &page_no, &body);
+			if (len == 0 || recv_sys->found_corrupt_log) {
+				/* Incomplete record or corrupt log */
+				if (recv_sys->found_corrupt_log) {
+
+					recv_report_corrupt_log(
+						ptr, type, space, page_no);
+				}
+
+				return(FALSE);
+			}
+
+			recv_previous_parsed_rec_type = (ulint)type;
+			recv_previous_parsed_rec_offset
+				= recv_sys->recovered_offset + total_len;
+			recv_previous_parsed_rec_is_multi = 1;
+
+#ifdef UNIV_LOG_DEBUG
+			if ((!store_to_hash) && (type != MLOG_MULTI_REC_END)) {
+				recv_check_incomplete_log_recs(ptr, len);
+			}
+#endif /* UNIV_LOG_DEBUG */
+
+#ifdef UNIV_DEBUG
+			if (log_debug_writes) {
+				fprintf(stderr,
+					"InnoDB: Parsed a multi log rec"
+					" type %lu len %lu"
+					" space %lu page no %lu\n",
+					(ulong) type, (ulong) len,
+					(ulong) space, (ulong) page_no);
+			}
+#endif /* UNIV_DEBUG */
+
+			total_len += len;
+			n_recs++;
+
+			ptr += len;
+
+			if (type == MLOG_MULTI_REC_END) {
+
+				/* Found the end mark for the records */
+
+				break;
+			}
+		}
+
+		new_recovered_lsn = recv_calc_lsn_on_data_add(
+			recv_sys->recovered_lsn, total_len);
+
+		if (new_recovered_lsn > recv_sys->scanned_lsn) {
+			/* The log record filled a log block, and we require
+			that also the next log block should have been scanned
+			in */
+
+			return(FALSE);
+		}
+
+		/* Add all the records to the hash table */
+
+		ptr = recv_sys->buf + recv_sys->recovered_offset;
+
+		for (;;) {
+			old_lsn = recv_sys->recovered_lsn;
+			len = recv_parse_log_rec(ptr, end_ptr, &type, &space,
+						 &page_no, &body);
+			if (recv_sys->found_corrupt_log) {
+
+				recv_report_corrupt_log(ptr,
+							type, space, page_no);
+			}
+			/* The first pass already parsed this record */
+			ut_a(len != 0);
+			ut_a(0 == ((ulint)*ptr & MLOG_SINGLE_REC_FLAG));
+
+			recv_sys->recovered_offset += len;
+			recv_sys->recovered_lsn
+				= recv_calc_lsn_on_data_add(old_lsn, len);
+			if (type == MLOG_MULTI_REC_END) {
+
+				/* Found the end mark for the records */
+
+				break;
+			}
+
+			if (store_to_hash
+#ifdef UNIV_LOG_LSN_DEBUG
+			    && type != MLOG_LSN
+#endif /* UNIV_LOG_LSN_DEBUG */
+			    ) {
+				recv_add_to_hash_table(type, space, page_no,
+						       body, ptr + len,
+						       old_lsn,
+						       new_recovered_lsn);
+			}
+
+			ptr += len;
+		}
+	}
+	/* Parse the next record or mtr, if any data remains */
+	goto loop;
+}
+
+/*******************************************************//**
+Appends the not yet buffered payload of a log block to recv_sys->buf,
+provided that a parsing start point (recv_sys->parse_start_lsn) is known.
+@return	TRUE if more data added */
+static
+ibool
+recv_sys_add_to_parsing_buf(
+/*========================*/
+	const byte*	log_block,	/*!< in: log block */
+	ib_uint64_t	scanned_lsn)	/*!< in: lsn of how far we were able
+					to find data in this log block */
+{
+	ib_uint64_t	from_lsn;	/* lsn of the first byte to copy */
+	ulint		block_len;	/* log_block_get_data_len() value */
+	ulint		n_new;		/* bytes from from_lsn to scanned_lsn */
+	ulint		copy_from;
+	ulint		copy_to;
+
+	ut_ad(scanned_lsn >= recv_sys->scanned_lsn);
+
+	if (!recv_sys->parse_start_lsn) {
+		/* Cannot start parsing yet because no start point for
+		it found */
+		return(FALSE);
+	}
+
+	block_len = log_block_get_data_len(log_block);
+
+	/* Only data after both the parsing start point and the already
+	scanned lsn is new */
+	from_lsn = recv_sys->scanned_lsn;
+	if (recv_sys->parse_start_lsn > from_lsn) {
+		from_lsn = recv_sys->parse_start_lsn;
+	}
+
+	if (from_lsn >= scanned_lsn) {
+		return(FALSE);
+	}
+
+	n_new = (ulint) (scanned_lsn - from_lsn);
+
+	if (n_new == 0) {
+		return(FALSE);
+	}
+
+	ut_ad(block_len >= n_new);
+
+	/* Clamp the copied range to the payload area of the block, i.e.
+	between the block header and the block trailer */
+	copy_from = block_len - n_new;
+
+	if (copy_from < LOG_BLOCK_HDR_SIZE) {
+		copy_from = LOG_BLOCK_HDR_SIZE;
+	}
+
+	copy_to = block_len;
+
+	if (copy_to > OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE) {
+		copy_to = OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE;
+	}
+
+	ut_ad(copy_from <= copy_to);
+	if (copy_from < copy_to) {
+		const ulint	n_copy = copy_to - copy_from;
+
+		ut_memcpy(recv_sys->buf + recv_sys->len,
+			  log_block + copy_from, n_copy);
+		recv_sys->len += n_copy;
+
+		ut_a(recv_sys->len <= RECV_PARSING_BUF_SIZE);
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************//**
+Moves the unparsed tail of the parsing buffer to the buffer start. */
+static
+void
+recv_sys_justify_left_parsing_buf(void)
+/*===================================*/
+{
+	const ulint	n_parsed = recv_sys->recovered_offset;
+	const ulint	n_left = recv_sys->len - n_parsed;
+
+	ut_memmove(recv_sys->buf, recv_sys->buf + n_parsed, n_left);
+	recv_sys->len = n_left;
+	recv_sys->recovered_offset = 0;
+}
+
+/*******************************************************//**
+Scans log from a buffer and stores new log data to the parsing buffer.
+Parses and hashes the log records if new data found.  Unless
+UNIV_HOTBACKUP is defined, this function will apply log records
+automatically when the hash table becomes full.
+@return TRUE if limit_lsn has been reached, or not able to scan any
+more in this log group */
+UNIV_INTERN
+ibool
+recv_scan_log_recs(
+/*===============*/
+	ulint		available_memory,/*!< in: we let the hash table of recs
+					to grow to this size, at the maximum */
+	ibool		store_to_hash,	/*!< in: TRUE if the records should be
+					stored to the hash table; this is set
+					to FALSE if just debug checking is
+					needed */
+	const byte*	buf,		/*!< in: buffer containing a log
+					segment or garbage */
+	ulint		len,		/*!< in: buffer length */
+	ib_uint64_t	start_lsn,	/*!< in: buffer start lsn */
+	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+					groups contain contiguous log data up
+					to this lsn */
+	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+					this lsn */
+{
+	const byte*	log_block;
+	ulint		no;
+	ib_uint64_t	scanned_lsn;
+	ibool		finished;
+	ulint		data_len;
+	ibool		more_data;
+
+	ut_ad(start_lsn % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_ad(len % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_ad(len >= OS_FILE_LOG_BLOCK_SIZE);
+	ut_a(store_to_hash <= TRUE);
+
+	finished = FALSE;
+
+	log_block = buf;
+	scanned_lsn = start_lsn;
+	more_data = FALSE;
+
+	do {
+		no = log_block_get_hdr_no(log_block);
+		/* Accept the block only if its header number is the one
+		computed from scanned_lsn and its checksum is ok (or of
+		the old format). Otherwise stop scanning this buffer;
+		a bad checksum under a matching header number is
+		reported, all other mismatches are taken silently as
+		garbage or an incompletely written block. */
+		if (no != log_block_convert_lsn_to_no(scanned_lsn)
+		    || !log_block_checksum_is_ok_or_old_format(log_block)) {
+
+			if (no == log_block_convert_lsn_to_no(scanned_lsn)
+			    && !log_block_checksum_is_ok_or_old_format(
+				    log_block)) {
+				fprintf(stderr,
+					"InnoDB: Log block no %lu at"
+					" lsn %llu has\n"
+					"InnoDB: ok header, but checksum field"
+					" contains %lu, should be %lu\n",
+					(ulong) no,
+					scanned_lsn,
+					(ulong) log_block_get_checksum(
+						log_block),
+					(ulong) log_block_calc_checksum(
+						log_block));
+			}
+
+			/* Garbage or an incompletely written log block */
+
+			finished = TRUE;
+
+			break;
+		}
+
+		if (log_block_get_flush_bit(log_block)) {
+			/* This block was a start of a log flush operation:
+			we know that the previous flush operation must have
+			been completed for all log groups before this block
+			can have been flushed to any of the groups. Therefore,
+			we know that log data is contiguous up to scanned_lsn
+			in all non-corrupt log groups. */
+
+			if (scanned_lsn > *contiguous_lsn) {
+				*contiguous_lsn = scanned_lsn;
+			}
+		}
+
+		data_len = log_block_get_data_len(log_block);
+
+		if ((store_to_hash || (data_len == OS_FILE_LOG_BLOCK_SIZE))
+		    && scanned_lsn + data_len > recv_sys->scanned_lsn
+		    && (recv_sys->scanned_checkpoint_no > 0)
+		    && (log_block_get_checkpoint_no(log_block)
+			< recv_sys->scanned_checkpoint_no)
+		    && (recv_sys->scanned_checkpoint_no
+			- log_block_get_checkpoint_no(log_block)
+			> 0x80000000UL)) {
+
+			/* Garbage from a log buffer flush which was made
+			before the most recent database recovery */
+
+			finished = TRUE;
+#ifdef UNIV_LOG_DEBUG
+			/* This is not really an error, but currently
+			we stop here in the debug version: */
+
+			ut_error;
+#endif
+			break;
+		}
+
+		if (!recv_sys->parse_start_lsn
+		    && (log_block_get_first_rec_group(log_block) > 0)) {
+
+			/* We found a point from which to start the parsing
+			of log records */
+
+			recv_sys->parse_start_lsn = scanned_lsn
+				+ log_block_get_first_rec_group(log_block);
+			recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+			recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+		}
+
+		scanned_lsn += data_len;
+
+		if (scanned_lsn > recv_sys->scanned_lsn) {
+
+			/* We have found more entries. If this scan is
+			of startup type, we must initiate crash recovery
+			environment before parsing these log records. */
+
+#ifndef UNIV_HOTBACKUP
+			if (recv_log_scan_is_startup_type
+			    && !recv_needed_recovery) {
+
+				fprintf(stderr,
+					"InnoDB: Log scan progressed"
+					" past the checkpoint lsn %llu\n",
+					recv_sys->scanned_lsn);
+				recv_init_crash_recovery();
+			}
+#endif /* !UNIV_HOTBACKUP */
+
+			/* We were able to find more log data: add it to the
+			parsing buffer if parse_start_lsn is already
+			non-zero */
+
+			if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE
+			    >= RECV_PARSING_BUF_SIZE) {
+				fprintf(stderr,
+					"InnoDB: Error: log parsing"
+					" buffer overflow."
+					" Recovery may have failed!\n");
+
+				recv_sys->found_corrupt_log = TRUE;
+
+#ifndef UNIV_HOTBACKUP
+				if (!srv_force_recovery) {
+					fputs("InnoDB: Set"
+					      " innodb_force_recovery"
+					      " to ignore this error.\n",
+					      stderr);
+					ut_error;
+				}
+#endif /* !UNIV_HOTBACKUP */
+
+			} else if (!recv_sys->found_corrupt_log) {
+				more_data = recv_sys_add_to_parsing_buf(
+					log_block, scanned_lsn);
+			}
+			/* Remember how far this scan has got */
+			recv_sys->scanned_lsn = scanned_lsn;
+			recv_sys->scanned_checkpoint_no
+				= log_block_get_checkpoint_no(log_block);
+		}
+
+		if (data_len < OS_FILE_LOG_BLOCK_SIZE) {
+			/* Log data for this group ends here */
+
+			finished = TRUE;
+			break;
+		} else {
+			log_block += OS_FILE_LOG_BLOCK_SIZE;
+		}
+	} while (log_block < buf + len && !finished);
+
+	*group_scanned_lsn = scanned_lsn;
+
+	if (recv_needed_recovery
+	    || (recv_is_from_backup && !recv_is_making_a_backup)) {
+		recv_scan_print_counter++;
+
+		if (finished || (recv_scan_print_counter % 80 == 0)) {
+
+			fprintf(stderr,
+				"InnoDB: Doing recovery: scanned up to"
+				" log sequence number %llu\n",
+				*group_scanned_lsn);
+		}
+	}
+
+	if (more_data && !recv_sys->found_corrupt_log) {
+		/* Try to parse more log records */
+
+		recv_parse_log_recs(store_to_hash);
+
+#ifndef UNIV_HOTBACKUP
+		if (store_to_hash && mem_heap_get_size(recv_sys->heap)
+		    > available_memory) {
+
+			/* Hash table of log records has grown too big:
+			empty it; FALSE means no ibuf operations
+			allowed, as we cannot add new records to the
+			log yet: they would be produced by ibuf
+			operations */
+
+			recv_apply_hashed_log_recs(FALSE);
+		}
+#endif /* !UNIV_HOTBACKUP */
+
+		if (recv_sys->recovered_offset > RECV_PARSING_BUF_SIZE / 4) {
+			/* Move parsing buffer data to the buffer start */
+
+			recv_sys_justify_left_parsing_buf();
+		}
+	}
+
+	return(finished);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*******************************************************//**
+Reads a log group in segments of RECV_SCAN_SIZE bytes, starting from
+*contiguous_lsn, and scans them until recv_scan_log_recs() is finished. */
+static
+void
+recv_group_scan_log_recs(
+/*=====================*/
+	log_group_t*	group,		/*!< in: log group */
+	ib_uint64_t*	contiguous_lsn,	/*!< in/out: it is known that all log
+					groups contain contiguous log data up
+					to this lsn */
+	ib_uint64_t*	group_scanned_lsn)/*!< out: scanning succeeded up to
+					this lsn */
+{
+	ib_uint64_t	seg_start = *contiguous_lsn;
+	ib_uint64_t	seg_end;
+	ulint		max_hash_size;
+	ibool		done;
+
+	do {
+		seg_end = seg_start + RECV_SCAN_SIZE;
+
+		log_group_read_log_seg(LOG_RECOVER, log_sys->buf,
+				       group, seg_start, seg_end);
+
+		/* Upper limit for the hash table of log records */
+		max_hash_size = (buf_pool->curr_size
+				 - recv_n_pool_free_frames) * UNIV_PAGE_SIZE;
+
+		done = recv_scan_log_recs(max_hash_size, TRUE, log_sys->buf,
+					  RECV_SCAN_SIZE, seg_start,
+					  contiguous_lsn, group_scanned_lsn);
+		seg_start = seg_end;
+	} while (!done);
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"InnoDB: Scanned group %lu up to"
+			" log sequence number %llu\n",
+			(ulong) group->id,
+			*group_scanned_lsn);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/*******************************************************//**
+Switches startup into crash recovery mode: announces it, loads the .ibd
+tablespaces and, unless log redo is skipped, restores half-written pages.
+Can be called iff recv_needed_recovery == FALSE. */
+static
+void
+recv_init_crash_recovery(void)
+/*==========================*/
+{
+	ut_a(!recv_needed_recovery);
+	ut_a(!srv_buffer_pool_shm_is_reused);
+
+	/* From here on this startup counts as a crash recovery */
+	recv_needed_recovery = TRUE;
+
+	ut_print_timestamp(stderr);
+
+	fputs("  InnoDB: Database was not shut down normally!\n"
+	      "InnoDB: Starting crash recovery.\n", stderr);
+
+	fputs("InnoDB: Reading tablespace information"
+	      " from the .ibd files...\n", stderr);
+
+	fil_load_single_table_tablespaces();
+
+	if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
+		/* At SRV_FORCE_NO_LOG_REDO or higher the doublewrite
+		buffer is not consulted */
+		return;
+	}
+
+	/* If we are using the doublewrite method, check if there are
+	half-written pages in data files, and restore them from the
+	doublewrite buffer if possible */
+
+	fputs("InnoDB: Restoring possible half-written data pages from"
+	      " the doublewrite\n"
+	      "InnoDB: buffer...\n", stderr);
+
+	trx_sys_doublewrite_init_or_restore_pages(TRUE);
+}
+
+/********************************************************//**
+Recovers from a checkpoint. When this function returns, the database is able
+to start processing of new user transactions, but the function
+recv_recovery_from_checkpoint_finish should be called later to complete
+the recovery and free the resources used in it.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_checkpoint_start_func(
+/*=====================================*/
+#ifdef UNIV_LOG_ARCHIVE
+	ulint		type,		/*!< in: LOG_CHECKPOINT or
+					LOG_ARCHIVE */
+	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn
+					if possible */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn from
+					data files */
+	ib_uint64_t	max_flushed_lsn)/*!< in: max flushed lsn from
+					data files */
+{
+	log_group_t*	group;
+	log_group_t*	max_cp_group;
+	log_group_t*	up_to_date_group;
+	ulint		max_cp_field;
+	ib_uint64_t	checkpoint_lsn;
+	ib_uint64_t	checkpoint_no;
+	ib_uint64_t	old_scanned_lsn;
+	ib_uint64_t	group_scanned_lsn;
+	ib_uint64_t	contiguous_lsn;
+	ib_uint64_t	archived_lsn;
+	byte*		buf;
+	byte*		log_hdr_buf;
+	byte		log_hdr_buf_base[LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE];
+	ulint		err;
+
+	log_hdr_buf = ut_align(log_hdr_buf_base, OS_FILE_LOG_BLOCK_SIZE);
+
+#ifdef UNIV_LOG_ARCHIVE
+	ut_ad(type != LOG_CHECKPOINT || limit_lsn == IB_ULONGLONG_MAX);
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT	(type == LOG_CHECKPOINT)
+/** Recover up to this log sequence number */
+# define LIMIT_LSN		limit_lsn
+#else /* UNIV_LOG_ARCHIVE */
+/** TRUE when recovering from a checkpoint */
+# define TYPE_CHECKPOINT	1
+/** Recover up to this log sequence number */
+# define LIMIT_LSN		IB_ULONGLONG_MAX
+#endif /* UNIV_LOG_ARCHIVE */
+
+	if (TYPE_CHECKPOINT) {
+		recv_sys_create();
+		recv_sys_init(buf_pool_get_curr_size());
+	}
+
+	if (srv_force_recovery >= SRV_FORCE_NO_LOG_REDO) {
+		fprintf(stderr,
+			"InnoDB: The user has set SRV_FORCE_NO_LOG_REDO on\n");
+		fprintf(stderr,
+			"InnoDB: Skipping log redo\n");
+
+		return(DB_SUCCESS);
+	}
+
+	recv_recovery_on = TRUE;
+
+	recv_sys->limit_lsn = LIMIT_LSN;
+
+	mutex_enter(&(log_sys->mutex));
+
+	/* Look for the latest checkpoint from any of the log groups */
+
+	err = recv_find_max_checkpoint(&max_cp_group, &max_cp_field);
+
+	if (err != DB_SUCCESS) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		return(err);
+	}
+
+	log_group_read_checkpoint_info(max_cp_group, max_cp_field);
+
+	buf = log_sys->checkpoint_buf;
+
+	checkpoint_lsn = mach_read_ull(buf + LOG_CHECKPOINT_LSN);
+	checkpoint_no = mach_read_ull(buf + LOG_CHECKPOINT_NO);
+	archived_lsn = mach_read_ull(buf + LOG_CHECKPOINT_ARCHIVED_LSN);
+
+	/* Read the first log file header to print a note if this is
+	a recovery from a restored InnoDB Hot Backup */
+
+	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, max_cp_group->space_id, 0,
+	       0, 0, LOG_FILE_HDR_SIZE,
+	       log_hdr_buf, max_cp_group);
+
+	if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+			   (byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
+		/* This log file was created by ibbackup --restore: print
+		a note to the user about it */
+
+		fprintf(stderr,
+			"InnoDB: The log file was created by"
+			" ibbackup --apply-log at\n"
+			"InnoDB: %s\n",
+			log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP);
+		fprintf(stderr,
+			"InnoDB: NOTE: the following crash recovery"
+			" is part of a normal restore.\n");
+
+		/* Wipe over the label now */
+
+		memset(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
+		       ' ', 4);
+		/* Write to the log file to wipe over the label */
+		fil_io(OS_FILE_WRITE | OS_FILE_LOG, TRUE,
+		       max_cp_group->space_id, 0,
+		       0, 0, OS_FILE_LOG_BLOCK_SIZE,
+		       log_hdr_buf, max_cp_group);
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	while (group) {
+		log_checkpoint_get_nth_group_info(buf, group->id,
+						  &(group->archived_file_no),
+						  &(group->archived_offset));
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	if (TYPE_CHECKPOINT) {
+		/* Start reading the log groups from the checkpoint lsn up. The
+		variable contiguous_lsn contains an lsn up to which the log is
+		known to be contiguously written to all log groups. */
+
+		recv_sys->parse_start_lsn = checkpoint_lsn;
+		recv_sys->scanned_lsn = checkpoint_lsn;
+		recv_sys->scanned_checkpoint_no = 0;
+		recv_sys->recovered_lsn = checkpoint_lsn;
+
+		srv_start_lsn = checkpoint_lsn;
+	}
+
+	contiguous_lsn = ut_uint64_align_down(recv_sys->scanned_lsn,
+					      OS_FILE_LOG_BLOCK_SIZE);
+	if (TYPE_CHECKPOINT) {
+		up_to_date_group = max_cp_group;
+#ifdef UNIV_LOG_ARCHIVE
+	} else {
+		ulint	capacity;
+
+		/* Try to recover the remaining part from logs: first from
+		the logs of the archived group */
+
+		group = recv_sys->archive_group;
+		capacity = log_group_get_capacity(group);
+
+		if (recv_sys->scanned_lsn > checkpoint_lsn + capacity
+		    || checkpoint_lsn > recv_sys->scanned_lsn + capacity) {
+
+			mutex_exit(&(log_sys->mutex));
+
+			/* The group does not contain enough log: probably
+			an archived log file was missing or corrupt */
+
+			return(DB_ERROR);
+		}
+
+		recv_group_scan_log_recs(group, &contiguous_lsn,
+					 &group_scanned_lsn);
+		if (recv_sys->scanned_lsn < checkpoint_lsn) {
+
+			mutex_exit(&(log_sys->mutex));
+
+			/* The group did not contain enough log: an archived
+			log file was missing or invalid, or the log group
+			was corrupt */
+
+			return(DB_ERROR);
+		}
+
+		group->scanned_lsn = group_scanned_lsn;
+		up_to_date_group = group;
+#endif /* UNIV_LOG_ARCHIVE */
+	}
+
+	ut_ad(RECV_SCAN_SIZE <= log_sys->buf_size);
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+#ifdef UNIV_LOG_ARCHIVE
+	if ((type == LOG_ARCHIVE) && (group == recv_sys->archive_group)) {
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/* Set the flag to publish that we are doing startup scan. */
+	recv_log_scan_is_startup_type = TYPE_CHECKPOINT;
+	while (group) {
+		old_scanned_lsn = recv_sys->scanned_lsn;
+
+		recv_group_scan_log_recs(group, &contiguous_lsn,
+					 &group_scanned_lsn);
+		group->scanned_lsn = group_scanned_lsn;
+
+		if (old_scanned_lsn < group_scanned_lsn) {
+			/* We found a more up-to-date group */
+
+			up_to_date_group = group;
+		}
+
+#ifdef UNIV_LOG_ARCHIVE
+		if ((type == LOG_ARCHIVE)
+		    && (group == recv_sys->archive_group)) {
+			group = UT_LIST_GET_NEXT(log_groups, group);
+		}
+#endif /* UNIV_LOG_ARCHIVE */
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	/* Done with startup scan. Clear the flag. */
+	recv_log_scan_is_startup_type = FALSE;
+	if (TYPE_CHECKPOINT) {
+		/* NOTE: we always do a 'recovery' at startup, but only if
+		there is something wrong we will print a message to the
+		user about recovery: */
+
+		if (checkpoint_lsn != max_flushed_lsn
+		    || checkpoint_lsn != min_flushed_lsn) {
+
+			if (checkpoint_lsn < max_flushed_lsn) {
+				fprintf(stderr,
+					"InnoDB: #########################"
+					"#################################\n"
+					"InnoDB:                          "
+					"WARNING!\n"
+					"InnoDB: The log sequence number"
+					" in ibdata files is higher\n"
+					"InnoDB: than the log sequence number"
+					" in the ib_logfiles! Are you sure\n"
+					"InnoDB: you are using the right"
+					" ib_logfiles to start up"
+					" the database?\n"
+					"InnoDB: Log sequence number in"
+					" ib_logfiles is %llu, log\n"
+					"InnoDB: sequence numbers stamped"
+					" to ibdata file headers are between\n"
+					"InnoDB: %llu and %llu.\n"
+					"InnoDB: #########################"
+					"#################################\n",
+					checkpoint_lsn,
+					min_flushed_lsn,
+					max_flushed_lsn);
+			}
+
+			if (!recv_needed_recovery) {
+				fprintf(stderr,
+					"InnoDB: The log sequence number"
+					" in ibdata files does not match\n"
+					"InnoDB: the log sequence number"
+					" in the ib_logfiles!\n");
+				recv_init_crash_recovery();
+			}
+		}
+
+		if (!recv_needed_recovery) {
+			/* Init the doublewrite buffer memory structure */
+			trx_sys_doublewrite_init_or_restore_pages(FALSE);
+		}
+	}
+
+	/* We currently have only one log group */
+	if (group_scanned_lsn < checkpoint_lsn) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: ERROR: We were only able to scan the log"
+			" up to\n"
+			"InnoDB: %llu, but a checkpoint was at %llu.\n"
+			"InnoDB: It is possible that"
+			" the database is now corrupt!\n",
+			group_scanned_lsn,
+			checkpoint_lsn);
+	}
+
+	if (group_scanned_lsn < recv_max_page_lsn) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: ERROR: We were only able to scan the log"
+			" up to %llu\n"
+			"InnoDB: but a database page a had an lsn %llu."
+			" It is possible that the\n"
+			"InnoDB: database is now corrupt!\n",
+			group_scanned_lsn,
+			recv_max_page_lsn);
+	}
+
+	if (recv_sys->recovered_lsn < checkpoint_lsn) {
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (recv_sys->recovered_lsn >= LIMIT_LSN) {
+
+			return(DB_SUCCESS);
+		}
+
+		ut_error;
+
+		return(DB_ERROR);
+	}
+
+	/* Synchronize the uncorrupted log groups to the most up-to-date log
+	group; we also copy checkpoint info to groups */
+
+	log_sys->next_checkpoint_lsn = checkpoint_lsn;
+	log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+	log_sys->archived_lsn = archived_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+	recv_synchronize_groups(up_to_date_group);
+
+	if (!recv_needed_recovery) {
+		ut_a(checkpoint_lsn == recv_sys->recovered_lsn);
+	} else {
+		srv_start_lsn = recv_sys->recovered_lsn;
+	}
+
+	log_sys->lsn = recv_sys->recovered_lsn;
+
+	ut_memcpy(log_sys->buf, recv_sys->last_block, OS_FILE_LOG_BLOCK_SIZE);
+
+	log_sys->buf_free = (ulint) log_sys->lsn % OS_FILE_LOG_BLOCK_SIZE;
+	log_sys->buf_next_to_write = log_sys->buf_free;
+	log_sys->written_to_some_lsn = log_sys->lsn;
+	log_sys->written_to_all_lsn = log_sys->lsn;
+
+	log_sys->last_checkpoint_lsn = checkpoint_lsn;
+
+	log_sys->next_checkpoint_no = checkpoint_no + 1;
+
+#ifdef UNIV_LOG_ARCHIVE
+	if (archived_lsn == IB_ULONGLONG_MAX) {
+
+		log_sys->archiving_state = LOG_ARCH_OFF;
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	mutex_enter(&(recv_sys->mutex));
+
+	recv_sys->apply_log_recs = TRUE;
+
+	mutex_exit(&(recv_sys->mutex));
+
+	mutex_exit(&(log_sys->mutex));
+
+	recv_lsn_checks_on = TRUE;
+
+	/* The database is now ready to start almost normal processing of user
+	transactions: transaction rollbacks and the application of the log
+	records in the hash table can be run in background. */
+
+	return(DB_SUCCESS);
+
+#undef TYPE_CHECKPOINT
+#undef LIMIT_LSN
+}
+
+/********************************************************//**
+Completes recovery from a checkpoint. */
+UNIV_INTERN
+void
+recv_recovery_from_checkpoint_finish(void)
+/*======================================*/
+{
+	/* Apply the hashed log records to the respective file pages,
+	unless srv_force_recovery disables the redo log application */
+	if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) {
+
+		recv_apply_hashed_log_recs(TRUE);
+	}
+
+#ifdef UNIV_DEBUG
+	if (log_debug_writes) {
+		fprintf(stderr,
+			"InnoDB: Log records applied to the database\n");
+	}
+#endif /* UNIV_DEBUG */
+	/* Print the recovery statistics if they are enabled and recovery ran */
+	if (recv_needed_recovery && srv_recovery_stats) {
+		ulint	i;
+
+		fprintf(stderr,
+			"InnoDB: Applying log records was done. Its statistics are followings.\n");
+
+		fprintf(stderr,
+			"============================================================\n"
+			"-------------------\n"
+			"RECOVERY STATISTICS\n"
+			"-------------------\n");
+		fprintf(stderr,
+			"Recovery time: %g sec. (%lu turns)\n",
+			difftime(time(NULL), recv_sys->stats_recv_start_time),
+			recv_sys->stats_recv_turns);
+
+		fprintf(stderr,
+			"\n"
+			"Data page IO statistics\n"
+			"  Requested pages: %lu\n"
+			"  Read pages:      %lu\n"
+			"  Written pages:   %lu\n"
+			"  (Dirty blocks):  %lu\n",
+			recv_sys->stats_read_requested_pages,
+			recv_sys->stats_read_io_pages,
+			recv_sys->stats_write_io_pages,
+			UT_LIST_GET_LEN(buf_pool->flush_list));
+		/* Table with one row per page count; columns past their limit print 0 */
+		fprintf(stderr,
+			"  Grouping IO [times]:\n"
+			"\tnumber of pages,\n"
+			"\t\tread request neighbors (in %d pages chunk),\n"
+			"\t\t\tcombined read IO,\n"
+			"\t\t\t\tcombined write IO\n",
+			RECV_READ_AHEAD_AREA);
+		for (i = 0; i < ut_max(RECV_READ_AHEAD_AREA,
+					OS_AIO_MERGE_N_CONSECUTIVE); i++) {
+			fprintf(stderr,
+				"\t%3lu,\t%lu,\t%lu,\t%lu\n", i + 1,
+				(i < RECV_READ_AHEAD_AREA) ?
+					recv_sys->stats_read_in_area[i] : 0,
+				(i < OS_AIO_MERGE_N_CONSECUTIVE) ?
+					recv_sys->stats_read_io_consecutive[i] : 0,
+				(i < OS_AIO_MERGE_N_CONSECUTIVE) ?
+					recv_sys->stats_write_io_consecutive[i] : 0);
+		}
+
+		fprintf(stderr,
+			"\n"
+			"Recovery process statistics\n"
+			"  Checked pages by doublewrite buffer: %lu\n"
+			"  Overwritten pages from doublewrite:  %lu\n"
+			"  Recovered pages by io_thread:        %lu\n"
+			"  Recovered pages by main thread:      %lu\n"
+			"  Parsed log records to apply:         %lu\n"
+			"            Sum of the length:         %lu\n"
+			"  Applied log records:                 %lu\n"
+			"            Sum of the length:         %lu\n"
+			"  Pages which are already new enough:  %lu (It may not be accurate, if turns > 1)\n"
+			"  Oldest page's LSN:                   %llu\n"
+			"  Newest page's LSN:                   %llu\n",
+			recv_sys->stats_doublewrite_check_pages,
+			recv_sys->stats_doublewrite_overwrite_pages,
+			recv_sys->stats_recover_pages_with_read,
+			recv_sys->stats_recover_pages_without_read,
+			recv_sys->stats_log_recs,
+			recv_sys->stats_log_len_sum,
+			recv_sys->stats_applied_log_recs,
+			recv_sys->stats_applied_log_len_sum,
+			recv_sys->stats_pages_already_new,
+			recv_sys->stats_oldest_modified_lsn,
+			recv_sys->stats_newest_modified_lsn);
+
+		fprintf(stderr,
+			"============================================================\n");
+	}
+	/* After a real recovery, print the MySQL master log and binlog positions */
+	if (recv_needed_recovery) {
+		trx_sys_print_mysql_master_log_pos();
+		trx_sys_print_mysql_binlog_offset();
+	}
+	/* Warn if corruption was noticed in the log during the recovery */
+	if (recv_sys->found_corrupt_log) {
+
+		fprintf(stderr,
+			"InnoDB: WARNING: the log file may have been"
+			" corrupt and it\n"
+			"InnoDB: is possible that the log scan or parsing"
+			" did not proceed\n"
+			"InnoDB: far enough in recovery. Please run"
+			" CHECK TABLE\n"
+			"InnoDB: on your InnoDB tables to check that"
+			" they are ok!\n"
+			"InnoDB: It may be safest to recover your"
+			" InnoDB database from\n"
+			"InnoDB: a backup!\n");
+	}
+
+	/* Free the resources of the recovery system */
+
+	recv_recovery_on = FALSE;
+
+#ifndef UNIV_LOG_DEBUG
+	recv_sys_debug_free();
+#endif
+	/* Roll back any recovered data dictionary transactions, so
+	that the data dictionary tables will be free of any locks.
+	The data dictionary latch should guarantee that there is at
+	most one data dictionary transaction active at a time. */
+	trx_rollback_or_clean_recovered(FALSE);
+}
+
+/********************************************************//**
+Initiates the rollback of active transactions: drops leftover temporary
+indexes and tables, then starts a background rollback thread. */
+UNIV_INTERN
+void
+recv_recovery_rollback_active(void)
+/*===============================*/
+{
+#ifdef UNIV_SYNC_DEBUG
+	/* Wait for a while so that created threads have time to suspend
+	themselves before we switch the latching order checks on */
+	os_thread_sleep(1000000);
+
+	/* Switch latching order checks on in sync0sync.c */
+	sync_order_checks_on = TRUE;
+#endif
+	/* Drop partially created indexes. */
+	row_merge_drop_temp_indexes();
+	/* Drop temporary tables. */
+	row_mysql_drop_temp_tables();
+
+	if (srv_force_recovery < SRV_FORCE_NO_TRX_UNDO) {
+		/* Rollback the uncommitted transactions which have no user
+		session. No data is handed to the thread, so pass NULL: the
+		address of a local variable would dangle as soon as this
+		function returns while the new thread may still be running. */
+		os_thread_create(trx_rollback_or_clean_all_recovered,
+				 NULL, NULL);
+	}
+}
+
+/******************************************************//**
+Resets the logs. The contents of log files will be lost! */
+UNIV_INTERN
+void
+recv_reset_logs(
+/*============*/
+	ib_uint64_t	lsn,		/*!< in: reset to this lsn
+					rounded up to be divisible by
+					OS_FILE_LOG_BLOCK_SIZE, after
+					which we add
+					LOG_BLOCK_HDR_SIZE */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint		arch_log_no,	/*!< in: next archived log file number */
+#endif /* UNIV_LOG_ARCHIVE */
+	ibool		new_logs_created)/*!< in: TRUE if resetting logs
+					is done at the log creation;
+					FALSE if it is done after
+					archive recovery */
+{
+	log_group_t*	group;
+
+	ut_ad(mutex_own(&(log_sys->mutex)));
+	/* Start the log at the lsn rounded up to a log block boundary */
+	log_sys->lsn = ut_uint64_align_up(lsn, OS_FILE_LOG_BLOCK_SIZE);
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+	/* Point every log group at the new start lsn */
+	while (group) {
+		group->lsn = log_sys->lsn;
+		group->lsn_offset = LOG_FILE_HDR_SIZE;
+#ifdef UNIV_LOG_ARCHIVE
+		group->archived_file_no = arch_log_no;
+		group->archived_offset = 0;
+#endif /* UNIV_LOG_ARCHIVE */
+
+		if (!new_logs_created) {
+			recv_truncate_group(group, group->lsn, group->lsn,
+					    group->lsn, group->lsn);
+		}
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	log_sys->buf_next_to_write = 0;
+	log_sys->written_to_some_lsn = log_sys->lsn;
+	log_sys->written_to_all_lsn = log_sys->lsn;
+
+	log_sys->next_checkpoint_no = 0;
+	log_sys->last_checkpoint_lsn = 0;
+
+#ifdef UNIV_LOG_ARCHIVE
+	log_sys->archived_lsn = log_sys->lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+	/* Format the first block in the log buffer; records start after its header */
+	log_block_init(log_sys->buf, log_sys->lsn);
+	log_block_set_first_rec_group(log_sys->buf, LOG_BLOCK_HDR_SIZE);
+
+	log_sys->buf_free = LOG_BLOCK_HDR_SIZE;
+	log_sys->lsn += LOG_BLOCK_HDR_SIZE;
+	/* The log mutex is released around the checkpoint calls and retaken after */
+	mutex_exit(&(log_sys->mutex));
+
+	/* Reset the checkpoint fields in logs. The call is made twice;
+	NOTE(review): presumably once for each checkpoint slot - confirm */
+	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+	mutex_enter(&(log_sys->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_HOTBACKUP
+/******************************************************//**
+Creates new log files after a backup has been restored. Every file is
+created (it must not exist yet) and set to log_file_size; then a header
+with a pretended checkpoint and one initialized log block are written to
+the start of the first file. Any failure ends the process with exit(1). */
+UNIV_INTERN
+void
+recv_reset_log_files_for_backup(
+/*============================*/
+	const char*	log_dir,	/*!< in: log file directory path */
+	ulint		n_log_files,	/*!< in: number of log files */
+	ulint		log_file_size,	/*!< in: log file size */
+	ib_uint64_t	lsn)		/*!< in: new start lsn, must be
+					divisible by OS_FILE_LOG_BLOCK_SIZE */
+{
+	os_file_t	log_file;
+	ibool		success;
+	byte*		buf;
+	ulint		i;
+	ulint		log_dir_len;
+	char		name[5000];
+	static const char ib_logfile_basename[] = "ib_logfile";
+
+	log_dir_len = strlen(log_dir);
+	/* full path name of ib_logfile consists of log dir path + basename
+	+ number. This must fit in the name buffer, because the sprintf()
+	calls below do not check the buffer size. */
+	ut_a(log_dir_len + strlen(ib_logfile_basename) + 11  < sizeof(name));
+
+	buf = ut_malloc(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+	memset(buf, '\0', LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+
+	for (i = 0; i < n_log_files; i++) {
+		sprintf(name, "%s%s%lu", log_dir,
+			ib_logfile_basename, (ulong)i);
+		log_file = os_file_create_simple(name, OS_FILE_CREATE,
+						 OS_FILE_READ_WRITE, &success);
+		if (!success) {
+			fprintf(stderr,
+				"InnoDB: Cannot create %s. Check that"
+				" the file does not exist yet.\n", name);
+			exit(1);
+		}
+
+		fprintf(stderr,
+			"Setting log file size to %lu %lu\n",
+			(ulong) ut_get_high32(log_file_size),
+			(ulong) log_file_size & 0xFFFFFFFFUL);
+
+		success = os_file_set_size(name, log_file,
+					   log_file_size & 0xFFFFFFFFUL,
+					   ut_get_high32(log_file_size));
+		if (!success) {
+			fprintf(stderr,
+				"InnoDB: Cannot set %s size to %lu %lu\n",
+				name, (ulong) ut_get_high32(log_file_size),
+				(ulong) (log_file_size & 0xFFFFFFFFUL));
+			exit(1);
+		}
+
+		os_file_flush(log_file);
+		os_file_close(log_file);
+	}
+
+	/* We pretend there is a checkpoint at lsn + LOG_BLOCK_HDR_SIZE */
+	log_reset_first_header_and_checkpoint(buf, lsn);
+	log_block_init_in_old_format(buf + LOG_FILE_HDR_SIZE, lsn);
+	log_block_set_first_rec_group(buf + LOG_FILE_HDR_SIZE,
+				      LOG_BLOCK_HDR_SIZE);
+	sprintf(name, "%s%s%lu", log_dir, ib_logfile_basename, (ulong)0);
+	log_file = os_file_create_simple(name, OS_FILE_OPEN,
+					 OS_FILE_READ_WRITE, &success);
+	if (!success) {
+		fprintf(stderr, "InnoDB: Cannot open %s.\n", name);
+		exit(1);
+	}
+
+	/* Do not ignore a failed write of the header and the first log
+	block: treat it like the other failures in this function */
+	success = os_file_write(name, log_file, buf, 0, 0,
+				LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE);
+	if (!success) {
+		fprintf(stderr, "InnoDB: Cannot write to %s.\n", name);
+		exit(1);
+	}
+	os_file_flush(log_file);
+	os_file_close(log_file);
+	ut_free(buf);
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifdef UNIV_LOG_ARCHIVE
+/******************************************************//**
+Reads from the archive of a log group and performs recovery.
+@return	TRUE if no more complete consistent archive files */
+static
+ibool
+log_group_recover_from_archive_file(
+/*================================*/
+	log_group_t*	group)		/*!< in: log group */
+{
+	os_file_t	file_handle;
+	ib_uint64_t	start_lsn;
+	ib_uint64_t	file_end_lsn;
+	ib_uint64_t	dummy_lsn;
+	ib_uint64_t	scanned_lsn;
+	ulint		len;
+	ibool		ret;
+	byte*		buf;
+	ulint		read_offset;
+	ulint		file_size;
+	ulint		file_size_high;
+	int		input_char;
+	char		name[10000];
+	/* NOTE(review): ut_a(0) always fails - archive recovery looks disabled */
+	ut_a(0);
+
+try_open_again:
+	buf = log_sys->buf;
+
+	/* Add the file to the archive file space; open the file */
+
+	log_archived_file_name_gen(name, group->id, group->archived_file_no);
+
+	file_handle = os_file_create(name, OS_FILE_OPEN,
+				     OS_FILE_LOG, OS_FILE_AIO, &ret);
+
+	if (ret == FALSE) {
+ask_again:
+		fprintf(stderr,
+			"InnoDB: Do you want to copy additional"
+			" archived log files\n"
+			"InnoDB: to the directory\n");
+		fprintf(stderr,
+			"InnoDB: or were these all the files needed"
+			" in recovery?\n");
+		fprintf(stderr,
+			"InnoDB: (Y == copy more files; N == this is all)?");
+		/* NOTE(review): any reply other than 'Y'/'N', including EOF, asks again */
+		input_char = getchar();
+
+		if (input_char == (int) 'N') {
+
+			return(TRUE);
+		} else if (input_char == (int) 'Y') {
+
+			goto try_open_again;
+		} else {
+			goto ask_again;
+		}
+	}
+
+	ret = os_file_get_size(file_handle, &file_size, &file_size_high);
+	ut_a(ret);
+
+	ut_a(file_size_high == 0);
+
+	fprintf(stderr, "InnoDB: Opened archived log file %s\n", name);
+	/* The handle was only used to get the size; reads below go through fil_io */
+	ret = os_file_close(file_handle);
+
+	if (file_size < LOG_FILE_HDR_SIZE) {
+		fprintf(stderr,
+			"InnoDB: Archive file header incomplete %s\n", name);
+
+		return(TRUE);
+	}
+
+	ut_a(ret);
+
+	/* Add the archive file as a node to the space */
+
+	fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE,
+			group->archive_space_id, FALSE);
+#if RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE
+# error "RECV_SCAN_SIZE < LOG_FILE_HDR_SIZE"
+#endif
+
+	/* Read the archive file header */
+	fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE, group->archive_space_id, 0, 0,
+	       LOG_FILE_HDR_SIZE, buf, NULL);
+
+	/* Check if the archive file header is consistent */
+
+	if (mach_read_from_4(buf + LOG_GROUP_ID) != group->id
+	    || mach_read_from_4(buf + LOG_FILE_NO)
+	    != group->archived_file_no) {
+		fprintf(stderr,
+			"InnoDB: Archive file header inconsistent %s\n", name);
+
+		return(TRUE);
+	}
+
+	if (!mach_read_from_4(buf + LOG_FILE_ARCH_COMPLETED)) {
+		fprintf(stderr,
+			"InnoDB: Archive file not completely written %s\n",
+			name);
+
+		return(TRUE);
+	}
+
+	start_lsn = mach_read_ull(buf + LOG_FILE_START_LSN);
+	file_end_lsn = mach_read_ull(buf + LOG_FILE_END_LSN);
+	/* Nothing scanned yet: the file must start at or before parse_start_lsn */
+	if (!recv_sys->scanned_lsn) {
+
+		if (recv_sys->parse_start_lsn < start_lsn) {
+			fprintf(stderr,
+				"InnoDB: Archive log file %s"
+				" starts from too big a lsn\n",
+				name);
+			return(TRUE);
+		}
+
+		recv_sys->scanned_lsn = start_lsn;
+	}
+	/* The file must start exactly at the lsn scanned so far */
+	if (recv_sys->scanned_lsn != start_lsn) {
+
+		fprintf(stderr,
+			"InnoDB: Archive log file %s starts from"
+			" a wrong lsn\n",
+			name);
+		return(TRUE);
+	}
+
+	read_offset = LOG_FILE_HDR_SIZE;
+	/* Scan the file in chunks of at most RECV_SCAN_SIZE bytes */
+	for (;;) {
+		len = RECV_SCAN_SIZE;
+
+		if (read_offset + len > file_size) {
+			len = ut_calc_align_down(file_size - read_offset,
+						 OS_FILE_LOG_BLOCK_SIZE);
+		}
+		/* Less than one whole log block is left in the file */
+		if (len == 0) {
+
+			break;
+		}
+
+#ifdef UNIV_DEBUG
+		if (log_debug_writes) {
+			fprintf(stderr,
+				"InnoDB: Archive read starting at"
+				" lsn %llu, len %lu from file %s\n",
+				start_lsn,
+				(ulong) len, name);
+		}
+#endif /* UNIV_DEBUG */
+
+		fil_io(OS_FILE_READ | OS_FILE_LOG, TRUE,
+		       group->archive_space_id, read_offset / UNIV_PAGE_SIZE,
+		       read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
+
+		ret = recv_scan_log_recs(
+			(buf_pool->n_frames - recv_n_pool_free_frames)
+			* UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn,
+			&dummy_lsn, &scanned_lsn);
+		/* The scan reached the end lsn from the header: this file is done */
+		if (scanned_lsn == file_end_lsn) {
+
+			return(FALSE);
+		}
+
+		if (ret) {
+			fprintf(stderr,
+				"InnoDB: Archive log file %s"
+				" does not scan right\n",
+				name);
+			return(TRUE);
+		}
+
+		read_offset += len;
+		start_lsn += len;
+
+		ut_ad(start_lsn == scanned_lsn);
+	}
+
+	return(FALSE);
+}
+
+/********************************************************//**
+Recovers from archived log files, and also from log files, if they exist.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+recv_recovery_from_archive_start(
+/*=============================*/
+	ib_uint64_t	min_flushed_lsn,/*!< in: min flushed lsn field from the
+					data files */
+	ib_uint64_t	limit_lsn,	/*!< in: recover up to this lsn if
+					possible */
+	ulint		first_log_no)	/*!< in: number of the first archived
+					log file to use in the recovery; the
+					file will be searched from
+					INNOBASE_LOG_ARCH_DIR specified in
+					server config file */
+{
+	log_group_t*	group;
+	ulint		group_id;
+	ulint		trunc_len;
+	ibool		ret;
+	ulint		err;
+	/* NOTE(review): ut_a(0) always fails - archive recovery looks disabled */
+	ut_a(0);
+
+	recv_sys_create();
+	recv_sys_init(buf_pool_get_curr_size());
+
+	recv_recovery_on = TRUE;
+	recv_recovery_from_backup_on = TRUE;
+
+	recv_sys->limit_lsn = limit_lsn;
+	/* Look up the log group whose id is 0 */
+	group_id = 0;
+
+	group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+	while (group) {
+		if (group->id == group_id) {
+
+			break;
+		}
+
+		group = UT_LIST_GET_NEXT(log_groups, group);
+	}
+
+	if (!group) {
+		fprintf(stderr,
+			"InnoDB: There is no log group defined with id %lu!\n",
+			(ulong) group_id);
+		return(DB_ERROR);
+	}
+
+	group->archived_file_no = first_log_no;
+
+	recv_sys->parse_start_lsn = min_flushed_lsn;
+
+	recv_sys->scanned_lsn = 0;
+	recv_sys->scanned_checkpoint_no = 0;
+	recv_sys->recovered_lsn = recv_sys->parse_start_lsn;
+
+	recv_sys->archive_group = group;
+
+	ret = FALSE;
+
+	mutex_enter(&(log_sys->mutex));
+	/* Process archive files in sequence until one is reported as the last */
+	while (!ret) {
+		ret = log_group_recover_from_archive_file(group);
+
+		/* Close and truncate a possible processed archive file
+		from the file space */
+
+		trunc_len = UNIV_PAGE_SIZE
+			* fil_space_get_size(group->archive_space_id);
+		if (trunc_len > 0) {
+			fil_space_truncate_start(group->archive_space_id,
+						 trunc_len);
+		}
+
+		group->archived_file_no++;
+	}
+	/* Not yet at limit_lsn: continue from the log files, without the mutex */
+	if (recv_sys->recovered_lsn < limit_lsn) {
+
+		if (!recv_sys->scanned_lsn) {
+
+			recv_sys->scanned_lsn = recv_sys->parse_start_lsn;
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		err = recv_recovery_from_checkpoint_start(LOG_ARCHIVE,
+							  limit_lsn,
+							  IB_ULONGLONG_MAX,
+							  IB_ULONGLONG_MAX);
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		mutex_enter(&(log_sys->mutex));
+	}
+	/* A finite limit was given: apply the records and reset the logs */
+	if (limit_lsn != IB_ULONGLONG_MAX) {
+
+		recv_apply_hashed_log_recs(FALSE);
+
+		recv_reset_logs(recv_sys->recovered_lsn, 0, FALSE);
+	}
+
+	mutex_exit(&(log_sys->mutex));
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************//**
+Completes recovery from archive: finishes as for a checkpoint, then clears the flag. */
+UNIV_INTERN
+void
+recv_recovery_from_archive_finish(void)
+/*===================================*/
+{
+	recv_recovery_from_checkpoint_finish();
+	/* Clear the flag that recv_recovery_from_archive_start() set */
+	recv_recovery_from_backup_on = FALSE;
+}
+#endif /* UNIV_LOG_ARCHIVE */
diff --git a/storage/xtradb/mach/mach0data.c b/storage/xtradb/mach/mach0data.c
new file mode 100644
index 00000000000..e030ce9aadf
--- /dev/null
+++ b/storage/xtradb/mach/mach0data.c
@@ -0,0 +1,134 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file mach/mach0data.c
+Utilities for converting data from the database file
+to the machine format.
+
+Created 11/28/1995 Heikki Tuuri
+***********************************************************************/
+
+#include "mach0data.h"
+
+#ifdef UNIV_NONINL
+#include "mach0data.ic"
+#endif
+
+/*********************************************************//**
+Reads a ulint in a compressed form if the log record fully contains it;
+the first byte of the field determines how many bytes the field takes.
+@return	pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_parse_compressed(
+/*==================*/
+	byte*	ptr,	/*!< in: pointer to buffer from where to read */
+	byte*	end_ptr,/*!< in: pointer to end of the buffer */
+	ulint*	val)	/*!< out: read value (< 2^32) */
+{
+	ulint	flag;
+	ulint	field_len;
+
+	ut_ad(ptr && end_ptr && val);
+
+	if (ptr >= end_ptr) {
+		/* Not even the first byte is available */
+		return(NULL);
+	}
+
+	flag = mach_read_from_1(ptr);
+
+	/* Classify the field length by the first byte */
+	if (flag < 0x80UL) {
+		field_len = 1;
+	} else if (flag < 0xC0UL) {
+		field_len = 2;
+	} else if (flag < 0xE0UL) {
+		field_len = 3;
+	} else if (flag < 0xF0UL) {
+		field_len = 4;
+	} else {
+		ut_ad(flag == 0xF0UL);
+		field_len = 5;
+	}
+
+	if (end_ptr < ptr + field_len) {
+		/* The field continues past the end of the buffer */
+		return(NULL);
+	}
+
+	switch (field_len) {
+	case 1:
+		*val = flag;
+		break;
+	case 2:
+		*val = mach_read_from_2(ptr) & 0x7FFFUL;
+		break;
+	case 3:
+		*val = mach_read_from_3(ptr) & 0x3FFFFFUL;
+		break;
+	case 4:
+		*val = mach_read_from_4(ptr) & 0x1FFFFFFFUL;
+		break;
+	default:
+		*val = mach_read_from_4(ptr + 1);
+	}
+
+	return(ptr + field_len);
+}
+
+/*********************************************************//**
+Reads a dulint in a compressed form if the log record fully contains
+it: a compressed high word followed by an uncompressed 4-byte low
+word.
+@return	pointer to end of the stored field, NULL if not complete */
+UNIV_INTERN
+byte*
+mach_dulint_parse_compressed(
+/*=========================*/
+	byte*	ptr,	/*!< in: pointer to buffer from where to read */
+	byte*	end_ptr,/*!< in: pointer to end of the buffer */
+	dulint*	val)	/*!< out: read value */
+{
+	ulint	high_word;
+	ulint	low_word;
+	byte*	low_ptr;
+
+	ut_ad(ptr && end_ptr && val);
+
+	/* At least 5 bytes must be available before anything is read */
+	if (end_ptr < ptr + 5) {
+		return(NULL);
+	}
+
+	high_word = mach_read_compressed(ptr);
+
+	/* The low word starts right after the compressed high word */
+	low_ptr = ptr + mach_get_compressed_size(high_word);
+
+	if (end_ptr < low_ptr + 4) {
+		return(NULL);
+	}
+
+	low_word = mach_read_from_4(low_ptr);
+
+	*val = ut_dulint_create(high_word, low_word);
+
+	return(low_ptr + 4);
+}
diff --git a/storage/xtradb/mem/mem0dbg.c b/storage/xtradb/mem/mem0dbg.c
new file mode 100644
index 00000000000..1cd2ff15bab
--- /dev/null
+++ b/storage/xtradb/mem/mem0dbg.c
@@ -0,0 +1,1041 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0dbg.c
+The memory management: the debug code. This is not a compilation module,
+but is included in mem0mem.* !
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#ifdef UNIV_MEM_DEBUG
+# ifndef UNIV_HOTBACKUP
+/* The mutex which protects in the debug version the hash table
+containing the list of live memory heaps, and also the global
+variables below. */
+UNIV_INTERN mutex_t	mem_hash_mutex;
+# endif /* !UNIV_HOTBACKUP */
+
+/* The following variables contain information about the
+extent of memory allocations. Only used in the debug version.
+Protected by mem_hash_mutex above. */
+
+static ulint		mem_n_created_heaps		= 0;
+static ulint		mem_n_allocations		= 0;
+static ulint		mem_total_allocated_memory	= 0;
+UNIV_INTERN ulint	mem_current_allocated_memory	= 0;
+static ulint		mem_max_allocated_memory	= 0;
+# ifndef UNIV_HOTBACKUP
+static ulint		mem_last_print_info		= 0;
+static ibool		mem_hash_initialized		= FALSE;
+# endif /* !UNIV_HOTBACKUP */
+
+/* Size of the hash table for memory management tracking */
+#define	MEM_HASH_SIZE	997
+
+/* The node of the list containing currently allocated memory heaps */
+/* Each node is linked into one hash cell and into the list of all heaps */
+typedef struct mem_hash_node_struct mem_hash_node_t;
+struct mem_hash_node_struct {
+	UT_LIST_NODE_T(mem_hash_node_t)
+				list;	/*!< hash list node */
+	mem_heap_t*		heap;	/*!< memory heap */
+	const char*		file_name;/*!< file where heap was created */
+	ulint			line;	/*!< file line of creation */
+	ulint			nth_heap;/*!< this is the nth heap created */
+	UT_LIST_NODE_T(mem_hash_node_t)
+				all_list;/*!< list of all created heaps */
+};
+/* A hash cell is the base node of a list of mem_hash_node_t nodes */
+typedef UT_LIST_BASE_NODE_T(mem_hash_node_t) mem_hash_cell_t;
+
+/* The hash table of allocated heaps */
+static mem_hash_cell_t		mem_hash_table[MEM_HASH_SIZE];
+
+/* The base node of the list of all allocated heaps */
+static mem_hash_cell_t		mem_all_list_base;
+
+
+
+/* Forward declaration of the hash table accessor defined below. */
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i);
+
+/* Returns a pointer to the i'th cell of the hash table of allocated
+heaps; the index must be below MEM_HASH_SIZE. */
+UNIV_INLINE
+mem_hash_cell_t*
+mem_hash_get_nth_cell(ulint i)
+{
+	ut_a(i < MEM_HASH_SIZE);
+	return(mem_hash_table + i);
+}
+
+/* Debug-version field accessors. Stores len 2 * sizeof(ulint) bytes before field */
+UNIV_INTERN
+void
+mem_field_header_set_len(byte* field, ulint len)
+{
+	mach_write_to_4(field - 2 * sizeof(ulint), len);
+}
+/* Reads the length stored 2 * sizeof(ulint) bytes before the field */
+UNIV_INTERN
+ulint
+mem_field_header_get_len(byte* field)
+{
+	return(mach_read_from_4(field - 2 * sizeof(ulint)));
+}
+/* Stores the check value sizeof(ulint) bytes before the field */
+UNIV_INTERN
+void
+mem_field_header_set_check(byte* field, ulint check)
+{
+	mach_write_to_4(field - sizeof(ulint), check);
+}
+/* Reads the check value stored sizeof(ulint) bytes before the field */
+UNIV_INTERN
+ulint
+mem_field_header_get_check(byte* field)
+{
+	return(mach_read_from_4(field - sizeof(ulint)));
+}
+/* Stores the check value right after the field; uses the length in the header */
+UNIV_INTERN
+void
+mem_field_trailer_set_check(byte* field, ulint check)
+{
+	mach_write_to_4(field + mem_field_header_get_len(field), check);
+}
+/* Reads the check value stored right after the field */
+UNIV_INTERN
+ulint
+mem_field_trailer_get_check(byte* field)
+{
+	return(mach_read_from_4(field
+				+ mem_field_header_get_len(field)));
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Initializes the memory system: in the debug version sets up the heap
+tracking structures, then creates the common memory pool. */
+UNIV_INTERN
+void
+mem_init(
+/*=====*/
+	ulint	size)	/*!< in: common pool size in bytes */
+{
+#ifdef UNIV_MEM_DEBUG
+	ulint	cell_no = 0;
+
+	/* The tracking structures must not be initialized yet */
+	ut_a(FALSE == mem_hash_initialized);
+
+	mutex_create(&mem_hash_mutex, SYNC_MEM_HASH);
+
+	/* Start with every hash cell and the list of all heaps empty */
+	while (cell_no < MEM_HASH_SIZE) {
+		UT_LIST_INIT(*mem_hash_get_nth_cell(cell_no));
+		cell_no++;
+	}
+
+	UT_LIST_INIT(mem_all_list_base);
+	mem_hash_initialized = TRUE;
+#endif
+
+	/* With innodb_use_sys_malloc set, mem_comm_pool serves no
+	allocations; a dummy pool of size 1 is still created, because
+	some statistics and debugging code relies on its existence. */
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		size = 1;
+	}
+
+	mem_comm_pool = mem_pool_create(size);
+}
+
+/******************************************************************//**
+Closes the memory system: frees the common pool and, in the debug version, the hash mutex. */
+UNIV_INTERN
+void
+mem_close(void)
+/*===========*/
+{
+	mem_pool_free(mem_comm_pool);
+	mem_comm_pool = NULL;
+#ifdef UNIV_MEM_DEBUG
+	mutex_free(&mem_hash_mutex);
+	mem_hash_initialized = FALSE;	/* mem_init() asserts on this flag */
+#endif /* UNIV_MEM_DEBUG */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef UNIV_MEM_DEBUG
+/******************************************************************//**
+Initializes an allocated memory field in the debug version: writes the
+field header and trailer, updates the allocation statistics and fills
+the user area with a recognizable pattern. */
+UNIV_INTERN
+void
+mem_field_init(
+/*===========*/
+	byte*	buf,	/*!< in: memory field */
+	ulint	n)	/*!< in: how many bytes the user requested */
+{
+	byte*	user_area;
+	ulint	check_value;
+
+	user_area = buf + MEM_FIELD_HEADER_SIZE;
+
+	/* The header holds the user length followed by a random check
+	value; the trailer repeats the check value. The length must be
+	stored first, because the trailer position is derived from it. */
+	mem_field_header_set_len(user_area, n);
+
+	/* The same random value goes into the header and the trailer */
+	check_value = ut_rnd_gen_ulint();
+
+	mem_field_header_set_check(user_area, check_value);
+	mem_field_trailer_set_check(user_area, check_value);
+
+	/* Account for the allocation while holding the mutex that
+	protects the counters */
+	mutex_enter(&mem_hash_mutex);
+
+	mem_n_allocations++;
+	mem_total_allocated_memory += n;
+	mem_current_allocated_memory += n;
+
+	if (mem_max_allocated_memory < mem_current_allocated_memory) {
+		/* New maximum of concurrently allocated memory */
+		mem_max_allocated_memory = mem_current_allocated_memory;
+	}
+
+	mutex_exit(&mem_hash_mutex);
+
+	/* In the debug version set the user area to a random
+	combination of 0xBA and 0xBE */
+	mem_init_buf(user_area, n);
+}
+
+/******************************************************************//**
+Erases an allocated memory field in the debug version: subtracts it from
+the current allocation total and overwrites the field with a pattern. */
+UNIV_INTERN
+void
+mem_field_erase(
+/*============*/
+	byte*	buf,	/*!< in: memory field */
+	ulint	n __attribute__((unused)))
+			/*!< in: how many bytes the user requested */
+{
+	byte*	usr_buf = buf + MEM_FIELD_HEADER_SIZE;
+
+	/* The allocation counter is updated under mem_hash_mutex */
+	mutex_enter(&mem_hash_mutex);
+	mem_current_allocated_memory = mem_current_allocated_memory - n;
+
+	mutex_exit(&mem_hash_mutex);
+
+	/* The length stored in the field header must match the request */
+	ut_ad(n == (ulint)mem_field_header_get_len(usr_buf));
+
+	/* Set the freed space, starting from the field header, to a
+	random combination of 0xDE and 0xAD */
+	mem_erase_buf(buf, MEM_SPACE_NEEDED(n));
+}
+
+/***************************************************************//**
+Fills a buffer with a random mixture of the bytes 0xBA and 0xBE, the
+choice being made separately for every byte.
+Used to initialize allocated memory. */
+UNIV_INTERN
+void
+mem_init_buf(
+/*=========*/
+	byte*	buf,	/*!< in: pointer to buffer */
+	ulint	 n)	/*!< in: length of buffer */
+{
+	byte*	cur = buf;
+	byte*	end = buf + n;
+
+	UNIV_MEM_ASSERT_W(buf, n);
+
+	/* Each byte gets one of the two pattern values */
+	while (cur < end) {
+		*cur++ = ut_rnd_gen_ibool() ? 0xBA : 0xBE;
+	}
+
+	/* NOTE(review): presumably tells a memory checker that the
+	pattern is not initialized data - confirm */
+	UNIV_MEM_INVALID(buf, n);
+}
+
+/***************************************************************//**
+Fills a buffer with a random mixture of the bytes 0xDE and 0xAD.
+Used to erase freed memory. */
+UNIV_INTERN
+void
+mem_erase_buf(
+/*==========*/
+	byte*	buf,	/*!< in: pointer to buffer */
+	ulint	n)	/*!< in: length of buffer */
+{
+	byte*	cur = buf;
+	byte*	end = buf + n;
+
+	UNIV_MEM_ASSERT_W(buf, n);
+
+	/* Each byte gets one of the two pattern values */
+	while (cur < end) {
+		*cur++ = ut_rnd_gen_ibool() ? 0xDE : 0xAD;
+	}
+
+	/* NOTE(review): presumably tells a memory checker that the
+	buffer has been freed - confirm */
+	UNIV_MEM_FREE(buf, n);
+}
+
+/***************************************************************//**
+Registers a created memory heap: a tracking node recording where the
+heap was created is added to the hash table of allocated heaps and to
+the list of all heaps. */
+UNIV_INTERN
+void
+mem_hash_insert(
+/*============*/
+	mem_heap_t*	heap,	   /*!< in: the created heap */
+	const char*	file_name, /*!< in: file name of creation */
+	ulint		line)	   /*!< in: line where created */
+{
+	mem_hash_cell_t*	cell;
+	mem_hash_node_t*	node;
+
+	ut_ad(mem_heap_check(heap));
+
+	mutex_enter(&mem_hash_mutex);
+
+	/* The heap address selects the hash cell */
+	cell = mem_hash_get_nth_cell(
+		ut_hash_ulint((ulint)heap, MEM_HASH_SIZE));
+
+	node = ut_malloc(sizeof(*node));
+	node->heap = heap;
+	node->file_name = file_name;
+	node->line = line;
+	/* Heaps are numbered in creation order, starting from 0 */
+	node->nth_heap = mem_n_created_heaps++;
+
+	/* Link the node to the front of its cell and to the end of the
+	list of all heaps */
+	UT_LIST_ADD_FIRST(list, *cell, node);
+	UT_LIST_ADD_LAST(all_list, mem_all_list_base, node);
+
+	mutex_exit(&mem_hash_mutex);
+}
+
+/***************************************************************//**
+Removes a memory heap (which is going to be freed by the caller)
+from the list of live memory heaps. The heap is also validated, and
+the amount of memory in bytes that was allocated for the user of the
+heap (not the total space occupied by the heap) is subtracted from
+mem_current_allocated_memory. Nothing is returned to the caller.
+NOTE: This function does not free the storage occupied by the
+heap itself, only the node in the list of heaps. */
+UNIV_INTERN
+void
+mem_hash_remove(
+/*============*/
+	mem_heap_t*	heap,	   /*!< in: the heap to be freed */
+	const char*	file_name, /*!< in: file name of freeing */
+	ulint		line)	   /*!< in: line where freed */
+{
+	mem_hash_node_t*	node;
+	ulint			cell_no;
+	ibool			error;
+	ulint			size;
+
+	ut_ad(mem_heap_check(heap));
+
+	mutex_enter(&mem_hash_mutex);
+
+	cell_no = ut_hash_ulint((ulint)heap, MEM_HASH_SIZE);
+
+	/* Look for the heap in the hash table list */
+	node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell_no));
+
+	while (node != NULL) {
+		if (node->heap == heap) {
+
+			break;
+		}
+
+		node = UT_LIST_GET_NEXT(list, node);
+	}
+
+	if (node == NULL) {
+		fprintf(stderr,
+			"Memory heap or buffer freed in %s line %lu"
+			" did not exist.\n",
+			file_name, (ulong) line);
+		ut_error;
+	}
+
+	/* Remove from lists */
+	UT_LIST_REMOVE(list, *mem_hash_get_nth_cell(cell_no), node);
+
+	UT_LIST_REMOVE(all_list, mem_all_list_base, node);
+
+	/* Validate the heap which will be freed */
+	mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size,
+				   NULL, NULL);
+	if (error) {
+		fprintf(stderr,
+			"Inconsistency in memory heap or"
+			" buffer n:o %lu created\n"
+			"in %s line %lu and tried to free in %s line %lu.\n"
+			"Hex dump of 400 bytes around memory heap"
+			" first block start:\n",
+			(ulong) node->nth_heap, node->file_name,
+			(ulong) node->line, file_name, (ulong) line);
+		ut_print_buf(stderr, (byte*)node->heap - 200, 400);
+		fputs("\nDump of the mem heap:\n", stderr);
+		mem_heap_validate_or_print(node->heap, NULL, TRUE, &error,
+					   &size, NULL, NULL);
+		ut_error;
+	}
+
+	/* Free the memory occupied by the node struct */
+	ut_free(node);
+
+	mem_current_allocated_memory -= size;
+
+	mutex_exit(&mem_hash_mutex);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+#if defined UNIV_MEM_DEBUG || defined UNIV_DEBUG
+/***************************************************************//**
+Checks a memory heap for consistency and prints the contents if requested.
+Outputs the sum of sizes of buffers given to the user (only in
+the debug version), the physical size of the heap and the number of
+blocks in the heap. In case of error returns 0 as sizes and number
+of blocks. */
+UNIV_INTERN
+void
+mem_heap_validate_or_print(
+/*=======================*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	byte*		top __attribute__((unused)),
+				/*!< in: calculate and validate only until
+				this top pointer in the heap is reached,
+				if this pointer is NULL, ignored */
+	ibool		print,	/*!< in: if TRUE, prints the contents
+				of the heap; works only in
+				the debug version */
+	ibool*		error,	/*!< out: TRUE if error */
+	ulint*		us_size,/*!< out: allocated memory
+				(for the user) in the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored; in the
+				non-debug version this is always 0 */
+	ulint*		ph_size,/*!< out: physical size of the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored */
+	ulint*		n_blocks) /*!< out: number of blocks in the heap,
+				if a NULL pointer is passed as this
+				argument, it is ignored */
+{
+	mem_block_t*	block;
+	ulint		total_len	= 0;
+	ulint		block_count	= 0;
+	ulint		phys_len	= 0;
+#ifdef UNIV_MEM_DEBUG
+	ulint		len;
+	byte*		field;
+	byte*		user_field;
+	ulint		check_field;
+#endif
+
+	/* Pessimistically, we set the parameters to error values */
+	if (us_size != NULL) {
+		*us_size = 0;
+	}
+	if (ph_size != NULL) {
+		*ph_size = 0;
+	}
+	if (n_blocks != NULL) {
+		*n_blocks = 0;
+	}
+	*error = TRUE;
+
+	block = heap;
+
+	if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+		return;
+	}
+
+	if (print) {
+		fputs("Memory heap:", stderr);
+	}
+
+	while (block != NULL) {
+		phys_len += mem_block_get_len(block);
+
+		if ((block->type == MEM_HEAP_BUFFER)
+		    && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) {
+
+			fprintf(stderr,
+				"InnoDB: Error: mem block %p"
+				" length %lu > UNIV_PAGE_SIZE\n",
+				(void*) block,
+				(ulong) mem_block_get_len(block));
+			/* error */
+
+			return;
+		}
+
+#ifdef UNIV_MEM_DEBUG
+		/* We can trace the fields of the block only in the debug
+		version */
+		if (print) {
+			fprintf(stderr, " Block %lu:", (ulong) block_count);
+		}
+
+		field = (byte*)block + mem_block_get_start(block);
+
+		if (top && (field == top)) {
+
+			goto completed;
+		}
+
+		while (field < (byte*)block + mem_block_get_free(block)) {
+
+			/* Calculate the pointer to the storage
+			which was given to the user */
+
+			user_field = field + MEM_FIELD_HEADER_SIZE;
+
+			len = mem_field_header_get_len(user_field);
+
+			if (print) {
+				ut_print_buf(stderr, user_field, len);
+				putc('\n', stderr);
+			}
+
+			total_len += len;
+			check_field = mem_field_header_get_check(user_field);
+
+			if (check_field
+			    != mem_field_trailer_get_check(user_field)) {
+				/* error */
+
+				fprintf(stderr,
+					"InnoDB: Error: block %p mem"
+					" field %p len %lu\n"
+					"InnoDB: header check field is"
+					" %lx but trailer %lx\n",
+					(void*) block, (void*) field,
+					(ulong) len, (ulong) check_field,
+					(ulong) mem_field_trailer_get_check(
+						user_field));
+
+				return;
+			}
+
+			/* Move to next field */
+			field = field + MEM_SPACE_NEEDED(len);
+
+			if (top && (field == top)) {
+
+				goto completed;
+			}
+
+		}
+
+		/* At the end check that we have arrived to the first free
+		position */
+
+		if (field != (byte*)block + mem_block_get_free(block)) {
+			/* error */
+
+			fprintf(stderr,
+				"InnoDB: Error: block %p end of"
+				" mem fields %p\n"
+				"InnoDB: but block free at %p\n",
+				(void*) block, (void*) field,
+				(void*) ((byte*) block
+					 + mem_block_get_free(block)));
+
+			return;
+		}
+
+#endif
+
+		block = UT_LIST_GET_NEXT(list, block);
+		block_count++;
+	}
+#ifdef UNIV_MEM_DEBUG
+completed:
+#endif
+	if (us_size != NULL) {
+		*us_size = total_len;
+	}
+	if (ph_size != NULL) {
+		*ph_size = phys_len;
+	}
+	if (n_blocks != NULL) {
+		*n_blocks = block_count;
+	}
+	*error = FALSE;
+}
+
+/**************************************************************//**
+Dumps a memory heap and a summary line to stderr; asserts it is valid. */
+static
+void
+mem_heap_print(
+/*===========*/
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	ulint	user_bytes;
+	ulint	phys_bytes;
+	ulint	block_cnt;
+	ibool	failed;
+
+	ut_ad(mem_heap_check(heap));
+
+	mem_heap_validate_or_print(heap, NULL, TRUE, &failed, &user_bytes,
+				   &phys_bytes, &block_cnt);
+	fprintf(stderr,
+		"\nheap type: %lu; size: user size %lu;"
+		" physical size %lu; blocks %lu.\n",
+		(ulong) heap->type, (ulong) user_bytes,
+		(ulong) phys_bytes, (ulong) block_cnt);
+	ut_a(!failed);
+}
+
+/**************************************************************//**
+Checks a memory heap for consistency; an inconsistent heap is printed
+and then trips an assertion.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_heap_validate(
+/*==============*/
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	ulint	block_cnt;
+	ulint	phys_bytes;
+	ulint	user_bytes;
+	ibool	failed;
+
+	ut_ad(mem_heap_check(heap));
+
+	mem_heap_validate_or_print(heap, NULL, FALSE, &failed, &user_bytes,
+				   &phys_bytes, &block_cnt);
+	if (failed) {
+		mem_heap_print(heap);
+	}
+
+	ut_a(!failed);
+	return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG || UNIV_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**************************************************************//**
+Checks that an object is a memory heap (or a block of it) via its magic_n.
+@return	TRUE whenever the function returns; a mismatch fails the ut_a() */
+UNIV_INTERN
+ibool
+mem_heap_check(
+/*===========*/
+	mem_heap_t*	heap)	/*!< in: memory heap */
+{
+	ut_a(heap->magic_n == MEM_BLOCK_MAGIC_N);
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+#ifdef UNIV_MEM_DEBUG
+/*****************************************************************//**
+Tells whether all memory heaps have been freed; runs mem_validate() first.
+@return	TRUE if no heaps exist */
+UNIV_INTERN
+ibool
+mem_all_freed(void)
+/*===============*/
+{
+	mem_hash_node_t*	cur;
+	ulint			cell;
+	ulint			n_live	= 0;
+
+	mem_validate();
+
+	mutex_enter(&mem_hash_mutex);
+
+	/* Count the nodes chained in every cell of the hash table. */
+	for (cell = 0; cell < MEM_HASH_SIZE; cell++) {
+		for (cur = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(cell));
+		     cur != NULL;
+		     cur = UT_LIST_GET_NEXT(list, cur)) {
+			n_live++;
+		}
+	}
+
+	mutex_exit(&mem_hash_mutex);
+
+	if (n_live != 0) {
+
+		return(FALSE);
+	}
+
+# ifndef UNIV_HOTBACKUP
+	ut_a(mem_pool_get_reserved(mem_comm_pool) == 0);
+# endif /* !UNIV_HOTBACKUP */
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Validates the dynamic memory allocation system.
+@return	TRUE if error */
+UNIV_INTERN
+ibool
+mem_validate_no_assert(void)
+/*========================*/
+{
+	mem_hash_node_t*	node;
+	ulint			n_heaps			= 0;
+	ulint			allocated_mem;
+	ulint			ph_size;
+	ulint			total_allocated_mem	= 0;
+	ibool			error			= FALSE;
+	ulint			n_blocks;
+	ulint			i;
+
+# ifndef UNIV_HOTBACKUP
+	mem_pool_validate(mem_comm_pool);
+# endif /* !UNIV_HOTBACKUP */
+
+	mutex_enter(&mem_hash_mutex);
+
+	for (i = 0; i < MEM_HASH_SIZE; i++) {
+
+		node = UT_LIST_GET_FIRST(*mem_hash_get_nth_cell(i));
+
+		while (node != NULL) {
+			n_heaps++;
+
+			mem_heap_validate_or_print(node->heap, NULL,
+						   FALSE, &error,
+						   &allocated_mem,
+						   &ph_size, &n_blocks);
+
+			if (error) {
+				fprintf(stderr,
+					"\nERROR!!!!!!!!!!!!!!!!!!!"
+					"!!!!!!!!!!!!!!!!!!!!!!!\n\n"
+					"Inconsistency in memory heap"
+					" or buffer created\n"
+					"in %s line %lu.\n",
+					node->file_name, (ulong) node->line);
+
+				mutex_exit(&mem_hash_mutex);
+
+				return(TRUE);
+			}
+
+			total_allocated_mem += allocated_mem;
+			node = UT_LIST_GET_NEXT(list, node);
+		}
+	}
+
+	/* Cross-check the global counters against each other */
+	if ((n_heaps == 0) && (mem_current_allocated_memory != 0)) {
+		error = TRUE;
+	}
+
+	if (mem_total_allocated_memory < mem_current_allocated_memory) {
+		error = TRUE;
+	}
+
+	if (mem_max_allocated_memory > mem_total_allocated_memory) {
+		error = TRUE;
+	}
+
+	if (mem_n_created_heaps < n_heaps) {
+		error = TRUE;
+	}
+
+	mutex_exit(&mem_hash_mutex);
+
+	return(error);
+}
+
+/************************************************************//**
+Validates the dynamic memory; asserts mem_validate_no_assert() is FALSE.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+mem_validate(void)
+/*==============*/
+{
+	ut_a(!mem_validate_no_assert());
+
+	return(TRUE);
+}
+#endif /* UNIV_MEM_DEBUG */
+
+/************************************************************//**
+Tries to find neighboring memory allocation blocks and dumps to stderr
+the neighborhood of a given pointer. */
+UNIV_INTERN
+void
+mem_analyze_corruption(
+/*===================*/
+	void*	ptr)	/*!< in: pointer to place of possible corruption */
+{
+	byte*	p;
+	ulint	i;
+	ulint	dist;
+	/* ut_print_buf() gets the 500-byte range starting at ptr - 250. */
+	fputs("InnoDB: Apparent memory corruption: mem dump ", stderr);
+	ut_print_buf(stderr, (byte*)ptr - 250, 500);
+
+	fputs("\nInnoDB: Scanning backward trying to find"
+	      " previous allocated mem blocks\n", stderr);
+	/* NOTE(review): file/line are read at fixed offsets past the magic */
+	p = (byte*)ptr;
+	dist = 0;
+	/* Report up to 10 magic numbers found at or below ptr. */
+	for (i = 0; i < 10; i++) {
+		for (;;) { /* ends only on a magic-number match */
+			if (((ulint)p) % 4 == 0) {
+				/* Only addresses divisible by 4 are probed. */
+				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+					fprintf(stderr,
+						"Mem block at - %lu,"
+						" file %s, line %lu\n",
+						(ulong) dist,
+						(p + sizeof(ulint)),
+						(ulong)
+						(*(ulint*)(p + 8
+							   + sizeof(ulint))));
+
+					break;
+				}
+
+				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+					fprintf(stderr,
+						"Freed mem block at - %lu,"
+						" file %s, line %lu\n",
+						(ulong) dist,
+						(p + sizeof(ulint)),
+						(ulong)
+						(*(ulint*)(p + 8
+							   + sizeof(ulint))));
+
+					break;
+				}
+			}
+
+			p--;
+			dist++;
+		}
+
+		p--;
+		dist++;
+	}
+
+	fprintf(stderr,
+		"InnoDB: Scanning forward trying to find next"
+		" allocated mem blocks\n");
+
+	p = (byte*)ptr;
+	dist = 0;
+	/* Same search, now walking upward from ptr. */
+	for (i = 0; i < 10; i++) {
+		for (;;) { /* ends only on a magic-number match */
+			if (((ulint)p) % 4 == 0) {
+
+				if (*((ulint*)p) == MEM_BLOCK_MAGIC_N) {
+					fprintf(stderr,
+						"Mem block at + %lu, file %s,"
+						" line %lu\n",
+						(ulong) dist,
+						(p + sizeof(ulint)),
+						(ulong)
+						(*(ulint*)(p + 8
+							   + sizeof(ulint))));
+
+					break;
+				}
+
+				if (*((ulint*)p) == MEM_FREED_BLOCK_MAGIC_N) {
+					fprintf(stderr,
+						"Freed mem block at + %lu,"
+						" file %s, line %lu\n",
+						(ulong) dist,
+						(p + sizeof(ulint)),
+						(ulong)
+						(*(ulint*)(p + 8
+							   + sizeof(ulint))));
+
+					break;
+				}
+			}
+
+			p++;
+			dist++;
+		}
+
+		p++;
+		dist++;
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*****************************************************************//**
+Prints information of dynamic memory usage. The list of allocated heaps or
+buffers is printed only if UNIV_MEM_DEBUG is defined; else just pool info. */
+static
+void
+mem_print_info_low(
+/*===============*/
+	ibool	print_all)	/*!< in: if TRUE, all heaps are printed,
+				else only the heaps allocated after the
+				previous call of this function */
+{
+#ifdef UNIV_MEM_DEBUG
+	mem_hash_node_t*	node;
+	ulint			n_heaps			= 0;
+	ulint			allocated_mem;
+	ulint			ph_size;
+	ulint			total_allocated_mem	= 0;
+	ibool			error;
+	ulint			n_blocks;
+#endif
+	FILE*			outfile;
+
+	/* outfile = fopen("ibdebug", "a"); */
+
+	outfile = stdout;
+
+	fprintf(outfile, "\n");
+	fprintf(outfile,
+		"________________________________________________________\n");
+	fprintf(outfile, "MEMORY ALLOCATION INFORMATION\n\n");
+
+#ifndef UNIV_MEM_DEBUG
+
+	UT_NOT_USED(print_all);
+
+	mem_pool_print_info(outfile, mem_comm_pool);
+
+	fprintf(outfile,
+		"Sorry, non-debug version cannot give more memory info\n");
+
+	/* fclose(outfile); */
+
+	return;
+#else
+	mutex_enter(&mem_hash_mutex);
+
+	fprintf(outfile, "LIST OF CREATED HEAPS AND ALLOCATED BUFFERS: \n\n");
+
+	if (!print_all) {
+		fprintf(outfile, "AFTER THE LAST PRINT INFO\n");
+	}
+
+	node = UT_LIST_GET_FIRST(mem_all_list_base);
+
+	while (node != NULL) {
+		n_heaps++;
+
+		if (!print_all && node->nth_heap < mem_last_print_info) {
+
+			goto next_heap;
+		}
+
+		mem_heap_validate_or_print(node->heap, NULL,
+					   FALSE, &error, &allocated_mem,
+					   &ph_size, &n_blocks);
+		total_allocated_mem += allocated_mem;
+		fprintf(outfile,
+			"%lu: file %s line %lu of size %lu phys.size %lu"
+			" with %lu blocks, type %lu\n",
+			(ulong) node->nth_heap, node->file_name,
+			(ulong) node->line, (ulong) allocated_mem,
+			(ulong) ph_size, (ulong) n_blocks,
+			(ulong) (node->heap)->type);
+next_heap:
+		node = UT_LIST_GET_NEXT(all_list, node);
+	}
+
+	fprintf(outfile, "\n");
+
+	fprintf(outfile, "Current allocated memory              : %lu\n",
+		(ulong) mem_current_allocated_memory);
+	fprintf(outfile, "Current allocated heaps and buffers   : %lu\n",
+		(ulong) n_heaps);
+	fprintf(outfile, "Cumulative allocated memory           : %lu\n",
+		(ulong) mem_total_allocated_memory);
+	fprintf(outfile, "Maximum allocated memory              : %lu\n",
+		(ulong) mem_max_allocated_memory);
+	fprintf(outfile, "Cumulative created heaps and buffers  : %lu\n",
+		(ulong) mem_n_created_heaps);
+	fprintf(outfile, "Cumulative number of allocations      : %lu\n",
+		(ulong) mem_n_allocations);
+
+	mem_last_print_info = mem_n_created_heaps;
+
+	mutex_exit(&mem_hash_mutex);
+
+	mem_pool_print_info(outfile, mem_comm_pool);
+
+	/*	mem_validate(); */
+
+	/*	fclose(outfile); */
+#endif
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and all currently allocated
+memory heaps or buffers, by calling mem_print_info_low(TRUE). */
+UNIV_INTERN
+void
+mem_print_info(void)
+/*================*/
+{
+	mem_print_info_low(TRUE);
+}
+
+/*****************************************************************//**
+Prints information of dynamic memory usage and the memory heaps or buffers
+created since the last ..._print_info or ..._print_new_info call. */
+UNIV_INTERN
+void
+mem_print_new_info(void)
+/*====================*/
+{
+	mem_print_info_low(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/mem/mem0mem.c b/storage/xtradb/mem/mem0mem.c
new file mode 100644
index 00000000000..1dd4db30841
--- /dev/null
+++ b/storage/xtradb/mem/mem0mem.c
@@ -0,0 +1,573 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0mem.c
+The memory management
+
+Created 6/9/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0mem.h"
+#ifdef UNIV_NONINL
+#include "mem0mem.ic"
+#endif
+
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "mem0dbg.c"
+#include <stdarg.h>
+
+/*
+			THE MEMORY MANAGEMENT
+			=====================
+
+The basic element of the memory management is called a memory
+heap. A memory heap is conceptually a
+stack from which memory can be allocated. The stack may grow infinitely.
+The top element of the stack may be freed, or
+the whole stack can be freed at one time. The advantage of the
+memory heap concept is that we can avoid using the malloc and free
+functions of C which are quite expensive, for example, on the Solaris + GCC
+system (50 MHz Sparc, 1993) the pair takes 3 microseconds,
+on Win NT + 100MHz Pentium, 2.5 microseconds.
+When we use a memory heap,
+we can allocate larger blocks of memory at a time and thus
+reduce overhead. The method is slightly more efficient still when we
+allocate the memory from the index page buffer pool, as we can
+claim a new page fast. This is called buffer allocation.
+When we allocate the memory from the dynamic memory of the
+C environment, that is called dynamic allocation.
+
+The default way of operation of the memory heap is the following.
+First, when the heap is created, an initial block of memory is
+allocated. In dynamic allocation this may be about 50 bytes.
+If more space is needed, additional blocks are allocated
+and they are put into a linked list.
+After the initial block, each allocated block is twice the size of the
+previous, until a threshold is attained, after which the sizes
+of the blocks stay the same. An exception is, of course, the case
+where the caller requests a memory buffer whose size is
+bigger than the threshold. In that case a block big enough must
+be allocated.
+
+The heap is physically arranged so that if the current block
+becomes full, a new block is allocated and always inserted in the
+chain of blocks as the last block.
+
+In the debug version of the memory management, all the allocated
+heaps are kept in a list (which is implemented as a hash table).
+Thus we can notice if the caller tries to free an already freed
+heap. In addition, each buffer given to the caller contains a
+start field at the start and a trailer field at the end of the buffer.
+
+The start field has the following content:
+A. sizeof(ulint) bytes of field length (in the standard byte order)
+B. sizeof(ulint) bytes of check field (a random number)
+
+The trailer field contains:
+A. sizeof(ulint) bytes of check field (the same random number as at the start)
+
+Thus we can notice if something has been copied over the
+borders of the buffer, which is illegal.
+The memory in the buffers is initialized to a random byte sequence.
+After freeing, all the blocks in the heap are set to random bytes
+to help us discover errors which result from the use of
+buffers in an already freed heap. */
+
+#ifdef MEM_PERIODIC_CHECK
+
+ibool					mem_block_list_inited;
+/* Init flag for, and list of, all mem blocks; under mem_comm_pool mutex */
+UT_LIST_BASE_NODE_T(mem_block_t)	mem_block_list;
+
+#endif
+
+/**********************************************************************//**
+Duplicates a NUL-terminated string (terminator included) into a memory heap.
+@return	own: a copy of the string */
+UNIV_INTERN
+char*
+mem_heap_strdup(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	str)	/*!< in: string to be copied */
+{
+	return(mem_heap_dup(heap, str, strlen(str) + 1));
+}
+
+/**********************************************************************//**
+Duplicates len bytes of data into memory from mem_heap_alloc(heap, len).
+@return	own: a copy of the data */
+UNIV_INTERN
+void*
+mem_heap_dup(
+/*=========*/
+	mem_heap_t*	heap,	/*!< in: memory heap where copy is allocated */
+	const void*	data,	/*!< in: data to be copied */
+	ulint		len)	/*!< in: length of data, in bytes */
+{
+	return(memcpy(mem_heap_alloc(heap, len), data, len));
+}
+
+/**********************************************************************//**
+Joins two NUL-terminated strings into a new string taken from a memory heap.
+@return	own: the result */
+UNIV_INTERN
+char*
+mem_heap_strcat(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap where string is allocated */
+	const char*	s1,	/*!< in: string 1 */
+	const char*	s2)	/*!< in: string 2 */
+{
+	const ulint	head_len = strlen(s1);
+	const ulint	tail_len = strlen(s2);
+	char*		joined;
+
+	joined = mem_heap_alloc(heap, head_len + tail_len + 1);
+
+	/* s1 goes in without its terminator; s2 is copied together
+	with its NUL, which ends the combined string. */
+	memcpy(joined, s1, head_len);
+	memcpy(joined + head_len, s2, tail_len + 1);
+
+	return(joined);
+}
+
+
+/****************************************************************//**
+Helper function for mem_heap_printf. Walks the format string once and
+either measures the output (buf == NULL) or writes it to buf.
+@return	length of formatted string, including terminating NUL */
+static
+ulint
+mem_heap_printf_low(
+/*================*/
+	char*		buf,	/*!< in/out: buffer to store formatted string
+				in, or NULL to just calculate length */
+	const char*	format,	/*!< in: format string */
+	va_list		ap)	/*!< in: arguments */
+{
+	ulint		total = 0;
+
+	while (*format != '\0') {
+
+		/* Scratch space for the decimal digits of one "%lu". */
+		char		digits[32];
+
+		/* Set when the conversion has the 'l' length modifier. */
+		ibool		long_mod = FALSE;
+
+		/* Where the bytes of the current conversion are read
+		from, and how many of them there are. */
+		const char*	src = "";
+		size_t		n_out = 0;
+
+		/* Character under the cursor; the cursor moves past it. */
+		const char	ch = *format++;
+
+		if (ch != '%') {
+			/* Ordinary character: passed through verbatim. */
+
+			total++;
+
+			if (buf != NULL) {
+				*buf++ = ch;
+			}
+
+			continue;
+		}
+
+		if (*format == 'l') {
+			long_mod = TRUE;
+			format++;
+		}
+
+		/* Dispatch on the conversion character; each case only
+		selects what to append, the copy itself is shared below. */
+		switch (*format++) {
+		case 's':
+			/* string */
+
+			src = va_arg(ap, char*);
+
+			/* "%ls" is a non-sensical format specifier. */
+			ut_a(!long_mod);
+
+			n_out = strlen(src);
+
+			break;
+
+		case 'u':
+			/* unsigned int */
+
+			/* We only support 'long' values for now. */
+			ut_a(long_mod);
+
+			n_out = sprintf(digits, "%lu",
+					va_arg(ap, unsigned long));
+			src = digits;
+
+			break;
+
+		case '%':
+
+			/* "%l%" is a non-sensical format specifier. */
+			ut_a(!long_mod);
+
+			src = "%";
+			n_out = 1;
+
+			break;
+
+		default:
+			ut_error;
+		}
+
+		/* Account for the converted text and, when a buffer was
+		given, append it there. */
+		total += n_out;
+
+		if (buf != NULL) {
+			memcpy(buf, src, n_out);
+			buf += n_out;
+		}
+	}
+
+	/* For the NUL character. */
+	total++;
+
+	if (buf != NULL) {
+		*buf = '\0';
+	}
+
+	return(total);
+}
+
+/****************************************************************//**
+A minimal (s)printf substitute whose result lives in memory taken from
+the given heap. Only a small part of the printf syntax is understood:
+the conversions 's' and 'u', the literal "%%", and the length modifier
+'l' (mandatory for 'u', rejected elsewhere).
+@return	heap-allocated formatted string */
+UNIV_INTERN
+char*
+mem_heap_printf(
+/*============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	const char*	format,	/*!< in: format string */
+	...)
+{
+	va_list		args;
+	ulint 		size;
+	char*		out;
+
+	/* First pass: a NULL buffer makes the helper only measure the
+	result (the count includes the terminating NUL). */
+	va_start(args, format);
+	size = mem_heap_printf_low(NULL, format, args);
+	va_end(args);
+
+	/* Second pass: format into heap memory of exactly that size. */
+	out = mem_heap_alloc(heap, size);
+	va_start(args, format);
+	mem_heap_printf_low(out, format, args);
+	va_end(args);
+
+	return(out);
+}
+
+/***************************************************************//**
+Creates a memory heap block where data can be allocated.
+@return own: memory heap block, NULL if did not succeed (only possible
+for MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_create_block(
+/*==================*/
+	mem_heap_t*	heap,	/*!< in: memory heap or NULL if first block
+				should be created */
+	ulint		n,	/*!< in: number of bytes needed for user data */
+	ulint		type,	/*!< in: type of heap: MEM_HEAP_DYNAMIC or
+				MEM_HEAP_BUFFER */
+	const char*	file_name,/*!< in: file name where created */
+	ulint		line)	/*!< in: line where created */
+{
+#ifndef UNIV_HOTBACKUP
+	buf_block_t*	buf_block = NULL;
+#endif /* !UNIV_HOTBACKUP */
+	mem_block_t*	block;
+	ulint		len;
+
+	ut_ad((type == MEM_HEAP_DYNAMIC) || (type == MEM_HEAP_BUFFER)
+	      || (type == MEM_HEAP_BUFFER + MEM_HEAP_BTR_SEARCH));
+
+	if (heap && heap->magic_n != MEM_BLOCK_MAGIC_N) {
+		mem_analyze_corruption(heap);
+	}
+
+	/* In dynamic allocation, calculate the size: block header + data. */
+	len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+
+#ifndef UNIV_HOTBACKUP
+	if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+		/* Dynamic or small request: served from the common pool. */
+		ut_ad(type == MEM_HEAP_DYNAMIC || n <= MEM_MAX_ALLOC_IN_BUF);
+
+		block = mem_area_alloc(&len, mem_comm_pool);
+	} else {
+		len = UNIV_PAGE_SIZE;
+		/* Otherwise the block occupies a buffer frame. */
+		if ((type & MEM_HEAP_BTR_SEARCH) && heap) {
+			/* We cannot allocate the block from the
+			buffer pool, but must get the free block from
+			the heap header free block field */
+
+			buf_block = heap->free_block;
+			heap->free_block = NULL;
+
+			if (UNIV_UNLIKELY(!buf_block)) {
+
+				return(NULL);
+			}
+		} else {
+			buf_block = buf_block_alloc(0);
+		}
+
+		block = (mem_block_t*) buf_block->frame;
+	}
+	/* NOTE(review): block is only checked by a debug assertion here */
+	ut_ad(block);
+	block->buf_block = buf_block;
+	block->free_block = NULL;
+#else /* !UNIV_HOTBACKUP */
+	len = MEM_BLOCK_HEADER_SIZE + MEM_SPACE_NEEDED(n);
+	block = ut_malloc(len);
+	ut_ad(block);
+#endif /* !UNIV_HOTBACKUP */
+
+	block->magic_n = MEM_BLOCK_MAGIC_N;
+	ut_strlcpy_rev(block->file_name, file_name, sizeof(block->file_name));
+	block->line = line;
+
+#ifdef MEM_PERIODIC_CHECK
+	mutex_enter(&(mem_comm_pool->mutex));
+
+	if (!mem_block_list_inited) {
+		mem_block_list_inited = TRUE;
+		UT_LIST_INIT(mem_block_list);
+	}
+
+	UT_LIST_ADD_LAST(mem_block_list, mem_block_list, block);
+
+	mutex_exit(&(mem_comm_pool->mutex));
+#endif
+	mem_block_set_len(block, len);
+	mem_block_set_type(block, type);
+	mem_block_set_free(block, MEM_BLOCK_HEADER_SIZE);
+	mem_block_set_start(block, MEM_BLOCK_HEADER_SIZE);
+
+	if (UNIV_UNLIKELY(heap == NULL)) {
+		/* This is the first block of the heap. The field
+		total_size should be initialized here */
+		block->total_size = len;
+	} else {
+		/* Not the first allocation for the heap. This block's
+		total_length field should be set to undefined. */
+		ut_d(block->total_size = ULINT_UNDEFINED);
+		UNIV_MEM_INVALID(&block->total_size,
+				 sizeof block->total_size);
+
+		heap->total_size += len;
+	}
+
+	ut_ad((ulint)MEM_BLOCK_HEADER_SIZE < len);
+
+	return(block);
+}
+
+/***************************************************************//**
+Appends a new block to a memory heap. The block is twice the size of the
+current last block, capped by a per-type limit, but never below n.
+@return created block, NULL if did not succeed (only possible for
+MEM_HEAP_BTR_SEARCH type heaps) */
+UNIV_INTERN
+mem_block_t*
+mem_heap_add_block(
+/*===============*/
+	mem_heap_t*	heap,	/*!< in: memory heap */
+	ulint		n)	/*!< in: number of bytes user needs */
+{
+	mem_block_t*	tail;
+	mem_block_t*	added;
+	ulint		limit;
+	ulint		size;
+
+	ut_ad(mem_heap_check(heap));
+
+	tail = UT_LIST_GET_LAST(heap->base);
+
+	/* Growth policy: every new block doubles the size of the last
+	one until the limit for the heap type is reached; from then on
+	the size stays put unless the caller asks for more than that. */
+	size = 2 * mem_block_get_len(tail);
+
+	if (heap->type == MEM_HEAP_DYNAMIC) {
+		limit = MEM_BLOCK_STANDARD_SIZE;
+	} else {
+		/* From the buffer pool we allocate buffer frames */
+		ut_a(n <= MEM_MAX_ALLOC_IN_BUF);
+
+		limit = MEM_MAX_ALLOC_IN_BUF;
+	}
+
+	if (size > limit) {
+		size = limit;
+	}
+
+	if (size < n) {
+		size = n;
+	}
+
+	added = mem_heap_create_block(heap, size, heap->type,
+				      heap->file_name, heap->line);
+
+	if (added != NULL) {
+		/* Chain the new block in as the last block, i.e. right
+		after the previous last one. */
+		UT_LIST_INSERT_AFTER(list, heap->base, tail, added);
+	}
+
+	return(added);
+}
+
+/******************************************************************//**
+Frees a block from a memory heap: unlinks it, then releases its memory. */
+UNIV_INTERN
+void
+mem_heap_block_free(
+/*================*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	mem_block_t*	block)	/*!< in: block to free */
+{
+	ulint		type;
+	ulint		len;
+#ifndef UNIV_HOTBACKUP
+	buf_block_t*	buf_block	= block->buf_block;
+#endif /* !UNIV_HOTBACKUP */
+
+	if (block->magic_n != MEM_BLOCK_MAGIC_N) {
+		mem_analyze_corruption(block);
+	}
+
+	UT_LIST_REMOVE(list, heap->base, block);
+
+#ifdef MEM_PERIODIC_CHECK
+	mutex_enter(&(mem_comm_pool->mutex));
+
+	UT_LIST_REMOVE(mem_block_list, mem_block_list, block);
+
+	mutex_exit(&(mem_comm_pool->mutex));
+#endif
+
+	ut_ad(heap->total_size >= block->len);
+	heap->total_size -= block->len;
+	/* Save what is needed below: the block contents may be erased. */
+	type = heap->type;
+	len = block->len;
+	block->magic_n = MEM_FREED_BLOCK_MAGIC_N;
+
+#ifndef UNIV_HOTBACKUP
+	if (!srv_use_sys_malloc) {
+#ifdef UNIV_MEM_DEBUG
+		/* In the debug version we set the memory to a random
+		combination of hex 0xDE and 0xAD. */
+
+		mem_erase_buf((byte*)block, len);
+#else /* UNIV_MEM_DEBUG */
+		UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+
+	}
+	if (type == MEM_HEAP_DYNAMIC || len < UNIV_PAGE_SIZE / 2) {
+
+		ut_ad(!buf_block);
+		mem_area_free(block, mem_comm_pool);
+	} else {
+		ut_ad(type & MEM_HEAP_BUFFER);
+
+		buf_block_free(buf_block);
+	}
+#else /* !UNIV_HOTBACKUP */
+#ifdef UNIV_MEM_DEBUG
+	/* In the debug version we set the memory to a random
+	combination of hex 0xDE and 0xAD. */
+
+	mem_erase_buf((byte*)block, len);
+#else /* UNIV_MEM_DEBUG */
+	UNIV_MEM_ASSERT_AND_FREE(block, len);
+#endif /* UNIV_MEM_DEBUG */
+	ut_free(block);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Releases the block held in heap->free_block, if any, and clears the field. */
+UNIV_INTERN
+void
+mem_heap_free_block_free(
+/*=====================*/
+	mem_heap_t*	heap)	/*!< in: heap */
+{
+	if (!UNIV_LIKELY_NULL(heap->free_block)) {
+		return;
+	}
+
+	buf_block_free(heap->free_block);
+	heap->free_block = NULL;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef MEM_PERIODIC_CHECK
+/******************************************************************//**
+Walks the list of all allocated mem blocks while holding the mem_comm_pool
+mutex and reports every block whose magic number is not intact. */
+UNIV_INTERN
+void
+mem_validate_all_blocks(void)
+/*=========================*/
+{
+	mem_block_t*	cur;
+
+	mutex_enter(&(mem_comm_pool->mutex));
+
+	for (cur = UT_LIST_GET_FIRST(mem_block_list);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(mem_block_list, cur)) {
+
+		/* A wrong magic number triggers the corruption dump. */
+		if (cur->magic_n != MEM_BLOCK_MAGIC_N) {
+			mem_analyze_corruption(cur);
+		}
+	}
+
+	mutex_exit(&(mem_comm_pool->mutex));
+}
+#endif
diff --git a/storage/xtradb/mem/mem0pool.c b/storage/xtradb/mem/mem0pool.c
new file mode 100644
index 00000000000..3291453eeb5
--- /dev/null
+++ b/storage/xtradb/mem/mem0pool.c
@@ -0,0 +1,728 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file mem/mem0pool.c
+The lowest-level memory management
+
+Created 5/12/1997 Heikki Tuuri
+*************************************************************************/
+
+#include "mem0pool.h"
+#ifdef UNIV_NONINL
+#include "mem0pool.ic"
+#endif
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "ut0mem.h"
+#include "ut0lst.h"
+#include "ut0byte.h"
+#include "mem0mem.h"
+#include "srv0start.h"
+
+/* We would like to use also the buffer frames to allocate memory. This
+would be desirable, because then the memory consumption of the database
+would be fixed, and we might even lock the buffer pool to the main memory.
+The problem here is that the buffer management routines can themselves call
+memory allocation, while the buffer pool mutex is reserved.
+
+The main components of the memory consumption are:
+
+1. buffer pool,
+2. parsed and optimized SQL statements,
+3. data dictionary cache,
+4. log buffer,
+5. locks for each transaction,
+6. hash table for the adaptive index,
+7. state and buffers for each SQL query currently being executed,
+8. session for each user, and
+9. stack for each OS thread.
+
+Items 1 and 2 are managed by an LRU algorithm. Items 5 and 6 can potentially
+consume a great deal of memory. Items 7 and 8 should consume quite little,
+and the OS should take care of item 9, which should also consume little memory.
+
+A solution to the memory management:
+
+1. the buffer pool size is set separately;
+2. log buffer size is set separately;
+3. the common pool size for all the other entries, except 8, is set separately.
+
+Problems: we may waste memory if the common pool is set too big. Another
+problem is the locks, which may take very much space in big transactions.
+Then the shared pool size should be set very big. We can allow locks to take
+space from the buffer pool, but the SQL optimizer is then unaware of the
+usable size of the buffer pool. We could also combine the objects in the
+common pool and the buffers in the buffer pool into a single LRU list and
+manage it uniformly, but this approach does not take into account the parsing
+and other costs unique to SQL statements.
+
+The locks for a transaction can be seen as a part of the state of the
+transaction. Hence, they should be stored in the common pool. We still
+have the problem of a very big update transaction, for example, which
+will set very many x-locks on rows, and the locks will consume a lot
+of memory, say, half of the buffer pool size.
+
+Another problem is what to do if we are not able to malloc a requested
+block of memory from the common pool. Then we can request memory from
+the operating system. If it does not help, a system error results.
+
+Because 5 and 6 may potentially consume very much memory, we let them grow
+into the buffer pool. We may let the locks of a transaction take frames
+from the buffer pool, when the corresponding memory heap block has grown to
+the size of a buffer frame. Similarly for the hash node cells of the locks,
+and for the adaptive index. Thus, for each individual transaction, its locks
+can occupy at most about the size of the buffer frame of memory in the common
+pool, and after that its locks will grow into the buffer pool. */
+
+/** Mask used to extract the free bit from area->size_and_free */
+#define MEM_AREA_FREE	1
+
+/** The smallest memory area total size */
+#define MEM_AREA_MIN_SIZE	(2 * MEM_AREA_EXTRA_SIZE)
+
+
+/** Data structure for a memory pool. The space is handed out with the buddy
+algorithm: free list i holds the free areas whose size is 2 to the power i. */
+struct mem_pool_struct{
+	byte*		buf;		/*!< start of the pool memory */
+	ulint		size;		/*!< size of buf in bytes */
+	ulint		reserved;	/*!< bytes currently allocated from
+					the pool, area headers included */
+	mutex_t		mutex;		/*!< mutex protecting this struct */
+	UT_LIST_BASE_NODE_T(mem_area_t)
+			free_list[64];	/*!< lists of free memory areas: an
+					area is put to the list whose number
+					is the 2-logarithm of the area size */
+};
+
+/** The common memory pool */
+UNIV_INTERN mem_pool_t*	mem_comm_pool	= NULL;
+
+/* We use this counter to check that the mem pool mutex does not leak;
+this is to track a strange assertion failure reported at
+mysql@lists.mysql.com */
+
+UNIV_INTERN ulint	mem_n_threads_inside		= 0;
+
+/********************************************************************//**
+Acquires the pool mutex unless srv_shutdown_state has already reached
+SRV_SHUTDOWN_EXIT_THREADS. Meant only for the memory free functions. */
+UNIV_INLINE
+void
+mem_pool_mutex_enter(
+/*=================*/
+	mem_pool_t*	pool)		/*!< in: memory pool */
+{
+	if (srv_shutdown_state >= SRV_SHUTDOWN_EXIT_THREADS) {
+		return;
+	}
+	mutex_enter(&pool->mutex);
+}
+
+/********************************************************************//**
+Releases the pool mutex unless srv_shutdown_state has already reached
+SRV_SHUTDOWN_EXIT_THREADS. The counterpart of mem_pool_mutex_enter(). */
+UNIV_INLINE
+void
+mem_pool_mutex_exit(
+/*================*/
+	mem_pool_t*	pool)		/*!< in: memory pool */
+{
+	if (srv_shutdown_state >= SRV_SHUTDOWN_EXIT_THREADS) {
+		return;
+	}
+	mutex_exit(&pool->mutex);
+}
+
+/********************************************************************//**
+Reads the size stored in an area header, with the free bit masked off.
+@return	area size */
+UNIV_INLINE
+ulint
+mem_area_get_size(
+/*==============*/
+	mem_area_t*	area)	/*!< in: area */
+{
+	return(~MEM_AREA_FREE & area->size_and_free);
+}
+
+/********************************************************************//**
+Stores a new size in an area header; the free bit is left as it was. */
+UNIV_INLINE
+void
+mem_area_set_size(
+/*==============*/
+	mem_area_t*	area,	/*!< in: area */
+	ulint		size)	/*!< in: size */
+{
+	ulint	free_bit = area->size_and_free & MEM_AREA_FREE;
+	area->size_and_free = free_bit | size;
+}
+
+/********************************************************************//**
+Reads the free bit of an area header.
+@return	TRUE if the area is marked free */
+UNIV_INLINE
+ibool
+mem_area_get_free(
+/*==============*/
+	mem_area_t*	area)	/*!< in: area */
+{
+#if MEM_AREA_FREE != TRUE
+# error "TRUE != MEM_AREA_FREE"
+#endif /* the masked bit is returned directly as the ibool value */
+	return(MEM_AREA_FREE & area->size_and_free);
+}
+
+/********************************************************************//**
+Sets or clears the free bit of an area header; the size is left as it was. */
+UNIV_INLINE
+void
+mem_area_set_free(
+/*==============*/
+	mem_area_t*	area,	/*!< in: area */
+	ibool		free)	/*!< in: free bit value */
+{
+#if MEM_AREA_FREE != TRUE
+# error "TRUE != MEM_AREA_FREE"
+#endif
+	area->size_and_free &= ~MEM_AREA_FREE;
+	area->size_and_free |= free;
+}
+
+/********************************************************************//**
+Allocates a pool of the given size and carves its buffer into free areas.
+@return	own: the new memory pool, with nothing reserved yet */
+UNIV_INTERN
+mem_pool_t*
+mem_pool_create(
+/*============*/
+	ulint	size)	/*!< in: pool size in bytes */
+{
+	mem_pool_t*	pool;
+	mem_area_t*	area;
+	ulint		n;
+	ulint		area_size;
+	ulint		used	= 0;
+
+	pool = ut_malloc(sizeof(mem_pool_t));
+
+	/* The buffer is deliberately not zero-filled (the FALSE argument):
+	zeroing happens only at a higher level in mem0mem.c, so that
+	useful Purify warnings are not masked. */
+	pool->buf = ut_malloc_low(size, FALSE, TRUE);
+	pool->size = size;
+	pool->reserved = 0;
+	mutex_create(&pool->mutex, SYNC_MEM_POOL);
+
+	/* Start with every free list empty */
+	for (n = 0; n < 64; n++) {
+		UT_LIST_INIT(pool->free_list[n]);
+	}
+
+	/* Carve the buffer greedily: each round takes the largest power
+	of two that still fits into the remainder, until the remainder
+	is smaller than the minimum area size */
+
+	while (size - used >= MEM_AREA_MIN_SIZE) {
+
+		n = ut_2_log(size - used);
+		area_size = ut_2_exp(n);
+
+		if (area_size > size - used) {
+			/* ut_2_log rounds upward: step down one power */
+			n--;
+			area_size = ut_2_exp(n);
+		}
+
+		area = (mem_area_t*)(pool->buf + used);
+
+		mem_area_set_size(area, area_size);
+		mem_area_set_free(area, TRUE);
+		UNIV_MEM_FREE(MEM_AREA_EXTRA_SIZE + (byte*) area,
+			      area_size - MEM_AREA_EXTRA_SIZE);
+
+		UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area);
+
+		used += area_size;
+	}
+
+	/* Any tail shorter than the minimum area size stays unused */
+
+	ut_ad(size >= used);
+
+	return(pool);
+}
+
+/********************************************************************//**
+Frees a memory pool: first the pool buffer, then the pool struct itself. */
+UNIV_INTERN
+void
+mem_pool_free(
+/*==========*/
+	mem_pool_t*	pool)	/*!< in, own: memory pool */
+{
+	ut_free(pool->buf);	/* must be read before pool itself is freed */
+	ut_free(pool);
+}
+
+/********************************************************************//**
+Fills free list i by splitting one area taken from free list i + 1.
+@return	TRUE if two areas were added to free list i, FALSE if out of space */
+static
+ibool
+mem_pool_fill_free_list(
+/*====================*/
+	ulint		i,	/*!< in: free list index */
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	mem_area_t*	area;
+	mem_area_t*	area2;
+	ibool		ret;
+
+	ut_ad(mutex_own(&(pool->mutex)));
+
+	if (UNIV_UNLIKELY(i >= 63)) {
+		/* We come here when we have run out of space in the
+		memory pool: */
+
+		return(FALSE);
+	}
+
+	area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+
+	if (area == NULL) {
+		if (UT_LIST_GET_LEN(pool->free_list[i + 1]) > 0) {
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: Error: mem pool free list %lu"
+				" length is %lu\n"
+				"InnoDB: though the list is empty!\n",
+				(ulong) i + 1,
+				(ulong)
+				UT_LIST_GET_LEN(pool->free_list[i + 1]));
+		}
+		/* List i + 1 is empty: refill it recursively from list i + 2 */
+		ret = mem_pool_fill_free_list(i + 1, pool);
+
+		if (ret == FALSE) {
+
+			return(FALSE);
+		}
+
+		area = UT_LIST_GET_FIRST(pool->free_list[i + 1]);
+	}
+
+	if (UNIV_UNLIKELY(UT_LIST_GET_LEN(pool->free_list[i + 1]) == 0)) {
+		mem_analyze_corruption(area);
+
+		ut_error;
+	}
+	/* Split the area: its upper half becomes a new free area, area2 */
+	UT_LIST_REMOVE(free_list, pool->free_list[i + 1], area);
+
+	area2 = (mem_area_t*)(((byte*)area) + ut_2_exp(i));
+	UNIV_MEM_ALLOC(area2, MEM_AREA_EXTRA_SIZE);
+
+	mem_area_set_size(area2, ut_2_exp(i));
+	mem_area_set_free(area2, TRUE);
+
+	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area2);
+	/* The lower half keeps its header: only the size in it is halved */
+	mem_area_set_size(area, ut_2_exp(i));
+
+	UT_LIST_ADD_FIRST(free_list, pool->free_list[i], area);
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Allocates memory from a pool, or from the OS if srv_use_sys_malloc is set or
+the pool is exhausted. NOTE: This function should only be used in mem0mem.*!
+@return	own: allocated memory buffer */
+UNIV_INTERN
+void*
+mem_area_alloc(
+/*===========*/
+	ulint*		psize,	/*!< in: requested size in bytes; for optimum
+				space usage, the size should be a power of 2
+				minus MEM_AREA_EXTRA_SIZE;
+				out: usable size in bytes (>= requested);
+				left untouched on the two malloc paths */
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	mem_area_t*	area;
+	ulint		size;
+	ulint		n;
+	ibool		ret;
+
+	/* If we are using os allocator just make a simple call
+	to malloc */
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		return(malloc(*psize));
+	}
+	/* Pick list n so that 2 to the power n holds size plus the header */
+	size = *psize;
+	n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE));
+
+	mutex_enter(&(pool->mutex));
+	mem_n_threads_inside++;
+
+	ut_a(mem_n_threads_inside == 1);
+
+	area = UT_LIST_GET_FIRST(pool->free_list[n]);
+
+	if (area == NULL) {
+		ret = mem_pool_fill_free_list(n, pool);
+
+		if (ret == FALSE) {
+			/* Out of memory in memory pool: we try to allocate
+			from the operating system with the regular malloc: */
+
+			mem_n_threads_inside--;
+			mutex_exit(&(pool->mutex));
+
+			return(ut_malloc(size));
+		}
+
+		area = UT_LIST_GET_FIRST(pool->free_list[n]);
+	}
+
+	if (!mem_area_get_free(area)) {
+		fprintf(stderr,
+			"InnoDB: Error: Removing element from mem pool"
+			" free list %lu though the\n"
+			"InnoDB: element is not marked free!\n",
+			(ulong) n);
+
+		mem_analyze_corruption(area);
+
+		/* Try to analyze a strange assertion failure reported at
+		mysql@lists.mysql.com where the free bit IS 1 in the
+		hex dump above */
+
+		if (mem_area_get_free(area)) {
+			fprintf(stderr,
+				"InnoDB: Probably a race condition"
+				" because now the area is marked free!\n");
+		}
+
+		ut_error;
+	}
+
+	if (UT_LIST_GET_LEN(pool->free_list[n]) == 0) {
+		fprintf(stderr,
+			"InnoDB: Error: Removing element from mem pool"
+			" free list %lu\n"
+			"InnoDB: though the list length is 0!\n",
+			(ulong) n);
+		mem_analyze_corruption(area);
+
+		ut_error;
+	}
+
+	ut_ad(mem_area_get_size(area) == ut_2_exp(n));
+
+	mem_area_set_free(area, FALSE);
+
+	UT_LIST_REMOVE(free_list, pool->free_list[n], area);
+
+	pool->reserved += mem_area_get_size(area);
+
+	mem_n_threads_inside--;
+	mutex_exit(&(pool->mutex));
+
+	ut_ad(mem_pool_validate(pool));
+	/* Report the usable size: the whole area minus its header */
+	*psize = ut_2_exp(n) - MEM_AREA_EXTRA_SIZE;
+	UNIV_MEM_ALLOC(MEM_AREA_EXTRA_SIZE + (byte*)area, *psize);
+
+	return((void*)(MEM_AREA_EXTRA_SIZE + ((byte*)area)));
+}
+
+/********************************************************************//**
+Computes where the buddy of an area lies: the neighbouring area of the same
+size with which it would pair up into one area of twice the size. Only the
+address is computed here; the buddy's own header is not inspected.
+@return	the buddy, NULL if it would not be wholly contained in the pool */
+UNIV_INLINE
+mem_area_t*
+mem_area_get_buddy(
+/*===============*/
+	mem_area_t*	area,	/*!< in: memory area */
+	ulint		size,	/*!< in: memory area size */
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	byte*	area_ptr	= (byte*) area;
+
+	ut_ad(size != 0);
+
+	if (((area_ptr - pool->buf) % (2 * size)) != 0) {
+
+		/* The area is the upper half of its pair, so the buddy
+		is in a lower address. NOTE that an area at the pool lower
+		end never comes here, because there the remainder is 0. */
+
+		return((mem_area_t*)(area_ptr - size));
+	}
+
+	/* The area is the lower half of its pair: the buddy starts right
+	after it, in a higher address */
+
+	if (((area_ptr + size) - pool->buf) + size > pool->size) {
+
+		/* The buddy is not wholly contained in the pool:
+		there is no buddy */
+
+		return(NULL);
+	}
+
+	return((mem_area_t*)(area_ptr + size));
+}
+
+/********************************************************************//**
+Returns memory to a pool, coalescing the area with its buddy if that is free. */
+UNIV_INTERN
+void
+mem_area_free(
+/*==========*/
+	void*		ptr,	/*!< in, own: pointer to allocated memory
+				buffer */
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	mem_area_t*	area;
+	mem_area_t*	buddy;
+	void*		new_ptr;
+	ulint		size;
+	ulint		n;
+
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		free(ptr);
+
+		return;
+	}
+
+	/* It may be that the area was really allocated from the OS with
+	regular malloc: check if ptr points within our memory pool */
+
+	if ((byte*)ptr < pool->buf || (byte*)ptr >= pool->buf + pool->size) {
+		ut_free(ptr);
+
+		return;
+	}
+
+	area = (mem_area_t*) (((byte*)ptr) - MEM_AREA_EXTRA_SIZE);
+
+	if (mem_area_get_free(area)) {
+		fprintf(stderr,
+			"InnoDB: Error: Freeing element to mem pool"
+			" free list though the\n"
+			"InnoDB: element is marked free!\n");
+
+		mem_analyze_corruption(area);
+		ut_error;
+	}
+	/* NOTE(review): size is passed to UNIV_MEM_FREE before the 0 check */
+	size = mem_area_get_size(area);
+	UNIV_MEM_FREE(ptr, size - MEM_AREA_EXTRA_SIZE);
+
+	if (size == 0) {
+		fprintf(stderr,
+			"InnoDB: Error: Mem area size is 0. Possibly a"
+			" memory overrun of the\n"
+			"InnoDB: previous allocated area!\n");
+
+		mem_analyze_corruption(area);
+		ut_error;
+	}
+
+#ifdef UNIV_LIGHT_MEM_DEBUG
+	if (((byte*)area) + size < pool->buf + pool->size) {
+
+		ulint	next_size;
+
+		next_size = mem_area_get_size(
+			(mem_area_t*)(((byte*)area) + size));
+		if (UNIV_UNLIKELY(!next_size || !ut_is_2pow(next_size))) {
+			fprintf(stderr,
+				"InnoDB: Error: Memory area size %lu,"
+				" next area size %lu not a power of 2!\n"
+				"InnoDB: Possibly a memory overrun of"
+				" the buffer being freed here.\n",
+				(ulong) size, (ulong) next_size);
+			mem_analyze_corruption(area);
+
+			ut_error;
+		}
+	}
+#endif
+	buddy = mem_area_get_buddy(area, size, pool);
+
+	n = ut_2_log(size);
+
+	mem_pool_mutex_enter(pool);
+	mem_n_threads_inside++;
+
+	ut_a(mem_n_threads_inside == 1);
+
+	if (buddy && mem_area_get_free(buddy)
+	    && (size == mem_area_get_size(buddy))) {
+
+		/* The buddy is in a free list */
+
+		if ((byte*)buddy < (byte*)area) {
+			new_ptr = ((byte*)buddy) + MEM_AREA_EXTRA_SIZE;
+
+			mem_area_set_size(buddy, 2 * size);
+			mem_area_set_free(buddy, FALSE);
+		} else {
+			new_ptr = ptr;
+
+			mem_area_set_size(area, 2 * size);
+		}
+
+		/* Remove the buddy from its free list and merge it to area */
+
+		UT_LIST_REMOVE(free_list, pool->free_list[n], buddy);
+		/* The recursive free below subtracts 2 * size: compensate */
+		pool->reserved += ut_2_exp(n);
+
+		mem_n_threads_inside--;
+		mem_pool_mutex_exit(pool);
+
+		mem_area_free(new_ptr, pool);
+
+		return;
+	} else {
+		UT_LIST_ADD_FIRST(free_list, pool->free_list[n], area);
+
+		mem_area_set_free(area, TRUE);
+
+		ut_ad(pool->reserved >= size);
+
+		pool->reserved -= size;
+	}
+
+	mem_n_threads_inside--;
+	mem_pool_mutex_exit(pool);
+
+	ut_ad(mem_pool_validate(pool));
+}
+
+/********************************************************************//**
+Checks the consistency of the free lists of a memory pool. Any violation
+trips a ut_a() assertion; the normal return value is always TRUE.
+@return	TRUE */
+UNIV_INTERN
+ibool
+mem_pool_validate(
+/*==============*/
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	mem_area_t*	area;
+	mem_area_t*	buddy;
+	ulint		free	= 0;
+	ulint		i;
+
+	mem_pool_mutex_enter(pool);
+
+	for (i = 0; i < 64; i++) {
+
+		UT_LIST_VALIDATE(free_list, mem_area_t, pool->free_list[i],
+				 (void) 0);
+
+		/* Every member of list i must be marked free, be of size
+		2 to the power i, and must not have a free buddy of the
+		same size */
+		for (area = UT_LIST_GET_FIRST(pool->free_list[i]);
+		     area != NULL;
+		     area = UT_LIST_GET_NEXT(free_list, area)) {
+			ut_a(mem_area_get_free(area));
+			ut_a(mem_area_get_size(area) == ut_2_exp(i));
+
+			buddy = mem_area_get_buddy(area, ut_2_exp(i), pool);
+
+			ut_a(!buddy || !mem_area_get_free(buddy)
+			     || (ut_2_exp(i) != mem_area_get_size(buddy)));
+
+			free += ut_2_exp(i);
+		}
+	}
+
+	ut_a(free + pool->reserved == pool->size);
+
+	mem_pool_mutex_exit(pool);
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Validates a memory pool and then prints a summary of it to outfile. */
+UNIV_INTERN
+void
+mem_pool_print_info(
+/*================*/
+	FILE*		outfile,/*!< in: output file to write to */
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	ulint		i;
+	ulint		len;
+
+	mem_pool_validate(pool);
+	fprintf(outfile, "INFO OF A MEMORY POOL\n");
+	mutex_enter(&pool->mutex);
+
+	for (i = 0; i < 64; i++) {
+		len = UT_LIST_GET_LEN(pool->free_list[i]);
+		if (len == 0) {
+			/* Empty free lists are not reported */
+			continue;
+		}
+		fprintf(outfile,
+			"Free list length %lu for"
+			" blocks of size %lu\n",
+			(ulong) len, (ulong) ut_2_exp(i));
+	}
+
+	fprintf(outfile, "Pool size %lu, reserved %lu.\n",
+		(ulong) pool->size, (ulong) pool->reserved);
+	mutex_exit(&pool->mutex);
+}
+
+/********************************************************************//**
+Reads the reserved byte count of a pool while holding the pool mutex.
+@return	reserved memory in bytes */
+UNIV_INTERN
+ulint
+mem_pool_get_reserved(
+/*==================*/
+	mem_pool_t*	pool)	/*!< in: memory pool */
+{
+	ulint	snapshot;
+
+	/* Take the snapshot under the mutex; the counter may change
+	again as soon as the mutex is released */
+	mutex_enter(&pool->mutex);
+	snapshot = pool->reserved;
+	mutex_exit(&pool->mutex);
+
+	return(snapshot);
+}
diff --git a/storage/xtradb/mtr/mtr0log.c b/storage/xtradb/mtr/mtr0log.c
new file mode 100644
index 00000000000..d22015a575f
--- /dev/null
+++ b/storage/xtradb/mtr/mtr0log.c
@@ -0,0 +1,612 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0log.c
+Mini-transaction log routines
+
+Created 12/7/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0log.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0log.ic"
+#endif
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "log0recv.h"
+#include "page0page.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "dict0boot.h"
+
+/********************************************************//**
+Appends len bytes to the log of a mini-transaction. Does nothing if the
+log mode of the mtr is MTR_LOG_NONE. */
+UNIV_INTERN
+void
+mlog_catenate_string(
+/*=================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len)	/*!< in: string length */
+{
+	dyn_array_t*	mlog	= &mtr->log;
+
+	if (mtr_get_log_mode(mtr) != MTR_LOG_NONE) {
+
+		/* Logging is enabled: push the bytes to the end of
+		the log of this mtr */
+
+		dyn_push_string(mlog, str, len);
+	}
+}
+
+/********************************************************//**
+Writes the initial part of a log record: the one-byte item type followed
+by the space id and the page number. Also pushes info to the mtr memo
+that a buffer page has been modified. */
+UNIV_INTERN
+void
+mlog_write_initial_log_record(
+/*==========================*/
+	const byte*	ptr,	/*!< in: pointer to (inside) a buffer
+				frame holding the file page where
+				modification is made */
+	byte		type,	/*!< in: log item type: MLOG_1BYTE, ... */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	byte*	buf;
+	byte*	end;
+
+	ut_ad(type <= MLOG_BIGGEST_TYPE);
+	ut_ad(type > MLOG_8BYTES);
+
+	/* Ask for 11 bytes of mtr log buffer for the record */
+	buf = mlog_open(mtr, 11);
+
+	/* A NULL buffer means that no logging is requested */
+	if (buf != NULL) {
+
+		end = mlog_write_initial_log_record_fast(ptr, type, buf, mtr);
+
+		mlog_close(mtr, end);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses an initial log record written by mlog_write_initial_log_record.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_initial_log_record(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	type,	/*!< out: log record type: MLOG_1BYTE, ... */
+	ulint*	space,	/*!< out: space id */
+	ulint*	page_no)/*!< out: page number */
+{
+	/* The type byte must be present */
+	if (ptr + 1 > end_ptr) {
+
+		return(NULL);
+	}
+
+	/* Clear the MLOG_SINGLE_REC_FLAG bits, leaving the bare type */
+	*type = (byte)((ulint) ptr[0] & ~MLOG_SINGLE_REC_FLAG);
+	ut_ad(*type <= MLOG_BIGGEST_TYPE);
+
+	ptr += 1;
+
+	/* At least 2 more bytes are needed for space id and page number */
+	if (ptr + 2 > end_ptr) {
+
+		return(NULL);
+	}
+
+	ptr = mach_parse_compressed(ptr, end_ptr, space);
+
+	if (ptr != NULL) {
+		ptr = mach_parse_compressed(ptr, end_ptr, page_no);
+	}
+
+	return(ptr);
+}
+
+/********************************************************//**
+Parses a log record written by mlog_write_ulint or mlog_write_dulint.
+@return	parsed record end, NULL if not a complete record or a corrupt record */
+UNIV_INTERN
+byte*
+mlog_parse_nbytes(
+/*==============*/
+	ulint	type,	/*!< in: log record type: MLOG_1BYTE, ... */
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	page,	/*!< in: page where to apply the log record, or NULL */
+	void*	page_zip)/*!< in/out: compressed page, or NULL */
+{
+	ulint	offset;
+	ulint	val;
+	dulint	dval;
+
+	ut_a(type <= MLOG_8BYTES);
+	ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	ptr += 2;
+
+	if (offset >= UNIV_PAGE_SIZE) {
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+	/* An 8-byte value is parsed as a compressed dulint */
+	if (type == MLOG_8BYTES) {
+		ptr = mach_dulint_parse_compressed(ptr, end_ptr, &dval);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		if (page) {
+			if (UNIV_LIKELY_NULL(page_zip)) {
+				mach_write_to_8
+					(((page_zip_des_t*) page_zip)->data
+					 + offset, dval);
+			}
+			mach_write_to_8(page + offset, dval);
+		}
+
+		return(ptr);
+	}
+	/* Shorter values are parsed as a compressed ulint */
+	ptr = mach_parse_compressed(ptr, end_ptr, &val);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+	/* A value too wide for its 1 or 2-byte type means a corrupt log */
+	switch (type) {
+	case MLOG_1BYTE:
+		if (UNIV_UNLIKELY(val > 0xFFUL)) {
+			goto corrupt;
+		}
+		if (page) {
+			if (UNIV_LIKELY_NULL(page_zip)) {
+				mach_write_to_1
+					(((page_zip_des_t*) page_zip)->data
+					 + offset, val);
+			}
+			mach_write_to_1(page + offset, val);
+		}
+		break;
+	case MLOG_2BYTES:
+		if (UNIV_UNLIKELY(val > 0xFFFFUL)) {
+			goto corrupt;
+		}
+		if (page) {
+			if (UNIV_LIKELY_NULL(page_zip)) {
+				mach_write_to_2
+					(((page_zip_des_t*) page_zip)->data
+					 + offset, val);
+			}
+			mach_write_to_2(page + offset, val);
+		}
+		break;
+	case MLOG_4BYTES:
+		if (page) {
+			if (UNIV_LIKELY_NULL(page_zip)) {
+				mach_write_to_4
+					(((page_zip_des_t*) page_zip)->data
+					 + offset, val);
+			}
+			mach_write_to_4(page + offset, val);
+		}
+		break;
+	default:
+	corrupt:
+		recv_sys->found_corrupt_log = TRUE;
+		ptr = NULL;
+	}
+
+	return(ptr);
+}
+
+/********************************************************//**
+Writes a 1, 2 or 4-byte value to a file page buffered in the buffer pool.
+Writes the corresponding log record to the mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_ulint(
+/*=============*/
+	byte*	ptr,	/*!< in: pointer where to write */
+	ulint	val,	/*!< in: value to write */
+	byte	type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	byte*	log_ptr;
+
+	/* First store the value on the page itself */
+	if (type == MLOG_1BYTE) {
+		mach_write_to_1(ptr, val);
+	} else if (type == MLOG_2BYTES) {
+		mach_write_to_2(ptr, val);
+	} else if (type == MLOG_4BYTES) {
+		mach_write_to_4(ptr, val);
+	} else {
+		/* Only the 1, 2 and 4-byte types are accepted */
+		ut_error;
+	}
+
+	/* Then log the write: the initial record, the 2-byte page offset
+	and the value in compressed form */
+	log_ptr = mlog_open(mtr, 11 + 2 + 5);
+
+	if (log_ptr == NULL) {
+		/* No logging is requested */
+
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_fast(ptr, type, log_ptr, mtr);
+
+	mach_write_to_2(log_ptr, page_offset(ptr));
+
+	/* Step over the offset and over the compressed value */
+	log_ptr += 2 + mach_write_compressed(log_ptr + 2, val);
+
+	mlog_close(mtr, log_ptr);
+}
+
+/********************************************************//**
+Writes an 8-byte value to a file page buffered in the buffer pool and,
+if logging is requested, appends the corresponding log record to the
+mini-transaction log. */
+UNIV_INTERN
+void
+mlog_write_dulint(
+/*==============*/
+	byte*	ptr,	/*!< in: pointer where to write */
+	dulint	val,	/*!< in: value to write */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	byte*	log_ptr;
+
+	ut_ad(ptr && mtr);
+
+	/* Store the value on the page first, then log the change */
+	mach_write_to_8(ptr, val);
+
+	log_ptr = mlog_open(mtr, 11 + 2 + 9);
+
+	/* NULL means that no logging is requested */
+	if (log_ptr != NULL) {
+
+		log_ptr = mlog_write_initial_log_record_fast(
+			ptr, MLOG_8BYTES, log_ptr, mtr);
+
+		/* The 2-byte page offset, then the compressed value */
+		mach_write_to_2(log_ptr, page_offset(ptr));
+		log_ptr += 2;
+		log_ptr += mach_dulint_write_compressed(log_ptr, val);
+
+		mlog_close(mtr, log_ptr);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Copies a string to a file page buffered in the buffer pool and then logs
+the write to the mini-transaction log by calling mlog_log_string(). */
+UNIV_INTERN
+void
+mlog_write_string(
+/*==============*/
+	byte*		ptr,	/*!< in: pointer where to write */
+	const byte*	str,	/*!< in: string to write */
+	ulint		len,	/*!< in: string length, < UNIV_PAGE_SIZE */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	ut_ad(ptr && mtr);
+	ut_a(len < UNIV_PAGE_SIZE);
+
+	memcpy(ptr, str, len);
+	/* The logging call is given the page copy (ptr), not str */
+	mlog_log_string(ptr, len, mtr);
+}
+
+/********************************************************//**
+Logs a string that has already been written to a file page buffered in the
+buffer pool, by appending an MLOG_WRITE_STRING record to the mtr log. */
+UNIV_INTERN
+void
+mlog_log_string(
+/*============*/
+	byte*	ptr,	/*!< in: pointer written to */
+	ulint	len,	/*!< in: string length */
+	mtr_t*	mtr)	/*!< in: mini-transaction handle */
+{
+	byte*	log_ptr;
+
+	ut_ad(ptr && mtr);
+	ut_ad(len <= UNIV_PAGE_SIZE);
+
+	log_ptr = mlog_open(mtr, 30);
+
+	if (!log_ptr) {
+		/* No logging is requested */
+
+		return;
+	}
+
+	log_ptr = mlog_write_initial_log_record_fast(
+		ptr, MLOG_WRITE_STRING, log_ptr, mtr);
+
+	/* After the initial record: 2-byte page offset, 2-byte length */
+	mach_write_to_2(log_ptr, page_offset(ptr));
+	mach_write_to_2(log_ptr + 2, len);
+
+	mlog_close(mtr, log_ptr + 4);
+
+	/* The string bytes themselves follow, read back from the page */
+	mlog_catenate_string(mtr, ptr, len);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses a log record written by mlog_write_string; applies it if page != NULL.
+@return	parsed record end, NULL if not a complete record or a corrupt one */
+UNIV_INTERN
+byte*
+mlog_parse_string(
+/*==============*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	byte*	page,	/*!< in: page where to apply the log record, or NULL */
+	void*	page_zip)/*!< in/out: compressed page, or NULL */
+{
+	ulint	offset;
+	ulint	len;
+
+	ut_a(!page || !page_zip || fil_page_get_type(page) != FIL_PAGE_INDEX);
+
+	if (ptr + 4 > end_ptr) {
+		/* The 2-byte offset and 2-byte length are not all there */
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	len = mach_read_from_2(ptr + 2);
+	ptr += 4;
+
+	if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)
+	    || UNIV_UNLIKELY(len + offset > UNIV_PAGE_SIZE)) {
+		/* The string would not fit on a page */
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+
+	if (ptr + len > end_ptr) {
+		/* The string itself is not completely in the buffer */
+		return(NULL);
+	}
+
+	if (page && UNIV_LIKELY_NULL(page_zip)) {
+		/* Apply the same bytes to the compressed page too */
+		memcpy(((page_zip_des_t*) page_zip)->data + offset, ptr, len);
+	}
+	if (page) {
+		memcpy(page + offset, ptr, len);
+	}
+
+	return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************//**
+Opens a buffer for mlog, writes the initial log record and, for a record
+in the compact format, the field lengths of the index.
+@return	buffer, NULL if log mode MTR_LOG_NONE or if size is 0 */
+UNIV_INTERN
+byte*
+mlog_open_and_write_index(
+/*======================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	rec,	/*!< in: index record or page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	byte		type,	/*!< in: log item type */
+	ulint		size)	/*!< in: requested buffer size in bytes
+				(if 0, calls mlog_close() and returns NULL) */
+{
+	byte*		log_ptr;
+	const byte*	log_start;
+	const byte*	log_end;
+
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+	if (!page_rec_is_comp(rec)) {
+		log_start = log_ptr = mlog_open(mtr, 11 + size);
+		if (!log_ptr) {
+			return(NULL); /* logging is disabled */
+		}
+		log_ptr = mlog_write_initial_log_record_fast(rec, type,
+							     log_ptr, mtr);
+		log_end = log_ptr + 11 + size;
+	} else {
+		ulint	i;
+		ulint	n	= dict_index_get_n_fields(index);
+		/* total size needed: 11 + size plus (n + 2) 2-byte values */
+		ulint	total	= 11 + size + (n + 2) * 2;
+		ulint	alloc	= total;
+		/* allocate at most DYN_ARRAY_DATA_SIZE at a time */
+		if (alloc > DYN_ARRAY_DATA_SIZE) {
+			alloc = DYN_ARRAY_DATA_SIZE;
+		}
+		log_start = log_ptr = mlog_open(mtr, alloc);
+		if (!log_ptr) {
+			return(NULL); /* logging is disabled */
+		}
+		log_end = log_ptr + alloc;
+		log_ptr = mlog_write_initial_log_record_fast(rec, type,
+							     log_ptr, mtr);
+		mach_write_to_2(log_ptr, n);
+		log_ptr += 2;
+		mach_write_to_2(log_ptr,
+				dict_index_get_n_unique_in_tree(index));
+		log_ptr += 2;
+		for (i = 0; i < n; i++) {
+			dict_field_t*		field;
+			const dict_col_t*	col;
+			ulint			len;
+
+			field = dict_index_get_nth_field(index, i);
+			col = dict_field_get_col(field);
+			len = field->fixed_len;
+			ut_ad(len < 0x7fff);
+			if (len == 0
+			    && (col->len > 255 || col->mtype == DATA_BLOB)) {
+				/* variable-length field
+				with maximum length > 255 */
+				len = 0x7fff;
+			}
+			if (col->prtype & DATA_NOT_NULL) {
+				len |= 0x8000;
+			}
+			if (log_ptr + 2 > log_end) {
+				mlog_close(mtr, log_ptr);
+				ut_a(total > (ulint) (log_ptr - log_start));
+				total -= log_ptr - log_start;
+				alloc = total;
+				if (alloc > DYN_ARRAY_DATA_SIZE) {
+					alloc = DYN_ARRAY_DATA_SIZE;
+				}
+				log_start = log_ptr = mlog_open(mtr, alloc);
+				if (!log_ptr) {
+					return(NULL); /* logging is disabled */
+				}
+				log_end = log_ptr + alloc;
+			}
+			mach_write_to_2(log_ptr, len);
+			log_ptr += 2;
+		}
+	}
+	if (size == 0) {
+		mlog_close(mtr, log_ptr);
+		log_ptr = NULL;
+	} else if (log_ptr + size > log_end) {
+		mlog_close(mtr, log_ptr);
+		log_ptr = mlog_open(mtr, size);
+	}
+	return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Parses the index info written by mlog_open_and_write_index into a dummy index.
+@return	parsed record end, NULL if not a complete record */
+UNIV_INTERN
+byte*
+mlog_parse_index(
+/*=============*/
+	byte*		ptr,	/*!< in: buffer */
+	const byte*	end_ptr,/*!< in: buffer end */
+	ibool		comp,	/*!< in: TRUE=compact record format */
+	dict_index_t**	index)	/*!< out, own: dummy index */
+{
+	ulint		i, n, n_uniq;
+	dict_table_t*	table;
+	dict_index_t*	ind;
+
+	ut_ad(comp == FALSE || comp == TRUE);
+
+	if (comp) {
+		if (end_ptr < ptr + 4) {
+			return(NULL);
+		}
+		n = mach_read_from_2(ptr);
+		ptr += 2;
+		n_uniq = mach_read_from_2(ptr);
+		ptr += 2;
+		ut_ad(n_uniq <= n);
+		if (end_ptr < ptr + n * 2) {
+			return(NULL);
+		}
+	} else {
+		n = n_uniq = 1;	/* no field info is logged in this format */
+	}
+	table = dict_mem_table_create("LOG_DUMMY", DICT_HDR_SPACE, n,
+				      comp ? DICT_TF_COMPACT : 0);
+	ind = dict_mem_index_create("LOG_DUMMY", "LOG_DUMMY",
+				    DICT_HDR_SPACE, 0, n);
+	ind->table = table;
+	ind->n_uniq = (unsigned int) n_uniq;
+	if (n_uniq != n) {
+		ut_a(n_uniq + DATA_ROLL_PTR <= n);
+		ind->type = DICT_CLUSTERED;
+	}
+	if (comp) {
+		for (i = 0; i < n; i++) {
+			ulint	len = mach_read_from_2(ptr);
+			ptr += 2;
+			/* The high-order bit of len is the NOT NULL flag;
+			the rest is 0 or 0x7fff for variable-length fields,
+			and 1..0x7ffe for fixed-length fields. */
+			dict_mem_table_add_col(
+				table, NULL, NULL,
+				((len + 1) & 0x7fff) <= 1
+				? DATA_BINARY : DATA_FIXBINARY,
+				len & 0x8000 ? DATA_NOT_NULL : 0,
+				len & 0x7fff);
+
+			dict_index_add_col(ind, table,
+					   dict_table_get_nth_col(table, i),
+					   0);
+		}
+		dict_table_add_system_columns(table, table->heap);
+		if (n_uniq != n) {
+			/* Identify DB_TRX_ID and DB_ROLL_PTR in the index. */
+			ut_a(DATA_TRX_ID_LEN
+			     == dict_index_get_nth_col(ind, DATA_TRX_ID - 1
+						       + n_uniq)->len);
+			ut_a(DATA_ROLL_PTR_LEN
+			     == dict_index_get_nth_col(ind, DATA_ROLL_PTR - 1
+						       + n_uniq)->len);
+			ind->fields[DATA_TRX_ID - 1 + n_uniq].col
+				= &table->cols[n + DATA_TRX_ID];
+			ind->fields[DATA_ROLL_PTR - 1 + n_uniq].col
+				= &table->cols[n + DATA_ROLL_PTR];
+		}
+	}
+	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+	ind->cached = TRUE;
+	*index = ind;
+	return(ptr);
+}
diff --git a/storage/xtradb/mtr/mtr0mtr.c b/storage/xtradb/mtr/mtr0mtr.c
new file mode 100644
index 00000000000..34e6d3ffc92
--- /dev/null
+++ b/storage/xtradb/mtr/mtr0mtr.c
@@ -0,0 +1,401 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file mtr/mtr0mtr.c
+Mini-transaction buffer
+
+Created 11/26/1995 Heikki Tuuri
+*******************************************************/
+
+#include "mtr0mtr.h"
+
+#ifdef UNIV_NONINL
+#include "mtr0mtr.ic"
+#endif
+
+#include "buf0buf.h"
+#include "page0types.h"
+#include "mtr0log.h"
+#include "log0log.h"
+#include "buf0flu.h"
+
+#ifndef UNIV_HOTBACKUP
+# include "log0recv.h"
+/*****************************************************************//**
+Releases the object stored in the given memo slot and clears the slot. */
+UNIV_INLINE
+void
+mtr_memo_slot_release(
+/*==================*/
+	mtr_t*			mtr,	/*!< in: mtr */
+	mtr_memo_slot_t*	slot)	/*!< in: memo slot */
+{
+	void*	object;
+	ulint	type;
+
+	ut_ad(mtr && slot);
+
+	object = slot->object;
+	type = slot->type;
+
+	if (UNIV_LIKELY(object != NULL)) {	/* NULL: already released */
+		if (type <= MTR_MEMO_BUF_FIX) {	/* object is a buf_block_t */
+			buf_page_release((buf_block_t*)object, type, mtr);
+		} else if (type == MTR_MEMO_S_LOCK) {
+			rw_lock_s_unlock((rw_lock_t*)object);
+#ifdef UNIV_DEBUG
+		} else if (type != MTR_MEMO_X_LOCK) {	/* nothing to release */
+			ut_ad(type == MTR_MEMO_MODIFY);
+			ut_ad(mtr_memo_contains(mtr, object,
+						MTR_MEMO_PAGE_X_FIX));
+#endif /* UNIV_DEBUG */
+		} else {
+			rw_lock_x_unlock((rw_lock_t*)object);
+		}
+	}
+
+	slot->object = NULL;	/* mark the slot as released */
+}
+
+/**********************************************************//**
+Releases the latches and other objects stored in an mtr memo. They are released
+in the order opposite to which they were pushed to the memo. NOTE! It is
+essential that the x-rw-lock on a modified buffer page is not released before
+buf_flush_note_modification is called for that page! Otherwise, some thread
+might race to modify it, and the flush list sort order on lsn would be
+destroyed. */
+UNIV_INLINE
+void
+mtr_memo_pop_all(
+/*=============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	mtr_memo_slot_t* slot;
+	dyn_array_t*	memo;
+	ulint		offset;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+					     commit */
+	memo = &(mtr->memo);
+
+	offset = dyn_array_get_data_size(memo);	/* just past the last slot */
+
+	while (offset > 0) {			/* newest slot first */
+		offset -= sizeof(mtr_memo_slot_t);
+		slot = dyn_array_get_element(memo, offset);
+
+		mtr_memo_slot_release(mtr, slot);
+	}
+}
+/** Calls buf_flush_note_modification() on each x-fixed page in the memo. */
+UNIV_INLINE
+void
+mtr_memo_note_modification_all(
+/*===========================*/
+	mtr_t*	mtr)	/*!< in: mtr; must have made modifications */
+{
+	mtr_memo_slot_t* slot;
+	dyn_array_t*	memo;
+	ulint		offset;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
+					     commit */
+	ut_ad(mtr->modifications);
+
+	memo = &(mtr->memo);
+
+	offset = dyn_array_get_data_size(memo);	/* just past the last slot */
+
+	while (offset > 0) {			/* newest slot first */
+		offset -= sizeof(mtr_memo_slot_t);
+		slot = dyn_array_get_element(memo, offset);
+
+		if (UNIV_LIKELY(slot->object != NULL) &&
+		    slot->type == MTR_MEMO_PAGE_X_FIX) {
+			buf_flush_note_modification(
+				(buf_block_t*)slot->object, mtr);
+		}
+	}
+}
+
+/************************************************************//**
+Writes the contents of a mini-transaction log, if any, to the database log. */
+static
+void
+mtr_log_reserve_and_write(
+/*======================*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	dyn_array_t*	mlog;
+	dyn_block_t*	block;
+	ulint		data_size;
+	byte*		first_data;
+
+	ut_ad(mtr);
+
+	mlog = &(mtr->log);
+
+	first_data = dyn_block_get_data(mlog);
+
+	if (mtr->n_log_recs > 1) {	/* several records: append end marker */
+		mlog_catenate_ulint(mtr, MLOG_MULTI_REC_END, MLOG_1BYTE);
+	} else {			/* otherwise flag the first byte */
+		*first_data = (byte)((ulint)*first_data
+				     | MLOG_SINGLE_REC_FLAG);
+	}
+
+	if (mlog->heap == NULL) {	/* try the fast write path first */
+		mtr->end_lsn = log_reserve_and_write_fast(
+			first_data, dyn_block_get_used(mlog),
+			&mtr->start_lsn);
+		if (mtr->end_lsn) {	/* nonzero: skip the slow path below */
+
+			return;
+		}
+	}
+
+	data_size = dyn_array_get_data_size(mlog);
+
+	/* Open the database log for log_write_low */
+	mtr->start_lsn = log_reserve_and_open(data_size);
+
+	if (mtr->log_mode == MTR_LOG_ALL) {
+
+		block = mlog;
+
+		while (block != NULL) {	/* write every block of the mtr log */
+			log_write_low(dyn_block_get_data(block),
+				      dyn_block_get_used(block));
+			block = dyn_array_get_next_block(mlog, block);
+		}
+	} else {
+		ut_ad(mtr->log_mode == MTR_LOG_NONE);
+		/* Do nothing */
+	}
+
+	mtr->end_lsn = log_close();
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Commits a mini-transaction. */
+UNIV_INTERN
+void
+mtr_commit(
+/*=======*/
+	mtr_t*	mtr)	/*!< in: mini-transaction */
+{
+#ifndef UNIV_HOTBACKUP
+	ibool		write_log;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_d(mtr->state = MTR_COMMITTING);
+
+#ifndef UNIV_HOTBACKUP
+	/* This is a dirty read, for debugging. */
+	ut_ad(!recv_no_log_write);
+	write_log = mtr->modifications && mtr->n_log_recs;
+
+	if (write_log) {
+		mtr_log_reserve_and_write(mtr);
+
+		mtr_memo_note_modification_all(mtr);
+	}
+
+	/* We first update the modification info to buffer pages, and only
+	after that release the log mutex: this guarantees that when the log
+	mutex is free, all buffer pages contain an up-to-date info of their
+	modifications. This fact is used in making a checkpoint when we look
+	at the oldest modification of any page in the buffer pool. It is also
+	required when we insert modified buffer pages into the flush list
+	which must be sorted on oldest_modification. */
+
+	if (write_log) {
+		log_release();
+	}
+
+	/* All unlocking has been moved here, after log_sys mutex release. */
+	mtr_memo_pop_all(mtr);
+
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_d(mtr->state = MTR_COMMITTED);
+	dyn_array_free(&(mtr->memo));
+	dyn_array_free(&(mtr->log));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Releases the latches stored in an mtr memo down to a savepoint.
+NOTE! The mtr must not have made changes to buffer pages after the
+savepoint, as these can be handled only by mtr_commit. */
+UNIV_INTERN
+void
+mtr_rollback_to_savepoint(
+/*======================*/
+	mtr_t*	mtr,		/*!< in: mtr */
+	ulint	savepoint)	/*!< in: savepoint: a memo byte offset */
+{
+	mtr_memo_slot_t* slot;
+	dyn_array_t*	memo;
+	ulint		offset;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	memo = &(mtr->memo);
+
+	offset = dyn_array_get_data_size(memo);	/* just past the last slot */
+	ut_ad(offset >= savepoint);
+
+	while (offset > savepoint) {	/* newest slot first, stop at savepoint */
+		offset -= sizeof(mtr_memo_slot_t);
+
+		slot = dyn_array_get_element(memo, offset);
+
+		ut_ad(slot->type != MTR_MEMO_MODIFY);
+		mtr_memo_slot_release(mtr, slot);
+	}
+}
+
+/***************************************************//**
+Releases the most recently pushed memo slot matching the object and type. */
+UNIV_INTERN
+void
+mtr_memo_release(
+/*=============*/
+	mtr_t*	mtr,	/*!< in: mtr */
+	void*	object,	/*!< in: object */
+	ulint	type)	/*!< in: object type: MTR_MEMO_S_LOCK, ... */
+{
+	mtr_memo_slot_t* slot;
+	dyn_array_t*	memo;
+	ulint		offset;
+
+	ut_ad(mtr);
+	ut_ad(mtr->magic_n == MTR_MAGIC_N);
+	ut_ad(mtr->state == MTR_ACTIVE);
+
+	memo = &(mtr->memo);
+
+	offset = dyn_array_get_data_size(memo);	/* just past the last slot */
+
+	while (offset > 0) {			/* search newest slot first */
+		offset -= sizeof(mtr_memo_slot_t);
+
+		slot = dyn_array_get_element(memo, offset);
+
+		if ((object == slot->object) && (type == slot->type)) {
+			if (mtr->modifications &&
+			    UNIV_LIKELY(slot->object != NULL) &&
+			    slot->type == MTR_MEMO_PAGE_X_FIX) {
+				buf_flush_note_modification(
+					(buf_block_t*)slot->object, mtr);
+			}
+
+			mtr_memo_slot_release(mtr, slot);
+
+			break;	/* only the first match found is released */
+		}
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************//**
+Reads 1 - 4 bytes from a file page buffered in the buffer pool.
+@return	value read */
+UNIV_INTERN
+ulint
+mtr_read_ulint(
+/*===========*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	ulint		type,	/*!< in: MLOG_1BYTE, MLOG_2BYTES, MLOG_4BYTES */
+	mtr_t*		mtr __attribute__((unused)))
+				/*!< in: mtr; used only in ut_ad() checks */
+{
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+	if (type == MLOG_1BYTE) {
+		return(mach_read_from_1(ptr));
+	} else if (type == MLOG_2BYTES) {
+		return(mach_read_from_2(ptr));
+	} else {	/* any other type is read as 4 bytes */
+		ut_ad(type == MLOG_4BYTES);
+		return(mach_read_from_4(ptr));
+	}
+}
+
+/********************************************************//**
+Reads 8 bytes from a file page buffered in the buffer pool.
+@return	value read */
+UNIV_INTERN
+dulint
+mtr_read_dulint(
+/*============*/
+	const byte*	ptr,	/*!< in: pointer from where to read */
+	mtr_t*		mtr __attribute__((unused)))
+				/*!< in: mtr; used only in ut_ad() checks */
+{
+	ut_ad(mtr->state == MTR_ACTIVE);
+	ut_ad(mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(mtr, ptr, MTR_MEMO_PAGE_X_FIX));
+	return(mach_read_from_8(ptr));
+}
+
+#ifdef UNIV_DEBUG
+# ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Checks if memo contains the page that buf_block_align() resolves ptr to.
+@return	TRUE if contains */
+UNIV_INTERN
+ibool
+mtr_memo_contains_page(
+/*===================*/
+	mtr_t*		mtr,	/*!< in: mtr */
+	const byte*	ptr,	/*!< in: pointer to buffer frame */
+	ulint		type)	/*!< in: type of object */
+{
+	return(mtr_memo_contains(mtr, buf_block_align(ptr), type));
+}
+
+/*********************************************************//**
+Prints the memo and log data sizes of an mtr handle to stderr. */
+UNIV_INTERN
+void
+mtr_print(
+/*======*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	fprintf(stderr,
+		"Mini-transaction handle: memo size %lu bytes"
+		" log size %lu bytes\n",
+		(ulong) dyn_array_get_data_size(&(mtr->memo)),
+		(ulong) dyn_array_get_data_size(&(mtr->log)));
+}
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/os/os0file.c b/storage/xtradb/os/os0file.c
new file mode 100644
index 00000000000..48d796c38e1
--- /dev/null
+++ b/storage/xtradb/os/os0file.c
@@ -0,0 +1,4604 @@
+/***********************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program; if not, write to the Free Software Foundation, Inc.,
+59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+***********************************************************************/
+
+/**************************************************//**
+@file os/os0file.c
+The interface to the operating system file i/o primitives
+
+Created 10/21/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0file.h"
+#include "ut0mem.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "fil0fil.h"
+#include "buf0buf.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "log0recv.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0sync.h"
+# include "os0thread.h"
+#else /* !UNIV_HOTBACKUP */
+# ifdef __WIN__
+/* Add includes for the _stat() call to compile on Windows */
+#  include <sys/types.h>
+#  include <sys/stat.h>
+#  include <errno.h>
+# endif /* __WIN__ */
+#endif /* !UNIV_HOTBACKUP */
+
+/* This specifies the file permissions InnoDB uses when it creates files in
+Unix; the value of os_innodb_umask is initialized in ha_innodb.cc to
+my_umask */
+
+#ifndef __WIN__
+/** Umask for creating files */
+UNIV_INTERN ulint	os_innodb_umask
+			= S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP;
+#else
+/** Umask for creating files */
+UNIV_INTERN ulint	os_innodb_umask		= 0;
+#endif
+
+#ifdef UNIV_DO_FLUSH
+/* If the following is set to TRUE, we do not call os_file_flush in every
+os_file_write. We can set this TRUE when the doublewrite buffer is used. */
+UNIV_INTERN ibool	os_do_not_call_flush_at_each_write	= FALSE;
+#else
+/* We do not call os_file_flush in every os_file_write. */
+#endif /* UNIV_DO_FLUSH */
+
+#ifdef UNIV_HOTBACKUP
+# define os_aio_use_native_aio	FALSE
+#else /* UNIV_HOTBACKUP */
+/* We use these mutexes to protect lseek + file i/o operation, if the
+OS does not provide an atomic pread or pwrite, or similar */
+#define OS_FILE_N_SEEK_MUTEXES	16
+UNIV_INTERN os_mutex_t	os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES];
+
+/* In simulated aio, merge at most this many consecutive i/os */
+#define OS_AIO_MERGE_N_CONSECUTIVE	64
+
+/** If this flag is TRUE, then we will use the native aio of the
+OS (provided we compiled Innobase with it in), otherwise we will
+use simulated aio we build below with threads */
+
+UNIV_INTERN ibool	os_aio_use_native_aio	= FALSE;
+
+/** Flag: enable debug printout for asynchronous i/o */
+UNIV_INTERN ibool	os_aio_print_debug	= FALSE;
+
+/* Status of an IO request in simulated AIO.
+   Protocol for simulated aio:
+     client requests IO: find slot with reserved = FALSE. Add entry with
+                         status = OS_AIO_NOT_ISSUED.
+     IO thread wakes: find adjacent slots with reserved = TRUE and status =
+                      OS_AIO_NOT_ISSUED. Change status for slots to
+                      OS_AIO_ISSUED.
+     IO operation completes: set status for slots to OS_AIO_DONE. set status
+                             for the first slot to OS_AIO_CLAIMED and return
+                             result for that slot.
+   When there are multiple read and write threads, they all compete to execute
+   the requests in the array (os_aio_array_t). This avoids the need to load
+   balance requests at the time the request is made at the cost of waking all
+   threads when a request is available.
+*/
+typedef enum {
+	OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
+	OS_AIO_ISSUED,     /* Being processed by an IO thread. */
+	OS_AIO_DONE,       /* Request processed. */
+	OS_AIO_CLAIMED     /* Result being returned to client. */
+} os_aio_status;
+
+/** The asynchronous i/o array slot structure */
+typedef struct os_aio_slot_struct	os_aio_slot_t;
+
+/** The asynchronous i/o array slot structure */
+struct os_aio_slot_struct{
+	ibool		is_read;	/*!< TRUE if a read operation */
+	ulint		pos;		/*!< index of the slot in the aio
+					array */
+	ibool		reserved;	/*!< TRUE if this slot is reserved */
+	os_aio_status   status;		/* Status for current request. Valid when reserved
+					is TRUE. Used only in simulated aio. */
+	time_t		reservation_time;/*!< time when reserved */
+	ulint		len;		/*!< length of the block to read or
+					write */
+	byte*		buf;		/*!< buffer used in i/o */
+	ulint		type;		/*!< OS_FILE_READ or OS_FILE_WRITE */
+	ulint		offset;		/*!< 32 low bits of file offset in
+					bytes */
+	ulint		offset_high;	/*!< 32 high bits of file offset */
+	os_file_t	file;		/*!< file where to read or write */
+	const char*	name;		/*!< file name or path */
+//	ibool		io_already_done;/*!< used only in simulated aio:
+//					TRUE if the physical i/o already
+//					made and only the slot message
+//					needs to be passed to the caller
+//					of os_aio_simulated_handle */
+	fil_node_t*	message1;	/*!< 1st message given by the requester */
+	void*		message2;	/*!< 2nd message given by the requester
+					of an aio operation; can be used to
+					identify which pending aio operation
+					was completed */
+#ifdef WIN_ASYNC_IO
+	os_event_t	event;		/*!< event object we need in the
+					OVERLAPPED struct */
+	OVERLAPPED	control;	/*!< Windows control block for the
+					aio request */
+#endif
+};
+
+/** The asynchronous i/o array structure */
+typedef struct os_aio_array_struct	os_aio_array_t;
+
+/** The asynchronous i/o array structure */
+struct os_aio_array_struct{
+	os_mutex_t	mutex;	/*!< the mutex protecting the aio array */
+	os_event_t	not_full;
+				/*!< The event which is set to the
+				signaled state when there is space in
+				the aio outside the ibuf segment */
+	os_event_t	is_empty;
+				/*!< The event which is set to the
+				signaled state when there are no
+				pending i/os in this array */
+	ulint		n_slots;/*!< Total number of slots in the aio
+				array.  Must be divisible by n_threads
+				(presumably n_segments; TODO confirm) */
+	ulint		n_segments;
+				/*!< Number of segments in the aio
+				array of pending aio requests. A
+				thread can wait separately for any one
+				of the segments. */
+	ulint		n_reserved;
+				/*!< Number of reserved slots in the
+				aio array outside the ibuf segment */
+	os_aio_slot_t*	slots;	/*!< Pointer to the slots in the array */
+#ifdef __WIN__
+	os_native_event_t* native_events;
+				/*!< Pointer to an array of OS native
+				event handles where we copied the
+				handles from slots, in the same
+				order. This can be used in
+				WaitForMultipleObjects; used only in
+				Windows */
+#endif
+};
+
+/** Array of events used in simulated aio */
+static os_event_t*	os_aio_segment_wait_events	= NULL;
+
+/* Number for the first global segment for reading. */
+const ulint os_aio_first_read_segment = 2;
+
+/* Number for the first global segment for writing. Set to
+2 + os_aio_read_write_threads. */
+ulint os_aio_first_write_segment = 0;
+
+/** The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+are NULL when the module has not yet been initialized. @{ */
+static os_aio_array_t*	os_aio_read_array	= NULL;	/*!< Reads */
+static os_aio_array_t*	os_aio_write_array	= NULL;	/*!< Writes */
+static os_aio_array_t*	os_aio_ibuf_array	= NULL;	/*!< Insert buffer */
+static os_aio_array_t*	os_aio_log_array	= NULL;	/*!< Redo log */
+static os_aio_array_t*	os_aio_sync_array	= NULL;	/*!< Synchronous I/O */
+/* @} */
+
+/* Per thread buffer used for merged IO requests. Used by
+os_aio_simulated_handle so that a buffer doesn't have to be allocated
+for each request. */
+static byte* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
+static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
+
+/** Number of asynchronous I/O segments.  Set by os_aio_init(). */
+static ulint	os_aio_n_segments	= ULINT_UNDEFINED;
+
+/** If the following is TRUE, read i/o handler threads try to
+wait until a batch of new read requests have been posted */
+static volatile ibool	os_aio_recommend_sleep_for_read_threads	= FALSE;
+#endif /* UNIV_HOTBACKUP */
+
+UNIV_INTERN ulint	os_n_file_reads		= 0;
+UNIV_INTERN ulint	os_bytes_read_since_printout = 0;
+UNIV_INTERN ulint	os_n_file_writes	= 0;
+UNIV_INTERN ulint	os_n_fsyncs		= 0;
+UNIV_INTERN ulint	os_n_file_reads_old	= 0;
+UNIV_INTERN ulint	os_n_file_writes_old	= 0;
+UNIV_INTERN ulint	os_n_fsyncs_old		= 0;
+UNIV_INTERN time_t	os_last_printout;
+
+UNIV_INTERN ibool	os_has_said_disk_full	= FALSE;
+
+#ifndef UNIV_HOTBACKUP
+/** The mutex protecting the following counts of pending I/O operations */
+static os_mutex_t	os_file_count_mutex;
+#endif /* !UNIV_HOTBACKUP */
+/** Number of pending os_file_pread() operations */
+UNIV_INTERN ulint	os_file_n_pending_preads  = 0;
+/** Number of pending os_file_pwrite() operations */
+UNIV_INTERN ulint	os_file_n_pending_pwrites = 0;
+/** Number of pending write operations */
+UNIV_INTERN ulint	os_n_pending_writes = 0;
+/** Number of pending read operations */
+UNIV_INTERN ulint	os_n_pending_reads = 0;
+
+/***********************************************************************//**
+Gets the operating system version. Currently works only on Windows.
+@return	OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */
+UNIV_INTERN
+ulint
+os_get_os_version(void)
+/*===================*/
+{
+#ifdef __WIN__
+	OSVERSIONINFO	  os_info;
+
+	os_info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
+
+	ut_a(GetVersionEx(&os_info));
+
+	if (os_info.dwPlatformId == VER_PLATFORM_WIN32s) {
+		return(OS_WIN31);
+	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) {
+		return(OS_WIN95);
+	} else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) {
+		if (os_info.dwMajorVersion <= 4) {
+			return(OS_WINNT);
+		} else {	/* every later NT version reports OS_WIN2000 */
+			return(OS_WIN2000);
+		}
+	} else {
+		ut_error;
+		return(0);
+	}
+#else
+	ut_error;	/* only implemented on Windows */
+
+	return(0);
+#endif
+}
+
+/***********************************************************************//**
+Retrieves the last error number if an error occurs in a file io function.
+The number should be retrieved before any other OS calls (because they may
+overwrite the error number). If the number is not known to this program,
+the OS error number + 100 is returned.
+@return	error number, or OS error number + 100 */
+UNIV_INTERN
+ulint
+os_file_get_last_error(
+/*===================*/
+	ibool	report_all_errors)	/*!< in: TRUE to print a message for
+					all errors, even full disk / exists */
+{
+	ulint	err;
+
+#ifdef __WIN__
+
+	err = (ulint) GetLastError();
+
+	if (report_all_errors
+	    || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Operating system error number %lu"
+			" in a file operation.\n", (ulong) err);
+
+		if (err == ERROR_PATH_NOT_FOUND) {
+			fprintf(stderr,
+				"InnoDB: The error means the system"
+				" cannot find the path specified.\n");
+
+			if (srv_is_being_started) {
+				fprintf(stderr,
+					"InnoDB: If you are installing InnoDB,"
+					" remember that you must create\n"
+					"InnoDB: directories yourself, InnoDB"
+					" does not create them.\n");
+			}
+		} else if (err == ERROR_ACCESS_DENIED) {
+			fprintf(stderr,
+				"InnoDB: The error means mysqld does not have"
+				" the access rights to\n"
+				"InnoDB: the directory. It may also be"
+				" you have created a subdirectory\n"
+				"InnoDB: of the same name as a data file.\n");
+		} else if (err == ERROR_SHARING_VIOLATION
+			   || err == ERROR_LOCK_VIOLATION) {
+			fprintf(stderr,
+				"InnoDB: The error means that another program"
+				" is using InnoDB's files.\n"
+				"InnoDB: This might be a backup or antivirus"
+				" software or another instance\n"
+				"InnoDB: of MySQL."
+				" Please close it to get rid of this error.\n");
+		} else if (err == ERROR_WORKING_SET_QUOTA
+			   || err == ERROR_NO_SYSTEM_RESOURCES) {
+			fprintf(stderr,
+				"InnoDB: The error means that there are no"
+				" sufficient system resources or quota to"
+				" complete the operation.\n");
+		} else if (err == ERROR_OPERATION_ABORTED) {
+			fprintf(stderr,
+				"InnoDB: The error means that the I/O"
+				" operation has been aborted\n"
+				"InnoDB: because of either a thread exit"
+				" or an application request.\n"
+				"InnoDB: Retry attempt is made.\n");
+		} else {
+			fprintf(stderr,
+				"InnoDB: Some operating system error numbers"
+				" are described at\n"
+				"InnoDB: "
+				REFMAN
+				"operating-system-error-codes.html\n");
+		}
+	}
+
+	fflush(stderr);
+
+	if (err == ERROR_FILE_NOT_FOUND) {
+		return(OS_FILE_NOT_FOUND);
+	} else if (err == ERROR_DISK_FULL) {
+		return(OS_FILE_DISK_FULL);
+	} else if (err == ERROR_FILE_EXISTS) {
+		return(OS_FILE_ALREADY_EXISTS);
+	} else if (err == ERROR_SHARING_VIOLATION
+		   || err == ERROR_LOCK_VIOLATION) {
+		return(OS_FILE_SHARING_VIOLATION);
+	} else if (err == ERROR_WORKING_SET_QUOTA
+		   || err == ERROR_NO_SYSTEM_RESOURCES) {
+		return(OS_FILE_INSUFFICIENT_RESOURCE);
+	} else if (err == ERROR_OPERATION_ABORTED) {
+		return(OS_FILE_OPERATION_ABORTED);
+	} else {
+		return(100 + err);	/* not mapped: OS error number + 100 */
+	}
+#else
+	err = (ulint) errno;
+
+	if (report_all_errors
+	    || (err != ENOSPC && err != EEXIST)) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Operating system error number %lu"
+			" in a file operation.\n", (ulong) err);
+
+		if (err == ENOENT) {
+			fprintf(stderr,
+				"InnoDB: The error means the system"
+				" cannot find the path specified.\n");
+
+			if (srv_is_being_started) {
+				fprintf(stderr,
+					"InnoDB: If you are installing InnoDB,"
+					" remember that you must create\n"
+					"InnoDB: directories yourself, InnoDB"
+					" does not create them.\n");
+			}
+		} else if (err == EACCES) {
+			fprintf(stderr,
+				"InnoDB: The error means mysqld does not have"
+				" the access rights to\n"
+				"InnoDB: the directory.\n");
+		} else {
+			if (strerror((int)err) != NULL) {
+				fprintf(stderr,
+					"InnoDB: Error number %lu"
+					" means '%s'.\n",
+					err, strerror((int)err));
+			}
+
+			fprintf(stderr,
+				"InnoDB: Some operating system"
+				" error numbers are described at\n"
+				"InnoDB: "
+				REFMAN
+				"operating-system-error-codes.html\n");
+		}
+	}
+
+	fflush(stderr);
+
+	if (err == ENOSPC) {
+		return(OS_FILE_DISK_FULL);
+	} else if (err == ENOENT) {
+		return(OS_FILE_NOT_FOUND);
+	} else if (err == EEXIST) {
+		return(OS_FILE_ALREADY_EXISTS);
+	} else if (err == EXDEV || err == ENOTDIR || err == EISDIR) {
+		return(OS_FILE_PATH_ERROR);
+	} else {
+		return(100 + err);	/* not mapped: OS error number + 100 */
+	}
+#endif
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+Conditionally exits (calling exit(3)) based on should_exit value and the
+error type.
+@return	TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_cond_exit(
+/*===========================*/
+	const char*	name,		/*!< in: name of a file or NULL */
+	const char*	operation,	/*!< in: operation */
+	ibool		should_exit)	/*!< in: call exit(3) if unknown error
+					and this parameter is TRUE */
+{
+	ulint	err;
+
+	err = os_file_get_last_error(FALSE);
+
+	if (err == OS_FILE_DISK_FULL) {
+		/* We only print a warning about disk full once */
+
+		if (os_has_said_disk_full) {
+
+			return(FALSE);
+		}
+
+		if (name) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Encountered a problem with"
+				" file %s\n", name);
+		}
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Disk is full. Try to clean the disk"
+			" to free space.\n");
+
+		os_has_said_disk_full = TRUE;
+
+		fflush(stderr);
+
+		return(FALSE);
+	} else if (err == OS_FILE_AIO_RESOURCES_RESERVED) {
+
+		return(TRUE);	/* retry immediately, without sleeping */
+	} else if (err == OS_FILE_ALREADY_EXISTS
+		   || err == OS_FILE_PATH_ERROR) {
+
+		return(FALSE);
+	} else if (err == OS_FILE_SHARING_VIOLATION) {
+
+		os_thread_sleep(10000000);  /* 10 sec */
+		return(TRUE);
+	} else if (err == OS_FILE_INSUFFICIENT_RESOURCE) {
+
+		os_thread_sleep(100000);	/* 100 ms */
+		return(TRUE);
+	} else if (err == OS_FILE_OPERATION_ABORTED) {
+
+		os_thread_sleep(100000);	/* 100 ms */
+		return(TRUE);
+	} else {	/* any other error code */
+		if (name) {
+			fprintf(stderr, "InnoDB: File name %s\n", name);
+		}
+
+		fprintf(stderr, "InnoDB: File operation call: '%s'.\n",
+			operation);
+
+		if (should_exit) {
+			fprintf(stderr, "InnoDB: Cannot continue operation.\n");
+
+			fflush(stderr);
+
+			exit(1);
+		}
+	}
+
+	return(FALSE);	/* other error, not exiting: do not retry */
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails; exits on unknown errors.
+@return	TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error(
+/*=================*/
+	const char*	name,	/*!< in: name of a file or NULL */
+	const char*	operation)/*!< in: operation */
+{
+	/* exit in case of unknown error */
+	return(os_file_handle_error_cond_exit(name, operation, TRUE));
+}
+
+/****************************************************************//**
+Does error handling when a file operation fails; never calls exit(3).
+@return	TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+	const char*	name,	/*!< in: name of a file or NULL */
+	const char*	operation)/*!< in: operation */
+{
+	/* don't exit in case of unknown error */
+	return(os_file_handle_error_cond_exit(name, operation, FALSE));
+}
+
+#undef USE_FILE_LOCK
+#define USE_FILE_LOCK
+#if defined(UNIV_HOTBACKUP) || defined(__WIN__) || defined(__NETWARE__)
+/* InnoDB Hot Backup does not lock the data files.
+ * On Windows, mandatory locking is used.
+ */
+# undef USE_FILE_LOCK
+#endif
+#ifdef USE_FILE_LOCK
+/****************************************************************//**
+Obtain an exclusive lock on a whole file; does not wait if it is locked.
+@return	0 on success, -1 if fcntl() failed (a diagnostic is printed) */
+static
+int
+os_file_lock(
+/*=========*/
+	int		fd,	/*!< in: file descriptor */
+	const char*	name)	/*!< in: file name, used in messages only */
+{
+	struct flock lk;
+	lk.l_type = F_WRLCK;
+	lk.l_whence = SEEK_SET;
+	lk.l_start = lk.l_len = 0;	/* offset 0, length 0: whole file */
+	if (fcntl(fd, F_SETLK, &lk) == -1) {
+		const int	lock_errno = errno; /* fprintf() may change errno */
+		fprintf(stderr, "InnoDB: Unable to lock %s, error: %d\n",
+			name, lock_errno);
+		if (lock_errno == EAGAIN || lock_errno == EACCES) {
+			fprintf(stderr,
+				"InnoDB: Check that you do not already have"
+				" another mysqld process\n"
+				"InnoDB: using the same InnoDB data"
+				" or log files.\n");
+		}
+
+		return(-1);
+	}
+
+	return(0);
+}
+#endif /* USE_FILE_LOCK */
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Creates os_file_count_mutex and the seek mutexes for positioned i/o. */
+UNIV_INTERN
+void
+os_io_init_simple(void)
+/*===================*/
+{
+	ulint	i;
+
+	os_file_count_mutex = os_mutex_create(NULL);
+
+	for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) {
+		os_file_seek_mutexes[i] = os_mutex_create(NULL);
+	}
+}
+
+/***********************************************************************//**
+Creates a temporary file.  This function is like tmpfile(3), but
+the temporary file is created in the MySQL temporary directory.
+On Netware, this function is like tmpfile(3), because the C run-time
+library of Netware does not expose the delete-on-close flag.
+@return	temporary file handle, or NULL on error */
+UNIV_INTERN
+FILE*
+os_file_create_tmpfile(void)
+/*========================*/
+{
+#ifdef __NETWARE__
+	FILE*	file	= tmpfile();
+#else /* __NETWARE__ */
+	FILE*	file	= NULL;
+	int	fd	= innobase_mysql_tmpfile();
+
+	if (fd >= 0) {
+		file = fdopen(fd, "w+b");	/* wrap the descriptor in a FILE */
+	}
+#endif /* __NETWARE__ */
+
+	if (!file) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: unable to create temporary file;"
+			" errno: %d\n", errno);
+#ifndef __NETWARE__
+		if (fd >= 0) {
+			close(fd);	/* fdopen() failed: do not leak fd */
+		}
+#endif /* !__NETWARE__ */
+	}
+
+	return(file);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+The os_file_opendir() function opens a directory stream corresponding to the
+directory named by the dirname argument. The directory stream is positioned
+at the first entry. In both Unix and Windows we automatically skip the '.'
+and '..' items at the start of the directory listing.
+@return	directory stream, NULL if error */
+UNIV_INTERN
+os_file_dir_t
+os_file_opendir(
+/*============*/
+	const char*	dirname,	/*!< in: directory name; it must not
+					contain a trailing '\' or '/' */
+	ibool		error_is_fatal)	/*!< in: TRUE if we should treat an
+					error as a fatal error; if we try to
+					open symlinks then we do not wish a
+					fatal error if it happens not to be
+					a directory */
+{
+	os_file_dir_t		dir;
+#ifdef __WIN__
+	LPWIN32_FIND_DATA	lpFindFileData;
+	char			path[OS_FILE_MAX_PATH + 3];
+
+	ut_a(strlen(dirname) < OS_FILE_MAX_PATH);
+
+	strcpy(path, dirname);
+	strcpy(path + strlen(path), "\\*");	/* search pattern: dirname\* */
+
+	/* Note that in Windows opening the 'directory stream' also retrieves
+	the first entry in the directory. Since it is '.', that is no problem,
+	as we will skip over the '.' and '..' entries anyway. */
+
+	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+
+	dir = FindFirstFile((LPCTSTR) path, lpFindFileData);
+
+	ut_free(lpFindFileData);	/* the first entry itself is not used */
+
+	if (dir == INVALID_HANDLE_VALUE) {
+
+		if (error_is_fatal) {
+			os_file_handle_error(dirname, "opendir");
+		}
+
+		return(NULL);
+	}
+
+	return(dir);
+#else
+	dir = opendir(dirname);
+
+	if (dir == NULL && error_is_fatal) {
+		os_file_handle_error(dirname, "opendir");
+	}
+
+	return(dir);
+#endif
+}
+
+/***********************************************************************//**
+Closes a directory stream that was opened with os_file_opendir(). A failure
+is passed to os_file_handle_error_no_exit() with the operation "closedir".
+@return	0 if success, -1 if failure */
+UNIV_INTERN
+int
+os_file_closedir(
+/*=============*/
+	os_file_dir_t	dir)	/*!< in: directory stream */
+{
+#ifdef __WIN__
+	BOOL	closed	= FindClose(dir);
+
+	if (closed) {
+
+		return(0);
+	}
+
+	os_file_handle_error_no_exit(NULL, "closedir");
+
+	return(-1);
+#else
+	int	rc;
+
+	rc = closedir(dir);
+
+	if (rc != 0) {
+		os_file_handle_error_no_exit(NULL, "closedir");
+	}
+
+	return(rc);
+#endif
+}
+
+/***********************************************************************//**
+Returns the name, size and type of the next directory entry, jumping over
+'.' and '..' (and, on Unix, over entries deleted before they can be stat()ed).
+@return	0 if ok, -1 if error, 1 if at the end of the directory */
+UNIV_INTERN
+int
+os_file_readdir_next_file(
+/*======================*/
+	const char*	dirname,/*!< in: directory name or path */
+	os_file_dir_t	dir,	/*!< in: directory stream */
+	os_file_stat_t*	info)	/*!< in/out: buffer where the info is returned */
+{
+#ifdef __WIN__
+	LPWIN32_FIND_DATA	lpFindFileData;
+	BOOL			ret;
+
+	lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA));
+next_file:
+	ret = FindNextFile(dir, lpFindFileData);
+
+	if (ret) {
+		ut_a(strlen((char *) lpFindFileData->cFileName)
+		     < OS_FILE_MAX_PATH);
+
+		if (strcmp((char *) lpFindFileData->cFileName, ".") == 0
+		    || strcmp((char *) lpFindFileData->cFileName, "..") == 0) {
+
+			goto next_file;
+		}
+
+		strcpy(info->name, (char *) lpFindFileData->cFileName);
+
+		info->size = (ib_int64_t)(lpFindFileData->nFileSizeLow)
+			+ (((ib_int64_t)(lpFindFileData->nFileSizeHigh))
+			   << 32);
+
+		if (lpFindFileData->dwFileAttributes
+		    & FILE_ATTRIBUTE_REPARSE_POINT) {
+			/* TODO: test Windows symlinks */
+			/* TODO: MySQL has apparently its own symlink
+			implementation in Windows, dbname.sym can
+			redirect a database directory:
+			REFMAN "windows-symbolic-links.html" */
+			info->type = OS_FILE_TYPE_LINK;
+		} else if (lpFindFileData->dwFileAttributes
+			   & FILE_ATTRIBUTE_DIRECTORY) {
+			info->type = OS_FILE_TYPE_DIR;
+		} else {
+			/* It is probably safest to assume that all other
+			file types are normal. Better to check them rather
+			than blindly skip them. */
+
+			info->type = OS_FILE_TYPE_FILE;
+		}
+	}
+
+	ut_free(lpFindFileData);
+
+	if (ret) {
+		return(0);
+	} else if (GetLastError() == ERROR_NO_MORE_FILES) {
+
+		return(1);
+	} else {
+		os_file_handle_error_no_exit(dirname,
+					     "readdir_next_file");
+		return(-1);
+	}
+#else
+	struct dirent*	ent;
+	char*		full_path;
+	int		ret;
+	struct stat	statinfo;
+#ifdef HAVE_READDIR_R
+	char		dirent_buf[sizeof(struct dirent)
+				   + _POSIX_PATH_MAX + 100];
+	/* In /mysys/my_lib.c, _POSIX_PATH_MAX + 1 is used as
+	the max file name len; but in most standards, the
+	length is NAME_MAX; we add 100 to be even safer */
+#endif
+
+next_file:
+
+#ifdef HAVE_READDIR_R
+	ret = readdir_r(dir, (struct dirent*)dirent_buf, &ent);
+
+	if (ret != 0
+#ifdef UNIV_AIX
+	    /* On AIX, only a non-NULL 'ent' (result) value together with
+	    a non-zero 'ret' (return) value indicates a failed
+	    readdir_r() call. A NULL 'ent' with a non-zero 'ret'
+	    indicates that the "end of the directory" is reached. */
+	    && ent != NULL
+#endif
+	   ) {
+		fprintf(stderr,
+			"InnoDB: cannot read directory %s, error %lu\n",
+			dirname, (ulong)ret);
+
+		return(-1);
+	}
+
+	if (ent == NULL) {
+		/* End of directory */
+
+		return(1);
+	}
+
+	ut_a(strlen(ent->d_name) < _POSIX_PATH_MAX + 100 - 1);
+#else
+	ent = readdir(dir);
+
+	if (ent == NULL) {
+
+		return(1);
+	}
+#endif
+	ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH);
+
+	if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) {
+
+		goto next_file;
+	}
+
+	strcpy(info->name, ent->d_name);
+
+	full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10);
+
+	sprintf(full_path, "%s/%s", dirname, ent->d_name);
+
+	ret = stat(full_path, &statinfo);
+
+	if (ret) {
+
+		if (errno == ENOENT) {
+			/* readdir() returned a file that does not exist,
+			it must have been deleted in the meantime. Do what
+			would have happened if the file was deleted before
+			readdir() - ignore and go to the next entry.
+			If this is the last entry then info->name will still
+			contain the name of the deleted file when this
+			function returns, but this is not an issue since the
+			caller shouldn't be looking at info when end of
+			directory is returned. */
+
+			ut_free(full_path);
+
+			goto next_file;
+		}
+
+		os_file_handle_error_no_exit(full_path, "stat");
+
+		ut_free(full_path);
+
+		return(-1);
+	}
+
+	info->size = (ib_int64_t)statinfo.st_size;
+
+	if (S_ISDIR(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(statinfo.st_mode)) {
+		info->type = OS_FILE_TYPE_FILE;
+	} else {
+		info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	ut_free(full_path);
+
+	return(0);
+#endif
+}
+
+/*****************************************************************//**
+Attempts to create the directory pathname with default permissions; on Unix
+the mode given to mkdir() is 0770, so the result is (0770 & ~umask). A
+directory that exists already is accepted unless fail_if_exists is true;
+every other failure is passed to os_file_handle_error().
+@return	TRUE if call succeeds, FALSE on error */
+UNIV_INTERN
+ibool
+os_file_create_directory(
+/*=====================*/
+	const char*	pathname,	/*!< in: directory name as
+					null-terminated string */
+	ibool		fail_if_exists)	/*!< in: if TRUE, pre-existing directory
+					is treated as an error. */
+{
+#ifdef __WIN__
+	if (CreateDirectory((LPCTSTR) pathname, NULL)) {
+		return(TRUE);
+	}
+
+	if (GetLastError() == ERROR_ALREADY_EXISTS && !fail_if_exists) {
+		/* the directory was there already, which is acceptable */
+		return(TRUE);
+	}
+
+	os_file_handle_error(pathname, "CreateDirectory");
+
+	return(FALSE);
+#else
+	if (mkdir(pathname, 0770) == 0) {
+
+		return(TRUE);
+	}
+
+	if (errno == EEXIST && !fail_if_exists) {
+		/* the directory was there already, which is acceptable */
+		return(TRUE);
+	}
+
+	os_file_handle_error(pathname, "mkdir");
+
+	return(FALSE);
+#endif
+}
+
+/****************************************************************//**
+Opens or creates a file; retried while os_file_handle_error() returns TRUE.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple(
+/*==================*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file is
+				opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error), or
+				OS_FILE_CREATE_PATH if new file
+				(if exists, error) and subdirectories along
+				its path are created (if needed)*/
+	ulint		access_type,/*!< in: OS_FILE_READ_ONLY or
+				OS_FILE_READ_WRITE */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+	os_file_t	file;
+	DWORD		create_flag;
+	DWORD		access;
+	DWORD		attributes	= 0;
+	ibool		retry;
+
+try_again:
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN) {
+		create_flag = OPEN_EXISTING;
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = CREATE_NEW;
+	} else if (create_mode == OS_FILE_CREATE_PATH) {
+		/* create subdirs along the path; a failure is fatal here */
+		*success = os_file_create_subdirs_if_needed(name);
+		if (!*success) {
+			ut_error;
+		}
+		create_flag = CREATE_NEW;
+		create_mode = OS_FILE_CREATE;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (access_type == OS_FILE_READ_ONLY) {
+		access = GENERIC_READ;
+	} else if (access_type == OS_FILE_READ_WRITE) {
+		access = GENERIC_READ | GENERIC_WRITE;
+	} else {
+		access = 0;
+		ut_error;
+	}
+
+	file = CreateFile((LPCTSTR) name,
+			  access,
+			  FILE_SHARE_READ | FILE_SHARE_WRITE,
+			  /* file can be read and written also
+			  by other processes */
+			  NULL,	/* default security attributes */
+			  create_flag,
+			  attributes,
+			  NULL);	/*!< no template file */
+
+	if (file == INVALID_HANDLE_VALUE) {
+		*success = FALSE;
+
+		retry = os_file_handle_error(name,
+					     create_mode == OS_FILE_OPEN ?
+					     "open" : "create");
+		if (retry) {
+			goto try_again;
+		}
+	} else {
+		*success = TRUE;
+	}
+
+	return(file);
+#else /* __WIN__ */
+	os_file_t	file;
+	int		create_flag;
+	ibool		retry;
+
+try_again:
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN) {
+		if (access_type == OS_FILE_READ_ONLY) {
+			create_flag = O_RDONLY;
+		} else {
+			create_flag = O_RDWR;
+		}
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = O_RDWR | O_CREAT | O_EXCL;
+	} else if (create_mode == OS_FILE_CREATE_PATH) {
+		/* create subdirs along the path; return -1 if that fails */
+		*success = os_file_create_subdirs_if_needed(name);
+		if (!*success) {
+			return (-1);
+		}
+		create_flag = O_RDWR | O_CREAT | O_EXCL;
+		create_mode = OS_FILE_CREATE;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (create_mode == OS_FILE_CREATE) {
+		file = open(name, create_flag, S_IRUSR | S_IWUSR
+			    | S_IRGRP | S_IWGRP);
+	} else {
+		file = open(name, create_flag);
+	}
+
+	if (file == -1) {
+		*success = FALSE;
+
+		retry = os_file_handle_error(name,
+					     create_mode == OS_FILE_OPEN ?
+					     "open" : "create");
+		if (retry) {
+			goto try_again;
+		}
+#ifdef USE_FILE_LOCK
+	} else if (access_type == OS_FILE_READ_WRITE
+		   && os_file_lock(file, name)) {
+		*success = FALSE;
+		close(file);
+		file = -1;
+#endif
+	} else {
+		*success = TRUE;
+	}
+
+	return(file);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+Opens or creates a file without calling os_file_handle_error() on failure.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create_simple_no_error_handling(
+/*====================================*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
+				is opened (if does not exist, error), or
+				OS_FILE_CREATE if a new file is created
+				(if exists, error) */
+	ulint		access_type,/*!< in: OS_FILE_READ_ONLY,
+				OS_FILE_READ_WRITE, or
+				OS_FILE_READ_ALLOW_DELETE; the last option is
+				used by a backup program reading the file */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+	os_file_t	file;
+	DWORD		create_flag;
+	DWORD		access;
+	DWORD		attributes	= 0;
+	DWORD		share_mode	= FILE_SHARE_READ | FILE_SHARE_WRITE;
+
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN) {
+		create_flag = OPEN_EXISTING;
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = CREATE_NEW;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (access_type == OS_FILE_READ_ONLY) {
+		access = GENERIC_READ;
+	} else if (access_type == OS_FILE_READ_WRITE) {
+		access = GENERIC_READ | GENERIC_WRITE;
+	} else if (access_type == OS_FILE_READ_ALLOW_DELETE) {
+		access = GENERIC_READ;
+		share_mode = FILE_SHARE_DELETE | FILE_SHARE_READ
+			| FILE_SHARE_WRITE;	/*!< A backup program has to give
+						mysqld the maximum freedom to
+						do what it likes with the
+						file */
+	} else {
+		access = 0;
+		ut_error;
+	}
+
+	file = CreateFile((LPCTSTR) name,
+			  access,
+			  share_mode,
+			  NULL,	/* default security attributes */
+			  create_flag,
+			  attributes,
+			  NULL);	/*!< no template file */
+
+	if (file == INVALID_HANDLE_VALUE) {
+		*success = FALSE;
+	} else {
+		*success = TRUE;
+	}
+
+	return(file);
+#else /* __WIN__ */
+	os_file_t	file;
+	int		create_flag;
+
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN) {
+		if (access_type == OS_FILE_READ_ONLY) {
+			create_flag = O_RDONLY;
+		} else {
+			create_flag = O_RDWR;
+		}
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = O_RDWR | O_CREAT | O_EXCL;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (create_mode == OS_FILE_CREATE) {
+		file = open(name, create_flag, S_IRUSR | S_IWUSR
+			    | S_IRGRP | S_IWGRP);
+	} else {
+		file = open(name, create_flag);
+	}
+
+	if (file == -1) {
+		*success = FALSE;
+#ifdef USE_FILE_LOCK
+	} else if (access_type == OS_FILE_READ_WRITE
+		   && os_file_lock(file, name)) {
+		*success = FALSE;
+		close(file);
+		file = -1;
+#endif
+	} else {
+		*success = TRUE;
+	}
+
+	return(file);
+#endif /* __WIN__ */
+}
+
+/****************************************************************//**
+Tries to disable OS caching on an opened file descriptor; failure is only logged. */
+UNIV_INTERN
+void
+os_file_set_nocache(
+/*================*/
+	int		fd,		/*!< in: file descriptor to alter */
+	const char*	file_name,	/*!< in: file name, used in the
+					diagnostic message */
+	const char*	operation_name)	/*!< in: "open" or "create"; used in the
+					diagnostic message */
+{
+	/* some versions of Solaris may not have DIRECTIO_ON; if neither it
+	nor O_DIRECT is available, this function does nothing */
+#if defined(UNIV_SOLARIS) && defined(DIRECTIO_ON)
+	if (directio(fd, DIRECTIO_ON) == -1) {
+		int	errno_save;
+		errno_save = (int)errno;
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Failed to set DIRECTIO_ON "
+			"on file %s: %s: %s, continuing anyway\n",
+			file_name, operation_name, strerror(errno_save));
+	}
+#elif defined(O_DIRECT)
+	if (fcntl(fd, F_SETFL, O_DIRECT) == -1) {
+		int	errno_save;
+		errno_save = (int)errno;
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Failed to set O_DIRECT "
+			"on file %s: %s: %s, continuing anyway\n",
+			file_name, operation_name, strerror(errno_save));
+		if (errno_save == EINVAL) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: O_DIRECT is known to result in "
+				"'Invalid argument' on Linux on tmpfs, "
+				"see MySQL Bug#26662\n");
+		}
+	}
+#endif
+}
+
+/****************************************************************//**
+Opens an existing file or creates a new one.
+@return own: handle to the file, not defined if error, error number
+can be retrieved with os_file_get_last_error */
+UNIV_INTERN
+os_file_t
+os_file_create(
+/*===========*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	ulint		create_mode,/*!< in: OS_FILE_OPEN if an existing file
+				is opened (if does not exist, error), or
+				OS_FILE_OPEN_RETRY (same, but on Unix the
+				file lock is retried), OS_FILE_CREATE if a
+				new file is created (if exists, error),
+				OS_FILE_OVERWRITE if a new file is created
+				or an old overwritten; OS_FILE_OPEN_RAW, if a
+				raw device or disk partition is opened */
+	ulint		purpose,/*!< in: OS_FILE_AIO, if asynchronous,
+				non-buffered i/o is desired,
+				OS_FILE_NORMAL, if any normal file;
+				NOTE that it also depends on type, os_aio_..
+				and srv_.. variables whether we really use
+				async i/o or unbuffered i/o: look in the
+				function source code for the exact rules */
+	ulint		type,	/*!< in: OS_DATA_FILE or OS_LOG_FILE */
+	ibool*		success)/*!< out: TRUE if succeed, FALSE if error */
+{
+#ifdef __WIN__
+	os_file_t	file;
+	DWORD		share_mode	= FILE_SHARE_READ;
+	DWORD		create_flag;
+	DWORD		attributes;
+	ibool		retry;
+try_again:
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN_RAW) {
+		create_flag = OPEN_EXISTING;
+		share_mode = FILE_SHARE_WRITE;
+	} else if (create_mode == OS_FILE_OPEN
+		   || create_mode == OS_FILE_OPEN_RETRY) {
+		create_flag = OPEN_EXISTING;
+	} else if (create_mode == OS_FILE_CREATE) {
+		create_flag = CREATE_NEW;
+	} else if (create_mode == OS_FILE_OVERWRITE) {
+		create_flag = CREATE_ALWAYS;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (purpose == OS_FILE_AIO) {
+		/* If specified, use asynchronous (overlapped) io and no
+		buffering of writes in the OS */
+		attributes = 0;
+#ifdef WIN_ASYNC_IO
+		if (os_aio_use_native_aio) {
+			attributes = attributes | FILE_FLAG_OVERLAPPED;
+		}
+#endif
+#ifdef UNIV_NON_BUFFERED_IO
+# ifndef UNIV_HOTBACKUP
+		if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+			/* Do not use unbuffered i/o to log files because
+			value 2 denotes that we do not flush the log at every
+			commit, but only once per second */
+		} else if (srv_win_file_flush_method
+			   == SRV_WIN_IO_UNBUFFERED) {
+			attributes = attributes | FILE_FLAG_NO_BUFFERING;
+		}
+# else /* UNIV_HOTBACKUP */
+		attributes = attributes | FILE_FLAG_NO_BUFFERING;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_NON_BUFFERED_IO */
+	} else if (purpose == OS_FILE_NORMAL) {
+		attributes = 0;
+#ifdef UNIV_NON_BUFFERED_IO
+# ifndef UNIV_HOTBACKUP
+		if (type == OS_LOG_FILE && srv_flush_log_at_trx_commit == 2) {
+			/* Do not use unbuffered i/o to log files because
+			value 2 denotes that we do not flush the log at every
+			commit, but only once per second */
+		} else if (srv_win_file_flush_method
+			   == SRV_WIN_IO_UNBUFFERED) {
+			attributes = attributes | FILE_FLAG_NO_BUFFERING;
+		}
+# else /* UNIV_HOTBACKUP */
+		attributes = attributes | FILE_FLAG_NO_BUFFERING;
+# endif /* !UNIV_HOTBACKUP */
+#endif /* UNIV_NON_BUFFERED_IO */
+	} else {
+		attributes = 0;
+		ut_error;
+	}
+
+	file = CreateFile((LPCTSTR) name,
+			  GENERIC_READ | GENERIC_WRITE, /* read and write
+							access */
+			  share_mode,	/* File can be read also by other
+					processes; we must give the read
+					permission because of ibbackup. We do
+					not give the write permission to
+					others because if one would succeed to
+					start 2 instances of mysqld on the
+					SAME files, that could cause severe
+					database corruption! When opening
+					raw disk partitions, Microsoft manuals
+					say that we must give also the write
+					permission. */
+			  NULL,	/* default security attributes */
+			  create_flag,
+			  attributes,
+			  NULL);	/*!< no template file */
+
+	if (file == INVALID_HANDLE_VALUE) {
+		*success = FALSE;
+
+		/* When srv_file_per_table is on, file creation failure may not
+		be critical to the whole instance. Do not crash the server in
+		case of unknown errors.
+		Please note "srv_file_per_table" is a global variable with
+		no explicit synchronization protection. It could be
+		changed during this execution path. It might not have the
+		same value as the one when building the table definition */
+		if (srv_file_per_table) {
+			retry = os_file_handle_error_no_exit(name,
+						create_mode == OS_FILE_CREATE ?
+						"create" : "open");
+		} else {
+			retry = os_file_handle_error(name,
+						create_mode == OS_FILE_CREATE ?
+						"create" : "open");
+		}
+
+		if (retry) {
+			goto try_again;
+		}
+	} else {
+		*success = TRUE;
+	}
+
+	return(file);
+#else /* __WIN__ */
+	os_file_t	file;
+	int		create_flag;
+	ibool		retry;
+	const char*	mode_str	= NULL;
+	const char*	type_str	= NULL;
+	const char*	purpose_str	= NULL;
+
+try_again:
+	ut_a(name);
+
+	if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW
+	    || create_mode == OS_FILE_OPEN_RETRY) {
+		mode_str = "OPEN";
+		create_flag = O_RDWR;
+	} else if (create_mode == OS_FILE_CREATE) {
+		mode_str = "CREATE";
+		create_flag = O_RDWR | O_CREAT | O_EXCL;
+	} else if (create_mode == OS_FILE_OVERWRITE) {
+		mode_str = "OVERWRITE";
+		create_flag = O_RDWR | O_CREAT | O_TRUNC;
+	} else {
+		create_flag = 0;
+		ut_error;
+	}
+
+	if (type == OS_LOG_FILE) {
+		type_str = "LOG";
+	} else if (type == OS_DATA_FILE) {
+		type_str = "DATA";
+	} else {
+		ut_error;
+	}
+
+	if (purpose == OS_FILE_AIO) {
+		purpose_str = "AIO";
+	} else if (purpose == OS_FILE_NORMAL) {
+		purpose_str = "NORMAL";
+	} else {
+		ut_error;
+	}
+
+#if 0
+	fprintf(stderr, "Opening file %s, mode %s, type %s, purpose %s\n",
+		name, mode_str, type_str, purpose_str);
+#endif
+#ifdef O_SYNC
+	/* We let O_SYNC only affect log files; note that we map O_DSYNC to
+	O_SYNC because the datasync options seemed to corrupt files in 2001
+	in both Linux and Solaris */
+	if (type == OS_LOG_FILE
+	    && srv_unix_file_flush_method == SRV_UNIX_O_DSYNC) {
+
+# if 0
+		fprintf(stderr, "Using O_SYNC for file %s\n", name);
+# endif
+
+		create_flag = create_flag | O_SYNC;
+	}
+#endif /* O_SYNC */
+
+	file = open(name, create_flag, os_innodb_umask);
+
+	if (file == -1) {
+		*success = FALSE;
+
+		/* When srv_file_per_table is on, file creation failure may not
+		be critical to the whole instance. Do not crash the server in
+		case of unknown errors.
+		Please note "srv_file_per_table" is a global variable with
+		no explicit synchronization protection. It could be
+		changed during this execution path. It might not have the
+		same value as the one when building the table definition */
+		if (srv_file_per_table) {
+			retry = os_file_handle_error_no_exit(name,
+						create_mode == OS_FILE_CREATE ?
+						"create" : "open");
+		} else {
+			retry = os_file_handle_error(name,
+						create_mode == OS_FILE_CREATE ?
+						"create" : "open");
+		}
+
+		if (retry) {
+			goto try_again;
+		} else {
+			return(file /* -1 */);
+		}
+	}
+	/* else */
+
+	*success = TRUE;
+
+	/* We disable OS caching (O_DIRECT) only on data files */
+	if (type != OS_LOG_FILE
+	    && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
+		
+		os_file_set_nocache(file, name, mode_str);
+	}
+
+	/* ALL_O_DIRECT: O_DIRECT also for transaction log file */
+	if (srv_unix_file_flush_method == SRV_UNIX_ALL_O_DIRECT) {
+		os_file_set_nocache(file, name, mode_str);
+	}
+
+#ifdef USE_FILE_LOCK
+	if (create_mode != OS_FILE_OPEN_RAW && os_file_lock(file, name)) {
+
+		if (create_mode == OS_FILE_OPEN_RETRY) {
+			int i;
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Retrying to lock"
+			      " the first data file\n",
+			      stderr);
+			for (i = 0; i < 100; i++) {
+				os_thread_sleep(1000000);
+				if (!os_file_lock(file, name)) {
+					*success = TRUE;
+					return(file);
+				}
+			}
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Unable to open the first data file\n",
+			      stderr);
+		}
+
+		*success = FALSE;
+		close(file);
+		file = -1;
+	}
+#endif /* USE_FILE_LOCK */
+
+	return(file);
+#endif /* __WIN__ */
+}
+
+/***********************************************************************//**
+Deletes a file; a file that does not exist is not treated as an error.
+The file has to be closed before calling this.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete_if_exists(
+/*=====================*/
+	const char*	name)	/*!< in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+	ulint	attempts;
+
+	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
+	it: keep trying, one second apart */
+
+	for (attempts = 1;; attempts++) {
+		BOOL	deleted	= DeleteFile((LPCTSTR)name);
+
+		if (deleted) {
+
+			return(TRUE);
+		}
+
+		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+			/* the file does not exist, this is not an error */
+
+			return(TRUE);
+		}
+
+		if (attempts > 100 && 0 == (attempts % 10)) {
+			fprintf(stderr,
+				"InnoDB: Warning: cannot delete file %s\n"
+				"InnoDB: Are you running ibbackup"
+				" to back up the file?\n", name);
+
+			/* print error information */
+			os_file_get_last_error(TRUE);
+		}
+
+		os_thread_sleep(1000000);	/* sleep for a second */
+
+		if (attempts > 2000) {
+
+			return(FALSE);
+		}
+	}
+#else
+	int	rc	= unlink(name);
+
+	if (rc == 0 || errno == ENOENT) {
+		/* removed, or there was nothing to remove */
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(name, "delete");
+
+	return(FALSE);
+#endif
+}
+
+/***********************************************************************//**
+Deletes a file. The file has to be closed before calling this. Unlike
+os_file_delete_if_exists(), a missing file makes this function fail.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_delete(
+/*===========*/
+	const char*	name)	/*!< in: file path as a null-terminated string */
+{
+#ifdef __WIN__
+	ulint	attempts;
+
+	/* In Windows, deleting an .ibd file may fail if ibbackup is copying
+	it: keep trying, one second apart */
+
+	for (attempts = 1;; attempts++) {
+		BOOL	deleted	= DeleteFile((LPCTSTR)name);
+
+		if (deleted) {
+
+			return(TRUE);
+		}
+
+		if (GetLastError() == ERROR_FILE_NOT_FOUND) {
+			/* If the file does not exist, we classify this as
+			a 'mild' error and return without reporting it */
+
+			return(FALSE);
+		}
+
+		if (attempts > 100 && 0 == (attempts % 10)) {
+			fprintf(stderr,
+				"InnoDB: Warning: cannot delete file %s\n"
+				"InnoDB: Are you running ibbackup"
+				" to back up the file?\n", name);
+
+			/* print error information */
+			os_file_get_last_error(TRUE);
+		}
+
+		os_thread_sleep(1000000);	/* sleep for a second */
+
+		if (attempts > 2000) {
+
+			return(FALSE);
+		}
+	}
+#else
+	int	rc	= unlink(name);
+
+	if (rc == 0) {
+
+		return(TRUE);
+	}
+
+	/* includes the case that the file does not exist */
+	os_file_handle_error_no_exit(name, "delete");
+
+	return(FALSE);
+#endif
+}
+
+/***********************************************************************//**
+Renames a file (can also move it to another directory). It is safest that the
+file is closed before calling this function. A failure is reported through
+os_file_handle_error_no_exit().
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_rename(
+/*===========*/
+	const char*	oldpath,/*!< in: old file path as a null-terminated
+				string */
+	const char*	newpath)/*!< in: new file path */
+{
+#ifdef __WIN__
+	BOOL	moved;
+
+	moved = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath);
+
+	if (!moved) {
+		os_file_handle_error_no_exit(oldpath, "rename");
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#else
+	int	rc	= rename(oldpath, newpath);
+
+	if (rc == 0) {
+
+		return(TRUE);
+	}
+
+	os_file_handle_error_no_exit(oldpath, "rename");
+
+	return(FALSE);
+#endif
+}
+
+/***********************************************************************//**
+Closes a file handle; a failure is passed to os_file_handle_error(). In case
+of error, error number can be retrieved with os_file_get_last_error.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close(
+/*==========*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	BOOL	closed;
+
+	ut_a(file);
+
+	closed = CloseHandle(file);
+
+	if (!closed) {
+		os_file_handle_error(NULL, "close");
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#else
+	int	rc;
+
+	rc = close(file);
+
+	if (rc != -1) {
+		return(TRUE);
+	}
+
+	os_file_handle_error(NULL, "close");
+
+	return(FALSE);
+#endif
+}
+
+#ifdef UNIV_HOTBACKUP
+/***********************************************************************//**
+Closes a file handle. Unlike os_file_close(), a failure is only reflected
+in the return value: os_file_handle_error() is not called.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_close_no_error_handling(
+/*============================*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	BOOL	closed;
+
+	ut_a(file);
+
+	closed = CloseHandle(file);
+
+	if (!closed) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+#else
+	int	rc	= close(file);
+
+	if (rc != -1) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+#endif
+}
+#endif /* UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Gets a file size; on Unix via lseek() to the end, which moves the file offset.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_get_size(
+/*=============*/
+	os_file_t	file,	/*!< in: handle to a file */
+	ulint*		size,	/*!< out: least significant 32 bits of file
+				size */
+	ulint*		size_high)/*!< out: most significant 32 bits of size */
+{
+#ifdef __WIN__
+	DWORD	high;
+	DWORD	low;
+
+	low = GetFileSize(file, &high);
+
+	if ((low == 0xFFFFFFFF) && (GetLastError() != NO_ERROR)) {
+		return(FALSE);
+	}
+
+	*size = low;
+	*size_high = high;
+
+	return(TRUE);
+#else
+	off_t	offs;
+
+	offs = lseek(file, 0, SEEK_END);
+
+	if (offs == ((off_t)-1)) {
+
+		return(FALSE);
+	}
+
+	if (sizeof(off_t) > 4) {
+		*size = (ulint)(offs & 0xFFFFFFFFUL);
+		*size_high = (ulint)(offs >> 32);
+	} else {
+		*size = (ulint) offs;
+		*size_high = 0;
+	}
+
+	return(TRUE);
+#endif
+}
+
+/***********************************************************************//**
+Gets file size as a 64-bit integer ib_int64_t, combining the two 32-bit
+halves that os_file_get_size() returns.
+@return	size in bytes, -1 if error */
+UNIV_INTERN
+ib_int64_t
+os_file_get_size_as_iblonglong(
+/*===========================*/
+	os_file_t	file)	/*!< in: handle to a file */
+{
+	ulint		low;
+	ulint		high;
+	ib_int64_t	bytes;
+
+	if (!os_file_get_size(file, &low, &high)) {
+		return(-1);
+	}
+
+	bytes = (ib_int64_t) high;
+	bytes = (bytes << 32) + (ib_int64_t) low;
+	return(bytes);
+}
+
+/***********************************************************************//**
+Write the specified number of zeros to a newly created file, then flush it.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_size(
+/*=============*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	ulint		size,	/*!< in: least significant 32 bits of file
+				size */
+	ulint		size_high)/*!< in: most significant 32 bits of size */
+{
+	ib_int64_t	current_size;
+	ib_int64_t	desired_size;
+	ibool		ret;
+	byte*		buf;
+	byte*		buf2;
+	ulint		buf_size;
+
+	ut_a(size == (size & 0xFFFFFFFF));
+
+	current_size = 0;
+	desired_size = (ib_int64_t)size + (((ib_int64_t)size_high) << 32);
+
+	/* Write up to 64 pages at a time, but never less than one page:
+	with a zero-length buffer the loop below could never advance when
+	0 < desired_size < UNIV_PAGE_SIZE. */
+	buf_size = ut_min(64, (ulint) (desired_size / UNIV_PAGE_SIZE))
+		* UNIV_PAGE_SIZE;
+	if (buf_size == 0) {
+		buf_size = UNIV_PAGE_SIZE;
+	}
+	buf2 = ut_malloc(buf_size + UNIV_PAGE_SIZE);
+
+	/* Align the buffer for possible raw i/o and fill it with zeros */
+	buf = ut_align(buf2, UNIV_PAGE_SIZE);
+	memset(buf, 0, buf_size);
+
+	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+		fprintf(stderr, "InnoDB: Progress in MB:");
+	}
+
+	while (current_size < desired_size) {
+		ulint	n_bytes;
+
+		if (desired_size - current_size < (ib_int64_t) buf_size) {
+			n_bytes = (ulint) (desired_size - current_size);
+		} else {
+			n_bytes = buf_size;
+		}
+
+		ret = os_file_write(name, file, buf,
+				    (ulint)(current_size & 0xFFFFFFFF),
+				    (ulint)(current_size >> 32),
+				    n_bytes);
+		if (!ret) {
+			ut_free(buf2);
+			goto error_handling;
+		}
+
+		/* Print about progress for each 100 MB written */
+		if ((ib_int64_t) (current_size + n_bytes) / (ib_int64_t)(100 * 1024 * 1024)
+		    != current_size / (ib_int64_t)(100 * 1024 * 1024)) {
+			fprintf(stderr, " %lu00",
+				(ulong) ((current_size + n_bytes)
+					 / (ib_int64_t)(100 * 1024 * 1024)));
+		}
+
+		current_size += n_bytes;
+	}
+
+	if (desired_size >= (ib_int64_t)(100 * 1024 * 1024)) {
+		fprintf(stderr, "\n");
+	}
+
+	ut_free(buf2);
+
+	ret = os_file_flush(file);
+
+	if (ret) {
+		return(TRUE);
+	}
+
+error_handling:
+	return(FALSE);
+}
+
+/***********************************************************************//**
+Truncates a file at its current position (on Unix, the one ftell() reports).
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_set_eof(
+/*============*/
+	FILE*		file)	/*!< in: file to be truncated */
+{
+#ifdef __WIN__
+	HANDLE h = (HANDLE) _get_osfhandle(fileno(file));
+	return(SetEndOfFile(h));
+#else /* __WIN__ */
+	return(!ftruncate(fileno(file), ftell(file)));
+#endif /* __WIN__ */
+}
+
+#ifndef __WIN__
+/***********************************************************************//**
+Wrapper to fsync(2) that repeats the call, after a 0.2 second sleep, for as
+long as it fails with ENOLCK. Any other outcome is returned to the caller:
+the value 0 if successful; otherwise the value -1, with the global variable
+errno set to indicate the error.
+@return	0 if success, -1 otherwise */
+
+static
+int
+os_file_fsync(
+/*==========*/
+	os_file_t	file)	/*!< in: handle to a file */
+{
+	int	rc;
+	int	n_enolck;
+
+	n_enolck = 0;
+
+	for (;;) {
+		rc = fsync(file);
+
+		os_n_fsyncs++;
+
+		if (rc != -1 || errno != ENOLCK) {
+			/* success, or an error that is not retried */
+
+			break;
+		}
+
+		/* print the message once per 100 failures */
+		if (n_enolck % 100 == 0) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: fsync(): "
+				"No locks available; retrying\n");
+		}
+
+		os_thread_sleep(200000 /* 0.2 sec */);
+
+		n_enolck++;
+	}
+
+	return(rc);
+}
+#endif /* !__WIN__ */
+
+/***********************************************************************//**
+Flushes the write buffers of a given file to the disk.
+On Windows this calls FlushFileBuffers(). On other platforms it calls
+os_file_fsync(); when HAVE_DARWIN_THREADS is defined and srv_have_fullfsync
+is set, fcntl(F_FULLFSYNC) is tried first and os_file_fsync() is the
+fallback. A failure is tolerated (TRUE is returned) only when
+srv_start_raw_disk_in_use is set and the error is ERROR_INVALID_FUNCTION
+(Windows) or EINVAL (other platforms). Any other failure is passed to
+os_file_handle_error() and then reaches ut_error.
+@return	TRUE if success */
+UNIV_INTERN
+ibool
+os_file_flush(
+/*==========*/
+	os_file_t	file)	/*!< in, own: handle to a file */
+{
+#ifdef __WIN__
+	BOOL	ret;
+
+	ut_a(file);
+
+	os_n_fsyncs++;
+
+	ret = FlushFileBuffers(file);
+
+	if (ret) {
+		return(TRUE);
+	}
+
+	/* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is
+	actually a raw device, we choose to ignore that error if we are using
+	raw disks */
+
+	if (srv_start_raw_disk_in_use && GetLastError()
+	    == ERROR_INVALID_FUNCTION) {
+		return(TRUE);
+	}
+
+	os_file_handle_error(NULL, "flush");
+
+	/* It is a fatal error if a file flush does not succeed, because then
+	the database can get corrupt on disk */
+	ut_error;
+
+	return(FALSE);
+#else
+	int	ret;
+
+#if defined(HAVE_DARWIN_THREADS)
+# ifndef F_FULLFSYNC
+	/* The following definition is from the Mac OS X 10.3 <sys/fcntl.h> */
+#  define F_FULLFSYNC 51 /* fsync + ask the drive to flush to the media */
+# elif F_FULLFSYNC != 51
+#  error "F_FULLFSYNC != 51: ABI incompatibility with Mac OS X 10.3"
+# endif
+	/* Apple has disabled fsync() for internal disk drives in OS X. That
+	caused corruption for a user when he tested a power outage. Let us in
+	OS X use a nonstandard flush method recommended by an Apple
+	engineer. */
+
+	if (!srv_have_fullfsync) {
+		/* If we are not on an operating system that supports this,
+		then fall back to a plain fsync. */
+
+		ret = os_file_fsync(file);
+	} else {
+		ret = fcntl(file, F_FULLFSYNC, NULL);
+
+		if (ret) {
+			/* If we are not on a file system that supports this,
+			then fall back to a plain fsync. */
+			ret = os_file_fsync(file);
+		}
+	}
+#else
+	ret = os_file_fsync(file);
+#endif
+
+	if (ret == 0) {
+		return(TRUE);
+	}
+
+	/* Since Linux returns EINVAL if the 'file' is actually a raw device,
+	we choose to ignore that error if we are using raw disks */
+
+	if (srv_start_raw_disk_in_use && errno == EINVAL) {
+
+		return(TRUE);
+	}
+
+	ut_print_timestamp(stderr);
+
+	fprintf(stderr,
+		"  InnoDB: Error: the OS said file flush did not succeed\n");
+
+	os_file_handle_error(NULL, "flush");
+
+	/* It is a fatal error if a file flush does not succeed, because then
+	the database can get corrupt on disk */
+	ut_error;
+
+	return(FALSE);
+#endif
+}
+
+#ifndef __WIN__
+/*******************************************************************//**
+Convenience form of _os_file_pread() for callers that have no transaction
+whose I/O statistics should be updated: it passes NULL as trx. The
+expansion has no trailing semicolon, so the macro behaves like an ordinary
+function call expression (the caller supplies the semicolon). */
+#define os_file_pread(file, buf, n, offset, offset_high)        \
+		_os_file_pread(file, buf, n, offset, offset_high, NULL)
+
+/*******************************************************************//**
+Does a synchronous read operation in Posix.
+The file offset is assembled from offset and offset_high when off_t is
+wider than 4 bytes; otherwise only offset is used and a message is printed
+to stderr if offset_high is nonzero. With HAVE_PREAD defined (and
+HAVE_BROKEN_PREAD undefined) the data is read with pread(); otherwise
+lseek() and read() are used, serialized (unless UNIV_HOTBACKUP is defined)
+by one of the os_file_seek_mutexes chosen from the file handle.
+When innobase_get_slow_log() is true and trx is non-NULL with take_stats
+set, the read count, byte count and elapsed time are added to trx.
+@return	number of bytes read, -1 if error */
+static
+ssize_t
+_os_file_pread(
+/*==========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		n,	/*!< in: number of bytes to read */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset from where to read */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	trx_t*		trx)	/*!< in/out: transaction whose I/O statistics
+				are updated, or NULL for no statistics */
+{
+	off_t	offs;
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+	ssize_t	n_bytes;
+#endif /* HAVE_PREAD && !HAVE_BROKEN_PREAD */
+	/* Outputs of ut_usectime(); 'ms' presumably holds microseconds,
+	given the 1000000 scale factor used below -- TODO confirm */
+	ulint		sec;
+	ulint		ms;
+	ib_uint64_t	start_time;
+	ib_uint64_t	finish_time;
+
+	ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit address */
+
+	if (sizeof(off_t) > 4) {
+		offs = (off_t)offset + (((off_t)offset_high) << 32);
+
+	} else {
+		offs = (off_t)offset;
+
+		if (offset_high > 0) {
+			fprintf(stderr,
+				"InnoDB: Error: file read at offset > 4 GB\n");
+		}
+	}
+
+	os_n_file_reads++;
+
+	if (innobase_get_slow_log() && trx && trx->take_stats)
+	{
+		trx->io_reads++;
+		trx->io_read += n;
+		ut_usectime(&sec, &ms);
+		start_time = (ib_uint64_t)sec * 1000000 + ms;
+	} else {
+		/* Zero doubles as the "no timing requested" marker that is
+		tested after the read */
+		start_time = 0;
+	}
+#if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_preads++;
+	os_n_pending_reads++;
+	os_mutex_exit(os_file_count_mutex);
+
+	n_bytes = pread(file, buf, (ssize_t)n, offs);
+
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_preads--;
+	os_n_pending_reads--;
+	os_mutex_exit(os_file_count_mutex);
+
+	if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+	{
+		ut_usectime(&sec, &ms);
+		finish_time = (ib_uint64_t)sec * 1000000 + ms;
+		trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+	}
+
+	return(n_bytes);
+#else
+	{
+		off_t	ret_offset;
+		ssize_t	ret;
+#ifndef UNIV_HOTBACKUP
+		ulint	i;
+#endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_reads++;
+		os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+		/* Protect the seek / read operation with a mutex */
+		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+		os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		ret_offset = lseek(file, offs, SEEK_SET);
+
+		if (ret_offset < 0) {
+			ret = -1;
+		} else {
+			ret = read(file, buf, (ssize_t)n);
+		}
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_reads--;
+		os_mutex_exit(os_file_count_mutex);
+
+		if (innobase_get_slow_log() && trx && trx->take_stats && start_time)
+		{
+			ut_usectime(&sec, &ms);
+			finish_time = (ib_uint64_t)sec * 1000000 + ms;
+			trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
+		}
+
+		return(ret);
+	}
+#endif
+}
+
+/*******************************************************************//**
+Does a synchronous write operation in Posix.
+The file offset is assembled from offset and offset_high when off_t is
+wider than 4 bytes; otherwise only offset is used and a message is printed
+to stderr if offset_high is nonzero. With HAVE_PWRITE defined (and
+HAVE_BROKEN_PREAD undefined) the data is written with pwrite(); otherwise
+lseek() and write() are used, serialized (unless UNIV_HOTBACKUP is defined)
+by one of the os_file_seek_mutexes chosen from the file handle.
+os_n_file_writes is incremented once per call and the pending-write
+counters are raised before and lowered after the write, under
+os_file_count_mutex. With UNIV_DO_FLUSH defined, os_file_flush() is also
+called unless the flush method or os_do_not_call_flush_at_each_write
+disables it.
+@return	number of bytes written, -1 if error */
+static
+ssize_t
+os_file_pwrite(
+/*===========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from where to write */
+	ulint		n,	/*!< in: number of bytes to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to write */
+	ulint		offset_high) /*!< in: most significant 32 bits of
+				offset */
+{
+	ssize_t	ret;
+	off_t	offs;
+
+	ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+	/* If off_t is > 4 bytes in size, then we assume we can pass a
+	64-bit address */
+
+	if (sizeof(off_t) > 4) {
+		offs = (off_t)offset + (((off_t)offset_high) << 32);
+	} else {
+		offs = (off_t)offset;
+
+		if (offset_high > 0) {
+			fprintf(stderr,
+				"InnoDB: Error: file write"
+				" at offset > 4 GB\n");
+		}
+	}
+
+	os_n_file_writes++;
+
+#if defined(HAVE_PWRITE) && !defined(HAVE_BROKEN_PREAD)
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_pwrites++;
+	os_n_pending_writes++;
+	os_mutex_exit(os_file_count_mutex);
+
+	ret = pwrite(file, buf, (ssize_t)n, offs);
+
+	os_mutex_enter(os_file_count_mutex);
+	os_file_n_pending_pwrites--;
+	os_n_pending_writes--;
+	os_mutex_exit(os_file_count_mutex);
+
+# ifdef UNIV_DO_FLUSH
+	if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
+	    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+	    && !os_do_not_call_flush_at_each_write) {
+
+		/* Always do fsync to reduce the probability that when
+		the OS crashes, a database page is only partially
+		physically written to disk. */
+
+		ut_a(TRUE == os_file_flush(file));
+	}
+# endif /* UNIV_DO_FLUSH */
+
+	return(ret);
+#else
+	{
+		off_t	ret_offset;
+# ifndef UNIV_HOTBACKUP
+		ulint	i;
+# endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_writes++;
+		os_mutex_exit(os_file_count_mutex);
+
+# ifndef UNIV_HOTBACKUP
+		/* Protect the seek / write operation with a mutex */
+		i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+		os_mutex_enter(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+		ret_offset = lseek(file, offs, SEEK_SET);
+
+		if (ret_offset < 0) {
+			ret = -1;
+
+			goto func_exit;	/* skip the write, still release below */
+		}
+
+		ret = write(file, buf, (ssize_t)n);
+
+# ifdef UNIV_DO_FLUSH
+		if (srv_unix_file_flush_method != SRV_UNIX_LITTLESYNC
+		    && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+		    && !os_do_not_call_flush_at_each_write) {
+
+			/* Always do fsync to reduce the probability that when
+			the OS crashes, a database page is only partially
+			physically written to disk. */
+
+			ut_a(TRUE == os_file_flush(file));
+		}
+# endif /* UNIV_DO_FLUSH */
+
+func_exit:
+# ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+# endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_writes--;
+		os_mutex_exit(os_file_count_mutex);
+
+		return(ret);
+	}
+#endif
+}
+#endif
+
+/*******************************************************************//**
+Requests a synchronous positioned read operation.
+On Windows the read is done with SetFilePointer() and ReadFile() while
+holding one of the os_file_seek_mutexes (unless UNIV_HOTBACKUP is defined);
+elsewhere it is delegated to _os_file_pread(). If the read did not return
+exactly n bytes, os_file_handle_error() is consulted and the read is
+repeated for as long as it asks for a retry; otherwise a fatal error
+message is printed to stderr and ut_error is reached.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+_os_file_read(
+/*=========*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		n,	/*!< in: number of bytes to read */
+	trx_t*		trx)	/*!< in: handed on to _os_file_pread() in the
+				non-Windows branch; not referenced in the
+				Windows branch */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ibool		retry;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+	os_n_file_reads++;
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ut_ad(file);
+	ut_ad(buf);
+	ut_ad(n > 0);
+
+	low = (DWORD) offset;
+	high = (DWORD) offset_high;
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_reads++;
+	os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / read operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_reads--;
+		os_mutex_exit(os_file_count_mutex);
+
+		goto error_handling;
+	}
+
+	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_reads--;
+	os_mutex_exit(os_file_count_mutex);
+
+	if (ret && len == n) {
+		return(TRUE);
+	}
+#else /* __WIN__ */
+	ibool	retry;
+	ssize_t	ret;
+
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ret = _os_file_pread(file, buf, n, offset, offset_high, trx);
+
+	if ((ulint)ret == n) {
+
+		return(TRUE);
+	}
+
+	fprintf(stderr,
+		"InnoDB: Error: tried to read %lu bytes at offset %lu %lu.\n"
+		"InnoDB: Was only able to read %ld.\n",
+		(ulong)n, (ulong)offset_high,
+		(ulong)offset, (long)ret);
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+	retry = os_file_handle_error(NULL, "read");
+
+	if (retry) {
+		goto try_again;
+	}
+
+	fprintf(stderr,
+		"InnoDB: Fatal error: cannot read from file."
+		" OS error number %lu.\n",
+#ifdef __WIN__
+		(ulong) GetLastError()
+#else
+		(ulong) errno
+#endif
+		);
+	fflush(stderr);
+
+	ut_error;
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Requests a synchronous positioned read operation. This function does not do
+any error handling. In case of error it returns FALSE.
+On Windows the read is done with SetFilePointer() and ReadFile() while
+holding one of the os_file_seek_mutexes (unless UNIV_HOTBACKUP is defined);
+elsewhere it uses os_file_pread(). If the read did not return exactly n
+bytes, os_file_handle_error_no_exit() is consulted and the read is repeated
+for as long as it asks for a retry.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_read_no_error_handling(
+/*===========================*/
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		n)	/*!< in: number of bytes to read */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ibool		retry;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+	os_n_file_reads++;
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ut_ad(file);
+	ut_ad(buf);
+	ut_ad(n > 0);
+
+	low = (DWORD) offset;
+	high = (DWORD) offset_high;
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_reads++;
+	os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / read operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_reads--;
+		os_mutex_exit(os_file_count_mutex);
+
+		goto error_handling;
+	}
+
+	ret = ReadFile(file, buf, (DWORD) n, &len, NULL);
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_reads--;
+	os_mutex_exit(os_file_count_mutex);
+
+	if (ret && len == n) {
+		return(TRUE);
+	}
+#else /* __WIN__ */
+	ibool	retry;
+	ssize_t	ret;
+
+	os_bytes_read_since_printout += n;
+
+try_again:
+	ret = os_file_pread(file, buf, n, offset, offset_high);
+
+	if ((ulint)ret == n) {
+
+		return(TRUE);
+	}
+#endif /* __WIN__ */
+#ifdef __WIN__
+error_handling:
+#endif
+	retry = os_file_handle_error_no_exit(NULL, "read");
+
+	if (retry) {
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Reads the beginning of a stdio stream into a string buffer: the stream is
+rewound, at most size - 1 bytes are read into str, and a terminating NUL is
+stored after the bytes actually read. Nothing is done when size is 0. No
+error is reported to the caller. This function is mostly meant to be used
+with temporary files. */
+UNIV_INTERN
+void
+os_file_read_string(
+/*================*/
+	FILE*	file,	/*!< in: file to read from */
+	char*	str,	/*!< in: buffer where to read */
+	ulint	size)	/*!< in: size of buffer */
+{
+	if (size > 0) {
+		size_t	n_read;
+
+		rewind(file);
+
+		/* Leave room for the terminator */
+		n_read = fread(str, 1, size - 1, file);
+
+		*(str + n_read) = '\0';
+	}
+}
+
+/*******************************************************************//**
+Requests a synchronous write operation.
+On Windows the write is done with SetFilePointer() and WriteFile() while
+holding one of the os_file_seek_mutexes (unless UNIV_HOTBACKUP is defined),
+and is retried up to 100 times, one second apart, on ERROR_LOCK_VIOLATION.
+Elsewhere the write is delegated to os_file_pwrite(). If the write did not
+transfer exactly n bytes, a diagnostic is printed to stderr, but only while
+os_has_said_disk_full is not yet set; printing it sets that flag.
+@return	TRUE if request was successful, FALSE if fail */
+UNIV_INTERN
+ibool
+os_file_write(
+/*==========*/
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	const void*	buf,	/*!< in: buffer from which to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to write */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		n)	/*!< in: number of bytes to write */
+{
+#ifdef __WIN__
+	BOOL		ret;
+	DWORD		len;
+	DWORD		ret2;
+	DWORD		low;
+	DWORD		high;
+	ulint		n_retries	= 0;
+	ulint		err;
+#ifndef UNIV_HOTBACKUP
+	ulint		i;
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_a((offset & 0xFFFFFFFF) == offset);
+
+	os_n_file_writes++;
+
+	ut_ad(file);
+	ut_ad(buf);
+	ut_ad(n > 0);
+retry:
+	low = (DWORD) offset;
+	high = (DWORD) offset_high;
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_writes++;
+	os_mutex_exit(os_file_count_mutex);
+
+#ifndef UNIV_HOTBACKUP
+	/* Protect the seek / write operation with a mutex */
+	i = ((ulint) file) % OS_FILE_N_SEEK_MUTEXES;
+
+	os_mutex_enter(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	ret2 = SetFilePointer(file, low, &high, FILE_BEGIN);
+
+	if (ret2 == 0xFFFFFFFF && GetLastError() != NO_ERROR) {
+
+#ifndef UNIV_HOTBACKUP
+		os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+		os_mutex_enter(os_file_count_mutex);
+		os_n_pending_writes--;
+		os_mutex_exit(os_file_count_mutex);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: Error: File pointer positioning to"
+			" file %s failed at\n"
+			"InnoDB: offset %lu %lu. Operating system"
+			" error number %lu.\n"
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n",
+			name, (ulong) offset_high, (ulong) offset,
+			(ulong) GetLastError());
+
+		return(FALSE);
+	}
+
+	ret = WriteFile(file, buf, (DWORD) n, &len, NULL);
+
+	/* Always do fsync to reduce the probability that when the OS crashes,
+	a database page is only partially physically written to disk. */
+
+# ifdef UNIV_DO_FLUSH
+	if (!os_do_not_call_flush_at_each_write) {
+		ut_a(TRUE == os_file_flush(file));
+	}
+# endif /* UNIV_DO_FLUSH */
+
+#ifndef UNIV_HOTBACKUP
+	os_mutex_exit(os_file_seek_mutexes[i]);
+#endif /* !UNIV_HOTBACKUP */
+
+	os_mutex_enter(os_file_count_mutex);
+	os_n_pending_writes--;
+	os_mutex_exit(os_file_count_mutex);
+
+	if (ret && len == n) {
+
+		return(TRUE);
+	}
+
+	/* If some background file system backup tool is running, then, at
+	least in Windows 2000, we may get here a specific error. Let us
+	retry the operation 100 times, with 1 second waits. */
+
+	if (GetLastError() == ERROR_LOCK_VIOLATION && n_retries < 100) {
+
+		os_thread_sleep(1000000);
+
+		n_retries++;
+
+		goto retry;
+	}
+
+	if (!os_has_said_disk_full) {
+
+		err = (ulint)GetLastError();
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: Error: Write to file %s failed"
+			" at offset %lu %lu.\n"
+			"InnoDB: %lu bytes should have been written,"
+			" only %lu were written.\n"
+			"InnoDB: Operating system error number %lu.\n"
+			"InnoDB: Check that your OS and file system"
+			" support files of this size.\n"
+			"InnoDB: Check also that the disk is not full"
+			" or a disk quota exceeded.\n",
+			name, (ulong) offset_high, (ulong) offset,
+			(ulong) n, (ulong) len, (ulong) err);
+
+		if (strerror((int)err) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %lu means '%s'.\n",
+				(ulong) err, strerror((int)err));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#else
+	ssize_t	ret;
+	int	saved_errno;
+
+	ret = os_file_pwrite(file, buf, n, offset, offset_high);
+
+	/* Remember errno now: the timestamp and fprintf() calls below may
+	overwrite it before it is reported */
+	saved_errno = errno;
+
+	if ((ulint)ret == n) {
+
+		return(TRUE);
+	}
+
+	if (!os_has_said_disk_full) {
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: Error: Write to file %s failed"
+			" at offset %lu %lu.\n"
+			"InnoDB: %lu bytes should have been written,"
+			" only %ld were written.\n"
+			"InnoDB: Operating system error number %lu.\n"
+			"InnoDB: Check that your OS and file system"
+			" support files of this size.\n"
+			"InnoDB: Check also that the disk is not full"
+			" or a disk quota exceeded.\n",
+			name, (ulong) offset_high, (ulong) offset,
+			(ulong) n, (long int)ret,
+			(ulong) saved_errno);
+		if (strerror(saved_errno) != NULL) {
+			fprintf(stderr,
+				"InnoDB: Error number %lu means '%s'.\n",
+				(ulong) saved_errno, strerror(saved_errno));
+		}
+
+		fprintf(stderr,
+			"InnoDB: Some operating system error numbers"
+			" are described at\n"
+			"InnoDB: "
+			REFMAN "operating-system-error-codes.html\n");
+
+		os_has_said_disk_full = TRUE;
+	}
+
+	return(FALSE);
+#endif
+}
+
+/*******************************************************************//**
+Check the existence and type of the given file.
+A stat failure with errno ENOENT or ENOTDIR is reported as "does not
+exist" together with a successful return; any other stat failure is passed
+to os_file_handle_error_no_exit() and FALSE is returned. *type is written
+only when the file exists.
+@return	TRUE if call succeeded */
+UNIV_INTERN
+ibool
+os_file_status(
+/*===========*/
+	const char*	path,	/*!< in:	pathname of the file */
+	ibool*		exists,	/*!< out: TRUE if file exists */
+	os_file_type_t* type)	/*!< out: type of the file (if it exists) */
+{
+#ifdef __WIN__
+	struct _stat	st;
+
+	if (_stat(path, &st) != 0) {
+		if (errno == ENOENT || errno == ENOTDIR) {
+			/* file does not exist */
+			*exists = FALSE;
+
+			return(TRUE);
+		}
+
+		/* file exists, but stat call failed */
+		os_file_handle_error_no_exit(path, "stat");
+
+		return(FALSE);
+	}
+
+	if (st.st_mode & _S_IFDIR) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (st.st_mode & _S_IFREG) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	*exists = TRUE;
+
+	return(TRUE);
+#else
+	struct stat	st;
+
+	if (stat(path, &st) != 0) {
+		if (errno == ENOENT || errno == ENOTDIR) {
+			/* file does not exist */
+			*exists = FALSE;
+
+			return(TRUE);
+		}
+
+		/* file exists, but stat call failed */
+		os_file_handle_error_no_exit(path, "stat");
+
+		return(FALSE);
+	}
+
+	/* NOTE(review): stat() follows symbolic links, so the S_ISLNK
+	test below is presumably never true -- confirm whether lstat()
+	was intended */
+	if (S_ISDIR(st.st_mode)) {
+		*type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(st.st_mode)) {
+		*type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(st.st_mode)) {
+		*type = OS_FILE_TYPE_FILE;
+	} else {
+		*type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	*exists = TRUE;
+
+	return(TRUE);
+#endif
+}
+
+/*******************************************************************//**
+This function returns information about the specified file: its type, the
+ctime, atime and mtime timestamps, and its size, all copied from the result
+of a stat call. When the stat call fails, FALSE is returned and stat_info
+is left untouched; failures other than ENOENT and ENOTDIR are additionally
+passed to os_file_handle_error_no_exit().
+@return	TRUE if stat information found */
+UNIV_INTERN
+ibool
+os_file_get_status(
+/*===============*/
+	const char*	path,		/*!< in:	pathname of the file */
+	os_file_stat_t* stat_info)	/*!< information of a file in a
+					directory */
+{
+#ifdef __WIN__
+	struct _stat	st;
+
+	if (_stat(path, &st) != 0) {
+		if (errno != ENOENT && errno != ENOTDIR) {
+			/* file exists, but stat call failed */
+			os_file_handle_error_no_exit(path, "stat");
+		}
+
+		/* either way there is nothing to report */
+		return(FALSE);
+	}
+
+	if (st.st_mode & _S_IFDIR) {
+		stat_info->type = OS_FILE_TYPE_DIR;
+	} else if (st.st_mode & _S_IFREG) {
+		stat_info->type = OS_FILE_TYPE_FILE;
+	} else {
+		stat_info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	stat_info->ctime = st.st_ctime;
+	stat_info->atime = st.st_atime;
+	stat_info->mtime = st.st_mtime;
+	stat_info->size	 = st.st_size;
+
+	return(TRUE);
+#else
+	struct stat	st;
+
+	if (stat(path, &st) != 0) {
+		if (errno != ENOENT && errno != ENOTDIR) {
+			/* file exists, but stat call failed */
+			os_file_handle_error_no_exit(path, "stat");
+		}
+
+		/* either way there is nothing to report */
+		return(FALSE);
+	}
+
+	/* NOTE(review): stat() follows symbolic links, so the S_ISLNK
+	test below is presumably never true -- confirm whether lstat()
+	was intended */
+	if (S_ISDIR(st.st_mode)) {
+		stat_info->type = OS_FILE_TYPE_DIR;
+	} else if (S_ISLNK(st.st_mode)) {
+		stat_info->type = OS_FILE_TYPE_LINK;
+	} else if (S_ISREG(st.st_mode)) {
+		stat_info->type = OS_FILE_TYPE_FILE;
+	} else {
+		stat_info->type = OS_FILE_TYPE_UNKNOWN;
+	}
+
+	stat_info->ctime = st.st_ctime;
+	stat_info->atime = st.st_atime;
+	stat_info->mtime = st.st_mtime;
+	stat_info->size	 = st.st_size;
+
+	return(TRUE);
+#endif
+}
+
+/* path name separator character */
+#ifdef __WIN__
+#  define OS_FILE_PATH_SEPARATOR	'\\'
+#else
+#  define OS_FILE_PATH_SEPARATOR	'/'
+#endif
+
+/****************************************************************//**
+The function os_file_dirname returns a directory component of a
+null-terminated pathname string.  In the usual case, dirname returns
+the string up to, but not including, the final '/', and basename
+is the component following the final '/'.  Trailing '/' characters
+are not counted as part of the pathname.
+
+If path does not contain a slash, dirname returns the string ".".
+
+Concatenating the string returned by dirname, a "/", and the basename
+yields a complete pathname.
+
+The return value is a copy of the directory component of the pathname.
+The copy is allocated from heap. It is the caller's responsibility
+to free it after it is no longer needed.
+
+The following list of examples (taken from SUSv2) shows the strings
+returned by dirname and basename for different paths:
+
+       path	      dirname	     basename
+       "/usr/lib"     "/usr"	     "lib"
+       "/usr/"	      "/"	     "usr"
+       "usr"	      "."	     "usr"
+       "/"	      "/"	     "/"
+       "."	      "."	     "."
+       ".."	      "."	     ".."
+
+NOTE(review): the code below searches for the last
+OS_FILE_PATH_SEPARATOR without first stripping trailing separators, so
+for a path such as "/usr/" it copies the bytes in front of the final
+separator instead of producing the "/" shown in the table -- confirm
+whether callers rely on this.
+
+@return	own: directory component of the pathname */
+UNIV_INTERN
+char*
+os_file_dirname(
+/*============*/
+	const char*	path)	/*!< in: pathname */
+{
+	const char*	sep;
+	ulint		dir_len;
+
+	sep = strrchr(path, OS_FILE_PATH_SEPARATOR);
+
+	if (sep == NULL) {
+		/* No separator at all: the directory is "." */
+
+		return(mem_strdup("."));
+	}
+
+	/* Number of bytes in front of the last separator */
+	dir_len = sep - path;
+
+	if (dir_len == 0) {
+		/* The only separator is the leading one */
+
+		return(mem_strdup("/"));
+	}
+
+	/* Non-trivial directory component */
+
+	return(mem_strdupl(path, dir_len));
+}
+
+/****************************************************************//**
+Creates all missing subdirectories along the given path. The directory
+component of path is obtained with os_file_dirname(); if it is a single
+separator or a single '.', nothing is done. Otherwise, when
+os_file_status() reports that the directory does not exist, the function
+first recurses on it to create its own parents and then creates it with
+os_file_create_directory().
+@return	TRUE if call succeeded FALSE otherwise */
+UNIV_INTERN
+ibool
+os_file_create_subdirs_if_needed(
+/*=============================*/
+	const char*	path)	/*!< in: path name */
+{
+	char*		parent	= os_file_dirname(path);
+	ibool		ok	= TRUE;
+	ibool		parent_exists;
+	os_file_type_t	parent_type;
+
+	if (parent[0] != '\0' && parent[1] == '\0'
+	    && (parent[0] == OS_FILE_PATH_SEPARATOR || parent[0] == '.')) {
+		/* parent is root or cwd, nothing to do */
+	} else {
+		/* Test if parent exists */
+		ok = os_file_status(parent, &parent_exists, &parent_type);
+
+		if (ok && !parent_exists) {
+			/* Create the ancestors first, then parent itself */
+			ok = os_file_create_subdirs_if_needed(parent);
+
+			if (ok) {
+				ok = os_file_create_directory(parent, FALSE);
+			}
+		}
+	}
+
+	mem_free(parent);
+
+	return(ok);
+}
+
+#ifndef UNIV_HOTBACKUP
+/****************************************************************//**
+Looks up a slot of an aio array by position. The index is asserted to be
+smaller than the array's n_slots.
+@return	pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_get_nth_slot(
+/*======================*/
+	os_aio_array_t*		array,	/*!< in: aio array */
+	ulint			index)	/*!< in: index of the slot */
+{
+	ut_a(index < array->n_slots);
+
+	return(&array->slots[index]);
+}
+
+/************************************************************************//**
+Creates an aio wait array: allocates the array and its n slots, creates
+its mutex and its not_full and is_empty events (is_empty starts out set),
+and marks every slot as unreserved with pos equal to its index. With
+WIN_ASYNC_IO defined, each slot also gets its own event, whose handle is
+stored in the slot's OVERLAPPED structure and in native_events.
+@return	own: aio array */
+static
+os_aio_array_t*
+os_aio_array_create(
+/*================*/
+	ulint	n,		/*!< in: maximum number of pending aio operations
+				allowed; n must be divisible by n_segments */
+	ulint	n_segments)	/*!< in: number of segments in the aio array */
+{
+	os_aio_array_t*	arr;
+	ulint		pos;
+
+	ut_a(n > 0);
+	ut_a(n_segments > 0);
+
+	arr = ut_malloc(sizeof *arr);
+
+	arr->mutex = os_mutex_create(NULL);
+	arr->not_full = os_event_create(NULL);
+	arr->is_empty = os_event_create(NULL);
+
+	/* A freshly created array has nothing pending */
+	os_event_set(arr->is_empty);
+
+	arr->n_slots = n;
+	arr->n_segments = n_segments;
+	arr->n_reserved = 0;
+	arr->slots = ut_malloc(n * sizeof *arr->slots);
+#ifdef __WIN__
+	arr->native_events = ut_malloc(n * sizeof(os_native_event_t));
+#endif
+	for (pos = 0; pos < n; pos++) {
+		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(arr, pos);
+
+		slot->pos = pos;
+		slot->reserved = FALSE;
+#ifdef WIN_ASYNC_IO
+		slot->event = os_event_create(NULL);
+
+		slot->control.hEvent = slot->event->handle;
+
+		arr->native_events[pos] = slot->control.hEvent;
+#endif
+	}
+
+	return(arr);
+}
+
+/************************************************************************//**
+Frees an aio wait array: the per-slot events (WIN_ASYNC_IO only), the
+native_events table (__WIN__ only), the mutex, the not_full and is_empty
+events, the slot storage and finally the array itself. */
+static
+void
+os_aio_array_free(
+/*==============*/
+	os_aio_array_t*	array)	/*!< in, own: array to free */
+{
+#ifdef WIN_ASYNC_IO
+	ulint	pos;
+
+	for (pos = 0; pos < array->n_slots; pos++) {
+		os_event_free(os_aio_array_get_nth_slot(array, pos)->event);
+	}
+#endif /* WIN_ASYNC_IO */
+
+#ifdef __WIN__
+	ut_free(array->native_events);
+#endif /* __WIN__ */
+
+	/* Synchronization objects go first, then the memory */
+	os_mutex_free(array->mutex);
+	os_event_free(array->not_full);
+	os_event_free(array->is_empty);
+
+	ut_free(array->slots);
+	ut_free(array);
+}
+
+/***********************************************************************
+Initializes the asynchronous io system. Creates one array each for ibuf
+and log i/o. Also creates one array each for read and write where each
+array is divided logically into n_read_segs and n_write_segs
+respectively. The caller must create an i/o handler thread for each
+segment in these arrays. This function also creates the sync array.
+No i/o handler thread needs to be created for that.
+Segment 0 is labelled "insert buffer thread", segment 1 "log thread",
+the next n_read_segs segments "read thread" and the remaining ones
+"write thread" in srv_io_thread_function. os_aio_first_write_segment is
+derived from os_aio_first_read_segment, which this function reads but
+does not assign. One wait event per segment is created in
+os_aio_segment_wait_events. */
+UNIV_INTERN
+void
+os_aio_init(
+/*========*/
+	ulint	n_per_seg,	/*!< in: maximum number of pending aio
+				operations allowed per segment */
+	ulint	n_read_segs,	/*!< in: number of reader threads */
+	ulint	n_write_segs,	/*!< in: number of writer threads */
+	ulint	n_slots_sync)	/*!< in: number of slots in the sync aio
+				array */
+{
+	ulint	i;
+	ulint 	n_segments = 2 + n_read_segs + n_write_segs; /* 2 = ibuf + log */
+
+	ut_ad(n_segments >= 4);
+
+	os_io_init_simple();
+
+	for (i = 0; i < n_segments; i++) {
+		srv_set_io_thread_op_info(i, "not started yet");
+		os_aio_thread_buffer[i] = 0;
+		os_aio_thread_buffer_size[i] = 0;
+	}
+
+
+	/* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+
+	os_aio_first_write_segment = os_aio_first_read_segment + n_read_segs;
+	os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+	srv_io_thread_function[0] = "insert buffer thread";
+
+	os_aio_log_array = os_aio_array_create(n_per_seg, 1);
+
+	srv_io_thread_function[1] = "log thread";
+
+	os_aio_read_array = os_aio_array_create(n_per_seg,
+						n_read_segs);
+	for (i = 2; i < 2 + n_read_segs; i++) {
+		ut_a(i < SRV_MAX_N_IO_THREADS);
+		srv_io_thread_function[i] = "read thread";
+	}
+
+	os_aio_write_array = os_aio_array_create(n_per_seg,
+						 n_write_segs);
+	for (i = 2 + n_read_segs; i < n_segments; i++) {
+		ut_a(i < SRV_MAX_N_IO_THREADS);
+		srv_io_thread_function[i] = "write thread";
+	}
+
+	os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+
+	os_aio_n_segments = n_segments;
+
+	os_aio_validate();
+
+	os_aio_segment_wait_events = ut_malloc(n_segments * sizeof(void*));
+
+	for (i = 0; i < n_segments; i++) {
+		os_aio_segment_wait_events[i] = os_event_create(NULL);
+	}
+
+	os_last_printout = time(NULL);
+
+}
+
+/***********************************************************************
+Frees the asynchronous io system: releases the ibuf, log, read, write and
+sync aio arrays (in that order) and resets their global pointers to NULL,
+then frees every segment wait event and the table that holds them, and
+resets os_aio_n_segments to 0. */
+UNIV_INTERN
+void
+os_aio_free(void)
+/*=============*/
+{
+	os_aio_array_t**	owned[5];
+	ulint			i;
+
+	owned[0] = &os_aio_ibuf_array;
+	owned[1] = &os_aio_log_array;
+	owned[2] = &os_aio_read_array;
+	owned[3] = &os_aio_write_array;
+	owned[4] = &os_aio_sync_array;
+
+	for (i = 0; i < 5; i++) {
+		os_aio_array_free(*owned[i]);
+		*owned[i] = NULL;
+	}
+
+	for (i = 0; i < os_aio_n_segments; i++) {
+		os_event_free(os_aio_segment_wait_events[i]);
+	}
+
+	ut_free(os_aio_segment_wait_events);
+	os_aio_segment_wait_events = 0;
+	os_aio_n_segments = 0;
+}
+
+#ifdef WIN_ASYNC_IO
+/************************************************************************//**
+Sets the event of every slot in the given aio array. Used at shutdown with
+Windows async i/o so that the i/o threads waiting on those events wake
+up. */
+static
+void
+os_aio_array_wake_win_aio_at_shutdown(
+/*==================================*/
+	os_aio_array_t*	array)	/*!< in: aio array */
+{
+	ulint	pos	= 0;
+
+	while (pos < array->n_slots) {
+
+		os_event_set(array->slots[pos].event);
+		pos++;
+	}
+}
+#endif
+
+/************************************************************************//**
+Wakes up all async i/o threads so that they know to exit themselves in
+shutdown. */
+UNIV_INTERN
+void
+os_aio_wake_all_threads_at_shutdown(void)
+/*=====================================*/
+{
+	ulint	seg;
+
+#ifdef WIN_ASYNC_IO
+	/* Windows native aio: signal the slot events of the read, write,
+	ibuf and log arrays */
+	os_aio_array_wake_win_aio_at_shutdown(os_aio_read_array);
+	os_aio_array_wake_win_aio_at_shutdown(os_aio_write_array);
+	os_aio_array_wake_win_aio_at_shutdown(os_aio_ibuf_array);
+	os_aio_array_wake_win_aio_at_shutdown(os_aio_log_array);
+#endif
+	/* Simulated aio: signal the wait event of every segment */
+	seg = 0;
+
+	while (seg < os_aio_n_segments) {
+		os_event_set(os_aio_segment_wait_events[seg]);
+		seg++;
+	}
+}
+
+/************************************************************************//**
+Blocks until the is_empty event of os_aio_write_array is signaled, that
+is, until that array holds no pending writes. There can be other,
+synchronous, pending writes. */
+UNIV_INTERN
+void
+os_aio_wait_until_no_pending_writes(void)
+/*=====================================*/
+{
+	os_aio_array_t*	write_array = os_aio_write_array;
+
+	os_event_wait(write_array->is_empty);
+}
+
+/**********************************************************************//**
+Calculates the global segment number of a slot. Segment 0 is the ibuf
+array and segment 1 the log array; the read segments start at 2 and the
+write segments follow the read segments.
+@return segment number (which is the number used by, for example,
+i/o-handler threads) */
+static
+ulint
+os_aio_get_segment_no_from_slot(
+/*============================*/
+	os_aio_array_t*	array,	/*!< in: aio wait array */
+	os_aio_slot_t*	slot)	/*!< in: slot in this array */
+{
+	ulint	slots_per_seg;
+
+	if (array == os_aio_ibuf_array) {
+
+		return(0);
+	}
+
+	if (array == os_aio_log_array) {
+
+		return(1);
+	}
+
+	if (array == os_aio_read_array) {
+		slots_per_seg = os_aio_read_array->n_slots
+			/ os_aio_read_array->n_segments;
+
+		return(2 + slot->pos / slots_per_seg);
+	}
+
+	/* Only the write array is a valid argument from here on */
+	ut_a(array == os_aio_write_array);
+
+	slots_per_seg = os_aio_write_array->n_slots
+		/ os_aio_write_array->n_segments;
+
+	return(os_aio_read_array->n_segments + 2
+	       + slot->pos / slots_per_seg);
+}
+
+/**********************************************************************//**
+Maps a global segment number to the aio array that owns the segment and
+to the segment number inside that array.
+@return	local segment number within the aio array */
+static
+ulint
+os_aio_get_array_and_local_segment(
+/*===============================*/
+	os_aio_array_t** array,		/*!< out: aio wait array */
+	ulint		 global_segment)/*!< in: global segment number */
+{
+	ut_a(global_segment < os_aio_n_segments);
+
+	switch (global_segment) {
+	case 0:
+		/* The ibuf array has a single segment */
+		*array = os_aio_ibuf_array;
+
+		return(0);
+	case 1:
+		/* The log array has a single segment */
+		*array = os_aio_log_array;
+
+		return(0);
+	default:
+		break;
+	}
+
+	if (global_segment < os_aio_read_array->n_segments + 2) {
+		*array = os_aio_read_array;
+
+		return(global_segment - 2);
+	}
+
+	/* Everything after the read segments belongs to the write array */
+	*array = os_aio_write_array;
+
+	return(global_segment - (os_aio_read_array->n_segments + 2));
+}
+
+/*******************************************************************//**
+Requests for a slot in the aio array. If no slot is available, waits until
+not_full-event becomes signaled. The reserved slot is filled in with the
+request parameters and its status is set to OS_AIO_NOT_ISSUED.
+@return	pointer to slot */
+static
+os_aio_slot_t*
+os_aio_array_reserve_slot(
+/*======================*/
+	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
+	os_aio_array_t*	array,	/*!< in: aio array */
+	fil_node_t*	message1,/*!< in: message to be passed along with
+				the aio operation */
+	void*		message2,/*!< in: message to be passed along with
+				the aio operation */
+	os_file_t	file,	/*!< in: file handle */
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	void*		buf,	/*!< in: buffer where to read or from which
+				to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		len,	/*!< in: length of the block to read or write */
+	trx_t*		trx)	/*!< in: transaction; not referenced in the
+				body of this function */
+{
+	os_aio_slot_t*	slot;
+#ifdef WIN_ASYNC_IO
+	OVERLAPPED*	control;
+#endif
+	ulint		i;
+	ulint		slots_per_seg;
+	ulint		local_seg;
+
+	/* No need of a mutex. Only reading constant fields */
+	slots_per_seg = array->n_slots / array->n_segments;
+
+	/* We attempt to keep adjacent blocks in the same local
+	segment. This can help in merging IO requests when we are
+	doing simulated AIO. Only the low 32 bits of the file offset
+	take part in the choice; offset_high is not used here */
+	local_seg = (offset >> (UNIV_PAGE_SIZE_SHIFT + 6))
+		    % array->n_segments;
+
+loop:
+	/* The fullness check is repeated under the mutex after every
+	wakeup */
+	os_mutex_enter(array->mutex);
+
+	if (array->n_reserved == array->n_slots) {
+		os_mutex_exit(array->mutex);
+
+		if (!os_aio_use_native_aio) {
+			/* If the handler threads are suspended, wake them
+			so that we get more slots */
+
+			os_aio_simulated_wake_handler_threads();
+		}
+
+		os_event_wait(array->not_full);
+
+		goto loop;
+	}
+
+	/* First try to find a slot in the preferred local segment. Note
+	that the scan starts at the first slot of that segment but runs to
+	the end of the array, not only to the end of the segment */
+	for (i = local_seg * slots_per_seg; i < array->n_slots; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved == FALSE) {
+			goto found;
+		}
+	}
+
+	/* Fall back to a full scan. We are guaranteed to find a slot,
+	because n_reserved < n_slots was checked above under the mutex;
+	that is why this loop has no upper bound */
+	for (i = 0;; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved == FALSE) {
+			goto found;
+		}
+	}
+
+found:
+	ut_a(slot->reserved == FALSE);
+	array->n_reserved++;
+
+	/* Keep the is_empty and not_full events in step with n_reserved */
+	if (array->n_reserved == 1) {
+		os_event_reset(array->is_empty);
+	}
+
+	if (array->n_reserved == array->n_slots) {
+		os_event_reset(array->not_full);
+	}
+
+	/* Record the request in the slot */
+	slot->reserved = TRUE;
+	slot->reservation_time = time(NULL);
+	slot->message1 = message1;
+	slot->message2 = message2;
+	slot->file     = file;
+	slot->name     = name;
+	slot->len      = len;
+	slot->type     = type;
+	slot->buf      = buf;
+	slot->offset   = offset;
+	slot->offset_high = offset_high;
+//	slot->io_already_done = FALSE;
+	slot->status = OS_AIO_NOT_ISSUED;
+
+#ifdef WIN_ASYNC_IO
+	/* Prepare the OVERLAPPED structure of the slot with the file
+	offset and reset the slot event */
+	control = &(slot->control);
+	control->Offset = (DWORD)offset;
+	control->OffsetHigh = (DWORD)offset_high;
+	os_event_reset(slot->event);
+#endif
+
+	os_mutex_exit(array->mutex);
+
+	return(slot);
+}
+
+/*******************************************************************//**
+Marks a slot of the aio array as unreserved again and updates the
+not_full and is_empty events of the array accordingly. */
+static
+void
+os_aio_array_free_slot(
+/*===================*/
+	os_aio_array_t*	array,	/*!< in: aio array */
+	os_aio_slot_t*	slot)	/*!< in: pointer to slot */
+{
+	ut_ad(array);
+	ut_ad(slot);
+
+	os_mutex_enter(array->mutex);
+
+	ut_ad(slot->reserved);
+
+	/* Put the slot back into its initial state */
+	slot->status = OS_AIO_NOT_ISSUED;
+	slot->reserved = FALSE;
+
+	array->n_reserved -= 1;
+
+	/* The array was full before this call: it has room again */
+	if (array->n_reserved + 1 == array->n_slots) {
+		os_event_set(array->not_full);
+	}
+
+	/* The last reserved slot was released */
+	if (!array->n_reserved) {
+		os_event_set(array->is_empty);
+	}
+
+#ifdef WIN_ASYNC_IO
+	os_event_reset(slot->event);
+#endif
+	os_mutex_exit(array->mutex);
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if the aio array that owns the
+given segment has something to do. Every slot of the array is examined;
+if a reserved slot is in state OS_AIO_NOT_ISSUED or OS_AIO_DONE, the wait
+events of all segments belonging to that array are set. */
+static
+void
+os_aio_simulated_wake_handler_thread(
+/*=================================*/
+	ulint	global_segment)	/*!< in: the number of the segment in the aio
+				arrays */
+{
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+	ulint		n_slots;
+	ulint		pos;
+	ulint		seg;
+	ibool		work_found	= FALSE;
+
+	ut_ad(!os_aio_use_native_aio);
+
+	/* Only the owning array is needed here, the local segment number
+	is not */
+	os_aio_get_array_and_local_segment(&array, global_segment);
+
+	n_slots = array->n_slots;
+
+	/* Look through all slots of the array for a pending request */
+
+	os_mutex_enter(array->mutex);
+
+	for (pos = 0; pos < n_slots && !work_found; pos++) {
+		slot = os_aio_array_get_nth_slot(array, pos);
+
+		if (!slot->reserved) {
+
+			continue;
+		}
+
+		if (slot->status == OS_AIO_NOT_ISSUED
+		    || slot->status == OS_AIO_DONE) {
+			/* Found an i/o request */
+
+			work_found = TRUE;
+		}
+	}
+
+	os_mutex_exit(array->mutex);
+
+	if (!work_found) {
+
+		return;
+	}
+
+	if (array == os_aio_ibuf_array) {
+		os_event_set(os_aio_segment_wait_events[0]);
+
+	} else if (array == os_aio_log_array) {
+		os_event_set(os_aio_segment_wait_events[1]);
+
+	} else if (array == os_aio_read_array) {
+		/* Wake every read handler thread */
+		seg = os_aio_first_read_segment;
+
+		while (seg < os_aio_first_write_segment) {
+			os_event_set(os_aio_segment_wait_events[seg]);
+			seg++;
+		}
+
+	} else if (array == os_aio_write_array) {
+		/* Wake every write handler thread */
+		seg = os_aio_first_write_segment;
+
+		while (seg < os_aio_n_segments) {
+			os_event_set(os_aio_segment_wait_events[seg]);
+			seg++;
+		}
+
+	} else {
+		ut_a(0);
+	}
+}
+
+/**********************************************************************//**
+Wakes up simulated aio i/o-handler threads if they have something to do.
+Does nothing when native aio is in use. Also clears the flag that
+recommends the read threads to sleep. */
+UNIV_INTERN
+void
+os_aio_simulated_wake_handler_threads(void)
+/*=======================================*/
+{
+	if (!os_aio_use_native_aio) {
+		os_aio_recommend_sleep_for_read_threads = FALSE;
+
+		/* One call for each of the ibuf, log, read and write
+		arrays, identified by its first global segment */
+		os_aio_simulated_wake_handler_thread(0);
+		os_aio_simulated_wake_handler_thread(1);
+		os_aio_simulated_wake_handler_thread(
+			os_aio_first_read_segment);
+		os_aio_simulated_wake_handler_thread(
+			os_aio_first_write_segment);
+	}
+
+	/* Otherwise we do not use simulated aio: do nothing */
+}
+
+/**********************************************************************//**
+This function can be called if one wants to post a batch of reads and
+prefers an i/o-handler thread to handle them all at once later. You must
+call os_aio_simulated_wake_handler_threads later to ensure the threads
+are not left sleeping! The body is compiled only when __WIN__ is defined;
+elsewhere the function does nothing. */
+UNIV_INTERN
+void
+os_aio_simulated_put_read_threads_to_sleep(void)
+/*============================================*/
+{
+
+/* The idea of putting background IO threads to sleep is only for
+Windows when using simulated AIO. Windows XP seems to schedule
+background threads too eagerly to allow for coalescing during
+readahead requests. */
+#ifdef __WIN__
+	os_aio_array_t*	seg_array;
+	ulint		seg;
+
+	if (!os_aio_use_native_aio) {
+		os_aio_recommend_sleep_for_read_threads = TRUE;
+
+		/* Reset the wait event of every segment that belongs to
+		the read array */
+		for (seg = 0; seg < os_aio_n_segments; seg++) {
+			os_aio_get_array_and_local_segment(&seg_array, seg);
+
+			if (seg_array != os_aio_read_array) {
+
+				continue;
+			}
+
+			os_event_reset(os_aio_segment_wait_events[seg]);
+		}
+	}
+
+	/* Otherwise we do not use simulated aio: do nothing */
+#endif /* __WIN__ */
+}
+
+/*******************************************************************//**
+Requests an asynchronous i/o operation. A plain OS_AIO_SYNC request is
+executed directly with the synchronous file functions (unless Windows
+native aio is in use); any other request reserves a slot in the aio array
+selected by mode and is either submitted to Windows native aio or left
+for the simulated aio handler threads.
+@return	TRUE if request was queued successfully, FALSE if fail */
+UNIV_INTERN
+ibool
+os_aio(
+/*===*/
+	ulint		type,	/*!< in: OS_FILE_READ or OS_FILE_WRITE */
+	ulint		mode,	/*!< in: OS_AIO_NORMAL, ..., possibly ORed
+				to OS_AIO_SIMULATED_WAKE_LATER: the
+				last flag advises this function not to wake
+				i/o-handler threads, but the caller will
+				do the waking explicitly later, in this
+				way the caller can post several requests in
+				a batch; NOTE that the batch must not be
+				so big that it exhausts the slots in aio
+				arrays! NOTE that a simulated batch
+				may introduce hidden chances of deadlocks,
+				because i/os are not actually handled until
+				all have been posted: use with great
+				caution! */
+	const char*	name,	/*!< in: name of the file or path as a
+				null-terminated string */
+	os_file_t	file,	/*!< in: handle to a file */
+	void*		buf,	/*!< in: buffer where to read or from which
+				to write */
+	ulint		offset,	/*!< in: least significant 32 bits of file
+				offset where to read or write */
+	ulint		offset_high, /*!< in: most significant 32 bits of
+				offset */
+	ulint		n,	/*!< in: number of bytes to read or write */
+	fil_node_t*	message1,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	void*		message2,/*!< in: message for the aio handler
+				(can be used to identify a completed
+				aio operation); ignored if mode is
+				OS_AIO_SYNC */
+	trx_t*		trx)	/*!< in: transaction whose io_reads and
+				io_read counters are updated for queued
+				read requests, or NULL; also passed on to
+				_os_file_read for synchronous reads */
+{
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+#ifdef WIN_ASYNC_IO
+	ibool		retval;
+	BOOL		ret		= TRUE;
+	DWORD		len		= (DWORD) n;
+	struct fil_node_struct * dummy_mess1;
+	void*		dummy_mess2;
+	ulint		dummy_type;
+#endif
+	ulint		err		= 0;
+	ibool		retry;
+	ulint		wake_later;
+
+	ut_ad(file);
+	ut_ad(buf);
+	ut_ad(n > 0);
+	ut_ad(n % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_ad(offset % OS_FILE_LOG_BLOCK_SIZE == 0);
+	ut_ad(os_aio_validate());
+
+	/* Split the wake-later flag off from the actual mode */
+	wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
+	mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
+
+	if (mode == OS_AIO_SYNC
+#ifdef WIN_ASYNC_IO
+	    && !os_aio_use_native_aio
+#endif
+	    ) {
+		/* This is actually an ordinary synchronous read or write:
+		no need to use an i/o-handler thread. NOTE that if we use
+		Windows async i/o, Windows does not allow us to use
+		ordinary synchronous os_file_read etc. on the same file,
+		therefore we have built a special mechanism for synchronous
+		wait in the Windows case. */
+
+		if (type == OS_FILE_READ) {
+			return(_os_file_read(file, buf, offset,
+					    offset_high, n, trx));
+		}
+
+		ut_a(type == OS_FILE_WRITE);
+
+		return(os_file_write(name, file, buf, offset, offset_high, n));
+	}
+
+	/* Account the read against the transaction exactly once per
+	request. This must stay above the try_again label: a retried
+	request is still a single logical read and must not be counted
+	again on every retry. */
+	if (trx && type == OS_FILE_READ)
+	{
+		trx->io_reads++;
+		trx->io_read += n;
+	}
+
+try_again:
+	/* Select the aio array that serves this mode */
+	if (mode == OS_AIO_NORMAL) {
+		if (type == OS_FILE_READ) {
+			array = os_aio_read_array;
+		} else {
+			array = os_aio_write_array;
+		}
+	} else if (mode == OS_AIO_IBUF) {
+		ut_ad(type == OS_FILE_READ);
+		/* Reduce probability of deadlock bugs in connection with ibuf:
+		do not let the ibuf i/o handler sleep */
+
+		wake_later = FALSE;
+
+		array = os_aio_ibuf_array;
+	} else if (mode == OS_AIO_LOG) {
+
+		array = os_aio_log_array;
+	} else if (mode == OS_AIO_SYNC) {
+		array = os_aio_sync_array;
+	} else {
+		array = NULL; /* Eliminate compiler warning */
+		ut_error;
+	}
+
+	slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+					 name, buf, offset, offset_high, n, trx);
+	if (type == OS_FILE_READ) {
+		if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+			os_n_file_reads++;
+			os_bytes_read_since_printout += len;
+
+			ret = ReadFile(file, buf, (DWORD)n, &len,
+				       &(slot->control));
+#endif
+		} else {
+			/* Simulated aio: the handler thread performs the
+			i/o; wake it unless the caller does so later */
+			if (!wake_later) {
+				os_aio_simulated_wake_handler_thread(
+					os_aio_get_segment_no_from_slot(
+						array, slot));
+			}
+		}
+	} else if (type == OS_FILE_WRITE) {
+		if (os_aio_use_native_aio) {
+#ifdef WIN_ASYNC_IO
+			os_n_file_writes++;
+			ret = WriteFile(file, buf, (DWORD)n, &len,
+					&(slot->control));
+#endif
+		} else {
+			if (!wake_later) {
+				os_aio_simulated_wake_handler_thread(
+					os_aio_get_segment_no_from_slot(
+						array, slot));
+			}
+		}
+	} else {
+		ut_error;
+	}
+
+#ifdef WIN_ASYNC_IO
+	if (os_aio_use_native_aio) {
+		if ((ret && len == n)
+		    || (!ret && GetLastError() == ERROR_IO_PENDING)) {
+			/* aio was queued successfully! */
+
+			if (mode == OS_AIO_SYNC) {
+				/* We want a synchronous i/o operation on a
+				file where we also use async i/o: in Windows
+				we must use the same wait mechanism as for
+				async i/o */
+
+				retval = os_aio_windows_handle(ULINT_UNDEFINED,
+							       slot->pos,
+							       &dummy_mess1,
+							       &dummy_mess2,
+							       &dummy_type);
+
+				return(retval);
+			}
+
+			return(TRUE);
+		}
+
+		err = 1; /* Fall through the next if */
+	}
+#endif
+	if (err == 0) {
+		/* aio was queued successfully! */
+
+		return(TRUE);
+	}
+
+	/* Queueing failed: give the slot back and let the generic error
+	handler decide whether the request should be tried again */
+	os_aio_array_free_slot(array, slot);
+
+	retry = os_file_handle_error(name,
+				     type == OS_FILE_READ
+				     ? "aio read" : "aio write");
+	if (retry) {
+
+		goto try_again;
+	}
+
+	return(FALSE);
+}
+
+#ifdef WIN_ASYNC_IO
+/**********************************************************************//**
+This function is only used in Windows asynchronous i/o.
+Waits for an aio operation to complete. This function is used to wait
+for completed requests. The aio array of pending requests is divided
+into segments. The thread specifies which segment or slot it wants to wait
+for. NOTE: this function will also take care of freeing the aio slot,
+therefore no other thread is allowed to do the freeing!
+If the operation failed and os_file_handle_error() asks for a retry, the
+request is reissued and waited for inside this function.
+@return	TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_windows_handle(
+/*==================*/
+	ulint	segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads; if
+				this is ULINT_UNDEFINED, then it means that
+				sync aio is used, and this parameter is
+				ignored */
+	ulint	pos,		/*!< this parameter is used only in sync aio:
+				wait for the aio slot at this position */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,	/*!< out: the second message passed with the
+				aio request */
+	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
+{
+	ulint		orig_seg	= segment;
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+	ulint		n;
+	ulint		i;
+	ibool		ret_val;
+	BOOL		ret;
+	DWORD		len;
+	BOOL		retry		= FALSE;
+
+	/* Resolve the array to wait on; from here on 'segment' holds the
+	local segment number and is only used in the debug assertion */
+	if (segment == ULINT_UNDEFINED) {
+		array = os_aio_sync_array;
+		segment = 0;
+	} else {
+		segment = os_aio_get_array_and_local_segment(&array, segment);
+	}
+
+	/* NOTE! We only access constant fields in os_aio_array. Therefore
+	we do not have to acquire the protecting mutex yet */
+
+	ut_ad(os_aio_validate());
+	ut_ad(segment < array->n_segments);
+
+	/* n is the slot count of the whole array, not of one segment */
+	n = array->n_slots;
+
+	if (array == os_aio_sync_array) {
+		/* Sync aio: wait for the one slot the caller reserved */
+		os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+		i = pos;
+	} else {
+		srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+		/* Wait on the native events of all n slots of the array;
+		the result is used below as a slot index (presumably the
+		index of the signaled event -- confirm against
+		os_event_wait_multiple) */
+		i = os_event_wait_multiple(n,
+					   (array->native_events)
+					   );
+	}
+
+	os_mutex_enter(array->mutex);
+
+	slot = os_aio_array_get_nth_slot(array, i);
+
+	ut_a(slot->reserved);
+
+	if (orig_seg != ULINT_UNDEFINED) {
+		srv_set_io_thread_op_info(orig_seg,
+					  "get windows aio return value");
+	}
+
+	/* Fetch the outcome of the overlapped operation; the last
+	argument TRUE asks for a blocking wait */
+	ret = GetOverlappedResult(slot->file, &(slot->control), &len, TRUE);
+
+	/* The output parameters are filled in regardless of the outcome */
+	*message1 = slot->message1;
+	*message2 = slot->message2;
+
+	*type = slot->type;
+
+	/* Success requires that the full requested length was
+	transferred */
+	if (ret && len == slot->len) {
+		ret_val = TRUE;
+
+#ifdef UNIV_DO_FLUSH
+		if (slot->type == OS_FILE_WRITE
+		    && !os_do_not_call_flush_at_each_write) {
+			ut_a(TRUE == os_file_flush(slot->file));
+		}
+#endif /* UNIV_DO_FLUSH */
+	} else if (os_file_handle_error(slot->name, "Windows aio")) {
+
+		/* ret_val is assigned in the retry block below */
+		retry = TRUE;
+	} else {
+
+		ret_val = FALSE;
+	}
+
+	os_mutex_exit(array->mutex);
+
+	if (retry) {
+		/* retry failed read/write operation synchronously.
+		No need to hold array->mutex. The slot is still reserved:
+		it is freed only at the end of this function. */
+
+		switch (slot->type) {
+		case OS_FILE_WRITE:
+			ret = WriteFile(slot->file, slot->buf,
+					slot->len, &len,
+					&(slot->control));
+
+			break;
+		case OS_FILE_READ:
+			ret = ReadFile(slot->file, slot->buf,
+				       slot->len, &len,
+				       &(slot->control));
+
+			break;
+		default:
+			ut_error;
+		}
+
+		if (!ret && GetLastError() == ERROR_IO_PENDING) {
+			/* aio was queued successfully!
+			We want a synchronous i/o operation on a
+			file where we also use async i/o: in Windows
+			we must use the same wait mechanism as for
+			async i/o */
+
+			ret = GetOverlappedResult(slot->file,
+						  &(slot->control),
+						  &len, TRUE);
+		}
+
+		ret_val = ret && len == slot->len;
+	}
+
+	os_aio_array_free_slot(array, slot);
+
+	return(ret_val);
+}
+#endif
+
+/**********************************************************************//**
+Does simulated aio. This function should be called by an i/o-handler
+thread. Each call returns the messages of one completed request. If no
+request is pending, the function waits on the wait event of the segment
+and then looks again. Requests that are adjacent in the same file are
+merged into a single synchronous read or write; the remaining slots of
+such a batch are returned by subsequent calls.
+@return	TRUE if the aio operation succeeded */
+UNIV_INTERN
+ibool
+os_aio_simulated_handle(
+/*====================*/
+	ulint	global_segment,	/*!< in: the number of the segment in the aio
+				arrays to wait for; segment 0 is the ibuf
+				i/o thread, segment 1 the log i/o thread,
+				then follow the non-ibuf read threads, and as
+				the last are the non-ibuf write threads */
+	fil_node_t**message1,	/*!< out: the messages passed with the aio
+				request; note that also in the case where
+				the aio operation failed, these output
+				parameters are valid and can be used to
+				restart the operation, for example */
+	void**	message2,	/*!< out: the second message passed with the
+				aio request */
+	ulint*	type)		/*!< out: OS_FILE_WRITE or ..._READ */
+{
+	os_aio_array_t*	array;
+	ulint		segment;
+	os_aio_slot_t*	slot;
+	os_aio_slot_t*	slot2;
+	os_aio_slot_t*	consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
+	os_aio_slot_t*  lowest_request;
+	os_aio_slot_t*	oldest_request;
+	ulint		n_consecutive;
+	ulint		total_len;
+	ulint		offs;
+	ulint		lowest_offset;
+	ulint		oldest_offset;
+	ulint		biggest_age;
+	ulint		age;
+	byte*		combined_buf;
+	byte*		combined_buf2;
+	ibool		ret;
+	ulint		n;
+	ulint		i;
+	time_t          now;
+
+	/* Fix compiler warning */
+	*consecutive_ios = NULL;
+
+	/* The local segment number is only used in a debug assertion */
+	segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+restart:
+	/* NOTE! We only access constant fields in os_aio_array. Therefore
+	we do not have to acquire the protecting mutex yet */
+
+	srv_set_io_thread_op_info(global_segment,
+				  "looking for i/o requests (a)");
+	ut_ad(os_aio_validate());
+	ut_ad(segment < array->n_segments);
+
+	n = array->n_slots;
+
+	/* All n slots of the array are examined by the loops below,
+	irrespective of the local segment of this thread */
+
+	if (array == os_aio_read_array
+	    && os_aio_recommend_sleep_for_read_threads) {
+
+		/* Give other threads chance to add several i/os to the array
+		at once. */
+
+		goto recommended_sleep;
+	}
+
+	os_mutex_enter(array->mutex);
+
+	srv_set_io_thread_op_info(global_segment,
+				  "looking for i/o requests (b)");
+
+	/* Check if there is a slot for which the i/o has already been
+	done */
+
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+			if (os_aio_print_debug) {
+				fprintf(stderr,
+					"InnoDB: i/o for slot %lu"
+					" already done, returning\n",
+					(ulong) i);
+			}
+
+			ret = TRUE;
+
+			goto slot_io_done;
+		}
+	}
+
+	n_consecutive = 0;
+
+	/* If there are at least 2 seconds old requests, then pick the oldest
+	one to prevent starvation. If several requests have the same age,
+	then pick the one at the lowest offset. */
+
+	biggest_age = 0;
+	now = time(NULL);
+	oldest_request = lowest_request = NULL;
+	oldest_offset = lowest_offset = ULINT_MAX;
+
+	/* Find the oldest request and the request with the smallest offset */
+	for (i = 0; i < n; i++) {
+		slot = os_aio_array_get_nth_slot(array, i);
+
+		if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
+			age = (ulint)difftime(now, slot->reservation_time);
+
+			if ((age >= 2 && age > biggest_age)
+			    || (age >= 2 && age == biggest_age
+				&& slot->offset < oldest_offset)) {
+
+				/* Found an i/o request */
+				biggest_age = age;
+				oldest_request = slot;
+				oldest_offset = slot->offset;
+			}
+
+			/* Look for an i/o request at the lowest offset in
+			the array (we ignore the high 32 bits of the
+			offset) */
+			if (slot->offset < lowest_offset) {
+				/* Found an i/o request */
+				lowest_request = slot;
+				lowest_offset = slot->offset;
+			}
+		}
+	}
+
+	if (!lowest_request && !oldest_request) {
+
+		/* No i/o requested at the moment */
+
+		goto wait_for_io;
+	}
+
+	/* A request that is at least 2 seconds old takes precedence over
+	the request at the lowest offset */
+	if (oldest_request) {
+		slot = oldest_request;
+	} else {
+		slot = lowest_request;
+	}
+	consecutive_ios[0] = slot;
+	n_consecutive = 1;
+
+	/* Check if there are several consecutive blocks to read or write.
+	Each time a request directly following 'slot' is found, it becomes
+	the new 'slot' and the scan restarts from the first slot, until no
+	follower is found or OS_AIO_MERGE_N_CONSECUTIVE requests have been
+	collected */
+
+consecutive_loop:
+	for (i = 0; i < n; i++) {
+		slot2 = os_aio_array_get_nth_slot(array, i);
+
+		if (slot2->reserved && slot2 != slot
+		    && slot2->offset == slot->offset + slot->len
+		    /* check that sum does not wrap over */
+		    && slot->offset + slot->len > slot->offset
+		    && slot2->offset_high == slot->offset_high
+		    && slot2->type == slot->type
+		    && slot2->file == slot->file
+		    && slot2->status == OS_AIO_NOT_ISSUED) {
+
+			/* Found a consecutive i/o request */
+
+			consecutive_ios[n_consecutive] = slot2;
+			n_consecutive++;
+
+			slot = slot2;
+
+			if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
+
+				goto consecutive_loop;
+			} else {
+				break;
+			}
+		}
+	}
+
+	srv_set_io_thread_op_info(global_segment, "consecutive i/o requests");
+
+	/* We have now collected n_consecutive i/o requests in the array;
+	allocate a single buffer which can hold all data, and perform the
+	i/o */
+
+	total_len = 0;
+	slot = consecutive_ios[0];
+
+	/* Mark the collected requests as issued while still holding the
+	mutex; the selection loops above only pick slots whose status is
+	OS_AIO_NOT_ISSUED */
+	for (i = 0; i < n_consecutive; i++) {
+		total_len += consecutive_ios[i]->len;
+		ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_ISSUED;
+	}
+
+	if (n_consecutive == 1) {
+		/* We can use the buffer of the i/o request */
+		combined_buf = slot->buf;
+		combined_buf2 = NULL;
+	} else {
+		/* Use the buffer kept for this global segment; it is
+		reallocated only when it is too small and it is not freed
+		in this function. The extra UNIV_PAGE_SIZE bytes leave
+		room for aligning the start of the buffer below */
+		if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
+			if (os_aio_thread_buffer[global_segment])
+				ut_free(os_aio_thread_buffer[global_segment]);
+
+			os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
+			os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
+		}
+		combined_buf2 = os_aio_thread_buffer[global_segment];
+
+		ut_a(combined_buf2);
+
+		combined_buf = ut_align(combined_buf2, UNIV_PAGE_SIZE);
+	}
+
+	/* We release the array mutex for the time of the i/o: NOTE that
+	this assumes that there is just one i/o-handler thread serving
+	a single segment of slots! */
+
+	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_ISSUED);
+
+	os_mutex_exit(array->mutex);
+
+	if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+		/* Copy the buffers to the combined buffer */
+		offs = 0;
+
+		for (i = 0; i < n_consecutive; i++) {
+
+			ut_memcpy(combined_buf + offs, consecutive_ios[i]->buf,
+				  consecutive_ios[i]->len);
+			offs += consecutive_ios[i]->len;
+		}
+	}
+
+	srv_set_io_thread_op_info(global_segment, "doing file i/o");
+
+	if (os_aio_print_debug) {
+		fprintf(stderr,
+			"InnoDB: doing i/o of type %lu at offset %lu %lu,"
+			" length %lu\n",
+			(ulong) slot->type, (ulong) slot->offset_high,
+			(ulong) slot->offset, (ulong) total_len);
+	}
+
+	/* Do the i/o with ordinary, synchronous i/o functions: */
+	if (slot->type == OS_FILE_WRITE) {
+		ret = os_file_write(slot->name, slot->file, combined_buf,
+				    slot->offset, slot->offset_high,
+				    total_len);
+	} else {
+		ret = os_file_read(slot->file, combined_buf,
+				   slot->offset, slot->offset_high, total_len);
+	}
+
+	/* A failed i/o is treated as fatal here */
+	ut_a(ret);
+	srv_set_io_thread_op_info(global_segment, "file i/o done");
+
+#if 0
+	fprintf(stderr,
+		"aio: %lu consecutive %lu:th segment, first offs %lu blocks\n",
+		n_consecutive, global_segment, slot->offset / UNIV_PAGE_SIZE);
+#endif
+
+	if (slot->type == OS_FILE_READ && n_consecutive > 1) {
+		/* Copy the combined buffer to individual buffers */
+		offs = 0;
+
+		for (i = 0; i < n_consecutive; i++) {
+
+			ut_memcpy(consecutive_ios[i]->buf, combined_buf + offs,
+				  consecutive_ios[i]->len);
+			offs += consecutive_ios[i]->len;
+		}
+	}
+
+	/* When recovery statistics are enabled and recovery is on, record
+	how many requests were served by this single i/o */
+	if (srv_recovery_stats && recv_recovery_is_on() && n_consecutive) {
+		mutex_enter(&(recv_sys->mutex));
+		if (slot->type == OS_FILE_READ) {
+			recv_sys->stats_read_io_pages += n_consecutive;
+			recv_sys->stats_read_io_consecutive[n_consecutive - 1]++;
+		} else if (slot->type == OS_FILE_WRITE) {
+			recv_sys->stats_write_io_pages += n_consecutive;
+			recv_sys->stats_write_io_consecutive[n_consecutive - 1]++;
+		}
+		mutex_exit(&(recv_sys->mutex));
+	}
+
+	os_mutex_enter(array->mutex);
+
+	/* Mark the i/os done in slots */
+
+	for (i = 0; i < n_consecutive; i++) {
+		ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
+		consecutive_ios[i]->status = OS_AIO_DONE;
+	}
+
+	/* We return the messages for the first slot now, and if there were
+	several slots, the messages will be returned with subsequent calls
+	of this function */
+
+slot_io_done:
+	/* Reached with the array mutex held, either from the scan for
+	already completed slots or after performing the i/o above */
+
+	ut_a(slot->reserved);
+	ut_a(slot->status == OS_AIO_DONE);
+	slot->status = OS_AIO_CLAIMED;
+
+	*message1 = slot->message1;
+	*message2 = slot->message2;
+
+	*type = slot->type;
+
+	os_mutex_exit(array->mutex);
+
+	os_aio_array_free_slot(array, slot);
+
+	return(ret);
+
+wait_for_io:
+	srv_set_io_thread_op_info(global_segment, "resetting wait event");
+
+	/* We wait here until there again can be i/os in the segment
+	of this thread. The event is reset while the array mutex is still
+	held. */
+
+	os_event_reset(os_aio_segment_wait_events[global_segment]);
+
+	os_mutex_exit(array->mutex);
+
+recommended_sleep:
+	/* Reached without the array mutex held */
+	srv_set_io_thread_op_info(global_segment, "waiting for i/o request");
+
+	os_event_wait(os_aio_segment_wait_events[global_segment]);
+
+	if (os_aio_print_debug) {
+		fprintf(stderr,
+			"InnoDB: i/o handler thread for i/o"
+			" segment %lu wakes up\n",
+			(ulong) global_segment);
+	}
+
+	goto restart;
+}
+
+/**********************************************************************//**
+Validates the consistency of an aio array: the array must have slots and
+segments, every reserved slot must have a positive length, and the number
+of reserved slots must agree with array->n_reserved. A violation fails an
+ut_a() assertion.
+@return	TRUE if ok */
+static
+ibool
+os_aio_array_validate(
+/*==================*/
+	os_aio_array_t*	array)	/*!< in: aio wait array */
+{
+	ulint	n_found	= 0;
+	ulint	pos;
+
+	ut_a(array);
+
+	os_mutex_enter(array->mutex);
+
+	ut_a(array->n_slots > 0);
+	ut_a(array->n_segments > 0);
+
+	for (pos = 0; pos < array->n_slots; pos++) {
+		os_aio_slot_t*	slot = os_aio_array_get_nth_slot(array, pos);
+
+		if (!slot->reserved) {
+
+			continue;
+		}
+
+		ut_a(slot->len > 0);
+		n_found++;
+	}
+
+	ut_a(n_found == array->n_reserved);
+
+	os_mutex_exit(array->mutex);
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Validates the consistency of the aio system by validating each of the
+read, write, ibuf, log and sync arrays in turn.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+os_aio_validate(void)
+/*=================*/
+{
+	/* The return values are not inspected: the per-array check
+	asserts on any inconsistency it finds */
+	(void) os_aio_array_validate(os_aio_read_array);
+	(void) os_aio_array_validate(os_aio_write_array);
+	(void) os_aio_array_validate(os_aio_ibuf_array);
+	(void) os_aio_array_validate(os_aio_log_array);
+	(void) os_aio_array_validate(os_aio_sync_array);
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Prints info of the aio arrays: the state of every i/o thread, the number
+of reserved slots in each aio array, pending flushes and the i/o rates
+since the previous printout. Afterwards the counters used for the rate
+calculation are restarted. */
+UNIV_INTERN
+void
+os_aio_print(
+/*=========*/
+	FILE*	file)	/*!< in: file where to print */
+{
+	os_aio_array_t*	array;
+	os_aio_slot_t*	slot;
+	ulint		n_reserved;
+	time_t		current_time;
+	double		time_elapsed;
+	double		avg_bytes_read;
+	ulint		i;
+	ulint		k;
+
+	for (i = 0; i < srv_n_file_io_threads; i++) {
+		fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
+			srv_io_thread_op_info[i],
+			srv_io_thread_function[i]);
+
+#ifndef __WIN__
+		if (os_aio_segment_wait_events[i]->is_set) {
+			fprintf(file, " ev set");
+		}
+#endif
+
+		fprintf(file, "\n");
+	}
+
+	/* Print a label and the reserved slot count for each of the five
+	aio arrays, in the order read, write, ibuf, log, sync */
+	for (k = 0; k < 5; k++) {
+		switch (k) {
+		case 0:
+			fputs("Pending normal aio reads:", file);
+			array = os_aio_read_array;
+			break;
+		case 1:
+			fputs(", aio writes:", file);
+			array = os_aio_write_array;
+			break;
+		case 2:
+			fputs(",\n ibuf aio reads:", file);
+			array = os_aio_ibuf_array;
+			break;
+		case 3:
+			fputs(", log i/o's:", file);
+			array = os_aio_log_array;
+			break;
+		default:
+			fputs(", sync i/o's:", file);
+			array = os_aio_sync_array;
+			break;
+		}
+
+		ut_a(array);
+
+		os_mutex_enter(array->mutex);
+
+		ut_a(array->n_slots > 0);
+		ut_a(array->n_segments > 0);
+
+		/* Count the reserved slots and cross-check the result
+		against the counter kept in the array */
+		n_reserved = 0;
+
+		for (i = 0; i < array->n_slots; i++) {
+			slot = os_aio_array_get_nth_slot(array, i);
+
+			if (!slot->reserved) {
+
+				continue;
+			}
+
+			n_reserved++;
+#if 0
+			fprintf(stderr, "Reserved slot, messages %p %p\n",
+				(void*) slot->message1,
+				(void*) slot->message2);
+#endif
+			ut_a(slot->len > 0);
+		}
+
+		ut_a(array->n_reserved == n_reserved);
+
+		fprintf(file, " %lu", (ulong) n_reserved);
+
+		os_mutex_exit(array->mutex);
+	}
+
+	putc('\n', file);
+	current_time = time(NULL);
+
+	/* The small constant keeps the divisor below from being zero */
+	time_elapsed = 0.001 + difftime(current_time, os_last_printout);
+
+	fprintf(file,
+		"Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
+		"%lu OS file reads, %lu OS file writes, %lu OS fsyncs\n",
+		(ulong) fil_n_pending_log_flushes,
+		(ulong) fil_n_pending_tablespace_flushes,
+		(ulong) os_n_file_reads, (ulong) os_n_file_writes,
+		(ulong) os_n_fsyncs);
+
+	if (os_file_n_pending_preads != 0 || os_file_n_pending_pwrites != 0) {
+		fprintf(file,
+			"%lu pending preads, %lu pending pwrites\n",
+			(ulong) os_file_n_pending_preads,
+			(ulong) os_file_n_pending_pwrites);
+	}
+
+	/* Average read size since the previous printout */
+	if (os_n_file_reads != os_n_file_reads_old) {
+		avg_bytes_read = (double) os_bytes_read_since_printout
+			/ (os_n_file_reads - os_n_file_reads_old);
+	} else {
+		avg_bytes_read = 0.0;
+	}
+
+	fprintf(file,
+		"%.2f reads/s, %lu avg bytes/read,"
+		" %.2f writes/s, %.2f fsyncs/s\n",
+		(os_n_file_reads - os_n_file_reads_old)
+		/ time_elapsed,
+		(ulong)avg_bytes_read,
+		(os_n_file_writes - os_n_file_writes_old)
+		/ time_elapsed,
+		(os_n_fsyncs - os_n_fsyncs_old)
+		/ time_elapsed);
+
+	/* Start a new measurement interval */
+	os_n_file_reads_old = os_n_file_reads;
+	os_n_file_writes_old = os_n_file_writes;
+	os_n_fsyncs_old = os_n_fsyncs;
+	os_bytes_read_since_printout = 0;
+
+	os_last_printout = current_time;
+}
+
+/**********************************************************************//**
+Refreshes the statistics used to print per-second averages: remembers the
+current read, write and fsync counts as the new baseline, zeroes the
+count of bytes read and restarts the printout clock. */
+UNIV_INTERN
+void
+os_aio_refresh_stats(void)
+/*======================*/
+{
+	os_last_printout = time(NULL);
+
+	os_bytes_read_since_printout = 0;
+
+	os_n_fsyncs_old = os_n_fsyncs;
+	os_n_file_writes_old = os_n_file_writes;
+	os_n_file_reads_old = os_n_file_reads;
+}
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Checks that all slots in the system have been freed, that is, there are
+no pending io operations. The reserved slot counts of the read, write,
+ibuf, log and sync arrays are summed, each read under the mutex of its
+array.
+@return	TRUE if all free */
+UNIV_INTERN
+ibool
+os_aio_all_slots_free(void)
+/*=======================*/
+{
+	os_aio_array_t*	array;
+	ulint		n_pending	= 0;
+	ulint		k;
+
+	for (k = 0; k < 5; k++) {
+		switch (k) {
+		case 0:
+			array = os_aio_read_array;
+			break;
+		case 1:
+			array = os_aio_write_array;
+			break;
+		case 2:
+			array = os_aio_ibuf_array;
+			break;
+		case 3:
+			array = os_aio_log_array;
+			break;
+		default:
+			array = os_aio_sync_array;
+			break;
+		}
+
+		os_mutex_enter(array->mutex);
+
+		n_pending += array->n_reserved;
+
+		os_mutex_exit(array->mutex);
+	}
+
+	return(n_pending == 0 ? TRUE : FALSE);
+}
+#endif /* UNIV_DEBUG */
+
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/os/os0proc.c b/storage/xtradb/os/os0proc.c
new file mode 100644
index 00000000000..4567d96b6f4
--- /dev/null
+++ b/storage/xtradb/os/os0proc.c
@@ -0,0 +1,401 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0proc.c
+The interface to the operating system
+process control primitives
+
+Created 9/30/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0proc.h"
+#ifdef UNIV_NONINL
+#include "os0proc.ic"
+#endif
+
+#include "ut0mem.h"
+#include "ut0byte.h"
+
+/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
+MAP_ANON but MAP_ANON is marked as deprecated.
+If neither flag is available OS_MAP_ANON stays undefined, and
+os_mem_alloc_large()/os_mem_free_large() below use ut_malloc_low()/ut_free()
+instead of mmap()/munmap(). */
+#if defined(MAP_ANONYMOUS)
+#define OS_MAP_ANON	MAP_ANONYMOUS
+#elif defined(MAP_ANON)
+#define OS_MAP_ANON	MAP_ANON
+#endif
+
+/* Switch for the HugeTLB code paths in this file: they are skipped when
+this is FALSE or when os_large_page_size is 0 */
+UNIV_INTERN ibool os_use_large_pages;
+/* Large page size. This may be a boot-time option on some platforms */
+UNIV_INTERN ulint os_large_page_size;
+
+/****************************************************************//**
+Returns the id of the current process as a number. The number is not
+guaranteed to be unique. In Linux this is the 'process number' of the
+current thread, i.e. the number shown by 'top'; the Linux thread id is
+a different number.
+@return	process id as a number */
+UNIV_INTERN
+ulint
+os_proc_get_number(void)
+/*====================*/
+{
+	ulint	proc_no;
+
+#ifndef __WIN__
+	proc_no = (ulint)getpid();
+#else
+	proc_no = (ulint)GetCurrentProcessId();
+#endif
+
+	return(proc_no);
+}
+
+/****************************************************************//**
+Allocates large pages memory.
+On Linux builds with HAVE_LARGE_PAGES, and when os_use_large_pages and
+os_large_page_size are both set, a HugeTLB System V shared memory segment
+is tried first. If that path is skipped or fails, the platform allocator
+is used: VirtualAlloc() on Windows, ut_malloc_low() on NetWare or where no
+anonymous mmap flag exists, and an anonymous private mmap() otherwise.
+On the VirtualAlloc() and mmap() paths *n is always rounded up to a
+multiple of the system page size; on the HugeTLB path *n is rounded up to
+a multiple of os_large_page_size only when the allocation succeeds.
+@return	allocated memory; NULL if VirtualAlloc() or mmap() failed */
+UNIV_INTERN
+void*
+os_mem_alloc_large(
+/*===============*/
+	ulint*	n)			/*!< in/out: number of bytes */
+{
+	void*	ptr;
+	ulint	size;
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+	int shmid;
+	struct shmid_ds buf;
+
+	/* HugeTLB not requested or page size unknown: use the
+	conventional allocator below */
+	if (!os_use_large_pages || !os_large_page_size) {
+		goto skip;
+	}
+
+	/* Align block size to os_large_page_size */
+	ut_ad(ut_is_2pow(os_large_page_size));
+	size = ut_2pow_round(*n + (os_large_page_size - 1),
+			     os_large_page_size);
+
+	shmid = shmget(IPC_PRIVATE, (size_t)size, SHM_HUGETLB | SHM_R | SHM_W);
+	if (shmid < 0) {
+		fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to allocate"
+			" %lu bytes. errno %d\n", size, errno);
+		ptr = NULL;
+	} else {
+		ptr = shmat(shmid, NULL, 0);
+		if (ptr == (void *)-1) {
+			fprintf(stderr, "InnoDB: HugeTLB: Warning: Failed to"
+				" attach shared memory segment, errno %d\n",
+				errno);
+			ptr = NULL;
+		}
+
+		/* Remove the shared memory segment so that it will be
+		automatically freed after memory is detached or
+		process exits */
+		shmctl(shmid, IPC_RMID, &buf);
+	}
+
+	if (ptr) {
+		/* HugeTLB allocation succeeded: report the rounded size
+		to the caller and add it to the allocation total */
+		*n = size;
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_total_allocated_memory += size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+# ifdef UNIV_SET_MEM_TO_ZERO
+		memset(ptr, '\0', size);
+# endif
+		UNIV_MEM_ALLOC(ptr, size);
+		return(ptr);
+	}
+
+	/* HugeTLB allocation failed: warn and fall through to the
+	conventional allocator; *n has not been modified */
+	fprintf(stderr, "InnoDB HugeTLB: Warning: Using conventional"
+		" memory pool\n");
+skip:
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+
+#ifdef __WIN__
+	SYSTEM_INFO	system_info;
+	GetSystemInfo(&system_info);
+
+	/* Align block size to system page size */
+	ut_ad(ut_is_2pow(system_info.dwPageSize));
+	/* system_info.dwPageSize is only 32-bit. Casting to ulint is required
+	on 64-bit Windows. */
+	size = *n = ut_2pow_round(*n + (system_info.dwPageSize - 1),
+				  (ulint) system_info.dwPageSize);
+	ptr = VirtualAlloc(NULL, size, MEM_COMMIT | MEM_RESERVE,
+			   PAGE_READWRITE);
+	if (!ptr) {
+		fprintf(stderr, "InnoDB: VirtualAlloc(%lu bytes) failed;"
+			" Windows error %lu\n",
+			(ulong) size, (ulong) GetLastError());
+	} else {
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_total_allocated_memory += size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_ALLOC(ptr, size);
+	}
+#elif defined __NETWARE__ || !defined OS_MAP_ANON
+	/* No anonymous mmap available: plain heap allocation of exactly
+	*n bytes. ut_total_allocated_memory is not updated in this function
+	on this path.
+	NOTE(review): the meaning of the TRUE/FALSE arguments is defined by
+	ut_malloc_low() in ut0mem - confirm there */
+	size = *n;
+	ptr = ut_malloc_low(size, TRUE, FALSE);
+#else
+# ifdef HAVE_GETPAGESIZE
+	size = getpagesize();
+# else
+	size = UNIV_PAGE_SIZE;
+# endif
+	/* Align block size to system page size */
+	ut_ad(ut_is_2pow(size));
+	size = *n = ut_2pow_round(*n + (size - 1), size);
+	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | OS_MAP_ANON, -1, 0);
+	/* mmap() reports failure with (void*) -1, not NULL; convert it so
+	that callers only have to test for NULL */
+	if (UNIV_UNLIKELY(ptr == (void*) -1)) {
+		fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
+			" errno %lu\n",
+			(ulong) size, (ulong) errno);
+		ptr = NULL;
+	} else {
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_total_allocated_memory += size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_ALLOC(ptr, size);
+	}
+#endif
+	return(ptr);
+}
+
+/****************************************************************//**
+Frees large pages memory.
+The release method mirrors os_mem_alloc_large(): shmdt() for HugeTLB
+segments, otherwise VirtualFree(), ut_free() or munmap() depending on the
+platform. Except on the ut_free() path, ut_total_allocated_memory is
+decreased by size only when the release call succeeds. */
+UNIV_INTERN
+void
+os_mem_free_large(
+/*==============*/
+	void	*ptr,			/*!< in: pointer returned by
+					os_mem_alloc_large() */
+	ulint	size)			/*!< in: size returned by
+					os_mem_alloc_large() */
+{
+	/* Sanity check: the accounted total must cover the block that is
+	about to be freed */
+	os_fast_mutex_lock(&ut_list_mutex);
+	ut_a(ut_total_allocated_memory >= size);
+	os_fast_mutex_unlock(&ut_list_mutex);
+
+#if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+	/* shmdt() returns 0 on success. If it fails - presumably because
+	os_mem_alloc_large() fell back to the conventional allocator for
+	this block - control falls through to the platform code below */
+	if (os_use_large_pages && os_large_page_size && !shmdt(ptr)) {
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_a(ut_total_allocated_memory >= size);
+		ut_total_allocated_memory -= size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_FREE(ptr, size);
+		return;
+	}
+#endif /* HAVE_LARGE_PAGES && UNIV_LINUX */
+#ifdef __WIN__
+	/* When RELEASE memory, the size parameter must be 0.
+	Do not use MEM_RELEASE with MEM_DECOMMIT. */
+	if (!VirtualFree(ptr, 0, MEM_RELEASE)) {
+		fprintf(stderr, "InnoDB: VirtualFree(%p, %lu) failed;"
+			" Windows error %lu\n",
+			ptr, (ulong) size, (ulong) GetLastError());
+	} else {
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_a(ut_total_allocated_memory >= size);
+		ut_total_allocated_memory -= size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_FREE(ptr, size);
+	}
+#elif defined __NETWARE__ || !defined OS_MAP_ANON
+	/* Counterpart of the ut_malloc_low() path in os_mem_alloc_large();
+	no accounting is done in this function on this path */
+	ut_free(ptr);
+#else
+	if (munmap(ptr, size)) {
+		fprintf(stderr, "InnoDB: munmap(%p, %lu) failed;"
+			" errno %lu\n",
+			ptr, (ulong) size, (ulong) errno);
+	} else {
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_a(ut_total_allocated_memory >= size);
+		ut_total_allocated_memory -= size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_FREE(ptr, size);
+	}
+#endif
+}
+
+/****************************************************************//**
+Allocates or attaches and reuses shared memory segment.
+The content is not cleared automatically.
+The segment is first requested with IPC_CREAT | IPC_EXCL; if a segment
+with the given key already exists (EEXIST) it is opened and attached
+instead. On Linux builds with HAVE_LARGE_PAGES a HugeTLB segment is tried
+first when os_use_large_pages and os_large_page_size are set, and the
+normal-page path is used if that is skipped or fails.
+When the build lacks HAVE_SYS_IPC_H/HAVE_SYS_SHM_H a message is printed,
+NULL is returned and neither *n nor *is_new is written.
+@return	allocated memory, or NULL on failure */
+UNIV_INTERN
+void*
+os_shm_alloc(
+/*=========*/
+	ulint*	n,			/*!< in/out: number of bytes */
+	uint	key,			/*!< in: key passed to shmget()
+					to identify the segment */
+	ibool*	is_new)			/*!< out: TRUE if shmget() created
+					a new segment, FALSE if an
+					existing one was opened */
+{
+	void*	ptr;
+#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
+	ulint	size;
+	int	shmid;
+
+	*is_new = FALSE;
+	fprintf(stderr,
+		"InnoDB: The shared memory segment containing the buffer pool is: key  %#x (%d).\n",
+		key, key);
+# if defined HAVE_LARGE_PAGES && defined UNIV_LINUX
+	if (!os_use_large_pages || !os_large_page_size) {
+		goto skip;
+	}
+
+	/* Align block size to os_large_page_size */
+	ut_ad(ut_is_2pow(os_large_page_size));
+	size = ut_2pow_round(*n + (os_large_page_size - 1),
+			     os_large_page_size);
+
+	/* IPC_EXCL makes shmget() fail with EEXIST if the key is already
+	in use, which is how a new segment is told apart from a reused one */
+	shmid = shmget((key_t)key, (size_t)size,
+			IPC_CREAT | IPC_EXCL | SHM_HUGETLB | SHM_R | SHM_W);
+	if (shmid < 0) {
+		if (errno == EEXIST) {
+			fprintf(stderr,
+				"InnoDB: HugeTLB: The shared memory segment exists.\n");
+			shmid = shmget((key_t)key, (size_t)size,
+					SHM_HUGETLB | SHM_R | SHM_W);
+			if (shmid < 0) {
+				fprintf(stderr,
+					"InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
+					size, errno);
+				goto skip;
+			} else {
+				fprintf(stderr,
+					"InnoDB: HugeTLB: The existent shared memory segment is used.\n");
+			}
+		} else {
+			fprintf(stderr,
+				"InnoDB: HugeTLB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
+				size, errno);
+			goto skip;
+		}
+	} else {
+		*is_new = TRUE;
+		fprintf(stderr,
+			"InnoDB: HugeTLB: A new shared memory segment has been created .\n");
+	}
+
+	ptr = shmat(shmid, NULL, 0);
+	if (ptr == (void *)-1) {
+		fprintf(stderr,
+			"InnoDB: HugeTLB: Warning: Failed to attach shared memory segment, errno %d\n",
+			errno);
+		ptr = NULL;
+	}
+
+	if (ptr) {
+		*n = size;
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_total_allocated_memory += size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_ALLOC(ptr, size);
+		return(ptr);
+	}
+	/* NOTE(review): if shmat() failed on a segment that was just
+	created above, that segment is not removed here; the normal-page
+	shmget() below then uses the same key - confirm this is intended */
+skip:
+	/* The HugeTLB attempt did not yield memory: forget its outcome
+	and retry with normal pages */
+	*is_new = FALSE;
+# endif /* HAVE_LARGE_PAGES && defined UNIV_LINUX */
+# ifdef HAVE_GETPAGESIZE
+	size = getpagesize();
+# else
+	size = UNIV_PAGE_SIZE;
+# endif
+	/* Align block size to system page size */
+	ut_ad(ut_is_2pow(size));
+	size = *n = ut_2pow_round(*n + (size - 1), size);
+
+	shmid = shmget((key_t)key, (size_t)size,
+			IPC_CREAT | IPC_EXCL | SHM_R | SHM_W);
+	if (shmid < 0) {
+		if (errno == EEXIST) {
+			fprintf(stderr,
+				"InnoDB: A shared memory segment containing the buffer pool seems to already exist.\n");
+			shmid = shmget((key_t)key, (size_t)size,
+					SHM_R | SHM_W);
+			if (shmid < 0) {
+				fprintf(stderr,
+					"InnoDB: Warning: Failed to allocate %lu bytes. (reuse) errno %d\n",
+					size, errno);
+				ptr = NULL;
+				goto end;
+			} else {
+				fprintf(stderr,
+					"InnoDB: The existent shared memory segment is used.\n");
+			}
+		} else {
+			fprintf(stderr,
+				"InnoDB: Warning: Failed to allocate %lu bytes. (new) errno %d\n",
+				size, errno);
+			ptr = NULL;
+			goto end;
+		}
+	} else {
+		*is_new = TRUE;
+		fprintf(stderr,
+			"InnoDB: A new shared memory segment has been created.\n");
+	}
+
+	ptr = shmat(shmid, NULL, 0);
+	if (ptr == (void *)-1) {
+		/* NOTE(review): *is_new keeps the value set above, so it
+		can be TRUE while NULL is returned; callers should test the
+		return value first */
+		fprintf(stderr,
+			"InnoDB: Warning: Failed to attach shared memory segment, errno %d\n",
+			errno);
+		ptr = NULL;
+	}
+
+	if (ptr) {
+		*n = size;
+		os_fast_mutex_lock(&ut_list_mutex);
+		ut_total_allocated_memory += size;
+		os_fast_mutex_unlock(&ut_list_mutex);
+		UNIV_MEM_ALLOC(ptr, size);
+	}
+end:
+#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+	fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
+	ptr = NULL;
+#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+	return(ptr);
+}
+
+/****************************************************************//**
+Detaches a shared memory segment obtained from os_shm_alloc(). When the
+detach succeeds, size is subtracted from ut_total_allocated_memory; when
+it fails the accounting is left untouched. */
+UNIV_INTERN
+void
+os_shm_free(
+/*========*/
+	void	*ptr,			/*!< in: pointer returned by
+					os_shm_alloc() */
+	ulint	size)			/*!< in: size returned by
+					os_shm_alloc() */
+{
+	/* Sanity check of the accounting before anything is released */
+	os_fast_mutex_lock(&ut_list_mutex);
+	ut_a(ut_total_allocated_memory >= size);
+	os_fast_mutex_unlock(&ut_list_mutex);
+
+#if defined HAVE_SYS_IPC_H && HAVE_SYS_SHM_H
+	if (shmdt(ptr) != 0) {
+		/* Detach failed: nothing was released */
+		return;
+	}
+
+	os_fast_mutex_lock(&ut_list_mutex);
+	ut_a(ut_total_allocated_memory >= size);
+	ut_total_allocated_memory -= size;
+	os_fast_mutex_unlock(&ut_list_mutex);
+	UNIV_MEM_FREE(ptr, size);
+#else /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+	fprintf(stderr, "InnoDB: shared memory segment is not supported.\n");
+#endif /* HAVE_SYS_IPC_H && HAVE_SYS_SHM_H */
+}
diff --git a/storage/xtradb/os/os0sync.c b/storage/xtradb/os/os0sync.c
new file mode 100644
index 00000000000..f9ab58c2ee4
--- /dev/null
+++ b/storage/xtradb/os/os0sync.c
@@ -0,0 +1,762 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0sync.c
+The interface to the operating system
+synchronization primitives.
+
+Created 9/6/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0sync.h"
+#ifdef UNIV_NONINL
+#include "os0sync.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#else
+#include <sys/time.h>
+#include <time.h>
+#endif
+
+#include "ut0mem.h"
+#include "srv0start.h"
+
+/* Type definition for an operating system mutex struct */
+struct os_mutex_struct{
+	os_event_t	event;	/*!< Used by sync0arr.c for queuing threads */
+	void*		handle;	/*!< OS handle to mutex */
+	ulint		count;	/*!< we use this counter to check
+				that the same thread does not
+				recursively lock the mutex: we
+				do not assume that the OS mutex
+				supports recursive locking, though
+				NT seems to do that */
+	UT_LIST_NODE_T(os_mutex_str_t) os_mutex_list;
+				/* list of all 'slow' OS mutexes created */
+};
+
+/** Mutex protecting counts and the lists of OS mutexes and events */
+UNIV_INTERN os_mutex_t	os_sync_mutex;
+/** TRUE if os_sync_mutex has been initialized */
+static ibool		os_sync_mutex_inited	= FALSE;
+/** TRUE when os_sync_free() is being executed */
+static ibool		os_sync_free_called	= FALSE;
+
+/** This is incremented by 1 in os_thread_create and decremented by 1 in
+os_thread_exit */
+UNIV_INTERN ulint	os_thread_count		= 0;
+
+/** The list of all events created */
+static UT_LIST_BASE_NODE_T(os_event_struct_t)	os_event_list;
+
+/** The list of all OS 'slow' mutexes */
+static UT_LIST_BASE_NODE_T(os_mutex_str_t)	os_mutex_list;
+
+/** Number of events: incremented in os_event_create(), decremented in
+os_event_free() and os_event_free_internal() */
+UNIV_INTERN ulint	os_event_count		= 0;
+/** Number of 'slow' mutexes: incremented in os_mutex_create(),
+decremented in os_mutex_free() */
+UNIV_INTERN ulint	os_mutex_count		= 0;
+/** Number of fast mutexes: incremented in os_fast_mutex_init(),
+decremented in os_fast_mutex_free() */
+UNIV_INTERN ulint	os_fast_mutex_count	= 0;
+
+/* Because a mutex is embedded inside an event and there is an
+event embedded inside a mutex, on free, this generates a recursive call.
+This version of the free event function doesn't acquire the global lock */
+static void os_event_free_internal(os_event_t	event);
+
+/*********************************************************//**
+Initializes global event and OS 'slow' mutex lists, and creates
+os_sync_mutex, the mutex that protects them. */
+UNIV_INTERN
+void
+os_sync_init(void)
+/*==============*/
+{
+	UT_LIST_INIT(os_event_list);
+	UT_LIST_INIT(os_mutex_list);
+
+	/* Both markers must say "not available" while os_sync_mutex itself
+	is being created: os_event_create() checks os_sync_mutex != NULL and
+	os_mutex_create()/os_fast_mutex_init() check os_sync_mutex_inited
+	before trying to reserve it */
+	os_sync_mutex = NULL;
+	os_sync_mutex_inited = FALSE;
+
+	os_sync_mutex = os_mutex_create(NULL);
+
+	/* From here on list and counter updates are done under
+	os_sync_mutex */
+	os_sync_mutex_inited = TRUE;
+}
+
+/*********************************************************//**
+Frees created events and OS 'slow' mutexes. All events still in
+os_event_list are freed first, then all mutexes still in os_mutex_list. */
+UNIV_INTERN
+void
+os_sync_free(void)
+/*==============*/
+{
+	os_event_t	event;
+	os_mutex_t	mutex;
+
+	/* While this flag is set os_mutex_free() does not free the event
+	embedded in a mutex: those events are in os_event_list too and are
+	freed by the first loop below */
+	os_sync_free_called = TRUE;
+	event = UT_LIST_GET_FIRST(os_event_list);
+
+	/* os_event_free() removes the event from the list, so taking the
+	first element again walks through the whole list */
+	while (event) {
+
+		os_event_free(event);
+
+		event = UT_LIST_GET_FIRST(os_event_list);
+	}
+
+	mutex = UT_LIST_GET_FIRST(os_mutex_list);
+
+	while (mutex) {
+		if (mutex == os_sync_mutex) {
+			/* Set the flag to FALSE so that we do not try to
+			reserve os_sync_mutex any more in remaining freeing
+			operations in shutdown */
+			os_sync_mutex_inited = FALSE;
+		}
+
+		os_mutex_free(mutex);
+
+		mutex = UT_LIST_GET_FIRST(os_mutex_list);
+	}
+	os_sync_free_called = FALSE;
+}
+
+/*********************************************************//**
+Creates an event semaphore, i.e., a semaphore which may just have two
+states: signaled and nonsignaled. The created event is manual reset: it
+must be reset explicitly by calling os_event_reset(). The new event is
+created in the nonsignaled state and is added to os_event_list.
+@return	the event handle */
+UNIV_INTERN
+os_event_t
+os_event_create(
+/*============*/
+	const char*	name)	/*!< in: the name of the event, if NULL
+				the event is created without a name */
+{
+#ifdef __WIN__
+	os_event_t event;
+
+	event = ut_malloc(sizeof(struct os_event_struct));
+
+	event->handle = CreateEvent(NULL, /* No security attributes */
+				    TRUE, /* Manual reset */
+				    FALSE, /* Initial state nonsignaled */
+				    (LPCTSTR) name);
+	/* NOTE(review): a CreateEvent() failure is only reported; the event
+	with its NULL handle is still added to the list and returned */
+	if (!event->handle) {
+		fprintf(stderr,
+			"InnoDB: Could not create a Windows event semaphore;"
+			" Windows error %lu\n",
+			(ulong) GetLastError());
+	}
+#else /* Unix */
+	os_event_t	event;
+
+	/* The name is only used by the Windows implementation */
+	UT_NOT_USED(name);
+
+	event = ut_malloc(sizeof(struct os_event_struct));
+
+	os_fast_mutex_init(&(event->os_mutex));
+
+	ut_a(0 == pthread_cond_init(&(event->cond_var), NULL));
+
+	event->is_set = FALSE;
+
+	/* We return this value in os_event_reset(), which can then be
+	used to pass to the os_event_wait_low(). The value of zero
+	is reserved in os_event_wait_low() for the case when the
+	caller does not want to pass any signal_count value. To
+	distinguish between the two cases we initialize signal_count
+	to 1 here. */
+	event->signal_count = 1;
+#endif /* __WIN__ */
+
+	/* The os_sync_mutex can be NULL because during startup an event
+	can be created [ because it's embedded in the mutex/rwlock ] before
+	this module has been initialized */
+	if (os_sync_mutex != NULL) {
+		os_mutex_enter(os_sync_mutex);
+	}
+
+	/* Put to the list of events */
+	UT_LIST_ADD_FIRST(os_event_list, os_event_list, event);
+
+	os_event_count++;
+
+	if (os_sync_mutex != NULL) {
+		os_mutex_exit(os_sync_mutex);
+	}
+
+	return(event);
+}
+
+/**********************************************************//**
+Puts an event semaphore into the signaled state so that waiting threads
+may proceed. On Unix, setting an event that is already set has no
+effect; otherwise signal_count is incremented and all waiters on the
+condition variable are woken up. */
+UNIV_INTERN
+void
+os_event_set(
+/*=========*/
+	os_event_t	event)	/*!< in: event to set */
+{
+	ut_a(event);
+
+#ifndef __WIN__
+	os_fast_mutex_lock(&(event->os_mutex));
+
+	if (!event->is_set) {
+		/* Transition nonsignaled -> signaled */
+		event->is_set = TRUE;
+		event->signal_count += 1;
+		ut_a(0 == pthread_cond_broadcast(&(event->cond_var)));
+	}
+
+	os_fast_mutex_unlock(&(event->os_mutex));
+#else
+	ut_a(SetEvent(event->handle));
+#endif
+}
+
+/**********************************************************//**
+Puts an event semaphore back into the nonsignaled state, so that threads
+which wait on it afterwards will block.
+Pass the return value to os_event_wait_low() when the caller must not
+block if os_event_set() is called between this os_event_reset() and the
+os_event_wait_low() call. See comments for os_event_wait_low().
+@return	current signal_count; always 0 on Windows */
+UNIV_INTERN
+ib_int64_t
+os_event_reset(
+/*===========*/
+	os_event_t	event)	/*!< in: event to reset */
+{
+	ib_int64_t	sig_count = 0;
+
+	ut_a(event);
+
+#ifndef __WIN__
+	os_fast_mutex_lock(&(event->os_mutex));
+
+	if (event->is_set) {
+		event->is_set = FALSE;
+	}
+
+	/* Read under the event mutex, together with the state change */
+	sig_count = event->signal_count;
+
+	os_fast_mutex_unlock(&(event->os_mutex));
+#else
+	ut_a(ResetEvent(event->handle));
+#endif
+	return(sig_count);
+}
+
+/**********************************************************//**
+Frees an event object without reserving os_sync_mutex: the OS resources
+are destroyed, the event is taken off os_event_list, os_event_count is
+decremented and the memory is released. */
+static
+void
+os_event_free_internal(
+/*===================*/
+	os_event_t	event)	/*!< in: event to free */
+{
+	ut_a(event);
+
+#ifndef __WIN__
+	/* This is to avoid freeing the mutex twice */
+	os_fast_mutex_free(&(event->os_mutex));
+
+	ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
+#else
+	ut_a(CloseHandle(event->handle));
+#endif
+
+	/* Unlink from the global list of events; no global lock is taken */
+	UT_LIST_REMOVE(os_event_list, os_event_list, event);
+	os_event_count--;
+
+	ut_free(event);
+}
+
+/**********************************************************//**
+Frees an event object: the OS resources are destroyed, the event is taken
+off os_event_list under os_sync_mutex, and the memory is released. */
+UNIV_INTERN
+void
+os_event_free(
+/*==========*/
+	os_event_t	event)	/*!< in: event to free */
+
+{
+	ut_a(event);
+
+#ifndef __WIN__
+	os_fast_mutex_free(&(event->os_mutex));
+	ut_a(0 == pthread_cond_destroy(&(event->cond_var)));
+#else
+	ut_a(CloseHandle(event->handle));
+#endif
+
+	/* The list and the counter are protected by os_sync_mutex */
+	os_mutex_enter(os_sync_mutex);
+
+	UT_LIST_REMOVE(os_event_list, os_event_list, event);
+	os_event_count--;
+
+	os_mutex_exit(os_sync_mutex);
+
+	ut_free(event);
+}
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state. If
+srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS this also exits the
+waiting thread when the event becomes signaled (or immediately if the
+event is already in the signaled state).
+
+Typically, if the event has been signalled after the os_event_reset()
+we'll return immediately because event->is_set == TRUE.
+There are, however, situations (e.g.: sync_array code) where we may
+lose this information. For example:
+
+thread A calls os_event_reset()
+thread B calls os_event_set()   [event->is_set == TRUE]
+thread C calls os_event_reset() [event->is_set == FALSE]
+thread A calls os_event_wait()  [infinite wait!]
+thread C calls os_event_wait()  [infinite wait!]
+
+Where such a scenario is possible, to avoid infinite wait, the
+value returned by os_event_reset() should be passed in as
+reset_sig_count.
+
+On Windows reset_sig_count is ignored and the wait is done directly on
+the native event handle. */
+UNIV_INTERN
+void
+os_event_wait_low(
+/*==============*/
+	os_event_t	event,		/*!< in: event to wait */
+	ib_int64_t	reset_sig_count)/*!< in: zero or the value
+					returned by previous call of
+					os_event_reset(). */
+{
+#ifdef __WIN__
+	DWORD	err;
+
+	ut_a(event);
+
+	UT_NOT_USED(reset_sig_count);
+
+	/* Specify an infinite time limit for waiting */
+	err = WaitForSingleObject(event->handle, INFINITE);
+
+	ut_a(err == WAIT_OBJECT_0);
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
+	}
+#else
+	ib_int64_t	old_signal_count;
+
+	os_fast_mutex_lock(&(event->os_mutex));
+
+	/* Reference value for detecting an os_event_set() call: either the
+	count the caller saw at reset time, or the count as of now */
+	if (reset_sig_count) {
+		old_signal_count = reset_sig_count;
+	} else {
+		old_signal_count = event->signal_count;
+	}
+
+	for (;;) {
+		/* Done if the event is set now, or if it has been set at
+		least once since the reference count was taken (even if it
+		has been reset again in the meantime) */
+		if (event->is_set == TRUE
+		    || event->signal_count != old_signal_count) {
+
+			os_fast_mutex_unlock(&(event->os_mutex));
+
+			if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+
+				os_thread_exit(NULL);
+			}
+			/* Ok, we may return */
+
+			return;
+		}
+
+		pthread_cond_wait(&(event->cond_var), &(event->os_mutex));
+
+		/* Solaris manual said that spurious wakeups may occur: we
+		have to check if the event really has been signaled after
+		we came here to wait */
+	}
+#endif
+}
+
+/**********************************************************//**
+Waits for an event object until it is in the signaled state or
+a timeout is exceeded. On Unix, if srv_shutdown_state ==
+SRV_SHUTDOWN_EXIT_THREADS when the wait ends, the calling thread exits
+instead of returning.
+@return	0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */
+UNIV_INTERN
+ulint
+os_event_wait_time(
+/*===============*/
+	os_event_t	event,	/*!< in: event to wait */
+	ulint		wtime)	/*!< in: timeout in microseconds, or
+				OS_SYNC_INFINITE_TIME */
+{
+#ifdef __WIN__
+	DWORD	err;
+
+	ut_a(event);
+
+	if (wtime != OS_SYNC_INFINITE_TIME) {
+		/* Convert microseconds to milliseconds BEFORE narrowing to
+		DWORD. A cast binds tighter than '/', so casting first
+		would truncate wtime to 32 bits where ulint is wider than
+		DWORD (e.g. on 64-bit Windows) and long timeouts would
+		wrap around */
+		err = WaitForSingleObject(event->handle,
+					  (DWORD) (wtime / 1000));
+	} else {
+		err = WaitForSingleObject(event->handle, INFINITE);
+	}
+
+	if (err == WAIT_OBJECT_0) {
+
+		return(0);
+	} else if (err == WAIT_TIMEOUT) {
+
+		return(OS_SYNC_TIME_EXCEEDED);
+	} else {
+		ut_error;
+		return(1000000); /* dummy value to eliminate compiler warn. */
+	}
+#else
+	int	err;
+	int	ret = 0;
+	ulint	tmp;
+	ib_int64_t	old_count;
+	struct timeval tv_start;
+	struct timespec timeout;
+
+	if (wtime == OS_SYNC_INFINITE_TIME) {
+		os_event_wait(event);
+		return 0;
+	}
+
+	/* Compute the absolute point in time at which to time out. */
+	gettimeofday(&tv_start, NULL);
+	tmp = tv_start.tv_usec + wtime;
+	timeout.tv_sec = tv_start.tv_sec + (tmp / 1000000);
+	timeout.tv_nsec = (tmp % 1000000) * 1000;
+
+	os_fast_mutex_lock(&(event->os_mutex));
+	old_count = event->signal_count;
+
+	for (;;) {
+		/* Done if the event is set now, or was set at least once
+		since the wait started */
+		if (event->is_set == TRUE || event->signal_count != old_count)
+			break;
+
+		/* The deadline is absolute, so waiting again after a
+		wakeup that did not signal the event does not extend the
+		total timeout */
+		err = pthread_cond_timedwait(&(event->cond_var),
+					     &(event->os_mutex), &timeout);
+		if (err == ETIMEDOUT) {
+			ret = OS_SYNC_TIME_EXCEEDED;
+			break;
+		}
+	}
+
+	os_fast_mutex_unlock(&(event->os_mutex));
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+
+		os_thread_exit(NULL);
+	}
+
+	return ret;
+#endif
+}
+
+#ifdef __WIN__
+/**********************************************************//**
+Blocks until at least one event of an OS native event array is in the
+signaled state. If srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS when
+the wait ends, the calling thread exits instead of returning.
+@return	index of the event which was signaled */
+UNIV_INTERN
+ulint
+os_event_wait_multiple(
+/*===================*/
+	ulint			n,	/*!< in: number of events in the
+					array */
+	os_native_event_t*	native_event_array)
+					/*!< in: pointer to an array of event
+					handles */
+{
+	DWORD	wait_res;
+
+	ut_a(native_event_array);
+	ut_a(n > 0);
+
+	/* FALSE: wait for any 1 event; INFINITE: no time limit */
+	wait_res = WaitForMultipleObjects((DWORD) n, native_event_array,
+					  FALSE, INFINITE);
+
+	ut_a(wait_res >= WAIT_OBJECT_0);	/* NOTE: Pointless comparison */
+	ut_a(wait_res < WAIT_OBJECT_0 + n);
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		os_thread_exit(NULL);
+	}
+
+	return(wait_res - WAIT_OBJECT_0);
+}
+#endif
+
+/*********************************************************//**
+Creates an operating system mutex semaphore. Because these are slow, the
+mutex semaphore of InnoDB itself (mutex_t) should be used where possible.
+The new mutex gets its own event (created with os_event_create()) and is
+added to os_mutex_list.
+@return	the mutex handle */
+UNIV_INTERN
+os_mutex_t
+os_mutex_create(
+/*============*/
+	const char*	name)	/*!< in: the name of the mutex, if NULL
+				the mutex is created without a name */
+{
+#ifdef __WIN__
+	HANDLE		mutex;
+	os_mutex_t	mutex_str;
+
+	mutex = CreateMutex(NULL,	/* No security attributes */
+			    FALSE,		/* Initial state: no owner */
+			    (LPCTSTR) name);
+	ut_a(mutex);
+#else
+	os_fast_mutex_t*	mutex;
+	os_mutex_t		mutex_str;
+
+	/* The name is only used by the Windows implementation */
+	UT_NOT_USED(name);
+
+	mutex = ut_malloc(sizeof(os_fast_mutex_t));
+
+	os_fast_mutex_init(mutex);
+#endif
+	mutex_str = ut_malloc(sizeof(os_mutex_str_t));
+
+	mutex_str->handle = mutex;
+	mutex_str->count = 0;
+	/* os_event_create() may reserve os_sync_mutex itself, so it is
+	called before os_sync_mutex is reserved below */
+	mutex_str->event = os_event_create(NULL);
+
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		/* When creating os_sync_mutex itself we cannot reserve it */
+		os_mutex_enter(os_sync_mutex);
+	}
+
+	UT_LIST_ADD_FIRST(os_mutex_list, os_mutex_list, mutex_str);
+
+	os_mutex_count++;
+
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		os_mutex_exit(os_sync_mutex);
+	}
+
+	return(mutex_str);
+}
+
+/**********************************************************//**
+Acquires ownership of a mutex semaphore. Blocks until the underlying OS
+mutex has been obtained, then asserts through mutex->count that the mutex
+was not already held (recursive locking is not assumed to be supported). */
+UNIV_INTERN
+void
+os_mutex_enter(
+/*===========*/
+	os_mutex_t	mutex)	/*!< in: mutex to acquire */
+{
+#ifdef __WIN__
+	DWORD	err;
+#endif
+
+	/* Validate the argument on every platform before dereferencing
+	it, as os_mutex_exit() does */
+	ut_a(mutex);
+
+#ifdef __WIN__
+	/* Specify infinite time limit for waiting */
+	err = WaitForSingleObject(mutex->handle, INFINITE);
+
+	ut_a(err == WAIT_OBJECT_0);
+#else
+	os_fast_mutex_lock(mutex->handle);
+#endif
+
+	/* The counter is only changed while the OS mutex is held, so any
+	value other than 1 here means the mutex was locked recursively */
+	(mutex->count)++;
+
+	ut_a(mutex->count == 1);
+}
+
+/**********************************************************//**
+Gives up ownership of a mutex. Asserts that the mutex is held exactly
+once, clears the hold counter and then releases the OS mutex. */
+UNIV_INTERN
+void
+os_mutex_exit(
+/*==========*/
+	os_mutex_t	mutex)	/*!< in: mutex to release */
+{
+	ut_a(mutex);
+	ut_a(mutex->count == 1);
+
+	/* Reset the counter while the OS mutex is still held */
+	mutex->count--;
+
+#ifndef __WIN__
+	os_fast_mutex_unlock(mutex->handle);
+#else
+	ut_a(ReleaseMutex(mutex->handle));
+#endif
+}
+
+/**********************************************************//**
+Frees a mutex object: its embedded event (unless os_sync_free() is
+running), its entry in os_mutex_list, the OS mutex and the memory. */
+UNIV_INTERN
+void
+os_mutex_free(
+/*==========*/
+	os_mutex_t	mutex)	/*!< in: mutex to free */
+{
+	ut_a(mutex);
+
+	/* During os_sync_free() the embedded event has already been freed
+	through os_event_list, so it must not be freed again here. The
+	internal variant is used because it does not reserve
+	os_sync_mutex */
+	if (UNIV_LIKELY(!os_sync_free_called)) {
+		os_event_free_internal(mutex->event);
+	}
+
+	/* os_sync_mutex is not reserved when it is not (or no longer)
+	initialized, e.g. when os_sync_mutex itself is being freed */
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		os_mutex_enter(os_sync_mutex);
+	}
+
+	UT_LIST_REMOVE(os_mutex_list, os_mutex_list, mutex);
+
+	os_mutex_count--;
+
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		os_mutex_exit(os_sync_mutex);
+	}
+
+#ifdef __WIN__
+	ut_a(CloseHandle(mutex->handle));
+
+	ut_free(mutex);
+#else
+	/* On Unix the handle is a separately allocated os_fast_mutex_t
+	(see os_mutex_create()): destroy it, then free both allocations */
+	os_fast_mutex_free(mutex->handle);
+	ut_free(mutex->handle);
+	ut_free(mutex);
+#endif
+}
+
+/*********************************************************//**
+Initializes an operating system fast mutex semaphore and increments
+os_fast_mutex_count. */
+UNIV_INTERN
+void
+os_fast_mutex_init(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in: fast mutex */
+{
+#ifdef __WIN__
+	ut_a(fast_mutex);
+
+	InitializeCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+	ut_a(0 == pthread_mutex_init(fast_mutex, MY_MUTEX_INIT_FAST));
+#endif
+	/* The counter is updated under os_sync_mutex whenever that mutex
+	is available */
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		/* When creating os_sync_mutex itself (in Unix) we cannot
+		reserve it */
+
+		os_mutex_enter(os_sync_mutex);
+	}
+
+	os_fast_mutex_count++;
+
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		os_mutex_exit(os_sync_mutex);
+	}
+}
+
+/**********************************************************//**
+Acquires ownership of a fast mutex: a critical section on Windows, a
+pthread mutex elsewhere. The return value of pthread_mutex_lock() is not
+checked. */
+UNIV_INTERN
+void
+os_fast_mutex_lock(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to acquire */
+{
+#ifndef __WIN__
+	pthread_mutex_lock(fast_mutex);
+#else
+	EnterCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Releases ownership of a fast mutex: a critical section on Windows, a
+pthread mutex elsewhere. The return value of pthread_mutex_unlock() is
+not checked. */
+UNIV_INTERN
+void
+os_fast_mutex_unlock(
+/*=================*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to release */
+{
+#ifdef __WIN__
+	/* Same explicit cast as in os_fast_mutex_init(), _lock() and
+	_free(), so that all four calls treat the handle alike */
+	LeaveCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+	pthread_mutex_unlock(fast_mutex);
+#endif
+}
+
+/**********************************************************//**
+Frees a fast mutex object: destroys the OS mutex and decrements
+os_fast_mutex_count. The memory of the os_fast_mutex_t itself is not
+released here. On Unix a failing pthread_mutex_destroy() is reported on
+stderr but is not treated as fatal. */
+UNIV_INTERN
+void
+os_fast_mutex_free(
+/*===============*/
+	os_fast_mutex_t*	fast_mutex)	/*!< in: mutex to free */
+{
+#ifdef __WIN__
+	ut_a(fast_mutex);
+
+	DeleteCriticalSection((LPCRITICAL_SECTION) fast_mutex);
+#else
+	int	ret;
+
+	ret = pthread_mutex_destroy(fast_mutex);
+
+	if (UNIV_UNLIKELY(ret != 0)) {
+		/* Print the error code and a dump of the mutex bytes to
+		help diagnosing the failure, then carry on */
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: error: return value %lu when calling\n"
+			"InnoDB: pthread_mutex_destroy().\n", (ulint)ret);
+		fprintf(stderr,
+			"InnoDB: Byte contents of the pthread mutex at %p:\n",
+			(void*) fast_mutex);
+		ut_print_buf(stderr, fast_mutex, sizeof(os_fast_mutex_t));
+		putc('\n', stderr);
+	}
+#endif
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		/* When freeing the last mutexes, we have
+		already freed os_sync_mutex */
+
+		os_mutex_enter(os_sync_mutex);
+	}
+
+	ut_ad(os_fast_mutex_count > 0);
+	os_fast_mutex_count--;
+
+	if (UNIV_LIKELY(os_sync_mutex_inited)) {
+		os_mutex_exit(os_sync_mutex);
+	}
+}
diff --git a/storage/xtradb/os/os0thread.c b/storage/xtradb/os/os0thread.c
new file mode 100644
index 00000000000..34818ada804
--- /dev/null
+++ b/storage/xtradb/os/os0thread.c
@@ -0,0 +1,375 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file os/os0thread.c
+The interface to the operating system thread control primitives
+
+Created 9/8/1995 Heikki Tuuri
+*******************************************************/
+
+#include "os0thread.h"
+#ifdef UNIV_NONINL
+#include "os0thread.ic"
+#endif
+
+#ifdef __WIN__
+#include <windows.h>
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "srv0srv.h"
+#include "os0sync.h"
+
+/***************************************************************//**
+Tests whether two thread ids denote the same thread.
+@return	TRUE if the ids are equal, FALSE otherwise */
+UNIV_INTERN
+ibool
+os_thread_eq(
+/*=========*/
+	os_thread_id_t	a,	/*!< in: OS thread or thread id */
+	os_thread_id_t	b)	/*!< in: OS thread or thread id */
+{
+	ibool	is_same;
+
+#ifndef __WIN__
+	/* The ids are handed to pthread_equal() rather than being
+	compared with ==  */
+
+	is_same = pthread_equal(a, b) ? TRUE : FALSE;
+#else /* __WIN__ */
+	/* On Windows the ids are compared directly */
+
+	is_same = (a == b) ? TRUE : FALSE;
+#endif /* !__WIN__ */
+	return(is_same);
+}
+
+/****************************************************************//**
+Maps an OS thread id to a ulint, for example for diagnostic printouts.
+Distinct threads are NOT guaranteed to map to distinct numbers!
+@return	thread identifier as a number */
+UNIV_INTERN
+ulint
+os_thread_pf(
+/*=========*/
+	os_thread_id_t	a)	/*!< in: OS thread identifier */
+{
+#ifndef UNIV_HPUX10
+	return((ulint)a);
+#else /* UNIV_HPUX10 */
+	/* In HP-UX-10.20 a pthread_t is a struct of 3 fields: field1, field2,
+	field3. We do not know if field1 determines the thread uniquely. */
+
+	return((ulint)(a.field1));
+#endif /* !UNIV_HPUX10 */
+}
+
+/*****************************************************************//**
+Gets the identifier of the calling thread. On non-Windows builds this is
+the value of pthread_self(), i.e. the thread handle itself. Note that in
+HP-UX pthread_t is a struct of 3 fields.
+@return	current thread identifier */
+UNIV_INTERN
+os_thread_id_t
+os_thread_get_curr_id(void)
+/*=======================*/
+{
+#ifndef __WIN__
+	return(pthread_self());
+#else /* __WIN__ */
+	return(GetCurrentThreadId());
+#endif /* !__WIN__ */
+}
+
+/****************************************************************//**
+Creates a new thread of execution which starts in start_f(arg), and counts
+it in os_thread_count. On non-Windows builds an error from pthread_create()
+(or pthread_attr_setstacksize(), where used) is printed and ends in exit(1).
+@return	handle to the thread */
+UNIV_INTERN
+os_thread_t
+os_thread_create(
+/*=============*/
+#ifndef __WIN__
+	os_posix_f_t		start_f,	/*!< in: thread start function */
+#else
+	ulint (*start_f)(void*),		/*!< in: pointer to function
+						from which to start */
+#endif
+	void*			arg,		/*!< in: argument to start
+						function */
+	os_thread_id_t*		thread_id)	/*!< out: id of the created
+						thread, or NULL */
+{
+#ifdef __WIN__
+	os_thread_t	thread;
+	DWORD		win_thread_id;
+
+	os_mutex_enter(os_sync_mutex);
+	os_thread_count++;	/* counted before the thread exists */
+	os_mutex_exit(os_sync_mutex);
+
+	thread = CreateThread(NULL,	/* no security attributes */
+			      0,	/* default size stack */
+			      (LPTHREAD_START_ROUTINE)start_f,
+			      arg,
+			      0,	/* thread runs immediately */
+			      &win_thread_id);	/* result is not checked */
+
+	if (srv_set_thread_priorities) {
+
+		/* Set created thread priority the same as a normal query
+		in MYSQL: we try to prevent starvation of threads by
+		assigning same priority QUERY_PRIOR to all */
+
+		ut_a(SetThreadPriority(thread, srv_query_thread_priority));
+	}
+
+	if (thread_id) {
+		*thread_id = win_thread_id;
+	}
+
+	return(thread);
+#else
+	int		ret;
+	os_thread_t	pthread;
+	pthread_attr_t	attr;
+
+#ifndef UNIV_HPUX10
+	pthread_attr_init(&attr);	/* return value is not checked */
+#endif
+
+#ifdef UNIV_AIX
+	/* We must make sure a thread stack is at least 32 kB, otherwise
+	InnoDB might crash; we do not know if the default stack size on
+	AIX is always big enough. An empirical test on AIX-4.3 suggested
+	the size was 96 kB, though. */
+
+	ret = pthread_attr_setstacksize(&attr,
+					(size_t)(PTHREAD_STACK_MIN
+						 + 32 * 1024));
+	if (ret) {
+		fprintf(stderr,
+			"InnoDB: Error: pthread_attr_setstacksize"
+			" returned %d\n", ret);
+		exit(1);
+	}
+#endif
+#ifdef __NETWARE__
+	ret = pthread_attr_setstacksize(&attr,
+					(size_t) NW_THD_STACKSIZE);
+	if (ret) {
+		fprintf(stderr,
+			"InnoDB: Error: pthread_attr_setstacksize"
+			" returned %d\n", ret);
+		exit(1);
+	}
+#endif
+	os_mutex_enter(os_sync_mutex);
+	os_thread_count++;	/* counted before the thread exists */
+	os_mutex_exit(os_sync_mutex);
+
+#ifdef UNIV_HPUX10
+	ret = pthread_create(&pthread, pthread_attr_default, start_f, arg);
+#else
+	ret = pthread_create(&pthread, &attr, start_f, arg);
+#endif
+	if (ret) {
+		fprintf(stderr,
+			"InnoDB: Error: pthread_create returned %d\n", ret);
+		exit(1);
+	}
+
+#ifndef UNIV_HPUX10
+	pthread_attr_destroy(&attr);	/* pairs with pthread_attr_init() */
+#endif
+	if (srv_set_thread_priorities) {
+
+		my_pthread_setprio(pthread, srv_query_thread_priority);
+	}
+
+	if (thread_id) {
+		*thread_id = pthread;	/* here the id is the handle itself */
+	}
+
+	return(pthread);
+#endif
+}
+
+/*****************************************************************//**
+Ends the calling thread after removing it from os_thread_count. */
+UNIV_INTERN
+void
+os_thread_exit(
+/*===========*/
+	void*	exit_value)	/*!< in: exit value; in Windows this void*
+				is cast as a DWORD */
+{
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Thread exits, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif /* UNIV_DEBUG_THREAD_CREATION */
+	os_mutex_enter(os_sync_mutex);
+	os_thread_count--;
+	os_mutex_exit(os_sync_mutex);
+
+#ifndef __WIN__
+	pthread_detach(pthread_self());
+	pthread_exit(exit_value);
+#else /* __WIN__ */
+	ExitThread((DWORD)exit_value);
+#endif /* !__WIN__ */
+}
+
+/*****************************************************************//**
+Gets the handle of the calling thread.
+@return	current thread handle */
+UNIV_INTERN
+os_thread_t
+os_thread_get_curr(void)
+/*====================*/
+{
+#ifndef __WIN__
+	return(pthread_self());
+#else /* __WIN__ */
+	return(GetCurrentThread());
+#endif /* !__WIN__ */
+}
+
+/*****************************************************************//**
+Advises the OS to give up the remainder of the calling thread's time slice. */
+UNIV_INTERN
+void
+os_thread_yield(void)
+/*=================*/
+{
+#if defined(__WIN__)
+	Sleep(0);
+#elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H))
+	sched_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG)
+	pthread_yield();
+#elif defined(HAVE_PTHREAD_YIELD_ONE_ARG)
+	pthread_yield(0);
+#else
+	os_thread_sleep(0);	/* no yield call configured: sleep 0 us */
+#endif
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*****************************************************************//**
+Puts the calling thread to sleep for the time given in microseconds. */
+UNIV_INTERN
+void
+os_thread_sleep(
+/*============*/
+	ulint	tm)	/*!< in: time in microseconds */
+{
+#ifdef __WIN__
+	/* Sleep() takes milliseconds; divide first, then narrow to DWORD */
+	Sleep((DWORD) (tm / 1000));
+#elif defined(__NETWARE__)
+	delay(tm / 1000);	/* whole milliseconds */
+#else
+	struct timeval	t;
+
+	t.tv_sec = tm / 1000000;
+	t.tv_usec = tm % 1000000;
+	select(0, NULL, NULL, NULL, &t);	/* no fds: used as a timer */
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/******************************************************************//**
+Sets the priority of a thread. Does nothing except on Windows. */
+UNIV_INTERN
+void
+os_thread_set_priority(
+/*===================*/
+	os_thread_t	handle,	/*!< in: OS handle to the thread */
+	ulint		pri)	/*!< in: priority */
+{
+#ifndef __WIN__
+	UT_NOT_USED(handle);
+	UT_NOT_USED(pri);
+#else /* __WIN__ */
+	int	win_pri;
+
+	if (pri == OS_THREAD_PRIORITY_BACKGROUND) {
+		win_pri = THREAD_PRIORITY_BELOW_NORMAL;
+	} else if (pri == OS_THREAD_PRIORITY_NORMAL) {
+		win_pri = THREAD_PRIORITY_NORMAL;
+	} else if (pri == OS_THREAD_PRIORITY_ABOVE_NORMAL) {
+		win_pri = THREAD_PRIORITY_HIGHEST;
+	} else {
+		ut_error;
+	}
+
+	ut_a(SetThreadPriority(handle, win_pri));
+#endif /* !__WIN__ */
+}
+
+/******************************************************************//**
+Reads the priority of a thread. Only Windows builds query the OS.
+@return	priority; always 0 on non-Windows builds */
+UNIV_INTERN
+ulint
+os_thread_get_priority(
+/*===================*/
+	os_thread_t	handle __attribute__((unused)))
+				/*!< in: OS handle to the thread */
+{
+#ifndef __WIN__
+	return(0);
+#else /* __WIN__ */
+	ulint	pri;
+
+	switch (GetThreadPriority(handle)) {
+	case THREAD_PRIORITY_BELOW_NORMAL:
+		pri = OS_THREAD_PRIORITY_BACKGROUND;
+		break;
+	case THREAD_PRIORITY_NORMAL:
+		pri = OS_THREAD_PRIORITY_NORMAL;
+		break;
+	case THREAD_PRIORITY_HIGHEST:
+		pri = OS_THREAD_PRIORITY_ABOVE_NORMAL;
+		break;
+	default:
+		ut_error;
+	}
+	return(pri);
+#endif /* !__WIN__ */
+}
+
+/******************************************************************//**
+Fetches the last operating system error code of the calling thread.
+@return	last error on Windows, 0 otherwise */
+UNIV_INTERN
+ulint
+os_thread_get_last_error(void)
+/*==========================*/
+{
+#ifndef __WIN__
+	return(0);
+#else /* __WIN__ */
+	return(GetLastError());
+#endif /* !__WIN__ */
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/page/page0cur.c b/storage/xtradb/page/page0cur.c
new file mode 100644
index 00000000000..fa3d2532deb
--- /dev/null
+++ b/storage/xtradb/page/page0cur.c
@@ -0,0 +1,2055 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file page/page0cur.c
+The page cursor
+
+Created 10/4/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "page0cur.h"
+#ifdef UNIV_NONINL
+#include "page0cur.ic"
+#endif
+
+#include "page0zip.h"
+#include "mtr0log.h"
+#include "log0recv.h"
+#include "ut0ut.h"
+#ifndef UNIV_HOTBACKUP
+#include "rem0cmp.h"
+
+#ifdef PAGE_CUR_ADAPT
+# ifdef UNIV_SEARCH_PERF_STAT
+static ulint	page_cur_short_succ	= 0;
+# endif /* UNIV_SEARCH_PERF_STAT */
+
+/*******************************************************************//**
+Linear congruential pseudo-random number generator. Each call advances
+the state once and returns it, a value between 0 and 2^64-1 inclusive:
+X[n+1] = (a * X[n] + c) mod m
+where:
+X[0] = ut_time_us(NULL)
+a = 1103515245 (3^5 * 5 * 7 * 129749)
+c = 12345 (3 * 5 * 823)
+m = 18446744073709551616 (2^64)
+The state lives in function-local statics that are updated without locking.
+
+@return	number between 0 and 2^64-1 */
+static
+ib_uint64_t
+page_cur_lcg_prng(void)
+/*===================*/
+{
+#define LCG_a	1103515245
+#define LCG_c	12345
+	static ibool		seeded	= FALSE;
+	static ib_uint64_t	state	= 0;
+
+	if (seeded == FALSE) {
+		state = (ib_uint64_t) ut_time_us(NULL);
+		seeded = TRUE;
+	}
+
+	/* the arithmetic wraps around in the 64-bit type, which
+	already is the reduction modulo 2^64 */
+	state = LCG_c + LCG_a * state;
+
+	return(state);
+}
+
+/****************************************************************//**
+Tries to position the cursor directly on the last inserted record of the page.
+@return	TRUE on success; cursor and match counts are set only then */
+UNIV_INLINE
+ibool
+page_cur_try_search_shortcut(
+/*=========================*/
+	const buf_block_t*	block,	/*!< in: index page */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	ulint*			iup_matched_fields,
+					/*!< in/out: already matched
+					fields in upper limit record */
+	ulint*			iup_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	ulint*			ilow_matched_fields,
+					/*!< in/out: already matched
+					fields in lower limit record */
+	ulint*			ilow_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	page_cur_t*		cursor) /*!< out: page cursor */
+{
+	const rec_t*	rec;
+	const rec_t*	next_rec;
+	ulint		low_match;
+	ulint		low_bytes;
+	ulint		up_match;
+	ulint		up_bytes;
+#ifdef UNIV_SEARCH_DEBUG
+	page_cur_t	cursor2;
+#endif
+	ibool		success		= FALSE;
+	const page_t*	page		= buf_block_get_frame(block);
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(dtuple_check_typed(tuple));
+
+	rec = page_header_get_ptr(page, PAGE_LAST_INSERT);
+	ut_ad(rec);	/* checked before rec is first used below */
+	ut_ad(page_rec_is_user_rec(rec));
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  dtuple_get_n_fields(tuple), &heap);
+
+	ut_pair_min(&low_match, &low_bytes,
+		    *ilow_matched_fields, *ilow_matched_bytes,
+		    *iup_matched_fields, *iup_matched_bytes);
+
+	up_match = low_match;
+	up_bytes = low_bytes;
+
+	if (page_cmp_dtuple_rec_with_match(tuple, rec, offsets,
+					   &low_match, &low_bytes) < 0) {
+		goto exit_func;	/* tuple is below the last insert */
+	}
+
+	next_rec = page_rec_get_next_const(rec);
+	offsets = rec_get_offsets(next_rec, index, offsets,
+				  dtuple_get_n_fields(tuple), &heap);
+
+	if (page_cmp_dtuple_rec_with_match(tuple, next_rec, offsets,
+					   &up_match, &up_bytes) >= 0) {
+		goto exit_func;	/* tuple is not below the successor */
+	}
+
+	page_cur_position(rec, block, cursor);
+
+#ifdef UNIV_SEARCH_DEBUG
+	page_cur_search_with_match(block, index, tuple, PAGE_CUR_DBG,
+				   iup_matched_fields,
+				   iup_matched_bytes,
+				   ilow_matched_fields,
+				   ilow_matched_bytes,
+				   &cursor2);
+	ut_a(cursor2.rec == cursor->rec);
+
+	if (!page_rec_is_supremum(next_rec)) {
+
+		ut_a(*iup_matched_fields == up_match);
+		ut_a(*iup_matched_bytes == up_bytes);
+	}
+
+	ut_a(*ilow_matched_fields == low_match);
+	ut_a(*ilow_matched_bytes == low_bytes);
+#endif
+	if (!page_rec_is_supremum(next_rec)) {
+
+		*iup_matched_fields = up_match;
+		*iup_matched_bytes = up_bytes;
+	}
+
+	*ilow_matched_fields = low_match;
+	*ilow_matched_bytes = low_bytes;
+
+#ifdef UNIV_SEARCH_PERF_STAT
+	page_cur_short_succ++;
+#endif
+	success = TRUE;
+exit_func:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);	/* offsets did not fit in offsets_ */
+	}
+	return(success);
+}
+
+#endif
+
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+/****************************************************************//**
+Tests whether field n of rec extends field n of tuple: the field is of a
+character or binary type, neither value is SQL NULL, rec's field is at least
+as long as tuple's, and it has the same first bytes as tuple's field.
+@return	TRUE if rec field extends tuple field */
+static
+ibool
+page_cur_rec_field_extends(
+/*=======================*/
+	const dtuple_t*	tuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: record */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		n)	/*!< in: compare nth field */
+{
+	const dfield_t*	tuple_f;
+	const dtype_t*	dtype;
+	const byte*	rec_data;
+	ulint		rec_len;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	tuple_f = dtuple_get_nth_field(tuple, n);
+	dtype = dfield_get_type(tuple_f);
+	rec_data = rec_get_nth_field(rec, offsets, n, &rec_len);
+
+	/* Fields of any other main type never extend */
+	if (dtype->mtype != DATA_VARCHAR
+	    && dtype->mtype != DATA_CHAR
+	    && dtype->mtype != DATA_FIXBINARY
+	    && dtype->mtype != DATA_BINARY
+	    && dtype->mtype != DATA_BLOB
+	    && dtype->mtype != DATA_VARMYSQL
+	    && dtype->mtype != DATA_MYSQL) {
+		return(FALSE);
+	}
+
+	if (dfield_get_len(tuple_f) == UNIV_SQL_NULL
+	    || rec_len == UNIV_SQL_NULL
+	    || rec_len < dfield_get_len(tuple_f)) {
+		return(FALSE);
+	}
+
+	/* Compare just the first dfield_get_len(tuple_f) bytes */
+	return(cmp_data_data_slow(dtype->mtype, dtype->prtype,
+				  dfield_get_data(tuple_f),
+				  dfield_get_len(tuple_f),
+				  rec_data, dfield_get_len(tuple_f))
+	       ? FALSE : TRUE);
+}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+/****************************************************************//**
+Positions a page cursor on the record selected by tuple and search mode. */
+UNIV_INTERN
+void
+page_cur_search_with_match(
+/*=======================*/
+	const buf_block_t*	block,	/*!< in: buffer block */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		tuple,	/*!< in: data tuple */
+	ulint			mode,	/*!< in: PAGE_CUR_L,
+					PAGE_CUR_LE, PAGE_CUR_G, or
+					PAGE_CUR_GE */
+	ulint*			iup_matched_fields,
+					/*!< in/out: already matched
+					fields in upper limit record */
+	ulint*			iup_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	ulint*			ilow_matched_fields,
+					/*!< in/out: already matched
+					fields in lower limit record */
+	ulint*			ilow_matched_bytes,
+					/*!< in/out: already matched
+					bytes in a field not yet
+					completely matched */
+	page_cur_t*		cursor)	/*!< out: page cursor */
+{
+	ulint		up;
+	ulint		low;
+	ulint		mid;
+	const page_t*	page;
+	const page_dir_slot_t* slot;
+	const rec_t*	up_rec;
+	const rec_t*	low_rec;
+	const rec_t*	mid_rec;
+	ulint		up_matched_fields;
+	ulint		up_matched_bytes;
+	ulint		low_matched_fields;
+	ulint		low_matched_bytes;
+	ulint		cur_matched_fields;
+	ulint		cur_matched_bytes;
+	int		cmp;
+#ifdef UNIV_SEARCH_DEBUG
+	int		dbg_cmp;
+	ulint		dbg_matched_fields;
+	ulint		dbg_matched_bytes;
+#endif
+#ifdef UNIV_ZIP_DEBUG
+	const page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
+#endif /* UNIV_ZIP_DEBUG */
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(block && tuple && iup_matched_fields && iup_matched_bytes
+	      && ilow_matched_fields && ilow_matched_bytes && cursor);
+	ut_ad(dtuple_validate(tuple));
+#ifdef UNIV_DEBUG
+# ifdef PAGE_CUR_DBG
+	if (mode != PAGE_CUR_DBG)
+# endif /* PAGE_CUR_DBG */
+# ifdef PAGE_CUR_LE_OR_EXTENDS
+		if (mode != PAGE_CUR_LE_OR_EXTENDS)
+# endif /* PAGE_CUR_LE_OR_EXTENDS */
+			ut_ad(mode == PAGE_CUR_L || mode == PAGE_CUR_LE
+			      || mode == PAGE_CUR_G || mode == PAGE_CUR_GE);
+#endif /* UNIV_DEBUG */
+	page = buf_block_get_frame(block);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	page_check_dir(page);
+
+#ifdef PAGE_CUR_ADAPT
+	if (page_is_leaf(page)
+	    && (mode == PAGE_CUR_LE)
+	    && (page_header_get_field(page, PAGE_N_DIRECTION) > 3)
+	    && (page_header_get_ptr(page, PAGE_LAST_INSERT))
+	    && (page_header_get_field(page, PAGE_DIRECTION) == PAGE_RIGHT)) {
+
+		if (page_cur_try_search_shortcut(
+			    block, index, tuple,
+			    iup_matched_fields, iup_matched_bytes,
+			    ilow_matched_fields, ilow_matched_bytes,
+			    cursor)) {
+			return;	/* the shortcut positioned the cursor */
+		}
+	}
+# ifdef PAGE_CUR_DBG
+	if (mode == PAGE_CUR_DBG) {
+		mode = PAGE_CUR_LE;	/* search as LE from here on */
+	}
+# endif
+#endif
+
+	/* The following flag does not work for non-latin1 char sets because
+	cmp_full_field does not tell how many bytes matched */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+	ut_a(mode != PAGE_CUR_LE_OR_EXTENDS);
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+
+	/* If mode PAGE_CUR_G is specified, we are trying to position the
+	cursor to answer a query of the form "tuple < X", where tuple is
+	the input parameter, and X denotes an arbitrary physical record on
+	the page. We want to position the cursor on the first X which
+	satisfies the condition. */
+	/* Local copies; they are stored back just before returning */
+	up_matched_fields  = *iup_matched_fields;
+	up_matched_bytes   = *iup_matched_bytes;
+	low_matched_fields = *ilow_matched_fields;
+	low_matched_bytes  = *ilow_matched_bytes;
+
+	/* Perform binary search. First the search is done through the page
+	directory, after that as a linear search in the list of records
+	owned by the upper limit directory slot. */
+
+	low = 0;
+	up = page_dir_get_n_slots(page) - 1;
+
+	/* Perform binary search until the lower and upper limit directory
+	slots come to the distance 1 of each other */
+
+	while (up - low > 1) {
+		mid = (low + up) / 2;
+		slot = page_dir_get_nth_slot(page, mid);
+		mid_rec = page_dir_slot_get_rec(slot);
+
+		ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+			    low_matched_fields, low_matched_bytes,
+			    up_matched_fields, up_matched_bytes);
+
+		offsets = rec_get_offsets(mid_rec, index, offsets,
+					  dtuple_get_n_fields_cmp(tuple),
+					  &heap);
+
+		cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+						&cur_matched_fields,
+						&cur_matched_bytes);
+		if (UNIV_LIKELY(cmp > 0)) {
+low_slot_match:
+			low = mid;
+			low_matched_fields = cur_matched_fields;
+			low_matched_bytes = cur_matched_bytes;
+
+		} else if (UNIV_EXPECT(cmp, -1)) { /* presumably: cmp < 0 */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+			if (mode == PAGE_CUR_LE_OR_EXTENDS
+			    && page_cur_rec_field_extends(
+				    tuple, mid_rec, offsets,
+				    cur_matched_fields)) {
+
+				goto low_slot_match;
+			}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_slot_match:
+			up = mid;
+			up_matched_fields = cur_matched_fields;
+			up_matched_bytes = cur_matched_bytes;
+
+		} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+			   || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+			   ) {
+
+			goto low_slot_match;	/* equal: move lower limit */
+		} else {
+
+			goto up_slot_match;	/* equal: move upper limit */
+		}
+	}
+
+	slot = page_dir_get_nth_slot(page, low);
+	low_rec = page_dir_slot_get_rec(slot);
+	slot = page_dir_get_nth_slot(page, up);
+	up_rec = page_dir_slot_get_rec(slot);
+
+	/* Perform linear search until the upper and lower records come to
+	distance 1 of each other. */
+
+	while (page_rec_get_next_const(low_rec) != up_rec) {
+
+		mid_rec = page_rec_get_next_const(low_rec);
+
+		ut_pair_min(&cur_matched_fields, &cur_matched_bytes,
+			    low_matched_fields, low_matched_bytes,
+			    up_matched_fields, up_matched_bytes);
+
+		offsets = rec_get_offsets(mid_rec, index, offsets,
+					  dtuple_get_n_fields_cmp(tuple),
+					  &heap);
+
+		cmp = cmp_dtuple_rec_with_match(tuple, mid_rec, offsets,
+						&cur_matched_fields,
+						&cur_matched_bytes);
+		if (UNIV_LIKELY(cmp > 0)) {
+low_rec_match:
+			low_rec = mid_rec;
+			low_matched_fields = cur_matched_fields;
+			low_matched_bytes = cur_matched_bytes;
+
+		} else if (UNIV_EXPECT(cmp, -1)) { /* presumably: cmp < 0 */
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+			if (mode == PAGE_CUR_LE_OR_EXTENDS
+			    && page_cur_rec_field_extends(
+				    tuple, mid_rec, offsets,
+				    cur_matched_fields)) {
+
+				goto low_rec_match;
+			}
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+up_rec_match:
+			up_rec = mid_rec;
+			up_matched_fields = cur_matched_fields;
+			up_matched_bytes = cur_matched_bytes;
+		} else if (mode == PAGE_CUR_G || mode == PAGE_CUR_LE
+#ifdef PAGE_CUR_LE_OR_EXTENDS
+			   || mode == PAGE_CUR_LE_OR_EXTENDS
+#endif /* PAGE_CUR_LE_OR_EXTENDS */
+			   ) {
+
+			goto low_rec_match;	/* equal: move lower limit */
+		} else {
+
+			goto up_rec_match;	/* equal: move upper limit */
+		}
+	}
+
+#ifdef UNIV_SEARCH_DEBUG
+
+	/* Check that the lower and upper limit records have the
+	right alphabetical order compared to tuple. */
+	dbg_matched_fields = 0;
+	dbg_matched_bytes = 0;
+
+	offsets = rec_get_offsets(low_rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, low_rec, offsets,
+						 &dbg_matched_fields,
+						 &dbg_matched_bytes);
+	if (mode == PAGE_CUR_G) {
+		ut_a(dbg_cmp >= 0);
+	} else if (mode == PAGE_CUR_GE) {
+		ut_a(dbg_cmp == 1);
+	} else if (mode == PAGE_CUR_L) {
+		ut_a(dbg_cmp == 1);
+	} else if (mode == PAGE_CUR_LE) {
+		ut_a(dbg_cmp >= 0);
+	}
+
+	if (!page_rec_is_infimum(low_rec)) {
+
+		ut_a(low_matched_fields == dbg_matched_fields);
+		ut_a(low_matched_bytes == dbg_matched_bytes);
+	}
+
+	dbg_matched_fields = 0;
+	dbg_matched_bytes = 0;
+
+	offsets = rec_get_offsets(up_rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	dbg_cmp = page_cmp_dtuple_rec_with_match(tuple, up_rec, offsets,
+						 &dbg_matched_fields,
+						 &dbg_matched_bytes);
+	if (mode == PAGE_CUR_G) {
+		ut_a(dbg_cmp == -1);
+	} else if (mode == PAGE_CUR_GE) {
+		ut_a(dbg_cmp <= 0);
+	} else if (mode == PAGE_CUR_L) {
+		ut_a(dbg_cmp <= 0);
+	} else if (mode == PAGE_CUR_LE) {
+		ut_a(dbg_cmp == -1);
+	}
+
+	if (!page_rec_is_supremum(up_rec)) {
+
+		ut_a(up_matched_fields == dbg_matched_fields);
+		ut_a(up_matched_bytes == dbg_matched_bytes);
+	}
+#endif
+	if (mode <= PAGE_CUR_GE) { /* NOTE(review): uses mode numbering */
+		page_cur_position(up_rec, block, cursor);
+	} else {
+		page_cur_position(low_rec, block, cursor);
+	}
+
+	*iup_matched_fields  = up_matched_fields;
+	*iup_matched_bytes   = up_matched_bytes;
+	*ilow_matched_fields = low_matched_fields;
+	*ilow_matched_bytes  = low_matched_bytes;
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);	/* offsets did not fit in offsets_ */
+	}
+}
+
+/***********************************************************//**
+Places a page cursor on a pseudo-randomly picked user record of a page. If
+the page has no user records, the cursor is set on the infimum record. */
+UNIV_INTERN
+void
+page_cur_open_on_rnd_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor)	/*!< out: page cursor */
+{
+	ulint	n_steps;
+	ulint	n_user_recs = page_get_n_recs(buf_block_get_frame(block));
+
+	page_cur_set_before_first(block, cursor);
+
+	if (UNIV_UNLIKELY(n_user_recs == 0)) {
+		return;
+	}
+
+	/* k + 1 steps from before the first record reach user record k */
+	n_steps = 1 + (ulint) (page_cur_lcg_prng() % n_user_recs);
+
+	for (; n_steps > 0; n_steps--) {
+		page_cur_move_to_next(cursor);
+	}
+}
+
+/***********************************************************//**
+Positions a page cursor on the nth user record of a page. If there are
+no user records, the cursor is left before the first record. */
+UNIV_INTERN
+void
+page_cur_open_on_nth_user_rec(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor,	/*!< out: page cursor */
+	ulint		nth)	/*!< in: 1-based ordinal of the user record;
+				0 or a too large value selects the last one */
+{
+	ulint	n_steps;
+	ulint	n_user_recs = page_get_n_recs(buf_block_get_frame(block));
+
+	page_cur_set_before_first(block, cursor);
+
+	if (UNIV_UNLIKELY(n_user_recs == 0)) {
+		return;
+	}
+
+	/* nth steps forward, at most n_user_recs; nth == 0 wraps in nth - 1 */
+	n_steps = (nth - 1 < n_user_recs) ? nth : n_user_recs;
+	for (; n_steps > 0; n_steps--) {
+		page_cur_move_to_next(cursor);
+	}
+}
+
+/***********************************************************//**
+Positions a page cursor on a pseudo-randomly chosen user record of a page,
+picked among the nth user record and the ones after it. If there are no
+user records, the cursor is left before the first record.
+@return	TRUE if the nth record itself was chosen, FALSE otherwise */
+UNIV_INTERN
+ibool
+page_cur_open_on_rnd_user_rec_after_nth(
+/*==========================*/
+	buf_block_t*	block,	/*!< in: page */
+	page_cur_t*	cursor,	/*!< out: page cursor */
+	ulint		nth)	/*!< in: 1-based ordinal of the first
+				candidate; clamped to the last user record */
+{
+	ulint	first;
+	ulint	chosen;
+	ulint	n_steps;
+	ulint	n_user_recs = page_get_n_recs(buf_block_get_frame(block));
+
+	page_cur_set_before_first(block, cursor);
+
+	if (UNIV_UNLIKELY(n_user_recs == 0)) {
+
+		return (FALSE);
+	}
+
+	/* 0-based position of the first candidate; nth == 0 wraps around
+	in nth - 1 and is clamped like any too large value */
+	first = (nth - 1 < n_user_recs) ? nth - 1 : n_user_recs - 1;
+
+	/* pick one of the positions first .. n_user_recs - 1 */
+	chosen = (ulint) (first + page_cur_lcg_prng() % (n_user_recs - first));
+
+	for (n_steps = chosen + 1; n_steps > 0; n_steps--) {
+		page_cur_move_to_next(cursor);
+	}
+
+	return (chosen == first ? TRUE : FALSE);
+}
+
+/***********************************************************//**
+Writes to the mtr log the redo record of a record insert on a page. */
+static
+void
+page_cur_insert_rec_write_log(
+/*==========================*/
+	rec_t*		insert_rec,	/*!< in: inserted physical record */
+	ulint		rec_size,	/*!< in: insert_rec size */
+	rec_t*		cursor_rec,	/*!< in: record the
+					cursor is pointing to */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+{
+	ulint	cur_rec_size;
+	ulint	extra_size;
+	ulint	cur_extra_size;
+	const byte* ins_ptr;
+	byte*	log_ptr;
+	const byte* log_end;
+	ulint	i;
+
+	ut_a(rec_size < UNIV_PAGE_SIZE);
+	ut_ad(page_align(insert_rec) == page_align(cursor_rec));
+	ut_ad(!page_rec_is_comp(insert_rec)
+	      == !dict_table_is_comp(index->table));
+
+	{	/* scope for the offsets arrays, needed only for the sizes */
+		mem_heap_t*	heap		= NULL;
+		ulint		cur_offs_[REC_OFFS_NORMAL_SIZE];
+		ulint		ins_offs_[REC_OFFS_NORMAL_SIZE];
+
+		ulint*		cur_offs;
+		ulint*		ins_offs;
+
+		rec_offs_init(cur_offs_);
+		rec_offs_init(ins_offs_);
+
+		cur_offs = rec_get_offsets(cursor_rec, index, cur_offs_,
+					   ULINT_UNDEFINED, &heap);
+		ins_offs = rec_get_offsets(insert_rec, index, ins_offs_,
+					   ULINT_UNDEFINED, &heap);
+
+		extra_size = rec_offs_extra_size(ins_offs);
+		cur_extra_size = rec_offs_extra_size(cur_offs);
+		ut_ad(rec_size == rec_offs_size(ins_offs));
+		cur_rec_size = rec_offs_size(cur_offs);
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+
+	ins_ptr = insert_rec - extra_size;	/* first byte of insert_rec */
+
+	i = 0;	/* count of leading bytes equal to those of cursor_rec */
+
+	if (cur_extra_size == extra_size) {
+		ulint		min_rec_size = ut_min(cur_rec_size, rec_size);
+
+		const byte*	cur_ptr = cursor_rec - cur_extra_size;
+
+		/* Find out the first byte in insert_rec which differs from
+		cursor_rec; skip the bytes in the record info */
+
+		do {
+			if (*ins_ptr == *cur_ptr) {
+				i++;
+				ins_ptr++;
+				cur_ptr++;
+			} else if ((i < extra_size)
+				   && (i >= extra_size
+				       - page_rec_get_base_extra_size
+				       (insert_rec))) {
+				i = extra_size;	/* jump to the record origin */
+				ins_ptr = insert_rec;
+				cur_ptr = cursor_rec;
+			} else {
+				break;
+			}
+		} while (i < min_rec_size);
+	}
+
+	if (mtr_get_log_mode(mtr) != MTR_LOG_SHORT_INSERTS) {
+
+		if (page_rec_is_comp(insert_rec)) {
+			log_ptr = mlog_open_and_write_index(
+				mtr, insert_rec, index, MLOG_COMP_REC_INSERT,
+				2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+			if (UNIV_UNLIKELY(!log_ptr)) {
+				/* Logging in mtr is switched off
+				during crash recovery: in that case
+				mlog_open returns NULL */
+				return;
+			}
+		} else {
+			log_ptr = mlog_open(mtr, 11
+					    + 2 + 5 + 1 + 5 + 5
+					    + MLOG_BUF_MARGIN);
+			if (UNIV_UNLIKELY(!log_ptr)) {
+				/* Logging in mtr is switched off
+				during crash recovery: in that case
+				mlog_open returns NULL */
+				return;
+			}
+
+			log_ptr = mlog_write_initial_log_record_fast(
+				insert_rec, MLOG_REC_INSERT, log_ptr, mtr);
+		}
+
+		log_end = &log_ptr[2 + 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+		/* Write the cursor rec offset as a 2-byte ulint */
+		mach_write_to_2(log_ptr, page_offset(cursor_rec));
+		log_ptr += 2;
+	} else {
+		/* Short inserts: the cursor rec offset is not written */
+		log_ptr = mlog_open(mtr, 5 + 1 + 5 + 5 + MLOG_BUF_MARGIN);
+		if (!log_ptr) {
+			/* Logging in mtr is switched off during crash
+			recovery: in that case mlog_open returns NULL */
+			return;
+		}
+		log_end = &log_ptr[5 + 1 + 5 + 5 + MLOG_BUF_MARGIN];
+	}
+
+	if (page_rec_is_comp(insert_rec)) {
+		if (UNIV_UNLIKELY
+		    (rec_get_info_and_status_bits(insert_rec, TRUE)
+		     != rec_get_info_and_status_bits(cursor_rec, TRUE))) {
+
+			goto need_extra_info;
+		}
+	} else {
+		if (UNIV_UNLIKELY
+		    (rec_get_info_and_status_bits(insert_rec, FALSE)
+		     != rec_get_info_and_status_bits(cursor_rec, FALSE))) {
+
+			goto need_extra_info;
+		}
+	}
+
+	if (extra_size != cur_extra_size || rec_size != cur_rec_size) {
+need_extra_info:
+		/* Write the record end segment length and, in the lowest
+		bit, the extra info storage flag (set here) */
+		log_ptr += mach_write_compressed(log_ptr,
+						 2 * (rec_size - i) + 1);
+
+		/* Write the info bits */
+		mach_write_to_1(log_ptr,
+				rec_get_info_and_status_bits(
+					insert_rec,
+					page_rec_is_comp(insert_rec)));
+		log_ptr++;
+
+		/* Write the record origin offset */
+		log_ptr += mach_write_compressed(log_ptr, extra_size);
+
+		/* Write the mismatch index */
+		log_ptr += mach_write_compressed(log_ptr, i);
+
+		ut_a(i < UNIV_PAGE_SIZE);
+		ut_a(extra_size < UNIV_PAGE_SIZE);
+	} else {
+		/* Write the record end segment length and, in the lowest
+		bit, the extra info storage flag (clear here) */
+		log_ptr += mach_write_compressed(log_ptr, 2 * (rec_size - i));
+	}
+
+	/* Write to the log the inserted index record end segment which
+	differs from the cursor record */
+
+	rec_size -= i;	/* from here on: length of the end segment */
+
+	if (log_ptr + rec_size <= log_end) {	/* fits in the opened space */
+		memcpy(log_ptr, ins_ptr, rec_size);
+		mlog_close(mtr, log_ptr + rec_size);
+	} else {
+		mlog_close(mtr, log_ptr);
+		ut_a(rec_size < UNIV_PAGE_SIZE);
+		mlog_catenate_string(mtr, ins_ptr, rec_size);
+	}
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_insert_rec_write_log(ins_rec,size,cur,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a log record of a record insert on a page and, when a page is
+given, applies it by inserting the reconstructed record after the cursor
+record.
+
+The record body read here consists of:
+- for non-short inserts, the 2-byte page offset of the cursor record;
+- a compressed end_seg_len, whose lowest bit tells whether extra info
+  follows and whose remaining bits (end_seg_len >> 1) give the length of
+  the record end segment;
+- if the flag bit is set: one byte of info and status bits, the
+  compressed record origin offset and the compressed mismatch index;
+- the record end segment itself.
+When the flag bit is not set, the info bits, the origin offset and the
+mismatch index are derived from the cursor record.
+
+For short inserts the cursor record is the predecessor of the page
+supremum.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_insert_rec(
+/*======================*/
+	ibool		is_short,/*!< in: TRUE if short inserts */
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	ulint	origin_offset;
+	ulint	end_seg_len;
+	ulint	mismatch_index;
+	ulint	offset			= 0;
+	page_t*	page;
+	rec_t*	cursor_rec;
+	byte	buf1[1024];
+	byte*	buf;
+	byte*	ptr2			= ptr;
+	ulint	info_and_status_bits = 0; /* remove warning */
+	page_cur_t cursor;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	page = block ? buf_block_get_frame(block) : NULL;
+
+	if (!is_short) {
+		/* Read the cursor rec offset as a 2-byte ulint */
+
+		if (UNIV_UNLIKELY(end_ptr < ptr + 2)) {
+
+			return(NULL);
+		}
+
+		offset = mach_read_from_2(ptr);
+		ptr += 2;
+
+		/* Validate the offset before it is ever used to form
+		a pointer into the page. */
+		if (UNIV_UNLIKELY(offset >= UNIV_PAGE_SIZE)) {
+
+			recv_sys->found_corrupt_log = TRUE;
+
+			return(NULL);
+		}
+	}
+
+	ptr = mach_parse_compressed(ptr, end_ptr, &end_seg_len);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	if (UNIV_UNLIKELY(end_seg_len >= UNIV_PAGE_SIZE << 1)) {
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+
+	if (end_seg_len & 0x1UL) {
+		/* Read the info bits */
+
+		if (end_ptr < ptr + 1) {
+
+			return(NULL);
+		}
+
+		info_and_status_bits = mach_read_from_1(ptr);
+		ptr++;
+
+		ptr = mach_parse_compressed(ptr, end_ptr, &origin_offset);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		ut_a(origin_offset < UNIV_PAGE_SIZE);
+
+		ptr = mach_parse_compressed(ptr, end_ptr, &mismatch_index);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		ut_a(mismatch_index < UNIV_PAGE_SIZE);
+	}
+
+	if (UNIV_UNLIKELY(end_ptr < ptr + (end_seg_len >> 1))) {
+
+		return(NULL);
+	}
+
+	if (!block) {
+
+		/* Parse only: nothing to apply */
+		return(ptr + (end_seg_len >> 1));
+	}
+
+	/* Locate the cursor record only now, when a page is known to be
+	present and the logged offset has been validated. */
+	if (is_short) {
+		cursor_rec = page_rec_get_prev(page_get_supremum_rec(page));
+	} else {
+		cursor_rec = page + offset;
+	}
+
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+	ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+	/* Read from the log the inserted index record end segment which
+	differs from the cursor record */
+
+	offsets = rec_get_offsets(cursor_rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+
+	if (!(end_seg_len & 0x1UL)) {
+		/* No extra info was logged: the inserted record shares
+		these properties with the cursor record */
+		info_and_status_bits = rec_get_info_and_status_bits(
+			cursor_rec, page_is_comp(page));
+		origin_offset = rec_offs_extra_size(offsets);
+		mismatch_index = rec_offs_size(offsets) - (end_seg_len >> 1);
+	}
+
+	end_seg_len >>= 1;
+
+	/* Sanity-check the mismatch index before it is used to size
+	the build buffer. */
+	if (UNIV_UNLIKELY(mismatch_index >= UNIV_PAGE_SIZE)) {
+		fprintf(stderr,
+			"Is short %lu, info_and_status_bits %lu, offset %lu, "
+			"o_offset %lu\n"
+			"mismatch index %lu, end_seg_len %lu\n"
+			"parsed len %lu\n",
+			(ulong) is_short, (ulong) info_and_status_bits,
+			(ulong) page_offset(cursor_rec),
+			(ulong) origin_offset,
+			(ulong) mismatch_index, (ulong) end_seg_len,
+			(ulong) (ptr - ptr2));
+
+		fputs("Dump of 300 bytes of log:\n", stderr);
+		ut_print_buf(stderr, ptr2, 300);
+		putc('\n', stderr);
+
+		buf_page_print(page, 0);
+
+		ut_error;
+	}
+
+	/* Use the stack buffer when the record fits, else allocate */
+	if (mismatch_index + end_seg_len < sizeof buf1) {
+		buf = buf1;
+	} else {
+		buf = mem_alloc(mismatch_index + end_seg_len);
+	}
+
+	/* Build the inserted record to buf: the first mismatch_index
+	bytes come from the cursor record, the rest from the log */
+
+	ut_memcpy(buf, rec_get_start(cursor_rec, offsets), mismatch_index);
+	ut_memcpy(buf + mismatch_index, ptr, end_seg_len);
+
+	if (page_is_comp(page)) {
+		rec_set_info_and_status_bits(buf + origin_offset,
+				     info_and_status_bits);
+	} else {
+		rec_set_info_bits_old(buf + origin_offset,
+							info_and_status_bits);
+	}
+
+	page_cur_position(cursor_rec, block, &cursor);
+
+	offsets = rec_get_offsets(buf + origin_offset, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	if (UNIV_UNLIKELY(!page_cur_rec_insert(&cursor,
+					       buf + origin_offset,
+					       index, offsets, mtr))) {
+		/* The redo log record should only have been written
+		after the write was successful. */
+		ut_error;
+	}
+
+	if (buf != buf1) {
+
+		mem_free(buf);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(ptr + end_seg_len);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor on an uncompressed page.
+Returns pointer to inserted record if succeed, i.e., enough
+space available, NULL otherwise. The cursor stays at the same position.
+
+Space for the record is taken from the head of the PAGE_FREE list when
+that record is at least as large as the new one, and from the page heap
+otherwise. After copying the record in, the function links it into the
+record list, updates PAGE_N_RECS, the last-insert/direction fields of the
+page header and the n_owned count of the owning directory slot, and, if
+mtr is not NULL, writes the redo log record of the insert.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_low(
+/*====================*/
+	rec_t*		current_rec,/*!< in: pointer to current record after
+				which the new record is inserted */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+{
+	byte*		insert_buf;
+	ulint		rec_size;
+	page_t*		page;		/*!< the relevant page */
+	rec_t*		last_insert;	/*!< cursor position at previous
+					insert */
+	rec_t*		free_rec;	/*!< a free record that was reused,
+					or NULL */
+	rec_t*		insert_rec;	/*!< inserted record */
+	ulint		heap_no;	/*!< heap number of the inserted
+					record */
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	page = page_align(current_rec);
+	ut_ad(dict_table_is_comp(index->table)
+	      == (ibool) !!page_is_comp(page));
+
+	ut_ad(!page_rec_is_supremum(current_rec));
+
+	/* 1. Get the size of the physical record in the page */
+	rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		const void*	rec_start
+			= rec - rec_offs_extra_size(offsets);
+		/* Size of the record header without the fixed-size
+		extra bytes of the record format in use */
+		ulint		extra_size
+			= rec_offs_extra_size(offsets)
+			- (rec_offs_comp(offsets)
+			   ? REC_N_NEW_EXTRA_BYTES
+			   : REC_N_OLD_EXTRA_BYTES);
+
+		/* All data bytes of the record must be valid. */
+		UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+		/* The variable-length header must be valid. */
+		UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	/* 2. Try to find suitable space from page memory management */
+
+	free_rec = page_header_get_ptr(page, PAGE_FREE);
+	if (UNIV_LIKELY_NULL(free_rec)) {
+		/* Try to allocate from the head of the free list. */
+		ulint		foffsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		foffsets	= foffsets_;
+		mem_heap_t*	heap		= NULL;
+
+		rec_offs_init(foffsets_);
+
+		foffsets = rec_get_offsets(free_rec, index, foffsets,
+					ULINT_UNDEFINED, &heap);
+		if (rec_offs_size(foffsets) < rec_size) {
+			/* The free record is too small for the new
+			record: fall back to the page heap */
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+
+			goto use_heap;
+		}
+
+		/* Reuse the space of the free record, starting at the
+		beginning of its header */
+		insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+		/* The inserted record inherits the heap number of the
+		reused record; the free list head advances to the next
+		free record */
+		if (page_is_comp(page)) {
+			heap_no = rec_get_heap_no_new(free_rec);
+			page_mem_alloc_free(page, NULL,
+					rec_get_next_ptr(free_rec, TRUE),
+					rec_size);
+		} else {
+			heap_no = rec_get_heap_no_old(free_rec);
+			page_mem_alloc_free(page, NULL,
+					rec_get_next_ptr(free_rec, FALSE),
+					rec_size);
+		}
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	} else {
+use_heap:
+		free_rec = NULL;
+		insert_buf = page_mem_alloc_heap(page, NULL,
+						 rec_size, &heap_no);
+
+		if (UNIV_UNLIKELY(insert_buf == NULL)) {
+			/* Not enough space: nothing has been modified
+			by this function so far */
+			return(NULL);
+		}
+	}
+
+	/* 3. Create the record */
+	insert_rec = rec_copy(insert_buf, rec, offsets);
+	/* From here on, offsets describe insert_rec instead of rec */
+	rec_offs_make_valid(insert_rec, index, offsets);
+
+	/* 4. Insert the record in the linked list of records */
+	ut_ad(current_rec != insert_rec);
+
+	{
+		/* next record after current before the insertion */
+		rec_t*	next_rec = page_rec_get_next(current_rec);
+#ifdef UNIV_DEBUG
+		if (page_is_comp(page)) {
+			ut_ad(rec_get_status(current_rec)
+				<= REC_STATUS_INFIMUM);
+			ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+			ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+		}
+#endif
+		page_rec_set_next(insert_rec, next_rec);
+		page_rec_set_next(current_rec, insert_rec);
+	}
+
+	page_header_set_field(page, NULL, PAGE_N_RECS,
+			      1 + page_get_n_recs(page));
+
+	/* 5. Set the n_owned field in the inserted record to zero,
+	and set the heap_no field */
+	if (page_is_comp(page)) {
+		rec_set_n_owned_new(insert_rec, NULL, 0);
+		rec_set_heap_no_new(insert_rec, heap_no);
+	} else {
+		rec_set_n_owned_old(insert_rec, 0);
+		rec_set_heap_no_old(insert_rec, heap_no);
+	}
+
+	UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+			   rec_offs_size(offsets));
+	/* 6. Update the last insertion info in page header */
+
+	last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+	ut_ad(!last_insert || !page_is_comp(page)
+	      || rec_get_node_ptr_flag(last_insert)
+	      == rec_get_node_ptr_flag(insert_rec));
+
+	if (UNIV_UNLIKELY(last_insert == NULL)) {
+		/* No previous insert position is recorded */
+		page_header_set_field(page, NULL, PAGE_DIRECTION,
+				      PAGE_NO_DIRECTION);
+		page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+
+	} else if ((last_insert == current_rec)
+		   && (page_header_get_field(page, PAGE_DIRECTION)
+		       != PAGE_LEFT)) {
+
+		/* The new record directly follows the previously
+		inserted one: count one more insert to the right */
+		page_header_set_field(page, NULL, PAGE_DIRECTION,
+							PAGE_RIGHT);
+		page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+				      page_header_get_field(
+					      page, PAGE_N_DIRECTION) + 1);
+
+	} else if ((page_rec_get_next(insert_rec) == last_insert)
+		   && (page_header_get_field(page, PAGE_DIRECTION)
+		       != PAGE_RIGHT)) {
+
+		/* The new record directly precedes the previously
+		inserted one: count one more insert to the left */
+		page_header_set_field(page, NULL, PAGE_DIRECTION,
+							PAGE_LEFT);
+		page_header_set_field(page, NULL, PAGE_N_DIRECTION,
+				      page_header_get_field(
+					      page, PAGE_N_DIRECTION) + 1);
+	} else {
+		/* Neither pattern continues: reset the direction info */
+		page_header_set_field(page, NULL, PAGE_DIRECTION,
+							PAGE_NO_DIRECTION);
+		page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+	}
+
+	page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, insert_rec);
+
+	/* 7. It remains to update the owner record. */
+	{
+		rec_t*	owner_rec	= page_rec_find_owner_rec(insert_rec);
+		ulint	n_owned;
+		if (page_is_comp(page)) {
+			n_owned = rec_get_n_owned_new(owner_rec);
+			rec_set_n_owned_new(owner_rec, NULL, n_owned + 1);
+		} else {
+			n_owned = rec_get_n_owned_old(owner_rec);
+			rec_set_n_owned_old(owner_rec, n_owned + 1);
+		}
+
+		/* 8. Now we have incremented the n_owned field of the owner
+		record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+		we have to split the corresponding directory slot in two. */
+
+		/* n_owned still holds the count from before the
+		increment, hence the comparison for equality */
+		if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+			page_dir_split_slot(
+				page, NULL,
+				page_dir_find_owner_slot(owner_rec));
+		}
+	}
+
+	/* 9. Write log record of the insert */
+	if (UNIV_LIKELY(mtr != NULL)) {
+		page_cur_insert_rec_write_log(insert_rec, rec_size,
+					      current_rec, index, mtr);
+	}
+
+	return(insert_rec);
+}
+
+/***********************************************************//**
+Compresses or reorganizes a page after an optimistic insert.
+
+First the page is recompressed as it is. If that fails, the page is
+reorganized and recompressed; because reorganizing moves records, the
+inserted record and *current_rec are then located again by their
+position in the record list. If that fails as well, the uncompressed
+page is restored from the compressed page and NULL is returned.
+@return	rec if succeed, NULL otherwise */
+static
+rec_t*
+page_cur_insert_rec_zip_reorg(
+/*==========================*/
+	rec_t**		current_rec,/*!< in/out: pointer to current record after
+				which the new record is inserted */
+	buf_block_t*	block,	/*!< in: buffer block */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	rec_t*		rec,	/*!< in: inserted record */
+	page_t*		page,	/*!< in: uncompressed page */
+	page_zip_des_t*	page_zip,/*!< in: compressed page */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+{
+	ulint		pos;
+
+	/* Recompress or reorganize and recompress the page. */
+	if (UNIV_LIKELY(page_zip_compress(page_zip, page, index, mtr))) {
+		return(rec);
+	}
+
+	/* Before trying to reorganize the page,
+	store the number of preceding records on the page. */
+	pos = page_rec_get_n_recs_before(rec);
+
+	/* The seek loop below pre-decrements pos, so pos == 0 would
+	wrap around instead of terminating the loop. */
+	ut_ad(pos > 0);
+
+	if (page_zip_reorganize(block, index, mtr)) {
+		/* The page was reorganized: Find rec by seeking to pos,
+		and update *current_rec. */
+		rec = page + PAGE_NEW_INFIMUM;
+
+		/* Advance pos - 1 records from the infimum to reach
+		the predecessor of the inserted record */
+		while (--pos) {
+			rec = page + rec_get_next_offs(rec, TRUE);
+		}
+
+		*current_rec = rec;
+		rec = page + rec_get_next_offs(rec, TRUE);
+
+		return(rec);
+	}
+
+	/* Out of space: restore the page */
+	if (!page_zip_decompress(page_zip, page, FALSE)) {
+		ut_error; /* Memory corrupted? */
+	}
+	ut_ad(page_validate(page, index));
+	return(NULL);
+}
+
+/***********************************************************//**
+Inserts a record next to page cursor on a compressed and uncompressed
+page. Returns pointer to inserted record if succeed, i.e.,
+enough space available, NULL otherwise.
+The cursor stays at the same position.
+
+If page_zip_available() reports that the record does not fit in the
+compressed page as it is, the record is inserted into the uncompressed
+page only (without logging) and page_cur_insert_rec_zip_reorg() is then
+asked to recompress or reorganize the page; *current_rec may be updated
+by that call. Otherwise the record is inserted into both page images
+and, if mtr is not NULL, the redo log record of the insert is written.
+@return	pointer to record if succeed, NULL otherwise */
+UNIV_INTERN
+rec_t*
+page_cur_insert_rec_zip(
+/*====================*/
+	rec_t**		current_rec,/*!< in/out: pointer to current record after
+				which the new record is inserted */
+	buf_block_t*	block,	/*!< in: buffer block of *current_rec */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const rec_t*	rec,	/*!< in: pointer to a physical record */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(rec, index) */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle, or NULL */
+{
+	byte*		insert_buf;
+	ulint		rec_size;
+	page_t*		page;		/*!< the relevant page */
+	rec_t*		last_insert;	/*!< cursor position at previous
+					insert */
+	rec_t*		free_rec;	/*!< a free record that was reused,
+					or NULL */
+	rec_t*		insert_rec;	/*!< inserted record */
+	ulint		heap_no;	/*!< heap number of the inserted
+					record */
+	page_zip_des_t*	page_zip;
+
+	page_zip = buf_block_get_page_zip(block);
+	ut_ad(page_zip);
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	page = page_align(*current_rec);
+	ut_ad(dict_table_is_comp(index->table));
+	ut_ad(page_is_comp(page));
+
+	ut_ad(!page_rec_is_supremum(*current_rec));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	/* 1. Get the size of the physical record in the page */
+	rec_size = rec_offs_size(offsets);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	{
+		const void*	rec_start
+			= rec - rec_offs_extra_size(offsets);
+		/* Size of the record header without the fixed-size
+		extra bytes of the record format in use */
+		ulint		extra_size
+			= rec_offs_extra_size(offsets)
+			- (rec_offs_comp(offsets)
+			   ? REC_N_NEW_EXTRA_BYTES
+			   : REC_N_OLD_EXTRA_BYTES);
+
+		/* All data bytes of the record must be valid. */
+		UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+		/* The variable-length header must be valid. */
+		UNIV_MEM_ASSERT_RW(rec_start, extra_size);
+	}
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	/* 2. Try to find suitable space from page memory management */
+	if (!page_zip_available(page_zip, dict_index_is_clust(index),
+				rec_size, 1)) {
+
+		/* Try compressing the whole page afterwards. */
+		/* The insert into the uncompressed page is done with
+		mtr == NULL, i.e. without writing an insert log record;
+		mtr is passed on to the recompression step instead. */
+		insert_rec = page_cur_insert_rec_low(*current_rec,
+						     index, rec, offsets,
+						     NULL);
+
+		if (UNIV_LIKELY(insert_rec != NULL)) {
+			insert_rec = page_cur_insert_rec_zip_reorg(
+				current_rec, block, index, insert_rec,
+				page, page_zip, mtr);
+		}
+
+		return(insert_rec);
+	}
+
+	free_rec = page_header_get_ptr(page, PAGE_FREE);
+	if (UNIV_LIKELY_NULL(free_rec)) {
+		/* Try to allocate from the head of the free list. */
+		lint	extra_size_diff;
+		ulint		foffsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		foffsets	= foffsets_;
+		mem_heap_t*	heap		= NULL;
+
+		rec_offs_init(foffsets_);
+
+		foffsets = rec_get_offsets(free_rec, index, foffsets,
+					ULINT_UNDEFINED, &heap);
+		if (rec_offs_size(foffsets) < rec_size) {
+too_small:
+			/* The free record cannot be reused: release the
+			offsets heap and allocate from the page heap */
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+
+			goto use_heap;
+		}
+
+		insert_buf = free_rec - rec_offs_extra_size(foffsets);
+
+		/* On compressed pages, do not relocate records from
+		the free list.  If extra_size would grow, use the heap. */
+		extra_size_diff
+			= rec_offs_extra_size(offsets)
+			- rec_offs_extra_size(foffsets);
+
+		if (UNIV_UNLIKELY(extra_size_diff < 0)) {
+			/* Add an offset to the extra_size. */
+			/* The new header is shorter than the old one:
+			the record must still fit when its start is
+			shifted by the difference, so that its origin
+			coincides with that of free_rec */
+			if (rec_offs_size(foffsets)
+			    < rec_size - extra_size_diff) {
+
+				goto too_small;
+			}
+
+			insert_buf -= extra_size_diff;
+		} else if (UNIV_UNLIKELY(extra_size_diff)) {
+			/* Do not allow extra_size to grow */
+
+			goto too_small;
+		}
+
+		/* The inserted record inherits the heap number of the
+		reused record; the free list head advances to the next
+		free record */
+		heap_no = rec_get_heap_no_new(free_rec);
+		page_mem_alloc_free(page, page_zip,
+				    rec_get_next_ptr(free_rec, TRUE),
+				    rec_size);
+
+		if (!page_is_leaf(page)) {
+			/* Zero out the node pointer of free_rec,
+			in case it will not be overwritten by
+			insert_rec. */
+
+			ut_ad(rec_size > REC_NODE_PTR_SIZE);
+
+			if (rec_offs_extra_size(foffsets)
+			    + rec_offs_data_size(foffsets) > rec_size) {
+
+				memset(rec_get_end(free_rec, foffsets)
+				       - REC_NODE_PTR_SIZE, 0,
+				       REC_NODE_PTR_SIZE);
+			}
+		} else if (dict_index_is_clust(index)) {
+			/* Zero out the DB_TRX_ID and DB_ROLL_PTR
+			columns of free_rec, in case it will not be
+			overwritten by insert_rec. */
+
+			ulint	trx_id_col;
+			ulint	trx_id_offs;
+			ulint	len;
+
+			trx_id_col = dict_index_get_sys_col_pos(index,
+								DATA_TRX_ID);
+			ut_ad(trx_id_col > 0);
+			ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+			trx_id_offs = rec_get_nth_field_offs(foffsets,
+							     trx_id_col, &len);
+			ut_ad(len == DATA_TRX_ID_LEN);
+
+			if (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN + trx_id_offs
+			    + rec_offs_extra_size(foffsets) > rec_size) {
+				/* We will have to zero out the
+				DB_TRX_ID and DB_ROLL_PTR, because
+				they will not be fully overwritten by
+				insert_rec. */
+
+				memset(free_rec + trx_id_offs, 0,
+				       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+			}
+
+			/* Debug check: DB_ROLL_PTR immediately follows
+			DB_TRX_ID in free_rec */
+			ut_ad(free_rec + trx_id_offs + DATA_TRX_ID_LEN
+			      == rec_get_nth_field(free_rec, foffsets,
+						   trx_id_col + 1, &len));
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+		}
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	} else {
+use_heap:
+		free_rec = NULL;
+		insert_buf = page_mem_alloc_heap(page, page_zip,
+						 rec_size, &heap_no);
+
+		if (UNIV_UNLIKELY(insert_buf == NULL)) {
+			return(NULL);
+		}
+
+		/* A record allocated from the heap needs a new slot
+		in the compressed page directory */
+		page_zip_dir_add_slot(page_zip, dict_index_is_clust(index));
+	}
+
+	/* 3. Create the record */
+	insert_rec = rec_copy(insert_buf, rec, offsets);
+	/* From here on, offsets describe insert_rec instead of rec */
+	rec_offs_make_valid(insert_rec, index, offsets);
+
+	/* 4. Insert the record in the linked list of records */
+	ut_ad(*current_rec != insert_rec);
+
+	{
+		/* next record after current before the insertion */
+		rec_t*	next_rec = page_rec_get_next(*current_rec);
+		ut_ad(rec_get_status(*current_rec)
+		      <= REC_STATUS_INFIMUM);
+		ut_ad(rec_get_status(insert_rec) < REC_STATUS_INFIMUM);
+		ut_ad(rec_get_status(next_rec) != REC_STATUS_INFIMUM);
+
+		page_rec_set_next(insert_rec, next_rec);
+		page_rec_set_next(*current_rec, insert_rec);
+	}
+
+	page_header_set_field(page, page_zip, PAGE_N_RECS,
+			      1 + page_get_n_recs(page));
+
+	/* 5. Set the n_owned field in the inserted record to zero,
+	and set the heap_no field */
+	rec_set_n_owned_new(insert_rec, NULL, 0);
+	rec_set_heap_no_new(insert_rec, heap_no);
+
+	UNIV_MEM_ASSERT_RW(rec_get_start(insert_rec, offsets),
+			   rec_offs_size(offsets));
+
+	page_zip_dir_insert(page_zip, *current_rec, free_rec, insert_rec);
+
+	/* 6. Update the last insertion info in page header */
+
+	last_insert = page_header_get_ptr(page, PAGE_LAST_INSERT);
+	ut_ad(!last_insert
+	      || rec_get_node_ptr_flag(last_insert)
+	      == rec_get_node_ptr_flag(insert_rec));
+
+	if (UNIV_UNLIKELY(last_insert == NULL)) {
+		/* No previous insert position is recorded */
+		page_header_set_field(page, page_zip, PAGE_DIRECTION,
+							PAGE_NO_DIRECTION);
+		page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+	} else if ((last_insert == *current_rec)
+		   && (page_header_get_field(page, PAGE_DIRECTION)
+		       != PAGE_LEFT)) {
+
+		/* The new record directly follows the previously
+		inserted one: count one more insert to the right */
+		page_header_set_field(page, page_zip, PAGE_DIRECTION,
+							PAGE_RIGHT);
+		page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+				      page_header_get_field(
+					      page, PAGE_N_DIRECTION) + 1);
+
+	} else if ((page_rec_get_next(insert_rec) == last_insert)
+		   && (page_header_get_field(page, PAGE_DIRECTION)
+		       != PAGE_RIGHT)) {
+
+		/* The new record directly precedes the previously
+		inserted one: count one more insert to the left */
+		page_header_set_field(page, page_zip, PAGE_DIRECTION,
+							PAGE_LEFT);
+		page_header_set_field(page, page_zip, PAGE_N_DIRECTION,
+				      page_header_get_field(
+					      page, PAGE_N_DIRECTION) + 1);
+	} else {
+		/* Neither pattern continues: reset the direction info */
+		page_header_set_field(page, page_zip, PAGE_DIRECTION,
+							PAGE_NO_DIRECTION);
+		page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+	}
+
+	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, insert_rec);
+
+	/* 7. It remains to update the owner record. */
+	{
+		rec_t*	owner_rec	= page_rec_find_owner_rec(insert_rec);
+		ulint	n_owned;
+
+		n_owned = rec_get_n_owned_new(owner_rec);
+		rec_set_n_owned_new(owner_rec, page_zip, n_owned + 1);
+
+		/* 8. Now we have incremented the n_owned field of the owner
+		record. If the number exceeds PAGE_DIR_SLOT_MAX_N_OWNED,
+		we have to split the corresponding directory slot in two. */
+
+		/* n_owned still holds the count from before the
+		increment, hence the comparison for equality */
+		if (UNIV_UNLIKELY(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED)) {
+			page_dir_split_slot(
+				page, page_zip,
+				page_dir_find_owner_slot(owner_rec));
+		}
+	}
+
+	/* Copy the finished record to the compressed page as well */
+	page_zip_write_rec(page_zip, insert_rec, index, offsets, 1);
+
+	/* 9. Write log record of the insert */
+	if (UNIV_LIKELY(mtr != NULL)) {
+		page_cur_insert_rec_write_log(insert_rec, rec_size,
+					      *current_rec, index, mtr);
+	}
+
+	return(insert_rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Opens the log record of copying a record list end to a new created page
+and reserves in it a 4-byte field for the length of the log data.
+@return 4-byte field where to write the log data length, or NULL if
+logging is disabled */
+UNIV_INLINE
+byte*
+page_copy_rec_list_to_created_page_write_log(
+/*=========================================*/
+	page_t*		page,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	len_field;
+	byte	type;
+
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	/* The record type depends on the row format of the page */
+	if (page_is_comp(page)) {
+		type = MLOG_COMP_LIST_END_COPY_CREATED;
+	} else {
+		type = MLOG_LIST_END_COPY_CREATED;
+	}
+
+	len_field = mlog_open_and_write_index(mtr, page, index, type, 4);
+
+	if (UNIV_UNLIKELY(len_field == NULL)) {
+
+		return(NULL);
+	}
+
+	/* Close the log right after the 4 reserved bytes */
+	mlog_close(mtr, len_field + 4);
+
+	return(len_field);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a log record of copying a record list end to a new created page.
+
+The record body is a 4-byte length followed by that many bytes of short
+insert records. When a page is given, the inserts are applied to it one
+by one and the last-insert and direction fields of the page header are
+reset afterwards.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_copy_rec_list_to_created_page(
+/*=====================================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	byte*		rec_end;
+	ulint		log_data_len;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+
+	if (ptr + 4 > end_ptr) {
+
+		return(NULL);
+	}
+
+	log_data_len = mach_read_from_4(ptr);
+	ptr += 4;
+
+	rec_end = ptr + log_data_len;
+
+	if (rec_end > end_ptr) {
+
+		/* The whole record is not in the buffer yet */
+		return(NULL);
+	}
+
+	if (!block) {
+
+		/* Parse only: skip over the logged inserts */
+		return(rec_end);
+	}
+
+	while (ptr < rec_end) {
+		ptr = page_cur_parse_insert_rec(TRUE, ptr, end_ptr,
+						block, index, mtr);
+
+		/* The insert parser returns NULL on a truncated or
+		corrupt record. NULL must not be compared against
+		rec_end or fed back into the parser: fail explicitly. */
+		ut_a(ptr != NULL);
+	}
+
+	/* The inserts must consume exactly the logged length */
+	ut_a(ptr == rec_end);
+
+	page = buf_block_get_frame(block);
+	page_zip = buf_block_get_page_zip(block);
+
+	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+	page_header_set_field(page, page_zip, PAGE_DIRECTION,
+							PAGE_NO_DIRECTION);
+	page_header_set_field(page, page_zip, PAGE_N_DIRECTION, 0);
+
+	return(rec_end);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to a newly created page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+
+The records are copied one after another to consecutive positions on
+new_page, and the record list, the page directory and the page header of
+new_page are built directly rather than through page_cur_insert_rec().
+The copy is logged as one list-copy record followed by one short insert
+record per copied record; the length of those insert records is written
+into the list-copy record afterwards. If the first record to copy is the
+supremum, nothing is done. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_to_created_page(
+/*===================================*/
+	page_t*		new_page,	/*!< in/out: index page to copy to */
+	rec_t*		rec,		/*!< in: first record to copy */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_dir_slot_t* slot = 0; /* remove warning */
+	byte*	heap_top;
+	rec_t*	insert_rec = 0; /* remove warning */
+	rec_t*	prev_rec;
+	ulint	count;
+	ulint	n_recs;
+	ulint	slot_index;
+	ulint	rec_size;
+	ulint	log_mode;
+	byte*	log_ptr;
+	ulint	log_data_len;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW);
+	ut_ad(page_align(rec) != new_page);
+	ut_ad(page_rec_is_comp(rec) == page_is_comp(new_page));
+
+	if (page_rec_is_infimum(rec)) {
+
+		/* The infimum is not copied: start from its successor */
+		rec = page_rec_get_next(rec);
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		/* There are no user records to copy */
+		return;
+	}
+
+#ifdef UNIV_DEBUG
+	/* To pass the debug tests we have to set these dummy values
+	in the debug version */
+	page_dir_set_n_slots(new_page, NULL, UNIV_PAGE_SIZE / 2);
+	page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP,
+			    new_page + UNIV_PAGE_SIZE - 1);
+#endif
+
+	/* log_ptr points to the 4-byte length field of the list-copy
+	record, or is NULL if nothing was logged */
+	log_ptr = page_copy_rec_list_to_created_page_write_log(new_page,
+							       index, mtr);
+
+	/* Remember the size of the mtr log before the inserts; the
+	difference is computed after the loop */
+	log_data_len = dyn_array_get_data_size(&(mtr->log));
+
+	/* Individual inserts are logged in a shorter form */
+
+	log_mode = mtr_set_log_mode(mtr, MTR_LOG_SHORT_INSERTS);
+
+	prev_rec = page_get_infimum_rec(new_page);
+	/* The first copied record is placed at the supremum end offset
+	of the page format in use */
+	if (page_is_comp(new_page)) {
+		heap_top = new_page + PAGE_NEW_SUPREMUM_END;
+	} else {
+		heap_top = new_page + PAGE_OLD_SUPREMUM_END;
+	}
+	count = 0;
+	slot_index = 0;
+	n_recs = 0;
+
+	do {
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		insert_rec = rec_copy(heap_top, rec, offsets);
+
+		/* Link the copy after the previously copied record and
+		give it the next heap number */
+		if (page_is_comp(new_page)) {
+			rec_set_next_offs_new(prev_rec,
+					      page_offset(insert_rec));
+
+			rec_set_n_owned_new(insert_rec, NULL, 0);
+			rec_set_heap_no_new(insert_rec,
+					    PAGE_HEAP_NO_USER_LOW + n_recs);
+		} else {
+			rec_set_next_offs_old(prev_rec,
+					      page_offset(insert_rec));
+
+			rec_set_n_owned_old(insert_rec, 0);
+			rec_set_heap_no_old(insert_rec,
+					    PAGE_HEAP_NO_USER_LOW + n_recs);
+		}
+
+		count++;
+		n_recs++;
+
+		/* Every (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2 records,
+		create a new directory slot that owns the records
+		copied since the previous slot */
+		if (UNIV_UNLIKELY
+		    (count == (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2)) {
+
+			slot_index++;
+
+			slot = page_dir_get_nth_slot(new_page, slot_index);
+
+			page_dir_slot_set_rec(slot, insert_rec);
+			page_dir_slot_set_n_owned(slot, NULL, count);
+
+			count = 0;
+		}
+
+		rec_size = rec_offs_size(offsets);
+
+		ut_ad(heap_top < new_page + UNIV_PAGE_SIZE);
+
+		heap_top += rec_size;
+
+		page_cur_insert_rec_write_log(insert_rec, rec_size, prev_rec,
+					      index, mtr);
+		prev_rec = insert_rec;
+		rec = page_rec_get_next(rec);
+	} while (!page_rec_is_supremum(rec));
+
+	if ((slot_index > 0) && (count + 1
+				 + (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2
+				 <= PAGE_DIR_SLOT_MAX_N_OWNED)) {
+		/* We can merge the two last dir slots. This operation is
+		here to make this function imitate exactly the equivalent
+		task made using page_cur_insert_rec, which we use in database
+		recovery to reproduce the task performed by this function.
+		To be able to check the correctness of recovery, it is good
+		that it imitates exactly. */
+
+		count += (PAGE_DIR_SLOT_MAX_N_OWNED + 1) / 2;
+
+		/* slot still refers to the last slot created in the
+		loop: clear its count and give the slot position up,
+		so that it is reused for the supremum slot below */
+		page_dir_slot_set_n_owned(slot, NULL, 0);
+
+		slot_index--;
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	/* Number of log bytes produced by the inserts above */
+	log_data_len = dyn_array_get_data_size(&(mtr->log)) - log_data_len;
+
+	ut_a(log_data_len < 100 * UNIV_PAGE_SIZE);
+
+	if (UNIV_LIKELY(log_ptr != NULL)) {
+		/* Fill in the length field reserved in the
+		list-copy record */
+		mach_write_to_4(log_ptr, log_data_len);
+	}
+
+	/* Terminate the record list: the last copied record is
+	followed by the supremum */
+	if (page_is_comp(new_page)) {
+		rec_set_next_offs_new(insert_rec, PAGE_NEW_SUPREMUM);
+	} else {
+		rec_set_next_offs_old(insert_rec, PAGE_OLD_SUPREMUM);
+	}
+
+	/* The last slot points to the supremum, which owns itself and
+	the records not yet assigned to a slot */
+	slot = page_dir_get_nth_slot(new_page, 1 + slot_index);
+
+	page_dir_slot_set_rec(slot, page_get_supremum_rec(new_page));
+	page_dir_slot_set_n_owned(slot, NULL, count + 1);
+
+	page_dir_set_n_slots(new_page, NULL, 2 + slot_index);
+	page_header_set_ptr(new_page, NULL, PAGE_HEAP_TOP, heap_top);
+	page_dir_set_n_heap(new_page, NULL, PAGE_HEAP_NO_USER_LOW + n_recs);
+	page_header_set_field(new_page, NULL, PAGE_N_RECS, n_recs);
+
+	page_header_set_ptr(new_page, NULL, PAGE_LAST_INSERT, NULL);
+	page_header_set_field(new_page, NULL, PAGE_DIRECTION,
+							PAGE_NO_DIRECTION);
+	page_header_set_field(new_page, NULL, PAGE_N_DIRECTION, 0);
+
+	/* Restore the log mode */
+
+	mtr_set_log_mode(mtr, log_mode);
+}
+
+/***********************************************************//**
+Writes the log record of deleting a record from a page. The body of the
+log record is the page offset of the deleted record in 2 bytes. */
+UNIV_INLINE
+void
+page_cur_delete_rec_write_log(
+/*==========================*/
+	rec_t*		rec,	/*!< in: record to be deleted */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	byte*	out;
+	byte	type;
+
+	ut_ad(!!page_rec_is_comp(rec) == dict_table_is_comp(index->table));
+
+	/* The record type depends on the row format of the record */
+	type = page_rec_is_comp(rec) ? MLOG_COMP_REC_DELETE : MLOG_REC_DELETE;
+
+	out = mlog_open_and_write_index(mtr, rec, index, type, 2);
+
+	if (out != NULL) {
+		/* Write the cursor rec offset as a 2-byte ulint */
+		mach_write_to_2(out, page_offset(rec));
+
+		mlog_close(mtr, out + 2);
+	}
+
+	/* Otherwise there is nothing to write: logging in mtr is
+	switched off during crash recovery, and in that case mlog_open
+	returns NULL */
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_cur_delete_rec_write_log(rec,index,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses log record of a record delete on a page.
+
+The record body is the 2-byte page offset of the record to delete. When
+a page is given, the record at that offset is deleted from it with
+page_cur_delete_rec().
+@return	pointer to record end or NULL */
+UNIV_INTERN
+byte*
+page_cur_parse_delete_rec(
+/*======================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in: page or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	ulint		offset;
+	page_cur_t	cursor;
+
+	if (end_ptr < ptr + 2) {
+
+		return(NULL);
+	}
+
+	/* Read the cursor rec offset as a 2-byte ulint */
+	offset = mach_read_from_2(ptr);
+	ptr += 2;
+
+	/* The offset must lie strictly inside the page: an offset equal
+	to UNIV_PAGE_SIZE would point one byte past the page frame. */
+	ut_a(offset < UNIV_PAGE_SIZE);
+
+	if (block) {
+		page_t*		page		= buf_block_get_frame(block);
+		mem_heap_t*	heap		= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		rec_t*		rec		= page + offset;
+		rec_offs_init(offsets_);
+
+		page_cur_position(rec, block, &cursor);
+		ut_ad(!buf_block_get_page_zip(block) || page_is_comp(page));
+
+		page_cur_delete_rec(&cursor, index,
+				    rec_get_offsets(rec, index, offsets_,
+						    ULINT_UNDEFINED, &heap),
+				    mtr);
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+
+	return(ptr);
+}
+
+/***********************************************************//**
+Deletes a record at the page cursor. The cursor is moved to the next
+record after the deleted one.
+
+The delete is logged first. Then the record is unlinked from the record
+list, the directory slot that owns it is updated, the space of the
+record is freed with page_mem_free(), and the directory slots are
+balanced if the owning slot had no more than PAGE_DIR_SLOT_MIN_N_OWNED
+records before the delete. PAGE_LAST_INSERT is reset and the modify
+clock of the block is incremented. */
+UNIV_INTERN
+void
+page_cur_delete_rec(
+/*================*/
+	page_cur_t*	cursor,	/*!< in/out: a page cursor */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	const ulint*	offsets,/*!< in: rec_get_offsets(cursor->rec, index) */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle */
+{
+	page_dir_slot_t* cur_dir_slot;
+	page_dir_slot_t* prev_slot;
+	page_t*		page;
+	page_zip_des_t*	page_zip;
+	rec_t*		current_rec;
+	rec_t*		prev_rec	= NULL;
+	rec_t*		next_rec;
+	ulint		cur_slot_no;
+	ulint		cur_n_owned;
+	rec_t*		rec;
+
+	ut_ad(cursor && mtr);
+
+	page = page_cur_get_page(cursor);
+	page_zip = page_cur_get_page_zip(cursor);
+
+	/* page_zip_validate() will fail here when
+	btr_cur_pessimistic_delete() invokes btr_set_min_rec_mark().
+	Then, both "page_zip" and "page" would have the min-rec-mark
+	set on the smallest user record, but "page" would additionally
+	have it set on the smallest-but-one record.  Because sloppy
+	page_zip_validate_low() only ignores min-rec-flag differences
+	in the smallest user record, it cannot be used here either. */
+
+	current_rec = cursor->rec;
+	ut_ad(rec_offs_validate(current_rec, index, offsets));
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	/* The record must not be the supremum or infimum record. */
+	ut_ad(page_rec_is_user_rec(current_rec));
+
+	/* Save to local variables some data associated with current_rec */
+	cur_slot_no = page_dir_find_owner_slot(current_rec);
+	cur_dir_slot = page_dir_get_nth_slot(page, cur_slot_no);
+	cur_n_owned = page_dir_slot_get_n_owned(cur_dir_slot);
+
+	/* 0. Write the log record */
+	page_cur_delete_rec_write_log(current_rec, index, mtr);
+
+	/* 1. Reset the last insert info in the page header and increment
+	the modify clock for the frame */
+
+	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+	/* The page gets invalid for optimistic searches: increment the
+	frame modify clock */
+
+	buf_block_modify_clock_inc(page_cur_get_block(cursor));
+
+	/* 2. Find the next and the previous record. Note that the cursor is
+	left at the next record. */
+
+	ut_ad(cur_slot_no > 0);
+	prev_slot = page_dir_get_nth_slot(page, cur_slot_no - 1);
+
+	rec = (rec_t*) page_dir_slot_get_rec(prev_slot);
+
+	/* rec now points to the record of the previous directory slot. Look
+	for the immediate predecessor of current_rec in a loop. */
+
+	/* NOTE(review): prev_rec stays NULL if the loop body never runs,
+	i.e. if the previous slot's record were current_rec itself;
+	presumably that cannot happen because current_rec is owned by
+	slot cur_slot_no - confirm. */
+	while(current_rec != rec) {
+		prev_rec = rec;
+		rec = page_rec_get_next(rec);
+	}
+
+	page_cur_move_to_next(cursor);
+	next_rec = cursor->rec;
+
+	/* 3. Remove the record from the linked list of records */
+
+	page_rec_set_next(prev_rec, next_rec);
+
+	/* 4. If the deleted record is pointed to by a dir slot, update the
+	record pointer in slot. In the following if-clause we assume that
+	prev_rec is owned by the same slot, i.e., PAGE_DIR_SLOT_MIN_N_OWNED
+	>= 2. */
+
+#if PAGE_DIR_SLOT_MIN_N_OWNED < 2
+# error "PAGE_DIR_SLOT_MIN_N_OWNED < 2"
+#endif
+	ut_ad(cur_n_owned > 1);
+
+	if (current_rec == page_dir_slot_get_rec(cur_dir_slot)) {
+		page_dir_slot_set_rec(cur_dir_slot, prev_rec);
+	}
+
+	/* 5. Update the number of owned records of the slot */
+
+	page_dir_slot_set_n_owned(cur_dir_slot, page_zip, cur_n_owned - 1);
+
+	/* 6. Free the memory occupied by the record */
+	page_mem_free(page, page_zip, current_rec, index, offsets);
+
+	/* 7. Now we have decremented the number of owned records of the slot.
+	If the number drops below PAGE_DIR_SLOT_MIN_N_OWNED, we balance the
+	slots. */
+
+	/* cur_n_owned still holds the count from before the decrement,
+	hence the comparison with <= */
+	if (UNIV_UNLIKELY(cur_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED)) {
+		page_dir_balance_slot(page, page_zip, cur_slot_no);
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+/*******************************************************************//**
+Prints the first n values returned by page_cur_lcg_prng(), one per line,
+each followed by its remainders modulo 2, 3, 5, 7 and 11, so that the
+output of the generator can be checked by eye. */
+void
+test_page_cur_lcg_prng(
+/*===================*/
+	int	n)	/*!< in: print first n numbers */
+{
+	int	remaining;
+
+	for (remaining = n; remaining > 0; remaining--) {
+		const unsigned long long	value = page_cur_lcg_prng();
+
+		printf("%llu\t%%2=%llu %%3=%llu %%5=%llu %%7=%llu %%11=%llu\n",
+		       value, value % 2, value % 3, value % 5, value % 7,
+		       value % 11);
+	}
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/page/page0page.c b/storage/xtradb/page/page0page.c
new file mode 100644
index 00000000000..10008f9ac25
--- /dev/null
+++ b/storage/xtradb/page/page0page.c
@@ -0,0 +1,2624 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0page.c
+Index page routines
+
+Created 2/2/1994 Heikki Tuuri
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0page.h"
+#ifdef UNIV_NONINL
+#include "page0page.ic"
+#endif
+#undef THIS_MODULE
+
+#include "page0cur.h"
+#include "page0zip.h"
+#include "buf0buf.h"
+#include "btr0btr.h"
+#ifndef UNIV_HOTBACKUP
+# include "srv0srv.h"
+# include "lock0lock.h"
+# include "fut0lst.h"
+# include "btr0sea.h"
+#endif /* !UNIV_HOTBACKUP */
+
+/*			THE INDEX PAGE
+			==============
+
+The index page consists of a page header which contains the page's
+id and other information. On top of it are the index records
+in a heap linked into a one way linear list according to alphabetic order.
+
+Just below page end is an array of pointers which we call page directory,
+to about every sixth record in the list. The pointers are placed in
+the directory in the alphabetical order of the records pointed to,
+enabling us to make binary search using the array. Each slot n:o I
+in the directory points to a record, where a 4-bit field contains a count
+of those records which are in the linear list between pointer I and
+the pointer I - 1 in the directory, including the record
+pointed to by pointer I and not including the record pointed to by I - 1.
+We say that the record pointed to by slot I, or that slot I, owns
+these records. The count is always kept in the range 4 to 8, with
+the exception that it is 1 for the first slot, and 1--8 for the last slot.
+
+An essentially binary search can be performed in the list of index
+records, like we could do if we had pointer to every record in the
+page directory. The data structure is, however, more efficient when
+we are doing inserts, because most inserts are just pushed on a heap.
+Only every 8th insert requires block move in the directory pointer
+table, which itself is quite small. A record is deleted from the page
+by just taking it off the linear list and updating the number of owned
+records-field of the record which owns it, and updating the page directory,
+if necessary. A special case is the one when the record owns itself.
+Because the overhead of inserts is so small, we may also increase the
+page size from the projected default of 8 kB to 64 kB without too
+much loss of efficiency in inserts. A bigger page becomes worthwhile
+when the disk transfer rate rises relative to seek and latency time.
+On the present system, the page size is set so that the page transfer
+time (3 ms) is 20 % of the disk random access time (15 ms).
+
+When the page is split, merged, or becomes full but contains deleted
+records, we have to reorganize the page.
+
+Assuming a page size of 8 kB, a typical index page of a secondary
+index contains 300 index entries, and the size of the page directory
+is 50 x 4 bytes = 200 bytes. */
+
+/***************************************************************//**
+Looks for the directory slot which owns the given record (ut_error if none).
+@return	the directory slot number */
+UNIV_INTERN
+ulint
+page_dir_find_owner_slot(
+/*=====================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	const page_t*			page;
+	register uint16			rec_offs_bytes;
+	register const page_dir_slot_t*	slot;
+	register const page_dir_slot_t*	first_slot;
+	register const rec_t*		r = rec;
+
+	ut_ad(page_rec_check(rec));
+
+	page = page_align(rec);
+	first_slot = page_dir_get_nth_slot(page, 0);
+	slot = page_dir_get_nth_slot(page, page_dir_get_n_slots(page) - 1);
+	/* Walk the record list until a record with a nonzero n_owned field. */
+	if (page_is_comp(page)) {
+		while (rec_get_n_owned_new(r) == 0) {
+			r = rec_get_next_ptr_const(r, TRUE);
+			ut_ad(r >= page + PAGE_NEW_SUPREMUM);
+			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+		}
+	} else {
+		while (rec_get_n_owned_old(r) == 0) {
+			r = rec_get_next_ptr_const(r, FALSE);
+			ut_ad(r >= page + PAGE_OLD_SUPREMUM);
+			ut_ad(r < page + (UNIV_PAGE_SIZE - PAGE_DIR));
+		}
+	}
+	/* Slots are compared as raw 2-byte values: encode the offset once. */
+	rec_offs_bytes = mach_encode_2(r - page);
+	/* Scan from the last slot towards slot 0 for one holding that value. */
+	while (UNIV_LIKELY(*(uint16*) slot != rec_offs_bytes)) {
+
+		if (UNIV_UNLIKELY(slot == first_slot)) {
+			fprintf(stderr,
+				"InnoDB: Probable data corruption on"
+				" page %lu\n"
+				"InnoDB: Original record ",
+				(ulong) page_get_page_no(page));
+
+			if (page_is_comp(page)) {
+				fputs("(compact record)", stderr);
+			} else {
+				rec_print_old(stderr, rec);
+			}
+
+			fputs("\n"
+			      "InnoDB: on that page.\n"
+			      "InnoDB: Cannot find the dir slot for record ",
+			      stderr);
+			if (page_is_comp(page)) {
+				fputs("(compact record)", stderr);
+			} else {
+				rec_print_old(stderr, page
+					      + mach_decode_2(rec_offs_bytes));
+			}
+			fputs("\n"
+			      "InnoDB: on that page!\n", stderr);
+
+			buf_page_print(page, 0);
+
+			ut_error;
+		}
+
+		slot += PAGE_DIR_SLOT_SIZE;
+	}
+	/* Convert the pointer distance from slot 0 into a slot number. */
+	return(((ulint) (first_slot - slot)) / PAGE_DIR_SLOT_SIZE);
+}
+
+/**************************************************************//**
+Checks the consistency of a directory slot; any violation trips ut_a().
+@return	TRUE if succeed (a failed check asserts instead of returning) */
+static
+ibool
+page_dir_slot_check(
+/*================*/
+	page_dir_slot_t*	slot)	/*!< in: slot */
+{
+	page_t*	page;
+	ulint	n_slots;
+	ulint	n_owned;
+
+	ut_a(slot);
+
+	page = page_align(slot);
+
+	n_slots = page_dir_get_n_slots(page);
+	/* Slot 0 has the highest address, slot n_slots - 1 the lowest. */
+	ut_a(slot <= page_dir_get_nth_slot(page, 0));
+	ut_a(slot >= page_dir_get_nth_slot(page, n_slots - 1));
+
+	ut_a(page_rec_check(page_dir_slot_get_rec(slot)));
+
+	if (page_is_comp(page)) {
+		n_owned = rec_get_n_owned_new(page_dir_slot_get_rec(slot));
+	} else {
+		n_owned = rec_get_n_owned_old(page_dir_slot_get_rec(slot));
+	}
+	/* First slot owns exactly 1; last slot 1..MAX; others MIN..MAX. */
+	if (slot == page_dir_get_nth_slot(page, 0)) {
+		ut_a(n_owned == 1);
+	} else if (slot == page_dir_get_nth_slot(page, n_slots - 1)) {
+		ut_a(n_owned >= 1);
+		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+	} else {
+		ut_a(n_owned >= PAGE_DIR_SLOT_MIN_N_OWNED);
+		ut_a(n_owned <= PAGE_DIR_SLOT_MAX_N_OWNED);
+	}
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Sets the max trx id field (PAGE_MAX_TRX_ID) in the page header. */
+UNIV_INTERN
+void
+page_set_max_trx_id(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	mtr_t*		mtr)	/*!< in/out: mini-transaction, or NULL */
+{
+	page_t*		page		= buf_block_get_frame(block);
+#ifndef UNIV_HOTBACKUP
+	const ibool	is_hashed	= block->is_hashed;
+	/* If is_hashed, hold btr_search_latch in X mode across the write. */
+	if (is_hashed) {
+		rw_lock_x_lock(&btr_search_latch);
+	}
+
+	ut_ad(!mtr || mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+#endif /* !UNIV_HOTBACKUP */
+
+	/* It is not necessary to write this change to the redo log, as
+	during a database recovery we assume that the max trx id of every
+	page is the maximum trx id assigned before the crash. */
+	/* Paths: page_zip given; else mtr given (mlog write); else plain. */
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+		page_zip_write_header(page_zip,
+				      page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+				      8, mtr);
+#ifndef UNIV_HOTBACKUP
+	} else if (mtr) {
+		mlog_write_dulint(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
+				  trx_id, mtr);
+#endif /* !UNIV_HOTBACKUP */
+	} else {
+		mach_write_to_8(page + (PAGE_HEADER + PAGE_MAX_TRX_ID), trx_id);
+	}
+
+#ifndef UNIV_HOTBACKUP
+	if (is_hashed) {
+		rw_lock_x_unlock(&btr_search_latch);
+	}
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/************************************************************//**
+Reserves need bytes at the heap top of an index page and hands out a heap no.
+@return	start of the reserved buffer, or NULL if the page lacks the space */
+UNIV_INTERN
+byte*
+page_mem_alloc_heap(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page with enough
+				space available for inserting the record,
+				or NULL */
+	ulint		need,	/*!< in: total number of bytes needed */
+	ulint*		heap_no)/*!< out: this contains the heap number
+				of the allocated record
+				if allocation succeeds */
+{
+	byte*	top;
+	ulint	n_heap;
+
+	ut_ad(page && heap_no);
+
+	if (page_get_max_insert_size(page, 1) < need) {
+		/* Not enough room: the page and *heap_no stay untouched. */
+		return(NULL);
+	}
+
+	/* Carve the buffer off the current heap top and advance the top. */
+	top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+	page_header_set_ptr(page, page_zip, PAGE_HEAP_TOP, top + need);
+
+	/* The buffer gets the current heap count as its heap number. */
+	n_heap = page_dir_get_n_heap(page);
+	*heap_no = n_heap;
+	page_dir_set_n_heap(page, page_zip, 1 + n_heap);
+
+	return(top);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Writes the page creation log record; its type is chosen by comp. */
+UNIV_INLINE
+void
+page_create_write_log(
+/*==================*/
+	buf_frame_t*	frame,	/*!< in: a buffer frame where the page is
+				created */
+	mtr_t*		mtr,	/*!< in: mini-transaction handle */
+	ibool		comp)	/*!< in: TRUE=compact page format */
+{
+	const byte	type = comp ? MLOG_COMP_PAGE_CREATE : MLOG_PAGE_CREATE;
+
+	mlog_write_initial_log_record(frame, type, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_create_write_log(frame,mtr,comp) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a page creation redo log record and, given a block, applies it.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_create(
+/*==============*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr __attribute__((unused)), /*!< in: buffer end */
+	ulint		comp,	/*!< in: nonzero=compact page format */
+	buf_block_t*	block,	/*!< in: block or NULL */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	if (!block) {
+		return(ptr);
+	}
+
+	/* The record has no body after its initial part: ptr is unchanged. */
+	page_create(block, mtr, comp);
+	return(ptr);
+}
+
+/**********************************************************//**
+Initializes block's frame as an empty index page (infimum and supremum only).
+@return	pointer to the page */
+static
+page_t*
+page_create_low(
+/*============*/
+	buf_block_t*	block,		/*!< in: a buffer block where the
+					page is created */
+	ulint		comp)		/*!< in: nonzero=compact page format */
+{
+	page_dir_slot_t* slot;
+	mem_heap_t*	heap;
+	dtuple_t*	tuple;
+	dfield_t*	field;
+	byte*		heap_top;
+	rec_t*		infimum_rec;
+	rec_t*		supremum_rec;
+	page_t*		page;
+	dict_index_t*	index;
+	ulint*		offsets;
+
+	ut_ad(block);
+#if PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST + FLST_BASE_NODE_SIZE > PAGE_DATA"
+#endif
+#if PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA
+# error "PAGE_BTR_IBUF_FREE_LIST_NODE + FLST_NODE_SIZE > PAGE_DATA"
+#endif
+
+	/* The infimum and supremum records use a dummy index. */
+	if (UNIV_LIKELY(comp)) {
+		index = dict_ind_compact;
+	} else {
+		index = dict_ind_redundant;
+	}
+
+	/* 1. INCREMENT MODIFY CLOCK */
+	buf_block_modify_clock_inc(block);
+
+	page = buf_block_get_frame(block);
+	/* Mark the frame as an index page */
+	fil_page_set_type(page, FIL_PAGE_INDEX);
+	/* Scratch heap for the tuples and offsets; freed before step 4 */
+	heap = mem_heap_create(200);
+
+	/* 3. CREATE THE INFIMUM AND SUPREMUM RECORDS */
+
+	/* Create first a data tuple for infimum record */
+	tuple = dtuple_create(heap, 1);
+	dtuple_set_info_bits(tuple, REC_STATUS_INFIMUM);
+	field = dtuple_get_nth_field(tuple, 0);
+	/* "infimum" is 7 characters; length 8 includes the terminating NUL */
+	dfield_set_data(field, "infimum", 8);
+	dtype_set(dfield_get_type(field),
+		  DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, 8);
+	/* Set the corresponding physical record to its place in the page
+	record heap */
+
+	heap_top = page + PAGE_DATA;
+
+	infimum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+	if (UNIV_LIKELY(comp)) {
+		ut_a(infimum_rec == page + PAGE_NEW_INFIMUM);
+
+		rec_set_n_owned_new(infimum_rec, NULL, 1);
+		rec_set_heap_no_new(infimum_rec, 0);
+	} else {
+		ut_a(infimum_rec == page + PAGE_OLD_INFIMUM);
+
+		rec_set_n_owned_old(infimum_rec, 1);
+		rec_set_heap_no_old(infimum_rec, 0);
+	}
+
+	offsets = rec_get_offsets(infimum_rec, index, NULL,
+				  ULINT_UNDEFINED, &heap);
+
+	heap_top = rec_get_end(infimum_rec, offsets);
+
+	/* Create then a tuple for supremum */
+
+	tuple = dtuple_create(heap, 1);
+	dtuple_set_info_bits(tuple, REC_STATUS_SUPREMUM);
+	field = dtuple_get_nth_field(tuple, 0);
+	/* "supremum" is 8 characters; only the old format stores the NUL */
+	dfield_set_data(field, "supremum", comp ? 8 : 9);
+	dtype_set(dfield_get_type(field),
+		  DATA_VARCHAR, DATA_ENGLISH | DATA_NOT_NULL, comp ? 8 : 9);
+
+	supremum_rec = rec_convert_dtuple_to_rec(heap_top, index, tuple, 0);
+
+	if (UNIV_LIKELY(comp)) {
+		ut_a(supremum_rec == page + PAGE_NEW_SUPREMUM);
+
+		rec_set_n_owned_new(supremum_rec, NULL, 1);
+		rec_set_heap_no_new(supremum_rec, 1);
+	} else {
+		ut_a(supremum_rec == page + PAGE_OLD_SUPREMUM);
+
+		rec_set_n_owned_old(supremum_rec, 1);
+		rec_set_heap_no_old(supremum_rec, 1);
+	}
+
+	offsets = rec_get_offsets(supremum_rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+	heap_top = rec_get_end(supremum_rec, offsets);
+
+	ut_ad(heap_top == page
+	      + (comp ? PAGE_NEW_SUPREMUM_END : PAGE_OLD_SUPREMUM_END));
+
+	mem_heap_free(heap);
+
+	/* 4. INITIALIZE THE PAGE */
+
+	page_header_set_field(page, NULL, PAGE_N_DIR_SLOTS, 2);
+	page_header_set_ptr(page, NULL, PAGE_HEAP_TOP, heap_top);
+	page_header_set_field(page, NULL, PAGE_N_HEAP, comp
+			      ? 0x8000 | PAGE_HEAP_NO_USER_LOW /* bit 15 iff comp */
+			      : PAGE_HEAP_NO_USER_LOW);
+	page_header_set_ptr(page, NULL, PAGE_FREE, NULL);
+	page_header_set_field(page, NULL, PAGE_GARBAGE, 0);
+	page_header_set_ptr(page, NULL, PAGE_LAST_INSERT, NULL);
+	page_header_set_field(page, NULL, PAGE_DIRECTION, PAGE_NO_DIRECTION);
+	page_header_set_field(page, NULL, PAGE_N_DIRECTION, 0);
+	page_header_set_field(page, NULL, PAGE_N_RECS, 0);
+	page_set_max_trx_id(block, NULL, ut_dulint_zero, NULL);
+	memset(heap_top, 0, UNIV_PAGE_SIZE - PAGE_EMPTY_DIR_START
+	       - page_offset(heap_top)); /* zero from heap top upwards */
+
+	/* 5. SET POINTERS IN RECORDS AND DIR SLOTS */
+
+	/* Set the slots to point to infimum and supremum. */
+
+	slot = page_dir_get_nth_slot(page, 0);
+	page_dir_slot_set_rec(slot, infimum_rec);
+
+	slot = page_dir_get_nth_slot(page, 1);
+	page_dir_slot_set_rec(slot, supremum_rec);
+
+	/* Set the next pointers in infimum and supremum */
+
+	if (UNIV_LIKELY(comp)) {
+		rec_set_next_offs_new(infimum_rec, PAGE_NEW_SUPREMUM);
+		rec_set_next_offs_new(supremum_rec, 0);
+	} else {
+		rec_set_next_offs_old(infimum_rec, PAGE_OLD_SUPREMUM);
+		rec_set_next_offs_old(supremum_rec, 0);
+	}
+
+	return(page);
+}
+
+/**********************************************************//**
+Create an uncompressed B-tree index page (log record, then page_create_low).
+@return	pointer to the page */
+UNIV_INTERN
+page_t*
+page_create(
+/*========*/
+	buf_block_t*	block,		/*!< in: a buffer block where the
+					page is created */
+	mtr_t*		mtr,		/*!< in: mini-transaction handle */
+	ulint		comp)		/*!< in: nonzero=compact page format */
+{
+	page_create_write_log(buf_block_get_frame(block), mtr, comp);
+	return(page_create_low(block, comp));
+}
+
+/**********************************************************//**
+Create a compressed B-tree index page (always in the compact format).
+@return	pointer to the page */
+UNIV_INTERN
+page_t*
+page_create_zip(
+/*============*/
+	buf_block_t*	block,		/*!< in/out: a buffer frame where the
+					page is created */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint		level,		/*!< in: the B-tree level of the page */
+	mtr_t*		mtr)		/*!< in: mini-transaction handle */
+{
+	page_t*		page;
+	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
+
+	ut_ad(block);
+	ut_ad(page_zip);
+	ut_ad(index);
+	ut_ad(dict_table_is_comp(index->table));
+	/* Build the uncompressed image first, then stamp the B-tree level. */
+	page = page_create_low(block, TRUE);
+	mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level);
+
+	if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+		/* The compression of a newly created page
+		should always succeed. */
+		ut_error;
+	}
+
+	return(page);
+}
+
+/*************************************************************//**
+Copies the records from rec onward to just after the infimum of new_page;
+unlike page_copy_rec_list_end: no lock table, max trx id or compression. */
+UNIV_INTERN
+void
+page_copy_rec_list_end_no_locks(
+/*============================*/
+	buf_block_t*	new_block,	/*!< in: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page of rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*		new_page	= buf_block_get_frame(new_block);
+	page_cur_t	cur1;
+	rec_t*		cur2;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	page_cur_position(rec, block, &cur1);
+
+	if (page_cur_is_before_first(&cur1)) {
+
+		page_cur_move_to_next(&cur1);
+	}
+	/* Sanity checks: the record formats of both pages and index agree. */
+	ut_a((ibool)!!page_is_comp(new_page)
+	     == dict_table_is_comp(index->table));
+	ut_a(page_is_comp(new_page) == page_rec_is_comp(rec));
+	ut_a(mach_read_from_2(new_page + UNIV_PAGE_SIZE - 10) == (ulint)
+	     (page_is_comp(new_page) ? PAGE_NEW_INFIMUM : PAGE_OLD_INFIMUM));
+	/* NOTE(review): UNIV_PAGE_SIZE - 10 is presumably dir slot 0; confirm */
+	cur2 = page_get_infimum_rec(buf_block_get_frame(new_block));
+
+	/* Copy records from the original page to the new page */
+
+	while (!page_cur_is_after_last(&cur1)) {
+		rec_t*	cur1_rec = page_cur_get_rec(&cur1);
+		rec_t*	ins_rec;
+		offsets = rec_get_offsets(cur1_rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		ins_rec = page_cur_insert_rec_low(cur2, index,
+						  cur1_rec, offsets, mtr);
+		if (UNIV_UNLIKELY(!ins_rec)) {
+			/* Track an assertion failure reported on the mailing
+			list on June 18th, 2003 */
+
+			buf_page_print(new_page, 0);
+			buf_page_print(page_align(rec), 0);
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"InnoDB: rec offset %lu, cur1 offset %lu,"
+				" cur2 offset %lu\n",
+				(ulong) page_offset(rec),
+				(ulong) page_offset(page_cur_get_rec(&cur1)),
+				(ulong) page_offset(cur2));
+			ut_error;
+		}
+
+		page_cur_move_to_next(&cur1);
+		cur2 = ins_rec;	/* the next record goes right after this one */
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Copies records from page to new_page, from a given record onward,
+including that record. Infimum and supremum records are not copied.
+The records are copied to the start of the record list on new_page.
+@return pointer to the original successor of the infimum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*		new_page	= buf_block_get_frame(new_block);
+	page_zip_des_t*	new_page_zip	= buf_block_get_page_zip(new_block);
+	page_t*		page		= page_align(rec);
+	rec_t*		ret		= page_rec_get_next(
+		page_get_infimum_rec(new_page));
+	ulint		log_mode	= 0; /* remove warning */
+
+#ifdef UNIV_ZIP_DEBUG
+	if (new_page_zip) {
+		page_zip_des_t*	page_zip = buf_block_get_page_zip(block);
+		ut_a(page_zip);
+
+		/* Strict page_zip_validate() may fail here.
+		Furthermore, btr_compress() may set FIL_PAGE_PREV to
+		FIL_NULL on new_page while leaving it intact on
+		new_page_zip.  So, we cannot validate new_page_zip. */
+		ut_a(page_zip_validate_low(page_zip, page, TRUE));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+	ut_ad(buf_block_get_frame(block) == page);
+	ut_ad(page_is_leaf(page) == page_is_leaf(new_page));
+	ut_ad(page_is_comp(page) == page_is_comp(new_page));
+	/* Here, "ret" may be pointing to a user record or the
+	predefined supremum record. */
+	/* Compressed target: switch the mtr to MTR_LOG_NONE while copying. */
+	if (UNIV_LIKELY_NULL(new_page_zip)) {
+		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+	}
+	/* n_heap at PAGE_HEAP_NO_USER_LOW: use the created-page variant. */
+	if (page_dir_get_n_heap(new_page) == PAGE_HEAP_NO_USER_LOW) {
+		page_copy_rec_list_end_to_created_page(new_page, rec,
+						       index, mtr);
+	} else {
+		page_copy_rec_list_end_no_locks(new_block, block, rec,
+						index, mtr);
+	}
+
+	/* Update PAGE_MAX_TRX_ID on the uncompressed page.
+	Modifications will be redo logged and copied to the compressed
+	page in page_zip_compress() or page_zip_reorganize() below. */
+	if (dict_index_is_sec_or_ibuf(index) && page_is_leaf(page)) {
+		page_update_max_trx_id(new_block, NULL,
+				       page_get_max_trx_id(page), mtr);
+	}
+	/* Restore the log mode, then bring the compressed copy up to date. */
+	if (UNIV_LIKELY_NULL(new_page_zip)) {
+		mtr_set_log_mode(mtr, log_mode);
+
+		if (UNIV_UNLIKELY
+		    (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+			/* Before trying to reorganize the page,
+			store the number of preceding records on the page. */
+			ulint	ret_pos
+				= page_rec_get_n_recs_before(ret);
+			/* Before copying, "ret" was the successor of
+			the predefined infimum record.  It must still
+			have at least one predecessor (the predefined
+			infimum record, or a freshly copied record
+			that is smaller than "ret"). */
+			ut_a(ret_pos > 0);
+
+			if (UNIV_UNLIKELY
+			    (!page_zip_reorganize(new_block, index, mtr))) {
+
+				if (UNIV_UNLIKELY
+				    (!page_zip_decompress(new_page_zip,
+							  new_page, FALSE))) {
+					ut_error;
+				}
+				ut_ad(page_validate(new_page, index));
+				return(NULL);
+			} else {
+				/* The page was reorganized:
+				Seek to ret_pos. */
+				ret = new_page + PAGE_NEW_INFIMUM;
+
+				do {
+					ret = rec_get_next_ptr(ret, TRUE);
+				} while (--ret_pos);
+			}
+		}
+	}
+
+	/* Update the lock table and possible hash index */
+
+	lock_move_rec_list_end(new_block, block, rec);
+
+	btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+	return(ret);
+}
+
+/*************************************************************//**
+Copies records from page to new_page, up to the given record,
+NOT including that record. Infimum and supremum records are not copied.
+The records are appended to the record list on new_page (possibly empty).
+@return pointer to the original predecessor of the supremum record on
+new_page, or NULL on zip overflow (new_block will be decompressed) */
+UNIV_INTERN
+rec_t*
+page_copy_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page to copy to */
+	buf_block_t*	block,		/*!< in: index page containing rec */
+	rec_t*		rec,		/*!< in: record on page */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*		new_page	= buf_block_get_frame(new_block);
+	page_zip_des_t*	new_page_zip	= buf_block_get_page_zip(new_block);
+	page_cur_t	cur1;
+	rec_t*		cur2;
+	ulint		log_mode	= 0 /* remove warning */;
+	mem_heap_t*	heap		= NULL;
+	rec_t*		ret
+		= page_rec_get_prev(page_get_supremum_rec(new_page));
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	/* Here, "ret" may be pointing to a user record or the
+	predefined infimum record. */
+
+	if (page_rec_is_infimum(rec)) {
+		/* Nothing precedes the infimum: there is nothing to copy. */
+		return(ret);
+	}
+
+	if (UNIV_LIKELY_NULL(new_page_zip)) {
+		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+	}
+	/* Start at the first user record of block; append after "ret". */
+	page_cur_set_before_first(block, &cur1);
+	page_cur_move_to_next(&cur1);
+
+	cur2 = ret;
+
+	/* Copy records from the original page to the new page */
+
+	while (page_cur_get_rec(&cur1) != rec) {
+		rec_t*	cur1_rec = page_cur_get_rec(&cur1);
+		offsets = rec_get_offsets(cur1_rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		cur2 = page_cur_insert_rec_low(cur2, index,
+					       cur1_rec, offsets, mtr);
+		ut_a(cur2);
+
+		page_cur_move_to_next(&cur1);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	/* Update PAGE_MAX_TRX_ID on the uncompressed page.
+	Modifications will be redo logged and copied to the compressed
+	page in page_zip_compress() or page_zip_reorganize() below. */
+	if (dict_index_is_sec_or_ibuf(index)
+	    && page_is_leaf(page_align(rec))) {
+		page_update_max_trx_id(new_block, NULL,
+				       page_get_max_trx_id(page_align(rec)),
+				       mtr);
+	}
+
+	if (UNIV_LIKELY_NULL(new_page_zip)) {
+		mtr_set_log_mode(mtr, log_mode);
+
+		if (UNIV_UNLIKELY
+		    (!page_zip_compress(new_page_zip, new_page, index, mtr))) {
+			/* Before trying to reorganize the page,
+			store the number of preceding records on the page. */
+			ulint	ret_pos
+				= page_rec_get_n_recs_before(ret);
+			/* Before copying, "ret" was the predecessor
+			of the predefined supremum record.  If it was
+			the predefined infimum record, then it is
+			still the infimum and ret_pos == 0, so the
+			seek below must be able to take zero steps. */
+
+			if (UNIV_UNLIKELY
+			    (!page_zip_reorganize(new_block, index, mtr))) {
+
+				if (UNIV_UNLIKELY
+				    (!page_zip_decompress(new_page_zip,
+							  new_page, FALSE))) {
+					ut_error;
+				}
+				ut_ad(page_validate(new_page, index));
+				return(NULL);
+			} else {
+				/* The page was reorganized: seek to ret_pos,
+				staying on the infimum if ret_pos == 0. */
+				ret = new_page + PAGE_NEW_INFIMUM;
+
+				for (; ret_pos > 0; ret_pos--) {
+					ret = rec_get_next_ptr(ret, TRUE);
+				}
+			}
+		}
+	}
+
+	/* Update the lock table and possible hash index */
+
+	lock_move_rec_list_start(new_block, block, rec, ret);
+
+	btr_search_move_or_delete_hash_entries(new_block, block, index);
+
+	return(ret);
+}
+
+/**********************************************************//**
+Logs the deletion of a record list end or start as the page offset of rec. */
+UNIV_INLINE
+void
+page_delete_rec_list_write_log(
+/*===========================*/
+	rec_t*		rec,	/*!< in: record on page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	byte		type,	/*!< in: operation type:
+				MLOG_LIST_END_DELETE, ... */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	byte*	log_ptr;
+	ut_ad(type == MLOG_LIST_END_DELETE
+	      || type == MLOG_LIST_START_DELETE
+	      || type == MLOG_COMP_LIST_END_DELETE
+	      || type == MLOG_COMP_LIST_START_DELETE);
+
+	log_ptr = mlog_open_and_write_index(mtr, rec, index, type, 2);
+	if (!log_ptr) {
+		return;	/* no log buffer was handed out: nothing to write */
+	}
+	mach_write_to_2(log_ptr, page_offset(rec));	/* 2-byte parameter */
+	mlog_close(mtr, log_ptr + 2);
+}
+#else /* !UNIV_HOTBACKUP */
+# define page_delete_rec_list_write_log(rec,index,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Parses a record list end or start deletion log record; applies it if block.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_parse_delete_rec_list(
+/*=======================*/
+	byte		type,	/*!< in: MLOG_LIST_END_DELETE,
+				MLOG_LIST_START_DELETE,
+				MLOG_COMP_LIST_END_DELETE or
+				MLOG_COMP_LIST_START_DELETE */
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	buf_block_t*	block,	/*!< in/out: buffer block or NULL */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr or NULL */
+{
+	page_t*	page;
+	rec_t*	rec;
+	ulint	rec_offs;
+
+	ut_ad(type == MLOG_LIST_END_DELETE
+	      || type == MLOG_LIST_START_DELETE
+	      || type == MLOG_COMP_LIST_END_DELETE
+	      || type == MLOG_COMP_LIST_START_DELETE);
+
+	/* The record body is a single 2-byte page offset of the record. */
+	if (end_ptr < ptr + 2) {
+		/* The buffer ends before the offset field. */
+		return(NULL);
+	}
+
+	rec_offs = mach_read_from_2(ptr);
+	ptr += 2;
+
+	if (!block) {
+		/* No page to apply the record to: parsing only. */
+		return(ptr);
+	}
+
+	page = buf_block_get_frame(block);
+	rec = page + rec_offs;
+
+	ut_ad(!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	if (type != MLOG_LIST_END_DELETE
+	    && type != MLOG_COMP_LIST_END_DELETE) {
+		page_delete_rec_list_start(rec, block, index, mtr);
+	} else {
+		page_delete_rec_list_end(rec, block, index,
+					 ULINT_UNDEFINED, ULINT_UNDEFINED, mtr);
+	}
+
+	return(ptr);
+}
+
+/*************************************************************//**
+Deletes records from a page from a given record onward, including that record.
+The infimum and supremum records are not deleted. */
+UNIV_INTERN
+void
+page_delete_rec_list_end(
+/*=====================*/
+	rec_t*		rec,	/*!< in: pointer to record on page */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n_recs,	/*!< in: number of records to delete,
+				or ULINT_UNDEFINED if not known */
+	ulint		size,	/*!< in: the sum of the sizes of the
+				records in the end of the chain to
+				delete, or ULINT_UNDEFINED if not known */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_dir_slot_t*slot;
+	ulint		slot_index;
+	rec_t*		last_rec;
+	rec_t*		prev_rec;
+	ulint		n_owned;
+	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
+	page_t*		page		= page_align(rec);
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(size == ULINT_UNDEFINED || size < UNIV_PAGE_SIZE);
+	ut_ad(!page_zip || page_rec_is_comp(rec));
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(!page_zip || page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	/* The infimum itself is never deleted: start from its successor. */
+	if (page_rec_is_infimum(rec)) {
+		rec = page_rec_get_next(rec);
+	}
+
+	if (page_rec_is_supremum(rec)) {
+		/* No user record at or after rec: nothing to delete. */
+		return;
+	}
+
+	/* Reset the last insert info in the page header and increment
+	the modify clock for the frame */
+
+	page_header_set_ptr(page, page_zip, PAGE_LAST_INSERT, NULL);
+
+	/* The page gets invalid for optimistic searches: increment the
+	frame modify clock */
+
+	buf_block_modify_clock_inc(block);
+
+	page_delete_rec_list_write_log(rec, index, page_is_comp(page)
+				       ? MLOG_COMP_LIST_END_DELETE
+				       : MLOG_LIST_END_DELETE, mtr);
+	/* Compressed page: delete the records one by one, then return. */
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		ulint		log_mode;
+
+		ut_a(page_is_comp(page));
+		/* Individual deletes are not logged */
+
+		log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+		do {
+			page_cur_t	cur;
+			page_cur_position(rec, block, &cur);
+
+			offsets = rec_get_offsets(rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			rec = rec_get_next_ptr(rec, TRUE);
+#ifdef UNIV_ZIP_DEBUG
+			ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+			page_cur_delete_rec(&cur, index, offsets, mtr);
+		} while (page_offset(rec) != PAGE_NEW_SUPREMUM);
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+
+		/* Restore log mode */
+
+		mtr_set_log_mode(mtr, log_mode);
+		return;
+	}
+	/* Uncompressed page: unlink the whole tail of the list in one go. */
+	prev_rec = page_rec_get_prev(rec);
+
+	last_rec = page_rec_get_prev(page_get_supremum_rec(page));
+
+	if ((size == ULINT_UNDEFINED) || (n_recs == ULINT_UNDEFINED)) {
+		rec_t*		rec2		= rec;
+		/* Calculate the sum of sizes and the number of records */
+		size = 0;
+		n_recs = 0;
+
+		do {
+			ulint	s;
+			offsets = rec_get_offsets(rec2, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			s = rec_offs_size(offsets);
+			ut_ad(rec2 - page + s - rec_offs_extra_size(offsets)
+			      < UNIV_PAGE_SIZE);
+			ut_ad(size + s < UNIV_PAGE_SIZE);
+			size += s;
+			n_recs++;
+
+			rec2 = page_rec_get_next(rec2);
+		} while (!page_rec_is_supremum(rec2));
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+
+	ut_ad(size < UNIV_PAGE_SIZE);
+
+	/* Update the page directory; there is no need to balance the number
+	of the records owned by the supremum record, as it is allowed to be
+	less than PAGE_DIR_SLOT_MIN_N_OWNED */
+
+	if (page_is_comp(page)) {
+		rec_t*	rec2	= rec;
+		ulint	count	= 0;
+
+		while (rec_get_n_owned_new(rec2) == 0) {
+			count++;
+
+			rec2 = rec_get_next_ptr(rec2, TRUE);
+		}
+
+		ut_ad(rec_get_n_owned_new(rec2) > count);
+		/* count + 1 records leave the slot, supremum joins: net -count */
+		n_owned = rec_get_n_owned_new(rec2) - count;
+		slot_index = page_dir_find_owner_slot(rec2);
+		slot = page_dir_get_nth_slot(page, slot_index);
+	} else {
+		rec_t*	rec2	= rec;
+		ulint	count	= 0;
+
+		while (rec_get_n_owned_old(rec2) == 0) {
+			count++;
+
+			rec2 = rec_get_next_ptr(rec2, FALSE);
+		}
+
+		ut_ad(rec_get_n_owned_old(rec2) > count);
+		/* count + 1 records leave the slot, supremum joins: net -count */
+		n_owned = rec_get_n_owned_old(rec2) - count;
+		slot_index = page_dir_find_owner_slot(rec2);
+		slot = page_dir_get_nth_slot(page, slot_index);
+	}
+	/* The owner slot now points to supremum and becomes the last slot. */
+	page_dir_slot_set_rec(slot, page_get_supremum_rec(page));
+	page_dir_slot_set_n_owned(slot, NULL, n_owned);
+
+	page_dir_set_n_slots(page, NULL, slot_index + 1);
+
+	/* Remove the record chain segment from the record chain */
+	page_rec_set_next(prev_rec, page_get_supremum_rec(page));
+
+	/* Catenate the deleted chain segment to the page free list */
+
+	page_rec_set_next(last_rec, page_header_get_ptr(page, PAGE_FREE));
+	page_header_set_ptr(page, NULL, PAGE_FREE, rec);
+	/* Account for the freed bytes and the removed records in the header. */
+	page_header_set_field(page, NULL, PAGE_GARBAGE, size
+			      + page_header_get_field(page, PAGE_GARBAGE));
+
+	page_header_set_field(page, NULL, PAGE_N_RECS,
+			      (ulint)(page_get_n_recs(page) - n_recs));
+}
+
+/*************************************************************//**
+Removes from a page all records that lie between the infimum and the
+given record. The given record itself is kept, and the infimum and
+supremum records are never removed. One list-start-delete log record
+is written for the whole operation; logging is switched off while the
+individual records are deleted and the previous log mode is restored
+afterwards. */
+UNIV_INTERN
+void
+page_delete_rec_list_start(
+/*=======================*/
+	rec_t*		rec,	/*!< in: record on page; deletion stops
+				just before it */
+	buf_block_t*	block,	/*!< in: buffer block of the page */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_cur_t	cursor;
+	rec_t*		victim;
+	ulint		saved_log_mode;
+	ulint		offsets_buf[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_buf;
+	mem_heap_t*	heap		= NULL;
+	byte		type;
+
+	rec_offs_init(offsets_buf);
+
+	ut_ad((ibool) !!page_rec_is_comp(rec)
+	      == dict_table_is_comp(index->table));
+#ifdef UNIV_ZIP_DEBUG
+	{
+		page_zip_des_t*	page_zip= buf_block_get_page_zip(block);
+		page_t*		page	= buf_block_get_frame(block);
+
+		/* The relaxed page_zip_validate_low() is used because
+		page_zip_validate() would detect a min_rec_mark mismatch
+		in btr_page_split_and_insert()
+		between btr_attach_half_pages() and insert_page = ...
+		when btr_page_get_split_rec_to_left() holds
+		(direction == FSP_DOWN). */
+		ut_a(!page_zip || page_zip_validate_low(page_zip, page, TRUE));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (page_rec_is_infimum(rec)) {
+		/* Nothing precedes the infimum record */
+		return;
+	}
+
+	type = page_rec_is_comp(rec)
+		? MLOG_COMP_LIST_START_DELETE
+		: MLOG_LIST_START_DELETE;
+
+	page_delete_rec_list_write_log(rec, index, type, mtr);
+
+	/* Position the cursor on the first record after the infimum */
+	page_cur_set_before_first(block, &cursor);
+	page_cur_move_to_next(&cursor);
+
+	/* Individual deletes are not logged */
+	saved_log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+	for (;;) {
+		victim = page_cur_get_rec(&cursor);
+
+		if (victim == rec) {
+			break;
+		}
+
+		offsets = rec_get_offsets(victim, index,
+					  offsets, ULINT_UNDEFINED, &heap);
+		page_cur_delete_rec(&cursor, index, offsets, mtr);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	/* Restore log mode */
+	mtr_set_log_mode(mtr, saved_log_mode);
+}
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Moves the end of a record list, starting from and including split_rec,
+to another page: the records are first copied to new_block and then
+deleted from block. The number and total size of the moved records are
+derived from how much the destination page grew during the copy.
+@return TRUE on success; FALSE on compression failure (new_block will
+be decompressed) */
+UNIV_INTERN
+ibool
+page_move_rec_list_end(
+/*===================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in: index page from where to move */
+	rec_t*		split_rec,	/*!< in: first record to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	page_t*	dest_page	= buf_block_get_frame(new_block);
+	ulint	size_before	= page_get_data_size(dest_page);
+	ulint	recs_before	= page_get_n_recs(dest_page);
+	ulint	size_after;
+	ulint	recs_after;
+
+#ifdef UNIV_ZIP_DEBUG
+	{
+		page_zip_des_t*	dest_zip
+			= buf_block_get_page_zip(new_block);
+		page_zip_des_t*	src_zip
+			= buf_block_get_page_zip(block);
+
+		/* Either both pages are compressed or neither is */
+		ut_a(!dest_zip == !src_zip);
+		ut_a(!dest_zip
+		     || page_zip_validate(dest_zip, dest_page));
+		ut_a(!src_zip
+		     || page_zip_validate(src_zip, page_align(split_rec)));
+	}
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (UNIV_UNLIKELY(!page_copy_rec_list_end(new_block, block,
+						  split_rec, index, mtr))) {
+		return(FALSE);
+	}
+
+	size_after = page_get_data_size(dest_page);
+	recs_after = page_get_n_recs(dest_page);
+
+	ut_ad(size_after >= size_before);
+
+	/* What the destination gained is what the source must lose */
+	page_delete_rec_list_end(split_rec, block, index,
+				 recs_after - recs_before,
+				 size_after - size_before, mtr);
+
+	return(TRUE);
+}
+
+/*************************************************************//**
+Moves the start of a record list, up to but not including split_rec,
+to another page: the records are copied to new_block and, if the copy
+succeeded, deleted from block.
+@return	TRUE on success; FALSE on compression failure */
+UNIV_INTERN
+ibool
+page_move_rec_list_start(
+/*=====================*/
+	buf_block_t*	new_block,	/*!< in/out: index page where to move */
+	buf_block_t*	block,		/*!< in/out: page containing split_rec */
+	rec_t*		split_rec,	/*!< in: first record not to move */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ibool	success	= TRUE;
+
+	if (UNIV_UNLIKELY(!page_copy_rec_list_start(new_block, block,
+						    split_rec, index, mtr))) {
+		/* Nothing is deleted from block when the copy fails */
+		success = FALSE;
+	} else {
+		page_delete_rec_list_start(split_rec, block, index, mtr);
+	}
+
+	return(success);
+}
+
+/***********************************************************************//**
+Low-level operation used in database index creation: writes the page
+number of a created B-tree into the i'th field of a data dictionary
+record. The field is accessed in the old-style record format and is
+expected to be 4 bytes long; the write is done with mlog_write_ulint(). */
+UNIV_INTERN
+void
+page_rec_write_index_page_no(
+/*=========================*/
+	rec_t*	rec,	/*!< in: record to update */
+	ulint	i,	/*!< in: index of the field to update */
+	ulint	page_no,/*!< in: value to write */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	ulint	field_len;
+	byte*	field;
+
+	field = rec_get_nth_field_old(rec, i, &field_len);
+
+	ut_ad(field_len == 4);
+
+	mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**************************************************************//**
+Deletes one slot from the page directory. The n_owned count of the
+deleted slot is added to that of the next slot (slot_no + 1), so that
+slot inherits the records of the deleted one. The slots from
+slot_no + 1 onwards are then copied one position down, the vacated
+last slot is zeroed, and the slot count in the page header is
+decremented. */
+UNIV_INLINE
+void
+page_dir_delete_slot(
+/*=================*/
+	page_t*		page,	/*!< in/out: the index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		slot_no)/*!< in: slot to be deleted */
+{
+	page_dir_slot_t*	victim;
+	page_dir_slot_t*	heir;
+	ulint			n_slots;
+	ulint			inherited;
+	ulint			dst;
+
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+	ut_ad(slot_no + 1 < page_dir_get_n_slots(page));
+
+	n_slots = page_dir_get_n_slots(page);
+
+	/* 1. Take the n_owned count away from the slot to be deleted */
+	victim = page_dir_get_nth_slot(page, slot_no);
+	inherited = page_dir_slot_get_n_owned(victim);
+	page_dir_slot_set_n_owned(victim, page_zip, 0);
+
+	/* 2. Hand that count over to the first slot that survives */
+	heir = page_dir_get_nth_slot(page, slot_no + 1);
+	page_dir_slot_set_n_owned(heir, page_zip,
+				  inherited
+				  + page_dir_slot_get_n_owned(heir));
+
+	/* 3. Close the gap: slot dst takes the record of slot dst + 1 */
+	for (dst = slot_no; dst + 1 < n_slots; dst++) {
+		page_dir_slot_t*	src_slot
+			= page_dir_get_nth_slot(page, dst + 1);
+
+		page_dir_slot_set_rec(page_dir_get_nth_slot(page, dst),
+				      (rec_t*)
+				      page_dir_slot_get_rec(src_slot));
+	}
+
+	/* 4. Zero out the last slot, which will be removed */
+	mach_write_to_2(page_dir_get_nth_slot(page, n_slots - 1), 0);
+
+	/* 5. Update the page header */
+	page_header_set_field(page, page_zip, PAGE_N_DIR_SLOTS, n_slots - 1);
+}
+
+/**************************************************************//**
+Adds one slot to the page directory above the slot numbered start.
+The slot count in the page header is incremented and the directory
+entries of the slots above start are shifted by one slot position.
+The record pointer of the added slot and all n_owned values are left
+untouched: setting them is the responsibility of the caller. */
+UNIV_INLINE
+void
+page_dir_add_slot(
+/*==============*/
+	page_t*		page,	/*!< in/out: the index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		start)	/*!< in: the slot above which the new slot
+				is added */
+{
+	page_dir_slot_t*	new_top;
+	ulint			old_n_slots;
+	ulint			n_bytes;
+
+	old_n_slots = page_dir_get_n_slots(page);
+
+	ut_ad(start < old_n_slots - 1);
+
+	/* The header is updated first, so that slot number old_n_slots
+	exists when it is looked up below */
+	page_dir_set_n_slots(page, page_zip, old_n_slots + 1);
+
+	/* Shift the entries of slots start + 1 .. old_n_slots - 1 */
+	new_top = page_dir_get_nth_slot(page, old_n_slots);
+	n_bytes = (old_n_slots - 1 - start) * PAGE_DIR_SLOT_SIZE;
+
+	memmove(new_top, new_top + PAGE_DIR_SLOT_SIZE, n_bytes);
+}
+
+/****************************************************************//**
+Splits a directory slot which owns too many records. A new slot is
+inserted immediately below the given one and receives the lower half
+(n_owned / 2) of the records; the original slot, which becomes number
+slot_no + 1, keeps the remaining records. */
+UNIV_INTERN
+void
+page_dir_split_slot(
+/*================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose
+				uncompressed part will be written, or NULL */
+	ulint		slot_no)/*!< in: the directory slot */
+{
+	page_dir_slot_t*	lower_slot;
+	page_dir_slot_t*	upper_slot;
+	rec_t*			mid_rec;
+	ulint			n_owned;
+	ulint			half;
+	ulint			steps;
+
+	ut_ad(page);
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+
+	n_owned = page_dir_slot_get_n_owned(
+		page_dir_get_nth_slot(page, slot_no));
+	ut_ad(n_owned == PAGE_DIR_SLOT_MAX_N_OWNED + 1);
+
+	half = n_owned / 2;
+
+	/* 1. Starting from the record of the previous slot, step forward
+	to a record approximately in the middle of the records owned by
+	the slot being split. */
+
+	mid_rec = (rec_t*) page_dir_slot_get_rec(
+		page_dir_get_nth_slot(page, slot_no - 1));
+
+	for (steps = half; steps > 0; steps--) {
+		mid_rec = page_rec_get_next(mid_rec);
+	}
+
+	ut_ad(half >= PAGE_DIR_SLOT_MIN_N_OWNED);
+
+	/* 2. Add one directory slot immediately below the slot to be
+	split. */
+
+	page_dir_add_slot(page, page_zip, slot_no - 1);
+
+	/* The added slot is now number slot_no, and the old slot is
+	now number slot_no + 1 */
+
+	lower_slot = page_dir_get_nth_slot(page, slot_no);
+	upper_slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+	/* 3. The new slot points to the middle record and owns the
+	lower half. */
+
+	page_dir_slot_set_rec(lower_slot, mid_rec);
+	page_dir_slot_set_n_owned(lower_slot, page_zip, half);
+
+	/* 4. The original slot keeps the rest. */
+
+	page_dir_slot_set_n_owned(upper_slot, page_zip, n_owned - half);
+}
+
+/*************************************************************//**
+Repairs a directory slot that owns one record too few by using its
+upper neighbor: either one record is taken over from the upper slot,
+or, when the upper slot has no record to spare, the two slots are
+merged. Nothing is done for the last directory slot, which has no
+upper neighbor. */
+UNIV_INTERN
+void
+page_dir_balance_slot(
+/*==================*/
+	page_t*		page,	/*!< in/out: index page */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	ulint		slot_no)/*!< in: the directory slot */
+{
+	page_dir_slot_t*	low_slot;
+	page_dir_slot_t*	high_slot;
+	ulint			low_n_owned;
+	ulint			high_n_owned;
+	rec_t*			prev_owner;
+	rec_t*			next_owner;
+
+	ut_ad(page);
+	ut_ad(!page_zip || page_is_comp(page));
+	ut_ad(slot_no > 0);
+
+	low_slot = page_dir_get_nth_slot(page, slot_no);
+
+	/* The last directory slot has no upper neighbor to balance
+	with. */
+
+	if (UNIV_UNLIKELY(slot_no == page_dir_get_n_slots(page) - 1)) {
+
+		return;
+	}
+
+	high_slot = page_dir_get_nth_slot(page, slot_no + 1);
+
+	low_n_owned = page_dir_slot_get_n_owned(low_slot);
+	high_n_owned = page_dir_slot_get_n_owned(high_slot);
+
+	ut_ad(low_n_owned == PAGE_DIR_SLOT_MIN_N_OWNED - 1);
+
+	/* A merge happens when the upper slot owns the minimum number
+	of records, so the merged slot must not exceed the maximum: */
+	ut_ad(2 * PAGE_DIR_SLOT_MIN_N_OWNED - 1 <= PAGE_DIR_SLOT_MAX_N_OWNED);
+
+	if (high_n_owned <= PAGE_DIR_SLOT_MIN_N_OWNED) {
+		/* The upper slot cannot spare a record: merge the slots */
+		page_dir_delete_slot(page, page_zip, slot_no);
+
+		return;
+	}
+
+	/* Transfer one record from the upper slot to the lower slot:
+	the record following the current owner becomes the new owner
+	of the lower slot. */
+	prev_owner = (rec_t*) page_dir_slot_get_rec(low_slot);
+
+	if (page_is_comp(page)) {
+		next_owner = rec_get_next_ptr(prev_owner, TRUE);
+
+		rec_set_n_owned_new(prev_owner, page_zip, 0);
+		rec_set_n_owned_new(next_owner, page_zip, low_n_owned + 1);
+	} else {
+		next_owner = rec_get_next_ptr(prev_owner, FALSE);
+
+		rec_set_n_owned_old(prev_owner, 0);
+		rec_set_n_owned_old(next_owner, low_n_owned + 1);
+	}
+
+	page_dir_slot_set_rec(low_slot, next_owner);
+
+	page_dir_slot_set_n_owned(high_slot, page_zip, high_n_owned - 1);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Returns the middle record of the record list. If there are an even number
+of records in the list, returns the first record of the upper half-list.
+The page directory is used to skip whole slots before the remaining
+distance is walked record by record.
+@return	middle record */
+UNIV_INTERN
+rec_t*
+page_get_middle_rec(
+/*================*/
+	page_t*	page)	/*!< in: page */
+{
+	ulint	middle;
+	ulint	skipped	= 0;
+	ulint	slot_no	= 0;
+	ulint	n_owned;
+	ulint	left;
+	rec_t*	rec;
+
+	/* This many records we must leave behind */
+	middle = (page_get_n_recs(page) + PAGE_HEAP_NO_USER_LOW) / 2;
+
+	/* Skip every slot whose records all belong to the part that is
+	left behind */
+	n_owned = page_dir_slot_get_n_owned(
+		page_dir_get_nth_slot(page, slot_no));
+
+	while (skipped + n_owned <= middle) {
+		skipped += n_owned;
+		slot_no++;
+		n_owned = page_dir_slot_get_n_owned(
+			page_dir_get_nth_slot(page, slot_no));
+	}
+
+	ut_ad(slot_no > 0);
+
+	/* Start from the record following the one the previous slot
+	points to */
+	rec = (rec_t*) page_dir_slot_get_rec(
+		page_dir_get_nth_slot(page, slot_no - 1));
+	rec = page_rec_get_next(rec);
+
+	/* There are now skipped records behind rec */
+
+	for (left = middle - skipped; left > 0; left--) {
+		rec = page_rec_get_next(rec);
+	}
+
+	return(rec);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Returns the number of records before the given record in chain.
+The number includes infimum and supremum records. The count is
+obtained by walking forward to the record that owns the given one,
+summing the n_owned values of the directory slots up to and including
+the owner's slot, and subtracting the steps walked and the record
+itself.
+@return	number of records */
+UNIV_INTERN
+ulint
+page_rec_get_n_recs_before(
+/*=======================*/
+	const rec_t*	rec)	/*!< in: the physical record */
+{
+	const page_t*	page;
+	const rec_t*	owner_rec;
+	ulint		slot_no;
+	lint		n	= 0;
+
+	ut_ad(page_rec_check(rec));
+
+	page = page_align(rec);
+
+	if (page_is_comp(page)) {
+		/* Each step towards the owner is one record that lies
+		after the original one, hence the decrement */
+		for (; rec_get_n_owned_new(rec) == 0; n--) {
+			rec = rec_get_next_ptr_const(rec, TRUE);
+		}
+
+		slot_no = 0;
+
+		do {
+			owner_rec = page_dir_slot_get_rec(
+				page_dir_get_nth_slot(page, slot_no));
+
+			n += rec_get_n_owned_new(owner_rec);
+			slot_no++;
+		} while (owner_rec != rec);
+	} else {
+		for (; rec_get_n_owned_old(rec) == 0; n--) {
+			rec = rec_get_next_ptr_const(rec, FALSE);
+		}
+
+		slot_no = 0;
+
+		do {
+			owner_rec = page_dir_slot_get_rec(
+				page_dir_get_nth_slot(page, slot_no));
+
+			n += rec_get_n_owned_old(owner_rec);
+			slot_no++;
+		} while (owner_rec != rec);
+	}
+
+	/* Do not count the record itself */
+	n--;
+
+	ut_ad(n >= 0);
+
+	return((ulint) n);
+}
+
+#ifndef UNIV_HOTBACKUP
+/************************************************************//**
+Prints record contents including the data relevant only in
+the index page context: the n_owned count, the heap number and the
+offset of the next record. Output goes to stderr. The record format
+(compact or old-style) must match the format recorded in offsets.
+After printing, the record is checked with page_rec_check() and
+rec_validate(). */
+UNIV_INTERN
+void
+page_rec_print(
+/*===========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: record descriptor */
+{
+	ulint	n_owned;
+	ulint	heap_no;
+	ulint	next_offs;
+
+	ut_a(!page_rec_is_comp(rec) == !rec_offs_comp(offsets));
+	rec_print_new(stderr, rec, offsets);
+
+	if (page_rec_is_comp(rec)) {
+		n_owned = rec_get_n_owned_new(rec);
+		heap_no = rec_get_heap_no_new(rec);
+		next_offs = rec_get_next_offs(rec, TRUE);
+	} else {
+		n_owned = rec_get_n_owned_old(rec);
+		heap_no = rec_get_heap_no_old(rec);
+		/* An old-style record must have its next-record pointer
+		decoded with the old-style rule (comp == FALSE), like
+		everywhere else old-style records are followed; decoding
+		it as compact would print a wrong offset. */
+		next_offs = rec_get_next_offs(rec, FALSE);
+	}
+
+	fprintf(stderr,
+		" n_owned: %lu; heap_no: %lu; next rec: %lu\n",
+		(ulong) n_owned,
+		(ulong) heap_no,
+		(ulong) next_offs);
+
+	page_rec_check(rec);
+	rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints the contents of the page directory to stderr for debugging
+purposes: a header with the page address, the offset of the last
+slot and the number of slots, then the first pr_n and the last pr_n
+slots (with an ellipsis line in between when slots are left out),
+and finally the total number of records on the page. */
+UNIV_INTERN
+void
+page_dir_print(
+/*===========*/
+	page_t*	page,	/*!< in: index page */
+	ulint	pr_n)	/*!< in: print n first and n last entries */
+{
+	ulint			n_slots;
+	ulint			tail_start;
+	ulint			slot_no;
+	page_dir_slot_t*	slot;
+
+	n_slots = page_dir_get_n_slots(page);
+
+	/* First slot number of the tail part that is printed */
+	tail_start = n_slots - pr_n;
+
+	fprintf(stderr,
+		"--------------------------------\n"
+		"PAGE DIRECTORY\n"
+		"Page address %p\n"
+		"Directory stack top at offs: %lu; number of slots: %lu\n",
+		page,
+		(ulong) page_offset(page_dir_get_nth_slot(page, n_slots - 1)),
+		(ulong) n_slots);
+
+	for (slot_no = 0; slot_no < n_slots; slot_no++) {
+		if (slot_no == pr_n && slot_no < tail_start) {
+			/* Mark the start of the omitted middle part */
+			fputs("    ...   \n", stderr);
+		}
+
+		if (slot_no >= pr_n && slot_no < tail_start) {
+			/* This slot belongs to the omitted middle part */
+			continue;
+		}
+
+		slot = page_dir_get_nth_slot(page, slot_no);
+
+		fprintf(stderr,
+			"Contents of slot: %lu: n_owned: %lu,"
+			" rec offs: %lu\n",
+			(ulong) slot_no,
+			(ulong) page_dir_slot_get_n_owned(slot),
+			(ulong)
+			page_offset(page_dir_slot_get_rec(slot)));
+	}
+
+	fprintf(stderr,
+		"Total of %lu records\n"
+		"--------------------------------\n",
+		(ulong) (PAGE_HEAP_NO_USER_LOW + page_get_n_recs(page)));
+}
+
+/***************************************************************//**
+This is used to print the contents of the page record list for
+debugging purposes. Output goes to stderr and each printed record is
+formatted by page_rec_print(). Records are printed from the start of
+the list until pr_n cursor moves have been made or the cursor is after
+the last record. The rest of the list is then walked, and a record is
+printed only when count + pr_n >= n_recs, where count is the number of
+cursor moves made so far and n_recs is the value of page_get_n_recs().
+A " ... " line is printed between the two parts when n_recs exceeds
+2 * pr_n. The final line reports count + 1, that is, the number of
+records visited. The compact-format flag of the page must agree with
+that of the table of the index. */
+UNIV_INTERN
+void
+page_print_list(
+/*============*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		pr_n)	/*!< in: print n first and n last entries */
+{
+	page_t*		page		= block->frame;
+	page_cur_t	cur;
+	ulint		count;
+	ulint		n_recs;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_a((ibool)!!page_is_comp(page) == dict_table_is_comp(index->table));
+
+	fprintf(stderr,
+		"--------------------------------\n"
+		"PAGE RECORD LIST\n"
+		"Page address %p\n", page);
+
+	n_recs = page_get_n_recs(page);
+
+	page_cur_set_before_first(block, &cur);
+	count = 0;
+	for (;;) {
+		offsets = rec_get_offsets(cur.rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		page_rec_print(cur.rec, offsets);
+
+		if (count == pr_n) {
+			break;
+		}
+		if (page_cur_is_after_last(&cur)) {
+			break;
+		}
+		page_cur_move_to_next(&cur);
+		count++;
+	}
+
+	if (n_recs > 2 * pr_n) {
+		fputs(" ... \n", stderr);
+	}
+
+	while (!page_cur_is_after_last(&cur)) {
+		page_cur_move_to_next(&cur);
+
+		if (count + pr_n >= n_recs) {
+			offsets = rec_get_offsets(cur.rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			page_rec_print(cur.rec, offsets);
+		}
+		count++;
+	}
+
+	fprintf(stderr,
+		"Total of %lu records \n"
+		"--------------------------------\n",
+		(ulong) (count + 1));
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/***************************************************************//**
+Prints the info in a page header to stderr: the page address, the
+number of records, the record format, the number of directory slots,
+the heap top, the heap record count, the free list and garbage fields,
+and the last-insert, direction and n-direction fields. */
+UNIV_INTERN
+void
+page_header_print(
+/*==============*/
+	const page_t*	page)	/*!< in: index page */
+{
+	const char*	format_name;
+	ulint		n_recs;
+	ulint		n_dir_slots;
+	ulint		heap_top;
+	ulint		n_heap;
+	ulint		free_offs;
+	ulint		garbage;
+	ulint		last_insert;
+	ulint		direction;
+	ulint		n_direction;
+
+	n_recs = page_header_get_field(page, PAGE_N_RECS);
+	n_dir_slots = page_header_get_field(page, PAGE_N_DIR_SLOTS);
+	heap_top = page_header_get_field(page, PAGE_HEAP_TOP);
+	n_heap = page_dir_get_n_heap(page);
+	free_offs = page_header_get_field(page, PAGE_FREE);
+	garbage = page_header_get_field(page, PAGE_GARBAGE);
+	last_insert = page_header_get_field(page, PAGE_LAST_INSERT);
+	direction = page_header_get_field(page, PAGE_DIRECTION);
+	n_direction = page_header_get_field(page, PAGE_N_DIRECTION);
+
+	if (page_is_comp(page)) {
+		format_name = "compact format";
+	} else {
+		format_name = "original format";
+	}
+
+	fprintf(stderr,
+		"--------------------------------\n"
+		"PAGE HEADER INFO\n"
+		"Page address %p, n records %lu (%s)\n"
+		"n dir slots %lu, heap top %lu\n"
+		"Page n heap %lu, free %lu, garbage %lu\n"
+		"Page last insert %lu, direction %lu, n direction %lu\n",
+		page, (ulong) n_recs, format_name,
+		(ulong) n_dir_slots, (ulong) heap_top,
+		(ulong) n_heap, (ulong) free_offs, (ulong) garbage,
+		(ulong) last_insert, (ulong) direction,
+		(ulong) n_direction);
+}
+
+/***************************************************************//**
+Prints the contents of a page for debugging purposes: first the page
+header, then the page directory, then the record list. */
+UNIV_INTERN
+void
+page_print(
+/*=======*/
+	buf_block_t*	block,	/*!< in: index page */
+	dict_index_t*	index,	/*!< in: dictionary index of the page */
+	ulint		dn,	/*!< in: print dn first and last entries
+				in directory */
+	ulint		rn)	/*!< in: print rn first and last records
+				in the record list */
+{
+	page_header_print(block->frame);
+	page_dir_print(block->frame, dn);
+	page_print_list(block, index, rn);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Validates a record on a page. In addition to what rec_validate()
+checks, this function verifies that the n_owned field does not exceed
+PAGE_DIR_SLOT_MAX_N_OWNED and that the heap_no field is smaller than
+the heap record count of the page. A diagnostic is printed to stderr
+when one of these two checks fails.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_rec_validate(
+/*==============*/
+	rec_t*		rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	page_t*	page	= page_align(rec);
+	ulint	n_owned;
+	ulint	heap_no;
+	ulint	n_heap;
+
+	ut_a(!page_is_comp(page) == !rec_offs_comp(offsets));
+
+	page_rec_check(rec);
+	rec_validate(rec, offsets);
+
+	if (!page_rec_is_comp(rec)) {
+		n_owned = rec_get_n_owned_old(rec);
+		heap_no = rec_get_heap_no_old(rec);
+	} else {
+		n_owned = rec_get_n_owned_new(rec);
+		heap_no = rec_get_heap_no_new(rec);
+	}
+
+	if (UNIV_UNLIKELY(n_owned > PAGE_DIR_SLOT_MAX_N_OWNED)) {
+		fprintf(stderr,
+			"InnoDB: Dir slot of rec %lu, n owned too big %lu\n",
+			(ulong) page_offset(rec), (ulong) n_owned);
+
+		return(FALSE);
+	}
+
+	n_heap = page_dir_get_n_heap(page);
+
+	if (UNIV_UNLIKELY(heap_no >= n_heap)) {
+		fprintf(stderr,
+			"InnoDB: Heap no of rec %lu too big %lu %lu\n",
+			(ulong) page_offset(rec), (ulong) heap_no,
+			(ulong) n_heap);
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Checks that the first directory slot points to the infimum record and
+the last to the supremum. This function is intended to track if the
+bug fixed in 4.0.14 has caused corruption to users' databases. For
+each violated condition a message is written to stderr and the page
+is dumped with buf_page_print(). */
+UNIV_INTERN
+void
+page_check_dir(
+/*===========*/
+	const page_t*	page)	/*!< in: index page */
+{
+	ulint	last_slot_no;
+	ulint	first_offs;
+	ulint	last_offs;
+
+	last_slot_no = page_dir_get_n_slots(page) - 1;
+
+	/* Record offsets stored in the first and in the last slot */
+	first_offs = mach_read_from_2(page_dir_get_nth_slot(page, 0));
+	last_offs = mach_read_from_2(page_dir_get_nth_slot(page,
+							   last_slot_no));
+
+	if (UNIV_UNLIKELY(!page_rec_is_infimum_low(first_offs))) {
+
+		fprintf(stderr,
+			"InnoDB: Page directory corruption:"
+			" infimum not pointed to\n");
+		buf_page_print(page, 0);
+	}
+
+	if (UNIV_UNLIKELY(!page_rec_is_supremum_low(last_offs))) {
+
+		fprintf(stderr,
+			"InnoDB: Page directory corruption:"
+			" supremum not pointed to\n");
+		buf_page_print(page, 0);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+
+The page must be in the old-style format (this is asserted). The
+checks are made in this order, and the function stops at the first one
+that fails, after printing a diagnostic to stderr:
+- the number of directory slots is at most UNIV_PAGE_SIZE / 4;
+- the record heap top does not lie above the last directory slot;
+- walking the record list from the infimum to the supremum: every
+  record lies at or below the heap top, every record with a nonzero
+  n_owned owns exactly the records counted since the previous owner and
+  is the record the current directory slot points to, every next-record
+  offset is within [FIL_PAGE_DATA, UNIV_PAGE_SIZE), and the list does
+  not appear to be circular;
+- the supremum has a nonzero n_owned;
+- the number of slots passed during the walk matches the slot count;
+- PAGE_N_RECS + PAGE_HEAP_NO_USER_LOW equals the number of records
+  walked;
+- every free list record lies inside the page and at or below the heap
+  top, and the free list does not appear to be circular;
+- the heap record count equals the number of records in the record
+  list plus the number of records in the free list.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_old(
+/*=====================*/
+	page_t*	page)	/*!< in: old-style index page */
+{
+	page_dir_slot_t* slot;
+	ulint		slot_no;
+	ulint		n_slots;
+	rec_t*		rec;
+	byte*		rec_heap_top;
+	ulint		count;
+	ulint		own_count;
+	ibool		ret	= FALSE;
+
+	ut_a(!page_is_comp(page));
+
+	/* Check first that the record heap and the directory do not
+	overlap. */
+
+	n_slots = page_dir_get_n_slots(page);
+
+	if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+		fprintf(stderr,
+			"InnoDB: Nonsensical number %lu of page dir slots\n",
+			(ulong) n_slots);
+
+		goto func_exit;
+	}
+
+	rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+	if (UNIV_UNLIKELY(rec_heap_top
+			  > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+		fprintf(stderr,
+			"InnoDB: Record heap and dir overlap on a page,"
+			" heap top %lu, dir %lu\n",
+			(ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+			(ulong)
+			page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+		goto func_exit;
+	}
+
+	/* Validate the record list in a loop checking also that it is
+	consistent with the page record directory. Here count is the
+	number of steps taken along the list, own_count is the number of
+	records seen since the previous owner record (the current record
+	included), and slot is the directory slot that should point to
+	the next owner record. */
+
+	count = 0;
+	own_count = 1;
+	slot_no = 0;
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	rec = page_get_infimum_rec(page);
+
+	for (;;) {
+		if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+			fprintf(stderr,
+				"InnoDB: Record %lu is above"
+				" rec heap top %lu\n",
+				(ulong)(rec - page),
+				(ulong)(rec_heap_top - page));
+
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY(rec_get_n_owned_old(rec))) {
+			/* This is a record pointed to by a dir slot:
+			its n_owned must equal the number of records
+			counted since the previous owner */
+			if (UNIV_UNLIKELY(rec_get_n_owned_old(rec)
+					  != own_count)) {
+
+				fprintf(stderr,
+					"InnoDB: Wrong owned count %lu, %lu,"
+					" rec %lu\n",
+					(ulong) rec_get_n_owned_old(rec),
+					(ulong) own_count,
+					(ulong)(rec - page));
+
+				goto func_exit;
+			}
+
+			if (UNIV_UNLIKELY
+			    (page_dir_slot_get_rec(slot) != rec)) {
+				fprintf(stderr,
+					"InnoDB: Dir slot does not point"
+					" to right rec %lu\n",
+					(ulong)(rec - page));
+
+				goto func_exit;
+			}
+
+			own_count = 0;
+
+			if (!page_rec_is_supremum(rec)) {
+				slot_no++;
+				slot = page_dir_get_nth_slot(page, slot_no);
+			}
+		}
+
+		if (page_rec_is_supremum(rec)) {
+
+			break;
+		}
+
+		if (UNIV_UNLIKELY
+		    (rec_get_next_offs(rec, FALSE) < FIL_PAGE_DATA
+		     || rec_get_next_offs(rec, FALSE) >= UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Next record offset"
+				" nonsensical %lu for rec %lu\n",
+				(ulong) rec_get_next_offs(rec, FALSE),
+				(ulong) (rec - page));
+
+			goto func_exit;
+		}
+
+		count++;
+
+		if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Page record list appears"
+				" to be circular %lu\n",
+				(ulong) count);
+			goto func_exit;
+		}
+
+		rec = page_rec_get_next(rec);
+		own_count++;
+	}
+
+	if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+		fprintf(stderr, "InnoDB: n owned is zero in a supremum rec\n");
+
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+		fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+			(ulong) slot_no, (ulong) (n_slots - 1));
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+			  + PAGE_HEAP_NO_USER_LOW
+			  != count + 1)) {
+		fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+			(ulong) page_header_get_field(page, PAGE_N_RECS)
+			+ PAGE_HEAP_NO_USER_LOW,
+			(ulong) (count + 1));
+
+		goto func_exit;
+	}
+
+	/* Check then the free list. The counter is not reset, so after
+	this loop count + 1 is the number of records in the record list
+	plus the number of records in the free list. */
+	rec = page_header_get_ptr(page, PAGE_FREE);
+
+	while (rec != NULL) {
+		if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+				  || rec >= page + UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Free list record has"
+				" a nonsensical offset %lu\n",
+				(ulong) (rec - page));
+
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+			fprintf(stderr,
+				"InnoDB: Free list record %lu"
+				" is above rec heap top %lu\n",
+				(ulong) (rec - page),
+				(ulong) (rec_heap_top - page));
+
+			goto func_exit;
+		}
+
+		count++;
+
+		if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Page free list appears"
+				" to be circular %lu\n",
+				(ulong) count);
+			goto func_exit;
+		}
+
+		rec = page_rec_get_next(rec);
+	}
+
+	if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+		fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+			(ulong) page_dir_get_n_heap(page),
+			(ulong) (count + 1));
+
+		goto func_exit;
+	}
+
+	ret = TRUE;
+
+func_exit:
+	return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page when we do not
+know the index. This is also resilient so that this should never crash
+even if the page is total garbage.
+
+The page must be in the new-style (compact) format (this is asserted).
+The checks are made in this order, and the function stops at the first
+one that fails, after printing a diagnostic to stderr:
+- the number of directory slots is at most UNIV_PAGE_SIZE / 4;
+- the record heap top does not lie above the last directory slot;
+- walking the record list from the infimum to the supremum: every
+  record lies at or below the heap top, every record with a nonzero
+  n_owned owns exactly the records counted since the previous owner and
+  is the record the current directory slot points to, every next-record
+  offset is within [FIL_PAGE_DATA, UNIV_PAGE_SIZE), and the list does
+  not appear to be circular;
+- the supremum has a nonzero n_owned;
+- the number of slots passed during the walk matches the slot count;
+- PAGE_N_RECS + PAGE_HEAP_NO_USER_LOW equals the number of records
+  walked;
+- every free list record lies inside the page and at or below the heap
+  top, and the free list does not appear to be circular;
+- the heap record count equals the number of records in the record
+  list plus the number of records in the free list.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_simple_validate_new(
+/*=====================*/
+	page_t*	page)	/*!< in: new-style index page */
+{
+	page_dir_slot_t* slot;
+	ulint		slot_no;
+	ulint		n_slots;
+	rec_t*		rec;
+	byte*		rec_heap_top;
+	ulint		count;
+	ulint		own_count;
+	ibool		ret	= FALSE;
+
+	ut_a(page_is_comp(page));
+
+	/* Check first that the record heap and the directory do not
+	overlap. */
+
+	n_slots = page_dir_get_n_slots(page);
+
+	if (UNIV_UNLIKELY(n_slots > UNIV_PAGE_SIZE / 4)) {
+		fprintf(stderr,
+			"InnoDB: Nonsensical number %lu"
+			" of page dir slots\n", (ulong) n_slots);
+
+		goto func_exit;
+	}
+
+	rec_heap_top = page_header_get_ptr(page, PAGE_HEAP_TOP);
+
+	if (UNIV_UNLIKELY(rec_heap_top
+			  > page_dir_get_nth_slot(page, n_slots - 1))) {
+
+		fprintf(stderr,
+			"InnoDB: Record heap and dir overlap on a page,"
+			" heap top %lu, dir %lu\n",
+			(ulong) page_header_get_field(page, PAGE_HEAP_TOP),
+			(ulong)
+			page_offset(page_dir_get_nth_slot(page, n_slots - 1)));
+
+		goto func_exit;
+	}
+
+	/* Validate the record list in a loop checking also that it is
+	consistent with the page record directory. Here count is the
+	number of steps taken along the list, own_count is the number of
+	records seen since the previous owner record (the current record
+	included), and slot is the directory slot that should point to
+	the next owner record. */
+
+	count = 0;
+	own_count = 1;
+	slot_no = 0;
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	rec = page_get_infimum_rec(page);
+
+	for (;;) {
+		if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+			fprintf(stderr,
+				"InnoDB: Record %lu is above rec"
+				" heap top %lu\n",
+				(ulong) page_offset(rec),
+				(ulong) page_offset(rec_heap_top));
+
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+			/* This is a record pointed to by a dir slot:
+			its n_owned must equal the number of records
+			counted since the previous owner */
+			if (UNIV_UNLIKELY(rec_get_n_owned_new(rec)
+					  != own_count)) {
+
+				fprintf(stderr,
+					"InnoDB: Wrong owned count %lu, %lu,"
+					" rec %lu\n",
+					(ulong) rec_get_n_owned_new(rec),
+					(ulong) own_count,
+					(ulong) page_offset(rec));
+
+				goto func_exit;
+			}
+
+			if (UNIV_UNLIKELY
+			    (page_dir_slot_get_rec(slot) != rec)) {
+				fprintf(stderr,
+					"InnoDB: Dir slot does not point"
+					" to right rec %lu\n",
+					(ulong) page_offset(rec));
+
+				goto func_exit;
+			}
+
+			own_count = 0;
+
+			if (!page_rec_is_supremum(rec)) {
+				slot_no++;
+				slot = page_dir_get_nth_slot(page, slot_no);
+			}
+		}
+
+		if (page_rec_is_supremum(rec)) {
+
+			break;
+		}
+
+		if (UNIV_UNLIKELY
+		    (rec_get_next_offs(rec, TRUE) < FIL_PAGE_DATA
+		     || rec_get_next_offs(rec, TRUE) >= UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Next record offset nonsensical %lu"
+				" for rec %lu\n",
+				(ulong) rec_get_next_offs(rec, TRUE),
+				(ulong) page_offset(rec));
+
+			goto func_exit;
+		}
+
+		count++;
+
+		if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Page record list appears"
+				" to be circular %lu\n",
+				(ulong) count);
+			goto func_exit;
+		}
+
+		rec = page_rec_get_next(rec);
+		own_count++;
+	}
+
+	if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+		fprintf(stderr, "InnoDB: n owned is zero"
+			" in a supremum rec\n");
+
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+		fprintf(stderr, "InnoDB: n slots wrong %lu, %lu\n",
+			(ulong) slot_no, (ulong) (n_slots - 1));
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+			  + PAGE_HEAP_NO_USER_LOW
+			  != count + 1)) {
+		fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+			(ulong) page_header_get_field(page, PAGE_N_RECS)
+			+ PAGE_HEAP_NO_USER_LOW,
+			(ulong) (count + 1));
+
+		goto func_exit;
+	}
+
+	/* Check then the free list. The counter is not reset, so after
+	this loop count + 1 is the number of records in the record list
+	plus the number of records in the free list. */
+	rec = page_header_get_ptr(page, PAGE_FREE);
+
+	while (rec != NULL) {
+		if (UNIV_UNLIKELY(rec < page + FIL_PAGE_DATA
+				  || rec >= page + UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Free list record has"
+				" a nonsensical offset %lu\n",
+				(ulong) page_offset(rec));
+
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY(rec > rec_heap_top)) {
+			fprintf(stderr,
+				"InnoDB: Free list record %lu"
+				" is above rec heap top %lu\n",
+				(ulong) page_offset(rec),
+				(ulong) page_offset(rec_heap_top));
+
+			goto func_exit;
+		}
+
+		count++;
+
+		if (UNIV_UNLIKELY(count > UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Page free list appears"
+				" to be circular %lu\n",
+				(ulong) count);
+			goto func_exit;
+		}
+
+		rec = page_rec_get_next(rec);
+	}
+
+	if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+
+		fprintf(stderr, "InnoDB: N heap is wrong %lu, %lu\n",
+			(ulong) page_dir_get_n_heap(page),
+			(ulong) (count + 1));
+
+		goto func_exit;
+	}
+
+	ret = TRUE;
+
+func_exit:
+	return(ret);
+}
+
+/***************************************************************//**
+This function checks the consistency of an index page.
+The page is first checked with page_simple_validate_new() or
+page_simple_validate_old(), depending on page_is_comp(). Then the
+record list is walked from the infimum to the supremum, validating each
+record and comparing the n_owned fields with the page directory slots,
+and finally the free list is walked. On failure, a diagnostic message is
+written to stderr and buf_page_print() is called on the page.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+page_validate(
+/*==========*/
+	page_t*		page,	/*!< in: index page */
+	dict_index_t*	index)	/*!< in: data dictionary index containing
+				the page record type definition */
+{
+	page_dir_slot_t*slot;
+	mem_heap_t*	heap;
+	byte*		buf;
+	ulint		count;
+	ulint		own_count;
+	ulint		rec_own_count;
+	ulint		slot_no;
+	ulint		data_size;
+	rec_t*		rec;
+	rec_t*		old_rec		= NULL;
+	ulint		offs;
+	ulint		n_slots;
+	ibool		ret		= FALSE;
+	ulint		i;
+	ulint*		offsets		= NULL;
+	ulint*		old_offsets	= NULL;
+
+	if (UNIV_UNLIKELY((ibool) !!page_is_comp(page)
+			  != dict_table_is_comp(index->table))) {
+		fputs("InnoDB: 'compact format' flag mismatch\n", stderr);
+		goto func_exit2; /* the heap has not been created yet */
+	}
+	if (page_is_comp(page)) {
+		if (UNIV_UNLIKELY(!page_simple_validate_new(page))) {
+			goto func_exit2;
+		}
+	} else {
+		if (UNIV_UNLIKELY(!page_simple_validate_old(page))) {
+			goto func_exit2;
+		}
+	}
+
+	heap = mem_heap_create(UNIV_PAGE_SIZE + 200);
+
+	/* The following buffer is used to check that the
+	records in the page record heap do not overlap:
+	buf[k] is set to 1 once byte k of the page has been
+	seen as part of some record. */
+
+	buf = mem_heap_zalloc(heap, UNIV_PAGE_SIZE);
+
+	/* Check first that the record heap and the directory do not
+	overlap. */
+
+	n_slots = page_dir_get_n_slots(page);
+
+	if (UNIV_UNLIKELY(!(page_header_get_ptr(page, PAGE_HEAP_TOP)
+			    <= page_dir_get_nth_slot(page, n_slots - 1)))) {
+
+		fprintf(stderr, 
+			"InnoDB: Record heap and dir overlap"
+			" on space %lu page %lu index %s, %p, %p\n",
+			(ulong) page_get_space_id(page),
+			(ulong) page_get_page_no(page), index->name,
+			page_header_get_ptr(page, PAGE_HEAP_TOP),
+			page_dir_get_nth_slot(page, n_slots - 1));
+
+		goto func_exit;
+	}
+
+	/* Validate the record list in a loop checking also that
+	it is consistent with the directory. */
+	count = 0;
+	data_size = 0;
+	own_count = 1;
+	slot_no = 0;
+	slot = page_dir_get_nth_slot(page, slot_no);
+
+	rec = page_get_infimum_rec(page);
+
+	for (;;) {
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		if (page_is_comp(page) && page_rec_is_user_rec(rec)
+		    && UNIV_UNLIKELY(rec_get_node_ptr_flag(rec)
+				     == page_is_leaf(page))) {
+			fputs("InnoDB: node_ptr flag mismatch\n", stderr);
+			goto func_exit;
+		}
+
+		if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+			goto func_exit;
+		}
+
+#ifndef UNIV_HOTBACKUP
+		/* Check that the records are in the ascending order:
+		cmp_rec_rec() must return 1 for (rec, old_rec). */
+		if (UNIV_LIKELY(count >= PAGE_HEAP_NO_USER_LOW)
+		    && !page_rec_is_supremum(rec)) {
+			if (UNIV_UNLIKELY
+			    (1 != cmp_rec_rec(rec, old_rec,
+					      offsets, old_offsets, index))) {
+				fprintf(stderr, 
+					"InnoDB: Records in wrong order"
+					" on space %lu page %lu index %s\n",
+					(ulong) page_get_space_id(page),
+					(ulong) page_get_page_no(page),
+					index->name);
+				fputs("\nInnoDB: previous record ", stderr);
+				rec_print_new(stderr, old_rec, old_offsets);
+				fputs("\nInnoDB: record ", stderr);
+				rec_print_new(stderr, rec, offsets);
+				putc('\n', stderr);
+
+				goto func_exit;
+			}
+		}
+#endif /* !UNIV_HOTBACKUP */
+
+		if (page_rec_is_user_rec(rec)) {
+
+			data_size += rec_offs_size(offsets);
+		}
+
+		offs = page_offset(rec_get_start(rec, offsets));
+		i = rec_offs_size(offsets);
+		if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+			fputs("InnoDB: record offset out of bounds\n", stderr);
+			goto func_exit;
+		}
+
+		while (i--) {
+			if (UNIV_UNLIKELY(buf[offs + i])) {
+				/* No other record may overlap this */
+
+				fputs("InnoDB: Record overlaps another\n",
+				      stderr);
+				goto func_exit;
+			}
+
+			buf[offs + i] = 1;
+		}
+
+		if (page_is_comp(page)) {
+			rec_own_count = rec_get_n_owned_new(rec);
+		} else {
+			rec_own_count = rec_get_n_owned_old(rec);
+		}
+
+		if (UNIV_UNLIKELY(rec_own_count)) {
+			/* This is a record pointed to by a dir slot;
+			own_count holds the number of records visited
+			since the previous such record, this one included. */
+			if (UNIV_UNLIKELY(rec_own_count != own_count)) {
+				fprintf(stderr,
+					"InnoDB: Wrong owned count %lu, %lu\n",
+					(ulong) rec_own_count,
+					(ulong) own_count);
+				goto func_exit;
+			}
+
+			if (page_dir_slot_get_rec(slot) != rec) {
+				fputs("InnoDB: Dir slot does not"
+				      " point to right rec\n",
+				      stderr);
+				goto func_exit;
+			}
+
+			page_dir_slot_check(slot);
+
+			own_count = 0;
+			if (!page_rec_is_supremum(rec)) {
+				slot_no++;
+				slot = page_dir_get_nth_slot(page, slot_no);
+			}
+		}
+
+		if (page_rec_is_supremum(rec)) {
+			break;
+		}
+
+		count++;
+		own_count++;
+		old_rec = rec;
+		rec = page_rec_get_next(rec);
+
+		/* set old_offsets to offsets; recycle offsets
+		(the local pointer below shadows the outer ulint offs) */
+		{
+			ulint* offs = old_offsets;
+			old_offsets = offsets;
+			offsets = offs;
+		}
+	}
+
+	/* Here rec is the supremum record; its n_owned must be nonzero. */
+	if (page_is_comp(page)) {
+		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec) == 0)) {
+
+			goto n_owned_zero;
+		}
+	} else if (UNIV_UNLIKELY(rec_get_n_owned_old(rec) == 0)) {
+n_owned_zero:
+		fputs("InnoDB: n owned is zero\n", stderr);
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(slot_no != n_slots - 1)) {
+		fprintf(stderr, "InnoDB: n slots wrong %lu %lu\n",
+			(ulong) slot_no, (ulong) (n_slots - 1));
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(page_header_get_field(page, PAGE_N_RECS)
+			  + PAGE_HEAP_NO_USER_LOW
+			  != count + 1)) {
+		fprintf(stderr, "InnoDB: n recs wrong %lu %lu\n",
+			(ulong) page_header_get_field(page, PAGE_N_RECS)
+			+ PAGE_HEAP_NO_USER_LOW,
+			(ulong) (count + 1));
+		goto func_exit;
+	}
+
+	if (UNIV_UNLIKELY(data_size != page_get_data_size(page))) {
+		fprintf(stderr,
+			"InnoDB: Summed data size %lu, returned by func %lu\n",
+			(ulong) data_size, (ulong) page_get_data_size(page));
+		goto func_exit;
+	}
+
+	/* Then check the free list. count keeps accumulating, so that
+	after this loop it can be compared with the heap record count. */
+	rec = page_header_get_ptr(page, PAGE_FREE);
+
+	while (rec != NULL) {
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		if (UNIV_UNLIKELY(!page_rec_validate(rec, offsets))) {
+
+			goto func_exit;
+		}
+
+		count++;
+		offs = page_offset(rec_get_start(rec, offsets));
+		i = rec_offs_size(offsets);
+		if (UNIV_UNLIKELY(offs + i >= UNIV_PAGE_SIZE)) {
+			fputs("InnoDB: record offset out of bounds\n", stderr);
+			goto func_exit;
+		}
+
+		while (i--) {
+
+			if (UNIV_UNLIKELY(buf[offs + i])) {
+				fputs("InnoDB: Record overlaps another"
+				      " in free list\n", stderr);
+				goto func_exit;
+			}
+
+			buf[offs + i] = 1;
+		}
+
+		rec = page_rec_get_next(rec);
+	}
+
+	if (UNIV_UNLIKELY(page_dir_get_n_heap(page) != count + 1)) {
+		fprintf(stderr, "InnoDB: N heap is wrong %lu %lu\n",
+			(ulong) page_dir_get_n_heap(page),
+			(ulong) count + 1);
+		goto func_exit;
+	}
+
+	ret = TRUE;
+
+func_exit:
+	mem_heap_free(heap);
+
+	if (UNIV_UNLIKELY(ret == FALSE)) {
+func_exit2: /* jumped to directly by the checks made before heap creation */
+		fprintf(stderr, 
+			"InnoDB: Apparent corruption"
+			" in space %lu page %lu index %s\n",
+			(ulong) page_get_space_id(page),
+			(ulong) page_get_page_no(page),
+			index->name);
+		buf_page_print(page, 0);
+	}
+
+	return(ret);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Walks the page record list from the infimum record onwards and returns
+the first record whose heap number equals heap_no. The walk stops once
+the record with heap number PAGE_HEAP_NO_SUPREMUM has been examined.
+@return	record, NULL if not found */
+UNIV_INTERN
+const rec_t*
+page_find_rec_with_heap_no(
+/*=======================*/
+	const page_t*	page,	/*!< in: index page */
+	ulint		heap_no)/*!< in: heap number */
+{
+	const rec_t*	cur;
+	ulint		cur_heap_no;
+
+	if (page_is_comp(page)) {
+		for (cur = page + PAGE_NEW_INFIMUM;;
+		     cur = page + rec_get_next_offs(cur, TRUE)) {
+
+			cur_heap_no = rec_get_heap_no_new(cur);
+
+			if (cur_heap_no == heap_no) {
+				/* found; a match is tested before
+				the end-of-list condition */
+				break;
+			}
+
+			if (cur_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+				return(NULL);
+			}
+		}
+	} else {
+		for (cur = page + PAGE_OLD_INFIMUM;;
+		     cur = page + rec_get_next_offs(cur, FALSE)) {
+
+			cur_heap_no = rec_get_heap_no_old(cur);
+
+			if (cur_heap_no == heap_no) {
+				/* found; a match is tested before
+				the end-of-list condition */
+				break;
+			}
+
+			if (cur_heap_no == PAGE_HEAP_NO_SUPREMUM) {
+
+				return(NULL);
+			}
+		}
+	}
+
+	return(cur);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/page/page0zip.c b/storage/xtradb/page/page0zip.c
new file mode 100644
index 00000000000..a94d2d54417
--- /dev/null
+++ b/storage/xtradb/page/page0zip.c
@@ -0,0 +1,4677 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file page/page0zip.c
+Compressed page interface
+
+Created June 2005 by Marko Makela
+*******************************************************/
+
+#define THIS_MODULE
+#include "page0zip.h"
+#ifdef UNIV_NONINL
+# include "page0zip.ic"
+#endif
+#undef THIS_MODULE
+#include "page0page.h"
+#include "mtr0log.h"
+#include "ut0sort.h"
+#include "dict0dict.h"
+#include "btr0cur.h"
+#include "page0types.h"
+#include "log0recv.h"
+#include "zlib.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "btr0sea.h"
+# include "dict0boot.h"
+# include "lock0lock.h"
+#else /* !UNIV_HOTBACKUP */
+# define lock_move_reorganize_page(block, temp_block)	((void) 0)
+# define buf_LRU_stat_inc_unzip()			((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/** Statistics on compression, indexed by page_zip_des_t::ssize - 1;
+the array holds PAGE_ZIP_NUM_SSIZE_MAX - 1 entries */
+UNIV_INTERN page_zip_stat_t page_zip_stat[PAGE_ZIP_NUM_SSIZE_MAX - 1];
+#endif /* !UNIV_HOTBACKUP */
+
+/* Please refer to ../include/page0zip.ic for a description of the
+compressed page format. */
+
+/* The infimum and supremum records are omitted from the compressed page.
+On compress, we compare that the records are there, and on uncompress we
+restore the records. */
+/** Extra bytes of an infimum record. The two bytes of the "next" pointer
+are not part of this array; they are only shown as a commented-out entry. */
+static const byte infimum_extra[] = {
+	0x01,			/* info_bits=0, n_owned=1 */
+	0x00, 0x02		/* heap_no=0, status=2 */
+	/* ?, ?	*/		/* next=(first user rec, or supremum) */
+};
+/** Data bytes of an infimum record (the ASCII string "infimum"
+followed by a NUL byte, 8 bytes in total) */
+static const byte infimum_data[] = {
+	0x69, 0x6e, 0x66, 0x69,
+	0x6d, 0x75, 0x6d, 0x00	/* "infimum\0" */
+};
+/** Extra bytes and data bytes of a supremum record. The first extra byte
+(info_bits and n_owned) is not part of this array; it is only shown as a
+commented-out entry. */
+static const byte supremum_extra_data[] = {
+	/* 0x0?, */		/* info_bits=0, n_owned=1..8 */
+	0x00, 0x0b,		/* heap_no=1, status=3 */
+	0x00, 0x00,		/* next=0 */
+	0x73, 0x75, 0x70, 0x72,
+	0x65, 0x6d, 0x75, 0x6d	/* "supremum" */
+};
+
+/** Assert that a block of memory is filled with zero bytes.
+Compare at most sizeof(field_ref_zero) bytes, so a block longer than
+field_ref_zero is only checked up to that length.
+The check is wrapped in ut_ad().
+@param b	in: memory block
+@param s	in: size of the memory block, in bytes */
+#define ASSERT_ZERO(b, s) \
+	ut_ad(!memcmp(b, field_ref_zero, ut_min(s, sizeof field_ref_zero)))
+/** Assert that a BLOB pointer is filled with zero bytes.
+Exactly sizeof(field_ref_zero) bytes are compared.
+@param b	in: BLOB pointer */
+#define ASSERT_ZERO_BLOB(b) \
+	ut_ad(!memcmp(b, field_ref_zero, sizeof field_ref_zero))
+
+/* Enable some extra debugging output.  This code can be enabled
+independently of any UNIV_ debugging conditions. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+# include <stdarg.h>
+__attribute__((format (printf, 1, 2)))
+/**********************************************************************//**
+Print a diagnostic about a failed compression or decompression to stderr.
+The message is preceded by a timestamp and the prefix "  InnoDB: ".
+@return	the value returned by vfprintf() for the formatted message */
+static
+int
+page_zip_fail_func(
+/*===============*/
+	const char*	fmt,	/*!< in: printf(3) format string */
+	...)			/*!< in: arguments corresponding to fmt */
+{
+	va_list	args;
+	int	n_printed;
+
+	/* message prefix */
+	ut_print_timestamp(stderr);
+	fputs("  InnoDB: ", stderr);
+
+	/* the caller's formatted message */
+	va_start(args, fmt);
+	n_printed = vfprintf(stderr, fmt, args);
+	va_end(args);
+
+	return(n_printed);
+}
+/** Wrapper for page_zip_fail_func().
+The macro pastes its single argument directly after the function name,
+so callers must pass the whole argument list in its own parentheses,
+e.g. page_zip_fail(("format %d\n", value)).
+@param fmt_args	in: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) page_zip_fail_func fmt_args
+#else /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+/** Dummy wrapper for page_zip_fail_func(); expands to nothing, so the
+arguments are not evaluated.
+@param fmt_args	ignored: printf(3) format string and arguments */
+# define page_zip_fail(fmt_args) /* empty */
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Determine the guaranteed free space on an empty page: the compressed
+page size minus the fixed per-page overhead and minus the zlib bound
+for the encoded field description.
+@return	minimum payload size on the page, or 0 if nothing would fit */
+UNIV_INTERN
+ulint
+page_zip_empty_size(
+/*================*/
+	ulint	n_fields,	/*!< in: number of columns in the index */
+	ulint	zip_size)	/*!< in: compressed page size in bytes */
+{
+	/* upper bound of the space taken by the compressed
+	output of page_zip_fields_encode() */
+	const uLong	fields_bound = compressBound(2 * (n_fields + 1));
+	lint		payload;
+
+	payload = zip_size
+		/* the page header and the longest
+		uncompressed data needed for one record */
+		- (PAGE_DATA
+		   + PAGE_ZIP_DIR_SLOT_SIZE
+		   + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN
+		   + 1/* encoded heap_no==2 in page_zip_write_rec() */
+		   + 1/* end of modification log */
+		   - REC_N_NEW_EXTRA_BYTES/* omitted bytes */)
+		- fields_bound;
+
+	if (payload > 0) {
+
+		return((ulint) payload);
+	}
+
+	return(0);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Computes the byte length of the whole dense page directory at the end
+of the compressed page. One slot is counted for every heap record
+except the infimum and supremum, so free-list records are included.
+@return	length of dense page directory, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_size(
+/*==============*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	/* n_heap minus the infimum and the supremum */
+	return(PAGE_ZIP_DIR_SLOT_SIZE
+	       * (page_dir_get_n_heap(page_zip->data)
+		  - PAGE_HEAP_NO_USER_LOW));
+}
+
+/*************************************************************//**
+Computes the byte length of the part of the dense page directory that
+describes the records counted by page_get_n_recs(); the slots of
+free-list records are not included.
+@return	length of dense page directory comprising existing records, in bytes */
+UNIV_INLINE
+ulint
+page_zip_dir_user_size(
+/*===================*/
+	const page_zip_des_t*	page_zip)	/*!< in: compressed page */
+{
+	const ulint	n_user_bytes = page_get_n_recs(page_zip->data)
+		* PAGE_ZIP_DIR_SLOT_SIZE;
+
+	/* the user part can never exceed the whole directory */
+	ut_ad(n_user_bytes <= page_zip_dir_size(page_zip));
+
+	return(n_user_bytes);
+}
+
+/*************************************************************//**
+Scans the dense directory slots in [slot, end) for one whose record
+offset (the slot value masked with PAGE_ZIP_DIR_SLOT_MASK) is offset.
+@return	dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_low(
+/*==================*/
+	byte*	slot,			/*!< in: start of records */
+	byte*	end,			/*!< in: end of records */
+	ulint	offset)			/*!< in: offset of user record */
+{
+	ut_ad(slot <= end);
+
+	while (slot < end) {
+		const ulint	slot_offs = mach_read_from_2(slot)
+			& PAGE_ZIP_DIR_SLOT_MASK;
+
+		if (slot_offs == offset) {
+
+			return(slot);
+		}
+
+		slot += PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Find the slot of the given non-free record in the dense page directory.
+Only the last page_zip_dir_user_size() bytes of the compressed page
+are searched.
+@return	dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find(
+/*==============*/
+	page_zip_des_t*	page_zip,		/*!< in: compressed page */
+	ulint		offset)			/*!< in: offset of user record */
+{
+	byte*	dir_end;
+	byte*	user_start;
+
+	dir_end = page_zip->data + page_zip_get_size(page_zip);
+
+	ut_ad(page_zip_simple_validate(page_zip));
+
+	user_start = dir_end - page_zip_dir_user_size(page_zip);
+
+	return(page_zip_dir_find_low(user_start, dir_end, offset));
+}
+
+/*************************************************************//**
+Find the slot of the given free record in the dense page directory.
+The search covers the slots between the start of the whole directory
+(page_zip_dir_size()) and the start of the user record slots
+(page_zip_dir_user_size()).
+@return	dense directory slot, or NULL if record not found */
+UNIV_INLINE
+byte*
+page_zip_dir_find_free(
+/*===================*/
+	page_zip_des_t*	page_zip,		/*!< in: compressed page */
+	ulint		offset)			/*!< in: offset of user record */
+{
+	byte*	dir_end;
+	byte*	free_start;
+	byte*	free_end;
+
+	dir_end = page_zip->data + page_zip_get_size(page_zip);
+
+	ut_ad(page_zip_simple_validate(page_zip));
+
+	free_start = dir_end - page_zip_dir_size(page_zip);
+	free_end = dir_end - page_zip_dir_user_size(page_zip);
+
+	return(page_zip_dir_find_low(free_start, free_end, offset));
+}
+
+/*************************************************************//**
+Read a given slot in the dense page directory. Slots are addressed
+backwards from the end of the compressed page: slot 0 occupies the
+last PAGE_ZIP_DIR_SLOT_SIZE bytes.
+@return record offset on the uncompressed page, possibly ORed with
+PAGE_ZIP_DIR_SLOT_DEL or PAGE_ZIP_DIR_SLOT_OWNED */
+UNIV_INLINE
+ulint
+page_zip_dir_get(
+/*=============*/
+	const page_zip_des_t*	page_zip,	/*!< in: compressed page */
+	ulint			slot)		/*!< in: slot
+						(0=first user record) */
+{
+	const byte*	dir_end;
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(slot < page_zip_dir_size(page_zip) / PAGE_ZIP_DIR_SLOT_SIZE);
+
+	dir_end = page_zip->data + page_zip_get_size(page_zip);
+
+	return(mach_read_from_2(dir_end
+				- PAGE_ZIP_DIR_SLOT_SIZE * (slot + 1)));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of compressing an index page.
+The record consists of an MLOG_ZIP_PAGE_COMPRESS initial log record,
+two 2-byte lengths (page_zip->m_end - FIL_PAGE_TYPE and the trailer size),
+the 4-byte FIL_PAGE_PREV and FIL_PAGE_NEXT fields, the page bytes from
+FIL_PAGE_TYPE up to m_end, and the uncompressed trailer of the page.
+Nothing is written if mlog_open() returns NULL. */
+static
+void
+page_zip_compress_write_log(
+/*========================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	dict_index_t*		index,	/*!< in: index of the B-tree node */
+	mtr_t*			mtr)	/*!< in: mini-transaction */
+{
+	byte*	log_ptr;
+	ulint	trailer_size;
+
+	ut_ad(!dict_index_is_ibuf(index));
+
+	log_ptr = mlog_open(mtr, 11 + 2 + 2);
+
+	if (!log_ptr) {
+
+		return;
+	}
+
+	/* Read the number of records in the heap, not counting
+	the first PAGE_HEAP_NO_USER_LOW heap numbers. */
+	trailer_size = page_dir_get_n_heap(page_zip->data)
+		- PAGE_HEAP_NO_USER_LOW;
+	/* Multiply by the size of the uncompressed data stored per record */
+	if (!page_is_leaf(page)) {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+	} else if (dict_index_is_clust(index)) {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE
+			+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	} else {
+		trailer_size *= PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+	/* Add the space occupied by BLOB pointers. */
+	trailer_size += page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+	ut_a(page_zip->m_end > PAGE_DATA);
+#if FIL_PAGE_DATA > PAGE_DATA
+# error "FIL_PAGE_DATA > PAGE_DATA"
+#endif
+	ut_a(page_zip->m_end + trailer_size <= page_zip_get_size(page_zip));
+
+	log_ptr = mlog_write_initial_log_record_fast((page_t*) page,
+						     MLOG_ZIP_PAGE_COMPRESS,
+						     log_ptr, mtr);
+	mach_write_to_2(log_ptr, page_zip->m_end - FIL_PAGE_TYPE);
+	log_ptr += 2;
+	mach_write_to_2(log_ptr, trailer_size);
+	log_ptr += 2;
+	mlog_close(mtr, log_ptr);
+
+	/* Write FIL_PAGE_PREV and FIL_PAGE_NEXT */
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_PREV, 4);
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_NEXT, 4);
+	/* Write most of the page header, the compressed stream and
+	the modification log. */
+	mlog_catenate_string(mtr, page_zip->data + FIL_PAGE_TYPE,
+			     page_zip->m_end - FIL_PAGE_TYPE);
+	/* Write the uncompressed trailer of the compressed page. */
+	mlog_catenate_string(mtr, page_zip->data + page_zip_get_size(page_zip)
+			     - trailer_size, trailer_size);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/******************************************************//**
+Determine how many externally stored columns are contained
+in existing records with smaller heap_no than rec. The dense page
+directory is scanned until all records with a smaller heap_no have
+been seen or the page_get_n_recs() slots are exhausted.
+@return	number of externally stored columns in those records */
+static
+ulint
+page_zip_get_n_prev_extern(
+/*=======================*/
+	const page_zip_des_t*	page_zip,/*!< in: dense page directory on
+					compressed page */
+	const rec_t*		rec,	/*!< in: compact physical record
+					on a B-tree leaf page */
+	dict_index_t*		index)	/*!< in: record descriptor */
+{
+	const page_t*	page		= page_align(rec);
+	ulint		n_recs		= page_get_n_recs(page_zip->data);
+	ulint		n_ext_sum	= 0;
+	ulint		n_remaining;
+	ulint		rec_heap_no;
+	ulint		slot_i;
+
+	ut_ad(page_is_leaf(page));
+	ut_ad(page_is_comp(page));
+	ut_ad(dict_table_is_comp(index->table));
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(!dict_index_is_ibuf(index));
+
+	rec_heap_no = rec_get_heap_no_new(rec);
+	ut_ad(rec_heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+	/* number of heap numbers below that of rec, starting
+	from PAGE_HEAP_NO_USER_LOW */
+	n_remaining = rec_heap_no - PAGE_HEAP_NO_USER_LOW;
+
+	if (UNIV_UNLIKELY(n_remaining == 0)) {
+
+		return(0);
+	}
+
+	for (slot_i = 0; slot_i < n_recs && n_remaining > 0; slot_i++) {
+		const rec_t*	other;
+
+		other = page + (page_zip_dir_get(page_zip, slot_i)
+				& PAGE_ZIP_DIR_SLOT_MASK);
+
+		if (rec_get_heap_no_new(other) >= rec_heap_no) {
+
+			continue;
+		}
+
+		n_ext_sum += rec_get_n_extern_new(other, index,
+						  ULINT_UNDEFINED);
+		n_remaining--;
+	}
+
+	return(n_ext_sum);
+}
+
+/**********************************************************************//**
+Encode the length of a fixed-length column. Values below 126 take one
+byte; other values take two bytes, the first of which has the bit 0x80
+set and carries the bits above the low 8 bits of val.
+@return	buf + length of encoded val */
+static
+byte*
+page_zip_fixed_field_encode(
+/*========================*/
+	byte*	buf,	/*!< in: pointer to buffer where to write */
+	ulint	val)	/*!< in: value to write */
+{
+	ut_ad(val >= 2);
+
+	if (UNIV_UNLIKELY(val >= 126)) {
+		/* two-byte form */
+		buf[0] = (byte) (0x80 | val >> 8);
+		buf[1] = (byte) val;
+
+		return(buf + 2);
+	}
+
+	/* One-byte form. The following one-byte codes are
+	used for variable-length fields instead:
+	0 = nullable variable field of at most 255 bytes length;
+	1 = not null variable field of at most 255 bytes length;
+	126 = nullable variable field with maximum length >255;
+	127 = not null variable field with maximum length >255
+	*/
+	buf[0] = (byte) val;
+
+	return(buf + 1);
+}
+
+/**********************************************************************//**
+Write the index information for the compressed page.
+Each variable-length field is written as a single byte. Each
+fixed-length nullable field is written with page_zip_fixed_field_encode().
+Consecutive fixed-length non-nullable fields are summed up and written
+as one length; the sum is flushed early when it would exceed
+DICT_MAX_INDEX_COL_LEN and before the trx_id column. The encoding ends
+with the position of the trx_id column, or with index->n_nullable if
+trx_id_pos is ULINT_UNDEFINED, in one or two bytes.
+@return	used size of buf */
+static
+ulint
+page_zip_fields_encode(
+/*===================*/
+	ulint		n,	/*!< in: number of fields to compress */
+	dict_index_t*	index,	/*!< in: index comprising at least n fields */
+	ulint		trx_id_pos,/*!< in: position of the trx_id column
+				in the index, or ULINT_UNDEFINED if
+				this is a non-leaf page */
+	byte*		buf)	/*!< out: buffer of (n + 1) * 2 bytes;
+				NOTE(review): the assertion at the end of
+				this function allows (n + 2) * 2 bytes -
+				confirm the size callers must provide */
+{
+	const byte*	buf_start	= buf;
+	ulint		i;
+	ulint		col;
+	ulint		trx_id_col	= 0;
+	/* sum of lengths of preceding non-nullable fixed fields, or 0 */
+	ulint		fixed_sum	= 0;
+
+	ut_ad(trx_id_pos == ULINT_UNDEFINED || trx_id_pos < n);
+
+	for (i = col = 0; i < n; i++) {
+		dict_field_t*	field = dict_index_get_nth_field(index, i);
+		ulint		val;
+
+		if (dict_field_get_col(field)->prtype & DATA_NOT_NULL) {
+			val = 1; /* set the "not nullable" flag */
+		} else {
+			val = 0; /* nullable field */
+		}
+
+		if (!field->fixed_len) {
+			/* variable-length field */
+			const dict_col_t*	column
+				= dict_field_get_col(field);
+
+			if (UNIV_UNLIKELY(column->len > 255)
+			    || UNIV_UNLIKELY(column->mtype == DATA_BLOB)) {
+				val |= 0x7e; /* max > 255 bytes */
+			}
+
+			if (fixed_sum) {
+				/* write out the length of any
+				preceding non-nullable fields */
+				buf = page_zip_fixed_field_encode(
+					buf, fixed_sum << 1 | 1);
+				fixed_sum = 0;
+				col++;
+			}
+
+			*buf++ = (byte) val;
+			col++;
+		} else if (val) {
+			/* fixed-length non-nullable field */
+
+			if (fixed_sum && UNIV_UNLIKELY
+			    (fixed_sum + field->fixed_len
+			     > DICT_MAX_INDEX_COL_LEN)) {
+				/* Write out the length of the
+				preceding non-nullable fields,
+				to avoid exceeding the maximum
+				length of a fixed-length column. */
+				buf = page_zip_fixed_field_encode(
+					buf, fixed_sum << 1 | 1);
+				fixed_sum = 0;
+				col++;
+			}
+
+			if (i && UNIV_UNLIKELY(i == trx_id_pos)) {
+				if (fixed_sum) {
+					/* Write out the length of any
+					preceding non-nullable fields,
+					and start a new trx_id column. */
+					buf = page_zip_fixed_field_encode(
+						buf, fixed_sum << 1 | 1);
+					col++;
+				}
+
+				trx_id_col = col;
+				fixed_sum = field->fixed_len;
+			} else {
+				/* add to the sum */
+				fixed_sum += field->fixed_len;
+			}
+		} else {
+			/* fixed-length nullable field */
+
+			if (fixed_sum) {
+				/* write out the length of any
+				preceding non-nullable fields */
+				buf = page_zip_fixed_field_encode(
+					buf, fixed_sum << 1 | 1);
+				fixed_sum = 0;
+				col++;
+			}
+
+			buf = page_zip_fixed_field_encode(
+				buf, field->fixed_len << 1);
+			col++;
+		}
+	}
+
+	if (fixed_sum) {
+		/* Write out the lengths of last fixed-length columns. */
+		buf = page_zip_fixed_field_encode(buf, fixed_sum << 1 | 1);
+	}
+
+	if (trx_id_pos != ULINT_UNDEFINED) {
+		/* Write out the position of the trx_id column */
+		i = trx_id_col;
+	} else {
+		/* Write out the number of nullable fields */
+		i = index->n_nullable;
+	}
+
+	/* One byte for values below 128; otherwise two bytes,
+	with the bit 0x80 set in the first byte. */
+	if (i < 128) {
+		*buf++ = (byte) i;
+	} else {
+		*buf++ = (byte) (0x80 | i >> 8);
+		*buf++ = (byte) i;
+	}
+
+	ut_ad((ulint) (buf - buf_start) <= (n + 2) * 2);
+	return((ulint) (buf - buf_start));
+}
+
+/**********************************************************************//**
+Populate the dense page directory from the sparse directory.
+The record list is traversed first, starting from the record after the
+infimum, and then the free list. For every record a 2-byte slot is
+written downwards from buf; the slot holds the record offset, ORed with
+PAGE_ZIP_DIR_SLOT_OWNED or PAGE_ZIP_DIR_SLOT_DEL for records in the
+record list that have n_owned set or are delete-marked. Inconsistencies
+are caught with ut_a() assertions. */
+static
+void
+page_zip_dir_encode(
+/*================*/
+	const page_t*	page,	/*!< in: compact page */
+	byte*		buf,	/*!< in: pointer to dense page directory[-1];
+				out: dense directory on compressed page */
+	const rec_t**	recs)	/*!< in: pointer to an array of 0, or NULL;
+				out: dense page directory sorted by ascending
+				address (and heap_no) */
+{
+	const byte*	rec;
+	ulint		status;
+	ulint		min_mark;
+	ulint		heap_no;
+	ulint		i;
+	ulint		n_heap;
+	ulint		offs;
+
+	min_mark = 0;
+
+	if (page_is_leaf(page)) {
+		status = REC_STATUS_ORDINARY;
+	} else {
+		status = REC_STATUS_NODE_PTR;
+		if (UNIV_UNLIKELY
+		    (mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL)) {
+			min_mark = REC_INFO_MIN_REC_FLAG;
+		}
+	}
+
+	n_heap = page_dir_get_n_heap(page);
+
+	/* Traverse the list of stored records in the collation order,
+	starting from the first user record. */
+
+	rec = page + PAGE_NEW_INFIMUM;
+
+	i = 0;
+
+	for (;;) {
+		ulint	info_bits;
+		offs = rec_get_next_offs(rec, TRUE);
+		if (UNIV_UNLIKELY(offs == PAGE_NEW_SUPREMUM)) {
+			break;
+		}
+		rec = page + offs;
+		heap_no = rec_get_heap_no_new(rec);
+		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+		ut_a(heap_no < n_heap);
+		ut_a(offs < UNIV_PAGE_SIZE - PAGE_DIR);
+		ut_a(offs >= PAGE_ZIP_START);
+#if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1)
+# error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2"
+#endif
+#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1
+# error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE - 1"
+#endif
+		if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) {
+			offs |= PAGE_ZIP_DIR_SLOT_OWNED;
+		}
+
+		info_bits = rec_get_info_bits(rec, TRUE);
+		if (UNIV_UNLIKELY(info_bits & REC_INFO_DELETED_FLAG)) {
+			info_bits &= ~REC_INFO_DELETED_FLAG;
+			offs |= PAGE_ZIP_DIR_SLOT_DEL;
+		}
+		ut_a(info_bits == min_mark);
+		/* Only the smallest user record can have
+		REC_INFO_MIN_REC_FLAG set. */
+		min_mark = 0;
+
+		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+		if (UNIV_LIKELY_NULL(recs)) {
+			/* Ensure that each heap_no occurs at most once. */
+			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+			/* exclude infimum and supremum */
+			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+		}
+
+		ut_a(rec_get_status(rec) == status);
+	}
+
+	offs = page_header_get_field(page, PAGE_FREE);
+
+	/* Traverse the free list (of deleted records). The slot index i
+	keeps counting, so these slots follow those of the record list. */
+	while (offs) {
+		ut_ad(!(offs & ~PAGE_ZIP_DIR_SLOT_MASK));
+		rec = page + offs;
+
+		heap_no = rec_get_heap_no_new(rec);
+		ut_a(heap_no >= PAGE_HEAP_NO_USER_LOW);
+		ut_a(heap_no < n_heap);
+
+		ut_a(!rec[-REC_N_NEW_EXTRA_BYTES]); /* info_bits and n_owned */
+		ut_a(rec_get_status(rec) == status);
+
+		mach_write_to_2(buf - PAGE_ZIP_DIR_SLOT_SIZE * ++i, offs);
+
+		if (UNIV_LIKELY_NULL(recs)) {
+			/* Ensure that each heap_no occurs at most once. */
+			ut_a(!recs[heap_no - PAGE_HEAP_NO_USER_LOW]);
+			/* exclude infimum and supremum */
+			recs[heap_no - PAGE_HEAP_NO_USER_LOW] = rec;
+		}
+
+		offs = rec_get_next_offs(rec, TRUE);
+	}
+
+	/* Ensure that each heap no occurs at least once. */
+	ut_a(i + PAGE_HEAP_NO_USER_LOW == n_heap);
+}
+
+/**********************************************************************//**
+Allocate memory for zlib from the memory heap passed as opaque.
+The request size is items * size, computed in uInt arithmetic.
+@return	the pointer returned by mem_heap_alloc() */
+static
+void*
+page_zip_malloc(
+/*============*/
+	void*	opaque,	/*!< in/out: memory heap */
+	uInt	items,	/*!< in: number of items to allocate */
+	uInt	size)	/*!< in: size of an item in bytes */
+{
+	const uInt	n_bytes = items * size;
+
+	return(mem_heap_alloc(opaque, n_bytes));
+}
+
+/**********************************************************************//**
+Deallocate memory for zlib.
+This function has an empty body; both parameters are unused.
+NOTE(review): presumably the memory handed out by page_zip_malloc()
+is released when the memory heap itself is freed - confirm. */
+static
+void
+page_zip_free(
+/*==========*/
+	void*	opaque __attribute__((unused)),	/*!< in: memory heap */
+	void*	address __attribute__((unused)))/*!< in: object to free */
+{
+}
+
+/**********************************************************************//**
+Configure the zlib allocator to use the given memory heap: installs
+page_zip_malloc() and page_zip_free() as the allocation callbacks of
+the stream and stores the heap as their opaque argument. */
+UNIV_INTERN
+void
+page_zip_set_alloc(
+/*===============*/
+	void*		stream,		/*!< in/out: zlib stream */
+	mem_heap_t*	heap)		/*!< in: memory heap to use */
+{
+	z_stream* const	zs = stream;
+
+	zs->opaque = heap;
+	zs->zalloc = page_zip_malloc;
+	zs->zfree = page_zip_free;
+}
+
+#if 0 || defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+/** Symbol for enabling compression and decompression diagnostics */
+# define PAGE_ZIP_COMPRESS_DBG
+#endif
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+/** Set this variable in a debugger to enable
+excessive logging in page_zip_compress().
+It has no explicit initializer. */
+UNIV_INTERN ibool	page_zip_compress_dbg;
+/** Set this variable in a debugger to enable
+binary logging of the data passed to deflate().
+When this variable is nonzero, it will act
+as a log file name generator.
+It has no explicit initializer. */
+UNIV_INTERN unsigned	page_zip_compress_log;
+
+/**********************************************************************//**
+Wrapper for deflate(). If page_zip_compress_dbg is set, the pending
+input and the deflate() status are printed to stderr. If logfile is
+not NULL, the pending input is also written to it.
+@return	deflate() status: Z_OK, Z_BUF_ERROR, ... */
+static
+int
+page_zip_compress_deflate(
+/*======================*/
+	FILE*		logfile,/*!< in: log file, or NULL */
+	z_streamp	strm,	/*!< in/out: compressed stream for deflate() */
+	int		flush)	/*!< in: deflate() flushing method */
+{
+	int	zerr;
+
+	/* dump the input before deflate() consumes it */
+	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+		ut_print_buf(stderr, strm->next_in, strm->avail_in);
+	}
+
+	if (UNIV_LIKELY_NULL(logfile)) {
+		fwrite(strm->next_in, 1, strm->avail_in, logfile);
+	}
+
+	zerr = deflate(strm, flush);
+
+	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+		fprintf(stderr, " -> %d\n", zerr);
+	}
+
+	return(zerr);
+}
+
+/* Redefine deflate(). From here on, deflate() calls in this file
+expand to page_zip_compress_deflate(). */
+# undef deflate
+/** Debug wrapper for the zlib compression routine deflate().
+Log the operation if page_zip_compress_dbg is set.
+The expansion refers to a variable named logfile, which must be in
+scope wherever the macro is used (see FILE_LOGFILE).
+@param strm	in/out: compressed stream
+@param flush	in: flushing method
+@return		deflate() status: Z_OK, Z_BUF_ERROR, ... */
+# define deflate(strm, flush) page_zip_compress_deflate(logfile, strm, flush)
+/** Declaration of the logfile parameter; includes the trailing comma,
+so it is written without a comma before the next parameter */
+# define FILE_LOGFILE FILE* logfile,
+/** The logfile parameter; includes the trailing comma */
+# define LOGFILE logfile,
+#else /* PAGE_ZIP_COMPRESS_DBG */
+/** Empty declaration of the logfile parameter */
+# define FILE_LOGFILE
+/** Missing logfile parameter */
+# define LOGFILE
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+
+/**********************************************************************//**
+Compress the records of a node pointer page.
+For each record, the bytes between c_stream->next_in and the record's
+REC_N_NEW_EXTRA_BYTES header are deflated, then the record data except
+the trailing REC_NODE_PTR_SIZE bytes. Those trailing bytes are copied
+uncompressed to the storage area, indexed by the heap number of the record.
+The loop body runs before n_dense is tested, so the code as written
+expects n_dense to be at least 1.
+@return	Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_node_ptrs(
+/*========================*/
+	FILE_LOGFILE
+	z_stream*	c_stream,	/*!< in/out: compressed page stream */
+	const rec_t**	recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense,	/*!< in: size of recs[] */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	byte*		storage,	/*!< in: end of dense page directory */
+	mem_heap_t*	heap)		/*!< in: temporary memory heap */
+{
+	int	err	= Z_OK;
+	ulint*	offsets = NULL;
+
+	do {
+		const rec_t*	rec = *recs++;
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Only leaf nodes may contain externally stored columns. */
+		ut_ad(!rec_offs_any_extern(offsets));
+
+		UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+		UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+				   rec_offs_extra_size(offsets));
+
+		/* Compress the extra bytes: everything from the current
+		input position up to the fixed-size record header. */
+		c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+			- c_stream->next_in;
+
+		if (c_stream->avail_in) {
+			err = deflate(c_stream, Z_NO_FLUSH);
+			if (UNIV_UNLIKELY(err != Z_OK)) {
+				break;
+			}
+		}
+		ut_ad(!c_stream->avail_in);
+
+		/* Compress the data bytes, except node_ptr. */
+		c_stream->next_in = (byte*) rec;
+		c_stream->avail_in = rec_offs_data_size(offsets)
+			- REC_NODE_PTR_SIZE;
+		ut_ad(c_stream->avail_in);
+
+		err = deflate(c_stream, Z_NO_FLUSH);
+		if (UNIV_UNLIKELY(err != Z_OK)) {
+			break;
+		}
+
+		ut_ad(!c_stream->avail_in);
+
+		/* Store the node pointer uncompressed and advance the
+		input position past it. */
+		memcpy(storage - REC_NODE_PTR_SIZE
+		       * (rec_get_heap_no_new(rec) - 1),
+		       c_stream->next_in, REC_NODE_PTR_SIZE);
+		c_stream->next_in += REC_NODE_PTR_SIZE;
+	} while (--n_dense);
+
+	return(err);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a secondary index.
+@return	Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_sec(
+/*==================*/
+	FILE_LOGFILE
+	z_stream*	c_stream,	/*!< in/out: compressed page stream */
+	const rec_t**	recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense)	/*!< in: size of recs[]; must be nonzero */
+{
+	int		err	= Z_OK;
+
+	ut_ad(n_dense > 0);
+
+	do {
+		const rec_t*	rec = *recs++;
+
+		/* Compress everything up to the extra bytes of this record. */
+		c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+			- c_stream->next_in;
+
+		if (UNIV_LIKELY(c_stream->avail_in)) {
+			UNIV_MEM_ASSERT_RW(c_stream->next_in,
+					   c_stream->avail_in);
+			err = deflate(c_stream, Z_NO_FLUSH);
+			if (UNIV_UNLIKELY(err != Z_OK)) {
+				break;
+			}
+		}
+
+		ut_ad(!c_stream->avail_in);
+		ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+		/* Skip the REC_N_NEW_EXTRA_BYTES: resume the input at rec. */
+
+		c_stream->next_in = (byte*) rec;
+	} while (--n_dense); /* the data of the last record is left for the caller */
+
+	return(err);
+}
+
+/**********************************************************************//**
+Compress a record of a leaf node of a clustered index that contains
+externally stored columns.
+@return	Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust_ext(
+/*========================*/
+	FILE_LOGFILE
+	z_stream*	c_stream,	/*!< in/out: compressed page stream */
+	const rec_t*	rec,		/*!< in: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+	byte*		deleted,	/*!< in: dense directory entry pointing
+					to the head of the free list */
+	byte*		storage,	/*!< in: end of dense page directory */
+	byte**		externs,	/*!< in/out: pointer to the next
+					available BLOB pointer */
+	ulint*		n_blobs)	/*!< in/out: number of
+					externally stored columns */
+{
+	int	err;
+	ulint	i;
+
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		ulint		len;
+		const byte*	src;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+			/* Store trx_id and roll_ptr
+			in uncompressed form. */
+			src = rec_get_nth_field(rec, offsets, i, &len);
+			ut_ad(src + DATA_TRX_ID_LEN
+			      == rec_get_nth_field(rec, offsets,
+						   i + 1, &len));
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+
+			/* Compress any preceding bytes. */
+			c_stream->avail_in
+				= src - c_stream->next_in;
+
+			if (c_stream->avail_in) {
+				err = deflate(c_stream, Z_NO_FLUSH);
+				if (UNIV_UNLIKELY(err != Z_OK)) {
+
+					return(err);
+				}
+			}
+
+			ut_ad(!c_stream->avail_in);
+			ut_ad(c_stream->next_in == src);
+
+			memcpy(storage
+			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+			       * (rec_get_heap_no_new(rec) - 1),
+			       c_stream->next_in,
+			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+			c_stream->next_in
+				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+			/* Skip also roll_ptr, which was copied together with trx_id */
+			i++;
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			src = rec_get_nth_field(rec, offsets, i, &len);
+			ut_ad(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			src += len - BTR_EXTERN_FIELD_REF_SIZE; /* start of the field reference */
+
+			c_stream->avail_in = src
+				- c_stream->next_in;
+			if (UNIV_LIKELY(c_stream->avail_in)) {
+				err = deflate(c_stream, Z_NO_FLUSH);
+				if (UNIV_UNLIKELY(err != Z_OK)) {
+
+					return(err);
+				}
+			}
+
+			ut_ad(!c_stream->avail_in);
+			ut_ad(c_stream->next_in == src);
+
+			/* Reserve space for the data at
+			the end of the space reserved for
+			the compressed data and the page
+			modification log. */
+
+			if (UNIV_UNLIKELY
+			    (c_stream->avail_out
+			     <= BTR_EXTERN_FIELD_REF_SIZE)) {
+				/* out of space */
+				return(Z_BUF_ERROR);
+			}
+
+			ut_ad(*externs == c_stream->next_out
+			      + c_stream->avail_out
+			      + 1/* end of modif. log */);
+
+			c_stream->next_in
+				+= BTR_EXTERN_FIELD_REF_SIZE;
+
+			/* Skip deleted records; no BLOB pointer is stored for them. */
+			if (UNIV_LIKELY_NULL
+			    (page_zip_dir_find_low(
+				    storage, deleted,
+				    page_offset(rec)))) {
+				continue;
+			}
+
+			(*n_blobs)++;
+			c_stream->avail_out
+				-= BTR_EXTERN_FIELD_REF_SIZE;
+			*externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+			/* Copy the BLOB pointer that next_in was just advanced past */
+			memcpy(*externs, c_stream->next_in
+			       - BTR_EXTERN_FIELD_REF_SIZE,
+			       BTR_EXTERN_FIELD_REF_SIZE);
+		}
+	}
+
+	return(Z_OK);
+}
+
+/**********************************************************************//**
+Compress the records of a leaf node of a clustered index.
+@return	Z_OK, or a zlib error code */
+static
+int
+page_zip_compress_clust(
+/*====================*/
+	FILE_LOGFILE
+	z_stream*	c_stream,	/*!< in/out: compressed page stream */
+	const rec_t**	recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense,	/*!< in: size of recs[]; must be nonzero */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint*		n_blobs,	/*!< in: 0; out: number of
+					externally stored columns */
+	ulint		trx_id_col,	/*!< in: index of the trx_id column */
+	byte*		deleted,	/*!< in: dense directory entry pointing
+					to the head of the free list */
+	byte*		storage,	/*!< in: end of dense page directory */
+	mem_heap_t*	heap)		/*!< in: temporary memory heap */
+{
+	int	err		= Z_OK;
+	ulint*	offsets		= NULL;
+	/* BTR_EXTERN_FIELD_REF storage, below the n_dense trx_id,roll_ptr slots */
+	byte*	externs		= storage - n_dense
+		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+	ut_ad(*n_blobs == 0);
+
+	do {
+		const rec_t*	rec = *recs++;
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		ut_ad(rec_offs_n_fields(offsets)
+		      == dict_index_get_n_fields(index));
+		UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+		UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+				   rec_offs_extra_size(offsets));
+
+		/* Compress the pending input up to REC_N_NEW_EXTRA_BYTES before rec. */
+		c_stream->avail_in = rec - REC_N_NEW_EXTRA_BYTES
+			- c_stream->next_in;
+
+		if (c_stream->avail_in) {
+			err = deflate(c_stream, Z_NO_FLUSH);
+			if (UNIV_UNLIKELY(err != Z_OK)) {
+
+				goto func_exit;
+			}
+		}
+		ut_ad(!c_stream->avail_in);
+		ut_ad(c_stream->next_in == rec - REC_N_NEW_EXTRA_BYTES);
+
+		/* Compress the data bytes. */
+
+		c_stream->next_in = (byte*) rec;
+
+		/* Check if there are any externally stored columns.
+		For each externally stored column, store the
+		BTR_EXTERN_FIELD_REF separately. */
+		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+			ut_ad(dict_index_is_clust(index));
+
+			err = page_zip_compress_clust_ext(
+				LOGFILE
+				c_stream, rec, offsets, trx_id_col,
+				deleted, storage, &externs, n_blobs);
+
+			if (UNIV_UNLIKELY(err != Z_OK)) {
+
+				goto func_exit;
+			}
+		} else {
+			ulint		len;
+			const byte*	src;
+
+			/* Store trx_id and roll_ptr in uncompressed form. */
+			src = rec_get_nth_field(rec, offsets,
+						trx_id_col, &len);
+			ut_ad(src + DATA_TRX_ID_LEN
+			      == rec_get_nth_field(rec, offsets,
+						   trx_id_col + 1, &len));
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+			UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+			UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+					   rec_offs_extra_size(offsets));
+
+			/* Compress any preceding bytes. */
+			c_stream->avail_in = src - c_stream->next_in;
+
+			if (c_stream->avail_in) {
+				err = deflate(c_stream, Z_NO_FLUSH);
+				if (UNIV_UNLIKELY(err != Z_OK)) {
+
+					return(err); /* same effect as goto func_exit */
+				}
+			}
+
+			ut_ad(!c_stream->avail_in);
+			ut_ad(c_stream->next_in == src);
+
+			memcpy(storage
+			       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+			       * (rec_get_heap_no_new(rec) - 1),
+			       c_stream->next_in,
+			       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+			c_stream->next_in
+				+= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+			/* roll_ptr was skipped together with trx_id */
+			ut_ad(trx_id_col + 1 < rec_offs_n_fields(offsets));
+		}
+
+		/* Compress the last bytes of the record. */
+		c_stream->avail_in = rec + rec_offs_data_size(offsets)
+			- c_stream->next_in;
+
+		if (c_stream->avail_in) {
+			err = deflate(c_stream, Z_NO_FLUSH);
+			if (UNIV_UNLIKELY(err != Z_OK)) {
+
+				goto func_exit;
+			}
+		}
+		ut_ad(!c_stream->avail_in);
+	} while (--n_dense);
+
+func_exit:
+	return(err);
+}
+
+/**********************************************************************//**
+Compress a page.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure. */
+UNIV_INTERN
+ibool
+page_zip_compress(
+/*==============*/
+	page_zip_des_t*	page_zip,/*!< in: size; out: data, n_blobs,
+				m_start, m_end, m_nonempty */
+	const page_t*	page,	/*!< in: uncompressed page; FALSE is returned if NULL */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+{
+	z_stream	c_stream;
+	int		err;
+	ulint		n_fields;/* number of index fields needed */
+	byte*		fields;	/*!< index field information */
+	byte*		buf;	/*!< compressed payload of the page */
+	byte*		buf_end;/* end of buf */
+	ulint		n_dense;
+	ulint		slot_size;/* amount of uncompressed bytes per record */
+	const rec_t**	recs;	/*!< dense page directory, sorted by address */
+	mem_heap_t*	heap;
+	ulint		trx_id_col;
+	ulint*		offsets	= NULL;
+	ulint		n_blobs	= 0;
+	byte*		storage;/* storage of uncompressed columns */
+#ifndef UNIV_HOTBACKUP
+	ullint		usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+#ifdef PAGE_ZIP_COMPRESS_DBG
+	FILE*		logfile = NULL;
+#endif
+
+	if (!page) { /* nothing to compress; statistics are not updated */
+		return(FALSE);
+	}
+
+	ut_a(page_is_comp(page));
+	ut_a(fil_page_get_type(page) == FIL_PAGE_INDEX);
+	ut_ad(page_simple_validate_new((page_t*) page));
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(dict_table_is_comp(index->table));
+	ut_ad(!dict_index_is_ibuf(index));
+
+	UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+	/* Check the data that will be omitted. */
+	ut_a(!memcmp(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+		     infimum_extra, sizeof infimum_extra));
+	ut_a(!memcmp(page + PAGE_NEW_INFIMUM,
+		     infimum_data, sizeof infimum_data));
+	ut_a(page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES]
+	     /* info_bits == 0, n_owned <= max */
+	     <= PAGE_DIR_SLOT_MAX_N_OWNED);
+	ut_a(!memcmp(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+		     supremum_extra_data, sizeof supremum_extra_data));
+
+	if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+		ut_a(rec_get_next_offs(page + PAGE_NEW_INFIMUM, TRUE)
+		     == PAGE_NEW_SUPREMUM);
+	}
+
+	if (page_is_leaf(page)) {
+		n_fields = dict_index_get_n_fields(index);
+	} else {
+		n_fields = dict_index_get_n_unique_in_tree(index);
+	}
+
+	/* The dense directory excludes the infimum and supremum records. */
+	n_dense = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+#ifdef PAGE_ZIP_COMPRESS_DBG
+	if (UNIV_UNLIKELY(page_zip_compress_dbg)) {
+		fprintf(stderr, "compress %p %p %lu %lu %lu\n",
+			(void*) page_zip, (void*) page,
+			page_is_leaf(page),
+			n_fields, n_dense);
+	}
+	if (UNIV_UNLIKELY(page_zip_compress_log)) {
+		/* Create a log file for every compression attempt. */
+		char	logfilename[9];
+		ut_snprintf(logfilename, sizeof logfilename,
+			    "%08x", page_zip_compress_log++);
+		logfile = fopen(logfilename, "wb");
+
+		if (logfile) {
+			/* Write the uncompressed page to the log. */
+			fwrite(page, 1, UNIV_PAGE_SIZE, logfile);
+			/* Record the compressed size as zero.
+			This will be overwritten at successful exit. */
+			putc(0, logfile);
+			putc(0, logfile);
+			putc(0, logfile);
+			putc(0, logfile);
+		}
+	}
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+	page_zip_stat[page_zip->ssize - 1].compressed++;
+#endif /* !UNIV_HOTBACKUP */
+
+	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+			  >= page_zip_get_size(page_zip))) {
+
+		goto err_exit; /* heap and c_stream are not set up yet */
+	}
+
+	heap = mem_heap_create(page_zip_get_size(page_zip)
+			       + n_fields * (2 + sizeof *offsets)
+			       + n_dense * ((sizeof *recs)
+					    - PAGE_ZIP_DIR_SLOT_SIZE)
+			       + UNIV_PAGE_SIZE * 4
+			       + (512 << MAX_MEM_LEVEL));
+
+	recs = mem_heap_zalloc(heap, n_dense * sizeof *recs);
+
+	fields = mem_heap_alloc(heap, (n_fields + 1) * 2);
+
+	buf = mem_heap_alloc(heap, page_zip_get_size(page_zip) - PAGE_DATA);
+	buf_end = buf + page_zip_get_size(page_zip) - PAGE_DATA;
+
+	/* Compress the data payload. */
+	page_zip_set_alloc(&c_stream, heap);
+
+	err = deflateInit2(&c_stream, Z_DEFAULT_COMPRESSION,
+			   Z_DEFLATED, UNIV_PAGE_SIZE_SHIFT,
+			   MAX_MEM_LEVEL, Z_DEFAULT_STRATEGY);
+	ut_a(err == Z_OK);
+
+	c_stream.next_out = buf;
+	/* Subtract the space reserved for uncompressed data. */
+	/* Page header and the end marker of the modification log */
+	c_stream.avail_out = buf_end - buf - 1;
+	/* Dense page directory and uncompressed columns, if any */
+	if (page_is_leaf(page)) {
+		if (dict_index_is_clust(index)) {
+			trx_id_col = dict_index_get_sys_col_pos(
+				index, DATA_TRX_ID);
+			ut_ad(trx_id_col > 0);
+			ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+			slot_size = PAGE_ZIP_DIR_SLOT_SIZE
+				+ DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+		} else {
+			/* Signal the absence of trx_id
+			in page_zip_fields_encode() */
+			ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+			      == ULINT_UNDEFINED);
+			trx_id_col = 0;
+			slot_size = PAGE_ZIP_DIR_SLOT_SIZE;
+		}
+	} else {
+		slot_size = PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE;
+		trx_id_col = ULINT_UNDEFINED;
+	}
+
+	if (UNIV_UNLIKELY(c_stream.avail_out <= n_dense * slot_size
+			  + 6/* sizeof(zlib header and footer) */)) {
+		goto zlib_error;
+	}
+
+	c_stream.avail_out -= n_dense * slot_size;
+	c_stream.avail_in = page_zip_fields_encode(n_fields, index,
+						   trx_id_col, fields);
+	c_stream.next_in = fields;
+	if (UNIV_LIKELY(!trx_id_col)) { /* 0 was only a marker for the encoder */
+		trx_id_col = ULINT_UNDEFINED;
+	}
+
+	UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+	err = deflate(&c_stream, Z_FULL_FLUSH);
+	if (err != Z_OK) {
+		goto zlib_error;
+	}
+
+	ut_ad(!c_stream.avail_in);
+
+	page_zip_dir_encode(page, buf_end, recs);
+
+	c_stream.next_in = (byte*) page + PAGE_ZIP_START;
+
+	storage = buf_end - n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+	/* Compress the records in heap_no order. */
+	if (UNIV_UNLIKELY(!n_dense)) { /* no user records: nothing to do here */
+	} else if (!page_is_leaf(page)) {
+		/* This is a node pointer page. */
+		err = page_zip_compress_node_ptrs(LOGFILE
+						  &c_stream, recs, n_dense,
+						  index, storage, heap);
+		if (UNIV_UNLIKELY(err != Z_OK)) {
+			goto zlib_error;
+		}
+	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+		/* This is a leaf page in a secondary index. */
+		err = page_zip_compress_sec(LOGFILE
+					    &c_stream, recs, n_dense);
+		if (UNIV_UNLIKELY(err != Z_OK)) {
+			goto zlib_error;
+		}
+	} else {
+		/* This is a leaf page in a clustered index. */
+		err = page_zip_compress_clust(LOGFILE
+					      &c_stream, recs, n_dense,
+					      index, &n_blobs, trx_id_col,
+					      buf_end - PAGE_ZIP_DIR_SLOT_SIZE
+					      * page_get_n_recs(page),
+					      storage, heap);
+		if (UNIV_UNLIKELY(err != Z_OK)) {
+			goto zlib_error;
+		}
+	}
+
+	/* Finish the compression. */
+	ut_ad(!c_stream.avail_in);
+	/* Compress any trailing garbage, in case the last record was
+	allocated from an originally longer space on the free list,
+	or the data of the last record from page_zip_compress_sec(). */
+	c_stream.avail_in
+		= page_header_get_field(page, PAGE_HEAP_TOP)
+		- (c_stream.next_in - page);
+	ut_a(c_stream.avail_in <= UNIV_PAGE_SIZE - PAGE_ZIP_START - PAGE_DIR);
+
+	UNIV_MEM_ASSERT_RW(c_stream.next_in, c_stream.avail_in);
+	err = deflate(&c_stream, Z_FINISH);
+
+	if (UNIV_UNLIKELY(err != Z_STREAM_END)) {
+zlib_error: /* also entered by goto once c_stream and heap exist */
+		deflateEnd(&c_stream);
+		mem_heap_free(heap);
+err_exit: /* also entered by goto before heap was created */
+#ifdef PAGE_ZIP_COMPRESS_DBG
+		if (logfile) {
+			fclose(logfile);
+		}
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+		page_zip_stat[page_zip->ssize - 1].compressed_usec
+			+= ut_time_us(NULL) - usec;
+#endif /* !UNIV_HOTBACKUP */
+		return(FALSE);
+	}
+
+	err = deflateEnd(&c_stream);
+	ut_a(err == Z_OK);
+
+	ut_ad(buf + c_stream.total_out == c_stream.next_out);
+	ut_ad((ulint) (storage - c_stream.next_out) >= c_stream.avail_out);
+
+	/* Valgrind believes that zlib does not initialize some bits
+	in the last 7 or 8 bytes of the stream.  Make Valgrind happy. */
+	UNIV_MEM_VALID(buf, c_stream.total_out);
+
+	/* Zero out the area reserved for the modification log.
+	Space for the end marker of the modification log is not
+	included in avail_out. */
+	memset(c_stream.next_out, 0, c_stream.avail_out + 1/* end marker */);
+
+#ifdef UNIV_DEBUG
+	page_zip->m_start =
+#endif /* UNIV_DEBUG */
+		page_zip->m_end = PAGE_DATA + c_stream.total_out;
+	page_zip->m_nonempty = FALSE;
+	page_zip->n_blobs = n_blobs;
+	/* Copy those header fields that will not be written
+	in buf_flush_init_for_writing() */
+	memcpy(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+	       FIL_PAGE_LSN - FIL_PAGE_PREV);
+	memcpy(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2);
+	memcpy(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+	       PAGE_DATA - FIL_PAGE_DATA);
+	/* Copy the rest of the compressed page */
+	memcpy(page_zip->data + PAGE_DATA, buf,
+	       page_zip_get_size(page_zip) - PAGE_DATA);
+	mem_heap_free(heap);
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (mtr) {
+#ifndef UNIV_HOTBACKUP
+		page_zip_compress_write_log(page_zip, page, index, mtr);
+#endif /* !UNIV_HOTBACKUP */
+	}
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+#ifdef PAGE_ZIP_COMPRESS_DBG
+	if (logfile) {
+		/* Record the compressed size of the block. */
+		byte sz[4];
+		mach_write_to_4(sz, c_stream.total_out);
+		fseek(logfile, UNIV_PAGE_SIZE, SEEK_SET);
+		fwrite(sz, 1, sizeof sz, logfile);
+		fclose(logfile);
+	}
+#endif /* PAGE_ZIP_COMPRESS_DBG */
+#ifndef UNIV_HOTBACKUP
+	{
+		page_zip_stat_t*	zip_stat
+			= &page_zip_stat[page_zip->ssize - 1];
+		zip_stat->compressed_ok++;
+		zip_stat->compressed_usec += ut_time_us(NULL) - usec;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Compare two page directory entries by address.
+@return	1 if rec1 is at a higher address than rec2, else 0 */
+UNIV_INLINE
+ibool
+page_zip_dir_cmp(
+/*=============*/
+	const rec_t*	rec1,	/*!< in: rec1 */
+	const rec_t*	rec2)	/*!< in: rec2 */
+{
+	return(rec1 > rec2);
+}
+
+/**********************************************************************//**
+Sort arr[low..high) of the dense page directory by address (heap_no). */
+static
+void
+page_zip_dir_sort(
+/*==============*/
+	rec_t**	arr,	/*!< in/out: dense page directory */
+	rec_t**	aux_arr,/*!< in/out: work area */
+	ulint	low,	/*!< in: lower bound of the sorting area, inclusive */
+	ulint	high)	/*!< in: upper bound of the sorting area, exclusive */
+{
+	UT_SORT_FUNCTION_BODY(page_zip_dir_sort, arr, aux_arr, low, high,
+			      page_zip_dir_cmp);
+}
+
+/**********************************************************************//**
+Deallocate the index information initialized by page_zip_fields_decode(). */
+static
+void
+page_zip_fields_free(
+/*=================*/
+	dict_index_t*	index)	/*!< in: dummy index to be freed, or NULL (no-op) */
+{
+	if (index) {
+		dict_table_t*	table = index->table; /* read before index->heap is freed */
+		mem_heap_free(index->heap);
+		mutex_free(&(table->autoinc_mutex));
+		ut_free(table->name);
+		mem_heap_free(table->heap);
+	}
+}
+
+/**********************************************************************//**
+Read the index information for the compressed page.
+@return	own: dummy index describing the page, or NULL on error */
+static
+dict_index_t*
+page_zip_fields_decode(
+/*===================*/
+	const byte*	buf,	/*!< in: index information */
+	const byte*	end,	/*!< in: end of buf */
+	ulint*		trx_id_col)/*!< in: NULL for non-leaf pages;
+				for leaf pages, pointer to where to store
+				the position of the trx_id column */
+{
+	const byte*	b;
+	ulint		n;
+	ulint		i;
+	ulint		val;
+	dict_table_t*	table;
+	dict_index_t*	index;
+
+	/* Count the entries; an entry with bit 0x80 set occupies 2 bytes. */
+	for (b = buf, n = 0; b < end; n++) {
+		if (*b++ & 0x80) {
+			b++; /* skip the second byte */
+		}
+	}
+
+	n--; /* the last entry is n_nullable or trx_id, not a field */
+
+	if (UNIV_UNLIKELY(n > REC_MAX_N_FIELDS)) {
+
+		page_zip_fail(("page_zip_fields_decode: n = %lu\n",
+			       (ulong) n));
+		return(NULL);
+	}
+
+	if (UNIV_UNLIKELY(b > end)) {
+
+		page_zip_fail(("page_zip_fields_decode: %p > %p\n",
+			       (const void*) b, (const void*) end));
+		return(NULL);
+	}
+
+	table = dict_mem_table_create("ZIP_DUMMY", DICT_HDR_SPACE, n,
+				      DICT_TF_COMPACT);
+	index = dict_mem_index_create("ZIP_DUMMY", "ZIP_DUMMY",
+				      DICT_HDR_SPACE, 0, n);
+	index->table = table;
+	index->n_uniq = n;
+	/* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
+	index->cached = TRUE;
+
+	/* Initialize the fields. */
+	for (b = buf, i = 0; i < n; i++) {
+		ulint	mtype;
+		ulint	len;
+
+		val = *b++;
+
+		if (UNIV_UNLIKELY(val & 0x80)) {
+			/* fixed length > 62 bytes */
+			val = (val & 0x7f) << 8 | *b++;
+			len = val >> 1;
+			mtype = DATA_FIXBINARY;
+		} else if (UNIV_UNLIKELY(val >= 126)) {
+			/* variable length with max > 255 bytes */
+			len = 0x7fff;
+			mtype = DATA_BINARY;
+		} else if (val <= 1) {
+			/* variable length with max <= 255 bytes */
+			len = 0;
+			mtype = DATA_BINARY;
+		} else {
+			/* fixed length <= 62 bytes (val is 2..125 here) */
+			len = val >> 1;
+			mtype = DATA_FIXBINARY;
+		}
+
+		dict_mem_table_add_col(table, NULL, NULL, mtype,
+				       val & 1 ? DATA_NOT_NULL : 0, len);
+		dict_index_add_col(index, table,
+				   dict_table_get_nth_col(table, i), 0);
+	}
+
+	val = *b++;
+	if (UNIV_UNLIKELY(val & 0x80)) {
+		val = (val & 0x7f) << 8 | *b++;
+	}
+
+	/* Decode the position of the trx_id column. */
+	if (trx_id_col) {
+		if (!val) {
+			val = ULINT_UNDEFINED;
+		} else if (UNIV_UNLIKELY(val >= n)) {
+			page_zip_fields_free(index);
+			index = NULL;
+		} else {
+			index->type = DICT_CLUSTERED;
+		}
+
+		*trx_id_col = val; /* stored even when index was just freed */
+	} else {
+		/* Decode the number of nullable fields. */
+		if (UNIV_UNLIKELY(index->n_nullable > val)) {
+			page_zip_fields_free(index);
+			index = NULL;
+		} else {
+			index->n_nullable = val;
+		}
+	}
+
+	ut_ad(b == end);
+
+	return(index);
+}
+
+/**********************************************************************//**
+Populate the sparse page directory from the dense directory.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_dir_decode(
+/*================*/
+	const page_zip_des_t*	page_zip,/*!< in: dense page directory on
+					compressed page */
+	page_t*			page,	/*!< in: compact page with valid header;
+					out: trailer and sparse page directory
+					filled in */
+	rec_t**			recs,	/*!< out: dense page directory sorted by
+					ascending address (and heap_no) */
+	rec_t**			recs_aux,/*!< in/out: scratch area */
+	ulint			n_dense)/*!< in: number of user records, and
+					size of recs[] and recs_aux[] */
+{
+	ulint	i;
+	ulint	n_recs;
+	byte*	slot;
+
+	n_recs = page_get_n_recs(page);
+
+	if (UNIV_UNLIKELY(n_recs > n_dense)) {
+		page_zip_fail(("page_zip_dir_decode 1: %lu > %lu\n",
+			       (ulong) n_recs, (ulong) n_dense));
+		return(FALSE);
+	}
+
+	/* Traverse the list of stored records in the sorting order,
+	starting from the first user record. */
+
+	slot = page + (UNIV_PAGE_SIZE - PAGE_DIR - PAGE_DIR_SLOT_SIZE);
+	UNIV_PREFETCH_RW(slot);
+
+	/* Zero out the page trailer. */
+	memset(slot + PAGE_DIR_SLOT_SIZE, 0, PAGE_DIR);
+
+	mach_write_to_2(slot, PAGE_NEW_INFIMUM); /* the first slot refers to the infimum */
+	slot -= PAGE_DIR_SLOT_SIZE;
+	UNIV_PREFETCH_RW(slot);
+
+	/* Initialize the sparse directory and copy the dense directory. */
+	for (i = 0; i < n_recs; i++) {
+		ulint	offs = page_zip_dir_get(page_zip, i);
+
+		if (offs & PAGE_ZIP_DIR_SLOT_OWNED) { /* this record gets a sparse slot */
+			mach_write_to_2(slot, offs & PAGE_ZIP_DIR_SLOT_MASK);
+			slot -= PAGE_DIR_SLOT_SIZE;
+			UNIV_PREFETCH_RW(slot);
+		}
+
+		if (UNIV_UNLIKELY((offs & PAGE_ZIP_DIR_SLOT_MASK)
+				  < PAGE_ZIP_START + REC_N_NEW_EXTRA_BYTES)) {
+			page_zip_fail(("page_zip_dir_decode 2: %u %u %lx\n",
+				       (unsigned) i, (unsigned) n_recs,
+				       (ulong) offs));
+			return(FALSE);
+		}
+
+		recs[i] = page + (offs & PAGE_ZIP_DIR_SLOT_MASK);
+	}
+
+	mach_write_to_2(slot, PAGE_NEW_SUPREMUM); /* the last slot refers to the supremum */
+	{
+		const page_dir_slot_t*	last_slot = page_dir_get_nth_slot(
+			page, page_dir_get_n_slots(page) - 1);
+
+		if (UNIV_UNLIKELY(slot != last_slot)) {
+			page_zip_fail(("page_zip_dir_decode 3: %p != %p\n",
+				       (const void*) slot,
+				       (const void*) last_slot));
+			return(FALSE);
+		}
+	}
+
+	/* Copy the rest of the dense directory; these entries must carry no flag bits. */
+	for (; i < n_dense; i++) {
+		ulint	offs = page_zip_dir_get(page_zip, i);
+
+		if (UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+			page_zip_fail(("page_zip_dir_decode 4: %u %u %lx\n",
+				       (unsigned) i, (unsigned) n_dense,
+				       (ulong) offs));
+			return(FALSE);
+		}
+
+		recs[i] = page + offs;
+	}
+
+	if (UNIV_LIKELY(n_dense > 1)) { /* sorting is skipped for 0 or 1 entries */
+		page_zip_dir_sort(recs, recs_aux, 0, n_dense);
+	}
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Initialize the REC_N_NEW_EXTRA_BYTES of each record.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_set_extra_bytes(
+/*=====================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	page_t*			page,	/*!< in/out: uncompressed page */
+	ulint			info_bits)/*!< in: REC_INFO_MIN_REC_FLAG or 0 */
+{
+	ulint	n;
+	ulint	i;
+	ulint	n_owned = 1;
+	ulint	offs;
+	rec_t*	rec;
+
+	n = page_get_n_recs(page);
+	rec = page + PAGE_NEW_INFIMUM;
+
+	for (i = 0; i < n; i++) {
+		offs = page_zip_dir_get(page_zip, i);
+
+		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_DEL)) {
+			info_bits |= REC_INFO_DELETED_FLAG;
+		}
+		if (UNIV_UNLIKELY(offs & PAGE_ZIP_DIR_SLOT_OWNED)) {
+			info_bits |= n_owned;
+			n_owned = 1;
+		} else {
+			n_owned++;
+		}
+		offs &= PAGE_ZIP_DIR_SLOT_MASK;
+		if (UNIV_UNLIKELY(offs < PAGE_ZIP_START
+				  + REC_N_NEW_EXTRA_BYTES)) {
+			page_zip_fail(("page_zip_set_extra_bytes 1:"
+				       " %u %u %lx\n",
+				       (unsigned) i, (unsigned) n,
+				       (ulong) offs));
+			return(FALSE);
+		}
+
+		rec_set_next_offs_new(rec, offs); /* link the previous record to this one */
+		rec = page + offs;
+		rec[-REC_N_NEW_EXTRA_BYTES] = (byte) info_bits;
+		info_bits = 0; /* the caller's info_bits go to the first record only */
+	}
+
+	/* Set the next pointer of the last user record. */
+	rec_set_next_offs_new(rec, PAGE_NEW_SUPREMUM);
+
+	/* Set n_owned of the supremum record. */
+	page[PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES] = (byte) n_owned;
+
+	/* The dense directory excludes the infimum and supremum records. */
+	n = page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW;
+
+	if (i >= n) { /* no dense directory entries remain */
+		if (UNIV_LIKELY(i == n)) {
+			return(TRUE);
+		}
+
+		page_zip_fail(("page_zip_set_extra_bytes 2: %u != %u\n",
+			       (unsigned) i, (unsigned) n));
+		return(FALSE);
+	}
+
+	offs = page_zip_dir_get(page_zip, i);
+
+	/* Set the extra bytes of deleted records on the free list. */
+	for (;;) {
+		if (UNIV_UNLIKELY(!offs)
+		    || UNIV_UNLIKELY(offs & ~PAGE_ZIP_DIR_SLOT_MASK)) {
+
+			page_zip_fail(("page_zip_set_extra_bytes 3: %lx\n",
+				       (ulong) offs));
+			return(FALSE);
+		}
+
+		rec = page + offs;
+		rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+		if (++i == n) {
+			break;
+		}
+
+		offs = page_zip_dir_get(page_zip, i);
+		rec_set_next_offs_new(rec, offs);
+	}
+
+	/* Terminate the free list with a zero next-record offset. */
+	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+	rec_set_next_offs_new(rec, 0);
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Apply the modification log to a record containing externally stored
+columns.  Do not copy the fields that are stored separately.
+@return	pointer past the consumed modification log bytes, or NULL on failure */
+static
+const byte*
+page_zip_apply_log_ext(
+/*===================*/
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+	const byte*	data,		/*!< in: modification log */
+	const byte*	end)		/*!< in: end of modification log */
+{
+	ulint	i;
+	ulint	len;
+	byte*	next_out = rec; /* next byte of rec to be filled from the log */
+
+	/* Check if there are any externally stored columns.
+	For each externally stored column, skip the
+	BTR_EXTERN_FIELD_REF. */
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		byte*	dst;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			/* Skip trx_id and roll_ptr */
+			dst = rec_get_nth_field(rec, offsets,
+						i, &len);
+			if (UNIV_UNLIKELY(dst - next_out >= end - data)
+			    || UNIV_UNLIKELY
+			    (len < (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN))
+			    || rec_offs_nth_extern(offsets, i)) {
+				page_zip_fail(("page_zip_apply_log_ext:"
+					       " trx_id len %lu,"
+					       " %p - %p >= %p - %p\n",
+					       (ulong) len,
+					       (const void*) dst,
+					       (const void*) next_out,
+					       (const void*) end,
+					       (const void*) data));
+				return(NULL);
+			}
+
+			memcpy(next_out, data, dst - next_out); /* the bytes before trx_id */
+			data += dst - next_out;
+			next_out = dst + (DATA_TRX_ID_LEN
+					  + DATA_ROLL_PTR_LEN);
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			dst = rec_get_nth_field(rec, offsets,
+						i, &len);
+			ut_ad(len
+			      >= BTR_EXTERN_FIELD_REF_SIZE);
+
+			len += dst - next_out
+				- BTR_EXTERN_FIELD_REF_SIZE; /* bytes up to the field reference */
+
+			if (UNIV_UNLIKELY(data + len >= end)) {
+				page_zip_fail(("page_zip_apply_log_ext: "
+					       "ext %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) len,
+					       (const void*) end));
+				return(NULL);
+			}
+
+			memcpy(next_out, data, len);
+			data += len;
+			next_out += len
+				+ BTR_EXTERN_FIELD_REF_SIZE;
+		}
+	}
+
+	/* Copy the last bytes of the record. */
+	len = rec_get_end(rec, offsets) - next_out;
+	if (UNIV_UNLIKELY(data + len >= end)) {
+		page_zip_fail(("page_zip_apply_log_ext: "
+			       "last %p+%lu >= %p\n",
+			       (const void*) data,
+			       (ulong) len,
+			       (const void*) end));
+		return(NULL);
+	}
+	memcpy(next_out, data, len);
+	data += len;
+
+	return(data);
+}
+
+/**********************************************************************//**
+Apply the modification log to an uncompressed page.
+Do not copy the fields that are stored separately.
+
+Each log entry starts with a 1- or 2-byte value val (2 bytes when the
+most significant bit of the first byte is set).  val >> 1 is
+heap_no - 1 of the affected record, and val & 1 requests that the data
+bytes of that record be zero-filled instead of being copied from the
+log.  A zero byte in place of val terminates the log.  An entry whose
+val >> 1 is 0 or exceeds n_dense does not refer to any element of
+recs[]; such a log is rejected.
+@return	pointer to end of modification log, or NULL on failure */
+static
+const byte*
+page_zip_apply_log(
+/*===============*/
+	const byte*	data,	/*!< in: modification log */
+	ulint		size,	/*!< in: maximum length of the log, in bytes */
+	rec_t**		recs,	/*!< in: dense page directory,
+				sorted by address (indexed by
+				heap_no - PAGE_HEAP_NO_USER_LOW) */
+	ulint		n_dense,/*!< in: size of recs[] */
+	ulint		trx_id_col,/*!< in: column number of trx_id in the index,
+				or ULINT_UNDEFINED if none */
+	ulint		heap_status,
+				/*!< in: heap_no and status bits for
+				the next record to uncompress */
+	dict_index_t*	index,	/*!< in: index of the page */
+	ulint*		offsets)/*!< in/out: work area for
+				rec_get_offsets_reverse() */
+{
+	const byte* const end = data + size;
+
+	for (;;) {
+		ulint	val;
+		rec_t*	rec;
+		ulint	len;
+		ulint	hs;
+
+		/* Decode the entry header. */
+		val = *data++;
+		if (UNIV_UNLIKELY(!val)) {
+			/* End marker: return its address. */
+			return(data - 1);
+		}
+		if (val & 0x80) {
+			/* Two-byte encoding of a 15-bit value. */
+			val = (val & 0x7f) << 8 | *data++;
+			if (UNIV_UNLIKELY(!val)) {
+				page_zip_fail(("page_zip_apply_log:"
+					       " invalid val %x%x\n",
+					       data[-2], data[-1]));
+				return(NULL);
+			}
+		}
+		if (UNIV_UNLIKELY(data >= end)) {
+			page_zip_fail(("page_zip_apply_log: %p >= %p\n",
+				       (const void*) data,
+				       (const void*) end));
+			return(NULL);
+		}
+		if (UNIV_UNLIKELY(!(val >> 1))) {
+			/* val == 1: (val >> 1) - 1 would wrap around and
+			index far outside recs[], and the heap_no computed
+			below would be smaller than that of any element
+			of recs[].  Reject the corrupted log instead of
+			writing through a garbage pointer. */
+			page_zip_fail(("page_zip_apply_log:"
+				       " invalid heap_no in val %lu\n",
+				       (ulong) val));
+			return(NULL);
+		}
+		if (UNIV_UNLIKELY((val >> 1) > n_dense)) {
+			page_zip_fail(("page_zip_apply_log: %lu>>1 > %lu\n",
+				       (ulong) val, (ulong) n_dense));
+			return(NULL);
+		}
+
+		/* Determine the heap number and status bits of the record. */
+		rec = recs[(val >> 1) - 1];
+
+		hs = ((val >> 1) + 1) << REC_HEAP_NO_SHIFT;
+		hs |= heap_status & ((1 << REC_HEAP_NO_SHIFT) - 1);
+
+		/* This may either be an old record that is being
+		overwritten (updated in place, or allocated from
+		the free list), or a new record, with the next
+		available_heap_no. */
+		if (UNIV_UNLIKELY(hs > heap_status)) {
+			page_zip_fail(("page_zip_apply_log: %lu > %lu\n",
+				       (ulong) hs, (ulong) heap_status));
+			return(NULL);
+		} else if (hs == heap_status) {
+			/* A new record was allocated from the heap. */
+			if (UNIV_UNLIKELY(val & 1)) {
+				/* Only existing records may be cleared. */
+				page_zip_fail(("page_zip_apply_log:"
+					       " attempting to create"
+					       " deleted rec %lu\n",
+					       (ulong) hs));
+				return(NULL);
+			}
+			heap_status += 1 << REC_HEAP_NO_SHIFT;
+		}
+
+		mach_write_to_2(rec - REC_NEW_HEAP_NO, hs);
+
+		if (val & 1) {
+			/* Clear the data bytes of the record.
+			The entry carries no payload in this case. */
+			mem_heap_t*	heap	= NULL;
+			ulint*		offs;
+			offs = rec_get_offsets(rec, index, offsets,
+					       ULINT_UNDEFINED, &heap);
+			memset(rec, 0, rec_offs_data_size(offs));
+
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+			continue;
+		}
+
+		/* REC_STATUS_NODE_PTR is passed below as a boolean
+		flag, which only works if it equals TRUE. */
+#if REC_STATUS_NODE_PTR != TRUE
+# error "REC_STATUS_NODE_PTR != TRUE"
+#endif
+		rec_get_offsets_reverse(data, index,
+					hs & REC_STATUS_NODE_PTR,
+					offsets);
+		rec_offs_make_valid(rec, index, offsets);
+
+		/* Copy the extra bytes (backwards). */
+		{
+			byte*	start	= rec_get_start(rec, offsets);
+			byte*	b	= rec - REC_N_NEW_EXTRA_BYTES;
+			while (b != start) {
+				*--b = *data++;
+			}
+		}
+
+		/* Copy the data bytes. */
+		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+			/* Non-leaf nodes should not contain any
+			externally stored columns. */
+			if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+				page_zip_fail(("page_zip_apply_log: "
+					       "%lu&REC_STATUS_NODE_PTR\n",
+					       (ulong) hs));
+				return(NULL);
+			}
+
+			data = page_zip_apply_log_ext(
+				rec, offsets, trx_id_col, data, end);
+
+			if (UNIV_UNLIKELY(!data)) {
+				return(NULL);
+			}
+		} else if (UNIV_UNLIKELY(hs & REC_STATUS_NODE_PTR)) {
+			len = rec_offs_data_size(offsets)
+				- REC_NODE_PTR_SIZE;
+			/* Copy the data bytes, except node_ptr. */
+			if (UNIV_UNLIKELY(data + len >= end)) {
+				page_zip_fail(("page_zip_apply_log: "
+					       "node_ptr %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) len,
+					       (const void*) end));
+				return(NULL);
+			}
+			memcpy(rec, data, len);
+			data += len;
+		} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+			len = rec_offs_data_size(offsets);
+
+			/* Copy all data bytes of
+			a record in a secondary index. */
+			if (UNIV_UNLIKELY(data + len >= end)) {
+				page_zip_fail(("page_zip_apply_log: "
+					       "sec %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) len,
+					       (const void*) end));
+				return(NULL);
+			}
+
+			memcpy(rec, data, len);
+			data += len;
+		} else {
+			/* Skip DB_TRX_ID and DB_ROLL_PTR. */
+			ulint	l = rec_get_nth_field_offs(offsets,
+							   trx_id_col, &len);
+			byte*	b;
+
+			if (UNIV_UNLIKELY(data + l >= end)
+			    || UNIV_UNLIKELY(len < (DATA_TRX_ID_LEN
+						    + DATA_ROLL_PTR_LEN))) {
+				page_zip_fail(("page_zip_apply_log: "
+					       "trx_id %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) l,
+					       (const void*) end));
+				return(NULL);
+			}
+
+			/* Copy any preceding data bytes. */
+			memcpy(rec, data, l);
+			data += l;
+
+			/* Copy any bytes following DB_TRX_ID, DB_ROLL_PTR. */
+			b = rec + l + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+			len = rec_get_end(rec, offsets) - b;
+			if (UNIV_UNLIKELY(data + len >= end)) {
+				page_zip_fail(("page_zip_apply_log: "
+					       "clust %p+%lu >= %p\n",
+					       (const void*) data,
+					       (ulong) len,
+					       (const void*) end));
+				return(NULL);
+			}
+			memcpy(b, data, len);
+			data += len;
+		}
+	}
+}
+
+/**********************************************************************//**
+Decompress the records of a node pointer page.
+For every element of recs[], the stream is inflated up to the record
+header, the heap_no and status bits are written, and the data bytes
+except the trailing node pointer are inflated.  After that, the
+modification log is applied and the node pointers are copied from the
+trailer of the compressed page.  inflateEnd(d_stream) is called before
+returning, both on success and on failure.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_node_ptrs(
+/*==========================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t**		recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense,	/*!< in: size of recs[] */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint*		offsets,	/*!< in/out: temporary offsets */
+	mem_heap_t*	heap)		/*!< in: temporary memory heap */
+{
+	ulint		heap_status = REC_STATUS_NODE_PTR
+		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+	ulint		slot;
+	const byte*	storage;
+
+	/* Subtract the space reserved for uncompressed data:
+	one dense directory slot and one node pointer per record.
+	NOTE(review): nothing in this function checks that the
+	subtraction cannot wrap d_stream->avail_in around; presumably
+	the caller bounds n_dense sufficiently -- confirm. */
+	d_stream->avail_in -= n_dense
+		* (PAGE_ZIP_DIR_SLOT_SIZE + REC_NODE_PTR_SIZE);
+
+	/* Decompress the records in heap_no order. */
+	for (slot = 0; slot < n_dense; slot++) {
+		rec_t*	rec = recs[slot];
+
+		d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+			- d_stream->next_out;
+
+		ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+		      - PAGE_ZIP_START - PAGE_DIR);
+		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+		case Z_STREAM_END:
+			/* Apparently, n_dense has grown
+			since the time the page was last compressed. */
+			goto zlib_done;
+		case Z_OK:
+		case Z_BUF_ERROR:
+			if (!d_stream->avail_out) {
+				break;
+			}
+			/* fall through: the requested number of bytes
+			was not produced */
+		default:
+			page_zip_fail(("page_zip_decompress_node_ptrs:"
+				       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+				       d_stream->msg));
+			goto zlib_error;
+		}
+
+		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+		/* Prepare to decompress the data bytes. */
+		d_stream->next_out = rec;
+		/* Set heap_no and the status bits. */
+		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+		heap_status += 1 << REC_HEAP_NO_SHIFT;
+
+		/* Read the offsets. The status bits are needed here. */
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		/* Non-leaf nodes should not have any externally
+		stored columns. */
+		ut_ad(!rec_offs_any_extern(offsets));
+
+		/* Decompress the data bytes, except node_ptr. */
+		d_stream->avail_out = rec_offs_data_size(offsets)
+			- REC_NODE_PTR_SIZE;
+
+		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+		case Z_STREAM_END:
+			goto zlib_done;
+		case Z_OK:
+		case Z_BUF_ERROR:
+			if (!d_stream->avail_out) {
+				break;
+			}
+			/* fall through: the requested number of bytes
+			was not produced */
+		default:
+			page_zip_fail(("page_zip_decompress_node_ptrs:"
+				       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+				       d_stream->msg));
+			goto zlib_error;
+		}
+
+		/* Clear the node pointer in case the record
+		will be deleted and the space will be reallocated
+		to a smaller record. */
+		memset(d_stream->next_out, 0, REC_NODE_PTR_SIZE);
+		d_stream->next_out += REC_NODE_PTR_SIZE;
+
+		ut_ad(d_stream->next_out == rec_get_end(rec, offsets));
+	}
+
+	/* Decompress any trailing garbage, in case the last record was
+	allocated from an originally longer space on the free list.
+	The output is limited to the bytes between the current output
+	position and PAGE_HEAP_TOP as stored in the compressed page
+	header; an implausibly large difference is treated as an error. */
+	d_stream->avail_out = page_header_get_field(page_zip->data,
+						    PAGE_HEAP_TOP)
+		- page_offset(d_stream->next_out);
+	if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+			  - PAGE_ZIP_START - PAGE_DIR)) {
+
+		page_zip_fail(("page_zip_decompress_node_ptrs:"
+			       " avail_out = %u\n",
+			       d_stream->avail_out));
+		goto zlib_error;
+	}
+
+	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+		page_zip_fail(("page_zip_decompress_node_ptrs:"
+			       " inflate(Z_FINISH)=%s\n",
+			       d_stream->msg));
+zlib_error:
+		inflateEnd(d_stream);
+		return(FALSE);
+	}
+
+	/* Note that d_stream->avail_out > 0 may hold here
+	if the modification log is nonempty. */
+
+zlib_done:
+	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+		ut_error;
+	}
+
+	{
+		page_t*	page = page_align(d_stream->next_out);
+
+		/* Clear the unused heap space on the uncompressed page:
+		everything from the current output position up to the
+		last slot of the page directory. */
+		memset(d_stream->next_out, 0,
+		       page_dir_get_nth_slot(page,
+					     page_dir_get_n_slots(page) - 1)
+		       - d_stream->next_out);
+	}
+
+#ifdef UNIV_DEBUG
+	page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+	/* Apply the modification log.  The log starts where the
+	compressed stream ended (d_stream->next_in).  The size passed
+	is avail_in + 1, because page_zip_decompress() excluded the
+	end marker byte of the log when it initialized avail_in.
+	There is no trx_id column on node pointer pages, hence
+	ULINT_UNDEFINED. */
+	{
+		const byte*	mod_log_ptr;
+		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+						 d_stream->avail_in + 1,
+						 recs, n_dense,
+						 ULINT_UNDEFINED, heap_status,
+						 index, offsets);
+
+		if (UNIV_UNLIKELY(!mod_log_ptr)) {
+			return(FALSE);
+		}
+		page_zip->m_end = mod_log_ptr - page_zip->data;
+		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+	}
+
+	if (UNIV_UNLIKELY
+	    (page_zip_get_trailer_len(page_zip,
+				      dict_index_is_clust(index), NULL)
+	     + page_zip->m_end >= page_zip_get_size(page_zip))) {
+		page_zip_fail(("page_zip_decompress_node_ptrs:"
+			       " %lu + %lu >= %lu, %lu\n",
+			       (ulong) page_zip_get_trailer_len(
+				       page_zip, dict_index_is_clust(index),
+				       NULL),
+			       (ulong) page_zip->m_end,
+			       (ulong) page_zip_get_size(page_zip),
+			       (ulong) dict_index_is_clust(index)));
+		return(FALSE);
+	}
+
+	/* Restore the uncompressed columns in heap_no order.
+	The node pointers are read downwards, starting right below
+	the n_dense * PAGE_ZIP_DIR_SLOT_SIZE bytes at the end of the
+	compressed page: the pointer of recs[0] is at the highest
+	address. */
+	storage	= page_zip->data + page_zip_get_size(page_zip)
+		- n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+	for (slot = 0; slot < n_dense; slot++) {
+		rec_t*		rec	= recs[slot];
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+		/* Non-leaf nodes should not have any externally
+		stored columns. */
+		ut_ad(!rec_offs_any_extern(offsets));
+		storage -= REC_NODE_PTR_SIZE;
+
+		memcpy(rec_get_end(rec, offsets) - REC_NODE_PTR_SIZE,
+		       storage, REC_NODE_PTR_SIZE);
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a secondary index.
+The stream is inflated in pieces that end at each record header;
+the heap_no and status bits of each record are written here instead
+of being taken from the stream.  After that, the modification log is
+applied.  inflateEnd(d_stream) is called before returning, both on
+success and on failure.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_sec(
+/*====================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t**		recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense,	/*!< in: size of recs[] */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint*		offsets)	/*!< in/out: temporary offsets */
+{
+	ulint	heap_status	= REC_STATUS_ORDINARY
+		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+	ulint	slot;
+
+	ut_a(!dict_index_is_clust(index));
+
+	/* Subtract the space reserved for uncompressed data:
+	one dense directory slot per record. */
+	d_stream->avail_in -= n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+	for (slot = 0; slot < n_dense; slot++) {
+		rec_t*	rec = recs[slot];
+
+		/* Decompress everything up to this record.
+		This includes the data bytes of the preceding record;
+		only the REC_N_NEW_EXTRA_BYTES in front of each record
+		are skipped in the output. */
+		d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+			- d_stream->next_out;
+
+		if (UNIV_LIKELY(d_stream->avail_out)) {
+			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+			case Z_STREAM_END:
+				/* Apparently, n_dense has grown
+				since the time the page was last compressed. */
+				goto zlib_done;
+			case Z_OK:
+			case Z_BUF_ERROR:
+				if (!d_stream->avail_out) {
+					break;
+				}
+				/* fall through: the requested number of
+				bytes was not produced */
+			default:
+				page_zip_fail(("page_zip_decompress_sec:"
+					       " inflate(Z_SYNC_FLUSH)=%s\n",
+					       d_stream->msg));
+				goto zlib_error;
+			}
+		}
+
+		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+
+		/* Skip the REC_N_NEW_EXTRA_BYTES. */
+
+		d_stream->next_out = rec;
+
+		/* Set heap_no and the status bits. */
+		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+		heap_status += 1 << REC_HEAP_NO_SHIFT;
+	}
+
+	/* Decompress the data of the last record and any trailing garbage,
+	in case the last record was allocated from an originally longer space
+	on the free list.  The output is limited to the bytes between the
+	current output position and PAGE_HEAP_TOP as stored in the
+	compressed page header; an implausibly large difference is
+	treated as an error. */
+	d_stream->avail_out = page_header_get_field(page_zip->data,
+						    PAGE_HEAP_TOP)
+		- page_offset(d_stream->next_out);
+	if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+			  - PAGE_ZIP_START - PAGE_DIR)) {
+
+		page_zip_fail(("page_zip_decompress_sec:"
+			       " avail_out = %u\n",
+			       d_stream->avail_out));
+		goto zlib_error;
+	}
+
+	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+		page_zip_fail(("page_zip_decompress_sec:"
+			       " inflate(Z_FINISH)=%s\n",
+			       d_stream->msg));
+zlib_error:
+		inflateEnd(d_stream);
+		return(FALSE);
+	}
+
+	/* Note that d_stream->avail_out > 0 may hold here
+	if the modification log is nonempty. */
+
+zlib_done:
+	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+		ut_error;
+	}
+
+	{
+		page_t*	page = page_align(d_stream->next_out);
+
+		/* Clear the unused heap space on the uncompressed page:
+		everything from the current output position up to the
+		last slot of the page directory. */
+		memset(d_stream->next_out, 0,
+		       page_dir_get_nth_slot(page,
+					     page_dir_get_n_slots(page) - 1)
+		       - d_stream->next_out);
+	}
+
+#ifdef UNIV_DEBUG
+	page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+	/* Apply the modification log.  The log starts where the
+	compressed stream ended (d_stream->next_in).  The size passed
+	is avail_in + 1, because page_zip_decompress() excluded the
+	end marker byte of the log when it initialized avail_in.
+	ULINT_UNDEFINED is passed as trx_id_col, so that all data
+	bytes of a record are copied from the log. */
+	{
+		const byte*	mod_log_ptr;
+		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+						 d_stream->avail_in + 1,
+						 recs, n_dense,
+						 ULINT_UNDEFINED, heap_status,
+						 index, offsets);
+
+		if (UNIV_UNLIKELY(!mod_log_ptr)) {
+			return(FALSE);
+		}
+		page_zip->m_end = mod_log_ptr - page_zip->data;
+		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+	}
+
+	if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, FALSE, NULL)
+			  + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+		page_zip_fail(("page_zip_decompress_sec: %lu + %lu >= %lu\n",
+			       (ulong) page_zip_get_trailer_len(
+				       page_zip, FALSE, NULL),
+			       (ulong) page_zip->m_end,
+			       (ulong) page_zip_get_size(page_zip)));
+		return(FALSE);
+	}
+
+	/* There are no uncompressed columns on leaf pages of
+	secondary indexes. */
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Inflate the leading part of a clustered index leaf page record that
+has externally stored columns.  The stream is inflated up to each field
+whose bytes are not taken from the compressed stream (DB_TRX_ID
+together with DB_ROLL_PTR, and the BTR_EXTERN_FIELD_REF at the end of
+every externally stored column); those bytes are zero-filled and
+skipped in the output.  Whatever follows the last such field is not
+inflated here.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust_ext(
+/*==========================*/
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t*		rec,		/*!< in/out: record */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec) */
+	ulint		trx_id_col)	/*!< in: position of DB_TRX_ID */
+{
+	const ulint	sys_len	= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+	ulint		col;
+
+	for (col = 0; col < rec_offs_n_fields(offsets); col++) {
+		ulint	field_len;
+		byte*	stop;
+		int	zerr;
+
+		if (UNIV_UNLIKELY(col == trx_id_col)) {
+			/* DB_TRX_ID and DB_ROLL_PTR: inflate up to the
+			start of the field, then skip the system columns. */
+			stop = rec_get_nth_field(rec, offsets,
+						 col, &field_len);
+
+			if (UNIV_UNLIKELY(field_len < sys_len)) {
+
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " len[%lu] = %lu\n",
+					       (ulong) col,
+					       (ulong) field_len));
+				return(FALSE);
+			}
+
+			if (rec_offs_nth_extern(offsets, col)) {
+
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " DB_TRX_ID at %lu is ext\n",
+					       (ulong) col));
+				return(FALSE);
+			}
+
+			d_stream->avail_out = stop - d_stream->next_out;
+			zerr = inflate(d_stream, Z_SYNC_FLUSH);
+
+			/* Any of these three results is fine, provided
+			that all requested bytes were produced. */
+			if ((zerr != Z_STREAM_END
+			     && zerr != Z_OK
+			     && zerr != Z_BUF_ERROR)
+			    || d_stream->avail_out) {
+
+				page_zip_fail(("page_zip_decompress_clust_ext:"
+					       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+					       d_stream->msg));
+				return(FALSE);
+			}
+
+			ut_ad(d_stream->next_out == stop);
+
+			/* Zero-fill DB_TRX_ID and DB_ROLL_PTR, so that
+			no uninitialized bytes remain if
+			page_zip_apply_log() touches the record. */
+			memset(stop, 0, sys_len);
+			d_stream->next_out += sys_len;
+			continue;
+		}
+
+		if (!rec_offs_nth_extern(offsets, col)) {
+			/* Ordinary field: it is inflated together
+			with the bytes preceding the next stop. */
+			continue;
+		}
+
+		/* Externally stored column: inflate up to the
+		BLOB pointer at the end of the field. */
+		stop = rec_get_nth_field(rec, offsets, col, &field_len);
+		ut_ad(field_len >= BTR_EXTERN_FIELD_REF_SIZE);
+		stop += field_len - BTR_EXTERN_FIELD_REF_SIZE;
+
+		d_stream->avail_out = stop - d_stream->next_out;
+		zerr = inflate(d_stream, Z_SYNC_FLUSH);
+
+		if ((zerr != Z_STREAM_END
+		     && zerr != Z_OK
+		     && zerr != Z_BUF_ERROR)
+		    || d_stream->avail_out) {
+
+			page_zip_fail(("page_zip_decompress_clust_ext:"
+				       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+				       d_stream->msg));
+			return(FALSE);
+		}
+
+		ut_ad(d_stream->next_out == stop);
+
+		/* Zero-fill the BLOB pointer for the case that the
+		record will be deleted and its space not reused.
+		The BLOB pointers can get their final values (copied
+		from "externs", or cleared) only once the page
+		modification log has been applied; otherwise a record
+		that is deleted, reallocated and deleted again could
+		end up with an uninitialized BLOB pointer. */
+		memset(d_stream->next_out, 0, BTR_EXTERN_FIELD_REF_SIZE);
+		d_stream->next_out += BTR_EXTERN_FIELD_REF_SIZE;
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress the records of a leaf node of a clustered index.
+For every element of recs[], the stream is inflated up to the record
+header, the heap_no and status bits are written, and the data bytes
+are inflated, except DB_TRX_ID, DB_ROLL_PTR and the BLOB pointers of
+externally stored columns, which are zero-filled at first.  After
+that, the modification log is applied, and the skipped bytes are
+copied from the trailer of the compressed page.  inflateEnd(d_stream)
+is called before returning, both on success and on failure.
+@return	TRUE on success, FALSE on failure */
+static
+ibool
+page_zip_decompress_clust(
+/*======================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	z_stream*	d_stream,	/*!< in/out: compressed page stream */
+	rec_t**		recs,		/*!< in: dense page directory
+					sorted by address */
+	ulint		n_dense,	/*!< in: size of recs[] */
+	dict_index_t*	index,		/*!< in: the index of the page */
+	ulint		trx_id_col,	/*!< index of the trx_id column */
+	ulint*		offsets,	/*!< in/out: temporary offsets */
+	mem_heap_t*	heap)		/*!< in: temporary memory heap */
+{
+	int		err;
+	ulint		slot;
+	ulint		heap_status	= REC_STATUS_ORDINARY
+		| PAGE_HEAP_NO_USER_LOW << REC_HEAP_NO_SHIFT;
+	const byte*	storage;
+	const byte*	externs;
+
+	ut_a(dict_index_is_clust(index));
+
+	/* Subtract the space reserved for uncompressed data:
+	one dense directory slot, DB_TRX_ID and DB_ROLL_PTR per record.
+	NOTE(review): nothing in this function checks that the
+	subtraction cannot wrap d_stream->avail_in around; presumably
+	the caller bounds n_dense sufficiently -- confirm. */
+	d_stream->avail_in -= n_dense * (PAGE_ZIP_DIR_SLOT_SIZE
+					 + DATA_TRX_ID_LEN
+					 + DATA_ROLL_PTR_LEN);
+
+	/* Decompress the records in heap_no order. */
+	for (slot = 0; slot < n_dense; slot++) {
+		rec_t*	rec	= recs[slot];
+
+		d_stream->avail_out = rec - REC_N_NEW_EXTRA_BYTES
+			- d_stream->next_out;
+
+		ut_ad(d_stream->avail_out < UNIV_PAGE_SIZE
+		      - PAGE_ZIP_START - PAGE_DIR);
+		err = inflate(d_stream, Z_SYNC_FLUSH);
+		switch (err) {
+		case Z_STREAM_END:
+			/* Apparently, n_dense has grown
+			since the time the page was last compressed. */
+			goto zlib_done;
+		case Z_OK:
+		case Z_BUF_ERROR:
+			if (UNIV_LIKELY(!d_stream->avail_out)) {
+				break;
+			}
+			/* fall through: the requested number of bytes
+			was not produced */
+		default:
+			page_zip_fail(("page_zip_decompress_clust:"
+				       " 1 inflate(Z_SYNC_FLUSH)=%s\n",
+				       d_stream->msg));
+			goto zlib_error;
+		}
+
+		ut_ad(d_stream->next_out == rec - REC_N_NEW_EXTRA_BYTES);
+		/* Prepare to decompress the data bytes. */
+		d_stream->next_out = rec;
+		/* Set heap_no and the status bits. */
+		mach_write_to_2(rec - REC_NEW_HEAP_NO, heap_status);
+		heap_status += 1 << REC_HEAP_NO_SHIFT;
+
+		/* Read the offsets. The status bits are needed here. */
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		/* This is a leaf page in a clustered index. */
+
+		/* Check if there are any externally stored columns.
+		For each externally stored column, restore the
+		BTR_EXTERN_FIELD_REF separately. */
+
+		if (UNIV_UNLIKELY(rec_offs_any_extern(offsets))) {
+			if (UNIV_UNLIKELY
+			    (!page_zip_decompress_clust_ext(
+				    d_stream, rec, offsets, trx_id_col))) {
+
+				goto zlib_error;
+			}
+		} else {
+			/* Skip trx_id and roll_ptr */
+			ulint	len;
+			byte*	dst = rec_get_nth_field(rec, offsets,
+							trx_id_col, &len);
+			if (UNIV_UNLIKELY(len < DATA_TRX_ID_LEN
+					  + DATA_ROLL_PTR_LEN)) {
+
+				page_zip_fail(("page_zip_decompress_clust:"
+					       " len = %lu\n", (ulong) len));
+				goto zlib_error;
+			}
+
+			d_stream->avail_out = dst - d_stream->next_out;
+
+			switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+			case Z_STREAM_END:
+			case Z_OK:
+			case Z_BUF_ERROR:
+				if (!d_stream->avail_out) {
+					break;
+				}
+				/* fall through: the requested number of
+				bytes was not produced */
+			default:
+				page_zip_fail(("page_zip_decompress_clust:"
+					       " 2 inflate(Z_SYNC_FLUSH)=%s\n",
+					       d_stream->msg));
+				goto zlib_error;
+			}
+
+			ut_ad(d_stream->next_out == dst);
+
+			/* Clear DB_TRX_ID and DB_ROLL_PTR in order to
+			avoid uninitialized bytes in case the record
+			is affected by page_zip_apply_log(). */
+			memset(dst, 0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+			d_stream->next_out += DATA_TRX_ID_LEN
+				+ DATA_ROLL_PTR_LEN;
+		}
+
+		/* Decompress the last bytes of the record. */
+		d_stream->avail_out = rec_get_end(rec, offsets)
+			- d_stream->next_out;
+
+		switch (inflate(d_stream, Z_SYNC_FLUSH)) {
+		case Z_STREAM_END:
+		case Z_OK:
+		case Z_BUF_ERROR:
+			if (!d_stream->avail_out) {
+				break;
+			}
+			/* fall through: the requested number of bytes
+			was not produced */
+		default:
+			page_zip_fail(("page_zip_decompress_clust:"
+				       " 3 inflate(Z_SYNC_FLUSH)=%s\n",
+				       d_stream->msg));
+			goto zlib_error;
+		}
+	}
+
+	/* Decompress any trailing garbage, in case the last record was
+	allocated from an originally longer space on the free list.
+	The output is limited to the bytes between the current output
+	position and PAGE_HEAP_TOP as stored in the compressed page
+	header; an implausibly large difference is treated as an error. */
+	d_stream->avail_out = page_header_get_field(page_zip->data,
+						    PAGE_HEAP_TOP)
+		- page_offset(d_stream->next_out);
+	if (UNIV_UNLIKELY(d_stream->avail_out > UNIV_PAGE_SIZE
+			  - PAGE_ZIP_START - PAGE_DIR)) {
+
+		page_zip_fail(("page_zip_decompress_clust:"
+			       " avail_out = %u\n",
+			       d_stream->avail_out));
+		goto zlib_error;
+	}
+
+	if (UNIV_UNLIKELY(inflate(d_stream, Z_FINISH) != Z_STREAM_END)) {
+		page_zip_fail(("page_zip_decompress_clust:"
+			       " inflate(Z_FINISH)=%s\n",
+			       d_stream->msg));
+zlib_error:
+		inflateEnd(d_stream);
+		return(FALSE);
+	}
+
+	/* Note that d_stream->avail_out > 0 may hold here
+	if the modification log is nonempty. */
+
+zlib_done:
+	if (UNIV_UNLIKELY(inflateEnd(d_stream) != Z_OK)) {
+		ut_error;
+	}
+
+	{
+		page_t*	page = page_align(d_stream->next_out);
+
+		/* Clear the unused heap space on the uncompressed page:
+		everything from the current output position up to the
+		last slot of the page directory. */
+		memset(d_stream->next_out, 0,
+		       page_dir_get_nth_slot(page,
+					     page_dir_get_n_slots(page) - 1)
+		       - d_stream->next_out);
+	}
+
+#ifdef UNIV_DEBUG
+	page_zip->m_start = PAGE_DATA + d_stream->total_in;
+#endif /* UNIV_DEBUG */
+
+	/* Apply the modification log.  The log starts where the
+	compressed stream ended (d_stream->next_in).  The size passed
+	is avail_in + 1, because page_zip_decompress() excluded the
+	end marker byte of the log when it initialized avail_in. */
+	{
+		const byte*	mod_log_ptr;
+		mod_log_ptr = page_zip_apply_log(d_stream->next_in,
+						 d_stream->avail_in + 1,
+						 recs, n_dense,
+						 trx_id_col, heap_status,
+						 index, offsets);
+
+		if (UNIV_UNLIKELY(!mod_log_ptr)) {
+			return(FALSE);
+		}
+		page_zip->m_end = mod_log_ptr - page_zip->data;
+		page_zip->m_nonempty = mod_log_ptr != d_stream->next_in;
+	}
+
+	if (UNIV_UNLIKELY(page_zip_get_trailer_len(page_zip, TRUE, NULL)
+			  + page_zip->m_end >= page_zip_get_size(page_zip))) {
+
+		page_zip_fail(("page_zip_decompress_clust: %lu + %lu >= %lu\n",
+			       (ulong) page_zip_get_trailer_len(
+				       page_zip, TRUE, NULL),
+			       (ulong) page_zip->m_end,
+			       (ulong) page_zip_get_size(page_zip)));
+		return(FALSE);
+	}
+
+	storage = page_zip->data + page_zip_get_size(page_zip)
+		- n_dense * PAGE_ZIP_DIR_SLOT_SIZE;
+
+	externs = storage - n_dense
+		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+	/* Restore the uncompressed columns in heap_no order.
+	Both storage (DB_TRX_ID and DB_ROLL_PTR of each record) and
+	externs (BLOB pointers) are read downwards: each pointer is
+	decremented before the bytes are copied. */
+
+	for (slot = 0; slot < n_dense; slot++) {
+		ulint	i;
+		ulint	len;
+		byte*	dst;
+		rec_t*	rec	= recs[slot];
+		ibool	exists	= !page_zip_dir_find_free(
+			page_zip, page_offset(rec));
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		dst = rec_get_nth_field(rec, offsets,
+					trx_id_col, &len);
+		ut_ad(len >= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+		storage -= DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+		memcpy(dst, storage,
+		       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+		/* Check if there are any externally stored
+		columns in this record.  For each externally
+		stored column, restore or clear the
+		BTR_EXTERN_FIELD_REF. */
+		if (!rec_offs_any_extern(offsets)) {
+			continue;
+		}
+
+		for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+			if (!rec_offs_nth_extern(offsets, i)) {
+				continue;
+			}
+			dst = rec_get_nth_field(rec, offsets, i, &len);
+
+			if (UNIV_UNLIKELY(len < BTR_EXTERN_FIELD_REF_SIZE)) {
+				page_zip_fail(("page_zip_decompress_clust:"
+					       " %lu < 20\n",
+					       (ulong) len));
+				return(FALSE);
+			}
+
+			dst += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+			if (UNIV_LIKELY(exists)) {
+				/* Existing record:
+				restore the BLOB pointer.
+				Fail if the pointer array would reach
+				below the end of the modification log
+				(page_zip->data + page_zip->m_end). */
+				externs -= BTR_EXTERN_FIELD_REF_SIZE;
+
+				if (UNIV_UNLIKELY
+				    (externs < page_zip->data
+				     + page_zip->m_end)) {
+					page_zip_fail(("page_zip_"
+						       "decompress_clust: "
+						       "%p < %p + %lu\n",
+						       (const void*) externs,
+						       (const void*)
+						       page_zip->data,
+						       (ulong)
+						       page_zip->m_end));
+					return(FALSE);
+				}
+
+				memcpy(dst, externs,
+				       BTR_EXTERN_FIELD_REF_SIZE);
+
+				page_zip->n_blobs++;
+			} else {
+				/* Deleted record:
+				clear the BLOB pointer */
+				memset(dst, 0,
+				       BTR_EXTERN_FIELD_REF_SIZE);
+			}
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Decompress a page.  This function should tolerate errors on the compressed
+page.  Instead of letting assertions fail, it will return FALSE if an
+inconsistency is detected.
+@return	TRUE on success, FALSE on failure */
+UNIV_INTERN
+ibool
+page_zip_decompress(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in: data, ssize;
+				out: m_start, m_end, m_nonempty, n_blobs */
+	page_t*		page,	/*!< out: uncompressed page, may be trashed */
+	ibool		all)	/*!< in: TRUE=decompress the whole page;
+				FALSE=verify but do not copy some
+				page header fields that should not change
+				after page creation */
+{
+	z_stream	d_stream;
+	dict_index_t*	index	= NULL;
+	rec_t**		recs;	/*!< dense page directory, sorted by address */
+	ulint		n_dense;/* number of user records on the page */
+	ulint		trx_id_col = ULINT_UNDEFINED;
+	mem_heap_t*	heap;
+	ulint*		offsets;
+#ifndef UNIV_HOTBACKUP
+	ullint		usec = ut_time_us(NULL);
+#endif /* !UNIV_HOTBACKUP */
+
+	ut_ad(page_zip_simple_validate(page_zip));
+	UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	/* The dense directory excludes the infimum and supremum records. */
+	n_dense = page_dir_get_n_heap(page_zip->data) - PAGE_HEAP_NO_USER_LOW;
+	if (UNIV_UNLIKELY(n_dense * PAGE_ZIP_DIR_SLOT_SIZE
+			  >= page_zip_get_size(page_zip))) {
+		page_zip_fail(("page_zip_decompress 1: %lu %lu\n",
+			       (ulong) n_dense,
+			       (ulong) page_zip_get_size(page_zip)));
+		return(FALSE);
+	}
+
+	heap = mem_heap_create(n_dense * (3 * sizeof *recs) + UNIV_PAGE_SIZE);
+	recs = mem_heap_alloc(heap, n_dense * (2 * sizeof *recs));
+
+	if (all) {
+		/* Copy the page header. */
+		memcpy(page, page_zip->data, PAGE_DATA);
+	} else {
+		/* Check that the bytes that we skip are identical. */
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+		ut_a(!memcmp(FIL_PAGE_TYPE + page,
+			     FIL_PAGE_TYPE + page_zip->data,
+			     PAGE_HEADER - FIL_PAGE_TYPE));
+		ut_a(!memcmp(PAGE_HEADER + PAGE_LEVEL + page,
+			     PAGE_HEADER + PAGE_LEVEL + page_zip->data,
+			     PAGE_DATA - (PAGE_HEADER + PAGE_LEVEL)));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+
+		/* Copy the mutable parts of the page header. */
+		memcpy(page, page_zip->data, FIL_PAGE_TYPE);
+		memcpy(PAGE_HEADER + page, PAGE_HEADER + page_zip->data,
+		       PAGE_LEVEL - PAGE_N_DIR_SLOTS);
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+		/* Check that the page headers match after copying. */
+		ut_a(!memcmp(page, page_zip->data, PAGE_DATA));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	/* Clear the uncompressed page, except the header. */
+	memset(PAGE_DATA + page, 0x55, UNIV_PAGE_SIZE - PAGE_DATA);
+#endif /* UNIV_ZIP_DEBUG */
+	UNIV_MEM_INVALID(PAGE_DATA + page, UNIV_PAGE_SIZE - PAGE_DATA);
+
+	/* Copy the page directory. */
+	if (UNIV_UNLIKELY(!page_zip_dir_decode(page_zip, page, recs,
+					       recs + n_dense, n_dense))) {
+zlib_error:
+		mem_heap_free(heap);
+		return(FALSE);
+	}
+
+	/* Copy the infimum and supremum records. */
+	memcpy(page + (PAGE_NEW_INFIMUM - REC_N_NEW_EXTRA_BYTES),
+	       infimum_extra, sizeof infimum_extra);
+	if (UNIV_UNLIKELY(!page_get_n_recs(page))) {
+		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+				      PAGE_NEW_SUPREMUM);
+	} else {
+		rec_set_next_offs_new(page + PAGE_NEW_INFIMUM,
+				      page_zip_dir_get(page_zip, 0)
+				      & PAGE_ZIP_DIR_SLOT_MASK);
+	}
+	memcpy(page + PAGE_NEW_INFIMUM, infimum_data, sizeof infimum_data);
+	memcpy(page + (PAGE_NEW_SUPREMUM - REC_N_NEW_EXTRA_BYTES + 1),
+	       supremum_extra_data, sizeof supremum_extra_data);
+
+	page_zip_set_alloc(&d_stream, heap);
+
+	if (UNIV_UNLIKELY(inflateInit2(&d_stream, UNIV_PAGE_SIZE_SHIFT)
+			  != Z_OK)) {
+		ut_error;
+	}
+
+	d_stream.next_in = page_zip->data + PAGE_DATA;
+	/* Subtract the space reserved for
+	the page header and the end marker of the modification log. */
+	d_stream.avail_in = page_zip_get_size(page_zip) - (PAGE_DATA + 1);
+
+	d_stream.next_out = page + PAGE_ZIP_START;
+	d_stream.avail_out = UNIV_PAGE_SIZE - PAGE_ZIP_START;
+
+	/* Decode the zlib header and the index information. */
+	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+		page_zip_fail(("page_zip_decompress:"
+			       " 1 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+		goto zlib_error;
+	}
+
+	if (UNIV_UNLIKELY(inflate(&d_stream, Z_BLOCK) != Z_OK)) {
+
+		page_zip_fail(("page_zip_decompress:"
+			       " 2 inflate(Z_BLOCK)=%s\n", d_stream.msg));
+		goto zlib_error;
+	}
+
+	index = page_zip_fields_decode(
+		page + PAGE_ZIP_START, d_stream.next_out,
+		page_is_leaf(page) ? &trx_id_col : NULL);
+
+	if (UNIV_UNLIKELY(!index)) {
+
+		goto zlib_error;
+	}
+
+	/* Decompress the user records. */
+	page_zip->n_blobs = 0;
+	d_stream.next_out = page + PAGE_ZIP_START;
+
+	{
+		/* Pre-allocate the offsets for rec_get_offsets_reverse(). */
+		ulint	n = 1 + 1/* node ptr */ + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		offsets = mem_heap_alloc(heap, n * sizeof(ulint));
+		*offsets = n;
+	}
+
+	/* Decompress the records in heap_no order. */
+	if (!page_is_leaf(page)) {
+		/* This is a node pointer page. */
+		ulint	info_bits;
+
+		if (UNIV_UNLIKELY
+		    (!page_zip_decompress_node_ptrs(page_zip, &d_stream,
+						    recs, n_dense, index,
+						    offsets, heap))) {
+			goto err_exit;
+		}
+
+		info_bits = mach_read_from_4(page + FIL_PAGE_PREV) == FIL_NULL
+			? REC_INFO_MIN_REC_FLAG : 0;
+
+		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip, page,
+							    info_bits))) {
+			goto err_exit;
+		}
+	} else if (UNIV_LIKELY(trx_id_col == ULINT_UNDEFINED)) {
+		/* This is a leaf page in a secondary index. */
+		if (UNIV_UNLIKELY(!page_zip_decompress_sec(page_zip, &d_stream,
+							   recs, n_dense,
+							   index, offsets))) {
+			goto err_exit;
+		}
+
+		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+							    page, 0))) {
+err_exit:
+			page_zip_fields_free(index);
+			mem_heap_free(heap);
+			return(FALSE);
+		}
+	} else {
+		/* This is a leaf page in a clustered index. */
+		if (UNIV_UNLIKELY(!page_zip_decompress_clust(page_zip,
+							     &d_stream, recs,
+							     n_dense, index,
+							     trx_id_col,
+							     offsets, heap))) {
+			goto err_exit;
+		}
+
+		if (UNIV_UNLIKELY(!page_zip_set_extra_bytes(page_zip,
+							    page, 0))) {
+			goto err_exit;
+		}
+	}
+
+	ut_a(page_is_comp(page));
+	UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+
+	page_zip_fields_free(index);
+	mem_heap_free(heap);
+#ifndef UNIV_HOTBACKUP
+	{
+		page_zip_stat_t*	zip_stat
+			= &page_zip_stat[page_zip->ssize - 1];
+		zip_stat->decompressed++;
+		zip_stat->decompressed_usec += ut_time_us(NULL) - usec;
+	}
+#endif /* !UNIV_HOTBACKUP */
+
+	/* Update the stat counter for LRU policy. */
+	buf_LRU_stat_inc_unzip();
+
+	return(TRUE);
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/**********************************************************************//**
+Print a hexadecimal dump of a memory area to the standard error stream.
+The output starts with a heading line "name:", followed by one line per
+32 bytes of data.  Each line is prefixed with the hexadecimal offset of
+its first byte. */
+static
+void
+page_zip_hexdump_func(
+/*==================*/
+	const char*	name,	/*!< in: name of the data structure */
+	const void*	buf,	/*!< in: data */
+	ulint		size)	/*!< in: length of the data, in bytes */
+{
+	const ulint	width	= 32; /* bytes per line */
+	const byte*	cur	= buf;
+	ulint		addr	= 0;
+
+	fprintf(stderr, "%s:\n", name);
+
+	while (addr < size) {
+		ulint	n_left;
+
+		fprintf(stderr, "%04lx ", (ulong) addr);
+
+		/* The last line may be shorter than the others. */
+		for (n_left = ut_min(width, size - addr); n_left; n_left--) {
+			fprintf(stderr, "%02x", *cur++);
+		}
+
+		putc('\n', stderr);
+		addr += width;
+	}
+}
+
+/** Dump a block of memory on the standard error stream.
+The expression given as buf is stringified and passed to
+page_zip_hexdump_func() as the heading of the dump.
+@param buf	in: data
+@param size	in: length of the data, in bytes */
+#define page_zip_hexdump(buf, size) page_zip_hexdump_func(#buf, buf, size)
+
+/** Flag: make page_zip_validate() compare page headers only.
+When the flag is set, page_zip_validate_low() returns TRUE right after
+a successful comparison of the page headers, without decompressing
+the page. */
+UNIV_INTERN ibool	page_zip_validate_header_only = FALSE;
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.
+First, the page headers are compared byte by byte.  Unless
+page_zip_validate_header_only is set, the compressed page is then
+decompressed into a temporary buffer.  The resulting descriptor fields
+(n_blobs, m_end, m_nonempty, and in UNIV_DEBUG builds m_start) and the
+page contents from PAGE_HEADER up to the last FIL_PAGE_DATA_END bytes
+are compared with the originals.  On a mismatch, hex dumps of the
+descriptor and of the pages are written to stderr.
+@return	TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate_low(
+/*==================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page,	/*!< in: uncompressed page */
+	ibool			sloppy)	/*!< in: FALSE=strict,
+					TRUE=ignore the MIN_REC_FLAG */
+{
+	page_zip_des_t	temp_page_zip;
+	byte*		temp_page_buf;
+	page_t*		temp_page;
+	ibool		valid;
+
+	/* Nothing has been allocated yet, so a header mismatch can
+	return directly instead of going through func_exit. */
+	if (memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+		   FIL_PAGE_LSN - FIL_PAGE_PREV)
+	    || memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE, 2)
+	    || memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+		      PAGE_DATA - FIL_PAGE_DATA)) {
+		page_zip_fail(("page_zip_validate: page header\n"));
+		page_zip_hexdump(page_zip, sizeof *page_zip);
+		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+		page_zip_hexdump(page, UNIV_PAGE_SIZE);
+		return(FALSE);
+	}
+
+	ut_a(page_is_comp(page));
+
+	if (page_zip_validate_header_only) {
+		return(TRUE);
+	}
+
+	/* page_zip_decompress() expects the uncompressed page to be
+	UNIV_PAGE_SIZE aligned. */
+	temp_page_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+	temp_page = ut_align(temp_page_buf, UNIV_PAGE_SIZE);
+
+#ifdef UNIV_DEBUG_VALGRIND
+	/* Get detailed information on the valid bits in case the
+	UNIV_MEM_ASSERT_RW() checks fail.  The v-bits of page[],
+	page_zip->data[] or page_zip could be viewed at temp_page[] or
+	temp_page_zip in a debugger when running valgrind --db-attach. */
+	VALGRIND_GET_VBITS(page, temp_page, UNIV_PAGE_SIZE);
+	UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+# if UNIV_WORD_SIZE == 4
+	VALGRIND_GET_VBITS(page_zip, &temp_page_zip, sizeof temp_page_zip);
+	/* On 32-bit systems, there is no padding in page_zip_des_t.
+	On other systems, Valgrind could complain about uninitialized
+	pad bytes. */
+	UNIV_MEM_ASSERT_RW(page_zip, sizeof *page_zip);
+# endif
+	VALGRIND_GET_VBITS(page_zip->data, temp_page,
+			   page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	/* Decompress through a copy of the descriptor, so that the
+	fields computed by page_zip_decompress() can be compared with
+	those of the original. */
+	temp_page_zip = *page_zip;
+	valid = page_zip_decompress(&temp_page_zip, temp_page, TRUE);
+	if (!valid) {
+		fputs("page_zip_validate(): failed to decompress\n", stderr);
+		goto func_exit;
+	}
+	if (page_zip->n_blobs != temp_page_zip.n_blobs) {
+		page_zip_fail(("page_zip_validate: n_blobs: %u!=%u\n",
+			       page_zip->n_blobs, temp_page_zip.n_blobs));
+		valid = FALSE;
+	}
+#ifdef UNIV_DEBUG
+	if (page_zip->m_start != temp_page_zip.m_start) {
+		page_zip_fail(("page_zip_validate: m_start: %u!=%u\n",
+			       page_zip->m_start, temp_page_zip.m_start));
+		valid = FALSE;
+	}
+#endif /* UNIV_DEBUG */
+	if (page_zip->m_end != temp_page_zip.m_end) {
+		page_zip_fail(("page_zip_validate: m_end: %u!=%u\n",
+			       page_zip->m_end, temp_page_zip.m_end));
+		valid = FALSE;
+	}
+	if (page_zip->m_nonempty != temp_page_zip.m_nonempty) {
+		page_zip_fail(("page_zip_validate(): m_nonempty: %u!=%u\n",
+			       page_zip->m_nonempty,
+			       temp_page_zip.m_nonempty));
+		valid = FALSE;
+	}
+	if (memcmp(page + PAGE_HEADER, temp_page + PAGE_HEADER,
+		   UNIV_PAGE_SIZE - PAGE_HEADER - FIL_PAGE_DATA_END)) {
+
+		/* In crash recovery, the "minimum record" flag may be
+		set incorrectly until the mini-transaction is
+		committed.  Let us tolerate that difference when we
+		are performing a sloppy validation. */
+
+		if (sloppy) {
+			byte	info_bits_diff;
+			ulint	offset
+				= rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+						    TRUE);
+			ut_a(offset >= PAGE_NEW_SUPREMUM);
+			offset -= 5 /* REC_NEW_INFO_BITS */;
+
+			info_bits_diff = page[offset] ^ temp_page[offset];
+
+			if (info_bits_diff == REC_INFO_MIN_REC_FLAG) {
+				/* Remove the known difference and
+				compare the pages once more. */
+				temp_page[offset] = page[offset];
+
+				if (!memcmp(page + PAGE_HEADER,
+					    temp_page + PAGE_HEADER,
+					    UNIV_PAGE_SIZE - PAGE_HEADER
+					    - FIL_PAGE_DATA_END)) {
+
+					/* Only the minimum record flag
+					differed.  Let us ignore it.
+					The arguments are cast to ulong
+					so that they match the %lu
+					conversions also where ulint
+					is not unsigned long. */
+					page_zip_fail(("page_zip_validate: "
+						       "min_rec_flag "
+						       "(ignored, "
+						       "%lu,%lu,0x%02lx)\n",
+						       (ulong)
+						       page_get_space_id(page),
+						       (ulong)
+						       page_get_page_no(page),
+						       (ulong) page[offset]));
+					goto func_exit;
+				}
+			}
+		}
+		page_zip_fail(("page_zip_validate: content\n"));
+		valid = FALSE;
+	}
+
+func_exit:
+	if (!valid) {
+		page_zip_hexdump(page_zip, sizeof *page_zip);
+		page_zip_hexdump(page_zip->data, page_zip_get_size(page_zip));
+		page_zip_hexdump(page, UNIV_PAGE_SIZE);
+		page_zip_hexdump(temp_page, UNIV_PAGE_SIZE);
+	}
+	ut_free(temp_page_buf);
+	return(valid);
+}
+
+/**********************************************************************//**
+Check that the compressed and decompressed pages match.  This is a
+wrapper for page_zip_validate_low() that requests the sloppy comparison
+whenever recv_recovery_is_on() holds.
+@return	TRUE if valid, FALSE if not */
+UNIV_INTERN
+ibool
+page_zip_validate(
+/*==============*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const page_t*		page)	/*!< in: uncompressed page */
+{
+	const ibool	sloppy	= recv_recovery_is_on();
+
+	return(page_zip_validate_low(page_zip, page, sloppy));
+}
+#endif /* UNIV_ZIP_DEBUG */
+
+#ifdef UNIV_DEBUG
+/**********************************************************************//**
+Assert that the compressed and decompressed page headers match.
+Three byte ranges are compared with ut_ad(): from FIL_PAGE_PREV up to
+FIL_PAGE_LSN, the 2 bytes at FIL_PAGE_TYPE, and from FIL_PAGE_DATA up
+to PAGE_DATA.  The function always returns TRUE, so that the call
+itself can be wrapped in ut_ad() by the callers.
+@return	TRUE */
+static
+ibool
+page_zip_header_cmp(
+/*================*/
+	const page_zip_des_t*	page_zip,/*!< in: compressed page */
+	const byte*		page)	/*!< in: uncompressed page */
+{
+	ut_ad(!memcmp(page_zip->data + FIL_PAGE_PREV, page + FIL_PAGE_PREV,
+		      FIL_PAGE_LSN - FIL_PAGE_PREV));
+	ut_ad(!memcmp(page_zip->data + FIL_PAGE_TYPE, page + FIL_PAGE_TYPE,
+		      2));
+	ut_ad(!memcmp(page_zip->data + FIL_PAGE_DATA, page + FIL_PAGE_DATA,
+		      PAGE_DATA - FIL_PAGE_DATA));
+
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/**********************************************************************//**
+Write a record on the compressed page that contains externally stored
+columns.  The data must already have been written to the uncompressed page.
+The data bytes of rec are appended to the modification log at data,
+except for two kinds of bytes that are stored uncompressed instead:
+the trx_id and roll_ptr columns, which are copied to the slot of
+heap_no below storage, and the last BTR_EXTERN_FIELD_REF_SIZE bytes of
+each externally stored column, which are copied to the array of BLOB
+pointers that starts below the trx_id,roll_ptr slots of all records.
+If create is nonzero, page_zip->n_blobs is incremented by the number of
+externally stored columns of rec, and the BLOB pointers from blob_no
+onwards are moved towards lower addresses in order to make room for
+the pointers of rec.
+@return	end of modification log */
+static
+byte*
+page_zip_write_rec_ext(
+/*===================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	const page_t*	page,		/*!< in: page containing rec */
+	const byte*	rec,		/*!< in: record being written */
+	dict_index_t*	index,		/*!< in: record descriptor */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	ulint		create,		/*!< in: nonzero=insert, zero=update */
+	ulint		trx_id_col,	/*!< in: position of DB_TRX_ID */
+	ulint		heap_no,	/*!< in: heap number of rec */
+	byte*		storage,	/*!< in: end of dense page directory */
+	byte*		data)		/*!< in: end of modification log */
+{
+	const byte*	start	= rec;
+	ulint		i;
+	ulint		len;
+	byte*		externs	= storage;
+	ulint		n_ext	= rec_offs_n_extern(offsets);
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	externs -= (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+		* (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW);
+
+	/* Note that this will not take into account
+	the BLOB columns of rec if create==TRUE. */
+	ut_ad(data + rec_offs_data_size(offsets)
+	      - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+	      - n_ext * BTR_EXTERN_FIELD_REF_SIZE
+	      < externs - BTR_EXTERN_FIELD_REF_SIZE * page_zip->n_blobs);
+
+	{
+		ulint	blob_no = page_zip_get_n_prev_extern(
+			page_zip, rec, index);
+		byte*	ext_end = externs - page_zip->n_blobs
+			* BTR_EXTERN_FIELD_REF_SIZE;
+		ut_ad(blob_no <= page_zip->n_blobs);
+		externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+		if (create) {
+			page_zip->n_blobs += n_ext;
+			ASSERT_ZERO_BLOB(ext_end - n_ext
+					 * BTR_EXTERN_FIELD_REF_SIZE);
+			memmove(ext_end - n_ext
+				* BTR_EXTERN_FIELD_REF_SIZE,
+				ext_end,
+				externs - ext_end);
+		}
+
+		ut_a(blob_no + n_ext <= page_zip->n_blobs);
+	}
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		const byte*	src;
+
+		if (UNIV_UNLIKELY(i == trx_id_col)) {
+			ut_ad(!rec_offs_nth_extern(offsets,
+						   i));
+			ut_ad(!rec_offs_nth_extern(offsets,
+						   i + 1));
+			/* Locate trx_id and roll_ptr. */
+			src = rec_get_nth_field(rec, offsets,
+						i, &len);
+			ut_ad(len == DATA_TRX_ID_LEN);
+			ut_ad(src + DATA_TRX_ID_LEN
+			      == rec_get_nth_field(
+				      rec, offsets,
+				      i + 1, &len));
+			ut_ad(len == DATA_ROLL_PTR_LEN);
+
+			/* Log the preceding fields, that is, the bytes
+			from start up to the beginning of trx_id. */
+			ASSERT_ZERO(data, src - start);
+			memcpy(data, start, src - start);
+			data += src - start;
+			start = src + (DATA_TRX_ID_LEN
+				       + DATA_ROLL_PTR_LEN);
+
+			/* Store trx_id and roll_ptr in the slot of
+			heap_no, outside the modification log. */
+			memcpy(storage - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+			       * (heap_no - 1),
+			       src, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+			i++; /* skip also roll_ptr */
+		} else if (rec_offs_nth_extern(offsets, i)) {
+			src = rec_get_nth_field(rec, offsets,
+						i, &len);
+
+			ut_ad(dict_index_is_clust(index));
+			ut_ad(len
+			      >= BTR_EXTERN_FIELD_REF_SIZE);
+			src += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+			ASSERT_ZERO(data, src - start);
+			memcpy(data, start, src - start);
+			data += src - start;
+			start = src + BTR_EXTERN_FIELD_REF_SIZE;
+
+			/* Store the BLOB pointer.  The array grows
+			towards lower addresses, so externs is
+			decremented before each pointer is stored. */
+			externs -= BTR_EXTERN_FIELD_REF_SIZE;
+			ut_ad(data < externs);
+			memcpy(externs, src, BTR_EXTERN_FIELD_REF_SIZE);
+		}
+	}
+
+	/* Log the last bytes of the record. */
+	len = rec_offs_data_size(offsets) - (start - rec);
+
+	ASSERT_ZERO(data, len);
+	memcpy(data, start, len);
+	data += len;
+
+	return(data);
+}
+
+/**********************************************************************//**
+Write an entire record on the compressed page.  The data must already
+have been written to the uncompressed page.
+The delete-mark of rec is copied to its dense directory slot, and an
+entry is appended to the modification log at page_zip->m_end.  The entry
+consists of heap_no - 1 encoded in one or two bytes, the extra bytes of
+the record preceding the REC_N_NEW_EXTRA_BYTES fixed bytes in reverse
+order, and the data bytes.  On leaf pages of a clustered index, trx_id
+and roll_ptr (and the BLOB pointers, if any) are stored uncompressed
+outside the log; on node pointer pages the same is done for the node
+pointer.  Finally page_zip->m_end is advanced past the entry and
+page_zip->m_nonempty is set. */
+UNIV_INTERN
+void
+page_zip_write_rec(
+/*===============*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record being written */
+	dict_index_t*	index,	/*!< in: the index the record belongs to */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		create)	/*!< in: nonzero=insert, zero=update */
+{
+	const page_t*	page;
+	byte*		data;
+	byte*		storage;
+	ulint		heap_no;
+	byte*		slot;
+
+	ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + page_zip_dir_size(page_zip));
+	ut_ad(rec_offs_comp(offsets));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	ut_ad(page_zip->m_start >= PAGE_DATA);
+
+	page = page_align(rec);
+
+	ut_ad(page_zip_header_cmp(page_zip, page));
+	ut_ad(page_simple_validate_new((page_t*) page));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	slot = page_zip_dir_find(page_zip, page_offset(rec));
+	ut_a(slot);
+	/* Copy the delete mark to the dense directory slot of rec. */
+	if (rec_get_deleted_flag(rec, TRUE)) {
+		*slot |= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+	} else {
+		*slot &= ~(PAGE_ZIP_DIR_SLOT_DEL >> 8);
+	}
+
+	ut_ad(rec_get_start((rec_t*) rec, offsets) >= page + PAGE_ZIP_START);
+	ut_ad(rec_get_end((rec_t*) rec, offsets) <= page + UNIV_PAGE_SIZE
+	      - PAGE_DIR - PAGE_DIR_SLOT_SIZE
+	      * page_dir_get_n_slots(page));
+
+	heap_no = rec_get_heap_no_new(rec);
+	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW); /* not infimum or supremum */
+	ut_ad(heap_no < page_dir_get_n_heap(page));
+
+	/* Append to the modification log. */
+	data = page_zip->data + page_zip->m_end;
+	ut_ad(!*data);
+
+	/* Identify the record by writing its heap number - 1.
+	0 is reserved to indicate the end of the modification log.
+	Values below 64 are written in one byte.  Larger values are
+	written in two bytes, the first of which has the 0x80 bit set.
+	The least significant bit of the last byte is left clear here;
+	page_zip_clear_rec() sets it in the entries that it writes. */
+
+	if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+		*data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+		ut_ad(!*data);
+	}
+	*data++ = (byte) ((heap_no - 1) << 1);
+	ut_ad(!*data);
+
+	{
+		const byte*	start	= rec - rec_offs_extra_size(offsets);
+		const byte*	b	= rec - REC_N_NEW_EXTRA_BYTES;
+
+		/* Write the extra bytes backwards, so that
+		rec_offs_extra_size() can be easily computed in
+		page_zip_apply_log() by invoking
+		rec_get_offsets_reverse(). */
+
+		while (b != start) {
+			*data++ = *--b;
+			ut_ad(!*data);
+		}
+	}
+
+	/* Write the data bytes.  Store the uncompressed bytes separately.
+	storage points to the start of the dense page directory, which
+	occupies PAGE_ZIP_DIR_SLOT_SIZE bytes per user record at the end
+	of the compressed page. */
+	storage = page_zip->data + page_zip_get_size(page_zip)
+		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+		* PAGE_ZIP_DIR_SLOT_SIZE;
+
+	if (page_is_leaf(page)) {
+		ulint		len;
+
+		if (dict_index_is_clust(index)) {
+			ulint		trx_id_col;
+
+			trx_id_col = dict_index_get_sys_col_pos(index,
+								DATA_TRX_ID);
+			ut_ad(trx_id_col != ULINT_UNDEFINED);
+
+			/* Store separately trx_id, roll_ptr and
+			the BTR_EXTERN_FIELD_REF of each BLOB column. */
+			if (rec_offs_any_extern(offsets)) {
+				data = page_zip_write_rec_ext(
+					page_zip, page,
+					rec, index, offsets, create,
+					trx_id_col, heap_no, storage, data);
+			} else {
+				/* Locate trx_id and roll_ptr. */
+				const byte*	src
+					= rec_get_nth_field(rec, offsets,
+							    trx_id_col, &len);
+				ut_ad(len == DATA_TRX_ID_LEN);
+				ut_ad(src + DATA_TRX_ID_LEN
+				      == rec_get_nth_field(
+					      rec, offsets,
+					      trx_id_col + 1, &len));
+				ut_ad(len == DATA_ROLL_PTR_LEN);
+
+				/* Log the preceding fields. */
+				ASSERT_ZERO(data, src - rec);
+				memcpy(data, rec, src - rec);
+				data += src - rec;
+
+				/* Store trx_id and roll_ptr in the slot
+				of heap_no, outside the modification log. */
+				memcpy(storage
+				       - (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN)
+				       * (heap_no - 1),
+				       src,
+				       DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+				src += DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN;
+
+				/* Log the last bytes of the record. */
+				len = rec_offs_data_size(offsets)
+					- (src - rec);
+
+				ASSERT_ZERO(data, len);
+				memcpy(data, src, len);
+				data += len;
+			}
+		} else {
+			/* Leaf page of a secondary index:
+			no externally stored columns */
+			ut_ad(dict_index_get_sys_col_pos(index, DATA_TRX_ID)
+			      == ULINT_UNDEFINED);
+			ut_ad(!rec_offs_any_extern(offsets));
+
+			/* Log the entire record. */
+			len = rec_offs_data_size(offsets);
+
+			ASSERT_ZERO(data, len);
+			memcpy(data, rec, len);
+			data += len;
+		}
+	} else {
+		/* This is a node pointer page. */
+		ulint	len;
+
+		/* Non-leaf nodes should not have any externally
+		stored columns. */
+		ut_ad(!rec_offs_any_extern(offsets));
+
+		/* Copy the data bytes, except node_ptr. */
+		len = rec_offs_data_size(offsets) - REC_NODE_PTR_SIZE;
+		ut_ad(data + len < storage - REC_NODE_PTR_SIZE
+		      * (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW));
+		ASSERT_ZERO(data, len);
+		memcpy(data, rec, len);
+		data += len;
+
+		/* Copy the node pointer to the uncompressed area. */
+		memcpy(storage - REC_NODE_PTR_SIZE
+		       * (heap_no - 1),
+		       rec + len,
+		       REC_NODE_PTR_SIZE);
+	}
+
+	ut_a(!*data);
+	ut_ad((ulint) (data - page_zip->data) < page_zip_get_size(page_zip));
+	page_zip->m_end = data - page_zip->data;
+	page_zip->m_nonempty = TRUE;
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page_align(rec)));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/***********************************************************//**
+Parses a log record of writing a BLOB pointer of a record.
+The record body consists of a 2-byte offset of the pointer on the
+uncompressed page, a 2-byte offset of the pointer on the compressed
+page, and the BTR_EXTERN_FIELD_REF_SIZE bytes of the pointer itself.
+If page is not NULL, the pointer is copied to both pages.  If the
+offsets do not leave room for a whole pointer inside the respective
+page, or if the page is not a leaf page, recv_sys->found_corrupt_log
+is set and nothing is written.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_blob_ptr(
+/*==========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
+{
+	ulint	offset;
+	ulint	z_offset;
+
+	ut_ad(!page == !page_zip);
+
+	/* An incomplete record is not an error: NULL is returned
+	without flagging corruption. */
+	if (UNIV_UNLIKELY
+	    (end_ptr < ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE))) {
+
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	z_offset = mach_read_from_2(ptr + 2);
+
+	/* The whole pointer, not only its first byte, must fit within
+	the page.  Both offsets are read from 2 bytes, so the sums
+	cannot wrap around. */
+	if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+	    || UNIV_UNLIKELY(offset + BTR_EXTERN_FIELD_REF_SIZE
+			     > UNIV_PAGE_SIZE)
+	    || UNIV_UNLIKELY(z_offset + BTR_EXTERN_FIELD_REF_SIZE
+			     > UNIV_PAGE_SIZE)) {
+corrupt:
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+
+	if (page) {
+		/* The compressed page may be smaller than
+		UNIV_PAGE_SIZE; check z_offset against its actual
+		size before writing to it. */
+		if (UNIV_UNLIKELY(!page_zip)
+		    || UNIV_UNLIKELY(!page_is_leaf(page))
+		    || UNIV_UNLIKELY(z_offset + BTR_EXTERN_FIELD_REF_SIZE
+				     > page_zip_get_size(page_zip))) {
+
+			goto corrupt;
+		}
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+		memcpy(page + offset,
+		       ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+		memcpy(page_zip->data + z_offset,
+		       ptr + 4, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	}
+
+	return(ptr + (2 + 2 + BTR_EXTERN_FIELD_REF_SIZE));
+}
+
+/**********************************************************************//**
+Write a BLOB pointer of a record on the leaf page of a clustered index.
+The information must already have been updated on the uncompressed page.
+The last BTR_EXTERN_FIELD_REF_SIZE bytes of column n of rec are copied
+to the corresponding slot of the BLOB pointer array of the compressed
+page.  If mtr is not NULL, an MLOG_ZIP_WRITE_BLOB_PTR record is written
+that contains the offset of the pointer on the uncompressed page, its
+offset on the compressed page and the pointer itself; the logging is
+compiled out in UNIV_HOTBACKUP builds. */
+UNIV_INTERN
+void
+page_zip_write_blob_ptr(
+/*====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record whose BLOB pointer is
+				being copied to the compressed page */
+	dict_index_t*	index,	/*!< in: index of the page */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		n,	/*!< in: column index */
+	mtr_t*		mtr)	/*!< in: mini-transaction handle,
+				or NULL if no logging is needed */
+{
+	const byte*	field;
+	byte*		externs;
+	const page_t*	page	= page_align(rec);
+	ulint		blob_no;
+	ulint		len;
+
+	ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+	ut_ad(page_simple_validate_new((page_t*) page));
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + page_zip_dir_size(page_zip));
+	ut_ad(rec_offs_comp(offsets));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_offs_any_extern(offsets));
+	ut_ad(rec_offs_nth_extern(offsets, n));
+
+	ut_ad(page_zip->m_start >= PAGE_DATA);
+	ut_ad(page_zip_header_cmp(page_zip, page));
+
+	ut_ad(page_is_leaf(page));
+	ut_ad(dict_index_is_clust(index));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	blob_no = page_zip_get_n_prev_extern(page_zip, rec, index)
+		+ rec_get_n_extern_new(rec, index, n);
+	ut_a(blob_no < page_zip->n_blobs);
+
+	externs = page_zip->data + page_zip_get_size(page_zip)
+		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+		* (PAGE_ZIP_DIR_SLOT_SIZE
+		   + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+	field = rec_get_nth_field(rec, offsets, n, &len);
+
+	externs -= (blob_no + 1) * BTR_EXTERN_FIELD_REF_SIZE;
+	field += len - BTR_EXTERN_FIELD_REF_SIZE;
+
+	memcpy(externs, field, BTR_EXTERN_FIELD_REF_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	if (mtr) {
+#ifndef UNIV_HOTBACKUP
+		byte*	log_ptr	= mlog_open(
+			mtr, 11 + 2 + 2 + BTR_EXTERN_FIELD_REF_SIZE);
+		if (UNIV_UNLIKELY(!log_ptr)) {
+			return;
+		}
+
+		log_ptr = mlog_write_initial_log_record_fast(
+			(byte*) field, MLOG_ZIP_WRITE_BLOB_PTR, log_ptr, mtr);
+		mach_write_to_2(log_ptr, page_offset(field));
+		log_ptr += 2;
+		mach_write_to_2(log_ptr, externs - page_zip->data);
+		log_ptr += 2;
+		memcpy(log_ptr, externs, BTR_EXTERN_FIELD_REF_SIZE);
+		log_ptr += BTR_EXTERN_FIELD_REF_SIZE;
+		mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+	}
+}
+
+/***********************************************************//**
+Parses a log record of writing the node pointer of a record.
+The record body consists of a 2-byte offset of the node pointer on the
+uncompressed page, a 2-byte offset of the node pointer on the
+compressed page, and the REC_NODE_PTR_SIZE bytes of the pointer.
+If page is not NULL, the pointer is copied to both pages.  If the
+offsets are out of bounds, if the page is a leaf page, or if the
+offset on the compressed page does not correspond to the slot of a
+user record, recv_sys->found_corrupt_log is set and nothing is written.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_node_ptr(
+/*==========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
+{
+	ulint	offset;
+	ulint	z_offset;
+
+	ut_ad(!page == !page_zip);
+
+	/* An incomplete record is not an error: NULL is returned
+	without flagging corruption. */
+	if (UNIV_UNLIKELY(end_ptr < ptr + (2 + 2 + REC_NODE_PTR_SIZE))) {
+
+		return(NULL);
+	}
+
+	offset = mach_read_from_2(ptr);
+	z_offset = mach_read_from_2(ptr + 2);
+
+	/* The whole node pointer, not only its first byte, must fit
+	within the uncompressed page.  The offset is read from 2 bytes,
+	so the sum cannot wrap around. */
+	if (UNIV_UNLIKELY(offset < PAGE_ZIP_START)
+	    || UNIV_UNLIKELY(offset + REC_NODE_PTR_SIZE > UNIV_PAGE_SIZE)
+	    || UNIV_UNLIKELY(z_offset >= UNIV_PAGE_SIZE)) {
+corrupt:
+		recv_sys->found_corrupt_log = TRUE;
+
+		return(NULL);
+	}
+
+	if (page) {
+		byte*	storage_end;
+		byte*	field;
+		byte*	storage;
+		ulint	heap_no;
+
+		if (UNIV_UNLIKELY(!page_zip)
+		    || UNIV_UNLIKELY(page_is_leaf(page))) {
+
+			goto corrupt;
+		}
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+		field = page + offset;
+		storage = page_zip->data + z_offset;
+
+		/* The node pointer slots are located right below the
+		dense page directory. */
+		storage_end = page_zip->data + page_zip_get_size(page_zip)
+			- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+			* PAGE_ZIP_DIR_SLOT_SIZE;
+
+		/* Derive the heap number from the position of the slot,
+		and reject positions that are not aligned to a slot or
+		that do not belong to a user record of this page. */
+		heap_no = 1 + (storage_end - storage) / REC_NODE_PTR_SIZE;
+
+		if (UNIV_UNLIKELY((storage_end - storage) % REC_NODE_PTR_SIZE)
+		    || UNIV_UNLIKELY(heap_no < PAGE_HEAP_NO_USER_LOW)
+		    || UNIV_UNLIKELY(heap_no >= page_dir_get_n_heap(page))) {
+
+			goto corrupt;
+		}
+
+		memcpy(field, ptr + 4, REC_NODE_PTR_SIZE);
+		memcpy(storage, ptr + 4, REC_NODE_PTR_SIZE);
+
+#ifdef UNIV_ZIP_DEBUG
+		ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+	}
+
+	return(ptr + (2 + 2 + REC_NODE_PTR_SIZE));
+}
+
+/**********************************************************************//**
+Write the node pointer of a record on a non-leaf compressed page.
+The pointer is written to the last REC_NODE_PTR_SIZE bytes of the
+record on the uncompressed page and copied from there to the slot of
+the record in the node pointer array of the compressed page.  If mtr is
+not NULL, an MLOG_ZIP_WRITE_NODE_PTR record is written that contains
+the offset of the pointer on the uncompressed page, its offset on the
+compressed page and the pointer itself; the logging is compiled out in
+UNIV_HOTBACKUP builds. */
+UNIV_INTERN
+void
+page_zip_write_node_ptr(
+/*====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	ulint		size,	/*!< in: data size of rec */
+	ulint		ptr,	/*!< in: node pointer */
+	mtr_t*		mtr)	/*!< in: mini-transaction, or NULL */
+{
+	byte*	field;
+	byte*	storage;
+	page_t*	page	= page_align(rec);
+
+	ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+	ut_ad(page_simple_validate_new(page));
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + page_zip_dir_size(page_zip));
+	ut_ad(page_rec_is_comp(rec));
+
+	ut_ad(page_zip->m_start >= PAGE_DATA);
+	ut_ad(page_zip_header_cmp(page_zip, page));
+
+	ut_ad(!page_is_leaf(page));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(rec, size);
+
+	storage = page_zip->data + page_zip_get_size(page_zip)
+		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+		* PAGE_ZIP_DIR_SLOT_SIZE
+		- (rec_get_heap_no_new(rec) - 1) * REC_NODE_PTR_SIZE;
+	field = rec + size - REC_NODE_PTR_SIZE;
+
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+	ut_a(!memcmp(storage, field, REC_NODE_PTR_SIZE));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if REC_NODE_PTR_SIZE != 4
+# error "REC_NODE_PTR_SIZE != 4"
+#endif
+	mach_write_to_4(field, ptr);
+	memcpy(storage, field, REC_NODE_PTR_SIZE);
+
+	if (mtr) {
+#ifndef UNIV_HOTBACKUP
+		byte*	log_ptr	= mlog_open(mtr,
+					    11 + 2 + 2 + REC_NODE_PTR_SIZE);
+		if (UNIV_UNLIKELY(!log_ptr)) {
+			return;
+		}
+
+		log_ptr = mlog_write_initial_log_record_fast(
+			field, MLOG_ZIP_WRITE_NODE_PTR, log_ptr, mtr);
+		mach_write_to_2(log_ptr, page_offset(field));
+		log_ptr += 2;
+		mach_write_to_2(log_ptr, storage - page_zip->data);
+		log_ptr += 2;
+		memcpy(log_ptr, field, REC_NODE_PTR_SIZE);
+		log_ptr += REC_NODE_PTR_SIZE;
+		mlog_close(mtr, log_ptr);
+#endif /* !UNIV_HOTBACKUP */
+	}
+}
+
+/**********************************************************************//**
+Write the trx_id and roll_ptr of a record on a B-tree leaf node page.
+The two values are written to the adjacent columns trx_id_col and
+trx_id_col + 1 of the record on the uncompressed page, and the
+DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN bytes are then copied to the slot
+of the record in the trx_id,roll_ptr array of the compressed page.
+This function does not write any redo log. */
+UNIV_INTERN
+void
+page_zip_write_trx_id_and_roll_ptr(
+/*===============================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in/out: record */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint		trx_id_col,/*!< in: column number of TRX_ID in rec */
+	trx_id_t	trx_id,	/*!< in: transaction identifier */
+	roll_ptr_t	roll_ptr)/*!< in: roll_ptr */
+{
+	byte*	field;
+	byte*	storage;
+	page_t*	page	= page_align(rec);
+	ulint	len;
+
+	ut_ad(PAGE_ZIP_MATCH(rec, page_zip));
+	ut_ad(page_simple_validate_new(page));
+	ut_ad(page_zip_simple_validate(page_zip));
+	ut_ad(page_zip_get_size(page_zip)
+	      > PAGE_DATA + page_zip_dir_size(page_zip));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_ad(rec_offs_comp(offsets));
+
+	ut_ad(page_zip->m_start >= PAGE_DATA);
+	ut_ad(page_zip_header_cmp(page_zip, page));
+
+	ut_ad(page_is_leaf(page));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	storage = page_zip->data + page_zip_get_size(page_zip)
+		- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+		* PAGE_ZIP_DIR_SLOT_SIZE
+		- (rec_get_heap_no_new(rec) - 1)
+		* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+	field = rec_get_nth_field(rec, offsets, trx_id_col, &len);
+	ut_ad(len == DATA_TRX_ID_LEN);
+	ut_ad(field + DATA_TRX_ID_LEN
+	      == rec_get_nth_field(rec, offsets, trx_id_col + 1, &len));
+	ut_ad(len == DATA_ROLL_PTR_LEN);
+#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
+	ut_a(!memcmp(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN));
+#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */
+#if DATA_TRX_ID_LEN != 6
+# error "DATA_TRX_ID_LEN != 6"
+#endif
+	mach_write_to_6(field, trx_id);
+#if DATA_ROLL_PTR_LEN != 7
+# error "DATA_ROLL_PTR_LEN != 7"
+#endif
+	mach_write_to_7(field + DATA_TRX_ID_LEN, roll_ptr);
+	memcpy(storage, field, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+}
+
+#ifdef UNIV_ZIP_DEBUG
+/** Set this variable in a debugger to disable page_zip_clear_rec().
+The only observable effect should be the compression ratio due to
+deleted records not being zeroed out.  In rare cases, there can be
+page_zip_validate() failures on the node_ptr, trx_id and roll_ptr
+columns if the space is reallocated for a smaller record.
+While the variable is nonzero, page_zip_clear_rec() neither zeroes out
+the data bytes nor appends an entry to the modification log; it still
+clears the BLOB pointers of records on clustered index leaf pages. */
+UNIV_INTERN ibool	page_zip_clear_rec_disable;
+#endif /* UNIV_ZIP_DEBUG */
+
+/**********************************************************************//**
+Clear an area on the uncompressed and compressed page, if possible.
+If the compressed page has room for one more modification log entry
+(one or two bytes, depending on the heap number of rec), the data
+bytes of rec are zeroed out on the uncompressed page, the node pointer
+(on non-leaf pages) or the trx_id and roll_ptr (on leaf pages of a
+clustered index) of rec are zeroed out on the compressed page, and an
+entry with the least significant bit set is appended to the
+modification log.  Otherwise the record is left intact, except that on
+leaf pages of a clustered index the BLOB pointers of rec are zeroed out
+on the uncompressed page. */
+static
+void
+page_zip_clear_rec(
+/*===============*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in: record to clear */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ulint	heap_no;
+	page_t*	page	= page_align(rec);
+	/* page_zip_validate() would fail here if a record
+	containing externally stored columns is being deleted. */
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(!page_zip_dir_find(page_zip, page_offset(rec)));
+	ut_ad(page_zip_dir_find_free(page_zip, page_offset(rec)));
+	ut_ad(page_zip_header_cmp(page_zip, page));
+
+	heap_no = rec_get_heap_no_new(rec);
+	ut_ad(heap_no >= PAGE_HEAP_NO_USER_LOW);
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	if (
+#ifdef UNIV_ZIP_DEBUG
+	    !page_zip_clear_rec_disable &&
+#endif /* UNIV_ZIP_DEBUG */
+	    page_zip->m_end
+	    + 1 + ((heap_no - 1) >= 64)/* size of the log entry */
+	    + page_zip_get_trailer_len(page_zip,
+				       dict_index_is_clust(index), NULL)
+	    < page_zip_get_size(page_zip)) {
+		byte*	data;
+
+		/* Clear only the data bytes, because the allocator and
+		the decompressor depend on the extra bytes. */
+		memset(rec, 0, rec_offs_data_size(offsets));
+
+		if (!page_is_leaf(page)) {
+			/* Clear node_ptr on the compressed page. */
+			byte*	storage	= page_zip->data
+				+ page_zip_get_size(page_zip)
+				- (page_dir_get_n_heap(page)
+				   - PAGE_HEAP_NO_USER_LOW)
+				* PAGE_ZIP_DIR_SLOT_SIZE;
+
+			memset(storage - (heap_no - 1) * REC_NODE_PTR_SIZE,
+			       0, REC_NODE_PTR_SIZE);
+		} else if (dict_index_is_clust(index)) {
+			/* Clear trx_id and roll_ptr on the compressed page. */
+			byte*	storage	= page_zip->data
+				+ page_zip_get_size(page_zip)
+				- (page_dir_get_n_heap(page)
+				   - PAGE_HEAP_NO_USER_LOW)
+				* PAGE_ZIP_DIR_SLOT_SIZE;
+
+			memset(storage - (heap_no - 1)
+			       * (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+			       0, DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+		}
+
+		/* Log that the data was zeroed out.  The entry holds
+		heap_no - 1 in the same one- or two-byte encoding that
+		page_zip_write_rec() uses, but with the least
+		significant bit of the last byte set, and it is not
+		followed by any record bytes. */
+		data = page_zip->data + page_zip->m_end;
+		ut_ad(!*data);
+		if (UNIV_UNLIKELY(heap_no - 1 >= 64)) {
+			*data++ = (byte) (0x80 | (heap_no - 1) >> 7);
+			ut_ad(!*data);
+		}
+		*data++ = (byte) ((heap_no - 1) << 1 | 1);
+		ut_ad(!*data);
+		ut_ad((ulint) (data - page_zip->data)
+		      < page_zip_get_size(page_zip));
+		page_zip->m_end = data - page_zip->data;
+		page_zip->m_nonempty = TRUE;
+	} else if (page_is_leaf(page) && dict_index_is_clust(index)) {
+		/* Do not clear the record, because there is not enough space
+		to log the operation. */
+
+		if (rec_offs_any_extern(offsets)) {
+			ulint	i;
+
+			for (i = rec_offs_n_fields(offsets); i--; ) {
+				/* Clear all BLOB pointers in order to make
+				page_zip_validate() pass. */
+				if (rec_offs_nth_extern(offsets, i)) {
+					ulint	len;
+					byte*	field = rec_get_nth_field(
+						rec, offsets, i, &len);
+					memset(field + len
+					       - BTR_EXTERN_FIELD_REF_SIZE,
+					       0, BTR_EXTERN_FIELD_REF_SIZE);
+				}
+			}
+		}
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Mirror the "deleted" flag of a record into its dense directory slot on a
+compressed page.  The caller must already have updated the flag on the
+uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_deleted(
+/*=====================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the deleted flag (nonzero=TRUE) */
+{
+	/* The mask is shifted right by 8 because only the single byte
+	that the slot pointer addresses is modified. */
+	const ulint	del_bit	= PAGE_ZIP_DIR_SLOT_DEL >> 8;
+	byte*		dir_slot;
+
+	dir_slot = page_zip_dir_find(page_zip, page_offset(rec));
+	/* The record must have an entry in the dense directory. */
+	ut_a(dir_slot);
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	*dir_slot = (byte) (flag
+			    ? (*dir_slot | del_bit)
+			    : (*dir_slot & ~del_bit));
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page_align(rec)));
+#endif /* UNIV_ZIP_DEBUG */
+}
+
+/**********************************************************************//**
+Mirror the "owned" flag of a record into its dense directory slot on a
+compressed page.  The caller must already have written the n_owned field
+on the uncompressed page. */
+UNIV_INTERN
+void
+page_zip_rec_set_owned(
+/*===================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	rec,	/*!< in: record on the uncompressed page */
+	ulint		flag)	/*!< in: the owned flag (nonzero=TRUE) */
+{
+	/* The mask is shifted right by 8 because only the single byte
+	that the slot pointer addresses is modified. */
+	const ulint	owned_bit = PAGE_ZIP_DIR_SLOT_OWNED >> 8;
+	byte*		dir_slot;
+
+	dir_slot = page_zip_dir_find(page_zip, page_offset(rec));
+	/* The record must have an entry in the dense directory. */
+	ut_a(dir_slot);
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	*dir_slot = (byte) (flag
+			    ? (*dir_slot | owned_bit)
+			    : (*dir_slot & ~owned_bit));
+}
+
+/**********************************************************************//**
+Insert a record to the dense page directory.
+The slot of prev_rec is located first; then the slots between the
+insertion point and either the end of the directory (heap allocation) or
+the slot of free_rec (free-list allocation) are moved down by one slot,
+and the page offset of rec is written into the slot that was opened up
+immediately below the slot of prev_rec. */
+UNIV_INTERN
+void
+page_zip_dir_insert(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	const byte*	prev_rec,/*!< in: record after which to insert */
+	const byte*	free_rec,/*!< in: record from which rec was
+				allocated, or NULL */
+	byte*		rec)	/*!< in: record to insert */
+{
+	ulint	n_dense;
+	byte*	slot_rec;
+	byte*	slot_free;
+
+	ut_ad(prev_rec != rec);
+	ut_ad(page_rec_get_next((rec_t*) prev_rec) == rec);
+	ut_ad(page_zip_simple_validate(page_zip));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	/* Find the upper bound of the range to shift: the directory
+	is addressed downwards from the end of the compressed page. */
+	if (page_rec_is_infimum(prev_rec)) {
+		/* Use the first slot. */
+		slot_rec = page_zip->data + page_zip_get_size(page_zip);
+	} else {
+		byte*	end	= page_zip->data + page_zip_get_size(page_zip);
+		byte*	start	= end - page_zip_dir_user_size(page_zip);
+
+		if (UNIV_LIKELY(!free_rec)) {
+			/* PAGE_N_RECS was already incremented
+			in page_cur_insert_rec_zip(), but the
+			dense directory slot at that position
+			contains garbage.  Skip it. */
+			start += PAGE_ZIP_DIR_SLOT_SIZE;
+		}
+
+		slot_rec = page_zip_dir_find_low(start, end,
+						 page_offset(prev_rec));
+		/* prev_rec must be present in the dense directory. */
+		ut_a(slot_rec);
+	}
+
+	/* Read the old n_dense (n_heap may have been incremented). */
+	n_dense = page_dir_get_n_heap(page_zip->data)
+		- (PAGE_HEAP_NO_USER_LOW + 1);
+
+	/* Find the lower bound of the range to shift. */
+	if (UNIV_LIKELY_NULL(free_rec)) {
+		/* The record was allocated from the free list.
+		Shift the dense directory only up to that slot.
+		Note that in this case, n_dense is actually
+		off by one, because page_cur_insert_rec_zip()
+		did not increment n_heap. */
+		ut_ad(rec_get_heap_no_new(rec) < n_dense + 1
+		      + PAGE_HEAP_NO_USER_LOW);
+		ut_ad(rec >= free_rec);
+		slot_free = page_zip_dir_find(page_zip, page_offset(free_rec));
+		ut_ad(slot_free);
+		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+	} else {
+		/* The record was allocated from the heap.
+		Shift the entire dense directory. */
+		ut_ad(rec_get_heap_no_new(rec) == n_dense
+		      + PAGE_HEAP_NO_USER_LOW);
+
+		/* Shift to the end of the dense page directory. */
+		slot_free = page_zip->data + page_zip_get_size(page_zip)
+			- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+	}
+
+	/* Shift the dense directory to allocate place for rec.
+	The source and destination ranges overlap, hence memmove(). */
+	memmove(slot_free - PAGE_ZIP_DIR_SLOT_SIZE, slot_free,
+		slot_rec - slot_free);
+
+	/* Write the entry for the inserted record.
+	The "owned" and "deleted" flags must be zero. */
+	mach_write_to_2(slot_rec - PAGE_ZIP_DIR_SLOT_SIZE, page_offset(rec));
+}
+
+/**********************************************************************//**
+Shift the dense page directory and the array of BLOB pointers
+when a record is deleted.
+The function decrements PAGE_N_RECS, moves the slot of rec to the start
+of the free list in the dense directory, removes the BLOB pointers of the
+record (leaf pages of clustered indexes only), resets the byte at
+rec[-REC_N_NEW_EXTRA_BYTES] and finally calls page_zip_clear_rec(). */
+UNIV_INTERN
+void
+page_zip_dir_delete(
+/*================*/
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page */
+	byte*		rec,	/*!< in: record to delete */
+	dict_index_t*	index,	/*!< in: index of rec */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec) */
+	const byte*	free)	/*!< in: previous start of the free list */
+{
+	byte*	slot_rec;
+	byte*	slot_free;
+	ulint	n_ext;
+	page_t*	page	= page_align(rec);
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(rec_offs_comp(offsets));
+
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(rec, rec_offs_data_size(offsets));
+	UNIV_MEM_ASSERT_RW(rec - rec_offs_extra_size(offsets),
+			   rec_offs_extra_size(offsets));
+
+	slot_rec = page_zip_dir_find(page_zip, page_offset(rec));
+
+	/* The record being deleted must be in the dense directory. */
+	ut_a(slot_rec);
+
+	/* This could not be done before page_zip_dir_find(). */
+	page_header_set_field(page, page_zip, PAGE_N_RECS,
+			      (ulint)(page_get_n_recs(page) - 1));
+
+	if (UNIV_UNLIKELY(!free)) {
+		/* Make the last slot the start of the free list. */
+		slot_free = page_zip->data + page_zip_get_size(page_zip)
+			- PAGE_ZIP_DIR_SLOT_SIZE
+			* (page_dir_get_n_heap(page_zip->data)
+			   - PAGE_HEAP_NO_USER_LOW);
+	} else {
+		slot_free = page_zip_dir_find_free(page_zip,
+						   page_offset(free));
+		ut_a(slot_free < slot_rec);
+		/* Grow the free list by one slot by moving the start. */
+		slot_free += PAGE_ZIP_DIR_SLOT_SIZE;
+	}
+
+	/* Move the slots between the new free-list start and the slot
+	of rec up by one slot, overwriting the slot of rec.  Nothing
+	needs to move when the two positions coincide. */
+	if (UNIV_LIKELY(slot_rec > slot_free)) {
+		memmove(slot_free + PAGE_ZIP_DIR_SLOT_SIZE,
+			slot_free,
+			slot_rec - slot_free);
+	}
+
+	/* Write the entry for the deleted record.
+	The "owned" and "deleted" flags will be cleared. */
+	mach_write_to_2(slot_free, page_offset(rec));
+
+	/* Only leaf pages of clustered indexes are processed for
+	BLOB pointers; other pages must not have externally stored
+	columns. */
+	if (!page_is_leaf(page) || !dict_index_is_clust(index)) {
+		ut_ad(!rec_offs_any_extern(offsets));
+		goto skip_blobs;
+	}
+
+	n_ext = rec_offs_n_extern(offsets);
+	if (UNIV_UNLIKELY(n_ext)) {
+		/* Shift and zero fill the array of BLOB pointers. */
+		ulint	blob_no;
+		byte*	externs;
+		byte*	ext_end;
+
+		blob_no = page_zip_get_n_prev_extern(page_zip, rec, index);
+		ut_a(blob_no + n_ext <= page_zip->n_blobs);
+
+		/* The BLOB pointer array is addressed downwards from
+		below the dense directory and the trx_id,roll_ptr
+		entries of all heap records. */
+		externs = page_zip->data + page_zip_get_size(page_zip)
+			- (page_dir_get_n_heap(page) - PAGE_HEAP_NO_USER_LOW)
+			* (PAGE_ZIP_DIR_SLOT_SIZE
+			   + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+		/* ext_end is the lowest address of the array, computed
+		with the old n_blobs. */
+		ext_end = externs - page_zip->n_blobs
+			* BTR_EXTERN_FIELD_REF_SIZE;
+		externs -= blob_no * BTR_EXTERN_FIELD_REF_SIZE;
+
+		page_zip->n_blobs -= n_ext;
+		/* Shift and zero fill the array. */
+		memmove(ext_end + n_ext * BTR_EXTERN_FIELD_REF_SIZE, ext_end,
+			(page_zip->n_blobs - blob_no)
+			* BTR_EXTERN_FIELD_REF_SIZE);
+		memset(ext_end, 0, n_ext * BTR_EXTERN_FIELD_REF_SIZE);
+	}
+
+skip_blobs:
+	/* The compression algorithm expects info_bits and n_owned
+	to be 0 for deleted records. */
+	rec[-REC_N_NEW_EXTRA_BYTES] = 0; /* info_bits and n_owned */
+
+	page_zip_clear_rec(page_zip, rec, index, offsets);
+}
+
+/**********************************************************************//**
+Add a slot to the dense page directory.
+The uncompressed data that is stored immediately below the dense
+directory (node pointers, trx_id,roll_ptr entries, BLOB pointers) is
+moved towards lower addresses to make room for one more directory slot.
+The code expects n_heap on the compressed page to have been incremented
+already by the caller. */
+UNIV_INTERN
+void
+page_zip_dir_add_slot(
+/*==================*/
+	page_zip_des_t*	page_zip,	/*!< in/out: compressed page */
+	ulint		is_clustered)	/*!< in: nonzero for clustered index,
+					zero for others */
+{
+	ulint	n_dense;
+	byte*	dir;
+	byte*	stored;
+
+	ut_ad(page_is_comp(page_zip->data));
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	/* Read the old n_dense (n_heap has already been incremented). */
+	n_dense = page_dir_get_n_heap(page_zip->data)
+		- (PAGE_HEAP_NO_USER_LOW + 1);
+
+	/* dir is the lowest address of the old dense directory. */
+	dir = page_zip->data + page_zip_get_size(page_zip)
+		- PAGE_ZIP_DIR_SLOT_SIZE * n_dense;
+
+	/* stored is set to the lowest address of the area that the
+	final memmove() below relocates. */
+	if (!page_is_leaf(page_zip->data)) {
+		ut_ad(!page_zip->n_blobs);
+		stored = dir - n_dense * REC_NODE_PTR_SIZE;
+	} else if (UNIV_UNLIKELY(is_clustered)) {
+		/* Move the BLOB pointer array backwards to make space for the
+		roll_ptr and trx_id columns and the dense directory slot. */
+		byte*	externs;
+
+		stored = dir - n_dense
+			* (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+		externs = stored
+			- page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+		/* The destination area below the BLOB pointers is
+		asserted to be zero-filled before it is overwritten. */
+		ASSERT_ZERO(externs
+			    - (PAGE_ZIP_DIR_SLOT_SIZE
+			       + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+			    PAGE_ZIP_DIR_SLOT_SIZE
+			    + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+		memmove(externs - (PAGE_ZIP_DIR_SLOT_SIZE
+				   + DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN),
+			externs, stored - externs);
+	} else {
+		stored = dir
+			- page_zip->n_blobs * BTR_EXTERN_FIELD_REF_SIZE;
+		ASSERT_ZERO(stored - PAGE_ZIP_DIR_SLOT_SIZE,
+			    PAGE_ZIP_DIR_SLOT_SIZE);
+	}
+
+	/* Move the uncompressed area backwards to make space
+	for one directory slot. */
+	memmove(stored - PAGE_ZIP_DIR_SLOT_SIZE, stored, dir - stored);
+}
+
+/***********************************************************//**
+Parses a log record of writing to the header of a page.
+The record consists of a one-byte offset, a one-byte length and then
+that many bytes of data.  When a page is given, the data is applied both
+to the uncompressed page and to the compressed page at the same offset.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_write_header(
+/*========================*/
+	byte*		ptr,	/*!< in: redo log buffer */
+	byte*		end_ptr,/*!< in: redo log buffer end */
+	page_t*		page,	/*!< in/out: uncompressed page */
+	page_zip_des_t*	page_zip)/*!< in/out: compressed page */
+{
+	ulint	offset;
+	ulint	len;
+	byte*	payload;
+
+	ut_ad(ptr && end_ptr);
+	ut_ad(!page == !page_zip);
+
+	/* The fixed part of the record is two bytes long. */
+	if (UNIV_UNLIKELY(end_ptr < ptr + (1 + 1))) {
+
+		return(NULL);
+	}
+
+	offset = (ulint) ptr[0];
+	len = (ulint) ptr[1];
+	payload = ptr + (1 + 1);
+
+	/* An empty write, or one that reaches PAGE_DATA, is invalid. */
+	if (UNIV_UNLIKELY(!len) || UNIV_UNLIKELY(offset + len >= PAGE_DATA)) {
+
+		goto corrupt;
+	}
+
+	/* The payload has not been completely buffered yet. */
+	if (UNIV_UNLIKELY(end_ptr < payload + len)) {
+
+		return(NULL);
+	}
+
+	if (!page) {
+		/* Nothing to apply; only skip over the record. */
+		return(payload + len);
+	}
+
+	if (UNIV_UNLIKELY(!page_zip)) {
+
+		goto corrupt;
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	memcpy(page + offset, payload, len);
+	memcpy(page_zip->data + offset, payload, len);
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	return(payload + len);
+
+corrupt:
+	recv_sys->found_corrupt_log = TRUE;
+
+	return(NULL);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Write a log record of writing to the uncompressed header portion of a page.
+The record is an MLOG_ZIP_WRITE_HEADER initial record followed by a
+one-byte offset within the page, a one-byte length, and the data itself. */
+UNIV_INTERN
+void
+page_zip_write_header_log(
+/*======================*/
+	const byte*	data,	/*!< in: data on the uncompressed page */
+	ulint		length,	/*!< in: length of the data */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	byte*	log_ptr;
+	ulint	offset;
+
+	/* Reserve room for the initial record plus the two
+	one-byte fields written below. */
+	log_ptr = mlog_open(mtr, 11 + 1 + 1);
+	offset = page_offset(data);
+
+	ut_ad(offset < PAGE_DATA);
+	ut_ad(offset + length < PAGE_DATA);
+	/* Both offset and length are logged as single bytes. */
+#if PAGE_DATA > 255
+# error "PAGE_DATA > 255"
+#endif
+	ut_ad(length < 256);
+
+	if (UNIV_LIKELY(log_ptr != NULL)) {
+		log_ptr = mlog_write_initial_log_record_fast(
+			(byte*) data, MLOG_ZIP_WRITE_HEADER, log_ptr, mtr);
+
+		log_ptr[0] = (byte) offset;
+		log_ptr[1] = (byte) length;
+		mlog_close(mtr, log_ptr + 2);
+
+		mlog_catenate_string(mtr, data, length);
+	}
+
+	/* Otherwise no logging is requested and there is nothing to do. */
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Reorganize and compress a page.  This is a low-level operation for
+compressed pages, to be used when page_zip_compress() fails.
+On success, a redo log entry MLOG_ZIP_PAGE_COMPRESS will be written.
+The function btr_page_reorganize() should be preferred whenever possible.
+IMPORTANT: if page_zip_reorganize() is invoked on a leaf page of a
+non-clustered index, the caller must update the insert buffer free
+bits in the same mini-transaction in such a way that the modification
+will be redo-logged.
+
+Steps performed: the uncompressed page is copied to a temporary block,
+the page is recreated empty, the records are copied back from the
+temporary block (all with redo logging disabled), and then the rebuilt
+page is passed to page_zip_compress() with the original log mode restored.
+Record locks are moved only if the compression succeeds.
+@return TRUE on success, FALSE on failure; page_zip will be left
+intact on failure, but page will be overwritten. */
+UNIV_INTERN
+ibool
+page_zip_reorganize(
+/*================*/
+	buf_block_t*	block,	/*!< in/out: page with compressed page;
+				on the compressed page, in: size;
+				out: data, n_blobs,
+				m_start, m_end, m_nonempty */
+	dict_index_t*	index,	/*!< in: index of the B-tree node */
+	mtr_t*		mtr)	/*!< in: mini-transaction */
+{
+	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
+	page_t*		page		= buf_block_get_frame(block);
+	buf_block_t*	temp_block;
+	page_t*		temp_page;
+	ulint		log_mode;
+
+	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(page_is_comp(page));
+	ut_ad(!dict_index_is_ibuf(index));
+	/* Note that page_zip_validate(page_zip, page) may fail here. */
+	UNIV_MEM_ASSERT_RW(page, UNIV_PAGE_SIZE);
+	UNIV_MEM_ASSERT_RW(page_zip->data, page_zip_get_size(page_zip));
+
+	/* Disable logging */
+	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);
+
+	/* Obtain scratch space for the copy of the old page: a freshly
+	allocated block normally, a preallocated one under
+	UNIV_HOTBACKUP. */
+#ifndef UNIV_HOTBACKUP
+	temp_block = buf_block_alloc(0);
+	btr_search_drop_page_hash_index(block);
+	block->check_index_page_at_flush = TRUE;
+#else /* !UNIV_HOTBACKUP */
+	ut_ad(block == back_block1);
+	temp_block = back_block2;
+#endif /* !UNIV_HOTBACKUP */
+	temp_page = temp_block->frame;
+
+	/* Copy the old page to temporary space */
+	buf_frame_copy(temp_page, page);
+
+	/* Recreate the page: note that global data on page (possible
+	segment headers, next page-field, etc.) is preserved intact */
+
+	page_create(block, mtr, TRUE);
+
+	/* Copy the records from the temporary space to the recreated page;
+	do not copy the lock bits yet */
+
+	page_copy_rec_list_end_no_locks(block, temp_block,
+					page_get_infimum_rec(temp_page),
+					index, mtr);
+
+	if (!dict_index_is_clust(index) && page_is_leaf(temp_page)) {
+		/* Copy max trx id to recreated page */
+		trx_id_t	max_trx_id = page_get_max_trx_id(temp_page);
+		page_set_max_trx_id(block, NULL, max_trx_id, NULL);
+		ut_ad(!ut_dulint_is_zero(max_trx_id));
+	}
+
+	/* Restore logging. */
+	mtr_set_log_mode(mtr, log_mode);
+
+	if (UNIV_UNLIKELY(!page_zip_compress(page_zip, page, index, mtr))) {
+
+		/* Compression failed: release the scratch block (it is
+		not owned by this function under UNIV_HOTBACKUP) and
+		report failure without moving any locks. */
+#ifndef UNIV_HOTBACKUP
+		buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+		return(FALSE);
+	}
+
+	/* The old page image in temp_block is still needed here,
+	so the block is freed only afterwards. */
+	lock_move_reorganize_page(block, temp_block);
+
+#ifndef UNIV_HOTBACKUP
+	buf_block_free(temp_block);
+#endif /* !UNIV_HOTBACKUP */
+	return(TRUE);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Copy the records of a page byte for byte.  Do not copy the page header
+or trailer, except those B-tree header fields that are directly
+related to the storage of records.  Also copy PAGE_MAX_TRX_ID.
+Both the uncompressed and the compressed images are copied, the
+compressed page descriptor is overwritten with that of the source
+(keeping the destination data pointer), and a log record is written
+through page_zip_compress_write_log().
+NOTE: The caller must update the lock table and the adaptive hash index. */
+UNIV_INTERN
+void
+page_zip_copy_recs(
+/*===============*/
+	page_zip_des_t*		page_zip,	/*!< out: copy of src_zip
+						(n_blobs, m_start, m_end,
+						m_nonempty, data[0..size-1]) */
+	page_t*			page,		/*!< out: copy of src */
+	const page_zip_des_t*	src_zip,	/*!< in: compressed page */
+	const page_t*		src,		/*!< in: page */
+	dict_index_t*		index,		/*!< in: index of the B-tree */
+	mtr_t*			mtr)		/*!< in: mini-transaction */
+{
+	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(mtr_memo_contains_page(mtr, (page_t*) src, MTR_MEMO_PAGE_X_FIX));
+	ut_ad(!dict_index_is_ibuf(index));
+#ifdef UNIV_ZIP_DEBUG
+	/* The B-tree operations that call this function may set
+	FIL_PAGE_PREV or PAGE_LEVEL, causing a temporary min_rec_flag
+	mismatch.  A strict page_zip_validate() will be executed later
+	during the B-tree operations. */
+	ut_a(page_zip_validate_low(src_zip, src, TRUE));
+#endif /* UNIV_ZIP_DEBUG */
+	/* Source and destination must have the same compressed size,
+	because the compressed data is copied verbatim below. */
+	ut_a(page_zip_get_size(page_zip) == page_zip_get_size(src_zip));
+	if (UNIV_UNLIKELY(src_zip->n_blobs)) {
+		ut_a(page_is_leaf(src));
+		ut_a(dict_index_is_clust(index));
+	}
+
+	/* The PAGE_MAX_TRX_ID must be set on leaf pages of secondary
+	indexes.  It does not matter on other pages. */
+	ut_a(dict_index_is_clust(index) || !page_is_leaf(src)
+	     || !ut_dulint_is_zero(page_get_max_trx_id(src)));
+
+	UNIV_MEM_ASSERT_W(page, UNIV_PAGE_SIZE);
+	UNIV_MEM_ASSERT_W(page_zip->data, page_zip_get_size(page_zip));
+	UNIV_MEM_ASSERT_RW(src, UNIV_PAGE_SIZE);
+	UNIV_MEM_ASSERT_RW(src_zip->data, page_zip_get_size(page_zip));
+
+	/* Copy those B-tree page header fields that are related to
+	the records stored in the page.  Also copy the field
+	PAGE_MAX_TRX_ID.  Skip the rest of the page header and
+	trailer.  On the compressed page, there is no trailer. */
+#if PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END
+# error "PAGE_MAX_TRX_ID + 8 != PAGE_HEADER_PRIV_END"
+#endif
+	memcpy(PAGE_HEADER + page, PAGE_HEADER + src,
+	       PAGE_HEADER_PRIV_END);
+	memcpy(PAGE_DATA + page, PAGE_DATA + src,
+	       UNIV_PAGE_SIZE - PAGE_DATA - FIL_PAGE_DATA_END);
+	memcpy(PAGE_HEADER + page_zip->data, PAGE_HEADER + src_zip->data,
+	       PAGE_HEADER_PRIV_END);
+	memcpy(PAGE_DATA + page_zip->data, PAGE_DATA + src_zip->data,
+	       page_zip_get_size(page_zip) - PAGE_DATA);
+
+	/* Copy all fields of src_zip to page_zip, except the pointer
+	to the compressed data page. */
+	{
+		page_zip_t*	data = page_zip->data;
+		memcpy(page_zip, src_zip, sizeof *page_zip);
+		page_zip->data = data;
+	}
+	ut_ad(page_zip_get_trailer_len(page_zip,
+				       dict_index_is_clust(index), NULL)
+	      + page_zip->m_end < page_zip_get_size(page_zip));
+
+	/* If the source is a non-leaf page without a predecessor but
+	the destination has one, the first user record of the copy
+	must not carry the REC_INFO_MIN_REC_FLAG. */
+	if (!page_is_leaf(src)
+	    && UNIV_UNLIKELY(mach_read_from_4(src + FIL_PAGE_PREV) == FIL_NULL)
+	    && UNIV_LIKELY(mach_read_from_4(page
+					    + FIL_PAGE_PREV) != FIL_NULL)) {
+		/* Clear the REC_INFO_MIN_REC_FLAG of the first user record. */
+		ulint	offs = rec_get_next_offs(page + PAGE_NEW_INFIMUM,
+						 TRUE);
+		if (UNIV_LIKELY(offs != PAGE_NEW_SUPREMUM)) {
+			rec_t*	rec = page + offs;
+			ut_a(rec[-REC_N_NEW_EXTRA_BYTES]
+			     & REC_INFO_MIN_REC_FLAG);
+			rec[-REC_N_NEW_EXTRA_BYTES] &= ~ REC_INFO_MIN_REC_FLAG;
+		}
+	}
+
+#ifdef UNIV_ZIP_DEBUG
+	ut_a(page_zip_validate(page_zip, page));
+#endif /* UNIV_ZIP_DEBUG */
+
+	page_zip_compress_write_log(page_zip, page, index, mtr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Parses a log record of compressing an index page.
+The record, as consumed here, consists of a 2-byte size, a 2-byte
+trailer_size, 4 bytes for FIL_PAGE_PREV, 4 bytes for FIL_PAGE_NEXT,
+size bytes that are copied to the compressed page starting at
+FIL_PAGE_TYPE, and trailer_size bytes that are copied to the very end of
+the compressed page.  The gap between the two copied areas is zero-filled
+and the page is then decompressed into the uncompressed frame.
+A record whose size and trailer_size do not fit in the compressed page
+is treated as corrupt instead of being applied.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+page_zip_parse_compress(
+/*====================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	page_t*		page,	/*!< out: uncompressed page */
+	page_zip_des_t*	page_zip)/*!< out: compressed page */
+{
+	ulint	size;
+	ulint	trailer_size;
+
+	ut_ad(ptr && end_ptr);
+	ut_ad(!page == !page_zip);
+
+	/* The two length fields must be completely buffered. */
+	if (UNIV_UNLIKELY(ptr + (2 + 2) > end_ptr)) {
+
+		return(NULL);
+	}
+
+	size = mach_read_from_2(ptr);
+	ptr += 2;
+	trailer_size = mach_read_from_2(ptr);
+	ptr += 2;
+
+	/* The rest of the record must be completely buffered. */
+	if (UNIV_UNLIKELY(ptr + 8 + size + trailer_size > end_ptr)) {
+
+		return(NULL);
+	}
+
+	if (page) {
+		/* Both lengths come from the log and are at most 65535
+		each, so the sum below cannot wrap.  Without this bound,
+		the memset() length below would underflow and the copies
+		could write outside page_zip->data. */
+		if (UNIV_UNLIKELY(!page_zip)
+		    || UNIV_UNLIKELY(FIL_PAGE_TYPE + size + trailer_size
+				     > page_zip_get_size(page_zip))) {
+corrupt:
+			recv_sys->found_corrupt_log = TRUE;
+
+			return(NULL);
+		}
+
+		memcpy(page_zip->data + FIL_PAGE_PREV, ptr, 4);
+		memcpy(page_zip->data + FIL_PAGE_NEXT, ptr + 4, 4);
+		memcpy(page_zip->data + FIL_PAGE_TYPE, ptr + 8, size);
+		/* Zero-fill the area between the copied data and
+		the copied trailer. */
+		memset(page_zip->data + FIL_PAGE_TYPE + size, 0,
+		       page_zip_get_size(page_zip) - trailer_size
+		       - (FIL_PAGE_TYPE + size));
+		memcpy(page_zip->data + page_zip_get_size(page_zip)
+		       - trailer_size, ptr + 8 + size, trailer_size);
+
+		if (UNIV_UNLIKELY(!page_zip_decompress(page_zip, page,
+						       TRUE))) {
+
+			goto corrupt;
+		}
+	}
+
+	return(ptr + 8 + size + trailer_size);
+}
+
+/**********************************************************************//**
+Calculate the compressed page checksum.
+The checksum is an Adler-32 sum computed over three byte ranges of the
+page, chained in this order: FIL_PAGE_OFFSET up to FIL_PAGE_LSN, the
+two bytes at FIL_PAGE_TYPE, and everything from
+FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID to the end of the page.
+@return	page checksum */
+UNIV_INTERN
+ulint
+page_zip_calc_checksum(
+/*===================*/
+	const void*	data,	/*!< in: compressed page */
+	ulint		size)	/*!< in: size of compressed page */
+{
+	const Bytef*	bytes	= data;
+	uLong		sum;
+
+	ut_ad(size > FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+
+	/* FIL_PAGE_SPACE_OR_CHKSUM, FIL_PAGE_LSN and
+	FIL_PAGE_FILE_FLUSH_LSN are left out of the checksum. */
+
+	sum = adler32(0L, bytes + FIL_PAGE_OFFSET,
+		      FIL_PAGE_LSN - FIL_PAGE_OFFSET);
+
+	sum = adler32(sum, bytes + FIL_PAGE_TYPE, 2);
+
+	return((ulint) adler32(sum,
+			       bytes + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID,
+			       size - FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID));
+}
diff --git a/storage/xtradb/pars/lexyy.c b/storage/xtradb/pars/lexyy.c
new file mode 100644
index 00000000000..fc6b5102581
--- /dev/null
+++ b/storage/xtradb/pars/lexyy.c
@@ -0,0 +1,2795 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+#include "univ.i"
+#line 2 "lexyy.c"
+
+#line 4 "lexyy.c"
+
+#define  YY_INT_ALIGNED short int
+
+/* A lexical scanner generated by flex */
+
+#define FLEX_SCANNER
+#define YY_FLEX_MAJOR_VERSION 2
+#define YY_FLEX_MINOR_VERSION 5
+#define YY_FLEX_SUBMINOR_VERSION 31
+#if YY_FLEX_SUBMINOR_VERSION > 0
+#define FLEX_BETA
+#endif
+
+/* First, we deal with  platform-specific or compiler-specific issues. */
+
+/* begin standard C headers. */
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <stdlib.h>
+
+/* end standard C headers. */
+
+/* flex integer type definitions */
+
+/* Fixed-width integer typedefs (flex_int8_t ... flex_uint32_t) and
+ * fallback definitions of the INTn_MIN / INTn_MAX / UINTn_MAX limits.
+ * The whole section is guarded by FLEXINT_H so that it is processed
+ * only once.
+ */
+#ifndef FLEXINT_H
+#define FLEXINT_H
+
+/* C99 systems have <inttypes.h>. Non-C99 systems may or may not. */
+
+#if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+#include <inttypes.h>
+typedef int8_t flex_int8_t;
+typedef uint8_t flex_uint8_t;
+typedef int16_t flex_int16_t;
+typedef uint16_t flex_uint16_t;
+typedef int32_t flex_int32_t;
+typedef uint32_t flex_uint32_t;
+#else
+/* Without C99, map the names onto the basic integer types. */
+typedef signed char flex_int8_t;
+typedef short int flex_int16_t;
+typedef int flex_int32_t;
+typedef unsigned char flex_uint8_t; 
+typedef unsigned short int flex_uint16_t;
+typedef unsigned int flex_uint32_t;
+#endif /* ! C99 */
+
+/* Limits of integral types.  Each one is defined only if the included
+ * headers have not defined it already.
+ */
+#ifndef INT8_MIN
+#define INT8_MIN               (-128)
+#endif
+#ifndef INT16_MIN
+#define INT16_MIN              (-32767-1)
+#endif
+#ifndef INT32_MIN
+#define INT32_MIN              (-2147483647-1)
+#endif
+#ifndef INT8_MAX
+#define INT8_MAX               (127)
+#endif
+#ifndef INT16_MAX
+#define INT16_MAX              (32767)
+#endif
+#ifndef INT32_MAX
+#define INT32_MAX              (2147483647)
+#endif
+#ifndef UINT8_MAX
+#define UINT8_MAX              (255U)
+#endif
+#ifndef UINT16_MAX
+#define UINT16_MAX             (65535U)
+#endif
+#ifndef UINT32_MAX
+#define UINT32_MAX             (4294967295U)
+#endif
+
+#endif /* ! FLEXINT_H */
+
+#ifdef __cplusplus
+
+/* The "const" storage-class-modifier is valid. */
+#define YY_USE_CONST
+
+#else	/* ! __cplusplus */
+
+#if __STDC__
+
+#define YY_USE_CONST
+
+#endif	/* __STDC__ */
+#endif	/* ! __cplusplus */
+
+#ifdef YY_USE_CONST
+#define yyconst const
+#else
+#define yyconst
+#endif
+
+/* Returned upon end-of-file. */
+#define YY_NULL 0
+
+/* Promotes a possibly negative, possibly signed char to an unsigned
+ * integer for use as an array index.  If the signed char is negative,
+ * we want to instead treat it as an 8-bit unsigned char, hence the
+ * double cast.
+ */
+#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c)
+
+/* Enter a start condition.  This macro really ought to take a parameter,
+ * but we do it the disgusting crufty way forced on us by the ()-less
+ * definition of BEGIN.
+ */
+#define BEGIN (yy_start) = 1 + 2 *
+
+/* Translate the current start state into a value that can be later handed
+ * to BEGIN to return to the state.  The YYSTATE alias is for lex
+ * compatibility.
+ */
+#define YY_START (((yy_start) - 1) / 2)
+#define YYSTATE YY_START
+
+/* Action number for EOF rule of a given start state. */
+#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1)
+
+/* Special action meaning "start processing a new file". */
+#define YY_NEW_FILE yyrestart(yyin  )
+
+#define YY_END_OF_BUFFER_CHAR 0
+
+/* Size of default input buffer. */
+#ifndef YY_BUF_SIZE
+#define YY_BUF_SIZE 16384
+#endif
+
+#ifndef YY_TYPEDEF_YY_BUFFER_STATE
+#define YY_TYPEDEF_YY_BUFFER_STATE
+typedef struct yy_buffer_state *YY_BUFFER_STATE;
+#endif
+
+static int yyleng;
+
+static FILE *yyin, *yyout;
+
+#define EOB_ACT_CONTINUE_SCAN 0
+#define EOB_ACT_END_OF_FILE 1
+#define EOB_ACT_LAST_MATCH 2
+
+    #define YY_LESS_LINENO(n)
+    
+/* Return all but the first "n" matched characters back to the input stream.
+ * The macro restores the character saved in yy_hold_char, moves the
+ * buffer position to n characters past the start of the match (less
+ * YY_MORE_ADJ) and then sets up yytext again.  It refers to yy_cp and
+ * yy_bp, which must be in scope where the macro is expanded.  The
+ * argument is evaluated exactly once.
+ */
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		*yy_cp = (yy_hold_char); \
+		YY_RESTORE_YY_MORE_OFFSET \
+		(yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \
+		YY_DO_BEFORE_ACTION; /* set up yytext again */ \
+		} \
+	while ( 0 )
+
+#define unput(c) yyunput( c, (yytext_ptr)  )
+
+/* The following is because we cannot portably get our hands on size_t
+ * (without autoconf's help, which isn't available because we want
+ * flex-generated scanners to compile on their own).
+ */
+
+#ifndef YY_TYPEDEF_YY_SIZE_T
+#define YY_TYPEDEF_YY_SIZE_T
+typedef unsigned int yy_size_t;
+#endif
+
+/* State of one scanner input buffer.  YY_BUFFER_STATE is a pointer to
+ * this structure.
+ */
+#ifndef YY_STRUCT_YY_BUFFER_STATE
+#define YY_STRUCT_YY_BUFFER_STATE
+struct yy_buffer_state
+	{
+	/* Presumably the stdio stream this buffer is filled from. */
+	FILE *yy_input_file;
+
+	char *yy_ch_buf;		/* input buffer */
+	char *yy_buf_pos;		/* current position in input buffer */
+
+	/* Size of input buffer in bytes, not including room for EOB
+	 * characters.
+	 */
+	yy_size_t yy_buf_size;
+
+	/* Number of characters read into yy_ch_buf, not including EOB
+	 * characters.
+	 */
+	int yy_n_chars;
+
+	/* Whether we "own" the buffer - i.e., we know we created it,
+	 * and can realloc() it to grow it, and should free() it to
+	 * delete it.
+	 */
+	int yy_is_our_buffer;
+
+	/* Whether this is an "interactive" input source; if so, and
+	 * if we're using stdio for input, then we want to use getc()
+	 * instead of fread(), to make sure we stop fetching input after
+	 * each newline.
+	 */
+	int yy_is_interactive;
+
+	/* Whether we're considered to be at the beginning of a line.
+	 * If so, '^' rules will be active on the next match, otherwise
+	 * not.
+	 */
+	int yy_at_bol;
+
+    int yy_bs_lineno; /**< The line count. */
+    int yy_bs_column; /**< The column count. */
+    
+	/* Whether to try to fill the input buffer when we reach the
+	 * end of it.
+	 */
+	int yy_fill_buffer;
+
+	/* Presumably one of the YY_BUFFER_* values defined below. */
+	int yy_buffer_status;
+
+#define YY_BUFFER_NEW 0
+#define YY_BUFFER_NORMAL 1
+	/* When an EOF's been seen but there's still some text to process
+	 * then we mark the buffer as YY_EOF_PENDING, to indicate that we
+	 * shouldn't try reading from the input source any more.  We might
+	 * still have a bunch of tokens to match, though, because of
+	 * possible backing-up.
+	 *
+	 * When we actually see the EOF, we change the status to "new"
+	 * (via yyrestart()), so that the user can continue scanning by
+	 * just pointing yyin at a new input file.
+	 */
+#define YY_BUFFER_EOF_PENDING 2
+
+	};
+#endif /* !YY_STRUCT_YY_BUFFER_STATE */
+
+/* Stack of input buffers. */
+static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */
+static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */
+static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */
+
+/* We provide macros for accessing buffer states in case in the
+ * future we want to put the buffer states in a more general
+ * "scanner state".
+ *
+ * Returns the top of the stack, or NULL.
+ */
+#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \
+                          ? (yy_buffer_stack)[(yy_buffer_stack_top)] \
+                          : NULL)
+
+/* Same as previous macro, but useful when we know that the buffer stack is not
+ * NULL or when we need an lvalue. For internal use only.
+ */
+#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)]
+
+/* yy_hold_char holds the character lost when yytext is formed. */
+static char yy_hold_char;
+static int yy_n_chars;		/* number of characters read into yy_ch_buf */
+static int yyleng;
+
+/* Points to current character in buffer. */
+static char *yy_c_buf_p = (char *) 0;
+static int yy_init = 1;		/* whether we need to initialize */
+static int yy_start = 0;	/* start state number */
+
+/* Flag which is used to allow yywrap()'s to do buffer switches
+ * instead of setting up a fresh yyin.  A bit of a hack ...
+ */
+static int yy_did_buffer_switch_on_eof;
+
+static void yyrestart (FILE *input_file  );
+__attribute__((unused)) static void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer  );
+static YY_BUFFER_STATE yy_create_buffer (FILE *file,int size  );
+static void yy_delete_buffer (YY_BUFFER_STATE b  );
+static void yy_flush_buffer (YY_BUFFER_STATE b  );
+__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer  );
+__attribute__((unused)) static void yypop_buffer_state (void );
+
+static void yyensure_buffer_stack (void );
+static void yy_load_buffer_state (void );
+static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file  );
+
+#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER )
+
+YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size  );
+YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str  );
+YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len  );
+
+static void *yyalloc (yy_size_t  );
+static void *yyrealloc (void *,yy_size_t  );
+static void yyfree (void *  );
+
+#define yy_new_buffer yy_create_buffer
+
+/* Set the yy_is_interactive flag of the current buffer.  If there is no
+ * current buffer yet, the buffer stack is set up and a buffer of
+ * YY_BUF_SIZE bytes is created on yyin first.
+ */
+#define yy_set_interactive(is_interactive) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){ \
+        yyensure_buffer_stack (); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            yy_create_buffer(yyin,YY_BUF_SIZE ); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \
+	}
+
+/* Set the yy_at_bol (beginning-of-line) flag of the current buffer,
+ * creating the buffer first in the same way if none exists yet.
+ */
+#define yy_set_bol(at_bol) \
+	{ \
+	if ( ! YY_CURRENT_BUFFER ){\
+        yyensure_buffer_stack (); \
+		YY_CURRENT_BUFFER_LVALUE =    \
+            yy_create_buffer(yyin,YY_BUF_SIZE ); \
+	} \
+	YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \
+	}
+
+/* Read the yy_at_bol flag of the current buffer.  This uses the LVALUE
+ * form of the accessor, which does not check the buffer stack for NULL.
+ */
+#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol)
+
+/* Begin user sect3 */
+
+#define yywrap(n) 1
+#define YY_SKIP_YYWRAP
+
+typedef unsigned char YY_CHAR;
+
+static FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0;
+
+typedef int yy_state_type;
+
+static int yylineno;
+
+static int yylineno = 1;
+
+static char *yytext;
+#define yytext_ptr yytext
+
+static yy_state_type yy_get_previous_state (void );
+static yy_state_type yy_try_NUL_trans (yy_state_type current_state  );
+static int yy_get_next_buffer (void );
+static void yy_fatal_error (yyconst char msg[]  );
+
+/* Done after the current pattern has been matched and before the
+ * corresponding action - sets up yytext.
+ *
+ * Uses the local pointers yy_bp (start of match) and yy_cp (one past the
+ * end of match) of the enclosing scanner function:
+ *   - yytext is pointed at yy_bp and yyleng is set to yy_cp - yy_bp;
+ *   - the character at yy_cp is saved in yy_hold_char and overwritten with
+ *     NUL so that yytext can be used as a C string (the scanner loop writes
+ *     the saved character back before starting the next match);
+ *   - yy_c_buf_p is moved to yy_cp.
+ */
+#define YY_DO_BEFORE_ACTION \
+	(yytext_ptr) = yy_bp; \
+	yyleng = (size_t) (yy_cp - yy_bp); \
+	(yy_hold_char) = *yy_cp; \
+	*yy_cp = '\0'; \
+	(yy_c_buf_p) = yy_cp;
+
+#define YY_NUM_RULES 119
+#define YY_END_OF_BUFFER 120
+/* This struct is not used in this scanner,
+   but its presence is necessary. */
+struct yy_trans_info
+	{
+	flex_int32_t yy_verify;
+	flex_int32_t yy_nxt;
+	};
+static yyconst flex_int16_t yy_accept[399] =
+    {   0,
+        0,    0,  114,  114,    0,    0,    0,    0,  120,  118,
+      117,  117,    8,  118,  109,    5,   98,  104,  107,  105,
+      102,  106,  118,  108,    1,  118,  103,  101,   99,  100,
+      112,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+      110,  111,  114,  115,    6,    7,    9,   10,  117,    4,
+       93,  113,    2,    1,    3,   94,   95,   97,   96,   92,
+       92,   92,   92,   92,   92,   44,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+       92,   92,   28,   17,   25,   92,   92,   92,   92,   92,
+
+       54,   61,   92,   14,   92,   92,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+       92,   92,  114,  115,  115,  116,    6,    7,    9,   10,
+        2,   13,   45,   92,   92,   92,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+       92,   27,   92,   92,   92,   41,   92,   92,   92,   92,
+       21,   92,   92,   92,   92,   15,   92,   92,   92,   18,
+       92,   92,   92,   92,   92,   80,   92,   92,   92,   51,
+       92,   12,   92,   36,   92,   92,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   20,   24,
+
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   92,
+       46,   92,   92,   30,   92,   87,   92,   92,   39,   92,
+       92,   92,   92,   92,   48,   92,   89,   32,   91,   92,
+       11,   64,   92,   92,   92,   42,   92,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   29,   92,   92,   92,
+       92,   92,   92,   92,   92,   92,   85,   92,   26,   92,
+       66,   92,   92,   92,   37,   92,   92,   92,   92,   92,
+       92,   92,   31,   65,   23,   92,   57,   92,   75,   92,
+       92,   92,   43,   92,   92,   92,   92,   92,   92,   92,
+       92,   90,   92,   92,   56,   92,   92,   92,   92,   92,
+
+       92,   92,   40,   33,   79,   19,   92,   83,   74,   55,
+       92,   63,   92,   52,   92,   92,   92,   47,   92,   76,
+       92,   78,   92,   92,   34,   92,   92,   92,   35,   72,
+       92,   92,   92,   92,   58,   92,   50,   49,   92,   92,
+       53,   62,   92,   92,   92,   22,   92,   92,   73,   81,
+       92,   92,   77,   92,   68,   92,   92,   92,   92,   38,
+       92,   88,   67,   92,   84,   92,   92,   92,   86,   92,
+       59,   92,   16,   92,   70,   69,   92,   92,   82,   92,
+       92,   92,   92,   92,   92,   92,   92,   92,   92,   71,
+       92,   92,   92,   92,   92,   92,   60,    0
+
+    } ;
+
+static yyconst flex_int32_t yy_ec[256] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    2,    3,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    2,    1,    4,    1,    5,    6,    1,    7,    8,
+        9,   10,   11,   12,   13,   14,   15,   16,   16,   16,
+       16,   16,   16,   16,   16,   16,   16,   17,   18,   19,
+       20,   21,   22,    1,   23,   24,   25,   26,   27,   28,
+       29,   30,   31,   32,   33,   34,   35,   36,   37,   38,
+       39,   40,   41,   42,   43,   44,   45,   46,   47,   32,
+        1,    1,    1,    1,   48,    1,   32,   32,   32,   32,
+
+       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
+       32,   32,   32,   32,   32,   32,   32,   32,   32,   32,
+       32,   32,   49,    1,   50,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1
+    } ;
+
+/* Generated table with 51 entries, indexed by the equivalence class that
+ * yy_ec assigned to the current input character.  The matching loop in
+ * yylex replaces the class yy_c with yy_meta[yy_c] whenever the fallback
+ * state taken from yy_def is 399 or above.  Do not edit by hand.
+ */
+static yyconst flex_int32_t yy_meta[51] =
+    {   0,
+        1,    1,    1,    2,    1,    1,    3,    1,    1,    4,
+        1,    1,    1,    1,    1,    5,    1,    1,    1,    6,
+        1,    1,    5,    5,    5,    5,    5,    5,    5,    5,
+        5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
+        5,    5,    5,    5,    5,    5,    5,    5,    1,    1
+    } ;
+
+static yyconst flex_int16_t yy_base[409] =
+    {   0,
+        0,    0,  437,  436,  438,  437,  439,  438,  441,  448,
+       49,   51,  448,    0,  448,  448,  448,  448,  448,  448,
+      448,  448,  426,  429,   41,  418,  448,   38,  448,  417,
+      448,   20,   33,   32,   46,   40,   44,    0,   54,   52,
+      399,   48,   60,  395,   65,   67,   81,   27,  411,   75,
+      448,  448,    0,   98,    0,  426,    0,  428,  113,    0,
+      448,  448,  415,   54,  410,  448,  448,  448,  448,    0,
+      403,   68,  399,  391,  389,    0,  402,   80,   84,  397,
+      383,   96,  381,  394,  379,  393,  387,  375,  379,  375,
+      377,  377,    0,   98,    0,  376,   97,  385,  368,  375,
+
+        0,    0,  381,  381,  364,   94,  103,  379,   98,   65,
+      381,  369,  109,  361,  377,  373,  351,   97,  372,  363,
+      115,  356,    0,  137,  138,  448,    0,  388,    0,  390,
+      377,    0,    0,  365,  360,  367,  365,  348,  346,  345,
+      350,  359,  347,  359,   95,  347,  353,  354,  336,  336,
+      123,    0,  334,  350,  351,    0,  338,  347,  344,  122,
+      124,  341,  336,  330,  340,  338,  331,  328,  336,    0,
+      326,  336,  334,  325,  315,  309,  322,  307,  327,    0,
+      313,    0,  311,    0,  325,  316,  313,  131,  309,  316,
+      323,  302,  304,  309,  309,  301,  304,  299,    0,    0,
+
+      311,  295,  305,  312,  292,  291,  305,  294,  307,  287,
+        0,  297,  279,    0,  298,    0,  295,  282,    0,  281,
+      276,  281,  280,  290,    0,  276,    0,    0,    0,  280,
+        0,    0,  276,  273,  287,    0,  272,  272,  270,  286,
+      271,  283,  280,  264,  282,  277,    0,  272,  272,  258,
+      257,  270,  256,  270,  269,  268,    0,  252,    0,  246,
+        0,  265,  249,  248,    0,  262,  252,  247,  246,  258,
+      248,  247,    0,    0,    0,  251,    0,  239,    0,  253,
+      249,  235,    0,  249,  250,  233,  238,  231,  249,  231,
+      228,    0,  229,  226,    0,  231,  243,  230,  237,  227,
+
+      235,  220,    0,    0,    0,  212,  219,    0,    0,    0,
+      216,    0,  230,    0,  231,  218,  217,    0,  213,    0,
+      216,    0,  208,  210,    0,  209,  223,  216,    0,    0,
+      219,  222,  204,  219,    0,  215,    0,    0,  199,  213,
+        0,    0,  197,  196,  201,    0,  210,  195,    0,    0,
+      201,  197,    0,  192,    0,  204,  204,  192,  202,    0,
+      179,    0,    0,  199,    0,  183,  177,  183,    0,  174,
+        0,  193,    0,  192,    0,    0,  183,  187,    0,  174,
+      174,  180,  166,  189,  181,  180,  166,  151,  118,    0,
+      130,  136,  127,  123,  119,  111,    0,  448,  167,  173,
+
+      179,  152,  181,  124,  187,  193,  199,  205
+    } ;
+
+static yyconst flex_int16_t yy_def[409] =
+    {   0,
+      398,    1,  399,  399,  400,  400,  401,  401,  398,  398,
+      398,  398,  398,  402,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  403,  398,  398,  398,  398,
+      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      398,  398,  405,  406,  407,  398,  408,  398,  398,  402,
+      398,  398,  398,  398,  403,  398,  398,  398,  398,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  405,  406,  406,  398,  407,  398,  408,  398,
+      398,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,  404,  404,  404,
+      404,  404,  404,  404,  404,  404,  404,    0,  398,  398,
+
+      398,  398,  398,  398,  398,  398,  398,  398
+    } ;
+
+static yyconst flex_int16_t yy_nxt[499] =
+    {   0,
+       10,   11,   12,   13,   14,   15,   16,   17,   18,   19,
+       20,   21,   22,   23,   24,   25,   26,   27,   28,   29,
+       30,   31,   32,   33,   34,   35,   36,   37,   38,   38,
+       39,   38,   38,   40,   41,   42,   43,   44,   38,   45,
+       46,   47,   48,   49,   50,   38,   38,   38,   51,   52,
+       59,   59,   59,   59,   63,   71,   64,   67,   68,   73,
+       72,   77,  118,   74,  119,   78,   75,   63,   79,   64,
+       88,   80,   82,   85,   81,   86,   83,   89,   96,   76,
+       90,   93,   84,   91,   99,   87,   92,  101,   97,   94,
+      100,  107,  133,  110,   95,  102,  111,  103,  179,  104,
+
+      108,  109,  105,  115,  121,  112,  180,  125,  134,  113,
+      116,  122,  126,  114,   59,   59,  139,  117,  141,  142,
+      146,  163,  140,  159,  171,  173,  143,  189,   70,  147,
+      172,  177,  183,  164,  207,  208,  148,  190,  160,  161,
+      174,  193,  178,  184,  175,  194,  398,  125,  222,  214,
+      224,  398,  126,  215,  248,  249,   60,  397,  396,  395,
+      225,  394,  393,  223,  392,  391,  250,   53,   53,   53,
+       53,   53,   53,   55,   55,   55,   55,   55,   55,   57,
+       57,   57,   57,   57,   57,   65,   65,  123,  123,  123,
+      390,  123,  123,  124,  124,  124,  124,  124,  124,  127,
+
+      127,  389,  127,  127,  127,  129,  388,  129,  129,  129,
+      129,  387,  386,  385,  384,  383,  382,  381,  380,  379,
+      378,  377,  376,  375,  374,  373,  372,  371,  370,  369,
+      368,  367,  366,  365,  364,  363,  362,  361,  360,  359,
+      358,  357,  356,  355,  354,  353,  352,  351,  350,  349,
+      348,  347,  346,  345,  344,  343,  342,  341,  340,  339,
+      338,  337,  336,  335,  334,  333,  332,  331,  330,  329,
+      328,  327,  326,  325,  324,  323,  322,  321,  320,  319,
+      318,  317,  316,  315,  314,  313,  312,  311,  310,  309,
+      308,  307,  306,  305,  304,  303,  302,  301,  300,  299,
+
+      298,  297,  296,  295,  294,  293,  292,  291,  290,  289,
+      288,  287,  286,  285,  284,  283,  282,  281,  280,  279,
+      278,  277,  276,  275,  274,  273,  272,  271,  270,  269,
+      268,  267,  266,  265,  264,  263,  262,  261,  260,  259,
+      258,  257,  256,  255,  254,  253,  252,  251,  247,  246,
+      245,  244,  243,  242,  241,  240,  239,  238,  237,  236,
+      235,  234,  233,  232,  231,  230,  229,  228,  227,  226,
+      221,  220,  219,  218,  217,  216,  213,  212,  211,  210,
+      209,  206,  205,  204,  203,  202,  201,  200,  199,  198,
+      197,  196,  131,  130,  128,  195,  192,  191,  188,  187,
+
+      186,  185,  182,  181,  176,  170,  169,  168,  167,  166,
+      165,  162,  158,  157,  156,  155,  154,  153,  152,  151,
+      150,  149,  145,  144,  138,  137,  136,  135,  132,  398,
+      131,  130,  128,  120,  106,   98,   69,   66,   62,   61,
+      398,   58,   58,   56,   56,   54,   54,    9,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398
+
+    } ;
+
+static yyconst flex_int16_t yy_chk[499] =
+    {   0,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+        1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
+       11,   11,   12,   12,   25,   32,   25,   28,   28,   33,
+       32,   34,   48,   33,   48,   34,   33,   64,   34,   64,
+       37,   34,   35,   36,   34,   36,   35,   37,   40,   33,
+       37,   39,   35,   37,   42,   36,   37,   43,   40,   39,
+       42,   45,   72,   46,   39,   43,   46,   43,  110,   43,
+
+       45,   45,   43,   47,   50,   46,  110,   54,   72,   46,
+       47,   50,   54,   46,   59,   59,   78,   47,   79,   79,
+       82,   97,   78,   94,  106,  107,   79,  118,  404,   82,
+      106,  109,  113,   97,  145,  145,   82,  118,   94,   94,
+      107,  121,  109,  113,  107,  121,  124,  125,  160,  151,
+      161,  124,  125,  151,  188,  188,  402,  396,  395,  394,
+      161,  393,  392,  160,  391,  389,  188,  399,  399,  399,
+      399,  399,  399,  400,  400,  400,  400,  400,  400,  401,
+      401,  401,  401,  401,  401,  403,  403,  405,  405,  405,
+      388,  405,  405,  406,  406,  406,  406,  406,  406,  407,
+
+      407,  387,  407,  407,  407,  408,  386,  408,  408,  408,
+      408,  385,  384,  383,  382,  381,  380,  378,  377,  374,
+      372,  370,  368,  367,  366,  364,  361,  359,  358,  357,
+      356,  354,  352,  351,  348,  347,  345,  344,  343,  340,
+      339,  336,  334,  333,  332,  331,  328,  327,  326,  324,
+      323,  321,  319,  317,  316,  315,  313,  311,  307,  306,
+      302,  301,  300,  299,  298,  297,  296,  294,  293,  291,
+      290,  289,  288,  287,  286,  285,  284,  282,  281,  280,
+      278,  276,  272,  271,  270,  269,  268,  267,  266,  264,
+      263,  262,  260,  258,  256,  255,  254,  253,  252,  251,
+
+      250,  249,  248,  246,  245,  244,  243,  242,  241,  240,
+      239,  238,  237,  235,  234,  233,  230,  226,  224,  223,
+      222,  221,  220,  218,  217,  215,  213,  212,  210,  209,
+      208,  207,  206,  205,  204,  203,  202,  201,  198,  197,
+      196,  195,  194,  193,  192,  191,  190,  189,  187,  186,
+      185,  183,  181,  179,  178,  177,  176,  175,  174,  173,
+      172,  171,  169,  168,  167,  166,  165,  164,  163,  162,
+      159,  158,  157,  155,  154,  153,  150,  149,  148,  147,
+      146,  144,  143,  142,  141,  140,  139,  138,  137,  136,
+      135,  134,  131,  130,  128,  122,  120,  119,  117,  116,
+
+      115,  114,  112,  111,  108,  105,  104,  103,  100,   99,
+       98,   96,   92,   91,   90,   89,   88,   87,   86,   85,
+       84,   83,   81,   80,   77,   75,   74,   73,   71,   65,
+       63,   58,   56,   49,   44,   41,   30,   26,   24,   23,
+        9,    8,    7,    6,    5,    4,    3,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398,  398,  398,
+      398,  398,  398,  398,  398,  398,  398,  398
+
+    } ;
+
+static yy_state_type yy_last_accepting_state;
+static char *yy_last_accepting_cpos;
+
+static int yy_flex_debug;
+static int yy_flex_debug = 0;
+
+/* The intent behind this definition is that it'll catch
+ * any uses of REJECT which flex missed.
+ */
+#define REJECT reject_used_but_not_detected
+#define yymore() yymore_used_but_not_detected
+#define YY_MORE_ADJ 0
+#define YY_RESTORE_YY_MORE_OFFSET
+static char *yytext;
+#line 1 "pars0lex.l"
+/**************************************************//**
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+(c) 1997 Innobase Oy
+
+Created 12/14/1997 Heikki Tuuri
+Published under the GPL version 2
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+*******************************************************/
+#define YY_NO_INPUT 1
+#define YY_NO_UNISTD_H 1
+#line 38 "pars0lex.l"
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+/* From here on, calls to malloc/free/realloc in this file (including the
+scanner skeleton below) expand to ut_malloc/ut_free/ut_realloc. */
+#define malloc(A)	ut_malloc(A)
+#define free(A)		ut_free(A)
+#define realloc(P, A)	ut_realloc(P, A)
+/* exit(A) expands to ut_error; the status argument A is discarded. */
+#define exit(A) 	ut_error
+
+#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+
+/* String buffer for removing quotes */
+static ulint	stringbuf_len_alloc = 0; /* Allocated length */
+static ulint	stringbuf_len = 0; /* Current length */
+static char*	stringbuf; /* Start of buffer */
+/** Appends len bytes starting at str to the quote-stripping string buffer.
+The buffer is created with a capacity of one byte on first use and its
+capacity is doubled until the appended data fits.  No terminating NUL is
+written; the current length is tracked in stringbuf_len. */
+static
+void
+string_append(
+/*==========*/
+	const char*	str,	/*!< in: string to be appended */
+	ulint		len)	/*!< in: length of the string */
+{
+	ulint	new_len;
+
+	/* Lazily create the buffer the first time anything is appended. */
+	if (!stringbuf) {
+		stringbuf = malloc(1);
+		stringbuf_len_alloc = 1;
+	}
+
+	new_len = stringbuf_len + len;
+
+	/* Grow by doubling, then reallocate once to the final capacity. */
+	if (new_len > stringbuf_len_alloc) {
+		do {
+			stringbuf_len_alloc <<= 1;
+		} while (new_len > stringbuf_len_alloc);
+
+		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+	}
+
+	memcpy(stringbuf + stringbuf_len, str, len);
+	stringbuf_len = new_len;
+}
+
+
+
+
+#line 759 "lexyy.c"
+
+#define INITIAL 0
+#define comment 1
+#define quoted 2
+#define id 3
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap (void );
+#else
+extern int yywrap (void );
+#endif
+#endif
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * );
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (void );
+#else
+static int input (void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#endif
+
+/* Gets input and stuffs it into "buf".  The number of characters read, or
+ * YY_NULL, is returned in "result".
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+		{ \
+		int c = '*'; \
+		size_t n; \
+		for ( n = 0; n < max_size && \
+			     (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+			buf[n] = (char) c; \
+		if ( c == '\n' ) \
+			buf[n++] = (char) c; \
+		if ( c == EOF && ferror( yyin ) ) \
+			YY_FATAL_ERROR( "input in flex scanner failed" ); \
+		result = n; \
+		} \
+	else \
+		{ \
+		errno=0; \
+		while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+			{ \
+			if( errno != EINTR) \
+				{ \
+				YY_FATAL_ERROR( "input in flex scanner failed" ); \
+				break; \
+				} \
+			errno=0; \
+			clearerr(yyin); \
+			} \
+		}\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+UNIV_INTERN int yylex (void);
+
+#define YY_DECL UNIV_INTERN int yylex (void)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+	YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */
+YY_DECL
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp, *yy_bp;
+	register int yy_act;
+    
+#line 92 "pars0lex.l"
+
+
+#line 914 "lexyy.c"
+
+	if ( (yy_init) )
+		{
+		(yy_init) = 0;
+
+#ifdef YY_USER_INIT
+		YY_USER_INIT;
+#endif
+
+		if ( ! (yy_start) )
+			(yy_start) = 1;	/* first start state */
+
+		if ( ! yyin )
+			yyin = stdin;
+
+		if ( ! yyout )
+			yyout = stdout;
+
+		if ( ! YY_CURRENT_BUFFER ) {
+			yyensure_buffer_stack ();
+			YY_CURRENT_BUFFER_LVALUE =
+				yy_create_buffer(yyin,YY_BUF_SIZE );
+		}
+
+		yy_load_buffer_state( );
+		}
+
+	while ( 1 )		/* loops until end-of-file is reached */
+		{
+		yy_cp = (yy_c_buf_p);
+
+		/* Support of yytext. */
+		*yy_cp = (yy_hold_char);
+
+		/* yy_bp points to the position in yy_ch_buf of the start of
+		 * the current run.
+		 */
+		yy_bp = yy_cp;
+
+		yy_current_state = (yy_start);
+yy_match:
+		do
+			{
+			register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)];
+			if ( yy_accept[yy_current_state] )
+				{
+				(yy_last_accepting_state) = yy_current_state;
+				(yy_last_accepting_cpos) = yy_cp;
+				}
+			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+				{
+				yy_current_state = (int) yy_def[yy_current_state];
+				if ( yy_current_state >= 399 )
+					yy_c = yy_meta[(unsigned int) yy_c];
+				}
+			yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+			++yy_cp;
+			}
+		while ( yy_current_state != 398 );
+		yy_cp = (yy_last_accepting_cpos);
+		yy_current_state = (yy_last_accepting_state);
+
+yy_find_action:
+		yy_act = yy_accept[yy_current_state];
+
+		YY_DO_BEFORE_ACTION;
+
+do_action:	/* This label is used only to access EOF actions. */
+
+		switch ( yy_act )
+	{ /* beginning of action switch */
+			case 0: /* must back up */
+			/* undo the effects of YY_DO_BEFORE_ACTION */
+			*yy_cp = (yy_hold_char);
+			yy_cp = (yy_last_accepting_cpos);
+			yy_current_state = (yy_last_accepting_state);
+			goto yy_find_action;
+
+case 1:
+YY_RULE_SETUP
+#line 94 "pars0lex.l"
+{
+			yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+								atoi(yytext));
+			return(PARS_INT_LIT);
+}
+	YY_BREAK
+case 2:
+YY_RULE_SETUP
+#line 100 "pars0lex.l"
+{
+			ut_error;	/* not implemented */
+
+			return(PARS_FLOAT_LIT);
+}
+	YY_BREAK
+case 3:
+YY_RULE_SETUP
+#line 106 "pars0lex.l"
+{
+			ulint	type;
+
+			yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+				yytext + 1, &type);
+
+			return((int) type);
+}
+	YY_BREAK
+case 4:
+YY_RULE_SETUP
+#line 115 "pars0lex.l"
+{
+			yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+				yytext + 1);
+
+			return(PARS_ID_TOKEN);
+}
+	YY_BREAK
+case 5:
+YY_RULE_SETUP
+#line 122 "pars0lex.l"
+{
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'.  This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+			BEGIN(quoted);
+			stringbuf_len = 0;
+}
+	YY_BREAK
+case 6:
+/* rule 6 can match eol */
+YY_RULE_SETUP
+#line 131 "pars0lex.l"
+{
+			/* Got a sequence of characters other than "'":
+			append to string buffer */
+			string_append(yytext, yyleng);
+}
+	YY_BREAK
+case 7:
+YY_RULE_SETUP
+#line 136 "pars0lex.l"
+{
+			/* Got a sequence of "'" characters:
+			append half of them to string buffer,
+			as "''" represents a single "'".
+			We apply truncating division,
+			so that "'''" will result in "'". */
+
+			string_append(yytext, yyleng / 2);
+
+			/* If we got an odd number of quotes, then the
+			last quote we got is the terminating quote.
+			At the end of the string, we return to the
+			initial start state and report the scanned
+			string literal. */
+
+			if (yyleng % 2) {
+				BEGIN(INITIAL);
+				yylval = sym_tab_add_str_lit(
+					pars_sym_tab_global,
+					(byte*) stringbuf, stringbuf_len);
+				return(PARS_STR_LIT);
+			}
+}
+	YY_BREAK
+case 8:
+YY_RULE_SETUP
+#line 160 "pars0lex.l"
+{
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+			BEGIN(id);
+			stringbuf_len = 0;
+}
+	YY_BREAK
+case 9:
+/* rule 9 can match eol */
+YY_RULE_SETUP
+#line 169 "pars0lex.l"
+{
+			/* Got a sequence of characters other than '"':
+			append to string buffer */
+			string_append(yytext, yyleng);
+}
+	YY_BREAK
+case 10:
+YY_RULE_SETUP
+#line 174 "pars0lex.l"
+{
+			/* Got a sequence of '"' characters:
+			append half of them to string buffer,
+			as '""' represents a single '"'.
+			We apply truncating division,
+			so that '"""' will result in '"'. */
+
+			string_append(yytext, yyleng / 2);
+
+			/* If we got an odd number of quotes, then the
+			last quote we got is the terminating quote.
+			At the end of the string, we return to the
+			initial start state and report the scanned
+			identifier. */
+
+			if (yyleng % 2) {
+				BEGIN(INITIAL);
+				yylval = sym_tab_add_id(
+					pars_sym_tab_global,
+					(byte*) stringbuf, stringbuf_len);
+
+				return(PARS_ID_TOKEN);
+			}
+}
+	YY_BREAK
+case 11:
+YY_RULE_SETUP
+#line 199 "pars0lex.l"
+{
+			yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+			return(PARS_NULL_LIT);
+}
+	YY_BREAK
+case 12:
+YY_RULE_SETUP
+#line 205 "pars0lex.l"
+{
+			/* Implicit cursor name */
+			yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+							(byte*) yytext, yyleng);
+			return(PARS_SQL_TOKEN);
+}
+	YY_BREAK
+case 13:
+YY_RULE_SETUP
+#line 212 "pars0lex.l"
+{
+			return(PARS_AND_TOKEN);
+}
+	YY_BREAK
+case 14:
+YY_RULE_SETUP
+#line 216 "pars0lex.l"
+{
+			return(PARS_OR_TOKEN);
+}
+	YY_BREAK
+case 15:
+YY_RULE_SETUP
+#line 220 "pars0lex.l"
+{
+			return(PARS_NOT_TOKEN);
+}
+	YY_BREAK
+case 16:
+YY_RULE_SETUP
+#line 224 "pars0lex.l"
+{
+			return(PARS_PROCEDURE_TOKEN);
+}
+	YY_BREAK
+case 17:
+YY_RULE_SETUP
+#line 228 "pars0lex.l"
+{
+			return(PARS_IN_TOKEN);
+}
+	YY_BREAK
+case 18:
+YY_RULE_SETUP
+#line 232 "pars0lex.l"
+{
+			return(PARS_OUT_TOKEN);
+}
+	YY_BREAK
+case 19:
+YY_RULE_SETUP
+#line 236 "pars0lex.l"
+{
+	 		return(PARS_BINARY_TOKEN);
+}
+	YY_BREAK
+case 20:
+YY_RULE_SETUP
+#line 240 "pars0lex.l"
+{
+	 		return(PARS_BLOB_TOKEN);
+}
+	YY_BREAK
+case 21:
+YY_RULE_SETUP
+#line 244 "pars0lex.l"
+{
+	 		return(PARS_INT_TOKEN);
+}
+	YY_BREAK
+case 22:
+YY_RULE_SETUP
+#line 248 "pars0lex.l"
+{
+	 		return(PARS_INT_TOKEN);
+}
+	YY_BREAK
+case 23:
+YY_RULE_SETUP
+#line 252 "pars0lex.l"
+{
+	 		return(PARS_FLOAT_TOKEN);
+}
+	YY_BREAK
+case 24:
+YY_RULE_SETUP
+#line 256 "pars0lex.l"
+{
+	 		return(PARS_CHAR_TOKEN);
+}
+	YY_BREAK
+case 25:
+YY_RULE_SETUP
+#line 260 "pars0lex.l"
+{
+			return(PARS_IS_TOKEN);
+}
+	YY_BREAK
+case 26:
+YY_RULE_SETUP
+#line 264 "pars0lex.l"
+{
+			return(PARS_BEGIN_TOKEN);
+}
+	YY_BREAK
+case 27:
+YY_RULE_SETUP
+#line 268 "pars0lex.l"
+{
+			return(PARS_END_TOKEN);
+}
+	YY_BREAK
+case 28:
+YY_RULE_SETUP
+#line 272 "pars0lex.l"
+{
+			return(PARS_IF_TOKEN);
+}
+	YY_BREAK
+case 29:
+YY_RULE_SETUP
+#line 276 "pars0lex.l"
+{
+			return(PARS_THEN_TOKEN);
+}
+	YY_BREAK
+case 30:
+YY_RULE_SETUP
+#line 280 "pars0lex.l"
+{
+			return(PARS_ELSE_TOKEN);
+}
+	YY_BREAK
+case 31:
+YY_RULE_SETUP
+#line 284 "pars0lex.l"
+{
+			return(PARS_ELSIF_TOKEN);
+}
+	YY_BREAK
+case 32:
+YY_RULE_SETUP
+#line 288 "pars0lex.l"
+{
+			return(PARS_LOOP_TOKEN);
+}
+	YY_BREAK
+case 33:
+YY_RULE_SETUP
+#line 292 "pars0lex.l"
+{
+			return(PARS_WHILE_TOKEN);
+}
+	YY_BREAK
+case 34:
+YY_RULE_SETUP
+#line 296 "pars0lex.l"
+{
+			return(PARS_RETURN_TOKEN);
+}
+	YY_BREAK
+case 35:
+YY_RULE_SETUP
+#line 300 "pars0lex.l"
+{
+			return(PARS_SELECT_TOKEN);
+}
+	YY_BREAK
+case 36:
+YY_RULE_SETUP
+#line 304 "pars0lex.l"
+{
+			return(PARS_SUM_TOKEN);
+}
+	YY_BREAK
+case 37:
+YY_RULE_SETUP
+#line 308 "pars0lex.l"
+{
+			return(PARS_COUNT_TOKEN);
+}
+	YY_BREAK
+case 38:
+YY_RULE_SETUP
+#line 312 "pars0lex.l"
+{
+			return(PARS_DISTINCT_TOKEN);
+}
+	YY_BREAK
+case 39:
+YY_RULE_SETUP
+#line 316 "pars0lex.l"
+{
+			return(PARS_FROM_TOKEN);
+}
+	YY_BREAK
+case 40:
+YY_RULE_SETUP
+#line 320 "pars0lex.l"
+{
+			return(PARS_WHERE_TOKEN);
+}
+	YY_BREAK
+case 41:
+YY_RULE_SETUP
+#line 324 "pars0lex.l"
+{
+			return(PARS_FOR_TOKEN);
+}
+	YY_BREAK
+case 42:
+YY_RULE_SETUP
+#line 328 "pars0lex.l"
+{
+			return(PARS_READ_TOKEN);
+}
+	YY_BREAK
+case 43:
+YY_RULE_SETUP
+#line 332 "pars0lex.l"
+{
+			return(PARS_ORDER_TOKEN);
+}
+	YY_BREAK
+case 44:
+YY_RULE_SETUP
+#line 336 "pars0lex.l"
+{
+			return(PARS_BY_TOKEN);
+}
+	YY_BREAK
+case 45:
+YY_RULE_SETUP
+#line 340 "pars0lex.l"
+{
+			return(PARS_ASC_TOKEN);
+}
+	YY_BREAK
+case 46:
+YY_RULE_SETUP
+#line 344 "pars0lex.l"
+{
+			return(PARS_DESC_TOKEN);
+}
+	YY_BREAK
+case 47:
+YY_RULE_SETUP
+#line 348 "pars0lex.l"
+{
+			return(PARS_INSERT_TOKEN);
+}
+	YY_BREAK
+case 48:
+YY_RULE_SETUP
+#line 352 "pars0lex.l"
+{
+			return(PARS_INTO_TOKEN);
+}
+	YY_BREAK
+case 49:
+YY_RULE_SETUP
+#line 356 "pars0lex.l"
+{
+			return(PARS_VALUES_TOKEN);
+}
+	YY_BREAK
+case 50:
+YY_RULE_SETUP
+#line 360 "pars0lex.l"
+{
+			return(PARS_UPDATE_TOKEN);
+}
+	YY_BREAK
+case 51:
+YY_RULE_SETUP
+#line 364 "pars0lex.l"
+{
+			return(PARS_SET_TOKEN);
+}
+	YY_BREAK
+case 52:
+YY_RULE_SETUP
+#line 368 "pars0lex.l"
+{
+			return(PARS_DELETE_TOKEN);
+}
+	YY_BREAK
+case 53:
+YY_RULE_SETUP
+#line 372 "pars0lex.l"
+{
+			return(PARS_CURRENT_TOKEN);
+}
+	YY_BREAK
+case 54:
+YY_RULE_SETUP
+#line 376 "pars0lex.l"
+{
+			return(PARS_OF_TOKEN);
+}
+	YY_BREAK
+case 55:
+YY_RULE_SETUP
+#line 380 "pars0lex.l"
+{
+			return(PARS_CREATE_TOKEN);
+}
+	YY_BREAK
+case 56:
+YY_RULE_SETUP
+#line 384 "pars0lex.l"
+{
+			return(PARS_TABLE_TOKEN);
+}
+	YY_BREAK
+case 57:
+YY_RULE_SETUP
+#line 388 "pars0lex.l"
+{
+	 		return(PARS_INDEX_TOKEN);
+}
+	YY_BREAK
+case 58:
+YY_RULE_SETUP
+#line 392 "pars0lex.l"
+{
+	 		return(PARS_UNIQUE_TOKEN);
+}
+	YY_BREAK
+case 59:
+YY_RULE_SETUP
+#line 396 "pars0lex.l"
+{
+	 		return(PARS_CLUSTERED_TOKEN);
+}
+	YY_BREAK
+case 60:
+YY_RULE_SETUP
+#line 400 "pars0lex.l"
+{
+			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+	YY_BREAK
+case 61:
+YY_RULE_SETUP
+#line 404 "pars0lex.l"
+{
+	 		return(PARS_ON_TOKEN);
+}
+	YY_BREAK
+case 62:
+YY_RULE_SETUP
+#line 408 "pars0lex.l"
+{
+			return(PARS_DECLARE_TOKEN);
+}
+	YY_BREAK
+case 63:
+YY_RULE_SETUP
+#line 412 "pars0lex.l"
+{
+			return(PARS_CURSOR_TOKEN);
+}
+	YY_BREAK
+case 64:
+YY_RULE_SETUP
+#line 416 "pars0lex.l"
+{
+			return(PARS_OPEN_TOKEN);
+}
+	YY_BREAK
+case 65:
+YY_RULE_SETUP
+#line 420 "pars0lex.l"
+{
+			return(PARS_FETCH_TOKEN);
+}
+	YY_BREAK
+case 66:
+YY_RULE_SETUP
+#line 424 "pars0lex.l"
+{
+			return(PARS_CLOSE_TOKEN);
+}
+	YY_BREAK
+case 67:
+YY_RULE_SETUP
+#line 428 "pars0lex.l"
+{
+			return(PARS_NOTFOUND_TOKEN);
+}
+	YY_BREAK
+case 68:
+YY_RULE_SETUP
+#line 432 "pars0lex.l"
+{
+			return(PARS_TO_CHAR_TOKEN);
+}
+	YY_BREAK
+case 69:
+YY_RULE_SETUP
+#line 436 "pars0lex.l"
+{
+			return(PARS_TO_NUMBER_TOKEN);
+}
+	YY_BREAK
+case 70:
+YY_RULE_SETUP
+#line 440 "pars0lex.l"
+{
+			return(PARS_TO_BINARY_TOKEN);
+}
+	YY_BREAK
+case 71:
+YY_RULE_SETUP
+#line 444 "pars0lex.l"
+{
+			return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+	YY_BREAK
+case 72:
+YY_RULE_SETUP
+#line 448 "pars0lex.l"
+{
+			return(PARS_SUBSTR_TOKEN);
+}
+	YY_BREAK
+case 73:
+YY_RULE_SETUP
+#line 452 "pars0lex.l"
+{
+			return(PARS_REPLSTR_TOKEN);
+}
+	YY_BREAK
+case 74:
+YY_RULE_SETUP
+#line 456 "pars0lex.l"
+{
+			return(PARS_CONCAT_TOKEN);
+}
+	YY_BREAK
+case 75:
+YY_RULE_SETUP
+#line 460 "pars0lex.l"
+{
+			return(PARS_INSTR_TOKEN);
+}
+	YY_BREAK
+case 76:
+YY_RULE_SETUP
+#line 464 "pars0lex.l"
+{
+			return(PARS_LENGTH_TOKEN);
+}
+	YY_BREAK
+case 77:
+YY_RULE_SETUP
+#line 468 "pars0lex.l"
+{
+			return(PARS_SYSDATE_TOKEN);
+}
+	YY_BREAK
+case 78:
+YY_RULE_SETUP
+#line 472 "pars0lex.l"
+{
+			return(PARS_PRINTF_TOKEN);
+}
+	YY_BREAK
+case 79:
+YY_RULE_SETUP
+#line 476 "pars0lex.l"
+{
+			return(PARS_ASSERT_TOKEN);
+}
+	YY_BREAK
+case 80:
+YY_RULE_SETUP
+#line 480 "pars0lex.l"
+{
+			return(PARS_RND_TOKEN);
+}
+	YY_BREAK
+case 81:
+YY_RULE_SETUP
+#line 484 "pars0lex.l"
+{
+			return(PARS_RND_STR_TOKEN);
+}
+	YY_BREAK
+case 82:
+YY_RULE_SETUP
+#line 488 "pars0lex.l"
+{
+			return(PARS_ROW_PRINTF_TOKEN);
+}
+	YY_BREAK
+case 83:
+YY_RULE_SETUP
+#line 492 "pars0lex.l"
+{
+			return(PARS_COMMIT_TOKEN);
+}
+	YY_BREAK
+case 84:
+YY_RULE_SETUP
+#line 496 "pars0lex.l"
+{
+			return(PARS_ROLLBACK_TOKEN);
+}
+	YY_BREAK
+case 85:
+YY_RULE_SETUP
+#line 500 "pars0lex.l"
+{
+			return(PARS_WORK_TOKEN);
+}
+	YY_BREAK
+case 86:
+YY_RULE_SETUP
+#line 504 "pars0lex.l"
+{
+			return(PARS_UNSIGNED_TOKEN);
+}
+	YY_BREAK
+case 87:
+YY_RULE_SETUP
+#line 508 "pars0lex.l"
+{
+			return(PARS_EXIT_TOKEN);
+}
+	YY_BREAK
+case 88:
+YY_RULE_SETUP
+#line 512 "pars0lex.l"
+{
+			return(PARS_FUNCTION_TOKEN);
+}
+	YY_BREAK
+case 89:
+YY_RULE_SETUP
+#line 516 "pars0lex.l"
+{
+			return(PARS_LOCK_TOKEN);
+}
+	YY_BREAK
+case 90:
+YY_RULE_SETUP
+#line 520 "pars0lex.l"
+{
+			return(PARS_SHARE_TOKEN);
+}
+	YY_BREAK
+case 91:
+YY_RULE_SETUP
+#line 524 "pars0lex.l"
+{
+			return(PARS_MODE_TOKEN);
+}
+	YY_BREAK
+case 92:
+YY_RULE_SETUP
+#line 528 "pars0lex.l"
+{
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*)yytext,
+							ut_strlen(yytext));
+			return(PARS_ID_TOKEN);
+}
+	YY_BREAK
+case 93:
+YY_RULE_SETUP
+#line 535 "pars0lex.l"
+{
+			return(PARS_DDOT_TOKEN);
+}
+	YY_BREAK
+case 94:
+YY_RULE_SETUP
+#line 539 "pars0lex.l"
+{
+			return(PARS_ASSIGN_TOKEN);
+}
+	YY_BREAK
+case 95:
+YY_RULE_SETUP
+#line 543 "pars0lex.l"
+{
+			return(PARS_LE_TOKEN);
+}
+	YY_BREAK
+case 96:
+YY_RULE_SETUP
+#line 547 "pars0lex.l"
+{
+			return(PARS_GE_TOKEN);
+}
+	YY_BREAK
+case 97:
+YY_RULE_SETUP
+#line 551 "pars0lex.l"
+{
+			return(PARS_NE_TOKEN);
+}
+	YY_BREAK
+case 98:
+YY_RULE_SETUP
+#line 555 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 99:
+YY_RULE_SETUP
+#line 560 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 100:
+YY_RULE_SETUP
+#line 565 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 101:
+YY_RULE_SETUP
+#line 570 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 102:
+YY_RULE_SETUP
+#line 575 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 103:
+YY_RULE_SETUP
+#line 580 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 104:
+YY_RULE_SETUP
+#line 585 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 105:
+YY_RULE_SETUP
+#line 590 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 106:
+YY_RULE_SETUP
+#line 595 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 107:
+YY_RULE_SETUP
+#line 600 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 108:
+YY_RULE_SETUP
+#line 605 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 109:
+YY_RULE_SETUP
+#line 610 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 110:
+YY_RULE_SETUP
+#line 615 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 111:
+YY_RULE_SETUP
+#line 620 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 112:
+YY_RULE_SETUP
+#line 625 "pars0lex.l"
+{
+
+			return((int)(*yytext));
+}
+	YY_BREAK
+case 113:
+YY_RULE_SETUP
+#line 630 "pars0lex.l"
+BEGIN(comment); /* eat up comment */
+	YY_BREAK
+case 114:
+/* rule 114 can match eol */
+YY_RULE_SETUP
+#line 632 "pars0lex.l"
+
+	YY_BREAK
+case 115:
+/* rule 115 can match eol */
+YY_RULE_SETUP
+#line 633 "pars0lex.l"
+
+	YY_BREAK
+case 116:
+YY_RULE_SETUP
+#line 634 "pars0lex.l"
+BEGIN(INITIAL);
+	YY_BREAK
+case 117:
+/* rule 117 can match eol */
+YY_RULE_SETUP
+#line 636 "pars0lex.l"
+/* eat up whitespace */
+	YY_BREAK
+case 118:
+YY_RULE_SETUP
+#line 639 "pars0lex.l"
+{
+			fprintf(stderr,"Unrecognized character: %02x\n",
+				*yytext);
+
+			ut_error;
+
+			return(0);
+}
+	YY_BREAK
+case 119:
+YY_RULE_SETUP
+#line 648 "pars0lex.l"
+YY_FATAL_ERROR( "flex scanner jammed" );
+	YY_BREAK
+#line 1916 "lexyy.c"
+case YY_STATE_EOF(INITIAL):
+case YY_STATE_EOF(comment):
+case YY_STATE_EOF(quoted):
+case YY_STATE_EOF(id):
+	yyterminate();
+
+	case YY_END_OF_BUFFER:
+		{
+		/* Amount of text matched not including the EOB char. */
+		int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;
+
+		/* Undo the effects of YY_DO_BEFORE_ACTION. */
+		*yy_cp = (yy_hold_char);
+		YY_RESTORE_YY_MORE_OFFSET
+
+		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
+			{
+			/* We're scanning a new file or input source.  It's
+			 * possible that this happened because the user
+			 * just pointed yyin at a new source and called
+			 * yylex().  If so, then we have to assure
+			 * consistency between YY_CURRENT_BUFFER and our
+			 * globals.  Here is the right place to do so, because
+			 * this is the first action (other than possibly a
+			 * back-up) that will match for the new input source.
+			 */
+			(yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
+			}
+
+		/* Note that here we test for yy_c_buf_p "<=" to the position
+		 * of the first EOB in the buffer, since yy_c_buf_p will
+		 * already have been incremented past the NUL character
+		 * (since all states make transitions on EOB to the
+		 * end-of-buffer state).  Contrast this with the test
+		 * in input().
+		 */
+		if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+			{ /* This was really a NUL. */
+			yy_state_type yy_next_state;
+
+			(yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;
+
+			yy_current_state = yy_get_previous_state(  );
+
+			/* Okay, we're now positioned to make the NUL
+			 * transition.  We couldn't have
+			 * yy_get_previous_state() go ahead and do it
+			 * for us because it doesn't know how to deal
+			 * with the possibility of jamming (and we don't
+			 * want to build jamming into it because then it
+			 * will run more slowly).
+			 */
+
+			yy_next_state = yy_try_NUL_trans( yy_current_state );
+
+			yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+
+			if ( yy_next_state )
+				{
+				/* Consume the NUL. */
+				yy_cp = ++(yy_c_buf_p);
+				yy_current_state = yy_next_state;
+				goto yy_match;
+				}
+
+			else
+				{
+				yy_cp = (yy_last_accepting_cpos);
+				yy_current_state = (yy_last_accepting_state);
+				goto yy_find_action;
+				}
+			}
+
+		else switch ( yy_get_next_buffer(  ) )
+			{
+			case EOB_ACT_END_OF_FILE:
+				{
+				(yy_did_buffer_switch_on_eof) = 0;
+
+				if ( yywrap( ) )
+					{
+					/* Note: because we've taken care in
+					 * yy_get_next_buffer() to have set up
+					 * yytext, we can now set up
+					 * yy_c_buf_p so that if some total
+					 * hoser (like flex itself) wants to
+					 * call the scanner after we return the
+					 * YY_NULL, it'll still work - another
+					 * YY_NULL will get returned.
+					 */
+					(yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;
+
+					yy_act = YY_STATE_EOF(YY_START);
+					goto do_action;
+					}
+
+				else
+					{
+					if ( ! (yy_did_buffer_switch_on_eof) )
+						YY_NEW_FILE;
+					}
+				break;
+				}
+
+			case EOB_ACT_CONTINUE_SCAN:
+				(yy_c_buf_p) =
+					(yytext_ptr) + yy_amount_of_matched_text;
+
+				yy_current_state = yy_get_previous_state(  );
+
+				yy_cp = (yy_c_buf_p);
+				yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+				goto yy_match;
+
+			case EOB_ACT_LAST_MATCH:
+				(yy_c_buf_p) =
+				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];
+
+				yy_current_state = yy_get_previous_state(  );
+
+				yy_cp = (yy_c_buf_p);
+				yy_bp = (yytext_ptr) + YY_MORE_ADJ;
+				goto yy_find_action;
+			}
+		break;
+		}
+
+	default:
+		YY_FATAL_ERROR(
+			"fatal flex scanner internal error--no action found" );
+	} /* end of action switch */
+		} /* end of scanning one token */
+} /* end of yylex */
+
+/* yy_get_next_buffer - try to read in a new buffer
+ *
+ * Returns a code representing an action:
+ *	EOB_ACT_LAST_MATCH - no new input; first process the text matched before the EOB
+ *	EOB_ACT_CONTINUE_SCAN - continue scanning from current position
+ *	EOB_ACT_END_OF_FILE - end of file
+ */
+static int yy_get_next_buffer (void)
+{
+    	register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf;
+	register char *source = (yytext_ptr);
+	register int number_to_move, i;
+	int ret_val;
+
+	if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] )
+		YY_FATAL_ERROR(
+		"fatal flex scanner internal error--end of buffer missed" );
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 )
+		{ /* Don't try to fill the buffer, so this is an EOF. */
+		if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 )
+			{
+			/* We matched a single character, the EOB, so
+			 * treat this as a final EOF.
+			 */
+			return EOB_ACT_END_OF_FILE;
+			}
+
+		else
+			{
+			/* We matched some text prior to the EOB, first
+			 * process it.
+			 */
+			return EOB_ACT_LAST_MATCH;
+			}
+		}
+
+	/* Try to read more data. */
+
+	/* First move last chars to start of buffer. */
+	number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1;
+
+	for ( i = 0; i < number_to_move; ++i )
+		*(dest++) = *(source++);
+
+	if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING )
+		/* don't do the read, it's not guaranteed to return an EOF,
+		 * just force an EOF
+		 */
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0;
+
+	else
+		{
+			size_t num_to_read =
+			YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1;
+
+		while ( num_to_read <= 0 )
+			{ /* Not enough room in the buffer - grow it. */
+			/* num_to_read is a size_t, so the test above holds only when it is 0. */
+			/* just a shorter name for the current buffer */
+			YY_BUFFER_STATE b = YY_CURRENT_BUFFER;
+
+			int yy_c_buf_p_offset =
+				(int) ((yy_c_buf_p) - b->yy_ch_buf);
+
+			if ( b->yy_is_our_buffer )
+				{
+				int new_size = b->yy_buf_size * 2;
+
+				if ( new_size <= 0 )
+					b->yy_buf_size += b->yy_buf_size / 8;
+				else
+					b->yy_buf_size *= 2;
+
+				b->yy_ch_buf = (char *)
+					/* Include room in for 2 EOB chars. */
+					yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2  );
+				}
+			else
+				/* Can't grow it, we don't own it. */
+				b->yy_ch_buf = 0;
+
+			if ( ! b->yy_ch_buf )
+				YY_FATAL_ERROR(
+				"fatal error - scanner input buffer overflow" );
+
+			(yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset];
+
+			num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size -
+						number_to_move - 1;
+
+			}
+
+		if ( num_to_read > YY_READ_BUF_SIZE )
+			num_to_read = YY_READ_BUF_SIZE;
+
+		/* Read in more data. */
+		YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]),
+			(yy_n_chars), num_to_read );
+
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+		}
+
+	if ( (yy_n_chars) == 0 )
+		{
+		if ( number_to_move == YY_MORE_ADJ )
+			{
+			ret_val = EOB_ACT_END_OF_FILE;
+			yyrestart(yyin  );
+			}
+
+		else
+			{
+			ret_val = EOB_ACT_LAST_MATCH;
+			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status =
+				YY_BUFFER_EOF_PENDING;
+			}
+		}
+
+	else
+		ret_val = EOB_ACT_CONTINUE_SCAN;
+
+	(yy_n_chars) += number_to_move;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR;
+	YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR;
+
+	(yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0];
+
+	return ret_val;
+}
+
+/* yy_get_previous_state - get the state just before the EOB char was reached.
+ * Replays the transition tables from yy_start over the text up to yy_c_buf_p. */
+    static yy_state_type yy_get_previous_state (void)
+{
+	register yy_state_type yy_current_state;
+	register char *yy_cp;
+    
+	yy_current_state = (yy_start);
+
+	for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp )
+		{
+		register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); /* a NUL byte uses 1 instead of a yy_ec lookup */
+		if ( yy_accept[yy_current_state] )
+			{
+			(yy_last_accepting_state) = yy_current_state;
+			(yy_last_accepting_cpos) = yy_cp;
+			}
+		while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+			{
+			yy_current_state = (int) yy_def[yy_current_state];
+			if ( yy_current_state >= 399 )
+				yy_c = yy_meta[(unsigned int) yy_c];
+			}
+		yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+		}
+
+	return yy_current_state;
+}
+
+/* yy_try_NUL_trans - try to make a transition on the NUL character
+ * Returns 0 if the transition jams (next state is 398), else the next state.
+ * synopsis
+ *	next_state = yy_try_NUL_trans( current_state );
+ */
+    static yy_state_type yy_try_NUL_trans  (yy_state_type yy_current_state )
+{
+	register int yy_is_jam;
+    	register char *yy_cp = (yy_c_buf_p);
+	/* 1 is the value yy_get_previous_state() also uses for a NUL byte. */
+	register YY_CHAR yy_c = 1;
+	if ( yy_accept[yy_current_state] )
+		{
+		(yy_last_accepting_state) = yy_current_state;
+		(yy_last_accepting_cpos) = yy_cp;
+		}
+	while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
+		{
+		yy_current_state = (int) yy_def[yy_current_state];
+		if ( yy_current_state >= 399 )
+			yy_c = yy_meta[(unsigned int) yy_c];
+		}
+	yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c];
+	yy_is_jam = (yy_current_state == 398);
+
+	return yy_is_jam ? 0 : yy_current_state;
+}
+
+#ifndef YY_NO_INPUT
+#ifdef __cplusplus
+    static int yyinput (void)
+#else
+    static int input  (void)
+#endif
+/* Return the next character of input, or EOF once yywrap() reports there is none. */
+{
+	int c;
+    
+	*(yy_c_buf_p) = (yy_hold_char);
+
+	if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR )
+		{
+		/* yy_c_buf_p now points to the character we want to return.
+		 * If this occurs *before* the EOB characters, then it's a
+		 * valid NUL; if not, then we've hit the end of the buffer.
+		 */
+		if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
+			/* This was really a NUL. */
+			*(yy_c_buf_p) = '\0';
+
+		else
+			{ /* need more input */
+			int offset = (int)((yy_c_buf_p) - (yytext_ptr));
+			++(yy_c_buf_p);
+
+			switch ( yy_get_next_buffer(  ) )
+				{
+				case EOB_ACT_LAST_MATCH:
+					/* This happens because yy_g_n_b()
+					 * sees that we've accumulated a
+					 * token and flags that we need to
+					 * try matching the token before
+					 * proceeding.  But for input(),
+					 * there's no matching to consider.
+					 * So convert the EOB_ACT_LAST_MATCH
+					 * to EOB_ACT_END_OF_FILE.
+					 */
+
+					/* Reset buffer status. */
+					yyrestart(yyin );
+
+					/*FALLTHROUGH*/
+
+				case EOB_ACT_END_OF_FILE:
+					{
+					if ( yywrap( ) )
+						return EOF;
+
+					if ( ! (yy_did_buffer_switch_on_eof) )
+						YY_NEW_FILE;
+#ifdef __cplusplus
+					return yyinput();
+#else
+					return input();
+#endif
+					}
+
+				case EOB_ACT_CONTINUE_SCAN:
+					(yy_c_buf_p) = (yytext_ptr) + offset;
+					break;
+				}
+			}
+		}
+
+	c = *(unsigned char *) (yy_c_buf_p);	/* cast for 8-bit char's */
+	*(yy_c_buf_p) = '\0';	/* preserve yytext */
+	(yy_hold_char) = *++(yy_c_buf_p);
+
+	return c;
+}
+#endif	/* ifndef YY_NO_INPUT */
+
+/** Immediately switch to a different input stream.
+ * @param input_file A readable stream.
+ * If no current buffer exists, one of YY_BUF_SIZE is created before it is re-initialised.
+ * @note This function does not reset the start condition to @c INITIAL .
+ */
+    static void yyrestart  (FILE * input_file )
+{
+    
+	if ( ! YY_CURRENT_BUFFER ){
+        yyensure_buffer_stack ();
+		YY_CURRENT_BUFFER_LVALUE =
+            yy_create_buffer(yyin,YY_BUF_SIZE );
+	}
+
+	yy_init_buffer(YY_CURRENT_BUFFER,input_file );
+	yy_load_buffer_state( );
+}
+
+/** Switch to a different input buffer.
+ * @param new_buffer The new input buffer.
+ * Does nothing when new_buffer is already the current buffer.
+ */
+    __attribute__((unused)) static void yy_switch_to_buffer  (YY_BUFFER_STATE  new_buffer )
+{
+    
+	/* TODO. We should be able to replace this entire function body
+	 * with
+	 *		yypop_buffer_state();
+	 *		yypush_buffer_state(new_buffer);
+     */
+	yyensure_buffer_stack ();
+	if ( YY_CURRENT_BUFFER == new_buffer )
+		return;
+
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*(yy_c_buf_p) = (yy_hold_char);
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+		}
+
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+	yy_load_buffer_state( );
+
+	/* We don't actually know whether we did this switch during
+	 * EOF (yywrap()) processing, but the only time this flag
+	 * is looked at is after yywrap() is called, so it's safe
+	 * to go ahead and always set it.
+	 */
+	(yy_did_buffer_switch_on_eof) = 1;
+}
+
+static void yy_load_buffer_state  (void)
+{	/* Copy the current buffer's saved fields into the scanner's working variables. */
+    	(yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
+	(yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos;
+	yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file;
+	(yy_hold_char) = *(yy_c_buf_p);	/* remember the char at the scan position */
+}
+
+/** Allocate and initialize an input buffer state.
+ * @param file A readable stream.
+ * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE.
+ * Calls YY_FATAL_ERROR if either allocation fails.
+ * @return the allocated buffer state.
+ */
+    static YY_BUFFER_STATE yy_create_buffer  (FILE * file, int  size )
+{
+	YY_BUFFER_STATE b;
+    
+	b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state )  );
+	if ( ! b )
+		YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+	b->yy_buf_size = size;
+
+	/* yy_ch_buf has to be 2 characters longer than the size given because
+	 * we need to put in 2 end-of-buffer characters.
+	 */
+	b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2  );
+	if ( ! b->yy_ch_buf )
+		YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" );
+
+	b->yy_is_our_buffer = 1;	/* lets yy_delete_buffer() free yy_ch_buf */
+
+	yy_init_buffer(b,file );
+
+	return b;
+}
+
+/** Destroy the buffer.
+ * @param b a buffer created with yy_create_buffer()
+ * A NULL b is ignored; yy_ch_buf is freed only when yy_is_our_buffer is set.
+ */
+    static void yy_delete_buffer (YY_BUFFER_STATE  b )
+{
+    
+	if ( ! b )
+		return;
+
+	if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */
+		YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0;
+
+	if ( b->yy_is_our_buffer )
+		yyfree((void *) b->yy_ch_buf  );
+
+	yyfree((void *) b  );
+}
+
+/* Initializes or reinitializes a buffer.
+ * This function is sometimes called more than once on the same buffer,
+ * such as during a yyrestart() or at EOF.
+ */
+    static void yy_init_buffer  (YY_BUFFER_STATE  b, FILE * file )
+
+{
+	int oerrno = errno;	/* restored before returning, so errno is left unchanged */
+    
+	yy_flush_buffer(b );
+
+	b->yy_input_file = file;
+	b->yy_fill_buffer = 1;
+
+    /* If b is the current buffer, then yy_init_buffer was _probably_
+     * called from yyrestart() or through yy_get_next_buffer.
+     * In that case, we don't want to reset the lineno or column.
+     */
+    if (b != YY_CURRENT_BUFFER){
+        b->yy_bs_lineno = 1;
+        b->yy_bs_column = 0;
+    }
+
+        b->yy_is_interactive = 0;
+    
+	errno = oerrno;
+}
+
+/** Discard all buffered characters. On the next scan, YY_INPUT will be called.
+ * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER.
+ * A NULL b is ignored. The scanner state is reloaded when b is the current buffer.
+ */
+    static void yy_flush_buffer (YY_BUFFER_STATE  b )
+{
+    	if ( ! b )
+		return;
+
+	b->yy_n_chars = 0;
+
+	/* We always need two end-of-buffer characters.  The first causes
+	 * a transition to the end-of-buffer state.  The second causes
+	 * a jam in that state.
+	 */
+	b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR;
+	b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR;
+
+	b->yy_buf_pos = &b->yy_ch_buf[0];
+
+	b->yy_at_bol = 1;
+	b->yy_buffer_status = YY_BUFFER_NEW;
+
+	if ( b == YY_CURRENT_BUFFER )
+		yy_load_buffer_state( );
+}
+
+/** Pushes the new state onto the stack. The new state becomes
+ *  the current state. This function will allocate the stack
+ *  if necessary.
+ *  @param new_buffer The new state.
+ *  A NULL new_buffer is ignored.
+ */
+__attribute__((unused)) static void yypush_buffer_state (YY_BUFFER_STATE new_buffer )
+{
+    	if (new_buffer == NULL)
+		return;
+
+	yyensure_buffer_stack();
+
+	/* This block is copied from yy_switch_to_buffer. */
+	if ( YY_CURRENT_BUFFER )
+		{
+		/* Flush out information for old buffer. */
+		*(yy_c_buf_p) = (yy_hold_char);
+		YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p);
+		YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars);
+		}
+
+	/* Only push if top exists. Otherwise, replace top. */
+	if (YY_CURRENT_BUFFER)
+		(yy_buffer_stack_top)++;
+	YY_CURRENT_BUFFER_LVALUE = new_buffer;
+
+	/* copied from yy_switch_to_buffer. */
+	yy_load_buffer_state( );
+	(yy_did_buffer_switch_on_eof) = 1;
+}
+
+/** Removes and deletes the top of the stack, if present.
+ *  The next element becomes the new top.
+ *  Returns immediately, without touching the stack, when there is no current buffer.
+ */
+__attribute__((unused)) static void yypop_buffer_state (void)
+{
+    	if (!YY_CURRENT_BUFFER)
+		return;
+
+	yy_delete_buffer(YY_CURRENT_BUFFER );
+	YY_CURRENT_BUFFER_LVALUE = NULL;
+	if ((yy_buffer_stack_top) > 0)
+		--(yy_buffer_stack_top);
+
+	if (YY_CURRENT_BUFFER) {
+		yy_load_buffer_state( );
+		(yy_did_buffer_switch_on_eof) = 1;
+	}
+}
+
+/* Allocates the stack if it does not exist.
+ *  Guarantees space for at least one push. Calls YY_FATAL_ERROR when the
+ *  stack cannot be allocated or grown. */
+static void yyensure_buffer_stack (void)
+{
+	int num_to_alloc;
+    
+	if (!(yy_buffer_stack)) {
+
+		/* The first allocation holds a single slot, since we don't know if
+		 * this scanner will even need a stack; the growth branch below
+		 * adds room before any later push.
+		 */
+		num_to_alloc = 1;
+		(yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+				(num_to_alloc * sizeof(struct yy_buffer_state*));
+		if ( ! (yy_buffer_stack) )	/* never hand a NULL pointer to memset() */
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+		memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+				
+		(yy_buffer_stack_max) = num_to_alloc;
+		(yy_buffer_stack_top) = 0;
+		return;
+	}
+
+	if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){
+
+		/* Increase the buffer to prepare for a possible push. */
+		int grow_size = 8 /* arbitrary grow size */;
+
+		num_to_alloc = (yy_buffer_stack_max) + grow_size;
+		(yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+				((yy_buffer_stack), num_to_alloc * sizeof(struct yy_buffer_state*));
+		if ( ! (yy_buffer_stack) )	/* growth failed: stop before memset() */
+			YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+		/* zero only the new slots.*/
+		memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
+		(yy_buffer_stack_max) = num_to_alloc;
+	}
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+static void yy_fatal_error (yyconst char* msg )
+{	/* Print msg and a newline to stderr, then exit with YY_EXIT_FAILURE. */
+    	(void) fprintf( stderr, "%s\n", msg );
+	exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code.
+ * yyless(n) cuts the current token to n characters and moves the scan position to yytext + n. */
+#undef yyless
+#define yyless(n) \
+	do \
+		{ \
+		/* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+		yytext[yyleng] = (yy_hold_char); \
+		(yy_c_buf_p) = yytext + yyless_macro_arg; \
+		(yy_hold_char) = *(yy_c_buf_p); \
+		*(yy_c_buf_p) = '\0'; \
+		yyleng = yyless_macro_arg; \
+		} \
+	while ( 0 )
+
+/* Accessor  methods (get/set functions) to struct members. */
+
+/** Get the current line number.
+ * @return the current value of yylineno.
+ */
+__attribute__((unused)) static int yyget_lineno  (void)
+{
+        
+    return yylineno;
+}
+
+/** Get the input stream.
+ * @return the current value of yyin.
+ */
+__attribute__((unused)) static FILE *yyget_in  (void)
+{
+        return yyin;
+}
+
+/** Get the output stream.
+ * @return the current value of yyout.
+ */
+__attribute__((unused)) static FILE *yyget_out  (void)
+{
+        return yyout;
+}
+
+/** Get the length of the current token.
+ * @return the current value of yyleng.
+ */
+__attribute__((unused)) static int yyget_leng  (void)
+{
+        return yyleng;
+}
+
+/** Get the current token.
+ * @return yytext; the pointer is returned as-is, no copy is made.
+ */
+
+__attribute__((unused)) static char *yyget_text  (void)
+{
+        return yytext;
+}
+
+/** Set the current line number.
+ * @param line_number the value to store in yylineno.
+ * 
+ */
+__attribute__((unused)) static void yyset_lineno (int  line_number )
+{
+    
+    yylineno = line_number;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream; it is stored in yyin.
+ * 
+ * @see yy_switch_to_buffer
+ */
+__attribute__((unused)) static void yyset_in (FILE *  in_str )
+{
+        yyin = in_str ;
+}
+
+__attribute__((unused)) static void yyset_out (FILE *  out_str )
+{	/* Set the output stream: out_str is stored in yyout. */
+        yyout = out_str ;
+}
+
+__attribute__((unused)) static int yyget_debug  (void)
+{	/* Return the current value of yy_flex_debug. */
+        return yy_flex_debug;
+}
+
+__attribute__((unused)) static void yyset_debug (int  bdebug )
+{	/* Store bdebug in yy_flex_debug. */
+        yy_flex_debug = bdebug ;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners. Frees the buffer stack; always returns 0. */
+__attribute__((unused)) static int yylex_destroy  (void)
+{
+	/* NOTE(review): yypop_buffer_state() returns early once the current slot is NULL -- confirm deeper stack entries get freed. */
+    /* Pop the buffer stack, destroying each element. */
+	while(YY_CURRENT_BUFFER){
+		yy_delete_buffer(YY_CURRENT_BUFFER  );
+		YY_CURRENT_BUFFER_LVALUE = NULL;
+		yypop_buffer_state();
+	}
+
+	/* Destroy the stack itself. */
+	yyfree((yy_buffer_stack) );
+	(yy_buffer_stack) = NULL;
+
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
+{
+	/* Copy exactly n characters from s2 to s1; no terminator is appended. */
+	while ( n-- > 0 )
+		*s1++ = *s2++;
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * s )
+{
+	/* Count the characters that precede the terminating NUL. */
+	register int len = 0;
+	while ( s[len] )
+		++len;
+	return len;
+}
+#endif
+
+static void *yyalloc (yy_size_t  size )
+{	/* Thin wrapper: forwards size to malloc() and returns its result unchanged. */
+	return (void *) malloc( size );
+}
+
+static void *yyrealloc  (void * ptr, yy_size_t  size )
+{
+	/* Thin wrapper around realloc(); its result is returned unchanged.
+	 * The cast to (char *) in the following accommodates both
+	 * implementations that use char* generic pointers, and those
+	 * that use void* generic pointers.  It works with the latter
+	 * because both ANSI C and C++ allow castless assignment from
+	 * any pointer type to void*, and deal with argument conversions
+	 * as though doing an assignment. */
+	return (void *) realloc( (char *) ptr, size );
+}
+
+static void yyfree (void * ptr )
+{	/* Thin wrapper: hands ptr to free(). */
+	free( (char *) ptr );	/* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#undef YY_NEW_FILE
+#undef YY_FLUSH_BUFFER
+#undef yy_set_bol
+#undef yy_new_buffer
+#undef yy_set_interactive
+#undef yytext_ptr
+#undef YY_DO_BEFORE_ACTION
+
+#ifdef YY_DECL_IS_OURS
+#undef YY_DECL_IS_OURS
+#undef YY_DECL
+#endif
+#line 648 "pars0lex.l"
+
+
+
+
+/**********************************************************************
+Destroys the scanner's buffer stack, frees stringbuf and resets its lengths. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+	if (yy_buffer_stack != NULL)
+		yylex_destroy();
+	if (stringbuf != NULL)
+		free(stringbuf);
+	stringbuf = NULL;
+	stringbuf_len = stringbuf_len_alloc = 0;
+}
diff --git a/storage/xtradb/pars/make_bison.sh b/storage/xtradb/pars/make_bison.sh
new file mode 100755
index 00000000000..09bb86e3106
--- /dev/null
+++ b/storage/xtradb/pars/make_bison.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#
+# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Generate pars0grm.c and ../include/pars0grm.h from pars0grm.y (paths are relative to the current directory).
+
+set -eu
+TMPFILE=pars0grm.tab.c
+OUTFILE=pars0grm.c
+# bison -d also writes the header pars0grm.tab.h, which is moved to ../include.
+bison -d pars0grm.y
+mv pars0grm.tab.h ../include/pars0grm.h
+# Rename the temp file name in the output, make yychar/yynerrs static, tag yylval/yyparse UNIV_INTERN.
+sed -e '
+s/'"$TMPFILE"'/'"$OUTFILE"'/;
+s/^\(\(YYSTYPE\|int\) yy\(char\|nerrs\)\)/static \1/;
+s/\(\(YYSTYPE\|int\) yy\(lval\|parse\)\)/UNIV_INTERN \1/;
+' < "$TMPFILE" > "$OUTFILE"
+# The intermediate bison output is no longer needed.
+rm "$TMPFILE"
diff --git a/storage/xtradb/pars/make_flex.sh b/storage/xtradb/pars/make_flex.sh
new file mode 100755
index 00000000000..89308a6636f
--- /dev/null
+++ b/storage/xtradb/pars/make_flex.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Generate the lexer source lexyy.c from the flex input pars0lex.l.
+
+set -eu
+
+OUTFILE=lexyy.c
+TMPFILE=_flex_tmp.c
+
+flex -o "$TMPFILE" pars0lex.l
+
+# The output has to begin with the "univ.i" include: AIX needs its includes
+# done in a certain order, and putting this one first gets it right.
+printf '%s\n' '#include "univ.i"' > "$OUTFILE"
+
+# Post-process the flex output: add the (int) cast that flex leaves out (Win64
+# warns about it) and mark generated yy* symbols static or UNIV_INTERN.
+sed \
+  -e "s/$TMPFILE/$OUTFILE/" \
+  -e 's/\(int offset = \)\((yy_c_buf_p) - (yytext_ptr)\);/\1(int)(\2);/' \
+  -e 's/\(void yy\(restart\|_\(delete\|flush\)_buffer\)\)/static \1/' \
+  -e 's/\(void yy_switch_to_buffer\)/__attribute__((unused)) static \1/' \
+  -e 's/\(void yy\(push\|pop\)_buffer_state\)/__attribute__((unused)) static \1/' \
+  -e 's/\(YY_BUFFER_STATE yy_create_buffer\)/static \1/' \
+  -e 's/\(\(int\|void\) yy[gs]et_\)/__attribute__((unused)) static \1/' \
+  -e 's/\(void \*\?yy\(\(re\)\?alloc\|free\)\)/static \1/' \
+  -e 's/\(extern \)\?\(int yy\(leng\|lineno\|_flex_debug\)\)/static \2/' \
+  -e 's/\(int yylex_destroy\)/__attribute__((unused)) static \1/' \
+  -e 's/\(extern \)\?\(int yylex \)/UNIV_INTERN \2/' \
+  -e 's/^\(\(FILE\|char\) *\* *yyget\)/__attribute__((unused)) static \1/' \
+  -e 's/^\(extern \)\?\(\(FILE\|char\) *\* *yy\)/static \2/' \
+  < "$TMPFILE" >> "$OUTFILE"
+
+rm "$TMPFILE"
diff --git a/storage/xtradb/pars/pars0grm.c b/storage/xtradb/pars/pars0grm.c
new file mode 100644
index 00000000000..d667970735e
--- /dev/null
+++ b/storage/xtradb/pars/pars0grm.c
@@ -0,0 +1,2601 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 1984, 1989, 1990, 2000, 2001, 2002, 2003, 2004 Free Software
+Foundation, Inc.
+
+As a special exception, when this file is copied by Bison into a
+Bison output file, you may use that output file without restriction.
+This special exception was added by the Free Software Foundation
+in version 1.24 of Bison.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/* A Bison parser, made by GNU Bison 2.0.  */
+
+/* Written by Richard Stallman by simplifying the original so called
+   ``semantic'' parser.  */
+
+/* All symbols defined below should begin with yy or YY, to avoid
+   infringing on user name space.  This should be done even for local
+   variables, as they might otherwise be expanded by user macros.
+   There are some unavoidable exceptions within include files to
+   define necessary library symbols; they are noted "INFRINGES ON
+   USER NAME SPACE" below.  */
+
+/* Identify Bison output.  */
+#define YYBISON 1
+
+/* Skeleton name.  */
+#define YYSKELETON_NAME "yacc.c"
+
+/* Pure parsers.  */
+#define YYPURE 0
+
+/* Using locations.  */
+#define YYLSP_NEEDED 0
+
+
+
+/* Tokens.  */
+#ifndef YYTOKENTYPE
+# define YYTOKENTYPE
+   /* Put the tokens into the symbol table, so that GDB and other debuggers
+      know about them.  */
+   enum yytokentype {
+     PARS_INT_LIT = 258,
+     PARS_FLOAT_LIT = 259,
+     PARS_STR_LIT = 260,
+     PARS_FIXBINARY_LIT = 261,
+     PARS_BLOB_LIT = 262,
+     PARS_NULL_LIT = 263,
+     PARS_ID_TOKEN = 264,
+     PARS_AND_TOKEN = 265,
+     PARS_OR_TOKEN = 266,
+     PARS_NOT_TOKEN = 267,
+     PARS_GE_TOKEN = 268,
+     PARS_LE_TOKEN = 269,
+     PARS_NE_TOKEN = 270,
+     PARS_PROCEDURE_TOKEN = 271,
+     PARS_IN_TOKEN = 272,
+     PARS_OUT_TOKEN = 273,
+     PARS_BINARY_TOKEN = 274,
+     PARS_BLOB_TOKEN = 275,
+     PARS_INT_TOKEN = 276,
+     PARS_INTEGER_TOKEN = 277,
+     PARS_FLOAT_TOKEN = 278,
+     PARS_CHAR_TOKEN = 279,
+     PARS_IS_TOKEN = 280,
+     PARS_BEGIN_TOKEN = 281,
+     PARS_END_TOKEN = 282,
+     PARS_IF_TOKEN = 283,
+     PARS_THEN_TOKEN = 284,
+     PARS_ELSE_TOKEN = 285,
+     PARS_ELSIF_TOKEN = 286,
+     PARS_LOOP_TOKEN = 287,
+     PARS_WHILE_TOKEN = 288,
+     PARS_RETURN_TOKEN = 289,
+     PARS_SELECT_TOKEN = 290,
+     PARS_SUM_TOKEN = 291,
+     PARS_COUNT_TOKEN = 292,
+     PARS_DISTINCT_TOKEN = 293,
+     PARS_FROM_TOKEN = 294,
+     PARS_WHERE_TOKEN = 295,
+     PARS_FOR_TOKEN = 296,
+     PARS_DDOT_TOKEN = 297,
+     PARS_READ_TOKEN = 298,
+     PARS_ORDER_TOKEN = 299,
+     PARS_BY_TOKEN = 300,
+     PARS_ASC_TOKEN = 301,
+     PARS_DESC_TOKEN = 302,
+     PARS_INSERT_TOKEN = 303,
+     PARS_INTO_TOKEN = 304,
+     PARS_VALUES_TOKEN = 305,
+     PARS_UPDATE_TOKEN = 306,
+     PARS_SET_TOKEN = 307,
+     PARS_DELETE_TOKEN = 308,
+     PARS_CURRENT_TOKEN = 309,
+     PARS_OF_TOKEN = 310,
+     PARS_CREATE_TOKEN = 311,
+     PARS_TABLE_TOKEN = 312,
+     PARS_INDEX_TOKEN = 313,
+     PARS_UNIQUE_TOKEN = 314,
+     PARS_CLUSTERED_TOKEN = 315,
+     PARS_DOES_NOT_FIT_IN_MEM_TOKEN = 316,
+     PARS_ON_TOKEN = 317,
+     PARS_ASSIGN_TOKEN = 318,
+     PARS_DECLARE_TOKEN = 319,
+     PARS_CURSOR_TOKEN = 320,
+     PARS_SQL_TOKEN = 321,
+     PARS_OPEN_TOKEN = 322,
+     PARS_FETCH_TOKEN = 323,
+     PARS_CLOSE_TOKEN = 324,
+     PARS_NOTFOUND_TOKEN = 325,
+     PARS_TO_CHAR_TOKEN = 326,
+     PARS_TO_NUMBER_TOKEN = 327,
+     PARS_TO_BINARY_TOKEN = 328,
+     PARS_BINARY_TO_NUMBER_TOKEN = 329,
+     PARS_SUBSTR_TOKEN = 330,
+     PARS_REPLSTR_TOKEN = 331,
+     PARS_CONCAT_TOKEN = 332,
+     PARS_INSTR_TOKEN = 333,
+     PARS_LENGTH_TOKEN = 334,
+     PARS_SYSDATE_TOKEN = 335,
+     PARS_PRINTF_TOKEN = 336,
+     PARS_ASSERT_TOKEN = 337,
+     PARS_RND_TOKEN = 338,
+     PARS_RND_STR_TOKEN = 339,
+     PARS_ROW_PRINTF_TOKEN = 340,
+     PARS_COMMIT_TOKEN = 341,
+     PARS_ROLLBACK_TOKEN = 342,
+     PARS_WORK_TOKEN = 343,
+     PARS_UNSIGNED_TOKEN = 344,
+     PARS_EXIT_TOKEN = 345,
+     PARS_FUNCTION_TOKEN = 346,
+     PARS_LOCK_TOKEN = 347,
+     PARS_SHARE_TOKEN = 348,
+     PARS_MODE_TOKEN = 349,
+     NEG = 350
+   };
+#endif
+#define PARS_INT_LIT 258
+#define PARS_FLOAT_LIT 259
+#define PARS_STR_LIT 260
+#define PARS_FIXBINARY_LIT 261
+#define PARS_BLOB_LIT 262
+#define PARS_NULL_LIT 263
+#define PARS_ID_TOKEN 264
+#define PARS_AND_TOKEN 265
+#define PARS_OR_TOKEN 266
+#define PARS_NOT_TOKEN 267
+#define PARS_GE_TOKEN 268
+#define PARS_LE_TOKEN 269
+#define PARS_NE_TOKEN 270
+#define PARS_PROCEDURE_TOKEN 271
+#define PARS_IN_TOKEN 272
+#define PARS_OUT_TOKEN 273
+#define PARS_BINARY_TOKEN 274
+#define PARS_BLOB_TOKEN 275
+#define PARS_INT_TOKEN 276
+#define PARS_INTEGER_TOKEN 277
+#define PARS_FLOAT_TOKEN 278
+#define PARS_CHAR_TOKEN 279
+#define PARS_IS_TOKEN 280
+#define PARS_BEGIN_TOKEN 281
+#define PARS_END_TOKEN 282
+#define PARS_IF_TOKEN 283
+#define PARS_THEN_TOKEN 284
+#define PARS_ELSE_TOKEN 285
+#define PARS_ELSIF_TOKEN 286
+#define PARS_LOOP_TOKEN 287
+#define PARS_WHILE_TOKEN 288
+#define PARS_RETURN_TOKEN 289
+#define PARS_SELECT_TOKEN 290
+#define PARS_SUM_TOKEN 291
+#define PARS_COUNT_TOKEN 292
+#define PARS_DISTINCT_TOKEN 293
+#define PARS_FROM_TOKEN 294
+#define PARS_WHERE_TOKEN 295
+#define PARS_FOR_TOKEN 296
+#define PARS_DDOT_TOKEN 297
+#define PARS_READ_TOKEN 298
+#define PARS_ORDER_TOKEN 299
+#define PARS_BY_TOKEN 300
+#define PARS_ASC_TOKEN 301
+#define PARS_DESC_TOKEN 302
+#define PARS_INSERT_TOKEN 303
+#define PARS_INTO_TOKEN 304
+#define PARS_VALUES_TOKEN 305
+#define PARS_UPDATE_TOKEN 306
+#define PARS_SET_TOKEN 307
+#define PARS_DELETE_TOKEN 308
+#define PARS_CURRENT_TOKEN 309
+#define PARS_OF_TOKEN 310
+#define PARS_CREATE_TOKEN 311
+#define PARS_TABLE_TOKEN 312
+#define PARS_INDEX_TOKEN 313
+#define PARS_UNIQUE_TOKEN 314
+#define PARS_CLUSTERED_TOKEN 315
+#define PARS_DOES_NOT_FIT_IN_MEM_TOKEN 316
+#define PARS_ON_TOKEN 317
+#define PARS_ASSIGN_TOKEN 318
+#define PARS_DECLARE_TOKEN 319
+#define PARS_CURSOR_TOKEN 320
+#define PARS_SQL_TOKEN 321
+#define PARS_OPEN_TOKEN 322
+#define PARS_FETCH_TOKEN 323
+#define PARS_CLOSE_TOKEN 324
+#define PARS_NOTFOUND_TOKEN 325
+#define PARS_TO_CHAR_TOKEN 326
+#define PARS_TO_NUMBER_TOKEN 327
+#define PARS_TO_BINARY_TOKEN 328
+#define PARS_BINARY_TO_NUMBER_TOKEN 329
+#define PARS_SUBSTR_TOKEN 330
+#define PARS_REPLSTR_TOKEN 331
+#define PARS_CONCAT_TOKEN 332
+#define PARS_INSTR_TOKEN 333
+#define PARS_LENGTH_TOKEN 334
+#define PARS_SYSDATE_TOKEN 335
+#define PARS_PRINTF_TOKEN 336
+#define PARS_ASSERT_TOKEN 337
+#define PARS_RND_TOKEN 338
+#define PARS_RND_STR_TOKEN 339
+#define PARS_ROW_PRINTF_TOKEN 340
+#define PARS_COMMIT_TOKEN 341
+#define PARS_ROLLBACK_TOKEN 342
+#define PARS_WORK_TOKEN 343
+#define PARS_UNSIGNED_TOKEN 344
+#define PARS_EXIT_TOKEN 345
+#define PARS_FUNCTION_TOKEN 346
+#define PARS_LOCK_TOKEN 347
+#define PARS_SHARE_TOKEN 348
+#define PARS_MODE_TOKEN 349
+#define NEG 350
+
+
+
+
+/* Copy the first part of user declarations.  */
+#line 13 "pars0grm.y"
+
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h>				/* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+
+
+/* Enabling traces.  */
+#ifndef YYDEBUG
+# define YYDEBUG 0
+#endif
+
+/* Enabling verbose error messages.  */
+#ifdef YYERROR_VERBOSE
+# undef YYERROR_VERBOSE
+# define YYERROR_VERBOSE 1
+#else
+# define YYERROR_VERBOSE 0
+#endif
+
+#if ! defined (YYSTYPE) && ! defined (YYSTYPE_IS_DECLARED)
+typedef int YYSTYPE;
+# define yystype YYSTYPE /* obsolescent; will be withdrawn */
+# define YYSTYPE_IS_DECLARED 1
+# define YYSTYPE_IS_TRIVIAL 1
+#endif
+
+
+
+/* Copy the second part of user declarations.  */
+
+
+/* Line 213 of yacc.c.  */
+#line 297 "pars0grm.c"
+
+#if ! defined (yyoverflow) || YYERROR_VERBOSE
+
+# ifndef YYFREE
+#  define YYFREE free
+# endif
+# ifndef YYMALLOC
+#  define YYMALLOC malloc
+# endif
+
+/* The parser invokes alloca or malloc; define the necessary symbols.  */
+
+# ifdef YYSTACK_USE_ALLOCA
+#  if YYSTACK_USE_ALLOCA
+#   ifdef __GNUC__
+#    define YYSTACK_ALLOC __builtin_alloca
+#   else
+#    define YYSTACK_ALLOC alloca
+#   endif
+#  endif
+# endif
+
+# ifdef YYSTACK_ALLOC
+   /* Pacify GCC's `empty if-body' warning. */
+#  define YYSTACK_FREE(Ptr) do { /* empty */; } while (0)
+# else
+#  if defined (__STDC__) || defined (__cplusplus)
+#   include <stdlib.h> /* INFRINGES ON USER NAME SPACE */
+#   define YYSIZE_T size_t
+#  endif
+#  define YYSTACK_ALLOC YYMALLOC
+#  define YYSTACK_FREE YYFREE
+# endif
+#endif /* ! defined (yyoverflow) || YYERROR_VERBOSE */
+
+
+#if (! defined (yyoverflow) \
+     && (! defined (__cplusplus) \
+	 || (defined (YYSTYPE_IS_TRIVIAL) && YYSTYPE_IS_TRIVIAL)))
+
+/* A type that is properly aligned for any stack member.  */
+union yyalloc
+{
+  short int yyss;
+  YYSTYPE yyvs;
+  };
+
+/* The size of the maximum gap between one aligned stack and the next.  */
+# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1)
+
+/* The size of an array large enough to hold all stacks, each with
+   N elements.  */
+# define YYSTACK_BYTES(N) \
+     ((N) * (sizeof (short int) + sizeof (YYSTYPE))			\
+      + YYSTACK_GAP_MAXIMUM)
+
+/* Copy COUNT objects from FROM to TO.  The source and destination do
+   not overlap.  */
+# ifndef YYCOPY
+#  if defined (__GNUC__) && 1 < __GNUC__
+#   define YYCOPY(To, From, Count) \
+      __builtin_memcpy (To, From, (Count) * sizeof (*(From)))
+#  else
+#   define YYCOPY(To, From, Count)		\
+      do					\
+	{					\
+	  register YYSIZE_T yyi;		\
+	  for (yyi = 0; yyi < (Count); yyi++)	\
+	    (To)[yyi] = (From)[yyi];		\
+	}					\
+      while (0)
+#  endif
+# endif
+
+/* Relocate STACK from its old location to the new one.  The
+   local variables YYSIZE and YYSTACKSIZE give the old and new number of
+   elements in the stack, and YYPTR gives the new location of the
+   stack.  Advance YYPTR to a properly aligned location for the next
+   stack.  */
+# define YYSTACK_RELOCATE(Stack)					\
+    do									\
+      {									\
+	YYSIZE_T yynewbytes;						\
+	YYCOPY (&yyptr->Stack, Stack, yysize);				\
+	Stack = &yyptr->Stack;						\
+	yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \
+	yyptr += yynewbytes / sizeof (*yyptr);				\
+      }									\
+    while (0)
+
+#endif
+
+#if defined (__STDC__) || defined (__cplusplus)
+   typedef signed char yysigned_char;
+#else
+   typedef short int yysigned_char;
+#endif
+
+/* YYFINAL -- State number of the termination state. */
+#define YYFINAL  5
+/* YYLAST -- Last index in YYTABLE.  */
+#define YYLAST   752
+
+/* YYNTOKENS -- Number of terminals. */
+#define YYNTOKENS  111
+/* YYNNTS -- Number of nonterminals. */
+#define YYNNTS  70
+/* YYNRULES -- Number of rules. */
+#define YYNRULES  175
+/* YYNSTATES -- Number of states. */
+#define YYNSTATES  339
+
+/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX.  */
+#define YYUNDEFTOK  2
+#define YYMAXUTOK   350
+
+#define YYTRANSLATE(YYX) 						\
+  ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK)
+
+/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX.  */
+static const unsigned char yytranslate[] =
+{
+       0,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,   103,     2,     2,
+     105,   106,   100,    99,   108,    98,     2,   101,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,   104,
+      96,    95,    97,   107,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,   109,     2,   110,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     1,     2,     3,     4,
+       5,     6,     7,     8,     9,    10,    11,    12,    13,    14,
+      15,    16,    17,    18,    19,    20,    21,    22,    23,    24,
+      25,    26,    27,    28,    29,    30,    31,    32,    33,    34,
+      35,    36,    37,    38,    39,    40,    41,    42,    43,    44,
+      45,    46,    47,    48,    49,    50,    51,    52,    53,    54,
+      55,    56,    57,    58,    59,    60,    61,    62,    63,    64,
+      65,    66,    67,    68,    69,    70,    71,    72,    73,    74,
+      75,    76,    77,    78,    79,    80,    81,    82,    83,    84,
+      85,    86,    87,    88,    89,    90,    91,    92,    93,    94,
+     102
+};
+
+#if YYDEBUG
+/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in
+   YYRHS.  */
+static const unsigned short int yyprhs[] =
+{
+       0,     0,     3,     6,     8,    11,    14,    17,    20,    23,
+      26,    29,    32,    35,    38,    41,    44,    47,    50,    53,
+      56,    59,    62,    65,    68,    71,    73,    76,    78,    83,
+      85,    87,    89,    91,    93,    95,    97,   101,   105,   109,
+     113,   116,   120,   124,   128,   132,   136,   140,   144,   148,
+     152,   155,   159,   163,   165,   167,   169,   171,   173,   175,
+     177,   179,   181,   183,   185,   186,   188,   192,   199,   204,
+     206,   208,   210,   214,   216,   220,   221,   223,   227,   228,
+     230,   234,   236,   241,   247,   252,   253,   255,   259,   261,
+     265,   267,   268,   271,   272,   275,   276,   281,   282,   284,
+     286,   287,   292,   301,   305,   311,   314,   318,   320,   324,
+     329,   334,   337,   340,   344,   347,   350,   353,   357,   362,
+     364,   367,   368,   371,   373,   381,   388,   399,   401,   403,
+     406,   409,   414,   419,   425,   427,   431,   432,   436,   437,
+     439,   440,   443,   444,   446,   454,   456,   460,   461,   463,
+     464,   466,   477,   480,   483,   485,   487,   489,   491,   493,
+     497,   501,   502,   504,   508,   512,   513,   515,   518,   525,
+     530,   532,   534,   535,   537,   540
+};
+
+/* YYRHS -- A `-1'-separated list of the rules' RHS. */
+static const short int yyrhs[] =
+{
+     112,     0,    -1,   180,   104,    -1,   118,    -1,   119,   104,
+      -1,   151,   104,    -1,   152,   104,    -1,   153,   104,    -1,
+     150,   104,    -1,   154,   104,    -1,   146,   104,    -1,   133,
+     104,    -1,   135,   104,    -1,   145,   104,    -1,   143,   104,
+      -1,   144,   104,    -1,   140,   104,    -1,   141,   104,    -1,
+     155,   104,    -1,   157,   104,    -1,   156,   104,    -1,   169,
+     104,    -1,   170,   104,    -1,   164,   104,    -1,   168,   104,
+      -1,   113,    -1,   114,   113,    -1,     9,    -1,   116,   105,
+     124,   106,    -1,     3,    -1,     4,    -1,     5,    -1,     6,
+      -1,     7,    -1,     8,    -1,    66,    -1,   115,    99,   115,
+      -1,   115,    98,   115,    -1,   115,   100,   115,    -1,   115,
+     101,   115,    -1,    98,   115,    -1,   105,   115,   106,    -1,
+     115,    95,   115,    -1,   115,    96,   115,    -1,   115,    97,
+     115,    -1,   115,    13,   115,    -1,   115,    14,   115,    -1,
+     115,    15,   115,    -1,   115,    10,   115,    -1,   115,    11,
+     115,    -1,    12,   115,    -1,     9,   103,    70,    -1,    66,
+     103,    70,    -1,    71,    -1,    72,    -1,    73,    -1,    74,
+      -1,    75,    -1,    77,    -1,    78,    -1,    79,    -1,    80,
+      -1,    83,    -1,    84,    -1,    -1,   107,    -1,   117,   108,
+     107,    -1,   109,     9,   105,   117,   106,   110,    -1,   120,
+     105,   124,   106,    -1,    76,    -1,    81,    -1,    82,    -1,
+       9,   105,   106,    -1,     9,    -1,   122,   108,     9,    -1,
+      -1,     9,    -1,   123,   108,     9,    -1,    -1,   115,    -1,
+     124,   108,   115,    -1,   115,    -1,    37,   105,   100,   106,
+      -1,    37,   105,    38,     9,   106,    -1,    36,   105,   115,
+     106,    -1,    -1,   125,    -1,   126,   108,   125,    -1,   100,
+      -1,   126,    49,   123,    -1,   126,    -1,    -1,    40,   115,
+      -1,    -1,    41,    51,    -1,    -1,    92,    17,    93,    94,
+      -1,    -1,    46,    -1,    47,    -1,    -1,    44,    45,     9,
+     131,    -1,    35,   127,    39,   122,   128,   129,   130,   132,
+      -1,    48,    49,     9,    -1,   134,    50,   105,   124,   106,
+      -1,   134,   133,    -1,     9,    95,   115,    -1,   136,    -1,
+     137,   108,   136,    -1,    40,    54,    55,     9,    -1,    51,
+       9,    52,   137,    -1,   139,   128,    -1,   139,   138,    -1,
+      53,    39,     9,    -1,   142,   128,    -1,   142,   138,    -1,
+      85,   133,    -1,     9,    63,   115,    -1,    31,   115,    29,
+     114,    -1,   147,    -1,   148,   147,    -1,    -1,    30,   114,
+      -1,   148,    -1,    28,   115,    29,   114,   149,    27,    28,
+      -1,    33,   115,    32,   114,    27,    32,    -1,    41,     9,
+      17,   115,    42,   115,    32,   114,    27,    32,    -1,    90,
+      -1,    34,    -1,    67,     9,    -1,    69,     9,    -1,    68,
+       9,    49,   123,    -1,    68,     9,    49,   121,    -1,     9,
+     171,   160,   161,   162,    -1,   158,    -1,   159,   108,   158,
+      -1,    -1,   105,     3,   106,    -1,    -1,    89,    -1,    -1,
+      12,     8,    -1,    -1,    61,    -1,    56,    57,     9,   105,
+     159,   106,   163,    -1,     9,    -1,   165,   108,     9,    -1,
+      -1,    59,    -1,    -1,    60,    -1,    56,   166,   167,    58,
+       9,    62,     9,   105,   165,   106,    -1,    86,    88,    -1,
+      87,    88,    -1,    21,    -1,    22,    -1,    24,    -1,    19,
+      -1,    20,    -1,     9,    17,   171,    -1,     9,    18,   171,
+      -1,    -1,   172,    -1,   173,   108,   172,    -1,     9,   171,
+     104,    -1,    -1,   174,    -1,   175,   174,    -1,    64,    65,
+       9,    25,   133,   104,    -1,    64,    91,     9,   104,    -1,
+     176,    -1,   177,    -1,    -1,   178,    -1,   179,   178,    -1,
+      16,     9,   105,   173,   106,    25,   175,   179,    26,   114,
+      27,    -1
+};
+
+/* YYRLINE[YYN] -- source line where rule number YYN was defined.  */
+static const unsigned short int yyrline[] =
+{
+       0,   138,   138,   141,   142,   143,   144,   145,   146,   147,
+     148,   149,   150,   151,   152,   153,   154,   155,   156,   157,
+     158,   159,   160,   161,   162,   166,   167,   172,   173,   175,
+     176,   177,   178,   179,   180,   181,   182,   183,   184,   185,
+     186,   187,   188,   189,   190,   191,   192,   193,   194,   195,
+     196,   197,   199,   204,   205,   206,   207,   209,   210,   211,
+     212,   213,   214,   215,   218,   220,   221,   225,   230,   235,
+     236,   237,   241,   245,   246,   251,   252,   253,   258,   259,
+     260,   264,   265,   270,   276,   283,   284,   285,   290,   292,
+     294,   298,   299,   303,   304,   309,   310,   315,   316,   317,
+     321,   322,   327,   337,   342,   344,   349,   353,   354,   359,
+     365,   372,   377,   382,   388,   393,   398,   403,   408,   414,
+     415,   420,   421,   423,   427,   434,   440,   448,   452,   456,
+     462,   468,   470,   475,   480,   481,   486,   487,   492,   493,
+     499,   500,   506,   507,   513,   519,   520,   525,   526,   530,
+     531,   535,   543,   548,   553,   554,   555,   556,   557,   561,
+     564,   570,   571,   572,   577,   581,   583,   584,   588,   594,
+     599,   600,   603,   605,   606,   610
+};
+#endif
+
+#if YYDEBUG || YYERROR_VERBOSE
+/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM.
+   First, the terminals, then, starting at YYNTOKENS, nonterminals. */
+static const char *const yytname[] =
+{
+  "$end", "error", "$undefined", "PARS_INT_LIT", "PARS_FLOAT_LIT",
+  "PARS_STR_LIT", "PARS_FIXBINARY_LIT", "PARS_BLOB_LIT", "PARS_NULL_LIT",
+  "PARS_ID_TOKEN", "PARS_AND_TOKEN", "PARS_OR_TOKEN", "PARS_NOT_TOKEN",
+  "PARS_GE_TOKEN", "PARS_LE_TOKEN", "PARS_NE_TOKEN",
+  "PARS_PROCEDURE_TOKEN", "PARS_IN_TOKEN", "PARS_OUT_TOKEN",
+  "PARS_BINARY_TOKEN", "PARS_BLOB_TOKEN", "PARS_INT_TOKEN",
+  "PARS_INTEGER_TOKEN", "PARS_FLOAT_TOKEN", "PARS_CHAR_TOKEN",
+  "PARS_IS_TOKEN", "PARS_BEGIN_TOKEN", "PARS_END_TOKEN", "PARS_IF_TOKEN",
+  "PARS_THEN_TOKEN", "PARS_ELSE_TOKEN", "PARS_ELSIF_TOKEN",
+  "PARS_LOOP_TOKEN", "PARS_WHILE_TOKEN", "PARS_RETURN_TOKEN",
+  "PARS_SELECT_TOKEN", "PARS_SUM_TOKEN", "PARS_COUNT_TOKEN",
+  "PARS_DISTINCT_TOKEN", "PARS_FROM_TOKEN", "PARS_WHERE_TOKEN",
+  "PARS_FOR_TOKEN", "PARS_DDOT_TOKEN", "PARS_READ_TOKEN",
+  "PARS_ORDER_TOKEN", "PARS_BY_TOKEN", "PARS_ASC_TOKEN", "PARS_DESC_TOKEN",
+  "PARS_INSERT_TOKEN", "PARS_INTO_TOKEN", "PARS_VALUES_TOKEN",
+  "PARS_UPDATE_TOKEN", "PARS_SET_TOKEN", "PARS_DELETE_TOKEN",
+  "PARS_CURRENT_TOKEN", "PARS_OF_TOKEN", "PARS_CREATE_TOKEN",
+  "PARS_TABLE_TOKEN", "PARS_INDEX_TOKEN", "PARS_UNIQUE_TOKEN",
+  "PARS_CLUSTERED_TOKEN", "PARS_DOES_NOT_FIT_IN_MEM_TOKEN",
+  "PARS_ON_TOKEN", "PARS_ASSIGN_TOKEN", "PARS_DECLARE_TOKEN",
+  "PARS_CURSOR_TOKEN", "PARS_SQL_TOKEN", "PARS_OPEN_TOKEN",
+  "PARS_FETCH_TOKEN", "PARS_CLOSE_TOKEN", "PARS_NOTFOUND_TOKEN",
+  "PARS_TO_CHAR_TOKEN", "PARS_TO_NUMBER_TOKEN", "PARS_TO_BINARY_TOKEN",
+  "PARS_BINARY_TO_NUMBER_TOKEN", "PARS_SUBSTR_TOKEN", "PARS_REPLSTR_TOKEN",
+  "PARS_CONCAT_TOKEN", "PARS_INSTR_TOKEN", "PARS_LENGTH_TOKEN",
+  "PARS_SYSDATE_TOKEN", "PARS_PRINTF_TOKEN", "PARS_ASSERT_TOKEN",
+  "PARS_RND_TOKEN", "PARS_RND_STR_TOKEN", "PARS_ROW_PRINTF_TOKEN",
+  "PARS_COMMIT_TOKEN", "PARS_ROLLBACK_TOKEN", "PARS_WORK_TOKEN",
+  "PARS_UNSIGNED_TOKEN", "PARS_EXIT_TOKEN", "PARS_FUNCTION_TOKEN",
+  "PARS_LOCK_TOKEN", "PARS_SHARE_TOKEN", "PARS_MODE_TOKEN", "'='", "'<'",
+  "'>'", "'-'", "'+'", "'*'", "'/'", "NEG", "'%'", "';'", "'('", "')'",
+  "'?'", "','", "'{'", "'}'", "$accept", "top_statement", "statement",
+  "statement_list", "exp", "function_name", "question_mark_list",
+  "stored_procedure_call", "predefined_procedure_call",
+  "predefined_procedure_name", "user_function_call", "table_list",
+  "variable_list", "exp_list", "select_item", "select_item_list",
+  "select_list", "search_condition", "for_update_clause",
+  "lock_shared_clause", "order_direction", "order_by_clause",
+  "select_statement", "insert_statement_start", "insert_statement",
+  "column_assignment", "column_assignment_list", "cursor_positioned",
+  "update_statement_start", "update_statement_searched",
+  "update_statement_positioned", "delete_statement_start",
+  "delete_statement_searched", "delete_statement_positioned",
+  "row_printf_statement", "assignment_statement", "elsif_element",
+  "elsif_list", "else_part", "if_statement", "while_statement",
+  "for_statement", "exit_statement", "return_statement",
+  "open_cursor_statement", "close_cursor_statement", "fetch_statement",
+  "column_def", "column_def_list", "opt_column_len", "opt_unsigned",
+  "opt_not_null", "not_fit_in_memory", "create_table", "column_list",
+  "unique_def", "clustered_def", "create_index", "commit_statement",
+  "rollback_statement", "type_name", "parameter_declaration",
+  "parameter_declaration_list", "variable_declaration",
+  "variable_declaration_list", "cursor_declaration",
+  "function_declaration", "declaration", "declaration_list",
+  "procedure_definition", 0
+};
+#endif
+
+# ifdef YYPRINT
+/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to
+   token YYLEX-NUM.  */
+static const unsigned short int yytoknum[] =
+{
+       0,   256,   257,   258,   259,   260,   261,   262,   263,   264,
+     265,   266,   267,   268,   269,   270,   271,   272,   273,   274,
+     275,   276,   277,   278,   279,   280,   281,   282,   283,   284,
+     285,   286,   287,   288,   289,   290,   291,   292,   293,   294,
+     295,   296,   297,   298,   299,   300,   301,   302,   303,   304,
+     305,   306,   307,   308,   309,   310,   311,   312,   313,   314,
+     315,   316,   317,   318,   319,   320,   321,   322,   323,   324,
+     325,   326,   327,   328,   329,   330,   331,   332,   333,   334,
+     335,   336,   337,   338,   339,   340,   341,   342,   343,   344,
+     345,   346,   347,   348,   349,    61,    60,    62,    45,    43,
+      42,    47,   350,    37,    59,    40,    41,    63,    44,   123,
+     125
+};
+# endif
+
+/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives.  */
+static const unsigned char yyr1[] =
+{
+       0,   111,   112,   113,   113,   113,   113,   113,   113,   113,
+     113,   113,   113,   113,   113,   113,   113,   113,   113,   113,
+     113,   113,   113,   113,   113,   114,   114,   115,   115,   115,
+     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
+     115,   115,   115,   115,   115,   115,   115,   115,   115,   115,
+     115,   115,   115,   116,   116,   116,   116,   116,   116,   116,
+     116,   116,   116,   116,   117,   117,   117,   118,   119,   120,
+     120,   120,   121,   122,   122,   123,   123,   123,   124,   124,
+     124,   125,   125,   125,   125,   126,   126,   126,   127,   127,
+     127,   128,   128,   129,   129,   130,   130,   131,   131,   131,
+     132,   132,   133,   134,   135,   135,   136,   137,   137,   138,
+     139,   140,   141,   142,   143,   144,   145,   146,   147,   148,
+     148,   149,   149,   149,   150,   151,   152,   153,   154,   155,
+     156,   157,   157,   158,   159,   159,   160,   160,   161,   161,
+     162,   162,   163,   163,   164,   165,   165,   166,   166,   167,
+     167,   168,   169,   170,   171,   171,   171,   171,   171,   172,
+     172,   173,   173,   173,   174,   175,   175,   175,   176,   177,
+     178,   178,   179,   179,   179,   180
+};
+
+/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN.  */
+static const unsigned char yyr2[] =
+{
+       0,     2,     2,     1,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     2,     2,     2,     2,     2,
+       2,     2,     2,     2,     2,     1,     2,     1,     4,     1,
+       1,     1,     1,     1,     1,     1,     3,     3,     3,     3,
+       2,     3,     3,     3,     3,     3,     3,     3,     3,     3,
+       2,     3,     3,     1,     1,     1,     1,     1,     1,     1,
+       1,     1,     1,     1,     0,     1,     3,     6,     4,     1,
+       1,     1,     3,     1,     3,     0,     1,     3,     0,     1,
+       3,     1,     4,     5,     4,     0,     1,     3,     1,     3,
+       1,     0,     2,     0,     2,     0,     4,     0,     1,     1,
+       0,     4,     8,     3,     5,     2,     3,     1,     3,     4,
+       4,     2,     2,     3,     2,     2,     2,     3,     4,     1,
+       2,     0,     2,     1,     7,     6,    10,     1,     1,     2,
+       2,     4,     4,     5,     1,     3,     0,     3,     0,     1,
+       0,     2,     0,     1,     7,     1,     3,     0,     1,     0,
+       1,    10,     2,     2,     1,     1,     1,     1,     1,     3,
+       3,     0,     1,     3,     3,     0,     1,     2,     6,     4,
+       1,     1,     0,     1,     2,    11
+};
+
+/* YYDEFACT[STATE-NAME] -- Default rule to reduce with in state
+   STATE-NUM when YYTABLE doesn't specify something else to do.  Zero
+   means the default is an error.  */
+static const unsigned char yydefact[] =
+{
+       0,     0,     0,     0,     0,     1,     2,   161,     0,   162,
+       0,     0,     0,     0,     0,   157,   158,   154,   155,   156,
+     159,   160,   165,   163,     0,   166,   172,     0,     0,   167,
+     170,   171,   173,     0,   164,     0,     0,     0,   174,     0,
+       0,     0,     0,     0,   128,    85,     0,     0,     0,     0,
+     147,     0,     0,     0,    69,    70,    71,     0,     0,     0,
+     127,     0,    25,     0,     3,     0,     0,     0,     0,     0,
+      91,     0,     0,    91,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,   169,     0,    29,    30,    31,    32,    33,    34,    27,
+       0,    35,    53,    54,    55,    56,    57,    58,    59,    60,
+      61,    62,    63,     0,     0,     0,     0,     0,     0,     0,
+      88,    81,    86,    90,     0,     0,     0,     0,     0,     0,
+     148,   149,   129,     0,   130,   116,   152,   153,     0,   175,
+      26,     4,    78,    11,     0,   105,    12,     0,   111,   112,
+      16,    17,   114,   115,    14,    15,    13,    10,     8,     5,
+       6,     7,     9,    18,    20,    19,    23,    24,    21,    22,
+       0,   117,     0,    50,     0,    40,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+      78,     0,     0,     0,    75,     0,     0,     0,   103,     0,
+     113,     0,   150,     0,    75,    64,    79,     0,    78,     0,
+      92,   168,    51,    52,    41,    48,    49,    45,    46,    47,
+     121,    42,    43,    44,    37,    36,    38,    39,     0,     0,
+       0,     0,     0,    76,    89,    87,    73,    91,     0,     0,
+     107,   110,     0,     0,    76,   132,   131,    65,     0,    68,
+       0,     0,     0,     0,     0,   119,   123,     0,    28,     0,
+      84,     0,    82,     0,     0,     0,    93,     0,     0,     0,
+       0,   134,     0,     0,     0,     0,     0,    80,   104,   109,
+     122,     0,   120,     0,   125,    83,    77,    74,     0,    95,
+       0,   106,   108,   136,   142,     0,     0,    72,    67,    66,
+       0,   124,    94,     0,   100,     0,     0,   138,   143,   144,
+     135,     0,   118,     0,     0,   102,     0,     0,   139,   140,
+       0,     0,     0,     0,   137,     0,   133,   145,     0,    96,
+      97,   126,   141,   151,     0,    98,    99,   101,   146
+};
+
+/* YYDEFGOTO[NTERM-NUM]. */
+static const short int yydefgoto[] =
+{
+      -1,     2,    62,    63,   206,   116,   248,    64,    65,    66,
+     245,   237,   234,   207,   122,   123,   124,   148,   289,   304,
+     337,   315,    67,    68,    69,   240,   241,   149,    70,    71,
+      72,    73,    74,    75,    76,    77,   255,   256,   257,    78,
+      79,    80,    81,    82,    83,    84,    85,   271,   272,   307,
+     319,   326,   309,    86,   328,   131,   203,    87,    88,    89,
+      20,     9,    10,    25,    26,    30,    31,    32,    33,     3
+};
+
+/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing
+   STATE-NUM.  */
+#define YYPACT_NINF -177
+static const short int yypact[] =
+{
+      28,    38,    54,   -46,   -29,  -177,  -177,    56,    50,  -177,
+     -75,     8,     8,    46,    56,  -177,  -177,  -177,  -177,  -177,
+    -177,  -177,    63,  -177,     8,  -177,     2,   -26,   -51,  -177,
+    -177,  -177,  -177,   -13,  -177,    71,    72,   587,  -177,    57,
+     -21,    26,   272,   272,  -177,    13,    91,    55,    96,    67,
+     -22,    99,   100,   103,  -177,  -177,  -177,    75,    29,    35,
+    -177,   116,  -177,   396,  -177,    22,    23,    27,    -9,    30,
+      87,    31,    32,    87,    47,    49,    52,    58,    59,    60,
+      61,    62,    65,    66,    74,    77,    78,    86,    89,   102,
+      75,  -177,   272,  -177,  -177,  -177,  -177,  -177,  -177,    39,
+     272,    51,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
+    -177,  -177,  -177,   272,   272,   361,    25,   489,    45,    90,
+    -177,   651,  -177,   -39,    93,   142,   124,   108,   152,   170,
+    -177,   131,  -177,   143,  -177,  -177,  -177,  -177,    98,  -177,
+    -177,  -177,   272,  -177,   110,  -177,  -177,   256,  -177,  -177,
+    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
+    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
+     112,   651,   137,   101,   147,   204,    88,   272,   272,   272,
+     272,   272,   587,   272,   272,   272,   272,   272,   272,   272,
+     272,   587,   272,   -30,   211,   168,   212,   272,  -177,   213,
+    -177,   118,  -177,   167,   217,   122,   651,   -63,   272,   175,
+     651,  -177,  -177,  -177,  -177,   101,   101,    21,    21,   651,
+     332,    21,    21,    21,    -6,    -6,   204,   204,   -60,   460,
+     198,   222,   126,  -177,   125,  -177,  -177,   -33,   584,   140,
+    -177,   128,   228,   229,   139,  -177,   125,  -177,   -53,  -177,
+     272,   -49,   240,   587,   272,  -177,   224,   226,  -177,   225,
+    -177,   150,  -177,   258,   272,   260,   230,   272,   272,   213,
+       8,  -177,   -45,   208,   166,   164,   176,   651,  -177,  -177,
+     587,   631,  -177,   254,  -177,  -177,  -177,  -177,   234,   194,
+     638,   651,  -177,   182,   227,   228,   280,  -177,  -177,  -177,
+     587,  -177,  -177,   273,   247,   587,   289,   214,  -177,  -177,
+    -177,   195,   587,   209,   261,  -177,   524,   199,  -177,   295,
+     292,   215,   299,   279,  -177,   304,  -177,  -177,   -44,  -177,
+      -8,  -177,  -177,  -177,   305,  -177,  -177,  -177,  -177
+};
+
+/* YYPGOTO[NTERM-NUM].  */
+static const short int yypgoto[] =
+{
+    -177,  -177,   -62,  -176,   -40,  -177,  -177,  -177,  -177,  -177,
+    -177,  -177,   109,  -166,   120,  -177,  -177,   -69,  -177,  -177,
+    -177,  -177,   -34,  -177,  -177,    48,  -177,   243,  -177,  -177,
+    -177,  -177,  -177,  -177,  -177,  -177,    64,  -177,  -177,  -177,
+    -177,  -177,  -177,  -177,  -177,  -177,  -177,    24,  -177,  -177,
+    -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,  -177,
+     -12,   307,  -177,   297,  -177,  -177,  -177,   285,  -177,  -177
+};
+
+/* YYTABLE[YYPACT[STATE-NUM]].  What to do in state STATE-NUM.  If
+   positive, shift that token.  If negative, reduce the rule which
+   number is the opposite.  If zero, do what YYDEFACT says.
+   If YYTABLE_NINF, syntax error.  */
+#define YYTABLE_NINF -1
+static const unsigned short int yytable[] =
+{
+      21,   140,   115,   117,   152,   121,   220,   264,   231,   181,
+     194,    24,    27,    37,    35,   229,    93,    94,    95,    96,
+      97,    98,    99,   135,   228,   100,    45,    15,    16,    17,
+      18,    13,    19,    14,   145,   129,   181,   130,   335,   336,
+      36,   144,   251,   249,     1,   250,   258,     4,   250,   118,
+     119,    28,   171,   275,     5,   276,   170,   278,     6,   250,
+     173,   294,   333,   295,   334,     8,    28,    11,    12,   195,
+     232,    22,    24,   175,   176,   265,     7,   280,    34,   101,
+      39,    40,    90,    91,   102,   103,   104,   105,   106,    92,
+     107,   108,   109,   110,   188,   189,   111,   112,   177,   178,
+     125,   179,   180,   181,   126,   127,   128,   210,   132,   133,
+      45,   113,   134,   120,   179,   180,   181,   136,   114,   186,
+     187,   188,   189,   137,   312,   138,   141,   147,   142,   316,
+     190,   143,   196,   198,   146,   150,   151,   215,   216,   217,
+     218,   219,   172,   221,   222,   223,   224,   225,   226,   227,
+     192,   154,   230,   155,   174,   121,   156,   238,   140,   197,
+     199,   200,   157,   158,   159,   160,   161,   140,   266,   162,
+     163,    93,    94,    95,    96,    97,    98,    99,   164,   201,
+     100,   165,   166,   183,   184,   185,   186,   187,   188,   189,
+     167,   202,   204,   168,   214,   193,   183,   184,   185,   186,
+     187,   188,   189,   205,   118,   119,   169,   212,   177,   178,
+     277,   179,   180,   181,   281,   208,   211,   213,   140,   181,
+     233,   236,   239,   242,   210,   243,   244,   290,   291,   247,
+     252,   261,   262,   263,   101,   268,   269,   270,   273,   102,
+     103,   104,   105,   106,   274,   107,   108,   109,   110,   279,
+     140,   111,   112,   283,   140,   254,   285,   284,   293,    93,
+      94,    95,    96,    97,    98,    99,   113,   286,   100,   287,
+     296,   288,   297,   114,   298,    93,    94,    95,    96,    97,
+      98,    99,   301,   299,   100,   302,   303,   306,   308,   311,
+     313,   314,   317,   183,   184,   185,   186,   187,   188,   189,
+     320,   327,   321,   318,   260,   324,   322,   325,   330,   329,
+     209,   331,   332,   246,   338,   235,   153,   292,    38,   310,
+     282,    23,   101,    29,     0,     0,     0,   102,   103,   104,
+     105,   106,     0,   107,   108,   109,   110,     0,   101,   111,
+     112,    41,     0,   102,   103,   104,   105,   106,     0,   107,
+     108,   109,   110,     0,   113,   111,   112,     0,     0,     0,
+      42,   114,   253,   254,     0,    43,    44,    45,     0,     0,
+     113,   177,   178,    46,   179,   180,   181,   114,     0,     0,
+      47,     0,     0,    48,     0,    49,     0,     0,    50,     0,
+     182,     0,     0,     0,     0,     0,     0,     0,     0,    51,
+      52,    53,     0,     0,     0,    41,     0,     0,    54,     0,
+       0,     0,     0,    55,    56,     0,     0,    57,    58,    59,
+       0,     0,    60,   139,    42,     0,     0,     0,     0,    43,
+      44,    45,     0,     0,     0,     0,     0,    46,     0,     0,
+       0,    61,     0,     0,    47,     0,     0,    48,     0,    49,
+       0,     0,    50,     0,     0,     0,   183,   184,   185,   186,
+     187,   188,   189,    51,    52,    53,     0,     0,     0,    41,
+       0,     0,    54,     0,     0,     0,     0,    55,    56,     0,
+       0,    57,    58,    59,     0,     0,    60,   259,    42,     0,
+       0,     0,     0,    43,    44,    45,     0,     0,     0,   177,
+     178,    46,   179,   180,   181,    61,     0,     0,    47,     0,
+       0,    48,     0,    49,     0,     0,    50,     0,     0,     0,
+       0,   191,     0,     0,     0,     0,     0,    51,    52,    53,
+       0,     0,     0,    41,     0,     0,    54,     0,     0,     0,
+       0,    55,    56,     0,     0,    57,    58,    59,     0,     0,
+      60,   323,    42,     0,     0,     0,     0,    43,    44,    45,
+       0,     0,     0,     0,     0,    46,     0,     0,     0,    61,
+       0,     0,    47,     0,     0,    48,     0,    49,     0,     0,
+      50,     0,     0,     0,   183,   184,   185,   186,   187,   188,
+     189,    51,    52,    53,   177,   178,    41,   179,   180,   181,
+      54,     0,     0,     0,     0,    55,    56,     0,     0,    57,
+      58,    59,     0,     0,    60,    42,     0,     0,     0,     0,
+      43,    44,    45,     0,     0,     0,   267,     0,    46,     0,
+       0,     0,     0,    61,     0,    47,     0,     0,    48,     0,
+      49,   177,   178,    50,   179,   180,   181,     0,   177,   178,
+       0,   179,   180,   181,    51,    52,    53,     0,     0,     0,
+     300,   177,   178,    54,   179,   180,   181,     0,    55,    56,
+     305,     0,    57,    58,    59,     0,     0,    60,     0,   183,
+     184,   185,   186,   187,   188,   189,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,    61,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
+       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
+     187,   188,   189,   183,   184,   185,   186,   187,   188,   189,
+       0,     0,     0,     0,     0,     0,   183,   184,   185,   186,
+     187,   188,   189
+};
+
+static const short int yycheck[] =
+{
+      12,    63,    42,    43,    73,    45,   182,    40,    38,    15,
+      49,     9,    24,    26,    65,   191,     3,     4,     5,     6,
+       7,     8,     9,    57,   190,    12,    35,    19,    20,    21,
+      22,   106,    24,   108,    68,    57,    15,    59,    46,    47,
+      91,    50,   208,   106,    16,   108,   106,     9,   108,    36,
+      37,    64,    92,   106,     0,   108,    90,   106,   104,   108,
+     100,   106,   106,   108,   108,     9,    64,    17,    18,   108,
+     100,    25,     9,   113,   114,   108,   105,   253,   104,    66,
+       9,     9,    25,   104,    71,    72,    73,    74,    75,    63,
+      77,    78,    79,    80,   100,   101,    83,    84,    10,    11,
+       9,    13,    14,    15,    49,     9,    39,   147,     9,     9,
+      35,    98,     9,   100,    13,    14,    15,    88,   105,    98,
+      99,   100,   101,    88,   300,     9,   104,    40,   105,   305,
+     105,   104,    39,     9,   104,   104,   104,   177,   178,   179,
+     180,   181,   103,   183,   184,   185,   186,   187,   188,   189,
+     105,   104,   192,   104,   103,   195,   104,   197,   220,    17,
+      52,     9,   104,   104,   104,   104,   104,   229,   237,   104,
+     104,     3,     4,     5,     6,     7,     8,     9,   104,     9,
+      12,   104,   104,    95,    96,    97,    98,    99,   100,   101,
+     104,    60,    49,   104,   106,   105,    95,    96,    97,    98,
+      99,   100,   101,   105,    36,    37,   104,    70,    10,    11,
+     250,    13,    14,    15,   254,   105,   104,    70,   280,    15,
+       9,     9,     9,   105,   264,    58,     9,   267,   268,   107,
+      55,     9,   106,   108,    66,    95,   108,     9,     9,    71,
+      72,    73,    74,    75,   105,    77,    78,    79,    80,     9,
+     312,    83,    84,    27,   316,    31,   106,    32,   270,     3,
+       4,     5,     6,     7,     8,     9,    98,     9,    12,     9,
+      62,    41,   106,   105,   110,     3,     4,     5,     6,     7,
+       8,     9,    28,   107,    12,    51,    92,   105,    61,     9,
+      17,    44,     3,    95,    96,    97,    98,    99,   100,   101,
+     105,     9,    93,    89,   106,   106,    45,    12,     9,    94,
+      54,    32,     8,   204,     9,   195,    73,   269,    33,   295,
+     256,    14,    66,    26,    -1,    -1,    -1,    71,    72,    73,
+      74,    75,    -1,    77,    78,    79,    80,    -1,    66,    83,
+      84,     9,    -1,    71,    72,    73,    74,    75,    -1,    77,
+      78,    79,    80,    -1,    98,    83,    84,    -1,    -1,    -1,
+      28,   105,    30,    31,    -1,    33,    34,    35,    -1,    -1,
+      98,    10,    11,    41,    13,    14,    15,   105,    -1,    -1,
+      48,    -1,    -1,    51,    -1,    53,    -1,    -1,    56,    -1,
+      29,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    67,
+      68,    69,    -1,    -1,    -1,     9,    -1,    -1,    76,    -1,
+      -1,    -1,    -1,    81,    82,    -1,    -1,    85,    86,    87,
+      -1,    -1,    90,    27,    28,    -1,    -1,    -1,    -1,    33,
+      34,    35,    -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,
+      -1,   109,    -1,    -1,    48,    -1,    -1,    51,    -1,    53,
+      -1,    -1,    56,    -1,    -1,    -1,    95,    96,    97,    98,
+      99,   100,   101,    67,    68,    69,    -1,    -1,    -1,     9,
+      -1,    -1,    76,    -1,    -1,    -1,    -1,    81,    82,    -1,
+      -1,    85,    86,    87,    -1,    -1,    90,    27,    28,    -1,
+      -1,    -1,    -1,    33,    34,    35,    -1,    -1,    -1,    10,
+      11,    41,    13,    14,    15,   109,    -1,    -1,    48,    -1,
+      -1,    51,    -1,    53,    -1,    -1,    56,    -1,    -1,    -1,
+      -1,    32,    -1,    -1,    -1,    -1,    -1,    67,    68,    69,
+      -1,    -1,    -1,     9,    -1,    -1,    76,    -1,    -1,    -1,
+      -1,    81,    82,    -1,    -1,    85,    86,    87,    -1,    -1,
+      90,    27,    28,    -1,    -1,    -1,    -1,    33,    34,    35,
+      -1,    -1,    -1,    -1,    -1,    41,    -1,    -1,    -1,   109,
+      -1,    -1,    48,    -1,    -1,    51,    -1,    53,    -1,    -1,
+      56,    -1,    -1,    -1,    95,    96,    97,    98,    99,   100,
+     101,    67,    68,    69,    10,    11,     9,    13,    14,    15,
+      76,    -1,    -1,    -1,    -1,    81,    82,    -1,    -1,    85,
+      86,    87,    -1,    -1,    90,    28,    -1,    -1,    -1,    -1,
+      33,    34,    35,    -1,    -1,    -1,    42,    -1,    41,    -1,
+      -1,    -1,    -1,   109,    -1,    48,    -1,    -1,    51,    -1,
+      53,    10,    11,    56,    13,    14,    15,    -1,    10,    11,
+      -1,    13,    14,    15,    67,    68,    69,    -1,    -1,    -1,
+      29,    10,    11,    76,    13,    14,    15,    -1,    81,    82,
+      32,    -1,    85,    86,    87,    -1,    -1,    90,    -1,    95,
+      96,    97,    98,    99,   100,   101,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,   109,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
+      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
+      99,   100,   101,    95,    96,    97,    98,    99,   100,   101,
+      -1,    -1,    -1,    -1,    -1,    -1,    95,    96,    97,    98,
+      99,   100,   101
+};
+
+/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing
+   symbol of state STATE-NUM.  */
+static const unsigned char yystos[] =
+{
+       0,    16,   112,   180,     9,     0,   104,   105,     9,   172,
+     173,    17,    18,   106,   108,    19,    20,    21,    22,    24,
+     171,   171,    25,   172,     9,   174,   175,   171,    64,   174,
+     176,   177,   178,   179,   104,    65,    91,    26,   178,     9,
+       9,     9,    28,    33,    34,    35,    41,    48,    51,    53,
+      56,    67,    68,    69,    76,    81,    82,    85,    86,    87,
+      90,   109,   113,   114,   118,   119,   120,   133,   134,   135,
+     139,   140,   141,   142,   143,   144,   145,   146,   150,   151,
+     152,   153,   154,   155,   156,   157,   164,   168,   169,   170,
+      25,   104,    63,     3,     4,     5,     6,     7,     8,     9,
+      12,    66,    71,    72,    73,    74,    75,    77,    78,    79,
+      80,    83,    84,    98,   105,   115,   116,   115,    36,    37,
+     100,   115,   125,   126,   127,     9,    49,     9,    39,    57,
+      59,   166,     9,     9,     9,   133,    88,    88,     9,    27,
+     113,   104,   105,   104,    50,   133,   104,    40,   128,   138,
+     104,   104,   128,   138,   104,   104,   104,   104,   104,   104,
+     104,   104,   104,   104,   104,   104,   104,   104,   104,   104,
+     133,   115,   103,   115,   103,   115,   115,    10,    11,    13,
+      14,    15,    29,    95,    96,    97,    98,    99,   100,   101,
+     105,    32,   105,   105,    49,   108,    39,    17,     9,    52,
+       9,     9,    60,   167,    49,   105,   115,   124,   105,    54,
+     115,   104,    70,    70,   106,   115,   115,   115,   115,   115,
+     114,   115,   115,   115,   115,   115,   115,   115,   124,   114,
+     115,    38,   100,     9,   123,   125,     9,   122,   115,     9,
+     136,   137,   105,    58,     9,   121,   123,   107,   117,   106,
+     108,   124,    55,    30,    31,   147,   148,   149,   106,    27,
+     106,     9,   106,   108,    40,   108,   128,    42,    95,   108,
+       9,   158,   159,     9,   105,   106,   108,   115,   106,     9,
+     114,   115,   147,    27,    32,   106,     9,     9,    41,   129,
+     115,   115,   136,   171,   106,   108,    62,   106,   110,   107,
+      29,    28,    51,    92,   130,    32,   105,   160,    61,   163,
+     158,     9,   114,    17,    44,   132,   114,     3,    89,   161,
+     105,    93,    45,    27,   106,    12,   162,     9,   165,    94,
+       9,    32,     8,   106,   108,    46,    47,   131,     9
+};
+
+#if ! defined (YYSIZE_T) && defined (__SIZE_TYPE__) /* 1st choice: __SIZE_TYPE__, when that macro is defined.  */
+# define YYSIZE_T __SIZE_TYPE__
+#endif
+#if ! defined (YYSIZE_T) && defined (size_t) /* 2nd choice: size_t, but only when size_t is visible as a macro.  */
+# define YYSIZE_T size_t
+#endif
+#if ! defined (YYSIZE_T) /* 3rd choice: size_t taken from <stddef.h>, on __STDC__ or C++ compilers only.  */
+# if defined (__STDC__) || defined (__cplusplus)
+#  include <stddef.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYSIZE_T size_t
+# endif
+#endif
+#if ! defined (YYSIZE_T) /* Last resort: none of the tests above defined YYSIZE_T.  */
+# define YYSIZE_T unsigned int
+#endif
+
+#define yyerrok		(yyerrstatus = 0)
+#define yyclearin	(yychar = YYEMPTY)
+#define YYEMPTY		(-2)
+#define YYEOF		0
+
+#define YYACCEPT	goto yyacceptlab
+#define YYABORT		goto yyabortlab
+#define YYERROR		goto yyerrorlab
+
+
+/* Like YYERROR except do call yyerror.  This remains here temporarily
+   to ease the transition to the new meaning of YYERROR, for GCC.
+   Once GCC version 2 has supplanted version 1, this can go.  */
+
+#define YYFAIL		goto yyerrlab
+
+#define YYRECOVERING()  (!!yyerrstatus)
+
+#define YYBACKUP(Token, Value)					\
+do								\
+  if (yychar == YYEMPTY && yylen == 1)				\
+    {								\
+      yychar = (Token);						\
+      yylval = (Value);						\
+      yytoken = YYTRANSLATE (yychar);				\
+      YYPOPSTACK;						\
+      goto yybackup;						\
+    }								\
+  else								\
+    { 								\
+      yyerror ("syntax error: cannot back up");\
+      YYERROR;							\
+    }								\
+while (0)
+
+
+#define YYTERROR	1
+#define YYERRCODE	256
+
+
+/* YYLLOC_DEFAULT -- Set CURRENT to span from RHS[1] to RHS[N].
+   If N is 0, then set CURRENT to the empty location which ends
+   the previous symbol: RHS[0] (always defined).  */
+
+#define YYRHSLOC(Rhs, K) ((Rhs)[K])
+#ifndef YYLLOC_DEFAULT
+# define YYLLOC_DEFAULT(Current, Rhs, N)				\
+    do									\
+      if (N)								\
+	{								\
+	  (Current).first_line   = YYRHSLOC (Rhs, 1).first_line;	\
+	  (Current).first_column = YYRHSLOC (Rhs, 1).first_column;	\
+	  (Current).last_line    = YYRHSLOC (Rhs, N).last_line;		\
+	  (Current).last_column  = YYRHSLOC (Rhs, N).last_column;	\
+	}								\
+      else								\
+	{								\
+	  (Current).first_line   = (Current).last_line   =		\
+	    YYRHSLOC (Rhs, 0).last_line;				\
+	  (Current).first_column = (Current).last_column =		\
+	    YYRHSLOC (Rhs, 0).last_column;				\
+	}								\
+    while (0)
+#endif
+
+
+/* YY_LOCATION_PRINT -- Print the location on the stream.
+   This macro was not mandated originally: define only if we know
+   we won't break user code: when these are the locations we know.  */
+
+#ifndef YY_LOCATION_PRINT
+# if YYLTYPE_IS_TRIVIAL
+#  define YY_LOCATION_PRINT(File, Loc)			\
+     fprintf (File, "%d.%d-%d.%d",			\
+              (Loc).first_line, (Loc).first_column,	\
+              (Loc).last_line,  (Loc).last_column)
+# else
+#  define YY_LOCATION_PRINT(File, Loc) ((void) 0)
+# endif
+#endif
+
+
+/* YYLEX -- calling `yylex' with the right arguments.  */
+
+#ifdef YYLEX_PARAM
+# define YYLEX yylex (YYLEX_PARAM)
+#else
+# define YYLEX yylex ()
+#endif
+
+/* Enable debugging if requested.  */
+#if YYDEBUG
+
+# ifndef YYFPRINTF
+#  include <stdio.h> /* INFRINGES ON USER NAME SPACE */
+#  define YYFPRINTF fprintf
+# endif
+
+# define YYDPRINTF(Args)			\
+do {						\
+  if (yydebug)					\
+    YYFPRINTF Args;				\
+} while (0)
+
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)		\
+do {								\
+  if (yydebug)							\
+    {								\
+      YYFPRINTF (stderr, "%s ", Title);				\
+      yysymprint (stderr, 					\
+                  Type, Value);	\
+      YYFPRINTF (stderr, "\n");					\
+    }								\
+} while (0)
+
+/*------------------------------------------------------------------.
+| yy_stack_print -- Print the state stack from its BOTTOM up to its |
+| TOP (included).                                                   |
+|                                                                   |
+| Writes "Stack now", then each entry as " %d", then a newline, to  |
+| stderr through YYFPRINTF.  BOTTOM and TOP point at the first and  |
+| last entries to print; if BOTTOM is already past TOP only the     |
+| header and the newline are printed.  Compiled only under YYDEBUG. |
+`------------------------------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yy_stack_print (short int *bottom, short int *top)
+#else
+static void
+yy_stack_print (bottom, top)
+    short int *bottom;
+    short int *top;
+#endif
+{
+  YYFPRINTF (stderr, "Stack now");
+  for (/* Nothing. */; bottom <= top; ++bottom) /* <= : the entry at TOP is printed too */
+    YYFPRINTF (stderr, " %d", *bottom);
+  YYFPRINTF (stderr, "\n");
+}
+
+# define YY_STACK_PRINT(Bottom, Top)				\
+do {								\
+  if (yydebug)							\
+    yy_stack_print ((Bottom), (Top));				\
+} while (0)
+
+
+/*------------------------------------------------.
+| Report that the YYRULE is going to be reduced.  |
+|                                                 |
+| Prints to stderr the value YYRULE - 1, the line |
+| yyrline[YYRULE], the yytname entry of every     |
+| symbol in yyrhs from index yyprhs[YYRULE] up to |
+| the first negative entry, and last the yytname  |
+| entry of yyr1[YYRULE].  Compiled under YYDEBUG. |
+`------------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yy_reduce_print (int yyrule)
+#else
+static void
+yy_reduce_print (yyrule)
+    int yyrule;
+#endif
+{
+  int yyi;
+  unsigned int yylno = yyrline[yyrule];
+  YYFPRINTF (stderr, "Reducing stack by rule %d (line %u), ",
+             yyrule - 1, yylno); /* NOTE(review): reports yyrule - 1, not yyrule; presumably discounts an internal rule -- confirm */
+  /* Print the symbols being reduced, and their result.  */
+  for (yyi = yyprhs[yyrule]; 0 <= yyrhs[yyi]; yyi++) /* a negative yyrhs entry ends the list */
+    YYFPRINTF (stderr, "%s ", yytname [yyrhs[yyi]]);
+  YYFPRINTF (stderr, "-> %s\n", yytname [yyr1[yyrule]]);
+}
+
+# define YY_REDUCE_PRINT(Rule)		\
+do {					\
+  if (yydebug)				\
+    yy_reduce_print (Rule);		\
+} while (0)
+
+/* Nonzero means print parse trace.  It is left uninitialized so that
+   multiple parsers can coexist.  */
+int yydebug;
+#else /* !YYDEBUG */
+# define YYDPRINTF(Args)
+# define YY_SYMBOL_PRINT(Title, Type, Value, Location)
+# define YY_STACK_PRINT(Bottom, Top)
+# define YY_REDUCE_PRINT(Rule)
+#endif /* !YYDEBUG */
+
+
+/* YYINITDEPTH -- initial size of the parser's stacks.  */
+#ifndef	YYINITDEPTH
+# define YYINITDEPTH 200
+#endif
+
+/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only
+   if the built-in stack extension method is used).
+
+   Do not make this value too large; the results are undefined if
+   SIZE_MAX < YYSTACK_BYTES (YYMAXDEPTH)
+   evaluated with infinite-precision integer arithmetic.  */
+
+#ifndef YYMAXDEPTH
+# define YYMAXDEPTH 10000
+#endif
+
+
+
+#if YYERROR_VERBOSE
+
+# ifndef yystrlen
+#  if defined (__GLIBC__) && defined (_STRING_H)
+#   define yystrlen strlen
+#  else
+/* Return the length of YYSTR, i.e. the number of characters that
+   precede its terminating '\0'.  This local definition is compiled
+   only when yystrlen is not already a macro and the
+   __GLIBC__/_STRING_H test above fails; otherwise yystrlen is simply
+   strlen.  YYSTR must not be null: it is dereferenced unconditionally.  */
+static YYSIZE_T
+#   if defined (__STDC__) || defined (__cplusplus)
+yystrlen (const char *yystr)
+#   else
+yystrlen (yystr)
+     const char *yystr;
+#   endif
+{
+  register const char *yys = yystr;
+
+  while (*yys++ != '\0') /* the post-increment leaves yys one past the '\0' */
+    continue;
+
+  return yys - yystr - 1; /* -1 discounts that final step over the terminator */
+}
+#  endif
+# endif
+
+# ifndef yystpcpy
+#  if defined (__GLIBC__) && defined (_STRING_H) && defined (_GNU_SOURCE)
+#   define yystpcpy stpcpy
+#  else
+/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in
+   YYDEST.  The '\0' itself is copied as well.  No length is passed and
+   no bound is checked, so the caller must provide a YYDEST large enough
+   for YYSRC including its terminator.  This local definition is compiled
+   only when yystpcpy is not already a macro and the
+   __GLIBC__/_STRING_H/_GNU_SOURCE test above fails; otherwise yystpcpy
+   is simply stpcpy.  */
+static char *
+#   if defined (__STDC__) || defined (__cplusplus)
+yystpcpy (char *yydest, const char *yysrc)
+#   else
+yystpcpy (yydest, yysrc)
+     char *yydest;
+     const char *yysrc;
+#   endif
+{
+  register char *yyd = yydest;
+  register const char *yys = yysrc;
+
+  while ((*yyd++ = *yys++) != '\0') /* copy first, test after: the '\0' is stored too */
+    continue;
+
+  return yyd - 1; /* yyd was stepped past the stored '\0'; back up onto it */
+}
+#  endif
+# endif
+
+#endif /* YYERROR_VERBOSE */
+
+
+
+#if YYDEBUG
+/*------------------------------------------------------------------.
+| yysymprint -- Print this symbol on YYOUTPUT.                      |
+|                                                                   |
+| Emits "token NAME (" when YYTYPE < YYNTOKENS and "nterm NAME ("   |
+| otherwise, NAME being yytname[YYTYPE], and finally ")".  Between  |
+| the parentheses YYPRINT, if defined, is invoked for tokens; the   |
+| switch holds only an empty default.  Compiled only under YYDEBUG. |
+`------------------------------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yysymprint (FILE *yyoutput, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yysymprint (yyoutput, yytype, yyvaluep)
+    FILE *yyoutput;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  /* Pacify ``unused variable'' warnings.  */
+  (void) yyvaluep;
+
+  if (yytype < YYNTOKENS) /* symbol numbers below YYNTOKENS are labelled as tokens */
+    YYFPRINTF (yyoutput, "token %s (", yytname[yytype]);
+  else
+    YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]);
+
+
+# ifdef YYPRINT
+  if (yytype < YYNTOKENS)
+    YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); /* the only place *yyvaluep is read */
+# endif
+  switch (yytype) /* no per-symbol cases: nothing is printed for the value here */
+    {
+      default:
+        break;
+    }
+  YYFPRINTF (yyoutput, ")"); /* closes the "(" written above */
+}
+
+#endif /* YYDEBUG */
+/*-----------------------------------------------.
+| Release the memory associated to this symbol.  |
+|                                                |
+| Traces the symbol through YY_SYMBOL_PRINT,     |
+| labelled YYMSG or "Deleting" if YYMSG is null. |
+| The switch on YYTYPE has only an empty default |
+| case, so nothing is actually released here.    |
+`-----------------------------------------------*/
+
+#if defined (__STDC__) || defined (__cplusplus)
+static void
+yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep)
+#else
+static void
+yydestruct (yymsg, yytype, yyvaluep)
+    const char *yymsg;
+    int yytype;
+    YYSTYPE *yyvaluep;
+#endif
+{
+  /* Pacify ``unused variable'' warnings.  */
+  (void) yyvaluep;
+
+  if (!yymsg)
+    yymsg = "Deleting";
+  YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); /* yylocationp is not
+     declared in this function; that is harmless only because both
+     YY_SYMBOL_PRINT definitions (YYDEBUG and !YYDEBUG) drop their
+     Location argument.  */
+
+  switch (yytype)
+    {
+
+      default:
+        break;
+    }
+}
+
+
+/* Prevent warnings from -Wmissing-prototypes.  */
+
+#ifdef YYPARSE_PARAM
+# if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void *YYPARSE_PARAM);
+# else
+UNIV_INTERN int yyparse ();
+# endif
+#else /* ! YYPARSE_PARAM */
+#if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void);
+#else
+UNIV_INTERN int yyparse ();
+#endif
+#endif /* ! YYPARSE_PARAM */
+
+
+
+/* The look-ahead symbol.  */
+static int yychar;
+
+/* The semantic value of the look-ahead symbol.  */
+UNIV_INTERN YYSTYPE yylval;
+
+/* Number of syntax errors so far.  */
+static int yynerrs;
+
+
+
+/*----------.
+| yyparse.  |
+`----------*/
+
+#ifdef YYPARSE_PARAM
+# if defined (__STDC__) || defined (__cplusplus)
+UNIV_INTERN int yyparse (void *YYPARSE_PARAM)
+# else
+UNIV_INTERN int yyparse (YYPARSE_PARAM)
+  void *YYPARSE_PARAM;
+# endif
+#else /* ! YYPARSE_PARAM */
+#if defined (__STDC__) || defined (__cplusplus)
+int
+yyparse (void)
+#else
+int
+yyparse ()
+
+#endif
+#endif
+{
+  
+  register int yystate;
+  register int yyn;
+  int yyresult;
+  /* Number of tokens to shift before error messages enabled.  */
+  int yyerrstatus;
+  /* Look-ahead token as an internal (translated) token number.  */
+  int yytoken = 0;
+
+  /* Three stacks and their tools:
+     `yyss': related to states,
+     `yyvs': related to semantic values,
+     `yyls': related to locations.
+
+     Refer to the stacks thru separate pointers, to allow yyoverflow
+     to reallocate them elsewhere.  */
+
+  /* The state stack.  */
+  short int yyssa[YYINITDEPTH];
+  short int *yyss = yyssa;
+  register short int *yyssp;
+
+  /* The semantic value stack.  */
+  YYSTYPE yyvsa[YYINITDEPTH];
+  YYSTYPE *yyvs = yyvsa;
+  register YYSTYPE *yyvsp;
+
+
+
+#define YYPOPSTACK   (yyvsp--, yyssp--)
+
+  YYSIZE_T yystacksize = YYINITDEPTH;
+
+  /* The variables used to return semantic value and location from the
+     action routines.  */
+  YYSTYPE yyval;
+
+
+  /* When reducing, the number of symbols on the RHS of the reduced
+     rule.  */
+  int yylen;
+
+  YYDPRINTF ((stderr, "Starting parse\n"));
+
+  yystate = 0;
+  yyerrstatus = 0;
+  yynerrs = 0;
+  yychar = YYEMPTY;		/* Cause a token to be read.  */
+
+  /* Initialize stack pointers.
+     Waste one element of value and location stack
+     so that they stay on the same level as the state stack.
+     The wasted elements are never initialized.  */
+
+  yyssp = yyss;
+  yyvsp = yyvs;
+
+
+  yyvsp[0] = yylval;
+
+  goto yysetstate;
+
+/*------------------------------------------------------------.
+| yynewstate -- Push a new state, which is found in yystate.  |
+`------------------------------------------------------------*/
+ yynewstate:
+  /* In all cases, when you get here, the value and location stacks
+     have just been pushed, so pushing a state here evens the stacks.
+     */
+  yyssp++;
+
+ yysetstate:
+  *yyssp = yystate;
+
+  if (yyss + yystacksize - 1 <= yyssp)
+    {
+      /* Get the current used size of the three stacks, in elements.  */
+      YYSIZE_T yysize = yyssp - yyss + 1;
+
+#ifdef yyoverflow
+      {
+	/* Give user a chance to reallocate the stack. Use copies of
+	   these so that the &'s don't force the real ones into
+	   memory.  */
+	YYSTYPE *yyvs1 = yyvs;
+	short int *yyss1 = yyss;
+
+
+	/* Each stack pointer address is followed by the size of the
+	   data in use in that stack, in bytes.  This used to be a
+	   conditional around just the two extra args, but that might
+	   be undefined if yyoverflow is a macro.  */
+	yyoverflow ("parser stack overflow",
+		    &yyss1, yysize * sizeof (*yyssp),
+		    &yyvs1, yysize * sizeof (*yyvsp),
+
+		    &yystacksize);
+
+	yyss = yyss1;
+	yyvs = yyvs1;
+      }
+#else /* no yyoverflow */
+# ifndef YYSTACK_RELOCATE
+      goto yyoverflowlab;
+# else
+      /* Extend the stack our own way.  */
+      if (YYMAXDEPTH <= yystacksize)
+	goto yyoverflowlab;
+      yystacksize *= 2;
+      if (YYMAXDEPTH < yystacksize)
+	yystacksize = YYMAXDEPTH;
+
+      {
+	short int *yyss1 = yyss;
+	union yyalloc *yyptr =
+	  (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize));
+	if (! yyptr)
+	  goto yyoverflowlab;
+	YYSTACK_RELOCATE (yyss);
+	YYSTACK_RELOCATE (yyvs);
+
+#  undef YYSTACK_RELOCATE
+	if (yyss1 != yyssa)
+	  YYSTACK_FREE (yyss1);
+      }
+# endif
+#endif /* no yyoverflow */
+
+      yyssp = yyss + yysize - 1;
+      yyvsp = yyvs + yysize - 1;
+
+
+      YYDPRINTF ((stderr, "Stack size increased to %lu\n",
+		  (unsigned long int) yystacksize));
+
+      if (yyss + yystacksize - 1 <= yyssp)
+	YYABORT;
+    }
+
+  YYDPRINTF ((stderr, "Entering state %d\n", yystate));
+
+  goto yybackup;
+
+/*-----------.
+| yybackup.  |
+`-----------*/
+yybackup:
+
+/* Do appropriate processing given the current state.  */
+/* Read a look-ahead token if we need one and don't already have one.  */
+/* yyresume: */
+
+  /* First try to decide what to do without reference to look-ahead token.  */
+
+  yyn = yypact[yystate];
+  if (yyn == YYPACT_NINF)
+    goto yydefault;
+
+  /* Not known => get a look-ahead token if don't already have one.  */
+
+  /* YYCHAR is either YYEMPTY or YYEOF or a valid look-ahead symbol.  */
+  if (yychar == YYEMPTY)
+    {
+      YYDPRINTF ((stderr, "Reading a token: "));
+      yychar = YYLEX;
+    }
+
+  if (yychar <= YYEOF)
+    {
+      yychar = yytoken = YYEOF;
+      YYDPRINTF ((stderr, "Now at end of input.\n"));
+    }
+  else
+    {
+      yytoken = YYTRANSLATE (yychar);
+      YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc);
+    }
+
+  /* If the proper action on seeing token YYTOKEN is to reduce or to
+     detect an error, take that action.  */
+  yyn += yytoken;
+  if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken)
+    goto yydefault;
+  yyn = yytable[yyn];
+  if (yyn <= 0)
+    {
+      if (yyn == 0 || yyn == YYTABLE_NINF)
+	goto yyerrlab;
+      yyn = -yyn;
+      goto yyreduce;
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  /* Shift the look-ahead token.  */
+  YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc);
+
+  /* Discard the token being shifted unless it is eof.  */
+  if (yychar != YYEOF)
+    yychar = YYEMPTY;
+
+  *++yyvsp = yylval;
+
+
+  /* Count tokens shifted since error; after three, turn off error
+     status.  */
+  if (yyerrstatus)
+    yyerrstatus--;
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-----------------------------------------------------------.
+| yydefault -- do the default action for the current state.  |
+`-----------------------------------------------------------*/
+yydefault:
+  yyn = yydefact[yystate];
+  if (yyn == 0)
+    goto yyerrlab;
+  goto yyreduce;
+
+
+/*-----------------------------.
+| yyreduce -- Do a reduction.  |
+`-----------------------------*/
+yyreduce:
+  /* yyn is the number of a rule to reduce with.  */
+  yylen = yyr2[yyn];
+
+  /* If YYLEN is nonzero, implement the default value of the action:
+     `$$ = $1'.
+
+     Otherwise, the following line sets YYVAL to garbage.
+     This behavior is undocumented and Bison
+     users should not rely upon it.  Assigning to YYVAL
+     unconditionally makes the parser a bit smaller, and it avoids a
+     GCC warning that YYVAL may be used uninitialized.  */
+  yyval = yyvsp[1-yylen];
+
+
+  YY_REDUCE_PRINT (yyn);
+  switch (yyn)
+    {
+        case 25:
+#line 166 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 26:
+#line 168 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
+    break;
+
+  case 27:
+#line 172 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 28:
+#line 174 "pars0grm.y"
+    { (yyval) = pars_func((yyvsp[-3]), (yyvsp[-1])); ;}
+    break;
+
+  case 29:
+#line 175 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 30:
+#line 176 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 31:
+#line 177 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 32:
+#line 178 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 33:
+#line 179 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 34:
+#line 180 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 35:
+#line 181 "pars0grm.y"
+    { (yyval) = (yyvsp[0]);;}
+    break;
+
+  case 36:
+#line 182 "pars0grm.y"
+    { (yyval) = pars_op('+', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 37:
+#line 183 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 38:
+#line 184 "pars0grm.y"
+    { (yyval) = pars_op('*', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 39:
+#line 185 "pars0grm.y"
+    { (yyval) = pars_op('/', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 40:
+#line 186 "pars0grm.y"
+    { (yyval) = pars_op('-', (yyvsp[0]), NULL); ;}
+    break;
+
+  case 41:
+#line 187 "pars0grm.y"
+    { (yyval) = (yyvsp[-1]); ;}
+    break;
+
+  case 42:
+#line 188 "pars0grm.y"
+    { (yyval) = pars_op('=', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 43:
+#line 189 "pars0grm.y"
+    { (yyval) = pars_op('<', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 44:
+#line 190 "pars0grm.y"
+    { (yyval) = pars_op('>', (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 45:
+#line 191 "pars0grm.y"
+    { (yyval) = pars_op(PARS_GE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 46:
+#line 192 "pars0grm.y"
+    { (yyval) = pars_op(PARS_LE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 47:
+#line 193 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NE_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 48:
+#line 194 "pars0grm.y"
+    { (yyval) = pars_op(PARS_AND_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 49:
+#line 195 "pars0grm.y"
+    { (yyval) = pars_op(PARS_OR_TOKEN, (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 50:
+#line 196 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOT_TOKEN, (yyvsp[0]), NULL); ;}
+    break;
+
+  case 51:
+#line 198 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
+    break;
+
+  case 52:
+#line 200 "pars0grm.y"
+    { (yyval) = pars_op(PARS_NOTFOUND_TOKEN, (yyvsp[-2]), NULL); ;}
+    break;
+
+  case 53:
+#line 204 "pars0grm.y"
+    { (yyval) = &pars_to_char_token; ;}
+    break;
+
+  case 54:
+#line 205 "pars0grm.y"
+    { (yyval) = &pars_to_number_token; ;}
+    break;
+
+  case 55:
+#line 206 "pars0grm.y"
+    { (yyval) = &pars_to_binary_token; ;}
+    break;
+
+  case 56:
+#line 208 "pars0grm.y"
+    { (yyval) = &pars_binary_to_number_token; ;}
+    break;
+
+  case 57:
+#line 209 "pars0grm.y"
+    { (yyval) = &pars_substr_token; ;}
+    break;
+
+  case 58:
+#line 210 "pars0grm.y"
+    { (yyval) = &pars_concat_token; ;}
+    break;
+
+  case 59:
+#line 211 "pars0grm.y"
+    { (yyval) = &pars_instr_token; ;}
+    break;
+
+  case 60:
+#line 212 "pars0grm.y"
+    { (yyval) = &pars_length_token; ;}
+    break;
+
+  case 61:
+#line 213 "pars0grm.y"
+    { (yyval) = &pars_sysdate_token; ;}
+    break;
+
+  case 62:
+#line 214 "pars0grm.y"
+    { (yyval) = &pars_rnd_token; ;}
+    break;
+
+  case 63:
+#line 215 "pars0grm.y"
+    { (yyval) = &pars_rnd_str_token; ;}
+    break;
+
+  case 67:
+#line 226 "pars0grm.y"
+    { (yyval) = pars_stored_procedure_call((yyvsp[-4])); ;}
+    break;
+
+  case 68:
+#line 231 "pars0grm.y"
+    { (yyval) = pars_procedure_call((yyvsp[-3]), (yyvsp[-1])); ;}
+    break;
+
+  case 69:
+#line 235 "pars0grm.y"
+    { (yyval) = &pars_replstr_token; ;}
+    break;
+
+  case 70:
+#line 236 "pars0grm.y"
+    { (yyval) = &pars_printf_token; ;}
+    break;
+
+  case 71:
+#line 237 "pars0grm.y"
+    { (yyval) = &pars_assert_token; ;}
+    break;
+
+  case 72:
+#line 241 "pars0grm.y"
+    { (yyval) = (yyvsp[-2]); ;}
+    break;
+
+  case 73:
+#line 245 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 74:
+#line 247 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 75:
+#line 251 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 76:
+#line 252 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 77:
+#line 254 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 78:
+#line 258 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 79:
+#line 259 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0]));;}
+    break;
+
+  case 80:
+#line 260 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 81:
+#line 264 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 82:
+#line 266 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+				          que_node_list_add_last(NULL,
+					    sym_tab_add_int_lit(
+						pars_sym_tab_global, 1))); ;}
+    break;
+
+  case 83:
+#line 271 "pars0grm.y"
+    { (yyval) = pars_func(&pars_count_token,
+					    que_node_list_add_last(NULL,
+						pars_func(&pars_distinct_token,
+						     que_node_list_add_last(
+								NULL, (yyvsp[-1]))))); ;}
+    break;
+
+  case 84:
+#line 277 "pars0grm.y"
+    { (yyval) = pars_func(&pars_sum_token,
+						que_node_list_add_last(NULL,
+									(yyvsp[-1]))); ;}
+    break;
+
+  case 85:
+#line 283 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 86:
+#line 284 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 87:
+#line 286 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 88:
+#line 290 "pars0grm.y"
+    { (yyval) = pars_select_list(&pars_star_denoter,
+								NULL); ;}
+    break;
+
+  case 89:
+#line 293 "pars0grm.y"
+    { (yyval) = pars_select_list((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 90:
+#line 294 "pars0grm.y"
+    { (yyval) = pars_select_list((yyvsp[0]), NULL); ;}
+    break;
+
+  case 91:
+#line 298 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 92:
+#line 299 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 93:
+#line 303 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 94:
+#line 305 "pars0grm.y"
+    { (yyval) = &pars_update_token; ;}
+    break;
+
+  case 95:
+#line 309 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 96:
+#line 311 "pars0grm.y"
+    { yyval = &pars_share_token; ;}
+    break;
+
+  case 97:
+#line 315 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 98:
+#line 316 "pars0grm.y"
+    { (yyval) = &pars_asc_token; ;}
+    break;
+
+  case 99:
+#line 317 "pars0grm.y"
+    { (yyval) = &pars_desc_token; ;}
+    break;
+
+  case 100:
+#line 321 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 101:
+#line 323 "pars0grm.y"
+    { (yyval) = pars_order_by((yyvsp[-1]), (yyvsp[0])); ;}
+    break;
+
+  case 102:
+#line 332 "pars0grm.y"
+    { (yyval) = pars_select_statement((yyvsp[-6]), (yyvsp[-4]), (yyvsp[-3]),
+								(yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
+    break;
+
+  case 103:
+#line 338 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 104:
+#line 343 "pars0grm.y"
+    { (yyval) = pars_insert_statement((yyvsp[-4]), (yyvsp[-1]), NULL); ;}
+    break;
+
+  case 105:
+#line 345 "pars0grm.y"
+    { (yyval) = pars_insert_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+    break;
+
+  case 106:
+#line 349 "pars0grm.y"
+    { (yyval) = pars_column_assignment((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 107:
+#line 353 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 108:
+#line 355 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 109:
+#line 361 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 110:
+#line 367 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(FALSE,
+								(yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 111:
+#line 373 "pars0grm.y"
+    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+    break;
+
+  case 112:
+#line 378 "pars0grm.y"
+    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
+    break;
+
+  case 113:
+#line 383 "pars0grm.y"
+    { (yyval) = pars_update_statement_start(TRUE,
+								(yyvsp[0]), NULL); ;}
+    break;
+
+  case 114:
+#line 389 "pars0grm.y"
+    { (yyval) = pars_update_statement((yyvsp[-1]), NULL, (yyvsp[0])); ;}
+    break;
+
+  case 115:
+#line 394 "pars0grm.y"
+    { (yyval) = pars_update_statement((yyvsp[-1]), (yyvsp[0]), NULL); ;}
+    break;
+
+  case 116:
+#line 399 "pars0grm.y"
+    { (yyval) = pars_row_printf_statement((yyvsp[0])); ;}
+    break;
+
+  case 117:
+#line 404 "pars0grm.y"
+    { (yyval) = pars_assignment_statement((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 118:
+#line 410 "pars0grm.y"
+    { (yyval) = pars_elsif_element((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 119:
+#line 414 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 120:
+#line 416 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-1]), (yyvsp[0])); ;}
+    break;
+
+  case 121:
+#line 420 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 122:
+#line 422 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 123:
+#line 423 "pars0grm.y"
+    { (yyval) = (yyvsp[0]); ;}
+    break;
+
+  case 124:
+#line 430 "pars0grm.y"
+    { (yyval) = pars_if_statement((yyvsp[-5]), (yyvsp[-3]), (yyvsp[-2])); ;}
+    break;
+
+  case 125:
+#line 436 "pars0grm.y"
+    { (yyval) = pars_while_statement((yyvsp[-4]), (yyvsp[-2])); ;}
+    break;
+
+  case 126:
+#line 444 "pars0grm.y"
+    { (yyval) = pars_for_statement((yyvsp[-8]), (yyvsp[-6]), (yyvsp[-4]), (yyvsp[-2])); ;}
+    break;
+
+  case 127:
+#line 448 "pars0grm.y"
+    { (yyval) = pars_exit_statement(); ;}
+    break;
+
+  case 128:
+#line 452 "pars0grm.y"
+    { (yyval) = pars_return_statement(); ;}
+    break;
+
+  case 129:
+#line 457 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_OPEN_CURSOR, (yyvsp[0])); ;}
+    break;
+
+  case 130:
+#line 463 "pars0grm.y"
+    { (yyval) = pars_open_statement(
+						ROW_SEL_CLOSE_CURSOR, (yyvsp[0])); ;}
+    break;
+
+  case 131:
+#line 469 "pars0grm.y"
+    { (yyval) = pars_fetch_statement((yyvsp[-2]), (yyvsp[0]), NULL); ;}
+    break;
+
+  case 132:
+#line 471 "pars0grm.y"
+    { (yyval) = pars_fetch_statement((yyvsp[-2]), NULL, (yyvsp[0])); ;}
+    break;
+
+  case 133:
+#line 476 "pars0grm.y"
+    { (yyval) = pars_column_def((yyvsp[-4]), (yyvsp[-3]), (yyvsp[-2]), (yyvsp[-1]), (yyvsp[0])); ;}
+    break;
+
+  case 134:
+#line 480 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 135:
+#line 482 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 136:
+#line 486 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 137:
+#line 488 "pars0grm.y"
+    { (yyval) = (yyvsp[-1]); ;}
+    break;
+
+  case 138:
+#line 492 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 139:
+#line 494 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 140:
+#line 499 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 141:
+#line 501 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 142:
+#line 506 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 143:
+#line 508 "pars0grm.y"
+    { (yyval) = &pars_int_token;
+					/* pass any non-NULL pointer */ ;}
+    break;
+
+  case 144:
+#line 515 "pars0grm.y"
+    { (yyval) = pars_create_table((yyvsp[-4]), (yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 145:
+#line 519 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 146:
+#line 521 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 147:
+#line 525 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 148:
+#line 526 "pars0grm.y"
+    { (yyval) = &pars_unique_token; ;}
+    break;
+
+  case 149:
+#line 530 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 150:
+#line 531 "pars0grm.y"
+    { (yyval) = &pars_clustered_token; ;}
+    break;
+
+  case 151:
+#line 539 "pars0grm.y"
+    { (yyval) = pars_create_index((yyvsp[-8]), (yyvsp[-7]), (yyvsp[-5]), (yyvsp[-3]), (yyvsp[-1])); ;}
+    break;
+
+  case 152:
+#line 544 "pars0grm.y"
+    { (yyval) = pars_commit_statement(); ;}
+    break;
+
+  case 153:
+#line 549 "pars0grm.y"
+    { (yyval) = pars_rollback_statement(); ;}
+    break;
+
+  case 154:
+#line 553 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 155:
+#line 554 "pars0grm.y"
+    { (yyval) = &pars_int_token; ;}
+    break;
+
+  case 156:
+#line 555 "pars0grm.y"
+    { (yyval) = &pars_char_token; ;}
+    break;
+
+  case 157:
+#line 556 "pars0grm.y"
+    { (yyval) = &pars_binary_token; ;}
+    break;
+
+  case 158:
+#line 557 "pars0grm.y"
+    { (yyval) = &pars_blob_token; ;}
+    break;
+
+  case 159:
+#line 562 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
+							PARS_INPUT, (yyvsp[0])); ;}
+    break;
+
+  case 160:
+#line 565 "pars0grm.y"
+    { (yyval) = pars_parameter_declaration((yyvsp[-2]),
+							PARS_OUTPUT, (yyvsp[0])); ;}
+    break;
+
+  case 161:
+#line 570 "pars0grm.y"
+    { (yyval) = NULL; ;}
+    break;
+
+  case 162:
+#line 571 "pars0grm.y"
+    { (yyval) = que_node_list_add_last(NULL, (yyvsp[0])); ;}
+    break;
+
+  case 163:
+#line 573 "pars0grm.y"
+    { (yyval) = que_node_list_add_last((yyvsp[-2]), (yyvsp[0])); ;}
+    break;
+
+  case 164:
+#line 578 "pars0grm.y"
+    { (yyval) = pars_variable_declaration((yyvsp[-2]), (yyvsp[-1])); ;}
+    break;
+
+  case 168:
+#line 590 "pars0grm.y"
+    { (yyval) = pars_cursor_declaration((yyvsp[-3]), (yyvsp[-1])); ;}
+    break;
+
+  case 169:
+#line 595 "pars0grm.y"
+    { (yyval) = pars_function_declaration((yyvsp[-1])); ;}
+    break;
+
+  case 175:
+#line 616 "pars0grm.y"
+    { (yyval) = pars_procedure_definition((yyvsp[-9]), (yyvsp[-7]),
+								(yyvsp[-1])); ;}
+    break;
+
+
+    }
+
+/* Line 1010 of yacc.c.  */
+#line 2345 "pars0grm.c"
+
+  yyvsp -= yylen;
+  yyssp -= yylen;
+
+
+  YY_STACK_PRINT (yyss, yyssp);
+
+  *++yyvsp = yyval;
+
+
+  /* Now `shift' the result of the reduction.  Determine what state
+     that goes to, based on the state we popped back to and the rule
+     number reduced by.  */
+
+  yyn = yyr1[yyn];
+
+  yystate = yypgoto[yyn - YYNTOKENS] + *yyssp;
+  if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp)
+    yystate = yytable[yystate];
+  else
+    yystate = yydefgoto[yyn - YYNTOKENS];
+
+  goto yynewstate;
+
+
+/*------------------------------------.
+| yyerrlab -- here on detecting error |
+`------------------------------------*/
+yyerrlab:
+  /* If not already recovering from an error, report this error.  */
+  if (!yyerrstatus)
+    {
+      ++yynerrs;
+#if YYERROR_VERBOSE
+      yyn = yypact[yystate];
+
+      if (YYPACT_NINF < yyn && yyn < YYLAST)
+	{
+	  YYSIZE_T yysize = 0;
+	  int yytype = YYTRANSLATE (yychar);
+	  const char* yyprefix;
+	  char *yymsg;
+	  int yyx;
+
+	  /* Start YYX at -YYN if negative to avoid negative indexes in
+	     YYCHECK.  */
+	  int yyxbegin = yyn < 0 ? -yyn : 0;
+
+	  /* Stay within bounds of both yycheck and yytname.  */
+	  int yychecklim = YYLAST - yyn;
+	  int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS;
+	  int yycount = 0;
+
+	  yyprefix = ", expecting ";
+	  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+	    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+	      {
+		yysize += yystrlen (yyprefix) + yystrlen (yytname [yyx]);
+		yycount += 1;
+		if (yycount == 5)
+		  {
+		    yysize = 0;
+		    break;
+		  }
+	      }
+	  yysize += (sizeof ("syntax error, unexpected ")
+		     + yystrlen (yytname[yytype]));
+	  yymsg = (char *) YYSTACK_ALLOC (yysize);
+	  if (yymsg != 0)
+	    {
+	      char *yyp = yystpcpy (yymsg, "syntax error, unexpected ");
+	      yyp = yystpcpy (yyp, yytname[yytype]);
+
+	      if (yycount < 5)
+		{
+		  yyprefix = ", expecting ";
+		  for (yyx = yyxbegin; yyx < yyxend; ++yyx)
+		    if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR)
+		      {
+			yyp = yystpcpy (yyp, yyprefix);
+			yyp = yystpcpy (yyp, yytname[yyx]);
+			yyprefix = " or ";
+		      }
+		}
+	      yyerror (yymsg);
+	      YYSTACK_FREE (yymsg);
+	    }
+	  else
+	    yyerror ("syntax error; also virtual memory exhausted");
+	}
+      else
+#endif /* YYERROR_VERBOSE */
+	yyerror ("syntax error");
+    }
+
+
+
+  if (yyerrstatus == 3)
+    {
+      /* If just tried and failed to reuse look-ahead token after an
+	 error, discard it.  */
+
+      if (yychar <= YYEOF)
+        {
+          /* If at end of input, pop the error token,
+	     then the rest of the stack, then return failure.  */
+	  if (yychar == YYEOF)
+	     for (;;)
+	       {
+
+		 YYPOPSTACK;
+		 if (yyssp == yyss)
+		   YYABORT;
+		 yydestruct ("Error: popping",
+                             yystos[*yyssp], yyvsp);
+	       }
+        }
+      else
+	{
+	  yydestruct ("Error: discarding", yytoken, &yylval);
+	  yychar = YYEMPTY;
+	}
+    }
+
+  /* Else will try to reuse look-ahead token after shifting the error
+     token.  */
+  goto yyerrlab1;
+
+
+/*---------------------------------------------------.
+| yyerrorlab -- error raised explicitly by YYERROR.  |
+`---------------------------------------------------*/
+yyerrorlab:
+
+#ifdef __GNUC__
+  /* Pacify GCC when the user code never invokes YYERROR and the label
+     yyerrorlab therefore never appears in user code.  */
+  if (0)
+     goto yyerrorlab;
+#endif
+
+yyvsp -= yylen;
+  yyssp -= yylen;
+  yystate = *yyssp;
+  goto yyerrlab1;
+
+
+/*-------------------------------------------------------------.
+| yyerrlab1 -- common code for both syntax error and YYERROR.  |
+`-------------------------------------------------------------*/
+yyerrlab1:
+  yyerrstatus = 3;	/* Each real token shifted decrements this.  */
+
+  for (;;)
+    {
+      yyn = yypact[yystate];
+      if (yyn != YYPACT_NINF)
+	{
+	  yyn += YYTERROR;
+	  if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR)
+	    {
+	      yyn = yytable[yyn];
+	      if (0 < yyn)
+		break;
+	    }
+	}
+
+      /* Pop the current state because it cannot handle the error token.  */
+      if (yyssp == yyss)
+	YYABORT;
+
+
+      yydestruct ("Error: popping", yystos[yystate], yyvsp);
+      YYPOPSTACK;
+      yystate = *yyssp;
+      YY_STACK_PRINT (yyss, yyssp);
+    }
+
+  if (yyn == YYFINAL)
+    YYACCEPT;
+
+  *++yyvsp = yylval;
+
+
+  /* Shift the error token. */
+  YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp);
+
+  yystate = yyn;
+  goto yynewstate;
+
+
+/*-------------------------------------.
+| yyacceptlab -- YYACCEPT comes here.  |
+`-------------------------------------*/
+yyacceptlab:
+  yyresult = 0;
+  goto yyreturn;
+
+/*-----------------------------------.
+| yyabortlab -- YYABORT comes here.  |
+`-----------------------------------*/
+yyabortlab:
+  yydestruct ("Error: discarding lookahead",
+              yytoken, &yylval);
+  yychar = YYEMPTY;
+  yyresult = 1;
+  goto yyreturn;
+
+#ifndef yyoverflow
+/*----------------------------------------------.
+| yyoverflowlab -- parser overflow comes here.  |
+`----------------------------------------------*/
+yyoverflowlab:
+  yyerror ("parser stack overflow");
+  yyresult = 2;
+  /* Fall through.  */
+#endif
+
+yyreturn:
+#ifndef yyoverflow
+  if (yyss != yyssa)
+    YYSTACK_FREE (yyss);
+#endif
+  return yyresult;
+}
+
+
+#line 620 "pars0grm.y"
+
+
diff --git a/storage/xtradb/pars/pars0grm.y b/storage/xtradb/pars/pars0grm.y
new file mode 100644
index 00000000000..14d64f1826f
--- /dev/null
+++ b/storage/xtradb/pars/pars0grm.y
@@ -0,0 +1,635 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser: input file for the GNU Bison parser generator
+
+See pars0lex.l for instructions on how to generate the C files for
+the InnoDB parser.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%{
+/* The value of the semantic attribute is a pointer to a query tree node
+que_node_t */
+
+#include "univ.i"
+#include <math.h>				/* Can't be before univ.i */
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "que0types.h"
+#include "que0que.h"
+#include "row0sel.h"
+
+#define YYSTYPE que_node_t*
+
+/* #define __STDC__ */
+
+int
+yylex(void);
+%}
+
+%token PARS_INT_LIT
+%token PARS_FLOAT_LIT
+%token PARS_STR_LIT
+%token PARS_FIXBINARY_LIT
+%token PARS_BLOB_LIT
+%token PARS_NULL_LIT
+%token PARS_ID_TOKEN
+%token PARS_AND_TOKEN
+%token PARS_OR_TOKEN
+%token PARS_NOT_TOKEN
+%token PARS_GE_TOKEN
+%token PARS_LE_TOKEN
+%token PARS_NE_TOKEN
+%token PARS_PROCEDURE_TOKEN
+%token PARS_IN_TOKEN
+%token PARS_OUT_TOKEN
+%token PARS_BINARY_TOKEN
+%token PARS_BLOB_TOKEN
+%token PARS_INT_TOKEN
+%token PARS_INTEGER_TOKEN
+%token PARS_FLOAT_TOKEN
+%token PARS_CHAR_TOKEN
+%token PARS_IS_TOKEN
+%token PARS_BEGIN_TOKEN
+%token PARS_END_TOKEN
+%token PARS_IF_TOKEN
+%token PARS_THEN_TOKEN
+%token PARS_ELSE_TOKEN
+%token PARS_ELSIF_TOKEN
+%token PARS_LOOP_TOKEN
+%token PARS_WHILE_TOKEN
+%token PARS_RETURN_TOKEN
+%token PARS_SELECT_TOKEN
+%token PARS_SUM_TOKEN
+%token PARS_COUNT_TOKEN
+%token PARS_DISTINCT_TOKEN
+%token PARS_FROM_TOKEN
+%token PARS_WHERE_TOKEN
+%token PARS_FOR_TOKEN
+%token PARS_DDOT_TOKEN
+%token PARS_READ_TOKEN
+%token PARS_ORDER_TOKEN
+%token PARS_BY_TOKEN
+%token PARS_ASC_TOKEN
+%token PARS_DESC_TOKEN
+%token PARS_INSERT_TOKEN
+%token PARS_INTO_TOKEN
+%token PARS_VALUES_TOKEN
+%token PARS_UPDATE_TOKEN
+%token PARS_SET_TOKEN
+%token PARS_DELETE_TOKEN
+%token PARS_CURRENT_TOKEN
+%token PARS_OF_TOKEN
+%token PARS_CREATE_TOKEN
+%token PARS_TABLE_TOKEN
+%token PARS_INDEX_TOKEN
+%token PARS_UNIQUE_TOKEN
+%token PARS_CLUSTERED_TOKEN
+%token PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+%token PARS_ON_TOKEN
+%token PARS_ASSIGN_TOKEN
+%token PARS_DECLARE_TOKEN
+%token PARS_CURSOR_TOKEN
+%token PARS_SQL_TOKEN
+%token PARS_OPEN_TOKEN
+%token PARS_FETCH_TOKEN
+%token PARS_CLOSE_TOKEN
+%token PARS_NOTFOUND_TOKEN
+%token PARS_TO_CHAR_TOKEN
+%token PARS_TO_NUMBER_TOKEN
+%token PARS_TO_BINARY_TOKEN
+%token PARS_BINARY_TO_NUMBER_TOKEN
+%token PARS_SUBSTR_TOKEN
+%token PARS_REPLSTR_TOKEN
+%token PARS_CONCAT_TOKEN
+%token PARS_INSTR_TOKEN
+%token PARS_LENGTH_TOKEN
+%token PARS_SYSDATE_TOKEN
+%token PARS_PRINTF_TOKEN
+%token PARS_ASSERT_TOKEN
+%token PARS_RND_TOKEN
+%token PARS_RND_STR_TOKEN
+%token PARS_ROW_PRINTF_TOKEN
+%token PARS_COMMIT_TOKEN
+%token PARS_ROLLBACK_TOKEN
+%token PARS_WORK_TOKEN
+%token PARS_UNSIGNED_TOKEN
+%token PARS_EXIT_TOKEN
+%token PARS_FUNCTION_TOKEN
+%token PARS_LOCK_TOKEN
+%token PARS_SHARE_TOKEN
+%token PARS_MODE_TOKEN
+
+%left PARS_AND_TOKEN PARS_OR_TOKEN	/* levels run from lowest (here) to highest; AND and OR share one */
+%left PARS_NOT_TOKEN
+%left '=' '<' '>' PARS_GE_TOKEN PARS_LE_TOKEN PARS_NE_TOKEN /* NE needs a level, else Bison just shifts: a <> b AND c became a <> (b AND c); regenerate pars0grm.c */
+%left '-' '+'
+%left '*' '/'
+%left NEG     /* negation--unary minus */
+%left '%'
+
+/* Grammar follows */
+%%
+
+top_statement:		/* start symbol (first rule, no %start): one procedure definition plus ';' */
+        procedure_definition ';'
+
+statement:		/* no actions, so $$ defaults to $1; all but stored_procedure_call end in ';' */
+	stored_procedure_call
+	| predefined_procedure_call ';'
+	| while_statement ';'
+	| for_statement ';'
+	| exit_statement ';'
+	| if_statement ';'
+	| return_statement ';'
+	| assignment_statement ';'
+	| select_statement ';'
+	| insert_statement ';'
+	| row_printf_statement ';'
+	| delete_statement_searched ';'
+	| delete_statement_positioned ';'
+	| update_statement_searched ';'
+	| update_statement_positioned ';'
+	| open_cursor_statement ';'
+	| fetch_statement ';'
+	| close_cursor_statement ';'
+	| commit_statement ';'
+	| rollback_statement ';'
+	| create_table ';'
+	| create_index ';'
+;
+
+statement_list:		/* one or more statements, appended left-recursively to a node list */
+	statement		{ $$ = que_node_list_add_last(NULL, $1); }
+	| statement_list statement
+				{ $$ = que_node_list_add_last($1, $2); }
+;
+
+exp:			/* expression: identifier, literal, function call, or operator applied via pars_op() */
+	PARS_ID_TOKEN		{ $$ = $1;}
+	| function_name '(' exp_list ')'
+				{ $$ = pars_func($1, $3); }
+	| PARS_INT_LIT		{ $$ = $1;}
+	| PARS_FLOAT_LIT	{ $$ = $1;}
+	| PARS_STR_LIT		{ $$ = $1;}
+	| PARS_FIXBINARY_LIT	{ $$ = $1;}
+	| PARS_BLOB_LIT		{ $$ = $1;}
+	| PARS_NULL_LIT		{ $$ = $1;}
+	| PARS_SQL_TOKEN	{ $$ = $1;}
+	| exp '+' exp        	{ $$ = pars_op('+', $1, $3); }
+	| exp '-' exp        	{ $$ = pars_op('-', $1, $3); }
+	| exp '*' exp        	{ $$ = pars_op('*', $1, $3); }
+	| exp '/' exp        	{ $$ = pars_op('/', $1, $3); }
+	| '-' exp %prec NEG 	{ $$ = pars_op('-', $2, NULL); }
+	| '(' exp ')'        	{ $$ = $2; }
+	| exp '=' exp		{ $$ = pars_op('=', $1, $3); }
+	| exp '<' exp		{ $$ = pars_op('<', $1, $3); }
+	| exp '>' exp		{ $$ = pars_op('>', $1, $3); }
+	| exp PARS_GE_TOKEN exp	{ $$ = pars_op(PARS_GE_TOKEN, $1, $3); }
+	| exp PARS_LE_TOKEN exp	{ $$ = pars_op(PARS_LE_TOKEN, $1, $3); }
+	| exp PARS_NE_TOKEN exp	{ $$ = pars_op(PARS_NE_TOKEN, $1, $3); }
+	| exp PARS_AND_TOKEN exp{ $$ = pars_op(PARS_AND_TOKEN, $1, $3); }
+	| exp PARS_OR_TOKEN exp	{ $$ = pars_op(PARS_OR_TOKEN, $1, $3); }
+	| PARS_NOT_TOKEN exp	{ $$ = pars_op(PARS_NOT_TOKEN, $2, NULL); }
+	| PARS_ID_TOKEN '%' PARS_NOTFOUND_TOKEN
+				{ $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+	| PARS_SQL_TOKEN '%' PARS_NOTFOUND_TOKEN
+				{ $$ = pars_op(PARS_NOTFOUND_TOKEN, $1, NULL); }
+;
+
+function_name:		/* function keywords callable from exp; each yields the address of its pars_*_token object */
+	PARS_TO_CHAR_TOKEN	{ $$ = &pars_to_char_token; }
+	| PARS_TO_NUMBER_TOKEN	{ $$ = &pars_to_number_token; }
+	| PARS_TO_BINARY_TOKEN	{ $$ = &pars_to_binary_token; }
+	| PARS_BINARY_TO_NUMBER_TOKEN
+				{ $$ = &pars_binary_to_number_token; }
+	| PARS_SUBSTR_TOKEN	{ $$ = &pars_substr_token; }
+	| PARS_CONCAT_TOKEN	{ $$ = &pars_concat_token; }
+	| PARS_INSTR_TOKEN	{ $$ = &pars_instr_token; }
+	| PARS_LENGTH_TOKEN	{ $$ = &pars_length_token; }
+	| PARS_SYSDATE_TOKEN	{ $$ = &pars_sysdate_token; }
+	| PARS_RND_TOKEN	{ $$ = &pars_rnd_token; }
+	| PARS_RND_STR_TOKEN	{ $$ = &pars_rnd_str_token; }
+;
+
+question_mark_list:	/* zero or more comma-separated '?' placeholders; no actions are attached */
+	/* Nothing */
+	| '?'
+	| question_mark_list ',' '?'
+;
+
+stored_procedure_call:	/* '{' name '(' ?-list ')' '}'; only the name ($2) is passed on */
+	'{' PARS_ID_TOKEN '(' question_mark_list ')' '}'
+				{ $$ = pars_stored_procedure_call($2); }
+;
+
+predefined_procedure_call:
+	predefined_procedure_name '(' exp_list ')'
+				{ $$ = pars_procedure_call($1, $3); }
+;
+
+predefined_procedure_name:
+	PARS_REPLSTR_TOKEN	{ $$ = &pars_replstr_token; }
+	| PARS_PRINTF_TOKEN	{ $$ = &pars_printf_token; }
+	| PARS_ASSERT_TOKEN	{ $$ = &pars_assert_token; }
+;
+
+user_function_call:
+	PARS_ID_TOKEN '(' ')'	{ $$ = $1; }
+;
+
+table_list:
+	PARS_ID_TOKEN		{ $$ = que_node_list_add_last(NULL, $1); }
+	| table_list ',' PARS_ID_TOKEN
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_list:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_ID_TOKEN		{ $$ = que_node_list_add_last(NULL, $1); }
+	| variable_list ',' PARS_ID_TOKEN
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+exp_list:
+	/* Nothing */		{ $$ = NULL; }
+	| exp			{ $$ = que_node_list_add_last(NULL, $1);}
+	| exp_list ',' exp	{ $$ = que_node_list_add_last($1, $3); }
+;
+
+select_item:
+	exp			{ $$ = $1; }
+	| PARS_COUNT_TOKEN '(' '*' ')'
+				{ $$ = pars_func(&pars_count_token,
+				          que_node_list_add_last(NULL,
+					    sym_tab_add_int_lit(
+						pars_sym_tab_global, 1))); }
+	| PARS_COUNT_TOKEN '(' PARS_DISTINCT_TOKEN PARS_ID_TOKEN ')'
+				{ $$ = pars_func(&pars_count_token,
+					    que_node_list_add_last(NULL,
+						pars_func(&pars_distinct_token,
+						     que_node_list_add_last(
+								NULL, $4)))); }
+	| PARS_SUM_TOKEN '(' exp ')'
+				{ $$ = pars_func(&pars_sum_token,
+						que_node_list_add_last(NULL,
+									$3)); }
+;
+
+select_item_list:
+	/* Nothing */		{ $$ = NULL; }
+	| select_item		{ $$ = que_node_list_add_last(NULL, $1); }
+	| select_item_list ',' select_item
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+select_list:
+	'*'			{ $$ = pars_select_list(&pars_star_denoter,
+								NULL); }
+	| select_item_list PARS_INTO_TOKEN variable_list
+				{ $$ = pars_select_list($1, $3); }
+	| select_item_list	{ $$ = pars_select_list($1, NULL); }
+;
+
+search_condition:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_WHERE_TOKEN exp	{ $$ = $2; }
+;
+
+for_update_clause:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_FOR_TOKEN PARS_UPDATE_TOKEN
+				{ $$ = &pars_update_token; }
+;
+
+lock_shared_clause:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_LOCK_TOKEN PARS_IN_TOKEN PARS_SHARE_TOKEN PARS_MODE_TOKEN
+				{ $$ = &pars_share_token; }
+;
+
+order_direction:
+	/* Nothing */		{ $$ = &pars_asc_token; }
+	| PARS_ASC_TOKEN	{ $$ = &pars_asc_token; }
+	| PARS_DESC_TOKEN	{ $$ = &pars_desc_token; }
+;
+
+order_by_clause:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_ORDER_TOKEN PARS_BY_TOKEN PARS_ID_TOKEN order_direction
+				{ $$ = pars_order_by($3, $4); }
+;
+
+select_statement:
+	PARS_SELECT_TOKEN select_list
+	PARS_FROM_TOKEN table_list
+	search_condition
+	for_update_clause
+	lock_shared_clause
+	order_by_clause		{ $$ = pars_select_statement($2, $4, $5,
+								$6, $7, $8); }
+;
+
+insert_statement_start:
+	PARS_INSERT_TOKEN PARS_INTO_TOKEN
+	PARS_ID_TOKEN		{ $$ = $3; }
+;
+
+insert_statement:
+	insert_statement_start PARS_VALUES_TOKEN '(' exp_list ')'
+				{ $$ = pars_insert_statement($1, $4, NULL); }
+	| insert_statement_start select_statement
+				{ $$ = pars_insert_statement($1, NULL, $2); }
+;
+
+column_assignment:
+	PARS_ID_TOKEN '=' exp	{ $$ = pars_column_assignment($1, $3); }
+;
+
+column_assignment_list:
+	column_assignment	{ $$ = que_node_list_add_last(NULL, $1); }
+	| column_assignment_list ',' column_assignment
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+cursor_positioned:
+	PARS_WHERE_TOKEN
+	PARS_CURRENT_TOKEN PARS_OF_TOKEN
+	PARS_ID_TOKEN 		{ $$ = $4; }
+;
+
+update_statement_start:
+	PARS_UPDATE_TOKEN PARS_ID_TOKEN
+	PARS_SET_TOKEN
+	column_assignment_list	{ $$ = pars_update_statement_start(FALSE,
+								$2, $4); }
+;
+
+update_statement_searched:
+	update_statement_start
+	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+;
+
+update_statement_positioned:
+	update_statement_start
+	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+;
+
+delete_statement_start:
+	PARS_DELETE_TOKEN PARS_FROM_TOKEN
+	PARS_ID_TOKEN		{ $$ = pars_update_statement_start(TRUE,
+								$3, NULL); }
+;
+
+delete_statement_searched:
+	delete_statement_start
+	search_condition	{ $$ = pars_update_statement($1, NULL, $2); }
+;
+
+delete_statement_positioned:
+	delete_statement_start
+	cursor_positioned	{ $$ = pars_update_statement($1, $2, NULL); }
+;
+
+row_printf_statement:
+	PARS_ROW_PRINTF_TOKEN select_statement
+				{ $$ = pars_row_printf_statement($2); }
+;
+
+assignment_statement:
+	PARS_ID_TOKEN PARS_ASSIGN_TOKEN exp
+				{ $$ = pars_assignment_statement($1, $3); }
+;
+
+elsif_element:
+	PARS_ELSIF_TOKEN
+	exp PARS_THEN_TOKEN statement_list
+				{ $$ = pars_elsif_element($2, $4); }
+;
+
+elsif_list:
+	elsif_element		{ $$ = que_node_list_add_last(NULL, $1); }
+	| elsif_list elsif_element
+				{ $$ = que_node_list_add_last($1, $2); }
+;
+
+else_part:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_ELSE_TOKEN statement_list
+				{ $$ = $2; }
+	| elsif_list		{ $$ = $1; }
+;
+
+if_statement:
+	PARS_IF_TOKEN exp PARS_THEN_TOKEN statement_list
+	else_part
+	PARS_END_TOKEN PARS_IF_TOKEN
+				{ $$ = pars_if_statement($2, $4, $5); }
+;
+
+while_statement:
+	PARS_WHILE_TOKEN exp PARS_LOOP_TOKEN statement_list
+	PARS_END_TOKEN PARS_LOOP_TOKEN
+				{ $$ = pars_while_statement($2, $4); }
+;
+
+for_statement:
+	PARS_FOR_TOKEN PARS_ID_TOKEN PARS_IN_TOKEN
+	exp PARS_DDOT_TOKEN exp
+	PARS_LOOP_TOKEN statement_list
+	PARS_END_TOKEN PARS_LOOP_TOKEN
+				{ $$ = pars_for_statement($2, $4, $6, $8); }
+;
+
+exit_statement:
+	PARS_EXIT_TOKEN		{ $$ = pars_exit_statement(); }
+;
+
+return_statement:
+	PARS_RETURN_TOKEN	{ $$ = pars_return_statement(); }
+;
+
+open_cursor_statement:
+	PARS_OPEN_TOKEN PARS_ID_TOKEN
+				{ $$ = pars_open_statement(
+						ROW_SEL_OPEN_CURSOR, $2); }
+;
+
+close_cursor_statement:
+	PARS_CLOSE_TOKEN PARS_ID_TOKEN
+				{ $$ = pars_open_statement(
+						ROW_SEL_CLOSE_CURSOR, $2); }
+;
+
+fetch_statement:
+	PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN variable_list
+				{ $$ = pars_fetch_statement($2, $4, NULL); }
+	| PARS_FETCH_TOKEN PARS_ID_TOKEN PARS_INTO_TOKEN user_function_call
+				{ $$ = pars_fetch_statement($2, NULL, $4); }
+;
+
+column_def:
+	PARS_ID_TOKEN type_name	opt_column_len opt_unsigned opt_not_null
+				{ $$ = pars_column_def($1, $2, $3, $4, $5); }
+;
+
+column_def_list:
+	column_def		{ $$ = que_node_list_add_last(NULL, $1); }
+	| column_def_list ',' column_def
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+opt_column_len:
+	/* Nothing */		{ $$ = NULL; }
+	| '(' PARS_INT_LIT ')'
+				{ $$ = $2; }
+;
+
+opt_unsigned:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_UNSIGNED_TOKEN
+				{ $$ = &pars_int_token;
+					/* pass any non-NULL pointer */ }
+;
+
+opt_not_null:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_NOT_TOKEN PARS_NULL_LIT
+				{ $$ = &pars_int_token;
+					/* pass any non-NULL pointer */ }
+;
+
+not_fit_in_memory:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_DOES_NOT_FIT_IN_MEM_TOKEN
+				{ $$ = &pars_int_token;
+					/* pass any non-NULL pointer */ }
+;
+
+create_table:
+	PARS_CREATE_TOKEN PARS_TABLE_TOKEN
+	PARS_ID_TOKEN '(' column_def_list ')'
+	not_fit_in_memory	{ $$ = pars_create_table($3, $5, $7); }
+;
+
+column_list:
+	PARS_ID_TOKEN		{ $$ = que_node_list_add_last(NULL, $1); }
+	| column_list ',' PARS_ID_TOKEN
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+unique_def:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_UNIQUE_TOKEN	{ $$ = &pars_unique_token; }
+;
+
+clustered_def:
+	/* Nothing */		{ $$ = NULL; }
+	| PARS_CLUSTERED_TOKEN	{ $$ = &pars_clustered_token; }
+;
+
+create_index:
+	PARS_CREATE_TOKEN unique_def
+	clustered_def
+	PARS_INDEX_TOKEN
+	PARS_ID_TOKEN PARS_ON_TOKEN PARS_ID_TOKEN
+	'(' column_list ')'	{ $$ = pars_create_index($2, $3, $5, $7, $9); }
+;
+
+commit_statement:
+	PARS_COMMIT_TOKEN PARS_WORK_TOKEN
+				{ $$ = pars_commit_statement(); }
+;
+
+rollback_statement:
+	PARS_ROLLBACK_TOKEN PARS_WORK_TOKEN
+				{ $$ = pars_rollback_statement(); }
+;
+
+type_name:
+	PARS_INT_TOKEN		{ $$ = &pars_int_token; }
+	| PARS_INTEGER_TOKEN	{ $$ = &pars_int_token; }
+	| PARS_CHAR_TOKEN	{ $$ = &pars_char_token; }
+	| PARS_BINARY_TOKEN	{ $$ = &pars_binary_token; }
+	| PARS_BLOB_TOKEN	{ $$ = &pars_blob_token; }
+;
+
+parameter_declaration:
+	PARS_ID_TOKEN PARS_IN_TOKEN type_name
+				{ $$ = pars_parameter_declaration($1,
+							PARS_INPUT, $3); }
+	| PARS_ID_TOKEN PARS_OUT_TOKEN type_name
+				{ $$ = pars_parameter_declaration($1,
+							PARS_OUTPUT, $3); }
+;
+
+parameter_declaration_list:
+	/* Nothing */		{ $$ = NULL; }
+	| parameter_declaration	{ $$ = que_node_list_add_last(NULL, $1); }
+	| parameter_declaration_list ',' parameter_declaration
+				{ $$ = que_node_list_add_last($1, $3); }
+;
+
+variable_declaration:
+	PARS_ID_TOKEN type_name ';'
+				{ $$ = pars_variable_declaration($1, $2); }
+;
+
+variable_declaration_list:
+	/* Nothing */
+	| variable_declaration
+	| variable_declaration_list variable_declaration
+;
+
+cursor_declaration:
+	PARS_DECLARE_TOKEN PARS_CURSOR_TOKEN PARS_ID_TOKEN
+	PARS_IS_TOKEN select_statement ';'
+				{ $$ = pars_cursor_declaration($3, $5); }
+;
+
+function_declaration:
+	PARS_DECLARE_TOKEN PARS_FUNCTION_TOKEN PARS_ID_TOKEN ';'
+				{ $$ = pars_function_declaration($3); }
+;
+
+declaration:
+	cursor_declaration
+	| function_declaration
+;
+
+declaration_list:
+	/* Nothing */
+	| declaration
+	| declaration_list declaration
+;
+
+procedure_definition:
+	PARS_PROCEDURE_TOKEN PARS_ID_TOKEN '(' parameter_declaration_list ')'
+	PARS_IS_TOKEN
+	variable_declaration_list
+	declaration_list
+	PARS_BEGIN_TOKEN
+	statement_list
+	PARS_END_TOKEN		{ $$ = pars_procedure_definition($2, $4,
+								$10); }
+;
+
+%%
diff --git a/storage/xtradb/pars/pars0lex.l b/storage/xtradb/pars/pars0lex.l
new file mode 100644
index 00000000000..7bd39f7514b
--- /dev/null
+++ b/storage/xtradb/pars/pars0lex.l
@@ -0,0 +1,678 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+SQL parser lexical analyzer: input file for the GNU Flex lexer generator
+
+The InnoDB parser is frozen because MySQL takes care of SQL parsing.
+Therefore we normally keep the InnoDB parser C files as they are, and do
+not automatically generate them from pars0grm.y and pars0lex.l.
+
+How to make the InnoDB parser and lexer C files:
+
+1. Run ./make_flex.sh to generate lexer files.
+
+2. Run ./make_bison.sh to generate parser files.
+
+These instructions seem to work at least with bison-1.875d and flex-2.5.31 on
+Linux.
+
+Created 12/14/1997 Heikki Tuuri
+*******************************************************/
+
+%option nostdinit
+%option 8bit
+%option warn
+%option pointer
+%option never-interactive
+%option nodefault
+%option noinput
+%option nounput
+%option noyywrap
+%option noyy_scan_buffer
+%option noyy_scan_bytes
+%option noyy_scan_string
+%option nounistd
+
+%{
+#define YYSTYPE que_node_t*
+
+#include "univ.i"
+#include "pars0pars.h"
+#include "pars0grm.h"
+#include "pars0sym.h"
+#include "mem0mem.h"
+#include "os0proc.h"
+
+#define malloc(A)	ut_malloc(A)
+#define free(A)		ut_free(A)
+#define realloc(P, A)	ut_realloc(P, A)
+#define exit(A) 	ut_error
+
+#define YY_INPUT(buf, result, max_size) pars_get_lex_chars(buf, &result, max_size)
+
+/* String buffer for removing quotes */
+static ulint	stringbuf_len_alloc = 0; /* Allocated length */
+static ulint	stringbuf_len = 0; /* Current length */
+static char*	stringbuf; /* Start of buffer */
+/** Appends len bytes of str to the quote-removal buffer, growing it if needed. */
+static
+void
+string_append(
+/*==========*/
+	const char*	str,	/*!< in: string to be appended */
+	ulint		len)	/*!< in: length of the string */
+{
+	const ulint	new_len = stringbuf_len + len;
+
+	if (stringbuf == NULL) {
+		stringbuf_len_alloc = 1;
+		stringbuf = malloc(stringbuf_len_alloc);
+	}
+	if (new_len > stringbuf_len_alloc) {
+		do {	/* double the capacity until the new data fits */
+			stringbuf_len_alloc <<= 1;
+		} while (new_len > stringbuf_len_alloc);
+		stringbuf = realloc(stringbuf, stringbuf_len_alloc);
+	}
+	memcpy(stringbuf + stringbuf_len, str, len);
+	stringbuf_len = new_len;
+}
+
+%}
+
+DIGIT	[0-9]
+ID	[a-z_A-Z][a-z_A-Z0-9]*
+BOUND_LIT	\:[a-z_A-Z0-9]+
+BOUND_ID	\$[a-z_A-Z0-9]+
+
+%x comment
+%x quoted
+%x id
+%%
+
+{DIGIT}+	{
+			yylval = sym_tab_add_int_lit(pars_sym_tab_global,
+								atoi(yytext));
+			return(PARS_INT_LIT);
+}
+
+{DIGIT}+"."{DIGIT}* {
+			ut_error;	/* not implemented */
+
+			return(PARS_FLOAT_LIT);
+}
+
+{BOUND_LIT}	{
+			ulint	type;
+
+			yylval = sym_tab_add_bound_lit(pars_sym_tab_global,
+				yytext + 1, &type);
+
+			return((int) type);
+}
+
+{BOUND_ID}	{
+			yylval = sym_tab_add_bound_id(pars_sym_tab_global,
+				yytext + 1);
+
+			return(PARS_ID_TOKEN);
+}
+
+"'"		{
+/* Quoted character string literals are handled in an explicit
+start state 'quoted'.  This state is entered and the buffer for
+the scanned string is emptied upon encountering a starting quote.
+
+In the state 'quoted', only two actions are possible (defined below). */
+			BEGIN(quoted);
+			stringbuf_len = 0;
+}
+<quoted>[^\']+	{
+			/* Got a sequence of characters other than "'":
+			append to string buffer */
+			string_append(yytext, yyleng);
+}
+<quoted>"'"+	{
+			/* Got a sequence of "'" characters:
+			append half of them to string buffer,
+			as "''" represents a single "'".
+			We apply truncating division,
+			so that "'''" will result in "'". */
+
+			string_append(yytext, yyleng / 2);
+
+			/* If we got an odd number of quotes, then the
+			last quote we got is the terminating quote.
+			At the end of the string, we return to the
+			initial start state and report the scanned
+			string literal. */
+
+			if (yyleng % 2) {
+				BEGIN(INITIAL);
+				yylval = sym_tab_add_str_lit(
+					pars_sym_tab_global,
+					(byte*) stringbuf, stringbuf_len);
+				return(PARS_STR_LIT);
+			}
+}
+
+\"		{
+/* Quoted identifiers are handled in an explicit start state 'id'.
+This state is entered and the buffer for the scanned string is emptied
+upon encountering a starting quote.
+
+In the state 'id', only two actions are possible (defined below). */
+			BEGIN(id);
+			stringbuf_len = 0;
+}
+<id>[^\"]+	{
+			/* Got a sequence of characters other than '"':
+			append to string buffer */
+			string_append(yytext, yyleng);
+}
+<id>\"+	{
+			/* Got a sequence of '"' characters:
+			append half of them to string buffer,
+			as '""' represents a single '"'.
+			We apply truncating division,
+			so that '"""' will result in '"'. */
+
+			string_append(yytext, yyleng / 2);
+
+			/* If we got an odd number of quotes, then the
+			last quote we got is the terminating quote.
+			At the end of the string, we return to the
+			initial start state and report the scanned
+			identifier. */
+
+			if (yyleng % 2) {
+				BEGIN(INITIAL);
+				yylval = sym_tab_add_id(
+					pars_sym_tab_global,
+					(byte*) stringbuf, stringbuf_len);
+
+				return(PARS_ID_TOKEN);
+			}
+}
+
+"NULL"		{
+			yylval = sym_tab_add_null_lit(pars_sym_tab_global);
+
+			return(PARS_NULL_LIT);
+}
+
+"SQL"		{
+			/* Implicit cursor name */
+			yylval = sym_tab_add_str_lit(pars_sym_tab_global,
+							(byte*) yytext, yyleng);
+			return(PARS_SQL_TOKEN);
+}
+
+"AND"		{
+			return(PARS_AND_TOKEN);
+}
+
+"OR"		{
+			return(PARS_OR_TOKEN);
+}
+
+"NOT"		{
+			return(PARS_NOT_TOKEN);
+}
+
+"PROCEDURE"	{
+			return(PARS_PROCEDURE_TOKEN);
+}
+
+"IN"		{
+			return(PARS_IN_TOKEN);
+}
+
+"OUT"		{
+			return(PARS_OUT_TOKEN);
+}
+
+"BINARY"	{
+	 		return(PARS_BINARY_TOKEN);
+}
+
+"BLOB"		{
+	 		return(PARS_BLOB_TOKEN);
+}
+
+"INT"		{
+	 		return(PARS_INT_TOKEN);
+}
+
+"INTEGER"	{
+	 		return(PARS_INT_TOKEN);
+}
+
+"FLOAT"		{
+	 		return(PARS_FLOAT_TOKEN);
+}
+
+"CHAR"		{
+	 		return(PARS_CHAR_TOKEN);
+}
+
+"IS"		{
+			return(PARS_IS_TOKEN);
+}
+
+"BEGIN"		{
+			return(PARS_BEGIN_TOKEN);
+}
+
+"END"		{
+			return(PARS_END_TOKEN);
+}
+
+"IF"		{
+			return(PARS_IF_TOKEN);
+}
+
+"THEN"		{
+			return(PARS_THEN_TOKEN);
+}
+
+"ELSE"		{
+			return(PARS_ELSE_TOKEN);
+}
+
+"ELSIF"		{
+			return(PARS_ELSIF_TOKEN);
+}
+
+"LOOP"		{
+			return(PARS_LOOP_TOKEN);
+}
+
+"WHILE"		{
+			return(PARS_WHILE_TOKEN);
+}
+
+"RETURN"	{
+			return(PARS_RETURN_TOKEN);
+}
+
+"SELECT"	{
+			return(PARS_SELECT_TOKEN);
+}
+
+"SUM"		{
+			return(PARS_SUM_TOKEN);
+}
+
+"COUNT"		{
+			return(PARS_COUNT_TOKEN);
+}
+
+"DISTINCT"	{
+			return(PARS_DISTINCT_TOKEN);
+}
+
+"FROM"		{
+			return(PARS_FROM_TOKEN);
+}
+
+"WHERE"		{
+			return(PARS_WHERE_TOKEN);
+}
+
+"FOR"		{
+			return(PARS_FOR_TOKEN);
+}
+
+"READ"		{
+			return(PARS_READ_TOKEN);
+}
+
+"ORDER"		{
+			return(PARS_ORDER_TOKEN);
+}
+
+"BY"		{
+			return(PARS_BY_TOKEN);
+}
+
+"ASC"		{
+			return(PARS_ASC_TOKEN);
+}
+
+"DESC"		{
+			return(PARS_DESC_TOKEN);
+}
+
+"INSERT"	{
+			return(PARS_INSERT_TOKEN);
+}
+
+"INTO"		{
+			return(PARS_INTO_TOKEN);
+}
+
+"VALUES"	{
+			return(PARS_VALUES_TOKEN);
+}
+
+"UPDATE"	{
+			return(PARS_UPDATE_TOKEN);
+}
+
+"SET"		{
+			return(PARS_SET_TOKEN);
+}
+
+"DELETE"	{
+			return(PARS_DELETE_TOKEN);
+}
+
+"CURRENT"	{
+			return(PARS_CURRENT_TOKEN);
+}
+
+"OF"		{
+			return(PARS_OF_TOKEN);
+}
+
+"CREATE"	{
+			return(PARS_CREATE_TOKEN);
+}
+
+"TABLE"		{
+			return(PARS_TABLE_TOKEN);
+}
+
+"INDEX"		{
+	 		return(PARS_INDEX_TOKEN);
+}
+
+"UNIQUE"	{
+	 		return(PARS_UNIQUE_TOKEN);
+}
+
+"CLUSTERED"	{
+	 		return(PARS_CLUSTERED_TOKEN);
+}
+
+"DOES_NOT_FIT_IN_MEMORY"	{
+			return(PARS_DOES_NOT_FIT_IN_MEM_TOKEN);
+}
+
+"ON"		{
+	 		return(PARS_ON_TOKEN);
+}
+
+"DECLARE"	{
+			return(PARS_DECLARE_TOKEN);
+}
+
+"CURSOR"	{
+			return(PARS_CURSOR_TOKEN);
+}
+
+"OPEN"	{
+			return(PARS_OPEN_TOKEN);
+}
+
+"FETCH"	{
+			return(PARS_FETCH_TOKEN);
+}
+
+"CLOSE"	{
+			return(PARS_CLOSE_TOKEN);
+}
+
+"NOTFOUND"	{
+			return(PARS_NOTFOUND_TOKEN);
+}
+
+"TO_CHAR"	{
+			return(PARS_TO_CHAR_TOKEN);
+}
+
+"TO_NUMBER"	{
+			return(PARS_TO_NUMBER_TOKEN);
+}
+
+"TO_BINARY"	{
+			return(PARS_TO_BINARY_TOKEN);
+}
+
+"BINARY_TO_NUMBER" {
+			return(PARS_BINARY_TO_NUMBER_TOKEN);
+}
+
+"SUBSTR"	{
+			return(PARS_SUBSTR_TOKEN);
+}
+
+"REPLSTR"	{
+			return(PARS_REPLSTR_TOKEN);
+}
+
+"CONCAT"	{
+			return(PARS_CONCAT_TOKEN);
+}
+
+"INSTR"		{
+			return(PARS_INSTR_TOKEN);
+}
+
+"LENGTH"	{
+			return(PARS_LENGTH_TOKEN);
+}
+
+"SYSDATE"	{
+			return(PARS_SYSDATE_TOKEN);
+}
+
+"PRINTF"	{
+			return(PARS_PRINTF_TOKEN);
+}
+
+"ASSERT"	{
+			return(PARS_ASSERT_TOKEN);
+}
+
+"RND"		{
+			return(PARS_RND_TOKEN);
+}
+
+"RND_STR"	{
+			return(PARS_RND_STR_TOKEN);
+}
+
+"ROW_PRINTF"	{
+			return(PARS_ROW_PRINTF_TOKEN);
+}
+
+"COMMIT"	{
+			return(PARS_COMMIT_TOKEN);
+}
+
+"ROLLBACK"	{
+			return(PARS_ROLLBACK_TOKEN);
+}
+
+"WORK"		{
+			return(PARS_WORK_TOKEN);
+}
+
+"UNSIGNED"	{
+			return(PARS_UNSIGNED_TOKEN);
+}
+
+"EXIT"		{
+			return(PARS_EXIT_TOKEN);
+}
+
+"FUNCTION"	{
+			return(PARS_FUNCTION_TOKEN);
+}
+
+"LOCK"	{
+			return(PARS_LOCK_TOKEN);
+}
+
+"SHARE"	{
+			return(PARS_SHARE_TOKEN);
+}
+
+"MODE"	{
+			return(PARS_MODE_TOKEN);
+}
+
+{ID}		{
+			yylval = sym_tab_add_id(pars_sym_tab_global,
+							(byte*)yytext,
+							ut_strlen(yytext));
+			return(PARS_ID_TOKEN);
+}
+
+".."		{
+			return(PARS_DDOT_TOKEN);
+}
+
+":="		{
+			return(PARS_ASSIGN_TOKEN);
+}
+
+"<="		{
+			return(PARS_LE_TOKEN);
+}
+
+">="		{
+			return(PARS_GE_TOKEN);
+}
+
+"<>"		{
+			return(PARS_NE_TOKEN);
+}
+
+"("		{
+
+			return((int)(*yytext));
+}
+
+"="		{
+
+			return((int)(*yytext));
+}
+
+">"		{
+
+			return((int)(*yytext));
+}
+
+"<"		{
+
+			return((int)(*yytext));
+}
+
+","		{
+
+			return((int)(*yytext));
+}
+
+";"		{
+
+			return((int)(*yytext));
+}
+
+")"		{
+
+			return((int)(*yytext));
+}
+
+"+" 		{
+
+			return((int)(*yytext));
+}
+
+"-"		{
+
+			return((int)(*yytext));
+}
+
+"*"		{
+
+			return((int)(*yytext));
+}
+
+"/"		{
+
+			return((int)(*yytext));
+}
+
+"%"		{
+
+			return((int)(*yytext));
+}
+
+"{"		{
+
+			return((int)(*yytext));
+}
+
+"}"		{
+
+			return((int)(*yytext));
+}
+
+"?"		{
+
+			return((int)(*yytext));
+}
+
+"/*"			BEGIN(comment); /* eat up comment */
+
+<comment>[^*]*
+<comment>"*"+[^*/]*
+<comment>"*"+"/"        BEGIN(INITIAL);
+
+[ \t\n]+		/* eat up whitespace */
+
+
+.		{
+			/* yytext is plain char: convert through unsigned char so
+			a byte >= 0x80 prints as two hex digits, not sign-extended */
+			fprintf(stderr,"Unrecognized character: %02x\n",
+				(unsigned) (unsigned char) *yytext);
+			ut_error;
+			return(0);
+}
+
+%%
+
+/**********************************************************************
+Release any resources used by the lexer: scanner state and string buffer. */
+UNIV_INTERN
+void
+pars_lexer_close(void)
+/*==================*/
+{
+        if (yy_buffer_stack)
+  	  yylex_destroy();
+        if (stringbuf)
+  	  free(stringbuf);	/* free() is #defined to ut_free() above */
+	stringbuf = NULL;
+	stringbuf_len_alloc = stringbuf_len = 0; /* string_append() reallocates */
+}
diff --git a/storage/xtradb/pars/pars0opt.c b/storage/xtradb/pars/pars0opt.c
new file mode 100644
index 00000000000..2e392ba4836
--- /dev/null
+++ b/storage/xtradb/pars/pars0opt.c
@@ -0,0 +1,1216 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0opt.c
+Simple SQL optimizer
+
+Created 12/21/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0opt.h"
+
+#ifdef UNIV_NONINL
+#include "pars0opt.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "lock0lock.h"
+
+#define OPT_EQUAL	1	/* comparison by = */
+#define OPT_COMPARISON	2	/* comparison by <, >, <=, or >= */
+
+#define OPT_NOT_COND	1
+#define OPT_END_COND	2
+#define OPT_TEST_COND	3
+#define OPT_SCROLL_COND	4
+
+
+/*******************************************************************//**
+Mirrors a comparison operator, e.g. '<' becomes '>' while '=' stays '='.
+@return	the equivalent operator when the order of the arguments is switched */
+static
+int
+opt_invert_cmp_op(
+/*==============*/
+	int	op)	/*!< in: operator */
+{
+	switch (op) {
+	case '<':
+		return('>');
+	case '>':
+		return('<');
+	case '=':
+		return('=');
+	case PARS_LE_TOKEN:
+		return(PARS_GE_TOKEN);
+	case PARS_GE_TOKEN:
+		return(PARS_LE_TOKEN);
+	default:
+		ut_error;
+	}
+	return(0);
+}
+
+/*******************************************************************//**
+Tells whether an expression can be evaluated BEFORE the nth table of a join
+is accessed, in which case it may serve in an index search on that table.
+Function nodes are checked recursively through all of their arguments.
+@return	TRUE if already determined */
+static
+ibool
+opt_check_exp_determined_before(
+/*============================*/
+	que_node_t*	exp,		/*!< in: expression */
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		nth_table)	/*!< in: nth table will be accessed */
+{
+	sym_node_t*	sym;
+	que_node_t*	arg;
+	ulint		i;
+
+	ut_ad(exp && sel_node);
+
+	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+		/* A function is determined iff every argument is */
+
+		for (arg = ((func_node_t*) exp)->args;
+		     arg != NULL;
+		     arg = que_node_get_next(arg)) {
+
+			if (!opt_check_exp_determined_before(arg, sel_node,
+							     nth_table)) {
+				return(FALSE);
+			}
+		}
+
+		return(TRUE);
+	}
+
+	/* Anything that is not a function must be a symbol */
+
+	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+	sym = exp;
+
+	if (sym->token_type != SYM_COLUMN) {
+		/* Not a column: treated as already determined */
+
+		return(TRUE);
+	}
+
+	/* A column is known only if its table is among the first
+	nth_table tables of the join plan */
+
+	for (i = 0; i < nth_table; i++) {
+		if (sel_node_get_nth_plan(sel_node, i)->table == sym->table) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Looks in a comparison condition (col op exp, or exp op col) if a column value
+is already restricted by it BEFORE the nth table is accessed.
+@return	expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_comparison_before(
+/*==================================*/
+	ulint		cmp_type,	/*!< in: OPT_EQUAL, OPT_COMPARISON */
+	ulint		col_no,		/*!< in: column number */
+	func_node_t*	search_cond,	/*!< in: comparison condition */
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		nth_table,	/*!< in: nth table in a join (a query
+					from a single table is considered a
+					join of 1 table) */
+	ulint*		op)		/*!< out: comparison operator ('=',
+					PARS_GE_TOKEN, ... ); this is inverted
+					if the column appears on the right
+					side; set only if non-NULL is returned */
+{
+	sym_node_t*	sym_node;
+	dict_table_t*	table;
+	que_node_t*	exp;
+	que_node_t*	arg;
+
+	ut_ad(search_cond);
+	/* Only these five comparison operators are accepted */
+	ut_a((search_cond->func == '<')
+	     || (search_cond->func == '>')
+	     || (search_cond->func == '=')
+	     || (search_cond->func == PARS_GE_TOKEN)
+	     || (search_cond->func == PARS_LE_TOKEN));
+
+	table = sel_node_get_nth_plan(sel_node, nth_table)->table;
+	/* Bail out if the operator is not of the requested cmp_type */
+	if ((cmp_type == OPT_EQUAL) && (search_cond->func != '=')) {
+
+		return(NULL);
+
+	} else if ((cmp_type == OPT_COMPARISON)
+		   && (search_cond->func != '<')
+		   && (search_cond->func != '>')
+		   && (search_cond->func != PARS_GE_TOKEN)
+		   && (search_cond->func != PARS_LE_TOKEN)) {
+
+		return(NULL);
+	}
+	/* Case 1: the column is the left operand */
+	arg = search_cond->args;
+
+	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+		sym_node = arg;
+
+		if ((sym_node->token_type == SYM_COLUMN)
+		    && (sym_node->table == table)
+		    && (sym_node->col_no == col_no)) {
+
+			/* sym_node contains the desired column id */
+
+			/* Check if the expression on the right side of the
+			operator is already determined */
+
+			exp = que_node_get_next(arg);
+
+			if (opt_check_exp_determined_before(exp, sel_node,
+							    nth_table)) {
+				*op = search_cond->func;
+
+				return(exp);
+			}
+		}
+	}
+	/* Case 2: the column is the right operand; exp is the left operand */
+	exp = search_cond->args;
+	arg = que_node_get_next(arg);
+
+	if (que_node_get_type(arg) == QUE_NODE_SYMBOL) {
+		sym_node = arg;
+
+		if ((sym_node->token_type == SYM_COLUMN)
+		    && (sym_node->table == table)
+		    && (sym_node->col_no == col_no)) {
+			/* Operands are swapped, so report the mirrored operator */
+			if (opt_check_exp_determined_before(exp, sel_node,
+							    nth_table)) {
+				*op = opt_invert_cmp_op(search_cond->func);
+
+				return(exp);
+			}
+		}
+	}
+
+	return(NULL);
+}
+
+/*******************************************************************//**
+Looks in a search condition if a column value is already restricted by the
+search condition BEFORE the nth table is accessed. Takes into account that
+if we will fetch in an ascending order, we cannot utilize an upper limit for
+a column value; in a descending order, respectively, a lower limit.
+@return	expression restricting the value of the column, or NULL if not known */
+static
+que_node_t*
+opt_look_for_col_in_cond_before(
+/*============================*/
+	ulint		cmp_type,	/*!< in: OPT_EQUAL, OPT_COMPARISON */
+	ulint		col_no,		/*!< in: column number */
+	func_node_t*	search_cond,	/*!< in: search condition or NULL */
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		nth_table,	/*!< in: nth table in a join (a query
+					from a single table is considered a
+					join of 1 table) */
+	ulint*		op)		/*!< out: comparison operator ('=',
+					PARS_GE_TOKEN, ... ) */
+{
+	func_node_t*	new_cond;
+	que_node_t*	exp;
+
+	if (search_cond == NULL) {
+
+		return(NULL);
+	}
+	/* The condition must be a function node other than OR or NOT */
+	ut_a(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+	ut_a(search_cond->func != PARS_OR_TOKEN);
+	ut_a(search_cond->func != PARS_NOT_TOKEN);
+
+	if (search_cond->func == PARS_AND_TOKEN) {
+		new_cond = search_cond->args;
+		/* Recurse into the first operand of the AND ... */
+		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+						      new_cond, sel_node,
+						      nth_table, op);
+		if (exp) {
+
+			return(exp);
+		}
+
+		new_cond = que_node_get_next(new_cond);
+		/* ... and fall back to the second operand if that found nothing */
+		exp = opt_look_for_col_in_cond_before(cmp_type, col_no,
+						      new_cond, sel_node,
+						      nth_table, op);
+		return(exp);
+	}
+	/* Not an AND: treat search_cond as a single comparison */
+	exp = opt_look_for_col_in_comparison_before(cmp_type, col_no,
+						    search_cond, sel_node,
+						    nth_table, op);
+	if (exp == NULL) {
+
+		return(NULL);
+	}
+
+	/* If we will fetch in an ascending order, we cannot utilize an upper
+	limit for a column value; in a descending order, respectively, a lower
+	limit */
+
+	if (sel_node->asc && ((*op == '<') || (*op == PARS_LE_TOKEN))) {
+
+		return(NULL);
+
+	} else if (!sel_node->asc
+		   && ((*op == '>') || (*op == PARS_GE_TOKEN))) {
+
+		return(NULL);
+	}
+
+	return(exp);
+}
+
+/*******************************************************************//**
+Calculates the goodness for an index according to a select node. The
+goodness is 4 times the number of first fields in index whose values we
+already know exactly in the query. If we have a comparison condition for
+an additional field, 2 points are added. If the index is unique, and we know
+all the unique fields for the index we add 1024 points, plus another 1024 if
+it is clustered. A clustered index with nonzero goodness gets 1 more point.
+@return	goodness */
+static
+ulint
+opt_calc_index_goodness(
+/*====================*/
+	dict_index_t*	index,		/*!< in: index */
+	sel_node_t*	sel_node,	/*!< in: parsed select node */
+	ulint		nth_table,	/*!< in: nth table in a join */
+	que_node_t**	index_plan,	/*!< in/out: comparison expressions for
+					this index */
+	ulint*		last_op)	/*!< out: last comparison operator, if
+					goodness > 1 */
+{
+	que_node_t*	exp;
+	ulint		goodness;
+	ulint		n_fields;
+	ulint		col_no;
+	ulint		op;
+	ulint		j;
+
+	goodness = 0;
+
+	/* Note that as higher level node pointers in the B-tree contain
+	page addresses as the last field, we must not put more fields in
+	the search tuple than dict_index_get_n_unique_in_tree(index); see
+	the note in btr_cur_search_to_nth_level. */
+
+	n_fields = dict_index_get_n_unique_in_tree(index);
+
+	for (j = 0; j < n_fields; j++) {
+
+		col_no = dict_index_get_nth_col_no(index, j);
+
+		exp = opt_look_for_col_in_cond_before(
+			OPT_EQUAL, col_no, sel_node->search_cond,
+			sel_node, nth_table, &op);
+		if (exp) {
+			/* The value for this column is exactly known already
+			at this stage of the join */
+
+			index_plan[j] = exp;
+			*last_op = op;
+			goodness += 4;
+		} else {
+			/* Look for non-equality comparisons */
+
+			exp = opt_look_for_col_in_cond_before(
+				OPT_COMPARISON, col_no, sel_node->search_cond,
+				sel_node, nth_table, &op);
+			if (exp) {
+				index_plan[j] = exp;
+				*last_op = op;
+				goodness += 2;
+			}
+			/* Stop at the first field without an exact match */
+			break;
+		}
+	}
+	/* Holds only if dict_index_get_n_unique(index) fields matched exactly */
+	if (goodness >= 4 * dict_index_get_n_unique(index)) {
+		goodness += 1024;
+
+		if (dict_index_is_clust(index)) {
+
+			goodness += 1024;
+		}
+	}
+
+	/* We have to test for goodness here, as last_op may not be set */
+	if (goodness && dict_index_is_clust(index)) {
+
+		goodness++;
+	}
+
+	return(goodness);
+}
+
+/*******************************************************************//**
+Derives the number of matched fields from an opt_calc_index_goodness() value.
+@return	number of exactly or partially matched fields */
+UNIV_INLINE
+ulint
+opt_calc_n_fields_from_goodness(
+/*============================*/
+	ulint	goodness)	/*!< in: goodness */
+{
+	return(((goodness % 1024) + 2) / 4);	/* % 1024 strips the 1024 bonuses; +2 rounds a 2-point match up */
+}
+
+/*******************************************************************//**
+Maps a comparison operator to the matching page cursor search mode
+(PAGE_CUR_GE, PAGE_CUR_LE, ...). For every operator other than '=' the
+fetch direction has to agree with the operator; this is asserted. An
+unknown operator is an assertion error.
+@return	search mode */
+UNIV_INLINE
+ulint
+opt_op_to_search_mode(
+/*==================*/
+	ibool	asc,	/*!< in: TRUE if the rows should be fetched in an
+			ascending order */
+	ulint	op)	/*!< in: operator '=', PARS_GE_TOKEN, ... */
+{
+	switch (op) {
+	case '=':
+		/* Equality works in both directions */
+		return(asc ? PAGE_CUR_GE : PAGE_CUR_LE);
+
+	case '<':
+		ut_a(!asc);
+		return(PAGE_CUR_L);
+
+	case '>':
+		ut_a(asc);
+		return(PAGE_CUR_G);
+
+	case PARS_GE_TOKEN:
+		ut_a(asc);
+		return(PAGE_CUR_GE);
+
+	case PARS_LE_TOKEN:
+		ut_a(!asc);
+		return(PAGE_CUR_LE);
+
+	default:
+		ut_error;
+	}
+
+	return(0);
+}
+
+/*******************************************************************//**
+Checks whether a node is one of the nodes in the argument list of a
+function node. The comparison is by node identity (pointer equality).
+@return	TRUE if is an argument */
+static
+ibool
+opt_is_arg(
+/*=======*/
+	que_node_t*	arg_node,	/*!< in: possible argument node */
+	func_node_t*	func_node)	/*!< in: function node */
+{
+	que_node_t*	cur;
+
+	for (cur = func_node->args;
+	     cur != NULL;
+	     cur = que_node_get_next(cur)) {
+
+		if (cur == arg_node) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Verifies that the chosen query plan yields its rows in the order required
+by a possible order-by clause. Nothing is returned: a plan that does not
+agree with the order-by is an assertion error. Without an order-by clause
+the function does nothing. */
+static
+void
+opt_check_order_by(
+/*===============*/
+	sel_node_t*	sel_node)	/*!< in: select node; asserts an error
+					if the plan does not agree with the
+					order-by */
+{
+	order_node_t*	order_by;
+	dict_table_t*	sort_table;
+	ulint		sort_col_no;
+	plan_t*		plan;
+	ulint		n_unique;
+	ulint		i;
+
+	order_by = sel_node->order_by;
+
+	if (order_by == NULL) {
+
+		return;
+	}
+
+	sort_col_no = order_by->column->col_no;
+	sort_table = order_by->column->table;
+
+	/* As there is no sort utility, an order-by can presently be honored
+	only if every table except the last one in the table list yields at
+	most a single row, and the first non-exactly matched field of the
+	index used for the last table is the order-by column */
+
+	for (i = 0; i < sel_node->n_tables; i++) {
+
+		plan = sel_node_get_nth_plan(sel_node, i);
+		n_unique = dict_index_get_n_unique(plan->index);
+
+		if (i + 1 < sel_node->n_tables) {
+			/* Not the last table: all unique fields must be
+			exactly matched */
+
+			ut_a(n_unique <= plan->n_exact_match);
+
+			continue;
+		}
+
+		/* The last table */
+
+		ut_a(plan->table == sort_table);
+
+		ut_a((n_unique <= plan->n_exact_match)
+		     || (dict_index_get_nth_col_no(plan->index,
+						   plan->n_exact_match)
+			 == sort_col_no));
+	}
+}
+
+/*******************************************************************//**
+Chooses the index through which the ith table of a select is accessed and
+initializes the plan node of that table. Every index of the table is
+scored with opt_calc_index_goodness() and the one with the highest
+goodness is taken; the search tuple, the tuple expressions, the number of
+exactly matched fields and the search mode are then set up from the
+comparison expressions found for that index. */
+static
+void
+opt_search_plan_for_table(
+/*======================*/
+	sel_node_t*	sel_node,	/*!< in: parsed select node */
+	ulint		i,		/*!< in: this is the ith table */
+	dict_table_t*	table)		/*!< in: table */
+{
+	plan_t*		plan;
+	dict_index_t*	index;
+	dict_index_t*	best_index;
+	ulint		n_fields;
+	ulint		goodness;
+	ulint		last_op		= 75946965;	/* Eliminate a Purify
+							warning */
+	ulint		best_goodness;
+	ulint		best_last_op = 0; /* remove warning */
+	que_node_t*	index_plan[256];
+	que_node_t*	best_index_plan[256];
+
+	plan = sel_node_get_nth_plan(sel_node, i);
+
+	plan->table = table;
+	plan->asc = sel_node->asc;
+	plan->pcur_is_open = FALSE;
+	plan->cursor_at_end = FALSE;
+
+	/* Calculate goodness for each index of the table. Only a strictly
+	better goodness replaces the current best, so on a tie the earlier
+	index is kept, and if no index scores above 0 the first index of the
+	table stays selected. */
+
+	index = dict_table_get_first_index(table);
+	best_index = index; /* Eliminate compiler warning */
+	best_goodness = 0;
+
+	/* should be do ... until ? comment by Jani */
+	while (index) {
+		goodness = opt_calc_index_goodness(index, sel_node, i,
+						   index_plan, &last_op);
+		if (goodness > best_goodness) {
+
+			best_index = index;
+			best_goodness = goodness;
+			n_fields = opt_calc_n_fields_from_goodness(goodness);
+
+			ut_memcpy(best_index_plan, index_plan,
+				  n_fields * sizeof(void*));
+			best_last_op = last_op;
+		}
+
+		index = dict_table_get_next_index(index);
+	}
+
+	plan->index = best_index;
+
+	n_fields = opt_calc_n_fields_from_goodness(best_goodness);
+
+	if (n_fields == 0) {
+		plan->tuple = NULL;
+		plan->n_exact_match = 0;
+	} else {
+		plan->tuple = dtuple_create(pars_sym_tab_global->heap,
+					    n_fields);
+		dict_index_copy_types(plan->tuple, plan->index, n_fields);
+
+		plan->tuple_exps = mem_heap_alloc(pars_sym_tab_global->heap,
+						  n_fields * sizeof(void*));
+
+		ut_memcpy(plan->tuple_exps, best_index_plan,
+			  n_fields * sizeof(void*));
+		if (best_last_op == '=') {
+			plan->n_exact_match = n_fields;
+		} else {
+			plan->n_exact_match = n_fields - 1;
+		}
+
+		plan->mode = opt_op_to_search_mode(sel_node->asc,
+						   best_last_op);
+	}
+
+	if (dict_index_is_clust(best_index)
+	    && (plan->n_exact_match >= dict_index_get_n_unique(best_index))) {
+
+		plan->unique_search = TRUE;
+	} else {
+		plan->unique_search = FALSE;
+	}
+
+	plan->old_vers_heap = NULL;
+
+	btr_pcur_init(&(plan->pcur));
+	btr_pcur_init(&(plan->clust_pcur));
+}
+
+/*******************************************************************//**
+Looks at a comparison condition and decides if it can, and needs to, be
+tested for a table AFTER the table has been accessed. The checks are made
+in a fixed order and the first one that applies determines the class.
+@return OPT_NOT_COND if not for this table, else OPT_END_COND,
+OPT_TEST_COND, or OPT_SCROLL_COND, where the last means that the
+condition need not be tested, except when scroll cursors are used */
+static
+ulint
+opt_classify_comparison(
+/*====================*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		i,		/*!< in: ith table in the join */
+	func_node_t*	cond)		/*!< in: comparison condition */
+{
+	plan_t*	plan;
+	ulint	n_fields;
+	ulint	op;
+	ulint	j;
+
+	ut_ad(cond && sel_node);
+
+	plan = sel_node_get_nth_plan(sel_node, i);
+
+	/* Check if the condition is determined after the ith table has been
+	accessed, but not after the i - 1:th */
+
+	if (!opt_check_exp_determined_before(cond, sel_node, i + 1)) {
+
+		return(OPT_NOT_COND);
+	}
+
+	if ((i > 0) && opt_check_exp_determined_before(cond, sel_node, i)) {
+
+		return(OPT_NOT_COND);
+	}
+
+	/* If the condition is an exact match condition used in constructing
+	the search tuple, it is classified as OPT_END_COND. A plan without a
+	search tuple is treated as having 0 tuple fields. */
+
+	if (plan->tuple) {
+		n_fields = dtuple_get_n_fields(plan->tuple);
+	} else {
+		n_fields = 0;
+	}
+
+	for (j = 0; j < plan->n_exact_match; j++) {
+
+		if (opt_is_arg(plan->tuple_exps[j], cond)) {
+
+			return(OPT_END_COND);
+		}
+	}
+
+	/* If the condition is a non-exact match condition used in
+	constructing the search tuple, it is classified as OPT_SCROLL_COND.
+	When the cursor is positioned, and if a non-scroll cursor is used,
+	there is no need to test this condition; if a scroll cursor is used
+	the testing is necessary when the cursor is reversed. */
+
+	if ((n_fields > plan->n_exact_match)
+	    && opt_is_arg(plan->tuple_exps[n_fields - 1], cond)) {
+
+		return(OPT_SCROLL_COND);
+	}
+
+	/* If the condition is a non-exact match condition on the first field
+	in index for which there is no exact match, and it limits the search
+	range from the opposite side of the search tuple already BEFORE we
+	access the table, it is classified as OPT_END_COND */
+
+	if ((dict_index_get_n_fields(plan->index) > plan->n_exact_match)
+	    && opt_look_for_col_in_comparison_before(
+		    OPT_COMPARISON,
+		    dict_index_get_nth_col_no(plan->index,
+					      plan->n_exact_match),
+		    cond, sel_node, i, &op)) {
+
+		if (sel_node->asc && ((op == '<') || (op == PARS_LE_TOKEN))) {
+
+			return(OPT_END_COND);
+		}
+
+		if (!sel_node->asc && ((op == '>') || (op == PARS_GE_TOKEN))) {
+
+			return(OPT_END_COND);
+		}
+	}
+
+	/* Otherwise, cond is classified as OPT_TEST_COND */
+
+	return(OPT_TEST_COND);
+}
+
+/*******************************************************************//**
+Recursively walks a conjunction of search conditions and files each
+comparison under the plan of the ith table: a condition classified as
+OPT_END_COND goes to the end_conds list, one classified as OPT_TEST_COND
+to the other_conds list, and any other class is not added to either. */
+static
+void
+opt_find_test_conds(
+/*================*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		i,		/*!< in: ith table in the join */
+	func_node_t*	cond)		/*!< in: conjunction of search
+					conditions or NULL */
+{
+	func_node_t*	conjunct;
+	plan_t*		plan;
+
+	if (cond == NULL) {
+
+		return;
+	}
+
+	if (cond->func == PARS_AND_TOKEN) {
+		/* An AND node: descend into both of its operands */
+
+		conjunct = cond->args;
+
+		opt_find_test_conds(sel_node, i, conjunct);
+
+		opt_find_test_conds(sel_node, i,
+				    que_node_get_next(conjunct));
+
+		return;
+	}
+
+	plan = sel_node_get_nth_plan(sel_node, i);
+
+	switch (opt_classify_comparison(sel_node, i, cond)) {
+	case OPT_END_COND:
+		UT_LIST_ADD_LAST(cond_list, plan->end_conds, cond);
+		break;
+
+	case OPT_TEST_COND:
+		UT_LIST_ADD_LAST(cond_list, plan->other_conds, cond);
+		break;
+
+	default:
+		/* Not a condition to record for this table */
+		break;
+	}
+}
+
+/*******************************************************************//**
+Goes through a list of comparison conditions and, wherever the second
+argument of a comparison is a column of the given table, swaps the two
+arguments and inverts the operator, so that the column of the table ends
+up on the left side of the comparison. */
+static
+void
+opt_normalize_cmp_conds(
+/*====================*/
+	func_node_t*	cond,	/*!< in: first in a list of comparison
+				conditions, or NULL */
+	dict_table_t*	table)	/*!< in: table */
+{
+	que_node_t*	lhs;
+	que_node_t*	rhs;
+	sym_node_t*	rhs_sym;
+
+	for (; cond != NULL; cond = UT_LIST_GET_NEXT(cond_list, cond)) {
+
+		lhs = cond->args;
+		rhs = que_node_get_next(lhs);
+
+		if (que_node_get_type(rhs) != QUE_NODE_SYMBOL) {
+
+			continue;
+		}
+
+		rhs_sym = rhs;
+
+		if ((rhs_sym->token_type != SYM_COLUMN)
+		    || (rhs_sym->table != table)) {
+
+			continue;
+		}
+
+		/* Make the right-hand argument the head of the argument
+		list and hang the former left-hand argument after it */
+
+		cond->args = rhs;
+		que_node_list_add_last(NULL, rhs);
+		que_node_list_add_last(rhs, lhs);
+
+		/* The operands changed sides: invert the operator */
+		cond->func = opt_invert_cmp_op(cond->func);
+	}
+}
+
+/*******************************************************************//**
+Builds the end_conds and other_conds lists in the plan of the ith table
+of a join from the conjuncts of the search condition, and normalizes the
+end conditions. The search tuple can eliminate the need to test some
+conjuncts. Asserts that there are at least as many end conditions as
+exactly matched fields. */
+static
+void
+opt_determine_and_normalize_test_conds(
+/*===================================*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		i)		/*!< in: ith table in the join */
+{
+	plan_t*	plan = sel_node_get_nth_plan(sel_node, i);
+
+	/* Start from empty lists */
+	UT_LIST_INIT(plan->end_conds);
+	UT_LIST_INIT(plan->other_conds);
+
+	/* Classify the conjuncts recursively */
+	opt_find_test_conds(sel_node, i, sel_node->search_cond);
+
+	opt_normalize_cmp_conds(UT_LIST_GET_FIRST(plan->end_conds),
+				plan->table);
+
+	ut_a(plan->n_exact_match <= UT_LIST_GET_LEN(plan->end_conds));
+}
+
+/*******************************************************************//**
+Walks an expression and registers the columns of the table of 'index'
+that occur in it. The first occurrence of a column is appended to the
+column list and gets its field numbers filled in. A later occurrence of an
+already listed column gets a value indirection (and alias) pointing to the
+listed node, unless the occurrence is the listed node itself, in which
+case nothing is done. */
+UNIV_INTERN
+void
+opt_find_all_cols(
+/*==============*/
+	ibool		copy_val,	/*!< in: if TRUE, new found columns are
+					added as columns to copy */
+	dict_index_t*	index,		/*!< in: index of the table to use */
+	sym_node_list_t* col_list,	/*!< in: base node of a list where
+					to add new found columns */
+	plan_t*		plan,		/*!< in: plan or NULL */
+	que_node_t*	exp)		/*!< in: expression or condition or
+					NULL */
+{
+	que_node_t*	child;
+	sym_node_t*	sym;
+	sym_node_t*	listed;
+	ulint		sec_pos;
+
+	if (exp == NULL) {
+
+		return;
+	}
+
+	if (que_node_get_type(exp) == QUE_NODE_FUNC) {
+		/* A function node: visit each of its arguments */
+
+		for (child = ((func_node_t*) exp)->args;
+		     child != NULL;
+		     child = que_node_get_next(child)) {
+
+			opt_find_all_cols(copy_val, index, col_list, plan,
+					  child);
+		}
+
+		return;
+	}
+
+	ut_a(que_node_get_type(exp) == QUE_NODE_SYMBOL);
+
+	sym = exp;
+
+	if ((sym->token_type != SYM_COLUMN)
+	    || (sym->table != index->table)) {
+		/* Not a column of the table of 'index' */
+
+		return;
+	}
+
+	/* Search the column list for the same column */
+
+	for (listed = UT_LIST_GET_FIRST(*col_list);
+	     listed != NULL;
+	     listed = UT_LIST_GET_NEXT(col_var_list, listed)) {
+
+		if (listed->col_no != sym->col_no) {
+
+			continue;
+		}
+
+		if (listed != sym) {
+			/* Another occurrence of a listed column: put an
+			indirection */
+
+			sym->indirection = listed;
+			sym->alias = listed;
+		}
+
+		/* Otherwise sym itself was already in the list: nothing
+		to do */
+
+		return;
+	}
+
+	/* First occurrence of the column: append it to the list */
+
+	UT_LIST_ADD_LAST(col_var_list, *col_list, sym);
+
+	sym->copy_val = copy_val;
+
+	/* Position of the column in the first index of the table */
+
+	sym->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos(
+		dict_table_get_first_index(index->table), sym->col_no);
+
+	if (dict_index_is_clust(index)) {
+
+		return;
+	}
+
+	/* A non-clustered index: record also the position in 'index'; if
+	the column is not in it, the clustered record must be fetched */
+
+	ut_a(plan);
+
+	sec_pos = dict_index_get_nth_col_pos(index, sym->col_no);
+
+	if (sec_pos == ULINT_UNDEFINED) {
+
+		plan->must_get_clust = TRUE;
+	}
+
+	sym->field_nos[SYM_SEC_FIELD_NO] = sec_pos;
+}
+
+/*******************************************************************//**
+Recursively walks the conjuncts of a search condition and, for every
+conjunct that is not yet determined AFTER the join operation has fetched
+a row in the ith table, registers the ith table columns occurring in it as
+columns whose values must be copied to dynamic memory for later use. */
+static
+void
+opt_find_copy_cols(
+/*===============*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		i,		/*!< in: ith table in the join */
+	func_node_t*	search_cond)	/*!< in: search condition or NULL */
+{
+	func_node_t*	conjunct;
+	plan_t*		plan;
+
+	if (search_cond == NULL) {
+
+		return;
+	}
+
+	ut_ad(que_node_get_type(search_cond) == QUE_NODE_FUNC);
+
+	if (search_cond->func == PARS_AND_TOKEN) {
+		/* An AND node: descend into both of its operands */
+
+		conjunct = search_cond->args;
+
+		opt_find_copy_cols(sel_node, i, conjunct);
+
+		opt_find_copy_cols(sel_node, i,
+				   que_node_get_next(conjunct));
+
+		return;
+	}
+
+	if (opt_check_exp_determined_before(search_cond, sel_node, i + 1)) {
+		/* The condition can be tested already on the fetch from
+		the ith table: no copying is needed on its behalf */
+
+		return;
+	}
+
+	/* Any ith table columns occurring in search_cond should be copied */
+
+	plan = sel_node_get_nth_plan(sel_node, i);
+
+	opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+			  search_cond);
+}
+
+/*******************************************************************//**
+Builds the column list of the plan of the ith table in a join, marking
+each column according to whether its value has to be copied to dynamic
+memory or is used only while holding the latch on the page. The first
+occurrence of a column is put to the list in the plan node and later
+occurrences get indirections to it. */
+static
+void
+opt_classify_cols(
+/*==============*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		i)		/*!< in: ith table in the join */
+{
+	plan_t*		plan = sel_node_get_nth_plan(sel_node, i);
+	que_node_t*	item;
+
+	/* Set to TRUE later if some column is not found in the chosen
+	index; the final value depends on the environment of the select
+	statement */
+
+	plan->must_get_clust = FALSE;
+
+	UT_LIST_INIT(plan->columns);
+
+	/* Select list columns are always copied: hence TRUE */
+
+	for (item = sel_node->select_list;
+	     item != NULL;
+	     item = que_node_get_next(item)) {
+
+		opt_find_all_cols(TRUE, plan->index, &(plan->columns), plan,
+				  item);
+	}
+
+	/* Then the search condition columns that must be copied */
+
+	opt_find_copy_cols(sel_node, i, sel_node->search_cond);
+
+	/* What remains in the search condition are temporary columns:
+	hence FALSE */
+
+	opt_find_all_cols(FALSE, plan->index, &(plan->columns), plan,
+			  sel_node->search_cond);
+}
+
+/*******************************************************************//**
+Sets up the fields of a plan that are needed for fetching the clustered
+index record: the reference tuple clust_ref and the map clust_map, which
+gives for each unique field of the clustered index its position in the
+index chosen for the plan. Both are set to NULL when the chosen index is
+the clustered index itself. The columns must already be classified for
+the plan node. */
+static
+void
+opt_clust_access(
+/*=============*/
+	sel_node_t*	sel_node,	/*!< in: select node */
+	ulint		n)		/*!< in: nth table in select */
+{
+	plan_t*		plan;
+	dict_index_t*	sec_index;
+	dict_index_t*	clust_index;
+	mem_heap_t*	heap;
+	ulint		n_unique;
+	ulint		sec_pos;
+	ulint		k;
+
+	plan = sel_node_get_nth_plan(sel_node, n);
+
+	sec_index = plan->index;
+
+	/* The final value of this field depends on the environment of the
+	select statement */
+
+	plan->no_prefetch = FALSE;
+
+	if (dict_index_is_clust(sec_index)) {
+		/* No separate clustered index access */
+
+		plan->clust_map = NULL;
+		plan->clust_ref = NULL;
+
+		return;
+	}
+
+	clust_index = dict_table_get_first_index(sec_index->table);
+
+	n_unique = dict_index_get_n_unique(clust_index);
+
+	heap = pars_sym_tab_global->heap;
+
+	plan->clust_ref = dtuple_create(heap, n_unique);
+
+	dict_index_copy_types(plan->clust_ref, clust_index, n_unique);
+
+	plan->clust_map = mem_heap_alloc(heap, n_unique * sizeof(ulint));
+
+	for (k = 0; k < n_unique; k++) {
+		sec_pos = dict_index_get_nth_field_pos(sec_index,
+						       clust_index, k);
+
+		ut_a(sec_pos != ULINT_UNDEFINED);
+
+		/* Only queries to InnoDB's internal system tables are
+		optimized here, and those should not contain column prefix
+		indexes: report it if one is seen */
+
+		if (dict_index_get_nth_field(sec_index, sec_pos)->prefix_len
+		    != 0
+		    || dict_index_get_nth_field(clust_index, k)->prefix_len
+		    != 0) {
+			fprintf(stderr,
+				"InnoDB: Error in pars0opt.c:"
+				" table %s has prefix_len != 0\n",
+				sec_index->table_name);
+		}
+
+		plan->clust_map[k] = sec_pos;
+
+		ut_ad(sec_pos != ULINT_UNDEFINED);
+	}
+}
+
+/*******************************************************************//**
+Optimizes a select. Decides which indexes to tables to use. The tables
+are accessed in the order that they were written to the FROM part in the
+select statement. Allocates one plan node per table from the heap of the
+global symbol table, sets the fetch direction from a possible order-by
+clause, builds the plan of each table, and finally checks that the plan
+agrees with the order-by clause (an assertion error occurs if not). */
+UNIV_INTERN
+void
+opt_search_plan(
+/*============*/
+	sel_node_t*	sel_node)	/*!< in: parsed select node */
+{
+	sym_node_t*	table_node;
+	order_node_t*	order_by;
+	ulint		i;
+
+	sel_node->plans = mem_heap_alloc(pars_sym_tab_global->heap,
+					 sel_node->n_tables * sizeof(plan_t));
+
+	/* Without an order-by clause the rows are fetched in an ascending
+	order; otherwise the order-by clause decides the direction */
+
+	order_by = sel_node->order_by;
+
+	if (order_by == NULL) {
+		sel_node->asc = TRUE;
+	} else {
+		sel_node->asc = order_by->asc;
+	}
+
+	/* Analyze the search condition to find out what we know at each
+	join stage about the conditions that the columns of a table should
+	satisfy */
+
+	table_node = sel_node->table_list;
+
+	for (i = 0; i < sel_node->n_tables; i++) {
+
+		/* Choose index through which to access the table */
+
+		opt_search_plan_for_table(sel_node, i, table_node->table);
+
+		/* Determine the search condition conjuncts we can test at
+		this table; normalize the end conditions */
+
+		opt_determine_and_normalize_test_conds(sel_node, i);
+
+		table_node = que_node_get_next(table_node);
+	}
+
+	/* The second pass needs only the table number: the plan nodes
+	filled in above already carry the table and the index */
+
+	for (i = 0; i < sel_node->n_tables; i++) {
+
+		/* Classify the table columns into those we only need to access
+		but not copy, and to those we must copy to dynamic memory */
+
+		opt_classify_cols(sel_node, i);
+
+		/* Calculate possible info for accessing the clustered index
+		record */
+
+		opt_clust_access(sel_node, i);
+	}
+
+	/* Check that the plan obeys a possible order-by clause: if not,
+	an assertion error occurs */
+
+	opt_check_order_by(sel_node);
+
+#ifdef UNIV_SQL_DEBUG
+	opt_print_query_plan(sel_node);
+#endif
+}
+
+/********************************************************************//**
+Prints a description of a query plan to stderr: the fetch direction and
+the locking mode of the select, and then for each table the name of the
+chosen index, the number of exactly matched fields, the number of fields
+in the search tuple and the number of end conditions. The locking mode
+fields of the select node are checked for consistency with assertions. */
+UNIV_INTERN
+void
+opt_print_query_plan(
+/*=================*/
+	sel_node_t*	sel_node)	/*!< in: select node */
+{
+	plan_t*	plan;
+	ulint	n_tuple_fields;
+	ulint	k;
+
+	fputs("QUERY PLAN FOR A SELECT NODE\n", stderr);
+
+	if (sel_node->asc) {
+		fputs("Asc. search; ", stderr);
+	} else {
+		fputs("Desc. search; ", stderr);
+	}
+
+	if (sel_node->set_x_locks) {
+		fputs("sets row x-locks; ", stderr);
+		ut_a(sel_node->row_lock_mode == LOCK_X);
+		ut_a(!sel_node->consistent_read);
+	} else if (sel_node->consistent_read) {
+		fputs("consistent read; ", stderr);
+	} else {
+		ut_a(sel_node->row_lock_mode == LOCK_S);
+		fputs("sets row s-locks; ", stderr);
+	}
+
+	putc('\n', stderr);
+
+	for (k = 0; k < sel_node->n_tables; k++) {
+		plan = sel_node_get_nth_plan(sel_node, k);
+
+		/* A plan without a search tuple matches 0 fields */
+		n_tuple_fields = plan->tuple
+			? dtuple_get_n_fields(plan->tuple)
+			: 0;
+
+		fputs("Table ", stderr);
+		dict_index_name_print(stderr, NULL, plan->index);
+		fprintf(stderr,"; exact m. %lu, match %lu, end conds %lu\n",
+			(unsigned long) plan->n_exact_match,
+			(unsigned long) n_tuple_fields,
+			(unsigned long) UT_LIST_GET_LEN(plan->end_conds));
+	}
+}
diff --git a/storage/xtradb/pars/pars0pars.c b/storage/xtradb/pars/pars0pars.c
new file mode 100644
index 00000000000..9faf36d00a8
--- /dev/null
+++ b/storage/xtradb/pars/pars0pars.c
@@ -0,0 +1,2196 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0pars.c
+SQL parser
+
+Created 11/19/1996 Heikki Tuuri
+*******************************************************/
+
+/* Historical note: Innobase executed its first SQL string (CREATE TABLE)
+on 1/27/1998 */
+
+#include "pars0pars.h"
+
+#ifdef UNIV_NONINL
+#include "pars0pars.ic"
+#endif
+
+#include "row0sel.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0crea.h"
+#include "que0que.h"
+#include "pars0grm.h"
+#include "pars0opt.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+
+#ifdef UNIV_SQL_DEBUG
+/** If the following is set TRUE, the lexer will print the SQL string
+as it tokenizes it */
+UNIV_INTERN ibool	pars_print_lexed	= FALSE;
+#endif /* UNIV_SQL_DEBUG */
+
+/* Global variable used while parsing a single procedure or query: the
+symbol table of the statement being parsed. Because the parser state is
+kept in this one global, the code is NOT re-entrant */
+UNIV_INTERN sym_tab_t*	pars_sym_tab_global;
+
+/* Global variables used to denote certain reserved words, used in
+constructing the parsing tree. Each one is initialized with the
+PARS_..._TOKEN code of the reserved word it stands for. */
+
+UNIV_INTERN pars_res_word_t	pars_to_char_token = {PARS_TO_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_to_number_token = {PARS_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_to_binary_token = {PARS_TO_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_binary_to_number_token = {PARS_BINARY_TO_NUMBER_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_substr_token = {PARS_SUBSTR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_replstr_token = {PARS_REPLSTR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_concat_token = {PARS_CONCAT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_instr_token = {PARS_INSTR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_length_token = {PARS_LENGTH_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_sysdate_token = {PARS_SYSDATE_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_printf_token = {PARS_PRINTF_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_assert_token = {PARS_ASSERT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_rnd_token = {PARS_RND_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_rnd_str_token = {PARS_RND_STR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_count_token = {PARS_COUNT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_sum_token = {PARS_SUM_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_distinct_token = {PARS_DISTINCT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_binary_token = {PARS_BINARY_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_blob_token = {PARS_BLOB_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_int_token = {PARS_INT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_char_token = {PARS_CHAR_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_float_token = {PARS_FLOAT_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_update_token = {PARS_UPDATE_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_asc_token = {PARS_ASC_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_desc_token = {PARS_DESC_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_open_token = {PARS_OPEN_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_close_token = {PARS_CLOSE_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_share_token = {PARS_SHARE_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_unique_token = {PARS_UNIQUE_TOKEN};
+UNIV_INTERN pars_res_word_t	pars_clustered_token = {PARS_CLUSTERED_TOKEN};
+
+/** Global variable used to denote the '*' in SELECT * FROM..
+NOTE(review): presumably referenced by its address, the value itself
+being arbitrary -- confirm against the grammar */
+UNIV_INTERN ulint	pars_star_denoter	= 12345678;
+
+
+/*********************************************************************//**
+Classifies a function code as an arithmetic operator, a comparison, a
+logical operator, an aggregate function or a predefined function. A code
+that belongs to none of these gets the class PARS_FUNC_OTHER.
+@return	function class: PARS_FUNC_ARITH, ... */
+static
+ulint
+pars_func_get_class(
+/*================*/
+	int	func)	/*!< in: function code: '=', PARS_GE_TOKEN, ... */
+{
+	ulint	func_class;
+
+	switch (func) {
+	case '+':
+	case '-':
+	case '*':
+	case '/':
+		func_class = PARS_FUNC_ARITH;
+		break;
+
+	case '=':
+	case '<':
+	case '>':
+	case PARS_GE_TOKEN:
+	case PARS_LE_TOKEN:
+	case PARS_NE_TOKEN:
+		func_class = PARS_FUNC_CMP;
+		break;
+
+	case PARS_AND_TOKEN:
+	case PARS_OR_TOKEN:
+	case PARS_NOT_TOKEN:
+		func_class = PARS_FUNC_LOGICAL;
+		break;
+
+	case PARS_COUNT_TOKEN:
+	case PARS_SUM_TOKEN:
+		func_class = PARS_FUNC_AGGREGATE;
+		break;
+
+	case PARS_TO_CHAR_TOKEN:
+	case PARS_TO_NUMBER_TOKEN:
+	case PARS_TO_BINARY_TOKEN:
+	case PARS_BINARY_TO_NUMBER_TOKEN:
+	case PARS_SUBSTR_TOKEN:
+	case PARS_CONCAT_TOKEN:
+	case PARS_LENGTH_TOKEN:
+	case PARS_INSTR_TOKEN:
+	case PARS_SYSDATE_TOKEN:
+	case PARS_NOTFOUND_TOKEN:
+	case PARS_PRINTF_TOKEN:
+	case PARS_ASSERT_TOKEN:
+	case PARS_RND_TOKEN:
+	case PARS_RND_STR_TOKEN:
+	case PARS_REPLSTR_TOKEN:
+		func_class = PARS_FUNC_PREDEFINED;
+		break;
+
+	default:
+		func_class = PARS_FUNC_OTHER;
+		break;
+	}
+
+	return(func_class);
+}
+
+/*********************************************************************//**
+Creates the function node for an operator or a predefined function
+expression. The node is allocated from the heap of the global symbol
+table and is appended to the list of function nodes of that symbol table.
+@return	own: function node in a query tree */
+static
+func_node_t*
+pars_func_low(
+/*==========*/
+	int		func,	/*!< in: function token code */
+	que_node_t*	arg)	/*!< in: first argument in the argument list */
+{
+	func_node_t*	fn;
+
+	fn = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(*fn));
+
+	/* The common part: node type and an empty value */
+
+	fn->common.type = QUE_NODE_FUNC;
+	fn->common.val_buf_size = 0;
+	dfield_set_data(&(fn->common.val), NULL, 0);
+
+	/* The function specific part */
+
+	fn->func = func;
+	fn->class = pars_func_get_class(func);
+	fn->args = arg;
+
+	UT_LIST_ADD_LAST(func_node_list, pars_sym_tab_global->func_node_list,
+			 fn);
+
+	return(fn);
+}
+
+/*********************************************************************//**
+Parses a function expression: creates a function node from the token
+code stored in the reserved word and the given argument list.
+@return	own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_func(
+/*======*/
+	que_node_t*	res_word,/*!< in: function name reserved word */
+	que_node_t*	arg)	/*!< in: first argument in the argument list */
+{
+	pars_res_word_t*	word = res_word;
+
+	return(pars_func_low(word->code, arg));
+}
+
+/*********************************************************************//**
+Parses an operator expression: links the operand or operands into an
+argument list and creates a function node for the operator.
+@return	own: function node in a query tree */
+UNIV_INTERN
+func_node_t*
+pars_op(
+/*====*/
+	int		func,	/*!< in: operator token code */
+	que_node_t*	arg1,	/*!< in: first argument */
+	que_node_t*	arg2)	/*!< in: second argument or NULL for an unary
+				operator */
+{
+	func_node_t*	op_node;
+
+	/* Start a fresh argument list at arg1 */
+	que_node_list_add_last(NULL, arg1);
+
+	if (arg2 != NULL) {
+		/* A binary operator: arg2 follows arg1 in the list */
+		que_node_list_add_last(arg1, arg2);
+	}
+
+	op_node = pars_func_low(func, arg1);
+
+	return(op_node);
+}
+
+/*********************************************************************//**
+Parses an ORDER BY clause. Order by a single column only is supported.
+The direction is given as the address of one of the two reserved word
+variables; any other value is an assertion error.
+@return	own: order-by node in a query tree */
+UNIV_INTERN
+order_node_t*
+pars_order_by(
+/*==========*/
+	sym_node_t*	column,	/*!< in: column name */
+	pars_res_word_t* asc)	/*!< in: &pars_asc_token or &pars_desc_token */
+{
+	order_node_t*	order;
+	ibool		ascending;
+
+	order = mem_heap_alloc(pars_sym_tab_global->heap,
+			       sizeof(order_node_t));
+
+	order->common.type = QUE_NODE_ORDER;
+
+	order->column = column;
+
+	if (asc == &pars_asc_token) {
+		ascending = TRUE;
+	} else {
+		ut_a(asc == &pars_desc_token);
+		ascending = FALSE;
+	}
+
+	order->asc = ascending;
+
+	return(order);
+}
+
+/*********************************************************************//**
+Tells whether a main data type is one of the built-in string data types
+of the InnoDB SQL parser: DATA_VARCHAR, DATA_CHAR, DATA_FIXBINARY or
+DATA_BINARY.
+@return	TRUE if string data type */
+static
+ibool
+pars_is_string_type(
+/*================*/
+	ulint	mtype)	/*!< in: main data type */
+{
+	if (mtype == DATA_VARCHAR
+	    || mtype == DATA_CHAR
+	    || mtype == DATA_FIXBINARY
+	    || mtype == DATA_BINARY) {
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Sets the data type of a function node according to its function code.
+The data types of the arguments must already be resolved; the type
+requirements on the first argument are checked with assertions. A
+function code not handled here is an assertion error. */
+static
+void
+pars_resolve_func_data_type(
+/*========================*/
+	func_node_t*	node)	/*!< in: function node */
+{
+	que_node_t*	first_arg;
+	dtype_t*	node_type;
+
+	ut_a(que_node_get_type(node) == QUE_NODE_FUNC);
+
+	first_arg = node->args;
+	node_type = que_node_get_data_type(node);
+
+	switch (node->func) {
+	case PARS_SUM_TOKEN:
+	case '+':
+	case '-':
+	case '*':
+	case '/':
+		/* The type comes from the first argument, which must not
+		be the SQL null literal whose type is DATA_ERROR; only
+		integers are accepted */
+
+		dtype_copy(node_type, que_node_get_data_type(first_arg));
+
+		ut_a(dtype_get_mtype(node_type) == DATA_INT);
+		break;
+
+	case PARS_COUNT_TOKEN:
+		ut_a(first_arg);
+		dtype_set(node_type, DATA_INT, 0, 4);
+		break;
+
+	case PARS_TO_CHAR_TOKEN:
+	case PARS_RND_STR_TOKEN:
+		/* Integer in, string out */
+		ut_a(dtype_get_mtype(que_node_get_data_type(first_arg))
+		     == DATA_INT);
+		dtype_set(node_type, DATA_VARCHAR, DATA_ENGLISH, 0);
+		break;
+
+	case PARS_TO_BINARY_TOKEN:
+		if (dtype_get_mtype(que_node_get_data_type(first_arg))
+		    == DATA_INT) {
+			dtype_set(node_type, DATA_VARCHAR, DATA_ENGLISH, 0);
+		} else {
+			dtype_set(node_type, DATA_BINARY, 0, 0);
+		}
+		break;
+
+	case PARS_TO_NUMBER_TOKEN:
+	case PARS_BINARY_TO_NUMBER_TOKEN:
+	case PARS_LENGTH_TOKEN:
+	case PARS_INSTR_TOKEN:
+		/* String in, integer out */
+		ut_a(pars_is_string_type(
+			     que_node_get_data_type(first_arg)->mtype));
+		dtype_set(node_type, DATA_INT, 0, 4);
+		break;
+
+	case PARS_SYSDATE_TOKEN:
+		/* Takes no arguments */
+		ut_a(first_arg == NULL);
+		dtype_set(node_type, DATA_INT, 0, 4);
+		break;
+
+	case PARS_SUBSTR_TOKEN:
+	case PARS_CONCAT_TOKEN:
+		/* String in, string out */
+		ut_a(pars_is_string_type(
+			     que_node_get_data_type(first_arg)->mtype));
+		dtype_set(node_type, DATA_VARCHAR, DATA_ENGLISH, 0);
+		break;
+
+	case '>':
+	case '<':
+	case '=':
+	case PARS_GE_TOKEN:
+	case PARS_LE_TOKEN:
+	case PARS_NE_TOKEN:
+	case PARS_AND_TOKEN:
+	case PARS_OR_TOKEN:
+	case PARS_NOT_TOKEN:
+	case PARS_NOTFOUND_TOKEN:
+		/* We currently have no boolean type: use integer type */
+		dtype_set(node_type, DATA_INT, 0, 4);
+		break;
+
+	case PARS_RND_TOKEN:
+		ut_a(dtype_get_mtype(que_node_get_data_type(first_arg))
+		     == DATA_INT);
+		dtype_set(node_type, DATA_INT, 0, 4);
+		break;
+
+	default:
+		ut_error;
+	}
+}
+
+/*********************************************************************//**
+Resolves the variables in an expression and the data types of its functions,
+recursing into function arguments. An unresolvable identifier asserts. */
+static
+void
+pars_resolve_exp_variables_and_types(
+/*=================================*/
+	sel_node_t*	select_node,	/*!< in: select node or NULL; if
+					this is not NULL then the variable
+					sym nodes are added to the
+					copy_variables list of select_node */
+	que_node_t*	exp_node)	/*!< in: expression */
+{
+	func_node_t*	func_node;
+	que_node_t*	arg;
+	sym_node_t*	sym_node;
+	sym_node_t*	node;
+
+	ut_a(exp_node);
+
+	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+		func_node = exp_node;
+
+		arg = func_node->args;
+
+		while (arg) {
+			pars_resolve_exp_variables_and_types(select_node, arg);
+
+			arg = que_node_get_next(arg);
+		}
+
+		pars_resolve_func_data_type(func_node);
+
+		return;
+	}
+
+	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+	sym_node = exp_node;
+
+	if (sym_node->resolved) {
+
+		return;
+	}
+
+	/* Not resolved yet: look in the symbol table for a variable
+	or a cursor or a function with the same name */
+
+	node = UT_LIST_GET_FIRST(pars_sym_tab_global->sym_list);
+
+	while (node) {
+		if (node->resolved
+		    && ((node->token_type == SYM_VAR)
+			|| (node->token_type == SYM_CURSOR)
+			|| (node->token_type == SYM_FUNCTION))
+		    && node->name
+		    && (sym_node->name_len == node->name_len)
+		    && (ut_memcmp(sym_node->name, node->name,
+				  node->name_len) == 0)) {
+
+			/* Found a variable, a cursor or a function
+			declared with the same name */
+
+			break;
+		}
+
+		node = UT_LIST_GET_NEXT(sym_list, node);
+	}
+
+	if (!node) {
+		fprintf(stderr, "PARSER ERROR: Unresolved identifier %s\n",
+			sym_node->name);
+	}
+
+	ut_a(node);
+
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_IMPLICIT_VAR;
+	sym_node->alias = node;
+	sym_node->indirection = node;
+
+	if (select_node) {
+		UT_LIST_ADD_LAST(col_var_list, select_node->copy_variables,
+				 sym_node);
+	}
+
+	dfield_set_type(que_node_get_val(sym_node),
+			que_node_get_data_type(node));
+}
+
+/*********************************************************************//**
+Resolves the variables and function data types of every expression in a
+list by calling pars_resolve_exp_variables_and_types() on each node in
+turn. An empty (NULL) list is accepted and does nothing. */
+static
+void
+pars_resolve_exp_list_variables_and_types(
+/*======================================*/
+	sel_node_t*	select_node,	/*!< in: select node or NULL */
+	que_node_t*	exp_node)	/*!< in: expression list first node, or
+					NULL */
+{
+	while (exp_node) {
+		pars_resolve_exp_variables_and_types(select_node, exp_node);
+
+		exp_node = que_node_get_next(exp_node);
+	}
+}
+
+/*********************************************************************//**
+Resolves symbols in an expression that match column names of the tables. */
+static
+void
+pars_resolve_exp_columns(
+/*=====================*/
+	sym_node_t*	table_node,	/*!< in: first node in a table list */
+	que_node_t*	exp_node)	/*!< in: expression */
+{
+	func_node_t*	func_node;
+	que_node_t*	arg;
+	sym_node_t*	sym_node;
+	dict_table_t*	table;
+	sym_node_t*	t_node;
+	ulint		n_cols;
+	ulint		i;
+
+	ut_a(exp_node);
+
+	if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+		func_node = exp_node;
+
+		arg = func_node->args;
+
+		while (arg) {
+			pars_resolve_exp_columns(table_node, arg);
+
+			arg = que_node_get_next(arg);
+		}
+
+		return;
+	}
+
+	ut_a(que_node_get_type(exp_node) == QUE_NODE_SYMBOL);
+
+	sym_node = exp_node;
+
+	if (sym_node->resolved) {
+
+		return;
+	}
+
+	/* Not resolved yet: look in the table list for a column with the
+	same name; if none matches, the symbol is left unresolved */
+
+	t_node = table_node;
+
+	while (t_node) {
+		table = t_node->table;
+
+		n_cols = dict_table_get_n_cols(table);
+
+		for (i = 0; i < n_cols; i++) {
+			const dict_col_t*	col
+				= dict_table_get_nth_col(table, i);
+			const char*		col_name
+				= dict_table_get_col_name(table, i);
+
+			if ((sym_node->name_len == ut_strlen(col_name))
+			    && (0 == ut_memcmp(sym_node->name, col_name,
+					       sym_node->name_len))) {
+				/* Found */
+				sym_node->resolved = TRUE;
+				sym_node->token_type = SYM_COLUMN;
+				sym_node->table = table;
+				sym_node->col_no = i;
+				sym_node->prefetch_buf = NULL;
+
+				dict_col_copy_type(
+					col,
+					dfield_get_type(&sym_node
+							->common.val));
+
+				return;
+			}
+		}
+
+		t_node = que_node_get_next(t_node);
+	}
+}
+
+/*********************************************************************//**
+Resolves the columns in each expression of a list; a NULL list is a no-op. */
+static
+void
+pars_resolve_exp_list_columns(
+/*==========================*/
+	sym_node_t*	table_node,	/*!< in: first node in a table list */
+	que_node_t*	exp_node)	/*!< in: expression list first node, or
+					NULL */
+{
+	while (exp_node) {
+		pars_resolve_exp_columns(table_node, exp_node);
+
+		exp_node = que_node_get_next(exp_node);
+	}
+}
+
+/*********************************************************************//**
+Looks up the table named by a symbol node; asserts that the lookup succeeds. */
+static
+void
+pars_retrieve_table_def(
+/*====================*/
+	sym_node_t*	sym_node)	/*!< in: table node */
+{
+	const char*	table_name;
+
+	ut_a(sym_node);
+	ut_a(que_node_get_type(sym_node) == QUE_NODE_SYMBOL);
+
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_TABLE;
+
+	table_name = (const char*) sym_node->name;
+
+	sym_node->table = dict_table_get_low(table_name);
+
+	ut_a(sym_node->table);
+}
+
+/*********************************************************************//**
+Retrieves the table definition for every node of a table name list.
+@return	number of tables in the list; 0 if the list is NULL */
+static
+ulint
+pars_retrieve_table_list_defs(
+/*==========================*/
+	sym_node_t*	sym_node)	/*!< in: first table node in list */
+{
+	ulint		count		= 0;
+
+	if (sym_node == NULL) {
+
+		return(count);
+	}
+
+	while (sym_node) {
+		pars_retrieve_table_def(sym_node);
+
+		count++;
+
+		sym_node = que_node_get_next(sym_node);
+	}
+
+	return(count);
+}
+
+/*********************************************************************//**
+For SELECT * FROM ..., builds the select list from all user columns. */
+static
+void
+pars_select_all_columns(
+/*====================*/
+	sel_node_t*	select_node)	/*!< in: select node already containing
+					the table list */
+{
+	sym_node_t*	col_node;
+	sym_node_t*	table_node;
+	dict_table_t*	table;
+	ulint		i;
+
+	select_node->select_list = NULL;
+
+	table_node = select_node->table_list;
+
+	while (table_node) {
+		table = table_node->table;
+
+		for (i = 0; i < dict_table_get_n_user_cols(table); i++) {
+			const char*	col_name = dict_table_get_col_name(
+				table, i);
+
+			col_node = sym_tab_add_id(pars_sym_tab_global,
+						  (byte*)col_name,
+						  ut_strlen(col_name));
+
+			select_node->select_list = que_node_list_add_last(
+				select_node->select_list, col_node);
+		}
+
+		table_node = que_node_get_next(table_node);
+	}
+}
+
+/*********************************************************************//**
+Parses a select list; creates a query graph node for the whole SELECT
+statement and resolves the variables of the INTO list, if there is one.
+@return	own: select node in a query tree */
+UNIV_INTERN
+sel_node_t*
+pars_select_list(
+/*=============*/
+	que_node_t*	select_list,	/*!< in: select list */
+	sym_node_t*	into_list)	/*!< in: variables list or NULL */
+{
+	sel_node_t*	node;
+
+	node = sel_node_create(pars_sym_tab_global->heap);
+
+	node->select_list = select_list;
+	node->into_list = into_list;
+
+	pars_resolve_exp_list_variables_and_types(NULL, into_list);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Checks if the query is an aggregate query, in which case the select list
+must contain only aggregate function items. Sets select_node->is_aggregate. */
+static
+void
+pars_check_aggregate(
+/*=================*/
+	sel_node_t*	select_node)	/*!< in: select node already containing
+					the select list */
+{
+	que_node_t*	exp_node;
+	func_node_t*	func_node;
+	ulint		n_nodes			= 0;
+	ulint		n_aggregate_nodes	= 0;
+
+	exp_node = select_node->select_list;
+
+	while (exp_node) {
+
+		n_nodes++;
+
+		if (que_node_get_type(exp_node) == QUE_NODE_FUNC) {
+
+			func_node = exp_node;
+
+			if (func_node->class == PARS_FUNC_AGGREGATE) {
+
+				n_aggregate_nodes++;
+			}
+		}
+
+		exp_node = que_node_get_next(exp_node);
+	}
+
+	if (n_aggregate_nodes > 0) {
+		ut_a(n_nodes == n_aggregate_nodes);
+
+		select_node->is_aggregate = TRUE;
+	} else {
+		select_node->is_aggregate = FALSE;
+	}
+}
+
+/*********************************************************************//**
+Parses a select statement: resolves tables, columns, locking and ordering.
+@return	own: select node in a query tree (the select_node passed in) */
+UNIV_INTERN
+sel_node_t*
+pars_select_statement(
+/*==================*/
+	sel_node_t*	select_node,	/*!< in: select node already containing
+					the select list */
+	sym_node_t*	table_list,	/*!< in: table list */
+	que_node_t*	search_cond,	/*!< in: search condition or NULL */
+	pars_res_word_t* for_update,	/*!< in: NULL or &pars_update_token */
+	pars_res_word_t* lock_shared,	/*!< in: NULL or &pars_share_token */
+	order_node_t*	order_by)	/*!< in: NULL or an order-by node */
+{
+	select_node->state = SEL_NODE_OPEN;
+
+	select_node->table_list = table_list;
+	select_node->n_tables = pars_retrieve_table_list_defs(table_list);
+
+	if (select_node->select_list == &pars_star_denoter) {
+
+		/* SELECT * FROM ... */
+		pars_select_all_columns(select_node);
+	}
+
+	if (select_node->into_list) {
+		ut_a(que_node_list_get_len(select_node->into_list)
+		     == que_node_list_get_len(select_node->select_list));
+	}
+
+	UT_LIST_INIT(select_node->copy_variables);
+
+	pars_resolve_exp_list_columns(table_list, select_node->select_list);
+	pars_resolve_exp_list_variables_and_types(select_node,
+						  select_node->select_list);
+	pars_check_aggregate(select_node);
+
+	select_node->search_cond = search_cond;
+
+	if (search_cond) {
+		pars_resolve_exp_columns(table_list, search_cond);
+		pars_resolve_exp_variables_and_types(select_node, search_cond);
+	}
+
+	if (for_update) {
+		ut_a(!lock_shared);
+
+		select_node->set_x_locks = TRUE;
+		select_node->row_lock_mode = LOCK_X;
+
+		select_node->consistent_read = FALSE;
+		select_node->read_view = NULL;
+	} else if (lock_shared){
+		select_node->set_x_locks = FALSE;
+		select_node->row_lock_mode = LOCK_S;
+
+		select_node->consistent_read = FALSE;
+		select_node->read_view = NULL;
+	} else {
+		select_node->set_x_locks = FALSE;
+		select_node->row_lock_mode = LOCK_S;
+
+		select_node->consistent_read = TRUE;
+	}
+
+	select_node->order_by = order_by;
+
+	if (order_by) {
+		pars_resolve_exp_columns(table_list, order_by->column);
+	}
+
+	/* The final value of the following fields depends on the environment
+	where the select statement appears: */
+
+	select_node->can_get_updated = FALSE;
+	select_node->explicit_cursor = NULL;
+
+	opt_search_plan(select_node);
+
+	return(select_node);
+}
+
+/*********************************************************************//**
+Parses a cursor declaration: binds the select node to the cursor symbol.
+@return	sym_node */
+UNIV_INTERN
+que_node_t*
+pars_cursor_declaration(
+/*====================*/
+	sym_node_t*	sym_node,	/*!< in: cursor id node in the symbol
+					table */
+	sel_node_t*	select_node)	/*!< in: select node */
+{
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_CURSOR;
+	sym_node->cursor_def = select_node;
+
+	select_node->state = SEL_NODE_CLOSED;
+	select_node->explicit_cursor = sym_node;
+
+	return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a function declaration; asserts that the named user function exists.
+@return	sym_node */
+UNIV_INTERN
+que_node_t*
+pars_function_declaration(
+/*======================*/
+	sym_node_t*	sym_node)	/*!< in: function id node in the symbol
+					table */
+{
+	sym_node->resolved = TRUE;
+	sym_node->token_type = SYM_FUNCTION;
+
+	/* Check that the function exists. */
+	ut_a(pars_info_get_user_func(pars_sym_tab_global->info,
+				     sym_node->name));
+
+	return(sym_node);
+}
+
+/*********************************************************************//**
+Parses a delete or update statement start: creates and fills the update node.
+@return	own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement_start(
+/*========================*/
+	ibool		is_delete,	/*!< in: TRUE if delete */
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	col_assign_node_t* col_assign_list)/*!< in: column assignment list, NULL
+					if delete */
+{
+	upd_node_t*	node;
+
+	node = upd_node_create(pars_sym_tab_global->heap);
+
+	node->is_delete = is_delete;
+
+	node->table_sym = table_sym;
+	node->col_assign_list = col_assign_list;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a column assignment (column = expression) in an update statement.
+@return	column assignment node */
+UNIV_INTERN
+col_assign_node_t*
+pars_column_assignment(
+/*===================*/
+	sym_node_t*	column,	/*!< in: column to assign */
+	que_node_t*	exp)	/*!< in: value to assign */
+{
+	col_assign_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(col_assign_node_t));
+	node->common.type = QUE_NODE_COL_ASSIGNMENT;
+
+	node->col = column;
+	node->val = exp;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Processes an update node assignment list: builds node->update and cmpl_info. */
+static
+void
+pars_process_assign_list(
+/*=====================*/
+	upd_node_t*	node)	/*!< in: update node */
+{
+	col_assign_node_t*	col_assign_list;
+	sym_node_t*		table_sym;
+	col_assign_node_t*	assign_node;
+	upd_field_t*		upd_field;
+	dict_index_t*		clust_index;
+	sym_node_t*		col_sym;
+	ulint			changes_ord_field;
+	ulint			changes_field_size;
+	ulint			n_assigns;
+	ulint			i;
+
+	table_sym = node->table_sym;
+	col_assign_list = node->col_assign_list;
+	clust_index = dict_table_get_first_index(node->table);
+
+	assign_node = col_assign_list;
+	n_assigns = 0;
+
+	while (assign_node) {
+		pars_resolve_exp_columns(table_sym, assign_node->col);
+		pars_resolve_exp_columns(table_sym, assign_node->val);
+		pars_resolve_exp_variables_and_types(NULL, assign_node->val);
+#if 0
+		ut_a(dtype_get_mtype(
+			     dfield_get_type(que_node_get_val(
+						     assign_node->col)))
+		     == dtype_get_mtype(
+			     dfield_get_type(que_node_get_val(
+						     assign_node->val))));
+#endif
+
+		/* Add to the update node all the columns found in assignment
+		values as columns to copy: therefore, TRUE */
+
+		opt_find_all_cols(TRUE, clust_index, &(node->columns), NULL,
+				  assign_node->val);
+		n_assigns++;
+
+		assign_node = que_node_get_next(assign_node);
+	}
+
+	node->update = upd_create(n_assigns, pars_sym_tab_global->heap);
+
+	assign_node = col_assign_list;
+
+	changes_field_size = UPD_NODE_NO_SIZE_CHANGE;
+
+	for (i = 0; i < n_assigns; i++) {
+		upd_field = upd_get_nth_field(node->update, i);
+
+		col_sym = assign_node->col;
+
+		upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos(
+					       clust_index, col_sym->col_no),
+				       clust_index, NULL);
+		upd_field->exp = assign_node->val;
+
+		if (!dict_col_get_fixed_size(
+			    dict_index_get_nth_col(clust_index,
+						   upd_field->field_no),
+			    dict_table_is_comp(node->table))) {
+			changes_field_size = 0;
+		}
+
+		assign_node = que_node_get_next(assign_node);
+	}
+
+	/* Find out if the update can modify an ordering field in any index */
+
+	changes_ord_field = UPD_NODE_NO_ORD_CHANGE;
+
+	if (row_upd_changes_some_index_ord_field_binary(node->table,
+							node->update)) {
+		changes_ord_field = 0;
+	}
+
+	node->cmpl_info = changes_ord_field | changes_field_size;
+}
+
+/*********************************************************************//**
+Parses an update or delete statement (searched, or through a cursor).
+@return	own: update node in a query tree */
+UNIV_INTERN
+upd_node_t*
+pars_update_statement(
+/*==================*/
+	upd_node_t*	node,		/*!< in: update node */
+	sym_node_t*	cursor_sym,	/*!< in: pointer to a cursor entry in
+					the symbol table or NULL */
+	que_node_t*	search_cond)	/*!< in: search condition or NULL */
+{
+	sym_node_t*	table_sym;
+	sel_node_t*	sel_node;
+	plan_t*		plan;
+
+	table_sym = node->table_sym;
+
+	pars_retrieve_table_def(table_sym);
+	node->table = table_sym->table;
+
+	UT_LIST_INIT(node->columns);
+
+	/* Make the single table node into a list of table nodes of length 1 */
+
+	que_node_list_add_last(NULL, table_sym);
+
+	if (cursor_sym) {
+		pars_resolve_exp_variables_and_types(NULL, cursor_sym);
+
+		sel_node = cursor_sym->alias->cursor_def;
+
+		node->searched_update = FALSE;
+	} else {
+		sel_node = pars_select_list(NULL, NULL);
+
+		pars_select_statement(sel_node, table_sym, search_cond, NULL,
+				      &pars_share_token, NULL);
+		node->searched_update = TRUE;
+		sel_node->common.parent = node;
+	}
+
+	node->select = sel_node;
+
+	ut_a(!node->is_delete || (node->col_assign_list == NULL));
+	ut_a(node->is_delete || (node->col_assign_list != NULL));
+
+	if (node->is_delete) {
+		node->cmpl_info = 0;
+	} else {
+		pars_process_assign_list(node);
+	}
+
+	if (node->searched_update) {
+		node->has_clust_rec_x_lock = TRUE;
+		sel_node->set_x_locks = TRUE;
+		sel_node->row_lock_mode = LOCK_X;
+	} else {
+		node->has_clust_rec_x_lock = sel_node->set_x_locks;
+	}
+
+	ut_a(sel_node->n_tables == 1);
+	ut_a(sel_node->consistent_read == FALSE);
+	ut_a(sel_node->order_by == NULL);
+	ut_a(sel_node->is_aggregate == FALSE);
+
+	sel_node->can_get_updated = TRUE;
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	plan = sel_node_get_nth_plan(sel_node, 0);
+
+	plan->no_prefetch = TRUE;
+
+	if (!dict_index_is_clust(plan->index)) {
+
+		plan->must_get_clust = TRUE;
+
+		node->pcur = &(plan->clust_pcur);
+	} else {
+		node->pcur = &(plan->pcur);
+	}
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses an insert statement; exactly one of values_list and select is given.
+@return	own: insert node in a query tree */
+UNIV_INTERN
+ins_node_t*
+pars_insert_statement(
+/*==================*/
+	sym_node_t*	table_sym,	/*!< in: table name node */
+	que_node_t*	values_list,	/*!< in: value expression list or NULL */
+	sel_node_t*	select)		/*!< in: select condition or NULL */
+{
+	ins_node_t*	node;
+	dtuple_t*	row;
+	ulint		ins_type;
+
+	ut_a(values_list || select);
+	ut_a(!values_list || !select);
+
+	if (values_list) {
+		ins_type = INS_VALUES;
+	} else {
+		ins_type = INS_SEARCHED;
+	}
+
+	pars_retrieve_table_def(table_sym);
+
+	node = ins_node_create(ins_type, table_sym->table,
+			       pars_sym_tab_global->heap);
+
+	row = dtuple_create(pars_sym_tab_global->heap,
+			    dict_table_get_n_cols(node->table));
+
+	dict_table_copy_types(row, table_sym->table);
+
+	ins_node_set_new_row(node, row);
+
+	node->select = select;
+
+	if (select) {
+		select->common.parent = node;
+
+		ut_a(que_node_list_get_len(select->select_list)
+		     == dict_table_get_n_user_cols(table_sym->table));
+	}
+
+	node->values_list = values_list;
+
+	if (node->values_list) {
+		pars_resolve_exp_list_variables_and_types(NULL, values_list);
+
+		ut_a(que_node_list_get_len(values_list)
+		     == dict_table_get_n_user_cols(table_sym->table));
+	}
+
+	return(node);
+}
+
+/*********************************************************************//**
+Sets the data type of a dfield from a parser type token and column flags. */
+static
+void
+pars_set_dfield_type(
+/*=================*/
+	dfield_t*		dfield,		/*!< in: dfield */
+	pars_res_word_t*	type,		/*!< in: pointer to a type
+						token */
+	ulint			len,		/*!< in: length, or 0 */
+	ibool			is_unsigned,	/*!< in: if TRUE, column is
+						UNSIGNED. */
+	ibool			is_not_null)	/*!< in: if TRUE, column is
+						NOT NULL. */
+{
+	ulint flags = 0;
+
+	if (is_not_null) {
+		flags |= DATA_NOT_NULL;
+	}
+
+	if (is_unsigned) {
+		flags |= DATA_UNSIGNED;
+	}
+
+	if (type == &pars_int_token) {
+		ut_a(len == 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_INT, flags, 4);
+
+	} else if (type == &pars_char_token) {
+		ut_a(len == 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_VARCHAR,
+			  DATA_ENGLISH | flags, 0);
+	} else if (type == &pars_binary_token) {
+		ut_a(len != 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_FIXBINARY,
+			  DATA_BINARY_TYPE | flags, len);
+	} else if (type == &pars_blob_token) {
+		ut_a(len == 0);
+
+		dtype_set(dfield_get_type(dfield), DATA_BLOB,
+			  DATA_BINARY_TYPE | flags, 0);
+	} else {
+		ut_error;
+	}
+}
+
+/*********************************************************************//**
+Parses a variable declaration: marks the node as a resolved, typed SYM_VAR.
+@return	own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_variable_declaration(
+/*======================*/
+	sym_node_t*	node,	/*!< in: symbol table node allocated for the
+				id of the variable */
+	pars_res_word_t* type)	/*!< in: pointer to a type token */
+{
+	node->resolved = TRUE;
+	node->token_type = SYM_VAR;
+
+	node->param_type = PARS_NOT_PARAM;
+
+	pars_set_dfield_type(que_node_get_val(node), type, 0, FALSE, FALSE);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure parameter declaration; param_type must be input or output.
+@return	own: symbol table node of type SYM_VAR */
+UNIV_INTERN
+sym_node_t*
+pars_parameter_declaration(
+/*=======================*/
+	sym_node_t*	node,	/*!< in: symbol table node allocated for the
+				id of the parameter */
+	ulint		param_type,
+				/*!< in: PARS_INPUT or PARS_OUTPUT */
+	pars_res_word_t* type)	/*!< in: pointer to a type token */
+{
+	ut_a((param_type == PARS_INPUT) || (param_type == PARS_OUTPUT));
+
+	pars_variable_declaration(node, type);
+
+	node->param_type = param_type;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Sets the parent field of every node in a query node list (may be NULL). */
+static
+void
+pars_set_parent_in_list(
+/*====================*/
+	que_node_t*	node_list,	/*!< in: first node in a list */
+	que_node_t*	parent)		/*!< in: parent value to set in all
+					nodes of the list */
+{
+	que_common_t*	common;
+
+	common = node_list;
+
+	while (common) {
+		common->parent = parent;
+
+		common = que_node_get_next(common);
+	}
+}
+
+/*********************************************************************//**
+Parses an elsif element and resolves the variables of its condition.
+@return	elsif node */
+UNIV_INTERN
+elsif_node_t*
+pars_elsif_element(
+/*===============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list)	/*!< in: statement list */
+{
+	elsif_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(elsif_node_t));
+
+	node->common.type = QUE_NODE_ELSIF;
+
+	node->cond = cond;
+
+	pars_resolve_exp_variables_and_types(NULL, cond);
+
+	node->stat_list = stat_list;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses an if-statement; sets the node as parent of its statement lists.
+@return	if-statement node */
+UNIV_INTERN
+if_node_t*
+pars_if_statement(
+/*==============*/
+	que_node_t*	cond,		/*!< in: if-condition */
+	que_node_t*	stat_list,	/*!< in: statement list */
+	que_node_t*	else_part)	/*!< in: else-part statement list
+					or elsif element list */
+{
+	if_node_t*	node;
+	elsif_node_t*	elsif_node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(if_node_t));
+
+	node->common.type = QUE_NODE_IF;
+
+	node->cond = cond;
+
+	pars_resolve_exp_variables_and_types(NULL, cond);
+
+	node->stat_list = stat_list;
+
+	if (else_part && (que_node_get_type(else_part) == QUE_NODE_ELSIF)) {
+
+		/* There is a list of elsif conditions */
+
+		node->else_part = NULL;
+		node->elsif_list = else_part;
+
+		elsif_node = else_part;
+
+		while (elsif_node) {
+			pars_set_parent_in_list(elsif_node->stat_list, node);
+
+			elsif_node = que_node_get_next(elsif_node);
+		}
+	} else {
+		node->else_part = else_part;
+		node->elsif_list = NULL;
+
+		pars_set_parent_in_list(else_part, node);
+	}
+
+	pars_set_parent_in_list(stat_list, node);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a while-statement; sets the node as parent of the loop body.
+@return	while-statement node */
+UNIV_INTERN
+while_node_t*
+pars_while_statement(
+/*=================*/
+	que_node_t*	cond,		/*!< in: while-condition */
+	que_node_t*	stat_list)	/*!< in: statement list */
+{
+	while_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(while_node_t));
+
+	node->common.type = QUE_NODE_WHILE;
+
+	node->cond = cond;
+
+	pars_resolve_exp_variables_and_types(NULL, cond);
+
+	node->stat_list = stat_list;
+
+	pars_set_parent_in_list(stat_list, node);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a for-loop-statement and resolves its loop variable and limits.
+@return	for-statement node */
+UNIV_INTERN
+for_node_t*
+pars_for_statement(
+/*===============*/
+	sym_node_t*	loop_var,	/*!< in: loop variable */
+	que_node_t*	loop_start_limit,/*!< in: loop start expression */
+	que_node_t*	loop_end_limit,	/*!< in: loop end expression */
+	que_node_t*	stat_list)	/*!< in: statement list */
+{
+	for_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(for_node_t));
+
+	node->common.type = QUE_NODE_FOR;
+
+	pars_resolve_exp_variables_and_types(NULL, loop_var);
+	pars_resolve_exp_variables_and_types(NULL, loop_start_limit);
+	pars_resolve_exp_variables_and_types(NULL, loop_end_limit);
+
+	node->loop_var = loop_var->indirection;
+
+	ut_a(loop_var->indirection);
+
+	node->loop_start_limit = loop_start_limit;
+	node->loop_end_limit = loop_end_limit;
+
+	node->stat_list = stat_list;
+
+	pars_set_parent_in_list(stat_list, node);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses an exit statement: allocates the node from the symbol table heap.
+@return	exit statement node */
+UNIV_INTERN
+exit_node_t*
+pars_exit_statement(void)
+/*=====================*/
+{
+	exit_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(exit_node_t));
+	node->common.type = QUE_NODE_EXIT;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a return-statement: allocates the node from the symbol table heap.
+@return	return-statement node */
+UNIV_INTERN
+return_node_t*
+pars_return_statement(void)
+/*=======================*/
+{
+	return_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(return_node_t));
+	node->common.type = QUE_NODE_RETURN;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses an assignment statement; asserts that both sides have the same mtype.
+@return	assignment statement node */
+UNIV_INTERN
+assign_node_t*
+pars_assignment_statement(
+/*======================*/
+	sym_node_t*	var,	/*!< in: variable to assign */
+	que_node_t*	val)	/*!< in: value to assign */
+{
+	assign_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(assign_node_t));
+	node->common.type = QUE_NODE_ASSIGNMENT;
+
+	node->var = var;
+	node->val = val;
+
+	pars_resolve_exp_variables_and_types(NULL, var);
+	pars_resolve_exp_variables_and_types(NULL, val);
+
+	ut_a(dtype_get_mtype(dfield_get_type(que_node_get_val(var)))
+	     == dtype_get_mtype(dfield_get_type(que_node_get_val(val))));
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a procedure call: builds a function node and resolves its arguments.
+@return	function node */
+UNIV_INTERN
+func_node_t*
+pars_procedure_call(
+/*================*/
+	que_node_t*	res_word,/*!< in: procedure name reserved word */
+	que_node_t*	args)	/*!< in: argument list */
+{
+	func_node_t*	node;
+
+	node = pars_func(res_word, args);
+
+	pars_resolve_exp_list_variables_and_types(NULL, args);
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a fetch statement. Exactly one of into_list and user_func must be
+non-NULL; the cursor symbol must resolve to a declared cursor.
+@return	fetch statement node */
+UNIV_INTERN
+fetch_node_t*
+pars_fetch_statement(
+/*=================*/
+	sym_node_t*	cursor,		/*!< in: cursor node */
+	sym_node_t*	into_list,	/*!< in: variables to set, or NULL */
+	sym_node_t*	user_func)	/*!< in: user function name, or NULL */
+{
+	sym_node_t*	cursor_decl;
+	fetch_node_t*	node;
+
+	/* Logical XOR: exactly one of the two must be given. */
+	ut_a(!into_list != !user_func);
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(fetch_node_t));
+
+	node->common.type = QUE_NODE_FETCH;
+
+	pars_resolve_exp_variables_and_types(NULL, cursor);
+
+	if (into_list) {
+		pars_resolve_exp_list_variables_and_types(NULL, into_list);
+		node->into_list = into_list;
+		node->func = NULL;
+	} else {
+		pars_resolve_exp_variables_and_types(NULL, user_func);
+
+		node->func = pars_info_get_user_func(pars_sym_tab_global->info,
+						     user_func->name);
+		ut_a(node->func);
+
+		node->into_list = NULL;
+	}
+
+	cursor_decl = cursor->alias;
+
+	ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+	node->cursor_def = cursor_decl->cursor_def;
+
+	if (into_list) {
+		ut_a(que_node_list_get_len(into_list)
+		     == que_node_list_get_len(node->cursor_def->select_list));
+	}
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses an open or close cursor statement.
+@return	open statement node */
+UNIV_INTERN
+open_node_t*
+pars_open_statement(
+/*================*/
+	ulint		type,	/*!< in: ROW_SEL_OPEN_CURSOR
+				or ROW_SEL_CLOSE_CURSOR */
+	sym_node_t*	cursor)	/*!< in: cursor node */
+{
+	sym_node_t*	cursor_decl;
+	open_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap, sizeof(open_node_t));
+
+	node->common.type = QUE_NODE_OPEN;
+
+	pars_resolve_exp_variables_and_types(NULL, cursor);
+
+	cursor_decl = cursor->alias;
+
+	ut_a(cursor_decl->token_type == SYM_CURSOR);
+
+	node->op_type = type;
+	node->cursor_def = cursor_decl->cursor_def;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a row_printf-statement; the new node becomes the select node's parent.
+@return	row_printf-statement node */
+UNIV_INTERN
+row_printf_node_t*
+pars_row_printf_statement(
+/*======================*/
+	sel_node_t*	sel_node)	/*!< in: select node */
+{
+	row_printf_node_t*	node;
+
+	node = mem_heap_alloc(pars_sym_tab_global->heap,
+			      sizeof(row_printf_node_t));
+	node->common.type = QUE_NODE_ROW_PRINTF;
+
+	node->sel_node = sel_node;
+
+	sel_node->common.parent = node;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Parses a commit statement: creates a commit node in the symbol table heap.
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+pars_commit_statement(void)
+/*=======================*/
+{
+	return(commit_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a rollback statement: creates a roll node in the symbol table heap.
+@return	own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+pars_rollback_statement(void)
+/*=========================*/
+{
+	return(roll_node_create(pars_sym_tab_global->heap));
+}
+
+/*********************************************************************//**
+Parses a column definition at a table creation: sets the column data type.
+@return	column sym table node */
+UNIV_INTERN
+sym_node_t*
+pars_column_def(
+/*============*/
+	sym_node_t*		sym_node,	/*!< in: column node in the
+						symbol table */
+	pars_res_word_t*	type,		/*!< in: data type */
+	sym_node_t*		len,		/*!< in: length of column, or
+						NULL */
+	void*			is_unsigned,	/*!< in: if not NULL, column
+						is of type UNSIGNED. */
+	void*			is_not_null)	/*!< in: if not NULL, column
+						is of type NOT NULL. */
+{
+	ulint len2;
+
+	if (len) {
+		len2 = eval_node_get_int_val(len);
+	} else {
+		len2 = 0;
+	}
+
+	pars_set_dfield_type(que_node_get_val(sym_node), type, len2,
+			     is_unsigned != NULL, is_not_null != NULL);
+
+	return(sym_node);
+}
+
+/*********************************************************************//**
+Creates a table object and a table create node from a parsed definition.
+@return	table create subgraph */
+UNIV_INTERN
+tab_node_t*
+pars_create_table(
+/*==============*/
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_defs,	/*!< in: list of column names */
+	void*		not_fit_in_memory __attribute__((unused)))
+					/*!< in: a non-NULL pointer means that
+					this is a table which in simulations
+					should be simulated as not fitting
+					in memory; thread is put to sleep
+					to simulate disk accesses; the flag is
+					only looked at in UNIV_DEBUG builds;
+					NOTE that it is not stored to the data
+					dictionary on disk, so it is lost if
+					the table definition has to be
+					reloaded from disk */
+{
+	dict_table_t*	table;
+	tab_node_t*	node;
+	sym_node_t*	col;
+	ulint		n_cols = que_node_list_get_len(column_defs);
+
+	/* The InnoDB SQL parser is for internal use only, for creating
+	some system tables, so the table is always created in the old
+	(not compact) record format. */
+	table = dict_mem_table_create(table_sym->name, 0, n_cols, 0);
+
+#ifdef UNIV_DEBUG
+	if (not_fit_in_memory != NULL) {
+		table->does_not_fit_in_memory = TRUE;
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Add each defined column to the table object, and mark the
+	column symbol as resolved. */
+	for (col = column_defs; col != NULL; col = que_node_get_next(col)) {
+		const dtype_t*	dtype;
+
+		dtype = dfield_get_type(que_node_get_val(col));
+
+		dict_mem_table_add_col(table, table->heap,
+				       col->name, dtype->mtype,
+				       dtype->prtype, dtype->len);
+
+		col->resolved = TRUE;
+		col->token_type = SYM_COLUMN;
+	}
+
+	/* Wrap the table object in a table create node. */
+	node = tab_create_graph_create(table, pars_sym_tab_global->heap);
+
+	table_sym->resolved = TRUE;
+	table_sym->token_type = SYM_TABLE;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Creates an index object and an index create node from a parsed definition.
+@return	index create subgraph */
+UNIV_INTERN
+ind_node_t*
+pars_create_index(
+/*==============*/
+	pars_res_word_t* unique_def,	/*!< in: not NULL if a unique index */
+	pars_res_word_t* clustered_def,	/*!< in: not NULL if a clustered index */
+	sym_node_t*	index_sym,	/*!< in: index name node in the symbol
+					table */
+	sym_node_t*	table_sym,	/*!< in: table name node in the symbol
+					table */
+	sym_node_t*	column_list)	/*!< in: list of column names */
+{
+	dict_index_t*	index;
+	ind_node_t*	node;
+	sym_node_t*	field;
+	ulint		n_fields = que_node_list_get_len(column_list);
+	ulint		ind_type = 0;
+
+	/* Translate the optional unique and clustered definitions into
+	index type flags. */
+	if (unique_def != NULL) {
+		ind_type |= DICT_UNIQUE;
+	}
+
+	if (clustered_def != NULL) {
+		ind_type |= DICT_CLUSTERED;
+	}
+
+	index = dict_mem_index_create(table_sym->name, index_sym->name, 0,
+				      ind_type, n_fields);
+
+	/* Add the columns as index fields in list order and mark each
+	column symbol as resolved. */
+	for (field = column_list; field != NULL;
+	     field = que_node_get_next(field)) {
+		dict_mem_index_add_field(index, field->name, 0);
+
+		field->resolved = TRUE;
+		field->token_type = SYM_COLUMN;
+	}
+
+	node = ind_create_graph_create(index, pars_sym_tab_global->heap);
+
+	/* Both the table name and the index name symbols get the
+	SYM_TABLE token type. */
+	table_sym->resolved = TRUE;
+	table_sym->token_type = SYM_TABLE;
+
+	index_sym->resolved = TRUE;
+	index_sym->token_type = SYM_TABLE;
+
+	return(node);
+}
+
+/*********************************************************************//**
+Builds the fork, query thread and procedure node for a parsed procedure.
+@return	query fork node */
+UNIV_INTERN
+que_fork_t*
+pars_procedure_definition(
+/*======================*/
+	sym_node_t*	sym_node,	/*!< in: procedure id node in the symbol
+					table */
+	sym_node_t*	param_list,	/*!< in: parameter declaration list */
+	que_node_t*	stat_list)	/*!< in: statement list */
+{
+	sym_tab_t*	sym_tab = pars_sym_tab_global;
+	mem_heap_t*	heap = sym_tab->heap;
+	que_fork_t*	fork;
+	que_thr_t*	thr;
+	proc_node_t*	proc;
+
+	/* Graph skeleton: fork, query thread, procedure node. */
+	fork = que_fork_create(NULL, NULL, QUE_FORK_PROCEDURE, heap);
+	fork->trx = NULL;
+
+	thr = que_thr_create(fork, heap);
+
+	proc = mem_heap_alloc(heap, sizeof(*proc));
+
+	proc->common.type = QUE_NODE_PROC;
+	proc->common.parent = thr;
+	proc->proc_id = sym_node;
+	proc->param_list = param_list;
+	proc->stat_list = stat_list;
+	proc->sym_tab = sym_tab;
+
+	/* The procedure name is marked as a resolved symbol. */
+	sym_node->token_type = SYM_PROCEDURE_NAME;
+	sym_node->resolved = TRUE;
+
+	/* Set the procedure node as parent in the statement list. */
+	pars_set_parent_in_list(stat_list, proc);
+
+	thr->child = proc;
+
+	sym_tab->query_graph = fork;
+
+	return(fork);
+}
+
+/*************************************************************//**
+Would parse a stored procedure call that the client issues directly, that
+is, not from within another stored procedure. In MySQL/InnoDB, stored
+InnoDB procedures are invoked via the parsed procedure tree, not via
+InnoDB SQL, so this function is not used: it only invokes ut_error.
+@return	NULL (the statement after ut_error) */
+UNIV_INTERN
+que_fork_t*
+pars_stored_procedure_call(
+/*=======================*/
+	sym_node_t*	sym_node __attribute__((unused)))
+					/*!< in: stored procedure name */
+{
+	ut_error;
+	return(NULL);
+}
+
+/*************************************************************//**
+Copies the next unread piece of the SQL string to the lexical analyzer. */
+UNIV_INTERN
+void
+pars_get_lex_chars(
+/*===============*/
+	char*	buf,		/*!< in/out: buffer where to copy */
+	int*	result,		/*!< out: number of characters copied or EOF */
+	int	max_size)	/*!< in: maximum number of characters which fit
+				in the buffer */
+{
+	sym_tab_t*	sym_tab = pars_sym_tab_global;
+	int		n_chars;
+
+	/* Number of characters of the SQL string not yet handed out. */
+	n_chars = sym_tab->string_len - sym_tab->next_char_pos;
+
+	if (n_chars == 0) {
+		/* Nothing is left to copy: report 0 characters. */
+		*result = 0;
+
+		return;
+	}
+
+	if (n_chars > max_size) {
+		n_chars = max_size;
+	}
+
+#ifdef UNIV_SQL_DEBUG
+	if (pars_print_lexed) {
+		/* When echoing to stderr, a single call hands out at
+		most 5 characters. */
+		if (n_chars >= 5) {
+			n_chars = 5;
+		}
+
+		fwrite(sym_tab->sql_string + sym_tab->next_char_pos,
+		       1, n_chars, stderr);
+	}
+#endif /* UNIV_SQL_DEBUG */
+
+	ut_memcpy(buf, sym_tab->sql_string + sym_tab->next_char_pos,
+		  n_chars);
+	*result = n_chars;
+
+	sym_tab->next_char_pos += n_chars;
+}
+
+/*************************************************************//**
+Called by yyparse on error: prints a fixed message, then invokes ut_error. */
+UNIV_INTERN
+void
+yyerror(
+/*====*/
+	const char*	s __attribute__((unused)))
+				/*!< in: error message string (not printed) */
+{
+	ut_ad(s);
+
+	fputs("PARSER ERROR: Syntax error in SQL string\n", stderr);
+
+	ut_error;
+}
+
+/*************************************************************//**
+Parses an SQL string with yyparse() and returns the resulting query graph.
+@return	own: the query graph */
+UNIV_INTERN
+que_t*
+pars_sql(
+/*=====*/
+	pars_info_t*	info,	/*!< in: extra information, or NULL */
+	const char*	str)	/*!< in: SQL string */
+{
+	sym_tab_t*	sym_tab;
+	sym_node_t*	sym;
+	mem_heap_t*	heap;
+	que_t*		graph;
+
+	ut_ad(str);
+
+	heap = mem_heap_create(256);
+
+	/* Currently, the parser is not reentrant: */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+
+	/* The parser callbacks in this file reach the symbol table
+	through the global pointer, so it is set before parsing. */
+	sym_tab = sym_tab_create(heap);
+	pars_sym_tab_global = sym_tab;
+
+	sym_tab->string_len = strlen(str);
+	sym_tab->sql_string = mem_heap_dup(heap, str, sym_tab->string_len + 1);
+	sym_tab->next_char_pos = 0;
+	sym_tab->info = info;
+
+	yyparse();
+
+	/* Every symbol must have been resolved during the parse. */
+	for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list); sym != NULL;
+	     sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+		ut_a(sym->resolved);
+	}
+
+	graph = sym_tab->query_graph;
+
+	graph->sym_tab = sym_tab;
+	graph->info = info;
+
+	return(graph);
+}
+
+/******************************************************************//**
+Puts a fork of type QUE_FORK_MYSQL_INTERFACE and a query thread on top of
+an incomplete query graph, so that the graph can be run. Also resets
+trx->graph to NULL.
+@return	query thread node to run */
+UNIV_INTERN
+que_thr_t*
+pars_complete_graph_for_exec(
+/*=========================*/
+	que_node_t*	node,	/*!< in: root node for an incomplete
+				query graph */
+	trx_t*		trx,	/*!< in: transaction handle */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+{
+	que_thr_t*	thr;
+	que_fork_t*	fork;
+
+	/* The fork is tied to the given transaction. */
+	fork = que_fork_create(NULL, NULL, QUE_FORK_MYSQL_INTERFACE, heap);
+	fork->trx = trx;
+
+	/* Query thread for the fork; the root node becomes its child. */
+	thr = que_thr_create(fork, heap);
+	thr->child = node;
+	que_node_set_parent(node, thr);
+
+	trx->graph = NULL;
+
+	return(thr);
+}
+
+/****************************************************************//**
+Creates a parser info struct in a new memory heap of its own.
+@return	own: info struct */
+UNIV_INTERN
+pars_info_t*
+pars_info_create(void)
+/*==================*/
+{
+	mem_heap_t*	heap = mem_heap_create(512);
+	pars_info_t*	info = mem_heap_alloc(heap, sizeof(pars_info_t));
+
+	/* The struct is allocated from the heap that it points to. */
+	info->heap = heap;
+
+	/* No vectors exist yet; all three start out as NULL. */
+	info->funcs = NULL;
+	info->bound_lits = NULL;
+	info->bound_ids = NULL;
+
+	info->graph_owns_us = TRUE;
+
+	return(info);
+}
+
+/****************************************************************//**
+Frees the info struct and its contents by freeing info->heap. */
+UNIV_INTERN
+void
+pars_info_free(
+/*===========*/
+	pars_info_t*	info)	/*!< in, own: info struct */
+{
+	mem_heap_free(info->heap);
+}
+
+/****************************************************************//**
+Adds a bound literal; the name and data pointers are stored, not copied. */
+UNIV_INTERN
+void
+pars_info_add_literal(
+/*==================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const void*	address,	/*!< in: address */
+	ulint		length,		/*!< in: length of data */
+	ulint		type,		/*!< in: type, e.g. DATA_FIXBINARY */
+	ulint		prtype)		/*!< in: precise type, e.g.
+					DATA_UNSIGNED */
+{
+	pars_bound_lit_t*	lit;
+
+	ut_ad(!pars_info_get_bound_lit(info, name));
+
+	lit = mem_heap_alloc(info->heap, sizeof(pars_bound_lit_t));
+	lit->name = name;
+	lit->address = address;
+	lit->length = length;
+	lit->type = type;
+	lit->prtype = prtype;
+
+	/* The vector is created when the first literal is added. */
+	if (info->bound_lits == NULL) {
+		info->bound_lits = ib_vector_create(info->heap, 8);
+	}
+
+	ib_vector_push(info->bound_lits, lit);
+}
+
+/****************************************************************//**
+Binds a NUL-terminated string: same as pars_info_add_literal(info, name,
+str, strlen(str), DATA_VARCHAR, DATA_ENGLISH). */
+UNIV_INTERN
+void
+pars_info_add_str_literal(
+/*======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const char*	str)		/*!< in: string */
+{
+	pars_info_add_literal(info, name, str, strlen(str),
+			      DATA_VARCHAR, DATA_ENGLISH);
+}
+
+/****************************************************************//**
+Binds a 4-byte integer literal. Equivalent to:
+
+char buf[4];
+mach_write_to_4(buf, val);
+pars_info_add_literal(info, name, buf, 4, DATA_INT, 0);
+
+except that the buffer is allocated from the heap of the info struct
+instead of the stack. */
+UNIV_INTERN
+void
+pars_info_add_int4_literal(
+/*=======================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	lint		val)		/*!< in: value */
+{
+	byte*	int_buf = mem_heap_alloc(info->heap, 4);
+
+	mach_write_to_4(int_buf, val);
+	pars_info_add_literal(info, name, int_buf, 4, DATA_INT, 0);
+}
+
+/****************************************************************//**
+Binds an 8-byte dulint literal. Equivalent to:
+
+char buf[8];
+mach_write_to_8(buf, val);
+pars_info_add_literal(info, name, buf, 8, DATA_FIXBINARY, 0);
+
+except that the buffer is allocated from the heap of the info struct
+instead of the stack. */
+UNIV_INTERN
+void
+pars_info_add_dulint_literal(
+/*=========================*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	dulint		val)		/*!< in: value */
+{
+	byte*	dulint_buf = mem_heap_alloc(info->heap, 8);
+
+	/* Write the value to the 8 bytes, then bind them. */
+	mach_write_to_8(dulint_buf, val);
+	pars_info_add_literal(info, name, dulint_buf, 8, DATA_FIXBINARY, 0);
+}
+
+/****************************************************************//**
+Adds a user function; the name pointer is stored, not copied. */
+UNIV_INTERN
+void
+pars_info_add_function(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct */
+	const char*		name,	/*!< in: function name */
+	pars_user_func_cb_t	func,	/*!< in: function address */
+	void*			arg)	/*!< in: user-supplied argument */
+{
+	pars_user_func_t*	user_func;
+
+	ut_ad(!pars_info_get_user_func(info, name));
+
+	user_func = mem_heap_alloc(info->heap, sizeof(pars_user_func_t));
+	user_func->name = name;
+	user_func->func = func;
+	user_func->arg = arg;
+
+	/* The vector is created when the first function is added. */
+	if (info->funcs == NULL) {
+		info->funcs = ib_vector_create(info->heap, 8);
+	}
+
+	ib_vector_push(info->funcs, user_func);
+}
+
+/****************************************************************//**
+Adds a bound id; the name and id pointers are stored, not copied. */
+UNIV_INTERN
+void
+pars_info_add_id(
+/*=============*/
+	pars_info_t*	info,		/*!< in: info struct */
+	const char*	name,		/*!< in: name */
+	const char*	id)		/*!< in: id */
+{
+	pars_bound_id_t*	bound_id;
+
+	ut_ad(!pars_info_get_bound_id(info, name));
+
+	bound_id = mem_heap_alloc(info->heap, sizeof(pars_bound_id_t));
+	bound_id->name = name;
+	bound_id->id = id;
+
+	/* The vector is created when the first id is added. */
+	if (info->bound_ids == NULL) {
+		info->bound_ids = ib_vector_create(info->heap, 8);
+	}
+
+	ib_vector_push(info->bound_ids, bound_id);
+}
+
+/****************************************************************//**
+Looks up a user function by exact name (strcmp).
+@return	user func, or NULL if not found or if info is NULL */
+UNIV_INTERN
+pars_user_func_t*
+pars_info_get_user_func(
+/*====================*/
+	pars_info_t*		info,	/*!< in: info struct, or NULL */
+	const char*		name)	/*!< in: function name to find */
+{
+	ib_vector_t*	funcs;
+	ulint		pos;
+
+	if (info == NULL || info->funcs == NULL) {
+		return(NULL);
+	}
+
+	funcs = info->funcs;
+
+	for (pos = 0; pos < ib_vector_size(funcs); pos++) {
+		pars_user_func_t*	candidate = ib_vector_get(funcs, pos);
+
+		if (!strcmp(candidate->name, name)) {
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
+
+/****************************************************************//**
+Looks up a bound literal by exact name (strcmp).
+@return	bound literal, or NULL if not found or if info is NULL */
+UNIV_INTERN
+pars_bound_lit_t*
+pars_info_get_bound_lit(
+/*====================*/
+	pars_info_t*		info,	/*!< in: info struct, or NULL */
+	const char*		name)	/*!< in: bound literal name to find */
+{
+	ib_vector_t*	lits;
+	ulint		pos;
+
+	if (info == NULL || info->bound_lits == NULL) {
+		return(NULL);
+	}
+
+	lits = info->bound_lits;
+
+	for (pos = 0; pos < ib_vector_size(lits); pos++) {
+		pars_bound_lit_t*	candidate = ib_vector_get(lits, pos);
+
+		if (!strcmp(candidate->name, name)) {
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
+
+/****************************************************************//**
+Looks up a bound id by exact name (strcmp).
+@return	bound id, or NULL if not found or if info is NULL */
+UNIV_INTERN
+pars_bound_id_t*
+pars_info_get_bound_id(
+/*===================*/
+	pars_info_t*		info,	/*!< in: info struct, or NULL */
+	const char*		name)	/*!< in: bound id name to find */
+{
+	ib_vector_t*	ids;
+	ulint		pos;
+
+	if (info == NULL || info->bound_ids == NULL) {
+		return(NULL);
+	}
+
+	ids = info->bound_ids;
+
+	for (pos = 0; pos < ib_vector_size(ids); pos++) {
+		pars_bound_id_t*	candidate = ib_vector_get(ids, pos);
+
+		if (!strcmp(candidate->name, name)) {
+			return(candidate);
+		}
+	}
+
+	return(NULL);
+}
diff --git a/storage/xtradb/pars/pars0sym.c b/storage/xtradb/pars/pars0sym.c
new file mode 100644
index 00000000000..b56350116bb
--- /dev/null
+++ b/storage/xtradb/pars/pars0sym.c
@@ -0,0 +1,371 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file pars/pars0sym.c
+SQL parser symbol table
+
+Created 12/15/1997 Heikki Tuuri
+*******************************************************/
+
+#include "pars0sym.h"
+
+#ifdef UNIV_NONINL
+#include "pars0sym.ic"
+#endif
+
+#include "mem0mem.h"
+#include "data0type.h"
+#include "data0data.h"
+#include "pars0grm.h"
+#include "pars0pars.h"
+#include "que0que.h"
+#include "eval0eval.h"
+#include "row0sel.h"
+
+/******************************************************************//**
+Allocates a symbol table for a single stored procedure or query.
+@return	own: symbol table */
+UNIV_INTERN
+sym_tab_t*
+sym_tab_create(
+/*===========*/
+	mem_heap_t*	heap)	/*!< in: memory heap where to create */
+{
+	sym_tab_t*	tab = mem_heap_alloc(heap, sizeof(*tab));
+
+	tab->heap = heap;
+
+	/* Only the heap pointer and the two lists are initialized
+	here; the other fields are not set by this function. */
+	UT_LIST_INIT(tab->sym_list);
+	UT_LIST_INIT(tab->func_node_list);
+
+	return(tab);
+}
+
+/******************************************************************//**
+Frees the memory allocated dynamically AFTER parsing phase for variables
+etc. in the symbol table. Does not free the mem heap where the table was
+originally created. Frees also SQL explicit cursor definitions. */
+UNIV_INTERN
+void
+sym_tab_free_private(
+/*=================*/
+	sym_tab_t*	sym_tab)	/*!< in, own: symbol table */
+{
+	sym_node_t*	sym;
+	func_node_t*	func;
+
+	/* Symbols: value buffer, column prefetch buffer and the graph
+	of an explicit cursor definition, where present. */
+	for (sym = UT_LIST_GET_FIRST(sym_tab->sym_list); sym != NULL;
+	     sym = UT_LIST_GET_NEXT(sym_list, sym)) {
+
+		eval_node_free_val_buf(sym);
+
+		if (sym->prefetch_buf != NULL) {
+			sel_col_prefetch_buf_free(sym->prefetch_buf);
+		}
+
+		if (sym->cursor_def != NULL) {
+			que_graph_free_recursive(sym->cursor_def);
+		}
+	}
+
+	/* For function nodes only the value buffer is freed. */
+	for (func = UT_LIST_GET_FIRST(sym_tab->func_node_list);
+	     func != NULL;
+	     func = UT_LIST_GET_NEXT(func_node_list, func)) {
+
+		eval_node_free_val_buf(func);
+	}
+}
+
+/******************************************************************//**
+Adds an integer literal, stored in 4 bytes, to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_int_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	ulint		val)		/*!< in: integer value */
+{
+	mem_heap_t*	heap = sym_tab->heap;
+	sym_node_t*	lit;
+	byte*		buf;
+
+	lit = mem_heap_alloc(heap, sizeof(*lit));
+
+	/* Literals are marked as resolved when they are created. */
+	lit->common.type = QUE_NODE_SYMBOL;
+	lit->resolved = TRUE;
+	lit->token_type = SYM_LIT;
+	lit->indirection = NULL;
+	lit->sym_table = sym_tab;
+
+	lit->common.val_buf_size = 0;
+	lit->prefetch_buf = NULL;
+	lit->cursor_def = NULL;
+
+	/* The value is stored as a 4-byte DATA_INT field. */
+	dtype_set(dfield_get_type(&lit->common.val), DATA_INT, 0, 4);
+
+	buf = mem_heap_alloc(heap, 4);
+	mach_write_to_4(buf, val);
+
+	dfield_set_data(&lit->common.val, buf, 4);
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, lit);
+
+	return(lit);
+}
+
+/******************************************************************//**
+Adds a string literal to a symbol table, copying the string data.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_str_lit(
+/*================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		str,		/*!< in: string with no quotes around
+					it */
+	ulint		len)		/*!< in: string length */
+{
+	mem_heap_t*	heap = sym_tab->heap;
+	sym_node_t*	lit;
+	byte*		copy = NULL;
+
+	lit = mem_heap_alloc(heap, sizeof(*lit));
+
+	/* Literals are marked as resolved when they are created. */
+	lit->common.type = QUE_NODE_SYMBOL;
+	lit->resolved = TRUE;
+	lit->token_type = SYM_LIT;
+	lit->indirection = NULL;
+	lit->sym_table = sym_tab;
+
+	lit->common.val_buf_size = 0;
+	lit->prefetch_buf = NULL;
+	lit->cursor_def = NULL;
+
+	dtype_set(dfield_get_type(&lit->common.val),
+		  DATA_VARCHAR, DATA_ENGLISH, 0);
+
+	/* A non-empty string is copied to the heap of the symbol
+	table; an empty string gets a NULL data pointer with length
+	0. */
+	if (len != 0) {
+		copy = mem_heap_alloc(heap, len);
+		ut_memcpy(copy, str, len);
+	}
+
+	dfield_set_data(&lit->common.val, copy, len);
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, lit);
+
+	return(lit);
+}
+
+/******************************************************************//**
+Adds a literal that was bound in the parser info struct to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_lit(
+/*==================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name,		/*!< in: name of bound literal */
+	ulint*		lit_type)	/*!< out: type of literal (PARS_*_LIT) */
+{
+	sym_node_t*		lit;
+	pars_bound_lit_t*	bound;
+	ulint			type_len = 0;
+
+	bound = pars_info_get_bound_lit(sym_tab->info, name);
+	ut_a(bound);
+
+	lit = mem_heap_alloc(sym_tab->heap, sizeof(*lit));
+
+	lit->common.type = QUE_NODE_SYMBOL;
+	lit->resolved = TRUE;
+	lit->token_type = SYM_LIT;
+	lit->indirection = NULL;
+	lit->sym_table = sym_tab;
+
+	lit->common.val_buf_size = 0;
+	lit->prefetch_buf = NULL;
+	lit->cursor_def = NULL;
+
+	/* Map the bound data type to a parser literal type. The type
+	length stays 0 for DATA_BLOB and DATA_VARCHAR. */
+	switch (bound->type) {
+	case DATA_FIXBINARY:
+		type_len = bound->length;
+		*lit_type = PARS_FIXBINARY_LIT;
+		break;
+
+	case DATA_BLOB:
+		*lit_type = PARS_BLOB_LIT;
+		break;
+
+	case DATA_VARCHAR:
+		*lit_type = PARS_STR_LIT;
+		break;
+
+	case DATA_CHAR:
+		ut_a(bound->length > 0);
+
+		type_len = bound->length;
+		*lit_type = PARS_STR_LIT;
+		break;
+
+	case DATA_INT:
+		ut_a(bound->length > 0);
+		ut_a(bound->length <= 8);
+
+		type_len = bound->length;
+		*lit_type = PARS_INT_LIT;
+		break;
+
+	default:
+		ut_error;
+	}
+
+	dtype_set(dfield_get_type(&lit->common.val),
+		  bound->type, bound->prtype, type_len);
+
+	/* The bound address is passed on as is; no copy is made here. */
+	dfield_set_data(&lit->common.val, bound->address, bound->length);
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, lit);
+
+	return(lit);
+}
+
+/******************************************************************//**
+Adds an SQL null literal to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_null_lit(
+/*=================*/
+	sym_tab_t*	sym_tab)	/*!< in: symbol table */
+{
+	sym_node_t*	lit;
+
+	lit = mem_heap_alloc(sym_tab->heap, sizeof(*lit));
+
+	/* Literals are marked as resolved when they are created. */
+	lit->common.type = QUE_NODE_SYMBOL;
+	lit->resolved = TRUE;
+	lit->token_type = SYM_LIT;
+	lit->indirection = NULL;
+	lit->sym_table = sym_tab;
+
+	lit->common.val_buf_size = 0;
+	lit->prefetch_buf = NULL;
+	lit->cursor_def = NULL;
+
+	/* The main type of the value is set to DATA_ERROR and the
+	value itself to SQL NULL. */
+	dfield_get_type(&lit->common.val)->mtype = DATA_ERROR;
+
+	dfield_set_null(&lit->common.val);
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, lit);
+
+	return(lit);
+}
+
+/******************************************************************//**
+Adds an identifier, initially unresolved, to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_id(
+/*===========*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	byte*		name,		/*!< in: identifier name */
+	ulint		len)		/*!< in: identifier length */
+{
+	mem_heap_t*	heap = sym_tab->heap;
+	sym_node_t*	id;
+
+	id = mem_heap_alloc(heap, sizeof(*id));
+
+	id->common.type = QUE_NODE_SYMBOL;
+	id->resolved = FALSE;
+	id->indirection = NULL;
+	id->sym_table = sym_tab;
+
+	/* The name is duplicated with mem_heap_strdupl(). */
+	id->name = mem_heap_strdupl(heap, (char*) name, len);
+	id->name_len = len;
+
+	dfield_set_null(&id->common.val);
+
+	id->common.val_buf_size = 0;
+	id->prefetch_buf = NULL;
+	id->cursor_def = NULL;
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, id);
+
+	return(id);
+}
+
+/******************************************************************//**
+Adds an identifier that was bound in the parser info to a symbol table.
+@return	symbol table node */
+UNIV_INTERN
+sym_node_t*
+sym_tab_add_bound_id(
+/*=================*/
+	sym_tab_t*	sym_tab,	/*!< in: symbol table */
+	const char*	name)		/*!< in: name of bound id */
+{
+	mem_heap_t*		heap = sym_tab->heap;
+	pars_bound_id_t*	bound;
+	sym_node_t*		id;
+
+	bound = pars_info_get_bound_id(sym_tab->info, name);
+	ut_a(bound);
+
+	id = mem_heap_alloc(heap, sizeof(*id));
+
+	id->common.type = QUE_NODE_SYMBOL;
+	id->resolved = FALSE;
+	id->indirection = NULL;
+	id->sym_table = sym_tab;
+
+	/* The node is named after the bound id, not after 'name'. */
+	id->name = mem_heap_strdup(heap, bound->id);
+	id->name_len = strlen(id->name);
+
+	dfield_set_null(&id->common.val);
+
+	id->common.val_buf_size = 0;
+	id->prefetch_buf = NULL;
+	id->cursor_def = NULL;
+
+	UT_LIST_ADD_LAST(sym_list, sym_tab->sym_list, id);
+
+	return(id);
+}
diff --git a/storage/xtradb/plug.in b/storage/xtradb/plug.in
new file mode 100644
index 00000000000..37c895fb520
--- /dev/null
+++ b/storage/xtradb/plug.in
@@ -0,0 +1,228 @@
+#
+# Copyright (c) 2006, 2010, Innobase Oy. All Rights Reserved.
+# 
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; version 2 of the License.
+# 
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+# 
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+# Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+MYSQL_STORAGE_ENGINE(xtradb,  xtradb, [XtraDB Storage Engine],
+        [XtraDB - a drop-in replacement for InnoDB], [max,max-no-ndb])
+MYSQL_PLUGIN_DIRECTORY(xtradb, [storage/xtradb])
+MYSQL_PLUGIN_STATIC(xtradb,   [libxtradb.a])
+MYSQL_PLUGIN_DYNAMIC(xtradb,  [ha_xtradb.la])
+MYSQL_PLUGIN_ACTIONS(xtradb,  [
+  with_plugin_innobase=$with_plugin_xtradb # for legacy code in configure.in
+  AC_CHECK_LIB(rt, aio_read, [innodb_system_libs="-lrt"])
+  AC_SUBST(innodb_system_libs)
+  AC_CHECK_HEADERS(aio.h sched.h)
+  AC_CHECK_SIZEOF(int, 4)
+  AC_CHECK_SIZEOF(long, 4)
+  AC_CHECK_SIZEOF(void*, 4)
+  AC_CHECK_FUNCS(sched_yield fdatasync localtime_r)
+  AC_C_BIGENDIAN
+  case "$target_os" in
+	lin*)
+		CFLAGS="$CFLAGS -DUNIV_LINUX";;
+	hpux10*)
+		CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX -DUNIV_HPUX10";;
+	hp*)
+		CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE -DUNIV_HPUX";;
+	aix*)
+		CFLAGS="$CFLAGS -DUNIV_AIX";;
+	irix*|osf*|sysv5uw7*|openbsd*)
+		CFLAGS="$CFLAGS -DUNIV_MUST_NOT_INLINE";;
+	*solaris*|*SunOS*)
+		CFLAGS="$CFLAGS -DUNIV_SOLARIS";;
+  esac
+
+  INNODB_DYNAMIC_CFLAGS="-DMYSQL_DYNAMIC_PLUGIN"
+
+  case "$target_cpu" in
+	x86_64)
+		# The AMD64 ABI forbids absolute addresses in shared libraries
+		;;
+	*86)
+		# Use absolute addresses on IA-32
+		INNODB_DYNAMIC_CFLAGS="$INNODB_DYNAMIC_CFLAGS -prefer-non-pic"
+		;;
+  esac
+  AC_SUBST(INNODB_DYNAMIC_CFLAGS)
+
+  AC_MSG_CHECKING(whether GCC atomic builtins are available)
+  # either define HAVE_IB_GCC_ATOMIC_BUILTINS or not
+  AC_TRY_RUN(
+    [
+      int main()
+      {
+	long	x;
+	long	y;
+	long	res;
+	char	c;
+
+	x = 10;
+	y = 123;
+	res = __sync_bool_compare_and_swap(&x, x, y);
+	if (!res || x != y) {
+          return(1);
+        }
+
+	x = 10;
+	y = 123;
+	res = __sync_bool_compare_and_swap(&x, x + 1, y);
+	if (res || x != 10) {
+          return(1);
+        }
+
+	x = 10;
+	y = 123;
+	res = __sync_add_and_fetch(&x, y);
+	if (res != 123 + 10 || x != 123 + 10) {
+	  return(1);
+	}
+
+	c = 10;
+	res = __sync_lock_test_and_set(&c, 123);
+	if (res != 10 || c != 123) {
+	  return(1);
+	}
+
+	return(0);
+      }
+    ],
+    [
+      AC_DEFINE([HAVE_IB_GCC_ATOMIC_BUILTINS], [1],
+                [GCC atomic builtins are available])
+      AC_MSG_RESULT(yes)
+    ],
+    [
+      AC_MSG_RESULT(no)
+    ]
+  )
+
+  AC_MSG_CHECKING(whether pthread_t can be used by GCC atomic builtins)
+  # either define HAVE_IB_ATOMIC_PTHREAD_T_GCC or not
+  AC_TRY_RUN(
+    [
+      #include <pthread.h>
+      #include <string.h>
+
+      int main(int argc, char** argv) {
+        pthread_t       x1;
+        pthread_t       x2;
+        pthread_t       x3;
+
+	memset(&x1, 0x0, sizeof(x1));
+	memset(&x2, 0x0, sizeof(x2));
+	memset(&x3, 0x0, sizeof(x3));
+
+        __sync_bool_compare_and_swap(&x1, x2, x3);
+
+        return(0);
+      }
+    ],
+    [
+      AC_DEFINE([HAVE_IB_ATOMIC_PTHREAD_T_GCC], [1],
+                [pthread_t can be used by GCC atomic builtins])
+      AC_MSG_RESULT(yes)
+    ],
+    [
+      AC_MSG_RESULT(no)
+    ]
+  )
+
+  AC_MSG_CHECKING(whether Solaris libc atomic functions are available)
+  # either define HAVE_IB_SOLARIS_ATOMICS or not
+  AC_CHECK_FUNCS(atomic_add_long \
+		 atomic_cas_32 \
+		 atomic_cas_64 \
+		 atomic_cas_ulong,
+
+		 AC_DEFINE([HAVE_IB_SOLARIS_ATOMICS], [1],
+			   [Define to 1 if Solaris libc atomic functions \
+			    are available])
+  )
+
+  AC_MSG_CHECKING(whether pthread_t can be used by Solaris libc atomic functions)
+  # either define HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS or not
+  AC_TRY_RUN(
+    [
+      #include <pthread.h>
+      #include <string.h>
+
+      int main(int argc, char** argv) {
+        pthread_t       x1;
+        pthread_t       x2;
+        pthread_t       x3;
+
+        memset(&x1, 0x0, sizeof(x1));
+        memset(&x2, 0x0, sizeof(x2));
+        memset(&x3, 0x0, sizeof(x3));
+
+        if (sizeof(pthread_t) == 4) {
+        
+          atomic_cas_32(&x1, x2, x3);
+        
+        } else if (sizeof(pthread_t) == 8) {
+        
+          atomic_cas_64(&x1, x2, x3);
+        
+        } else {
+        
+          return(1);
+        }
+
+	return(0);
+      }
+    ],
+    [
+      AC_DEFINE([HAVE_IB_ATOMIC_PTHREAD_T_SOLARIS], [1],
+                [pthread_t can be used by solaris atomics])
+      AC_MSG_RESULT(yes)
+    ],
+    [
+      AC_MSG_RESULT(no)
+    ]
+  )
+
+  # this is needed to know which one of atomic_cas_32() or atomic_cas_64()
+  # to use in the source
+  AC_CHECK_SIZEOF([pthread_t], [], [#include <pthread.h>])
+
+  # Check for x86 PAUSE instruction
+  AC_MSG_CHECKING(for x86 PAUSE instruction)
+  # We have to actually try running the test program, because of a bug
+  # in Solaris on x86_64, where it wrongly reports that PAUSE is not
+  # supported when trying to run an application. See
+  # http://bugs.opensolaris.org/bugdatabase/printableBug.do?bug_id=6478684
+  # We use ib_ prefix to avoid collisions if this code is added to
+  # mysql's configure.in.
+  AC_TRY_RUN(
+    [
+      int main() {
+        __asm__ __volatile__ ("pause");
+        return(0);
+      }
+    ],
+    [
+      AC_DEFINE([HAVE_IB_PAUSE_INSTRUCTION], [1], [Does x86 PAUSE instruction exist])
+      AC_MSG_RESULT(yes)
+    ],
+    [
+      AC_MSG_RESULT(no)
+    ],
+    [
+      AC_MSG_RESULT(no)
+    ]
+  )
+  ])
+
+# vim: set ft=config:
diff --git a/storage/xtradb/que/que0que.c b/storage/xtradb/que/que0que.c
new file mode 100644
index 00000000000..5c85a04d139
--- /dev/null
+++ b/storage/xtradb/que/que0que.c
@@ -0,0 +1,1450 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file que/que0que.c
+Query graph
+
+Created 5/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "que0que.h"
+
+#ifdef UNIV_NONINL
+#include "que0que.ic"
+#endif
+
+#include "srv0que.h"
+#include "usr0sess.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "row0undo.h"
+#include "row0ins.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0purge.h"
+#include "dict0crea.h"
+#include "log0log.h"
+#include "eval0proc.h"
+#include "eval0eval.h"
+#include "pars0types.h"
+
+#define QUE_PARALLELIZE_LIMIT	(64 * 256 * 256 * 256)
+#define QUE_ROUND_ROBIN_LIMIT	(64 * 256 * 256 * 256)
+/* Initial value of the loop counter in que_run_threads_low() */
+#define QUE_MAX_LOOPS_WITHOUT_CHECK	16
+
+#ifdef UNIV_DEBUG
+/* If the following flag is set TRUE, the module will print trace info
+of SQL execution in the UNIV_SQL_DEBUG version.
+In this file the flag is compiled in under UNIV_DEBUG and is tested by
+que_thr_step(), which prints the node that is about to be executed. */
+UNIV_INTERN ibool	que_trace_on		= FALSE;
+#endif /* UNIV_DEBUG */
+
+/* Short introduction to query graphs
+   ==================================
+
+A query graph consists of nodes linked to each other in various ways. The
+execution starts at que_run_threads() which takes a que_thr_t parameter.
+que_thr_t contains two fields that control query graph execution: run_node
+and prev_node. run_node is the next node to execute and prev_node is the
+last node executed.
+
+Each node has a pointer to a 'next' statement, i.e., its brother, and a
+pointer to its parent node. The next pointer is NULL in the last statement
+of a block.
+
+Loop nodes contain a link to the first statement of the enclosed statement
+list. While the loop runs, que_thr_step() checks if execution to the loop
+node came from its parent or from one of the statement nodes in the loop. If
+it came from the parent of the loop node it starts executing the first
+statement node in the loop. If it came from one of the statement nodes in
+the loop, then it checks if the statement node has another statement node
+following it, and runs it if so.
+
+To signify loop ending, the loop statements (see e.g. while_step()) set
+que_thr_t->run_node to the loop node's parent node. This is noticed on the
+next call of que_thr_step() and execution proceeds to the node pointed to by
+the loop node's 'next' pointer.
+
+For example, the code:
+
+X := 1;
+WHILE X < 5 LOOP
+ X := X + 1;
+ X := X + 1;
+END LOOP;
+X := 5;
+
+will result in the following node hierarchy, with the X-axis indicating
+'next' links and the Y-axis indicating parent/child links:
+
+A - W - A
+    |
+    |
+    A - A
+
+A = assign_node_t, W = while_node_t. */
+
+/* How is a stored procedure containing COMMIT or ROLLBACK commands
+executed?
+
+The commit or rollback can be seen as a subprocedure call.
+The problem is that if there are several query threads
+currently running within the transaction, their action could
+mess the commit or rollback operation. Or, at the least, the
+operation would be difficult to visualize and keep in control.
+
+Therefore the query thread requesting a commit or a rollback
+sends to the transaction a signal, which moves the transaction
+to TRX_QUE_SIGNALED state. All running query threads of the
+transaction will eventually notice that the transaction is now in
+this state and voluntarily suspend themselves. Only the last
+query thread which suspends itself will trigger handling of
+the signal.
+
+When the transaction starts to handle a rollback or commit
+signal, it builds a query graph which, when executed, will
+roll back or commit the incomplete transaction. The transaction
+is moved to the TRX_QUE_ROLLING_BACK or TRX_QUE_COMMITTING state.
+If specified, the SQL cursors opened by the transaction are closed.
+When the execution of the graph completes, it is like returning
+from a subprocedure: the query thread which requested the operation
+starts running again. */
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active.
+***NOTE***: This and que_thr_move_to_run_state_for_mysql are the only
+functions in which such a transition is allowed to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr);	/*!< in: a query thread */
+
+/***********************************************************************//**
+Adds a query graph to the session's list of graphs. The graph is appended
+as the last element of sess->graphs. The function itself takes no latch;
+it only checks with ut_ad() that the caller owns kernel_mutex. */
+UNIV_INTERN
+void
+que_graph_publish(
+/*==============*/
+	que_t*	graph,	/*!< in: graph */
+	sess_t*	sess)	/*!< in: session */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	UT_LIST_ADD_LAST(graphs, sess->graphs, graph);
+}
+
+/***********************************************************************//**
+Creates a query graph fork node. The node is allocated from the given heap
+and starts out in the QUE_FORK_COMMAND_WAIT state with an empty list of
+query threads, no caller, no symbol table and no parser info.
+@return	own: fork node */
+UNIV_INTERN
+que_fork_t*
+que_fork_create(
+/*============*/
+	que_t*		graph,		/*!< in: graph, if NULL then this
+					fork node is assumed to be the
+					graph root */
+	que_node_t*	parent,		/*!< in: parent node */
+	ulint		fork_type,	/*!< in: fork type */
+	mem_heap_t*	heap)		/*!< in: memory heap where created */
+{
+	que_fork_t*	fork;
+
+	ut_ad(heap);
+
+	fork = mem_heap_alloc(heap, sizeof(*fork));
+
+	/* Identity of the node and its place in the graph */
+	fork->common.type = QUE_NODE_FORK;
+	fork->common.parent = parent;
+	fork->fork_type = fork_type;
+	fork->heap = heap;
+
+	if (graph == NULL) {
+		/* A fork created without a graph is the graph root */
+		fork->graph = fork;
+	} else {
+		fork->graph = graph;
+	}
+
+	/* Execution state: nothing has been started yet */
+	fork->state = QUE_FORK_COMMAND_WAIT;
+	fork->n_active_thrs = 0;
+	fork->caller = NULL;
+
+	UT_LIST_INIT(fork->thrs);
+
+	/* Nothing from the parser is attached at creation time */
+	fork->sym_tab = NULL;
+	fork->info = NULL;
+
+	return(fork);
+}
+
+/***********************************************************************//**
+Creates a query graph thread node. The node is allocated from the given
+heap, set to the QUE_THR_COMMAND_WAIT state and appended to the thread list
+of the parent fork.
+@return	own: query thread node */
+UNIV_INTERN
+que_thr_t*
+que_thr_create(
+/*===========*/
+	que_fork_t*	parent,	/*!< in: parent node, i.e., a fork node */
+	mem_heap_t*	heap)	/*!< in: memory heap where created */
+{
+	que_thr_t*	thr;
+
+	ut_ad(parent && heap);
+
+	thr = mem_heap_alloc(heap, sizeof(*thr));
+
+	/* The magic number is verified before the node is freed or run
+	through the MySQL interface functions */
+	thr->magic_n = QUE_THR_MAGIC_N;
+
+	/* Place the node below its fork; it shares the graph of the fork */
+	thr->common.type = QUE_NODE_THR;
+	thr->common.parent = parent;
+	thr->graph = parent->graph;
+
+	/* Initial execution state: inactive, nothing to run, no lock */
+	thr->state = QUE_THR_COMMAND_WAIT;
+	thr->is_active = FALSE;
+	thr->run_node = NULL;
+	thr->resource = 0;
+	thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+	UT_LIST_ADD_LAST(thrs, parent->thrs, thr);
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Moves a suspended query thread to the QUE_THR_RUNNING state and hands it to
+the caller through next_thr. This function should be used to end the wait
+state of a query thread waiting for a lock or a stored procedure
+completion. The caller must own kernel_mutex. */
+UNIV_INTERN
+void
+que_thr_end_wait(
+/*=============*/
+	que_thr_t*	thr,		/*!< in: query thread in the
+					QUE_THR_LOCK_WAIT,
+					or QUE_THR_PROCEDURE_WAIT, or
+					QUE_THR_SIG_REPLY_WAIT state */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then thr
+					is stored there for the caller to
+					run; if thr was not already active
+					and next_thr is NULL or *next_thr is
+					already set, ut_a(0) is reached */
+{
+	ibool	already_active;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(thr);
+	ut_ad((thr->state == QUE_THR_LOCK_WAIT)
+	      || (thr->state == QUE_THR_PROCEDURE_WAIT)
+	      || (thr->state == QUE_THR_SIG_REPLY_WAIT));
+	ut_ad(thr->run_node);
+
+	thr->prev_node = thr->run_node;
+
+	/* Sample the flag before the state change, because
+	que_thr_move_to_run_state() sets it */
+	already_active = thr->is_active;
+
+	que_thr_move_to_run_state(thr);
+
+	if (already_active) {
+		/* The thread was active all the time: there is nothing to
+		hand over to the caller */
+
+		return;
+	}
+
+	if (next_thr != NULL && *next_thr == NULL) {
+		*next_thr = thr;
+
+		return;
+	}
+
+	/* No free slot to return the thread in: this is asserted not to
+	happen, the enqueue call below is kept after the assertion */
+	ut_a(0);
+	srv_que_task_enqueue_low(thr);
+}
+
+/**********************************************************************//**
+Same as que_thr_end_wait, but no parameter next_thr available. Moves the
+query thread to the QUE_THR_RUNNING state and, if the thread was not
+already active, releases the MySQL OS thread that is suspended on it.
+The caller must own kernel_mutex. */
+UNIV_INTERN
+void
+que_thr_end_wait_no_next_thr(
+/*=========================*/
+	que_thr_t*	thr)	/*!< in: query thread; must be in the
+				QUE_THR_LOCK_WAIT state, any other state
+				fails the ut_a() assertion below */
+{
+	ibool	was_active;
+
+	/* Validate the pointer before any assertion dereferences it */
+	ut_ad(thr);
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_a(thr->state == QUE_THR_LOCK_WAIT);	/* In MySQL this is the
+						only possible state here */
+
+	/* Sample the flag before the state change, because
+	que_thr_move_to_run_state() sets it */
+	was_active = thr->is_active;
+
+	que_thr_move_to_run_state(thr);
+
+	if (was_active) {
+
+		return;
+	}
+
+	/* In MySQL we let the OS thread (not just the query thread) to wait
+	for the lock to be released: */
+
+	srv_release_mysql_thread_if_suspended(thr);
+
+	/* srv_que_task_enqueue_low(thr); */
+}
+
+/**********************************************************************//**
+Inits a query thread for a command. Execution is set to start at the
+thread node itself, with the parent recorded as the previous node, and the
+thread is moved to the QUE_THR_RUNNING state. */
+UNIV_INLINE
+void
+que_thr_init_command(
+/*=================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	thr->run_node = thr;
+	/* prev_node == parent is the condition que_thr_node_step() uses to
+	recognize that control came from above */
+	thr->prev_node = thr->common.parent;
+
+	que_thr_move_to_run_state(thr);
+}
+
+/**********************************************************************//**
+Starts execution of a command in a query fork. Picks a query thread which
+is not in the QUE_THR_RUNNING state and moves it to that state. If none
+can be chosen, a situation which may arise in parallelized fetches, NULL
+is returned. The fork is marked QUE_FORK_ACTIVE and its last_sel_node is
+reset in every case.
+@return a query thread of the graph moved to QUE_THR_RUNNING state, or
+NULL; the query thread should be executed by que_run_threads by the
+caller */
+UNIV_INTERN
+que_thr_t*
+que_fork_start_command(
+/*===================*/
+	que_fork_t*	fork)	/*!< in: a query fork */
+{
+	que_thr_t*	thr;
+	que_thr_t*	suspended_thr = NULL;
+	que_thr_t*	completed_thr = NULL;
+
+	fork->state = QUE_FORK_ACTIVE;
+
+	fork->last_sel_node = NULL;
+
+	/* Choose the query thread to run: usually there is just one thread,
+	but in a parallelized select, which necessarily is non-scrollable,
+	there may be several to choose from.
+
+	The order of preference is: a thread in the QUE_THR_COMMAND_WAIT
+	state, then the first one in the QUE_THR_SUSPENDED state, then the
+	first one in the QUE_THR_COMPLETED state. A single pass over the
+	list is enough: a COMMAND_WAIT thread is returned at once, the
+	other two candidates are only remembered. */
+
+	for (thr = UT_LIST_GET_FIRST(fork->thrs);
+	     thr != NULL;
+	     thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
+		switch (thr->state) {
+		case QUE_THR_COMMAND_WAIT:
+			/* We have to send the initial message to query
+			thread to start it */
+
+			que_thr_init_command(thr);
+
+			return(thr);
+
+		case QUE_THR_SUSPENDED:
+			/* Execution was suspended: no initial message is
+			needed because it can continue from where it was
+			left */
+			if (suspended_thr == NULL) {
+				suspended_thr = thr;
+			}
+
+			break;
+
+		case QUE_THR_COMPLETED:
+			if (completed_thr == NULL) {
+				completed_thr = thr;
+			}
+
+			break;
+
+		case QUE_THR_LOCK_WAIT:
+			ut_error;
+
+			break;
+
+		default:
+			/* Threads in any other state are not candidates */
+			break;
+		}
+	}
+
+	/* Here thr == NULL: it is returned as such if no candidate was
+	found */
+
+	if (suspended_thr != NULL) {
+
+		thr = suspended_thr;
+		que_thr_move_to_run_state(thr);
+
+	} else if (completed_thr != NULL) {
+
+		thr = completed_thr;
+		que_thr_init_command(thr);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+After signal handling is finished, returns control to a query graph error
+handling routine. (Currently, just returns the control to the root of the
+graph so that the graph can communicate an error message to the client.)
+Every query thread of the fork is reset to the QUE_THR_COMPLETED state
+with itself as the node to run; the first thread is then moved to the
+QUE_THR_RUNNING state. The function ends in ut_a(0). */
+UNIV_INTERN
+void
+que_fork_error_handle(
+/*==================*/
+	trx_t*	trx __attribute__((unused)),	/*!< in: trx; referenced
+			only inside ut_ad() assertions */
+	que_t*	fork)	/*!< in: query graph which was run before signal
+			handling started, NULL not allowed */
+{
+	que_thr_t*	thr;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx->sess->state == SESS_ERROR);
+	ut_ad(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+	for (thr = UT_LIST_GET_FIRST(fork->thrs);
+	     thr != NULL;
+	     thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
+		ut_ad(!thr->is_active);
+		ut_ad(thr->state != QUE_THR_SIG_REPLY_WAIT);
+		ut_ad(thr->state != QUE_THR_LOCK_WAIT);
+
+		/* Make it look as if control had just returned to the
+		thread node from its child */
+		thr->run_node = thr;
+		thr->prev_node = thr->child;
+		thr->state = QUE_THR_COMPLETED;
+	}
+
+	thr = UT_LIST_GET_FIRST(fork->thrs);
+
+	que_thr_move_to_run_state(thr);
+
+	ut_a(0);
+	srv_que_task_enqueue_low(thr);
+}
+
+/****************************************************************//**
+Tests if all the query threads in the same fork have a given state. A fork
+without any query threads passes the test.
+@return TRUE if all the query threads in the same fork were in the
+given state */
+UNIV_INLINE
+ibool
+que_fork_all_thrs_in_state(
+/*=======================*/
+	que_fork_t*	fork,	/*!< in: query fork */
+	ulint		state)	/*!< in: state */
+{
+	que_thr_t*	thr;
+
+	for (thr = UT_LIST_GET_FIRST(fork->thrs);
+	     thr != NULL;
+	     thr = UT_LIST_GET_NEXT(thrs, thr)) {
+
+		if (thr->state != state) {
+			/* The first mismatch decides the answer */
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/**********************************************************************//**
+Calls que_graph_free_recursive for statements in a statement list. The
+list is walked through the 'next' links; the link of a node is read only
+after the node has been passed to que_graph_free_recursive. */
+static
+void
+que_graph_free_stat_list(
+/*=====================*/
+	que_node_t*	node)	/*!< in: first query graph node in the list */
+{
+	for (; node != NULL; node = que_node_get_next(node)) {
+		que_graph_free_recursive(node);
+	}
+}
+
+/**********************************************************************//**
+Frees a query graph, but not the heap where it was created. Does not free
+explicit cursor declarations, they are freed in que_graph_free.
+The function dispatches on the node type: nodes that contain other nodes
+recurse into them, nodes that carry a heap of their own free that heap,
+and a NULL node is ignored. An unknown node type is reported to stderr and
+ends in ut_error. */
+UNIV_INTERN
+void
+que_graph_free_recursive(
+/*=====================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	que_fork_t*	fork;
+	que_thr_t*	thr;
+	undo_node_t*	undo;
+	sel_node_t*	sel;
+	ins_node_t*	ins;
+	upd_node_t*	upd;
+	tab_node_t*	cre_tab;
+	ind_node_t*	cre_ind;
+	purge_node_t*	purge;
+
+	if (node == NULL) {
+
+		return;
+	}
+
+	switch (que_node_get_type(node)) {
+
+	case QUE_NODE_FORK:
+		/* Free every query thread of the fork */
+		fork = node;
+
+		thr = UT_LIST_GET_FIRST(fork->thrs);
+
+		while (thr) {
+			que_graph_free_recursive(thr);
+
+			thr = UT_LIST_GET_NEXT(thrs, thr);
+		}
+
+		break;
+	case QUE_NODE_THR:
+
+		thr = node;
+
+		/* A wrong magic number means memory corruption or a thread
+		node that was already freed */
+		if (thr->magic_n != QUE_THR_MAGIC_N) {
+			fprintf(stderr,
+				"que_thr struct appears corrupt;"
+				" magic n %lu\n",
+				(unsigned long) thr->magic_n);
+			mem_analyze_corruption(thr);
+			ut_error;
+		}
+
+		/* Mark the node freed so that a second free is caught by
+		the check above */
+		thr->magic_n = QUE_THR_MAGIC_FREED;
+
+		que_graph_free_recursive(thr->child);
+
+		break;
+	case QUE_NODE_UNDO:
+
+		undo = node;
+
+		mem_heap_free(undo->heap);
+
+		break;
+	case QUE_NODE_SELECT:
+
+		sel = node;
+
+		sel_node_free_private(sel);
+
+		break;
+	case QUE_NODE_INSERT:
+
+		ins = node;
+
+		que_graph_free_recursive(ins->select);
+
+		mem_heap_free(ins->entry_sys_heap);
+
+		break;
+	case QUE_NODE_PURGE:
+		purge = node;
+
+		mem_heap_free(purge->heap);
+
+		break;
+
+	case QUE_NODE_UPDATE:
+
+		upd = node;
+
+		/* The persistent cursor is freed here only for update nodes
+		flagged as being in the MySQL interface */
+		if (upd->in_mysql_interface) {
+
+			btr_pcur_free_for_mysql(upd->pcur);
+		}
+
+		/* The cascade node is freed before the heap that belongs
+		to it */
+		que_graph_free_recursive(upd->cascade_node);
+
+		if (upd->cascade_heap) {
+			mem_heap_free(upd->cascade_heap);
+		}
+
+		que_graph_free_recursive(upd->select);
+
+		mem_heap_free(upd->heap);
+
+		break;
+	case QUE_NODE_CREATE_TABLE:
+		cre_tab = node;
+
+		que_graph_free_recursive(cre_tab->tab_def);
+		que_graph_free_recursive(cre_tab->col_def);
+		que_graph_free_recursive(cre_tab->commit_node);
+
+		mem_heap_free(cre_tab->heap);
+
+		break;
+	case QUE_NODE_CREATE_INDEX:
+		cre_ind = node;
+
+		que_graph_free_recursive(cre_ind->ind_def);
+		que_graph_free_recursive(cre_ind->field_def);
+		/* stats_def is freed only when srv_use_sys_stats_table is
+		set at the time of the free */
+		if (srv_use_sys_stats_table)
+			que_graph_free_recursive(cre_ind->stats_def);
+		que_graph_free_recursive(cre_ind->commit_node);
+
+		mem_heap_free(cre_ind->heap);
+
+		break;
+	case QUE_NODE_INSERT_STATS:
+		/* This node type is accessed through the same ind_node_t
+		pointer as QUE_NODE_CREATE_INDEX */
+		cre_ind = node;
+
+		que_graph_free_recursive(cre_ind->stats_def);
+		que_graph_free_recursive(cre_ind->commit_node);
+
+		mem_heap_free(cre_ind->heap);
+		break;
+	case QUE_NODE_PROC:
+		que_graph_free_stat_list(((proc_node_t*)node)->stat_list);
+
+		break;
+	case QUE_NODE_IF:
+		que_graph_free_stat_list(((if_node_t*)node)->stat_list);
+		que_graph_free_stat_list(((if_node_t*)node)->else_part);
+		que_graph_free_stat_list(((if_node_t*)node)->elsif_list);
+
+		break;
+	case QUE_NODE_ELSIF:
+		que_graph_free_stat_list(((elsif_node_t*)node)->stat_list);
+
+		break;
+	case QUE_NODE_WHILE:
+		que_graph_free_stat_list(((while_node_t*)node)->stat_list);
+
+		break;
+	case QUE_NODE_FOR:
+		que_graph_free_stat_list(((for_node_t*)node)->stat_list);
+
+		break;
+
+	case QUE_NODE_ASSIGNMENT:
+	case QUE_NODE_EXIT:
+	case QUE_NODE_RETURN:
+	case QUE_NODE_COMMIT:
+	case QUE_NODE_ROLLBACK:
+	case QUE_NODE_LOCK:
+	case QUE_NODE_FUNC:
+	case QUE_NODE_ORDER:
+	case QUE_NODE_ROW_PRINTF:
+	case QUE_NODE_OPEN:
+	case QUE_NODE_FETCH:
+		/* No need to do anything */
+
+		break;
+	default:
+		fprintf(stderr,
+			"que_node struct appears corrupt; type %lu\n",
+			(unsigned long) que_node_get_type(node));
+		mem_analyze_corruption(node);
+		ut_error;
+	}
+}
+
+/**********************************************************************//**
+Frees a query graph. In order: the private data of the symbol table (if
+there is one), the parser info struct (if the graph owns it), the private
+data of the graph nodes, and finally the memory heap of the graph. */
+UNIV_INTERN
+void
+que_graph_free(
+/*===========*/
+	que_t*	graph)	/*!< in: query graph; we assume that the memory
+			heap where this graph was created is private
+			to this graph: if not, then use
+			que_graph_free_recursive and free the heap
+			afterwards! */
+{
+	ut_ad(graph);
+
+	if (graph->sym_tab) {
+		/* The following call frees dynamic memory allocated
+		for variables etc. during execution. Frees also explicit
+		cursor definitions. */
+
+		sym_tab_free_private(graph->sym_tab);
+	}
+
+	/* The info struct is freed here only when it is flagged as owned
+	by the graph */
+	if (graph->info && graph->info->graph_owns_us) {
+		pars_info_free(graph->info);
+	}
+
+	que_graph_free_recursive(graph);
+
+	/* This must come last: que_fork_create() allocates the fork node
+	from the heap it stores in fork->heap, so graph must not be used
+	after this call */
+	mem_heap_free(graph->heap);
+}
+
+/****************************************************************//**
+Performs an execution step on a thr node. If control arrived from the
+parent, it is passed on to the child. Otherwise the thread is marked
+QUE_THR_COMPLETED under kernel_mutex, unless que_thr_peek_stop() reports
+true, in which case the thread is returned unchanged.
+@return	query thread to run next, or NULL if none */
+static
+que_thr_t*
+que_thr_node_step(
+/*==============*/
+	que_thr_t*	thr)	/*!< in: query thread where run_node must
+				be the thread node itself */
+{
+	ut_ad(thr->run_node == thr);
+
+	if (thr->prev_node == thr->common.parent) {
+		/* If control to the node came from above, it is just passed
+		on */
+
+		thr->run_node = thr->child;
+
+		return(thr);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	/* NOTE(review): que_thr_peek_stop() is defined outside this file;
+	presumably it tells whether the thread has to stop - confirm. The
+	thread state is left untouched in that case. */
+	if (que_thr_peek_stop(thr)) {
+
+		mutex_exit(&kernel_mutex);
+
+		return(thr);
+	}
+
+	/* Thread execution completed */
+
+	thr->state = QUE_THR_COMPLETED;
+
+	mutex_exit(&kernel_mutex);
+
+	return(NULL);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active.
+***NOTE***: This and que_thr_move_to_run_state_for_mysql are the only
+functions in which such a transition is allowed to happen! */
+static
+void
+que_thr_move_to_run_state(
+/*======================*/
+	que_thr_t*	thr)	/*!< in: a query thread */
+{
+	trx_t*	trx;
+
+	ut_ad(thr->state != QUE_THR_RUNNING);
+
+	trx = thr_get_trx(thr);
+
+	/* A thread that is already active has been counted: only the state
+	changes for it */
+	if (!thr->is_active) {
+
+		(thr->graph)->n_active_thrs++;
+
+		trx->n_active_thrs++;
+
+		thr->is_active = TRUE;
+
+		/* The assertions below expect at most one active query
+		thread per graph and per transaction */
+		ut_ad((thr->graph)->n_active_thrs == 1);
+		ut_ad(trx->n_active_thrs == 1);
+	}
+
+	thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+Decrements the query thread reference counts in the query graph and the
+transaction. May start signal handling, e.g., a rollback.
+The whole function body runs under kernel_mutex, which is acquired and
+released here. If the thread is still in the QUE_THR_RUNNING state and
+que_thr_stop() does not stop it, the counts are left alone and the thread
+is handed back through next_thr instead.
+*** NOTE ***:
+This and que_thr_stop_for_mysql are the only functions where the reference
+count can be decremented and this function may only be called from inside
+que_run_threads or que_thr_check_if_switch! These restrictions exist to make
+the rollback code easier to maintain. */
+static
+void
+que_thr_dec_refer_count(
+/*====================*/
+	que_thr_t*	thr,		/*!< in: query thread */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+{
+	que_fork_t*	fork;
+	trx_t*		trx;
+	ulint		fork_type;
+	ibool		stopped;
+
+	fork = thr->common.parent;
+	trx = thr_get_trx(thr);
+
+	mutex_enter(&kernel_mutex);
+
+	ut_a(thr->is_active);
+
+	if (thr->state == QUE_THR_RUNNING) {
+
+		/* que_thr_stop() changes thr->state when it returns TRUE */
+		stopped = que_thr_stop(thr);
+
+		if (!stopped) {
+			/* The reason for the thr suspension or wait was
+			already canceled before we came here: continue
+			running the thread */
+
+			/* fputs("!!!!!!!! Wait already ended: continue thr\n",
+			stderr); */
+
+			if (next_thr && *next_thr == NULL) {
+				/* Normally srv_suspend_mysql_thread resets
+				the state to DB_SUCCESS before waiting, but
+				in this case we have to do it here,
+				otherwise nobody does it. */
+				trx->error_state = DB_SUCCESS;
+
+				*next_thr = thr;
+			} else {
+				/* No slot to return the thread in: treated
+				as an error, the enqueue call is kept after
+				ut_error */
+				ut_error;
+				srv_que_task_enqueue_low(thr);
+			}
+
+			mutex_exit(&kernel_mutex);
+
+			return;
+		}
+	}
+
+	ut_ad(fork->n_active_thrs == 1);
+	ut_ad(trx->n_active_thrs == 1);
+
+	fork->n_active_thrs--;
+	trx->n_active_thrs--;
+
+	thr->is_active = FALSE;
+
+	/* Other query threads of the transaction are still active: the
+	completion and signal handling below is left to the last one */
+	if (trx->n_active_thrs > 0) {
+
+		mutex_exit(&kernel_mutex);
+
+		return;
+	}
+
+	fork_type = fork->fork_type;
+
+	/* Check if all query threads in the same fork are completed */
+
+	if (que_fork_all_thrs_in_state(fork, QUE_THR_COMPLETED)) {
+
+		switch (fork_type) {
+		case QUE_FORK_ROLLBACK:
+			/* This is really the undo graph used in rollback,
+			no roll_node in this graph */
+
+			ut_ad(UT_LIST_GET_LEN(trx->signals) > 0);
+			ut_ad(trx->handling_signals == TRUE);
+
+			trx_finish_rollback_off_kernel(fork, trx, next_thr);
+			break;
+
+		case QUE_FORK_PURGE:
+		case QUE_FORK_RECOVERY:
+		case QUE_FORK_MYSQL_INTERFACE:
+
+			/* Do nothing */
+			break;
+
+		default:
+			ut_error;	/*!< not used in MySQL */
+		}
+	}
+
+	if (UT_LIST_GET_LEN(trx->signals) > 0 && trx->n_active_thrs == 0) {
+
+		/* If the trx is signaled and its query thread count drops to
+		zero, then we start processing a signal; from it we may get
+		a new query thread to run */
+
+		trx_sig_start_handle(trx, next_thr);
+	}
+
+	/* Signal handling is ended once the signal list is empty */
+	if (trx->handling_signals && UT_LIST_GET_LEN(trx->signals) == 0) {
+
+		trx_end_signal_handling(trx);
+	}
+
+	mutex_exit(&kernel_mutex);
+}
+
+/**********************************************************************//**
+Stops a query thread if graph or trx is in a state requiring it. The
+conditions are tested in the order (1) graph, (2) trx. The kernel mutex has
+to be reserved. When the thread is stopped, its state is set to
+QUE_THR_SUSPENDED, QUE_THR_LOCK_WAIT or QUE_THR_COMPLETED depending on the
+condition that matched; otherwise the state is not changed.
+@return	TRUE if stopped */
+UNIV_INTERN
+ibool
+que_thr_stop(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_t*	graph;
+	trx_t*	trx;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	graph = thr->graph;
+	trx = graph->trx;
+
+	if (graph->state == QUE_FORK_COMMAND_WAIT) {
+		/* (1) The graph is waiting for a command */
+		thr->state = QUE_THR_SUSPENDED;
+
+		return(TRUE);
+	}
+
+	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+		/* (2) The transaction waits for a lock: the thread joins
+		the list of its waiting threads */
+		UT_LIST_ADD_FIRST(trx_thrs, trx->wait_thrs, thr);
+		thr->state = QUE_THR_LOCK_WAIT;
+
+		return(TRUE);
+	}
+
+	if (trx->error_state != DB_SUCCESS
+	    && trx->error_state != DB_LOCK_WAIT) {
+
+		/* Error handling built for the MySQL interface */
+		thr->state = QUE_THR_COMPLETED;
+
+		return(TRUE);
+	}
+
+	if (UT_LIST_GET_LEN(trx->signals) > 0
+	    && graph->fork_type != QUE_FORK_ROLLBACK) {
+
+		/* Signals are pending and this is not a rollback graph */
+		thr->state = QUE_THR_SUSPENDED;
+
+		return(TRUE);
+	}
+
+	/* Nothing requires a stop */
+	ut_ad(graph->state == QUE_FORK_ACTIVE);
+
+	return(FALSE);
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL. The
+query thread is stopped and made inactive, except in the case where
+it was put to the lock wait state in lock0lock.c, but the lock has already
+been granted or the transaction chosen as a victim in deadlock resolution.
+The function acquires and releases kernel_mutex itself. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql(
+/*===================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	trx_t*	trx = thr_get_trx(thr);
+
+	mutex_enter(&kernel_mutex);
+
+	if (thr->state == QUE_THR_RUNNING) {
+
+		if (trx->error_state == DB_SUCCESS
+		    || trx->error_state == DB_LOCK_WAIT) {
+
+			/* It must have been a lock wait but the lock was
+			already released, or this transaction was chosen
+			as a victim in selective deadlock resolution:
+			the thread stays active and running */
+
+			mutex_exit(&kernel_mutex);
+
+			return;
+		}
+
+		/* Error handling built for the MySQL interface */
+		thr->state = QUE_THR_COMPLETED;
+	}
+
+	ut_ad(thr->is_active == TRUE);
+	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(thr->graph->n_active_thrs == 1);
+
+	/* Make the thread inactive and drop it from both counters */
+	thr->is_active = FALSE;
+	(thr->graph)->n_active_thrs--;
+
+	trx->n_active_thrs--;
+
+	mutex_exit(&kernel_mutex);
+}
+
+/**********************************************************************//**
+Moves a thread from another state to the QUE_THR_RUNNING state. Increments
+the n_active_thrs counters of the query graph and transaction if thr was
+not active. The magic number of the thread struct is verified first; a
+mismatch is reported to stderr and ends in ut_error. This function does
+not acquire kernel_mutex. */
+UNIV_INTERN
+void
+que_thr_move_to_run_state_for_mysql(
+/*================================*/
+	que_thr_t*	thr,	/*!< in: a query thread */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	if (thr->magic_n != QUE_THR_MAGIC_N) {
+		fprintf(stderr,
+			"que_thr struct appears corrupt; magic n %lu\n",
+			(unsigned long) thr->magic_n);
+
+		mem_analyze_corruption(thr);
+
+		ut_error;
+	}
+
+	/* An already active thread has been counted: only its state is
+	changed */
+	if (!thr->is_active) {
+
+		thr->graph->n_active_thrs++;
+
+		trx->n_active_thrs++;
+
+		thr->is_active = TRUE;
+	}
+
+	thr->state = QUE_THR_RUNNING;
+}
+
+/**********************************************************************//**
+A patch for MySQL used to 'stop' a dummy query thread used in MySQL
+select, when there is no error or lock wait. The thread is marked
+QUE_THR_COMPLETED and inactive, and the n_active_thrs counters of the graph
+and the transaction are decremented. The magic number of the thread struct
+is verified first. This function does not acquire kernel_mutex. */
+UNIV_INTERN
+void
+que_thr_stop_for_mysql_no_error(
+/*============================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	ut_ad(thr->state == QUE_THR_RUNNING);
+	ut_ad(thr->is_active == TRUE);
+	ut_ad(trx->n_active_thrs == 1);
+	ut_ad(thr->graph->n_active_thrs == 1);
+
+	/* A mismatch is reported to stderr and ends in ut_error */
+	if (thr->magic_n != QUE_THR_MAGIC_N) {
+		fprintf(stderr,
+			"que_thr struct appears corrupt; magic n %lu\n",
+			(unsigned long) thr->magic_n);
+
+		mem_analyze_corruption(thr);
+
+		ut_error;
+	}
+
+	thr->state = QUE_THR_COMPLETED;
+
+	thr->is_active = FALSE;
+	(thr->graph)->n_active_thrs--;
+
+	trx->n_active_thrs--;
+}
+
+/****************************************************************//**
+Get the first containing loop node (e.g. while_node_t or for_node_t) for the
+given node, or NULL if the node is not within a loop. The search starts
+from the parent of the node, so the node itself is never returned.
+@return	containing loop node, or NULL. */
+UNIV_INTERN
+que_node_t*
+que_node_get_containing_loop_node(
+/*==============================*/
+	que_node_t*	node)	/*!< in: node */
+{
+	ut_ad(node);
+
+	/* Climb the parent links until a loop node is found or the chain
+	of parents ends */
+	for (node = que_node_get_parent(node);
+	     node != NULL;
+	     node = que_node_get_parent(node)) {
+
+		ulint	type = que_node_get_type(node);
+
+		if (type == QUE_NODE_FOR || type == QUE_NODE_WHILE) {
+			break;
+		}
+	}
+
+	return(node);
+}
+
+/**********************************************************************//**
+Prints info of an SQL query graph node. One line is written to stderr,
+containing the numeric node type, a name for the type and the address of
+the node. Types without a name in the table below are printed as
+"UNKNOWN NODE TYPE". */
+UNIV_INTERN
+void
+que_node_print_info(
+/*================*/
+	que_node_t*	node)	/*!< in: query graph node */
+{
+	ulint		type;
+	const char*	str;
+
+	type = que_node_get_type(node);
+
+	switch (type) {
+	case QUE_NODE_SELECT:
+		str = "SELECT";
+		break;
+	case QUE_NODE_INSERT:
+		str = "INSERT";
+		break;
+	case QUE_NODE_UPDATE:
+		str = "UPDATE";
+		break;
+	case QUE_NODE_WHILE:
+		str = "WHILE";
+		break;
+	case QUE_NODE_ASSIGNMENT:
+		str = "ASSIGNMENT";
+		break;
+	case QUE_NODE_IF:
+		str = "IF";
+		break;
+	case QUE_NODE_FETCH:
+		str = "FETCH";
+		break;
+	case QUE_NODE_OPEN:
+		str = "OPEN";
+		break;
+	case QUE_NODE_PROC:
+		str = "STORED PROCEDURE";
+		break;
+	case QUE_NODE_FUNC:
+		str = "FUNCTION";
+		break;
+	case QUE_NODE_LOCK:
+		str = "LOCK";
+		break;
+	case QUE_NODE_THR:
+		str = "QUERY THREAD";
+		break;
+	case QUE_NODE_COMMIT:
+		str = "COMMIT";
+		break;
+	case QUE_NODE_UNDO:
+		str = "UNDO ROW";
+		break;
+	case QUE_NODE_PURGE:
+		str = "PURGE ROW";
+		break;
+	case QUE_NODE_ROLLBACK:
+		str = "ROLLBACK";
+		break;
+	case QUE_NODE_CREATE_TABLE:
+		str = "CREATE TABLE";
+		break;
+	case QUE_NODE_CREATE_INDEX:
+		str = "CREATE INDEX";
+		break;
+	case QUE_NODE_INSERT_STATS:
+		str = "INSERT TO SYS_STATS";
+		break;
+	case QUE_NODE_FOR:
+		str = "FOR LOOP";
+		break;
+	case QUE_NODE_RETURN:
+		str = "RETURN";
+		break;
+	case QUE_NODE_EXIT:
+		str = "EXIT";
+		break;
+	default:
+		str = "UNKNOWN NODE TYPE";
+		break;
+	}
+
+	fprintf(stderr, "Node type %lu: %s, address %p\n",
+		(ulong) type, str, (void*) node);
+}
+
+/**********************************************************************//**
+Performs an execution step on a query thread. The node in thr->run_node is
+executed by the step function that belongs to its type, and the node is
+then recorded as prev_node of the thread that was passed in.
+@return query thread to run next: it may differ from the input
+parameter if, e.g., a subprocedure call is made */
+UNIV_INLINE
+que_thr_t*
+que_thr_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_node_t*	node;
+	que_thr_t*	old_thr;
+	trx_t*		trx;
+	ulint		type;
+
+	trx = thr_get_trx(thr);
+
+	ut_ad(thr->state == QUE_THR_RUNNING);
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	thr->resource++;
+
+	node = thr->run_node;
+	type = que_node_get_type(node);
+
+	/* The step functions below may replace thr; prev_node is updated
+	on the thread that was passed in */
+	old_thr = thr;
+
+#ifdef UNIV_DEBUG
+	if (que_trace_on) {
+		fputs("To execute: ", stderr);
+		que_node_print_info(node);
+	}
+#endif
+	/* Node types with the QUE_NODE_CONTROL_STAT bit set are handled as
+	control statements */
+	if (type & QUE_NODE_CONTROL_STAT) {
+		if ((thr->prev_node != que_node_get_parent(node))
+		    && que_node_get_next(thr->prev_node)) {
+
+			/* The control statements, like WHILE, always pass the
+			control to the next child statement if there is any
+			child left */
+
+			thr->run_node = que_node_get_next(thr->prev_node);
+
+		} else if (type == QUE_NODE_IF) {
+			if_step(thr);
+		} else if (type == QUE_NODE_FOR) {
+			for_step(thr);
+		} else if (type == QUE_NODE_PROC) {
+
+			/* We can access trx->undo_no without reserving
+			trx->undo_mutex, because there cannot be active query
+			threads doing updating or inserting at the moment! */
+
+			/* The undo number is recorded only when the
+			procedure node is entered from its parent */
+			if (thr->prev_node == que_node_get_parent(node)) {
+				trx->last_sql_stat_start.least_undo_no
+					= trx->undo_no;
+			}
+
+			proc_step(thr);
+		} else if (type == QUE_NODE_WHILE) {
+			while_step(thr);
+		} else {
+			ut_error;
+		}
+	} else if (type == QUE_NODE_ASSIGNMENT) {
+		assign_step(thr);
+	} else if (type == QUE_NODE_SELECT) {
+		thr = row_sel_step(thr);
+	} else if (type == QUE_NODE_INSERT) {
+		thr = row_ins_step(thr);
+	} else if (type == QUE_NODE_UPDATE) {
+		thr = row_upd_step(thr);
+	} else if (type == QUE_NODE_FETCH) {
+		thr = fetch_step(thr);
+	} else if (type == QUE_NODE_OPEN) {
+		thr = open_step(thr);
+	} else if (type == QUE_NODE_FUNC) {
+		proc_eval_step(thr);
+
+	} else if (type == QUE_NODE_LOCK) {
+
+		/* Lock nodes are not executed: reaching one is an error */
+		ut_error;
+		/*
+		thr = que_lock_step(thr);
+		*/
+	} else if (type == QUE_NODE_THR) {
+		thr = que_thr_node_step(thr);
+	} else if (type == QUE_NODE_COMMIT) {
+		thr = trx_commit_step(thr);
+	} else if (type == QUE_NODE_UNDO) {
+		thr = row_undo_step(thr);
+	} else if (type == QUE_NODE_PURGE) {
+		thr = row_purge_step(thr);
+	} else if (type == QUE_NODE_RETURN) {
+		thr = return_step(thr);
+	} else if (type == QUE_NODE_EXIT) {
+		thr = exit_step(thr);
+	} else if (type == QUE_NODE_ROLLBACK) {
+		thr = trx_rollback_step(thr);
+	} else if (type == QUE_NODE_CREATE_TABLE) {
+		thr = dict_create_table_step(thr);
+	} else if (type == QUE_NODE_CREATE_INDEX) {
+		thr = dict_create_index_step(thr);
+	} else if (type == QUE_NODE_INSERT_STATS) {
+		thr = dict_insert_stats_step(thr);
+	} else if (type == QUE_NODE_ROW_PRINTF) {
+		thr = row_printf_step(thr);
+	} else {
+		ut_error;
+	}
+
+	/* After EXIT the previous node is set to the containing loop node
+	rather than to the EXIT node itself */
+	if (type == QUE_NODE_EXIT) {
+		old_thr->prev_node = que_node_get_containing_loop_node(node);
+	} else {
+		old_thr->prev_node = node;
+	}
+
+	/* The step functions may return NULL instead of a thread */
+	if (thr) {
+		ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Run a query thread until it finishes or encounters e.g. a lock wait.
+Every iteration first calls log_free_check() and then executes one query
+step with que_thr_step(). As long as the step returns the same thread, the
+loop goes on. Otherwise the step must have returned NULL:
+que_thr_dec_refer_count() is then called for the current thread, and the
+loop continues only if that call hands back a thread to run. */
+static
+void
+que_run_threads_low(
+/*================*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	que_thr_t*	next_thr;
+
+	ut_ad(thr->state == QUE_THR_RUNNING);
+	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	for (;;) {
+		/* Check that there is enough space in the log to
+		accommodate possible log entries by this query step; if the
+		operation can touch more than about 4 pages, checks must be
+		made also within the query step! */
+
+		log_free_check();
+
+		/* Perform the actual query step: note that the query thread
+		may change if, e.g., a subprocedure call is made */
+
+		/*-------------------------*/
+		next_thr = que_thr_step(thr);
+		/*-------------------------*/
+
+		ut_a(!next_thr
+		     || (thr_get_trx(next_thr)->error_state == DB_SUCCESS));
+
+		if (next_thr == thr) {
+			/* Same thread: take its next step */
+
+			continue;
+		}
+
+		ut_a(next_thr == NULL);
+
+		/* This can change next_thr to a non-NULL value if there was
+		a lock wait that already completed. */
+		que_thr_dec_refer_count(thr, &next_thr);
+
+		if (next_thr == NULL) {
+
+			return;
+		}
+
+		/* A lock wait ended in the meantime: go on with the thread
+		that was handed back */
+		thr = next_thr;
+	}
+}
+
+/**********************************************************************//**
+Run a query thread; lock waits are handled here by suspending and retrying. */
+UNIV_INTERN
+void
+que_run_threads(
+/*============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+loop:
+	ut_a(thr_get_trx(thr)->error_state == DB_SUCCESS);
+	que_run_threads_low(thr);
+	/* Decide what to do next from thr->state, read under kernel_mutex */
+	mutex_enter(&kernel_mutex);
+
+	switch (thr->state) {
+
+	case QUE_THR_RUNNING:
+		/* There probably was a lock wait, but it already ended
+		before we came here: continue running thr */
+
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+
+	case QUE_THR_LOCK_WAIT:
+		mutex_exit(&kernel_mutex);
+
+		/* The ..._mysql_... function works also for InnoDB's
+		internal threads. Wait here until the lock wait ends. */
+
+		srv_suspend_mysql_thread(thr);
+
+		if (thr_get_trx(thr)->error_state != DB_SUCCESS) {
+			/* thr was chosen as a deadlock victim or there was
+			a lock wait timeout */
+
+			que_thr_dec_refer_count(thr, NULL);
+
+			return;
+		}
+
+		goto loop;
+
+	case QUE_THR_COMPLETED:
+	case QUE_THR_COMMAND_WAIT:
+		/* Nothing more to run: release kernel_mutex below */
+		break;
+
+	default:
+		ut_error;
+	}
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Parses the given SQL string into a query graph, runs the graph in trx and
+frees the graph.
+@return	trx->error_state after the run: an error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+que_eval_sql(
+/*=========*/
+	pars_info_t*	info,	/*!< in: info struct, or NULL */
+	const char*	sql,	/*!< in: SQL string */
+	ibool		reserve_dict_mutex,
+				/*!< in: if TRUE, acquire/release
+				dict_sys->mutex around call to pars_sql. */
+	trx_t*		trx)	/*!< in: trx */
+{
+	que_t*		graph;
+	que_thr_t*	thr;
+
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	if (!reserve_dict_mutex) {
+		graph = pars_sql(info, sql);
+	} else {
+		mutex_enter(&dict_sys->mutex);
+		graph = pars_sql(info, sql);
+		mutex_exit(&dict_sys->mutex);
+	}
+
+	ut_a(graph);
+
+	/* Set up the graph for this transaction before starting it */
+	trx->graph = NULL;
+	graph->trx = trx;
+	graph->fork_type = QUE_FORK_MYSQL_INTERFACE;
+	/* Assign first: the assertion itself must not have a side effect */
+	thr = que_fork_start_command(graph);
+	ut_a(thr);
+
+	que_run_threads(thr);
+
+	que_graph_free(graph);
+
+	return(trx->error_state);
+}
diff --git a/storage/xtradb/read/read0read.c b/storage/xtradb/read/read0read.c
new file mode 100644
index 00000000000..85adae4ddff
--- /dev/null
+++ b/storage/xtradb/read/read0read.c
@@ -0,0 +1,540 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file read/read0read.c
+Cursor read
+
+Created 2/16/1997 Heikki Tuuri
+*******************************************************/
+
+#include "read0read.h"
+
+#ifdef UNIV_NONINL
+#include "read0read.ic"
+#endif
+
+#include "srv0srv.h"
+#include "trx0sys.h"
+
+/*
+-------------------------------------------------------------------------------
+FACT A: Cursor read view on a secondary index sees only committed versions
+-------
+of the records in the secondary index or those versions of rows created
+by transaction which created a cursor before cursor was created even
+if transaction which created the cursor has changed that clustered index page.
+
+PROOF: We must show that read goes always to the clustered index record
+to see that record is visible in the cursor read view. Consider e.g.
+following table and SQL-clauses:
+
+create table t1(a int not null, b int, primary key(a), index(b));
+insert into t1 values (1,1),(2,2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select b from t1 where b >= 1;
+
+This query will use secondary key on the table t1. Now after the first fetch
+on this cursor if we do a update:
+
+update t1 set b = 5 where b = 2;
+
+Now second fetch of the cursor should not see record (2,5) instead it should
+see record (2,2).
+
+We also should show that if we have delete t1 where b = 5; we still
+can see record (2,2).
+
+When we access a secondary key record maximum transaction id is fetched
+from this record and this trx_id is compared to up_limit_id in the view.
+If trx_id in the record is greater or equal than up_limit_id in the view
+cluster record is accessed.  Because trx_id of the creating
+transaction is stored when this view was created to the list of
+trx_ids not seen by this read view previous version of the
+record is requested to be built. This is built using the clustered record.
+If the secondary key record is delete marked, its corresponding
+clustered record can already be purged only if the record's
+trx_id < low_limit_no. Purge can't remove any record deleted by a
+transaction which was active when cursor was created. But, we still
+may have a deleted secondary key record but no clustered record. But,
+this is not a problem because this case is handled in
+row_sel_get_clust_rec() function which is called
+whenever we note that this read view does not see trx_id in the
+record. Thus, we see correct version. Q. E. D.
+
+-------------------------------------------------------------------------------
+FACT B: Cursor read view on a clustered index sees only committed versions
+-------
+of the records in the clustered index or those versions of rows created
+by transaction which created a cursor before cursor was created even
+if transaction which created the cursor has changed that clustered index page.
+
+PROOF:  Consider e.g.following table and SQL-clauses:
+
+create table t1(a int not null, b int, primary key(a));
+insert into t1 values (1),(2);
+commit;
+
+Now consider that we have a cursor for a query
+
+select a from t1 where a >= 1;
+
+This query will use clustered key on the table t1. Now after the first fetch
+on this cursor if we do a update:
+
+update t1 set a = 5 where a = 2;
+
+Now second fetch of the cursor should not see record (5) instead it should
+see record (2).
+
+We also should show that if we have execute delete t1 where a = 5; after
+the cursor is opened we still can see record (2).
+
+When accessing clustered record we always check if this read view sees
+trx_id stored to clustered record. By default we don't see any changes
+if record trx_id >= low_limit_id i.e. change was made transaction
+which started after transaction which created the cursor. If row
+was changed by the future transaction a previous version of the
+clustered record is created. Thus we see only committed version in
+this case. We see all changes made by committed transactions i.e.
+record trx_id < up_limit_id. In this case we don't need to do anything,
+we already see correct version of the record. We don't see any changes
+made by active transaction except creating transaction. We have stored
+trx_id of creating transaction to list of trx_ids when this view was
+created. Thus we can easily see if this record was changed by the
+creating transaction. Because we already have clustered record we can
+access roll_ptr. Using this roll_ptr we can fetch undo record.
+We can now check that undo_no of the undo record is less than undo_no of the
+transaction which created a view when cursor was created. We see this
+clustered record only in case when record undo_no is less than undo_no
+in the view. If this is not true we build based on undo_rec previous
+version of the record. This record is found because purge can't remove
+records accessed by active transaction. Thus we see correct version. Q. E. D.
+-------------------------------------------------------------------------------
+FACT C: Purge does not remove any delete marked row that is visible
+-------
+to cursor view.
+
+TODO: prove this
+
+*/
+
+/*********************************************************************//**
+Allocates a read view and its array of n trx ids from the given heap.
+Only n_trx_ids and trx_ids are set here; all other fields are left unset.
+@return	own: read view struct */
+UNIV_INLINE
+read_view_t*
+read_view_create_low(
+/*=================*/
+	ulint		n,	/*!< in: number of cells in the trx_ids array */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+{
+	read_view_t*	rview = mem_heap_alloc(heap, sizeof *rview);
+
+	/* The id array is allocated from the same heap as the view */
+	rview->trx_ids = mem_heap_alloc(heap, n * sizeof *rview->trx_ids);
+	rview->n_trx_ids = n;
+
+	return(rview);
+}
+
+/*********************************************************************//**
+Makes a copy of the oldest existing read view, with the exception that also
+the creating trx of the oldest view is set as not visible in the 'copied'
+view. Opens a new view if no views currently exist. The view must be closed
+with ..._close. This is used in purge.
+@return	own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_oldest_copy_or_open_new(
+/*==============================*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or ut_dulint_zero
+					used in purge */
+	mem_heap_t*	heap)		/*!< in: memory heap from which
+					allocated */
+{
+	read_view_t*	old_view;
+	read_view_t*	view_copy;
+	ibool		needs_insert	= TRUE;
+	ulint		insert_done	= 0;
+	ulint		n;
+	ulint		i;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	old_view = UT_LIST_GET_LAST(trx_sys->view_list);
+
+	if (old_view == NULL) {
+		return(read_view_open_now(cr_trx_id, heap));
+	}
+
+	n = old_view->n_trx_ids;
+
+	if (!ut_dulint_is_zero(old_view->creator_trx_id)) {
+		n++;
+	} else {
+		needs_insert = FALSE;
+	}
+
+	view_copy = read_view_create_low(n, heap);
+
+	/* Insert the id of the creator in the right place of the descending
+	array of ids, if needs_insert is TRUE: */
+
+	for (i = 0; i < n; i++) {
+		if (needs_insert
+		    && (i >= old_view->n_trx_ids
+			|| ut_dulint_cmp(old_view->creator_trx_id,
+					 read_view_get_nth_trx_id(old_view, i))
+			> 0)) {
+
+			read_view_set_nth_trx_id(view_copy, i,
+						 old_view->creator_trx_id);
+			needs_insert = FALSE;
+			insert_done = 1;
+		} else {
+			read_view_set_nth_trx_id(view_copy, i,
+						 read_view_get_nth_trx_id(
+							 old_view,
+							 i - insert_done));
+		}
+	}
+
+	view_copy->creator_trx_id = cr_trx_id;
+
+	/* read_view_create_low() leaves these two unset: initialize them as
+	read_view_open_now() does, so that they are never read undefined */
+	view_copy->type = VIEW_NORMAL;
+	view_copy->undo_no = ut_dulint_zero;
+
+	view_copy->low_limit_no = old_view->low_limit_no;
+	view_copy->low_limit_id = old_view->low_limit_id;
+
+	if (n > 0) {
+		/* The last active transaction has the smallest id: */
+		view_copy->up_limit_id = read_view_get_nth_trx_id(
+			view_copy, n - 1);
+	} else {
+		view_copy->up_limit_id = old_view->up_limit_id;
+	}
+
+	UT_LIST_ADD_LAST(view_list, trx_sys->view_list, view_copy);
+
+	return(view_copy);
+}
+
+/*********************************************************************//**
+Opens a read view where exactly the transactions serialized before this
+point in time are seen in the view.
+@return	own: read view struct */
+UNIV_INTERN
+read_view_t*
+read_view_open_now(
+/*===============*/
+	trx_id_t	cr_trx_id,	/*!< in: trx_id of creating
+					transaction, or ut_dulint_zero
+					used in purge */
+	mem_heap_t*	heap)		/*!< in: memory heap from which
+					allocated */
+{
+	read_view_t*	view;
+	trx_t*		trx;
+	ulint		n_ids	= 0;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Reserve one id cell for every transaction in the trx list */
+	view = read_view_create_low(UT_LIST_GET_LEN(trx_sys->trx_list), heap);
+
+	view->creator_trx_id = cr_trx_id;
+	view->type = VIEW_NORMAL;
+	view->undo_no = ut_dulint_zero;
+
+	/* No future transactions should be visible in the view */
+
+	view->low_limit_no = trx_sys->max_trx_id;
+	view->low_limit_id = view->low_limit_no;
+
+	/* No active transaction should be visible, except cr_trx */
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		if (ut_dulint_cmp(trx->id, cr_trx_id) == 0
+		    || (trx->conc_state != TRX_ACTIVE
+			&& trx->conc_state != TRX_PREPARED)) {
+
+			/* The creator, or neither active nor prepared */
+			continue;
+		}
+
+		read_view_set_nth_trx_id(view, n_ids, trx->id);
+		n_ids++;
+
+		/* NOTE that a transaction whose trx number is <
+		trx_sys->max_trx_id can still be active, if it is
+		in the middle of its commit! Note that when a
+		transaction starts, we initialize trx->no to
+		ut_dulint_max. */
+
+		if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) {
+
+			view->low_limit_no = trx->no;
+		}
+	}
+
+	view->n_trx_ids = n_ids;
+
+	if (n_ids == 0) {
+		view->up_limit_id = view->low_limit_id;
+	} else {
+		/* The last active transaction has the smallest id: */
+		view->up_limit_id = read_view_get_nth_trx_id(view, n_ids - 1);
+	}
+
+	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+
+	return(view);
+}
+
+/*********************************************************************//**
+Closes a read view: removes it from trx_sys->view_list. Needs kernel_mutex. */
+UNIV_INTERN
+void
+read_view_close(
+/*============*/
+	read_view_t*	view)	/*!< in: read view */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	UT_LIST_REMOVE(view_list, trx_sys->view_list, view);
+}
+
+/*********************************************************************//**
+Closes a consistent read view for MySQL. This function is called at an SQL
+statement end if the trx isolation level is <= TRX_ISO_READ_COMMITTED. */
+UNIV_INTERN
+void
+read_view_close_for_mysql(
+/*======================*/
+	trx_t*	trx)	/*!< in: trx whose global_read_view is not NULL */
+{
+	ut_a(trx->global_read_view);
+
+	mutex_enter(&kernel_mutex);
+
+	read_view_close(trx->global_read_view);
+
+	mem_heap_empty(trx->global_read_view_heap);
+	/* The view is gone: clear both read view pointers of the trx */
+	trx->read_view = NULL;
+	trx->global_read_view = NULL;
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Prints a read view to stderr: the view type, the low limit trx number,
+the up and low limit trx ids and then each individually stored trx id. */
+UNIV_INTERN
+void
+read_view_print(
+/*============*/
+	const read_view_t*	view)	/*!< in: read view */
+{
+	const ulint	n_stored	= view->n_trx_ids;
+	ulint		pos;
+
+	if (view->type != VIEW_HIGH_GRANULARITY) {
+		fprintf(stderr, "Normal read view\n");
+	} else {
+		fprintf(stderr,
+			"High-granularity read view undo_n:o %lu %lu\n",
+			(ulong) ut_dulint_get_high(view->undo_no),
+			(ulong) ut_dulint_get_low(view->undo_no));
+	}
+
+	fprintf(stderr, "Read view low limit trx n:o %lu %lu\n",
+		(ulong) ut_dulint_get_high(view->low_limit_no),
+		(ulong) ut_dulint_get_low(view->low_limit_no));
+
+	fprintf(stderr, "Read view up limit trx id " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(view->up_limit_id));
+
+	fprintf(stderr, "Read view low limit trx id " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(view->low_limit_id));
+
+	fprintf(stderr, "Read view individually stored trx ids:\n");
+
+	/* One output line per stored id, in array order */
+	for (pos = 0; pos < n_stored; pos++) {
+		fprintf(stderr, "Read view trx id " TRX_ID_FMT "\n",
+			TRX_ID_PREP_PRINTF(
+				read_view_get_nth_trx_id(view, pos)));
+	}
+}
+
+/*********************************************************************//**
+Creates a high-granularity consistent cursor view for MySQL cursors, where
+changes made by the creating trx after the cursor is created, or by future
+transactions, are not visible.
+@return	cursor view; closed with read_cursor_view_close_for_mysql() */
+UNIV_INTERN
+cursor_view_t*
+read_cursor_view_create_for_mysql(
+/*==============================*/
+	trx_t*	cr_trx)	/*!< in: trx where cursor view is created */
+{
+	cursor_view_t*	curview;
+	read_view_t*	view;
+	mem_heap_t*	heap;
+	trx_t*		trx;
+	ulint		n;
+
+	ut_a(cr_trx);
+
+	/* Use larger heap than in trx_create when creating a read_view
+	because cursors are quite long. */
+
+	heap = mem_heap_create(512);
+
+	curview = (cursor_view_t*) mem_heap_alloc(heap, sizeof(cursor_view_t));
+	curview->heap = heap;
+
+	/* Drop cursor tables from consideration when evaluating the need of
+	auto-commit */
+	curview->n_mysql_tables_in_use = cr_trx->n_mysql_tables_in_use;
+	cr_trx->n_mysql_tables_in_use = 0;
+
+	mutex_enter(&kernel_mutex);
+
+	curview->read_view = read_view_create_low(
+		UT_LIST_GET_LEN(trx_sys->trx_list), curview->heap);
+
+	view = curview->read_view;
+	view->creator_trx_id = cr_trx->id;
+	view->type = VIEW_HIGH_GRANULARITY;
+	view->undo_no = cr_trx->undo_no;
+
+	/* No future transactions should be visible in the view */
+
+	view->low_limit_no = trx_sys->max_trx_id;
+	view->low_limit_id = view->low_limit_no;
+
+	n = 0;
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	/* No active transaction should be visible: unlike in
+	read_view_open_now(), the creating trx is not left out of trx_ids */
+	while (trx) {
+
+		if (trx->conc_state == TRX_ACTIVE
+		    || trx->conc_state == TRX_PREPARED) {
+
+			read_view_set_nth_trx_id(view, n, trx->id);
+
+			n++;
+
+			/* NOTE that a transaction whose trx number is <
+			trx_sys->max_trx_id can still be active, if it is
+			in the middle of its commit! Note that when a
+			transaction starts, we initialize trx->no to
+			ut_dulint_max. */
+
+			if (ut_dulint_cmp(view->low_limit_no, trx->no) > 0) {
+
+				view->low_limit_no = trx->no;
+			}
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	view->n_trx_ids = n;
+
+	if (n > 0) {
+		/* The last active transaction has the smallest id: */
+		view->up_limit_id = read_view_get_nth_trx_id(view, n - 1);
+	} else {
+		view->up_limit_id = view->low_limit_id;
+	}
+
+	UT_LIST_ADD_FIRST(view_list, trx_sys->view_list, view);
+
+	mutex_exit(&kernel_mutex);
+
+	return(curview);
+}
+
+/*********************************************************************//**
+Close a given consistent cursor view for mysql and restore global read view
+back to a transaction read view. */
+UNIV_INTERN
+void
+read_cursor_view_close_for_mysql(
+/*=============================*/
+	trx_t*		trx,	/*!< in: trx */
+	cursor_view_t*	curview)/*!< in: cursor view to be closed */
+{
+	ut_a(curview);
+	ut_a(curview->read_view);
+	ut_a(curview->heap);
+
+	/* Add cursor's tables to the global count of active tables that
+	belong to this transaction */
+	trx->n_mysql_tables_in_use += curview->n_mysql_tables_in_use;
+
+	mutex_enter(&kernel_mutex);
+
+	read_view_close(curview->read_view);
+	trx->read_view = trx->global_read_view;
+
+	mutex_exit(&kernel_mutex);
+	/* curview and its read view were allocated from this heap */
+	mem_heap_free(curview->heap);
+}
+
+/*********************************************************************//**
+Makes the read view of the given consistent cursor view the read view of
+the transaction. When no cursor view is given (NULL), the global read view
+of the transaction is made its read view again. */
+UNIV_INTERN
+void
+read_cursor_set_for_mysql(
+/*======================*/
+	trx_t*		trx,	/*!< in: transaction where cursor is set */
+	cursor_view_t*	curview)/*!< in: consistent cursor view to be set */
+{
+	ut_a(trx);
+
+	mutex_enter(&kernel_mutex);
+
+	/* A NULL curview means: go back to the global read view of the
+	transaction */
+	trx->read_view = UNIV_LIKELY(curview != NULL)
+		? curview->read_view
+		: trx->global_read_view;
+
+	mutex_exit(&kernel_mutex);
+}
diff --git a/storage/xtradb/rem/rem0cmp.c b/storage/xtradb/rem/rem0cmp.c
new file mode 100644
index 00000000000..8ee434f85da
--- /dev/null
+++ b/storage/xtradb/rem/rem0cmp.c
@@ -0,0 +1,1204 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file rem/rem0cmp.c
+Comparison services for records
+
+Created 7/1/1994 Heikki Tuuri
+************************************************************************/
+
+#include "rem0cmp.h"
+
+#ifdef UNIV_NONINL
+#include "rem0cmp.ic"
+#endif
+
+#include "srv0srv.h"
+
+/*		ALPHABETICAL ORDER
+		==================
+
+The records are put into alphabetical order in the following
+way: let F be the first field where two records disagree.
+If there is a character in some position n where the
+records disagree, the order is determined by comparison of
+the characters at position n, possibly after
+collating transformation. If there is no such character,
+but the corresponding fields have different lengths, then
+if the data type of the fields is paddable,
+shorter field is padded with a padding character. If the
+data type is not paddable, longer field is considered greater.
+Finally, the SQL null is smaller than any other value.
+
+At the present, the comparison functions return 0 in the case,
+where two records disagree only in the way that one
+has more fields than the other. */
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields);/*!< in/out: number of already
+				completely  matched fields; when function
+				returns, contains the value for current
+				comparison */
+#endif /* UNIV_DEBUG */
+/*************************************************************//**
+This function is used to compare two data fields for which the data type
+is such that we must use MySQL code to compare them. The prototype here
+must be a copy of the one in ha_innobase.cc!
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+extern
+int
+innobase_mysql_cmp(
+/*===============*/
+	int		mysql_type,	/*!< in: MySQL type */
+	uint		charset_number,	/*!< in: number of the charset */
+	const unsigned char* a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const unsigned char* b,		/*!< in: data field */
+	unsigned int	b_length);	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+/*********************************************************************//**
+Maps a character code to its collation order position by indexing
+srv_latin1_ordering[] with the code (no range check is done here). This is
+only used for the latin1 char set; MySQL compares the other char sets.
+@return	collation order position */
+UNIV_INLINE
+ulint
+cmp_collate(
+/*========*/
+	ulint	code)	/*!< in: code of a character stored in database record */
+{
+	return((ulint) srv_latin1_ordering[code]);
+}
+
+/*************************************************************//**
+Checks whether two columns can be treated as equal for comparison purposes.
+@return	TRUE if the columns are considered equal in comparisons */
+UNIV_INTERN
+ibool
+cmp_cols_are_equal(
+/*===============*/
+	const dict_col_t*	col1,	/*!< in: column 1 */
+	const dict_col_t*	col2,	/*!< in: column 2 */
+	ibool			check_charsets)
+					/*!< in: whether to check charsets */
+{
+	if (dtype_is_non_binary_string_type(col1->mtype, col1->prtype)
+	    && dtype_is_non_binary_string_type(col2->mtype, col2->prtype)) {
+
+		/* Two non-binary strings: comparable when the
+		charset-collations agree, or when the caller did not ask
+		for the charsets to be checked */
+
+		if (!check_charsets) {
+			return(TRUE);
+		}
+
+		return(dtype_get_charset_coll(col1->prtype)
+		       == dtype_get_charset_coll(col2->prtype));
+	}
+
+	if (dtype_is_binary_string_type(col1->mtype, col1->prtype)
+	    && dtype_is_binary_string_type(col2->mtype, col2->prtype)) {
+
+		/* Two binary strings can always be compared */
+
+		return(TRUE);
+	}
+
+	if (col1->mtype != col2->mtype) {
+		return(FALSE);
+	}
+
+	if (col1->mtype != DATA_INT) {
+		/* Same main type, and it is not an integer */
+		return(TRUE);
+	}
+
+	/* The storage format of an unsigned integer is different
+	from a signed integer: in a signed integer we OR
+	0x8000... to the value of positive integers. Hence both the
+	signedness and the length have to agree. */
+	return((col1->prtype & DATA_UNSIGNED)
+	       == (col2->prtype & DATA_UNSIGNED)
+	       && col1->len == col2->len);
+}
+
+/*************************************************************//**
+Innobase uses this function to compare two data fields for which the data type
+is such that we must compare whole fields or call MySQL to do the comparison
+@return	1, 0, -1, if a is greater, equal, less than b, respectively */
+static
+int
+cmp_whole_field(
+/*============*/
+	ulint		mtype,		/*!< in: main type */
+	ulint		prtype,		/*!< in: precise type */
+	const byte*	a,		/*!< in: data field */
+	unsigned int	a_length,	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+	const byte*	b,		/*!< in: data field */
+	unsigned int	b_length)	/*!< in: data field length,
+					not UNIV_SQL_NULL */
+{
+	float		f_1;
+	float		f_2;
+	double		d_1;
+	double		d_2;
+	int		swap_flag	= 1;
+
+	switch (mtype) {
+
+	case DATA_DECIMAL:
+		/* Remove preceding spaces */
+		for (; a_length && *a == ' '; a++, a_length--);
+		for (; b_length && *b == ' '; b++, b_length--);
+		/* A field may be empty here: check length before the sign */
+		if (a_length > 0 && *a == '-') {
+			if (b_length == 0 || *b != '-') {
+				return(-1);
+			}
+			/* Both negative: skip the signs, invert the result */
+			a++; b++;
+			a_length--;
+			b_length--;
+
+			swap_flag = -1;
+
+		} else if (b_length > 0 && *b == '-') {
+
+			return(1);
+		}
+
+		while (a_length > 0 && (*a == '+' || *a == '0')) {
+			a++; a_length--;
+		}
+
+		while (b_length > 0 && (*b == '+' || *b == '0')) {
+			b++; b_length--;
+		}
+
+		if (a_length != b_length) {
+			if (a_length < b_length) {
+				return(-swap_flag);
+			}
+
+			return(swap_flag);
+		}
+
+		while (a_length > 0 && *a == *b) {
+
+			a++; b++; a_length--;
+		}
+
+		if (a_length == 0) {
+
+			return(0);
+		}
+
+		if (*a > *b) {
+			return(swap_flag);
+		}
+
+		return(-swap_flag);
+	case DATA_DOUBLE:
+		d_1 = mach_double_read(a);
+		d_2 = mach_double_read(b);
+
+		if (d_1 > d_2) {
+			return(1);
+		} else if (d_2 > d_1) {
+			return(-1);
+		}
+
+		return(0);
+
+	case DATA_FLOAT:
+		f_1 = mach_float_read(a);
+		f_2 = mach_float_read(b);
+
+		if (f_1 > f_2) {
+			return(1);
+		} else if (f_2 > f_1) {
+			return(-1);
+		}
+
+		return(0);
+	case DATA_BLOB:
+		if (prtype & DATA_BINARY_TYPE) {
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Error: comparing a binary BLOB"
+				" with a character set sensitive\n"
+				"InnoDB: comparison!\n");
+		}
+		/* fall through */
+	case DATA_VARMYSQL:
+	case DATA_MYSQL:
+		return(innobase_mysql_cmp(
+			       (int)(prtype & DATA_MYSQL_TYPE_MASK),
+			       (uint)dtype_get_charset_coll(prtype),
+			       a, a_length, b, b_length));
+	default:
+		fprintf(stderr,
+			"InnoDB: unknown type number %lu\n",
+			(ulong) mtype);
+		ut_error;
+	}
+
+	return(0);
+}
+
+/*************************************************************//**
+This function is used to compare two data fields for which we know the
+data type.
+@return	1, 0, -1, if data1 is greater, equal, less than data2, respectively */
+UNIV_INTERN
+int
+cmp_data_data_slow(
+/*===============*/
+	ulint		mtype,	/*!< in: main type */
+	ulint		prtype,	/*!< in: precise type */
+	const byte*	data1,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len1,	/*!< in: data field length or UNIV_SQL_NULL */
+	const byte*	data2,	/*!< in: data field (== a pointer to a memory
+				buffer) */
+	ulint		len2)	/*!< in: data field length or UNIV_SQL_NULL */
+{
+	ulint	data1_byte;
+	ulint	data2_byte;
+	ulint	cur_bytes;
+
+	if (len1 == UNIV_SQL_NULL || len2 == UNIV_SQL_NULL) {
+
+		if (len1 == len2) {
+
+			return(0);
+		}
+
+		if (len1 == UNIV_SQL_NULL) {
+			/* We define the SQL null to be the smallest possible
+			value of a field in the alphabetical order */
+
+			return(-1);
+		}
+
+		return(1);
+	}
+	/* These types are compared as whole fields by cmp_whole_field() */
+	if (mtype >= DATA_FLOAT
+	    || (mtype == DATA_BLOB
+		&& 0 == (prtype & DATA_BINARY_TYPE)
+		&& dtype_get_charset_coll(prtype)
+		!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+		return(cmp_whole_field(mtype, prtype,
+				       data1, (unsigned) len1,
+				       data2, (unsigned) len2));
+	}
+
+	/* Compare then the fields byte by byte; a field that has run out
+	of bytes is continued with its pad character, if the type has one */
+	cur_bytes = 0;
+
+	for (;;) {
+		if (len1 <= cur_bytes) {
+			if (len2 <= cur_bytes) {
+
+				return(0);
+			}
+			/* data1 has run out but data2 has not: pad data1 */
+			data1_byte = dtype_get_pad_char(mtype, prtype);
+
+			if (data1_byte == ULINT_UNDEFINED) {
+				/* No pad char: the shorter field is less */
+				return(-1);
+			}
+		} else {
+			data1_byte = *data1;
+		}
+
+		if (len2 <= cur_bytes) {
+			data2_byte = dtype_get_pad_char(mtype, prtype);
+
+			if (data2_byte == ULINT_UNDEFINED) {
+
+				return(1);
+			}
+		} else {
+			data2_byte = *data2;
+		}
+
+		if (data1_byte == data2_byte) {
+			/* If the bytes are equal, they will remain such even
+			after the collation transformation below */
+
+			goto next_byte;
+		}
+		/* Map the differing bytes through cmp_collate() */
+		if (mtype <= DATA_CHAR
+		    || (mtype == DATA_BLOB
+			&& 0 == (prtype & DATA_BINARY_TYPE))) {
+
+			data1_byte = cmp_collate(data1_byte);
+			data2_byte = cmp_collate(data2_byte);
+		}
+
+		if (data1_byte > data2_byte) {
+
+			return(1);
+		} else if (data1_byte < data2_byte) {
+
+			return(-1);
+		}
+next_byte:
+		/* Next byte */
+		cur_bytes++;
+		data1++;
+		data2++;
+	}
+
+	return(0);		/* Not reached */
+}
+
+/*************************************************************//**
+This function is used to compare a data tuple to a physical record.
+Only dtuple->n_fields_cmp first fields are taken into account for
+the data tuple! If we denote by n = n_fields_cmp, then rec must
+have either m >= n fields, or it must differ from dtuple in some of
+the m fields rec has. If rec has an externally stored field we do not
+compare it but return with value 0 if such a comparison should be
+made.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared, or until
+the first externally stored field in rec */
+UNIV_INTERN
+int
+cmp_dtuple_rec_with_match(
+/*======================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when function returns,
+				contains the value for current comparison */
+	ulint*		matched_bytes) /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when function returns, contains the
+				value for current comparison */
+{
+	const dfield_t*	dtuple_field;	/* current field in logical record */
+	ulint		dtuple_f_len;	/* the length of the current field
+					in the logical record */
+	const byte*	dtuple_b_ptr;	/* pointer to the current byte in
+					logical field data */
+	ulint		dtuple_byte;	/* value of current byte to be compared
+					in dtuple*/
+	ulint		rec_f_len;	/* length of current field in rec */
+	const byte*	rec_b_ptr;	/* pointer to the current byte in
+					rec field */
+	ulint		rec_byte;	/* value of current byte to be
+					compared in rec */
+	ulint		cur_field;	/* current field number */
+	ulint		cur_bytes;	/* number of already matched bytes
+					in current field */
+	int		ret = 3333;	/* return value; 3333 is a placeholder outside the asserted -1..1 range */
+
+	ut_ad(dtuple && rec && matched_fields && matched_bytes);
+	ut_ad(dtuple_check_typed(dtuple));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	cur_field = *matched_fields;
+	cur_bytes = *matched_bytes;
+
+	ut_ad(cur_field <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(cur_field <= rec_offs_n_fields(offsets));
+
+	if (cur_bytes == 0 && cur_field == 0) {
+		ulint	rec_info = rec_get_info_bits(rec,
+						     rec_offs_comp(offsets));
+		ulint	tup_info = dtuple_get_info_bits(dtuple);
+
+		if (UNIV_UNLIKELY(rec_info & REC_INFO_MIN_REC_FLAG)) {
+			ret = !(tup_info & REC_INFO_MIN_REC_FLAG);	/* 0 if both carry the flag, else 1 */
+			goto order_resolved;
+		} else if (UNIV_UNLIKELY(tup_info & REC_INFO_MIN_REC_FLAG)) {
+			ret = -1;
+			goto order_resolved;
+		}
+	}
+
+	/* Match fields in a loop; stop if we run out of fields in dtuple
+	or find an externally stored field */
+
+	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+		{
+			const dtype_t*	type
+				= dfield_get_type(dtuple_field);
+
+			mtype = type->mtype;
+			prtype = type->prtype;
+		}
+
+		dtuple_f_len = dfield_get_len(dtuple_field);
+
+		rec_b_ptr = rec_get_nth_field(rec, offsets,
+					      cur_field, &rec_f_len);
+
+		/* If we have not yet matched any bytes, it may be that one
+		or both of the fields are SQL null, or the record or dtuple
+		may be the predefined minimum record, or the field is
+		externally stored */
+
+		if (UNIV_LIKELY(cur_bytes == 0)) {
+			if (rec_offs_nth_extern(offsets, cur_field)) {
+				/* We do not compare to an externally
+				stored field */
+
+				ret = 0;
+
+				goto order_resolved;
+			}
+
+			if (dtuple_f_len == UNIV_SQL_NULL) {
+				if (rec_f_len == UNIV_SQL_NULL) {
+
+					goto next_field;
+				}
+
+				ret = -1;
+				goto order_resolved;
+			} else if (rec_f_len == UNIV_SQL_NULL) {
+				/* We define the SQL null to be the
+				smallest possible value of a field
+				in the alphabetical order */
+
+				ret = 1;
+				goto order_resolved;
+			}
+		}
+
+		if (mtype >= DATA_FLOAT
+		    || (mtype == DATA_BLOB
+			&& 0 == (prtype & DATA_BINARY_TYPE)
+			&& dtype_get_charset_coll(prtype)
+			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+			ret = cmp_whole_field(mtype, prtype,
+					      dfield_get_data(dtuple_field),
+					      (unsigned) dtuple_f_len,
+					      rec_b_ptr, (unsigned) rec_f_len);
+
+			if (ret != 0) {
+				cur_bytes = 0;	/* whole-field compare reports no byte count */
+
+				goto order_resolved;
+			} else {
+				goto next_field;
+			}
+		}
+
+		/* Set the pointers at the current byte */
+
+		rec_b_ptr = rec_b_ptr + cur_bytes;
+		dtuple_b_ptr = (byte*)dfield_get_data(dtuple_field)
+			+ cur_bytes;
+		/* Then compare the fields byte by byte */
+
+		for (;;) {
+			if (UNIV_UNLIKELY(rec_f_len <= cur_bytes)) {
+				if (dtuple_f_len <= cur_bytes) {
+
+					goto next_field;
+				}
+
+				rec_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec_byte == ULINT_UNDEFINED) {
+					ret = 1;	/* no pad: shorter rec is less */
+
+					goto order_resolved;
+				}
+			} else {
+				rec_byte = *rec_b_ptr;
+			}
+
+			if (UNIV_UNLIKELY(dtuple_f_len <= cur_bytes)) {
+				dtuple_byte = dtype_get_pad_char(mtype,
+								 prtype);
+
+				if (dtuple_byte == ULINT_UNDEFINED) {
+					ret = -1;	/* no pad: shorter dtuple is less */
+
+					goto order_resolved;
+				}
+			} else {
+				dtuple_byte = *dtuple_b_ptr;
+			}
+
+			if (dtuple_byte == rec_byte) {
+				/* If the bytes are equal, they will
+				remain such even after the collation
+				transformation below */
+
+				goto next_byte;
+			}
+
+			if (mtype <= DATA_CHAR
+			    || (mtype == DATA_BLOB
+				&& !(prtype & DATA_BINARY_TYPE))) {
+
+				rec_byte = cmp_collate(rec_byte);
+				dtuple_byte = cmp_collate(dtuple_byte);
+			}
+
+			ret = (int) (dtuple_byte - rec_byte);
+			if (UNIV_LIKELY(ret)) {
+				if (ret < 0) {
+					ret = -1;	/* normalize the difference to -1 */
+					goto order_resolved;
+				} else {
+					ret = 1;	/* normalize the difference to 1 */
+					goto order_resolved;
+				}
+			}
+next_byte:
+			/* Advance to the next byte of both fields */
+			cur_bytes++;
+			rec_b_ptr++;
+			dtuple_b_ptr++;
+		}
+
+next_field:
+		cur_field++;
+		cur_bytes = 0;
+	}
+
+	ut_ad(cur_bytes == 0);
+
+	ret = 0;	/* If we ran out of fields, dtuple was equal to rec
+			up to the common fields */
+order_resolved:
+	ut_ad((ret >= - 1) && (ret <= 1));
+	ut_ad(ret == cmp_debug_dtuple_rec_with_match(dtuple, rec, offsets,
+						     matched_fields));
+	ut_ad(*matched_fields == cur_field); /* In the debug version, the
+					     above cmp_debug_... sets
+					     *matched_fields to a value */
+	*matched_fields = cur_field;
+	*matched_bytes = cur_bytes;
+
+	return(ret);
+}
+
+/**************************************************************//**
+Compares a data tuple to a physical record, starting from field 0, byte 0.
+@see cmp_dtuple_rec_with_match
+@return 1, 0, -1, if dtuple is greater, equal, less than rec, respectively */
+UNIV_INTERN
+int
+cmp_dtuple_rec(
+/*===========*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	matched_fields	= 0;	/* nothing matched yet */
+	ulint	matched_bytes	= 0;	/* nothing matched yet */
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	return(cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+					 &matched_fields, &matched_bytes));
+}
+
+/**************************************************************//**
+Checks if a dtuple is a prefix of a record. The last field in dtuple
+is allowed to be a prefix of the corresponding field in the record.
+@return	TRUE if prefix */
+UNIV_INTERN
+ibool
+cmp_dtuple_is_prefix_of_rec(
+/*========================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	n_fields;		/* number of fields in dtuple */
+	ulint	matched_fields	= 0;
+	ulint	matched_bytes	= 0;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	n_fields = dtuple_get_n_fields(dtuple);
+
+	if (n_fields > rec_offs_n_fields(offsets)) {
+		/* dtuple has more fields than rec: cannot be a prefix */
+		return(FALSE);
+	}
+
+	cmp_dtuple_rec_with_match(dtuple, rec, offsets,
+				  &matched_fields, &matched_bytes);
+	if (matched_fields == n_fields) {
+		/* every field of dtuple matched completely */
+		return(TRUE);
+	}
+
+	if (matched_fields == n_fields - 1	/* all but the last field matched, */
+	    && matched_bytes == dfield_get_len(	/* and all bytes of the last one did */
+		    dtuple_get_nth_field(dtuple, n_fields - 1))) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/*************************************************************//**
+Compare the first dict_index_get_n_unique(index) columns of two physical
+records, none of which are stored externally.
+@return	1, 0, -1 if rec1 is greater, equal, less, respectively, than rec2 */
+UNIV_INTERN
+int
+cmp_rec_rec_simple(
+/*===============*/
+	const rec_t*		rec1,	/*!< in: physical record */
+	const rec_t*		rec2,	/*!< in: physical record */
+	const ulint*		offsets1,/*!< in: rec_get_offsets(rec1, ...) */
+	const ulint*		offsets2,/*!< in: rec_get_offsets(rec2, ...) */
+	const dict_index_t*	index,	/*!< in: data dictionary index */
+	ibool*			null_eq)/*!< out: set to TRUE if found
+					matching null values; may be NULL */
+{
+	ulint		rec1_f_len;	/*!< length of current field in rec1 */
+	const byte*	rec1_b_ptr;	/*!< pointer to the current byte
+					in rec1 field */
+	ulint		rec1_byte;	/*!< value of current byte to be
+					compared in rec1 */
+	ulint		rec2_f_len;	/*!< length of current field in rec2 */
+	const byte*	rec2_b_ptr;	/*!< pointer to the current byte
+					in rec2 field */
+	ulint		rec2_byte;	/*!< value of current byte to be
+					compared in rec2 */
+	ulint		cur_field;	/*!< current field number */
+	ulint		n_uniq;		/*!< number of fields to compare */
+
+	n_uniq = dict_index_get_n_unique(index);
+	ut_ad(rec_offs_n_fields(offsets1) >= n_uniq);
+	ut_ad(rec_offs_n_fields(offsets2) >= n_uniq);
+
+	ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+	for (cur_field = 0; cur_field < n_uniq; cur_field++) {
+
+		ulint	cur_bytes;
+		ulint	mtype;
+		ulint	prtype;
+
+		{
+			const dict_col_t*	col
+				= dict_index_get_nth_col(index, cur_field);
+
+			mtype = col->mtype;
+			prtype = col->prtype;
+		}
+
+		ut_ad(!rec_offs_nth_extern(offsets1, cur_field));
+		ut_ad(!rec_offs_nth_extern(offsets2, cur_field));
+
+		rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+					       cur_field, &rec1_f_len);
+		rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+					       cur_field, &rec2_f_len);
+
+		if (rec1_f_len == UNIV_SQL_NULL
+		    || rec2_f_len == UNIV_SQL_NULL) {
+
+			if (rec1_f_len == rec2_f_len) {
+				if (null_eq) {
+					*null_eq = TRUE;
+				}
+
+				goto next_field;
+
+			} else if (rec2_f_len == UNIV_SQL_NULL) {
+
+				/* We define the SQL null to be the
+				smallest possible value of a field
+				in the alphabetical order */
+
+				return(1);
+			} else {
+				return(-1);	/* only rec1 is SQL NULL */
+			}
+		}
+
+		if (mtype >= DATA_FLOAT
+		    || (mtype == DATA_BLOB
+			&& 0 == (prtype & DATA_BINARY_TYPE)
+			&& dtype_get_charset_coll(prtype)
+			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+			int ret = cmp_whole_field(mtype, prtype,
+						  rec1_b_ptr,
+						  (unsigned) rec1_f_len,
+						  rec2_b_ptr,
+						  (unsigned) rec2_f_len);
+			if (ret) {
+				return(ret);
+			}
+
+			goto next_field;
+		}
+
+		/* Compare the fields byte by byte */
+		for (cur_bytes = 0;; cur_bytes++, rec1_b_ptr++, rec2_b_ptr++) {
+			if (rec2_f_len <= cur_bytes) {
+
+				if (rec1_f_len <= cur_bytes) {
+
+					goto next_field;
+				}
+
+				rec2_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec2_byte == ULINT_UNDEFINED) {
+					return(1);	/* no pad: shorter rec2 is less */
+				}
+			} else {
+				rec2_byte = *rec2_b_ptr;
+			}
+
+			if (rec1_f_len <= cur_bytes) {
+				rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec1_byte == ULINT_UNDEFINED) {
+					return(-1);	/* no pad: shorter rec1 is less */
+				}
+			} else {
+				rec1_byte = *rec1_b_ptr;
+			}
+
+			if (rec1_byte == rec2_byte) {
+				/* If the bytes are equal, they will remain
+				such even after the collation transformation
+				below */
+
+				continue;
+			}
+
+			if (mtype <= DATA_CHAR
+			    || (mtype == DATA_BLOB
+				&& !(prtype & DATA_BINARY_TYPE))) {
+
+				rec1_byte = cmp_collate(rec1_byte);
+				rec2_byte = cmp_collate(rec2_byte);
+			}
+
+			if (rec1_byte < rec2_byte) {
+				return(-1);
+			} else if (rec1_byte > rec2_byte) {
+				return(1);
+			}
+		}
+next_field:
+		continue;
+	}
+
+	/* If we ran out of fields, rec1 was equal to rec2. */
+	return(0);
+}
+
+/*************************************************************//**
+This function is used to compare two physical records. Only the common
+first fields are compared, and if an externally stored field is
+encountered, then 0 is returned.
+@return 1, 0, -1 if rec1 is greater, equal, less, respectively */
+UNIV_INTERN
+int
+cmp_rec_rec_with_match(
+/*===================*/
+	const rec_t*	rec1,	/*!< in: physical record */
+	const rec_t*	rec2,	/*!< in: physical record */
+	const ulint*	offsets1,/*!< in: rec_get_offsets(rec1, index) */
+	const ulint*	offsets2,/*!< in: rec_get_offsets(rec2, index) */
+	dict_index_t*	index,	/*!< in: data dictionary index */
+	ulint*		matched_fields, /*!< in/out: number of already completely
+				matched fields; when the function returns,
+				contains the value for the current
+				comparison */
+	ulint*		matched_bytes, /*!< in/out: number of already matched
+				bytes within the first field not completely
+				matched; when the function returns, contains
+				the value for the current comparison */
+	ulint		stats_method)	/*!< in: SRV_STATS_METHOD_NULLS_EQUAL makes two SQL NULL fields match; any other value yields -1 for them */
+{
+	ulint		rec1_n_fields;	/* the number of fields in rec1 */
+	ulint		rec1_f_len;	/* length of current field in rec1 */
+	const byte*	rec1_b_ptr;	/* pointer to the current byte
+					in rec1 field */
+	ulint		rec1_byte;	/* value of current byte to be
+					compared in rec1 */
+	ulint		rec2_n_fields;	/* the number of fields in rec2 */
+	ulint		rec2_f_len;	/* length of current field in rec2 */
+	const byte*	rec2_b_ptr;	/* pointer to the current byte
+					in rec2 field */
+	ulint		rec2_byte;	/* value of current byte to be
+					compared in rec2 */
+	ulint		cur_field;	/* current field number */
+	ulint		cur_bytes;	/* number of already matched
+					bytes in current field */
+	int		ret = 0;	/* return value */
+	ulint		comp;		/* rec_offs_comp(offsets1) */
+
+	ut_ad(rec1 && rec2 && index);
+	ut_ad(rec_offs_validate(rec1, index, offsets1));
+	ut_ad(rec_offs_validate(rec2, index, offsets2));
+	ut_ad(rec_offs_comp(offsets1) == rec_offs_comp(offsets2));
+
+	comp = rec_offs_comp(offsets1);
+	rec1_n_fields = rec_offs_n_fields(offsets1);
+	rec2_n_fields = rec_offs_n_fields(offsets2);
+
+	cur_field = *matched_fields;
+	cur_bytes = *matched_bytes;
+
+	/* Match fields in a loop */
+
+	while ((cur_field < rec1_n_fields) && (cur_field < rec2_n_fields)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+			/* This is for the insert buffer B-tree. */
+			mtype = DATA_BINARY;
+			prtype = 0;
+		} else {
+			const dict_col_t*	col
+				= dict_index_get_nth_col(index, cur_field);
+
+			mtype = col->mtype;
+			prtype = col->prtype;
+		}
+
+		rec1_b_ptr = rec_get_nth_field(rec1, offsets1,
+					       cur_field, &rec1_f_len);
+		rec2_b_ptr = rec_get_nth_field(rec2, offsets2,
+					       cur_field, &rec2_f_len);
+
+		if (cur_bytes == 0) {
+			if (cur_field == 0) {
+				/* Test if rec1 is the predefined minimum
+				record */
+				if (UNIV_UNLIKELY(rec_get_info_bits(rec1, comp)
+						  & REC_INFO_MIN_REC_FLAG)) {
+
+					if (!(rec_get_info_bits(rec2, comp)
+					      & REC_INFO_MIN_REC_FLAG)) {
+						ret = -1;
+					}
+
+					goto order_resolved;
+
+				} else if (UNIV_UNLIKELY
+					   (rec_get_info_bits(rec2, comp)
+					    & REC_INFO_MIN_REC_FLAG)) {
+
+					ret = 1;
+
+					goto order_resolved;
+				}
+			}
+
+			if (rec_offs_nth_extern(offsets1, cur_field)
+			    || rec_offs_nth_extern(offsets2, cur_field)) {
+				/* We do not compare to an externally
+				stored field; ret is still 0 here */
+
+				goto order_resolved;
+			}
+
+			if (rec1_f_len == UNIV_SQL_NULL
+			    || rec2_f_len == UNIV_SQL_NULL) {
+
+				if (rec1_f_len == rec2_f_len) {
+
+					if (stats_method == SRV_STATS_METHOD_NULLS_EQUAL) {
+						goto next_field;
+					} else {
+						ret = -1;	/* two NULLs are treated as different */
+					}
+
+				} else if (rec2_f_len == UNIV_SQL_NULL) {
+
+					/* We define the SQL null to be the
+					smallest possible value of a field
+					in the alphabetical order */
+
+					ret = 1;
+				} else {
+					ret = -1;
+				}
+
+				goto order_resolved;
+			}
+		}
+
+		if (mtype >= DATA_FLOAT
+		    || (mtype == DATA_BLOB
+			&& 0 == (prtype & DATA_BINARY_TYPE)
+			&& dtype_get_charset_coll(prtype)
+			!= DATA_MYSQL_LATIN1_SWEDISH_CHARSET_COLL)) {
+
+			ret = cmp_whole_field(mtype, prtype,
+					      rec1_b_ptr,
+					      (unsigned) rec1_f_len,
+					      rec2_b_ptr,
+					      (unsigned) rec2_f_len);
+			if (ret != 0) {
+				cur_bytes = 0;
+
+				goto order_resolved;
+			} else {
+				goto next_field;
+			}
+		}
+
+		/* Set the pointers at the current byte */
+		rec1_b_ptr = rec1_b_ptr + cur_bytes;
+		rec2_b_ptr = rec2_b_ptr + cur_bytes;
+
+		/* Then compare the fields byte by byte */
+		for (;;) {
+			if (rec2_f_len <= cur_bytes) {
+
+				if (rec1_f_len <= cur_bytes) {
+
+					goto next_field;
+				}
+
+				rec2_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec2_byte == ULINT_UNDEFINED) {
+					ret = 1;	/* no pad: shorter rec2 is less */
+
+					goto order_resolved;
+				}
+			} else {
+				rec2_byte = *rec2_b_ptr;
+			}
+
+			if (rec1_f_len <= cur_bytes) {
+				rec1_byte = dtype_get_pad_char(mtype, prtype);
+
+				if (rec1_byte == ULINT_UNDEFINED) {
+					ret = -1;	/* no pad: shorter rec1 is less */
+
+					goto order_resolved;
+				}
+			} else {
+				rec1_byte = *rec1_b_ptr;
+			}
+
+			if (rec1_byte == rec2_byte) {
+				/* If the bytes are equal, they will remain
+				such even after the collation transformation
+				below */
+
+				goto next_byte;
+			}
+
+			if (mtype <= DATA_CHAR
+			    || (mtype == DATA_BLOB
+				&& !(prtype & DATA_BINARY_TYPE))) {
+
+				rec1_byte = cmp_collate(rec1_byte);
+				rec2_byte = cmp_collate(rec2_byte);
+			}
+
+			if (rec1_byte < rec2_byte) {
+				ret = -1;
+				goto order_resolved;
+			} else if (rec1_byte > rec2_byte) {
+				ret = 1;
+				goto order_resolved;
+			}
+next_byte:
+			/* Advance to the next byte of both fields */
+
+			cur_bytes++;
+			rec1_b_ptr++;
+			rec2_b_ptr++;
+		}
+
+next_field:
+		cur_field++;
+		cur_bytes = 0;
+	}
+
+	ut_ad(cur_bytes == 0);
+
+	/* If we ran out of fields, rec1 was equal to rec2 up
+	to the common fields */
+	ut_ad(ret == 0);
+order_resolved:
+
+	ut_ad((ret >= - 1) && (ret <= 1));
+
+	*matched_fields = cur_field;
+	*matched_bytes = cur_bytes;
+
+	return(ret);
+}
+
+#ifdef UNIV_DEBUG
+/*************************************************************//**
+Used in debug checking of cmp_dtuple_... .
+This function is used to compare a data tuple to a physical record. If
+dtuple has n fields then rec must have either m >= n fields, or it must
+differ from dtuple in some of the m fields rec has. If it encounters an
+externally stored field, returns 0.
+@return 1, 0, -1, if dtuple is greater, equal, less than rec,
+respectively, when only the common first fields are compared */
+static
+int
+cmp_debug_dtuple_rec_with_match(
+/*============================*/
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	const rec_t*	rec,	/*!< in: physical record which differs from
+				dtuple in some of the common fields, or which
+				has an equal number or more fields than
+				dtuple */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint*		matched_fields) /*!< in/out: number of already
+				completely matched fields; when function
+				returns, contains the value for current
+				comparison */
+{
+	const dfield_t*	dtuple_field;	/* current field in logical record */
+	ulint		dtuple_f_len;	/* the length of the current field
+					in the logical record */
+	const byte*	dtuple_f_data;	/* pointer to the current logical
+					field data */
+	ulint		rec_f_len;	/* length of current field in rec */
+	const byte*	rec_f_data;	/* pointer to the current rec field */
+	int		ret = 3333;	/* return value; 3333 is a placeholder outside the asserted -1..1 range */
+	ulint		cur_field;	/* current field number */
+
+	ut_ad(dtuple && rec && matched_fields);
+	ut_ad(dtuple_check_typed(dtuple));
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	ut_ad(*matched_fields <= dtuple_get_n_fields_cmp(dtuple));
+	ut_ad(*matched_fields <= rec_offs_n_fields(offsets));
+
+	cur_field = *matched_fields;
+
+	if (cur_field == 0) {
+		if (UNIV_UNLIKELY
+		    (rec_get_info_bits(rec, rec_offs_comp(offsets))
+		     & REC_INFO_MIN_REC_FLAG)) {
+
+			ret = !(dtuple_get_info_bits(dtuple)
+				& REC_INFO_MIN_REC_FLAG);	/* 0 if both carry the flag, else 1 */
+
+			goto order_resolved;
+		}
+
+		if (UNIV_UNLIKELY
+		    (dtuple_get_info_bits(dtuple) & REC_INFO_MIN_REC_FLAG)) {
+			ret = -1;
+
+			goto order_resolved;
+		}
+	}
+
+	/* Match fields in a loop; stop if we run out of fields in dtuple */
+
+	while (cur_field < dtuple_get_n_fields_cmp(dtuple)) {
+
+		ulint	mtype;
+		ulint	prtype;
+
+		dtuple_field = dtuple_get_nth_field(dtuple, cur_field);
+		{
+			const dtype_t*	type
+				= dfield_get_type(dtuple_field);
+
+			mtype = type->mtype;
+			prtype = type->prtype;
+		}
+
+		dtuple_f_data = dfield_get_data(dtuple_field);
+		dtuple_f_len = dfield_get_len(dtuple_field);
+
+		rec_f_data = rec_get_nth_field(rec, offsets,
+					       cur_field, &rec_f_len);
+
+		if (rec_offs_nth_extern(offsets, cur_field)) {
+			/* We do not compare to an externally stored field */
+
+			ret = 0;
+
+			goto order_resolved;
+		}
+
+		ret = cmp_data_data(mtype, prtype, dtuple_f_data, dtuple_f_len,
+				    rec_f_data, rec_f_len);
+		if (ret != 0) {
+			goto order_resolved;	/* first differing field decides */
+		}
+
+		cur_field++;
+	}
+
+	ret = 0;	/* If we ran out of fields, dtuple was equal to rec
+			up to the common fields */
+order_resolved:
+	ut_ad((ret >= - 1) && (ret <= 1));
+
+	*matched_fields = cur_field;
+
+	return(ret);
+}
+#endif /* UNIV_DEBUG */
diff --git a/storage/xtradb/rem/rem0rec.c b/storage/xtradb/rem/rem0rec.c
new file mode 100644
index 00000000000..37ba8ca2ffe
--- /dev/null
+++ b/storage/xtradb/rem/rem0rec.c
@@ -0,0 +1,1774 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file rem/rem0rec.c
+Record manager
+
+Created 5/30/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "rem0rec.h"
+
+#ifdef UNIV_NONINL
+#include "rem0rec.ic"
+#endif
+
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+
+/*			PHYSICAL RECORD (OLD STYLE)
+			===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are below
+represented on a higher text line):
+
+| offset of the end of the last field of data, the most significant
+  bit is set to 1 if and only if the field is SQL-null,
+  if the offset is 2-byte, then the second most significant
+  bit is set to 1 if the field is stored on another page:
+  mostly this will occur in the case of big BLOB fields |
+...
+| offset of the end of the first field of data + the SQL-null bit |
+| 4 bits used to delete mark a record, and mark a predefined
+  minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+  (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+  heap of the index page |
+| 10 bits giving the number of fields in this record |
+| 1 bit which is set to 1 if the offsets above are given in
+  one byte format, 0 if in two byte format |
+| two bytes giving an absolute pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offset of the first fields are near the
+origin, giving maybe a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are less than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset, instead it indicates the SQL-null
+if the bit is set to 1. */
+
+/*			PHYSICAL RECORD (NEW STYLE)
+			===========================
+
+The physical record, which is the data type of all the records
+found in index pages of the database, has the following format
+(lower addresses and more significant bits inside a byte are below
+represented on a higher text line):
+
+| length of the last non-null variable-length field of data:
+  if the maximum length is 255, one byte; otherwise,
+  0xxxxxxx (one byte, length=0..127), or 1exxxxxxxxxxxxxx (two bytes,
+  length=128..16383, extern storage flag) |
+...
+| length of first variable-length field of data |
+| SQL-null flags (1 bit per nullable field), padded to full bytes |
+| 4 bits used to delete mark a record, and mark a predefined
+  minimum record in alphabetical order |
+| 4 bits giving the number of records owned by this record
+  (this term is explained in page0page.h) |
+| 13 bits giving the order number of this record in the
+  heap of the index page |
+| 3 bits record type: 000=conventional, 001=node pointer (inside B-tree),
+  010=infimum, 011=supremum, 1xx=reserved |
+| two bytes giving a relative pointer to the next record in the page |
+ORIGIN of the record
+| first field of data |
+...
+| last field of data |
+
+The origin of the record is the start address of the first field
+of data. The offsets are given relative to the origin.
+The offsets of the data fields are stored in an inverted
+order because then the offset of the first fields are near the
+origin, giving maybe a better processor cache hit rate in searches.
+
+The offsets of the data fields are given as one-byte
+(if there are less than 127 bytes of data in the record)
+or two-byte unsigned integers. The most significant bit
+is not part of the offset, instead it indicates the SQL-null
+if the bit is set to 1. */
+
+/* CANONICAL COORDINATES. A record can be seen as a single
+string of 'characters' in the following way: catenate the bytes
+in each field, in the order of fields. An SQL-null field
+is taken to be an empty sequence of bytes. Then after
+the position of each field insert in the string
+the 'character' <FIELD-END>, except that after an SQL-null field
+insert <NULL-FIELD-END>. Now the ordinal position of each
+byte in this canonical string is its canonical coordinate.
+So, for the record ("AA", SQL-NULL, "BB", ""), the canonical
+string is "AA<FIELD-END><NULL-FIELD-END>BB<FIELD-END><FIELD-END>".
+We identify prefixes (= initial segments) of a record
+with prefixes of the canonical string. The canonical
+length of the prefix is the length of the corresponding
+prefix of the canonical string. The canonical length of
+a record is the length of its canonical string.
+
+For example, the maximal common prefix of records
+("AA", SQL-NULL, "BB", "C") and ("AA", SQL-NULL, "B", "C")
+is "AA<FIELD-END><NULL-FIELD-END>B", and its canonical
+length is 5.
+
+A complete-field prefix of a record is a prefix which ends at the
+end of some field (containing also <FIELD-END>).
+A record is a complete-field prefix of another record, if
+the corresponding canonical strings have the same property. */
+
+/* This is used to fool the compiler in rec_validate */
+UNIV_INTERN ulint	rec_dummy;
+
+/***************************************************************//**
+Forward declaration: validates the consistency of an old-style record.
+@return	TRUE if ok */
+static
+ibool
+rec_validate_old(
+/*=============*/
+	const rec_t*	rec);	/*!< in: physical record */
+
+/******************************************************//**
+Determine how many of the first n columns in a compact
+physical record are stored externally.
+@return	number of externally stored columns */
+UNIV_INTERN
+ulint
+rec_get_n_extern_new(
+/*=================*/
+	const rec_t*	rec,	/*!< in: compact physical record */
+	dict_index_t*	index,	/*!< in: record descriptor */
+	ulint		n)	/*!< in: number of columns to scan, or ULINT_UNDEFINED for all index fields */
+{
+	const byte*	nulls;		/* current byte of the SQL-null flags */
+	const byte*	lens;		/* next length byte to read */
+	dict_field_t*	field;
+	ulint		null_mask;	/* bit of *nulls for the next nullable field */
+	ulint		n_extern;	/* externally stored columns found so far */
+	ulint		i;
+
+	ut_ad(dict_table_is_comp(index->table));
+	ut_ad(rec_get_status(rec) == REC_STATUS_ORDINARY);
+	ut_ad(n == ULINT_UNDEFINED || n <= dict_index_get_n_fields(index));
+
+	if (n == ULINT_UNDEFINED) {
+		n = dict_index_get_n_fields(index);
+	}
+
+	nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+	lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+	null_mask = 1;
+	n_extern = 0;
+	i = 0;
+
+	/* read the lengths of fields 0..n-1 */
+	do {
+		ulint	len;
+
+		field = dict_index_get_nth_field(index, i);
+		if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+			/* nullable field => read the null flag */
+
+			if (UNIV_UNLIKELY(!(byte) null_mask)) {	/* mask left the low 8 bits */
+				nulls--;
+				null_mask = 1;
+			}
+
+			if (*nulls & null_mask) {
+				null_mask <<= 1;
+				/* No length is stored for NULL fields. */
+				continue;
+			}
+			null_mask <<= 1;
+		}
+
+		if (UNIV_UNLIKELY(!field->fixed_len)) {
+			/* Variable-length field: read the length */
+			const dict_col_t*	col
+				= dict_field_get_col(field);
+			len = *lens--;
+			/* If the maximum length of the field is up
+			to 255 bytes, the actual length is always
+			stored in one byte. If the maximum length is
+			more than 255 bytes, the actual length is
+			stored in one byte for 0..127.  The length
+			will be encoded in two bytes when it is 128 or
+			more, or when the field is stored externally. */
+			if (UNIV_UNLIKELY(col->len > 255)
+			    || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+				if (len & 0x80) {
+					/* 1exxxxxx xxxxxxxx */
+					if (len & 0x40) {
+						n_extern++;	/* 'e' (extern) bit is set */
+					}
+					lens--;	/* skip the second length byte */
+				}
+			}
+		}
+	} while (++i < n);
+
+	return(n_extern);
+}
+
+/******************************************************//**
+Determine the offset to each field in a leaf-page record
+in ROW_FORMAT=COMPACT.  This is a special case of
+rec_init_offsets() and rec_get_offsets_func(). */
+UNIV_INTERN
+void
+rec_init_offsets_comp_ordinary(
+/*===========================*/
+	const rec_t*		rec,	/*!< in: physical record in
+					ROW_FORMAT=COMPACT */
+	ulint			extra,	/*!< in: number of bytes to reserve
+					between the record header and
+					the data payload
+					(usually REC_N_NEW_EXTRA_BYTES) */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets)/*!< in/out: array of offsets;
+					in: n=rec_offs_n_fields(offsets) */
+{
+	ulint		i		= 0;
+	ulint		offs		= 0;	/* end offset of the latest field */
+	ulint		any_ext		= 0;	/* REC_OFFS_EXTERNAL once an extern field is seen */
+	const byte*	nulls		= rec - (extra + 1);
+	const byte*	lens		= nulls
+		- UT_BITS_IN_BYTES(index->n_nullable);
+	dict_field_t*	field;
+	ulint		null_mask	= 1;	/* bit of *nulls for the next nullable field */
+
+#ifdef UNIV_DEBUG
+	/* We cannot invoke rec_offs_make_valid() here, because it can hold
+	that extra != REC_N_NEW_EXTRA_BYTES.  Similarly, rec_offs_validate()
+	will fail in that case, because it invokes rec_get_status(). */
+	offsets[2] = (ulint) rec;
+	offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+	/* read the lengths of fields 0..n-1 */
+	do {
+		ulint	len;
+
+		field = dict_index_get_nth_field(index, i);
+		if (!(dict_field_get_col(field)->prtype
+		      & DATA_NOT_NULL)) {
+			/* nullable field => read the null flag */
+
+			if (UNIV_UNLIKELY(!(byte) null_mask)) {	/* mask left the low 8 bits */
+				nulls--;
+				null_mask = 1;
+			}
+
+			if (*nulls & null_mask) {
+				null_mask <<= 1;
+				/* No length is stored for NULL fields.
+				We do not advance offs, and we set
+				the length to zero and enable the
+				SQL NULL flag in offsets[]. */
+				len = offs | REC_OFFS_SQL_NULL;
+				goto resolved;
+			}
+			null_mask <<= 1;
+		}
+
+		if (UNIV_UNLIKELY(!field->fixed_len)) {
+			/* Variable-length field: read the length */
+			const dict_col_t*	col
+				= dict_field_get_col(field);
+			len = *lens--;
+			/* If the maximum length of the field is up
+			to 255 bytes, the actual length is always
+			stored in one byte. If the maximum length is
+			more than 255 bytes, the actual length is
+			stored in one byte for 0..127.  The length
+			will be encoded in two bytes when it is 128 or
+			more, or when the field is stored externally. */
+			if (UNIV_UNLIKELY(col->len > 255)
+			    || UNIV_UNLIKELY(col->mtype
+					     == DATA_BLOB)) {
+				if (len & 0x80) {
+					/* 1exxxxxx xxxxxxxx */
+					len <<= 8;
+					len |= *lens--;
+
+					offs += len & 0x3fff;	/* low 14 bits: the length */
+					if (UNIV_UNLIKELY(len
+							  & 0x4000)) {	/* 'e' (extern) bit */
+						ut_ad(dict_index_is_clust
+						      (index));
+						any_ext = REC_OFFS_EXTERNAL;
+						len = offs
+							| REC_OFFS_EXTERNAL;
+					} else {
+						len = offs;
+					}
+
+					goto resolved;
+				}
+			}
+
+			len = offs += len;
+		} else {
+			len = offs += field->fixed_len;
+		}
+resolved:
+		rec_offs_base(offsets)[i + 1] = len;	/* end offset of field i, plus flags */
+	} while (++i < rec_offs_n_fields(offsets));
+
+	*rec_offs_base(offsets)
+		= (rec - (lens + 1)) | REC_OFFS_COMPACT | any_ext;	/* bytes from the last length byte read up to rec, plus flags */
+}
+
+/******************************************************//**
+The following function determines the offsets to each field in the
+record.	 The offsets are written to a previously allocated array of
+ulint, where rec_offs_n_fields(offsets) has been initialized to the
+number of fields in the record.	 The rest of the array will be
+initialized by this function.  rec_offs_base(offsets)[0] will be set
+to the extra size (if REC_OFFS_COMPACT is set, the record is in the
+new format; if REC_OFFS_EXTERNAL is set, the record contains externally
+stored columns), and rec_offs_base(offsets)[1..n_fields] will be set to
+offsets past the end of fields 0..n_fields-1, or to the beginning of
+fields 1..n_fields.  When the high-order bit of the offset at [i+1]
+is set (REC_OFFS_SQL_NULL), the field i is NULL.  When the second
+high-order bit of the offset at [i+1] is set (REC_OFFS_EXTERNAL), the
+field i is being stored externally. */
+static
+void
+rec_init_offsets(
+/*=============*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets)/*!< in/out: array of offsets;
+					in: n=rec_offs_n_fields(offsets) */
+{
+	ulint	i	= 0;
+	ulint	offs;
+
+	rec_offs_make_valid(rec, index, offsets);
+
+	if (dict_table_is_comp(index->table)) {
+		const byte*	nulls;
+		const byte*	lens;
+		dict_field_t*	field;
+		ulint		null_mask;
+		ulint		status = rec_get_status(rec);
+		ulint		n_node_ptr_field = ULINT_UNDEFINED;
+
+		switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+		case REC_STATUS_INFIMUM:
+		case REC_STATUS_SUPREMUM:
+			/* the only field is 8 bytes long */
+			rec_offs_base(offsets)[0]
+				= REC_N_NEW_EXTRA_BYTES | REC_OFFS_COMPACT;
+			rec_offs_base(offsets)[1] = 8;
+			return;
+		case REC_STATUS_NODE_PTR:
+			n_node_ptr_field
+				= dict_index_get_n_unique_in_tree(index);
+			break;
+		case REC_STATUS_ORDINARY:
+			rec_init_offsets_comp_ordinary(rec,
+						       REC_N_NEW_EXTRA_BYTES,
+						       index, offsets);
+			return;
+		}
+
+		nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+		lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+		offs = 0;
+		null_mask = 1;
+
+		/* read the lengths of fields 0..n-1 */
+		do {
+			ulint	len;
+			if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+				len = offs += 4;
+				goto resolved;
+			}
+
+			field = dict_index_get_nth_field(index, i);
+			if (!(dict_field_get_col(field)->prtype
+			      & DATA_NOT_NULL)) {
+				/* nullable field => read the null flag */
+
+				if (UNIV_UNLIKELY(!(byte) null_mask)) {
+					nulls--;
+					null_mask = 1;
+				}
+
+				if (*nulls & null_mask) {
+					null_mask <<= 1;
+					/* No length is stored for NULL fields.
+					We do not advance offs, and we set
+					the length to zero and enable the
+					SQL NULL flag in offsets[]. */
+					len = offs | REC_OFFS_SQL_NULL;
+					goto resolved;
+				}
+				null_mask <<= 1;
+			}
+
+			if (UNIV_UNLIKELY(!field->fixed_len)) {
+				/* Variable-length field: read the length */
+				const dict_col_t*	col
+					= dict_field_get_col(field);
+				len = *lens--;
+				/* If the maximum length of the field
+				is up to 255 bytes, the actual length
+				is always stored in one byte. If the
+				maximum length is more than 255 bytes,
+				the actual length is stored in one
+				byte for 0..127.  The length will be
+				encoded in two bytes when it is 128 or
+				more, or when the field is stored
+				externally. */
+				if (UNIV_UNLIKELY(col->len > 255)
+				    || UNIV_UNLIKELY(col->mtype
+						     == DATA_BLOB)) {
+					if (len & 0x80) {
+						/* 1exxxxxx xxxxxxxx */
+
+						len <<= 8;
+						len |= *lens--;
+
+						/* B-tree node pointers
+						must not contain externally
+						stored columns.  Thus
+						the "e" flag must be 0. */
+						ut_a(!(len & 0x4000));
+						offs += len & 0x3fff;
+						len = offs;
+
+						goto resolved;
+					}
+				}
+
+				len = offs += len;
+			} else {
+				len = offs += field->fixed_len;
+			}
+resolved:
+			rec_offs_base(offsets)[i + 1] = len;
+		} while (++i < rec_offs_n_fields(offsets));
+
+		*rec_offs_base(offsets)
+			= (rec - (lens + 1)) | REC_OFFS_COMPACT;
+	} else {
+		/* Old-style record: determine extra size and end offsets */
+		offs = REC_N_OLD_EXTRA_BYTES;
+		if (rec_get_1byte_offs_flag(rec)) {
+			offs += rec_offs_n_fields(offsets);
+			*rec_offs_base(offsets) = offs;
+			/* Determine offsets to fields */
+			do {
+				offs = rec_1_get_field_end_info(rec, i);
+				if (offs & REC_1BYTE_SQL_NULL_MASK) {
+					offs &= ~REC_1BYTE_SQL_NULL_MASK;
+					offs |= REC_OFFS_SQL_NULL;
+				}
+				rec_offs_base(offsets)[1 + i] = offs;
+			} while (++i < rec_offs_n_fields(offsets));
+		} else {
+			offs += 2 * rec_offs_n_fields(offsets);
+			*rec_offs_base(offsets) = offs;
+			/* Determine offsets to fields */
+			do {
+				offs = rec_2_get_field_end_info(rec, i);
+				if (offs & REC_2BYTE_SQL_NULL_MASK) {
+					offs &= ~REC_2BYTE_SQL_NULL_MASK;
+					offs |= REC_OFFS_SQL_NULL;
+				}
+				if (offs & REC_2BYTE_EXTERN_MASK) {
+					offs &= ~REC_2BYTE_EXTERN_MASK;
+					offs |= REC_OFFS_EXTERNAL;
+					*rec_offs_base(offsets) |= REC_OFFS_EXTERNAL;
+				}
+				rec_offs_base(offsets)[1 + i] = offs;
+			} while (++i < rec_offs_n_fields(offsets));
+		}
+	}
+}
+
+/******************************************************//**
+The following function determines the offsets to each field
+in the record.	It can reuse a previously returned array.
+@return	the offsets: the array passed in, or one allocated from *heap */
+UNIV_INTERN
+ulint*
+rec_get_offsets_func(
+/*=================*/
+	const rec_t*		rec,	/*!< in: physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint*			offsets,/*!< in/out: array consisting of
+					offsets[0] allocated elements,
+					or an array from rec_get_offsets(),
+					or NULL */
+	ulint			n_fields,/*!< in: maximum number of
+					initialized fields
+					 (ULINT_UNDEFINED if all fields) */
+	mem_heap_t**		heap,	/*!< in/out: heap, created on demand */
+	const char*		file,	/*!< in: file name where called */
+	ulint			line)	/*!< in: line number where called */
+{
+	ulint	n;
+	ulint	size;
+
+	ut_ad(rec);
+	ut_ad(index);
+	ut_ad(heap);
+
+	if (dict_table_is_comp(index->table)) {
+		switch (UNIV_EXPECT(rec_get_status(rec),
+				    REC_STATUS_ORDINARY)) {
+		case REC_STATUS_ORDINARY:
+			n = dict_index_get_n_fields(index);
+			break;
+		case REC_STATUS_NODE_PTR:
+			n = dict_index_get_n_unique_in_tree(index) + 1;
+			break;
+		case REC_STATUS_INFIMUM:
+		case REC_STATUS_SUPREMUM:
+			/* infimum or supremum record: a single field */
+			n = 1;
+			break;
+		default:
+			ut_error;
+			return(NULL);
+		}
+	} else {
+		n = rec_get_n_fields_old(rec);
+	}
+
+	if (UNIV_UNLIKELY(n_fields < n)) {
+		n = n_fields;
+	}
+
+	size = n + (1 + REC_OFFS_HEADER_SIZE);
+
+	if (UNIV_UNLIKELY(!offsets)
+	    || UNIV_UNLIKELY(rec_offs_get_n_alloc(offsets) < size)) {
+		if (UNIV_UNLIKELY(!*heap)) {
+			*heap = mem_heap_create_func(size * sizeof(ulint),
+						     MEM_HEAP_DYNAMIC,
+						     file, line);
+		}
+		offsets = mem_heap_alloc(*heap, size * sizeof(ulint));
+		rec_offs_set_n_alloc(offsets, size);
+	}
+
+	rec_offs_set_n_fields(offsets, n);
+	rec_init_offsets(rec, index, offsets);
+	return(offsets);
+}
+
+/******************************************************//**
+The following function determines the offsets to each field in the
+record.  The caller must pass a sufficiently large allocated array. */
+UNIV_INTERN
+void
+rec_get_offsets_reverse(
+/*====================*/
+	const byte*		extra,	/*!< in: the extra bytes of a
+					compact record in reverse order,
+					excluding the fixed-size
+					REC_N_NEW_EXTRA_BYTES */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			node_ptr,/*!< in: nonzero=node pointer,
+					0=leaf node */
+	ulint*			offsets)/*!< in/out: array consisting of
+					offsets[0] allocated elements */
+{
+	ulint		n;
+	ulint		i;
+	ulint		offs;
+	ulint		any_ext;
+	const byte*	nulls;
+	const byte*	lens;
+	dict_field_t*	field;
+	ulint		null_mask;
+	ulint		n_node_ptr_field;
+
+	ut_ad(extra);
+	ut_ad(index);
+	ut_ad(offsets);
+	ut_ad(dict_table_is_comp(index->table));
+
+	if (UNIV_UNLIKELY(node_ptr)) {
+		n_node_ptr_field = dict_index_get_n_unique_in_tree(index);
+		n = n_node_ptr_field + 1;
+	} else {
+		n_node_ptr_field = ULINT_UNDEFINED;
+		n = dict_index_get_n_fields(index);
+	}
+
+	ut_a(rec_offs_get_n_alloc(offsets) >= n + (1 + REC_OFFS_HEADER_SIZE));
+	rec_offs_set_n_fields(offsets, n);
+
+	nulls = extra;
+	lens = nulls + UT_BITS_IN_BYTES(index->n_nullable);
+	i = offs = 0;
+	null_mask = 1;
+	any_ext = 0;
+
+	/* read the lengths of fields 0..n-1 */
+	do {
+		ulint	len;
+		if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+			len = offs += 4;
+			goto resolved;
+		}
+
+		field = dict_index_get_nth_field(index, i);
+		if (!(dict_field_get_col(field)->prtype & DATA_NOT_NULL)) {
+			/* nullable field => read the null flag */
+
+			if (UNIV_UNLIKELY(!(byte) null_mask)) {
+				nulls++;
+				null_mask = 1;
+			}
+
+			if (*nulls & null_mask) {
+				null_mask <<= 1;
+				/* No length is stored for NULL fields.
+				We do not advance offs, and we set
+				the length to zero and enable the
+				SQL NULL flag in offsets[]. */
+				len = offs | REC_OFFS_SQL_NULL;
+				goto resolved;
+			}
+			null_mask <<= 1;
+		}
+
+		if (UNIV_UNLIKELY(!field->fixed_len)) {
+			/* Variable-length field: read the length */
+			const dict_col_t*	col
+				= dict_field_get_col(field);
+			len = *lens++;
+			/* If the maximum length of the field is up
+			to 255 bytes, the actual length is always
+			stored in one byte. If the maximum length is
+			more than 255 bytes, the actual length is
+			stored in one byte for 0..127.  The length
+			will be encoded in two bytes when it is 128 or
+			more, or when the field is stored externally. */
+			if (UNIV_UNLIKELY(col->len > 255)
+			    || UNIV_UNLIKELY(col->mtype == DATA_BLOB)) {
+				if (len & 0x80) {
+					/* 1exxxxxx xxxxxxxx */
+					len <<= 8;
+					len |= *lens++;
+
+					offs += len & 0x3fff;
+					if (UNIV_UNLIKELY(len & 0x4000)) {
+						any_ext = REC_OFFS_EXTERNAL;
+						len = offs | REC_OFFS_EXTERNAL;
+					} else {
+						len = offs;
+					}
+
+					goto resolved;
+				}
+			}
+
+			len = offs += len;
+		} else {
+			len = offs += field->fixed_len;
+		}
+resolved:
+		rec_offs_base(offsets)[i + 1] = len;
+	} while (++i < rec_offs_n_fields(offsets));
+
+	ut_ad(lens >= extra);
+	*rec_offs_base(offsets) = (lens - extra + REC_N_NEW_EXTRA_BYTES)
+		| REC_OFFS_COMPACT | any_ext;
+}
+
+/************************************************************//**
+The following function is used to get the offset to the nth
+data field in an old-style record.
+@return	offset to the start of the field */
+UNIV_INTERN
+ulint
+rec_get_nth_field_offs_old(
+/*=======================*/
+	const rec_t*	rec,	/*!< in: old-style record */
+	ulint		n,	/*!< in: index of the field */
+	ulint*		len)	/*!< out: length of the field;
+				UNIV_SQL_NULL if SQL null */
+{
+	ulint	os;
+	ulint	next_os;
+
+	ut_ad(len);
+	ut_a(rec);
+	ut_a(n < rec_get_n_fields_old(rec));
+
+	if (rec_get_1byte_offs_flag(rec)) {
+		os = rec_1_get_field_start_offs(rec, n);
+
+		next_os = rec_1_get_field_end_info(rec, n);
+
+		if (next_os & REC_1BYTE_SQL_NULL_MASK) {
+			*len = UNIV_SQL_NULL;
+
+			return(os);
+		}
+
+		next_os = next_os & ~REC_1BYTE_SQL_NULL_MASK;
+	} else {
+		os = rec_2_get_field_start_offs(rec, n);
+
+		next_os = rec_2_get_field_end_info(rec, n);
+
+		if (next_os & REC_2BYTE_SQL_NULL_MASK) {
+			*len = UNIV_SQL_NULL;
+
+			return(os);
+		}
+
+		next_os = next_os & ~(REC_2BYTE_SQL_NULL_MASK
+				      | REC_2BYTE_EXTERN_MASK);
+	}
+
+	*len = next_os - os;
+
+	ut_ad(*len < UNIV_PAGE_SIZE);
+
+	return(os);
+}
+
+/**********************************************************//**
+Determines the size of a data tuple prefix in ROW_FORMAT=COMPACT.
+@return	total size (extra size plus data size) */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp_prefix(
+/*===============================*/
+	const dict_index_t*	index,	/*!< in: record descriptor;
+					dict_table_is_comp() is
+					assumed to hold, even if
+					it does not */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra)	/*!< out: extra size, or NULL */
+{
+	ulint	extra_size;
+	ulint	data_size;
+	ulint	i;
+	ut_ad(index);
+	ut_ad(fields);
+	ut_ad(n_fields > 0);
+	ut_ad(n_fields <= dict_index_get_n_fields(index));
+
+	extra_size = REC_N_NEW_EXTRA_BYTES
+		+ UT_BITS_IN_BYTES(index->n_nullable);
+	data_size = 0;
+
+	/* add up the sizes of fields 0..n_fields-1 */
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	field;
+		ulint			len;
+		const dict_col_t*	col;
+
+		field = dict_index_get_nth_field(index, i);
+		len = dfield_get_len(&fields[i]);
+		col = dict_field_get_col(field);
+
+		ut_ad(dict_col_type_assert_equal(col,
+						 dfield_get_type(&fields[i])));
+
+		if (dfield_is_null(&fields[i])) {
+			/* No length is stored for NULL fields. */
+			ut_ad(!(col->prtype & DATA_NOT_NULL));
+			continue;
+		}
+
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+		/* If the maximum length of a variable-length field
+		is up to 255 bytes, the actual length is always stored
+		in one byte. If the maximum length is more than 255
+		bytes, the actual length is stored in one byte for
+		0..127.  The length will be encoded in two bytes when
+		it is 128 or more, or when the field is stored externally. */
+
+		if (field->fixed_len) {
+			ut_ad(len == field->fixed_len);
+			/* dict_index_add_col() should guarantee this */
+			ut_ad(!field->prefix_len
+			      || field->fixed_len == field->prefix_len);
+		} else if (dfield_is_ext(&fields[i])) {
+			ut_ad(col->len >= 256 || col->mtype == DATA_BLOB);
+			extra_size += 2;
+		} else if (len < 128
+			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			extra_size++;
+		} else {
+			/* For variable-length columns, we look up the
+			maximum length from the column itself.  If this
+			is a prefix index column shorter than 256 bytes,
+			this will waste one byte. */
+			extra_size += 2;
+		}
+		data_size += len;
+	}
+
+	if (UNIV_LIKELY_NULL(extra)) {
+		*extra = extra_size;
+	}
+
+	return(extra_size + data_size);
+}
+
+/**********************************************************//**
+Determines the size of a data tuple in ROW_FORMAT=COMPACT.
+@return	total size (extra size plus data size) */
+UNIV_INTERN
+ulint
+rec_get_converted_size_comp(
+/*========================*/
+	const dict_index_t*	index,	/*!< in: record descriptor;
+					dict_table_is_comp() is
+					assumed to hold, even if
+					it does not */
+	ulint			status,	/*!< in: status bits of the record */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields,/*!< in: number of data fields */
+	ulint*			extra)	/*!< out: extra size, or NULL */
+{
+	ulint	size;
+	ut_ad(index);
+	ut_ad(fields);
+	ut_ad(n_fields > 0);
+
+	switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+	case REC_STATUS_ORDINARY:
+		ut_ad(n_fields == dict_index_get_n_fields(index));
+		size = 0;
+		break;
+	case REC_STATUS_NODE_PTR:
+		n_fields--;
+		ut_ad(n_fields == dict_index_get_n_unique_in_tree(index));
+		ut_ad(dfield_get_len(&fields[n_fields]) == REC_NODE_PTR_SIZE);
+		size = REC_NODE_PTR_SIZE; /* child page number */
+		break;
+	case REC_STATUS_INFIMUM:
+	case REC_STATUS_SUPREMUM:
+		/* infimum or supremum record, 8 data bytes */
+		if (UNIV_LIKELY_NULL(extra)) {
+			*extra = REC_N_NEW_EXTRA_BYTES;
+		}
+		return(REC_N_NEW_EXTRA_BYTES + 8);
+	default:
+		ut_error;
+		return(ULINT_UNDEFINED);
+	}
+
+	return(size + rec_get_converted_size_comp_prefix(index, fields,
+							 n_fields, extra));
+}
+
+/***********************************************************//**
+Sets the value of the ith field SQL null bit of an old-style record. */
+UNIV_INTERN
+void
+rec_set_nth_field_null_bit(
+/*=======================*/
+	rec_t*	rec,	/*!< in/out: old-style record */
+	ulint	i,	/*!< in: ith field */
+	ibool	val)	/*!< in: nonzero=set the bit, zero=clear it */
+{
+	ulint	info;
+
+	if (rec_get_1byte_offs_flag(rec)) {
+
+		info = rec_1_get_field_end_info(rec, i);
+
+		if (val) {
+			info = info | REC_1BYTE_SQL_NULL_MASK;
+		} else {
+			info = info & ~REC_1BYTE_SQL_NULL_MASK;
+		}
+
+		rec_1_set_field_end_info(rec, i, info);
+
+		return;
+	}
+
+	info = rec_2_get_field_end_info(rec, i);
+
+	if (val) {
+		info = info | REC_2BYTE_SQL_NULL_MASK;
+	} else {
+		info = info & ~REC_2BYTE_SQL_NULL_MASK;
+	}
+
+	rec_2_set_field_end_info(rec, i, info);
+}
+
+/***********************************************************//**
+Sets an old-style record field to SQL null.
+The physical size of the field is not changed. */
+UNIV_INTERN
+void
+rec_set_nth_field_sql_null(
+/*=======================*/
+	rec_t*	rec,	/*!< in/out: old-style record */
+	ulint	n)	/*!< in: index of the field */
+{
+	ulint	offset;
+
+	offset = rec_get_field_start_offs(rec, n);
+
+	data_write_sql_null(rec + offset, rec_get_nth_field_size(rec, n));
+
+	rec_set_nth_field_null_bit(rec, n, TRUE);
+}
+
+/*********************************************************//**
+Builds an old-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return	pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_old(
+/*==========================*/
+	byte*		buf,	/*!< in/out: buffer for the physical record */
+	const dtuple_t*	dtuple,	/*!< in: data tuple */
+	ulint		n_ext)	/*!< in: number of externally stored columns */
+{
+	const dfield_t*	field;
+	ulint		n_fields;
+	ulint		data_size;
+	rec_t*		rec;
+	ulint		end_offset;
+	ulint		ored_offset;
+	ulint		len;
+	ulint		i;
+
+	ut_ad(buf && dtuple);
+	ut_ad(dtuple_validate(dtuple));
+	ut_ad(dtuple_check_typed(dtuple));
+
+	n_fields = dtuple_get_n_fields(dtuple);
+	data_size = dtuple_get_data_size(dtuple, 0);
+
+	ut_ad(n_fields > 0);
+
+	/* Calculate the offset of the origin in the physical record */
+
+	rec = buf + rec_get_converted_extra_size(data_size, n_fields, n_ext);
+#ifdef UNIV_DEBUG
+	/* Suppress Valgrind warnings of ut_ad()
+	in mach_write_to_1(), mach_write_to_2() et al. */
+	memset(buf, 0xff, rec - buf + data_size);
+#endif /* UNIV_DEBUG */
+	/* Store the number of fields */
+	rec_set_n_fields_old(rec, n_fields);
+
+	/* Set the info bits of the record */
+	rec_set_info_bits_old(rec, dtuple_get_info_bits(dtuple)
+			      & REC_INFO_BITS_MASK);
+
+	/* Store the data and the offsets */
+
+	end_offset = 0;
+
+	if (!n_ext && data_size <= REC_1BYTE_OFFS_LIMIT) {
+
+		rec_set_1byte_offs_flag(rec, TRUE);
+
+		for (i = 0; i < n_fields; i++) {
+
+			field = dtuple_get_nth_field(dtuple, i);
+
+			if (dfield_is_null(field)) {
+				len = dtype_get_sql_null_size(
+					dfield_get_type(field), 0);
+				data_write_sql_null(rec + end_offset, len);
+
+				end_offset += len;
+				ored_offset = end_offset
+					| REC_1BYTE_SQL_NULL_MASK;
+			} else {
+				/* If the data is not SQL null, store it */
+				len = dfield_get_len(field);
+
+				memcpy(rec + end_offset,
+				       dfield_get_data(field), len);
+
+				end_offset += len;
+				ored_offset = end_offset;
+			}
+
+			rec_1_set_field_end_info(rec, i, ored_offset);
+		}
+	} else {
+		rec_set_1byte_offs_flag(rec, FALSE);
+
+		for (i = 0; i < n_fields; i++) {
+
+			field = dtuple_get_nth_field(dtuple, i);
+
+			if (dfield_is_null(field)) {
+				len = dtype_get_sql_null_size(
+					dfield_get_type(field), 0);
+				data_write_sql_null(rec + end_offset, len);
+
+				end_offset += len;
+				ored_offset = end_offset
+					| REC_2BYTE_SQL_NULL_MASK;
+			} else {
+				/* If the data is not SQL null, store it */
+				len = dfield_get_len(field);
+
+				memcpy(rec + end_offset,
+				       dfield_get_data(field), len);
+
+				end_offset += len;
+				ored_offset = end_offset;
+
+				if (dfield_is_ext(field)) {
+					ored_offset |= REC_2BYTE_EXTERN_MASK;
+				}
+			}
+
+			rec_2_set_field_end_info(rec, i, ored_offset);
+		}
+	}
+
+	return(rec);
+}
+
+/*********************************************************//**
+Builds a ROW_FORMAT=COMPACT record out of a data tuple. */
+UNIV_INTERN
+void
+rec_convert_dtuple_to_rec_comp(
+/*===========================*/
+	rec_t*			rec,	/*!< in/out: origin of record */
+	ulint			extra,	/*!< in: number of bytes to
+					reserve between the record
+					header and the data payload
+					(normally REC_N_NEW_EXTRA_BYTES) */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	ulint			status,	/*!< in: status bits of the record */
+	const dfield_t*		fields,	/*!< in: array of data fields */
+	ulint			n_fields)/*!< in: number of data fields */
+{
+	const dfield_t*	field;
+	const dtype_t*	type;
+	byte*		end;
+	byte*		nulls;
+	byte*		lens;
+	ulint		len;
+	ulint		i;
+	ulint		n_node_ptr_field;
+	ulint		fixed_len;
+	ulint		null_mask	= 1;
+	ut_ad(extra == 0 || dict_table_is_comp(index->table));
+	ut_ad(extra == 0 || extra == REC_N_NEW_EXTRA_BYTES);
+	ut_ad(n_fields > 0);
+
+	switch (UNIV_EXPECT(status, REC_STATUS_ORDINARY)) {
+	case REC_STATUS_ORDINARY:
+		ut_ad(n_fields <= dict_index_get_n_fields(index));
+		n_node_ptr_field = ULINT_UNDEFINED;
+		break;
+	case REC_STATUS_NODE_PTR:
+		ut_ad(n_fields == dict_index_get_n_unique_in_tree(index) + 1);
+		n_node_ptr_field = n_fields - 1;
+		break;
+	case REC_STATUS_INFIMUM:
+	case REC_STATUS_SUPREMUM:
+		ut_ad(n_fields == 1);
+		n_node_ptr_field = ULINT_UNDEFINED;
+		break;
+	default:
+		ut_error;
+		return;
+	}
+
+	end = rec;
+	nulls = rec - (extra + 1);
+	lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+	/* clear the SQL-null flags */
+	memset(lens + 1, 0, nulls - lens);
+
+	/* Store the data and the offsets */
+
+	for (i = 0, field = fields; i < n_fields; i++, field++) {
+		const dict_field_t*	ifield;
+
+		type = dfield_get_type(field);
+		len = dfield_get_len(field);
+
+		if (UNIV_UNLIKELY(i == n_node_ptr_field)) {
+			ut_ad(dtype_get_prtype(type) & DATA_NOT_NULL);
+			ut_ad(len == 4);
+			memcpy(end, dfield_get_data(field), len);
+			end += 4;
+			break;
+		}
+
+		if (!(dtype_get_prtype(type) & DATA_NOT_NULL)) {
+			/* nullable field */
+			ut_ad(index->n_nullable > 0);
+
+			if (UNIV_UNLIKELY(!(byte) null_mask)) {
+				nulls--;
+				null_mask = 1;
+			}
+
+			ut_ad(*nulls < null_mask);
+
+			/* set the null flag if necessary */
+			if (dfield_is_null(field)) {
+				*nulls |= null_mask;
+				null_mask <<= 1;
+				continue;
+			}
+
+			null_mask <<= 1;
+		}
+		/* only nullable fields can be null */
+		ut_ad(!dfield_is_null(field));
+
+		ifield = dict_index_get_nth_field(index, i);
+		fixed_len = ifield->fixed_len;
+		/* If the maximum length of a variable-length field
+		is up to 255 bytes, the actual length is always stored
+		in one byte. If the maximum length is more than 255
+		bytes, the actual length is stored in one byte for
+		0..127.  The length will be encoded in two bytes when
+		it is 128 or more, or when the field is stored externally. */
+		if (fixed_len) {
+			ut_ad(len == fixed_len);
+			ut_ad(!dfield_is_ext(field));
+		} else if (dfield_is_ext(field)) {
+			ut_ad(ifield->col->len >= 256
+			      || ifield->col->mtype == DATA_BLOB);
+			ut_ad(len <= REC_MAX_INDEX_COL_LEN
+			      + BTR_EXTERN_FIELD_REF_SIZE);
+			*lens-- = (byte) (len >> 8) | 0xc0;
+			*lens-- = (byte) len;
+		} else {
+			ut_ad(len <= dtype_get_len(type)
+			      || dtype_get_mtype(type) == DATA_BLOB);
+			if (len < 128
+			    || (dtype_get_len(type) < 256
+				&& dtype_get_mtype(type) != DATA_BLOB)) {
+
+				*lens-- = (byte) len;
+			} else {
+				ut_ad(len < 16384);
+				*lens-- = (byte) (len >> 8) | 0x80;
+				*lens-- = (byte) len;
+			}
+		}
+
+		memcpy(end, dfield_get_data(field), len);
+		end += len;
+	}
+}
+
+/*********************************************************//**
+Builds a new-style physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return	pointer to the origin of physical record */
+static
+rec_t*
+rec_convert_dtuple_to_rec_new(
+/*==========================*/
+	byte*			buf,	/*!< in/out: buffer for
+					the physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple)	/*!< in: data tuple */
+{
+	ulint	extra_size;
+	ulint	status;
+	rec_t*	rec;
+
+	status = dtuple_get_info_bits(dtuple) & REC_NEW_STATUS_MASK;
+	rec_get_converted_size_comp(index, status,
+				    dtuple->fields, dtuple->n_fields,
+				    &extra_size); /* total size is unused */
+	rec = buf + extra_size;
+
+	rec_convert_dtuple_to_rec_comp(
+		rec, REC_N_NEW_EXTRA_BYTES, index, status,
+		dtuple->fields, dtuple->n_fields);
+
+	/* Set the info bits of the record */
+	rec_set_info_and_status_bits(rec, dtuple_get_info_bits(dtuple));
+
+	return(rec);
+}
+
+/*********************************************************//**
+Builds a physical record out of a data tuple and
+stores it beginning from the start of the given buffer.
+@return	pointer to the origin of physical record */
+UNIV_INTERN
+rec_t*
+rec_convert_dtuple_to_rec(
+/*======================*/
+	byte*			buf,	/*!< in/out: buffer for the
+					physical record */
+	const dict_index_t*	index,	/*!< in: record descriptor */
+	const dtuple_t*		dtuple,	/*!< in: data tuple */
+	ulint			n_ext)	/*!< in: number of externally stored
+					columns; old-style records only */
+{
+	rec_t*	rec;
+
+	ut_ad(buf && index && dtuple);
+	ut_ad(dtuple_validate(dtuple));
+	ut_ad(dtuple_check_typed(dtuple));
+
+	if (dict_table_is_comp(index->table)) {
+		rec = rec_convert_dtuple_to_rec_new(buf, index, dtuple);
+	} else {
+		rec = rec_convert_dtuple_to_rec_old(buf, dtuple, n_ext);
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		mem_heap_t*	heap	= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		const ulint*	offsets;
+		ulint		i;
+		rec_offs_init(offsets_);
+
+		offsets = rec_get_offsets(rec, index,
+					  offsets_, ULINT_UNDEFINED, &heap);
+		ut_ad(rec_validate(rec, offsets));
+		ut_ad(dtuple_get_n_fields(dtuple)
+		      == rec_offs_n_fields(offsets));
+
+		for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+			ut_ad(!dfield_is_ext(dtuple_get_nth_field(dtuple, i))
+			      == !rec_offs_nth_extern(offsets, i));
+		}
+
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+#endif /* UNIV_DEBUG */
+	return(rec);
+}
+
+/**************************************************************//**
+Copies the first n_fields fields of a physical record to a data tuple.
+The field data is duplicated in the memory heap. */
+UNIV_INTERN
+void
+rec_copy_prefix_to_dtuple(
+/*======================*/
+	dtuple_t*		tuple,		/*!< in/out: data tuple */
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	mem_heap_t*		heap)		/*!< in: memory heap */
+{
+	ulint	i;
+	ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*	offsets	= offsets_;
+	rec_offs_init(offsets_);
+
+	offsets = rec_get_offsets(rec, index, offsets, n_fields, &heap);
+
+	ut_ad(rec_validate(rec, offsets));
+	ut_ad(dtuple_check_typed(tuple));
+
+	dtuple_set_info_bits(tuple, rec_get_info_bits(
+				     rec, dict_table_is_comp(index->table)));
+
+	for (i = 0; i < n_fields; i++) {
+		dfield_t*	field;
+		const byte*	data;
+		ulint		len;
+
+		field = dtuple_get_nth_field(tuple, i);
+		data = rec_get_nth_field(rec, offsets, i, &len);
+
+		if (len != UNIV_SQL_NULL) {
+			dfield_set_data(field,
+					mem_heap_dup(heap, data, len), len);
+			ut_ad(!rec_offs_nth_extern(offsets, i));
+		} else {
+			dfield_set_null(field);
+		}
+	}
+}
+
+/**************************************************************//**
+Copies the first n_fields fields of an old-style physical record
+to a new physical record in a buffer.
+@return	own: copied record */
+static
+rec_t*
+rec_copy_prefix_to_buf_old(
+/*=======================*/
+	const rec_t*	rec,		/*!< in: physical record */
+	ulint		n_fields,	/*!< in: number of fields to copy */
+	ulint		area_end,	/*!< in: end of the prefix data */
+	byte**		buf,		/*!< in/out: memory buffer for
+					the copied prefix, or NULL */
+	ulint*		buf_size)	/*!< in/out: buffer size */
+{
+	rec_t*	copy_rec;
+	ulint	area_start;
+	ulint	prefix_len;
+
+	if (rec_get_1byte_offs_flag(rec)) {
+		area_start = REC_N_OLD_EXTRA_BYTES + n_fields;
+	} else {
+		area_start = REC_N_OLD_EXTRA_BYTES + 2 * n_fields;
+	}
+
+	prefix_len = area_start + area_end;
+
+	if ((*buf == NULL) || (*buf_size < prefix_len)) {
+		if (*buf != NULL) {
+			mem_free(*buf);
+		}
+
+		*buf = mem_alloc2(prefix_len, buf_size);
+	}
+
+	ut_memcpy(*buf, rec - area_start, prefix_len);
+
+	copy_rec = *buf + area_start;
+
+	rec_set_n_fields_old(copy_rec, n_fields);
+
+	return(copy_rec);
+}
+
+/**************************************************************//**
+Copies the first n_fields fields of a physical record to a new physical
+record in a buffer.
+@return	own: copied record */
+UNIV_INTERN
+rec_t*
+rec_copy_prefix_to_buf(
+/*===================*/
+	const rec_t*		rec,		/*!< in: physical record */
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	ulint			n_fields,	/*!< in: number of fields
+						to copy */
+	byte**			buf,		/*!< in/out: memory buffer
+						for the copied prefix,
+						or NULL */
+	ulint*			buf_size)	/*!< in/out: buffer size */
+{
+	const byte*	nulls;
+	const byte*	lens;
+	ulint		i;
+	ulint		prefix_len;
+	ulint		null_mask;
+	ulint		status;
+
+	UNIV_PREFETCH_RW(*buf);
+
+	if (!dict_table_is_comp(index->table)) {
+		ut_ad(rec_validate_old(rec));
+		return(rec_copy_prefix_to_buf_old(
+			       rec, n_fields,
+			       rec_get_field_start_offs(rec, n_fields),
+			       buf, buf_size));
+	}
+
+	status = rec_get_status(rec);
+
+	switch (status) {
+	case REC_STATUS_ORDINARY:
+		ut_ad(n_fields <= dict_index_get_n_fields(index));
+		break;
+	case REC_STATUS_NODE_PTR:
+		/* it doesn't make sense to copy the child page number field */
+		ut_ad(n_fields <= dict_index_get_n_unique_in_tree(index));
+		break;
+	case REC_STATUS_INFIMUM:
+	case REC_STATUS_SUPREMUM:
+		/* infimum or supremum record: no sense to copy anything */
+	default:
+		ut_error;
+		return(NULL);
+	}
+
+	nulls = rec - (REC_N_NEW_EXTRA_BYTES + 1);
+	lens = nulls - UT_BITS_IN_BYTES(index->n_nullable);
+	UNIV_PREFETCH_R(lens);
+	prefix_len = 0;
+	null_mask = 1;
+
+	/* read the lengths of fields 0..n_fields-1 */
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	field;
+		const dict_col_t*	col;
+
+		field = dict_index_get_nth_field(index, i);
+		col = dict_field_get_col(field);
+
+		if (!(col->prtype & DATA_NOT_NULL)) {
+			/* nullable field => read the null flag */
+			if (UNIV_UNLIKELY(!(byte) null_mask)) {
+				nulls--;
+				null_mask = 1;
+			}
+
+			if (*nulls & null_mask) {
+				null_mask <<= 1;
+				continue;
+			}
+
+			null_mask <<= 1;
+		}
+
+		if (field->fixed_len) {
+			prefix_len += field->fixed_len;
+		} else {
+			ulint	len = *lens--;
+			/* If the maximum length of the column is up
+			to 255 bytes, the actual length is always
+			stored in one byte. If the maximum length is
+			more than 255 bytes, the actual length is
+			stored in one byte for 0..127.  The length
+			will be encoded in two bytes when it is 128 or
+			more, or when the column is stored externally. */
+			if (col->len > 255 || col->mtype == DATA_BLOB) {
+				if (len & 0x80) {
+					/* 1exxxxxx */
+					len &= 0x3f;
+					len <<= 8;
+					len |= *lens--;
+					UNIV_PREFETCH_R(lens);
+				}
+			}
+			prefix_len += len;
+		}
+	}
+
+	UNIV_PREFETCH_R(rec + prefix_len);
+
+	prefix_len += rec - (lens + 1);
+
+	if ((*buf == NULL) || (*buf_size < prefix_len)) {
+		if (*buf != NULL) {
+			mem_free(*buf);
+		}
+
+		*buf = mem_alloc2(prefix_len, buf_size);
+	}
+
+	memcpy(*buf, lens + 1, prefix_len);
+
+	return(*buf + (rec - (lens + 1)));
+}
+
+/***************************************************************//**
+Validates an old-style physical record: field count, field lengths and total.
+@return	TRUE if ok; FALSE (after printing to stderr) if a check fails */
+static
+ibool
+rec_validate_old(
+/*=============*/
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	const byte*	data;
+	ulint		len;
+	ulint		n_fields;
+	ulint		len_sum		= 0;
+	ulint		sum		= 0;
+	ulint		i;
+
+	ut_a(rec);
+	n_fields = rec_get_n_fields_old(rec);
+
+	if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+		fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+			(ulong) n_fields);
+		return(FALSE);
+	}
+
+	for (i = 0; i < n_fields; i++) {
+		data = rec_get_nth_field_old(rec, i, &len);
+
+		if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+			fprintf(stderr,
+				"InnoDB: Error: record field %lu len %lu\n",
+				(ulong) i,
+				(ulong) len);
+			return(FALSE);
+		}
+
+		if (len != UNIV_SQL_NULL) {
+			len_sum += len;
+			sum += *(data + len -1); /* dereference the
+						 end of the field to
+						 cause a memory trap
+						 if possible */
+		} else {
+			len_sum += rec_get_nth_field_size(rec, i);
+		}
+	}
+
+	if (len_sum != rec_get_data_size_old(rec)) {
+		fprintf(stderr,
+			"InnoDB: Error: record len should be %lu, len %lu\n",
+			(ulong) len_sum,
+			(ulong) rec_get_data_size_old(rec));
+		return(FALSE);
+	}
+
+	rec_dummy = sum; /* This is here only to fool the compiler */
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Validates a physical record against offsets (and rec_validate_old() if !comp).
+@return	TRUE if ok; FALSE (after printing to stderr) if a check fails */
+UNIV_INTERN
+ibool
+rec_validate(
+/*=========*/
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	const byte*	data;
+	ulint		len;
+	ulint		n_fields;
+	ulint		len_sum		= 0;
+	ulint		sum		= 0;
+	ulint		i;
+
+	ut_a(rec);
+	n_fields = rec_offs_n_fields(offsets);
+
+	if ((n_fields == 0) || (n_fields > REC_MAX_N_FIELDS)) {
+		fprintf(stderr, "InnoDB: Error: record has %lu fields\n",
+			(ulong) n_fields);
+		return(FALSE);
+	}
+
+	ut_a(rec_offs_comp(offsets) || n_fields <= rec_get_n_fields_old(rec));
+
+	for (i = 0; i < n_fields; i++) {
+		data = rec_get_nth_field(rec, offsets, i, &len);
+
+		if (!((len < UNIV_PAGE_SIZE) || (len == UNIV_SQL_NULL))) {
+			fprintf(stderr,
+				"InnoDB: Error: record field %lu len %lu\n",
+				(ulong) i,
+				(ulong) len);
+			return(FALSE);
+		}
+
+		if (len != UNIV_SQL_NULL) {
+			len_sum += len;
+			sum += *(data + len -1); /* dereference the
+						 end of the field to
+						 cause a memory trap
+						 if possible */
+		} else if (!rec_offs_comp(offsets)) {
+			len_sum += rec_get_nth_field_size(rec, i);
+		}
+	}
+
+	if (len_sum != rec_offs_data_size(offsets)) {
+		fprintf(stderr,
+			"InnoDB: Error: record len should be %lu, len %lu\n",
+			(ulong) len_sum,
+			(ulong) rec_offs_data_size(offsets));
+		return(FALSE);
+	}
+
+	rec_dummy = sum; /* This is here only to fool the compiler */
+
+	if (!rec_offs_comp(offsets)) {
+		ut_a(rec_validate_old(rec));
+	}
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Prints an old-style physical record (at most 30 bytes of each field) and then runs rec_validate_old() on it. */
+UNIV_INTERN
+void
+rec_print_old(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec)	/*!< in: physical record */
+{
+	const byte*	data;
+	ulint		len;
+	ulint		n;
+	ulint		i;
+
+	ut_ad(rec);
+
+	n = rec_get_n_fields_old(rec);
+
+	fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+		" %u-byte offsets; info bits %lu\n",
+		(ulong) n,
+		rec_get_1byte_offs_flag(rec) ? 1 : 2,
+		(ulong) rec_get_info_bits(rec, FALSE));
+
+	for (i = 0; i < n; i++) {
+
+		data = rec_get_nth_field_old(rec, i, &len);
+
+		fprintf(file, " %lu:", (ulong) i);
+
+		if (len != UNIV_SQL_NULL) {
+			if (len <= 30) {
+
+				ut_print_buf(file, data, len);
+			} else {
+				ut_print_buf(file, data, 30);
+
+				fprintf(file, " (total %lu bytes)",
+					(ulong) len);
+			}
+		} else {
+			fprintf(file, " SQL NULL, size %lu ",
+				(ulong) rec_get_nth_field_size(rec, i));
+		}
+
+		putc(';', file);
+		putc('\n', file);
+	}
+
+	rec_validate_old(rec);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Prints the fields of a physical record in ROW_FORMAT=COMPACT, showing at
+most 30 bytes of each field.  Ignores the record header. */
+UNIV_INTERN
+void
+rec_print_comp(
+/*===========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ulint	i;
+
+	for (i = 0; i < rec_offs_n_fields(offsets); i++) {
+		const byte*	data;
+		ulint		len;
+
+		data = rec_get_nth_field(rec, offsets, i, &len);
+
+		fprintf(file, " %lu:", (ulong) i);
+
+		if (len != UNIV_SQL_NULL) {
+			if (len <= 30) {
+
+				ut_print_buf(file, data, len);
+			} else {
+				ut_print_buf(file, data, 30);
+
+				fprintf(file, " (total %lu bytes)",
+					(ulong) len);
+			}
+		} else {
+			fputs(" SQL NULL", file);
+		}
+		putc(';', file);
+		putc('\n', file);
+	}
+}
+
+/***************************************************************//**
+Prints a physical record using its offsets; old-style records go to rec_print_old(). */
+UNIV_INTERN
+void
+rec_print_new(
+/*==========*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	const ulint*	offsets)/*!< in: array returned by rec_get_offsets() */
+{
+	ut_ad(rec);
+	ut_ad(offsets);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (!rec_offs_comp(offsets)) {
+		rec_print_old(file, rec);
+		return;
+	}
+
+	fprintf(file, "PHYSICAL RECORD: n_fields %lu;"
+		" compact format; info bits %lu\n",
+		(ulong) rec_offs_n_fields(offsets),
+		(ulong) rec_get_info_bits(rec, TRUE));
+
+	rec_print_comp(file, rec, offsets);
+	rec_validate(rec, offsets);
+}
+
+/***************************************************************//**
+Prints a physical record; for a compact table the offsets are computed from index first. */
+UNIV_INTERN
+void
+rec_print(
+/*======*/
+	FILE*		file,	/*!< in: file where to print */
+	const rec_t*	rec,	/*!< in: physical record */
+	dict_index_t*	index)	/*!< in: record descriptor */
+{
+	ut_ad(index);
+
+	if (!dict_table_is_comp(index->table)) {
+		rec_print_old(file, rec);
+		return;
+	} else {
+		mem_heap_t*	heap	= NULL;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		rec_offs_init(offsets_);
+
+		rec_print_new(file, rec,
+			      rec_get_offsets(rec, index, offsets_,
+					      ULINT_UNDEFINED, &heap));
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/row/row0ext.c b/storage/xtradb/row/row0ext.c
new file mode 100644
index 00000000000..7320f5b1dca
--- /dev/null
+++ b/storage/xtradb/row/row0ext.c
@@ -0,0 +1,115 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ext.c
+Caching of externally stored column prefixes
+
+Created September 2006 Marko Makela
+*******************************************************/
+
+#include "row0ext.h"
+
+#ifdef UNIV_NONINL
+#include "row0ext.ic"
+#endif
+
+#include "btr0cur.h"
+
+/********************************************************************//**
+Fills entry i of the column prefix cache of an externally stored column. */
+static
+void
+row_ext_cache_fill(
+/*===============*/
+	row_ext_t*	ext,	/*!< in/out: column prefix cache */
+	ulint		i,	/*!< in: index of ext->ext[] */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	const dfield_t*	dfield)	/*!< in: data field */
+{
+	const byte*	field	= dfield_get_data(dfield);
+	ulint		f_len	= dfield_get_len(dfield);
+	byte*		buf	= ext->buf + i * REC_MAX_INDEX_COL_LEN;
+
+	ut_ad(i < ext->n_ext);
+	ut_ad(dfield_is_ext(dfield));
+	ut_a(f_len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+	if (UNIV_UNLIKELY(!memcmp(field_ref_zero,
+				  field + f_len - BTR_EXTERN_FIELD_REF_SIZE,
+				  BTR_EXTERN_FIELD_REF_SIZE))) {
+		/* The BLOB pointer is not set: we cannot fetch it */
+		ext->len[i] = 0;
+	} else {
+		/* Fetch at most REC_MAX_INDEX_COL_LEN of the column.
+		The column should be non-empty.  However,
+		trx_rollback_or_clean_all_recovered() may try to
+		access a half-deleted BLOB if the server previously
+		crashed during the execution of
+		btr_free_externally_stored_field(). */
+		ext->len[i] = btr_copy_externally_stored_field_prefix(
+			buf, REC_MAX_INDEX_COL_LEN, zip_size, field, f_len);
+	}
+}
+
+/********************************************************************//**
+Creates and fills a cache of column prefixes of externally stored columns.
+@return	own: column prefix cache, allocated from heap */
+UNIV_INTERN
+row_ext_t*
+row_ext_create(
+/*===========*/
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	const ulint*	ext,	/*!< in: col_no's of externally stored columns
+				in the InnoDB table object, as reported by
+				dict_col_get_no(); NOT relative to the records
+				in the clustered index */
+	const dtuple_t*	tuple,	/*!< in: data tuple containing the field
+				references of the externally stored
+				columns; must be indexed by col_no;
+				the clustered index record must be
+				covered by a lock or a page latch
+				to prevent deletion (rollback or purge). */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	mem_heap_t*	heap)	/*!< in: heap where created */
+{
+	ulint		i;
+	row_ext_t*	ret = mem_heap_alloc(heap, (sizeof *ret)
+					     + (n_ext - 1) * sizeof ret->len);
+
+	ut_ad(ut_is_2pow(zip_size));
+	ut_ad(zip_size <= UNIV_PAGE_SIZE);
+
+	ret->n_ext = n_ext;
+	ret->ext = ext;
+	ret->buf = mem_heap_alloc(heap, n_ext * REC_MAX_INDEX_COL_LEN);
+#ifdef UNIV_DEBUG
+	memset(ret->buf, 0xaa, n_ext * REC_MAX_INDEX_COL_LEN);
+	UNIV_MEM_ALLOC(ret->buf, n_ext * REC_MAX_INDEX_COL_LEN);
+#endif
+
+	/* Fetch the BLOB prefixes */
+	for (i = 0; i < n_ext; i++) {
+		const dfield_t*	dfield;
+
+		dfield = dtuple_get_nth_field(tuple, ext[i]);
+		row_ext_cache_fill(ret, i, zip_size, dfield);
+	}
+
+	return(ret);
+}
diff --git a/storage/xtradb/row/row0ins.c b/storage/xtradb/row/row0ins.c
new file mode 100644
index 00000000000..d4925e46f97
--- /dev/null
+++ b/storage/xtradb/row/row0ins.c
@@ -0,0 +1,2533 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0ins.c
+Insert into a table
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0ins.h"
+
+#ifdef UNIV_NONINL
+#include "row0ins.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "eval0eval.h"
+#include "data0data.h"
+#include "usr0sess.h"
+#include "buf0lru.h"
+
+#define	ROW_INS_PREV	1
+#define	ROW_INS_NEXT	2
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/*********************************************************************//**
+Creates an insert node struct in heap, with a separate entry_sys_heap of its own.
+@return	own: insert node struct */
+UNIV_INTERN
+ins_node_t*
+ins_node_create(
+/*============*/
+	ulint		ins_type,	/*!< in: INS_VALUES, ... */
+	dict_table_t*	table,		/*!< in: table where to insert */
+	mem_heap_t*	heap)		/*!< in: mem heap where created */
+{
+	ins_node_t*	node;
+
+	node = mem_heap_alloc(heap, sizeof(ins_node_t));
+
+	node->common.type = QUE_NODE_INSERT;
+
+	node->ins_type = ins_type;
+
+	node->state = INS_NODE_SET_IX_LOCK;
+	node->table = table;
+	node->index = NULL;
+	node->entry = NULL;
+
+	node->select = NULL;
+
+	node->trx_id = ut_dulint_zero;
+
+	node->entry_sys_heap = mem_heap_create(128);
+
+	node->magic_n = INS_NODE_MAGIC_N;
+
+	return(node);
+}
+
+/***********************************************************//**
+Creates an entry template for each index of node->table in node->entry_list. */
+UNIV_INTERN
+void
+ins_node_create_entry_list(
+/*=======================*/
+	ins_node_t*	node)	/*!< in/out: row insert node */
+{
+	dict_index_t*	index;
+	dtuple_t*	entry;
+
+	ut_ad(node->entry_sys_heap);
+
+	UT_LIST_INIT(node->entry_list);
+
+	index = dict_table_get_first_index(node->table);
+
+	while (index != NULL) {
+		entry = row_build_index_entry(node->row, NULL, index,
+					      node->entry_sys_heap);
+		UT_LIST_ADD_LAST(tuple_list, node->entry_list, entry);
+
+		index = dict_table_get_next_index(index);
+	}
+}
+
+/*****************************************************************//**
+Adds system field buffers (row id, trx id, roll ptr) to node->row. */
+static
+void
+row_ins_alloc_sys_fields(
+/*=====================*/
+	ins_node_t*	node)	/*!< in/out: insert node */
+{
+	dtuple_t*		row;
+	dict_table_t*		table;
+	mem_heap_t*		heap;
+	const dict_col_t*	col;
+	dfield_t*		dfield;
+	byte*			ptr;
+
+	row = node->row;
+	table = node->table;
+	heap = node->entry_sys_heap;
+
+	ut_ad(row && table && heap);
+	ut_ad(dtuple_get_n_fields(row) == dict_table_get_n_cols(table));
+
+	/* 1. Allocate buffer for row id */
+
+	col = dict_table_get_sys_col(table, DATA_ROW_ID);
+
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+
+	ptr = mem_heap_zalloc(heap, DATA_ROW_ID_LEN);
+
+	dfield_set_data(dfield, ptr, DATA_ROW_ID_LEN);
+
+	node->row_id_buf = ptr;
+
+	/* 2. Allocate buffer for trx id */
+
+	col = dict_table_get_sys_col(table, DATA_TRX_ID);
+
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+	ptr = mem_heap_zalloc(heap, DATA_TRX_ID_LEN);
+
+	dfield_set_data(dfield, ptr, DATA_TRX_ID_LEN);
+
+	node->trx_id_buf = ptr;
+
+	/* 3. Allocate buffer for roll ptr */
+
+	col = dict_table_get_sys_col(table, DATA_ROLL_PTR);
+
+	dfield = dtuple_get_nth_field(row, dict_col_get_no(col));
+	ptr = mem_heap_zalloc(heap, DATA_ROLL_PTR_LEN);
+
+	dfield_set_data(dfield, ptr, DATA_ROLL_PTR_LEN);
+}
+
+/*********************************************************************//**
+Sets a new row to insert for an INS_DIRECT node. This function is only used
+if we have constructed the row separately, which is a rare case; this
+function is quite slow. */
+UNIV_INTERN
+void
+ins_node_set_new_row(
+/*=================*/
+	ins_node_t*	node,	/*!< in/out: insert node */
+	dtuple_t*	row)	/*!< in: new row (or first row) for the node */
+{
+	node->state = INS_NODE_SET_IX_LOCK;
+	node->index = NULL;
+	node->entry = NULL;
+
+	node->row = row;
+
+	mem_heap_empty(node->entry_sys_heap);
+
+	/* Create templates for index entries */
+
+	ins_node_create_entry_list(node);
+
+	/* Allocate from entry_sys_heap buffers for sys fields */
+
+	row_ins_alloc_sys_fields(node);
+
+	/* As we allocated a new trx id buf, the trx id should be written
+	there again; resetting node->trx_id marks it as not yet written: */
+
+	node->trx_id = ut_dulint_zero;
+}
+
+/*******************************************************************//**
+Does an insert operation by updating a delete-marked existing record
+in the index. This situation can occur if the delete-marked record is
+kept in the index for consistent reads.
+@return	DB_SUCCESS, DB_FAIL, or error code */
+static
+ulint
+row_ins_sec_index_entry_by_modify(
+/*==============================*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether mtr holds just a leaf
+				latch or also a tree latch */
+	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; must be committed before
+				latching any further pages */
+{
+	big_rec_t*	dummy_big_rec;
+	mem_heap_t*	heap;
+	upd_t*		update;
+	rec_t*		rec;
+	ulint		err;
+
+	rec = btr_cur_get_rec(cursor);
+
+	ut_ad(!dict_index_is_clust(cursor->index));
+	ut_ad(rec_get_deleted_flag(rec,
+				   dict_table_is_comp(cursor->index->table)));
+
+	/* We know that in the alphabetical ordering, entry and rec compare
+	as equal. But in their binary form there may be differences if
+	there are char fields in them. Therefore we have to calculate the
+	difference. */
+
+	heap = mem_heap_create(1024);
+
+	update = row_upd_build_sec_rec_difference_binary(
+		cursor->index, entry, rec, thr_get_trx(thr), heap);
+	if (mode == BTR_MODIFY_LEAF) {
+		/* Try an optimistic updating of the record, keeping changes
+		within the page */
+
+		err = btr_cur_optimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+						update, 0, thr, mtr);
+		switch (err) {
+		case DB_OVERFLOW:
+		case DB_UNDERFLOW:
+		case DB_ZIP_OVERFLOW:
+			err = DB_FAIL;
+		}
+	} else {
+		ut_a(mode == BTR_MODIFY_TREE);
+		if (buf_LRU_buf_pool_running_out()) {
+
+			err = DB_LOCK_TABLE_FULL;
+
+			goto func_exit;
+		}
+
+		err = btr_cur_pessimistic_update(BTR_KEEP_SYS_FLAG, cursor,
+						 &heap, &dummy_big_rec, update,
+						 0, thr, mtr);
+		ut_ad(!dummy_big_rec);
+	}
+func_exit:
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*******************************************************************//**
+Does an insert operation by delete unmarking and updating a delete marked
+existing record in the index. This situation can occur if the delete marked
+record is kept in the index for consistent reads.
+@return	DB_SUCCESS, DB_FAIL, or error code */
+static
+ulint
+row_ins_clust_index_entry_by_modify(
+/*================================*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether mtr holds just a leaf
+				latch or also a tree latch */
+	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
+	mem_heap_t**	heap,	/*!< in/out: memory heap; created if *heap is NULL */
+	big_rec_t**	big_rec,/*!< out: possible big rec vector of fields
+				which have to be stored externally by the
+				caller */
+	const dtuple_t*	entry,	/*!< in: index entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; must be committed before
+				latching any further pages */
+{
+	rec_t*		rec;
+	upd_t*		update;
+	ulint		err;
+
+	ut_ad(dict_index_is_clust(cursor->index));
+
+	*big_rec = NULL;
+
+	rec = btr_cur_get_rec(cursor);
+
+	ut_ad(rec_get_deleted_flag(rec,
+				   dict_table_is_comp(cursor->index->table)));
+
+	if (!*heap) {
+		*heap = mem_heap_create(1024);
+	}
+
+	/* Build an update vector containing all the fields to be modified;
+	NOTE that this vector may NOT contain system columns trx_id or
+	roll_ptr */
+
+	update = row_upd_build_difference_binary(cursor->index, entry, rec,
+						 thr_get_trx(thr), *heap);
+	if (mode == BTR_MODIFY_LEAF) {
+		/* Try optimistic updating of the record, keeping changes
+		within the page */
+
+		err = btr_cur_optimistic_update(0, cursor, update, 0, thr,
+						mtr);
+		switch (err) {
+		case DB_OVERFLOW:
+		case DB_UNDERFLOW:
+		case DB_ZIP_OVERFLOW:
+			err = DB_FAIL;
+		}
+	} else {
+		ut_a(mode == BTR_MODIFY_TREE);
+		if (buf_LRU_buf_pool_running_out()) {
+
+			return(DB_LOCK_TABLE_FULL);
+
+		}
+		err = btr_cur_pessimistic_update(0, cursor,
+						 heap, big_rec, update,
+						 0, thr, mtr);
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Returns TRUE if in a cascaded update/delete an ancestor node of node
+updates (not DELETE, but UPDATE) table.
+@return	TRUE if an ancestor updates table */
+static
+ibool
+row_ins_cascade_ancestor_updates_table(
+/*===================================*/
+	que_node_t*	node,	/*!< in: node in a query graph */
+	dict_table_t*	table)	/*!< in: table to look for in the ancestors */
+{
+	que_node_t*	parent;
+	upd_node_t*	upd_node;
+
+	parent = que_node_get_parent(node);
+
+	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+
+		upd_node = parent;
+
+		if (upd_node->table == table && upd_node->is_delete == FALSE) {
+
+			return(TRUE);
+		}
+
+		parent = que_node_get_parent(parent);
+
+		ut_a(parent);
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Returns the number of ancestor UPDATE or DELETE nodes of a
+cascaded update/delete node.
+@return	number of consecutive ancestors whose type is QUE_NODE_UPDATE */
+static
+ulint
+row_ins_cascade_n_ancestors(
+/*========================*/
+	que_node_t*	node)	/*!< in: node in a query graph */
+{
+	que_node_t*	parent;
+	ulint		n_ancestors = 0;
+
+	parent = que_node_get_parent(node);
+
+	while (que_node_get_type(parent) == QUE_NODE_UPDATE) {
+		n_ancestors++;
+
+		parent = que_node_get_parent(parent);
+
+		ut_a(parent);
+	}
+
+	return(n_ancestors);
+}
+
+/******************************************************************//**
+Calculates the update vector node->cascade->update for a child table in
+a cascaded update.
+@return number of fields in the calculated update vector; the value
+can also be 0 if no foreign key fields changed; the returned value is
+ULINT_UNDEFINED if the update must fail: a NOT NULL child column would
+become NULL, the new value does not fit, or a BINARY column needs padding */
+static
+ulint
+row_ins_cascade_calc_update_vec(
+/*============================*/
+	upd_node_t*	node,		/*!< in: update node of the parent
+					table */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint whose
+					type is != 0 */
+	mem_heap_t*	heap)		/*!< in: memory heap to use as
+					temporary storage */
+{
+	upd_node_t*	cascade		= node->cascade_node;
+	dict_table_t*	table		= foreign->foreign_table;
+	dict_index_t*	index		= foreign->foreign_index;
+	upd_t*		update;
+	upd_field_t*	ufield;
+	dict_table_t*	parent_table;
+	dict_index_t*	parent_index;
+	upd_t*		parent_update;
+	upd_field_t*	parent_ufield;
+	ulint		n_fields_updated;
+	ulint		parent_field_no;
+	ulint		i;
+	ulint		j;
+
+	ut_a(node);
+	ut_a(foreign);
+	ut_a(cascade);
+	ut_a(table);
+	ut_a(index);
+
+	/* Calculate the appropriate update vector which will set the fields
+	in the child index record to the same value (possibly padded with
+	spaces if the column is a fixed length CHAR or FIXBINARY column) as
+	the referenced index record will get in the update. */
+
+	parent_table = node->table;
+	ut_a(parent_table == foreign->referenced_table);
+	parent_index = foreign->referenced_index;
+	parent_update = node->update;
+
+	update = cascade->update;
+
+	update->info_bits = 0;
+	update->n_fields = foreign->n_fields;
+
+	n_fields_updated = 0;
+
+	for (i = 0; i < foreign->n_fields; i++) {
+
+		parent_field_no = dict_table_get_nth_col_pos(
+			parent_table,
+			dict_index_get_nth_col_no(parent_index, i));
+
+		for (j = 0; j < parent_update->n_fields; j++) {
+			parent_ufield = parent_update->fields + j;
+
+			if (parent_ufield->field_no == parent_field_no) {
+
+				ulint			min_size;
+				const dict_col_t*	col;
+				ulint			ufield_len;
+
+				col = dict_index_get_nth_col(index, i);
+
+				/* A field in the parent index record is
+				updated. Let us make the update vector
+				field for the child table. */
+
+				ufield = update->fields + n_fields_updated;
+
+				ufield->field_no
+					= dict_table_get_nth_col_pos(
+					table, dict_col_get_no(col));
+				ufield->exp = NULL;
+
+				ufield->new_val = parent_ufield->new_val;
+				ufield_len = dfield_get_len(&ufield->new_val);
+
+				/* Clear the "external storage" flag */
+				dfield_set_len(&ufield->new_val, ufield_len);
+
+				/* Do not allow a NOT NULL column to be
+				updated as NULL */
+
+				if (dfield_is_null(&ufield->new_val)
+				    && (col->prtype & DATA_NOT_NULL)) {
+
+					return(ULINT_UNDEFINED);
+				}
+
+				/* If the new value would not fit in the
+				column, do not allow the update */
+
+				if (!dfield_is_null(&ufield->new_val)
+				    && dtype_get_at_most_n_mbchars(
+					col->prtype,
+					col->mbminlen, col->mbmaxlen,
+					col->len,
+					ufield_len,
+					dfield_get_data(&ufield->new_val))
+				    < ufield_len) {
+
+					return(ULINT_UNDEFINED);
+				}
+
+				/* If the parent column type has a different
+				length than the child column type, we may
+				need to pad with spaces the new value of the
+				child column */
+
+				min_size = dict_col_get_min_size(col);
+
+				/* Because UNIV_SQL_NULL (the marker
+				of SQL NULL values) exceeds all possible
+				values of min_size, the test below will
+				not hold for SQL NULL columns. */
+
+				if (min_size > ufield_len) {
+
+					char*		pad_start;
+					const char*	pad_end;
+					char*		padded_data
+						= mem_heap_alloc(
+							heap, min_size);
+					pad_start = padded_data + ufield_len;
+					pad_end = padded_data + min_size;
+
+					memcpy(padded_data,
+					       dfield_get_data(&ufield
+							       ->new_val),
+					       dfield_get_len(&ufield
+							      ->new_val));
+
+					switch (UNIV_EXPECT(col->mbminlen,1)) {
+					default:
+						ut_error;
+						return(ULINT_UNDEFINED);
+					case 1:
+						if (UNIV_UNLIKELY
+						    (dtype_get_charset_coll(
+							    col->prtype)
+						     == DATA_MYSQL_BINARY_CHARSET_COLL)) {
+							/* Do not pad BINARY
+							columns. */
+							return(ULINT_UNDEFINED);
+						}
+
+						/* space=0x20 */
+						memset(pad_start, 0x20,
+						       pad_end - pad_start);
+						break;
+					case 2:
+						/* space=0x0020 */
+						ut_a(!(ufield_len % 2));
+						ut_a(!(min_size % 2));
+						do {
+							*pad_start++ = 0x00;
+							*pad_start++ = 0x20;
+						} while (pad_start < pad_end);
+						break;
+					}
+
+					dfield_set_data(&ufield->new_val,
+							padded_data, min_size);
+				}
+
+				n_fields_updated++;
+			}
+		}
+	}
+
+	update->n_fields = n_fields_updated;
+
+	return(n_fields_updated);
+}
+
+/*********************************************************************//**
+Sets the detailed error message of trx to a description of foreign, which
+is built in srv_misc_tmpfile while holding srv_misc_tmpfile_mutex. */
+static
+void
+row_ins_set_detailed(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign)	/*!< in: foreign key constraint */
+{
+	mutex_enter(&srv_misc_tmpfile_mutex);
+	rewind(srv_misc_tmpfile);
+
+	if (os_file_set_eof(srv_misc_tmpfile)) {
+		ut_print_name(srv_misc_tmpfile, trx, TRUE,
+			      foreign->foreign_table_name);
+		dict_print_info_on_foreign_key_in_create_format(
+			srv_misc_tmpfile, trx, foreign, FALSE);
+		trx_set_detailed_error_from_file(trx, srv_misc_tmpfile);
+	} else {
+		trx_set_detailed_error(trx, "temp file operation failed");
+	}
+
+	mutex_exit(&srv_misc_tmpfile_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error associated with an update or a delete of a
+parent table index entry. */
+static
+void
+row_ins_foreign_report_err(
+/*=======================*/
+	const char*	errstr,		/*!< in: error string from the viewpoint
+					of the parent table */
+	que_thr_t*	thr,		/*!< in: query thread whose run_node
+					is an update node */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	const rec_t*	rec,		/*!< in: a matching index record in the
+					child table, or NULL */
+	const dtuple_t*	entry)		/*!< in: index entry in the parent
+					table, or NULL */
+{
+	FILE*	ef	= dict_foreign_err_file;
+	trx_t*	trx	= thr_get_trx(thr);
+
+	row_ins_set_detailed(trx, foreign);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(ef);
+	ut_print_timestamp(ef);
+	fputs(" Transaction:\n", ef);
+	trx_print(ef, trx, 600);
+
+	fputs("Foreign key constraint fails for table ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+	fputs(":\n", ef);
+	dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+							TRUE);
+	putc('\n', ef);
+	fputs(errstr, ef);
+	fputs(" in parent table, in index ", ef);
+	ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+	if (entry) {
+		fputs(" tuple:\n", ef);
+		dtuple_print(ef, entry);
+	}
+	fputs("\nBut in child table ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+	fputs(", in index ", ef);
+	ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+	if (rec) {
+		fputs(", there is a record:\n", ef);
+		rec_print(ef, rec, foreign->foreign_index);
+	} else {
+		fputs(", the record is not available\n", ef);
+	}
+	putc('\n', ef);
+
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Reports a foreign key error to dict_foreign_err_file when we are trying
+to add an index entry to a child table. Note that the adding may be the result
+of an update, too. */
+static
+void
+row_ins_foreign_report_add_err(
+/*===========================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint */
+	const rec_t*	rec,		/*!< in: a record in the parent table:
+					it does not match entry because we
+					have an error!  May be NULL. */
+	const dtuple_t*	entry)		/*!< in: index entry to insert in the
+					child table, or NULL */
+{
+	FILE*	ef	= dict_foreign_err_file;
+
+	row_ins_set_detailed(trx, foreign);
+
+	mutex_enter(&dict_foreign_err_mutex);
+	rewind(ef);
+	ut_print_timestamp(ef);
+	fputs(" Transaction:\n", ef);
+	trx_print(ef, trx, 600);
+	fputs("Foreign key constraint fails for table ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+	fputs(":\n", ef);
+	dict_print_info_on_foreign_key_in_create_format(ef, trx, foreign,
+							TRUE);
+	fputs("\nTrying to add in child table, in index ", ef);
+	ut_print_name(ef, trx, FALSE, foreign->foreign_index->name);
+	if (entry) {
+		fputs(" tuple:\n", ef);
+		/* TODO: DB_TRX_ID and DB_ROLL_PTR may be uninitialized.
+		It would be better to only display the user columns. */
+		dtuple_print(ef, entry);
+	}
+	fputs("\nBut in parent table ", ef);
+	ut_print_name(ef, trx, TRUE, foreign->referenced_table_name);
+	fputs(", in index ", ef);
+	ut_print_name(ef, trx, FALSE, foreign->referenced_index->name);
+	fputs(",\nthe closest match we can find is record:\n", ef);
+	if (rec && page_rec_is_supremum(rec)) {
+		/* If the cursor ended on a supremum record, it is better
+		to report the previous record in the error message, so that
+		the user gets a more descriptive error message. */
+		rec = page_rec_get_prev_const(rec);
+	}
+
+	if (rec) {
+		rec_print(ef, rec, foreign->referenced_index);
+	}
+	putc('\n', ef);
+
+	mutex_exit(&dict_foreign_err_mutex);
+}
+
+/*********************************************************************//**
+Invalidate the query cache for the given table. The name is copied, the
+'/' between the database name and the table name is replaced by a NUL
+byte in the copy, and the copy is handed to
+innobase_invalidate_query_cache() together with the length of the
+original name including its terminating NUL. */
+static
+void
+row_ins_invalidate_query_cache(
+/*===========================*/
+	que_thr_t*	thr,		/*!< in: query thread whose run_node
+					is an update node */
+	const char*	name)		/*!< in: table name prefixed with
+					database name and a '/' character */
+{
+	const ulint	full_len	= strlen(name) + 1;
+	char*		name_copy	= mem_strdupl(name, full_len);
+	char*		slash		= strchr(name_copy, '/');
+
+	/* The name must contain the database/table separator */
+	ut_a(slash);
+	*slash = '\0';
+
+	innobase_invalidate_query_cache(thr_get_trx(thr),
+					name_copy, full_len);
+
+	mem_free(name_copy);
+}
+
+/*********************************************************************//**
+Perform referential actions or checks when a parent row is deleted or updated
+and the constraint had an ON DELETE or ON UPDATE condition which was not
+RESTRICT.
+
+Outline of the steps taken below:
+1. Invalidate the query cache for the child table.
+2. If the constraint has no CASCADE or SET NULL action for the operation in
+progress (delete or update), report the error and return
+DB_ROW_IS_REFERENCED straight away.
+3. Create (on first use) and initialize the cascade update node attached to
+the running update node.
+4. Refuse a cascaded update of a table already updated by an ancestor in the
+cascade chain, and refuse cascade chains of 15 or more ancestors.
+5. Locate the clustered index record of the child row and lock it: LOCK_IX on
+the table, then LOCK_X | LOCK_REC_NOT_GAP on the record.
+6. Build the update vector (SET NULL or CASCADE update), or mark the cascade
+node as a delete.
+7. Store the cursor positions, commit mtr, run the cascaded operation, then
+restart mtr and restore pcur.
+
+Except for the two early returns of step 2, every return path commits and
+restarts mtr and restores pcur from its stored position.
+@return	DB_SUCCESS, DB_LOCK_WAIT, or error code */
+static
+ulint
+row_ins_foreign_check_on_constraint(
+/*================================*/
+	que_thr_t*	thr,		/*!< in: query thread whose run_node
+					is an update node */
+	dict_foreign_t*	foreign,	/*!< in: foreign key constraint whose
+					type is != 0 */
+	btr_pcur_t*	pcur,		/*!< in: cursor placed on a matching
+					index record in the child table */
+	dtuple_t*	entry,		/*!< in: index entry in the parent
+					table; only passed on to the error
+					reporting function */
+	mtr_t*		mtr)		/*!< in: mtr holding the latch of pcur
+					page */
+{
+	upd_node_t*	node;
+	upd_node_t*	cascade;
+	dict_table_t*	table		= foreign->foreign_table;
+	dict_index_t*	index;
+	dict_index_t*	clust_index;
+	dtuple_t*	ref;
+	mem_heap_t*	upd_vec_heap	= NULL;
+	const rec_t*	rec;
+	const rec_t*	clust_rec;
+	const buf_block_t* clust_block;
+	upd_t*		update;
+	ulint		n_to_update;
+	ulint		err;
+	ulint		i;
+	trx_t*		trx;
+	mem_heap_t*	tmp_heap	= NULL;
+
+	ut_a(thr);
+	ut_a(foreign);
+	ut_a(pcur);
+	ut_a(mtr);
+
+	trx = thr_get_trx(thr);
+
+	/* Since we are going to delete or update a row, we have to invalidate
+	the MySQL query cache for table. A deadlock of threads is not possible
+	here because the caller of this function does not hold any latches with
+	the sync0sync.h rank above the kernel mutex. The query cache mutex has
+	a rank just above the kernel mutex. */
+
+	row_ins_invalidate_query_cache(thr, table->name);
+
+	node = thr->run_node;
+
+	if (node->is_delete && 0 == (foreign->type
+				     & (DICT_FOREIGN_ON_DELETE_CASCADE
+					| DICT_FOREIGN_ON_DELETE_SET_NULL))) {
+
+		row_ins_foreign_report_err("Trying to delete",
+					   thr, foreign,
+					   btr_pcur_get_rec(pcur), entry);
+
+		return(DB_ROW_IS_REFERENCED);
+	}
+
+	if (!node->is_delete && 0 == (foreign->type
+				      & (DICT_FOREIGN_ON_UPDATE_CASCADE
+					 | DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+		/* This is an UPDATE, and the constraint defines neither
+		ON UPDATE CASCADE nor ON UPDATE SET NULL */
+
+		row_ins_foreign_report_err("Trying to update",
+					   thr, foreign,
+					   btr_pcur_get_rec(pcur), entry);
+
+		return(DB_ROW_IS_REFERENCED);
+	}
+
+	if (node->cascade_node == NULL) {
+		/* Extend our query graph by creating a child to current
+		update node. The child is used in the cascade or set null
+		operation. The child node is allocated from a heap that is
+		created here and stored in node->cascade_heap. */
+
+		node->cascade_heap = mem_heap_create(128);
+		node->cascade_node = row_create_update_node_for_mysql(
+			table, node->cascade_heap);
+		que_node_set_parent(node->cascade_node, node);
+	}
+
+	/* Initialize cascade_node to do the operation we want. Note that we
+	use the SAME cascade node to do all foreign key operations of the
+	SQL DELETE: the table of the cascade node may change if there are
+	several child tables to the table where the delete is done! */
+
+	cascade = node->cascade_node;
+
+	cascade->table = table;
+
+	cascade->foreign = foreign;
+
+	if (node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_DELETE_CASCADE)) {
+		cascade->is_delete = TRUE;
+	} else {
+		cascade->is_delete = FALSE;
+
+		if (foreign->n_fields > cascade->update_n_fields) {
+			/* We have to make the update vector longer: a new
+			vector is allocated from node->cascade_heap */
+
+			cascade->update = upd_create(foreign->n_fields,
+						     node->cascade_heap);
+			cascade->update_n_fields = foreign->n_fields;
+		}
+	}
+
+	/* We do not allow cyclic cascaded updating (DELETE is allowed,
+	but not UPDATE) of the same table, as this can lead to an infinite
+	cycle. Check that we are not updating the same table which is
+	already being modified in this cascade chain. We have to check
+	this also because the modification of the indexes of a 'parent'
+	table may still be incomplete, and we must avoid seeing the indexes
+	of the parent table in an inconsistent state! */
+
+	if (!cascade->is_delete
+	    && row_ins_cascade_ancestor_updates_table(cascade, table)) {
+
+		/* We do not know if this would break foreign key
+		constraints, but play safe and return an error */
+
+		err = DB_ROW_IS_REFERENCED;
+
+		row_ins_foreign_report_err(
+			"Trying an update, possibly causing a cyclic"
+			" cascaded update\n"
+			"in the child table,", thr, foreign,
+			btr_pcur_get_rec(pcur), entry);
+
+		goto nonstandard_exit_func;
+	}
+
+	if (row_ins_cascade_n_ancestors(cascade) >= 15) {
+		err = DB_ROW_IS_REFERENCED;	/* cascade chain too deep */
+
+		row_ins_foreign_report_err(
+			"Trying a too deep cascaded delete or update\n",
+			thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+		goto nonstandard_exit_func;
+	}
+
+	index = btr_pcur_get_btr_cur(pcur)->index;
+
+	ut_a(index == foreign->foreign_index);
+
+	rec = btr_pcur_get_rec(pcur);
+
+	if (dict_index_is_clust(index)) {
+		/* pcur is already positioned in the clustered index of
+		the child table */
+
+		clust_index = index;
+		clust_rec = rec;
+		clust_block = btr_pcur_get_block(pcur);
+	} else {
+		/* We have to look for the record in the clustered index
+		in the child table: build a row reference from the
+		secondary index record (in tmp_heap) and open
+		cascade->pcur on it within the same mtr */
+
+		clust_index = dict_table_get_first_index(table);
+
+		tmp_heap = mem_heap_create(256);
+
+		ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec,
+					tmp_heap);
+		btr_pcur_open_with_no_init(clust_index, ref,
+					   PAGE_CUR_LE, BTR_SEARCH_LEAF,
+					   cascade->pcur, 0, mtr);
+
+		clust_rec = btr_pcur_get_rec(cascade->pcur);
+		clust_block = btr_pcur_get_block(cascade->pcur);
+
+		if (!page_rec_is_user_rec(clust_rec)
+		    || btr_pcur_get_low_match(cascade->pcur)
+		    < dict_index_get_n_unique(clust_index)) {
+
+			fputs("InnoDB: error in cascade of a foreign key op\n"
+			      "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+
+			fputs("\n"
+			      "InnoDB: record ", stderr);
+			rec_print(stderr, rec, index);
+			fputs("\n"
+			      "InnoDB: clustered record ", stderr);
+			rec_print(stderr, clust_rec, clust_index);
+			fputs("\n"
+			      "InnoDB: Submit a detailed bug report to"
+			      " http://bugs.mysql.com\n", stderr);
+
+			err = DB_SUCCESS;	/* only logged to stderr above */
+
+			goto nonstandard_exit_func;
+		}
+	}
+
+	/* Set an X-lock on the row to delete or update in the child table.
+	An intention-exclusive (LOCK_IX) lock on the table is requested
+	first. */
+
+	err = lock_table(0, table, LOCK_IX, thr);
+
+	if (err == DB_SUCCESS) {
+		/* Here it suffices to use a LOCK_REC_NOT_GAP type lock;
+		we already have a normal shared lock on the appropriate
+		gap if the search criterion was not unique */
+
+		err = lock_clust_rec_read_check_and_lock_alt(
+			0, clust_block, clust_rec, clust_index,
+			LOCK_X, LOCK_REC_NOT_GAP, thr);
+	}
+
+	if (err != DB_SUCCESS) {
+
+		goto nonstandard_exit_func;
+	}
+
+	if (rec_get_deleted_flag(clust_rec, dict_table_is_comp(table))) {
+		/* This can happen if there is a circular reference of
+		rows such that cascading delete comes to delete a row
+		already in the process of being delete marked */
+		err = DB_SUCCESS;
+
+		goto nonstandard_exit_func;
+	}
+
+	if ((node->is_delete
+	     && (foreign->type & DICT_FOREIGN_ON_DELETE_SET_NULL))
+	    || (!node->is_delete
+		&& (foreign->type & DICT_FOREIGN_ON_UPDATE_SET_NULL))) {
+
+		/* Build the appropriate update vector which sets
+		foreign->n_fields first fields in rec to SQL NULL.
+		The column of the i'th field of the foreign index is
+		mapped to its position in the table to obtain field_no. */
+
+		update = cascade->update;
+
+		update->info_bits = 0;
+		update->n_fields = foreign->n_fields;
+
+		for (i = 0; i < foreign->n_fields; i++) {
+			upd_field_t*	ufield = &update->fields[i];
+
+			ufield->field_no = dict_table_get_nth_col_pos(
+				table,
+				dict_index_get_nth_col_no(index, i));
+			ufield->orig_len = 0;
+			ufield->exp = NULL;
+			dfield_set_null(&ufield->new_val);
+		}
+	}
+
+	if (!node->is_delete
+	    && (foreign->type & DICT_FOREIGN_ON_UPDATE_CASCADE)) {
+
+		/* Build the appropriate update vector which sets changing
+		foreign->n_fields first fields in rec to new values.
+		upd_vec_heap is passed to the calculation and is freed
+		on every exit path below. */
+
+		upd_vec_heap = mem_heap_create(256);
+
+		n_to_update = row_ins_cascade_calc_update_vec(node, foreign,
+							      upd_vec_heap);
+		if (n_to_update == ULINT_UNDEFINED) {
+			err = DB_ROW_IS_REFERENCED;
+
+			row_ins_foreign_report_err(
+				"Trying a cascaded update where the"
+				" updated value in the child\n"
+				"table would not fit in the length"
+				" of the column, or the value would\n"
+				"be NULL and the column is"
+				" declared as not NULL in the child table,",
+				thr, foreign, btr_pcur_get_rec(pcur), entry);
+
+			goto nonstandard_exit_func;
+		}
+
+		if (cascade->update->n_fields == 0) {
+
+			/* The update does not change any columns referred
+			to in this foreign key constraint: no need to do
+			anything */
+
+			err = DB_SUCCESS;
+
+			goto nonstandard_exit_func;
+		}
+	}
+
+	/* Store pcur position and initialize or store the cascade node
+	pcur stored position. When pcur is itself on the clustered index,
+	cascade->pcur was never opened above, so the stored position of
+	pcur is copied into it instead. */
+
+	btr_pcur_store_position(pcur, mtr);
+
+	if (index == clust_index) {
+		btr_pcur_copy_stored_position(cascade->pcur, pcur);
+	} else {
+		btr_pcur_store_position(cascade->pcur, mtr);
+	}
+
+	mtr_commit(mtr);
+
+	ut_a(cascade->pcur->rel_pos == BTR_PCUR_ON);
+
+	cascade->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	err = row_update_cascade_for_mysql(thr, cascade,
+					   foreign->foreign_table);
+
+	if (foreign->foreign_table->n_foreign_key_checks_running == 0) {
+		fprintf(stderr,		/* diagnostic only; err is unchanged */
+			"InnoDB: error: table %s has the counter 0"
+			" though there is\n"
+			"InnoDB: a FOREIGN KEY check running on it.\n",
+			foreign->foreign_table->name);
+	}
+
+	/* Release the data dictionary latch for a while, so that we do not
+	starve other threads from doing CREATE TABLE etc. if we have a huge
+	cascaded operation running. The counter n_foreign_key_checks_running
+	will prevent other users from dropping or ALTERing the table when we
+	release the latch. */
+
+	row_mysql_unfreeze_data_dictionary(thr_get_trx(thr));
+	row_mysql_freeze_data_dictionary(thr_get_trx(thr));
+
+	mtr_start(mtr);
+
+	/* Restore pcur position in the restarted mtr */
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+		mem_heap_free(upd_vec_heap);
+	}
+
+	return(err);
+
+nonstandard_exit_func:
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	if (upd_vec_heap) {
+		mem_heap_free(upd_vec_heap);
+	}
+
+	btr_pcur_store_position(pcur, mtr);
+
+	mtr_commit(mtr);
+	mtr_start(mtr);
+
+	btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, mtr);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Requests a shared (LOCK_S) lock on a record, going through the clustered
+or the secondary index record locking function depending on the type of
+the index. Used in locking possible duplicate key records and also in
+checking foreign key constraints.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_ins_set_shared_rec_lock(
+/*========================*/
+	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP type lock */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record */
+	dict_index_t*		index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (!dict_index_is_clust(index)) {
+
+		return(lock_sec_rec_read_check_and_lock(
+			       0, block, rec, index, offsets,
+			       LOCK_S, type, thr));
+	}
+
+	return(lock_clust_rec_read_check_and_lock(
+		       0, block, rec, index, offsets, LOCK_S, type, thr));
+}
+
+/*********************************************************************//**
+Requests an exclusive (LOCK_X) lock on a record, going through the
+clustered or the secondary index record locking function depending on the
+type of the index. Used in locking possible duplicate key records.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_ins_set_exclusive_rec_lock(
+/*===========================*/
+	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP type lock */
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record */
+	dict_index_t*		index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (!dict_index_is_clust(index)) {
+
+		return(lock_sec_rec_read_check_and_lock(
+			       0, block, rec, index, offsets,
+			       LOCK_X, type, thr));
+	}
+
+	return(lock_clust_rec_read_check_and_lock(
+		       0, block, rec, index, offsets, LOCK_X, type, thr));
+}
+
+/***************************************************************//**
+Checks if foreign key constraint fails for an index entry. Sets shared locks
+which lock either the success or the failure of the constraint. NOTE that
+the caller must have a shared latch on dict_operation_lock.
+
+The check is skipped (DB_SUCCESS is returned) when the transaction has
+foreign key checks switched off, when any of the foreign key fields of entry
+is SQL NULL, or when the running node is a cascaded update defined by this
+very constraint.
+
+While the index is scanned, the n_fields_cmp of entry is temporarily set to
+foreign->n_fields; the old value is restored before returning. If a lock
+wait is needed, the thread is suspended and the whole check is rerun once
+the wait ends successfully.
+@return DB_SUCCESS, DB_NO_REFERENCED_ROW, DB_ROW_IS_REFERENCED,
+DB_CORRUPTION (if srv_pass_corrupt_table is set and the cursor has no
+block), DB_FOREIGN_DUPLICATE_KEY, or another error code from locking or
+from the cascaded operation */
+UNIV_INTERN
+ulint
+row_ins_check_foreign_constraint(
+/*=============================*/
+	ibool		check_ref,/*!< in: TRUE if we want to check that
+				the referenced table is ok, FALSE if we
+				want to check the foreign key table */
+	dict_foreign_t*	foreign,/*!< in: foreign constraint; NOTE that the
+				tables mentioned in it must be in the
+				dictionary cache if they exist at all */
+	dict_table_t*	table,	/*!< in: if check_ref is TRUE, then the foreign
+				table, else the referenced table */
+	dtuple_t*	entry,	/*!< in: index entry for index */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	upd_node_t*	upd_node;
+	dict_table_t*	check_table;
+	dict_index_t*	check_index;
+	ulint		n_fields_cmp;
+	btr_pcur_t	pcur;
+	int		cmp;
+	ulint		err;
+	ulint		i;
+	mtr_t		mtr;
+	trx_t*		trx		= thr_get_trx(thr);
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+run_again:
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	err = DB_SUCCESS;
+
+	if (trx->check_foreigns == FALSE) {
+		/* The user has suppressed foreign key checks currently for
+		this session */
+		goto exit_func;
+	}
+
+	/* If any of the foreign key fields in entry is SQL NULL, we
+	suppress the foreign key check: this is compatible with Oracle,
+	for example */
+
+	for (i = 0; i < foreign->n_fields; i++) {
+		if (UNIV_SQL_NULL == dfield_get_len(
+			    dtuple_get_nth_field(entry, i))) {
+
+			goto exit_func;
+		}
+	}
+
+	if (que_node_get_type(thr->run_node) == QUE_NODE_UPDATE) {
+		upd_node = thr->run_node;
+
+		if (!(upd_node->is_delete) && upd_node->foreign == foreign) {
+			/* If a cascaded update is done as defined by a
+			foreign key constraint, do not check that
+			constraint for the child row. In ON UPDATE CASCADE
+			the update of the parent row is only half done when
+			we come here: if we would check the constraint here
+			for the child row it would fail.
+
+			A QUESTION remains: if in the child table there are
+			several constraints which refer to the same parent
+			table, we should merge all updates to the child as
+			one update? And the updates can be contradictory!
+			Currently we just perform the update associated
+			with each foreign key constraint, one after
+			another, and the user has problems predicting in
+			which order they are performed. */
+
+			goto exit_func;
+		}
+	}
+
+	if (check_ref) {
+		check_table = foreign->referenced_table;
+		check_index = foreign->referenced_index;
+	} else {
+		check_table = foreign->foreign_table;
+		check_index = foreign->foreign_index;
+	}
+
+	if (check_table == NULL || check_table->ibd_file_missing) {
+		/* The table to check against is not available. This is an
+		error only when we are adding to the child table; otherwise
+		err keeps the value DB_SUCCESS. */
+
+		if (check_ref) {
+			FILE*	ef = dict_foreign_err_file;
+
+			row_ins_set_detailed(trx, foreign);
+
+			mutex_enter(&dict_foreign_err_mutex);
+			rewind(ef);
+			ut_print_timestamp(ef);
+			fputs(" Transaction:\n", ef);
+			trx_print(ef, trx, 600);
+			fputs("Foreign key constraint fails for table ", ef);
+			ut_print_name(ef, trx, TRUE,
+				      foreign->foreign_table_name);
+			fputs(":\n", ef);
+			dict_print_info_on_foreign_key_in_create_format(
+				ef, trx, foreign, TRUE);
+			fputs("\nTrying to add to index ", ef);
+			ut_print_name(ef, trx, FALSE,
+				      foreign->foreign_index->name);
+			fputs(" tuple:\n", ef);
+			dtuple_print(ef, entry);
+			fputs("\nBut the parent table ", ef);
+			ut_print_name(ef, trx, TRUE,
+				      foreign->referenced_table_name);
+			fputs("\nor its .ibd file does"
+			      " not currently exist!\n", ef);
+			mutex_exit(&dict_foreign_err_mutex);
+
+			err = DB_NO_REFERENCED_ROW;
+		}
+
+		goto exit_func;
+	}
+
+	ut_a(check_table);
+	ut_a(check_index);
+
+	if (check_table != table) {
+		/* We already have a LOCK_IX on table, but not necessarily
+		on check_table */
+
+		err = lock_table(0, check_table, LOCK_IS, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto do_possible_lock_wait;
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/* Store old value on n_fields_cmp */
+
+	n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+	dtuple_set_n_fields_cmp(entry, foreign->n_fields);
+
+	btr_pcur_open(check_index, entry, PAGE_CUR_GE,
+		      BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	/* Scan index records and check if there is a matching record */
+
+	do {
+		const rec_t*		rec = btr_pcur_get_rec(&pcur);
+		const buf_block_t*	block = btr_pcur_get_block(&pcur);
+
+		if (srv_pass_corrupt_table && !block) {
+			/* Leave through end_scan instead of breaking out
+			of the loop: the code that follows the loop assigns
+			err unconditionally, and would overwrite
+			DB_CORRUPTION with DB_NO_REFERENCED_ROW or
+			DB_SUCCESS. */
+			err = DB_CORRUPTION;
+			goto end_scan;
+		}
+		ut_a(block);
+
+		if (page_rec_is_infimum(rec)) {
+
+			continue;
+		}
+
+		offsets = rec_get_offsets(rec, check_index,
+					  offsets, ULINT_UNDEFINED, &heap);
+
+		if (page_rec_is_supremum(rec)) {
+
+			err = row_ins_set_shared_rec_lock(LOCK_ORDINARY, block,
+							  rec, check_index,
+							  offsets, thr);
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+			case DB_SUCCESS:
+				continue;
+			default:
+				goto end_scan;
+			}
+		}
+
+		cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+		if (cmp == 0) {
+			if (rec_get_deleted_flag(rec,
+						 rec_offs_comp(offsets))) {
+				/* A delete-marked match: lock it and go on
+				scanning for another match */
+				err = row_ins_set_shared_rec_lock(
+					LOCK_ORDINARY, block,
+					rec, check_index, offsets, thr);
+				switch (err) {
+				case DB_SUCCESS_LOCKED_REC:
+				case DB_SUCCESS:
+					break;
+				default:
+					goto end_scan;
+				}
+			} else {
+				/* Found a matching record. Lock only
+				a record because we can allow inserts
+				into gaps */
+
+				err = row_ins_set_shared_rec_lock(
+					LOCK_REC_NOT_GAP, block,
+					rec, check_index, offsets, thr);
+
+				switch (err) {
+				case DB_SUCCESS_LOCKED_REC:
+				case DB_SUCCESS:
+					break;
+				default:
+					goto end_scan;
+				}
+
+				if (check_ref) {
+					err = DB_SUCCESS;
+
+					goto end_scan;
+				} else if (foreign->type != 0) {
+					/* There is an ON UPDATE or ON DELETE
+					condition: check them in a separate
+					function */
+
+					err = row_ins_foreign_check_on_constraint(
+						thr, foreign, &pcur, entry,
+						&mtr);
+					if (err != DB_SUCCESS) {
+						/* Since reporting a plain
+						"duplicate key" error
+						message to the user in
+						cases where a long CASCADE
+						operation would lead to a
+						duplicate key in some
+						other table is very
+						confusing, map duplicate
+						key errors resulting from
+						FK constraints to a
+						separate error code. */
+
+						if (err == DB_DUPLICATE_KEY) {
+							err = DB_FOREIGN_DUPLICATE_KEY;
+						}
+
+						goto end_scan;
+					}
+
+					/* row_ins_foreign_check_on_constraint
+					may have repositioned pcur on a
+					different block */
+					block = btr_pcur_get_block(&pcur);
+				} else {
+					row_ins_foreign_report_err(
+						"Trying to delete or update",
+						thr, foreign, rec, entry);
+
+					err = DB_ROW_IS_REFERENCED;
+					goto end_scan;
+				}
+			}
+		} else {
+			/* The cursor was opened with PAGE_CUR_GE, so a
+			record that does not match must be greater than
+			entry: the scan ends here */
+			ut_a(cmp < 0);
+
+			err = row_ins_set_shared_rec_lock(
+				LOCK_GAP, block,
+				rec, check_index, offsets, thr);
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+			case DB_SUCCESS:
+				if (check_ref) {
+					err = DB_NO_REFERENCED_ROW;
+					row_ins_foreign_report_add_err(
+						trx, foreign, rec, entry);
+				} else {
+					err = DB_SUCCESS;
+				}
+			}
+
+			goto end_scan;
+		}
+	} while (btr_pcur_move_to_next(&pcur, &mtr));
+
+	/* The cursor could not be moved further and no live matching
+	record was found */
+
+	if (check_ref) {
+		row_ins_foreign_report_add_err(
+			trx, foreign, btr_pcur_get_rec(&pcur), entry);
+		err = DB_NO_REFERENCED_ROW;
+	} else {
+		err = DB_SUCCESS;
+	}
+
+end_scan:
+	btr_pcur_close(&pcur);
+
+	mtr_commit(&mtr);
+
+	/* Restore old value */
+	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+do_possible_lock_wait:
+	if (err == DB_LOCK_WAIT) {
+		trx->error_state = err;
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+
+		if (trx->error_state == DB_SUCCESS) {
+
+			goto run_again;
+		}
+
+		err = trx->error_state;
+	}
+
+exit_func:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/***************************************************************//**
+Checks every foreign key constraint of the table whose foreign index is
+the given index against an index entry. If the index is not mentioned in
+any constraint, this function does nothing. Otherwise it searches the
+indexes of the referenced tables and sets shared locks which lock either
+the success or the failure of a constraint. The check stops at the first
+constraint that does not return DB_SUCCESS.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_ins_check_foreign_constraints(
+/*==============================*/
+	dict_table_t*	table,	/*!< in: table */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry for index */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	dict_foreign_t*	fk;
+	trx_t*		trx		= thr_get_trx(thr);
+	ibool		froze_dict	= FALSE;
+
+	for (fk = UT_LIST_GET_FIRST(table->foreign_list);
+	     fk != NULL;
+	     fk = UT_LIST_GET_NEXT(foreign_list, fk)) {
+
+		ulint	err;
+
+		if (fk->foreign_index != index) {
+			/* This constraint is on some other index */
+			continue;
+		}
+
+		if (fk->referenced_table == NULL) {
+			dict_table_get(fk->referenced_table_name, FALSE);
+		}
+
+		if (trx->dict_operation_lock_mode == 0) {
+			froze_dict = TRUE;
+
+			row_mysql_freeze_data_dictionary(trx);
+		}
+
+		if (fk->referenced_table) {
+			mutex_enter(&(dict_sys->mutex));
+
+			(fk->referenced_table
+			 ->n_foreign_key_checks_running)++;
+
+			mutex_exit(&(dict_sys->mutex));
+		}
+
+		/* NOTE that if the thread ends up waiting for a lock
+		we will release dict_operation_lock temporarily!
+		But the counter on the table protects the referenced
+		table from being dropped while the check is running. */
+
+		err = row_ins_check_foreign_constraint(
+			TRUE, fk, table, entry, thr);
+
+		if (fk->referenced_table) {
+			mutex_enter(&(dict_sys->mutex));
+
+			ut_a(fk->referenced_table
+			     ->n_foreign_key_checks_running > 0);
+			(fk->referenced_table
+			 ->n_foreign_key_checks_running)--;
+
+			mutex_exit(&(dict_sys->mutex));
+		}
+
+		if (froze_dict) {
+			row_mysql_unfreeze_data_dictionary(trx);
+		}
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+	}
+
+	return(DB_SUCCESS);
+}
+
+/***************************************************************//**
+Decides whether inserting the index entry would violate uniqueness with
+respect to the given record. That is the case when at least n_unique
+fields match, the record is not delete-marked and, for a secondary index,
+none of the first n_unique fields of the entry is SQL NULL.
+@return	TRUE if error */
+static
+ibool
+row_ins_dupl_error_with_rec(
+/*========================*/
+	const rec_t*	rec,	/*!< in: user record; NOTE that we assume
+				that the caller already has a record lock on
+				the record! */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ulint	n_matched_fields	= 0;
+	ulint	n_matched_bytes		= 0;
+	ulint	n_unique;
+	ulint	pos;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	n_unique = dict_index_get_n_unique(index);
+
+	cmp_dtuple_rec_with_match(entry, rec, offsets,
+				  &n_matched_fields, &n_matched_bytes);
+
+	if (n_matched_fields < n_unique) {
+		/* The unique prefix differs: no conflict */
+
+		return(FALSE);
+	}
+
+	if (!dict_index_is_clust(index)) {
+		/* In a unique secondary index, equal key values are
+		allowed if they contain SQL NULLs */
+
+		for (pos = 0; pos < n_unique; pos++) {
+			if (dfield_get_len(dtuple_get_nth_field(entry, pos))
+			    == UNIV_SQL_NULL) {
+
+				return(FALSE);
+			}
+		}
+	}
+
+	/* A delete-marked record does not count as a duplicate */
+
+	if (rec_get_deleted_flag(rec, rec_offs_comp(offsets))) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/***************************************************************//**
+Scans a unique non-clustered index at a given index entry to determine
+whether a uniqueness violation has occurred for the key value of the entry.
+Sets LOCK_ORDINARY locks on the scanned records: exclusive locks if
+TRX_DUP_IGNORE is set in the duplicates field of the transaction, shared
+locks otherwise. During the scan the n_fields_cmp of entry is temporarily
+set to the number of unique fields of the index; the old value is restored
+before returning, except on the early return taken when the entry contains
+an SQL NULL, which happens before n_fields_cmp is touched. If the record
+lock request returns anything other than DB_SUCCESS or
+DB_SUCCESS_LOCKED_REC, the scan stops and that value is returned.
+@return	DB_SUCCESS, DB_DUPLICATE_KEY, or DB_LOCK_WAIT */
+static
+ulint
+row_ins_scan_sec_index_for_duplicate(
+/*=================================*/
+	dict_index_t*	index,	/*!< in: non-clustered unique index */
+	dtuple_t*	entry,	/*!< in: index entry */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint		n_unique;
+	ulint		i;
+	int		cmp;
+	ulint		n_fields_cmp;
+	btr_pcur_t	pcur;
+	ulint		err		= DB_SUCCESS;
+	unsigned	allow_duplicates;
+	mtr_t		mtr;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	n_unique = dict_index_get_n_unique(index);
+
+	/* If the secondary index is unique, but one of the fields in the
+	n_unique first fields is NULL, a unique key violation cannot occur,
+	since we define NULL != NULL in this case */
+
+	for (i = 0; i < n_unique; i++) {
+		if (UNIV_SQL_NULL == dfield_get_len(
+			    dtuple_get_nth_field(entry, i))) {
+
+			return(DB_SUCCESS);
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/* Store old value on n_fields_cmp */
+
+	n_fields_cmp = dtuple_get_n_fields_cmp(entry);
+
+	dtuple_set_n_fields_cmp(entry, dict_index_get_n_unique(index));
+
+	btr_pcur_open(index, entry, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur, &mtr);
+
+	allow_duplicates = thr_get_trx(thr)->duplicates & TRX_DUP_IGNORE;
+
+	/* Scan index records and check if there is a duplicate. The scan
+	ends at the first record that does not compare equal to entry, at
+	the first duplicate, on a lock request that does not succeed, or
+	when the cursor cannot be moved further. */
+
+	do {
+		const rec_t*		rec	= btr_pcur_get_rec(&pcur);
+		const buf_block_t*	block	= btr_pcur_get_block(&pcur);
+
+		if (page_rec_is_infimum(rec)) {
+
+			continue;	/* not locked; move to the next record */
+		}
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		if (allow_duplicates) {
+
+			/* If the SQL-query will update or replace
+			duplicate key we will take X-lock for
+			duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+			INSERT ON DUPLICATE KEY UPDATE). */
+
+			err = row_ins_set_exclusive_rec_lock(
+				LOCK_ORDINARY, block,
+				rec, index, offsets, thr);
+		} else {
+
+			err = row_ins_set_shared_rec_lock(
+				LOCK_ORDINARY, block,
+				rec, index, offsets, thr);
+		}
+
+		switch (err) {
+		case DB_SUCCESS_LOCKED_REC:
+			err = DB_SUCCESS;	/* fall through */
+		case DB_SUCCESS:
+			break;
+		default:
+			goto end_scan;
+		}
+
+		if (page_rec_is_supremum(rec)) {
+
+			continue;	/* locked above; nothing to compare */
+		}
+
+		cmp = cmp_dtuple_rec(entry, rec, offsets);
+
+		if (cmp == 0) {
+			if (row_ins_dupl_error_with_rec(rec, entry,
+							index, offsets)) {
+				err = DB_DUPLICATE_KEY;
+
+				thr_get_trx(thr)->error_info = index;
+
+				goto end_scan;
+			}
+		} else {
+			ut_a(cmp < 0);
+			goto end_scan;
+		}
+	} while (btr_pcur_move_to_next(&pcur, &mtr));
+
+end_scan:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	mtr_commit(&mtr);
+
+	/* Restore old value */
+	dtuple_set_n_fields_cmp(entry, n_fields_cmp);
+
+	return(err);
+}
+
+/***************************************************************//**
+Checks if a unique key violation error would occur at an index entry
+insert. Sets shared locks on possible duplicate records. Works only
+for a clustered index!
+@return DB_SUCCESS if no error, DB_DUPLICATE_KEY if error,
+DB_LOCK_WAIT if we have to wait for a lock on a possible duplicate
+record */
+static
+ulint
+row_ins_duplicate_error_in_clust(
+/*=============================*/
+	btr_cur_t*	cursor,	/*!< in: B-tree cursor */
+	dtuple_t*	entry,	/*!< in: entry to insert */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	err;
+	rec_t*	rec;
+	ulint	n_unique;
+	trx_t*	trx		= thr_get_trx(thr);
+	mem_heap_t*heap		= NULL;
+	ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*	offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	UT_NOT_USED(mtr);
+
+	ut_a(dict_index_is_clust(cursor->index));
+	ut_ad(dict_index_is_unique(cursor->index));
+
+	/* NOTE: For unique non-clustered indexes there may be any number
+	of delete marked records with the same value for the non-clustered
+	index key (remember multiversioning), and which differ only in
+	the row refererence part of the index record, containing the
+	clustered index key fields. For such a secondary index record,
+	to avoid race condition, we must FIRST do the insertion and after
+	that check that the uniqueness condition is not breached! */
+
+	/* NOTE: A problem is that in the B-tree node pointers on an
+	upper level may match more to the entry than the actual existing
+	user records on the leaf level. So, even if low_match would suggest
+	that a duplicate key violation may occur, this may not be the case. */
+
+	n_unique = dict_index_get_n_unique(cursor->index);
+
+	if (cursor->low_match >= n_unique) {
+
+		rec = btr_cur_get_rec(cursor);
+
+		if (!page_rec_is_infimum(rec)) {
+			offsets = rec_get_offsets(rec, cursor->index, offsets,
+						  ULINT_UNDEFINED, &heap);
+
+			/* We set a lock on the possible duplicate: this
+			is needed in logical logging of MySQL to make
+			sure that in roll-forward we get the same duplicate
+			errors as in original execution */
+
+			if (trx->duplicates & TRX_DUP_IGNORE) {
+
+				/* If the SQL-query will update or replace
+				duplicate key we will take X-lock for
+				duplicates ( REPLACE, LOAD DATAFILE REPLACE,
+				INSERT ON DUPLICATE KEY UPDATE). */
+
+				err = row_ins_set_exclusive_rec_lock(
+					LOCK_REC_NOT_GAP,
+					btr_cur_get_block(cursor),
+					rec, cursor->index, offsets, thr);
+			} else {
+
+				err = row_ins_set_shared_rec_lock(
+					LOCK_REC_NOT_GAP,
+					btr_cur_get_block(cursor), rec,
+					cursor->index, offsets, thr);
+			}
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+			case DB_SUCCESS:
+				break;
+			default:
+				goto func_exit;
+			}
+
+			if (row_ins_dupl_error_with_rec(
+				    rec, entry, cursor->index, offsets)) {
+				trx->error_info = cursor->index;
+				err = DB_DUPLICATE_KEY;
+				goto func_exit;
+			}
+		}
+	}
+
+	if (cursor->up_match >= n_unique) {
+
+		rec = page_rec_get_next(btr_cur_get_rec(cursor));
+
+		if (!page_rec_is_supremum(rec)) {
+			offsets = rec_get_offsets(rec, cursor->index, offsets,
+						  ULINT_UNDEFINED, &heap);
+
+			if (trx->duplicates & TRX_DUP_IGNORE) {
+
+				/* If the SQL-query will update or replace a
+				duplicate key, we will take an X-lock on the
+				duplicates (REPLACE, LOAD DATA ... REPLACE,
+				INSERT ... ON DUPLICATE KEY UPDATE). */
+
+				err = row_ins_set_exclusive_rec_lock(
+					LOCK_REC_NOT_GAP,
+					btr_cur_get_block(cursor),
+					rec, cursor->index, offsets, thr);
+			} else {
+
+				err = row_ins_set_shared_rec_lock(
+					LOCK_REC_NOT_GAP,
+					btr_cur_get_block(cursor),
+					rec, cursor->index, offsets, thr);
+			}
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+			case DB_SUCCESS:
+				break;
+			default:
+				goto func_exit;
+			}
+
+			if (row_ins_dupl_error_with_rec(
+				    rec, entry, cursor->index, offsets)) {
+				trx->error_info = cursor->index;
+				err = DB_DUPLICATE_KEY;
+				goto func_exit;
+			}
+		}
+
+		ut_a(!dict_index_is_clust(cursor->index));
+		/* This should never happen */
+	}
+
+	err = DB_SUCCESS;
+func_exit:
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/***************************************************************//**
+Checks if an index entry has long enough common prefix with an existing
+record so that the intended insert of the entry must be changed to a modify of
+the existing record. In the case of a clustered index, the prefix must be
+n_unique fields long, and in the case of a secondary index, all fields must be
+equal.
+@return 0 if no update, ROW_INS_PREV if previous should be updated;
+currently we do the search so that only the low_match record can match
+enough to the search tuple, not the next record */
+UNIV_INLINE
+ulint
+row_ins_must_modify(
+/*================*/
+	btr_cur_t*	cursor)	/*!< in: B-tree cursor */
+{
+	ulint	enough_match;
+	rec_t*	rec;
+
+	/* NOTE: (compare to the note in row_ins_duplicate_error) Because node
+	pointers on upper levels of the B-tree may match more to entry than
+	to actual user records on the leaf level, we have to check if the
+	candidate record is actually a user record. In a clustered index
+	node pointers contain index->n_unique first fields, and in the case
+	of a secondary index, all fields of the index. */
+
+	enough_match = dict_index_get_n_unique_in_tree(cursor->index);
+
+	if (cursor->low_match >= enough_match) {
+
+		rec = btr_cur_get_rec(cursor);
+
+		if (!page_rec_is_infimum(rec)) {
+
+			return(ROW_INS_PREV);
+		}
+	}
+
+	return(0);
+}
+
+/***************************************************************//**
+Tries to insert an index entry to an index. If the index is clustered
+and a record with the same unique key is found, the other record is
+necessarily marked deleted by a committed transaction, or a unique key
+violation error occurs. The delete marked record is then updated to an
+existing record, and we must write an undo log record on the delete
+marked record. If the index is secondary, and a record with exactly the
+same fields is found, the other record is necessarily marked deleted.
+It is then unmarked. Otherwise, the entry is just inserted to the index.
+@return DB_SUCCESS, DB_LOCK_WAIT, DB_FAIL if pessimistic retry needed,
+or error code */
+static
+ulint
+row_ins_index_entry_low(
+/*====================*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	btr_cur_t	cursor;
+	ulint		ignore_sec_unique	= 0;
+	ulint		modify = 0; /* remove warning */
+	rec_t*		insert_rec;
+	rec_t*		rec;
+	ulint		err;
+	ulint		n_unique;
+	big_rec_t*	big_rec			= NULL;
+	mtr_t		mtr;
+	mem_heap_t*	heap			= NULL;
+
+	log_free_check();
+
+	mtr_start(&mtr);
+
+	cursor.thr = thr;
+
+	/* Note that we use PAGE_CUR_LE as the search mode, because then
+	the function will return in both low_match and up_match of the
+	cursor sensible values */
+
+	if (!(thr_get_trx(thr)->check_unique_secondary)) {
+		ignore_sec_unique = BTR_IGNORE_SEC_UNIQUE;
+	}
+
+	btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+				    mode | BTR_INSERT | ignore_sec_unique,
+				    &cursor, 0, __FILE__, __LINE__, &mtr);
+
+	if (cursor.flag == BTR_CUR_INSERT_TO_IBUF) {
+		/* The insertion was made to the insert buffer already during
+		the search: we are done */
+
+		err = DB_SUCCESS;
+
+		goto function_exit;
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		page_t*	page = btr_cur_get_page(&cursor);
+		rec_t*	first_rec = page_rec_get_next(
+			page_get_infimum_rec(page));
+
+		ut_ad(page_rec_is_supremum(first_rec)
+		      || rec_get_n_fields(first_rec, index)
+		      == dtuple_get_n_fields(entry));
+	}
+#endif
+
+	n_unique = dict_index_get_n_unique(index);
+
+	if (dict_index_is_unique(index) && (cursor.up_match >= n_unique
+					    || cursor.low_match >= n_unique)) {
+
+		if (dict_index_is_clust(index)) {
+			/* Note that the following may return also
+			DB_LOCK_WAIT */
+
+			err = row_ins_duplicate_error_in_clust(
+				&cursor, entry, thr, &mtr);
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+		} else {
+			mtr_commit(&mtr);
+			err = row_ins_scan_sec_index_for_duplicate(
+				index, entry, thr);
+			mtr_start(&mtr);
+
+			if (err != DB_SUCCESS) {
+
+				goto function_exit;
+			}
+
+			/* We did not find a duplicate and we have now
+			locked with s-locks the necessary records to
+			prevent any insertion of a duplicate by another
+			transaction. Let us now reposition the cursor and
+			continue the insertion. */
+
+			btr_cur_search_to_nth_level(index, 0, entry,
+						    PAGE_CUR_LE,
+						    mode | BTR_INSERT,
+						    &cursor, 0,
+						    __FILE__, __LINE__, &mtr);
+		}
+	}
+
+	modify = row_ins_must_modify(&cursor);
+
+	if (modify != 0) {
+		/* There is already an index entry with a long enough common
+		prefix, we must convert the insert into a modify of an
+		existing record */
+
+		if (modify == ROW_INS_NEXT) {
+			rec = page_rec_get_next(btr_cur_get_rec(&cursor));
+
+			btr_cur_position(index, rec,
+					 btr_cur_get_block(&cursor),&cursor);
+		}
+
+		if (dict_index_is_clust(index)) {
+			err = row_ins_clust_index_entry_by_modify(
+				mode, &cursor, &heap, &big_rec, entry,
+				thr, &mtr);
+		} else {
+			ut_ad(!n_ext);
+			err = row_ins_sec_index_entry_by_modify(
+				mode, &cursor, entry, thr, &mtr);
+		}
+	} else {
+		if (mode == BTR_MODIFY_LEAF) {
+			err = btr_cur_optimistic_insert(
+				0, &cursor, entry, &insert_rec, &big_rec,
+				n_ext, thr, &mtr);
+		} else {
+			ut_a(mode == BTR_MODIFY_TREE);
+			if (buf_LRU_buf_pool_running_out()) {
+
+				err = DB_LOCK_TABLE_FULL;
+
+				goto function_exit;
+			}
+			err = btr_cur_pessimistic_insert(
+				0, &cursor, entry, &insert_rec, &big_rec,
+				n_ext, thr, &mtr);
+		}
+	}
+
+function_exit:
+	mtr_commit(&mtr);
+
+	if (UNIV_LIKELY_NULL(big_rec)) {
+		rec_t*	rec;
+		ulint*	offsets;
+		mtr_start(&mtr);
+
+		btr_cur_search_to_nth_level(index, 0, entry, PAGE_CUR_LE,
+					    BTR_MODIFY_TREE, &cursor, 0,
+					    __FILE__, __LINE__, &mtr);
+		rec = btr_cur_get_rec(&cursor);
+		offsets = rec_get_offsets(rec, index, NULL,
+					  ULINT_UNDEFINED, &heap);
+
+		err = btr_store_big_rec_extern_fields(
+			index, btr_cur_get_block(&cursor),
+			rec, offsets, big_rec, &mtr);
+
+		if (modify) {
+			dtuple_big_rec_free(big_rec);
+		} else {
+			dtuple_convert_back_big_rec(index, entry, big_rec);
+		}
+
+		mtr_commit(&mtr);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/***************************************************************//**
+Inserts an index entry to index. Tries first optimistic, then pessimistic
+descent down the tree. If the entry matches enough to a delete marked record,
+performs the insert by updating or delete unmarking the delete marked
+record.
+@return	DB_SUCCESS, DB_LOCK_WAIT, DB_DUPLICATE_KEY, or some other error code */
+UNIV_INTERN
+ulint
+row_ins_index_entry(
+/*================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to insert */
+	ulint		n_ext,	/*!< in: number of externally stored columns */
+	ibool		foreign,/*!< in: TRUE=check foreign key constraints */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	err;
+
+	if (foreign && UT_LIST_GET_FIRST(index->table->foreign_list)) {
+		err = row_ins_check_foreign_constraints(index->table, index,
+							entry, thr);
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+	}
+
+	/* Try first optimistic descent to the B-tree */
+
+	err = row_ins_index_entry_low(BTR_MODIFY_LEAF, index, entry,
+				      n_ext, thr);
+	if (err != DB_FAIL) {
+
+		return(err);
+	}
+
+	/* Try then pessimistic descent to the B-tree */
+
+	err = row_ins_index_entry_low(BTR_MODIFY_TREE, index, entry,
+				      n_ext, thr);
+	return(err);
+}
+
+/***********************************************************//**
+Sets the values of the dtuple fields in entry from the values of appropriate
+columns in row. */
+static
+void
+row_ins_index_entry_set_vals(
+/*=========================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry to make */
+	const dtuple_t*	row)	/*!< in: row */
+{
+	ulint	n_fields;
+	ulint	i;
+
+	ut_ad(entry && row);
+
+	n_fields = dtuple_get_n_fields(entry);
+
+	for (i = 0; i < n_fields; i++) {
+		dict_field_t*	ind_field;
+		dfield_t*	field;
+		const dfield_t*	row_field;
+		ulint		len;
+
+		field = dtuple_get_nth_field(entry, i);
+		ind_field = dict_index_get_nth_field(index, i);
+		row_field = dtuple_get_nth_field(row, ind_field->col->ind);
+		len = dfield_get_len(row_field);
+
+		/* Check column prefix indexes */
+		if (ind_field->prefix_len > 0
+		    && dfield_get_len(row_field) != UNIV_SQL_NULL) {
+
+			const	dict_col_t*	col
+				= dict_field_get_col(ind_field);
+
+			len = dtype_get_at_most_n_mbchars(
+				col->prtype, col->mbminlen, col->mbmaxlen,
+				ind_field->prefix_len,
+				len, dfield_get_data(row_field));
+
+			ut_ad(!dfield_is_ext(row_field));
+		}
+
+		dfield_set_data(field, dfield_get_data(row_field), len);
+		if (dfield_is_ext(row_field)) {
+			ut_ad(dict_index_is_clust(index));
+			dfield_set_ext(field);
+		}
+	}
+}
+
+/***********************************************************//**
+Inserts a single index entry to the table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins_index_entry_step(
+/*=====================*/
+	ins_node_t*	node,	/*!< in: row insert node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	err;
+
+	ut_ad(dtuple_check_typed(node->row));
+
+	row_ins_index_entry_set_vals(node->index, node->entry, node->row);
+
+	ut_ad(dtuple_check_typed(node->entry));
+
+	err = row_ins_index_entry(node->index, node->entry, 0, TRUE, thr);
+
+	return(err);
+}
+
+/***********************************************************//**
+Allocates a row id for row and writes it to node->row_id_buf. */
+UNIV_INLINE
+void
+row_ins_alloc_row_id_step(
+/*======================*/
+	ins_node_t*	node)	/*!< in: row insert node */
+{
+	dulint	row_id;
+
+	ut_ad(node->state == INS_NODE_ALLOC_ROW_ID);
+
+	if (dict_index_is_unique(dict_table_get_first_index(node->table))) {
+
+		/* No row id is stored if the clustered index is unique */
+
+		return;
+	}
+
+	/* Fill in row id value to row */
+
+	row_id = dict_sys_get_new_row_id();
+
+	dict_sys_write_row_id(node->row_id_buf, row_id);
+}
+
+/***********************************************************//**
+Gets a row to insert from the values list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_values(
+/*========================*/
+	ins_node_t*	node)	/*!< in: row insert node */
+{
+	que_node_t*	list_node;
+	dfield_t*	dfield;
+	dtuple_t*	row;
+	ulint		i;
+
+	/* The field values are copied in the buffers of the select node and
+	it is safe to use them until we fetch from select again: therefore
+	we can just copy the pointers */
+
+	row = node->row;
+
+	i = 0;
+	list_node = node->values_list;
+
+	while (list_node) {
+		eval_exp(list_node);
+
+		dfield = dtuple_get_nth_field(row, i);
+		dfield_copy_data(dfield, que_node_get_val(list_node));
+
+		i++;
+		list_node = que_node_get_next(list_node);
+	}
+}
+
+/***********************************************************//**
+Gets a row to insert from the select list. */
+UNIV_INLINE
+void
+row_ins_get_row_from_select(
+/*========================*/
+	ins_node_t*	node)	/*!< in: row insert node */
+{
+	que_node_t*	list_node;
+	dfield_t*	dfield;
+	dtuple_t*	row;
+	ulint		i;
+
+	/* The field values are copied in the buffers of the select node and
+	it is safe to use them until we fetch from select again: therefore
+	we can just copy the pointers */
+
+	row = node->row;
+
+	i = 0;
+	list_node = node->select->select_list;
+
+	while (list_node) {
+		dfield = dtuple_get_nth_field(row, i);
+		dfield_copy_data(dfield, que_node_get_val(list_node));
+
+		i++;
+		list_node = que_node_get_next(list_node);
+	}
+}
+
+/***********************************************************//**
+Inserts a row to a table.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_ins(
+/*====*/
+	ins_node_t*	node,	/*!< in: row insert node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	err;
+
+	ut_ad(node && thr);
+
+	if (node->state == INS_NODE_ALLOC_ROW_ID) {
+
+		row_ins_alloc_row_id_step(node);
+
+		node->index = dict_table_get_first_index(node->table);
+		node->entry = UT_LIST_GET_FIRST(node->entry_list);
+
+		if (node->ins_type == INS_SEARCHED) {
+
+			row_ins_get_row_from_select(node);
+
+		} else if (node->ins_type == INS_VALUES) {
+
+			row_ins_get_row_from_values(node);
+		}
+
+		node->state = INS_NODE_INSERT_ENTRIES;
+	}
+
+	ut_ad(node->state == INS_NODE_INSERT_ENTRIES);
+
+	while (node->index != NULL) {
+		err = row_ins_index_entry_step(node, thr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->index = dict_table_get_next_index(node->index);
+		node->entry = UT_LIST_GET_NEXT(tuple_list, node->entry);
+	}
+
+	ut_ad(node->entry == NULL);
+
+	node->state = INS_NODE_ALLOC_ROW_ID;
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Inserts a row to a table. This is a high-level function used in SQL execution
+graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_ins_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ins_node_t*	node;
+	que_node_t*	parent;
+	sel_node_t*	sel_node;
+	trx_t*		trx;
+	ulint		err;
+
+	ut_ad(thr);
+
+	trx = thr_get_trx(thr);
+
+	trx_start_if_not_started(trx);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_INSERT);
+
+	parent = que_node_get_parent(node);
+	sel_node = node->select;
+
+	if (thr->prev_node == parent) {
+		node->state = INS_NODE_SET_IX_LOCK;
+	}
+
+	/* If this is the first time this node is executed (or when
+	execution resumes after wait for the table IX lock), set an
+	IX lock on the table and reset the possible select node. MySQL's
+	partitioned table code may also call an insert within the same
+	SQL statement AFTER it has used this table handle to do a search.
+	This happens, for example, when a row update moves it to another
+	partition. In that case, we have already set the IX lock on the
+	table during the search operation, and there is no need to set
+	it again here. But we must write trx->id to node->trx_id_buf. */
+
+	trx_write_trx_id(node->trx_id_buf, trx->id);
+
+	if (node->state == INS_NODE_SET_IX_LOCK) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		if (UT_DULINT_EQ(trx->id, node->trx_id)) {
+			/* No need to do IX-locking */
+
+			goto same_trx;
+		}
+
+		err = lock_table(0, node->table, LOCK_IX, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto error_handling;
+		}
+
+		node->trx_id = trx->id;
+same_trx:
+		node->state = INS_NODE_ALLOC_ROW_ID;
+
+		if (node->ins_type == INS_SEARCHED) {
+			/* Reset the cursor */
+			sel_node->state = SEL_NODE_OPEN;
+
+			/* Fetch a row to insert */
+
+			thr->run_node = sel_node;
+
+			return(thr);
+		}
+	}
+
+	if ((node->ins_type == INS_SEARCHED)
+	    && (sel_node->state != SEL_NODE_FETCH)) {
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to insert */
+		thr->run_node = parent;
+
+		return(thr);
+	}
+
+	/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+	err = row_ins(node, thr);
+
+error_handling:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* err == DB_LOCK_WAIT or SQL error detected */
+		return(NULL);
+	}
+
+	/* DO THE TRIGGER ACTIONS HERE */
+
+	if (node->ins_type == INS_SEARCHED) {
+		/* Fetch a row to insert */
+
+		thr->run_node = sel_node;
+	} else {
+		thr->run_node = que_node_get_parent(node);
+	}
+
+	return(thr);
+}
diff --git a/storage/xtradb/row/row0merge.c b/storage/xtradb/row/row0merge.c
new file mode 100644
index 00000000000..65102851bdf
--- /dev/null
+++ b/storage/xtradb/row/row0merge.c
@@ -0,0 +1,2644 @@
+/*****************************************************************************
+
+Copyright (c) 2005, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0merge.c
+New index creation routines using a merge sort
+
+Created 12/4/2005 Jan Lindstrom
+Completed by Sunny Bains and Marko Makela
+*******************************************************/
+
+#include "row0merge.h"
+#include "row0ext.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "dict0dict.h"
+#include "dict0mem.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "mem0mem.h"
+#include "log0log.h"
+#include "ut0sort.h"
+#include "handler0alter.h"
+
+#ifdef UNIV_DEBUG
+/** Set these in order to enable debug printout. */
+/* @{ */
+/** Log the outcome of each row_merge_cmp() call, comparing records. */
+static ibool	row_merge_print_cmp;
+/** Log each record read from temporary file. */
+static ibool	row_merge_print_read;
+/** Log each record write to temporary file. */
+static ibool	row_merge_print_write;
+/** Log each row_merge_blocks() call, merging two blocks of records to
+a bigger one. */
+static ibool	row_merge_print_block;
+/** Log each block read from temporary file. */
+static ibool	row_merge_print_block_read;
+/** Log each block written to temporary file. */
+static ibool	row_merge_print_block_write;
+/* @} */
+#endif /* UNIV_DEBUG */
+
+/** @brief Block size for I/O operations in merge sort.
+
+The minimum is UNIV_PAGE_SIZE, or page_get_free_space_of_empty()
+rounded to a power of 2.
+
+When not creating a PRIMARY KEY that contains column prefixes, this
+can be set as small as UNIV_PAGE_SIZE / 2.  See the comment above
+ut_ad(data_size < sizeof(row_merge_block_t)). */
+typedef byte	row_merge_block_t[1048576];
+
+/** @brief Secondary buffer for I/O operations of merge records.
+
+This buffer is used for writing or reading a record that spans two
+row_merge_block_t.  Thus, it must be able to hold one merge record,
+whose maximum size is the same as the minimum size of
+row_merge_block_t. */
+typedef byte	mrec_buf_t[UNIV_PAGE_SIZE_MAX];
+
+/** @brief Merge record in row_merge_block_t.
+
+The format is the same as a record in ROW_FORMAT=COMPACT with the
+exception that the REC_N_NEW_EXTRA_BYTES are omitted. */
+typedef byte	mrec_t;
+
+/** Buffer for sorting in main memory. */
+struct row_merge_buf_struct {
+	mem_heap_t*	heap;		/*!< memory heap where allocated */
+	dict_index_t*	index;		/*!< the index the tuples belong to */
+	ulint		total_size;	/*!< total amount of data bytes */
+	ulint		n_tuples;	/*!< number of data tuples */
+	ulint		max_tuples;	/*!< maximum number of data tuples */
+	const dfield_t**tuples;		/*!< array of pointers to
+					arrays of fields that form
+					the data tuples */
+	const dfield_t**tmp_tuples;	/*!< temporary copy of tuples,
+					for sorting */
+};
+
+/** Buffer for sorting in main memory. */
+typedef struct row_merge_buf_struct row_merge_buf_t;
+
+/** Information about temporary files used in merge sort */
+struct merge_file_struct {
+	int		fd;		/*!< file descriptor */
+	ulint		offset;		/*!< file offset (end of file) */
+	ib_uint64_t	n_rec;		/*!< number of records in the file */
+};
+
+/** Information about temporary files used in merge sort */
+typedef struct merge_file_struct merge_file_t;
+
+#ifdef UNIV_DEBUG
+/******************************************************//**
+Display a merge tuple. */
+static
+void
+row_merge_tuple_print(
+/*==================*/
+	FILE*		f,	/*!< in: output stream */
+	const dfield_t*	entry,	/*!< in: tuple to print */
+	ulint		n_fields)/*!< in: number of fields in the tuple */
+{
+	ulint	j;
+
+	for (j = 0; j < n_fields; j++) {
+		const dfield_t*	field = &entry[j];
+
+		if (dfield_is_null(field)) {
+			fputs("\n NULL;", f);
+		} else {
+			ulint	field_len	= dfield_get_len(field);
+			ulint	len		= ut_min(field_len, 20);
+			if (dfield_is_ext(field)) {
+				fputs("\nE", f);
+			} else {
+				fputs("\n ", f);
+			}
+			ut_print_buf(f, dfield_get_data(field), len);
+			if (len != field_len) {
+				fprintf(f, " (total %lu bytes)", field_len);
+			}
+		}
+	}
+	putc('\n', f);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************//**
+Allocate a sort buffer.
+@return	own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create_low(
+/*=====================*/
+	mem_heap_t*	heap,		/*!< in: heap where allocated */
+	dict_index_t*	index,		/*!< in: secondary index */
+	ulint		max_tuples,	/*!< in: maximum number of data tuples */
+	ulint		buf_size)	/*!< in: size of the buffer, in bytes */
+{
+	row_merge_buf_t*	buf;
+
+	ut_ad(max_tuples > 0);
+	ut_ad(max_tuples <= sizeof(row_merge_block_t));
+	ut_ad(max_tuples < buf_size);
+
+	buf = mem_heap_zalloc(heap, buf_size);
+	buf->heap = heap;
+	buf->index = index;
+	buf->max_tuples = max_tuples;
+	buf->tuples = mem_heap_alloc(heap,
+				     2 * max_tuples * sizeof *buf->tuples);
+	buf->tmp_tuples = buf->tuples + max_tuples;
+
+	return(buf);
+}
+
+/******************************************************//**
+Allocate a sort buffer.
+@return	own: sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_create(
+/*=================*/
+	dict_index_t*	index)	/*!< in: secondary index */
+{
+	row_merge_buf_t*	buf;
+	ulint			max_tuples;
+	ulint			buf_size;
+	mem_heap_t*		heap;
+
+	max_tuples = sizeof(row_merge_block_t)
+		/ ut_max(1, dict_index_get_min_size(index));
+
+	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+	heap = mem_heap_create(buf_size + sizeof(row_merge_block_t));
+
+	buf = row_merge_buf_create_low(heap, index, max_tuples, buf_size);
+
+	return(buf);
+}
+
+/******************************************************//**
+Empty a sort buffer.
+@return	sort buffer */
+static
+row_merge_buf_t*
+row_merge_buf_empty(
+/*================*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer */
+{
+	ulint		buf_size;
+	ulint		max_tuples	= buf->max_tuples;
+	mem_heap_t*	heap		= buf->heap;
+	dict_index_t*	index		= buf->index;
+
+	buf_size = (sizeof *buf) + (max_tuples - 1) * sizeof *buf->tuples;
+
+	mem_heap_empty(heap);
+
+	return(row_merge_buf_create_low(heap, index, max_tuples, buf_size));
+}
+
+/******************************************************//**
+Deallocate a sort buffer. */
+static
+void
+row_merge_buf_free(
+/*===============*/
+	row_merge_buf_t*	buf)	/*!< in,own: sort buffer, to be freed */
+{
+	mem_heap_free(buf->heap);
+}
+
+/******************************************************//**
+Insert a data tuple into a sort buffer.
+@return	TRUE if added, FALSE if out of space */
+static
+ibool
+row_merge_buf_add(
+/*==============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	const dtuple_t*		row,	/*!< in: row in clustered index */
+	const row_ext_t*	ext)	/*!< in: cache of externally stored
+					column prefixes, or NULL */
+{
+	ulint			i;
+	ulint			n_fields;
+	ulint			data_size;
+	ulint			extra_size;
+	const dict_index_t*	index;
+	dfield_t*		entry;
+	dfield_t*		field;
+
+	if (buf->n_tuples >= buf->max_tuples) {
+		return(FALSE);
+	}
+
+	UNIV_PREFETCH_R(row->fields);
+
+	index = buf->index;
+
+	n_fields = dict_index_get_n_fields(index);
+
+	entry = mem_heap_alloc(buf->heap, n_fields * sizeof *entry);
+	buf->tuples[buf->n_tuples] = entry;
+	field = entry;
+
+	data_size = 0;
+	extra_size = UT_BITS_IN_BYTES(index->n_nullable);
+
+	for (i = 0; i < n_fields; i++, field++) {
+		const dict_field_t*	ifield;
+		const dict_col_t*	col;
+		ulint			col_no;
+		const dfield_t*		row_field;
+		ulint			len;
+
+		ifield = dict_index_get_nth_field(index, i);
+		col = ifield->col;
+		col_no = dict_col_get_no(col);
+		row_field = dtuple_get_nth_field(row, col_no);
+		dfield_copy(field, row_field);
+		len = dfield_get_len(field);
+
+		if (dfield_is_null(field)) {
+			ut_ad(!(col->prtype & DATA_NOT_NULL));
+			continue;
+		} else if (UNIV_LIKELY(!ext)) {
+		} else if (dict_index_is_clust(index)) {
+			/* Flag externally stored fields. */
+			const byte*	buf = row_ext_lookup(ext, col_no,
+							     &len);
+			if (UNIV_LIKELY_NULL(buf)) {
+				ut_a(buf != field_ref_zero);
+				if (i < dict_index_get_n_unique(index)) {
+					dfield_set_data(field, buf, len);
+				} else {
+					dfield_set_ext(field);
+					len = dfield_get_len(field);
+				}
+			}
+		} else {
+			const byte*	buf = row_ext_lookup(ext, col_no,
+							     &len);
+			if (UNIV_LIKELY_NULL(buf)) {
+				ut_a(buf != field_ref_zero);
+				dfield_set_data(field, buf, len);
+			}
+		}
+
+		/* If a column prefix index, take only the prefix */
+
+		if (ifield->prefix_len) {
+			len = dtype_get_at_most_n_mbchars(
+				col->prtype,
+				col->mbminlen, col->mbmaxlen,
+				ifield->prefix_len,
+				len, dfield_get_data(field));
+			dfield_set_len(field, len);
+		}
+
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+
+		if (ifield->fixed_len) {
+			ut_ad(len == ifield->fixed_len);
+			ut_ad(!dfield_is_ext(field));
+		} else if (dfield_is_ext(field)) {
+			extra_size += 2;
+		} else if (len < 128
+			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			extra_size++;
+		} else {
+			/* For variable-length columns, we look up the
+			maximum length from the column itself.  If this
+			is a prefix index column shorter than 256 bytes,
+			this will waste one byte. */
+			extra_size += 2;
+		}
+		data_size += len;
+	}
+
+#ifdef UNIV_DEBUG
+	{
+		ulint	size;
+		ulint	extra;
+
+		size = rec_get_converted_size_comp(index,
+						   REC_STATUS_ORDINARY,
+						   entry, n_fields, &extra);
+
+		ut_ad(data_size + extra_size + REC_N_NEW_EXTRA_BYTES == size);
+		ut_ad(extra_size + REC_N_NEW_EXTRA_BYTES == extra);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* Add to the total size of the record in row_merge_block_t
+	the encoded length of extra_size and the extra bytes (extra_size).
+	See row_merge_buf_write() for the variable-length encoding
+	of extra_size. */
+	data_size += (extra_size + 1) + ((extra_size + 1) >= 0x80);
+
+	/* The following assertion may fail if row_merge_block_t is
+	declared very small and a PRIMARY KEY is being created with
+	many prefix columns.  In that case, the record may exceed the
+	page_zip_rec_needs_ext() limit.  However, no further columns
+	will be moved to external storage until the record is inserted
+	to the clustered index B-tree. */
+	ut_ad(data_size < sizeof(row_merge_block_t));
+
+	/* Reserve one byte for the end marker of row_merge_block_t. */
+	if (buf->total_size + data_size >= sizeof(row_merge_block_t) - 1) {
+		return(FALSE);
+	}
+
+	buf->total_size += data_size;
+	buf->n_tuples++;
+
+	field = entry;
+
+	/* Copy the data fields. */
+
+	do {
+		dfield_dup(field++, buf->heap);
+	} while (--n_fields);
+
+	return(TRUE);
+}
+
+/** Structure for reporting duplicate records. */
+struct row_merge_dup_struct {
+	const dict_index_t*	index;		/*!< index being sorted */
+	TABLE*			table;		/*!< MySQL table object */
+	ulint			n_dup;		/*!< number of duplicates */
+};
+
+/** Structure for reporting duplicate records. */
+typedef struct row_merge_dup_struct row_merge_dup_t;
+
+/*************************************************************//**
+Report a duplicate key. */
+static
+void
+row_merge_dup_report(
+/*=================*/
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t*		entry)	/*!< in: duplicate index entry */
+{
+	mrec_buf_t* 		buf;
+	const dtuple_t*		tuple;
+	dtuple_t		tuple_store;
+	const rec_t*		rec;
+	const dict_index_t*	index	= dup->index;
+	ulint			n_fields= dict_index_get_n_fields(index);
+	mem_heap_t*		heap;
+	ulint*			offsets;
+	ulint			n_ext;
+
+	if (dup->n_dup++) {
+		/* Only report the first duplicate record,
+		but count all duplicate records. */
+		return;
+	}
+
+	/* Convert the tuple to a record and then to MySQL format. */
+	heap = mem_heap_create((1 + REC_OFFS_HEADER_SIZE + n_fields)
+			       * sizeof *offsets
+			       + sizeof *buf);
+
+	buf = mem_heap_alloc(heap, sizeof *buf);
+
+	tuple = dtuple_from_fields(&tuple_store, entry, n_fields);
+	n_ext = dict_index_is_clust(index) ? dtuple_get_n_ext(tuple) : 0;
+
+	rec = rec_convert_dtuple_to_rec(*buf, index, tuple, n_ext);
+	offsets = rec_get_offsets(rec, index, NULL, ULINT_UNDEFINED, &heap);
+
+	innobase_rec_to_mysql(dup->table, rec, index, offsets);
+
+	mem_heap_free(heap);
+}
+
+/*************************************************************//**
+Compare two tuples.
+@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
+static
+int
+row_merge_tuple_cmp(
+/*================*/
+	ulint			n_field,/*!< in: number of fields */
+	const dfield_t*		a,	/*!< in: first tuple to be compared */
+	const dfield_t*		b,	/*!< in: second tuple to be compared */
+	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
+{
+	int		cmp;
+	const dfield_t*	field	= a;
+
+	/* Compare the fields of the tuples until a difference is
+	found or we run out of fields to compare.  If !cmp at the
+	end, the tuples are equal. */
+	do {
+		cmp = cmp_dfield_dfield(a++, b++);
+	} while (!cmp && --n_field);
+
+	if (UNIV_UNLIKELY(!cmp) && UNIV_LIKELY_NULL(dup)) {
+		/* Report a duplicate value error if the tuples are
+		logically equal.  NULL columns are logically unequal,
+		although they are equal in the sorting order.  Find
+		out if any of the fields are NULL. */
+		for (b = field; b != a; b++) {
+			if (dfield_is_null(b)) {
+
+				goto func_exit;
+			}
+		}
+
+		row_merge_dup_report(dup, field);
+	}
+
+func_exit:
+	return(cmp);
+}
+
+/** Wrapper for row_merge_tuple_sort() to inject some more context to
+UT_SORT_FUNCTION_BODY().  The expansion uses the identifiers n_field and
+dup of the scope in which the macro is expanded.
+@param a	array of tuples that are being sorted
+@param b	aux (work area), same size as tuples[]
+@param c	lower bound of the sorting area, inclusive
+@param d	upper bound of the sorting area, exclusive */
+#define row_merge_tuple_sort_ctx(a,b,c,d) \
+	row_merge_tuple_sort(n_field, dup, a, b, c, d)
+/** Wrapper for row_merge_tuple_cmp() to inject some more context to
+UT_SORT_FUNCTION_BODY().  The expansion uses the identifiers n_field and
+dup of the scope in which the macro is expanded.
+@param a	first tuple to be compared
+@param b	second tuple to be compared
+@return	1, 0, -1 if a is greater, equal, less, respectively, than b */
+#define row_merge_tuple_cmp_ctx(a,b) row_merge_tuple_cmp(n_field, a, b, dup)
+
+/**********************************************************************//**
+Merge sort the tuple buffer in main memory.
+The function body is generated by UT_SORT_FUNCTION_BODY(), which is given
+row_merge_tuple_sort_ctx as the sort function and row_merge_tuple_cmp_ctx
+as the comparison function; both wrappers pass n_field and dup along. */
+static
+void
+row_merge_tuple_sort(
+/*=================*/
+	ulint			n_field,/*!< in: number of fields */
+	row_merge_dup_t*	dup,	/*!< in/out: for reporting duplicates */
+	const dfield_t**	tuples,	/*!< in/out: tuples */
+	const dfield_t**	aux,	/*!< in/out: work area */
+	ulint			low,	/*!< in: lower bound of the
+					sorting area, inclusive */
+	ulint			high)	/*!< in: upper bound of the
+					sorting area, exclusive */
+{
+	UT_SORT_FUNCTION_BODY(row_merge_tuple_sort_ctx,
+			      tuples, aux, low, high, row_merge_tuple_cmp_ctx);
+}
+
+/******************************************************//**
+Sort the tuples of a merge buffer in main memory.
+The number of fields to compare is dict_index_get_n_unique() of the
+index of the buffer, and the whole range [0, buf->n_tuples) is sorted
+using buf->tmp_tuples as the work area. */
+static
+void
+row_merge_buf_sort(
+/*===============*/
+	row_merge_buf_t*	buf,	/*!< in/out: sort buffer */
+	row_merge_dup_t*	dup)	/*!< in/out: for reporting duplicates */
+{
+	const ulint	n_cmp_fields = dict_index_get_n_unique(buf->index);
+
+	row_merge_tuple_sort(n_cmp_fields, dup,
+			     buf->tuples, buf->tmp_tuples,
+			     0, buf->n_tuples);
+}
+
+/******************************************************//**
+Write a buffer to a block.
+Each tuple of the buffer is stored as the value extra_size + 1, encoded
+in one byte when it is below 0x80 and in two bytes (first byte with the
+0x80 bit set) otherwise, followed by the record that is produced by
+rec_convert_dtuple_to_rec_comp().  A zero byte is written after the last
+tuple as the end-of-chunk marker.  In non-debug builds the parameter "of"
+is dropped by the row_merge_buf_write() macro that is defined between
+the parameter list and the function body. */
+static
+void
+row_merge_buf_write(
+/*================*/
+	const row_merge_buf_t*	buf,	/*!< in: sorted buffer */
+#ifdef UNIV_DEBUG
+	const merge_file_t*	of,	/*!< in: output file */
+#endif /* UNIV_DEBUG */
+	row_merge_block_t*	block)	/*!< out: buffer for writing to file */
+#ifndef UNIV_DEBUG
+# define row_merge_buf_write(buf, of, block) row_merge_buf_write(buf, block)
+#endif /* !UNIV_DEBUG */
+{
+	const dict_index_t*	index	= buf->index;
+	ulint			n_fields= dict_index_get_n_fields(index);
+	byte*			b	= &(*block)[0];
+
+	ulint		i;
+
+	for (i = 0; i < buf->n_tuples; i++) {
+		ulint		size;
+		ulint		extra_size;
+		const dfield_t*	entry		= buf->tuples[i];
+
+		size = rec_get_converted_size_comp(index,
+						   REC_STATUS_ORDINARY,
+						   entry, n_fields,
+						   &extra_size);
+		ut_ad(size > extra_size);
+		ut_ad(extra_size >= REC_N_NEW_EXTRA_BYTES);
+		extra_size -= REC_N_NEW_EXTRA_BYTES;
+		size -= REC_N_NEW_EXTRA_BYTES;
+
+		/* Encode extra_size + 1 */
+		if (extra_size + 1 < 0x80) {
+			*b++ = (byte) (extra_size + 1);
+		} else {
+			ut_ad((extra_size + 1) < 0x8000);
+			*b++ = (byte) (0x80 | ((extra_size + 1) >> 8));
+			*b++ = (byte) (extra_size + 1);
+		}
+
+		ut_ad(b + size < block[1]);
+
+		rec_convert_dtuple_to_rec_comp(b + extra_size, 0, index,
+					       REC_STATUS_ORDINARY,
+					       entry, n_fields);
+
+		b += size;
+
+#ifdef UNIV_DEBUG
+		if (row_merge_print_write) {
+			fprintf(stderr, "row_merge_buf_write %p,%d,%lu %lu",
+				(void*) b, of->fd, (ulong) of->offset,
+				(ulong) i);
+			row_merge_tuple_print(stderr, entry, n_fields);
+		}
+#endif /* UNIV_DEBUG */
+	}
+
+	/* Write an "end-of-chunk" marker. */
+	ut_a(b < block[1]);
+	ut_a(b == block[0] + buf->total_size);
+	*b++ = 0;
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized.  Initialize it
+	to avoid bogus warnings. */
+	memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_buf_write %p,%d,%lu EOF\n",
+			(void*) b, of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+}
+
+/******************************************************//**
+Create a memory heap and allocate from it three mrec_buf_t buffers and
+two offsets arrays of 1 + REC_OFFS_HEADER_SIZE + n_fields elements each,
+where n_fields is the number of fields of the index.  Element 0 of each
+offsets array is set to the array length and element 1 to n_fields.
+@return	memory heap */
+static
+mem_heap_t*
+row_merge_heap_create(
+/*==================*/
+	const dict_index_t*	index,		/*!< in: record descriptor */
+	mrec_buf_t**		buf,		/*!< out: 3 buffers */
+	ulint**			offsets1,	/*!< out: offsets */
+	ulint**			offsets2)	/*!< out: offsets */
+{
+	const ulint	n_fields = dict_index_get_n_fields(index);
+	const ulint	n_offs	 = 1 + REC_OFFS_HEADER_SIZE + n_fields;
+	mem_heap_t*	heap;
+
+	heap = mem_heap_create(2 * n_offs * sizeof **offsets1
+			       + 3 * sizeof **buf);
+
+	*buf = mem_heap_alloc(heap, 3 * sizeof **buf);
+	*offsets1 = mem_heap_alloc(heap, n_offs * sizeof **offsets1);
+	*offsets2 = mem_heap_alloc(heap, n_offs * sizeof **offsets2);
+
+	/* Record the array length and the number of fields in the
+	first two elements of both arrays. */
+	(*offsets1)[0] = n_offs;
+	(*offsets2)[0] = n_offs;
+	(*offsets1)[1] = n_fields;
+	(*offsets2)[1] = n_fields;
+
+	return(heap);
+}
+
+/**********************************************************************//**
+Look up an index of a table by the name and the column names of an
+index definition.  The lookup itself is done by
+dict_table_get_index_by_max_id(); if several indexes match, the index
+with the max id is returned.
+@return	matching index, NULL if not found */
+static
+dict_index_t*
+row_merge_dict_table_get_index(
+/*===========================*/
+	dict_table_t*		table,		/*!< in: table */
+	const merge_index_def_t*index_def)	/*!< in: index definition */
+{
+	const ulint	n_fields = index_def->n_fields;
+	const char**	names;
+	dict_index_t*	found;
+	ulint		n;
+
+	/* Collect the column names of the definition into a
+	temporary array for the lookup. */
+	names = mem_alloc(n_fields * sizeof *names);
+
+	for (n = n_fields; n--; ) {
+		names[n] = index_def->fields[n].field_name;
+	}
+
+	found = dict_table_get_index_by_max_id(
+		table, index_def->name, names, n_fields);
+
+	mem_free((void*) names);
+
+	return(found);
+}
+
+/********************************************************************//**
+Read one merge block from a file.
+The byte position in the file is offset * sizeof(row_merge_block_t); it
+is passed to the file layer as a low and a high 32-bit word.  A message
+is printed to stderr when the read fails.
+@return	TRUE if request was successful, FALSE if fail */
+static
+ibool
+row_merge_read(
+/*===========*/
+	int			fd,	/*!< in: file descriptor */
+	ulint			offset,	/*!< in: offset where to read */
+	row_merge_block_t*	buf)	/*!< out: data */
+{
+	const ib_uint64_t	ofs	 = ((ib_uint64_t) offset)
+		* sizeof *buf;
+	const ulint		ofs_low	 = (ulint) (ofs & 0xFFFFFFFF);
+	const ulint		ofs_high = (ulint) (ofs >> 32);
+	ibool			success;
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_read) {
+		fprintf(stderr, "row_merge_read fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf,
+						 ofs_low, ofs_high,
+						 sizeof *buf);
+
+	if (UNIV_UNLIKELY(!success)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: failed to read merge block at %llu\n", ofs);
+	}
+
+	return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Write one merge block to a file.
+The byte position in the file is offset * sizeof(row_merge_block_t); it
+is passed to the file layer as a low and a high 32-bit word.
+@return	TRUE if request was successful, FALSE if fail */
+static
+ibool
+row_merge_write(
+/*============*/
+	int		fd,	/*!< in: file descriptor */
+	ulint		offset,	/*!< in: offset where to write
+				in number of row_merge_block_t
+				elements */
+	const void*	buf)	/*!< in: data */
+{
+	const ib_uint64_t	ofs	 = ((ib_uint64_t) offset)
+		* sizeof(row_merge_block_t);
+	const ulint		ofs_low	 = (ulint) (ofs & 0xFFFFFFFF);
+	const ulint		ofs_high = (ulint) (ofs >> 32);
+	ibool			success;
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block_write) {
+		fprintf(stderr, "row_merge_write fd=%d ofs=%lu\n",
+			fd, (ulong) offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	success = os_file_write("(merge)", OS_FILE_FROM_FD(fd), buf,
+				ofs_low, ofs_high,
+				sizeof(row_merge_block_t));
+
+	return(UNIV_LIKELY(success));
+}
+
+/********************************************************************//**
+Read a merge record.
+A record starts with the value extra_size + 1, stored in one byte, or in
+two bytes when the first byte has the 0x80 bit set; a first byte of 0
+marks the end of the list.  When the length prefix or the record
+continues past the end of block, the next block is read from the file,
+*foffs is incremented, and the record is assembled in buf.
+End of list and I/O error both return NULL; they are told apart by
+*mrec, which is NULL at the end of the list and non-NULL on I/O error.
+@return	pointer to next record, or NULL on I/O error or end of list */
+static __attribute__((nonnull))
+const byte*
+row_merge_read_rec(
+/*===============*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	const byte*		b,	/*!< in: pointer to record */
+	const dict_index_t*	index,	/*!< in: index of the record */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t**		mrec,	/*!< out: pointer to merge record,
+					or NULL on end of list
+					(non-NULL on I/O error) */
+	ulint*			offsets)/*!< out: offsets of mrec */
+{
+	ulint	extra_size;
+	ulint	data_size;
+	ulint	avail_size;
+
+	ut_ad(block);
+	ut_ad(buf);
+	ut_ad(b >= block[0]);
+	ut_ad(b < block[1]);
+	ut_ad(index);
+	ut_ad(foffs);
+	ut_ad(mrec);
+	ut_ad(offsets);
+
+	ut_ad(*offsets == 1 + REC_OFFS_HEADER_SIZE
+	      + dict_index_get_n_fields(index));
+
+	extra_size = *b++;
+
+	if (UNIV_UNLIKELY(!extra_size)) {
+		/* End of list */
+		*mrec = NULL;
+#ifdef UNIV_DEBUG
+		if (row_merge_print_read) {
+			fprintf(stderr, "row_merge_read %p,%p,%d,%lu EOF\n",
+				(const void*) b, (const void*) block,
+				fd, (ulong) *foffs);
+		}
+#endif /* UNIV_DEBUG */
+		return(NULL);
+	}
+
+	if (extra_size >= 0x80) {
+		/* Read another byte of extra_size. */
+
+		if (UNIV_UNLIKELY(b >= block[1])) {
+			if (!row_merge_read(fd, ++(*foffs), block)) {
+err_exit:
+				/* Signal I/O error. */
+				*mrec = b;
+				return(NULL);
+			}
+
+			/* Wrap around to the beginning of the buffer. */
+			b = block[0];
+		}
+
+		extra_size = (extra_size & 0x7f) << 8;
+		extra_size |= *b++;
+	}
+
+	/* Normalize extra_size.  Above, value 0 signals "end of list". */
+	extra_size--;
+
+	/* Read the extra bytes. */
+
+	if (UNIV_UNLIKELY(b + extra_size >= block[1])) {
+		/* The record spans two blocks.  Copy the entire record
+		to the auxiliary buffer and handle this as a special
+		case. */
+
+		avail_size = block[1] - b;
+
+		memcpy(*buf, b, avail_size);
+
+		if (!row_merge_read(fd, ++(*foffs), block)) {
+
+			goto err_exit;
+		}
+
+		/* Wrap around to the beginning of the buffer. */
+		b = block[0];
+
+		/* Copy the record. */
+		memcpy(*buf + avail_size, b, extra_size - avail_size);
+		b += extra_size - avail_size;
+
+		*mrec = *buf + extra_size;
+
+		rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+		data_size = rec_offs_data_size(offsets);
+
+		/* These overflows should be impossible given that
+		records are much smaller than either buffer, and
+		the record starts near the beginning of each buffer. */
+		ut_a(extra_size + data_size < sizeof *buf);
+		ut_a(b + data_size < block[1]);
+
+		/* Copy the data bytes. */
+		memcpy(*buf + extra_size, b, data_size);
+		b += data_size;
+
+		goto func_exit;
+	}
+
+	*mrec = b + extra_size;
+
+	rec_init_offsets_comp_ordinary(*mrec, 0, index, offsets);
+
+	data_size = rec_offs_data_size(offsets);
+	ut_ad(extra_size + data_size < sizeof *buf);
+
+	b += extra_size + data_size;
+
+	if (UNIV_LIKELY(b < block[1])) {
+		/* The record fits entirely in the block.
+		This is the normal case. */
+		goto func_exit;
+	}
+
+	/* The record spans two blocks.  Copy it to buf. */
+
+	b -= extra_size + data_size;
+	avail_size = block[1] - b;
+	memcpy(*buf, b, avail_size);
+	*mrec = *buf + extra_size;
+#ifdef UNIV_DEBUG
+	/* We cannot invoke rec_offs_make_valid() here, because there
+	are no REC_N_NEW_EXTRA_BYTES between extra_size and data_size.
+	Similarly, rec_offs_validate() would fail, because it invokes
+	rec_get_status(). */
+	offsets[2] = (ulint) *mrec;
+	offsets[3] = (ulint) index;
+#endif /* UNIV_DEBUG */
+
+	if (!row_merge_read(fd, ++(*foffs), block)) {
+
+		goto err_exit;
+	}
+
+	/* Wrap around to the beginning of the buffer. */
+	b = block[0];
+
+	/* Copy the rest of the record. */
+	memcpy(*buf + avail_size, b, extra_size + data_size - avail_size);
+	b += extra_size + data_size - avail_size;
+
+func_exit:
+#ifdef UNIV_DEBUG
+	if (row_merge_print_read) {
+		fprintf(stderr, "row_merge_read %p,%p,%d,%lu ",
+			(const void*) b, (const void*) block,
+			fd, (ulong) *foffs);
+		rec_print_comp(stderr, *mrec, offsets);
+		putc('\n', stderr);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(b);
+}
+
+/********************************************************************//**
+Write a merge record.
+The encoded extra_size e is written in one byte when e < 0x80, otherwise
+in two bytes with the 0x80 bit set in the first byte.  It is followed by
+rec_offs_size(offsets) bytes copied from
+mrec - rec_offs_extra_size(offsets).  In non-debug builds the parameters
+size, fd and foffs are dropped by the row_merge_write_rec_low() macro
+that is defined between the parameter list and the function body. */
+static
+void
+row_merge_write_rec_low(
+/*====================*/
+	byte*		b,	/*!< out: buffer */
+	ulint		e,	/*!< in: encoded extra_size */
+#ifdef UNIV_DEBUG
+	ulint		size,	/*!< in: total size to write */
+	int		fd,	/*!< in: file descriptor */
+	ulint		foffs,	/*!< in: file offset */
+#endif /* UNIV_DEBUG */
+	const mrec_t*	mrec,	/*!< in: record to write */
+	const ulint*	offsets)/*!< in: offsets of mrec */
+#ifndef UNIV_DEBUG
+# define row_merge_write_rec_low(b, e, size, fd, foffs, mrec, offsets)	\
+	row_merge_write_rec_low(b, e, mrec, offsets)
+#endif /* !UNIV_DEBUG */
+{
+#ifdef UNIV_DEBUG
+	const byte* const end = b + size;
+	ut_ad(e == rec_offs_extra_size(offsets) + 1);
+
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_write %p,%d,%lu ",
+			(void*) b, fd, (ulong) foffs);
+		rec_print_comp(stderr, mrec, offsets);
+		putc('\n', stderr);
+	}
+#endif /* UNIV_DEBUG */
+
+	if (e < 0x80) {
+		*b++ = (byte) e;
+	} else {
+		*b++ = (byte) (0x80 | (e >> 8));
+		*b++ = (byte) e;
+	}
+
+	memcpy(b, mrec - rec_offs_extra_size(offsets), rec_offs_size(offsets));
+	ut_ad(b + rec_offs_size(offsets) == end);
+}
+
+/********************************************************************//**
+Append a merge record to the output block.
+When the record does not fit in the remainder of the block, it is first
+formatted in buf[0]; the head of it completes the current block, which
+is written to the file (incrementing *foffs), and the tail is copied to
+the beginning of the emptied block.
+@return	pointer to end of block, or NULL on error */
+static
+byte*
+row_merge_write_rec(
+/*================*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	mrec_buf_t*		buf,	/*!< in/out: secondary buffer */
+	byte*			b,	/*!< in: pointer to end of block */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs,	/*!< in/out: file offset */
+	const mrec_t*		mrec,	/*!< in: record to write */
+	const ulint*		offsets)/*!< in: offsets of mrec */
+{
+	ulint	enc_extra;	/* extra_size + 1, as stored in the file */
+	ulint	rec_len;	/* length prefix + extra + data bytes */
+	ulint	head_len;	/* bytes that still fit in the block */
+
+	ut_ad(block);
+	ut_ad(buf);
+	ut_ad(b >= block[0]);
+	ut_ad(b < block[1]);
+	ut_ad(mrec);
+	ut_ad(foffs);
+	ut_ad(mrec < block[0] || mrec > block[1]);
+	ut_ad(mrec < buf[0] || mrec > buf[1]);
+
+	/* Normalize extra_size.  Value 0 signals "end of list". */
+	enc_extra = rec_offs_extra_size(offsets) + 1;
+
+	/* The length prefix takes a second byte from 0x80 onwards. */
+	rec_len = enc_extra + (enc_extra >= 0x80)
+		+ rec_offs_data_size(offsets);
+
+	if (UNIV_LIKELY(b + rec_len < block[1])) {
+		/* The record fits in the block: write it in place. */
+		row_merge_write_rec_low(b, enc_extra, rec_len, fd, *foffs,
+					mrec, offsets);
+
+		return(b + rec_len);
+	}
+
+	/* The record spans two blocks.
+	Format it in the temporary buffer first. */
+	head_len = block[1] - b;
+
+	row_merge_write_rec_low(buf[0],
+				enc_extra, rec_len, fd, *foffs,
+				mrec, offsets);
+
+	/* Complete the current block with the head of the record
+	and write it out. */
+	memcpy(b, buf[0], head_len);
+
+	if (!row_merge_write(fd, (*foffs)++, block)) {
+		return(NULL);
+	}
+
+	UNIV_MEM_INVALID(block[0], sizeof block[0]);
+
+	/* Start the new block with the tail of the record. */
+	memcpy(block[0], buf[0] + head_len, rec_len - head_len);
+
+	return(block[0] + (rec_len - head_len));
+}
+
+/********************************************************************//**
+Terminate the output block with an end-of-list marker (a zero byte) and
+write the block to the file, incrementing *foffs.
+@return	pointer to the beginning of block on success, or NULL on error */
+static
+byte*
+row_merge_write_eof(
+/*================*/
+	row_merge_block_t*	block,	/*!< in/out: file buffer */
+	byte*			b,	/*!< in: pointer to end of block */
+	int			fd,	/*!< in: file descriptor */
+	ulint*			foffs)	/*!< in/out: file offset */
+{
+	ut_ad(block);
+	ut_ad(b >= block[0]);
+	ut_ad(b < block[1]);
+	ut_ad(foffs);
+#ifdef UNIV_DEBUG
+	if (row_merge_print_write) {
+		fprintf(stderr, "row_merge_write %p,%p,%d,%lu EOF\n",
+			(void*) b, (void*) block, fd, (ulong) *foffs);
+	}
+#endif /* UNIV_DEBUG */
+
+	/* The marker itself. */
+	*b = 0;
+	b++;
+
+	UNIV_MEM_ASSERT_RW(block[0], b - block[0]);
+	UNIV_MEM_ASSERT_W(block[0], sizeof block[0]);
+#ifdef UNIV_DEBUG_VALGRIND
+	/* The rest of the block is uninitialized.  Initialize it
+	to avoid bogus warnings. */
+	memset(b, 0xff, block[1] - b);
+#endif /* UNIV_DEBUG_VALGRIND */
+
+	if (row_merge_write(fd, (*foffs)++, block)) {
+		/* The block has been written; it can be reused from
+		its beginning. */
+		UNIV_MEM_INVALID(block[0], sizeof block[0]);
+
+		return(block[0]);
+	}
+
+	return(NULL);
+}
+
+/*************************************************************//**
+Compare two merge records with cmp_rec_rec_simple().  In debug builds
+both records and the result are printed to stderr when
+row_merge_print_cmp is set.
+@return	1, 0, -1 if mrec1 is greater, equal, less, respectively, than mrec2 */
+static
+int
+row_merge_cmp(
+/*==========*/
+	const mrec_t*		mrec1,		/*!< in: first merge
+						record to be compared */
+	const mrec_t*		mrec2,		/*!< in: second merge
+						record to be compared */
+	const ulint*		offsets1,	/*!< in: first record offsets */
+	const ulint*		offsets2,	/*!< in: second record offsets */
+	const dict_index_t*	index,		/*!< in: index */
+	ibool*			null_eq)	/*!< out: set to TRUE if
+						found matching null values */
+{
+	const int	result = cmp_rec_rec_simple(mrec1, mrec2,
+						    offsets1, offsets2,
+						    index, null_eq);
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_cmp) {
+		fputs("row_merge_cmp1 ", stderr);
+		rec_print_comp(stderr, mrec1, offsets1);
+		fputs("\nrow_merge_cmp2 ", stderr);
+		rec_print_comp(stderr, mrec2, offsets2);
+		fprintf(stderr, "\nrow_merge_cmp=%d\n", result);
+	}
+#endif /* UNIV_DEBUG */
+
+	return(result);
+}
+
+/********************************************************************//**
+Reads the clustered index of the table and creates temporary files
+containing the index entries for the indexes to be built.
+Every record that is not delete-marked is converted to a row with
+row_build() and added to the merge buffer of each index.  When a buffer
+cannot take the row, it is sorted, written to the corresponding file as
+one block and emptied, and the row is added again.  At the end of the
+scan all buffers are written out the same way.
+On error, trx->error_key_num is set: to the number of the index being
+processed for DB_DUPLICATE_KEY and DB_OUT_OF_FILE_SPACE, and to 0 for
+errors that are not tied to one of the indexes (DB_INTERRUPTED,
+DB_CORRUPTION, DB_PRIMARY_KEY_IS_NULL).
+@return	DB_SUCCESS or error */
+static __attribute__((nonnull))
+ulint
+row_merge_read_clustered_index(
+/*===========================*/
+	trx_t*			trx,	/*!< in: transaction */
+	TABLE*			table,	/*!< in/out: MySQL table object,
+					for reporting erroneous records */
+	const dict_table_t*	old_table,/*!< in: table where rows are
+					read from */
+	const dict_table_t*	new_table,/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	dict_index_t**		index,	/*!< in: indexes to be created */
+	merge_file_t*		files,	/*!< in: temporary files */
+	ulint			n_index,/*!< in: number of indexes to create */
+	row_merge_block_t*	block)	/*!< in/out: file buffer */
+{
+	dict_index_t*		clust_index;	/* Clustered index */
+	mem_heap_t*		row_heap;	/* Heap memory to create
+						clustered index records */
+	row_merge_buf_t**	merge_buf;	/* Temporary list for records*/
+	btr_pcur_t		pcur;		/* Persistent cursor on the
+						clustered index */
+	mtr_t			mtr;		/* Mini transaction */
+	ulint			err = DB_SUCCESS;/* Return code */
+	ulint			i;
+	ulint			n_nonnull = 0;	/* number of columns
+						changed to NOT NULL */
+	ulint*			nonnull = NULL;	/* NOT NULL columns */
+
+	/* Check the arguments before the first dereference. */
+	ut_ad(trx);
+	ut_ad(old_table);
+	ut_ad(new_table);
+	ut_ad(index);
+	ut_ad(files);
+
+	trx->op_info = "reading clustered index";
+
+	/* Create and initialize memory for record buffers */
+
+	merge_buf = mem_alloc(n_index * sizeof *merge_buf);
+
+	for (i = 0; i < n_index; i++) {
+		merge_buf[i] = row_merge_buf_create(index[i]);
+	}
+
+	mtr_start(&mtr);
+
+	/* Find the clustered index and create a persistent cursor
+	based on that. */
+
+	clust_index = dict_table_get_first_index(old_table);
+
+	btr_pcur_open_at_index_side(
+		TRUE, clust_index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	if (UNIV_UNLIKELY(old_table != new_table)) {
+		ulint	n_cols = dict_table_get_n_cols(old_table);
+
+		/* A primary key will be created.  Identify the
+		columns that were flagged NOT NULL in the new table,
+		so that we can quickly check that the records in the
+		(old) clustered index do not violate the added NOT
+		NULL constraints. */
+
+		ut_a(n_cols == dict_table_get_n_cols(new_table));
+
+		nonnull = mem_alloc(n_cols * sizeof *nonnull);
+
+		for (i = 0; i < n_cols; i++) {
+			if (dict_table_get_nth_col(old_table, i)->prtype
+			    & DATA_NOT_NULL) {
+
+				continue;
+			}
+
+			if (dict_table_get_nth_col(new_table, i)->prtype
+			    & DATA_NOT_NULL) {
+
+				nonnull[n_nonnull++] = i;
+			}
+		}
+
+		if (!n_nonnull) {
+			mem_free(nonnull);
+			nonnull = NULL;
+		}
+	}
+
+	row_heap = mem_heap_create(sizeof(mrec_buf_t));
+
+	/* Scan the clustered index. */
+	for (;;) {
+		const rec_t*	rec;
+		ulint*		offsets;
+		dtuple_t*	row		= NULL;
+		row_ext_t*	ext;
+		ibool		has_next	= TRUE;
+
+		btr_pcur_move_to_next_on_page(&pcur);
+
+		/* When switching pages, commit the mini-transaction
+		in order to release the latch on the old page. */
+
+		if (btr_pcur_is_after_last_on_page(&pcur)) {
+			if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+				i = 0;
+				err = DB_INTERRUPTED;
+				goto err_exit;
+			}
+
+			btr_pcur_store_position(&pcur, &mtr);
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+			btr_pcur_restore_position(BTR_SEARCH_LEAF,
+						  &pcur, &mtr);
+			has_next = btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+		}
+
+		if (UNIV_LIKELY(has_next)) {
+			rec = btr_pcur_get_rec(&pcur);
+
+			if (srv_pass_corrupt_table && !rec) {
+				/* This error is not tied to one of the
+				indexes being built; report key 0 like
+				the other such errors instead of a stale
+				loop counter. */
+				i = 0;
+				err = DB_CORRUPTION;
+				goto err_exit;
+			}
+			ut_a(rec);
+
+			offsets = rec_get_offsets(rec, clust_index, NULL,
+						  ULINT_UNDEFINED, &row_heap);
+
+			/* Skip delete marked records. */
+			if (rec_get_deleted_flag(
+				    rec, dict_table_is_comp(old_table))) {
+				continue;
+			}
+
+			srv_n_rows_inserted++;
+
+			/* Build a row based on the clustered index. */
+
+			row = row_build(ROW_COPY_POINTERS, clust_index,
+					rec, offsets,
+					new_table, &ext, row_heap);
+
+			if (UNIV_LIKELY_NULL(nonnull)) {
+				for (i = 0; i < n_nonnull; i++) {
+					dfield_t*	field
+						= &row->fields[nonnull[i]];
+					dtype_t*	field_type
+						= dfield_get_type(field);
+
+					ut_a(!(field_type->prtype
+					       & DATA_NOT_NULL));
+
+					if (dfield_is_null(field)) {
+						err = DB_PRIMARY_KEY_IS_NULL;
+						i = 0;
+						goto err_exit;
+					}
+
+					field_type->prtype |= DATA_NOT_NULL;
+				}
+			}
+		}
+
+		/* Build all entries for all the indexes to be created
+		in a single scan of the clustered index. */
+
+		for (i = 0; i < n_index; i++) {
+			row_merge_buf_t*	buf	= merge_buf[i];
+			merge_file_t*		file	= &files[i];
+			const dict_index_t*	buf_index = buf->index;
+
+			if (UNIV_LIKELY
+			    (row && row_merge_buf_add(buf, row, ext))) {
+				file->n_rec++;
+				continue;
+			}
+
+			/* The buffer must be sufficiently large
+			to hold at least one record. */
+			ut_ad(buf->n_tuples || !has_next);
+
+			/* We have enough data tuples to form a block.
+			Sort them and write to disk. */
+
+			if (buf->n_tuples) {
+				if (dict_index_is_unique(buf_index)) {
+					row_merge_dup_t	dup;
+					dup.index = buf->index;
+					dup.table = table;
+					dup.n_dup = 0;
+
+					row_merge_buf_sort(buf, &dup);
+
+					if (dup.n_dup) {
+						err = DB_DUPLICATE_KEY;
+err_exit:
+						/* All error paths of the
+						scan jump here with i set
+						to the key number to
+						report. */
+						trx->error_key_num = i;
+						goto func_exit;
+					}
+				} else {
+					row_merge_buf_sort(buf, NULL);
+				}
+			}
+
+			row_merge_buf_write(buf, file, block);
+
+			if (!row_merge_write(file->fd, file->offset++,
+					     block)) {
+				err = DB_OUT_OF_FILE_SPACE;
+				goto err_exit;
+			}
+
+			UNIV_MEM_INVALID(block[0], sizeof block[0]);
+
+			/* Continue with the buffer that is returned by
+			row_merge_buf_empty(), not with the pointer that
+			was passed to it. */
+			buf = merge_buf[i] = row_merge_buf_empty(buf);
+
+			if (UNIV_LIKELY(row != NULL)) {
+				/* Try writing the record again, now
+				that the buffer has been written out
+				and emptied. */
+
+				if (UNIV_UNLIKELY
+				    (!row_merge_buf_add(buf, row, ext))) {
+					/* An empty buffer should have enough
+					room for at least one record. */
+					ut_error;
+				}
+
+				file->n_rec++;
+			}
+		}
+
+		mem_heap_empty(row_heap);
+
+		if (UNIV_UNLIKELY(!has_next)) {
+			goto func_exit;
+		}
+	}
+
+func_exit:
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	mem_heap_free(row_heap);
+
+	if (UNIV_LIKELY_NULL(nonnull)) {
+		mem_free(nonnull);
+	}
+
+	for (i = 0; i < n_index; i++) {
+		row_merge_buf_free(merge_buf[i]);
+	}
+
+	mem_free(merge_buf);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/** Write a record via buffer 2 and read the next record to buffer N.
+The expansion refers to the following names of the enclosing function:
+block, buf, b2, of, file, index, b##N, foffs##N, mrec##N, offsets##N,
+and the label corrupt.  The label is jumped to when writing the record
+fails, when more records have been written to of than file->n_rec, or
+when reading the next record signals an I/O error.
+@param N	number of the buffer (0 or 1)
+@param AT_END	statement to execute at end of input */
+#define ROW_MERGE_WRITE_GET_NEXT(N, AT_END)				\
+	do {								\
+		b2 = row_merge_write_rec(&block[2], &buf[2], b2,	\
+					 of->fd, &of->offset,		\
+					 mrec##N, offsets##N);		\
+		if (UNIV_UNLIKELY(!b2 || ++of->n_rec > file->n_rec)) {	\
+			goto corrupt;					\
+		}							\
+		b##N = row_merge_read_rec(&block[N], &buf[N],		\
+					  b##N, index,			\
+					  file->fd, foffs##N,		\
+					  &mrec##N, offsets##N);	\
+		if (UNIV_UNLIKELY(!b##N)) {				\
+			if (mrec##N) {					\
+				goto corrupt;				\
+			}						\
+			AT_END;						\
+		}							\
+	} while (0)
+
+/*************************************************************//**
+Merge two blocks of records on disk and write a bigger block.
+The two source lists start at *foffs0 and *foffs1 of file.  At each step
+the smaller of the two current records (the first one when they compare
+equal) is written to of and replaced by the next record of its list;
+when one list ends, the rest of the other list is appended, followed by
+an end-of-list marker.  For a unique index, two records that compare
+equal without matching NULL values are reported through
+innobase_rec_to_mysql() and the merge stops with DB_DUPLICATE_KEY.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_merge_blocks(
+/*=============*/
+	const dict_index_t*	index,	/*!< in: index being created */
+	const merge_file_t*	file,	/*!< in: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	ulint*			foffs0,	/*!< in/out: offset of first
+					source list in the file */
+	ulint*			foffs1,	/*!< in/out: offset of second
+					source list in the file */
+	merge_file_t*		of,	/*!< in/out: output file */
+	TABLE*			table)	/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+{
+	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
+
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
+	const byte*	b0;	/*!< pointer to block[0] */
+	const byte*	b1;	/*!< pointer to block[1] */
+	byte*		b2;	/*!< pointer to block[2] */
+	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] or buf[0] */
+	const mrec_t*	mrec1;	/*!< merge rec, points to block[1] or buf[1] */
+	ulint*		offsets0;/* offsets of mrec0 */
+	ulint*		offsets1;/* offsets of mrec1 */
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		fprintf(stderr,
+			"row_merge_blocks fd=%d ofs=%lu + fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) *foffs0,
+			file->fd, (ulong) *foffs1,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+	/* Write a record and read the next record.  Split the output
+	file in two halves, which can be merged on the following pass. */
+
+	if (!row_merge_read(file->fd, *foffs0, &block[0])
+	    || !row_merge_read(file->fd, *foffs1, &block[1])) {
+corrupt:
+		mem_heap_free(heap);
+		return(DB_CORRUPTION);
+	}
+
+	b0 = block[0];
+	b1 = block[1];
+	b2 = block[2];
+
+	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+				foffs0, &mrec0, offsets0);
+	b1 = row_merge_read_rec(&block[1], &buf[1], b1, index, file->fd,
+				foffs1, &mrec1, offsets1);
+	if (UNIV_UNLIKELY(!b0 && mrec0)
+	    || UNIV_UNLIKELY(!b1 && mrec1)) {
+
+		goto corrupt;
+	}
+
+	while (mrec0 && mrec1) {
+		ibool	null_eq = FALSE;
+		switch (row_merge_cmp(mrec0, mrec1,
+				      offsets0, offsets1, index,
+				      &null_eq)) {
+		case 0:
+			if (UNIV_UNLIKELY
+			    (dict_index_is_unique(index) && !null_eq)) {
+				innobase_rec_to_mysql(table, mrec0,
+						      index, offsets0);
+				mem_heap_free(heap);
+				return(DB_DUPLICATE_KEY);
+			}
+			/* fall through */
+		case -1:
+			ROW_MERGE_WRITE_GET_NEXT(0, goto merged);
+			break;
+		case 1:
+			ROW_MERGE_WRITE_GET_NEXT(1, goto merged);
+			break;
+		default:
+			ut_error;
+		}
+
+	}
+
+merged:
+	if (mrec0) {
+		/* append all mrec0 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+		}
+	}
+done0:
+	if (mrec1) {
+		/* append all mrec1 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(1, goto done1);
+		}
+	}
+done1:
+
+	mem_heap_free(heap);
+	b2 = row_merge_write_eof(&block[2], b2, of->fd, &of->offset);
+	return(b2 ? DB_SUCCESS : DB_CORRUPTION);
+}
+
+/*************************************************************//**
+Copy a block of index entries.
+All records of the list that starts at *foffs0 of file are appended to
+of, followed by an end-of-list marker.  On success *foffs0 points to the
+block after the last block that was read.
+@return	TRUE on success, FALSE on failure */
+static __attribute__((nonnull))
+ibool
+row_merge_blocks_copy(
+/*==================*/
+	const dict_index_t*	index,	/*!< in: index being created */
+	const merge_file_t*	file,	/*!< in: input file */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	ulint*			foffs0,	/*!< in/out: input file offset */
+	merge_file_t*		of)	/*!< in/out: output file */
+{
+	mem_heap_t*	heap;	/*!< memory heap for offsets0, offsets1 */
+
+	mrec_buf_t*	buf;	/*!< buffer for handling
+				split mrec in block[] */
+	const byte*	b0;	/*!< pointer to block[0] */
+	byte*		b2;	/*!< pointer to block[2] */
+	const mrec_t*	mrec0;	/*!< merge rec, points to block[0] */
+	ulint*		offsets0;/* offsets of mrec0 */
+	ulint*		offsets1;/* dummy offsets */
+
+#ifdef UNIV_DEBUG
+	if (row_merge_print_block) {
+		/* Print the offset itself, not the address of the
+		variable that holds it. */
+		fprintf(stderr,
+			"row_merge_blocks_copy fd=%d ofs=%lu"
+			" = fd=%d ofs=%lu\n",
+			file->fd, (ulong) *foffs0,
+			of->fd, (ulong) of->offset);
+	}
+#endif /* UNIV_DEBUG */
+
+	heap = row_merge_heap_create(index, &buf, &offsets0, &offsets1);
+
+	/* Read the first block of the input list. */
+
+	if (!row_merge_read(file->fd, *foffs0, &block[0])) {
+corrupt:
+		/* Also the target of ROW_MERGE_WRITE_GET_NEXT() on
+		write or read failure. */
+		mem_heap_free(heap);
+		return(FALSE);
+	}
+
+	b0 = block[0];
+	b2 = block[2];
+
+	b0 = row_merge_read_rec(&block[0], &buf[0], b0, index, file->fd,
+				foffs0, &mrec0, offsets0);
+	if (UNIV_UNLIKELY(!b0 && mrec0)) {
+
+		goto corrupt;
+	}
+
+	if (mrec0) {
+		/* append all mrec0 to output */
+		for (;;) {
+			ROW_MERGE_WRITE_GET_NEXT(0, goto done0);
+		}
+	}
+done0:
+
+	/* The file offset points to the beginning of the last page
+	that has been read.  Update it to point to the next block. */
+	(*foffs0)++;
+
+	mem_heap_free(heap);
+	return(row_merge_write_eof(&block[2], b2, of->fd, &of->offset)
+	       != NULL);
+}
+
+/*************************************************************//**
+Perform one merge pass over the runs stored in a file.
+The input is split at block run_offset[*num_run / 2].  Runs of the first
+and the second part are merged pairwise into the temporary file, and the
+runs that are left over in either part are copied as they are.  On
+success the file descriptors are swapped so that file describes the
+output, and *num_run and run_offset[] describe the runs written.
+@return	DB_SUCCESS or error code */
+static __attribute__((nonnull))
+ulint
+row_merge(
+/*======*/
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index,	/*!< in: index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd,	/*!< in/out: temporary file handle */
+	TABLE*			table,	/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+	ulint*			num_run,/*!< in/out: Number of runs remain
+					to be merged */
+	ulint*			run_offset) /*!< in/out: Array contains the
+					first offset number for each merge
+					run */
+{
+	const ulint	ihalf	= run_offset[*num_run / 2];
+				/* first block of the second part */
+	ulint		in0	= 0;	/* read position, first part */
+	ulint		in1	= ihalf;/* read position, second part */
+	ulint		n_out	= 0;	/* number of runs written */
+	merge_file_t	out;		/* output file */
+
+	UNIV_MEM_ASSERT_W(block[0], 3 * sizeof block[0]);
+
+	ut_ad(ihalf < file->offset);
+
+	out.fd = *tmpfd;
+	out.offset = 0;
+	out.n_rec = 0;
+
+	UNIV_MEM_INVALID(run_offset, *num_run * sizeof *run_offset);
+
+	/* Merge pairs of runs to the output file. */
+	while (in0 < ihalf && in1 < file->offset) {
+		ulint	err;
+
+		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_out++] = out.offset;
+
+		err = row_merge_blocks(index, file, block,
+				       &in0, &in1, &out, table);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		in0++;
+		in1++;
+	}
+
+	/* Copy the runs that remain in the first part, if any. */
+	while (in0 < ihalf) {
+		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_out++] = out.offset;
+
+		if (!row_merge_blocks_copy(index, file, block, &in0, &out)) {
+			return(DB_CORRUPTION);
+		}
+	}
+
+	ut_ad(in0 == ihalf);
+
+	/* Copy the runs that remain in the second part, if any. */
+	while (in1 < file->offset) {
+		if (UNIV_UNLIKELY(trx_is_interrupted(trx))) {
+			return(DB_INTERRUPTED);
+		}
+
+		/* Remember the offset number for this run */
+		run_offset[n_out++] = out.offset;
+
+		if (!row_merge_blocks_copy(index, file, block, &in1, &out)) {
+			return(DB_CORRUPTION);
+		}
+	}
+
+	ut_ad(in1 == file->offset);
+
+	/* Every input record must have been written exactly once. */
+	if (UNIV_UNLIKELY(out.n_rec != file->n_rec)) {
+		return(DB_CORRUPTION);
+	}
+
+	ut_ad(n_out <= *num_run);
+
+	*num_run = n_out;
+
+	/* Each run can contain one or more offsets. As merge goes on,
+	the number of runs (to merge) will reduce until we have one
+	single run. So the number of runs will always be smaller than
+	the number of offsets in file */
+	ut_ad((*num_run) <= file->offset);
+
+	/* The number of offsets in output file is always equal or
+	smaller than input file */
+	ut_ad(out.offset <= file->offset);
+
+	/* Swap file descriptors for the next pass. */
+	*tmpfd = file->fd;
+	*file = out;
+
+	UNIV_MEM_INVALID(block[0], 3 * sizeof block[0]);
+
+	return(DB_SUCCESS);
+}
+
+/*************************************************************//**
+Merge disk files: run row_merge() passes until the runs in the file have
+been merged into one run or a pass fails.  Nothing is done for <= 1 run.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_merge_sort(
+/*===========*/
+	trx_t*			trx,	/*!< in: transaction */
+	const dict_index_t*	index,	/*!< in: index being created */
+	merge_file_t*		file,	/*!< in/out: file containing
+					index entries */
+	row_merge_block_t*	block,	/*!< in/out: 3 buffers */
+	int*			tmpfd,	/*!< in/out: temporary file handle */
+	TABLE*			table)	/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+{
+	ulint	half = file->offset / 2;
+	ulint	num_runs;
+	ulint*	run_offset;
+	ulint	error = DB_SUCCESS;
+
+	/* Record the number of merge runs we need to perform */
+	num_runs = file->offset;
+
+	/* With at most one run there is nothing to merge */
+	if (num_runs <= 1) {
+		return(error);
+	}
+
+	/* "run_offset" records each run's first offset number */
+	run_offset = (ulint*) mem_alloc(file->offset * sizeof(ulint));
+
+	/* This tells row_merge() where to start for the first round
+	of merge. */
+	run_offset[half] = half;
+
+	/* The file should always contain at least one byte (the end
+	of file marker).  Thus, it must be at least one block. */
+	ut_ad(file->offset > 0);
+
+	/* Merge the runs until we have one big run */
+	do {
+		error = row_merge(trx, index, file, block, tmpfd,
+				  table, &num_runs, run_offset);
+		if (error != DB_SUCCESS) {
+			break;
+		}
+		/* Only a successful pass defines all num_runs entries;
+		a failed one leaves part of the array invalidated. */
+		UNIV_MEM_ASSERT_RW(run_offset, num_runs * sizeof *run_offset);
+	} while (num_runs > 1);
+
+	mem_free(run_offset);
+	return(error);
+}
+
+/*************************************************************//**
+Give every externally stored field of the data tuple the value that is
+copied for the same field number of the merge record; skip the others. */
+static
+void
+row_merge_copy_blobs(
+/*=================*/
+	const mrec_t*	mrec,	/*!< in: merge record */
+	const ulint*	offsets,/*!< in: offsets of mrec */
+	ulint		zip_size,/*!< in: compressed page size in bytes, or 0 */
+	dtuple_t*	tuple,	/*!< in/out: data tuple */
+	mem_heap_t*	heap)	/*!< in/out: memory heap */
+{
+	const ulint	n_fields = dtuple_get_n_fields(tuple);
+	ulint		field_no;
+
+	for (field_no = 0; field_no < n_fields; field_no++) {
+		dfield_t*	field = dtuple_get_nth_field(tuple, field_no);
+
+		if (dfield_is_ext(field)) {
+			const void*	ext_data;
+			ulint		ext_len;
+
+			ut_ad(!dfield_is_null(field));
+
+			/* Index creation holds a table lock, so the
+			externally stored columns cannot be freed
+			between reading the BLOB pointers in
+			row_merge_read_clustered_index() and
+			dereferencing them here. */
+			ext_data = btr_rec_copy_externally_stored_field(
+				mrec, offsets, zip_size, field_no,
+				&ext_len, heap);
+			/* The table lock also means that records of
+			incomplete transactions have been rolled back
+			already, so no BLOB can be incomplete. */
+			ut_a(ext_data);
+
+			dfield_set_data(field, ext_data, ext_len);
+		}
+	}
+}
+
+/********************************************************************//**
+Read the sorted merge file record by record, convert each record to an
+index entry and insert it into the index with row_ins_index_entry().
+@return	DB_SUCCESS or error number */
+static
+ulint
+row_merge_insert_index_tuples(
+/*==========================*/
+	trx_t*			trx,	/*!< in: transaction */
+	dict_index_t*		index,	/*!< in: index */
+	dict_table_t*		table,	/*!< in: new table */
+	ulint			zip_size,/*!< in: compressed page size of
+					 the old table, or 0 if uncompressed */
+	int			fd,	/*!< in: file descriptor */
+	row_merge_block_t*	block)	/*!< in/out: file buffer */
+{
+	const byte*		b;	/* pointer into block */
+	que_thr_t*		thr;
+	ins_node_t*		node;
+	mem_heap_t*		tuple_heap;	/* emptied after each record */
+	mem_heap_t*		graph_heap;	/* graph, offsets, record buffer */
+	ulint			error = DB_SUCCESS;
+	ulint			foffs = 0;	/* offset given to the readers */
+	ulint*			offsets;
+
+	ut_ad(trx);
+	ut_ad(index);
+	ut_ad(table);
+
+	/* The insert query graph built below only serves as the dummy
+	graph that is needed in the row module call */
+
+	trx->op_info = "inserting index entries";
+
+	graph_heap = mem_heap_create(500 + sizeof(mrec_buf_t));
+	node = ins_node_create(INS_DIRECT, table, graph_heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, graph_heap);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	tuple_heap = mem_heap_create(1000);
+
+	{
+		ulint i	= 1 + REC_OFFS_HEADER_SIZE
+			+ dict_index_get_n_fields(index);
+		offsets = mem_heap_alloc(graph_heap, i * sizeof *offsets);
+		offsets[0] = i;	/* number of elements allocated */
+		offsets[1] = dict_index_get_n_fields(index); /* n of fields */
+	}
+
+	b = *block;	/* start at the beginning of the buffer */
+
+	if (!row_merge_read(fd, foffs, block)) {
+		error = DB_CORRUPTION;	/* the first read failed */
+	} else {
+		mrec_buf_t*	buf = mem_heap_alloc(graph_heap, sizeof *buf);
+
+		for (;;) {
+			const mrec_t*	mrec;
+			dtuple_t*	dtuple;
+			ulint		n_ext;
+
+			b = row_merge_read_rec(block, buf, b, index,
+					       fd, &foffs, &mrec, offsets);
+			if (UNIV_UNLIKELY(!b)) {
+				/* End of list (mrec is NULL), or I/O error */
+				if (mrec) {
+					error = DB_CORRUPTION;
+				}
+				break;
+			}
+
+			dtuple = row_rec_to_index_entry_low(
+				mrec, index, offsets, &n_ext, tuple_heap);
+
+			if (UNIV_UNLIKELY(n_ext)) {
+				row_merge_copy_blobs(mrec, offsets, zip_size,
+						     dtuple, tuple_heap);
+			}
+
+			node->row = dtuple;
+			node->table = table;
+			node->trx_id = trx->id;
+
+			ut_ad(dtuple_validate(dtuple));
+
+			do {	/* repeat while the error handler says so */
+				thr->run_node = thr;
+				thr->prev_node = thr->common.parent;
+
+				error = row_ins_index_entry(index, dtuple,
+							    0, FALSE, thr);
+
+				if (UNIV_LIKELY(error == DB_SUCCESS)) {
+
+					goto next_rec;
+				}
+
+				thr->lock_state = QUE_THR_LOCK_ROW;
+				trx->error_state = error;
+				que_thr_stop_for_mysql(thr);
+				thr->lock_state = QUE_THR_LOCK_NOLOCK;
+			} while (row_mysql_handle_errors(&error, trx,
+							 thr, NULL));
+
+			goto err_exit;	/* skips the no-error stop below */
+next_rec:
+			mem_heap_empty(tuple_heap);	/* frees dtuple */
+		}
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+err_exit:
+	que_graph_free(thr->graph);	/* presumably frees graph_heap; confirm */
+
+	trx->op_info = "";
+
+	mem_heap_free(tuple_heap);
+
+	return(error);
+}
+
+/*********************************************************************//**
+Sets a LOCK_X or LOCK_S lock on a table for creating or dropping an index.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_lock_table(
+/*=================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table,		/*!< in: table to lock */
+	enum lock_mode	mode)		/*!< in: LOCK_X or LOCK_S */
+{
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	ulint		err;
+	sel_node_t*	node;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_ad(mode == LOCK_X || mode == LOCK_S);
+
+	heap = mem_heap_create(512);	/* holds the dummy graph */
+
+	trx->op_info = "setting table lock for creating or dropping index";
+
+	node = sel_node_create(heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+	thr->graph->state = QUE_FORK_ACTIVE;
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(que_node_get_parent(thr));
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	err = lock_table(0, table, mode, thr);	/* the lock request */
+
+	trx->error_state = err;
+
+	if (UNIV_LIKELY(err == DB_SUCCESS)) {
+		que_thr_stop_for_mysql_no_error(thr, trx);
+	} else {
+		que_thr_stop_for_mysql(thr);
+
+		if (err != DB_QUE_THR_SUSPENDED) {
+			ibool	was_lock_wait;
+
+			was_lock_wait = row_mysql_handle_errors(
+				&err, trx, thr, NULL);
+
+			if (was_lock_wait) {
+				goto run_again;	/* request the lock again */
+			}
+		} else {
+			que_thr_t*	run_thr;
+			que_node_t*	parent;
+
+			parent = que_node_get_parent(thr);
+			run_thr = que_fork_start_command(parent);
+
+			ut_a(run_thr == thr);	/* the graph has this thread */
+
+			/* There was a lock wait but the thread was not
+			in a ready to run or running state. */
+			trx->error_state = DB_LOCK_WAIT;
+
+			goto run_again;
+		}
+	}
+
+	que_graph_free(thr->graph);	/* success, or an unhandled error */
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Drop an index: delete its rows from the InnoDB system tables and remove
+the index from the dictionary cache.  The data dictionary must have been
+locked exclusively by the caller, because the transaction will not be
+committed. */
+UNIV_INTERN
+void
+row_merge_drop_index(
+/*=================*/
+	dict_index_t*	index,	/*!< in: index to be removed */
+	dict_table_t*	table,	/*!< in: table */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	/* The private SQL parser of Innobase generates the query graphs
+	that delete the dictionary data from the system tables.
+	Deleting a row from the SYS_INDEXES table also frees the file
+	segments of the B-tree associated with the index. */
+	static const char drop_index_sql[] =
+		"PROCEDURE DROP_INDEX_PROC () IS\n"
+		"BEGIN\n"
+		/* Rename the index, so that it will be dropped by
+		row_merge_drop_temp_indexes() at crash recovery
+		if the server crashes before this trx is committed. */
+		"UPDATE SYS_INDEXES SET NAME=CONCAT('"
+		TEMP_INDEX_PREFIX_STR "', NAME) WHERE ID = :indexid;\n"
+		"COMMIT WORK;\n"
+		/* Drop the statistics of the index. */
+		"DELETE FROM SYS_STATS WHERE INDEX_ID = :indexid;\n"
+		/* Drop the field definitions of the index. */
+		"DELETE FROM SYS_FIELDS WHERE INDEX_ID = :indexid;\n"
+		/* Drop the index definition and the B-tree. */
+		"DELETE FROM SYS_INDEXES WHERE ID = :indexid;\n"
+		"END;\n";
+
+	pars_info_t*	info = pars_info_create();
+	ulint		error;
+
+	ut_ad(index && table && trx);
+
+	pars_info_add_dulint_literal(info, "indexid", index->id);
+
+	trx_start_if_not_started(trx);
+	trx->op_info = "dropping index";
+
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	error = que_eval_sql(info, drop_index_sql, FALSE, trx);
+
+	ut_a(error == DB_SUCCESS);
+
+	/* Make the foreign key constraints on this table that use the
+	index use another, equivalent index instead. */
+
+	dict_table_replace_index_in_foreign_list(table, index);
+	dict_index_remove_from_cache(table, index);
+
+	trx->op_info = "";
+}
+
+/*********************************************************************//**
+Drop the indexes that had been created before an error occurred in
+building an index, calling row_merge_drop_index() for each of them.
+The data dictionary must have been locked exclusively by the caller,
+because the transaction will not be committed. */
+UNIV_INTERN
+void
+row_merge_drop_indexes(
+/*===================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table,		/*!< in: table containing the indexes */
+	dict_index_t**	index,		/*!< in: indexes to drop */
+	ulint		num_created)	/*!< in: number of elements in index[] */
+{
+	ulint	n_dropped = 0;
+
+	while (n_dropped < num_created) {
+		row_merge_drop_index(index[n_dropped++], table, trx);
+	}
+}
+
+/*********************************************************************//**
+Drop all partially created indexes during crash recovery. */
+UNIV_INTERN
+void
+row_merge_drop_temp_indexes(void)
+/*=============================*/
+{
+	trx_t*		trx;
+	btr_pcur_t	pcur;	/* cursor on the first index of SYS_INDEXES */
+	mtr_t		mtr;
+
+	/* Load the table definitions that contain partially defined
+	indexes, so that the data dictionary information can be checked
+	when accessing the tablename.ibd files. */
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping partially created indexes";
+	row_mysql_lock_data_dictionary(trx);
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(
+		TRUE,
+		dict_table_get_first_index(dict_sys->sys_indexes),
+		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	for (;;) {
+		const rec_t*	rec;
+		const byte*	field;
+		ulint		len;
+		dulint		table_id;
+		dict_table_t*	table;
+
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			break;	/* no more records */
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+		field = rec_get_nth_field_old(rec, DICT_SYS_INDEXES_NAME_FIELD,
+					      &len);
+		if (len == UNIV_SQL_NULL || len == 0
+		    || (char) *field != TEMP_INDEX_PREFIX) {
+			continue;	/* the name lacks the prefix */
+		}
+
+		/* This is a temporary index. */
+
+		field = rec_get_nth_field_old(rec, 0/*TABLE_ID*/, &len);
+		if (len != 8) {
+			/* Corrupted TABLE_ID */
+			continue;
+		}
+
+		table_id = mach_read_from_8(field);
+
+		btr_pcur_store_position(&pcur, &mtr);	/* restored below */
+		btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+		table = dict_table_get_on_id_low(table_id);
+
+		if (table) {
+			dict_index_t*	index;
+			dict_index_t*	next_index;
+
+			for (index = dict_table_get_first_index(table);
+			     index; index = next_index) {
+
+				/* Fetched first: the drop below removes
+				index from the dictionary cache. */
+				next_index = dict_table_get_next_index(index);
+
+				if (*index->name == TEMP_INDEX_PREFIX) {
+					row_merge_drop_index(index, table, trx);
+					trx_commit_for_mysql(trx);
+				}
+			}
+		}
+
+		mtr_start(&mtr);	/* continue from the stored position */
+		btr_pcur_restore_position(BTR_SEARCH_LEAF,
+					  &pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_background(trx);
+}
+
+/*********************************************************************//**
+Create a merge file: reset the counters and obtain a temporary file. */
+static
+void
+row_merge_file_create(
+/*==================*/
+	merge_file_t*	merge_file)	/*!< out: merge file structure */
+{
+	merge_file->n_rec = 0;
+	merge_file->offset = 0;
+	merge_file->fd = innobase_mysql_tmpfile();
+}
+
+/*********************************************************************//**
+Destroy a merge file: close its file, unless the handle is -1 already. */
+static
+void
+row_merge_file_destroy(
+/*===================*/
+	merge_file_t*	merge_file)	/*!< in/out: merge file structure */
+{
+	if (merge_file->fd != -1) {	/* -1 marks "no file to close" */
+		close(merge_file->fd);
+		merge_file->fd = -1;	/* a repeated call does nothing */
+	}
+}
+
+/*********************************************************************//**
+Determine the precise type of a column that is added to a temporary
+table: a column that is part of the PRIMARY KEY must be constrained
+NOT NULL, all other columns keep their precise type.
+@return	col->prtype, possibly ORed with DATA_NOT_NULL */
+UNIV_INLINE
+ulint
+row_merge_col_prtype(
+/*=================*/
+	const dict_col_t*	col,		/*!< in: column */
+	const char*		col_name,	/*!< in: name of the column */
+	const merge_index_def_t*index_def)	/*!< in: the index definition
+						of the primary key */
+{
+	const ulint	n_pk_fields = index_def->n_fields;
+	ulint		pk_field;
+
+	ut_ad(index_def->ind_type & DICT_CLUSTERED);
+
+	/* All columns that are included in the PRIMARY KEY must be
+	NOT NULL; search the key unless the flag is set already. */
+	if (!(col->prtype & DATA_NOT_NULL)) {
+		for (pk_field = 0; pk_field < n_pk_fields; pk_field++) {
+			const char*	pk_col_name
+				= index_def->fields[pk_field].field_name;
+
+			if (strcmp(col_name, pk_col_name) == 0) {
+				return(col->prtype | DATA_NOT_NULL);
+			}
+		}
+	}
+
+	return(col->prtype);
+}
+
+/*********************************************************************//**
+Create a temporary table for creating a primary key: the user columns of
+an existing table are copied, those of the key being made NOT NULL.
+@return	table, or NULL on error */
+UNIV_INTERN
+dict_table_t*
+row_merge_create_temporary_table(
+/*=============================*/
+	const char*		table_name,	/*!< in: new table name */
+	const merge_index_def_t*index_def,	/*!< in: the index definition
+						of the primary key */
+	const dict_table_t*	table,		/*!< in: old table definition */
+	trx_t*			trx)		/*!< in/out: transaction
+						(sets error_state) */
+{
+	const ulint	n_cols = dict_table_get_n_user_cols(table);
+	mem_heap_t*	heap = mem_heap_create(1000);
+	dict_table_t*	new_table;
+	ulint		col_no;
+	ulint		error;
+
+	ut_ad(table_name);
+	ut_ad(index_def);
+	ut_ad(table);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	new_table = dict_mem_table_create(table_name, 0, n_cols, table->flags);
+
+	/* Copy the user columns; row_merge_col_prtype() adjusts the type. */
+	for (col_no = 0; col_no < n_cols; col_no++) {
+		const dict_col_t*	col
+			= dict_table_get_nth_col(table, col_no);
+		const char*		col_name
+			= dict_table_get_col_name(table, col_no);
+		const ulint		prtype
+			= row_merge_col_prtype(col, col_name, index_def);
+
+		dict_mem_table_add_col(new_table, heap, col_name,
+				       col->mtype, prtype, col->len);
+	}
+
+	error = row_create_table_for_mysql(new_table, trx);
+	mem_heap_free(heap);
+
+	if (error == DB_SUCCESS) {
+		return(new_table);
+	}
+
+	trx->error_state = error;
+	return(NULL);
+}
+
+/*********************************************************************//**
+Make the temporary indexes of a table permanent by removing the prefix
+from their names, in SYS_INDEXES and in the cached index objects.  The
+data dictionary must be locked exclusively; the trx is not committed.
+@return	DB_SUCCESS if all OK */
+UNIV_INTERN
+ulint
+row_merge_rename_indexes(
+/*=====================*/
+	trx_t*		trx,		/*!< in/out: transaction */
+	dict_table_t*	table)		/*!< in/out: table with new indexes */
+{
+	static const char sql[] =
+		"PROCEDURE RENAME_INDEXES_PROC () IS\n"
+		"BEGIN\n"
+		"UPDATE SYS_INDEXES SET NAME=SUBSTR(NAME,1,LENGTH(NAME)-1)\n"
+		"WHERE TABLE_ID = :tableid AND SUBSTR(NAME,0,1)='"
+		TEMP_INDEX_PREFIX_STR "';\n"
+		"END;\n";
+
+	pars_info_t*	info = pars_info_create();
+	ulint		error;
+	dict_index_t*	index;
+
+	ut_ad(table);
+	ut_ad(trx);
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	trx->op_info = "renaming indexes";
+
+	/* The private SQL parser of Innobase generates the query
+	graphs needed in renaming indexes. */
+	pars_info_add_dulint_literal(info, "tableid", table->id);
+	error = que_eval_sql(info, sql, FALSE, trx);
+
+	if (error == DB_SUCCESS) {
+		/* Skip the prefix in the cached index names, too. */
+		index = dict_table_get_first_index(table);
+		do {
+			if (*index->name == TEMP_INDEX_PREFIX) {
+				index->name++;
+			}
+			index = dict_table_get_next_index(index);
+		} while (index);
+	}
+
+	trx->op_info = "";
+
+	return(error);
+}
+
+/*********************************************************************//**
+Rename old_table to tmp_name and new_table to the name of old_table in the
+data dictionary, rolling back trx on failure.  The data dictionary must have
+been locked exclusively by the caller; the transaction is not committed.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_merge_rename_tables(
+/*====================*/
+	dict_table_t*	old_table,	/*!< in/out: old table, renamed to
+					tmp_name */
+	dict_table_t*	new_table,	/*!< in/out: new table, renamed to
+					old_table->name */
+	const char*	tmp_name,	/*!< in: new name for old_table */
+	trx_t*		trx)		/*!< in: transaction handle */
+{
+	ulint		err	= DB_ERROR;
+	pars_info_t*	info;
+	char		old_name[MAX_TABLE_NAME_LEN + 1];
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_ad(old_table != new_table);
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* copy the old/current name: it is still needed after the rename */
+	if (strlen(old_table->name) + 1 <= sizeof(old_name)) {
+		memcpy(old_name, old_table->name, strlen(old_table->name) + 1);
+	} else {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "InnoDB: too long table name: '%s', "
+			"max length is %d\n", old_table->name,
+			MAX_TABLE_NAME_LEN);
+		ut_error;	/* the name does not fit in old_name[] */
+	}
+
+	trx->op_info = "renaming tables";
+
+	/* We use the private SQL parser of Innobase to generate the query
+	graphs needed in updating the dictionary data in system tables. */
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "new_name", new_table->name);
+	pars_info_add_str_literal(info, "old_name", old_name);
+	pars_info_add_str_literal(info, "tmp_name", tmp_name);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE RENAME_TABLES () IS\n"
+			   "BEGIN\n"
+			   "UPDATE SYS_TABLES SET NAME = :tmp_name\n"
+			   " WHERE NAME = :old_name;\n"
+			   "UPDATE SYS_TABLES SET NAME = :old_name\n"
+			   " WHERE NAME = :new_name;\n"
+			   "END;\n", FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+
+		goto err_exit;
+	}
+
+	/* The following calls will also rename the .ibd data files if
+	the tables are stored in a single-table tablespace */
+
+	if (!dict_table_rename_in_cache(old_table, tmp_name, FALSE)
+	    || !dict_table_rename_in_cache(new_table, old_name, FALSE)) {
+
+		err = DB_ERROR;
+		goto err_exit;
+	}
+
+	err = dict_load_foreigns(old_name, FALSE, TRUE);
+
+	if (err != DB_SUCCESS) {
+err_exit:	/* also the target of the two gotos above */
+		trx->error_state = DB_SUCCESS;	/* cleared before rollback */
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx->error_state = DB_SUCCESS;	/* and again after it */
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Build the query graph that creates an index, run it and free it again.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_merge_create_index_graph(
+/*=========================*/
+	trx_t*		trx,		/*!< in: trx */
+	dict_table_t*	table,		/*!< in: table */
+	dict_index_t*	index)		/*!< in: index */
+{
+	mem_heap_t*	heap;		/*!< Memory heap */
+	ind_node_t*	node;		/*!< Index creation node */
+	que_thr_t*	thr;		/*!< Query thread */
+	que_thr_t*	started_thr;	/*!< Thread that was started */
+	ulint		error;
+
+	ut_ad(trx);
+	ut_ad(table);
+	ut_ad(index);
+
+	index->table = table;
+	heap = mem_heap_create(512);
+	node = ind_create_graph_create(index, heap);
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	/* Starting the graph must hand back this very query thread. */
+	started_thr = que_fork_start_command(que_node_get_parent(thr));
+	ut_a(started_thr == thr);
+
+	que_run_threads(thr);
+	error = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+	return(error);
+}
+
+/*********************************************************************//**
+Create an index from its definition: build the in-memory prototype, add
+it to SYS_INDEXES by running an index creation graph, and mark the index
+with the id of the creating transaction.
+@return	index, or NULL on error */
+UNIV_INTERN
+dict_index_t*
+row_merge_create_index(
+/*===================*/
+	trx_t*			trx,	/*!< in/out: trx (sets error_state) */
+	dict_table_t*		table,	/*!< in: the index is on this table */
+	const merge_index_def_t*index_def)
+					/*!< in: the index definition */
+{
+	const ulint		n_fields = index_def->n_fields;
+	dict_index_t*		index;
+	ulint			field_no;
+
+	/* Create the index prototype from the passed in definition; this
+	is not a persistent operation.  We pass 0 as the space id, and the
+	space where to store the table is determined at a lower level. */
+	index = dict_mem_index_create(table->name, index_def->name,
+				      0, index_def->ind_type, n_fields);
+
+	ut_a(index);
+
+	for (field_no = 0; field_no < n_fields; field_no++) {
+		const merge_index_field_t*	ifield
+			= index_def->fields + field_no;
+
+		dict_mem_index_add_field(index, ifield->field_name,
+					 ifield->prefix_len);
+	}
+
+	/* Add the index to SYS_INDEXES, using the index prototype. */
+	if (row_merge_create_index_graph(trx, table, index) != DB_SUCCESS) {
+
+		return(NULL);
+	}
+
+	/* From here on, work with the index that is looked up in the
+	table by the definition. */
+	index = row_merge_dict_table_get_index(table, index_def);
+
+	ut_a(index);
+
+	/* Note the id of the transaction that created this index; it
+	is used to restrict readers from accessing the index, to
+	ensure read consistency. */
+	index->trx_id = (ib_uint64_t)
+		ut_conv_dulint_to_longlong(trx->id);
+
+	return(index);
+}
+
+/*********************************************************************//**
+Check if trx may use index: no read view, or one that sees index->trx_id. */
+UNIV_INTERN
+ibool
+row_merge_is_index_usable(
+/*======================*/
+	const trx_t*		trx,	/*!< in: transaction */
+	const dict_index_t*	index)	/*!< in: index to check */
+{
+	read_view_t*	view = trx->read_view;
+	return(!view || read_view_sees_trx_id(view, ut_dulint_create(
+				(ulint) (index->trx_id >> 32),
+				(ulint) index->trx_id & 0xFFFFFFFF)));
+}
+
+/*********************************************************************//**
+Drop the old table; the count of MySQL handles opened on it must be 0.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_merge_drop_table(
+/*=================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	table)		/*!< in: table to drop */
+{
+	/* Nobody may have the table open at this point. */
+	ut_a(!table->n_mysql_handles_opened);
+
+	return(row_drop_table_for_mysql(table->name, trx, FALSE));
+}
+
+/*********************************************************************//**
+Build indexes on a table by reading a clustered index, creating temporary
+files containing index entries, merge sorting these index entries and
+inserting sorted index entries to indexes.
+@return	DB_SUCCESS or error code (DB_OUT_OF_MEMORY: no temporary file) */
+UNIV_INTERN
+ulint
+row_merge_build_indexes(
+/*====================*/
+	trx_t*		trx,		/*!< in: transaction */
+	dict_table_t*	old_table,	/*!< in: table where rows are
+					read from */
+	dict_table_t*	new_table,	/*!< in: table where indexes are
+					created; identical to old_table
+					unless creating a PRIMARY KEY */
+	dict_index_t**	indexes,	/*!< in: indexes to be created */
+	ulint		n_indexes,	/*!< in: size of indexes[] */
+	TABLE*		table)		/*!< in/out: MySQL table, for
+					reporting erroneous key value
+					if applicable */
+{
+	merge_file_t*		merge_files;
+	row_merge_block_t*	block;
+	ulint			block_size;
+	ulint			i;
+	ulint			error;
+	int			tmpfd;
+
+	ut_ad(trx);
+	ut_ad(old_table);
+	ut_ad(new_table);
+	ut_ad(indexes);
+	ut_ad(n_indexes);
+
+	trx_start_if_not_started(trx);
+
+	/* Allocate the merge file array and the buffers and create the
+	temporary files; a failed creation is reported right here. */
+	merge_files = mem_alloc(n_indexes * sizeof *merge_files);
+	block_size = 3 * sizeof *block;
+	block = os_mem_alloc_large(&block_size);
+	error = DB_SUCCESS;
+
+	for (i = 0; i < n_indexes; i++) {
+		row_merge_file_create(&merge_files[i]);
+		if (merge_files[i].fd < 0) {
+			error = DB_OUT_OF_MEMORY;
+		}
+	}
+
+	tmpfd = innobase_mysql_tmpfile();
+	if (tmpfd < 0 || error != DB_SUCCESS) {
+		error = DB_OUT_OF_MEMORY;
+		goto func_exit;
+	}
+
+	/* Reset the MySQL row buffer that is used when reporting
+	duplicate keys. */
+	innobase_rec_reset(table);
+
+	/* Read clustered index of the table and create files for
+	secondary index entries for merge sort */
+	error = row_merge_read_clustered_index(
+		trx, table, old_table, new_table, indexes,
+		merge_files, n_indexes, block);
+	if (error != DB_SUCCESS) {
+		goto func_exit;
+	}
+
+	/* Now we have files containing index entries ready for
+	sorting and inserting. */
+	for (i = 0; i < n_indexes; i++) {
+		error = row_merge_sort(trx, indexes[i], &merge_files[i],
+				       block, &tmpfd, table);
+		if (error == DB_SUCCESS) {
+			error = row_merge_insert_index_tuples(
+				trx, indexes[i], new_table,
+				dict_table_zip_size(old_table),
+				merge_files[i].fd, block);
+		}
+
+		/* Close the temporary file to free up space. */
+		row_merge_file_destroy(&merge_files[i]);
+		if (error != DB_SUCCESS) {
+			trx->error_key_num = i;
+			goto func_exit;
+		}
+	}
+
+func_exit:
+	if (tmpfd >= 0) {
+		close(tmpfd);
+	}
+	for (i = 0; i < n_indexes; i++) {
+		row_merge_file_destroy(&merge_files[i]);
+	}
+	mem_free(merge_files);
+	os_mem_free_large(block, block_size);
+
+	return(error);
+}
diff --git a/storage/xtradb/row/row0mysql.c b/storage/xtradb/row/row0mysql.c
new file mode 100644
index 00000000000..56754404b65
--- /dev/null
+++ b/storage/xtradb/row/row0mysql.c
@@ -0,0 +1,4234 @@
+/*****************************************************************************
+
+Copyright (c) 2000, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0mysql.c
+Interface between Innobase row operations and MySQL.
+Contains also create table and other data dictionary operations.
+
+Created 9/17/2000 Heikki Tuuri
+*******************************************************/
+
+#include "row0mysql.h"
+
+#ifdef UNIV_NONINL
+#include "row0mysql.ic"
+#endif
+
+#include "ha_prototypes.h"
+#include "row0ins.h"
+#include "row0merge.h"
+#include "row0sel.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "que0que.h"
+#include "pars0pars.h"
+#include "dict0dict.h"
+#include "dict0crea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "trx0roll.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "trx0undo.h"
+#include "lock0lock.h"
+#include "rem0cmp.h"
+#include "log0log.h"
+#include "btr0sea.h"
+#include "fil0fil.h"
+#include "ibuf0ibuf.h"
+
+/** 4.x compatibility: if TRUE, a lock wait timeout rolls back the whole trx */
+UNIV_INTERN ibool	row_rollback_on_timeout	= FALSE;
+
+/** Node type of the list of tables to drop in the background. */
+typedef struct row_mysql_drop_struct	row_mysql_drop_t;
+
+/** Chain node of the list of tables to drop in the background. */
+struct row_mysql_drop_struct{
+	char*				table_name;	/*!< table name */
+	UT_LIST_NODE_T(row_mysql_drop_t)row_mysql_drop_list;
+							/*!< list chain node */
+};
+
+/** @brief List of tables we should drop in background.
+
+ALTER TABLE in MySQL requires that the table handler can drop the
+table in background when there are no queries to it any
+more.  Protected by kernel_mutex. */
+static UT_LIST_BASE_NODE_T(row_mysql_drop_t)	row_mysql_drop_list;
+/** Flag: has row_mysql_drop_list been initialized? */
+static ibool	row_mysql_drop_list_inited	= FALSE;
+
+/** Magic table names for invoking various monitor threads */
+/* @{ */
+static const char S_innodb_monitor[] = "innodb_monitor";
+static const char S_innodb_lock_monitor[] = "innodb_lock_monitor";
+static const char S_innodb_tablespace_monitor[] = "innodb_tablespace_monitor";
+static const char S_innodb_table_monitor[] = "innodb_table_monitor";
+static const char S_innodb_mem_validate[] = "innodb_mem_validate";
+/* @} */
+
+/** Evaluates to true if str1 equals str2_onstack, used for comparing
+the magic table names. The comparison covers the terminating NUL as well.
+@param str1		in: string to compare
+@param str1_len	in: length of str1, in bytes, including terminating NUL
+@param str2_onstack	in: NUL-terminated char[] array; sizeof is applied to it
+@return			TRUE if str1 equals str2_onstack */
+#define STR_EQ(str1, str1_len, str2_onstack) \
+	((str1_len) == sizeof(str2_onstack) \
+	 && memcmp(str1, str2_onstack, sizeof(str2_onstack)) == 0)
+
+/*******************************************************************//**
+Determine if the given name is a name reserved for MySQL system tables.
+Only the names "mysql/host", "mysql/user" and "mysql/db" qualify; the
+comparison is exact and case sensitive.
+@return	TRUE if name is a MySQL system table name */
+static
+ibool
+row_mysql_is_system_table(
+/*======================*/
+	const char*	name)	/*!< in: table name to test */
+{
+	/* The prefix test short-circuits, so name + 6 is only
+	examined when the name really starts with "mysql/". */
+	return(strncmp(name, "mysql/", 6) == 0
+	       && (strcmp(name + 6, "host") == 0
+		   || strcmp(name + 6, "user") == 0
+		   || strcmp(name + 6, "db") == 0));
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily.
+@return	TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+	const char*	name);	/*!< in: table name */
+
+/*******************************************************************//**
+Sleeps for srv_dml_needed_delay before a DML operation if it is nonzero. */
+static
+void
+row_mysql_delay_if_needed(void)
+/*===========================*/
+{
+	if (srv_dml_needed_delay != 0) {
+		os_thread_sleep(srv_dml_needed_delay);
+	}
+}
+
+/*******************************************************************//**
+Frees prebuilt->blob_heap unconditionally and resets the pointer to NULL. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_blob_heap(
+/*==============================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct of a
+					ha_innobase:: table handle */
+{
+	mem_heap_free(prebuilt->blob_heap);
+	prebuilt->blob_heap = NULL;	/* no blob heap is allocated now */
+}
+
+/*******************************************************************//**
+Writes the length prefix of a true VARCHAR (>= 5.0.3 format) to dest in
+the MySQL row format: 1 byte, or 2 bytes in little-endian order.
+@return pointer to the first byte after the stored length, that is,
+where the column data starts */
+UNIV_INTERN
+byte*
+row_mysql_store_true_var_len(
+/*=========================*/
+	byte*	dest,	/*!< out: where to store the length */
+	ulint	len,	/*!< in: length, must fit in lenlen bytes */
+	ulint	lenlen)	/*!< in: storage length of len: either 1 or 2 bytes */
+{
+	if (lenlen != 2) {
+		ut_a(lenlen == 1);
+		ut_a(len < 256);
+
+		mach_write_to_1(dest, len);
+
+		return(dest + 1);
+	}
+
+	ut_a(len < 256 * 256);
+
+	mach_write_to_2_little_endian(dest, len);
+
+	return(dest + 2);
+}
+
+/*******************************************************************//**
+Decodes the length prefix of a true VARCHAR (>= 5.0.3 format) stored in
+the MySQL row format: 1 byte, or 2 bytes in little-endian order.
+@return pointer to the column data, which starts right after the 1 or
+2 length bytes */
+UNIV_INTERN
+const byte*
+row_mysql_read_true_varchar(
+/*========================*/
+	ulint*		len,	/*!< out: variable-length field length */
+	const byte*	field,	/*!< in: field in the MySQL format */
+	ulint		lenlen)	/*!< in: storage length of len: either 1
+				or 2 bytes */
+{
+	if (lenlen != 2) {
+		ut_a(lenlen == 1);
+
+		*len = mach_read_from_1(field);
+
+		return(field + 1);
+	}
+
+	*len = mach_read_from_2_little_endian(field);
+
+	return(field + 2);
+}
+
+/*******************************************************************//**
+Builds a MySQL format BLOB reference (length followed by pointer) in dest. */
+UNIV_INTERN
+void
+row_mysql_store_blob_ref(
+/*=====================*/
+	byte*		dest,	/*!< out: where to store */
+	ulint		col_len,/*!< in: dest buffer size: determines into
+				how many bytes the BLOB length is stored,
+				the space for the length may vary from 1
+				to 4 bytes */
+	const void*	data,	/*!< in: BLOB data; if the value to store
+				is SQL NULL this should be NULL pointer */
+	ulint		len)	/*!< in: BLOB length; if the value to store
+				is SQL NULL this should be 0; remember
+				also to set the NULL bit in the MySQL record
+				header! */
+{
+	byte*	ptr_slot;
+
+	/* Zero the whole slot: MySQL might assume the field is set to zero
+	except the length and the pointer fields */
+	memset(dest, '\0', col_len);
+
+	/* The first col_len - 8 bytes (1 to 4 of them) hold the BLOB length;
+	the final 8 bytes are reserved for the data pointer, of which only
+	sizeof data bytes are written (4 on 32-bit architectures). */
+	ptr_slot = dest + col_len - 8;
+
+	ut_a(col_len - 8 > 1 || len < 256);
+	ut_a(col_len - 8 > 2 || len < 256 * 256);
+	ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+
+	mach_write_to_n_little_endian(dest, col_len - 8, len);
+	memcpy(ptr_slot, &data, sizeof data);
+}
+
+/*******************************************************************//**
+Decodes a MySQL format BLOB reference: a little-endian length followed by
+@return	pointer to BLOB data (the pointer stored in the last 8 bytes) */
+UNIV_INTERN
+const byte*
+row_mysql_read_blob_ref(
+/*====================*/
+	ulint*		len,		/*!< out: BLOB length */
+	const byte*	ref,		/*!< in: BLOB reference in the
+					MySQL format */
+	ulint		col_len)	/*!< in: BLOB reference length
+					(not BLOB length) */
+{
+	byte*		blob;
+	const ulint	len_size = col_len - 8;
+
+	*len = mach_read_from_n_little_endian(ref, len_size);
+	memcpy(&blob, ref + len_size, sizeof blob);
+
+	return(blob);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field given in the MySQL format in the InnoDB format.
+The counterpart of this function is row_sel_field_store_in_mysql_format() in
+row0sel.c.
+@return	buf + col_len if an integer was converted into buf, else buf */
+UNIV_INTERN
+byte*
+row_mysql_store_col_in_innobase_format(
+/*===================================*/
+	dfield_t*	dfield,		/*!< in/out: dfield where dtype
+					information must be already set when
+					this function is called! */
+	byte*		buf,		/*!< in/out: buffer for a converted
+					integer value; this must be at least
+					col_len long then! */
+	ibool		row_format_col,	/*!< TRUE if the mysql_data is from
+					a MySQL row, FALSE if from a MySQL
+					key value;
+					in MySQL, a true VARCHAR storage
+					format differs in a row and in a
+					key value: in a key value the length
+					is always stored in 2 bytes! */
+	const byte*	mysql_data,	/*!< in: MySQL column value, not
+					SQL NULL; NOTE that dfield may also
+					get a pointer to mysql_data,
+					therefore do not discard this as long
+					as dfield is used! */
+	ulint		col_len,	/*!< in: MySQL column length; NOTE that
+					this is the storage length of the
+					column in the MySQL format row, not
+					necessarily the length of the actual
+					payload data; if the column is a true
+					VARCHAR then this is irrelevant */
+	ulint		comp)		/*!< in: nonzero=compact format */
+{
+	const byte*	ptr	= mysql_data;
+	const dtype_t*	dtype;
+	ulint		type;
+	ulint		lenlen;
+
+	dtype = dfield_get_type(dfield);
+
+	type = dtype->mtype;
+
+	if (type == DATA_INT) {
+		/* Store integer data in Innobase in a big-endian format,
+		sign bit negated if the data is a signed integer. In MySQL,
+		integers are stored in a little-endian format. */
+
+		byte*	p = buf + col_len;
+		/* Copy the col_len bytes in reverse order */
+		for (;;) {
+			p--;
+			*p = *mysql_data;
+			if (p == buf) {
+				break;
+			}
+			mysql_data++;
+		}
+
+		if (!(dtype->prtype & DATA_UNSIGNED)) {
+			/* Signed integer: negate the sign bit */
+			*buf ^= 128;
+		}
+
+		ptr = buf;
+		buf += col_len;
+	} else if ((type == DATA_VARCHAR
+		    || type == DATA_VARMYSQL
+		    || type == DATA_BINARY)) {
+
+		if (dtype_get_mysql_type(dtype) == DATA_MYSQL_TRUE_VARCHAR) {
+			/* The length of the actual data is stored to 1 or 2
+			bytes at the start of the field */
+
+			if (row_format_col) {
+				if (dtype->prtype & DATA_LONG_TRUE_VARCHAR) {
+					lenlen = 2;
+				} else {
+					lenlen = 1;
+				}
+			} else {
+				/* In a MySQL key value, lenlen is always 2 */
+				lenlen = 2;
+			}
+
+			ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
+							  lenlen);
+		} else {
+			/* Remove trailing spaces from old style VARCHAR
+			columns. */
+
+			/* Handle UCS2 strings differently. */
+			ulint	mbminlen	= dtype_get_mbminlen(dtype);
+
+			ptr = mysql_data;
+
+			if (mbminlen == 2) {
+				/* space=0x0020 */
+				/* Trim "half-chars", just in case. */
+				col_len &= ~1;
+
+				while (col_len >= 2 && ptr[col_len - 2] == 0x00
+				       && ptr[col_len - 1] == 0x20) {
+					col_len -= 2;
+				}
+			} else {
+				ut_a(mbminlen == 1);
+				/* space=0x20 */
+				while (col_len > 0
+				       && ptr[col_len - 1] == 0x20) {
+					col_len--;
+				}
+			}
+		}
+	} else if (comp && type == DATA_MYSQL
+		   && dtype_get_mbminlen(dtype) == 1
+		   && dtype_get_mbmaxlen(dtype) > 1) {
+		/* In some cases we strip trailing spaces from UTF-8 and other
+		multibyte charsets, from FIXED-length CHAR columns, to save
+		space. UTF-8 would otherwise normally use 3 * the string length
+		bytes to store an ASCII string! */
+
+		/* We assume that this CHAR field is encoded in a
+		variable-length character set where spaces have
+		1:1 correspondence to 0x20 bytes, such as UTF-8.
+
+		Consider a CHAR(n) field, a field of n characters.
+		It will contain between n * mbminlen and n * mbmaxlen bytes.
+		We will try to truncate it to n bytes by stripping
+		space padding.	If the field contains single-byte
+		characters only, it will be truncated to n characters.
+		Consider a CHAR(5) field containing the string ".a   "
+		where "." denotes a 3-byte character represented by
+		the bytes "$%&".  After our stripping, the string will
+		be stored as "$%&a " (5 bytes).	 The string ".abc "
+		will be stored as "$%&abc" (6 bytes).
+
+		The space padding will be restored in row0sel.c, function
+		row_sel_field_store_in_mysql_format(). */
+
+		ulint		n_chars;
+
+		ut_a(!(dtype_get_len(dtype) % dtype_get_mbmaxlen(dtype)));
+
+		n_chars = dtype_get_len(dtype) / dtype_get_mbmaxlen(dtype);
+
+		/* Strip space padding. */
+		while (col_len > n_chars && ptr[col_len - 1] == 0x20) {
+			col_len--;
+		}
+	} else if (type == DATA_BLOB && row_format_col) {
+		/* Read the BLOB length and data pointer from the reference */
+		ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+	}
+
+	dfield_set_data(dfield, ptr, col_len);
+
+	return(buf);
+}
+
+/**************************************************************//**
+Fills an Innobase row from a row in the MySQL format, one template column
+at a time. The function to convert a MySQL format key value to an InnoDB
+dtuple is row_sel_convert_mysql_key_to_innobase() in row0sel.c. */
+static
+void
+row_mysql_convert_row_to_innobase(
+/*==============================*/
+	dtuple_t*	row,		/*!< in/out: Innobase row where the
+					field type information is already
+					copied there! */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct where template
+					must be of type ROW_MYSQL_WHOLE_ROW */
+	byte*		mysql_rec)	/*!< in: row in the MySQL format;
+					NOTE: do not discard as long as
+					row is used, as row may contain
+					pointers to this record! */
+{
+	ulint	col_no;
+
+	ut_ad(prebuilt->template_type == ROW_MYSQL_WHOLE_ROW);
+	ut_ad(prebuilt->mysql_template);
+
+	for (col_no = 0; col_no < prebuilt->n_template; col_no++) {
+		mysql_row_templ_t*	col_templ;
+		dfield_t*		field;
+
+		col_templ = prebuilt->mysql_template + col_no;
+		field = dtuple_get_nth_field(row, col_no);
+
+		/* A zero null bit mask means the column cannot be
+		SQL NULL; otherwise test the NULL bit in the record. */
+		if (col_templ->mysql_null_bit_mask != 0
+		    && (mysql_rec[col_templ->mysql_null_byte_offset]
+			& (byte) (col_templ->mysql_null_bit_mask))) {
+
+			dfield_set_null(field);
+
+			continue;
+		}
+
+		/* Not SQL NULL: convert the column value. The slot of
+		ins_upd_rec_buff at the column offset serves as the
+		buffer for a converted integer. */
+
+		row_mysql_store_col_in_innobase_format(
+			field,
+			prebuilt->ins_upd_rec_buff
+			+ col_templ->mysql_col_offset,
+			TRUE, /* MySQL row format data */
+			mysql_rec + col_templ->mysql_col_offset,
+			col_templ->mysql_col_len,
+			dict_table_is_comp(prebuilt->table));
+	}
+}
+
+/****************************************************************//**
+Handles user errors and lock waits detected by the database engine.
+@return TRUE if it was a lock wait and we should continue running the
+query thread and in that case the thr is ALREADY in the running state. */
+UNIV_INTERN
+ibool
+row_mysql_handle_errors(
+/*====================*/
+	ulint*		new_err,/*!< out: possible new error encountered in
+				lock wait, or if no new error, the value
+				of trx->error_state at the entry of this
+				function */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	thr,	/*!< in: query thread */
+	trx_savept_t*	savept)	/*!< in: savepoint or NULL */
+{
+	ulint	err;
+
+handle_new_error:
+	err = trx->error_state;
+
+	ut_a(err != DB_SUCCESS);
+	/* Reset the error in trx; the caller gets it through *new_err */
+	trx->error_state = DB_SUCCESS;
+
+	switch (err) {
+	case DB_LOCK_WAIT_TIMEOUT:
+		if (row_rollback_on_timeout) {
+			trx_general_rollback_for_mysql(trx, NULL);
+			break;
+		}
+		/* fall through */
+	case DB_DUPLICATE_KEY:
+	case DB_FOREIGN_DUPLICATE_KEY:
+	case DB_TOO_BIG_RECORD:
+	case DB_ROW_IS_REFERENCED:
+	case DB_NO_REFERENCED_ROW:
+	case DB_CANNOT_ADD_CONSTRAINT:
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+	case DB_OUT_OF_FILE_SPACE:
+	case DB_INTERRUPTED:
+		if (savept) {
+			/* Roll back the latest, possibly incomplete
+			insertion or update */
+
+			trx_general_rollback_for_mysql(trx, savept);
+		}
+		/* MySQL will roll back the latest SQL statement */
+		break;
+	case DB_LOCK_WAIT:
+		srv_suspend_mysql_thread(thr);
+		/* The lock wait itself may have ended in a new error */
+		if (trx->error_state != DB_SUCCESS) {
+			que_thr_stop_for_mysql(thr);
+
+			goto handle_new_error;
+		}
+
+		*new_err = err;
+
+		return(TRUE);
+
+	case DB_DEADLOCK:
+	case DB_LOCK_TABLE_FULL:
+		/* Roll back the whole transaction; this resolution was added
+		to version 3.23.43 */
+
+		trx_general_rollback_for_mysql(trx, NULL);
+		break;
+
+	case DB_MUST_GET_MORE_FILE_SPACE:
+		fputs("InnoDB: The database cannot continue"
+		      " operation because of\n"
+		      "InnoDB: lack of space. You must add"
+		      " a new data file to\n"
+		      "InnoDB: my.cnf and restart the database.\n", stderr);
+		/* Not recoverable here: terminate the whole process */
+		exit(1);
+
+	case DB_CORRUPTION:
+		fputs("InnoDB: We detected index corruption"
+		      " in an InnoDB type table.\n"
+		      "InnoDB: You have to dump + drop + reimport"
+		      " the table or, in\n"
+		      "InnoDB: a case of widespread corruption,"
+		      " dump all InnoDB\n"
+		      "InnoDB: tables and recreate the"
+		      " whole InnoDB tablespace.\n"
+		      "InnoDB: If the mysqld server crashes"
+		      " after the startup or when\n"
+		      "InnoDB: you dump the tables, look at\n"
+		      "InnoDB: " REFMAN "forcing-recovery.html"
+		      " for help.\n", stderr);
+		break;
+	case DB_FOREIGN_EXCEED_MAX_CASCADE:
+		fprintf(stderr, "InnoDB: Cannot delete/update rows with"
+			" cascading foreign key constraints that exceed max"
+			" depth of %lu\n"
+			"Please drop excessive foreign constraints"
+			" and try again\n", (ulong) DICT_FK_MAX_RECURSIVE_LOAD);
+		break;
+	default:
+		fprintf(stderr, "InnoDB: unknown error code %lu\n",
+			(ulong) err);
+		ut_error;
+	}
+	/* An error raised during the handling above is reported instead of err */
+	if (trx->error_state != DB_SUCCESS) {
+		*new_err = trx->error_state;
+	} else {
+		*new_err = err;
+	}
+
+	trx->error_state = DB_SUCCESS;
+
+	return(FALSE);
+}
+
+/********************************************************************//**
+Create a prebuilt struct for a MySQL table handle.
+@return	own: a prebuilt struct; release it with row_prebuilt_free() */
+UNIV_INTERN
+row_prebuilt_t*
+row_create_prebuilt(
+/*================*/
+	dict_table_t*	table)	/*!< in: Innobase table handle */
+{
+	row_prebuilt_t*	prebuilt;
+	mem_heap_t*	heap;
+	dict_index_t*	clust_index;
+	dtuple_t*	ref;
+	ulint		ref_len;
+
+	heap = mem_heap_create(sizeof *prebuilt + 128);
+	/* The struct itself is allocated from the heap that it owns */
+	prebuilt = mem_heap_zalloc(heap, sizeof *prebuilt);
+
+	prebuilt->magic_n = ROW_PREBUILT_ALLOCATED;
+	prebuilt->magic_n2 = ROW_PREBUILT_ALLOCATED;
+
+	prebuilt->table = table;
+
+	prebuilt->sql_stat_start = TRUE;
+	prebuilt->heap = heap;
+
+	prebuilt->pcur = btr_pcur_create_for_mysql();
+	prebuilt->clust_pcur = btr_pcur_create_for_mysql();
+	/* stored_select_lock_type: dummy value, marked UNIV_MEM_INVALID */
+	prebuilt->select_lock_type = LOCK_NONE;
+	prebuilt->stored_select_lock_type = 99999999;
+	UNIV_MEM_INVALID(&prebuilt->stored_select_lock_type,
+			 sizeof prebuilt->stored_select_lock_type);
+
+	prebuilt->search_tuple = dtuple_create(
+		heap, 2 * dict_table_get_n_cols(table));
+
+	clust_index = dict_table_get_first_index(table);
+
+	/* Make sure that search_tuple is long enough for clustered index */
+	ut_a(2 * dict_table_get_n_cols(table) >= clust_index->n_fields);
+
+	ref_len = dict_index_get_n_unique(clust_index);
+	/* clust_ref has one field per unique field of the clustered index */
+	ref = dtuple_create(heap, ref_len);
+
+	dict_index_copy_types(ref, clust_index, ref_len);
+
+	prebuilt->clust_ref = ref;
+
+	prebuilt->autoinc_error = 0;
+	prebuilt->autoinc_offset = 0;
+
+	/* Default to 1, we will set the actual value later in
+	ha_innobase::get_auto_increment(). */
+	prebuilt->autoinc_increment = 1;
+
+	prebuilt->autoinc_last_value = 0;
+
+	return(prebuilt);
+}
+
+/********************************************************************//**
+Free a prebuilt struct for a MySQL table handle and everything it owns. */
+UNIV_INTERN
+void
+row_prebuilt_free(
+/*==============*/
+	row_prebuilt_t*	prebuilt,	/*!< in, own: prebuilt struct */
+	ibool		dict_locked)	/*!< in: TRUE=data dictionary locked */
+{
+	ulint	i;
+
+	if (UNIV_UNLIKELY
+	    (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED
+	     || prebuilt->magic_n2 != ROW_PREBUILT_ALLOCATED)) {
+		/* Either magic number is wrong: report and assert */
+		fprintf(stderr,
+			"InnoDB: Error: trying to free a corrupt\n"
+			"InnoDB: table handle. Magic n %lu,"
+			" magic n2 %lu, table name ",
+			(ulong) prebuilt->magic_n,
+			(ulong) prebuilt->magic_n2);
+		ut_print_name(stderr, NULL, TRUE, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption(prebuilt);
+
+		ut_error;
+	}
+	/* Mark the struct as freed; other functions check for ALLOCATED */
+	prebuilt->magic_n = ROW_PREBUILT_FREED;
+	prebuilt->magic_n2 = ROW_PREBUILT_FREED;
+
+	btr_pcur_free_for_mysql(prebuilt->pcur);
+	btr_pcur_free_for_mysql(prebuilt->clust_pcur);
+
+	if (prebuilt->mysql_template) {
+		mem_free(prebuilt->mysql_template);
+	}
+
+	if (prebuilt->ins_graph) {
+		que_graph_free_recursive(prebuilt->ins_graph);
+	}
+
+	if (prebuilt->sel_graph) {
+		que_graph_free_recursive(prebuilt->sel_graph);
+	}
+
+	if (prebuilt->upd_graph) {
+		que_graph_free_recursive(prebuilt->upd_graph);
+	}
+
+	if (prebuilt->blob_heap) {
+		mem_heap_free(prebuilt->blob_heap);
+	}
+
+	if (prebuilt->old_vers_heap) {
+		mem_heap_free(prebuilt->old_vers_heap);
+	}
+
+	for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+		if (prebuilt->fetch_cache[i] != NULL) {
+			/* A 4-byte magic number frames the buffer on both sides */
+			if ((ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+				     (prebuilt->fetch_cache[i]) - 4))
+			    || (ROW_PREBUILT_FETCH_MAGIC_N != mach_read_from_4(
+					(prebuilt->fetch_cache[i])
+					+ prebuilt->mysql_row_len))) {
+				fputs("InnoDB: Error: trying to free"
+				      " a corrupt fetch buffer.\n", stderr);
+
+				mem_analyze_corruption(
+					prebuilt->fetch_cache[i]);
+
+				ut_error;
+			}
+			/* The allocation starts at the leading magic number */
+			mem_free((prebuilt->fetch_cache[i]) - 4);
+		}
+	}
+
+	dict_table_decrement_handle_count(prebuilt->table, dict_locked);
+	/* prebuilt itself was allocated from this heap: free it last */
+	mem_heap_free(prebuilt->heap);
+}
+
+/*********************************************************************//**
+Attaches a transaction to a prebuilt struct and to each of the query
+graphs (insert, update, select) that have already been built in it. */
+UNIV_INTERN
+void
+row_update_prebuilt_trx(
+/*====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct
+					in MySQL handle */
+	trx_t*		trx)		/*!< in: transaction handle */
+{
+	/* Refuse to continue with a transaction object whose magic
+	number does not match: report and assert */
+	if (trx->magic_n != TRX_MAGIC_N) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+			"InnoDB: trx handle. Magic n %lu\n",
+			(ulong) trx->magic_n);
+		mem_analyze_corruption(trx);
+		ut_error;
+	}
+
+	/* Likewise for the table handle */
+	if (prebuilt->magic_n != ROW_PREBUILT_ALLOCATED) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+			"InnoDB: table handle. Magic n %lu, table name ",
+			(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+		putc('\n', stderr);
+		mem_analyze_corruption(prebuilt);
+		ut_error;
+	}
+
+	prebuilt->trx = trx;
+
+	/* Point every query graph that has been built at the new trx */
+	if (prebuilt->sel_graph != NULL) {
+		prebuilt->sel_graph->trx = trx;
+	}
+
+	if (prebuilt->upd_graph != NULL) {
+		prebuilt->upd_graph->trx = trx;
+	}
+
+	if (prebuilt->ins_graph != NULL) {
+		prebuilt->ins_graph->trx = trx;
+	}
+}
+
+/*********************************************************************//**
+Returns the dtuple used for insertions through this handle. On the first
+call for a handle the insert node, the row and the insert query graph are
+created in prebuilt->heap; later calls return the same row.
+@return	prebuilt dtuple; the column type information is also set in it */
+static
+dtuple_t*
+row_get_prebuilt_insert_row(
+/*========================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct in
+					MySQL handle */
+{
+	ins_node_t*	ins_node;
+	dtuple_t*	ins_row;
+	dict_table_t*	table	= prebuilt->table;
+
+	ut_ad(prebuilt && table && prebuilt->trx);
+
+	if (prebuilt->ins_node != NULL) {
+		/* The insert node and graph were built by an earlier call */
+
+		return(prebuilt->ins_node->row);
+	}
+
+	/* First call for this handle: create the insert node, the row
+	buffer, the row tuple and the query graph in prebuilt->heap */
+
+	ins_node = ins_node_create(INS_DIRECT, table, prebuilt->heap);
+	prebuilt->ins_node = ins_node;
+
+	if (prebuilt->ins_upd_rec_buff == NULL) {
+		prebuilt->ins_upd_rec_buff = mem_heap_alloc(
+			prebuilt->heap, prebuilt->mysql_row_len);
+	}
+
+	ins_row = dtuple_create(prebuilt->heap, dict_table_get_n_cols(table));
+
+	dict_table_copy_types(ins_row, table);
+
+	ins_node_set_new_row(ins_node, ins_row);
+
+	prebuilt->ins_graph = que_node_get_parent(
+		pars_complete_graph_for_exec(
+			ins_node, prebuilt->trx, prebuilt->heap));
+	prebuilt->ins_graph->state = QUE_FORK_ACTIVE;
+
+	return(prebuilt->ins_node->row);
+}
+
+/*********************************************************************//**
+Increments the modification counter of a table and, if automatic statistics
+updates are enabled and enough has changed, recalculates the statistics. */
+UNIV_INLINE
+void
+row_update_statistics_if_needed(
+/*============================*/
+	dict_table_t*	table)	/*!< in/out: table */
+{
+	const ulint	n_modified = table->stat_modified_counter;
+
+	table->stat_modified_counter = n_modified + 1;
+
+	if (!srv_stats_auto_update) {
+		return;
+	}
+
+	/* Statistics are recalculated only if about 1 / 16 of the table
+	has been modified since the last time a statistics batch was run,
+	or if the counter exceeds 2 000 000 000 (to avoid wrap-around).
+	The additive 16 keeps a very small, frequently updated table
+	from triggering a recalculation on every modification. */
+
+	if (n_modified <= 2000000000
+	    && (ib_int64_t) n_modified <= 16 + table->stat_n_rows / 16) {
+		return;
+	}
+
+	dict_update_statistics(table, TRUE);
+}
+
+/*********************************************************************//**
+Releases the AUTO_INC locks held by a trx, if it holds any. This function
+should be called at the end of an SQL statement, by the connection
+thread that owns the transaction (trx->mysql_thd). */
+UNIV_INTERN
+void
+row_unlock_table_autoinc_for_mysql(
+/*===============================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	if (!lock_trx_holds_autoinc_locks(trx)) {
+		return;
+	}
+
+	mutex_enter(&kernel_mutex);
+	lock_release_autoinc_locks(trx);
+	mutex_exit(&kernel_mutex);
+}
+
+/*********************************************************************//**
+Sets an AUTO_INC type lock on the table mentioned in prebuilt. The
+AUTO_INC lock gives exclusive access to the auto-inc counter of the
+table. The lock is reserved only for the duration of an SQL statement.
+It is not compatible with another AUTO_INC or exclusive lock on the
+table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_autoinc_for_mysql(
+/*=============================*/
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in the MySQL
+					table handle */
+{
+	trx_t*			trx	= prebuilt->trx;
+	ins_node_t*		node	= prebuilt->ins_node;
+	const dict_table_t*	table	= prebuilt->table;
+	que_thr_t*		thr;
+	ulint			err;
+	ibool			was_lock_wait;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	/* If we already hold an AUTOINC lock on the table then do nothing.
+	Note: We peek at the value of the current owner without acquiring
+	the kernel mutex. */
+	if (trx == table->autoinc_trx) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx->op_info = "setting auto-inc lock";
+
+	if (node == NULL) {
+		row_get_prebuilt_insert_row(prebuilt);
+		node = prebuilt->ins_node;
+	}
+
+	/* We use the insert query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	/* It may be that the current session has not yet started
+	its transaction, or it has been committed: */
+
+	trx_start_if_not_started(trx);
+
+	err = lock_table(0, prebuilt->table, LOCK_AUTO_INC, thr);
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+		/* TRUE: the lock wait is over and thr runs again, so retry */
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Sets a table lock on the table mentioned in prebuilt, or on another table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_lock_table_for_mysql(
+/*=====================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct in the MySQL
+					table handle */
+	dict_table_t*	table,		/*!< in: table to lock, or NULL
+					if prebuilt->table should be
+					locked as
+					prebuilt->select_lock_type */
+	ulint		mode)		/*!< in: lock mode of table
+					(ignored if table==NULL) */
+{
+	trx_t*		trx		= prebuilt->trx;
+	que_thr_t*	thr;
+	ulint		err;
+	ibool		was_lock_wait;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "setting table lock";
+
+	if (prebuilt->sel_graph == NULL) {
+		/* Build a dummy select query graph */
+		row_prebuild_sel_graph(prebuilt);
+	}
+
+	/* We use the select query graph as the dummy graph needed
+	in the lock module call */
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = thr;
+	thr->prev_node = thr->common.parent;
+
+	/* It may be that the current session has not yet started
+	its transaction, or it has been committed: */
+
+	trx_start_if_not_started(trx);
+
+	if (table) {
+		err = lock_table(0, table, mode, thr);
+	} else {
+		err = lock_table(0, prebuilt->table,
+				 prebuilt->select_lock_type, thr);
+	}
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr, NULL);
+		/* TRUE: the lock wait is over and thr runs again, so retry */
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Does an insert for MySQL.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_insert_for_mysql(
+/*=================*/
+	byte*		mysql_rec,	/*!< in: row in the MySQL format */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
+					handle */
+{
+	trx_savept_t	savept;
+	que_thr_t*	thr;
+	ulint		err;
+	ibool		was_lock_wait;
+	trx_t*		trx		= prebuilt->trx;
+	ins_node_t*	node		= prebuilt->ins_node;
+
+	ut_ad(trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (prebuilt->table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+			"InnoDB: MySQL is trying to use a table handle"
+			" but the .ibd file for\n"
+			"InnoDB: table %s does not exist.\n"
+			"InnoDB: Have you deleted the .ibd file"
+			" from the database directory under\n"
+			"InnoDB: the MySQL datadir, or have you"
+			" used DISCARD TABLESPACE?\n"
+			"InnoDB: Look from\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			"InnoDB: how you can resolve the problem.\n",
+			prebuilt->table->name);
+		return(DB_ERROR);
+	}
+
+	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+			"InnoDB: table handle. Magic n %lu, table name ",
+			(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption(prebuilt);
+
+		ut_error;
+	}
+
+	if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+		fputs("InnoDB: A new raw disk partition was initialized or\n"
+		      "InnoDB: innodb_force_recovery is on: we do not allow\n"
+		      "InnoDB: database modifications by the user. Shut down\n"
+		      "InnoDB: mysqld and edit my.cnf so that"
+		      " newraw is replaced\n"
+		      "InnoDB: with raw, and innodb_force_... is removed.\n",
+		      stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "inserting";
+
+	row_mysql_delay_if_needed();
+
+	trx_start_if_not_started(trx);
+
+	if (node == NULL) {
+		row_get_prebuilt_insert_row(prebuilt);
+		node = prebuilt->ins_node;
+	}
+
+	row_mysql_convert_row_to_innobase(node->row, prebuilt, mysql_rec);
+	/* The savepoint lets a failed insert be rolled back on its own */
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(prebuilt->ins_graph);
+
+	if (!prebuilt->mysql_has_locked) {
+		fprintf(stderr, "InnoDB: Error: row_insert_for_mysql is called without ha_innobase::external_lock()\n");
+		if (trx->mysql_thd != NULL) {
+			innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
+		}
+	}
+
+	if (prebuilt->sql_stat_start) {
+		node->state = INS_NODE_SET_IX_LOCK;
+		prebuilt->sql_stat_start = FALSE;
+	} else {
+		node->state = INS_NODE_ALLOC_ROW_ID;
+	}
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	row_ins_step(thr);
+
+	err = trx->error_state;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		/* TODO: what is this? */ thr->lock_state= QUE_THR_LOCK_ROW;
+
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+							&savept);
+		thr->lock_state= QUE_THR_LOCK_NOLOCK;
+		/* TRUE: the lock wait is over and thr runs again, so retry */
+		if (was_lock_wait) {
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	prebuilt->table->stat_n_rows++;
+
+	srv_n_rows_inserted++;
+
+	if (prebuilt->table->stat_n_rows == 0) {
+		/* Avoid wrap-over */
+		prebuilt->table->stat_n_rows--;
+	}
+
+	row_update_statistics_if_needed(prebuilt->table);
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Builds the dummy query graph used in selects, unless the prebuilt struct
+already has one. */
+UNIV_INTERN
+void
+row_prebuild_sel_graph(
+/*===================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct in MySQL
+					handle; sel_graph is set if it
+					was NULL */
+{
+	que_thr_t*	sel_thr;
+
+	ut_ad(prebuilt && prebuilt->trx);
+
+	if (prebuilt->sel_graph != NULL) {
+		/* The graph was built by an earlier call */
+		return;
+	}
+
+	/* Wrap a fresh select node into a complete graph; both are
+	allocated from the heap of the prebuilt struct */
+	sel_thr = pars_complete_graph_for_exec(
+		sel_node_create(prebuilt->heap),
+		prebuilt->trx, prebuilt->heap);
+
+	prebuilt->sel_graph = que_node_get_parent(sel_thr);
+	prebuilt->sel_graph->state = QUE_FORK_ACTIVE;
+}
+
+/*********************************************************************//**
+Creates a query graph node of 'update' type to be used in the MySQL
+interface. The node gets a persistent cursor of its own and an update
+vector with room for every column of the table.
+@return	own: update node */
+UNIV_INTERN
+upd_node_t*
+row_create_update_node_for_mysql(
+/*=============================*/
+	dict_table_t*	table,	/*!< in: table to update */
+	mem_heap_t*	heap)	/*!< in: mem heap from which allocated */
+{
+	const ulint	n_cols		= dict_table_get_n_cols(table);
+	upd_node_t*	upd_node	= upd_node_create(heap);
+
+	/* Flags describing how the node is going to be used */
+	upd_node->in_mysql_interface = TRUE;
+	upd_node->is_delete = FALSE;
+	upd_node->searched_update = FALSE;
+	upd_node->has_clust_rec_x_lock = TRUE;
+	upd_node->cmpl_info = 0;
+
+	/* The target table; the remaining links start out empty */
+	upd_node->table = table;
+	upd_node->select = NULL;
+	upd_node->table_sym = NULL;
+	upd_node->col_assign_list = NULL;
+	UT_LIST_INIT(upd_node->columns);
+
+	/* Cursor and update vector */
+	upd_node->pcur = btr_pcur_create_for_mysql();
+	upd_node->update = upd_create(n_cols, heap);
+	upd_node->update_n_fields = n_cols;
+
+	return(upd_node);
+}
+
+/*********************************************************************//**
+Gets pointer to the prebuilt update vector used in updates. On the first
+call for a handle the update node and its query graph are built and
+stored in the prebuilt struct; later calls return the stored vector.
+@return	prebuilt update vector */
+UNIV_INTERN
+upd_t*
+row_get_prebuilt_update_vector(
+/*===========================*/
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct in MySQL
+					handle */
+{
+	upd_node_t*	upd_node;
+	que_thr_t*	upd_thr;
+	dict_table_t*	table	= prebuilt->table;
+
+	ut_ad(prebuilt && table && prebuilt->trx);
+
+	if (prebuilt->upd_node != NULL) {
+		/* Already built for this handle */
+		return(prebuilt->upd_node->update);
+	}
+
+	upd_node = row_create_update_node_for_mysql(table, prebuilt->heap);
+	prebuilt->upd_node = upd_node;
+
+	upd_thr = pars_complete_graph_for_exec(upd_node, prebuilt->trx,
+					       prebuilt->heap);
+
+	prebuilt->upd_graph = que_node_get_parent(upd_thr);
+	prebuilt->upd_graph->state = QUE_FORK_ACTIVE;
+
+	return(upd_node->update);
+}
+
+/*********************************************************************//**
+Does an update or delete of a row for MySQL. The row to modify is the one
+the prebuilt cursor is positioned on: the stored position of
+prebuilt->pcur (or of prebuilt->clust_pcur when pcur is not on the
+clustered index) is copied to the cursor of the update node. The update
+step is re-run when row_mysql_handle_errors() reports that a lock wait
+ended. Whether the operation is a delete is read from the update node.
+@return error code or DB_SUCCESS; DB_RECORD_NOT_FOUND is returned as such
+after trx->error_state has been reset to DB_SUCCESS */
+UNIV_INTERN
+int
+row_update_for_mysql(
+/*=================*/
+	byte*		mysql_rec,	/*!< in: the row to be updated, in
+					the MySQL format; not used by
+					this function */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct in MySQL
+					handle */
+{
+	trx_savept_t	savept;
+	ulint		err;
+	que_thr_t*	thr;
+	ibool		was_lock_wait;
+	dict_index_t*	clust_index;
+	/*	ulint		ref_len; */
+	upd_node_t*	node;
+	dict_table_t*	table		= prebuilt->table;
+	trx_t*		trx		= prebuilt->trx;
+
+	ut_ad(prebuilt && trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	UT_NOT_USED(mysql_rec);
+
+	/* Refuse to work on a table whose .ibd file is flagged missing */
+	if (prebuilt->table->ibd_file_missing) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+			"InnoDB: MySQL is trying to use a table handle"
+			" but the .ibd file for\n"
+			"InnoDB: table %s does not exist.\n"
+			"InnoDB: Have you deleted the .ibd file"
+			" from the database directory under\n"
+			"InnoDB: the MySQL datadir, or have you"
+			" used DISCARD TABLESPACE?\n"
+			"InnoDB: Look from\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			"InnoDB: how you can resolve the problem.\n",
+			prebuilt->table->name);
+		return(DB_ERROR);
+	}
+
+	/* A wrong magic number means the handle is corrupt: report and
+	abort. The handle is being used here, not freed, so the message
+	says "use". */
+	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to use a corrupt\n"
+			"InnoDB: table handle. Magic n %lu, table name ",
+			(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption(prebuilt);
+
+		ut_error;
+	}
+
+	/* No user modifications after a new raw partition was initialized
+	or while innodb_force_recovery is set */
+	if (UNIV_UNLIKELY(srv_created_new_raw || srv_force_recovery)) {
+		fputs("InnoDB: A new raw disk partition was initialized or\n"
+		      "InnoDB: innodb_force_recovery is on: we do not allow\n"
+		      "InnoDB: database modifications by the user. Shut down\n"
+		      "InnoDB: mysqld and edit my.cnf so that newraw"
+		      " is replaced\n"
+		      "InnoDB: with raw, and innodb_force_... is removed.\n",
+		      stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "updating or deleting";
+
+	row_mysql_delay_if_needed();
+
+	trx_start_if_not_started(trx);
+
+	/* NOTE(review): the update node is used without a NULL check, so
+	it must already have been built for this handle (see
+	row_get_prebuilt_update_vector()) - confirm against callers */
+	node = prebuilt->upd_node;
+
+	clust_index = dict_table_get_first_index(table);
+
+	if (prebuilt->pcur->btr_cur.index == clust_index) {
+		btr_pcur_copy_stored_position(node->pcur, prebuilt->pcur);
+	} else {
+		btr_pcur_copy_stored_position(node->pcur,
+					      prebuilt->clust_pcur);
+	}
+
+	ut_a(node->pcur->rel_pos == BTR_PCUR_ON);
+
+	/* MySQL seems to call rnd_pos before updating each row it
+	has cached: we can get the correct cursor position from
+	prebuilt->pcur; NOTE that we cannot build the row reference
+	from mysql_rec if the clustered index was automatically
+	generated for the table: MySQL does not know anything about
+	the row id used as the clustered index key */
+
+	/* The savepoint is handed to row_mysql_handle_errors() if the
+	update step fails */
+	savept = trx_savept_take(trx);
+
+	thr = que_fork_get_first_thr(prebuilt->upd_graph);
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	ut_ad(!prebuilt->sql_stat_start);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+	thr->fk_cascade_depth = 0;
+
+	row_upd_step(thr);
+
+	err = trx->error_state;
+
+	/* Reset fk_cascade_depth back to 0 */
+	thr->fk_cascade_depth = 0;
+
+	if (err != DB_SUCCESS) {
+		que_thr_stop_for_mysql(thr);
+
+		if (err == DB_RECORD_NOT_FOUND) {
+			/* Returned as such, bypassing
+			row_mysql_handle_errors(); the error state of
+			the transaction is cleared */
+			trx->error_state = DB_SUCCESS;
+			trx->op_info = "";
+
+			return((int) err);
+		}
+
+		thr->lock_state= QUE_THR_LOCK_ROW;
+		was_lock_wait = row_mysql_handle_errors(&err, trx, thr,
+							&savept);
+		thr->lock_state= QUE_THR_LOCK_NOLOCK;
+
+		if (was_lock_wait) {
+			/* The lock wait is over: retry the update step */
+			goto run_again;
+		}
+
+		trx->op_info = "";
+
+		return((int) err);
+	}
+
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	if (node->is_delete) {
+		/* Keep the row count estimate from going below zero */
+		if (prebuilt->table->stat_n_rows > 0) {
+			prebuilt->table->stat_n_rows--;
+		}
+
+		srv_n_rows_deleted++;
+	} else {
+		srv_n_rows_updated++;
+	}
+
+	row_update_statistics_if_needed(prebuilt->table);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+This can only be used when srv_locks_unsafe_for_binlog is TRUE or this
+session is using a READ COMMITTED or READ UNCOMMITTED isolation level.
+Before calling this function row_search_for_mysql() must have
+initialized prebuilt->new_rec_locks to store the information which new
+record locks really were set. This function removes a newly set
+clustered index record lock under prebuilt->pcur or
+prebuilt->clust_pcur.  Thus, this implements a 'mini-rollback' that
+releases the latest clustered index record lock we set.
+Nothing is unlocked if the record examined is not a clustered index
+record or if it carries the id of this transaction.
+@return error code or DB_SUCCESS; every path in this function returns
+DB_SUCCESS, including the one that only prints the misuse error */
+UNIV_INTERN
+int
+row_unlock_for_mysql(
+/*=================*/
+	row_prebuilt_t*	prebuilt,	/*!< in/out: prebuilt struct in MySQL
+					handle */
+	ibool		has_latches_on_recs)/*!< in: TRUE if called so
+					that we have the latches on
+					the records under pcur and
+					clust_pcur, and we do not need
+					to reposition the cursors. */
+{
+	btr_pcur_t*	pcur		= prebuilt->pcur;
+	btr_pcur_t*	clust_pcur	= prebuilt->clust_pcur;
+	trx_t*		trx		= prebuilt->trx;
+
+	ut_ad(prebuilt && trx);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (UNIV_UNLIKELY
+	    (!srv_locks_unsafe_for_binlog
+	     && trx->isolation_level > TRX_ISO_READ_COMMITTED)) {
+
+		fprintf(stderr,
+			"InnoDB: Error: calling row_unlock_for_mysql though\n"
+			"InnoDB: innodb_locks_unsafe_for_binlog is FALSE and\n"
+			"InnoDB: this session is not using"
+			" READ COMMITTED isolation level.\n");
+
+		return(DB_SUCCESS);
+	}
+
+	trx->op_info = "unlock_row";
+
+	if (prebuilt->new_rec_locks >= 1) {
+
+		const rec_t*	rec;
+		dict_index_t*	index;
+		trx_id_t	rec_trx_id;
+		mtr_t		mtr;
+
+		mtr_start(&mtr);
+
+		/* Restore the cursor position and find the record */
+
+		if (!has_latches_on_recs) {
+			btr_pcur_restore_position(BTR_SEARCH_LEAF, pcur, &mtr);
+		}
+
+		rec = btr_pcur_get_rec(pcur);
+		index = btr_pcur_get_btr_cur(pcur)->index;
+
+		if (prebuilt->new_rec_locks >= 2) {
+			/* Restore the cursor position and find the record
+			in the clustered index. */
+
+			if (!has_latches_on_recs) {
+				btr_pcur_restore_position(BTR_SEARCH_LEAF,
+							  clust_pcur, &mtr);
+			}
+
+			rec = btr_pcur_get_rec(clust_pcur);
+			index = btr_pcur_get_btr_cur(clust_pcur)->index;
+		}
+
+		if (UNIV_UNLIKELY(!dict_index_is_clust(index))) {
+			/* This is not a clustered index record.  We
+			do not know how to unlock the record. */
+			goto no_unlock;
+		}
+
+		/* If the record has been modified by this
+		transaction, do not unlock it. */
+
+		if (index->trx_id_offset) {
+			/* The transaction id is at a known offset in
+			the record */
+			rec_trx_id = trx_read_trx_id(rec
+						     + index->trx_id_offset);
+		} else {
+			/* The offsets of the record have to be computed
+			to find the transaction id */
+			mem_heap_t*	heap			= NULL;
+			ulint	offsets_[REC_OFFS_NORMAL_SIZE];
+			ulint*	offsets				= offsets_;
+
+			rec_offs_init(offsets_);
+			offsets = rec_get_offsets(rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+
+			rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+		}
+
+		if (ut_dulint_cmp(rec_trx_id, trx->id) != 0) {
+			/* We did not update the record: unlock it */
+
+			rec = btr_pcur_get_rec(pcur);
+
+			lock_rec_unlock(trx, btr_pcur_get_block(pcur),
+					rec, prebuilt->select_lock_type);
+
+			if (prebuilt->new_rec_locks >= 2) {
+				/* Also release the lock on the record
+				under the clustered index cursor */
+				rec = btr_pcur_get_rec(clust_pcur);
+
+				lock_rec_unlock(trx,
+						btr_pcur_get_block(clust_pcur),
+						rec,
+						prebuilt->select_lock_type);
+			}
+		}
+no_unlock:
+		mtr_commit(&mtr);
+	}
+
+	trx->op_info = "";
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Does a cascaded delete or set null in a foreign key operation.
+Each call increments thr->fk_cascade_depth; if the depth then exceeds
+FK_MAX_CASCADE_DEL the function returns DB_FOREIGN_EXCEED_MAX_CASCADE
+without running the update step. A DB_LOCK_WAIT result is handled here
+by suspending the thread and retrying; other errors are returned to the
+caller.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_update_cascade_for_mysql(
+/*=========================*/
+	que_thr_t*	thr,	/*!< in: query thread */
+	upd_node_t*	node,	/*!< in: update node used in the cascade
+				or set null operation */
+	dict_table_t*	table)	/*!< in: table where we do the operation */
+{
+	ulint	err;
+	trx_t*	trx;
+
+	trx = thr_get_trx(thr);
+
+	thr->fk_cascade_depth++;
+
+	/* Note that the depth is not decremented on this early return */
+	if (thr->fk_cascade_depth > FK_MAX_CASCADE_DEL) {
+		return (DB_FOREIGN_EXCEED_MAX_CASCADE);
+	}
+run_again:
+	thr->run_node = node;
+	thr->prev_node = node;
+
+	row_upd_step(thr);
+
+	err = trx->error_state;
+
+	/* Note that the cascade node is a subnode of another InnoDB
+	query graph node. We do a normal lock wait in this node, but
+	all errors are handled by the parent node. */
+
+	if (err == DB_LOCK_WAIT) {
+		/* Handle lock wait here */
+
+		que_thr_stop_for_mysql(thr);
+
+		srv_suspend_mysql_thread(thr);
+
+		/* Note that a lock wait may also end in a lock wait timeout,
+		or this transaction is picked as a victim in selective
+		deadlock resolution */
+
+		if (trx->error_state != DB_SUCCESS) {
+
+			return(trx->error_state);
+		}
+
+		/* Retry operation after a normal lock wait */
+
+		goto run_again;
+	}
+
+	if (err != DB_SUCCESS) {
+
+		return(err);
+	}
+
+	/* Success: maintain the row count estimate of the table and the
+	global delete/update counters */
+	if (node->is_delete) {
+		if (table->stat_n_rows > 0) {
+			table->stat_n_rows--;
+		}
+
+		srv_n_rows_deleted++;
+	} else {
+		srv_n_rows_updated++;
+	}
+
+	row_update_statistics_if_needed(table);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Checks if a table is such that we automatically created a clustered
+index on it (on row id). The test is whether the first column of the
+first index of the table has main type DATA_SYS.
+@return	TRUE if the clustered index was generated automatically */
+UNIV_INTERN
+ibool
+row_table_got_default_clust_index(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const dict_index_t*	first_index;
+	const dict_col_t*	first_col;
+
+	first_index = dict_table_get_first_index(table);
+	first_col = dict_index_get_nth_col(first_index, 0);
+
+	return(first_col->mtype == DATA_SYS);
+}
+
+/*********************************************************************//**
+Locks the data dictionary in shared mode from modifications, for performing
+foreign key check, rollback, or other operation invisible to MySQL.
+The transaction must not already hold the dictionary operation lock in
+any mode; on return trx->dict_operation_lock_mode is RW_S_LATCH. */
+UNIV_INTERN
+void
+row_mysql_freeze_data_dictionary_func(
+/*==================================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	file,	/*!< in: file name */
+	ulint		line)	/*!< in: line number */
+{
+	/* Checked in all builds: the lock mode must be unset */
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	/* file and line are passed on to the lock function */
+	rw_lock_s_lock_func(&dict_operation_lock, 0, file, line);
+
+	trx->dict_operation_lock_mode = RW_S_LATCH;
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary shared lock. The transaction must hold the
+lock in RW_S_LATCH mode; on return trx->dict_operation_lock_mode is 0. */
+UNIV_INTERN
+void
+row_mysql_unfreeze_data_dictionary(
+/*===============================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	/* Checked in all builds: only a shared lock may be released here */
+	ut_a(trx->dict_operation_lock_mode == RW_S_LATCH);
+
+	rw_lock_s_unlock(&dict_operation_lock);
+
+	trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Locks the data dictionary exclusively for performing a table create or other
+data dictionary modification operation. Acquires dict_operation_lock in
+exclusive mode first and the dictionary mutex second; on return
+trx->dict_operation_lock_mode is RW_X_LATCH. */
+UNIV_INTERN
+void
+row_mysql_lock_data_dictionary_func(
+/*================================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	const char*	file,	/*!< in: file name */
+	ulint		line)	/*!< in: line number */
+{
+	/* The assertion accepts both an unset lock mode and one that is
+	already RW_X_LATCH */
+	ut_a(trx->dict_operation_lock_mode == 0
+	     || trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks or lock waits can occur then in these operations */
+
+	rw_lock_x_lock_func(&dict_operation_lock, 0, file, line);
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+
+	mutex_enter(&(dict_sys->mutex));
+}
+
+/*********************************************************************//**
+Unlocks the data dictionary exclusive lock. Releases the dictionary mutex
+first and dict_operation_lock second, the reverse of the order in which
+row_mysql_lock_data_dictionary_func() acquires them; on return
+trx->dict_operation_lock_mode is 0. */
+UNIV_INTERN
+void
+row_mysql_unlock_data_dictionary(
+/*=============================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	/* Checked in all builds: only an exclusive lock may be released
+	here */
+	ut_a(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	mutex_exit(&(dict_sys->mutex));
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	trx->dict_operation_lock_mode = 0;
+}
+
+/*********************************************************************//**
+Creates a table for MySQL. If the name of the table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also start the printing of monitor
+output by the master thread. If the table name ends in "innodb_mem_validate",
+InnoDB will try to invoke mem_validate().
+The debug assertions expect the caller to hold dict_operation_lock in
+exclusive mode and the dictionary mutex. If a new raw partition was
+initialized, or the name is that of a MySQL system table, the table
+object is freed, the transaction is committed and DB_ERROR is returned.
+If running the create graph fails, the transaction is rolled back and
+the error is returned; the table object is not freed on that path (see
+the TO DO below).
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_table_for_mysql(
+/*=======================*/
+	dict_table_t*	table,	/*!< in, own: table definition
+				(will be freed) */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	tab_node_t*	node;
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	const char*	table_name;
+	ulint		table_name_len;
+	ulint		err;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(trx->dict_operation_lock_mode == RW_X_LATCH);
+
+	if (srv_created_new_raw) {
+		fputs("InnoDB: A new raw disk partition was initialized:\n"
+		      "InnoDB: we do not allow database modifications"
+		      " by the user.\n"
+		      "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+		      " is replaced with raw.\n", stderr);
+		/* The label sits inside this if block: the system table
+		check below jumps here to share the cleanup */
+err_exit:
+		dict_mem_table_free(table);
+		trx_commit_for_mysql(trx);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "creating table";
+
+	if (row_mysql_is_system_table(table->name)) {
+
+		fprintf(stderr,
+			"InnoDB: Error: trying to create a MySQL system"
+			" table %s of type InnoDB.\n"
+			"InnoDB: MySQL system tables must be"
+			" of the MyISAM type!\n",
+			table->name);
+		goto err_exit;
+	}
+
+	trx_start_if_not_started(trx);
+
+	/* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have their special
+	meaning regardless of the database name.  Thus, we need to
+	ignore the database name prefix in the comparisons. */
+	table_name = strchr(table->name, '/');
+	ut_a(table_name);
+	table_name++;
+	/* The length includes the terminating NUL */
+	table_name_len = strlen(table_name) + 1;
+
+	if (STR_EQ(table_name, table_name_len, S_innodb_monitor)) {
+
+		/* Table equals "innodb_monitor":
+		start monitor prints */
+
+		srv_print_innodb_monitor = TRUE;
+
+		/* The lock timeout monitor thread also takes care
+		of InnoDB monitor prints */
+
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (STR_EQ(table_name, table_name_len,
+			  S_innodb_lock_monitor)) {
+
+		srv_print_innodb_monitor = TRUE;
+		srv_print_innodb_lock_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (STR_EQ(table_name, table_name_len,
+			  S_innodb_tablespace_monitor)) {
+
+		srv_print_innodb_tablespace_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (STR_EQ(table_name, table_name_len,
+			  S_innodb_table_monitor)) {
+
+		srv_print_innodb_table_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+	} else if (STR_EQ(table_name, table_name_len,
+			  S_innodb_mem_validate)) {
+		/* We define here a debugging feature intended for
+		developers */
+
+		fputs("Validating InnoDB memory:\n"
+		      "to use this feature you must compile InnoDB with\n"
+		      "UNIV_MEM_DEBUG defined in univ.i and"
+		      " the server must be\n"
+		      "quiet because allocation from a mem heap"
+		      " is not protected\n"
+		      "by any semaphore.\n", stderr);
+#ifdef UNIV_MEM_DEBUG
+		ut_a(mem_validate());
+		fputs("Memory validated\n", stderr);
+#else /* UNIV_MEM_DEBUG */
+		fputs("Memory NOT validated (recompile with UNIV_MEM_DEBUG)\n",
+		      stderr);
+#endif /* UNIV_MEM_DEBUG */
+	}
+
+	/* Whichever branch was taken above, the table is still created:
+	build the create graph in a heap of its own and run it */
+	heap = mem_heap_create(512);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	node = tab_create_graph_create(table, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, NULL);
+		/* TO DO: free table?  The code below will dereference
+		table->name, though. */
+	}
+
+	/* Error specific follow-up; all other values fall through the
+	switch without action */
+	switch (err) {
+	case DB_OUT_OF_FILE_SPACE:
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Warning: cannot create table ",
+		      stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs(" because tablespace full\n", stderr);
+
+		/* Drop the table if it is found in the dictionary */
+		if (dict_table_get_low(table->name)) {
+
+			row_drop_table_for_mysql(table->name, trx, FALSE);
+			trx_commit_for_mysql(trx);
+		}
+		break;
+
+	case DB_DUPLICATE_KEY:
+		/* We may also get err == DB_ERROR if the .ibd file for the
+		table already exists */
+
+		break;
+	}
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Does an index creation operation for MySQL. TODO: currently failure
+to create an index results in dropping the whole table! This is no problem
+currently as all indexes must be created at the same time as the table.
+The index definition is first validated: no column may appear twice and
+no prefix or field length may reach DICT_MAX_INDEX_COL_LEN. If validation
+fails, the index object is freed here, because the query graph that would
+otherwise free it is never run. On any error the transaction is rolled
+back, the table is dropped and the transaction is committed.
+@return	error number or DB_SUCCESS */
+UNIV_INTERN
+int
+row_create_index_for_mysql(
+/*=======================*/
+	dict_index_t*	index,		/*!< in, own: index definition
+					(will be freed) */
+	trx_t*		trx,		/*!< in: transaction handle */
+	const ulint*	field_lengths)	/*!< in: if not NULL, must contain
+					dict_index_get_n_fields(index)
+					actual field lengths for the
+					index columns, which are
+					then checked for not being too
+					large. */
+{
+	ind_node_t*	node;
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	ulint		err;
+	ulint		i;
+	ulint		len;
+	char*		table_name;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "creating index";
+
+	/* Copy the table name because we may want to drop the
+	table later, after the index object is freed (inside
+	que_run_threads()) and thus index->table_name is not available. */
+	table_name = mem_strdup(index->table_name);
+
+	trx_start_if_not_started(trx);
+
+	/* Check that the same column does not appear twice in the index.
+	Starting from 4.0.14, InnoDB should be able to cope with that, but
+	safer not to allow them. */
+
+	for (i = 0; i < dict_index_get_n_fields(index); i++) {
+		ulint		j;
+
+		for (j = 0; j < i; j++) {
+			if (0 == ut_strcmp(
+				    dict_index_get_nth_field(index, j)->name,
+				    dict_index_get_nth_field(index, i)->name)) {
+				ut_print_timestamp(stderr);
+
+				fputs("  InnoDB: Error: column ", stderr);
+				ut_print_name(stderr, trx, FALSE,
+					      dict_index_get_nth_field(
+						      index, i)->name);
+				fputs(" appears twice in ", stderr);
+				dict_index_name_print(stderr, trx, index);
+				fputs("\n"
+				      "InnoDB: This is not allowed"
+				      " in InnoDB.\n", stderr);
+
+				err = DB_COL_APPEARS_TWICE_IN_INDEX;
+
+				/* We own the index and no query graph
+				will free it on this path: free it
+				here. It is not used after this point. */
+				dict_mem_index_free(index);
+
+				goto error_handling;
+			}
+		}
+
+		/* Check also that prefix_len and actual length
+		< DICT_MAX_INDEX_COL_LEN */
+
+		len = dict_index_get_nth_field(index, i)->prefix_len;
+
+		if (field_lengths) {
+			len = ut_max(len, field_lengths[i]);
+		}
+
+		if (len >= DICT_MAX_INDEX_COL_LEN) {
+			err = DB_TOO_BIG_RECORD;
+
+			/* As above: free the index we own before
+			bailing out */
+			dict_mem_index_free(index);
+
+			goto error_handling;
+		}
+	}
+
+	heap = mem_heap_create(512);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	/* Note that the space id where we store the index is inherited from
+	the table in dict_build_index_def_step() in dict0crea.c. */
+
+	node = ind_create_graph_create(index, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	/* The validation failures above jump here past the graph code,
+	so no graph exists to be freed on those paths */
+error_handling:
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, NULL);
+
+		/* Use the copied name: the index object is gone */
+		row_drop_table_for_mysql(table_name, trx, FALSE);
+
+		trx_commit_for_mysql(trx);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	trx->op_info = "";
+
+	mem_free(table_name);
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Builds the query graph returned by ind_insert_stats_graph_create() for
+the given index and runs it in the given transaction. Judging by the
+op_info text, the graph inserts rows to SYS_STATS for the index.
+trx->error_state is reset to DB_SUCCESS before the graph is run. No
+rollback or other error handling is done here.
+@return	the value of trx->error_state after running the graph */
+UNIV_INTERN
+int
+row_insert_stats_for_mysql(
+/*=======================*/
+	dict_index_t*	index,	/*!< in: index handed to
+				ind_insert_stats_graph_create() */
+	trx_t*		trx)	/*!< in/out: transaction in which the
+				graph is run */
+{
+	ind_node_t*	node;
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	ulint		err;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "try to insert rows to SYS_STATS";
+
+	trx_start_if_not_started(trx);
+	/* Discard any error state left in the transaction */
+	trx->error_state = DB_SUCCESS;
+
+	heap = mem_heap_create(512);
+
+	node = ind_insert_stats_graph_create(index, heap);
+
+	thr = pars_complete_graph_for_exec(node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+
+	err = trx->error_state;
+
+	que_graph_free((que_t*) que_node_get_parent(thr));
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Scans a table create SQL string and adds to the data dictionary
+the foreign key constraints declared in the string. This function
+should be called after the indexes for a table have been created.
+Each foreign key constraint must be accompanied with indexes in
+both participating tables. The indexes are allowed to contain more
+fields than mentioned in the constraint. Check also that foreign key
+constraints which reference this table are ok.
+On any error the transaction is rolled back, the table is dropped and
+the transaction is committed.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_table_add_foreign_constraints(
+/*==============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	const char*	sql_string,	/*!< in: table create statement where
+					foreign keys are declared like:
+				FOREIGN KEY (a, b) REFERENCES table2(c, d),
+					table2 can be written also with the
+					database name before it: test.table2 */
+	size_t		sql_length,	/*!< in: length of sql_string */
+	const char*	name,		/*!< in: table full name in the
+					normalized form
+					database_name/table_name */
+	ibool		reject_fks)	/*!< in: if TRUE, fail with error
+					code DB_CANNOT_ADD_CONSTRAINT if
+					any foreign keys are found. */
+{
+	ulint	err;
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_a(sql_string);
+
+	trx->op_info = "adding foreign keys";
+
+	trx_start_if_not_started(trx);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	err = dict_create_foreign_constraints(trx, sql_string, sql_length,
+					      name, reject_fks);
+	if (err == DB_SUCCESS) {
+		/* Check that also referencing constraints are ok */
+		err = dict_load_foreigns(name, FALSE, TRUE);
+	}
+
+	if (err != DB_SUCCESS) {
+		/* We have special error handling here */
+
+		trx->error_state = DB_SUCCESS;
+
+		trx_general_rollback_for_mysql(trx, NULL);
+
+		row_drop_table_for_mysql(name, trx, FALSE);
+
+		trx_commit_for_mysql(trx);
+
+		trx->error_state = DB_SUCCESS;
+	}
+
+	/* Clear the operation description on every exit path, like the
+	other functions of this file do, so that a stale "adding foreign
+	keys" is not left in the transaction */
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL as a background operation. On Unix, MySQL relies
+in ALTER TABLE on the fact that the table handler does not remove the
+table before all handles to it have been removed. Furthermore, MySQL's
+call to drop table must be non-blocking. Therefore we do the drop table
+as a background operation, which is taken care of by the master thread
+in srv0srv.c.
+The drop runs in a background transaction allocated for this call, with
+foreign key checks switched off; the transaction is committed and freed
+before returning.
+@return	error code or DB_SUCCESS */
+static
+int
+row_drop_table_for_mysql_in_background(
+/*===================================*/
+	const char*	name)	/*!< in: table name */
+{
+	ulint	error;
+	trx_t*	trx;
+
+	trx = trx_allocate_for_background();
+
+	/* If the original transaction was dropping a table referenced by
+	foreign keys, we must set the following to be able to drop the
+	table: */
+
+	trx->check_foreigns = FALSE;
+
+	/*	fputs("InnoDB: Error: Dropping table ", stderr);
+	ut_print_name(stderr, trx, TRUE, name);
+	fputs(" in background drop list\n", stderr); */
+
+	/* Try to drop the table in InnoDB */
+
+	error = row_drop_table_for_mysql(name, trx, FALSE);
+
+	/* Flush the log to reduce probability that the .frm files and
+	the InnoDB data dictionary get out-of-sync if the user runs
+	with innodb_flush_log_at_trx_commit = 0 */
+
+	log_buffer_flush_to_disk();
+
+	/* The commit and the free are done whether or not the drop
+	succeeded; the result of the drop is what is returned */
+	trx_commit_for_mysql(trx);
+
+	trx_free_for_background(trx);
+
+	return((int) error);
+}
+
+/*********************************************************************//**
+The master thread in srv0srv.c calls this regularly to drop tables which
+we must drop in background after queries to them have ended. Such lazy
+dropping of tables is needed in ALTER TABLE on Unix.
+The drop list is processed from its head until it is empty or a drop
+fails; a failed entry stays in the list for a later call.
+@return	how many tables dropped + remaining tables in list */
+UNIV_INTERN
+ulint
+row_drop_tables_for_mysql_in_background(void)
+/*=========================================*/
+{
+	ulint	n_dropped = 0;
+
+	for (;;) {
+		row_mysql_drop_t*	entry;
+		dict_table_t*		table;
+		ulint			n_queued;
+
+		/* Read the head and the length of the list under the
+		kernel mutex, initializing the list on first use */
+		mutex_enter(&kernel_mutex);
+
+		if (!row_mysql_drop_list_inited) {
+
+			UT_LIST_INIT(row_mysql_drop_list);
+			row_mysql_drop_list_inited = TRUE;
+		}
+
+		entry = UT_LIST_GET_FIRST(row_mysql_drop_list);
+		n_queued = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+		mutex_exit(&kernel_mutex);
+
+		if (entry == NULL) {
+			/* All tables dropped */
+
+			return(n_queued + n_dropped);
+		}
+
+		mutex_enter(&(dict_sys->mutex));
+		table = dict_table_get_low(entry->table_name);
+		mutex_exit(&(dict_sys->mutex));
+
+		/* If for some reason the table has already been dropped
+		through some other mechanism, do not try to drop it;
+		only remove it from the list */
+
+		if (table != NULL) {
+			if (row_drop_table_for_mysql_in_background(
+				    entry->table_name) != DB_SUCCESS) {
+				/* If the DROP fails for some table, we
+				return, and let the main thread retry
+				later */
+
+				return(n_queued + n_dropped);
+			}
+
+			n_dropped++;
+		}
+
+		mutex_enter(&kernel_mutex);
+
+		UT_LIST_REMOVE(row_mysql_drop_list, row_mysql_drop_list,
+			       entry);
+
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Dropped table ", stderr);
+		ut_print_name(stderr, NULL, TRUE, entry->table_name);
+		fputs(" in background drop queue.\n", stderr);
+
+		mem_free(entry->table_name);
+
+		mem_free(entry);
+
+		mutex_exit(&kernel_mutex);
+	}
+}
+
+/*********************************************************************//**
+Get the background drop list length, initializing the list first if that
+has not been done yet. NOTE: the caller must own the kernel mutex!
+@return	how many tables in list */
+UNIV_INTERN
+ulint
+row_get_background_drop_list_len_low(void)
+/*======================================*/
+{
+	ulint	n_in_list;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (row_mysql_drop_list_inited == FALSE) {
+		/* First use of the list */
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	n_in_list = UT_LIST_GET_LEN(row_mysql_drop_list);
+
+	return(n_in_list);
+}
+
+/*********************************************************************//**
+If a table is not yet in the drop list, adds the table to the list of tables
+which the master thread drops in background. We need this on Unix because in
+ALTER TABLE MySQL may call drop table even if the table has running queries on
+it. Also, if there are running foreign key checks on the table, we drop the
+table lazily. The list entry stores its own copy of the name. The kernel
+mutex is held for the whole lookup and insertion.
+@return	TRUE if the table was not yet in the drop list, and was added there */
+static
+ibool
+row_add_table_to_background_drop_list(
+/*==================================*/
+	const char*	name)	/*!< in: table name */
+{
+	row_mysql_drop_t*	entry;
+
+	mutex_enter(&kernel_mutex);
+
+	if (!row_mysql_drop_list_inited) {
+		/* First use of the list */
+		UT_LIST_INIT(row_mysql_drop_list);
+		row_mysql_drop_list_inited = TRUE;
+	}
+
+	/* Look if the table already is in the drop list */
+	for (entry = UT_LIST_GET_FIRST(row_mysql_drop_list);
+	     entry != NULL;
+	     entry = UT_LIST_GET_NEXT(row_mysql_drop_list, entry)) {
+
+		if (0 == strcmp(entry->table_name, name)) {
+			/* Already in the list */
+
+			mutex_exit(&kernel_mutex);
+
+			return(FALSE);
+		}
+	}
+
+	/* Not found: append a new entry */
+	entry = mem_alloc(sizeof(*entry));
+	entry->table_name = mem_strdup(name);
+
+	UT_LIST_ADD_LAST(row_mysql_drop_list, row_mysql_drop_list, entry);
+
+	mutex_exit(&kernel_mutex);
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Discards the tablespace of a table which is stored in an .ibd file. Discarding
+means that this function deletes the .ibd file and assigns a new table id for
+the table. Also the flags tablespace_discarded and ibd_file_missing are set.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_discard_tablespace_for_mysql(
+/*=============================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_foreign_t*	foreign;
+	dulint		new_id;
+	dict_table_t*	table;
+	ibool		success;
+	ulint		err;
+	pars_info_t*	info = NULL;
+
+	/* How do we prevent crashes caused by ongoing operations on
+	the table? Old operations could try to access non-existent
+	pages.
+
+	1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+	MySQL table lock on the table before we can do DISCARD
+	TABLESPACE. Then there are no running queries on the table.
+
+	2) Purge and rollback: we assign a new table id for the
+	table. Since purge and rollback look for the table based on
+	the table id, they see the table as 'dropped' and discard
+	their operations.
+
+	3) Insert buffer: we remove all entries for the tablespace in
+	the insert buffer tree; as long as the tablespace mem object
+	does not exist, ongoing insert buffer page merges are
+	discarded in buf0rea.c. If we recreate the tablespace mem
+	object with IMPORT TABLESPACE later, then the tablespace will
+	have the same id, but the tablespace_version field in the mem
+	object is different, and ongoing old insert buffer page merges
+	get discarded.
+
+	4) Linear readahead and random readahead: we use the same
+	method as in 3) to discard ongoing operations.
+
+	5) FOREIGN KEY operations: if
+	table->n_foreign_key_checks_running > 0, we do not allow the
+	discard. We also reserve the data dictionary latch. */
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx->op_info = "discarding tablespace";
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+
+		goto funct_exit;
+	}
+
+	if (table->space == 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs("\n"
+		      "InnoDB: is in the system tablespace 0"
+		      " which cannot be discarded\n", stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: You are trying to DISCARD table ", stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs("\n"
+		      "InnoDB: though there is a foreign key check"
+		      " running on it.\n"
+		      "InnoDB: Cannot discard the table.\n",
+		      stderr);
+
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns) {
+
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow discarding a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		err = DB_CANNOT_DROP_CONSTRAINT;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+		/* Every part of the message goes to ef, never to stderr */
+		fputs("  Cannot DISCARD table ", ef);
+		ut_print_name(ef, trx, TRUE, name);
+		fputs("\n"
+		      "because it is referenced by ", ef);
+		ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		goto funct_exit;
+	}
+
+	dict_hdr_get_new_id(&new_id, NULL, NULL);
+
+	/* Remove all locks except the table-level S and X locks. */
+	lock_remove_all_on_table(table, FALSE);
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "table_name", name);
+	pars_info_add_dulint_literal(info, "new_id", new_id);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n"
+			   "old_id CHAR;\n"
+			   "BEGIN\n"
+			   "SELECT ID INTO old_id\n"
+			   "FROM SYS_TABLES\n"
+			   "WHERE NAME = :table_name\n"
+			   "LOCK IN SHARE MODE;\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "       COMMIT WORK;\n"
+			   "       RETURN;\n"
+			   "END IF;\n"
+			   "UPDATE SYS_TABLES SET ID = :new_id\n"
+			   " WHERE ID = old_id;\n"
+			   "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+			   " WHERE TABLE_ID = old_id;\n"
+			   "UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
+			   " WHERE TABLE_ID = old_id;\n"
+			   "COMMIT WORK;\n"
+			   "END;\n"
+			   , FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx->error_state = DB_SUCCESS;
+	} else {
+		dict_table_change_id_in_cache(table, new_id);
+
+		success = fil_discard_tablespace(table->space);
+
+		if (!success) {
+			trx->error_state = DB_SUCCESS;
+			trx_general_rollback_for_mysql(trx, NULL);
+			trx->error_state = DB_SUCCESS;
+
+			err = DB_ERROR;
+		} else {
+			/* Set the flag which tells that now it is legal to
+			IMPORT a tablespace for this table */
+			table->tablespace_discarded = TRUE;
+			table->ibd_file_missing = TRUE;
+		}
+	}
+
+funct_exit:
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*****************************************************************//**
+Imports a tablespace. The space id in the .ibd file must match the space id
+of the table in the data dictionary, and DISCARD must have been called first.
+@return	DB_SUCCESS, DB_TABLE_NOT_FOUND, or DB_ERROR on any other failure */
+UNIV_INTERN
+int
+row_import_tablespace_for_mysql(
+/*============================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_table_t*	table;
+	ibool		success;
+	ib_uint64_t	current_lsn;
+	ulint		err		= DB_SUCCESS;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	trx_start_if_not_started(trx);
+
+	trx->op_info = "importing tablespace";
+
+	current_lsn = log_get_lsn();
+
+	/* It is possible, though very improbable, that the lsn's in the
+	tablespace to be imported have risen above the current system lsn, if
+	a lengthy purge, ibuf merge, or rollback was performed on a backup
+	taken with ibbackup. If that is the case, reset page lsn's in the
+	file. We assume that mysqld was shut down after it performed these
+	cleanup operations on the .ibd file, so that it stamped the latest lsn
+	to the FIL_PAGE_FILE_FLUSH_LSN in the first page of the .ibd file.
+
+	TODO: reset also the trx id's in clustered index records and write
+	a new space id to each data page. That would allow us to import clean
+	.ibd files from another MySQL installation. */
+
+	success = fil_reset_too_high_lsns(name, current_lsn);
+
+	if (!success) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: cannot reset lsn's in table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs("\n"
+		      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+		      stderr);
+
+		err = DB_ERROR;
+		/* funct_exit unlocks the dictionary, so lock it first */
+		row_mysql_lock_data_dictionary(trx);
+
+		goto funct_exit;
+	}
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs("\n"
+		      "InnoDB: does not exist in the InnoDB data dictionary\n"
+		      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+		      stderr);
+
+		err = DB_TABLE_NOT_FOUND;
+
+		goto funct_exit;
+	}
+
+	if (table->space == 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs("\n"
+		      "InnoDB: is in the system tablespace 0"
+		      " which cannot be imported\n", stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	if (!table->tablespace_discarded) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: you are trying to"
+		      " IMPORT a tablespace\n"
+		      "InnoDB: ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs(", though you have not called DISCARD on it yet\n"
+		      "InnoDB: during the lifetime of the mysqld process!\n",
+		      stderr);
+
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Play safe and remove all insert buffer entries, though we should
+	have removed them already when DISCARD TABLESPACE was called */
+
+	ibuf_delete_for_discarded_space(table->space);
+
+	success = fil_open_single_table_tablespace(
+		TRUE, table->space,
+		table->flags == DICT_TF_COMPACT ? 0 : table->flags,
+		table->name);
+	if (success) {
+		table->ibd_file_missing = FALSE;
+		table->tablespace_discarded = FALSE;
+	} else {
+		if (table->ibd_file_missing) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: cannot find or open in the"
+			      " database directory the .ibd file of\n"
+			      "InnoDB: table ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fputs("\n"
+			      "InnoDB: in ALTER TABLE ... IMPORT TABLESPACE\n",
+			      stderr);
+		}
+
+		err = DB_ERROR;
+	}
+
+funct_exit:
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Truncates a table for MySQL: truncates its index trees and gives it a new id.
+@return	DB_SUCCESS, or DB_ERROR if the truncate was refused or failed */
+UNIV_INTERN
+int
+row_truncate_table_for_mysql(
+/*=========================*/
+	dict_table_t*	table,	/*!< in: table handle */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_foreign_t*	foreign;
+	ulint		err;
+	mem_heap_t*	heap;
+	byte*		buf;
+	dtuple_t*	tuple;
+	dfield_t*	dfield;
+	dict_index_t*	sys_index;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	dulint		new_id;
+	ulint		recreate_space = 0;
+	pars_info_t*	info = NULL;
+
+	/* How do we prevent crashes caused by ongoing operations on
+	the table? Old operations could try to access non-existent
+	pages.
+
+	1) SQL queries, INSERT, SELECT, ...: we must get an exclusive
+	MySQL table lock on the table before we can do TRUNCATE
+	TABLE. Then there are no running queries on the table. This is
+	guaranteed, because in ha_innobase::store_lock(), we do not
+	weaken the TL_WRITE lock requested by MySQL when executing
+	SQLCOM_TRUNCATE.
+
+	2) Purge and rollback: we assign a new table id for the
+	table. Since purge and rollback look for the table based on
+	the table id, they see the table as 'dropped' and discard
+	their operations.
+
+	3) Insert buffer: TRUNCATE TABLE is analogous to DROP TABLE,
+	so we do not have to remove insert buffer records, as the
+	insert buffer works at a low level. If a freed page is later
+	reallocated, the allocator will remove the ibuf entries for
+	it.
+
+	When we truncate *.ibd files by recreating them (analogous to
+	DISCARD TABLESPACE), we remove all entries for the table in the
+	insert buffer tree.  This is not strictly necessary, because
+	in 6) we will assign a new tablespace identifier, but we can
+	free up some space in the system tablespace.
+
+	4) Linear readahead and random readahead: we use the same
+	method as in 3) to discard ongoing operations. (This is only
+	relevant for TRUNCATE TABLE by DISCARD TABLESPACE.)
+
+	5) FOREIGN KEY operations: if
+	table->n_foreign_key_checks_running > 0, we do not allow the
+	TRUNCATE. We also reserve the data dictionary latch.
+
+	6) Crash recovery: To prevent the application of pre-truncation
+	redo log records on the truncated tablespace, we will assign
+	a new tablespace identifier to the truncated tablespace. */
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_ad(table);
+
+	if (srv_created_new_raw) {
+		fputs("InnoDB: A new raw disk partition was initialized:\n"
+		      "InnoDB: we do not allow database modifications"
+		      " by the user.\n"
+		      "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+		      " is replaced with raw.\n", stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "truncating table";
+
+	trx_start_if_not_started(trx);
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+	/* Prevent foreign key checks etc. while we are truncating the
+	table */
+
+	row_mysql_lock_data_dictionary(trx);
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign && foreign->foreign_table == table) {
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns) {
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow truncating a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs("  Cannot truncate table ", ef);
+		ut_print_name(ef, trx, TRUE, table->name);
+		fputs(" by DROP+CREATE\n"
+		      "InnoDB: because it is referenced by ", ef);
+		ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		err = DB_ERROR;
+		goto funct_exit;
+	}
+
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	they can cope with the table having been truncated here? Foreign key
+	checks take an IS or IX lock on the table. */
+
+	if (table->n_foreign_key_checks_running > 0) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Cannot truncate table ", stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs(" by DROP+CREATE\n"
+		      "InnoDB: because there is a foreign key check"
+		      " running on it.\n",
+		      stderr);
+		err = DB_ERROR;
+
+		goto funct_exit;
+	}
+
+	/* Remove all locks except the table-level S and X locks. */
+	lock_remove_all_on_table(table, FALSE);
+
+	trx->table_id = table->id;
+
+	/* Lock all index trees for this table, as we will
+	truncate the table/index and possibly change their metadata.
+	All DML/DDL are blocked by table level lock, with
+	a few exceptions such as queries into information schema
+	about the table, MySQL could try to access index stats
+	for this kind of query, we need to use index locks to
+	sync up */
+	dict_table_x_lock_indexes(table);
+
+	if (table->space && !table->dir_path_of_temp_table) {
+		/* Discard and create the single-table tablespace. */
+		ulint	space	= table->space;
+		ulint	flags	= fil_space_get_flags(space);
+
+		if (flags != ULINT_UNDEFINED
+		    && fil_discard_tablespace(space)) {
+
+			dict_index_t*	index;
+
+			dict_hdr_get_new_id(NULL, NULL, &space);
+
+			if (space == ULINT_UNDEFINED
+			    || fil_create_new_single_table_tablespace(
+				    space, table->name, FALSE, flags,
+				    FIL_IBD_FILE_INITIAL_SIZE) != DB_SUCCESS) {
+				dict_table_x_unlock_indexes(table);
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: TRUNCATE TABLE %s failed to"
+					" create a new tablespace\n",
+					table->name);
+				table->ibd_file_missing = 1;
+				err = DB_ERROR;
+				goto funct_exit;
+			}
+
+			recreate_space = space;
+
+			/* Replace the space_id in the data dictionary cache.
+			The persistent data dictionary (SYS_TABLES.SPACE
+			and SYS_INDEXES.SPACE) are updated later in this
+			function. */
+			table->space = space;
+			index = dict_table_get_first_index(table);
+			do {
+				index->space = space;
+				index = dict_table_get_next_index(index);
+			} while (index);
+
+			mtr_start(&mtr);
+			fsp_header_init(space,
+					FIL_IBD_FILE_INITIAL_SIZE, &mtr);
+			mtr_commit(&mtr);
+		}
+	}
+
+	/* scan SYS_INDEXES for all indexes of the table */
+	heap = mem_heap_create(800);
+
+	tuple = dtuple_create(heap, 1);
+	dfield = dtuple_get_nth_field(tuple, 0);
+
+	buf = mem_heap_alloc(heap, 8);
+	mach_write_to_8(buf, table->id);
+
+	dfield_set_data(dfield, buf, 8);
+	sys_index = dict_table_get_first_index(dict_sys->sys_indexes);
+	dict_index_copy_types(tuple, sys_index, 1);
+
+	mtr_start(&mtr);
+	btr_pcur_open_on_user_rec(sys_index, tuple, PAGE_CUR_GE,
+				  BTR_MODIFY_LEAF, &pcur, &mtr);
+	for (;;) {
+		rec_t*		rec;
+		const byte*	field;
+		ulint		len;
+		ulint		root_page_no;
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			/* The end of SYS_INDEXES has been reached. */
+			break;
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+		/* Field 0 is compared with the 8-byte table id in buf */
+		field = rec_get_nth_field_old(rec, 0, &len);
+		ut_ad(len == 8);
+
+		if (memcmp(buf, field, len) != 0) {
+			/* End of indexes for the table (TABLE_ID mismatch). */
+			break;
+		}
+
+		if (rec_get_deleted_flag(rec, FALSE)) {
+			/* The index has been dropped. */
+			goto next_rec;
+		}
+
+		/* This call may commit and restart mtr
+		and reposition pcur. */
+		root_page_no = dict_truncate_index_tree(table, recreate_space,
+							&pcur, &mtr);
+
+		rec = btr_pcur_get_rec(&pcur);
+
+		if (root_page_no != FIL_NULL) {
+			page_rec_write_index_page_no(
+				rec, DICT_SYS_INDEXES_PAGE_NO_FIELD,
+				root_page_no, &mtr);
+			/* We will need to commit and restart the
+			mini-transaction in order to avoid deadlocks.
+			The dict_truncate_index_tree() call has allocated
+			a page in this mini-transaction, and the rest of
+			this loop could latch another index page. */
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+			btr_pcur_restore_position(BTR_MODIFY_LEAF,
+						  &pcur, &mtr);
+		}
+
+next_rec:
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	mem_heap_free(heap);
+
+	/* Done with index truncation, release index tree locks,
+	subsequent work relates to table level metadata change */
+	dict_table_x_unlock_indexes(table);
+
+	dict_hdr_get_new_id(&new_id, NULL, NULL);
+
+	info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "space", (lint) table->space);
+	pars_info_add_dulint_literal(info, "old_id", table->id);
+	pars_info_add_dulint_literal(info, "new_id", new_id);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE RENUMBER_TABLESPACE_PROC () IS\n"
+			   "BEGIN\n"
+			   "UPDATE SYS_TABLES"
+			   " SET ID = :new_id, SPACE = :space\n"
+			   " WHERE ID = :old_id;\n"
+			   "UPDATE SYS_COLUMNS SET TABLE_ID = :new_id\n"
+			   " WHERE TABLE_ID = :old_id;\n"
+			   "UPDATE SYS_INDEXES"
+			   " SET TABLE_ID = :new_id, SPACE = :space\n"
+			   " WHERE TABLE_ID = :old_id;\n"
+			   "COMMIT WORK;\n"
+			   "END;\n"
+			   , FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx->error_state = DB_SUCCESS;
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Unable to assign a new identifier to table ",
+		      stderr);
+		ut_print_name(stderr, trx, TRUE, table->name);
+		fputs("\n"
+		      "InnoDB: after truncating it.  Background processes"
+		      " may corrupt the table!\n", stderr);
+		err = DB_ERROR;
+	} else {
+		dict_table_change_id_in_cache(table, new_id);
+	}
+
+	/* MySQL calls ha_innobase::reset_auto_increment() which does
+	the same thing. */
+	dict_table_autoinc_lock(table);
+	dict_table_autoinc_initialize(table, 1);
+	dict_table_autoinc_unlock(table);
+	dict_update_statistics(table, TRUE);
+
+	trx_commit_for_mysql(trx);
+
+funct_exit:
+	/* NOTE: the goto paths to here skip the trx_commit_for_mysql() above */
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	srv_wake_master_thread();
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Drops a table for MySQL.  If the name of the dropped table ends in
+one of "innodb_monitor", "innodb_lock_monitor", "innodb_tablespace_monitor",
+"innodb_table_monitor", then this will also stop the printing of monitor
+output by the master thread.  If the data dictionary was not already locked
+by the transaction, the transaction will be committed.  Otherwise, the
+data dictionary will remain locked.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_table_for_mysql(
+/*=====================*/
+	const char*	name,	/*!< in: table name */
+	trx_t*		trx,	/*!< in: transaction handle */
+	ibool		drop_db)/*!< in: TRUE=dropping whole database */
+{
+	dict_foreign_t*	foreign;
+	dict_table_t*	table;
+	ulint		space_id;
+	ulint		err;
+	const char*	table_name;
+	ulint		namelen;
+	ibool		locked_dictionary	= FALSE;
+	pars_info_t*    info			= NULL;
+
+	ut_a(name != NULL);
+
+	if (srv_created_new_raw) {
+		fputs("InnoDB: A new raw disk partition was initialized:\n"
+		      "InnoDB: we do not allow database modifications"
+		      " by the user.\n"
+		      "InnoDB: Shut down mysqld and edit my.cnf so that newraw"
+		      " is replaced with raw.\n", stderr);
+
+		return(DB_ERROR);
+	}
+
+	trx->op_info = "dropping table";
+
+	trx_start_if_not_started(trx);
+
+	/* The table name is prefixed with the database name and a '/'.
+	Certain table names starting with 'innodb_' have their special
+	meaning regardless of the database name.  Thus, we need to
+	ignore the database name prefix in the comparisons. */
+	table_name = strchr(name, '/');
+	ut_a(table_name);
+	table_name++;
+	namelen = strlen(table_name) + 1;
+
+	if (namelen == sizeof S_innodb_monitor
+	    && !memcmp(table_name, S_innodb_monitor,
+		       sizeof S_innodb_monitor)) {
+
+		/* Table name equals "innodb_monitor":
+		stop monitor prints */
+
+		srv_print_innodb_monitor = FALSE;
+		srv_print_innodb_lock_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_lock_monitor
+		   && !memcmp(table_name, S_innodb_lock_monitor,
+			      sizeof S_innodb_lock_monitor)) {
+		srv_print_innodb_monitor = FALSE;
+		srv_print_innodb_lock_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_tablespace_monitor
+		   && !memcmp(table_name, S_innodb_tablespace_monitor,
+			      sizeof S_innodb_tablespace_monitor)) {
+
+		srv_print_innodb_tablespace_monitor = FALSE;
+	} else if (namelen == sizeof S_innodb_table_monitor
+		   && !memcmp(table_name, S_innodb_table_monitor,
+			      sizeof S_innodb_table_monitor)) {
+
+		srv_print_innodb_table_monitor = FALSE;
+	}
+
+	/* Serialize data dictionary operations with dictionary mutex:
+	no deadlocks can occur then in these operations */
+
+	if (trx->dict_operation_lock_mode != RW_X_LATCH) {
+		/* Prevent foreign key checks etc. while we are dropping the
+		table */
+
+		row_mysql_lock_data_dictionary(trx);
+
+		locked_dictionary = TRUE;
+	}
+
+	ut_ad(mutex_own(&(dict_sys->mutex)));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
+#endif /* UNIV_SYNC_DEBUG */
+
+	table = dict_table_get_low(name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, name);
+		fputs(" does not exist in the InnoDB internal\n"
+		      "InnoDB: data dictionary though MySQL is"
+		      " trying to drop it.\n"
+		      "InnoDB: Have you copied the .frm file"
+		      " of the table to the\n"
+		      "InnoDB: MySQL database directory"
+		      " from another database?\n"
+		      "InnoDB: You can look for further help from\n"
+		      "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+		      stderr);
+		goto funct_exit;
+	}
+
+	/* Check if the table is referenced by foreign key constraints from
+	some other table (not the table itself) */
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign && foreign->foreign_table == table) {
+check_next_foreign:
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	if (foreign && trx->check_foreigns
+	    && !(drop_db && dict_tables_have_same_db(
+			 name, foreign->foreign_table_name))) {
+		FILE*	ef	= dict_foreign_err_file;
+
+		/* We only allow dropping a referenced table if
+		FOREIGN_KEY_CHECKS is set to 0 */
+
+		err = DB_CANNOT_DROP_CONSTRAINT;
+
+		mutex_enter(&dict_foreign_err_mutex);
+		rewind(ef);
+		ut_print_timestamp(ef);
+
+		fputs("  Cannot drop table ", ef);
+		ut_print_name(ef, trx, TRUE, name);
+		fputs("\n"
+		      "because it is referenced by ", ef);
+		ut_print_name(ef, trx, TRUE, foreign->foreign_table_name);
+		putc('\n', ef);
+		mutex_exit(&dict_foreign_err_mutex);
+
+		goto funct_exit;
+	}
+
+	if (foreign && trx->check_foreigns) {
+		goto check_next_foreign;
+	}
+
+	if (table->n_mysql_handles_opened > 0) {
+		ibool	added;
+
+		added = row_add_table_to_background_drop_list(table->name);
+
+		if (added) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Warning: MySQL is"
+			      " trying to drop table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs("\n"
+			      "InnoDB: though there are still"
+			      " open handles to it.\n"
+			      "InnoDB: Adding the table to the"
+			      " background drop queue.\n",
+			      stderr);
+
+			/* We return DB_SUCCESS to MySQL though the drop will
+			happen lazily later */
+			err = DB_SUCCESS;
+		} else {
+			/* The table is already in the background drop list */
+			err = DB_ERROR;
+		}
+
+		goto funct_exit;
+	}
+
+	/* TODO: could we replace the counter n_foreign_key_checks_running
+	with lock checks on the table? Acquire here an exclusive lock on the
+	table, and rewrite lock0lock.c and the lock wait in srv0srv.c so that
+	they can cope with the table having been dropped here? Foreign key
+	checks take an IS or IX lock on the table. */
+
+	if (table->n_foreign_key_checks_running > 0) {
+
+		const char*	table_name = table->name;
+		ibool		added;
+
+		added = row_add_table_to_background_drop_list(table_name);
+
+		if (added) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: You are trying to drop table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, table_name);
+			fputs("\n"
+			      "InnoDB: though there is a"
+			      " foreign key check running on it.\n"
+			      "InnoDB: Adding the table to"
+			      " the background drop queue.\n",
+			      stderr);
+
+			/* We return DB_SUCCESS to MySQL though the drop will
+			happen lazily later */
+
+			err = DB_SUCCESS;
+		} else {
+			/* The table is already in the background drop list */
+			err = DB_ERROR;
+		}
+
+		goto funct_exit;
+	}
+
+	/* Remove all locks there are on the table or its records */
+	lock_remove_all_on_table(table, TRUE);
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+	trx->table_id = table->id;
+
+	/* We use the private SQL parser of Innobase to generate the
+	query graphs needed in deleting the dictionary data from system
+	tables in Innobase. Deleting a row from SYS_INDEXES table also
+	frees the file segments of the B-tree associated with the index. */
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "table_name", name);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE DROP_TABLE_PROC () IS\n"
+			   "sys_foreign_id CHAR;\n"
+			   "table_id CHAR;\n"
+			   "index_id CHAR;\n"
+			   "foreign_id CHAR;\n"
+			   "found INT;\n"
+			   "BEGIN\n"
+			   "SELECT ID INTO table_id\n"
+			   "FROM SYS_TABLES\n"
+			   "WHERE NAME = :table_name\n"
+			   "LOCK IN SHARE MODE;\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "       RETURN;\n"
+			   "END IF;\n"
+			   "found := 1;\n"
+			   "SELECT ID INTO sys_foreign_id\n"
+			   "FROM SYS_TABLES\n"
+			   "WHERE NAME = 'SYS_FOREIGN'\n"
+			   "LOCK IN SHARE MODE;\n"
+			   "IF (SQL % NOTFOUND) THEN\n"
+			   "       found := 0;\n"
+			   "END IF;\n"
+			   "IF (:table_name = 'SYS_FOREIGN') THEN\n"
+			   "       found := 0;\n"
+			   "END IF;\n"
+			   "IF (:table_name = 'SYS_FOREIGN_COLS') THEN\n"
+			   "       found := 0;\n"
+			   "END IF;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "       SELECT ID INTO foreign_id\n"
+			   "       FROM SYS_FOREIGN\n"
+			   "       WHERE FOR_NAME = :table_name\n"
+			   "               AND TO_BINARY(FOR_NAME)\n"
+			   "                 = TO_BINARY(:table_name)\n"
+			   "               LOCK IN SHARE MODE;\n"
+			   "       IF (SQL % NOTFOUND) THEN\n"
+			   "               found := 0;\n"
+			   "       ELSE\n"
+			   "               DELETE FROM SYS_FOREIGN_COLS\n"
+			   "               WHERE ID = foreign_id;\n"
+			   "               DELETE FROM SYS_FOREIGN\n"
+			   "               WHERE ID = foreign_id;\n"
+			   "       END IF;\n"
+			   "END LOOP;\n"
+			   "found := 1;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "       SELECT ID INTO index_id\n"
+			   "       FROM SYS_INDEXES\n"
+			   "       WHERE TABLE_ID = table_id\n"
+			   "       LOCK IN SHARE MODE;\n"
+			   "       IF (SQL % NOTFOUND) THEN\n"
+			   "               found := 0;\n"
+			   "       ELSE\n"
+			   "               DELETE FROM SYS_STATS\n"
+			   "               WHERE INDEX_ID = index_id;\n"
+			   "               DELETE FROM SYS_FIELDS\n"
+			   "               WHERE INDEX_ID = index_id;\n"
+			   "               DELETE FROM SYS_INDEXES\n"
+			   "               WHERE ID = index_id\n"
+			   "               AND TABLE_ID = table_id;\n"
+			   "       END IF;\n"
+			   "END LOOP;\n"
+			   "DELETE FROM SYS_COLUMNS\n"
+			   "WHERE TABLE_ID = table_id;\n"
+			   "DELETE FROM SYS_TABLES\n"
+			   "WHERE ID = table_id;\n"
+			   "END;\n"
+			   , FALSE, trx);
+
+	switch (err) {
+		ibool		is_temp;
+		const char*	name_or_path;
+		mem_heap_t*	heap;
+
+	case DB_SUCCESS:
+
+		heap = mem_heap_create(200);
+
+		/* Clone the name, in case it has been allocated
+		from table->heap, which will be freed by
+		dict_table_remove_from_cache(table) below. */
+		name = mem_heap_strdup(heap, name);
+		space_id = table->space;
+
+		if (table->dir_path_of_temp_table != NULL) {
+			name_or_path = mem_heap_strdup(
+				heap, table->dir_path_of_temp_table);
+			is_temp = TRUE;
+		} else {
+			name_or_path = name;
+			is_temp = (table->flags >> DICT_TF2_SHIFT)
+				& DICT_TF2_TEMPORARY;
+		}
+
+		dict_table_remove_from_cache(table);
+
+		if (dict_load_table(name) != NULL) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error: not able to remove table ",
+			      stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fputs(" from the dictionary cache!\n", stderr);
+			err = DB_ERROR;
+		}
+
+		/* Do not drop possible .ibd tablespace if something went
+		wrong: we do not want to delete valuable data of the user */
+
+		if (err == DB_SUCCESS && !trx_sys_sys_space(space_id)) {
+			if (!fil_space_for_table_exists_in_mem(space_id,
+							       name_or_path,
+							       is_temp, FALSE,
+							       !is_temp)) {
+				err = DB_SUCCESS;
+
+				fprintf(stderr,
+					"InnoDB: We removed now the InnoDB"
+					" internal data dictionary entry\n"
+					"InnoDB: of table ");
+				ut_print_name(stderr, trx, TRUE, name);
+				fprintf(stderr, ".\n");
+			} else if (!fil_delete_tablespace(space_id)) {
+				fprintf(stderr,
+					"InnoDB: We removed now the InnoDB"
+					" internal data dictionary entry\n"
+					"InnoDB: of table ");
+				ut_print_name(stderr, trx, TRUE, name);
+				fprintf(stderr, ".\n");
+
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Error: not able to"
+					" delete tablespace %lu of table ",
+					(ulong) space_id);
+				ut_print_name(stderr, trx, TRUE, name);
+				fputs("!\n", stderr);
+				err = DB_ERROR;
+			}
+		}
+
+		mem_heap_free(heap);
+		break;
+
+	case DB_TOO_MANY_CONCURRENT_TRXS:
+		/* Cannot even find a free slot for
+		the undo log. We can directly exit here
+		and return the DB_TOO_MANY_CONCURRENT_TRXS
+		error. */
+		break;
+
+	case DB_OUT_OF_FILE_SPACE:
+		err = DB_MUST_GET_MORE_FILE_SPACE;
+
+		row_mysql_handle_errors(&err, trx, NULL, NULL);
+
+		/* Fall through to raise error */
+
+	default:
+		/* No other possible error returns */
+		ut_error;
+	}
+
+funct_exit:
+
+	if (locked_dictionary) {
+		trx_commit_for_mysql(trx);
+
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	trx->op_info = "";
+
+	srv_wake_master_thread();
+
+	return((int) err);
+}
+
+/*********************************************************************//**
+Drop all temporary tables found in SYS_TABLES during crash recovery. */
+UNIV_INTERN
+void
+row_mysql_drop_temp_tables(void)
+/*============================*/
+{
+	trx_t*		trx;
+	btr_pcur_t	pcur;
+	mtr_t		mtr;
+	mem_heap_t*	heap;
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "dropping temporary tables";
+	row_mysql_lock_data_dictionary(trx);
+
+	heap = mem_heap_create(200);	/* holds the copied table names */
+
+	mtr_start(&mtr);
+
+	btr_pcur_open_at_index_side(
+		TRUE,
+		dict_table_get_first_index(dict_sys->sys_tables),
+		BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+	for (;;) {
+		const rec_t*	rec;
+		const byte*	field;
+		ulint		len;
+		const char*	table_name;
+		dict_table_t*	table;
+
+		btr_pcur_move_to_next_user_rec(&pcur, &mtr);
+
+		if (!btr_pcur_is_on_user_rec(&pcur)) {
+			break;	/* no more user records */
+		}
+
+		rec = btr_pcur_get_rec(&pcur);
+		field = rec_get_nth_field_old(rec, 4/*N_COLS*/, &len);
+		if (len != 4 || !(mach_read_from_4(field) & 0x80000000UL)) {
+			continue;	/* N_COLS lacks the high bit */
+		}
+
+		/* Because this is not a ROW_FORMAT=REDUNDANT table,
+		the is_temp flag in MIX_LEN is valid.  Examine it. */
+
+		field = rec_get_nth_field_old(rec, 7/*MIX_LEN*/, &len);
+		if (len != 4
+		    || !(mach_read_from_4(field) & DICT_TF2_TEMPORARY)) {
+			continue;
+		}
+
+		/* This is a temporary table.  Fetch its name. */
+		field = rec_get_nth_field_old(rec, 0/*NAME*/, &len);
+		if (len == UNIV_SQL_NULL || len == 0) {
+			/* Corrupted SYS_TABLES.NAME */
+			continue;
+		}
+
+		table_name = mem_heap_strdupl(heap, (const char*) field, len);
+
+		btr_pcur_store_position(&pcur, &mtr);
+		btr_pcur_commit_specify_mtr(&pcur, &mtr);
+
+		table = dict_load_table(table_name);
+
+		if (table) {
+			row_drop_table_for_mysql(table_name, trx, FALSE);
+			trx_commit_for_mysql(trx);
+		}
+
+		mtr_start(&mtr);	/* resume at the stored position */
+		btr_pcur_restore_position(BTR_SEARCH_LEAF,
+					  &pcur, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+	row_mysql_unlock_data_dictionary(trx);
+	trx_free_for_background(trx);
+}
+
+/*******************************************************************//**
+Drop all foreign keys of the tables in a database, see Bug#18942.
+Invoked by row_drop_database_for_mysql() once the tables are dropped.
+@return	error code or DB_SUCCESS */
+static
+ulint
+drop_all_foreign_keys_in_db(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends to '/' */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	pars_info_t*	info;
+	ulint		ret;
+
+	ut_a(name[strlen(name) - 1] == '/');
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "dbname", name);
+
+/** SQL condition: for_name does not start with :dbname */
+#define TABLE_NOT_IN_THIS_DB \
+"SUBSTR(for_name, 0, LENGTH(:dbname)) <> :dbname"
+
+	ret = que_eval_sql(info,
+			   "PROCEDURE DROP_ALL_FOREIGN_KEYS_PROC () IS\n"
+			   "foreign_id CHAR;\n"
+			   "for_name CHAR;\n"
+			   "found INT;\n"
+			   "DECLARE CURSOR cur IS\n"
+			   "SELECT ID, FOR_NAME FROM SYS_FOREIGN\n"
+			   "WHERE FOR_NAME >= :dbname\n"
+			   "LOCK IN SHARE MODE\n"
+			   "ORDER BY FOR_NAME;\n"
+			   "BEGIN\n"
+			   "found := 1;\n"
+			   "OPEN cur;\n"
+			   "WHILE found = 1 LOOP\n"
+			   "        FETCH cur INTO foreign_id, for_name;\n"
+			   "        IF (SQL % NOTFOUND) THEN\n"
+			   "                found := 0;\n"
+			   "        ELSIF (" TABLE_NOT_IN_THIS_DB ") THEN\n"
+			   "                found := 0;\n"
+			   "        ELSIF (1=1) THEN\n"
+			   "                DELETE FROM SYS_FOREIGN_COLS\n"
+			   "                WHERE ID = foreign_id;\n"
+			   "                DELETE FROM SYS_FOREIGN\n"
+			   "                WHERE ID = foreign_id;\n"
+			   "        END IF;\n"
+			   "END LOOP;\n"
+			   "CLOSE cur;\n"
+			   "COMMIT WORK;\n"
+			   "END;\n",
+			   FALSE, /* the dict mutex is not reserved here:
+				  we are already holding it */
+			   trx);
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Drops a database for MySQL: all its tables, then any leftover foreign keys.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+row_drop_database_for_mysql(
+/*========================*/
+	const char*	name,	/*!< in: database name which ends to '/' */
+	trx_t*		trx)	/*!< in: transaction handle */
+{
+	dict_table_t* table;
+	char*	table_name;
+	int	err	= DB_SUCCESS;
+	ulint	namelen;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_a(name != NULL);
+	namelen = strlen(name);	/* safe only after the NULL check */
+	ut_a(namelen > 0 && name[namelen - 1] == '/');
+
+	trx->op_info = "dropping database";
+
+	trx_start_if_not_started(trx);
+loop:
+	row_mysql_lock_data_dictionary(trx);
+
+	while ((table_name = dict_get_first_table_name_in_db(name))) {
+		ut_a(memcmp(table_name, name, namelen) == 0);
+
+		table = dict_table_get_low(table_name);
+
+		ut_a(table);
+
+		/* Wait until MySQL has no queries running on the table */
+
+		if (table->n_mysql_handles_opened > 0) {
+			row_mysql_unlock_data_dictionary(trx);
+
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Warning: MySQL is trying to"
+			      " drop database ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fputs("\n"
+			      "InnoDB: though there are still"
+			      " open handles to table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table_name);
+			fputs(".\n", stderr);
+
+			os_thread_sleep(1000000);
+
+			mem_free(table_name);
+
+			goto loop;
+		}
+
+		err = row_drop_table_for_mysql(table_name, trx, TRUE);
+		trx_commit_for_mysql(trx);
+
+		if (err != DB_SUCCESS) {
+			fputs("InnoDB: DROP DATABASE ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fprintf(stderr, " failed with error %lu for table ",
+				(ulong) err);
+			ut_print_name(stderr, trx, TRUE, table_name);
+			putc('\n', stderr);
+			mem_free(table_name);
+			break;
+		}
+
+		mem_free(table_name);
+	}
+
+	if (err == DB_SUCCESS) {
+		/* after dropping all tables try to drop all leftover
+		foreign keys in case orphaned ones exist */
+		err = (int) drop_all_foreign_keys_in_db(name, trx);
+
+		if (err != DB_SUCCESS) {
+			fputs("InnoDB: DROP DATABASE ", stderr);
+			ut_print_name(stderr, trx, TRUE, name);
+			fprintf(stderr, " failed with error %d while "
+				"dropping all foreign keys\n", err);
+		}
+	}
+
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Tells whether a table name denotes a MySQL temporary table, that is,
+whether the string "/#sql" occurs somewhere in it.
+@return	TRUE if temporary table */
+static
+ibool
+row_is_mysql_tmp_table_name(
+/*========================*/
+	const char*	name)	/*!< in: table name in the form
+				'database/tablename' */
+{
+	/* Only "/#sql" is searched for, not the form "/@0023sql". */
+	return(NULL != strstr(name, "/#sql"));
+}
+
+/****************************************************************//**
+Delete the SYS_FOREIGN_COLS and SYS_FOREIGN rows of one constraint id.
+@return	error code or DB_SUCCESS */
+static
+int
+row_delete_constraint_low(
+/*======================*/
+	const char*	id,		/*!< in: constraint id */
+	trx_t*		trx)		/*!< in: transaction handle */
+{
+	pars_info_t*	pinfo = pars_info_create();
+
+	pars_info_add_str_literal(pinfo, "id", id);
+
+	return((int) que_eval_sql(pinfo,
+			    "PROCEDURE DELETE_CONSTRAINT () IS\n"
+			    "BEGIN\n"
+			    "DELETE FROM SYS_FOREIGN_COLS WHERE ID = :id;\n"
+			    "DELETE FROM SYS_FOREIGN WHERE ID = :id;\n"
+			    "END;\n"
+			    , FALSE, trx));
+}
+
+/****************************************************************//**
+Delete a single constraint, trying both the new and the old id format.
+@return	error code or DB_SUCCESS */
+static
+int
+row_delete_constraint(
+/*==================*/
+	const char*	id,		/*!< in: constraint id */
+	const char*	database_name,	/*!< in: database name, with the
+					trailing '/' */
+	mem_heap_t*	heap,		/*!< in: memory heap */
+	trx_t*		trx)		/*!< in: transaction handle */
+{
+	ulint		error;
+
+	/* New format constraints have ids <databasename>/<constraintname>. */
+	error = row_delete_constraint_low(
+		mem_heap_strcat(heap, database_name, id), trx);
+
+	if (error != DB_SUCCESS || strchr(id, '/') != NULL) {
+		/* Either the delete failed, or the name contains a '/':
+		in the latter case we must not retry with the bare id,
+		because deleting a new format constraint named 'foo/bar'
+		from database 'baz' would remove constraint 'bar' from
+		database 'foo', if it existed. */
+
+		return((int) error);
+	}
+
+	/* Old format < 4.0.18 constraints have ids NUMBER_NUMBER. */
+	return(row_delete_constraint_low(id, trx));
+}
+
+/*********************************************************************//**
+Renames a table for MySQL in the data dictionary and in its cache.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+ulint
+row_rename_table_for_mysql(
+/*=======================*/
+	const char*	old_name,	/*!< in: old table name */
+	const char*	new_name,	/*!< in: new table name */
+	trx_t*		trx,		/*!< in: transaction handle */
+	ibool		commit)		/*!< in: if TRUE then commit trx */
+{
+	dict_table_t*	table;
+	ulint		err			= DB_ERROR;
+	mem_heap_t*	heap			= NULL;
+	const char**	constraints_to_drop	= NULL;
+	ulint		n_constraints_to_drop	= 0;
+	ibool		old_is_tmp, new_is_tmp;
+	pars_info_t*	info			= NULL;
+
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+	ut_a(old_name != NULL);
+	ut_a(new_name != NULL);
+
+	if (srv_created_new_raw || srv_force_recovery) {
+		fputs("InnoDB: A new raw disk partition was initialized or\n"
+		      "InnoDB: innodb_force_recovery is on: we do not allow\n"
+		      "InnoDB: database modifications by the user. Shut down\n"
+		      "InnoDB: mysqld and edit my.cnf so that newraw"
+		      " is replaced\n"
+		      "InnoDB: with raw, and innodb_force_... is removed.\n",
+		      stderr);
+
+		goto funct_exit;
+	} else if (row_mysql_is_system_table(new_name)) {
+
+		fprintf(stderr,
+			"InnoDB: Error: trying to create a MySQL"
+			" system table %s of type InnoDB.\n"
+			"InnoDB: MySQL system tables must be"
+			" of the MyISAM type!\n",
+			new_name);
+
+		goto funct_exit;
+	}
+
+	trx->op_info = "renaming table";
+	trx_start_if_not_started(trx);
+
+	old_is_tmp = row_is_mysql_tmp_table_name(old_name);
+	new_is_tmp = row_is_mysql_tmp_table_name(new_name);
+
+	table = dict_table_get_low(old_name);
+
+	if (!table) {
+		err = DB_TABLE_NOT_FOUND;
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, old_name);
+		fputs(" does not exist in the InnoDB internal\n"
+		      "InnoDB: data dictionary though MySQL is"
+		      " trying to rename the table.\n"
+		      "InnoDB: Have you copied the .frm file"
+		      " of the table to the\n"
+		      "InnoDB: MySQL database directory"
+		      " from another database?\n"
+		      "InnoDB: You can look for further help from\n"
+		      "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+		      stderr);
+		goto funct_exit;
+	} else if (table->ibd_file_missing) {
+		err = DB_TABLE_NOT_FOUND;
+		ut_print_timestamp(stderr);
+
+		fputs("  InnoDB: Error: table ", stderr);
+		ut_print_name(stderr, trx, TRUE, old_name);
+		fputs(" does not have an .ibd file"
+		      " in the database directory.\n"
+		      "InnoDB: You can look for further help from\n"
+		      "InnoDB: " REFMAN "innodb-troubleshooting.html\n",
+		      stderr);
+		goto funct_exit;
+	} else if (new_is_tmp) {
+		/* MySQL is doing an ALTER TABLE command and it renames the
+		original table to a temporary table name. We want to preserve
+		the original foreign key constraint definitions despite the
+		name change. An exception is those constraints for which
+		the ALTER TABLE contained DROP FOREIGN KEY <foreign key id>.*/
+
+		heap = mem_heap_create(100);
+
+		err = dict_foreign_parse_drop_constraints(
+			heap, trx, table, &n_constraints_to_drop,
+			&constraints_to_drop);
+
+		if (err != DB_SUCCESS) {
+
+			goto funct_exit;
+		}
+	}
+
+	/* We use the private SQL parser of Innobase to generate the query
+	graphs needed in updating the dictionary data from system tables. */
+
+	info = pars_info_create();
+
+	pars_info_add_str_literal(info, "new_table_name", new_name);
+	pars_info_add_str_literal(info, "old_table_name", old_name);
+
+	err = que_eval_sql(info,
+			   "PROCEDURE RENAME_TABLE () IS\n"
+			   "BEGIN\n"
+			   "UPDATE SYS_TABLES SET NAME = :new_table_name\n"
+			   " WHERE NAME = :old_table_name;\n"
+			   "END;\n"
+			   , FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+
+		goto end;
+	} else if (!new_is_tmp) {
+		/* Rename all constraints: update FOR_NAME, ID and REF_NAME. */
+
+		info = pars_info_create();
+
+		pars_info_add_str_literal(info, "new_table_name", new_name);
+		pars_info_add_str_literal(info, "old_table_name", old_name);
+
+		err = que_eval_sql(
+			info,
+			"PROCEDURE RENAME_CONSTRAINT_IDS () IS\n"
+			"gen_constr_prefix CHAR;\n"
+			"new_db_name CHAR;\n"
+			"foreign_id CHAR;\n"
+			"new_foreign_id CHAR;\n"
+			"old_db_name_len INT;\n"
+			"old_t_name_len INT;\n"
+			"new_db_name_len INT;\n"
+			"id_len INT;\n"
+			"found INT;\n"
+			"BEGIN\n"
+			"found := 1;\n"
+			"old_db_name_len := INSTR(:old_table_name, '/')-1;\n"
+			"new_db_name_len := INSTR(:new_table_name, '/')-1;\n"
+			"new_db_name := SUBSTR(:new_table_name, 0,\n"
+			"                      new_db_name_len);\n"
+			"old_t_name_len := LENGTH(:old_table_name);\n"
+			"gen_constr_prefix := CONCAT(:old_table_name,\n"
+			"                            '_ibfk_');\n"
+			"WHILE found = 1 LOOP\n"
+			"       SELECT ID INTO foreign_id\n"
+			"        FROM SYS_FOREIGN\n"
+			"        WHERE FOR_NAME = :old_table_name\n"
+			"         AND TO_BINARY(FOR_NAME)\n"
+			"           = TO_BINARY(:old_table_name)\n"
+			"         LOCK IN SHARE MODE;\n"
+			"       IF (SQL % NOTFOUND) THEN\n"
+			"        found := 0;\n"
+			"       ELSE\n"
+			"        UPDATE SYS_FOREIGN\n"
+			"        SET FOR_NAME = :new_table_name\n"
+			"         WHERE ID = foreign_id;\n"
+			"        id_len := LENGTH(foreign_id);\n"
+			"        IF (INSTR(foreign_id, '/') > 0) THEN\n"
+			"               IF (INSTR(foreign_id,\n"
+			"                         gen_constr_prefix) > 0)\n"
+			"               THEN\n"
+			"                new_foreign_id :=\n"
+			"                CONCAT(:new_table_name,\n"
+			"                SUBSTR(foreign_id, old_t_name_len,\n"
+			"                       id_len - old_t_name_len));\n"
+			"               ELSE\n"
+			"                new_foreign_id :=\n"
+			"                CONCAT(new_db_name,\n"
+			"                SUBSTR(foreign_id,\n"
+			"                       old_db_name_len,\n"
+			"                       id_len - old_db_name_len));\n"
+			"               END IF;\n"
+			"               UPDATE SYS_FOREIGN\n"
+			"                SET ID = new_foreign_id\n"
+			"                WHERE ID = foreign_id;\n"
+			"               UPDATE SYS_FOREIGN_COLS\n"
+			"                SET ID = new_foreign_id\n"
+			"                WHERE ID = foreign_id;\n"
+			"        END IF;\n"
+			"       END IF;\n"
+			"END LOOP;\n"
+			"UPDATE SYS_FOREIGN SET REF_NAME = :new_table_name\n"
+			"WHERE REF_NAME = :old_table_name\n"
+			"  AND TO_BINARY(REF_NAME)\n"
+			"    = TO_BINARY(:old_table_name);\n"
+			"END;\n"
+			, FALSE, trx);
+
+	} else if (n_constraints_to_drop > 0) {
+		/* Drop the constraints parsed from the ALTER TABLE above. */
+
+		ulint	db_name_len = dict_get_db_name_len(old_name) + 1;
+		char*	db_name = mem_heap_strdupl(heap, old_name,
+						   db_name_len);
+		ulint	i;
+
+		for (i = 0; i < n_constraints_to_drop; i++) {
+			err = row_delete_constraint(constraints_to_drop[i],
+						    db_name, heap, trx);
+
+			if (err != DB_SUCCESS) {
+				break;
+			}
+		}
+	}
+
+end:
+	if (err != DB_SUCCESS) {
+		if (err == DB_DUPLICATE_KEY) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: Error; possible reasons:\n"
+			      "InnoDB: 1) Table rename would cause"
+			      " two FOREIGN KEY constraints\n"
+			      "InnoDB: to have the same internal name"
+			      " in case-insensitive comparison.\n"
+			      "InnoDB: 2) table ", stderr);
+			ut_print_name(stderr, trx, TRUE, new_name);
+			fputs(" exists in the InnoDB internal data\n"
+			      "InnoDB: dictionary though MySQL is"
+			      " trying to rename table ", stderr);
+			ut_print_name(stderr, trx, TRUE, old_name);
+			fputs(" to it.\n"
+			      "InnoDB: Have you deleted the .frm file"
+			      " and not used DROP TABLE?\n"
+			      "InnoDB: You can look for further help from\n"
+			      "InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			      "InnoDB: If table ", stderr);
+			ut_print_name(stderr, trx, TRUE, new_name);
+			fputs(" is a temporary table #sql..., then"
+			      " it can be that\n"
+			      "InnoDB: there are still queries running"
+			      " on the table, and it will be\n"
+			      "InnoDB: dropped automatically when"
+			      " the queries end.\n"
+			      "InnoDB: You can drop the orphaned table"
+			      " inside InnoDB by\n"
+			      "InnoDB: creating an InnoDB table with"
+			      " the same name in another\n"
+			      "InnoDB: database and copying the .frm file"
+			      " to the current database.\n"
+			      "InnoDB: Then MySQL thinks the table exists,"
+			      " and DROP TABLE will\n"
+			      "InnoDB: succeed.\n", stderr);
+		}
+		trx->error_state = DB_SUCCESS;
+		trx_general_rollback_for_mysql(trx, NULL);
+		trx->error_state = DB_SUCCESS;
+	} else {
+		/* The following call will also rename the .ibd data file if
+		the table is stored in a single-table tablespace */
+
+		if (!dict_table_rename_in_cache(table, new_name,
+						!new_is_tmp)) {
+			trx->error_state = DB_SUCCESS;
+			trx_general_rollback_for_mysql(trx, NULL);
+			trx->error_state = DB_SUCCESS;
+			goto funct_exit;
+		}
+
+		/* We only want to switch off some of the type checking in
+		an ALTER, not in a RENAME. */
+
+		err = dict_load_foreigns(
+			new_name, FALSE, !old_is_tmp || trx->check_foreigns);
+
+		if (err != DB_SUCCESS) {
+			ut_print_timestamp(stderr);
+
+			if (old_is_tmp) {
+				fputs("  InnoDB: Error: in ALTER TABLE ",
+				      stderr);
+				ut_print_name(stderr, trx, TRUE, new_name);
+				fputs("\n"
+				      "InnoDB: has or is referenced"
+				      " in foreign key constraints\n"
+				      "InnoDB: which are not compatible"
+				      " with the new table definition.\n",
+				      stderr);
+			} else {
+				fputs("  InnoDB: Error: in RENAME TABLE"
+				      " table ",
+				      stderr);
+				ut_print_name(stderr, trx, TRUE, new_name);
+				fputs("\n"
+				      "InnoDB: is referenced in"
+				      " foreign key constraints\n"
+				      "InnoDB: which are not compatible"
+				      " with the new table definition.\n",
+				      stderr);
+			}
+
+			ut_a(dict_table_rename_in_cache(table,
+							old_name, FALSE));
+			trx->error_state = DB_SUCCESS;
+			trx_general_rollback_for_mysql(trx, NULL);
+			trx->error_state = DB_SUCCESS;
+		}
+	}
+
+funct_exit:
+
+	if (commit) {
+		trx_commit_for_mysql(trx);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*********************************************************************//**
+Checks that the index contains entries in an ascending order, unique
+constraint is not broken, and calculates the number of index entries
+in the read view of the current transaction.
+@return	TRUE if ok */
+UNIV_INTERN
+ibool
+row_check_index_for_mysql(
+/*======================*/
+	row_prebuilt_t*		prebuilt,	/*!< in: prebuilt struct
+						in MySQL handle */
+	const dict_index_t*	index,		/*!< in: index */
+	ulint*			n_rows)		/*!< out: number of entries
+						seen in the consistent read */
+{
+	dtuple_t*	prev_entry	= NULL;
+	ulint		matched_fields;
+	ulint		matched_bytes;
+	byte*		buf;
+	ulint		ret;
+	rec_t*		rec;
+	ibool		is_ok		= TRUE;
+	int		cmp;
+	ibool		contains_null;
+	ulint		i;
+	ulint		cnt;
+	mem_heap_t*	heap		= NULL;
+	ulint		n_ext;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets;
+	rec_offs_init(offsets_);
+
+	*n_rows = 0;
+
+	buf = mem_alloc(UNIV_PAGE_SIZE);
+	heap = mem_heap_create(100);
+
+	cnt = 1000;
+
+	ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, 0);
+loop:
+	/* Check thd->killed every 1,000 scanned rows */
+	if (--cnt == 0) {
+		if (trx_is_interrupted(prebuilt->trx)) {
+			goto func_exit;
+		}
+		cnt = 1000;
+	}
+
+	switch (ret) {
+	case DB_SUCCESS:
+		break;
+	default:
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Warning: CHECK TABLE on ", stderr);
+		dict_index_name_print(stderr, prebuilt->trx, index);
+		fprintf(stderr, " returned %lu\n", (ulong) ret);
+		/* fall through (this error is ignored by CHECK TABLE) */
+	case DB_END_OF_INDEX:
+func_exit:
+		mem_free(buf);
+		mem_heap_free(heap);
+
+		return(is_ok);
+	}
+
+	*n_rows = *n_rows + 1;
+
+	/* row_search... returns the index record in buf, record origin offset
+	within buf stored in the first 4 bytes, because we have built a dummy
+	template */
+
+	rec = buf + mach_read_from_4(buf);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+
+	if (prev_entry != NULL) {
+		matched_fields = 0;
+		matched_bytes = 0;
+
+		cmp = cmp_dtuple_rec_with_match(prev_entry, rec, offsets,
+						&matched_fields,
+						&matched_bytes);
+		contains_null = FALSE;
+
+		/* In a unique secondary index we allow equal key values if
+		they contain SQL NULLs */
+
+		for (i = 0;
+		     i < dict_index_get_n_ordering_defined_by_user(index);
+		     i++) {
+			if (UNIV_SQL_NULL == dfield_get_len(
+				    dtuple_get_nth_field(prev_entry, i))) {
+
+				contains_null = TRUE;
+			}
+		}
+
+		if (cmp > 0) {
+			fputs("InnoDB: index records in a wrong order in ",
+			      stderr);
+not_ok:
+			dict_index_name_print(stderr,
+					      prebuilt->trx, index);
+			fputs("\n"
+			      "InnoDB: prev record ", stderr);
+			dtuple_print(stderr, prev_entry);
+			fputs("\n"
+			      "InnoDB: record ", stderr);
+			rec_print_new(stderr, rec, offsets);
+			putc('\n', stderr);
+			is_ok = FALSE;
+		} else if (dict_index_is_unique(index)
+			   && !contains_null
+			   && matched_fields
+			   >= dict_index_get_n_ordering_defined_by_user(
+				   index)) {
+
+			fputs("InnoDB: duplicate key in ", stderr);
+			goto not_ok;
+		}
+	}
+
+	{
+		mem_heap_t*	tmp_heap = NULL;
+
+		/* Empty the heap on each round.  But preserve offsets[]
+		for the row_rec_to_index_entry() call, by copying them
+		into a separate memory heap when needed. */
+		if (UNIV_UNLIKELY(offsets != offsets_)) {
+			ulint	size = rec_offs_get_n_alloc(offsets)
+				* sizeof *offsets;
+
+			tmp_heap = mem_heap_create(size);
+			offsets = mem_heap_dup(tmp_heap, offsets, size);
+		}
+
+		mem_heap_empty(heap);
+
+		prev_entry = row_rec_to_index_entry(ROW_COPY_DATA, rec,
+						    index, offsets,
+						    &n_ext, heap);
+
+		if (UNIV_LIKELY_NULL(tmp_heap)) {
+			mem_heap_free(tmp_heap);
+		}
+	}
+
+	ret = row_search_for_mysql(buf, PAGE_CUR_G, prebuilt, 0, ROW_SEL_NEXT);
+
+	goto loop;
+}
+
+/*********************************************************************//**
+Determines if a table is a magic monitor table.
+@return	TRUE if monitor table */
+UNIV_INTERN
+ibool
+row_is_magic_monitor_table(
+/*=======================*/
+	const char*	table_name)	/*!< in: name of the table, in the
+					form database/table_name */
+{
+	const char*	slash;	/* the first '/' in table_name */
+	const char*	base;	/* table_name without database/ */
+	ulint		size;	/* length of base, plus one */
+
+	slash = strchr(table_name, '/');
+	ut_a(slash != NULL);
+
+	base = slash + 1;
+	size = strlen(base) + 1;
+
+	/* Compare the part after the '/' against each of the S_innodb_...
+	names.  The size handed to STR_EQ is the string length plus one,
+	as computed above. */
+	return(STR_EQ(base, size, S_innodb_monitor)
+	       || STR_EQ(base, size, S_innodb_lock_monitor)
+	       || STR_EQ(base, size, S_innodb_tablespace_monitor)
+	       || STR_EQ(base, size, S_innodb_table_monitor)
+	       || STR_EQ(base, size, S_innodb_mem_validate));
+}
diff --git a/storage/xtradb/row/row0purge.c b/storage/xtradb/row/row0purge.c
new file mode 100644
index 00000000000..835af990672
--- /dev/null
+++ b/storage/xtradb/row/row0purge.c
@@ -0,0 +1,700 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0purge.c
+Purge obsolete records
+
+Created 3/14/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0purge.h"
+
+#ifdef UNIV_NONINL
+#include "row0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "row0vers.h"
+#include "row0mysql.h"
+#include "log0log.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check beforehand
+that there is enough space in the redo log for that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST NOT hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/********************************************************************//**
+Allocates a purge node from a heap and attaches it to a query graph.
+@return	own: purge node */
+UNIV_INTERN
+purge_node_t*
+row_purge_node_create(
+/*==================*/
+	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
+	mem_heap_t*	heap)	/*!< in: memory heap where created */
+{
+	purge_node_t*	purge_node;
+
+	ut_ad(parent && heap);
+
+	purge_node = mem_heap_alloc(heap, sizeof(*purge_node));
+	purge_node->heap = mem_heap_create(256);
+
+	/* Link the node to its parent in the query graph */
+	purge_node->common.parent = parent;
+	purge_node->common.type = QUE_NODE_PURGE;
+
+	return(purge_node);
+}
+
+/***********************************************************//**
+Positions the pcur of the purge node on the clustered index record,
+either by a fresh search or by restoring a stored position.
+@return	TRUE if the record was found */
+static
+ibool
+row_purge_reposition_pcur(
+/*======================*/
+	ulint		mode,	/*!< in: latching mode */
+	purge_node_t*	node,	/*!< in: row purge node */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	btr_pcur_t*	pcur	= &(node->pcur);
+	ibool		found;
+
+	if (!node->found_clust) {
+		/* Not found so far: search by the row reference */
+		found = row_search_on_row_ref(pcur, mode, node->table,
+					      node->ref, mtr);
+		node->found_clust = found;
+		if (found) {
+			btr_pcur_store_position(pcur, mtr);
+		}
+
+		return(found);
+	}
+
+	/* Already found once: restore the stored position */
+	return(btr_pcur_restore_position(mode, pcur, mtr));
+}
+
+/***********************************************************//**
+Removes a delete marked clustered index record if possible.
+@return TRUE if success, or if not found, or if modified after the
+delete marking; FALSE if the delete did not succeed */
+static
+ibool
+row_purge_remove_clust_if_poss_low(
+/*===============================*/
+	purge_node_t*	node,	/*!< in: row purge node */
+	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	dict_index_t*	index;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ulint		err;
+	mtr_t		mtr;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs_init(offsets_);
+
+	index = dict_table_get_first_index(node->table);
+
+	pcur = &(node->pcur);
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	log_free_check();	/* done before the mtr is started */
+	mtr_start(&mtr);
+
+	success = row_purge_reposition_pcur(mode, node, &mtr);
+
+	if (!success) {
+		/* The record is already removed */
+
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		return(TRUE);
+	}
+
+	rec = btr_pcur_get_rec(pcur);
+
+	if (0 != ut_dulint_cmp(node->roll_ptr, row_get_rec_roll_ptr(
+				       rec, index, rec_get_offsets(
+					       rec, index, offsets_,
+					       ULINT_UNDEFINED, &heap)))) {
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+		/* The roll ptr differs: someone else has modified the
+		record later, so do not remove it */
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		return(TRUE);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	if (mode == BTR_MODIFY_LEAF) {
+		success = btr_cur_optimistic_delete(btr_cur, &mtr);
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+					   RB_NONE, &mtr);
+
+		if (err == DB_SUCCESS) {
+			success = TRUE;
+		} else if (err == DB_OUT_OF_FILE_SPACE) {
+			success = FALSE;	/* reported to the caller */
+		} else {
+			ut_error;
+		}
+	}
+
+	btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+	return(success);
+}
+
+/***********************************************************//**
+Purges a clustered index record, unless it was modified after the delete
+marking.  Asserts that the removal eventually succeeds. */
+static
+void
+row_purge_remove_clust_if_poss(
+/*===========================*/
+	purge_node_t*	node)	/*!< in: row purge node */
+{
+	ibool	removed;
+	ulint	n_retries;
+
+	/* Try first with BTR_MODIFY_LEAF, then fall back to the tree */
+	if (row_purge_remove_clust_if_poss_low(node, BTR_MODIFY_LEAF)) {
+
+		return;
+	}
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+	for (n_retries = 0; ; n_retries++) {
+		removed = row_purge_remove_clust_if_poss_low(
+			node, BTR_MODIFY_TREE);
+
+		if (removed || n_retries >= BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+			break;
+		}
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+	}
+
+	ut_a(removed);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible.
+@return	TRUE if success, if not found, or if the entry is still needed */
+static
+ibool
+row_purge_remove_sec_if_poss_low(
+/*=============================*/
+	purge_node_t*	node,	/*!< in: row purge node */
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry,	/*!< in: index entry */
+	ulint		mode)	/*!< in: latch mode BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */
+{
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ibool		old_has = 0; /* remove warning */
+	ibool		found;
+	ulint		err;
+	mtr_t		mtr;
+	mtr_t		mtr_vers;
+
+	log_free_check();	/* done before the mtr is started */
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	if (!found) {
+		/* Not found.  This is a legitimate condition.  In a
+		rollback, InnoDB will remove secondary recs that would
+		be purged anyway.  Then the actual purge will not find
+		the secondary index record.  Also, the purge itself is
+		eager: if it comes to consider a secondary index
+		record, and notices it does not need to exist in the
+		index, it will remove it.  Then if/when the purge
+		comes to consider the secondary index record a second
+		time, it will not exist any more in the index. */
+
+		/* fputs("PURGE:........sec entry not found\n", stderr); */
+		/* dtuple_print(stderr, entry); */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(TRUE);
+	}
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	/* We should remove the index record if no later version of the row,
+	which cannot be purged yet, requires its existence. If some version
+	requires it, we should do nothing. */
+
+	mtr_start(&mtr_vers);
+
+	success = row_purge_reposition_pcur(BTR_SEARCH_LEAF, node, &mtr_vers);
+
+	if (success) {
+		old_has = row_vers_old_has_index_entry(
+			TRUE, btr_pcur_get_rec(&(node->pcur)),
+			&mtr_vers, index, entry);
+	}
+
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+
+	if (!success || !old_has) {
+		/* Remove the index record */
+
+		if (mode == BTR_MODIFY_LEAF) {
+			success = btr_cur_optimistic_delete(btr_cur, &mtr);
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+			btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+						   RB_NONE, &mtr);
+			success = err == DB_SUCCESS;
+			ut_a(success || err == DB_OUT_OF_FILE_SPACE);
+		}
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(success);
+}
+
+/***********************************************************//**
+Removes a secondary index entry if possible: LEAF mode first, then TREE. */
+UNIV_INLINE
+void
+row_purge_remove_sec_if_poss(
+/*=========================*/
+	purge_node_t*	node,	/*!< in: row purge node */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry */
+{
+	ibool	success;
+	ulint	n_tries		= 0;
+
+	/*	fputs("Purge: Removing secondary record\n", stderr); */
+
+	success = row_purge_remove_sec_if_poss_low(node, index, entry,
+						   BTR_MODIFY_LEAF);
+	if (success) {
+
+		return;
+	}
+retry:
+	success = row_purge_remove_sec_if_poss_low(node, index, entry,
+						   BTR_MODIFY_TREE);
+	/* The delete operation may fail if we have little file space left;
+	in that case we sleep and retry a bounded number of times.
+	TODO: easiest to crash the database and restart with more file space */
+
+	if (!success && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+		goto retry;
+	}
+
+	ut_a(success);
+}
+
+/***********************************************************//**
+Purges a delete marking of a record: secondary entries, then clustered. */
+static
+void
+row_purge_del_mark(
+/*===============*/
+	purge_node_t*	node)	/*!< in: row purge node */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	dict_index_t*	index;
+
+	ut_ad(node);
+
+	heap = mem_heap_create(1024);
+
+	while (node->index != NULL) {
+		index = node->index;
+
+		/* Build the index entry from node->row and try to remove it */
+		entry = row_build_index_entry(node->row, NULL, index, heap);
+		ut_a(entry);
+		row_purge_remove_sec_if_poss(node, index, entry);
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+
+	mem_heap_free(heap);
+
+	row_purge_remove_clust_if_poss(node);
+}
+
+/***********************************************************//**
+Purges an update of an existing record. Also purges an update of a delete
+marked record if that record contained an externally stored field. */
+static
+void
+row_purge_upd_exist_or_extern(
+/*==========================*/
+	purge_node_t*	node)	/*!< in: row purge node */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	dict_index_t*	index;
+	ibool		is_insert;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+	ulint		i;
+	mtr_t		mtr;
+
+	ut_ad(node);
+
+	if (node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+
+		goto skip_secondaries;
+	}
+
+	heap = mem_heap_create(1024);
+
+	while (node->index != NULL) {
+		index = node->index;
+
+		if (row_upd_changes_ord_field_binary(NULL, node->index,
+						     node->update)) {
+			/* Build the older version of the index entry */
+			entry = row_build_index_entry(node->row, NULL,
+						      index, heap);
+			ut_a(entry);
+			row_purge_remove_sec_if_poss(node, index, entry);
+		}
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+
+	mem_heap_free(heap);
+
+skip_secondaries:
+	/* Free possible externally stored fields of the update vector */
+	for (i = 0; i < upd_get_n_fields(node->update); i++) {
+
+		const upd_field_t*	ufield
+			= upd_get_nth_field(node->update, i);
+
+		if (dfield_is_ext(&ufield->new_val)) {
+			buf_block_t*	block;
+			ulint		internal_offset;
+			byte*		data_field;
+
+			/* We use the fact that new_val points into
+			node->undo_rec and thus get the offset of the
+			dfield data inside the undo record. Then we
+			can calculate from node->roll_ptr the file
+			address of the new_val data */
+
+			internal_offset
+				= ((const byte*)
+				   dfield_get_data(&ufield->new_val))
+				- node->undo_rec;
+
+			ut_a(internal_offset < UNIV_PAGE_SIZE);
+
+			trx_undo_decode_roll_ptr(node->roll_ptr,
+						 &is_insert, &rseg_id,
+						 &page_no, &offset);
+			mtr_start(&mtr);
+
+			/* We have to acquire an X-latch to the clustered
+			index tree */
+
+			index = dict_table_get_first_index(node->table);
+
+			mtr_x_lock(dict_index_get_lock(index), &mtr);
+
+			/* NOTE: we must also acquire an X-latch to the
+			root page of the tree. We will need it when we
+			free pages from the tree. If the tree is of height 1,
+			the tree X-latch does NOT protect the root page,
+			because it is also a leaf page. Since we will have a
+			latch on an undo log page, we would break the
+			latching order if we only later latched the
+			root page of such a tree! */
+
+			btr_root_get(index, &mtr);
+
+			/* We assume in purge of externally stored fields
+			that the space id of the undo log record is 0! */
+
+			block = buf_page_get(0, 0, page_no, RW_X_LATCH, &mtr);
+			buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+			data_field = buf_block_get_frame(block)
+				+ offset + internal_offset;
+
+			ut_a(dfield_get_len(&ufield->new_val)
+			     >= BTR_EXTERN_FIELD_REF_SIZE);
+			btr_free_externally_stored_field(
+				index,
+				data_field + dfield_get_len(&ufield->new_val)
+				- BTR_EXTERN_FIELD_REF_SIZE,
+				NULL, NULL, NULL, 0, RB_NONE, &mtr);
+			mtr_commit(&mtr);
+		}
+	}
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record.
+@return TRUE if purge operation required: NOTE that then the CALLER
+must unfreeze data dictionary! */
+static
+ibool
+row_purge_parse_undo_rec(
+/*=====================*/
+	purge_node_t*	node,	/*!< in/out: row purge node */
+	ibool*		updated_extern,
+				/*!< out: TRUE if an externally stored field
+				was updated */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	dict_index_t*	clust_index;
+	byte*		ptr;
+	trx_t*		trx;
+	undo_no_t	undo_no;
+	dulint		table_id;
+	trx_id_t	trx_id;
+	roll_ptr_t	roll_ptr;
+	ulint		info_bits;
+	ulint		type;
+	ulint		cmpl_info;
+
+	ut_ad(node && thr);
+
+	trx = thr_get_trx(thr);
+
+	ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &cmpl_info,
+				    updated_extern, &undo_no, &table_id);
+	node->rec_type = type;
+
+	if (type == TRX_UNDO_UPD_DEL_REC && !(*updated_extern)) {
+
+		return(FALSE);
+	}
+
+	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+					       &info_bits);
+	node->table = NULL;
+
+	if (type == TRX_UNDO_UPD_EXIST_REC
+	    && cmpl_info & UPD_NODE_NO_ORD_CHANGE && !(*updated_extern)) {
+
+		/* Purge requires no changes to indexes: we may return */
+
+		return(FALSE);
+	}
+
+	/* Prevent DROP TABLE etc. from running when we are doing the purge
+	for this row */
+
+	row_mysql_freeze_data_dictionary(trx);
+
+	mutex_enter(&(dict_sys->mutex));
+
+	node->table = dict_table_get_on_id_low(table_id);
+
+	mutex_exit(&(dict_sys->mutex));
+
+	if (node->table == NULL) {
+		/* The table has been dropped: no need to do purge */
+err_exit:
+		row_mysql_unfreeze_data_dictionary(trx);
+		return(FALSE);
+	}
+
+	if (node->table->ibd_file_missing) {
+		/* We skip purge of missing .ibd files */
+
+		node->table = NULL;
+
+		goto err_exit;
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	if (clust_index == NULL) {
+		/* The table was corrupt in the data dictionary */
+
+		goto err_exit;
+	}
+
+	ptr = trx_undo_rec_get_row_ref(ptr, clust_index, &(node->ref),
+				       node->heap);
+
+	ptr = trx_undo_update_rec_get_update(ptr, clust_index, type, trx_id,
+					     roll_ptr, info_bits, trx,
+					     node->heap, &(node->update));
+
+	/* Read into the partial row node->row the fields that occur in indexes */
+
+	if (!(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+		ptr = trx_undo_rec_get_partial_row(
+			ptr, clust_index, &node->row,
+			type == TRX_UNDO_UPD_DEL_REC,
+			node->heap);
+	}
+
+	return(TRUE);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the purge for the recorded operation.
+If none left, or the current purge completed, returns the control to the
+parent node, which is always a query thread node.
+@return	DB_SUCCESS; this implementation returns no other code */
+static
+ulint
+row_purge(
+/*======*/
+	purge_node_t*	node,	/*!< in: row purge node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	roll_ptr_t	roll_ptr;
+	ibool		purge_needed;
+	ibool		updated_extern;
+	trx_t*		trx;
+
+	ut_ad(node && thr);
+
+	trx = thr_get_trx(thr);
+
+	node->undo_rec = trx_purge_fetch_next_rec(&roll_ptr,
+						  &(node->reservation),
+						  node->heap);
+	if (!node->undo_rec) {
+		/* Purge completed for this query thread: run the parent next */
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(DB_SUCCESS);
+	}
+
+	node->roll_ptr = roll_ptr;
+
+	if (node->undo_rec == &trx_purge_dummy_rec) {
+		purge_needed = FALSE;
+	} else {
+		purge_needed = row_purge_parse_undo_rec(node, &updated_extern,
+							thr);
+		/* If purge_needed == TRUE, we must also remember to unfreeze
+		data dictionary! */
+	}
+
+	if (purge_needed) {
+		node->found_clust = FALSE;
+
+		node->index = dict_table_get_next_index(
+			dict_table_get_first_index(node->table));
+
+		if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+			row_purge_del_mark(node);
+
+		} else if (updated_extern
+			   || node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+			row_purge_upd_exist_or_extern(node);
+		}
+
+		if (node->found_clust) {
+			btr_pcur_close(&(node->pcur));
+		}
+
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	/* Release the undo record reservation and empty the node heap */
+	trx_purge_rec_release(node->reservation);
+	mem_heap_empty(node->heap);
+
+	thr->run_node = node;
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Does the purge operation for a single undo log record. This is a high-level
+function used in an SQL execution graph.
+@return	query thread to run next; this implementation always returns thr */
+UNIV_INTERN
+que_thr_t*
+row_purge_step(
+/*===========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	purge_node_t*	node;
+	ulint		err;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_PURGE);
+
+	err = row_purge(node, thr);
+
+	ut_ad(err == DB_SUCCESS);
+
+	return(thr);
+}
diff --git a/storage/xtradb/row/row0row.c b/storage/xtradb/row/row0row.c
new file mode 100644
index 00000000000..8e806a14a98
--- /dev/null
+++ b/storage/xtradb/row/row0row.c
@@ -0,0 +1,1179 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0row.c
+General row routines
+
+Created 4/20/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0row.h"
+
+#ifdef UNIV_NONINL
+#include "row0row.ic"
+#endif
+
+#include "data0type.h"
+#include "dict0dict.h"
+#include "btr0btr.h"
+#include "ha_prototypes.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "ut0mem.h"
+
+/*********************************************************************//**
+Gets the offset of the trx id field, in bytes relative to the origin of
+a clustered index record.
+@return	offset of DATA_TRX_ID */
+UNIV_INTERN
+ulint
+row_get_trx_id_offset(
+/*==================*/
+	const rec_t*	rec __attribute__((unused)),
+				/*!< in: record; only used in a debug assertion */
+	dict_index_t*	index,	/*!< in: clustered index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	ulint	pos;
+	ulint	offset;
+	ulint	len;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+	offset = rec_get_nth_field_offs(offsets, pos, &len);
+
+	ut_ad(len == DATA_TRX_ID_LEN);
+
+	return(offset);
+}
+
+/*****************************************************************//**
+When an insert or purge to a table is performed, this function builds
+the entry to be inserted into or purged from an index on the table.
+@return index entry which should be inserted or purged, or NULL if the
+externally stored columns in the clustered index record are
+unavailable and ext != NULL */
+UNIV_INTERN
+dtuple_t*
+row_build_index_entry(
+/*==================*/
+	const dtuple_t*	row,	/*!< in: row which should be
+				inserted or purged */
+	row_ext_t*	ext,	/*!< in: externally stored column prefixes,
+				or NULL */
+	dict_index_t*	index,	/*!< in: index on the table */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory for
+				the index entry is allocated */
+{
+	dtuple_t*	entry;
+	ulint		entry_len;
+	ulint		i;
+
+	ut_ad(row && index && heap);
+	ut_ad(dtuple_check_typed(row));
+
+	entry_len = dict_index_get_n_fields(index);
+	entry = dtuple_create(heap, entry_len);
+
+	if (UNIV_UNLIKELY(index->type & DICT_UNIVERSAL)) {
+		dtuple_set_n_fields_cmp(entry, entry_len);
+		/* There may only be externally stored columns
+		in a clustered index B-tree of a user table. */
+		ut_a(!ext);
+	} else {
+		dtuple_set_n_fields_cmp(
+			entry, dict_index_get_n_unique_in_tree(index));
+	}
+
+	for (i = 0; i < entry_len; i++) {
+		const dict_field_t*	ind_field
+			= dict_index_get_nth_field(index, i);
+		const dict_col_t*	col
+			= ind_field->col;
+		ulint			col_no
+			= dict_col_get_no(col);
+		dfield_t*		dfield
+			= dtuple_get_nth_field(entry, i);
+		const dfield_t*		dfield2
+			= dtuple_get_nth_field(row, col_no);
+		ulint			len
+			= dfield_get_len(dfield2);
+
+		dfield_copy(dfield, dfield2);
+
+		if (dfield_is_null(dfield) || ind_field->prefix_len == 0) {
+			continue;
+		}
+
+		/* This is a column prefix index: take only the prefix.
+		Prefix-indexed columns may be externally stored. */
+		ut_ad(col->ord_part);
+
+		if (UNIV_LIKELY_NULL(ext)) {
+			/* Use the prefix from ext if the column is there. */
+			const byte*	buf = row_ext_lookup(ext, col_no,
+							     &len);
+			if (UNIV_LIKELY_NULL(buf)) {
+				if (UNIV_UNLIKELY(buf == field_ref_zero)) {
+					return(NULL);
+				}
+				dfield_set_data(dfield, buf, len);
+			}
+		} else if (dfield_is_ext(dfield)) {
+			ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+			len -= BTR_EXTERN_FIELD_REF_SIZE;
+			ut_a(ind_field->prefix_len <= len
+			     || dict_index_is_clust(index));
+		}
+
+		len = dtype_get_at_most_n_mbchars(
+			col->prtype, col->mbminlen, col->mbmaxlen,
+			ind_field->prefix_len, len, dfield_get_data(dfield));
+		dfield_set_len(dfield, len);
+	}
+
+	ut_ad(dtuple_check_typed(entry));
+
+	return(entry);
+}
+
+/*******************************************************************//**
+An inverse function to row_build_index_entry. Builds a row from a
+record in a clustered index.
+@return	own: row built; see the NOTE on the rec parameter! */
+UNIV_INTERN
+dtuple_t*
+row_build(
+/*======*/
+	ulint			type,	/*!< in: ROW_COPY_POINTERS or
+					ROW_COPY_DATA; the latter
+					copies also the data fields to
+					heap while the former only
+					places pointers to data fields
+					on the index page, and thus is
+					more efficient */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const rec_t*		rec,	/*!< in: record in the clustered
+					index; NOTE: in the case
+					ROW_COPY_POINTERS the data
+					fields in the row will point
+					directly into this record,
+					therefore, the buffer page of
+					this record must be at least
+					s-latched and the latch held
+					as long as the row dtuple is used! */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec,index)
+					or NULL, in which case this function
+					will invoke rec_get_offsets() */
+	const dict_table_t*	col_table,
+					/*!< in: table, to check which
+					externally stored columns
+					occur in the ordering columns
+					of an index, or NULL if
+					index->table should be
+					consulted instead */
+	row_ext_t**		ext,	/*!< out, own: cache of
+					externally stored column
+					prefixes, or NULL */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory needed is allocated */
+{
+	dtuple_t*		row;
+	const dict_table_t*	table;
+	ulint			n_fields;
+	ulint			n_ext_cols;
+	ulint*			ext_cols	= NULL; /* remove warning */
+	ulint			len;
+	ulint			row_len;
+	byte*			buf;
+	ulint			i;
+	ulint			j;
+	mem_heap_t*		tmp_heap	= NULL;
+	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs_init(offsets_);
+
+	ut_ad(index && rec && heap);
+	ut_ad(dict_index_is_clust(index));
+
+	if (!offsets) {
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &tmp_heap);
+	} else {
+		ut_ad(rec_offs_validate(rec, index, offsets));
+	}
+
+	if (type != ROW_COPY_POINTERS) {
+		/* Take a copy of rec to heap; the row will point to the copy */
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate(). */
+		rec_offs_make_valid(rec, index, (ulint*) offsets);
+	}
+
+	table = index->table;
+	row_len = dict_table_get_n_cols(table);
+
+	row = dtuple_create(heap, row_len);
+
+	dict_table_copy_types(row, table);
+
+	dtuple_set_info_bits(row, rec_get_info_bits(
+				     rec, dict_table_is_comp(table)));
+
+	n_fields = rec_offs_n_fields(offsets);
+	n_ext_cols = rec_offs_n_extern(offsets);
+	if (n_ext_cols) {
+		ext_cols = mem_heap_alloc(heap, n_ext_cols * sizeof *ext_cols);
+	}
+
+	for (i = j = 0; i < n_fields; i++) {
+		dict_field_t*		ind_field
+			= dict_index_get_nth_field(index, i);
+		const dict_col_t*	col
+			= dict_field_get_col(ind_field);
+		ulint			col_no
+			= dict_col_get_no(col);
+		dfield_t*		dfield
+			= dtuple_get_nth_field(row, col_no);
+
+		if (ind_field->prefix_len == 0) {
+
+			const byte*	field = rec_get_nth_field(
+				rec, offsets, i, &len);
+
+			dfield_set_data(dfield, field, len);
+		}
+
+		if (rec_offs_nth_extern(offsets, i)) {
+			dfield_set_ext(dfield);
+
+			if (UNIV_LIKELY_NULL(col_table)) {
+				ut_a(col_no
+				     < dict_table_get_n_cols(col_table));
+				col = dict_table_get_nth_col(
+					col_table, col_no);
+			}
+
+			if (col->ord_part) {
+				/* We will have to fetch prefixes of
+				externally stored columns that are
+				referenced by column prefixes. */
+				ext_cols[j++] = col_no;
+			}
+		}
+	}
+
+	ut_ad(dtuple_check_typed(row));
+
+	if (!ext) {
+		/* REDUNDANT and COMPACT formats store a local
+		768-byte prefix of each externally stored
+		column. No cache is needed. */
+		ut_ad(dict_table_get_format(index->table)
+		      < DICT_TF_FORMAT_ZIP);
+	} else if (j) {
+		*ext = row_ext_create(j, ext_cols, row,
+				      dict_table_zip_size(index->table),
+				      heap);
+	} else {
+		*ext = NULL;
+	}
+
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(row);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple without copying the data.
+@return index entry built; does not set info_bits, and the data fields
+in the entry will point directly to rec */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry_low(
+/*=======================*/
+	const rec_t*		rec,	/*!< in: record in the index */
+	const dict_index_t*	index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint*			n_ext,	/*!< out: number of externally
+					stored columns */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory needed is allocated */
+{
+	dtuple_t*	entry;
+	dfield_t*	dfield;
+	ulint		i;
+	const byte*	field;
+	ulint		len;
+	ulint		rec_len;
+
+	ut_ad(rec && heap && index);
+	/* Because this function may be invoked by row0merge.c
+	on a record whose header is in a different format, the check
+	rec_offs_validate(rec, index, offsets) must be avoided here. */
+	ut_ad(n_ext);
+	*n_ext = 0;
+
+	rec_len = rec_offs_n_fields(offsets);
+
+	entry = dtuple_create(heap, rec_len);
+
+	dtuple_set_n_fields_cmp(entry,
+				dict_index_get_n_unique_in_tree(index));
+	ut_ad(rec_len == dict_index_get_n_fields(index));
+
+	dict_index_copy_types(entry, index, rec_len);
+
+	for (i = 0; i < rec_len; i++) {
+
+		dfield = dtuple_get_nth_field(entry, i);
+		field = rec_get_nth_field(rec, offsets, i, &len);
+
+		dfield_set_data(dfield, field, len);
+
+		if (rec_offs_nth_extern(offsets, i)) {
+			dfield_set_ext(dfield);
+			(*n_ext)++;
+		}
+	}
+
+	ut_ad(dtuple_check_typed(entry));
+
+	return(entry);
+}
+
+/*******************************************************************//**
+Converts an index record to a typed data tuple. NOTE that externally
+stored (often big) fields are NOT copied to heap.
+@return	own: index entry built; see the NOTE on the rec parameter! */
+UNIV_INTERN
+dtuple_t*
+row_rec_to_index_entry(
+/*===================*/
+	ulint			type,	/*!< in: ROW_COPY_DATA, or
+					ROW_COPY_POINTERS: the former
+					copies also the data fields to
+					heap whereas the latter only places
+					pointers to data fields on the
+					index page */
+	const rec_t*		rec,	/*!< in: record in the index;
+					NOTE: in the case
+					ROW_COPY_POINTERS the data
+					fields in the row will point
+					directly into this record,
+					therefore, the buffer page of
+					this record must be at least
+					s-latched and the latch held
+					as long as the dtuple is used! */
+	const dict_index_t*	index,	/*!< in: index */
+	ulint*			offsets,/*!< in/out: rec_get_offsets(rec) */
+	ulint*			n_ext,	/*!< out: number of externally
+					stored columns */
+	mem_heap_t*		heap)	/*!< in: memory heap from which
+					the memory needed is allocated */
+{
+	dtuple_t*	entry;
+	byte*		buf;
+
+	ut_ad(rec && heap && index);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (type == ROW_COPY_DATA) {
+		/* Take a copy of rec to heap; the entry will point to the copy */
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate(). */
+		rec_offs_make_valid(rec, index, offsets);
+	}
+
+	entry = row_rec_to_index_entry_low(rec, index, offsets, n_ext, heap);
+
+	dtuple_set_info_bits(entry,
+			     rec_get_info_bits(rec, rec_offs_comp(offsets)));
+
+	return(entry);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record.
+@return	own: row reference built; see the NOTE on the rec parameter! */
+UNIV_INTERN
+dtuple_t*
+row_build_row_ref(
+/*==============*/
+	ulint		type,	/*!< in: ROW_COPY_DATA, or ROW_COPY_POINTERS:
+				the former copies also the data fields to
+				heap, whereas the latter only places pointers
+				to data fields on the index page */
+	dict_index_t*	index,	/*!< in: secondary index */
+	const rec_t*	rec,	/*!< in: record in the index;
+				NOTE: in the case ROW_COPY_POINTERS
+				the data fields in the row will point
+				directly into this record, therefore,
+				the buffer page of this record must be
+				at least s-latched and the latch held
+				as long as the row reference is used! */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+{
+	dict_table_t*	table;
+	dict_index_t*	clust_index;
+	dfield_t*	dfield;
+	dtuple_t*	ref;
+	const byte*	field;
+	ulint		len;
+	ulint		ref_len;
+	ulint		pos;
+	byte*		buf;
+	ulint		clust_col_prefix_len;
+	ulint		i;
+	mem_heap_t*	tmp_heap	= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(index && rec && heap);
+	ut_ad(!dict_index_is_clust(index));
+
+	offsets = rec_get_offsets(rec, index, offsets,
+				  ULINT_UNDEFINED, &tmp_heap);
+	/* Secondary indexes must not contain externally stored columns. */
+	ut_ad(!rec_offs_any_extern(offsets));
+
+	if (type == ROW_COPY_DATA) {
+		/* Take a copy of rec to heap; ref will point to the copy */
+
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+
+		rec = rec_copy(buf, rec, offsets);
+		/* Avoid a debug assertion in rec_offs_validate(). */
+		rec_offs_make_valid(rec, index, offsets);
+	}
+
+	table = index->table;
+
+	clust_index = dict_table_get_first_index(table);
+
+	ref_len = dict_index_get_n_unique(clust_index);
+
+	ref = dtuple_create(heap, ref_len);
+
+	dict_index_copy_types(ref, clust_index, ref_len);
+
+	for (i = 0; i < ref_len; i++) {
+		dfield = dtuple_get_nth_field(ref, i);
+
+		pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+		ut_a(pos != ULINT_UNDEFINED);
+
+		field = rec_get_nth_field(rec, offsets, pos, &len);
+
+		dfield_set_data(dfield, field, len);
+
+		/* If the primary key contains a column prefix, then the
+		secondary index may contain a longer prefix of the same
+		column, or the full column, and we must adjust the length
+		accordingly. */
+
+		clust_col_prefix_len = dict_index_get_nth_field(
+			clust_index, i)->prefix_len;
+
+		if (clust_col_prefix_len > 0) {
+			if (len != UNIV_SQL_NULL) {
+
+				const dtype_t*	dtype
+					= dfield_get_type(dfield);
+
+				dfield_set_len(dfield,
+					       dtype_get_at_most_n_mbchars(
+						       dtype->prtype,
+						       dtype->mbminlen,
+						       dtype->mbmaxlen,
+						       clust_col_prefix_len,
+						       len, (char*) field));
+			}
+		}
+	}
+
+	ut_ad(dtuple_check_typed(ref));
+	if (tmp_heap) {
+		mem_heap_free(tmp_heap);
+	}
+
+	return(ref);
+}
+
+/*******************************************************************//**
+Builds from a secondary index record a row reference with which we can
+search the clustered index record. The reference is built into ref. */
+UNIV_INTERN
+void
+row_build_row_ref_in_tuple(
+/*=======================*/
+	dtuple_t*		ref,	/*!< in/out: row reference built;
+					see the NOTE on rec below! */
+	const rec_t*		rec,	/*!< in: record in the index;
+					NOTE: the data fields in ref
+					will point directly into this
+					record, therefore, the buffer
+					page of this record must be at
+					least s-latched and the latch
+					held as long as the row
+					reference is used! */
+	const dict_index_t*	index,	/*!< in: secondary index */
+	ulint*			offsets,/*!< in: rec_get_offsets(rec, index)
+					or NULL to have them computed here */
+	trx_t*			trx)	/*!< in: transaction */
+{
+	const dict_index_t*	clust_index;
+	dfield_t*		dfield;
+	const byte*		field;
+	ulint			len;
+	ulint			ref_len;
+	ulint			pos;
+	ulint			clust_col_prefix_len;
+	ulint			i;
+	mem_heap_t*		heap		= NULL;
+	ulint			offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs_init(offsets_);
+
+	ut_a(ref);
+	ut_a(index);
+	ut_a(rec);
+	ut_ad(!dict_index_is_clust(index));
+
+	if (UNIV_UNLIKELY(!index->table)) {
+		fputs("InnoDB: table ", stderr);
+notfound:
+		ut_print_name(stderr, trx, TRUE, index->table_name);
+		fputs(" for index ", stderr);
+		ut_print_name(stderr, trx, FALSE, index->name);
+		fputs(" not found\n", stderr);
+		ut_error;
+	}
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	if (UNIV_UNLIKELY(!clust_index)) {
+		fputs("InnoDB: clust index for table ", stderr);
+		goto notfound;
+	}
+
+	if (!offsets) {
+		offsets = rec_get_offsets(rec, index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+	} else {
+		ut_ad(rec_offs_validate(rec, index, offsets));
+	}
+
+	/* Secondary indexes must not contain externally stored columns. */
+	ut_ad(!rec_offs_any_extern(offsets));
+	ref_len = dict_index_get_n_unique(clust_index);
+
+	ut_ad(ref_len == dtuple_get_n_fields(ref));
+
+	dict_index_copy_types(ref, clust_index, ref_len);
+
+	for (i = 0; i < ref_len; i++) {
+		dfield = dtuple_get_nth_field(ref, i);
+
+		pos = dict_index_get_nth_field_pos(index, clust_index, i);
+
+		ut_a(pos != ULINT_UNDEFINED);
+
+		field = rec_get_nth_field(rec, offsets, pos, &len);
+
+		dfield_set_data(dfield, field, len);
+
+		/* If the primary key contains a column prefix, then the
+		secondary index may contain a longer prefix of the same
+		column, or the full column, and we must adjust the length
+		accordingly. */
+
+		clust_col_prefix_len = dict_index_get_nth_field(
+			clust_index, i)->prefix_len;
+
+		if (clust_col_prefix_len > 0) {
+			if (len != UNIV_SQL_NULL) {
+
+				const dtype_t*	dtype
+					= dfield_get_type(dfield);
+
+				dfield_set_len(dfield,
+					       dtype_get_at_most_n_mbchars(
+						       dtype->prtype,
+						       dtype->mbminlen,
+						       dtype->mbmaxlen,
+						       clust_col_prefix_len,
+						       len, (char*) field));
+			}
+		}
+	}
+
+	ut_ad(dtuple_check_typed(ref));
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+}
+
+/***************************************************************//**
+Searches the clustered index record for a row, if we have the row reference.
+@return	TRUE if a user record matching all fields of ref was found */
+UNIV_INTERN
+ibool
+row_search_on_row_ref(
+/*==================*/
+	btr_pcur_t*		pcur,	/*!< out: persistent cursor, which must
+					be closed by the caller */
+	ulint			mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const dict_table_t*	table,	/*!< in: table */
+	const dtuple_t*		ref,	/*!< in: row reference */
+	mtr_t*			mtr)	/*!< in/out: mtr */
+{
+	ulint		low_match;
+	rec_t*		rec;
+	dict_index_t*	index;
+
+	ut_ad(dtuple_check_typed(ref));
+
+	index = dict_table_get_first_index(table);
+
+	ut_a(dtuple_get_n_fields(ref) == dict_index_get_n_unique(index));
+
+	btr_pcur_open(index, ref, PAGE_CUR_LE, mode, pcur, mtr);
+
+	low_match = btr_pcur_get_low_match(pcur);
+
+	rec = btr_pcur_get_rec(pcur);
+
+	if (page_rec_is_infimum(rec)) {
+
+		return(FALSE);
+	}
+
+	if (low_match != dtuple_get_n_fields(ref)) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Fetches the clustered index record for a secondary index record. The latches
+on the secondary index record are preserved.
+@return	record or NULL, if no record found */
+UNIV_INTERN
+rec_t*
+row_get_clust_rec(
+/*==============*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: secondary index */
+	dict_index_t**	clust_index,/*!< out: clustered index */
+	mtr_t*		mtr)	/*!< in/out: mtr for the clustered index search */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	ref;
+	dict_table_t*	table;
+	btr_pcur_t	pcur;
+	ibool		found;
+	rec_t*		clust_rec;
+
+	ut_ad(!dict_index_is_clust(index));
+
+	table = index->table;
+
+	heap = mem_heap_create(256);
+
+	ref = row_build_row_ref(ROW_COPY_POINTERS, index, rec, heap);
+
+	found = row_search_on_row_ref(&pcur, mode, table, ref, mtr);
+
+	clust_rec = found ? btr_pcur_get_rec(&pcur) : NULL;
+
+	mem_heap_free(heap);
+
+	btr_pcur_close(&pcur);
+
+	*clust_index = dict_table_get_first_index(table);
+
+	return(clust_rec);
+}
+
+/***************************************************************//**
+Opens a persistent cursor on an index entry and tells whether a user
+record matching all the fields of the entry was found.
+@return	TRUE if found */
+UNIV_INTERN
+ibool
+row_search_index_entry(
+/*===================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry,	/*!< in: index entry */
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF, ... */
+	btr_pcur_t*	pcur,	/*!< in/out: persistent cursor, which must
+				be closed by the caller */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	n_matched;
+	rec_t*	cur_rec;
+
+	ut_ad(dtuple_check_typed(entry));
+	btr_pcur_open(index, entry, PAGE_CUR_LE, mode, pcur, mtr);
+
+	n_matched = btr_pcur_get_low_match(pcur);
+	cur_rec = btr_pcur_get_rec(pcur);
+
+	if (page_rec_is_infimum(cur_rec)) {
+		return(FALSE);
+	}
+	return(n_matched == dtuple_get_n_fields(entry));
+}
+
+#include <my_sys.h>
+
+/*******************************************************************//**
+Renders a DATA_INT value stored in InnoDB on-disk format as a decimal
+string in "buf", interpreting it as signed or unsigned according to the
+DATA_UNSIGNED flag in "prtype".
+If "data_len" exceeds sizeof(ullint), the value is not decoded: nothing
+is written to "buf", 0 is returned and "format_in_hex" is set to TRUE.
+In all other cases "format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf". The result is always
+'\0'-terminated (provided buf_size > 0) and the returned byte count
+includes the terminating '\0'.
+@return	number of bytes that were written */
+static
+ulint
+row_raw_format_int(
+/*===============*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		prtype,		/*!< in: precise type */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size,	/*!< in: output buffer size
+					in bytes */
+	ibool*		format_in_hex)	/*!< out: should the data be
+					formatted in hex */
+{
+	ullint	decoded;
+	ibool	is_unsigned;
+	ulint	n_written;
+
+	if (data_len > sizeof(ullint)) {
+		/* Wider than ullint: let the caller dump it in hex. */
+		*format_in_hex = TRUE;
+
+		return(ut_min(0, buf_size));
+	}
+
+	is_unsigned = prtype & DATA_UNSIGNED;
+
+	decoded = mach_read_int_type((const byte*) data,
+				     data_len, is_unsigned);
+
+	/* The + 1 accounts for the terminating '\0' in the byte count. */
+	if (is_unsigned) {
+		n_written = ut_snprintf(buf, buf_size, "%llu",
+					decoded) + 1;
+	} else {
+		n_written = ut_snprintf(buf, buf_size, "%lld",
+					(long long) decoded) + 1;
+	}
+
+	/* Never report more bytes than the buffer can hold. */
+	return(ut_min(n_written, buf_size));
+}
+
+/*******************************************************************//**
+Renders a DATA_(CHAR|VARCHAR|MYSQL|VARMYSQL) value stored in InnoDB
+on-disk format as text in "buf", using the charset information carried
+in "prtype".
+If the data has the binary charset-collation, nothing is written to "buf",
+0 is returned and "format_in_hex" is set to TRUE; otherwise
+"format_in_hex" is left untouched.
+Not more than "buf_size" bytes are written to "buf". The result is always
+'\0'-terminated (provided buf_size > 0) and the returned byte count
+includes the terminating '\0'.
+@return	number of bytes that were written */
+static
+ulint
+row_raw_format_str(
+/*===============*/
+	const char*	data,		/*!< in: raw data */
+	ulint		data_len,	/*!< in: raw data length
+					in bytes */
+	ulint		prtype,		/*!< in: precise type */
+	char*		buf,		/*!< out: output buffer */
+	ulint		buf_size,	/*!< in: output buffer size
+					in bytes */
+	ibool*		format_in_hex)	/*!< out: should the data be
+					formatted in hex */
+{
+	ulint	coll_id;
+
+	if (buf_size == 0) {
+
+		return(0);
+	}
+
+	/* we assume system_charset_info is UTF-8 */
+
+	if (UNIV_LIKELY(dtype_is_utf8(prtype))) {
+
+		return(ut_str_sql_format(data, data_len, buf, buf_size));
+	}
+
+	coll_id = dtype_get_charset_coll(prtype);
+
+	if (coll_id != DATA_MYSQL_BINARY_CHARSET_COLL) {
+		/* Some other charset: hand it to innobase_raw_format(). */
+		return(innobase_raw_format(data, data_len, coll_id,
+					   buf, buf_size));
+	}
+
+	/* Binary strings are not formatted here:
+	tell the caller to dump them in hex. */
+	*format_in_hex = TRUE;
+
+	return(0);
+}
+
+/*******************************************************************//**
+Renders the raw data in "data" (in InnoDB on-disk format) as text in "buf",
+choosing the representation from the column type of "dict_field":
+SQL NULL becomes "NULL", DATA_INT goes to row_raw_format_int(), the string
+types go to row_raw_format_str(), and everything else (or anything those
+formatters decline) becomes a "0x"-prefixed hex dump.
+Not more than "buf_size" bytes are written to "buf".
+The result is always NUL-terminated (provided buf_size is positive) and the
+number of bytes that were written to "buf" is returned (including the
+terminating NUL).
+@return	number of bytes that were written */
+UNIV_INTERN
+ulint
+row_raw_format(
+/*===========*/
+	const char*		data,		/*!< in: raw data */
+	ulint			data_len,	/*!< in: raw data length
+						in bytes */
+	const dict_field_t*	dict_field,	/*!< in: index field */
+	char*			buf,		/*!< out: output buffer */
+	ulint			buf_size)	/*!< in: output buffer size
+						in bytes */
+{
+	ulint	main_type;
+	ulint	precise_type;
+	ulint	n_bytes	= 0;
+	ibool	use_hex	= FALSE;
+
+	if (buf_size == 0) {
+
+		return(0);
+	}
+
+	if (data_len == UNIV_SQL_NULL) {
+
+		n_bytes = ut_snprintf((char*) buf, buf_size, "NULL") + 1;
+
+		return(ut_min(n_bytes, buf_size));
+	}
+
+	main_type = dict_field->col->mtype;
+	precise_type = dict_field->col->prtype;
+
+	switch (main_type) {
+	case DATA_INT:
+		n_bytes = row_raw_format_int(data, data_len, precise_type,
+					     buf, buf_size, &use_hex);
+		break;
+
+	case DATA_CHAR:
+	case DATA_VARCHAR:
+	case DATA_MYSQL:
+	case DATA_VARMYSQL:
+		n_bytes = row_raw_format_str(data, data_len, precise_type,
+					     buf, buf_size, &use_hex);
+		break;
+
+	/* XXX support more data types */
+	default:
+
+		use_hex = TRUE;
+		break;
+	}
+
+	if (!use_hex) {
+
+		return(n_bytes);
+	}
+
+	/* Hex fallback: taken for the types not handled above and
+	whenever a type-specific formatter asked for it. */
+
+	if (UNIV_LIKELY(buf_size > 2)) {
+
+		memcpy(buf, "0x", 2);
+		n_bytes = 2 + ut_raw_to_hex(data, data_len,
+					    buf + 2, buf_size - 2);
+	} else {
+		/* Too small for the prefix: return an empty string. */
+		buf[0] = '\0';
+		n_bytes = 1;
+	}
+
+	return(n_bytes);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include "ut0dbg.h"
+/* Self-test for row_raw_format_int(); returns at the first failed check. */
+void
+test_row_raw_format_int()
+{
+	ulint	ret;
+	char	buf[128];
+	ibool	format_in_hex;
+
+#define CALL_AND_TEST(data, data_len, prtype, buf, buf_size,\
+		      ret_expected, buf_expected, format_in_hex_expected)\
+	do {\
+		ibool	ok = TRUE;\
+		ulint	i;\
+		memset(buf, 'x', 10);\
+		buf[10] = '\0';\
+		format_in_hex = FALSE;\
+		fprintf(stderr, "TESTING \"\\x");\
+		for (i = 0; i < data_len; i++) {\
+			fprintf(stderr, "%02hhX", data[i]);\
+		}\
+		fprintf(stderr, "\", %lu, %lu, %lu\n",\
+                        (ulint) data_len, (ulint) prtype,\
+			(ulint) buf_size);\
+		ret = row_raw_format_int(data, data_len, prtype,\
+					 buf, buf_size, &format_in_hex);\
+		if (ret != ret_expected) {\
+			fprintf(stderr, "expected ret %lu, got %lu\n",\
+				(ulint) ret_expected, ret);\
+			ok = FALSE;\
+                }\
+                if (strcmp((char*) buf, buf_expected) != 0) {\
+                        fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+                                buf_expected, buf);\
+                        ok = FALSE;\
+                }\
+                if (format_in_hex != format_in_hex_expected) {\
+                        fprintf(stderr, "expected format_in_hex %d, got %d\n",\
+                                (int) format_in_hex_expected,\
+				(int) format_in_hex);\
+                        ok = FALSE;\
+                }\
+                if (ok) {\
+                        fprintf(stderr, "OK: %lu, \"%s\" %d\n\n",\
+                                (ulint) ret, buf, (int) format_in_hex);\
+                } else {\
+                        return;\
+                }\
+        } while (0)
+
+#if 1 /* correctness checks; change to 0 to run only the speed test */
+	/* min values for signed 1-8 byte integers (all-zero input bytes) */
+
+	CALL_AND_TEST("\x00", 1, 0,
+		      buf, sizeof(buf), 5, "-128", 0);
+
+	CALL_AND_TEST("\x00\x00", 2, 0,
+		      buf, sizeof(buf), 7, "-32768", 0);
+
+	CALL_AND_TEST("\x00\x00\x00", 3, 0,
+		      buf, sizeof(buf), 9, "-8388608", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00", 4, 0,
+		      buf, sizeof(buf), 12, "-2147483648", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, 0,
+		      buf, sizeof(buf), 14, "-549755813888", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, 0,
+		      buf, sizeof(buf), 17, "-140737488355328", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, 0,
+		      buf, sizeof(buf), 19, "-36028797018963968", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, 0,
+		      buf, sizeof(buf), 21, "-9223372036854775808", 0);
+
+	/* min values for unsigned 1-8 byte integers (all-zero input bytes) */
+
+	CALL_AND_TEST("\x00", 1, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00", 2, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00", 3, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00", 4, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00", 5, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00", 6, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00", 7, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x00\x00\x00", 8, DATA_UNSIGNED,
+		      buf, sizeof(buf), 2, "0", 0);
+
+	/* max values for signed 1-8 byte integers (all-0xFF input bytes) */
+
+	CALL_AND_TEST("\xFF", 1, 0,
+		      buf, sizeof(buf), 4, "127", 0);
+
+	CALL_AND_TEST("\xFF\xFF", 2, 0,
+		      buf, sizeof(buf), 6, "32767", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF", 3, 0,
+		      buf, sizeof(buf), 8, "8388607", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, 0,
+		      buf, sizeof(buf), 11, "2147483647", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, 0,
+		      buf, sizeof(buf), 13, "549755813887", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, 0,
+		      buf, sizeof(buf), 16, "140737488355327", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, 0,
+		      buf, sizeof(buf), 18, "36028797018963967", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, 0,
+		      buf, sizeof(buf), 20, "9223372036854775807", 0);
+
+	/* max values for unsigned 1-8 byte integers (all-0xFF input bytes) */
+
+	CALL_AND_TEST("\xFF", 1, DATA_UNSIGNED,
+		      buf, sizeof(buf), 4, "255", 0);
+
+	CALL_AND_TEST("\xFF\xFF", 2, DATA_UNSIGNED,
+		      buf, sizeof(buf), 6, "65535", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF", 3, DATA_UNSIGNED,
+		      buf, sizeof(buf), 9, "16777215", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF", 4, DATA_UNSIGNED,
+		      buf, sizeof(buf), 11, "4294967295", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF", 5, DATA_UNSIGNED,
+		      buf, sizeof(buf), 14, "1099511627775", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF", 6, DATA_UNSIGNED,
+		      buf, sizeof(buf), 16, "281474976710655", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 7, DATA_UNSIGNED,
+		      buf, sizeof(buf), 18, "72057594037927935", 0);
+
+	CALL_AND_TEST("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF", 8, DATA_UNSIGNED,
+		      buf, sizeof(buf), 21, "18446744073709551615", 0);
+
+	/* some random values, signed and unsigned, of various lengths */
+
+	CALL_AND_TEST("\x52", 1, 0,
+		      buf, sizeof(buf), 4, "-46", 0);
+
+	CALL_AND_TEST("\x0E", 1, DATA_UNSIGNED,
+		      buf, sizeof(buf), 3, "14", 0);
+
+	CALL_AND_TEST("\x62\xCE", 2, 0,
+		      buf, sizeof(buf), 6, "-7474", 0);
+
+	CALL_AND_TEST("\x29\xD6", 2, DATA_UNSIGNED,
+		      buf, sizeof(buf), 6, "10710", 0);
+
+	CALL_AND_TEST("\x7F\xFF\x90", 3, 0,
+		      buf, sizeof(buf), 5, "-112", 0);
+
+	CALL_AND_TEST("\x00\xA1\x16", 3, DATA_UNSIGNED,
+		      buf, sizeof(buf), 6, "41238", 0);
+
+	CALL_AND_TEST("\x7F\xFF\xFF\xF7", 4, 0,
+		      buf, sizeof(buf), 3, "-9", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x5C", 4, DATA_UNSIGNED,
+		      buf, sizeof(buf), 3, "92", 0);
+
+	CALL_AND_TEST("\x7F\xFF\xFF\xFF\xFF\xFF\xDC\x63", 8, 0,
+		      buf, sizeof(buf), 6, "-9117", 0);
+
+	CALL_AND_TEST("\x00\x00\x00\x00\x00\x01\x64\x62", 8, DATA_UNSIGNED,
+		      buf, sizeof(buf), 6, "91234", 0);
+#endif /* correctness checks */
+
+	/* speed test: 1000000 rounds of four calls (1- and 8-byte inputs) */
+
+	speedo_t	speedo;
+	ulint		i;
+
+	speedo_reset(&speedo);
+
+	for (i = 0; i < 1000000; i++) {
+		row_raw_format_int("\x23", 1,
+				   0, buf, sizeof(buf),
+				   &format_in_hex);
+		row_raw_format_int("\x23", 1,
+				   DATA_UNSIGNED, buf, sizeof(buf),
+				   &format_in_hex);
+
+		row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+				   0, buf, sizeof(buf),
+				   &format_in_hex);
+		row_raw_format_int("\x00\x00\x00\x00\x00\x01\x64\x62", 8,
+				   DATA_UNSIGNED, buf, sizeof(buf),
+				   &format_in_hex);
+	}
+
+	speedo_show(&speedo);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/row/row0sel.c b/storage/xtradb/row/row0sel.c
new file mode 100644
index 00000000000..2839d935167
--- /dev/null
+++ b/storage/xtradb/row/row0sel.c
@@ -0,0 +1,4964 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************//**
+@file row/row0sel.c
+Select
+
+Created 12/19/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0sel.h"
+
+#ifdef UNIV_NONINL
+#include "row0sel.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0trx.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "btr0sea.h"
+#include "mach0data.h"
+#include "que0que.h"
+#include "row0upd.h"
+#include "row0row.h"
+#include "row0vers.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "eval0eval.h"
+#include "pars0sym.h"
+#include "pars0pars.h"
+#include "row0mysql.h"
+#include "read0read.h"
+#include "buf0lru.h"
+#include "ha_prototypes.h"
+
+/* Maximum number of rows to prefetch; MySQL interface has another parameter */
+#define SEL_MAX_N_PREFETCH	16
+
+/* Number of rows fetched, after which to start prefetching; MySQL interface
+has another parameter */
+#define SEL_PREFETCH_LIMIT	1
+
+/* When a select has accessed about this many pages, it returns control back
+to que_run_threads: this is to allow canceling runaway queries */
+
+#define SEL_COST_LIMIT	100
+
+/* Flags for search shortcut */
+#define SEL_FOUND	0
+#define	SEL_EXHAUSTED	1
+#define SEL_RETRY	2
+
+/********************************************************************//**
+Checks whether a column stored in a secondary index record equals the
+prefix of the corresponding externally stored (BLOB) column of the
+clustered index record.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return	TRUE if the columns are equal */
+static
+ibool
+row_sel_sec_rec_is_for_blob(
+/*========================*/
+	ulint		mtype,		/*!< in: main type */
+	ulint		prtype,		/*!< in: precise type */
+	ulint		mbminlen,	/*!< in: minimum length of a
+					multi-byte character */
+	ulint		mbmaxlen,	/*!< in: maximum length of a
+					multi-byte character */
+	const byte*	clust_field,	/*!< in: the locally stored part of
+					the clustered index column, including
+					the BLOB pointer; the clustered
+					index record must be covered by
+					a lock or a page latch to protect it
+					against deletion (rollback or purge) */
+	ulint		clust_len,	/*!< in: length of clust_field */
+	const byte*	sec_field,	/*!< in: column in secondary index */
+	ulint		sec_len,	/*!< in: length of sec_field */
+	ulint		zip_size)	/*!< in: compressed page size, or 0 */
+{
+	byte	prefix[DICT_MAX_INDEX_COL_LEN];
+	ulint	prefix_len;
+
+	prefix_len = btr_copy_externally_stored_field_prefix(
+		prefix, sizeof(prefix), zip_size, clust_field, clust_len);
+
+	if (UNIV_UNLIKELY(prefix_len == 0)) {
+		/* A zero length means that the BLOB was being deleted
+		as the server crashed. No secondary index record should
+		refer to this clustered index record then, because
+		btr_free_externally_stored_field() is called after all
+		secondary index entries of the row have been purged. */
+		return(FALSE);
+	}
+
+	prefix_len = dtype_get_at_most_n_mbchars(prtype, mbminlen, mbmaxlen,
+		sec_len, prefix_len, (const char*) prefix);
+
+	return(!cmp_data_data(mtype, prtype,
+			      prefix, prefix_len, sec_field, sec_len));
+}
+
+/********************************************************************//**
+Checks whether the user-defined ordering columns of a secondary index
+record are alphabetically the same as the corresponding columns of a
+clustered index record, taking column prefixes into account.
+NOTE: the comparison is NOT done as a binary comparison, but character
+fields are compared with collation!
+@return TRUE if the secondary record is equal to the corresponding
+fields in the clustered record, when compared with collation;
+FALSE if not equal or if the clustered record has been marked for deletion */
+static
+ibool
+row_sel_sec_rec_is_for_clust_rec(
+/*=============================*/
+	const rec_t*	sec_rec,	/*!< in: secondary index record */
+	dict_index_t*	sec_index,	/*!< in: secondary index */
+	const rec_t*	clust_rec,	/*!< in: clustered index record;
+					must be protected by a lock or
+					a page latch against deletion
+					in rollback or purge */
+	dict_index_t*	clust_index)	/*!< in: clustered index */
+{
+	ulint		clust_offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint		sec_offsets_[REC_OFFS_SMALL_SIZE];
+	ulint*		clust_offs	= clust_offsets_;
+	ulint*		sec_offs	= sec_offsets_;
+	mem_heap_t*	heap		= NULL;
+	ibool		is_equal	= TRUE;
+	ulint		n_user_fields;
+	ulint		i;
+
+	rec_offs_init(clust_offsets_);
+	rec_offs_init(sec_offsets_);
+
+	if (rec_get_deleted_flag(clust_rec,
+				 dict_table_is_comp(clust_index->table))) {
+
+		/* A delete-marked clustered index record is not
+		visible in the read view. Besides, some of its
+		externally stored columns, if there are any,
+		may have already been purged. */
+		return(FALSE);
+	}
+
+	clust_offs = rec_get_offsets(clust_rec, clust_index, clust_offs,
+				     ULINT_UNDEFINED, &heap);
+	sec_offs = rec_get_offsets(sec_rec, sec_index, sec_offs,
+				   ULINT_UNDEFINED, &heap);
+
+	n_user_fields = dict_index_get_n_ordering_defined_by_user(sec_index);
+
+	/* Compare field by field; stop at the first one that differs. */
+	for (i = 0; is_equal && i < n_user_fields; i++) {
+		const dict_field_t*	ifield;
+		const dict_col_t*	col;
+		const byte*		clust_field;
+		const byte*		sec_field;
+		ulint			clust_pos;
+		ulint			clust_len;
+		ulint			sec_len;
+		ulint			cmp_len;
+		ibool			cmp_blob	= FALSE;
+
+		ifield = dict_index_get_nth_field(sec_index, i);
+		col = dict_field_get_col(ifield);
+		clust_pos = dict_col_get_clust_pos(col, clust_index);
+
+		clust_field = rec_get_nth_field(
+			clust_rec, clust_offs, clust_pos, &clust_len);
+		sec_field = rec_get_nth_field(sec_rec, sec_offs, i, &sec_len);
+
+		cmp_len = clust_len;
+
+		if (ifield->prefix_len > 0 && cmp_len != UNIV_SQL_NULL) {
+			const ulint	is_ext = rec_offs_nth_extern(
+				clust_offs, clust_pos);
+
+			if (is_ext) {
+				/* Leave the BLOB pointer out of the length. */
+				cmp_len -= BTR_EXTERN_FIELD_REF_SIZE;
+			}
+
+			cmp_len = dtype_get_at_most_n_mbchars(
+				col->prtype, col->mbminlen, col->mbmaxlen,
+				ifield->prefix_len, cmp_len,
+				(char*) clust_field);
+
+			/* If the locally stored prefix is shorter than the
+			secondary index field, compare against the BLOB. */
+			cmp_blob = is_ext && cmp_len < sec_len;
+		}
+
+		if (cmp_blob) {
+			is_equal = row_sel_sec_rec_is_for_blob(
+				col->mtype, col->prtype,
+				col->mbminlen, col->mbmaxlen,
+				clust_field, clust_len,
+				sec_field, sec_len,
+				dict_table_zip_size(clust_index->table));
+		} else {
+			is_equal = !cmp_data_data(col->mtype, col->prtype,
+						  clust_field, cmp_len,
+						  sec_field, sec_len);
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(is_equal);
+}
+
+/*********************************************************************//**
+Allocates a select node struct from a memory heap and initializes it as
+an open QUE_NODE_SELECT node that has no plans yet.
+@return	own: select node struct */
+UNIV_INTERN
+sel_node_t*
+sel_node_create(
+/*============*/
+	mem_heap_t*	heap)	/*!< in: memory heap where created */
+{
+	sel_node_t*	sel;
+
+	sel = mem_heap_alloc(heap, sizeof(*sel));
+	sel->plans = NULL;
+	sel->state = SEL_NODE_OPEN;
+	sel->common.type = QUE_NODE_SELECT;
+
+	return(sel);
+}
+
+/*********************************************************************//**
+Closes the persistent cursors and frees the old version heaps of the plans
+of a select node when a query graph is freed. Does not free the heap where
+the node was originally created. */
+UNIV_INTERN
+void
+sel_node_free_private(
+/*==================*/
+	sel_node_t*	node)	/*!< in: select node struct */
+{
+	ulint	n;
+
+	if (node->plans == NULL) {
+		return;
+	}
+
+	for (n = 0; n < node->n_tables; n++) {
+		plan_t*	tbl_plan = sel_node_get_nth_plan(node, n);
+		btr_pcur_close(&tbl_plan->pcur);
+		btr_pcur_close(&tbl_plan->clust_pcur);
+
+		if (tbl_plan->old_vers_heap != NULL) {
+			mem_heap_free(tbl_plan->old_vers_heap);
+		}
+	}
+}
+
+/*********************************************************************//**
+Evaluates each expression in the select list. If there are aggregate
+functions, their argument value is added to the aggregate total. */
+UNIV_INLINE
+void
+sel_eval_select_list(
+/*=================*/
+	sel_node_t*	node)	/*!< in: select node */
+{
+	que_node_t*	item;
+
+	/* Walk the select list in order. */
+	for (item = node->select_list;
+	     item != NULL;
+	     item = que_node_get_next(item)) {
+
+		eval_exp(item);
+	}
+}
+
+/*********************************************************************//**
+Copies the values of the select list expressions to the into-variables of
+SELECT ... INTO ..., pairing the two lists up in order. */
+UNIV_INLINE
+void
+sel_assign_into_var_values(
+/*=======================*/
+	sym_node_t*	var,	/*!< in: first variable in a list of variables */
+	sel_node_t*	node)	/*!< in: select node */
+{
+	que_node_t*	src;
+
+	if (var == NULL) {
+		/* There are no into-variables to assign. */
+		return;
+	}
+
+	for (src = node->select_list;
+	     var != NULL;
+	     src = que_node_get_next(src), var = que_node_get_next(var)) {
+
+		/* Debug check: the select list must not run out
+		before the list of variables does. */
+		ut_ad(src);
+
+		eval_node_copy_val(var->alias, src);
+	}
+}
+
+/*********************************************************************//**
+Sets the aggregate value totals in the select list of an aggregate type
+query to zero and marks the aggregate as not yet fetched. */
+UNIV_INLINE
+void
+sel_reset_aggregate_vals(
+/*=====================*/
+	sel_node_t*	node)	/*!< in: select node */
+{
+	func_node_t*	agg;
+
+	ut_ad(node->is_aggregate);
+
+	/* Every item in the select list gets the integer value 0. */
+	for (agg = node->select_list;
+	     agg != NULL;
+	     agg = que_node_get_next(agg)) {
+
+		eval_node_set_int_val(agg, 0);
+	}
+
+	node->aggregate_already_fetched = FALSE;
+}
+
+/*********************************************************************//**
+Takes copies of the input variable values when an explicit cursor is opened. */
+UNIV_INLINE
+void
+row_sel_copy_input_variable_vals(
+/*=============================*/
+	sel_node_t*	node)	/*!< in: select node */
+{
+	sym_node_t*	input;
+
+	for (input = UT_LIST_GET_FIRST(node->copy_variables);
+	     input != NULL;
+	     input = UT_LIST_GET_NEXT(col_var_list, input)) {
+
+		/* Copy the value of the aliased node into this node
+		and clear the indirection pointer of this node. */
+		eval_node_copy_val(input, input->alias);
+		input->indirection = NULL;
+	}
+}
+
+/*********************************************************************//**
+Fetches the values of the listed columns from an index record into the
+column nodes. Externally stored fields are always copied; other fields
+are copied only if the column has copy_val set, otherwise the value of
+the column node is made to point to the field inside the record. */
+static
+void
+row_sel_fetch_columns(
+/*==================*/
+	dict_index_t*	index,	/*!< in: record index */
+	const rec_t*	rec,	/*!< in: record in a clustered or non-clustered
+				index; must be protected by a page latch */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	sym_node_t*	column)	/*!< in: first column in a column list, or
+				NULL */
+{
+	ulint	index_type;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	index_type = dict_index_is_clust(index)
+		? SYM_CLUST_FIELD_NO : SYM_SEC_FIELD_NO;
+
+	for (; column != NULL;
+	     column = UT_LIST_GET_NEXT(col_var_list, column)) {
+
+		mem_heap_t*	ext_heap = NULL;
+		const byte*	field_data;
+		ulint		field_len;
+		ibool		copy_needed;
+		ulint		field_no;
+
+		field_no = column->field_nos[index_type];
+
+		if (field_no == ULINT_UNDEFINED) {
+			/* No field number for this kind of index: skip. */
+			continue;
+		}
+
+		if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets, field_no))) {
+
+			/* Copy an externally stored field to the
+			temporary heap, if possible. */
+
+			ext_heap = mem_heap_create(1);
+
+			field_data = btr_rec_copy_externally_stored_field(
+				rec, offsets,
+				dict_table_zip_size(index->table),
+				field_no, &field_len, ext_heap);
+
+			/* A NULL pointer would mean that the externally
+			stored field has not been written yet. Only
+			recv_recovery_rollback_active() or a
+			TRX_ISO_READ_UNCOMMITTED transaction should
+			ever see such a record. The InnoDB SQL parser
+			(the sole caller of this function) does not
+			implement READ UNCOMMITTED, and it is not
+			involved during rollback. */
+			ut_a(field_data);
+			ut_a(field_len != UNIV_SQL_NULL);
+
+			copy_needed = TRUE;
+		} else {
+			field_data = rec_get_nth_field(rec, offsets,
+						       field_no, &field_len);
+
+			copy_needed = column->copy_val;
+		}
+
+		/* Either copy the value for the column node or let
+		the node point straight to the fetched data. */
+		if (copy_needed) {
+			eval_node_copy_and_alloc_val(column, field_data,
+						     field_len);
+		} else {
+			dfield_set_data(que_node_get_val(column),
+					field_data, field_len);
+		}
+
+		if (UNIV_LIKELY_NULL(ext_heap)) {
+			mem_heap_free(ext_heap);
+		}
+	}
+}
+
+/*********************************************************************//**
+Allocates a prefetch buffer of SEL_MAX_N_PREFETCH empty slots for a column
+when prefetch is done for the first time. */
+static
+void
+sel_col_prefetch_buf_alloc(
+/*=======================*/
+	sym_node_t*	column)	/*!< in: symbol table node for a column */
+{
+	sel_buf_t*	slot;
+	sel_buf_t*	end;
+
+	ut_ad(que_node_get_type(column) == QUE_NODE_SYMBOL);
+
+	column->prefetch_buf = mem_alloc(SEL_MAX_N_PREFETCH
+					 * sizeof(sel_buf_t));
+	end = column->prefetch_buf + SEL_MAX_N_PREFETCH;
+
+	for (slot = column->prefetch_buf; slot < end; slot++) {
+		slot->val_buf_size = 0;
+		slot->data = NULL;
+	}
+}
+
+/*********************************************************************//**
+Frees the dynamically allocated memory for the data stored in the slots
+of a column prefetch buffer.
+NOTE(review): the prefetch_buf array itself is not freed here, although
+the parameter is marked "own" - confirm that the caller releases it. */
+UNIV_INTERN
+void
+sel_col_prefetch_buf_free(
+/*======================*/
+	sel_buf_t*	prefetch_buf)	/*!< in, own: prefetch buffer */
+{
+	sel_buf_t*	slot	= prefetch_buf;
+	sel_buf_t*	end	= prefetch_buf + SEL_MAX_N_PREFETCH;
+
+	for (; slot < end; slot++) {
+		/* Only slots with a positive val_buf_size are freed. */
+		if (slot->val_buf_size > 0) {
+			mem_free(slot->data);
+		}
+	}
+}
+
+/*********************************************************************//**
+Pops a prefetched, cached row: swaps the column values stored in the column
+prefetch buffers with the val fields in the column nodes. */
+static
+void
+sel_pop_prefetched_row(
+/*===================*/
+	plan_t*	plan)	/*!< in: plan node for a table */
+{
+	sym_node_t*	col;
+
+	ut_ad(plan->n_rows_prefetched > 0);
+
+	for (col = UT_LIST_GET_FIRST(plan->columns);
+	     col != NULL;
+	     col = UT_LIST_GET_NEXT(col_var_list, col)) {
+
+		dfield_t*	val;
+		sel_buf_t*	slot;
+		byte*		cached_data;
+		ulint		cached_len;
+		ulint		cached_size;
+
+		val = que_node_get_val(col);
+
+		if (!col->copy_val) {
+			/* No value was pushed for this column,
+			so there is nothing to pop either */
+
+			ut_ad(!col->prefetch_buf);
+			ut_ad(que_node_get_val_buf_size(col) == 0);
+			ut_d(dfield_set_null(val));
+
+			continue;
+		}
+
+		ut_ad(col->prefetch_buf);
+		ut_ad(!dfield_is_ext(val));
+
+		slot = col->prefetch_buf + plan->first_prefetched;
+
+		cached_data = slot->data;
+		cached_len = slot->len;
+		cached_size = slot->val_buf_size;
+
+		/* The allocated memory for column values must be
+		tracked so that it can be freed later: therefore
+		the contents of the slot and of val are swapped */
+
+		slot->data = dfield_get_data(val);
+		slot->len = dfield_get_len(val);
+		slot->val_buf_size = que_node_get_val_buf_size(col);
+
+		dfield_set_data(val, cached_data, cached_len);
+		que_node_set_val_buf_size(col, cached_size);
+	}
+
+	/* One row less in the cache; the next pop reads the next slot. */
+	plan->first_prefetched++;
+	plan->n_rows_prefetched--;
+}
+
+/*********************************************************************//**
+Pushes a row to the prefetch cache: swaps the val fields in the column
+nodes with the next unused slots of the column prefetch buffers. */
+UNIV_INLINE
+void
+sel_push_prefetched_row(
+/*====================*/
+	plan_t*	plan)	/*!< in: plan node for a table */
+{
+	sym_node_t*	col;
+	ulint		pos;
+
+	pos = plan->n_rows_prefetched;
+
+	if (pos == 0) {
+		/* The cache is empty: start filling it from slot 0. */
+		plan->first_prefetched = 0;
+	} else {
+		/* By convention, new rows are pushed only after
+		the prefetch stack has been emptied: */
+
+		ut_ad(plan->first_prefetched == 0);
+	}
+
+	plan->n_rows_prefetched++;
+
+	ut_ad(pos < SEL_MAX_N_PREFETCH);
+
+	for (col = UT_LIST_GET_FIRST(plan->columns);
+	     col != NULL;
+	     col = UT_LIST_GET_NEXT(col_var_list, col)) {
+
+		sel_buf_t*	slot;
+		dfield_t*	val;
+		byte*		val_data;
+		ulint		val_len;
+		ulint		val_size;
+
+		if (!col->copy_val) {
+			/* There is no sense to push pointers to database
+			page fields when we do not keep latch on the page! */
+
+			continue;
+		}
+
+		if (!col->prefetch_buf) {
+			/* First push for this column: allocate the buffer */
+
+			sel_col_prefetch_buf_alloc(col);
+		}
+
+		slot = col->prefetch_buf + pos;
+
+		val = que_node_get_val(col);
+
+		val_data = dfield_get_data(val);
+		val_len = dfield_get_len(val);
+		val_size = que_node_get_val_buf_size(col);
+
+		/* The allocated memory for column values must be
+		tracked so that it can be freed later: therefore
+		the contents of the slot and of val are swapped */
+
+		dfield_set_data(val, slot->data, slot->len);
+		que_node_set_val_buf_size(col, slot->val_buf_size);
+
+		slot->data = val_data;
+		slot->len = val_len;
+		slot->val_buf_size = val_size;
+	}
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers(
+/*====================*/
+	read_view_t*	read_view,	/*!< in: read view */
+	dict_index_t*	index,		/*!< in: index of rec */
+	rec_t*		rec,		/*!< in: record in a clustered index */
+	ulint**		offsets,	/*!< in/out: offsets returned by
+					rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
+					the offsets are allocated */
+	mem_heap_t**	old_vers_heap,	/*!< in/out: old version heap to use;
+					created here if *old_vers_heap is NULL */
+	rec_t**		old_vers,	/*!< out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	/* Start from an empty old version heap. */
+	if (*old_vers_heap == NULL) {
+		*old_vers_heap = mem_heap_create(512);
+	} else {
+		/* Reuse the heap of an earlier call. */
+		mem_heap_empty(*old_vers_heap);
+	}
+
+	return(row_vers_build_for_consistent_read(
+		       rec, mtr, index, offsets, read_view, offset_heap,
+		       *old_vers_heap, old_vers));
+}
+
+/*********************************************************************//**
+Builds the last committed version of a clustered index record for a
+semi-consistent read, using the old version heap of prebuilt.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_committed_vers_for_mysql(
+/*===================================*/
+	dict_index_t*	clust_index,	/*!< in: clustered index */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: record in a clustered index */
+	ulint**		offsets,	/*!< in/out: offsets returned by
+					rec_get_offsets(rec, clust_index) */
+	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
+					the offsets are allocated */
+	const rec_t**	old_vers,	/*!< out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	/* Reuse the old version heap of prebuilt if there is one,
+	emptying it first; otherwise create it. */
+
+	if (prebuilt->old_vers_heap == NULL) {
+		prebuilt->old_vers_heap = mem_heap_create(200);
+	} else {
+		mem_heap_empty(prebuilt->old_vers_heap);
+	}
+
+	return(row_vers_build_for_semi_consistent_read(
+		       rec, mtr, clust_index, offsets, offset_heap,
+		       prebuilt->old_vers_heap, old_vers));
+}
+
+/*********************************************************************//**
+Evaluates the end conditions of a plan, i.e., the conditions that tell
+when the index segment being searched has been exhausted. Evaluation
+stops at the first condition that does not hold.
+@return	TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_end_conds(
+/*===================*/
+	plan_t*	plan)	/*!< in: plan for the table; the column values must
+			already have been retrieved and the right sides of
+			comparisons evaluated */
+{
+	func_node_t*	cmp;
+
+	/* Every entry of end_conds compares a column to an expression */
+
+	for (cmp = UT_LIST_GET_FIRST(plan->end_conds);
+	     cmp != NULL;
+	     cmp = UT_LIST_GET_NEXT(cond_list, cmp)) {
+
+		/* The left side is the column: fetch its value if there
+		is an indirection */
+		eval_sym(cmp->args);
+
+		/* A single failing comparison rejects the row */
+		if (!eval_cmp(cmp)) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Evaluates the conditions in plan->other_conds one by one and stops at the
+first one whose value is not true.
+@return	TRUE if row passed the tests */
+UNIV_INLINE
+ibool
+row_sel_test_other_conds(
+/*=====================*/
+	plan_t*	plan)	/*!< in: plan for the table; the column values must
+			already have been retrieved */
+{
+	func_node_t*	test;
+
+	for (test = UT_LIST_GET_FIRST(plan->other_conds);
+	     test != NULL;
+	     test = UT_LIST_GET_NEXT(cond_list, test)) {
+
+		/* Evaluate the condition, then read back its boolean
+		value */
+		eval_exp(test);
+
+		if (!eval_node_get_ibool_val(test)) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking.
+
+On DB_SUCCESS, *out_rec is either the record to use (the clustered index
+record itself, or an older version of it built for the read view) or NULL,
+which tells the caller that there is no row to return for rec. When a
+record is returned, the columns listed in plan->columns have been fetched
+from it. On any other return value *out_rec is NULL.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_sel_get_clust_rec(
+/*==================*/
+	sel_node_t*	node,	/*!< in: select_node */
+	plan_t*		plan,	/*!< in: plan node for table */
+	rec_t*		rec,	/*!< in: record in a non-clustered index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	rec_t**		out_rec,/*!< out: clustered record or an old version of
+				it, NULL if the old version did not exist
+				in the read view, i.e., it was a fresh
+				inserted version */
+	mtr_t*		mtr)	/*!< in: mtr used to get access to the
+				non-clustered record; the same mtr is used to
+				access the clustered index */
+{
+	dict_index_t*	index;
+	rec_t*		clust_rec;
+	rec_t*		old_vers;
+	ulint		err;
+	/* Stays NULL unless one of the calls below that receive &heap
+	allocates it; it is freed at err_exit if it is non-NULL. */
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	/* Default result: no clustered record. Every jump to func_exit or
+	err_exit that happens before the final assignment near the end of
+	the function leaves this value in place. */
+
+	*out_rec = NULL;
+
+	/* Compute the offsets of the secondary index record, using the
+	index that plan->pcur is positioned on */
+
+	offsets = rec_get_offsets(rec,
+				  btr_pcur_get_btr_cur(&plan->pcur)->index,
+				  offsets, ULINT_UNDEFINED, &heap);
+
+	/* Fill in plan->clust_ref from rec; it is used below as the search
+	tuple for the clustered index */
+
+	row_build_row_ref_fast(plan->clust_ref, plan->clust_map, rec, offsets);
+
+	/* The first index of the table is the one searched as the
+	clustered index */
+
+	index = dict_table_get_first_index(plan->table);
+
+	btr_pcur_open_with_no_init(index, plan->clust_ref, PAGE_CUR_LE,
+				   BTR_SEARCH_LEAF, &plan->clust_pcur,
+				   0, mtr);
+
+	clust_rec = btr_pcur_get_rec(&(plan->clust_pcur));
+
+	/* Note: only if the search ends up on a non-infimum record is the
+	low_match value the real match to the search tuple */
+
+	if (!page_rec_is_user_rec(clust_rec)
+	    || btr_pcur_get_low_match(&(plan->clust_pcur))
+	    < dict_index_get_n_unique(index)) {
+
+		/* No matching clustered index record was found: this is
+		only accepted for a delete-marked rec in a consistent
+		read, which the two assertions below enforce */
+
+		ut_a(rec_get_deleted_flag(rec,
+					  dict_table_is_comp(plan->table)));
+		ut_a(node->read_view);
+
+		/* In a rare case it is possible that no clust rec is found
+		for a delete-marked secondary index record: if in row0umod.c
+		in row_undo_mod_remove_clust_low() we have already removed
+		the clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case we know that the
+		clustered index record did not exist in the read view of
+		trx. */
+
+		goto func_exit;
+	}
+
+	/* From here on, offsets describes clust_rec instead of rec */
+
+	offsets = rec_get_offsets(clust_rec, index, offsets,
+				  ULINT_UNDEFINED, &heap);
+
+	if (!node->read_view) {
+		/* Try to place a lock on the index record */
+
+		/* If innodb_locks_unsafe_for_binlog option is used
+		or this session is using READ COMMITTED isolation level
+		we lock only the record, i.e., next-key locking is
+		not used. */
+		ulint	lock_type;
+		trx_t*	trx;
+
+		trx = thr_get_trx(thr);
+
+		if (srv_locks_unsafe_for_binlog
+		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+			lock_type = LOCK_REC_NOT_GAP;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		err = lock_clust_rec_read_check_and_lock(
+			0, btr_pcur_get_block(&plan->clust_pcur),
+			clust_rec, index, offsets,
+			node->row_lock_mode, lock_type, thr);
+
+		/* Both success codes continue to the column fetch below;
+		any other value is returned to the caller unchanged */
+
+		switch (err) {
+		case DB_SUCCESS:
+		case DB_SUCCESS_LOCKED_REC:
+			/* Declare the variable uninitialized in Valgrind.
+			It should be set to DB_SUCCESS at func_exit. */
+			UNIV_MEM_INVALID(&err, sizeof err);
+			break;
+		default:
+			goto err_exit;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		/* old_vers stays NULL unless an older version is built
+		below; the test after this block relies on that */
+
+		old_vers = NULL;
+
+		if (!lock_clust_rec_cons_read_sees(clust_rec, index, offsets,
+						   node->read_view)) {
+
+			err = row_sel_build_prev_vers(
+				node->read_view, index, clust_rec,
+				&offsets, &heap, &plan->old_vers_heap,
+				&old_vers, mtr);
+
+			if (err != DB_SUCCESS) {
+
+				goto err_exit;
+			}
+
+			clust_rec = old_vers;
+
+			if (clust_rec == NULL) {
+				/* No version of the record exists in the
+				read view: report success with *out_rec
+				still NULL */
+				goto func_exit;
+			}
+		}
+
+		/* If we had to go to an earlier version of row or the
+		secondary index record is delete marked, then it may be that
+		the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		such row because in our snapshot rec would not have existed.
+		Remember that from rec we cannot see directly which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query where we want to fetch all rows where
+		the secondary index value is in some interval would return
+		a wrong result if we would not drop rows which we come to
+		visit through secondary index records that would not really
+		exist in our snapshot. */
+
+		if ((old_vers
+		     || rec_get_deleted_flag(rec, dict_table_is_comp(
+						     plan->table)))
+		    && !row_sel_sec_rec_is_for_clust_rec(rec, plan->index,
+							 clust_rec, index)) {
+			goto func_exit;
+		}
+	}
+
+	/* Fetch the columns needed in test conditions.  The clustered
+	index record is protected by a page latch that was acquired
+	when plan->clust_pcur was positioned.  The latch will not be
+	released until mtr_commit(mtr). */
+
+	ut_ad(!rec_get_deleted_flag(clust_rec, rec_offs_comp(offsets)));
+	row_sel_fetch_columns(index, clust_rec, offsets,
+			      UT_LIST_GET_FIRST(plan->columns));
+	*out_rec = clust_rec;
+func_exit:
+	/* Success exit: reached by falling through with a record in
+	*out_rec, or by a jump from above with *out_rec still NULL */
+	err = DB_SUCCESS;
+err_exit:
+	/* Common cleanup; err already holds the value to return */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/*********************************************************************//**
+Sets a lock on a record, dispatching to the clustered or the secondary
+index locking routine according to the type of index. If the transaction
+already holds more than 10000 locks and the buffer pool is running out,
+no lock is requested and DB_LOCK_TABLE_FULL is returned instead.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+UNIV_INLINE
+enum db_err
+sel_set_rec_lock(
+/*=============*/
+	const buf_block_t*	block,	/*!< in: buffer block of rec */
+	const rec_t*		rec,	/*!< in: record */
+	dict_index_t*		index,	/*!< in: index */
+	const ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	ulint			mode,	/*!< in: lock mode */
+	ulint			type,	/*!< in: LOCK_ORDINARY, LOCK_GAP, or
+					LOCK_REC_NOT_GAP */
+	que_thr_t*		thr)	/*!< in: query thread */
+{
+	trx_t*	trx = thr_get_trx(thr);
+
+	/* The buffer pool check is made only for a transaction that
+	already owns a long lock list */
+
+	if (UT_LIST_GET_LEN(trx->trx_locks) > 10000
+	    && buf_LRU_buf_pool_running_out()) {
+
+		return(DB_LOCK_TABLE_FULL);
+	}
+
+	if (dict_index_is_clust(index)) {
+
+		return(lock_clust_rec_read_check_and_lock(
+			       0, block, rec, index, offsets, mode, type,
+			       thr));
+	}
+
+	return(lock_sec_rec_read_check_and_lock(
+		       0, block, rec, index, offsets, mode, type, thr));
+}
+
+/*********************************************************************//**
+Opens plan->pcur on plan->index. If the plan has a search tuple, the tuple
+fields are first filled in from the evaluated tuple expressions and the
+cursor is opened with plan->mode; otherwise the cursor is opened at one
+side of the index, selected by plan->asc. On return plan->pcur_is_open is
+TRUE. */
+static
+void
+row_sel_open_pcur(
+/*==============*/
+	plan_t*		plan,		/*!< in: table plan */
+	ibool		search_latch_locked,
+					/*!< in: TRUE if the thread currently
+					has the search latch locked in
+					s-mode */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	func_node_t*	end_cond;
+	ulint		n_fields;
+	ulint		field_no;
+	/* RW_S_LATCH or 0 */
+	ulint		has_search_latch = search_latch_locked
+		? RW_S_LATCH : 0;
+
+	/* Calculate the value of the search tuple: the exact match columns
+	get their expressions evaluated when we evaluate the right sides of
+	end_conds */
+
+	for (end_cond = UT_LIST_GET_FIRST(plan->end_conds);
+	     end_cond != NULL;
+	     end_cond = UT_LIST_GET_NEXT(cond_list, end_cond)) {
+
+		eval_exp(que_node_get_next(end_cond->args));
+	}
+
+	if (plan->tuple == NULL) {
+		/* No search tuple: open the cursor to the start or the end
+		of the index (FALSE: no init) */
+
+		btr_pcur_open_at_index_side(plan->asc, plan->index,
+					    BTR_SEARCH_LEAF,
+					    &(plan->pcur), FALSE, mtr);
+	} else {
+		n_fields = dtuple_get_n_fields(plan->tuple);
+
+		if (n_fields > plan->n_exact_match) {
+			/* The last field is a non-exact match field, which
+			must be evaluated separately */
+
+			eval_exp(plan->tuple_exps[n_fields - 1]);
+		}
+
+		/* Copy the value of each tuple expression to the
+		corresponding field of the search tuple */
+
+		for (field_no = 0; field_no < n_fields; field_no++) {
+
+			dfield_copy_data(
+				dtuple_get_nth_field(plan->tuple, field_no),
+				que_node_get_val(plan->tuple_exps[field_no]));
+		}
+
+		/* Open pcur to the index */
+
+		btr_pcur_open_with_no_init(plan->index, plan->tuple,
+					   plan->mode, BTR_SEARCH_LEAF,
+					   &plan->pcur, has_search_latch,
+					   mtr);
+	}
+
+	/* The plan cursor state must be the reset state at this point */
+
+	ut_ad(plan->n_rows_prefetched == 0);
+	ut_ad(plan->n_rows_fetched == 0);
+	ut_ad(plan->cursor_at_end == FALSE);
+
+	plan->pcur_is_open = TRUE;
+}
+
+/*********************************************************************//**
+Restores a stored pcur position to a table index and tells the caller
+whether the record under the restored cursor has already been dealt with.
+@return TRUE if the cursor should be moved to the next record after we
+return from this function (moved to the previous, in the case of a
+descending cursor) without processing again the current cursor
+record */
+static
+ibool
+row_sel_restore_pcur_pos(
+/*=====================*/
+	plan_t*		plan,	/*!< in: table plan */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint	stored_rel_pos;
+	ibool	same_rec;
+
+	ut_ad(!plan->cursor_at_end);
+
+	/* Read the relative position that was stored with the cursor, and
+	only then restore the cursor */
+
+	stored_rel_pos = btr_pcur_get_rel_pos(&(plan->pcur));
+
+	same_rec = btr_pcur_restore_position(BTR_SEARCH_LEAF,
+					     &(plan->pcur), mtr);
+
+	if (stored_rel_pos == BTR_PCUR_ON) {
+
+		/* BTR_PCUR_ON: btr_pcur_restore_position placed the cursor
+		on the last record LESS or EQUAL to the old stored user
+		record.
+
+		(a) If same_rec is TRUE, the cursor is on the old record
+		again: it must be skipped exactly when
+		plan->stored_cursor_rec_processed is TRUE, in either scan
+		direction.
+
+		(b) If same_rec is FALSE, the cursor is now on a record
+		less than the old user record. A cursor traveling upwards
+		must move to the next record; a cursor traveling downwards
+		need not move to the previous record. */
+
+		if (same_rec) {
+
+			return(plan->stored_cursor_rec_processed);
+		}
+
+		return(plan->asc ? TRUE : FALSE);
+	}
+
+	if (plan->asc) {
+
+		/* The cursor is traveling upwards, and the stored position
+		is not BTR_PCUR_ON:
+
+		(1) BTR_PCUR_BEFORE: this is not allowed, as we did not
+		have a lock yet on the successor of the page infimum;
+		(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the
+		cursor on the first record GREATER than the predecessor of
+		a page supremum; we have not yet processed the cursor
+		record: no need to move the cursor to the next record. */
+
+		ut_ad(stored_rel_pos == BTR_PCUR_AFTER
+		      || stored_rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+		return(FALSE);
+	}
+
+	/* The cursor is traveling downwards, and the stored position is
+	not BTR_PCUR_ON:
+
+	(1) BTR_PCUR_BEFORE: btr_pcur_restore_position placed the cursor on
+	the last record LESS than the successor of a page infimum; we have
+	not processed the cursor record: no need to move the cursor;
+	(2) BTR_PCUR_AFTER: btr_pcur_restore_position placed the cursor on
+	the first record GREATER than the predecessor of a page supremum; we
+	have processed the cursor record: we should move the cursor to the
+	previous record. */
+
+	if (stored_rel_pos == BTR_PCUR_BEFORE
+	    || stored_rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE) {
+
+		return(FALSE);
+	}
+
+	ut_ad(stored_rel_pos == BTR_PCUR_AFTER
+	      || stored_rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE);
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Puts the cursor bookkeeping of a plan back to the closed state: the row
+counters are zeroed and the open and at-end flags are cleared. */
+UNIV_INLINE
+void
+plan_reset_cursor(
+/*==============*/
+	plan_t*	plan)	/*!< in/out: plan whose cursor state is reset */
+{
+	/* Row counters */
+	plan->n_rows_prefetched = 0;
+	plan->n_rows_fetched = 0;
+
+	/* State flags */
+	plan->cursor_at_end = FALSE;
+	plan->pcur_is_open = FALSE;
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). SEL_RETRY is returned when
+the cursor did not end up on a user record, or when the record under the
+cursor is not visible as such in the read view of the select node.
+@return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut(
+/*========================*/
+	sel_node_t*	node,	/*!< in: select node for a consistent read */
+	plan_t*		plan,	/*!< in: plan for a unique search in clustered
+				index */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_index_t*	index		= plan->index;
+	rec_t*		rec;
+	ibool		visible;
+	ulint		ret;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(node->read_view);
+	ut_ad(plan->unique_search);
+	ut_ad(!plan->must_get_clust);
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&btr_search_latch, RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* TRUE: the search latch is held by this thread */
+	row_sel_open_pcur(plan, TRUE, mtr);
+
+	rec = btr_pcur_get_rec(&(plan->pcur));
+
+	/* The two early returns below happen before heap can have been
+	allocated, so there is nothing to free on them */
+
+	if (!page_rec_is_user_rec(rec)) {
+
+		return(SEL_RETRY);
+	}
+
+	ut_ad(plan->mode == PAGE_CUR_GE);
+
+	/* As the cursor is now placed on a user record after a search with
+	the mode PAGE_CUR_GE, the up_match field in the cursor tells how many
+	fields in the user record matched to the search tuple */
+
+	if (btr_pcur_get_up_match(&(plan->pcur)) < plan->n_exact_match) {
+
+		return(SEL_EXHAUSTED);
+	}
+
+	/* This is a non-locking consistent read: check whether the record
+	can be used as it is in the read view */
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (dict_index_is_clust(index)) {
+		visible = lock_clust_rec_cons_read_sees(rec, index, offsets,
+							node->read_view);
+	} else {
+		visible = lock_sec_rec_cons_read_sees(rec, node->read_view);
+	}
+
+	if (!visible) {
+		/* The shortcut does not build a previous version: let the
+		caller retry */
+
+		ret = SEL_RETRY;
+
+	} else if (rec_get_deleted_flag(rec,
+					dict_table_is_comp(plan->table))) {
+		/* The record is delete marked */
+
+		ret = SEL_EXHAUSTED;
+	} else {
+		/* Fetch the columns needed in test conditions.  The index
+		record is protected by a page latch that was acquired when
+		plan->pcur was positioned.  The latch will not be released
+		until mtr_commit(mtr). */
+
+		row_sel_fetch_columns(index, rec, offsets,
+				      UT_LIST_GET_FIRST(plan->columns));
+
+		/* Test the rest of search conditions */
+
+		if (!row_sel_test_other_conds(plan)) {
+
+			ret = SEL_EXHAUSTED;
+		} else {
+			ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+			plan->n_rows_fetched++;
+			ret = SEL_FOUND;
+		}
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(ret);
+}
+
+/*********************************************************************//**
+Performs a select step.
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_sel(
+/*====*/
+	sel_node_t*	node,	/*!< in: select node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	dict_index_t*	index;
+	plan_t*		plan;
+	mtr_t		mtr;
+	ibool		moved;
+	rec_t*		rec;
+	rec_t*		old_vers;
+	rec_t*		clust_rec;
+	ibool		search_latch_locked;
+	ibool		consistent_read;
+
+	/* The following flag becomes TRUE when we are doing a
+	consistent read from a non-clustered index and we must look
+	at the clustered index to find out the previous delete mark
+	state of the non-clustered record: */
+
+	ibool		cons_read_requires_clust_rec	= FALSE;
+	ulint		cost_counter			= 0;
+	ibool		cursor_just_opened;
+	ibool		must_go_to_next;
+	ibool		mtr_has_extra_clust_latch	= FALSE;
+	/* TRUE if the search was made using
+	a non-clustered index, and we had to
+	access the clustered record: now &mtr
+	contains a clustered index latch, and
+	&mtr must be committed before we move
+	to the next non-clustered record */
+	ulint		found_flag;
+	ulint		err;
+	mem_heap_t*	heap				= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets				= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_ad(thr->run_node == node);
+
+	search_latch_locked = FALSE;
+
+	if (node->read_view) {
+		/* In consistent reads, we try to do with the hash index and
+		not to use the buffer page get. This is to reduce memory bus
+		load resulting from semaphore operations. The search latch
+		will be s-locked when we access an index with a unique search
+		condition, but not locked when we access an index with a
+		less selective search condition. */
+
+		consistent_read = TRUE;
+	} else {
+		consistent_read = FALSE;
+	}
+
+table_loop:
+	/* TABLE LOOP
+	----------
+	This is the outer major loop in calculating a join. We come here when
+	node->fetch_table changes, and after adding a row to aggregate totals
+	and, of course, when this function is called. */
+
+	ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+	plan = sel_node_get_nth_plan(node, node->fetch_table);
+	index = plan->index;
+
+	if (plan->n_rows_prefetched > 0) {
+		sel_pop_prefetched_row(plan);
+
+		goto next_table_no_mtr;
+	}
+
+	if (plan->cursor_at_end) {
+		/* The cursor has already reached the result set end: no more
+		rows to process for this table cursor, as also the prefetch
+		stack was empty */
+
+		ut_ad(plan->pcur_is_open);
+
+		goto table_exhausted_no_mtr;
+	}
+
+	/* Open a cursor to index, or restore an open cursor position */
+
+	mtr_start(&mtr);
+
+	if (consistent_read && plan->unique_search && !plan->pcur_is_open
+	    && !plan->must_get_clust
+	    && !plan->table->big_rows) {
+		if (!search_latch_locked) {
+			rw_lock_s_lock(&btr_search_latch);
+
+			search_latch_locked = TRUE;
+		} else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
+
+			/* There is an x-latch request waiting: release the
+			s-latch for a moment; as an s-latch here is often
+			kept for some 10 searches before being released,
+			a waiting x-latch request would block other threads
+			from acquiring an s-latch for a long time, lowering
+			performance significantly in multiprocessors. */
+
+			rw_lock_s_unlock(&btr_search_latch);
+			rw_lock_s_lock(&btr_search_latch);
+		}
+
+		found_flag = row_sel_try_search_shortcut(node, plan, &mtr);
+
+		if (found_flag == SEL_FOUND) {
+
+			goto next_table;
+
+		} else if (found_flag == SEL_EXHAUSTED) {
+
+			goto table_exhausted;
+		}
+
+		ut_ad(found_flag == SEL_RETRY);
+
+		plan_reset_cursor(plan);
+
+		mtr_commit(&mtr);
+		mtr_start(&mtr);
+	}
+
+	if (search_latch_locked) {
+		rw_lock_s_unlock(&btr_search_latch);
+
+		search_latch_locked = FALSE;
+	}
+
+	if (!plan->pcur_is_open) {
+		/* Evaluate the expressions to build the search tuple and
+		open the cursor */
+
+		row_sel_open_pcur(plan, search_latch_locked, &mtr);
+
+		cursor_just_opened = TRUE;
+
+		/* A new search was made: increment the cost counter */
+		cost_counter++;
+	} else {
+		/* Restore pcur position to the index */
+
+		must_go_to_next = row_sel_restore_pcur_pos(plan, &mtr);
+
+		cursor_just_opened = FALSE;
+
+		if (must_go_to_next) {
+			/* We have already processed the cursor record: move
+			to the next */
+
+			goto next_rec;
+		}
+	}
+
+rec_loop:
+	/* RECORD LOOP
+	-----------
+	In this loop we use pcur and try to fetch a qualifying row, and
+	also fill the prefetch buffer for this table if n_rows_fetched has
+	exceeded a threshold. While we are inside this loop, the following
+	holds:
+	(1) &mtr is started,
+	(2) pcur is positioned and open.
+
+	NOTE that if cursor_just_opened is TRUE here, it means that we came
+	to this point right after row_sel_open_pcur. */
+
+	ut_ad(mtr_has_extra_clust_latch == FALSE);
+
+	rec = btr_pcur_get_rec(&(plan->pcur));
+
+	/* PHASE 1: Set a lock if specified */
+
+	if (!node->asc && cursor_just_opened
+	    && !page_rec_is_supremum(rec)) {
+
+		/* When we open a cursor for a descending search, we must set
+		a next-key lock on the successor record: otherwise it would
+		be possible to insert new records next to the cursor position,
+		and it might be that these new records should appear in the
+		search result set, resulting in the phantom problem. */
+
+		if (!consistent_read) {
+
+			/* If innodb_locks_unsafe_for_binlog option is used
+			or this session is using READ COMMITTED isolation
+			level, we lock only the record, i.e., next-key
+			locking is not used. */
+
+			rec_t*	next_rec = page_rec_get_next(rec);
+			ulint	lock_type;
+			trx_t*	trx;
+
+			trx = thr_get_trx(thr);
+
+			offsets = rec_get_offsets(next_rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+
+			if (srv_locks_unsafe_for_binlog
+			    || trx->isolation_level
+			    <= TRX_ISO_READ_COMMITTED) {
+
+				if (page_rec_is_supremum(next_rec)) {
+
+					goto skip_lock;
+				}
+
+				lock_type = LOCK_REC_NOT_GAP;
+			} else {
+				lock_type = LOCK_ORDINARY;
+			}
+
+			err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+					       next_rec, index, offsets,
+					       node->row_lock_mode,
+					       lock_type, thr);
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+				err = DB_SUCCESS;
+			case DB_SUCCESS:
+				break;
+			default:
+				/* Note that in this case we will store in pcur
+				the PREDECESSOR of the record we are waiting
+				the lock for */
+				goto lock_wait_or_error;
+			}
+		}
+	}
+
+skip_lock:
+	if (page_rec_is_infimum(rec)) {
+
+		/* The infimum record on a page cannot be in the result set,
+		and neither can a record lock be placed on it: we skip such
+		a record. We also increment the cost counter as we may have
+		processed yet another page of index. */
+
+		cost_counter++;
+
+		goto next_rec;
+	}
+
+	if (!consistent_read) {
+		/* Try to place a lock on the index record */
+
+		/* If innodb_locks_unsafe_for_binlog option is used
+		or this session is using READ COMMITTED isolation level,
+		we lock only the record, i.e., next-key locking is
+		not used. */
+
+		ulint	lock_type;
+		trx_t*	trx;
+
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+
+		trx = thr_get_trx(thr);
+
+		if (srv_locks_unsafe_for_binlog
+		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED) {
+
+			if (page_rec_is_supremum(rec)) {
+
+				goto next_rec;
+			}
+
+			lock_type = LOCK_REC_NOT_GAP;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		err = sel_set_rec_lock(btr_pcur_get_block(&plan->pcur),
+				       rec, index, offsets,
+				       node->row_lock_mode, lock_type, thr);
+
+		switch (err) {
+		case DB_SUCCESS_LOCKED_REC:
+			err = DB_SUCCESS;
+		case DB_SUCCESS:
+			break;
+		default:
+			goto lock_wait_or_error;
+		}
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		/* A page supremum record cannot be in the result set: skip
+		it now when we have placed a possible lock on it */
+
+		goto next_rec;
+	}
+
+	ut_ad(page_rec_is_user_rec(rec));
+
+	if (cost_counter > SEL_COST_LIMIT) {
+
+		/* Now that we have placed the necessary locks, we can stop
+		for a while and store the cursor position; NOTE that if we
+		would store the cursor position BEFORE placing a record lock,
+		it might happen that the cursor would jump over some records
+		that another transaction could meanwhile insert adjacent to
+		the cursor: this would result in the phantom problem. */
+
+		goto stop_for_a_while;
+	}
+
+	/* PHASE 2: Check a mixed index mix id if needed */
+
+	if (plan->unique_search && cursor_just_opened) {
+
+		ut_ad(plan->mode == PAGE_CUR_GE);
+
+		/* As the cursor is now placed on a user record after a search
+		with the mode PAGE_CUR_GE, the up_match field in the cursor
+		tells how many fields in the user record matched to the search
+		tuple */
+
+		if (btr_pcur_get_up_match(&(plan->pcur))
+		    < plan->n_exact_match) {
+			goto table_exhausted;
+		}
+
+		/* Ok, no need to test end_conds or mix id */
+
+	}
+
+	/* We are ready to look at a possible new index entry in the result
+	set: the cursor is now placed on a user record */
+
+	/* PHASE 3: Get previous version in a consistent read */
+
+	cons_read_requires_clust_rec = FALSE;
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (consistent_read) {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		if (dict_index_is_clust(index)) {
+
+			if (!lock_clust_rec_cons_read_sees(rec, index, offsets,
+							   node->read_view)) {
+
+				err = row_sel_build_prev_vers(
+					node->read_view, index, rec,
+					&offsets, &heap, &plan->old_vers_heap,
+					&old_vers, &mtr);
+
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+
+				if (old_vers == NULL) {
+					/* The record does not exist
+					in our read view. Skip it, but
+					first attempt to determine
+					whether the index segment we
+					are searching through has been
+					exhausted. */
+
+					offsets = rec_get_offsets(
+						rec, index, offsets,
+						ULINT_UNDEFINED, &heap);
+
+					/* Fetch the columns needed in
+					test conditions. The clustered
+					index record is protected by a
+					page latch that was acquired
+					by row_sel_open_pcur() or
+					row_sel_restore_pcur_pos().
+					The latch will not be released
+					until mtr_commit(mtr). */
+
+					row_sel_fetch_columns(
+						index, rec, offsets,
+						UT_LIST_GET_FIRST(
+							plan->columns));
+
+					if (!row_sel_test_end_conds(plan)) {
+
+						goto table_exhausted;
+					}
+
+					goto next_rec;
+				}
+
+				rec = old_vers;
+			}
+		} else if (!lock_sec_rec_cons_read_sees(rec,
+							node->read_view)) {
+			cons_read_requires_clust_rec = TRUE;
+		}
+	}
+
+	/* PHASE 4: Test search end conditions and deleted flag */
+
+	/* Fetch the columns needed in test conditions.  The record is
+	protected by a page latch that was acquired by
+	row_sel_open_pcur() or row_sel_restore_pcur_pos().  The latch
+	will not be released until mtr_commit(mtr). */
+
+	row_sel_fetch_columns(index, rec, offsets,
+			      UT_LIST_GET_FIRST(plan->columns));
+
+	/* Test the selection end conditions: these can only contain columns
+	which already are found in the index, even though the index might be
+	non-clustered */
+
+	if (plan->unique_search && cursor_just_opened) {
+
+		/* No test necessary: the test was already made above */
+
+	} else if (!row_sel_test_end_conds(plan)) {
+
+		goto table_exhausted;
+	}
+
+	if (rec_get_deleted_flag(rec, dict_table_is_comp(plan->table))
+	    && !cons_read_requires_clust_rec) {
+
+		/* The record is delete marked: we can skip it if this is
+		not a consistent read which might see an earlier version
+		of a non-clustered index record */
+
+		if (plan->unique_search) {
+
+			goto table_exhausted;
+		}
+
+		goto next_rec;
+	}
+
+	/* PHASE 5: Get the clustered index record, if needed and if we did
+	not do the search using the clustered index */
+
+	if (plan->must_get_clust || cons_read_requires_clust_rec) {
+
+		/* It was a non-clustered index and we must fetch also the
+		clustered index record */
+
+		err = row_sel_get_clust_rec(node, plan, rec, thr, &clust_rec,
+					    &mtr);
+		mtr_has_extra_clust_latch = TRUE;
+
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+
+		/* Retrieving the clustered record required a search:
+		increment the cost counter */
+
+		cost_counter++;
+
+		if (clust_rec == NULL) {
+			/* The record did not exist in the read view */
+			ut_ad(consistent_read);
+
+			goto next_rec;
+		}
+
+		if (rec_get_deleted_flag(clust_rec,
+					 dict_table_is_comp(plan->table))) {
+
+			/* The record is delete marked: we can skip it */
+
+			goto next_rec;
+		}
+
+		if (node->can_get_updated) {
+
+			btr_pcur_store_position(&(plan->clust_pcur), &mtr);
+		}
+	}
+
+	/* PHASE 6: Test the rest of search conditions */
+
+	if (!row_sel_test_other_conds(plan)) {
+
+		if (plan->unique_search) {
+
+			goto table_exhausted;
+		}
+
+		goto next_rec;
+	}
+
+	/* PHASE 7: We found a new qualifying row for the current table; push
+	the row if prefetch is on, or move to the next table in the join */
+
+	plan->n_rows_fetched++;
+
+	ut_ad(plan->pcur.latch_mode == BTR_SEARCH_LEAF);
+
+	if ((plan->n_rows_fetched <= SEL_PREFETCH_LIMIT)
+	    || plan->unique_search || plan->no_prefetch
+	    || plan->table->big_rows) {
+
+		/* No prefetch in operation: go to the next table */
+
+		goto next_table;
+	}
+
+	sel_push_prefetched_row(plan);
+
+	if (plan->n_rows_prefetched == SEL_MAX_N_PREFETCH) {
+
+		/* The prefetch buffer is now full */
+
+		sel_pop_prefetched_row(plan);
+
+		goto next_table;
+	}
+
+next_rec:
+	ut_ad(!search_latch_locked);
+
+	if (mtr_has_extra_clust_latch) {
+
+		/* We must commit &mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we would access a different clustered
+		index page right away without releasing the previous. */
+
+		goto commit_mtr_for_a_while;
+	}
+
+	if (node->asc) {
+		moved = btr_pcur_move_to_next(&(plan->pcur), &mtr);
+	} else {
+		moved = btr_pcur_move_to_prev(&(plan->pcur), &mtr);
+	}
+
+	if (!moved) {
+
+		goto table_exhausted;
+	}
+
+	cursor_just_opened = FALSE;
+
+	/* END OF RECORD LOOP
+	------------------ */
+	goto rec_loop;
+
+next_table:
+	/* We found a record which satisfies the conditions: we can move to
+	the next table or return a row in the result set */
+
+	ut_ad(btr_pcur_is_on_user_rec(&plan->pcur));
+
+	if (plan->unique_search && !node->can_get_updated) {
+
+		plan->cursor_at_end = TRUE;
+	} else {
+		ut_ad(!search_latch_locked);
+
+		plan->stored_cursor_rec_processed = TRUE;
+
+		btr_pcur_store_position(&(plan->pcur), &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	mtr_has_extra_clust_latch = FALSE;
+
+next_table_no_mtr:
+	/* If we use 'goto' to this label, it means that the row was popped
+	from the prefetched rows stack, and &mtr is already committed */
+
+	if (node->fetch_table + 1 == node->n_tables) {
+
+		sel_eval_select_list(node);
+
+		if (node->is_aggregate) {
+
+			goto table_loop;
+		}
+
+		sel_assign_into_var_values(node->into_list, node);
+
+		thr->run_node = que_node_get_parent(node);
+
+		err = DB_SUCCESS;
+		goto func_exit;
+	}
+
+	node->fetch_table++;
+
+	/* When we move to the next table, we first reset the plan cursor:
+	we do not care about resetting it when we backtrack from a table */
+
+	plan_reset_cursor(sel_node_get_nth_plan(node, node->fetch_table));
+
+	goto table_loop;
+
+table_exhausted:
+	/* The table cursor pcur reached the result set end: backtrack to the
+	previous table in the join if we do not have cached prefetched rows */
+
+	plan->cursor_at_end = TRUE;
+
+	mtr_commit(&mtr);
+
+	mtr_has_extra_clust_latch = FALSE;
+
+	if (plan->n_rows_prefetched > 0) {
+		/* The table became exhausted during a prefetch */
+
+		sel_pop_prefetched_row(plan);
+
+		goto next_table_no_mtr;
+	}
+
+table_exhausted_no_mtr:
+	if (node->fetch_table == 0) {
+		err = DB_SUCCESS;
+
+		if (node->is_aggregate && !node->aggregate_already_fetched) {
+
+			node->aggregate_already_fetched = TRUE;
+
+			sel_assign_into_var_values(node->into_list, node);
+
+			thr->run_node = que_node_get_parent(node);
+		} else {
+			node->state = SEL_NODE_NO_MORE_ROWS;
+
+			thr->run_node = que_node_get_parent(node);
+		}
+
+		goto func_exit;
+	}
+
+	node->fetch_table--;
+
+	goto table_loop;
+
+stop_for_a_while:
+	/* Return control for a while to que_run_threads, so that runaway
+	queries can be canceled. NOTE that when we come here, we must, in a
+	locking read, have placed the necessary (possibly waiting request)
+	record lock on the cursor record or its successor: when we reposition
+	the cursor, this record lock guarantees that nobody can meanwhile have
+	inserted new records which should have appeared in the result set,
+	which would result in the phantom problem. */
+
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+	err = DB_SUCCESS;
+	goto func_exit;
+
+commit_mtr_for_a_while:
+	/* Stores the cursor position and commits &mtr; this is used if
+	&mtr may contain latches which would break the latching order if
+	&mtr would not be committed and the latches released. */
+
+	plan->stored_cursor_rec_processed = TRUE;
+
+	ut_ad(!search_latch_locked);
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+	mtr_has_extra_clust_latch = FALSE;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+	goto table_loop;
+
+lock_wait_or_error:
+	/* See the note at stop_for_a_while: the same holds for this case */
+
+	ut_ad(!btr_pcur_is_before_first_on_page(&plan->pcur) || !node->asc);
+	ut_ad(!search_latch_locked);
+
+	plan->stored_cursor_rec_processed = FALSE;
+	btr_pcur_store_position(&(plan->pcur), &mtr);
+
+	mtr_commit(&mtr);
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(sync_thread_levels_empty_gen(TRUE));
+#endif /* UNIV_SYNC_DEBUG */
+
+func_exit:
+	if (search_latch_locked) {
+		rw_lock_s_unlock(&btr_search_latch);
+	}
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(err);
+}
+
+/**********************************************************************//**
+Performs a select step in an SQL execution graph: opens the node if needed
+(read view or table intention locks), then runs row_sel() on it.
+@return	thr, or NULL after storing the error code in trx->error_state */
+UNIV_INTERN
+que_thr_t*
+row_sel_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint		i_lock_mode;
+	sym_node_t*	table_node;
+	sel_node_t*	node;
+	ulint		err;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_SELECT);
+
+	/* If this is the first time this node is executed (or execution
+	resumes after a wait for a table intention lock), set intention locks
+	on the tables, or assign a read view */
+
+	if (node->into_list && (thr->prev_node == que_node_get_parent(node))) {
+		/* A node with an INTO list entered from its parent: reopen */
+		node->state = SEL_NODE_OPEN;
+	}
+
+	if (node->state == SEL_NODE_OPEN) {
+
+		/* It may be that the current session has not yet started
+		its transaction, or it has been committed: */
+
+		trx_start_if_not_started(thr_get_trx(thr));
+
+		plan_reset_cursor(sel_node_get_nth_plan(node, 0));
+
+		if (node->consistent_read) {
+			/* Assign a read view for the query */
+			node->read_view = trx_assign_read_view(
+				thr_get_trx(thr));
+		} else {
+			if (node->set_x_locks) {
+				i_lock_mode = LOCK_IX;
+			} else {
+				i_lock_mode = LOCK_IS;
+			}
+
+			table_node = node->table_list;
+
+			while (table_node) {
+				err = lock_table(0, table_node->table,
+						 i_lock_mode, thr);
+				if (err != DB_SUCCESS) {
+					thr_get_trx(thr)->error_state = err;
+
+					return(NULL);
+				}
+
+				table_node = que_node_get_next(table_node);
+			}
+		}
+
+		/* If this is an explicit cursor that has variables to copy,
+		copy the stored procedure variable values, so that the values
+		cannot change between fetches. NOTE(review): the old remark that
+		non-explicit cursors are copied too contradicts the test below */
+
+		if (node->explicit_cursor
+		    && UT_LIST_GET_FIRST(node->copy_variables)) {
+
+			row_sel_copy_input_variable_vals(node);
+		}
+
+		node->state = SEL_NODE_FETCH;
+		node->fetch_table = 0;
+
+		if (node->is_aggregate) {
+			/* Reset the aggregate total values */
+			sel_reset_aggregate_vals(node);
+		}
+	}
+
+	err = row_sel(node, thr);
+
+	/* NOTE! if queries are parallelized, the following assignment may
+	have problems; the assignment should be made only if thr is the
+	only top-level thr in the graph: */
+
+	thr->graph->last_sel_node = node;
+
+	if (err != DB_SUCCESS) {
+		thr_get_trx(thr)->error_state = err;
+
+		return(NULL);
+	}
+
+	return(thr);
+}
+
+/**********************************************************************//**
+Performs a fetch for a cursor: runs the select node, then delivers the row.
+@return	query thread to run next, or NULL if the cursor is closed */
+UNIV_INTERN
+que_thr_t*
+fetch_step(
+/*=======*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	sel_node_t*	sel_node;
+	fetch_node_t*	node;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+	sel_node = node->cursor_def;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_FETCH);
+
+	if (thr->prev_node != que_node_get_parent(node)) {
+		/* Not entered from the parent: deliver the row, if any */
+		if (sel_node->state != SEL_NODE_NO_MORE_ROWS) {
+
+			if (node->into_list) {
+				sel_assign_into_var_values(node->into_list,
+							   sel_node);
+			} else {
+				void* ret = (*node->func->func)(
+					sel_node, node->func->arg);
+				/* NULL from the callback: stop */
+				if (!ret) {
+					sel_node->state
+						= SEL_NODE_NO_MORE_ROWS;
+				}
+			}
+		}
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(thr);
+	}
+
+	/* Entered from the parent. Make the fetch node the parent of the
+	cursor definition for the time of the fetch, so that execution knows
+	to return to this fetch node after a row has been selected or we know
+	that there is no row left */
+
+	sel_node->common.parent = node;
+
+	if (sel_node->state == SEL_NODE_CLOSED) {
+		fprintf(stderr,
+			"InnoDB: Error: fetch called on a closed cursor\n");
+
+		thr_get_trx(thr)->error_state = DB_ERROR;
+
+		return(NULL);
+	}
+
+	thr->run_node = sel_node;
+
+	return(thr);
+}
+
+/****************************************************************//**
+Sample callback function for fetch: prints each select list value to stderr.
+@return	always non-NULL (an arbitrary dummy pointer value) */
+UNIV_INTERN
+void*
+row_fetch_print(
+/*============*/
+	void*	row,		/*!< in: sel_node_t* to print */
+	void*	user_arg)	/*!< in: not used */
+{
+	sel_node_t*	node = row;
+	que_node_t*	exp;
+	ulint		i = 0;
+
+	UT_NOT_USED(user_arg);
+
+	fprintf(stderr, "row_fetch_print: row %p\n", row);
+
+	exp = node->select_list;
+	/* For each select list item print its index, type and value */
+	while (exp) {
+		dfield_t*	dfield = que_node_get_val(exp);
+		const dtype_t*	type = dfield_get_type(dfield);
+
+		fprintf(stderr, " column %lu:\n", (ulong)i);
+
+		dtype_print(type);
+		putc('\n', stderr);
+
+		if (dfield_get_len(dfield) != UNIV_SQL_NULL) {
+			ut_print_buf(stderr, dfield_get_data(dfield),
+				     dfield_get_len(dfield));
+			putc('\n', stderr);
+		} else {
+			fputs(" <NULL>;\n", stderr);
+		}
+
+		exp = que_node_get_next(exp);
+		i++;
+	}
+
+	return((void*)42);
+}
+
+/***********************************************************//**
+Prints a row in a select result to stderr and requests the next row.
+@return	thr, the query thread to run next */
+UNIV_INTERN
+que_thr_t*
+row_printf_step(
+/*============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	row_printf_node_t*	node;
+	sel_node_t*		sel_node;
+	que_node_t*		arg;
+
+	ut_ad(thr);
+
+	node = thr->run_node;
+
+	sel_node = node->sel_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_ROW_PRINTF);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+
+		/* Entered from the parent: reset the cursor */
+		sel_node->state = SEL_NODE_OPEN;
+
+		/* Fetch the first row to print */
+
+		thr->run_node = sel_node;
+
+		return(thr);
+	}
+
+	if (sel_node->state != SEL_NODE_FETCH) {
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to print: return to the parent node */
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(thr);
+	}
+
+	arg = sel_node->select_list;
+	/* Print every select list value, each followed by " ::: " */
+	while (arg) {
+		dfield_print_also_hex(que_node_get_val(arg));
+
+		fputs(" ::: ", stderr);
+
+		arg = que_node_get_next(arg);
+	}
+
+	putc('\n', stderr);
+
+	/* Fetch next row to print */
+
+	thr->run_node = sel_node;
+
+	return(thr);
+}
+
+/****************************************************************//**
+Converts a key value stored in MySQL format to an Innobase dtuple and sets
+the tuple's field count to the number of fields converted. The last field of
+the key value may be just a prefix of a fixed length field: hence key_len.
+Such partial-field prefixes are not really supported: a warning is printed
+and the field length is shortened. A counterpart of this function is
+ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+UNIV_INTERN
+void
+row_sel_convert_mysql_key_to_innobase(
+/*==================================*/
+	dtuple_t*	tuple,		/*!< in/out: tuple where to build;
+					NOTE: we assume that the type info
+					in the tuple is already according
+					to index! */
+	byte*		buf,		/*!< in: buffer to use in field
+					conversions */
+	ulint		buf_len,	/*!< in: buffer length */
+	dict_index_t*	index,		/*!< in: index of the key value */
+	const byte*	key_ptr,	/*!< in: MySQL key value */
+	ulint		key_len,	/*!< in: MySQL key value length */
+	trx_t*		trx)		/*!< in: trx, for the warning */
+{
+	byte*		original_buf	= buf;
+	const byte*	original_key_ptr = key_ptr;
+	dict_field_t*	field;
+	dfield_t*	dfield;
+	ulint		data_offset;
+	ulint		data_len;
+	ulint		data_field_len;
+	ibool		is_null;
+	const byte*	key_end;
+	ulint		n_fields = 0;
+
+	/* For documentation of the key value storage format in MySQL, see
+	ha_innobase::store_key_val_for_row() in ha_innodb.cc. */
+
+	key_end = key_ptr + key_len;
+
+	/* Permit us to access any field in the tuple (ULINT_MAX): */
+
+	dtuple_set_n_fields(tuple, ULINT_MAX);
+
+	dfield = dtuple_get_nth_field(tuple, 0);
+	field = dict_index_get_nth_field(index, 0);
+
+	if (UNIV_UNLIKELY(dfield_get_type(dfield)->mtype == DATA_SYS)) {
+		/* A special case: we are looking for a position in the
+		generated clustered index which InnoDB automatically added
+		to a table with no primary key: the first and the only
+		ordering column is ROW_ID which InnoDB stored to the key_ptr
+		buffer. */
+
+		ut_a(key_len == DATA_ROW_ID_LEN);
+
+		dfield_set_data(dfield, key_ptr, DATA_ROW_ID_LEN);
+
+		dtuple_set_n_fields(tuple, 1);
+
+		return;
+	}
+
+	while (key_ptr < key_end) {
+
+		ulint	type = dfield_get_type(dfield)->mtype;
+		ut_a(field->col->mtype == type);
+
+		data_offset = 0;
+		is_null = FALSE;
+
+		if (!(dfield_get_type(dfield)->prtype & DATA_NOT_NULL)) {
+			/* The first byte in the field tells if this is
+			an SQL NULL value */
+
+			data_offset = 1;
+
+			if (*key_ptr != 0) {
+				dfield_set_null(dfield);
+
+				is_null = TRUE;
+			}
+		}
+
+		/* Calculate data length and data field total length */
+
+		if (type == DATA_BLOB) {
+			/* The key field is a column prefix of a BLOB or
+			TEXT */
+
+			ut_a(field->prefix_len > 0);
+
+			/* MySQL stores the actual data length to the first 2
+			bytes after the optional SQL NULL marker byte. The
+			storage format is little-endian, that is, the most
+			significant byte at a higher address. In UTF-8, MySQL
+			seems to reserve field->prefix_len bytes for
+			storing this field in the key value buffer, even
+			though the actual value only takes data_len bytes
+			from the start. */
+			/* NOTE(review): key_end is not checked before reading */
+			data_len = key_ptr[data_offset]
+				+ 256 * key_ptr[data_offset + 1];
+			data_field_len = data_offset + 2 + field->prefix_len;
+
+			data_offset += 2;
+
+			/* Now that we know the length, we store the column
+			value like it would be a fixed char field */
+
+		} else if (field->prefix_len > 0) {
+			/* Looks like MySQL pads unused end bytes in the
+			prefix with space. Therefore, also in UTF-8, it is ok
+			to compare with a prefix containing full prefix_len
+			bytes, and no need to take at most prefix_len / 3
+			UTF-8 characters from the start.
+			If the prefix is used as the upper end of a LIKE
+			'abc%' query, then MySQL pads the end with chars
+			0xff. TODO: in that case does it do any harm to
+			compare with the full prefix_len bytes? How do
+			characters 0xff in UTF-8 behave? */
+
+			data_len = field->prefix_len;
+			data_field_len = data_offset + data_len;
+		} else {
+			data_len = dfield_get_type(dfield)->len;
+			data_field_len = data_offset + data_len;
+		}
+
+		if (UNIV_UNLIKELY
+		    (dtype_get_mysql_type(dfield_get_type(dfield))
+		     == DATA_MYSQL_TRUE_VARCHAR)
+		    && UNIV_LIKELY(type != DATA_INT)) {
+			/* In a MySQL key value format, a true VARCHAR is
+			always preceded by 2 bytes of a length field.
+			dfield_get_type(dfield)->len returns the maximum
+			'payload' len in bytes. That does not include the
+			2 bytes that tell the actual data length.
+
+			We added the check != DATA_INT to make sure we do
+			not treat MySQL ENUM or SET as a true VARCHAR! */
+
+			data_len += 2;
+			data_field_len += 2;
+		}
+
+		/* Storing may use at most data_len bytes of buf */
+
+		if (UNIV_LIKELY(!is_null)) {
+			row_mysql_store_col_in_innobase_format(
+				dfield, buf,
+				FALSE, /* MySQL key value format col */
+				key_ptr + data_offset, data_len,
+				dict_table_is_comp(index->table));
+			buf += data_len;
+		}
+
+		key_ptr += data_field_len;
+
+		if (UNIV_UNLIKELY(key_ptr > key_end)) {
+			/* The last field in key was not a complete key field
+			but a prefix of it.
+
+			Print a warning about this! HA_READ_PREFIX_LAST does
+			not currently work in InnoDB with partial-field key
+			value prefixes. Since MySQL currently uses a padding
+			trick to calculate LIKE 'abc%' type queries there
+			should never be partial-field prefixes in searches. */
+
+			ut_print_timestamp(stderr);
+
+			fputs("  InnoDB: Warning: using a partial-field"
+			      " key prefix in search.\n"
+			      "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, index);
+			fprintf(stderr, ". Last data field length %lu bytes,\n"
+				"InnoDB: key ptr now exceeds"
+				" key end by %lu bytes.\n"
+				"InnoDB: Key value in the MySQL format:\n",
+				(ulong) data_field_len,
+				(ulong) (key_ptr - key_end));
+			fflush(stderr);
+			ut_print_buf(stderr, original_key_ptr, key_len);
+			putc('\n', stderr);
+			/* Shorten the stored field by the overshoot */
+			if (!is_null) {
+				ulint	len = dfield_get_len(dfield);
+				dfield_set_len(dfield, len
+					       - (ulint) (key_ptr - key_end));
+			}
+		}
+
+		n_fields++;
+		field++;
+		dfield++;
+	}
+	/* After-the-fact check that the conversions did not overrun buf */
+	ut_a(buf <= original_buf + buf_len);
+
+	/* We set the length of tuple to n_fields: we assume that the memory
+	area allocated for it is big enough (usually bigger than n_fields). */
+
+	dtuple_set_n_fields(tuple, n_fields);
+}
+
+/**************************************************************//**
+Copies the DATA_ROW_ID field of index_rec to prebuilt->row_id. */
+static
+void
+row_sel_store_row_id_to_prebuilt(
+/*=============================*/
+	row_prebuilt_t*		prebuilt,	/*!< in/out: prebuilt */
+	const rec_t*		index_rec,	/*!< in: record */
+	const dict_index_t*	index,		/*!< in: index of the record */
+	const ulint*		offsets)	/*!< in: rec_get_offsets
+						(index_rec, index) */
+{
+	const byte*	data;
+	ulint		len;
+
+	ut_ad(rec_offs_validate(index_rec, index, offsets));
+
+	data = rec_get_nth_field(
+		index_rec, offsets,
+		dict_index_get_sys_col_pos(index, DATA_ROW_ID), &len);
+	/* Any other length is reported on stderr and raises ut_error */
+	if (UNIV_UNLIKELY(len != DATA_ROW_ID_LEN)) {
+		fprintf(stderr,
+			"InnoDB: Error: Row id field is"
+			" wrong length %lu in ", (ulong) len);
+		dict_index_name_print(stderr, prebuilt->trx, index);
+		fprintf(stderr, "\n"
+			"InnoDB: Field number %lu, record:\n",
+			(ulong) dict_index_get_sys_col_pos(index,
+							   DATA_ROW_ID));
+		rec_print_new(stderr, index_rec, offsets);
+		putc('\n', stderr);
+		ut_error;
+	}
+
+	ut_memcpy(prebuilt->row_id, data, len);
+}
+
+/**************************************************************//**
+Stores a non-SQL-NULL field in the MySQL format. The counterpart of this
+function is row_mysql_store_col_in_innobase_format() in row0mysql.c. */
+static
+void
+row_sel_field_store_in_mysql_format(
+/*================================*/
+	byte*		dest,	/*!< in/out: buffer where to store; NOTE
+				that BLOBs are not in themselves
+				stored here: the caller must allocate
+				and copy the BLOB into buffer before,
+				and pass the pointer to the BLOB in
+				'data' */
+	const mysql_row_templ_t* templ,
+				/*!< in: MySQL column template.
+				Its following fields are referenced:
+				type, is_unsigned, mysql_col_len,
+				mysql_type, mysql_length_bytes,
+				mbminlen, mbmaxlen */
+	const byte*	data,	/*!< in: data to store */
+	ulint		len)	/*!< in: length of the data */
+{
+	byte*	ptr;
+	byte*	field_end;
+	byte*	pad_ptr;
+
+	ut_ad(len != UNIV_SQL_NULL);
+	UNIV_MEM_ASSERT_RW(data, len);
+
+	switch (templ->type) {
+	case DATA_INT:
+		/* Convert integer data from Innobase to a little-endian
+		format, sign bit restored to normal */
+
+		ptr = dest + len;
+		/* Copy the len bytes of data into dest in reverse order */
+		for (;;) {
+			ptr--;
+			*ptr = *data;
+			if (ptr == dest) {
+				break;
+			}
+			data++;
+		}
+
+		if (!templ->is_unsigned) {
+			dest[len - 1] = (byte) (dest[len - 1] ^ 128);
+		}
+
+		ut_ad(templ->mysql_col_len == len);
+		break;
+
+	case DATA_VARCHAR:
+	case DATA_VARMYSQL:
+	case DATA_BINARY:
+		field_end = dest + templ->mysql_col_len;
+
+		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+			/* This is a >= 5.0.3 type true VARCHAR. Store the
+			length of the data to the first byte or the first
+			two bytes of dest. */
+
+			dest = row_mysql_store_true_var_len(
+				dest, len, templ->mysql_length_bytes);
+		}
+
+		/* Copy the actual data */
+		ut_memcpy(dest, data, len);
+
+		/* Pad with trailing spaces. We pad with spaces also the
+		unused end of a >= 5.0.3 true VARCHAR column, just in case
+		MySQL expects its contents to be deterministic. */
+
+		pad_ptr = dest + len;
+
+		ut_ad(templ->mbminlen <= templ->mbmaxlen);
+
+		/* We handle UCS2 charset strings differently. */
+		if (templ->mbminlen == 2) {
+			/* A space char is two bytes, 0x0020 in UCS2 */
+
+			if (len & 1) {
+				/* A 0x20 has been stripped from the column.
+				Pad it back. */
+
+				if (pad_ptr < field_end) {
+					*pad_ptr = 0x20;
+					pad_ptr++;
+				}
+			}
+
+			/* Pad the rest of the string with 0x0020 */
+			/* NOTE(review): 2 bytes per pass; assumes an even gap */
+			while (pad_ptr < field_end) {
+				*pad_ptr = 0x00;
+				pad_ptr++;
+				*pad_ptr = 0x20;
+				pad_ptr++;
+			}
+		} else {
+			ut_ad(templ->mbminlen == 1);
+			/* space=0x20 */
+
+			memset(pad_ptr, 0x20, field_end - pad_ptr);
+		}
+		break;
+
+	case DATA_BLOB:
+		/* Store a pointer to the BLOB buffer to dest: the BLOB was
+		already copied to the buffer in row_sel_store_mysql_rec */
+
+		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
+					 len);
+		break;
+
+	case DATA_MYSQL:
+		memcpy(dest, data, len);
+
+		ut_ad(templ->mysql_col_len >= len);
+		ut_ad(templ->mbmaxlen >= templ->mbminlen);
+
+		ut_ad(templ->mbmaxlen > templ->mbminlen
+		      || templ->mysql_col_len == len);
+		/* The following assertion would fail for old tables
+		containing UTF-8 ENUM columns due to Bug #9526. */
+		ut_ad(!templ->mbmaxlen
+		      || !(templ->mysql_col_len % templ->mbmaxlen));
+		ut_ad(len * templ->mbmaxlen >= templ->mysql_col_len);
+
+		if (templ->mbminlen != templ->mbmaxlen) {
+			/* Pad with spaces up to mysql_col_len. This undoes
+			the stripping done in the function
+			row_mysql_store_col_in_innobase_format(). */
+
+			memset(dest + len, 0x20, templ->mysql_col_len - len);
+		}
+		break;
+
+	default:
+#ifdef UNIV_DEBUG
+	case DATA_SYS_CHILD:
+	case DATA_SYS:
+		/* These column types should never be shipped to MySQL. */
+		ut_ad(0);
+
+	case DATA_CHAR:
+	case DATA_FIXBINARY:
+	case DATA_FLOAT:
+	case DATA_DOUBLE:
+	case DATA_DECIMAL:
+		/* Above are the valid column types for MySQL data. */
+#endif /* UNIV_DEBUG */
+		ut_ad(templ->mysql_col_len == len);
+		memcpy(dest, data, len);
+	}
+}
+
+/**************************************************************//**
+Convert a row in the Innobase format to a row in the MySQL format.
+Only the template entries start_field_no <= i < end_field_no are stored;
+the template may cover only a few columns, as not all columns may be
+needed in the query. Other columns of mysql_rec are not written here.
+@return TRUE on success, FALSE if not all columns could be retrieved */
+static __attribute__((warn_unused_result))
+ibool
+row_sel_store_mysql_rec(
+/*====================*/
+	byte*		mysql_rec,	/*!< out: row in the MySQL format */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: Innobase record in the index
+					which was described in prebuilt's
+					template; must be protected by
+					a page latch */
+	const ulint*	offsets,	/*!< in: array returned by
+					rec_get_offsets() */
+	ulint		start_field_no,	/*!< in: first template index */
+	ulint		end_field_no)	/*!< in: end index (exclusive) */
+{
+	mysql_row_templ_t*	templ;
+	mem_heap_t*		extern_field_heap	= NULL;
+	mem_heap_t*		heap;
+	const byte*		data;
+	ulint			len;
+	ulint			i;
+
+	ut_ad(prebuilt->mysql_template);
+	ut_ad(prebuilt->default_rec);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	/* BLOB values of the previous row are no longer needed: free them */
+	if (UNIV_LIKELY_NULL(prebuilt->blob_heap)) {
+		mem_heap_free(prebuilt->blob_heap);
+		prebuilt->blob_heap = NULL;
+	}
+
+// psergey@askmonty.org: don't take the following:
+#if 0	
+	/* init null bytes with default values as they might be
+
+	left uninitialized in some cases and these uninited bytes
+	might be copied into mysql record buffer that leads to
+	valgrind warnings */
+	memcpy(mysql_rec, prebuilt->default_rec, prebuilt->null_bitmap_len);
+#endif
+	
+	for (i = start_field_no; i < end_field_no /* prebuilt->n_template */ ; i++) {
+
+		templ = prebuilt->mysql_template + i;
+
+		if (UNIV_UNLIKELY(rec_offs_nth_extern(offsets,
+						      templ->rec_field_no))) {
+
+			/* Copy an externally stored field to the temporary
+			heap (BLOBs go to prebuilt->blob_heap instead) */
+
+			ut_a(!prebuilt->trx->has_search_latch);
+
+			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)) {
+				if (prebuilt->blob_heap == NULL) {
+					prebuilt->blob_heap = mem_heap_create(
+						UNIV_PAGE_SIZE);
+				}
+
+				heap = prebuilt->blob_heap;
+			} else {
+				extern_field_heap
+					= mem_heap_create(UNIV_PAGE_SIZE);
+
+				heap = extern_field_heap;
+			}
+
+			/* NOTE: if we are retrieving a big BLOB, we may
+			already run out of memory in the next call, which
+			causes an assert */
+
+			data = btr_rec_copy_externally_stored_field(
+				rec, offsets,
+				dict_table_zip_size(prebuilt->table),
+				templ->rec_field_no, &len, heap);
+
+			if (UNIV_UNLIKELY(!data)) {
+				/* The externally stored field
+				was not written yet. This
+				record should only be seen by
+				recv_recovery_rollback_active()
+				or any TRX_ISO_READ_UNCOMMITTED
+				transactions. */
+
+				if (extern_field_heap) {
+					mem_heap_free(extern_field_heap);
+				}
+
+				return(FALSE);
+			}
+
+			ut_a(len != UNIV_SQL_NULL);
+		} else {
+			/* Field is stored in the row. */
+
+			data = rec_get_nth_field(rec, offsets,
+						 templ->rec_field_no, &len);
+
+			if (UNIV_UNLIKELY(templ->type == DATA_BLOB)
+			    && len != UNIV_SQL_NULL) {
+
+				/* It is a BLOB field locally stored in the
+				InnoDB record: we MUST copy its contents to
+				prebuilt->blob_heap here because later code
+				assumes all BLOB values have been copied to a
+				safe place. */
+
+				if (prebuilt->blob_heap == NULL) {
+					prebuilt->blob_heap = mem_heap_create(
+						UNIV_PAGE_SIZE);
+				}
+
+				data = memcpy(mem_heap_alloc(
+						prebuilt->blob_heap, len),
+						data, len);
+			}
+		}
+
+		if (len != UNIV_SQL_NULL) {
+			row_sel_field_store_in_mysql_format(
+				mysql_rec + templ->mysql_col_offset,
+				templ, data, len);
+
+			/* Cleanup: the temporary copy is no longer needed */
+			if (extern_field_heap) {
+				mem_heap_free(extern_field_heap);
+				extern_field_heap = NULL;
+			}
+
+			if (templ->mysql_null_bit_mask) {
+				/* It is a nullable column with a non-NULL
+				value: clear its NULL bit */
+				mysql_rec[templ->mysql_null_byte_offset]
+					&= ~(byte) templ->mysql_null_bit_mask;
+			}
+		} else {
+			/* MySQL assumes that the field for an SQL
+			NULL value is set to the default value. */
+
+			UNIV_MEM_ASSERT_RW(prebuilt->default_rec
+					   + templ->mysql_col_offset,
+					   templ->mysql_col_len);
+			mysql_rec[templ->mysql_null_byte_offset]
+				|= (byte) templ->mysql_null_bit_mask;
+			memcpy(mysql_rec + templ->mysql_col_offset,
+			       (const byte*) prebuilt->default_rec
+			       + templ->mysql_col_offset,
+			       templ->mysql_col_len);
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Builds a previous version of a clustered index record for a consistent read
+@return	DB_SUCCESS or error code */
+static
+ulint
+row_sel_build_prev_vers_for_mysql(
+/*==============================*/
+	read_view_t*	read_view,	/*!< in: read view */
+	dict_index_t*	clust_index,	/*!< in: clustered index */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: record in a clustered index */
+	ulint**		offsets,	/*!< in/out: offsets returned by
+					rec_get_offsets(rec, clust_index) */
+	mem_heap_t**	offset_heap,	/*!< in/out: memory heap from which
+					the offsets are allocated */
+	rec_t**		old_vers,	/*!< out: old version, or NULL if the
+					record does not exist in the view:
+					i.e., it was freshly inserted
+					afterwards */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	err;
+	/* Empty (or first create) the heap handed to the version builder */
+	if (prebuilt->old_vers_heap) {
+		mem_heap_empty(prebuilt->old_vers_heap);
+	} else {
+		prebuilt->old_vers_heap = mem_heap_create(200);
+	}
+
+	err = row_vers_build_for_consistent_read(
+		rec, mtr, clust_index, offsets, read_view, offset_heap,
+		prebuilt->old_vers_heap, old_vers);
+	return(err);
+}
+
+/*********************************************************************//**
+Retrieves the clustered index record corresponding to a record in a
+non-clustered index. Does the necessary locking. Used in the MySQL
+interface. Note that *out_rec can be NULL even when DB_SUCCESS is returned.
+@return	DB_SUCCESS, DB_SUCCESS_LOCKED_REC, or error code */
+static
+enum db_err
+row_sel_get_clust_rec_for_mysql(
+/*============================*/
+	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct in the handle */
+	dict_index_t*	sec_index,/*!< in: secondary index where rec resides */
+	const rec_t*	rec,	/*!< in: record in a non-clustered index; if
+				this is a locking read, then rec is not
+				allowed to be delete-marked, and that would
+				not make sense either */
+	que_thr_t*	thr,	/*!< in: query thread */
+	const rec_t**	out_rec,/*!< out: clustered record or an old version of
+				it, NULL if the old version did not exist
+				in the read view, i.e., it was a fresh
+				inserted version */
+	ulint**		offsets,/*!< in: offsets returned by
+				rec_get_offsets(rec, sec_index);
+				out: offsets returned by
+				rec_get_offsets(out_rec, clust_index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mtr_t*		mtr)	/*!< in: mtr used to get access to the
+				non-clustered record; the same mtr is used to
+				access the clustered index */
+{
+	dict_index_t*	clust_index;
+	const rec_t*	clust_rec;
+	rec_t*		old_vers;
+	enum db_err	err;
+	trx_t*		trx;
+
+	*out_rec = NULL;
+	trx = thr_get_trx(thr);
+	/* Build the clustered index search key out of the secondary rec */
+	row_build_row_ref_in_tuple(prebuilt->clust_ref, rec,
+				   sec_index, *offsets, trx);
+
+	clust_index = dict_table_get_first_index(sec_index->table);
+
+	btr_pcur_open_with_no_init(clust_index, prebuilt->clust_ref,
+				   PAGE_CUR_LE, BTR_SEARCH_LEAF,
+				   prebuilt->clust_pcur, 0, mtr);
+
+	clust_rec = btr_pcur_get_rec(prebuilt->clust_pcur);
+
+	prebuilt->clust_pcur->trx_if_known = trx;
+
+	/* Note: only if the search ends up on a non-infimum record is the
+	low_match value the real match to the search tuple */
+
+	if (!page_rec_is_user_rec(clust_rec)
+	    || btr_pcur_get_low_match(prebuilt->clust_pcur)
+	    < dict_index_get_n_unique(clust_index)) {
+
+		/* In a rare case it is possible that no clust rec is found
+		for a delete-marked secondary index record: if in row0umod.c
+		in row_undo_mod_remove_clust_low() we have already removed
+		the clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case we know that the
+		clustered index record did not exist in the read view of
+		trx. */
+
+		if (!rec_get_deleted_flag(rec,
+					  dict_table_is_comp(sec_index->table))
+		    || prebuilt->select_lock_type != LOCK_NONE) {
+			ut_print_timestamp(stderr);
+			fputs("  InnoDB: error clustered record"
+			      " for sec rec not found\n"
+			      "InnoDB: ", stderr);
+			dict_index_name_print(stderr, trx, sec_index);
+			fputs("\n"
+			      "InnoDB: sec index record ", stderr);
+			rec_print(stderr, rec, sec_index);
+			fputs("\n"
+			      "InnoDB: clust index record ", stderr);
+			rec_print(stderr, clust_rec, clust_index);
+			putc('\n', stderr);
+			trx_print(stderr, trx, 600);
+
+			fputs("\n"
+			      "InnoDB: Submit a detailed bug report"
+			      " to http://bugs.mysql.com\n", stderr);
+		}
+
+		clust_rec = NULL;
+
+		err = DB_SUCCESS;
+		goto func_exit;
+	}
+
+	*offsets = rec_get_offsets(clust_rec, clust_index, *offsets,
+				   ULINT_UNDEFINED, offset_heap);
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* Try to place a lock on the index record; we are searching
+		the clust rec with a unique condition, hence
+		we set a LOCK_REC_NOT_GAP type lock */
+
+		err = lock_clust_rec_read_check_and_lock(
+			0, btr_pcur_get_block(prebuilt->clust_pcur),
+			clust_rec, clust_index, *offsets,
+			prebuilt->select_lock_type, LOCK_REC_NOT_GAP, thr);
+		switch (err) {
+		case DB_SUCCESS:
+		case DB_SUCCESS_LOCKED_REC:
+			break;
+		default:
+			goto err_exit;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		old_vers = NULL;
+
+		/* If the isolation level allows reading of uncommitted data,
+		then we never look for an earlier version */
+
+		if (trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+		    && !lock_clust_rec_cons_read_sees(
+			    clust_rec, clust_index, *offsets,
+			    trx->read_view)) {
+
+			/* The following call returns 'offsets' associated with
+			'old_vers' */
+			err = row_sel_build_prev_vers_for_mysql(
+				trx->read_view, clust_index, prebuilt,
+				clust_rec, offsets, offset_heap, &old_vers,
+				mtr);
+
+			if (err != DB_SUCCESS || old_vers == NULL) {
+				/* Error, or no version visible in the view */
+				goto err_exit;
+			}
+
+			clust_rec = old_vers;
+		}
+
+		/* If we had to go to an earlier version of row or the
+		secondary index record is delete marked, then it may be that
+		the secondary index record corresponding to clust_rec
+		(or old_vers) is not rec; in that case we must ignore
+		such row because in our snapshot rec would not have existed.
+		Remember that from rec we cannot see directly which transaction
+		id corresponds to it: we have to go to the clustered index
+		record. A query where we want to fetch all rows where
+		the secondary index value is in some interval would return
+		a wrong result if we would not drop rows which we come to
+		visit through secondary index records that would not really
+		exist in our snapshot. */
+
+		if (clust_rec
+		    && (old_vers
+			|| trx->isolation_level <= TRX_ISO_READ_UNCOMMITTED
+			|| rec_get_deleted_flag(rec, dict_table_is_comp(
+							sec_index->table)))
+		    && !row_sel_sec_rec_is_for_clust_rec(
+			    rec, sec_index, clust_rec, clust_index)) {
+			clust_rec = NULL;
+#ifdef UNIV_SEARCH_DEBUG
+		} else {
+			ut_a(clust_rec == NULL
+			     || row_sel_sec_rec_is_for_clust_rec(
+				     rec, sec_index, clust_rec, clust_index));
+#endif
+		}
+
+		err = DB_SUCCESS;
+	}
+
+func_exit:
+	*out_rec = clust_rec;
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* We may use the cursor in update or in unlock_row():
+		store its position */
+
+		btr_pcur_store_position(prebuilt->clust_pcur, mtr);
+	}
+	/* A direct jump to err_exit leaves *out_rec NULL, as set on entry */
+err_exit:
+	return(err);
+}
+
+/********************************************************************//**
+Repositions a persistent cursor on its previously stored position and
+decides whether the record it ends up on still has to be examined.
+The record the cursor was positioned on may have been deleted in the
+meantime; then the cursor may be stepped once forwards or backwards.
+@return TRUE if the record under the cursor may still need processing
+(i.e. the caller should not go to the next record yet) */
+static
+ibool
+sel_restore_position_for_mysql(
+/*===========================*/
+	ibool*		same_user_rec,	/*!< out: TRUE if the cursor could be
+					restored on a user record with the
+					same ordering prefix in the
+					B-tree index */
+	ulint		latch_mode,	/*!< in: latch mode wanted for the
+					restoration */
+	btr_pcur_t*	pcur,		/*!< in: cursor whose position
+					has been stored */
+	ibool		moves_up,	/*!< in: TRUE if the cursor moves up
+					in the index */
+	mtr_t*		mtr)		/*!< in: mtr; CAUTION: may commit
+					mtr temporarily! */
+{
+	/* Read the stored relative position before the restore call;
+	the comparisons below use this saved value. */
+	const ulint	stored_rel_pos	= pcur->rel_pos;
+	const ibool	restored	= btr_pcur_restore_position(
+		latch_mode, pcur, mtr);
+
+	*same_user_rec = restored;
+
+	if (stored_rel_pos == BTR_PCUR_ON) {
+		/* The position was stored on a record. If the restore
+		succeeded there is nothing left to process; otherwise,
+		when moving up, step once forwards. */
+
+		if (restored) {
+
+			return(FALSE);
+		} else if (moves_up) {
+			btr_pcur_move_to_next(pcur, mtr);
+		}
+
+		return(TRUE);
+	} else if (stored_rel_pos == BTR_PCUR_AFTER
+		   || stored_rel_pos == BTR_PCUR_AFTER_LAST_IN_TREE) {
+		/* Moving down: step back once from a user record. */
+
+		if (!moves_up && btr_pcur_is_on_user_rec(pcur)) {
+			btr_pcur_move_to_prev(pcur, mtr);
+		}
+
+		return(TRUE);
+	}
+
+	ut_ad(stored_rel_pos == BTR_PCUR_BEFORE
+	      || stored_rel_pos == BTR_PCUR_BEFORE_FIRST_IN_TREE);
+
+	/* Moving up: step once forwards from a user record. */
+
+	if (moves_up && btr_pcur_is_on_user_rec(pcur)) {
+		btr_pcur_move_to_next(pcur, mtr);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Copies the row at the front of the fetch cache to a MySQL row buffer and
+removes that row from the cache. */
+UNIV_INLINE
+void
+row_sel_pop_cached_row_for_mysql(
+/*=============================*/
+	byte*		buf,		/*!< in/out: buffer that receives
+					the row */
+	row_prebuilt_t*	prebuilt)	/*!< in: prebuilt struct */
+{
+	byte*	cached_rec;
+
+	ut_ad(prebuilt->n_fetch_cached > 0);
+	ut_ad(prebuilt->mysql_prefix_len <= prebuilt->mysql_row_len);
+
+	cached_rec = prebuilt->fetch_cache[prebuilt->fetch_cache_first];
+
+	if (UNIV_LIKELY(!prebuilt->keep_other_fields_on_keyread)) {
+#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
+		UNIV_MEM_ASSERT_RW(cached_rec, prebuilt->mysql_prefix_len);
+#endif
+		ut_memcpy(buf, cached_rec, prebuilt->mysql_prefix_len);
+	} else {
+		/* Copy the cached row field by field, so that the fields
+		of buf that are not covered by the current key are not
+		touched. */
+		const mysql_row_templ_t*	templ;
+		const mysql_row_templ_t*	templ_end;
+
+		templ_end = prebuilt->mysql_template + prebuilt->n_template;
+
+		for (templ = prebuilt->mysql_template;
+		     templ < templ_end; templ++) {
+			const ulint	col_offs = templ->mysql_col_offset;
+			const ulint	null_mask = templ->mysql_null_bit_mask;
+#if 0 /* Some of the cached_rec may legitimately be uninitialized. */
+			UNIV_MEM_ASSERT_RW(cached_rec + col_offs,
+					   templ->mysql_col_len);
+#endif
+			ut_memcpy(buf + col_offs, cached_rec + col_offs,
+				  templ->mysql_col_len);
+
+			if (null_mask != 0) {
+				/* Copy the NULL bit of the current field
+				from cached_rec to buf. */
+				const ulint	null_offs
+					= templ->mysql_null_byte_offset;
+
+				buf[null_offs] = (byte) ((buf[null_offs] & ~null_mask)
+					| (cached_rec[null_offs] & null_mask));
+			}
+		}
+	}
+
+	/* Advance the front of the cache; once the cache has become
+	empty, start again from slot 0. */
+	prebuilt->fetch_cache_first++;
+
+	if (--prebuilt->n_fetch_cached == 0) {
+		prebuilt->fetch_cache_first = 0;
+	}
+}
+
+/********************************************************************//**
+Pushes a row for MySQL to the fetch cache. The record is stored with
+row_sel_store_mysql_rec() into the cache slot at index n_fetch_cached;
+the cache buffers are allocated here while fetch_cache[0] is still NULL.
+@return TRUE on success, FALSE if the record contains incomplete BLOBs */
+UNIV_INLINE __attribute__((warn_unused_result))
+ibool
+row_sel_push_cache_row_for_mysql(
+/*=============================*/
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct */
+	const rec_t*	rec,		/*!< in: record to push; must
+					be protected by a page latch */
+	const ulint*	offsets,	/*!< in: rec_get_offsets() */
+	ulint		start_field_no,	/*!< in: start from this field */
+	byte*		remainder_buf)	/*!< in: if start_field_no != 0,
+					row buffer from which the first
+					start_field_no fields are copied */
+{
+	byte*	slot;
+	ulint	i;
+
+	ut_ad(prebuilt->n_fetch_cached < MYSQL_FETCH_CACHE_SIZE);
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+	ut_a(!prebuilt->templ_contains_blob);
+
+	if (prebuilt->fetch_cache[0] == NULL) {
+		/* The fetch cache has not been allocated yet. A user has
+		reported memory corruption in these buffers in Linux:
+		each row buffer is therefore framed by a 4-byte magic
+		number on both sides to help to track a possible bug. */
+
+		for (i = 0; i < MYSQL_FETCH_CACHE_SIZE; i++) {
+			byte*	raw = mem_alloc(prebuilt->mysql_row_len + 8);
+
+			mach_write_to_4(raw, ROW_PREBUILT_FETCH_MAGIC_N);
+			mach_write_to_4(raw + 4 + prebuilt->mysql_row_len,
+					ROW_PREBUILT_FETCH_MAGIC_N);
+
+			/* The row itself starts after the leading
+			magic number. */
+			prebuilt->fetch_cache[i] = raw + 4;
+		}
+	}
+
+	ut_ad(prebuilt->fetch_cache_first == 0);
+
+	/* Target slot of this push. */
+	slot = prebuilt->fetch_cache[prebuilt->n_fetch_cached];
+
+	UNIV_MEM_INVALID(slot, prebuilt->mysql_row_len);
+
+	if (UNIV_UNLIKELY(!row_sel_store_mysql_rec(slot, prebuilt, rec,
+						   offsets, start_field_no,
+						   prebuilt->n_template))) {
+
+		return(FALSE);
+	}
+
+	/* The first start_field_no template fields are taken from
+	remainder_buf: the NULL bit of each field (when the field has
+	one) and then the column bytes. */
+
+	for (i = 0; i < start_field_no; i++) {
+		const mysql_row_templ_t*	templ
+			= prebuilt->mysql_template + i;
+		const ulint	col_offs = templ->mysql_col_offset;
+
+		if (templ->mysql_null_bit_mask != 0) {
+			const ulint	null_offs
+				= templ->mysql_null_byte_offset;
+			const ulint	null_mask
+				= templ->mysql_null_bit_mask;
+
+			slot[null_offs] = (byte)
+				((slot[null_offs] & ~null_mask)
+				 | (remainder_buf[null_offs] & null_mask));
+		}
+
+		memcpy(slot + col_offs, remainder_buf + col_offs,
+		       templ->mysql_col_len);
+	}
+
+	prebuilt->n_fetch_cached++;
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Tries to do a shortcut to fetch a clustered index record with a unique key,
+using the hash index if possible (not always). The function assumes that
+the search mode is PAGE_CUR_GE, that this is a consistent read, that trx
+has a read view, and that btr search latch has been locked in S-mode.
+@return	SEL_FOUND, SEL_EXHAUSTED, SEL_RETRY */
+static
+ulint
+row_sel_try_search_shortcut_for_mysql(
+/*==================================*/
+	const rec_t**	out_rec,/*!< out: record if found */
+	row_prebuilt_t*	prebuilt,/*!< in: prebuilt struct */
+	ulint**		offsets,/*!< in/out: for rec_get_offsets(*out_rec) */
+	mem_heap_t**	heap,	/*!< in/out: heap for rec_get_offsets() */
+	mtr_t*		mtr)	/*!< in: started mtr */
+{
+	dict_index_t* const	clust_index	= prebuilt->index;
+	btr_pcur_t* const	cursor		= prebuilt->pcur;
+	const rec_t*		user_rec;
+	/* RW_S_LATCH, or 0 when UNIV_SEARCH_DEBUG is defined. */
+#ifdef UNIV_SEARCH_DEBUG
+	const ulint		search_latch_arg	= 0;
+#else /* UNIV_SEARCH_DEBUG */
+	const ulint		search_latch_arg	= RW_S_LATCH;
+#endif /* UNIV_SEARCH_DEBUG */
+
+	ut_ad(dict_index_is_clust(clust_index));
+	ut_ad(!prebuilt->templ_contains_blob);
+
+	btr_pcur_open_with_no_init(clust_index, prebuilt->search_tuple,
+				   PAGE_CUR_GE, BTR_SEARCH_LEAF, cursor,
+				   search_latch_arg, mtr);
+
+	user_rec = btr_pcur_get_rec(cursor);
+
+	if (!page_rec_is_user_rec(user_rec)) {
+
+		return(SEL_RETRY);
+	}
+
+	/* The cursor is now on a user record after a PAGE_CUR_GE search, so
+	its up_match field tells how many fields of the record matched the
+	search tuple. If not all fields of the tuple matched, stop. */
+
+	if (btr_pcur_get_up_match(cursor)
+	    < dtuple_get_n_fields(prebuilt->search_tuple)) {
+
+		return(SEL_EXHAUSTED);
+	}
+
+	*offsets = rec_get_offsets(user_rec, clust_index, *offsets,
+				   ULINT_UNDEFINED, heap);
+
+	/* This is a non-locking consistent read. No previous version is
+	built here: if the read view does not see the record, SEL_RETRY
+	is returned. */
+
+	if (!lock_clust_rec_cons_read_sees(user_rec, clust_index, *offsets,
+					   prebuilt->trx->read_view)) {
+
+		return(SEL_RETRY);
+	}
+
+	if (rec_get_deleted_flag(user_rec,
+				 dict_table_is_comp(clust_index->table))) {
+
+		return(SEL_EXHAUSTED);
+	}
+
+	*out_rec = user_rec;
+
+	return(SEL_FOUND);
+}
+
+/********************************************************************//**
+Searches for rows in the database. This is used in the interface to
+MySQL. This function opens a cursor, and also implements fetch next
+and fetch prev. NOTE that if we do a search with a full key value
+from a unique index (ROW_SEL_EXACT), then we will not store the cursor
+position and fetch next or fetch prev must not be tried to the cursor!
+@return DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_END_OF_INDEX, DB_DEADLOCK,
+DB_LOCK_TABLE_FULL, DB_CORRUPTION, or DB_TOO_BIG_RECORD */
+UNIV_INTERN
+ulint
+row_search_for_mysql(
+/*=================*/
+	byte*		buf,		/*!< in/out: buffer for the fetched
+					row in the MySQL format */
+	ulint		mode,		/*!< in: search mode PAGE_CUR_L, ... */
+	row_prebuilt_t*	prebuilt,	/*!< in: prebuilt struct for the
+					table handle; this contains the info
+					of search_tuple, index; if search
+					tuple contains 0 fields then we
+					position the cursor at the start or
+					the end of the index, depending on
+					'mode' */
+	ulint		match_mode,	/*!< in: 0 or ROW_SEL_EXACT or
+					ROW_SEL_EXACT_PREFIX */
+	ulint		direction)	/*!< in: 0 or ROW_SEL_NEXT or
+					ROW_SEL_PREV; NOTE: if this is != 0,
+					then prebuilt must have a pcur
+					with stored position! In opening of a
+					cursor 'direction' should be 0. */
+{
+	dict_index_t*	index		= prebuilt->index;
+	ibool		comp		= dict_table_is_comp(index->table);
+	const dtuple_t*	search_tuple	= prebuilt->search_tuple;
+	btr_pcur_t*	pcur		= prebuilt->pcur;
+	trx_t*		trx		= prebuilt->trx;
+	dict_index_t*	clust_index;
+	que_thr_t*	thr;
+	const rec_t*	rec;
+	const rec_t*	result_rec;
+	const rec_t*	clust_rec;
+	ulint		err				= DB_SUCCESS;
+	ibool		unique_search			= FALSE;
+	ibool		unique_search_from_clust_index	= FALSE;
+	ibool		mtr_has_extra_clust_latch	= FALSE;
+	ibool		moves_up			= FALSE;
+	ibool		set_also_gap_locks		= TRUE;
+	/* if the query is a plain locking SELECT, and the isolation level
+	is <= TRX_ISO_READ_COMMITTED, then this is set to FALSE */
+	ibool		did_semi_consistent_read	= FALSE;
+	/* if the returned record was locked and we did a semi-consistent
+	read (fetch the newest committed version), then this is set to
+	TRUE */
+#ifdef UNIV_SEARCH_DEBUG
+	ulint		cnt				= 0;
+#endif /* UNIV_SEARCH_DEBUG */
+	ulint		next_offs;
+	ibool		same_user_rec;
+	mtr_t		mtr;
+	mem_heap_t*	heap				= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets				= offsets_;
+	ibool		some_fields_in_buffer;
+	ibool		problematic_use = FALSE;
+	ibool		get_clust_rec			= 0;
+
+	rec_offs_init(offsets_);
+
+	ut_ad(index && pcur && search_tuple);
+	ut_ad(trx->mysql_thread_id == os_thread_get_curr_id());
+
+	if (UNIV_UNLIKELY(prebuilt->table->ibd_file_missing)) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: Error:\n"
+			"InnoDB: MySQL is trying to use a table handle"
+			" but the .ibd file for\n"
+			"InnoDB: table %s does not exist.\n"
+			"InnoDB: Have you deleted the .ibd file"
+			" from the database directory under\n"
+			"InnoDB: the MySQL datadir, or have you used"
+			" DISCARD TABLESPACE?\n"
+			"InnoDB: Look from\n"
+			"InnoDB: " REFMAN "innodb-troubleshooting.html\n"
+			"InnoDB: how you can resolve the problem.\n",
+			prebuilt->table->name);
+
+		return(DB_ERROR);
+	}
+
+	if (UNIV_UNLIKELY(!prebuilt->index_usable)) {
+
+		return(DB_MISSING_HISTORY);
+	}
+
+	if (UNIV_UNLIKELY(prebuilt->magic_n != ROW_PREBUILT_ALLOCATED)) {
+		fprintf(stderr,
+			"InnoDB: Error: trying to free a corrupt\n"
+			"InnoDB: table handle. Magic n %lu, table name ",
+			(ulong) prebuilt->magic_n);
+		ut_print_name(stderr, trx, TRUE, prebuilt->table->name);
+		putc('\n', stderr);
+
+		mem_analyze_corruption(prebuilt);
+
+		ut_error;
+	}
+
+#if 0
+	/* August 19, 2005 by Heikki: temporarily disable this error
+	print until the cursor lock count is done correctly.
+	See bugs #12263 and #12456!*/
+
+	if (trx->n_mysql_tables_in_use == 0
+	    && UNIV_UNLIKELY(prebuilt->select_lock_type == LOCK_NONE)) {
+		/* Note that if MySQL uses an InnoDB temp table that it
+		created inside LOCK TABLES, then n_mysql_tables_in_use can
+		be zero; in that case select_lock_type is set to LOCK_X in
+		::start_stmt. */
+
+		fputs("InnoDB: Error: MySQL is trying to perform a SELECT\n"
+		      "InnoDB: but it has not locked"
+		      " any tables in ::external_lock()!\n",
+		      stderr);
+		trx_print(stderr, trx, 600);
+		fputc('\n', stderr);
+	}
+#endif
+
+#if 0
+	fprintf(stderr, "Match mode %lu\n search tuple ",
+		(ulong) match_mode);
+	dtuple_print(search_tuple);
+	fprintf(stderr, "N tables locked %lu\n",
+		(ulong) trx->mysql_n_tables_locked);
+#endif
+	/*-------------------------------------------------------------*/
+	/* PHASE 0: Release a possible s-latch we are holding on the
+	adaptive hash index latch if there is someone waiting behind */
+
+	if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
+	    && trx->has_search_latch) {
+
+		/* There is an x-latch request on the adaptive hash index:
+		release the s-latch to reduce starvation and wait for
+		BTR_SEA_TIMEOUT rounds before trying to keep it again over
+		calls from MySQL */
+
+		rw_lock_s_unlock(&btr_search_latch);
+		trx->has_search_latch = FALSE;
+
+		trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+	}
+
+	/* Reset the new record lock info if srv_locks_unsafe_for_binlog
+	is set or session is using a READ COMMITTED isolation level. Then
+	we are able to remove the record locks set here on an individual
+	row. */
+	prebuilt->new_rec_locks = 0;
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 1: Try to pop the row from the prefetch cache */
+
+	if (UNIV_UNLIKELY(direction == 0)) {
+		trx->op_info = "starting index read";
+
+		prebuilt->n_rows_fetched = 0;
+		prebuilt->n_fetch_cached = 0;
+		prebuilt->fetch_cache_first = 0;
+
+		if (prebuilt->sel_graph == NULL) {
+			/* Build a dummy select query graph */
+			row_prebuild_sel_graph(prebuilt);
+		}
+	} else {
+		trx->op_info = "fetching rows";
+
+		if (prebuilt->n_rows_fetched == 0) {
+			prebuilt->fetch_direction = direction;
+		}
+
+		if (UNIV_UNLIKELY(direction != prebuilt->fetch_direction)) {
+			if (UNIV_UNLIKELY(prebuilt->n_fetch_cached > 0)) {
+				ut_error;
+				/* TODO: scrollable cursor: restore cursor to
+				the place of the latest returned row,
+				or better: prevent caching for a scroll
+				cursor! */
+			}
+
+			prebuilt->n_rows_fetched = 0;
+			prebuilt->n_fetch_cached = 0;
+			prebuilt->fetch_cache_first = 0;
+
+		} else if (UNIV_LIKELY(prebuilt->n_fetch_cached > 0)) {
+			row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+			prebuilt->n_rows_fetched++;
+
+			srv_n_rows_read++;
+			err = DB_SUCCESS;
+			goto func_exit;
+		}
+
+		if (prebuilt->fetch_cache_first > 0
+		    && prebuilt->fetch_cache_first < MYSQL_FETCH_CACHE_SIZE) {
+
+			/* The previous returned row was popped from the fetch
+			cache, but the cache was not full at the time of the
+			popping: no more rows can exist in the result set */
+
+			err = DB_RECORD_NOT_FOUND;
+			goto func_exit;
+		}
+
+		prebuilt->n_rows_fetched++;
+
+		if (prebuilt->n_rows_fetched > 1000000000) {
+			/* Prevent wrap-over */
+			prebuilt->n_rows_fetched = 500000000;
+		}
+
+		mode = pcur->search_mode;
+	}
+
+	/* In a search where at most one record in the index may match, we
+	can use a LOCK_REC_NOT_GAP type record lock when locking a
+	non-delete-marked matching record.
+
+	Note that in a unique secondary index there may be different
+	delete-marked versions of a record where only the primary key
+	values differ: thus in a secondary index we must use next-key
+	locks when locking delete-marked records. */
+
+	if (match_mode == ROW_SEL_EXACT
+	    && dict_index_is_unique(index)
+	    && dtuple_get_n_fields(search_tuple)
+	    == dict_index_get_n_unique(index)
+	    && (dict_index_is_clust(index)
+		|| !dtuple_contains_null(search_tuple))) {
+
+		/* Note above that a UNIQUE secondary index can contain many
+		rows with the same key value if one of the columns is the SQL
+		null. A clustered index under MySQL can never contain null
+		columns because we demand that all the columns in primary key
+		are non-null. */
+
+		unique_search = TRUE;
+
+		/* Even if the condition is unique, MySQL seems to try to
+		retrieve also a second row if a primary key contains more than
+		1 column. Return immediately if this is not a HANDLER
+		command. */
+
+		if (UNIV_UNLIKELY(direction != 0
+				  && !prebuilt->used_in_HANDLER)) {
+
+			err = DB_RECORD_NOT_FOUND;
+			goto func_exit;
+		}
+	}
+
+	mtr_start(&mtr);
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 2: Try fast adaptive hash index search if possible */
+
+	/* Next test if this is the special case where we can use the fast
+	adaptive hash index to try the search. Since we must release the
+	search system latch when we retrieve an externally stored field, we
+	cannot use the adaptive hash index in a search in the case the row
+	may be long and there may be externally stored fields */
+
+	if (UNIV_UNLIKELY(direction == 0)
+	    && unique_search
+	    && dict_index_is_clust(index)
+	    && !prebuilt->templ_contains_blob
+	    && !prebuilt->used_in_HANDLER
+	    && (prebuilt->mysql_row_len < UNIV_PAGE_SIZE / 8)) {
+
+		mode = PAGE_CUR_GE;
+
+		unique_search_from_clust_index = TRUE;
+
+		if (trx->mysql_n_tables_locked == 0
+		    && prebuilt->select_lock_type == LOCK_NONE
+		    && trx->isolation_level > TRX_ISO_READ_UNCOMMITTED
+		    && trx->read_view) {
+
+			/* This is a SELECT query done as a consistent read,
+			and the read view has already been allocated:
+			let us try a search shortcut through the hash
+			index.
+			NOTE that we must also test that
+			mysql_n_tables_locked == 0, because this might
+			also be INSERT INTO ... SELECT ... or
+			CREATE TABLE ... SELECT ... . Our algorithm is
+			NOT prepared to inserts interleaved with the SELECT,
+			and if we try that, we can deadlock on the adaptive
+			hash index semaphore! */
+
+#ifndef UNIV_SEARCH_DEBUG
+			if (!trx->has_search_latch) {
+				rw_lock_s_lock(&btr_search_latch);
+				trx->has_search_latch = TRUE;
+			}
+#endif
+			switch (row_sel_try_search_shortcut_for_mysql(
+					&rec, prebuilt, &offsets, &heap,
+					&mtr)) {
+			case SEL_FOUND:
+#ifdef UNIV_SEARCH_DEBUG
+				ut_a(0 == cmp_dtuple_rec(search_tuple,
+							 rec, offsets));
+#endif
+				/* At this point, rec is protected by
+				a page latch that was acquired by
+				row_sel_try_search_shortcut_for_mysql().
+				The latch will not be released until
+				mtr_commit(&mtr). */
+				ut_ad(!rec_get_deleted_flag(rec, comp));
+
+				if (!row_sel_store_mysql_rec(buf, prebuilt,
+						rec, offsets, 0,
+						prebuilt->n_template)) {
+					/* Only fresh inserts may contain
+					incomplete externally stored
+					columns. Pretend that such
+					records do not exist. Such
+					records may only be accessed
+					at the READ UNCOMMITTED
+					isolation level or when
+					rolling back a recovered
+					transaction. Rollback happens
+					at a lower level, not here. */
+					ut_a(trx->isolation_level
+					     == TRX_ISO_READ_UNCOMMITTED);
+
+					/* Proceed as in case SEL_RETRY. */
+					break;
+				}
+
+				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" shortcut\n", stderr); */
+
+				srv_n_rows_read++;
+
+				err = DB_SUCCESS;
+				goto release_search_latch_if_needed;
+
+			case SEL_EXHAUSTED:
+				mtr_commit(&mtr);
+
+				/* ut_print_name(stderr, index->name);
+				fputs(" record not found 2\n", stderr); */
+
+				err = DB_RECORD_NOT_FOUND;
+release_search_latch_if_needed:
+				if (trx->search_latch_timeout > 0
+				    && trx->has_search_latch) {
+
+					trx->search_latch_timeout--;
+
+					rw_lock_s_unlock(&btr_search_latch);
+					trx->has_search_latch = FALSE;
+				}
+
+				/* NOTE that we do NOT store the cursor
+				position */
+				goto func_exit;
+
+			case SEL_RETRY:
+				break;
+
+			default:
+				ut_ad(0);
+			}
+
+			mtr_commit(&mtr);
+			mtr_start(&mtr);
+		}
+	}
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 3: Open or restore index cursor position */
+
+	if (trx->has_search_latch) {
+		rw_lock_s_unlock(&btr_search_latch);
+		trx->has_search_latch = FALSE;
+	}
+
+	ut_ad(prebuilt->sql_stat_start || trx->conc_state == TRX_ACTIVE);
+	ut_ad(trx->conc_state == TRX_NOT_STARTED
+	      || trx->conc_state == TRX_ACTIVE);
+	ut_ad(prebuilt->sql_stat_start
+	      || prebuilt->select_lock_type != LOCK_NONE
+	      || trx->read_view);
+
+	trx_start_if_not_started(trx);
+
+	if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+	    && prebuilt->select_lock_type != LOCK_NONE
+	    && trx->mysql_thd != NULL
+	    && thd_is_select(trx->mysql_thd)) {
+		/* It is a plain locking SELECT and the isolation
+		level is low: do not lock gaps */
+
+		set_also_gap_locks = FALSE;
+	}
+
+	/* Note that if the search mode was GE or G, then the cursor
+	naturally moves upward (in fetch next) in alphabetical order,
+	otherwise downward */
+
+	if (UNIV_UNLIKELY(direction == 0)) {
+		if (mode == PAGE_CUR_GE || mode == PAGE_CUR_G) {
+			moves_up = TRUE;
+		}
+	} else if (direction == ROW_SEL_NEXT) {
+		moves_up = TRUE;
+	}
+
+	thr = que_fork_get_first_thr(prebuilt->sel_graph);
+
+	que_thr_move_to_run_state_for_mysql(thr, trx);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	if (UNIV_LIKELY(direction != 0)) {
+		ibool	need_to_process = sel_restore_position_for_mysql(
+			&same_user_rec, BTR_SEARCH_LEAF,
+			pcur, moves_up, &mtr);
+
+		if (UNIV_UNLIKELY(need_to_process)) {
+			if (UNIV_UNLIKELY(prebuilt->row_read_type
+					  == ROW_READ_DID_SEMI_CONSISTENT)) {
+				/* We did a semi-consistent read,
+				but the record was removed in
+				the meantime. */
+				prebuilt->row_read_type
+					= ROW_READ_TRY_SEMI_CONSISTENT;
+			}
+		} else if (UNIV_LIKELY(prebuilt->row_read_type
+				       != ROW_READ_DID_SEMI_CONSISTENT)) {
+
+			/* The cursor was positioned on the record
+			that we returned previously.  If we need
+			to repeat a semi-consistent read as a
+			pessimistic locking read, the record
+			cannot be skipped. */
+
+			goto next_rec;
+		}
+
+	} else if (dtuple_get_n_fields(search_tuple) > 0) {
+
+		btr_pcur_open_with_no_init(index, search_tuple, mode,
+					   BTR_SEARCH_LEAF,
+					   pcur, 0, &mtr);
+
+		pcur->trx_if_known = trx;
+
+		rec = btr_pcur_get_rec(pcur);
+
+		if (!moves_up
+		    && !page_rec_is_supremum(rec)
+		    && set_also_gap_locks
+		    && !(srv_locks_unsafe_for_binlog
+			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+		    && prebuilt->select_lock_type != LOCK_NONE) {
+
+			/* Try to place a gap lock on the next index record
+			to prevent phantoms in ORDER BY ... DESC queries */
+			const rec_t*	next = page_rec_get_next_const(rec);
+
+			offsets = rec_get_offsets(next, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+					       next, index, offsets,
+					       prebuilt->select_lock_type,
+					       LOCK_GAP, thr);
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+				err = DB_SUCCESS;
+			case DB_SUCCESS:
+				break;
+			default:
+				goto lock_wait_or_error;
+			}
+		}
+	} else {
+		if (mode == PAGE_CUR_G) {
+			btr_pcur_open_at_index_side(
+				TRUE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+				&mtr);
+		} else if (mode == PAGE_CUR_L) {
+			btr_pcur_open_at_index_side(
+				FALSE, index, BTR_SEARCH_LEAF, pcur, FALSE,
+				&mtr);
+		}
+	}
+
+	if (!prebuilt->mysql_has_locked) {
+		fprintf(stderr, "InnoDB: Error: row_search_for_mysql() is called without ha_innobase::external_lock()\n");
+		if (trx->mysql_thd != NULL) {
+			innobase_mysql_print_thd(stderr, trx->mysql_thd, 600);
+		}
+		problematic_use = TRUE;
+	}
+retry_check:
+	
+	if (!prebuilt->sql_stat_start) {
+		/* No need to set an intention lock or assign a read view */
+
+		if (trx->read_view == NULL
+		    && prebuilt->select_lock_type == LOCK_NONE) {
+
+			fputs("InnoDB: Error: MySQL is trying to"
+			      " perform a consistent read\n"
+			      "InnoDB: but the read view is not assigned!\n",
+			      stderr);
+			if (problematic_use) {
+				fprintf(stderr, "InnoDB: It may be caused by calling "
+						"without ha_innobase::external_lock()\n"
+						"InnoDB: For the first-aid, avoiding the crash. "
+						"But it should be fixed ASAP.\n");
+				prebuilt->sql_stat_start = TRUE;
+				goto retry_check;
+			}
+			trx_print(stderr, trx, 600);
+			fputc('\n', stderr);
+			ut_a(0);
+		}
+	} else if (prebuilt->select_lock_type == LOCK_NONE) {
+		/* This is a consistent read */
+		/* Assign a read view for the query */
+
+		trx_assign_read_view(trx);
+		prebuilt->sql_stat_start = FALSE;
+	} else {
+		ulint	lock_mode;
+		if (prebuilt->select_lock_type == LOCK_S) {
+			lock_mode = LOCK_IS;
+		} else {
+			lock_mode = LOCK_IX;
+		}
+		err = lock_table(0, index->table, lock_mode, thr);
+
+		if (err != DB_SUCCESS) {
+
+			goto lock_wait_or_error;
+		}
+		prebuilt->sql_stat_start = FALSE;
+	}
+
+rec_loop:
+	/*-------------------------------------------------------------*/
+	/* PHASE 4: Look for matching records in a loop */
+
+	rec = btr_pcur_get_rec(pcur);
+
+	if (srv_pass_corrupt_table && !rec) {
+		err = DB_CORRUPTION;
+		goto lock_wait_or_error;
+	}
+	ut_a(rec);
+
+	ut_ad(!!page_rec_is_comp(rec) == comp);
+#ifdef UNIV_SEARCH_DEBUG
+	/*
+	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ; Page no %lu\n", cnt,
+	page_get_page_no(page_align(rec)));
+	rec_print(rec);
+	*/
+#endif /* UNIV_SEARCH_DEBUG */
+
+	if (page_rec_is_infimum(rec)) {
+
+		/* The infimum record on a page cannot be in the result set,
+		and neither can a record lock be placed on it: we skip such
+		a record. */
+
+		goto next_rec;
+	}
+
+	if (page_rec_is_supremum(rec)) {
+
+		if (set_also_gap_locks
+		    && !(srv_locks_unsafe_for_binlog
+			 || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+		    && prebuilt->select_lock_type != LOCK_NONE) {
+
+			/* Try to place a lock on the index record */
+
+			/* If innodb_locks_unsafe_for_binlog option is used
+			or this session is using a READ COMMITTED isolation
+			level we do not lock gaps. Supremum record is really
+			a gap and therefore we do not set locks there. */
+
+			offsets = rec_get_offsets(rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+					       rec, index, offsets,
+					       prebuilt->select_lock_type,
+					       LOCK_ORDINARY, thr);
+
+			switch (err) {
+			case DB_SUCCESS_LOCKED_REC:
+				err = DB_SUCCESS;
+			case DB_SUCCESS:
+				break;
+			default:
+				goto lock_wait_or_error;
+			}
+		}
+		/* A page supremum record cannot be in the result set: skip
+		it now that we have placed a possible lock on it */
+
+		goto next_rec;
+	}
+
+	/*-------------------------------------------------------------*/
+	/* Do sanity checks in case our cursor has bumped into page
+	corruption */
+
+	if (comp) {
+		next_offs = rec_get_next_offs(rec, TRUE);
+		if (UNIV_UNLIKELY(next_offs < PAGE_NEW_SUPREMUM)) {
+
+			goto wrong_offs;
+		}
+	} else {
+		next_offs = rec_get_next_offs(rec, FALSE);
+		if (UNIV_UNLIKELY(next_offs < PAGE_OLD_SUPREMUM)) {
+
+			goto wrong_offs;
+		}
+	}
+
+	if (UNIV_UNLIKELY(next_offs >= UNIV_PAGE_SIZE - PAGE_DIR)) {
+
+wrong_offs:
+		if (srv_force_recovery == 0 || moves_up == FALSE) {
+			ut_print_timestamp(stderr);
+			buf_page_print(page_align(rec), 0);
+			fprintf(stderr,
+				"\nInnoDB: rec address %p,"
+				" buf block fix count %lu\n",
+				(void*) rec, (ulong)
+				btr_cur_get_block(btr_pcur_get_btr_cur(pcur))
+				->page.buf_fix_count);
+			fprintf(stderr,
+				"InnoDB: Index corruption: rec offs %lu"
+				" next offs %lu, page no %lu,\n"
+				"InnoDB: ",
+				(ulong) page_offset(rec),
+				(ulong) next_offs,
+				(ulong) page_get_page_no(page_align(rec)));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". Run CHECK TABLE. You may need to\n"
+			      "InnoDB: restore from a backup, or"
+			      " dump + drop + reimport the table.\n",
+			      stderr);
+
+			err = DB_CORRUPTION;
+
+			goto lock_wait_or_error;
+		} else {
+			/* The user may be dumping a corrupt table. Jump
+			over the corruption to recover as much as possible. */
+
+			fprintf(stderr,
+				"InnoDB: Index corruption: rec offs %lu"
+				" next offs %lu, page no %lu,\n"
+				"InnoDB: ",
+				(ulong) page_offset(rec),
+				(ulong) next_offs,
+				(ulong) page_get_page_no(page_align(rec)));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". We try to skip the rest of the page.\n",
+			      stderr);
+
+			btr_pcur_move_to_last_on_page(pcur, &mtr);
+
+			goto next_rec;
+		}
+	}
+	/*-------------------------------------------------------------*/
+
+	/* Calculate the 'offsets' associated with 'rec' */
+
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	if (UNIV_UNLIKELY(srv_force_recovery > 0)) {
+		if (!rec_validate(rec, offsets)
+		    || !btr_index_rec_validate(rec, index, FALSE)) {
+			fprintf(stderr,
+				"InnoDB: Index corruption: rec offs %lu"
+				" next offs %lu, page no %lu,\n"
+				"InnoDB: ",
+				(ulong) page_offset(rec),
+				(ulong) next_offs,
+				(ulong) page_get_page_no(page_align(rec)));
+			dict_index_name_print(stderr, trx, index);
+			fputs(". We try to skip the record.\n",
+			      stderr);
+
+			goto next_rec;
+		}
+	}
+
+	/* Note that we cannot trust the up_match value in the cursor at this
+	place because we can arrive here after moving the cursor! Thus
+	we have to recompare rec and search_tuple to determine if they
+	match enough. */
+
+	if (match_mode == ROW_SEL_EXACT) {
+		/* Test if the index record matches completely to search_tuple
+		in prebuilt: if not, then we return with DB_RECORD_NOT_FOUND */
+
+		/* fputs("Comparing rec and search tuple\n", stderr); */
+
+		if (0 != cmp_dtuple_rec(search_tuple, rec, offsets)) {
+
+			if (set_also_gap_locks
+			    && !(srv_locks_unsafe_for_binlog
+				 || trx->isolation_level
+				 <= TRX_ISO_READ_COMMITTED)
+			    && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set or this session is not
+				using a READ COMMITTED isolation level. */
+
+				err = sel_set_rec_lock(
+					btr_pcur_get_block(pcur),
+					rec, index, offsets,
+					prebuilt->select_lock_type, LOCK_GAP,
+					thr);
+
+				switch (err) {
+				case DB_SUCCESS_LOCKED_REC:
+				case DB_SUCCESS:
+					break;
+				default:
+					goto lock_wait_or_error;
+				}
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 3\n", stderr); */
+
+			goto normal_return;
+		}
+
+	} else if (match_mode == ROW_SEL_EXACT_PREFIX) {
+
+		if (!cmp_dtuple_is_prefix_of_rec(search_tuple, rec, offsets)) {
+
+			if (set_also_gap_locks
+			    && !(srv_locks_unsafe_for_binlog
+				 || trx->isolation_level
+				 <= TRX_ISO_READ_COMMITTED)
+			    && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* Try to place a gap lock on the index
+				record only if innodb_locks_unsafe_for_binlog
+				option is not set or this session is not
+				using a READ COMMITTED isolation level. */
+
+				err = sel_set_rec_lock(
+					btr_pcur_get_block(pcur),
+					rec, index, offsets,
+					prebuilt->select_lock_type, LOCK_GAP,
+					thr);
+
+				switch (err) {
+				case DB_SUCCESS_LOCKED_REC:
+				case DB_SUCCESS:
+					break;
+				default:
+					goto lock_wait_or_error;
+				}
+			}
+
+			btr_pcur_store_position(pcur, &mtr);
+
+			err = DB_RECORD_NOT_FOUND;
+			/* ut_print_name(stderr, index->name);
+			fputs(" record not found 4\n", stderr); */
+
+			goto normal_return;
+		}
+	}
+
+	/* We are ready to look at a possible new index entry in the result
+	set: the cursor is now placed on a user record */
+
+	if (prebuilt->select_lock_type != LOCK_NONE) {
+		/* Try to place a lock on the index record; note that delete
+		marked records are a special case in a unique search. If there
+		is a non-delete marked record, then it is enough to lock its
+		existence with LOCK_REC_NOT_GAP. */
+
+		/* If innodb_locks_unsafe_for_binlog option is used
+		or this session is using a READ COMMITED isolation
+		level we lock only the record, i.e., next-key locking is
+		not used. */
+
+		ulint	lock_type;
+
+		if (!set_also_gap_locks
+		    || srv_locks_unsafe_for_binlog
+		    || trx->isolation_level <= TRX_ISO_READ_COMMITTED
+		    || (unique_search
+			&& !UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp)))) {
+
+			goto no_gap_lock;
+		} else {
+			lock_type = LOCK_ORDINARY;
+		}
+
+		/* If we are doing a 'greater or equal than a primary key
+		value' search from a clustered index, and we find a record
+		that has that exact primary key value, then there is no need
+		to lock the gap before the record, because no insert in the
+		gap can be in our search range. That is, no phantom row can
+		appear that way.
+
+		An example: if col1 is the primary key, the search is WHERE
+		col1 >= 100, and we find a record where col1 = 100, then no
+		need to lock the gap before that record. */
+
+		if (index == clust_index
+		    && mode == PAGE_CUR_GE
+		    && direction == 0
+		    && dtuple_get_n_fields_cmp(search_tuple)
+		    == dict_index_get_n_unique(index)
+		    && 0 == cmp_dtuple_rec(search_tuple, rec, offsets)) {
+no_gap_lock:
+			lock_type = LOCK_REC_NOT_GAP;
+		}
+
+		err = sel_set_rec_lock(btr_pcur_get_block(pcur),
+				       rec, index, offsets,
+				       prebuilt->select_lock_type,
+				       lock_type, thr);
+
+		switch (err) {
+			const rec_t*	old_vers;
+		case DB_SUCCESS_LOCKED_REC:
+			if (srv_locks_unsafe_for_binlog
+			    || trx->isolation_level
+			    <= TRX_ISO_READ_COMMITTED) {
+				/* Note that a record of
+				prebuilt->index was locked. */
+				prebuilt->new_rec_locks = 1;
+			}
+			err = DB_SUCCESS;
+		case DB_SUCCESS:
+			break;
+		case DB_LOCK_WAIT:
+			/* Never unlock rows that were part of a conflict. */
+			prebuilt->new_rec_locks = 0;
+
+			if (UNIV_LIKELY(prebuilt->row_read_type
+					!= ROW_READ_TRY_SEMI_CONSISTENT)
+			    || unique_search
+			    || index != clust_index) {
+
+				goto lock_wait_or_error;
+			}
+
+			/* The following call returns 'offsets'
+			associated with 'old_vers' */
+			err = row_sel_build_committed_vers_for_mysql(
+				clust_index, prebuilt, rec,
+				&offsets, &heap, &old_vers, &mtr);
+
+			if (err != DB_SUCCESS) {
+
+				goto lock_wait_or_error;
+			}
+
+			mutex_enter(&kernel_mutex);
+			if (trx->was_chosen_as_deadlock_victim) {
+				mutex_exit(&kernel_mutex);
+				err = DB_DEADLOCK;
+
+				goto lock_wait_or_error;
+			}
+			if (UNIV_LIKELY(trx->wait_lock != NULL)) {
+				lock_cancel_waiting_and_release(
+					trx->wait_lock);
+			} else {
+				mutex_exit(&kernel_mutex);
+
+				/* The lock was granted while we were
+				searching for the last committed version.
+				Do a normal locking read. */
+
+				offsets = rec_get_offsets(rec, index, offsets,
+							  ULINT_UNDEFINED,
+							  &heap);
+				err = DB_SUCCESS;
+				break;
+			}
+			mutex_exit(&kernel_mutex);
+
+			if (old_vers == NULL) {
+				/* The row was not yet committed */
+
+				goto next_rec;
+			}
+
+			did_semi_consistent_read = TRUE;
+			rec = old_vers;
+			break;
+		default:
+
+			goto lock_wait_or_error;
+		}
+	} else {
+		/* This is a non-locking consistent read: if necessary, fetch
+		a previous version of the record */
+
+		if (trx->isolation_level == TRX_ISO_READ_UNCOMMITTED) {
+
+			/* Do nothing: we let a non-locking SELECT read the
+			latest version of the record */
+
+		} else if (index == clust_index) {
+
+			/* Fetch a previous version of the row if the current
+			one is not visible in the snapshot; if we have a very
+			high force recovery level set, we try to avoid crashes
+			by skipping this lookup */
+
+			if (UNIV_LIKELY(srv_force_recovery < 5)
+			    && !lock_clust_rec_cons_read_sees(
+				    rec, index, offsets, trx->read_view)) {
+
+				rec_t*	old_vers;
+				/* The following call returns 'offsets'
+				associated with 'old_vers' */
+				err = row_sel_build_prev_vers_for_mysql(
+					trx->read_view, clust_index,
+					prebuilt, rec, &offsets, &heap,
+					&old_vers, &mtr);
+
+				if (err != DB_SUCCESS) {
+
+					goto lock_wait_or_error;
+				}
+
+				if (old_vers == NULL) {
+					/* The row did not exist yet in
+					the read view */
+
+					goto next_rec;
+				}
+
+				rec = old_vers;
+			}
+		} else {
+			/* We are looking into a non-clustered index,
+			and to get the right version of the record we
+			have to look also into the clustered index: this
+			is necessary, because we can only get the undo
+			information via the clustered index record. */
+
+			ut_ad(index != clust_index);
+			ut_ad(!dict_index_is_clust(index));
+
+			if (!lock_sec_rec_cons_read_sees(
+				    rec, trx->read_view)) {
+                       		get_clust_rec = TRUE;
+				goto idx_cond_check;
+			}
+		}
+	}
+
+	/* NOTE that at this point rec can be an old version of a clustered
+	index record built for a consistent read. We cannot assume after this
+	point that rec is on a buffer pool page. Functions like
+	page_rec_is_comp() cannot be used! */
+
+	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, comp))) {
+
+		/* The record is delete-marked: we can skip it */
+
+		if ((srv_locks_unsafe_for_binlog
+		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+		    && prebuilt->select_lock_type != LOCK_NONE
+		    && !did_semi_consistent_read) {
+
+			/* No need to keep a lock on a delete-marked record
+			if we do not want to use next-key locking. */
+
+			row_unlock_for_mysql(prebuilt, TRUE);
+		}
+
+		/* This is an optimization to skip setting the next key lock
+		on the record that follows this delete-marked record. This
+		optimization works because of the unique search criteria
+		which precludes the presence of a range lock between this
+		delete marked record and the record following it.
+
+		For now this is applicable only to clustered indexes while
+		doing a unique search. There is scope for further optimization
+		applicable to unique secondary indexes. Current behaviour is
+		to widen the scope of a lock on an already delete marked record
+		if the same record is deleted twice by the same transaction */
+		if (index == clust_index && unique_search) {
+			err = DB_RECORD_NOT_FOUND;
+
+			goto normal_return;
+		}
+
+		goto next_rec;
+	}
+
+
+idx_cond_check:
+	if (prebuilt->idx_cond_func) {
+		int res;
+		ut_ad(prebuilt->template_type != ROW_MYSQL_DUMMY_TEMPLATE);
+		offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+		row_sel_store_mysql_rec(buf, prebuilt, rec,
+		                        offsets, 0, prebuilt->n_index_fields);
+		res= prebuilt->idx_cond_func(prebuilt->idx_cond_func_arg);
+		if (res == 0)
+			goto next_rec;
+		if (res == 2) {
+			err = DB_RECORD_NOT_FOUND;
+			goto idx_cond_failed;
+		}
+	}
+
+	/* Get the clustered index record if needed, if we did not do the
+	search using the clustered index. */
+	if (get_clust_rec || (index != clust_index
+			      && prebuilt->need_to_access_clustered)) {
+
+		/* We use a 'goto' to the preceding label if a consistent
+		read of a secondary index record requires us to look up old
+		versions of the associated clustered index record. */
+
+		ut_ad(rec_offs_validate(rec, index, offsets));
+
+		/* It was a non-clustered index and we must fetch also the
+		clustered index record */
+
+		mtr_has_extra_clust_latch = TRUE;
+
+		/* The following call returns 'offsets' associated with
+		'clust_rec'. Note that 'clust_rec' can be an old version
+		built for a consistent read. */
+
+		err = row_sel_get_clust_rec_for_mysql(prebuilt, index, rec,
+						      thr, &clust_rec,
+						      &offsets, &heap, &mtr);
+		switch (err) {
+		case DB_SUCCESS:
+			if (clust_rec == NULL) {
+				/* The record did not exist in the read view */
+				ut_ad(prebuilt->select_lock_type == LOCK_NONE);
+
+				goto next_rec;
+			}
+			break;
+		case DB_SUCCESS_LOCKED_REC:
+			ut_a(clust_rec != NULL);
+			if (srv_locks_unsafe_for_binlog
+			     || trx->isolation_level
+			    <= TRX_ISO_READ_COMMITTED) {
+				/* Note that the clustered index record
+				was locked. */
+				prebuilt->new_rec_locks = 2;
+			}
+			err = DB_SUCCESS;
+			break;
+		default:
+			goto lock_wait_or_error;
+		}
+
+		if (UNIV_UNLIKELY(rec_get_deleted_flag(clust_rec, comp))) {
+
+			/* The record is delete marked: we can skip it */
+
+			if ((srv_locks_unsafe_for_binlog
+			     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+			    && prebuilt->select_lock_type != LOCK_NONE) {
+
+				/* No need to keep a lock on a delete-marked
+				record if we do not want to use next-key
+				locking. */
+
+				row_unlock_for_mysql(prebuilt, TRUE);
+			}
+
+			goto next_rec;
+		}
+
+		if (prebuilt->need_to_access_clustered) {
+
+			result_rec = clust_rec;
+
+			ut_ad(rec_offs_validate(result_rec, clust_index,
+						offsets));
+		} else {
+			/* We used 'offsets' for the clust rec, recalculate
+			them for 'rec' */
+			offsets = rec_get_offsets(rec, index, offsets,
+						  ULINT_UNDEFINED, &heap);
+			result_rec = rec;
+		}
+
+		/* result_rec can legitimately be delete-marked
+		now that it has been established that it points to a
+		clustered index record that exists in the read view. */
+	} else {
+		result_rec = rec;
+		ut_ad(!rec_get_deleted_flag(rec, comp));
+	}
+
+	/* We found a qualifying record 'result_rec'. At this point,
+	'offsets' are associated with 'result_rec'. */
+
+	ut_ad(rec_offs_validate(result_rec,
+				result_rec != rec ? clust_index : index,
+				offsets));
+
+	/* At this point, the clustered index record is protected
+	by a page latch that was acquired when pcur was positioned.
+	The latch will not be released until mtr_commit(&mtr). */
+
+	if ((match_mode == ROW_SEL_EXACT
+	     || prebuilt->n_rows_fetched >= MYSQL_FETCH_CACHE_THRESHOLD)
+	    && prebuilt->select_lock_type == LOCK_NONE
+	    && !prebuilt->templ_contains_blob
+	    && !prebuilt->clust_index_was_generated
+	    && !prebuilt->used_in_HANDLER
+	    && prebuilt->template_type
+	    != ROW_MYSQL_DUMMY_TEMPLATE) {
+
+		/* Inside an update, for example, we do not cache rows,
+		since we may use the cursor position to do the actual
+		update, that is why we require ...lock_type == LOCK_NONE.
+		Since we keep space in prebuilt only for the BLOBs of
+		a single row, we cannot cache rows in the case there
+		are BLOBs in the fields to be fetched. In HANDLER we do
+		not cache rows because there the cursor is a scrollable
+		cursor. */
+		some_fields_in_buffer = (index != clust_index
+					 && prebuilt->idx_cond_func);
+
+		if (!row_sel_push_cache_row_for_mysql(prebuilt, result_rec,
+                                                      offsets,
+                                                      some_fields_in_buffer?
+                                                      prebuilt->n_index_fields : 0,
+                                                      buf)) {
+			/* Only fresh inserts may contain incomplete
+			externally stored columns. Pretend that such
+			records do not exist. Such records may only be
+			accessed at the READ UNCOMMITTED isolation
+			level or when rolling back a recovered
+			transaction. Rollback happens at a lower
+			level, not here. */
+			ut_a(trx->isolation_level == TRX_ISO_READ_UNCOMMITTED);
+		} else if (prebuilt->n_fetch_cached
+			   == MYSQL_FETCH_CACHE_SIZE) {
+
+			goto got_row;
+		}
+
+		goto next_rec;
+	} else {
+		if (prebuilt->template_type == ROW_MYSQL_DUMMY_TEMPLATE) {
+			memcpy(buf + 4, result_rec
+			       - rec_offs_extra_size(offsets),
+			       rec_offs_size(offsets));
+			mach_write_to_4(buf,
+					rec_offs_extra_size(offsets) + 4);
+		} else {
+			if (!row_sel_store_mysql_rec(buf, prebuilt,
+						   result_rec, offsets,
+						   prebuilt->idx_cond_func?
+						   prebuilt->n_index_fields: 0,
+						   prebuilt->n_template)) {
+				/* Only fresh inserts may contain
+				incomplete externally stored
+				columns. Pretend that such records do
+				not exist. Such records may only be
+				accessed at the READ UNCOMMITTED
+				isolation level or when rolling back a
+				recovered transaction. Rollback
+				happens at a lower level, not here. */
+				ut_a(trx->isolation_level
+				     == TRX_ISO_READ_UNCOMMITTED);
+				goto next_rec;
+			}
+		}
+
+		if (prebuilt->clust_index_was_generated) {
+			if (result_rec != rec) {
+				offsets = rec_get_offsets(
+					rec, index, offsets, ULINT_UNDEFINED,
+					&heap);
+			}
+			row_sel_store_row_id_to_prebuilt(prebuilt, rec,
+							 index, offsets);
+		}
+	}
+
+	/* From this point on, 'offsets' are invalid. */
+
+got_row:
+	/* We have an optimization to save CPU time: if this is a consistent
+	read on a unique condition on the clustered index, then we do not
+	store the pcur position, because any fetch next or prev will anyway
+	return 'end of file'. Exceptions are locking reads and the MySQL
+	HANDLER command where the user can move the cursor with PREV or NEXT
+	even after a unique search. */
+
+	err = DB_SUCCESS;
+
+idx_cond_failed:
+	if (!unique_search_from_clust_index
+	    || prebuilt->select_lock_type != LOCK_NONE
+	    || prebuilt->used_in_HANDLER) {
+
+		/* Inside an update always store the cursor position */
+
+		btr_pcur_store_position(pcur, &mtr);
+	}
+
+	goto normal_return;
+
+next_rec:
+	/* Reset the old and new "did semi-consistent read" flags. */
+	get_clust_rec = FALSE;
+	if (UNIV_UNLIKELY(prebuilt->row_read_type
+			  == ROW_READ_DID_SEMI_CONSISTENT)) {
+		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+	}
+	did_semi_consistent_read = FALSE;
+	prebuilt->new_rec_locks = 0;
+
+	/*-------------------------------------------------------------*/
+	/* PHASE 5: Move the cursor to the next index record */
+
+	if (UNIV_UNLIKELY(mtr_has_extra_clust_latch)) {
+		/* We must commit mtr if we are moving to the next
+		non-clustered index record, because we could break the
+		latching order if we would access a different clustered
+		index page right away without releasing the previous. */
+
+		btr_pcur_store_position(pcur, &mtr);
+
+		mtr_commit(&mtr);
+		mtr_has_extra_clust_latch = FALSE;
+
+		mtr_start(&mtr);
+		if (sel_restore_position_for_mysql(&same_user_rec,
+						   BTR_SEARCH_LEAF,
+						   pcur, moves_up, &mtr)) {
+#ifdef UNIV_SEARCH_DEBUG
+			cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+			goto rec_loop;
+		}
+	}
+
+	if (moves_up) {
+		if (UNIV_UNLIKELY(!btr_pcur_move_to_next(pcur, &mtr))) {
+not_moved:
+			btr_pcur_store_position(pcur, &mtr);
+
+			if (match_mode != 0) {
+				err = DB_RECORD_NOT_FOUND;
+			} else {
+				err = DB_END_OF_INDEX;
+			}
+
+			goto normal_return;
+		}
+	} else {
+		if (UNIV_UNLIKELY(!btr_pcur_move_to_prev(pcur, &mtr))) {
+			goto not_moved;
+		}
+	}
+
+#ifdef UNIV_SEARCH_DEBUG
+	cnt++;
+#endif /* UNIV_SEARCH_DEBUG */
+
+	goto rec_loop;
+
+lock_wait_or_error:
+	/* Reset the old and new "did semi-consistent read" flags. */
+	if (UNIV_UNLIKELY(prebuilt->row_read_type
+			  == ROW_READ_DID_SEMI_CONSISTENT)) {
+		prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+	}
+	did_semi_consistent_read = FALSE;
+
+	/*-------------------------------------------------------------*/
+
+	btr_pcur_store_position(pcur, &mtr);
+
+	mtr_commit(&mtr);
+	mtr_has_extra_clust_latch = FALSE;
+
+	trx->error_state = err;
+
+	/* The following is a patch for MySQL */
+
+	que_thr_stop_for_mysql(thr);
+
+	thr->lock_state = QUE_THR_LOCK_ROW;
+
+	if (row_mysql_handle_errors(&err, trx, thr, NULL)) {
+		/* It was a lock wait, and it ended */
+
+		thr->lock_state = QUE_THR_LOCK_NOLOCK;
+		mtr_start(&mtr);
+
+		sel_restore_position_for_mysql(&same_user_rec,
+					       BTR_SEARCH_LEAF, pcur,
+					       moves_up, &mtr);
+
+		if ((srv_locks_unsafe_for_binlog
+		     || trx->isolation_level <= TRX_ISO_READ_COMMITTED)
+		    && !same_user_rec) {
+
+			/* Since we were not able to restore the cursor
+			on the same user record, we cannot use
+			row_unlock_for_mysql() to unlock any records, and
+			we must thus reset the new rec lock info. Since
+			in lock0lock.c we have blocked the inheriting of gap
+			X-locks, we actually do not have any new record locks
+			set in this case.
+
+			Note that if we were able to restore on the 'same'
+			user record, it is still possible that we were actually
+			waiting on a delete-marked record, and meanwhile
+			it was removed by purge and inserted again by some
+			other user. But that is no problem, because in
+			rec_loop we will again try to set a lock, and
+			new_rec_lock_info in trx will be right at the end. */
+
+			prebuilt->new_rec_locks = 0;
+		}
+
+		mode = pcur->search_mode;
+
+		goto rec_loop;
+	}
+
+	thr->lock_state = QUE_THR_LOCK_NOLOCK;
+
+#ifdef UNIV_SEARCH_DEBUG
+	/*	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+	goto func_exit;
+
+normal_return:
+	/*-------------------------------------------------------------*/
+	que_thr_stop_for_mysql_no_error(thr, trx);
+
+	mtr_commit(&mtr);
+
+	if (prebuilt->n_fetch_cached > 0) {
+		row_sel_pop_cached_row_for_mysql(buf, prebuilt);
+
+		err = DB_SUCCESS;
+	}
+
+#ifdef UNIV_SEARCH_DEBUG
+	/*	fputs("Using ", stderr);
+	dict_index_name_print(stderr, index);
+	fprintf(stderr, " cnt %lu ret value %lu err\n", cnt, err); */
+#endif /* UNIV_SEARCH_DEBUG */
+	if (err == DB_SUCCESS) {
+		srv_n_rows_read++;
+	}
+
+func_exit:
+	trx->op_info = "";
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	/* Set or reset the "did semi-consistent read" flag on return.
+	The flag did_semi_consistent_read is set if and only if
+	the record being returned was fetched with a semi-consistent read. */
+	ut_ad(prebuilt->row_read_type != ROW_READ_WITH_LOCKS
+	      || !did_semi_consistent_read);
+
+	if (UNIV_UNLIKELY(prebuilt->row_read_type != ROW_READ_WITH_LOCKS)) {
+		if (UNIV_UNLIKELY(did_semi_consistent_read)) {
+			prebuilt->row_read_type = ROW_READ_DID_SEMI_CONSISTENT;
+		} else {
+			prebuilt->row_read_type = ROW_READ_TRY_SEMI_CONSISTENT;
+		}
+	}
+	return(err);
+}
+
+/*******************************************************************//**
+Checks if MySQL may now retrieve a consistent read result for this table from
+the query cache or store one there. May start trx and assign it a read view.
+@return	TRUE if storing or retrieving from the query cache is permitted */
+UNIV_INTERN
+ibool
+row_search_check_if_query_cache_permitted(
+/*======================================*/
+	trx_t*		trx,		/*!< in/out: transaction object */
+	const char*	norm_name)	/*!< in: concatenation of database name,
+					'/' char, table name */
+{
+	dict_table_t*	table;
+	ibool		ret	= FALSE;
+
+	table = dict_table_get(norm_name, FALSE);
+
+	if (table == NULL) {
+		/* dict_table_get() returned no table: not permitted */
+		return(FALSE);
+	}
+	/* kernel_mutex is held until just before the return below */
+	mutex_enter(&kernel_mutex);
+
+	/* Start the transaction if it is not started yet */
+
+	trx_start_if_not_started_low(trx);
+
+	/* If there are locks on the table or some trx has invalidated the
+	cache up to our trx id, then ret = FALSE.
+	We do not check what type locks there are on the table, though only
+	IX type locks actually would require ret = FALSE. */
+
+	if (UT_LIST_GET_LEN(table->locks) == 0
+	    && ut_dulint_cmp(trx->id,
+			     table->query_cache_inv_trx_id) >= 0) {
+
+		ret = TRUE;
+
+		/* At isolation level REPEATABLE READ or higher, assign a read
+		view to the transaction if it does not yet have one */
+
+		if (trx->isolation_level >= TRX_ISO_REPEATABLE_READ
+		    && !trx->read_view) {
+			/* The new view also becomes trx->global_read_view */
+			trx->read_view = read_view_open_now(
+				trx->id, trx->global_read_view_heap);
+			trx->global_read_view = trx->read_view;
+		}
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(ret);
+}
+
+/*******************************************************************//**
+Read the AUTOINC column from the current row. A DATA_FLOAT or DATA_DOUBLE
+value is cast to an integer; a negative signed value is reset to 0.
+@return	value read from the column */
+static
+ib_uint64_t
+row_search_autoinc_read_column(
+/*===========================*/
+	dict_index_t*	index,		/*!< in: index to read from */
+	const rec_t*	rec,		/*!< in: current rec */
+	ulint		col_no,		/*!< in: column number */
+	ulint		mtype,		/*!< in: column main type */
+	ibool		unsigned_type)	/*!< in: signed or unsigned flag */
+{
+	ulint		len;
+	const byte*	data;
+	ib_uint64_t	value;
+	mem_heap_t*	heap = NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets	= offsets_;
+
+	rec_offs_init(offsets_);
+	/* 'heap' may be set by rec_get_offsets(); it is freed below if so */
+	offsets = rec_get_offsets(rec, index, offsets, ULINT_UNDEFINED, &heap);
+
+	data = rec_get_nth_field(rec, offsets, col_no, &len);
+	/* A SQL NULL in the AUTOINC column is not expected here */
+	ut_a(len != UNIV_SQL_NULL);
+
+	switch (mtype) {
+	case DATA_INT:
+		ut_a(len <= sizeof value);
+		value = mach_read_int_type(data, len, unsigned_type);
+		break;
+	/* NOTE(review): negative float/double -> unsigned cast is undefined in C */
+	case DATA_FLOAT:
+		ut_a(len == sizeof(float));
+		value = (ib_uint64_t) mach_float_read(data);
+		break;
+
+	case DATA_DOUBLE:
+		ut_a(len == sizeof(double));
+		value = (ib_uint64_t) mach_double_read(data);
+		break;
+
+	default:
+		ut_error;
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	/* Clamp: a signed column whose value reads as negative yields 0 */
+	if (!unsigned_type && (ib_int64_t) value < 0) {
+		value = 0;
+	}
+
+	return(value);
+}
+
+/*******************************************************************//**
+Get the last row: move the cursor backwards until it is on a user record.
+@return	the user record found, or NULL if the cursor cannot move further */
+static
+const rec_t*
+row_search_autoinc_get_rec(
+/*=======================*/
+	btr_pcur_t*	pcur,		/*!< in/out: the current cursor */
+	mtr_t*		mtr)		/*!< in: mini transaction */
+{
+	do {
+		const rec_t* rec = btr_pcur_get_rec(pcur);
+
+		if (page_rec_is_user_rec(rec)) {
+			return(rec);
+		}
+	} while (btr_pcur_move_to_prev(pcur, mtr));
+	/* btr_pcur_move_to_prev() failed before a user record was seen */
+	return(NULL);
+}
+
+/*******************************************************************//**
+Read the max AUTOINC value from an index; *value stays 0 if no row is found.
+@return DB_SUCCESS if all OK else error code, DB_RECORD_NOT_FOUND if
+column name can't be found in index */
+UNIV_INTERN
+ulint
+row_search_max_autoinc(
+/*===================*/
+	dict_index_t*	index,		/*!< in: index to search */
+	const char*	col_name,	/*!< in: name of autoinc column */
+	ib_uint64_t*	value)		/*!< out: AUTOINC value read, or 0 */
+{
+	ulint		i;
+	ulint		n_cols;
+	dict_field_t*	dfield = NULL;
+	ulint		error = DB_SUCCESS;
+
+	n_cols = dict_index_get_n_ordering_defined_by_user(index);
+
+	/* Search the index for the AUTOINC column name */
+	for (i = 0; i < n_cols; ++i) {
+		dfield = dict_index_get_nth_field(index, i);
+
+		if (strcmp(col_name, dfield->name) == 0) {
+			break;
+		}
+	}
+	/* Default result: stays 0 unless a user record is read below */
+	*value = 0;
+
+	/* Must find the AUTOINC column name; i is then its field position */
+	if (i < n_cols && dfield) {
+		mtr_t		mtr;
+		btr_pcur_t	pcur;
+
+		mtr_start(&mtr);
+
+		/* Open at the high/right end (FALSE), and INIT
+		cursor (TRUE) */
+		btr_pcur_open_at_index_side(
+			FALSE, index, BTR_SEARCH_LEAF, &pcur, TRUE, &mtr);
+
+		if (page_get_n_recs(btr_pcur_get_page(&pcur)) > 0) {
+			const rec_t*	rec;
+
+			rec = row_search_autoinc_get_rec(&pcur, &mtr);
+
+			if (rec != NULL) {
+				ibool unsigned_type = (
+					dfield->col->prtype & DATA_UNSIGNED);
+
+				*value = row_search_autoinc_read_column(
+					index, rec, i,
+					dfield->col->mtype, unsigned_type);
+			}
+		}
+
+		btr_pcur_close(&pcur);
+
+		mtr_commit(&mtr);
+	} else {
+		error = DB_RECORD_NOT_FOUND;
+	}
+
+	return(error);
+}
diff --git a/storage/xtradb/row/row0uins.c b/storage/xtradb/row/row0uins.c
new file mode 100644
index 00000000000..930a5cf13b6
--- /dev/null
+++ b/storage/xtradb/row/row0uins.c
@@ -0,0 +1,361 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0uins.c
+Fresh insert undo
+
+Created 2/25/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0uins.h"
+
+#ifdef UNIV_NONINL
+#include "row0uins.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "ibuf0ibuf.h"
+#include "log0log.h"
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***************************************************************//**
+Removes a clustered index record, optimistically first and pessimistically if
+that fails. The pcur in node was positioned on the record, now it is detached.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_clust_rec(
+/*==========================*/
+	undo_node_t*	node)	/*!< in/out: undo node */
+{
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ulint		err;
+	ulint		n_tries		= 0;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+	/* Restore node->pcur for a leaf-level modification; must succeed */
+	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, &(node->pcur),
+					    &mtr);
+	ut_a(success);
+
+	if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+		ut_ad(node->trx->dict_operation_lock_mode == RW_X_LATCH);
+
+		/* Drop the index tree associated with the row in
+		SYS_INDEXES table: */
+
+		dict_drop_index_tree(btr_pcur_get_rec(&(node->pcur)), &mtr);
+		/* Restart the mtr and restore the cursor position once more */
+		mtr_commit(&mtr);
+
+		mtr_start(&mtr);
+
+		success = btr_pcur_restore_position(BTR_MODIFY_LEAF,
+						    &(node->pcur), &mtr);
+		ut_a(success);
+	}
+	/* btr_cur is fetched once and reused by the retry loop below */
+	btr_cur = btr_pcur_get_btr_cur(&(node->pcur));
+
+	success = btr_cur_optimistic_delete(btr_cur, &mtr);
+
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	if (success) {
+		trx_undo_rec_release(node->trx, node->undo_no);
+
+		return(DB_SUCCESS);
+	}
+retry:
+	/* If did not succeed, try pessimistic descent to tree */
+	mtr_start(&mtr);
+
+	success = btr_pcur_restore_position(BTR_MODIFY_TREE,
+					    &(node->pcur), &mtr);
+	ut_a(success);
+
+	btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+				   trx_is_recv(node->trx)
+				   ? RB_RECOVERY
+				   : RB_NORMAL, &mtr);
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	if (err == DB_OUT_OF_FILE_SPACE
+	    && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+		/* Commit, sleep and retry (bounded by n_tries) */
+		btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+		goto retry;
+	}
+	/* Finished, or retries exhausted: commit and return err as is */
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	trx_undo_rec_release(node->trx, node->undo_no);
+
+	return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry if found, in a mini transaction of its own.
+@return	DB_SUCCESS (also if not found), DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec_low(
+/*========================*/
+	ulint		mode,	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE,
+				depending on whether we wish optimistic or
+				pessimistic descent down the index tree */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		found;
+	ibool		success;
+	ulint		err;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	if (!found) {
+		/* Not found: nothing to remove, report success */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(DB_SUCCESS);
+	}
+
+	if (mode == BTR_MODIFY_LEAF) {
+		success = btr_cur_optimistic_delete(btr_cur, &mtr);
+		/* DB_FAIL reports a failed optimistic delete to the caller */
+		if (success) {
+			err = DB_SUCCESS;
+		} else {
+			err = DB_FAIL;
+		}
+	} else {
+		ut_ad(mode == BTR_MODIFY_TREE);
+
+		/* No need to distinguish RB_RECOVERY here, because we
+		are deleting a secondary index record: the distinction
+		between RB_NORMAL and RB_RECOVERY only matters when
+		deleting a record that contains externally stored
+		columns. */
+		ut_ad(!dict_index_is_clust(index));
+		btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+					   RB_NORMAL, &mtr);
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***************************************************************//**
+Removes a secondary index entry from the index if found. Tries first
+optimistic, then pessimistic descent down the tree.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_ins_remove_sec(
+/*====================*/
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry to remove */
+{
+	ulint	err;
+	ulint	n_tries	= 0;
+
+	/* Try first optimistic descent to the B-tree */
+
+	err = row_undo_ins_remove_sec_low(BTR_MODIFY_LEAF, index, entry);
+
+	if (err == DB_SUCCESS) {
+		/* Removed optimistically, or the entry was not found */
+		return(err);
+	}
+
+	/* Try then pessimistic descent to the B-tree */
+retry:
+	err = row_undo_ins_remove_sec_low(BTR_MODIFY_TREE, index, entry);
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	if (err != DB_SUCCESS && n_tries < BTR_CUR_RETRY_DELETE_N_TIMES) {
+		/* Any error is retried until the retry limit is reached */
+		n_tries++;
+
+		os_thread_sleep(BTR_CUR_RETRY_SLEEP_TIME);
+
+		goto retry;
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a fresh insert undo record. */
+static
+void
+row_undo_ins_parse_undo_rec(
+/*========================*/
+	undo_node_t*	node)	/*!< in/out: row undo node; table=NULL to skip */
+{
+	dict_index_t*	clust_index;
+	byte*		ptr;
+	undo_no_t	undo_no;
+	dulint		table_id;
+	ulint		type;
+	ulint		dummy;
+	ibool		dummy_extern;
+
+	ut_ad(node);
+
+	ptr = trx_undo_rec_get_pars(node->undo_rec, &type, &dummy,
+				    &dummy_extern, &undo_no, &table_id);
+	ut_ad(type == TRX_UNDO_INSERT_REC);
+	node->rec_type = type;
+
+	node->update = NULL;
+	node->table = dict_table_get_on_id(table_id, node->trx);
+
+	/* Skip the UNDO if we can't find the table or the .ibd file. */
+	if (UNIV_UNLIKELY(node->table == NULL)) { /* no table: skip */
+	} else if (UNIV_UNLIKELY(node->table->ibd_file_missing)) {
+		node->table = NULL;
+	} else {
+		clust_index = dict_table_get_first_index(node->table);
+		/* Parse the row reference into node->ref */
+		if (clust_index != NULL) {
+			ptr = trx_undo_rec_get_row_ref(
+				ptr, clust_index, &node->ref, node->heap);
+		} else {
+			ut_print_timestamp(stderr);
+			fprintf(stderr, "  InnoDB: table ");
+			ut_print_name(stderr, node->trx, TRUE,
+				      node->table->name);
+			fprintf(stderr, " has no indexes, "
+				"ignoring the table\n");
+
+			node->table = NULL;
+		}
+	}
+}
+
+/***********************************************************//**
+Undoes a fresh insert of a row to a table. A fresh insert means that
+the same clustered index unique key did not have any record, even delete
+marked, at the time of the insert.  InnoDB is eager in a rollback:
+if it figures out that an index record will be removed in the purge
+anyway, it will remove it in the rollback.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+UNIV_INTERN
+ulint
+row_undo_ins(
+/*=========*/
+	undo_node_t*	node)	/*!< in/out: row undo node */
+{
+	ut_ad(node);
+	ut_ad(node->state == UNDO_NODE_INSERT);
+
+	row_undo_ins_parse_undo_rec(node);
+	/* No table, or (presumably) no clustered record found: just release */
+	if (!node->table || !row_undo_search_clust_to_pcur(node)) {
+		trx_undo_rec_release(node->trx, node->undo_no);
+
+		return(DB_SUCCESS);
+	}
+
+	/* Iterate over all the indexes and undo the insert.*/
+
+	/* Skip the clustered index (the first index) */
+	node->index = dict_table_get_next_index(
+		dict_table_get_first_index(node->table));
+
+	while (node->index != NULL) {
+		dtuple_t*	entry;
+		ulint		err;
+
+		entry = row_build_index_entry(node->row, node->ext,
+					      node->index, node->heap);
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The database must have crashed after
+			inserting a clustered index record but before
+			writing all the externally stored columns of
+			that record.  Because secondary index entries
+			are inserted after the clustered index record,
+			we may assume that the secondary index record
+			does not exist.  However, this situation may
+			only occur during the rollback of incomplete
+			transactions. */
+			ut_a(trx_is_recv(node->trx));
+		} else {
+			log_free_check();
+			err = row_undo_ins_remove_sec(node->index, entry);
+
+			if (err != DB_SUCCESS) {
+				/* NOTE(review): undo rec not released here? */
+				return(err);
+			}
+		}
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+	/* Finally remove the clustered index record itself */
+	log_free_check();
+	return(row_undo_ins_remove_clust_rec(node));
+}
diff --git a/storage/xtradb/row/row0umod.c b/storage/xtradb/row/row0umod.c
new file mode 100644
index 00000000000..8464b0f95cc
--- /dev/null
+++ b/storage/xtradb/row/row0umod.c
@@ -0,0 +1,866 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0umod.c
+Undo modify of a row
+
+Created 2/27/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0umod.h"
+
+#ifdef UNIV_NONINL
+#include "row0umod.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "trx0undo.h"
+#include "trx0roll.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "row0undo.h"
+#include "row0vers.h"
+#include "trx0trx.h"
+#include "trx0rec.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "log0log.h"
+
+/* Considerations on undoing a modify operation.
+(1) Undoing a delete marking: all index records should be found. Some of
+them may have delete mark already FALSE, if the delete mark operation was
+stopped underway, or if the undo operation ended prematurely because of a
+system crash.
+(2) Undoing an update of a delete unmarked record: the newer version of
+an updated secondary index entry should be removed if no prior version
+of the clustered index record requires its existence. Otherwise, it should
+be delete marked.
+(3) Undoing an update of a delete marked record. In this kind of update a
+delete marked clustered index record was delete unmarked and possibly also
+some of its fields were changed. Now, it is possible that the delete marked
+version has become obsolete at the time the undo is started. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log before starting that operation. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST NOT hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Determines whether the previous version of the clustered index record was
+also modified or inserted by this same transaction with an undo number that
+falls within the current rollback, so that it has to be undone as well.
+@return	TRUE if also previous modify or insert of this row should be undone */
+static
+ibool
+row_undo_mod_undo_also_prev_vers(
+/*=============================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	undo_no_t*	undo_no)/*!< out: the undo number */
+{
+	trx_t*		trx	= node->trx;
+	trx_undo_rec_t*	prev_rec;
+
+	if (ut_dulint_cmp(node->new_trx_id, trx->id) == 0) {
+		/* The previous version carries our own transaction id:
+		fetch its undo record and compare its undo number
+		against the rollback limit. */
+		prev_rec = trx_undo_get_undo_rec_low(node->new_roll_ptr,
+						     node->heap);
+
+		*undo_no = trx_undo_rec_get_undo_no(prev_rec);
+
+		return(ut_dulint_cmp(trx->roll_limit, *undo_no) <= 0);
+	}
+
+	/* The previous version was written by another transaction. */
+	*undo_no = ut_dulint_zero;
+
+	return(FALSE);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record by applying node->update to
+the record at the restored cursor position, optimistically or
+pessimistically depending on the latch mode.
+@return	DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust_low(
+/*===================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in: mtr; must be committed before
+				latching any further pages */
+	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	btr_pcur_t*	pcur	= &(node->pcur);
+	btr_cur_t*	btr_cur	= btr_pcur_get_btr_cur(pcur);
+	/* The same flags are used for both update flavours. */
+	const ulint	flags	= BTR_NO_LOCKING_FLAG
+		| BTR_NO_UNDO_LOG_FLAG
+		| BTR_KEEP_SYS_FLAG;
+	mem_heap_t*	heap	= NULL;
+	big_rec_t*	dummy_big_rec;
+	ulint		err;
+	ibool		success;
+
+	success = btr_pcur_restore_position(mode, pcur, mtr);
+
+	ut_ad(success);
+
+	if (mode == BTR_MODIFY_LEAF) {
+		/* Keep the change within the index page. */
+
+		return(btr_cur_optimistic_update(flags, btr_cur,
+						 node->update,
+						 node->cmpl_info, thr, mtr));
+	}
+
+	ut_ad(mode == BTR_MODIFY_TREE);
+
+	err = btr_cur_pessimistic_update(flags, btr_cur,
+					 &heap, &dummy_big_rec,
+					 node->update,
+					 node->cmpl_info, thr, mtr);
+
+	ut_a(!dummy_big_rec);
+
+	/* The pessimistic update may have created a heap for us. */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Removes a clustered index record after undo if possible.
+This is attempted when the record was inserted by updating a
+delete-marked record and there no longer exist transactions
+that would see the delete-marked record.  In other words, we
+roll back the insert by purging the record.
+@return	DB_SUCCESS, DB_FAIL, or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_remove_clust_low(
+/*==========================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr,	/*!< in: mtr */
+	ulint		mode)	/*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
+{
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	ulint		err;
+
+	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+	pcur = &(node->pcur);
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	if (!btr_pcur_restore_position(mode, pcur, mtr)) {
+		/* The cursor position could not be restored:
+		there is nothing for us to remove. */
+
+		return(DB_SUCCESS);
+	}
+
+	/* The whole record may be removed only for an UPD_DEL undo
+	record, and only if the delete-marked version does not have
+	to be preserved. */
+
+	if (node->rec_type != TRX_UNDO_UPD_DEL_REC
+	    || row_vers_must_preserve_del_marked(node->new_trx_id, mtr)) {
+
+		return(DB_SUCCESS);
+	}
+
+	if (mode == BTR_MODIFY_LEAF) {
+
+		return(btr_cur_optimistic_delete(btr_cur, mtr)
+		       ? DB_SUCCESS
+		       : DB_FAIL);
+	}
+
+	ut_ad(mode == BTR_MODIFY_TREE);
+
+	/* This operation is analogous to purge, we can free also
+	inherited externally stored fields */
+
+	btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+				   thr_is_recv(thr)
+				   ? RB_RECOVERY_PURGE_REC
+				   : RB_NONE, mtr);
+
+	/* The delete operation may fail if we have little
+	file space left: TODO: easiest to crash the database
+	and restart with more file space */
+
+	return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in a clustered index record. Sets also the node state for the
+next round of undo: UNDO_NODE_PREV_VERS if the previous version of the
+record must be undone too and its undo record could be reserved,
+UNDO_NODE_FETCH_NEXT otherwise.
+@return	DB_SUCCESS or error code: we may run out of file space */
+static
+ulint
+row_undo_mod_clust(
+/*===============*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	btr_pcur_t*	pcur;
+	mtr_t		mtr;
+	ulint		err;
+	ibool		undo_prev;
+	undo_no_t	prev_undo_no;
+
+	ut_ad(node && thr);
+
+	log_free_check();
+
+	/* Find out whether the previous version of the clustered index
+	record should be undone in this same rollback operation too */
+
+	undo_prev = row_undo_mod_undo_also_prev_vers(node, &prev_undo_no);
+
+	pcur = &(node->pcur);
+
+	/* First attempt: optimistic processing that keeps the changes
+	within the index page */
+
+	mtr_start(&mtr);
+
+	err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_LEAF);
+
+	if (err != DB_SUCCESS) {
+		/* The tree structure may have to be modified: retry
+		with a pessimistic descent in a fresh mini-transaction */
+
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+		mtr_start(&mtr);
+
+		err = row_undo_mod_clust_low(node, thr, &mtr, BTR_MODIFY_TREE);
+	}
+
+	btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+	if (err == DB_SUCCESS && node->rec_type == TRX_UNDO_UPD_DEL_REC) {
+		/* Try to remove the record altogether, again first
+		optimistically and then pessimistically */
+
+		mtr_start(&mtr);
+
+		err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+						    BTR_MODIFY_LEAF);
+		if (err != DB_SUCCESS) {
+			btr_pcur_commit_specify_mtr(pcur, &mtr);
+
+			mtr_start(&mtr);
+
+			err = row_undo_mod_remove_clust_low(node, thr, &mtr,
+							    BTR_MODIFY_TREE);
+		}
+
+		btr_pcur_commit_specify_mtr(pcur, &mtr);
+	}
+
+	node->state = UNDO_NODE_FETCH_NEXT;
+
+	trx_undo_rec_release(node->trx, node->undo_no);
+
+	/* Reserve the undo log record to the prior version only after
+	committing &mtr: this is necessary to comply with the latching
+	order, as &mtr may contain the fsp latch which is lower in
+	the latch hierarchy than trx->undo_mutex. */
+
+	if (undo_prev && err == DB_SUCCESS
+	    && trx_undo_rec_reserve(node->trx, prev_undo_no)) {
+
+		node->state = UNDO_NODE_PREV_VERS;
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found.
+The entry is looked up and modified within a mini-transaction local to
+this function. A second local mini-transaction is used only to restore
+node->pcur on the clustered index record and to check whether an older
+version of the row still has this index entry: if so, the entry is delete
+marked, otherwise it is removed.
+@return	DB_SUCCESS, DB_FAIL, or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec_low(
+/*====================================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr,	/*!< in: query thread */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry,	/*!< in: index entry */
+	ulint		mode)	/*!< in: latch mode BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */
+{
+	ibool		found;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	ibool		success;
+	ibool		old_has;
+	ulint		err;
+	mtr_t		mtr;		/* secondary index search and change */
+	mtr_t		mtr_vers;	/* clustered record version check */
+
+	/* Check the redo log space before starting the mini-transaction,
+	that is, before anything is latched by this function. */
+	log_free_check();
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	if (!found) {
+		/* In crash recovery, the secondary index record may
+		be missing if the UPDATE did not have time to insert
+		the secondary index records before the crash.  When we
+		are undoing that UPDATE in crash recovery, the record
+		may be missing.
+
+		In normal processing, if an update ends in a deadlock
+		before it has inserted all updated secondary index
+		records, then the undo will not find those records. */
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(DB_SUCCESS);
+	}
+
+	/* We should remove the index record if no prior version of the row,
+	which cannot be purged yet, requires its existence. If some requires,
+	we should delete mark the record. */
+
+	mtr_start(&mtr_vers);
+
+	/* Reposition the clustered index cursor of the undo node; it is
+	asserted that this always succeeds. */
+	success = btr_pcur_restore_position(BTR_SEARCH_LEAF, &(node->pcur),
+					    &mtr_vers);
+	ut_a(success);
+
+	old_has = row_vers_old_has_index_entry(FALSE,
+					       btr_pcur_get_rec(&(node->pcur)),
+					       &mtr_vers, index, entry);
+	if (old_has) {
+		/* An older version still has the entry: only set the
+		delete mark, within the secondary index mini-transaction. */
+		err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+						   btr_cur, TRUE, thr, &mtr);
+		ut_ad(err == DB_SUCCESS);
+	} else {
+		/* Remove the index record */
+
+		if (mode == BTR_MODIFY_LEAF) {
+			/* A failed optimistic delete is reported as
+			DB_FAIL to the caller. */
+			success = btr_cur_optimistic_delete(btr_cur, &mtr);
+			if (success) {
+				err = DB_SUCCESS;
+			} else {
+				err = DB_FAIL;
+			}
+		} else {
+			ut_ad(mode == BTR_MODIFY_TREE);
+
+			/* No need to distinguish RB_RECOVERY_PURGE here,
+			because we are deleting a secondary index record:
+			the distinction between RB_NORMAL and
+			RB_RECOVERY_PURGE only matters when deleting a
+			record that contains externally stored
+			columns. */
+			ut_ad(!dict_index_is_clust(index));
+			btr_cur_pessimistic_delete(&err, FALSE, btr_cur,
+						   RB_NORMAL, &mtr);
+
+			/* The delete operation may fail if we have little
+			file space left: TODO: easiest to crash the database
+			and restart with more file space */
+		}
+	}
+
+	/* Commit the version-check mini-transaction through node->pcur
+	first, then close the local cursor and commit the mini-transaction
+	that changed the secondary index. */
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr_vers);
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***********************************************************//**
+Delete marks or removes a secondary index entry if found. The operation is
+first tried with BTR_MODIFY_LEAF and, unless that succeeds, repeated with
+BTR_MODIFY_TREE.
+NOTE that if we updated the fields of a delete-marked secondary index record
+so that alphabetically they stayed the same, e.g., 'abc' -> 'aBc', we cannot
+return to the original values because we do not know them. But this should
+not cause problems because in row0sel.c, in queries we always retrieve the
+clustered index record or an earlier version of it, if the secondary index
+record through which we do the search is delete-marked.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_or_remove_sec(
+/*================================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr,	/*!< in: query thread */
+	dict_index_t*	index,	/*!< in: index */
+	dtuple_t*	entry)	/*!< in: index entry */
+{
+	ulint	err = row_undo_mod_del_mark_or_remove_sec_low(
+		node, thr, index, entry, BTR_MODIFY_LEAF);
+
+	if (err != DB_SUCCESS) {
+		/* The leaf-only attempt did not succeed: retry with
+		a latch mode that allows modifying the tree. */
+		err = row_undo_mod_del_mark_or_remove_sec_low(
+			node, thr, index, entry, BTR_MODIFY_TREE);
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Delete unmarks a secondary index entry which must be found. It might not be
+delete-marked at the moment, but it does not harm to unmark it anyway. We also
+need to update the fields of the secondary index record if we updated its
+fields but alphabetically they stayed the same, e.g., 'abc' -> 'aBc'.
+If the entry is not found, diagnostics are printed to stderr and DB_SUCCESS
+is returned. Indexes that are being created are skipped.
+@return	DB_FAIL or DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_unmark_sec_and_undo_update(
+/*========================================*/
+	ulint		mode,	/*!< in: search mode: BTR_MODIFY_LEAF or
+				BTR_MODIFY_TREE */
+	que_thr_t*	thr,	/*!< in: query thread */
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry)	/*!< in: index entry */
+{
+	mem_heap_t*	heap;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	upd_t*		update;
+	big_rec_t*	dummy_big_rec;
+	mtr_t		mtr;
+	ibool		found;
+	ulint		err	= DB_SUCCESS;
+	trx_t*		trx	= thr_get_trx(thr);
+
+	/* Ignore indexes that are being created. */
+	if (UNIV_UNLIKELY(*index->name == TEMP_INDEX_PREFIX)) {
+
+		return(DB_SUCCESS);
+	}
+
+	log_free_check();
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, mode, &pcur, &mtr);
+
+	if (UNIV_UNLIKELY(!found)) {
+		/* Report the inconsistency, clean up and return
+		success without touching the index. */
+		fputs("InnoDB: error in sec index entry del undo in\n"
+		      "InnoDB: ", stderr);
+		dict_index_name_print(stderr, trx, index);
+		fputs("\n"
+		      "InnoDB: tuple ", stderr);
+		dtuple_print(stderr, entry);
+		fputs("\n"
+		      "InnoDB: record ", stderr);
+		rec_print(stderr, btr_pcur_get_rec(&pcur), index);
+		putc('\n', stderr);
+		trx_print(stderr, trx, 0);
+		fputs("\n"
+		      "InnoDB: Submit a detailed bug report"
+		      " to http://bugs.mysql.com\n", stderr);
+
+		btr_pcur_close(&pcur);
+		mtr_commit(&mtr);
+
+		return(err);
+	}
+
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	/* Clear the delete mark, whatever its current value is. */
+	err = btr_cur_del_mark_set_sec_rec(BTR_NO_LOCKING_FLAG,
+					   btr_cur, FALSE, thr, &mtr);
+	ut_a(err == DB_SUCCESS);
+	heap = mem_heap_create(100);
+
+	/* Compute the binary difference between the wanted entry and
+	the record in the index; an empty difference needs no update. */
+	update = row_upd_build_sec_rec_difference_binary(
+		index, entry, btr_cur_get_rec(btr_cur), trx, heap);
+
+	if (upd_get_n_fields(update) != 0) {
+		if (mode == BTR_MODIFY_LEAF) {
+			/* Try an optimistic updating of the record,
+			keeping changes within the page */
+
+			err = btr_cur_optimistic_update(
+				BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+				btr_cur, update, 0, thr, &mtr);
+
+			/* Overflow and underflow conditions are all
+			reported as DB_FAIL. */
+			if (err == DB_OVERFLOW
+			    || err == DB_UNDERFLOW
+			    || err == DB_ZIP_OVERFLOW) {
+				err = DB_FAIL;
+			}
+		} else {
+			ut_a(mode == BTR_MODIFY_TREE);
+			err = btr_cur_pessimistic_update(
+				BTR_KEEP_SYS_FLAG | BTR_NO_LOCKING_FLAG,
+				btr_cur, &heap, &dummy_big_rec,
+				update, 0, thr, &mtr);
+			ut_a(!dummy_big_rec);
+		}
+	}
+
+	mem_heap_free(heap);
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_DEL.
+Walks the indexes starting from node->index and delete marks or removes
+the entry built from node->row in each of them, stopping at the first error.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_del_sec(
+/*=====================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	ulint		err	= DB_SUCCESS;
+
+	ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+	heap = mem_heap_create(1024);
+
+	for (; node->index != NULL;
+	     node->index = dict_table_get_next_index(node->index)) {
+
+		entry = row_build_index_entry(node->row, node->ext,
+					      node->index, heap);
+
+		if (UNIV_UNLIKELY(!entry)) {
+			/* The entry could not be built: the database
+			must have crashed after inserting a clustered
+			index record but before writing all the
+			externally stored columns of that record.
+			Because secondary index entries are inserted
+			after the clustered index record, we may assume
+			that the secondary index record does not exist.
+			However, this situation may only occur during
+			the rollback of incomplete transactions. */
+			ut_a(thr_is_recv(thr));
+		} else {
+			err = row_undo_mod_del_mark_or_remove_sec(
+				node, thr, node->index, entry);
+
+			if (err != DB_SUCCESS) {
+				/* Leave node->index at the failing index */
+
+				break;
+			}
+		}
+
+		/* Recycle the scratch heap for the next index */
+		mem_heap_empty(heap);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is DEL_MARK.
+Walks the indexes starting from node->index; in each of them the entry built
+from node->row is delete unmarked, first with BTR_MODIFY_LEAF and, if that
+returns DB_FAIL, with BTR_MODIFY_TREE. Stops at the first error, leaving
+node->index at the failing index.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_del_mark_sec(
+/*======================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	dict_index_t*	index;
+	ulint		err	= DB_SUCCESS;
+
+	heap = mem_heap_create(1024);
+
+	while (node->index != NULL) {
+		index = node->index;
+
+		entry = row_build_index_entry(node->row, node->ext,
+					      index, heap);
+		ut_a(entry);
+		err = row_undo_mod_del_unmark_sec_and_undo_update(
+			BTR_MODIFY_LEAF, thr, index, entry);
+		if (err == DB_FAIL) {
+			err = row_undo_mod_del_unmark_sec_and_undo_update(
+				BTR_MODIFY_TREE, thr, index, entry);
+		}
+
+		if (err != DB_SUCCESS) {
+
+			break;
+		}
+
+		/* The entry is not referenced any more: recycle the
+		scratch heap so that memory use does not grow with the
+		number of indexes, as row_undo_mod_upd_del_sec() does. */
+		mem_heap_empty(heap);
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***********************************************************//**
+Undoes a modify in secondary indexes when undo record type is UPD_EXIST.
+For every index, starting from node->index, whose ordering fields are
+changed by node->update, the entry built from node->row (the newest version)
+is delete marked or removed, and the entry built from node->undo_row (the
+previous version) is delete unmarked and, where needed, updated.
+Nothing is done if the update did not change any ordering field.
+@return	DB_SUCCESS or DB_OUT_OF_FILE_SPACE */
+static
+ulint
+row_undo_mod_upd_exist_sec(
+/*=======================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	dict_index_t*	index;
+	ulint		err;
+
+	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+		/* No change in secondary indexes */
+
+		return(DB_SUCCESS);
+	}
+
+	heap = mem_heap_create(1024);
+
+	while (node->index != NULL) {
+		index = node->index;
+
+		if (row_upd_changes_ord_field_binary(node->row, node->index,
+						     node->update)) {
+
+			/* Build the newest version of the index entry */
+			entry = row_build_index_entry(node->row, node->ext,
+						      index, heap);
+			if (UNIV_UNLIKELY(!entry)) {
+				/* The server must have crashed in
+				row_upd_clust_rec_by_insert(), in
+				row_ins_index_entry_low() before
+				btr_store_big_rec_extern_fields()
+				has written the externally stored columns
+				(BLOBs) of the new clustered index entry. */
+
+				/* The table must be in DYNAMIC or COMPRESSED
+				format.  REDUNDANT and COMPACT formats
+				store a local 768-byte prefix of each
+				externally stored column. */
+				ut_a(dict_table_get_format(index->table)
+				     >= DICT_TF_FORMAT_ZIP);
+
+				/* This is only legitimate when
+				rolling back an incomplete transaction
+				after crash recovery. */
+				ut_a(thr_get_trx(thr)->is_recovered);
+
+				/* The server must have crashed before
+				completing the insert of the new
+				clustered index entry and before
+				inserting to the secondary indexes.
+				Because node->row was not yet written
+				to this index, we can ignore it.  But
+				we must restore node->undo_row. */
+			} else {
+				/* NOTE that if we updated the fields of a
+				delete-marked secondary index record so that
+				alphabetically they stayed the same, e.g.,
+				'abc' -> 'aBc', we cannot return to the
+				original values because we do not know them.
+				But this should not cause problems because
+				in row0sel.c, in queries we always retrieve
+				the clustered index record or an earlier
+				version of it, if the secondary index record
+				through which we do the search is
+				delete-marked. */
+
+				err = row_undo_mod_del_mark_or_remove_sec(
+					node, thr, index, entry);
+				if (err != DB_SUCCESS) {
+					mem_heap_free(heap);
+
+					return(err);
+				}
+
+				mem_heap_empty(heap);
+			}
+
+			/* We may have to update the delete mark in the
+			secondary index record of the previous version of
+			the row. We also need to update the fields of
+			the secondary index record if we updated its fields
+			but alphabetically they stayed the same, e.g.,
+			'abc' -> 'aBc'. */
+			entry = row_build_index_entry(node->undo_row,
+						      node->undo_ext,
+						      index, heap);
+			ut_a(entry);
+
+			err = row_undo_mod_del_unmark_sec_and_undo_update(
+				BTR_MODIFY_LEAF, thr, index, entry);
+			if (err == DB_FAIL) {
+				err = row_undo_mod_del_unmark_sec_and_undo_update(
+					BTR_MODIFY_TREE, thr, index, entry);
+			}
+
+			if (err != DB_SUCCESS) {
+				mem_heap_free(heap);
+
+				return(err);
+			}
+
+			/* Neither entry of this index is referenced any
+			more: recycle the scratch heap so that memory use
+			does not grow with the number of indexes. */
+			mem_heap_empty(heap);
+		}
+
+		node->index = dict_table_get_next_index(node->index);
+	}
+
+	mem_heap_free(heap);
+
+	return(DB_SUCCESS);
+}
+
+/***********************************************************//**
+Parses the row reference and other info in a modify undo log record and
+stores the result in the undo node: rec_type, table, ref, update,
+new_roll_ptr, new_trx_id and cmpl_info. node->table is left NULL if the
+table was dropped or its .ibd file is missing; the remaining fields are
+then not filled in. */
+static
+void
+row_undo_mod_parse_undo_rec(
+/*========================*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	dict_index_t*	clust_index;
+	byte*		pos;
+	undo_no_t	undo_no;
+	dulint		table_id;
+	trx_id_t	trx_id;
+	roll_ptr_t	roll_ptr;
+	ulint		info_bits;
+	ulint		rec_type;
+	ulint		cmpl_info;
+	ibool		dummy_extern;
+	trx_t*		trx;
+
+	ut_ad(node && thr);
+	trx = thr_get_trx(thr);
+
+	/* Read the general parameters at the start of the record */
+	pos = trx_undo_rec_get_pars(node->undo_rec, &rec_type, &cmpl_info,
+				    &dummy_extern, &undo_no, &table_id);
+	node->rec_type = rec_type;
+
+	node->table = dict_table_get_on_id(table_id, trx);
+
+	/* TODO: other fixes associated with DROP TABLE + rollback in the
+	same table by another user */
+
+	if (node->table != NULL && node->table->ibd_file_missing) {
+		/* We skip undo operations to missing .ibd files */
+		node->table = NULL;
+	}
+
+	if (node->table == NULL) {
+		/* Table was dropped, or its .ibd file is missing */
+		return;
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	/* The system columns, the row reference and the update vector
+	follow one another in the record */
+	pos = trx_undo_update_rec_get_sys_cols(pos, &trx_id, &roll_ptr,
+					       &info_bits);
+
+	pos = trx_undo_rec_get_row_ref(pos, clust_index, &(node->ref),
+				       node->heap);
+
+	trx_undo_update_rec_get_update(pos, clust_index, rec_type, trx_id,
+				       roll_ptr, info_bits, trx,
+				       node->heap, &(node->update));
+
+	node->new_roll_ptr = roll_ptr;
+	node->new_trx_id = trx_id;
+	node->cmpl_info = cmpl_info;
+}
+
+/***********************************************************//**
+Undoes a modify operation on a row of a table: parses the undo record,
+undoes the change in the secondary indexes according to the undo record
+type and finally undoes it in the clustered index.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+row_undo_mod(
+/*=========*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	err;
+
+	ut_ad(node && thr);
+	ut_ad(node->state == UNDO_NODE_MODIFY);
+
+	row_undo_mod_parse_undo_rec(node, thr);
+
+	if (node->table == NULL || !row_undo_search_clust_to_pcur(node)) {
+		/* It is already undone, or will be undone by another query
+		thread, or table was dropped */
+
+		trx_undo_rec_release(node->trx, node->undo_no);
+		node->state = UNDO_NODE_FETCH_NEXT;
+
+		return(DB_SUCCESS);
+	}
+
+	/* Start from the index that follows the first (clustered) index */
+	node->index = dict_table_get_next_index(
+		dict_table_get_first_index(node->table));
+
+	if (node->rec_type == TRX_UNDO_DEL_MARK_REC) {
+
+		err = row_undo_mod_del_mark_sec(node, thr);
+
+	} else if (node->rec_type == TRX_UNDO_UPD_EXIST_REC) {
+
+		err = row_undo_mod_upd_exist_sec(node, thr);
+	} else {
+		ut_ad(node->rec_type == TRX_UNDO_UPD_DEL_REC);
+		err = row_undo_mod_upd_del_sec(node, thr);
+	}
+
+	/* The clustered index record is undone only if all the
+	secondary indexes were processed successfully */
+	return(err != DB_SUCCESS ? err : row_undo_mod_clust(node, thr));
+}
diff --git a/storage/xtradb/row/row0undo.c b/storage/xtradb/row/row0undo.c
new file mode 100644
index 00000000000..fd28a4f6520
--- /dev/null
+++ b/storage/xtradb/row/row0undo.c
@@ -0,0 +1,393 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0undo.c
+Row undo
+
+Created 1/8/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0undo.h"
+
+#ifdef UNIV_NONINL
+#include "row0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0uins.h"
+#include "row0umod.h"
+#include "row0upd.h"
+#include "row0mysql.h"
+#include "srv0srv.h"
+
+/* How to undo row operations?
+(1) For an insert, we have stored a prefix of the clustered index record
+in the undo log. Using it, we look for the clustered record, and using
+that we look for the records in the secondary indexes. The insert operation
+may have been left incomplete, if the database crashed, for example.
+We may have to look at the trx id and roll ptr to make sure the record in the
+clustered index is really the one for which the undo log record was
+written. We can use the framework we get from the original insert op.
+(2) Delete marking: We can use the framework we get from the original
+delete mark op. We only have to check the trx id.
+(3) Update: This may be the most complicated. We have to use the framework
+we get from the original update op.
+
+What if the same trx repeatedly deletes and inserts an identical row.
+Then the row id changes and also roll ptr. What if the row id was not
+part of the ordering fields in the clustered index? Maybe we have to write
+it to undo log. Well, maybe not, because if we order the row id and trx id
+in descending order, then the only undeleted copy is the first in the
+index. Our searches in row operations always position the cursor before
+the first record in the result set. But, if there is no key defined for
+a table, then it would be desirable that row id is in ascending order.
+So, let's store row id in descending order only if it is not an ordering
+field in the clustered index.
+
+NOTE: Deletes and inserts may lead to situation where there are identical
+records in a secondary index. Is that a problem in the B-tree? Yes.
+Also updates can lead to this, unless trx id and roll ptr are included in
+ord fields.
+(1) Fix in clustered indexes: include row id, trx id, and roll ptr
+in node pointers of B-tree.
+(2) Fix in secondary indexes: include all fields in node pointers, and
+if an entry is inserted, check if it is equal to the right neighbor,
+in which case update the right neighbor: the neighbor must be delete
+marked, set it unmarked and write the trx id of the current transaction.
+
+What if the same trx repeatedly updates the same row, updating a secondary
+index field or not? Updating a clustered index ordering field?
+
+(1) If it does not update the secondary index and not the clustered index
+ord field. Then the secondary index record stays unchanged, but the
+trx id in the secondary index record may be smaller than in the clustered
+index record. This is no problem?
+(2) If it updates secondary index ord field but not clustered: then in
+secondary index there are delete marked records, which differ in an
+ord field. No problem.
+(3) Updates clustered ord field but not secondary, and secondary index
+is unique. Then the record in secondary index is just updated at the
+clustered ord field.
+(4)
+
+Problem with duplicate records:
+Fix 1: Add a trx op no field to all indexes. A problem: if a trx with a
+bigger trx id has inserted and delete marked a similar row, our trx inserts
+again a similar row, and a trx with an even bigger id delete marks it. Then
+the position of the row should change in the index if the trx id affects
+the alphabetical ordering.
+
+Fix 2: If an insert encounters a similar row marked deleted, we turn the
+insert into an 'update' of the row marked deleted. Then we must write undo
+info on the update. A problem: what if a purge operation tries to remove
+the delete marked row?
+
+We can think of the database row versions as a linked list which starts
+from the record in the clustered index, and is linked by roll ptrs
+through undo logs. The secondary index records are references which tell
+what kinds of records can be found in this linked list for a record
+in the clustered index.
+
+How to do the purge? A record can be removed from the clustered index
+if its linked list becomes empty, i.e., the row has been marked deleted
+and its roll ptr points to the record in the undo log we are going through,
+doing the purge. Similarly, during a rollback, a record can be removed
+if the stored roll ptr in the undo log points to a trx already (being) purged,
+or if the roll ptr is NULL, i.e., it was a fresh insert. */
+
+/********************************************************************//**
+Creates a row undo node for a query graph.
+@return	own: undo node */
+UNIV_INTERN
+undo_node_t*
+row_undo_node_create(
+/*=================*/
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t*	parent,	/*!< in: parent node, i.e., a thr node */
+	mem_heap_t*	heap)	/*!< in: memory heap where the node is allocated */
+{
+	undo_node_t*	undo;
+
+	ut_ad(trx && parent && heap);
+
+	undo = mem_heap_alloc(heap, sizeof(undo_node_t));
+
+	undo->common.type = QUE_NODE_UNDO;
+	undo->common.parent = parent;
+
+	undo->state = UNDO_NODE_FETCH_NEXT;
+	undo->trx = trx;
+
+	btr_pcur_init(&(undo->pcur));
+
+	undo->heap = mem_heap_create(256);
+
+	return(undo);
+}
+
+/***********************************************************//**
+Looks for the clustered index record when node has the row reference.
+The pcur in node is used in the search. If the record is found and its roll
+ptr equals node->roll_ptr, builds node->row (and node->undo_row if there is
+an update vector), stores the position of pcur, and detaches it.
+@return TRUE if found with a matching roll ptr; NOTE the node->pcur must be
+closed by the caller, regardless of the return value */
+UNIV_INTERN
+ibool
+row_undo_search_clust_to_pcur(
+/*==========================*/
+	undo_node_t*	node)	/*!< in: row undo node */
+{
+	dict_index_t*	clust_index;
+	ibool		found;
+	mtr_t		mtr;
+	ibool		ret;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	mtr_start(&mtr);
+
+	clust_index = dict_table_get_first_index(node->table);
+
+	found = row_search_on_row_ref(&(node->pcur), BTR_MODIFY_LEAF,
+				      node->table, node->ref, &mtr);
+
+	rec = btr_pcur_get_rec(&(node->pcur));
+
+	offsets = rec_get_offsets(rec, clust_index, offsets,
+				  ULINT_UNDEFINED, &heap);
+
+	if (!found || 0 != ut_dulint_cmp(node->roll_ptr,
+					 row_get_rec_roll_ptr(rec, clust_index,
+							      offsets))) {
+
+		/* We must remove the reservation on the undo log record
+		BEFORE releasing the latch on the clustered index page: this
+		is to make sure that some thread will eventually undo the
+		modification corresponding to node->roll_ptr. */
+
+		/* fputs("--------------------undoing a previous version\n",
+		stderr); */
+
+		ret = FALSE;
+	} else {
+		row_ext_t**	ext;
+
+		if (dict_table_get_format(node->table) >= DICT_TF_FORMAT_ZIP) {
+			/* In DYNAMIC or COMPRESSED format, there is
+			no prefix of externally stored columns in the
+			clustered index record. Build a cache of
+			column prefixes. */
+			ext = &node->ext;
+		} else {
+			/* REDUNDANT and COMPACT formats store a local
+			768-byte prefix of each externally stored
+			column. No cache is needed. */
+			ext = NULL;
+			node->ext = NULL;
+		}
+
+		node->row = row_build(ROW_COPY_DATA, clust_index, rec,
+				      offsets, NULL, ext, node->heap);
+		if (node->update) {
+			node->undo_row = dtuple_copy(node->row, node->heap);
+			row_upd_replace(node->undo_row, &node->undo_ext,
+					clust_index, node->update, node->heap);
+		} else {
+			node->undo_row = NULL;
+			node->undo_ext = NULL;
+		}
+
+		btr_pcur_store_position(&(node->pcur), &mtr);
+
+		ret = TRUE;
+	}
+
+	btr_pcur_commit_specify_mtr(&(node->pcur), &mtr);
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+	return(ret);
+}
+
+/***********************************************************//**
+Fetches an undo log record and does the undo for the recorded operation.
+If none left, or a partial rollback completed, returns control to the
+parent node, which is always a query thread node.
+@return	DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_undo(
+/*=====*/
+	undo_node_t*	node,	/*!< in: row undo node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint		err;
+	trx_t*		trx;
+	roll_ptr_t	roll_ptr;
+	ibool		locked_data_dict;
+
+	ut_ad(node && thr);
+
+	trx = node->trx;
+
+	if (node->state == UNDO_NODE_FETCH_NEXT) {
+
+		node->undo_rec = trx_roll_pop_top_rec_of_trx(trx,
+							     trx->roll_limit,
+							     &roll_ptr,
+							     node->heap);
+		if (!node->undo_rec) {
+			/* Rollback completed for this query thread */
+
+			thr->run_node = que_node_get_parent(node);
+
+			return(DB_SUCCESS);
+		}
+
+		node->roll_ptr = roll_ptr;
+		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+			node->state = UNDO_NODE_INSERT;
+		} else {
+			node->state = UNDO_NODE_MODIFY;
+		}
+
+	} else if (node->state == UNDO_NODE_PREV_VERS) {
+
+		/* Undo should be done to the same clustered index record
+		again in this same rollback, restoring the previous version */
+
+		roll_ptr = node->new_roll_ptr;
+
+		node->undo_rec = trx_undo_get_undo_rec_low(roll_ptr,
+							   node->heap);
+		node->roll_ptr = roll_ptr;
+		node->undo_no = trx_undo_rec_get_undo_no(node->undo_rec);
+
+		if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+			node->state = UNDO_NODE_INSERT;
+		} else {
+			node->state = UNDO_NODE_MODIFY;
+		}
+	}
+
+	/* Prevent DROP TABLE etc. while we are rolling back this row.
+	If we are doing a TABLE CREATE or some other dictionary operation,
+	then we already have dict_operation_lock locked in x-mode. Do not
+	try to lock again, because that would cause a hang. */
+
+	locked_data_dict = (trx->dict_operation_lock_mode == 0);
+
+	if (locked_data_dict) {
+
+		row_mysql_freeze_data_dictionary(trx);
+	}
+
+	if (node->state == UNDO_NODE_INSERT) {
+
+		err = row_undo_ins(node);
+
+		node->state = UNDO_NODE_FETCH_NEXT;
+	} else {
+		ut_ad(node->state == UNDO_NODE_MODIFY);
+		err = row_undo_mod(node, thr);
+	}
+
+	if (locked_data_dict) {
+
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	/* Do some cleanup: close the cursor and empty the heap of the node */
+	btr_pcur_close(&(node->pcur));
+
+	mem_heap_empty(node->heap);
+
+	thr->run_node = node;
+
+	return(err);
+}
+
+/***********************************************************//**
+Undoes a row operation in a table. This is a high-level function used
+in SQL execution graphs.
+@return	query thread to run next or NULL */
+UNIV_INTERN
+que_thr_t*
+row_undo_step(
+/*==========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint		err;
+	undo_node_t*	node;
+	trx_t*		trx;
+
+	ut_ad(thr);
+
+	srv_activity_count++;
+
+	trx = thr_get_trx(thr);
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_UNDO);
+
+	err = row_undo(node, thr);
+
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		/* Error in rollback: report it; exit if out of file space */
+
+		fprintf(stderr, "InnoDB: Fatal error %lu in rollback.\n",
+			(ulong) err);
+
+		if (err == DB_OUT_OF_FILE_SPACE) {
+			fprintf(stderr,
+				"InnoDB: Error 13 means out of tablespace.\n"
+				"InnoDB: Consider increasing"
+				" your tablespace.\n");
+
+			exit(1);
+		}
+
+		ut_error;
+
+		return(NULL);
+	}
+
+	return(thr);
+}
diff --git a/storage/xtradb/row/row0upd.c b/storage/xtradb/row/row0upd.c
new file mode 100644
index 00000000000..04c3139fcc7
--- /dev/null
+++ b/storage/xtradb/row/row0upd.c
@@ -0,0 +1,2203 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0upd.c
+Update of a row
+
+Created 12/27/1996 Heikki Tuuri
+*******************************************************/
+
+#include "row0upd.h"
+
+#ifdef UNIV_NONINL
+#include "row0upd.ic"
+#endif
+
+#include "dict0dict.h"
+#include "trx0undo.h"
+#include "rem0rec.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0boot.h"
+#include "dict0crea.h"
+#include "mach0data.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "que0que.h"
+#include "row0ext.h"
+#include "row0ins.h"
+#include "row0sel.h"
+#include "row0row.h"
+#include "rem0cmp.h"
+#include "lock0lock.h"
+#include "log0log.h"
+#include "pars0sym.h"
+#include "eval0eval.h"
+#include "buf0lru.h"
+
+
+/* What kind of latch and lock can we assume when the control comes to
+   -------------------------------------------------------------------
+an update node?
+--------------
+Efficiency of massive updates would require keeping an x-latch on a
+clustered index page through many updates, and not setting an explicit
+x-lock on clustered index records, as they anyway will get an implicit
+x-lock when they are updated. A problem is that the read nodes in the
+graph should know that they must keep the latch when passing the control
+up to the update node, and not set any record lock on the record which
+will be updated. Another problem occurs if the execution is stopped,
+as the kernel switches to another query thread, or the transaction must
+wait for a lock. Then we should be able to release the latch and, maybe,
+acquire an explicit x-lock on the record.
+	Because this seems too complicated, we conclude that the less
+efficient solution of releasing all the latches when the control is
+transferred to another node, and acquiring explicit x-locks, is better. */
+
+/* How is a delete performed? If there is a delete without an
+explicit cursor, i.e., a searched delete, there are at least
+two different situations:
+the implicit select cursor may run on (1) the clustered index or
+on (2) a secondary index. The delete is performed by setting
+the delete bit in the record and substituting the id of the
+deleting transaction for the original trx id, and substituting a
+new roll ptr for previous roll ptr. The old trx id and roll ptr
+are saved in the undo log record. Thus, no physical changes occur
+in the index tree structure at the time of the delete. Only
+when the undo log is purged, the index records will be physically
+deleted from the index trees.
+
+The query graph executing a searched delete would consist of
+a delete node which has as a subtree a select subgraph.
+The select subgraph should return a (persistent) cursor
+in the clustered index, placed on page which is x-latched.
+The delete node should look for all secondary index records for
+this clustered index entry and mark them as deleted. When is
+the x-latch freed? The most efficient way for performing a
+searched delete is obviously to keep the x-latch for several
+steps of query graph execution. */
+
+/*************************************************************************
+IMPORTANT NOTE: Any operation that generates redo MUST check that there
+is enough space in the redo log for that operation beforehand. This is
+done by calling log_free_check(). The reason for checking the
+availability of the redo log space before the start of the operation is
+that we MUST not hold any synchronization objects when performing the
+check.
+If you make a change in this module make sure that no codepath is
+introduced where a call to log_free_check() is bypassed. */
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return	TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+	dtuple_t*	entry,	/*!< in: old value of index entry */
+	dict_index_t*	index,	/*!< in: index of entry */
+	const upd_t*	update,	/*!< in: update vector for the row */
+	ulint		n);	/*!< in: how many first fields to check */
+
+
+/*********************************************************************//**
+Checks if index is currently the referenced index of some foreign key
+constraint in the referenced_list of its table.
+
+NOTE that since we do not hold dict_operation_lock when leaving the
+function, it may be that the referencing table has been dropped when
+we leave this function: this function is only for heuristic use!
+
+@return TRUE if referenced */
+static
+ibool
+row_upd_index_is_referenced(
+/*========================*/
+	dict_index_t*	index,	/*!< in: index */
+	trx_t*		trx)	/*!< in: transaction */
+{
+	dict_table_t*	table		= index->table;
+	dict_foreign_t*	foreign;
+	ibool		froze_data_dict	= FALSE;
+	ibool		is_referenced	= FALSE;
+
+	if (!UT_LIST_GET_FIRST(table->referenced_list)) {
+
+		return(FALSE);
+	}
+
+	if (trx->dict_operation_lock_mode == 0) {
+		row_mysql_freeze_data_dictionary(trx);
+		froze_data_dict = TRUE;
+	}
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign) {
+		if (foreign->referenced_index == index) {
+
+			is_referenced = TRUE;
+			goto func_exit;
+		}
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+func_exit:
+	if (froze_data_dict) {
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	return(is_referenced);
+}
+
+/*********************************************************************//**
+Checks if possible foreign key constraints hold after a delete or update of
+the record under pcur.
+
+NOTE that this function will temporarily commit mtr and lose the
+pcur position!
+
+@return	DB_SUCCESS or an error code */
+static
+ulint
+row_upd_check_references_constraints(
+/*=================================*/
+	upd_node_t*	node,	/*!< in: row update node */
+	btr_pcur_t*	pcur,	/*!< in: cursor positioned on a record; NOTE: the
+				cursor position is lost in this function! */
+	dict_table_t*	table,	/*!< in: table in question */
+	dict_index_t*	index,	/*!< in: index of the cursor */
+	ulint*		offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	dict_foreign_t*	foreign;
+	mem_heap_t*	heap;
+	dtuple_t*	entry;
+	trx_t*		trx;
+	const rec_t*	rec;
+	ulint		n_ext;
+	ulint		err;
+	ibool		got_s_lock	= FALSE;
+
+	if (UT_LIST_GET_FIRST(table->referenced_list) == NULL) {
+
+		return(DB_SUCCESS);
+	}
+
+	trx = thr_get_trx(thr);
+
+	rec = btr_pcur_get_rec(pcur);
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	heap = mem_heap_create(500);
+
+	entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index, offsets,
+				       &n_ext, heap);
+
+	mtr_commit(mtr);
+
+	mtr_start(mtr);
+
+	if (trx->dict_operation_lock_mode == 0) {
+		got_s_lock = TRUE;
+
+		row_mysql_freeze_data_dictionary(trx);
+	}
+
+	foreign = UT_LIST_GET_FIRST(table->referenced_list);
+
+	while (foreign) {
+		/* Note that we may have an update which updates the index
+		record, but does NOT update the first fields which are
+		referenced in a foreign key constraint. Then the update does
+		NOT break the constraint. */
+
+		if (foreign->referenced_index == index
+		    && (node->is_delete
+			|| row_upd_changes_first_fields_binary(
+				entry, index, node->update,
+				foreign->n_fields))) {
+
+			if (foreign->foreign_table == NULL) {
+				dict_table_get(foreign->foreign_table_name,
+					       FALSE);
+			}
+
+			if (foreign->foreign_table) {
+				mutex_enter(&(dict_sys->mutex));
+
+				(foreign->foreign_table
+				 ->n_foreign_key_checks_running)++;
+
+				mutex_exit(&(dict_sys->mutex));
+			}
+
+			/* NOTE that if the thread ends up waiting for a lock
+			we will release dict_operation_lock temporarily!
+			But the counter on the table protects 'foreign' from
+			being dropped while the check is running. */
+
+			err = row_ins_check_foreign_constraint(
+				FALSE, foreign, table, entry, thr);
+
+			if (foreign->foreign_table) {
+				mutex_enter(&(dict_sys->mutex));
+
+				ut_a(foreign->foreign_table
+				     ->n_foreign_key_checks_running > 0);
+
+				(foreign->foreign_table
+				 ->n_foreign_key_checks_running)--;
+
+				mutex_exit(&(dict_sys->mutex));
+			}
+
+			if (err != DB_SUCCESS) {
+
+				goto func_exit;
+			}
+		}
+
+		foreign = UT_LIST_GET_NEXT(referenced_list, foreign);
+	}
+
+	err = DB_SUCCESS;
+
+func_exit:
+	if (got_s_lock) {
+		row_mysql_unfreeze_data_dictionary(trx);
+	}
+
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/*********************************************************************//**
+Creates an update node for a query graph, with all pointer fields cleared.
+@return	own: update node */
+UNIV_INTERN
+upd_node_t*
+upd_node_create(
+/*============*/
+	mem_heap_t*	heap)	/*!< in: mem heap where the node is allocated */
+{
+	upd_node_t*	node;
+
+	node = mem_heap_alloc(heap, sizeof(upd_node_t));
+	node->common.type = QUE_NODE_UPDATE;
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+	node->in_mysql_interface = FALSE;
+
+	node->row = NULL;
+	node->ext = NULL;
+	node->upd_row = NULL;
+	node->upd_ext = NULL;
+	node->index = NULL;
+	node->update = NULL;
+
+	node->foreign = NULL;
+	node->cascade_heap = NULL;
+	node->cascade_node = NULL;
+
+	node->select = NULL;
+
+	node->heap = mem_heap_create(128);
+	node->magic_n = UPD_NODE_MAGIC_N;
+
+	node->cmpl_info = 0;
+
+	return(node);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Updates the trx id and roll ptr fields of a clustered index record in database
+recovery. */
+UNIV_INTERN
+void
+row_upd_rec_sys_fields_in_recovery(
+/*===============================*/
+	rec_t*		rec,	/*!< in/out: record */
+	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	ulint		pos,	/*!< in: TRX_ID position in rec */
+	trx_id_t	trx_id,	/*!< in: transaction id */
+	roll_ptr_t	roll_ptr)/*!< in: roll ptr of the undo log record */
+{
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page_zip_write_trx_id_and_roll_ptr(
+			page_zip, rec, offsets, pos, trx_id, roll_ptr);
+	} else {
+		byte*	field;
+		ulint	len;
+
+		field = rec_get_nth_field(rec, offsets, pos, &len);
+		ut_ad(len == DATA_TRX_ID_LEN);
+#if DATA_TRX_ID + 1 != DATA_ROLL_PTR
+# error "DATA_TRX_ID + 1 != DATA_ROLL_PTR"
+#endif
+		trx_write_trx_id(field, trx_id);
+		trx_write_roll_ptr(field + DATA_TRX_ID_LEN, roll_ptr);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Sets the trx id or roll ptr field of a clustered index entry. */
+UNIV_INTERN
+void
+row_upd_index_entry_sys_field(
+/*==========================*/
+	const dtuple_t*	entry,	/*!< in: index entry whose sys field buffers
+				are already allocated; the new value is
+				written into the buffer of the field
+				selected by type */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: DATA_TRX_ID or DATA_ROLL_PTR */
+	dulint		val)	/*!< in: value to write */
+{
+	dfield_t*	dfield;
+	byte*		field;
+	ulint		pos;
+
+	ut_ad(dict_index_is_clust(index));
+
+	pos = dict_index_get_sys_col_pos(index, type);
+
+	dfield = dtuple_get_nth_field(entry, pos);
+	field = dfield_get_data(dfield);
+
+	if (type == DATA_TRX_ID) {
+		trx_write_trx_id(field, val);
+	} else {
+		ut_ad(type == DATA_ROLL_PTR);
+		trx_write_roll_ptr(field, val);
+	}
+}
+
+/***********************************************************//**
+Returns TRUE if row update changes size of some field in index or if some
+field to be updated is stored externally in rec or update.
+@return TRUE if the update changes the size of some field in index or
+the field is external in rec or update */
+UNIV_INTERN
+ibool
+row_upd_changes_field_size_or_external(
+/*===================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const ulint*	offsets,/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update)	/*!< in: update vector */
+{
+	const upd_field_t*	upd_field;
+	const dfield_t*		new_val;
+	ulint			old_len;
+	ulint			new_len;
+	ulint			n_fields;
+	ulint			i;
+
+	ut_ad(rec_offs_validate(NULL, index, offsets));
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		upd_field = upd_get_nth_field(update, i);
+
+		new_val = &(upd_field->new_val);
+		new_len = dfield_get_len(new_val);
+
+		if (dfield_is_null(new_val) && !rec_offs_comp(offsets)) {
+			/* For an SQL NULL in a non-compact record, take the
+			new length from the SQL NULL size of the updated
+			column. (Bug fixed on Dec 31st, 2004: we used to look
+			at the SQL NULL size from the wrong field!) */
+
+			new_len = dict_col_get_sql_null_size(
+				dict_index_get_nth_col(index,
+						       upd_field->field_no),
+				0);
+		}
+
+		old_len = rec_offs_nth_size(offsets, upd_field->field_no);
+
+		if (rec_offs_comp(offsets)
+		    && rec_offs_nth_sql_null(offsets,
+					     upd_field->field_no)) {
+			/* Note that in the compact table format, for a
+			variable length field, an SQL NULL will use zero
+			bytes in the offset array at the start of the physical
+			record, but a zero-length value (empty string) will
+			use one byte! Thus, we cannot use update-in-place
+			if we update an SQL NULL varchar to an empty string! */
+
+			old_len = UNIV_SQL_NULL;
+		}
+
+		if (dfield_is_ext(new_val) || old_len != new_len
+		    || rec_offs_nth_extern(offsets, upd_field->field_no)) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Writes the new column values stored in the update vector into the given
+record, in place. No field size changes are allowed. */
+UNIV_INTERN
+void
+row_upd_rec_in_place(
+/*=================*/
+	rec_t*		rec,	/*!< in/out: record where replaced */
+	dict_index_t*	index,	/*!< in: the index the record belongs to */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	const upd_t*	update,	/*!< in: update vector */
+	page_zip_des_t*	page_zip)/*!< in: compressed page with enough space
+				available, or NULL */
+{
+	const upd_field_t*	upd_field;
+	const dfield_t*		new_val;
+	ulint			n_fields;
+	ulint			i;
+
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (rec_offs_comp(offsets)) {
+		rec_set_info_bits_new(rec, update->info_bits);
+	} else {
+		rec_set_info_bits_old(rec, update->info_bits);
+	}
+
+	n_fields = upd_get_n_fields(update);
+
+	for (i = 0; i < n_fields; i++) {
+		upd_field = upd_get_nth_field(update, i);
+		new_val = &(upd_field->new_val);
+		ut_ad(!dfield_is_ext(new_val) ==
+		      !rec_offs_nth_extern(offsets, upd_field->field_no));
+
+		rec_set_nth_field(rec, offsets, upd_field->field_no,
+				  dfield_get_data(new_val),
+				  dfield_get_len(new_val));
+	}
+
+	if (UNIV_LIKELY_NULL(page_zip)) {
+		page_zip_write_rec(page_zip, rec, index, offsets, 0);
+	}
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Writes into the redo log the position of TRX_ID within a clustered index
+record, followed by the roll ptr and the trx id.
+@return	new pointer to mlog */
+UNIV_INTERN
+byte*
+row_upd_write_sys_vals_to_log(
+/*==========================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	trx_t*		trx,	/*!< in: transaction */
+	roll_ptr_t	roll_ptr,/*!< in: roll ptr of the undo log record */
+	byte*		log_ptr,/*!< in: pointer to a buffer of size > 20
+				opened in mlog */
+	mtr_t*		mtr __attribute__((unused))) /*!< in: mtr */
+{
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr);
+
+	log_ptr += mach_write_compressed(log_ptr,
+					 dict_index_get_sys_col_pos(
+						 index, DATA_TRX_ID));
+
+	trx_write_roll_ptr(log_ptr, roll_ptr);
+	log_ptr += DATA_ROLL_PTR_LEN;
+
+	log_ptr += mach_dulint_write_compressed(log_ptr, trx->id);
+
+	return(log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data of system field values: position, roll ptr, trx id.
+@return	log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_parse_sys_vals(
+/*===================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	ulint*		pos,	/*!< out: TRX_ID position in record */
+	trx_id_t*	trx_id,	/*!< out: trx id */
+	roll_ptr_t*	roll_ptr)/*!< out: roll ptr */
+{
+	ptr = mach_parse_compressed(ptr, end_ptr, pos);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	if (end_ptr < ptr + DATA_ROLL_PTR_LEN) {
+
+		return(NULL);
+	}
+
+	*roll_ptr = trx_read_roll_ptr(ptr);
+	ptr += DATA_ROLL_PTR_LEN;
+
+	ptr = mach_dulint_parse_compressed(ptr, end_ptr, trx_id);
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************//**
+Writes to the redo log the info bits and the new values of the updated fields. */
+UNIV_INTERN
+void
+row_upd_index_write_log(
+/*====================*/
+	const upd_t*	update,	/*!< in: update vector */
+	byte*		log_ptr,/*!< in: pointer to mlog buffer: must
+				contain at least MLOG_BUF_MARGIN bytes
+				of free space; the buffer is closed
+				within this function */
+	mtr_t*		mtr)	/*!< in: mtr into whose log to write */
+{
+	const upd_field_t*	upd_field;
+	const dfield_t*		new_val;
+	ulint			len;
+	ulint			n_fields;
+	byte*			buf_end;
+	ulint			i;
+
+	n_fields = upd_get_n_fields(update);
+
+	buf_end = log_ptr + MLOG_BUF_MARGIN;
+
+	mach_write_to_1(log_ptr, update->info_bits);
+	log_ptr++;
+	log_ptr += mach_write_compressed(log_ptr, n_fields);
+
+	for (i = 0; i < n_fields; i++) {
+
+#if MLOG_BUF_MARGIN <= 30
+# error "MLOG_BUF_MARGIN <= 30"
+#endif
+
+		if (log_ptr + 30 > buf_end) {
+			mlog_close(mtr, log_ptr);
+
+			log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+			buf_end = log_ptr + MLOG_BUF_MARGIN;
+		}
+
+		upd_field = upd_get_nth_field(update, i);
+
+		new_val = &(upd_field->new_val);
+
+		len = dfield_get_len(new_val);
+
+		log_ptr += mach_write_compressed(log_ptr, upd_field->field_no);
+		log_ptr += mach_write_compressed(log_ptr, len);
+
+		if (len != UNIV_SQL_NULL) {
+			if (log_ptr + len < buf_end) {
+				memcpy(log_ptr, dfield_get_data(new_val), len);
+
+				log_ptr += len;
+			} else {
+				mlog_close(mtr, log_ptr);
+
+				mlog_catenate_string(mtr,
+						     dfield_get_data(new_val),
+						     len);
+
+				log_ptr = mlog_open(mtr, MLOG_BUF_MARGIN);
+				buf_end = log_ptr + MLOG_BUF_MARGIN;
+			}
+		}
+	}
+
+	mlog_close(mtr, log_ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Parses the log data written by row_upd_index_write_log.
+The expected input is one byte of info bits, the compressed number of
+fields, and for each field a compressed field number, a compressed
+length and, unless the length is UNIV_SQL_NULL, that many data bytes.
+The update vector and copies of the field values are allocated from heap.
+*update_out is assigned only when parsing succeeds.
+@return	log data end or NULL */
+UNIV_INTERN
+byte*
+row_upd_index_parse(
+/*================*/
+	byte*		ptr,	/*!< in: buffer */
+	byte*		end_ptr,/*!< in: buffer end */
+	mem_heap_t*	heap,	/*!< in: memory heap where update vector is
+				built */
+	upd_t**		update_out)/*!< out: update vector */
+{
+	upd_t*		update;
+	upd_field_t*	upd_field;
+	dfield_t*	new_val;
+	ulint		len;
+	ulint		n_fields;
+	ulint		info_bits;
+	ulint		i;
+
+	/* At least one byte is needed for the info bits. The pointers
+	are compared directly so that no pointer beyond the buffer end
+	is ever formed. */
+	if (ptr >= end_ptr) {
+
+		return(NULL);
+	}
+
+	info_bits = mach_read_from_1(ptr);
+	ptr++;
+	ptr = mach_parse_compressed(ptr, end_ptr, &n_fields);
+
+	if (ptr == NULL) {
+
+		return(NULL);
+	}
+
+	update = upd_create(n_fields, heap);
+	update->info_bits = info_bits;
+
+	for (i = 0; i < n_fields; i++) {
+		ulint	field_no;
+		upd_field = upd_get_nth_field(update, i);
+		new_val = &(upd_field->new_val);
+
+		ptr = mach_parse_compressed(ptr, end_ptr, &field_no);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		upd_field->field_no = field_no;
+
+		ptr = mach_parse_compressed(ptr, end_ptr, &len);
+
+		if (ptr == NULL) {
+
+			return(NULL);
+		}
+
+		if (len != UNIV_SQL_NULL) {
+
+			/* len comes from the log and may be arbitrarily
+			large. Computing ptr + len could point far outside
+			the buffer or wrap around, which would defeat the
+			check, so compare len against the number of bytes
+			that remain instead. */
+			if (ptr > end_ptr
+			    || (ulint) (end_ptr - ptr) < len) {
+
+				return(NULL);
+			}
+
+			dfield_set_data(new_val,
+					mem_heap_dup(heap, ptr, len), len);
+			ptr += len;
+		} else {
+			dfield_set_null(new_val);
+		}
+	}
+
+	*update_out = update;
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Builds an update vector from those fields which in a secondary index entry
+differ from a record that has the equal ordering fields. NOTE: we compare
+the fields as binary strings!
+@return	own: update vector of differing fields */
+UNIV_INTERN
+upd_t*
+row_upd_build_sec_rec_difference_binary(
+/*====================================*/
+	dict_index_t*	index,	/*!< in: index */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	const rec_t*	rec,	/*!< in: secondary index record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+{
+	upd_t*		diff;
+	ulint		n_entry_fields;
+	ulint		n_changed	= 0;
+	ulint		pos;
+	ulint		offsets_[REC_OFFS_SMALL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	/* This function is used only for a secondary index */
+	ut_a(!dict_index_is_clust(index));
+
+	/* Reserve room for the worst case where every field differs. */
+	n_entry_fields = dtuple_get_n_fields(entry);
+	diff = upd_create(n_entry_fields, heap);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+
+	for (pos = 0; pos < n_entry_fields; pos++) {
+		ulint		rec_len;
+		const byte*	rec_data;
+		const dfield_t*	new_field;
+		upd_field_t*	slot;
+
+		rec_data = rec_get_nth_field(rec, offsets, pos, &rec_len);
+		new_field = dtuple_get_nth_field(entry, pos);
+
+		/* NOTE that rec_len may differ from the length of
+		new_field if we are updating in a character set and
+		collation where strings of different length can be equal
+		in an alphabetical comparison, and also in the case where
+		we have a column prefix index and the last characters in
+		the index field are spaces; the latter case probably
+		caused the assertion failures reported at row0upd.c line
+		713 in versions 4.0.14 - 4.0.16. */
+
+		/* NOTE: the comparison is binary (no collation). */
+
+		if (dfield_data_is_binary_equal(new_field, rec_len,
+						rec_data)) {
+
+			continue;
+		}
+
+		slot = upd_get_nth_field(diff, n_changed);
+
+		dfield_copy(&(slot->new_val), new_field);
+
+		upd_field_set_field_no(slot, pos, index, trx);
+
+		n_changed++;
+	}
+
+	/* Shrink the vector to the fields that actually differ. */
+	diff->n_fields = n_changed;
+
+	return(diff);
+}
+
+/***************************************************************//**
+Builds an update vector from those fields, excluding the roll ptr and
+trx id fields, which in an index entry differ from a record that has
+the equal ordering fields. A field also counts as differing when exactly
+one of the entry field and the record field is externally stored.
+NOTE: we compare the fields as binary strings!
+@return own: update vector of differing fields, excluding roll ptr and
+trx id */
+UNIV_INTERN
+upd_t*
+row_upd_build_difference_binary(
+/*============================*/
+	dict_index_t*	index,	/*!< in: clustered index */
+	const dtuple_t*	entry,	/*!< in: entry to insert */
+	const rec_t*	rec,	/*!< in: clustered index record */
+	trx_t*		trx,	/*!< in: transaction */
+	mem_heap_t*	heap)	/*!< in: memory heap from which allocated */
+{
+	upd_t*		diff;
+	ulint		n_entry_fields;
+	ulint		n_changed	= 0;
+	ulint		roll_ptr_pos;
+	ulint		trx_id_pos;
+	ulint		pos;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	const ulint*	offsets;
+	rec_offs_init(offsets_);
+
+	/* This function is used only for a clustered index */
+	ut_a(dict_index_is_clust(index));
+
+	/* Reserve room for the worst case where every field differs. */
+	n_entry_fields = dtuple_get_n_fields(entry);
+	diff = upd_create(n_entry_fields, heap);
+
+	roll_ptr_pos = dict_index_get_sys_col_pos(index, DATA_ROLL_PTR);
+	trx_id_pos = dict_index_get_sys_col_pos(index, DATA_TRX_ID);
+
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+
+	for (pos = 0; pos < n_entry_fields; pos++) {
+		ulint		rec_len;
+		const byte*	rec_data;
+		const dfield_t*	new_field;
+
+		rec_data = rec_get_nth_field(rec, offsets, pos, &rec_len);
+		new_field = dtuple_get_nth_field(entry, pos);
+
+		/* The system columns are never part of the difference. */
+		if (pos == trx_id_pos || pos == roll_ptr_pos) {
+
+			continue;
+		}
+
+		/* NOTE: the comparison is binary (no collation). */
+
+		if (UNIV_UNLIKELY(!dfield_is_ext(new_field)
+				  != !rec_offs_nth_extern(offsets, pos))
+		    || !dfield_data_is_binary_equal(new_field, rec_len,
+						    rec_data)) {
+
+			upd_field_t*	slot
+				= upd_get_nth_field(diff, n_changed);
+
+			dfield_copy(&(slot->new_val), new_field);
+
+			upd_field_set_field_no(slot, pos, index, trx);
+
+			n_changed++;
+		}
+	}
+
+	/* Shrink the vector to the fields that actually differ. */
+	diff->n_fields = n_changed;
+
+	return(diff);
+}
+
+/***********************************************************//**
+Fetch a prefix of an externally stored column.  This is similar
+to row_ext_lookup(), but the row_ext_t holds the old values
+of the column and must not be poisoned with the new values.
+The prefix buffer is allocated from heap with the requested length.
+@return	BLOB prefix */
+static
+byte*
+row_upd_ext_fetch(
+/*==============*/
+	const byte*	data,		/*!< in: 'internally' stored part of the
+					field containing also the reference to
+					the external part */
+	ulint		local_len,	/*!< in: length of data, in bytes */
+	ulint		zip_size,	/*!< in: nonzero=compressed BLOB
+					page size, zero for uncompressed
+					BLOBs */
+	ulint*		len,		/*!< in: length of prefix to fetch;
+					out: fetched length of the prefix */
+	mem_heap_t*	heap)		/*!< in: heap where to allocate */
+{
+	const ulint	wanted	= *len;
+	byte*		prefix	= mem_heap_alloc(heap, wanted);
+
+	*len = btr_copy_externally_stored_field_prefix(prefix, wanted,
+						       zip_size,
+						       data, local_len);
+
+	/* A zero-length result would mean a half-deleted BLOB; such
+	records should never be updated. */
+	ut_a(*len);
+
+	return(prefix);
+}
+
+/***********************************************************//**
+Replaces the new column value stored in the update vector in
+the given index entry field. For a column prefix index field the value
+is truncated to the prefix, fetching the prefix of an externally stored
+column when the locally available part is too short. Otherwise the
+locally stored part is restored according to uf->orig_len. */
+static
+void
+row_upd_index_replace_new_col_val(
+/*==============================*/
+	dfield_t*		dfield,	/*!< in/out: data field
+					of the index entry */
+	const dict_field_t*	field,	/*!< in: index field */
+	const dict_col_t*	col,	/*!< in: field->col */
+	const upd_field_t*	uf,	/*!< in: update field */
+	mem_heap_t*		heap,	/*!< in: memory heap for allocating
+					and copying the new value */
+	ulint			zip_size)/*!< in: compressed page
+					 size of the table, or 0 */
+{
+	const byte*	src;
+	ulint		src_len;
+	byte*		rebuilt;
+
+	dfield_copy_data(dfield, &uf->new_val);
+
+	if (dfield_is_null(dfield)) {
+		/* SQL NULL: nothing to truncate or copy. */
+		return;
+	}
+
+	src_len = dfield_get_len(dfield);
+	src = dfield_get_data(dfield);
+
+	if (field->prefix_len > 0) {
+		/* Column prefix index field. The external part is
+		needed only if the field is stored externally and the
+		local part is shorter than prefix plus BLOB pointer. */
+		const ibool	fetch_ext = dfield_is_ext(dfield)
+			&& src_len < (ulint) field->prefix_len
+			+ BTR_EXTERN_FIELD_REF_SIZE;
+
+		if (fetch_ext) {
+			const ulint	local_len = src_len;
+
+			src_len = field->prefix_len;
+
+			src = row_upd_ext_fetch(src, local_len, zip_size,
+						&src_len, heap);
+		}
+
+		src_len = dtype_get_at_most_n_mbchars(col->prtype,
+						      col->mbminlen,
+						      col->mbmaxlen,
+						      field->prefix_len,
+						      src_len,
+						      (const char*) src);
+
+		dfield_set_data(dfield, src, src_len);
+
+		if (!fetch_ext) {
+			/* The fetched prefix already lives in heap;
+			only the non-fetched data needs copying. */
+			dfield_dup(dfield, heap);
+		}
+
+		return;
+	}
+
+	if (uf->orig_len == 0) {
+		dfield_dup(dfield, heap);
+		return;
+	}
+
+	if (uf->orig_len == BTR_EXTERN_FIELD_REF_SIZE) {
+		/* Restore the original locally stored
+		part of the column.  In the undo log,
+		InnoDB writes a longer prefix of externally
+		stored columns, so that column prefixes
+		in secondary indexes can be reconstructed. */
+		dfield_set_data(dfield,
+				src + src_len - BTR_EXTERN_FIELD_REF_SIZE,
+				BTR_EXTERN_FIELD_REF_SIZE);
+		dfield_set_ext(dfield);
+		dfield_dup(dfield, heap);
+		return;
+	}
+
+	/* Reconstruct the original locally
+	stored part of the column.  The data
+	will have to be copied. */
+	ut_a(uf->orig_len > BTR_EXTERN_FIELD_REF_SIZE);
+	rebuilt = mem_heap_alloc(heap, uf->orig_len);
+
+	/* Copy the locally stored prefix. */
+	memcpy(rebuilt, src,
+	       uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE);
+
+	/* Copy the BLOB pointer. */
+	memcpy(rebuilt + uf->orig_len - BTR_EXTERN_FIELD_REF_SIZE,
+	       src + src_len - BTR_EXTERN_FIELD_REF_SIZE,
+	       BTR_EXTERN_FIELD_REF_SIZE);
+
+	dfield_set_data(dfield, rebuilt, uf->orig_len);
+	dfield_set_ext(dfield);
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. The info bits of the entry are set from the update vector, and
+every index field for which the update vector holds a value (looked up
+by index position) is replaced. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals_index_pos(
+/*=========================================*/
+	dtuple_t*	entry,	/*!< in/out: index entry where replaced;
+				the clustered index record must be
+				covered by a lock or a page latch to
+				prevent deletion (rollback or purge) */
+	dict_index_t*	index,	/*!< in: index; NOTE that this may also be a
+				non-clustered index */
+	const upd_t*	update,	/*!< in: an update vector built for the index so
+				that the field number in an upd_field is the
+				index position */
+	ibool		order_only,
+				/*!< in: if TRUE, limit the replacement to
+				ordering fields of index; note that this
+				does not work for non-clustered indexes. */
+	mem_heap_t*	heap)	/*!< in: memory heap for allocating and
+				copying the new values */
+{
+	ulint		i;
+	ulint		n_fields;
+	ulint		zip_size;
+
+	/* Assert before the first dereference of index; an assertion
+	placed after index->table has been read could never fire. */
+	ut_ad(index);
+
+	zip_size = dict_table_zip_size(index->table);
+
+	dtuple_set_info_bits(entry, update->info_bits);
+
+	if (order_only) {
+		n_fields = dict_index_get_n_unique(index);
+	} else {
+		n_fields = dict_index_get_n_fields(index);
+	}
+
+	for (i = 0; i < n_fields; i++) {
+		const dict_field_t*	field;
+		const dict_col_t*	col;
+		const upd_field_t*	uf;
+
+		field = dict_index_get_nth_field(index, i);
+		col = dict_field_get_col(field);
+		uf = upd_get_field_by_field_no(update, i);
+
+		if (uf) {
+			/* The update vector carries a new value for
+			this index position. */
+			row_upd_index_replace_new_col_val(
+				dtuple_get_nth_field(entry, i),
+				field, col, uf, heap, zip_size);
+		}
+	}
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector to the index entry
+given. Each field of index is mapped to the position of its column in the
+clustered index, and that position is looked up in the update vector. */
+UNIV_INTERN
+void
+row_upd_index_replace_new_col_vals(
+/*===============================*/
+	dtuple_t*	entry,	/*!< in/out: index entry where replaced;
+				the clustered index record must be
+				covered by a lock or a page latch to
+				prevent deletion (rollback or purge) */
+	dict_index_t*	index,	/*!< in: index; NOTE that this may also be a
+				non-clustered index */
+	const upd_t*	update,	/*!< in: an update vector built for the
+				CLUSTERED index so that the field number in
+				an upd_field is the clustered index position */
+	mem_heap_t*	heap)	/*!< in: memory heap for allocating and
+				copying the new values */
+{
+	const dict_index_t*	clust_index
+		= dict_table_get_first_index(index->table);
+	const ulint		zip_size
+		= dict_table_zip_size(index->table);
+	ulint			n_fields;
+	ulint			pos;
+
+	dtuple_set_info_bits(entry, update->info_bits);
+
+	n_fields = dict_index_get_n_fields(index);
+
+	for (pos = 0; pos < n_fields; pos++) {
+		const dict_field_t*	field
+			= dict_index_get_nth_field(index, pos);
+		const dict_col_t*	col
+			= dict_field_get_col(field);
+		const upd_field_t*	uf
+			= upd_get_field_by_field_no(
+				update,
+				dict_col_get_clust_pos(col, clust_index));
+
+		if (!uf) {
+			/* This column is not touched by the update. */
+			continue;
+		}
+
+		row_upd_index_replace_new_col_val(
+			dtuple_get_nth_field(entry, pos),
+			field, col, uf, heap, zip_size);
+	}
+}
+
+/***********************************************************//**
+Replaces the new column values stored in the update vector.
+Also collects the columns that are externally stored and part of an
+ordering key, and builds a prefix cache for them in *ext. */
+UNIV_INTERN
+void
+row_upd_replace(
+/*============*/
+	dtuple_t*		row,	/*!< in/out: row where replaced,
+					indexed by col_no;
+					the clustered index record must be
+					covered by a lock or a page latch to
+					prevent deletion (rollback or purge) */
+	row_ext_t**		ext,	/*!< out, own: NULL, or externally
+					stored column prefixes */
+	const dict_index_t*	index,	/*!< in: clustered index */
+	const upd_t*		update,	/*!< in: an update vector built for the
+					clustered index */
+	mem_heap_t*		heap)	/*!< in: memory heap */
+{
+	const dict_table_t*	table;
+	ulint*			ext_cols;
+	ulint			n_ext_cols	= 0;
+	ulint			n_cols;
+	ulint			col_no;
+
+	ut_ad(row);
+	ut_ad(ext);
+	ut_ad(index);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(update);
+	ut_ad(heap);
+
+	n_cols = dtuple_get_n_fields(row);
+	table = index->table;
+	ut_ad(n_cols == dict_table_get_n_cols(table));
+
+	/* Room for the worst case: every column stored externally. */
+	ext_cols = mem_heap_alloc(heap, n_cols * sizeof *ext_cols);
+
+	dtuple_set_info_bits(row, update->info_bits);
+
+	for (col_no = 0; col_no < n_cols; col_no++) {
+
+		const dict_col_t*	col
+			= dict_table_get_nth_col(table, col_no);
+		const ulint		clust_pos
+			= dict_col_get_clust_pos(col, index);
+		dfield_t*		dfield;
+		ulint			j;
+
+		if (UNIV_UNLIKELY(clust_pos == ULINT_UNDEFINED)) {
+			/* The column has no position in the clustered
+			index. */
+			continue;
+		}
+
+		dfield = dtuple_get_nth_field(row, col_no);
+
+		/* Take the new value from the first update field that
+		refers to this clustered index position, if any. */
+		for (j = 0; j < upd_get_n_fields(update); j++) {
+
+			const upd_field_t*	uf
+				= upd_get_nth_field(update, j);
+
+			if (uf->field_no == clust_pos) {
+
+				dfield_copy_data(dfield, &uf->new_val);
+				break;
+			}
+		}
+
+		if (dfield_is_ext(dfield) && col->ord_part) {
+			ext_cols[n_ext_cols] = col_no;
+			n_ext_cols++;
+		}
+	}
+
+	*ext = n_ext_cols
+		? row_ext_create(n_ext_cols, ext_cols, row,
+				 dict_table_zip_size(table), heap)
+		: NULL;
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+
+This function is fast if the update vector is short or the number of ordering
+fields in the index is small. Otherwise, this can be quadratic.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector changes an ordering field in the index record */
+UNIV_INTERN
+ibool
+row_upd_changes_ord_field_binary(
+/*=============================*/
+	const dtuple_t*	row,	/*!< in: old value of row, or NULL if the
+				row and the data values in update are not
+				known when this function is called, e.g., at
+				compile time */
+	dict_index_t*	index,	/*!< in: index of the record */
+	const upd_t*	update)	/*!< in: update vector for the row; NOTE: the
+				field numbers in this MUST be clustered index
+				positions! */
+{
+	dict_index_t*	clust_index;
+	ulint		n_unique;
+	ulint		n_upd_fields;
+	ulint		ord_pos;
+
+	ut_ad(update && index);
+
+	n_unique = dict_index_get_n_unique(index);
+	n_upd_fields = upd_get_n_fields(update);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	for (ord_pos = 0; ord_pos < n_unique; ord_pos++) {
+
+		const dict_field_t*	ind_field
+			= dict_index_get_nth_field(index, ord_pos);
+		const dict_col_t*	col
+			= dict_field_get_col(ind_field);
+		const ulint		col_pos
+			= dict_col_get_clust_pos(col, clust_index);
+		const ulint		col_no
+			= dict_col_get_no(col);
+		ulint			upd_pos;
+
+		for (upd_pos = 0; upd_pos < n_upd_fields; upd_pos++) {
+
+			const upd_field_t*	upd_field
+				= upd_get_nth_field(update, upd_pos);
+
+			if (col_pos != upd_field->field_no) {
+				/* This update field is for another
+				column. */
+				continue;
+			}
+
+			/* Note that if the index field is a column prefix
+			then it may be that row does not contain an externally
+			stored part of the column value, and we cannot compare
+			the datas; without the old row we cannot compare
+			either. In both cases report a change. */
+
+			if (row == NULL || ind_field->prefix_len > 0) {
+
+				return(TRUE);
+			}
+
+			if (!dfield_datas_are_binary_equal(
+				    dtuple_get_nth_field(row, col_no),
+				    &(upd_field->new_val))) {
+
+				return(TRUE);
+			}
+		}
+	}
+
+	return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes an ordering field of an index record.
+The check only looks at whether an updated column is flagged as part of
+some ordering key (ord_part); the values themselves are not compared.
+NOTE: we compare the fields as binary strings!
+@return TRUE if update vector may change an ordering field in an index
+record */
+UNIV_INTERN
+ibool
+row_upd_changes_some_index_ord_field_binary(
+/*========================================*/
+	const dict_table_t*	table,	/*!< in: table */
+	const upd_t*		update)	/*!< in: update vector for the row */
+{
+	dict_index_t*	first_index;
+	ulint		pos;
+
+	first_index = dict_table_get_first_index(table);
+
+	for (pos = 0; pos < upd_get_n_fields(update); pos++) {
+
+		const upd_field_t*	uf
+			= upd_get_nth_field(update, pos);
+		const dict_field_t*	ind_field
+			= dict_index_get_nth_field(first_index,
+						   uf->field_no);
+		const dict_col_t*	col
+			= dict_field_get_col(ind_field);
+
+		if (col->ord_part) {
+
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/***********************************************************//**
+Checks if an update vector changes some of the first ordering fields of an
+index record. This is only used in foreign key checks and we can assume
+that index does not contain column prefixes.
+@return	TRUE if changes */
+static
+ibool
+row_upd_changes_first_fields_binary(
+/*================================*/
+	dtuple_t*	entry,	/*!< in: index entry */
+	dict_index_t*	index,	/*!< in: index of entry */
+	const upd_t*	update,	/*!< in: update vector for the row */
+	ulint		n)	/*!< in: how many first fields to check */
+{
+	dict_index_t*	clust_index;
+	ulint		n_upd_fields;
+	ulint		fld_pos;
+
+	ut_ad(update && index);
+	ut_ad(n <= dict_index_get_n_fields(index));
+
+	n_upd_fields = upd_get_n_fields(update);
+	clust_index = dict_table_get_first_index(index->table);
+
+	for (fld_pos = 0; fld_pos < n; fld_pos++) {
+
+		const dict_field_t*	ind_field
+			= dict_index_get_nth_field(index, fld_pos);
+		const dict_col_t*	col
+			= dict_field_get_col(ind_field);
+		const ulint		col_pos
+			= dict_col_get_clust_pos(col, clust_index);
+		ulint			upd_pos;
+
+		/* Column prefixes are not supported here. */
+		ut_a(ind_field->prefix_len == 0);
+
+		for (upd_pos = 0; upd_pos < n_upd_fields; upd_pos++) {
+
+			upd_field_t*	upd_field
+				= upd_get_nth_field(update, upd_pos);
+
+			if (col_pos != upd_field->field_no) {
+
+				continue;
+			}
+
+			if (!dfield_datas_are_binary_equal(
+				    dtuple_get_nth_field(entry, fld_pos),
+				    &(upd_field->new_val))) {
+
+				return(TRUE);
+			}
+		}
+	}
+
+	return(FALSE);
+}
+
+/*********************************************************************//**
+Copies the column values from a record. Every node of the column list is
+given a copy of the record field at its clustered index field number. */
+UNIV_INLINE
+void
+row_upd_copy_columns(
+/*=================*/
+	rec_t*		rec,	/*!< in: record in a clustered index */
+	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
+	sym_node_t*	column)	/*!< in: first column in a column list, or
+				NULL */
+{
+	for (; column; column = UT_LIST_GET_NEXT(col_var_list, column)) {
+		ulint	field_len;
+		byte*	field_data;
+
+		field_data = rec_get_nth_field(
+			rec, offsets,
+			column->field_nos[SYM_CLUST_FIELD_NO],
+			&field_len);
+
+		eval_node_copy_and_alloc_val(column, field_data, field_len);
+	}
+}
+
+/*********************************************************************//**
+Calculates the new values for fields to update. Note that row_upd_copy_columns
+must have been called first. For each update field its expression is
+evaluated and the result is copied into the new value of the field. */
+UNIV_INLINE
+void
+row_upd_eval_new_vals(
+/*==================*/
+	upd_t*	update)	/*!< in/out: update vector */
+{
+	const ulint	n_upd	= upd_get_n_fields(update);
+	ulint		pos;
+
+	for (pos = 0; pos < n_upd; pos++) {
+		upd_field_t*	uf	= upd_get_nth_field(update, pos);
+		que_node_t*	expr	= uf->exp;
+
+		eval_exp(expr);
+
+		dfield_copy_data(&(uf->new_val), que_node_get_val(expr));
+	}
+}
+
+/***********************************************************//**
+Stores to the heap the row on which the node->pcur is positioned.
+The stored copy goes to node->row; unless the node is a delete, an
+updated copy with the new column values applied goes to node->upd_row. */
+static
+void
+row_upd_store_row(
+/*==============*/
+	upd_node_t*	node)	/*!< in: row update node */
+{
+	mem_heap_t*	offs_heap	= NULL;
+	dict_index_t*	clust_index;
+	rec_t*		rec;
+	row_ext_t**	ext_cache;
+	const ulint*	offsets;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	rec_offs_init(offsets_);
+
+	ut_ad(node->pcur->latch_mode != BTR_NO_LATCHES);
+
+	/* Discard the row stored by a previous invocation. */
+	if (node->row != NULL) {
+		mem_heap_empty(node->heap);
+	}
+
+	clust_index = dict_table_get_first_index(node->table);
+	rec = btr_pcur_get_rec(node->pcur);
+	offsets = rec_get_offsets(rec, clust_index, offsets_,
+				  ULINT_UNDEFINED, &offs_heap);
+
+	if (dict_table_get_format(node->table) < DICT_TF_FORMAT_ZIP) {
+		/* REDUNDANT and COMPACT formats store a local
+		768-byte prefix of each externally stored column.
+		No cache is needed. */
+		node->ext = NULL;
+		ext_cache = NULL;
+	} else {
+		/* In DYNAMIC or COMPRESSED format, there is no prefix
+		of externally stored columns in the clustered index
+		record. Build a cache of column prefixes. */
+		ext_cache = &node->ext;
+	}
+
+	node->row = row_build(ROW_COPY_DATA, clust_index, rec, offsets,
+			      NULL, ext_cache, node->heap);
+
+	if (!node->is_delete) {
+		node->upd_row = dtuple_copy(node->row, node->heap);
+		row_upd_replace(node->upd_row, &node->upd_ext,
+				clust_index, node->update, node->heap);
+	} else {
+		/* A delete has no updated version of the row. */
+		node->upd_row = NULL;
+		node->upd_ext = NULL;
+	}
+
+	/* Release the offsets heap if one had to be created. */
+	if (UNIV_LIKELY_NULL(offs_heap)) {
+		mem_heap_free(offs_heap);
+	}
+}
+
+/***********************************************************//**
+Updates a secondary index entry of a row.
+The old entry, built from node->row, is searched for and delete-marked
+inside a mini-transaction local to this function. Unless the node is a
+delete or an error occurred, a new entry built from node->upd_row is then
+inserted. If the old entry is not found, diagnostics are printed to
+stderr and the function carries on with err still DB_SUCCESS.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_sec_index_entry(
+/*====================*/
+	upd_node_t*	node,	/*!< in: row update node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ibool		check_ref;
+	ibool		found;
+	dict_index_t*	index;
+	dtuple_t*	entry;
+	btr_pcur_t	pcur;
+	btr_cur_t*	btr_cur;
+	mem_heap_t*	heap;
+	rec_t*		rec;
+	ulint		err	= DB_SUCCESS;
+	mtr_t		mtr;
+	trx_t*		trx	= thr_get_trx(thr);
+
+	index = node->index;
+
+	check_ref = row_upd_index_is_referenced(index, trx);
+
+	/* This heap holds both index entries and, possibly, the record
+	offsets; it is freed at func_exit. */
+	heap = mem_heap_create(1024);
+
+	/* Build old index entry */
+	entry = row_build_index_entry(node->row, node->ext, index, heap);
+	ut_a(entry);
+
+	mtr_start(&mtr);
+
+	found = row_search_index_entry(index, entry, BTR_MODIFY_LEAF, &pcur,
+				       &mtr);
+	btr_cur = btr_pcur_get_btr_cur(&pcur);
+
+	rec = btr_cur_get_rec(btr_cur);
+
+	if (UNIV_UNLIKELY(!found)) {
+		/* Report the searched tuple together with the record the
+		cursor ended up on. */
+		fputs("InnoDB: error in sec index entry update in\n"
+		      "InnoDB: ", stderr);
+		dict_index_name_print(stderr, trx, index);
+		fputs("\n"
+		      "InnoDB: tuple ", stderr);
+		dtuple_print(stderr, entry);
+		fputs("\n"
+		      "InnoDB: record ", stderr);
+		rec_print(stderr, rec, index);
+		putc('\n', stderr);
+
+		trx_print(stderr, trx, 0);
+
+		fputs("\n"
+		      "InnoDB: Submit a detailed bug report"
+		      " to http://bugs.mysql.com\n", stderr);
+	} else {
+		/* Delete mark the old index record; it can already be
+		delete marked if we return after a lock wait in
+		row_ins_index_entry below */
+
+		if (!rec_get_deleted_flag(rec,
+					  dict_table_is_comp(index->table))) {
+			err = btr_cur_del_mark_set_sec_rec(0, btr_cur, TRUE,
+							   thr, &mtr);
+			if (err == DB_SUCCESS && check_ref) {
+
+				/* No stack array is supplied here, so the
+				offsets are allocated from heap. */
+				ulint*	offsets = rec_get_offsets(
+					rec, index, NULL,
+					ULINT_UNDEFINED, &heap);
+				/* NOTE that the following call loses
+				the position of pcur ! */
+				err = row_upd_check_references_constraints(
+					node, &pcur, index->table,
+					index, offsets, thr, &mtr);
+			}
+		}
+	}
+
+	btr_pcur_close(&pcur);
+	mtr_commit(&mtr);
+
+	/* A delete has no new entry to insert; on error the insert is
+	skipped as well. */
+	if (node->is_delete || err != DB_SUCCESS) {
+
+		goto func_exit;
+	}
+
+	/* Build a new index entry */
+	entry = row_build_index_entry(node->upd_row, node->upd_ext,
+				      index, heap);
+	ut_a(entry);
+
+	/* Insert new index entry */
+	err = row_ins_index_entry(index, entry, 0, TRUE, thr);
+
+func_exit:
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***********************************************************//**
+Updates the secondary index record if it is changed in the row update or
+deletes it if this is a delete. In state UPD_NODE_UPDATE_ALL_SEC the entry
+is always processed; otherwise only when the update vector changes an
+ordering field of node->index.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_sec_step(
+/*=============*/
+	upd_node_t*	node,	/*!< in: row update node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ut_ad((node->state == UPD_NODE_UPDATE_ALL_SEC)
+	      || (node->state == UPD_NODE_UPDATE_SOME_SEC));
+	ut_ad(!dict_index_is_clust(node->index));
+
+	if (node->state != UPD_NODE_UPDATE_ALL_SEC
+	    && !row_upd_changes_ord_field_binary(node->row, node->index,
+						 node->update)) {
+		/* This index is not affected by the update. */
+		return(DB_SUCCESS);
+	}
+
+	return(row_upd_sec_index_entry(node, thr));
+}
+
+/***********************************************************//**
+Marks the clustered index record deleted and inserts the updated version
+of the record to the index. This function should be used when the ordering
+fields of the clustered index record change. This should be quite rare in
+database applications.
+The delete-marking step is skipped when node->state is already
+UPD_NODE_INSERT_CLUSTERED, i.e. when only the insert remains to be done.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec_by_insert(
+/*========================*/
+	upd_node_t*	node,	/*!< in: row update node */
+	dict_index_t*	index,	/*!< in: clustered index of the record */
+	que_thr_t*	thr,	/*!< in: query thread */
+	ibool		check_ref,/*!< in: TRUE if index may be referenced in
+				a foreign key constraint */
+	mtr_t*		mtr)	/*!< in: mtr; gets committed here */
+{
+	mem_heap_t*	heap	= NULL;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	trx_t*		trx;
+	dict_table_t*	table;
+	dtuple_t*	entry;
+	ulint		err;
+	ibool		change_ownership = FALSE;
+
+	ut_ad(node);
+	ut_ad(dict_index_is_clust(index));
+
+	trx = thr_get_trx(thr);
+	table = node->table;
+	pcur = node->pcur;
+	btr_cur	= btr_pcur_get_btr_cur(pcur);
+
+	if (node->state != UPD_NODE_INSERT_CLUSTERED) {
+		rec_t*		rec;
+		/* Named clust_index so that it does not shadow the
+		index parameter of this function. */
+		dict_index_t*	clust_index;
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		ulint*		offsets;
+		rec_offs_init(offsets_);
+
+		err = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+						     btr_cur, TRUE, thr, mtr);
+		if (err != DB_SUCCESS) {
+			/* heap has not been allocated yet at this
+			point, so there is nothing to free. */
+			mtr_commit(mtr);
+			return(err);
+		}
+
+		/* Mark as not-owned the externally stored fields which the new
+		row inherits from the delete marked record: purge should not
+		free those externally stored fields even if the delete marked
+		record is removed from the index tree, or updated. */
+
+		rec = btr_cur_get_rec(btr_cur);
+		clust_index = dict_table_get_first_index(table);
+		offsets = rec_get_offsets(rec, clust_index, offsets_,
+					  ULINT_UNDEFINED, &heap);
+		change_ownership = btr_cur_mark_extern_inherited_fields(
+			btr_cur_get_page_zip(btr_cur), rec, clust_index,
+			offsets, node->update, mtr);
+		if (check_ref) {
+			/* NOTE that the following call loses
+			the position of pcur ! */
+			err = row_upd_check_references_constraints(
+				node, pcur, table, clust_index, offsets,
+				thr, mtr);
+			if (err != DB_SUCCESS) {
+				mtr_commit(mtr);
+				if (UNIV_LIKELY_NULL(heap)) {
+					mem_heap_free(heap);
+				}
+				return(err);
+			}
+		}
+	}
+
+	mtr_commit(mtr);
+
+	/* rec_get_offsets() above may already have created the heap. */
+	if (!heap) {
+		heap = mem_heap_create(500);
+	}
+	/* Remember that the delete-marking has been done, in case this
+	function is entered again for the same row. */
+	node->state = UPD_NODE_INSERT_CLUSTERED;
+
+	entry = row_build_index_entry(node->upd_row, node->upd_ext,
+				      index, heap);
+	ut_a(entry);
+
+	row_upd_index_entry_sys_field(entry, index, DATA_TRX_ID, trx->id);
+
+	if (change_ownership) {
+		/* If we return from a lock wait, for example, we may have
+		extern fields marked as not-owned in entry (marked in the
+		if-branch above). We must unmark them, take the ownership
+		back. */
+
+		btr_cur_unmark_dtuple_extern_fields(entry);
+
+		/* We must mark non-updated extern fields in entry as
+		inherited, so that a possible rollback will not free them. */
+
+		btr_cur_mark_dtuple_inherited_extern(entry, node->update);
+	}
+
+	err = row_ins_index_entry(index, entry,
+				  node->upd_ext ? node->upd_ext->n_ext : 0,
+				  TRUE, thr);
+	mem_heap_free(heap);
+
+	return(err);
+}
+
+/***********************************************************//**
+Updates a clustered index record of a row when the ordering fields do
+not change.
+An update within the page is tried first (in place when the node is
+flagged UPD_NODE_NO_SIZE_CHANGE, otherwise optimistic). If that fails,
+the cursor is restored in a new mini-transaction with BTR_MODIFY_TREE and
+a pessimistic update is done; externally stored fields returned in
+big_rec are then written in a further mini-transaction.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd_clust_rec(
+/*==============*/
+	upd_node_t*	node,	/*!< in: row update node */
+	dict_index_t*	index,	/*!< in: clustered index */
+	que_thr_t*	thr,	/*!< in: query thread */
+	mtr_t*		mtr)	/*!< in: mtr; gets committed here */
+{
+	mem_heap_t*	heap	= NULL;
+	big_rec_t*	big_rec	= NULL;
+	btr_pcur_t*	pcur;
+	btr_cur_t*	btr_cur;
+	ulint		err;
+
+	ut_ad(node);
+	ut_ad(dict_index_is_clust(index));
+
+	pcur = node->pcur;
+	btr_cur = btr_pcur_get_btr_cur(pcur);
+
+	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+				    dict_table_is_comp(index->table)));
+
+	/* Try optimistic updating of the record, keeping changes within
+	the page; we do not check locks because we assume the x-lock on the
+	record to update */
+
+	if (node->cmpl_info & UPD_NODE_NO_SIZE_CHANGE) {
+		err = btr_cur_update_in_place(BTR_NO_LOCKING_FLAG,
+					      btr_cur, node->update,
+					      node->cmpl_info, thr, mtr);
+	} else {
+		err = btr_cur_optimistic_update(BTR_NO_LOCKING_FLAG,
+						btr_cur, node->update,
+						node->cmpl_info, thr, mtr);
+	}
+
+	/* The caller's mini-transaction is committed whether or not the
+	in-page attempt succeeded. */
+	mtr_commit(mtr);
+
+	if (UNIV_LIKELY(err == DB_SUCCESS)) {
+
+		return(DB_SUCCESS);
+	}
+
+	/* Do not attempt the pessimistic update when the buffer pool
+	is running out. */
+	if (buf_LRU_buf_pool_running_out()) {
+
+		return(DB_LOCK_TABLE_FULL);
+	}
+	/* We may have to modify the tree structure: do a pessimistic descent
+	down the index tree */
+
+	mtr_start(mtr);
+
+	/* NOTE: this transaction has an s-lock or x-lock on the record and
+	therefore other transactions cannot modify the record when we have no
+	latch on the page. In addition, we assume that other query threads of
+	the same transaction do not modify the record in the meantime.
+	Therefore we can assert that the restoration of the cursor succeeds. */
+
+	ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+
+	ut_ad(!rec_get_deleted_flag(btr_pcur_get_rec(pcur),
+				    dict_table_is_comp(index->table)));
+
+	err = btr_cur_pessimistic_update(BTR_NO_LOCKING_FLAG, btr_cur,
+					 &heap, &big_rec, node->update,
+					 node->cmpl_info, thr, mtr);
+	mtr_commit(mtr);
+
+	if (err == DB_SUCCESS && big_rec) {
+		/* The pessimistic update handed back fields in big_rec;
+		store them externally in a separate mini-transaction. */
+		ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+		rec_t*		rec;
+		rec_offs_init(offsets_);
+
+		mtr_start(mtr);
+
+		ut_a(btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, mtr));
+		rec = btr_cur_get_rec(btr_cur);
+		err = btr_store_big_rec_extern_fields(
+			index, btr_cur_get_block(btr_cur), rec,
+			rec_get_offsets(rec, index, offsets_,
+					ULINT_UNDEFINED, &heap),
+			big_rec, mtr);
+		mtr_commit(mtr);
+	}
+
+	/* heap is passed to both btr_cur_pessimistic_update() and
+	rec_get_offsets() above; free it if it was created. */
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	if (big_rec) {
+		dtuple_big_rec_free(big_rec);
+	}
+
+	return(err);
+}
+
+/***********************************************************//**
+Delete marks a clustered index record. The row is first stored in the
+node so that the secondary index entries can be built from it later.
+@return	DB_SUCCESS if operation successfully completed, else error code */
+static
+ulint
+row_upd_del_mark_clust_rec(
+/*=======================*/
+	upd_node_t*	node,	/*!< in: row update node */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint*		offsets,/*!< in/out: rec_get_offsets() for the
+				record under the cursor */
+	que_thr_t*	thr,	/*!< in: query thread */
+	ibool		check_ref,/*!< in: TRUE if index may be referenced in
+				a foreign key constraint */
+	mtr_t*		mtr)	/*!< in: mtr; gets committed here */
+{
+	btr_pcur_t*	pcur;
+	btr_cur_t*	cursor;
+	ulint		status;
+
+	ut_ad(node);
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(node->is_delete);
+
+	pcur = node->pcur;
+	cursor = btr_pcur_get_btr_cur(pcur);
+
+	/* Keep a copy of the row: the secondary index entries have to
+	be built from it as well. */
+	row_upd_store_row(node);
+
+	/* Mark the clustered index record deleted; locks need not be
+	checked, because we assume that we have an x-lock on the record */
+	status = btr_cur_del_mark_set_clust_rec(BTR_NO_LOCKING_FLAG,
+						cursor, TRUE, thr, mtr);
+
+	if (check_ref && status == DB_SUCCESS) {
+		/* NOTE that the following call loses the position of pcur ! */
+		status = row_upd_check_references_constraints(
+			node, pcur, index->table, index, offsets,
+			thr, mtr);
+	}
+
+	mtr_commit(mtr);
+
+	return(status);
+}
+
+/***********************************************************//**
+Updates the clustered index record. The cursor node->pcur is restored in a
+fresh mini-transaction, the record is locked unless
+node->has_clust_rec_x_lock is set, and the change is then carried out by
+row_upd_del_mark_clust_rec() (node->is_delete), by row_upd_clust_rec(), or
+by row_upd_clust_rec_by_insert() when an ordering field of the clustered
+index changes.
+@return DB_SUCCESS if operation successfully completed, DB_LOCK_WAIT
+in case of a lock wait, else error code; DB_RECORD_NOT_FOUND if the cursor
+position cannot be restored, DB_ERROR if it cannot be restored again after
+dropping an index tree */
+static
+ulint
+row_upd_clust_step(
+/*===============*/
+	upd_node_t*	node,	/*!< in: row update node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	dict_index_t*	index;
+	btr_pcur_t*	pcur;
+	ibool		success;
+	ibool		check_ref;
+	ulint		err;
+	mtr_t*		mtr;
+	mtr_t		mtr_buf;
+	rec_t*		rec;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets;
+	rec_offs_init(offsets_);
+
+	index = dict_table_get_first_index(node->table);
+
+	check_ref = row_upd_index_is_referenced(index, thr_get_trx(thr));
+
+	pcur = node->pcur;
+
+	/* We have to restore the cursor to its position */
+	mtr = &mtr_buf;
+
+	mtr_start(mtr);
+
+	/* If the restoration does not succeed, then the same
+	transaction has deleted the record on which the cursor was,
+	and that is an SQL error. If the restoration succeeds, it may
+	still be that the same transaction has successively deleted
+	and inserted a record with the same ordering fields, but in
+	that case we know that the transaction has at least an
+	implicit x-lock on the record. */
+
+	ut_a(pcur->rel_pos == BTR_PCUR_ON);
+
+	success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur, mtr);
+
+	if (!success) {
+		err = DB_RECORD_NOT_FOUND;
+
+		mtr_commit(mtr);
+
+		return(err);
+	}
+
+	/* If this is a row in SYS_INDEXES table of the data dictionary,
+	then we have to free the file segments of the index tree associated
+	with the index. The mini-transaction is committed and restarted
+	afterwards, so the cursor position has to be restored once more. */
+
+	if (node->is_delete
+	    && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) {
+
+		dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr);
+
+		mtr_commit(mtr);
+
+		mtr_start(mtr);
+
+		success = btr_pcur_restore_position(BTR_MODIFY_LEAF, pcur,
+						    mtr);
+		if (!success) {
+			err = DB_ERROR;
+
+			mtr_commit(mtr);
+
+			return(err);
+		}
+	}
+
+	rec = btr_pcur_get_rec(pcur);
+	offsets = rec_get_offsets(rec, index, offsets_,
+				  ULINT_UNDEFINED, &heap);
+
+	if (!node->has_clust_rec_x_lock) {
+		err = lock_clust_rec_modify_check_and_lock(
+			0, btr_pcur_get_block(pcur),
+			rec, index, offsets, thr);
+		if (err != DB_SUCCESS) {
+			mtr_commit(mtr);
+			goto exit_func;
+		}
+	}
+
+	/* NOTE: the following function calls will also commit mtr. The
+	exit_func label inside the delete branch below is also the target of
+	the goto taken above when locking the record fails; from there on only
+	the offsets heap is freed and err is returned. */
+
+	if (node->is_delete) {
+		err = row_upd_del_mark_clust_rec(node, index, offsets,
+						 thr, check_ref, mtr);
+		if (err == DB_SUCCESS) {
+			node->state = UPD_NODE_UPDATE_ALL_SEC;
+			node->index = dict_table_get_next_index(index);
+		}
+exit_func:
+		if (UNIV_LIKELY_NULL(heap)) {
+			mem_heap_free(heap);
+		}
+		return(err);
+	}
+
+	/* If the update is made for MySQL, we already have the update vector
+	ready, else we have to do some evaluation: */
+
+	if (UNIV_UNLIKELY(!node->in_mysql_interface)) {
+		/* Copy the necessary columns from clust_rec and calculate the
+		new values to set */
+		row_upd_copy_columns(rec, offsets,
+				     UT_LIST_GET_FIRST(node->columns));
+		row_upd_eval_new_vals(node->update);
+	}
+
+	if (UNIV_LIKELY_NULL(heap)) {
+		mem_heap_free(heap);
+	}
+
+	if (node->cmpl_info & UPD_NODE_NO_ORD_CHANGE) {
+
+		err = row_upd_clust_rec(node, index, thr, mtr);
+		return(err);
+	}
+
+	row_upd_store_row(node);
+
+	if (row_upd_changes_ord_field_binary(node->row, index, node->update)) {
+
+		/* Update causes an ordering field (ordering fields within
+		the B-tree) of the clustered index record to change: perform
+		the update by delete marking and inserting.
+
+		TODO! What to do to the 'Halloween problem', where an update
+		moves the record forward in index so that it is again
+		updated when the cursor arrives there? Solution: the
+		read operation must check the undo record undo number when
+		choosing records to update. MySQL solves now the problem
+		externally! */
+
+		err = row_upd_clust_rec_by_insert(node, index, thr, check_ref,
+						  mtr);
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_ALL_SEC;
+	} else {
+		err = row_upd_clust_rec(node, index, thr, mtr);
+
+		if (err != DB_SUCCESS) {
+
+			return(err);
+		}
+
+		node->state = UPD_NODE_UPDATE_SOME_SEC;
+	}
+
+	node->index = dict_table_get_next_index(index);
+
+	return(err);
+}
+
+/***********************************************************//**
+Applies an update node to every affected index record of one row: first
+the clustered index record, then the secondary index records starting from
+node->index. On entry the persistent cursor of the node is assumed to have
+been on a record, with its position stored in the cursor.
+@return DB_SUCCESS if operation successfully completed, else error
+code or DB_LOCK_WAIT */
+static
+ulint
+row_upd(
+/*====*/
+	upd_node_t*	node,	/*!< in: row update node */
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	ulint	status	= DB_SUCCESS;
+
+	ut_ad(node && thr);
+
+	if (UNIV_LIKELY(node->in_mysql_interface)) {
+
+		/* The MySQL interpreter does not supply cmpl_info, so it
+		is derived here from the update itself */
+
+		if (!node->is_delete
+		    && !row_upd_changes_some_index_ord_field_binary(
+			    node->table, node->update)) {
+			node->cmpl_info = UPD_NODE_NO_ORD_CHANGE;
+		} else {
+			node->cmpl_info = 0;
+		}
+	}
+
+	if (node->state == UPD_NODE_INSERT_CLUSTERED
+	    || node->state == UPD_NODE_UPDATE_CLUSTERED) {
+
+		log_free_check();
+		status = row_upd_clust_step(node, thr);
+	}
+
+	/* The secondary indexes are visited only if the clustered step (if
+	any) succeeded, and only for a delete or for an update that may
+	change an ordering field */
+
+	if (status == DB_SUCCESS
+	    && (node->is_delete
+		|| !(node->cmpl_info & UPD_NODE_NO_ORD_CHANGE))) {
+
+		for (; node->index != NULL;
+		     node->index = dict_table_get_next_index(node->index)) {
+
+			log_free_check();
+			status = row_upd_sec_step(node, thr);
+
+			if (status != DB_SUCCESS) {
+				/* node->index is left on the failed index */
+				break;
+			}
+		}
+	}
+
+	if (status == DB_SUCCESS) {
+		/* Forget the stored row and rewind the node for the next
+		row */
+
+		if (node->row != NULL) {
+			node->upd_ext = NULL;
+			node->upd_row = NULL;
+			node->ext = NULL;
+			node->row = NULL;
+			mem_heap_empty(node->heap);
+		}
+
+		node->state = UPD_NODE_UPDATE_CLUSTERED;
+	}
+
+	return(status);
+}
+
+/***********************************************************//**
+Updates a row in a table. This is a high-level function used in SQL execution
+graphs. The update node is taken from thr->run_node. When control reaches
+the error_handling label, the error code of the step is stored in
+trx->error_state.
+@return	query thread to run next or NULL; NULL is returned when the step
+ended with an error code other than DB_SUCCESS */
+UNIV_INTERN
+que_thr_t*
+row_upd_step(
+/*=========*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	upd_node_t*	node;
+	sel_node_t*	sel_node;
+	que_node_t*	parent;
+	ulint		err		= DB_SUCCESS;
+	trx_t*		trx;
+
+	ut_ad(thr);
+
+	trx = thr_get_trx(thr);
+
+	trx_start_if_not_started(trx);
+
+	node = thr->run_node;
+
+	sel_node = node->select;
+
+	parent = que_node_get_parent(node);
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_UPDATE);
+
+	if (thr->prev_node == parent) {
+		node->state = UPD_NODE_SET_IX_LOCK;
+	}
+
+	if (node->state == UPD_NODE_SET_IX_LOCK) {
+
+		if (!node->has_clust_rec_x_lock) {
+			/* It may be that the current session has not yet
+			started its transaction, or it has been committed: */
+
+			err = lock_table(0, node->table, LOCK_IX, thr);
+
+			if (err != DB_SUCCESS) {
+
+				goto error_handling;
+			}
+		}
+
+		node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+		if (node->searched_update) {
+			/* Reset the cursor */
+			sel_node->state = SEL_NODE_OPEN;
+
+			/* Fetch a row to update: control is handed to the
+			select node by making it the run node */
+
+			thr->run_node = sel_node;
+
+			return(thr);
+		}
+	}
+
+	/* sel_node is NULL if we are in the MySQL interface */
+
+	if (sel_node && (sel_node->state != SEL_NODE_FETCH)) {
+
+		if (!node->searched_update) {
+			/* An explicit cursor should be positioned on a row
+			to update. NOTE(review): ut_error presumably aborts
+			the server, in which case the two statements after it
+			never run - confirm against the ut_error definition */
+
+			ut_error;
+
+			err = DB_ERROR;
+
+			goto error_handling;
+		}
+
+		ut_ad(sel_node->state == SEL_NODE_NO_MORE_ROWS);
+
+		/* No more rows to update, or the select node performed the
+		updates directly in-place */
+
+		thr->run_node = parent;
+
+		return(thr);
+	}
+
+	/* DO THE CHECKS OF THE CONSISTENCY CONSTRAINTS HERE */
+
+	err = row_upd(node, thr);
+
+error_handling:
+	trx->error_state = err;
+
+	if (err != DB_SUCCESS) {
+		return(NULL);
+	}
+
+	/* DO THE TRIGGER ACTIONS HERE */
+
+	if (node->searched_update) {
+		/* Fetch next row to update */
+
+		thr->run_node = sel_node;
+	} else {
+		/* It was an explicit cursor update */
+
+		thr->run_node = parent;
+	}
+
+	node->state = UPD_NODE_UPDATE_CLUSTERED;
+
+	return(thr);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/row/row0vers.c b/storage/xtradb/row/row0vers.c
new file mode 100644
index 00000000000..a4fbb5289aa
--- /dev/null
+++ b/storage/xtradb/row/row0vers.c
@@ -0,0 +1,741 @@
+/*****************************************************************************
+
+Copyright (c) 1997, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file row/row0vers.c
+Row versions
+
+Created 2/6/1997 Heikki Tuuri
+*******************************************************/
+
+#include "row0vers.h"
+
+#ifdef UNIV_NONINL
+#include "row0vers.ic"
+#endif
+
+#include "dict0dict.h"
+#include "dict0boot.h"
+#include "btr0btr.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "trx0undo.h"
+#include "trx0purge.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "row0row.h"
+#include "row0upd.h"
+#include "rem0cmp.h"
+#include "read0read.h"
+#include "lock0lock.h"
+
+/*****************************************************************//**
+Finds out if an active transaction has inserted or modified a secondary
+index record. NOTE: the kernel mutex is temporarily released in this
+function! The caller must own the kernel mutex on entry, and the mutex is
+owned again on every return path. The function walks the version stack of
+the matching clustered index record, starting from its newest version, and
+looks for a version written by the transaction stored in the clustered
+record that would require rec to be in a different state.
+@return NULL if committed or if no implicit x-lock on rec was found,
+else the active transaction */
+UNIV_INTERN
+trx_t*
+row_vers_impl_x_locked_off_kernel(
+/*==============================*/
+	const rec_t*	rec,	/*!< in: record in a secondary index */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const ulint*	offsets)/*!< in: rec_get_offsets(rec, index) */
+{
+	dict_index_t*	clust_index;
+	rec_t*		clust_rec;
+	ulint*		clust_offsets;
+	rec_t*		version;
+	trx_id_t	trx_id;
+	mem_heap_t*	heap;
+	mem_heap_t*	heap2;
+	dtuple_t*	row;
+	dtuple_t*	entry	= NULL; /* assignment to eliminate compiler
+					warning */
+	trx_t*		trx;
+	ulint		rec_del;
+	ulint		err;
+	mtr_t		mtr;
+	ulint		comp;
+
+	ut_ad(mutex_own(&kernel_mutex));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mutex_exit(&kernel_mutex);
+
+	mtr_start(&mtr);
+
+	/* Search for the clustered index record: this is a time-consuming
+	operation: therefore we release the kernel mutex; also, the release
+	is required by the latching order convention. The latch on the
+	clustered index locks the top of the stack of versions. We also
+	reserve purge_latch to lock the bottom of the version stack. */
+
+	clust_rec = row_get_clust_rec(BTR_SEARCH_LEAF, rec, index,
+				      &clust_index, &mtr);
+	if (!clust_rec) {
+		/* In a rare case it is possible that no clust rec is found
+		for a secondary index record: if in row0umod.c
+		row_undo_mod_remove_clust_low() we have already removed the
+		clust rec, while purge is still cleaning and removing
+		secondary index records associated with earlier versions of
+		the clustered index record. In that case there cannot be
+		any implicit lock on the secondary index record, because
+		an active transaction which has modified the secondary index
+		record has also modified the clustered index record. And in
+		a rollback we always undo the modifications to secondary index
+		records before the clustered index record. */
+
+		mutex_enter(&kernel_mutex);
+		mtr_commit(&mtr);
+
+		return(NULL);
+	}
+
+	heap = mem_heap_create(1024);
+	clust_offsets = rec_get_offsets(clust_rec, clust_index, NULL,
+					ULINT_UNDEFINED, &heap);
+	trx_id = row_get_rec_trx_id(clust_rec, clust_index, clust_offsets);
+
+	mtr_s_lock(&(purge_sys->latch), &mtr);
+
+	mutex_enter(&kernel_mutex);
+
+	trx = NULL;
+	if (!trx_is_active(trx_id)) {
+		/* The transaction that modified or inserted clust_rec is no
+		longer active: no implicit lock on rec */
+		goto exit_func;
+	}
+
+	if (!lock_check_trx_id_sanity(trx_id, clust_rec, clust_index,
+				      clust_offsets, TRUE)) {
+		/* Corruption noticed: try to avoid a crash by returning */
+		goto exit_func;
+	}
+
+	comp = page_rec_is_comp(rec);
+	ut_ad(index->table == clust_index->table);
+	ut_ad(!!comp == dict_table_is_comp(index->table));
+	ut_ad(!comp == !page_rec_is_comp(clust_rec));
+
+	/* We look up if some earlier version, which was modified by the trx_id
+	transaction, of the clustered index record would require rec to be in
+	a different state (delete marked or unmarked, or have different field
+	values, or not existing). If there is such a version, then rec was
+	modified by the trx_id transaction, and it has an implicit x-lock on
+	rec. Note that if clust_rec itself would require rec to be in a
+	different state, then the trx_id transaction has not yet had time to
+	modify rec, and does not necessarily have an implicit x-lock on rec. */
+
+	rec_del = rec_get_deleted_flag(rec, comp);
+	trx = NULL;
+
+	version = clust_rec;
+
+	for (;;) {
+		rec_t*		prev_version;
+		ulint		vers_del;
+		row_ext_t*	ext;
+		trx_id_t	prev_trx_id;
+
+		mutex_exit(&kernel_mutex);
+
+		/* While we retrieve an earlier version of clust_rec, we
+		release the kernel mutex, because it may take time to access
+		the disk. After the release, we have to check if the trx_id
+		transaction is still active. We keep the semaphore in mtr on
+		the clust_rec page, so that no other transaction can update
+		it and get an implicit x-lock on rec. Every break out of this
+		loop is preceded by re-acquiring the kernel mutex. */
+
+		heap2 = heap;
+		heap = mem_heap_create(1024);
+		err = trx_undo_prev_version_build(clust_rec, &mtr, version,
+						  clust_index, clust_offsets,
+						  heap, &prev_version);
+		mem_heap_free(heap2); /* free version and clust_offsets */
+
+		if (prev_version == NULL) {
+			mutex_enter(&kernel_mutex);
+
+			if (!trx_is_active(trx_id)) {
+				/* Transaction no longer active: no
+				implicit x-lock */
+
+				break;
+			}
+
+			/* If the transaction is still active,
+			clust_rec must be a fresh insert, because no
+			previous version was found. */
+			ut_ad(err == DB_SUCCESS);
+
+			/* It was a freshly inserted version: there is an
+			implicit x-lock on rec */
+
+			trx = trx_get_on_id(trx_id);
+
+			break;
+		}
+
+		clust_offsets = rec_get_offsets(prev_version, clust_index,
+						NULL, ULINT_UNDEFINED, &heap);
+
+		vers_del = rec_get_deleted_flag(prev_version, comp);
+		prev_trx_id = row_get_rec_trx_id(prev_version, clust_index,
+						 clust_offsets);
+
+		/* If the trx_id and prev_trx_id are different and if
+		the prev_version is marked deleted then the
+		prev_trx_id must have already committed for the trx_id
+		to be able to modify the row. Therefore, prev_trx_id
+		cannot hold any implicit lock. */
+		if (vers_del && 0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+
+			mutex_enter(&kernel_mutex);
+			break;
+		}
+
+		/* The stack of versions is locked by mtr.  Thus, it
+		is safe to fetch the prefixes for externally stored
+		columns. */
+		row = row_build(ROW_COPY_POINTERS, clust_index, prev_version,
+				clust_offsets, NULL, &ext, heap);
+		entry = row_build_index_entry(row, ext, index, heap);
+		/* entry may be NULL if a record was inserted in place
+		of a deleted record, and the BLOB pointers of the new
+		record were not initialized yet.  But in that case,
+		prev_version should be NULL. */
+		ut_a(entry);
+
+		mutex_enter(&kernel_mutex);
+
+		if (!trx_is_active(trx_id)) {
+			/* Transaction no longer active: no implicit x-lock */
+
+			break;
+		}
+
+		/* If we get here, we know that the trx_id transaction is
+		still active and it has modified prev_version. Let us check
+		if prev_version would require rec to be in a different
+		state. */
+
+		/* The previous version of clust_rec must be
+		accessible, because the transaction is still active
+		and clust_rec was not a fresh insert. */
+		ut_ad(err == DB_SUCCESS);
+
+		/* We check if entry and rec are identified (compare as
+		equal) in the alphabetical ordering */
+		if (0 == cmp_dtuple_rec(entry, rec, offsets)) {
+			/* The delete marks of rec and prev_version should be
+			equal for rec to be in the state required by
+			prev_version */
+
+			if (rec_del != vers_del) {
+				trx = trx_get_on_id(trx_id);
+
+				break;
+			}
+
+			/* It is possible that the row was updated so that the
+			secondary index record remained the same in
+			alphabetical ordering, but the field values changed
+			still. For example, 'abc' -> 'ABC'. Check also that. */
+
+			dtuple_set_types_binary(entry,
+						dtuple_get_n_fields(entry));
+			if (0 != cmp_dtuple_rec(entry, rec, offsets)) {
+
+				trx = trx_get_on_id(trx_id);
+
+				break;
+			}
+		} else if (!rec_del) {
+			/* The delete mark should be set in rec for it to be
+			in the state required by prev_version */
+
+			trx = trx_get_on_id(trx_id);
+
+			break;
+		}
+
+		if (0 != ut_dulint_cmp(trx_id, prev_trx_id)) {
+			/* The versions modified by the trx_id transaction end
+			to prev_version: no implicit x-lock */
+
+			break;
+		}
+
+		version = prev_version;
+	}/* for (;;) */
+
+exit_func:
+	mtr_commit(&mtr);
+	mem_heap_free(heap);
+
+	return(trx);
+}
+
+/*****************************************************************//**
+Tells whether a delete marked earlier version of a clustered index record
+still has to be kept, because it is >= the purge view. The purge_sys latch
+is S-locked in mtr as a side effect.
+@return	TRUE if earlier version should be preserved */
+UNIV_INTERN
+ibool
+row_vers_must_preserve_del_marked(
+/*==============================*/
+	trx_id_t	trx_id,	/*!< in: transaction id in the version */
+	mtr_t*		mtr)	/*!< in: mtr holding the latch on the
+				clustered index record; it will also
+				hold the latch on purge_view */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	mtr_s_lock(&(purge_sys->latch), mtr);
+
+	/* While the update undo log of trx_id must still exist, a purge
+	operation is not yet allowed to remove the delete marked record */
+
+	return(trx_purge_update_undo_must_exist(trx_id) ? TRUE : FALSE);
+}
+
+/*****************************************************************//**
+Finds out if a version of the record, where the version >= the current
+purge view, should have ientry as its secondary index entry. We check
+if there is any not delete marked version of the record where the trx
+id >= purge view, and the secondary index entry and ientry are identified in
+the alphabetical ordering; exactly in this case we return TRUE.
+The purge_sys latch is S-locked in mtr by this function. FALSE is also
+returned when building a previous version fails or when no earlier
+version exists.
+@return	TRUE if some searched version should have ientry as its
+secondary index entry */
+UNIV_INTERN
+ibool
+row_vers_old_has_index_entry(
+/*=========================*/
+	ibool		also_curr,/*!< in: TRUE if also rec is included in the
+				versions to search; otherwise only versions
+				prior to it are searched */
+	const rec_t*	rec,	/*!< in: record in the clustered index; the
+				caller must have a latch on the page */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec; it will
+				also hold the latch on purge_view */
+	dict_index_t*	index,	/*!< in: the secondary index */
+	const dtuple_t*	ientry)	/*!< in: the secondary index entry */
+{
+	const rec_t*	version;
+	rec_t*		prev_version;
+	dict_index_t*	clust_index;
+	ulint*		clust_offsets;
+	mem_heap_t*	heap;
+	mem_heap_t*	heap2;
+	const dtuple_t*	row;
+	const dtuple_t*	entry;
+	ulint		err;
+	ulint		comp;
+
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	mtr_s_lock(&(purge_sys->latch), mtr);
+
+	clust_index = dict_table_get_first_index(index->table);
+
+	comp = page_rec_is_comp(rec);
+	ut_ad(!dict_table_is_comp(index->table) == !comp);
+	heap = mem_heap_create(1024);
+	clust_offsets = rec_get_offsets(rec, clust_index, NULL,
+					ULINT_UNDEFINED, &heap);
+
+	if (also_curr && !rec_get_deleted_flag(rec, comp)) {
+		row_ext_t*	ext;
+
+		/* The stack of versions is locked by mtr.
+		Thus, it is safe to fetch the prefixes for
+		externally stored columns. */
+		row = row_build(ROW_COPY_POINTERS, clust_index,
+				rec, clust_offsets, NULL, &ext, heap);
+		entry = row_build_index_entry(row, ext, index, heap);
+
+		/* If entry == NULL, the record contains unset BLOB
+		pointers.  This must be a freshly inserted record.  If
+		this is called from
+		row_purge_remove_sec_if_poss_low(), the thread will
+		hold latches on the clustered index and the secondary
+		index.  Because the insert works in three steps:
+
+			(1) insert the record to clustered index
+			(2) store the BLOBs and update BLOB pointers
+			(3) insert records to secondary indexes
+
+		the purge thread can safely ignore freshly inserted
+		records and delete the secondary index record.  The
+		thread that inserted the new record will be inserting
+		the secondary index records. */
+
+		/* NOTE that we cannot do the comparison as binary
+		fields because the row is maybe being modified so that
+		the clustered index record has already been updated to
+		a different binary value in a char field, but the
+		collation identifies the old and new value anyway! */
+		if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+			mem_heap_free(heap);
+
+			return(TRUE);
+		}
+	}
+
+	version = rec;
+
+	for (;;) {
+		heap2 = heap;
+		heap = mem_heap_create(1024);
+		err = trx_undo_prev_version_build(rec, mtr, version,
+						  clust_index, clust_offsets,
+						  heap, &prev_version);
+		mem_heap_free(heap2); /* free version and clust_offsets */
+
+		if (err != DB_SUCCESS || !prev_version) {
+			/* Versions end here */
+
+			mem_heap_free(heap);
+
+			return(FALSE);
+		}
+
+		clust_offsets = rec_get_offsets(prev_version, clust_index,
+						NULL, ULINT_UNDEFINED, &heap);
+
+		if (!rec_get_deleted_flag(prev_version, comp)) {
+			row_ext_t*	ext;
+
+			/* The stack of versions is locked by mtr.
+			Thus, it is safe to fetch the prefixes for
+			externally stored columns. */
+			row = row_build(ROW_COPY_POINTERS, clust_index,
+					prev_version, clust_offsets,
+					NULL, &ext, heap);
+			entry = row_build_index_entry(row, ext, index, heap);
+
+			/* If entry == NULL, the record contains unset
+			BLOB pointers.  This must be a freshly
+			inserted record that we can safely ignore.
+			For the justification, see the comments after
+			the previous row_build_index_entry() call. */
+
+			/* NOTE that we cannot do the comparison as binary
+			fields because maybe the secondary index record has
+			already been updated to a different binary value in
+			a char field, but the collation identifies the old
+			and new value anyway! */
+
+			if (entry && !dtuple_coll_cmp(ientry, entry)) {
+
+				mem_heap_free(heap);
+
+				return(TRUE);
+			}
+		}
+
+		version = prev_version;
+	}
+}
+
+/*****************************************************************//**
+Constructs the version of a clustered index record which a consistent
+read should see. We assume that the trx id stored in rec is such that
+the consistent read should not see rec in its present version.
+Older versions are built one at a time from the undo log, each in its own
+temporary heap; the heap of the previous iteration is released as soon as
+the version stored in it is no longer needed. The purge_sys latch is held
+in S mode for the duration of the search.
+@return	DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_consistent_read(
+/*===============================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	ulint**		offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	read_view_t*	view,	/*!< in: the consistent read view */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	rec_t**		old_vers)/*!< out, own: old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+{
+	const rec_t*	version;
+	rec_t*		prev_version;
+	trx_id_t	trx_id;
+	mem_heap_t*	heap		= NULL;
+	byte*		buf;
+	ulint		err;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+
+	trx_id = row_get_rec_trx_id(rec, index, *offsets);
+
+	ut_ad(!read_view_sees_trx_id(view, trx_id));
+
+	rw_lock_s_lock(&(purge_sys->latch));
+	version = rec;
+
+	for (;;) {
+		/* heap2 is the heap of the previous iteration: from the
+		second iteration on it holds the memory of version */
+		mem_heap_t*	heap2	= heap;
+		trx_undo_rec_t* undo_rec;
+		roll_ptr_t	roll_ptr;
+		undo_no_t	undo_no;
+		heap = mem_heap_create(1024);
+
+		/* If we have high-granularity consistent read view and
+		creating transaction of the view is the same as trx_id in
+		the record we see this record only in the case when
+		undo_no of the record is < undo_no in the view. */
+
+		if (view->type == VIEW_HIGH_GRANULARITY
+		    && ut_dulint_cmp(view->creator_trx_id, trx_id) == 0) {
+
+			roll_ptr = row_get_rec_roll_ptr(version, index,
+							*offsets);
+			undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+			undo_no = trx_undo_rec_get_undo_no(undo_rec);
+			mem_heap_empty(heap);
+
+			if (ut_dulint_cmp(view->undo_no, undo_no) > 0) {
+				/* The view already sees this version: we can
+				copy it to in_heap and return */
+
+				buf = mem_heap_alloc(in_heap,
+						     rec_offs_size(*offsets));
+				*old_vers = rec_copy(buf, version, *offsets);
+				rec_offs_make_valid(*old_vers, index,
+						    *offsets);
+				err = DB_SUCCESS;
+
+				/* The break skips the release of heap2
+				below. version has been copied to in_heap
+				above, so the heap of the previous iteration
+				can and must be released here; otherwise it
+				would be leaked. */
+				if (heap2) {
+					mem_heap_free(heap2);
+				}
+
+				break;
+			}
+		}
+
+		err = trx_undo_prev_version_build(rec, mtr, version, index,
+						  *offsets, heap,
+						  &prev_version);
+		if (heap2) {
+			mem_heap_free(heap2); /* free version */
+		}
+
+		if (err != DB_SUCCESS) {
+			break;
+		}
+
+		if (prev_version == NULL) {
+			/* It was a freshly inserted version */
+			*old_vers = NULL;
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		*offsets = rec_get_offsets(prev_version, index, *offsets,
+					   ULINT_UNDEFINED, offset_heap);
+
+		trx_id = row_get_rec_trx_id(prev_version, index, *offsets);
+
+		if (read_view_sees_trx_id(view, trx_id)) {
+
+			/* The view already sees this version: we can copy
+			it to in_heap and return */
+
+			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			*old_vers = rec_copy(buf, prev_version, *offsets);
+			rec_offs_make_valid(*old_vers, index, *offsets);
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		version = prev_version;
+	}/* for (;;) */
+
+	/* heap is the heap created in the last iteration */
+	mem_heap_free(heap);
+	rw_lock_s_unlock(&(purge_sys->latch));
+
+	return(err);
+}
+
+/*****************************************************************//**
+Constructs the last committed version of a clustered index record,
+which should be seen by a semi-consistent read. The version stack is
+walked from rec towards older versions until a version is found whose
+transaction is not found by trx_get_on_id() or is in state TRX_NOT_STARTED
+or TRX_COMMITTED_IN_MEMORY. If that version is rec itself, *old_vers is
+set to rec without copying; otherwise the version is copied to in_heap.
+The purge_sys latch is held in S mode for the duration of the search.
+@return	DB_SUCCESS or DB_MISSING_HISTORY */
+UNIV_INTERN
+ulint
+row_vers_build_for_semi_consistent_read(
+/*====================================*/
+	const rec_t*	rec,	/*!< in: record in a clustered index; the
+				caller must have a latch on the page; this
+				latch locks the top of the stack of versions
+				of this record */
+	mtr_t*		mtr,	/*!< in: mtr holding the latch on rec */
+	dict_index_t*	index,	/*!< in: the clustered index */
+	ulint**		offsets,/*!< in/out: offsets returned by
+				rec_get_offsets(rec, index) */
+	mem_heap_t**	offset_heap,/*!< in/out: memory heap from which
+				the offsets are allocated */
+	mem_heap_t*	in_heap,/*!< in: memory heap from which the memory for
+				*old_vers is allocated; memory for possible
+				intermediate versions is allocated and freed
+				locally within the function */
+	const rec_t**	old_vers)/*!< out: rec, old version, or NULL if the
+				record does not exist in the view, that is,
+				it was freshly inserted afterwards */
+{
+	const rec_t*	version;
+	mem_heap_t*	heap		= NULL;
+	byte*		buf;
+	ulint		err;
+	trx_id_t	rec_trx_id	= ut_dulint_zero;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX)
+	      || mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_S_FIX));
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(!rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_ad(rec_offs_validate(rec, index, *offsets));
+
+	rw_lock_s_lock(&(purge_sys->latch));
+	/* The S-latch on purge_sys prevents the purge view from
+	changing.  Thus, if we have an uncommitted transaction at
+	this point, then purge cannot remove its undo log even if
+	the transaction could commit now. */
+
+	version = rec;
+
+	for (;;) {
+		trx_t*		version_trx;
+		mem_heap_t*	heap2;
+		rec_t*		prev_version;
+		trx_id_t	version_trx_id;
+
+		version_trx_id = row_get_rec_trx_id(version, index, *offsets);
+		if (rec == version) {
+			rec_trx_id = version_trx_id;
+		}
+
+		mutex_enter(&kernel_mutex);
+		version_trx = trx_get_on_id(version_trx_id);
+		mutex_exit(&kernel_mutex);
+
+		if (!version_trx
+		    || version_trx->conc_state == TRX_NOT_STARTED
+		    || version_trx->conc_state == TRX_COMMITTED_IN_MEMORY) {
+
+			/* We found a version that belongs to a
+			committed transaction: return it. */
+
+			if (rec == version) {
+				*old_vers = rec;
+				err = DB_SUCCESS;
+				break;
+			}
+
+			/* We assume that a rolled-back transaction stays in
+			TRX_ACTIVE state until all the changes have been
+			rolled back and the transaction is removed from
+			the global list of transactions. */
+
+			if (!ut_dulint_cmp(rec_trx_id, version_trx_id)) {
+				/* The transaction was committed while
+				we searched for earlier versions.
+				Return the current version as a
+				semi-consistent read. */
+
+				version = rec;
+				*offsets = rec_get_offsets(version,
+							   index, *offsets,
+							   ULINT_UNDEFINED,
+							   offset_heap);
+			}
+
+			buf = mem_heap_alloc(in_heap, rec_offs_size(*offsets));
+			*old_vers = rec_copy(buf, version, *offsets);
+			rec_offs_make_valid(*old_vers, index, *offsets);
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		heap2 = heap;
+		heap = mem_heap_create(1024);
+
+		err = trx_undo_prev_version_build(rec, mtr, version, index,
+						  *offsets, heap,
+						  &prev_version);
+		if (heap2) {
+			mem_heap_free(heap2); /* free version */
+		}
+
+		if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+			break;
+		}
+
+		if (prev_version == NULL) {
+			/* It was a freshly inserted version */
+			*old_vers = NULL;
+			err = DB_SUCCESS;
+
+			break;
+		}
+
+		version = prev_version;
+		*offsets = rec_get_offsets(version, index, *offsets,
+					   ULINT_UNDEFINED, offset_heap);
+	}/* for (;;) */
+
+	if (heap) {
+		mem_heap_free(heap);
+	}
+	rw_lock_s_unlock(&(purge_sys->latch));
+
+	return(err);
+}
diff --git a/storage/xtradb/scripts/install_innodb_plugins.sql b/storage/xtradb/scripts/install_innodb_plugins.sql
new file mode 100644
index 00000000000..5a555a652f7
--- /dev/null
+++ b/storage/xtradb/scripts/install_innodb_plugins.sql
@@ -0,0 +1,17 @@
+-- Execute these to install InnoDB (and the other ha_innodb.so plugins listed below) if it is built as a dynamic plugin
+INSTALL PLUGIN innodb SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_lock_waits SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.so';
+INSTALL PLUGIN XTRADB_ENHANCEMENTS SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.so';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_table_stats SONAME 'ha_innodb.so';
+INSTALL PLUGIN innodb_index_stats SONAME 'ha_innodb.so';
+INSTALL PLUGIN xtradb_admin_command SONAME 'ha_innodb.so';
diff --git a/storage/xtradb/scripts/install_innodb_plugins_win.sql b/storage/xtradb/scripts/install_innodb_plugins_win.sql
new file mode 100644
index 00000000000..7cda3335694
--- /dev/null
+++ b/storage/xtradb/scripts/install_innodb_plugins_win.sql
@@ -0,0 +1,17 @@
+-- Execute these to install InnoDB (and the other ha_innodb.dll plugins listed below) if it is built as a dynamic plugin
+INSTALL PLUGIN innodb SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_trx SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_locks SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_lock_waits SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmp SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmp_reset SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmpmem SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_cmpmem_reset SONAME 'ha_innodb.dll';
+INSTALL PLUGIN XTRADB_ENHANCEMENTS SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_BLOB SONAME 'ha_innodb.dll';
+INSTALL PLUGIN INNODB_BUFFER_POOL_PAGES_INDEX SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_rseg SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_table_stats SONAME 'ha_innodb.dll';
+INSTALL PLUGIN innodb_index_stats SONAME 'ha_innodb.dll';
+INSTALL PLUGIN xtradb_admin_command SONAME 'ha_innodb.dll';
diff --git a/storage/xtradb/srv/srv0que.c b/storage/xtradb/srv/srv0que.c
new file mode 100644
index 00000000000..fc50a86a55c
--- /dev/null
+++ b/storage/xtradb/srv/srv0que.c
@@ -0,0 +1,49 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0que.c
+Server query execution
+
+Created 6/5/1996 Heikki Tuuri
+*******************************************************/
+
+#include "srv0que.h"
+
+#include "srv0srv.h"
+#include "sync0sync.h"
+#include "os0thread.h"
+#include "usr0sess.h"
+#include "que0que.h"
+
+/**********************************************************************//**
+Enqueues a task to the server task queue and releases a worker thread, if
+there is a suspended one. The caller must hold kernel_mutex. */
+UNIV_INTERN
+void
+srv_que_task_enqueue_low(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread; asserted non-NULL */
+{
+	ut_ad(thr);
+	ut_ad(mutex_own(&kernel_mutex));	/* caller holds kernel_mutex */
+
+	UT_LIST_ADD_LAST(queue, srv_sys->tasks, thr);	/* append at the tail */
+
+	srv_release_threads(SRV_WORKER, 1);	/* ask for one SRV_WORKER thread */
+}
diff --git a/storage/xtradb/srv/srv0srv.c b/storage/xtradb/srv/srv0srv.c
new file mode 100644
index 00000000000..43799aab196
--- /dev/null
+++ b/storage/xtradb/srv/srv0srv.c
@@ -0,0 +1,3440 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, 2009 Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file srv/srv0srv.c
+The database server main program
+
+NOTE: SQL Server 7 uses something which the documentation
+calls user mode scheduled threads (UMS threads). One such
+thread is usually allocated per processor. Win32
+documentation does not know any UMS threads, which suggests
+that the concept is internal to SQL Server 7. It may mean that
+SQL Server 7 does all the scheduling of threads itself, even
+in i/o waits. We should maybe modify InnoDB to use the same
+technique, because thread switches within NT may be too slow.
+
+SQL Server 7 also mentions fibers, which are cooperatively
+scheduled threads. They can boost performance by 5 %,
+according to the Delaney and Soukup's book.
+
+Windows 2000 will have something called thread pooling
+(see msdn website), which we could possibly use.
+
+Another possibility could be to use some very fast user space
+thread library. This might confuse NT though.
+
+Created 10/8/1995 Heikki Tuuri
+*******************************************************/
+
+/* Dummy comment */
+#include "srv0srv.h"
+
+#include "ut0mem.h"
+#include "ut0ut.h"
+#include "os0proc.h"
+#include "mem0mem.h"
+#include "mem0pool.h"
+#include "sync0sync.h"
+#include "thr0loc.h"
+#include "que0que.h"
+#include "srv0que.h"
+#include "log0recv.h"
+#include "pars0pars.h"
+#include "usr0sess.h"
+#include "lock0lock.h"
+#include "trx0purge.h"
+#include "ibuf0ibuf.h"
+#include "buf0flu.h"
+#include "buf0lru.h"
+#include "btr0sea.h"
+#include "dict0load.h"
+#include "dict0boot.h"
+#include "srv0start.h"
+#include "row0mysql.h"
+#include "ha_prototypes.h"
+#include "trx0i_s.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+
+/* prototype for a function added to ha_innodb.cc (takes no arguments) */
+ibool	innobase_get_slow_log(void);
+
+/* This is set to TRUE if the MySQL user has set it in MySQL; currently
+affects only FOREIGN KEY definition parsing */
+UNIV_INTERN ibool	srv_lower_case_table_names	= FALSE;
+
+/* The following counter is incremented whenever there is some user activity
+in the server */
+UNIV_INTERN ulint	srv_activity_count	= 0;
+
+/* The following is the maximum allowed duration of a lock wait. */
+UNIV_INTERN ulint	srv_fatal_semaphore_wait_threshold = 600;
+
+/* How much data manipulation language (DML) statements need to be delayed,
+in microseconds, in order to reduce the lagging of the purge thread. */
+UNIV_INTERN ulint	srv_dml_needed_delay = 0;
+
+UNIV_INTERN ibool	srv_lock_timeout_active = FALSE;
+UNIV_INTERN ibool	srv_monitor_active = FALSE;
+UNIV_INTERN ibool	srv_error_monitor_active = FALSE;
+
+UNIV_INTERN const char*	srv_main_thread_op_info = "";
+
+/** Prefix used by MySQL to indicate pre-5.1 table name encoding */
+UNIV_INTERN const char	srv_mysql50_table_name_prefix[9] = "#mysql50#";
+
+/* Server parameters which are read from the initfile */
+
+/* The following three are dir paths which are catenated before file
+names, where the file name itself may also contain a path */
+
+UNIV_INTERN char*	srv_data_home	= NULL;
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN char*	srv_arch_dir	= NULL;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/** store to its own file each table created by a user; data
+dictionary tables are in the system tablespace 0 */
+UNIV_INTERN my_bool	srv_file_per_table;
+/** The file format to use on new *.ibd files. */
+UNIV_INTERN ulint	srv_file_format = 0;
+/** Whether to check file format during startup.  A value of
+DICT_TF_FORMAT_MAX + 1 means no checking ie. FALSE.  The default is to
+set it to the highest format we support. */
+UNIV_INTERN ulint	srv_check_file_format_at_startup = DICT_TF_FORMAT_MAX;
+
+#if DICT_TF_FORMAT_51
+# error "DICT_TF_FORMAT_51 must be 0!"
+#endif
+/** Place locks to records only i.e. do not use next-key locking except
+on duplicate key checking and foreign key checking */
+UNIV_INTERN ibool	srv_locks_unsafe_for_binlog = FALSE;
+
+UNIV_INTERN ulint	srv_n_data_files = 0;
+UNIV_INTERN char**	srv_data_file_names = NULL;
+/* size in database pages */
+UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
+
+UNIV_INTERN char*	srv_doublewrite_file = NULL;
+
+UNIV_INTERN ibool	srv_extra_undoslots = FALSE;
+
+UNIV_INTERN ibool	srv_recovery_stats = FALSE;
+
+UNIV_INTERN ulint	srv_use_purge_thread = 0;
+
+/* if TRUE, then we auto-extend the last data file */
+UNIV_INTERN ibool	srv_auto_extend_last_data_file	= FALSE;
+/* if != 0, this tells the max size auto-extending may increase the
+last data file size */
+UNIV_INTERN ulint	srv_last_file_size_max	= 0;
+/* If the last data file is auto-extended, we add this
+many pages to it at a time */
+UNIV_INTERN ulong	srv_auto_extend_increment = 8;
+UNIV_INTERN ulint*	srv_data_file_is_raw_partition = NULL;
+
+/* If the following is TRUE we do not allow inserts etc. This protects
+the user from forgetting the 'newraw' keyword to my.cnf */
+
+UNIV_INTERN ibool	srv_created_new_raw	= FALSE;
+
+UNIV_INTERN char**	srv_log_group_home_dirs = NULL;
+
+UNIV_INTERN ulint	srv_n_log_groups	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_log_files		= ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_file_size	= ULINT_MAX;
+/* size in database pages */
+UNIV_INTERN ulint	srv_log_buffer_size	= ULINT_MAX;
+UNIV_INTERN ulong	srv_flush_log_at_trx_commit = 1;
+
+/* Try to flush dirty pages so as to avoid IO bursts at
+the checkpoints. */
+UNIV_INTERN char	srv_adaptive_flushing	= TRUE;
+
+UNIV_INTERN ulong	srv_show_locks_held	= 10;
+UNIV_INTERN ulong	srv_show_verbose_locks	= 0;
+
+/** Maximum number of times allowed to conditionally acquire
+mutex before switching to blocking wait on the mutex */
+#define MAX_MUTEX_NOWAIT	20
+
+/** Check whether the number of failed nonblocking mutex
+acquisition attempts exceeds maximum allowed value. If so,
+srv_printf_innodb_monitor() will request mutex acquisition
+with mutex_enter(), which will wait until it gets the mutex. */
+#define MUTEX_NOWAIT(mutex_skipped)	((mutex_skipped) < MAX_MUTEX_NOWAIT)
+
+/** The sort order table of the MySQL latin1_swedish_ci character set
+collation */
+UNIV_INTERN const byte*	srv_latin1_ordering;
+
+/* use os/external memory allocator */
+UNIV_INTERN my_bool	srv_use_sys_malloc	= TRUE;
+/* requested size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_size	= ULINT_MAX;
+/* previously requested size */
+UNIV_INTERN ulint	srv_buf_pool_old_size;
+/* current size in kilobytes */
+UNIV_INTERN ulint	srv_buf_pool_curr_size	= 0;
+/* size in bytes */
+UNIV_INTERN ulint	srv_mem_pool_size	= ULINT_MAX;
+UNIV_INTERN ulint	srv_lock_table_size	= ULINT_MAX;
+
+/* key value for shm */
+UNIV_INTERN uint	srv_buffer_pool_shm_key	= 0;
+UNIV_INTERN ibool	srv_buffer_pool_shm_is_reused = FALSE;
+UNIV_INTERN ibool	srv_buffer_pool_shm_checksum = TRUE;
+
+/* This parameter is deprecated. Use srv_n_io_[read|write]_threads
+instead. */
+UNIV_INTERN ulint	srv_n_file_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_read_io_threads	= ULINT_MAX;
+UNIV_INTERN ulint	srv_n_write_io_threads	= ULINT_MAX;
+
+/* The universal page size of the database */
+UNIV_INTERN ulint	srv_page_size_shift	= 0;
+UNIV_INTERN ulint	srv_page_size		= 0;
+
+/* User settable value of the number of pages that must be present
+in the buffer cache and accessed sequentially for InnoDB to trigger a
+readahead request. */
+UNIV_INTERN ulong	srv_read_ahead_threshold	= 56;
+
+#ifdef UNIV_LOG_ARCHIVE
+UNIV_INTERN ibool		srv_log_archive_on	= FALSE;
+UNIV_INTERN ibool		srv_archive_recovery	= 0;
+UNIV_INTERN ib_uint64_t	srv_archive_recovery_limit_lsn;
+#endif /* UNIV_LOG_ARCHIVE */
+
+/* This parameter is used to throttle the number of insert buffers that are
+merged in a batch. By increasing this parameter on a faster disk you can
+possibly reduce the number of I/O operations performed to complete the
+merge operation. The value of this parameter is used as is by the
+background loop when the system is idle (low load), on a busy system
+the parameter is scaled down by a factor of 4, this is to avoid putting
+a heavier load on the I/O sub system. */
+
+UNIV_INTERN ulong	srv_insert_buffer_batch_size = 20;
+
+UNIV_INTERN char*	srv_file_flush_method_str = NULL;
+UNIV_INTERN ulint	srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+UNIV_INTERN ulint	srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+
+UNIV_INTERN ulint	srv_max_n_open_files	  = 300;
+
+/* Number of IO operations per second the server can do */
+UNIV_INTERN ulong	srv_io_capacity         = 200;
+
+/* The InnoDB main thread tries to keep the ratio of modified pages
+in the buffer pool to all database pages in the buffer pool smaller than
+the following number. But it is not guaranteed that the value stays below
+that during a time of heavy update/insert activity. */
+
+UNIV_INTERN ulong	srv_max_buf_pool_modified_pct	= 75;
+
+/* variable counts amount of data read in total (in bytes) */
+UNIV_INTERN ulint srv_data_read = 0;
+
+/* here we count the amount of data written in total (in bytes) */
+UNIV_INTERN ulint srv_data_written = 0;
+
+/* the number of the log write requests done */
+UNIV_INTERN ulint srv_log_write_requests = 0;
+
+/* the number of physical writes to the log performed */
+UNIV_INTERN ulint srv_log_writes = 0;
+
+/* amount of data written to the log files in bytes */
+UNIV_INTERN ulint srv_os_log_written = 0;
+
+/* amount of writes being done to the log files */
+UNIV_INTERN ulint srv_os_log_pending_writes = 0;
+
+/* we increase this counter when we don't have enough space in the
+log buffer and have to flush it */
+UNIV_INTERN ulint srv_log_waits = 0;
+
+/* this variable counts the number of times the doublewrite buffer
+was flushed */
+UNIV_INTERN ulint srv_dblwr_writes = 0;
+
+/* here we store the number of pages that have been flushed to the
+doublewrite buffer */
+UNIV_INTERN ulint srv_dblwr_pages_written = 0;
+
+/* in this variable we store the number of write requests issued */
+UNIV_INTERN ulint srv_buf_pool_write_requests = 0;
+
+/* here we store the number of times when we had to wait for a free page
+in the buffer pool. It happens when the buffer pool is full and we need
+to make a flush, in order to be able to read or create a page. */
+UNIV_INTERN ulint srv_buf_pool_wait_free = 0;
+
+/* variable to count the number of pages that were written from buffer
+pool to the disk */
+UNIV_INTERN ulint srv_buf_pool_flushed = 0;
+
+/** Number of buffer pool reads that led to the
+reading of a disk page */
+UNIV_INTERN ulint srv_buf_pool_reads = 0;
+
+/** Time in seconds between automatic buffer pool dumps */
+UNIV_INTERN uint srv_auto_lru_dump = 0;
+
+/* structure to pass status variables to MySQL */
+UNIV_INTERN export_struc export_vars;
+
+/* If the following is != 0 we do not allow inserts etc. This protects
+the user from forgetting the innodb_force_recovery keyword to my.cnf */
+
+UNIV_INTERN ulint	srv_force_recovery	= 0;
+/*-----------------------*/
+/* We are prepared for a situation that we have this many threads waiting for
+a semaphore inside InnoDB. innobase_start_or_create_for_mysql() sets the
+value. */
+
+UNIV_INTERN ulint	srv_max_n_threads	= 0;
+
+/* The following controls how many threads we let inside InnoDB concurrently:
+threads waiting for locks are not counted into the number because otherwise
+we could get a deadlock. MySQL creates a thread for each user session, and
+semaphore contention and convoy problems can occur without this restriction.
+Value 10 should be good if there are less than 4 processors + 4 disks in the
+computer. Bigger computers need bigger values. Value 0 will disable the
+concurrency check. */
+
+UNIV_INTERN ibool	srv_thread_concurrency_timer_based = FALSE;
+UNIV_INTERN ulong	srv_thread_concurrency	= 0;
+
+/* this mutex protects srv_conc data structures */
+UNIV_INTERN os_fast_mutex_t	srv_conc_mutex;
+/* number of transactions that have declared_to_be_inside_innodb set.
+It used to be a non-error for this value to drop below zero temporarily.
+This is no longer true. We'll, however, keep the lint datatype to add
+assertions to catch any corner cases that we may have missed. */
+UNIV_INTERN lint	srv_conc_n_threads	= 0;
+/* number of OS threads waiting in the FIFO for a permission to enter
+InnoDB */
+UNIV_INTERN ulint	srv_conc_n_waiting_threads = 0;
+
+typedef struct srv_conc_slot_struct	srv_conc_slot_t;	/* element type of srv_conc_slots and srv_conc_queue */
+struct srv_conc_slot_struct{
+	os_event_t			event;		/*!< event to wait on */
+	ibool				reserved;	/*!< TRUE if slot
+							reserved */
+	ibool				wait_ended;	/*!< TRUE when another
+							thread has already set
+							the event and the
+							thread in this slot is
+							free to proceed; but
+							reserved may still be
+							TRUE at that point */
+	UT_LIST_NODE_T(srv_conc_slot_t)	srv_conc_queue;	/*!< node in the srv_conc_queue list */
+};
+
+/* queue of threads waiting to get in */
+UNIV_INTERN UT_LIST_BASE_NODE_T(srv_conc_slot_t)	srv_conc_queue;
+/* array of wait slots */
+UNIV_INTERN srv_conc_slot_t* srv_conc_slots;
+
+/* Number of times a thread is allowed to enter InnoDB within the same
+SQL query after it has once got the ticket at srv_conc_enter_innodb */
+#define SRV_FREE_TICKETS_TO_ENTER srv_n_free_tickets_to_enter
+#define SRV_THREAD_SLEEP_DELAY srv_thread_sleep_delay
+/*-----------------------*/
+/* If the following is set to 1 then we do not run purge and insert buffer
+merge to completion before shutdown. If it is set to 2, do not even flush the
+buffer pool to data files at the shutdown: we effectively 'crash'
+InnoDB (but lose no committed transactions). */
+UNIV_INTERN ulint	srv_fast_shutdown	= 0;
+
+/* Generate an innodb_status.<pid> file */
+UNIV_INTERN ibool	srv_innodb_status	= FALSE;
+
+/* When estimating number of different key values in an index, sample
+this many index pages */
+UNIV_INTERN unsigned long long	srv_stats_sample_pages = 8;
+UNIV_INTERN ulong	srv_stats_method = 0;
+UNIV_INTERN ulong	srv_stats_auto_update = 1;
+UNIV_INTERN ulint	srv_stats_update_need_lock = 1;
+UNIV_INTERN ibool	srv_use_sys_stats_table = FALSE;
+
+UNIV_INTERN ibool	srv_use_doublewrite_buf	= TRUE;
+UNIV_INTERN ibool	srv_use_checksums = TRUE;
+UNIV_INTERN ibool	srv_fast_checksum = FALSE;
+
+UNIV_INTERN ibool	srv_set_thread_priorities = TRUE;
+UNIV_INTERN int	srv_query_thread_priority = 0;
+
+UNIV_INTERN ulong	srv_replication_delay		= 0;
+
+UNIV_INTERN long long	srv_ibuf_max_size = 0;
+UNIV_INTERN ulong	srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulong	srv_ibuf_accel_rate = 100; /* 100: PCT_IBUF_IO(pct) is pct/100 of srv_io_capacity */
+#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) (pct) / 10000.0)))
+
+UNIV_INTERN ulint	srv_checkpoint_age_target = 0;
+UNIV_INTERN ulong	srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
+
+UNIV_INTERN ulong	srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulong	srv_read_ahead = 3; /* 1: random  2: linear  3: Both */
+UNIV_INTERN ulong	srv_adaptive_checkpoint = 0; /* 0: none  1: reflex  2: estimate */
+
+UNIV_INTERN ulong	srv_expand_import = 0; /* 0:disable 1:enable */
+UNIV_INTERN ulint	srv_pass_corrupt_table = 0; /* 0:disable 1:enable */
+
+UNIV_INTERN ulong	srv_extra_rsegments = 0; /* extra rseg for users */
+UNIV_INTERN ulong	srv_dict_size_limit = 0;
+/*-------------------------------------------*/
+UNIV_INTERN ulong	srv_n_spin_wait_rounds	= 30;
+UNIV_INTERN ulong	srv_n_free_tickets_to_enter = 500;
+UNIV_INTERN ulong	srv_thread_sleep_delay = 10000;
+UNIV_INTERN ulong	srv_spin_wait_delay	= 6;
+UNIV_INTERN ibool	srv_priority_boost	= TRUE;
+
+#ifdef UNIV_DEBUG
+UNIV_INTERN ibool	srv_print_thread_releases	= FALSE;
+UNIV_INTERN ibool	srv_print_lock_waits		= FALSE;
+UNIV_INTERN ibool	srv_print_buf_io		= FALSE;
+UNIV_INTERN ibool	srv_print_log_io		= FALSE;
+UNIV_INTERN ibool	srv_print_latch_waits		= FALSE;
+#endif /* UNIV_DEBUG */
+
+UNIV_INTERN ulint		srv_n_rows_inserted		= 0;
+UNIV_INTERN ulint		srv_n_rows_updated		= 0;
+UNIV_INTERN ulint		srv_n_rows_deleted		= 0;
+UNIV_INTERN ulint		srv_n_rows_read			= 0;
+
+static ulint	srv_n_rows_inserted_old		= 0;
+static ulint	srv_n_rows_updated_old		= 0;
+static ulint	srv_n_rows_deleted_old		= 0;
+static ulint	srv_n_rows_read_old		= 0;
+UNIV_INTERN ulint               srv_n_lock_deadlock_count       = 0;
+UNIV_INTERN ulint		srv_n_lock_wait_count		= 0;
+UNIV_INTERN ulint		srv_n_lock_wait_current_count	= 0;
+UNIV_INTERN ib_int64_t	srv_n_lock_wait_time		= 0;
+UNIV_INTERN ulint		srv_n_lock_max_wait_time	= 0;
+
+
+/*
+  Set the following to 0 if you want InnoDB to write messages on
+  stderr on startup/shutdown
+*/
+UNIV_INTERN ibool	srv_print_verbose_log		= TRUE;
+UNIV_INTERN ibool	srv_print_innodb_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_lock_monitor	= FALSE;
+UNIV_INTERN ibool	srv_print_innodb_tablespace_monitor = FALSE;
+UNIV_INTERN ibool	srv_print_innodb_table_monitor = FALSE;
+
+/* Array of English strings describing the current state of an
+i/o handler thread */
+
+UNIV_INTERN const char* srv_io_thread_op_info[SRV_MAX_N_IO_THREADS];
+UNIV_INTERN const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+UNIV_INTERN time_t	srv_last_monitor_time;
+
+UNIV_INTERN mutex_t	srv_innodb_monitor_mutex;
+
+/* Mutex for locking srv_monitor_file */
+UNIV_INTERN mutex_t	srv_monitor_file_mutex;
+/* Temporary file for innodb monitor output */
+UNIV_INTERN FILE*	srv_monitor_file;
+/* Mutex for locking srv_dict_tmpfile.
+This mutex has a very high rank; threads reserving it should not
+be holding any InnoDB latches. */
+UNIV_INTERN mutex_t	srv_dict_tmpfile_mutex;
+/* Temporary file for output from the data dictionary */
+UNIV_INTERN FILE*	srv_dict_tmpfile;
+/* Mutex for locking srv_misc_tmpfile.
+This mutex has a very low rank; threads reserving it should not
+acquire any further latches or sleep before releasing this one. */
+UNIV_INTERN mutex_t	srv_misc_tmpfile_mutex;
+/* Temporary file for miscellaneous diagnostic output */
+UNIV_INTERN FILE*	srv_misc_tmpfile;
+
+UNIV_INTERN ulint	srv_main_thread_process_no	= 0;
+UNIV_INTERN ulint	srv_main_thread_id		= 0;
+
+/* The following count work done by srv_master_thread. */
+
+/* Iterations by the 'once per second' loop. */
+static ulint   srv_main_1_second_loops		= 0;
+/* Calls to sleep by the 'once per second' loop. */
+static ulint   srv_main_sleeps			= 0;
+/* Iterations by the 'once per 10 seconds' loop. */
+static ulint   srv_main_10_second_loops		= 0;
+/* Iterations of the loop bounded by the 'background_loop' label. */
+static ulint   srv_main_background_loops	= 0;
+/* Iterations of the loop bounded by the 'flush_loop' label. */
+static ulint   srv_main_flush_loops		= 0;
+/* Log writes involving flush. */
+static ulint   srv_log_writes_and_flush		= 0;
+
+/* This is only ever touched by the master thread. It records the
+time when the last flush of log file has happened. The master
+thread ensures that we flush the log files at least once per
+second. */
+static time_t	srv_last_log_flush_time;
+
+/* The master thread performs various tasks based on the current
+state of IO activity and the level of IO utilization in past
+intervals. Following macros define thresholds for these conditions. */
+#define SRV_PEND_IO_THRESHOLD	(PCT_IO(3))
+#define SRV_RECENT_IO_ACTIVITY	(PCT_IO(5))
+#define SRV_PAST_IO_ACTIVITY	(PCT_IO(200))
+
+/*
+	IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+	=========================================
+
+There is the following analogue between this database
+server and an operating system kernel:
+
+DB concept			equivalent OS concept
+----------			---------------------
+transaction		--	process;
+
+query thread		--	thread;
+
+lock			--	semaphore;
+
+transaction set to
+the rollback state	--	kill signal delivered to a process;
+
+kernel			--	kernel;
+
+query thread execution:
+(a) without kernel mutex
+reserved		--	process executing in user mode;
+(b) with kernel mutex reserved
+			--	process executing in kernel mode;
+
+The server is controlled by a master thread which runs at
+a priority higher than normal, that is, higher than user threads.
+It sleeps most of the time, and wakes up, say, every 300 milliseconds,
+to check whether there is anything happening in the server which
+requires intervention of the master thread. Such situations may be,
+for example, when flushing of dirty blocks is needed in the buffer
+pool or old version of database rows have to be cleaned away.
+
+The threads which we call user threads serve the queries of
+the clients and input from the console of the server.
+They run at normal priority. The server may have several
+communications endpoints. A dedicated set of user threads waits
+at each of these endpoints ready to receive a client request.
+Each request is taken by a single user thread, which then starts
+processing and, when the result is ready, sends it to the client
+and returns to wait at the same endpoint the thread started from.
+
+So, we do not have dedicated communication threads listening at
+the endpoints and dealing the jobs to dedicated worker threads.
+Our architecture saves one thread switch per request, compared
+to the solution with dedicated communication threads
+which amounts to 15 microseconds on 100 MHz Pentium
+running NT. If the client
+is communicating over a network, this saving is negligible, but
+if the client resides in the same machine, maybe in an SMP machine
+on a different processor from the server thread, the saving
+can be important as the threads can communicate over shared
+memory with an overhead of a few microseconds.
+
+We may later implement a dedicated communication thread solution
+for those endpoints which communicate over a network.
+
+Our solution with user threads has two problems: for each endpoint
+there has to be a number of listening threads. If there are many
+communication endpoints, it may be difficult to set the right number
+of concurrent threads in the system, as many of the threads
+may always be waiting at less busy endpoints. Another problem
+is queuing of the messages, as the server internally does not
+offer any queue for jobs.
+
+Another group of user threads is intended for splitting the
+queries and processing them in parallel. Let us call these
+parallel communication threads. These threads are waiting for
+parallelized tasks, suspended on event semaphores.
+
+A single user thread waits for input from the console,
+like a command to shut the database.
+
+Utility threads are a different group of threads which takes
+care of the buffer pool flushing and other, mainly background
+operations, in the server.
+Some of these utility threads always run at a lower than normal
+priority, so that they are always in background. Some of them
+may dynamically boost their priority by the pri_adjust function,
+even to higher than normal priority, if their task becomes urgent.
+The running of utilities is controlled by high- and low-water marks
+of urgency. The urgency may be measured by the number of dirty blocks
+in the buffer pool, in the case of the flush thread, for example.
+When the high-water mark is exceeded, a utility starts running, until
+the urgency drops under the low-water mark. Then the utility thread
+suspends itself to wait for an event. The master thread is
+responsible for signaling this event when the utility thread is
+again needed.
+
+For each individual type of utility, some threads always remain
+at lower than normal priority. This is because pri_adjust is implemented
+so that the threads at normal or higher priority control their
+share of running time by calling sleep. Thus, if the load of the
+system suddenly drops, these threads cannot necessarily utilize
+the system fully. The background priority threads make up for this,
+starting to run when the load drops.
+
+When there is no activity in the system, also the master thread
+suspends itself to wait for an event making
+the server totally silent. The responsibility to signal this
+event is on the user thread which again receives a message
+from a client.
+
+There is still one complication in our server design. If a
+background utility thread obtains a resource (e.g., mutex) needed by a user
+thread, and there is also some other user activity in the system,
+the user thread may have to wait indefinitely long for the
+resource, as the OS does not schedule a background thread if
+there is some other runnable user thread. This problem is called
+priority inversion in real-time programming.
+
+One solution to the priority inversion problem would be to
+keep record of which thread owns which resource and
+in the above case boost the priority of the background thread
+so that it will be scheduled and it can release the resource.
+This solution is called priority inheritance in real-time programming.
+A drawback of this solution is that the overhead of acquiring a mutex
+increases slightly, maybe 0.2 microseconds on a 100 MHz Pentium, because
+the thread has to call os_thread_get_curr_id.
+This may be compared to 0.5 microsecond overhead for a mutex lock-unlock
+pair. Note that the thread
+cannot store the information in the resource, say mutex, itself,
+because competing threads could wipe out the information if it is
+stored before acquiring the mutex, and if it is stored afterwards,
+the information is outdated for the time of one machine instruction,
+at least. (To be precise, the information could be stored to
+lock_word in mutex if the machine supports atomic swap.)
+
+The above solution with priority inheritance may become actual in the
+future, but at the moment we plan to implement a more coarse solution,
+which could be called a global priority inheritance. If a thread
+has to wait for a long time, say 300 milliseconds, for a resource,
+we just guess that it may be waiting for a resource owned by a background
+thread, and boost the priority of all runnable background threads
+to the normal level. The background threads then themselves adjust
+their fixed priority back to background after releasing all resources
+they had (or, at some fixed points in their program code).
+
+What is the performance of the global priority inheritance solution?
+We may weigh the length of the wait time, 300 milliseconds, during
+which the system processes some other thread,
+against the cost of boosting the priority of each runnable background
+thread, rescheduling it, and lowering the priority again.
+On 100 MHz Pentium + NT this overhead may be of the order 100
+microseconds per thread. So, if the number of runnable background
+threads is not very big, say < 100, the cost is tolerable.
+Utility threads probably will access resources used by
+user threads not very often, so collisions of user threads
+with preempted utility threads should not happen very often.
+
+The thread table contains
+information of the current status of each thread existing in the system,
+and also the event semaphores used in suspending the master thread
+and utility and parallel communication threads when they have nothing to do.
+The thread table can be seen as an analogue to the process table
+in a traditional Unix implementation.
+
+The thread table is also used in the global priority inheritance
+scheme. This brings in one additional complication: threads accessing
+the thread table must have at least normal fixed priority,
+because the priority inheritance solution does not work if a background
+thread is preempted while possessing the mutex protecting the thread table.
+So, if a thread accesses the thread table, its priority has to be
+boosted at least to normal. This priority requirement can be seen as similar to
+the privileged mode used when processing the kernel calls in traditional
+Unix.*/
+
+/* One slot of a thread table (srv_sys->threads or srv_mysql_table) */
+struct srv_slot_struct{
+	os_thread_id_t	id;		/*!< id of the thread owning the slot */
+	os_thread_t	handle;		/*!< handle of the owning thread */
+	unsigned	type:3;		/*!< enum srv_thread_type value */
+	unsigned	in_use:1;	/*!< TRUE if this slot is reserved */
+	unsigned	suspended:1;	/*!< TRUE while the thread waits
+					for the event of this slot */
+	ib_time_t	suspend_time;	/*!< ut_time() value taken when the
+					thread was suspended */
+	os_event_t	event;		/*!< event the thread waits on while
+					suspended; created in srv_init() */
+	que_thr_t*	thr;		/*!< suspended query thread (only
+					used for MySQL threads) */
+};
+
+/* Table for MySQL threads where they will be suspended to wait for locks;
+allocated in srv_init() with OS_THREAD_MAX_N slots */
+UNIV_INTERN srv_slot_t*	srv_mysql_table = NULL;
+
+/* Event set in srv_suspend_mysql_thread() to wake the lock timeout monitor */
+UNIV_INTERN os_event_t	srv_lock_timeout_thread_event;
+
+UNIV_INTERN os_event_t	srv_purge_thread_event;
+
+/* Server system struct: holds the thread table and the task list */
+UNIV_INTERN srv_sys_t*	srv_sys	= NULL;
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte	srv_pad1[64];
+/* mutex protecting the server, trx structs, query threads, and lock table */
+UNIV_INTERN mutex_t*	kernel_mutex_temp;
+/* padding to prevent other memory update hotspots from residing on
+the same memory cache line */
+UNIV_INTERN byte	srv_pad2[64];
+
+#if 0
+/* The following three values measure the urgency of the jobs of
+buffer, version, and insert threads. They may vary from 0 - 1000.
+The server mutex protects all these variables. The low-water values
+tell that the server can acquiesce the utility when the value
+drops below this low-water mark. */
+static ulint	srv_meter[SRV_MASTER + 1];
+static ulint	srv_meter_low_water[SRV_MASTER + 1];
+static ulint	srv_meter_high_water[SRV_MASTER + 1];
+static ulint	srv_meter_high_water2[SRV_MASTER + 1];
+static ulint	srv_meter_foreground[SRV_MASTER + 1];
+#endif
+
+/* The following values give info about the activity going on in
+the database. They are protected by the server mutex. The arrays
+are indexed by the type of the thread. */
+UNIV_INTERN ulint	srv_n_threads_active[SRV_MASTER + 1];
+UNIV_INTERN ulint	srv_n_threads[SRV_MASTER + 1];
+
+/*********************************************************************//**
+Prints the srv_master_thread loop counters; (ulong) casts match "%lu". */
+static
+void
+srv_print_master_thread_info(
+/*=========================*/
+	FILE*	file)	/*!< in: output stream */
+{
+	fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
+		"%lu 10_second, %lu background, %lu flush\n",
+		(ulong) srv_main_1_second_loops, (ulong) srv_main_sleeps,
+		(ulong) srv_main_10_second_loops,
+		(ulong) srv_main_background_loops, (ulong) srv_main_flush_loops);
+	fprintf(file, "srv_master_thread log flush and writes: %lu\n",
+		(ulong) srv_log_writes_and_flush);
+}
+
+/*********************************************************************//**
+Stores str (the pointer, not a copy) as the state info of i/o thread i. */
+UNIV_INTERN
+void
+srv_set_io_thread_op_info(
+/*======================*/
+	ulint		i,	/*!< in: the 'segment' of the i/o thread */
+	const char*	str)	/*!< in: constant char string describing the
+				state */
+{
+	ut_a(i < SRV_MAX_N_IO_THREADS);
+
+	srv_io_thread_op_info[i] = str;
+}
+
+/*********************************************************************//**
+Accessor function to get pointer to n'th slot in the server thread
+table (srv_sys->threads). Asserts that index < OS_THREAD_MAX_N.
+@return	pointer to the slot */
+static
+srv_slot_t*
+srv_table_get_nth_slot(
+/*===================*/
+	ulint	index)		/*!< in: index of the slot */
+{
+	ut_a(index < OS_THREAD_MAX_N);
+
+	return(srv_sys->threads + index);
+}
+
+/*********************************************************************//**
+Counts the threads of every type, SRV_COM through SRV_MASTER.
+@return	total of the per-type counters in srv_n_threads[] */
+UNIV_INTERN
+ulint
+srv_get_n_threads(void)
+/*===================*/
+{
+	ulint	total	= 0;
+	ulint	type	= SRV_COM;
+
+	mutex_enter(&kernel_mutex);
+
+	while (type <= SRV_MASTER) {
+		total += srv_n_threads[type];
+		type++;
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(total);
+}
+
+/*********************************************************************//**
+Claims the first free slot of the server thread table for the calling
+thread and creates the thread local storage struct of the thread. NOTE! The
+server mutex has to be reserved by the caller!
+@return	reserved slot index */
+static
+ulint
+srv_table_reserve_slot(
+/*===================*/
+	enum srv_thread_type	type)	/*!< in: type of the thread */
+{
+	srv_slot_t*	slot;
+	ulint		slot_no	= 0;
+
+	ut_a(type > 0);
+	ut_a(type <= SRV_MASTER);
+
+	/* Scan from slot 0; srv_table_get_nth_slot() asserts the bound */
+	for (;;) {
+		slot = srv_table_get_nth_slot(slot_no);
+		if (!slot->in_use) {
+			break;
+		}
+		slot_no++;
+	}
+
+	ut_a(slot->in_use == FALSE);
+
+	slot->in_use = TRUE;
+	slot->suspended = FALSE;
+	slot->type = type;
+	slot->id = os_thread_get_curr_id();
+	slot->handle = os_thread_get_curr();
+
+	thr_local_create();
+	thr_local_set_slot_no(os_thread_get_curr_id(), slot_no);
+
+	return(slot_no);
+}
+
+/*********************************************************************//**
+Marks the calling thread suspended in its thread table slot and resets the
+event of that slot. NOTE! The server mutex has to be reserved by the caller!
+@return	event for the calling thread to wait */
+static
+os_event_t
+srv_suspend_thread(void)
+/*====================*/
+{
+	ulint			slot_no;
+	srv_slot_t*		slot;
+	enum srv_thread_type	type;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	slot_no = thr_local_get_slot_no(os_thread_get_curr_id());
+
+	if (srv_print_thread_releases) {
+		fprintf(stderr,
+			"Suspending thread %lu to slot %lu\n",
+			(ulong) os_thread_get_curr_id(), (ulong) slot_no);
+	}
+
+	slot = srv_table_get_nth_slot(slot_no);
+	type = slot->type;
+
+	ut_ad(type >= SRV_WORKER);
+	ut_ad(type <= SRV_MASTER);
+
+	/* Flag the slot as waiting and leave the set of active threads */
+
+	slot->suspended = TRUE;
+
+	ut_ad(srv_n_threads_active[type] > 0);
+
+	srv_n_threads_active[type]--;
+
+	/* Reset the event which is handed back for the caller to wait on */
+
+	os_event_reset(slot->event);
+
+	return(slot->event);
+}
+
+/*********************************************************************//**
+Wakes up suspended threads of the given type by setting the events of their
+thread table slots. NOTE! The server mutex has to be reserved by the caller!
+@return number of threads released: this may be less than n if not
+enough threads were suspended at the moment */
+UNIV_INTERN
+ulint
+srv_release_threads(
+/*================*/
+	enum srv_thread_type	type,	/*!< in: thread type */
+	ulint			n)	/*!< in: number of threads to release */
+{
+	srv_slot_t*	slot;
+	ulint		slot_no;
+	ulint		n_released	= 0;
+
+	ut_ad(type >= SRV_WORKER);
+	ut_ad(type <= SRV_MASTER);
+	ut_ad(n > 0);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (slot_no = 0; slot_no < OS_THREAD_MAX_N; slot_no++) {
+		slot = srv_table_get_nth_slot(slot_no);
+
+		if (!slot->in_use || slot->type != type || !slot->suspended) {
+			/* Not a suspended thread of the wanted type */
+			continue;
+		}
+
+		slot->suspended = FALSE;
+		srv_n_threads_active[type]++;
+
+		os_event_set(slot->event);
+
+		if (srv_print_thread_releases) {
+			fprintf(stderr,
+				"Releasing thread %lu type %lu"
+				" from slot %lu\n",
+				(ulong) slot->id, (ulong) type,
+				(ulong) slot_no);
+		}
+
+		n_released++;
+
+		if (n_released == n) {
+			break;
+		}
+	}
+
+	return(n_released);
+}
+
+/*********************************************************************//**
+Looks up the type recorded in the thread table slot of the calling thread.
+@return	SRV_COM, ... */
+UNIV_INTERN
+enum srv_thread_type
+srv_get_thread_type(void)
+/*=====================*/
+{
+	srv_slot_t*		slot;
+	enum srv_thread_type	type;
+
+	mutex_enter(&kernel_mutex);
+
+	/* The slot number is kept in the thread local storage */
+
+	slot = srv_table_get_nth_slot(
+		thr_local_get_slot_no(os_thread_get_curr_id()));
+
+	type = slot->type;
+
+	ut_ad(type >= SRV_WORKER);
+	ut_ad(type <= SRV_MASTER);
+
+	mutex_exit(&kernel_mutex);
+
+	return(type);
+}
+
+/*********************************************************************//**
+Initializes the server: thread tables, events and concurrency slots. */
+UNIV_INTERN
+void
+srv_init(void)
+/*==========*/
+{
+	srv_conc_slot_t*	conc_slot;
+	srv_slot_t*		slot;
+	ulint			i;
+
+	srv_sys = mem_alloc(sizeof(srv_sys_t));
+	/* NOTE(review): kernel_mutex presumably names *kernel_mutex_temp */
+	kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
+	mutex_create(&kernel_mutex, SYNC_KERNEL);
+
+	mutex_create(&srv_innodb_monitor_mutex, SYNC_NO_ORDER_CHECK);
+	/* Thread table for server threads; every slot gets its own event */
+	srv_sys->threads = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_table_get_nth_slot(i);
+		slot->in_use = FALSE;
+		slot->type=0;	/* Avoid purify errors */
+		slot->event = os_event_create(NULL);
+		ut_a(slot->event);
+	}
+	/* Slot table for MySQL threads that wait for locks */
+	srv_mysql_table = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_slot_t));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_mysql_table + i;
+		slot->in_use = FALSE;
+		slot->type = 0;
+		slot->event = os_event_create(NULL);
+		ut_a(slot->event);
+	}
+
+	srv_lock_timeout_thread_event = os_event_create(NULL);
+	srv_purge_thread_event = os_event_create(NULL);
+
+	for (i = 0; i < SRV_MASTER + 1; i++) {
+		srv_n_threads_active[i] = 0;
+		srv_n_threads[i] = 0;
+#if 0
+		srv_meter[i] = 30;
+		srv_meter_low_water[i] = 50;
+		srv_meter_high_water[i] = 100;
+		srv_meter_high_water2[i] = 200;
+		srv_meter_foreground[i] = 250;
+#endif
+	}
+
+	UT_LIST_INIT(srv_sys->tasks);
+
+	/* Create dummy indexes for infimum and supremum records */
+
+	dict_ind_init();
+
+	/* Init the server concurrency restriction data structures */
+
+	os_fast_mutex_init(&srv_conc_mutex);
+
+	UT_LIST_INIT(srv_conc_queue);
+
+	srv_conc_slots = mem_alloc(OS_THREAD_MAX_N * sizeof(srv_conc_slot_t));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		conc_slot = srv_conc_slots + i;
+		conc_slot->reserved = FALSE;
+		conc_slot->event = os_event_create(NULL);
+		ut_a(conc_slot->event);
+	}
+
+	/* Initialize some INFORMATION SCHEMA internal structures */
+	trx_i_s_cache_init(trx_i_s_cache);
+}
+
+/*********************************************************************//**
+Frees the memory allocated in srv_init() and resets the global pointers. */
+UNIV_INTERN
+void
+srv_free(void)
+/*==========*/
+{
+	os_fast_mutex_free(&srv_conc_mutex);
+	mem_free(srv_conc_slots);
+	srv_conc_slots = NULL;
+
+	mem_free(srv_sys->threads);
+	mem_free(srv_sys);
+	srv_sys = NULL;
+
+	mem_free(kernel_mutex_temp);
+	kernel_mutex_temp = NULL;
+	mem_free(srv_mysql_table);
+	srv_mysql_table = NULL;
+	/* NOTE: the events created in srv_init() are not freed here */
+	trx_i_s_cache_free(trx_i_s_cache);
+}
+
+/*********************************************************************//**
+Initializes the synchronization primitives, memory system, and the thread
+local storage. srv_boot() calls this before srv_init(). */
+UNIV_INTERN
+void
+srv_general_init(void)
+/*==================*/
+{
+	ut_mem_init();
+	/* Reset the system variables in the recovery module. */
+	recv_sys_var_init();
+	os_sync_init();
+	sync_init();
+	mem_init(srv_mem_pool_size);
+	thr_local_init();
+}
+
+/*======================= InnoDB Server FIFO queue =======================*/
+
+/* Maximum allowable purge history length.  0 (the default) means 'infinite'. */
+UNIV_INTERN ulong	srv_max_purge_lag		= 0;
+
+/*********************************************************************//**
+Puts an OS thread to wait if there are too many concurrent threads
+(>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+
+#ifdef HAVE_ATOMIC_BUILTINS
+/* Declares trx to be inside InnoDB and hands it a fresh set of tickets */
+static void
+enter_innodb_with_tickets(trx_t* trx)
+{
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+}
+/* Timer-based variant of srv_conc_enter_innodb(): yields and sleeps, no queue */
+static void
+srv_conc_enter_innodb_timer_based(trx_t* trx)
+{
+	lint	conc_n_threads;
+	ibool	has_yielded = FALSE;
+	ulint	has_slept = 0;
+	/* An already declared trx is reported but still processed below */
+	if (trx->declared_to_be_inside_innodb) {
+		ut_print_timestamp(stderr);
+		fputs(
+"  InnoDB: Error: trying to declare trx to enter InnoDB, but\n"
+"InnoDB: it already is declared.\n", stderr);
+		trx_print(stderr, trx, 0);
+		putc('\n', stderr);
+	}
+retry:
+	if (srv_conc_n_threads < (lint) srv_thread_concurrency) {
+		conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+		if (conc_n_threads <= (lint) srv_thread_concurrency) {
+			enter_innodb_with_tickets(trx);
+			return;
+		}
+		os_atomic_increment_lint(&srv_conc_n_threads, -1);
+	}
+	if (!has_yielded)
+	{
+		has_yielded = TRUE;
+		os_thread_yield();
+		goto retry;
+	}
+	if (trx->has_search_latch
+	    || NULL != UT_LIST_GET_FIRST(trx->trx_locks)) {
+		/* trx holds a latch or locks: let it in without sleeping */
+		conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+		enter_innodb_with_tickets(trx);
+		return;
+	}
+	if (has_slept < 2)
+	{
+		trx->op_info = "sleeping before entering InnoDB";
+		os_thread_sleep(10000);
+		trx->op_info = "";
+		has_slept++;
+	}	/* no retry after the sleep: trx enters regardless */
+	conc_n_threads = os_atomic_increment_lint(&srv_conc_n_threads, 1);
+	enter_innodb_with_tickets(trx);
+	return;
+}
+
+/* Leaves InnoDB in timer-based mode: one thread less, no tickets left */
+static void
+srv_conc_exit_innodb_timer_based(trx_t* trx)
+{
+	os_atomic_increment_lint(&srv_conc_n_threads, -1);
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+}
+#endif
+
+UNIV_INTERN
+void
+srv_conc_enter_innodb(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	ibool			has_slept = FALSE;
+	srv_conc_slot_t*	slot	  = NULL;
+	ulint			i;
+	ib_uint64_t             start_time = 0L;
+	ib_uint64_t             finish_time = 0L;
+	ulint                   sec;
+	ulint                   ms;	/* microseconds: see sec * 1000000 + ms */
+	/* Replication slave threads are not counted in srv_conc_n_threads */
+	if (trx->mysql_thd != NULL
+	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
+
+		UT_WAIT_FOR(srv_conc_n_threads
+			    < (lint)srv_thread_concurrency,
+			    srv_replication_delay * 1000);
+
+		return;
+	}
+
+	/* If trx has 'free tickets' to enter the engine left, then use one
+	such ticket */
+
+	if (trx->n_tickets_to_enter_innodb > 0) {
+		trx->n_tickets_to_enter_innodb--;
+
+		return;
+	}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	if (srv_thread_concurrency_timer_based) {
+		srv_conc_enter_innodb_timer_based(trx);
+		return;
+	}
+#endif
+	/* Mutex-based admission: srv_conc_mutex guards the counter and queue */
+	os_fast_mutex_lock(&srv_conc_mutex);
+retry:
+	if (trx->declared_to_be_inside_innodb) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: trying to declare trx"
+		      " to enter InnoDB, but\n"
+		      "InnoDB: it already is declared.\n", stderr);
+		trx_print(stderr, trx, 0);
+		putc('\n', stderr);
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	ut_ad(srv_conc_n_threads >= 0);
+
+	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+
+		srv_conc_n_threads++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* If the transaction is not holding resources, let it sleep
+	for SRV_THREAD_SLEEP_DELAY microseconds, and try again then */
+
+	if (!has_slept && !trx->has_search_latch
+	    && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
+
+		has_slept = TRUE; /* We let it sleep only once to avoid
+				  starvation */
+
+		srv_conc_n_waiting_threads++;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		trx->op_info = "sleeping before joining InnoDB queue";
+
+		/* Peter Zaitsev suggested that we take the sleep away
+		altogether. But the sleep may be good in pathological
+		situations of lots of thread switches. Simply put some
+		threads aside for a while to reduce the number of thread
+		switches. */
+		if (SRV_THREAD_SLEEP_DELAY > 0) {
+			os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
+			trx->innodb_que_wait_timer += SRV_THREAD_SLEEP_DELAY;
+		}
+
+		trx->op_info = "";
+
+		os_fast_mutex_lock(&srv_conc_mutex);
+
+		srv_conc_n_waiting_threads--;
+
+		goto retry;
+	}
+
+	/* Too many threads inside: put the current thread to a queue */
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_conc_slots + i;
+
+		if (!slot->reserved) {
+
+			break;
+		}
+	}
+
+	if (i == OS_THREAD_MAX_N) {
+		/* Could not find a free wait slot, we must let the
+		thread enter */
+
+		srv_conc_n_threads++;
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = 0;
+
+		os_fast_mutex_unlock(&srv_conc_mutex);
+
+		return;
+	}
+
+	/* Release possible search system latch this thread has */
+	if (trx->has_search_latch) {
+		trx_search_latch_release_if_reserved(trx);
+	}
+
+	/* Add to the queue */
+	slot->reserved = TRUE;
+	slot->wait_ended = FALSE;
+
+	UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot);
+
+	os_event_reset(slot->event);
+
+	srv_conc_n_waiting_threads++;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+
+	/* Go to wait for the event; when a thread leaves InnoDB it will
+	release this thread */
+	/* Timestamp the wait if innobase_get_slow_log() and trx->take_stats */
+	if (innobase_get_slow_log() && trx->take_stats) {
+		ut_usectime(&sec, &ms);
+		start_time = (ib_uint64_t)sec * 1000000 + ms;
+	} else {
+		start_time = 0;
+	}
+
+	trx->op_info = "waiting in InnoDB queue";
+
+	os_event_wait(slot->event);
+
+	trx->op_info = "";
+
+	if (innobase_get_slow_log() && trx->take_stats && start_time) {
+		ut_usectime(&sec, &ms);
+		finish_time = (ib_uint64_t)sec * 1000000 + ms;
+		trx->innodb_que_wait_timer += (ulint)(finish_time - start_time);
+	}
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	srv_conc_n_waiting_threads--;
+
+	/* NOTE that the thread which released this thread already
+	incremented the thread counter on behalf of this thread */
+
+	slot->reserved = FALSE;
+
+	UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot);
+
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*********************************************************************//**
+This lets a thread enter InnoDB regardless of the number of threads inside
+InnoDB, with one ticket. Must be called when a thread ends a lock wait. */
+UNIV_INTERN
+void
+srv_conc_force_enter_innodb(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	if (UNIV_LIKELY(!srv_thread_concurrency)) {
+		/* srv_thread_concurrency is 0: threads are not counted */
+		return;
+	}
+
+	ut_ad(srv_conc_n_threads >= 0);
+#ifdef HAVE_ATOMIC_BUILTINS
+	if (srv_thread_concurrency_timer_based) {
+		os_atomic_increment_lint(&srv_conc_n_threads, 1);
+		trx->declared_to_be_inside_innodb = TRUE;
+		trx->n_tickets_to_enter_innodb = 1;
+		return;
+	}
+#endif
+	/* Otherwise count the thread under srv_conc_mutex */
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	srv_conc_n_threads++;
+	trx->declared_to_be_inside_innodb = TRUE;
+	trx->n_tickets_to_enter_innodb = 1;
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+}
+
+/*********************************************************************//**
+This must be called when a thread exits InnoDB in a lock wait or at the
+end of an SQL statement. May hand the freed place to a queued thread. */
+UNIV_INTERN
+void
+srv_conc_force_exit_innodb(
+/*=======================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	srv_conc_slot_t*	slot	= NULL;
+
+	if (trx->mysql_thd != NULL
+	    && thd_is_replication_slave_thread(trx->mysql_thd)) {
+		/* Slave threads are never counted by srv_conc_enter_innodb() */
+		return;
+	}
+
+	if (trx->declared_to_be_inside_innodb == FALSE) {
+		/* Not declared inside: there is nothing to undo */
+		return;
+	}
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	if (srv_thread_concurrency_timer_based) {
+		srv_conc_exit_innodb_timer_based(trx);
+		return;
+	}
+#endif
+
+	os_fast_mutex_lock(&srv_conc_mutex);
+
+	ut_ad(srv_conc_n_threads > 0);
+	srv_conc_n_threads--;
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+
+	if (srv_conc_n_threads < (lint)srv_thread_concurrency) {
+		/* Look for a slot where a thread is waiting and no other
+		thread has yet released the thread */
+
+		slot = UT_LIST_GET_FIRST(srv_conc_queue);
+
+		while (slot && slot->wait_ended == TRUE) {
+			slot = UT_LIST_GET_NEXT(srv_conc_queue, slot);
+		}
+
+		if (slot != NULL) {
+			slot->wait_ended = TRUE;
+
+			/* We increment the count on behalf of the released
+			thread */
+
+			srv_conc_n_threads++;
+		}
+	}
+
+	os_fast_mutex_unlock(&srv_conc_mutex);
+	/* Wake the chosen waiter only after srv_conc_mutex is released */
+	if (slot != NULL) {
+		os_event_set(slot->event);
+	}
+}
+
+/*********************************************************************//**
+Called when a thread exits InnoDB. The exit is only recorded once trx has
+used up its tickets. */
+UNIV_INTERN
+void
+srv_conc_exit_innodb(
+/*=================*/
+	trx_t*	trx)	/*!< in: transaction object associated with the
+			thread */
+{
+	/* While trx still holds tickets we pretend that the thread stays
+	inside InnoDB although it is leaving the engine now; this saves
+	a lot of semaphore operations. The thread is declared definitely
+	outside by srv_conc_force_exit_innodb, which should be called
+	when there is a lock wait or an SQL statement ends. */
+
+	if (trx->n_tickets_to_enter_innodb <= 0) {
+
+		/* No tickets left: really leave */
+		srv_conc_force_exit_innodb(trx);
+	}
+}
+
+/*========================================================================*/
+
+/*********************************************************************//**
+Converts the init parameter values to the units used inside InnoDB.
+@return	DB_SUCCESS or error code */
+static
+ulint
+srv_normalize_init_values(void)
+/*===========================*/
+{
+	ulint	n_files	= srv_n_data_files;
+	ulint	file_no;
+
+	/* Data file sizes: multiplied by (1024 * 1024) / UNIV_PAGE_SIZE */
+	for (file_no = 0; file_no < n_files; file_no++) {
+		srv_data_file_sizes[file_no] *= (1024 * 1024) / UNIV_PAGE_SIZE;
+	}
+
+	srv_last_file_size_max *= (1024 * 1024) / UNIV_PAGE_SIZE;
+
+	/* Log sizes: divided by UNIV_PAGE_SIZE */
+
+	srv_log_file_size = srv_log_file_size / UNIV_PAGE_SIZE;
+	srv_log_buffer_size = srv_log_buffer_size / UNIV_PAGE_SIZE;
+
+	/* The lock table size is derived from the buffer pool size */
+
+	srv_lock_table_size = 5 * (srv_buf_pool_size / UNIV_PAGE_SIZE);
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Boots the InnoDB server: normalizes the init parameter values, then runs
+srv_general_init() and srv_init().
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+srv_boot(void)
+/*==========*/
+{
+	ulint	err;
+
+	/* First bring the init parameter values given by MySQL to the
+	units we use inside InnoDB */
+
+	err = srv_normalize_init_values();
+
+	if (err == DB_SUCCESS) {
+
+		/* Synchronization primitives, memory management, and
+		thread local storage come next */
+
+		srv_general_init();
+
+		/* Finally the data structures of this module */
+
+		srv_init();
+	}
+
+	return(err);
+}
+
+/*********************************************************************//**
+Finds a free slot in srv_mysql_table and reserves it for the current MySQL
+OS thread. NOTE! The kernel mutex has to be reserved by the caller!
+@return	reserved slot */
+static
+srv_slot_t*
+srv_table_reserve_slot_for_mysql(void)
+/*==================================*/
+{
+	srv_slot_t*	slot;
+	ulint		i;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+		slot = srv_mysql_table + i;
+
+		if (!slot->in_use) {
+			break;
+		}
+	}
+
+	if (i >= OS_THREAD_MAX_N) {
+		/* Every slot is taken: report, dump the table, ut_error */
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: There appear to be %lu MySQL"
+			" threads currently waiting\n"
+			"InnoDB: inside InnoDB, which is the"
+			" upper limit. Cannot continue operation.\n"
+			"InnoDB: We intentionally generate"
+			" a seg fault to print a stack trace\n"
+			"InnoDB: on Linux. But first we print"
+			" a list of waiting threads.\n", (ulong) i);
+
+		for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+			slot = srv_mysql_table + i;
+
+			fprintf(stderr,
+				"Slot %lu: thread id %lu, type %lu,"
+				" in use %lu, susp %lu, time %lu\n",
+				(ulong) i,
+				(ulong) os_thread_pf(slot->id),
+				(ulong) slot->type,
+				(ulong) slot->in_use,
+				(ulong) slot->suspended,
+				(ulong) difftime(ut_time(),
+						 slot->suspend_time));
+		}
+
+		ut_error;
+	}
+
+	ut_a(slot->in_use == FALSE);
+
+	slot->in_use = TRUE;
+	slot->id = os_thread_get_curr_id();
+	slot->handle = os_thread_get_curr();
+
+	return(slot);
+}
+
+/***************************************************************//**
+Puts a MySQL OS thread to wait for a lock to be released. If an error
+occurs during the wait trx->error_state associated with thr is
+!= DB_SUCCESS when we return: DB_DEADLOCK if selective deadlock
+resolution chose this transaction as a victim, DB_LOCK_WAIT_TIMEOUT
+if the wait exceeded the lock wait timeout, or DB_INTERRUPTED. */
+UNIV_INTERN
+void
+srv_suspend_mysql_thread(
+/*=====================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the MySQL
+				OS thread */
+{
+	srv_slot_t*	slot;
+	os_event_t	event;
+	double		wait_time;
+	trx_t*		trx;
+	ulint		had_dict_lock;
+	ibool		was_declared_inside_innodb	= FALSE;
+	ib_int64_t	start_time			= 0;
+	ib_int64_t	finish_time;
+	ulint		diff_time;
+	ulint		sec;
+	ulint		ms;	/* microseconds: see sec * 1000000 + ms */
+	ulong		lock_wait_timeout;
+
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	trx = thr_get_trx(thr);
+
+	os_event_set(srv_lock_timeout_thread_event);
+
+	mutex_enter(&kernel_mutex);
+
+	trx->error_state = DB_SUCCESS;
+
+	if (thr->state == QUE_THR_RUNNING) {
+
+		ut_ad(thr->is_active == TRUE);
+
+		/* The lock has already been released or this transaction
+		was chosen as a deadlock victim: no need to suspend */
+
+		if (trx->was_chosen_as_deadlock_victim) {
+
+			trx->error_state = DB_DEADLOCK;
+			trx->was_chosen_as_deadlock_victim = FALSE;
+		}
+
+		mutex_exit(&kernel_mutex);
+
+		return;
+	}
+
+	ut_ad(thr->is_active == FALSE);
+	/* Take a slot in srv_mysql_table; its event is what we sleep on */
+	slot = srv_table_reserve_slot_for_mysql();
+
+	event = slot->event;
+
+	slot->thr = thr;
+
+	os_event_reset(event);
+
+	slot->suspend_time = ut_time();
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		srv_n_lock_wait_count++;
+		srv_n_lock_wait_current_count++;
+
+		if (ut_usectime(&sec, &ms) == -1) {
+			start_time = -1;
+		} else {
+			start_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+	}
+	/* Wake the lock timeout monitor thread, if it is suspended */
+
+	os_event_set(srv_lock_timeout_thread_event);
+
+	mutex_exit(&kernel_mutex);
+
+	if (trx->declared_to_be_inside_innodb) {
+
+		was_declared_inside_innodb = TRUE;
+
+		/* We must declare this OS thread to exit InnoDB, since a
+		possible other thread holding a lock which this thread waits
+		for must be allowed to enter, sooner or later */
+
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	had_dict_lock = trx->dict_operation_lock_mode;
+
+	switch (had_dict_lock) {
+	case RW_S_LATCH:
+		/* Release foreign key check latch */
+		row_mysql_unfreeze_data_dictionary(trx);
+		break;
+	case RW_X_LATCH:
+		/* Release fast index creation latch */
+		row_mysql_unlock_data_dictionary(trx);
+		break;
+	}
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	/* Suspend this thread and wait for the event. */
+
+	os_event_wait(event);
+
+	/* After resuming, reacquire the data dictionary latch if
+	necessary. */
+
+	switch (had_dict_lock) {
+	case RW_S_LATCH:
+		row_mysql_freeze_data_dictionary(trx);
+		break;
+	case RW_X_LATCH:
+		row_mysql_lock_data_dictionary(trx);
+		break;
+	}
+
+	if (was_declared_inside_innodb) {
+
+		/* Return back inside InnoDB */
+
+		srv_conc_force_enter_innodb(trx);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	/* Release the slot for others to use */
+
+	slot->in_use = FALSE;
+
+	wait_time = ut_difftime(ut_time(), slot->suspend_time);
+
+	if (thr->lock_state == QUE_THR_LOCK_ROW) {
+		if (ut_usectime(&sec, &ms) == -1) {
+			finish_time = -1;
+		} else {
+			finish_time = (ib_int64_t) sec * 1000000 + ms;
+		}
+
+		srv_n_lock_wait_current_count--;
+		/* Account the wait only if both timestamps are valid and
+		the clock did not step back. See Bug#36819. */
+		if (start_time != -1 && finish_time >= start_time) {
+			diff_time = (ulint) (finish_time - start_time);
+			srv_n_lock_wait_time += diff_time;
+			if (diff_time > srv_n_lock_max_wait_time) {
+				srv_n_lock_max_wait_time = diff_time;
+			}
+		}
+	}
+
+	if (trx->was_chosen_as_deadlock_victim) {
+
+		trx->error_state = DB_DEADLOCK;
+		trx->was_chosen_as_deadlock_victim = FALSE;
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	/* InnoDB system transactions (such as the purge, and
+	incomplete transactions that are being rolled back after crash
+	recovery) will use the global value of
+	innodb_lock_wait_timeout, because trx->mysql_thd == NULL. */
+	lock_wait_timeout = thd_lock_wait_timeout(trx->mysql_thd);
+
+	if (lock_wait_timeout < 100000000
+	    && wait_time > (double) lock_wait_timeout) {
+
+		trx->error_state = DB_LOCK_WAIT_TIMEOUT;
+	}
+
+	if (trx_is_interrupted(trx)) {
+
+		trx->error_state = DB_INTERRUPTED;
+	}
+}
+
+/********************************************************************//**
+Wakes up the MySQL OS thread that is waiting in srv_mysql_table for thr, if
+there is one; otherwise does nothing. */
+UNIV_INTERN
+void
+srv_release_mysql_thread_if_suspended(
+/*==================================*/
+	que_thr_t*	thr)	/*!< in: query thread associated with the
+				MySQL OS thread	 */
+{
+	srv_slot_t*	slot;
+	srv_slot_t*	end;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	slot = srv_mysql_table;
+	end = slot + OS_THREAD_MAX_N;
+
+	/* Only the first in-use slot holding thr gets its event set */
+
+	for (; slot < end; slot++) {
+
+		if (slot->in_use && slot->thr == thr) {
+
+			os_event_set(slot->event);
+
+			break;
+		}
+	}
+}
+
+/******************************************************************//**
+Resets the baseline (time and counters) of the per-second averages. */
+static
+void
+srv_refresh_innodb_monitor_stats(void)
+/*==================================*/
+{
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	srv_last_monitor_time = time(NULL);
+
+	os_aio_refresh_stats();
+	/* Snapshot the counters; rates are computed from the differences */
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	log_refresh_stats();
+
+	buf_refresh_io_stats();
+	/* Row operation counters */
+	srv_n_rows_inserted_old = srv_n_rows_inserted;
+	srv_n_rows_updated_old = srv_n_rows_updated;
+	srv_n_rows_deleted_old = srv_n_rows_deleted;
+	srv_n_rows_read_old = srv_n_rows_read;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/******************************************************************//**
+Outputs to a file the output of the InnoDB Monitor. As a side effect the
+"_old" counter snapshots used for the per-second averages are refreshed.
+@return FALSE if not all information printed due to failure to obtain mutex */
+UNIV_INTERN
+ibool
+srv_printf_innodb_monitor(
+/*======================*/
+	FILE*	file,		/*!< in: output stream */
+	ibool	nowait,		/*!< in: whether to wait for kernel mutex */
+	ulint*	trx_start,	/*!< out: file position of the start of
+				the list of active transactions */
+	ulint*	trx_end)	/*!< out: file position of the end of
+				the list of active transactions */
+{
+	double	time_elapsed;
+	time_t	current_time;
+	ulint	n_reserved;
+	ibool	ret;
+
+	ulint	btr_search_sys_subtotal;
+	ulint	lock_sys_subtotal;
+	ulint	recv_sys_subtotal;
+
+	ulint	i;
+	trx_t*	trx;
+
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	current_time = time(NULL);
+
+	/* We add 0.001 seconds to time_elapsed to prevent division
+	by zero if two users happen to call SHOW INNODB STATUS at the same
+	time */
+
+	time_elapsed = difftime(current_time, srv_last_monitor_time)
+		+ 0.001;
+
+	srv_last_monitor_time = time(NULL);
+
+	fputs("\n=====================================\n", file);
+
+	ut_print_timestamp(file);
+	fprintf(file,
+		" INNODB MONITOR OUTPUT\n"
+		"=====================================\n"
+		"Per second averages calculated from the last %lu seconds\n",
+		(ulong)time_elapsed);
+
+	fputs("-----------------\n"
+	      "BACKGROUND THREAD\n"
+	      "-----------------\n", file);
+	srv_print_master_thread_info(file);
+
+	fputs("----------\n"
+	      "SEMAPHORES\n"
+	      "----------\n", file);
+	sync_print(file);
+
+	/* Conceptually, srv_innodb_monitor_mutex has a very high latching
+	order level in sync0sync.h, while dict_foreign_err_mutex has a very
+	low level 135. Therefore we can reserve the latter mutex here without
+	a danger of a deadlock of threads. */
+
+	mutex_enter(&dict_foreign_err_mutex);
+
+	if (ftell(dict_foreign_err_file) != 0L) {
+		fputs("------------------------\n"
+		      "LATEST FOREIGN KEY ERROR\n"
+		      "------------------------\n", file);
+		ut_copy_file(file, dict_foreign_err_file);
+	}
+
+	mutex_exit(&dict_foreign_err_mutex);
+
+	fputs("--------\n"
+	      "FILE I/O\n"
+	      "--------\n", file);
+	os_aio_print(file);
+
+	fputs("-------------------------------------\n"
+	      "INSERT BUFFER AND ADAPTIVE HASH INDEX\n"
+	      "-------------------------------------\n", file);
+	ibuf_print(file);
+
+	ha_print_info(file, btr_search_sys->hash_index);
+
+	fprintf(file,
+		"%.2f hash searches/s, %.2f non-hash searches/s\n",
+		(btr_cur_n_sea - btr_cur_n_sea_old)
+		/ time_elapsed,
+		(btr_cur_n_non_sea - btr_cur_n_non_sea_old)
+		/ time_elapsed);
+	btr_cur_n_sea_old = btr_cur_n_sea;
+	btr_cur_n_non_sea_old = btr_cur_n_non_sea;
+
+	fputs("---\n"
+	      "LOG\n"
+	      "---\n", file);
+	log_print(file);
+
+	fputs("----------------------\n"
+	      "BUFFER POOL AND MEMORY\n"
+	      "----------------------\n", file);
+	fprintf(file,
+			"Total memory allocated " ULINTPF
+			"; in additional pool allocated " ULINTPF "\n",
+			ut_total_allocated_memory,
+			mem_pool_get_reserved(mem_comm_pool));
+	/* Calculate reserved memory; never touch btr_search_sys when it is NULL */
+	btr_search_sys_subtotal = 0;
+	if (btr_search_sys && btr_search_sys->hash_index->heap) {
+		btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
+	} else if (btr_search_sys) {
+		for (i = 0; i < btr_search_sys->hash_index->n_mutexes; i++) {
+			btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
+		}
+	}
+	/* Sum the lock heap sizes of the transactions in mysql_trx_list */
+	lock_sys_subtotal = 0;
+	if (trx_sys) {
+		mutex_enter(&kernel_mutex);
+		trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
+		while (trx) {
+			lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
+			trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+		}
+		mutex_exit(&kernel_mutex);
+	}
+
+	recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
+			? mem_heap_get_size(recv_sys->heap) : 0);
+
+	fprintf(file,
+			"Internal hash tables (constant factor + variable factor)\n"
+			"    Adaptive hash index %lu \t(%lu + %lu)\n"
+			"    Page hash           %lu\n"
+			"    Dictionary cache    %lu \t(%lu + %lu)\n"
+			"    File system         %lu \t(%lu + %lu)\n"
+			"    Lock system         %lu \t(%lu + %lu)\n"
+			"    Recovery system     %lu \t(%lu + %lu)\n"
+			"    Threads             %lu \t(%lu + %lu)\n",
+
+			(ulong) ((btr_search_sys
+				? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
+			+ btr_search_sys_subtotal),
+			(ulong) (btr_search_sys
+				? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
+			(ulong) btr_search_sys_subtotal,
+
+			(ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
+
+			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+						+ dict_sys->table_id_hash->n_cells
+						) * sizeof(hash_cell_t)
+					+ dict_sys->size) : 0),
+			(ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
+							+ dict_sys->table_id_hash->n_cells
+							) * sizeof(hash_cell_t)) : 0),
+			(ulong) (dict_sys ? (dict_sys->size) : 0),
+
+			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
+					+ fil_system_hash_nodes()),
+			(ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
+			(ulong) fil_system_hash_nodes(),
+
+			(ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
+					+ lock_sys_subtotal),
+			(ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
+			(ulong) lock_sys_subtotal,
+
+			(ulong) (((recv_sys && recv_sys->addr_hash)
+						? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
+					+ recv_sys_subtotal),
+			(ulong) ((recv_sys && recv_sys->addr_hash)
+					? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
+			(ulong) recv_sys_subtotal,
+
+			(ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)
+					+ thr_local_hash_nodes()),
+			(ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)),
+			(ulong) thr_local_hash_nodes());
+
+	fprintf(file, "Dictionary memory allocated " ULINTPF "\n",
+		dict_sys->size);
+
+	buf_print_io(file);
+
+	fputs("--------------\n"
+	      "ROW OPERATIONS\n"
+	      "--------------\n", file);
+	fprintf(file, "%ld queries inside InnoDB, %lu queries in queue\n",
+		(long) srv_conc_n_threads,
+		(ulong) srv_conc_n_waiting_threads);
+
+	fprintf(file, "%lu read views open inside InnoDB\n",
+		(ulong) UT_LIST_GET_LEN(trx_sys->view_list));
+
+	n_reserved = fil_space_get_n_reserved_extents(0);
+	if (n_reserved > 0) {
+		fprintf(file,
+			"%lu tablespace extents now reserved for"
+			" B-tree split operations\n",
+			(ulong) n_reserved);
+	}
+
+#ifdef UNIV_LINUX
+	fprintf(file, "Main thread process no. %lu, id %lu, state: %s\n",
+		(ulong) srv_main_thread_process_no,
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#else
+	fprintf(file, "Main thread id %lu, state: %s\n",
+		(ulong) srv_main_thread_id,
+		srv_main_thread_op_info);
+#endif
+	fprintf(file,
+		"Number of rows inserted " ULINTPF
+		", updated " ULINTPF ", deleted " ULINTPF
+		", read " ULINTPF "\n",
+		srv_n_rows_inserted,
+		srv_n_rows_updated,
+		srv_n_rows_deleted,
+		srv_n_rows_read);
+	fprintf(file,
+		"%.2f inserts/s, %.2f updates/s,"
+		" %.2f deletes/s, %.2f reads/s\n",
+		(srv_n_rows_inserted - srv_n_rows_inserted_old)
+		/ time_elapsed,
+		(srv_n_rows_updated - srv_n_rows_updated_old)
+		/ time_elapsed,
+		(srv_n_rows_deleted - srv_n_rows_deleted_old)
+		/ time_elapsed,
+		(srv_n_rows_read - srv_n_rows_read_old)
+		/ time_elapsed);
+
+	srv_n_rows_inserted_old = srv_n_rows_inserted;
+	srv_n_rows_updated_old = srv_n_rows_updated;
+	srv_n_rows_deleted_old = srv_n_rows_deleted;
+	srv_n_rows_read_old = srv_n_rows_read;
+
+	/* Only if lock_print_info_summary proceeds correctly,
+	before we call the lock_print_info_all_transactions
+	to print all the lock information. */
+	ret = lock_print_info_summary(file, nowait);
+
+	if (ret) {
+		if (trx_start) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_start = ULINT_UNDEFINED;
+			} else {
+				*trx_start = (ulint) t;
+			}
+		}
+		lock_print_info_all_transactions(file);
+		if (trx_end) {
+			long	t = ftell(file);
+			if (t < 0) {
+				*trx_end = ULINT_UNDEFINED;
+			} else {
+				*trx_end = (ulint) t;
+			}
+		}
+	}
+
+	fputs("----------------------------\n"
+	      "END OF INNODB MONITOR OUTPUT\n"
+	      "============================\n", file);
+	mutex_exit(&srv_innodb_monitor_mutex);
+	fflush(file);
+
+	return(ret);
+}
+
+/******************************************************************//**
+Function to pass InnoDB status variables to MySQL: copies the current
+counter values into export_vars while holding srv_innodb_monitor_mutex. */
+UNIV_INTERN
+void
+srv_export_innodb_status(void)
+/*==========================*/
+{
+	mutex_enter(&srv_innodb_monitor_mutex);
+
+	/* innodb_data_*: file i/o counters */
+	export_vars.innodb_data_pending_reads = os_n_pending_reads;
+	export_vars.innodb_data_pending_writes = os_n_pending_writes;
+	export_vars.innodb_data_pending_fsyncs =
+		fil_n_pending_log_flushes + fil_n_pending_tablespace_flushes;
+	export_vars.innodb_data_fsyncs = os_n_fsyncs;
+	export_vars.innodb_data_read = srv_data_read;
+	export_vars.innodb_data_reads = os_n_file_reads;
+	export_vars.innodb_data_writes = os_n_file_writes;
+	export_vars.innodb_data_written = srv_data_written;
+
+	export_vars.innodb_dict_tables =
+		dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0;
+
+	/* innodb_buffer_pool_*: buffer pool counters and list lengths */
+	export_vars.innodb_buffer_pool_read_requests = buf_pool->stat.n_page_gets;
+	export_vars.innodb_buffer_pool_write_requests = srv_buf_pool_write_requests;
+	export_vars.innodb_buffer_pool_wait_free = srv_buf_pool_wait_free;
+	export_vars.innodb_buffer_pool_pages_flushed = srv_buf_pool_flushed;
+	export_vars.innodb_buffer_pool_reads = srv_buf_pool_reads;
+	export_vars.innodb_buffer_pool_read_ahead = buf_pool->stat.n_ra_pages_read;
+	export_vars.innodb_buffer_pool_read_ahead_evicted =
+		buf_pool->stat.n_ra_pages_evicted;
+	export_vars.innodb_buffer_pool_pages_data = UT_LIST_GET_LEN(buf_pool->LRU);
+	export_vars.innodb_buffer_pool_pages_dirty =
+		UT_LIST_GET_LEN(buf_pool->flush_list);
+	export_vars.innodb_buffer_pool_pages_free = UT_LIST_GET_LEN(buf_pool->free);
+	export_vars.innodb_deadlocks = srv_n_lock_deadlock_count;
+#ifdef UNIV_DEBUG
+	export_vars.innodb_buffer_pool_pages_latched =
+		buf_get_latched_pages_number();
+#endif /* UNIV_DEBUG */
+	export_vars.innodb_buffer_pool_pages_total = buf_pool->curr_size;
+	export_vars.innodb_buffer_pool_pages_misc =
+		buf_pool->curr_size - UT_LIST_GET_LEN(buf_pool->LRU)
+		- UT_LIST_GET_LEN(buf_pool->free);
+
+#ifdef HAVE_ATOMIC_BUILTINS
+	export_vars.innodb_have_atomic_builtins = 1;
+#else
+	export_vars.innodb_have_atomic_builtins = 0;
+#endif
+	export_vars.innodb_page_size = UNIV_PAGE_SIZE;
+
+	/* innodb_log_* and innodb_os_log_*: log counters */
+	export_vars.innodb_log_waits = srv_log_waits;
+	export_vars.innodb_os_log_written = srv_os_log_written;
+	export_vars.innodb_os_log_fsyncs = fil_n_log_flushes;
+	export_vars.innodb_os_log_pending_fsyncs = fil_n_pending_log_flushes;
+	export_vars.innodb_os_log_pending_writes = srv_os_log_pending_writes;
+	export_vars.innodb_log_write_requests = srv_log_write_requests;
+	export_vars.innodb_log_writes = srv_log_writes;
+
+	/* innodb_dblwr_* and innodb_pages_*: page write/read counters */
+	export_vars.innodb_dblwr_pages_written = srv_dblwr_pages_written;
+	export_vars.innodb_dblwr_writes = srv_dblwr_writes;
+	export_vars.innodb_pages_created = buf_pool->stat.n_pages_created;
+	export_vars.innodb_pages_read = buf_pool->stat.n_pages_read;
+	export_vars.innodb_pages_written = buf_pool->stat.n_pages_written;
+
+	/* innodb_row_lock_*: wait times are exported divided by 1000 */
+	export_vars.innodb_row_lock_waits = srv_n_lock_wait_count;
+	export_vars.innodb_row_lock_current_waits = srv_n_lock_wait_current_count;
+	export_vars.innodb_row_lock_time = srv_n_lock_wait_time / 1000;
+	export_vars.innodb_row_lock_time_avg = (srv_n_lock_wait_count > 0)
+		? (ulint) (srv_n_lock_wait_time / 1000 / srv_n_lock_wait_count) : 0;
+	export_vars.innodb_row_lock_time_max = srv_n_lock_max_wait_time / 1000;
+
+	/* innodb_rows_*: row operation counters */
+	export_vars.innodb_rows_read = srv_n_rows_read;
+	export_vars.innodb_rows_inserted = srv_n_rows_inserted;
+	export_vars.innodb_rows_updated = srv_n_rows_updated;
+	export_vars.innodb_rows_deleted = srv_n_rows_deleted;
+
+	mutex_exit(&srv_innodb_monitor_mutex);
+}
+
+/*********************************************************************//**
+A thread which prints the info output by various InnoDB monitors.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_monitor_thread(
+/*===============*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	double		time_elapsed;
+	time_t		current_time;
+	time_t		last_table_monitor_time;
+	time_t		last_tablespace_monitor_time;
+	time_t		last_monitor_time;
+	ulint		mutex_skipped;
+	ibool		last_srv_print_monitor;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+	UT_NOT_USED(arg);
+	srv_last_monitor_time = time(NULL);
+	last_table_monitor_time = time(NULL);
+	last_tablespace_monitor_time = time(NULL);
+	last_monitor_time = time(NULL);
+	mutex_skipped = 0;
+	last_srv_print_monitor = srv_print_innodb_monitor;
+loop:
+	srv_monitor_active = TRUE;
+
+	/* Wake up every 5 seconds to see if we need to print
+	monitor information. */
+
+	os_thread_sleep(5000000);
+
+	current_time = time(NULL);
+
+	time_elapsed = difftime(current_time, last_monitor_time);
+
+	if (time_elapsed > 15) {
+		last_monitor_time = time(NULL);
+
+		if (srv_print_innodb_monitor) {
+			/* Reset mutex_skipped counter everytime
+			srv_print_innodb_monitor changes. This is to
+			ensure we will not be blocked by kernel_mutex
+			for short duration information printing,
+			such as requested by sync_array_print_long_waits() */
+			if (!last_srv_print_monitor) {
+				mutex_skipped = 0;
+				last_srv_print_monitor = TRUE;
+			}
+
+			if (!srv_printf_innodb_monitor(stderr,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				/* Reset the counter */
+				mutex_skipped = 0;
+			}
+		} else {
+			last_srv_print_monitor = FALSE;
+		}
+
+		/* Rewrite srv_monitor_file from the start with a fresh report. */
+		if (srv_innodb_status) {
+			mutex_enter(&srv_monitor_file_mutex);
+			rewind(srv_monitor_file);
+			if (!srv_printf_innodb_monitor(srv_monitor_file,
+						MUTEX_NOWAIT(mutex_skipped),
+						NULL, NULL)) {
+				mutex_skipped++;
+			} else {
+				mutex_skipped = 0;
+			}
+
+			os_file_set_eof(srv_monitor_file);
+			mutex_exit(&srv_monitor_file_mutex);
+		}
+
+		if (srv_print_innodb_tablespace_monitor
+		    && difftime(current_time,
+				last_tablespace_monitor_time) > 60) {
+			last_tablespace_monitor_time = time(NULL);
+
+			fputs("========================"
+			      "========================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "========================"
+			      "========================\n",
+			      stderr);
+
+			fsp_print(0);
+			fputs("Validating tablespace\n", stderr);
+			fsp_validate(0);
+			fputs("Validation ok\n"
+			      "---------------------------------------\n"
+			      "END OF INNODB TABLESPACE MONITOR OUTPUT\n"
+			      "=======================================\n",
+			      stderr);
+		}
+
+		if (srv_print_innodb_table_monitor
+		    && difftime(current_time, last_table_monitor_time) > 60) {
+
+			last_table_monitor_time = time(NULL);
+
+			fputs("===========================================\n",
+			      stderr);
+
+			ut_print_timestamp(stderr);
+
+			fputs(" INNODB TABLE MONITOR OUTPUT\n"
+			      "===========================================\n",
+			      stderr);
+			dict_print();
+
+			fputs("-----------------------------------\n"
+			      "END OF INNODB TABLE MONITOR OUTPUT\n"
+			      "==================================\n",
+			      stderr);
+		}
+	}
+
+	if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+		goto exit_func;
+	}
+
+	if (srv_print_innodb_monitor
+	    || srv_print_innodb_lock_monitor
+	    || srv_print_innodb_tablespace_monitor
+	    || srv_print_innodb_table_monitor) {
+		goto loop;
+	}
+
+	srv_monitor_active = FALSE;
+
+	goto loop;
+
+exit_func:
+	srv_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which wakes up threads whose lock wait may have lasted too long.
+Once per second it scans the wait slots and cancels the pending lock
+request of every waiting transaction that was interrupted or timed out.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_lock_timeout_thread(
+/*====================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	for (;;) {
+		ibool	found_waiter	= FALSE;
+		ulint	n;
+
+		/* When someone is waiting for a lock, we wake up every
+		second and check if a timeout has passed for a lock wait */
+
+		os_thread_sleep(1000000);
+
+		srv_lock_timeout_active = TRUE;
+
+		mutex_enter(&kernel_mutex);
+
+		/* Look at every slot: if a thread is waiting there,
+		see whether its transaction was interrupted or has
+		exceeded the time limit */
+
+		for (n = 0; n < OS_THREAD_MAX_N; n++) {
+			srv_slot_t*	waiter = srv_mysql_table + n;
+			trx_t*		trx;
+			ulong		timeout;
+			double		waited;
+			ibool		expired;
+
+			if (!waiter->in_use) {
+				continue;
+			}
+
+			found_waiter = TRUE;
+
+			waited = ut_difftime(ut_time(), waiter->suspend_time);
+
+			trx = thr_get_trx(waiter->thr);
+			timeout = thd_lock_wait_timeout(trx->mysql_thd);
+
+			/* The wait has expired when the timeout is
+			exceeded or the system time counter wrapped
+			around (negative wait). A timeout setting of
+			100000000 or more is never treated as expired. */
+
+			expired = timeout < 100000000
+				&& (waited > (double) timeout || waited < 0);
+
+			/* Cancel the lock request queued by the transaction
+			and release possible other transactions waiting
+			behind; it is possible that the lock has already
+			been granted: in that case do nothing */
+
+			if ((trx_is_interrupted(trx) || expired)
+			    && trx->wait_lock) {
+				lock_cancel_waiting_and_release(
+					trx->wait_lock);
+			}
+		}
+
+		os_event_reset(srv_lock_timeout_thread_event);
+
+		mutex_exit(&kernel_mutex);
+
+		if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+			break;
+		}
+
+		if (found_waiter) {
+			continue;
+		}
+
+		srv_lock_timeout_active = FALSE;
+
+#if 0
+		/* The following synchronisation is disabled, since
+		the InnoDB monitor output is to be updated every 15 seconds. */
+		os_event_wait(srv_lock_timeout_thread_event);
+#endif
+	}
+
+	srv_lock_timeout_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which once per second checks for semaphore waits which have
+lasted too long and prints warnings about them. These can be used to
+track bugs which cause hangs. The pass also does periodic statistics upkeep.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_error_monitor_thread(
+/*=====================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	ulint		n_fatal		= 0;	/* successive fatal timeouts seen */
+	ib_uint64_t	lsn_prev	= srv_start_lsn;
+	ib_uint64_t	lsn_now;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Error monitor thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+	/* Repeat until shutdown has reached SRV_SHUTDOWN_CLEANUP */
+	do {
+		srv_error_monitor_active = TRUE;
+
+		/* Try to track a strange bug reported by Harald Fuchs and
+		others, where the lsn seems to decrease at times */
+
+		lsn_now = log_get_lsn();
+
+		if (lsn_now < lsn_prev) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Error: old log sequence number %llu"
+				" was greater\n"
+				"InnoDB: than the new log sequence number %llu!\n"
+				"InnoDB: Please submit a bug report"
+				" to http://bugs.mysql.com\n",
+				lsn_prev, lsn_now);
+		}
+
+		lsn_prev = lsn_now;
+
+		if (difftime(time(NULL), srv_last_monitor_time) > 60) {
+			/* Refresh the InnoDB Monitor values so that
+			averages are printed from at most 60 last seconds */
+
+			srv_refresh_innodb_monitor_stats();
+		}
+
+		/* Update the statistics collected for deciding LRU
+		eviction policy. */
+
+		buf_LRU_stat_update();
+
+		/* Update the statistics collected for flush rate policy. */
+
+		buf_flush_stat_update();
+
+		/* In case mutex_exit is not a memory barrier, it is
+		theoretically possible some threads are left waiting though
+		the semaphore is already released. Wake up those threads: */
+
+		sync_arr_wake_threads_if_sema_free();
+
+		/* n_fatal counts the successive passes in which
+		sync_array_print_long_waits() returned nonzero; a pass
+		where it returns zero starts the count over. */
+
+		if (!sync_array_print_long_waits()) {
+			n_fatal = 0;
+		} else if (++n_fatal > 10) {
+
+			fprintf(stderr,
+				"InnoDB: Error: semaphore wait has lasted"
+				" > %lu seconds\n"
+				"InnoDB: We intentionally crash the server,"
+				" because it appears to be hung.\n",
+				(ulong) srv_fatal_semaphore_wait_threshold);
+
+			ut_error;
+		}
+
+		/* Flush stderr so that a database user gets the output
+		to possible MySQL error file */
+
+		fflush(stderr);
+
+		os_thread_sleep(1000000);
+
+	} while (srv_shutdown_state < SRV_SHUTDOWN_CLEANUP);
+
+	srv_error_monitor_active = FALSE;
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*********************************************************************//**
+A thread which restores the buffer pool from a dump file on startup (when
+srv_auto_lru_dump is set) and then does periodic buffer pool dumps.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_LRU_dump_restore_thread(
+/*====================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	time_t	dumped_at;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "LRU dump/restore thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+	if (srv_auto_lru_dump) {
+		buf_LRU_file_restore();
+	}
+
+	dumped_at = time(NULL);
+
+	for (;;) {
+		uint	interval;
+		time_t	since_dump;
+
+		os_thread_sleep(5000000);
+
+		if (srv_shutdown_state >= SRV_SHUTDOWN_CLEANUP) {
+			break;
+		}
+		/* Dump once the time since the last dump exceeds a nonzero setting */
+		since_dump = time(NULL) - dumped_at;
+		interval = srv_auto_lru_dump;
+		if (interval > 0 && (time_t) interval < since_dump) {
+			dumped_at = time(NULL);
+			buf_LRU_file_dump();
+		}
+	}
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*******************************************************************//**
+Records that there has been activity in the database and, if the master
+thread is suspended (not sleeping), wakes it up. Used in the MySQL
+interface. Note that there is a small chance that the master thread stays
+suspended, because the check below is done without holding the kernel
+mutex, for performance reasons. */
+UNIV_INTERN
+void
+srv_active_wake_master_thread(void)
+/*===============================*/
+{
+	srv_activity_count++;
+
+	if (srv_n_threads_active[SRV_MASTER] != 0) {
+		/* The master thread is counted as active: nothing to release */
+		return;
+	}
+
+	mutex_enter(&kernel_mutex);
+	srv_release_threads(SRV_MASTER, 1);
+	mutex_exit(&kernel_mutex);
+}
+
+/*******************************************************************//**
+Wakes up the master thread if it is suspended or being suspended. Also bumps srv_activity_count. */
+UNIV_INTERN
+void
+srv_wake_master_thread(void)
+/*========================*/
+{
+	srv_activity_count++;
+	/* No check of srv_n_threads_active here: the release is unconditional */
+	mutex_enter(&kernel_mutex);
+	/* srv_release_threads() is called with kernel_mutex held */
+	srv_release_threads(SRV_MASTER, 1);
+
+	mutex_exit(&kernel_mutex);
+}
+
+/******************************************************************//**
+Called by the master thread so that the log file is flushed about once
+per second in the background: not more than one second of trxs are lost
+in case of crash when innodb_flush_logs_at_trx_commit != 1 */
+static
+void
+srv_sync_log_buffer_in_background(void)
+/*===================================*/
+{
+	time_t	now = time(NULL);
+
+	srv_main_thread_op_info = "flushing log";
+	if (difftime(now, srv_last_log_flush_time) < 1) {
+		return;
+	}
+	log_buffer_sync_in_background(TRUE);
+	srv_last_log_flush_time = now;
+	srv_log_writes_and_flush++;
+}
+
+/*********************************************************************//**
+The master thread controlling the server.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+srv_master_thread(
+/*==============*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	os_event_t	event;
+	ulint		old_activity_count;
+	ulint		n_pages_purged	= 0;
+	ulint		n_bytes_merged;
+	ulint		n_pages_flushed;
+	ulint		n_bytes_archived;
+	ulint		n_tables_to_drop;
+	ulint		n_ios;
+	ulint		n_ios_old;
+	ulint		n_ios_very_old;
+	ulint		n_pend_ios;
+	ibool		skip_sleep	= FALSE;
+	ulint		i;
+
+	ib_uint64_t	lsn_old;
+
+	ib_uint64_t	oldest_lsn;
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Master thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+	srv_main_thread_process_no = os_proc_get_number();
+	srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+
+	mutex_enter(&kernel_mutex);
+
+	srv_table_reserve_slot(SRV_MASTER);
+	srv_n_threads_active[SRV_MASTER]++;
+
+	mutex_exit(&kernel_mutex);
+
+	mutex_enter(&(log_sys->mutex));
+	lsn_old = log_sys->lsn;
+	mutex_exit(&(log_sys->mutex));
+loop:
+	/*****************************************************************/
+	/* ---- When there is database activity by users, we cycle in this
+	loop */
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	n_ios_very_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+		+ buf_pool->stat.n_pages_written;
+	mutex_enter(&kernel_mutex);
+
+	/* Store the user activity counter at the start of this loop */
+	old_activity_count = srv_activity_count;
+
+	mutex_exit(&kernel_mutex);
+
+	if (srv_force_recovery >= SRV_FORCE_NO_BACKGROUND) {
+
+		goto suspend_thread;
+	}
+
+	/* ---- We run the following loop approximately once per second
+	when there is database activity */
+
+	srv_last_log_flush_time = time(NULL);
+	skip_sleep = FALSE;
+
+	for (i = 0; i < 10; i++) {
+		n_ios_old = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+			+ buf_pool->stat.n_pages_written;
+		srv_main_thread_op_info = "sleeping";
+		srv_main_1_second_loops++;
+
+		if (!skip_sleep) {
+
+			os_thread_sleep(1000000);
+			srv_main_sleeps++;
+
+			/*
+			mutex_enter(&(log_sys->mutex));
+			oldest_lsn = buf_pool_get_oldest_modification();
+			ib_uint64_t	lsn = log_sys->lsn;
+			mutex_exit(&(log_sys->mutex));
+
+			if(oldest_lsn)
+			fprintf(stderr,
+				"InnoDB flush: age pct: %lu, lsn progress: %lu\n",
+				(lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
+				lsn - lsn_old);
+			*/
+		}
+
+		skip_sleep = FALSE;
+
+		/* ALTER TABLE in MySQL requires on Unix that the table handler
+		can drop tables lazily after there no longer are SELECT
+		queries to them. */
+
+		srv_main_thread_op_info = "doing background drop tables";
+
+		row_drop_tables_for_mysql_in_background();
+
+		srv_main_thread_op_info = "";
+
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+			goto background_loop;
+		}
+
+		/* Flush logs if needed */
+		srv_sync_log_buffer_in_background();
+
+		srv_main_thread_op_info = "making checkpoint";
+		log_free_check();
+
+		/* If i/os during one second sleep were less than 5% of
+                capacity, we assume that there is free disk i/o capacity
+                available, and it makes sense to do an insert buffer merge. */
+
+		n_pend_ios = buf_get_n_pending_ios()
+			+ log_sys->n_pending_writes;
+		n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+			+ buf_pool->stat.n_pages_written;
+		if (n_pend_ios < SRV_PEND_IO_THRESHOLD
+		    && (n_ios - n_ios_old < SRV_RECENT_IO_ACTIVITY)) {
+			srv_main_thread_op_info = "doing insert buffer merge";
+			ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
+
+			/* Flush logs if needed */
+			srv_sync_log_buffer_in_background();
+		}
+
+		if (UNIV_UNLIKELY(buf_get_modified_ratio_pct()
+				  > srv_max_buf_pool_modified_pct)) {
+
+			/* Try to keep the number of modified pages in the
+			buffer pool under the limit wished by the user */
+
+			srv_main_thread_op_info =
+				"flushing buffer pool pages";
+			n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+							  PCT_IO(100),
+							  IB_ULONGLONG_MAX);
+
+			/* If we had to do the flush, it may have taken
+			even more than 1 second, and also, there may be more
+			to flush. Do not sleep 1 second during the next
+			iteration of this loop. */
+
+			skip_sleep = TRUE;
+
+			mutex_enter(&(log_sys->mutex));
+			lsn_old = log_sys->lsn;
+			mutex_exit(&(log_sys->mutex));
+		} else if (srv_adaptive_flushing) {
+
+			/* Try to keep the rate of flushing of dirty
+			pages such that redo log generation does not
+			produce bursts of IO at checkpoint time. */
+			ulint n_flush = buf_flush_get_desired_flush_rate();
+
+			if (n_flush) {
+				srv_main_thread_op_info =
+					"flushing buffer pool pages";
+				n_flush = ut_min(PCT_IO(100), n_flush);
+				n_pages_flushed =
+					buf_flush_batch(
+						BUF_FLUSH_LIST,
+						n_flush,
+						IB_ULONGLONG_MAX);
+
+				if (n_flush == PCT_IO(100)) {
+					skip_sleep = TRUE;
+				}
+			}
+
+			mutex_enter(&(log_sys->mutex));
+			lsn_old = log_sys->lsn;
+			mutex_exit(&(log_sys->mutex));
+		} else if (srv_adaptive_checkpoint == 1) {
+			/* adaptive_flushing option is prior to adaptive_checkpoint option, for now */
+
+			/* Try to keep modified age not to exceed
+			max_checkpoint_age * 7/8 line */
+
+			mutex_enter(&(log_sys->mutex));
+			lsn_old = log_sys->lsn;
+			oldest_lsn = buf_pool_get_oldest_modification();
+			if (oldest_lsn == 0) {
+
+				mutex_exit(&(log_sys->mutex));
+
+			} else {
+				if ((log_sys->lsn - oldest_lsn)
+				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
+					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
+					/* We should not flush from here. */
+					mutex_exit(&(log_sys->mutex));
+				} else if ((log_sys->lsn - oldest_lsn)
+				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
+
+					/* 2nd defence line (max_checkpoint_age * 3/4) */
+
+					mutex_exit(&(log_sys->mutex));
+
+					n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+									  IB_ULONGLONG_MAX);
+					skip_sleep = TRUE;
+				} else if ((log_sys->lsn - oldest_lsn)
+					   > (log_sys->max_checkpoint_age)/2 ) {
+
+					/* 1st defence line (max_checkpoint_age * 1/2) */
+
+					mutex_exit(&(log_sys->mutex));
+
+					n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+									  IB_ULONGLONG_MAX);
+					skip_sleep = TRUE;
+				} else {
+					mutex_exit(&(log_sys->mutex));
+				}
+			}
+		} else if (srv_adaptive_checkpoint == 2) {
+
+			/* Try to keep modified age not to exceed
+			max_checkpoint_age * 7/8 line */
+
+			mutex_enter(&(log_sys->mutex));
+
+			oldest_lsn = buf_pool_get_oldest_modification();
+			if (oldest_lsn == 0) {
+				lsn_old = log_sys->lsn;
+				mutex_exit(&(log_sys->mutex));
+
+			} else {
+				if ((log_sys->lsn - oldest_lsn)
+				    > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
+					/* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
+					/* We should not flush from here. */
+					lsn_old = log_sys->lsn;
+					mutex_exit(&(log_sys->mutex));
+				} else if ((log_sys->lsn - oldest_lsn)
+					   > (log_sys->max_checkpoint_age)/4 ) {
+
+					/* defence line (max_checkpoint_age * 1/2) */
+					ib_uint64_t	lsn = log_sys->lsn;
+
+					ib_uint64_t level, bpl;
+					buf_page_t* bpage;
+
+					mutex_exit(&(log_sys->mutex));
+
+					mutex_enter(&flush_list_mutex);
+
+					level = 0;
+					bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
+					while (bpage != NULL) {
+						ib_uint64_t	oldest_modification = bpage->oldest_modification;
+						if (oldest_modification != 0) {
+							level += log_sys->max_checkpoint_age
+								 - (lsn - oldest_modification);
+						}
+						bpage = UT_LIST_GET_NEXT(flush_list, bpage);
+					}
+
+					if (level) {
+						bpl = ((ib_uint64_t) UT_LIST_GET_LEN(buf_pool->flush_list)
+							* UT_LIST_GET_LEN(buf_pool->flush_list)
+							* (lsn - lsn_old)) / level;
+					} else {
+						bpl = 0;
+					}
+
+					mutex_exit(&flush_list_mutex);
+
+					if (!srv_use_doublewrite_buf) {
+						/* flush is faster than when doublewrite */
+						bpl = (bpl * 7) / 8;
+					}
+
+					if (bpl) {
+retry_flush_batch:
+						n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+									(ulint) bpl,
+									oldest_lsn + (lsn - lsn_old));
+						if (n_pages_flushed == ULINT_UNDEFINED) {
+							os_thread_sleep(5000);
+							goto retry_flush_batch;
+						}
+					}
+
+					lsn_old = lsn;
+					/*
+					fprintf(stderr,
+						"InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
+						(lsn - oldest_lsn) * 100 / log_sys->max_checkpoint_age,
+						lsn - lsn_old, bpl);
+					*/
+				} else {
+					lsn_old = log_sys->lsn;
+					mutex_exit(&(log_sys->mutex));
+				}
+			}
+
+		} else {
+			mutex_enter(&(log_sys->mutex));
+			lsn_old = log_sys->lsn;
+			mutex_exit(&(log_sys->mutex));
+		}
+
+		if (srv_activity_count == old_activity_count) {
+
+			/* There is no user activity at the moment, go to
+			the background loop */
+
+			goto background_loop;
+		}
+	}
+
+	/* ---- We perform the following code approximately once per
+	10 seconds when there is database activity */
+
+#ifdef MEM_PERIODIC_CHECK
+	/* Check magic numbers of every allocated mem block once in 10
+	seconds */
+	mem_validate_all_blocks();
+#endif
+	/* If i/os during the 10 second period were less than 200% of
+	capacity, we assume that there is free disk i/o capacity
+	available, and it makes sense to flush srv_io_capacity pages.
+
+	Note that this is done regardless of the fraction of dirty
+	pages relative to the max requested by the user. The one second
+	loop above requests writes for that case. The writes done here
+	are not required, and may be disabled. */
+
+	n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+	n_ios = log_sys->n_log_ios + buf_pool->stat.n_pages_read
+		+ buf_pool->stat.n_pages_written;
+
+	srv_main_10_second_loops++;
+	if (n_pend_ios < SRV_PEND_IO_THRESHOLD
+	    && (n_ios - n_ios_very_old < SRV_PAST_IO_ACTIVITY)) {
+
+		srv_main_thread_op_info = "flushing buffer pool pages";
+		buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+				IB_ULONGLONG_MAX);
+
+		/* Flush logs if needed */
+		srv_sync_log_buffer_in_background();
+	}
+
+	/* We run a batch of insert buffer merge every 10 seconds,
+	even if the server were active */
+
+	srv_main_thread_op_info = "doing insert buffer merge";
+	ibuf_contract_for_n_pages(FALSE, PCT_IBUF_IO(5));
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+
+	if (!srv_use_purge_thread) {
+	/* We run a full purge every 10 seconds, even if the server
+	were active */
+	do {
+
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+			goto background_loop;
+		}
+
+		srv_main_thread_op_info = "purging";
+		n_pages_purged = trx_purge();
+
+		/* Flush logs if needed */
+		srv_sync_log_buffer_in_background();
+
+	} while (n_pages_purged);
+	}
+
+	srv_main_thread_op_info = "flushing buffer pool pages";
+
+	/* Flush a few oldest pages to make a new checkpoint younger */
+
+	if (buf_get_modified_ratio_pct() > 70) {
+
+		/* If there are lots of modified pages in the buffer pool
+		(> 70 %), we assume we can afford reserving the disk(s) for
+		the time it requires to flush 100 pages */
+
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+						  PCT_IO(100),
+						  IB_ULONGLONG_MAX);
+	} else {
+		/* Otherwise, we only flush a small number of pages so that
+		we do not unnecessarily use much disk i/o capacity from
+		other work */
+
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+						  PCT_IO(10),
+						  IB_ULONGLONG_MAX);
+	}
+
+	srv_main_thread_op_info = "making checkpoint";
+
+	/* Make a new checkpoint about once in 10 seconds */
+
+	log_checkpoint(TRUE, FALSE);
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	mutex_enter(&kernel_mutex);
+
+	/* ---- When there is database activity, we jump from here back to
+	the start of loop */
+
+	if (srv_activity_count != old_activity_count) {
+		mutex_exit(&kernel_mutex);
+		goto loop;
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	/* If the database is quiet, we enter the background loop */
+
+	/*****************************************************************/
+background_loop:
+	/* ---- In this loop we run background operations when the server
+	is quiet from user activity. Also in the case of a shutdown, we
+	loop here, flushing the buffer pool to the data files. */
+
+	/* The server has been quiet for a while: start running background
+	operations */
+	srv_main_background_loops++;
+	srv_main_thread_op_info = "doing background drop tables";
+
+	n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+
+	if (n_tables_to_drop > 0) {
+		/* Do not monopolize the CPU even if there are tables waiting
+		in the background drop queue. (It is essentially a bug if
+		MySQL tries to drop a table while there are still open handles
+		to it and we had to put it to the background drop queue.) */
+
+		os_thread_sleep(100000);
+	}
+
+	if (!srv_use_purge_thread) {
+	srv_main_thread_op_info = "purging";
+
+	/* Run a full purge */
+	do {
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+
+			break;
+		}
+
+		srv_main_thread_op_info = "purging";
+		n_pages_purged = trx_purge();
+
+		/* Flush logs if needed */
+		srv_sync_log_buffer_in_background();
+
+	} while (n_pages_purged);
+	}
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	mutex_enter(&kernel_mutex);
+	if (srv_activity_count != old_activity_count) {
+		mutex_exit(&kernel_mutex);
+		goto loop;
+	}
+	mutex_exit(&kernel_mutex);
+
+	srv_main_thread_op_info = "doing insert buffer merge";
+
+	if (srv_fast_shutdown && srv_shutdown_state > 0) {
+		n_bytes_merged = 0;
+	} else {
+		/* This should do an amount of IO similar to the number of
+		dirty pages that will be flushed in the call to
+		buf_flush_batch below. Otherwise, the system favors
+		clean pages over cleanup throughput. */
+		n_bytes_merged = ibuf_contract_for_n_pages(FALSE,
+							   PCT_IBUF_IO(100));
+	}
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	mutex_enter(&kernel_mutex);
+	if (srv_activity_count != old_activity_count) {
+		mutex_exit(&kernel_mutex);
+		goto loop;
+	}
+	mutex_exit(&kernel_mutex);
+
+flush_loop:
+	srv_main_thread_op_info = "flushing buffer pool pages";
+	srv_main_flush_loops++;
+	if (srv_fast_shutdown < 2) {
+		n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
+						  PCT_IO(100),
+						  IB_ULONGLONG_MAX);
+	} else {
+		/* In the fastest shutdown we do not flush the buffer pool
+		to data files: we set n_pages_flushed to 0 artificially. */
+
+		n_pages_flushed = 0;
+	}
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	mutex_enter(&kernel_mutex);
+	if (srv_activity_count != old_activity_count) {
+		mutex_exit(&kernel_mutex);
+		goto loop;
+	}
+	mutex_exit(&kernel_mutex);
+
+	srv_main_thread_op_info = "waiting for buffer pool flush to end";
+	buf_flush_wait_batch_end(BUF_FLUSH_LIST);
+
+	/* Flush logs if needed */
+	srv_sync_log_buffer_in_background();
+
+	srv_main_thread_op_info = "making checkpoint";
+
+	log_checkpoint(TRUE, FALSE);
+
+	if (buf_get_modified_ratio_pct() > srv_max_buf_pool_modified_pct) {
+
+		/* Try to keep the number of modified pages in the
+		buffer pool under the limit wished by the user */
+
+		goto flush_loop;
+	}
+
+	srv_main_thread_op_info = "reserving kernel mutex";
+
+	mutex_enter(&kernel_mutex);
+	if (srv_activity_count != old_activity_count) {
+		mutex_exit(&kernel_mutex);
+		goto loop;
+	}
+	mutex_exit(&kernel_mutex);
+	/*
+	srv_main_thread_op_info = "archiving log (if log archive is on)";
+
+	log_archive_do(FALSE, &n_bytes_archived);
+	*/
+	n_bytes_archived = 0;
+
+	/* Keep looping in the background loop if still work to do */
+
+	if (srv_fast_shutdown && srv_shutdown_state > 0) {
+		if (n_tables_to_drop + n_pages_flushed
+		    + n_bytes_archived != 0) {
+
+			/* If we are doing a fast shutdown (= the default)
+			we do not do purge or insert buffer merge. But we
+			flush the buffer pool completely to disk.
+			In a 'very fast' shutdown we do not flush the buffer
+			pool to data files: we have set n_pages_flushed to
+			0 artificially. */
+
+			goto background_loop;
+		}
+	} else if (n_tables_to_drop
+		   + n_pages_purged + n_bytes_merged + n_pages_flushed
+		   + n_bytes_archived != 0) {
+		/* In a 'slow' shutdown we run purge and the insert buffer
+		merge to completion */
+
+		goto background_loop;
+	}
+
+	/* There is no work for background operations either: suspend
+	master thread to wait for more server activity */
+
+suspend_thread:
+	srv_main_thread_op_info = "suspending";
+
+	mutex_enter(&kernel_mutex);
+
+	if (row_get_background_drop_list_len_low() > 0) {
+		mutex_exit(&kernel_mutex);
+
+		goto loop;
+	}
+
+	event = srv_suspend_thread();
+
+	mutex_exit(&kernel_mutex);
+
+	/* DO NOT CHANGE THIS STRING. innobase_start_or_create_for_mysql()
+	waits for database activity to die down when converting < 4.1.x
+	databases, and relies on this string being exactly as it is. InnoDB
+	manual also mentions this string in several places. */
+	srv_main_thread_op_info = "waiting for server activity";
+
+	os_event_wait(event);
+
+	if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+		/* This is only extra safety, the thread should exit
+		already when the event wait ends */
+
+		os_thread_exit(NULL);
+	}
+
+	/* When there is user activity, InnoDB will set the event and the
+	main thread goes back to loop. */
+
+	goto loop;
+
+	OS_THREAD_DUMMY_RETURN;	/* Not reached, avoid compiler warning */
+}
+
+/*************************************************************************
+Dedicated purge thread. It calls trx_purge() on behalf of the master
+thread, whose own purge passes are guarded by !srv_use_purge_thread.
+Each round waits on srv_purge_thread_event (timeout argument
+sleep_ms * 1000) and then calls trx_purge() until it returns 0.
+The pause is divided by 10 (floor 10) when trx_sys->rseg_history_len
+exceeds 1000 and multiplied by 10 (cap 10000) after a round that purged
+nothing. Once srv_shutdown_state is nonzero the thread exits at once if
+srv_fast_shutdown is set; otherwise it exits after a purge round that
+was started while no SRV_PURGE_WORKER thread was counted as active.
+The function leaves through os_thread_exit(). */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_thread(
+/*=============*/
+	void*	arg __attribute__((unused)))
+			/* in: a dummy parameter required by os_thread_create */
+{
+	ulint	n_pages_purged;
+	ulint	n_pages_purged_sum = 1; /* dummy: overwritten with 0 before
+					each purge round */
+	ulint	history_len;
+	ulint	sleep_ms= 10000; /* initial: 10 sec. */
+	ibool	can_be_last = FALSE; /* TRUE when, during shutdown, no purge
+				     worker was counted as active */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Purge thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+
+	mutex_enter(&kernel_mutex);
+	srv_table_reserve_slot(SRV_PURGE);
+	srv_n_threads_active[SRV_PURGE]++;
+	mutex_exit(&kernel_mutex);
+
+loop:
+	if (srv_shutdown_state > 0) {
+		if (srv_fast_shutdown) {
+			/* fast shutdown: leave at once; waiting for the
+			workers to finish is left to someone else */
+			goto exit_func;
+		}
+
+		mutex_enter(&kernel_mutex);
+		if (srv_n_threads_active[SRV_PURGE_WORKER]) {
+			can_be_last = FALSE;
+		} else {
+			can_be_last = TRUE;
+		}
+		mutex_exit(&kernel_mutex);
+
+		sleep_ms = 10;
+		os_event_reset(srv_purge_thread_event); /* presumably so that
+		the wait below is bounded by the short timeout instead of
+		returning on an earlier signal -- TODO confirm */
+	}
+	/* NOTE(review): sleep_ms is scaled by 1000 here, which suggests the
+	callee takes microseconds -- confirm against os_event_wait_time() */
+	os_event_wait_time(srv_purge_thread_event, sleep_ms * 1000);
+	/* Shorten the pause while the history list is long. NOTE(review):
+	rseg_history_len is read here without taking any mutex. */
+	history_len = trx_sys->rseg_history_len;
+	if (history_len > 1000)
+		sleep_ms /= 10;
+	if (sleep_ms < 10)
+		sleep_ms = 10;
+
+	n_pages_purged_sum = 0;
+	/* Purge until trx_purge() reports 0 pages, giving up immediately
+	if a fast shutdown has begun */
+	do {
+		if (srv_fast_shutdown && srv_shutdown_state > 0) {
+			goto exit_func;
+		}
+		n_pages_purged = trx_purge();
+		n_pages_purged_sum += n_pages_purged;
+	} while (n_pages_purged);
+
+	if (srv_shutdown_state > 0 && can_be_last) {
+		/* the last trx_purge() is executed without workers */
+		goto exit_func;
+	}
+
+	if (n_pages_purged_sum) {
+		srv_active_wake_master_thread();
+	}
+	/* Nothing was purged in this round: lengthen the pause, up to the
+	initial 10000 */
+	if (n_pages_purged_sum == 0)
+		sleep_ms *= 10;
+	if (sleep_ms > 10000)
+		sleep_ms = 10000;
+
+	goto loop;
+
+exit_func:
+	trx_purge_worker_wake(); /* It may not make sense. for safety only */
+
+	/* wake master thread to flush the pages */
+	srv_wake_master_thread();
+
+	mutex_enter(&kernel_mutex);
+	srv_n_threads_active[SRV_PURGE]--;
+	mutex_exit(&kernel_mutex);
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*************************************************************************
+Purge worker thread. It registers itself as an active SRV_PURGE_WORKER
+and then alternates between trx_purge_worker_wait() and
+trx_purge_worker() until srv_shutdown_state becomes nonzero.
+The function leaves through os_thread_exit(). */
+UNIV_INTERN
+os_thread_ret_t
+srv_purge_worker_thread(
+/*====================*/
+	void*	arg)	/* in: pointer to the ulint index of this worker */
+{
+	const ulint	worker_id = *((ulint*) arg); /* index for array */
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Purge worker thread starts, id %lu\n",
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+	mutex_enter(&kernel_mutex);
+	srv_table_reserve_slot(SRV_PURGE_WORKER);
+	srv_n_threads_active[SRV_PURGE_WORKER]++;
+	mutex_exit(&kernel_mutex);
+
+	/* Purge workers only work while srv_shutdown_state == 0, for
+	safety and exactness: the state is tested both before and after
+	the wait. */
+	for (;;) {
+		if (srv_shutdown_state > 0) {
+			break;
+		}
+
+		trx_purge_worker_wait();
+
+		if (srv_shutdown_state > 0) {
+			break;
+		}
+
+		trx_purge_worker(worker_id);
+	}
+
+	/* Deregister and terminate */
+	mutex_enter(&kernel_mutex);
+	srv_n_threads_active[SRV_PURGE_WORKER]--;
+	mutex_exit(&kernel_mutex);
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
diff --git a/storage/xtradb/srv/srv0start.c b/storage/xtradb/srv/srv0start.c
new file mode 100644
index 00000000000..b36faf2d2d7
--- /dev/null
+++ b/storage/xtradb/srv/srv0start.c
@@ -0,0 +1,2268 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+Copyright (c) 2009, Percona Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+Portions of this file contain modifications contributed and copyrighted
+by Percona Inc.. Those modifications are
+gratefully acknowledged and are described briefly in the InnoDB
+documentation. The contributions by Percona Inc. are incorporated with
+their permission, and subject to the conditions contained in the file
+COPYING.Percona.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file srv/srv0start.c
+Starts the InnoDB database server
+
+Created 2/16/1996 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+#include "mem0mem.h"
+#include "data0data.h"
+#include "data0type.h"
+#include "dict0dict.h"
+#include "buf0buf.h"
+#include "os0file.h"
+#include "os0thread.h"
+#include "fil0fil.h"
+#include "fsp0fsp.h"
+#include "rem0rec.h"
+#include "mtr0mtr.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "page0page.h"
+#include "page0cur.h"
+#include "trx0trx.h"
+#include "trx0sys.h"
+#include "btr0btr.h"
+#include "btr0cur.h"
+#include "rem0rec.h"
+#include "ibuf0ibuf.h"
+#include "srv0start.h"
+#include "srv0srv.h"
+#ifndef UNIV_HOTBACKUP
+# include "os0proc.h"
+# include "sync0sync.h"
+# include "buf0flu.h"
+# include "buf0rea.h"
+# include "dict0boot.h"
+# include "dict0load.h"
+# include "que0que.h"
+# include "usr0sess.h"
+# include "lock0lock.h"
+# include "trx0roll.h"
+# include "trx0purge.h"
+# include "lock0lock.h"
+# include "pars0pars.h"
+# include "btr0sea.h"
+# include "rem0cmp.h"
+# include "dict0crea.h"
+# include "row0ins.h"
+# include "row0sel.h"
+# include "row0upd.h"
+# include "row0row.h"
+# include "row0mysql.h"
+# include "btr0pcur.h"
+# include "thr0loc.h"
+# include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+# include "zlib.h" /* for ZLIB_VERSION */
+
+/** Log sequence number immediately after startup */
+UNIV_INTERN ib_uint64_t	srv_start_lsn;
+/** Log sequence number at shutdown */
+UNIV_INTERN ib_uint64_t	srv_shutdown_lsn;
+
+#ifdef HAVE_DARWIN_THREADS
+# include <sys/utsname.h>
+/** TRUE if the F_FULLFSYNC option is available */
+UNIV_INTERN ibool	srv_have_fullfsync = FALSE;
+#endif
+
+/** TRUE if a raw partition is in use; set by open_or_create_data_files()
+for data files marked SRV_NEW_RAW or SRV_OLD_RAW */
+UNIV_INTERN ibool	srv_start_raw_disk_in_use = FALSE;
+
+/** TRUE if the server is being started, before rolling back any
+incomplete transactions */
+UNIV_INTERN ibool	srv_startup_is_before_trx_rollback_phase = FALSE;
+/** TRUE if the server is being started */
+UNIV_INTERN ibool	srv_is_being_started = FALSE;
+/** TRUE if the server was successfully started */
+UNIV_INTERN ibool	srv_was_started = FALSE;
+/** TRUE if innobase_start_or_create_for_mysql() has been called */
+static ibool	srv_start_has_been_called = FALSE;
+
+/** At a shutdown this value climbs from SRV_SHUTDOWN_NONE to
+SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */
+UNIV_INTERN enum srv_shutdown_state	srv_shutdown_state = SRV_SHUTDOWN_NONE;
+
+/** File handles used while creating or opening files at startup. Both
+open_or_create_log_file() and open_or_create_data_files() store into this
+array, indexed by the file number; the latter refuses configurations with
+1000 or more data files. */
+static os_file_t	files[1000];
+
+/** Mutex protecting the ios count */
+static mutex_t		ios_mutex;
+/** Count of I/O operations in io_handler_thread(); incremented under
+ios_mutex each time fil_aio_wait() returns */
+static ulint		ios;
+
+/** io_handler_thread parameters for thread identification */
+static ulint		n[SRV_MAX_N_IO_THREADS + 7 + 64];
+/** io_handler_thread identifiers */
+static os_thread_id_t	thread_ids[SRV_MAX_N_IO_THREADS + 7 + 64];
+
+/** We use this mutex to test the return value of pthread_mutex_trylock
+   on successful locking. HP-UX does NOT return 0, though Linux et al do. */
+static os_fast_mutex_t	srv_os_test_mutex;
+
+/** Name of srv_monitor_file */
+static char*	srv_monitor_file_name;
+#endif /* !UNIV_HOTBACKUP */
+
+/** Pending i/o limits: per i/o handler thread, and for synchronous i/o */
+#define SRV_N_PENDING_IOS_PER_THREAD	OS_AIO_N_PENDING_IOS_PER_THREAD
+#define SRV_MAX_N_PENDING_SYNC_IOS	100
+
+
+/*********************************************************************//**
+Convert a numeric string that optionally ends in G or M, to a number
+containing megabytes. A number followed by G (or g) is multiplied by
+1024, a number followed by M (or m) is taken as is, and a number without
+either suffix is treated as bytes and divided by 1024 * 1024.
+@return	next character in string */
+static
+char*
+srv_parse_megabytes(
+/*================*/
+	char*	str,	/*!< in: decimal number, optionally followed by
+			G or M; without a suffix the number is in bytes */
+	ulint*	megs)	/*!< out: the number in megabytes */
+{
+	char*	suffix;
+	ulint	value = strtoul(str, &suffix, 10);
+
+	if (*suffix == 'G' || *suffix == 'g') {
+		/* Gigabytes: scale to megabytes and consume the suffix */
+		value *= 1024;
+		suffix++;
+	} else if (*suffix == 'M' || *suffix == 'm') {
+		/* Already megabytes: just consume the suffix */
+		suffix++;
+	} else {
+		/* Plain bytes: nothing to consume */
+		value /= 1024 * 1024;
+	}
+
+	*megs = value;
+
+	return(suffix);
+}
+
+/*********************************************************************//**
+Reads the data files and their sizes from a character string given in
+the .cnf file. The string is scanned twice: the first pass checks the
+syntax and counts the files, the second pass cuts the string into
+null-terminated paths in place and fills srv_data_file_names,
+srv_data_file_sizes and srv_data_file_is_raw_partition (and, for an
+autoextending last file, srv_auto_extend_last_data_file and
+srv_last_file_size_max). The entries of srv_data_file_names point into
+str, so str must stay valid for as long as the arrays are in use. The
+arrays are released by srv_free_paths_and_sizes().
+@return	TRUE if ok, FALSE on parse error or if memory allocation fails */
+UNIV_INTERN
+ibool
+srv_parse_data_file_paths_and_sizes(
+/*================================*/
+	char*	str)	/*!< in/out: the data file path string */
+{
+	char*	input_str;
+	char*	path;
+	ulint	size;
+	ulint	i	= 0;
+
+	srv_auto_extend_last_data_file = FALSE;
+	srv_last_file_size_max = 0;
+	srv_data_file_names = NULL;
+	srv_data_file_sizes = NULL;
+	srv_data_file_is_raw_partition = NULL;
+
+	input_str = str;
+
+	/* First calculate the number of data files and check syntax:
+	path:size[M | G];path:size[M | G]... . Note that a Windows path may
+	contain a drive name and a ':'. */
+
+	while (*str != '\0') {
+		path = str;
+
+		/* Skip the path: a ':' followed by '\\', '/' or another
+		':' belongs to the path, any other ':' ends it */
+		while ((*str != ':' && *str != '\0')
+		       || (*str == ':'
+			   && (*(str + 1) == '\\' || *(str + 1) == '/'
+			       || *(str + 1) == ':'))) {
+			str++;
+		}
+
+		if (*str == '\0') {
+			/* A path without a size */
+			return(FALSE);
+		}
+
+		str++;
+
+		str = srv_parse_megabytes(str, &size);
+
+		if (0 == strncmp(str, ":autoextend",
+				 (sizeof ":autoextend") - 1)) {
+
+			str += (sizeof ":autoextend") - 1;
+
+			if (0 == strncmp(str, ":max:",
+					 (sizeof ":max:") - 1)) {
+
+				str += (sizeof ":max:") - 1;
+
+				/* NOTE(review): this overwrites size, so
+				the size == 0 test below then looks at the
+				:max: value rather than the file size --
+				confirm that this is intended */
+				str = srv_parse_megabytes(str, &size);
+			}
+
+			/* Only the last file may be autoextending */
+			if (*str != '\0') {
+
+				return(FALSE);
+			}
+		}
+
+		if (strlen(str) >= 6
+		    && *str == 'n'
+		    && *(str + 1) == 'e'
+		    && *(str + 2) == 'w') {
+			str += 3;
+		}
+
+		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+			str += 3;
+		}
+
+		if (size == 0) {
+			return(FALSE);
+		}
+
+		i++;
+
+		if (*str == ';') {
+			str++;
+		} else if (*str != '\0') {
+
+			return(FALSE);
+		}
+	}
+
+	if (i == 0) {
+		/* If innodb_data_file_path was defined it must contain
+		at least one data file definition */
+
+		return(FALSE);
+	}
+
+	srv_data_file_names = malloc(i * sizeof *srv_data_file_names);
+	srv_data_file_sizes = malloc(i * sizeof *srv_data_file_sizes);
+	srv_data_file_is_raw_partition = malloc(
+		i * sizeof *srv_data_file_is_raw_partition);
+
+	if (srv_data_file_names == NULL
+	    || srv_data_file_sizes == NULL
+	    || srv_data_file_is_raw_partition == NULL) {
+		/* Out of memory: the second pass would write through a
+		NULL pointer. Release whatever was obtained and leave the
+		globals NULL, as they were set at the start. */
+
+		free(srv_data_file_names);
+		srv_data_file_names = NULL;
+		free(srv_data_file_sizes);
+		srv_data_file_sizes = NULL;
+		free(srv_data_file_is_raw_partition);
+		srv_data_file_is_raw_partition = NULL;
+
+		return(FALSE);
+	}
+
+	srv_n_data_files = i;
+
+	/* Then store the actual values to our arrays */
+
+	str = input_str;
+	i = 0;
+
+	while (*str != '\0') {
+		path = str;
+
+		/* Note that we must step over the ':' in a Windows path;
+		a Windows path normally looks like C:\ibdata\ibdata1:1G, but
+		a Windows raw partition may have a specification like
+		\\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */
+
+		while ((*str != ':' && *str != '\0')
+		       || (*str == ':'
+			   && (*(str + 1) == '\\' || *(str + 1) == '/'
+			       || *(str + 1) == ':'))) {
+			str++;
+		}
+
+		if (*str == ':') {
+			/* Make path a null-terminated string */
+			*str = '\0';
+			str++;
+		}
+
+		str = srv_parse_megabytes(str, &size);
+
+		srv_data_file_names[i] = path;
+		srv_data_file_sizes[i] = size;
+
+		if (0 == strncmp(str, ":autoextend",
+				 (sizeof ":autoextend") - 1)) {
+
+			srv_auto_extend_last_data_file = TRUE;
+
+			str += (sizeof ":autoextend") - 1;
+
+			if (0 == strncmp(str, ":max:",
+					 (sizeof ":max:") - 1)) {
+
+				str += (sizeof ":max:") - 1;
+
+				str = srv_parse_megabytes(
+					str, &srv_last_file_size_max);
+			}
+
+			if (*str != '\0') {
+
+				return(FALSE);
+			}
+		}
+
+		(srv_data_file_is_raw_partition)[i] = 0;
+
+		if (strlen(str) >= 6
+		    && *str == 'n'
+		    && *(str + 1) == 'e'
+		    && *(str + 2) == 'w') {
+			str += 3;
+			(srv_data_file_is_raw_partition)[i] = SRV_NEW_RAW;
+		}
+
+		if (*str == 'r' && *(str + 1) == 'a' && *(str + 2) == 'w') {
+			str += 3;
+
+			/* "raw" alone means an existing raw partition;
+			after "new" the SRV_NEW_RAW mark is kept */
+			if ((srv_data_file_is_raw_partition)[i] == 0) {
+				(srv_data_file_is_raw_partition)[i] = SRV_OLD_RAW;
+			}
+		}
+
+		i++;
+
+		if (*str == ';') {
+			str++;
+		}
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Reads log group home directories from a character string given in
+the .cnf file. The string must define exactly one ';'-separated path.
+The string is cut into null-terminated paths in place and the entries of
+srv_log_group_home_dirs point into it, so str must stay valid for as
+long as the array is in use. The array is released by
+srv_free_paths_and_sizes().
+@return	TRUE if ok, FALSE on parse error or if memory allocation fails */
+UNIV_INTERN
+ibool
+srv_parse_log_group_home_dirs(
+/*==========================*/
+	char*	str)	/*!< in/out: character string */
+{
+	char*	input_str;
+	char*	path;
+	ulint	i	= 0;
+
+	srv_log_group_home_dirs = NULL;
+
+	input_str = str;
+
+	/* First calculate the number of directories and check syntax:
+	path;path;... */
+
+	while (*str != '\0') {
+
+		while (*str != ';' && *str != '\0') {
+			str++;
+		}
+
+		i++;
+
+		/* The scan above stops only at ';' or at the end of the
+		string, so no other character can be seen here */
+		if (*str == ';') {
+			str++;
+		}
+	}
+
+	if (i != 1) {
+		/* If innodb_log_group_home_dir was defined it must
+		contain exactly one path definition under current MySQL */
+
+		return(FALSE);
+	}
+
+	srv_log_group_home_dirs = malloc(i * sizeof *srv_log_group_home_dirs);
+
+	if (srv_log_group_home_dirs == NULL) {
+		/* Out of memory: the loop below would write through a
+		NULL pointer */
+
+		return(FALSE);
+	}
+
+	/* Then store the actual values to our array */
+
+	str = input_str;
+	i = 0;
+
+	while (*str != '\0') {
+		path = str;
+
+		while (*str != ';' && *str != '\0') {
+			str++;
+		}
+
+		if (*str == ';') {
+			/* Make path a null-terminated string */
+			*str = '\0';
+			str++;
+		}
+
+		srv_log_group_home_dirs[i] = path;
+
+		i++;
+	}
+
+	return(TRUE);
+}
+
+/*********************************************************************//**
+Releases the arrays allocated by srv_parse_data_file_paths_and_sizes()
+and srv_parse_log_group_home_dirs() and resets the pointers to NULL, so
+that the function can be called again without freeing anything twice. */
+UNIV_INTERN
+void
+srv_free_paths_and_sizes(void)
+/*==========================*/
+{
+	/* Log group home directories */
+	free(srv_log_group_home_dirs);
+	srv_log_group_home_dirs = NULL;
+
+	/* The three parallel data file arrays */
+	free(srv_data_file_is_raw_partition);
+	srv_data_file_is_raw_partition = NULL;
+
+	free(srv_data_file_sizes);
+	srv_data_file_sizes = NULL;
+
+	free(srv_data_file_names);
+	srv_data_file_names = NULL;
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+I/o-handler thread function. Serves one aio segment forever: each pass
+calls fil_aio_wait() for the segment and then increments the ios counter
+under ios_mutex.
+@return	OS_THREAD_DUMMY_RETURN */
+static
+os_thread_ret_t
+io_handler_thread(
+/*==============*/
+	void*	arg)	/*!< in: pointer to the number of the segment in
+			the aio array */
+{
+	ulint	segment;
+
+	segment = *((ulint*)arg);
+
+#ifdef UNIV_DEBUG_THREAD_CREATION
+	fprintf(stderr, "Io handler thread %lu starts, id %lu\n", segment,
+		os_thread_pf(os_thread_get_curr_id()));
+#endif
+	/* The loop has no exit condition; the former pass counter was
+	never read and has been removed */
+	for (;;) {
+		fil_aio_wait(segment);
+
+		mutex_enter(&ios_mutex);
+		ios++;
+		mutex_exit(&ios_mutex);
+	}
+
+	thr_local_free(os_thread_get_curr_id());
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit.
+	The thread actually never comes here because it is exited in an
+	os_event_wait(). */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+#define SRV_PATH_SEPARATOR	'\\'
+#else
+#define SRV_PATH_SEPARATOR	'/'
+#endif
+
+/*********************************************************************//**
+Normalizes a directory path for Windows: every '/' in the string is
+replaced by '\\' in place. When __WIN__ is not defined the function does
+nothing. */
+UNIV_INTERN
+void
+srv_normalize_path_for_win(
+/*=======================*/
+	char*	str __attribute__((unused)))	/*!< in/out: null-terminated
+						character string */
+{
+#ifdef __WIN__
+	char*	p = str;
+
+	while (*p) {
+		if (*p == '/') {
+			*p = '\\';
+		}
+
+		p++;
+	}
+#endif
+}
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************//**
+Calculates the low 32 bits when a file size which is given as a number
+of database pages is converted to the number of bytes.
+@return	low 32 bits of file size when expressed in bytes */
+static
+ulint
+srv_calc_low32(
+/*===========*/
+	ulint	file_size)	/*!< in: file size in database pages */
+{
+	/* Pages to bytes; bits shifted beyond the width of ulint are lost */
+	const ulint	n_bytes = file_size << UNIV_PAGE_SIZE_SHIFT;
+
+	return(n_bytes & 0xFFFFFFFFUL);
+}
+
+/*********************************************************************//**
+Calculates the high 32 bits when a file size which is given as a number
+of database pages is converted to the number of bytes.
+@return	high 32 bits of file size when expressed in bytes */
+static
+ulint
+srv_calc_high32(
+/*============*/
+	ulint	file_size)	/*!< in: file size in database pages */
+{
+	/* Multiplying by the page size and then dropping the low 32 bits
+	is a single right shift by the difference of the two */
+	const ulint	n_bits = 32 - UNIV_PAGE_SIZE_SHIFT;
+
+	return(file_size >> n_bits);
+}
+
+/*********************************************************************//**
+Creates or opens one log file and closes it again. An existing file must
+have exactly the size srv_log_file_size (given in pages); a new file is
+written out to that size. The file is then registered with
+fil_node_create(); for the first file of a group (i == 0) the function
+also calls fil_space_create() and log_group_init().
+NOTE(review): on the DB_ERROR returns that follow a successful
+os_file_create() the handle stored in files[i] is not closed here.
+@return	DB_SUCCESS or DB_ERROR */
+static
+ulint
+open_or_create_log_file(
+/*====================*/
+	ibool	create_new_db,		/*!< in: TRUE if we should create a
+					new database; not used by this
+					function */
+	ibool*	log_file_created,	/*!< out: TRUE if new log file
+					created */
+	ibool	log_file_has_been_opened,/*!< in: TRUE if a log file has been
+					opened before: then it is an error
+					to try to create another log file */
+	ulint	k,			/*!< in: log group number */
+	ulint	i)			/*!< in: log file number in group;
+					also the index used in files[] */
+{
+	ibool	ret;
+	ulint	size;
+	ulint	size_high;
+	char	name[10000];
+	ulint	dirnamelen;
+
+	UT_NOT_USED(create_new_db);
+
+	*log_file_created = FALSE;
+
+	srv_normalize_path_for_win(srv_log_group_home_dirs[k]);
+	/* Build "<home dir><separator>ib_logfile<i>" in name; the assertion
+	keeps room for the separator, the base name and the number */
+	dirnamelen = strlen(srv_log_group_home_dirs[k]);
+	ut_a(dirnamelen < (sizeof name) - 10 - sizeof "ib_logfile");
+	memcpy(name, srv_log_group_home_dirs[k], dirnamelen);
+
+	/* Add a path separator if needed. */
+	if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+		name[dirnamelen++] = SRV_PATH_SEPARATOR;
+	}
+
+	sprintf(name + dirnamelen, "%s%lu", "ib_logfile", (ulong) i);
+	/* First try to create the file; if that fails because the file
+	already exists, open it instead and check its size */
+	files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL,
+				  OS_LOG_FILE, &ret);
+	if (ret == FALSE) {
+		if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+		    /* AIX 5.1 after security patch ML7 may have errno set
+		    to 0 here, which causes our function to return 100;
+		    work around that AIX problem */
+		    && os_file_get_last_error(FALSE) != 100
+#endif
+		    ) {
+			fprintf(stderr,
+				"InnoDB: Error in creating"
+				" or opening %s\n", name);
+
+			return(DB_ERROR);
+		}
+
+		files[i] = os_file_create(name, OS_FILE_OPEN, OS_FILE_AIO,
+					  OS_LOG_FILE, &ret);
+		if (!ret) {
+			fprintf(stderr,
+				"InnoDB: Error in opening %s\n", name);
+
+			return(DB_ERROR);
+		}
+
+		ret = os_file_get_size(files[i], &size, &size_high);
+		ut_a(ret);
+		/* The size is compared as two 32-bit halves */
+		if (size != srv_calc_low32(srv_log_file_size)
+		    || size_high != srv_calc_high32(srv_log_file_size)) {
+
+			fprintf(stderr,
+				"InnoDB: Error: log file %s is"
+				" of different size %lu %lu bytes\n"
+				"InnoDB: than specified in the .cnf"
+				" file %lu %lu bytes!\n",
+				name, (ulong) size_high, (ulong) size,
+				(ulong) srv_calc_high32(srv_log_file_size),
+				(ulong) srv_calc_low32(srv_log_file_size));
+
+			return(DB_ERROR);
+		}
+	} else {
+		*log_file_created = TRUE;
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+			"  InnoDB: Log file %s did not exist:"
+			" new to be created\n",
+			name);
+		if (log_file_has_been_opened) {
+			/* An earlier log file existed but this one did
+			not: treated as an error */
+			return(DB_ERROR);
+		}
+
+		fprintf(stderr, "InnoDB: Setting log file %s size to %lu MB\n",
+			name, (ulong) srv_log_file_size
+			>> (20 - UNIV_PAGE_SIZE_SHIFT));
+
+		fprintf(stderr,
+			"InnoDB: Database physically writes the file"
+			" full: wait...\n");
+
+		ret = os_file_set_size(name, files[i],
+				       srv_calc_low32(srv_log_file_size),
+				       srv_calc_high32(srv_log_file_size));
+		if (!ret) {
+			fprintf(stderr,
+				"InnoDB: Error in creating %s:"
+				" probably out of disk space\n",
+				name);
+
+			return(DB_ERROR);
+		}
+	}
+
+	ret = os_file_close(files[i]);
+	ut_a(ret);
+
+	if (i == 0) {
+		/* Create in memory the file space object
+		which is for this log group */
+
+		fil_space_create(name,
+				 2 * k + SRV_LOG_SPACE_FIRST_ID, 0, FIL_LOG);
+	}
+
+	ut_a(fil_validate());
+	/* Every log file of the group becomes a node of the group's space */
+	fil_node_create(name, srv_log_file_size,
+			2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE);
+#ifdef UNIV_LOG_ARCHIVE
+	/* If this is the first log group, create the file space object
+	for archived logs.
+	Under MySQL, no archiving ever done.
+	NOTE(review): arch_space_id is not declared in this function;
+	presumably it is defined at file scope elsewhere -- confirm */
+
+	if (k == 0 && i == 0) {
+		arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID;
+
+		fil_space_create("arch_log_space", arch_space_id, 0, FIL_LOG);
+	} else {
+		arch_space_id = ULINT_UNDEFINED;
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+	if (i == 0) {
+		log_group_init(k, srv_n_log_files,
+			       srv_log_file_size * UNIV_PAGE_SIZE,
+			       2 * k + SRV_LOG_SPACE_FIRST_ID,
+			       SRV_LOG_SPACE_FIRST_ID + 1); /* dummy arch
+							    space id */
+	}
+
+	return(DB_SUCCESS);
+}
+
+/*********************************************************************//**
+Creates or opens the system tablespace data files listed in
+srv_data_file_names[] and, if srv_doublewrite_file is set, the dedicated
+doublewrite buffer file. Every file is registered with the fil system
+(fil_space_create() / fil_node_create()) and its handle is closed again
+before the function returns.
+Files that had to be created are extended to their configured size with
+os_file_set_size(). Files that already existed have their size compared
+with the configured one (old raw partitions skip this check) and their
+flushed lsn read with fil_read_flushed_lsn_and_arch_log_no().
+Finally ios is reset and ios_mutex is created.
+@return	DB_SUCCESS, or DB_ERROR on failure */
+static
+ulint
+open_or_create_data_files(
+/*======================*/
+	ibool*		create_new_db,	/*!< out: TRUE if new database should be
+					created */
+	ibool*		create_new_doublewrite_file,
+					/*!< out: TRUE if the doublewrite
+					buffer file named by
+					srv_doublewrite_file did not exist
+					and was created here */
+#ifdef UNIV_LOG_ARCHIVE
+	ulint*		min_arch_log_no,/*!< out: min of archived log
+					numbers in data files */
+	ulint*		max_arch_log_no,/*!< out: max of archived log
+					numbers in data files */
+#endif /* UNIV_LOG_ARCHIVE */
+	ib_uint64_t*	min_flushed_lsn,/*!< out: min of flushed lsn
+					values in data files */
+	ib_uint64_t*	max_flushed_lsn,/*!< out: max of flushed lsn
+					values in data files */
+	ulint*		sum_of_new_sizes)/*!< out: sum of sizes of the
+					new files added */
+{
+	ibool	ret;
+	ulint	i;
+	ibool	one_opened	= FALSE;
+	ibool	one_created	= FALSE;
+	ulint	size;
+	ulint	size_high;
+	ulint	rounded_size_pages;
+	char	name[10000];
+
+	if (srv_n_data_files >= 1000) {
+		fprintf(stderr, "InnoDB: can only have < 1000 data files\n"
+			"InnoDB: you have defined %lu\n",
+			(ulong) srv_n_data_files);
+		return(DB_ERROR);
+	}
+
+	*sum_of_new_sizes = 0;
+
+	*create_new_db = FALSE;
+	*create_new_doublewrite_file = FALSE;
+
+	srv_normalize_path_for_win(srv_data_home);
+
+	for (i = 0; i < srv_n_data_files; i++) {
+		ulint	dirnamelen;
+
+		srv_normalize_path_for_win(srv_data_file_names[i]);
+		dirnamelen = strlen(srv_data_home);
+
+		/* Build "<srv_data_home><separator><file name>" in name[];
+		the "- 1" leaves room for a separator that may be added */
+		ut_a(dirnamelen + strlen(srv_data_file_names[i])
+		     < (sizeof name) - 1);
+		memcpy(name, srv_data_home, dirnamelen);
+		/* Add a path separator if needed. */
+		if (dirnamelen && name[dirnamelen - 1] != SRV_PATH_SEPARATOR) {
+			name[dirnamelen++] = SRV_PATH_SEPARATOR;
+		}
+
+		strcpy(name + dirnamelen, srv_data_file_names[i]);
+
+		if (srv_data_file_is_raw_partition[i] == 0) {
+
+			/* First we try to create the file: if it already
+			exists, ret will get value FALSE */
+
+			files[i] = os_file_create(name, OS_FILE_CREATE,
+						  OS_FILE_NORMAL,
+						  OS_DATA_FILE, &ret);
+
+			if (ret == FALSE && os_file_get_last_error(FALSE)
+			    != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+			    /* AIX 5.1 after security patch ML7 may have
+			    errno set to 0 here, which causes our function
+			    to return 100; work around that AIX problem */
+			    && os_file_get_last_error(FALSE) != 100
+#endif
+			    ) {
+				fprintf(stderr,
+					"InnoDB: Error in creating"
+					" or opening %s\n",
+					name);
+
+				return(DB_ERROR);
+			}
+		} else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) {
+			/* The partition is opened, not created; then it is
+			written over */
+
+			srv_start_raw_disk_in_use = TRUE;
+			srv_created_new_raw = TRUE;
+
+			files[i] = os_file_create(name, OS_FILE_OPEN_RAW,
+						  OS_FILE_NORMAL,
+						  OS_DATA_FILE, &ret);
+			if (!ret) {
+				fprintf(stderr,
+					"InnoDB: Error in opening %s\n", name);
+
+				return(DB_ERROR);
+			}
+		} else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+			srv_start_raw_disk_in_use = TRUE;
+
+			/* Force the "open an existing file" branch below */
+			ret = FALSE;
+		} else {
+			ut_a(0);
+		}
+
+		if (ret == FALSE) {
+			/* We open the data file */
+
+			if (one_created) {
+				fprintf(stderr,
+					"InnoDB: Error: data files can only"
+					" be added at the end\n");
+				fprintf(stderr,
+					"InnoDB: of a tablespace, but"
+					" data file %s existed beforehand.\n",
+					name);
+				return(DB_ERROR);
+			}
+
+			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+				files[i] = os_file_create(
+					name, OS_FILE_OPEN_RAW,
+					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+			} else if (i == 0) {
+				files[i] = os_file_create(
+					name, OS_FILE_OPEN_RETRY,
+					OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+			} else {
+				files[i] = os_file_create(
+					name, OS_FILE_OPEN, OS_FILE_NORMAL,
+					OS_DATA_FILE, &ret);
+			}
+
+			if (!ret) {
+				fprintf(stderr,
+					"InnoDB: Error in opening %s\n", name);
+				os_file_get_last_error(TRUE);
+
+				return(DB_ERROR);
+			}
+
+			if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) {
+
+				goto skip_size_check;
+			}
+
+			ret = os_file_get_size(files[i], &size, &size_high);
+			ut_a(ret);
+			/* Round size downward to megabytes; each unit of
+			size_high is counted as 4096 MB, and the shift
+			converts megabytes to pages */
+
+			rounded_size_pages
+				= (size / (1024 * 1024) + 4096 * size_high)
+					<< (20 - UNIV_PAGE_SIZE_SHIFT);
+
+			if (i == srv_n_data_files - 1
+			    && srv_auto_extend_last_data_file) {
+
+				if (srv_data_file_sizes[i] > rounded_size_pages
+				    || (srv_last_file_size_max > 0
+					&& srv_last_file_size_max
+					< rounded_size_pages)) {
+
+					fprintf(stderr,
+						"InnoDB: Error: auto-extending"
+						" data file %s is"
+						" of a different size\n"
+						"InnoDB: %lu pages (rounded"
+						" down to MB) than specified"
+						" in the .cnf file:\n"
+						"InnoDB: initial %lu pages,"
+						" max %lu (relevant if"
+						" non-zero) pages!\n",
+						name,
+						(ulong) rounded_size_pages,
+						(ulong) srv_data_file_sizes[i],
+						(ulong)
+						srv_last_file_size_max);
+
+					return(DB_ERROR);
+				}
+
+				/* Adopt the actual size of the auto-extending
+				file so that the check below passes */
+				srv_data_file_sizes[i] = rounded_size_pages;
+			}
+
+			if (rounded_size_pages != srv_data_file_sizes[i]) {
+
+				fprintf(stderr,
+					"InnoDB: Error: data file %s"
+					" is of a different size\n"
+					"InnoDB: %lu pages"
+					" (rounded down to MB)\n"
+					"InnoDB: than specified"
+					" in the .cnf file %lu pages!\n",
+					name,
+					(ulong) rounded_size_pages,
+					(ulong) srv_data_file_sizes[i]);
+
+				return(DB_ERROR);
+			}
+skip_size_check:
+			fil_read_flushed_lsn_and_arch_log_no(
+				files[i], one_opened,
+#ifdef UNIV_LOG_ARCHIVE
+				min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+				min_flushed_lsn, max_flushed_lsn);
+			one_opened = TRUE;
+		} else {
+			/* We created the data file and now write it full of
+			zeros */
+
+			one_created = TRUE;
+
+			if (i > 0) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Data file %s did not"
+					" exist: new to be created\n",
+					name);
+			} else {
+				fprintf(stderr,
+					"InnoDB: The first specified"
+					" data file %s did not exist:\n"
+					"InnoDB: a new database"
+					" to be created!\n", name);
+				*create_new_db = TRUE;
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Setting file %s size to %lu MB\n",
+				name,
+				(ulong) (srv_data_file_sizes[i]
+					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+			fprintf(stderr,
+				"InnoDB: Database physically writes the"
+				" file full: wait...\n");
+
+			ret = os_file_set_size(
+				name, files[i],
+				srv_calc_low32(srv_data_file_sizes[i]),
+				srv_calc_high32(srv_data_file_sizes[i]));
+
+			if (!ret) {
+				fprintf(stderr,
+					"InnoDB: Error in creating %s:"
+					" probably out of disk space\n", name);
+
+				return(DB_ERROR);
+			}
+
+			*sum_of_new_sizes = *sum_of_new_sizes
+				+ srv_data_file_sizes[i];
+		}
+
+		ret = os_file_close(files[i]);
+		ut_a(ret);
+
+		/* The space object is created once, named after the first
+		data file; every data file then becomes a node of space 0 */
+		if (i == 0) {
+			fil_space_create(name, 0, 0, FIL_TABLESPACE);
+		}
+
+		ut_a(fil_validate());
+
+		fil_node_create(name, srv_data_file_sizes[i], 0,
+				srv_data_file_is_raw_partition[i] != 0);
+	}
+
+	/* special file for doublewrite buffer; here i == srv_n_data_files,
+	so files[i] is the slot following the data files */
+	if (srv_doublewrite_file)
+	{
+		srv_normalize_path_for_win(srv_doublewrite_file);
+
+		fprintf(stderr,
+			"InnoDB: Notice: innodb_doublewrite_file is specified.\n"
+			"InnoDB: This is for expert only. Don't use if you don't understand what is it 'WELL'.\n"
+			"InnoDB: ### Don't specify older file than the last checkpoint ###\n"
+			"InnoDB: otherwise the older doublewrite buffer will break your data during recovery!\n");
+
+		/* name[] is a fixed-size buffer: refuse an over-long path
+		instead of overflowing it, as is done for the data files */
+		ut_a(strlen(srv_doublewrite_file) < sizeof name);
+
+		strcpy(name, srv_doublewrite_file);
+
+		/* First we try to create the file: if it already
+		exists, ret will get value FALSE */
+
+		files[i] = os_file_create(name, OS_FILE_CREATE,
+					  OS_FILE_NORMAL,
+					  OS_DATA_FILE, &ret);
+
+		if (ret == FALSE && os_file_get_last_error(FALSE)
+		    != OS_FILE_ALREADY_EXISTS
+#ifdef UNIV_AIX
+		    /* AIX 5.1 after security patch ML7 may have
+		    errno set to 0 here, which causes our function
+		    to return 100; work around that AIX problem */
+		    && os_file_get_last_error(FALSE) != 100
+#endif
+		    ) {
+			fprintf(stderr,
+				"InnoDB: Error in creating"
+				" or opening %s\n",
+				name);
+
+			return(DB_ERROR);
+		}
+
+		if (ret == FALSE) {
+			/* We open the data file */
+
+			files[i] = os_file_create(
+				name, OS_FILE_OPEN, OS_FILE_NORMAL,
+				OS_DATA_FILE, &ret);
+
+			if (!ret) {
+				fprintf(stderr,
+					"InnoDB: Error in opening %s\n", name);
+				os_file_get_last_error(TRUE);
+
+				return(DB_ERROR);
+			}
+
+			ret = os_file_get_size(files[i], &size, &size_high);
+			ut_a(ret);
+			/* Round size downward to megabytes */
+
+			rounded_size_pages
+				= (size / (1024 * 1024) + 4096 * size_high)
+					<< (20 - UNIV_PAGE_SIZE_SHIFT);
+
+			/* Unlike for the data files, a size mismatch is only
+			reported as a warning here and startup continues */
+			if (rounded_size_pages != TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: doublewrite buffer file %s"
+					" is of a different size\n"
+					"InnoDB: %lu pages"
+					" (rounded down to MB)\n"
+					"InnoDB: than intended size"
+					" %lu pages...\n",
+					name,
+					(ulong) rounded_size_pages,
+					(ulong) TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9);
+			}
+
+			fil_read_flushed_lsn_and_arch_log_no(
+				files[i], one_opened,
+#ifdef UNIV_LOG_ARCHIVE
+				min_arch_log_no, max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+				min_flushed_lsn, max_flushed_lsn);
+			one_opened = TRUE;
+		} else {
+			/* We created the data file and now write it full of
+			zeros */
+
+			*create_new_doublewrite_file = TRUE;
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Doublewrite buffer file %s did not"
+				" exist: new to be created\n",
+				name);
+
+			if (*create_new_db == FALSE) {
+				fprintf(stderr,
+					"InnoDB: Warning: Previous version's ibdata files may cause crash.\n"
+					"        If you use that, please use the ibdata files of this version.\n");
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Setting file %s size to %lu MB\n",
+				name,
+				(ulong) ((TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9)
+					 >> (20 - UNIV_PAGE_SIZE_SHIFT)));
+
+			fprintf(stderr,
+				"InnoDB: Database physically writes the"
+				" file full: wait...\n");
+
+			ret = os_file_set_size(
+				name, files[i],
+				srv_calc_low32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9),
+				srv_calc_high32(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9));
+
+			if (!ret) {
+				fprintf(stderr,
+					"InnoDB: Error in creating %s:"
+					" probably out of disk space\n", name);
+
+				return(DB_ERROR);
+			}
+		}
+
+		ret = os_file_close(files[i]);
+		ut_a(ret);
+
+		/* The doublewrite file gets its own space,
+		TRX_DOUBLEWRITE_SPACE, with a single node */
+		fil_space_create(name, TRX_DOUBLEWRITE_SPACE, 0, FIL_TABLESPACE);
+
+		ut_a(fil_validate());
+
+		fil_node_create(name, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, TRX_DOUBLEWRITE_SPACE, FALSE);
+
+		i++;
+	}
+
+	/* Initialize ios and its mutex; both are defined outside this
+	function */
+	ios = 0;
+
+	mutex_create(&ios_mutex, SYNC_NO_ORDER_CHECK);
+
+	return(DB_SUCCESS);
+}
+
+/********************************************************************
+Starts InnoDB and creates a new database if database files
+are not found and the user wants.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+int
+innobase_start_or_create_for_mysql(void)
+/*====================================*/
+{
+	buf_pool_t*	ret;
+	ibool		create_new_db;
+	ibool		create_new_doublewrite_file;
+	ibool		log_file_created;
+	ibool		log_created	= FALSE;
+	ibool		log_opened	= FALSE;
+	ib_uint64_t	min_flushed_lsn;
+	ib_uint64_t	max_flushed_lsn;
+#ifdef UNIV_LOG_ARCHIVE
+	ulint		min_arch_log_no;
+	ulint		max_arch_log_no;
+#endif /* UNIV_LOG_ARCHIVE */
+	ulint		sum_of_new_sizes;
+	ulint		sum_of_data_file_sizes;
+	ulint		tablespace_size_in_header;
+	ulint		err;
+	ulint		i;
+	ulint		io_limit;
+	my_bool		srv_file_per_table_original_value
+		= srv_file_per_table;
+	mtr_t		mtr;
+#ifdef HAVE_DARWIN_THREADS
+# ifdef F_FULLFSYNC
+	/* This executable has been compiled on Mac OS X 10.3 or later.
+	Assume that F_FULLFSYNC is available at run-time. */
+	srv_have_fullfsync = TRUE;
+# else /* F_FULLFSYNC */
+	/* This executable has been compiled on Mac OS X 10.2
+	or earlier.  Determine if the executable is running
+	on Mac OS X 10.3 or later. */
+	struct utsname utsname;
+	if (uname(&utsname)) {
+		fputs("InnoDB: cannot determine Mac OS X version!\n", stderr);
+	} else {
+		srv_have_fullfsync = strcmp(utsname.release, "7.") >= 0;
+	}
+	if (!srv_have_fullfsync) {
+		fputs("InnoDB: On Mac OS X, fsync() may be"
+		      " broken on internal drives,\n"
+		      "InnoDB: making transactions unsafe!\n", stderr);
+	}
+# endif /* F_FULLFSYNC */
+#endif /* HAVE_DARWIN_THREADS */
+
+	if (sizeof(ulint) != sizeof(void*)) {
+		fprintf(stderr,
+			"InnoDB: Error: size of InnoDB's ulint is %lu,"
+			" but size of void* is %lu.\n"
+			"InnoDB: The sizes should be the same"
+			" so that on a 64-bit platform you can\n"
+			"InnoDB: allocate more than 4 GB of memory.",
+			(ulong)sizeof(ulint), (ulong)sizeof(void*));
+	}
+
+	/* System tables are created in tablespace 0.  Thus, we must
+	temporarily clear srv_file_per_table.  This is ok, because the
+	server will not accept connections (which could modify
+	innodb_file_per_table) until this function has returned. */
+	srv_file_per_table = FALSE;
+#ifdef UNIV_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_IBUF_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_IBUF_DEBUG switched on !!!!!!!!!\n"
+# ifdef UNIV_IBUF_COUNT_DEBUG
+		"InnoDB: !!!!!!!! UNIV_IBUF_COUNT_DEBUG switched on !!!!!!!!!\n"
+		"InnoDB: Crash recovery will fail with UNIV_IBUF_COUNT_DEBUG\n"
+# endif
+		);
+#endif
+
+#ifdef UNIV_SYNC_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_SYNC_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_SEARCH_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_SEARCH_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+#ifdef UNIV_LOG_LSN_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_LOG_LSN_DEBUG switched on !!!!!!!!!\n");
+#endif /* UNIV_LOG_LSN_DEBUG */
+#ifdef UNIV_MEM_DEBUG
+	fprintf(stderr,
+		"InnoDB: !!!!!!!! UNIV_MEM_DEBUG switched on !!!!!!!!!\n");
+#endif
+
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		fprintf(stderr,
+			"InnoDB: The InnoDB memory heap is disabled\n");
+	}
+
+	fputs("InnoDB: " IB_ATOMICS_STARTUP_MSG
+	      "\nInnoDB: Compressed tables use zlib " ZLIB_VERSION
+#ifdef UNIV_ZIP_DEBUG
+	      " with validation"
+#endif /* UNIV_ZIP_DEBUG */
+#ifdef UNIV_ZIP_COPY
+	      " and extra copying"
+#endif /* UNIV_ZIP_COPY */
+	      "\n" , stderr);
+
+	/* Since InnoDB does not currently clean up all its internal data
+	structures in MySQL Embedded Server Library server_end(), we
+	print an error message if someone tries to start up InnoDB a
+	second time during the process lifetime. */
+
+	if (srv_start_has_been_called) {
+		fprintf(stderr,
+			"InnoDB: Error: startup called second time"
+			" during the process lifetime.\n"
+			"InnoDB: In the MySQL Embedded Server Library"
+			" you cannot call server_init()\n"
+			"InnoDB: more than once during"
+			" the process lifetime.\n");
+	}
+
+	srv_start_has_been_called = TRUE;
+
+#ifdef UNIV_DEBUG
+	log_do_write = TRUE;
+#endif /* UNIV_DEBUG */
+	/*	yydebug = TRUE; */
+
+	srv_is_being_started = TRUE;
+	srv_startup_is_before_trx_rollback_phase = TRUE;
+	os_aio_use_native_aio = FALSE;
+
+#ifdef __WIN__
+	switch (os_get_os_version()) {
+	case OS_WIN95:
+	case OS_WIN31:
+	case OS_WINNT:
+		/* On Win 95, 98, ME, Win32 subsystem for Windows 3.1,
+		and NT use simulated aio. In NT Windows provides async i/o,
+		but when run in conjunction with InnoDB Hot Backup, it seemed
+		to corrupt the data files. */
+
+		os_aio_use_native_aio = FALSE;
+		break;
+	default:
+		/* On Win 2000 and XP use async i/o */
+		//os_aio_use_native_aio = TRUE;
+		os_aio_use_native_aio = FALSE;
+		fprintf(stderr,
+			"InnoDB: Windows native async i/o is disabled as default.\n"
+			"InnoDB:   It is not applicable for the current"
+			" multi io threads implementation.\n");
+		break;
+	}
+#endif
+	if (srv_file_flush_method_str == NULL) {
+		/* These are the default options */
+
+		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+#ifndef __WIN__
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "fsync")) {
+		srv_unix_file_flush_method = SRV_UNIX_FSYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DSYNC")) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DSYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "O_DIRECT")) {
+		srv_unix_file_flush_method = SRV_UNIX_O_DIRECT;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "ALL_O_DIRECT")) {
+		srv_unix_file_flush_method = SRV_UNIX_ALL_O_DIRECT;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "littlesync")) {
+		srv_unix_file_flush_method = SRV_UNIX_LITTLESYNC;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "nosync")) {
+		srv_unix_file_flush_method = SRV_UNIX_NOSYNC;
+#else
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "normal")) {
+		srv_win_file_flush_method = SRV_WIN_IO_NORMAL;
+		os_aio_use_native_aio = FALSE;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str, "unbuffered")) {
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+		os_aio_use_native_aio = FALSE;
+
+	} else if (0 == ut_strcmp(srv_file_flush_method_str,
+				  "async_unbuffered")) {
+		srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED;
+		os_aio_use_native_aio = TRUE;
+		srv_n_read_io_threads = srv_n_write_io_threads = 1;
+		fprintf(stderr,
+			"InnoDB: 'async_unbuffered' was detected as innodb_flush_method.\n"
+			"InnoDB:   Windows native async i/o is enabled.\n"
+			"InnoDB:   And io threads are restricted.\n");
+#endif
+	} else {
+		fprintf(stderr,
+			"InnoDB: Unrecognized value %s for"
+			" innodb_flush_method\n",
+			srv_file_flush_method_str);
+		return(DB_ERROR);
+	}
+
+	/* Note that the call srv_boot() also changes the values of
+	some variables to the units used by InnoDB internally */
+
+	/* Set the maximum number of threads which can wait for a semaphore
+	inside InnoDB: this is the 'sync wait array' size, as well as the
+	maximum number of threads that can wait in the 'srv_conc array' for
+	their time to enter InnoDB. */
+
+#if defined(__NETWARE__)
+
+	/* Create less event semaphores because Win 98/ME had
+	difficulty creating 40000 event semaphores.  Comment from
+	Novell, Inc.: also, these just take a lot of memory on
+	NetWare. */
+	srv_max_n_threads = 1000;
+#else
+	if (srv_buf_pool_size >= 1000 * 1024 * 1024) {
+		/* Buffer pool is at least 1000 MB: allow the most threads.
+		If it is less than 1000 MB, assume fewer threads (below). */
+		srv_max_n_threads = 50000;
+
+	} else if (srv_buf_pool_size >= 8 * 1024 * 1024) {
+
+		srv_max_n_threads = 10000;
+	} else {
+		srv_max_n_threads = 1000;	/* saves several MB of memory,
+						especially in 64-bit
+						computers */
+	}
+#endif
+	err = srv_boot();
+
+	if (err != DB_SUCCESS) {
+
+		return((int) err);
+	}
+
+	mutex_create(&srv_monitor_file_mutex, SYNC_NO_ORDER_CHECK);
+
+	if (srv_innodb_status) {
+		srv_monitor_file_name = mem_alloc(
+			strlen(fil_path_to_mysql_datadir)
+			+ 20 + sizeof "/innodb_status.");
+		sprintf(srv_monitor_file_name, "%s/innodb_status.%lu",
+			fil_path_to_mysql_datadir, os_proc_get_number());
+		srv_monitor_file = fopen(srv_monitor_file_name, "w+");
+		if (!srv_monitor_file) {
+			fprintf(stderr, "InnoDB: unable to create %s: %s\n",
+				srv_monitor_file_name, strerror(errno));
+			return(DB_ERROR);
+		}
+	} else {
+		srv_monitor_file_name = NULL;
+		srv_monitor_file = os_file_create_tmpfile();
+		if (!srv_monitor_file) {
+			return(DB_ERROR);
+		}
+	}
+
+	mutex_create(&srv_dict_tmpfile_mutex, SYNC_DICT_OPERATION);
+
+	srv_dict_tmpfile = os_file_create_tmpfile();
+	if (!srv_dict_tmpfile) {
+		return(DB_ERROR);
+	}
+
+	mutex_create(&srv_misc_tmpfile_mutex, SYNC_ANY_LATCH);
+
+	srv_misc_tmpfile = os_file_create_tmpfile();
+	if (!srv_misc_tmpfile) {
+		return(DB_ERROR);
+	}
+
+	/* If user has set the value of innodb_file_io_threads then
+	we'll emit a message telling the user that this parameter
+	is now deprecated. */
+	if (srv_n_file_io_threads != 4) {
+		fprintf(stderr, "InnoDB: Warning:"
+			" innodb_file_io_threads is deprecated."
+			" Please use innodb_read_io_threads and"
+			" innodb_write_io_threads instead\n");
+	}
+
+	/* Now overwrite the value on srv_n_file_io_threads */
+	srv_n_file_io_threads = 2 + srv_n_read_io_threads
+				+ srv_n_write_io_threads;
+
+	ut_a(srv_n_file_io_threads <= SRV_MAX_N_IO_THREADS);
+
+	/* TODO: Investigate if SRV_N_PENDING_IOS_PER_THREAD (32) limit
+	still applies to windows. */
+	if (!os_aio_use_native_aio) {
+		io_limit = 8 * SRV_N_PENDING_IOS_PER_THREAD;
+	} else {
+		io_limit = SRV_N_PENDING_IOS_PER_THREAD;
+	}
+
+	os_aio_init(io_limit,
+		    srv_n_read_io_threads,
+		    srv_n_write_io_threads,
+		    SRV_MAX_N_PENDING_SYNC_IOS);
+
+	fil_init(srv_file_per_table ? 50000 : 5000,
+		 srv_max_n_open_files);
+
+	ret = buf_pool_init();
+
+	if (ret == NULL) {
+		fprintf(stderr,
+			"InnoDB: Fatal error: cannot allocate the memory"
+			" for the buffer pool\n");
+
+		return(DB_ERROR);
+	}
+
+#ifdef UNIV_DEBUG
+	/* We have observed deadlocks with a 5MB buffer pool but
+	the actual lower limit could very well be a little higher. */
+
+	if (srv_buf_pool_size <= 5 * 1024 * 1024) {
+
+		fprintf(stderr, "InnoDB: Warning: Small buffer pool size "
+			"(%luM), the flst_validate() debug function "
+			"can cause a deadlock if the buffer pool fills up.\n",
+			srv_buf_pool_size / 1024 / 1024);
+	}
+#endif
+
+	fsp_init();
+	log_init();
+
+	lock_sys_create(srv_lock_table_size);
+
+	/* Create i/o-handler threads: */
+
+	for (i = 0; i < srv_n_file_io_threads; i++) {
+		n[i] = i;
+
+		os_thread_create(io_handler_thread, n + i, thread_ids + i);
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
+		fprintf(stderr,
+			"InnoDB: Error: you must set the log group"
+			" home dir in my.cnf the\n"
+			"InnoDB: same as log arch dir.\n");
+
+		return(DB_ERROR);
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	if (sizeof(ulint) == 4
+	    && srv_n_log_files * srv_log_file_size
+	       >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+		fprintf(stderr,
+			"InnoDB: Error: combined size of log files"
+			" must be < 4 GB on 32-bit systems\n");
+
+		return(DB_ERROR);
+	}
+
+	sum_of_new_sizes = 0;
+
+	for (i = 0; i < srv_n_data_files; i++) {
+#ifndef __WIN__
+		if (sizeof(off_t) < 5 && srv_data_file_sizes[i] >= ((ulint)1 << (32 - UNIV_PAGE_SIZE_SHIFT))) {
+			fprintf(stderr,
+				"InnoDB: Error: file size must be < 4 GB"
+				" with this MySQL binary\n"
+				"InnoDB: and operating system combination,"
+				" in some OS's < 2 GB\n");
+
+			return(DB_ERROR);
+		}
+#endif
+		sum_of_new_sizes += srv_data_file_sizes[i];
+	}
+
+	if (sum_of_new_sizes < 10485760 / UNIV_PAGE_SIZE) {
+		fprintf(stderr,
+			"InnoDB: Error: tablespace size must be"
+			" at least 10 MB\n");
+
+		return(DB_ERROR);
+	}
+
+	err = open_or_create_data_files(&create_new_db,
+					&create_new_doublewrite_file,
+#ifdef UNIV_LOG_ARCHIVE
+					&min_arch_log_no, &max_arch_log_no,
+#endif /* UNIV_LOG_ARCHIVE */
+					&min_flushed_lsn, &max_flushed_lsn,
+					&sum_of_new_sizes);
+	if (err != DB_SUCCESS) {
+		fprintf(stderr,
+			"InnoDB: Could not open or create data files.\n"
+			"InnoDB: If you tried to add new data files,"
+			" and it failed here,\n"
+			"InnoDB: you should now edit innodb_data_file_path"
+			" in my.cnf back\n"
+			"InnoDB: to what it was, and remove the"
+			" new ibdata files InnoDB created\n"
+			"InnoDB: in this failed attempt. InnoDB only wrote"
+			" those files full of\n"
+			"InnoDB: zeros, but did not yet use them in any way."
+			" But be careful: do not\n"
+			"InnoDB: remove old data files"
+			" which contain your precious data!\n");
+
+		return((int) err);
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	srv_normalize_path_for_win(srv_arch_dir);
+	srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir);
+#endif /* UNIV_LOG_ARCHIVE */
+
+	for (i = 0; i < srv_n_log_files; i++) {
+		err = open_or_create_log_file(create_new_db, &log_file_created,
+					      log_opened, 0, i);
+		if (err != DB_SUCCESS) {
+
+			return((int) err);
+		}
+
+		if (log_file_created) {
+			log_created = TRUE;
+		} else {
+			log_opened = TRUE;
+		}
+		if ((log_opened && create_new_db)
+		    || (log_opened && log_created)) {
+			fprintf(stderr,
+				"InnoDB: Error: all log files must be"
+				" created at the same time.\n"
+				"InnoDB: All log files must be"
+				" created also in database creation.\n"
+				"InnoDB: If you want bigger or smaller"
+				" log files, shut down the\n"
+				"InnoDB: database and make sure there"
+				" were no errors in shutdown.\n"
+				"InnoDB: Then delete the existing log files."
+				" Edit the .cnf file\n"
+				"InnoDB: and start the database again.\n");
+
+			return(DB_ERROR);
+		}
+	}
+
+	/* Open all log files and data files in the system tablespace: we
+	keep them open until database shutdown */
+
+	fil_open_log_and_system_tablespace_files();
+
+	if (log_created && !create_new_db
+#ifdef UNIV_LOG_ARCHIVE
+	    && !srv_archive_recovery
+#endif /* UNIV_LOG_ARCHIVE */
+	    ) {
+		if (max_flushed_lsn != min_flushed_lsn
+#ifdef UNIV_LOG_ARCHIVE
+		    || max_arch_log_no != min_arch_log_no
+#endif /* UNIV_LOG_ARCHIVE */
+		    ) {
+			fprintf(stderr,
+				"InnoDB: Cannot initialize created"
+				" log files because\n"
+				"InnoDB: data files were not in sync"
+				" with each other\n"
+				"InnoDB: or the data files are corrupt.\n");
+
+			return(DB_ERROR);
+		}
+
+		if (max_flushed_lsn < (ib_uint64_t) 1000) {
+			fprintf(stderr,
+				"InnoDB: Cannot initialize created"
+				" log files because\n"
+				"InnoDB: data files are corrupt,"
+				" or new data files were\n"
+				"InnoDB: created when the database"
+				" was started previous\n"
+				"InnoDB: time but the database"
+				" was not shut down\n"
+				"InnoDB: normally after that.\n");
+
+			return(DB_ERROR);
+		}
+
+		mutex_enter(&(log_sys->mutex));
+
+#ifdef UNIV_LOG_ARCHIVE
+		/* Do not + 1 arch_log_no because we do not use log
+		archiving */
+		recv_reset_logs(max_flushed_lsn, max_arch_log_no, TRUE);
+#else
+		recv_reset_logs(max_flushed_lsn, TRUE);
+#endif /* UNIV_LOG_ARCHIVE */
+
+		mutex_exit(&(log_sys->mutex));
+	}
+
+	trx_sys_file_format_init();
+
+	if (create_new_doublewrite_file) {
+		mtr_start(&mtr);
+		fsp_header_init(TRX_DOUBLEWRITE_SPACE, TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 9, &mtr);
+		mtr_commit(&mtr);
+
+		trx_sys_dummy_create(TRX_DOUBLEWRITE_SPACE);
+	}
+
+	if (create_new_db) {
+		mtr_start(&mtr);
+		fsp_header_init(0, sum_of_new_sizes, &mtr);
+
+		mtr_commit(&mtr);
+
+		trx_sys_create();
+		dict_create();
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+
+		if (trx_doublewrite == NULL) {
+			/* Create the doublewrite buffer here to avoid assertion error
+			   about page_no of doublewrite_buf */
+			trx_sys_create_doublewrite_buf();
+		}
+
+		if (srv_extra_rsegments)
+			trx_sys_create_extra_rseg(srv_extra_rsegments);
+#ifdef UNIV_LOG_ARCHIVE
+	} else if (srv_archive_recovery) {
+		fprintf(stderr,
+			"InnoDB: Starting archive"
+			" recovery from a backup...\n");
+		err = recv_recovery_from_archive_start(
+			min_flushed_lsn, srv_archive_recovery_limit_lsn,
+			min_arch_log_no);
+		if (err != DB_SUCCESS) {
+
+			return(DB_ERROR);
+		}
+		/* Since ibuf init is in dict_boot, and ibuf is needed
+		in any disk i/o, first call dict_boot */
+
+		dict_boot();
+		trx_sys_init_at_db_start();
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+
+		/* Initialize the fsp free limit global variable in the log
+		system */
+		fsp_header_get_free_limit();
+
+		recv_recovery_from_archive_finish();
+#endif /* UNIV_LOG_ARCHIVE */
+	} else {
+
+		/* Check if we support the max format that is stamped
+		on the system tablespace. 
+		Note:  We are NOT allowed to make any modifications to
+		the TRX_SYS_PAGE_NO page before recovery  because this
+		page also contains the max_trx_id etc. important system
+		variables that are required for recovery.  We need to
+		ensure that we return the system to a state where normal
+		recovery is guaranteed to work. We do this by
+		invalidating the buffer cache, this will force the
+		reread of the page and restoration to its last known
+		consistent state, this is REQUIRED for the recovery
+		process to work. */
+		err = trx_sys_file_format_max_check(
+			srv_check_file_format_at_startup);
+
+		if (err != DB_SUCCESS) {
+			return(err);
+		}
+
+		/* Invalidate the buffer pool to ensure that we reread
+		the page that we read above, during recovery.
+		Note that this is not as heavy weight as it seems. At
+		this point there will be only ONE page in the buf_LRU
+		and there must be no page in the buf_flush list. */
+		/* buffer_pool_shm should not be reused when recovery was needed. */
+		if (!srv_buffer_pool_shm_is_reused)
+		buf_pool_invalidate();
+
+		/* We always try to do a recovery, even if the database had
+		been shut down normally: this is the normal startup path */
+
+		err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT,
+							  IB_ULONGLONG_MAX,
+							  min_flushed_lsn,
+							  max_flushed_lsn);
+		if (err != DB_SUCCESS) {
+
+			return(DB_ERROR);
+		}
+
+		/* Since the insert buffer init is in dict_boot, and the
+		insert buffer is needed in any disk i/o, first we call
+		dict_boot(). Note that trx_sys_init_at_db_start() only needs
+		to access space 0, and the insert buffer at this stage already
+		works for space 0. */
+
+		dict_boot();
+		trx_sys_init_at_db_start();
+
+		/* Initialize the fsp free limit global variable in the log
+		system */
+		fsp_header_get_free_limit();
+
+		/* recv_recovery_from_checkpoint_finish needs trx lists which
+		are initialized in trx_sys_init_at_db_start(). */
+
+		recv_recovery_from_checkpoint_finish();
+		if (srv_force_recovery < SRV_FORCE_NO_IBUF_MERGE) {
+			/* The following call is necessary for the insert
+			buffer to work with multiple tablespaces. We must
+			know the mapping between space id's and .ibd file
+			names.
+
+			In a crash recovery, we check that the info in data
+			dictionary is consistent with what we already know
+			about space id's from the call of
+			fil_load_single_table_tablespaces().
+
+			In a normal startup, we create the space objects for
+			every table in the InnoDB data dictionary that has
+			an .ibd file.
+
+			We also determine the maximum tablespace id used. */
+
+			dict_check_tablespaces_and_store_max_id(
+				recv_needed_recovery);
+		}
+
+		srv_startup_is_before_trx_rollback_phase = FALSE;
+		recv_recovery_rollback_active();
+
+		/* It is possible that file_format tag has never
+		been set. In this case we initialize it to minimum
+		value.  Important to note that we can do it ONLY after
+		we have finished the recovery process so that the
+		image of TRX_SYS_PAGE_NO is not stale. */
+		trx_sys_file_format_tag_init();
+	}
+
+	if (!create_new_db && sum_of_new_sizes > 0) {
+		/* New data file(s) were added */
+		mtr_start(&mtr);
+
+		fsp_header_inc_size(0, sum_of_new_sizes, &mtr);
+
+		mtr_commit(&mtr);
+
+		/* Immediately write the log record about increased tablespace
+		size to disk, so that it is durable even if mysqld would crash
+		quickly */
+
+		log_buffer_flush_to_disk();
+	}
+
+#ifdef UNIV_LOG_ARCHIVE
+	/* Archiving is always off under MySQL */
+	if (!srv_log_archive_on) {
+		ut_a(DB_SUCCESS == log_archive_noarchivelog());
+	} else {
+		mutex_enter(&(log_sys->mutex));
+
+		start_archive = FALSE;
+
+		if (log_sys->archiving_state == LOG_ARCH_OFF) {
+			start_archive = TRUE;
+		}
+
+		mutex_exit(&(log_sys->mutex));
+
+		if (start_archive) {
+			ut_a(DB_SUCCESS == log_archive_archivelog());
+		}
+	}
+#endif /* UNIV_LOG_ARCHIVE */
+
+	/* fprintf(stderr, "Max allowed record size %lu\n",
+	page_get_free_space_of_empty() / 2); */
+
+	/* Create the thread which watches the timeouts for lock waits */
+	os_thread_create(&srv_lock_timeout_thread, NULL,
+			 thread_ids + 2 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which warns of long semaphore waits */
+	os_thread_create(&srv_error_monitor_thread, NULL,
+			 thread_ids + 3 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which prints InnoDB monitor info */
+	os_thread_create(&srv_monitor_thread, NULL,
+			 thread_ids + 4 + SRV_MAX_N_IO_THREADS);
+
+	/* Create the thread which automatically dumps/restores buffer pool */
+	os_thread_create(&srv_LRU_dump_restore_thread, NULL,
+			 thread_ids + 5 + SRV_MAX_N_IO_THREADS);
+
+	srv_is_being_started = FALSE;
+
+	if (trx_doublewrite == NULL) {
+		/* Create the doublewrite buffer to a new tablespace */
+
+		trx_sys_create_doublewrite_buf();
+	}
+
+	err = dict_create_or_check_foreign_constraint_tables();
+
+	if (err != DB_SUCCESS) {
+		return((int)DB_ERROR);
+	}
+
+	/* Create the master thread which does purge and other utility
+	operations */
+
+	os_thread_create(&srv_master_thread, NULL, thread_ids
+			 + (1 + SRV_MAX_N_IO_THREADS));
+
+	if (srv_use_purge_thread) {
+		ulint i;
+
+		os_thread_create(&srv_purge_thread, NULL, thread_ids
+				 + (6 + SRV_MAX_N_IO_THREADS));
+
+		for (i = 0; i < srv_use_purge_thread - 1; i++) {
+			n[7 + i + SRV_MAX_N_IO_THREADS] = i; /* using as index for arrays in purge_sys */
+			os_thread_create(&srv_purge_worker_thread,
+					 n + (7 + i + SRV_MAX_N_IO_THREADS),
+					 thread_ids + (7 + i + SRV_MAX_N_IO_THREADS));
+		}
+	}
+#ifdef UNIV_DEBUG
+	/* buf_debug_prints = TRUE; */
+#endif /* UNIV_DEBUG */
+	sum_of_data_file_sizes = 0;
+
+	for (i = 0; i < srv_n_data_files; i++) {
+		sum_of_data_file_sizes += srv_data_file_sizes[i];
+	}
+
+	tablespace_size_in_header = fsp_header_get_tablespace_size();
+
+	if (!srv_auto_extend_last_data_file
+	    && sum_of_data_file_sizes != tablespace_size_in_header) {
+
+		fprintf(stderr,
+			"InnoDB: Error: tablespace size"
+			" stored in header is %lu pages, but\n"
+			"InnoDB: the sum of data file sizes is %lu pages\n",
+			(ulong) tablespace_size_in_header,
+			(ulong) sum_of_data_file_sizes);
+
+		if (srv_force_recovery == 0
+		    && sum_of_data_file_sizes < tablespace_size_in_header) {
+			/* This is a fatal error, the tail of a tablespace is
+			missing */
+
+			fprintf(stderr,
+				"InnoDB: Cannot start InnoDB."
+				" The tail of the system tablespace is\n"
+				"InnoDB: missing. Have you edited"
+				" innodb_data_file_path in my.cnf in an\n"
+				"InnoDB: inappropriate way, removing"
+				" ibdata files from there?\n"
+				"InnoDB: You can set innodb_force_recovery=1"
+				" in my.cnf to force\n"
+				"InnoDB: a startup if you are trying"
+				" to recover a badly corrupt database.\n");
+
+			return(DB_ERROR);
+		}
+	}
+
+	if (srv_auto_extend_last_data_file
+	    && sum_of_data_file_sizes < tablespace_size_in_header) {
+
+		fprintf(stderr,
+			"InnoDB: Error: tablespace size stored in header"
+			" is %lu pages, but\n"
+			"InnoDB: the sum of data file sizes"
+			" is only %lu pages\n",
+			(ulong) tablespace_size_in_header,
+			(ulong) sum_of_data_file_sizes);
+
+		if (srv_force_recovery == 0) {
+
+			fprintf(stderr,
+				"InnoDB: Cannot start InnoDB. The tail of"
+				" the system tablespace is\n"
+				"InnoDB: missing. Have you edited"
+				" innodb_data_file_path in my.cnf in an\n"
+				"InnoDB: inappropriate way, removing"
+				" ibdata files from there?\n"
+				"InnoDB: You can set innodb_force_recovery=1"
+				" in my.cnf to force\n"
+				"InnoDB: a startup if you are trying to"
+				" recover a badly corrupt database.\n");
+
+			return(DB_ERROR);
+		}
+	}
+
+	/* Check that os_fast_mutexes work as expected */
+	os_fast_mutex_init(&srv_os_test_mutex);
+
+	if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) {
+		fprintf(stderr,
+			"InnoDB: Error: pthread_mutex_trylock returns"
+			" an unexpected value on\n"
+			"InnoDB: success! Cannot continue.\n");
+		exit(1);
+	}
+
+	os_fast_mutex_unlock(&srv_os_test_mutex);
+
+	os_fast_mutex_lock(&srv_os_test_mutex);
+
+	os_fast_mutex_unlock(&srv_os_test_mutex);
+
+	os_fast_mutex_free(&srv_os_test_mutex);
+
+	if (!srv_file_per_table_original_value
+	    && srv_pass_corrupt_table) {
+		fprintf(stderr, "InnoDB: Warning:"
+			" innodb_file_per_table is diabled."
+			" So innodb_pass_corrupt_table doesn't make sence\n");
+	}
+
+	if (srv_print_verbose_log) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" Percona XtraDB (http://www.percona.com) %s started; "
+			"log sequence number %llu\n",
+			INNODB_VERSION_STR, srv_start_lsn);
+	}
+
+	if (srv_force_recovery > 0) {
+		fprintf(stderr,
+			"InnoDB: !!! innodb_force_recovery"
+			" is set to %lu !!!\n",
+			(ulong) srv_force_recovery);
+	}
+
+	fflush(stderr);
+
+	if (trx_doublewrite_must_reset_space_ids) {
+		/* Actually, we did not change the undo log format between
+		4.0 and 4.1.1, and we would not need to run purge to
+		completion. Note also that the purge algorithm in 4.1.1
+		can process the history list again even after a full
+		purge, because our algorithm does not cut the end of the
+		history list in all cases so that it would become empty
+		after a full purge. That mean that we may purge 4.0 type
+		undo log even after this phase.
+
+		The insert buffer record format changed between 4.0 and
+		4.1.1. It is essential that the insert buffer is emptied
+		here! */
+
+		fprintf(stderr,
+			"InnoDB: You are upgrading to an"
+			" InnoDB version which allows multiple\n"
+			"InnoDB: tablespaces. Wait that purge"
+			" and insert buffer merge run to\n"
+			"InnoDB: completion...\n");
+		for (;;) {
+			os_thread_sleep(1000000);
+
+			if (0 == strcmp(srv_main_thread_op_info,
+					"waiting for server activity")) {
+
+				ut_a(ibuf_is_empty());
+
+				break;
+			}
+		}
+		fprintf(stderr,
+			"InnoDB: Full purge and insert buffer merge"
+			" completed.\n");
+
+		trx_sys_mark_upgraded_to_multiple_tablespaces();
+
+		fprintf(stderr,
+			"InnoDB: You have now successfully upgraded"
+			" to the multiple tablespaces\n"
+			"InnoDB: format. You should NOT DOWNGRADE"
+			" to an earlier version of\n"
+			"InnoDB: InnoDB! But if you absolutely need to"
+			" downgrade, see\n"
+			"InnoDB: " REFMAN "multiple-tablespaces.html\n"
+			"InnoDB: for instructions.\n");
+	}
+
+	if (srv_force_recovery == 0) {
+		/* In the insert buffer we may have even bigger tablespace
+		id's, because we may have dropped those tablespaces, but
+		insert buffer merge has not had time to clean the records from
+		the ibuf tree. */
+
+		ibuf_update_max_tablespace_id();
+	}
+
+	srv_file_per_table = srv_file_per_table_original_value;
+
+	srv_was_started = TRUE;
+
+	return((int) DB_SUCCESS);
+}
+
+/****************************************************************//**
+Shuts down InnoDB; returns early unless srv_was_started is set.
+@return	DB_SUCCESS (every return path visible here yields DB_SUCCESS) */
+UNIV_INTERN
+int
+innobase_shutdown_for_mysql(void)
+/*=============================*/
+{
+	ulint	i;
+#ifdef __NETWARE__
+	extern ibool panic_shutdown;
+#endif
+	if (!srv_was_started) {
+		if (srv_is_being_started) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Warning: shutting down"
+				" a not properly started\n"
+				"InnoDB: or created database!\n");
+		}
+
+		return(DB_SUCCESS);
+	}
+
+	/* 1. Flush the buffer pool to disk, write the current lsn to
+	the tablespace header(s), and copy all log data to archive.
+	Step 1 is the real InnoDB shutdown. The remaining steps 2 - ...
+	just free data structures after the shutdown. */
+
+
+	if (srv_fast_shutdown == 2) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: MySQL has requested a very fast shutdown"
+			" without flushing "
+			"the InnoDB buffer pool to data files."
+			" At the next mysqld startup "
+			"InnoDB will do a crash recovery!\n");
+	}
+
+#ifdef __NETWARE__
+	if (!panic_shutdown)
+#endif
+		logs_empty_and_mark_files_at_shutdown();
+
+	if (srv_conc_n_threads != 0) {
+		fprintf(stderr,
+			"InnoDB: Warning: query counter shows %ld queries"
+			" still\n"
+			"InnoDB: inside InnoDB at shutdown\n",
+			srv_conc_n_threads);
+	}
+
+	/* 2. Make all threads created by InnoDB exit */
+
+	srv_shutdown_state = SRV_SHUTDOWN_EXIT_THREADS;
+
+	/* In a 'very fast' shutdown, we do not need to wait for these threads
+	to die; all that counts is that we flushed the log; a 'very fast'
+	shutdown is essentially a crash. */
+
+	if (srv_fast_shutdown == 2) {
+		return(DB_SUCCESS);
+	}
+
+	/* All threads end up waiting for certain events. Put those events
+	to the signaled state. Then the threads will exit themselves in
+	os_thread_event_wait(). */
+
+	for (i = 0; i < 1000; i++) {	/* at most 1000 wake-up rounds */
+		/* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM
+		HERE OR EARLIER */
+
+		/* a. Let the lock timeout thread exit */
+		os_event_set(srv_lock_timeout_thread_event);
+
+		/* b. srv error monitor thread exits automatically, no need
+		to do anything here */
+
+		/* c. We wake the master thread so that it exits */
+		srv_wake_master_thread();
+
+		/* d. Exit the i/o threads */
+
+		os_aio_wake_all_threads_at_shutdown();
+
+		os_mutex_enter(os_sync_mutex);
+
+		if (os_thread_count == 0) {
+			/* All the threads have exited or are just exiting;
+			NOTE that the threads may not have completed their
+			exit yet. Should we use pthread_join() to make sure
+			they have exited? If we did, we would have to
+			remove the pthread_detach() from
+			os_thread_exit().  Now we just sleep 0.1
+			seconds and hope that is enough! */
+
+			os_mutex_exit(os_sync_mutex);
+
+			os_thread_sleep(100000);
+
+			break;
+		}
+
+		os_mutex_exit(os_sync_mutex);
+
+		os_thread_sleep(100000);
+	}
+
+	if (i == 1000) {	/* loop ended without os_thread_count == 0 */
+		fprintf(stderr,
+			"InnoDB: Warning: %lu threads created by InnoDB"
+			" had not exited at shutdown!\n",
+			(ulong) os_thread_count);
+	}
+
+	if (srv_monitor_file) {
+		fclose(srv_monitor_file);
+		srv_monitor_file = 0;
+		if (srv_monitor_file_name) {
+			unlink(srv_monitor_file_name);
+			mem_free(srv_monitor_file_name);
+		}
+	}
+	if (srv_dict_tmpfile) {
+		fclose(srv_dict_tmpfile);
+		srv_dict_tmpfile = 0;
+	}
+
+	if (srv_misc_tmpfile) {
+		fclose(srv_misc_tmpfile);
+		srv_misc_tmpfile = 0;
+	}
+
+	/* This must be disabled before closing the buffer pool
+	and closing the data dictionary.  */
+	btr_search_disable();
+
+	ibuf_close();
+	log_shutdown();
+	lock_sys_close();
+	thr_local_close();
+	trx_sys_file_format_close();
+	trx_sys_close();
+
+	mutex_free(&srv_monitor_file_mutex);
+	mutex_free(&srv_dict_tmpfile_mutex);
+	mutex_free(&srv_misc_tmpfile_mutex);
+	dict_close();
+	btr_search_sys_free();
+
+	/* 3. Free all InnoDB's own mutexes and the os_fast_mutexes inside
+	them */
+	os_aio_free();
+	sync_close();
+	srv_free();
+	fil_close();
+
+	/* 4. Free the os_conc_mutex and all os_events and os_mutexes */
+
+	os_sync_free();
+
+	/* 5. Free all allocated memory */
+
+	pars_lexer_close();
+	log_mem_free();
+	buf_pool_free();
+	mem_close();
+
+	/* ut_free_all_mem() frees all allocated memory not freed yet
+	in shutdown, and it will also free the ut_list_mutex, so it
+	must be the last of all these freeing operations */
+	ut_free_all_mem();
+
+	if (os_thread_count != 0
+	    || os_event_count != 0
+	    || os_mutex_count != 0
+	    || os_fast_mutex_count != 0) {
+		fprintf(stderr,
+			"InnoDB: Warning: some resources were not"
+			" cleaned up in shutdown:\n"
+			"InnoDB: threads %lu, events %lu,"
+			" os_mutexes %lu, os_fast_mutexes %lu\n",
+			(ulong) os_thread_count, (ulong) os_event_count,
+			(ulong) os_mutex_count, (ulong) os_fast_mutex_count);
+	}
+
+	if (dict_foreign_err_file) {
+		fclose(dict_foreign_err_file);
+	}
+	if (lock_latest_err_file) {
+		fclose(lock_latest_err_file);
+	}
+
+	if (srv_print_verbose_log) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Shutdown completed;"
+			" log sequence number %llu\n",
+			srv_shutdown_lsn);
+	}
+
+	srv_was_started = FALSE;
+	srv_start_has_been_called = FALSE;
+
+	return((int) DB_SUCCESS);
+}
+
+#ifdef __NETWARE__
+void set_panic_flag_for_netware(void)	/* NetWare only: takes no arguments */
+{
+	extern ibool panic_shutdown;
+	panic_shutdown = TRUE;	/* shutdown tests this before its log step */
+}
+#endif /* __NETWARE__ */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c
new file mode 100644
index 00000000000..223e1715944
--- /dev/null
+++ b/storage/xtradb/sync/sync0arr.c
@@ -0,0 +1,1023 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0arr.c
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0arr.h"
+#ifdef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "os0sync.h"
+#include "os0file.h"
+#include "srv0srv.h"
+
+/*
+			WAIT ARRAY
+			==========
+
+The wait array consists of cells, each of which has
+an operating system event object created for it. The threads
+waiting for a mutex, for example, can reserve a cell
+in the array and suspend themselves to wait for the event
+to become signaled. When using the wait array, remember to make
+sure that some thread holding the synchronization object
+will eventually know that there is a waiter in the array and
+signal the object, to prevent infinite wait.
+Why did we choose to implement a wait array? First, to make
+mutexes fast, we had to code our own implementation of them,
+which only in usually uncommon cases resorts to using
+slow operating system primitives. Then we had the choice of
+assigning a unique OS event for each mutex, which would
+be simpler, or using a global wait array. In some operating systems,
+the global wait array solution is more efficient and flexible,
+because we can do with a very small number of OS events,
+say 200. In NT 3.51, allocating events seems to be a quadratic
+algorithm, because 10 000 events are created fast, but
+100 000 events takes a couple of minutes to create.
+
+As of 5.0.30 the above mentioned design is changed. Since now
+OS can handle millions of wait events efficiently, we no longer
+have this concept of each cell of wait array having one event.
+Instead, now the event that a thread wants to wait on is embedded
+in the wait object (mutex or rw_lock). We still keep the global
+wait array for the sake of diagnostics and also to avoid infinite
+waits. The error_monitor thread scans the global wait array to signal
+any waiting threads that have missed the signal. */
+
+/** A cell where an individual thread may wait suspended
+until a resource is released. The thread suspends on the OS event
+that sync_cell_get_event() picks from the waited-for object. */
+struct sync_cell_struct {
+	void*		wait_object;	/*!< pointer to the object the
+					thread is waiting for; if NULL
+					the cell is free for use */
+	mutex_t*	old_wait_mutex;	/*!< the latest wait mutex in cell */
+	rw_lock_t*	old_wait_rw_lock;
+					/*!< the latest wait rw-lock in cell;
+					like the above, kept on cell free */
+	ulint		request_type;	/*!< lock type requested on the
+					object */
+	const char*	file;		/*!< source file where the cell
+					was requested */
+	ulint		line;		/*!< source line where the cell
+					was requested */
+	os_thread_id_t	thread;		/*!< thread id of this waiting
+					thread */
+	ibool		waiting;	/*!< TRUE if the thread has already
+					called sync_array_wait_event
+					on this cell */
+	ib_int64_t	signal_count;	/*!< We capture the signal_count
+					of the wait_object when we
+					reset the event. This value is
+					then passed on to os_event_wait
+					and we wait only if the event
+					has not been signalled in the
+					period between the reset and
+					wait call. */
+	time_t		reservation_time;/*!< time when the thread reserved
+					the wait cell */
+};
+
+/* NOTE: It is allowed for a thread to wait
+for an event allocated for the array without owning the
+protecting mutex (depending on the case: OS or database mutex), but
+all changes (set or reset) to the state of the event must be made
+while owning the mutex. */
+
+/** Synchronization array */
+struct sync_array_struct {
+	ulint		n_reserved;	/*!< number of currently reserved
+					cells in the wait array */
+	ulint		n_cells;	/*!< number of cells in the
+					wait array */
+	sync_cell_t*	array;		/*!< pointer to the n_cells cells */
+	ulint		protection;	/*!< SYNC_ARRAY_OS_MUTEX or
+					SYNC_ARRAY_MUTEX: which mutex guards */
+	mutex_t		mutex;		/*!< database mutex; the guard when
+					protection is SYNC_ARRAY_MUTEX */
+	os_mutex_t	os_mutex;	/*!< Possible operating system mutex
+					protecting the data structure.
+					As this data structure is used in
+					constructing the database mutex,
+					to prevent infinite recursion
+					in implementation, we fall back to
+					an OS mutex. */
+	ulint		sg_count;	/*!< count of how many times an
+					object has been signalled */
+	ulint		res_count;	/*!< count of cell reservations
+					since creation of the array */
+};
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Forward declaration, compiled only under UNIV_SYNC_DEBUG. Detects a
+deadlock of one or more threads caused by waits for semaphores.
+@return	TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
+				own the mutex to array */
+	sync_cell_t*	start,	/*!< in: cell where recursive search started */
+	sync_cell_t*	cell,	/*!< in: cell to search */
+	ulint		depth);	/*!< in: recursion depth */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*****************************************************************//**
+Looks up a cell of the wait array by index; asserts the index is valid.
+@return	pointer to the n'th cell */
+static
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+	sync_array_t*	arr,	/*!< in: sync array */
+	ulint		n)	/*!< in: index */
+{
+	ut_a(arr);
+	ut_a(n < arr->n_cells);
+
+	return(&arr->array[n]);
+}
+
+/******************************************************************//**
+Acquires the mutex protecting a sync array: the OS mutex or the
+database mutex, as selected by arr->protection. */
+static
+void
+sync_array_enter(
+/*=============*/
+	sync_array_t*	arr)	/*!< in: sync wait array */
+{
+	if (arr->protection == SYNC_ARRAY_OS_MUTEX) {
+		os_mutex_enter(arr->os_mutex);
+		return;
+	}
+
+	if (arr->protection == SYNC_ARRAY_MUTEX) {
+		mutex_enter(&(arr->mutex));
+		return;
+	}
+	ut_error;	/* unknown protection value */
+}
+
+/******************************************************************//**
+Releases the mutex protecting a sync array: the OS mutex or the
+database mutex, as selected by arr->protection. */
+static
+void
+sync_array_exit(
+/*============*/
+	sync_array_t*	arr)	/*!< in: sync wait array */
+{
+	if (arr->protection == SYNC_ARRAY_OS_MUTEX) {
+		os_mutex_exit(arr->os_mutex);
+		return;
+	}
+
+	if (arr->protection == SYNC_ARRAY_MUTEX) {
+		mutex_exit(&(arr->mutex));
+		return;
+	}
+	ut_error;	/* unknown protection value */
+}
+
+/*******************************************************************//**
+Allocates a zero-filled synchronization wait array and creates the
+mutex of the requested kind to protect it; the functions operating on
+the array reserve that mutex automatically.
+@return	own: created wait array */
+UNIV_INTERN
+sync_array_t*
+sync_array_create(
+/*==============*/
+	ulint	n_cells,	/*!< in: number of cells in the array
+				to create */
+	ulint	protection)	/*!< in: either SYNC_ARRAY_OS_MUTEX or
+				SYNC_ARRAY_MUTEX: determines the type
+				of mutex protecting the data structure */
+{
+	sync_array_t*	arr;
+	ulint		cells_bytes;
+
+	ut_a(n_cells > 0);
+
+	/* Get zeroed storage for the header and for the cells */
+	arr = ut_malloc(sizeof(*arr));
+	memset(arr, 0x0, sizeof(*arr));
+
+	cells_bytes = n_cells * sizeof(sync_cell_t);
+	arr->array = ut_malloc(cells_bytes);
+	memset(arr->array, 0x0, cells_bytes);
+
+	arr->protection = protection;
+	arr->n_cells = n_cells;
+
+	/* Create the one mutex that the protection type calls for */
+	if (protection == SYNC_ARRAY_OS_MUTEX) {
+		arr->os_mutex = os_mutex_create(NULL);
+	} else if (protection == SYNC_ARRAY_MUTEX) {
+		mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK);
+	} else {
+		ut_error;
+	}
+
+	return(arr);
+}
+
+/******************************************************************//**
+Destroys a wait array: asserts that no cell is reserved, validates it,
+frees the protecting mutex, then the cells and the array itself. */
+UNIV_INTERN
+void
+sync_array_free(
+/*============*/
+	sync_array_t*	arr)	/*!< in, own: sync wait array */
+{
+	ulint	guard_type;
+
+	ut_a(arr->n_reserved == 0);
+
+	sync_array_validate(arr);
+
+	/* Dispose of whichever mutex type was guarding the array */
+	guard_type = arr->protection;
+
+	if (guard_type == SYNC_ARRAY_OS_MUTEX) {
+		os_mutex_free(arr->os_mutex);
+	} else if (guard_type == SYNC_ARRAY_MUTEX) {
+		mutex_free(&(arr->mutex));
+	} else {
+		ut_error;
+	}
+
+	ut_free(arr->array);
+	ut_free(arr);
+}
+
+/********************************************************************//**
+Checks the wait array for consistency: under the array mutex, asserts
+that the cells having a wait object number exactly arr->n_reserved. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+	sync_array_t*	arr)	/*!< in: sync wait array */
+{
+	ulint	count = 0;
+	ulint	pos;
+
+	sync_array_enter(arr);
+
+	/* Tally the cells that currently have a wait object attached */
+	for (pos = 0; pos < arr->n_cells; pos++) {
+		if (sync_array_get_nth_cell(arr, pos)->wait_object != NULL) {
+			count++;
+		}
+	}
+
+	/* The tally must agree with the bookkeeping counter */
+	ut_a(count == arr->n_reserved);
+
+	sync_array_exit(arr);
+}
+
+/*******************************************************************//**
+Picks, by request type, the event the thread owning the cell waits for. */
+static
+os_event_t
+sync_cell_get_event(
+/*================*/
+	sync_cell_t*	cell) /*!< in: non-empty sync array cell */
+{
+	const ulint	req = cell->request_type;
+
+	if (req == SYNC_MUTEX) {
+		return(((mutex_t *) cell->wait_object)->event);
+	}
+	if (req == RW_LOCK_WAIT_EX) {
+		return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
+	}
+	return(((rw_lock_t *) cell->wait_object)->event); /* S and X share */
+}
+
+/******************************************************************//**
+Reserves the first free wait array cell for waiting for an object and
+resets the object's event; hits ut_error when no cell is free. */
+UNIV_INTERN
+void
+sync_array_reserve_cell(
+/*====================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	void*		object, /*!< in: pointer to the object to wait for */
+	ulint		type,	/*!< in: lock request type */
+	const char*	file,	/*!< in: file where requested */
+	ulint		line,	/*!< in: line where requested */
+	ulint*		index)	/*!< out: index of the reserved cell */
+{
+	sync_cell_t*	cell;
+	os_event_t      event;
+	ulint		i;
+
+	ut_a(object);
+	ut_a(index);
+
+	sync_array_enter(arr);
+
+	arr->res_count++;
+
+	/* Scan for the first free cell (wait_object == NULL). */
+	for (i = 0; i < arr->n_cells; i++) {
+		cell = sync_array_get_nth_cell(arr, i);
+
+		if (cell->wait_object == NULL) {
+
+			cell->waiting = FALSE;
+			cell->wait_object = object;
+
+			if (type == SYNC_MUTEX) {
+				cell->old_wait_mutex = object;
+			} else {
+				cell->old_wait_rw_lock = object;
+			}
+
+			cell->request_type = type;
+
+			cell->file = file;
+			cell->line = line;
+
+			arr->n_reserved++;
+
+			*index = i;
+
+			sync_array_exit(arr);
+
+			/* The array mutex is already released here.
+			Reset the event and store the signal_count
+			value at which it was reset. */
+                        event = sync_cell_get_event(cell);
+			cell->signal_count = os_event_reset(event);
+
+			cell->reservation_time = time(NULL);
+
+			cell->thread = os_thread_get_curr_id();
+
+			return;
+		}
+	}
+
+	ut_error; /* No free cell found */
+
+	return;
+}
+
+/******************************************************************//**
+Suspends the calling thread on the event of a cell it has reserved and
+frees the cell when the wait is over. In the debug version the function
+first checks whether the wait would complete a deadlock, in which
+case it prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index)	/*!< in: index of the reserved cell */
+{
+	os_event_t	wait_ev;
+	sync_cell_t*	cell;
+
+	ut_a(arr);
+
+	sync_array_enter(arr);
+
+	cell = sync_array_get_nth_cell(arr, index);
+
+	ut_a(cell->wait_object);
+	ut_a(!cell->waiting);
+	ut_ad(os_thread_get_curr_id() == cell->thread);
+
+	cell->waiting = TRUE;
+	wait_ev = sync_cell_get_event(cell);
+
+#ifdef UNIV_SYNC_DEBUG
+
+	/* rw_lock_debug_mutex freezes the debug lists. It is taken
+	with the simple enter because a regular mutex_enter that
+	cannot get the mutex at once would call the sync_array
+	routines recursively, leading to trouble. */
+
+	rw_lock_debug_mutex_enter();
+
+	if (sync_array_detect_deadlock(arr, cell, cell, 0) == TRUE) {
+
+		fputs("########################################\n", stderr);
+		ut_error;
+	}
+
+	rw_lock_debug_mutex_exit();
+#endif /* UNIV_SYNC_DEBUG */
+	sync_array_exit(arr);
+
+	os_event_wait_low(wait_ev, cell->signal_count);
+
+	sync_array_free_cell(arr, index);
+}
+
+/******************************************************************//**
+Prints a wait array cell: the waiter, its wait time and the semaphore. */
+static
+void
+sync_array_cell_print(
+/*==================*/
+	FILE*		file,	/*!< in: file where to print */
+	sync_cell_t*	cell)	/*!< in: sync cell */
+{
+	mutex_t*	mutex;
+	rw_lock_t*	rwlock;
+	ulint		type;
+	ulint		writer;
+
+	type = cell->request_type;
+
+	fprintf(file,
+		"--Thread %lu has waited at %s line %lu"
+		" for %#.5g seconds the semaphore:\n",
+		(ulong) os_thread_pf(cell->thread), cell->file,
+		(ulong) cell->line,
+		difftime(time(NULL), cell->reservation_time));
+
+	if (type == SYNC_MUTEX) {
+		/* We use old_wait_mutex in case the cell has already
+		been freed meanwhile */
+		mutex = cell->old_wait_mutex;
+
+		fprintf(file,
+			"Mutex at %p '%s', lock var %lu\n"
+#ifdef UNIV_SYNC_DEBUG
+			"Last time reserved in file %s line %lu, "
+#endif /* UNIV_SYNC_DEBUG */
+			"waiters flag %lu\n",
+			(void*) mutex, mutex->cmutex_name,
+			(ulong) mutex->lock_word,
+#ifdef UNIV_SYNC_DEBUG
+			mutex->file_name, (ulong) mutex->line,
+#endif /* UNIV_SYNC_DEBUG */
+			(ulong) mutex->waiters);
+
+	} else if (type == RW_LOCK_EX
+		   || type == RW_LOCK_WAIT_EX
+		   || type == RW_LOCK_SHARED) {
+
+		fputs(type == RW_LOCK_EX ? "X-lock on"
+		      : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
+		      : "S-lock on", file);
+
+		/* As with mutexes, use the remembered rw-lock pointer */
+		rwlock = cell->old_wait_rw_lock;
+
+		fprintf(file,
+			" RW-latch at %p '%s'\n",
+			(void*) rwlock, rwlock->lock_name);
+		writer = rw_lock_get_writer(rwlock);
+		if (writer != RW_LOCK_NOT_LOCKED) {
+			fprintf(file,
+				"a writer (thread id %lu) has"
+				" reserved it in mode %s",
+				(ulong) os_thread_pf(rwlock->writer_thread),
+				writer == RW_LOCK_EX
+				? " exclusive\n"
+				: " wait exclusive\n");
+		}
+
+		fprintf(file,
+			"number of readers %lu, waiters flag %lu, "
+			"lock_word: %lx\n"
+			"Last time read locked in file %s line %lu\n"
+			"Last time write locked in file %s line %lu\n",
+			(ulong) rw_lock_get_reader_count(rwlock),
+			(ulong) rwlock->waiters,
+			(ulong) rwlock->lock_word, /* cast to match %lx */
+			rwlock->last_s_file_name,
+			(ulong) rwlock->last_s_line,
+			rwlock->last_x_file_name,
+			(ulong) rwlock->last_x_line);
+	} else {
+		ut_error;
+	}
+
+	if (!cell->waiting) {
+		fputs("wait has ended\n", file);
+	}
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Searches the reserved cells of the wait array for the given thread id.
+@return	pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	os_thread_id_t	thread)	/*!< in: thread id */
+{
+	sync_cell_t*	candidate;
+	ulint		pos;
+
+	for (pos = 0; pos < arr->n_cells; pos++) {
+		candidate = sync_array_get_nth_cell(arr, pos);
+
+		if (candidate->wait_object == NULL) {
+			continue;	/* free cell: nobody waits here */
+		}
+
+		if (os_thread_eq(candidate->thread, thread)) {
+			return(candidate);
+		}
+	}
+	return(NULL);	/* no reserved cell belongs to the thread */
+}
+
+/******************************************************************//**
+One recursion step of the deadlock search: follows a lock holder thread.
+@return	TRUE if deadlock detected */
+static
+ibool
+sync_array_deadlock_step(
+/*=====================*/
+	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
+				own the mutex to array */
+	sync_cell_t*	start,	/*!< in: cell where recursive search
+				started */
+	os_thread_id_t	thread,	/*!< in: thread to look at */
+	ulint		pass,	/*!< in: pass value */
+	ulint		depth)	/*!< in: recursion depth */
+{
+	sync_cell_t*	found;
+
+	depth++;
+
+	if (pass != 0) {
+		/* With a nonzero pass value we do not know which
+		threads are responsible for releasing the lock, so
+		no deadlock can be detected. */
+
+		return(FALSE);
+	}
+
+	found = sync_array_find_thread(arr, thread);
+
+	if (found == start) {
+		/* The wait chain leads back to the cell where the
+		search began: stop the other threads and report */
+
+		ut_dbg_stop_threads = TRUE;
+
+		fputs("########################################\n"
+		      "DEADLOCK of threads detected!\n", stderr);
+
+		return(TRUE);
+	}
+
+	if (found != NULL
+	    && sync_array_detect_deadlock(arr, start, found, depth)) {
+		/* Following that thread's own wait found a deadlock */
+
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return	TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+	sync_array_t*	arr,	/*!< in: wait array; NOTE! the caller must
+				own the mutex to array */
+	sync_cell_t*	start,	/*!< in: cell where recursive search started */
+	sync_cell_t*	cell,	/*!< in: cell to search */
+	ulint		depth)	/*!< in: recursion depth */
+{
+	mutex_t*	mutex;
+	rw_lock_t*	lock;
+	os_thread_id_t	thread;
+	ibool		ret;
+	rw_lock_debug_t*debug;
+
+	ut_a(arr);
+	ut_a(start);
+	ut_a(cell);
+	ut_ad(cell->wait_object);
+	ut_ad(os_thread_get_curr_id() == start->thread);
+	ut_ad(depth < 100);
+
+	depth++;
+
+	if (!cell->waiting) {
+
+		return(FALSE); /* No deadlock here */
+	}
+
+	if (cell->request_type == SYNC_MUTEX) {
+
+		mutex = cell->wait_object;
+
+		if (mutex_get_lock_word(mutex) != 0) {
+			thread = mutex->thread_id;
+
+			/* Note that mutex->thread_id above may be
+			also OS_THREAD_ID_UNDEFINED, because the
+			thread which held the mutex maybe has not
+			yet updated the value, or it has already
+			released the mutex: in this case no deadlock
+			can occur, as the wait array cannot contain
+			a thread with ID_UNDEFINED value. */
+
+			ret = sync_array_deadlock_step(arr, start, thread, 0,
+						       depth);
+			if (ret) {
+				fprintf(stderr,
+			"Mutex %p owned by thread %lu file %s line %lu\n",
+					(void*) mutex, /* %p takes a void* */
+					(ulong) os_thread_pf(mutex->thread_id),
+					mutex->file_name, (ulong) mutex->line);
+				sync_array_cell_print(stderr, cell);
+
+				return(TRUE);
+			}
+		}
+
+		return(FALSE); /* No deadlock */
+
+	} else if (cell->request_type == RW_LOCK_EX
+		   || cell->request_type == RW_LOCK_WAIT_EX) {
+
+		lock = cell->wait_object;
+
+		debug = UT_LIST_GET_FIRST(lock->debug_list);
+
+		while (debug != NULL) {
+
+			thread = debug->thread_id;
+
+			if (((debug->lock_type == RW_LOCK_EX)
+			     && !os_thread_eq(thread, cell->thread))
+			    || ((debug->lock_type == RW_LOCK_WAIT_EX)
+				&& !os_thread_eq(thread, cell->thread))
+			    || (debug->lock_type == RW_LOCK_SHARED)) {
+
+				/* The (wait) x-lock request can block
+				infinitely only if someone (can be also cell
+				thread) is holding s-lock, or someone
+				(cannot be cell thread) (wait) x-lock, and
+				he is blocked by start thread */
+
+				ret = sync_array_deadlock_step(
+					arr, start, thread, debug->pass,
+					depth);
+				if (ret) {
+print:	/* also entered by goto from the RW_LOCK_SHARED branch below */
+					fprintf(stderr, "rw-lock %p ",
+						(void*) lock);
+					sync_array_cell_print(stderr, cell);
+					rw_lock_debug_print(debug);
+					return(TRUE);
+				}
+			}
+
+			debug = UT_LIST_GET_NEXT(list, debug);
+		}
+
+		return(FALSE);
+
+	} else if (cell->request_type == RW_LOCK_SHARED) {
+
+		lock = cell->wait_object;
+		debug = UT_LIST_GET_FIRST(lock->debug_list);
+
+		while (debug != NULL) {
+
+			thread = debug->thread_id;
+
+			if ((debug->lock_type == RW_LOCK_EX)
+			    || (debug->lock_type == RW_LOCK_WAIT_EX)) {
+
+				/* The s-lock request can block infinitely
+				only if someone (can also be cell thread) is
+				holding (wait) x-lock, and he is blocked by
+				start thread */
+
+				ret = sync_array_deadlock_step(
+					arr, start, thread, debug->pass,
+					depth);
+				if (ret) {
+					goto print;
+				}
+			}
+
+			debug = UT_LIST_GET_NEXT(list, debug);
+		}
+
+		return(FALSE);
+
+	} else {
+		ut_error;
+	}
+
+	return(TRUE);	/* Execution never reaches this line: for compiler
+			fooling only */
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Determines if we can wake up the thread waiting for a semaphore.
+@return	TRUE if the lock word permits the request type of the cell */
+static
+ibool
+sync_arr_cell_can_wake_up(
+/*======================*/
+	sync_cell_t*	cell)	/*!< in: cell to search */
+{
+	const ulint	req = cell->request_type;
+	rw_lock_t*	lock;
+
+	if (req == SYNC_MUTEX) {
+		mutex_t*	mutex = cell->wait_object;
+
+		/* The waiter can proceed once the lock word is 0 */
+		if (mutex_get_lock_word(mutex) == 0) {
+
+			return(TRUE);
+		}
+
+		return(FALSE);
+	}
+
+	if (req != RW_LOCK_EX
+	    && req != RW_LOCK_WAIT_EX
+	    && req != RW_LOCK_SHARED) {
+		/* Not a request type that is handled here */
+
+		return(FALSE);
+	}
+
+	lock = cell->wait_object;
+
+	if (req == RW_LOCK_EX) {
+		/* Either unlocked or only read locked. */
+
+		return(lock->lock_word > 0 ? TRUE : FALSE);
+	}
+
+	if (req == RW_LOCK_WAIT_EX) {
+		/* lock_word == 0 means all readers have left */
+
+		return(lock->lock_word == 0 ? TRUE : FALSE);
+	}
+
+	/* RW_LOCK_SHARED: lock_word > 0 means no writer or
+	reserved writer */
+
+	return(lock->lock_word > 0 ? TRUE : FALSE);
+}
+
+/******************************************************************//**
+Releases a cell of the wait array: clears its wait object and decrements
+the reserved count. NOTE! sync_array_wait_event frees the cell itself! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+	sync_array_t*	arr,	/*!< in: wait array */
+	ulint		index)  /*!< in: index of the cell in array */
+{
+	sync_cell_t*	freed;
+
+	sync_array_enter(arr);
+	freed = sync_array_get_nth_cell(arr, index);
+
+	/* The cell must have a wait object, i.e., it must be in use. */
+	ut_a(freed->wait_object != NULL);
+
+	freed->waiting = FALSE;
+	freed->wait_object = NULL;
+	freed->signal_count = 0;
+
+	ut_a(arr->n_reserved > 0);
+	arr->n_reserved -= 1;
+
+	sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Increments the signalled count of the wait array by one: atomically if
+atomic builtins are available, else inside sync_array_enter()/_exit(). */
+UNIV_INTERN
+void
+sync_array_object_signalled(
+/*========================*/
+	sync_array_t*	arr)	/*!< in: wait array */
+{
+#ifndef HAVE_ATOMIC_BUILTINS
+	sync_array_enter(arr);
+
+	arr->sg_count += 1;
+	sync_array_exit(arr);
+#else /* HAVE_ATOMIC_BUILTINS */
+	(void) os_atomic_increment_ulint(&arr->sg_count, 1);
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore releases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+	sync_array_t*	arr	= sync_primary_wait_array;
+	sync_cell_t*	cell;
+	ulint		count;
+	ulint		i;
+	os_event_t	event;
+
+	sync_array_enter(arr);
+
+	i = 0;
+	count = 0;
+
+	/* Visit cells in index order until every reserved cell was seen */
+	while (count < arr->n_reserved) {
+		cell = sync_array_get_nth_cell(arr, i);
+		i++;
+
+		if (cell->wait_object == NULL) {
+			/* Unused cell: does not count as reserved */
+			continue;
+		}
+
+		count++;
+
+		if (sync_arr_cell_can_wake_up(cell)) {
+			event = sync_cell_get_event(cell);
+
+			os_event_set(event);
+		}
+	}
+
+	sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return	TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(void)
+/*=============================*/
+{
+	sync_cell_t*	cell;
+	ibool		old_val;
+	ibool		noticed = FALSE;
+	ulint		i;
+	ulint		fatal_timeout = srv_fatal_semaphore_wait_threshold;
+	ibool		fatal = FALSE;
+	double		waited;	/* seconds since the cell was reserved */
+
+	for (i = 0; i < sync_primary_wait_array->n_cells; i++) {
+		cell = sync_array_get_nth_cell(sync_primary_wait_array, i);
+		if (cell->wait_object == NULL || !cell->waiting) {
+			continue;
+		}
+		/* One time sample serves both of the checks below */
+		waited = difftime(time(NULL), cell->reservation_time);
+		if (waited > 240) {
+			fputs("InnoDB: Warning: a long semaphore wait:\n",
+			      stderr);
+			sync_array_cell_print(stderr, cell);
+			noticed = TRUE;
+		}
+		if (waited > fatal_timeout) {
+			fatal = TRUE;
+		}
+	}
+
+	if (noticed) {
+		fprintf(stderr,
+			"InnoDB: ###### Starts InnoDB Monitor"
+			" for 30 secs to print diagnostic info:\n");
+		old_val = srv_print_innodb_monitor;
+
+		/* If some crucial semaphore is reserved, then also the InnoDB
+		Monitor can hang, and we do not get diagnostics. Since in
+		many cases an InnoDB hang is caused by a pwrite() or a pread()
+		call hanging inside the operating system, let us print right
+		now the values of pending calls of these. */
+
+		fprintf(stderr,
+			"InnoDB: Pending preads %lu, pwrites %lu\n",
+			(ulong)os_file_n_pending_preads,
+			(ulong)os_file_n_pending_pwrites);
+
+		srv_print_innodb_monitor = TRUE;
+		os_event_set(srv_lock_timeout_thread_event);
+
+		os_thread_sleep(30000000);
+
+		srv_print_innodb_monitor = old_val;
+		fprintf(stderr,
+			"InnoDB: ###### Diagnostic info printed"
+			" to the standard error stream\n");
+	}
+
+	return(fatal);
+}
+
+/**********************************************************************//**
+Prints the counters of the wait array and every cell that is in use. */
+static
+void
+sync_array_output_info(
+/*===================*/
+	FILE*		file,	/*!< in: file where to print */
+	sync_array_t*	arr)	/*!< in: wait array; NOTE! caller must own the
+				mutex */
+{
+	sync_cell_t*	cell;
+	ulint		count;
+	ulint		i;
+
+	fprintf(file,
+		"OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n",
+		(long) arr->res_count, (long) arr->sg_count);
+	i = 0;
+	count = 0;
+
+	/* Walk the cells until all reserved ones have been printed */
+	while (count < arr->n_reserved) {
+		cell = sync_array_get_nth_cell(arr, i);
+
+		if (cell->wait_object != NULL) {
+			count++;
+			sync_array_cell_print(file, cell);
+		}
+
+		i++;
+	}
+}
+
+/**********************************************************************//**
+Prints info of the wait array; wraps the output in sync_array_enter/exit. */
+UNIV_INTERN
+void
+sync_array_print_info(
+/*==================*/
+	FILE*		file,	/*!< in: file where to print */
+	sync_array_t*	arr)	/*!< in: wait array */
+{
+	sync_array_enter(arr);
+
+	sync_array_output_info(file, arr);	/* requires the mutex */
+
+	sync_array_exit(arr);
+}
diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c
new file mode 100644
index 00000000000..9e10f6e943b
--- /dev/null
+++ b/storage/xtradb/sync/sync0rw.c
@@ -0,0 +1,1037 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0rw.c
+The read-write lock (for thread synchronization)
+
+Created 9/11/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0rw.h"
+#ifdef UNIV_NONINL
+#include "sync0rw.ic"
+#endif
+
+#include "os0thread.h"
+#include "mem0mem.h"
+#include "srv0srv.h"
+#include "os0sync.h" /* for INNODB_RW_LOCKS_USE_ATOMICS */
+
+/*
+	IMPLEMENTATION OF THE RW_LOCK
+	=============================
+The status of a rw_lock is held in lock_word. The initial value of lock_word is
+X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR
+for each x-lock. This describes the lock state for each value of lock_word:
+
+lock_word == X_LOCK_DECR:      Unlocked.
+0 < lock_word < X_LOCK_DECR:   Read locked, no waiting writers.
+			       (X_LOCK_DECR - lock_word) is the
+			       number of readers that hold the lock.
+lock_word == 0:		       Write locked
+-X_LOCK_DECR < lock_word < 0:  Read locked, with a waiting writer.
+			       (-lock_word) is the number of readers
+			       that hold the lock.
+lock_word <= -X_LOCK_DECR:     Recursively write locked. lock_word has been
+			       decremented by X_LOCK_DECR once for each lock,
+			       so the number of locks is:
+			       ((-lock_word) / X_LOCK_DECR) + 1
+When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0:
+other values of lock_word are invalid.
+
+The lock_word is always read and updated atomically and consistently, so that
+it always represents the state of the lock, and the state of the lock changes
+with a single atomic operation. This lock_word holds all of the information
+that a thread needs in order to determine if it is eligible to gain the lock
+or if it must spin or sleep. The one exception to this is that writer_thread
+must be verified before recursive write locks: to solve this scenario, we make
+writer_thread readable by all threads, but only writeable by the x-lock holder.
+
+The other members of the lock obey the following rules to remain consistent:
+
+recursive:	This and the writer_thread field together control the
+		behaviour of recursive x-locking.
+		lock->recursive must be FALSE in following states:
+			1) The writer_thread contains garbage i.e.: the
+			lock has just been initialized.
+			2) The lock is not x-held and there is no
+			x-waiter waiting on WAIT_EX event.
+			3) The lock is x-held or there is an x-waiter
+			waiting on WAIT_EX event but the 'pass' value
+			is non-zero.
+		lock->recursive is TRUE iff:
+			1) The lock is x-held or there is an x-waiter
+			waiting on WAIT_EX event and the 'pass' value
+			is zero.
+		This flag must be set after the writer_thread field
+		has been updated with a memory ordering barrier.
+		It is unset before the lock_word has been incremented.
+writer_thread:	Is used only in recursive x-locking. Can only be safely
+		read iff lock->recursive flag is TRUE.
+		This field is uninitialized at lock creation time and
+		is updated atomically when x-lock is acquired or when
+		move_ownership is called. A thread is only allowed to
+		set the value of this field to its thread_id i.e.: a
+		thread cannot set writer_thread to some other thread's
+		id.
+waiters:	May be set to 1 anytime, but to avoid unnecessary wake-up
+		signals, it should only be set to 1 when there are threads
+		waiting on event. Must be 1 when a writer starts waiting to
+		ensure the current x-locking thread sends a wake-up signal
+		during unlock. May only be reset to 0 immediately before a
+		wake-up signal is sent to event. On most platforms, a
+		memory barrier is required after waiters is set, and before
+		verifying lock_word is still held, to ensure some unlocker
+		really does see the flag's new value.
+event:		Threads wait on event for read or writer lock when another
+		thread has an x-lock or an x-lock reservation (wait_ex). A
+		thread may only	wait on event after performing the following
+		actions in order:
+		   (1) Record the counter value of event (with os_event_reset).
+		   (2) Set waiters to 1.
+		   (3) Verify lock_word <= 0.
+		(1) must come before (2) to ensure signal is not missed.
+		(2) must come before (3) to ensure a signal is sent.
+		These restrictions force the above ordering.
+		Immediately before sending the wake-up signal, we should:
+		   (1) Verify lock_word == X_LOCK_DECR (unlocked)
+		   (2) Reset waiters to 0.
+wait_ex_event:	A thread may only wait on the wait_ex_event after it has
+		performed the following actions in order:
+		   (1) Decrement lock_word by X_LOCK_DECR.
+		   (2) Record counter value of wait_ex_event (os_event_reset,
+                       called from sync_array_reserve_cell).
+		   (3) Verify that lock_word < 0.
+		(1) must come first to ensure no other threads become reader
+                or next writer, and notifies unlocker that signal must be sent.
+                (2) must come before (3) to ensure the signal is not missed.
+		These restrictions force the above ordering.
+		Immediately before sending the wake-up signal, we should:
+		   Verify lock_word == 0 (waiting thread holds x_lock)
+*/
+
+
+/** number of spin waits on rw-latches during shared (read) lock requests;
+incremented once per call of rw_lock_s_lock_spin() */
+UNIV_INTERN ib_int64_t	rw_s_spin_wait_count	= 0;
+/** number of spin loop rounds spent on rw-latches
+during shared (read) lock requests */
+UNIV_INTERN ib_int64_t	rw_s_spin_round_count	= 0;
+
+/** number of OS waits on rw-latches
+during shared (read) lock requests */
+UNIV_INTERN ib_int64_t	rw_s_os_wait_count	= 0;
+
+/** number of unlocks (that unlock shared locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+UNIV_INTERN ib_int64_t	rw_s_exit_count		= 0;
+
+/** number of spin waits on rw-latches
+during exclusive (write) lock requests */
+UNIV_INTERN ib_int64_t	rw_x_spin_wait_count	= 0;
+/** number of spin loop rounds spent on rw-latches
+during exclusive (write) lock requests */
+UNIV_INTERN ib_int64_t	rw_x_spin_round_count	= 0;
+
+/** number of OS waits on rw-latches
+during exclusive (write) lock requests */
+UNIV_INTERN ib_int64_t	rw_x_os_wait_count	= 0;
+
+/** number of unlocks (that unlock exclusive locks),
+set only when UNIV_SYNC_PERF_STAT is defined */
+UNIV_INTERN ib_int64_t	rw_x_exit_count		= 0;
+
+/* The global list of rw-locks and the mutex held while changing it */
+UNIV_INTERN rw_lock_list_t	rw_lock_list;
+UNIV_INTERN mutex_t		rw_lock_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The global mutex which protects debug info lists of all rw-locks.
+To modify the debug info list of an rw-lock, this mutex has to be
+acquired in addition to the mutex protecting the lock. */
+
+UNIV_INTERN mutex_t		rw_lock_debug_mutex;
+/* If a thread does not get rw_lock_debug_mutex immediately,
+it may wait for this event (see rw_lock_debug_mutex_enter) */
+UNIV_INTERN os_event_t		rw_lock_debug_event;
+/* This is set to TRUE, if there may be waiters for the event */
+UNIV_INTERN ibool		rw_lock_debug_waiters;
+
+/******************************************************************//**
+Creates a debug info struct (forward declaration). */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void);
+/*======================*/
+/******************************************************************//**
+Frees a debug info struct (forward declaration). */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+	rw_lock_debug_t* info);	/*!< in: debug info struct to free */
+
+/******************************************************************//**
+Creates a debug info struct; its fields are left for the caller to fill in.
+@return	own: debug info struct */
+static
+rw_lock_debug_t*
+rw_lock_debug_create(void)
+/*======================*/
+{
+	return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t)));
+}
+
+/******************************************************************//**
+Frees a debug info struct by passing it to mem_free. */
+static
+void
+rw_lock_debug_free(
+/*===============*/
+	rw_lock_debug_t* info)	/*!< in: debug info struct to free */
+{
+	mem_free(info);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Initializes an rw-lock object in caller-provided memory (which must be
+appropriately aligned) to the non-locked state and adds it to the global
+rw-lock list. Explicit freeing of the rw-lock with rw_lock_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+rw_lock_create_func(
+/*================*/
+	rw_lock_t*	lock,		/*!< in: pointer to memory */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+	const char*	cmutex_name,	/*!< in: name stored in the lock */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline)		/*!< in: file line where created */
+{
+	/* If this is the very first time a synchronization object is
+	created, then the following call initializes the sync system. */
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	mutex_create(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+
+	lock->mutex.cmutex_name = cmutex_name;
+
+	ut_d(lock->mutex.cfile_name = cfile_name);
+	ut_d(lock->mutex.cline = cline);
+	ut_d(lock->mutex.mutex_type = 1);
+#else /* INNODB_RW_LOCKS_USE_ATOMICS */
+# ifdef UNIV_DEBUG
+	UT_NOT_USED(cfile_name);
+	UT_NOT_USED(cline);
+# endif
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	/* lock->recursive == FALSE signifies that lock->writer_thread
+	contains garbage at initialization and cannot be used for
+	recursive x-locking. */
+	lock->recursive = FALSE;
+
+	lock->waiters = 0;
+	lock->lock_word = X_LOCK_DECR;
+
+#ifdef UNIV_SYNC_DEBUG
+	lock->level = level;
+
+	UT_LIST_INIT(lock->debug_list);
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_d(lock->magic_n = RW_LOCK_MAGIC_N);
+
+	lock->lock_name = cmutex_name;
+
+	lock->last_s_file_name = "not yet reserved";
+	lock->last_s_line = 0;
+	lock->last_x_file_name = "not yet reserved";
+	lock->last_x_line = 0;
+	lock->count_os_wait = 0;
+	lock->event = os_event_create(NULL);
+	lock->wait_ex_event = os_event_create(NULL);
+
+	mutex_enter(&rw_lock_list_mutex);
+
+	ut_ad(UT_LIST_GET_FIRST(rw_lock_list) == NULL
+	      || UT_LIST_GET_FIRST(rw_lock_list)->magic_n == RW_LOCK_MAGIC_N);
+
+	UT_LIST_ADD_FIRST(list, rw_lock_list, lock);
+
+	mutex_exit(&rw_lock_list_mutex);
+}
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the rw-lock is freed. Frees the events of the rw-lock and removes it from
+the global list. The rw-lock is checked to be in the non-locked state. */
+UNIV_INTERN
+void
+rw_lock_free(
+/*=========*/
+	rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	ut_ad(rw_lock_validate(lock));
+	ut_a(lock->lock_word == X_LOCK_DECR);	/* must be unlocked */
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	mutex_free(rw_lock_get_mutex(lock));
+#endif /* INNODB_RW_LOCKS_USE_ATOMICS */
+
+	mutex_enter(&rw_lock_list_mutex);
+	os_event_free(lock->event);
+
+	os_event_free(lock->wait_ex_event);
+
+	ut_ad(UT_LIST_GET_PREV(list, lock) == NULL
+	      || UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+	ut_ad(UT_LIST_GET_NEXT(list, lock) == NULL
+	      || UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+
+	UT_LIST_REMOVE(list, rw_lock_list, lock);
+
+	mutex_exit(&rw_lock_list_mutex);
+
+	ut_d(lock->magic_n = 0);	/* rw_lock_validate now fails on it */
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the rw-lock has been initialized, that its waiters flag is
+0 or 1 and that lock_word holds a valid value.
+@return	TRUE */
+UNIV_INTERN
+ibool
+rw_lock_validate(
+/*=============*/
+	rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	ulint	waiters;
+	lint	lock_word;
+
+	ut_a(lock);	/* checked before lock is dereferenced */
+	waiters = rw_lock_get_waiters(lock);
+	lock_word = lock->lock_word;
+	ut_ad(lock->magic_n == RW_LOCK_MAGIC_N);
+	ut_a(waiters == 0 || waiters == 1);
+	ut_a(lock_word > -X_LOCK_DECR || (-lock_word) % X_LOCK_DECR == 0);
+	return(TRUE);
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Lock an rw-lock in shared mode for the current thread. If the rw-lock is
+locked in exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock, before suspending the thread. */
+UNIV_INTERN
+void
+rw_lock_s_lock_spin(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock
+				will be passed to another thread to unlock */
+	const char*	file_name, /*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	ulint	 index;	/* index of the reserved wait cell */
+	ulint	 i = 0;	/* spin round count */
+
+	ut_ad(rw_lock_validate(lock));
+
+	rw_s_spin_wait_count++;	/*!< Count calls to this function */
+lock_loop:
+
+	/* Spin while lock_word <= 0, i.e., while no s-lock can be granted */
+	while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
+		if (srv_spin_wait_delay) {
+			ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+		}
+
+		i++;
+	}
+
+	if (i == SYNC_SPIN_ROUNDS) {
+		os_thread_yield();
+	}
+
+	if (srv_print_latch_waits) {
+		fprintf(stderr,
+			"Thread %lu spin wait rw-s-lock at %p"
+			" '%s' rnds %lu\n",
+			(ulong) os_thread_pf(os_thread_get_curr_id()),
+			(void*) lock,
+			lock->lock_name, (ulong) i);
+	}
+
+	/* We try once again to obtain the lock */
+	if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+		rw_s_spin_round_count += i;
+
+		return; /* Success */
+	} else {
+
+		if (i < SYNC_SPIN_ROUNDS) {
+			goto lock_loop;
+		}
+
+		rw_s_spin_round_count += i;
+
+		sync_array_reserve_cell(sync_primary_wait_array,
+					lock, RW_LOCK_SHARED,
+					file_name, line,
+					&index);
+
+		/* Set waiters before checking lock_word to ensure wake-up
+		signal is sent. This may lead to some unnecessary signals. */
+		rw_lock_set_waiter_flag(lock);
+
+		if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+			sync_array_free_cell(sync_primary_wait_array, index);
+			return; /* Success */
+		}
+
+		if (srv_print_latch_waits) {
+			/* %lu needs an unsigned long: cast as done above */
+			fprintf(stderr,
+				"Thread %lu OS wait rw-s-lock at %p"
+				" '%s'\n",
+				(ulong) os_thread_pf(os_thread_get_curr_id()),
+				(void*) lock, lock->lock_name);
+		}
+		/* these stats may not be accurate */
+		lock->count_os_wait++;
+		rw_s_os_wait_count++;
+
+		sync_array_wait_event(sync_primary_wait_array, index);
+
+		i = 0;
+		goto lock_loop;
+	}
+}
+
+/******************************************************************//**
+This function is used in the insert buffer to move the ownership of an
+x-latch on a buffer frame to the current thread. The x-latch was set by
+the buffer read operation and it protected the buffer frame while the
+read was done. The ownership is moved because we want the current
+thread to be able to acquire a second x-latch which is stored in an mtr.
+This, in turn, is needed to pass the debug checks of index page
+operations. */
+UNIV_INTERN
+void
+rw_lock_x_lock_move_ownership(
+/*==========================*/
+	rw_lock_t*	lock)	/*!< in: lock which was x-locked in the
+				buffer read */
+{
+	ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));	/* must be x-locked */
+
+	rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+}
+
+/******************************************************************//**
+Function for the next writer to call. Waits for readers to exit.
+The caller must have already decremented lock_word by X_LOCK_DECR. */
+UNIV_INLINE
+void
+rw_lock_x_lock_wait(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+#ifdef UNIV_SYNC_DEBUG
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+#endif
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	ulint index;	/* index of the reserved wait cell */
+	ulint i = 0;	/* spin round count */
+
+	ut_ad(lock->lock_word <= 0);
+
+	while (lock->lock_word < 0) {
+		if (srv_spin_wait_delay) {
+			ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+		}
+		if(i < SYNC_SPIN_ROUNDS) {
+			i++;
+			continue;
+		}
+
+		/* Spinning is over and lock_word was still < 0: prepare to sleep. */
+		rw_x_spin_round_count += i;
+		i = 0;
+		sync_array_reserve_cell(sync_primary_wait_array,
+					lock,
+					RW_LOCK_WAIT_EX,
+					file_name, line,
+					&index);
+		/* Check lock_word to ensure wake-up isn't missed.*/
+		if(lock->lock_word < 0) {
+
+			/* these stats may not be accurate */
+			lock->count_os_wait++;
+			rw_x_os_wait_count++;
+
+                        /* Add debug info as it is needed to detect possible
+                        deadlock. We must add info for WAIT_EX thread for
+                        deadlock detection to work properly. */
+#ifdef UNIV_SYNC_DEBUG
+			rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
+					       file_name, line);
+#endif
+
+			sync_array_wait_event(sync_primary_wait_array,
+					      index);
+#ifdef UNIV_SYNC_DEBUG
+			rw_lock_remove_debug_info(lock, pass,
+					       RW_LOCK_WAIT_EX);
+#endif
+                        /* It is possible to wake when lock_word < 0.
+                        We must pass the while-loop check to proceed.*/
+		} else {	/* lock_word >= 0 now: cell is not needed */
+			sync_array_free_cell(sync_primary_wait_array,
+					     index);
+		}
+	}
+	rw_x_spin_round_count += i;	/* rounds of the last spin phase */
+}
+
+/******************************************************************//**
+Low-level function for acquiring an exclusive lock.
+@return	FALSE if did not succeed, TRUE if success. */
+UNIV_INLINE
+ibool
+rw_lock_x_lock_low(
+/*===============*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	os_thread_id_t	curr_thread	= os_thread_get_curr_id();
+
+	if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) {
+
+		/* lock->recursive also tells us if the writer_thread
+		field is stale or active. As we are going to write
+		our own thread id in that field it must be that the
+		current writer_thread value is not active. */
+		ut_a(!lock->recursive);
+
+		/* Decrement occurred: we are writer or next-writer. */
+		rw_lock_set_writer_id_and_recursion_flag(lock,
+						pass ? FALSE : TRUE);
+
+		rw_lock_x_lock_wait(lock,
+#ifdef UNIV_SYNC_DEBUG
+				    pass,
+#endif
+                                    file_name, line);
+
+	} else {
+		/* Decrement failed: relock or failed lock */
+		if (!pass && lock->recursive
+		    && os_thread_eq(lock->writer_thread, curr_thread)) {
+			/* Relock: recursive x-lock by the writer thread */
+                        lock->lock_word -= X_LOCK_DECR;
+		} else {
+			/* Another thread locked before us */
+			return(FALSE);
+		}
+	}
+#ifdef UNIV_SYNC_DEBUG
+	rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
+			       file_name, line);
+#endif
+	lock->last_x_file_name = file_name;
+	lock->last_x_line = (unsigned int) line;
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+NOTE! Use the corresponding macro, not directly this function! Lock an
+rw-lock in exclusive mode for the current thread. If the rw-lock is locked
+in shared or exclusive mode, or there is an exclusive lock request waiting,
+the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the lock before suspending the thread. If the same thread has an x-lock
+on the rw-lock, locking succeed, with the following exception: if pass != 0,
+only a single x-lock may be taken on the lock. NOTE: If the same thread has
+an s-lock, locking does not succeed! */
+UNIV_INTERN
+void
+rw_lock_x_lock_func(
+/*================*/
+	rw_lock_t*	lock,	/*!< in: pointer to rw-lock */
+	ulint		pass,	/*!< in: pass value; != 0, if the lock will
+				be passed to another thread to unlock */
+	const char*	file_name,/*!< in: file name where lock requested */
+	ulint		line)	/*!< in: line where requested */
+{
+	ulint	index;	/*!< index of the reserved wait cell */
+	ulint	i;	/*!< spin round count */
+	ibool	spinning = FALSE;
+
+	ut_ad(rw_lock_validate(lock));
+
+	i = 0;
+
+lock_loop:
+
+	if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+		rw_x_spin_round_count += i;
+
+		return;	/* Locking succeeded */
+
+	} else {
+
+		/* Count a spin wait only once per call of this function */
+		if (!spinning) {
+			spinning = TRUE;
+			rw_x_spin_wait_count++;
+		}
+		/* Spin waiting for the lock_word to become free */
+		while (i < SYNC_SPIN_ROUNDS
+		       && lock->lock_word <= 0) {
+			if (srv_spin_wait_delay) {
+				ut_delay(ut_rnd_interval(0,
+							 srv_spin_wait_delay));
+			}
+
+			i++;
+		}
+		if (i == SYNC_SPIN_ROUNDS) {
+			os_thread_yield();
+		} else {
+			goto lock_loop;
+		}
+	}
+
+	rw_x_spin_round_count += i;
+
+	if (srv_print_latch_waits) {
+		fprintf(stderr,
+			"Thread %lu spin wait rw-x-lock at %p"
+			" '%s' rnds %lu\n",
+			(ulong) os_thread_pf(os_thread_get_curr_id()),
+			(void*) lock, lock->lock_name, (ulong) i);
+	}
+
+	sync_array_reserve_cell(sync_primary_wait_array,
+				lock,
+				RW_LOCK_EX,
+				file_name, line,
+				&index);
+
+	/* Waiters must be set before checking lock_word, to ensure signal
+	is sent. This could lead to a few unnecessary wake-up signals. */
+	rw_lock_set_waiter_flag(lock);
+
+	if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
+		sync_array_free_cell(sync_primary_wait_array, index);
+		return; /* Locking succeeded */
+	}
+
+	if (srv_print_latch_waits) {
+		fprintf(stderr,
+			"Thread %lu OS wait for rw-x-lock at %p"
+			" '%s'\n",
+			(ulong) os_thread_pf(os_thread_get_curr_id()),
+			(void*) lock, lock->lock_name);
+	}
+
+	/* these stats may not be accurate */
+	lock->count_os_wait++;
+	rw_x_os_wait_count++;
+
+	sync_array_wait_event(sync_primary_wait_array, index);
+
+	i = 0;
+	goto lock_loop;
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Acquires the debug mutex. We cannot use the mutex defined in sync0sync,
+because the debug mutex is also acquired in sync0arr while holding the OS
+mutex protecting the sync array, and the ordinary mutex_enter might
+recursively call routines in sync0arr, leading to a deadlock on the OS
+mutex. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_enter(void)
+/*==========================*/
+{
+loop:
+	if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+		return;	/* acquired without waiting */
+	}
+
+	os_event_reset(rw_lock_debug_event);	/* reset before flagging */
+
+	rw_lock_debug_waiters = TRUE;	/* makes the releaser set the event */
+
+	if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) {
+		return;	/* acquired on the retry: no need to wait */
+	}
+
+	os_event_wait(rw_lock_debug_event);
+
+	goto loop;	/* woken up: try to acquire again */
+}
+
+/******************************************************************//**
+Releases the debug mutex and, if waiters were flagged, sets the event. */
+UNIV_INTERN
+void
+rw_lock_debug_mutex_exit(void)
+/*==========================*/
+{
+	mutex_exit(&rw_lock_debug_mutex);
+
+	if (rw_lock_debug_waiters) {
+		rw_lock_debug_waiters = FALSE;	/* cleared before the set */
+		os_event_set(rw_lock_debug_event);
+	}
+}
+
+/******************************************************************//**
+Adds a debug info record of the calling thread to an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_add_debug_info(
+/*===================*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		pass,		/*!< in: pass value */
+	ulint		lock_type,	/*!< in: lock type */
+	const char*	file_name,	/*!< in: file where requested */
+	ulint		line)		/*!< in: line where requested */
+{
+	rw_lock_debug_t*	rec;
+
+	ut_ad(lock);
+	ut_ad(file_name);
+
+	rec = rw_lock_debug_create();
+
+	rw_lock_debug_mutex_enter();
+
+	rec->file_name	= file_name;
+	rec->line	= line;
+	rec->lock_type	= lock_type;
+	rec->thread_id	= os_thread_get_curr_id();
+	rec->pass	= pass;
+	UT_LIST_ADD_FIRST(list, lock->debug_list, rec);
+
+	rw_lock_debug_mutex_exit();
+	/* sync_thread_add_level is skipped for pass != 0 and for WAIT_EX */
+	if (pass != 0 || lock_type == RW_LOCK_WAIT_EX) {
+		return;
+	}
+	sync_thread_add_level(lock, lock->level);
+}
+
+/******************************************************************//**
+Removes and frees the first matching debug info struct of an rw-lock. */
+UNIV_INTERN
+void
+rw_lock_remove_debug_info(
+/*======================*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		pass,		/*!< in: pass value */
+	ulint		lock_type)	/*!< in: lock type */
+{
+	rw_lock_debug_t*	rec;
+
+	ut_ad(lock);
+
+	if (pass == 0 && lock_type != RW_LOCK_WAIT_EX) {
+		sync_thread_reset_level(lock);
+	}
+
+	rw_lock_debug_mutex_enter();
+
+	for (rec = UT_LIST_GET_FIRST(lock->debug_list);
+	     rec != NULL;
+	     rec = UT_LIST_GET_NEXT(list, rec)) {
+		if (rec->pass != pass || rec->lock_type != lock_type) {
+			continue;
+		}
+		/* A record with pass == 0 must also belong to the
+		calling thread; with pass != 0 any thread matches. */
+		if (pass == 0
+		    && !os_thread_eq(rec->thread_id,
+				     os_thread_get_curr_id())) {
+			continue;
+		}
+		/* Found! */
+		UT_LIST_REMOVE(list, lock->debug_list, rec);
+		rw_lock_debug_mutex_exit();
+		rw_lock_debug_free(rec);
+		return;
+	}
+
+	/* No matching record was found in the list. */
+	ut_error;
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Checks if the thread has locked the rw-lock in the specified mode, with
+the pass value == 0.
+@return	TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_own(
+/*========*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+{
+	rw_lock_debug_t*	rec;
+	ibool			owned	= FALSE;
+
+	ut_ad(lock);
+	ut_ad(rw_lock_validate(lock));
+
+	rw_lock_debug_mutex_enter();
+
+	/* Search the debug list for a record of the calling thread */
+	for (rec = UT_LIST_GET_FIRST(lock->debug_list);
+	     rec != NULL;
+	     rec = UT_LIST_GET_NEXT(list, rec)) {
+
+		if (rec->pass == 0
+		    && rec->lock_type == lock_type
+		    && os_thread_eq(rec->thread_id,
+				    os_thread_get_curr_id())) {
+			/* Found! */
+			owned = TRUE;
+			break;
+		}
+	}
+
+	rw_lock_debug_mutex_exit();
+
+	return(owned);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Tells whether some thread currently holds the rw-lock in the given mode.
+@return	TRUE if locked */
+UNIV_INTERN
+ibool
+rw_lock_is_locked(
+/*==============*/
+	rw_lock_t*	lock,		/*!< in: rw-lock */
+	ulint		lock_type)	/*!< in: lock type: RW_LOCK_SHARED,
+					RW_LOCK_EX */
+{
+	ut_ad(lock);
+	ut_ad(rw_lock_validate(lock));
+
+	if (lock_type == RW_LOCK_SHARED) {
+		/* Shared mode: locked if there is at least one reader */
+		return(rw_lock_get_reader_count(lock) > 0 ? TRUE : FALSE);
+	}
+
+	if (lock_type == RW_LOCK_EX) {
+		/* Exclusive mode: locked if the writer state is X */
+		return(rw_lock_get_writer(lock) == RW_LOCK_EX
+		       ? TRUE : FALSE);
+	}
+
+	/* Any other lock type is a caller error */
+	ut_error;
+
+	return(FALSE);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/***************************************************************//**
+Prints debug info of currently locked rw-locks and the total rw-lock count. */
+UNIV_INTERN
+void
+rw_lock_list_print_info(
+/*====================*/
+	FILE*	file)		/*!< in: file where to print */
+{
+	rw_lock_t*	lock;
+	ulint		count		= 0;
+	rw_lock_debug_t* info;
+
+	mutex_enter(&rw_lock_list_mutex);
+
+	fputs("-------------\n"
+	      "RW-LATCH INFO\n"
+	      "-------------\n", file);
+
+	lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+	while (lock != NULL) {
+
+		count++;
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+		mutex_enter(&(lock->mutex));
+#endif
+		if (lock->lock_word != X_LOCK_DECR) {
+
+			fprintf(file, "RW-LOCK: %p ", (void*) lock);
+
+			if (rw_lock_get_waiters(lock)) {
+				fputs(" Waiters for the lock exist\n", file);
+			} else {
+				putc('\n', file);
+			}
+			/* NOTE: rw_lock_debug_print() writes to stderr */
+			info = UT_LIST_GET_FIRST(lock->debug_list);
+			while (info != NULL) {
+				rw_lock_debug_print(info);
+				info = UT_LIST_GET_NEXT(list, info);
+			}
+		}
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+		mutex_exit(&(lock->mutex));
+#endif
+
+		lock = UT_LIST_GET_NEXT(list, lock);
+	}
+	/* count is an unsigned ulint: print it as unsigned long */
+	fprintf(file, "Total number of rw-locks %lu\n", (ulong) count);
+	mutex_exit(&rw_lock_list_mutex);
+}
+
+/***************************************************************//**
+Prints the debug info of one rw-lock to stderr. */
+UNIV_INTERN
+void
+rw_lock_print(
+/*==========*/
+	rw_lock_t*	lock)	/*!< in: rw-lock */
+{
+	rw_lock_debug_t* rec;
+
+	fprintf(stderr,
+		"-------------\n"
+		"RW-LATCH INFO\n"
+		"RW-LATCH: %p ", (void*) lock);
+
+#ifndef INNODB_RW_LOCKS_USE_ATOMICS
+	/* lock->mutex is deliberately not acquired here: that would cause
+	a recursive call to sync_thread_add_level() if UNIV_SYNC_DEBUG is
+	defined.  As this function is only invoked from
+	sync_thread_levels_g(), dirty reads are the smaller evil compared
+	to bogus deadlocks or assertion failures. */
+#endif
+	if (lock->lock_word == X_LOCK_DECR) {
+		return;
+	}
+
+	if (rw_lock_get_waiters(lock)) {
+		fputs(" Waiters for the lock exist\n", stderr);
+	} else {
+		putc('\n', stderr);
+	}
+
+	for (rec = UT_LIST_GET_FIRST(lock->debug_list);
+	     rec != NULL;
+	     rec = UT_LIST_GET_NEXT(list, rec)) {
+		rw_lock_debug_print(rec);
+	}
+}
+
+/*********************************************************************//**
+Prints info of a debug struct to stderr. */
+UNIV_INTERN
+void
+rw_lock_debug_print(
+/*================*/
+	rw_lock_debug_t*	info)	/*!< in: debug struct */
+{
+	ulint	rwt;
+
+	rwt	  = info->lock_type;
+	/* Both values are cast to ulong, hence the %lu conversions */
+	fprintf(stderr, "Locked: thread %lu file %s line %lu  ",
+		(ulong) os_thread_pf(info->thread_id), info->file_name,
+		(ulong) info->line);
+	if (rwt == RW_LOCK_SHARED) {
+		fputs("S-LOCK", stderr);
+	} else if (rwt == RW_LOCK_EX) {
+		fputs("X-LOCK", stderr);
+	} else if (rwt == RW_LOCK_WAIT_EX) {
+		fputs("WAIT X-LOCK", stderr);
+	} else {
+		ut_error;
+	}
+	if (info->pass != 0) {
+		fprintf(stderr, " pass value %lu", (ulong) info->pass);
+	}
+	putc('\n', stderr);
+}
+
+/***************************************************************//**
+Returns the number of rw-locks in rw_lock_list that are currently locked,
+i.e. whose lock_word differs from X_LOCK_DECR. Works only in the debug
+version.
+@return	number of locked rw-locks */
+UNIV_INTERN
+ulint
+rw_lock_n_locked(void)
+/*==================*/
+{
+	rw_lock_t*	cur;
+	ulint		n_locked	= 0;
+
+	/* Hold rw_lock_list_mutex for the whole walk over the list */
+	mutex_enter(&rw_lock_list_mutex);
+
+	for (cur = UT_LIST_GET_FIRST(rw_lock_list);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(list, cur)) {
+
+		if (cur->lock_word != X_LOCK_DECR) {
+			n_locked++;
+		}
+	}
+
+	mutex_exit(&rw_lock_list_mutex);
+
+	return(n_locked);
+}
+#endif /* UNIV_SYNC_DEBUG */
diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c
new file mode 100644
index 00000000000..225f28df78e
--- /dev/null
+++ b/storage/xtradb/sync/sync0sync.c
@@ -0,0 +1,1525 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2010, Innobase Oy. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0sync.c
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#ifdef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "buf0types.h"
+#include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */
+
+/*
+	REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
+	============================================
+
+Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc
+takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995
+Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to
+implement our own efficient spin lock mutex. Future operating systems may
+provide efficient spin locks, but we cannot count on that.
+
+Another reason for implementing a spin lock is that on multiprocessor systems
+it can be more efficient for a processor to run a loop waiting for the
+semaphore to be released than to switch to a different thread. A thread switch
+takes 25 us on both platforms mentioned above. See Gray and Reuter's book
+Transaction processing for background.
+
+How long should the spin loop last before suspending the thread? On a
+uniprocessor, spinning does not help at all, because if the thread owning the
+mutex is not executing, it cannot be released. Spinning actually wastes
+resources.
+
+On a multiprocessor, we do not know if the thread owning the mutex is
+executing or not. Thus it would make sense to spin as long as the operation
+guarded by the mutex would typically last assuming that the thread is
+executing. If the mutex is not released by that time, we may assume that the
+thread owning the mutex is not executing and suspend the waiting thread.
+
+A typical operation (where no i/o involved) guarded by a mutex or a read-write
+lock may last 1 - 20 us on the current Pentium platform. The longest
+operations are the binary searches on an index node.
+
+We conclude that the best choice is to set the spin time at 20 us. Then the
+system should work well on a multiprocessor. On a uniprocessor we have to
+make sure that thread switches due to mutex collisions are not frequent,
+i.e., they do not happen every 100 us or so, because that wastes too much
+resources. If the thread switches are not frequent, the 20 us wasted in spin
+loop is not too much.
+
+Empirical studies on the effect of spin time should be done for different
+platforms.
+
+
+	IMPLEMENTATION OF THE MUTEX
+	===========================
+
+For background, see Curt Schimmel's book on Unix implementation on modern
+architectures. The key points in the implementation are atomicity and
+serialization of memory accesses. The test-and-set instruction (XCHG in
+Pentium) must be atomic. As new processors may have weak memory models, also
+serialization of memory references may be necessary. The successor of Pentium,
+P6, has at least one mode where the memory model is weak. As far as we know,
+in Pentium all memory accesses are serialized in the program order and we do
+not have to worry about the memory model. On other processors there are
+special machine instructions called a fence, memory barrier, or storage
+barrier (STBAR in Sparc), which can be used to serialize the memory accesses
+to happen in program order relative to the fence instruction.
+
+Leslie Lamport has devised a "bakery algorithm" to implement a mutex without
+the atomic test-and-set, but his algorithm should be modified for weak memory
+models. We do not use Lamport's algorithm, because we guess it is slower than
+the atomic test-and-set.
+
+Our mutex implementation works as follows: We first perform the atomic
+test-and-set instruction on the memory word. If the test returns zero, we
+know we got the lock first. If the test returns not zero, some other thread
+was quicker and got the lock: then we spin in a loop reading the memory word,
+waiting for it to become zero. It is wise to just read the word in the loop, not
+perform numerous test-and-set instructions, because they generate memory
+traffic between the cache and the main memory. The read loop can just access
+the cache, saving bus bandwidth.
+
+If we cannot acquire the mutex lock in the specified time, we reserve a cell
+in the wait array, set the waiters byte in the mutex to 1. To avoid a race
+condition, after setting the waiters byte and before suspending the waiting
+thread, we still have to check that the mutex is reserved, because it may
+have happened that the thread which was holding the mutex has just released
+it and did not see the waiters byte set to 1, a case which would lead the
+other thread to an infinite wait.
+
+LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some
+=======
+thread will eventually call os_event_set() on that particular event.
+Thus no infinite wait is possible in this case.
+
+Proof:	After making the reservation the thread sets the waiters field in the
+mutex to 1. Then it checks that the mutex is still reserved by some thread,
+or it reserves the mutex for itself. In any case, some thread (which may be
+also some earlier thread, not necessarily the one currently holding the mutex)
+will set the waiters field to 0 in mutex_exit, and then call
+os_event_set() with the mutex as an argument.
+Q.E.D.
+
+LEMMA 2: If an os_event_set() call is made after some thread has called
+=======
+the os_event_reset() and before it starts wait on that event, the call
+will not be lost to the second thread. This is true even if there is an
+intervening call to os_event_reset() by another thread.
+Thus no infinite wait is possible in this case.
+
+Proof (non-windows platforms): os_event_reset() returns a monotonically
+increasing value of signal_count. This value is increased at every
+call of os_event_set(). If thread A has called os_event_reset() followed
+by thread B calling os_event_set() and then some other thread C calling
+os_event_reset(), the is_set flag of the event will be set to FALSE;
+but now if thread A calls os_event_wait_low() with the signal_count
+value returned from the earlier call of os_event_reset(), it will
+return immediately without waiting.
+Q.E.D.
+
+Proof (windows): If there is a writer thread which is forced to wait for
+the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX.
+The design of rw_lock ensures that there is one and only one thread
+that is able to change the state to RW_LOCK_WAIT_EX and this thread is
+guaranteed to acquire the lock after it is released by the current
+holders and before any other waiter gets the lock.
+On windows this thread waits on a separate event i.e.: wait_ex_event.
+Since only one thread can wait on this event there is no chance
+of this event getting reset before the writer starts wait on it.
+Therefore, this thread is guaranteed to catch the os_event_set()
+signalled unconditionally at the release of the lock.
+Q.E.D. */
+
+/* Number of spin waits on mutexes: for performance monitoring */
+
+/** The number of iterations in the mutex_spin_wait() spin loop.
+Intended for performance monitoring. */
+static ib_int64_t	mutex_spin_round_count		= 0;
+/** The number of mutex_spin_wait() calls.  Intended for
+performance monitoring. */
+static ib_int64_t	mutex_spin_wait_count		= 0;
+/** The number of OS waits in mutex_spin_wait().  Intended for
+performance monitoring. */
+static ib_int64_t	mutex_os_wait_count		= 0;
+/** The number of mutex_exit() calls. Intended for performance
+monitoring. */
+UNIV_INTERN ib_int64_t	mutex_exit_count		= 0;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+UNIV_INTERN sync_array_t*	sync_primary_wait_array;
+
+/** This variable is set to TRUE when sync_init is called */
+UNIV_INTERN ibool	sync_initialized	= FALSE;
+
+/** An acquired mutex or rw-lock and its level in the latching order */
+typedef struct sync_level_struct	sync_level_t;
+/** Mutexes or rw-locks held by a thread */
+typedef struct sync_thread_struct	sync_thread_t;
+
+#ifdef UNIV_SYNC_DEBUG
+/** The latch levels currently owned by threads are stored in this data
+structure; the size of this array is OS_THREAD_MAX_N */
+
+UNIV_INTERN sync_thread_t*	sync_thread_level_arrays;
+
+/** Mutex protecting sync_thread_level_arrays */
+UNIV_INTERN mutex_t		sync_thread_mutex;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Global list of database mutexes (not OS mutexes) created. */
+UNIV_INTERN ut_list_base_node_t  mutex_list;
+
+/** Mutex protecting the mutex_list variable */
+UNIV_INTERN mutex_t mutex_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/** Latching order checks start when this is set TRUE */
+UNIV_INTERN ibool	sync_order_checks_on	= FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+
+/** Mutexes or rw-locks held by a thread: one slot of the thread level arrays */
+struct sync_thread_struct{
+	os_thread_id_t	id;	/*!< OS thread id */
+	sync_level_t*	levels;	/*!< level array for this thread; if
+				this is NULL this slot is unused */
+};
+
+/** Number of sync_level_t slots reserved for each OS thread in its level array */
+#define SYNC_THREAD_N_LEVELS	10000
+
+/** An acquired mutex or rw-lock and its level in the latching order */
+struct sync_level_struct{
+	void*	latch;	/*!< pointer to a mutex or an rw-lock; NULL means that
+			the slot is empty */
+	ulint	level;	/*!< level of the latch in the latching order */
+};
+
+/******************************************************************//**
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed. */
+UNIV_INTERN
+void
+mutex_create_func(
+/*==============*/
+	mutex_t*	mutex,		/*!< in: pointer to memory */
+	const char*	cmutex_name,	/*!< in: mutex name */
+#ifdef UNIV_DEBUG
+# ifdef UNIV_SYNC_DEBUG
+	ulint		level,		/*!< in: latching order level */
+# endif /* UNIV_SYNC_DEBUG */
+#endif /* UNIV_DEBUG */
+	const char*	cfile_name,	/*!< in: file name where created */
+	ulint		cline)		/*!< in: file line where created */
+{
+#if defined(HAVE_ATOMIC_BUILTINS)
+	mutex_reset_lock_word(mutex);
+#else
+	os_fast_mutex_init(&(mutex->os_fast_mutex));
+	mutex->lock_word = 0;
+#endif
+	mutex->event = os_event_create(NULL);
+	mutex->waiters = 0;
+#ifdef UNIV_DEBUG
+	mutex->magic_n = MUTEX_MAGIC_N;
+#endif /* UNIV_DEBUG */
+#ifdef UNIV_SYNC_DEBUG
+	mutex->line = 0;
+	mutex->file_name = "not yet reserved";
+	mutex->level = level;	/* NOTE(review): 'level' is declared only if UNIV_DEBUG is defined too */
+#endif /* UNIV_SYNC_DEBUG */
+#ifdef UNIV_DEBUG
+	mutex->cfile_name = cfile_name;
+	mutex->cline = cline;
+#endif /* UNIV_DEBUG */
+	mutex->count_os_wait = 0;
+	mutex->cmutex_name=	  cmutex_name;
+#ifdef UNIV_DEBUG
+	mutex->count_using=	  0;
+	mutex->mutex_type=	  0;
+	mutex->lspent_time=	  0;
+	mutex->lmax_spent_time=     0;
+	mutex->count_spin_loop= 0;
+	mutex->count_spin_rounds=   0;
+	mutex->count_os_yield=  0;
+#endif /* UNIV_DEBUG */
+
+	/* Check that lock_word is aligned; this is important on Intel */
+	ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0);
+
+	/* NOTE! The very first mutexes (mutex_list_mutex and, under
+	UNIV_SYNC_DEBUG, sync_thread_mutex) are not put to the mutex list */
+	if ((mutex == &mutex_list_mutex)
+#ifdef UNIV_SYNC_DEBUG
+	    || (mutex == &sync_thread_mutex)
+#endif /* UNIV_SYNC_DEBUG */
+	    ) {
+
+		return;
+	}
+
+	mutex_enter(&mutex_list_mutex);
+	/* Sanity check the current list head, then register this mutex */
+	ut_ad(UT_LIST_GET_LEN(mutex_list) == 0
+	      || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N);
+
+	UT_LIST_ADD_FIRST(list, mutex_list, mutex);
+
+	mutex_exit(&mutex_list_mutex);
+}
+
+/******************************************************************//**
+Calling this function is obligatory only if the memory buffer containing
+the mutex is freed. Removes a mutex object from the mutex list. The mutex
+is checked to be in the reset state. */
+UNIV_INTERN
+void
+mutex_free(
+/*=======*/
+	mutex_t*	mutex)	/*!< in: mutex */
+{
+	ut_ad(mutex_validate(mutex));
+	ut_a(mutex_get_lock_word(mutex) == 0);
+	ut_a(mutex_get_waiters(mutex) == 0);
+
+#ifdef UNIV_MEM_DEBUG
+	if (mutex == &mem_hash_mutex) {
+		ut_ad(UT_LIST_GET_LEN(mutex_list) == 1);
+		ut_ad(UT_LIST_GET_FIRST(mutex_list) == &mem_hash_mutex);
+		UT_LIST_REMOVE(list, mutex_list, mutex);
+		goto func_exit;
+	}
+#endif /* UNIV_MEM_DEBUG */
+	/* mutex_list_mutex (and sync_thread_mutex) are not list members */
+	if (mutex != &mutex_list_mutex
+#ifdef UNIV_SYNC_DEBUG
+	    && mutex != &sync_thread_mutex
+#endif /* UNIV_SYNC_DEBUG */
+	    ) {
+
+		mutex_enter(&mutex_list_mutex);
+
+		ut_ad(!UT_LIST_GET_PREV(list, mutex)
+		      || UT_LIST_GET_PREV(list, mutex)->magic_n
+		      == MUTEX_MAGIC_N);
+		ut_ad(!UT_LIST_GET_NEXT(list, mutex)
+		      || UT_LIST_GET_NEXT(list, mutex)->magic_n
+		      == MUTEX_MAGIC_N);
+
+		UT_LIST_REMOVE(list, mutex_list, mutex);
+
+		mutex_exit(&mutex_list_mutex);
+	}
+	/* NOTE(review): the mem_hash_mutex path jumps past this call - confirm */
+	os_event_free(mutex->event);
+#ifdef UNIV_MEM_DEBUG
+func_exit:
+#endif /* UNIV_MEM_DEBUG */
+#if !defined(HAVE_ATOMIC_BUILTINS)
+	os_fast_mutex_free(&(mutex->os_fast_mutex));
+#endif
+	/* If we free the mutex protecting the mutex list (freeing is
+	not necessary), we have to reset the magic number AFTER removing
+	it from the list. */
+#ifdef UNIV_DEBUG
+	mutex->magic_n = 0;
+#endif /* UNIV_DEBUG */
+}
+
+/********************************************************************//**
+NOTE! Use the corresponding macro in the header file, not this function
+directly. Makes a single attempt to lock the mutex for the current thread
+and never waits: if the mutex is already reserved, returns 1 at once.
+@return	0 if succeed, 1 if not */
+UNIV_INTERN
+ulint
+mutex_enter_nowait_func(
+/*====================*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name __attribute__((unused)),
+					/*!< in: file name where mutex
+					requested */
+	ulint		line __attribute__((unused)))
+					/*!< in: line where requested */
+{
+	ut_ad(mutex_validate(mutex));
+
+	if (mutex_test_and_set(mutex)) {
+		/* The mutex is already reserved: give up immediately */
+		return(1);
+	}
+
+	ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+	mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+	return(0);	/* Succeeded! */
+}
+
+#ifdef UNIV_DEBUG
+/******************************************************************//**
+Checks that the mutex is non-NULL and that its magic_n is MUTEX_MAGIC_N.
+@return	TRUE; a failed check trips the ut_a() assertion instead */
+UNIV_INTERN
+ibool
+mutex_validate(
+/*===========*/
+	const mutex_t*	mutex)	/*!< in: mutex */
+{
+	ut_a(mutex);
+	ut_a(mutex->magic_n == MUTEX_MAGIC_N);
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Checks that the current thread owns the mutex: the lock word must be 1 and
+mutex->thread_id must equal the calling thread. Debug version only.
+@return	TRUE if owns */
+UNIV_INTERN
+ibool
+mutex_own(
+/*======*/
+	const mutex_t*	mutex)	/*!< in: mutex */
+{
+	ut_ad(mutex_validate(mutex));
+
+	return(mutex_get_lock_word(mutex) == 1
+	       && os_thread_eq(mutex->thread_id, os_thread_get_curr_id()));
+}
+#endif /* UNIV_DEBUG */
+
+/******************************************************************//**
+Sets the waiters field in a mutex. */
+UNIV_INTERN
+void
+mutex_set_waiters(
+/*==============*/
+	mutex_t*	mutex,	/*!< in: mutex */
+	ulint		n)	/*!< in: value to set */
+{
+#ifdef INNODB_RW_LOCKS_USE_ATOMICS
+	ut_ad(mutex);
+	/* n is treated as a boolean here: nonzero swaps 0 -> 1, zero 1 -> 0 */
+	if (n) {
+		os_compare_and_swap_ulint(&mutex->waiters, 0, 1);
+	} else {
+		os_compare_and_swap_ulint(&mutex->waiters, 1, 0);
+	}
+#else
+	volatile ulint*	ptr;		/* declared volatile to ensure that
+					the value is stored to memory */
+	ut_ad(mutex);
+
+	ptr = &(mutex->waiters);
+
+	*ptr = n;		/* Here we assume that the write of a single
+				word in memory is atomic */
+#endif
+}
+
+/******************************************************************//**
+Reserves a mutex for the current thread. If the mutex is reserved, the
+function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting
+for the mutex before suspending the thread. */
+UNIV_INTERN
+void
+mutex_spin_wait(
+/*============*/
+	mutex_t*	mutex,		/*!< in: pointer to mutex */
+	const char*	file_name,	/*!< in: file name where mutex
+					requested */
+	ulint		line)		/*!< in: line where requested */
+{
+	ulint	   index; /* index of the reserved wait cell */
+	ulint	   i;	  /* spin round count */
+#ifdef UNIV_DEBUG
+	ib_int64_t lstart_time = 0, lfinish_time; /* for timing os_wait */
+	ulint ltime_diff;	/* lfinish_time - lstart_time */
+	ulint sec;	/* seconds part from ut_usectime() */
+	ulint ms;	/* sub-second part from ut_usectime(); added to sec * 1000000 below */
+	uint timer_started = 0;	/* set to 1 once lstart_time has been taken */
+#endif /* UNIV_DEBUG */
+	ut_ad(mutex);
+
+	/* This update is not thread safe, but we don't mind if the count
+	isn't exact. Moved out of ifdef that follows because we are willing
+	to sacrifice the cost of counting this as the data is valuable.
+	Count the number of calls to mutex_spin_wait. */
+	mutex_spin_wait_count++;
+
+mutex_loop:
+
+	i = 0;
+
+	/* Spin waiting for the lock word to become zero. Note that we do
+	not have to assume that the read access to the lock word is atomic,
+	as the actual locking is always committed with atomic test-and-set.
+	In reality, however, all processors probably have an atomic read of
+	a memory word. */
+
+spin_loop:
+	ut_d(mutex->count_spin_loop++);
+
+	while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) {
+		if (srv_spin_wait_delay) {
+			ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+		}
+
+		i++;
+	}
+
+	if (i == SYNC_SPIN_ROUNDS) {
+#ifdef UNIV_DEBUG
+		mutex->count_os_yield++;
+#ifndef UNIV_HOTBACKUP
+		if (timed_mutexes && timer_started == 0) {
+			ut_usectime(&sec, &ms);
+			lstart_time= (ib_int64_t)sec * 1000000 + ms;
+			timer_started = 1;
+		}
+#endif /* UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+		os_thread_yield();
+	}
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+	fprintf(stderr,
+		"Thread %lu spin wait mutex at %p"
+		" '%s' rnds %lu\n",
+		(ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
+		mutex->cmutex_name, (ulong) i);
+#endif
+
+	mutex_spin_round_count += i;
+
+	ut_d(mutex->count_spin_rounds += i);
+
+	if (mutex_test_and_set(mutex) == 0) {
+		/* Succeeded! */
+
+		ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+		mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+		goto finish_timing;
+	}
+
+	/* We may end up with a situation where lock_word is 0 but the OS
+	fast mutex is still reserved. On FreeBSD the OS does not seem to
+	schedule a thread which is constantly calling pthread_mutex_trylock
+	(in mutex_test_and_set implementation). Then we could end up
+	spinning here indefinitely. The following 'i++' stops this infinite
+	spin. */
+
+	i++;
+
+	if (i < SYNC_SPIN_ROUNDS) {
+		goto spin_loop;
+	}
+
+	sync_array_reserve_cell(sync_primary_wait_array, mutex,
+				SYNC_MUTEX, file_name, line, &index);
+
+	/* The memory order of the array reservation and the change in the
+	waiters field is important: when we suspend a thread, we first
+	reserve the cell and then set waiters field to 1. When threads are
+	released in mutex_exit, the waiters field is first set to zero and
+	then the event is set to the signaled state. */
+
+	mutex_set_waiters(mutex, 1);
+
+	/* Try to reserve still a few times */
+	for (i = 0; i < 4; i++) {
+		if (mutex_test_and_set(mutex) == 0) {
+			/* Succeeded! Free the reserved wait cell */
+
+			sync_array_free_cell(sync_primary_wait_array, index);
+
+			ut_d(mutex->thread_id = os_thread_get_curr_id());
+#ifdef UNIV_SYNC_DEBUG
+			mutex_set_debug_info(mutex, file_name, line);
+#endif
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+			fprintf(stderr, "Thread %lu spin wait succeeds at 2:"
+				" mutex at %p\n",
+				(ulong) os_thread_pf(os_thread_get_curr_id()),
+				(void*) mutex);
+#endif
+
+			goto finish_timing;
+
+			/* Note that in this case we leave the waiters field
+			set to 1. We cannot reset it to zero, as we do not
+			know if there are other waiters. */
+		}
+	}
+
+	/* Now we know that there has been some thread holding the mutex
+	after the change in the wait array and the waiters field was made.
+	Now there is no risk of infinite wait on the event. */
+
+#ifdef UNIV_SRV_PRINT_LATCH_WAITS
+	fprintf(stderr,
+		"Thread %lu OS wait mutex at %p '%s' rnds %lu\n",
+		(ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex,
+		mutex->cmutex_name, (ulong) i);
+#endif
+
+	mutex_os_wait_count++;
+
+	mutex->count_os_wait++;
+#ifdef UNIV_DEBUG
+	/* NOTE: the OS wait can be reached without the os_thread_yield() branch above, so the timer may still be unstarted */
+#ifndef UNIV_HOTBACKUP
+	if (timed_mutexes == 1 && timer_started == 0) {
+		ut_usectime(&sec, &ms);
+		lstart_time= (ib_int64_t)sec * 1000000 + ms;
+		timer_started = 1;
+	}
+#endif /* UNIV_HOTBACKUP */
+#endif /* UNIV_DEBUG */
+
+	sync_array_wait_event(sync_primary_wait_array, index);
+	goto mutex_loop;
+
+finish_timing:
+#ifdef UNIV_DEBUG
+	if (timed_mutexes == 1 && timer_started==1) {
+		ut_usectime(&sec, &ms);
+		lfinish_time= (ib_int64_t)sec * 1000000 + ms;
+
+		ltime_diff= (ulint) (lfinish_time - lstart_time);
+		mutex->lspent_time += ltime_diff;
+
+		if (mutex->lmax_spent_time < ltime_diff) {
+			mutex->lmax_spent_time= ltime_diff;
+		}
+	}
+#endif /* UNIV_DEBUG */
+	return;
+}
+
+/******************************************************************//**
+Releases the threads waiting in the primary wait array for this mutex:
+resets the waiters field and then sets the event of the mutex. */
+UNIV_INTERN
+void
+mutex_signal_object(
+/*================*/
+	mutex_t*	mutex)	/*!< in: mutex */
+{
+	mutex_set_waiters(mutex, 0);
+	/* The memory order of resetting the waiters field and
+	signaling the object is important. See LEMMA 1 above. */
+	os_event_set(mutex->event);
+	sync_array_object_signalled(sync_primary_wait_array);
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Sets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_set_debug_info(
+/*=================*/
+	mutex_t*	mutex,		/*!< in: mutex */
+	const char*	file_name,	/*!< in: file where requested */
+	ulint		line)		/*!< in: line where requested */
+{
+	ut_ad(mutex);
+	ut_ad(file_name);
+	/* Pass the mutex and its level on first, then record the request site */
+	sync_thread_add_level(mutex, mutex->level);
+
+	mutex->file_name = file_name;
+	mutex->line	 = line;
+}
+
+/******************************************************************//**
+Gets the debug information for a reserved mutex. */
+UNIV_INTERN
+void
+mutex_get_debug_info(
+/*=================*/
+	mutex_t*	mutex,		/*!< in: mutex */
+	const char**	file_name,	/*!< out: file where requested */
+	ulint*		line,		/*!< out: line where requested */
+	os_thread_id_t* thread_id)	/*!< out: id of the thread which owns
+					the mutex */
+{
+	ut_ad(mutex);
+	/* The fields are copied as they are; no lock is taken here */
+	*file_name = mutex->file_name;
+	*line	   = mutex->line;
+	*thread_id = mutex->thread_id;
+}
+
+/******************************************************************//**
+Prints debug info of currently reserved mutexes and the total mutex count. */
+static
+void
+mutex_list_print_info(
+/*==================*/
+	FILE*	file)		/*!< in: file where to print */
+{
+	mutex_t*	mutex;
+	const char*	file_name;
+	ulint		line;
+	os_thread_id_t	thread_id;
+	ulint		count		= 0;
+
+	fputs("----------\n"
+	      "MUTEX INFO\n"
+	      "----------\n", file);
+
+	mutex_enter(&mutex_list_mutex);
+
+	mutex = UT_LIST_GET_FIRST(mutex_list);
+
+	while (mutex != NULL) {
+		count++;
+
+		if (mutex_get_lock_word(mutex) != 0) {
+			mutex_get_debug_info(mutex, &file_name, &line,
+					     &thread_id);
+			fprintf(file,
+				"Locked mutex: addr %p thread %lu"
+				" file %s line %lu\n",
+				(void*) mutex, (ulong) os_thread_pf(thread_id),
+				file_name, (ulong) line);
+		}
+
+		mutex = UT_LIST_GET_NEXT(list, mutex);
+	}
+	/* ulint values are cast to ulong to match the %lu conversions */
+	fprintf(file, "Total number of mutexes %lu\n", (ulong) count);
+
+	mutex_exit(&mutex_list_mutex);
+}
+
+/******************************************************************//**
+Counts currently reserved mutexes. Works only in the debug version.
+@return	number of reserved mutexes */
+UNIV_INTERN
+ulint
+mutex_n_reserved(void)
+/*==================*/
+{
+	mutex_t*	cur;
+	ulint		count		= 0;
+
+	mutex_enter(&mutex_list_mutex);
+
+	for (cur = UT_LIST_GET_FIRST(mutex_list);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(list, cur)) {
+		if (mutex_get_lock_word(cur) != 0) {
+			count++;
+		}
+	}
+
+	mutex_exit(&mutex_list_mutex);
+
+	ut_a(count >= 1);
+
+	/* One is subtracted on the grounds that this function itself was
+	holding one mutex (mutex_list_mutex) during the walk.
+	NOTE(review): mutex_create_func() does not add mutex_list_mutex
+	to mutex_list - confirm that it is really counted above. */
+	return(count - 1);
+}
+
+/******************************************************************//**
+Tells whether no mutex or rw-lock is currently locked. Debug version only.
+@return	TRUE if no mutexes and rw-locks reserved */
+UNIV_INTERN
+ibool
+sync_all_freed(void)
+/*================*/
+{
+	return((mutex_n_reserved() + rw_lock_n_locked()) > 0
+	       ? FALSE : TRUE);
+}
+
+/******************************************************************//**
+Returns the address of slot number n of sync_thread_level_arrays.
+@return	pointer to thread slot */
+static
+sync_thread_t*
+sync_thread_level_arrays_get_nth(
+/*=============================*/
+	ulint	n)	/*!< in: slot number */
+{
+	ut_ad(n < OS_THREAD_MAX_N);
+
+	return(&sync_thread_level_arrays[n]);
+}
+
+/******************************************************************//**
+Searches the thread level arrays for the slot of the calling thread.
+@return	pointer to thread slot, NULL if not found */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_slot(void)
+/*====================================*/
+{
+	os_thread_id_t	self	= os_thread_get_curr_id();
+	sync_thread_t*	cand;
+	ulint		n;
+
+	for (n = 0; n < OS_THREAD_MAX_N; n++) {
+		cand = sync_thread_level_arrays_get_nth(n);
+
+		/* A slot with a NULL levels array is unused */
+		if (cand->levels == NULL) {
+			continue;
+		}
+
+		if (os_thread_eq(cand->id, self)) {
+			return(cand);
+		}
+	}
+
+	return(NULL);
+}
+
+/******************************************************************//**
+Looks for an unused thread slot, i.e. one whose levels pointer is NULL.
+@return	pointer to thread slot, NULL if all OS_THREAD_MAX_N are in use */
+static
+sync_thread_t*
+sync_thread_level_arrays_find_free(void)
+/*====================================*/
+{
+	ulint	n	= 0;
+
+	while (n < OS_THREAD_MAX_N) {
+		sync_thread_t*	cand;
+		cand = sync_thread_level_arrays_get_nth(n);
+
+		if (!cand->levels) {
+			return(cand);
+		}
+
+		n++;
+	}
+
+	/* No slot with a NULL levels pointer was found */
+	return(NULL);
+}
+
+/******************************************************************//**
+Gets the nth slot of the given thread level array (ut_ad: n is in range).
+@return	pointer to level slot */
+static
+sync_level_t*
+sync_thread_levels_get_nth(
+/*=======================*/
+	sync_level_t*	arr,	/*!< in: pointer to level array for an OS
+				thread */
+	ulint		n)	/*!< in: slot number, < SYNC_THREAD_N_LEVELS */
+{
+	ut_ad(n < SYNC_THREAD_N_LEVELS);
+
+	return(arr + n);
+}
+
+/******************************************************************//**
+Checks if all the level values stored in the level array are greater than
+the given limit; with warn, the first offending latch is printed to stderr.
+@return	TRUE if all greater */
+static
+ibool
+sync_thread_levels_g(
+/*=================*/
+	sync_level_t*	arr,	/*!< in: pointer to level array for an OS
+				thread */
+	ulint		limit,	/*!< in: level limit */
+	ulint		warn)	/*!< in: TRUE=display a diagnostic message */
+{
+	sync_level_t*	slot;
+	rw_lock_t*	lock;
+	mutex_t*	mutex;
+	ulint		i;
+
+	for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+		slot = sync_thread_levels_get_nth(arr, i);
+
+		if (slot->latch != NULL) {
+			if (slot->level <= limit) {
+
+				if (!warn) {
+
+					return(FALSE);
+				}
+
+				lock = slot->latch;
+				mutex = slot->latch;
+
+				fprintf(stderr,
+					"InnoDB: sync levels should be"
+					" > %lu but a level is %lu\n",
+					(ulong) limit, (ulong) slot->level);
+				/* magic_n tells a mutex from an rw-lock */
+				if (mutex->magic_n == MUTEX_MAGIC_N) {
+					fprintf(stderr,
+						"Mutex '%s'\n",
+						mutex->cmutex_name);
+
+					if (mutex_get_lock_word(mutex) != 0) {
+						const char*	file_name;
+						ulint		line;
+						os_thread_id_t	thread_id;
+
+						mutex_get_debug_info(
+							mutex, &file_name,
+							&line, &thread_id);
+
+						fprintf(stderr,
+							"InnoDB: Locked mutex:"
+							" addr %p thread %lu"
+							" file %s line %lu\n",
+							(void*) mutex,
+							(ulong) os_thread_pf(
+								thread_id),
+							file_name,
+							(ulong) line);
+					} else {
+						fputs("Not locked\n", stderr);
+					}
+				} else {
+					rw_lock_print(lock);
+				}
+
+				return(FALSE);
+			}
+		}
+	}
+
+	return(TRUE);
+}
+
+/******************************************************************//**
+Tells whether some occupied slot of the level array carries the given level.
+@return	TRUE if stored */
+static
+ibool
+sync_thread_levels_contain(
+/*=======================*/
+	sync_level_t*	arr,	/*!< in: pointer to level array for an OS
+				thread */
+	ulint		level)	/*!< in: level */
+{
+	ulint	n;
+
+	for (n = 0; n < SYNC_THREAD_N_LEVELS; n++) {
+		sync_level_t*	entry;
+
+		entry = sync_thread_levels_get_nth(arr, n);
+
+		/* Slots whose latch pointer is NULL are unused; their
+		level field is not looked at */
+		if (entry->latch != NULL
+		    && entry->level == level) {
+			return(TRUE);
+		}
+	}
+
+	return(FALSE);
+}
+
+/******************************************************************//**
+Looks in the level array of the calling thread for a mutex or rw-latch
+that was registered at the specified level.
+@return	a matching latch, or NULL if not found */
+UNIV_INTERN
+void*
+sync_thread_levels_contains(
+/*========================*/
+	ulint	level)			/*!< in: latching order level
+					(SYNC_DICT, ...)*/
+{
+	sync_thread_t*	owner;
+	void*		found	= NULL;
+	ulint		n;
+
+	if (!sync_order_checks_on) {
+
+		return(NULL);
+	}
+
+	mutex_enter(&sync_thread_mutex);
+
+	owner = sync_thread_level_arrays_find_slot();
+
+	if (owner != NULL) {
+		/* The thread has a level array: take the first occupied
+		entry that is registered at the requested level */
+		sync_level_t*	levels	= owner->levels;
+
+		for (n = 0; n < SYNC_THREAD_N_LEVELS; n++) {
+			sync_level_t*	entry;
+
+			entry = sync_thread_levels_get_nth(levels, n);
+
+			if (entry->latch != NULL
+			    && entry->level == level) {
+
+				found = entry->latch;
+				break;
+			}
+		}
+	}
+
+	mutex_exit(&sync_thread_mutex);
+
+	/* found is still NULL if the thread has no slot or if no entry
+	at the requested level was seen */
+	return(found);
+}
+
+/******************************************************************//**
+Checks that the level array for the current thread is empty; ut_error if not.
+@return	a latch, or NULL if empty except the exceptions specified below */
+UNIV_INTERN
+void*
+sync_thread_levels_nonempty_gen(
+/*============================*/
+	ibool	dict_mutex_allowed)	/*!< in: TRUE if latches at levels
+					SYNC_DICT and SYNC_DICT_OPERATION
+					may be owned by the thread and
+					are skipped by the check */
+{
+	sync_level_t*	arr;
+	sync_thread_t*	thread_slot;
+	sync_level_t*	slot;
+	ulint		i;
+
+	if (!sync_order_checks_on) {
+
+		return(NULL);
+	}
+
+	mutex_enter(&sync_thread_mutex);
+
+	thread_slot = sync_thread_level_arrays_find_slot();
+
+	if (thread_slot == NULL) {
+
+		mutex_exit(&sync_thread_mutex);
+
+		return(NULL);
+	}
+
+	arr = thread_slot->levels;
+
+	for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+		slot = sync_thread_levels_get_nth(arr, i);
+
+		if (slot->latch != NULL
+		    && (!dict_mutex_allowed
+			|| (slot->level != SYNC_DICT
+			    && slot->level != SYNC_DICT_OPERATION))) {
+
+			mutex_exit(&sync_thread_mutex);
+			ut_error; /* NOTE(review): presumably aborts here */
+
+			return(slot->latch); /* only if ut_error returns */
+		}
+	}
+
+	mutex_exit(&sync_thread_mutex);
+
+	return(NULL);
+}
+
+/******************************************************************//**
+Checks that the level array for the current thread is empty (no exemptions).
+@return	TRUE if empty; uses sync_thread_levels_empty_gen (confirm its decl) */
+UNIV_INTERN
+ibool
+sync_thread_levels_empty(void)
+/*==========================*/
+{
+	return(sync_thread_levels_empty_gen(FALSE));
+}
+
+/******************************************************************//**
+Adds a latch and its level in the thread level array, allocating the array on
+the first call from an OS thread. The level is first checked against the
+levels already stored for this thread; a violation fails an assertion. */
+UNIV_INTERN
+void
+sync_thread_add_level(
+/*==================*/
+	void*	latch,	/*!< in: pointer to a mutex or an rw-lock */
+	ulint	level)	/*!< in: level in the latching order; if
+			SYNC_LEVEL_VARYING, nothing is done */
+{
+	sync_level_t*	array;
+	sync_level_t*	slot;
+	sync_thread_t*	thread_slot;
+	ulint		i;
+
+	if (!sync_order_checks_on) {
+
+		return;
+	}
+
+	if ((latch == (void*)&sync_thread_mutex)
+	    || (latch == (void*)&mutex_list_mutex)
+	    || (latch == (void*)&rw_lock_debug_mutex)
+	    || (latch == (void*)&rw_lock_list_mutex)) {
+		/* The sync system's own latches are not recorded */
+		return;
+	}
+
+	if (level == SYNC_LEVEL_VARYING) {
+
+		return;
+	}
+
+	mutex_enter(&sync_thread_mutex);
+
+	thread_slot = sync_thread_level_arrays_find_slot();
+
+	if (thread_slot == NULL) {
+		/* We have to allocate the level array for a new thread */
+		array = ut_malloc(sizeof(sync_level_t) * SYNC_THREAD_N_LEVELS);
+
+		thread_slot = sync_thread_level_arrays_find_free();
+		ut_a(thread_slot != NULL); /* NULL: all slots are in use */
+		thread_slot->id = os_thread_get_curr_id();
+		thread_slot->levels = array;
+
+		for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+			slot = sync_thread_levels_get_nth(array, i);
+
+			slot->latch = NULL;
+		}
+	}
+
+	array = thread_slot->levels;
+
+	/* NOTE that there is a problem with _NODE and _LEAF levels: if the
+	B-tree height changes, then a leaf can change to an internal node
+	or the other way around. We do not know at present if this can cause
+	unnecessary assertion failures below. */
+
+	switch (level) {
+	case SYNC_NO_ORDER_CHECK:
+	case SYNC_EXTERN_STORAGE:
+	case SYNC_TREE_NODE_FROM_HASH:
+		/* Do no order checking */
+		break;
+	case SYNC_MEM_POOL:
+	case SYNC_MEM_HASH:
+	case SYNC_RECV:
+	case SYNC_WORK_QUEUE:
+	case SYNC_LOG:
+	case SYNC_THR_LOCAL:
+	case SYNC_ANY_LATCH:
+	case SYNC_TRX_SYS_HEADER:
+	case SYNC_FILE_FORMAT_TAG:
+	case SYNC_DOUBLEWRITE:
+	case SYNC_BUF_LRU_LIST:
+	case SYNC_BUF_FLUSH_LIST:
+	case SYNC_BUF_PAGE_HASH:
+	case SYNC_BUF_FREE_LIST:
+	case SYNC_BUF_ZIP_FREE:
+	case SYNC_BUF_ZIP_HASH:
+	case SYNC_BUF_POOL:
+	case SYNC_SEARCH_SYS:
+	case SYNC_SEARCH_SYS_CONF:
+	case SYNC_TRX_LOCK_HEAP:
+	case SYNC_KERNEL:
+	case SYNC_IBUF_BITMAP_MUTEX:
+	case SYNC_RSEG:
+	case SYNC_TRX_UNDO:
+	case SYNC_PURGE_LATCH:
+	case SYNC_PURGE_SYS:
+	case SYNC_DICT_AUTOINC_MUTEX:
+	case SYNC_DICT_OPERATION:
+	case SYNC_DICT_HEADER:
+	case SYNC_TRX_I_S_RWLOCK:
+	case SYNC_TRX_I_S_LAST_READ:
+		if (!sync_thread_levels_g(array, level, TRUE)) {
+			fprintf(stderr,
+				"InnoDB: sync_thread_levels_g(array, %lu)"
+				" does not hold!\n", (ulong) level);
+			ut_error;
+		}
+		break;
+	case SYNC_BUF_BLOCK:
+		/* Either the thread must own the buffer pool mutex
+		(buf_pool_mutex), or it is allowed to latch only ONE
+		buffer block (block->mutex or buf_pool_zip_mutex). */
+		if (!sync_thread_levels_g(array, level, FALSE)) {
+			ut_a(sync_thread_levels_g(array, level - 1, TRUE));
+			ut_a(sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST));
+		}
+		break;
+	case SYNC_REC_LOCK:
+		if (sync_thread_levels_contain(array, SYNC_KERNEL)) {
+			ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK - 1,
+						  TRUE));
+		} else {
+			ut_a(sync_thread_levels_g(array, SYNC_REC_LOCK, TRUE));
+		}
+		break;
+	case SYNC_IBUF_BITMAP:
+		/* Either the thread must own the master mutex to all
+		the bitmap pages, or it is allowed to latch only ONE
+		bitmap page. */
+		if (sync_thread_levels_contain(array,
+					       SYNC_IBUF_BITMAP_MUTEX)) {
+			ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1,
+						  TRUE));
+		} else {
+			ut_a(sync_thread_levels_g(array, SYNC_IBUF_BITMAP,
+						  TRUE));
+		}
+		break;
+	case SYNC_FSP_PAGE:
+		ut_a(sync_thread_levels_contain(array, SYNC_FSP));
+		break;
+	case SYNC_FSP:
+		ut_a(sync_thread_levels_contain(array, SYNC_FSP)
+		     || sync_thread_levels_g(array, SYNC_FSP, TRUE));
+		break;
+	case SYNC_TRX_UNDO_PAGE:
+		ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO)
+		     || sync_thread_levels_contain(array, SYNC_RSEG)
+		     || sync_thread_levels_contain(array, SYNC_PURGE_SYS)
+		     || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE, TRUE));
+		break;
+	case SYNC_RSEG_HEADER:
+		ut_a(sync_thread_levels_contain(array, SYNC_RSEG));
+		break;
+	case SYNC_RSEG_HEADER_NEW:
+		ut_a(sync_thread_levels_contain(array, SYNC_KERNEL)
+		     && sync_thread_levels_contain(array, SYNC_FSP_PAGE));
+		break;
+	case SYNC_TREE_NODE:
+		ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE)
+		     || sync_thread_levels_contain(array, SYNC_DICT_OPERATION)
+		     || sync_thread_levels_g(array, SYNC_TREE_NODE - 1, TRUE));
+		break;
+	case SYNC_TREE_NODE_NEW:
+		ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE)
+		     || sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+		break;
+	case SYNC_INDEX_TREE:
+		if (sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)
+		    && sync_thread_levels_contain(array, SYNC_FSP)) {
+			ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1,
+						  TRUE));
+		} else {
+			ut_a(sync_thread_levels_g(array, SYNC_TREE_NODE - 1,
+						  TRUE));
+		}
+		break;
+	case SYNC_IBUF_MUTEX:
+		ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1, TRUE));
+		break;
+	case SYNC_IBUF_PESS_INSERT_MUTEX:
+		ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+		ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+		break;
+	case SYNC_IBUF_HEADER:
+		ut_a(sync_thread_levels_g(array, SYNC_FSP - 1, TRUE));
+		ut_a(!sync_thread_levels_contain(array, SYNC_IBUF_MUTEX));
+		ut_a(!sync_thread_levels_contain(array,
+						 SYNC_IBUF_PESS_INSERT_MUTEX));
+		break;
+	case SYNC_DICT:
+#ifdef UNIV_DEBUG
+		ut_a(buf_debug_prints
+		     || sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#else /* UNIV_DEBUG */
+		ut_a(sync_thread_levels_g(array, SYNC_DICT, TRUE));
+#endif /* UNIV_DEBUG */
+		break;
+	default:
+		ut_error;
+	}
+
+	for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+		slot = sync_thread_levels_get_nth(array, i);
+
+		if (slot->latch == NULL) {
+			slot->latch = latch;
+			slot->level = level;
+
+			break;
+		}
+	}
+
+	ut_a(i < SYNC_THREAD_N_LEVELS);
+
+	mutex_exit(&sync_thread_mutex);
+}
+
+/******************************************************************//**
+Removes a latch from the thread level array if it is found there.
+@return TRUE if found in the array, or if the latch is an rw-lock of level
+SYNC_LEVEL_VARYING; FALSE if order checks are off or for the sync system's
+own mutexes; any other miss (or a thread without a slot) hits ut_error */
+UNIV_INTERN
+ibool
+sync_thread_reset_level(
+/*====================*/
+	void*	latch)	/*!< in: pointer to a mutex or an rw-lock */
+{
+	sync_level_t*	array;
+	sync_level_t*	slot;
+	sync_thread_t*	thread_slot;
+	ulint		i;
+
+	if (!sync_order_checks_on) {
+
+		return(FALSE);
+	}
+
+	if ((latch == (void*)&sync_thread_mutex)
+	    || (latch == (void*)&mutex_list_mutex)
+	    || (latch == (void*)&rw_lock_debug_mutex)
+	    || (latch == (void*)&rw_lock_list_mutex)) {
+
+		return(FALSE);
+	}
+
+	mutex_enter(&sync_thread_mutex);
+
+	thread_slot = sync_thread_level_arrays_find_slot();
+
+	if (thread_slot == NULL) {
+
+		ut_error;
+
+		mutex_exit(&sync_thread_mutex);
+		return(FALSE);
+	}
+
+	array = thread_slot->levels;
+
+	for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) {
+
+		slot = sync_thread_levels_get_nth(array, i);
+
+		if (slot->latch == latch) {
+			slot->latch = NULL;
+
+			mutex_exit(&sync_thread_mutex);
+
+			return(TRUE);
+		}
+	}
+
+	if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) {
+		rw_lock_t*	rw_lock;
+
+		rw_lock = (rw_lock_t*) latch;
+
+		if (rw_lock->level == SYNC_LEVEL_VARYING) {
+			mutex_exit(&sync_thread_mutex);
+
+			return(TRUE);
+		}
+	}
+
+	ut_error;
+
+	mutex_exit(&sync_thread_mutex);
+
+	return(FALSE);
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Initializes the sync data structures; asserts they are not initialized yet. */
+UNIV_INTERN
+void
+sync_init(void)
+/*===========*/
+{
+#ifdef UNIV_SYNC_DEBUG
+	sync_thread_t*	thread_slot;
+	ulint		i;
+#endif /* UNIV_SYNC_DEBUG */
+
+	ut_a(sync_initialized == FALSE);
+
+	sync_initialized = TRUE;
+
+	/* Create the primary system wait array which is protected by an OS
+	mutex */
+
+	sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N,
+						    SYNC_ARRAY_OS_MUTEX);
+#ifdef UNIV_SYNC_DEBUG
+	/* Create the thread latch level array where the latch levels
+	are stored for each OS thread; every slot starts with levels == NULL */
+
+	sync_thread_level_arrays = ut_malloc(OS_THREAD_MAX_N
+					     * sizeof(sync_thread_t));
+	for (i = 0; i < OS_THREAD_MAX_N; i++) {
+
+		thread_slot = sync_thread_level_arrays_get_nth(i);
+		thread_slot->levels = NULL;
+	}
+#endif /* UNIV_SYNC_DEBUG */
+	/* Init the mutex list and create the mutex to protect it. */
+
+	UT_LIST_INIT(mutex_list);
+	mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK);
+#ifdef UNIV_SYNC_DEBUG
+	mutex_create(&sync_thread_mutex, SYNC_NO_ORDER_CHECK);
+#endif /* UNIV_SYNC_DEBUG */
+
+	/* Init the rw-lock list and create the mutex to protect it. */
+
+	UT_LIST_INIT(rw_lock_list);
+	mutex_create(&rw_lock_list_mutex, SYNC_NO_ORDER_CHECK);
+
+#ifdef UNIV_SYNC_DEBUG
+	mutex_create(&rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK);
+
+	rw_lock_debug_event = os_event_create(NULL);
+	rw_lock_debug_waiters = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+}
+
+/******************************************************************//**
+Frees the resources in InnoDB's own synchronization data structures. Use
+os_sync_free() after calling this. */
+UNIV_INTERN
+void
+sync_close(void)
+/*===========*/
+{
+	mutex_t*	mutex;
+
+	sync_array_free(sync_primary_wait_array);
+
+	mutex = UT_LIST_GET_FIRST(mutex_list);
+
+	while (mutex) {
+#ifdef UNIV_MEM_DEBUG
+		if (mutex == &mem_hash_mutex) { /* this one is not freed here */
+			mutex = UT_LIST_GET_NEXT(list, mutex);
+			continue;
+		}
+#endif /* UNIV_MEM_DEBUG */
+		mutex_free(mutex);
+		mutex = UT_LIST_GET_FIRST(mutex_list); /* assumes mutex_free() unlinked it - confirm */
+	}
+
+	mutex_free(&mutex_list_mutex);
+#ifdef UNIV_SYNC_DEBUG
+	mutex_free(&sync_thread_mutex);
+
+	/* Switch latching order checks off in sync0sync.c */
+	sync_order_checks_on = FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+
+	sync_initialized = FALSE;	/* lets the ut_a() in sync_init() pass again */
+}
+
+/*******************************************************************//**
+Prints the sync system's wait counters; a wait count of 0 is divided as 1. */
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+	FILE*	file)		/*!< in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+	fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
+		mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
+#endif /* UNIV_SYNC_DEBUG */
+
+	fprintf(file,
+		"Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
+		"RW-shared spins %llu, OS waits %llu;"
+		" RW-excl spins %llu, OS waits %llu\n",
+		mutex_spin_wait_count,
+		mutex_spin_round_count,
+		mutex_os_wait_count,
+		rw_s_spin_wait_count,
+		rw_s_os_wait_count,
+		rw_x_spin_wait_count,
+		rw_x_os_wait_count);
+
+	fprintf(file,
+		"Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
+		"%.2f RW-excl\n",
+		(double) mutex_spin_round_count /
+		(mutex_spin_wait_count ? mutex_spin_wait_count : 1),
+		(double) rw_s_spin_round_count /
+		(rw_s_spin_wait_count ? rw_s_spin_wait_count : 1),
+		(double) rw_x_spin_round_count /
+		(rw_x_spin_wait_count ? rw_x_spin_wait_count : 1));
+}
+
+/*******************************************************************//**
+Prints info of the sync system: the primary wait array and the wait counters,
+preceded by the mutex and rw-lock lists when UNIV_SYNC_DEBUG is defined. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+	FILE*	file)		/*!< in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+	mutex_list_print_info(file);
+	rw_lock_list_print_info(file);
+#endif /* UNIV_SYNC_DEBUG */
+
+	sync_array_print_info(file, sync_primary_wait_array);
+
+	sync_print_wait_info(file);
+}
diff --git a/storage/xtradb/thr/thr0loc.c b/storage/xtradb/thr/thr0loc.c
new file mode 100644
index 00000000000..5b9e83be920
--- /dev/null
+++ b/storage/xtradb/thr/thr0loc.c
@@ -0,0 +1,308 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file thr/thr0loc.c
+The thread local storage
+
+Created 10/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "thr0loc.h"
+#ifdef UNIV_NONINL
+#include "thr0loc.ic"
+#endif
+
+#include "sync0sync.h"
+#include "hash0hash.h"
+#include "mem0mem.h"
+#include "srv0srv.h"
+
+/*
+	IMPLEMENTATION OF THREAD LOCAL STORAGE
+	======================================
+
+The threads sometimes need private data which depends on the thread id.
+This is implemented as a hash table, where the hash value is calculated
+from the thread id, to prepare for a large number of threads. The hash table
+is protected by a mutex. If you need to modify the program and put new data
+into the thread local storage, just add it to struct thr_local_struct, which
+is defined below in this file. */
+
+/** Mutex protecting thr_local_hash; thr_local_hash_n_nodes is updated under it */
+static mutex_t		thr_local_mutex;
+
+/** The hash table. The module is not yet initialized when it is NULL. */
+static hash_table_t*	thr_local_hash	= NULL;
+ulint		thr_local_hash_n_nodes = 0; /*!< ++ on hash insert, -- on delete */
+
+/** Thread local data */
+typedef struct thr_local_struct thr_local_t;
+
+/** @brief Thread local data.
+The private data for each thread should be put to
+the structure below and the accessor functions written
+for the field. */
+struct thr_local_struct{
+	os_thread_id_t	id;	/*!< id of the thread which owns this struct */
+	os_thread_t	handle;	/*!< operating system handle to the thread */
+	ulint		slot_no;/*!< the index of the slot in the thread table
+				for this thread */
+	ibool		in_ibuf;/*!< TRUE if the thread is doing an ibuf
+				operation */
+	hash_node_t	hash;	/*!< hash chain node */
+	ulint		magic_n;/*!< magic number (THR_LOCAL_MAGIC_N) */
+};
+
+/** The value of thr_local_struct::magic_n */
+#define THR_LOCAL_MAGIC_N	1231234
+
+/*******************************************************************//**
+Returns the local storage struct for a thread, creating one if none is found.
+@return	local storage */
+static
+thr_local_t*
+thr_local_get(
+/*==========*/
+	os_thread_id_t	id)	/*!< in: thread id of the thread */
+{
+	thr_local_t*	local;
+
+try_again:
+	ut_ad(thr_local_hash);
+	ut_ad(mutex_own(&thr_local_mutex));
+
+	/* Look for the local struct in the hash table; the mutex is held */
+
+	local = NULL;
+
+	HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id),
+		    thr_local_t*, local,, os_thread_eq(local->id, id));
+	if (local == NULL) {
+		mutex_exit(&thr_local_mutex); /* thr_local_create() locks it */
+
+		thr_local_create(); /* registers the calling thread's id */
+
+		mutex_enter(&thr_local_mutex);
+
+		goto try_again; /* NOTE(review): endless if id is not the caller's? */
+	}
+
+	ut_ad(local->magic_n == THR_LOCAL_MAGIC_N);
+
+	return(local);
+}
+
+/*******************************************************************//**
+Reads, under thr_local_mutex, the thread table slot number stored in the
+local storage of the given thread.
+@return	slot number */
+UNIV_INTERN
+ulint
+thr_local_get_slot_no(
+/*==================*/
+	os_thread_id_t	id)	/*!< in: thread id of the thread */
+{
+	ulint	result;
+
+	mutex_enter(&thr_local_mutex);
+
+	/* thr_local_get() requires thr_local_mutex; copy the field out
+	before the mutex is released */
+	result = thr_local_get(id)->slot_no;
+
+	mutex_exit(&thr_local_mutex);
+
+	return(result);
+}
+
+/*******************************************************************//**
+Stores the thread table slot number in the local storage of a thread. */
+UNIV_INTERN
+void
+thr_local_set_slot_no(
+/*==================*/
+	os_thread_id_t	id,	/*!< in: thread id of the thread */
+	ulint		slot_no)/*!< in: slot number */
+{
+	thr_local_t*	entry;
+
+	mutex_enter(&thr_local_mutex);
+
+	/* Looked up and written while thr_local_mutex is held */
+	entry = thr_local_get(id);
+	entry->slot_no = slot_no;
+
+	mutex_exit(&thr_local_mutex);
+}
+
+/*******************************************************************//**
+Returns pointer to the 'in_ibuf' field within the current thread local
+storage. The pointer is returned after thr_local_mutex has been released.
+@return	pointer to the in_ibuf field */
+UNIV_INTERN
+ibool*
+thr_local_get_in_ibuf_field(void)
+/*=============================*/
+{
+	thr_local_t*	local;
+
+	mutex_enter(&thr_local_mutex);
+
+	local = thr_local_get(os_thread_get_curr_id());
+
+	mutex_exit(&thr_local_mutex);
+
+	return(&(local->in_ibuf));
+}
+
+/*******************************************************************//**
+Creates a local storage struct for the calling thread and adds it to the hash. */
+UNIV_INTERN
+void
+thr_local_create(void)
+/*==================*/
+{
+	thr_local_t*	local;
+
+	if (thr_local_hash == NULL) { /* module not initialized yet */
+		thr_local_init();
+	}
+
+	local = mem_alloc(sizeof(thr_local_t));
+
+	local->id = os_thread_get_curr_id();
+	local->handle = os_thread_get_curr();
+	local->magic_n = THR_LOCAL_MAGIC_N;
+
+	local->in_ibuf = FALSE; /* NOTE(review): slot_no is not assigned here */
+
+	mutex_enter(&thr_local_mutex);
+
+	HASH_INSERT(thr_local_t, hash, thr_local_hash,
+		    os_thread_pf(os_thread_get_curr_id()),
+		    local);
+
+	thr_local_hash_n_nodes++;
+	mutex_exit(&thr_local_mutex);
+}
+
+/*******************************************************************//**
+Frees the local storage struct of the specified thread, if it has one. */
+UNIV_INTERN
+void
+thr_local_free(
+/*===========*/
+	os_thread_id_t	id)	/*!< in: thread id */
+{
+	thr_local_t*	local;
+
+	mutex_enter(&thr_local_mutex);
+
+	/* Look for the local struct in the hash table */
+
+	HASH_SEARCH(hash, thr_local_hash, os_thread_pf(id),
+		    thr_local_t*, local,, os_thread_eq(local->id, id));
+	if (local == NULL) { /* nothing stored for this thread id */
+		mutex_exit(&thr_local_mutex);
+
+		return;
+	}
+
+	HASH_DELETE(thr_local_t, hash, thr_local_hash,
+		    os_thread_pf(id), local);
+	thr_local_hash_n_nodes--;
+
+	mutex_exit(&thr_local_mutex);
+
+	ut_a(local->magic_n == THR_LOCAL_MAGIC_N);
+
+	mem_free(local);
+}
+
+/****************************************************************//**
+Initializes the thread local storage module; asserts it is not set up yet. */
+UNIV_INTERN
+void
+thr_local_init(void)
+/*================*/
+{
+
+	ut_a(thr_local_hash == NULL);
+
+	thr_local_hash = hash_create(OS_THREAD_MAX_N + 100);
+
+	mutex_create(&thr_local_mutex, SYNC_THR_LOCAL);
+}
+
+/********************************************************************
+Closes the thread local storage module: frees every thr_local_t, destroys
+the hash table and clears the module state (hash pointer, node counter). */
+UNIV_INTERN
+void
+thr_local_close(void)
+/*=================*/
+{
+	ulint		i;
+
+	ut_a(thr_local_hash != NULL);
+
+	/* Free the hash elements. We don't remove them from the table
+	because we are going to destroy the table anyway. */
+	for (i = 0; i < hash_get_n_cells(thr_local_hash); i++) {
+		thr_local_t*	local = HASH_GET_FIRST(thr_local_hash, i);
+
+		while (local) {
+			thr_local_t*	prev_local = local;
+
+			local = HASH_GET_NEXT(hash, prev_local);
+			ut_a(prev_local->magic_n == THR_LOCAL_MAGIC_N);
+			mem_free(prev_local);
+		}
+	}
+
+	hash_table_free(thr_local_hash);
+	thr_local_hash = NULL;
+	thr_local_hash_n_nodes = 0;	/* every node was freed above */
+}
+
+/*************************************************************************
+Returns the number of cells of the thread local hash table.
+@return	n_cells of thr_local_hash, 0 if the hash table does not exist */
+ulint
+thr_local_hash_cells(void)
+/*======================*/
+{
+	if (thr_local_hash == NULL) {
+		return(0);
+	}
+
+	return(thr_local_hash->n_cells);
+}
+
+ulint
+thr_local_hash_nodes(void)
+/*======================*/
+{
+	/* Node count times (struct size + MEM_BLOCK_HEADER_SIZE) */
+	if (thr_local_hash == NULL) {
+		return(0);
+	}
+	return(thr_local_hash_n_nodes
+	       * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE));
+}
diff --git a/storage/xtradb/trx/trx0i_s.c b/storage/xtradb/trx/trx0i_s.c
new file mode 100644
index 00000000000..5bc8302d0c0
--- /dev/null
+++ b/storage/xtradb/trx/trx0i_s.c
@@ -0,0 +1,1481 @@
+/*****************************************************************************
+
+Copyright (c) 2007, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0i_s.c
+INFORMATION SCHEMA innodb_trx, innodb_locks and
+innodb_lock_waits tables fetch code.
+
+The code below fetches information needed to fill those
+3 dynamic tables and uploads it into a "transactions
+table cache" for later retrieval.
+
+Created July 17, 2007 Vasil Dimov
+*******************************************************/
+
+/* Found during the build of 5.5.3 on Linux 2.4 and early 2.6 kernels:
+   The includes "univ.i" -> "my_global.h" cause a different path
+   to be taken further down with pthread functions and types,
+   so they must come first.
+   From the symptoms, this is related to bug#46587 in the MySQL bug DB.
+*/
+#include "univ.i"
+
+#include <mysql/plugin.h>
+
+#include "mysql_addons.h"
+
+#include "buf0buf.h"
+#include "dict0dict.h"
+#include "ha0storage.h"
+#include "ha_prototypes.h"
+#include "hash0hash.h"
+#include "lock0iter.h"
+#include "lock0lock.h"
+#include "mem0mem.h"
+#include "page0page.h"
+#include "rem0rec.h"
+#include "row0row.h"
+#include "srv0srv.h"
+#include "sync0rw.h"
+#include "sync0sync.h"
+#include "sync0types.h"
+#include "trx0i_s.h"
+#include "trx0sys.h"
+#include "trx0trx.h"
+#include "ut0mem.h"
+#include "ut0ut.h"
+
+/** Initial number of rows in the table cache */
+#define TABLE_CACHE_INITIAL_ROWSNUM	1024
+
+/** @brief The maximum number of chunks to allocate for a table cache.
+
+The rows of a table cache are stored in a set of chunks. When a new
+row is added a new chunk is allocated if necessary. Assuming that the
+first one is 1024 rows (TABLE_CACHE_INITIAL_ROWSNUM) and each
+subsequent is N/2 where N is the number of rows we have allocated till
+now, then 39th chunk would accommodate 1677416425 rows and all chunks
+would accommodate 3354832851 rows. */
+#define MEM_CHUNKS_IN_TABLE_CACHE	39
+
+/** The following are some testing auxiliary macros. Do not enable them
+in a production environment. */
+/* @{ */
+
+#if 0
+/** If this is enabled then lock folds will always be different
+resulting in equal rows being put in different cells of the hash
+table. Checking for duplicates will be flawed because different
+fold will be calculated when a row is searched in the hash table. */
+#define TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+#endif
+
+#if 0
+/** This effectively kills the search-for-duplicate-before-adding-a-row
+function, but searching in the hash is still performed. It will always
+be assumed that lock is not present and insertion will be performed in
+the hash table. */
+#define TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+#endif
+
+#if 0
+/** This aggressively repeats adding each row many times. Depending on
+the above settings this may be noop or may result in lots of rows being
+added. */
+#define TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+#endif
+
+#if 0
+/** Very similar to TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T but hash
+table search is not performed at all. */
+#define TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+#endif
+
+#if 0
+/** Do not insert each row into the hash table, duplicates may appear
+if this is enabled, also if this is enabled searching into the hash is
+noop because it will be empty. */
+#define TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+#endif
+/* @} */
+
+/** Memory limit passed to ha_storage_put_memlim().
+@param cache	cache whose mem_allocd member is subtracted from the limit
+@return		TRX_I_S_MEM_LIMIT minus (cache)->mem_allocd */
+#define MAX_ALLOWED_FOR_STORAGE(cache)		\
+	(TRX_I_S_MEM_LIMIT			\
+	 - (cache)->mem_allocd)
+
+/** Memory limit in table_cache_create_empty_row().
+@param cache	cache whose mem_allocd and storage size are subtracted
+@return		TRX_I_S_MEM_LIMIT minus both of those amounts */
+#define MAX_ALLOWED_FOR_ALLOC(cache)		\
+	(TRX_I_S_MEM_LIMIT			\
+	 - (cache)->mem_allocd			\
+	 - ha_storage_get_size((cache)->storage))
+
+/** Memory for each table in the intermediate buffer is allocated in
+separate chunks. These chunks are considered to be concatenated to
+represent one flat array of rows. */
+typedef struct i_s_mem_chunk_struct {
+	ulint	offset;		/*!< flat-array index of the first row */
+	ulint	rows_allocd;	/*!< the size of this chunk, in number
+				of rows */
+	void*	base;		/*!< start of the chunk, NULL if unallocated */
+} i_s_mem_chunk_t;
+
+/** This represents one table's cache. */
+typedef struct i_s_table_cache_struct {
+	ulint		rows_used;	/*!< number of used rows */
+	ulint		rows_allocd;	/*!< rows allocated in all chunks */
+	ulint		row_size;	/*!< size of a single row, bytes */
+	i_s_mem_chunk_t	chunks[MEM_CHUNKS_IN_TABLE_CACHE]; /*!< array of
+					memory chunks that stores the
+					rows */
+} i_s_table_cache_t;
+
+/** The intermediate buffer: three table caches and what serves them */
+struct trx_i_s_cache_struct {
+	rw_lock_t	rw_lock;	/*!< read-write lock protecting
+					the rest of this structure */
+	ullint		last_read;	/*!< last time the cache was read;
+					measured in microseconds since
+					epoch */
+	mutex_t		last_read_mutex;/*!< mutex protecting the
+					last_read member - it is updated
+					inside a shared lock of the
+					rw_lock member */
+	i_s_table_cache_t innodb_trx;	/*!< innodb_trx table */
+	i_s_table_cache_t innodb_locks;	/*!< innodb_locks table */
+	i_s_table_cache_t innodb_lock_waits;/*!< innodb_lock_waits table */
+/** the hash table size is LOCKS_HASH_CELLS_NUM * sizeof(void*) bytes */
+#define LOCKS_HASH_CELLS_NUM		10000
+	hash_table_t*	locks_hash;	/*!< hash table used to eliminate
+					duplicate innodb_locks rows, which
+					are linked in by their hash_chain */
+/** Initial size of the cache storage */
+#define CACHE_STORAGE_INITIAL_SIZE	1024
+/** Number of hash cells in the cache storage */
+#define CACHE_STORAGE_HASH_CELLS	2048
+	ha_storage_t*	storage;	/*!< storage for external volatile
+					data that can possibly not be
+					available later, when we release
+					the kernel mutex */
+	ulint		mem_allocd;	/*!< the amount of memory, in
+					bytes, allocated with mem_alloc*() */
+	ibool		is_truncated;	/*!< this is TRUE if the memory
+					limit was hit and thus the data
+					in the cache is truncated */
+};
+
+/** This is the intermediate buffer where data needed to fill the
+INFORMATION SCHEMA tables is fetched and later retrieved by the C++
+code in handler/i_s.cc. */
+static trx_i_s_cache_t	trx_i_s_cache_static;
+/** Pointer to trx_i_s_cache_static, the intermediate buffer defined
+just above. Unlike that object, which is static, this pointer is not
+declared static. */
+UNIV_INTERN trx_i_s_cache_t*	trx_i_s_cache = &trx_i_s_cache_static;
+
+/*******************************************************************//**
+Determines the heap number that a waiting lock refers to: the single
+bit set in a record lock, or ULINT_UNDEFINED for a table lock.
+@return	record number within the heap, or ULINT_UNDEFINED */
+static
+ulint
+wait_lock_get_heap_no(
+/*==================*/
+	const lock_t*	lock)	/*!< in: lock */
+{
+	ulint	ret = ULINT_UNDEFINED;
+
+	switch (lock_get_type(lock)) {
+	case LOCK_TABLE:
+		break;
+	case LOCK_REC:
+		/* a waiting record lock must have a bit set */
+		ret = lock_rec_find_set_bit(lock);
+		ut_a(ret != ULINT_UNDEFINED);
+		break;
+	default:
+		ut_error;
+	}
+
+	return(ret);
+}
+
+/*******************************************************************//**
+Initializes every member of a table cache; no chunk memory is allocated. */
+static
+void
+table_cache_init(
+/*=============*/
+	i_s_table_cache_t*	table_cache,	/*!< out: table cache */
+	size_t			row_size)	/*!< in: the size of a row */
+{
+	ulint	i;
+
+	table_cache->rows_used = 0;
+	table_cache->rows_allocd = 0;
+	table_cache->row_size = row_size;
+
+	for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+		/* memory is allocated in table_cache_create_empty_row(),
+		which reads offset and rows_allocd: do not leave them unset */
+		table_cache->chunks[i].offset = 0;
+		table_cache->chunks[i].rows_allocd = 0;
+		table_cache->chunks[i].base = NULL;
+	}
+}
+
+/*******************************************************************//**
+Releases the memory of all allocated chunks of a table cache. */
+static
+void
+table_cache_free(
+/*=============*/
+	i_s_table_cache_t*	table_cache)	/*!< in/out: table cache */
+{
+	i_s_mem_chunk_t*	chunk = table_cache->chunks;
+	i_s_mem_chunk_t*	end = chunk + MEM_CHUNKS_IN_TABLE_CACHE;
+
+	for (; chunk < end; chunk++) {
+		/* chunks that were never allocated have base == NULL */
+		if (chunk->base == NULL) {
+			continue;
+		}
+		mem_free(chunk->base);
+		chunk->base = NULL;
+	}
+}
+
+/*******************************************************************//**
+Returns an empty row from a table cache. A new chunk of rows is allocated
+if all allocated rows are in use. On success the number of used rows is
+incremented. If the memory limit is hit then NULL is returned, nothing
+is allocated and the number of used rows is left unchanged.
+@return	empty row, or NULL if out of memory */
+static
+void*
+table_cache_create_empty_row(
+/*=========================*/
+	i_s_table_cache_t*	table_cache,	/*!< in/out: table cache */
+	trx_i_s_cache_t*	cache)		/*!< in/out: cache to record
+						how many bytes are
+						allocated */
+{
+	ulint	i;
+	void*	row;
+
+	ut_a(table_cache->rows_used <= table_cache->rows_allocd);
+
+	if (table_cache->rows_used == table_cache->rows_allocd) {
+
+		/* rows_used == rows_allocd means that new chunk needs
+		to be allocated: either no more empty rows in the
+		last allocated chunk or nothing has been allocated yet
+		(rows_used == rows_allocd == 0); */
+
+		i_s_mem_chunk_t*	chunk;
+		ulint			req_bytes;
+		ulint			got_bytes;
+		ulint			req_rows;
+		ulint			got_rows;
+
+		/* find the first not allocated chunk */
+		for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+			if (table_cache->chunks[i].base == NULL) {
+
+				break;
+			}
+		}
+
+		/* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+		have been allocated :-X */
+		ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+		/* allocate the chunk we just found */
+
+		if (i == 0) {
+
+			/* first chunk, nothing is allocated yet */
+			req_rows = TABLE_CACHE_INITIAL_ROWSNUM;
+		} else {
+
+			/* Memory is increased by the formula
+			new = old + old / 2; We are trying not to be
+			aggressive here (= using the common new = old * 2)
+			because the allocated memory will not be freed
+			until InnoDB exit (it is reused). So it is better
+			to once allocate the memory in more steps, but
+			have less unused/wasted memory than to use less
+			steps in allocation (which is done once in a
+			lifetime) but end up with lots of unused/wasted
+			memory. */
+			req_rows = table_cache->rows_allocd / 2;
+		}
+		req_bytes = req_rows * table_cache->row_size;
+
+		if (req_bytes > MAX_ALLOWED_FOR_ALLOC(cache)) {
+			/* the chunk would exceed the memory limit */
+			return(NULL);
+		}
+
+		chunk = &table_cache->chunks[i];
+
+		chunk->base = mem_alloc2(req_bytes, &got_bytes);
+		/* size the chunk by got_bytes, not by req_bytes */
+		got_rows = got_bytes / table_cache->row_size;
+
+		cache->mem_allocd += got_bytes;
+
+#if 0
+		printf("allocating chunk %d req bytes=%lu, got bytes=%lu, "
+		       "row size=%lu, "
+		       "req rows=%lu, got rows=%lu\n",
+		       i, req_bytes, got_bytes,
+		       table_cache->row_size,
+		       req_rows, got_rows);
+#endif
+
+		chunk->rows_allocd = got_rows;
+
+		table_cache->rows_allocd += got_rows;
+
+		/* adjust the offset of the next chunk */
+		if (i < MEM_CHUNKS_IN_TABLE_CACHE - 1) {
+
+			table_cache->chunks[i + 1].offset
+				= chunk->offset + chunk->rows_allocd;
+		}
+
+		/* return the first empty row in the newly allocated
+		chunk */
+		row = chunk->base;
+	} else {
+
+		char*	chunk_start;
+		ulint	offset;
+
+		/* there is an empty row, no need to allocate new
+		chunks */
+
+		/* find the first chunk that contains allocated but
+		empty/unused rows */
+		for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+			if (table_cache->chunks[i].offset
+			    + table_cache->chunks[i].rows_allocd
+			    > table_cache->rows_used) {
+
+				break;
+			}
+		}
+
+		/* i == MEM_CHUNKS_IN_TABLE_CACHE means that all chunks
+		are full, but
+		table_cache->rows_used != table_cache->rows_allocd means
+		exactly the opposite - there are allocated but
+		empty/unused rows :-X */
+		ut_a(i < MEM_CHUNKS_IN_TABLE_CACHE);
+
+		chunk_start = (char*) table_cache->chunks[i].base;
+		offset = table_cache->rows_used
+			- table_cache->chunks[i].offset;
+
+		row = chunk_start + offset * table_cache->row_size;
+	}
+
+	table_cache->rows_used++;
+
+	return(row);
+}
+
+/*******************************************************************//**
+Fills i_s_trx_row_t object. The query text, cut to at most
+TRX_I_S_TRX_QUERY_MAX_LEN bytes, is copied into the cache storage.
+@return	FALSE if allocation fails */
+static
+ibool
+fill_trx_row(
+/*=========*/
+	i_s_trx_row_t*		row,		/*!< out: result object
+						that's filled */
+	const trx_t*		trx,		/*!< in: transaction to
+						get data from */
+	const i_s_locks_row_t*	requested_lock_row,/*!< in: pointer to the
+						corresponding row in
+						innodb_locks if trx is
+						waiting or NULL if trx
+						is not waiting */
+	trx_i_s_cache_t*	cache)		/*!< in/out: cache into
+						which to copy volatile
+						strings */
+{
+	const char*	stmt;
+	size_t		stmt_len;
+
+	row->trx_id = trx_get_id(trx);
+	row->trx_started = (ib_time_t) trx->start_time;
+	row->trx_state = trx_get_que_state_str(trx);
+
+	if (trx->wait_lock != NULL) {
+
+		ut_a(requested_lock_row != NULL);
+
+		row->requested_lock_row = requested_lock_row;
+		row->trx_wait_started = (ib_time_t) trx->wait_started;
+	} else {
+
+		ut_a(requested_lock_row == NULL);
+
+		row->requested_lock_row = NULL;
+		row->trx_wait_started = 0;
+	}
+
+	row->trx_weight = (ullint) ut_conv_dulint_to_longlong(TRX_WEIGHT(trx));
+
+	if (trx->mysql_thd == NULL) {
+		/* For internal transactions e.g., purge and transactions
+		being recovered at startup there is no associated MySQL
+		thread data structure. */
+		row->trx_mysql_thread_id = 0;
+		row->trx_query = NULL;
+		return(TRUE);
+	}
+
+	row->trx_mysql_thread_id = thd_get_thread_id(trx->mysql_thd);
+	stmt = innobase_get_stmt(trx->mysql_thd, &stmt_len);
+
+	if (stmt != NULL) {
+
+		char	query[TRX_I_S_TRX_QUERY_MAX_LEN + 1];
+
+		if (stmt_len > TRX_I_S_TRX_QUERY_MAX_LEN) {
+			stmt_len = TRX_I_S_TRX_QUERY_MAX_LEN;
+		}
+		/* make a NUL-terminated copy of at most stmt_len bytes */
+		memcpy(query, stmt, stmt_len);
+		query[stmt_len] = '\0';
+		/* store that copy: stmt itself may not end at stmt_len */
+		row->trx_query = ha_storage_put_memlim(
+			cache->storage, query, stmt_len + 1,
+			MAX_ALLOWED_FOR_STORAGE(cache));
+
+		if (row->trx_query == NULL) {
+
+			return(FALSE);
+		}
+	} else {
+
+		row->trx_query = NULL;
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Formats the n-th field of "rec" into "buf", preceded by ", " when n > 0.
+The output is always NUL-terminated, unless buf_size is 0, in which
+case nothing is written.
+@return	number of bytes written to "buf", including the terminating NUL */
+static
+ulint
+put_nth_field(
+/*==========*/
+	char*			buf,	/*!< out: buffer */
+	ulint			buf_size,/*!< in: buffer size in bytes */
+	ulint			n,	/*!< in: number of field */
+	const dict_index_t*	index,	/*!< in: index */
+	const rec_t*		rec,	/*!< in: record */
+	const ulint*		offsets)/*!< in: record offsets, returned
+					by rec_get_offsets() */
+{
+	const byte*	data;
+	ulint		data_len;
+	dict_field_t*	dict_field;
+	ulint		prefix_len = 0;
+
+	ut_ad(rec_offs_validate(rec, NULL, offsets));
+
+	if (buf_size == 0) {
+
+		/* no room even for the terminating NUL */
+		return(0);
+	}
+
+	if (n > 0 && buf_size < 3) {
+
+		/* ", " and a NUL do not fit: produce an empty string */
+		buf[0] = '\0';
+		return(1);
+	}
+
+	if (n > 0) {
+
+		/* separate this field from the previous one; the NUL
+		copied along sits where the field data will start */
+		memcpy(buf, ", ", 3);
+
+		prefix_len = 2;
+		buf += prefix_len;
+		buf_size -= prefix_len;
+	}
+
+	/* now buf_size >= 1 */
+
+	dict_field = dict_index_get_nth_field(index, n);
+
+	data = rec_get_nth_field(rec, offsets, n, &data_len);
+
+	return(prefix_len
+	       + row_raw_format((const char*) data, data_len,
+				dict_field, buf, buf_size));
+}
+
+/*******************************************************************//**
+Fills the "lock_data" member of i_s_locks_row_t object for a record lock.
+*lock_data is set to NULL, and TRUE returned, if no page block is obtained.
+@return	FALSE if allocation fails */
+static
+ibool
+fill_lock_data(
+/*===========*/
+	const char**		lock_data,/*!< out: "lock_data" to fill */
+	const lock_t*		lock,	/*!< in: lock used to find the data */
+	ulint			heap_no,/*!< in: rec num used to find the data */
+	trx_i_s_cache_t*	cache)	/*!< in/out: cache where to store
+					volatile data */
+{
+	mtr_t			mtr;
+
+	const buf_block_t*	block;
+	const page_t*		page;
+	const rec_t*		rec;
+
+	ut_a(lock_get_type(lock) == LOCK_REC);
+
+	mtr_start(&mtr);
+
+	block = buf_page_try_get(lock_rec_get_space_id(lock),
+				 lock_rec_get_page_no(lock),
+				 &mtr);
+
+	if (block == NULL) {
+		/* no block was obtained: report no lock data, successfully */
+		*lock_data = NULL;
+
+		mtr_commit(&mtr);
+
+		return(TRUE);
+	}
+
+	page = (const page_t*) buf_block_get_frame(block);
+
+	rec = page_find_rec_with_heap_no(page, heap_no);
+
+	if (page_rec_is_infimum(rec)) {
+
+		*lock_data = ha_storage_put_str_memlim(
+			cache->storage, "infimum pseudo-record",
+			MAX_ALLOWED_FOR_STORAGE(cache));
+	} else if (page_rec_is_supremum(rec)) {
+
+		*lock_data = ha_storage_put_str_memlim(
+			cache->storage, "supremum pseudo-record",
+			MAX_ALLOWED_FOR_STORAGE(cache));
+	} else {
+
+		const dict_index_t*	index;
+		ulint			n_fields;
+		mem_heap_t*		heap;
+		ulint			offsets_onstack[REC_OFFS_NORMAL_SIZE];
+		ulint*			offsets;
+		char			buf[TRX_I_S_LOCK_DATA_MAX_LEN];
+		ulint			buf_used;
+		ulint			i;
+
+		rec_offs_init(offsets_onstack);
+		offsets = offsets_onstack;
+
+		index = lock_rec_get_index(lock);
+
+		n_fields = dict_index_get_n_unique(index);
+
+		ut_a(n_fields > 0);
+
+		heap = NULL;
+		offsets = rec_get_offsets(rec, index, offsets, n_fields,
+					  &heap);
+
+		/* format and store the data */
+
+		buf_used = 0;
+		for (i = 0; i < n_fields; i++) {
+			/* the -1 makes the next field overwrite the NUL */
+			buf_used += put_nth_field(
+				buf + buf_used, sizeof(buf) - buf_used,
+				i, index, rec, offsets) - 1;
+		}
+
+		*lock_data = (const char*) ha_storage_put_memlim(
+			cache->storage, buf, buf_used + 1,
+			MAX_ALLOWED_FOR_STORAGE(cache));
+
+		if (UNIV_UNLIKELY(heap != NULL)) {
+
+			/* this means that rec_get_offsets() has created a new
+			heap and has stored offsets in it; check that this is
+			really the case and free the heap */
+			ut_a(offsets != offsets_onstack);
+			mem_heap_free(heap);
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	if (*lock_data == NULL) {
+
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Fills an i_s_locks_row_t object from a lock, copying the volatile
+strings into the cache storage.
+@return	FALSE if allocation fails */
+static
+ibool
+fill_locks_row(
+/*===========*/
+	i_s_locks_row_t* row,	/*!< out: result object that's filled */
+	const lock_t*	lock,	/*!< in: lock to get data from */
+	ulint		heap_no,/*!< in: lock's record number
+				or ULINT_UNDEFINED if the lock
+				is a table lock */
+	trx_i_s_cache_t* cache)	/*!< in/out: cache into which to copy
+				volatile strings */
+{
+	row->lock_trx_id = lock_get_trx_id(lock);
+	row->lock_mode = lock_get_mode_str(lock);
+	row->lock_type = lock_get_type_str(lock);
+
+	row->lock_table = ha_storage_put_str_memlim(
+		cache->storage, lock_get_table_name(lock),
+		MAX_ALLOWED_FOR_STORAGE(cache));
+
+	if (row->lock_table == NULL) {
+
+		/* memory could not be allocated (table name) */
+		return(FALSE);
+	}
+
+	switch (lock_get_type(lock)) {
+	case LOCK_TABLE:
+		/* a table lock has no index, position or record data */
+		row->lock_index = NULL;
+		row->lock_space = ULINT_UNDEFINED;
+		row->lock_page = ULINT_UNDEFINED;
+		row->lock_rec = ULINT_UNDEFINED;
+		row->lock_data = NULL;
+
+		break;
+	case LOCK_REC:
+		row->lock_index = ha_storage_put_str_memlim(
+			cache->storage, lock_rec_get_index_name(lock),
+			MAX_ALLOWED_FOR_STORAGE(cache));
+
+		if (row->lock_index == NULL) {
+
+			/* memory could not be allocated (index name) */
+			return(FALSE);
+		}
+
+		row->lock_space = lock_rec_get_space_id(lock);
+		row->lock_page = lock_rec_get_page_no(lock);
+		row->lock_rec = heap_no;
+
+		if (fill_lock_data(&row->lock_data, lock, heap_no, cache)) {
+
+			break;
+		}
+
+		/* memory could not be allocated (lock data) */
+		return(FALSE);
+	default:
+		ut_error;
+	}
+
+	/* these two members are filled the same way for both types of
+	lock; hash_chain.value points back at the row itself */
+	row->lock_table_id = lock_get_table_id(lock);
+	row->hash_chain.value = row;
+
+	return(TRUE);
+}
+
+/*******************************************************************//**
+Stores the requested/blocking lock pair in an i_s_lock_waits_row_t.
+@return	the row that was passed in, now filled */
+static
+i_s_lock_waits_row_t*
+fill_lock_waits_row(
+/*================*/
+	i_s_lock_waits_row_t*	row,		/*!< out: result object
+						that's filled */
+	const i_s_locks_row_t*	requested_lock_row,/*!< in: pointer to the
+						relevant requested lock
+						row in innodb_locks */
+	const i_s_locks_row_t*	blocking_lock_row)/*!< in: pointer to the
+						relevant blocking lock
+						row in innodb_locks */
+{
+	row->blocking_lock_row = blocking_lock_row;
+	row->requested_lock_row = requested_lock_row;
+
+	return(row);
+}
+
+/*******************************************************************//**
+Computes the hash fold of a lock. A record lock is folded from the 4
+values that identify it at a given point in time: transaction id,
+space id, page number and record number. The fold of a table lock is
+the id of its table.
+@return	fold */
+static
+ulint
+fold_lock(
+/*======*/
+	const lock_t*	lock,	/*!< in: lock object to fold */
+	ulint		heap_no)/*!< in: lock's record number
+				or ULINT_UNDEFINED if the lock
+				is a table lock */
+{
+#ifdef TEST_LOCK_FOLD_ALWAYS_DIFFERENT
+	static ulint	fold = 0;
+
+	return(fold++);
+#else
+	ulint	ret;
+
+	switch (lock_get_type(lock)) {
+	case LOCK_TABLE:
+		/* continuing would work even if this check failed, but
+		a table lock with a record number means that something
+		has gone wrong */
+		ut_a(heap_no == ULINT_UNDEFINED);
+
+		ret = (ulint) lock_get_table_id(lock);
+		break;
+	case LOCK_REC:
+		ut_a(heap_no != ULINT_UNDEFINED);
+
+		/* chain the four identifying values pairwise */
+		ret = ut_fold_ulint_pair(
+			ut_fold_ulint_pair(
+				ut_fold_ulint_pair(
+					(ulint) lock_get_trx_id(lock),
+					lock_rec_get_space_id(lock)),
+				lock_rec_get_page_no(lock)),
+			heap_no);
+
+		break;
+	default:
+		ut_error;
+	}
+
+	return(ret);
+#endif
+}
+
+/*******************************************************************//**
+Tells whether an innodb_locks row describes the given lock_t object.
+@return	TRUE if they match */
+static
+ibool
+locks_row_eq_lock(
+/*==============*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	const lock_t*		lock,	/*!< in: lock object */
+	ulint			heap_no)/*!< in: lock's record number
+					or ULINT_UNDEFINED if the lock
+					is a table lock */
+{
+#ifdef TEST_NO_LOCKS_ROW_IS_EVER_EQUAL_TO_LOCK_T
+	return(0);
+#else
+	switch (lock_get_type(lock)) {
+	case LOCK_TABLE:
+		/* a table lock with a record number would not break
+		anything here, but it means something has gone wrong */
+		ut_a(heap_no == ULINT_UNDEFINED);
+
+		if (row->lock_trx_id != lock_get_trx_id(lock)) {
+			return(FALSE);
+		}
+		return(row->lock_table_id == lock_get_table_id(lock));
+
+	case LOCK_REC:
+		ut_a(heap_no != ULINT_UNDEFINED);
+
+		return(!(row->lock_trx_id != lock_get_trx_id(lock)
+			 || row->lock_space != lock_rec_get_space_id(lock)
+			 || row->lock_page != lock_rec_get_page_no(lock)
+			 || row->lock_rec != heap_no));
+	default:
+		ut_error;
+		return(FALSE);
+	}
+#endif
+}
+
+/*******************************************************************//**
+Looks up the row of the innodb_locks cache that represents the given
+lock, using the hash table of the cache. Returns pointer to the row or
+NULL if none is found.
+@return	row or NULL */
+static
+i_s_locks_row_t*
+search_innodb_locks(
+/*================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	const lock_t*		lock,	/*!< in: lock to search for */
+	ulint			heap_no)/*!< in: lock's record number
+					or ULINT_UNDEFINED if the lock
+					is a table lock */
+{
+	i_s_hash_chain_t*	chain;
+
+	HASH_SEARCH(
+		/* name of the link member in i_s_hash_chain_t */
+		next,
+		/* hash table to search */
+		cache->locks_hash,
+		/* fold of the lock being looked for */
+		fold_lock(lock, heap_no),
+		/* type of the traversal variable */
+		i_s_hash_chain_t*,
+		/* traversal variable */
+		chain,
+		/* no assertion on the traversed items */
+		,
+		/* the match criterion */
+		locks_row_eq_lock(chain->value, lock, heap_no));
+
+	if (chain != NULL) {
+
+		return(chain->value);
+	}
+
+	/* no row in the cache represents this lock */
+	return(NULL);
+}
+
+/*******************************************************************//**
+Adds new element to the locks cache, enlarging it if necessary.
+Returns a pointer to the added row. If the row is already present then
+no row is added and a pointer to the existing row is returned.
+If the row or its strings can not be allocated then NULL is returned.
+@return	row */
+static
+i_s_locks_row_t*
+add_lock_to_cache(
+/*==============*/
+	trx_i_s_cache_t*	cache,	/*!< in/out: cache */
+	const lock_t*		lock,	/*!< in: the element to add */
+	ulint			heap_no)/*!< in: lock's record number
+					or ULINT_UNDEFINED if the lock
+					is a table lock */
+{
+	i_s_locks_row_t*	dst_row;
+
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+	ulint	i;
+	for (i = 0; i < 10000; i++) {
+#endif
+#ifndef TEST_DO_NOT_CHECK_FOR_DUPLICATE_ROWS
+	/* quit if this lock is already present */
+	dst_row = search_innodb_locks(cache, lock, heap_no);
+	if (dst_row != NULL) {
+
+		return(dst_row);
+	}
+#endif
+
+	dst_row = (i_s_locks_row_t*)
+		table_cache_create_empty_row(&cache->innodb_locks, cache);
+
+	/* memory could not be allocated */
+	if (dst_row == NULL) {
+
+		return(NULL);
+	}
+
+	if (!fill_locks_row(dst_row, lock, heap_no, cache)) {
+
+		/* memory could not be allocated; give the row back */
+		cache->innodb_locks.rows_used--;
+		return(NULL);
+	}
+
+#ifndef TEST_DO_NOT_INSERT_INTO_THE_HASH_TABLE
+	HASH_INSERT(
+		/* the type used in the hash chain */
+		i_s_hash_chain_t,
+		/* hash_chain->"next" */
+		next,
+		/* the hash table */
+		cache->locks_hash,
+		/* fold */
+		fold_lock(lock, heap_no),
+		/* add this data to the hash */
+		&dst_row->hash_chain);
+#endif
+#ifdef TEST_ADD_EACH_LOCKS_ROW_MANY_TIMES
+	} /* for()-loop */
+#endif
+
+	return(dst_row);
+}
+
+/*******************************************************************//**
+Appends a (requested lock, blocking lock) pair to the lock waits cache.
+If memory can not be allocated then FALSE is returned.
+@return	FALSE if allocation fails */
+static
+ibool
+add_lock_wait_to_cache(
+/*===================*/
+	trx_i_s_cache_t*	cache,		/*!< in/out: cache */
+	const i_s_locks_row_t*	requested_lock_row,/*!< in: pointer to the
+						relevant requested lock
+						row in innodb_locks */
+	const i_s_locks_row_t*	blocking_lock_row)/*!< in: pointer to the
+						relevant blocking lock
+						row in innodb_locks */
+{
+	i_s_table_cache_t*	waits = &cache->innodb_lock_waits;
+	i_s_lock_waits_row_t*	new_row;
+
+	new_row = (i_s_lock_waits_row_t*)
+		table_cache_create_empty_row(waits, cache);
+
+	if (new_row != NULL) {
+
+		fill_lock_waits_row(new_row, requested_lock_row,
+				    blocking_lock_row);
+		return(TRUE);
+	}
+
+	/* memory could not be allocated */
+	return(FALSE);
+}
+
+/*******************************************************************//**
+Adds transaction's relevant (important) locks to cache.
+If the transaction is waiting, its wait lock and the locks blocking it
+are added to innodb_locks, each such pair is added to innodb_lock_waits
+and requested_lock_row receives the row of the wait lock; otherwise
+requested_lock_row is set to NULL. If rows can not be allocated then
+FALSE is returned and the value of requested_lock_row is undefined.
+@return	FALSE if allocation fails */
+static
+ibool
+add_trx_relevant_locks_to_cache(
+/*============================*/
+	trx_i_s_cache_t*	cache,	/*!< in/out: cache */
+	const trx_t*		trx,	/*!< in: transaction */
+	i_s_locks_row_t**	requested_lock_row)/*!< out: pointer to the
+					requested lock row, or NULL or
+					undefined */
+{
+	const lock_t*		curr_lock;
+	ulint			wait_lock_heap_no;
+	i_s_locks_row_t*	blocking_lock_row;
+	lock_queue_iterator_t	iter;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (trx->que_state != TRX_QUE_LOCK_WAIT) {
+
+		/* a transaction that is not waiting has no relevant
+		locks */
+		*requested_lock_row = NULL;
+
+		return(TRUE);
+	}
+
+	/* The transaction is waiting: add the wait lock and all locks
+	from other transactions that are blocking the wait lock. */
+
+	/* a waiting transaction must have a wait lock */
+	ut_a(trx->wait_lock != NULL);
+
+	wait_lock_heap_no = wait_lock_get_heap_no(trx->wait_lock);
+
+	/* add the requested lock */
+	*requested_lock_row = add_lock_to_cache(cache, trx->wait_lock,
+						wait_lock_heap_no);
+
+	if (*requested_lock_row == NULL) {
+
+		/* memory could not be allocated */
+		return(FALSE);
+	}
+
+	/* then iterate over the locks before the wait lock and add the
+	ones that are blocking it */
+
+	lock_queue_iterator_reset(&iter, trx->wait_lock,
+				  ULINT_UNDEFINED);
+
+	for (curr_lock = lock_queue_iterator_get_prev(&iter);
+	     curr_lock != NULL;
+	     curr_lock = lock_queue_iterator_get_prev(&iter)) {
+
+		if (!lock_has_to_wait(trx->wait_lock, curr_lock)) {
+
+			/* this lock does not block the wait lock */
+			continue;
+		}
+
+		/* add the blocking lock; heap_no is the same for the
+		wait and waited locks */
+		blocking_lock_row = add_lock_to_cache(cache, curr_lock,
+						      wait_lock_heap_no);
+
+		if (blocking_lock_row == NULL) {
+
+			/* memory could not be allocated */
+			return(FALSE);
+		}
+
+		/* add the relation between both locks to
+		innodb_lock_waits */
+		if (!add_lock_wait_to_cache(cache, *requested_lock_row,
+					    blocking_lock_row)) {
+
+			/* memory could not be allocated */
+			return(FALSE);
+		}
+	}
+
+	/* the wait lock and all the locks found to block it are now
+	in the cache */
+	return(TRUE);
+}
+
+/** The minimum time that a cache must not be updated after it has been
+read for the last time; measured in microseconds. We use this technique
+to ensure that SELECTs which join several INFORMATION SCHEMA tables read
+the same version of the cache. */
+#define CACHE_MIN_IDLE_TIME_US	100000 /* 0.1 sec */
+
+/*******************************************************************//**
+Tells whether the cache has been left unread for long enough to be
+updated. The caller is expected to hold the cache's rw lock exclusively.
+@return	TRUE if can be updated */
+static
+ibool
+can_cache_be_updated(
+/*=================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+	ullint	now;
+	ullint	idle_us;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+	/* cache->last_read is read below without acquiring its mutex.
+	It is only updated while a shared rw lock on the whole cache is
+	being held (see trx_i_s_cache_end_read()) and we are currently
+	holding an exclusive rw lock on the cache, so it is not possible
+	for last_read to be updated while we are reading it. */
+
+	now = ut_time_us(NULL);
+	idle_us = now - cache->last_read;
+
+	/* the cache must have been left unread for longer than the
+	minimum idle time */
+	return(idle_us > CACHE_MIN_IDLE_TIME_US);
+}
+
+/*******************************************************************//**
+Marks all three table caches as holding no rows and empties the locks
+hash and the string storage. The chunks of rows stay allocated. */
+static
+void
+trx_i_s_cache_clear(
+/*================*/
+	trx_i_s_cache_t*	cache)	/*!< out: cache to clear */
+{
+	/* the rows stay allocated, they are merely marked unused */
+	cache->innodb_lock_waits.rows_used = 0;
+	cache->innodb_locks.rows_used = 0;
+	cache->innodb_trx.rows_used = 0;
+	/* the innodb_locks rows indexed by this hash are gone now */
+	hash_table_clear(cache->locks_hash);
+	ha_storage_empty(&cache->storage);
+}
+
+/*******************************************************************//**
+Refills the table cache buffer with the data needed for the 3
+INFORMATION SCHEMA tables. Cache must be locked for write. */
+static
+void
+fetch_data_into_cache(
+/*==================*/
+	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+{
+	trx_t*			trx;
+	i_s_trx_row_t*		trx_row;
+	i_s_locks_row_t*	requested_lock_row;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	trx_i_s_cache_clear(cache);
+
+	/* Walk the list of all transactions. For each one, first add
+	the locks that are relevant to it to the caches of innodb_locks
+	and innodb_lock_waits, then add the transaction itself to the
+	cache of innodb_trx. The walk is abandoned as soon as memory
+	for a row or a string can not be allocated. */
+
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	while (trx != NULL) {
+
+		if (!add_trx_relevant_locks_to_cache(cache, trx,
+						     &requested_lock_row)) {
+
+			break;
+		}
+
+		trx_row = (i_s_trx_row_t*)
+			table_cache_create_empty_row(&cache->innodb_trx,
+						     cache);
+
+		if (trx_row == NULL) {
+
+			break;
+		}
+
+		if (!fill_trx_row(trx_row, trx, requested_lock_row, cache)) {
+
+			/* take back the row that could not be filled */
+			cache->innodb_trx.rows_used--;
+			break;
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	/* trx is still non-NULL only if the walk was abandoned */
+	cache->is_truncated = (trx != NULL);
+}
+
+/*******************************************************************//**
+Update the transactions cache if it has not been read for some time
+(see can_cache_be_updated()). Called from handler/i_s.cc.
+@return	0 if the cache was refreshed, 1 if it was left untouched */
+UNIV_INTERN
+int
+trx_i_s_possibly_fetch_data_into_cache(
+/*===================================*/
+	trx_i_s_cache_t*	cache)	/*!< in/out: cache */
+{
+	if (!can_cache_be_updated(cache)) {
+
+		return(1);
+	}
+
+	/* Hold kernel_mutex while reading trx_sys and the lock queues */
+	mutex_enter(&kernel_mutex);
+
+	fetch_data_into_cache(cache);
+
+	mutex_exit(&kernel_mutex);
+
+	return(0);
+}
+
+/*******************************************************************//**
+Tells whether the data in the cache is truncated due to the memory
+limit posed by TRX_I_S_MEM_LIMIT.
+@return	cache->is_truncated, i.e. TRUE if truncated */
+UNIV_INTERN
+ibool
+trx_i_s_cache_is_truncated(
+/*=======================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+	return(cache->is_truncated);
+}
+
+/*******************************************************************//**
+Initializes the INFORMATION SCHEMA trx related cache and its latches. */
+UNIV_INTERN
+void
+trx_i_s_cache_init(
+/*===============*/
+	trx_i_s_cache_t*	cache)	/*!< out: cache to init */
+{
+	/* The latching is done in the following order:
+	acquire trx_i_s_cache_t::rw_lock, X
+	acquire kernel_mutex
+	release kernel_mutex
+	release trx_i_s_cache_t::rw_lock
+	acquire trx_i_s_cache_t::rw_lock, S
+	acquire trx_i_s_cache_t::last_read_mutex
+	release trx_i_s_cache_t::last_read_mutex
+	release trx_i_s_cache_t::rw_lock */
+
+	rw_lock_create(&cache->rw_lock, SYNC_TRX_I_S_RWLOCK);
+
+	cache->last_read = 0;
+
+	mutex_create(&cache->last_read_mutex, SYNC_TRX_I_S_LAST_READ);
+
+	table_cache_init(&cache->innodb_trx, sizeof(i_s_trx_row_t));
+	table_cache_init(&cache->innodb_locks, sizeof(i_s_locks_row_t));
+	table_cache_init(&cache->innodb_lock_waits,
+			 sizeof(i_s_lock_waits_row_t));
+
+	cache->locks_hash = hash_create(LOCKS_HASH_CELLS_NUM);
+
+	cache->storage = ha_storage_create(CACHE_STORAGE_INITIAL_SIZE,
+					   CACHE_STORAGE_HASH_CELLS);
+
+	cache->mem_allocd = 0;
+
+	cache->is_truncated = FALSE;
+}
+
+/*******************************************************************//**
+Frees the INFORMATION SCHEMA trx related cache and zeroes the struct. */
+UNIV_INTERN
+void
+trx_i_s_cache_free(
+/*===============*/
+	trx_i_s_cache_t*	cache)	/*!< in, own: cache to free */
+{
+	hash_table_free(cache->locks_hash);
+	ha_storage_free(cache->storage);
+	table_cache_free(&cache->innodb_trx);
+	table_cache_free(&cache->innodb_locks);
+	table_cache_free(&cache->innodb_lock_waits);
+	memset(cache, 0, sizeof *cache); /* NOTE(review): rw_lock and last_read_mutex are zeroed without a free call here - confirm */
+}
+
+/*******************************************************************//**
+Acquires a shared (read) lock on the cache's rw_lock. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_read(
+/*=====================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+	rw_lock_s_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Stamps cache->last_read with the current time, then drops the shared lock. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_read(
+/*===================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+	ullint	now;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED));
+#endif
+
+	/* update cache last read time, under last_read_mutex */
+	now = ut_time_us(NULL);
+	mutex_enter(&cache->last_read_mutex);
+	cache->last_read = now;
+	mutex_exit(&cache->last_read_mutex);
+
+	rw_lock_s_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Acquires the exclusive (write) lock on the cache's rw_lock. */
+UNIV_INTERN
+void
+trx_i_s_cache_start_write(
+/*======================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+	rw_lock_x_lock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Releases the exclusive (write) lock on the cache's rw_lock. */
+UNIV_INTERN
+void
+trx_i_s_cache_end_write(
+/*====================*/
+	trx_i_s_cache_t*	cache)	/*!< in: cache */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+	rw_lock_x_unlock(&cache->rw_lock);
+}
+
+/*******************************************************************//**
+Selects an INFORMATION SCHEMA table cache from the whole cache.
+@return	table cache that corresponds to the given table */
+static
+i_s_table_cache_t*
+cache_select_table(
+/*===============*/
+	trx_i_s_cache_t*	cache,	/*!< in: whole cache */
+	enum i_s_table		table)	/*!< in: which table */
+{
+	i_s_table_cache_t*	table_cache;
+
+#ifdef UNIV_SYNC_DEBUG
+	ut_a(rw_lock_own(&cache->rw_lock, RW_LOCK_SHARED)
+	     || rw_lock_own(&cache->rw_lock, RW_LOCK_EX));
+#endif
+
+	switch (table) {
+	case I_S_INNODB_TRX:
+		table_cache = &cache->innodb_trx;
+		break;
+	case I_S_INNODB_LOCKS:
+		table_cache = &cache->innodb_locks;
+		break;
+	case I_S_INNODB_LOCK_WAITS:
+		table_cache = &cache->innodb_lock_waits;
+		break;
+	default:
+		ut_error;
+	}
+
+	return(table_cache);
+}
+
+/*******************************************************************//**
+Returns how many rows of the given INFORMATION SCHEMA table are
+currently used in the cache.
+@return	number of used rows */
+UNIV_INTERN
+ulint
+trx_i_s_cache_get_rows_used(
+/*========================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table)	/*!< in: which table */
+{
+	i_s_table_cache_t*	table_cache;
+
+	table_cache = cache_select_table(cache, table);
+
+	return(table_cache->rows_used);
+}
+
+/*******************************************************************//**
+Retrieves the nth row (zero-based) in the cache for a given INFORMATION
+SCHEMA table; n must be less than the number of used rows.
+@return	pointer to the row inside the memory chunk that contains it */
+UNIV_INTERN
+void*
+trx_i_s_cache_get_nth_row(
+/*======================*/
+	trx_i_s_cache_t*	cache,	/*!< in: cache */
+	enum i_s_table		table,	/*!< in: which table */
+	ulint			n)	/*!< in: row number */
+{
+	i_s_table_cache_t*	table_cache;
+	ulint			i;
+	void*			row;
+
+	table_cache = cache_select_table(cache, table);
+
+	ut_a(n < table_cache->rows_used);
+
+	row = NULL;
+
+	for (i = 0; i < MEM_CHUNKS_IN_TABLE_CACHE; i++) {
+
+		if (table_cache->chunks[i].offset
+		    + table_cache->chunks[i].rows_allocd > n) {
+
+			row = (char*) table_cache->chunks[i].base
+				+ (n - table_cache->chunks[i].offset)
+				* table_cache->row_size;
+			break;
+		}
+	}
+
+	ut_a(row != NULL);
+
+	return(row);
+}
+
+/*******************************************************************//**
+Crafts a lock id string from an i_s_locks_row_t object. Returns its
+second argument. This function aborts if there is not enough space in
+lock_id. Be sure to provide at least TRX_I_S_LOCK_ID_MAX_LEN + 1 if you
+want to be 100% sure that it will not abort.
+@return	resulting lock id */
+UNIV_INTERN
+char*
+trx_i_s_create_lock_id(
+/*===================*/
+	const i_s_locks_row_t*	row,	/*!< in: innodb_locks row */
+	char*			lock_id,/*!< out: resulting lock_id */
+	ulint			lock_id_size)/*!< in: size of the lock id
+					buffer */
+{
+	int	res_len;
+
+	/* please adjust TRX_I_S_LOCK_ID_MAX_LEN if you change this */
+
+	if (row->lock_space != ULINT_UNDEFINED) {
+		/* record lock */
+		res_len = ut_snprintf(lock_id, lock_id_size,
+				      TRX_ID_FMT ":%lu:%lu:%lu",
+				      row->lock_trx_id, row->lock_space,
+				      row->lock_page, row->lock_rec);
+	} else {
+		/* table lock */
+		res_len = ut_snprintf(lock_id, lock_id_size,
+				      TRX_ID_FMT ":%llu",
+				      row->lock_trx_id,
+				      row->lock_table_id);
+	}
+
+	/* res_len is asserted to be non-negative first, which makes the
+	cast to ulint safe; output that did not fit fails the 2nd check */
+	ut_a(res_len >= 0);
+	ut_a((ulint) res_len < lock_id_size);
+
+	return(lock_id);
+}
diff --git a/storage/xtradb/trx/trx0purge.c b/storage/xtradb/trx/trx0purge.c
new file mode 100644
index 00000000000..1c317665878
--- /dev/null
+++ b/storage/xtradb/trx/trx0purge.c
@@ -0,0 +1,1288 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0purge.c
+Purge old versions
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0purge.h"
+
+#ifdef UNIV_NONINL
+#include "trx0purge.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0roll.h"
+#include "read0read.h"
+#include "fut0fut.h"
+#include "que0que.h"
+#include "row0purge.h"
+#include "row0upd.h"
+#include "trx0rec.h"
+#include "srv0que.h"
+#include "os0thread.h"
+
+/** The global purge coordinator; NULL until trx_purge_sys_create() runs */
+UNIV_INTERN trx_purge_t*	purge_sys = NULL;
+
+/** A dummy undo record whose address is used as a return value when we
+have a whole undo log which needs no purge */
+UNIV_INTERN trx_undo_rec_t	trx_purge_dummy_rec;
+
+/*****************************************************************//**
+Tells whether the update undo log of trx_id is guaranteed to still exist,
+which is the case when the purge view does not see trx_id.
+@return TRUE if the undo log is certainly preserved; FALSE gives no
+guarantee either way: the undo log may or may not still exist in the
+system */
+UNIV_INTERN
+ibool
+trx_purge_update_undo_must_exist(
+/*=============================*/
+	trx_id_t	trx_id)	/*!< in: transaction id */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (read_view_sees_trx_id(purge_sys->view, trx_id)) {
+		/* visible to the purge view: nothing can be promised */
+		return(FALSE);
+	}
+
+	return(TRUE);
+}
+
+/*=================== PURGE RECORD ARRAY =============================*/
+
+/*******************************************************************//**
+Stores (trx_no, undo_no) in the first free cell of purge_sys->arr.
+@return	the cell; NOTE(review): the scan is unbounded if none is free */
+static
+trx_undo_inf_t*
+trx_purge_arr_store_info(
+/*=====================*/
+	trx_id_t	trx_no,	/*!< in: transaction number */
+	undo_no_t	undo_no)/*!< in: undo number */
+{
+	trx_undo_inf_t*	cell;
+	trx_undo_arr_t*	arr;
+	ulint		i;
+
+	arr = purge_sys->arr;
+
+	for (i = 0;; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (!(cell->in_use)) {
+			/* Not in use, we may store here */
+			cell->undo_no = undo_no;
+			cell->trx_no = trx_no;
+			cell->in_use = TRUE;
+
+			arr->n_used++;
+
+			return(cell);
+		}
+	}
+}
+
+/*******************************************************************//**
+Releases a purge array cell: clears in_use and decrements arr->n_used. */
+UNIV_INLINE
+void
+trx_purge_arr_remove_info(
+/*======================*/
+	trx_undo_inf_t*	cell)	/*!< in: pointer to the storage cell */
+{
+	trx_undo_arr_t*	arr;
+
+	arr = purge_sys->arr;
+
+	cell->in_use = FALSE;
+
+	ut_ad(arr->n_used > 0);
+
+	arr->n_used--;
+}
+
+/*******************************************************************//**
+Finds the biggest (trx number, undo number) pair among the in-use cells. */
+static
+void
+trx_purge_arr_get_biggest(
+/*======================*/
+	trx_undo_arr_t*	arr,	/*!< in: purge array */
+	trx_id_t*	trx_no,	/*!< out: transaction number: ut_dulint_zero
+				if array is empty */
+	undo_no_t*	undo_no)/*!< out: undo number */
+{
+	trx_undo_inf_t*	cell;
+	trx_id_t	pair_trx_no;
+	undo_no_t	pair_undo_no;
+	int		trx_cmp;
+	ulint		n_used;
+	ulint		i;
+	ulint		n;
+
+	n = 0;
+	n_used = arr->n_used;
+	pair_trx_no = ut_dulint_zero;
+	pair_undo_no = ut_dulint_zero;
+
+	for (i = 0;; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (cell->in_use) {
+			n++;
+			trx_cmp = ut_dulint_cmp(cell->trx_no, pair_trx_no);
+
+			if ((trx_cmp > 0)
+			    || ((trx_cmp == 0)
+				&& (ut_dulint_cmp(cell->undo_no,
+						  pair_undo_no) >= 0))) {
+
+				pair_trx_no = cell->trx_no;
+				pair_undo_no = cell->undo_no;
+			}
+		}
+
+		if (n == n_used) {
+			*trx_no = pair_trx_no;
+			*undo_no = pair_undo_no;
+
+			return;
+		}
+	}
+}
+
+/****************************************************************//**
+Builds a purge 'query' graph. The actual purge is performed by executing
+this query graph.
+@return	own: the query graph */
+static
+que_t*
+trx_purge_graph_build(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction to attach to the graph */
+{
+	mem_heap_t*	heap;
+	que_fork_t*	fork;
+	que_thr_t*	thr;
+	/*	que_thr_t*	thr2; */
+
+	heap = mem_heap_create(512);
+	fork = que_fork_create(NULL, NULL, QUE_FORK_PURGE, heap);
+	fork->trx = trx;
+
+	thr = que_thr_create(fork, heap);
+
+	thr->child = row_purge_node_create(thr, heap);
+
+	/*	thr2 = que_thr_create(fork, fork, heap);
+
+	thr2->child = row_purge_node_create(fork, thr2, heap);	 */
+
+	return(fork);
+}
+
+/********************************************************************//**
+Creates the global purge system control structure, with its latch and
+mutex. The caller must hold kernel_mutex. */
+UNIV_INTERN
+void
+trx_purge_sys_create(void)
+/*======================*/
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	purge_sys = mem_alloc(sizeof(trx_purge_t));
+
+	purge_sys->state = TRX_STOP_PURGE;
+
+	purge_sys->n_pages_handled = 0;
+
+	purge_sys->purge_trx_no = ut_dulint_zero;
+	purge_sys->purge_undo_no = ut_dulint_zero;
+	purge_sys->next_stored = FALSE;
+
+	rw_lock_create(&purge_sys->latch, SYNC_PURGE_LATCH);
+
+	mutex_create(&purge_sys->mutex, SYNC_PURGE_SYS);
+
+	purge_sys->heap = mem_heap_create(256);
+
+	purge_sys->arr = trx_undo_arr_create();
+
+	purge_sys->sess = sess_open();
+
+	purge_sys->trx = purge_sys->sess->trx;
+
+	purge_sys->trx->is_purge = 1;
+
+	ut_a(trx_start_low(purge_sys->trx, ULINT_UNDEFINED));
+
+	purge_sys->query = trx_purge_graph_build(purge_sys->trx);
+
+	purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,
+							    purge_sys->heap);
+
+	purge_sys->n_worker = 0;
+	if (srv_use_purge_thread > 1) {
+		/* Use worker threads: a session, trx and graph for each */
+		ulint i;
+
+		purge_sys->n_worker = srv_use_purge_thread - 1;
+
+		purge_sys->sess_arr = mem_alloc(sizeof(sess_t*) * purge_sys->n_worker);
+		purge_sys->trx_arr = mem_alloc(sizeof(trx_t*) * purge_sys->n_worker);
+		purge_sys->query_arr = mem_alloc(sizeof(que_t*) * purge_sys->n_worker);
+
+		purge_sys->worker_event = os_event_create(NULL);
+		os_event_reset(purge_sys->worker_event);
+
+		for (i = 0; i < purge_sys->n_worker; i++) {
+			purge_sys->sess_arr[i] = sess_open();
+
+			purge_sys->trx_arr[i] = purge_sys->sess_arr[i]->trx;
+			purge_sys->trx_arr[i]->is_purge = 1;
+			ut_a(trx_start_low(purge_sys->trx_arr[i], ULINT_UNDEFINED));
+
+			purge_sys->query_arr[i] = trx_purge_graph_build(purge_sys->trx_arr[i]);
+		}
+	}
+}
+
+/********************************************************************//**
+Frees the global purge system control structure. NOTE(review): the worker
+sessions/trx/graphs made by trx_purge_sys_create() are not freed here. */
+UNIV_INTERN
+void
+trx_purge_sys_close(void)
+/*======================*/
+{
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	que_graph_free(purge_sys->query);
+
+	ut_a(purge_sys->sess->trx->is_purge);
+	purge_sys->sess->trx->conc_state = TRX_NOT_STARTED;
+	sess_close(purge_sys->sess);
+	purge_sys->sess = NULL;
+
+	if (purge_sys->view != NULL) {
+		/* Taken only because read_view_close() requires it. */
+		mutex_enter(&kernel_mutex);
+
+		read_view_close(purge_sys->view);
+		purge_sys->view = NULL;
+
+		mutex_exit(&kernel_mutex);
+	}
+
+	trx_undo_arr_free(purge_sys->arr);
+
+	rw_lock_free(&purge_sys->latch);
+	mutex_free(&purge_sys->mutex);
+
+	mem_heap_free(purge_sys->heap);
+	mem_free(purge_sys);
+
+	purge_sys = NULL;
+}
+
+/*================ UNDO LOG HISTORY LIST =============================*/
+
+/********************************************************************//**
+Adds the update undo log as the first log in the history list. Clears the
+undo log's rseg slot when it is not cached (state != TRX_UNDO_CACHED). */
+UNIV_INTERN
+void
+trx_purge_add_update_undo_to_history(
+/*=================================*/
+	trx_t*	trx,		/*!< in: transaction */
+	page_t*	undo_page,	/*!< in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	trx_undo_t*	undo;
+	trx_rseg_t*	rseg;
+	trx_rsegf_t*	rseg_header;
+	trx_usegf_t*	seg_header;
+	trx_ulogf_t*	undo_header;
+	trx_upagef_t*	page_header;
+	ulint		hist_size;
+
+	undo = trx->update_undo;
+
+	ut_ad(undo);
+
+	rseg = undo->rseg;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size,
+				    rseg->page_no, mtr);
+
+	undo_header = undo_page + undo->hdr_offset;
+	seg_header  = undo_page + TRX_UNDO_SEG_HDR;
+	page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+	if (undo->state != TRX_UNDO_CACHED) {
+		/* The undo log segment will not be reused */
+
+		if (undo->id >= TRX_RSEG_N_SLOTS) {
+			fprintf(stderr,
+				"InnoDB: Error: undo->id is %lu\n",
+				(ulong) undo->id);
+			ut_error;
+		}
+
+		trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL, mtr);
+
+		hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+					   MLOG_4BYTES, mtr);
+		ut_ad(undo->size == flst_get_len(
+			      seg_header + TRX_UNDO_PAGE_LIST, mtr));
+
+		mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+				 hist_size + undo->size, MLOG_4BYTES, mtr);
+	}
+
+	/* Add the log as the first in the history list */
+	flst_add_first(rseg_header + TRX_RSEG_HISTORY,
+		       undo_header + TRX_UNDO_HISTORY_NODE, mtr);
+	mutex_enter(&kernel_mutex);
+	trx_sys->rseg_history_len++;
+	mutex_exit(&kernel_mutex);
+
+	/* Write the trx number to the undo log header */
+	mlog_write_dulint(undo_header + TRX_UNDO_TRX_NO, trx->no, mtr);
+	/* Write information about delete markings to the undo log header */
+
+	if (!undo->del_marks) {
+		mlog_write_ulint(undo_header + TRX_UNDO_DEL_MARKS, FALSE,
+				 MLOG_2BYTES, mtr);
+	}
+
+	if (rseg->last_page_no == FIL_NULL) {
+
+		rseg->last_page_no = undo->hdr_page_no;
+		rseg->last_offset = undo->hdr_offset;
+		rseg->last_trx_no = trx->no;
+		rseg->last_del_marks = undo->del_marks;
+	}
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is in the history list and cuts the end of
+the history list at its youngest log. Caller holds purge_sys->mutex. */
+static
+void
+trx_purge_free_segment(
+/*===================*/
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	fil_addr_t	hdr_addr,	/*!< in: the file address of log_hdr */
+	ulint		n_removed_logs)	/*!< in: count of how many undo logs we
+					will cut off from the end of the
+					history list */
+{
+	page_t*		undo_page;
+	trx_rsegf_t*	rseg_hdr;
+	trx_ulogf_t*	log_hdr;
+	trx_usegf_t*	seg_hdr;
+	ibool		freed;
+	ulint		seg_size;
+	ulint		hist_size;
+	ibool		marked		= FALSE;
+	mtr_t		mtr;
+
+	/*	fputs("Freeing an update undo log segment\n", stderr); */
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+loop:
+	mtr_start(&mtr);
+	mutex_enter(&(rseg->mutex));
+
+	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+				 rseg->page_no, &mtr);
+
+	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+				      hdr_addr.page, &mtr);
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	log_hdr = undo_page + hdr_addr.boffset;
+
+	/* Mark the last undo log totally purged, so that if the system
+	crashes, the tail of the undo log will not get accessed again. The
+	list of pages in the undo log tail gets inconsistent during the
+	freeing of the segment, and therefore purge should not try to access
+	them again. */
+
+	if (!marked) {
+		mlog_write_ulint(log_hdr + TRX_UNDO_DEL_MARKS, FALSE,
+				 MLOG_2BYTES, &mtr);
+		marked = TRUE;
+	}
+
+	freed = fseg_free_step_not_header(seg_hdr + TRX_UNDO_FSEG_HEADER,
+					  &mtr);
+	if (!freed) {
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+
+		goto loop;
+	}
+
+	/* The page list may now be inconsistent, but the length field
+	stored in the list base node tells us how big it was before we
+	started the freeing. */
+
+	seg_size = flst_get_len(seg_hdr + TRX_UNDO_PAGE_LIST, &mtr);
+
+	/* We may free the undo log segment header page; it must be freed
+	within the same mtr as the undo log header is removed from the
+	history list: otherwise, in case of a database crash, the segment
+	could become inaccessible garbage in the file space. */
+
+	flst_cut_end(rseg_hdr + TRX_RSEG_HISTORY,
+		     log_hdr + TRX_UNDO_HISTORY_NODE, n_removed_logs, &mtr);
+
+	mutex_enter(&kernel_mutex);
+	ut_ad(trx_sys->rseg_history_len >= n_removed_logs);
+	trx_sys->rseg_history_len -= n_removed_logs;
+	mutex_exit(&kernel_mutex);
+
+	freed = FALSE;
+
+	while (!freed) {
+		/* Here we assume that a file segment with just the header
+		page can be freed in a few steps, so that the buffer pool
+		is not flooded with bufferfixed pages: see the note in
+		fsp0fsp.c. */
+
+		freed = fseg_free_step(seg_hdr + TRX_UNDO_FSEG_HEADER,
+				       &mtr);
+	}
+
+	hist_size = mtr_read_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+				   MLOG_4BYTES, &mtr);
+	ut_ad(hist_size >= seg_size);
+
+	mlog_write_ulint(rseg_hdr + TRX_RSEG_HISTORY_SIZE,
+			 hist_size - seg_size, MLOG_4BYTES, &mtr);
+
+	ut_ad(rseg->curr_size >= seg_size);
+
+	rseg->curr_size -= seg_size;
+
+	mutex_exit(&(rseg->mutex));
+
+	mtr_commit(&mtr);
+}
+
+/********************************************************************//**
+Removes unnecessary history data from a rollback segment, last log first. */
+static
+void
+trx_purge_truncate_rseg_history(
+/*============================*/
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	trx_id_t	limit_trx_no,	/*!< in: remove update undo logs whose
+					trx number is < limit_trx_no */
+	undo_no_t	limit_undo_no)	/*!< in: if transaction number is equal
+					to limit_trx_no, truncate undo records
+					with undo number < limit_undo_no */
+{
+	fil_addr_t	hdr_addr;
+	fil_addr_t	prev_hdr_addr;
+	trx_rsegf_t*	rseg_hdr;
+	page_t*		undo_page;
+	trx_ulogf_t*	log_hdr;
+	trx_usegf_t*	seg_hdr;
+	int		cmp;
+	ulint		n_removed_logs	= 0;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+
+	mtr_start(&mtr);
+	mutex_enter(&(rseg->mutex));
+
+	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+				 rseg->page_no, &mtr);
+
+	hdr_addr = trx_purge_get_log_from_hist(
+		flst_get_last(rseg_hdr + TRX_RSEG_HISTORY, &mtr));
+loop:
+	if (hdr_addr.page == FIL_NULL) {
+
+		mutex_exit(&(rseg->mutex));
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+				      hdr_addr.page, &mtr);
+
+	log_hdr = undo_page + hdr_addr.boffset;
+
+	cmp = ut_dulint_cmp(mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO),
+			    limit_trx_no);
+	if (cmp == 0) {
+		trx_undo_truncate_start(rseg, rseg->space, hdr_addr.page,
+					hdr_addr.boffset, limit_undo_no);
+	}
+
+	if (cmp >= 0) {
+		mutex_enter(&kernel_mutex);
+		ut_a(trx_sys->rseg_history_len >= n_removed_logs);
+		trx_sys->rseg_history_len -= n_removed_logs;
+		mutex_exit(&kernel_mutex);
+
+		flst_truncate_end(rseg_hdr + TRX_RSEG_HISTORY,
+				  log_hdr + TRX_UNDO_HISTORY_NODE,
+				  n_removed_logs, &mtr);
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	prev_hdr_addr = trx_purge_get_log_from_hist(
+		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+	n_removed_logs++;
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+	if ((mach_read_from_2(seg_hdr + TRX_UNDO_STATE) == TRX_UNDO_TO_PURGE)
+	    && (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) == 0)) {
+
+		/* We can free the whole log segment */
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+
+		trx_purge_free_segment(rseg, hdr_addr, n_removed_logs);
+
+		n_removed_logs = 0;
+	} else {
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+	}
+
+	mtr_start(&mtr);
+	mutex_enter(&(rseg->mutex));
+
+	rseg_hdr = trx_rsegf_get(rseg->space, rseg->zip_size,
+				 rseg->page_no, &mtr);
+
+	hdr_addr = prev_hdr_addr;
+
+	goto loop;
+}
+
+/********************************************************************//**
+Removes unnecessary history data from all rollback segments. NOTE: the
+caller must hold purge_sys->mutex and no latches on undo log pages! */
+static
+void
+trx_purge_truncate_history(void)
+/*============================*/
+{
+	trx_rseg_t*	rseg;
+	trx_id_t	limit_trx_no;
+	undo_no_t	limit_undo_no;
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+
+	trx_purge_arr_get_biggest(purge_sys->arr, &limit_trx_no,
+				  &limit_undo_no);
+
+	if (ut_dulint_is_zero(limit_trx_no)) {
+
+		limit_trx_no = purge_sys->purge_trx_no;
+		limit_undo_no = purge_sys->purge_undo_no;
+	}
+
+	/* We play safe and set the truncate limit at most to the purge view
+	low_limit number, though this is not necessary */
+
+	if (ut_dulint_cmp(limit_trx_no, purge_sys->view->low_limit_no) >= 0) {
+		limit_trx_no = purge_sys->view->low_limit_no;
+		limit_undo_no = ut_dulint_zero;
+	}
+
+	ut_ad((ut_dulint_cmp(limit_trx_no,
+			     purge_sys->view->low_limit_no) <= 0));
+
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	while (rseg) {
+		trx_purge_truncate_rseg_history(rseg, limit_trx_no,
+						limit_undo_no);
+		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+	}
+}
+
+/********************************************************************//**
+Truncates the history if no purge array cell is in use. NOTE: the caller
+must hold purge_sys->mutex and no latches on undo log pages!
+@return	TRUE if the array was empty (and the truncate was done) */
+UNIV_INLINE
+ibool
+trx_purge_truncate_if_arr_empty(void)
+/*=================================*/
+{
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+
+	if (purge_sys->arr->n_used != 0) {
+		/* cells still in use: leave the history alone */
+		return(FALSE);
+	}
+
+	trx_purge_truncate_history();
+
+	return(TRUE);
+}
+
+/***********************************************************************//**
+Updates the last not yet purged history log info in rseg when we have purged
+a whole undo log. Advances also purge_sys->purge_trx_no past the purged log.
+The caller must hold purge_sys->mutex, and rseg->last_page_no must not be
+FIL_NULL on entry. */
+static
+void
+trx_purge_rseg_get_next_history_log(
+/*================================*/
+	trx_rseg_t*	rseg)	/*!< in: rollback segment */
+{
+	page_t*		undo_page;
+	trx_ulogf_t*	log_hdr;
+	fil_addr_t	prev_log_addr;
+	trx_id_t	trx_no;
+	ibool		del_marks;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+
+	mutex_enter(&(rseg->mutex));
+
+	ut_a(rseg->last_page_no != FIL_NULL);
+
+	purge_sys->purge_trx_no = ut_dulint_add(rseg->last_trx_no, 1);
+	purge_sys->purge_undo_no = ut_dulint_zero;
+	purge_sys->next_stored = FALSE;
+
+	mtr_start(&mtr);
+
+	undo_page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+						rseg->last_page_no, &mtr);
+	log_hdr = undo_page + rseg->last_offset;
+
+	/* Increase the purge page count by one for every handled log */
+
+	purge_sys->n_pages_handled++;
+
+	prev_log_addr = trx_purge_get_log_from_hist(
+		flst_get_prev_addr(log_hdr + TRX_UNDO_HISTORY_NODE, &mtr));
+	if (prev_log_addr.page == FIL_NULL) {
+		/* No logs left in the history list */
+
+		rseg->last_page_no = FIL_NULL;
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+
+		mutex_enter(&kernel_mutex);
+
+		/* Add debug code to track history list corruption reported
+		on the MySQL mailing list on Nov 9, 2004. The fut0lst.c
+		file-based list was corrupt. The prev node pointer was
+		FIL_NULL, even though the list length was over 8 million nodes!
+		We assume that purge truncates the history list in moderate
+		size pieces, and if we here reach the head of the list, the
+		list cannot be longer than 20 000 undo logs now. */
+
+		if (trx_sys->rseg_history_len > 20000) {
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Warning: purge reached the"
+				" head of the history list,\n"
+				"InnoDB: but its length is still"
+				" reported as %lu! Make a detailed bug\n"
+				"InnoDB: report, and submit it"
+				" to http://bugs.mysql.com\n",
+				(ulong) trx_sys->rseg_history_len);
+		}
+
+		mutex_exit(&kernel_mutex);
+
+		return;
+	}
+
+	mutex_exit(&(rseg->mutex));
+	mtr_commit(&mtr);
+
+	/* Read the trx number and del marks from the previous log header */
+	mtr_start(&mtr);
+
+	log_hdr = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+					      prev_log_addr.page, &mtr)
+		+ prev_log_addr.boffset;
+
+	trx_no = mach_read_from_8(log_hdr + TRX_UNDO_TRX_NO);
+
+	del_marks = mach_read_from_2(log_hdr + TRX_UNDO_DEL_MARKS);
+
+	mtr_commit(&mtr);
+
+	mutex_enter(&(rseg->mutex));
+
+	rseg->last_page_no = prev_log_addr.page;
+	rseg->last_offset = prev_log_addr.boffset;
+	rseg->last_trx_no = trx_no;
+	rseg->last_del_marks = del_marks;
+
+	mutex_exit(&(rseg->mutex));
+}
+
+/***********************************************************************//**
+Chooses the next undo log to purge and updates the info in purge_sys. This
+function is used to initialize purge_sys when the next record to purge is
+not known, and also to update the purge system info on the next record when
+purge has handled the whole undo log for a transaction. */
+static
+void
+trx_purge_choose_next_log(void)
+/*===========================*/
+{
+	trx_undo_rec_t*	rec;
+	trx_rseg_t*	rseg;
+	trx_rseg_t*	min_rseg;
+	trx_id_t	min_trx_no;
+	ulint		space = 0;   /* init only silences a warning */
+	ulint		zip_size = 0;
+	ulint		page_no = 0; /* init only silences a warning */
+	ulint		offset = 0;  /* init only silences a warning */
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+	ut_ad(purge_sys->next_stored == FALSE);
+
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	min_trx_no = ut_dulint_max;
+
+	min_rseg = NULL;
+
+	while (rseg) {
+		mutex_enter(&(rseg->mutex));
+
+		if (rseg->last_page_no != FIL_NULL) {
+
+			if ((min_rseg == NULL)
+			    || (ut_dulint_cmp(min_trx_no,
+					      rseg->last_trx_no) > 0)) {
+
+				min_rseg = rseg;
+				min_trx_no = rseg->last_trx_no;
+				space = rseg->space;
+				zip_size = rseg->zip_size;
+				ut_a(space == 0); /* We assume in purge of
+						  externally stored fields
+						  that space id == 0 */
+				page_no = rseg->last_page_no;
+				offset = rseg->last_offset;
+			}
+		}
+
+		mutex_exit(&(rseg->mutex));
+
+		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+	}
+
+	if (min_rseg == NULL) {
+
+		return;
+	}
+
+	mtr_start(&mtr);
+
+	if (!min_rseg->last_del_marks) {
+		/* No need to purge this log */
+
+		rec = &trx_purge_dummy_rec;
+	} else {
+		rec = trx_undo_get_first_rec(space, zip_size, page_no, offset,
+					     RW_S_LATCH, &mtr);
+		if (rec == NULL) {
+			/* Undo log empty */
+
+			rec = &trx_purge_dummy_rec;
+		}
+	}
+
+	purge_sys->next_stored = TRUE;
+	purge_sys->rseg = min_rseg;
+
+	purge_sys->hdr_page_no = page_no;
+	purge_sys->hdr_offset = offset;
+
+	purge_sys->purge_trx_no = min_trx_no;
+
+	if (rec == &trx_purge_dummy_rec) {
+
+		purge_sys->purge_undo_no = ut_dulint_zero;
+		purge_sys->page_no = page_no;
+		purge_sys->offset = 0;
+	} else {
+		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec);
+
+		purge_sys->page_no = page_get_page_no(page_align(rec));
+		purge_sys->offset = page_offset(rec);
+	}
+
+	mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Copies the current record to purge and advances the purge system to the next.
+@return	copy of an undo log record or pointer to the dummy undo log record */
+static
+trx_undo_rec_t*
+trx_purge_get_next_rec(
+/*===================*/
+	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+{
+	trx_undo_rec_t*	rec;
+	trx_undo_rec_t*	rec_copy;
+	trx_undo_rec_t*	rec2;
+	trx_undo_rec_t*	next_rec;
+	page_t*		undo_page;
+	page_t*		page;
+	ulint		offset;
+	ulint		page_no;
+	ulint		space;
+	ulint		zip_size;
+	ulint		type;
+	ulint		cmpl_info;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(purge_sys->mutex)));
+	ut_ad(purge_sys->next_stored);
+
+	space = purge_sys->rseg->space;
+	zip_size = purge_sys->rseg->zip_size;
+	page_no = purge_sys->page_no;
+	offset = purge_sys->offset;
+
+	if (offset == 0) {
+		/* It is the dummy undo log record, which means that there is
+		no need to purge this undo log */
+
+		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+		/* Look for the next undo log and record to purge */
+
+		trx_purge_choose_next_log();
+
+		return(&trx_purge_dummy_rec);
+	}
+
+	mtr_start(&mtr);
+
+	undo_page = trx_undo_page_get_s_latched(space, zip_size,
+						page_no, &mtr);
+	rec = undo_page + offset;
+
+	rec2 = rec;
+
+	for (;;) {
+		/* Try first to find the next record which requires a purge
+		operation from the same page of the same undo log */
+
+		next_rec = trx_undo_page_get_next_rec(rec2,
+						      purge_sys->hdr_page_no,
+						      purge_sys->hdr_offset);
+		if (next_rec == NULL) {
+			rec2 = trx_undo_get_next_rec(
+				rec2, purge_sys->hdr_page_no,
+				purge_sys->hdr_offset, &mtr);
+			break;
+		}
+
+		rec2 = next_rec;
+
+		/* Stop at the first record that needs a purge operation: a
+		delete mark, a record flagged by trx_undo_rec_get_extern_storage(),
+		or an update of an existing record without UPD_NODE_NO_ORD_CHANGE */
+		type = trx_undo_rec_get_type(rec2);
+		if (type == TRX_UNDO_DEL_MARK_REC) {
+			break;
+		}
+
+		cmpl_info = trx_undo_rec_get_cmpl_info(rec2);
+		if (trx_undo_rec_get_extern_storage(rec2)) {
+			break;
+		}
+
+		if ((type == TRX_UNDO_UPD_EXIST_REC)
+		    && !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+			break;
+		}
+	}
+
+	if (rec2 == NULL) {
+		mtr_commit(&mtr);
+
+		trx_purge_rseg_get_next_history_log(purge_sys->rseg);
+
+		/* Look for the next undo log and record to purge */
+
+		trx_purge_choose_next_log();
+
+		/* Latch again the page of the record to be returned: page_no
+		and offset still hold the values read on entry */
+		mtr_start(&mtr);
+		undo_page = trx_undo_page_get_s_latched(space, zip_size,
+							page_no, &mtr);
+		rec = undo_page + offset;
+	} else {
+		page = page_align(rec2);
+		/* Remember rec2 as the next record to purge */
+		purge_sys->purge_undo_no = trx_undo_rec_get_undo_no(rec2);
+		purge_sys->page_no = page_get_page_no(page);
+		purge_sys->offset = rec2 - page;
+
+		if (undo_page != page) {
+			/* We advance to a new page of the undo log: */
+			purge_sys->n_pages_handled++;
+		}
+	}
+	/* rec still points at the record that was current on entry */
+	rec_copy = trx_undo_rec_copy(rec, heap);
+
+	mtr_commit(&mtr);
+
+	return(rec_copy);
+}
+
+/********************************************************************//**
+Hands out the next undo log record that purge should process and stores
+its transaction and undo numbers in a cell of the purge array. The cell
+must be released with trx_purge_rec_release().
+@return copy of an undo log record or pointer to trx_purge_dummy_rec,
+if the whole undo log can be skipped in purge; NULL if none left or if
+the purge batch has been stopped */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_purge_fetch_next_rec(
+/*=====================*/
+	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
+	trx_undo_inf_t** cell,	/*!< out: storage cell for the record in the
+				purge array */
+	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+{
+	trx_undo_rec_t*	rec_copy;
+
+	mutex_enter(&(purge_sys->mutex));
+
+	if (purge_sys->state == TRX_STOP_PURGE) {
+		/* The batch has already been stopped */
+		trx_purge_truncate_if_arr_empty();
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	if (!purge_sys->next_stored) {
+		/* No next record is stored: try to choose an undo log */
+		trx_purge_choose_next_log();
+	}
+
+	if (!purge_sys->next_stored) {
+		/* Nothing could be chosen: stop the batch */
+		purge_sys->state = TRX_STOP_PURGE;
+		trx_purge_truncate_if_arr_empty();
+
+		/* Optional diagnostic output */
+		if (srv_print_thread_releases) {
+			fprintf(stderr,
+				"Purge: No logs left in the"
+				" history list; pages handled %lu\n",
+				(ulong) purge_sys->n_pages_handled);
+		}
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	if (purge_sys->n_pages_handled >= purge_sys->handle_limit) {
+		/* The page limit of this batch has been reached */
+		purge_sys->state = TRX_STOP_PURGE;
+		trx_purge_truncate_if_arr_empty();
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	if (ut_dulint_cmp(purge_sys->purge_trx_no,
+			  purge_sys->view->low_limit_no) >= 0) {
+		/* purge_trx_no is not below the low_limit_no of the purge
+		view */
+		purge_sys->state = TRX_STOP_PURGE;
+		trx_purge_truncate_if_arr_empty();
+
+		mutex_exit(&(purge_sys->mutex));
+
+		return(NULL);
+	}
+
+	/* Hand out the position currently stored in purge_sys. The call of
+	trx_purge_get_next_rec() below advances purge_trx_no and
+	purge_undo_no, so they are stored in the cell first */
+	*roll_ptr = trx_undo_build_roll_ptr(FALSE, purge_sys->rseg->id,
+					    purge_sys->page_no,
+					    purge_sys->offset);
+
+	*cell = trx_purge_arr_store_info(purge_sys->purge_trx_no,
+					 purge_sys->purge_undo_no);
+
+	ut_ad(ut_dulint_cmp(purge_sys->purge_trx_no,
+			    purge_sys->view->low_limit_no) < 0);
+
+	rec_copy = trx_purge_get_next_rec(heap);
+
+	mutex_exit(&(purge_sys->mutex));
+
+	return(rec_copy);
+}
+
+/*******************************************************************//**
+Releases a reserved purge undo record by removing its storage cell from
+the purge array. */
+UNIV_INTERN
+void
+trx_purge_rec_release(
+/*==================*/
+	trx_undo_inf_t*	cell)	/*!< in: storage cell, as returned by
+				trx_purge_fetch_next_rec() */
+{
+	/* The cell is removed from the purge array while holding the
+	purge system mutex */
+	mutex_enter(&(purge_sys->mutex));
+
+	trx_purge_arr_remove_info(cell);
+
+	mutex_exit(&(purge_sys->mutex));
+}
+
+/*******************************************************************//**
+Runs a purge batch after replacing the purge view.
+@return	number of undo log pages handled in the batch */
+UNIV_INTERN
+ulint
+trx_purge(void)
+/*===========*/
+{
+	que_thr_t*	thr;
+	/*	que_thr_t*	thr2; */
+	ulint		old_pages_handled;
+
+	mutex_enter(&(purge_sys->mutex));
+
+	if (purge_sys->trx->n_active_thrs > 0) {
+
+		mutex_exit(&(purge_sys->mutex));
+
+		/* Should not happen */
+
+		ut_error;
+
+		return(0);
+	}
+
+	rw_lock_x_lock(&(purge_sys->latch));
+
+	mutex_enter(&kernel_mutex);
+
+	/* Close and free the old purge view */
+
+	read_view_close(purge_sys->view);
+	purge_sys->view = NULL;
+	mem_heap_empty(purge_sys->heap);
+
+	/* Determine how much data manipulation language (DML) statements
+	need to be delayed in order to reduce the lagging of the purge
+	thread. */
+	srv_dml_needed_delay = 0; /* in microseconds; default: no delay */
+
+	/* If we cannot advance the 'purge view' because of an old
+	'consistent read view', then the DML statements cannot be delayed.
+	Also, srv_max_purge_lag <= 0 means 'infinity'. */
+	if (srv_max_purge_lag > 0) {
+		float	ratio = (float) trx_sys->rseg_history_len
+			/ srv_max_purge_lag;
+		if (ratio > ULINT_MAX / 10000) {
+			/* Avoid overflow: maximum delay is 4295 seconds */
+			srv_dml_needed_delay = ULINT_MAX;
+		} else if (ratio > 1) {
+			/* If the history list length exceeds the
+			innodb_max_purge_lag, the
+			data manipulation statements are delayed
+			by at least 5000 microseconds. */
+			srv_dml_needed_delay = (ulint) ((ratio - .5) * 10000);
+		}
+	}
+
+	purge_sys->view = read_view_oldest_copy_or_open_new(ut_dulint_zero,
+							    purge_sys->heap);
+	mutex_exit(&kernel_mutex);
+
+	rw_lock_x_unlock(&(purge_sys->latch));
+
+	purge_sys->state = TRX_PURGE_ON;
+
+	/* Handle at most 20 * (srv_use_purge_thread + 1) undo log pages
+	in one purge batch */
+	purge_sys->handle_limit = purge_sys->n_pages_handled + 20 * (srv_use_purge_thread + 1);
+
+	old_pages_handled = purge_sys->n_pages_handled;
+
+	mutex_exit(&(purge_sys->mutex));
+
+	mutex_enter(&kernel_mutex);
+
+	thr = que_fork_start_command(purge_sys->query);
+
+	ut_ad(thr);
+
+	/*	thr2 = que_fork_start_command(purge_sys->query);
+
+	ut_ad(thr2); */
+
+
+	mutex_exit(&kernel_mutex);
+	/* Signal the worker event if purge_sys->n_worker is nonzero */
+	if (purge_sys->n_worker)
+		os_event_set(purge_sys->worker_event);
+
+	/*	srv_que_task_enqueue(thr2); */
+
+	if (srv_print_thread_releases) {
+
+		fputs("Starting purge\n", stderr);
+	}
+
+	que_run_threads(thr);
+	/* The query has been run: reset the worker event again */
+	if (purge_sys->n_worker)
+		os_event_reset(purge_sys->worker_event);
+
+	if (srv_print_thread_releases) {
+
+		fprintf(stderr,
+			"Purge ends; pages handled %lu\n",
+			(ulong) purge_sys->n_pages_handled);
+	}
+	/* NOTE(review): read without holding purge_sys->mutex */
+	return(purge_sys->n_pages_handled - old_pages_handled);
+}
+
+/**********************************************************************
+Runs one purge worker batch: starts the query stored in
+purge_sys->query_arr[worker_id] and runs it. */
+UNIV_INTERN
+void
+trx_purge_worker(
+/*=============*/
+	ulint	worker_id)	/*!< in: index into purge_sys->query_arr */
+{
+	que_thr_t*	worker_thr;
+
+	/* The query thread is started under the kernel mutex */
+	mutex_enter(&kernel_mutex);
+	worker_thr = que_fork_start_command(purge_sys->query_arr[worker_id]);
+	ut_ad(worker_thr);
+	mutex_exit(&kernel_mutex);
+
+	que_run_threads(worker_thr);
+
+	/* The state is read without a mutex here: optimistic check */
+	if (purge_sys->state == TRX_STOP_PURGE) {
+		os_event_reset(purge_sys->worker_event);
+	}
+}
+
+/**********************************************************************
+Waits on purge_sys->worker_event, the event of the worker batch. */
+UNIV_INTERN
+void
+trx_purge_worker_wait(void)
+/*=======================*/
+{
+	os_event_wait(purge_sys->worker_event);
+}
+
+/**********************************************************************
+Sets purge_sys->worker_event to wake the worker batch, if n_worker != 0 */
+UNIV_INTERN
+void
+trx_purge_worker_wake(void)
+/*=======================*/
+{
+	if (purge_sys->n_worker)
+		os_event_set(purge_sys->worker_event);
+}
+
+/******************************************************************//**
+Writes the purge view and the stored purge position to stderr. */
+UNIV_INTERN
+void
+trx_purge_sys_print(void)
+/*=====================*/
+{
+	fputs("InnoDB: Purge system view:\n", stderr);
+	read_view_print(purge_sys->view);
+
+	fprintf(stderr,
+		"InnoDB: Purge trx n:o " TRX_ID_FMT
+		", undo n:o " TRX_ID_FMT "\n",
+		TRX_ID_PREP_PRINTF(purge_sys->purge_trx_no),
+		TRX_ID_PREP_PRINTF(purge_sys->purge_undo_no));
+	/* Position fields stored in purge_sys */
+	fprintf(stderr,
+		"InnoDB: Purge next stored %lu, page_no %lu, offset %lu,\n"
+		"InnoDB: Purge hdr_page_no %lu, hdr_offset %lu\n",
+		(ulong) purge_sys->next_stored, (ulong) purge_sys->page_no,
+		(ulong) purge_sys->offset, (ulong) purge_sys->hdr_page_no,
+		(ulong) purge_sys->hdr_offset);
+}
diff --git a/storage/xtradb/trx/trx0rec.c b/storage/xtradb/trx/trx0rec.c
new file mode 100644
index 00000000000..f50e10ed756
--- /dev/null
+++ b/storage/xtradb/trx/trx0rec.c
@@ -0,0 +1,1611 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rec.c
+Transaction undo log record
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rec.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rec.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0undo.h"
+#include "mtr0log.h"
+#ifndef UNIV_HOTBACKUP
+#include "dict0dict.h"
+#include "ut0mem.h"
+#include "row0ext.h"
+#include "row0upd.h"
+#include "que0que.h"
+#include "trx0purge.h"
+#include "trx0rseg.h"
+#include "row0row.h"
+
+/*=========== UNDO LOG RECORD CREATION AND DECODING ====================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of the inserted undo log record on the undo log
+page. The 2-byte offsets at the start and at the end of the record are
+not included in the log entry. */
+UNIV_INLINE
+void
+trx_undof_page_add_undo_rec_log(
+/*============================*/
+	page_t* undo_page,	/*!< in: undo log page */
+	ulint	old_free,	/*!< in: start offset of the inserted entry */
+	ulint	new_free,	/*!< in: end offset of the entry */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	const ulint	open_size = 11 + 13 + MLOG_BUF_MARGIN;
+	const ulint	len = new_free - old_free - 4;
+	byte*		body = undo_page + old_free + 2;
+	byte*		log_ptr = mlog_open(mtr, open_size);
+	const byte*	log_end;
+
+	if (log_ptr == NULL) {
+		return;
+	}
+
+	log_end = log_ptr + open_size;
+	log_ptr = mlog_write_initial_log_record_fast(
+		undo_page, MLOG_UNDO_INSERT, log_ptr, mtr);
+
+	mach_write_to_2(log_ptr, len);
+	log_ptr += 2;
+
+	if (log_ptr + len > log_end) {
+		/* The body does not fit in the opened buffer */
+		mlog_close(mtr, log_ptr);
+		mlog_catenate_string(mtr, body, len);
+	} else {
+		memcpy(log_ptr, body, len);
+		mlog_close(mtr, log_ptr + len);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses a redo log record of adding an undo log record and, if a page is
+given, applies the record to the page.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_add_undo_rec(
+/*========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page)	/*!< in: page or NULL */
+{
+	ulint	len;
+	ulint	first_free;
+	ulint	new_free;
+	byte*	free_field;
+	byte*	rec;
+
+	if (end_ptr < ptr + 2) {
+		/* The 2-byte length field is incomplete */
+		return(NULL);
+	}
+
+	len = mach_read_from_2(ptr);
+	ptr += 2;
+
+	if (end_ptr < ptr + len) {
+		/* The record body is incomplete */
+		return(NULL);
+	}
+
+	if (page != NULL) {
+		free_field = page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
+		first_free = mach_read_from_2(free_field);
+		new_free = first_free + 4 + len;
+		rec = page + first_free;
+		/* Write the 2-byte offsets that enclose the record, move
+		the first free offset of the page and copy the body */
+		mach_write_to_2(rec, new_free);
+		mach_write_to_2(rec + 2 + len, first_free);
+		mach_write_to_2(free_field, new_free);
+		ut_memcpy(rec + 2, ptr, len);
+	}
+
+	return(ptr + len);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Calculates the free space left for extending an undo log record.
+@return	bytes left */
+UNIV_INLINE
+ulint
+trx_undo_left(
+/*==========*/
+	const page_t*	page,	/*!< in: undo log page */
+	const byte*	ptr)	/*!< in: current write position in page */
+{
+	/* Free bytes from ptr to the page end, less FIL_PAGE_DATA_END and a
+	10-byte safety margin against small calculation errors in callers.
+	NOTE(review): the ulint result wraps if ptr is inside that reserve */
+	return(UNIV_PAGE_SIZE - (ptr - page) - 10 - FIL_PAGE_DATA_END);
+}
+
+/**********************************************************************//**
+Finishes an undo log record whose contents have been written up to ptr:
+stores the 2-byte offsets at both ends of the record, advances the first
+free offset of the page and writes the redo log entry.
+@return	offset of the inserted entry on the page if succeeded, 0 if fail */
+static
+ulint
+trx_undo_page_set_next_prev_and_add(
+/*================================*/
+	page_t*		undo_page,	/*!< in/out: undo log page */
+	byte*		ptr,		/*!< in: ptr up to where data has been
+					written on this undo page. */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	byte*		free_field;	/* field in the page header that
+					holds the first free offset */
+	ulint		rec_start;	/* offset of the record start */
+	ulint		rec_end;	/* offset of the first byte after
+					the record */
+
+	ut_ad(ptr > undo_page);
+	ut_ad(ptr < undo_page + UNIV_PAGE_SIZE);
+
+	if (UNIV_UNLIKELY(trx_undo_left(undo_page, ptr) < 2)) {
+		/* No room for the trailing 2-byte offset */
+		return(0);
+	}
+
+	/* The record starts at the first free offset of the page, which
+	is read from the page header */
+	free_field = undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE;
+	rec_start = mach_read_from_2(free_field);
+
+	/* The last 2 bytes of the record hold the offset of its start */
+	mach_write_to_2(ptr, rec_start);
+
+	rec_end = ptr + 2 - undo_page;
+
+	/* The first 2 bytes of the record hold the offset of its end,
+	which is where the next record will start */
+	mach_write_to_2(undo_page + rec_start, rec_end);
+
+	/* The free space of the page now starts after this record */
+	mach_write_to_2(free_field, rec_end);
+
+	/* Write the mtr log entry of the new record */
+	trx_undof_page_add_undo_rec_log(undo_page, rec_start,
+					rec_end, mtr);
+
+	return(rec_start);
+}
+
+/**********************************************************************//**
+Writes to the undo log page the record of an insert into a clustered index.
+@return	offset of the inserted entry on the page if succeed, 0 if fail */
+static
+ulint
+trx_undo_page_report_insert(
+/*========================*/
+	page_t*		undo_page,	/*!< in: undo log page */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: index entry which will be
+					inserted to the clustered index */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint		free_offset;
+	ulint		fld;
+	byte*		pos;
+
+	ut_ad(dict_index_is_clust(index));
+	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT);
+
+	free_offset = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				       + TRX_UNDO_PAGE_FREE);
+	ut_ad(free_offset <= UNIV_PAGE_SIZE);
+
+	pos = undo_page + free_offset;
+
+	if (trx_undo_left(undo_page, pos) < 2 + 1 + 11 + 11) {
+		/* The general parameters would not fit */
+		return(0);
+	}
+
+	/* The first 2 bytes are left for the offset of the next undo log
+	record; the record type follows them */
+	pos[2] = TRX_UNDO_INSERT_REC;
+	pos += 3;
+
+	/* General parameters: undo number and table id */
+	pos += mach_dulint_write_much_compressed(pos, trx->undo_no);
+	pos += mach_dulint_write_much_compressed(pos, index->table->id);
+
+	/*----------------------------------------*/
+	/* The unique fields of the entry identify the record that is
+	inserted in the clustered index */
+
+	for (fld = 0; fld < dict_index_get_n_unique(index); fld++) {
+		const dfield_t*	field = dtuple_get_nth_field(clust_entry, fld);
+		ulint		len = dfield_get_len(field);
+
+		if (trx_undo_left(undo_page, pos) < 5) {
+			return(0);
+		}
+
+		pos += mach_write_compressed(pos, len);
+
+		if (len == UNIV_SQL_NULL) {
+			/* An SQL NULL has no data bytes */
+			continue;
+		}
+
+		if (trx_undo_left(undo_page, pos) < len) {
+			return(0);
+		}
+
+		ut_memcpy(pos, dfield_get_data(field), len);
+		pos += len;
+	}
+
+	return(trx_undo_page_set_next_prev_and_add(undo_page, pos, mtr));
+}
+
+/**********************************************************************//**
+Decodes the general parameters at the start of an undo log record.
+@return	remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_pars(
+/*==================*/
+	trx_undo_rec_t*	undo_rec,	/*!< in: undo log record */
+	ulint*		type,		/*!< out: undo record type:
+					TRX_UNDO_INSERT_REC, ... */
+	ulint*		cmpl_info,	/*!< out: compiler info, relevant only
+					for update type records */
+	ibool*		updated_extern,	/*!< out: TRUE if we updated an
+					externally stored field */
+	undo_no_t*	undo_no,	/*!< out: undo log record number */
+	dulint*		table_id)	/*!< out: table id */
+{
+	byte*		pos;
+	ulint		flags;
+
+	/* Skip the 2-byte offset that starts the record and read the
+	byte that combines the type, the compiler info and the extern flag */
+	pos = undo_rec + 2;
+	flags = mach_read_from_1(pos);
+	pos++;
+
+	if ((flags & TRX_UNDO_UPD_EXTERN) == 0) {
+		*updated_extern = FALSE;
+	} else {
+		*updated_extern = TRUE;
+		flags -= TRX_UNDO_UPD_EXTERN;
+	}
+
+	*type = flags & (TRX_UNDO_CMPL_INFO_MULT - 1);
+	*cmpl_info = flags / TRX_UNDO_CMPL_INFO_MULT;
+
+	*undo_no = mach_dulint_read_much_compressed(pos);
+	pos += mach_dulint_get_much_compressed_size(*undo_no);
+
+	*table_id = mach_dulint_read_much_compressed(pos);
+
+	return(pos + mach_dulint_get_much_compressed_size(*table_id));
+}
+
+/**********************************************************************//**
+Reads a stored column value from an undo log record without copying it.
+@return	remaining part of undo log record after reading these values */
+static
+byte*
+trx_undo_rec_get_col_val(
+/*=====================*/
+	byte*	ptr,	/*!< in: pointer to remaining part of undo log record */
+	byte**	field,	/*!< out: pointer to stored field */
+	ulint*	len,	/*!< out: length of the field, or UNIV_SQL_NULL */
+	ulint*	orig_len)/*!< out: original length of the locally
+			stored part of an externally stored column, or 0 */
+{
+	*len = mach_read_compressed(ptr);
+	ptr += mach_get_compressed_size(*len);
+
+	*orig_len = 0;
+	/* The first compressed integer selects one of three formats */
+	switch (*len) {
+	case UNIV_SQL_NULL:
+		*field = NULL;
+		break;
+	case UNIV_EXTERN_STORAGE_FIELD:	/* then orig_len, len, data */
+		*orig_len = mach_read_compressed(ptr);
+		ptr += mach_get_compressed_size(*orig_len);
+		*len = mach_read_compressed(ptr);
+		ptr += mach_get_compressed_size(*len);
+		*field = ptr;
+		ptr += *len;
+
+		ut_ad(*orig_len >= BTR_EXTERN_FIELD_REF_SIZE);
+		ut_ad(*len > *orig_len);
+		/* @see dtuple_convert_big_rec() */
+		ut_ad(*len >= BTR_EXTERN_FIELD_REF_SIZE * 2);
+		/* we do not have access to index->table here
+		ut_ad(dict_table_get_format(index->table) >= DICT_TF_FORMAT_ZIP
+		      || *len >= REC_MAX_INDEX_COL_LEN
+		      + BTR_EXTERN_FIELD_REF_SIZE);
+		*/
+		/* The returned len has UNIV_EXTERN_STORAGE_FIELD added */
+		*len += UNIV_EXTERN_STORAGE_FIELD;
+		break;
+	default:	/* plain data; len may exceed UNIV_EXTERN_STORAGE_FIELD */
+		*field = ptr;
+		if (*len >= UNIV_EXTERN_STORAGE_FIELD) {
+			ptr += *len - UNIV_EXTERN_STORAGE_FIELD;
+		} else {
+			ptr += *len;
+		}
+	}
+
+	return(ptr);
+}
+
+/*******************************************************************//**
+Builds a row reference from an undo log record. The fields of the
+reference point into the undo log record; no data is copied.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_row_ref(
+/*=====================*/
+	byte*		ptr,	/*!< in: remaining part of a copy of an undo log
+				record, at the start of the row reference;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the row reference is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	ref,	/*!< out, own: row reference */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+{
+	ulint		n_fields;
+	ulint		n;
+
+	ut_ad(index && ptr && ref && heap);
+	ut_a(dict_index_is_clust(index));
+
+	n_fields = dict_index_get_n_unique(index);
+	*ref = dtuple_create(heap, n_fields);
+	dict_index_copy_types(*ref, index, n_fields);
+
+	/* Decode one stored column value per field and let the field
+	point at it */
+	for (n = 0; n < n_fields; n++) {
+		dfield_t*	dfield = dtuple_get_nth_field(*ref, n);
+		byte*		data;
+		ulint		data_len;
+		ulint		orig_len;	/* not needed here */
+
+		ptr = trx_undo_rec_get_col_val(ptr, &data, &data_len,
+					       &orig_len);
+
+		dfield_set_data(dfield, data, data_len);
+	}
+
+	return(ptr);
+}
+
+/*******************************************************************//**
+Steps over the row reference in an undo log record without building it.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_skip_row_ref(
+/*======================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, at the start of the row reference */
+	dict_index_t*	index)	/*!< in: clustered index */
+{
+	ulint	n_left;
+
+	ut_ad(index && ptr);
+	ut_a(dict_index_is_clust(index));
+
+	/* One stored column value per unique field of the index */
+	n_left = dict_index_get_n_unique(index);
+	while (n_left > 0) {
+		byte*	field;
+		ulint	len;
+		ulint	orig_len;
+
+		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+		n_left--;
+	}
+
+	return(ptr);
+}
+
+/**********************************************************************//**
+Copies into ext_buf a prefix of an externally stored column, followed by
+the BLOB pointer of the column, for writing to the undo log.
+@return	ext_buf */
+static
+byte*
+trx_undo_page_fetch_ext(
+/*====================*/
+	byte*		ext_buf,	/*!< in: a buffer of
+					REC_MAX_INDEX_COL_LEN
+					+ BTR_EXTERN_FIELD_REF_SIZE */
+	ulint		zip_size,	/*!< compressed page size in bytes,
+					or 0 for uncompressed BLOB  */
+	const byte*	field,		/*!< in: an externally stored column */
+	ulint*		len)		/*!< in: length of field;
+					out: used length of ext_buf */
+{
+	const byte*	blob_ref = field + *len - BTR_EXTERN_FIELD_REF_SIZE;
+	ulint		prefix_len;
+
+	prefix_len = btr_copy_externally_stored_field_prefix(
+		ext_buf, REC_MAX_INDEX_COL_LEN, zip_size, field, *len);
+	/* BLOBs should always be nonempty. */
+	ut_a(prefix_len);
+	/* The BLOB pointer follows the prefix in ext_buf. */
+	memcpy(ext_buf + prefix_len, blob_ref, BTR_EXTERN_FIELD_REF_SIZE);
+	*len = prefix_len + BTR_EXTERN_FIELD_REF_SIZE;
+	return(ext_buf);
+}
+
+/**********************************************************************//**
+Writes to the undo log the length header of an externally stored column
+and, if ext_buf is given, replaces *field by a longer prefix of it.
+@return	undo log position */
+static
+byte*
+trx_undo_page_report_modify_ext(
+/*============================*/
+	byte*		ptr,		/*!< in: undo log position,
+					at least 15 bytes must be available */
+	byte*		ext_buf,	/*!< in: a buffer of
+					REC_MAX_INDEX_COL_LEN
+					+ BTR_EXTERN_FIELD_REF_SIZE,
+					or NULL when should not fetch
+					a longer prefix */
+	ulint		zip_size,	/*!< compressed page size in bytes,
+					or 0 for uncompressed BLOB  */
+	const byte**	field,		/*!< in/out: the locally stored part of
+					the externally stored column */
+	ulint*		len)		/*!< in/out: length of field, in bytes */
+{
+	if (ext_buf == NULL) {
+		/* No longer prefix is wanted: write the length with
+		UNIV_EXTERN_STORAGE_FIELD added to it */
+		return(ptr + mach_write_compressed(
+			       ptr, UNIV_EXTERN_STORAGE_FIELD + *len));
+	}
+
+	/* An ordering column is stored externally, so a longer prefix
+	has to be logged: write a marker, the original length and,
+	once the prefix has been fetched, its real length */
+	ptr += mach_write_compressed(ptr, UNIV_EXTERN_STORAGE_FIELD);
+	ptr += mach_write_compressed(ptr, *len);
+
+	/* This also updates *len to the length of the fetched data */
+	*field = trx_undo_page_fetch_ext(ext_buf, zip_size,
+					 *field, len);
+
+	return(ptr + mach_write_compressed(ptr, *len));
+}
+
+/**********************************************************************//**
+Reports in the undo log of an update or delete marking of a clustered index
+record.
+@return byte offset of the inserted undo log entry on the page if
+succeed, 0 if fail */
+static
+ulint
+trx_undo_page_report_modify(
+/*========================*/
+	page_t*		undo_page,	/*!< in: undo log page */
+	trx_t*		trx,		/*!< in: transaction */
+	dict_index_t*	index,		/*!< in: clustered index where update or
+					delete marking is done */
+	const rec_t*	rec,		/*!< in: clustered index record which
+					has NOT yet been modified */
+	const ulint*	offsets,	/*!< in: rec_get_offsets(rec, index) */
+	const upd_t*	update,		/*!< in: update vector which tells the
+					columns to be updated; in the case of
+					a delete, this should be set to NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	dict_table_t*	table;
+	ulint		first_free;
+	byte*		ptr;
+	const byte*	field;
+	ulint		flen;
+	ulint		col_no;
+	ulint		type_cmpl;
+	byte*		type_cmpl_ptr;
+	ulint		i;
+	trx_id_t	trx_id;
+	ibool		ignore_prefix = FALSE;
+	byte		ext_buf[REC_MAX_INDEX_COL_LEN
+				+ BTR_EXTERN_FIELD_REF_SIZE];
+
+	ut_a(dict_index_is_clust(index));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+	ut_ad(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+			       + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_UPDATE);
+	table = index->table;
+
+	first_free = mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				      + TRX_UNDO_PAGE_FREE);
+	ptr = undo_page + first_free;
+
+	ut_ad(first_free <= UNIV_PAGE_SIZE);
+
+	if (trx_undo_left(undo_page, ptr) < 50) {
+
+		/* NOTE: the value 50 must be big enough so that the general
+		fields written below fit on the undo log page */
+
+		return(0);
+	}
+
+	/* Reserve 2 bytes for the pointer to the next undo log record */
+	ptr += 2;
+
+	/* Store first some general parameters to the undo log */
+
+	if (!update) {
+		type_cmpl = TRX_UNDO_DEL_MARK_REC;
+	} else if (rec_get_deleted_flag(rec, dict_table_is_comp(table))) {
+		type_cmpl = TRX_UNDO_UPD_DEL_REC;
+		/* We are about to update a delete marked record.
+		We don't typically need the prefix in this case unless
+		the delete marking is done by the same transaction
+		(which we check below). */
+		ignore_prefix = TRUE;
+	} else {
+		type_cmpl = TRX_UNDO_UPD_EXIST_REC;
+	}
+
+	type_cmpl |= cmpl_info * TRX_UNDO_CMPL_INFO_MULT;
+	type_cmpl_ptr = ptr;
+
+	*ptr++ = (byte) type_cmpl;
+	ptr += mach_dulint_write_much_compressed(ptr, trx->undo_no);
+
+	ptr += mach_dulint_write_much_compressed(ptr, table->id);
+
+	/*----------------------------------------*/
+	/* Store the state of the info bits */
+
+	*ptr++ = (byte) rec_get_info_bits(rec, dict_table_is_comp(table));
+
+	/* Store the values of the system columns */
+	field = rec_get_nth_field(rec, offsets,
+				  dict_index_get_sys_col_pos(
+					  index, DATA_TRX_ID), &flen);
+	ut_ad(flen == DATA_TRX_ID_LEN);
+
+	trx_id = trx_read_trx_id(field);
+
+	/* If it is an update of a delete marked record, then we are
+	allowed to ignore blob prefixes if the delete marking was done
+	by some other trx as it must have committed by now for us to
+	allow an over-write. */
+	if (ignore_prefix) {
+		ignore_prefix = ut_dulint_cmp(trx_id, trx->id) != 0;
+	}
+	ptr += mach_dulint_write_compressed(ptr, trx_id);
+
+	field = rec_get_nth_field(rec, offsets,
+				  dict_index_get_sys_col_pos(
+					  index, DATA_ROLL_PTR), &flen);
+	ut_ad(flen == DATA_ROLL_PTR_LEN);
+
+	ptr += mach_dulint_write_compressed(ptr, trx_read_roll_ptr(field));
+
+	/*----------------------------------------*/
+	/* Store then the fields required to uniquely determine the
+	record which will be modified in the clustered index */
+
+	for (i = 0; i < dict_index_get_n_unique(index); i++) {
+
+		field = rec_get_nth_field(rec, offsets, i, &flen);
+
+		/* The ordering columns must not be stored externally. */
+		ut_ad(!rec_offs_nth_extern(offsets, i));
+		ut_ad(dict_index_get_nth_col(index, i)->ord_part);
+
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		ptr += mach_write_compressed(ptr, flen);
+
+		if (flen != UNIV_SQL_NULL) {
+			if (trx_undo_left(undo_page, ptr) < flen) {
+
+				return(0);
+			}
+
+			ut_memcpy(ptr, field, flen);
+			ptr += flen;
+		}
+	}
+
+	/*----------------------------------------*/
+	/* Save to the undo log the old values of the columns to be updated. */
+
+	if (update) {
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		ptr += mach_write_compressed(ptr, upd_get_n_fields(update));
+
+		for (i = 0; i < upd_get_n_fields(update); i++) {
+
+			ulint	pos = upd_get_nth_field(update, i)->field_no;
+
+			/* Write field number to undo log */
+			if (trx_undo_left(undo_page, ptr) < 5) {
+
+				return(0);
+			}
+
+			ptr += mach_write_compressed(ptr, pos);
+
+			/* Save the old value of field */
+			field = rec_get_nth_field(rec, offsets, pos, &flen);
+
+			if (trx_undo_left(undo_page, ptr) < 15) {
+
+				return(0);
+			}
+
+			if (rec_offs_nth_extern(offsets, pos)) {
+				ptr = trx_undo_page_report_modify_ext(
+					ptr,
+					dict_index_get_nth_col(index, pos)
+					->ord_part
+					&& !ignore_prefix
+					&& flen < REC_MAX_INDEX_COL_LEN
+					? ext_buf : NULL,
+					dict_table_zip_size(table),
+					&field, &flen);
+
+				/* Notify purge that it eventually has to
+				free the old externally stored field */
+
+				trx->update_undo->del_marks = TRUE;
+
+				*type_cmpl_ptr |= TRX_UNDO_UPD_EXTERN;
+			} else {
+				ptr += mach_write_compressed(ptr, flen);
+			}
+
+			if (flen != UNIV_SQL_NULL) {
+				if (trx_undo_left(undo_page, ptr) < flen) {
+
+					return(0);
+				}
+
+				ut_memcpy(ptr, field, flen);
+				ptr += flen;
+			}
+		}
+	}
+
+	/*----------------------------------------*/
+	/* In the case of a delete marking, and also in the case of an update
+	where any ordering field of any index changes, store the values of all
+	columns which occur as ordering fields in any index. This info is used
+	in the purge of old versions where we use it to build and search the
+	delete marked index records, to look if we can remove them from the
+	index tree. Note that starting from 4.0.14 also externally stored
+	fields can be ordering in some index. Starting from 5.2, we no longer
+	store REC_MAX_INDEX_COL_LEN first bytes to the undo log record,
+	but we can construct the column prefix fields in the index by
+	fetching the first page of the BLOB that is pointed to by the
+	clustered index. This works also in crash recovery, because all pages
+	(including BLOBs) are recovered before anything is rolled back. */
+
+	if (!update || !(cmpl_info & UPD_NODE_NO_ORD_CHANGE)) {
+		byte*	old_ptr = ptr;
+
+		trx->update_undo->del_marks = TRUE;
+
+		if (trx_undo_left(undo_page, ptr) < 5) {
+
+			return(0);
+		}
+
+		/* Reserve 2 bytes to write the number of bytes the stored
+		fields take in this undo record */
+
+		ptr += 2;
+
+		for (col_no = 0; col_no < dict_table_get_n_cols(table);
+		     col_no++) {
+
+			const dict_col_t*	col
+				= dict_table_get_nth_col(table, col_no);
+
+			if (col->ord_part) {
+				ulint	pos;
+
+				/* Write field number to undo log */
+				if (trx_undo_left(undo_page, ptr) < 5 + 15) {
+
+					return(0);
+				}
+
+				pos = dict_index_get_nth_col_pos(index,
+								 col_no);
+				ptr += mach_write_compressed(ptr, pos);
+
+				/* Save the old value of field */
+				field = rec_get_nth_field(rec, offsets, pos,
+							  &flen);
+
+				if (rec_offs_nth_extern(offsets, pos)) {
+					ptr = trx_undo_page_report_modify_ext(
+						ptr,
+						flen < REC_MAX_INDEX_COL_LEN
+						&& !ignore_prefix
+						? ext_buf : NULL,
+						dict_table_zip_size(table),
+						&field, &flen);
+				} else {
+					ptr += mach_write_compressed(
+						ptr, flen);
+				}
+
+				if (flen != UNIV_SQL_NULL) {
+					if (trx_undo_left(undo_page, ptr)
+					    < flen) {
+
+						return(0);
+					}
+
+					ut_memcpy(ptr, field, flen);
+					ptr += flen;
+				}
+			}
+		}
+
+		mach_write_to_2(old_ptr, ptr - old_ptr);
+	}
+
+	/*----------------------------------------*/
+	/* Write pointers to the previous and the next undo log records */
+	if (trx_undo_left(undo_page, ptr) < 2) {
+
+		return(0);
+	}
+
+	mach_write_to_2(ptr, first_free);
+	ptr += 2;
+	mach_write_to_2(undo_page + first_free, ptr - undo_page);
+
+	mach_write_to_2(undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE,
+			ptr - undo_page);
+
+	/* Write to the REDO log about this change in the UNDO log */
+
+	trx_undof_page_add_undo_rec_log(undo_page, first_free,
+					ptr - undo_page, mtr);
+	return(first_free);
+}
+
+/**********************************************************************//**
+Reads the old version's info bits, trx id and roll ptr from an undo record.
+@return	remaining part of undo log record after reading these values */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_sys_cols(
+/*=============================*/
+	byte*		ptr,		/*!< in: remaining part of undo
+					log record after reading
+					general parameters */
+	trx_id_t*	trx_id,		/*!< out: trx id */
+	roll_ptr_t*	roll_ptr,	/*!< out: roll ptr */
+	ulint*		info_bits)	/*!< out: info bits state */
+{
+	byte*	cursor = ptr;
+
+	/* One byte holds the state of the info bits */
+	*info_bits = mach_read_from_1(cursor);
+	cursor++;
+
+	/* The trx id and then the roll ptr follow in compressed form */
+	*trx_id = mach_dulint_read_compressed(cursor);
+	cursor += mach_dulint_get_compressed_size(*trx_id);
+
+	*roll_ptr = mach_dulint_read_compressed(cursor);
+	cursor += mach_dulint_get_compressed_size(*roll_ptr);
+
+	return(cursor);
+}
+
+/**********************************************************************//**
+Reads the compressed count of updated fields from an update undo log record.
+@return	remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_n_upd_fields(
+/*=================================*/
+	byte*	ptr,	/*!< in: pointer to remaining part of undo log record */
+	ulint*	n)	/*!< out: number of fields */
+{
+	ulint	n_upd = mach_read_compressed(ptr);
+
+	*n = n_upd;
+	return(ptr + mach_get_compressed_size(n_upd));
+}
+
+/**********************************************************************//**
+Reads a compressed field number stored in an update undo log record.
+@return	remaining part of undo log record after reading this value */
+UNIV_INLINE
+byte*
+trx_undo_update_rec_get_field_no(
+/*=============================*/
+	byte*	ptr,	/*!< in: pointer to remaining part of undo log record */
+	ulint*	field_no)/*!< out: field number */
+{
+	ulint	no = mach_read_compressed(ptr);
+
+	*field_no = no;
+	return(ptr + mach_get_compressed_size(no));
+}
+
+/*******************************************************************//**
+Builds an update vector from the remaining part of an update undo log
+record: the logged columns, then the trx id and roll ptr system columns.
+@return remaining part of the record, or NULL if the record is corrupted */
+UNIV_INTERN
+byte*
+trx_undo_update_rec_get_update(
+/*===========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record, after reading the row reference
+				NOTE that this copy of the undo log record must
+				be preserved as long as the update vector is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint		type,	/*!< in: TRX_UNDO_UPD_EXIST_REC,
+				TRX_UNDO_UPD_DEL_REC, or
+				TRX_UNDO_DEL_MARK_REC; in the last case,
+				only trx id and roll ptr fields are added to
+				the update vector */
+	trx_id_t	trx_id,	/*!< in: transaction id from this undo record */
+	roll_ptr_t	roll_ptr,/*!< in: roll pointer from this undo record */
+	ulint		info_bits,/*!< in: info bits from this undo record */
+	trx_t*		trx,	/*!< in: transaction; callers may pass NULL */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	upd_t**		upd)	/*!< out, own: update vector; NULL on error */
+{
+	upd_field_t*	upd_field;
+	upd_t*		update;
+	ulint		n_fields;
+	byte*		buf;
+	ulint		i;
+
+	ut_a(dict_index_is_clust(index));
+
+	if (type != TRX_UNDO_DEL_MARK_REC) {
+		ptr = trx_undo_update_rec_get_n_upd_fields(ptr, &n_fields);
+	} else {
+		n_fields = 0;
+	}
+
+	update = upd_create(n_fields + 2, heap);
+
+	update->info_bits = info_bits;
+
+	/* The trx id and roll ptr go to the last two slots of the vector */
+
+	upd_field = upd_get_nth_field(update, n_fields);
+	buf = mem_heap_alloc(heap, DATA_TRX_ID_LEN);
+	trx_write_trx_id(buf, trx_id);
+
+	upd_field_set_field_no(upd_field,
+			       dict_index_get_sys_col_pos(index, DATA_TRX_ID),
+			       index, trx);
+	dfield_set_data(&(upd_field->new_val), buf, DATA_TRX_ID_LEN);
+
+	upd_field = upd_get_nth_field(update, n_fields + 1);
+	buf = mem_heap_alloc(heap, DATA_ROLL_PTR_LEN);
+	trx_write_roll_ptr(buf, roll_ptr);
+
+	upd_field_set_field_no(
+		upd_field, dict_index_get_sys_col_pos(index, DATA_ROLL_PTR),
+		index, trx);
+	dfield_set_data(&(upd_field->new_val), buf, DATA_ROLL_PTR_LEN);
+
+	/* Slots 0 .. n_fields - 1 receive the logged ordinary columns */
+
+	for (i = 0; i < n_fields; i++) {
+
+		byte*	field;
+		ulint	len;
+		ulint	field_no;
+		ulint	orig_len;
+
+		ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+		/* An out-of-range field number means a corrupted record */
+		if (field_no >= dict_index_get_n_fields(index)) {
+			fprintf(stderr,
+				"InnoDB: Error: trying to access"
+				" update undo rec field %lu in ",
+				(ulong) field_no);
+			dict_index_name_print(stderr, trx, index);
+			fprintf(stderr, "\n"
+				"InnoDB: but index has only %lu fields\n"
+				"InnoDB: Submit a detailed bug report"
+				" to http://bugs.mysql.com\n"
+				"InnoDB: Run also CHECK TABLE ",
+				(ulong) dict_index_get_n_fields(index));
+			ut_print_name(stderr, trx, TRUE, index->table_name);
+			fprintf(stderr, "\n"
+				"InnoDB: n_fields = %lu, i = %lu, ptr %p\n",
+				(ulong) n_fields, (ulong) i, (void*) ptr);
+			*upd = NULL;
+			return(NULL);
+		}
+
+		upd_field = upd_get_nth_field(update, i);
+
+		upd_field_set_field_no(upd_field, field_no, index, trx);
+
+		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+
+		upd_field->orig_len = orig_len;
+
+		if (len == UNIV_SQL_NULL) {
+			dfield_set_null(&upd_field->new_val);
+		} else if (len < UNIV_EXTERN_STORAGE_FIELD) {
+			dfield_set_data(&upd_field->new_val, field, len);
+		} else {
+			len -= UNIV_EXTERN_STORAGE_FIELD;
+			/* extern marker removed from len; flag the field */
+			dfield_set_data(&upd_field->new_val, field, len);
+			dfield_set_ext(&upd_field->new_val);
+		}
+	}
+
+	*upd = update;
+
+	return(ptr);
+}
+
+/*******************************************************************//**
+Builds a partial row from an update undo log record. It contains the
+columns which occur as ordering in any index of the table.
+@return	pointer to remaining part of undo record */
+UNIV_INTERN
+byte*
+trx_undo_rec_get_partial_row(
+/*=========================*/
+	byte*		ptr,	/*!< in: remaining part in update undo log
+				record of a suitable type, at the start of
+				the stored index columns;
+				NOTE that this copy of the undo log record must
+				be preserved as long as the partial row is
+				used, as we do NOT copy the data in the
+				record! */
+	dict_index_t*	index,	/*!< in: clustered index */
+	dtuple_t**	row,	/*!< out, own: partial row */
+	ibool		ignore_prefix, /*!< in: flag to indicate if we
+				expect blob prefixes in undo. Used
+				only in the assertion. */
+	mem_heap_t*	heap)	/*!< in: memory heap from which the memory
+				needed is allocated */
+{
+	const byte*	end_ptr;
+	ulint		row_len;
+
+	ut_ad(index);
+	ut_ad(ptr);
+	ut_ad(row);
+	ut_ad(heap);
+	ut_ad(dict_index_is_clust(index));
+
+	row_len = dict_table_get_n_cols(index->table);
+
+	*row = dtuple_create(heap, row_len);
+
+	dict_table_copy_types(*row, index->table);
+	/* A 2-byte length, counted from ptr, delimits the stored columns */
+	end_ptr = ptr + mach_read_from_2(ptr);
+	ptr += 2;
+	/* NOTE(review): the loop ends only when ptr hits end_ptr exactly */
+	while (ptr != end_ptr) {
+		dfield_t*		dfield;
+		byte*			field;
+		ulint			field_no;
+		const dict_col_t*	col;
+		ulint			col_no;
+		ulint			len;
+		ulint			orig_len;
+
+		ptr = trx_undo_update_rec_get_field_no(ptr, &field_no);
+
+		col = dict_index_get_nth_col(index, field_no);
+		col_no = dict_col_get_no(col);
+
+		ptr = trx_undo_rec_get_col_val(ptr, &field, &len, &orig_len);
+		/* The row is addressed by table column number */
+		dfield = dtuple_get_nth_field(*row, col_no);
+
+		dfield_set_data(dfield, field, len);
+		/* len >= UNIV_EXTERN_STORAGE_FIELD marks an external column */
+		if (len != UNIV_SQL_NULL
+		    && len >= UNIV_EXTERN_STORAGE_FIELD) {
+			dfield_set_len(dfield,
+				       len - UNIV_EXTERN_STORAGE_FIELD);
+			dfield_set_ext(dfield);
+			/* If the prefix of this column is indexed,
+			ensure that enough prefix is stored in the
+			undo log record. */
+			if (!ignore_prefix && col->ord_part) {
+				ut_a(dfield_get_len(dfield)
+				     >= 2 * BTR_EXTERN_FIELD_REF_SIZE);
+				ut_a(dict_table_get_format(index->table)
+				     >= DICT_TF_FORMAT_ZIP
+				     || dfield_get_len(dfield)
+				     >= REC_MAX_INDEX_COL_LEN
+				     + BTR_EXTERN_FIELD_REF_SIZE);
+			}
+		}
+	}
+
+	return(ptr);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************************//**
+Fills the unused end of an undo log page with 0xff and logs the erase. */
+static
+void
+trx_undo_erase_page_end(
+/*====================*/
+	page_t*	undo_page,	/*!< in: undo page whose end to erase */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	const ulint	free_offs = mach_read_from_2(
+		undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_FREE);
+	const ulint	data_end = UNIV_PAGE_SIZE - FIL_PAGE_DATA_END;
+
+	/* Overwrite the bytes in [free_offs, data_end) */
+	memset(undo_page + free_offs, 0xff, data_end - free_offs);
+
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_ERASE_END, mtr);
+}
+
+/***********************************************************//**
+Parses a redo log record of erasing of an undo page end.
+@return	end of log record; this implementation returns ptr unchanged */
+UNIV_INTERN
+byte*
+trx_undo_parse_erase_page_end(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr __attribute__((unused)), /*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+{
+	ut_ad(ptr && end_ptr);
+
+	/* Nothing is read from the buffer; when a page is given,
+	the erase is replayed on it */
+
+	if (page != NULL) {
+		trx_undo_erase_page_end(page, mtr);
+	}
+
+	return(ptr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Writes information to an undo log about an insert, update, or a delete marking
+of a clustered index record. This information is used in a rollback of the
+transaction and in consistent reads that must look to the history of this
+transaction.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_undo_report_row_operation(
+/*==========================*/
+	ulint		flags,		/*!< in: if BTR_NO_UNDO_LOG_FLAG bit is
+					set, does nothing */
+	ulint		op_type,	/*!< in: TRX_UNDO_INSERT_OP or
+					TRX_UNDO_MODIFY_OP */
+	que_thr_t*	thr,		/*!< in: query thread */
+	dict_index_t*	index,		/*!< in: clustered index */
+	const dtuple_t*	clust_entry,	/*!< in: in the case of an insert,
+					index entry to insert into the
+					clustered index, otherwise NULL */
+	const upd_t*	update,		/*!< in: in the case of an update,
+					the update vector, otherwise NULL */
+	ulint		cmpl_info,	/*!< in: compiler info on secondary
+					index updates */
+	const rec_t*	rec,		/*!< in: in case of an update or delete
+					marking, the record in the clustered
+					index, otherwise NULL */
+	roll_ptr_t*	roll_ptr)	/*!< out: rollback pointer to the
+					inserted undo log record,
+					ut_dulint_zero if BTR_NO_UNDO_LOG
+					flag was specified */
+{
+	trx_t*		trx;
+	trx_undo_t*	undo;
+	ulint		page_no;
+	trx_rseg_t*	rseg;
+	mtr_t		mtr;
+	ulint		err		= DB_SUCCESS;
+	mem_heap_t*	heap		= NULL;
+	ulint		offsets_[REC_OFFS_NORMAL_SIZE];
+	ulint*		offsets		= offsets_;
+	rec_offs_init(offsets_);
+
+	ut_a(dict_index_is_clust(index));
+
+	if (flags & BTR_NO_UNDO_LOG_FLAG) {
+
+		*roll_ptr = ut_dulint_zero;
+
+		return(DB_SUCCESS);
+	}
+
+	ut_ad(thr);
+	ut_ad((op_type != TRX_UNDO_INSERT_OP)
+	      || (clust_entry && !update && !rec));
+
+	trx = thr_get_trx(thr);
+	rseg = trx->rseg;
+
+	mutex_enter(&(trx->undo_mutex));
+
+	/* If the undo log is not assigned yet, assign one */
+
+	if (op_type == TRX_UNDO_INSERT_OP) {
+
+		if (trx->insert_undo == NULL) {
+
+			err = trx_undo_assign_undo(trx, TRX_UNDO_INSERT);
+		}
+
+		undo = trx->insert_undo;
+
+		if (UNIV_UNLIKELY(!undo)) {
+			/* Did not succeed */
+			mutex_exit(&(trx->undo_mutex));
+
+			return(err);
+		}
+	} else {
+		ut_ad(op_type == TRX_UNDO_MODIFY_OP);
+
+		if (trx->update_undo == NULL) {
+
+			err = trx_undo_assign_undo(trx, TRX_UNDO_UPDATE);
+
+		}
+
+		undo = trx->update_undo;
+
+		if (UNIV_UNLIKELY(!undo)) {
+			/* Did not succeed */
+			mutex_exit(&(trx->undo_mutex));
+			return(err);
+		}
+		/* Only this branch uses rec, so compute the offsets here */
+		offsets = rec_get_offsets(rec, index, offsets,
+					  ULINT_UNDEFINED, &heap);
+	}
+	/* Start with the last page of the undo log */
+	page_no = undo->last_page_no;
+
+	mtr_start(&mtr);
+	/* NOTE(review): the loop exits only on success or out of space */
+	for (;;) {
+		buf_block_t*	undo_block;
+		page_t*		undo_page;
+		ulint		offset;
+
+		undo_block = buf_page_get_gen(undo->space, undo->zip_size,
+					      page_no, RW_X_LATCH,
+					      undo->guess_block, BUF_GET,
+					      __FILE__, __LINE__, &mtr);
+		buf_block_dbg_add_level(undo_block, SYNC_TRX_UNDO_PAGE);
+
+		undo_page = buf_block_get_frame(undo_block);
+
+		if (op_type == TRX_UNDO_INSERT_OP) {
+			offset = trx_undo_page_report_insert(
+				undo_page, trx, index, clust_entry, &mtr);
+		} else {
+			offset = trx_undo_page_report_modify(
+				undo_page, trx, index, rec, offsets, update,
+				cmpl_info, &mtr);
+		}
+
+		if (UNIV_UNLIKELY(offset == 0)) {
+			/* The record did not fit on the page. We erase the
+			end segment of the undo log page and write a log
+			record of it: this is to ensure that in the debug
+			version the replicate page constructed using the log
+			records stays identical to the original page */
+
+			trx_undo_erase_page_end(undo_page, &mtr);
+			mtr_commit(&mtr);
+		} else {
+			/* Success: commit the mtr and record the position
+			of the new top undo record in the undo log object */
+			mtr_commit(&mtr);
+
+			undo->empty = FALSE;
+			undo->top_page_no = page_no;
+			undo->top_offset  = offset;
+			undo->top_undo_no = trx->undo_no;
+			undo->guess_block = undo_block;
+
+			UT_DULINT_INC(trx->undo_no);
+
+			mutex_exit(&trx->undo_mutex);
+
+			*roll_ptr = trx_undo_build_roll_ptr(
+				op_type == TRX_UNDO_INSERT_OP,
+				rseg->id, page_no, offset);
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+			return(DB_SUCCESS);
+		}
+
+		ut_ad(page_no == undo->last_page_no);
+
+		/* We have to extend the undo log by one page */
+
+		mtr_start(&mtr);
+
+		/* When we add a page to an undo log, this is analogous to
+		a pessimistic insert in a B-tree, and we must reserve the
+		counterpart of the tree latch, which is the rseg mutex. */
+
+		mutex_enter(&(rseg->mutex));
+
+		page_no = trx_undo_add_page(trx, undo, &mtr);
+
+		mutex_exit(&(rseg->mutex));
+
+		if (UNIV_UNLIKELY(page_no == FIL_NULL)) {
+			/* Did not succeed: out of space */
+
+			mutex_exit(&(trx->undo_mutex));
+			mtr_commit(&mtr);
+			if (UNIV_LIKELY_NULL(heap)) {
+				mem_heap_free(heap);
+			}
+			return(DB_OUT_OF_FILE_SPACE);
+		}
+	}
+}
+
+/*============== BUILDING PREVIOUS VERSION OF A RECORD ===============*/
+
+/******************************************************************//**
+Copies the undo log record addressed by a roll pointer to a memory heap.
+The caller must know that the undo log record still exists.
+@return	own: copy of the record */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_undo_rec_low(
+/*======================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
+{
+	mtr_t		mtr;
+	ibool		is_insert;
+	ulint		rseg_id;
+	ulint		page_no;
+	ulint		offset;
+	trx_rseg_t*	rseg;
+	const page_t*	page;
+	trx_undo_rec_t*	rec_copy;
+
+	/* Split the roll pointer into its components */
+	trx_undo_decode_roll_ptr(roll_ptr, &is_insert, &rseg_id,
+				 &page_no, &offset);
+	rseg = trx_rseg_get_on_id(rseg_id);
+
+	/* Copy the record between mtr_start() and mtr_commit() */
+	mtr_start(&mtr);
+	page = trx_undo_page_get_s_latched(rseg->space, rseg->zip_size,
+					   page_no, &mtr);
+	rec_copy = trx_undo_rec_copy(page + offset, heap);
+
+	mtr_commit(&mtr);
+
+	return(rec_copy);
+}
+
+/******************************************************************//**
+Copies an undo record to heap if its undo log must still exist.
+
+NOTE: the caller must have latches on the clustered index page and
+purge_view.
+
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the undo log has been
+truncated and we cannot fetch the old version */
+UNIV_INTERN
+ulint
+trx_undo_get_undo_rec(
+/*==================*/
+	roll_ptr_t	roll_ptr,	/*!< in: roll pointer to record */
+	trx_id_t	trx_id,		/*!< in: id of the trx that generated
+					the roll pointer: it points to an
+					undo log of this transaction */
+	trx_undo_rec_t** undo_rec,	/*!< out, own: copy of the record */
+	mem_heap_t*	heap)		/*!< in: memory heap where copied */
+{
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+
+	if (trx_purge_update_undo_must_exist(trx_id)) {
+
+		*undo_rec = trx_undo_get_undo_rec_low(roll_ptr, heap);
+
+		return(DB_SUCCESS);
+	}
+
+	/* The undo log of trx_id is not guaranteed to exist any more:
+	it may already have been deleted */
+
+	return(DB_MISSING_HISTORY);
+}
+
+/*******************************************************************//**
+Build a previous version of a clustered index record. This function checks
+that the caller has a latch on the index page of the clustered index record
+and an s-latch on the purge_view. This guarantees that the stack of versions
+is locked all the way down to the purge_view.
+@return DB_SUCCESS, or DB_MISSING_HISTORY if the previous version is
+earlier than purge_view, which means that it may have been removed,
+DB_ERROR if corrupted record */
+UNIV_INTERN
+ulint
+trx_undo_prev_version_build(
+/*========================*/
+	const rec_t*	index_rec,/*!< in: clustered index record in the
+				index tree */
+	mtr_t*		index_mtr __attribute__((unused)),
+				/*!< in: mtr which contains the latch to
+				index_rec page and purge_view */
+	const rec_t*	rec,	/*!< in: version of a clustered index record */
+	dict_index_t*	index,	/*!< in: clustered index */
+	ulint*		offsets,/*!< in: rec_get_offsets(rec, index) */
+	mem_heap_t*	heap,	/*!< in: memory heap from which the memory
+				needed is allocated */
+	rec_t**		old_vers)/*!< out, own: previous version, or NULL if
+				rec is the first inserted version, or if
+				history data has been deleted (an error),
+				or if the purge COULD have removed the version
+				though it has not yet done so */
+{
+	trx_undo_rec_t*	undo_rec	= NULL;
+	dtuple_t*	entry;
+	trx_id_t	rec_trx_id;
+	ulint		type;
+	undo_no_t	undo_no;
+	dulint		table_id;
+	trx_id_t	trx_id;
+	roll_ptr_t	roll_ptr;
+	roll_ptr_t	old_roll_ptr;
+	upd_t*		update;
+	byte*		ptr;
+	ulint		info_bits;
+	ulint		cmpl_info;
+	ibool		dummy_extern;
+	byte*		buf;
+	ulint		err;
+#ifdef UNIV_SYNC_DEBUG
+	ut_ad(rw_lock_own(&(purge_sys->latch), RW_LOCK_SHARED));
+#endif /* UNIV_SYNC_DEBUG */
+	ut_ad(mtr_memo_contains_page(index_mtr, index_rec, MTR_MEMO_PAGE_S_FIX)
+	      || mtr_memo_contains_page(index_mtr, index_rec,
+					MTR_MEMO_PAGE_X_FIX));
+	ut_ad(rec_offs_validate(rec, index, offsets));
+
+	if (!dict_index_is_clust(index)) {
+		fprintf(stderr, "InnoDB: Error: trying to access"
+			" update undo rec for non-clustered index %s\n"
+			"InnoDB: Submit a detailed bug report to"
+			" http://bugs.mysql.com\n"
+			"InnoDB: index record ", index->name);
+		rec_print(stderr, index_rec, index);
+		fputs("\n"
+		      "InnoDB: record version ", stderr);
+		rec_print_new(stderr, rec, offsets);
+		putc('\n', stderr);
+		return(DB_ERROR);
+	}
+
+	roll_ptr = row_get_rec_roll_ptr(rec, index, offsets);
+	old_roll_ptr = roll_ptr;
+
+	*old_vers = NULL;
+
+	if (trx_undo_roll_ptr_is_insert(roll_ptr)) {
+
+		/* The record rec is the first inserted version */
+
+		return(DB_SUCCESS);
+	}
+
+	rec_trx_id = row_get_rec_trx_id(rec, index, offsets);
+
+	err = trx_undo_get_undo_rec(roll_ptr, rec_trx_id, &undo_rec, heap);
+
+	if (UNIV_UNLIKELY(err != DB_SUCCESS)) {
+		/* The undo record may already have been purged.
+		This should never happen in InnoDB. */
+
+		return(err);
+	}
+
+	ptr = trx_undo_rec_get_pars(undo_rec, &type, &cmpl_info,
+				    &dummy_extern, &undo_no, &table_id);
+
+	ptr = trx_undo_update_rec_get_sys_cols(ptr, &trx_id, &roll_ptr,
+					       &info_bits);
+
+	/* (a) If a clustered index record version is such that the
+	trx id stamp in it is bigger than purge_sys->view, then the
+	BLOBs in that version are known to exist (the purge has not
+	progressed that far);
+
+	(b) if the version is the first version such that trx id in it
+	is less than purge_sys->view, and it is not delete-marked,
+	then the BLOBs in that version are known to exist (the purge
+	cannot have purged the BLOBs referenced by that version
+	yet).
+
+	This function does not fetch any BLOBs.  The callers might, by
+	possibly invoking row_ext_create() via row_build().  However,
+	they should have all needed information in the *old_vers
+	returned by this function.  This is because *old_vers is based
+	on the transaction undo log records.  The function
+	trx_undo_page_fetch_ext() will write BLOB prefixes to the
+	transaction undo log that are at least as long as the longest
+	possible column prefix in a secondary index.  Thus, secondary
+	index entries for *old_vers can be constructed without
+	dereferencing any BLOB pointers. */
+
+	ptr = trx_undo_rec_skip_row_ref(ptr, index);
+
+	ptr = trx_undo_update_rec_get_update(ptr, index, type, trx_id,
+					     roll_ptr, info_bits,
+					     NULL, heap, &update);
+	/* Cross-check the table id stored in the undo record */
+	if (ut_dulint_cmp(table_id, index->table->id) != 0) {
+		ptr = NULL;
+
+		fprintf(stderr,
+			"InnoDB: Error: trying to access update undo rec"
+			" for table %s\n"
+			"InnoDB: but the table id in the"
+			" undo record is wrong\n"
+			"InnoDB: Submit a detailed bug report"
+			" to http://bugs.mysql.com\n"
+			"InnoDB: Run also CHECK TABLE %s\n",
+			index->table_name, index->table_name);
+	}
+
+	if (ptr == NULL) {
+		/* The record was corrupted, return an error; these printfs
+		should catch an elusive bug in row_vers_old_has_index_entry */
+
+		fprintf(stderr,
+			"InnoDB: table %s, index %s, n_uniq %lu\n"
+			"InnoDB: undo rec address %p, type %lu cmpl_info %lu\n"
+			"InnoDB: undo rec table id %lu %lu,"
+			" index table id %lu %lu\n"
+			"InnoDB: dump of 150 bytes in undo rec: ",
+			index->table_name, index->name,
+			(ulong) dict_index_get_n_unique(index),
+			(void*) undo_rec, (ulong) type, (ulong) cmpl_info,
+			(ulong) ut_dulint_get_high(table_id),
+			(ulong) ut_dulint_get_low(table_id),
+			(ulong) ut_dulint_get_high(index->table->id),
+			(ulong) ut_dulint_get_low(index->table->id));
+		ut_print_buf(stderr, undo_rec, 150);
+		fputs("\n"
+		      "InnoDB: index record ", stderr);
+		rec_print(stderr, index_rec, index);
+		fputs("\n"
+		      "InnoDB: record version ", stderr);
+		rec_print_new(stderr, rec, offsets);
+		fprintf(stderr, "\n"
+			"InnoDB: Record trx id " TRX_ID_FMT
+			", update rec trx id " TRX_ID_FMT "\n"
+			"InnoDB: Roll ptr in rec %lu %lu, in update rec"
+			" %lu %lu\n",
+			TRX_ID_PREP_PRINTF(rec_trx_id),
+			TRX_ID_PREP_PRINTF(trx_id),
+			(ulong) ut_dulint_get_high(old_roll_ptr),
+			(ulong) ut_dulint_get_low(old_roll_ptr),
+			(ulong) ut_dulint_get_high(roll_ptr),
+			(ulong) ut_dulint_get_low(roll_ptr));
+
+		trx_purge_sys_print();
+		return(DB_ERROR);
+	}
+	/* Apply the update vector to rec to obtain the previous version */
+	if (row_upd_changes_field_size_or_external(index, offsets, update)) {
+		ulint	n_ext;
+
+		/* We have to set the appropriate extern storage bits in the
+		old version of the record: the extern bits in rec for those
+		fields that update does NOT update, as well as the bits for
+		those fields that update updates to become externally stored
+		fields. Store the info: */
+
+		entry = row_rec_to_index_entry(ROW_COPY_DATA, rec, index,
+					       offsets, &n_ext, heap);
+		n_ext += btr_push_update_extern_fields(entry, update, heap);
+		/* The page containing the clustered index record
+		corresponding to entry is latched in mtr.  Thus the
+		following call is safe. */
+		row_upd_index_replace_new_col_vals(entry, index, update, heap);
+
+		buf = mem_heap_alloc(heap, rec_get_converted_size(index, entry,
+								  n_ext));
+
+		*old_vers = rec_convert_dtuple_to_rec(buf, index,
+						      entry, n_ext);
+	} else {
+		buf = mem_heap_alloc(heap, rec_offs_size(offsets));
+		*old_vers = rec_copy(buf, rec, offsets);
+		rec_offs_make_valid(*old_vers, index, offsets);
+		row_upd_rec_in_place(*old_vers, index, offsets, update, NULL);
+	}
+
+	return(DB_SUCCESS);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0roll.c b/storage/xtradb/trx/trx0roll.c
new file mode 100644
index 00000000000..c925478cdf4
--- /dev/null
+++ b/storage/xtradb/trx/trx0roll.c
@@ -0,0 +1,1366 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0roll.c
+Transaction rollback
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0roll.h"
+
+#ifdef UNIV_NONINL
+#include "trx0roll.ic"
+#endif
+
+#include "fsp0fsp.h"
+#include "mach0data.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "trx0undo.h"
+#include "trx0rec.h"
+#include "que0que.h"
+#include "usr0sess.h"
+#include "srv0que.h"
+#include "srv0start.h"
+#include "row0undo.h"
+#include "row0mysql.h"
+#include "lock0lock.h"
+#include "pars0pars.h"
+
+/** This many pages must be undone before a truncate is tried within
+rollback */
+#define TRX_ROLL_TRUNC_THRESHOLD	1
+
+/** In crash recovery, the current trx to be rolled back */
+static trx_t*		trx_roll_crash_recv_trx	= NULL;
+
+/** In crash recovery we set this to the undo number of the current trx to
+be rolled back. Then we can print what percentage of the rollback is done. */
+static ib_int64_t	trx_roll_max_undo_no;
+
+/** Auxiliary variable: the progress percentage that was printed last */
+static ulint		trx_roll_progress_printed_pct;
+
+/*******************************************************************//**
+Rolls back a transaction used in MySQL, completely or to a savepoint.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_general_rollback_for_mysql(
+/*===========================*/
+	trx_t*		trx,	/*!< in: transaction handle */
+	trx_savept_t*	savept)	/*!< in: pointer to savepoint undo number, if
+				partial rollback requested, or NULL for
+				complete rollback */
+{
+	mem_heap_t*	heap;
+	que_thr_t*	thr;
+	roll_node_t*	roll_node;
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+
+	trx_start_if_not_started(trx);
+
+	heap = mem_heap_create(512);
+
+	roll_node = roll_node_create(heap);
+
+	if (savept) {
+		roll_node->partial = TRUE;
+		roll_node->savept = *savept;
+	}
+
+	trx->error_state = DB_SUCCESS;
+
+	thr = pars_complete_graph_for_exec(roll_node, trx, heap);
+
+	ut_a(thr == que_fork_start_command(que_node_get_parent(thr)));
+	que_run_threads(thr);
+	/* Poll trx->que_state under kernel_mutex until TRX_QUE_RUNNING */
+	mutex_enter(&kernel_mutex);
+
+	while (trx->que_state != TRX_QUE_RUNNING) {
+
+		mutex_exit(&kernel_mutex);
+
+		os_thread_sleep(100000);
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	mem_heap_free(heap);
+
+	ut_a(trx->error_state == DB_SUCCESS);
+
+	/* Tell Innobase server that there might be work for
+	utility threads: */
+
+	srv_active_wake_master_thread();
+	/* NOTE(review): given the ut_a() above, presumably DB_SUCCESS */
+	return((int) trx->error_state);
+}
+
+/*******************************************************************//**
+Rolls back a whole MySQL transaction; a no-op if it is not started.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_for_mysql(
+/*===================*/
+	trx_t*	trx)	/*!< in: transaction handle */
+{
+	int	err = DB_SUCCESS;
+
+	if (trx->conc_state != TRX_NOT_STARTED) {
+
+		trx->op_info = "rollback";
+
+		/* NOTE(review): the comment that used to stand here
+		described setting a dummy session for the XA recovery
+		of prepared transactions; no such code is visible in
+		this function */
+
+		/* NULL savepoint: roll back the whole transaction */
+		err = trx_general_rollback_for_mysql(trx, NULL);
+
+		trx->op_info = "";
+	}
+
+	return(err);
+}
+
+/*******************************************************************//**
+Rolls back the latest SQL statement of a started MySQL transaction.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+int
+trx_rollback_last_sql_stat_for_mysql(
+/*=================================*/
+	trx_t*	trx)	/*!< in: transaction handle */
+{
+	int	err = DB_SUCCESS;
+
+	if (trx->conc_state != TRX_NOT_STARTED) {
+		trx->op_info = "rollback of SQL statement";
+
+		/* Partial rollback to the start of the latest statement */
+		err = trx_general_rollback_for_mysql(
+			trx, &trx->last_sql_stat_start);
+
+		/* Should be redundant, but we play safe */
+		trx_mark_sql_stat_end(trx);
+
+		trx->op_info = "";
+	}
+
+	return(err);
+}
+
+/*******************************************************************//**
+Unlinks a named savepoint from trx->trx_savepoints and frees it. */
+UNIV_INTERN
+void
+trx_roll_savepoint_free(
+/*=====================*/
+	trx_t*			trx,	/*!< in: transaction handle */
+	trx_named_savept_t*	savep)	/*!< in: savepoint to free, not NULL */
+{
+	ut_a(savep != NULL);
+	ut_a(UT_LIST_GET_LEN(trx->trx_savepoints) > 0);
+	/* Unlink first, then free the name and the struct itself */
+	UT_LIST_REMOVE(trx_savepoints, trx->trx_savepoints, savep);
+	mem_free(savep->name);
+	mem_free(savep);
+}
+
+/*******************************************************************//**
+Frees the savepoints that follow savep in the savepoint list of trx;
+if savep == NULL, frees all savepoints of trx. */
+UNIV_INTERN
+void
+trx_roll_savepoints_free(
+/*=====================*/
+	trx_t*			trx,	/*!< in: transaction handle */
+	trx_named_savept_t*	savep)	/*!< in: free all savepoints > this one;
+					if this is NULL, free all savepoints
+					of trx */
+{
+	trx_named_savept_t*	cur;
+	trx_named_savept_t*	following;
+
+	/* Begin after savep, or at the list head when savep is NULL */
+	cur = (savep != NULL)
+		? UT_LIST_GET_NEXT(trx_savepoints, savep)
+		: UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+	for (; cur != NULL; cur = following) {
+
+		/* Fetch the successor before cur is freed */
+		following = UT_LIST_GET_NEXT(trx_savepoints, cur);
+
+		trx_roll_savepoint_free(trx, cur);
+	}
+}
+
+/*******************************************************************//**
+Rolls a transaction back to a named savepoint. Modifications after the
+savepoint are undone but InnoDB does NOT release the corresponding locks
+which are stored in memory. If a lock is 'implicit', that is, a newly inserted
+row holds a lock where the lock information is carried by the trx id stored in
+the row, these locks are naturally released in the rollback. Savepoints which
+were set after this savepoint are deleted.
+@return DB_NO_SAVEPOINT if no savepoint of the name is found, DB_ERROR if
+trx is not started, otherwise the error code of the rollback */
+UNIV_INTERN
+ulint
+trx_rollback_to_savepoint_for_mysql(
+/*================================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	ib_int64_t*	mysql_binlog_cache_pos)	/*!< out: the MySQL binlog cache
+						position corresponding to this
+						savepoint; MySQL needs this
+						information to remove the
+						binlog entries of the queries
+						executed after the savepoint */
+{
+	trx_named_savept_t*	savep;
+	ulint			err;
+
+	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+	while (savep != NULL) {
+		if (0 == ut_strcmp(savep->name, savepoint_name)) {
+			/* Found */
+			break;
+		}
+		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+	}
+
+	if (savep == NULL) {
+
+		return(DB_NO_SAVEPOINT);
+	}
+
+	if (trx->conc_state == TRX_NOT_STARTED) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: transaction has a savepoint ", stderr);
+		ut_print_name(stderr, trx, FALSE, savep->name);
+		fputs(" though it is not started\n", stderr);
+		return(DB_ERROR);
+	}
+
+	/* We can now free all savepoints strictly later than this one */
+
+	trx_roll_savepoints_free(trx, savep);
+	/* savep itself was kept by the call above: it is still read below */
+	*mysql_binlog_cache_pos = savep->mysql_binlog_cache_pos;
+
+	trx->op_info = "rollback to a savepoint";
+
+	err = trx_general_rollback_for_mysql(trx, &savep->savept);
+
+	/* Store the current undo_no of the transaction so that we know where
+	to roll back if we have to roll back the next SQL statement: */
+
+	trx_mark_sql_stat_end(trx);
+
+	trx->op_info = "";
+
+	return(err);
+}
+
+/*******************************************************************//**
+Creates a named savepoint. If the transaction is not yet started, starts it.
+If there is already a savepoint of the same name, this call erases that old
+savepoint and replaces it with a new one, which is added last in the list.
+Savepoints are deleted in a transaction commit or rollback.
+@return	always DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_savepoint_for_mysql(
+/*====================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name,		/*!< in: savepoint name */
+	ib_int64_t	binlog_cache_pos)	/*!< in: MySQL binlog cache
+						position corresponding to this
+						connection at the time of the
+						savepoint */
+{
+	trx_named_savept_t*	savep;
+
+	ut_a(trx);
+	ut_a(savepoint_name);
+
+	trx_start_if_not_started(trx);
+
+	savep = UT_LIST_GET_FIRST(trx->trx_savepoints);
+
+	while (savep != NULL) {
+		if (0 == ut_strcmp(savep->name, savepoint_name)) {
+			/* Found */
+			break;
+		}
+		savep = UT_LIST_GET_NEXT(trx_savepoints, savep);
+	}
+
+	if (savep != NULL) {
+		/* There is a savepoint with the same name: unlink and
+		free it through trx_roll_savepoint_free(), the helper
+		that the release and rollback paths use as well, so
+		that the teardown steps are written in one place. */
+
+		trx_roll_savepoint_free(trx, savep);
+	}
+
+	/* Create a new savepoint and add it as the last in the list */
+
+	savep = mem_alloc(sizeof(trx_named_savept_t));
+
+	savep->name = mem_strdup(savepoint_name);
+
+	savep->savept = trx_savept_take(trx);
+
+	savep->mysql_binlog_cache_pos = binlog_cache_pos;
+
+	UT_LIST_ADD_LAST(trx_savepoints, trx->trx_savepoints, savep);
+
+	return(DB_SUCCESS);
+}
+
+/*******************************************************************//**
+Frees the savepoint that carries the given name and nothing else: the
+savepoints that were set after it stay in the list.
+@return DB_SUCCESS if a savepoint of that name was found and freed,
+DB_NO_SAVEPOINT otherwise */
+UNIV_INTERN
+ulint
+trx_release_savepoint_for_mysql(
+/*============================*/
+	trx_t*		trx,			/*!< in: transaction handle */
+	const char*	savepoint_name)		/*!< in: savepoint name */
+{
+	trx_named_savept_t*	cur;
+
+	/* Walk the list from its head; the first name match is freed. */
+	for (cur = UT_LIST_GET_FIRST(trx->trx_savepoints);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(trx_savepoints, cur)) {
+		if (ut_strcmp(cur->name, savepoint_name) != 0) {
+			continue;
+		}
+		trx_roll_savepoint_free(trx, cur);
+		return(DB_SUCCESS);
+	}
+
+	return(DB_NO_SAVEPOINT);
+}
+
+/*******************************************************************//**
+Tells whether trx is the transaction recorded in trx_roll_crash_recv_trx,
+which trx_rollback_active() sets for the duration of a recovery rollback.
+@return TRUE if trx is an incomplete transaction that is being rolled
+back in crash recovery */
+UNIV_INTERN
+ibool
+trx_is_recv(
+/*========*/
+	const trx_t*	trx)	/*!< in: transaction */
+{
+	return(trx == trx_roll_crash_recv_trx);
+}
+
+/*******************************************************************//**
+Returns a savepoint that records the current undo number of trx.
+@return	savepoint whose least_undo_no equals trx->undo_no */
+UNIV_INTERN
+trx_savept_t
+trx_savept_take(
+/*============*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	trx_savept_t	savept;
+
+	savept.least_undo_no = trx->undo_no;
+
+	return(savept);
+}
+
+/*******************************************************************//**
+Rolls back an active transaction that was found in crash recovery. */
+static
+void
+trx_rollback_active(
+/*================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	mem_heap_t*	heap;
+	que_fork_t*	fork;
+	que_thr_t*	thr;
+	roll_node_t*	roll_node;
+	dict_table_t*	table;
+	ib_int64_t	rows_to_undo;
+	const char*	unit		= "";
+	ibool		dictionary_locked = FALSE;
+
+	heap = mem_heap_create(512);
+
+	fork = que_fork_create(NULL, NULL, QUE_FORK_RECOVERY, heap);
+	fork->trx = trx;
+
+	thr = que_thr_create(fork, heap);
+
+	roll_node = roll_node_create(heap);
+
+	thr->child = roll_node;
+	roll_node->common.parent = thr;
+
+	mutex_enter(&kernel_mutex);
+
+	trx->graph = fork;
+
+	ut_a(thr == que_fork_start_command(fork));
+	/* These globals drive trx_is_recv() and the progress printout */
+	trx_roll_crash_recv_trx	= trx;
+	trx_roll_max_undo_no = ut_conv_dulint_to_longlong(trx->undo_no);
+	trx_roll_progress_printed_pct = 0;
+	rows_to_undo = trx_roll_max_undo_no;
+
+	if (rows_to_undo > 1000000000) {
+		rows_to_undo = rows_to_undo / 1000000;
+		unit = "M";
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Rolling back trx with id " TRX_ID_FMT ", %lu%s"
+		" rows to undo\n",
+		TRX_ID_PREP_PRINTF(trx->id),
+		(ulong) rows_to_undo, unit);
+	mutex_exit(&kernel_mutex);
+
+	trx->mysql_thread_id = os_thread_get_curr_id();
+
+	trx->mysql_process_no = os_proc_get_number();
+
+	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+		row_mysql_lock_data_dictionary(trx);
+		dictionary_locked = TRUE;
+	}
+
+	que_run_threads(thr);
+
+	mutex_enter(&kernel_mutex);
+
+	while (trx->que_state != TRX_QUE_RUNNING) {
+
+		mutex_exit(&kernel_mutex);
+		/* TRX_ID_FMT prints the whole id, as the other messages do */
+		fprintf(stderr,
+			"InnoDB: Waiting for rollback of trx id "
+			TRX_ID_FMT " to end\n", TRX_ID_PREP_PRINTF(trx->id));
+		os_thread_sleep(100000);
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE
+	    && !ut_dulint_is_zero(trx->table_id)) {
+
+		/* If the transaction was for a dictionary operation, we
+		drop the relevant table, if it still exists */
+
+		fprintf(stderr,
+			"InnoDB: Dropping table with id %lu %lu"
+			" in recovery if it exists\n",
+			(ulong) ut_dulint_get_high(trx->table_id),
+			(ulong) ut_dulint_get_low(trx->table_id));
+
+		table = dict_table_get_on_id_low(trx->table_id);
+
+		if (table) {
+			ulint	err;
+
+			fputs("InnoDB: Table found: dropping table ", stderr);
+			ut_print_name(stderr, trx, TRUE, table->name);
+			fputs(" in recovery\n", stderr);
+
+			err = row_drop_table_for_mysql(table->name, trx, TRUE);
+			trx_commit_for_mysql(trx);
+
+			ut_a(err == (int) DB_SUCCESS);
+		}
+	}
+
+	if (dictionary_locked) {
+		row_mysql_unlock_data_dictionary(trx);
+	}
+
+	fprintf(stderr, "\nInnoDB: Rolling back of trx id " TRX_ID_FMT
+		" completed\n",
+		TRX_ID_PREP_PRINTF(trx->id));
+	mem_heap_free(heap);
+
+	trx_roll_crash_recv_trx	= NULL;
+}
+
+/*******************************************************************//**
+Rolls back or cleans up the recovered transactions (trx->is_recovered) in
+trx_sys->trx_list. One in TRX_COMMITTED_IN_MEMORY is cleaned up, an active
+one is rolled back; TRX_PREPARED and TRX_NOT_STARTED ones are skipped. The
+scan restarts from the list head after each transaction that is handled. */
+UNIV_INTERN
+void
+trx_rollback_or_clean_recovered(
+/*============================*/
+	ibool	all)	/*!< in: FALSE=roll back dictionary transactions;
+			TRUE=roll back all non-PREPARED transactions */
+{
+	trx_t*	trx;
+
+	mutex_enter(&kernel_mutex);
+
+	if (!UT_LIST_GET_FIRST(trx_sys->trx_list)) {
+		goto leave_function;
+	}
+
+	if (all) {
+		fprintf(stderr,
+			"InnoDB: Starting in background the rollback"
+			" of uncommitted transactions\n");
+	}
+
+	mutex_exit(&kernel_mutex);
+
+loop:
+	mutex_enter(&kernel_mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list); trx;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+		if (!trx->is_recovered) {
+			continue;
+		}
+
+		switch (trx->conc_state) {
+		case TRX_NOT_STARTED:
+		case TRX_PREPARED:
+			continue;
+
+		case TRX_COMMITTED_IN_MEMORY:
+			mutex_exit(&kernel_mutex);
+			fprintf(stderr,
+				"InnoDB: Cleaning up trx with id "
+				TRX_ID_FMT "\n",
+				TRX_ID_PREP_PRINTF(trx->id));
+			trx_cleanup_at_db_startup(trx);
+			goto loop;
+
+		case TRX_ACTIVE:
+			if (all || trx_get_dict_operation(trx)
+			    != TRX_DICT_OP_NONE) {
+				mutex_exit(&kernel_mutex);
+				trx_rollback_active(trx);
+				goto loop;
+			}
+		}
+	}
+	/* The scan reached the list end with kernel_mutex still held */
+	if (all) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Rollback of non-prepared"
+			" transactions completed\n");
+	}
+
+leave_function:
+	mutex_exit(&kernel_mutex);
+}
+
+/*******************************************************************//**
+Thread entry point that calls trx_rollback_or_clean_recovered(TRUE): it
+rolls back or cleans up the incomplete transactions which were
+encountered in crash recovery, cleaning up a committed transaction and
+rolling back one that was not yet committed.
+Note: this is done in a background thread.
+@return	a dummy parameter */
+UNIV_INTERN
+os_thread_ret_t
+trx_rollback_or_clean_all_recovered(
+/*================================*/
+	void*	arg __attribute__((unused)))
+			/*!< in: a dummy parameter required by
+			os_thread_create */
+{
+	trx_rollback_or_clean_recovered(TRUE);
+
+	/* We count the number of threads in os_thread_exit(). A created
+	thread should always use that to exit and not use return() to exit. */
+
+	os_thread_exit(NULL);
+
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/*******************************************************************//**
+Creates an undo number array of UNIV_MAX_PARALLELISM cells, none in use.
+@return	own: undo number array */
+UNIV_INTERN
+trx_undo_arr_t*
+trx_undo_arr_create(void)
+/*=====================*/
+{
+	trx_undo_arr_t*	arr;
+	mem_heap_t*	heap;
+	ulint		i;
+
+	heap = mem_heap_create(1024);
+
+	arr = mem_heap_alloc(heap, sizeof(trx_undo_arr_t));
+
+	arr->infos = mem_heap_alloc(heap, sizeof(trx_undo_inf_t)
+				    * UNIV_MAX_PARALLELISM);
+	arr->n_cells = UNIV_MAX_PARALLELISM;
+	arr->n_used = 0;
+	/* Remembered so that trx_undo_arr_free() can free everything */
+	arr->heap = heap;
+
+	for (i = 0; i < UNIV_MAX_PARALLELISM; i++) {
+
+		(trx_undo_arr_get_nth_info(arr, i))->in_use = FALSE;
+	}
+
+	return(arr);
+}
+
+/*******************************************************************//**
+Frees an undo number array by freeing the heap it was allocated from. */
+UNIV_INTERN
+void
+trx_undo_arr_free(
+/*==============*/
+	trx_undo_arr_t*	arr)	/*!< in: undo number array, no cell in use */
+{
+	ut_ad(arr->n_used == 0);
+
+	mem_heap_free(arr->heap);
+}
+
+/*******************************************************************//**
+Stores an undo number to trx->undo_no_arr if it is not stored there yet.
+@return	FALSE if the record already existed in the array, TRUE if stored */
+static
+ibool
+trx_undo_arr_store_info(
+/*====================*/
+	trx_t*		trx,	/*!< in: transaction */
+	undo_no_t	undo_no)/*!< in: undo number */
+{
+	trx_undo_inf_t*	cell;
+	trx_undo_inf_t*	stored_here;
+	trx_undo_arr_t*	arr;
+	ulint		n_used;
+	ulint		n;
+	ulint		i;
+
+	n = 0;
+	arr = trx->undo_no_arr;
+	n_used = arr->n_used;
+	stored_here = NULL;
+	/* n counts the cells found in use; n_used is the count at entry */
+	for (i = 0;; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (!cell->in_use) {
+			if (!stored_here) {
+				/* Not in use, we may store here */
+				cell->undo_no = undo_no;
+				cell->in_use = TRUE;
+
+				arr->n_used++;
+
+				stored_here = cell;
+			}
+		} else {
+			n++;
+
+			if (0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+				/* Duplicate: take back a store made above */
+				if (stored_here) {
+					stored_here->in_use = FALSE;
+					ut_ad(arr->n_used > 0);
+					arr->n_used--;
+				}
+
+				ut_ad(arr->n_used == n_used);
+
+				return(FALSE);
+			}
+		}
+
+		if (n == n_used && stored_here) {
+			/* Every cell that was in use has been compared */
+			ut_ad(arr->n_used == 1 + n_used);
+
+			return(TRUE);
+		}
+	}
+}
+
+/*******************************************************************//**
+Removes an undo number from the array; it must have been stored there. */
+static
+void
+trx_undo_arr_remove_info(
+/*=====================*/
+	trx_undo_arr_t*	arr,	/*!< in: undo number array */
+	undo_no_t	undo_no)/*!< in: undo number */
+{
+	trx_undo_inf_t*	cell;
+	ulint		i;
+
+	/* Scan only the cells that exist: the unbounded loop used
+	here before walked off the array when undo_no was absent. */
+	for (i = 0; i < arr->n_cells; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (cell->in_use
+		    && 0 == ut_dulint_cmp(cell->undo_no, undo_no)) {
+
+			cell->in_use = FALSE;
+
+			ut_ad(arr->n_used > 0);
+
+			arr->n_used--;
+
+			return;
+		}
+	}
+
+	/* Releasing a record that was never reserved is a bug. */
+	ut_error;
+}
+
+/*******************************************************************//**
+Gets the biggest undo number among the cells that are in use in an array.
+@return	biggest value, ut_dulint_zero if the array is empty */
+static
+undo_no_t
+trx_undo_arr_get_biggest(
+/*=====================*/
+	trx_undo_arr_t*	arr)	/*!< in: undo number array */
+{
+	trx_undo_inf_t*	cell;
+	ulint		n_used;
+	undo_no_t	biggest;
+	ulint		n;
+	ulint		i;
+
+	n = 0;
+	n_used = arr->n_used;
+	biggest = ut_dulint_zero;
+
+	for (i = 0;; i++) {
+		cell = trx_undo_arr_get_nth_info(arr, i);
+
+		if (cell->in_use) {
+			n++;
+			if (ut_dulint_cmp(cell->undo_no, biggest) > 0) {
+
+				biggest = cell->undo_no;
+			}
+		}
+		/* Stop as soon as every cell in use has been seen */
+		if (n == n_used) {
+			return(biggest);
+		}
+	}
+}
+
+/***********************************************************************//**
+Tries to truncate the insert and update undo logs of a transaction. */
+UNIV_INTERN
+void
+trx_roll_try_truncate(
+/*==================*/
+	trx_t*	trx)	/*!< in/out: transaction */
+{
+	trx_undo_arr_t*	arr;
+	undo_no_t	limit;
+	undo_no_t	biggest;
+
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&((trx->rseg)->mutex)));
+
+	trx->pages_undone = 0;
+
+	arr = trx->undo_no_arr;
+
+	limit = trx->undo_no;
+
+	if (arr->n_used > 0) {
+		biggest = trx_undo_arr_get_biggest(arr);
+
+		if (ut_dulint_cmp(biggest, limit) >= 0) {
+			/* A reserved number is at or above the limit */
+			limit = ut_dulint_add(biggest, 1);
+		}
+	}
+
+	if (trx->insert_undo) {
+		trx_undo_truncate_end(trx, trx->insert_undo, limit);
+	}
+
+	if (trx->update_undo) {
+		trx_undo_truncate_end(trx, trx->update_undo, limit);
+	}
+}
+
+/***********************************************************************//**
+Pops the topmost undo log record in a single undo log and updates the info
+about the topmost record (page, offset, undo number) in the undo log struct.
+@return	undo log record, the page s-latched */
+static
+trx_undo_rec_t*
+trx_roll_pop_top_rec(
+/*=================*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		undo_page;
+	ulint		offset;
+	trx_undo_rec_t*	prev_rec;
+	page_t*		prev_rec_page;
+
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+
+	undo_page = trx_undo_page_get_s_latched(undo->space, undo->zip_size,
+						undo->top_page_no, mtr);
+	offset = undo->top_offset;
+
+	/*	fprintf(stderr, "Thread %lu undoing trx %lu undo record %lu\n",
+	os_thread_get_curr_id(), ut_dulint_get_low(trx->id),
+	ut_dulint_get_low(undo->top_undo_no)); */
+
+	prev_rec = trx_undo_get_prev_rec(undo_page + offset,
+					 undo->hdr_page_no, undo->hdr_offset,
+					 mtr);
+	if (prev_rec == NULL) {
+		/* No record precedes the popped one in this log */
+		undo->empty = TRUE;
+	} else {
+		prev_rec_page = page_align(prev_rec);
+
+		if (prev_rec_page != undo_page) {
+			/* The new top record is on another page */
+			trx->pages_undone++;
+		}
+
+		undo->top_page_no = page_get_page_no(prev_rec_page);
+		undo->top_offset  = prev_rec - prev_rec_page;
+		undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec);
+	}
+
+	return(undo_page + offset);
+}
+
+/********************************************************************//**
+Pops the topmost record when the two undo logs of a transaction are seen
+as a single stack of records ordered by their undo numbers. Inserts the
+undo number of the popped undo record to the array of currently processed
+undo numbers in the transaction. When the query thread finishes processing
+of this undo record, it must be released with trx_undo_rec_release.
+@return undo log record copied to heap, NULL if none left, or if the
+undo number of the top record would be less than the limit */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_roll_pop_top_rec_of_trx(
+/*========================*/
+	trx_t*		trx,	/*!< in: transaction */
+	undo_no_t	limit,	/*!< in: least undo number we need */
+	roll_ptr_t*	roll_ptr,/*!< out: roll pointer to undo record */
+	mem_heap_t*	heap)	/*!< in: memory heap where copied */
+{
+	trx_undo_t*	undo;
+	trx_undo_t*	ins_undo;
+	trx_undo_t*	upd_undo;
+	trx_undo_rec_t*	undo_rec;
+	trx_undo_rec_t*	undo_rec_copy;
+	undo_no_t	undo_no;
+	ibool		is_insert;
+	trx_rseg_t*	rseg;
+	ulint		progress_pct;
+	mtr_t		mtr;
+
+	rseg = trx->rseg;
+try_again:
+	mutex_enter(&(trx->undo_mutex));
+	/* Truncate once enough undo pages have been passed */
+	if (trx->pages_undone >= TRX_ROLL_TRUNC_THRESHOLD) {
+		mutex_enter(&(rseg->mutex));
+
+		trx_roll_try_truncate(trx);
+
+		mutex_exit(&(rseg->mutex));
+	}
+
+	ins_undo = trx->insert_undo;
+	upd_undo = trx->update_undo;
+	/* Pick the non-empty log whose top record has the bigger undo_no */
+	if (!ins_undo || ins_undo->empty) {
+		undo = upd_undo;
+	} else if (!upd_undo || upd_undo->empty) {
+		undo = ins_undo;
+	} else if (ut_dulint_cmp(upd_undo->top_undo_no,
+				 ins_undo->top_undo_no) > 0) {
+		undo = upd_undo;
+	} else {
+		undo = ins_undo;
+	}
+
+	if (!undo || undo->empty
+	    || (ut_dulint_cmp(limit, undo->top_undo_no) > 0)) {
+
+		if ((trx->undo_no_arr)->n_used == 0) {
+			/* Rollback is ending */
+
+			mutex_enter(&(rseg->mutex));
+
+			trx_roll_try_truncate(trx);
+
+			mutex_exit(&(rseg->mutex));
+		}
+
+		mutex_exit(&(trx->undo_mutex));
+
+		return(NULL);
+	}
+
+	if (undo == ins_undo) {
+		is_insert = TRUE;
+	} else {
+		is_insert = FALSE;
+	}
+
+	*roll_ptr = trx_undo_build_roll_ptr(is_insert, (undo->rseg)->id,
+					    undo->top_page_no,
+					    undo->top_offset);
+	mtr_start(&mtr);
+
+	undo_rec = trx_roll_pop_top_rec(trx, undo, &mtr);
+
+	undo_no = trx_undo_rec_get_undo_no(undo_rec);
+
+	ut_ad(ut_dulint_cmp(ut_dulint_add(undo_no, 1), trx->undo_no) == 0);
+
+	/* We print rollback progress info if we are in a crash recovery
+	and the transaction has at least 1000 row operations to undo. */
+
+	if (trx == trx_roll_crash_recv_trx && trx_roll_max_undo_no > 1000) {
+
+		progress_pct = 100 - (ulint)
+			((ut_conv_dulint_to_longlong(undo_no) * 100)
+			 / trx_roll_max_undo_no);
+		if (progress_pct != trx_roll_progress_printed_pct) {
+			if (trx_roll_progress_printed_pct == 0) {
+				fprintf(stderr,
+					"\nInnoDB: Progress in percents:"
+					" %lu", (ulong) progress_pct);
+			} else {
+				fprintf(stderr,
+					" %lu", (ulong) progress_pct);
+			}
+			fflush(stderr);
+			trx_roll_progress_printed_pct = progress_pct;
+		}
+	}
+
+	trx->undo_no = undo_no;
+
+	if (!trx_undo_arr_store_info(trx, undo_no)) {
+		/* A query thread is already processing this undo log record */
+
+		mutex_exit(&(trx->undo_mutex));
+
+		mtr_commit(&mtr);
+
+		goto try_again;
+	}
+
+	undo_rec_copy = trx_undo_rec_copy(undo_rec, heap);
+
+	mutex_exit(&(trx->undo_mutex));
+
+	mtr_commit(&mtr);
+
+	return(undo_rec_copy);
+}
+
+/********************************************************************//**
+Reserves an undo log record for a query thread to undo. This should be
+called if the query thread gets the undo log record not using the pop
+function above.
+@return	TRUE if reserved, FALSE if undo_no was already in the array */
+UNIV_INTERN
+ibool
+trx_undo_rec_reserve(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	undo_no_t	undo_no)/*!< in: undo number of the record */
+{
+	ibool	ret;
+
+	mutex_enter(&(trx->undo_mutex));
+
+	ret = trx_undo_arr_store_info(trx, undo_no);
+
+	mutex_exit(&(trx->undo_mutex));
+
+	return(ret);
+}
+
+/*******************************************************************//**
+Removes undo_no from the array of reserved undo numbers of trx. */
+UNIV_INTERN
+void
+trx_undo_rec_release(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	undo_no_t	undo_no)/*!< in: undo number */
+{
+	/* The undo mutex of the transaction is held for the whole
+	removal; the array is looked up only after the mutex has
+	been acquired. */
+
+	mutex_enter(&trx->undo_mutex);
+
+	trx_undo_arr_remove_info(trx->undo_no_arr, undo_no);
+
+	mutex_exit(&trx->undo_mutex);
+}
+
+/*********************************************************************//**
+Starts a rollback: sets trx->roll_limit and starts a new undo query graph. */
+UNIV_INTERN
+void
+trx_rollback(
+/*=========*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_sig_t*	sig,	/*!< in: signal starting the rollback */
+	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread; if the passed value is
+				NULL, the parameter is ignored */
+{
+	que_t*		roll_graph;
+	que_thr_t*	thr;
+	/*	que_thr_t*	thr2; */
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad((trx->undo_no_arr == NULL) || ((trx->undo_no_arr)->n_used == 0));
+
+	/* Initialize the rollback field in the transaction */
+
+	if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+		trx->roll_limit = ut_dulint_zero;
+
+	} else if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+		trx->roll_limit = (sig->savept).least_undo_no;
+
+	} else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+		trx->roll_limit = trx->last_sql_stat_start.least_undo_no;
+	} else {
+		ut_error;
+	}
+	/* The limit may not lie beyond the current undo number */
+	ut_a(ut_dulint_cmp(trx->roll_limit, trx->undo_no) <= 0);
+
+	trx->pages_undone = 0;
+
+	if (trx->undo_no_arr == NULL) {
+		trx->undo_no_arr = trx_undo_arr_create();
+	}
+
+	/* Build a 'query' graph which will perform the undo operations */
+
+	roll_graph = trx_roll_graph_build(trx);
+
+	trx->graph = roll_graph;
+	trx->que_state = TRX_QUE_ROLLING_BACK;
+
+	thr = que_fork_start_command(roll_graph);
+
+	ut_ad(thr);
+
+	/*	thr2 = que_fork_start_command(roll_graph);
+
+	ut_ad(thr2); */
+	/* Hand thr to the caller if it can run it, else enqueue it */
+	if (next_thr && (*next_thr == NULL)) {
+		*next_thr = thr;
+		/*		srv_que_task_enqueue_low(thr2); */
+	} else {
+		srv_que_task_enqueue_low(thr);
+		/*		srv_que_task_enqueue_low(thr2); */
+	}
+}
+
+/****************************************************************//**
+Builds an undo 'query' graph for a transaction: a QUE_FORK_ROLLBACK fork
+with one query thread whose child is a row undo node. The actual rollback
+is performed by executing this query graph like a query subprocedure call.
+The reply about the completion of the rollback will be sent by this graph.
+@return	own: the query graph */
+UNIV_INTERN
+que_t*
+trx_roll_graph_build(
+/*=================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	mem_heap_t*	heap;
+	que_fork_t*	fork;
+	que_thr_t*	thr;
+	/*	que_thr_t*	thr2; */
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	heap = mem_heap_create(512);
+	fork = que_fork_create(NULL, NULL, QUE_FORK_ROLLBACK, heap);
+	fork->trx = trx;
+
+	thr = que_thr_create(fork, heap);
+	/*	thr2 = que_thr_create(fork, heap); */
+
+	thr->child = row_undo_node_create(trx, thr, heap);
+	/*	thr2->child = row_undo_node_create(trx, thr2, heap); */
+
+	return(fork);
+}
+
+/*********************************************************************//**
+Ends error processing once the partial rollback is done: removes every
+TRX_SIG_ERROR_OCCURRED signal of trx and sets its que_state to running. */
+static
+void
+trx_finish_error_processing(
+/*========================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	trx_sig_t*	cur;
+	trx_sig_t*	succ;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* The successor is fetched before the current signal can be
+	removed, so that the walk never follows a removed node. */
+
+	for (cur = UT_LIST_GET_FIRST(trx->signals);
+	     cur != NULL;
+	     cur = succ) {
+		succ = UT_LIST_GET_NEXT(signals, cur);
+
+		if (cur->type == TRX_SIG_ERROR_OCCURRED) {
+			trx_sig_remove(trx, cur);
+		}
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/*********************************************************************//**
+Finishes a partial rollback: replies to and removes the first signal. */
+static
+void
+trx_finish_partial_rollback_off_kernel(
+/*===================================*/
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
+				if the value which is passed in is a pointer
+				to a NULL pointer, then the calling function
+				can start running a new query thread; if this
+				parameter is NULL, it is ignored */
+{
+	trx_sig_t*	sig;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	sig = UT_LIST_GET_FIRST(trx->signals);
+
+	/* Remove the signal from the signal queue and send reply message
+	to it */
+
+	trx_sig_reply(sig, next_thr);
+	trx_sig_remove(trx, sig);
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/****************************************************************//**
+Finishes a rollback: frees the undo graph and handles the rollback signals. */
+UNIV_INTERN
+void
+trx_finish_rollback_off_kernel(
+/*===========================*/
+	que_t*		graph,	/*!< in: undo graph which can now be freed */
+	trx_t*		trx,	/*!< in: transaction */
+	que_thr_t**	next_thr)/*!< in/out: next query thread to run;
+				if the value which is passed in is
+				a pointer to a NULL pointer, then the
+				calling function can start running
+				a new query thread; if this parameter is
+				NULL, it is ignored */
+{
+	trx_sig_t*	sig;
+	trx_sig_t*	next_sig;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	ut_a(trx->undo_no_arr == NULL || trx->undo_no_arr->n_used == 0);
+
+	/* Free the memory reserved by the undo graph */
+	que_graph_free(graph);
+
+	sig = UT_LIST_GET_FIRST(trx->signals);
+	/* The type of the first signal selects how the rollback ends */
+	if (sig->type == TRX_SIG_ROLLBACK_TO_SAVEPT) {
+
+		trx_finish_partial_rollback_off_kernel(trx, next_thr);
+
+		return;
+
+	} else if (sig->type == TRX_SIG_ERROR_OCCURRED) {
+
+		trx_finish_error_processing(trx);
+
+		return;
+	}
+
+#ifdef UNIV_DEBUG
+	if (lock_print_waits) {
+		fprintf(stderr, "Trx " TRX_ID_FMT " rollback finished\n",
+			TRX_ID_PREP_PRINTF(trx->id));
+	}
+#endif /* UNIV_DEBUG */
+
+	trx_commit_off_kernel(trx);
+
+	/* Remove all TRX_SIG_TOTAL_ROLLBACK signals from the signal queue and
+	send reply messages to them */
+
+	trx->que_state = TRX_QUE_RUNNING;
+
+	while (sig != NULL) {
+		next_sig = UT_LIST_GET_NEXT(signals, sig);
+
+		if (sig->type == TRX_SIG_TOTAL_ROLLBACK) {
+
+			trx_sig_reply(sig, next_thr);
+
+			trx_sig_remove(trx, sig);
+		}
+
+		sig = next_sig;
+	}
+}
+
+/*********************************************************************//**
+Creates a rollback command node struct in the ROLL_NODE_SEND state.
+@return	own: rollback node struct */
+UNIV_INTERN
+roll_node_t*
+roll_node_create(
+/*=============*/
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	roll_node_t*	node;
+
+	node = mem_heap_alloc(heap, sizeof(roll_node_t));
+	node->common.type = QUE_NODE_ROLLBACK;
+	node->state = ROLL_NODE_SEND;
+	/* A new node requests a total rollback, see trx_rollback_step() */
+	node->partial = FALSE;
+
+	return(node);
+}
+
+/***********************************************************//**
+Performs an execution step for a rollback command node in a query graph.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_rollback_step(
+/*==============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	roll_node_t*	node;
+	ulint		sig_no;
+	trx_savept_t*	savept;
+
+	node = thr->run_node;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_ROLLBACK);
+	/* Entered from the parent node: start from the send state */
+	if (thr->prev_node == que_node_get_parent(node)) {
+		node->state = ROLL_NODE_SEND;
+	}
+
+	if (node->state == ROLL_NODE_SEND) {
+		mutex_enter(&kernel_mutex);
+
+		node->state = ROLL_NODE_WAIT;
+
+		if (node->partial) {
+			sig_no = TRX_SIG_ROLLBACK_TO_SAVEPT;
+			savept = &(node->savept);
+		} else {
+			sig_no = TRX_SIG_TOTAL_ROLLBACK;
+			savept = NULL;
+		}
+
+		/* Send a rollback signal to the transaction */
+
+		trx_sig_send(thr_get_trx(thr), sig_no, TRX_SIG_SELF, thr,
+			     savept, NULL);
+
+		thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+		mutex_exit(&kernel_mutex);
+
+		return(NULL);
+	}
+
+	ut_ad(node->state == ROLL_NODE_WAIT);
+	/* Not in the send state: pass control back to the parent node */
+	thr->run_node = que_node_get_parent(node);
+
+	return(thr);
+}
diff --git a/storage/xtradb/trx/trx0rseg.c b/storage/xtradb/trx/trx0rseg.c
new file mode 100644
index 00000000000..57b5611d624
--- /dev/null
+++ b/storage/xtradb/trx/trx0rseg.c
@@ -0,0 +1,324 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0rseg.c
+Rollback segment
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0rseg.h"
+
+#ifdef UNIV_NONINL
+#include "trx0rseg.ic"
+#endif
+
+#include "trx0undo.h"
+#include "fut0lst.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+
+/******************************************************************//**
+Walks the rseg list of the trx system until the segment with the given
+id is found; the id is expected to exist in the list.
+@return	rollback segment */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_get_on_id(
+/*===============*/
+	ulint	id)	/*!< in: rollback segment id */
+{
+	trx_rseg_t*	cur = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	for (;;) {
+		ut_ad(cur);
+
+		if (cur->id == id) {
+			return(cur);
+		}
+		cur = UT_LIST_GET_NEXT(rseg_list, cur);
+	}
+}
+
+/****************************************************************//**
+Creates a rollback segment header. This function is called only when
+a new rollback segment is created in the database. The caller must own
+kernel_mutex and have X-latched the space in mtr (checked with ut_ad).
+@return	page number of the created segment, FIL_NULL if fail */
+UNIV_INTERN
+ulint
+trx_rseg_header_create(
+/*===================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	max_size,	/*!< in: max size in pages */
+	ulint*	slot_no,	/*!< out: rseg id == slot number in trx sys */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	ulint		page_no;
+	trx_rsegf_t*	rsegf;
+	trx_sysf_t*	sys_header;
+	ulint		i;
+	buf_block_t*	block;
+
+	ut_ad(mtr);
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space, NULL),
+				MTR_MEMO_X_LOCK));
+	sys_header = trx_sysf_get(mtr);
+
+	*slot_no = trx_sysf_rseg_find_free(mtr);
+
+	if (*slot_no == ULINT_UNDEFINED) {
+		/* No free slot was found; nothing has been allocated yet */
+		return(FIL_NULL);
+	}
+
+	/* Allocate a new file segment for the rollback segment */
+	block = fseg_create(space, 0,
+			    TRX_RSEG + TRX_RSEG_FSEG_HEADER, mtr);
+
+	if (block == NULL) {
+		/* No space left */
+
+		return(FIL_NULL);
+	}
+
+	buf_block_dbg_add_level(block, SYNC_RSEG_HEADER_NEW);
+
+	page_no = buf_block_get_page_no(block);
+
+	/* Get the rollback segment file page */
+	rsegf = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+	/* Initialize max size field */
+	mlog_write_ulint(rsegf + TRX_RSEG_MAX_SIZE, max_size,
+			 MLOG_4BYTES, mtr);
+
+	/* Initialize the history list */
+
+	mlog_write_ulint(rsegf + TRX_RSEG_HISTORY_SIZE, 0, MLOG_4BYTES, mtr);
+	flst_init(rsegf + TRX_RSEG_HISTORY, mtr);
+
+	/* Reset the undo log slots */
+	for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+		trx_rsegf_set_nth_undo(rsegf, i, FIL_NULL, mtr);
+	}
+
+	/* Add the rollback segment info to the free slot in the trx system
+	header */
+
+	trx_sysf_rseg_set_space(sys_header, *slot_no, space, mtr);
+	trx_sysf_rseg_set_page_no(sys_header, *slot_no, page_no, mtr);
+
+	return(page_no);
+}
+
+/***********************************************************************//**
+Frees an in-memory rollback segment instance together with its cached
+undo log objects, and clears its slot in the trx system rseg array. */
+UNIV_INTERN
+void
+trx_rseg_mem_free(
+/*==============*/
+	trx_rseg_t*	rseg)	/*!< in, own: instance to free */
+{
+	trx_undo_t*	cached;
+
+	mutex_free(&rseg->mutex);
+
+	/* No transaction may still be using this rollback segment. */
+	ut_a(UT_LIST_GET_LEN(rseg->update_undo_list) == 0);
+	ut_a(UT_LIST_GET_LEN(rseg->insert_undo_list) == 0);
+
+	/* Release every cached update undo log object */
+
+	while ((cached = UT_LIST_GET_FIRST(rseg->update_undo_cached))
+	       != NULL) {
+
+		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, cached);
+
+		trx_undo_mem_free(cached);
+	}
+
+	/* Release every cached insert undo log object */
+
+	while ((cached = UT_LIST_GET_FIRST(rseg->insert_undo_cached))
+	       != NULL) {
+
+		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, cached);
+
+		trx_undo_mem_free(cached);
+	}
+
+	/* Clear the slot so the array no longer points at freed memory */
+	trx_sys_set_nth_rseg(trx_sys, rseg->id, NULL);
+
+	mem_free(rseg);
+}
+
+/***********************************************************************//**
+Creates and initializes a rollback segment object from the values in the
+segment header page, appends it to the rseg list of the trx system
+object and stores a pointer to it in the rseg array of the trx system
+object. The caller must own kernel_mutex (checked with ut_ad).
+@return	own: rollback segment object */
+static
+trx_rseg_t*
+trx_rseg_mem_create(
+/*================*/
+	ulint	id,		/*!< in: rollback segment id */
+	ulint	space,		/*!< in: space where the segment placed */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	page_no,	/*!< in: page number of the segment header */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	trx_rsegf_t*	rseg_header;
+	trx_rseg_t*	rseg;
+	trx_ulogf_t*	undo_log_hdr;
+	fil_addr_t	node_addr;
+	ulint		sum_of_undo_sizes;
+	ulint		len;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	rseg = mem_alloc(sizeof(trx_rseg_t));
+
+	rseg->id = id;
+	rseg->space = space;
+	rseg->zip_size = zip_size;
+	rseg->page_no = page_no;
+
+	mutex_create(&rseg->mutex, SYNC_RSEG);
+
+	/* Publish the object in the rseg list and the rseg array */
+	UT_LIST_ADD_LAST(rseg_list, trx_sys->rseg_list, rseg);
+	trx_sys_set_nth_rseg(trx_sys, id, rseg);
+
+	rseg_header = trx_rsegf_get_new(space, zip_size, page_no, mtr);
+
+	rseg->max_size = mtr_read_ulint(rseg_header + TRX_RSEG_MAX_SIZE,
+					MLOG_4BYTES, mtr);
+
+	/* Initialize the undo log lists according to the rseg header */
+
+	sum_of_undo_sizes = trx_undo_lists_init(rseg);
+
+	rseg->curr_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+					 MLOG_4BYTES, mtr)
+		+ 1 + sum_of_undo_sizes;
+	/* Note the last history log; without one only last_page_no is set */
+	len = flst_get_len(rseg_header + TRX_RSEG_HISTORY, mtr);
+	if (len > 0) {
+		trx_sys->rseg_history_len += len;
+
+		node_addr = trx_purge_get_log_from_hist(
+			flst_get_last(rseg_header + TRX_RSEG_HISTORY, mtr));
+		rseg->last_page_no = node_addr.page;
+		rseg->last_offset = node_addr.boffset;
+		/* Fetch trx no and del marks from its log header */
+		undo_log_hdr = trx_undo_page_get(rseg->space, rseg->zip_size,
+						 node_addr.page,
+						 mtr) + node_addr.boffset;
+
+		rseg->last_trx_no = mtr_read_dulint(
+			undo_log_hdr + TRX_UNDO_TRX_NO, mtr);
+		rseg->last_del_marks = mtr_read_ulint(
+			undo_log_hdr + TRX_UNDO_DEL_MARKS, MLOG_2BYTES, mtr);
+	} else {
+		rseg->last_page_no = FIL_NULL;
+	}
+
+	return(rseg);
+}
+
+/*********************************************************************//**
+Builds the in-memory rollback segment objects at a database startup:
+resets the rseg list and history length of trx_sys, then fills each
+slot of the rseg array from the trx system header. */
+UNIV_INTERN
+void
+trx_rseg_list_and_array_init(
+/*=========================*/
+	trx_sysf_t*	sys_header,	/*!< in: trx system header */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	ulint	slot;
+
+	UT_LIST_INIT(trx_sys->rseg_list);
+
+	trx_sys->rseg_history_len = 0;
+
+	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
+		ulint	hdr_page;
+		ulint	space_id;
+		ulint	zip_size;
+
+		hdr_page = trx_sysf_rseg_get_page_no(sys_header, slot, mtr);
+
+		if (hdr_page == FIL_NULL) {
+			/* No rollback segment in this slot */
+			trx_sys_set_nth_rseg(trx_sys, slot, NULL);
+			continue;
+		}
+
+		space_id = trx_sysf_rseg_get_space(sys_header, slot, mtr);
+		zip_size = space_id ? fil_space_get_zip_size(space_id) : 0;
+
+		trx_rseg_mem_create(slot, space_id, zip_size, hdr_page, mtr);
+	}
+}
+
+/****************************************************************//**
+Creates a new rollback segment in the database: creates its header
+in the given space and builds the matching in-memory object.
+@return	the created segment object, NULL if fail */
+UNIV_INTERN
+trx_rseg_t*
+trx_rseg_create(
+/*============*/
+	ulint	space,		/*!< in: space id */
+	ulint	max_size,	/*!< in: max size in pages */
+	ulint*	id,		/*!< out: rseg id */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	ulint		space_flags;
+	ulint		zip_size;
+	ulint		hdr_page_no;
+	trx_rseg_t*	new_rseg = NULL;
+
+	mtr_x_lock(fil_space_get_latch(space, &space_flags), mtr);
+	zip_size = dict_table_flags_to_zip_size(space_flags);
+	mutex_enter(&kernel_mutex);
+
+	hdr_page_no = trx_rseg_header_create(space, zip_size, max_size,
+					     id, mtr);
+
+	/* On failure new_rseg simply stays NULL */
+	if (hdr_page_no != FIL_NULL) {
+		new_rseg = trx_rseg_mem_create(*id, space, zip_size,
+					       hdr_page_no, mtr);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	return(new_rseg);
+}
diff --git a/storage/xtradb/trx/trx0sys.c b/storage/xtradb/trx/trx0sys.c
new file mode 100644
index 00000000000..11581a3f2ae
--- /dev/null
+++ b/storage/xtradb/trx/trx0sys.c
@@ -0,0 +1,1936 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0sys.c
+Transaction system
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0sys.h"
+
+#ifdef UNIV_NONINL
+#include "trx0sys.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+#include "fsp0fsp.h"
+#include "mtr0log.h"
+#include "mtr0log.h"
+#include "trx0trx.h"
+#include "trx0rseg.h"
+#include "trx0undo.h"
+#include "srv0srv.h"
+#include "trx0purge.h"
+#include "log0log.h"
+#include "log0recv.h"
+#include "os0file.h"
+#include "read0read.h"
+
+/** File format tag: a format id, its name, and a mutex guarding both. */
+struct file_format_struct {
+	ulint		id;		/*!< id of the file format */
+	const char*	name;		/*!< text representation of the
+					file format */
+	mutex_t		mutex;		/*!< covers changes to the above
+					fields */
+};
+
+/** The file format tag */
+typedef struct file_format_struct	file_format_t;
+
+/** The transaction system */
+UNIV_INTERN trx_sys_t*		trx_sys		= NULL;
+/** The doublewrite buffer */
+UNIV_INTERN trx_doublewrite_t*	trx_doublewrite = NULL;
+
+/** The following is set to TRUE when we are upgrading from pre-4.1
+format data files to the multiple tablespaces format data files */
+UNIV_INTERN ibool	trx_doublewrite_must_reset_space_ids	= FALSE;
+/** Set to TRUE when the doublewrite buffer is being created */
+UNIV_INTERN ibool	trx_doublewrite_buf_is_being_created = FALSE;
+
+/** The following is TRUE when we are using the database in the
+post-4.1 format, i.e., we have successfully upgraded, or have created
+a new database installation */
+UNIV_INTERN ibool	trx_sys_multiple_tablespace_format	= FALSE;
+
+/** In a MySQL replication slave, in crash recovery we store the master log
+file name and position here. */
+/* @{ */
+/** Master binlog file name */
+UNIV_INTERN char	trx_sys_mysql_master_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+/** Master binlog file position.  We have successfully got the updates
+up to this position.  -1 means that no crash recovery was needed, or
+there was no master log position info inside InnoDB.*/
+UNIV_INTERN ib_int64_t	trx_sys_mysql_master_log_pos	= -1;
+/* @} */
+/** Relay log file name and position; NOTE(review): usage not visible here */
+UNIV_INTERN char	trx_sys_mysql_relay_log_name[TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN];
+UNIV_INTERN ib_int64_t	trx_sys_mysql_relay_log_pos	= -1;
+
+/** If this MySQL server uses binary logging, after InnoDB has been inited
+and if it has done a crash recovery, we store the binlog file name and position
+here. */
+/* @{ */
+/** Binlog file name */
+UNIV_INTERN char	trx_sys_mysql_bin_log_name[TRX_SYS_MYSQL_LOG_NAME_LEN];
+/** Binlog file position, or -1 if unknown */
+UNIV_INTERN ib_int64_t	trx_sys_mysql_bin_log_pos	= -1;
+/* @} */
+#endif /* !UNIV_HOTBACKUP */
+
+/** List of animal names representing file format, one per letter A-Z. */
+static const char*	file_format_name_map[] = {
+	"Antelope",
+	"Barracuda",
+	"Cheetah",
+	"Dragon",
+	"Elk",
+	"Fox",
+	"Gazelle",
+	"Hornet",
+	"Impala",
+	"Jaguar",
+	"Kangaroo",
+	"Leopard",
+	"Moose",
+	"Nautilus",
+	"Ocelot",
+	"Porpoise",
+	"Quail",
+	"Rabbit",
+	"Shark",
+	"Tiger",
+	"Urchin",
+	"Viper",
+	"Whale",
+	"Xenops",
+	"Yak",
+	"Zebra"
+};
+
+/** The number of elements in the file format name array. */
+static const ulint	FILE_FORMAT_NAME_N
+	= sizeof(file_format_name_map) / sizeof(file_format_name_map[0]);
+
+#ifndef UNIV_HOTBACKUP
+/** This is used to track the maximum file format id known to InnoDB. It's
+updated via SET GLOBAL innodb_file_format_check = 'x' or when we open
+or create a table. */
+static	file_format_t	file_format_max;
+
+/****************************************************************//**
+Tells whether a page number lies in either block of the doublewrite buffer.
+@return TRUE if inside one of the two blocks; FALSE otherwise, or if
+the doublewrite buffer does not exist */
+UNIV_INTERN
+ibool
+trx_doublewrite_page_inside(
+/*========================*/
+	ulint	page_no)	/*!< in: page number */
+{
+	ulint	first_start;
+	ulint	second_start;
+
+	if (trx_doublewrite == NULL) {
+		return(FALSE);
+	}
+
+	first_start = trx_doublewrite->block1;
+	second_start = trx_doublewrite->block2;
+
+	if ((page_no >= first_start
+	     && page_no < first_start + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+	    || (page_no >= second_start
+		&& page_no < second_start + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)) {
+		return(TRUE);
+	}
+
+	return(FALSE);
+}
+
+/****************************************************************//**
+Creates or initializes the doublewrite buffer at a database start. */
+static
+void
+trx_doublewrite_init(
+/*=================*/
+	byte*	doublewrite)	/*!< in: pointer to the doublewrite buf
+				header on trx sys page */
+{
+	trx_doublewrite_t*	dblwr;
+	const ulint		n_pages = 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+
+	trx_doublewrite = dblwr = mem_alloc(sizeof(trx_doublewrite_t));
+
+	/* Since we now start to use the doublewrite buffer, no need to call
+	fsync() after every write to a data file */
+#ifdef UNIV_DO_FLUSH
+	os_do_not_call_flush_at_each_write = TRUE;
+#endif /* UNIV_DO_FLUSH */
+
+	mutex_create(&trx_doublewrite->mutex, SYNC_DOUBLEWRITE);
+
+	dblwr->first_free = 0;
+	dblwr->block1 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK1);
+	dblwr->block2 = mach_read_from_4(
+		doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2);
+	/* One extra page leaves room for aligning write_buf */
+	dblwr->write_buf_unaligned = ut_malloc((1 + n_pages) * UNIV_PAGE_SIZE);
+	dblwr->write_buf = ut_align(dblwr->write_buf_unaligned,
+				    UNIV_PAGE_SIZE);
+	dblwr->buf_block_arr = mem_alloc(n_pages * sizeof(void*));
+}
+
+/****************************************************************//**
+Records in the trx sys header that the upgrade to the >= 4.1.x multiple
+tablespace format has been completed, then makes a checkpoint. */
+UNIV_INTERN
+void
+trx_sys_mark_upgraded_to_multiple_tablespaces(void)
+/*===============================================*/
+{
+	mtr_t		mtr;
+	buf_block_t*	sys_block;
+	byte*		dblwr_header;
+
+	/* The space id fields in the doublewrite buffer were reset as part
+	of the upgrade to 4.1.x. Stamp the trx_sys header to record that
+	the upgrade has been done. */
+
+	mtr_start(&mtr);
+
+	sys_block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+				 RW_X_LATCH, &mtr);
+	buf_block_dbg_add_level(sys_block, SYNC_NO_ORDER_CHECK);
+
+	dblwr_header = TRX_SYS_DOUBLEWRITE + buf_block_get_frame(sys_block);
+
+	mlog_write_ulint(TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED + dblwr_header,
+			 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+			 MLOG_4BYTES, &mtr);
+	mtr_commit(&mtr);
+
+	/* Flush the modified pages to disk and make a checkpoint */
+	log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+	trx_sys_multiple_tablespace_format = TRUE;
+}
+
+/****************************************************************//**
+Creates the doublewrite buffer to a new InnoDB installation. The header of the
+doublewrite buffer is placed on the trx system header page. If
+srv_doublewrite_file is set, the same is then done in TRX_DOUBLEWRITE_SPACE. */
+UNIV_INTERN
+void
+trx_sys_create_doublewrite_buf(void)
+/*================================*/
+{
+	buf_block_t*	block;
+	buf_block_t*	block2;
+	buf_block_t*	new_block;
+	byte*	doublewrite;
+	byte*	fseg_header;
+	ulint	page_no;
+	ulint	prev_page_no;
+	ulint	i;
+	mtr_t	mtr;
+
+	if (trx_doublewrite) {
+		/* Already inited */
+		return;
+	}
+
+start_again:
+	mtr_start(&mtr);
+	trx_doublewrite_buf_is_being_created = TRUE;
+
+	block = buf_page_get(TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO,
+			     RW_X_LATCH, &mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has already been created:
+		just read in some numbers */
+
+		trx_doublewrite_init(doublewrite);
+
+		mtr_commit(&mtr);
+		trx_doublewrite_buf_is_being_created = FALSE;
+	} else {
+		fprintf(stderr,
+			"InnoDB: Doublewrite buffer not found:"
+			" creating new\n");
+
+		if (buf_pool_get_curr_size()
+		    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+			+ FSP_EXTENT_SIZE / 2 + 100)
+		       * UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Cannot create doublewrite buffer:"
+				" you must\n"
+				"InnoDB: increase your buffer pool size.\n"
+				"InnoDB: Cannot continue operation.\n");
+
+			exit(1);
+		}
+
+		block2 = fseg_create(TRX_SYS_SPACE, TRX_SYS_PAGE_NO,
+				     TRX_SYS_DOUBLEWRITE
+				     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+		if (block2 == NULL) {
+			fprintf(stderr,
+				"InnoDB: Cannot create doublewrite buffer:"
+				" you must\n"
+				"InnoDB: increase your tablespace size.\n"
+				"InnoDB: Cannot continue operation.\n");
+
+			/* We exit without committing the mtr to prevent
+			its modifications to the database getting to disk */
+
+			exit(1);
+		}
+
+		/* fseg_create acquires a second latch on the page,
+		therefore we must declare it (block2 is non-NULL here): */
+
+		buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+		fseg_header = buf_block_get_frame(block)
+			+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+		prev_page_no = 0;
+
+		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+			     + FSP_EXTENT_SIZE / 2; i++) {
+			page_no = fseg_alloc_free_page(fseg_header,
+						       prev_page_no + 1,
+						       FSP_UP, &mtr);
+			if (page_no == FIL_NULL) {
+				fprintf(stderr,
+					"InnoDB: Cannot create doublewrite"
+					" buffer: you must\n"
+					"InnoDB: increase your"
+					" tablespace size.\n"
+					"InnoDB: Cannot continue operation.\n"
+					);
+
+				exit(1);
+			}
+
+			/* We read the allocated pages to the buffer pool;
+			when they are written to disk in a flush, the space
+			id and page number fields are also written to the
+			pages. When we at database startup read pages
+			from the doublewrite buffer, we know that if the
+			space id and page number in them are the same as
+			the page position in the tablespace, then the page
+			has not been written to in doublewrite. */
+
+			new_block = buf_page_get(TRX_SYS_SPACE, 0, page_no,
+						 RW_X_LATCH, &mtr);
+			buf_block_dbg_add_level(new_block,
+						SYNC_NO_ORDER_CHECK);
+
+			if (i == FSP_EXTENT_SIZE / 2) {
+				ut_a(page_no == FSP_EXTENT_SIZE);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+						 page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_REPEAT
+						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+						 page_no, MLOG_4BYTES, &mtr);
+			} else if (i == FSP_EXTENT_SIZE / 2
+				   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+						 page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_REPEAT
+						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+						 page_no, MLOG_4BYTES, &mtr);
+			} else if (i > FSP_EXTENT_SIZE / 2) {
+				ut_a(page_no == prev_page_no + 1);
+			}
+
+			prev_page_no = page_no;
+		}
+
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+				 MLOG_4BYTES, &mtr);
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+				 + TRX_SYS_DOUBLEWRITE_REPEAT,
+				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+				 MLOG_4BYTES, &mtr);
+
+		mlog_write_ulint(doublewrite
+				 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+				 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+				 MLOG_4BYTES, &mtr);
+		mtr_commit(&mtr);
+
+		/* Flush the modified pages to disk and make a checkpoint */
+		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+		fprintf(stderr, "InnoDB: Doublewrite buffer created\n");
+
+		trx_sys_multiple_tablespace_format = TRUE;
+
+		goto start_again;
+	}
+
+    if (srv_doublewrite_file) {
+	/* the same doublewrite buffer to TRX_SYS_SPACE should exist.
+	check and create if not exist.*/
+
+	mtr_start(&mtr);
+	trx_doublewrite_buf_is_being_created = TRUE;
+
+	block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, TRX_SYS_PAGE_NO,
+			     RW_X_LATCH, &mtr);
+	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);
+
+	doublewrite = buf_block_get_frame(block) + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer already exists in the
+		doublewrite file: nothing to create */
+
+		mtr_commit(&mtr);
+	} else {
+		fprintf(stderr,
+			"InnoDB: Doublewrite buffer not found in the doublewrite file:"
+			" creating new\n");
+
+		if (buf_pool_get_curr_size()
+		    < ((2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+			+ FSP_EXTENT_SIZE / 2 + 100)
+		       * UNIV_PAGE_SIZE)) {
+			fprintf(stderr,
+				"InnoDB: Cannot create doublewrite buffer:"
+				" you must\n"
+				"InnoDB: increase your buffer pool size.\n"
+				"InnoDB: Cannot continue operation.\n");
+
+			exit(1);
+		}
+
+		block2 = fseg_create(TRX_DOUBLEWRITE_SPACE, TRX_SYS_PAGE_NO,
+				     TRX_SYS_DOUBLEWRITE
+				     + TRX_SYS_DOUBLEWRITE_FSEG, &mtr);
+
+		if (block2 == NULL) {
+			fprintf(stderr,
+				"InnoDB: Cannot create doublewrite buffer:"
+				" you must\n"
+				"InnoDB: increase your tablespace size.\n"
+				"InnoDB: Cannot continue operation.\n");
+
+			/* We exit without committing the mtr to prevent
+			its modifications to the database getting to disk */
+
+			exit(1);
+		}
+
+		/* fseg_create acquires a second latch on the page,
+		therefore we must declare it (block2 is non-NULL here): */
+
+		buf_block_dbg_add_level(block2, SYNC_NO_ORDER_CHECK);
+
+		fseg_header = buf_block_get_frame(block)
+			+ TRX_SYS_DOUBLEWRITE + TRX_SYS_DOUBLEWRITE_FSEG;
+		prev_page_no = 0;
+
+		for (i = 0; i < 2 * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE
+			     + FSP_EXTENT_SIZE / 2; i++) {
+			page_no = fseg_alloc_free_page(fseg_header,
+						       prev_page_no + 1,
+						       FSP_UP, &mtr);
+			if (page_no == FIL_NULL) {
+				fprintf(stderr,
+					"InnoDB: Cannot create doublewrite"
+					" buffer: you must\n"
+					"InnoDB: increase your"
+					" tablespace size.\n"
+					"InnoDB: Cannot continue operation.\n"
+					);
+
+				exit(1);
+			}
+
+			/* We read the allocated pages to the buffer pool;
+			when they are written to disk in a flush, the space
+			id and page number fields are also written to the
+			pages. When we at database startup read pages
+			from the doublewrite buffer, we know that if the
+			space id and page number in them are the same as
+			the page position in the tablespace, then the page
+			has not been written to in doublewrite. */
+
+			new_block = buf_page_get(TRX_DOUBLEWRITE_SPACE, 0, page_no,
+						 RW_X_LATCH, &mtr);
+			buf_block_dbg_add_level(new_block,
+						SYNC_NO_ORDER_CHECK);
+
+			if (i == FSP_EXTENT_SIZE / 2) {
+				ut_a(page_no == FSP_EXTENT_SIZE);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+						 page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_REPEAT
+						 + TRX_SYS_DOUBLEWRITE_BLOCK1,
+						 page_no, MLOG_4BYTES, &mtr);
+			} else if (i == FSP_EXTENT_SIZE / 2
+				   + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				ut_a(page_no == 2 * FSP_EXTENT_SIZE);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+						 page_no, MLOG_4BYTES, &mtr);
+				mlog_write_ulint(doublewrite
+						 + TRX_SYS_DOUBLEWRITE_REPEAT
+						 + TRX_SYS_DOUBLEWRITE_BLOCK2,
+						 page_no, MLOG_4BYTES, &mtr);
+			} else if (i > FSP_EXTENT_SIZE / 2) {
+				ut_a(page_no == prev_page_no + 1);
+			}
+
+			prev_page_no = page_no;
+		}
+
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC,
+				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+				 MLOG_4BYTES, &mtr);
+		mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC
+				 + TRX_SYS_DOUBLEWRITE_REPEAT,
+				 TRX_SYS_DOUBLEWRITE_MAGIC_N,
+				 MLOG_4BYTES, &mtr);
+
+		mlog_write_ulint(doublewrite
+				 + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED,
+				 TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N,
+				 MLOG_4BYTES, &mtr);
+		mtr_commit(&mtr);
+
+		/* Flush the modified pages to disk and make a checkpoint */
+		log_make_checkpoint_at(IB_ULONGLONG_MAX, TRUE);
+
+		fprintf(stderr, "InnoDB: Doublewrite buffer created in the doublewrite file\n");
+		trx_sys_multiple_tablespace_format = TRUE;
+	}
+	trx_doublewrite_buf_is_being_created = FALSE;
+    }
+}
+
+/****************************************************************//**
+At a database startup initializes the doublewrite buffer memory structure if
+we already have a doublewrite buffer created in the data files. If we are
+upgrading to an InnoDB version which supports multiple tablespaces, then this
+function performs the necessary update operations. If we are in a crash
+recovery, this function uses a possible doublewrite buffer to restore
+half-written pages in the data files. */
+UNIV_INTERN
+void
+trx_sys_doublewrite_init_or_restore_pages(
+/*======================================*/
+	ibool	restore_corrupt_pages)	/*!< in: TRUE=restore pages */
+{
+	byte*	buf;
+	byte*	read_buf;
+	byte*	unaligned_read_buf;
+	ulint	block1;
+	ulint	block2;
+	ulint	source_page_no;
+	byte*	page;
+	byte*	doublewrite;
+	ulint	doublewrite_space_id;
+	ulint	space_id;
+	ulint	page_no;
+	ulint	i;
+
+	doublewrite_space_id = (srv_doublewrite_file ? TRX_DOUBLEWRITE_SPACE : TRX_SYS_SPACE);
+
+	if (srv_doublewrite_file) {
+		fprintf(stderr,
+			"InnoDB: doublewrite file '%s' is used.\n",
+			srv_doublewrite_file);
+	}
+
+	/* We do the file i/o past the buffer pool */
+
+	unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE);
+	read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE);
+
+	/* Read the trx sys header to check if we are using the doublewrite
+	buffer */
+
+	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, TRX_SYS_PAGE_NO, 0,
+	       UNIV_PAGE_SIZE, read_buf, NULL);
+	doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
+	    == TRX_SYS_DOUBLEWRITE_MAGIC_N) {
+		/* The doublewrite buffer has been created */
+
+		trx_doublewrite_init(doublewrite);
+
+		block1 = trx_doublewrite->block1;
+		block2 = trx_doublewrite->block2;
+
+		buf = trx_doublewrite->write_buf;
+	} else {
+		goto leave_func;
+	}
+
+	if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED)
+	    != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) {
+
+		/* We are upgrading from a version < 4.1.x to a version where
+		multiple tablespaces are supported. We must reset the space id
+		field in the pages in the doublewrite buffer because starting
+		from this version the space id is stored to
+		FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. */
+
+		trx_doublewrite_must_reset_space_ids = TRUE;
+
+		fprintf(stderr,
+			"InnoDB: Resetting space id's in the"
+			" doublewrite buffer\n");
+	} else {
+		trx_sys_multiple_tablespace_format = TRUE;
+	}
+
+	/* Read the pages from the doublewrite buffer to memory */
+
+	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block1, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf, NULL);
+	fil_io(OS_FILE_READ, TRUE, doublewrite_space_id, 0, block2, 0,
+	       TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
+	       NULL);
+	/* Check if any of these pages is half-written in data files, in the
+	intended position */
+
+	page = buf;
+
+	for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) {
+
+		page_no = mach_read_from_4(page + FIL_PAGE_OFFSET);
+
+		if (trx_doublewrite_must_reset_space_ids) {
+
+			space_id = 0;
+			mach_write_to_4(page
+					+ FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0);
+			/* We do not need to calculate new checksums for the
+			pages because the field .._SPACE_ID does not affect
+			them. Write the page back to where we read it from. */
+
+			if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
+				source_page_no = block1 + i;
+			} else {
+				source_page_no = block2
+					+ i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE;
+			}
+
+			fil_io(OS_FILE_WRITE, TRUE, doublewrite_space_id, 0,
+			       source_page_no, 0, UNIV_PAGE_SIZE, page, NULL);
+			/* printf("Resetting space id in page %lu\n",
+			source_page_no); */
+		} else {
+			space_id = mach_read_from_4(
+				page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+		}
+
+		if (!restore_corrupt_pages) {
+			/* The database was shut down gracefully: no need to
+			restore pages */
+
+		} else if (!fil_tablespace_exists_in_mem(space_id)) {
+			/* Maybe we have dropped the single-table tablespace
+			and this page once belonged to it: do nothing */
+
+		} else if (!fil_check_adress_in_tablespace(space_id,
+							   page_no)) {
+			fprintf(stderr,
+				"InnoDB: Warning: a page in the"
+				" doublewrite buffer is not within space\n"
+				"InnoDB: bounds; space id %lu"
+				" page number %lu, page %lu in"
+				" doublewrite buf.\n",
+				(ulong) space_id, (ulong) page_no, (ulong) i);
+
+		} else if ((space_id == TRX_SYS_SPACE
+			    || (srv_doublewrite_file && space_id == TRX_DOUBLEWRITE_SPACE))
+			   && ((page_no >= block1
+				&& page_no
+				< block1 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)
+			       || (page_no >= block2
+				   && page_no
+				   < (block2
+				      + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE)))) {
+
+			/* It is an unwritten doublewrite buffer page:
+			do nothing */
+		} else {
+			ulint	zip_size = fil_space_get_zip_size(space_id);
+
+			/* Read in the actual page from the file */
+			fil_io(OS_FILE_READ, TRUE, space_id, zip_size,
+			       page_no, 0,
+			       zip_size ? zip_size : UNIV_PAGE_SIZE,
+			       read_buf, NULL);
+
+			if (srv_recovery_stats && recv_recovery_is_on()) {
+				mutex_enter(&(recv_sys->mutex));
+				recv_sys->stats_doublewrite_check_pages++;
+				mutex_exit(&(recv_sys->mutex));
+			}
+
+			/* Check if the page is corrupt */
+
+			if (UNIV_UNLIKELY
+			    (buf_page_is_corrupted(read_buf, zip_size))) {
+
+				fprintf(stderr,
+					"InnoDB: Warning: database page"
+					" corruption or a failed\n"
+					"InnoDB: file read of"
+					" space %lu page %lu.\n"
+					"InnoDB: Trying to recover it from"
+					" the doublewrite buffer.\n",
+					(ulong) space_id, (ulong) page_no);
+
+				if (buf_page_is_corrupted(page, zip_size)) {
+					fprintf(stderr,
+						"InnoDB: Dump of the page:\n");
+					buf_page_print(read_buf, zip_size);
+					fprintf(stderr,
+						"InnoDB: Dump of"
+						" corresponding page"
+						" in doublewrite buffer:\n");
+					buf_page_print(page, zip_size);
+
+					fprintf(stderr,
+						"InnoDB: Also the page in the"
+						" doublewrite buffer"
+						" is corrupt.\n"
+						"InnoDB: Cannot continue"
+						" operation.\n"
+						"InnoDB: You can try to"
+						" recover the database"
+						" with the my.cnf\n"
+						"InnoDB: option:\n"
+						"InnoDB:"
+						" innodb_force_recovery=6\n");
+					exit(1);
+				}
+
+				/* Write the good page from the
+				doublewrite buffer to the intended
+				position */
+
+				fil_io(OS_FILE_WRITE, TRUE, space_id,
+				       zip_size, page_no, 0,
+				       zip_size ? zip_size : UNIV_PAGE_SIZE,
+				       page, NULL);
+
+				if (srv_recovery_stats && recv_recovery_is_on()) {
+					mutex_enter(&(recv_sys->mutex));
+					recv_sys->stats_doublewrite_overwrite_pages++;
+					mutex_exit(&(recv_sys->mutex));
+				}
+
+				fprintf(stderr,
+					"InnoDB: Recovered the page from"
+					" the doublewrite buffer.\n");
+			}
+		}
+
+		page += UNIV_PAGE_SIZE;
+	}
+
+	fil_flush_file_spaces(FIL_TABLESPACE);
+
+leave_func:
+	ut_free(unaligned_read_buf);
+}
+
+/****************************************************************//**
+Tests whether a transaction object is linked into trx_sys->trx_list.
+The caller is expected to own the kernel mutex (checked with ut_ad).
+@return	TRUE if is in */
+UNIV_INTERN
+ibool
+trx_in_trx_list(
+/*============*/
+	trx_t*	in_trx)	/*!< in: trx */
+{
+	trx_t*	cur;
+	ibool	found	= FALSE;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+
+	/* Linear scan: stop at the first node that is in_trx itself */
+	for (cur = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     cur != NULL;
+	     cur = UT_LIST_GET_NEXT(trx_list, cur)) {
+		if (cur == in_trx) {
+			found = TRUE;
+			break;
+		}
+	}
+
+	return(found);
+}
+
+/*****************************************************************//**
+Writes the value of max_trx_id to the file based trx system header. */
+UNIV_INTERN
+void
+trx_sys_flush_max_trx_id(void)
+/*==========================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	/* The header update runs in its own mini-transaction */
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+	/* Copy trx_sys->max_trx_id into the TRX_SYS_TRX_ID_STORE field */
+	mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+			  trx_sys->max_trx_id, &mtr);
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Updates the offset information about the end of the MySQL binlog entry
+which corresponds to the transaction just being committed. In a MySQL
+replication slave updates the latest master binlog position up to which
+replication has proceeded. */
+UNIV_INTERN
+void
+trx_sys_update_mysql_binlog_offset(
+/*===============================*/
+	trx_sysf_t*	sys_header,/*!< in: trx system header */
+	const char*	file_name_in,/*!< in: MySQL log file name */
+	ib_int64_t	offset,	/*!< in: position in that log file */
+	ulint		field,	/*!< in: offset of the MySQL log info field in
+				the trx sys header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	const char*	file_name;
+
+	if (ut_strlen(file_name_in) >= TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN) {
+
+		/* We cannot fit the name to the 512 bytes we have reserved */
+		/* -> To store relay log file information, file_name must fit to the 480 bytes */
+		/* Too long: an empty name is stored instead */
+		file_name = "";
+	}
+	else {
+		file_name = file_name_in;
+	}
+
+	if (mach_read_from_4(sys_header + field
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+		/* The magic number is written only when it is not there yet */
+		mlog_write_ulint(sys_header + field
+				 + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD,
+				 TRX_SYS_MYSQL_LOG_MAGIC_N,
+				 MLOG_4BYTES, mtr);
+	}
+
+	if (0 != strcmp((char*) (sys_header + field + TRX_SYS_MYSQL_LOG_NAME),
+			file_name)) {
+		/* The name, with its terminating NUL, is rewritten on change */
+		mlog_write_string(sys_header + field
+				  + TRX_SYS_MYSQL_LOG_NAME,
+				  (byte*) file_name, 1 + ut_strlen(file_name),
+				  mtr);
+	}
+
+	if (mach_read_from_4(sys_header + field
+			     + TRX_SYS_MYSQL_LOG_OFFSET_HIGH) > 0
+	    || (offset >> 32) > 0) {
+		/* The high 32 bits are written only if they are, or were, > 0 */
+		mlog_write_ulint(sys_header + field
+				 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH,
+				 (ulint)(offset >> 32),
+				 MLOG_4BYTES, mtr);
+	}
+
+	mlog_write_ulint(sys_header + field
+			 + TRX_SYS_MYSQL_LOG_OFFSET_LOW,
+			 (ulint)(offset & 0xFFFFFFFFUL),
+			 MLOG_4BYTES, mtr);
+}
+
+/*****************************************************************//**
+Reads the MySQL binlog offset info from the trx system header if the
+magic number shows it valid, copies it to trx_sys_mysql_bin_log_pos and
+trx_sys_mysql_bin_log_name, and prints the info to stderr */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset(void)
+/*===================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+	ulint		trx_sys_mysql_bin_log_pos_high;
+	ulint		trx_sys_mysql_bin_log_pos_low;
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_LOG_INFO
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+		mtr_commit(&mtr);
+		return;
+	}
+
+	trx_sys_mysql_bin_log_pos_high = mach_read_from_4(
+		sys_header + TRX_SYS_MYSQL_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_OFFSET_HIGH);
+	trx_sys_mysql_bin_log_pos_low = mach_read_from_4(
+		sys_header + TRX_SYS_MYSQL_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_OFFSET_LOW);
+
+	trx_sys_mysql_bin_log_pos
+		= (((ib_int64_t)trx_sys_mysql_bin_log_pos_high) << 32)
+		+ (ib_int64_t)trx_sys_mysql_bin_log_pos_low;
+
+	ut_memcpy(trx_sys_mysql_bin_log_name,
+		  sys_header + TRX_SYS_MYSQL_LOG_INFO
+		  + TRX_SYS_MYSQL_LOG_NAME, TRX_SYS_MYSQL_LOG_NAME_LEN);
+	/* ulint need not be unsigned long: cast the values to match %lu */
+	fprintf(stderr,
+		"InnoDB: Last MySQL binlog file position %lu %lu,"
+		" file name %s\n",
+		(ulong) trx_sys_mysql_bin_log_pos_high,
+		(ulong) trx_sys_mysql_bin_log_pos_low,
+		trx_sys_mysql_bin_log_name);
+
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Prints to stderr the MySQL master and relay log offset info in the trx system
+header if the master log magic number is valid, and copies it to globals. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_master_log_pos(void)
+/*====================================*/
+{
+	trx_sysf_t*	sys_header;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	sys_header = trx_sysf_get(&mtr);
+
+	if (mach_read_from_4(sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			     + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	fprintf(stderr,
+		"InnoDB: In a MySQL replication slave the last"
+		" master binlog file\n"
+		"InnoDB: position %lu %lu, file name %s\n",
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_MASTER_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_NAME);
+	/* NOTE(review): relay info is read without its own magic check */
+	fprintf(stderr,
+		"InnoDB: and relay log file\n"
+		"InnoDB: position %lu %lu, file name %s\n",
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_RELAY_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		(ulong) mach_read_from_4(sys_header
+					 + TRX_SYS_MYSQL_RELAY_LOG_INFO
+					 + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+		+ TRX_SYS_MYSQL_LOG_NAME);
+
+	/* Copy the master log position info to global variables we can
+	use in ha_innobase.cc to initialize glob_mi to right values */
+
+	ut_memcpy(trx_sys_mysql_master_log_name,
+		  sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+		  + TRX_SYS_MYSQL_LOG_NAME,
+		  TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+
+	trx_sys_mysql_master_log_pos
+		= (((ib_int64_t) mach_read_from_4(
+			    sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			    + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+		+ ((ib_int64_t) mach_read_from_4(
+			   sys_header + TRX_SYS_MYSQL_MASTER_LOG_INFO
+			   + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+	/* The relay log name and position are copied the same way */
+	ut_memcpy(trx_sys_mysql_relay_log_name,
+		  sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+		  + TRX_SYS_MYSQL_LOG_NAME,
+		  TRX_SYS_MYSQL_MASTER_LOG_NAME_LEN);
+
+	trx_sys_mysql_relay_log_pos
+		= (((ib_int64_t) mach_read_from_4(
+			    sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+			    + TRX_SYS_MYSQL_LOG_OFFSET_HIGH)) << 32)
+		+ ((ib_int64_t) mach_read_from_4(
+			   sys_header + TRX_SYS_MYSQL_RELAY_LOG_INFO
+			   + TRX_SYS_MYSQL_LOG_OFFSET_LOW));
+	mtr_commit(&mtr);
+}
+
+/****************************************************************//**
+Scans the rollback segment slots of the trx system file copy for the
+first slot whose page number is FIL_NULL, i.e. a free slot.
+@return	slot index or ULINT_UNDEFINED if not found */
+UNIV_INTERN
+ulint
+trx_sysf_rseg_find_free(
+/*====================*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	trx_sysf_t*	header;
+	ulint		rseg_page;
+	ulint		slot;
+
+	ut_ad(mutex_own(&(kernel_mutex)));
+
+	header = trx_sysf_get(mtr);
+
+	/* A slot is free when no page number has been stored in it */
+	for (slot = 0; slot < TRX_SYS_N_RSEGS; slot++) {
+		rseg_page = trx_sysf_rseg_get_page_no(header, slot, mtr);
+
+		if (rseg_page == FIL_NULL) {
+			return(slot);
+		}
+	}
+
+	return(ULINT_UNDEFINED);
+}
+
+/*****************************************************************//**
+Creates the file page for the transaction system. This function is called only
+at the database creation, before trx_sys_init. */
+static
+void
+trx_sysf_create(
+/*============*/
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	trx_sysf_t*	sys_header;
+	ulint		slot_no;
+	buf_block_t*	block;
+	page_t*		page;
+	ulint		page_no;
+	ulint		i;
+
+	ut_ad(mtr);
+
+	/* Note that below we first reserve the file space x-latch, and
+	then enter the kernel: we must do it in this order to conform
+	to the latching order rules. */
+
+	mtr_x_lock(fil_space_get_latch(TRX_SYS_SPACE, NULL), mtr);
+	mutex_enter(&kernel_mutex);
+
+	/* Create the trx sys file block in a newly allocated file segment */
+	block = fseg_create(TRX_SYS_SPACE, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+			    mtr);
+	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+	/* The block just created must be the trx system page itself */
+	ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+			 MLOG_2BYTES, mtr);
+
+	/* Reset the doublewrite buffer magic number to zero so that we
+	know that the doublewrite buffer has not yet been created (this
+	suppresses a Valgrind warning) */
+
+	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+	sys_header = trx_sysf_get(mtr);
+
+	/* Start counting transaction ids from number 1 up */
+	mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+			  ut_dulint_create(0, 1), mtr);
+
+	/* Reset the rollback segment slots */
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+		trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+		trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+	}
+
+	/* The remaining area (up to the page trailer) is uninitialized.
+	Silence Valgrind warnings about it. */
+	UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+				     + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+				     + TRX_SYS_RSEG_SPACE),
+		       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+			- (TRX_SYS_RSEGS
+			   + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+			   + TRX_SYS_RSEG_SPACE))
+		       + page - sys_header);
+
+	/* Create the first rollback segment in the SYSTEM tablespace */
+	page_no = trx_rseg_header_create(TRX_SYS_SPACE, 0, ULINT_MAX, &slot_no,
+					 mtr);
+	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+	ut_a(page_no != FIL_NULL);
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Creates a dummy trx system file page in the given tablespace. */
+static
+void
+trx_sysf_dummy_create(
+/*==================*/
+	ulint	space,	/*!< in: space id */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	buf_block_t*	block;
+	page_t*		page;
+
+	ut_ad(mtr);
+
+	/* Note that below we first reserve the file space x-latch, and
+	then enter the kernel: we must do it in this order to conform
+	to the latching order rules. */
+
+	mtr_x_lock(fil_space_get_latch(space, NULL), mtr);
+	mutex_enter(&kernel_mutex);
+
+	/* Create the trx sys file block in a newly allocated file segment */
+	block = fseg_create(space, 0, TRX_SYS + TRX_SYS_FSEG_HEADER,
+			    mtr);
+	buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER);
+	/* Trace the page number; ulint is cast because %lu expects ulong */
+	fprintf(stderr, "%lu\n", (ulong) buf_block_get_page_no(block));
+	ut_a(buf_block_get_page_no(block) == TRX_SYS_PAGE_NO);
+
+	page = buf_block_get_frame(block);
+
+	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_TYPE_TRX_SYS,
+			 MLOG_2BYTES, mtr);
+
+	/* Reset the doublewrite buffer magic number to zero so that we
+	know that the doublewrite buffer has not yet been created (this
+	suppresses a Valgrind warning) */
+
+	mlog_write_ulint(page + TRX_SYS_DOUBLEWRITE
+			 + TRX_SYS_DOUBLEWRITE_MAGIC, 0, MLOG_4BYTES, mtr);
+
+#ifdef UNDEFINED
+	/* TODO: REMOVE IT: The below is not needed, I think */
+	sys_header = trx_sysf_get(mtr);
+
+	/* Start counting transaction ids from number 1 up */
+	mlog_write_dulint(sys_header + TRX_SYS_TRX_ID_STORE,
+			  ut_dulint_create(0, 1), mtr);
+
+	/* Reset the rollback segment slots */
+	for (i = 0; i < TRX_SYS_N_RSEGS; i++) {
+
+		trx_sysf_rseg_set_space(sys_header, i, ULINT_UNDEFINED, mtr);
+		trx_sysf_rseg_set_page_no(sys_header, i, FIL_NULL, mtr);
+	}
+
+	/* The remaining area (up to the page trailer) is uninitialized.
+	Silence Valgrind warnings about it. */
+	UNIV_MEM_VALID(sys_header + (TRX_SYS_RSEGS
+				     + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+				     + TRX_SYS_RSEG_SPACE),
+		       (UNIV_PAGE_SIZE - FIL_PAGE_DATA_END
+			- (TRX_SYS_RSEGS
+			   + TRX_SYS_N_RSEGS * TRX_SYS_RSEG_SLOT_SIZE
+			   + TRX_SYS_RSEG_SPACE))
+		       + page - sys_header);
+
+	/* Create the first rollback segment in the SYSTEM tablespace */
+	page_no = trx_rseg_header_create(space, 0, ULINT_MAX, &slot_no,
+					 mtr);
+	ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID);
+	ut_a(page_no != FIL_NULL);
+#endif
+
+	mutex_exit(&kernel_mutex);
+}
+
+/*****************************************************************//**
+Creates and initializes the central memory structures for the transaction
+system. This is called when the database is started. */
+UNIV_INTERN
+void
+trx_sys_init_at_db_start(void)
+/*==========================*/
+{
+	trx_sysf_t*	sys_header;
+	ib_int64_t	rows_to_undo	= 0;
+	const char*	unit		= "";
+	trx_t*		trx;
+	mtr_t		mtr;
+
+	mtr_start(&mtr);
+
+	ut_ad(trx_sys == NULL);
+
+	mutex_enter(&kernel_mutex);
+
+	trx_sys = mem_alloc(sizeof(trx_sys_t));
+
+	sys_header = trx_sysf_get(&mtr);
+
+	trx_rseg_list_and_array_init(sys_header, &mtr);
+
+	trx_sys->latest_rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	/* VERY important: after the database is started, max_trx_id value is
+	divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the 'if' in
+	trx_sys_get_new_trx_id will evaluate to TRUE when the function
+	is first time called, and the value for trx id will be written
+	to the disk-based header! Thus trx id values will not overlap when
+	the database is repeatedly started! */
+
+	trx_sys->max_trx_id = ut_dulint_add(
+		ut_dulint_align_up(mtr_read_dulint(
+					   sys_header
+					   + TRX_SYS_TRX_ID_STORE, &mtr),
+				   TRX_SYS_TRX_ID_WRITE_MARGIN),
+		2 * TRX_SYS_TRX_ID_WRITE_MARGIN);
+
+	UT_LIST_INIT(trx_sys->mysql_trx_list);
+	trx_dummy_sess = sess_open();
+	trx_lists_init_at_db_start();
+	/* Sum the undo numbers of all listed trxs that are not prepared */
+	if (UT_LIST_GET_LEN(trx_sys->trx_list) > 0) {
+		trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+		for (;;) {
+
+			if ( trx->conc_state != TRX_PREPARED) {
+				rows_to_undo += ut_conv_dulint_to_longlong(
+					trx->undo_no);
+			}
+
+			trx = UT_LIST_GET_NEXT(trx_list, trx);
+
+			if (!trx) {
+				break;
+			}
+		}
+		/* Above 10^9 rows the total is reported in millions ("M") */
+		if (rows_to_undo > 1000000000) {
+			unit = "M";
+			rows_to_undo = rows_to_undo / 1000000;
+		}
+
+		fprintf(stderr,
+			"InnoDB: %lu transaction(s) which must be"
+			" rolled back or cleaned up\n"
+			"InnoDB: in total %lu%s row operations to undo\n",
+			(ulong) UT_LIST_GET_LEN(trx_sys->trx_list),
+			(ulong) rows_to_undo, unit);
+
+		fprintf(stderr, "InnoDB: Trx id counter is " TRX_ID_FMT "\n",
+			TRX_ID_PREP_PRINTF(trx_sys->max_trx_id));
+	}
+
+	UT_LIST_INIT(trx_sys->view_list);
+
+	trx_purge_sys_create();
+
+	mutex_exit(&kernel_mutex);
+
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Creates and initializes the transaction system at the database creation. */
+UNIV_INTERN
+void
+trx_sys_create(void)
+/*================*/
+{
+	mtr_t	mtr;
+
+	mtr_start(&mtr);
+	/* The trx system file page is created in its own mini-transaction */
+	trx_sysf_create(&mtr);
+
+	mtr_commit(&mtr);
+	/* After that the memory structures are set up as at a normal start */
+	trx_sys_init_at_db_start();
+}
+
+/*****************************************************************//**
+Creates and initializes the dummy transaction system page for tablespace. */
+UNIV_INTERN
+void
+trx_sys_dummy_create(
+/*=================*/
+	ulint	space)	/*!< in: space id, must be TRX_DOUBLEWRITE_SPACE */
+{
+	mtr_t	mtr;
+
+	/* This function is only for doublewrite file for now */
+	ut_a(space == TRX_DOUBLEWRITE_SPACE);
+
+	mtr_start(&mtr);
+	/* The dummy page is created in its own mini-transaction */
+	trx_sysf_dummy_create(space, &mtr);
+
+	mtr_commit(&mtr);
+}
+
+/*********************************************************************
+Creates the extra rollback segments when a new database is created */
+UNIV_INTERN
+void
+trx_sys_create_extra_rseg(
+/*======================*/
+	ulint	num)	/* in: number of extra user rollback segments */
+{
+	mtr_t	mtr;
+	ulint	slot_no;
+	ulint	n;
+
+	mtr_start(&mtr);
+	for (n = 1; n < num + 1; n++) {
+		if (!trx_rseg_create(TRX_SYS_SPACE, ULINT_MAX,
+				     &slot_no, &mtr)) {
+			fprintf(stderr,
+"InnoDB: Warning: Failed to create extra rollback segments.\n");
+			break;
+		}
+		ut_a(slot_no == n);	/* n:th segment must use slot n */
+	}
+	mtr_commit(&mtr);
+}
+
+/*****************************************************************//**
+Update the file format tag.
+@return	always TRUE */
+static
+ibool
+trx_sys_file_format_max_write(
+/*==========================*/
+	ulint		format_id,	/*!< in: file format id */
+	const char**	name)		/*!< out: max file format name, can
+					be NULL */
+{
+	mtr_t		mtr;
+	byte*		ptr;
+	buf_block_t*	block;
+	ulint		tag_value_low;
+
+	mtr_start(&mtr);
+
+	block = buf_page_get(
+		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+	/* Update the in-memory copy of the max format first */
+	file_format_max.id = format_id;
+	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+	tag_value_low = format_id + TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
+
+	if (name) {
+		*name = file_format_max.name;
+	}
+	/* The stored tag is (magic high, format id + magic low) */
+	mlog_write_dulint(
+		ptr,
+		ut_dulint_create(TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH,
+				 tag_value_low),
+		&mtr);
+
+	mtr_commit(&mtr);
+
+	return(TRUE);
+}
+
+/*****************************************************************//**
+Read the file format tag.
+@return	the file format or ULINT_UNDEFINED if not set. */
+static
+ulint
+trx_sys_file_format_max_read(void)
+/*==============================*/
+{
+	mtr_t			mtr;
+	const byte*		ptr;
+	const buf_block_t*	block;
+	ulint			format_id;
+	dulint			file_format_id;
+
+	/* Since this is called during the startup phase it's safe to
+	read the value without a covering mutex. */
+	mtr_start(&mtr);
+
+	block = buf_page_get(
+		TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr);
+
+	ptr = buf_block_get_frame(block) + TRX_SYS_FILE_FORMAT_TAG;
+	file_format_id = mach_read_from_8(ptr);
+
+	mtr_commit(&mtr);
+	/* The low part holds the format id offset by the low magic number */
+	format_id = file_format_id.low - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
+
+	if (file_format_id.high != TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH
+	    || format_id >= FILE_FORMAT_NAME_N) {
+
+		/* Either it has never been tagged, or garbage in it. */
+		return(ULINT_UNDEFINED);
+	}
+
+	return(format_id);
+}
+
+/*****************************************************************//**
+Get the name representation of the file format from its id.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id)	/*!< in: id of the file format */
+{
+	ut_a(id < FILE_FORMAT_NAME_N);
+	/* Past the assertion, id is used as an index into the name map */
+	return(file_format_name_map[id]);
+}
+
+/*****************************************************************//**
+Check for the max file format tag stored on disk. Note: If max_format_id
+is > DICT_TF_FORMAT_MAX (e.g. DICT_TF_FORMAT_MAX + 1) we only print a warning.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+ulint
+trx_sys_file_format_max_check(
+/*==========================*/
+	ulint	max_format_id)	/*!< in: max format id to check */
+{
+	ulint	format_id;
+
+	/* Check the file format in the tablespace. Do not try to
+	recover if the file format is not supported by the engine
+	unless forced by the user. */
+	format_id = trx_sys_file_format_max_read();
+	if (format_id == ULINT_UNDEFINED) {
+		/* Format ID was not set. Set it to minimum possible
+		value. */
+		format_id = DICT_TF_FORMAT_51;
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: highest supported file format is %s.\n",
+		trx_sys_file_format_id_to_name(DICT_TF_FORMAT_MAX));
+
+	if (format_id > DICT_TF_FORMAT_MAX) {
+
+		ut_a(format_id < FILE_FORMAT_NAME_N);
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: %s: the system tablespace is in a file "
+			"format that this version doesn't support - %s\n",
+			((max_format_id <= DICT_TF_FORMAT_MAX)
+				? "Error" : "Warning"),
+			trx_sys_file_format_id_to_name(format_id));
+
+		if (max_format_id <= DICT_TF_FORMAT_MAX) {
+			return(DB_ERROR);
+		}
+	}
+	/* Keep the larger of the on-disk id and the id that was passed in */
+	format_id = (format_id > max_format_id) ? format_id : max_format_id;
+
+	/* We don't need a mutex here, as this function should only
+	be called once at start up. */
+	file_format_max.id = format_id;
+	file_format_max.name = trx_sys_file_format_id_to_name(format_id);
+
+	return(DB_SUCCESS);
+}
+
+/*****************************************************************//**
+Stores format_id as the max file format tag, unless the tag already has
+that value. The check and the write are done under file_format_max.mutex.
+@return	TRUE if value updated */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_set(
+/*========================*/
+	ulint		format_id,	/*!< in: file format id */
+	const char**	name)		/*!< out: max file format name or
+					NULL if not needed. */
+{
+	ibool		updated;
+
+	ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+	mutex_enter(&file_format_max.mutex);
+	if (format_id == file_format_max.id) {
+		/* Already the same value: nothing is written */
+		updated = FALSE;
+	} else {
+		updated = trx_sys_file_format_max_write(format_id, name);
+	}
+
+	mutex_exit(&file_format_max.mutex);
+
+	return(updated);
+}
+
+/********************************************************************//**
+Tags the system table space with minimum format id if it has not been
+tagged yet.
+WARNING: This function is only called during the startup and AFTER the
+redo log application during recovery has finished. */
+UNIV_INTERN
+void
+trx_sys_file_format_tag_init(void)
+/*==============================*/
+{
+	/* A tag that is already set is left alone; ULINT_UNDEFINED from
+	the read means that no valid tag was found. */
+	if (trx_sys_file_format_max_read() != ULINT_UNDEFINED) {
+
+		return;
+	}
+
+	trx_sys_file_format_max_set(DICT_TF_FORMAT_51, NULL);
+}
+
+/********************************************************************//**
+Raises the file format tag in the system tablespace to format_id, but
+only when format_id is greater than the currently known max id.
+@return	TRUE if format_id was bigger than the known max id */
+UNIV_INTERN
+ibool
+trx_sys_file_format_max_upgrade(
+/*============================*/
+	const char**	name,		/*!< out: max file format name */
+	ulint		format_id)	/*!< in: file format identifier */
+{
+	ibool		upgraded;
+
+	ut_a(name);
+	ut_a(file_format_max.name != NULL);
+	ut_a(format_id <= DICT_TF_FORMAT_MAX);
+
+	mutex_enter(&file_format_max.mutex);
+
+	/* An equal or smaller id leaves the tag as it is */
+	upgraded = (format_id > file_format_max.id)
+		? trx_sys_file_format_max_write(format_id, name)
+		: FALSE;
+
+	mutex_exit(&file_format_max.mutex);
+
+	return(upgraded);
+}
+
+/*****************************************************************//**
+Get the name of the current max file format (file_format_max.name).
+@return	pointer to the max format name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_max_get(void)
+/*=============================*/
+{
+	return(file_format_max.name);
+}
+
+/*****************************************************************//**
+Initializes the tablespace tag system. */
+UNIV_INTERN
+void
+trx_sys_file_format_init(void)
+/*==========================*/
+{
+	mutex_create(&file_format_max.mutex, SYNC_FILE_FORMAT_TAG);
+
+	/* We don't need a mutex here, as this function should only
+	be called once at start up. */
+	file_format_max.id = DICT_TF_FORMAT_51;
+	/* The name is looked up from the id that was just set */
+	file_format_max.name = trx_sys_file_format_id_to_name(
+		file_format_max.id);
+}
+
+/*****************************************************************//**
+Closes the tablespace tag system (currently a no-op). */
+UNIV_INTERN
+void
+trx_sys_file_format_close(void)
+/*===========================*/
+{
+	/* Does nothing at the moment */
+}
+#else /* !UNIV_HOTBACKUP */
+/*****************************************************************//**
+Prints to stderr the MySQL binlog position and file name found in a trx
+system header page that is already in memory. Nothing is printed
+unless the magic number shows the info valid. */
+UNIV_INTERN
+void
+trx_sys_print_mysql_binlog_offset_from_page(
+/*========================================*/
+	const byte*	page)	/*!< in: buffer containing the trx
+				system header page, i.e., page number
+				TRX_SYS_PAGE_NO in the tablespace */
+{
+	const byte*	info;
+
+	info = page + TRX_SYS + TRX_SYS_MYSQL_LOG_INFO;
+
+	if (mach_read_from_4(info + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD)
+	    != TRX_SYS_MYSQL_LOG_MAGIC_N) {
+
+		return;
+	}
+
+	fprintf(stderr,
+		"ibbackup: Last MySQL binlog file position %lu %lu,"
+		" file name %s\n",
+		(ulong) mach_read_from_4(
+			info + TRX_SYS_MYSQL_LOG_OFFSET_HIGH),
+		(ulong) mach_read_from_4(
+			info + TRX_SYS_MYSQL_LOG_OFFSET_LOW),
+		info + TRX_SYS_MYSQL_LOG_NAME);
+
+}
+
+
+/* THESE ARE COPIED FROM NON-HOTBACKUP PART OF THE INNODB SOURCE TREE
+   (This code duplication should be fixed at some point!)
+*/
+
+#define	TRX_SYS_SPACE	0	/* the SYSTEM tablespace */
+/* The offset of the file format tag on the trx system header page */
+#define TRX_SYS_FILE_FORMAT_TAG		(UNIV_PAGE_SIZE - 16)
+/* We use these random constants to reduce the probability of reading
+garbage (from previous versions) that maps to an actual format id. We
+use these as bit masks at the time of reading and writing from/to disk. */
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW	3645922177UL
+#define TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH	2745987765UL
+
+/* END OF COPIED DEFINITIONS */
+
+
+/*****************************************************************//**
+Opens the first system tablespace file read-only, reads the trx system
+header page (TRX_SYS_PAGE_NO) and extracts the file format tag from it.
+A successful call may still leave *format_id as ULINT_UNDEFINED: that
+means the page was read but carries no valid format tag.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_file_format_id(
+/*========================*/
+	const char *pathname,  /*!< in: pathname of the first system
+				        table space file */
+	ulint *format_id)      /*!< out: file format of the system table
+				         space */
+{
+	os_file_t	fh;
+	ibool		ok;
+	byte		raw[UNIV_PAGE_SIZE * 2];
+	page_t*		frame = ut_align(raw, UNIV_PAGE_SIZE);
+	dulint		tag;
+	ulint		id;
+
+	/* Until a valid tag has been seen the result is "not set" */
+	*format_id = ULINT_UNDEFINED;
+
+	fh = os_file_create_simple_no_error_handling(
+		pathname, OS_FILE_OPEN, OS_FILE_READ_ONLY, &ok);
+
+	if (!ok) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+"  ibbackup: Error: trying to read system tablespace file format,\n"
+"  ibbackup: but could not open the tablespace file %s!\n",
+			pathname);
+
+		return(FALSE);
+	}
+
+	/* Fetch the page that holds the file format tag */
+	ok = os_file_read_no_error_handling(
+		fh, frame, TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE, 0,
+		UNIV_PAGE_SIZE);
+
+	if (!ok) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+"  ibbackup: Error: trying to read system table space file format,\n"
+"  ibbackup: but failed to read the tablespace file %s!\n",
+			pathname);
+
+		os_file_close(fh);
+
+		return(FALSE);
+	}
+
+	os_file_close(fh);
+
+	/* The tag is read as 8 bytes: its high part must equal the high
+	magic number, its low part is the format id offset by the low
+	magic number */
+	tag = mach_read_from_8(frame + TRX_SYS_FILE_FORMAT_TAG);
+	id = tag.low - TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_LOW;
+
+	if (tag.high == TRX_SYS_FILE_FORMAT_TAG_MAGIC_N_HIGH
+	    && id < FILE_FORMAT_NAME_N) {
+
+		*format_id = id;
+	}
+
+	/* Otherwise the page has never been tagged, or holds garbage:
+	*format_id keeps ULINT_UNDEFINED, which is not an error */
+	return(TRUE);
+}
+
+
+/*****************************************************************//**
+Reads the file format id from the first page of a per-table data file.
+@return TRUE if call succeeds */
+UNIV_INTERN
+ibool
+trx_sys_read_pertable_file_format_id(
+/*=================================*/
+	const char *pathname,  /*!< in: pathname of a per-table
+				        datafile */
+	ulint *format_id)      /*!< out: file format of the per-table
+				         data file */
+{
+	os_file_t	file;
+	ibool		success;
+	byte		buf[UNIV_PAGE_SIZE * 2];
+	page_t*		page = ut_align(buf, UNIV_PAGE_SIZE);
+	const byte*	ptr;
+	ib_uint32_t	flags;
+
+	*format_id = ULINT_UNDEFINED;
+
+	file = os_file_create_simple_no_error_handling(
+		pathname,
+		OS_FILE_OPEN,
+		OS_FILE_READ_ONLY,
+		&success
+	);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+"  ibbackup: Error: trying to read per-table tablespace format,\n"
+"  ibbackup: but could not open the tablespace file %s!\n",
+			pathname
+		);
+		return(FALSE);
+	}
+
+	/* Read the first page of the per-table datafile */
+
+	success = os_file_read_no_error_handling(
+		file, page, 0, 0, UNIV_PAGE_SIZE
+	);
+	if (!success) {
+		/* The following call prints an error message */
+		os_file_get_last_error(TRUE);
+
+		ut_print_timestamp(stderr);
+
+		fprintf(stderr,
+"  ibbackup: Error: trying to read per-table data file format,\n"
+"  ibbackup: but failed to read the tablespace file %s!\n",
+			pathname
+		);
+		os_file_close(file);
+		return(FALSE);
+	}
+	os_file_close(file);
+
+	/* The tablespace flags are the 4 bytes at offset 54 of the page */
+	ptr = page + 54;
+	flags = mach_read_from_4(ptr);
+	if (flags == 0) {
+		/* file format is Antelope */
+		*format_id = 0;
+		return (TRUE);
+	} else if (flags & 1) {
+		/* tablespace flags are ok: the id is (flags / 32) % 128 */
+		*format_id = (flags / 32) % 128;
+		return (TRUE);
+	} else {
+		/* bad tablespace flags: *format_id stays ULINT_UNDEFINED */
+		return(FALSE);
+	}
+}
+
+
+/*****************************************************************//**
+Maps a file format id to its name. An id outside the name map is not
+an error here: the fixed string "Unknown" is returned for it.
+@return	pointer to the name */
+UNIV_INTERN
+const char*
+trx_sys_file_format_id_to_name(
+/*===========================*/
+	const ulint	id)	/*!< in: id of the file format */
+{
+	/* Only ids below FILE_FORMAT_NAME_N are used as an index into
+	file_format_name_map */
+	return((id < FILE_FORMAT_NAME_N)
+	       ? file_format_name_map[id]
+	       : "Unknown");
+}
+
+#endif /* !UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*********************************************************************
+Shutdown/Close the transaction system and free its memory structures. */
+UNIV_INTERN
+void
+trx_sys_close(void)
+/*===============*/
+{
+	trx_rseg_t*	rseg;
+	read_view_t*	view;
+
+	ut_ad(trx_sys != NULL);
+
+	/* Check that all read views are closed except read view owned
+	by a purge. */
+	/* ulint need not be unsigned long: the count is cast for %lu */
+	if (UT_LIST_GET_LEN(trx_sys->view_list) > 1) {
+		fprintf(stderr,
+			"InnoDB: Error: all read views were not closed"
+			" before shutdown:\n"
+			"InnoDB: %lu read views open \n",
+			(ulong) (UT_LIST_GET_LEN(trx_sys->view_list) - 1));
+	}
+
+	sess_close(trx_dummy_sess);
+	trx_dummy_sess = NULL;
+
+	trx_purge_sys_close();
+
+	mutex_enter(&kernel_mutex);
+
+	/* Free the double write data structures. */
+	ut_a(trx_doublewrite != NULL);
+	ut_free(trx_doublewrite->write_buf_unaligned);
+	trx_doublewrite->write_buf_unaligned = NULL;
+
+	mem_free(trx_doublewrite->buf_block_arr);
+	trx_doublewrite->buf_block_arr = NULL;
+
+	mutex_free(&trx_doublewrite->mutex);
+	mem_free(trx_doublewrite);
+	trx_doublewrite = NULL;
+
+	/* There can't be any active transactions. */
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	while (rseg != NULL) {
+		trx_rseg_t*	prev_rseg = rseg;
+
+		rseg = UT_LIST_GET_NEXT(rseg_list, prev_rseg);
+		UT_LIST_REMOVE(rseg_list, trx_sys->rseg_list, prev_rseg);
+
+		trx_rseg_mem_free(prev_rseg);
+	}
+
+	view = UT_LIST_GET_FIRST(trx_sys->view_list);
+
+	while (view != NULL) {
+		read_view_t*	prev_view = view;
+
+		view = UT_LIST_GET_NEXT(view_list, prev_view);
+
+		/* Views are allocated from the trx_sys->global_read_view_heap.
+		So, we simply remove the element here. */
+		UT_LIST_REMOVE(view_list, trx_sys->view_list, prev_view);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx_sys->trx_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->rseg_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->view_list) == 0);
+	ut_a(UT_LIST_GET_LEN(trx_sys->mysql_trx_list) == 0);
+
+	mem_free(trx_sys);
+
+	trx_sys = NULL;
+	mutex_exit(&kernel_mutex);
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/trx/trx0trx.c b/storage/xtradb/trx/trx0trx.c
new file mode 100644
index 00000000000..9584f0c4c46
--- /dev/null
+++ b/storage/xtradb/trx/trx0trx.c
@@ -0,0 +1,2156 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2010, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0trx.c
+The transaction
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0trx.h"
+
+#ifdef UNIV_NONINL
+#include "trx0trx.ic"
+#endif
+
+#include "trx0undo.h"
+#include "trx0rseg.h"
+#include "log0log.h"
+#include "que0que.h"
+#include "lock0lock.h"
+#include "trx0roll.h"
+#include "usr0sess.h"
+#include "read0read.h"
+#include "srv0srv.h"
+#include "thr0loc.h"
+#include "btr0sea.h"
+#include "os0proc.h"
+#include "trx0xa.h"
+#include "ha_prototypes.h"
+
+/** Dummy session used currently in MySQL interface. The allocation
+functions below and trx_lists_init_at_db_start() pass it to trx_create()
+as the session of every transaction they create. */
+UNIV_INTERN sess_t*		trx_dummy_sess = NULL;
+
+/** Number of transactions currently allocated for MySQL: protected by
+the kernel mutex. Incremented in trx_allocate_for_mysql() and decremented
+in trx_free_for_mysql(). */
+UNIV_INTERN ulint	trx_n_mysql_transactions = 0;
+
+/*************************************************************//**
+Stores a detailed error message in the transaction. The text is copied
+with ut_strlcpy() into the fixed-size trx->detailed_error buffer, using
+the size of that buffer as the limit. */
+UNIV_INTERN
+void
+trx_set_detailed_error(
+/*===================*/
+	trx_t*		trx,	/*!< in: transaction struct */
+	const char*	msg)	/*!< in: detailed error message */
+{
+	const ulint	buf_size = sizeof(trx->detailed_error);
+
+	ut_strlcpy(trx->detailed_error, msg, buf_size);
+}
+
+/*************************************************************//**
+Stores a detailed error message in the transaction, taking the text from
+a file. Note that the file is rewound before reading from it. The text
+is read into the fixed-size trx->detailed_error buffer, using the size of
+that buffer as the limit. */
+UNIV_INTERN
+void
+trx_set_detailed_error_from_file(
+/*=============================*/
+	trx_t*	trx,	/*!< in: transaction struct */
+	FILE*	file)	/*!< in: file to read message from */
+{
+	const ulint	buf_size = sizeof(trx->detailed_error);
+
+	os_file_read_string(file, trx->detailed_error, buf_size);
+}
+
+/****************************************************************//**
+Creates and initializes a transaction object. The caller must hold the
+kernel mutex (asserted in debug builds) and pass a non-NULL session.
+The new object is in the TRX_NOT_STARTED state with no rollback segment
+or undo logs assigned, uses the repeatable read isolation level and has
+XA support enabled. Besides the struct itself, the function creates the
+undo mutex, the lock heap, the heap for the global read view and the
+autoinc lock vector (in a heap of its own). The object is not added to
+any trx_sys list here; that is left to the caller.
+@return	own: the transaction */
+UNIV_INTERN
+trx_t*
+trx_create(
+/*=======*/
+	sess_t*	sess)	/*!< in: session */
+{
+	trx_t*	trx;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(sess);
+
+	trx = mem_alloc(sizeof(trx_t));
+
+	trx->magic_n = TRX_MAGIC_N;
+
+	trx->op_info = "";
+
+	trx->is_purge = 0;
+	trx->is_recovered = 0;
+	trx->conc_state = TRX_NOT_STARTED;
+	trx->start_time = time(NULL);
+
+	trx->isolation_level = TRX_ISO_REPEATABLE_READ;
+
+	trx->id = ut_dulint_zero;
+	trx->no = ut_dulint_max;
+
+	trx->support_xa = TRUE;
+
+	trx->flush_log_at_trx_commit_session = 3; /* means to use innodb_flush_log_at_trx_commit value */
+
+	trx->check_foreigns = TRUE;
+	trx->check_unique_secondary = TRUE;
+
+	trx->flush_log_later = FALSE;
+	trx->must_flush_log_later = FALSE;
+
+	trx->dict_operation = TRX_DICT_OP_NONE;
+	trx->table_id = ut_dulint_zero;
+
+	trx->mysql_thd = NULL;
+	trx->active_trans = 0;
+	trx->duplicates = 0;
+
+	trx->n_mysql_tables_in_use = 0;
+	trx->mysql_n_tables_locked = 0;
+
+	trx->mysql_log_file_name = NULL;
+	trx->mysql_log_offset = 0;
+	trx->mysql_master_log_file_name = "";
+	trx->mysql_master_log_pos = 0;
+	trx->mysql_relay_log_file_name = "";
+	trx->mysql_relay_log_pos = 0;
+
+	mutex_create(&trx->undo_mutex, SYNC_TRX_UNDO);
+
+	trx->rseg = NULL;
+
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+	trx->insert_undo = NULL;
+	trx->update_undo = NULL;
+	trx->undo_no_arr = NULL;
+
+	trx->error_state = DB_SUCCESS;
+	trx->error_key_num = 0;
+	trx->detailed_error[0] = '\0';
+
+	trx->sess = sess;
+	trx->que_state = TRX_QUE_RUNNING;
+	trx->n_active_thrs = 0;
+
+	trx->handling_signals = FALSE;
+
+	UT_LIST_INIT(trx->signals);
+	UT_LIST_INIT(trx->reply_signals);
+
+	trx->graph = NULL;
+
+	trx->wait_lock = NULL;
+	trx->was_chosen_as_deadlock_victim = FALSE;
+	UT_LIST_INIT(trx->wait_thrs);
+
+	trx->lock_heap = mem_heap_create_in_buffer(256);
+	UT_LIST_INIT(trx->trx_locks);
+
+	UT_LIST_INIT(trx->trx_savepoints);
+
+	trx->dict_operation_lock_mode = 0;
+	trx->has_search_latch = FALSE;
+	trx->search_latch_timeout = BTR_SEA_TIMEOUT;
+
+	trx->declared_to_be_inside_innodb = FALSE;
+	trx->n_tickets_to_enter_innodb = 0;
+
+	trx->global_read_view_heap = mem_heap_create(256);
+	trx->global_read_view = NULL;
+	trx->read_view = NULL;
+
+	trx->io_reads = 0;
+	trx->io_read = 0;
+	trx->io_reads_wait_timer = 0;
+	trx->lock_que_wait_timer = 0;
+	trx->innodb_que_wait_timer = 0;
+	trx->distinct_page_access = 0;
+	trx->distinct_page_access_hash = NULL;
+	trx->take_stats = FALSE;
+
+	/* Set X/Open XA transaction identification to NULL: the xid is
+	zero-filled and its formatID is set to -1. */
+	memset(&trx->xid, 0, sizeof(trx->xid));
+	trx->xid.formatID = -1;
+
+	trx->n_autoinc_rows = 0;
+
+	/* Remember to free the vector explicitly. It lives in a heap
+	created just for it, sized for the vector header plus 4 pointers. */
+	trx->autoinc_locks = ib_vector_create(
+		mem_heap_create(sizeof(ib_vector_t) + sizeof(void*) * 4), 4);
+
+	return(trx);
+}
+
+/********************************************************************//**
+Allocates a transaction object on behalf of MySQL. Under the kernel mutex
+the new object is put at the head of trx_sys->mysql_trx_list and counted
+in trx_n_mysql_transactions; afterwards the id of the calling OS thread
+and the process number are recorded in it.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_mysql(void)
+/*========================*/
+{
+	trx_t*	new_trx;
+
+	mutex_enter(&kernel_mutex);
+
+	new_trx = trx_create(trx_dummy_sess);
+
+	UT_LIST_ADD_FIRST(mysql_trx_list, trx_sys->mysql_trx_list, new_trx);
+
+	trx_n_mysql_transactions += 1;
+
+	mutex_exit(&kernel_mutex);
+
+	new_trx->mysql_process_no = os_proc_get_number();
+	new_trx->mysql_thread_id = os_thread_get_curr_id();
+
+	/* The page access hash is set up only when the slow log is in use
+	and statistics are taken for this transaction. */
+	if (!innobase_get_slow_log() || !new_trx->take_stats) {
+
+		return(new_trx);
+	}
+
+	new_trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
+	memset(new_trx->distinct_page_access_hash, 0, DPAH_SIZE);
+
+	return(new_trx);
+}
+
+/********************************************************************//**
+Allocates a transaction object for a background operation of the master
+thread. The object is created under the kernel mutex and, unlike in
+trx_allocate_for_mysql(), is not put on trx_sys->mysql_trx_list.
+@return	own: transaction object */
+UNIV_INTERN
+trx_t*
+trx_allocate_for_background(void)
+/*=============================*/
+{
+	trx_t*	bg_trx;
+
+	mutex_enter(&kernel_mutex);
+	bg_trx = trx_create(trx_dummy_sess);
+	mutex_exit(&kernel_mutex);
+
+	return(bg_trx);
+}
+
+/********************************************************************//**
+Releases the s-lock on btr_search_latch if trx is marked as holding it,
+and clears the mark. Does nothing otherwise. */
+UNIV_INTERN
+void
+trx_search_latch_release_if_reserved(
+/*=================================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	if (!trx->has_search_latch) {
+
+		return;
+	}
+
+	rw_lock_s_unlock(&btr_search_latch);
+	trx->has_search_latch = FALSE;
+}
+
+/********************************************************************//**
+Frees a transaction object. The caller must hold the kernel mutex
+(asserted in debug builds). The transaction must be in the
+TRX_NOT_STARTED state and must not own undo logs, signals, waiting query
+threads, a lock wait, the search latch, a dictionary operation lock,
+locks, a read view or autoinc locks; these conditions are enforced with
+ut_a(). Two inconsistencies are only reported to stderr and are not
+fatal: the trx still being declared to be inside InnoDB, and nonzero
+MySQL table use/lock counters. The magic number is overwritten before
+the memory is released. */
+UNIV_INTERN
+void
+trx_free(
+/*=====*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (trx->declared_to_be_inside_innodb) {
+		ut_print_timestamp(stderr);
+		fputs("  InnoDB: Error: Freeing a trx which is declared"
+		      " to be processing\n"
+		      "InnoDB: inside InnoDB.\n", stderr);
+		trx_print(stderr, trx, 600);
+		putc('\n', stderr);
+
+		/* This is an error but not a fatal error. We must keep
+		the counters like srv_conc_n_threads accurate. */
+		srv_conc_force_exit_innodb(trx);
+	}
+
+	if (trx->n_mysql_tables_in_use != 0
+	    || trx->mysql_n_tables_locked != 0) {
+
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Error: MySQL is freeing a thd\n"
+			"InnoDB: though trx->n_mysql_tables_in_use is %lu\n"
+			"InnoDB: and trx->mysql_n_tables_locked is %lu.\n",
+			(ulong)trx->n_mysql_tables_in_use,
+			(ulong)trx->mysql_n_tables_locked);
+
+		trx_print(stderr, trx, 600);
+
+		ut_print_buf(stderr, trx, sizeof(trx_t));
+		putc('\n', stderr);
+	}
+
+	ut_a(trx->magic_n == TRX_MAGIC_N);
+
+	trx->magic_n = 11112222;
+
+	ut_a(trx->conc_state == TRX_NOT_STARTED);
+
+	mutex_free(&(trx->undo_mutex));
+
+	ut_a(trx->insert_undo == NULL);
+	ut_a(trx->update_undo == NULL);
+
+	if (trx->undo_no_arr) {
+		trx_undo_arr_free(trx->undo_no_arr);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx->signals) == 0);
+	ut_a(UT_LIST_GET_LEN(trx->reply_signals) == 0);
+
+	ut_a(trx->wait_lock == NULL);
+	ut_a(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+	ut_a(!trx->has_search_latch);
+
+	ut_a(trx->dict_operation_lock_mode == 0);
+
+	if (trx->lock_heap) {
+		mem_heap_free(trx->lock_heap);
+	}
+
+	ut_a(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+	if (trx->global_read_view_heap) {
+		mem_heap_free(trx->global_read_view_heap);
+	}
+
+	trx->global_read_view = NULL;
+
+	ut_a(trx->read_view == NULL);
+
+	ut_a(ib_vector_is_empty(trx->autoinc_locks));
+	/* We allocated a dedicated heap for the vector. */
+	ib_vector_free(trx->autoinc_locks);
+
+	mem_free(trx);
+}
+
+/********************************************************************//**
+Frees a transaction object that was allocated for MySQL. The distinct
+page access hash, if any, is released first; then, under the kernel
+mutex, the object is taken off trx_sys->mysql_trx_list, freed with
+trx_free(), and trx_n_mysql_transactions is decremented. */
+UNIV_INTERN
+void
+trx_free_for_mysql(
+/*===============*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	if (trx->distinct_page_access_hash != NULL) {
+		mem_free(trx->distinct_page_access_hash);
+		trx->distinct_page_access_hash = NULL;
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	UT_LIST_REMOVE(mysql_trx_list, trx_sys->mysql_trx_list, trx);
+	trx_free(trx);
+
+	/* The counter must never underflow. */
+	ut_a(trx_n_mysql_transactions > 0);
+	trx_n_mysql_transactions -= 1;
+
+	mutex_exit(&kernel_mutex);
+}
+
+/********************************************************************//**
+Frees a transaction object of a background operation of the master
+thread. The distinct page access hash, if any, is released first; the
+object itself is then freed with trx_free() under the kernel mutex. */
+UNIV_INTERN
+void
+trx_free_for_background(
+/*====================*/
+	trx_t*	trx)	/*!< in, own: trx object */
+{
+	if (trx->distinct_page_access_hash != NULL) {
+		mem_free(trx->distinct_page_access_hash);
+		trx->distinct_page_access_hash = NULL;
+	}
+
+	mutex_enter(&kernel_mutex);
+	trx_free(trx);
+	mutex_exit(&kernel_mutex);
+}
+
+/****************************************************************//**
+Inserts the trx handle into trx_sys->trx_list so that the list stays
+sorted on the trx id, the biggest id being at the list start. This
+function is used at the database startup to insert incomplete
+transactions to the list. The caller must hold the kernel mutex. */
+static
+void
+trx_list_insert_ordered(
+/*====================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	trx_t*	pos;
+	trx_t*	before;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Find the first list element whose id is not bigger than ours;
+	the new element goes right in front of it. */
+	for (pos = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     pos != NULL;
+	     pos = UT_LIST_GET_NEXT(trx_list, pos)) {
+
+		if (ut_dulint_cmp(trx->id, pos->id) >= 0) {
+
+			/* Equal ids are not expected in the list */
+			ut_ad(ut_dulint_cmp(trx->id, pos->id) == 1);
+			break;
+		}
+	}
+
+	if (pos == NULL) {
+		/* Every existing id is bigger (or the list is empty) */
+		UT_LIST_ADD_LAST(trx_list, trx_sys->trx_list, trx);
+
+		return;
+	}
+
+	before = UT_LIST_GET_PREV(trx_list, pos);
+
+	if (before != NULL) {
+		UT_LIST_INSERT_AFTER(trx_list, trx_sys->trx_list,
+				     before, trx);
+	} else {
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+	}
+}
+
+/****************************************************************//**
+Creates trx objects for transactions and initializes the trx list of
+trx_sys at database start. Rollback segment and undo log lists must
+already exist when this function is called, because the lists of
+transactions to be rolled back or cleaned up are built based on the
+undo log lists. The caller must hold the kernel mutex.
+
+For every rollback segment the insert undo logs are scanned first and a
+recovered trx object is created for each of them. The update undo logs
+are scanned next: an update undo log is attached to the trx object found
+with trx_get_on_id(), or a new recovered trx object is created if none is
+found. The state of each trx is derived from the state of its undo log,
+and each new trx is put to trx_sys->trx_list in trx id order. */
+UNIV_INTERN
+void
+trx_lists_init_at_db_start(void)
+/*============================*/
+{
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+	trx_t*		trx;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	UT_LIST_INIT(trx_sys->trx_list);
+
+	/* Look from the rollback segments if there exist undo logs for
+	transactions */
+
+	rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+
+	while (rseg != NULL) {
+		undo = UT_LIST_GET_FIRST(rseg->insert_undo_list);
+
+		while (undo != NULL) {
+
+			trx = trx_create(trx_dummy_sess);
+
+			trx->is_recovered = TRUE;
+			trx->id = undo->trx_id;
+			trx->xid = undo->xid;
+			trx->insert_undo = undo;
+			trx->rseg = rseg;
+
+			if (undo->state != TRX_UNDO_ACTIVE) {
+
+				/* Prepared transactions are left in
+				the prepared state waiting for a
+				commit or abort decision from MySQL */
+
+				if (undo->state == TRX_UNDO_PREPARED) {
+
+					fprintf(stderr,
+						"InnoDB: Transaction "
+						TRX_ID_FMT
+						" was in the"
+						" XA prepared state.\n",
+						TRX_ID_PREP_PRINTF(trx->id));
+
+					if (srv_force_recovery == 0) {
+
+						trx->conc_state = TRX_PREPARED;
+					} else {
+						fprintf(stderr,
+							"InnoDB: Since"
+							" innodb_force_recovery"
+							" > 0, we will"
+							" rollback it"
+							" anyway.\n");
+
+						trx->conc_state = TRX_ACTIVE;
+					}
+				} else {
+					trx->conc_state
+						= TRX_COMMITTED_IN_MEMORY;
+				}
+
+				/* We give a dummy value for the trx no;
+				this should have no relevance since purge
+				is not interested in committed transaction
+				numbers, unless they are in the history
+				list, in which case it looks the number
+				from the disk based undo log structure */
+
+				trx->no = trx->id;
+			} else {
+				trx->conc_state = TRX_ACTIVE;
+
+				/* A running transaction always has the number
+				field inited to ut_dulint_max */
+
+				trx->no = ut_dulint_max;
+			}
+
+			if (undo->dict_operation) {
+				trx_set_dict_operation(
+					trx, TRX_DICT_OP_TABLE);
+				trx->table_id = undo->table_id;
+			}
+
+			if (!undo->empty) {
+				trx->undo_no = ut_dulint_add(undo->top_undo_no,
+							     1);
+			}
+
+			trx_list_insert_ordered(trx);
+
+			undo = UT_LIST_GET_NEXT(undo_list, undo);
+		}
+
+		undo = UT_LIST_GET_FIRST(rseg->update_undo_list);
+
+		while (undo != NULL) {
+			trx = trx_get_on_id(undo->trx_id);
+
+			if (NULL == trx) {
+				trx = trx_create(trx_dummy_sess);
+
+				trx->is_recovered = TRUE;
+				trx->id = undo->trx_id;
+				trx->xid = undo->xid;
+
+				if (undo->state != TRX_UNDO_ACTIVE) {
+
+					/* Prepared transactions are left in
+					the prepared state waiting for a
+					commit or abort decision from MySQL */
+
+					if (undo->state == TRX_UNDO_PREPARED) {
+						fprintf(stderr,
+							"InnoDB: Transaction "
+							TRX_ID_FMT " was in the"
+							" XA prepared state.\n",
+							TRX_ID_PREP_PRINTF(
+								trx->id));
+
+						if (srv_force_recovery == 0) {
+
+							trx->conc_state
+								= TRX_PREPARED;
+						} else {
+							fprintf(stderr,
+								"InnoDB: Since"
+								" innodb_force_recovery"
+								" > 0, we will"
+								" rollback it"
+								" anyway.\n");
+
+							trx->conc_state
+								= TRX_ACTIVE;
+						}
+					} else {
+						trx->conc_state
+							= TRX_COMMITTED_IN_MEMORY;
+					}
+
+					/* We give a dummy value for the trx
+					number */
+
+					trx->no = trx->id;
+				} else {
+					trx->conc_state = TRX_ACTIVE;
+
+					/* A running transaction always has
+					the number field inited to
+					ut_dulint_max */
+
+					trx->no = ut_dulint_max;
+				}
+
+				trx->rseg = rseg;
+				trx_list_insert_ordered(trx);
+
+				if (undo->dict_operation) {
+					trx_set_dict_operation(
+						trx, TRX_DICT_OP_TABLE);
+					trx->table_id = undo->table_id;
+				}
+			}
+
+			trx->update_undo = undo;
+
+			if ((!undo->empty)
+			    && (ut_dulint_cmp(undo->top_undo_no,
+					      trx->undo_no) >= 0)) {
+
+				trx->undo_no = ut_dulint_add(undo->top_undo_no,
+							     1);
+			}
+
+			undo = UT_LIST_GET_NEXT(undo_list, undo);
+		}
+
+		rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
+	}
+}
+
+/******************************************************************//**
+Picks a rollback segment for a transaction in a round-robin fashion,
+continuing from trx_sys->latest_rseg and wrapping around at the list end.
+The SYSTEM rollback segment is passed over whenever the list holds more
+than one segment. The choice is remembered in trx_sys->latest_rseg. The
+caller must hold the kernel mutex.
+@return	assigned rollback segment id */
+UNIV_INLINE
+ulint
+trx_assign_rseg(void)
+/*=================*/
+{
+	trx_rseg_t*	candidate	= trx_sys->latest_rseg;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	do {
+		/* Step to the next segment, wrapping to the list start */
+		candidate = UT_LIST_GET_NEXT(rseg_list, candidate);
+
+		if (candidate == NULL) {
+			candidate = UT_LIST_GET_FIRST(trx_sys->rseg_list);
+		}
+
+		/* Try again if we landed on the SYSTEM rollback segment
+		although others exist */
+	} while ((candidate->id == TRX_SYS_SYSTEM_RSEG_ID)
+		 && (UT_LIST_GET_LEN(trx_sys->rseg_list) > 1));
+
+	trx_sys->latest_rseg = candidate;
+
+	return(candidate->id);
+}
+
+/****************************************************************//**
+Starts a new transaction. The caller must hold the kernel mutex and the
+transaction must not have a rollback segment assigned yet. A purge
+transaction is only marked active with a zero id; any other transaction
+gets a rollback segment and a new trx id and is put at the start of
+trx_sys->trx_list.
+@return	TRUE */
+UNIV_INTERN
+ibool
+trx_start_low(
+/*==========*/
+	trx_t*	trx,	/*!< in: transaction */
+	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx->rseg == NULL);
+
+	if (!trx->is_purge) {
+		ut_ad(trx->conc_state != TRX_ACTIVE);
+
+		if (rseg_id == ULINT_UNDEFINED) {
+			rseg_id = trx_assign_rseg();
+		}
+
+		trx->rseg = trx_sys_get_nth_rseg(trx_sys, rseg_id);
+		trx->id = trx_sys_get_new_trx_id();
+
+		/* The initial value for trx->no: ut_dulint_max is used in
+		read_view_open_now: */
+		trx->no = ut_dulint_max;
+
+		trx->conc_state = TRX_ACTIVE;
+		trx->start_time = time(NULL);
+
+		UT_LIST_ADD_FIRST(trx_list, trx_sys->trx_list, trx);
+	} else {
+		/* The purge trx gets neither an id of its own nor a
+		place in the trx list */
+		trx->id = ut_dulint_zero;
+		trx->conc_state = TRX_ACTIVE;
+		trx->start_time = time(NULL);
+	}
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Starts a new transaction. Refreshes the per-session settings cached in
+the trx object from trx->mysql_thd and then calls trx_start_low() under
+the kernel mutex.
+@return	TRUE */
+UNIV_INTERN
+ibool
+trx_start(
+/*======*/
+	trx_t*	trx,	/*!< in: transaction */
+	ulint	rseg_id)/*!< in: rollback segment id; if ULINT_UNDEFINED
+			is passed, the system chooses the rollback segment
+			automatically in a round-robin fashion */
+{
+	ibool	started;
+
+	/* trx->support_xa tells whether the XA steps that eat CPU time
+	can be skipped. It is read from thd only here, at the start, so a
+	change in the value takes effect in the next transaction. This
+	avoids a scenario where some undo generated by a transaction has
+	XA stuff, and other undo, generated by the same transaction,
+	doesn't. */
+	trx->support_xa = thd_supports_xa(trx->mysql_thd);
+
+	trx->flush_log_at_trx_commit_session
+		= thd_flush_log_at_trx_commit_session(trx->mysql_thd);
+
+	mutex_enter(&kernel_mutex);
+	started = trx_start_low(trx, rseg_id);
+	mutex_exit(&kernel_mutex);
+
+	return(started);
+}
+
+/****************************************************************//**
+Commits a transaction. The caller must hold the kernel mutex. The mutex
+is temporarily released while the undo log states are changed in a
+mini-transaction (only if the trx has undo logs) and again while the log
+is written or flushed (only if a commit lsn was obtained); it is held
+again when the function returns. On return the transaction is in the
+TRX_NOT_STARTED state, has no rollback segment assigned and has been
+removed from trx_sys->trx_list. */
+UNIV_INTERN
+void
+trx_commit_off_kernel(
+/*==================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	page_t*		update_hdr_page;
+	ib_uint64_t	lsn		= 0;
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+	mtr_t		mtr;
+	trx_sysf_t*	sys_header = NULL;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	trx->must_flush_log_later = FALSE;
+
+	rseg = trx->rseg;
+
+	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+		mutex_exit(&kernel_mutex);
+
+		mtr_start(&mtr);
+
+		/* Change the undo log segment states from TRX_UNDO_ACTIVE
+		to some other state: these modifications to the file data
+		structure define the transaction as committed in the file
+		based world, at the serialization point of the log sequence
+		number lsn obtained below. */
+
+		mutex_enter(&(rseg->mutex));
+
+		if (trx->insert_undo != NULL) {
+			trx_undo_set_state_at_finish(
+				rseg, trx, trx->insert_undo, &mtr);
+		}
+
+		undo = trx->update_undo;
+
+		if (undo) {
+			mutex_enter(&kernel_mutex);
+			trx->no = trx_sys_get_new_trx_no();
+
+			mutex_exit(&kernel_mutex);
+
+			/* It is not necessary to obtain trx->undo_mutex here
+			because only a single OS thread is allowed to do the
+			transaction commit for this transaction. */
+
+			update_hdr_page = trx_undo_set_state_at_finish(
+				rseg, trx, undo, &mtr);
+
+			/* We have to do the cleanup for the update log while
+			holding the rseg mutex because update log headers
+			have to be put to the history list in the order of
+			the trx number. */
+
+			trx_undo_update_cleanup(trx, update_hdr_page, &mtr);
+		}
+
+		mutex_exit(&(rseg->mutex));
+
+		/* Update the latest MySQL binlog name and offset info
+		in trx sys header if MySQL binlogging is on or the database
+		server is a MySQL replication slave */
+
+		if (trx->mysql_log_file_name
+		    && trx->mysql_log_file_name[0] != '\0') {
+			if (!sys_header) {
+				sys_header = trx_sysf_get(&mtr);
+			}
+			trx_sys_update_mysql_binlog_offset(
+				sys_header,
+				trx->mysql_log_file_name,
+				trx->mysql_log_offset,
+				TRX_SYS_MYSQL_LOG_INFO, &mtr);
+			trx->mysql_log_file_name = NULL;
+		}
+
+		if (trx->mysql_master_log_file_name[0] != '\0') {
+			/* This database server is a MySQL replication slave:
+			both the relay log and the master log positions are
+			stored */
+			if (!sys_header) {
+				sys_header = trx_sysf_get(&mtr);
+			}
+			trx_sys_update_mysql_binlog_offset(
+				sys_header,
+				trx->mysql_relay_log_file_name,
+				trx->mysql_relay_log_pos,
+				TRX_SYS_MYSQL_RELAY_LOG_INFO, &mtr);
+			trx_sys_update_mysql_binlog_offset(
+				sys_header,
+				trx->mysql_master_log_file_name,
+				trx->mysql_master_log_pos,
+				TRX_SYS_MYSQL_MASTER_LOG_INFO, &mtr);
+			trx->mysql_master_log_file_name = "";
+		}
+
+		/* The following call commits the mini-transaction, making the
+		whole transaction committed in the file-based world, at this
+		log sequence number. The transaction becomes 'durable' when
+		we write the log to disk, but in the logical sense the commit
+		in the file-based data structures (undo logs etc.) happens
+		here.
+
+		NOTE that transaction numbers, which are assigned only to
+		transactions with an update undo log, do not necessarily come
+		in exactly the same order as commit lsn's, if the transactions
+		have different rollback segments. To get exactly the same
+		order we should hold the kernel mutex up to this point,
+		adding to the contention of the kernel mutex. However, if
+		a transaction T2 is able to see modifications made by
+		a transaction T1, T2 will always get a bigger transaction
+		number and a bigger commit lsn than T1. */
+
+		/*--------------*/
+		mtr_commit(&mtr);
+		/*--------------*/
+		lsn = mtr.end_lsn;
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	ut_ad(trx->conc_state == TRX_ACTIVE
+	      || trx->conc_state == TRX_PREPARED);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* The following assignment makes the transaction committed in memory
+	and makes its changes to data visible to other transactions.
+	NOTE that there is a small discrepancy from the strict formal
+	visibility rules here: a human user of the database can see
+	modifications made by another transaction T even before the necessary
+	log segment has been flushed to the disk. If the database happens to
+	crash before the flush, the user has seen modifications from T which
+	will never be a committed transaction. However, any transaction T2
+	which sees the modifications of the committing transaction T, and
+	which also itself makes modifications to the database, will get an lsn
+	larger than the committing transaction T. In the case where the log
+	flush fails, and T never gets committed, also T2 will never get
+	committed. */
+
+	/*--------------------------------------*/
+	trx->conc_state = TRX_COMMITTED_IN_MEMORY;
+	/*--------------------------------------*/
+
+	/* If we release kernel_mutex below and we are still doing
+	recovery i.e.: background rollback thread is still active
+	then there is a chance that the rollback thread may see
+	this trx as COMMITTED_IN_MEMORY and goes ahead to clean it
+	up calling trx_cleanup_at_db_startup(). This can happen
+	in the case we are committing a trx here that is left in
+	PREPARED state during the crash. Note that commit of the
+	rollback of a PREPARED trx happens in the recovery thread
+	while the rollback of other transactions happen in the
+	background thread. To avoid this race we unconditionally
+	unset the is_recovered flag from the trx. */
+
+	trx->is_recovered = FALSE;
+
+	lock_release_off_kernel(trx);
+
+	if (trx->global_read_view) {
+		read_view_close(trx->global_read_view);
+		mem_heap_empty(trx->global_read_view_heap);
+		trx->global_read_view = NULL;
+	}
+
+	trx->read_view = NULL;
+
+	if (lsn) {
+		ulint	flush_log_at_trx_commit;
+
+		mutex_exit(&kernel_mutex);
+
+		if (trx->insert_undo != NULL) {
+
+			trx_undo_insert_cleanup(trx);
+		}
+
+		if (trx->flush_log_at_trx_commit_session == 3) {
+			flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+		} else {
+			flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+		}
+
+		/* NOTE that we could possibly make a group commit more
+		efficient here: call os_thread_yield here to allow also other
+		trxs to come to commit! */
+
+		/*-------------------------------------*/
+
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the transaction durable if
+		the OS does not crash. We may also flush the log files to
+		disk, making the transaction durable also at an OS crash or a
+		power outage.
+
+		The idea in InnoDB's group commit is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which commits the whole
+		group. Note that this group commit will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		If we are calling trx_commit() under prepare_commit_mutex, we
+		will delay possible log write and flush to a separate function
+		trx_commit_complete_for_mysql(), which is only called when the
+		thread has released the mutex. This is to make the
+		group commit algorithm to work. Otherwise, the prepare_commit
+		mutex would serialize all commits and prevent a group of
+		transactions from gathering. */
+
+		if (trx->flush_log_later) {
+			/* Do nothing yet */
+			trx->must_flush_log_later = TRUE;
+		} else if (flush_log_at_trx_commit == 0) {
+			/* Do nothing */
+		} else if (flush_log_at_trx_commit == 1) {
+			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+				/* Write the log but do not flush it to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+						FALSE);
+			} else {
+				/* Write the log to the log files AND flush
+				them to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+			}
+		} else if (flush_log_at_trx_commit == 2) {
+
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			ut_error;
+		}
+
+		trx->commit_lsn = lsn;
+
+		/*-------------------------------------*/
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	/* Free all savepoints */
+	trx_roll_free_all_savepoints(trx);
+
+	trx->conc_state = TRX_NOT_STARTED;
+	trx->rseg = NULL;
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+
+	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+	ut_ad(UT_LIST_GET_LEN(trx->trx_locks) == 0);
+
+	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
+/****************************************************************//**
+Cleans up a transaction at database startup. The cleanup is needed if
+the transaction already got to the middle of a commit when the database
+crashed, and we cannot roll it back. An insert undo log, if present, is
+cleaned up; the trx is then reset to the TRX_NOT_STARTED state and taken
+off trx_sys->trx_list. */
+UNIV_INTERN
+void
+trx_cleanup_at_db_startup(
+/*======================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	if (trx->insert_undo) {
+		trx_undo_insert_cleanup(trx);
+	}
+
+	/* Return the object to the same state a commit leaves it in */
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+	trx->rseg = NULL;
+	trx->conc_state = TRX_NOT_STARTED;
+
+	UT_LIST_REMOVE(trx_list, trx_sys->trx_list, trx);
+}
+
+/********************************************************************//**
+Assigns a read view for a consistent read query. All the consistent reads
+within the same transaction will get the same read view, which is created
+when this function is first called for a new started transaction. The
+view is opened under the kernel mutex, allocated from
+trx->global_read_view_heap, and also stored as trx->global_read_view.
+@return	consistent read view */
+UNIV_INTERN
+read_view_t*
+trx_assign_read_view(
+/*=================*/
+	trx_t*	trx)	/*!< in: active transaction */
+{
+	ut_ad(trx->conc_state == TRX_ACTIVE);
+
+	if (trx->read_view == NULL) {
+
+		mutex_enter(&kernel_mutex);
+
+		/* Look again now that we hold the mutex */
+		if (trx->read_view == NULL) {
+			read_view_t*	new_view;
+
+			new_view = read_view_open_now(
+				trx->id, trx->global_read_view_heap);
+
+			trx->read_view = new_view;
+			trx->global_read_view = new_view;
+		}
+
+		mutex_exit(&kernel_mutex);
+	}
+
+	return(trx->read_view);
+}
+
+/****************************************************************//**
+Commits a transaction in response to a commit signal and answers every
+TRX_SIG_COMMIT signal in the signal queue of the transaction. The caller
+must hold the kernel mutex. NOTE that the kernel mutex is temporarily
+released. */
+static
+void
+trx_handle_commit_sig_off_kernel(
+/*=============================*/
+	trx_t*		trx,		/*!< in: transaction */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+{
+	trx_sig_t*	cur;
+	trx_sig_t*	following;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	trx->que_state = TRX_QUE_COMMITTING;
+
+	trx_commit_off_kernel(trx);
+
+	ut_ad(UT_LIST_GET_LEN(trx->wait_thrs) == 0);
+
+	/* Reply to each TRX_SIG_COMMIT signal and drop it from the queue.
+	The successor is fetched up front because the current signal may
+	be removed from the list. */
+
+	for (cur = UT_LIST_GET_FIRST(trx->signals);
+	     cur != NULL;
+	     cur = following) {
+
+		following = UT_LIST_GET_NEXT(signals, cur);
+
+		if (cur->type != TRX_SIG_COMMIT) {
+
+			continue;
+		}
+
+		trx_sig_reply(cur, next_thr);
+		trx_sig_remove(trx, cur);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+The transaction must be in the TRX_QUE_LOCK_WAIT state. Puts it to
+the TRX_QUE_RUNNING state and releases query threads which were
+waiting for a lock in the wait_thrs list. If the slow log is in use and
+statistics are taken for the transaction, the time elapsed since
+trx->lock_que_wait_ustarted is added to trx->lock_que_wait_timer; the
+sample is skipped if the system time cannot be read. The caller must
+hold the kernel mutex. */
+UNIV_INTERN
+void
+trx_end_lock_wait(
+/*==============*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	que_thr_t*	thr;
+	ulint		sec;
+	ulint		usec;
+	ib_uint64_t	now;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+	while (thr != NULL) {
+		que_thr_end_wait_no_next_thr(thr);
+
+		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+	}
+
+	/* ut_usectime() reports a failed clock read through its return
+	value (0 means success); sec and usec must not be used in that
+	case, so the wait time is accumulated only after a successful
+	read. */
+	if (innobase_get_slow_log() && trx->take_stats
+	    && ut_usectime(&sec, &usec) == 0) {
+
+		now = (ib_uint64_t) sec * 1000000 + usec;
+		trx->lock_que_wait_timer
+			+= (ulint) (now - trx->lock_que_wait_ustarted);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Moves the query threads in the lock wait list to the SUSPENDED state and puts
+the transaction to the TRX_QUE_RUNNING state. If the slow log is in use and
+statistics are taken for the transaction, the time elapsed since
+trx->lock_que_wait_ustarted is added to trx->lock_que_wait_timer; the
+sample is skipped if the system time cannot be read. The caller must hold
+the kernel mutex. */
+static
+void
+trx_lock_wait_to_suspended(
+/*=======================*/
+	trx_t*	trx)	/*!< in: transaction in the TRX_QUE_LOCK_WAIT state */
+{
+	que_thr_t*	thr;
+	ulint		sec;
+	ulint		usec;
+	ib_uint64_t	now;
+
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx->que_state == TRX_QUE_LOCK_WAIT);
+
+	thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+
+	while (thr != NULL) {
+		thr->state = QUE_THR_SUSPENDED;
+
+		UT_LIST_REMOVE(trx_thrs, trx->wait_thrs, thr);
+
+		thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+	}
+
+	/* ut_usectime() reports a failed clock read through its return
+	value (0 means success); sec and usec must not be used in that
+	case, so the wait time is accumulated only after a successful
+	read. */
+	if (innobase_get_slow_log() && trx->take_stats
+	    && ut_usectime(&sec, &usec) == 0) {
+
+		now = (ib_uint64_t) sec * 1000000 + usec;
+		trx->lock_que_wait_timer
+			+= (ulint) (now - trx->lock_que_wait_ustarted);
+	}
+
+	trx->que_state = TRX_QUE_RUNNING;
+}
+
+/***********************************************************//**
+Empties the sig reply wait list of trx: the receiving query thread of
+each signal in the list is moved to the SUSPENDED state and detached
+from the signal. The caller must hold the kernel mutex. */
+static
+void
+trx_sig_reply_wait_to_suspended(
+/*============================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	trx_sig_t*	sig;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Keep taking the head of the list until the list is empty */
+	for (sig = UT_LIST_GET_FIRST(trx->reply_signals);
+	     sig != NULL;
+	     sig = UT_LIST_GET_FIRST(trx->reply_signals)) {
+
+		que_thr_t*	receiver = sig->receiver;
+
+		ut_ad(receiver->state == QUE_THR_SIG_REPLY_WAIT);
+
+		receiver->state = QUE_THR_SUSPENDED;
+		sig->receiver = NULL;
+
+		UT_LIST_REMOVE(reply_signals, trx->reply_signals, sig);
+	}
+}
+
+/*****************************************************************//**
+Decides whether a new signal may be added to the signal queue of a
+transaction, given the signals that are already queued there. The caller
+must hold kernel_mutex.
+@return	TRUE if the signal can be queued */
+static
+ibool
+trx_sig_is_compatible(
+/*==================*/
+	trx_t*	trx,	/*!< in: trx handle */
+	ulint	type,	/*!< in: signal type */
+	ulint	sender)	/*!< in: TRX_SIG_SELF or TRX_SIG_OTHER_SESS */
+{
+	trx_sig_t*	queued;
+	ulint		conflicting;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (UT_LIST_GET_LEN(trx->signals) == 0) {
+		/* Nothing is queued, so nothing can conflict */
+
+		return(TRUE);
+	}
+
+	if (sender == TRX_SIG_SELF) {
+		/* With a non-empty queue the transaction itself may only
+		add an error or a break execution signal */
+
+		return(type == TRX_SIG_ERROR_OCCURRED
+		       || type == TRX_SIG_BREAK_EXECUTION);
+	}
+
+	ut_ad(sender == TRX_SIG_OTHER_SESS);
+
+	/* A commit conflicts with a queued total rollback, and a total
+	rollback conflicts with a queued commit */
+
+	if (type == TRX_SIG_COMMIT) {
+		conflicting = TRX_SIG_TOTAL_ROLLBACK;
+	} else if (type == TRX_SIG_TOTAL_ROLLBACK) {
+		conflicting = TRX_SIG_COMMIT;
+	} else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+		return(TRUE);
+	} else {
+		ut_error;
+
+		return(FALSE);
+	}
+
+	for (queued = UT_LIST_GET_FIRST(trx->signals);
+	     queued != NULL;
+	     queued = UT_LIST_GET_NEXT(signals, queued)) {
+
+		if (queued->type == conflicting) {
+
+			return(FALSE);
+		}
+	}
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Sends a signal to a trx object. The signal is appended to trx->signals
+and, if it is then the first signal in that queue, its handling is started
+at once with trx_sig_start_handle(). The caller must hold kernel_mutex.
+A signal that trx_sig_is_compatible() rejects is a fatal error. */
+UNIV_INTERN
+void
+trx_sig_send(
+/*=========*/
+	trx_t*		trx,		/*!< in: trx handle */
+	ulint		type,		/*!< in: signal type */
+	ulint		sender,		/*!< in: TRX_SIG_SELF or
+					TRX_SIG_OTHER_SESS */
+	que_thr_t*	receiver_thr,	/*!< in: query thread which wants the
+					reply, or NULL; if type is
+					TRX_SIG_END_WAIT, this must be NULL */
+	trx_savept_t*	savept,		/*!< in: possible rollback savepoint, or
+					NULL */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread; if the parameter
+					is NULL, it is ignored */
+{
+	trx_sig_t*	sig;
+	trx_t*		receiver_trx;
+
+	ut_ad(trx);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	if (!trx_sig_is_compatible(trx, type, sender)) {
+		/* The signal is not compatible with the other signals in
+		the queue: die */
+
+		ut_error;
+	}
+
+	/* Queue the signal object */
+
+	if (UT_LIST_GET_LEN(trx->signals) == 0) {
+
+		/* The signal list is empty: the 'sig' slot must be unused
+		(we improve performance a bit by avoiding mem_alloc) */
+		sig = &(trx->sig);
+	} else {
+		/* It might be that the 'sig' slot is unused also in this
+		case, but we choose the easy way of using mem_alloc */
+
+		sig = mem_alloc(sizeof(trx_sig_t));
+	}
+
+	UT_LIST_ADD_LAST(signals, trx->signals, sig);
+
+	sig->type = type;
+	sig->sender = sender;
+	sig->receiver = receiver_thr;
+
+	/* sig->savept is only written when a savepoint was passed in */
+
+	if (savept) {
+		sig->savept = *savept;
+	}
+
+	/* If a query thread wants the reply, the signal is also linked
+	into the reply_signals list of that thread's transaction */
+
+	if (receiver_thr) {
+		receiver_trx = thr_get_trx(receiver_thr);
+
+		UT_LIST_ADD_LAST(reply_signals, receiver_trx->reply_signals,
+				 sig);
+	}
+
+	if (trx->sess->state == SESS_ERROR) {
+
+		trx_sig_reply_wait_to_suspended(trx);
+	}
+
+	/* Any signal that was not sent by the transaction itself, and any
+	TRX_SIG_BREAK_EXECUTION signal, is treated as a fatal error here */
+
+	if ((sender != TRX_SIG_SELF) || (type == TRX_SIG_BREAK_EXECUTION)) {
+		ut_error;
+	}
+
+	/* If there were no other signals ahead in the queue, try to start
+	handling of the signal */
+
+	if (UT_LIST_GET_FIRST(trx->signals) == sig) {
+
+		trx_sig_start_handle(trx, next_thr);
+	}
+}
+
+/****************************************************************//**
+Ends signal handling: clears trx->handling_signals and restores trx->graph
+from trx->graph_before_signal_handling. If the restored graph is not NULL
+and the session is in the error state, control is passed to the error
+handling routine of the graph with que_fork_error_handle(). The caller must
+hold kernel_mutex. */
+UNIV_INTERN
+void
+trx_end_signal_handling(
+/*====================*/
+	trx_t*	trx)	/*!< in: trx */
+{
+	ut_ad(mutex_own(&kernel_mutex));
+	ut_ad(trx->handling_signals == TRUE);
+
+	trx->handling_signals = FALSE;
+	trx->graph = trx->graph_before_signal_handling;
+
+	if (trx->graph == NULL) {
+		/* No graph to hand the error to */
+
+		return;
+	}
+
+	if (trx->sess->state == SESS_ERROR) {
+
+		que_fork_error_handle(trx, trx->graph);
+	}
+}
+
+/****************************************************************//**
+Starts handling of a trx signal. Signals are taken from the head of
+trx->signals one at a time; the function keeps looping as long as the
+signal just handled allows the next one to be processed immediately, and
+returns once a rollback has been started, query threads are still active,
+or the queue has been emptied. The caller must hold kernel_mutex. */
+UNIV_INTERN
+void
+trx_sig_start_handle(
+/*=================*/
+	trx_t*		trx,		/*!< in: trx handle */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread; if the parameter
+					is NULL, it is ignored */
+{
+	trx_sig_t*	sig;
+	ulint		type;
+loop:
+	/* We loop in this function body as long as there are queued signals
+	we can process immediately */
+
+	ut_ad(trx);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/* Signal handling was in progress and the queue has run empty:
+	finish the handling and return */
+
+	if (trx->handling_signals && (UT_LIST_GET_LEN(trx->signals) == 0)) {
+
+		trx_end_signal_handling(trx);
+
+		return;
+	}
+
+	if (trx->conc_state == TRX_NOT_STARTED) {
+
+		trx_start_low(trx, ULINT_UNDEFINED);
+	}
+
+	/* If the trx is in a lock wait state, moves the waiting query threads
+	to the suspended state */
+
+	if (trx->que_state == TRX_QUE_LOCK_WAIT) {
+
+		trx_lock_wait_to_suspended(trx);
+	}
+
+	/* If the session is in the error state and this trx has threads
+	waiting for reply from signals, moves these threads to the suspended
+	state, canceling wait reservations; note that if the transaction has
+	sent a commit or rollback signal to itself, and its session is not in
+	the error state, then nothing is done here. */
+
+	if (trx->sess->state == SESS_ERROR) {
+		trx_sig_reply_wait_to_suspended(trx);
+	}
+
+	/* If there are no running query threads, we can start processing of a
+	signal, otherwise we have to wait until all query threads of this
+	transaction are aware of the arrival of the signal. */
+
+	if (trx->n_active_thrs > 0) {
+
+		return;
+	}
+
+	/* On entering signal handling, remember the current graph so that
+	trx_end_signal_handling() can restore it */
+
+	if (trx->handling_signals == FALSE) {
+		trx->graph_before_signal_handling = trx->graph;
+
+		trx->handling_signals = TRUE;
+	}
+
+	/* Dispatch on the type of the signal at the head of the queue */
+
+	sig = UT_LIST_GET_FIRST(trx->signals);
+	type = sig->type;
+
+	if (type == TRX_SIG_COMMIT) {
+
+		trx_handle_commit_sig_off_kernel(trx, next_thr);
+
+	} else if ((type == TRX_SIG_TOTAL_ROLLBACK)
+		   || (type == TRX_SIG_ROLLBACK_TO_SAVEPT)) {
+
+		trx_rollback(trx, sig, next_thr);
+
+		/* No further signals can be handled until the rollback
+		completes, therefore we return */
+
+		return;
+
+	} else if (type == TRX_SIG_ERROR_OCCURRED) {
+
+		trx_rollback(trx, sig, next_thr);
+
+		/* No further signals can be handled until the rollback
+		completes, therefore we return */
+
+		return;
+
+	} else if (type == TRX_SIG_BREAK_EXECUTION) {
+
+		trx_sig_reply(sig, next_thr);
+		trx_sig_remove(trx, sig);
+	} else {
+		ut_error;
+	}
+
+	/* Only the commit and break execution branches reach this point */
+
+	goto loop;
+}
+
+/****************************************************************//**
+Sends the reply for a handled signal. If a query thread is registered as the
+receiver of the signal, the signal is unlinked from the reply_signals list
+of that thread's transaction, the thread's wait is ended and the receiver
+pointer of the signal is cleared. Without a receiver nothing is done. The
+caller must hold kernel_mutex. */
+UNIV_INTERN
+void
+trx_sig_reply(
+/*==========*/
+	trx_sig_t*	sig,		/*!< in: signal */
+	que_thr_t**	next_thr)	/*!< in/out: next query thread to run;
+					if the value which is passed in is
+					a pointer to a NULL pointer, then the
+					calling function can start running
+					a new query thread */
+{
+	que_thr_t*	receiver;
+	trx_t*		receiver_trx;
+
+	ut_ad(sig);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	receiver = sig->receiver;
+
+	if (receiver == NULL) {
+		/* Nobody is waiting for a reply */
+
+		return;
+	}
+
+	ut_ad(receiver->state == QUE_THR_SIG_REPLY_WAIT);
+
+	receiver_trx = thr_get_trx(receiver);
+
+	UT_LIST_REMOVE(reply_signals, receiver_trx->reply_signals, sig);
+	ut_ad(receiver_trx->sess->state != SESS_ERROR);
+
+	que_thr_end_wait(receiver, next_thr);
+
+	sig->receiver = NULL;
+}
+
+/****************************************************************//**
+Unlinks a signal object from the signal queue of a transaction and frees
+it, unless it is the signal slot embedded in the trx object itself. The
+signal must no longer have a receiver. The caller must hold kernel_mutex. */
+UNIV_INTERN
+void
+trx_sig_remove(
+/*===========*/
+	trx_t*		trx,	/*!< in: trx handle */
+	trx_sig_t*	sig)	/*!< in, own: signal */
+{
+	ut_ad(trx && sig);
+	ut_ad(mutex_own(&kernel_mutex));
+
+	ut_ad(sig->receiver == NULL);
+
+	UT_LIST_REMOVE(signals, trx->signals, sig);
+
+	/* Reset the field to catch possible bugs */
+	sig->type = 0;
+
+	if (sig == &(trx->sig)) {
+		/* The slot inside the trx object must not be freed */
+
+		return;
+	}
+
+	mem_free(sig);
+}
+
+/*********************************************************************//**
+Allocates a commit command node from the given memory heap and initializes
+its node type to QUE_NODE_COMMIT and its state to COMMIT_NODE_SEND.
+@return	own: commit node struct */
+UNIV_INTERN
+commit_node_t*
+commit_node_create(
+/*===============*/
+	mem_heap_t*	heap)	/*!< in: mem heap where created */
+{
+	commit_node_t*	new_node = mem_heap_alloc(heap, sizeof(*new_node));
+
+	new_node->common.type = QUE_NODE_COMMIT;
+	new_node->state = COMMIT_NODE_SEND;
+
+	return(new_node);
+}
+
+/***********************************************************//**
+Performs an execution step for a commit type node in a query graph. In the
+COMMIT_NODE_SEND state a TRX_SIG_COMMIT signal is sent to the transaction of
+the thread and the node moves to COMMIT_NODE_WAIT; in the COMMIT_NODE_WAIT
+state the node is reset to COMMIT_NODE_SEND and control goes to its parent.
+@return	query thread to run next, or NULL */
+UNIV_INTERN
+que_thr_t*
+trx_commit_step(
+/*============*/
+	que_thr_t*	thr)	/*!< in: query thread */
+{
+	commit_node_t*	node = thr->run_node;
+	que_thr_t*	next_thr;
+
+	ut_ad(que_node_get_type(node) == QUE_NODE_COMMIT);
+
+	if (thr->prev_node == que_node_get_parent(node)) {
+		/* Control came down from the parent: start from the
+		send state */
+
+		node->state = COMMIT_NODE_SEND;
+	}
+
+	if (node->state != COMMIT_NODE_SEND) {
+		ut_ad(node->state == COMMIT_NODE_WAIT);
+
+		node->state = COMMIT_NODE_SEND;
+
+		thr->run_node = que_node_get_parent(node);
+
+		return(thr);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	node->state = COMMIT_NODE_WAIT;
+
+	next_thr = NULL;
+
+	thr->state = QUE_THR_SIG_REPLY_WAIT;
+
+	/* Send the commit signal to the transaction */
+
+	trx_sig_send(thr_get_trx(thr), TRX_SIG_COMMIT, TRX_SIG_SELF,
+		     thr, NULL, &next_thr);
+
+	mutex_exit(&kernel_mutex);
+
+	return(next_thr);
+}
+
+/**********************************************************************//**
+Does the transaction commit for MySQL. The transaction is started first if
+necessary, and trx_commit_off_kernel() is then called with kernel_mutex
+held. trx->op_info reads "committing" for the duration of the call.
+@return	DB_SUCCESS or error number; this implementation always returns
+DB_SUCCESS */
+UNIV_INTERN
+ulint
+trx_commit_for_mysql(
+/*=================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	/* Because we do not do the commit by sending an Innobase
+	sig to the transaction, we must here make sure that trx has been
+	started. */
+
+	ut_a(trx);
+
+	trx_start_if_not_started(trx);
+
+	trx->op_info = "committing";
+
+	/* The commit itself is done under kernel_mutex */
+
+	mutex_enter(&kernel_mutex);
+
+	trx_commit_off_kernel(trx);
+
+	mutex_exit(&kernel_mutex);
+
+	trx->op_info = "";
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+If required, flushes the log to disk if we called trx_commit_for_mysql()
+with trx->flush_log_later == TRUE.
+
+The flush policy is trx->flush_log_at_trx_commit_session, except that the
+value 3 selects the global srv_flush_log_at_trx_commit instead. Nothing is
+written unless trx->must_flush_log_later is set. With policy 0 nothing is
+written; with 1 the log is written up to trx->commit_lsn and flushed to
+disk (written only, if srv_unix_file_flush_method is SRV_UNIX_NOSYNC);
+with 2 the log is written but not flushed. Any other policy value is a
+fatal error. trx->must_flush_log_later is always cleared before returning.
+@return	0; no error is currently reported */
+UNIV_INTERN
+ulint
+trx_commit_complete_for_mysql(
+/*==========================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	ib_uint64_t	lsn;
+	ulint		flush_log_at_trx_commit;
+
+	/* The assertion must come before the first dereference of trx,
+	otherwise it cannot catch a NULL handle */
+
+	ut_a(trx);
+
+	lsn = trx->commit_lsn;
+
+	trx->op_info = "flushing log";
+
+	if (trx->flush_log_at_trx_commit_session == 3) {
+		flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+	} else {
+		flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+	}
+
+	if (!trx->must_flush_log_later) {
+		/* Do nothing */
+	} else if (flush_log_at_trx_commit == 0) {
+		/* Do nothing */
+	} else if (flush_log_at_trx_commit == 1) {
+		if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			/* Write the log to the log files AND flush them to
+			disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+		}
+	} else if (flush_log_at_trx_commit == 2) {
+
+		/* Write the log but do not flush it to disk */
+
+		log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+	} else {
+		ut_error;
+	}
+
+	trx->must_flush_log_later = FALSE;
+
+	trx->op_info = "";
+
+	return(0);
+}
+
+/**********************************************************************//**
+Marks the latest SQL statement ended by recording the current undo number
+of the transaction in trx->last_sql_stat_start.least_undo_no. For a
+transaction that has not been started, the undo number is reset to zero
+first. */
+UNIV_INTERN
+void
+trx_mark_sql_stat_end(
+/*==================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	ut_a(trx);
+
+	if (trx->conc_state != TRX_NOT_STARTED) {
+		trx->last_sql_stat_start.least_undo_no = trx->undo_no;
+
+		return;
+	}
+
+	/* Not started: both undo numbers become zero */
+
+	trx->undo_no = ut_dulint_zero;
+	trx->last_sql_stat_start.least_undo_no = ut_dulint_zero;
+}
+
+/**********************************************************************//**
+Prints info about a transaction to the given file. The caller must own the
+kernel mutex.
+
+The output is a header line (transaction id, state, thread information and
+flags), an optional line with the MySQL table counts, an optional status
+line (query state, lock statistics, adaptive hash latch, undo log entries)
+and, if trx->mysql_thd is set, whatever innobase_mysql_print_thd() prints. */
+UNIV_INTERN
+void
+trx_print(
+/*======*/
+	FILE*	f,		/*!< in: output stream */
+	trx_t*	trx,		/*!< in: transaction */
+	ulint	max_query_len)	/*!< in: max query length to print, or 0 to
+				   use the default max length */
+{
+	/* TRUE when the status line has output that still needs to be
+	terminated with a newline */
+	ibool	newline;
+
+	fprintf(f, "TRANSACTION " TRX_ID_FMT, TRX_ID_PREP_PRINTF(trx->id));
+
+	switch (trx->conc_state) {
+	case TRX_NOT_STARTED:
+		fputs(", not started", f);
+		break;
+	case TRX_ACTIVE:
+		fprintf(f, ", ACTIVE %lu sec",
+			(ulong)difftime(time(NULL), trx->start_time));
+		break;
+	case TRX_PREPARED:
+		fprintf(f, ", ACTIVE (PREPARED) %lu sec",
+			(ulong)difftime(time(NULL), trx->start_time));
+		break;
+	case TRX_COMMITTED_IN_MEMORY:
+		fputs(", COMMITTED IN MEMORY", f);
+		break;
+	default:
+		/* Unknown state: print its numeric value */
+		fprintf(f, " state %lu", (ulong) trx->conc_state);
+	}
+
+#ifdef UNIV_LINUX
+	fprintf(f, ", process no %lu", trx->mysql_process_no);
+#endif
+	fprintf(f, ", OS thread id %lu",
+		(ulong) os_thread_pf(trx->mysql_thread_id));
+
+	/* op_info is printed only when it is a non-empty string */
+
+	if (*trx->op_info) {
+		putc(' ', f);
+		fputs(trx->op_info, f);
+	}
+
+	if (trx->is_recovered) {
+		fputs(" recovered trx", f);
+	}
+
+	if (trx->is_purge) {
+		fputs(" purge trx", f);
+	}
+
+	if (trx->declared_to_be_inside_innodb) {
+		fprintf(f, ", thread declared inside InnoDB %lu",
+			(ulong) trx->n_tickets_to_enter_innodb);
+	}
+
+	/* End of the header line */
+
+	putc('\n', f);
+
+	if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) {
+		fprintf(f, "mysql tables in use %lu, locked %lu\n",
+			(ulong) trx->n_mysql_tables_in_use,
+			(ulong) trx->mysql_n_tables_locked);
+	}
+
+	newline = TRUE;
+
+	/* A running transaction prints no query state, so by itself it
+	does not require the status line to be terminated */
+
+	switch (trx->que_state) {
+	case TRX_QUE_RUNNING:
+		newline = FALSE; break;
+	case TRX_QUE_LOCK_WAIT:
+		fputs("LOCK WAIT ", f); break;
+	case TRX_QUE_ROLLING_BACK:
+		fputs("ROLLING BACK ", f); break;
+	case TRX_QUE_COMMITTING:
+		fputs("COMMITTING ", f); break;
+	default:
+		fprintf(f, "que state %lu ", (ulong) trx->que_state);
+	}
+
+	/* NOTE(review): 400 presumably approximates the size of a lock heap
+	that holds no locks - confirm against the lock heap creation code */
+
+	if (0 < UT_LIST_GET_LEN(trx->trx_locks)
+	    || mem_heap_get_size(trx->lock_heap) > 400) {
+		newline = TRUE;
+
+		fprintf(f, "%lu lock struct(s), heap size %lu,"
+			" %lu row lock(s)",
+			(ulong) UT_LIST_GET_LEN(trx->trx_locks),
+			(ulong) mem_heap_get_size(trx->lock_heap),
+			(ulong) lock_number_of_rows_locked(trx));
+	}
+
+	if (trx->has_search_latch) {
+		newline = TRUE;
+		fputs(", holds adaptive hash latch", f);
+	}
+
+	/* Only the low part of the undo number is printed */
+
+	if (!ut_dulint_is_zero(trx->undo_no)) {
+		newline = TRUE;
+		fprintf(f, ", undo log entries %lu",
+			(ulong) ut_dulint_get_low(trx->undo_no));
+	}
+
+	if (newline) {
+		putc('\n', f);
+	}
+
+	if (trx->mysql_thd != NULL) {
+		innobase_mysql_print_thd(f, trx->mysql_thd, max_query_len);
+	}
+}
+
+/*******************************************************************//**
+Compares the "weight" (or size) of two transactions. A transaction that
+has edited non-transactional tables is heavier than one that has not; if
+the two do not differ in that respect, TRX_WEIGHT() decides.
+@return	<0, 0 or >0; similar to strcmp(3) */
+UNIV_INTERN
+int
+trx_weight_cmp(
+/*===========*/
+	const trx_t*	a,	/*!< in: the first transaction to be compared */
+	const trx_t*	b)	/*!< in: the second transaction to be compared */
+{
+	/* A transaction without a mysql_thd is taken not to have edited
+	non-transactional tables */
+
+	ibool	a_heavy = a->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(a->mysql_thd);
+	ibool	b_heavy = b->mysql_thd != NULL
+		&& thd_has_edited_nontrans_tables(b->mysql_thd);
+
+	if (a_heavy != b_heavy) {
+		/* Exactly one of the two has edited non-transactional
+		tables, and that one is the heavier */
+
+		return(a_heavy ? 1 : -1);
+	}
+
+	/* No difference in that respect: compare the number of
+	altered/locked rows */
+
+#if 0
+	fprintf(stderr,
+		"%s TRX_WEIGHT(a): %lld+%lu, TRX_WEIGHT(b): %lld+%lu\n",
+		__func__,
+		ut_conv_dulint_to_longlong(a->undo_no),
+		UT_LIST_GET_LEN(a->trx_locks),
+		ut_conv_dulint_to_longlong(b->undo_no),
+		UT_LIST_GET_LEN(b->trx_locks));
+#endif
+
+	return(ut_dulint_cmp(TRX_WEIGHT(a), TRX_WEIGHT(b)));
+}
+
+/****************************************************************//**
+Prepares a transaction. Must be called with kernel_mutex held, and returns
+with it held; the mutex is released and re-acquired around the
+mini-transaction that marks the undo logs as prepared, and again around the
+log write. If the transaction has neither an insert nor an update undo log,
+only trx->conc_state is changed. */
+UNIV_INTERN
+void
+trx_prepare_off_kernel(
+/*===================*/
+	trx_t*	trx)	/*!< in: transaction */
+{
+	page_t*		update_hdr_page;
+	trx_rseg_t*	rseg;
+	ib_uint64_t	lsn		= 0;	/* stays 0 when there is no
+						undo log to mark prepared */
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	rseg = trx->rseg;
+
+	if (trx->insert_undo != NULL || trx->update_undo != NULL) {
+
+		mutex_exit(&kernel_mutex);
+
+		mtr_start(&mtr);
+
+		/* Change the undo log segment states from TRX_UNDO_ACTIVE
+		to TRX_UNDO_PREPARED: these modifications to the file data
+		structure define the transaction as prepared in the
+		file-based world, at the serialization point of lsn. */
+
+		mutex_enter(&(rseg->mutex));
+
+		if (trx->insert_undo != NULL) {
+
+			/* It is not necessary to obtain trx->undo_mutex here
+			because only a single OS thread is allowed to do the
+			transaction prepare for this transaction. */
+
+			trx_undo_set_state_at_prepare(trx, trx->insert_undo,
+						      &mtr);
+		}
+
+		/* NOTE(review): update_hdr_page is assigned here but is not
+		read anywhere else in this function */
+
+		if (trx->update_undo) {
+			update_hdr_page = trx_undo_set_state_at_prepare(
+				trx, trx->update_undo, &mtr);
+		}
+
+		mutex_exit(&(rseg->mutex));
+
+		/*--------------*/
+		mtr_commit(&mtr);	/* This mtr commit makes the
+					transaction prepared in the file-based
+					world */
+		/*--------------*/
+		lsn = mtr.end_lsn;
+
+		mutex_enter(&kernel_mutex);
+	}
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	/*--------------------------------------*/
+	trx->conc_state = TRX_PREPARED;
+	/*--------------------------------------*/
+
+	/* The log is only written if the mini-transaction above ran and
+	left a nonzero end lsn */
+
+	if (lsn) {
+		ulint	flush_log_at_trx_commit;
+
+		/* Depending on the my.cnf options, we may now write the log
+		buffer to the log files, making the prepared state of the
+		transaction durable if the OS does not crash. We may also
+		flush the log files to disk, making the prepared state of the
+		transaction durable also at an OS crash or a power outage.
+
+		The idea in InnoDB's group prepare is that a group of
+		transactions gather behind a trx doing a physical disk write
+		to log files, and when that physical write has been completed,
+		one of those transactions does a write which prepares the whole
+		group. Note that this group prepare will only bring benefit if
+		there are > 2 users in the database. Then at least 2 users can
+		gather behind one doing the physical log write to disk.
+
+		TODO: find out if MySQL holds some mutex when calling this.
+		That would spoil our group prepare algorithm. */
+
+		mutex_exit(&kernel_mutex);
+
+		/* The session value 3 selects the global setting
+		srv_flush_log_at_trx_commit */
+
+		if (trx->flush_log_at_trx_commit_session == 3) {
+			flush_log_at_trx_commit = srv_flush_log_at_trx_commit;
+		} else {
+			flush_log_at_trx_commit = trx->flush_log_at_trx_commit_session;
+		}
+
+		if (flush_log_at_trx_commit == 0) {
+			/* Do nothing */
+		} else if (flush_log_at_trx_commit == 1) {
+			if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+				/* Write the log but do not flush it to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+						FALSE);
+			} else {
+				/* Write the log to the log files AND flush
+				them to disk */
+
+				log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
+			}
+		} else if (flush_log_at_trx_commit == 2) {
+
+			/* Write the log but do not flush it to disk */
+
+			log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
+		} else {
+			ut_error;
+		}
+
+		mutex_enter(&kernel_mutex);
+	}
+}
+
+/**********************************************************************//**
+Does the transaction prepare for MySQL. The transaction is started first if
+necessary, and trx_prepare_off_kernel() is then called with kernel_mutex
+held. trx->op_info reads "preparing" for the duration of the call.
+@return	0 or error number; this implementation always returns 0 */
+UNIV_INTERN
+ulint
+trx_prepare_for_mysql(
+/*==================*/
+	trx_t*	trx)	/*!< in: trx handle */
+{
+	/* Because we do not do the prepare by sending an Innobase
+	sig to the transaction, we must here make sure that trx has been
+	started. */
+
+	ut_a(trx);
+
+	trx->op_info = "preparing";
+
+	trx_start_if_not_started(trx);
+
+	/* The prepare itself is entered with kernel_mutex held */
+
+	mutex_enter(&kernel_mutex);
+
+	trx_prepare_off_kernel(trx);
+
+	mutex_exit(&kernel_mutex);
+
+	trx->op_info = "";
+
+	return(0);
+}
+
+/**********************************************************************//**
+This function is used to find number of prepared transactions and
+their transaction objects for a recovery. The transaction list is scanned
+under kernel_mutex; the XID of each transaction in the TRX_PREPARED state
+is copied to xid_list and reported on stderr. The scan stops when the list
+ends or when len slots have been filled, so at most len entries are ever
+written.
+@return	number of prepared transactions stored in xid_list */
+UNIV_INTERN
+int
+trx_recover_for_mysql(
+/*==================*/
+	XID*	xid_list,	/*!< in/out: prepared transactions */
+	ulint	len)		/*!< in: number of slots in xid_list */
+{
+	trx_t*	trx;
+	ulint	count = 0;
+
+	ut_ad(xid_list);
+	ut_ad(len);
+
+	/* We should set those transactions which are in the prepared state
+	to the xid_list */
+
+	mutex_enter(&kernel_mutex);
+
+	trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+
+	/* The capacity test is part of the loop condition, so that a slot
+	is never written when none is left; this also covers len == 0 in
+	case the debug assertion above is not active */
+
+	while (trx != NULL && count < len) {
+		if (trx->conc_state == TRX_PREPARED) {
+			xid_list[count] = trx->xid;
+
+			if (count == 0) {
+				ut_print_timestamp(stderr);
+				fprintf(stderr,
+					"  InnoDB: Starting recovery for"
+					" XA transactions...\n");
+			}
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction " TRX_ID_FMT " in"
+				" prepared state after recovery\n",
+				TRX_ID_PREP_PRINTF(trx->id));
+
+			ut_print_timestamp(stderr);
+			fprintf(stderr,
+				"  InnoDB: Transaction contains changes"
+				" to %lu rows\n",
+				(ulong) ut_conv_dulint_to_longlong(
+					trx->undo_no));
+
+			count++;
+		}
+
+		trx = UT_LIST_GET_NEXT(trx_list, trx);
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	if (count > 0){
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: %lu transactions in prepared state"
+			" after recovery\n",
+			(ulong) count);
+	}
+
+	return ((int) count);
+}
+
+/*******************************************************************//**
+Looks up an X/Open XA distributed transaction by its XID. The transaction
+list is searched under kernel_mutex for the first transaction with an equal
+XID; that transaction is returned only if it is in the TRX_PREPARED state.
+@return	trx or NULL */
+UNIV_INTERN
+trx_t*
+trx_get_trx_by_xid(
+/*===============*/
+	XID*	xid)	/*!< in: X/Open XA transaction identification */
+{
+	trx_t*	trx;
+
+	if (xid == NULL) {
+
+		return (NULL);
+	}
+
+	mutex_enter(&kernel_mutex);
+
+	for (trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+	     trx != NULL;
+	     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
+
+		/* Two X/Open XA transaction ids are equal when both of
+		their lengths agree and the first
+		gtrid_length + bqual_length bytes of data compare equal */
+
+		if (xid->gtrid_length == trx->xid.gtrid_length
+		    && xid->bqual_length == trx->xid.bqual_length
+		    && memcmp(xid->data, trx->xid.data,
+			      xid->gtrid_length + xid->bqual_length) == 0) {
+			break;
+		}
+	}
+
+	mutex_exit(&kernel_mutex);
+
+	/* NOTE(review): conc_state is read here after kernel_mutex has
+	been released */
+
+	if (trx != NULL && trx->conc_state != TRX_PREPARED) {
+		trx = NULL;
+	}
+
+	return(trx);
+}
diff --git a/storage/xtradb/trx/trx0undo.c b/storage/xtradb/trx/trx0undo.c
new file mode 100644
index 00000000000..ec4beb5660a
--- /dev/null
+++ b/storage/xtradb/trx/trx0undo.c
@@ -0,0 +1,2032 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file trx/trx0undo.c
+Transaction undo log
+
+Created 3/26/1996 Heikki Tuuri
+*******************************************************/
+
+#include "trx0undo.h"
+
+#ifdef UNIV_NONINL
+#include "trx0undo.ic"
+#endif
+
+#include "fsp0fsp.h"
+#ifndef UNIV_HOTBACKUP
+#include "mach0data.h"
+#include "mtr0log.h"
+#include "trx0rseg.h"
+#include "trx0trx.h"
+#include "srv0srv.h"
+#include "trx0rec.h"
+#include "trx0purge.h"
+
+/* How should the old versions in the history list be managed?
+   ----------------------------------------------------------
+If each transaction is given a whole page for its update undo log, file
+space consumption can be 10 times higher than necessary. Therefore,
+partly filled update undo log pages should be reusable. But then there
+is no way individual pages can be ordered so that the ordering agrees
+with the serialization numbers of the transactions on the pages. Thus,
+the history list must be formed of undo logs, not their header pages as
+it was in the old implementation.
+	However, on a single header page the transactions are placed in
+the order of their serialization numbers. As old versions are purged, we
+may free the page when the last transaction on the page has been purged.
+	A problem is that the purge has to go through the transactions
+in the serialization order. This means that we have to look through all
+rollback segments for the one that has the smallest transaction number
+in its history list.
+	When should we do a purge? A purge is necessary when space is
+running out in any of the rollback segments. Then we may have to purge
+also old version which might be needed by some consistent read. How do
+we trigger the start of a purge? When a transaction writes to an undo log,
+it may notice that the space is running out. When a read view is closed,
+it may make some history superfluous. The server can have a utility which
+periodically checks if it can purge some history.
+	In a parallelized purge we have the problem that a query thread
+can remove a delete marked clustered index record before another query
+thread has processed an earlier version of the record, which cannot then
+be done because the row cannot be constructed from the clustered index
+record. To avoid this problem, we will store in the update and delete mark
+undo record also the columns necessary to construct the secondary index
+entries which are modified.
+	We can latch the stack of versions of a single clustered index record
+by taking a latch on the clustered index page. As long as the latch is held,
+no new versions can be added and no versions removed by undo. But, a purge
+can still remove old versions from the bottom of the stack. */
+
+/* How to protect rollback segments, undo logs, and history lists with
+   -------------------------------------------------------------------
+latches?
+-------
+The contention of the kernel mutex should be minimized. When a transaction
+does its first insert or modify in an index, an undo log is assigned for it.
+Then we must have an x-latch to the rollback segment header.
+	When the transaction does more modifications or rolls back, the undo log is
+protected with undo_mutex in the transaction.
+	When the transaction commits, its insert undo log is either reset and
+cached for a fast reuse, or freed. In these cases we must have an x-latch on
+the rollback segment page. The update undo log is put to the history list. If
+it is not suitable for reuse, its slot in the rollback segment is reset. In
+both cases, an x-latch must be acquired on the rollback segment.
+	The purge operation steps through the history list without modifying
+it until a truncate operation occurs, which can remove undo logs from the end
+of the list and release undo log segments. In stepping through the list,
+s-latches on the undo log pages are enough, but in a truncate, x-latches must
+be obtained on the rollback segment and individual pages. */
+#endif /* !UNIV_HOTBACKUP */
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+	page_t* undo_page,	/*!< in: undo log segment page */
+	ulint	type,		/*!< in: undo log segment type */
+	mtr_t*	mtr);		/*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return	own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+	trx_rseg_t*	rseg,	/*!< in: rollback segment memory object */
+	ulint		id,	/*!< in: slot index within rseg */
+	ulint		type,	/*!< in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
+				is created */
+	const XID*	xid,	/*!< in: X/Open XA transaction identification*/
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset);/*!< in: undo log header byte offset on page */
+#endif /* !UNIV_HOTBACKUP */
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return	undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+	page_t*		undo_page,	/*!< in/out: insert undo log segment
+					header page, x-latched */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	mtr_t*		mtr);		/*!< in: mtr */
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+	page_t*	undo_page,	/*!< in: header page of an undo log of size 1 */
+	mtr_t*	mtr);		/*!< in: mtr */
+
+#ifndef UNIV_HOTBACKUP
+/***********************************************************************//**
+Steps from an undo record to the last record of the same undo log on the
+preceding page in the page list of the undo segment.
+@return	undo log record, the page s-latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_prev_rec_from_prev_page(
+/*=================================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset,	/*!< in: undo log header offset on page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		cur_page = page_align(rec);
+	fil_addr_t	prev_addr;
+	ulint		space_id;
+	ulint		zip_size;
+	page_t*		prev_page;
+
+	/* The list node in the page header links the pages together */
+	prev_addr = flst_get_prev_addr(cur_page + TRX_UNDO_PAGE_HDR
+				       + TRX_UNDO_PAGE_NODE, mtr);
+
+	if (prev_addr.page == FIL_NULL) {
+		/* There is no previous page in the list */
+
+		return(NULL);
+	}
+
+	space_id = page_get_space_id(cur_page);
+	zip_size = fil_space_get_zip_size(space_id);
+
+	prev_page = trx_undo_page_get_s_latched(space_id, zip_size,
+						prev_addr.page, mtr);
+
+	return(trx_undo_page_get_last_rec(prev_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Returns the undo record that precedes rec in its undo log, looking first on
+the page of rec and then on the previous page of the log.
+@return	undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_prev_rec(
+/*==================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset,	/*!< in: undo log header offset on page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_undo_rec_t*	found;
+
+	found = trx_undo_page_get_prev_rec(rec, page_no, offset);
+
+	if (found == NULL) {
+		/* Nothing before rec on this page: continue the search
+		on the previous undo log page */
+
+		found = trx_undo_get_prev_rec_from_prev_page(
+			rec, page_no, offset, mtr);
+	}
+
+	return(found);
+}
+
+/***********************************************************************//**
+Finds the first record of an undo log on the page that follows undo_page.
+@return	undo log record, the page latched, NULL if none */
+static
+trx_undo_rec_t*
+trx_undo_get_next_rec_from_next_page(
+/*=================================*/
+	ulint	space,	/*!< in: undo log header space */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	page_t*	undo_page, /*!< in: undo log page */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset,	/*!< in: undo log header offset on page */
+	ulint	mode,	/*!< in: latch mode: RW_S_LATCH or RW_X_LATCH */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	trx_ulogf_t*	log_hdr;
+	fil_addr_t	next_addr;
+	page_t*		next_page;
+
+	if (page_no == page_get_page_no(undo_page)) {
+		/* undo_page is the header page of the log: a non-zero
+		TRX_UNDO_NEXT_LOG field in the log header ends the
+		search here */
+		log_hdr = undo_page + offset;
+
+		if (mach_read_from_2(log_hdr + TRX_UNDO_NEXT_LOG) != 0) {
+
+			return(NULL);
+		}
+	}
+
+	next_addr = flst_get_next_addr(undo_page + TRX_UNDO_PAGE_HDR
+				       + TRX_UNDO_PAGE_NODE, mtr);
+
+	if (next_addr.page == FIL_NULL) {
+
+		return(NULL);
+	}
+
+	if (mode != RW_S_LATCH) {
+		ut_ad(mode == RW_X_LATCH);
+		next_page = trx_undo_page_get(space, zip_size,
+					      next_addr.page, mtr);
+	} else {
+		next_page = trx_undo_page_get_s_latched(space, zip_size,
+							next_addr.page, mtr);
+	}
+
+	return(trx_undo_page_get_first_rec(next_page, page_no, offset));
+}
+
+/***********************************************************************//**
+Returns the undo record that follows rec in its undo log.
+@return	undo log record, the page s-latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_next_rec(
+/*==================*/
+	trx_undo_rec_t*	rec,	/*!< in: undo record */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset,	/*!< in: undo log header offset on page */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		undo_page;
+	ulint		space;
+	trx_undo_rec_t*	next_rec;
+
+	next_rec = trx_undo_page_get_next_rec(rec, page_no, offset);
+
+	if (next_rec == NULL) {
+		/* No further record on this page: look on the next
+		page of the log, requesting an s-latch on it */
+		undo_page = page_align(rec);
+		space = page_get_space_id(undo_page);
+		next_rec = trx_undo_get_next_rec_from_next_page(
+			space, fil_space_get_zip_size(space), undo_page,
+			page_no, offset, RW_S_LATCH, mtr);
+	}
+
+	return(next_rec);
+}
+
+/***********************************************************************//**
+Returns the first record of an undo log, starting from its header page.
+@return	undo log record, the page latched, NULL if none */
+UNIV_INTERN
+trx_undo_rec_t*
+trx_undo_get_first_rec(
+/*===================*/
+	ulint	space,	/*!< in: undo log header space */
+	ulint	zip_size,/*!< in: compressed page size in bytes
+			or 0 for uncompressed pages */
+	ulint	page_no,/*!< in: undo log header page number */
+	ulint	offset,	/*!< in: undo log header offset on page */
+	ulint	mode,	/*!< in: latching mode: RW_S_LATCH or RW_X_LATCH */
+	mtr_t*	mtr)	/*!< in: mtr */
+{
+	page_t*		hdr_page;
+	trx_undo_rec_t*	first;
+
+	/* Any mode other than RW_S_LATCH gets the page through
+	trx_undo_page_get() */
+	hdr_page = (mode == RW_S_LATCH)
+		? trx_undo_page_get_s_latched(space, zip_size, page_no, mtr)
+		: trx_undo_page_get(space, zip_size, page_no, mtr);
+
+	first = trx_undo_page_get_first_rec(hdr_page, page_no, offset);
+
+	if (first == NULL) {
+		/* No record of this log was found on the header page */
+		first = trx_undo_get_next_rec_from_next_page(
+			space, zip_size, hdr_page, page_no, offset,
+			mode, mtr);
+	}
+
+	return(first);
+}
+
+/*============== UNDO LOG FILE COPY CREATION AND FREEING ==================*/
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log page initialization. */
+UNIV_INLINE
+void
+trx_undo_page_init_log(
+/*===================*/
+	page_t* undo_page,	/*!< in: undo log page */
+	ulint	type,		/*!< in: undo log type */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_INIT, mtr);
+	/* The type is the record body; trx_undo_parse_page_init() reads it */
+	mlog_catenate_ulint_compressed(mtr, type);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_page_init_log(undo_page,type,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page initialization.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_init(
+/*=====================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+{
+	ulint	page_type;
+
+	ptr = mach_parse_compressed(ptr, end_ptr, &page_type);
+
+	if (ptr != NULL && page != NULL) {
+		/* Parsing succeeded and there is a page to apply the
+		record to */
+
+		trx_undo_page_init(page, page_type, mtr);
+	}
+
+	/* NULL here means that mach_parse_compressed() returned NULL */
+
+	return(ptr);
+}
+
+/********************************************************************//**
+Initializes the fields in an undo log segment page. */
+static
+void
+trx_undo_page_init(
+/*===============*/
+	page_t* undo_page,	/*!< in: undo log segment page */
+	ulint	type,		/*!< in: undo log segment type */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	trx_upagef_t*	page_hdr;
+
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_TYPE, type);
+	/* Start and free offsets both point just past the page header */
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+			TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE,
+			TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE);
+
+	fil_page_set_type(undo_page, FIL_PAGE_UNDO_LOG);
+	/* On redo, trx_undo_parse_page_init() reruns this whole function */
+	trx_undo_page_init_log(undo_page, type, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/***************************************************************//**
+Creates a new undo log segment in file. On any error *undo_page is NULL.
+@return DB_SUCCESS if the segment was created; possible error codes are
+DB_TOO_MANY_CONCURRENT_TRXS and DB_OUT_OF_FILE_SPACE */
+static
+ulint
+trx_undo_seg_create(
+/*================*/
+	trx_rseg_t*	rseg __attribute__((unused)),/*!< in: rollback segment */
+	trx_rsegf_t*	rseg_hdr,/*!< in: rollback segment header, page
+				x-latched */
+	ulint		type,	/*!< in: type of the segment: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	ulint*		id,	/*!< out: slot index within rseg header */
+	page_t**	undo_page,
+				/*!< out: segment header page x-latched, NULL
+				if there was an error */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	ulint		slot_no;
+	ulint		space;
+	buf_block_t*	block;
+	trx_upagef_t*	page_hdr;
+	trx_usegf_t*	seg_hdr;
+	ulint		n_reserved;
+	ibool		success;
+
+	ut_ad(mtr && id && rseg_hdr);
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	/* The out parameter is documented to be NULL on error: clear it
+	first so that every early return below satisfies that */
+	*undo_page = NULL;
+
+	slot_no = trx_rsegf_undo_find_free(rseg_hdr, mtr);
+
+	if (slot_no == ULINT_UNDEFINED) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			"  InnoDB: Warning: cannot find a free slot for"
+			" an undo log. Do you have too\n"
+			"InnoDB: many active transactions"
+			" running concurrently?\n");
+
+		return(DB_TOO_MANY_CONCURRENT_TRXS);
+	}
+
+	space = page_get_space_id(page_align(rseg_hdr));
+
+	success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO,
+					   mtr);
+	if (!success) {
+
+		return(DB_OUT_OF_FILE_SPACE);
+	}
+
+	/* Allocate a new file segment for the undo log */
+	block = fseg_create_general(space, 0,
+				    TRX_UNDO_SEG_HDR
+				    + TRX_UNDO_FSEG_HEADER, TRUE, mtr);
+
+	fil_space_release_free_extents(space, n_reserved);
+
+	if (block == NULL) {
+		/* No space left */
+
+		return(DB_OUT_OF_FILE_SPACE);
+	}
+
+	buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE);
+
+	*undo_page = buf_block_get_frame(block);
+
+	page_hdr = *undo_page + TRX_UNDO_PAGE_HDR;
+	seg_hdr = *undo_page + TRX_UNDO_SEG_HDR;
+
+	trx_undo_page_init(*undo_page, type, mtr);
+
+	mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE,
+			 TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE,
+			 MLOG_2BYTES, mtr);
+
+	mlog_write_ulint(seg_hdr + TRX_UNDO_LAST_LOG, 0, MLOG_2BYTES, mtr);
+
+	flst_init(seg_hdr + TRX_UNDO_PAGE_LIST, mtr);
+
+	flst_add_last(seg_hdr + TRX_UNDO_PAGE_LIST,
+		      page_hdr + TRX_UNDO_PAGE_NODE, mtr);
+
+	trx_rsegf_set_nth_undo(rseg_hdr, slot_no,
+			       page_get_page_no(*undo_page), mtr);
+	*id = slot_no;
+
+	return(DB_SUCCESS);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header initialization. */
+UNIV_INLINE
+void
+trx_undo_header_create_log(
+/*=======================*/
+	const page_t*	undo_page,	/*!< in: undo log header page */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_CREATE, mtr);
+	/* The record body is the transaction id in compressed form */
+	mlog_catenate_dulint_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_header_create_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***************************************************************//**
+Creates a new undo log header in file. NOTE that this function has its own
+log record type MLOG_UNDO_HDR_CREATE. You must NOT change the operation of
+this function!
+@return	header byte offset on page */
+static
+ulint
+trx_undo_header_create(
+/*===================*/
+	page_t*		undo_page,	/*!< in/out: undo log segment
+					header page, x-latched; it is
+					assumed that there is
+					TRX_UNDO_LOG_XA_HDR_SIZE bytes
+					free space on it */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	trx_upagef_t*	page_hdr;
+	trx_usegf_t*	seg_hdr;
+	trx_ulogf_t*	log_hdr;
+	trx_ulogf_t*	prev_log_hdr;
+	ulint		prev_log;
+	ulint		free;
+	ulint		new_free;
+
+	ut_ad(mtr && undo_page);
+
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+
+	free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+	log_hdr = undo_page + free;
+	/* Only the old-style header size is consumed at this point */
+	new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+	ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+	mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+	prev_log = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+	/* Link the previous log header on this page, if any, to the new one */
+	if (prev_log != 0) {
+		prev_log_hdr = undo_page + prev_log;
+
+		mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, free);
+	}
+
+	mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, free);
+
+	log_hdr = undo_page + free;
+
+	mach_write_to_2(log_hdr + TRX_UNDO_DEL_MARKS, TRUE);
+
+	mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+	mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+	/* No XID and no dictionary operation flag to begin with */
+	mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+	mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+	mach_write_to_2(log_hdr + TRX_UNDO_NEXT_LOG, 0);
+	mach_write_to_2(log_hdr + TRX_UNDO_PREV_LOG, prev_log);
+
+	/* Write the log record about the header creation */
+	trx_undo_header_create_log(undo_page, trx_id, mtr);
+
+	return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Write X/Open XA Transaction Identification (XID) to undo log header */
+static
+void
+trx_undo_write_xid(
+/*===============*/
+	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
+	const XID*	xid,	/*!< in: X/Open XA Transaction Identification */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	mlog_write_ulint(log_hdr + TRX_UNDO_XA_FORMAT,
+			 (ulint)xid->formatID, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(log_hdr + TRX_UNDO_XA_TRID_LEN,
+			 (ulint)xid->gtrid_length, MLOG_4BYTES, mtr);
+
+	mlog_write_ulint(log_hdr + TRX_UNDO_XA_BQUAL_LEN,
+			 (ulint)xid->bqual_length, MLOG_4BYTES, mtr);
+	/* All XIDDATASIZE bytes of the data buffer are written */
+	mlog_write_string(log_hdr + TRX_UNDO_XA_XID, (const byte*) xid->data,
+			  XIDDATASIZE, mtr);
+}
+
+/********************************************************************//**
+Fills in an XID from the XA fields stored in an undo log header */
+static
+void
+trx_undo_read_xid(
+/*==============*/
+	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
+	XID*		xid)	/*!< out: X/Open XA Transaction Identification */
+{
+	ulint	format_id = mach_read_from_4(log_hdr + TRX_UNDO_XA_FORMAT);
+	ulint	gtrid_len = mach_read_from_4(log_hdr + TRX_UNDO_XA_TRID_LEN);
+	ulint	bqual_len = mach_read_from_4(log_hdr + TRX_UNDO_XA_BQUAL_LEN);
+
+	xid->formatID = (long) format_id;
+	xid->gtrid_length = (long) gtrid_len;
+	xid->bqual_length = (long) bqual_len;
+	memcpy(xid->data, log_hdr + TRX_UNDO_XA_XID, XIDDATASIZE);
+}
+
+/***************************************************************//**
+Adds space for the XA XID after an undo log old-style header. */
+static
+void
+trx_undo_header_add_space_for_xid(
+/*==============================*/
+	page_t*		undo_page,/*!< in: undo log segment header page */
+	trx_ulogf_t*	log_hdr,/*!< in: undo log header */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_upagef_t*	page_hdr;
+	ulint		free;
+	ulint		new_free;
+
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+	free = mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE);
+
+	/* free is now the end offset of the old style undo log header;
+	the assertion below fails if anything was written after it */
+	ut_a(free == (ulint)(log_hdr - undo_page) + TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+	new_free = free + (TRX_UNDO_LOG_XA_HDR_SIZE
+			   - TRX_UNDO_LOG_OLD_HDR_SIZE);
+
+	/* Add space for a XID after the header, update the free offset
+	fields on the undo log page and in the undo log header */
+
+	mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_START, new_free,
+			 MLOG_2BYTES, mtr);
+
+	mlog_write_ulint(page_hdr + TRX_UNDO_PAGE_FREE, new_free,
+			 MLOG_2BYTES, mtr);
+
+	mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, new_free,
+			 MLOG_2BYTES, mtr);
+}
+
+/**********************************************************************//**
+Writes the mtr log entry of an undo log header reuse. */
+UNIV_INLINE
+void
+trx_undo_insert_header_reuse_log(
+/*=============================*/
+	const page_t*	undo_page,	/*!< in: undo log header page */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_REUSE, mtr);
+	/* The record body is the transaction id in compressed form */
+	mlog_catenate_dulint_compressed(mtr, trx_id);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_insert_header_reuse_log(undo_page,trx_id,mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header create or reuse.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_page_header(
+/*=======================*/
+	ulint	type,	/*!< in: MLOG_UNDO_HDR_CREATE or MLOG_UNDO_HDR_REUSE */
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr,/*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+{
+	trx_id_t	trx_id;
+
+	ptr = mach_dulint_parse_compressed(ptr, end_ptr, &trx_id);
+
+	if (ptr == NULL || page == NULL) {
+		/* Either the parse call returned NULL or there is no
+		page to apply the record to; ptr is returned as is */
+
+		return(ptr);
+	}
+
+	if (type != MLOG_UNDO_HDR_CREATE) {
+		ut_ad(type == MLOG_UNDO_HDR_REUSE);
+		trx_undo_insert_header_reuse(page, trx_id, mtr);
+	} else {
+		trx_undo_header_create(page, trx_id, mtr);
+	}
+
+	return(ptr);
+}
+
+/***************************************************************//**
+Initializes a cached insert undo log header page for new use. NOTE that this
+function has its own log record type MLOG_UNDO_HDR_REUSE. You must NOT change
+the operation of this function!
+@return	undo log header byte offset on page */
+static
+ulint
+trx_undo_insert_header_reuse(
+/*=========================*/
+	page_t*		undo_page,	/*!< in/out: insert undo log segment
+					header page, x-latched */
+	trx_id_t	trx_id,		/*!< in: transaction id */
+	mtr_t*		mtr)		/*!< in: mtr */
+{
+	trx_upagef_t*	page_hdr;
+	trx_usegf_t*	seg_hdr;
+	trx_ulogf_t*	log_hdr;
+	ulint		free;
+	ulint		new_free;
+
+	ut_ad(mtr && undo_page);
+
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	/* The reused header is placed right after the segment header */
+	free = TRX_UNDO_SEG_HDR + TRX_UNDO_SEG_HDR_SIZE;
+
+	ut_a(free + TRX_UNDO_LOG_XA_HDR_SIZE < UNIV_PAGE_SIZE - 100);
+
+	log_hdr = undo_page + free;
+
+	new_free = free + TRX_UNDO_LOG_OLD_HDR_SIZE;
+
+	/* Insert undo data is not needed after commit: we may free all
+	the space on the page */
+
+	ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+			      + TRX_UNDO_PAGE_TYPE)
+	     == TRX_UNDO_INSERT);
+
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START, new_free);
+
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, new_free);
+
+	mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_ACTIVE);
+
+	log_hdr = undo_page + free;
+
+	mach_write_to_8(log_hdr + TRX_UNDO_TRX_ID, trx_id);
+	mach_write_to_2(log_hdr + TRX_UNDO_LOG_START, new_free);
+	/* No XID and no dictionary operation flag to begin with */
+	mach_write_to_1(log_hdr + TRX_UNDO_XID_EXISTS, FALSE);
+	mach_write_to_1(log_hdr + TRX_UNDO_DICT_TRANS, FALSE);
+
+	/* Write the log record MLOG_UNDO_HDR_REUSE */
+	trx_undo_insert_header_reuse_log(undo_page, trx_id, mtr);
+
+	return(free);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Writes the MLOG_UNDO_HDR_DISCARD redo log record, which has no body. */
+UNIV_INLINE
+void
+trx_undo_discard_latest_log(
+/*========================*/
+	page_t* undo_page,	/*!< in: undo log header page */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	mlog_write_initial_log_record(undo_page, MLOG_UNDO_HDR_DISCARD, mtr);
+}
+#else /* !UNIV_HOTBACKUP */
+# define trx_undo_discard_latest_log(undo_page, mtr) ((void) 0)
+#endif /* !UNIV_HOTBACKUP */
+
+/***********************************************************//**
+Parses the redo log entry of an undo log page header discard.
+@return	end of log record or NULL */
+UNIV_INTERN
+byte*
+trx_undo_parse_discard_latest(
+/*==========================*/
+	byte*	ptr,	/*!< in: buffer */
+	byte*	end_ptr __attribute__((unused)), /*!< in: buffer end */
+	page_t*	page,	/*!< in: page or NULL */
+	mtr_t*	mtr)	/*!< in: mtr or NULL */
+{
+	ut_ad(end_ptr);
+
+	if (page) {
+		trx_undo_discard_latest_update_undo(page, mtr);
+	}
+	/* Nothing is read from the buffer, so ptr is returned unchanged */
+	return(ptr);
+}
+
+/**********************************************************************//**
+If an update undo log can be discarded immediately, this function frees the
+space, resetting the page to the proper state for caching. */
+static
+void
+trx_undo_discard_latest_update_undo(
+/*================================*/
+	page_t*	undo_page,	/*!< in: header page of an undo log of size 1 */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	trx_usegf_t*	seg_hdr;
+	trx_upagef_t*	page_hdr;
+	trx_ulogf_t*	log_hdr;
+	trx_ulogf_t*	prev_log_hdr;
+	ulint		free;
+	ulint		prev_hdr_offset;
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+	/* The latest log header on the page is the one being discarded */
+	free = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+	log_hdr = undo_page + free;
+
+	prev_hdr_offset = mach_read_from_2(log_hdr + TRX_UNDO_PREV_LOG);
+	/* If there is an earlier log on the page, it becomes the last one */
+	if (prev_hdr_offset != 0) {
+		prev_log_hdr = undo_page + prev_hdr_offset;
+
+		mach_write_to_2(page_hdr + TRX_UNDO_PAGE_START,
+				mach_read_from_2(prev_log_hdr
+						 + TRX_UNDO_LOG_START));
+		mach_write_to_2(prev_log_hdr + TRX_UNDO_NEXT_LOG, 0);
+	}
+	/* Everything from the discarded header onwards is free again */
+	mach_write_to_2(page_hdr + TRX_UNDO_PAGE_FREE, free);
+
+	mach_write_to_2(seg_hdr + TRX_UNDO_STATE, TRX_UNDO_CACHED);
+	mach_write_to_2(seg_hdr + TRX_UNDO_LAST_LOG, prev_hdr_offset);
+
+	trx_undo_discard_latest_log(undo_page, mtr);
+}
+
+#ifndef UNIV_HOTBACKUP
+/********************************************************************//**
+Tries to add a page to the undo log segment where the undo log is placed.
+@return	page number if success, else FIL_NULL */
+UNIV_INTERN
+ulint
+trx_undo_add_page(
+/*==============*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log memory object */
+	mtr_t*		mtr)	/*!< in: mtr which does not have a latch to any
+				undo log page; the caller must have reserved
+				the rollback segment mutex */
+{
+	trx_rseg_t*	rseg;
+	page_t*		hdr_page;
+	page_t*		added_page;
+	ulint		added_page_no;
+	ulint		n_reserved;
+
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(!mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+	rseg = trx->rseg;
+
+	if (rseg->curr_size == rseg->max_size) {
+		/* The rollback segment has reached max_size */
+
+		return(FIL_NULL);
+	}
+
+	hdr_page = trx_undo_page_get(undo->space, undo->zip_size,
+				     undo->hdr_page_no, mtr);
+
+	if (!fsp_reserve_free_extents(&n_reserved, undo->space, 1,
+				      FSP_UNDO, mtr)) {
+
+		return(FIL_NULL);
+	}
+
+	/* Hint: the page after the current top page, direction FSP_UP */
+	added_page_no = fseg_alloc_free_page_general(
+		hdr_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+		undo->top_page_no + 1, FSP_UP, TRUE, mtr);
+
+	fil_space_release_free_extents(undo->space, n_reserved);
+
+	if (added_page_no == FIL_NULL) {
+		/* No space left */
+
+		return(FIL_NULL);
+	}
+
+	undo->last_page_no = added_page_no;
+
+	added_page = trx_undo_page_get(undo->space, undo->zip_size,
+				       added_page_no, mtr);
+
+	trx_undo_page_init(added_page, undo->type, mtr);
+
+	/* Append the new page to the page list kept in the segment header */
+	flst_add_last(hdr_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+		      added_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE,
+		      mtr);
+	undo->size++;
+	rseg->curr_size++;
+
+	return(added_page_no);
+}
+
+/********************************************************************//**
+Frees an undo log page that is not the header page.
+@return	last page number in remaining log */
+static
+ulint
+trx_undo_free_page(
+/*===============*/
+	trx_rseg_t* rseg,	/*!< in: rollback segment */
+	ibool	in_history,	/*!< in: TRUE if the undo log is in the history
+				list */
+	ulint	space,		/*!< in: space */
+	ulint	hdr_page_no,	/*!< in: header page number */
+	ulint	page_no,	/*!< in: page number to free: must not be the
+				header page */
+	mtr_t*	mtr)		/*!< in: mtr which does not have a latch to any
+				undo log page; the caller must have reserved
+				the rollback segment mutex */
+{
+	page_t*		header_page;
+	page_t*		undo_page;
+	fil_addr_t	last_addr;
+	trx_rsegf_t*	rseg_header;
+	ulint		hist_size;
+	ulint		zip_size;
+
+	ut_a(hdr_page_no != page_no);
+	ut_ad(!mutex_own(&kernel_mutex));
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	zip_size = rseg->zip_size;
+
+	undo_page = trx_undo_page_get(space, zip_size, page_no, mtr);
+
+	header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+	/* Unlink the page from the segment's page list, then free it */
+	flst_remove(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_PAGE_LIST,
+		    undo_page + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_NODE, mtr);
+
+	fseg_free_page(header_page + TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER,
+		       space, page_no, mtr);
+
+	last_addr = flst_get_last(header_page + TRX_UNDO_SEG_HDR
+				  + TRX_UNDO_PAGE_LIST, mtr);
+	rseg->curr_size--;
+	/* For a log in the history list, the stored history size drops by 1 */
+	if (in_history) {
+		rseg_header = trx_rsegf_get(space, zip_size,
+					    rseg->page_no, mtr);
+
+		hist_size = mtr_read_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+					   MLOG_4BYTES, mtr);
+		ut_ad(hist_size > 0);
+		mlog_write_ulint(rseg_header + TRX_RSEG_HISTORY_SIZE,
+				 hist_size - 1, MLOG_4BYTES, mtr);
+	}
+
+	return(last_addr.page);
+}
+
+/********************************************************************//**
+Frees an undo log page and updates the in-memory undo log object to match:
+its last page number and its size. */
+static
+void
+trx_undo_free_page_in_rollback(
+/*===========================*/
+	trx_t*		trx __attribute__((unused)), /*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in: undo log memory copy */
+	ulint		page_no,/*!< in: page number to free: must not be the
+				header page */
+	mtr_t*		mtr)	/*!< in: mtr which does not have a latch to any
+				undo log page; the caller must have reserved
+				the rollback segment mutex */
+{
+	ut_ad(undo->hdr_page_no != page_no);
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+
+	/* The page is freed with in_history == FALSE; the memory object
+	then records the new last page of the remaining log */
+	undo->last_page_no = trx_undo_free_page(undo->rseg, FALSE,
+						undo->space,
+						undo->hdr_page_no,
+						page_no, mtr);
+	undo->size--;
+}
+
+/********************************************************************//**
+Empties an undo log header page of undo records for that undo log. Other
+undo logs may still have records on that page, if it is an update undo log. */
+static
+void
+trx_undo_empty_header_page(
+/*=======================*/
+	ulint	space,		/*!< in: space id */
+	ulint	zip_size,	/*!< in: compressed page size in bytes
+				or 0 for uncompressed pages */
+	ulint	hdr_page_no,	/*!< in: header page number */
+	ulint	hdr_offset,	/*!< in: byte offset of the log header on the page */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	page_t*		header_page;
+	trx_ulogf_t*	log_hdr;
+	ulint		end;
+
+	header_page = trx_undo_page_get(space, zip_size, hdr_page_no, mtr);
+
+	log_hdr = header_page + hdr_offset;
+
+	end = trx_undo_page_get_end(header_page, hdr_page_no, hdr_offset);
+	/* Set the log start offset equal to the end offset just computed. */
+	mlog_write_ulint(log_hdr + TRX_UNDO_LOG_START, end, MLOG_2BYTES, mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the end. This function is used during a rollback
+to free space from an undo log. */
+UNIV_INTERN
+void
+trx_undo_truncate_end(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction whose undo log it is */
+	trx_undo_t*	undo,	/*!< in/out: undo log; emptied pages are freed */
+	undo_no_t	limit)	/*!< in: all undo records with undo number
+				>= this value should be truncated */
+{
+	page_t*		undo_page;
+	ulint		last_page_no;
+	trx_undo_rec_t* rec;
+	trx_undo_rec_t* trunc_here;
+	trx_rseg_t*	rseg;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+	ut_ad(mutex_own(&(trx->rseg->mutex)));
+
+	rseg = trx->rseg;	/* NOTE(review): rseg is not read again below */
+
+	for (;;) {
+		mtr_start(&mtr);	/* one mtr per page visited */
+
+		trunc_here = NULL;
+
+		last_page_no = undo->last_page_no;
+
+		undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+					      last_page_no, &mtr);
+
+		rec = trx_undo_page_get_last_rec(undo_page, undo->hdr_page_no,
+						 undo->hdr_offset);
+		for (;;) {
+			if (rec == NULL) {
+				if (last_page_no == undo->hdr_page_no) {
+					/* The header page is not freed. */
+					goto function_exit;
+				}
+
+				trx_undo_free_page_in_rollback(
+					trx, undo, last_page_no, &mtr);
+				break;	/* go on with the new last page */
+			}
+
+			if (ut_dulint_cmp(trx_undo_rec_get_undo_no(rec), limit)
+			    >= 0) {
+				/* Truncate at least this record off, maybe
+				more */
+				trunc_here = rec;
+			} else {
+				goto function_exit; /* rec is below limit */
+			}
+
+			rec = trx_undo_page_get_prev_rec(rec,
+							 undo->hdr_page_no,
+							 undo->hdr_offset);
+		}
+
+		mtr_commit(&mtr);
+	}
+	/* The mtr started in the last loop iteration is still open here. */
+function_exit:
+	if (trunc_here) {
+		mlog_write_ulint(undo_page + TRX_UNDO_PAGE_HDR
+				 + TRX_UNDO_PAGE_FREE,
+				 trunc_here - undo_page, MLOG_2BYTES, &mtr);
+	}
+
+	mtr_commit(&mtr);
+}
+
+/***********************************************************************//**
+Truncates an undo log from the start. This function is used during a purge
+operation. */
+UNIV_INTERN
+void
+trx_undo_truncate_start(
+/*====================*/
+	trx_rseg_t*	rseg,		/*!< in: rollback segment */
+	ulint		space,		/*!< in: space id of the log */
+	ulint		hdr_page_no,	/*!< in: header page number */
+	ulint		hdr_offset,	/*!< in: header offset on the page */
+	undo_no_t	limit)		/*!< in: all undo pages with
+					undo numbers < this value
+					should be truncated; NOTE that
+					the function only frees whole
+					pages; the header page is not
+					freed, but emptied, if all the
+					records there are < limit */
+{
+	page_t*		undo_page;
+	trx_undo_rec_t* rec;
+	trx_undo_rec_t* last_rec;
+	ulint		page_no;
+	mtr_t		mtr;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	if (ut_dulint_is_zero(limit)) {
+		/* No undo number is below zero: nothing to truncate. */
+		return;
+	}
+loop:
+	mtr_start(&mtr);
+
+	rec = trx_undo_get_first_rec(space, rseg->zip_size,
+				     hdr_page_no, hdr_offset,
+				     RW_X_LATCH, &mtr);
+	if (rec == NULL) {
+		/* Already empty */
+
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	undo_page = page_align(rec);
+
+	last_rec = trx_undo_page_get_last_rec(undo_page, hdr_page_no,
+					      hdr_offset);
+	if (ut_dulint_cmp(trx_undo_rec_get_undo_no(last_rec), limit) >= 0) {
+		/* This page still holds a record >= limit: keep it. */
+		mtr_commit(&mtr);
+
+		return;
+	}
+
+	page_no = page_get_page_no(undo_page);
+
+	if (page_no == hdr_page_no) {
+		trx_undo_empty_header_page(space, rseg->zip_size,
+					   hdr_page_no, hdr_offset,
+					   &mtr);
+	} else {
+		trx_undo_free_page(rseg, TRUE, space, hdr_page_no,
+				   page_no, &mtr);
+	}
+
+	mtr_commit(&mtr);
+
+	goto loop;	/* fetch the first record again in a new mtr */
+}
+
+/**********************************************************************//**
+Frees an undo log segment which is not in the history list, one step per mtr. */
+static
+void
+trx_undo_seg_free(
+/*==============*/
+	trx_undo_t*	undo)	/*!< in: undo log */
+{
+	trx_rseg_t*	rseg;
+	fseg_header_t*	file_seg;
+	trx_rsegf_t*	rseg_header;
+	trx_usegf_t*	seg_header;
+	ibool		finished;
+	mtr_t		mtr;
+
+	rseg = undo->rseg;
+
+	do {
+		/* Each step enters and exits rseg->mutex by itself. */
+		mtr_start(&mtr);
+
+		ut_ad(!mutex_own(&kernel_mutex));
+
+		mutex_enter(&(rseg->mutex));
+
+		seg_header = trx_undo_page_get(undo->space, undo->zip_size,
+					       undo->hdr_page_no,
+					       &mtr) + TRX_UNDO_SEG_HDR;
+
+		file_seg = seg_header + TRX_UNDO_FSEG_HEADER;
+
+		finished = fseg_free_step(file_seg, &mtr);
+
+		if (finished) {
+			/* Update the rseg header: reset the slot of this log */
+			rseg_header = trx_rsegf_get(
+				rseg->space, rseg->zip_size, rseg->page_no,
+				&mtr);
+			trx_rsegf_set_nth_undo(rseg_header, undo->id, FIL_NULL,
+					       &mtr);
+		}
+
+		mutex_exit(&(rseg->mutex));
+		mtr_commit(&mtr);
+	} while (!finished);
+}
+
+/*========== UNDO LOG MEMORY COPY INITIALIZATION =====================*/
+
+/********************************************************************//**
+Creates and initializes an undo log memory object according to the values
+in the header in file, when the database is started. The memory object is
+inserted in the appropriate list of rseg.
+@return	own: the undo log memory object */
+static
+trx_undo_t*
+trx_undo_mem_create_at_db_start(
+/*============================*/
+	trx_rseg_t*	rseg,	/*!< in/out: rollback segment memory object */
+	ulint		id,	/*!< in: slot index within rseg */
+	ulint		page_no,/*!< in: undo log segment page number */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*		undo_page;
+	trx_upagef_t*	page_header;
+	trx_usegf_t*	seg_header;
+	trx_ulogf_t*	undo_header;
+	trx_undo_t*	undo;
+	ulint		type;
+	ulint		state;
+	trx_id_t	trx_id;
+	ulint		offset;
+	fil_addr_t	last_addr;
+	page_t*		last_page;
+	trx_undo_rec_t*	rec;
+	XID		xid;
+	ibool		xid_exists = FALSE;
+
+	if (id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr,
+			"InnoDB: Error: undo->id is %lu\n", (ulong) id);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+				      page_no, mtr);
+
+	page_header = undo_page + TRX_UNDO_PAGE_HDR;
+
+	type = mtr_read_ulint(page_header + TRX_UNDO_PAGE_TYPE, MLOG_2BYTES,
+			      mtr);
+	seg_header = undo_page + TRX_UNDO_SEG_HDR;
+
+	state = mach_read_from_2(seg_header + TRX_UNDO_STATE);
+
+	offset = mach_read_from_2(seg_header + TRX_UNDO_LAST_LOG);
+
+	undo_header = undo_page + offset;
+
+	trx_id = mtr_read_dulint(undo_header + TRX_UNDO_TRX_ID, mtr);
+
+	xid_exists = mtr_read_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+				    MLOG_1BYTE, mtr);
+
+	/* Read the X/Open XA transaction identification if it exists;
+	otherwise xid stays zero-filled with formatID = -1. */
+
+	memset(&xid, 0, sizeof(xid));
+	xid.formatID = -1;
+
+	if (xid_exists == TRUE) {
+		trx_undo_read_xid(undo_header, &xid);
+	}
+
+	mutex_enter(&(rseg->mutex)); /* trx_undo_mem_create() asserts it */
+
+	undo = trx_undo_mem_create(rseg, id, type, trx_id, &xid,
+				   page_no, offset);
+	mutex_exit(&(rseg->mutex));
+
+	undo->dict_operation =	mtr_read_ulint(
+		undo_header + TRX_UNDO_DICT_TRANS, MLOG_1BYTE, mtr);
+
+	undo->table_id = mtr_read_dulint(undo_header + TRX_UNDO_TABLE_ID, mtr);
+	undo->state = state;
+	undo->size = flst_get_len(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+	/* If the log segment is being freed, the page list is inconsistent! */
+	if (state == TRX_UNDO_TO_FREE) {
+		/* Skip reading the page list and the last record. */
+		goto add_to_list;
+	}
+
+	last_addr = flst_get_last(seg_header + TRX_UNDO_PAGE_LIST, mtr);
+
+	undo->last_page_no = last_addr.page;
+	undo->top_page_no = last_addr.page;
+
+	last_page = trx_undo_page_get(rseg->space, rseg->zip_size,
+				      undo->last_page_no, mtr);
+
+	rec = trx_undo_page_get_last_rec(last_page, page_no, offset);
+
+	if (rec == NULL) {
+		undo->empty = TRUE;
+	} else {
+		undo->empty = FALSE;
+		undo->top_offset = rec - last_page; /* byte offset on the page */
+		undo->top_undo_no = trx_undo_rec_get_undo_no(rec);
+	}
+add_to_list:
+	if (type == TRX_UNDO_INSERT) {
+		if (state != TRX_UNDO_CACHED) {
+			UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_list,
+					 undo);
+		} else {
+			UT_LIST_ADD_LAST(undo_list, rseg->insert_undo_cached,
+					 undo);
+		}
+	} else {
+		ut_ad(type == TRX_UNDO_UPDATE);
+		if (state != TRX_UNDO_CACHED) {
+			UT_LIST_ADD_LAST(undo_list, rseg->update_undo_list,
+					 undo);
+		} else {
+			UT_LIST_ADD_LAST(undo_list, rseg->update_undo_cached,
+					 undo);
+		}
+	}
+
+	return(undo);
+}
+
+/********************************************************************//**
+Initializes the undo log lists for a rollback segment memory copy. This
+function is only called when the database is started or a new rollback
+segment is created.
+@return	the combined size of undo log segments in pages */
+UNIV_INTERN
+ulint
+trx_undo_lists_init(
+/*================*/
+	trx_rseg_t*	rseg)	/*!< in/out: rollback segment memory object */
+{
+	ulint		page_no;
+	trx_undo_t*	undo;
+	ulint		size	= 0;
+	trx_rsegf_t*	rseg_header;
+	ulint		i;
+	mtr_t		mtr;
+
+	UT_LIST_INIT(rseg->update_undo_list);
+	UT_LIST_INIT(rseg->update_undo_cached);
+	UT_LIST_INIT(rseg->insert_undo_list);
+	UT_LIST_INIT(rseg->insert_undo_cached);
+
+	mtr_start(&mtr);
+
+	rseg_header = trx_rsegf_get_new(rseg->space, rseg->zip_size,
+					rseg->page_no, &mtr);
+
+	if (!srv_extra_undoslots) {
+		/* read the slot directly to avoid an "Assertion failure" */
+		//page_no = trx_rsegf_get_nth_undo(rseg_header, TRX_RSEG_N_EXTRA_SLOTS - 1, &mtr);
+		page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS
+					 + (TRX_RSEG_N_EXTRA_SLOTS - 1) * TRX_RSEG_SLOT_SIZE,
+					 MLOG_4BYTES, &mtr);
+		if (page_no != 0) {
+			/* check that the extended slots are not in use */
+			for (i = TRX_RSEG_N_SLOTS; i < TRX_RSEG_N_EXTRA_SLOTS; i++) {
+				/* read the slot directly to avoid an "Assertion failure" */
+				page_no = mtr_read_ulint(rseg_header + TRX_RSEG_UNDO_SLOTS
+							 + i * TRX_RSEG_SLOT_SIZE,
+							 MLOG_4BYTES, &mtr);
+				if (page_no != FIL_NULL) {
+					srv_extra_undoslots = TRUE;
+					fprintf(stderr,
+"InnoDB: Error: innodb_extra_undoslots option is disabled, but it was enabled before.\n"
+"InnoDB: The datafile is not normal for mysqld and disabled innodb_extra_undoslots.\n"
+"InnoDB: Enable innodb_extra_undoslots if it was enabled before, and\n"
+"InnoDB: ### don't use this datafile with other mysqld or ibbackup! ###\n"
+"InnoDB: Cannot continue operation for the safety. Calling exit(1).\n");
+					exit(1);
+				}
+			}
+			fprintf(stderr,
+"InnoDB: Warning: innodb_extra_undoslots option is disabled, but it was  enabled before.\n"
+"InnoDB: But extended undo slots seem not used, so continue operation.\n");
+		}
+	}
+
+	for (i = 0; i < TRX_RSEG_N_SLOTS; i++) {
+		page_no = trx_rsegf_get_nth_undo(rseg_header, i, &mtr);
+
+		/* If the slot was not initialized when the datafile was
+		created, page_no is 0 for the extended slots: store FIL_NULL */
+
+		if (page_no == 0) {
+			page_no = FIL_NULL;
+			trx_rsegf_set_nth_undo(rseg_header, i, page_no, &mtr);
+		}
+
+		/* In forced recovery: try to avoid operations which look
+		at database pages; undo logs are rapidly changing data, and
+		the probability that they are in an inconsistent state is
+		high */
+
+		if (page_no != FIL_NULL
+		    && srv_force_recovery < SRV_FORCE_NO_UNDO_LOG_SCAN) {
+
+			undo = trx_undo_mem_create_at_db_start(rseg, i,
+							       page_no, &mtr);
+			size += undo->size;
+			/* Restart the mtr and fetch the rseg header again. */
+			mtr_commit(&mtr);
+
+			mtr_start(&mtr);
+
+			rseg_header = trx_rsegf_get(
+				rseg->space, rseg->zip_size, rseg->page_no,
+				&mtr);
+		}
+	}
+
+	mtr_commit(&mtr);
+
+	return(size);
+}
+
+/********************************************************************//**
+Creates and initializes an undo log memory object.
+@return	own: the undo log memory object, NULL if mem_alloc() returned NULL */
+static
+trx_undo_t*
+trx_undo_mem_create(
+/*================*/
+	trx_rseg_t*	rseg,	/*!< in: rollback segment memory object */
+	ulint		id,	/*!< in: slot index within rseg */
+	ulint		type,	/*!< in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
+				is created */
+	const XID*	xid,	/*!< in: X/Open transaction identification */
+	ulint		page_no,/*!< in: undo log header page number */
+	ulint		offset)	/*!< in: undo log header byte offset on page */
+{
+	trx_undo_t*	undo;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	if (id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr,
+			"InnoDB: Error: undo->id is %lu\n", (ulong) id);
+		ut_error;
+	}
+
+	undo = mem_alloc(sizeof(trx_undo_t));
+
+	if (undo == NULL) {
+		/* mem_alloc() returned NULL: nothing to initialize. */
+		return NULL;
+	}
+
+	undo->id = id;
+	undo->type = type;
+	undo->state = TRX_UNDO_ACTIVE;
+	undo->del_marks = FALSE;
+	undo->trx_id = trx_id;
+	undo->xid = *xid;		/* struct copy of the caller's XID */
+
+	undo->dict_operation = FALSE;
+
+	undo->rseg = rseg;
+
+	undo->space = rseg->space;
+	undo->zip_size = rseg->zip_size;
+	undo->hdr_page_no = page_no;
+	undo->hdr_offset = offset;
+	undo->last_page_no = page_no;	/* header page is also the last page */
+	undo->size = 1;
+
+	undo->empty = TRUE;
+	undo->top_page_no = page_no;
+	undo->guess_block = NULL;
+
+	return(undo);
+}
+
+/********************************************************************//**
+Initializes a cached undo log object for new use (page fields are kept). */
+static
+void
+trx_undo_mem_init_for_reuse(
+/*========================*/
+	trx_undo_t*	undo,	/*!< in/out: undo log to init */
+	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
+				is created */
+	const XID*	xid,	/*!< in: X/Open XA transaction identification*/
+	ulint		offset)	/*!< in: undo log header byte offset on page */
+{
+	ut_ad(mutex_own(&((undo->rseg)->mutex)));
+
+	if (UNIV_UNLIKELY(undo->id >= TRX_RSEG_N_SLOTS)) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+
+		mem_analyze_corruption(undo);
+		ut_error;
+	}
+
+	undo->state = TRX_UNDO_ACTIVE;
+	undo->del_marks = FALSE;
+	undo->trx_id = trx_id;
+	undo->xid = *xid;		/* struct copy of the caller's XID */
+
+	undo->dict_operation = FALSE;
+
+	undo->hdr_offset = offset;
+	undo->empty = TRUE;
+}
+
+/********************************************************************//**
+Frees an undo log memory copy after a sanity check on undo->id. */
+UNIV_INTERN
+void
+trx_undo_mem_free(
+/*==============*/
+	trx_undo_t*	undo)	/*!< in, own: the undo object to be freed */
+{
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr,
+			"InnoDB: Error: undo->id is %lu\n", (ulong) undo->id);
+		ut_error;
+	}
+
+	mem_free(undo);
+}
+
+/**********************************************************************//**
+Creates a new undo log.
+@return DB_SUCCESS if successful in creating the new undo log object,
+possible error codes are: DB_TOO_MANY_CONCURRENT_TRXS
+DB_OUT_OF_FILE_SPACE DB_OUT_OF_MEMORY */
+static
+ulint
+trx_undo_create(
+/*============*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_rseg_t*	rseg,	/*!< in/out: rollback segment memory copy */
+	ulint		type,	/*!< in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
+				is created */
+	const XID*	xid,	/*!< in: X/Open transaction identification*/
+	trx_undo_t**	undo,	/*!< out: the new undo log object, undefined
+				 * if did not succeed */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_rsegf_t*	rseg_header;
+	ulint		page_no;
+	ulint		offset;
+	ulint		id;
+	page_t*		undo_page;
+	ulint		err;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	if (rseg->curr_size == rseg->max_size) {
+
+		return(DB_OUT_OF_FILE_SPACE);
+	}
+
+	rseg->curr_size++;	/* reverted below if segment creation fails */
+
+	rseg_header = trx_rsegf_get(rseg->space, rseg->zip_size, rseg->page_no,
+				    mtr);
+
+	err = trx_undo_seg_create(rseg, rseg_header, type, &id,
+				  &undo_page, mtr);
+
+	if (err != DB_SUCCESS) {
+		/* Did not succeed */
+
+		rseg->curr_size--;
+
+		return(err);
+	}
+
+	page_no = page_get_page_no(undo_page);
+
+	offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+	if (trx->support_xa) {
+		trx_undo_header_add_space_for_xid(undo_page,
+						  undo_page + offset, mtr);
+	}
+
+	*undo = trx_undo_mem_create(rseg, id, type, trx_id, xid,
+				   page_no, offset);
+	if (*undo == NULL) {
+		/* NOTE(review): rseg->curr_size is not reverted on this path */
+		err = DB_OUT_OF_MEMORY;
+	}
+
+	return(err);
+}
+
+/*================ UNDO LOG ASSIGNMENT AND CLEANUP =====================*/
+
+/********************************************************************//**
+Reuses a cached undo log.
+@return	the undo log memory object, NULL if none cached */
+static
+trx_undo_t*
+trx_undo_reuse_cached(
+/*==================*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_rseg_t*	rseg,	/*!< in/out: rollback segment memory object */
+	ulint		type,	/*!< in: type of the log: TRX_UNDO_INSERT or
+				TRX_UNDO_UPDATE */
+	trx_id_t	trx_id,	/*!< in: id of the trx for which the undo log
+				is used */
+	const XID*	xid,	/*!< in: X/Open XA transaction identification */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_undo_t*	undo;
+	page_t*		undo_page;
+	ulint		offset;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	if (type == TRX_UNDO_INSERT) {
+
+		undo = UT_LIST_GET_FIRST(rseg->insert_undo_cached);
+		if (undo == NULL) {
+			/* No cached insert undo log. */
+			return(NULL);
+		}
+
+		UT_LIST_REMOVE(undo_list, rseg->insert_undo_cached, undo);
+	} else {
+		ut_ad(type == TRX_UNDO_UPDATE);
+
+		undo = UT_LIST_GET_FIRST(rseg->update_undo_cached);
+		if (undo == NULL) {
+			/* No cached update undo log. */
+			return(NULL);
+		}
+
+		UT_LIST_REMOVE(undo_list, rseg->update_undo_cached, undo);
+	}
+
+	ut_ad(undo->size == 1);	/* a cached log is expected to be one page */
+
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption(undo);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+				      undo->hdr_page_no, mtr);
+
+	if (type == TRX_UNDO_INSERT) {
+		offset = trx_undo_insert_header_reuse(undo_page, trx_id, mtr);
+
+		if (trx->support_xa) {
+			trx_undo_header_add_space_for_xid(
+				undo_page, undo_page + offset, mtr);
+		}
+	} else {
+		ut_a(mach_read_from_2(undo_page + TRX_UNDO_PAGE_HDR
+				      + TRX_UNDO_PAGE_TYPE)
+		     == TRX_UNDO_UPDATE);
+
+		offset = trx_undo_header_create(undo_page, trx_id, mtr);
+
+		if (trx->support_xa) {
+			trx_undo_header_add_space_for_xid(
+				undo_page, undo_page + offset, mtr);
+		}
+	}
+
+	trx_undo_mem_init_for_reuse(undo, trx_id, xid, offset);
+
+	return(undo);
+}
+
+/**********************************************************************//**
+Marks an undo log header as a header of a data dictionary operation
+transaction. */
+static
+void
+trx_undo_mark_as_dict_operation(
+/*============================*/
+	trx_t*		trx,	/*!< in: dict op transaction */
+	trx_undo_t*	undo,	/*!< in/out: assigned undo log */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	page_t*	hdr_page;
+
+	hdr_page = trx_undo_page_get(undo->space, undo->zip_size,
+				     undo->hdr_page_no, mtr);
+
+	switch (trx_get_dict_operation(trx)) {
+	case TRX_DICT_OP_NONE:
+		ut_error;	/* trx_undo_assign_undo() excludes this value */
+	case TRX_DICT_OP_INDEX:
+		/* Do not discard the table on recovery. */
+		undo->table_id = ut_dulint_zero;
+		break;
+	case TRX_DICT_OP_TABLE:
+		undo->table_id = trx->table_id;
+		break;
+	}
+
+	mlog_write_ulint(hdr_page + undo->hdr_offset
+			 + TRX_UNDO_DICT_TRANS,
+			 TRUE, MLOG_1BYTE, mtr);
+
+	mlog_write_dulint(hdr_page + undo->hdr_offset + TRX_UNDO_TABLE_ID,
+			  undo->table_id, mtr);
+
+	undo->dict_operation = TRUE;	/* memory copy of the flag written above */
+}
+
+/**********************************************************************//**
+Assigns an undo log for a transaction. A new undo log is created or a cached
+undo log reused.
+@return DB_SUCCESS if undo log assign successful, possible error codes
+are: DB_TOO_MANY_CONCURRENT_TRXS DB_OUT_OF_FILE_SPACE
+DB_OUT_OF_MEMORY */
+UNIV_INTERN
+ulint
+trx_undo_assign_undo(
+/*=================*/
+	trx_t*		trx,	/*!< in/out: transaction */
+	ulint		type)	/*!< in: TRX_UNDO_INSERT or TRX_UNDO_UPDATE */
+{
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+	mtr_t		mtr;
+	ulint		err = DB_SUCCESS;
+
+	ut_ad(trx);
+	ut_ad(trx->rseg);
+
+	rseg = trx->rseg;
+
+	ut_ad(mutex_own(&(trx->undo_mutex)));
+
+	mtr_start(&mtr);
+
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	mutex_enter(&(rseg->mutex));
+
+	undo = trx_undo_reuse_cached(trx, rseg, type, trx->id, &trx->xid,
+				     &mtr);
+	if (undo == NULL) {	/* nothing cached: create a new undo log */
+		err = trx_undo_create(trx, rseg, type, trx->id, &trx->xid,
+								&undo, &mtr);
+		if (err != DB_SUCCESS) {
+			/* Neither reuse nor creation gave an undo log. */
+			goto func_exit;
+		}
+	}
+
+	if (type == TRX_UNDO_INSERT) {
+		UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_list, undo);
+		ut_ad(trx->insert_undo == NULL);
+		trx->insert_undo = undo;
+	} else {
+		UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_list, undo);
+		ut_ad(trx->update_undo == NULL);
+		trx->update_undo = undo;
+	}
+
+	if (trx_get_dict_operation(trx) != TRX_DICT_OP_NONE) {
+		trx_undo_mark_as_dict_operation(trx, undo, &mtr);
+	}
+
+func_exit:
+	mutex_exit(&(rseg->mutex));
+	mtr_commit(&mtr);
+
+	return err;
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction finish.
+@return	undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_finish(
+/*=========================*/
+	trx_rseg_t*	rseg,	/*!< in: rollback segment memory object */
+	trx_t*		trx __attribute__((unused)), /*!< in: transaction (only read by ut_ad) */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_usegf_t*	seg_hdr;
+	trx_upagef_t*	page_hdr;
+	page_t*		undo_page;
+	ulint		state;
+
+	ut_ad(trx);
+	ut_ad(undo);
+	ut_ad(mtr);
+	ut_ad(mutex_own(&rseg->mutex));
+
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption(undo);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+				      undo->hdr_page_no, mtr);
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR;
+
+	if (undo->size == 1
+	    && mach_read_from_2(page_hdr + TRX_UNDO_PAGE_FREE)
+	       < TRX_UNDO_PAGE_REUSE_LIMIT) {
+
+		/* This is a heuristic to avoid the problem of all UNDO
+		slots ending up in one of the UNDO lists. Previously if
+		the server crashed with all the slots in one of the lists,
+		transactions that required the slots of a different type
+		would fail for lack of slots. */
+
+		if (UT_LIST_GET_LEN(rseg->update_undo_list) < 500
+		    && UT_LIST_GET_LEN(rseg->insert_undo_list) < 500) {
+
+			state = TRX_UNDO_CACHED;
+		} else {
+			state = TRX_UNDO_TO_FREE;
+		}
+
+	} else if (undo->type == TRX_UNDO_INSERT) {
+		/* Not reusable, insert undo log: free it. */
+		state = TRX_UNDO_TO_FREE;
+	} else {
+		state = TRX_UNDO_TO_PURGE;	/* not reusable, update undo log */
+	}
+
+	undo->state = state;	/* memory copy; the page copy follows */
+
+	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, state, MLOG_2BYTES, mtr);
+
+	return(undo_page);
+}
+
+/******************************************************************//**
+Sets the state of the undo log segment at a transaction prepare.
+@return	undo log segment header page, x-latched */
+UNIV_INTERN
+page_t*
+trx_undo_set_state_at_prepare(
+/*==========================*/
+	trx_t*		trx,	/*!< in: transaction */
+	trx_undo_t*	undo,	/*!< in/out: undo log memory copy */
+	mtr_t*		mtr)	/*!< in: mtr */
+{
+	trx_usegf_t*	seg_hdr;
+	trx_upagef_t*	page_hdr;
+	trx_ulogf_t*	undo_header;
+	page_t*		undo_page;
+	ulint		offset;
+
+	ut_ad(trx && undo && mtr);
+
+	if (undo->id >= TRX_RSEG_N_SLOTS) {
+		fprintf(stderr, "InnoDB: Error: undo->id is %lu\n",
+			(ulong) undo->id);
+		mem_analyze_corruption(undo);
+		ut_error;
+	}
+
+	undo_page = trx_undo_page_get(undo->space, undo->zip_size,
+				      undo->hdr_page_no, mtr);
+
+	seg_hdr = undo_page + TRX_UNDO_SEG_HDR;
+	page_hdr = undo_page + TRX_UNDO_PAGE_HDR; /* NOTE(review): never read below */
+
+	/*------------------------------*/
+	undo->state = TRX_UNDO_PREPARED;
+	undo->xid   = trx->xid;
+	/*------------------------------*/
+
+	mlog_write_ulint(seg_hdr + TRX_UNDO_STATE, undo->state,
+			 MLOG_2BYTES, mtr);
+
+	offset = mach_read_from_2(seg_hdr + TRX_UNDO_LAST_LOG);
+	undo_header = undo_page + offset;
+	/* Flag the XID as present in the log header, then store it. */
+	mlog_write_ulint(undo_header + TRX_UNDO_XID_EXISTS,
+			 TRUE, MLOG_1BYTE, mtr);
+
+	trx_undo_write_xid(undo_header, &undo->xid, mtr);
+
+	return(undo_page);
+}
+
+/**********************************************************************//**
+Adds the update undo log header as the first in the history list, and
+frees the memory object, or puts it to the list of cached update undo log
+segments. */
+UNIV_INTERN
+void
+trx_undo_update_cleanup(
+/*====================*/
+	trx_t*	trx,		/*!< in/out: trx owning the update undo log */
+	page_t*	undo_page,	/*!< in: update undo log header page,
+				x-latched */
+	mtr_t*	mtr)		/*!< in: mtr */
+{
+	trx_rseg_t*	rseg;
+	trx_undo_t*	undo;
+
+	undo = trx->update_undo;
+	rseg = trx->rseg;
+
+	ut_ad(mutex_own(&(rseg->mutex)));
+
+	trx_purge_add_update_undo_to_history(trx, undo_page, mtr);
+
+	UT_LIST_REMOVE(undo_list, rseg->update_undo_list, undo);
+
+	trx->update_undo = NULL;	/* the trx no longer owns the undo log */
+
+	if (undo->state == TRX_UNDO_CACHED) {
+		/* Keep the memory object on the cached list. */
+		UT_LIST_ADD_FIRST(undo_list, rseg->update_undo_cached, undo);
+	} else {
+		ut_ad(undo->state == TRX_UNDO_TO_PURGE
+		      || undo->state == TRX_UNDO_TO_FREE);
+
+		trx_undo_mem_free(undo);
+	}
+}
+
+/******************************************************************//**
+Frees or caches an insert undo log after a transaction commit or rollback.
+Knowledge of inserts is not needed after a commit or rollback, therefore
+the data can be discarded. */
+UNIV_INTERN
+void
+trx_undo_insert_cleanup(
+/*====================*/
+	trx_t*	trx)	/*!< in/out: transaction handle */
+{
+	trx_undo_t*	undo;
+	trx_rseg_t*	rseg;
+
+	undo = trx->insert_undo;
+	ut_ad(undo);
+
+	rseg = trx->rseg;
+
+	mutex_enter(&(rseg->mutex));
+
+	UT_LIST_REMOVE(undo_list, rseg->insert_undo_list, undo);
+	trx->insert_undo = NULL;
+
+	if (undo->state == TRX_UNDO_CACHED) {
+
+		UT_LIST_ADD_FIRST(undo_list, rseg->insert_undo_cached, undo);
+	} else {
+		ut_ad(undo->state == TRX_UNDO_TO_FREE);
+
+		/* Delete first the undo log segment in the file;
+		trx_undo_seg_free() enters rseg->mutex by itself */
+		mutex_exit(&(rseg->mutex));
+
+		trx_undo_seg_free(undo);
+
+		mutex_enter(&(rseg->mutex));
+
+		ut_ad(rseg->curr_size > undo->size);
+
+		rseg->curr_size -= undo->size;	/* pages of the freed segment */
+
+		trx_undo_mem_free(undo);
+	}
+
+	mutex_exit(&(rseg->mutex));
+}
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/usr/usr0sess.c b/storage/xtradb/usr/usr0sess.c
new file mode 100644
index 00000000000..8087dcb4170
--- /dev/null
+++ b/storage/xtradb/usr/usr0sess.c
@@ -0,0 +1,71 @@
+/*****************************************************************************
+
+Copyright (c) 1996, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file usr/usr0sess.c
+Sessions
+
+Created 6/25/1996 Heikki Tuuri
+*******************************************************/
+
+#include "usr0sess.h"
+
+#ifdef UNIV_NONINL
+#include "usr0sess.ic"
+#endif
+
+#include "trx0trx.h"
+
+/*********************************************************************//**
+Opens a session: allocates the object and creates a transaction for it.
+@return	own: session object */
+UNIV_INTERN
+sess_t*
+sess_open(void)
+/*===========*/
+{
+	sess_t*	sess;
+
+	ut_ad(mutex_own(&kernel_mutex));
+
+	sess = mem_alloc(sizeof(sess_t));
+
+	sess->state = SESS_ACTIVE;
+
+	sess->trx = trx_create(sess);	/* the trx is given this session */
+
+	UT_LIST_INIT(sess->graphs);	/* starts with an empty graph list */
+
+	return(sess);
+}
+
+/*********************************************************************//**
+Closes a session, freeing its transaction and the memory occupied by it. */
+UNIV_INTERN
+void
+sess_close(
+/*=======*/
+	sess_t*	sess)	/*!< in, own: session object */
+{
+	ut_ad(!mutex_own(&kernel_mutex));
+
+	ut_a(UT_LIST_GET_LEN(sess->graphs) == 0); /* graph list must be empty */
+
+	trx_free_for_background(sess->trx);
+	mem_free(sess);
+}
diff --git a/storage/xtradb/ut/ut0byte.c b/storage/xtradb/ut/ut0byte.c
new file mode 100644
index 00000000000..4e093f72ce2
--- /dev/null
+++ b/storage/xtradb/ut/ut0byte.c
@@ -0,0 +1,55 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0byte.c
+Byte utilities
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0byte.h"
+
+#ifdef UNIV_NONINL
+#include "ut0byte.ic"
+#endif
+
+/** Zero value for a dulint (both members are 0) */
+UNIV_INTERN const dulint	ut_dulint_zero	= {0, 0};
+
+/** Maximum value for a dulint (both members are 0xFFFFFFFF) */
+UNIV_INTERN const dulint	ut_dulint_max	= {0xFFFFFFFFUL, 0xFFFFFFFFUL};
+
+#ifdef notdefined /* unused code: compiled only if notdefined is defined */
+#include "ut0sort.h"
+
+/************************************************************//**
+Sort function for dulint arrays; elements are compared with ut_dulint_cmp. */
+UNIV_INTERN
+void
+ut_dulint_sort(
+/*===========*/
+	dulint*	arr,	/*!< in/out: array to be sorted */
+	dulint*	aux_arr,/*!< in/out: auxiliary array (same size as arr) */
+	ulint	low,	/*!< in: low bound of sort interval, inclusive */
+	ulint	high)	/*!< in: high bound of sort interval, noninclusive */
+{
+	UT_SORT_FUNCTION_BODY(ut_dulint_sort, arr, aux_arr, low, high,
+			      ut_dulint_cmp);
+}
+#endif /* notdefined */
diff --git a/storage/xtradb/ut/ut0dbg.c b/storage/xtradb/ut/ut0dbg.c
new file mode 100644
index 00000000000..4484e6c36de
--- /dev/null
+++ b/storage/xtradb/ut/ut0dbg.c
@@ -0,0 +1,187 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*****************************************************************//**
+@file ut/ut0dbg.c
+Debug utilities for Innobase.
+
+Created 1/30/1994 Heikki Tuuri
+**********************************************************************/
+
+#include "univ.i"
+#include "ut0dbg.h"
+
+#if defined(__GNUC__) && (__GNUC__ > 2)
+#else /* __GNUC__ > 2 */
+/** This is used to eliminate compiler warnings */
+UNIV_INTERN ulint	ut_dbg_zero	= 0;
+#endif /* __GNUC__ > 2 */
+
+#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/** If this is set to TRUE by ut_dbg_assertion_failed(), all threads
+will stop at the next ut_a() or ut_ad(). */
+UNIV_INTERN ibool	ut_dbg_stop_threads	= FALSE;
+#endif /* UNIV_SYNC_DEBUG || !UT_DBG_USE_ABORT */
+#ifdef __NETWARE__
+/** Flag for ignoring further assertion failures.  Set to TRUE on
+NetWare when an InnoDB assertion failure or other fatal error
+condition requires an immediate shutdown. */
+UNIV_INTERN ibool panic_shutdown = FALSE;
+#elif !defined(UT_DBG_USE_ABORT)
+/** A null pointer that will be dereferenced to trigger a memory trap */
+UNIV_INTERN ulint*	ut_dbg_null_ptr		= NULL;
+#endif /* __NETWARE__ */
+
+/*************************************************************//**
+Report a failed assertion on stderr, with advice on forcing recovery. */
+UNIV_INTERN
+void
+ut_dbg_assertion_failed(
+/*====================*/
+	const char* expr,	/*!< in: the failed assertion (optional) */
+	const char* file,	/*!< in: source file containing the assertion */
+	ulint line)		/*!< in: line number of the assertion */
+{
+	ut_print_timestamp(stderr);
+	/* Cast line to ulong so that it matches the %lu conversion. */
+#ifdef UNIV_HOTBACKUP
+	fprintf(stderr, "  InnoDB: Assertion failure in file %s line %lu\n",
+		file, (ulong) line);
+#else /* UNIV_HOTBACKUP */
+	fprintf(stderr,
+		"  InnoDB: Assertion failure in thread %lu"
+		" in file %s line %lu\n",
+		os_thread_pf(os_thread_get_curr_id()), file, (ulong) line);
+#endif /* UNIV_HOTBACKUP */
+	if (expr) {
+		fprintf(stderr, "InnoDB: Failing assertion: %s\n", expr);
+	}
+
+	fputs("InnoDB: We intentionally generate a memory trap.\n"
+	      "InnoDB: Submit a detailed bug report"
+	      " to http://bugs.mysql.com.\n"
+	      "InnoDB: If you get repeated assertion failures"
+	      " or crashes, even\n"
+	      "InnoDB: immediately after the mysqld startup, there may be\n"
+	      "InnoDB: corruption in the InnoDB tablespace. Please refer to\n"
+	      "InnoDB: " REFMAN "forcing-recovery.html\n"
+	      "InnoDB: about forcing recovery.\n", stderr);
+#if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+	ut_dbg_stop_threads = TRUE;
+#endif
+}
+
+#ifdef __NETWARE__
+/*************************************************************//**
+Shut down MySQL/InnoDB after assertion failure, at most once; exit(1). */
+UNIV_INTERN
+void
+ut_dbg_panic(void)
+/*==============*/
+{
+	if (!panic_shutdown) {
+		panic_shutdown = TRUE;
+		innobase_shutdown_for_mysql();
+	}
+	exit(1);
+}
+#else /* __NETWARE__ */
+# if defined(UNIV_SYNC_DEBUG) || !defined(UT_DBG_USE_ABORT)
+/*************************************************************//**
+Stop a thread after assertion failure: report where, then sleep. */
+UNIV_INTERN
+void
+ut_dbg_stop_thread(
+/*===============*/
+	const char*	file,	/*!< in: source file name to report */
+	ulint		line)	/*!< in: line number to report */
+{
+#ifndef UNIV_HOTBACKUP
+	fprintf(stderr, "InnoDB: Thread %lu stopped in file %s line %lu\n",
+		os_thread_pf(os_thread_get_curr_id()), file, (ulong) line);
+	os_thread_sleep(1000000000);
+#endif /* !UNIV_HOTBACKUP */
+}
+# endif
+#endif /* __NETWARE__ */
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <unistd.h>
+/* Fallback timersub(): r = a - b for struct timeval, borrowing from tv_sec */
+#ifndef timersub
+#define timersub(a, b, r)						\
+	do {								\
+		(r)->tv_sec = (a)->tv_sec - (b)->tv_sec;		\
+		(r)->tv_usec = (a)->tv_usec - (b)->tv_usec;		\
+		if ((r)->tv_usec < 0) {					\
+			(r)->tv_sec--;					\
+			(r)->tv_usec += 1000000;			\
+		}							\
+	} while (0)
+#endif /* timersub */
+
+/*******************************************************************//**
+Resets a speedo (records the current time and resource usage in it). */
+UNIV_INTERN
+void
+speedo_reset(
+/*=========*/
+	speedo_t*	speedo)	/*!< out: speedo */
+{
+	gettimeofday(&speedo->tv, NULL);
+
+	getrusage(RUSAGE_SELF, &speedo->ru);
+}
+
+/*******************************************************************//**
+Prints to stderr the real, user and system time elapsed since the last
+reset of a speedo. */
+UNIV_INTERN
+void
+speedo_show(
+/*========*/
+	const speedo_t*	speedo)	/*!< in: speedo */
+{
+	struct rusage	ru_now;
+	struct timeval	tv_now;
+	struct timeval	tv_diff;
+
+	getrusage(RUSAGE_SELF, &ru_now);
+
+	gettimeofday(&tv_now, NULL);
+	/* tv_sec and tv_usec need not be of type long: cast them for %ld. */
+#define PRINT_TIMEVAL(prefix, tvp)		\
+	fprintf(stderr, "%s% 5ld.%06ld sec\n",	\
+		prefix, (long) (tvp)->tv_sec, (long) (tvp)->tv_usec)
+
+	timersub(&tv_now, &speedo->tv, &tv_diff);
+	PRINT_TIMEVAL("real", &tv_diff);
+
+	timersub(&ru_now.ru_utime, &speedo->ru.ru_utime, &tv_diff);
+	PRINT_TIMEVAL("user", &tv_diff);
+
+	timersub(&ru_now.ru_stime, &speedo->ru.ru_stime, &tv_diff);
+	PRINT_TIMEVAL("sys ", &tv_diff);
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
diff --git a/storage/xtradb/ut/ut0list.c b/storage/xtradb/ut/ut0list.c
new file mode 100644
index 00000000000..895a575c535
--- /dev/null
+++ b/storage/xtradb/ut/ut0list.c
@@ -0,0 +1,194 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0list.c
+A doubly-linked list
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0list.h"
+#ifdef UNIV_NONINL
+#include "ut0list.ic"
+#endif
+
+/****************************************************************//**
+Allocate an empty list with mem_alloc().
+@return	list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create(void)
+/*=================*/
+{
+	ib_list_t*	new_list = mem_alloc(sizeof(*new_list));
+
+	/* Not a heap list: it may be passed to ib_list_free(). */
+	new_list->is_heap_list = FALSE;
+	new_list->first = new_list->last = NULL;
+
+	return(new_list);
+}
+
+/****************************************************************//**
+Allocate an empty list from the given heap. Such a list MUST NOT be passed
+to ib_list_free(), which asserts that the list is not a heap list.
+@return	list */
+UNIV_INTERN
+ib_list_t*
+ib_list_create_heap(
+/*================*/
+	mem_heap_t*	heap)	/*!< in: memory heap to use */
+{
+	ib_list_t*	new_list = mem_heap_alloc(heap, sizeof(*new_list));
+
+	/* Mark the list so that ib_list_free() refuses it. */
+	new_list->is_heap_list = TRUE;
+	new_list->first = new_list->last = NULL;
+
+	return(new_list);
+}
+
+/****************************************************************//**
+Free a list that is not a heap list (asserted); nodes are not freed. */
+UNIV_INTERN
+void
+ib_list_free(
+/*=========*/
+	ib_list_t*	list)	/*!< in: list */
+{
+	ut_a(!list->is_heap_list);
+
+	/* We don't check that the list is empty because it's entirely valid
+	to e.g. have all the nodes allocated from a single heap that is then
+	freed after the list itself is freed. */
+
+	mem_free(list);
+}
+
+/****************************************************************//**
+Add the data to the start of the list, i.e. before the current first node.
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_first(
+/*==============*/
+	ib_list_t*	list,	/*!< in: list */
+	void*		data,	/*!< in: data */
+	mem_heap_t*	heap)	/*!< in: memory heap to use */
+{
+	return(ib_list_add_after(list, NULL, data, heap)); /* NULL: no prev */
+}
+
+/****************************************************************//**
+Add the data to the end of the list (after the ib_list_get_last() node).
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_last(
+/*=============*/
+	ib_list_t*	list,	/*!< in: list */
+	void*		data,	/*!< in: data */
+	mem_heap_t*	heap)	/*!< in: memory heap to use */
+{
+	return(ib_list_add_after(list, ib_list_get_last(list), data, heap));
+}
+
+/****************************************************************//**
+Add the data after prev_node, or at the start if prev_node is NULL.
+@return	new list node */
+UNIV_INTERN
+ib_list_node_t*
+ib_list_add_after(
+/*==============*/
+	ib_list_t*	list,		/*!< in: list */
+	ib_list_node_t*	prev_node,	/*!< in: node preceding new node (can
+					be NULL) */
+	void*		data,		/*!< in: data */
+	mem_heap_t*	heap)		/*!< in: memory heap to use */
+{
+	ib_list_node_t*	node = mem_heap_alloc(heap, sizeof(*node));
+
+	node->data = data;
+
+	if (list->first == NULL) {
+		/* An empty list has no node to insert after, so the
+		caller must not name one. */
+
+		ut_a(prev_node == NULL);
+	}
+
+	/* Left side.  A NULL prev_node means "insert at the start":
+	the new node then becomes list->first, and its successor is
+	the old first node (NULL when the list was empty). */
+
+	node->prev = prev_node;
+
+	if (prev_node != NULL) {
+		/* Middle or end of list. */
+		node->next = prev_node->next;
+
+		prev_node->next = node;
+	} else {
+		/* Start of list, or empty list. */
+		node->next = list->first;
+
+		list->first = node;
+	}
+
+	/* Right side.  With no successor the new node is the new end
+	of the list; otherwise the successor must point back at it. */
+
+	if (node->next != NULL) {
+		node->next->prev = node;
+	} else {
+		list->last = node;
+	}
+
+	return(node);
+}
+
+/****************************************************************//**
+Unlink the node from the list. */
+UNIV_INTERN
+void
+ib_list_remove(
+/*===========*/
+	ib_list_t*	list,	/*!< in: list */
+	ib_list_node_t*	node)	/*!< in: node to remove */
+{
+	ib_list_node_t*	before = node->prev;
+	ib_list_node_t*	after = node->next;
+
+	/* Unlink from the predecessor, or from the list head. */
+	if (before == NULL) {
+		ut_ad(list->first == node);
+		list->first = after;
+	} else {
+		before->next = after;
+	}
+
+	/* Unlink from the successor, or from the list tail. */
+	if (after == NULL) {
+		ut_ad(list->last == node);
+		list->last = before;
+	} else {
+		after->prev = before;
+	}
+	/* The node itself is not freed and keeps its prev/next pointers. */
+}
diff --git a/storage/xtradb/ut/ut0mem.c b/storage/xtradb/ut/ut0mem.c
new file mode 100644
index 00000000000..bf55e4273b6
--- /dev/null
+++ b/storage/xtradb/ut/ut0mem.c
@@ -0,0 +1,711 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/********************************************************************//**
+@file ut/ut0mem.c
+Memory primitives
+
+Created 5/11/1994 Heikki Tuuri
+*************************************************************************/
+
+#include "ut0mem.h"
+
+#ifdef UNIV_NONINL
+#include "ut0mem.ic"
+#endif
+
+#ifndef UNIV_HOTBACKUP
+# include "os0thread.h"
+# include "srv0srv.h"
+
+#include <stdlib.h>
+
+/** Header placed first in each block allocated without srv_use_sys_malloc */
+typedef struct ut_mem_block_struct ut_mem_block_t;
+
+/** The total amount of memory currently allocated from the operating
+system with os_mem_alloc_large() or malloc().  Does not count malloc()
+if srv_use_sys_malloc is set.  Protected by ut_list_mutex. */
+UNIV_INTERN ulint		ut_total_allocated_memory	= 0;
+
+/** Mutex protecting ut_total_allocated_memory and ut_mem_block_list */
+UNIV_INTERN os_fast_mutex_t	ut_list_mutex;
+
+/** Header of a dynamically allocated memory block */
+struct ut_mem_block_struct{
+	UT_LIST_NODE_T(ut_mem_block_t) mem_block_list;
+			/*!< mem block list node */
+	ulint	size;	/*!< size of allocated memory, header included */
+	ulint	magic_n;/*!< magic number (UT_MEM_MAGIC_N) */
+};
+
+/** The value of ut_mem_block_struct::magic_n.  Used in detecting
+memory corruption. */
+#define UT_MEM_MAGIC_N	1601650166
+
+/** List of all memory blocks allocated with malloc while
+srv_use_sys_malloc is not set.  Protected by ut_list_mutex. */
+static UT_LIST_BASE_NODE_T(ut_mem_block_t)   ut_mem_block_list;
+
+/** Flag: has ut_mem_block_list been initialized? */
+static ibool  ut_mem_block_list_inited = FALSE;
+
+/** A dummy pointer for generating a null pointer exception in
+ut_malloc_low() */
+static ulint*	ut_mem_null_ptr	= NULL;
+
+/**********************************************************************//**
+Initializes the mem block list and its mutex at database startup (once). */
+UNIV_INTERN
+void
+ut_mem_init(void)
+/*=============*/
+{
+	ut_a(!ut_mem_block_list_inited);
+	os_fast_mutex_init(&ut_list_mutex);
+	UT_LIST_INIT(ut_mem_block_list);
+	ut_mem_block_list_inited = TRUE;
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+defined and set_to_zero is TRUE.
+@return	own: allocated memory, or NULL on failure if !assert_on_error */
+UNIV_INTERN
+void*
+ut_malloc_low(
+/*==========*/
+	ulint	n,		/*!< in: number of bytes to allocate */
+	ibool	set_to_zero,	/*!< in: TRUE if allocated memory should be
+				set to zero if UNIV_SET_MEM_TO_ZERO is
+				defined */
+	ibool	assert_on_error)/*!< in: if TRUE, we crash mysqld if the
+				memory cannot be allocated */
+{
+#ifndef UNIV_HOTBACKUP
+	ulint	retry_count;
+	void*	ret;
+
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		ret = malloc(n);
+		ut_a(ret || !assert_on_error);
+		/* ret is NULL here if malloc() failed and !assert_on_error. */
+#ifdef UNIV_SET_MEM_TO_ZERO
+		if (set_to_zero && ret != NULL) {
+			memset(ret, '\0', n);
+			UNIV_MEM_ALLOC(ret, n);
+		}
+#endif
+		return(ret);
+	}
+
+	ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
+	ut_a(ut_mem_block_list_inited);
+
+	retry_count = 0;
+retry:
+	os_fast_mutex_lock(&ut_list_mutex);
+
+	ret = malloc(n + sizeof(ut_mem_block_t));
+
+	if (ret == NULL && retry_count < 60) {
+		if (retry_count == 0) {
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: Error: cannot allocate"
+				" %lu bytes of\n"
+				"InnoDB: memory with malloc!"
+				" Total allocated memory\n"
+				"InnoDB: by InnoDB %lu bytes."
+				" Operating system errno: %lu\n"
+				"InnoDB: Check if you should"
+				" increase the swap file or\n"
+				"InnoDB: ulimits of your operating system.\n"
+				"InnoDB: On FreeBSD check you"
+				" have compiled the OS with\n"
+				"InnoDB: a big enough maximum process size.\n"
+				"InnoDB: Note that in most 32-bit"
+				" computers the process\n"
+				"InnoDB: memory space is limited"
+				" to 2 GB or 4 GB.\n"
+				"InnoDB: We keep retrying"
+				" the allocation for 60 seconds...\n",
+				(ulong) n, (ulong) ut_total_allocated_memory,
+#ifdef __WIN__
+				(ulong) GetLastError()
+#else
+				(ulong) errno
+#endif
+				);
+		}
+
+		os_fast_mutex_unlock(&ut_list_mutex);
+
+		/* Sleep for a second and retry the allocation; maybe this is
+		just a temporary shortage of memory */
+
+		os_thread_sleep(1000000);
+
+		retry_count++;
+
+		goto retry;
+	}
+
+	if (ret == NULL) {
+		/* Flush stderr to make more probable that the error
+		message gets in the error file before we generate a seg
+		fault */
+
+		fflush(stderr);
+
+		os_fast_mutex_unlock(&ut_list_mutex);
+
+		/* Make an intentional seg fault so that we get a stack
+		trace */
+		/* Intentional segfault on NetWare causes an abend. Avoid this
+		by graceful exit handling in ut_a(). */
+#if (!defined __NETWARE__)
+		if (assert_on_error) {
+			ut_print_timestamp(stderr);
+
+			fprintf(stderr,
+				"  InnoDB: We now intentionally"
+				" generate a seg fault so that\n"
+				"InnoDB: on Linux we get a stack trace.\n");
+
+			if (*ut_mem_null_ptr) ut_mem_null_ptr = 0;
+		} else {
+			return(NULL);
+		}
+#else
+		ut_a(0);
+#endif
+	}
+
+	if (set_to_zero) {
+#ifdef UNIV_SET_MEM_TO_ZERO
+		memset(ret, '\0', n + sizeof(ut_mem_block_t));
+#endif
+	}
+
+	UNIV_MEM_ALLOC(ret, n + sizeof(ut_mem_block_t));
+
+	((ut_mem_block_t*)ret)->size = n + sizeof(ut_mem_block_t);
+	((ut_mem_block_t*)ret)->magic_n = UT_MEM_MAGIC_N;
+
+	ut_total_allocated_memory += n + sizeof(ut_mem_block_t);
+
+	UT_LIST_ADD_FIRST(mem_block_list, ut_mem_block_list,
+			  ((ut_mem_block_t*)ret));
+	os_fast_mutex_unlock(&ut_list_mutex);
+
+	return((void*)((byte*)ret + sizeof(ut_mem_block_t)));
+#else /* !UNIV_HOTBACKUP */
+	void*	ret = malloc(n);
+	ut_a(ret || !assert_on_error);
+	/* As above: never memset() a failed allocation. */
+# ifdef UNIV_SET_MEM_TO_ZERO
+	if (set_to_zero && ret != NULL) {
+		memset(ret, '\0', n);
+	}
+# endif
+	return(ret);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+/**********************************************************************//**
+Allocates memory, asserting on failure; zeroed if UNIV_SET_MEM_TO_ZERO is
+defined. Under UNIV_HOTBACKUP this is a plain malloc() instead.
+@return	own: allocated memory */
+UNIV_INTERN
+void*
+ut_malloc(
+/*======*/
+	ulint	n)	/*!< in: number of bytes to allocate */
+{
+#ifndef UNIV_HOTBACKUP
+	return(ut_malloc_low(n, TRUE, TRUE));
+#else /* !UNIV_HOTBACKUP */
+	return(malloc(n));
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Probes whether malloc() of n bytes succeeds, for callers that need to return
+an error message rather than assert as ut_malloc() does. The probe block is
+freed at once; on failure a message is printed to stderr.
+@return	TRUE if succeeded */
+UNIV_INTERN
+ibool
+ut_test_malloc(
+/*===========*/
+	ulint	n)	/*!< in: try to allocate this many bytes */
+{
+	void*	probe = malloc(n);
+
+	if (probe != NULL) {
+		/* The allocation was only a probe: release it at once. */
+		free(probe);
+
+		return(TRUE);
+	}
+
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		"  InnoDB: Error: cannot allocate"
+		" %lu bytes of memory for\n"
+		"InnoDB: a BLOB with malloc! Total allocated memory\n"
+		"InnoDB: by InnoDB %lu bytes."
+		" Operating system errno: %d\n"
+		"InnoDB: Check if you should increase"
+		" the swap file or\n"
+		"InnoDB: ulimits of your operating system.\n"
+		"InnoDB: On FreeBSD check you have"
+		" compiled the OS with\n"
+		"InnoDB: a big enough maximum process size.\n",
+		(ulong) n,
+		(ulong) ut_total_allocated_memory,
+		(int) errno);
+
+	return(FALSE);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Releases a block obtained from ut_malloc(); a NULL pointer is ignored.
+Blocks carrying a bookkeeping header are unlinked under ut_list_mutex. */
+UNIV_INTERN
+void
+ut_free(
+/*====*/
+	void* ptr)  /*!< in, own: memory block */
+{
+#ifndef UNIV_HOTBACKUP
+	ut_mem_block_t*	hdr;
+
+	if (ptr == NULL) {
+		return;
+	}
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		free(ptr);
+		return;
+	}
+
+	/* The bookkeeping header sits directly in front of ptr. */
+	hdr = (ut_mem_block_t*)((byte*) ptr - sizeof(*hdr));
+	os_fast_mutex_lock(&ut_list_mutex);
+
+	ut_a(hdr->magic_n == UT_MEM_MAGIC_N);
+	ut_a(ut_total_allocated_memory >= hdr->size);
+	ut_total_allocated_memory -= hdr->size;
+
+	UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, hdr);
+	free(hdr);
+
+	os_fast_mutex_unlock(&ut_list_mutex);
+#else /* !UNIV_HOTBACKUP */
+	free(ptr);
+#endif /* !UNIV_HOTBACKUP */
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Implements realloc. This is needed by /pars/lexyy.c. Otherwise, you should not
+use this function because the allocation functions in mem0mem.h are the
+recommended ones in InnoDB.
+
+man realloc in Linux, 2004:
+
+       realloc()  changes the size of the memory block pointed to
+       by ptr to size bytes.  The contents will be  unchanged  to
+       the minimum of the old and new sizes; newly allocated mem-
+       ory will be uninitialized.  If ptr is NULL,  the	 call  is
+       equivalent  to malloc(size); if size is equal to zero, the
+       call is equivalent to free(ptr).	 Unless ptr is	NULL,  it
+       must  have  been	 returned by an earlier call to malloc(),
+       calloc() or realloc().
+
+RETURN VALUE
+       realloc() returns a pointer to the newly allocated memory,
+       which is suitably aligned for any kind of variable and may
+       be different from ptr, or NULL if the  request  fails.  If
+       size  was equal to 0, either NULL or a pointer suitable to
+       be passed to free() is returned.	 If realloc()  fails  the
+       original	 block	is  left  untouched  - it is not freed or
+       moved.
+@return	own: pointer to new mem block or NULL */
+UNIV_INTERN
+void*
+ut_realloc(
+/*=======*/
+	void*	ptr,	/*!< in: pointer to old block or NULL */
+	ulint	size)	/*!< in: desired size */
+{
+	const ut_mem_block_t*	hdr;
+	ulint			payload_size;
+	ulint			copy_size;
+	void*			fresh;
+
+	if (UNIV_LIKELY(srv_use_sys_malloc)) {
+		/* ut_malloc_low() adds no header in this mode, so the
+		C library can resize the block itself. */
+
+		return(realloc(ptr, size));
+	}
+
+	if (ptr == NULL) {
+		/* Nothing to resize: behave like malloc(size). */
+
+		return(ut_malloc(size));
+	}
+
+	if (size == 0) {
+		/* Resizing to nothing: behave like free(ptr). */
+
+		ut_free(ptr);
+
+		return(NULL);
+	}
+
+	hdr = (const ut_mem_block_t*)((const byte*) ptr - sizeof(*hdr));
+
+	ut_a(hdr->magic_n == UT_MEM_MAGIC_N);
+
+	/* hdr->size includes the header itself, as set by
+	ut_malloc_low(); only the payload can be copied. */
+	payload_size = hdr->size - sizeof(*hdr);
+	copy_size = (size < payload_size) ? size : payload_size;
+
+	fresh = ut_malloc(size);
+
+	if (fresh != NULL) {
+		/* Carry the old contents over, then drop the old block. */
+		ut_memcpy(fresh, ptr, copy_size);
+
+		ut_free(ptr);
+	}
+
+	return(fresh);
+}
+
+/**********************************************************************//**
+Frees in shutdown all allocated memory not freed yet. */
+UNIV_INTERN
+void
+ut_free_all_mem(void)
+/*=================*/
+{
+	ut_mem_block_t* block;
+
+	ut_a(ut_mem_block_list_inited);
+	/* Cleared once, up front; ut_mem_init() asserts it is not set. */
+	ut_mem_block_list_inited = FALSE;
+	os_fast_mutex_free(&ut_list_mutex);
+
+	while ((block = UT_LIST_GET_FIRST(ut_mem_block_list))) {
+		/* Verify the block header before releasing the block. */
+		ut_a(block->magic_n == UT_MEM_MAGIC_N);
+		ut_a(ut_total_allocated_memory >= block->size);
+
+		ut_total_allocated_memory -= block->size;
+
+		UT_LIST_REMOVE(mem_block_list, ut_mem_block_list, block);
+		free(block);
+	}
+
+	/* Anything still counted was not on ut_mem_block_list. */
+	if (ut_total_allocated_memory != 0) {
+		fprintf(stderr,
+			"InnoDB: Warning: after shutdown"
+			" total allocated memory is %lu\n",
+			(ulong) ut_total_allocated_memory);
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************************//**
+Copies at most size - 1 characters of the NUL-terminated string src into
+dst and NUL-terminates dst; nothing is written when size is 0. A return
+value >= size tells the caller that the copy was truncated.
+@return	strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy(
+/*=======*/
+	char*		dst,	/*!< in: destination buffer */
+	const char*	src,	/*!< in: source buffer */
+	ulint		size)	/*!< in: size of destination buffer */
+{
+	const ulint	src_len = strlen(src);
+	ulint		copy_len;
+
+	if (size == 0) {
+		return(src_len);
+	}
+	copy_len = ut_min(src_len, size - 1);
+	memcpy(dst, src, copy_len);
+	dst[copy_len] = '\0';
+	return(src_len);
+}
+
+/**********************************************************************//**
+Like ut_strlcpy, but when src does not fit in dst completely, the last
+(size - 1) bytes of src are kept instead of the first.
+@return	strlen(src) */
+UNIV_INTERN
+ulint
+ut_strlcpy_rev(
+/*===========*/
+	char*		dst,	/*!< in: destination buffer */
+	const char*	src,	/*!< in: source buffer */
+	ulint		size)	/*!< in: size of destination buffer */
+{
+	const ulint	src_len = strlen(src);
+
+	if (size != 0) {
+		/* The "+ 1" also copies the terminating NUL of src. */
+		const ulint	tail_len = ut_min(src_len, size - 1);
+		const char*	tail = src + src_len - tail_len;
+		memcpy(dst, tail, tail_len + 1);
+	}
+	return(src_len);
+}
+
+/**********************************************************************//**
+Copy a NUL-terminated string, doubling every embedded quote character.
+No leading or trailing quote is added and no NUL is written to dest.
+See also ut_strlenq() and ut_memcpyq().
+@return	pointer to end of dest */
+UNIV_INTERN
+char*
+ut_strcpyq(
+/*=======*/
+	char*		dest,	/*!< in: output buffer */
+	char		q,	/*!< in: the quote character */
+	const char*	src)	/*!< in: null-terminated string */
+{
+	for (; *src != '\0'; src++) {
+		*dest++ = *src;
+		if (*src == q) {
+			*dest++ = q;
+		}
+	}
+	return(dest);
+}
+
+/**********************************************************************//**
+Copy len bytes of a string, doubling every embedded quote character.
+No leading or trailing quote is added and no NUL is written to dest.
+See also ut_strlenq() and ut_strcpyq().
+@return	pointer to end of dest */
+UNIV_INTERN
+char*
+ut_memcpyq(
+/*=======*/
+	char*		dest,	/*!< in: output buffer */
+	char		q,	/*!< in: the quote character */
+	const char*	src,	/*!< in: string to be quoted */
+	ulint		len)	/*!< in: length of src */
+{
+	ulint	i;
+
+	for (i = 0; i < len; i++) {
+		*dest++ = src[i];
+		if (src[i] == q) {
+			*dest++ = q;
+		}
+	}
+	return(dest);
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Count the occurrences of s2 in s1, scanning left to right. Overlapping
+instances of s2 are only counted once; an empty s2 yields 0.
+@return	the number of times s2 occurs in s1 */
+UNIV_INTERN
+ulint
+ut_strcount(
+/*========*/
+	const char*	s1,	/*!< in: string to search in */
+	const char*	s2)	/*!< in: string to search for */
+{
+	const ulint	pattern_len = strlen(s2);
+	const char*	hit;
+	ulint		n_hits = 0;
+
+	if (pattern_len == 0) {
+		/* An empty pattern is reported as never occurring. */
+
+		return(0);
+	}
+
+	/* Each search resumes just past the previous match, which is
+	why overlapping occurrences are counted only once. */
+
+	hit = strstr(s1, s2);
+
+	while (hit != NULL) {
+		n_hits++;
+		hit = strstr(hit + pattern_len, s2);
+	}
+
+	return(n_hits);
+}
+
+/**********************************************************************//**
+Replace every occurrence of s1 in str with s2. Overlapping instances of s1
+are only replaced once. An empty s1 matches nothing: str is just copied.
+@return	own: modified string, must be freed with mem_free() */
+UNIV_INTERN
+char*
+ut_strreplace(
+/*==========*/
+	const char*	str,	/*!< in: string to operate on */
+	const char*	s1,	/*!< in: string to replace */
+	const char*	s2)	/*!< in: string to replace s1 with */
+{
+	char*		new_str;
+	char*		ptr;
+	const char*	str_end;
+	ulint		str_len = strlen(str);
+	ulint		s1_len = strlen(s1);
+	ulint		s2_len = strlen(s2);
+	ulint		count = 0;
+	int		len_delta = (int)s2_len - (int)s1_len;
+
+	str_end = str + str_len;
+
+	if (len_delta <= 0) {
+		len_delta = 0;
+	} else {
+		count = ut_strcount(str, s1);
+	}
+
+	new_str = mem_alloc(str_len + count * len_delta + 1);
+	ptr = new_str;
+
+	while (str) {
+		/* An empty s1 is never searched for: strstr() would match
+		at every position without the loop ever advancing. */
+		const char*	next = s1_len ? strstr(str, s1) : NULL;
+		if (!next) {
+			next = str_end;
+		}
+
+		memcpy(ptr, str, next - str);
+		ptr += next - str;
+
+		if (next == str_end) {
+			break;
+		}
+
+		memcpy(ptr, s2, s2_len);
+		ptr += s2_len;
+
+		str = next + s1_len;
+	}
+
+	*ptr = '\0';
+
+	return(new_str);
+}
+
+#ifdef UNIV_COMPILE_TEST_FUNCS
+/* Self-test: runs ut_str_sql_format() on fixed inputs, reporting to stderr. */
+void
+test_ut_str_sql_format()
+{
+	char	buf[128];
+	ulint	ret;
+	/* Runs one case; returns from the test function on a mismatch. */
+#define CALL_AND_TEST(str, str_len, buf, buf_size, ret_expected, buf_expected)\
+	do {\
+		ibool	ok = TRUE;\
+		memset(buf, 'x', 10);\
+		buf[10] = '\0';\
+		fprintf(stderr, "TESTING \"%s\", %lu, %lu\n",\
+			str, (ulint) str_len, (ulint) buf_size);\
+		ret = ut_str_sql_format(str, str_len, buf, buf_size);\
+		if (ret != ret_expected) {\
+			fprintf(stderr, "expected ret %lu, got %lu\n",\
+				(ulint) ret_expected, ret);\
+			ok = FALSE;\
+		}\
+		if (strcmp((char*) buf, buf_expected) != 0) {\
+			fprintf(stderr, "expected buf \"%s\", got \"%s\"\n",\
+				buf_expected, buf);\
+			ok = FALSE;\
+		}\
+		if (ok) {\
+			fprintf(stderr, "OK: %lu, \"%s\"\n\n",\
+				(ulint) ret, buf);\
+		} else {\
+			return;\
+		}\
+	} while (0)
+
+	CALL_AND_TEST("abcd", 4, buf, 0, 0, "xxxxxxxxxx");
+
+	CALL_AND_TEST("abcd", 4, buf, 1, 1, "");
+
+	CALL_AND_TEST("abcd", 4, buf, 2, 1, "");
+
+	CALL_AND_TEST("abcd", 0, buf, 3, 3, "''");
+	CALL_AND_TEST("abcd", 1, buf, 3, 1, "");
+	CALL_AND_TEST("abcd", 2, buf, 3, 1, "");
+	CALL_AND_TEST("abcd", 3, buf, 3, 1, "");
+	CALL_AND_TEST("abcd", 4, buf, 3, 1, "");
+
+	CALL_AND_TEST("abcd", 0, buf, 4, 3, "''");
+	CALL_AND_TEST("abcd", 1, buf, 4, 4, "'a'");
+	CALL_AND_TEST("abcd", 2, buf, 4, 4, "'a'");
+	CALL_AND_TEST("abcd", 3, buf, 4, 4, "'a'");
+	CALL_AND_TEST("abcd", 4, buf, 4, 4, "'a'");
+	CALL_AND_TEST("abcde", 5, buf, 4, 4, "'a'");
+	CALL_AND_TEST("'", 1, buf, 4, 3, "''");
+	CALL_AND_TEST("''", 2, buf, 4, 3, "''");
+	CALL_AND_TEST("a'", 2, buf, 4, 4, "'a'");
+	CALL_AND_TEST("'a", 2, buf, 4, 3, "''");
+	CALL_AND_TEST("ab", 2, buf, 4, 4, "'a'");
+
+	CALL_AND_TEST("abcdef", 0, buf, 5, 3, "''");
+	CALL_AND_TEST("abcdef", 1, buf, 5, 4, "'a'");
+	CALL_AND_TEST("abcdef", 2, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("abcdef", 3, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("abcdef", 4, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("abcdef", 5, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("abcdef", 6, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("'", 1, buf, 5, 5, "''''");
+	CALL_AND_TEST("''", 2, buf, 5, 5, "''''");
+	CALL_AND_TEST("a'", 2, buf, 5, 4, "'a'");
+	CALL_AND_TEST("'a", 2, buf, 5, 5, "''''");
+	CALL_AND_TEST("ab", 2, buf, 5, 5, "'ab'");
+	CALL_AND_TEST("abc", 3, buf, 5, 5, "'ab'");
+
+	CALL_AND_TEST("ab", 2, buf, 6, 5, "'ab'");
+
+	CALL_AND_TEST("a'b'c", 5, buf, 32, 10, "'a''b''c'");
+	CALL_AND_TEST("a'b'c'", 6, buf, 32, 12, "'a''b''c'''");
+}
+
+#endif /* UNIV_COMPILE_TEST_FUNCS */
+#endif /* !UNIV_HOTBACKUP */
diff --git a/storage/xtradb/ut/ut0rbt.c b/storage/xtradb/ut/ut0rbt.c
new file mode 100644
index 00000000000..3d7bc91e714
--- /dev/null
+++ b/storage/xtradb/ut/ut0rbt.c
@@ -0,0 +1,1249 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0rbt.c
+Red-Black tree implementation
+
+Created 2007-03-20 Sunny Bains
+***********************************************************************/
+
+#include "ut0rbt.h"
+
+/************************************************************************
+Definition of a red-black tree
+==============================
+
+A red-black tree is a binary search tree which has the following
+red-black properties:
+
+   1. Every node is either red or black.
+   2. Every leaf (NULL - in our case tree->nil) is black.
+   3. If a node is red, then both its children are black.
+   4. Every simple path from a node to a descendant leaf contains the
+      same number of black nodes.
+
+   from (3) above, the implication is that on any path from the root
+   to a leaf, red nodes must not be adjacent.
+
+   However, any number of black nodes may appear in a sequence. */
+
+#if	defined(IB_RBT_TESTING)
+#warning "Testing enabled!"
+#endif
+
+#define ROOT(t)		((t)->root->left)	/* real root: left child of the fake root */
+#define	SIZEOF_NODE(t)	((sizeof(ib_rbt_node_t) + (t)->sizeof_value) - 1) /* node header + value bytes; the -1 presumably offsets a one byte value[] member - confirm in ut0rbt.h */
+
+/****************************************************************//**
+Print the sub-tree rooted at node in pre-order: node, left, right. */
+static
+void
+rbt_print_subtree(
+/*==============*/
+	const ib_rbt_t*		tree,	/*!< in: tree to traverse */
+	const ib_rbt_node_t*	node,	/*!< in: root of the sub-tree to print */
+	ib_rbt_print_node	print)	/*!< in: callback invoked on each node */
+{
+	if (node == tree->nil) {
+		return;		/* Reached the nil node, recursion ends. */
+	}
+	print(node);
+	rbt_print_subtree(tree, node->left, print);
+	rbt_print_subtree(tree, node->right, print);
+}
+
+/****************************************************************//**
+Verify that the keys are in strictly ascending order.
+@return	TRUE if OK. FALSE if not ordered */
+static
+ibool
+rbt_check_ordering(
+/*===============*/
+	const ib_rbt_t*		tree)	/*!< in: tree to verify */
+{
+	const ib_rbt_node_t*	prev = NULL;
+	const ib_rbt_node_t*	node = rbt_first(tree);
+
+	/* Each value must compare greater than the one visited before it. */
+	while (node != NULL) {
+		if (prev != NULL
+		    && tree->compare(prev->value, node->value) >= 0) {
+			return(FALSE);
+		}
+		prev = node;
+		node = rbt_next(tree, node);
+	}
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Check that every path from node down to the leaves has the same count
+of black nodes and that no red node has a red child.
+@return	0 on failure else black height of the subtree */
+static
+ulint
+rbt_count_black_nodes(
+/*==================*/
+	const ib_rbt_t*		tree,	/*!< in: tree to verify */
+	const ib_rbt_node_t*	node)	/*!< in: start of sub-tree */
+{
+	ulint	result;
+
+	if (node != tree->nil) {
+		ulint	left_height = rbt_count_black_nodes(tree, node->left);
+
+		ulint	right_height = rbt_count_black_nodes(tree, node->right);
+
+		if (left_height == 0
+		    || right_height == 0
+		    || left_height != right_height) {
+
+			result = 0;
+		} else if (node->color == IB_RBT_RED) {
+
+			/* Property 3: a red node needs two black children. */
+			if (node->left->color != IB_RBT_BLACK
+			    || node->right->color != IB_RBT_BLACK) {
+
+				result = 0;
+			} else {
+				result = left_height;
+			}
+		/* Check if it's anything other than RED or BLACK. */
+		} else if (node->color != IB_RBT_BLACK) {
+
+			result = 0;
+		} else {
+
+			result = right_height + 1;
+		}
+	} else {
+		result = 1;	/* The nil leaf counts as one black node. */
+	}
+
+	return(result);
+}
+
+/****************************************************************//**
+Turn the node's right child's left sub-tree into node's right sub-tree.
+This will also make node's right child its parent. */
+static
+void
+rbt_rotate_left(
+/*============*/
+	const ib_rbt_node_t*	nil,	/*!< in: nil node of the tree */
+	ib_rbt_node_t*		node)	/*!< in: node to rotate */
+{
+	ib_rbt_node_t*	right = node->right;
+
+	node->right = right->left;
+
+	if (right->left != nil) {
+		right->left->parent = node;
+	}
+
+	/* Right's new parent was node's parent. */
+	right->parent = node->parent;
+
+	/* No special case for the root: its parent is the fake root
+	(tree->root), whose left child is the real root. */
+	if (node == node->parent->left) {
+		/* Node was on the left of its parent. */
+		node->parent->left = right;
+	} else {
+		/* Node must have been on the right. */
+		node->parent->right = right;
+	}
+
+	/* Finally, put node on right's left. */
+	right->left = node;
+	node->parent = right;
+}
+
+/****************************************************************//**
+Turn the node's left child's right sub-tree into node's left sub-tree.
+This also makes node's left child its parent. */
+static
+void
+rbt_rotate_right(
+/*=============*/
+	const ib_rbt_node_t*	nil,	/*!< in: nil node of tree */
+	ib_rbt_node_t*		node)	/*!< in: node to rotate */
+{
+	ib_rbt_node_t*	left = node->left;
+
+	node->left = left->right;
+
+	if (left->right != nil) {
+		left->right->parent = node;
+	}
+
+	/* Left's new parent was node's parent. */
+	left->parent = node->parent;
+
+	/* No special case for the root: its parent is the fake root
+	(tree->root), whose left child is the real root. */
+	if (node == node->parent->right) {
+	    /* Node was on the right of its parent. */
+            node->parent->right = left;
+	} else {
+	    /* Node must have been on the left. */
+            node->parent->left = left;
+	}
+
+	/* Finally, put node on left's right. */
+	left->right = node;
+	node->parent = left;
+}
+
+/****************************************************************//**
+Link a node below the node recorded in the bounds; no rebalancing is done.
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_add_child(
+/*===============*/
+	const ib_rbt_t*	tree,		/*!< in: rbt tree */
+	ib_rbt_bound_t*	parent,		/*!< in: node's parent and the result
+					of comparing against it */
+	ib_rbt_node_t*	node)		/*!< in: node to add */
+{
+	/* parent->last is const qualified but we have to write to it. */
+	ib_rbt_node_t*	attach_to = (ib_rbt_node_t*) parent->last;
+
+	if (attach_to != tree->root && parent->result >= 0) {
+		/* FIXME: We don't handle duplicates (yet)! */
+		ut_a(parent->result != 0);
+		attach_to->right = node;
+	} else {
+		/* Below the fake root, or result < 0: attach on the left. */
+		attach_to->left = node;
+	}
+
+	node->parent = attach_to;
+	return(node);
+}
+
+/****************************************************************//**
+Generic binary tree insert, the tree is not rebalanced here.
+@return inserted node */
+static
+ib_rbt_node_t*
+rbt_tree_insert(
+/*============*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key,		/*!< in: key for ordering */
+	ib_rbt_node_t*	node)		/*!< in: node hold the insert value */
+{
+	ib_rbt_bound_t	where;
+	ib_rbt_node_t*	cursor;
+
+	/* An empty tree attaches the node below the fake root. */
+	where.last = tree->root;
+	where.result = 0;
+
+	/* Descend until we fall off the tree, remembering the last real
+	node visited and how the key compared against it. */
+	for (cursor = ROOT(tree); cursor != tree->nil; /* No op */) {
+
+		where.last = cursor;
+		where.result = tree->compare(key, cursor->value);
+
+		cursor = (where.result < 0) ? cursor->left : cursor->right;
+	}
+
+	/* The loop can only end at the nil node. */
+	ut_a(cursor == tree->nil);
+
+	/* Hook the node in on the side picked by the last comparison. */
+	rbt_tree_add_child(tree, &where, node);
+
+	return(node);
+}
+
+/****************************************************************//**
+Restore the red-black properties after node was linked into the tree. */
+static
+void
+rbt_balance_tree(
+/*=============*/
+	const ib_rbt_t*	tree,		/*!< in: tree to balance */
+	ib_rbt_node_t*	node)		/*!< in: node that was inserted */
+{
+	const ib_rbt_node_t*	nil = tree->nil;
+	ib_rbt_node_t*		parent = node->parent;
+
+	/* A newly inserted node always starts out red. */
+	node->color = IB_RBT_RED;
+
+	while (node != ROOT(tree) && parent->color == IB_RBT_RED) {
+		ib_rbt_node_t*	grand_parent = parent->parent;
+
+		if (parent == grand_parent->left) {
+			ib_rbt_node_t*	uncle = grand_parent->right;
+
+			if (uncle->color == IB_RBT_RED) {
+
+				/* Case 1 - change the colors. */
+				uncle->color = IB_RBT_BLACK;
+				parent->color = IB_RBT_BLACK;
+				grand_parent->color = IB_RBT_RED;
+
+				/* Move node up the tree. */
+				node = grand_parent;
+
+			} else {
+
+				if (node == parent->right) {
+					/* Uncle is black and node is a
+					right child, case 2 - move node
+					up and rotate. */
+					node = parent;
+					rbt_rotate_left(nil, node);
+				}
+
+				grand_parent = node->parent->parent;
+
+				/* Case 3. */
+				node->parent->color = IB_RBT_BLACK;
+				grand_parent->color = IB_RBT_RED;
+
+				rbt_rotate_right(nil, grand_parent);
+			}
+
+		} else {
+			ib_rbt_node_t*	uncle = grand_parent->left;
+
+			if (uncle->color == IB_RBT_RED) {
+
+				/* Case 1 - change the colors. */
+				uncle->color = IB_RBT_BLACK;
+				parent->color = IB_RBT_BLACK;
+				grand_parent->color = IB_RBT_RED;
+
+				/* Move node up the tree. */
+				node = grand_parent;
+
+			} else {
+
+				if (node == parent->left) {
+					/* Uncle is black and node is a
+					left child, case 2 - move node up
+					and rotate. */
+					node = parent;
+					rbt_rotate_right(nil, node);
+				}
+
+				grand_parent = node->parent->parent;
+
+				/* Case 3. */
+				node->parent->color = IB_RBT_BLACK;
+				grand_parent->color = IB_RBT_RED;
+
+				rbt_rotate_left(nil, grand_parent);
+			}
+		}
+
+		parent = node->parent;
+	}
+
+	/* Color the root black. */
+	ROOT(tree)->color = IB_RBT_BLACK;
+}
+
+/****************************************************************//**
+Find the given node's successor.
+@return	successor node or NULL if no successor */
+static
+ib_rbt_node_t*
+rbt_find_successor(
+/*===============*/
+	const ib_rbt_t*		tree,	/*!< in: rb tree */
+	const ib_rbt_node_t*	current)/*!< in: this is declared const
+					because it can be called via
+					rbt_next() */
+{
+	ib_rbt_node_t*	child;
+	ib_rbt_node_t*	ancestor;
+
+	if (current->right != tree->nil) {
+		/* Left most node of the right sub-tree. */
+		ib_rbt_node_t*	succ = current->right;
+
+		while (succ->left != tree->nil) {
+			succ = succ->left;
+		}
+
+		return(succ);
+	}
+
+	/* No right sub-tree: climb until we arrive at an ancestor from
+	its left side, that ancestor is the successor. */
+	child = (ib_rbt_node_t*) current;	/* Cast away the const. */
+	ancestor = current->parent;
+
+	while (ancestor != tree->root && child == ancestor->right) {
+		child = ancestor;
+		ancestor = child->parent;
+	}
+
+	/* Reaching the fake root means current was the last node. */
+	return((ancestor == tree->root) ? NULL : ancestor);
+}
+
+/****************************************************************//**
+Find the given node's predecessor.
+@return	predecessor node or NULL if no predecessor */
+static
+ib_rbt_node_t*
+rbt_find_predecessor(
+/*=================*/
+	const ib_rbt_t*		tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*	current)	/*!< in: this is declared const
+						because it can be called via
+						rbt_prev() */
+{
+	ib_rbt_node_t*	child;
+	ib_rbt_node_t*	ancestor;
+
+	if (current->left != tree->nil) {
+		/* Right most node of the left sub-tree. */
+		ib_rbt_node_t*	pred = current->left;
+
+		while (pred->right != tree->nil) {
+			pred = pred->right;
+		}
+
+		return(pred);
+	}
+
+	/* No left sub-tree: climb until we arrive at an ancestor from
+	its right side, that ancestor is the predecessor. */
+	child = (ib_rbt_node_t*) current;	/* Cast away the const. */
+	ancestor = current->parent;
+
+	while (ancestor != tree->root && child == ancestor->left) {
+		child = ancestor;
+		ancestor = child->parent;
+	}
+
+	/* Reaching the fake root means current was the first node. */
+	return((ancestor == tree->root) ? NULL : ancestor);
+}
+
+/****************************************************************//**
+Make eject's parent point at node instead of eject. Afterwards eject is
+an orphan. */
+static
+void
+rbt_eject_node(
+/*===========*/
+	ib_rbt_node_t*	eject,		/*!< in: node to eject */
+	ib_rbt_node_t*	node)		/*!< in: node to replace with */
+{
+	ib_rbt_node_t*	up = eject->parent;
+
+	/* Redirect whichever child link of the parent pointed at eject. */
+	if (up->left == eject) {
+		up->left = node;
+	} else if (up->right == eject) {
+		up->right = node;
+	} else {
+		ut_a(0);
+	}
+	/* eject keeps its own links and color, its parent dropped it. */
+	node->parent = up;
+}
+
+/****************************************************************//**
+Move node into replace's position; the two nodes swap colors. */
+static
+void
+rbt_replace_node(
+/*=============*/
+	ib_rbt_node_t*	replace,	/*!< in: node to replace */
+	ib_rbt_node_t*	node)		/*!< in: node to replace with */
+{
+	ib_rbt_color_t	color = node->color;
+
+	/* node takes over both children of replace. */
+	node->left = replace->left;
+	node->right = replace->right;
+
+	/* Make the children point back at node. */
+	node->left->parent = node;
+	node->right->parent = node;
+
+	/* Make the parent of replace point to node. */
+	rbt_eject_node(replace, node);
+
+	/* Swap the colors. */
+	node->color = replace->color;
+	replace->color = color;
+}
+
+/****************************************************************//**
+Detach node from the tree; with two children its successor takes its place.
+@return	child that moved into the vacated position (may be the nil node) */
+static
+ib_rbt_node_t*
+rbt_detach_node(
+/*============*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_node_t*	node)		/*!< in: node to detach */
+{
+	ib_rbt_node_t*		child;
+	const ib_rbt_node_t*	nil = tree->nil;
+
+	if (node->left != nil && node->right != nil) {
+		/* Case where the node to be deleted has two children. */
+		ib_rbt_node_t*	successor = rbt_find_successor(tree, node);
+
+		ut_a(successor != nil);
+		ut_a(successor->parent != nil);
+		ut_a(successor->left == nil);
+
+		child = successor->right;
+
+		/* Remove the successor node and replace with its child. */
+		rbt_eject_node(successor, child);
+
+		/* Replace the node to delete with its successor node. */
+		rbt_replace_node(node, successor);
+	} else {
+		ut_a(node->left == nil || node->right == nil);
+
+		child = (node->left != nil) ? node->left : node->right;
+
+		/* Replace the node to delete with one of its children. */
+		rbt_eject_node(node, child);
+	}
+
+	/* Reset the node links. */
+	node->parent = node->right = node->left = tree->nil;
+
+	return(child);
+}
+
+/****************************************************************//**
+Rebalance with the right sibling after a deletion in parent's left sub-tree.
+@return	node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_right(
+/*==============*/
+	const ib_rbt_node_t*	nil,	/*!< in: rb tree nil node */
+	ib_rbt_node_t*		parent,	/*!< in: parent node */
+	ib_rbt_node_t*		sibling)/*!< in: sibling node */
+{
+	ib_rbt_node_t*		node = NULL;
+
+	ut_a(sibling != nil);
+
+	/* Red sibling: swap colors with parent, rotate, take new sibling. */
+	if (sibling->color == IB_RBT_RED) {
+
+		parent->color = IB_RBT_RED;
+		sibling->color = IB_RBT_BLACK;
+
+		rbt_rotate_left(nil, parent);
+
+		sibling = parent->right;
+
+		ut_a(sibling != nil);
+	}
+
+	/* Both of sibling's children are black: recolor and move up. */
+	if (sibling->left->color == IB_RBT_BLACK
+	    && sibling->right->color == IB_RBT_BLACK) {
+
+		node = parent; /* Parent needs to be rebalanced too. */
+		sibling->color = IB_RBT_RED;
+
+	} else {
+		if (sibling->right->color == IB_RBT_BLACK) {
+
+			ut_a(sibling->left->color == IB_RBT_RED);
+
+			sibling->color = IB_RBT_RED;
+			sibling->left->color = IB_RBT_BLACK;
+
+			rbt_rotate_right(nil, sibling);
+
+			sibling = parent->right;
+			ut_a(sibling != nil);
+		}
+
+		sibling->color = parent->color;
+		sibling->right->color = IB_RBT_BLACK;
+
+		parent->color = IB_RBT_BLACK;
+
+		rbt_rotate_left(nil, parent);
+	}
+
+	return(node);
+}
+
+/****************************************************************//**
+Rebalance with the left sibling after a deletion in parent's right sub-tree.
+@return	node to rebalance if more rebalancing required else NULL */
+static
+ib_rbt_node_t*
+rbt_balance_left(
+/*=============*/
+	const ib_rbt_node_t*	nil,	/*!< in: rb tree nil node */
+	ib_rbt_node_t*		parent,	/*!< in: parent node */
+	ib_rbt_node_t*		sibling)/*!< in: sibling node */
+{
+	ib_rbt_node_t*	node = NULL;
+
+	ut_a(sibling != nil);
+
+	/* Red sibling: swap colors with parent, rotate, take new sibling. */
+	if (sibling->color == IB_RBT_RED) {
+
+		parent->color = IB_RBT_RED;
+		sibling->color = IB_RBT_BLACK;
+
+		rbt_rotate_right(nil, parent);
+		sibling = parent->left;
+
+		ut_a(sibling != nil);
+	}
+
+	/* Both of sibling's children are black: recolor and move up. */
+	if (sibling->right->color == IB_RBT_BLACK
+	    && sibling->left->color == IB_RBT_BLACK) {
+
+		node = parent; /* Parent needs to be rebalanced too. */
+		sibling->color = IB_RBT_RED;
+
+	} else {
+		if (sibling->left->color == IB_RBT_BLACK) {
+
+			ut_a(sibling->right->color == IB_RBT_RED);
+
+			sibling->color = IB_RBT_RED;
+			sibling->right->color = IB_RBT_BLACK;
+
+			rbt_rotate_left(nil, sibling);
+
+			sibling = parent->left;
+
+			ut_a(sibling != nil);
+		}
+
+		sibling->color = parent->color;
+		sibling->left->color = IB_RBT_BLACK;
+
+		parent->color = IB_RBT_BLACK;
+
+		rbt_rotate_right(nil, parent);
+	}
+
+	return(node);
+}
+
+/****************************************************************//**
+Unlink the node, rebalance the tree if necessary; the node is not freed. */
+static
+void
+rbt_remove_node_and_rebalance(
+/*==========================*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_node_t*	node)		/*!< in: node to remove */
+{
+	/* Detach node and get the node that will be used
+	as rebalance start. */
+	ib_rbt_node_t*	child = rbt_detach_node(tree, node);
+
+	if (node->color == IB_RBT_BLACK) {
+		ib_rbt_node_t*	last = child;
+		/* Red root (temporarily): ends the loop below at the root. */
+		ROOT(tree)->color = IB_RBT_RED;
+
+		while (child && child->color == IB_RBT_BLACK) {
+			ib_rbt_node_t*	parent = child->parent;
+
+			/* Did the deletion cause an imbalance in the
+			parents left sub-tree. */
+			if (parent->left == child) {
+
+				child = rbt_balance_right(
+					tree->nil, parent, parent->right);
+
+			} else if (parent->right == child) {
+
+				child = rbt_balance_left(
+					tree->nil, parent, parent->left);
+
+			} else {
+				ut_error;
+			}
+
+			if (child) {
+				last = child;
+			}
+		}
+
+		ut_a(last);
+
+		last->color = IB_RBT_BLACK;
+		ROOT(tree)->color = IB_RBT_BLACK;
+	}
+
+	/* Note that we have removed a node from the tree. */
+	--tree->n_nodes;
+}
+
+/****************************************************************//**
+Free node and everything below it, children before their parent. */
+static
+void
+rbt_free_node(
+/*==========*/
+	ib_rbt_node_t*	node,		/*!< in: node to free */
+	ib_rbt_node_t*	nil)		/*!< in: rb tree nil node */
+{
+	if (node == nil) {
+		return;		/* Reached the nil node. */
+	}
+	rbt_free_node(node->left, nil);
+	rbt_free_node(node->right, nil);
+	ut_free(node);
+}
+
+/****************************************************************//**
+Free all the nodes, the fake root, the nil node and the tree itself. */
+UNIV_INTERN
+void
+rbt_free(
+/*=====*/
+	ib_rbt_t*	tree)		/*!< in: rb tree to free */
+{
+	rbt_free_node(tree->root, tree->nil);	/* Fake root and all nodes. */
+	ut_free(tree->nil);
+	ut_free(tree);
+}
+
+/****************************************************************//**
+Create an instance of a red black tree.
+@return	an empty rb tree */
+UNIV_INTERN
+ib_rbt_t*
+rbt_create(
+/*=======*/
+	size_t		sizeof_value,	/*!< in: sizeof data item */
+	ib_rbt_compare	compare)	/*!< in: fn to compare items */
+{
+	ib_rbt_t*	tree;
+	ib_rbt_node_t*	nil;
+	ib_rbt_node_t*	root;
+
+	tree = (ib_rbt_t*) ut_malloc(sizeof(*tree));
+	memset(tree, 0, sizeof(*tree));
+
+	/* The sentinel (NIL) node is black and linked to itself. */
+	nil = (ib_rbt_node_t*) ut_malloc(sizeof(*nil));
+	memset(nil, 0, sizeof(*nil));
+	nil->color = IB_RBT_BLACK;
+	nil->parent = nil->left = nil->right = nil;
+
+	/* The "fake" root is black as well, the real root of the tree
+	is going to be its left child. */
+	root = (ib_rbt_node_t*) ut_malloc(sizeof(*root));
+	memset(root, 0, sizeof(*root));
+	root->color = IB_RBT_BLACK;
+	root->parent = root->left = root->right = nil;
+
+	tree->nil = nil;
+	tree->root = root;
+	tree->compare = compare;
+	tree->sizeof_value = sizeof_value;
+
+	return(tree);
+}
+
+/****************************************************************//**
+Generic insert of a value in the rb tree.
+@return	inserted node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_insert(
+/*=======*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key,		/*!< in: key for ordering */
+	const void*	value)		/*!< in: value of key, this value
+					is copied to the node */
+{
+	ib_rbt_node_t*	fresh = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+	/* The node starts out unlinked and holds a copy of the value. */
+	fresh->left = tree->nil;
+	fresh->right = tree->nil;
+	fresh->parent = tree->nil;
+	memcpy(fresh->value, value, tree->sizeof_value);
+
+	/* Plain binary tree insert first, then repair the colors. */
+	rbt_tree_insert(tree, key, fresh);
+	rbt_balance_tree(tree, fresh);
+
+	tree->n_nodes += 1;
+
+	return(fresh);
+}
+
+/****************************************************************//**
+Add a new node to the tree, useful for data that is pre-sorted.
+@return	appended node */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_add_node(
+/*=========*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< in: bounds */
+	const void*	value)		/*!< in: this value is copied
+					to the node */
+{
+	ib_rbt_node_t*	fresh = (ib_rbt_node_t*) ut_malloc(SIZEOF_NODE(tree));
+
+	/* The node starts out unlinked and holds a copy of the value. */
+	fresh->left = tree->nil;
+	fresh->right = tree->nil;
+	fresh->parent = tree->nil;
+	memcpy(fresh->value, value, tree->sizeof_value);
+
+	/* rbt_search() leaves last as NULL for an empty tree, attach
+	below the fake root in that case. */
+	if (!parent->last) {
+		parent->last = tree->root;
+	}
+
+	/* The bounds are trusted as is, no search is done here. */
+	rbt_tree_add_child(tree, parent, fresh);
+	rbt_balance_tree(tree, fresh);
+
+	tree->n_nodes += 1;
+
+#if	defined(IB_RBT_TESTING)
+	ut_a(rbt_validate(tree));
+#endif
+	return(fresh);
+}
+
+/****************************************************************//**
+Find a matching node in the rb tree.
+@return	NULL if not found else the node where key was found */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lookup(
+/*=======*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key)		/*!< in: key to use for search */
+{
+	const ib_rbt_node_t*	node;
+
+	/* Walk down from the real root, the sign of the comparison
+	tells us which side to continue on. */
+	for (node = ROOT(tree); node != tree->nil; /* No op */) {
+		int	cmp = tree->compare(key, node->value);
+
+		if (cmp == 0) {
+			return(node);	/* Exact match. */
+		}
+
+		node = (cmp < 0) ? node->left : node->right;
+	}
+
+	/* Fell off the tree without finding the key. */
+	return(NULL);
+}
+
+/****************************************************************//**
+Delete a node from the red black tree, identified by key.
+@return	TRUE if success FALSE if not found */
+UNIV_INTERN
+ibool
+rbt_delete(
+/*=======*/
+	ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key)		/*!< in: key to delete */
+{
+	ib_rbt_node_t*	victim = (ib_rbt_node_t*) rbt_lookup(tree, key);
+
+	if (victim == NULL) {
+		return(FALSE);
+	}
+
+	/* Unlink from the tree first, then release the memory. */
+	rbt_remove_node_and_rebalance(tree, victim);
+	ut_free(victim);
+
+	return(TRUE);
+}
+
+/****************************************************************//**
+Remove a node from the rb tree, the node is not free'd, that is the
+callers responsibility.
+@return	deleted node but without the const */
+UNIV_INTERN
+ib_rbt_node_t*
+rbt_remove_node(
+/*============*/
+	ib_rbt_t*		tree,		/*!< in: rb tree */
+	const ib_rbt_node_t*	const_node)	/*!< in: node to delete, this
+						is a fudge and declared const
+						because the caller can access
+						only const nodes */
+{
+	/* The caller only ever sees const nodes, drop the qualifier. */
+	ib_rbt_node_t*	node = (ib_rbt_node_t*) const_node;
+
+	rbt_remove_node_and_rebalance(tree, node);
+
+	/* Returning the node allows the caller to write:
+		ut_free(rbt_remove_node(tree, node)); */
+	return(node);
+}
+
+/****************************************************************//**
+Find the node that has the lowest key that is >= key.
+@return	node satisfying the lower bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_lower_bound(
+/*============*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key)		/*!< in: key to search */
+{
+	ib_rbt_node_t*	best = NULL;
+	ib_rbt_node_t*	node;
+
+	for (node = ROOT(tree); node != tree->nil; /* No op */) {
+		int	cmp = tree->compare(key, node->value);
+
+		if (cmp == 0) {
+			/* Exact match, it cannot get any lower. */
+			return(node);
+		}
+
+		if (cmp < 0) {
+			/* Candidate, a lower one may be on the left. */
+			best = node;
+			node = node->left;
+		} else {
+			/* node is below the key: look on its right. */
+			node = node->right;
+		}
+	}
+
+	return(best);
+}
+
+/****************************************************************//**
+Find the node that has the greatest key that is <= key.
+@return	node satisfying the upper bound constraint or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_upper_bound(
+/*============*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	const void*	key)		/*!< in: key to search */
+{
+	ib_rbt_node_t*	best = NULL;
+	ib_rbt_node_t*	node;
+
+	for (node = ROOT(tree); node != tree->nil; /* No op */) {
+		int	cmp = tree->compare(key, node->value);
+
+		if (cmp == 0) {
+			/* Exact match, it cannot get any greater. */
+			return(node);
+		}
+
+		if (cmp > 0) {
+			/* Candidate, a greater one may be on the right. */
+			best = node;
+			node = node->right;
+		} else {
+			/* node is above the key: look on its left. */
+			node = node->left;
+		}
+	}
+
+	return(best);
+}
+
+/****************************************************************//**
+Search for key, recording the last node visited and the comparison result.
+@return	value of result, 0 if the key was found */
+UNIV_INTERN
+int
+rbt_search(
+/*=======*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< out: last node and result */
+	const void*	key)		/*!< in: key to search */
+{
+	ib_rbt_node_t*	node;
+
+	/* For an empty tree there is no last node, and the key counts
+	as being greater than the missing root. */
+	parent->last = NULL;
+	parent->result = 1;
+
+	for (node = ROOT(tree); node != tree->nil; /* No op */) {
+
+		parent->last = node;
+		parent->result = tree->compare(key, node->value);
+
+		/* Stop on an exact match. */
+		if (parent->result == 0) {
+			break;
+		}
+
+		node = (parent->result > 0) ? node->right : node->left;
+	}
+
+	return(parent->result);
+}
+
+/****************************************************************//**
+Search for key, recording the last node visited and the comparison result.
+The supplied comparison function is used instead of the tree's own.
+@return	value of result, 0 if the key was found */
+UNIV_INTERN
+int
+rbt_search_cmp(
+/*===========*/
+	const ib_rbt_t*	tree,		/*!< in: rb tree */
+	ib_rbt_bound_t*	parent,		/*!< out: last node and result */
+	const void*	key,		/*!< in: key to search */
+	ib_rbt_compare	compare)	/*!< in: fn to compare items */
+{
+	ib_rbt_node_t*	node;
+
+	/* For an empty tree there is no last node, and the key counts
+	as being greater than the missing root. */
+	parent->last = NULL;
+	parent->result = 1;
+
+	for (node = ROOT(tree); node != tree->nil; /* No op */) {
+
+		parent->last = node;
+		parent->result = compare(key, node->value);
+
+		/* Stop on an exact match. */
+		if (parent->result == 0) {
+			break;
+		}
+
+		node = (parent->result > 0) ? node->right : node->left;
+	}
+
+	return(parent->result);
+}
+
+/****************************************************************//**
+Return the left most node in the tree.
+@return	the leftmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_first(
+/*======*/
+	const ib_rbt_t*	tree)		/*!< in: rb tree */
+{
+	ib_rbt_node_t*	leftmost = NULL;
+	ib_rbt_node_t*	node;
+
+	/* Keep going left, remembering the last real node seen. */
+	for (node = ROOT(tree); node != tree->nil; node = node->left) {
+		leftmost = node;
+	}
+
+	return(leftmost);
+}
+
+/****************************************************************//**
+Return the right most node in the tree.
+@return	the rightmost node or NULL */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_last(
+/*=====*/
+	const ib_rbt_t*	tree)		/*!< in: rb tree */
+{
+	ib_rbt_node_t*	rightmost = NULL;
+	ib_rbt_node_t*	node;
+
+	/* Keep going right, remembering the last real node seen. */
+	for (node = ROOT(tree); node != tree->nil; node = node->right) {
+		rightmost = node;
+	}
+
+	return(rightmost);
+}
+
+/****************************************************************//**
+Return the next node.
+@return	successor of current, NULL if current is NULL or has none */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_next(
+/*=====*/
+	const ib_rbt_t*		tree,	/*!< in: rb tree */
+	const ib_rbt_node_t*	current)/*!< in: current node, may be NULL */
+{
+	return((current == NULL) ? NULL : rbt_find_successor(tree, current));
+}
+
+/****************************************************************//**
+Return the previous node.
+@return	predecessor of current, NULL if current is NULL or has none */
+UNIV_INTERN
+const ib_rbt_node_t*
+rbt_prev(
+/*=====*/
+	const ib_rbt_t*		tree,	/*!< in: rb tree */
+	const ib_rbt_node_t*	current)/*!< in: current node, may be NULL */
+{
+	return((current == NULL) ? NULL : rbt_find_predecessor(tree, current));
+}
+
+/****************************************************************//**
+Reset the tree. Free all the nodes, the tree itself stays usable. */
+UNIV_INTERN
+void
+rbt_clear(
+/*======*/
+	ib_rbt_t*	tree)		/*!< in: rb tree */
+{
+	rbt_free_node(ROOT(tree), tree->nil);
+	tree->root->right = tree->nil;
+	tree->root->left = tree->nil;	/* ROOT(tree) is the nil node again. */
+	tree->n_nodes = 0;
+}
+
+/****************************************************************//**
+Copy the values of src that are not yet present in dst into dst.
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq(
+/*===========*/
+	ib_rbt_t*	dst,		/*!< in: dst rb tree */
+	const ib_rbt_t*	src)		/*!< in: src rb tree */
+{
+	ib_rbt_bound_t		where;
+	const ib_rbt_node_t*	node;
+	ulint			n_copied = 0;
+
+	if (rbt_empty(src) || dst == src) {
+		return(0);
+	}
+
+	for (node = rbt_first(src); node; node = rbt_next(src, node)) {
+		/* Non-zero: value is missing in dst, attach a copy. */
+		if (rbt_search(dst, &where, node->value) != 0) {
+			rbt_add_node(dst, &where, node->value);
+			n_copied += 1;
+		}
+	}
+
+	return(n_copied);
+}
+
+/****************************************************************//**
+Move the nodes of src that have no duplicate in dst over to dst. The
+nodes are relinked, not copied; duplicates are left untouched in src.
+An empty dst is handled by attaching below its fake root.
+@return	no. of recs merged */
+UNIV_INTERN
+ulint
+rbt_merge_uniq_destructive(
+/*=======================*/
+	ib_rbt_t*	dst,		/*!< in: dst rb tree */
+	ib_rbt_t*	src)		/*!< in: src rb tree */
+{
+	ib_rbt_bound_t	parent;
+	ib_rbt_node_t*	src_node;
+	ulint		old_size = rbt_size(dst);
+
+	if (rbt_empty(src) || dst == src) {
+		return(0);
+	}
+
+	for (src_node = (ib_rbt_node_t*) rbt_first(src); src_node; /* */) {
+		ib_rbt_node_t*	prev = src_node;
+
+		src_node = (ib_rbt_node_t*)rbt_next(src, prev);
+
+		/* Skip duplicates. */
+		if (rbt_search(dst, &parent, prev->value) != 0) {
+			/* Unlink from src but keep the node and its value. */
+			rbt_remove_node_and_rebalance(src, prev);
+			/* rbt_search() sets last to NULL for an empty dst. */
+			if (parent.last == NULL) {
+				parent.last = dst->root;
+			}
+			/* The nil should be taken from the dst tree. */
+			prev->parent = prev->left = prev->right = dst->nil;
+			rbt_tree_add_child(dst, &parent, prev);
+			rbt_balance_tree(dst, prev);
+			++dst->n_nodes;
+		}
+	}
+
+#if	defined(IB_RBT_TESTING)
+	ut_a(rbt_validate(dst));
+	ut_a(rbt_validate(src));
+#endif
+	return(rbt_size(dst) - old_size);
+}
+
+/****************************************************************//**
+Check that every path from the root to the leaves has the same count and
+the tree nodes are in order.
+@return	TRUE if OK FALSE otherwise */
+UNIV_INTERN
+ibool
+rbt_validate(
+/*=========*/
+	const ib_rbt_t*	tree)		/*!< in: RB tree to validate */
+{
+	if (rbt_count_black_nodes(tree, ROOT(tree)) == 0) {
+		/* Colors or black counts are off, no need to go on. */
+		return(FALSE);
+	}
+	return(rbt_check_ordering(tree));
+}
+
+/****************************************************************//**
+Call print on every node of the tree, depth first starting at the root. */
+UNIV_INTERN
+void
+rbt_print(
+/*======*/
+	const ib_rbt_t*		tree,	/*!< in: tree to traverse */
+	ib_rbt_print_node	print)	/*!< in: print function */
+{
+	rbt_print_subtree(tree, ROOT(tree), print);
+}
diff --git a/storage/xtradb/ut/ut0rnd.c b/storage/xtradb/ut/ut0rnd.c
new file mode 100644
index 00000000000..cefd0990ecc
--- /dev/null
+++ b/storage/xtradb/ut/ut0rnd.c
@@ -0,0 +1,97 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0rnd.c
+Random numbers and hashing
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0rnd.h"
+
+#ifdef UNIV_NONINL
+#include "ut0rnd.ic"
+#endif
+
+/** Fixed multipliers that ut_find_prime() applies to its candidate */
+/*@{*/
+#define	UT_RANDOM_1	1.0412321	/*!< used when just above a power of 2 */
+#define	UT_RANDOM_2	1.1131347	/*!< used when just below a power of 2 */
+#define UT_RANDOM_3	1.0132677	/*!< always applied as the last step */
+/*@}*/
+
+/** Seed value of ut_rnd_gen_ulint(). */
+UNIV_INTERN ulint	ut_rnd_ulint_counter = 65654363;
+
+/***********************************************************//**
+Finds a prime somewhat larger than the argument, after first nudging the
+candidate away from the neighbouring powers of 2.
+@return	prime */
+UNIV_INTERN
+ulint
+ut_find_prime(
+/*==========*/
+	ulint	n)	/*!< in: positive number > 100 */
+{
+	ulint	bound;
+	ulint	divisor;
+
+	n += 100;
+
+	/* Double the bound, starting from 1, while its double is below n */
+	for (bound = 1; bound * 2 < n; bound = 2 * bound) {
+	}
+
+	/* Too close above the lower power of 2: scale up */
+	if ((double)n < 1.05 * (double)bound) {
+		n = (ulint) ((double)n * UT_RANDOM_1);
+	}
+
+	bound = 2 * bound;
+
+	/* Too close below the upper power of 2: scale past it */
+	if ((double)n > 0.95 * (double)bound) {
+		n = (ulint) ((double)n * UT_RANDOM_2);
+	}
+
+	if (n > bound - 20) {
+		n += 30;
+	}
+
+	/* n is now far enough from powers of 2. Multiply it by one more
+	fixed factor so that the result looks more random, in particular
+	when n was not near a power of 2 to begin with. */
+
+	n = (ulint) ((double)n * UT_RANDOM_3);
+
+	/* Trial division: advance n until no divisor <= sqrt(n) exists */
+	for (;; n++) {
+		divisor = 2;
+
+		while (divisor * divisor <= n && n % divisor != 0) {
+			divisor++;
+		}
+
+		if (divisor * divisor > n) {
+			break;
+		}
+	}
+
+	return(n);
+}
diff --git a/storage/xtradb/ut/ut0ut.c b/storage/xtradb/ut/ut0ut.c
new file mode 100644
index 00000000000..498873e290a
--- /dev/null
+++ b/storage/xtradb/ut/ut0ut.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+
+Copyright (c) 1994, 2009, Innobase Oy. All Rights Reserved.
+Copyright (c) 2009, Sun Microsystems, Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Sun Microsystems, Inc. Those modifications are gratefully acknowledged and
+are described briefly in the InnoDB documentation. The contributions by
+Sun Microsystems are incorporated with their permission, and subject to the
+conditions contained in the file COPYING.Sun_Microsystems.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/***************************************************************//**
+@file ut/ut0ut.c
+Various utilities for Innobase.
+
+Created 5/11/1994 Heikki Tuuri
+********************************************************************/
+
+#include "ut0ut.h"
+
+#ifdef UNIV_NONINL
+#include "ut0ut.ic"
+#endif
+
+#include <stdarg.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef UNIV_HOTBACKUP
+# include "trx0trx.h"
+# include "ha_prototypes.h"
+# include "mysql_com.h" /* NAME_LEN */
+#endif /* UNIV_HOTBACKUP */
+
+/** A constant to prevent the compiler from optimizing ut_delay() away. */
+UNIV_INTERN ibool	ut_always_false	= FALSE;
+
+#ifdef __WIN__
+/*****************************************************************//**
+NOTE: The Windows epoch starts from 1601/01/01 whereas the Unix
+epoch starts from 1970/1/1. For selection of constant see:
+http://support.microsoft.com/kb/167296/ */
+#define WIN_TO_UNIX_DELTA_USEC  ((ib_int64_t) 11644473600000000ULL)
+
+
+/*****************************************************************//**
+Windows replacement for gettimeofday(2), built on the system FILETIME.
+@return	0 on success, -1 (with errno = EINVAL) if tv is NULL */
+static
+int
+ut_gettimeofday(
+/*============*/
+	struct timeval*	tv,	/*!< out: Values are relative to Unix epoch */
+	void*		tz)	/*!< in: not used */
+{
+	FILETIME	now;
+	ib_int64_t	ticks;
+
+	if (tv == NULL) {
+		errno = EINVAL;
+		return(-1);
+	}
+
+	GetSystemTimeAsFileTime(&now);
+
+	/* Combine the two 32-bit halves into one 64-bit tick count */
+	ticks = ((ib_int64_t) now.dwHighDateTime << 32) | now.dwLowDateTime;
+
+	/* A negative count would break the division by 10 below */
+	ut_a(ticks >= 0);
+
+	ticks /= 10;	/* 100 nsec periods -> usec */
+
+	/* Rebase onto the Unix epoch; otherwise the value for
+	struct timeval::tv_sec will overflow. */
+	ticks -= WIN_TO_UNIX_DELTA_USEC;
+
+	tv->tv_sec  = (long) (ticks / 1000000L);
+	tv->tv_usec = (long) (ticks % 1000000L);
+
+	return(0);
+}
+#else
+/** An alias for gettimeofday(2).  On Microsoft Windows, we have to
+reimplement this function. */
+#define	ut_gettimeofday		gettimeofday
+#endif
+
+/********************************************************//**
+Gets the high 32 bits in a ulint. That is makes a shift >> 32,
+but since there seem to be compiler bugs in both gcc and Visual C++,
+we do this by a special conversion.
+@return	a >> 32 */
+UNIV_INTERN
+ulint
+ut_get_high32(
+/*==========*/
+	ulint	a)	/*!< in: ulint */
+{
+	ullint	wide;
+
+	/* Widen through an unsigned type: a signed one would
+	sign-extend on >> when the top bit of a 64-bit ulint is set. */
+	wide = (ullint) a;
+
+	return((ulint) (wide >> 32));
+}
+
+/**********************************************************//**
+Returns system time as obtained from time(NULL). The format of the value
+is not specified: the only way to manipulate it is ut_difftime().
+@return	system time */
+UNIV_INTERN
+ib_time_t
+ut_time(void)
+/*=========*/
+{
+	return(time(NULL));
+}
+
+#ifndef UNIV_HOTBACKUP
+/**********************************************************//**
+Reads the current time of day, retrying up to 10 times (with a 0.1 sec
+pause and a message on stderr after each failure) if the underlying
+call fails. After the last failure errno holds the error of that attempt
+and the output arguments are left untouched.
+@return	0 on success, -1 otherwise */
+UNIV_INTERN
+int
+ut_usectime(
+/*========*/
+	ulint*	sec,	/*!< out: seconds since the Epoch */
+	ulint*	ms)	/*!< out: microseconds since the Epoch+*sec */
+{
+	struct timeval	now;
+	int		rc;
+	int		saved_errno;
+	int		attempt;
+
+	for (attempt = 0; attempt < 10; attempt++) {
+
+		rc = ut_gettimeofday(&now, NULL);
+
+		if (rc != -1) {
+			*sec = (ulint) now.tv_sec;
+			*ms  = (ulint) now.tv_usec;
+
+			return(rc);
+		}
+
+		/* The reporting and sleeping calls below may change errno:
+		remember it now and put it back afterwards. */
+		saved_errno = errno;
+		ut_print_timestamp(stderr);
+		fprintf(stderr, "  InnoDB: gettimeofday(): %s\n",
+			strerror(saved_errno));
+		os_thread_sleep(100000);  /* 0.1 sec */
+		errno = saved_errno;
+	}
+
+	return(rc);
+}
+
+/**********************************************************//**
+Computes the time of day as a count of microseconds since epoch.
+Like time(3), the result is additionally written to *tloc
+whenever tloc is non-NULL.
+@return	us since epoch */
+UNIV_INTERN
+ullint
+ut_time_us(
+/*=======*/
+	ullint*	tloc)	/*!< out: us since epoch, if non-NULL */
+{
+	struct timeval	now;
+	ullint		usec;
+
+	ut_gettimeofday(&now, NULL);
+
+	usec = (ullint) now.tv_sec * 1000000;
+	usec += now.tv_usec;
+
+	if (tloc) {
+		*tloc = usec;
+	}
+	return(usec);
+}
+
+/**********************************************************//**
+Computes the time of day in milliseconds: whole seconds times 1000
+plus the microsecond part divided by 1000. The value may wrap
+around, so use it for heuristic purposes only.
+@return	ms since epoch */
+UNIV_INTERN
+ulint
+ut_time_ms(void)
+/*============*/
+{
+	struct timeval	now;
+
+	ut_gettimeofday(&now, NULL);
+
+	return((ulint) now.tv_sec * 1000 + now.tv_usec / 1000);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/**********************************************************//**
+Returns the difference of two times in seconds, as computed by difftime().
+@return	time2 - time1 expressed in seconds */
+UNIV_INTERN
+double
+ut_difftime(
+/*========*/
+	ib_time_t	time2,	/*!< in: time to subtract from */
+	ib_time_t	time1)	/*!< in: time that is subtracted */
+{
+	return(difftime(time2, time1));
+}
+
+/**********************************************************//**
+Writes the current local time to a file as "YYMMDD HH:MM:SS". */
+UNIV_INTERN
+void
+ut_print_timestamp(
+/*===============*/
+	FILE*  file) /*!< in: file where to print */
+{
+#ifdef __WIN__
+	SYSTEMTIME st;
+
+	GetLocalTime(&st);
+
+	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
+		(int)st.wYear % 100,
+		(int)st.wMonth,
+		(int)st.wDay,
+		(int)st.wHour,
+		(int)st.wMinute,
+		(int)st.wSecond);
+#else
+	struct tm  broken;
+	struct tm* lt;
+	time_t	   now;
+
+	now = time(NULL);
+
+#ifdef HAVE_LOCALTIME_R
+	localtime_r(&now, &broken);
+	lt = &broken;
+#else
+	lt = localtime(&now);
+#endif
+	fprintf(file,"%02d%02d%02d %2d:%02d:%02d",
+		lt->tm_year % 100,
+		lt->tm_mon + 1,
+		lt->tm_mday,
+		lt->tm_hour,
+		lt->tm_min,
+		lt->tm_sec);
+#endif
+}
+
+/**********************************************************//**
+Formats the current local time into buf: 15 chars plus terminating NUL. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp(
+/*=================*/
+	char*	buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+	SYSTEMTIME st;
+
+	GetLocalTime(&st);
+
+	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+		(int)st.wYear % 100,
+		(int)st.wMonth,
+		(int)st.wDay,
+		(int)st.wHour,
+		(int)st.wMinute,
+		(int)st.wSecond);
+#else
+	struct tm  broken;
+	struct tm* lt;
+	time_t	   now;
+
+	now = time(NULL);
+
+#ifdef HAVE_LOCALTIME_R
+	localtime_r(&now, &broken);
+	lt = &broken;
+#else
+	lt = localtime(&now);
+#endif
+	sprintf(buf, "%02d%02d%02d %2d:%02d:%02d",
+		lt->tm_year % 100,
+		lt->tm_mon + 1,
+		lt->tm_mday,
+		lt->tm_hour,
+		lt->tm_min,
+		lt->tm_sec);
+#endif
+}
+
+#ifdef UNIV_HOTBACKUP
+/**********************************************************//**
+Sprintfs a timestamp to a buffer with no spaces (the hour is zero-padded)
+and with ':' characters replaced by '_'. */
+UNIV_INTERN
+void
+ut_sprintf_timestamp_without_extra_chars(
+/*=====================================*/
+	char*	buf) /*!< in: buffer where to sprintf */
+{
+#ifdef __WIN__
+	SYSTEMTIME cal_tm;
+
+	GetLocalTime(&cal_tm);
+
+	sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+		(int)cal_tm.wYear % 100,
+		(int)cal_tm.wMonth,
+		(int)cal_tm.wDay,
+		(int)cal_tm.wHour,
+		(int)cal_tm.wMinute,
+		(int)cal_tm.wSecond);
+#else
+	struct tm  cal_tm;
+	struct tm* cal_tm_ptr;
+	time_t	   tm;
+
+	time(&tm);
+
+#ifdef HAVE_LOCALTIME_R
+	localtime_r(&tm, &cal_tm);
+	cal_tm_ptr = &cal_tm;
+#else
+	cal_tm_ptr = localtime(&tm);
+#endif
+	sprintf(buf, "%02d%02d%02d_%02d_%02d_%02d",
+		cal_tm_ptr->tm_year % 100,
+		cal_tm_ptr->tm_mon + 1,
+		cal_tm_ptr->tm_mday,
+		cal_tm_ptr->tm_hour,
+		cal_tm_ptr->tm_min,
+		cal_tm_ptr->tm_sec);
+#endif
+}
+
+/**********************************************************//**
+Stores the year, month and day of the current local date. */
+UNIV_INTERN
+void
+ut_get_year_month_day(
+/*==================*/
+	ulint*	year,	/*!< out: current year */
+	ulint*	month,	/*!< out: month */
+	ulint*	day)	/*!< out: day */
+{
+#ifdef __WIN__
+	SYSTEMTIME st;
+
+	GetLocalTime(&st);
+
+	*year = (ulint)st.wYear;
+	*month = (ulint)st.wMonth;
+	*day = (ulint)st.wDay;
+#else
+	struct tm  broken;
+	struct tm* lt;
+	time_t	   now;
+
+	now = time(NULL);
+
+#ifdef HAVE_LOCALTIME_R
+	localtime_r(&now, &broken);
+	lt = &broken;
+#else
+	lt = localtime(&now);
+#endif
+	*year = (ulint)lt->tm_year + 1900;
+	*month = (ulint)lt->tm_mon + 1;
+	*day = (ulint)lt->tm_mday;
+#endif
+}
+#endif /* UNIV_HOTBACKUP */
+
+#ifndef UNIV_HOTBACKUP
+/*************************************************************//**
+Spins in an idle loop on the CPU. The argument is the wanted delay,
+in microseconds as measured on a 100 MHz Pentium + Visual C++.
+@return	dummy value */
+UNIV_INTERN
+ulint
+ut_delay(
+/*=====*/
+	ulint	delay)	/*!< in: delay in microseconds on 100 MHz Pentium */
+{
+	ulint	spin;
+	ulint	sum = 0;
+
+	/* The sum is returned (and conditionally stored) below */
+	for (spin = 0; spin < delay * 50; spin++) {
+		sum += spin;
+		UT_RELAX_CPU();
+	}
+
+	if (ut_always_false) {
+		ut_always_false = (ibool) sum;
+	}
+
+	return(sum);
+}
+#endif /* !UNIV_HOTBACKUP */
+
+/*************************************************************//**
+Prints the contents of a memory buffer in hex and ascii. */
+UNIV_INTERN
+void
+ut_print_buf(
+/*=========*/
+	FILE*		file,	/*!< in: file where to print */
+	const void*	buf,	/*!< in: memory buffer */
+	ulint		len)	/*!< in: length of the buffer */
+{
+	const byte*	data;
+	ulint		i;
+
+	UNIV_MEM_ASSERT_RW(buf, len);
+	/* %lu takes an unsigned long, which ulint need not be: cast. */
+	fprintf(file, " len %lu; hex ", (ulong) len);
+
+	for (data = (const byte*)buf, i = 0; i < len; i++) {
+		fprintf(file, "%02lx", (ulong)*data++);
+	}
+
+	fputs("; asc ", file);
+
+	data = (const byte*)buf;
+
+	for (i = 0; i < len; i++) {
+		int	c = (int) *data++;
+		putc(isprint(c) ? c : ' ', file);
+	}
+
+	putc(';', file);
+}
+
+/*************************************************************//**
+Rounds the argument up to the nearest power of 2.
+@return	first power of 2 which is >= n */
+UNIV_INTERN
+ulint
+ut_2_power_up(
+/*==========*/
+	ulint	n)	/*!< in: number != 0 */
+{
+	ulint	power = 1;
+
+	ut_ad(n > 0);
+
+	/* Keep doubling, starting from 1, until the
+	candidate is no longer smaller than n */
+	while (power < n) {
+		power <<= 1;
+	}
+
+	return(power);
+}
+
+/**********************************************************************//**
+Writes a NUL-terminated file name between apostrophes to a stream. */
+UNIV_INTERN
+void
+ut_print_filename(
+/*==============*/
+	FILE*		f,	/*!< in: output stream */
+	const char*	name)	/*!< in: name to print */
+{
+	int	ch;
+
+	putc('\'', f);
+
+	/* Copy up to, but not including, the terminating NUL */
+	while ((ch = *name++) != 0) {
+		if (ch == '\'') {
+			/* An apostrophe in the name is written twice */
+			putc(ch, f);
+		}
+
+		putc(ch, f);
+	}
+
+	putc('\'', f);
+}
+#ifndef UNIV_HOTBACKUP
+/**********************************************************************//**
+Outputs a NUL-terminated string, quoted as an SQL identifier.
+If the string contains a slash '/', the string will be
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_name(
+/*==========*/
+	FILE*		f,	/*!< in: output stream */
+	trx_t*		trx,	/*!< in: transaction */
+	ibool		table_id,/*!< in: TRUE=print a table name,
+				FALSE=print other identifier */
+	const char*	name)	/*!< in: NUL-terminated name to print */
+{
+	ut_print_namel(f, trx, table_id, name, strlen(name));
+}
+
+/**********************************************************************//**
+Writes a name of the given length to a stream, quoted as an SQL
+identifier. A name that contains a slash '/' is
+output as two identifiers separated by a period (.),
+as in SQL database_name.identifier. */
+UNIV_INTERN
+void
+ut_print_namel(
+/*===========*/
+	FILE*		f,	/*!< in: output stream */
+	trx_t*		trx,	/*!< in: transaction (NULL=no quotes) */
+	ibool		table_id,/*!< in: TRUE=print a table name,
+				FALSE=print other identifier */
+	const char*	name,	/*!< in: name to print */
+	ulint		namelen)/*!< in: length of name */
+{
+	/* Room for a database and a table name (2 * NAME_LEN) plus
+	slack for the #mysql50# prefix and the quotes */
+	char		quoted[3 * NAME_LEN];
+	const char*	end;
+
+	end = innobase_convert_name(quoted, sizeof quoted,
+				    name, namelen,
+				    trx ? trx->mysql_thd : NULL, table_id);
+
+	/* Emit exactly the bytes that the conversion produced */
+	fwrite(quoted, 1, end - quoted, f);
+}
+
+/**********************************************************************//**
+Catenate files. */
+UNIV_INTERN
+void
+ut_copy_file(
+/*=========*/
+	FILE*	dest,	/*!< in: output file */
+	FILE*	src)	/*!< in: input file to be appended to output */
+{
+	long	len = ftell(src);
+	char	buf[4096];
+
+	rewind(src);
+	/* ftell() fails with -1; never hand a negative len to fread(). */
+	while (len > 0) {
+		size_t	maxs = len < (long) sizeof buf
+			? (size_t) len : sizeof buf;
+		size_t	size = fread(buf, 1, maxs, src);
+		fwrite(buf, 1, size, dest);
+		len -= (long) size;
+		if (size < maxs) {
+			break;
+		}
+	}
+}
+#endif /* !UNIV_HOTBACKUP */
+
+#ifdef __WIN__
+# include <stdarg.h>
+/**********************************************************************//**
+Windows stand-in for snprintf(3): formats into a buffer of limited
+size and always NUL-terminates a non-empty buffer.
+@return number of characters that would have been printed if the size
+were unlimited, not including the terminating '\0'. */
+UNIV_INTERN
+int
+ut_snprintf(
+/*========*/
+	char*		str,	/*!< out: string */
+	size_t		size,	/*!< in: str size */
+	const char*	fmt,	/*!< in: format */
+	...)			/*!< in: format values */
+{
+	int	needed;
+	va_list	count_args;
+	va_list	print_args;
+
+	va_start(count_args, fmt);
+	va_start(print_args, fmt);
+
+	needed = _vscprintf(fmt, count_args);
+	ut_a(needed != -1);
+
+	if (size != 0) {
+		_vsnprintf(str, size, fmt, print_args);
+
+		if (size <= (size_t) needed) {
+			str[size - 1] = '\0';
+		}
+	}
+
+	va_end(count_args);
+	va_end(print_args);
+
+	return(needed);
+}
+#endif /* __WIN__ */
diff --git a/storage/xtradb/ut/ut0vec.c b/storage/xtradb/ut/ut0vec.c
new file mode 100644
index 00000000000..45f2bc9771f
--- /dev/null
+++ b/storage/xtradb/ut/ut0vec.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/*******************************************************************//**
+@file ut/ut0vec.c
+A vector of pointers to data items
+
+Created 4/6/2006 Osku Salerma
+************************************************************************/
+
+#include "ut0vec.h"
+#ifdef UNIV_NONINL
+#include "ut0vec.ic"
+#endif
+#include <string.h>
+
+/****************************************************************//**
+Allocate an empty vector with room for size elements from a heap.
+@return	vector */
+UNIV_INTERN
+ib_vector_t*
+ib_vector_create(
+/*=============*/
+	mem_heap_t*	heap,	/*!< in: heap */
+	ulint		size)	/*!< in: initial size */
+{
+	ib_vector_t*	v;
+
+	ut_a(size > 0);
+
+	v = mem_heap_alloc(heap, sizeof(ib_vector_t));
+	/* Nothing stored yet; the slot array holds size pointers */
+	v->used = 0;
+	v->total = size;
+	v->heap = heap;
+	v->data = mem_heap_alloc(heap, size * sizeof(void*));
+
+	return(v);
+}
+
+/****************************************************************//**
+Append an element to the vector, doubling its capacity when it is full. */
+UNIV_INTERN
+void
+ib_vector_push(
+/*===========*/
+	ib_vector_t*	vec,	/*!< in: vector */
+	void*		elem)	/*!< in: data element */
+{
+	if (vec->used >= vec->total) {
+		/* Full: move the slots into an array twice as large.
+		The old array is not freed here. */
+		ulint	doubled = 2 * vec->total;
+		void**	grown = mem_heap_alloc(vec->heap,
+					       doubled * sizeof(void*));
+
+		memcpy(grown, vec->data, vec->total * sizeof(void*));
+
+		vec->total = doubled;
+		vec->data = grown;
+	}
+
+	vec->data[vec->used++] = elem;
+}
diff --git a/storage/xtradb/ut/ut0wqueue.c b/storage/xtradb/ut/ut0wqueue.c
new file mode 100644
index 00000000000..5220d1e17f4
--- /dev/null
+++ b/storage/xtradb/ut/ut0wqueue.c
@@ -0,0 +1,118 @@
+/*****************************************************************************
+
+Copyright (c) 2006, 2009, Innobase Oy. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+#include "ut0wqueue.h"
+
+/*******************************************************************//**
+@file ut/ut0wqueue.c
+A work queue
+
+Created 4/26/2006 Osku Salerma
+************************************************************************/
+
+/****************************************************************//**
+Create a new work queue with its own mutex, item list and event.
+@return	work queue */
+UNIV_INTERN
+ib_wqueue_t*
+ib_wqueue_create(void)
+/*===================*/
+{
+	ib_wqueue_t*	wq = mem_alloc(sizeof(ib_wqueue_t));
+	/* Held around list access in ib_wqueue_add() and ib_wqueue_wait(). */
+	mutex_create(&wq->mutex, SYNC_WORK_QUEUE);
+
+	wq->items = ib_list_create();
+	wq->event = os_event_create(NULL);
+
+	return(wq);
+}
+
+/****************************************************************//**
+Free a work queue. The item list must be empty (asserted). */
+UNIV_INTERN
+void
+ib_wqueue_free(
+/*===========*/
+	ib_wqueue_t*	wq)	/*!< in: work queue */
+{
+	ut_a(!ib_list_get_first(wq->items));
+	/* Release the members first, then the queue struct itself. */
+	mutex_free(&wq->mutex);
+	ib_list_free(wq->items);
+	os_event_free(wq->event);
+
+	mem_free(wq);
+}
+
+/****************************************************************//**
+Add a work item to the end of the queue and set the queue's event. */
+UNIV_INTERN
+void
+ib_wqueue_add(
+/*==========*/
+	ib_wqueue_t*	wq,	/*!< in: work queue */
+	void*		item,	/*!< in: work item */
+	mem_heap_t*	heap)	/*!< in: memory heap to use for allocating the
+				list node */
+{
+	mutex_enter(&wq->mutex);
+	/* Append and signal while holding the queue mutex. */
+	ib_list_add_last(wq->items, item, heap);
+	os_event_set(wq->event);
+
+	mutex_exit(&wq->mutex);
+}
+
+/****************************************************************//**
+Block until the queue holds a work item, then remove and return it.
+@return	work item */
+UNIV_INTERN
+void*
+ib_wqueue_wait(
+/*===========*/
+	ib_wqueue_t*	wq)	/*!< in: work queue */
+{
+	ib_list_node_t*	first;
+
+	/* Loop until an item is found; on exit the mutex is still held */
+	for (;;) {
+		os_event_wait(wq->event);
+		mutex_enter(&wq->mutex);
+		first = ib_list_get_first(wq->items);
+
+		if (first != NULL) {
+			break;
+		}
+
+		/* Woken up but the list is empty: wait again */
+		mutex_exit(&wq->mutex);
+	}
+
+	ib_list_remove(wq->items, first);
+
+	if (ib_list_get_first(wq->items) == NULL) {
+		/* We must reset the event when the list
+		gets emptied. */
+		os_event_reset(wq->event);
+	}
+
+	mutex_exit(&wq->mutex);
+
+	return(first->data);
+}
author	Sergei Golubchik <sergii@pisem.net>	2010-11-25 18:17:28 +0100
committer	Sergei Golubchik <sergii@pisem.net>	2010-11-25 18:17:28 +0100
commit	65ca700def99289cc31a7040537f5aa6e12bf485 (patch)
tree	97b3a07299b626c519da0e80c122b5b79b933914 /storage
parent	2ab57de38d13d927ddff2d51aed4af34e13998f5 (diff)
parent	6e5bcca7935d3c62f84bb640e5357664a210ee12 (diff)
download	mariadb-git-65ca700def99289cc31a7040537f5aa6e12bf485.tar.gz